1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMENS_H
#define _LINUX_TIMENS_H


#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/err.h>
#include <linux/time64.h>

struct user_namespace;
extern struct user_namespace init_user_ns;

struct seq_file;
struct vm_area_struct;

/*
 * Clock offsets carried by a time namespace. The timens_add_*() and
 * timens_sub_boottime() helpers in this header apply them to a timespec64.
 */
struct timens_offsets {
        /* added by timens_add_monotonic() */
        struct timespec64 monotonic;
        /* added/subtracted by the timens_*_boottime*() helpers */
        struct timespec64 boottime;
};

/*
 * A time namespace instance. A task's instance is reached through
 * current->nsproxy->time_ns by the inline helpers in this header.
 */
struct time_namespace {
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        /* embedded common part; to_time_ns() maps it back to this struct */
        struct ns_common        ns;
        /* offsets applied by the timens_add_*()/timens_sub_*() helpers */
        struct timens_offsets        offsets;
        struct page                *vvar_page;
        /* If set, prevents changing offsets after any task has joined the namespace. */
        bool                        frozen_offsets;
} __randomize_layout;

extern struct time_namespace init_time_ns;

#ifdef CONFIG_TIME_NS
/* Map an embedded ns_common back to the time_namespace that contains it. */
static inline struct time_namespace *to_time_ns(struct ns_common *ns)
{
        struct time_namespace *time_ns;

        time_ns = container_of(ns, struct time_namespace, ns);

        return time_ns;
}
/*
 * Declarations only: with CONFIG_TIME_NS set the bodies are provided
 * outside this header.
 */
void __init time_ns_init(void);
extern int vdso_join_timens(struct task_struct *task,
                            struct time_namespace *ns);
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);

/* Take a reference on @ns with ns_ref_inc() and return the same pointer. */
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
        ns_ref_inc(ns);
        return ns;
}

/*
 * Declarations only; the bodies are provided outside this header.
 * free_time_ns() is what put_time_ns() calls once ns_ref_put() reports
 * that the namespace should be released.
 */
struct time_namespace *copy_time_ns(u64 flags,
                                    struct user_namespace *user_ns,
                                    struct time_namespace *old_ns);
void free_time_ns(struct time_namespace *ns);
void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
struct page *find_timens_vvar_page(struct vm_area_struct *vma);

/*
 * Drop a reference on @ns. When ns_ref_put() returns non-zero the namespace
 * is handed to free_time_ns().
 */
static inline void put_time_ns(struct time_namespace *ns)
{
        if (!ns_ref_put(ns))
                return;

        free_time_ns(ns);
}

/* Declaration only; takes the task @p and the seq_file @m to write to. */
void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m);

/* One (clock id, value) pair as passed to proc_timens_set_offset(). */
struct proc_timens_offset {
        int                        clockid;
        struct timespec64        val;
};

/*
 * Declaration only. @offsets is passed together with a count @n
 * (presumably the number of entries in @offsets - confirm in the
 * implementation).
 */
int proc_timens_set_offset(struct file *file, struct task_struct *p,
                           struct proc_timens_offset *offsets, int n);

static inline void timens_add_monotonic(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_add(*ts, ns_offsets->monotonic);
}

static inline void timens_add_boottime(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_add(*ts, ns_offsets->boottime);
}

/*
 * Return @nsec plus the current task's time-namespace boottime offset,
 * with the offset converted to nanoseconds by timespec64_to_ns().
 */
static inline u64 timens_add_boottime_ns(u64 nsec)
{
        struct time_namespace *ns = current->nsproxy->time_ns;

        return nsec + timespec64_to_ns(&ns->offsets.boottime);
}

static inline void timens_sub_boottime(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_sub(*ts, ns_offsets->boottime);
}

/*
 * Declaration only. Called by timens_ktime_to_host() with the offsets of
 * the current task's namespace when that namespace is not init_time_ns.
 */
ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
                                struct timens_offsets *offsets);

/*
 * Convert @tim for @clockid from the current task's time namespace.
 *
 * Tasks in init_time_ns get @tim back unchanged; any other namespace is
 * handled by do_timens_ktime_to_host() using that namespace's offsets.
 */
static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
{
        struct time_namespace *ns = current->nsproxy->time_ns;

        if (unlikely(ns != &init_time_ns))
                return do_timens_ktime_to_host(clockid, tim, &ns->offsets);

        return tim;
}

#else
/* !CONFIG_TIME_NS stub: nothing to initialise. */
static inline void __init time_ns_init(void)
{
}

/* !CONFIG_TIME_NS stub: does nothing and always returns 0. */
static inline int vdso_join_timens(struct task_struct *task,
                                   struct time_namespace *ns)
{
        return 0;
}

/* !CONFIG_TIME_NS stub: no-op. */
static inline void timens_commit(struct task_struct *tsk,
                                 struct time_namespace *ns)
{
}

/*
 * !CONFIG_TIME_NS stub: takes no reference and returns NULL regardless of
 * @ns (unlike the CONFIG_TIME_NS variant, which returns @ns).
 */
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
        return NULL;
}

/* !CONFIG_TIME_NS stub: no-op, there is no reference to drop. */
static inline void put_time_ns(struct time_namespace *ns)
{
}

/*
 * !CONFIG_TIME_NS stub: asking for a new time namespace (CLONE_NEWTIME in
 * @flags) yields ERR_PTR(-EINVAL); otherwise @old_ns is returned as is.
 */
static inline
struct time_namespace *copy_time_ns(u64 flags,
                                    struct user_namespace *user_ns,
                                    struct time_namespace *old_ns)
{
        return (flags & CLONE_NEWTIME) ? ERR_PTR(-EINVAL) : old_ns;
}

/* !CONFIG_TIME_NS stub: no-op. */
static inline void timens_on_fork(struct nsproxy *nsproxy,
                                 struct task_struct *tsk)
{
}

/* !CONFIG_TIME_NS stub: there is never a namespace vvar page, returns NULL. */
static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
        return NULL;
}

/* !CONFIG_TIME_NS stubs: no offsets exist, *@ts is left untouched. */
static inline void timens_add_monotonic(struct timespec64 *ts) { }
static inline void timens_add_boottime(struct timespec64 *ts) { }

/* !CONFIG_TIME_NS stub: returns @nsec unchanged. */
static inline u64 timens_add_boottime_ns(u64 nsec)
{
        return nsec;
}

/* !CONFIG_TIME_NS stub: *@ts is left untouched. */
static inline void timens_sub_boottime(struct timespec64 *ts) { }

/* !CONFIG_TIME_NS stub: returns @tim unchanged for every @clockid. */
static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
{
        return tim;
}
#endif

#endif /* _LINUX_TIMENS_H */



































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_UDP_TUNNEL_H
#define __NET_UDP_TUNNEL_H

#include <net/ip_tunnels.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ipv6_stubs.h>
#endif

/*
 * Parameters for creating a UDP socket through udp_sock_create(), which
 * dispatches on @family to udp_sock_create4() or udp_sock_create6().
 */
struct udp_port_cfg {
        /* AF_INET or AF_INET6; anything else makes udp_sock_create() fail */
        u8                        family;

        /* Used only for kernel-created sockets */
        union {
                struct in_addr                local_ip;
#if IS_ENABLED(CONFIG_IPV6)
                struct in6_addr                local_ip6;
#endif
        };

        /* Peer address; the IPv6 member exists only with CONFIG_IPV6 */
        union {
                struct in_addr                peer_ip;
#if IS_ENABLED(CONFIG_IPV6)
                struct in6_addr                peer_ip6;
#endif
        };

        /* Ports are __be16, i.e. stored in network byte order */
        __be16                        local_udp_port;
        __be16                        peer_udp_port;
        int                        bind_ifindex;
        /* One-bit option flags */
        unsigned int                use_udp_checksums:1,
                                use_udp6_tx_checksums:1,
                                use_udp6_rx_checksums:1,
                                ipv6_v6only:1;
};

/* Declaration only; called by udp_sock_create() when cfg->family is AF_INET. */
int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
                     struct socket **sockp);

#if IS_ENABLED(CONFIG_IPV6)
/* Declaration only; called by udp_sock_create() when cfg->family is AF_INET6. */
int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
                     struct socket **sockp);
#else
/*
 * Stub used when CONFIG_IPV6 is not enabled.
 *
 * NOTE(review): this returns 0 without writing *@sockp, so a caller that
 * requests AF_INET6 in such a build sees "success" with no socket. Confirm
 * that all callers guard against this before relying on the return value.
 */
static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
                                   struct socket **sockp)
{
        return 0;
}
#endif

/*
 * Create a UDP socket for @cfg, picking the address-family specific helper
 * from cfg->family.
 *
 * Returns whatever udp_sock_create4()/udp_sock_create6() return, or
 * -EPFNOSUPPORT when the family is neither AF_INET nor AF_INET6.
 */
static inline int udp_sock_create(struct net *net,
                                  struct udp_port_cfg *cfg,
                                  struct socket **sockp)
{
        switch (cfg->family) {
        case AF_INET:
                return udp_sock_create4(net, cfg, sockp);
        case AF_INET6:
                return udp_sock_create6(net, cfg, sockp);
        default:
                return -EPFNOSUPPORT;
        }
}

/*
 * Callback types stored in struct udp_tunnel_sock_cfg. Each one receives
 * the socket the tunnel is attached to as its first argument.
 */
typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk,
                                             struct sk_buff *skb);
typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk,
                                           struct sk_buff *skb, int err,
                                           __be16 port, u32 info, u8 *payload);
typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
                                                    struct list_head *head,
                                                    struct sk_buff *skb);
typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
                                         int nhoff);

/* Configuration handed to setup_udp_tunnel_sock(). */
struct udp_tunnel_sock_cfg {
        void *sk_user_data;     /* user data used by the encap_rcv callback */
        /* Used for setting up udp_sock fields, see udp.h for details */
        __u8  encap_type;
        udp_tunnel_encap_rcv_t encap_rcv;
        udp_tunnel_encap_err_lookup_t encap_err_lookup;
        udp_tunnel_encap_err_rcv_t encap_err_rcv;
        udp_tunnel_encap_destroy_t encap_destroy;
        udp_tunnel_gro_receive_t gro_receive;
        udp_tunnel_gro_complete_t gro_complete;
};

/*
 * Set up the given (UDP) socket to receive UDP-encapsulated packets,
 * using the callbacks and user data in @sock_cfg.
 */
void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
                           struct udp_tunnel_sock_cfg *sock_cfg);

/* -- List of parsable UDP tunnel types --
 *
 * Adding to this list will result in serious debate.  The main issue is
 * that this list is essentially a list of workarounds for either poorly
 * designed tunnels, or poorly designed device offloads.
 *
 * The parsing supported via these types should really be used for Rx
 * traffic only as the network stack will have already inserted offsets for
 * the location of the headers in the skb.  In addition any ports that are
 * pushed should be kept within the namespace without leaking to other
 * devices such as VFs or other ports on the same device.
 *
 * It is strongly encouraged to use CHECKSUM_COMPLETE for Rx to avoid the
 * need to use this for Rx checksum offload.  It should not be necessary to
 * call this function to perform Tx offloads on outgoing traffic.
 */
/* Each type is a distinct bit so that several types can be OR-ed into a mask. */
enum udp_parsable_tunnel_type {
        UDP_TUNNEL_TYPE_VXLAN          = BIT(0), /* RFC 7348 */
        UDP_TUNNEL_TYPE_GENEVE          = BIT(1), /* draft-ietf-nvo3-geneve */
        UDP_TUNNEL_TYPE_VXLAN_GPE = BIT(2), /* draft-ietf-nvo3-vxlan-gpe */
};

/*
 * Description of one tunnel port as exchanged with NIC drivers through the
 * set_port/unset_port callbacks and udp_tunnel_nic_get_port(). An all-zero
 * instance denotes an empty entry (see udp_tunnel_nic_get_port()).
 */
struct udp_tunnel_info {
        /* presumably one of enum udp_parsable_tunnel_type - confirm */
        unsigned short type;
        sa_family_t sa_family;
        /* __be16: network byte order */
        __be16 port;
        /* driver-private byte; cf. udp_tunnel_nic_set_port_priv() */
        u8 hw_priv;
};

/*
 * Notify network devices of offloadable types.
 *
 * The push/drop variants take the target @dev explicitly; the notify_*
 * variants take only the socket and tunnel type.
 */
void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
                             unsigned short type);
void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
                             unsigned short type);
void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type);
void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type);

/* Transmit the skb using UDP encapsulation (IPv4 route in @rt). */
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
                         __be32 src, __be32 dst, __u8 tos, __u8 ttl,
                         __be16 df, __be16 src_port, __be16 dst_port,
                         bool xnet, bool nocheck, u16 ipcb_flags);

/* IPv6 counterpart of udp_tunnel_xmit_skb(); takes a dst_entry and in6 addresses. */
void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
                          struct sk_buff *skb,
                          struct net_device *dev,
                          const struct in6_addr *saddr,
                          const struct in6_addr *daddr,
                          __u8 prio, __u8 ttl, __be32 label,
                          __be16 src_port, __be16 dst_port, bool nocheck,
                          u16 ip6cb_flags);

/* Declaration only; releases a tunnel socket. */
void udp_tunnel_sock_release(struct socket *sock);

/*
 * Route lookups for tunnel transmit. Both take the tunnel key, the UDP
 * source/destination ports and a dst_cache; the IPv4 variant returns an
 * rtable, the IPv6 variant a dst_entry.
 */
struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
                                     struct net_device *dev,
                                     struct net *net, int oif,
                                     __be32 *saddr,
                                     const struct ip_tunnel_key *key,
                                     __be16 sport, __be16 dport, u8 tos,
                                     struct dst_cache *dst_cache);
struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
                                         struct net_device *dev,
                                         struct net *net,
                                         struct socket *sock, int oif,
                                         struct in6_addr *saddr,
                                         const struct ip_tunnel_key *key,
                                         __be16 sport, __be16 dport, u8 dsfield,
                                         struct dst_cache *dst_cache);

/* Declaration only; returns a metadata_dst for a received tunnel skb. */
struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
                                    const unsigned long *flags,
                                    __be64 tunnel_id, int md_size);

#ifdef CONFIG_INET
/*
 * Call iptunnel_handle_offloads() on @skb with the UDP tunnel GSO type:
 * SKB_GSO_UDP_TUNNEL_CSUM when @udp_csum is set, SKB_GSO_UDP_TUNNEL otherwise.
 * Returns the result of iptunnel_handle_offloads().
 */
static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
{
        if (udp_csum)
                return iptunnel_handle_offloads(skb, SKB_GSO_UDP_TUNNEL_CSUM);

        return iptunnel_handle_offloads(skb, SKB_GSO_UDP_TUNNEL);
}
#endif

#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
/*
 * Declarations only (CONFIG_NET_UDP_TUNNEL builds). @add selects between
 * adding and removing @sk; udp_tunnel_cleanup_gro() calls both with false.
 */
void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add);
void udp_tunnel_update_gro_rcv(struct sock *sk, bool add);
#else
/* Stubs for builds without CONFIG_NET_UDP_TUNNEL: both are no-ops. */
static inline void udp_tunnel_update_gro_lookup(struct net *net,
                                                struct sock *sk, bool add) {}
static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {}
#endif

/*
 * Undo the GRO registration of @sk: first the receive hook, then the
 * lookup entry in the socket's network namespace (both called with
 * add == false).
 */
static inline void udp_tunnel_cleanup_gro(struct sock *sk)
{
        udp_tunnel_update_gro_rcv(sk, false);
        udp_tunnel_update_gro_lookup(sock_net(sk), sk, false);
}

/*
 * Mark @sk as encapsulation-enabled and enable UDP encapsulation handling.
 *
 * The ENCAP_ENABLED bit is tested and set in one step; if it was already
 * set nothing else is done. Otherwise the IPv6 enable hook is called for
 * PF_INET6 sockets (CONFIG_IPV6 builds only), followed by
 * udp_encap_enable().
 */
static inline void udp_tunnel_encap_enable(struct sock *sk)
{
        if (!udp_test_and_set_bit(ENCAP_ENABLED, sk)) {
#if IS_ENABLED(CONFIG_IPV6)
                if (READ_ONCE(sk->sk_family) == PF_INET6)
                        ipv6_stub->udpv6_encap_enable();
#endif
                udp_encap_enable();
        }
}

#define UDP_TUNNEL_NIC_MAX_TABLES        4

/* Bit flags for the @flags member of struct udp_tunnel_nic_info. */
enum udp_tunnel_nic_info_flags {
        /* Device only supports offloads when it's open, all ports
         * will be removed before close and re-added after open.
         */
        UDP_TUNNEL_NIC_INFO_OPEN_ONLY        = BIT(0),
        /* Device supports only IPv4 tunnels */
        UDP_TUNNEL_NIC_INFO_IPV4_ONLY        = BIT(1),
        /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN.
         * This port must not be counted towards n_entries of any table.
         * Driver will not receive any callback associated with port 4789.
         */
        UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN        = BIT(2),
};

struct udp_tunnel_nic;

#define UDP_TUNNEL_NIC_MAX_SHARING_DEVICES        (U16_MAX / 2)

/*
 * State shared by several netdevs that use one UDP tunnel port table;
 * struct udp_tunnel_nic_info::shared points at an instance of this.
 */
struct udp_tunnel_nic_shared {
        struct udp_tunnel_nic *udp_tunnel_nic_info;

        /* presumably a list of struct udp_tunnel_nic_shared_node - confirm */
        struct list_head devices;
};

/* List node pairing a net_device with its list linkage. */
struct udp_tunnel_nic_shared_node {
        struct net_device *dev;
        /* presumably linked into udp_tunnel_nic_shared::devices - confirm */
        struct list_head list;
};

/**
 * struct udp_tunnel_nic_info - driver UDP tunnel offload information
 * @set_port:        callback for adding a new port
 * @unset_port:        callback for removing a port
 * @sync_table:        callback for syncing the entire port table at once
 * @shared:        reference to device global state (optional)
 * @flags:        device flags from enum udp_tunnel_nic_info_flags
 * @tables:        UDP port tables this device has
 * @tables.n_entries:                number of entries in this table
 * @tables.tunnel_types:        types of tunnels this table accepts
 *
 * Drivers are expected to provide either @set_port and @unset_port callbacks
 * or the @sync_table callback. Callbacks are invoked with rtnl lock held.
 *
 * Devices which (misguidedly) share the UDP tunnel port table across multiple
 * netdevs should allocate an instance of struct udp_tunnel_nic_shared and
 * point @shared at it.
 * There must never be more than %UDP_TUNNEL_NIC_MAX_SHARING_DEVICES devices
 * sharing a table.
 *
 * Known limitations:
 *  - UDP tunnel port notifications are fundamentally best-effort -
 *    it is likely the driver will both see skbs which use a UDP tunnel port,
 *    while not being a tunneled skb, and tunnel skbs from other ports -
 *    drivers should only use these ports for non-critical RX-side offloads,
 *    e.g. the checksum offload;
 *  - none of the devices care about the socket family at present, so we don't
 *    track it. Please extend this code if you care.
 */
struct udp_tunnel_nic_info {
        /* one-by-one: the core passes the table/entry slot and the port info */
        int (*set_port)(struct net_device *dev,
                        unsigned int table, unsigned int entry,
                        struct udp_tunnel_info *ti);
        int (*unset_port)(struct net_device *dev,
                          unsigned int table, unsigned int entry,
                          struct udp_tunnel_info *ti);

        /* all at once: only the table index is passed */
        int (*sync_table)(struct net_device *dev, unsigned int table);

        struct udp_tunnel_nic_shared *shared;

        unsigned int flags;

        /* fixed-size array: at most UDP_TUNNEL_NIC_MAX_TABLES tables */
        struct udp_tunnel_nic_table_info {
                unsigned int n_entries;
                unsigned int tunnel_types;
        } tables[UDP_TUNNEL_NIC_MAX_TABLES];
};

/* UDP tunnel module dependencies
 *
 * Tunnel drivers are expected to have a hard dependency on the udp_tunnel
 * module. NIC drivers are not, they just attach their
 * struct udp_tunnel_nic_info to the netdev and wait for callbacks to come.
 * Loading a tunnel driver will cause the udp_tunnel module to be loaded
 * and only then will all the required state structures be allocated.
 * Since we want a weak dependency from the drivers and the core to udp_tunnel
 * we call things through the following stubs.
 */
/*
 * Operations reached through the global udp_tunnel_nic_ops pointer. The
 * udp_tunnel_nic_*() inline wrappers in this header call these only when
 * that pointer is non-NULL.
 */
struct udp_tunnel_nic_ops {
        /* port table access and updates */
        void (*get_port)(struct net_device *dev, unsigned int table,
                         unsigned int idx, struct udp_tunnel_info *ti);
        void (*set_port_priv)(struct net_device *dev, unsigned int table,
                              unsigned int idx, u8 priv);
        void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti);
        void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti);
        void (*reset_ntf)(struct net_device *dev);

        /* dumping; the wrappers bracket these with lock()/unlock() */
        size_t (*dump_size)(struct net_device *dev, unsigned int table);
        int (*dump_write)(struct net_device *dev, unsigned int table,
                          struct sk_buff *skb);
        /* locking */
        void (*assert_locked)(struct net_device *dev);
        void (*lock)(struct net_device *dev);
        void (*unlock)(struct net_device *dev);
};

/*
 * Without CONFIG_INET the ops pointer is a constant NULL, so every
 * "if (udp_tunnel_nic_ops)" check in the wrappers below is false.
 */
#ifdef CONFIG_INET
extern const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops;
#else
#define udp_tunnel_nic_ops        ((struct udp_tunnel_nic_ops *)NULL)
#endif

/*
 * Fetch the port stored at (@table, @idx) of @dev into @ti.
 * @ti is zeroed first and stays zeroed when no ops are registered.
 */
static inline void
udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table,
                        unsigned int idx, struct udp_tunnel_info *ti)
{
        /* This helper is used from .sync_table, we indicate empty entries
         * by zero'ed @ti. Drivers which need to know the details of a port
         * when it gets deleted should use the .set_port / .unset_port
         * callbacks.
         * Zero out here, otherwise !CONFIG_INET causes uninitialized warnings.
         */
        memset(ti, 0, sizeof(*ti));

        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->get_port(dev, table, idx, ti);
}

/*
 * Store the driver-private byte @priv for entry (@table, @idx) of @dev.
 * The ops' assert_locked() hook runs before the update. No-op when no ops
 * are registered.
 */
static inline void
udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
                             unsigned int idx, u8 priv)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->assert_locked(dev);
        udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv);
}

/* Run the ops' assert_locked() hook for @dev; no-op without registered ops. */
static inline void udp_tunnel_nic_assert_locked(struct net_device *dev)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->assert_locked(dev);
}

/* Run the ops' lock() hook for @dev; no-op without registered ops. */
static inline void udp_tunnel_nic_lock(struct net_device *dev)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->lock(dev);
}

/* Run the ops' unlock() hook for @dev; no-op without registered ops. */
static inline void udp_tunnel_nic_unlock(struct net_device *dev)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->unlock(dev);
}

/*
 * Offer port @ti to @dev. Nothing happens unless the device has
 * NETIF_F_RX_UDP_TUNNEL_PORT set in its features and ops are registered.
 */
static inline void
udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti)
{
        if ((dev->features & NETIF_F_RX_UDP_TUNNEL_PORT) && udp_tunnel_nic_ops)
                udp_tunnel_nic_ops->add_port(dev, ti);
}

/*
 * Withdraw port @ti from @dev. Nothing happens unless the device has
 * NETIF_F_RX_UDP_TUNNEL_PORT set in its features and ops are registered.
 */
static inline void
udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti)
{
        if ((dev->features & NETIF_F_RX_UDP_TUNNEL_PORT) && udp_tunnel_nic_ops)
                udp_tunnel_nic_ops->del_port(dev, ti);
}

/**
 * udp_tunnel_nic_reset_ntf() - device-originating reset notification
 * @dev: network interface device structure
 *
 * Called by the driver to inform the core that the entire UDP tunnel port
 * state has been lost, usually due to device reset. Core will assume device
 * forgot all the ports and issue .set_port and .sync_table callbacks as
 * necessary.
 *
 * This function must be called with rtnl lock held, and will issue all
 * the callbacks before returning.
 */
static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev)
{
        /* Without registered ops there is no state to rebuild. */
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->reset_ntf(dev);
}

/*
 * Return the ops' dump_size() for @table of @dev, taken under the ops'
 * lock()/unlock() pair. Returns 0 when no ops are registered.
 */
static inline size_t
udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
{
        size_t size = 0;

        if (udp_tunnel_nic_ops) {
                udp_tunnel_nic_ops->lock(dev);
                size = udp_tunnel_nic_ops->dump_size(dev, table);
                udp_tunnel_nic_ops->unlock(dev);
        }

        return size;
}

/*
 * Write @table of @dev into @skb through the ops' dump_write(), under the
 * ops' lock()/unlock() pair. Returns dump_write()'s result, or 0 when no
 * ops are registered.
 */
static inline int
udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
                          struct sk_buff *skb)
{
        int rc = 0;

        if (udp_tunnel_nic_ops) {
                udp_tunnel_nic_ops->lock(dev);
                rc = udp_tunnel_nic_ops->dump_write(dev, table, skb);
                udp_tunnel_nic_ops->unlock(dev);
        }

        return rc;
}

/*
 * Raise NETDEV_UDP_TUNNEL_PUSH_INFO for @dev.
 *
 * ASSERT_RTNL() is checked unconditionally. The notifier is only raised for
 * devices with NETIF_F_RX_UDP_TUNNEL_PORT, after
 * udp_tunnel_nic_assert_locked().
 */
static inline void udp_tunnel_get_rx_info(struct net_device *dev)
{
        ASSERT_RTNL();
        if (dev->features & NETIF_F_RX_UDP_TUNNEL_PORT) {
                udp_tunnel_nic_assert_locked(dev);
                call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
        }
}

/*
 * Raise NETDEV_UDP_TUNNEL_DROP_INFO for @dev.
 *
 * ASSERT_RTNL() is checked unconditionally. The notifier is only raised for
 * devices with NETIF_F_RX_UDP_TUNNEL_PORT, after
 * udp_tunnel_nic_assert_locked().
 */
static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
{
        ASSERT_RTNL();
        if (dev->features & NETIF_F_RX_UDP_TUNNEL_PORT) {
                udp_tunnel_nic_assert_locked(dev);
                call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
        }
}

#endif









































































































































































































































































































































































































































  126 
  126 


  127 























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/msr.h>

#include "cpu.h"

/* Per-CPU APERF/MPERF sample state (one instance per CPU in cpu_samples). */
struct aperfmperf {
        /* sequence counter; initialised with SEQCNT_ZERO in cpu_samples */
        seqcount_t        seq;
        /* presumably the time of the last sample - confirm against the users */
        unsigned long        last_update;
        /* presumably APERF/MPERF deltas from the last sample - confirm */
        u64                acnt;
        u64                mcnt;
        /* raw MSR_IA32_APERF / MSR_IA32_MPERF values, see init_counter_refs() */
        u64                aperf;
        u64                mperf;
};

/* One sample record per CPU; only the seqcount needs a non-zero initialiser. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
        .seq = SEQCNT_ZERO(cpu_samples.seq)
};

/*
 * Read MSR_IA32_APERF and MSR_IA32_MPERF on the executing CPU and store
 * the raw values in this CPU's cpu_samples as the reference counts.
 */
static void init_counter_refs(void)
{
        u64 aperf_now, mperf_now;

        /* Read both counters back to back before storing either of them. */
        rdmsrq(MSR_IA32_APERF, aperf_now);
        rdmsrq(MSR_IA32_MPERF, mperf_now);

        this_cpu_write(cpu_samples.aperf, aperf_now);
        this_cpu_write(cpu_samples.mperf, mperf_now);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 */

/* Static key, defined in the "false" state. */
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

/*
 * Both ratios start out at SCHED_CAPACITY_SCALE. arch_set_max_freq_ratio()
 * sets arch_max_freq_ratio to either SCHED_CAPACITY_SCALE or
 * arch_turbo_freq_ratio.
 */
static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

/*
 * Select the maximum frequency ratio: SCHED_CAPACITY_SCALE when turbo is
 * disabled, the recorded turbo ratio otherwise.
 */
void arch_set_max_freq_ratio(bool turbo_disabled)
{
        if (turbo_disabled)
                arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
        else
                arch_max_freq_ratio = arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

/*
 * Report whether the TURBO_DISABLE bit is set in MSR_IA32_MISC_ENABLE.
 * An unreadable MSR is reported as "not disabled" (false).
 */
static bool __init turbo_disabled(void)
{
        u64 misc_en;

        if (rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en))
                return false;

        return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

/*
 * Read the base and turbo ratios from the Atom core-ratio MSRs.
 *
 * @base_freq:  receives bits 21:16 of MSR_ATOM_CORE_RATIOS (max P state)
 * @turbo_freq: receives bits 5:0 of MSR_ATOM_CORE_TURBO_RATIOS (1C turbo)
 *
 * Returns false as soon as one of the MSR reads fails, true otherwise.
 * The MSRs are read directly into the output pointers, so they may already
 * have been written when false is returned.
 */
static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        if (rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq) ||
            rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq))
                return false;

        *base_freq = (*base_freq >> 16) & 0x3F;     /* max P state */
        *turbo_freq &= 0x3F;                        /* 1C turbo    */

        return true;
}

/*
 * Table-entry shorthand: match CPU model @vfm with X86_FEATURE_APERFMPERF.
 * The trailing NULL is presumably the entry's driver-data slot - confirm
 * against X86_MATCH_VFM_FEATURE().
 */
#define X86_MATCH(vfm)                                                \
        X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

/*
 * Xeon Phi models; going by the name, these use the turbo-ratio layout
 * parsed by knl_set_max_freq_ratio() (the lookup site is not in view here).
 * The empty entry terminates the table.
 */
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
        X86_MATCH(INTEL_XEON_PHI_KNL),
        X86_MATCH(INTEL_XEON_PHI_KNM),
        {}
};

/*
 * Skylake-X; going by the name, this uses the turbo-ratio layout parsed by
 * skx_set_max_freq_ratio() (the lookup site is not in view here).
 * The empty entry terminates the table.
 */
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
        X86_MATCH(INTEL_SKYLAKE_X),
        {}
};

/*
 * Goldmont-family Atom models. Which parser they are paired with is decided
 * at the lookup site, which is not in view here.
 * The empty entry terminates the table.
 */
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
        X86_MATCH(INTEL_ATOM_GOLDMONT),
        X86_MATCH(INTEL_ATOM_GOLDMONT_D),
        X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
        {}
};

/*
 * Derive the base and turbo ratios from MSR_PLATFORM_INFO and
 * MSR_TURBO_RATIO_LIMIT.
 *
 * @base_freq:        out: bits 15:8 of MSR_PLATFORM_INFO.
 * @turbo_freq:       out: bits 15:8 of MSR_TURBO_RATIO_LIMIT reduced by the
 *                    first @num_delta_fratio non-zero 3-bit delta fields
 *                    (bits 23:21, 31:29, ..., 63:61), or 0 if fewer than
 *                    @num_delta_fratio non-zero deltas exist.
 * @num_delta_fratio: number of non-zero delta fields to apply.
 *
 * Returns false only when an MSR read fails.  On a true return both outputs
 * have been written, so the caller never reads an uninitialized
 * turbo frequency.
 */
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
                                          int num_delta_fratio)
{
        int fratio, delta_fratio, found;
        int err, i;
        u64 msr;

        err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;            /* max P state */

        err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        fratio = (msr >> 8) & 0xFF;
        i = 16;
        found = 0;
        do {
                if (found >= num_delta_fratio) {
                        *turbo_freq = fratio;
                        return true;
                }

                delta_fratio = (msr >> (i + 5)) & 0x7;

                if (delta_fratio) {
                        found += 1;
                        fratio -= delta_fratio;
                }

                i += 8;
        } while (i < 64);

        /*
         * The loop checks "found" before it consumes a field, so a delta
         * taken from the very last field has not been reported yet.  Report
         * it here; if there were not enough non-zero deltas, hand back 0 so
         * that the result is well defined (intel_set_max_freq_ratio()
         * rejects a zero turbo frequency) instead of leaving *turbo_freq
         * unwritten.
         */
        *turbo_freq = (found >= num_delta_fratio) ? fratio : 0;

        return true;
}

/*
 * Derive the base ratio from MSR_PLATFORM_INFO and pick the turbo ratio of
 * the first group, scanning from the low byte upwards, whose count in
 * MSR_TURBO_RATIO_LIMIT1 is at least @size.
 *
 * Returns false if an MSR read fails or no group is large enough.
 */
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
        u64 ratios, counts;
        u32 group_size;
        int shift;

        if (rdmsrq_safe(MSR_PLATFORM_INFO, base_freq))
                return false;

        /* max P state: bits 15:8 */
        *base_freq = (*base_freq >> 8) & 0xFF;

        if (rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios) ||
            rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts))
                return false;

        /* Byte N of "counts" gives the group size for byte N of "ratios". */
        shift = 0;
        while (shift < 64) {
                group_size = (counts >> shift) & 0xFF;
                if (group_size >= size) {
                        *turbo_freq = (ratios >> shift) & 0xFF;
                        return true;
                }
                shift += 8;
        }

        return false;
}

/*
 * Generic probe: base ratio from MSR_PLATFORM_INFO, turbo ratio from the
 * 4C field of MSR_TURBO_RATIO_LIMIT, falling back to the 1C field when the
 * 4C field reads as zero.
 *
 * Returns false if either MSR read fails.
 */
static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        u64 limits;
        u64 turbo;

        if (rdmsrq_safe(MSR_PLATFORM_INFO, base_freq) ||
            rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &limits))
                return false;

        /* max P state: bits 15:8 */
        *base_freq = (*base_freq >> 8) & 0xFF;

        /* 4C turbo: bits 31:24 */
        turbo = (limits >> 24) & 0xFF;

        /* The CPU may have less than 4 cores: use 1C turbo, bits 7:0 */
        if (!turbo)
                turbo = limits & 0xFF;

        *turbo_freq = turbo;

        return true;
}

/*
 * Determine the turbo/base frequency ratio and record it.
 *
 * The probes are tried in a fixed order and the first one that succeeds
 * supplies base_freq and turbo_freq.  On success arch_turbo_freq_ratio is
 * updated and arch_set_max_freq_ratio() is called with the current
 * turbo-disabled state.
 *
 * Returns true when a usable, non-zero ratio was established.
 */
static bool __init intel_set_max_freq_ratio(void)
{
        u64 base_freq, turbo_freq;
        u64 turbo_ratio;
        bool probed;

        /* Short-circuit evaluation keeps the original probe order. */
        probed = slv_set_max_freq_ratio(&base_freq, &turbo_freq) ||
                 (x86_match_cpu(has_glm_turbo_ratio_limits) &&
                  skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) ||
                 (x86_match_cpu(has_knl_turbo_ratio_limits) &&
                  knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) ||
                 (x86_match_cpu(has_skx_turbo_ratio_limits) &&
                  skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) ||
                 core_set_max_freq_ratio(&base_freq, &turbo_freq);
        if (!probed)
                return false;

        /*
         * Some hypervisors advertise X86_FEATURE_APERFMPERF but then fill
         * all MSRs with zeroes.  Some CPUs have turbo boost but don't
         * declare any turbo ratio in MSR_TURBO_RATIO_LIMIT.
         */
        if (!base_freq || !turbo_freq) {
                pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
                return false;
        }

        turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
        if (!turbo_ratio) {
                pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
                return false;
        }

        arch_turbo_freq_ratio = turbo_ratio;
        arch_set_max_freq_ratio(turbo_disabled());

        return true;
}

#ifdef CONFIG_PM_SLEEP
/* Syscore hook: init_counter_refs() is invoked on resume. */
static struct syscore_ops freq_invariance_syscore_ops = {
        .resume = init_counter_refs,
};

/* Register the resume hook above with the syscore framework. */
static void register_freq_invariance_syscore_ops(void)
{
        register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
/* Without CONFIG_PM_SLEEP there is nothing to register. */
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

/*
 * Turn on arch_scale_freq_key, register the syscore resume hook and log the
 * max frequency ratio in use.  Calling it while the key is already enabled
 * triggers a one-time warning and does nothing else.
 */
static void freq_invariance_enable(void)
{
        if (static_branch_unlikely(&arch_scale_freq_key)) {
                WARN_ON_ONCE(1);
                return;
        }
        /*
         * NOTE(review): the _cpuslocked variant presumably expects the caller
         * to hold the CPU hotplug read lock; bp_init_freq_invariance() takes
         * it via guard(cpus_read_lock) - confirm for other callers.
         */
        static_branch_enable_cpuslocked(&arch_scale_freq_key);
        register_freq_invariance_syscore_ops();
        pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

/*
 * Install an externally supplied turbo ratio and enable frequency
 * invariance.
 * @ratio:          value stored in arch_turbo_freq_ratio.
 * @turbo_disabled: passed on to arch_set_max_freq_ratio() to select the
 *                  effective max ratio.
 */
void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
        arch_turbo_freq_ratio = ratio;
        arch_set_max_freq_ratio(turbo_disabled);
        freq_invariance_enable();
}

/*
 * Boot-time setup of frequency invariance.  Only Intel CPUs are handled
 * here; the feature is enabled when a max frequency ratio could be
 * determined.
 */
static void __init bp_init_freq_invariance(void)
{
        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
                return;

        if (!intel_set_max_freq_ratio())
                return;

        /* Held until the function returns, i.e. across the enable call. */
        guard(cpus_read_lock)();
        freq_invariance_enable();
}

/*
 * Work function that switches frequency invariance off again: disables
 * arch_scale_freq_key and resets every possible CPU's arch_freq_scale.
 * @work: unused.
 */
static void disable_freq_invariance_workfn(struct work_struct *work)
{
        int cpu;

        static_branch_disable(&arch_scale_freq_key);

        /*
         * Set arch_freq_scale to a default value on all cpus
         * This negates the effect of scaling
         */
        for_each_possible_cpu(cpu)
                per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

/* Work item scheduled from scale_freq_tick()'s error path. */
static DECLARE_WORK(disable_freq_invariance_work,
                    disable_freq_invariance_workfn);

/*
 * Per-CPU frequency scale factor.  Starts at SCHED_CAPACITY_SCALE and is
 * updated by scale_freq_tick().
 */
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

/* Static key, initially false; set by arch_enable_hybrid_capacity_scale(). */
static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

/* Per-CPU scaling parameters used while the hybrid key is enabled. */
struct arch_hybrid_cpu_scale {
        /* Value returned by arch_scale_cpu_capacity() for this CPU. */
        unsigned long capacity;
        /* Per-CPU replacement for arch_max_freq_ratio in scale_freq_tick(). */
        unsigned long freq_ratio;
};

/* Allocated by arch_enable_hybrid_capacity_scale(). */
static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
        int cpu;

        if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
                WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
                return true;
        }

        arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
        if (!arch_cpu_scale)
                return false;

        for_each_possible_cpu(cpu) {
                per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
                per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
        }

        static_branch_enable(&arch_hybrid_cap_scale_key);

        pr_info("Hybrid CPU capacity scaling enabled\n");

        return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter.  Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
                           unsigned long cap_freq, unsigned long base_freq)
{
        if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
                WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
                           div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
                WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
                           div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
        } else {
                WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
        }
}

/*
 * Return the capacity recorded for @cpu when hybrid capacity scaling is
 * enabled, SCHED_CAPACITY_SCALE otherwise.
 */
unsigned long arch_scale_cpu_capacity(int cpu)
{
        if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
                return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

        return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

/*
 * Update this CPU's arch_freq_scale from one pair of counter deltas.
 * @acnt: APERF delta.
 * @mcnt: MPERF delta.
 *
 * Computes (acnt << 2*SCHED_CAPACITY_SHIFT) / (mcnt * freq_ratio), clipped
 * to SCHED_CAPACITY_SCALE.  Any overflow, a zero divisor or a zero result
 * is treated as an error and schedules the work that disables frequency
 * invariance.
 */
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
        u64 freq_scale, freq_ratio;

        if (!arch_scale_freq_invariant())
                return;

        /* Scale the numerator in place; bail out if bits would be lost. */
        if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
                goto error;

        /* Hybrid systems use a per-CPU ratio, everything else the global one. */
        if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
                freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
        else
                freq_ratio = arch_max_freq_ratio;

        /* The !mcnt test also guards the division below against zero. */
        if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
                goto error;

        freq_scale = div64_u64(acnt, mcnt);
        if (!freq_scale)
                goto error;

        if (freq_scale > SCHED_CAPACITY_SCALE)
                freq_scale = SCHED_CAPACITY_SCALE;

        this_cpu_write(arch_freq_scale, freq_scale);
        return;

error:
        pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
        schedule_work(&disable_freq_invariance_work);
}
#else
/* No-op stand-ins used when the frequency invariance code above is compiled out. */
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

/*
 * Sample APERF/MPERF on the current CPU, publish the deltas since the
 * previous sample in this CPU's cpu_samples entry and feed them to
 * scale_freq_tick().  Does nothing without X86_FEATURE_APERFMPERF.
 */
void arch_scale_freq_tick(void)
{
        struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
        u64 acnt, mcnt, aperf, mperf;

        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                return;

        rdmsrq(MSR_IA32_APERF, aperf);
        rdmsrq(MSR_IA32_MPERF, mperf);
        acnt = aperf - s->aperf;
        mcnt = mperf - s->mperf;

        /* Remember the raw counters as reference for the next delta. */
        s->aperf = aperf;
        s->mperf = mperf;

        /*
         * Publish timestamp and deltas inside a seqcount write section;
         * arch_freq_get_on_cpu() reads them with the matching retry loop.
         */
        raw_write_seqcount_begin(&s->seq);
        s->last_update = jiffies;
        s->acnt = acnt;
        s->mcnt = mcnt;
        raw_write_seqcount_end(&s->seq);

        scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE        ((unsigned long)HZ / 50)

/*
 * Estimate the current frequency of @cpu from its most recent
 * APERF/MPERF sample.
 *
 * Returns cpu_khz * acnt / mcnt when a sample no older than MAX_SAMPLE_AGE
 * with a non-zero mcnt is available.  Otherwise, or without
 * X86_FEATURE_APERFMPERF, returns cpufreq_quick_get(@cpu), or cpu_khz if
 * that yields 0.
 */
int arch_freq_get_on_cpu(int cpu)
{
        struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
        unsigned int seq, freq;
        unsigned long last;
        u64 acnt, mcnt;

        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                goto fallback;

        /* Retry until timestamp and both deltas come from the same update. */
        do {
                seq = raw_read_seqcount_begin(&s->seq);
                last = s->last_update;
                acnt = s->acnt;
                mcnt = s->mcnt;
        } while (read_seqcount_retry(&s->seq, seq));

        /*
         * Bail on invalid count and when the last update was too long ago,
         * which covers idle and NOHZ full CPUs.
         */
        if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
                goto fallback;

        return div64_u64((cpu_khz * acnt), mcnt);

fallback:
        freq = cpufreq_quick_get(cpu);
        return freq ? freq : cpu_khz;
}

/*
 * Early initcall: when X86_FEATURE_APERFMPERF is available, initialise the
 * counter references and set up frequency invariance.  Always returns 0.
 */
static int __init bp_init_aperfmperf(void)
{
        if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) {
                init_counter_refs();
                bp_init_freq_invariance();
        }

        return 0;
}
early_initcall(bp_init_aperfmperf);

/* Initialise the counter references when X86_FEATURE_APERFMPERF is available. */
void ap_init_aperfmperf(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                return;

        init_counter_refs();
}
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_INLINE_H
#define _LINUX_HUGETLB_INLINE_H

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/mm.h>

/* True when @vma has VM_HUGETLB set in its vm_flags. */
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_HUGETLB) != 0;
}

#else

/* Without CONFIG_HUGETLB_PAGE no VMA is ever reported as hugetlb. */
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
        return false;
}

#endif

#endif




















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2008-2012 Novell Inc.
 * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 * Copyright (c) 2012-2019 Linux Foundation
 *
 * Core driver model functions and structures that should not be
 * shared outside of the drivers/base/ directory.
 *
 */
#include <linux/notifier.h>

/**
 * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure.
 *
 * @subsys: the struct kset that defines this subsystem
 * @devices_kset: the subsystem's 'devices' directory
 * @interfaces: list of subsystem interfaces associated
 * @mutex: protect the devices, and interfaces lists.
 *
 * @drivers_kset: the list of drivers associated
 * @klist_devices: the klist to iterate over the @devices_kset
 * @klist_drivers: the klist to iterate over the @drivers_kset
 * @bus_notifier: the bus notifier list for anything that cares about things
 *                on this bus.
 * @drivers_autoprobe: single-bit flag; presumably selects whether drivers are
 *                     probed automatically on this bus (TODO: confirm
 *                     against the bus code).
 * @bus: pointer back to the struct bus_type that this structure is associated
 *       with.
 * @dev_root: Default device to use as the parent.
 *
 * @glue_dirs: "glue" directory to put in-between the parent device to
 *             avoid namespace conflicts
 * @class: pointer back to the struct class that this structure is associated
 *         with.
 * @lock_key: Lock class key for use by the lock validator
 *
 * This structure is the one that is the actual kobject allowing struct
 * bus_type/class to be statically allocated safely.  Nothing outside of the
 * driver core should ever touch these fields.
 */
struct subsys_private {
        struct kset subsys;
        struct kset *devices_kset;
        struct list_head interfaces;
        struct mutex mutex;

        struct kset *drivers_kset;
        struct klist klist_devices;
        struct klist klist_drivers;
        struct blocking_notifier_head bus_notifier;
        unsigned int drivers_autoprobe:1;
        const struct bus_type *bus;
        struct device *dev_root;

        struct kset glue_dirs;
        const struct class *class;

        struct lock_class_key lock_key;
};
/* Map a pointer to the embedded subsys.kobj back to its subsys_private. */
#define to_subsys_private(obj) container_of_const(obj, struct subsys_private, subsys.kobj)

/*
 * Take a reference on @sp through its embedded kset.  A NULL @sp is
 * accepted and passed through.  Returns @sp.
 */
static inline struct subsys_private *subsys_get(struct subsys_private *sp)
{
        if (sp)
                kset_get(&sp->subsys);
        return sp;
}

/* Drop a reference on @sp through its embedded kset; NULL is ignored. */
static inline void subsys_put(struct subsys_private *sp)
{
        if (!sp)
                return;

        kset_put(&sp->subsys);
}

struct subsys_private *bus_to_subsys(const struct bus_type *bus);
struct subsys_private *class_to_subsys(const struct class *class);

/*
 * Driver-core private data attached to a struct device_driver.
 * NOTE(review): member roles below are inferred from their names and types;
 * confirm against the driver core sources.
 */
struct driver_private {
        struct kobject kobj;
        struct klist klist_devices;
        struct klist_node knode_bus;
        struct module_kobject *mkobj;
        /* Pointer to the associated struct device_driver. */
        struct device_driver *driver;
};
/* Map a pointer to the embedded kobj back to its driver_private. */
#define to_driver(obj) container_of(obj, struct driver_private, kobj)

/**
 * struct device_private - structure to hold the private to the driver core portions of the device structure.
 *
 * @klist_children: klist containing all children of this device
 * @knode_parent: node in sibling list
 * @knode_driver: node in driver list
 * @knode_bus: node in bus list
 * @knode_class: node in class list
 * @deferred_probe: entry in deferred_probe_list which is used to retry the
 *        binding of drivers which were unable to get all the resources needed by
 *        the device; typically because it depends on another driver getting
 *        probed first.
 * @async_driver: pointer to device driver awaiting probe via async_probe
 * @deferred_probe_reason: string; presumably the reason recorded through
 *        device_set_deferred_probe_reason() (TODO: confirm).
 * @device: pointer back to the struct device that this structure is
 *        associated with.
 * @dead: This device is currently either in the process of or has been
 *        removed from the system. Any asynchronous events scheduled for this
 *        device should exit without taking any action.
 *
 * Nothing outside of the driver core should ever touch these fields.
 */
struct device_private {
        struct klist klist_children;
        struct klist_node knode_parent;
        struct klist_node knode_driver;
        struct klist_node knode_bus;
        struct klist_node knode_class;
        struct list_head deferred_probe;
        const struct device_driver *async_driver;
        char *deferred_probe_reason;
        struct device *device;
        u8 dead:1;
};
/* Map a pointer to one of the embedded klist nodes back to its device_private. */
#define to_device_private_parent(obj)        \
        container_of(obj, struct device_private, knode_parent)
#define to_device_private_driver(obj)        \
        container_of(obj, struct device_private, knode_driver)
#define to_device_private_bus(obj)        \
        container_of(obj, struct device_private, knode_bus)
#define to_device_private_class(obj)        \
        container_of(obj, struct device_private, knode_class)

/* initialisation functions */
int devices_init(void);
int buses_init(void);
int classes_init(void);
int firmware_init(void);
#ifdef CONFIG_SYS_HYPERVISOR
int hypervisor_init(void);
#else
/* Stub reporting success when CONFIG_SYS_HYPERVISOR is not set. */
static inline int hypervisor_init(void) { return 0; }
#endif
int platform_bus_init(void);
int faux_bus_init(void);
void cpu_dev_init(void);
void container_dev_init(void);
#ifdef CONFIG_AUXILIARY_BUS
void auxiliary_bus_init(void);
#else
/* No-op stub when CONFIG_AUXILIARY_BUS is not set. */
static inline void auxiliary_bus_init(void) { }
#endif

struct kobject *virtual_device_parent(void);

int bus_add_device(struct device *dev);
void bus_probe_device(struct device *dev);
void bus_remove_device(struct device *dev);
void bus_notify(struct device *dev, enum bus_notifier_event value);
bool bus_is_registered(const struct bus_type *bus);

int bus_add_driver(struct device_driver *drv);
void bus_remove_driver(struct device_driver *drv);
void device_release_driver_internal(struct device *dev, const struct device_driver *drv,
                                    struct device *parent);

void driver_detach(const struct device_driver *drv);
void driver_deferred_probe_del(struct device *dev);
void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf);
/*
 * Ask the driver's bus whether @drv matches @dev.  A bus without a match
 * callback matches everything (returns 1); otherwise the callback's return
 * value is passed through.
 */
static inline int driver_match_device(const struct device_driver *drv,
                                      struct device *dev)
{
        if (!drv->bus->match)
                return 1;

        return drv->bus->match(dev, drv);
}

/*
 * Invoke the sync_state callback for @dev.  The bus callback takes
 * precedence; the driver callback is used only when the bus has none and a
 * driver providing one is set on the device.
 */
static inline void dev_sync_state(struct device *dev)
{
        if (dev->bus->sync_state) {
                dev->bus->sync_state(dev);
                return;
        }

        if (dev->driver && dev->driver->sync_state)
                dev->driver->sync_state(dev);
}

int driver_add_groups(const struct device_driver *drv, const struct attribute_group **groups);
void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups);
void device_driver_detach(struct device *dev);

/* Store @drv in dev->driver with WRITE_ONCE(); see the comment below for why. */
static inline void device_set_driver(struct device *dev, const struct device_driver *drv)
{
        /*
         * The majority (all?) of read accesses to dev->driver happen either
         * while holding the device lock or in bus/driver code that is only
         * invoked when the device is bound to a driver and there is no
         * concern of the pointer being changed while it is being read.
         * However, when reading a device's uevent file we read the driver
         * pointer without taking the device lock (so we do not block there
         * for an arbitrary amount of time). We use WRITE_ONCE() here to
         * prevent tearing so that READ_ONCE() can safely be used in uevent
         * code.
         */
        // FIXME - this cast should not be needed "soon"
        WRITE_ONCE(dev->driver, (struct device_driver *)drv);
}

int devres_release_all(struct device *dev);
void device_block_probing(void);
void device_unblock_probing(void);
void deferred_probe_extend_timeout(void);
void driver_deferred_probe_trigger(void);
const char *device_get_devnode(const struct device *dev, umode_t *mode,
                               kuid_t *uid, kgid_t *gid, const char **tmp);

/* /sys/devices directory */
extern struct kset *devices_kset;
void devices_kset_move_last(struct device *dev);

#if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS)
int module_add_driver(struct module *mod, const struct device_driver *drv);
void module_remove_driver(const struct device_driver *drv);
#else
/*
 * Stubs used when CONFIG_MODULES and CONFIG_SYSFS are not both enabled.
 * Their parameter types mirror the real prototypes above (const driver
 * pointer), so a caller holding a const struct device_driver * builds the
 * same way in either configuration.
 */

/* Nothing to add; always reports success (0). */
static inline int module_add_driver(struct module *mod,
                                    const struct device_driver *drv)
{
        return 0;
}

/* Nothing to remove. */
static inline void module_remove_driver(const struct device_driver *drv) { }
#endif

#ifdef CONFIG_DEVTMPFS
int devtmpfs_init(void);
#else
/* Stub reporting success when CONFIG_DEVTMPFS is not set. */
static inline int devtmpfs_init(void) { return 0; }
#endif

#ifdef CONFIG_BLOCK
extern const struct class block_class;
/* True when @dev's class is block_class. */
static inline bool is_blockdev(struct device *dev)
{
        return dev->class == &block_class;
}
#else
/* Without CONFIG_BLOCK no device is a block device. */
static inline bool is_blockdev(struct device *dev) { return false; }
#endif

/* Device links support */
int device_links_read_lock(void);
void device_links_read_unlock(int idx);
int device_links_read_lock_held(void);
int device_links_check_suppliers(struct device *dev);
void device_links_force_bind(struct device *dev);
void device_links_driver_bound(struct device *dev);
void device_links_driver_cleanup(struct device *dev);
void device_links_no_driver(struct device *dev);
bool device_links_busy(struct device *dev);
void device_links_unbind_consumers(struct device *dev);
bool device_link_flag_is_sync_state_only(u32 flags);
void fw_devlink_drivers_done(void);
void fw_devlink_probing_done(void);

/*
 * Iterate over @__dev's links.suppliers list (entries linked via c_node)
 * with list_for_each_entry_srcu(), using device_links_read_lock_held() as
 * the lockdep condition.
 */
#define dev_for_each_link_to_supplier(__link, __dev)        \
        list_for_each_entry_srcu(__link, &(__dev)->links.suppliers, c_node, \
                                 device_links_read_lock_held())

/* Same as above for the links.consumers list (entries linked via s_node). */
#define dev_for_each_link_to_consumer(__link, __dev)        \
        list_for_each_entry_srcu(__link, &(__dev)->links.consumers, s_node, \
                                 device_links_read_lock_held())

/* device pm support */
void device_pm_move_to_tail(struct device *dev);

#ifdef CONFIG_DEVTMPFS
int devtmpfs_create_node(struct device *dev);
int devtmpfs_delete_node(struct device *dev);
#else
/* Stubs reporting success when CONFIG_DEVTMPFS is not set. */
static inline int devtmpfs_create_node(struct device *dev) { return 0; }
static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
#endif

void software_node_notify(struct device *dev);
void software_node_notify_remove(struct device *dev);








































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMSTAT_H
#define _LINUX_VMSTAT_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/mmzone.h>
#include <linux/vm_event_item.h>
#include <linux/atomic.h>
#include <linux/static_key.h>
#include <linux/mmdebug.h>

#ifdef CONFIG_NUMA
DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
#endif

/*
 * Set of unsigned counters describing a reclaim pass.
 * NOTE(review): the meaning of each counter is only hinted at by its name
 * here; confirm against the reclaim code that fills the structure in.
 */
struct reclaim_stat {
        unsigned nr_dirty;
        unsigned nr_unqueued_dirty;
        unsigned nr_congested;
        unsigned nr_writeback;
        unsigned nr_immediate;
        unsigned nr_pageout;
        /* One slot per ANON_AND_FILE index. */
        unsigned nr_activate[ANON_AND_FILE];
        unsigned nr_ref_keep;
        unsigned nr_unmap_fail;
        unsigned nr_lazyfree_fail;
        unsigned nr_demoted;
};

/* Stat data for system wide items */
enum vm_stat_item {
        NR_DIRTY_THRESHOLD,
        NR_DIRTY_BG_THRESHOLD,
        NR_MEMMAP_PAGES,        /* page metadata allocated through buddy allocator */
        NR_MEMMAP_BOOT_PAGES,        /* page metadata allocated through boot allocator */
        NR_VM_STAT_ITEMS,        /* number of items above; keep last */
};

#ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Light weight per cpu counter implementation.
 *
 * Counters should only be incremented and no critical kernel component
 * should rely on the counter values.
 *
 * Counters are handled completely inline. On many platforms the code
 * generated will simply be the increment of a global address.
 */

/* One counter per vm_event_item; an instance exists per CPU (see below). */
struct vm_event_state {
        unsigned long event[NR_VM_EVENT_ITEMS];
};

DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

/*
 * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
 * local_irq_disable overhead.
 *
 * Increment this CPU's counter for @item by one using raw_cpu_inc().
 */
static inline void __count_vm_event(enum vm_event_item item)
{
        raw_cpu_inc(vm_event_states.event[item]);
}

/* Increment this CPU's counter for @item by one using this_cpu_inc(). */
static inline void count_vm_event(enum vm_event_item item)
{
        this_cpu_inc(vm_event_states.event[item]);
}

/* Add @delta to this CPU's counter for @item using raw_cpu_add(). */
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
        raw_cpu_add(vm_event_states.event[item], delta);
}

/* Add @delta to this CPU's counter for @item using this_cpu_add(). */
static inline void count_vm_events(enum vm_event_item item, long delta)
{
        this_cpu_add(vm_event_states.event[item], delta);
}

extern void all_vm_events(unsigned long *);

extern void vm_events_fold_cpu(int cpu);

#else

/*
 * Disable counters: with CONFIG_VM_EVENT_COUNTERS unset every counting and
 * reporting helper is an empty inline function.
 */
static inline void count_vm_event(enum vm_event_item item)
{
}
static inline void count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void __count_vm_event(enum vm_event_item item)
{
}
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
}
/* Note: @ret is left untouched. */
static inline void all_vm_events(unsigned long *ret)
{
}
static inline void vm_events_fold_cpu(int cpu)
{
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Optional event classes.  Each macro forwards to count_vm_event(s) when its
 * config option is set and expands to an empty statement otherwise.  The
 * disabled two-argument forms still evaluate @y once, cast to void.
 */
#ifdef CONFIG_NUMA_BALANCING
#define count_vm_numa_event(x)     count_vm_event(x)
#define count_vm_numa_events(x, y) count_vm_events(x, y)
#else
#define count_vm_numa_event(x) do {} while (0)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_DEBUG_TLBFLUSH
#define count_vm_tlb_event(x)           count_vm_event(x)
#define count_vm_tlb_events(x, y)  count_vm_events(x, y)
#else
#define count_vm_tlb_event(x)     do {} while (0)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif

#ifdef CONFIG_PER_VMA_LOCK_STATS
#define count_vm_vma_lock_event(x) count_vm_event(x)
#else
#define count_vm_vma_lock_event(x) do {} while (0)
#endif

/*
 * Add @delta to the per-zone variant of event @item for zone index @zid.
 * The event is located relative to <item>_NORMAL: the offset of @zid from
 * ZONE_NORMAL is applied to that enumerator.
 */
#define __count_zid_vm_events(item, zid, delta) \
        __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)

/*
 * Zone and node-based page accounting with per cpu differentials.
 */
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];

#ifdef CONFIG_NUMA
/* Add @x to both the zone's and the global NUMA event counter for @item. */
static inline void zone_numa_event_add(long x, struct zone *zone,
                                enum numa_stat_item item)
{
        atomic_long_add(x, &zone->vm_numa_event[item]);
        atomic_long_add(x, &vm_numa_event[item]);
}

/* Read @zone's NUMA event counter for @item. */
static inline unsigned long zone_numa_event_state(struct zone *zone,
                                        enum numa_stat_item item)
{
        return atomic_long_read(&zone->vm_numa_event[item]);
}

/* Read the global NUMA event counter for @item. */
static inline unsigned long
global_numa_event_state(enum numa_stat_item item)
{
        return atomic_long_read(&vm_numa_event[item]);
}
#endif /* CONFIG_NUMA */

/* Add @x to both the zone's and the global zone counter for @item. */
static inline void zone_page_state_add(long x, struct zone *zone,
                                 enum zone_stat_item item)
{
        atomic_long_add(x, &zone->vm_stat[item]);
        atomic_long_add(x, &vm_zone_stat[item]);
}

/* Add @x to both the node's and the global node counter for @item. */
static inline void node_page_state_add(long x, struct pglist_data *pgdat,
                                 enum node_stat_item item)
{
        atomic_long_add(x, &pgdat->vm_stat[item]);
        atomic_long_add(x, &vm_node_stat[item]);
}

/*
 * Read the global zone counter for @item.  On SMP builds a negative reading
 * is reported as 0.
 * NOTE(review): presumably the counter can be transiently negative because
 * of per-CPU deltas that have not been folded in yet - confirm.
 */
static inline unsigned long global_zone_page_state(enum zone_stat_item item)
{
        long val = atomic_long_read(&vm_zone_stat[item]);

#ifdef CONFIG_SMP
        return val < 0 ? 0 : val;
#else
        return val;
#endif
}

/*
 * Read the global counter for node stat @item without checking whether
 * the item is accounted in bytes.
 *
 * On SMP a negative reading is reported as 0.
 */
static inline unsigned long
global_node_page_state_pages(enum node_stat_item item)
{
        long nr = atomic_long_read(&vm_node_stat[item]);

#ifdef CONFIG_SMP
        return nr < 0 ? 0 : nr;
#else
        return nr;
#endif
}

/*
 * Read the global counter for node stat @item. Warns once if @item is
 * one that vmstat_item_in_bytes() reports as byte-accounted, then returns
 * the value from global_node_page_state_pages().
 */
static inline unsigned long global_node_page_state(enum node_stat_item item)
{
        /* Byte-accounted items are not expected to be read through here. */
        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
        return global_node_page_state_pages(item);
}

/*
 * Read @zone's own counter for zone stat @item.
 *
 * On SMP a negative reading is reported as 0.
 */
static inline unsigned long
zone_page_state(struct zone *zone, enum zone_stat_item item)
{
        long nr = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        return nr < 0 ? 0 : nr;
#else
        return nr;
#endif
}

/*
 * More accurate variant of zone_page_state(): in addition to the zone
 * counter it sums the deltas still pending in the per-cpu zonestats of
 * every online CPU. Nothing synchronizes against concurrent updates, so
 * the result is still only an approximation. On SMP a negative total is
 * reported as 0.
 */
static inline unsigned long
zone_page_state_snapshot(struct zone *zone, enum zone_stat_item item)
{
        long total = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        int cpu;

        /* Fold in each online CPU's not-yet-applied differential. */
        for_each_online_cpu(cpu) {
                total += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item];
        }

        if (total < 0)
                total = 0;
#endif
        return total;
}

#ifdef CONFIG_NUMA
/*
 * Increment this CPU's pending counter for NUMA event @item of @zone.
 * See __count_vm_event comment on why raw_cpu_inc is used.
 */
static inline void
__count_numa_event(struct zone *zone, enum numa_stat_item item)
{
        raw_cpu_inc(zone->per_cpu_zonestats->vm_numa_event[item]);
}

/*
 * Add @delta to this CPU's pending counter for NUMA event @item of @zone,
 * using the raw (unprotected) per-cpu add.
 */
static inline void
__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
{
        raw_cpu_add(zone->per_cpu_zonestats->vm_numa_event[item], delta);
}

/*
 * NUMA-only out-of-line helpers; their definitions are not in this header.
 * The !CONFIG_NUMA branch below maps the per-node readers onto the global
 * counters instead.
 */
extern unsigned long sum_zone_node_page_state(int node,
                                              enum zone_stat_item item);
extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                enum node_stat_item item);
extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
                                           enum node_stat_item item);
extern void fold_vm_numa_events(void);
#else
/*
 * !CONFIG_NUMA: the node argument is ignored and the global counters are
 * read instead.
 */
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
#define node_page_state_pages(node, item) global_node_page_state_pages(item)
/* !CONFIG_NUMA stub: there is nothing to fold. */
static inline void fold_vm_numa_events(void) { }
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
/*
 * SMP: the counter update API is implemented out of line (definitions are
 * not in this header). The !CONFIG_SMP branch below provides inline
 * versions that modify the counters directly.
 */

/* Zone counters, __ prefixed variants. */
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);

/* Node counters, __ prefixed variants. */
void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
void __inc_node_page_state(struct page *, enum node_stat_item);
void __dec_node_page_state(struct page *, enum node_stat_item);

/* Zone counters, unprefixed variants. */
void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);

/* Node counters, unprefixed variants. */
void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);

/* Increment/decrement helpers that take the zone or node directly. */
extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

/* Maintenance of the per-cpu differentials and their thresholds. */
void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);

/* Threshold calculators; either can be passed as the callback below. */
int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
                                int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */

/*
 * We do not maintain differentials in a single processor configuration.
 * The functions directly modify the zone and global counters.
 */
/* UP: apply @delta to zone stat @item of @zone and to the global counter. */
static inline void
__mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta)
{
        zone_page_state_add(delta, zone, item);
}

/*
 * UP: apply @delta to node stat @item of @pgdat and to the global counter.
 *
 * @delta is a long, matching the SMP prototype of this function and the
 * long-typed folio helpers that call it, so large deltas (in particular
 * byte-sized ones) are not truncated on the way in.
 *
 * For items that vmstat_item_in_bytes() reports as byte-accounted, @delta
 * is given in bytes and converted to pages before it is applied.
 */
static inline void __mod_node_page_state(struct pglist_data *pgdat,
                        enum node_stat_item item, long delta)
{
        if (vmstat_item_in_bytes(item)) {
                /*
                 * Only cgroups use subpage accounting right now; at
                 * the global level, these items still change in
                 * multiples of whole pages. Store them as pages
                 * internally to keep the per-cpu counters compact.
                 */
                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
                delta >>= PAGE_SHIFT;
        }

        node_page_state_add(delta, pgdat, item);
}

/* UP: increment zone stat @item in @zone and in the global array. */
static inline void
__inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_inc(zone->vm_stat + item);
        atomic_long_inc(vm_zone_stat + item);
}

/* UP: increment node stat @item in @pgdat and in the global array. */
static inline void
__inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_inc(pgdat->vm_stat + item);
        atomic_long_inc(vm_node_stat + item);
}

/* UP: decrement zone stat @item in @zone and in the global array. */
static inline void
__dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_dec(zone->vm_stat + item);
        atomic_long_dec(vm_zone_stat + item);
}

/* UP: decrement node stat @item in @pgdat and in the global array. */
static inline void
__dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_dec(pgdat->vm_stat + item);
        atomic_long_dec(vm_node_stat + item);
}

/* UP: increment zone stat @item for the zone that @page belongs to. */
static inline void
__inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __inc_zone_state(page_zone(page), item);
}

/* UP: increment node stat @item for the node that @page belongs to. */
static inline void
__inc_node_page_state(struct page *page, enum node_stat_item item)
{
        __inc_node_state(page_pgdat(page), item);
}


/* UP: decrement zone stat @item for the zone that @page belongs to. */
static inline void
__dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __dec_zone_state(page_zone(page), item);
}

/* UP: decrement node stat @item for the node that @page belongs to. */
static inline void
__dec_node_page_state(struct page *page, enum node_stat_item item)
{
        __dec_node_state(page_pgdat(page), item);
}


/*
 * We only use atomic operations to update counters. So there is no need to
 * disable interrupts.
 *
 * On UP the unprefixed names are therefore plain aliases of the __ prefixed
 * inline helpers defined in this branch.
 */
#define inc_zone_page_state __inc_zone_page_state
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state

#define inc_node_page_state __inc_node_page_state
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state

#define inc_zone_state __inc_zone_state
#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state

/*
 * UP stub: no per-cpu thresholds to set. Neither argument is evaluated.
 * Written as do { } while (0) so that the invocation plus its trailing
 * semicolon forms exactly one statement and stays valid as the body of an
 * unbraced if/else.
 */
#define set_pgdat_percpu_threshold(pgdat, callback) do { } while (0)

/* UP stubs: there are no per-cpu differentials, so these do nothing. */
static inline void refresh_zone_stat_thresholds(void)
{
}

static inline void cpu_vm_stats_fold(int cpu)
{
}

static inline void quiet_vmstat(void)
{
}

/* UP stub: nothing to drain; both arguments are ignored. */
static inline void
drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
{
}
#endif                /* CONFIG_SMP */

/* Apply @nr to zone stat @item of @folio's zone (__ prefixed variant). */
static inline void
__zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr)
{
        __mod_zone_page_state(folio_zone(folio), item, nr);
}

/* Add @folio's page count to zone stat @item (__ prefixed variant). */
static inline void
__zone_stat_add_folio(struct folio *folio, enum zone_stat_item item)
{
        __mod_zone_page_state(folio_zone(folio), item,
                              folio_nr_pages(folio));
}

/* Subtract @folio's page count from zone stat @item (__ prefixed variant). */
static inline void
__zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item)
{
        __mod_zone_page_state(folio_zone(folio), item,
                              -folio_nr_pages(folio));
}

/* Apply @nr to zone stat @item of @folio's zone. */
static inline void
zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr)
{
        mod_zone_page_state(folio_zone(folio), item, nr);
}

/* Add @folio's page count to zone stat @item of its zone. */
static inline void
zone_stat_add_folio(struct folio *folio, enum zone_stat_item item)
{
        mod_zone_page_state(folio_zone(folio), item,
                            folio_nr_pages(folio));
}

/* Subtract @folio's page count from zone stat @item of its zone. */
static inline void
zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item)
{
        mod_zone_page_state(folio_zone(folio), item,
                            -folio_nr_pages(folio));
}

/* Apply @nr to node stat @item of @folio's node (__ prefixed variant). */
static inline void
__node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr)
{
        __mod_node_page_state(folio_pgdat(folio), item, nr);
}

/* Add @folio's page count to node stat @item (__ prefixed variant). */
static inline void
__node_stat_add_folio(struct folio *folio, enum node_stat_item item)
{
        __mod_node_page_state(folio_pgdat(folio), item,
                              folio_nr_pages(folio));
}

/* Subtract @folio's page count from node stat @item (__ prefixed variant). */
static inline void
__node_stat_sub_folio(struct folio *folio, enum node_stat_item item)
{
        __mod_node_page_state(folio_pgdat(folio), item,
                              -folio_nr_pages(folio));
}

/* Apply @nr to node stat @item of @folio's node. */
static inline void
node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr)
{
        mod_node_page_state(folio_pgdat(folio), item, nr);
}

/* Add @folio's page count to node stat @item of its node. */
static inline void
node_stat_add_folio(struct folio *folio, enum node_stat_item item)
{
        mod_node_page_state(folio_pgdat(folio), item,
                            folio_nr_pages(folio));
}

/* Subtract @folio's page count from node stat @item of its node. */
static inline void
node_stat_sub_folio(struct folio *folio, enum node_stat_item item)
{
        mod_node_page_state(folio_pgdat(folio), item,
                            -folio_nr_pages(folio));
}

/*
 * Table of counter names. The *_name() helpers below index it in sections:
 * zone stats, then NUMA events, then node stats, then NR_VM_STAT_ITEMS
 * further entries, then VM events.
 */
extern const char * const vmstat_text[];

/* Name of zone stat @item; zone stats occupy the start of vmstat_text. */
static inline const char *zone_stat_name(enum zone_stat_item item)
{
        const char *name = vmstat_text[item];

        return name;
}

#ifdef CONFIG_NUMA
/* Name of NUMA event @item; these follow the zone stats in vmstat_text. */
static inline const char *numa_stat_name(enum numa_stat_item item)
{
        const char *name = vmstat_text[NR_VM_ZONE_STAT_ITEMS + item];

        return name;
}
#endif /* CONFIG_NUMA */

/*
 * Name of node stat @item; these follow the zone stats and the NUMA
 * events in vmstat_text.
 */
static inline const char *node_stat_name(enum node_stat_item item)
{
        const char *name = vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                                       NR_VM_NUMA_EVENT_ITEMS +
                                       item];

        return name;
}

/*
 * Name of LRU list @lru: the name of node stat NR_LRU_BASE + @lru with
 * its leading "nr_" (three characters) skipped.
 */
static inline const char *lru_list_name(enum lru_list lru)
{
        return node_stat_name(NR_LRU_BASE + lru) + (sizeof("nr_") - 1);
}

#if defined(CONFIG_VM_EVENT_COUNTERS)
/*
 * Name of VM event @item; the event names come after the zone, NUMA,
 * node and NR_VM_STAT_ITEMS sections of vmstat_text.
 */
static inline const char *vm_event_name(enum vm_event_item item)
{
        const char *name = vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                                       NR_VM_NUMA_EVENT_ITEMS +
                                       NR_VM_NODE_STAT_ITEMS +
                                       NR_VM_STAT_ITEMS +
                                       item];

        return name;
}
#endif /* CONFIG_VM_EVENT_COUNTERS */

#ifdef CONFIG_MEMCG

/* Out-of-line lruvec stat update (CONFIG_MEMCG); defined elsewhere. */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                        int val);

/*
 * Same as __mod_lruvec_state(), but with local interrupts disabled
 * around the update.
 */
static inline void
mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        __mod_lruvec_state(lruvec, idx, val);
        local_irq_restore(irqflags);
}

/* Out-of-line per-folio lruvec stat update (CONFIG_MEMCG); defined elsewhere. */
void __lruvec_stat_mod_folio(struct folio *folio,
                             enum node_stat_item idx, int val);

/*
 * Same as __lruvec_stat_mod_folio(), but with local interrupts disabled
 * around the update.
 */
static inline void
lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        __lruvec_stat_mod_folio(folio, idx, val);
        local_irq_restore(irqflags);
}

/* Page flavour of lruvec_stat_mod_folio(): operates on page_folio(@page). */
static inline void
mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val)
{
        lruvec_stat_mod_folio(page_folio(page), idx, val);
}

#else

/* !CONFIG_MEMCG: account @val against the node of @lruvec only. */
static inline void
__mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val)
{
        __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}

/* !CONFIG_MEMCG: account @val against the node of @lruvec only. */
static inline void
mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val)
{
        mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}

/* !CONFIG_MEMCG: account @val against the node of @folio only. */
static inline void
__lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val)
{
        __mod_node_page_state(folio_pgdat(folio), idx, val);
}

/* !CONFIG_MEMCG: account @val against the node of @folio only. */
static inline void
lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val)
{
        mod_node_page_state(folio_pgdat(folio), idx, val);
}

/* !CONFIG_MEMCG: account @val against the node of @page only. */
static inline void
mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val)
{
        mod_node_page_state(page_pgdat(page), idx, val);
}

#endif /* CONFIG_MEMCG */

/* Add @folio's page count to lruvec stat @idx (__ prefixed variant). */
static inline void
__lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx)
{
        __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
}

/* Subtract @folio's page count from lruvec stat @idx (__ prefixed variant). */
static inline void
__lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx)
{
        __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
}

/* Add @folio's page count to lruvec stat @idx. */
static inline void
lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx)
{
        lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
}

/* Subtract @folio's page count from lruvec stat @idx. */
static inline void
lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx)
{
        lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
}

/*
 * Adjust the memmap page accounting by @delta (signed). Defined elsewhere;
 * NOTE(review): presumably the _boot variant tracks boot-time memmap
 * allocations separately -- confirm against the definition.
 */
void memmap_boot_pages_add(long delta);
void memmap_pages_add(long delta);
#endif /* _LINUX_VMSTAT_H */































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu_counter.h>

#include <linux/atomic.h>
#include <uapi/linux/mman.h>

/*
 * Arrange for legacy / undefined architecture specific flags to be
 * ignored by mmap handling code.
 *
 * A flag that the architecture's headers do not define becomes 0 here, so
 * testing for it or OR-ing it into a mask has no effect.
 */
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_ABOVE4G
#define MAP_ABOVE4G 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif

/*
 * The historical set of flags that all mmap implementations implicitly
 * support when a ->mmap_validate() op is not provided in file_operations.
 *
 * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the
 * kernel.
 *
 * Note that MAP_SYNC is not part of this mask.
 */
#define LEGACY_MAP_MASK (MAP_SHARED \
                | MAP_PRIVATE \
                | MAP_FIXED \
                | MAP_ANONYMOUS \
                | MAP_DENYWRITE \
                | MAP_EXECUTABLE \
                | MAP_UNINITIALIZED \
                | MAP_GROWSDOWN \
                | MAP_LOCKED \
                | MAP_NORESERVE \
                | MAP_POPULATE \
                | MAP_NONBLOCK \
                | MAP_STACK \
                | MAP_HUGETLB \
                | MAP_32BIT \
                | MAP_ABOVE4G \
                | MAP_HUGE_2MB \
                | MAP_HUGE_1GB)

/* Overcommit policy setting; defined elsewhere. */
extern int sysctl_overcommit_memory;
/* Per-cpu counter updated by vm_acct_memory()/vm_unacct_memory(). */
extern struct percpu_counter vm_committed_as;

/*
 * Batch size passed to percpu_counter_add_batch() for vm_committed_as.
 * On SMP it is a variable maintained by mm_compute_batch(); on UP it is
 * the constant 0 and mm_compute_batch() is an empty stub.
 */
#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif

/* Defined elsewhere. NOTE(review): presumably reports vm_committed_as -- confirm. */
unsigned long vm_memory_committed(void);

/*
 * Add @pages (may be negative) to the vm_committed_as per-cpu counter,
 * using vm_committed_as_batch as the batch size.
 */
static inline void vm_acct_memory(long pages)
{
        percpu_counter_add_batch(&vm_committed_as, pages,
                                 vm_committed_as_batch);
}

/* Inverse of vm_acct_memory(): subtract @pages from vm_committed_as. */
static inline void vm_unacct_memory(long pages)
{
        vm_acct_memory(-pages);
}

/*
 * Allow architectures to handle additional protection and flag bits. The
 * overriding macros must be defined in the arch-specific asm/mman.h file.
 *
 * The defaults below contribute no extra bits (they expand to 0 without
 * evaluating their arguments).
 */

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(file, flags) 0
#endif

#ifndef arch_validate_prot
/*
 * Default protection check, used unless the architecture supplies its own.
 *
 * This is called from mprotect().  PROT_GROWSDOWN and PROT_GROWSUP have
 * already been masked out.  @addr is not used by this default.
 *
 * Returns true if @prot contains nothing besides PROT_READ, PROT_WRITE,
 * PROT_EXEC and PROT_SEM.
 */
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
{
        const unsigned long known = PROT_READ | PROT_WRITE |
                                    PROT_EXEC | PROT_SEM;

        return !(prot & ~known);
}
#define arch_validate_prot arch_validate_prot
#endif

#ifndef arch_validate_flags
/*
 * Default VM_* flag check, used unless the architecture supplies its own.
 *
 * This is called from mmap() and mprotect() with the updated vma->vm_flags.
 *
 * Returns true if the VM_* flags are valid; the default accepts every
 * combination.
 */
static inline bool arch_validate_flags(unsigned long flags)
{
        /* No architecture-specific restrictions. */
        return true;
}
#define arch_validate_flags arch_validate_flags
#endif

/*
 * Optimisation macro.  It is equivalent to:
 *      (x & bit1) ? bit2 : 0
 * but this version is faster: the masked bit is moved into position with
 * one multiply or divide by the ratio of the two bits.
 * ("bit1" and "bit2" must be single bits; if either one is 0 the result
 * is 0.)
 */
#define _calc_vm_trans(x, bit1, bit2) \
  (((bit1) && (bit2)) ? \
   ((bit1) > (bit2) ? ((x) & (bit1)) / ((bit1) / (bit2)) \
    : ((x) & (bit1)) * ((bit2) / (bit1))) \
   : 0)

/*
 * Combine the mmap "prot" argument into "vm_flags" used internally:
 * PROT_READ/WRITE/EXEC map to VM_READ/WRITE/EXEC, plus whatever the
 * architecture hook derives from @prot and @pkey.
 */
static inline vm_flags_t
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
        vm_flags_t vm_flags;

        vm_flags  = _calc_vm_trans(prot, PROT_READ,  VM_READ);
        vm_flags |= _calc_vm_trans(prot, PROT_WRITE, VM_WRITE);
        vm_flags |= _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC);
        vm_flags |= arch_calc_vm_prot_bits(prot, pkey);

        return vm_flags;
}

/*
 * Combine the mmap "flags" argument into "vm_flags" used internally:
 * MAP_GROWSDOWN, MAP_LOCKED and MAP_SYNC map to their VM_* counterparts,
 * MAP_STACK maps to VM_NOHUGEPAGE when transparent hugepages are
 * configured, plus whatever the architecture hook derives from @file and
 * @flags.
 */
static inline vm_flags_t
calc_vm_flag_bits(struct file *file, unsigned long flags)
{
        vm_flags_t vm_flags;

        vm_flags  = _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN);
        vm_flags |= _calc_vm_trans(flags, MAP_LOCKED,    VM_LOCKED);
        vm_flags |= _calc_vm_trans(flags, MAP_SYNC,      VM_SYNC);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        vm_flags |= _calc_vm_trans(flags, MAP_STACK,     VM_NOHUGEPAGE);
#endif
        vm_flags |= arch_calc_vm_flag_bits(file, flags);

        return vm_flags;
}

/* Defined elsewhere. NOTE(review): presumably the overcommit limit in pages -- confirm. */
unsigned long vm_commit_limit(void);

#ifndef arch_memory_deny_write_exec_supported
/*
 * Default, used unless the architecture supplies its own: report
 * memory-deny-write-execute as supported.
 */
static inline bool arch_memory_deny_write_exec_supported(void)
{
        return true;
}
#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
#endif

/*
 * Denies creating a writable executable mapping or gaining executable
 * permissions.
 *
 * This denies the following:
 *
 *      a)      mmap(PROT_WRITE | PROT_EXEC)
 *
 *      b)      mmap(PROT_WRITE)
 *              mprotect(PROT_EXEC)
 *
 *      c)      mmap(PROT_WRITE)
 *              mprotect(PROT_READ)
 *              mprotect(PROT_EXEC)
 *
 * But allows the following:
 *
 *      d)      mmap(PROT_READ | PROT_EXEC)
 *              mmap(PROT_READ | PROT_EXEC | PROT_BTI)
 *
 * This is only applicable if the user has set the Memory-Deny-Write-Execute
 * (MDWE) protection mask for the current process.
 *
 * @old specifies the VMA flags the VMA originally possessed, and @new the
 * ones we propose to set.
 *
 * Return: false if proposed change is OK, true if not ok and should be
 * denied.
 */
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
        /* Without MDWE on the current mm there is nothing to deny. */
        if (!mm_flags_test(MMF_HAS_MDWE, current->mm))
                return false;

        /* A non-executable result is always acceptable. */
        if (!(new & VM_EXEC))
                return false;

        /*
         * The result is executable: deny it if it is also writable, or if
         * the VMA was not executable before.
         */
        return (new & VM_WRITE) || !(old & VM_EXEC);
}

#endif /* _LINUX_MMAN_H */































































































































































































  148 
































   14 




   14 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_XSTATE_H
#define __X86_KERNEL_FPU_XSTATE_H

#include <asm/cpufeature.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>
#include <asm/msr.h>

/*
 * Per-CPU copy of the value last written to MSR_IA32_XFD by
 * xfd_set_state(); 64-bit only.
 */
#ifdef CONFIG_X86_64
DECLARE_PER_CPU(u64, xfd_state);
#endif

/*
 * Initialize xcomp_bv in the header of @xsave to @mask plus the
 * compacted-format bit. Does nothing unless X86_FEATURE_XCOMPACTED is
 * enabled.
 */
static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
{
        if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
                return;

        /*
         * XRSTORS requires these bits set in xcomp_bv, or it will
         * trigger #GP:
         */
        xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
}

/*
 * Return the xstate permission mask stored in the FPU state of the
 * current task's thread group leader: the guest permissions if @guest is
 * true, the host ones otherwise.
 */
static inline u64 xstate_get_group_perm(bool guest)
{
        struct fpu *fpu = x86_task_fpu(current->group_leader);
        struct fpu_state_perm *perm;

        if (guest)
                perm = &fpu->guest_perm;
        else
                perm = &fpu->perm;

        /* Pairs with WRITE_ONCE() in xstate_request_perm() */
        return READ_ONCE(perm->__state_perm);
}

static inline u64 xstate_get_host_group_perm(void)
{
        return xstate_get_group_perm(false);
}

/*
 * Selects the layout used when copying xstate to a UABI buffer; passed to
 * the copy_xstate_to_uabi_buf() family.
 * NOTE(review): FP/FX/XSAVE presumably correspond to the legacy FP, FXSAVE
 * and full XSAVE layouts -- confirm against the implementation.
 */
enum xstate_copy_mode {
        XSTATE_COPY_FP,
        XSTATE_COPY_FX,
        XSTATE_COPY_XSAVE,
};

/* Forward declaration; struct membuf is only named in prototypes here. */
struct membuf;
/* Copy helpers between kernel xstate and UABI buffers; defined elsewhere. */
extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
                                      u64 xfeatures, u32 pkru_val,
                                      enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
                                    enum xstate_copy_mode mode);
extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);


/* xstate initialization entry points; defined elsewhere. */
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);

/* Returns a user-space pointer for feature @xfeature_nr inside @xsave. */
extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);

/*
 * Supervisor xfeatures: the part of fpu_kernel_cfg.max_features that is
 * in XFEATURE_MASK_SUPERVISOR_SUPPORTED.
 */
static inline u64 xfeatures_mask_supervisor(void)
{
        u64 mask = fpu_kernel_cfg.max_features;

        mask &= XFEATURE_MASK_SUPERVISOR_SUPPORTED;
        return mask;
}

/*
 * Independent xfeatures from fpu_kernel_cfg, with the LBR feature bit
 * removed when X86_FEATURE_ARCH_LBR is not enabled.
 */
static inline u64 xfeatures_mask_independent(void)
{
        u64 mask = fpu_kernel_cfg.independent_features;

        if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
                mask &= ~XFEATURE_MASK_LBR;

        return mask;
}

/*
 * OR @mask into the xfeatures header field of the user-space xsave buffer
 * @xbuf (read-modify-write through __get_user()/__put_user()).
 *
 * The write is attempted even if the read reported an error; the two
 * results are OR-ed together. Returns 0 if both accesses succeeded,
 * non-zero otherwise.
 */
static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask)
{
        u64 bv;
        int ret;

        /* Read the xfeatures value already saved in the user buffer */
        ret = __get_user(bv, &xbuf->header.xfeatures);

        /* Write it back with the requested bits added. */
        bv |= mask;
        ret |= __put_user(bv, &xbuf->header.xfeatures);

        return ret;
}

/*
 * Update the value of PKRU register that was already pushed onto the
 * signal frame.
 *
 * Returns 0 without touching @buf when X86_FEATURE_OSPKE is not enabled;
 * otherwise 0 on success or the non-zero result of the failing user
 * access.
 */
static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
{
        unsigned int __user *slot;
        int ret;

        if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
                return 0;

        /* Mark PKRU as in-use so that it is restored correctly. */
        ret = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU);
        if (ret)
                return ret;

        /* Update PKRU value in the userspace xsave buffer. */
        slot = (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU);
        return __put_user(pkru, slot);
}

/* XSAVE/XRSTOR wrapper functions */

#ifdef CONFIG_X86_64
#define REX_SUFFIX        "64"
#else
#define REX_SUFFIX
#endif

#define XSAVE                "xsave" REX_SUFFIX " %[xa]"
#define XSAVEOPT        "xsaveopt" REX_SUFFIX " %[xa]"
#define XSAVEC                "xsavec" REX_SUFFIX " %[xa]"
#define XSAVES                "xsaves" REX_SUFFIX " %[xa]"
#define XRSTOR                "xrstor" REX_SUFFIX " %[xa]"
#define XRSTORS                "xrstors" REX_SUFFIX " %[xa]"

/*
 * After this @err contains 0 on success or the trap number when the
 * operation raises an exception.
 *
 * The [xa] input parameter below represents the struct xregs_state pointer
 * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns
 * above.
 *
 * @lmask and @hmask are passed in EAX and EDX. @err is bound to EAX as
 * well ("=a"); on the non-faulting path the xor after the instruction
 * zeroes it, while a fault is redirected by the exception table entry to
 * label 2, skipping the xor.
 */
#define XSTATE_OP(op, st, lmask, hmask, err)                                \
        asm volatile("1:" op "\n\t"                                        \
                     "xor %[err], %[err]\n"                                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE)        \
                     : [err] "=a" (err)                                        \
                     : [xa] "m" (*(st)), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor
 * states in addition to XSAVEC.
 *
 * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports
 * compacted storage format in addition to XSAVEOPT.
 *
 * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
 * supports modified optimization which is not supported by XSAVE.
 *
 * Use XSAVE as a fallback.
 *
 * @err is zeroed by the xor on the non-faulting path. A fault is
 * redirected to label 3 through an EX_TYPE_EFAULT_REG exception table
 * entry that names the @err register (presumably storing -EFAULT there --
 * confirm against the extable handler).
 */
#define XSTATE_XSAVE(st, lmask, hmask, err)                                \
        asm volatile("1: " ALTERNATIVE_3(XSAVE,                                \
                                   XSAVEOPT, X86_FEATURE_XSAVEOPT,        \
                                   XSAVEC,   X86_FEATURE_XSAVEC,        \
                                   XSAVES,   X86_FEATURE_XSAVES)        \
                     "\n\t"                                                \
                     "xor %[err], %[err]\n"                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
                     : [err] "=r" (err)                                        \
                     : [xa] "m" (*(st)), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
 * XSAVE area format.
 *
 * There is no error output: a fault is handed to the exception table entry
 * of type EX_TYPE_FPU_RESTORE, which resumes at label 3.
 */
#define XSTATE_XRESTORE(st, lmask, hmask)                                \
        asm volatile("1: " ALTERNATIVE(XRSTOR,                                \
                                 XRSTORS, X86_FEATURE_XSAVES)                \
                     "\n"                                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE)        \
                     :                                                        \
                     : [xa] "m" (*(st)), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * XFD state validation, called before the save/restore wrappers. Only
 * built on 64-bit with CONFIG_X86_DEBUG_FPU; otherwise an empty stub.
 */
#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor);
#else
static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { }
#endif

#ifdef CONFIG_X86_64
/*
 * Write @xfd to MSR_IA32_XFD and then record the same value in this CPU's
 * xfd_state copy.
 */
static inline void xfd_set_state(u64 xfd)
{
        /* Hardware first ... */
        wrmsrq(MSR_IA32_XFD, xfd);

        /* ... then the per-CPU copy that xfd_update_state() compares against. */
        __this_cpu_write(xfd_state, xfd);
}

/*
 * Bring the XFD MSR in line with @fpstate->xfd. Does nothing unless
 * fpu_state_size_dynamic() is true, and skips the MSR write when this
 * CPU's recorded xfd_state already matches.
 */
static inline void xfd_update_state(struct fpstate *fpstate)
{
        u64 wanted;

        if (!fpu_state_size_dynamic())
                return;

        wanted = fpstate->xfd;
        if (__this_cpu_read(xfd_state) != wanted)
                xfd_set_state(wanted);
}

/* 64-bit only; defined elsewhere. The !CONFIG_X86_64 stub returns -EPERM. */
extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
#else
/* !CONFIG_X86_64 stubs: the XFD helpers do nothing on 32-bit. */
static inline void xfd_set_state(u64 xfd)
{
}

static inline void xfd_update_state(struct fpstate *fpstate)
{
}

/* Enabling a feature always fails with -EPERM here. */
static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu)
{
        return -EPERM;
}
#endif

/*
 * Save processor xstate to xsave area.
 *
 * The instruction is picked by XSTATE_XSAVE(): XSAVE, XSAVEOPT, XSAVEC or
 * XSAVES, depending on the CPU features and command line options. The
 * choice is permanent until the next reboot.
 *
 * The features saved are those in @fpstate->xfeatures.
 */
static inline void os_xsave(struct fpstate *fpstate)
{
        u64 features = fpstate->xfeatures;
        u32 lo = features;
        u32 hi = features >> 32;
        int err;

        WARN_ON_FPU(!alternatives_patched);
        xfd_validate_state(fpstate, features, false);

        XSTATE_XSAVE(&fpstate->regs.xsave, lo, hi, err);

        /* We should never fault when copying to a kernel buffer: */
        WARN_ON_FPU(err);
}

/*
 * Restore processor xstate from xsave area.
 *
 * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. Only the features
 * in @mask are requested.
 */
static inline void os_xrstor(struct fpstate *fpstate, u64 mask)
{
        u32 lo = mask;
        u32 hi = mask >> 32;

        xfd_validate_state(fpstate, mask, true);
        XSTATE_XRESTORE(&fpstate->regs.xsave, lo, hi);
}

/*
 * Restore of supervisor state. Does not require XFD, so unlike
 * os_xrstor() no XFD validation is done. The mask is
 * xfeatures_mask_supervisor().
 */
static inline void os_xrstor_supervisor(struct fpstate *fpstate)
{
        u64 features = xfeatures_mask_supervisor();
        u32 lo = features;
        u32 hi = features >> 32;

        XSTATE_XRESTORE(&fpstate->regs.xsave, lo, hi);
}

/*
 * XSAVE itself always writes all requested xfeatures.  Removing features
 * from the request bitmap reduces the features which are written.
 * Generate a mask of features which must be written to a sigframe.  The
 * unset features can be optimized away and not written.
 *
 * This optimization is user-visible.  Only use for states where
 * uninitialized sigframe contents are tolerable, like dynamic features.
 *
 * Users of buffers produced with this optimization must check XSTATE_BV
 * to determine which features have been optimized out.
 */
static inline u64 xfeatures_need_sigframe_write(void)
{
        /*
         * User-supported features that are not eligible for the sigframe
         * init optimization are written unconditionally.
         */
        const u64 always_written = XFEATURE_MASK_USER_SUPPORTED &
                                   ~XFEATURE_MASK_SIGFRAME_INITOPT;

        /* On top of those, every feature currently in use must be written. */
        return xfeatures_in_use() | always_written;
}

/*
 * Save xstate to user space xsave area.
 *
 * We don't use modified optimization because xrstor/xrstors might track
 * a different application.
 *
 * We don't use compacted format xsave area for backward compatibility for
 * old applications which don't understand the compacted format of the
 * xsave area.
 *
 * The caller has to zero buf::header before calling this because XSAVE*
 * does not touch the reserved fields in the header.
 */
/*
 * @buf:  user-space xsave area to write to
 * @pkru: value handed to update_pkru_in_sigframe() after a successful save
 *
 * Returns the error reported by the save, or, if the save succeeded, the
 * result of update_pkru_in_sigframe().
 */
static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru)
{
        /*
         * Include the features which are not xsaved/rstored by the kernel
         * internally, e.g. PKRU. That's user space ABI and also required
         * to allow the signal handler to modify PKRU.
         */
        struct fpstate *fpstate = x86_task_fpu(current)->fpstate;
        u64 mask = fpstate->user_xfeatures;
        u32 lmask;
        u32 hmask;
        int err;

        /* Optimize away writing unnecessary xfeatures: */
        if (fpu_state_size_dynamic())
                mask &= xfeatures_need_sigframe_write();

        /* Split only after the mask has been narrowed above. */
        lmask = mask;
        hmask = mask >> 32;
        xfd_validate_state(fpstate, mask, false);

        /* NOTE(review): stac()/clac() presumably bracket the user-memory access -- confirm. */
        stac();
        XSTATE_OP(XSAVE, buf, lmask, hmask, err);
        clac();

        /* PKRU in the frame is only patched when the save itself succeeded. */
        if (!err)
                err = update_pkru_in_sigframe(buf, pkru);

        return err;
}

/*
 * Restore xstate from user space xsave area.
 */
/*
 * @buf:  user-space xsave area to restore from
 * @mask: 64-bit feature mask, passed to XSTATE_OP as low/high 32-bit halves
 *
 * Returns the error value produced by XSTATE_OP.
 */
static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
{
        /* __force cast drops the __user annotation so the pointer can be handed to XSTATE_OP. */
        struct xregs_state *xstate = ((__force struct xregs_state *)buf);
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true);

        /* NOTE(review): stac()/clac() presumably bracket the user-memory access -- confirm. */
        stac();
        XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
        clac();

        return err;
}

/*
 * Restore xstate from kernel space xsave area, return an error code instead of
 * an exception.
 */
static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask)
{
        struct xregs_state *xstate = &fpstate->regs.xsave;
        u32 lmask = (u32)mask;
        u32 hmask = (u32)(mask >> 32);
        int err;

        /* XFD must be current before the restore is attempted. */
        xfd_update_state(fpstate);

        /* Plain XRSTOR unless XSAVES is enabled, in which case XRSTORS is used. */
        if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) {
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
        } else {
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
        }

        return err;
}


#endif





























































































































































































































































































































































































































































































































































































































































































    2 







































































    2 



























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 */

#ifndef _NET_IPV6_H
#define _NET_IPV6_H

#include <linux/ipv6.h>
#include <linux/hardirq.h>
#include <linux/jhash.h>
#include <linux/refcount.h>
#include <linux/jump_label_ratelimit.h>
#include <net/if_inet6.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/inet_dscp.h>
#include <net/snmp.h>
#include <net/netns/hash.h>

struct ip_tunnel_info;

#define SIN6_LEN_RFC2133        24

#define IPV6_MAXPLEN                65535

/*
 *        NextHeader field of IPv6 header
 */

#define NEXTHDR_HOP                0        /* Hop-by-hop option header. */
#define NEXTHDR_IPV4                4        /* IPv4 in IPv6 */
#define NEXTHDR_TCP                6        /* TCP segment. */
#define NEXTHDR_UDP                17        /* UDP message. */
#define NEXTHDR_IPV6                41        /* IPv6 in IPv6 */
#define NEXTHDR_ROUTING                43        /* Routing header. */
#define NEXTHDR_FRAGMENT        44        /* Fragmentation/reassembly header. */
#define NEXTHDR_GRE                47        /* GRE header. */
#define NEXTHDR_ESP                50        /* Encapsulating security payload. */
#define NEXTHDR_AUTH                51        /* Authentication header. */
#define NEXTHDR_ICMP                58        /* ICMP for IPv6. */
#define NEXTHDR_NONE                59        /* No next header */
#define NEXTHDR_DEST                60        /* Destination options header. */
#define NEXTHDR_SCTP                132        /* SCTP message. */
#define NEXTHDR_MOBILITY        135        /* Mobility header. */

#define NEXTHDR_MAX                255

#define IPV6_DEFAULT_HOPLIMIT   64
#define IPV6_DEFAULT_MCASTHOPS        1

/* Limits on Hop-by-Hop and Destination options.
 *
 * Per RFC8200 there is no limit on the maximum number or lengths of options in
 * Hop-by-Hop or Destination options other than the packet must fit in an MTU.
 * We allow configurable limits in order to mitigate potential denial of
 * service attacks.
 *
 * There are three limits that may be set:
 *   - Limit the number of options in a Hop-by-Hop or Destination options
 *     extension header
 *   - Limit the byte length of a Hop-by-Hop or Destination options extension
 *     header
 *   - Disallow unknown options
 *
 * The limits are expressed in corresponding sysctls:
 *
 * ipv6.sysctl.max_dst_opts_cnt
 * ipv6.sysctl.max_hbh_opts_cnt
 * ipv6.sysctl.max_dst_opts_len
 * ipv6.sysctl.max_hbh_opts_len
 *
 * max_*_opts_cnt is the number of TLVs that are allowed for Destination
 * options or Hop-by-Hop options. If the number is less than zero then unknown
 * TLVs are disallowed and the number of known options that are allowed is the
 * absolute value. Setting the value to INT_MAX indicates no limit.
 *
 * max_*_opts_len is the length limit in bytes of a Destination or
 * Hop-by-Hop options extension header. Setting the value to INT_MAX
 * indicates no length limit.
 *
 * If a limit is exceeded when processing an extension header the packet is
 * silently discarded.
 */

/* Default limits for Hop-by-Hop and Destination options */
#define IP6_DEFAULT_MAX_DST_OPTS_CNT         8
#define IP6_DEFAULT_MAX_HBH_OPTS_CNT         8
#define IP6_DEFAULT_MAX_DST_OPTS_LEN         INT_MAX /* No limit */
#define IP6_DEFAULT_MAX_HBH_OPTS_LEN         INT_MAX /* No limit */

/*
 *        Addr type
 *        
 *        type        -        unicast | multicast
 *        scope        -        local        | site            | global
 *        v4        -        compat
 *        v4mapped
 *        any
 *        loopback
 */

#define IPV6_ADDR_ANY                0x0000U

#define IPV6_ADDR_UNICAST        0x0001U
#define IPV6_ADDR_MULTICAST        0x0002U

#define IPV6_ADDR_LOOPBACK        0x0010U
#define IPV6_ADDR_LINKLOCAL        0x0020U
#define IPV6_ADDR_SITELOCAL        0x0040U

#define IPV6_ADDR_COMPATv4        0x0080U

#define IPV6_ADDR_SCOPE_MASK        0x00f0U

#define IPV6_ADDR_MAPPED        0x1000U

/*
 *        Addr scopes
 */
#define IPV6_ADDR_MC_SCOPE(a)        \
        ((a)->s6_addr[1] & 0x0f)        /* nonstandard */
#define __IPV6_ADDR_SCOPE_INVALID        -1
#define IPV6_ADDR_SCOPE_NODELOCAL        0x01
#define IPV6_ADDR_SCOPE_LINKLOCAL        0x02
#define IPV6_ADDR_SCOPE_SITELOCAL        0x05
#define IPV6_ADDR_SCOPE_ORGLOCAL        0x08
#define IPV6_ADDR_SCOPE_GLOBAL                0x0e

/*
 *        Addr flags
 */
#define IPV6_ADDR_MC_FLAG_TRANSIENT(a)        \
        ((a)->s6_addr[1] & 0x10)
#define IPV6_ADDR_MC_FLAG_PREFIX(a)        \
        ((a)->s6_addr[1] & 0x20)
#define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a)        \
        ((a)->s6_addr[1] & 0x40)

/*
 *        fragmentation header
 */

/*
 * NOTE(review): field names suggest this mirrors the on-wire IPv6 fragment
 * extension header -- confirm the layout against the RFC.
 */
struct frag_hdr {
        __u8        nexthdr;
        __u8        reserved;
        __be16        frag_off;        /* big-endian; presumably used with IP6_OFFSET / IP6_MF -- confirm */
        __be32        identification;        /* big-endian */
};

/*
 * Jumbo payload option, as described in RFC 2675 2.
 */
/*
 * ipv6_has_hopopt_jumbo() overlays this struct directly behind the IPv6
 * header and only accepts it when tlv_type is IPV6_TLV_JUMBO, hdrlen is 0
 * and nexthdr is IPPROTO_TCP. ipv6_hopopt_jumbo_remove() strips exactly
 * sizeof(struct hop_jumbo_hdr) bytes.
 */
struct hop_jumbo_hdr {
        u8        nexthdr;
        u8        hdrlen;
        u8        tlv_type;        /* IPV6_TLV_JUMBO, 0xC2 */
        u8        tlv_len;        /* 4 */
        __be32        jumbo_payload_len;        /* big-endian */
};

#define        IP6_MF                0x0001
#define        IP6_OFFSET        0xFFF8

/*
 * Iteration state shared by ip6_fraglist_init(), ip6_fraglist_prepare()
 * and ip6_fraglist_next().
 */
struct ip6_fraglist_iter {
        struct ipv6hdr        *tmp_hdr;
        struct sk_buff        *frag;        /* skb returned by the next ip6_fraglist_next() call */
        int                offset;
        unsigned int        hlen;
        __be32                frag_id;
        u8                nexthdr;
};

int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter);
void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter);

/*
 * Hand out the fragment the iterator currently points at and advance the
 * iterator to its successor. The returned skb is marked as not on a list.
 */
static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter)
{
        struct sk_buff *cur = iter->frag;
        struct sk_buff *following = cur->next;

        /* Successor was captured above, before the link is cleared. */
        skb_mark_not_on_list(cur);
        iter->frag = following;

        return cur;
}

/*
 * State object passed to ip6_frag_init() and ip6_frag_next().
 * NOTE(review): prevhdr, hlen, mtu, nexthdr and frag_id share their names
 * with ip6_frag_init() parameters and are presumably seeded from them --
 * confirm in the implementation.
 */
struct ip6_frag_state {
        u8                *prevhdr;
        unsigned int        hlen;
        unsigned int        mtu;
        unsigned int        left;
        int                offset;
        int                ptr;
        int                hroom;
        int                troom;
        __be32                frag_id;
        u8                nexthdr;
};

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state);
struct sk_buff *ip6_frag_next(struct sk_buff *skb,
                              struct ip6_frag_state *state);

/* Evaluates to @mark when the fwmark_reflect sysctl of @net is non-zero, otherwise to 0. */
#define IP6_REPLY_MARK(net, mark) \
        ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0)

#include <net/sock.h>

/* sysctls */
extern int sysctl_mld_max_msf;
extern int sysctl_mld_qrv;

/*
 * Increment @field in the per-device stats (only when @idev is non-NULL)
 * and always in the (net)->mib table. @mod is pasted in front of
 * SNMP_INC_STATS64; the wrappers in this file pass either nothing or "__".
 * @field is expanded twice.
 */
#define _DEVINC(net, statname, mod, idev, field)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\
        mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\
})

/*
 * per device counters are atomic_long_t
 *
 * The device side (skipped when @idev is NULL) goes through
 * SNMP_INC_STATS_ATOMIC_LONG on stats.<statname>dev; the (net)->mib side
 * uses the @mod-prefixed SNMP_INC_STATS helper.
 */
#define _DEVINCATOMIC(net, statname, mod, idev, field)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \
        mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\
})

/*
 * per device and per net counters are atomic_long_t
 *
 * Both increments use SNMP_INC_STATS_ATOMIC_LONG; the device one is
 * skipped when @idev is NULL. @field is expanded twice.
 */
#define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field)                \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \
        SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\
})

/*
 * Add @val to @field in the per-device stats (only when @idev is non-NULL)
 * and always in the (net)->mib table. @field and @val are each evaluated
 * once into locals before use.
 */
#define _DEVADD(net, statname, mod, idev, field, val)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        unsigned long _field = (field);                                        \
        unsigned long _val = (val);                                        \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_ADD_STATS((_idev)->stats.statname, _field,  _val); \
        mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, _field, _val);\
})

/*
 * Apply the @mod-prefixed SNMP_UPD_PO_STATS helper with @field and @val to
 * the per-device stats (only when @idev is non-NULL) and always to the
 * (net)->mib table. @val is evaluated once; @field is passed through as a
 * raw token to both helper invocations.
 */
#define _DEVUPD(net, statname, mod, idev, field, val)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        unsigned long _val = (val);                                        \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, _val); \
        mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, _val);\
})

/* MIBs */

#define IP6_INC_STATS(net, idev,field)                \
                _DEVINC(net, ipv6, , idev, field)
#define __IP6_INC_STATS(net, idev,field)        \
                _DEVINC(net, ipv6, __, idev, field)
#define IP6_ADD_STATS(net, idev,field,val)        \
                _DEVADD(net, ipv6, , idev, field, val)
#define __IP6_ADD_STATS(net, idev,field,val)        \
                _DEVADD(net, ipv6, __, idev, field, val)
#define IP6_UPD_PO_STATS(net, idev,field,val)   \
                _DEVUPD(net, ipv6, , idev, field, val)
#define __IP6_UPD_PO_STATS(net, idev,field,val)   \
                _DEVUPD(net, ipv6, __, idev, field, val)
#define ICMP6_INC_STATS(net, idev, field)        \
                _DEVINCATOMIC(net, icmpv6, , idev, field)
#define __ICMP6_INC_STATS(net, idev, field)        \
                _DEVINCATOMIC(net, icmpv6, __, idev, field)

/*
 * Count an outgoing ICMPv6 message: same counters as ICMP6MSGIN_INC_STATS
 * but with the index shifted up by 256.
 * @field is parenthesized so that an argument containing a low-precedence
 * operator (e.g. a conditional expression) is offset as a whole.
 */
#define ICMP6MSGOUT_INC_STATS(net, idev, field)                \
        _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, (field) + 256)
#define ICMP6MSGIN_INC_STATS(net, idev, field)        \
        _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field)

/*
 * Node of a singly linked list (via ->next); the list head is the global
 * ip6_ra_chain pointer declared below.
 * NOTE(review): presumably protected by ip6_ra_lock -- confirm.
 */
struct ip6_ra_chain {
        struct ip6_ra_chain        *next;
        struct sock                *sk;
        int                        sel;
        void                        (*destructor)(struct sock *);
};

extern struct ip6_ra_chain        *ip6_ra_chain;
extern rwlock_t ip6_ra_lock;

/*
   This structure is prepared by protocol, when parsing
   ancillary data and passed to IPv6.
 */

struct ipv6_txoptions {
        /* Reference count: taken by txopt_get(), dropped by txopt_put(). */
        refcount_t                refcnt;
        /* Length of this structure */
        int                        tot_len;

        /* length of extension headers   */

        __u16                        opt_flen;        /* after fragment hdr */
        __u16                        opt_nflen;        /* before fragment hdr */

        struct ipv6_opt_hdr        *hopopt;
        struct ipv6_opt_hdr        *dst0opt;
        struct ipv6_rt_hdr        *srcrt;        /* Routing Header */
        struct ipv6_opt_hdr        *dst1opt;
        /* Used by txopt_put() to free the object via kfree_rcu(). */
        struct rcu_head                rcu;
        /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */
};

/*
 * flowlabel_reflect sysctl values
 *
 * NOTE(review): the values are distinct powers of two, so they can
 * presumably be OR-ed together as a bitmask -- confirm against the sysctl
 * handler.
 */
enum flowlabel_reflect {
        FLOWLABEL_REFLECT_ESTABLISHED                = 1,
        FLOWLABEL_REFLECT_TCP_RESET                = 2,
        FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES        = 4,
};

/* One flow label entry; entries are chained through the RCU-annotated ->next. */
struct ip6_flowlabel {
        struct ip6_flowlabel __rcu *next;
        __be32                        label;        /* big-endian */
        atomic_t                users;        /* decremented by fl6_sock_release() */
        struct in6_addr                dst;
        struct ipv6_txoptions        *opt;
        unsigned long                linger;
        struct rcu_head                rcu;
        u8                        share;
        /* NOTE(review): which union member is valid presumably depends on ->share -- confirm. */
        union {
                struct pid *pid;
                kuid_t uid;
        } owner;
        unsigned long                lastuse;
        unsigned long                expires;
        struct net                *fl_net;
};

#define IPV6_FLOWINFO_MASK                cpu_to_be32(0x0FFFFFFF)
#define IPV6_FLOWLABEL_MASK                cpu_to_be32(0x000FFFFF)
#define IPV6_FLOWLABEL_STATELESS_FLAG        cpu_to_be32(0x00080000)

#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
#define IPV6_TCLASS_SHIFT        20

/* List node (RCU-annotated ->next) that points at one struct ip6_flowlabel. */
struct ipv6_fl_socklist {
        struct ipv6_fl_socklist        __rcu        *next;
        struct ip6_flowlabel                *fl;
        struct rcu_head                        rcu;
};

/*
 * Defaults are filled in by ipcm6_init_sk(): hlimit = -1, tclass and
 * dontfrag taken from the socket, sockc set up by sockcm_init(), and all
 * remaining members zero.
 */
struct ipcm6_cookie {
        struct sockcm_cookie sockc;
        __s16 hlimit;
        __s16 tclass;
        __u16 gso_size;
        __s8  dontfrag;
        struct ipv6_txoptions *opt;
};

static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6,
                                 const struct sock *sk)
{
        *ipc6 = (struct ipcm6_cookie) {
                .hlimit = -1,
                .tclass = inet6_sk(sk)->tclass,
                .dontfrag = inet6_test_bit(DONTFRAG, sk),
        };

        sockcm_init(&ipc6->sockc, sk);
}

/*
 * Return np->opt with an extra reference held, or NULL when there are no
 * options or their refcount has already dropped to zero.
 */
static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
{
        struct ipv6_txoptions *held = NULL;
        struct ipv6_txoptions *cand;

        rcu_read_lock();
        cand = rcu_dereference(np->opt);
        /* Only hand the pointer out once a reference has been obtained. */
        if (cand && refcount_inc_not_zero(&cand->refcnt))
                held = rcu_pointer_handoff(cand);
        rcu_read_unlock();

        return held;
}

/* Drop one reference on @opt (NULL is accepted); the last put frees it via kfree_rcu(). */
static inline void txopt_put(struct ipv6_txoptions *opt)
{
        if (!opt)
                return;

        if (refcount_dec_and_test(&opt->refcnt))
                kfree_rcu(opt, rcu);
}

#if IS_ENABLED(CONFIG_IPV6)
struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label);

extern struct static_key_false_deferred ipv6_flowlabel_exclusive;
/*
 * Returns NULL without any lookup unless the ipv6_flowlabel_exclusive
 * static key is on and the socket's netns has flowlabel_has_excl set.
 * Otherwise returns the result of __fl6_sock_lookup(), or
 * ERR_PTR(-ENOENT) when that lookup yields NULL.
 */
static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk,
                                                    __be32 label)
{
        struct ip6_flowlabel *fl;

        if (!static_branch_unlikely(&ipv6_flowlabel_exclusive.key))
                return NULL;
        if (!READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl))
                return NULL;

        fl = __fl6_sock_lookup(sk, label);
        return fl ? fl : ERR_PTR(-ENOENT);
}
#endif

struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
                                         struct ip6_flowlabel *fl,
                                         struct ipv6_txoptions *fopt);
void fl6_free_socklist(struct sock *sk);
int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen);
int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
                           int flags);
int ip6_flowlabel_init(void);
void ip6_flowlabel_cleanup(void);
bool ip6_autoflowlabel(struct net *net, const struct sock *sk);

/* Decrement the user count of @fl; a NULL @fl is ignored. */
static inline void fl6_sock_release(struct ip6_flowlabel *fl)
{
        if (!fl)
                return;

        atomic_dec(&fl->users);
}

enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
                                   u8 code, __be32 info);

void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
                                struct icmp6hdr *thdr, int len);

int ip6_ra_control(struct sock *sk, int sel);

int ipv6_parse_hopopts(struct sk_buff *skb);

struct ipv6_txoptions *ipv6_dup_options(struct sock *sk,
                                        struct ipv6_txoptions *opt);
struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
                                          struct ipv6_txoptions *opt,
                                          int newtype,
                                          struct ipv6_opt_hdr *newopt);
struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
                                            struct ipv6_txoptions *opt);

/* NULL-tolerant wrapper: a NULL @opt yields NULL, anything else goes to __ipv6_fixup_options(). */
static inline struct ipv6_txoptions *
ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt)
{
        return opt ? __ipv6_fixup_options(opt_space, opt) : NULL;
}

bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
                       const struct inet6_skb_parm *opt);
struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
                                           struct ipv6_txoptions *opt);

/* This helper is specialized for BIG TCP needs.
 * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header.
 * It assumes headers are already in skb->head.
 * Returns: 0, or IPPROTO_TCP if a BIG TCP packet is there.
 */
static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb)
{
        const struct hop_jumbo_hdr *jhdr;
        const struct ipv6hdr *nhdr;

        if (likely(skb->len <= GRO_LEGACY_MAX_SIZE))
                return 0;

        if (skb->protocol != htons(ETH_P_IPV6))
                return 0;

        if (skb_network_offset(skb) +
            sizeof(struct ipv6hdr) +
            sizeof(struct hop_jumbo_hdr) > skb_headlen(skb))
                return 0;

        nhdr = ipv6_hdr(skb);

        if (nhdr->nexthdr != NEXTHDR_HOP)
                return 0;

        jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1);
        if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 ||
            jhdr->nexthdr != IPPROTO_TCP)
                return 0;
        return jhdr->nexthdr;
}

/* Return 0 if HBH header is successfully removed
 * Or if HBH removal is unnecessary (packet is not big TCP)
 * Return error to indicate dropping the packet
 */
static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb)
{
        const int hophdr_len = sizeof(struct hop_jumbo_hdr);
        int nexthdr = ipv6_has_hopopt_jumbo(skb);
        struct ipv6hdr *h6;

        if (!nexthdr)
                return 0;

        if (skb_cow_head(skb, 0))
                return -1;

        /* Remove the HBH header.
         * Layout: [Ethernet header][IPv6 header][HBH][L4 Header]
         */
        memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb),
                skb_network_header(skb) - skb_mac_header(skb) +
                sizeof(struct ipv6hdr));

        __skb_pull(skb, hophdr_len);
        skb->network_header += hophdr_len;
        skb->mac_header += hophdr_len;

        h6 = ipv6_hdr(skb);
        h6->nexthdr = nexthdr;

        return 0;
}

/*
 * Tell whether Router Advertisements are accepted on @idev, based on its
 * accept_ra and forwarding settings.
 */
static inline bool ipv6_accept_ra(const struct inet6_dev *idev)
{
        const s32 ra_mode = READ_ONCE(idev->cnf.accept_ra);

        /*
         * With forwarding enabled, RAs are only accepted in the special
         * hybrid mode (accept_ra=2).
         */
        if (READ_ONCE(idev->cnf.forwarding))
                return ra_mode == 2;

        /* Otherwise any non-zero accept_ra enables acceptance. */
        return ra_mode != 0;
}

#define IPV6_FRAG_HIGH_THRESH        (4 * 1024*1024)        /* 4194304 */
#define IPV6_FRAG_LOW_THRESH        (3 * 1024*1024)        /* 3145728 */
#define IPV6_FRAG_TIMEOUT        (60 * HZ)        /* 60 seconds */

int __ipv6_addr_type(const struct in6_addr *addr);
/* Low 16 bits of __ipv6_addr_type(); the bits above them are masked off. */
static inline int ipv6_addr_type(const struct in6_addr *addr)
{
        const int full = __ipv6_addr_type(addr);

        return full & 0xffff;
}

/* Only the IPV6_ADDR_SCOPE_MASK bits of __ipv6_addr_type(). */
static inline int ipv6_addr_scope(const struct in6_addr *addr)
{
        const int full = __ipv6_addr_type(addr);

        return full & IPV6_ADDR_SCOPE_MASK;
}

/*
 * Extract the source scope stored above bit 15 of a full address type.
 * The "any" type has no scope and yields __IPV6_ADDR_SCOPE_INVALID.
 */
static inline int __ipv6_addr_src_scope(int type)
{
        if (type == IPV6_ADDR_ANY)
                return __IPV6_ADDR_SCOPE_INVALID;

        return type >> 16;
}

/* Source scope of @addr: classify the address, then extract the scope part. */
static inline int ipv6_addr_src_scope(const struct in6_addr *addr)
{
        const int full = __ipv6_addr_type(addr);

        return __ipv6_addr_src_scope(full);
}

static inline bool __ipv6_addr_needs_scope_id(int type)
{
        return type & IPV6_ADDR_LINKLOCAL ||
               (type & IPV6_ADDR_MULTICAST &&
                (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)));
}

/* @iface when the type of @addr needs a scope id, 0 otherwise. */
static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface)
{
        if (__ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)))
                return iface;

        return 0;
}

/* Bytewise memcmp() over the whole address: zero when equal, otherwise the memcmp() sign. */
static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2)
{
        const size_t nbytes = sizeof(*a1);

        return memcmp(a1, a2, nbytes);
}

static inline bool
ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m,
                     const struct in6_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *ul1 = (const unsigned long *)a1;
        const unsigned long *ulm = (const unsigned long *)m;
        const unsigned long *ul2 = (const unsigned long *)a2;

        return !!(((ul1[0] ^ ul2[0]) & ulm[0]) |
                  ((ul1[1] ^ ul2[1]) & ulm[1]));
#else
        return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) |
                  ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) |
                  ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) |
                  ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3]));
#endif
}

/*
 * Store the first @plen bits of @addr in @pfx and clear every remaining
 * bit of @pfx.
 */
static inline void ipv6_addr_prefix(struct in6_addr *pfx,
                                    const struct in6_addr *addr,
                                    int plen)
{
        /* caller must guarantee 0 <= plen <= 128 */
        const int whole = plen >> 3;        /* complete bytes in the prefix */
        const int rem = plen & 0x7;        /* leftover bits in the next byte */

        memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr));
        memcpy(pfx->s6_addr, addr, whole);

        /* Keep only the top @rem bits of the partially covered byte. */
        if (rem)
                pfx->s6_addr[whole] = addr->s6_addr[whole] & (0xff00 >> rem);
}

/*
 * Overwrite the first @plen bits of @addr with the first @plen bits of
 * @pfx; the remaining bits of @addr are left untouched.
 */
static inline void ipv6_addr_prefix_copy(struct in6_addr *addr,
                                         const struct in6_addr *pfx,
                                         int plen)
{
        /* caller must guarantee 0 <= plen <= 128 */
        const int whole_bytes = plen >> 3;
        const int tail_bits = plen & 0x7;

        memcpy(addr->s6_addr, pfx, whole_bytes);

        if (tail_bits) {
                /* Low byte of (0xff00 >> n) has its top n bits set. */
                const int keep = 0xff00 >> tail_bits;

                addr->s6_addr[whole_bytes] &= ~keep;
                addr->s6_addr[whole_bytes] |= pfx->s6_addr[whole_bytes] & keep;
        }
}

/*
 * Store two consecutive 32-bit big-endian words: addr[0] = @wh, addr[1] = @wl.
 *
 * On 64-bit builds with efficient unaligned access, and only when both
 * words are compile-time constants, the pair is written with a single
 * 64-bit store. The operand placed in the upper 32 bits is chosen per
 * endianness so that the resulting memory layout is the same as the two
 * 32-bit stores at the bottom of the function.
 */
static inline void __ipv6_addr_set_half(__be32 *addr,
                                        __be32 wh, __be32 wl)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
#if defined(__BIG_ENDIAN)
        /* Big endian: the upper half of the u64 lands at the lower address. */
        if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) {
                *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl));
                return;
        }
#elif defined(__LITTLE_ENDIAN)
        /* Little endian: the lower half of the u64 lands at the lower address. */
        if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) {
                *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh));
                return;
        }
#endif
#endif
        /* Generic path: two plain 32-bit stores. */
        addr[0] = wh;
        addr[1] = wl;
}

/*
 * Fill @addr from four big-endian 32-bit words, @w1 being the first word
 * in memory and @w4 the last.
 */
static inline void ipv6_addr_set(struct in6_addr *addr,
                                     __be32 w1, __be32 w2,
                                     __be32 w3, __be32 w4)
{
        __be32 *words = addr->s6_addr32;

        __ipv6_addr_set_half(words, w1, w2);
        __ipv6_addr_set_half(words + 2, w3, w4);
}

static inline bool ipv6_addr_equal(const struct in6_addr *a1,
                                   const struct in6_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *ul1 = (const unsigned long *)a1;
        const unsigned long *ul2 = (const unsigned long *)a2;

        return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL;
#else
        return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) |
                (a1->s6_addr32[1] ^ a2->s6_addr32[1]) |
                (a1->s6_addr32[2] ^ a2->s6_addr32[2]) |
                (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0;
#endif
}

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
/*
 * Compare the leading @len bits of two big-endian 64-bit words.
 * @len == 0 compares nothing and yields true.
 */
static inline bool __ipv6_prefix_equal64_half(const __be64 *a1,
                                              const __be64 *a2,
                                              unsigned int len)
{
        __be64 mask;

        if (!len)
                return true;

        /* Mask with the top @len bits set, in network byte order. */
        mask = cpu_to_be64((~0UL) << (64 - len));

        return ((*a1 ^ *a2) & mask) == 0;
}

/* Return true when the first @prefixlen bits of both addresses match. */
static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
                                     const struct in6_addr *addr2,
                                     unsigned int prefixlen)
{
        const __be64 *a1 = (const __be64 *)addr1;
        const __be64 *a2 = (const __be64 *)addr2;

        /* Prefix lies entirely inside the first 64-bit half. */
        if (prefixlen < 64)
                return __ipv6_prefix_equal64_half(a1, a2, prefixlen);

        /* First half must match completely, then check the remainder. */
        if (a1[0] ^ a2[0])
                return false;

        return __ipv6_prefix_equal64_half(&a1[1], &a2[1], prefixlen - 64);
}
#else
/* Return true when the first @prefixlen bits of both addresses match. */
static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
                                     const struct in6_addr *addr2,
                                     unsigned int prefixlen)
{
        const __be32 *a1 = addr1->s6_addr32;
        const __be32 *a2 = addr2->s6_addr32;
        /* number of complete u32 words in the prefix */
        const unsigned int whole_words = prefixlen >> 5;
        /* number of prefix bits in the following, incomplete u32 */
        const unsigned int tail_bits = prefixlen & 0x1f;

        if (whole_words && memcmp(a1, a2, whole_words << 2))
                return false;

        if (!tail_bits)
                return true;

        return ((a1[whole_words] ^ a2[whole_words]) &
                htonl((0xffffffff) << (32 - tail_bits))) == 0;
}
#endif

static inline bool ipv6_addr_any(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *ul = (const unsigned long *)a;

        return (ul[0] | ul[1]) == 0UL;
#else
        return (a->s6_addr32[0] | a->s6_addr32[1] |
                a->s6_addr32[2] | a->s6_addr32[3]) == 0;
#endif
}

static inline u32 ipv6_addr_hash(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *ul = (const unsigned long *)a;
        unsigned long x = ul[0] ^ ul[1];

        return (u32)(x ^ (x >> 32));
#else
        return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^
                             a->s6_addr32[2] ^ a->s6_addr32[3]);
#endif
}

/* more secured version of ipv6_addr_hash() */
static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval)
{
        return jhash2((__force const u32 *)a->s6_addr32,
                      ARRAY_SIZE(a->s6_addr32), initval);
}

static inline bool ipv6_addr_loopback(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const __be64 *be = (const __be64 *)a;

        return (be[0] | (be[1] ^ cpu_to_be64(1))) == 0UL;
#else
        return (a->s6_addr32[0] | a->s6_addr32[1] |
                a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1))) == 0;
#endif
}

/*
 * Return true when the first 64 bits of @a are zero and the third 32-bit
 * word equals big-endian 0x0000ffff; the last word is not examined.
 *
 * Note that we must __force cast these to unsigned long to make sparse happy,
 * since all of the endian-annotated types are fixed size regardless of arch.
 */
static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
{
        unsigned long mismatch;

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* First 64 bits read as one native word. */
        mismatch = *(const unsigned long *)a;
#else
        mismatch = (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]);
#endif
        mismatch |= (__force unsigned long)(a->s6_addr32[2] ^
                                            cpu_to_be32(0x0000ffff));

        return mismatch == 0UL;
}

static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a)
{
        return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]);
}

/*
 * Hash an (address, port) pair, seeded with net_hash_mix(@net).
 * The address part is hashed as:
 *   - the single word 0 for the all-zero address,
 *   - the last 32-bit word only for a v4-mapped address,
 *   - all four words otherwise.
 * The result is XOR-ed with @port.
 */
static inline u32 ipv6_portaddr_hash(const struct net *net,
                                     const struct in6_addr *addr6,
                                     unsigned int port)
{
        const unsigned int mix = net_hash_mix(net);

        if (ipv6_addr_any(addr6))
                return jhash_1word(0, mix) ^ port;

        if (ipv6_addr_v4mapped(addr6))
                return jhash_1word((__force u32)addr6->s6_addr32[3], mix) ^ port;

        return jhash2((__force u32 *)addr6->s6_addr32, 4, mix) ^ port;
}

/*
 * Check for a RFC 4843 ORCHID address
 * (Overlay Routable Cryptographic Hash Identifiers)
 */
static inline bool ipv6_addr_orchid(const struct in6_addr *a)
{
        return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010);
}

static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr)
{
        return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000);
}

/*
 * Build in @v4mapped the address whose words are 0, 0, big-endian
 * 0x0000FFFF and @addr.
 */
static inline void ipv6_addr_set_v4mapped(const __be32 addr,
                                          struct in6_addr *v4mapped)
{
        const __be32 marker = htonl(0x0000FFFF);

        ipv6_addr_set(v4mapped, 0, 0, marker, addr);
}

/*
 * find the first different bit between two addresses
 * length of address must be a multiple of 32bits
 *
 * @addrlen is in bytes. Returns the zero-based index, counted from the
 * most significant bit of the first word, of the first differing bit, or
 * the total number of compared bits when no difference is found.
 */
static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen)
{
        const __be32 *lhs = token1, *rhs = token2;
        const int nwords = addrlen >> 2;
        int w;

        for (w = 0; w < nwords; w++) {
                const __be32 delta = lhs[w] ^ rhs[w];

                if (!delta)
                        continue;

                /* __fls() gives the highest set bit; convert to MSB-first index. */
                return w * 32 + 31 - __fls(ntohl(delta));
        }

        /*
         *        we should *never* get to this point since that
         *        would mean the addrs are equal
         *
         *        However, we do get to it 8) And exactly, when
         *        addresses are equal 8)
         *
         *        ip route add 1111::/128 via ...
         *        ip route add 1111::/64 via ...
         *        and we are here.
         *
         *        Ideally, this function should stop comparison
         *        at prefix length. It does not, but it is still OK,
         *        if returned value is greater than prefix length.
         *                                        --ANK (980803)
         */
        return nwords << 5;
}

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
/*
 * 64-bit-word variant of __ipv6_addr_diff32(): @addrlen is in bytes and is
 * consumed in 8-byte steps. Returns the MSB-first index of the first
 * differing bit, or the number of compared bits when none differs.
 */
static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen)
{
        const __be64 *lhs = token1, *rhs = token2;
        const int nwords = addrlen >> 3;
        int w;

        for (w = 0; w < nwords; w++) {
                const __be64 delta = lhs[w] ^ rhs[w];

                if (!delta)
                        continue;

                return w * 64 + 63 - __fls(be64_to_cpu(delta));
        }

        return nwords << 6;
}
#endif

/*
 * Find the first differing bit between two @addrlen-byte tokens.
 *
 * Uses the 64-bit word variant only when it is compiled in and @addrlen is
 * a compile-time constant that is a multiple of 8; every other case goes
 * through the 32-bit word variant.
 */
static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        if (__builtin_constant_p(addrlen) && !(addrlen & 7))
                return __ipv6_addr_diff64(token1, token2, addrlen);
#endif
        return __ipv6_addr_diff32(token1, token2, addrlen);
}

static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2)
{
        return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
}

/*
 * Identifier selection; both return a big-endian 32-bit value.
 * NOTE(review): presumably the fragment identification — confirm against
 * the definitions, which are not in this header.
 */
__be32 ipv6_select_ident(struct net *net,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr);
__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);

/* Hop limit for @dst; used as the fallback by ip6_sk_dst_hoplimit(). */
int ip6_dst_hoplimit(struct dst_entry *dst);

/*
 * Pick the hop limit for a flow: the socket's multicast value when the
 * destination is multicast, its unicast value otherwise. A negative socket
 * value defers to ip6_dst_hoplimit(@dst).
 */
static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
                                      struct dst_entry *dst)
{
        int hlimit = ipv6_addr_is_multicast(&fl6->daddr) ?
                     READ_ONCE(np->mcast_hops) :
                     READ_ONCE(np->hop_limit);

        if (hlimit >= 0)
                return hlimit;

        return ip6_dst_hoplimit(dst);
}

/* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store
 * Equivalent to :        flow->v6addrs.src = iph->saddr;
 *                        flow->v6addrs.dst = iph->daddr;
 *
 * Also tags the flow keys as carrying IPv6 addresses.
 */
static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
                                            const struct ipv6hdr *iph)
{
        /* The single memcpy() below relies on dst directly following src. */
        BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) !=
                     offsetof(typeof(flow->addrs), v6addrs.src) +
                     sizeof(flow->addrs.v6addrs.src));

        flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs));
}

#if IS_ENABLED(CONFIG_IPV6)

static inline bool ipv6_can_nonlocal_bind(struct net *net,
                                          struct inet_sock *inet)
{
        return net->ipv6.sysctl.ip_nonlocal_bind ||
                test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) ||
                test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags);
}

/* Sysctl settings for net ipv6.auto_flowlabels
 *
 * As consumed by ip6_make_flowlabel() and ip6_default_np_autolabel():
 *   OFF    - ip6_make_flowlabel() never generates a label;
 *            default autolabel is 0
 *   OPTOUT - default autolabel is 1
 *   OPTIN  - default autolabel is 0
 *   FORCED - default autolabel is 1, and a label is generated even when
 *            the caller passes autolabel == false
 */
#define IP6_AUTO_FLOW_LABEL_OFF                0
#define IP6_AUTO_FLOW_LABEL_OPTOUT        1
#define IP6_AUTO_FLOW_LABEL_OPTIN        2
#define IP6_AUTO_FLOW_LABEL_FORCED        3

/* Highest defined setting. */
#define IP6_AUTO_FLOW_LABEL_MAX                IP6_AUTO_FLOW_LABEL_FORCED

/* Default setting. */
#define IP6_DEFAULT_AUTO_FLOW_LABELS        IP6_AUTO_FLOW_LABEL_OPTOUT

/*
 * Return the flow label to put on the wire.
 *
 * A non-zero label already present in @flowlabel is returned as is (masked
 * to the label bits). Otherwise a label is derived from the skb flow hash,
 * unless the auto_flowlabels sysctl is OFF, or @autolabel is false and the
 * sysctl is not FORCED; in those cases the (zero) label is returned.
 */
static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
                                        __be32 flowlabel, bool autolabel,
                                        struct flowi6 *fl6)
{
        u32 hash;

        /* @flowlabel may include more than a flow label, eg, the traffic class.
         * Here we want only the flow label value.
         */
        flowlabel &= IPV6_FLOWLABEL_MASK;

        /* Caller supplied a label: keep it. */
        if (flowlabel)
                return flowlabel;

        if (net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF)
                return flowlabel;

        if (!autolabel &&
            net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)
                return flowlabel;

        /* Since this is being sent on the wire obfuscate hash a bit
         * to minimize possibility that any useful information to an
         * attacker is leaked. Only lower 20 bits are relevant.
         */
        hash = rol32(skb_get_hash_flowi6(skb, fl6), 16);

        flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;

        if (net->ipv6.sysctl.flowlabel_state_ranges)
                flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;

        return flowlabel;
}

/*
 * Default autolabel value derived from the auto_flowlabels sysctl:
 * 1 for OPTOUT and FORCED, 0 for every other value.
 */
static inline int ip6_default_np_autolabel(struct net *net)
{
        switch (net->ipv6.sysctl.auto_flowlabels) {
        case IP6_AUTO_FLOW_LABEL_OPTOUT:
        case IP6_AUTO_FLOW_LABEL_FORCED:
                return 1;
        default:
                /* covers OFF, OPTIN and any unknown value */
                return 0;
        }
}
#else
/* Without CONFIG_IPV6: hand back @flowlabel unchanged (no masking, no hashing). */
static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
                                        __be32 flowlabel, bool autolabel,
                                        struct flowi6 *fl6)
{
        return flowlabel;
}
/* Without CONFIG_IPV6: the default autolabel is always 0. */
static inline int ip6_default_np_autolabel(struct net *net)
{
        return 0;
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* Read the per-namespace multipath_hash_policy sysctl value. */
static inline int ip6_multipath_hash_policy(const struct net *net)
{
        return net->ipv6.sysctl.multipath_hash_policy;
}
/* Read the per-namespace multipath_hash_fields sysctl value. */
static inline u32 ip6_multipath_hash_fields(const struct net *net)
{
        return net->ipv6.sysctl.multipath_hash_fields;
}
#else
/* Without CONFIG_IPV6 both accessors report 0. */
static inline int ip6_multipath_hash_policy(const struct net *net)
{
        return 0;
}
static inline u32 ip6_multipath_hash_fields(const struct net *net)
{
        return 0;
}
#endif

/*
 *        Header manipulation
 */
/*
 * Write the first 32-bit word of the IPv6 header: 0x6 in the top nibble,
 * @tclass shifted left by 20 bits, OR-ed with @flowlabel (which is already
 * in network byte order).
 */
static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass,
                                __be32 flowlabel)
{
        __be32 *first_word = (__be32 *)hdr;

        *first_word = htonl(0x60000000 | (tclass << 20)) | flowlabel;
}

static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr)
{
        return *(__be32 *)hdr & IPV6_FLOWINFO_MASK;
}

static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr)
{
        return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK;
}

/*
 * Extract the traffic class from a network-order flowinfo word: mask,
 * convert to host order, then shift down by IPV6_TCLASS_SHIFT.
 */
static inline u8 ip6_tclass(__be32 flowinfo)
{
        const __be32 tclass_bits = flowinfo & IPV6_TCLASS_MASK;

        return ntohl(tclass_bits) >> IPV6_TCLASS_SHIFT;
}

/* Traffic class of @flowinfo converted with inet_dsfield_to_dscp(). */
static inline dscp_t ip6_dscp(__be32 flowinfo)
{
        const u8 tclass = ip6_tclass(flowinfo);

        return inet_dsfield_to_dscp(tclass);
}

/*
 * Combine a host-order @tclass (shifted by IPV6_TCLASS_SHIFT and converted
 * to network order) with a network-order @flowlabel.
 */
static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
{
        const __be32 tclass_bits = htonl(tclass << IPV6_TCLASS_SHIFT);

        return tclass_bits | flowlabel;
}

/* Flow label bits of @fl6->flowlabel (masked with IPV6_FLOWLABEL_MASK). */
static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6)
{
        const __be32 raw = fl6->flowlabel;

        return raw & IPV6_FLOWLABEL_MASK;
}

/*
 *        Prototypes exported by ipv6
 */

/*
 *        rcv function (called from netdevice level)
 */

/* Single-skb and list-based receive entry points; defined outside this header. */
int ipv6_rcv(struct sk_buff *skb, struct net_device *dev,
             struct packet_type *pt, struct net_device *orig_dev);
void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
                   struct net_device *orig_dev);

/* NOTE(review): presumably the continuation of the receive path — confirm. */
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);

/*
 *        upper-layer output functions
 */
/* Transmit @skb for flow @fl6 with the given mark, options, tclass and priority. */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority);

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);

/*
 * Corked-send API. @getfrag is a caller-supplied callback that copies
 * payload from @from into the buffer it is handed.
 */
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, size_t length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags);

int ip6_push_pending_frames(struct sock *sk);

void ip6_flush_pending_frames(struct sock *sk);

int ip6_send_skb(struct sk_buff *skb);

/*
 * Build one skb from a queue and its cork state; ip6_finish_skb() calls
 * this with the socket's own write queue and cork structures.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork);
/* Same @getfrag callback contract as ip6_append_data(); cork state is caller-provided. */
struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, size_t length, int transhdrlen,
                             struct ipcm6_cookie *ipc6,
                             struct rt6_info *rt, unsigned int flags,
                             struct inet_cork_full *cork);

/*
 * Build an skb from the socket's own write queue, using the cork state
 * stored in the socket's inet and inet6 parts.
 */
static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
{
        struct inet_cork_full *cork = &inet_sk(sk)->cork;
        struct inet6_cork *v6_cork = &inet6_sk(sk)->cork;

        return __ip6_make_skb(sk, &sk->sk_write_queue, cork, v6_cork);
}

/*
 * Destination lookup for flow @fl6. ip6_dst_lookup() hands the entry back
 * through @dst and returns an int status; the *_flow variants return the
 * entry directly.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6);
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst);
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected);
struct dst_entry *ip6_blackhole_route(struct net *net,
                                      struct dst_entry *orig_dst);

/*
 *        skb processing functions
 */

/* Per-skb output, forward and input handlers; defined outside this header. */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip6_forward(struct sk_buff *skb);
int ip6_input(struct sk_buff *skb);
int ip6_mc_input(struct sk_buff *skb);
/* NOTE(review): the _rcu suffix suggests an RCU read-side caller context — confirm. */
void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
                              bool have_final);

int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);

/*
 *        Extension header (options) processing
 */

/*
 * Push the extension headers described by @opt onto @skb. @proto is passed
 * by pointer and may therefore be updated by the callee.
 */
void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
                          u8 *proto, struct in6_addr **daddr_p,
                          struct in6_addr *saddr);
void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
                         u8 *proto);

/*
 * Skip extension headers beginning at offset @start; the next-header value
 * and fragment offset are passed by pointer (@nexthdrp, @frag_offp).
 */
int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp,
                     __be16 *frag_offp);

/* Predicate on a next-header value. */
bool ipv6_ext_hdr(u8 nexthdr);

/*
 * Single-bit flags, combinable with bitwise OR.
 * NOTE(review): presumably exchanged through the @fragflg argument of
 * ipv6_find_hdr() — confirm against its definition.
 */
enum {
        IP6_FH_F_FRAG                = (1 << 0),
        IP6_FH_F_AUTH                = (1 << 1),
        IP6_FH_F_SKIP_RH        = (1 << 2),
};

/* find specified header and get offset to it */
int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target,
                  unsigned short *fragoff, int *fragflg);

/* Look for a TLV of the given @type starting at @offset in @skb. */
int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type);

/* Returns a pointer to an in6_addr; see the definition for which one. */
struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
                                const struct ipv6_txoptions *opt,
                                struct in6_addr *orig);

/*
 *        socket options (ipv6_sockglue.c)
 */
/* Static key declared with a default of false. */
DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);

/* Socket option get/set entry points; the do_* variants take sockptr_t for both value and length. */
int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                       unsigned int optlen);
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                    unsigned int optlen);
int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
                       sockptr_t optval, sockptr_t optlen);
int ipv6_getsockopt(struct sock *sk, int level, int optname,
                    char __user *optval, int __user *optlen);

/* Datagram socket connect helpers. */
int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr,
                           int addr_len);
int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
                                 int addr_len);
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
void ip6_datagram_release_cb(struct sock *sk);

/* Error and path-MTU reporting to sockets. */
int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
                    int *addr_len);
int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
                     int *addr_len);
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
                     u32 info, u8 *payload);
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info);
void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);

/* inet6 socket lifetime and socket-operation handlers. */
void inet6_cleanup_sock(struct sock *sk);
void inet6_sock_destruct(struct sock *sk);
int inet6_release(struct socket *sock);
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
                  int peer);
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int inet6_compat_ioctl(struct socket *sock, unsigned int cmd,
                unsigned long arg);

int inet6_hash_connect(struct inet_timewait_death_row *death_row,
                              struct sock *sk);
int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                  int flags);

/*
 * reassembly.c
 *
 * NOTE(review): this heading looks stale — the declarations below are
 * proto_ops tables and multicast source-filter helpers, none of which is
 * reassembly-related by name. Confirm and retitle.
 */
extern const struct proto_ops inet6_stream_ops;
extern const struct proto_ops inet6_dgram_ops;
extern const struct proto_ops inet6_sockraw_ops;

struct group_source_req;
struct group_filter;

/* Multicast source filter manipulation on socket @sk. */
int ip6_mc_source(int add, int omode, struct sock *sk,
                  struct group_source_req *pgsr);
int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
                  struct sockaddr_storage *list);
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
                  sockptr_t optval, size_t ss_offset);

/*
 * procfs registration hooks, declared as init/exit pairs. Some take a
 * struct net, the others take no argument.
 */
#ifdef CONFIG_PROC_FS
int ac6_proc_init(struct net *net);
void ac6_proc_exit(struct net *net);
int raw6_proc_init(void);
void raw6_proc_exit(void);
int tcp6_proc_init(struct net *net);
void tcp6_proc_exit(struct net *net);
int udp6_proc_init(struct net *net);
void udp6_proc_exit(struct net *net);
int udplite6_proc_init(void);
void udplite6_proc_exit(void);
int ipv6_misc_proc_init(void);
void ipv6_misc_proc_exit(void);
int snmp6_register_dev(struct inet6_dev *idev);
int snmp6_unregister_dev(struct inet6_dev *idev);

#else
/*
 * Without CONFIG_PROC_FS only these four get no-op stubs (init/register
 * return 0); the remaining hooks above have no stub in this header.
 */
static inline int ac6_proc_init(struct net *net) { return 0; }
static inline void ac6_proc_exit(struct net *net) { }
static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; }
static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; }
#endif

/*
 * sysctl table constructors and registration; only declared when
 * CONFIG_SYSCTL is set (no stubs are provided here otherwise).
 */
#ifdef CONFIG_SYSCTL
struct ctl_table *ipv6_icmp_sysctl_init(struct net *net);
size_t ipv6_icmp_sysctl_table_size(void);
struct ctl_table *ipv6_route_sysctl_init(struct net *net);
size_t ipv6_route_sysctl_table_size(struct net *net);
int ipv6_sysctl_register(void);
void ipv6_sysctl_unregister(void);
#endif

/*
 * Join/leave multicast group @addr on interface @ifindex for socket @sk.
 * The _ssm variant additionally takes a filter @mode.
 */
int ipv6_sock_mc_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
                          const struct in6_addr *addr, unsigned int mode);
int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);

/*
 * Mark @sk as IPv6-only.
 *
 * Returns -EINVAL when the socket already has a non-zero inet_num, 0 on
 * success. Note that the inet_num check is performed before the socket
 * lock is taken; only the flag update happens under lock_sock().
 */
static inline int ip6_sock_set_v6only(struct sock *sk)
{
        if (inet_sk(sk)->inet_num)
                return -EINVAL;
        lock_sock(sk);
        sk->sk_ipv6only = true;
        release_sock(sk);
        return 0;
}

/* Set the RECVERR6 bit on @sk via inet6_set_bit(); the socket lock is not taken here. */
static inline void ip6_sock_set_recverr(struct sock *sk)
{
        inet6_set_bit(RECVERR6, sk);
}

#define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \
                              IPV6_PREFER_SRC_COA)

/*
 * Update the source-address preference bits stored in the socket's
 * srcprefs from the IPV6_PREFER_SRC_* flags in @val.
 *
 * @val is validated in three independent groups (PUBLIC/TMP/PUBTMP_DEFAULT,
 * HOME/COA, CGA/NONCGA); setting more than one flag of the same group
 * yields -EINVAL and leaves srcprefs untouched. Only the TMP, PUBLIC and
 * COA bits are ever stored; bits of a group that @val does not mention are
 * preserved. Returns 0 on success.
 */
static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val)
{
        const unsigned int pubtmp_bits = IPV6_PREFER_SRC_PUBLIC |
                                         IPV6_PREFER_SRC_TMP;
        /* bits of the old value to keep / new bits to set */
        unsigned int prefmask = ~IPV6_PREFER_SRC_MASK;
        unsigned int pref = 0;

        /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
        switch (val & (pubtmp_bits | IPV6_PREFER_SRC_PUBTMP_DEFAULT)) {
        case 0:
                break;
        case IPV6_PREFER_SRC_PUBLIC:
                pref |= IPV6_PREFER_SRC_PUBLIC;
                prefmask &= ~pubtmp_bits;
                break;
        case IPV6_PREFER_SRC_TMP:
                pref |= IPV6_PREFER_SRC_TMP;
                prefmask &= ~pubtmp_bits;
                break;
        case IPV6_PREFER_SRC_PUBTMP_DEFAULT:
                /* clear both stored bits, set neither */
                prefmask &= ~pubtmp_bits;
                break;
        default:
                return -EINVAL;
        }

        /* check HOME/COA conflicts */
        switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) {
        case 0:
                break;
        case IPV6_PREFER_SRC_HOME:
                /* HOME is represented by the COA bit being clear */
                prefmask &= ~IPV6_PREFER_SRC_COA;
                break;
        case IPV6_PREFER_SRC_COA:
                pref |= IPV6_PREFER_SRC_COA;
                break;
        default:
                return -EINVAL;
        }

        /* check CGA/NONCGA conflicts */
        switch (val & (IPV6_PREFER_SRC_CGA | IPV6_PREFER_SRC_NONCGA)) {
        case 0:
        case IPV6_PREFER_SRC_CGA:
        case IPV6_PREFER_SRC_NONCGA:
                /* validated only; nothing is stored for this group */
                break;
        default:
                return -EINVAL;
        }

        WRITE_ONCE(inet6_sk(sk)->srcprefs,
                   (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref);
        return 0;
}

/* Set the rxinfo receive-option bit of @sk while holding the socket lock. */
static inline void ip6_sock_set_recvpktinfo(struct sock *sk)
{
        lock_sock(sk);
        inet6_sk(sk)->rxopt.bits.rxinfo = true;
        release_sock(sk);
}

/* Number of 32-bit words handled by the two conversion helpers below. */
#define IPV6_ADDR_WORDS 4

/* Convert IPV6_ADDR_WORDS host-order words from @src to big-endian in @dst. */
static inline void ipv6_addr_cpu_to_be32(__be32 *dst, const u32 *src)
{
        cpu_to_be32_array(dst, src, IPV6_ADDR_WORDS);
}

/* Convert IPV6_ADDR_WORDS big-endian words from @src to host order in @dst. */
static inline void ipv6_addr_be32_to_cpu(u32 *dst, const __be32 *src)
{
        be32_to_cpu_array(dst, src, IPV6_ADDR_WORDS);
}

#endif /* _NET_IPV6_H */





































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_GENERIC_NETLINK_H
#define __NET_GENERIC_NETLINK_H

#include <linux/net.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <uapi/linux/genetlink.h>

/* NLMSG_DEFAULT_SIZE less the generic netlink header length (GENL_HDRLEN). */
#define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN)

/* Non-parallel generic netlink requests are serialized by a global lock. */
void genl_lock(void);
void genl_unlock(void);

/*
 * Module alias for a generic netlink family; @family must be a string
 * literal, since it is concatenated onto "-family-".
 */
#define MODULE_ALIAS_GENL_FAMILY(family) \
 MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family)

/* Binding to multicast group requires %CAP_NET_ADMIN */
#define GENL_MCAST_CAP_NET_ADMIN        BIT(0)
/* Binding to multicast group requires %CAP_SYS_ADMIN */
#define GENL_MCAST_CAP_SYS_ADMIN        BIT(1)

/**
 * struct genl_multicast_group - generic netlink multicast group
 * @name: name of the multicast group, names are per-family
 * @flags: GENL_MCAST_* flags
 */
struct genl_multicast_group {
        char                        name[GENL_NAMSIZ];
        /* bitmask; stored in a u8, so at most eight flag bits fit */
        u8                        flags;
};

struct genl_split_ops;
struct genl_info;

/**
 * struct genl_family - generic netlink family
 * @hdrsize: length of user specific header in bytes
 * @name: name of family
 * @version: protocol version
 * @maxattr: maximum number of attributes supported
 * @policy: netlink policy
 * @netnsok: set to true if the family can handle network
 *        namespaces and should be presented in all of them
 * @parallel_ops: operations can be called in parallel and aren't
 *        synchronized by the core genetlink code
 * @pre_doit: called before an operation's doit callback, it may
 *        do additional, common, filtering and return an error
 * @post_doit: called after an operation's doit callback, it may
 *        undo operations done by pre_doit, for example release locks
 * @bind: called when family multicast group is added to a netlink socket
 * @unbind: called when family multicast group is removed from a netlink socket
 * @module: pointer to the owning module (set to THIS_MODULE)
 * @mcgrps: multicast groups used by this family
 * @n_mcgrps: number of multicast groups
 * @resv_start_op: first operation for which reserved fields of the header
 *        can be validated and policies are required (see below);
 *        new families should leave this field at zero
 * @ops: the operations supported by this family
 * @n_ops: number of operations supported by this family
 * @small_ops: the small-struct operations supported by this family
 * @n_small_ops: number of small-struct operations supported by this family
 * @split_ops: the split do/dump form of operation definition
 * @n_split_ops: number of entries in @split_ops, note that with split do/dump
 *        ops the number of entries is not the same as number of commands
 * @sock_priv_size: the size of per-socket private memory
 * @sock_priv_init: the per-socket private memory initializer
 * @sock_priv_destroy: the per-socket private memory destructor
 *
 * Attribute policies (the combination of @policy and @maxattr fields)
 * can be attached at the family level or at the operation level.
 * If both are present the per-operation policy takes precedence.
 * For operations before @resv_start_op lack of policy means that the core
 * will perform no attribute parsing or validation. For newer operations
 * if policy is not provided core will reject all TLV attributes.
 */
struct genl_family {
        unsigned int                hdrsize;
        char                        name[GENL_NAMSIZ];
        unsigned int                version;
        unsigned int                maxattr;
        /* one-bit flags */
        u8                        netnsok:1;
        u8                        parallel_ops:1;
        /*
         * The n_* element counts are u8, so each of ops, small_ops,
         * split_ops and mcgrps can describe at most 255 entries.
         */
        u8                        n_ops;
        u8                        n_small_ops;
        u8                        n_split_ops;
        u8                        n_mcgrps;
        u8                        resv_start_op;
        const struct nla_policy *policy;
        int                        (*pre_doit)(const struct genl_split_ops *ops,
                                            struct sk_buff *skb,
                                            struct genl_info *info);
        void                        (*post_doit)(const struct genl_split_ops *ops,
                                             struct sk_buff *skb,
                                             struct genl_info *info);
        int                        (*bind)(int mcgrp);
        void                        (*unbind)(int mcgrp);
        const struct genl_ops *        ops;
        const struct genl_small_ops *small_ops;
        const struct genl_split_ops *split_ops;
        const struct genl_multicast_group *mcgrps;
        struct module                *module;

        /* per-socket private memory: its size plus initializer/destructor */
        size_t                        sock_priv_size;
        void                        (*sock_priv_init)(void *priv);
        void                        (*sock_priv_destroy)(void *priv);

/* private: internal use only */
        /* protocol family identifier */
        int                        id;
        /*
         * starting number of multicast group IDs in this family; the inline
         * helpers in this header add it to a family-relative group index
         */
        unsigned int                mcgrp_offset;
        /* list of per-socket privs */
        struct xarray                *sock_privs;
};

/**
 * struct genl_info - receiving information
 * @snd_seq: sending sequence number
 * @snd_portid: netlink portid of sender
 * @family: generic netlink family
 * @nlhdr: netlink message header
 * @genlhdr: generic netlink message header
 * @attrs: netlink attributes
 * @_net: network namespace
 * @ctx: storage space for use by the family
 * @user_ptr: user pointers (deprecated, use ctx instead)
 * @extack: extended ACK report struct
 */
struct genl_info {
        u32                        snd_seq;
        u32                        snd_portid;
        const struct genl_family *family;
        const struct nlmsghdr *        nlhdr;
        struct genlmsghdr *        genlhdr;
        struct nlattr **        attrs;
        possible_net_t                _net;
        /* @ctx and @user_ptr are members of one union and share storage */
        union {
                u8                ctx[NETLINK_CTX_SIZE];
                void *                user_ptr[2];
        };
        struct netlink_ext_ack *extack;
};

static inline struct net *genl_info_net(const struct genl_info *info)
{
        return read_pnet(&info->_net);
}

static inline void genl_info_net_set(struct genl_info *info, struct net *net)
{
        write_pnet(&info->_net, net);
}

static inline void *genl_info_userhdr(const struct genl_info *info)
{
        return (u8 *)info->genlhdr + GENL_HDRLEN;
}

/*
 * Convenience wrappers: forward to NL_SET_ERR_MSG() / NL_SET_ERR_MSG_FMT()
 * using the extack pointer found in @info.
 */
#define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg)

#define GENL_SET_ERR_MSG_FMT(info, msg, args...) \
        NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args)

/*
 * Report that a root attribute is missing.
 *
 * @info is evaluated once into a local, then its extack and attrs members
 * are handed to NL_REQ_ATTR_CHECK() with a NULL second argument.
 */
#define GENL_REQ_ATTR_CHECK(info, attr) ({                                \
        const struct genl_info *__info = (info);                        \
                                                                        \
        NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \
})

/*
 * Bit flags for the u8 "validate" member of struct genl_ops,
 * struct genl_small_ops and struct genl_split_ops. Each enumerator is a
 * distinct bit, BIT(0) through BIT(2).
 */
enum genl_validate_flags {
        GENL_DONT_VALIDATE_STRICT                = BIT(0),
        GENL_DONT_VALIDATE_DUMP                        = BIT(1),
        GENL_DONT_VALIDATE_DUMP_STRICT                = BIT(2),
};

/**
 * struct genl_small_ops - generic netlink operations (small version)
 * @cmd: command identifier
 * @internal_flags: flags used by the family
 * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
 * @validate: validation flags from enum genl_validate_flags
 * @doit: standard command callback
 * @dumpit: callback for dumpers
 *
 * This is a cut-down version of struct genl_ops for users who don't need
 * most of the ancillary infra and want to save space. Compared with
 * struct genl_ops it has no @start/@done dump callbacks and no per-operation
 * @policy/@maxattr.
 */
struct genl_small_ops {
        int        (*doit)(struct sk_buff *skb, struct genl_info *info);
        int        (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb);
        u8        cmd;
        u8        internal_flags;
        u8        flags;
        u8        validate;
};

/**
 * struct genl_ops - generic netlink operations
 * @cmd: command identifier
 * @internal_flags: flags used by the family
 * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
 * @maxattr: maximum number of attributes supported
 * @policy: netlink policy (takes precedence over family policy)
 * @validate: validation flags from enum genl_validate_flags
 * @doit: standard command callback
 * @start: start callback for dumps
 * @dumpit: callback for dumpers
 * @done: completion callback for dumps
 *
 * A single entry carries both the do callback (@doit) and the dump
 * callbacks (@start, @dumpit, @done) as separate members.
 */
struct genl_ops {
        int                       (*doit)(struct sk_buff *skb,
                                       struct genl_info *info);
        int                       (*start)(struct netlink_callback *cb);
        int                       (*dumpit)(struct sk_buff *skb,
                                         struct netlink_callback *cb);
        int                       (*done)(struct netlink_callback *cb);
        const struct nla_policy *policy;
        unsigned int                maxattr;
        u8                        cmd;
        u8                        internal_flags;
        u8                        flags;
        u8                        validate;
};

/**
 * struct genl_split_ops - generic netlink operations (do/dump split version)
 * @cmd: command identifier
 * @internal_flags: flags used by the family
 * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
 * @validate: validation flags from enum genl_validate_flags
 * @policy: netlink policy (takes precedence over family policy)
 * @maxattr: maximum number of attributes supported
 *
 * Do callbacks:
 * @pre_doit: called before an operation's @doit callback, it may
 *        do additional, common, filtering and return an error
 * @doit: standard command callback
 * @post_doit: called after an operation's @doit callback, it may
 *        undo operations done by pre_doit, for example release locks
 *
 * Dump callbacks:
 * @start: start callback for dumps
 * @dumpit: callback for dumpers
 * @done: completion callback for dumps
 *
 * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags.
 * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags.
 * Exactly one of those flags must be set.
 */
struct genl_split_ops {
        /*
         * The do and dump callback sets are two anonymous structs inside one
         * anonymous union, so they occupy the same storage; @flags tells
         * which set is the valid one for a given entry.
         */
        union {
                struct {
                        int (*pre_doit)(const struct genl_split_ops *ops,
                                        struct sk_buff *skb,
                                        struct genl_info *info);
                        int (*doit)(struct sk_buff *skb,
                                    struct genl_info *info);
                        void (*post_doit)(const struct genl_split_ops *ops,
                                          struct sk_buff *skb,
                                          struct genl_info *info);
                };
                struct {
                        int (*start)(struct netlink_callback *cb);
                        int (*dumpit)(struct sk_buff *skb,
                                      struct netlink_callback *cb);
                        int (*done)(struct netlink_callback *cb);
                };
        };
        const struct nla_policy *policy;
        unsigned int                maxattr;
        u8                        cmd;
        u8                        internal_flags;
        u8                        flags;
        u8                        validate;
};

/**
 * struct genl_dumpit_info - info that is available during dumpit op call
 * @op: generic netlink ops - for internal genl code usage
 * @info: struct genl_info describing the request
 */
struct genl_dumpit_info {
        struct genl_split_ops op;
        struct genl_info info;
};

/* Return @cb->data as a pointer to const struct genl_dumpit_info. */
static inline const struct genl_dumpit_info *
genl_dumpit_info(struct netlink_callback *cb)
{
        const struct genl_dumpit_info *dump_info = cb->data;

        return dump_info;
}

static inline const struct genl_info *
genl_info_dump(struct netlink_callback *cb)
{
        return &genl_dumpit_info(cb)->info;
}

/**
 * genl_info_init_ntf() - initialize genl_info for notifications
 * @info:   genl_info struct to set up
 * @family: pointer to the genetlink family
 * @cmd:    command to be used in the notification
 *
 * Zero a locally declared struct genl_info and fill in just enough of it
 * (family, a genetlink header carrying @cmd) to pass it to various APIs.
 * Intended to be used when creating notifications.
 */
static inline void
genl_info_init_ntf(struct genl_info *info, const struct genl_family *family,
                   u8 cmd)
{
        struct genlmsghdr *ntf_hdr;

        memset(info, 0, sizeof(*info));

        /* the header lives in @info's own user_ptr storage */
        ntf_hdr = (void *)info->user_ptr;
        ntf_hdr->cmd = cmd;

        info->genlhdr = ntf_hdr;
        info->family = family;
}

/* True when @info carries no netlink message header (nlhdr is NULL). */
static inline bool genl_info_is_ntf(const struct genl_info *info)
{
        return info->nlhdr == NULL;
}

/*
 * Functions that are only declared in this part of the header; their
 * definitions are not visible here.
 */
void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk);
void *genl_sk_priv_get(struct genl_family *family, struct sock *sk);
int genl_register_family(struct genl_family *family);
int genl_unregister_family(const struct genl_family *family);
void genl_notify(const struct genl_family *family, struct sk_buff *skb,
                 struct genl_info *info, u32 group, gfp_t flags);

/* The returned pointer is what genlmsg_nlhdr() and genlmsg_end() expect. */
void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                  const struct genl_family *family, int flags, u8 cmd);

static inline void *
__genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags)
{
        return genlmsg_put(skb, info->snd_portid, info->snd_seq, info->family,
                           flags, info->genlhdr->cmd);
}

/**
 * genlmsg_iput - start genetlink message based on genl_info
 * @skb: skb in which message header will be placed
 * @info: genl_info as provided to do/dump handlers
 *
 * Convenience wrapper around __genlmsg_iput() with flags set to 0. It
 * starts a genetlink message based on information in user request.
 * @info should be either the struct passed by genetlink core to do/dump
 * handlers (when constructing replies to such requests) or a struct
 * initialized by genl_info_init_ntf() when constructing notifications.
 *
 * Returns: pointer to new genetlink header.
 */
static inline void *
genlmsg_iput(struct sk_buff *skb, const struct genl_info *info)
{
        const int no_flags = 0;

        return __genlmsg_iput(skb, info, no_flags);
}

/**
 * genlmsg_nlhdr - Obtain netlink header from user specified header
 * @user_hdr: user header as returned from genlmsg_put()
 *
 * Steps back over GENL_HDRLEN and then NLMSG_HDRLEN bytes from @user_hdr.
 *
 * Returns: pointer to netlink header.
 */
static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr)
{
        char *pos = user_hdr;

        pos -= GENL_HDRLEN;
        pos -= NLMSG_HDRLEN;

        return (struct nlmsghdr *)pos;
}

/**
 * genlmsg_parse_deprecated - parse attributes of a genetlink message
 * @nlh: netlink message header
 * @family: genetlink message family
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with %NL_VALIDATE_LIBERAL and a header length of
 * the family's hdrsize plus GENL_HDRLEN.
 *
 * Return: the value returned by __nlmsg_parse().
 */
static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh,
                                           const struct genl_family *family,
                                           struct nlattr *tb[], int maxtype,
                                           const struct nla_policy *policy,
                                           struct netlink_ext_ack *extack)
{
        const int hdrlen = family->hdrsize + GENL_HDRLEN;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
                             NL_VALIDATE_LIBERAL, extack);
}

/**
 * genlmsg_parse - parse attributes of a genetlink message
 * @nlh: netlink message header
 * @family: genetlink message family
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with %NL_VALIDATE_STRICT and a header length of
 * the family's hdrsize plus GENL_HDRLEN.
 *
 * Return: the value returned by __nlmsg_parse().
 */
static inline int genlmsg_parse(const struct nlmsghdr *nlh,
                                const struct genl_family *family,
                                struct nlattr *tb[], int maxtype,
                                const struct nla_policy *policy,
                                struct netlink_ext_ack *extack)
{
        const int hdrlen = family->hdrsize + GENL_HDRLEN;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
                             NL_VALIDATE_STRICT, extack);
}

/**
 * genl_dump_check_consistent - check if sequence is consistent and advertise if not
 * @cb: netlink callback structure that stores the sequence number
 * @user_hdr: user header as returned from genlmsg_put()
 *
 * Thin wrapper for generic netlink users: converts @user_hdr to its netlink
 * header and hands it to nl_dump_check_consistent().
 */
static inline void genl_dump_check_consistent(struct netlink_callback *cb,
                                              void *user_hdr)
{
        struct nlmsghdr *nlh = genlmsg_nlhdr(user_hdr);

        nl_dump_check_consistent(cb, nlh);
}

/**
 * genlmsg_put_reply - Add generic netlink header to a reply message
 * @skb: socket buffer holding the message
 * @info: receiver info
 * @family: generic netlink family
 * @flags: netlink message flags
 * @cmd: generic netlink command
 *
 * The portid and sequence number passed to genlmsg_put() are taken
 * from @info.
 *
 * Returns: pointer to user specific header
 */
static inline void *genlmsg_put_reply(struct sk_buff *skb,
                                      struct genl_info *info,
                                      const struct genl_family *family,
                                      int flags, u8 cmd)
{
        const u32 portid = info->snd_portid;
        const u32 seq = info->snd_seq;

        return genlmsg_put(skb, portid, seq, family, flags, cmd);
}

/**
 * genlmsg_end - Finalize a generic netlink message
 * @skb: socket buffer the message is stored in
 * @hdr: user specific header
 *
 * Passes the address GENL_HDRLEN + NLMSG_HDRLEN bytes before @hdr to
 * nlmsg_end().
 */
static inline void genlmsg_end(struct sk_buff *skb, void *hdr)
{
        struct nlmsghdr *nlh;

        nlh = (struct nlmsghdr *)((char *)hdr - GENL_HDRLEN - NLMSG_HDRLEN);
        nlmsg_end(skb, nlh);
}

/**
 * genlmsg_cancel - Cancel construction of a generic netlink message
 * @skb: socket buffer the message is stored in
 * @hdr: user specific header (the same pointer genlmsg_end() takes);
 *       may be NULL, in which case nothing is done
 */
static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
{
        if (!hdr)
                return;

        nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN);
}

/**
 * genlmsg_multicast_netns_filtered - multicast a netlink message
 *                                      to a specific netns with filter
 *                                      function
 * @family: the generic netlink family
 * @net: the net namespace
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 * @filter: filter function
 * @filter_data: filter function private data
 *
 * @group is relative to the family; the family's mcgrp_offset is added
 * before the message is handed to nlmsg_multicast_filtered().
 *
 * Return: 0 on success, negative error code for failure (-EINVAL, with a
 * one-time warning, if @group is not below the family's n_mcgrps).
 */
static inline int
genlmsg_multicast_netns_filtered(const struct genl_family *family,
                                 struct net *net, struct sk_buff *skb,
                                 u32 portid, unsigned int group, gfp_t flags,
                                 netlink_filter_fn filter,
                                 void *filter_data)
{
        unsigned int abs_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        abs_group = family->mcgrp_offset + group;

        return nlmsg_multicast_filtered(net->genl_sock, skb, portid,
                                        abs_group, flags, filter,
                                        filter_data);
}

/**
 * genlmsg_multicast_netns - multicast a netlink message to a specific netns
 * @family: the generic netlink family
 * @net: the net namespace
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 *
 * Same as genlmsg_multicast_netns_filtered() with a NULL filter function
 * and NULL filter data.
 */
static inline int genlmsg_multicast_netns(const struct genl_family *family,
                                          struct net *net, struct sk_buff *skb,
                                          u32 portid, unsigned int group, gfp_t flags)
{
        int err;

        err = genlmsg_multicast_netns_filtered(family, net, skb, portid,
                                               group, flags, NULL, NULL);
        return err;
}

/**
 * genlmsg_multicast - multicast a netlink message to the default netns
 * @family: the generic netlink family
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 *
 * Same as genlmsg_multicast_netns() with &init_net as the namespace.
 */
static inline int genlmsg_multicast(const struct genl_family *family,
                                    struct sk_buff *skb, u32 portid,
                                    unsigned int group, gfp_t flags)
{
        struct net *default_net = &init_net;

        return genlmsg_multicast_netns(family, default_net, skb,
                                       portid, group, flags);
}

/**
 * genlmsg_multicast_allns - multicast a netlink message to all net namespaces
 * @family: the generic netlink family
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 *
 * The caller must hold the RTNL or rcu_read_lock().
 */
int genlmsg_multicast_allns(const struct genl_family *family,
                            struct sk_buff *skb, u32 portid,
                            unsigned int group);

/**
 * genlmsg_unicast - unicast a netlink message
 * @net: network namespace to look up @portid in
 * @skb: netlink message as socket buffer
 * @portid: netlink portid of the destination socket
 *
 * Sends through @net's genl_sock using nlmsg_unicast() and returns its
 * result.
 */
static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid)
{
        struct sock *genl_sk = net->genl_sock;

        return nlmsg_unicast(genl_sk, skb, portid);
}

/**
 * genlmsg_reply - reply to a request
 * @skb: netlink message to be sent back
 * @info: receiver information
 *
 * Unicasts @skb to @info's snd_portid in the namespace returned by
 * genl_info_net().
 */
static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = genl_info_net(info);

        return genlmsg_unicast(net, skb, info->snd_portid);
}

/**
 * genlmsg_data - head of message payload
 * @gnlh: genetlink message header
 *
 * Returns the address GENL_HDRLEN bytes past @gnlh.
 */
static inline void *genlmsg_data(const struct genlmsghdr *gnlh)
{
        unsigned char *genl_start = (unsigned char *)gnlh;

        return genl_start + GENL_HDRLEN;
}

/**
 * genlmsg_len - length of message payload
 * @gnlh: genetlink message header
 */
static inline int genlmsg_len(const struct genlmsghdr *gnlh)
{
        struct nlmsghdr *nlh = (struct nlmsghdr *)((unsigned char *)gnlh -
                                                        NLMSG_HDRLEN);
        return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN);
}

/**
 * genlmsg_msg_size - length of genetlink message not including padding
 * @payload: length of message payload
 *
 * Returns GENL_HDRLEN plus @payload.
 */
static inline int genlmsg_msg_size(int payload)
{
        int msg_size = GENL_HDRLEN + payload;

        return msg_size;
}

/**
 * genlmsg_total_size - length of genetlink message including padding
 * @payload: length of message payload
 *
 * Returns genlmsg_msg_size(@payload) rounded by NLMSG_ALIGN().
 */
static inline int genlmsg_total_size(int payload)
{
        int unpadded = genlmsg_msg_size(payload);

        return NLMSG_ALIGN(unpadded);
}

/**
 * genlmsg_new - Allocate a new generic netlink message
 * @payload: size of the message payload
 * @flags: the type of memory to allocate.
 *
 * Returns whatever nlmsg_new() returns for the padded genetlink size.
 *
 * NOTE(review): @payload is a size_t but goes through genlmsg_total_size(),
 * which takes and returns int.
 */
static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
{
        int total = genlmsg_total_size(payload);

        return nlmsg_new(total, flags);
}

/**
 * genl_set_err - report error to genetlink broadcast listeners
 * @family: the generic netlink family
 * @net: the network namespace to report the error to
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 *         (this is the offset of the multicast group in the groups array)
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * The family's mcgrp_offset is added to @group before netlink_set_err()
 * is called on @net's genl_sock.
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_RECV_NO_ENOBUFS socket option, or -EINVAL (after a one-time
 * warning) if @group is not below the family's n_mcgrps.
 */
static inline int genl_set_err(const struct genl_family *family,
                               struct net *net, u32 portid,
                               u32 group, int code)
{
        u32 abs_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        abs_group = family->mcgrp_offset + group;

        return netlink_set_err(net->genl_sock, portid, abs_group, code);
}

/*
 * Ask netlink_has_listeners() about a family-relative multicast @group on
 * @net's genl_sock. The family's mcgrp_offset is added to @group first.
 * Returns -EINVAL (after a one-time warning) if @group is not below the
 * family's n_mcgrps, otherwise the value from netlink_has_listeners().
 */
static inline int genl_has_listeners(const struct genl_family *family,
                                     struct net *net, unsigned int group)
{
        unsigned int abs_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        abs_group = family->mcgrp_offset + group;

        return netlink_has_listeners(net->genl_sock, abs_group);
}
#endif        /* __NET_GENERIC_NETLINK_H */





























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
/* This file is automatically generated.  Do not edit. */
#ifndef _SELINUX_FLASK_H_
#define _SELINUX_FLASK_H_

/*
 * Security class identifiers. The values below run consecutively from 1
 * (SECCLASS_SECURITY) to 96 (SECCLASS_USER_NAMESPACE).
 */
#define SECCLASS_SECURITY                                 1
#define SECCLASS_PROCESS                                  2
#define SECCLASS_PROCESS2                                 3
#define SECCLASS_SYSTEM                                   4
#define SECCLASS_CAPABILITY                               5
#define SECCLASS_FILESYSTEM                               6
#define SECCLASS_FILE                                     7
#define SECCLASS_DIR                                      8
#define SECCLASS_FD                                       9
#define SECCLASS_LNK_FILE                                10
#define SECCLASS_CHR_FILE                                11
#define SECCLASS_BLK_FILE                                12
#define SECCLASS_SOCK_FILE                               13
#define SECCLASS_FIFO_FILE                               14
#define SECCLASS_SOCKET                                  15
#define SECCLASS_TCP_SOCKET                              16
#define SECCLASS_UDP_SOCKET                              17
#define SECCLASS_RAWIP_SOCKET                            18
#define SECCLASS_NODE                                    19
#define SECCLASS_NETIF                                   20
#define SECCLASS_NETLINK_SOCKET                          21
#define SECCLASS_PACKET_SOCKET                           22
#define SECCLASS_KEY_SOCKET                              23
#define SECCLASS_UNIX_STREAM_SOCKET                      24
#define SECCLASS_UNIX_DGRAM_SOCKET                       25
#define SECCLASS_SEM                                     26
#define SECCLASS_MSG                                     27
#define SECCLASS_MSGQ                                    28
#define SECCLASS_SHM                                     29
#define SECCLASS_IPC                                     30
#define SECCLASS_NETLINK_ROUTE_SOCKET                    31
#define SECCLASS_NETLINK_TCPDIAG_SOCKET                  32
#define SECCLASS_NETLINK_NFLOG_SOCKET                    33
#define SECCLASS_NETLINK_XFRM_SOCKET                     34
#define SECCLASS_NETLINK_SELINUX_SOCKET                  35
#define SECCLASS_NETLINK_ISCSI_SOCKET                    36
#define SECCLASS_NETLINK_AUDIT_SOCKET                    37
#define SECCLASS_NETLINK_FIB_LOOKUP_SOCKET               38
#define SECCLASS_NETLINK_CONNECTOR_SOCKET                39
#define SECCLASS_NETLINK_NETFILTER_SOCKET                40
#define SECCLASS_NETLINK_DNRT_SOCKET                     41
#define SECCLASS_ASSOCIATION                             42
#define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET           43
#define SECCLASS_NETLINK_GENERIC_SOCKET                  44
#define SECCLASS_NETLINK_SCSITRANSPORT_SOCKET            45
#define SECCLASS_NETLINK_RDMA_SOCKET                     46
#define SECCLASS_NETLINK_CRYPTO_SOCKET                   47
#define SECCLASS_APPLETALK_SOCKET                        48
#define SECCLASS_PACKET                                  49
#define SECCLASS_KEY                                     50
#define SECCLASS_MEMPROTECT                              51
#define SECCLASS_PEER                                    52
#define SECCLASS_CAPABILITY2                             53
#define SECCLASS_KERNEL_SERVICE                          54
#define SECCLASS_TUN_SOCKET                              55
#define SECCLASS_BINDER                                  56
#define SECCLASS_CAP_USERNS                              57
#define SECCLASS_CAP2_USERNS                             58
#define SECCLASS_SCTP_SOCKET                             59
#define SECCLASS_ICMP_SOCKET                             60
#define SECCLASS_AX25_SOCKET                             61
#define SECCLASS_IPX_SOCKET                              62
#define SECCLASS_NETROM_SOCKET                           63
#define SECCLASS_ATMPVC_SOCKET                           64
#define SECCLASS_X25_SOCKET                              65
#define SECCLASS_ROSE_SOCKET                             66
#define SECCLASS_DECNET_SOCKET                           67
#define SECCLASS_ATMSVC_SOCKET                           68
#define SECCLASS_RDS_SOCKET                              69
#define SECCLASS_IRDA_SOCKET                             70
#define SECCLASS_PPPOX_SOCKET                            71
#define SECCLASS_LLC_SOCKET                              72
#define SECCLASS_CAN_SOCKET                              73
#define SECCLASS_TIPC_SOCKET                             74
#define SECCLASS_BLUETOOTH_SOCKET                        75
#define SECCLASS_IUCV_SOCKET                             76
#define SECCLASS_RXRPC_SOCKET                            77
#define SECCLASS_ISDN_SOCKET                             78
#define SECCLASS_PHONET_SOCKET                           79
#define SECCLASS_IEEE802154_SOCKET                       80
#define SECCLASS_CAIF_SOCKET                             81
#define SECCLASS_ALG_SOCKET                              82
#define SECCLASS_NFC_SOCKET                              83
#define SECCLASS_VSOCK_SOCKET                            84
#define SECCLASS_KCM_SOCKET                              85
#define SECCLASS_QIPCRTR_SOCKET                          86
#define SECCLASS_SMC_SOCKET                              87
#define SECCLASS_INFINIBAND_PKEY                         88
#define SECCLASS_INFINIBAND_ENDPORT                      89
#define SECCLASS_BPF                                     90
#define SECCLASS_XDP_SOCKET                              91
#define SECCLASS_MCTP_SOCKET                             92
#define SECCLASS_PERF_EVENT                              93
#define SECCLASS_ANON_INODE                              94
#define SECCLASS_IO_URING                                95
#define SECCLASS_USER_NAMESPACE                          96

/*
 * Initial SID values. The numbering is not contiguous: 4, 6 and 13-26 have
 * no define here. SECINITSID_NUM equals the highest value defined (27).
 */
#define SECINITSID_KERNEL                                   1
#define SECINITSID_SECURITY                                 2
#define SECINITSID_UNLABELED                                3
#define SECINITSID_FILE                                     5
#define SECINITSID_INIT                                     7
#define SECINITSID_ANY_SOCKET                               8
#define SECINITSID_PORT                                     9
#define SECINITSID_NETIF                                   10
#define SECINITSID_NETMSG                                  11
#define SECINITSID_NODE                                    12
#define SECINITSID_DEVNULL                                 27

#define SECINITSID_NUM 27

/*
 * Report whether @kern_tclass is one of the socket security classes listed
 * in the switch below. Any other value yields false.
 */
static inline bool security_is_socket_class(u16 kern_tclass)
{
        switch (kern_tclass) {
        case SECCLASS_SOCKET:
        case SECCLASS_TCP_SOCKET:
        case SECCLASS_UDP_SOCKET:
        case SECCLASS_RAWIP_SOCKET:
        case SECCLASS_NETLINK_SOCKET:
        case SECCLASS_PACKET_SOCKET:
        case SECCLASS_KEY_SOCKET:
        case SECCLASS_UNIX_STREAM_SOCKET:
        case SECCLASS_UNIX_DGRAM_SOCKET:
        case SECCLASS_NETLINK_ROUTE_SOCKET:
        case SECCLASS_NETLINK_TCPDIAG_SOCKET:
        case SECCLASS_NETLINK_NFLOG_SOCKET:
        case SECCLASS_NETLINK_XFRM_SOCKET:
        case SECCLASS_NETLINK_SELINUX_SOCKET:
        case SECCLASS_NETLINK_ISCSI_SOCKET:
        case SECCLASS_NETLINK_AUDIT_SOCKET:
        case SECCLASS_NETLINK_FIB_LOOKUP_SOCKET:
        case SECCLASS_NETLINK_CONNECTOR_SOCKET:
        case SECCLASS_NETLINK_NETFILTER_SOCKET:
        case SECCLASS_NETLINK_DNRT_SOCKET:
        case SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET:
        case SECCLASS_NETLINK_GENERIC_SOCKET:
        case SECCLASS_NETLINK_SCSITRANSPORT_SOCKET:
        case SECCLASS_NETLINK_RDMA_SOCKET:
        case SECCLASS_NETLINK_CRYPTO_SOCKET:
        case SECCLASS_APPLETALK_SOCKET:
        case SECCLASS_TUN_SOCKET:
        case SECCLASS_SCTP_SOCKET:
        case SECCLASS_ICMP_SOCKET:
        case SECCLASS_AX25_SOCKET:
        case SECCLASS_IPX_SOCKET:
        case SECCLASS_NETROM_SOCKET:
        case SECCLASS_ATMPVC_SOCKET:
        case SECCLASS_X25_SOCKET:
        case SECCLASS_ROSE_SOCKET:
        case SECCLASS_DECNET_SOCKET:
        case SECCLASS_ATMSVC_SOCKET:
        case SECCLASS_RDS_SOCKET:
        case SECCLASS_IRDA_SOCKET:
        case SECCLASS_PPPOX_SOCKET:
        case SECCLASS_LLC_SOCKET:
        case SECCLASS_CAN_SOCKET:
        case SECCLASS_TIPC_SOCKET:
        case SECCLASS_BLUETOOTH_SOCKET:
        case SECCLASS_IUCV_SOCKET:
        case SECCLASS_RXRPC_SOCKET:
        case SECCLASS_ISDN_SOCKET:
        case SECCLASS_PHONET_SOCKET:
        case SECCLASS_IEEE802154_SOCKET:
        case SECCLASS_CAIF_SOCKET:
        case SECCLASS_ALG_SOCKET:
        case SECCLASS_NFC_SOCKET:
        case SECCLASS_VSOCK_SOCKET:
        case SECCLASS_KCM_SOCKET:
        case SECCLASS_QIPCRTR_SOCKET:
        case SECCLASS_SMC_SOCKET:
        case SECCLASS_XDP_SOCKET:
        case SECCLASS_MCTP_SOCKET:
                return true;
        default:
                return false;
        }
}

#endif





















































































































































































  311 








































    4 

















  265 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux security data structures for kernel objects.
 *
 *  Author(s):  Stephen Smalley, <stephen.smalley.work@gmail.com>
 *                Chris Vance, <cvance@nai.com>
 *                Wayne Salamon, <wsalamon@nai.com>
 *                James Morris <jmorris@redhat.com>
 *
 *  Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 *  Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *  Copyright (C) 2016 Mellanox Technologies
 */

#ifndef _SELINUX_OBJSEC_H_
#define _SELINUX_OBJSEC_H_

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/binfmts.h>
#include <linux/in.h>
#include <linux/spinlock.h>
#include <linux/lsm_hooks.h>
#include <linux/msg.h>
#include <net/net_namespace.h>
#include <linux/bpf.h>
#include "flask.h"
#include "avc.h"

/*
 * One entry of the per-task directory access-vector cache; instances live in
 * the avdcache.dir[] array of struct task_security_struct.
 */
struct avdc_entry {
        u32 isid; /* inode SID */
        u32 allowed; /* allowed permission bitmask */
        u32 audited; /* audited permission bitmask */
        bool permissive; /* AVC permissive flag */
};

/*
 * SELinux state kept per credential; obtained from a struct cred with
 * selinux_cred().
 */
struct cred_security_struct {
        u32 osid; /* SID prior to last execve */
        u32 sid; /* current SID */
        u32 exec_sid; /* exec SID */
        u32 create_sid; /* fscreate SID */
        u32 keycreate_sid; /* keycreate SID */
        u32 sockcreate_sid; /* sockcreate SID */
} __randomize_layout;

/*
 * SELinux state kept per task; obtained from a struct task_struct with
 * selinux_task().  It currently holds only the directory access-vector
 * cache, which has TSEC_AVDC_DIR_SIZE (4) entries.
 */
struct task_security_struct {
#define TSEC_AVDC_DIR_SIZE (1 << 2)
        struct {
                u32 sid; /* current SID for cached entries */
                u32 seqno; /* AVC sequence number */
                unsigned int dir_spot; /* dir cache index to check first */
                struct avdc_entry dir[TSEC_AVDC_DIR_SIZE]; /* dir entries */
                bool permissive_neveraudit; /* permissive and neveraudit */
        } avdcache;
} __randomize_layout;

/*
 * Tell whether the cached "permissive and neveraudit" state of @tsec is
 * usable for @sid: the flag must be set, the cache must have been filled for
 * @sid, and its sequence number must equal the current policy seqno.
 */
static inline bool task_avdcache_permnoaudit(struct task_security_struct *tsec,
                                             u32 sid)
{
        if (!tsec->avdcache.permissive_neveraudit)
                return false;
        if (tsec->avdcache.sid != sid)
                return false;
        return avc_policy_seqno() == tsec->avdcache.seqno;
}

/* Labeling state values for an inode security blob. */
enum label_initialized {
        LABEL_INVALID, /* invalid or not initialized */
        LABEL_INITIALIZED, /* initialized */
        LABEL_PENDING /* NOTE(review): presumably initialization still outstanding — confirm */
};

/*
 * SELinux state kept per inode; obtained from a struct inode with
 * selinux_inode().
 */
struct inode_security_struct {
        struct inode *inode; /* back pointer to inode object */
        struct list_head list; /* list of inode_security_struct */
        u32 task_sid; /* SID of creating task */
        u32 sid; /* SID of this object */
        u16 sclass; /* security class of this object */
        /* NOTE(review): presumably holds an enum label_initialized value — confirm */
        unsigned char initialized; /* initialization flag */
        spinlock_t lock; /* NOTE(review): protected fields not visible here — confirm */
};

/*
 * SELinux state kept per open file; obtained from a struct file with
 * selinux_file().
 */
struct file_security_struct {
        u32 sid; /* SID of open file description */
        u32 fown_sid; /* SID of file owner (for SIGIO) */
        u32 isid; /* SID of inode at the time of file open */
        u32 pseqno; /* Policy seqno at the time of file open */
};

/*
 * SELinux state kept per superblock; obtained from a struct super_block with
 * selinux_superblock().
 */
struct superblock_security_struct {
        u32 sid; /* SID of file system superblock */
        u32 def_sid; /* default SID for labeling */
        u32 mntpoint_sid; /* SECURITY_FS_USE_MNTPOINT context for files */
        unsigned short behavior; /* labeling behavior */
        unsigned short flags; /* which mount options were specified */
        struct mutex lock;
        /* NOTE(review): presumably a list of inode security blobs guarded by isec_lock — confirm */
        struct list_head isec_head;
        spinlock_t isec_lock;
};

/* SELinux state kept per message; obtained with selinux_msg_msg(). */
struct msg_security_struct {
        u32 sid; /* SID of message */
};

/* SELinux state kept per IPC resource; obtained with selinux_ipc(). */
struct ipc_security_struct {
        u16 sclass; /* security class of this object */
        u32 sid; /* SID of IPC resource */
};

/* Label record for a network interface: (namespace, ifindex) and its SID. */
struct netif_security_struct {
        const struct net *ns; /* network namespace */
        int ifindex; /* device index */
        u32 sid; /* SID for this interface */
};

/*
 * Label record for a network node address and its SID.  Which member of
 * @addr is meaningful is presumably selected by @family — confirm.
 */
struct netnode_security_struct {
        union {
                __be32 ipv4; /* IPv4 node address */
                struct in6_addr ipv6; /* IPv6 node address */
        } addr;
        u32 sid; /* SID for this node */
        u16 family; /* address family */
};

/* Label record for a network port: (protocol, port) and its SID. */
struct netport_security_struct {
        u32 sid; /* SID for this port */
        u16 port; /* port number */
        u8 protocol; /* transport protocol */
};

/*
 * SELinux state kept per socket; obtained from a struct sock with
 * selinux_sock().  The NetLabel members exist only when CONFIG_NETLABEL is
 * enabled.
 */
struct sk_security_struct {
#ifdef CONFIG_NETLABEL
        enum { /* NetLabel state */
               NLBL_UNSET = 0,
               NLBL_REQUIRE,
               NLBL_LABELED,
               NLBL_REQSKB,
               NLBL_CONNLABELED,
        } nlbl_state;
        struct netlbl_lsm_secattr *nlbl_secattr; /* NetLabel sec attributes */
#endif
        u32 sid; /* SID of this object */
        u32 peer_sid; /* SID of peer */
        u16 sclass; /* sock security class */
        enum { /* SCTP association state */
               SCTP_ASSOC_UNSET = 0,
               SCTP_ASSOC_SET,
        } sctp_assoc_state;
};

/* SELinux state kept per tun device; obtained with selinux_tun_dev(). */
struct tun_security_struct {
        u32 sid; /* SID for the tun device sockets */
};

/* SELinux state kept per key; obtained with selinux_key(). */
struct key_security_struct {
        u32 sid; /* SID of key */
};

/* SELinux state for an InfiniBand object; obtained with selinux_ib(). */
struct ib_security_struct {
        u32 sid; /* SID of the queue pair or MAD agent */
};

/* Label record for a (subnet prefix, pkey) pair and its SID. */
struct pkey_security_struct {
        u64 subnet_prefix; /* Port subnet prefix */
        u16 pkey; /* PKey number */
        u32 sid; /* SID of pkey */
};

/*
 * SELinux state for a BPF object; the same layout is returned by the
 * selinux_bpf_{map,prog,token}_security() accessors.
 */
struct bpf_security_struct {
        u32 sid; /* SID of bpf obj creator */
};

/* SELinux state per perf event; obtained with selinux_perf_event(). */
struct perf_event_security_struct {
        u32 sid; /* SID of perf_event obj creator */
};

extern struct lsm_blob_sizes selinux_blob_sizes;
/* Return @cred's security blob advanced by SELinux's lbs_cred offset. */
static inline struct cred_security_struct *selinux_cred(const struct cred *cred)
{
        void *blob = cred->security;

        return blob + selinux_blob_sizes.lbs_cred;
}

/* Return @task's security blob advanced by SELinux's lbs_task offset. */
static inline struct task_security_struct *
selinux_task(const struct task_struct *task)
{
        void *blob = task->security;

        return blob + selinux_blob_sizes.lbs_task;
}

/* Return @file's security blob advanced by SELinux's lbs_file offset. */
static inline struct file_security_struct *selinux_file(const struct file *file)
{
        void *blob = file->f_security;

        return blob + selinux_blob_sizes.lbs_file;
}

/*
 * Return @inode's security blob advanced by SELinux's lbs_inode offset, or
 * NULL when the inode has no security blob attached.
 */
static inline struct inode_security_struct *
selinux_inode(const struct inode *inode)
{
        void *blob = inode->i_security;

        if (unlikely(!blob))
                return NULL;

        return blob + selinux_blob_sizes.lbs_inode;
}

/* Return @msg_msg's security blob advanced by SELinux's lbs_msg_msg offset. */
static inline struct msg_security_struct *
selinux_msg_msg(const struct msg_msg *msg_msg)
{
        void *blob = msg_msg->security;

        return blob + selinux_blob_sizes.lbs_msg_msg;
}

/* Return @ipc's security blob advanced by SELinux's lbs_ipc offset. */
static inline struct ipc_security_struct *
selinux_ipc(const struct kern_ipc_perm *ipc)
{
        void *blob = ipc->security;

        return blob + selinux_blob_sizes.lbs_ipc;
}

/*
 * get the subjective security ID of the current task
 */
static inline u32 current_sid(void)
{
        const struct cred_security_struct *crsec = selinux_cred(current_cred());

        return crsec->sid;
}

/* Return @superblock's security blob advanced by lbs_superblock. */
static inline struct superblock_security_struct *
selinux_superblock(const struct super_block *superblock)
{
        void *blob = superblock->s_security;

        return blob + selinux_blob_sizes.lbs_superblock;
}

#ifdef CONFIG_KEYS
/* Return @key's security blob advanced by SELinux's lbs_key offset. */
static inline struct key_security_struct *selinux_key(const struct key *key)
{
        void *blob = key->security;

        return blob + selinux_blob_sizes.lbs_key;
}
#endif /* CONFIG_KEYS */

/* Return @sock's security blob advanced by SELinux's lbs_sock offset. */
static inline struct sk_security_struct *selinux_sock(const struct sock *sock)
{
        void *blob = sock->sk_security;

        return blob + selinux_blob_sizes.lbs_sock;
}

/* Return the blob @security advanced by SELinux's lbs_tun_dev offset. */
static inline struct tun_security_struct *selinux_tun_dev(void *security)
{
        char *base = security;

        return (void *)(base + selinux_blob_sizes.lbs_tun_dev);
}

/* Return the blob @ib_sec advanced by SELinux's lbs_ib offset. */
static inline struct ib_security_struct *selinux_ib(void *ib_sec)
{
        char *base = ib_sec;

        return (void *)(base + selinux_blob_sizes.lbs_ib);
}

/* Return the blob @perf_event advanced by SELinux's lbs_perf_event offset. */
static inline struct perf_event_security_struct *
selinux_perf_event(void *perf_event)
{
        char *base = perf_event;

        return (void *)(base + selinux_blob_sizes.lbs_perf_event);
}

#ifdef CONFIG_BPF_SYSCALL
/* Return @map's security blob advanced by SELinux's lbs_bpf_map offset. */
static inline struct bpf_security_struct *
selinux_bpf_map_security(struct bpf_map *map)
{
        void *blob = map->security;

        return blob + selinux_blob_sizes.lbs_bpf_map;
}

/*
 * Return the security blob of @prog (stored in prog->aux) advanced by
 * SELinux's lbs_bpf_prog offset.
 */
static inline struct bpf_security_struct *
selinux_bpf_prog_security(struct bpf_prog *prog)
{
        void *blob = prog->aux->security;

        return blob + selinux_blob_sizes.lbs_bpf_prog;
}

/* Return @token's security blob advanced by SELinux's lbs_bpf_token offset. */
static inline struct bpf_security_struct *
selinux_bpf_token_security(struct bpf_token *token)
{
        void *blob = token->security;

        return blob + selinux_blob_sizes.lbs_bpf_token;
}
#endif /* CONFIG_BPF_SYSCALL */
#endif /* _SELINUX_OBJSEC_H_ */






































































































































































































































































































































































































































































   11 




   17 




   11 




    3 









   23 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Scatterlist Cryptographic API.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2002 David S. Miller (davem@redhat.com)
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 *
 * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
 * and Nettle, by Niels Möller.
 */
#ifndef _LINUX_CRYPTO_H
#define _LINUX_CRYPTO_H

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/refcount_types.h>
#include <linux/slab.h>
#include <linux/types.h>

/*
 * Algorithm masks and types.
 *
 * The type code is the low four bits selected by CRYPTO_ALG_TYPE_MASK.
 * CRYPTO_ALG_TYPE_HASH and CRYPTO_ALG_TYPE_SHASH share the value 0xe.
 */
#define CRYPTO_ALG_TYPE_MASK                0x0000000f
#define CRYPTO_ALG_TYPE_CIPHER                0x00000001
#define CRYPTO_ALG_TYPE_AEAD                0x00000003
#define CRYPTO_ALG_TYPE_LSKCIPHER        0x00000004
#define CRYPTO_ALG_TYPE_SKCIPHER        0x00000005
#define CRYPTO_ALG_TYPE_AKCIPHER        0x00000006
#define CRYPTO_ALG_TYPE_SIG                0x00000007
#define CRYPTO_ALG_TYPE_KPP                0x00000008
#define CRYPTO_ALG_TYPE_ACOMPRESS        0x0000000a
#define CRYPTO_ALG_TYPE_SCOMPRESS        0x0000000b
#define CRYPTO_ALG_TYPE_RNG                0x0000000c
#define CRYPTO_ALG_TYPE_HASH                0x0000000e
#define CRYPTO_ALG_TYPE_SHASH                0x0000000e
#define CRYPTO_ALG_TYPE_AHASH                0x0000000f

/*
 * Masking a type code with 0xe drops bit 0, so both ACOMPRESS (0xa) and
 * SCOMPRESS (0xb) reduce to 0xa under this mask.
 */
#define CRYPTO_ALG_TYPE_ACOMPRESS_MASK        0x0000000e

/*
 * Single-bit flags just above the type field.  CRYPTO_ALG_ASYNC is the bit
 * tested in cra_flags by crypto_tfm_is_async().
 * NOTE(review): LARVAL/DEAD/DYING look like algorithm lifecycle states; their
 * exact semantics are not visible in this header — confirm before relying on
 * them.
 */
#define CRYPTO_ALG_LARVAL                0x00000010
#define CRYPTO_ALG_DEAD                        0x00000020
#define CRYPTO_ALG_DYING                0x00000040
#define CRYPTO_ALG_ASYNC                0x00000080

/*
 * Set if the algorithm (or an algorithm which it uses) requires another
 * algorithm of the same type to handle corner cases.
 */
#define CRYPTO_ALG_NEED_FALLBACK        0x00000100

/*
 * Set if the algorithm data structure should be duplicated into
 * kmalloc memory before registration.  This is useful for hardware
 * that can be disconnected at will.  Do not use this if the data
 * structure is embedded into a bigger one.  Duplicate the overall
 * data structure in the driver in that case.
 */
#define CRYPTO_ALG_DUP_FIRST                0x00000200

/*
 * Set if the algorithm has passed automated run-time testing.  Note that
 * if there is no run-time testing for a given algorithm it is considered
 * to have passed.
 */

#define CRYPTO_ALG_TESTED                0x00000400

/*
 * Set if the algorithm is an instance that is built from templates.
 */
#define CRYPTO_ALG_INSTANCE                0x00000800

/*
 * Set this bit if the algorithm provided is hardware accelerated but
 * not available to userspace via instruction set or so.
 */
#define CRYPTO_ALG_KERN_DRIVER_ONLY        0x00001000

/*
 * Mark a cipher as a service implementation only usable by another
 * cipher and never by a normal user of the kernel crypto API
 */
#define CRYPTO_ALG_INTERNAL                0x00002000

/*
 * Set if the algorithm has a ->setkey() method but can be used without
 * calling it first, i.e. there is a default key.
 */
#define CRYPTO_ALG_OPTIONAL_KEY                0x00004000

/*
 * Don't trigger module loading
 */
#define CRYPTO_NOLOAD                        0x00008000

/*
 * The algorithm may allocate memory during request processing, i.e. during
 * encryption, decryption, or hashing.  Users can request an algorithm with this
 * flag unset if they can't handle memory allocation failures.
 *
 * This flag is currently only implemented for algorithms of type "skcipher",
 * "aead", "ahash", "shash", and "cipher".  Algorithms of other types might not
 * have this flag set even if they allocate memory.
 *
 * In some edge cases, algorithms can allocate memory regardless of this flag.
 * To avoid these cases, users must obey the following usage constraints:
 *    skcipher:
 *        - The IV buffer and all scatterlist elements must be aligned to the
 *          algorithm's alignmask.
 *        - If the data were to be divided into chunks of size
 *          crypto_skcipher_walksize() (with any remainder going at the end), no
 *          chunk can cross a page boundary or a scatterlist element boundary.
 *    aead:
 *        - The IV buffer and all scatterlist elements must be aligned to the
 *          algorithm's alignmask.
 *        - The first scatterlist element must contain all the associated data,
 *          and its pages must be !PageHighMem.
 *        - If the plaintext/ciphertext were to be divided into chunks of size
 *          crypto_aead_walksize() (with the remainder going at the end), no chunk
 *          can cross a page boundary or a scatterlist element boundary.
 *    ahash:
 *        - crypto_ahash_finup() must not be used unless the algorithm implements
 *          ->finup() natively.
 */
#define CRYPTO_ALG_ALLOCATES_MEMORY        0x00010000

/*
 * Mark an algorithm as a service implementation only usable by a
 * template and never by a normal user of the kernel crypto API.
 * This is intended to be used by algorithms that are themselves
 * not FIPS-approved but may instead be used to implement parts of
 * a FIPS-approved algorithm (e.g., dh vs. ffdhe2048(dh)).
 */
#define CRYPTO_ALG_FIPS_INTERNAL        0x00020000

/* Set if the algorithm supports virtual addresses. */
#define CRYPTO_ALG_REQ_VIRT                0x00040000

/* Set if the algorithm cannot have a fallback (e.g., phmac). */
#define CRYPTO_ALG_NO_FALLBACK                0x00080000

/* The high bits 0xff000000 are reserved for type-specific flags. */

/*
 * Transform masks and values (for crt_flags).
 *
 * The CRYPTO_TFM_REQ_* bits all lie inside CRYPTO_TFM_REQ_MASK (bits 8-19).
 * They are also stored in crypto_async_request.flags, where
 * CRYPTO_TFM_REQ_ON_STACK is tested by crypto_req_on_stack(), preserved by
 * crypto_request_set_callback() and cleared by crypto_request_set_tfm().
 */
#define CRYPTO_TFM_NEED_KEY                0x00000001

#define CRYPTO_TFM_REQ_MASK                0x000fff00
#define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS        0x00000100
#define CRYPTO_TFM_REQ_MAY_SLEEP        0x00000200
#define CRYPTO_TFM_REQ_MAY_BACKLOG        0x00000400
#define CRYPTO_TFM_REQ_ON_STACK                0x00000800

/*
 * Miscellaneous stuff.
 */
#define CRYPTO_MAX_ALG_NAME                128

/*
 * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual
 * declaration) is used to ensure that the crypto_tfm context structure is
 * aligned correctly for the given architecture so that there are no alignment
 * faults for C data types.  On architectures that support non-cache coherent
 * DMA, such as ARM or arm64, it also takes into account the minimal alignment
 * that is required to ensure that the context struct member does not share any
 * cachelines with the rest of the struct. This is needed to ensure that cache
 * maintenance for non-coherent DMA (cache invalidation in particular) does not
 * affect data that may be accessed by the CPU concurrently.
 */
#define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN

#define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN)))

struct crypto_tfm;
struct crypto_type;
struct module;

typedef void (*crypto_completion_t)(void *req, int err);

/**
 * DOC: Block Cipher Context Data Structures
 *
 * These data structures define the operating context for each block cipher
 * type.
 */

/*
 * Common header of an asynchronous crypto request.  @complete and @data are
 * installed by crypto_request_set_callback(), @tfm by
 * crypto_request_set_tfm().
 */
struct crypto_async_request {
        struct list_head list; /* NOTE(review): presumably queue linkage — confirm */
        crypto_completion_t complete; /* completion callback */
        void *data; /* caller data stored alongside @complete */
        struct crypto_tfm *tfm; /* transform this request is bound to */

        u32 flags; /* CRYPTO_TFM_REQ_* bits */
};

/**
 * DOC: Block Cipher Algorithm Definitions
 *
 * These data structures define modular crypto algorithm implementations,
 * managed via crypto_register_alg() and crypto_unregister_alg().
 */

/**
 * struct cipher_alg - single-block symmetric ciphers definition
 * @cia_min_keysize: Minimum key size supported by the transformation. This is
 *                     the smallest key length supported by this transformation
 *                     algorithm. This must be set to one of the pre-defined
 *                     values as this is not hardware specific. Possible values
 *                     for this field can be found via git grep "_MIN_KEY_SIZE"
 *                     include/crypto/
 * @cia_max_keysize: Maximum key size supported by the transformation. This is
 *                    the largest key length supported by this transformation
 *                    algorithm. This must be set to one of the pre-defined values
 *                    as this is not hardware specific. Possible values for this
 *                    field can be found via git grep "_MAX_KEY_SIZE"
 *                    include/crypto/
 * @cia_setkey: Set key for the transformation. This function is used to either
 *                program a supplied key into the hardware or store the key in the
 *                transformation context for programming it later. Note that this
 *                function does modify the transformation context. This function
 *                can be called multiple times during the existence of the
 *                transformation object, so one must make sure the key is properly
 *                reprogrammed into the hardware. This function is also
 *                responsible for checking the key length for validity.
 * @cia_encrypt: Encrypt a single block. This function is used to encrypt a
 *                 single block of data, which must be @cra_blocksize big. This
 *                 always operates on a full @cra_blocksize and it is not possible
 *                 to encrypt a block of smaller size. The supplied buffers must
 *                 therefore also be at least of @cra_blocksize size. Both the
 *                 input and output buffers are always aligned to @cra_alignmask.
 *                 In case either of the input or output buffer supplied by user
 *                 of the crypto API is not aligned to @cra_alignmask, the crypto
 *                 API will re-align the buffers. The re-alignment means that a
 *                 new buffer will be allocated, the data will be copied into the
 *                 new buffer, then the processing will happen on the new buffer,
 *                 then the data will be copied back into the original buffer and
 *                 finally the new buffer will be freed. In case a software
 *                 fallback was put in place in the @cra_init call, this function
 *                 might need to use the fallback if the algorithm doesn't support
 *                 all of the key sizes. In case the key was stored in
 *                 transformation context, the key might need to be re-programmed
 *                 into the hardware in this function. This function shall not
 *                 modify the transformation context, as this function may be
 *                 called in parallel with the same transformation object.
 * @cia_decrypt: Decrypt a single block. This is a reverse counterpart to
 *                 @cia_encrypt, and the conditions are exactly the same.
 *
 * All fields are mandatory and must be filled.
 */
struct cipher_alg {
        unsigned int cia_min_keysize;
        unsigned int cia_max_keysize;
        int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key,
                          unsigned int keylen);
        void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
        void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
};

#define cra_cipher        cra_u.cipher

/**
 * struct crypto_alg - definition of a cryptographic cipher algorithm
 * @cra_flags: Flags describing this transformation. See include/linux/crypto.h
 *               CRYPTO_ALG_* flags for the flags which go in here. Those are
 *               used for fine-tuning the description of the transformation
 *               algorithm.
 * @cra_blocksize: Minimum block size of this transformation. The size in bytes
 *                   of the smallest possible unit which can be transformed with
 *                   this algorithm. The users must respect this value.
 *                   In case of HASH transformation, it is possible for a smaller
 *                   block than @cra_blocksize to be passed to the crypto API for
 *                   transformation, in case of any other transformation type, an
 *                    error will be returned upon any attempt to transform smaller
 *                   than @cra_blocksize chunks.
 * @cra_ctxsize: Size of the operational context of the transformation. This
 *                 value informs the kernel crypto API about the memory size
 *                 needed to be allocated for the transformation context.
 * @cra_alignmask: For cipher, skcipher, lskcipher, and aead algorithms this is
 *                   1 less than the alignment, in bytes, that the algorithm
 *                   implementation requires for input and output buffers.  When
 *                   the crypto API is invoked with buffers that are not aligned
 *                   to this alignment, the crypto API automatically utilizes
 *                   appropriately aligned temporary buffers to comply with what
 *                   the algorithm needs.  (For scatterlists this happens only if
 *                   the algorithm uses the skcipher_walk helper functions.)  This
 *                   misalignment handling carries a performance penalty, so it is
 *                   preferred that algorithms do not set a nonzero alignmask.
 *                   Also, crypto API users may wish to allocate buffers aligned
 *                   to the alignmask of the algorithm being used, in order to
 *                   avoid the API having to realign them.  Note: the alignmask is
 *                   not supported for hash algorithms and is always 0 for them.
 * @cra_reqsize: Size of the request context for this algorithm.
 * @cra_priority: Priority of this transformation implementation. In case
 *                  multiple transformations with same @cra_name are available to
 *                  the Crypto API, the kernel will use the one with highest
 *                  @cra_priority.
 * @cra_name: Generic name (usable by multiple implementations) of the
 *              transformation algorithm. This is the name of the transformation
 *              itself. This field is used by the kernel when looking up the
 *              providers of particular transformation.
 * @cra_driver_name: Unique name of the transformation provider. This is the
 *                     name of the provider of the transformation. This can be any
 *                     arbitrary value, but in the usual case, this contains the
 *                     name of the chip or provider and the name of the
 *                     transformation algorithm.
 * @cra_type: Type of the cryptographic transformation. This is a pointer to
 *              struct crypto_type, which implements callbacks common for all
 *              transformation types. There are multiple options, such as
 *              &crypto_skcipher_type, &crypto_ahash_type, &crypto_rng_type.
 *              This field might be empty. In that case, there are no common
 *              callbacks. This is the case for: cipher.
 * @cra_u: Callbacks implementing the transformation. This is a union of
 *           multiple structures. Depending on the type of transformation selected
 *           by @cra_type and @cra_flags above, the associated structure must be
 *           filled with callbacks. This field might be empty. This is the case
 *           for ahash, shash.
 * @cra_init: Deprecated, do not use.
 * @cra_exit: Deprecated, do not use.
 * @cra_u.cipher: Union member which contains a single-block symmetric cipher
 *                  definition. See &struct cipher_alg.
 * @cra_module: Owner of this transformation implementation. Set to THIS_MODULE
 * @cra_list: internally used
 * @cra_users: internally used
 * @cra_refcnt: internally used
 * @cra_destroy: internally used
 *
 * The struct crypto_alg describes a generic Crypto API algorithm and is common
 * for all of the transformations. Any variable not documented here shall not
 * be used by a cipher implementation as it is internal to the Crypto API.
 */
struct crypto_alg {
        struct list_head cra_list;
        struct list_head cra_users;

        u32 cra_flags;
        unsigned int cra_blocksize;
        unsigned int cra_ctxsize;
        unsigned int cra_alignmask;
        unsigned int cra_reqsize;

        int cra_priority;
        refcount_t cra_refcnt;

        char cra_name[CRYPTO_MAX_ALG_NAME];
        char cra_driver_name[CRYPTO_MAX_ALG_NAME];

        const struct crypto_type *cra_type;

        union {
                struct cipher_alg cipher;
        } cra_u;

        int (*cra_init)(struct crypto_tfm *tfm);
        void (*cra_exit)(struct crypto_tfm *tfm);
        void (*cra_destroy)(struct crypto_alg *alg);
        
        struct module *cra_module;
} CRYPTO_MINALIGN_ATTR;

/*
 * A helper struct for waiting for completion of async crypto ops.
 * crypto_wait_req() blocks on @completion and then returns @err.
 */
struct crypto_wait {
        struct completion completion; /* waited on by crypto_wait_req() */
        int err; /* result read back once @completion has fired */
};

/*
 * Declare a struct crypto_wait named @_wait on the stack, with its
 * completion initialised for on-stack use and its error code set to 0.
 */
#define DECLARE_CRYPTO_WAIT(_wait) \
        struct crypto_wait _wait = { \
                .completion = \
                        COMPLETION_INITIALIZER_ONSTACK((_wait).completion), \
                .err = 0, \
        }

/*
 * Async ops completion helper functions
 */
void crypto_req_done(void *req, int err);

/*
 * Resolve the result of a possibly asynchronous operation.  When @err is
 * -EINPROGRESS or -EBUSY, block until @wait completes, re-arm the completion
 * for reuse and return the stored error; any other @err is returned as is.
 */
static inline int crypto_wait_req(int err, struct crypto_wait *wait)
{
        if (err != -EINPROGRESS && err != -EBUSY)
                return err;

        wait_for_completion(&wait->completion);
        reinit_completion(&wait->completion);

        return wait->err;
}

/* Initialise the completion embedded in @wait; @wait->err is left untouched. */
static inline void crypto_init_wait(struct crypto_wait *wait)
{
        struct completion *done = &wait->completion;

        init_completion(done);
}

/*
 * Algorithm query interface.
 */
int crypto_has_alg(const char *name, u32 type, u32 mask);

/*
 * Transforms: user-instantiated objects which encapsulate algorithms
 * and core processing logic.  Managed via crypto_alloc_*() and
 * crypto_free_*(), as well as the various helpers below.
 */

/*
 * Common transform header.  @crt_flags is accessed through
 * crypto_tfm_{get,set,clear}_flags() and @__crt_alg through the
 * crypto_tfm_alg_*() helpers.
 */
struct crypto_tfm {
        refcount_t refcnt;

        u32 crt_flags; /* CRYPTO_TFM_* bits */

        int node; /* NOTE(review): presumably a NUMA node id — confirm */

        struct crypto_tfm *fb; /* NOTE(review): presumably a fallback tfm — confirm */

        void (*exit)(struct crypto_tfm *tfm);

        struct crypto_alg *__crt_alg; /* algorithm backing this transform */

        /* Trailing context area, aligned to CRYPTO_MINALIGN. */
        void *__crt_ctx[] CRYPTO_MINALIGN_ATTR;
};

/* 
 * Transform user interface.
 */
 
struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask);
void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm);

/*
 * Free a base transform by handing @tfm to crypto_destroy_tfm() as both the
 * memory block and the transform.
 *
 * crypto_destroy_tfm() returns void, so it is called as a plain statement:
 * "return <void expression>;" in a void function is a constraint violation
 * in ISO C and is only tolerated as a compiler extension.
 */
static inline void crypto_free_tfm(struct crypto_tfm *tfm)
{
        crypto_destroy_tfm(tfm, tfm);
}

/*
 * Transform helpers which query the underlying algorithm.
 */
static inline const char *crypto_tfm_alg_name(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_name;
}

static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_driver_name;
}

static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_blocksize;
}

static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_alignmask;
}

static inline unsigned int crypto_tfm_alg_reqsize(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_reqsize;
}

/* Return the current crt_flags word of @tfm. */
static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm)
{
        const u32 cur = tfm->crt_flags;

        return cur;
}

/* Set every bit of @flags in @tfm's crt_flags; other bits are unchanged. */
static inline void crypto_tfm_set_flags(struct crypto_tfm *tfm, u32 flags)
{
        tfm->crt_flags = tfm->crt_flags | flags;
}

/* Clear every bit of @flags in @tfm's crt_flags; other bits are unchanged. */
static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags)
{
        tfm->crt_flags = tfm->crt_flags & ~flags;
}

/*
 * Return the alignment of the __crt_ctx member of struct crypto_tfm.  The
 * operand of __alignof__ is not evaluated, so no object is dereferenced.
 */
static inline unsigned int crypto_tfm_ctx_alignment(void)
{
        return __alignof__(((struct crypto_tfm *)0)->__crt_ctx);
}

static inline bool crypto_tfm_is_async(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC;
}

/* Tell whether @req carries the CRYPTO_TFM_REQ_ON_STACK flag. */
static inline bool crypto_req_on_stack(struct crypto_async_request *req)
{
        return (req->flags & CRYPTO_TFM_REQ_ON_STACK) != 0;
}

/*
 * Install the completion callback @compl with its @data on @req and replace
 * the request flags with @flags.  The CRYPTO_TFM_REQ_ON_STACK bit is never
 * taken from @flags: its existing value in @req is kept.
 */
static inline void crypto_request_set_callback(
        struct crypto_async_request *req, u32 flags,
        crypto_completion_t compl, void *data)
{
        const u32 sticky = CRYPTO_TFM_REQ_ON_STACK;
        const u32 kept = req->flags & sticky;

        req->data = data;
        req->complete = compl;
        req->flags = kept | (flags & ~sticky);
}

/* Bind @req to @tfm and drop the CRYPTO_TFM_REQ_ON_STACK flag from @req. */
static inline void crypto_request_set_tfm(struct crypto_async_request *req,
                                          struct crypto_tfm *tfm)
{
        req->flags = req->flags & ~CRYPTO_TFM_REQ_ON_STACK;
        req->tfm = tfm;
}

struct crypto_async_request *crypto_request_clone(
        struct crypto_async_request *req, size_t total, gfp_t gfp);

/*
 * Initialise an on-stack request: reset its flags, bind it to @tfm, then
 * mark it with CRYPTO_TFM_REQ_ON_STACK.
 */
static inline void crypto_stack_request_init(struct crypto_async_request *req,
                                             struct crypto_tfm *tfm)
{
        req->flags = 0;
        crypto_request_set_tfm(req, tfm);
        /* Must come last: crypto_request_set_tfm() clears this bit. */
        req->flags |= CRYPTO_TFM_REQ_ON_STACK;
}

#endif        /* _LINUX_CRYPTO_H */



























































    9 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM notifier

#if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NOTIFIERS_H

#include <linux/tracepoint.h>

/*
 * notifier_info - event class shared by the notifier tracepoints
 *
 * @cb:                callback pointer
 *
 * The class records @cb in the trace entry and prints it with "%ps".
 */
DECLARE_EVENT_CLASS(notifier_info,

        TP_PROTO(void *cb),

        TP_ARGS(cb),

        TP_STRUCT__entry(
                __field(void *, cb)
        ),

        TP_fast_assign(
                __entry->cb = cb;
        ),

        TP_printk("%ps", __entry->cb)
);

/*
 * notifier_register - called upon notifier callback registration
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class, so it uses that class's
 * entry layout and output format.
 */
DEFINE_EVENT(notifier_info, notifier_register,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

/*
 * notifier_unregister - called upon notifier callback unregistration
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class, so it uses that class's
 * entry layout and output format.
 */
DEFINE_EVENT(notifier_info, notifier_unregister,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

/*
 * notifier_run - called upon notifier callback execution
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class, so it uses that class's
 * entry layout and output format.
 */
DEFINE_EVENT(notifier_info, notifier_run,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

#endif /* _TRACE_NOTIFIERS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        inet6 interface/address list definitions
 *        Linux INET6 implementation 
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>        
 */

#ifndef _NET_IF_INET6_H
#define _NET_IF_INET6_H

#include <net/snmp.h>
#include <linux/ipv6.h>
#include <linux/refcount.h>

/*
 * inet6_dev.if_flags
 *
 * Bit masks for the if_flags word of struct inet6_dev.  IF_READY is the top
 * bit of that 32-bit word; the remaining masks sit in the low byte.
 */

#define IF_RA_OTHERCONF        0x80
#define IF_RA_MANAGED        0x40
#define IF_RA_RCVD        0x20
#define IF_RS_SENT        0x10
#define IF_READY        0x80000000

/*
 * Address state constants.  The enum is anonymous, so these are plain int
 * constants numbered from 0 in the order listed.
 * NOTE(review): presumably the values held in inet6_ifaddr.state -- confirm
 * at the users.
 */
enum {
        INET6_IFADDR_STATE_PREDAD,
        INET6_IFADDR_STATE_DAD,
        INET6_IFADDR_STATE_POSTDAD,
        INET6_IFADDR_STATE_ERRDAD,
        INET6_IFADDR_STATE_DEAD,
};

/*
 * One IPv6 address entry.
 *
 * Holds the address itself with its prefix length and lifetimes, a refcount
 * and a spinlock, DAD bookkeeping (dad_probes, dad_nonce, dad_work), a
 * pointer to an inet6_dev (idev) and the linkage used to place the entry on
 * several lists (addr_lst, if_list, if_list_aux, tmp_list).  The structure
 * embeds an rcu_head.
 * NOTE(review): which lock protects which field is not visible in this
 * header; check the users before relying on any particular rule.
 */
struct inet6_ifaddr {
        struct in6_addr                addr;
        __u32                        prefix_len;
        __u32                        rt_priority;

        /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
        __u32                        valid_lft;
        __u32                        prefered_lft;
        refcount_t                refcnt;
        spinlock_t                lock;

        /* presumably one of INET6_IFADDR_STATE_* -- confirm at the users */
        int                        state;

        __u32                        flags;
        __u8                        dad_probes;
        __u8                        stable_privacy_retry;

        __u16                        scope;
        __u64                        dad_nonce;

        unsigned long                cstamp;        /* created timestamp */
        unsigned long                tstamp; /* updated timestamp */

        struct delayed_work        dad_work;

        struct inet6_dev        *idev;
        struct fib6_info        *rt;

        struct hlist_node        addr_lst;
        struct list_head        if_list;
        /*
         * Used to safely traverse idev->addr_list in process context
         * if the idev->lock needed to protect idev->addr_list cannot be held.
         * In that case, add the items to this list temporarily and iterate
         * without holding idev->lock.
         * See addrconf_ifdown and dev_forward_change.
         */
        struct list_head        if_list_aux;

        struct list_head        tmp_list;
        struct inet6_ifaddr        *ifpub;
        int                        regen_count;

        bool                        tokenized;

        u8                        ifa_proto;

        struct rcu_head                rcu;
        struct in6_addr                peer_addr;
};

/*
 * Variable-length array of IPv6 addresses.
 *
 * sl_addr[] is a flexible array member annotated __counted_by(sl_max), so
 * sl_max is the number of elements the allocation has room for.
 * NOTE(review): sl_count is presumably the number of entries currently in
 * use -- confirm at the users.
 */
struct ip6_sf_socklist {
        unsigned int                sl_max;
        unsigned int                sl_count;
        struct rcu_head                rcu;
        struct in6_addr                sl_addr[] __counted_by(sl_max);
};

#define IP6_SFBLOCK        10        /* allocate this many at once */

/*
 * Node of a singly linked list (next, __rcu) pairing an IPv6 address with
 * an interface index and a filter mode.  sflist is an __rcu pointer to an
 * ip6_sf_socklist.
 * NOTE(review): going by the name this is per-socket multicast membership
 * state -- confirm at the users.
 */
struct ipv6_mc_socklist {
        struct in6_addr                addr;
        int                        ifindex;
        unsigned int                sfmode;                /* MCAST_{INCLUDE,EXCLUDE} */
        struct ipv6_mc_socklist __rcu *next;
        struct ip6_sf_socklist        __rcu *sflist;
        struct rcu_head                rcu;
};

/*
 * Node of a singly linked list (sf_next, __rcu) holding one source address
 * together with a pair of include/exclude counters and some per-source
 * state bytes.
 */
struct ip6_sf_list {
        struct ip6_sf_list __rcu *sf_next;
        struct in6_addr                sf_addr;
        unsigned long                sf_count[2];        /* include/exclude counts */
        unsigned char                sf_gsresp;        /* include in g & s response? */
        unsigned char                sf_oldin;        /* change state */
        unsigned char                sf_crcount;        /* retrans. left to send */
        struct rcu_head                rcu;
};

/*
 * Single-bit flag masks.
 * NOTE(review): presumably stored in ifmcaddr6.mca_flags -- confirm at the
 * users.
 */
#define MAF_TIMER_RUNNING        0x01
#define MAF_LAST_REPORTER        0x02
#define MAF_LOADED                0x04
#define MAF_NOREPORT                0x08
#define MAF_GSQUERY                0x10

/*
 * Node of a singly linked list (next, __rcu) describing one multicast
 * address (mca_addr) and pointing at an inet6_dev (idev).
 *
 * Carries two __rcu lists of ip6_sf_list entries (mca_sources, mca_tomb), a
 * filter mode with a pair of counters, a delayed work item, a user count,
 * a refcount and two timestamps.
 * NOTE(review): mca_sfcount[2] presumably mirrors the include/exclude
 * layout of ip6_sf_list.sf_count -- confirm at the users.
 */
struct ifmcaddr6 {
        struct in6_addr                mca_addr;
        struct inet6_dev        *idev;
        struct ifmcaddr6        __rcu *next;
        struct ip6_sf_list        __rcu *mca_sources;
        struct ip6_sf_list        __rcu *mca_tomb;
        unsigned int                mca_sfmode;
        unsigned char                mca_crcount;
        unsigned long                mca_sfcount[2];
        struct delayed_work        mca_work;
        unsigned int                mca_flags;
        int                        mca_users;
        refcount_t                mca_refcnt;
        unsigned long                mca_cstamp;
        unsigned long                mca_tstamp;
        struct rcu_head                rcu;
};

/* Anycast stuff */

/*
 * Node of a singly linked list (acl_next, a plain pointer without an __rcu
 * annotation) pairing an anycast address with an interface index.
 */
struct ipv6_ac_socklist {
        struct in6_addr                acl_addr;
        int                        acl_ifindex;
        struct ipv6_ac_socklist *acl_next;
};

/*
 * Anycast address entry: the address, a fib6_info pointer, an __rcu next
 * pointer, hash-list linkage (aca_addr_lst), a user count, a refcount and
 * two timestamps.  The structure embeds an rcu_head.
 */
struct ifacaddr6 {
        struct in6_addr                aca_addr;
        struct fib6_info        *aca_rt;
        struct ifacaddr6 __rcu        *aca_next;
        struct hlist_node        aca_addr_lst;
        int                        aca_users;
        refcount_t                aca_refcnt;
        unsigned long                aca_cstamp;
        unsigned long                aca_tstamp;
        struct rcu_head                rcu;
};

/*
 * Short aliases for the IPV6_ADDR_LOOPBACK, IPV6_ADDR_LINKLOCAL and
 * IPV6_ADDR_SITELOCAL constants, which are defined outside this header.
 */
#define        IFA_HOST        IPV6_ADDR_LOOPBACK
#define        IFA_LINK        IPV6_ADDR_LINKLOCAL
#define        IFA_SITE        IPV6_ADDR_SITELOCAL

/*
 * Statistics holder embedded in struct inet6_dev (field "stats"): a proc
 * directory entry pointer plus three MIB members declared through the
 * DEFINE_SNMP_STAT / DEFINE_SNMP_STAT_ATOMIC helper macros.
 */
struct ipv6_devstat {
        struct proc_dir_entry        *proc_dir_entry;
        DEFINE_SNMP_STAT(struct ipstats_mib, ipv6);
        DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev);
        DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev);
};

/*
 * IPv6 state attached to a net_device (dev).
 *
 * Groups, in order: the address list head; the multicast lists (mc_list,
 * mc_tomb) with their MLD parameters, delayed work items, packet queues and
 * locks; the anycast list (ac_list); a rwlock, refcount, the if_flags word
 * and a "dead" marker; temporary-address bookkeeping; neighbour parameters,
 * per-device configuration (cnf) and statistics (stats); and the router
 * solicitation timer state.  The structure embeds an rcu_head.
 * NOTE(review): the exact locking rules for each group are not visible in
 * this header.
 */
struct inet6_dev {
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;

        struct list_head        addr_list;

        struct ifmcaddr6        __rcu *mc_list;
        struct ifmcaddr6        __rcu *mc_tomb;

        unsigned char                mc_qrv;                /* Query Robustness Variable */
        unsigned char                mc_gq_running;
        unsigned char                mc_ifc_count;
        unsigned char                mc_dad_count;

        unsigned long                mc_v1_seen;        /* Max time we stay in MLDv1 mode */
        unsigned long                mc_qi;                /* Query Interval */
        unsigned long                mc_qri;                /* Query Response Interval */
        unsigned long                mc_maxdelay;

        struct delayed_work        mc_gq_work;        /* general query work */
        struct delayed_work        mc_ifc_work;        /* interface change work */
        struct delayed_work        mc_dad_work;        /* dad complete mc work */
        struct delayed_work        mc_query_work;        /* mld query work */
        struct delayed_work        mc_report_work;        /* mld report work */

        struct sk_buff_head        mc_query_queue;                /* mld query queue */
        struct sk_buff_head        mc_report_queue;        /* mld report queue */

        spinlock_t                mc_query_lock;        /* mld query queue lock */
        spinlock_t                mc_report_lock;        /* mld query report lock */
        struct mutex                mc_lock;        /* mld global lock */

        struct ifacaddr6 __rcu        *ac_list;
        rwlock_t                lock;
        refcount_t                refcnt;
        __u32                        if_flags;        /* IF_* bit masks */
        int                        dead;

        u32                        desync_factor;
        struct list_head        tempaddr_list;

        struct in6_addr                token;

        struct neigh_parms        *nd_parms;
        struct ipv6_devconf        cnf;
        struct ipv6_devstat        stats;

        struct timer_list        rs_timer;
        __s32                        rs_interval;        /* in jiffies */
        __u8                        rs_probes;

        unsigned long                tstamp; /* ipv6InterfaceTable update timestamp */
        struct rcu_head                rcu;

        unsigned int                ra_mtu;
};

/*
 * Build the 6-byte link-layer multicast address for @addr in @buf.
 *
 *      +-------+-------+-------+-------+-------+-------+
 *      |   33  |   33  | DST13 | DST14 | DST15 | DST16 |
 *      +-------+-------+-------+-------+-------+-------+
 *
 * Bytes 0-1 are the fixed 0x33 0x33 prefix; bytes 2-5 are the last 32-bit
 * word of @addr copied verbatim.  @buf must have room for 6 bytes.
 */
static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf)
{
        char *tail = buf + 2;

        buf[1] = 0x33;
        buf[0] = 0x33;

        memcpy(tail, &addr->s6_addr32[3], sizeof(__u32));
}

/*
 * Write the single-byte mapping for @addr into @buf: the result is always
 * 0x00 and @addr is not examined.
 */
static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf)
{
        *buf = 0x00;
}

/*
 * Build the 20-byte mapping of @addr into @buf, using fields taken from the
 * device's @broadcast address.
 *
 * Layout written: byte 0 reserved (0), bytes 1-4 the multicast QPN (all
 * 0xff), byte 5 is 0x10 combined with the low nibble of broadcast[5]
 * (scope), bytes 6-7 the IPv6 signature 0x60 0x1b, bytes 8-9 the P_Key
 * copied from @broadcast, and bytes 10-19 the last ten bytes of @addr.
 */
static inline void ipv6_ib_mc_map(const struct in6_addr *addr,
                                  const unsigned char *broadcast, char *buf)
{
        const unsigned char scope = broadcast[5] & 0xF;
        char *out = buf;

        *out++ = 0;                        /* Reserved */
        *out++ = 0xff;                        /* Multicast QPN */
        *out++ = 0xff;
        *out++ = 0xff;
        *out++ = 0xff;
        *out++ = 0x10 | scope;                /* scope from broadcast address */
        *out++ = 0x60;                        /* IPv6 signature */
        *out++ = 0x1b;
        *out++ = broadcast[8];                /* P_Key */
        *out++ = broadcast[9];

        /* out now points at buf + 10 */
        memcpy(out, addr->s6_addr + 6, 10);
}

/*
 * Produce the 4-byte mapping for @addr in @buf.
 *
 * If the first four bytes of @broadcast are not all zero they are used as
 * the result.  Otherwise @addr must be a v4-mapped address (first two words
 * zero, third word equal to htonl(0x0000ffff)); its last word is then
 * copied out.
 *
 * Return: 0 on success, -EINVAL when @broadcast is zero and @addr is not
 * v4-mapped (@buf is left untouched in that case).
 */
static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr,
                                    const unsigned char *broadcast, char *buf)
{
        if (broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) {
                memcpy(buf, broadcast, 4);
                return 0;
        }

        /* v4mapped? */
        if (addr->s6_addr32[0] | addr->s6_addr32[1] |
            (addr->s6_addr32[2] ^ htonl(0x0000ffff)))
                return -EINVAL;

        memcpy(buf, &addr->s6_addr32[3], 4);
        return 0;
}

#endif


















































































































































































































  200 


















    4 

















































   18 








  319 


































   19 























































































































































































































































































































































































































































































































































































































































































































































  318 












































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Macros for manipulating and testing page->flags
 */

#ifndef PAGE_FLAGS_H
#define PAGE_FLAGS_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
#endif /* !__GENERATING_BOUNDS_H */

/*
 * Various page->flags bits:
 *
 * PG_reserved is set for special pages. The "struct page" of such a page
 * should in general not be touched (e.g. set dirty) except by its owner.
 * Pages marked as PG_reserved include:
 * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
 *   initrd, HW tables)
 * - Pages reserved or allocated early during boot (before the page allocator
 *   was initialized). This includes (depending on the architecture) the
 *   initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
 *   much more. Once (if ever) freed, PG_reserved is cleared and they will
 *   be given to the page allocator.
 * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
 *   to read/write these pages might end badly. Don't touch!
 * - The zero page(s)
 * - Pages allocated in the context of kexec/kdump (loaded kernel image,
 *   control pages, vmcoreinfo)
 * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
 *   not marked PG_reserved (as they might be in use by somebody else who does
 *   not respect the caching strategy).
 * - MCA pages on ia64
 * - Pages holding CPU notes for POWER Firmware Assisted Dump
 * - Device memory (e.g. PMEM, DAX, HMM)
 * Some PG_reserved pages will be excluded from the hibernation image.
 * PG_reserved does in general not hinder anybody from dumping or swapping
 * and is no longer required for remap_pfn_range(). ioremap might require it.
 * Consequently, PG_reserved for a page mapped into user space can indicate
 * the zero page, the vDSO, MMIO pages or device memory.
 *
 * The PG_private bitflag is set on pagecache pages if they contain filesystem
 * specific data (which is normally at page->private). It can be used by
 * private allocations for its own usage.
 *
 * During initiation of disk I/O, PG_locked is set. This bit is set before I/O
 * and cleared when writeback _starts_ or when read _completes_. PG_writeback
 * is set before writeback starts and cleared when it finishes.
 *
 * PG_locked also pins a page in pagecache, and blocks truncation of the file
 * while it is held.
 *
 * page_waitqueue(page) is a wait queue of all tasks waiting for the page
 * to become unlocked.
 *
 * PG_swapbacked is set when a page uses swap as a backing storage.  These are
 * usually PageAnon or shmem pages but please note that even anonymous pages
 * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
 * a result of MADV_FREE).
 *
 * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
 * file-backed pagecache (see mm/vmscan.c).
 *
 * PG_arch_1 is an architecture specific page state bit.  The generic code
 * guarantees that this bit is cleared for a page when it first is entered into
 * the page cache.
 *
 * PG_hwpoison indicates that a page got corrupted in hardware and contains
 * data with incorrect ECC bits that triggered a machine check. Accessing is
 * not safe since it may cause another machine check. Don't touch!
 */

/*
 * Don't use the pageflags directly.  Use the PageFoo macros.
 *
 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *
 * The fields area is reserved for fields mapping zone, node (for NUMA) and
 * SPARSEMEM section (for variants of SPARSEMEM that require section ids like
 * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
 */
/*
 * Bit numbers within the page flags word.
 *
 * Every enumerator listed before __NR_PAGEFLAGS gets its own bit (some only
 * when the named CONFIG option is enabled).  The enumerators after
 * __NR_PAGEFLAGS do not allocate new bits: each is an alias that reuses one
 * of the bits above for a different owner or context.
 */
enum pageflags {
        PG_locked,                /* Page is locked. Don't touch. */
        PG_writeback,                /* Page is under writeback */
        PG_referenced,
        PG_uptodate,
        PG_dirty,
        PG_lru,
        PG_head,                /* Must be in bit 6 */
        PG_waiters,                /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
        PG_active,
        PG_workingset,
        PG_owner_priv_1,        /* Owner use. If pagecache, fs may use */
        PG_owner_2,                /* Owner use. If pagecache, fs may use */
        PG_arch_1,
        PG_reserved,
        PG_private,                /* If pagecache, has fs-private data */
        PG_private_2,                /* If pagecache, has fs aux data */
        PG_reclaim,                /* To be reclaimed asap */
        PG_swapbacked,                /* Page is backed by RAM/swap */
        PG_unevictable,                /* Page is "unevictable"  */
        PG_dropbehind,                /* drop pages on IO completion */
#ifdef CONFIG_MMU
        PG_mlocked,                /* Page is vma mlocked */
#endif
#ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,                /* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
        PG_young,
        PG_idle,
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
        PG_arch_2,
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
        PG_arch_3,
#endif
        __NR_PAGEFLAGS,

        /* Everything below aliases one of the bits allocated above. */

        PG_readahead = PG_reclaim,

        /* Anonymous memory (and shmem) */
        PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
        /* Some filesystems */
        PG_checked = PG_owner_priv_1,

        /*
         * Depending on the way an anonymous folio can be mapped into a page
         * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped
         * THP), PG_anon_exclusive may be set only for the head page or for
         * tail pages of an anonymous folio. For now, we only expect it to be
         * set on tail pages for PTE-mapped THP.
         */
        PG_anon_exclusive = PG_owner_2,

        /*
         * Set if all buffer heads in the folio are mapped.
         * Filesystems which do not use BHs can use it for their own purpose.
         */
        PG_mappedtodisk = PG_owner_2,

        /* Two page bits are conscripted by FS-Cache to maintain local caching
         * state.  These bits are set on pages belonging to the netfs's inodes
         * when those inodes are being locally cached.
         */
        PG_fscache = PG_private_2,        /* page backed by cache */

        /* XEN */
        /* Pinned in Xen as a read-only pagetable page. */
        PG_pinned = PG_owner_priv_1,
        /* Pinned as part of domain save (see xen_mm_pin_all()). */
        PG_savepinned = PG_dirty,
        /* Has a grant mapping of another (foreign) domain's page. */
        PG_foreign = PG_owner_priv_1,
        /* Remapped by swiotlb-xen. */
        PG_xen_remapped = PG_owner_priv_1,

#ifdef CONFIG_MIGRATION
        /* movable_ops page that is isolated for migration */
        PG_movable_ops_isolated = PG_reclaim,
        /* this is a movable_ops page (for selected typed pages only) */
        PG_movable_ops = PG_uptodate,
#endif

        /* Only valid for buddy pages. Used to track pages that are reported */
        PG_reported = PG_uptodate,

#ifdef CONFIG_MEMORY_HOTPLUG
        /* For self-hosted memmap pages */
        PG_vmemmap_self_hosted = PG_owner_priv_1,
#endif

        /*
         * Flags only valid for compound pages.  Stored in first tail page's
         * flags word.  Cannot use the first 8 flags or any flag marked as
         * PF_ANY.
         */

        /* At least one page in this folio has the hwpoison flag set */
        PG_has_hwpoisoned = PG_active,
        PG_large_rmappable = PG_workingset, /* anon or file-backed */
        PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
};

/* Mask with the low NR_PAGEFLAGS bits set, i.e. the flags area of the word. */
#define PAGEFLAGS_MASK                ((1UL << NR_PAGEFLAGS) - 1)

#ifndef __GENERATING_BOUNDS_H

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
/*
 * Static key tested by page_fixed_fake_head() and page_count_writable();
 * while it is not enabled both of them return immediately.
 */
DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);

/*
 * Return the real head page struct iff the @page is a fake head page, otherwise
 * return the @page itself. See Documentation/mm/vmemmap_dedup.rst.
 */
static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
{
        if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
                return page;

        /*
         * Only addresses aligned with PAGE_SIZE of struct page may be fake head
         * struct page. The alignment check aims to avoid access the fields (
         * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly)
         * cold cacheline in some cases.
         */
        if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) &&
            test_bit(PG_head, &page->flags.f)) {
                /*
                 * We can safely access the field of the @page[1] with PG_head
                 * because the @page is a compound page composed with at least
                 * two contiguous pages.
                 */
                unsigned long head = READ_ONCE(page[1].compound_head);

                if (likely(head & 1))
                        return (const struct page *)(head - 1);
        }
        return page;
}

/*
 * Decide whether the refcount of @page may be modified.
 *
 * Returns true straight away while hugetlb_optimize_vmemmap_key is not
 * enabled.  Otherwise returns false when the refcount, read with acquire
 * semantics, equals @u, and finally returns true only if @page is not a
 * fake head (page_fixed_fake_head() hands back @page itself).
 */
static __always_inline bool page_count_writable(const struct page *page, int u)
{
        if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
                return true;

        /*
         * The refcount check is ordered before the fake-head check to prevent
         * the following race:
         *   CPU 1 (HVO)                     CPU 2 (speculative PFN walker)
         *
         *   page_ref_freeze()
         *   synchronize_rcu()
         *                                   rcu_read_lock()
         *                                   page_is_fake_head() is false
         *   vmemmap_remap_pte()
         *   XXX: struct page[] becomes r/o
         *
         *   page_ref_unfreeze()
         *                                   page_ref_count() is not zero
         *
         *                                   atomic_add_unless(&page->_refcount)
         *                                   XXX: try to modify r/o struct page[]
         *
         * The refcount check also prevents modification attempts to other (r/o)
         * tail pages that are not fake heads.
         */
        if (atomic_read_acquire(&page->_refcount) == u)
                return false;

        return page_fixed_fake_head(page) == page;
}
#else
/*
 * Variant used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is not set: always
 * returns @page unchanged.
 */
static inline const struct page *page_fixed_fake_head(const struct page *page)
{
        return page;
}

/*
 * Variant used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is not set: always
 * returns true; @page and @u are ignored.
 */
static inline bool page_count_writable(const struct page *page, int u)
{
        return true;
}
#endif

/*
 * Non-zero when page_fixed_fake_head() resolves @page to a different struct
 * page, i.e. @page is a fake head; zero otherwise.
 */
static __always_inline int page_is_fake_head(const struct page *page)
{
        const struct page *resolved = page_fixed_fake_head(page);

        return resolved != page;
}

/*
 * Address of the head page for @page, returned as an unsigned long.
 *
 * When bit 0 of page->compound_head is set, the head is that value with the
 * bit removed.  Otherwise the result is whatever page_fixed_fake_head()
 * yields for @page.
 */
static __always_inline unsigned long _compound_head(const struct page *page)
{
        unsigned long head = READ_ONCE(page->compound_head);

        if (likely(!(head & 1)))
                return (unsigned long)page_fixed_fake_head(page);

        return head - 1;
}

/*
 * Typed wrapper around _compound_head(): the result is cast back to the
 * type of @page, so a const argument yields a const result.
 */
#define compound_head(page)        ((typeof(page))_compound_head(page))

/**
 * page_folio - Converts from page to folio.
 * @p: The page.
 *
 * Every page is part of a folio.  This function cannot be called on a
 * NULL pointer.
 *
 * The result keeps the const-ness of @p: a const struct page pointer
 * yields a const struct folio pointer.
 *
 * Context: No reference, nor lock is required on @p.  If the caller
 * does not hold a reference, this call may race with a folio split, so
 * it should re-check the folio still contains this page after gaining
 * a reference on the folio.
 * Return: The folio which contains this page.
 */
#define page_folio(p)                (_Generic((p),                                \
        const struct page *:        (const struct folio *)_compound_head(p), \
        struct page *:                (struct folio *)_compound_head(p)))

/**
 * folio_page - Return a page from a folio.
 * @folio: The folio.
 * @n: The page number to return.
 *
 * @n is relative to the start of the folio.  This function does not
 * check that the page number lies within @folio; the caller is presumed
 * to have a reference to the page.
 *
 * Return: Pointer to the page @n entries after the folio's first page.
 */
#define folio_page(folio, n)        (&(folio)->page + (n))

/*
 * Non-zero when @page is a tail page: either bit 0 of its compound_head
 * word is set, or page_is_fake_head() reports it as a fake head.
 */
static __always_inline int PageTail(const struct page *page)
{
        if (READ_ONCE(page->compound_head) & 1)
                return 1;

        return page_is_fake_head(page) != 0;
}

/*
 * Non-zero when @page belongs to a compound page: it either has PG_head set
 * in its flags or bit 0 set in its compound_head word.
 */
static __always_inline int PageCompound(const struct page *page)
{
        if (test_bit(PG_head, &page->flags.f))
                return 1;

        return (READ_ONCE(page->compound_head) & 1) != 0;
}

/* Flags value (all bits set) that marks a struct page as poisoned. */
#define        PAGE_POISON_PATTERN        -1l
/* Non-zero when the flags word of @page equals PAGE_POISON_PATTERN. */
static inline int PagePoisoned(const struct page *page)
{
        const unsigned long flags = READ_ONCE(page->flags.f);

        return flags == PAGE_POISON_PATTERN;
}

/*
 * page_init_poison() is defined out of line when CONFIG_DEBUG_VM is set;
 * without it the function is an empty inline stub that does nothing.
 */
#ifdef CONFIG_DEBUG_VM
void page_init_poison(struct page *page, size_t size);
#else
static inline void page_init_poison(struct page *page, size_t size)
{
}
#endif

/*
 * Return a read-only pointer to the flags word of the @n'th page of @folio
 * (n == 0 is the folio's first page).
 *
 * Debug assertions: the first page must not have bit 0 of compound_head
 * set, and a non-zero @n requires PG_head on the first page.
 */
static const unsigned long *const_folio_flags(const struct folio *folio,
                unsigned n)
{
        const struct page *page = &folio->page;

        VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
        VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page);
        return &page[n].flags.f;
}

/*
 * folio_flags - writable access to a flags word of a folio.
 * @folio: The folio.
 * @n: Index of the page within @folio whose flags word is wanted.
 *
 * Same checks as const_folio_flags(): the folio's first page must not have
 * bit 0 of compound_head set, and a non-zero @n is only accepted when the
 * first page has PG_head set.  @n is not otherwise range-checked.
 *
 * Return: A pointer to the flags word of page @n of @folio.
 */
static unsigned long *folio_flags(struct folio *folio, unsigned n)
{
        struct page *page = &folio->page;

        VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
        VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page);
        return &page[n].flags.f;
}

/*
 * Page flags policies wrt compound pages
 *
 * PF_POISONED_CHECK
 *     check if this struct page is poisoned/uninitialized
 *
 * PF_ANY:
 *     the page flag is relevant for small, head and tail pages.
 *
 * PF_HEAD:
 *     for compound pages, all operations related to the page flag are
 *     applied to the head page.
 *
 * PF_NO_TAIL:
 *     modifications of the page flag must be done on small or head pages,
 *     checks can be done on tail pages too.
 *
 * PF_NO_COMPOUND:
 *     the page flag is not relevant for compound pages.
 *
 * PF_SECOND:
 *     the page flag is stored in the first tail page.
 */
/*
 * Each policy macro evaluates to the struct page pointer whose flags word
 * should be used for @page.  @enforce is passed as 0 by the test helpers and
 * as 1 by the modifying helpers generated further down in this file.
 */

/* Run the poison check on @page, then evaluate to @page. */
#define PF_POISONED_CHECK(page) ({                                        \
                VM_BUG_ON_PGFLAGS(PagePoisoned(page), page);                \
                page; })
/* Use @page itself; @enforce is ignored. */
#define PF_ANY(page, enforce)        PF_POISONED_CHECK(page)
/* Always redirect to compound_head(@page); @enforce is ignored. */
#define PF_HEAD(page, enforce)        PF_POISONED_CHECK(compound_head(page))
/*
 * With @enforce set, assert that @page is not a tail page.  The flag is
 * then looked up on compound_head(@page).
 */
#define PF_NO_TAIL(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);        \
                PF_POISONED_CHECK(compound_head(page)); })
/*
 * With @enforce set, assert that @page is not part of a compound page.
 * The flag is looked up on @page itself.
 */
#define PF_NO_COMPOUND(page, enforce) ({                                \
                VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page);        \
                PF_POISONED_CHECK(page); })
/*
 * Assert that @page is a head page (regardless of @enforce) and use the
 * page immediately following it.
 */
#define PF_SECOND(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(!PageHead(page), page);                \
                PF_POISONED_CHECK(&page[1]); })

/*
 * Which page is the flag stored in
 *
 * FOLIO_<policy> is the page index, relative to the start of the folio,
 * that the folio helpers pass to folio_flags()/const_folio_flags() for a
 * flag declared with the matching PF_<policy>.  Only PF_SECOND uses index 1.
 */
#define FOLIO_PF_ANY                0
#define FOLIO_PF_HEAD                0
#define FOLIO_PF_NO_TAIL        0
#define FOLIO_PF_NO_COMPOUND        0
#define FOLIO_PF_SECOND                1

/* Explicit page indices for flags declared directly with FOLIO_*FLAG(). */
#define FOLIO_HEAD_PAGE                0
#define FOLIO_SECOND_PAGE        1

/*
 * Macros to create function definitions for page flags
 */
/*
 * In all generators below, @name is the lower-case flag name (PG_<name>) and
 * @page is the index of the page within the folio whose flags word holds it.
 */

/* Define folio_test_<name>(): test_bit() on a const flags word. */
#define FOLIO_TEST_FLAG(name, page)                                        \
static __always_inline bool folio_test_##name(const struct folio *folio) \
{ return test_bit(PG_##name, const_folio_flags(folio, page)); }

/* Define folio_set_<name>() using set_bit(). */
#define FOLIO_SET_FLAG(name, page)                                        \
static __always_inline void folio_set_##name(struct folio *folio)        \
{ set_bit(PG_##name, folio_flags(folio, page)); }

/* Define folio_clear_<name>() using clear_bit(). */
#define FOLIO_CLEAR_FLAG(name, page)                                        \
static __always_inline void folio_clear_##name(struct folio *folio)        \
{ clear_bit(PG_##name, folio_flags(folio, page)); }

/* Define __folio_set_<name>() using __set_bit(). */
#define __FOLIO_SET_FLAG(name, page)                                        \
static __always_inline void __folio_set_##name(struct folio *folio)        \
{ __set_bit(PG_##name, folio_flags(folio, page)); }

/* Define __folio_clear_<name>() using __clear_bit(). */
#define __FOLIO_CLEAR_FLAG(name, page)                                        \
static __always_inline void __folio_clear_##name(struct folio *folio)        \
{ __clear_bit(PG_##name, folio_flags(folio, page)); }

/* Define folio_test_set_<name>() using test_and_set_bit(). */
#define FOLIO_TEST_SET_FLAG(name, page)                                        \
static __always_inline bool folio_test_set_##name(struct folio *folio)        \
{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); }

/* Define folio_test_clear_<name>() using test_and_clear_bit(). */
#define FOLIO_TEST_CLEAR_FLAG(name, page)                                \
static __always_inline bool folio_test_clear_##name(struct folio *folio) \
{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); }

/* Convenience: the test, set and clear helpers for a folio-only flag. */
#define FOLIO_FLAG(name, page)                                                \
FOLIO_TEST_FLAG(name, page)                                                \
FOLIO_SET_FLAG(name, page)                                                \
FOLIO_CLEAR_FLAG(name, page)

/*
 * Each generator below defines a struct page helper and, through the
 * matching FOLIO_*_FLAG() generator, the equivalent folio helper.
 * @uname is the CamelCase suffix of the page helper, @lname the lower-case
 * flag name (PG_<lname>), and @policy one of the PF_* policies.  The page
 * helper applies @policy to its argument; the folio helper uses the page
 * index FOLIO_<policy>.
 */

/* Page<uname>() / folio_test_<lname>(): test only, policy not enforced. */
#define TESTPAGEFLAG(uname, lname, policy)                                \
FOLIO_TEST_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline int Page##uname(const struct page *page)                \
{ return test_bit(PG_##lname, &policy(page, 0)->flags.f); }

/* SetPage<uname>() / folio_set_<lname>(): set_bit(), policy enforced. */
#define SETPAGEFLAG(uname, lname, policy)                                \
FOLIO_SET_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void SetPage##uname(struct page *page)                \
{ set_bit(PG_##lname, &policy(page, 1)->flags.f); }

/* ClearPage<uname>() / folio_clear_<lname>(): clear_bit(), policy enforced. */
#define CLEARPAGEFLAG(uname, lname, policy)                                \
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void ClearPage##uname(struct page *page)                \
{ clear_bit(PG_##lname, &policy(page, 1)->flags.f); }

/* __SetPage<uname>() / __folio_set_<lname>(): __set_bit(), policy enforced. */
#define __SETPAGEFLAG(uname, lname, policy)                                \
__FOLIO_SET_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{ __set_bit(PG_##lname, &policy(page, 1)->flags.f); }

/* __ClearPage<uname>() / __folio_clear_<lname>(): __clear_bit(). */
#define __CLEARPAGEFLAG(uname, lname, policy)                                \
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy)                                \
static __always_inline void __ClearPage##uname(struct page *page)        \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags.f); }

/* TestSetPage<uname>() / folio_test_set_<lname>(): test_and_set_bit(). */
#define TESTSETFLAG(uname, lname, policy)                                \
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy)                                \
static __always_inline int TestSetPage##uname(struct page *page)        \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); }

/* TestClearPage<uname>() / folio_test_clear_<lname>(): test_and_clear_bit(). */
#define TESTCLEARFLAG(uname, lname, policy)                                \
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy)                                \
static __always_inline int TestClearPage##uname(struct page *page)        \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); }

/* Test + set + clear helpers using set_bit()/clear_bit(). */
#define PAGEFLAG(uname, lname, policy)                                        \
        TESTPAGEFLAG(uname, lname, policy)                                \
        SETPAGEFLAG(uname, lname, policy)                                \
        CLEARPAGEFLAG(uname, lname, policy)

/* Test + set + clear helpers using __set_bit()/__clear_bit(). */
#define __PAGEFLAG(uname, lname, policy)                                \
        TESTPAGEFLAG(uname, lname, policy)                                \
        __SETPAGEFLAG(uname, lname, policy)                                \
        __CLEARPAGEFLAG(uname, lname, policy)

/* Both test-and-set and test-and-clear helpers. */
#define TESTSCFLAG(uname, lname, policy)                                \
        TESTSETFLAG(uname, lname, policy)                                \
        TESTCLEARFLAG(uname, lname, policy)

/*
 * Stub generators for flags that are compiled out.  The test and
 * test-and-modify stubs always return false/0; the set and clear stubs have
 * empty bodies.  None of them touches its argument.
 */

/* folio_*() stubs. */
#define FOLIO_TEST_FLAG_FALSE(name)                                        \
static inline bool folio_test_##name(const struct folio *folio)                \
{ return false; }
#define FOLIO_SET_FLAG_NOOP(name)                                        \
static inline void folio_set_##name(struct folio *folio) { }
#define FOLIO_CLEAR_FLAG_NOOP(name)                                        \
static inline void folio_clear_##name(struct folio *folio) { }
#define __FOLIO_SET_FLAG_NOOP(name)                                        \
static inline void __folio_set_##name(struct folio *folio) { }
#define __FOLIO_CLEAR_FLAG_NOOP(name)                                        \
static inline void __folio_clear_##name(struct folio *folio) { }
#define FOLIO_TEST_SET_FLAG_FALSE(name)                                        \
static inline bool folio_test_set_##name(struct folio *folio)                \
{ return false; }
#define FOLIO_TEST_CLEAR_FLAG_FALSE(name)                                \
static inline bool folio_test_clear_##name(struct folio *folio)                \
{ return false; }

/* Stub counterpart of FOLIO_FLAG(): test, set and clear. */
#define FOLIO_FLAG_FALSE(name)                                                \
FOLIO_TEST_FLAG_FALSE(name)                                                \
FOLIO_SET_FLAG_NOOP(name)                                                \
FOLIO_CLEAR_FLAG_NOOP(name)

/* Page*() stubs; each also emits the matching folio_*() stub. */
#define TESTPAGEFLAG_FALSE(uname, lname)                                \
FOLIO_TEST_FLAG_FALSE(lname)                                                \
static inline int Page##uname(const struct page *page) { return 0; }

#define SETPAGEFLAG_NOOP(uname, lname)                                        \
FOLIO_SET_FLAG_NOOP(lname)                                                \
static inline void SetPage##uname(struct page *page) {  }

#define CLEARPAGEFLAG_NOOP(uname, lname)                                \
FOLIO_CLEAR_FLAG_NOOP(lname)                                                \
static inline void ClearPage##uname(struct page *page) {  }

#define __CLEARPAGEFLAG_NOOP(uname, lname)                                \
__FOLIO_CLEAR_FLAG_NOOP(lname)                                                \
static inline void __ClearPage##uname(struct page *page) {  }

#define TESTSETFLAG_FALSE(uname, lname)                                        \
FOLIO_TEST_SET_FLAG_FALSE(lname)                                        \
static inline int TestSetPage##uname(struct page *page) { return 0; }

#define TESTCLEARFLAG_FALSE(uname, lname)                                \
FOLIO_TEST_CLEAR_FLAG_FALSE(lname)                                        \
static inline int TestClearPage##uname(struct page *page) { return 0; }

/* Stub counterpart of PAGEFLAG(): test, set and clear. */
#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname)        \
        SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname)

/* Stub counterpart of TESTSCFLAG(): test-and-set and test-and-clear. */
#define TESTSCFLAG_FALSE(uname, lname)                                        \
        TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)

/*
 * Core flag accessors.  Flags declared with the FOLIO_*FLAG() generators
 * only get folio_*() helpers; flags declared with the *PAGEFLAG()/TEST*FLAG()
 * generators get both Page*() and folio_*() helpers.  Indented lines add
 * extra operations for the flag declared on the line(s) above them.
 */
/* PG_locked only gets a test plus the __set_bit()/__clear_bit() variants. */
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE)
        __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE)
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
        __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
        TESTCLEARFLAG(LRU, lru, PF_HEAD)
FOLIO_FLAG(active, FOLIO_HEAD_PAGE)
        __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE)
PAGEFLAG(Workingset, workingset, PF_HEAD)
        TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND)           /* Used by some filesystems */

/*
 * Xen
 *
 * All of these use PF_NO_COMPOUND, i.e. the modifying helpers assert that
 * the page is not part of a compound page.
 *
 * NOTE(review): the SavePinned and Foreign invocations end in ';', which
 * leaves an empty declaration after the generated functions; the other
 * invocations in this group omit it.
 */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
        TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
        TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)

/* PG_reserved gets both the set_bit()/clear_bit() and the __ variants. */
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
/* Folio-only flag, stored in the head page's flags word. */
FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE)
        __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE)
        __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE)

/*
 * Private page markings that may be used by the filesystem that owns the page
 * for its own purposes.
 * - PG_private and PG_private_2 cause release_folio() and co to be invoked
 *
 * PG_private has Page*() helpers under PF_ANY; PG_private_2 only has
 * folio_*() helpers operating on the head page.
 */
PAGEFLAG(Private, private, PF_ANY)
FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE)

/* owner_2 can be set on tail pages for anon memory */
FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE)

/*
 * Only the test, test-and-set and test-and-clear operations exist for
 * PG_writeback.  The unconditional operators are risky: they bypass page
 * accounting.
 */
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
        TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE)

/* PG_readahead is only used for reads; PG_reclaim is only for writes */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
        TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE)

/* Folio-only flag with extra test-and-clear and __set_bit() helpers. */
FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE)
        __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE)

/*
 * Highmem tests.  With CONFIG_HIGHMEM these are macros built on the zone
 * number of the page/folio; without it they are constant-false stubs (with
 * no-op set/clear helpers) generated by PAGEFLAG_FALSE().
 */
#ifdef CONFIG_HIGHMEM
/*
 * Must use a macro here due to header dependency issues. page_zone() is not
 * available at this point.
 */
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
#define folio_test_highmem(__f)        is_highmem_idx(folio_zonenum(__f))
#else
PAGEFLAG_FALSE(HighMem, highmem)
#endif
/* Same test, starting from whatever phys_to_page() accepts as @__p. */
#define PhysHighMem(__p) (PageHighMem(phys_to_page(__p)))

/*
 * Does kmap_local_folio() only allow access to one page of the folio?
 *
 * Always true with CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP; otherwise the answer
 * is whatever folio_test_highmem() reports for the folio.
 */
#ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
#define folio_test_partial_kmap(f)        true
#else
#define folio_test_partial_kmap(f)        folio_test_highmem(f)
#endif

#ifdef CONFIG_SWAP
/*
 * A folio only counts as being in the swap cache when it is swap-backed and
 * PG_swapcache is set in its head page's flags word.  The swapcache bit is
 * not looked at for folios that are not swap-backed.
 */
static __always_inline bool folio_test_swapcache(const struct folio *folio)
{
        if (!folio_test_swapbacked(folio))
                return false;

        return test_bit(PG_swapcache, const_folio_flags(folio, 0));
}

FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE)
FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE)
#else
FOLIO_FLAG_FALSE(swapcache)
#endif

/* Folio-only flag with extra __clear_bit() and test-and-clear helpers. */
FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE)
        __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)

/*
 * The mlocked helpers are real only with CONFIG_MMU; otherwise the same set
 * of helper names is provided as constant-false / no-op stubs.
 */
#ifdef CONFIG_MMU
FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE)
        __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE)
        FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE)
#else
FOLIO_FLAG_FALSE(mlocked)
        __FOLIO_CLEAR_FLAG_NOOP(mlocked)
        FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked)
        FOLIO_TEST_SET_FLAG_FALSE(mlocked)
#endif

/*
 * Hardware-poison flag.  __PG_HWPOISON is the flag's bit mask, or 0 when
 * CONFIG_MEMORY_FAILURE is disabled.
 *
 * NOTE(review): the disabled branch provides test/set/clear stubs only; no
 * test-and-set / test-and-clear stubs are generated there.
 */
#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison, hwpoison)
#define __PG_HWPOISON 0
#endif

/*
 * Young/idle tracking helpers.  They are generated here only when both
 * CONFIG_PAGE_IDLE_FLAG and CONFIG_64BIT are set; with CONFIG_PAGE_IDLE_FLAG
 * but without CONFIG_64BIT nothing is defined in this header.  Without
 * CONFIG_PAGE_IDLE_FLAG all helpers are constant-false / no-op stubs.
 */
#ifdef CONFIG_PAGE_IDLE_FLAG
#ifdef CONFIG_64BIT
FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_FLAG(idle, FOLIO_HEAD_PAGE)
#endif
/* See page_idle.h for !64BIT workaround */
#else /* !CONFIG_PAGE_IDLE_FLAG */
FOLIO_FLAG_FALSE(young)
FOLIO_TEST_CLEAR_FLAG_FALSE(young)
FOLIO_FLAG_FALSE(idle)
#endif

/*
 * PageReported() is used to track reported free pages within the Buddy
 * allocator. We can use the non-atomic version of the test and set
 * operations as both should be shielded with the zone lock to prevent
 * any possible races on the setting or clearing of the bit.
 *
 * Accordingly only the __SetPageReported()/__ClearPageReported() modifiers
 * are generated, next to the PageReported() test.
 */
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)

/* Real helpers only with CONFIG_MEMORY_HOTPLUG; stubs otherwise. */
#ifdef CONFIG_MEMORY_HOTPLUG
PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY)
#else
PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
#endif

/*
 * On an anonymous folio mapped into a user virtual memory area,
 * folio->mapping points to its anon_vma, not to a struct address_space;
 * with the FOLIO_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *
 * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the FOLIO_MAPPING_ANON_KSM bit may be set along with the FOLIO_MAPPING_ANON
 * bit; and then folio->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged folio.  See ksm.h.
 *
 * Please note that, confusingly, "folio_mapping" refers to the inode
 * address_space which maps the folio from disk; whereas "folio_mapped"
 * refers to user virtual address space into which the folio is mapped.
 *
 * For slab pages, since slab reuses the bits in struct page to store its
 * internal states, the folio->mapping does not exist as such, nor do
 * these flags below.  So in order to avoid testing non-existent bits,
 * please make sure that folio_test_slab(folio) actually evaluates to
 * false before calling the following functions (e.g., folio_test_anon).
 * See mm/slab.h.
 */
/* Tag bits kept in the low two bits of folio->mapping. */
#define FOLIO_MAPPING_ANON        0x1
#define FOLIO_MAPPING_ANON_KSM        0x2
/* Value of the tag bits for a KSM folio: both bits set. */
#define FOLIO_MAPPING_KSM        (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM)
/* Mask covering all tag bits (numerically equal to FOLIO_MAPPING_KSM). */
#define FOLIO_MAPPING_FLAGS        (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM)

/*
 * True when the FOLIO_MAPPING_ANON tag bit is set in folio->mapping.  The
 * FOLIO_MAPPING_ANON_KSM bit is not examined, so KSM folios also match.
 */
static __always_inline bool folio_test_anon(const struct folio *folio)
{
        const unsigned long mapping = (unsigned long)folio->mapping;

        return (mapping & FOLIO_MAPPING_ANON) != 0;
}

/*
 * True when the folio containing @page has the FOLIO_MAPPING_ANON tag bit
 * set and the FOLIO_MAPPING_ANON_KSM tag bit clear in its mapping word.
 */
static __always_inline bool PageAnonNotKsm(const struct page *page)
{
        const unsigned long mapping = (unsigned long)page_folio(page)->mapping;
        const unsigned long tag = mapping & FOLIO_MAPPING_FLAGS;

        return tag == FOLIO_MAPPING_ANON;
}

/* Page flavour of folio_test_anon(): tests the folio containing @page. */
static __always_inline bool PageAnon(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_anon(folio);
}
#ifdef CONFIG_KSM
/*
 * A KSM page is one of those write-protected "shared pages" or "merged pages"
 * which KSM maps into multiple mms, wherever identical anonymous page content
 * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
 * anon_vma, but to that page's node of the stable tree.
 *
 * The test is true only when both tag bits of folio->mapping are set.
 */
static __always_inline bool folio_test_ksm(const struct folio *folio)
{
        const unsigned long mapping = (unsigned long)folio->mapping;
        const unsigned long tag = mapping & FOLIO_MAPPING_FLAGS;

        return tag == FOLIO_MAPPING_KSM;
}
#else
FOLIO_TEST_FLAG_FALSE(ksm)
#endif

/*
 * Declaration only; the definition lives outside this header.
 * NOTE(review): presumably returns a stable encoding of @page's flags as a
 * u64 -- confirm against the definition.
 */
u64 stable_page_flags(const struct page *page);

/**
 * folio_xor_flags_has_waiters - Change some folio flags.
 * @folio: The folio.
 * @mask: Bits set in this word will be changed.
 *
 * This must only be used for flags which are changed with the folio
 * lock held.  For example, it is unsafe to use for PG_dirty as that
 * can be set without the folio lock held.  It can also only be used
 * on flags which are in the range 0-6 as some of the implementations
 * only affect those bits.
 *
 * Return: Whether there are tasks waiting on the folio.
 */
static inline bool folio_xor_flags_has_waiters(struct folio *folio,
                unsigned long mask)
{
        /* All affected bits live in the head page's flags word. */
        unsigned long *word = folio_flags(folio, 0);

        return xor_unlock_is_negative_byte(mask, word);
}

/**
 * folio_test_uptodate - Is this folio up to date?
 * @folio: The folio.
 *
 * The uptodate flag is set on a folio when every byte in the folio is
 * at least as new as the corresponding bytes on storage.  Anonymous
 * and CoW folios are always uptodate.  If the folio is not uptodate,
 * some of the bytes in it may be; see the is_partially_uptodate()
 * address_space operation.
 */
static inline bool folio_test_uptodate(const struct folio *folio)
{
        /*
         * A folio that is not uptodate needs no barrier: the caller will
         * not be reading any data out of it.
         */
        if (!test_bit(PG_uptodate, const_folio_flags(folio, 0)))
                return false;

        /*
         * The data read out of the folio must be loaded _after_ the load
         * of folio->flags that observed the uptodate bit.
         *
         * See folio_mark_uptodate() for the other side of the story.
         */
        smp_rmb();
        return true;
}

/* Page flavour of folio_test_uptodate(): tests the folio containing @page. */
static inline bool PageUptodate(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_uptodate(folio);
}

/*
 * Variant of folio_mark_uptodate() that sets PG_uptodate with __set_bit()
 * instead of set_bit().  The write barrier is still issued first, so stores
 * made before this call are ordered before the flag update.
 */
static __always_inline void __folio_mark_uptodate(struct folio *folio)
{
        smp_wmb();
        __set_bit(PG_uptodate, folio_flags(folio, 0));
}

/*
 * Set PG_uptodate in the head page's flags word of @folio with set_bit(),
 * after a write barrier.  Pairs with the smp_rmb() in folio_test_uptodate().
 */
static __always_inline void folio_mark_uptodate(struct folio *folio)
{
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
         * so that all previous stores issued in order to bring the folio
         * uptodate are actually visible before folio_test_uptodate becomes true.
         */
        smp_wmb();
        set_bit(PG_uptodate, folio_flags(folio, 0));
}

/*
 * Page wrapper around __folio_mark_uptodate().  @page is cast directly to a
 * folio, with no page_folio() lookup.
 * NOTE(review): presumably callers must not pass a tail page -- confirm.
 */
static __always_inline void __SetPageUptodate(struct page *page)
{
        __folio_mark_uptodate((struct folio *)page);
}

/*
 * Page wrapper around folio_mark_uptodate().  @page is cast directly to a
 * folio, with no page_folio() lookup.
 * NOTE(review): presumably callers must not pass a tail page -- confirm.
 */
static __always_inline void SetPageUptodate(struct page *page)
{
        folio_mark_uptodate((struct folio *)page);
}

/*
 * Only the clear helpers for PG_uptodate are macro-generated; the test and
 * set helpers are the hand-written functions in this header.
 */
CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)

/* Writeback entry points; declarations only, defined outside this header. */
void __folio_start_writeback(struct folio *folio, bool keep_write);
void set_page_writeback(struct page *page);

/* Start writeback with keep_write == false. */
#define folio_start_writeback(folio)                        \
        __folio_start_writeback(folio, false)

/* True when PG_head is set in the flags word of the folio's first page. */
static __always_inline bool folio_test_head(const struct folio *folio)
{
        const unsigned long *word = const_folio_flags(folio, FOLIO_PF_ANY);

        return test_bit(PG_head, word);
}

/*
 * Report whether @page is a genuine head page: PG_head must be set in its
 * own flags word and page_is_fake_head() must not claim it.  The poison
 * check runs first.  Returns 1 or 0.
 */
static __always_inline int PageHead(const struct page *page)
{
        PF_POISONED_CHECK(page);

        if (!test_bit(PG_head, &page->flags.f))
                return 0;

        return !page_is_fake_head(page);
}

/*
 * Modifiers for PG_head: __SetPageHead(), __ClearPageHead() and
 * ClearPageHead(), plus their folio counterparts.  The tests are the
 * hand-written PageHead()/folio_test_head() above.
 */
__SETPAGEFLAG(Head, head, PF_ANY)
__CLEARPAGEFLAG(Head, head, PF_ANY)
CLEARPAGEFLAG(Head, head, PF_ANY)

/**
 * folio_test_large() - Does this folio contain more than one page?
 * @folio: The folio to test.
 *
 * This is an alias for folio_test_head(): the answer is derived solely
 * from the PG_head bit of the folio's first page.
 *
 * Return: True if the folio is larger than one page.
 */
static inline bool folio_test_large(const struct folio *folio)
{
        return folio_test_head(folio);
}

/*
 * Record @head as the compound head of @page.  The stored word is the
 * address of @head plus one, i.e. with bit 0 set -- the bit PageTail() and
 * PageCompound() test.
 */
static __always_inline void set_compound_head(struct page *page, struct page *head)
{
        const unsigned long tagged = (unsigned long)head + 1;

        WRITE_ONCE(page->compound_head, tagged);
}

/*
 * Undo set_compound_head(): store 0 into @page's compound_head word, which
 * also clears the bit-0 tail marker.
 */
static __always_inline void clear_compound_head(struct page *page)
{
        WRITE_ONCE(page->compound_head, 0);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear PG_head on @page.  BUGs if @page is not currently a head page.
 * Only available with CONFIG_TRANSPARENT_HUGEPAGE.
 */
static inline void ClearPageCompound(struct page *page)
{
        BUG_ON(!PageHead(page));
        ClearPageHead(page);
}
/* Both flags are kept in the flags word of the folio's second page. */
FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
#else
/* Without THP the two flags are constant-false with no-op set/clear. */
FOLIO_FLAG_FALSE(large_rmappable)
FOLIO_FLAG_FALSE(partially_mapped)
#endif

/* Bit mask with only the PG_head bit set. */
#define PG_head_mask (1UL << PG_head)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PageTransCompound returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 *
 * It is a plain alias for PageCompound().
 */
static inline int PageTransCompound(const struct page *page)
{
        return PageCompound(page);
}
#else
/* Without THP: PageTransCompound() and folio_test_transcompound() are 0. */
TESTPAGEFLAG_FALSE(TransCompound, transcompound)
#endif

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the
 * compound page.
 *
 * This flag is set by hwpoison handler.  Cleared by THP split or free page.
 *
 * Only folio helpers are generated; the bit is kept in the flags word of
 * the folio's second page.
 */
FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE)
#else
/* Needs both options; otherwise constant-false with no-op set/clear. */
FOLIO_FLAG_FALSE(has_hwpoisoned)
#endif

/*
 * For pages that do not use mapcount, page_type may be used.
 * The low 24 bits of pagetype may be used for your own purposes, as long
 * as you are careful to not affect the top 8 bits.  The low bits of
 * pagetype will be overwritten when you clear the page_type from the page.
 */
/*
 * Page type identifiers.  A type is stored in the top 8 bits of the 32-bit
 * page_type word, i.e. as PGTY_xxx << 24.
 */
enum pagetype {
        /* 0x00-0x7f are positive numbers, ie mapcount */
        /* Reserve 0x80-0xef for mapcount overflow. */
        PGTY_buddy                = 0xf0,
        PGTY_offline                = 0xf1,
        PGTY_table                = 0xf2,
        PGTY_guard                = 0xf3,
        PGTY_hugetlb                = 0xf4,
        PGTY_slab                = 0xf5,
        PGTY_zsmalloc                = 0xf6,
        PGTY_unaccepted                = 0xf7,
        PGTY_large_kmalloc        = 0xf8,

        PGTY_mapcount_underflow = 0xff
};

/*
 * page_type_has_type - does a raw page_type word lie in the type range?
 * @page_type: The page_type word, viewed as a signed int.
 *
 * Viewed as a signed int, a word whose top byte is 0x80-0xfe is negative
 * and strictly below PGTY_mapcount_underflow << 24.  Words with a top byte
 * of 0x00-0x7f (non-negative) or 0xff (including UINT_MAX, the "no type"
 * value) are not.
 *
 * The limit is built with an unsigned shift, like the page_type setters in
 * this file do, because shifting the signed constant 0xff left by 24 would
 * move a set bit into the sign bit.  The conversion back to int yields the
 * same negative limit as before.
 *
 * Return: True if the top byte of @page_type is in the range 0x80-0xfe.
 */
static inline bool page_type_has_type(int page_type)
{
        return page_type < (int)((unsigned int)PGTY_mapcount_underflow << 24);
}

/*
 * This takes a mapcount which is one more than page->_mapcount, so the
 * raw word is recovered by subtracting one before testing it.
 */
static inline bool page_mapcount_is_type(unsigned int mapcount)
{
        const unsigned int raw = mapcount - 1;

        return page_type_has_type(raw);
}

/*
 * Test whether @page currently has a page type set.  The page_type read is
 * wrapped in data_race(), i.e. it is done without any locking.
 */
static inline bool page_has_type(const struct page *page)
{
        return page_type_has_type(data_race(page->page_type));
}

/*
 * FOLIO_TYPE_OPS - generate folio page-type helpers.
 * @lname: Suffix of the PGTY_<lname> enumerator.
 * @fname: Suffix used in the generated function names.
 *
 * Generates:
 *  folio_test_<fname>()    - true when the top byte of the first page's
 *                            page_type equals PGTY_<lname>.
 *  __folio_set_<fname>()   - no-op if the type is already set; otherwise
 *                            asserts page_type is UINT_MAX (no type) and
 *                            stores PGTY_<lname> << 24.
 *  __folio_clear_<fname>() - no-op if page_type is already UINT_MAX;
 *                            otherwise asserts the type matches and resets
 *                            page_type to UINT_MAX.
 *
 * Setting the type overwrites the low 24 bits with zero; clearing it
 * overwrites them with ones.
 */
#define FOLIO_TYPE_OPS(lname, fname)                                        \
static __always_inline bool folio_test_##fname(const struct folio *folio) \
{                                                                        \
        return data_race(folio->page.page_type >> 24) == PGTY_##lname;        \
}                                                                        \
static __always_inline void __folio_set_##fname(struct folio *folio)        \
{                                                                        \
        if (folio_test_##fname(folio))                                        \
                return;                                                        \
        VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX,        \
                        folio);                                                \
        folio->page.page_type = (unsigned int)PGTY_##lname << 24;        \
}                                                                        \
static __always_inline void __folio_clear_##fname(struct folio *folio)        \
{                                                                        \
        if (folio->page.page_type == UINT_MAX)                                \
                return;                                                        \
        VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio);                \
        folio->page.page_type = UINT_MAX;                                \
}

/*
 * PAGE_TYPE_OPS - generate page and folio page-type helpers.
 * @uname: CamelCase suffix of the Page*() helpers.
 * @lname: Suffix of the PGTY_<lname> enumerator.
 * @fname: Suffix of the folio helpers (passed on to FOLIO_TYPE_OPS()).
 *
 * In addition to the folio helpers, generates Page<uname>(),
 * __SetPage<uname>() and __ClearPage<uname>().  They follow the same rules
 * as the folio versions but operate on @page->page_type directly, with no
 * redirection to a head page.
 */
#define PAGE_TYPE_OPS(uname, lname, fname)                                \
FOLIO_TYPE_OPS(lname, fname)                                                \
static __always_inline int Page##uname(const struct page *page)                \
{                                                                        \
        return data_race(page->page_type >> 24) == PGTY_##lname;        \
}                                                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{                                                                        \
        if (Page##uname(page))                                                \
                return;                                                        \
        VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page);        \
        page->page_type = (unsigned int)PGTY_##lname << 24;                \
}                                                                        \
static __always_inline void __ClearPage##uname(struct page *page)        \
{                                                                        \
        if (page->page_type == UINT_MAX)                                \
                return;                                                        \
        VM_BUG_ON_PAGE(!Page##uname(page), page);                        \
        page->page_type = UINT_MAX;                                        \
}

/*
 * PageBuddy() indicates that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 *
 * Also provides __SetPageBuddy(), __ClearPageBuddy() and the
 * folio_test_buddy()/__folio_set_buddy()/__folio_clear_buddy() helpers.
 */
PAGE_TYPE_OPS(Buddy, buddy, buddy)

/*
 * PageOffline() indicates that the page is logically offline although the
 * containing section is online. (e.g. inflated in a balloon driver or
 * not onlined when onlining the section).
 * The content of these pages is effectively stale. Such pages should not
 * be touched (read/write/dump/save) except by their owner.
 *
 * When a memory block gets onlined, all pages are initialized with a
 * refcount of 1 and PageOffline(). generic_online_page() will
 * take care of clearing PageOffline().
 *
 * If a driver wants to allow offlining of unmovable PageOffline() pages without
 * putting them back to the buddy, it can do so via the memory notifier by
 * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the
 * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline()
 * pages (now with a reference count of zero) are treated like free (unmanaged)
 * pages, allowing the containing memory block to get offlined. A driver that
 * relies on this feature is aware that re-onlining the memory block will
 * require not giving them to the buddy via generic_online_page().
 *
 * Memory offlining code will not adjust the managed page count for any
 * PageOffline() pages, treating them like they were never exposed to the
 * buddy using generic_online_page().
 *
 * There are drivers that mark a page PageOffline() and expect there won't be
 * any further access to page content. PFN walkers that read content of random
 * pages should check PageOffline() and synchronize with such drivers using
 * page_offline_freeze()/page_offline_thaw().
 */
/* Generates PageOffline(), __SetPageOffline(), __ClearPageOffline() etc. */
PAGE_TYPE_OPS(Offline, offline, offline)

/*
 * Synchronization helpers around PageOffline() handling; declarations only,
 * defined outside this header.
 */
extern void page_offline_freeze(void);
extern void page_offline_thaw(void);
extern void page_offline_begin(void);
extern void page_offline_end(void);

/*
 * Marks pages in use as page tables.
 *
 * Note the folio helpers use the "pgtable" suffix, e.g.
 * folio_test_pgtable(), while the page helpers are PageTable() etc.
 */
PAGE_TYPE_OPS(Table, table, pgtable)

/*
 * Marks guardpages used with debug_pagealloc.
 */
PAGE_TYPE_OPS(Guard, guard, guard)

/* Folio helpers only; PageSlab() is written out by hand in this header. */
FOLIO_TYPE_OPS(slab, slab)

/**
 * PageSlab - Determine if the page belongs to the slab allocator
 * @page: The page to test.
 *
 * Context: Any context.
 * Return: True for slab pages, false for any other kind of page.
 */
static inline bool PageSlab(const struct page *page)
{
        /* The slab type is recorded on the folio, so look that up first. */
        const struct folio *folio = page_folio(page);

        return folio_test_slab(folio);
}

/*
 * hugetlb page type.  With CONFIG_HUGETLB_PAGE the full set of folio type
 * helpers is generated; without it only a constant-false
 * folio_test_hugetlb() exists.
 */
#ifdef CONFIG_HUGETLB_PAGE
FOLIO_TYPE_OPS(hugetlb, hugetlb)
#else
FOLIO_TEST_FLAG_FALSE(hugetlb)
#endif

/* Page and folio type helpers for the zsmalloc page type. */
PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)

/*
 * Mark pages that have to be accepted before being touched for the first
 * time.
 *
 * Serialized with zone lock.
 */
PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
/* Folio helpers only for the large_kmalloc page type. */
FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc)

/**
 * PageHuge - Determine if the page belongs to hugetlbfs
 * @page: The page to test.
 *
 * Context: Any context.
 * Return: True for hugetlbfs pages, false for anon pages or pages
 * belonging to other filesystems.
 */
static inline bool PageHuge(const struct page *page)
{
        /* The hugetlb type is recorded on the folio containing @page. */
        const struct folio *folio = page_folio(page);

        return folio_test_hugetlb(folio);
}

/*
 * Check if a page is currently marked HWPoisoned. Note that this check is
 * best effort only and inherently racy: there is no way to synchronize with
 * failing hardware.
 */
static inline bool is_page_hwpoison(const struct page *page)
{
        const struct folio *folio;

        /* The page itself is marked: nothing more to look at. */
        if (PageHWPoison(page))
                return true;

        /* Otherwise only a hugetlb folio's first page can still decide. */
        folio = page_folio(page);
        if (!folio_test_hugetlb(folio))
                return false;

        return PageHWPoison(&folio->page);
}

/*
 * folio_contain_hwpoisoned_page - does @folio contain a hwpoisoned page?
 * @folio: The folio to test.
 *
 * True when the folio itself is marked hwpoisoned, or when it is a large
 * folio carrying the has_hwpoisoned flag.  The has_hwpoisoned flag is only
 * consulted for large folios.
 *
 * The folio is only read, so it is taken as a const pointer; callers
 * holding a non-const folio are unaffected.
 */
static inline bool folio_contain_hwpoisoned_page(const struct folio *folio)
{
        return folio_test_hwpoison(folio) ||
            (folio_test_large(folio) && folio_test_has_hwpoisoned(folio));
}

/*
 * Declaration only; the definition lives outside this header.
 * NOTE(review): presumably reports whether @page is currently a free page
 * held by the buddy allocator -- confirm against the definition.
 */
bool is_free_buddy_page(const struct page *page);

/*
 * NOTE(review): every generator invocation in this #ifdef block ends in
 * ';', which leaves an empty declaration after the generated functions;
 * most other invocations in this file omit it.
 */
#ifdef CONFIG_MIGRATION
/*
 * This page is migratable through movable_ops (for selected typed pages
 * only).
 *
 * Page migration of such pages might fail, for example, if the page is
 * already isolated by somebody else, or if the page is about to get freed.
 *
 * While a subsystem might set selected typed pages that support page migration
 * as being movable through movable_ops, it must never clear this flag.
 *
 * This flag is only cleared when the page is freed back to the buddy.
 *
 * Only selected page types support this flag (see page_movable_ops()) and
 * the flag might be used in other context for other pages. Always use
 * page_has_movable_ops() instead.
 *
 * Only test and set helpers are generated; there is no clear helper.
 */
TESTPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL);
SETPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL);
/*
 * A movable_ops page has this flag set while it is isolated for migration.
 * This flag primarily protects against concurrent migration attempts.
 *
 * Once migration ended (success or failure), the flag is cleared. The
 * flag is managed by the migration core.
 */
PAGEFLAG(MovableOpsIsolated, movable_ops_isolated, PF_NO_TAIL);
#else /* !CONFIG_MIGRATION */
/* Without migration support all of the above are constant-false / no-op. */
TESTPAGEFLAG_FALSE(MovableOps, movable_ops);
SETPAGEFLAG_NOOP(MovableOps, movable_ops);
PAGEFLAG_FALSE(MovableOpsIsolated, movable_ops_isolated);
#endif /* CONFIG_MIGRATION */

/**
 * page_has_movable_ops - test for a movable_ops page
 * @page: The page to test.
 *
 * Test whether this is a movable_ops page. Such pages will stay that
 * way until freed.
 *
 * Returns true if this is a movable_ops page, otherwise false.
 */
static inline bool page_has_movable_ops(const struct page *page)
{
        /* The flag alone is not enough; the page type must match as well. */
        if (!PageMovableOps(page))
                return false;

        return PageOffline(page) || PageZsmalloc(page);
}

/*
 * Test the PG_anon_exclusive bit for @page.
 *
 * Asserts via VM_BUG_ON_PGFLAGS() that @page is PageAnon(). For a PageHuge()
 * page the bit is read from compound_head(@page); otherwise it is read from
 * @page itself. Returns the result of test_bit().
 */
static __always_inline int PageAnonExclusive(const struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
        /*
         * HugeTLB stores this information on the head page; THP keeps it per
         * page
         */
        if (PageHuge(page))
                page = compound_head(page);
        return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}

/*
 * Set the PG_anon_exclusive bit on @page with set_bit().
 *
 * Asserts via VM_BUG_ON_PGFLAGS() that @page is PageAnonNotKsm() and that a
 * PageHuge() page is passed in as its head page.
 */
static __always_inline void SetPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}

/*
 * Clear the PG_anon_exclusive bit on @page with clear_bit().
 *
 * Asserts via VM_BUG_ON_PGFLAGS() that @page is PageAnonNotKsm() and that a
 * PageHuge() page is passed in as its head page.
 */
static __always_inline void ClearPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}

/*
 * Clear the PG_anon_exclusive bit on @page with __clear_bit().
 *
 * Unlike ClearPageAnonExclusive(), the first assertion only requires
 * PageAnon() (not PageAnonNotKsm()). A PageHuge() page must still be passed
 * in as its head page.
 * NOTE(review): __clear_bit() is presumably the non-atomic variant, so the
 * caller would need to rule out concurrent flag updates -- confirm.
 */
static __always_inline void __ClearPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}

/*
 * Bit mask for PG_mlocked. Without CONFIG_MMU it is defined as 0, so it
 * contributes nothing to any mask it is OR-ed into.
 */
#ifdef CONFIG_MMU
#define __PG_MLOCKED                (1UL << PG_mlocked)
#else
#define __PG_MLOCKED                0
#endif

/*
 * Flags checked when a page is freed.  Pages being freed should not have
 * any of these flags set.  If they do, there is a problem.
 *
 * The mask also covers __PG_MLOCKED (which is 0 without CONFIG_MMU) and
 * LRU_GEN_MASK.
 */
#define PAGE_FLAGS_CHECK_AT_FREE                                \
        (1UL << PG_lru                | 1UL << PG_locked        |        \
         1UL << PG_private        | 1UL << PG_private_2        |        \
         1UL << PG_writeback        | 1UL << PG_reserved        |        \
         1UL << PG_active         |                                \
         1UL << PG_unevictable        | __PG_MLOCKED | LRU_GEN_MASK)

/*
 * Flags checked when a page is prepped for return by the page allocator.
 * Pages being prepped should not have these flags set.  If they are set,
 * there has been a kernel bug or struct page corruption.
 *
 * __PG_HWPOISON is the exception: it is masked out of the check because it
 * must be kept beyond the page's alloc-free cycle to prevent the page from
 * being reused.
 */
#define PAGE_FLAGS_CHECK_AT_PREP        \
        ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)

/*
 * Flags stored in the second page of a compound page.  They may overlap
 * the CHECK_AT_FREE flags above, so they need to be cleared.
 *
 * The low eight bits (0xff) hold the order; the remaining bits are
 * PG_has_hwpoisoned, PG_large_rmappable and PG_partially_mapped.
 */
#define PAGE_FLAGS_SECOND                                                \
        (0xffUL /* order */                | 1UL << PG_has_hwpoisoned |        \
         1UL << PG_large_rmappable        | 1UL << PG_partially_mapped)

#define PAGE_FLAGS_PRIVATE                                \
        (1UL << PG_private | 1UL << PG_private_2)
/**
 * folio_has_private - Determine if folio has private stuff
 * @folio: The folio to be checked
 *
 * A folio "has private stuff" when any bit of PAGE_FLAGS_PRIVATE is set in
 * its flags word, indicating that release routines should be invoked upon it.
 *
 * Return: 1 if any of those bits is set, 0 otherwise.
 */
static inline int folio_has_private(const struct folio *folio)
{
        if (folio->flags.f & PAGE_FLAGS_PRIVATE)
                return 1;
        return 0;
}

#undef PF_ANY
#undef PF_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#undef PF_SECOND
#endif /* !__GENERATING_BOUNDS_H */

#endif        /* PAGE_FLAGS_H */























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This is <linux/capability.h>
 *
 * Andrew G. Morgan <morgan@kernel.org>
 * Alexander Kjeldaas <astor@guardian.no>
 * with help from Aleph1, Roland Buresund and Andrew Main.
 *
 * See here for the libcap library ("POSIX draft" compliance):
 *
 * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/
 */
#ifndef _LINUX_CAPABILITY_H
#define _LINUX_CAPABILITY_H

#include <uapi/linux/capability.h>
#include <linux/uidgid.h>
#include <linux/bits.h>

#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3

extern int file_caps_enabled;

/*
 * Kernel-internal capability set: a single 64-bit word. The cap_raise(),
 * cap_lower() and cap_raised() macros address capability @flag as
 * BIT_ULL(flag) within @val.
 */
typedef struct { u64 val; } kernel_cap_t;

/* Same as vfs_ns_cap_data, but in CPU endianness and always filled completely. */
struct cpu_vfs_cap_data {
        __u32 magic_etc;                /* NOTE(review): presumably revision magic plus flag bits -- confirm */
        kuid_t rootid;                  /* NOTE(review): presumably the namespace root uid the caps apply to -- confirm */
        kernel_cap_t permitted;         /* permitted capability set */
        kernel_cap_t inheritable;       /* inheritable capability set */
};

/* Sizes, in bytes, of struct __user_cap_header_struct and of kernel_cap_t. */
#define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
#define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))

struct file;
struct inode;
struct dentry;
struct task_struct;
struct user_namespace;
struct mnt_idmap;

/*
 * CAP_FS_MASK and CAP_NFSD_MASKS:
 *
 * The fs mask is all the privileges that fsuid==0 historically meant.
 * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE.
 *
 * It has never meant setting security.* and trusted.* xattrs.
 *
 * We could also define fsmask as follows:
 *   1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions
 *   2. The security.* and trusted.* xattrs are fs-related MAC permissions
 *
 * CAP_VALID_MASK has every bit from 0 up to and including CAP_LAST_CAP set.
 */

# define CAP_FS_MASK     (BIT_ULL(CAP_CHOWN)                \
                        | BIT_ULL(CAP_MKNOD)                \
                        | BIT_ULL(CAP_DAC_OVERRIDE)        \
                        | BIT_ULL(CAP_DAC_READ_SEARCH)        \
                        | BIT_ULL(CAP_FOWNER)                \
                        | BIT_ULL(CAP_FSETID)                \
                        | BIT_ULL(CAP_MAC_OVERRIDE))
#define CAP_VALID_MASK         (BIT_ULL(CAP_LAST_CAP+1)-1)

/*
 * Ready-made kernel_cap_t compound literals:
 *   CAP_EMPTY_SET - no capability bits set
 *   CAP_FULL_SET  - every bit of CAP_VALID_MASK set
 *   CAP_FS_SET    - CAP_FS_MASK plus CAP_LINUX_IMMUTABLE
 *   CAP_NFSD_SET  - CAP_FS_MASK plus CAP_SYS_RESOURCE
 */
# define CAP_EMPTY_SET    ((kernel_cap_t) { 0 })
# define CAP_FULL_SET     ((kernel_cap_t) { CAP_VALID_MASK })
# define CAP_FS_SET       ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) })
# define CAP_NFSD_SET     ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) })

/* Reset capability set @c (an lvalue with a .val member) to empty. */
# define cap_clear(c)         do { (c).val = 0; } while (0)

/*
 * Single-capability operations on set @c (an lvalue with a .val member):
 * cap_raise() sets bit @flag, cap_lower() clears it, and cap_raised()
 * evaluates to non-zero when it is set.
 */
#define cap_raise(c, flag)  ((c).val |= BIT_ULL(flag))
#define cap_lower(c, flag)  ((c).val &= ~BIT_ULL(flag))
#define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0)

static inline kernel_cap_t cap_combine(const kernel_cap_t a,
                                       const kernel_cap_t b)
{
        return (kernel_cap_t) { a.val | b.val };
}

static inline kernel_cap_t cap_intersect(const kernel_cap_t a,
                                         const kernel_cap_t b)
{
        return (kernel_cap_t) { a.val & b.val };
}

static inline kernel_cap_t cap_drop(const kernel_cap_t a,
                                    const kernel_cap_t drop)
{
        return (kernel_cap_t) { a.val &~ drop.val };
}

/* Return true when @a has no capability bits set. */
static inline bool cap_isclear(const kernel_cap_t a)
{
        return a.val == 0;
}

/* Return true when @a and @b hold exactly the same capability bits. */
static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b)
{
        /* XOR leaves a bit set wherever the two sets differ. */
        return !(a.val ^ b.val);
}

/*
 * Check if "a" is a subset of "set".
 * return true if ALL of the capabilities in "a" are also in "set"
 *        cap_issubset(0101, 1111) will return true
 * return false if ANY of the capabilities in "a" are not in "set"
 *        cap_issubset(1111, 0101) will return false
 */
static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set)
{
        return !(a.val & ~set.val);
}

/* Used to decide between falling back on the old suser() or fsuser(). */

/* Return @a with every capability in CAP_FS_SET removed. */
static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a)
{
        const kernel_cap_t fs_caps = CAP_FS_SET;

        return cap_drop(a, fs_caps);
}

/*
 * Return @a with the CAP_FS_SET capabilities added, limited to those that
 * are also present in @permitted.
 */
static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a,
                                            const kernel_cap_t permitted)
{
        const kernel_cap_t grantable = cap_intersect(permitted, CAP_FS_SET);

        return cap_combine(a, grantable);
}

/* Return @a with every capability in CAP_NFSD_SET removed. */
static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a)
{
        const kernel_cap_t nfsd_caps = CAP_NFSD_SET;

        return cap_drop(a, nfsd_caps);
}

/*
 * Return @a with the CAP_NFSD_SET capabilities added, limited to those that
 * are also present in @permitted.
 */
static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a,
                                              const kernel_cap_t permitted)
{
        const kernel_cap_t grantable = cap_intersect(permitted, CAP_NFSD_SET);

        return cap_combine(a, grantable);
}

/*
 * Capability-check entry points. With CONFIG_MULTIUSER they are only
 * declared here and are implemented outside this header. Without it, each
 * one is an inline stub that unconditionally returns true.
 */
#ifdef CONFIG_MULTIUSER
extern bool has_ns_capability(struct task_struct *t,
                              struct user_namespace *ns, int cap);
extern bool has_capability_noaudit(struct task_struct *t, int cap);
extern bool has_ns_capability_noaudit(struct task_struct *t,
                                      struct user_namespace *ns, int cap);
extern bool capable(int cap);
extern bool ns_capable(struct user_namespace *ns, int cap);
extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
extern bool ns_capable_setid(struct user_namespace *ns, int cap);
#else
/* !CONFIG_MULTIUSER: every check succeeds. */
static inline bool has_ns_capability(struct task_struct *t,
                              struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool has_capability_noaudit(struct task_struct *t, int cap)
{
        return true;
}
static inline bool has_ns_capability_noaudit(struct task_struct *t,
                                      struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool capable(int cap)
{
        return true;
}
static inline bool ns_capable(struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        return true;
}
#endif /* CONFIG_MULTIUSER */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
                                 struct mnt_idmap *idmap,
                                 const struct inode *inode);
bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
                              const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
/*
 * Return true when capable() grants CAP_PERFMON or, failing that,
 * CAP_SYS_ADMIN. CAP_PERFMON is tried first.
 */
static inline bool perfmon_capable(void)
{
        if (capable(CAP_PERFMON))
                return true;
        return capable(CAP_SYS_ADMIN);
}

/*
 * Return true when capable() grants CAP_BPF or, failing that,
 * CAP_SYS_ADMIN. CAP_BPF is tried first.
 */
static inline bool bpf_capable(void)
{
        if (capable(CAP_BPF))
                return true;
        return capable(CAP_SYS_ADMIN);
}

/*
 * Return true when ns_capable() grants CAP_CHECKPOINT_RESTORE in @ns or,
 * failing that, CAP_SYS_ADMIN in @ns. CAP_CHECKPOINT_RESTORE is tried first.
 */
static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
{
        if (ns_capable(ns, CAP_CHECKPOINT_RESTORE))
                return true;
        return ns_capable(ns, CAP_SYS_ADMIN);
}

/* audit system wants to get cap info from files as well */
int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
                           const struct dentry *dentry,
                           struct cpu_vfs_cap_data *cpu_caps);

int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
                      const void **ivalue, size_t size);

#endif /* !_LINUX_CAPABILITY_H */


















    4 





    4 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1

#ifndef __ASSEMBLER__

/*
 * Decide whether a page table entry value needs its bits inverted: true for
 * any non-zero value that does not have _PAGE_PRESENT set.
 *
 * A clear pte value is special, and doesn't get inverted.
 *
 * Note that even users that only pass a pgprot_t (rather
 * than a full pte) won't trigger the special zero case,
 * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
 * set. So the all zero case really is limited to just the
 * cleared page table entry case.
 */
static inline bool __pte_needs_invert(u64 val)
{
        /* The cleared entry is never inverted. */
        if (!val)
                return false;
        return (val & _PAGE_PRESENT) == 0;
}

/*
 * Get a mask to xor with the page table entry to get the correct pfn:
 * all ones when the entry needs inverting, zero otherwise.
 */
static inline u64 protnone_mask(u64 val)
{
        if (__pte_needs_invert(val))
                return ~0ull;
        return 0;
}

/*
 * Return @val adjusted for a change of inversion state between @oldval and
 * @val. When both values agree on whether inversion is needed, @val is
 * returned unchanged; otherwise the bits selected by @mask are inverted.
 */
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
        u64 flipped;

        /* Same inversion state before and after: nothing to flip. */
        if (__pte_needs_invert(oldval) == __pte_needs_invert(val))
                return val;

        /*
         * When a PTE transitions from NONE to !NONE or vice-versa
         * invert the PFN part to stop speculation.
         * pte_pfn undoes this when needed.
         */
        flipped = (val & ~mask) | (~val & mask);
        return flipped;
}

#endif /* __ASSEMBLER__ */

#endif









































   93 



   94 


   72 
    3 



   70 












   93 




   94 






   13 
    1 




   94 




   50 






    8 




    1 






    4 


    3 











  103 

   10 














    1 
    1 







   16 
    2 



   15 

    2 




   14 
    2 



   13 
    2 




    2 



    2 
    1 




    1 
    1 













    1 





    1 
    1 




   10 
    1 
    1 












    1 






















  123 
  123 

   95 


















  123 


  106 




   98 








   84 
    2 
















  121 

   16 
   12 
   31 



   16 
   15 
   14 

  101 





   56 
    2 



   56 
   52 
   10 








   48 
    7 



   54 

    3 


   95 
   51 
   50 
   50 
   50 

    3 
    4 





   34 
    2 



   33 
   33 
   32 
   31 
   30 
    5 




    1 







   14 
   13 
   12 
   11 
   10 
    9 
    8 
    7 
    9 



    2 
    1 






  100 



   50 
   72 
   95 
   78 

    1 

   92 

    1 
   91 
    2 
   18 
    7 


   81 







    1 



    1 





    3 





   80 
    3 
    1 




    2 
    1 






   12 

    1 




   11 
    2 




   10 
    2 




    9 
    2 




    9 





    8 
    1 




    8 




    2 
    3 







   66 
    7 
    2 




    6 



    1 




    1 
    2 






    4 
    2 




    3 
    2 




    2 
    1 




    2 
    2 





   59 
   22 




    4 
  123 









   22 






    1 


   21 





   64 










   59 
   57 








   10 





   58 












   56 
   57 





    2 


   13 





   11 


   11 


   54 








   63 
   61 





    1 


    9 
    1 


    8 



    1 

   61 
    5 
    1 

    8 









    8 






    1 


    7 





   58 


   62 















































    5 


    5 

    4 

    5 
    5 








    5 





   28 






    4 

    1 

















   32 











   27 















   27 

    1 




   27 








   27 
    1 

   27 
    1 

   27 
    1 
   27 




    2 
    2 


   55 












   63 


   32 

   62 
    7 





   64 
    5 





   64 
    1 

   64 
    1 
   61 

    2 
   56 


    2 

   59 

   64 


    2 

   58 


   57 

   57 

   58 
    2 

   57 
    5 

   58 

    2 

   57 
    2 


    1 




   57 

   57 

   30 



    1 


   29 








   27 
    2 
















   25 






   37 
   37 


   37 




  121 
    1 






  124 


   64 
   26 
   17 

   27 
   27 
   11 



   27 

   19 


   50 



   17 


   19 





   27 
   27 
   63 

    8 









    6 

    4 



    4 
    3 
    8 










    8 
    1 
    8 
    2 



    5 








    5 






















    5 



   39 

















































































    4 



    7 

    7 


    7 

    7 





   36 
    7 


    2 

    7 














    5 
    5 



    4 

    5 














    6 
    6 




    6 














    9 
    9 












    1 


























   37 











   35 




   36 



    2 

   37 
    5 



   36 
   36 

    1 

   37 
    6 



   37 
    9 



   36 
    1 

   37 

   37 




   37 




    5 


   37 




   37 

   36 
   34 
   37 



   37 




   37 


    1 

   37 

    2 

   37 




   37 




   37 


   37 





    2 
   37 



   18 








   18 

   18 
   18 




   18 
    1 





   18 





    1 




   18 


   10 


    7 



   10 



   12 












   12 





   12 


    9 

   11 
    8 


    1 




    8 

    2 




    9 


    9 



    9 

   12 
    1 



   15 





   14 








   15 










   53 


    3 


   54 





   53 









   54 
   53 






















    2 
    2 















    2 





    2 




    2 

    2 






    2 






    8 








    4 

    1 

    4 
    3 
    2 



    3 


    4 



    4 
    4 
    3 




    6 
    3 

    2 


    3 
    2 


    3 

    4 


    8 
    1 




    2 








    2 


    2 
    2 

    2 


















    1 
    1 










    1 






    1 






    1 








    1 


    1 
    1 

    1 




    3 








    3 









    3 





   21 










   14 
   15 




    3 









    2 

   18 
    2 






   17 

    1 
    1 





   17 









   17 

    2 

   15 


   15 
   15 





   15 









   14 


   17 

   21 
   18 



   37 






    4 
















    5 











   45 







    2 



   44 





    2 



   43 

   24 
    3 







   18 
    3 










    1 



   38 


   37 
    2 



   44 
    1 






    2 











    7 
    6 


    7 





















   13 
    1 
    1 




   12 







   12 


   12 


    2 
    2 







    2 




    8 
    1 



    7 

    9 







    1 



    8 
    2 




   13 
    1 






   13 
   25 

















   95 




    7 

    5 


    5 



   89 
   96 
   88 



   31 










   24 


















   31 



   31 

   32 





   31 
    8 

   31 
    2 
   25 
    8 




    3 












    9 



   19 




   33 








   33 

    3 
   26 



   18 
   25 









   16 












   17 

   16 
   26 






   25 
   24 

    8 



    6 


   25 

    8 
    8 






































   28 

















    3 








    3 

    3 
    3 




    3 

    1 
    3 
    3 
    3 
    3 

    3 

    3 






    3 










    3 



    1 







    2 









    2 























    1 








    1 
















    3 


    3 
    3 

    1 










    3 
    3 

    3 










    3 




    2 

    3 


    3 
    3 

    3 

    3 





    1 









    1 


    1 
    1 










    1 





   12 








    1 


   10 





   11 



   10 




    2 






    3 
    1 
    9 









    5 

    9 

    7 


    2 


    1 
    1 
    2 

    1 






    1 





    5 



    2 
   12 
    9 




    4 






    1 
    4 
    4 










    1 



    1 
    1 























































































    1 
















    1 





















    1 




    6 













    4 
    1 




    5 
    5 






















    3 






    5 
    6 





    4 






    4 



    2 


    1 








    2 
    3 




   12 







    1 





   11 
    1 
    1 

   10 


   10 

    5 






    3 

   10 









    2 

    7 

    7 


    1 
    1 



    1 


    3 


    1 
   12 
    6 




    5 








    2 


    3 



    2 
    3 












    3 
    3 
    5 





   14 











   14 
    9 




    3 


   12 


   12 

    3 
    9 
    1 





    3 

    5 






    5 
    4 

















    2 


    9 

   14 












   34 









   34 
   33 


   34 



   34 
   34 
   34 
   35 
   36 















   37 











   36 
    1 





    3 
   37 


   37 







    5 





   34 
    1 

   34 
    1 






   33 


   34 


    3 









    2 



































    2 
    2 













    2 



    2 





    1 




    2 


    1 

    2 


    2 
    2 
















    2 




    2 

    2 



    2 
    2 

    2 







































































































































  288 
   24 





    1 




  286 
    9 







    1 









  311 







  314 
    1 








  313 











   10 


  315 

   25 
   14 





   14 

  292 



   14 


  301 
  297 
  297 
    1 
    2 

  296 
  291 
  290 
  276 


  297 











   14 





  314 
  311 



  316 





















    3 
    3 








    3 
    3 





    3 





    3 





    3 
    4 








    3 


    7 




    3 




    1 



    1 


















    4 


    4 
    4 









   23 






    3 
   19 

    5 


   19 
    4 
   19 
    6 
   19 
    1 
   19 

   19 
    3 

   19 
   19 


   18 
    1 
   19 

   19 

   19 



   18 

   19 



   18 

   19 
    3 

   18 


   18 


   19 


   19 









   22 









   19 


   19 


   19 




   19 
   19 














    1 

   18 









   19 





   26 

   26 

    2 



   22 

   26 






   25 

    4 







    4 












    4 




    4 









    4 


    4 

    3 
    4 

    4 

    4 


    1 









    4 




    4 
    4 
    4 

    4 
    4 

    4 


































































    1 












    1 
    1 





    1 


    1 
    1 
    1 
    1 

    1 

   18 















    1 
    1 


   18 
    1 

    1 















    1 




   17 


   17 


   17 




   17 
   16 



    1 




   17 

    1 


    2 

    2 


   17 

   16 


   15 
   17 

   17 

   18 








   17 










    1 


    1 

    1 











    1 





   19 



   19 

    2 

   18 





   19 

    1 
















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
// SPDX-License-Identifier: GPL-2.0-only
/* xfrm_user.c: User interface to configure xfrm engine.
 *
 * Copyright (C) 2002 David S. Miller (davem@redhat.com)
 *
 * Changes:
 *        Mitsuru KANDA @USAGI
 *         Kazunori MIYAZAWA @USAGI
 *         Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *                 IPv6 support
 *
 */

#include <linux/compat.h>
#include <linux/crypto.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/string.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
#include <linux/init.h>
#include <linux/security.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <net/netlink.h>
#include <net/ah.h>
#include <linux/uaccess.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#endif
#include <linux/unaligned.h>

/*
 * Check one plain algorithm attribute (AUTH, CRYPT or COMP).
 *
 * An absent attribute is accepted.  A present one must carry at least as
 * many bytes as xfrm_alg_len() reports for it, and @type must be one of the
 * three plain algorithm types.  On success the algorithm name inside the
 * attribute is forcibly NUL-terminated in place.
 *
 * Returns 0 on success or -EINVAL (with a message in @extack) otherwise.
 */
static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type,
                          struct netlink_ext_ack *extack)
{
        struct nlattr *nla = attrs[type];
        struct xfrm_algo *alg;

        if (!nla)
                return 0;

        alg = nla_data(nla);
        if ((int)xfrm_alg_len(alg) > nla_len(nla)) {
                NL_SET_ERR_MSG(extack, "Invalid AUTH/CRYPT/COMP attribute length");
                return -EINVAL;
        }

        if (type != XFRMA_ALG_AUTH &&
            type != XFRMA_ALG_CRYPT &&
            type != XFRMA_ALG_COMP) {
                NL_SET_ERR_MSG(extack, "Invalid algorithm attribute type");
                return -EINVAL;
        }

        /* Never trust the sender to have terminated the name. */
        alg->alg_name[sizeof(alg->alg_name) - 1] = '\0';
        return 0;
}

/*
 * Check the XFRMA_ALG_AUTH_TRUNC attribute, if supplied.
 *
 * The attribute payload must be at least xfrm_alg_auth_len() bytes long;
 * the embedded algorithm name is then NUL-terminated in place.
 *
 * Returns 0 when the attribute is absent or valid, -EINVAL otherwise.
 */
static int verify_auth_trunc(struct nlattr **attrs,
                             struct netlink_ext_ack *extack)
{
        struct nlattr *nla = attrs[XFRMA_ALG_AUTH_TRUNC];
        struct xfrm_algo_auth *auth;

        if (nla) {
                auth = nla_data(nla);
                if ((int)xfrm_alg_auth_len(auth) > nla_len(nla)) {
                        NL_SET_ERR_MSG(extack, "Invalid AUTH_TRUNC attribute length");
                        return -EINVAL;
                }

                /* Guarantee a terminated name regardless of the input. */
                auth->alg_name[sizeof(auth->alg_name) - 1] = '\0';
        }

        return 0;
}

/*
 * Check the XFRMA_ALG_AEAD attribute, if supplied.
 *
 * The payload has to be at least aead_len() bytes long.  When it is, the
 * algorithm name stored in the attribute is NUL-terminated in place.
 *
 * Returns 0 when the attribute is absent or valid, -EINVAL otherwise.
 */
static int verify_aead(struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct nlattr *nla = attrs[XFRMA_ALG_AEAD];
        struct xfrm_algo_aead *aead;

        if (!nla)
                return 0;

        aead = nla_data(nla);
        if (nla_len(nla) >= (int)aead_len(aead)) {
                /* Long enough: just make sure the name is terminated. */
                aead->alg_name[sizeof(aead->alg_name) - 1] = '\0';
                return 0;
        }

        NL_SET_ERR_MSG(extack, "Invalid AEAD attribute length");
        return -EINVAL;
}

/*
 * Hand back a pointer to the payload of the address attribute @type.
 *
 * *@addrp is written only when the attribute is present and @addrp itself
 * is non-NULL; otherwise it is left untouched.
 */
static void verify_one_addr(struct nlattr **attrs, enum xfrm_attr_type_t type,
                           xfrm_address_t **addrp)
{
        struct nlattr *nla = attrs[type];

        if (!nla || !addrp)
                return;

        *addrp = nla_data(nla);
}

/*
 * Check the length fields of the XFRMA_SEC_CTX attribute, if supplied.
 *
 * The self-declared total length must not exceed the attribute payload and
 * must equal the fixed header size plus the declared context length.
 *
 * Returns 0 when the attribute is absent or consistent, -EINVAL otherwise.
 */
static inline int verify_sec_ctx_len(struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct nlattr *nla = attrs[XFRMA_SEC_CTX];
        struct xfrm_user_sec_ctx *sec;

        if (!nla)
                return 0;

        sec = nla_data(nla);
        if (sec->len <= nla_len(nla) &&
            sec->len == (sizeof(struct xfrm_user_sec_ctx) + sec->ctx_len))
                return 0;

        NL_SET_ERR_MSG(extack, "Invalid security context length");
        return -EINVAL;
}

/*
 * Validate the XFRMA_REPLAY_ESN_VAL attribute against the SA description.
 *
 * Without the attribute the only requirement is that XFRM_STATE_ESN is not
 * set in @p->flags.  With it, the bitmap length, the attribute size, the
 * protocol and the legacy replay window are checked first, followed by the
 * constraints specific to the direction given in @sa_dir (no additional
 * constraints apply when @sa_dir is neither IN nor OUT).
 *
 * Returns 0 when everything is acceptable, otherwise -EINVAL with a message
 * recorded in @extack.
 */
static inline int verify_replay(struct xfrm_usersa_info *p,
                                struct nlattr **attrs, u8 sa_dir,
                                struct netlink_ext_ack *extack)
{
        const bool esn_flag = p->flags & XFRM_STATE_ESN;
        struct nlattr *nla = attrs[XFRMA_REPLAY_ESN_VAL];
        struct xfrm_replay_state_esn *esn;

        if (!nla) {
                if (!esn_flag)
                        return 0;

                NL_SET_ERR_MSG(extack, "Missing required attribute for ESN");
                return -EINVAL;
        }

        esn = nla_data(nla);

        if (esn->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(esn->bmp[0]) / 8) {
                NL_SET_ERR_MSG(extack, "ESN bitmap length must be <= 128");
                return -EINVAL;
        }

        /* A payload of exactly the bare header size is tolerated. */
        if (nla_len(nla) != sizeof(*esn) &&
            nla_len(nla) < (int)xfrm_replay_state_esn_len(esn)) {
                NL_SET_ERR_MSG(extack, "ESN attribute is too short to fit the full bitmap length");
                return -EINVAL;
        }

        /* ESN is a feature of ESP and AH only. */
        switch (p->id.proto) {
        case IPPROTO_ESP:
        case IPPROTO_AH:
                break;
        default:
                NL_SET_ERR_MSG(extack, "ESN only supported for ESP and AH");
                return -EINVAL;
        }

        if (p->replay_window) {
                NL_SET_ERR_MSG(extack, "ESN not compatible with legacy replay_window");
                return -EINVAL;
        }

        switch (sa_dir) {
        case XFRM_SA_DIR_OUT:
                /* Output SAs carry no receive-side replay state. */
                if (esn->replay_window) {
                        NL_SET_ERR_MSG(extack, "Replay window should be 0 for output SA");
                        return -EINVAL;
                }
                if (esn->seq || esn->seq_hi) {
                        NL_SET_ERR_MSG(extack, "Replay seq and seq_hi should be 0 for output SA");
                        return -EINVAL;
                }

                if (esn_flag) {
                        if (esn->oseq == U32_MAX && esn->oseq_hi == U32_MAX) {
                                NL_SET_ERR_MSG(extack, "Replay oseq and oseq_hi should be less than 0xFFFFFFFF for output SA");
                                return -EINVAL;
                        }
                } else {
                        if (esn->oseq_hi) {
                                NL_SET_ERR_MSG(extack, "Replay oseq_hi should be 0 in non-ESN mode for output SA");
                                return -EINVAL;
                        }
                        if (esn->oseq == U32_MAX) {
                                NL_SET_ERR_MSG(extack, "Replay oseq should be less than 0xFFFFFFFF in non-ESN mode for output SA");
                                return -EINVAL;
                        }
                }

                if (esn->bmp_len) {
                        NL_SET_ERR_MSG(extack, "Replay bmp_len should 0 for output SA");
                        return -EINVAL;
                }
                break;

        case XFRM_SA_DIR_IN:
                /* Input SAs carry no transmit-side sequence state. */
                if (esn->oseq || esn->oseq_hi) {
                        NL_SET_ERR_MSG(extack, "Replay oseq and oseq_hi should be 0 for input SA");
                        return -EINVAL;
                }

                if (esn_flag) {
                        if (esn->seq == U32_MAX && esn->seq_hi == U32_MAX) {
                                NL_SET_ERR_MSG(extack, "Replay seq and seq_hi should be less than 0xFFFFFFFF for input SA");
                                return -EINVAL;
                        }
                } else {
                        if (esn->seq_hi) {
                                NL_SET_ERR_MSG(extack, "Replay seq_hi should be 0 in non-ESN mode for input SA");
                                return -EINVAL;
                        }
                        if (esn->seq == U32_MAX) {
                                NL_SET_ERR_MSG(extack, "Replay seq should be less than 0xFFFFFFFF in non-ESN mode for input SA");
                                return -EINVAL;
                        }
                }
                break;

        default:
                break;
        }

        return 0;
}

/*
 * Sanity-check a user-supplied SA description together with its attributes.
 *
 * @p:      fixed part of the SA description
 * @attrs:  attribute table indexed by XFRMA_* type; absent entries are NULL
 * @extack: receives a human-readable message for the first failing check
 *
 * The checks run in this order and stop at the first failure: SA address
 * family, selector family and prefix lengths, per-protocol attribute rules,
 * payload validation of the individual attributes, mode, and finally the
 * restrictions tied to the SA direction (XFRMA_SA_DIR).
 *
 * Returns 0 if every check passes, -EAFNOSUPPORT if an IPv6 family is
 * requested while CONFIG_IPV6 is disabled, the error code of a failing
 * verify_*() helper, or -EINVAL for any other failed check.
 */
static int verify_newsa_info(struct xfrm_usersa_info *p,
                             struct nlattr **attrs,
                             struct netlink_ext_ack *extack)
{
        int err;
        /* Direction of the SA; stays 0 when XFRMA_SA_DIR was not supplied. */
        u8 sa_dir = nla_get_u8_default(attrs[XFRMA_SA_DIR], 0);
        u16 family = p->sel.family;

        /* The SA's own address family must be IPv4 or, if built in, IPv6. */
        err = -EINVAL;
        switch (p->family) {
        case AF_INET:
                break;

        case AF_INET6:
#if IS_ENABLED(CONFIG_IPV6)
                break;
#else
                err = -EAFNOSUPPORT;
                NL_SET_ERR_MSG(extack, "IPv6 support disabled");
                goto out;
#endif

        default:
                NL_SET_ERR_MSG(extack, "Invalid address family");
                goto out;
        }

        /*
         * A selector without a family inherits the SA family, unless the
         * caller asked for it to stay unspecified via XFRM_STATE_AF_UNSPEC.
         */
        if (!family && !(p->flags & XFRM_STATE_AF_UNSPEC))
                family = p->family;

        /* Selector prefix lengths must fit the selector's address family. */
        switch (family) {
        case AF_UNSPEC:
                break;

        case AF_INET:
                if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) {
                        NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 32 for IPv4)");
                        goto out;
                }

                break;

        case AF_INET6:
#if IS_ENABLED(CONFIG_IPV6)
                if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) {
                        NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 128 for IPv6)");
                        goto out;
                }

                break;
#else
                NL_SET_ERR_MSG(extack, "IPv6 support disabled");
                err = -EAFNOSUPPORT;
                goto out;
#endif

        default:
                NL_SET_ERR_MSG(extack, "Invalid address family in selector");
                goto out;
        }

        /* Each protocol requires some attributes and forbids others. */
        err = -EINVAL;
        switch (p->id.proto) {
        case IPPROTO_AH:
                if (!attrs[XFRMA_ALG_AUTH]        &&
                    !attrs[XFRMA_ALG_AUTH_TRUNC]) {
                        NL_SET_ERR_MSG(extack, "Missing required attribute for AH: AUTH_TRUNC or AUTH");
                        goto out;
                }

                if (attrs[XFRMA_ALG_AEAD]        ||
                    attrs[XFRMA_ALG_CRYPT]        ||
                    attrs[XFRMA_ALG_COMP]        ||
                    attrs[XFRMA_TFCPAD]) {
                        NL_SET_ERR_MSG(extack, "Invalid attributes for AH: AEAD, CRYPT, COMP, TFCPAD");
                        goto out;
                }
                break;

        case IPPROTO_ESP:
                if (attrs[XFRMA_ALG_COMP]) {
                        NL_SET_ERR_MSG(extack, "Invalid attribute for ESP: COMP");
                        goto out;
                }

                if (!attrs[XFRMA_ALG_AUTH] &&
                    !attrs[XFRMA_ALG_AUTH_TRUNC] &&
                    !attrs[XFRMA_ALG_CRYPT] &&
                    !attrs[XFRMA_ALG_AEAD]) {
                        NL_SET_ERR_MSG(extack, "Missing required attribute for ESP: at least one of AUTH, AUTH_TRUNC, CRYPT, AEAD");
                        goto out;
                }

                /* AEAD excludes the separate auth/crypt algorithms. */
                if ((attrs[XFRMA_ALG_AUTH] ||
                     attrs[XFRMA_ALG_AUTH_TRUNC] ||
                     attrs[XFRMA_ALG_CRYPT]) &&
                    attrs[XFRMA_ALG_AEAD]) {
                        NL_SET_ERR_MSG(extack, "Invalid attribute combination for ESP: AEAD can't be used with AUTH, AUTH_TRUNC, CRYPT");
                        goto out;
                }

                if (attrs[XFRMA_TFCPAD] &&
                    p->mode != XFRM_MODE_TUNNEL) {
                        NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode");
                        goto out;
                }
                /* Any IP-TFS tunable implies IP-TFS mode. */
                if ((attrs[XFRMA_IPTFS_DROP_TIME] ||
                     attrs[XFRMA_IPTFS_REORDER_WINDOW] ||
                     attrs[XFRMA_IPTFS_DONT_FRAG] ||
                     attrs[XFRMA_IPTFS_INIT_DELAY] ||
                     attrs[XFRMA_IPTFS_MAX_QSIZE] ||
                     attrs[XFRMA_IPTFS_PKT_SIZE]) &&
                    p->mode != XFRM_MODE_IPTFS) {
                        NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode");
                        goto out;
                }
                break;

        case IPPROTO_COMP:
                if (!attrs[XFRMA_ALG_COMP]) {
                        NL_SET_ERR_MSG(extack, "Missing required attribute for COMP: COMP");
                        goto out;
                }

                if (attrs[XFRMA_ALG_AEAD]        ||
                    attrs[XFRMA_ALG_AUTH]        ||
                    attrs[XFRMA_ALG_AUTH_TRUNC]        ||
                    attrs[XFRMA_ALG_CRYPT]        ||
                    attrs[XFRMA_TFCPAD]) {
                        NL_SET_ERR_MSG(extack, "Invalid attributes for COMP: AEAD, AUTH, AUTH_TRUNC, CRYPT, TFCPAD");
                        goto out;
                }

                /* p->id.spi is in network byte order; compare in host order. */
                if (ntohl(p->id.spi) >= 0x10000) {
                        NL_SET_ERR_MSG(extack, "SPI is too large for COMP (must be < 0x10000)");
                        goto out;
                }
                break;

#if IS_ENABLED(CONFIG_IPV6)
        case IPPROTO_DSTOPTS:
        case IPPROTO_ROUTING:
                if (attrs[XFRMA_ALG_COMP]        ||
                    attrs[XFRMA_ALG_AUTH]        ||
                    attrs[XFRMA_ALG_AUTH_TRUNC]        ||
                    attrs[XFRMA_ALG_AEAD]        ||
                    attrs[XFRMA_ALG_CRYPT]        ||
                    attrs[XFRMA_ENCAP]                ||
                    attrs[XFRMA_SEC_CTX]        ||
                    attrs[XFRMA_TFCPAD]) {
                        NL_SET_ERR_MSG(extack, "Invalid attributes for DSTOPTS/ROUTING");
                        goto out;
                }

                if (!attrs[XFRMA_COADDR]) {
                        NL_SET_ERR_MSG(extack, "Missing required COADDR attribute for DSTOPTS/ROUTING");
                        goto out;
                }
                break;
#endif

        default:
                NL_SET_ERR_MSG(extack, "Unsupported protocol");
                goto out;
        }

        /*
         * Validate the payload of every attribute that is present; each
         * helper's return value becomes the result on failure.
         */
        if ((err = verify_aead(attrs, extack)))
                goto out;
        if ((err = verify_auth_trunc(attrs, extack)))
                goto out;
        if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH, extack)))
                goto out;
        if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT, extack)))
                goto out;
        if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP, extack)))
                goto out;
        if ((err = verify_sec_ctx_len(attrs, extack)))
                goto out;
        if ((err = verify_replay(p, attrs, sa_dir, extack)))
                goto out;

        /* err is 0 at this point; re-arm it for the mode checks below. */
        err = -EINVAL;
        switch (p->mode) {
        case XFRM_MODE_TRANSPORT:
        case XFRM_MODE_TUNNEL:
        case XFRM_MODE_ROUTEOPTIMIZATION:
        case XFRM_MODE_BEET:
                break;
        case XFRM_MODE_IPTFS:
                if (p->id.proto != IPPROTO_ESP) {
                        NL_SET_ERR_MSG(extack, "IP-TFS mode only supported with ESP");
                        goto out;
                }
                if (sa_dir == 0) {
                        NL_SET_ERR_MSG(extack, "IP-TFS mode requires in or out direction attribute");
                        goto out;
                }
                break;

        default:
                NL_SET_ERR_MSG(extack, "Unsupported mode");
                goto out;
        }

        /* From here on every failing check sets err explicitly. */
        err = 0;

        if (attrs[XFRMA_MTIMER_THRESH]) {
                if (!attrs[XFRMA_ENCAP]) {
                        NL_SET_ERR_MSG(extack, "MTIMER_THRESH attribute can only be set on ENCAP states");
                        err = -EINVAL;
                        goto out;
                }

                if (sa_dir == XFRM_SA_DIR_OUT) {
                        NL_SET_ERR_MSG(extack,
                                       "MTIMER_THRESH attribute should not be set on output SA");
                        err = -EINVAL;
                        goto out;
                }
        }

        /* Flags and attributes that are rejected on an output SA. */
        if (sa_dir == XFRM_SA_DIR_OUT) {
                if (p->flags & XFRM_STATE_DECAP_DSCP) {
                        NL_SET_ERR_MSG(extack, "Flag DECAP_DSCP should not be set for output SA");
                        err = -EINVAL;
                        goto out;
                }

                if (p->flags & XFRM_STATE_ICMP) {
                        NL_SET_ERR_MSG(extack, "Flag ICMP should not be set for output SA");
                        err = -EINVAL;
                        goto out;
                }

                if (p->flags & XFRM_STATE_WILDRECV) {
                        NL_SET_ERR_MSG(extack, "Flag WILDRECV should not be set for output SA");
                        err = -EINVAL;
                        goto out;
                }

                if (p->replay_window) {
                        NL_SET_ERR_MSG(extack, "Replay window should be 0 for output SA");
                        err = -EINVAL;
                        goto out;
                }

                if (attrs[XFRMA_IPTFS_DROP_TIME]) {
                        NL_SET_ERR_MSG(extack, "IP-TFS drop time should not be set for output SA");
                        err = -EINVAL;
                        goto out;
                }

                if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) {
                        NL_SET_ERR_MSG(extack, "IP-TFS reorder window should not be set for output SA");
                        err = -EINVAL;
                        goto out;
                }

                if (attrs[XFRMA_REPLAY_VAL]) {
                        struct xfrm_replay_state *replay;

                        replay = nla_data(attrs[XFRMA_REPLAY_VAL]);

                        if (replay->seq || replay->bitmap) {
                                NL_SET_ERR_MSG(extack,
                                               "Replay seq and bitmap should be 0 for output SA");
                                err = -EINVAL;
                                goto out;
                        }
                }
        }

        /* Flags and attributes that are rejected on an input SA. */
        if (sa_dir == XFRM_SA_DIR_IN) {
                if (p->flags & XFRM_STATE_NOPMTUDISC) {
                        NL_SET_ERR_MSG(extack, "Flag NOPMTUDISC should not be set for input SA");
                        err = -EINVAL;
                        goto out;
                }

                if (attrs[XFRMA_SA_EXTRA_FLAGS]) {
                        u32 xflags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]);

                        if (xflags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) {
                                NL_SET_ERR_MSG(extack, "Flag DONT_ENCAP_DSCP should not be set for input SA");
                                err = -EINVAL;
                                goto out;
                        }

                        if (xflags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP) {
                                NL_SET_ERR_MSG(extack, "Flag OSEQ_MAY_WRAP should not be set for input SA");
                                err = -EINVAL;
                                goto out;
                        }

                }

                if (attrs[XFRMA_IPTFS_DONT_FRAG]) {
                        NL_SET_ERR_MSG(extack, "IP-TFS don't fragment should not be set for input SA");
                        err = -EINVAL;
                        goto out;
                }

                if (attrs[XFRMA_IPTFS_INIT_DELAY]) {
                        NL_SET_ERR_MSG(extack, "IP-TFS initial delay should not be set for input SA");
                        err = -EINVAL;
                        goto out;
                }

                if (attrs[XFRMA_IPTFS_MAX_QSIZE]) {
                        NL_SET_ERR_MSG(extack, "IP-TFS max queue size should not be set for input SA");
                        err = -EINVAL;
                        goto out;
                }

                if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
                        NL_SET_ERR_MSG(extack, "IP-TFS packet size should not be set for input SA");
                        err = -EINVAL;
                        goto out;
                }
        }

        /* XFRMA_SA_PCPU is only accepted together with a direction. */
        if (!sa_dir && attrs[XFRMA_SA_PCPU]) {
                NL_SET_ERR_MSG(extack, "SA_PCPU only supported with SA_DIR");
                err = -EINVAL;
                goto out;
        }

out:
        return err;
}

/*
 * Look up the algorithm named in @rta via @get_byname and attach a copy.
 *
 * @algpp:      receives a kmemdup()'d copy of the attribute payload whose
 *              name is overwritten with the name from the matched descriptor
 * @props:      receives the descriptor's sadb_alg_id
 * @get_byname: lookup routine, called with the requested name and 1
 * @rta:        attribute to process; NULL means nothing to do
 *
 * Returns 0 on success or when @rta is NULL, -ENOSYS if the lookup fails,
 * or -ENOMEM if the copy cannot be allocated.  Note that *@props has
 * already been written when -ENOMEM is returned.
 */
static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
                           struct xfrm_algo_desc *(*get_byname)(const char *, int),
                           struct nlattr *rta, struct netlink_ext_ack *extack)
{
        struct xfrm_algo_desc *found;
        struct xfrm_algo *src, *copy;

        if (!rta)
                return 0;

        src = nla_data(rta);
        found = get_byname(src->alg_name, 1);
        if (!found) {
                NL_SET_ERR_MSG(extack, "Requested COMP algorithm not found");
                return -ENOSYS;
        }
        *props = found->desc.sadb_alg_id;

        copy = kmemdup(src, xfrm_alg_len(src), GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        /* Store the descriptor's name rather than the caller's spelling. */
        strscpy(copy->alg_name, found->name);
        *algpp = copy;
        return 0;
}

/*
 * Attach the encryption algorithm described by @rta to state @x.
 *
 * The algorithm is looked up with xfrm_ealg_get_byname().  On success
 * @x->props.ealgo, @x->ealg (a kmemdup()'d copy of the attribute carrying
 * the descriptor's name) and @x->geniv are filled in.
 *
 * Returns 0 on success or when @rta is NULL, -ENOSYS if the algorithm is
 * not found, or -ENOMEM if the copy cannot be allocated.  @x->props.ealgo
 * has already been written when -ENOMEM is returned.
 */
static int attach_crypt(struct xfrm_state *x, struct nlattr *rta,
                        struct netlink_ext_ack *extack)
{
        struct xfrm_algo_desc *found;
        struct xfrm_algo *src, *copy;

        if (!rta)
                return 0;

        src = nla_data(rta);
        found = xfrm_ealg_get_byname(src->alg_name, 1);
        if (!found) {
                NL_SET_ERR_MSG(extack, "Requested CRYPT algorithm not found");
                return -ENOSYS;
        }
        x->props.ealgo = found->desc.sadb_alg_id;

        copy = kmemdup(src, xfrm_alg_len(src), GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        /* Store the descriptor's name rather than the caller's spelling. */
        strscpy(copy->alg_name, found->name);
        x->ealg = copy;
        x->geniv = found->uinfo.encr.geniv;
        return 0;
}

/*
 * Attach an authentication algorithm given as a plain struct xfrm_algo.
 *
 * Does nothing and returns 0 when @rta is NULL.  Otherwise a new
 * struct xfrm_algo_auth is allocated and filled from the user-supplied
 * key plus the descriptor's name and default truncation length, the
 * result is stored in *@algpp and the SADB id in *@props.
 *
 * Returns 0, -ENOSYS (with an @extack message) if the algorithm is
 * unknown, or -ENOMEM.
 */
static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
                       struct nlattr *rta, struct netlink_ext_ack *extack)
{
        struct xfrm_algo_desc *desc;
        struct xfrm_algo_auth *auth;
        struct xfrm_algo *user_alg;
        unsigned int key_bytes;

        if (rta == NULL)
                return 0;

        user_alg = nla_data(rta);
        desc = xfrm_aalg_get_byname(user_alg->alg_name, 1);
        if (desc == NULL) {
                NL_SET_ERR_MSG(extack, "Requested AUTH algorithm not found");
                return -ENOSYS;
        }
        *props = desc->desc.sadb_alg_id;

        /* Key length rounded up to whole bytes; used for both size and copy. */
        key_bytes = (user_alg->alg_key_len + 7) / 8;

        auth = kmalloc(sizeof(*auth) + key_bytes, GFP_KERNEL);
        if (auth == NULL)
                return -ENOMEM;

        strscpy(auth->alg_name, desc->name);
        auth->alg_key_len = user_alg->alg_key_len;
        /* The plain attribute carries no truncation length: use the default. */
        auth->alg_trunc_len = desc->uinfo.auth.icv_truncbits;
        memcpy(auth->alg_key, user_alg->alg_key, key_bytes);

        *algpp = auth;
        return 0;
}

/*
 * Attach an authentication algorithm given as struct xfrm_algo_auth,
 * i.e. one that may carry an explicit truncation length.
 *
 * Does nothing and returns 0 when @rta is NULL.  A requested truncation
 * length larger than the descriptor's icv_fullbits is rejected; a
 * requested length of zero is replaced by the descriptor's default.
 *
 * Returns 0, -ENOSYS or -EINVAL (both with an @extack message), or
 * -ENOMEM.
 */
static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
                             struct nlattr *rta, struct netlink_ext_ack *extack)
{
        struct xfrm_algo_auth *user_alg;
        struct xfrm_algo_desc *desc;
        struct xfrm_algo_auth *copy;

        if (rta == NULL)
                return 0;

        user_alg = nla_data(rta);
        desc = xfrm_aalg_get_byname(user_alg->alg_name, 1);
        if (desc == NULL) {
                NL_SET_ERR_MSG(extack, "Requested AUTH_TRUNC algorithm not found");
                return -ENOSYS;
        }

        if (user_alg->alg_trunc_len > desc->uinfo.auth.icv_fullbits) {
                NL_SET_ERR_MSG(extack, "Invalid length requested for truncated ICV");
                return -EINVAL;
        }

        *props = desc->desc.sadb_alg_id;

        copy = kmemdup(user_alg, xfrm_alg_auth_len(user_alg), GFP_KERNEL);
        if (copy == NULL)
                return -ENOMEM;

        strscpy(copy->alg_name, desc->name);
        if (copy->alg_trunc_len == 0)
                copy->alg_trunc_len = desc->uinfo.auth.icv_truncbits;

        *algpp = copy;
        return 0;
}

static int attach_aead(struct xfrm_state *x, struct nlattr *rta,
                       struct netlink_ext_ack *extack)
{
        struct xfrm_algo_aead *p, *ualg;
        struct xfrm_algo_desc *algo;

        if (!rta)
                return 0;

        ualg = nla_data(rta);

        algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1);
        if (!algo) {
                NL_SET_ERR_MSG(extack, "Requested AEAD algorithm not found");
                return -ENOSYS;
        }
        x->props.ealgo = algo->desc.sadb_alg_id;

        p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        strscpy(p->alg_name, algo->name);
        x->aead = p;
        x->geniv = algo->uinfo.aead.geniv;
        return 0;
}

static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn,
                                         struct nlattr *rp,
                                         struct netlink_ext_ack *extack)
{
        struct xfrm_replay_state_esn *up;
        unsigned int ulen;

        if (!replay_esn || !rp)
                return 0;

        up = nla_data(rp);
        ulen = xfrm_replay_state_esn_len(up);

        /* Check the overall length and the internal bitmap length to avoid
         * potential overflow. */
        if (nla_len(rp) < (int)ulen) {
                NL_SET_ERR_MSG(extack, "ESN attribute is too short");
                return -EINVAL;
        }

        if (xfrm_replay_state_esn_len(replay_esn) != ulen) {
                NL_SET_ERR_MSG(extack, "New ESN size doesn't match the existing SA's ESN size");
                return -EINVAL;
        }

        if (replay_esn->bmp_len != up->bmp_len) {
                NL_SET_ERR_MSG(extack, "New ESN bitmap size doesn't match the existing SA's ESN bitmap");
                return -EINVAL;
        }

        if (up->replay_window > up->bmp_len * sizeof(__u32) * 8) {
                NL_SET_ERR_MSG(extack, "ESN replay window is longer than the bitmap");
                return -EINVAL;
        }

        return 0;
}

static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn,
                                       struct xfrm_replay_state_esn **preplay_esn,
                                       struct nlattr *rta)
{
        struct xfrm_replay_state_esn *p, *pp, *up;
        unsigned int klen, ulen;

        if (!rta)
                return 0;

        up = nla_data(rta);
        klen = xfrm_replay_state_esn_len(up);
        ulen = nla_len(rta) >= (int)klen ? klen : sizeof(*up);

        p = kzalloc(klen, GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        pp = kzalloc(klen, GFP_KERNEL);
        if (!pp) {
                kfree(p);
                return -ENOMEM;
        }

        memcpy(p, up, ulen);
        memcpy(pp, up, ulen);

        *replay_esn = p;
        *preplay_esn = pp;

        return 0;
}

static inline unsigned int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx)
{
        unsigned int len = 0;

        if (xfrm_ctx) {
                len += sizeof(struct xfrm_user_sec_ctx);
                len += xfrm_ctx->ctx_len;
        }
        return len;
}

/*
 * Initialise the basic fields of state @x from the user-supplied SA
 * description @p.
 */
static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
{
        memcpy(&x->id, &p->id, sizeof(x->id));
        memcpy(&x->sel, &p->sel, sizeof(x->sel));
        memcpy(&x->lft, &p->lft, sizeof(x->lft));
        memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr));

        x->props.family = p->family;
        x->props.mode = p->mode;
        x->props.reqid = p->reqid;
        x->props.flags = p->flags;
        /* Clamp the window to the number of bits in the replay bitmap. */
        x->props.replay_window = min_t(unsigned int, p->replay_window,
                                        sizeof(x->replay.bitmap) * 8);

        /*
         * A selector without a family inherits the SA family, unless the
         * caller set XFRM_STATE_AF_UNSPEC.
         */
        if (x->sel.family || (p->flags & XFRM_STATE_AF_UNSPEC))
                return;

        x->sel.family = p->family;
}

/*
 * Apply the replay, lifetime and timer attributes present in @attrs to
 * state @x.  The ESN replay attribute is only considered when @update_esn
 * is non-zero and the state already has both ESN buffers.
 *
 * Someday, when pfkey also has support, this code could somehow be made
 * shareable and moved to xfrm_state.c - JHS
 */
static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs,
                                  int update_esn)
{
        struct nlattr *esn_attr = NULL;
        struct nlattr *replay_attr = attrs[XFRMA_REPLAY_VAL];
        struct nlattr *ltime_attr = attrs[XFRMA_LTIME_VAL];
        struct nlattr *etimer_attr = attrs[XFRMA_ETIMER_THRESH];
        struct nlattr *rthresh_attr = attrs[XFRMA_REPLAY_THRESH];
        struct nlattr *mtimer_attr = attrs[XFRMA_MTIMER_THRESH];

        if (update_esn)
                esn_attr = attrs[XFRMA_REPLAY_ESN_VAL];

        if (esn_attr && x->replay_esn && x->preplay_esn) {
                struct xfrm_replay_state_esn *src = nla_data(esn_attr);
                unsigned int len = xfrm_replay_state_esn_len(src);

                memcpy(x->replay_esn, src, len);
                memcpy(x->preplay_esn, src, len);
        }

        if (replay_attr) {
                struct xfrm_replay_state *src = nla_data(replay_attr);

                memcpy(&x->replay, src, sizeof(*src));
                memcpy(&x->preplay, src, sizeof(*src));
        }

        if (ltime_attr) {
                struct xfrm_lifetime_cur *cur = nla_data(ltime_attr);

                x->curlft.bytes = cur->bytes;
                x->curlft.packets = cur->packets;
                x->curlft.add_time = cur->add_time;
                x->curlft.use_time = cur->use_time;
        }

        if (etimer_attr)
                x->replay_maxage = nla_get_u32(etimer_attr);

        if (rthresh_attr)
                x->replay_maxdiff = nla_get_u32(rthresh_attr);

        if (mtimer_attr)
                x->mapping_maxage = nla_get_u32(mtimer_attr);
}

static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m)
{
        if (attrs[XFRMA_SET_MARK]) {
                m->v = nla_get_u32(attrs[XFRMA_SET_MARK]);
                m->m = nla_get_u32_default(attrs[XFRMA_SET_MARK_MASK],
                                           0xffffffff);
        } else {
                m->v = m->m = 0;
        }
}

/*
 * Build a new xfrm_state from a userspace SA description.
 *
 * Allocates a state in @net, fills it from the fixed header @p and from
 * the optional netlink attributes in @attrs, then initialises it.
 *
 * Returns the new state on success.  On failure returns NULL and stores a
 * negative errno in *@errp; the partially built state, if any, is marked
 * dead and its reference dropped.  @extack is handed to the helpers and
 * receives a message on some failure paths.
 */
static struct xfrm_state *xfrm_state_construct(struct net *net,
                                               struct xfrm_usersa_info *p,
                                               struct nlattr **attrs,
                                               int *errp,
                                               struct netlink_ext_ack *extack)
{
        struct xfrm_state *x = xfrm_state_alloc(net);
        /* Default error for the allocation failures up to the first attach_*() */
        int err = -ENOMEM;

        /* Nothing was allocated, so there is no reference to drop. */
        if (!x)
                goto error_no_put;

        copy_from_user_state(x, p);

        if (attrs[XFRMA_ENCAP]) {
                x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]),
                                   sizeof(*x->encap), GFP_KERNEL);
                if (x->encap == NULL)
                        goto error;
        }

        if (attrs[XFRMA_COADDR]) {
                x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]),
                                    sizeof(*x->coaddr), GFP_KERNEL);
                if (x->coaddr == NULL)
                        goto error;
        }

        if (attrs[XFRMA_SA_EXTRA_FLAGS])
                x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]);

        /* Each attach_*() is a no-op returning 0 when its attribute is absent. */
        if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD], extack)))
                goto error;
        if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo,
                                     attrs[XFRMA_ALG_AUTH_TRUNC], extack)))
                goto error;
        /* XFRMA_ALG_AUTH is only used if AUTH_TRUNC did not set an algorithm. */
        if (!x->props.aalgo) {
                if ((err = attach_auth(&x->aalg, &x->props.aalgo,
                                       attrs[XFRMA_ALG_AUTH], extack)))
                        goto error;
        }
        if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT], extack)))
                goto error;
        if ((err = attach_one_algo(&x->calg, &x->props.calgo,
                                   xfrm_calg_get_byname,
                                   attrs[XFRMA_ALG_COMP], extack)))
                goto error;

        if (attrs[XFRMA_TFCPAD])
                x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]);

        xfrm_mark_get(attrs, &x->mark);

        xfrm_smark_init(attrs, &x->props.smark);

        if (attrs[XFRMA_IF_ID])
                x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        if (attrs[XFRMA_SA_DIR])
                x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]);

        if (attrs[XFRMA_NAT_KEEPALIVE_INTERVAL])
                x->nat_keepalive_interval =
                        nla_get_u32(attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]);

        if (attrs[XFRMA_SA_PCPU]) {
                x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]);
                /* Reject CPU numbers at or above num_possible_cpus(). */
                if (x->pcpu_num >= num_possible_cpus()) {
                        err = -ERANGE;
                        NL_SET_ERR_MSG(extack, "pCPU number too big");
                        goto error;
                }
        }

        err = __xfrm_init_state(x, extack);
        if (err)
                goto error;

        if (attrs[XFRMA_SEC_CTX]) {
                err = security_xfrm_state_alloc(x,
                                                nla_data(attrs[XFRMA_SEC_CTX]));
                if (err)
                        goto error;
        }

        if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
                                               attrs[XFRMA_REPLAY_ESN_VAL])))
                goto error;

        /* Defaults from the request header and the per-netns sysctls. */
        x->km.seq = p->seq;
        x->replay_maxdiff = net->xfrm.sysctl_aevent_rseqth;
        /* sysctl_xfrm_aevent_etime is in 100ms units */
        x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M;

        if ((err = xfrm_init_replay(x, extack)))
                goto error;

        /* override default values from above */
        xfrm_update_ae_params(x, attrs, 0);

        xfrm_set_type_offload(x, attrs[XFRMA_OFFLOAD_DEV]);
        /* configure the hardware if offload is requested */
        if (attrs[XFRMA_OFFLOAD_DEV]) {
                err = xfrm_dev_state_add(net, x,
                                         nla_data(attrs[XFRMA_OFFLOAD_DEV]),
                                         extack);
                if (err)
                        goto error;
        }

        /* Optional mode-specific initialisation from the attributes. */
        if (x->mode_cbs && x->mode_cbs->user_init) {
                err = x->mode_cbs->user_init(net, x, attrs, extack);
                if (err)
                        goto error;
        }

        return x;

error:
        /* Mark the partially built state dead and drop our reference. */
        x->km.state = XFRM_STATE_DEAD;
        xfrm_state_put(x);
error_no_put:
        *errp = err;
        return NULL;
}

static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
                       struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_usersa_info *p = nlmsg_data(nlh);
        struct xfrm_state *x;
        int err;
        struct km_event c;

        err = verify_newsa_info(p, attrs, extack);
        if (err)
                return err;

        x = xfrm_state_construct(net, p, attrs, &err, extack);
        if (!x)
                return err;

        xfrm_state_hold(x);
        if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
                err = xfrm_state_add(x);
        else
                err = xfrm_state_update(x);

        xfrm_audit_state_add(x, err ? 0 : 1, true);

        if (err < 0) {
                x->km.state = XFRM_STATE_DEAD;
                xfrm_dev_state_delete(x);
                __xfrm_state_put(x);
                goto out;
        }

        if (x->km.state == XFRM_STATE_VOID)
                x->km.state = XFRM_STATE_VALID;

        c.seq = nlh->nlmsg_seq;
        c.portid = nlh->nlmsg_pid;
        c.event = nlh->nlmsg_type;

        km_state_notify(x, &c);
out:
        xfrm_state_put(x);
        return err;
}

static struct xfrm_state *xfrm_user_state_lookup(struct net *net,
                                                 struct xfrm_usersa_id *p,
                                                 struct nlattr **attrs,
                                                 int *errp)
{
        struct xfrm_state *x = NULL;
        struct xfrm_mark m;
        int err;
        u32 mark = xfrm_mark_get(attrs, &m);

        if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY)) {
                err = -ESRCH;
                x = xfrm_state_lookup(net, mark, &p->daddr, p->spi, p->proto, p->family);
        } else {
                xfrm_address_t *saddr = NULL;

                verify_one_addr(attrs, XFRMA_SRCADDR, &saddr);
                if (!saddr) {
                        err = -EINVAL;
                        goto out;
                }

                err = -ESRCH;
                x = xfrm_state_lookup_byaddr(net, mark,
                                             &p->daddr, saddr,
                                             p->proto, p->family);
        }

 out:
        if (!x && errp)
                *errp = err;
        return x;
}

static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
                       struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        int err = -ESRCH;
        struct km_event c;
        struct xfrm_usersa_id *p = nlmsg_data(nlh);

        x = xfrm_user_state_lookup(net, p, attrs, &err);
        if (x == NULL)
                return err;

        if ((err = security_xfrm_state_delete(x)) != 0)
                goto out;

        if (xfrm_state_kern(x)) {
                NL_SET_ERR_MSG(extack, "SA is in use by tunnels");
                err = -EPERM;
                goto out;
        }

        err = xfrm_state_delete(x);
        if (err < 0)
                goto out;

        c.seq = nlh->nlmsg_seq;
        c.portid = nlh->nlmsg_pid;
        c.event = nlh->nlmsg_type;
        km_state_notify(x, &c);

out:
        xfrm_audit_state_delete(x, err ? 0 : 1, true);
        xfrm_state_put(x);
        return err;
}

static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
{
        memset(p, 0, sizeof(*p));
        memcpy(&p->id, &x->id, sizeof(p->id));
        memcpy(&p->sel, &x->sel, sizeof(p->sel));
        memcpy(&p->lft, &x->lft, sizeof(p->lft));
        if (x->xso.dev)
                xfrm_dev_state_update_stats(x);
        memcpy(&p->curlft, &x->curlft, sizeof(p->curlft));
        put_unaligned(x->stats.replay_window, &p->stats.replay_window);
        put_unaligned(x->stats.replay, &p->stats.replay);
        put_unaligned(x->stats.integrity_failed, &p->stats.integrity_failed);
        memcpy(&p->saddr, &x->props.saddr, sizeof(p->saddr));
        p->mode = x->props.mode;
        p->replay_window = x->props.replay_window;
        p->reqid = x->props.reqid;
        p->family = x->props.family;
        p->flags = x->props.flags;
        p->seq = x->km.seq;
}

/* Context passed to dump_one_state() through its opaque pointer argument. */
struct xfrm_dump_info {
        /* Request skb; its NETLINK_CB portid is used for the reply header. */
        struct sk_buff *in_skb;
        /* skb the XFRM_MSG_NEWSA message is written into. */
        struct sk_buff *out_skb;
        /* Sequence number placed in the emitted netlink header. */
        u32 nlmsg_seq;
        /* Flags placed in the emitted netlink header. */
        u16 nlmsg_flags;
};

/*
 * Append security context @s to @skb as an XFRMA_SEC_CTX attribute:
 * a struct xfrm_user_sec_ctx header immediately followed by the context
 * string.  Returns 0 or -EMSGSIZE when the attribute does not fit.
 */
static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb)
{
        int total = sizeof(struct xfrm_user_sec_ctx) + s->ctx_len;
        struct xfrm_user_sec_ctx *uctx;
        struct nlattr *nla;

        nla = nla_reserve(skb, XFRMA_SEC_CTX, total);
        if (!nla)
                return -EMSGSIZE;

        uctx = nla_data(nla);
        uctx->len = total;
        uctx->exttype = XFRMA_SEC_CTX;
        uctx->ctx_alg = s->ctx_alg;
        uctx->ctx_doi = s->ctx_doi;
        uctx->ctx_len = s->ctx_len;
        /* The context string lives right behind the header. */
        memcpy(uctx + 1, s->ctx_str, s->ctx_len);

        return 0;
}

/*
 * Append the offload description @xso to @skb as an XFRMA_OFFLOAD_DEV
 * attribute.  Returns 0 or -EMSGSIZE when the attribute does not fit.
 */
static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb)
{
        struct xfrm_user_offload *uo;
        struct nlattr *nla;

        nla = nla_reserve(skb, XFRMA_OFFLOAD_DEV, sizeof(*uo));
        if (!nla)
                return -EMSGSIZE;

        uo = nla_data(nla);
        /* Start from zero so that the flags can simply be OR-ed in. */
        memset(uo, 0, sizeof(*uo));
        uo->ifindex = xso->dev->ifindex;

        if (xso->dir == XFRM_DEV_OFFLOAD_IN)
                uo->flags |= XFRM_OFFLOAD_INBOUND;
        if (xso->type == XFRM_DEV_OFFLOAD_PACKET)
                uo->flags |= XFRM_OFFLOAD_PACKET;

        return 0;
}

static bool xfrm_redact(void)
{
        return IS_ENABLED(CONFIG_SECURITY) &&
                security_locked_down(LOCKDOWN_XFRM_SECRET);
}

static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
{
        struct xfrm_algo *algo;
        struct xfrm_algo_auth *ap;
        struct nlattr *nla;
        bool redact_secret = xfrm_redact();

        nla = nla_reserve(skb, XFRMA_ALG_AUTH,
                          sizeof(*algo) + (auth->alg_key_len + 7) / 8);
        if (!nla)
                return -EMSGSIZE;
        algo = nla_data(nla);
        strscpy_pad(algo->alg_name, auth->alg_name);

        if (redact_secret && auth->alg_key_len)
                memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8);
        else
                memcpy(algo->alg_key, auth->alg_key,
                       (auth->alg_key_len + 7) / 8);
        algo->alg_key_len = auth->alg_key_len;

        nla = nla_reserve(skb, XFRMA_ALG_AUTH_TRUNC, xfrm_alg_auth_len(auth));
        if (!nla)
                return -EMSGSIZE;
        ap = nla_data(nla);
        strscpy_pad(ap->alg_name, auth->alg_name);
        ap->alg_key_len = auth->alg_key_len;
        ap->alg_trunc_len = auth->alg_trunc_len;
        if (redact_secret && auth->alg_key_len)
                memset(ap->alg_key, 0, (auth->alg_key_len + 7) / 8);
        else
                memcpy(ap->alg_key, auth->alg_key,
                       (auth->alg_key_len + 7) / 8);
        return 0;
}

static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb)
{
        struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_AEAD, aead_len(aead));
        struct xfrm_algo_aead *ap;
        bool redact_secret = xfrm_redact();

        if (!nla)
                return -EMSGSIZE;

        ap = nla_data(nla);
        strscpy_pad(ap->alg_name, aead->alg_name);
        ap->alg_key_len = aead->alg_key_len;
        ap->alg_icv_len = aead->alg_icv_len;

        if (redact_secret && aead->alg_key_len)
                memset(ap->alg_key, 0, (aead->alg_key_len + 7) / 8);
        else
                memcpy(ap->alg_key, aead->alg_key,
                       (aead->alg_key_len + 7) / 8);
        return 0;
}

static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb)
{
        struct xfrm_algo *ap;
        bool redact_secret = xfrm_redact();
        struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_CRYPT,
                                         xfrm_alg_len(ealg));
        if (!nla)
                return -EMSGSIZE;

        ap = nla_data(nla);
        strscpy_pad(ap->alg_name, ealg->alg_name);
        ap->alg_key_len = ealg->alg_key_len;

        if (redact_secret && ealg->alg_key_len)
                memset(ap->alg_key, 0, (ealg->alg_key_len + 7) / 8);
        else
                memcpy(ap->alg_key, ealg->alg_key,
                       (ealg->alg_key_len + 7) / 8);

        return 0;
}

static int copy_to_user_calg(struct xfrm_algo *calg, struct sk_buff *skb)
{
        struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_COMP, sizeof(*calg));
        struct xfrm_algo *ap;

        if (!nla)
                return -EMSGSIZE;

        ap = nla_data(nla);
        strscpy_pad(ap->alg_name, calg->alg_name);
        ap->alg_key_len = 0;

        return 0;
}

static int copy_to_user_encap(struct xfrm_encap_tmpl *ep, struct sk_buff *skb)
{
        struct nlattr *nla = nla_reserve(skb, XFRMA_ENCAP, sizeof(*ep));
        struct xfrm_encap_tmpl *uep;

        if (!nla)
                return -EMSGSIZE;

        uep = nla_data(nla);
        memset(uep, 0, sizeof(*uep));

        uep->encap_type = ep->encap_type;
        uep->encap_sport = ep->encap_sport;
        uep->encap_dport = ep->encap_dport;
        uep->encap_oa = ep->encap_oa;

        return 0;
}

/*
 * Append the "set mark" @m to @skb as XFRMA_SET_MARK followed by
 * XFRMA_SET_MARK_MASK.  Nothing is emitted when both value and mask are
 * zero.  Returns 0 or the error from nla_put_u32().
 */
static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m)
{
        int ret;

        if (!(m->v | m->m))
                return 0;

        ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v);
        if (ret)
                return ret;

        return nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m);
}

/*
 * Fill @p with the fixed SA description of @x and append every optional
 * attribute of @x to @skb.
 *
 * Returns 0 on success, or the error of the first attribute that could
 * not be added; no further attributes are attempted after a failure.
 *
 * Don't change this without updating xfrm_sa_len!
 */
static int copy_to_user_state_extra(struct xfrm_state *x,
                                    struct xfrm_usersa_info *p,
                                    struct sk_buff *skb)
{
        int ret = 0;

        copy_to_user_state(x, p);

        if (x->props.extra_flags) {
                ret = nla_put_u32(skb, XFRMA_SA_EXTRA_FLAGS,
                                  x->props.extra_flags);
                if (ret)
                        goto out;
        }

        if (x->coaddr) {
                ret = nla_put(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr);
                if (ret)
                        goto out;
        }
        if (x->lastused) {
                ret = nla_put_u64_64bit(skb, XFRMA_LASTUSED, x->lastused,
                                        XFRMA_PAD);
                if (ret)
                        goto out;
        }
        if (x->aead) {
                ret = copy_to_user_aead(x->aead, skb);
                if (ret)
                        goto out;
        }
        if (x->aalg) {
                ret = copy_to_user_auth(x->aalg, skb);
                if (ret)
                        goto out;
        }
        if (x->ealg) {
                ret = copy_to_user_ealg(x->ealg, skb);
                if (ret)
                        goto out;
        }
        if (x->calg) {
                ret = copy_to_user_calg(x->calg, skb);
                if (ret)
                        goto out;
        }
        if (x->encap) {
                ret = copy_to_user_encap(x->encap, skb);
                if (ret)
                        goto out;
        }
        if (x->tfcpad) {
                ret = nla_put_u32(skb, XFRMA_TFCPAD, x->tfcpad);
                if (ret)
                        goto out;
        }
        ret = xfrm_mark_put(skb, &x->mark);
        if (ret)
                goto out;

        ret = xfrm_smark_put(skb, &x->props.smark);
        if (ret)
                goto out;

        /* Exactly one replay attribute is emitted: ESN if present, else legacy. */
        if (x->replay_esn)
                ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
                              xfrm_replay_state_esn_len(x->replay_esn),
                              x->replay_esn);
        else
                ret = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay),
                              &x->replay);
        if (ret)
                goto out;
        if (x->xso.dev) {
                ret = copy_user_offload(&x->xso, skb);
                if (ret)
                        goto out;
        }
        if (x->if_id) {
                ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id);
                if (ret)
                        goto out;
        }
        if (x->security) {
                ret = copy_sec_ctx(x->security, skb);
                if (ret)
                        goto out;
        }
        if (x->mode_cbs && x->mode_cbs->copy_to_user) {
                ret = x->mode_cbs->copy_to_user(x, skb);
                if (ret)
                        goto out;
        }
        if (x->mapping_maxage) {
                ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage);
                if (ret)
                        goto out;
        }
        /* UINT_MAX is treated as "no per-CPU number set": nothing is emitted. */
        if (x->pcpu_num != UINT_MAX) {
                ret = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num);
                if (ret)
                        goto out;
        }
        if (x->dir) {
                ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
                /* Stop here so a later attribute cannot overwrite this error. */
                if (ret)
                        goto out;
        }

        if (x->nat_keepalive_interval) {
                ret = nla_put_u32(skb, XFRMA_NAT_KEEPALIVE_INTERVAL,
                                  x->nat_keepalive_interval);
                if (ret)
                        goto out;
        }
out:
        return ret;
}

/*
 * Write state @x as one XFRM_MSG_NEWSA message into the output skb of the
 * struct xfrm_dump_info passed in @ptr.  @count is unused.
 *
 * If a translator is registered, its alloc_compat() hook is run on the
 * finished message as well.  On any failure the message is cancelled and
 * the error returned; -EMSGSIZE is returned when not even the header
 * fits.
 */
static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
{
        struct xfrm_dump_info *info = ptr;
        struct sk_buff *out = info->out_skb;
        struct xfrm_translator *xtr;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(out, NETLINK_CB(info->in_skb).portid, info->nlmsg_seq,
                        XFRM_MSG_NEWSA, sizeof(struct xfrm_usersa_info),
                        info->nlmsg_flags);
        if (!nlh)
                return -EMSGSIZE;

        err = copy_to_user_state_extra(x, nlmsg_data(nlh), out);
        if (err)
                goto cancel;

        nlmsg_end(out, nlh);

        xtr = xfrm_get_translator();
        if (!xtr)
                return 0;

        err = xtr->alloc_compat(out, nlh);
        xfrm_put_translator(xtr);
        if (!err)
                return 0;

cancel:
        nlmsg_cancel(out, nlh);
        return err;
}

/*
 * Completion callback of the SA dump.  The walk state lives in
 * cb->args[1..]; it is torn down only if cb->args[0] shows that the walk
 * was initialised.  Always returns 0.
 */
static int xfrm_dump_sa_done(struct netlink_callback *cb)
{
        struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1];
        struct net *net = sock_net(cb->skb->sk);

        if (!cb->args[0])
                return 0;

        xfrm_state_walk_done(walk, net);
        return 0;
}

static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1];
        struct xfrm_dump_info info;

        BUILD_BUG_ON(sizeof(struct xfrm_state_walk) >
                     sizeof(cb->args) - sizeof(cb->args[0]));

        info.in_skb = cb->skb;
        info.out_skb = skb;
        info.nlmsg_seq = cb->nlh->nlmsg_seq;
        info.nlmsg_flags = NLM_F_MULTI;

        if (!cb->args[0]) {
                struct nlattr *attrs[XFRMA_MAX+1];
                struct xfrm_address_filter *filter = NULL;
                u8 proto = 0;
                int err;

                err = nlmsg_parse_deprecated(cb->nlh, 0, attrs, XFRMA_MAX,
                                             xfrma_policy, cb->extack);
                if (err < 0)
                        return err;

                if (attrs[XFRMA_ADDRESS_FILTER]) {
                        filter = kmemdup(nla_data(attrs[XFRMA_ADDRESS_FILTER]),
                                         sizeof(*filter), GFP_KERNEL);
                        if (filter == NULL)
                                return -ENOMEM;

                        /* see addr_match(), (prefix length >> 5) << 2
                         * will be used to compare xfrm_address_t
                         */
                        if (filter->splen > (sizeof(xfrm_address_t) << 3) ||
                            filter->dplen > (sizeof(xfrm_address_t) << 3)) {
                                kfree(filter);
                                return -EINVAL;
                        }
                }

                if (attrs[XFRMA_PROTO])
                        proto = nla_get_u8(attrs[XFRMA_PROTO]);

                xfrm_state_walk_init(walk, proto, filter);
                cb->args[0] = 1;
        }

        (void) xfrm_state_walk(net, walk, dump_one_state, &info);

        return skb->len;
}

/*
 * Build a fresh netlink skb describing state @x, addressed using the
 * portid of @in_skb and carrying sequence number @seq.
 *
 * Returns the skb, or an ERR_PTR(): -ENOMEM if the skb cannot be
 * allocated, otherwise the error from dump_one_state().
 */
static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb,
                                          struct xfrm_state *x, u32 seq)
{
        struct xfrm_dump_info info;
        struct sk_buff *reply;
        int err;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (reply == NULL)
                return ERR_PTR(-ENOMEM);

        info.nlmsg_flags = 0;
        info.nlmsg_seq = seq;
        info.out_skb = reply;
        info.in_skb = in_skb;

        err = dump_one_state(x, 0, &info);
        if (!err)
                return reply;

        kfree_skb(reply);
        return ERR_PTR(err);
}

/*
 * Wrapper around nlmsg_multicast() that first checks that the per-netns
 * xfrm netlink socket is still available.  Must be called with the RCU
 * read lock held.
 *
 * On -EPIPE (no socket) or a failing alloc_compat() hook, @skb is freed
 * here before the error is returned.
 */
static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb,
                                       u32 pid, unsigned int group)
{
        struct sock *nlsk = rcu_dereference(net->xfrm.nlsk);
        struct xfrm_translator *xtr;
        int err;

        if (nlsk == NULL) {
                err = -EPIPE;
                goto drop;
        }

        xtr = xfrm_get_translator();
        if (xtr != NULL) {
                err = xtr->alloc_compat(skb, nlmsg_hdr(skb));
                xfrm_put_translator(xtr);
                if (err)
                        goto drop;
        }

        return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);

drop:
        kfree_skb(skb);
        return err;
}

/*
 * Payload size of an SPD info message: the aligned 4-byte flags word
 * followed by the SPD counters, the hash info and the two hash-threshold
 * attributes.
 */
static inline unsigned int xfrm_spdinfo_msgsize(void)
{
        unsigned int size = NLMSG_ALIGN(4);

        size += nla_total_size(sizeof(struct xfrmu_spdinfo));
        size += nla_total_size(sizeof(struct xfrmu_spdhinfo));
        /* One threshold attribute each for IPv4 and IPv6. */
        size += nla_total_size(sizeof(struct xfrmu_spdhthresh));
        size += nla_total_size(sizeof(struct xfrmu_spdhthresh));

        return size;
}

/*
 * Fill @skb with an XFRM_MSG_NEWSPDINFO message for @net: a u32 holding
 * @flags followed by the SPD counters, the hash info and the IPv4/IPv6
 * hash thresholds.
 *
 * Returns 0, -EMSGSIZE if the header does not fit, or the nla_put() error
 * (in which case the message is cancelled).
 */
static int build_spdinfo(struct sk_buff *skb, struct net *net,
                         u32 portid, u32 seq, u32 flags)
{
        struct xfrmu_spdhthresh thresh4, thresh6;
        struct xfrmu_spdhinfo hinfo;
        struct xfrmu_spdinfo counts;
        struct xfrmk_spdinfo si;
        struct nlmsghdr *nlh;
        unsigned int lseq;
        u32 *payload;
        int err;

        nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0);
        if (!nlh) /* shouldn't really happen ... */
                return -EMSGSIZE;

        payload = nlmsg_data(nlh);
        *payload = flags;

        xfrm_spd_getinfo(net, &si);

        counts.incnt = si.incnt;
        counts.outcnt = si.outcnt;
        counts.fwdcnt = si.fwdcnt;
        counts.inscnt = si.inscnt;
        counts.outscnt = si.outscnt;
        counts.fwdscnt = si.fwdscnt;

        hinfo.spdhcnt = si.spdhcnt;
        hinfo.spdhmcnt = si.spdhmcnt;

        /* Retry until all four thresholds are read without a concurrent write. */
        do {
                lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

                thresh4.lbits = net->xfrm.policy_hthresh.lbits4;
                thresh4.rbits = net->xfrm.policy_hthresh.rbits4;
                thresh6.lbits = net->xfrm.policy_hthresh.lbits6;
                thresh6.rbits = net->xfrm.policy_hthresh.rbits6;
        } while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq));

        err = nla_put(skb, XFRMA_SPD_INFO, sizeof(counts), &counts);
        if (err)
                goto cancel;
        err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(hinfo), &hinfo);
        if (err)
                goto cancel;
        err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(thresh4), &thresh4);
        if (err)
                goto cancel;
        err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(thresh6), &thresh6);
        if (err)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return err;
}

/*
 * Netlink handler that updates the per-netns policy hash thresholds.
 *
 * Either or both of XFRMA_SPD_IPV4_HTHRESH / XFRMA_SPD_IPV6_HTHRESH may be
 * supplied.  Both are validated before anything is written, so a bad request
 * leaves the current thresholds untouched.  When at least one threshold was
 * given, the new values are stored under the hthresh seqlock and a policy
 * hash rebuild is requested.
 *
 * Returns 0 on success or -EINVAL (with an extack message) on a malformed
 * attribute.
 */
static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs,
                            struct netlink_ext_ack *extack)
{
        struct nlattr *attr4 = attrs[XFRMA_SPD_IPV4_HTHRESH];
        struct nlattr *attr6 = attrs[XFRMA_SPD_IPV6_HTHRESH];
        struct xfrmu_spdhthresh *thresh4 = NULL;
        struct xfrmu_spdhthresh *thresh6 = NULL;
        struct net *net = sock_net(skb->sk);

        /* selector prefixlen thresholds to hash policies */
        if (attr4) {
                if (nla_len(attr4) < sizeof(*thresh4)) {
                        NL_SET_ERR_MSG(extack, "Invalid SPD_IPV4_HTHRESH attribute length");
                        return -EINVAL;
                }

                thresh4 = nla_data(attr4);
                if (thresh4->lbits > 32 || thresh4->rbits > 32) {
                        NL_SET_ERR_MSG(extack, "Invalid hash threshold (must be <= 32 for IPv4)");
                        return -EINVAL;
                }
        }

        if (attr6) {
                if (nla_len(attr6) < sizeof(*thresh6)) {
                        NL_SET_ERR_MSG(extack, "Invalid SPD_IPV6_HTHRESH attribute length");
                        return -EINVAL;
                }

                thresh6 = nla_data(attr6);
                if (thresh6->lbits > 128 || thresh6->rbits > 128) {
                        NL_SET_ERR_MSG(extack, "Invalid hash threshold (must be <= 128 for IPv6)");
                        return -EINVAL;
                }
        }

        /* Nothing supplied: succeed without touching state. */
        if (!thresh4 && !thresh6)
                return 0;

        write_seqlock(&net->xfrm.policy_hthresh.lock);
        if (thresh4) {
                net->xfrm.policy_hthresh.lbits4 = thresh4->lbits;
                net->xfrm.policy_hthresh.rbits4 = thresh4->rbits;
        }
        if (thresh6) {
                net->xfrm.policy_hthresh.lbits6 = thresh6->lbits;
                net->xfrm.policy_hthresh.rbits6 = thresh6->rbits;
        }
        write_sequnlock(&net->xfrm.policy_hthresh.lock);

        xfrm_policy_hash_rebuild(net);

        return 0;
}

/*
 * Netlink handler answering an SPD info query.
 *
 * Allocates a reply sized by xfrm_spdinfo_msgsize(), fills it with
 * build_spdinfo() (echoing the u32 flags word from the request payload) and
 * unicasts it back to the requesting port.
 *
 * Returns -ENOMEM if the reply cannot be allocated, the build_spdinfo()
 * error if the reply could not be built, otherwise the result of
 * nlmsg_unicast().
 */
static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs,
                            struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct sk_buff *r_skb;
        u32 *flags = nlmsg_data(nlh);
        u32 sportid = NETLINK_CB(skb).portid;
        u32 seq = nlh->nlmsg_seq;
        int err;

        r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC);
        if (r_skb == NULL)
                return -ENOMEM;

        err = build_spdinfo(r_skb, net, sportid, seq, *flags);
        /*
         * The reply was sized for exactly this message, so a failure here
         * means the size helper and the builder disagree.  Warn loudly but
         * do not take the machine down: release the reply and report the
         * error to the caller instead.
         */
        if (WARN_ON(err < 0)) {
                kfree_skb(r_skb);
                return err;
        }

        return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);
}

/*
 * Payload size needed for an XFRM_MSG_NEWSADINFO reply: the aligned 4-byte
 * flags word plus the XFRMA_SAD_HINFO and XFRMA_SAD_CNT attributes.
 */
static inline unsigned int xfrm_sadinfo_msgsize(void)
{
        unsigned int size = NLMSG_ALIGN(4);

        size += nla_total_size(sizeof(struct xfrmu_sadhinfo));
        size += nla_total_size(4); /* XFRMA_SAD_CNT */

        return size;
}

/*
 * Build an XFRM_MSG_NEWSADINFO message in @skb describing the SA database
 * of @net.
 *
 * The payload is the caller-supplied @flags word, followed by the SA count
 * (XFRMA_SAD_CNT) and the SA hash counters (XFRMA_SAD_HINFO).
 *
 * Returns 0 on success, -EMSGSIZE when nlmsg_put() fails, or the attribute
 * put error after the partially built message has been cancelled.
 */
static int build_sadinfo(struct sk_buff *skb, struct net *net,
                         u32 portid, u32 seq, u32 flags)
{
        struct xfrmu_sadhinfo hinfo;
        struct xfrmk_sadinfo si;
        struct nlmsghdr *nlh;
        u32 *payload;
        int err;

        nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSADINFO, sizeof(u32), 0);
        if (!nlh) /* shouldn't really happen ... */
                return -EMSGSIZE;

        payload = nlmsg_data(nlh);
        *payload = flags;

        xfrm_sad_getinfo(net, &si);
        hinfo.sadhmcnt = si.sadhmcnt;
        hinfo.sadhcnt = si.sadhcnt;

        err = nla_put_u32(skb, XFRMA_SAD_CNT, si.sadcnt);
        if (err)
                goto cancel;

        err = nla_put(skb, XFRMA_SAD_HINFO, sizeof(hinfo), &hinfo);
        if (err)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return err;
}

/*
 * Netlink handler answering an SAD info query.
 *
 * Allocates a reply sized by xfrm_sadinfo_msgsize(), fills it with
 * build_sadinfo() (echoing the u32 flags word from the request payload) and
 * unicasts it back to the requesting port.
 *
 * Returns -ENOMEM if the reply cannot be allocated, the build_sadinfo()
 * error if the reply could not be built, otherwise the result of
 * nlmsg_unicast().
 */
static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs,
                            struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct sk_buff *r_skb;
        u32 *flags = nlmsg_data(nlh);
        u32 sportid = NETLINK_CB(skb).portid;
        u32 seq = nlh->nlmsg_seq;
        int err;

        r_skb = nlmsg_new(xfrm_sadinfo_msgsize(), GFP_ATOMIC);
        if (r_skb == NULL)
                return -ENOMEM;

        err = build_sadinfo(r_skb, net, sportid, seq, *flags);
        /*
         * The reply was sized for exactly this message, so a failure here
         * means the size helper and the builder disagree.  Warn loudly but
         * do not take the machine down: release the reply and report the
         * error to the caller instead.
         */
        if (WARN_ON(err < 0)) {
                kfree_skb(r_skb);
                return err;
        }

        return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);
}

/*
 * Netlink handler for fetching a single SA.
 *
 * Looks up the state identified by the request, serialises it with
 * xfrm_state_netlink() and unicasts the result to the requester.  The
 * reference obtained by the lookup is dropped before returning.
 *
 * Returns the lookup error (initialised to -ESRCH) when no state is found,
 * the serialisation error, or the result of nlmsg_unicast().
 */
static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
                       struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct xfrm_usersa_id *id = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct sk_buff *reply;
        struct xfrm_state *x;
        int err = -ESRCH;

        x = xfrm_user_state_lookup(net, id, attrs, &err);
        if (!x)
                return err;

        reply = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
        if (IS_ERR(reply))
                err = PTR_ERR(reply);
        else
                err = nlmsg_unicast(net->xfrm.nlsk, reply,
                                    NETLINK_CB(skb).portid);

        xfrm_state_put(x);
        return err;
}

/*
 * Netlink handler that allocates an SPI for a pending (ACQUIRE) state.
 *
 * The request payload is a struct xfrm_userspi_info carrying the SA
 * description and the [min, max] SPI range.  The handler locates the
 * matching state, allocates an SPI in the requested range, optionally
 * records the SA direction, and unicasts the resulting state back to the
 * requester.
 *
 * Returns 0 / the nlmsg_unicast() result on success, or a negative errno:
 * the verify_spi_info() error, -EINVAL for an out-of-range XFRMA_SA_PCPU,
 * -ENOENT when no target state is found, or an error from SPI allocation,
 * message construction or the compat translator.
 */
static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct nlattr **attrs,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        struct xfrm_userspi_info *p;
        struct xfrm_translator *xtr;
        struct sk_buff *resp_skb;
        xfrm_address_t *daddr;
        int family;
        int err;
        u32 mark;
        struct xfrm_mark m;
        u32 if_id = 0;
        /* UINT_MAX is the default passed to the lookups when no XFRMA_SA_PCPU is given. */
        u32 pcpu_num = UINT_MAX;

        p = nlmsg_data(nlh);
        err = verify_spi_info(p->info.id.proto, p->min, p->max, extack);
        if (err)
                goto out_noput;

        family = p->info.family;
        daddr = &p->info.id.daddr;

        x = NULL;

        mark = xfrm_mark_get(attrs, &m);

        if (attrs[XFRMA_IF_ID])
                if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        if (attrs[XFRMA_SA_PCPU]) {
                pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]);
                if (pcpu_num >= num_possible_cpus()) {
                        err = -EINVAL;
                        goto out_noput;
                }
        }

        /*
         * Prefer a lookup by acquire sequence number when one was supplied,
         * but discard the hit if its destination address does not match the
         * request.
         */
        if (p->info.seq) {
                x = xfrm_find_acq_byseq(net, mark, p->info.seq, pcpu_num);
                if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
                        xfrm_state_put(x);
                        x = NULL;
                }
        }

        /* Fall back to a lookup keyed on the SA parameters from the request. */
        if (!x)
                x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
                                  if_id, pcpu_num, p->info.id.proto, daddr,
                                  &p->info.saddr, 1,
                                  family);
        err = -ENOENT;
        if (!x) {
                NL_SET_ERR_MSG(extack, "Target ACQUIRE not found");
                goto out_noput;
        }

        err = xfrm_alloc_spi(x, p->min, p->max, extack);
        if (err)
                goto out;

        /* The direction is only recorded once SPI allocation has succeeded. */
        if (attrs[XFRMA_SA_DIR])
                x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]);

        resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
        if (IS_ERR(resp_skb)) {
                err = PTR_ERR(resp_skb);
                goto out;
        }

        /*
         * NOTE(review): alloc_compat() is invoked on the request skb and its
         * header, not on resp_skb -- confirm this is intended.
         */
        xtr = xfrm_get_translator();
        if (xtr) {
                err = xtr->alloc_compat(skb, nlmsg_hdr(skb));

                xfrm_put_translator(xtr);
                if (err) {
                        /* The reply is not sent on this path, so free it here. */
                        kfree_skb(resp_skb);
                        goto out;
                }
        }

        err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid);

out:
        /* Drop the reference taken by the lookup above. */
        xfrm_state_put(x);
out_noput:
        return err;
}

/*
 * Check that @dir is one of the three policy directions (in, out, fwd).
 *
 * Returns 0 when valid, otherwise sets an extack message and returns -EINVAL.
 */
static int verify_policy_dir(u8 dir, struct netlink_ext_ack *extack)
{
        if (dir == XFRM_POLICY_IN ||
            dir == XFRM_POLICY_OUT ||
            dir == XFRM_POLICY_FWD)
                return 0;

        NL_SET_ERR_MSG(extack, "Invalid policy direction");
        return -EINVAL;
}

/*
 * Check that @type is a supported policy type.  The "sub" type is only
 * accepted when CONFIG_XFRM_SUB_POLICY is enabled.
 *
 * Returns 0 when valid, otherwise sets an extack message and returns -EINVAL.
 */
static int verify_policy_type(u8 type, struct netlink_ext_ack *extack)
{
        if (type == XFRM_POLICY_TYPE_MAIN)
                return 0;
#ifdef CONFIG_XFRM_SUB_POLICY
        if (type == XFRM_POLICY_TYPE_SUB)
                return 0;
#endif

        NL_SET_ERR_MSG(extack, "Invalid policy type");
        return -EINVAL;
}

/*
 * Sanity-check the user-supplied description of a new policy.
 *
 * Checks, in order: the share mode, the action, the selector family with its
 * prefix lengths, the direction, and (when an index is given) that the index
 * encodes the same direction.
 *
 * Returns 0 when everything is acceptable.  Otherwise an extack message is
 * set and -EINVAL is returned, or -EAFNOSUPPORT for an IPv6 selector on a
 * kernel built without IPv6.
 */
static int verify_newpolicy_info(struct xfrm_userpolicy_info *p,
                                 struct netlink_ext_ack *extack)
{
        int ret;

        if (p->share != XFRM_SHARE_ANY &&
            p->share != XFRM_SHARE_SESSION &&
            p->share != XFRM_SHARE_USER &&
            p->share != XFRM_SHARE_UNIQUE) {
                NL_SET_ERR_MSG(extack, "Invalid policy share");
                return -EINVAL;
        }

        if (p->action != XFRM_POLICY_ALLOW &&
            p->action != XFRM_POLICY_BLOCK) {
                NL_SET_ERR_MSG(extack, "Invalid policy action");
                return -EINVAL;
        }

        if (p->sel.family == AF_INET) {
                if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) {
                        NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 32 for IPv4)");
                        return -EINVAL;
                }
        } else if (p->sel.family == AF_INET6) {
#if IS_ENABLED(CONFIG_IPV6)
                if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) {
                        NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 128 for IPv6)");
                        return -EINVAL;
                }
#else
                NL_SET_ERR_MSG(extack, "IPv6 support disabled");
                return -EAFNOSUPPORT;
#endif
        } else {
                NL_SET_ERR_MSG(extack, "Invalid selector family");
                return -EINVAL;
        }

        ret = verify_policy_dir(p->dir, extack);
        if (ret)
                return ret;

        /* A non-zero index must encode the same direction as p->dir. */
        if (p->index && xfrm_policy_id2dir(p->index) != p->dir) {
                NL_SET_ERR_MSG(extack, "Policy index doesn't match direction");
                return -EINVAL;
        }

        return 0;
}

static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs)
{
        struct nlattr *rt = attrs[XFRMA_SEC_CTX];
        struct xfrm_user_sec_ctx *uctx;

        if (!rt)
                return 0;

        uctx = nla_data(rt);
        return security_xfrm_policy_alloc(&pol->security, uctx, GFP_KERNEL);
}

/*
 * Copy @nr user-space templates from @ut into the template vector of @xp
 * and record the count in xp->xfrm_nr.
 *
 * No validation happens here; the only caller in view runs validate_tmpl()
 * on the array first.
 */
static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
                           int nr)
{
        int idx;

        xp->xfrm_nr = nr;

        for (idx = 0; idx < nr; idx++) {
                struct xfrm_user_tmpl *src = &ut[idx];
                struct xfrm_tmpl *dst = &xp->xfrm_vec[idx];

                memcpy(&dst->id, &src->id, sizeof(struct xfrm_id));
                memcpy(&dst->saddr, &src->saddr, sizeof(xfrm_address_t));

                dst->encap_family = src->family;
                dst->reqid = src->reqid;
                dst->mode = src->mode;
                dst->share = src->share;
                dst->optional = src->optional;

                dst->aalgos = src->aalgos;
                dst->ealgos = src->ealgos;
                dst->calgos = src->calgos;
                /* If all masks are ~0, then we allow all algorithms. */
                dst->allalgs = !~(dst->aalgos & dst->ealgos & dst->calgos);
        }
}

/*
 * Validate an array of @nr user-supplied templates belonging to a policy of
 * address family @family and direction @dir.
 *
 * Templates whose family field is zero are patched in place to @family.
 * Each template is then checked for: mode-specific constraints, mode range,
 * a supported address family and a valid XFRM protocol.
 *
 * Returns 0 when all templates are acceptable, otherwise sets an extack
 * message and returns -EINVAL.
 */
static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family,
                         int dir, struct netlink_ext_ack *extack)
{
        u16 prev_family = family;
        int idx;

        if (nr > XFRM_MAX_DEPTH) {
                NL_SET_ERR_MSG(extack, "Template count must be <= XFRM_MAX_DEPTH (" __stringify(XFRM_MAX_DEPTH) ")");
                return -EINVAL;
        }

        for (idx = 0; idx < nr; idx++) {
                struct xfrm_user_tmpl *t = &ut[idx];

                /*
                 * Historically the template family was never checked and
                 * was simply ignored, because every template shared the
                 * family of its policy; many applications therefore leave
                 * it at zero.  With ipv4-in-ipv6 and ipv6-in-ipv4 tunnels
                 * that assumption no longer holds, so default an unset
                 * family to the policy's.
                 */
                if (!t->family)
                        t->family = family;

                switch (t->mode) {
                case XFRM_MODE_TUNNEL:
                case XFRM_MODE_BEET:
                        if (t->optional && dir == XFRM_POLICY_OUT) {
                                NL_SET_ERR_MSG(extack, "Mode in optional template not allowed in outbound policy");
                                return -EINVAL;
                        }
                        break;
                case XFRM_MODE_IPTFS:
                        break;
                default:
                        /* Remaining modes must keep the preceding family. */
                        if (t->family != prev_family) {
                                NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change");
                                return -EINVAL;
                        }
                        break;
                }

                if (t->mode >= XFRM_MODE_MAX) {
                        NL_SET_ERR_MSG(extack, "Mode in template must be < XFRM_MODE_MAX (" __stringify(XFRM_MODE_MAX) ")");
                        return -EINVAL;
                }

                prev_family = t->family;

                switch (t->family) {
                case AF_INET:
#if IS_ENABLED(CONFIG_IPV6)
                case AF_INET6:
#endif
                        break;
                default:
                        NL_SET_ERR_MSG(extack, "Invalid family in template");
                        return -EINVAL;
                }

                if (!xfrm_id_proto_valid(t->id.proto)) {
                        NL_SET_ERR_MSG(extack, "Invalid XFRM protocol in template");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Populate the templates of @pol from the XFRMA_TMPL attribute.
 *
 * When the attribute is absent the policy ends up with zero templates.
 * Otherwise the template count is derived from the attribute length, the
 * array is validated and then copied into the policy.
 *
 * Returns 0 on success or the validate_tmpl() error.
 */
static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs,
                               int dir, struct netlink_ext_ack *extack)
{
        struct nlattr *tmpl_attr = attrs[XFRMA_TMPL];
        struct xfrm_user_tmpl *utmpl;
        int count;
        int err;

        if (!tmpl_attr) {
                pol->xfrm_nr = 0;
                return 0;
        }

        utmpl = nla_data(tmpl_attr);
        count = nla_len(tmpl_attr) / sizeof(*utmpl);

        err = validate_tmpl(count, utmpl, pol->family, dir, extack);
        if (err)
                return err;

        copy_templates(pol, utmpl, count);
        return 0;
}

/*
 * Determine the policy type requested by user space.
 *
 * The type comes from the XFRMA_POLICY_TYPE attribute when present and
 * defaults to XFRM_POLICY_TYPE_MAIN otherwise.  *@tp is only written once
 * the type has passed verify_policy_type().
 *
 * Returns 0 on success or the verification error.
 */
static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs,
                                      struct netlink_ext_ack *extack)
{
        struct nlattr *type_attr = attrs[XFRMA_POLICY_TYPE];
        u8 type = XFRM_POLICY_TYPE_MAIN;
        int err;

        if (type_attr) {
                struct xfrm_userpolicy_type *upt = nla_data(type_attr);

                type = upt->type;
        }

        err = verify_policy_type(type, extack);
        if (!err)
                *tp = type;

        return err;
}

/*
 * Initialise the kernel policy @xp from the user-space description @p:
 * selector, lifetime configuration, priority, index, action and flags.
 * The policy family is taken from the selector.  The share mode is not
 * copied.
 */
static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p)
{
        memcpy(&xp->selector, &p->sel, sizeof(xp->selector));
        memcpy(&xp->lft, &p->lft, sizeof(xp->lft));

        xp->family = p->sel.family;
        xp->priority = p->priority;
        xp->index = p->index;
        xp->action = p->action;
        xp->flags = p->flags;
        /* XXX xp->share = p->share; */
}

/*
 * Fill the user-space policy description @p from the kernel policy @xp for
 * direction @dir.
 *
 * @p is zeroed first, so every byte not explicitly set below is zero.  The
 * share mode is always reported as XFRM_SHARE_ANY.
 */
static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir)
{
        memset(p, 0, sizeof(*p));

        memcpy(&p->sel, &xp->selector, sizeof(p->sel));
        /* The reported selector family is the policy's family. */
        p->sel.family = xp->family;

        memcpy(&p->lft, &xp->lft, sizeof(p->lft));
        memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft));

        p->priority = xp->priority;
        p->index = xp->index;
        p->dir = dir;
        p->action = xp->action;
        p->flags = xp->flags;
        p->share = XFRM_SHARE_ANY; /* XXX xp->share */
}

/*
 * Allocate a policy for @net and initialise it from the user request @p and
 * its netlink attributes: policy type, templates, security context, mark,
 * interface id and, when requested, hardware offload.
 *
 * Returns the new policy on success.  On failure returns NULL, stores the
 * error in *@errp and, if a policy had been allocated, marks it dead and
 * destroys it.
 */
static struct xfrm_policy *xfrm_policy_construct(struct net *net,
                                                 struct xfrm_userpolicy_info *p,
                                                 struct nlattr **attrs,
                                                 int *errp,
                                                 struct netlink_ext_ack *extack)
{
        struct xfrm_policy *xp;
        int err;

        xp = xfrm_policy_alloc(net, GFP_KERNEL);
        if (xp == NULL) {
                *errp = -ENOMEM;
                return NULL;
        }

        copy_from_user_policy(xp, p);

        err = copy_from_user_policy_type(&xp->type, attrs, extack);
        if (err)
                goto fail;

        err = copy_from_user_tmpl(xp, attrs, p->dir, extack);
        if (err)
                goto fail;

        err = copy_from_user_sec_ctx(xp, attrs);
        if (err)
                goto fail;

        xfrm_mark_get(attrs, &xp->mark);

        if (attrs[XFRMA_IF_ID])
                xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        /* configure the hardware if offload is requested */
        if (attrs[XFRMA_OFFLOAD_DEV]) {
                err = xfrm_dev_policy_add(net, xp,
                                          nla_data(attrs[XFRMA_OFFLOAD_DEV]),
                                          p->dir, extack);
                if (err)
                        goto fail;
        }

        return xp;

fail:
        *errp = err;
        xp->walk.dead = 1;
        xfrm_policy_destroy(xp);
        return NULL;
}

/*
 * Netlink handler for adding a policy.
 *
 * Validates the request, constructs the policy, inserts it, emits an audit
 * record for the attempt and, on success, notifies key managers.
 *
 * Returns 0 on success or a negative errno from validation, construction or
 * insertion.  On an insertion failure the freshly built policy is torn down
 * here.
 */
static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
                           struct nlattr **attrs,
                           struct netlink_ext_ack *extack)
{
        struct xfrm_userpolicy_info *info = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct xfrm_policy *xp;
        struct km_event c;
        int excl;
        int err;

        err = verify_newpolicy_info(info, extack);
        if (!err)
                err = verify_sec_ctx_len(attrs, extack);
        if (err)
                return err;

        xp = xfrm_policy_construct(net, info, attrs, &err, extack);
        if (!xp)
                return err;

        /*
         * Exclusivity is derived from the message type rather than from
         * netlink header flags: XFRM_MSG_NEWPOLICY inserts exclusively, any
         * other type handled here does not.  This mirrors pfkey more than
         * netlink conventions, where a flag would make a separate
         * XFRM_MSG_UPDPOLICY type unnecessary (observation by JHS).
         */
        excl = (nlh->nlmsg_type == XFRM_MSG_NEWPOLICY);

        err = xfrm_policy_insert(info->dir, xp, excl);
        xfrm_audit_policy_add(xp, err ? 0 : 1, true);

        if (!err) {
                c.event = nlh->nlmsg_type;
                c.seq = nlh->nlmsg_seq;
                c.portid = nlh->nlmsg_pid;
                km_policy_notify(xp, info->dir, &c);

                xfrm_pol_put(xp);
                return 0;
        }

        /* Insertion failed: release everything the constructor set up. */
        xfrm_dev_policy_delete(xp);
        xfrm_dev_policy_free(xp);
        security_xfrm_policy_free(xp->security);
        kfree(xp);

        return err;
}

/*
 * Append the templates of @xp to @skb as a single XFRMA_TMPL attribute.
 *
 * Each kernel template is converted into a zero-initialised user-space
 * template on the stack before the whole array is emitted.
 *
 * Returns 0 when the policy has no templates, -ENOBUFS when it claims more
 * than XFRM_MAX_DEPTH, otherwise the result of nla_put().
 */
static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb)
{
        struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH];
        int idx;

        if (xp->xfrm_nr == 0)
                return 0;

        if (xp->xfrm_nr > XFRM_MAX_DEPTH)
                return -ENOBUFS;

        for (idx = 0; idx < xp->xfrm_nr; idx++) {
                struct xfrm_tmpl *src = &xp->xfrm_vec[idx];
                struct xfrm_user_tmpl *dst = &vec[idx];

                /* Clear first so nothing unset reaches user space. */
                memset(dst, 0, sizeof(*dst));

                memcpy(&dst->id, &src->id, sizeof(dst->id));
                memcpy(&dst->saddr, &src->saddr, sizeof(dst->saddr));

                dst->family = src->encap_family;
                dst->reqid = src->reqid;
                dst->mode = src->mode;
                dst->share = src->share;
                dst->optional = src->optional;
                dst->aalgos = src->aalgos;
                dst->ealgos = src->ealgos;
                dst->calgos = src->calgos;
        }

        return nla_put(skb, XFRMA_TMPL, sizeof(vec[0]) * xp->xfrm_nr, vec);
}

/*
 * Emit the security context of state @x into @skb, if it has one.
 * Returns 0 when there is no context, otherwise the copy_sec_ctx() result.
 */
static inline int copy_to_user_state_sec_ctx(struct xfrm_state *x, struct sk_buff *skb)
{
        if (!x->security)
                return 0;

        return copy_sec_ctx(x->security, skb);
}

/*
 * Emit the security context of policy @xp into @skb, if it has one.
 * Returns 0 when there is no context, otherwise the copy_sec_ctx() result.
 */
static inline int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
{
        if (!xp->security)
                return 0;

        return copy_sec_ctx(xp->security, skb);
}
/*
 * Space needed for the XFRMA_POLICY_TYPE attribute.  The attribute only
 * exists when CONFIG_XFRM_SUB_POLICY is enabled; otherwise the size is 0.
 */
static inline unsigned int userpolicy_type_attrsize(void)
{
        unsigned int size = 0;

#ifdef CONFIG_XFRM_SUB_POLICY
        size = nla_total_size(sizeof(struct xfrm_userpolicy_type));
#endif

        return size;
}

#ifdef CONFIG_XFRM_SUB_POLICY
/*
 * Append an XFRMA_POLICY_TYPE attribute carrying @type to @skb.
 * Returns the result of nla_put().
 */
static int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
{
        struct xfrm_userpolicy_type attr;

        /*
         * struct xfrm_userpolicy_type unfortunately has two holes, so clear
         * the whole structure before setting the one meaningful field.
         */
        memset(&attr, 0, sizeof(attr));
        attr.type = type;

        return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(attr), &attr);
}

#else
/* Without sub-policy support no policy type attribute is emitted. */
static inline int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
{
        return 0;
}
#endif

/*
 * Serialise one policy as an XFRM_MSG_NEWPOLICY message.
 *
 * @ptr is a struct xfrm_dump_info naming the request skb, the output skb and
 * the sequence number / flags to use.  @count is unused here.
 *
 * The message carries the policy description followed by its templates,
 * security context, policy type, mark, interface id and, when the policy
 * has an offload device, the offload attribute.  If a compat translator is
 * registered, its alloc_compat() hook is run on the finished message.
 *
 * Returns 0 on success, -EMSGSIZE when nlmsg_put() fails, or the error of
 * the failing step after the message has been cancelled.
 */
static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
{
        struct xfrm_dump_info *info = ptr;
        struct sk_buff *out = info->out_skb;
        struct xfrm_userpolicy_info *upi;
        struct xfrm_translator *xtr;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(out, NETLINK_CB(info->in_skb).portid, info->nlmsg_seq,
                        XFRM_MSG_NEWPOLICY, sizeof(*upi), info->nlmsg_flags);
        if (!nlh)
                return -EMSGSIZE;

        upi = nlmsg_data(nlh);
        copy_to_user_policy(xp, upi, dir);

        err = copy_to_user_tmpl(xp, out);
        if (err)
                goto cancel;

        err = copy_to_user_sec_ctx(xp, out);
        if (err)
                goto cancel;

        err = copy_to_user_policy_type(xp->type, out);
        if (err)
                goto cancel;

        err = xfrm_mark_put(out, &xp->mark);
        if (err)
                goto cancel;

        err = xfrm_if_id_put(out, xp->if_id);
        if (err)
                goto cancel;

        if (xp->xdo.dev) {
                err = copy_user_offload(&xp->xdo, out);
                if (err)
                        goto cancel;
        }

        nlmsg_end(out, nlh);

        xtr = xfrm_get_translator();
        if (!xtr)
                return 0;

        err = xtr->alloc_compat(out, nlh);
        xfrm_put_translator(xtr);
        if (err)
                goto cancel;

        return 0;

cancel:
        nlmsg_cancel(out, nlh);
        return err;
}

/*
 * Dump completion callback: finish the policy walk whose state is stored in
 * the callback's args area.  Always returns 0.
 */
static int xfrm_dump_policy_done(struct netlink_callback *cb)
{
        struct net *net = sock_net(cb->skb->sk);
        struct xfrm_policy_walk *walk;

        walk = (struct xfrm_policy_walk *)cb->args;
        xfrm_policy_walk_done(walk, net);

        return 0;
}

/*
 * Dump start callback: initialise a policy walk over all policy types.  The
 * walk state lives inside the callback's args area.  Always returns 0.
 */
static int xfrm_dump_policy_start(struct netlink_callback *cb)
{
        struct xfrm_policy_walk *walk;

        /* The walk state must fit in the space the callback provides. */
        BUILD_BUG_ON(sizeof(*walk) > sizeof(cb->args));

        walk = (struct xfrm_policy_walk *)cb->args;
        xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);

        return 0;
}

/*
 * Dump callback: continue the policy walk and serialise each visited policy
 * into @skb as part of a multipart reply.
 *
 * The walk result is deliberately ignored; the return value is the number
 * of bytes now in @skb.
 */
static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *)cb->args;
        struct net *net = sock_net(skb->sk);
        struct xfrm_dump_info info;

        info.out_skb = skb;
        info.in_skb = cb->skb;
        info.nlmsg_flags = NLM_F_MULTI;
        info.nlmsg_seq = cb->nlh->nlmsg_seq;

        (void) xfrm_policy_walk(net, walk, dump_one_policy, &info);

        return skb->len;
}

/*
 * Allocate a new skb holding a single XFRM_MSG_NEWPOLICY message that
 * describes @xp in direction @dir, addressed to the sender of @in_skb with
 * sequence number @seq.
 *
 * Returns the skb on success, or an ERR_PTR(): -ENOMEM when allocation
 * fails, otherwise the dump_one_policy() error.
 */
static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
                                          struct xfrm_policy *xp,
                                          int dir, u32 seq)
{
        struct xfrm_dump_info info;
        struct sk_buff *out;
        int err;

        out = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (out == NULL)
                return ERR_PTR(-ENOMEM);

        info.out_skb = out;
        info.in_skb = in_skb;
        info.nlmsg_flags = 0;
        info.nlmsg_seq = seq;

        err = dump_one_policy(xp, dir, 0, &info);
        if (!err)
                return out;

        kfree_skb(out);
        return ERR_PTR(err);
}

/*
 * Multicast the current default policies of @net (in, fwd, out) to the
 * XFRMNLGRP_POLICY group as an XFRM_MSG_GETDEFAULT message.
 *
 * Returns -ENOMEM or -EMSGSIZE when the message cannot be built, otherwise
 * the result of xfrm_nlmsg_multicast().
 */
static int xfrm_notify_userpolicy(struct net *net)
{
        struct xfrm_userpolicy_default *defaults;
        int len = NLMSG_ALIGN(sizeof(*defaults));
        struct sk_buff *msg;
        struct nlmsghdr *nlh;
        int err;

        msg = nlmsg_new(len, GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        nlh = nlmsg_put(msg, 0, 0, XFRM_MSG_GETDEFAULT, sizeof(*defaults), 0);
        if (!nlh) {
                kfree_skb(msg);
                return -EMSGSIZE;
        }

        defaults = nlmsg_data(nlh);
        defaults->in = net->xfrm.policy_default[XFRM_POLICY_IN];
        defaults->fwd = net->xfrm.policy_default[XFRM_POLICY_FWD];
        defaults->out = net->xfrm.policy_default[XFRM_POLICY_OUT];

        nlmsg_end(msg, nlh);

        /* The multicast is issued inside an RCU read-side section. */
        rcu_read_lock();
        err = xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_POLICY);
        rcu_read_unlock();

        return err;
}

/* True when @policy is one of the two recognised defaults: BLOCK or ACCEPT. */
static bool xfrm_userpolicy_is_valid(__u8 policy)
{
        switch (policy) {
        case XFRM_USERPOLICY_BLOCK:
        case XFRM_USERPOLICY_ACCEPT:
                return true;
        default:
                return false;
        }
}

/*
 * Netlink handler that sets the per-netns default policy for each direction.
 *
 * A direction is only updated when the supplied value is a recognised
 * default; any other value leaves that direction unchanged.  Afterwards the
 * routing generation id is bumped and listeners are notified.  The result of
 * the notification is not checked.
 *
 * Always returns 0.
 */
static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct xfrm_userpolicy_default *req = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);

        if (xfrm_userpolicy_is_valid(req->in))
                net->xfrm.policy_default[XFRM_POLICY_IN] = req->in;

        if (xfrm_userpolicy_is_valid(req->fwd))
                net->xfrm.policy_default[XFRM_POLICY_FWD] = req->fwd;

        if (xfrm_userpolicy_is_valid(req->out))
                net->xfrm.policy_default[XFRM_POLICY_OUT] = req->out;

        rt_genid_bump_all(net);

        xfrm_notify_userpolicy(net);

        return 0;
}

static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct sk_buff *r_skb;
        struct nlmsghdr *r_nlh;
        struct net *net = sock_net(skb->sk);
        struct xfrm_userpolicy_default *r_up;
        int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default));
        u32 portid = NETLINK_CB(skb).portid;
        u32 seq = nlh->nlmsg_seq;

        r_skb = nlmsg_new(len, GFP_ATOMIC);
        if (!r_skb)
                return -ENOMEM;

        r_nlh = nlmsg_put(r_skb, portid, seq, XFRM_MSG_GETDEFAULT, sizeof(*r_up), 0);
        if (!r_nlh) {
                kfree_skb(r_skb);
                return -EMSGSIZE;
        }

        r_up = nlmsg_data(r_nlh);
        r_up->in = net->xfrm.policy_default[XFRM_POLICY_IN];
        r_up->fwd = net->xfrm.policy_default[XFRM_POLICY_FWD];
        r_up->out = net->xfrm.policy_default[XFRM_POLICY_OUT];
        nlmsg_end(r_skb, r_nlh);

        return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid);
}

/*
 * Netlink handler shared by policy "get" and "delete" requests; the request
 * is treated as a delete when its type is XFRM_MSG_DELPOLICY.
 *
 * The policy is located either by index (when p->index is non-zero) or by
 * selector plus optional security context.  For a get, the policy is
 * serialised and unicast to the requester.  For a delete, an audit record is
 * emitted and, if the lookup reported no error, key managers are notified.
 *
 * Returns 0 on success or a negative errno; -ENOENT when no policy matches.
 */
static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
                           struct nlattr **attrs,
                           struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_policy *xp;
        struct xfrm_userpolicy_id *p;
        u8 type = XFRM_POLICY_TYPE_MAIN;
        int err;
        struct km_event c;
        int delete;
        struct xfrm_mark m;
        u32 if_id = 0;

        p = nlmsg_data(nlh);
        delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;

        err = copy_from_user_policy_type(&type, attrs, extack);
        if (err)
                return err;

        err = verify_policy_dir(p->dir, extack);
        if (err)
                return err;

        if (attrs[XFRMA_IF_ID])
                if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        xfrm_mark_get(attrs, &m);

        /* A non-zero index selects lookup by id; otherwise match by selector. */
        if (p->index)
                xp = xfrm_policy_byid(net, &m, if_id, type, p->dir,
                                      p->index, delete, &err);
        else {
                struct nlattr *rt = attrs[XFRMA_SEC_CTX];
                struct xfrm_sec_ctx *ctx;

                err = verify_sec_ctx_len(attrs, extack);
                if (err)
                        return err;

                ctx = NULL;
                if (rt) {
                        struct xfrm_user_sec_ctx *uctx = nla_data(rt);

                        err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL);
                        if (err)
                                return err;
                }
                xp = xfrm_policy_bysel_ctx(net, &m, if_id, type, p->dir,
                                           &p->sel, ctx, delete, &err);
                /* The temporary context is only needed for the lookup itself. */
                security_xfrm_policy_free(ctx);
        }
        if (xp == NULL)
                return -ENOENT;

        if (!delete) {
                struct sk_buff *resp_skb;

                resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq);
                if (IS_ERR(resp_skb)) {
                        err = PTR_ERR(resp_skb);
                } else {
                        err = nlmsg_unicast(net->xfrm.nlsk, resp_skb,
                                            NETLINK_CB(skb).portid);
                }
        } else {
                /* Audit the attempt whether or not the lookup reported an error. */
                xfrm_audit_policy_delete(xp, err ? 0 : 1, true);

                if (err != 0)
                        goto out;

                c.data.byid = p->index;
                c.event = nlh->nlmsg_type;
                c.seq = nlh->nlmsg_seq;
                c.portid = nlh->nlmsg_pid;
                km_policy_notify(xp, p->dir, &c);
        }

out:
        /* Drop the policy reference held since the lookup. */
        xfrm_pol_put(xp);
        return err;
}

/*
 * Netlink handler that flushes all SAs of the protocol named in the request.
 *
 * An empty table (-ESRCH from the flush) is treated as success and produces
 * no notification.  Any other flush error is returned as is.  On a
 * successful flush, key managers are notified.
 */
static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
                         struct nlattr **attrs,
                         struct netlink_ext_ack *extack)
{
        struct xfrm_usersa_flush *req = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct km_event c;
        int err;

        err = xfrm_state_flush(net, req->proto, true);
        if (err == -ESRCH) /* empty table */
                return 0;
        if (err)
                return err;

        c.net = net;
        c.data.proto = req->proto;
        c.event = nlh->nlmsg_type;
        c.seq = nlh->nlmsg_seq;
        c.portid = nlh->nlmsg_pid;
        km_state_notify(NULL, &c);

        return 0;
}

/*
 * Upper bound on the payload size of an XFRM_MSG_NEWAE message for state @x.
 *
 * The replay attribute is sized from the ESN replay state when the state has
 * one and from the legacy replay state otherwise.  Every attribute that
 * build_aevent() may emit must be accounted for here, including those that
 * are only emitted conditionally.
 */
static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x)
{
        unsigned int replay_size = x->replay_esn ?
                              xfrm_replay_state_esn_len(x->replay_esn) :
                              sizeof(struct xfrm_replay_state);

        return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id))
               + nla_total_size(replay_size)
               + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur))
               + nla_total_size(sizeof(struct xfrm_mark))
               + nla_total_size(4) /* XFRM_AE_RTHR */
               + nla_total_size(4) /* XFRM_AE_ETHR */
               /* build_aevent() calls xfrm_if_id_put(); reserve room for it. */
               + nla_total_size(4) /* XFRMA_IF_ID */
               + nla_total_size(sizeof(x->dir)) /* XFRMA_SA_DIR */
               + nla_total_size(4); /* XFRMA_SA_PCPU */
}

static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
{
        struct xfrm_aevent_id *id;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_NEWAE, sizeof(*id), 0);
        if (nlh == NULL)
                return -EMSGSIZE;

        id = nlmsg_data(nlh);
        memset(&id->sa_id, 0, sizeof(id->sa_id));
        memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr));
        id->sa_id.spi = x->id.spi;
        id->sa_id.family = x->props.family;
        id->sa_id.proto = x->id.proto;
        memcpy(&id->saddr, &x->props.saddr, sizeof(x->props.saddr));
        id->reqid = x->props.reqid;
        id->flags = c->data.aevent;

        if (x->replay_esn) {
                err = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
                              xfrm_replay_state_esn_len(x->replay_esn),
                              x->replay_esn);
        } else {
                err = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay),
                              &x->replay);
        }
        if (err)
                goto out_cancel;
        err = nla_put_64bit(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft,
                            XFRMA_PAD);
        if (err)
                goto out_cancel;

        if (id->flags & XFRM_AE_RTHR) {
                err = nla_put_u32(skb, XFRMA_REPLAY_THRESH, x->replay_maxdiff);
                if (err)
                        goto out_cancel;
        }
        if (id->flags & XFRM_AE_ETHR) {
                err = nla_put_u32(skb, XFRMA_ETIMER_THRESH,
                                  x->replay_maxage * 10 / HZ);
                if (err)
                        goto out_cancel;
        }
        err = xfrm_mark_put(skb, &x->mark);
        if (err)
                goto out_cancel;

        err = xfrm_if_id_put(skb, x->if_id);
        if (err)
                goto out_cancel;
        if (x->pcpu_num != UINT_MAX) {
                err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num);
                if (err)
                        goto out_cancel;
        }

        if (x->dir) {
                err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
                if (err)
                        goto out_cancel;
        }

        nlmsg_end(skb, nlh);
        return 0;

out_cancel:
        nlmsg_cancel(skb, nlh);
        return err;
}

static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
                       struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        struct sk_buff *r_skb;
        int err;
        struct km_event c;
        u32 mark;
        struct xfrm_mark m;
        struct xfrm_aevent_id *p = nlmsg_data(nlh);
        struct xfrm_usersa_id *id = &p->sa_id;

        mark = xfrm_mark_get(attrs, &m);

        x = xfrm_state_lookup(net, mark, &id->daddr, id->spi, id->proto, id->family);
        if (x == NULL)
                return -ESRCH;

        r_skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
        if (r_skb == NULL) {
                xfrm_state_put(x);
                return -ENOMEM;
        }

        /*
         * XXX: is this lock really needed - none of the other
         * gets lock (the concern is things getting updated
         * while we are still reading) - jhs
        */
        spin_lock_bh(&x->lock);
        c.data.aevent = p->flags;
        c.seq = nlh->nlmsg_seq;
        c.portid = nlh->nlmsg_pid;

        err = build_aevent(r_skb, x, &c);
        BUG_ON(err < 0);

        err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid);
        spin_unlock_bh(&x->lock);
        xfrm_state_put(x);
        return err;
}

static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
                       struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        struct km_event c;
        int err = -EINVAL;
        u32 mark = 0;
        struct xfrm_mark m;
        struct xfrm_aevent_id *p = nlmsg_data(nlh);
        struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];
        struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL];
        struct nlattr *lt = attrs[XFRMA_LTIME_VAL];
        struct nlattr *et = attrs[XFRMA_ETIMER_THRESH];
        struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH];

        if (!lt && !rp && !re && !et && !rt) {
                NL_SET_ERR_MSG(extack, "Missing required attribute for AE");
                return err;
        }

        /* pedantic mode - thou shalt sayeth replaceth */
        if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) {
                NL_SET_ERR_MSG(extack, "NLM_F_REPLACE flag is required");
                return err;
        }

        mark = xfrm_mark_get(attrs, &m);

        x = xfrm_state_lookup(net, mark, &p->sa_id.daddr, p->sa_id.spi, p->sa_id.proto, p->sa_id.family);
        if (x == NULL)
                return -ESRCH;

        if (x->km.state != XFRM_STATE_VALID) {
                NL_SET_ERR_MSG(extack, "SA must be in VALID state");
                goto out;
        }

        err = xfrm_replay_verify_len(x->replay_esn, re, extack);
        if (err)
                goto out;

        spin_lock_bh(&x->lock);
        xfrm_update_ae_params(x, attrs, 1);
        spin_unlock_bh(&x->lock);

        c.event = nlh->nlmsg_type;
        c.seq = nlh->nlmsg_seq;
        c.portid = nlh->nlmsg_pid;
        c.data.aevent = XFRM_AE_CU;
        km_state_notify(x, &c);
        err = 0;
out:
        xfrm_state_put(x);
        return err;
}

static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct nlattr **attrs,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct km_event c;
        u8 type = XFRM_POLICY_TYPE_MAIN;
        int err;

        err = copy_from_user_policy_type(&type, attrs, extack);
        if (err)
                return err;

        err = xfrm_policy_flush(net, type, true);
        if (err) {
                if (err == -ESRCH) /* empty table */
                        return 0;
                return err;
        }

        c.data.type = type;
        c.event = nlh->nlmsg_type;
        c.seq = nlh->nlmsg_seq;
        c.portid = nlh->nlmsg_pid;
        c.net = net;
        km_policy_notify(NULL, 0, &c);
        return 0;
}

/*
 * XFRM_MSG_POLEXPIRE handler: report a policy as expired.
 *
 * The policy is located by index when the request supplies one, otherwise by
 * selector plus optional security context. For a hard expire the policy is
 * deleted and the deletion audited; in both cases km_policy_expired() is
 * called.
 *
 * Returns -ENOENT when no policy is found. When the policy is already marked
 * dead, nothing is done and the error value left by the lookup is returned.
 */
static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
                               struct nlattr **attrs,
                               struct netlink_ext_ack *extack)
{
        struct xfrm_user_polexpire *up = nlmsg_data(nlh);
        struct xfrm_userpolicy_info *info = &up->pol;
        struct net *net = sock_net(skb->sk);
        u8 type = XFRM_POLICY_TYPE_MAIN;
        struct xfrm_policy *xp;
        struct xfrm_mark m;
        u32 if_id = 0;
        int err;

        err = copy_from_user_policy_type(&type, attrs, extack);
        if (err)
                return err;

        err = verify_policy_dir(info->dir, extack);
        if (err)
                return err;

        if (attrs[XFRMA_IF_ID])
                if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        xfrm_mark_get(attrs, &m);

        if (info->index) {
                xp = xfrm_policy_byid(net, &m, if_id, type, info->dir,
                                      info->index, 0, &err);
        } else {
                struct nlattr *sec_attr = attrs[XFRMA_SEC_CTX];
                struct xfrm_sec_ctx *ctx = NULL;

                err = verify_sec_ctx_len(attrs, extack);
                if (err)
                        return err;

                if (sec_attr) {
                        struct xfrm_user_sec_ctx *uctx = nla_data(sec_attr);

                        err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL);
                        if (err)
                                return err;
                }
                xp = xfrm_policy_bysel_ctx(net, &m, if_id, type, info->dir,
                                           &info->sel, ctx, 0, &err);
                /* The context was only needed for the lookup. */
                security_xfrm_policy_free(ctx);
        }
        if (!xp)
                return -ENOENT;

        if (likely(!xp->walk.dead)) {
                err = 0;
                if (up->hard) {
                        xfrm_policy_delete(xp, info->dir);
                        xfrm_audit_policy_delete(xp, 1, true);
                }
                km_policy_expired(xp, info->dir, up->hard, nlh->nlmsg_pid);
        }

        xfrm_pol_put(xp);
        return err;
}

/*
 * XFRM_MSG_EXPIRE handler: report a state as expired.
 *
 * Under the state lock, calls km_state_expired() and, for a hard expire,
 * deletes the state and audits the deletion.
 *
 * Returns -ENOENT when the state is not found and -EINVAL when it is not in
 * XFRM_STATE_VALID; 0 otherwise.
 */
static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct nlattr **attrs,
                              struct netlink_ext_ack *extack)
{
        struct xfrm_user_expire *ue = nlmsg_data(nlh);
        struct xfrm_usersa_info *info = &ue->state;
        struct net *net = sock_net(skb->sk);
        struct xfrm_state *x;
        struct xfrm_mark m;
        u32 mark;
        int rc;

        mark = xfrm_mark_get(attrs, &m);

        x = xfrm_state_lookup(net, mark, &info->id.daddr, info->id.spi,
                              info->id.proto, info->family);
        if (!x)
                return -ENOENT;

        spin_lock_bh(&x->lock);

        if (x->km.state != XFRM_STATE_VALID) {
                NL_SET_ERR_MSG(extack, "SA must be in VALID state");
                rc = -EINVAL;
                goto unlock;
        }

        km_state_expired(x, ue->hard, nlh->nlmsg_pid);

        if (ue->hard) {
                __xfrm_state_delete(x);
                xfrm_audit_state_delete(x, 1, true);
        }
        rc = 0;

unlock:
        spin_unlock_bh(&x->lock);
        xfrm_state_put(x);
        return rc;
}

/*
 * XFRM_MSG_ACQUIRE handler.
 *
 * Builds a temporary state and a temporary policy from the request, then
 * calls km_query() once per template of that policy. Both temporaries are
 * freed before returning; neither is inserted anywhere by this function.
 *
 * Returns -ENOMEM if the state cannot be allocated, -EINVAL for an
 * out-of-range XFRMA_SA_PCPU, the error from the verify/construct helpers,
 * and 0 once the templates have been processed (the km_query() result is
 * not propagated).
 */
static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct nlattr **attrs,
                            struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct xfrm_policy *xp;
        struct xfrm_user_tmpl *ut;
        int i;
        struct nlattr *rt = attrs[XFRMA_TMPL];
        struct xfrm_mark mark;

        struct xfrm_user_acquire *ua = nlmsg_data(nlh);
        struct xfrm_state *x = xfrm_state_alloc(net);
        int err = -ENOMEM;

        if (!x)
                goto nomem;

        xfrm_mark_get(attrs, &mark);

        /* Optional per-CPU number; must name a possible CPU. */
        if (attrs[XFRMA_SA_PCPU]) {
                x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]);
                err = -EINVAL;
                if (x->pcpu_num >= num_possible_cpus())
                        goto free_state;
        }

        err = verify_newpolicy_info(&ua->policy, extack);
        if (err)
                goto free_state;
        err = verify_sec_ctx_len(attrs, extack);
        if (err)
                goto free_state;

        /*   build an XP */
        xp = xfrm_policy_construct(net, &ua->policy, attrs, &err, extack);
        if (!xp)
                goto free_state;

        memcpy(&x->id, &ua->id, sizeof(ua->id));
        memcpy(&x->props.saddr, &ua->saddr, sizeof(ua->saddr));
        memcpy(&x->sel, &ua->sel, sizeof(ua->sel));
        xp->mark.m = x->mark.m = mark.m;
        xp->mark.v = x->mark.v = mark.v;
        /*
         * NOTE(review): rt is not checked for NULL here; ut is only
         * dereferenced inside the loop, so this presumably relies on
         * xp->xfrm_nr being 0 when XFRMA_TMPL is absent - confirm.
         */
        ut = nla_data(rt);
        /* extract the templates and for each call km_key */
        for (i = 0; i < xp->xfrm_nr; i++, ut++) {
                struct xfrm_tmpl *t = &xp->xfrm_vec[i];
                /* x->id is overwritten with the template's id on each pass. */
                memcpy(&x->id, &t->id, sizeof(x->id));
                x->props.mode = t->mode;
                x->props.reqid = t->reqid;
                x->props.family = ut->family;
                t->aalgos = ua->aalgos;
                t->ealgos = ua->ealgos;
                t->calgos = ua->calgos;
                /* Result is stored but never returned; see "return 0" below. */
                err = km_query(x, t, xp);

        }

        /* Tear down the temporary state and policy built above. */
        xfrm_state_free(x);
        xfrm_dev_policy_delete(xp);
        xfrm_dev_policy_free(xp);
        security_xfrm_policy_free(xp->security);
        kfree(xp);

        return 0;

free_state:
        xfrm_state_free(x);
nomem:
        return err;
}

#ifdef CONFIG_XFRM_MIGRATE
static int copy_from_user_migrate(struct xfrm_migrate *ma,
                                  struct xfrm_kmaddress *k,
                                  struct nlattr **attrs, int *num,
                                  struct netlink_ext_ack *extack)
{
        struct nlattr *rt = attrs[XFRMA_MIGRATE];
        struct xfrm_user_migrate *um;
        int i, num_migrate;

        if (k != NULL) {
                struct xfrm_user_kmaddress *uk;

                uk = nla_data(attrs[XFRMA_KMADDRESS]);
                memcpy(&k->local, &uk->local, sizeof(k->local));
                memcpy(&k->remote, &uk->remote, sizeof(k->remote));
                k->family = uk->family;
                k->reserved = uk->reserved;
        }

        um = nla_data(rt);
        num_migrate = nla_len(rt) / sizeof(*um);

        if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH) {
                NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)");
                return -EINVAL;
        }

        for (i = 0; i < num_migrate; i++, um++, ma++) {
                memcpy(&ma->old_daddr, &um->old_daddr, sizeof(ma->old_daddr));
                memcpy(&ma->old_saddr, &um->old_saddr, sizeof(ma->old_saddr));
                memcpy(&ma->new_daddr, &um->new_daddr, sizeof(ma->new_daddr));
                memcpy(&ma->new_saddr, &um->new_saddr, sizeof(ma->new_saddr));

                ma->proto = um->proto;
                ma->mode = um->mode;
                ma->reqid = um->reqid;

                ma->old_family = um->old_family;
                ma->new_family = um->new_family;
        }

        *num = i;
        return 0;
}

/*
 * XFRM_MSG_MIGRATE handler (CONFIG_XFRM_MIGRATE build).
 *
 * Requires XFRMA_MIGRATE. Converts the migrate entries (and the optional
 * XFRMA_KMADDRESS) to kernel form, duplicates the optional XFRMA_ENCAP and
 * XFRMA_OFFLOAD_DEV payloads, reads the optional XFRMA_IF_ID and hands
 * everything to xfrm_migrate(). The duplicated payloads are freed before
 * returning.
 *
 * Returns -EINVAL when XFRMA_MIGRATE is missing, -ENOMEM when a payload
 * cannot be duplicated, errors from the conversion helpers, or the result
 * of xfrm_migrate().
 */
static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
                           struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        struct xfrm_userpolicy_id *pi = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct xfrm_migrate entries[XFRM_MAX_DEPTH];
        struct xfrm_user_offload *xuo = NULL;
        struct xfrm_encap_tmpl *encap = NULL;
        struct xfrm_kmaddress km, *kmp = NULL;
        u32 if_id = 0;
        int nr = 0;
        int err;
        u8 type;

        if (!attrs[XFRMA_MIGRATE]) {
                NL_SET_ERR_MSG(extack, "Missing required MIGRATE attribute");
                return -EINVAL;
        }

        if (attrs[XFRMA_KMADDRESS])
                kmp = &km;

        err = copy_from_user_policy_type(&type, attrs, extack);
        if (err)
                return err;

        err = copy_from_user_migrate(entries, kmp, attrs, &nr, extack);
        if (err)
                return err;

        if (nr == 0)
                return 0;

        if (attrs[XFRMA_ENCAP]) {
                encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]),
                                sizeof(*encap), GFP_KERNEL);
                if (!encap)
                        return -ENOMEM;
        }

        if (attrs[XFRMA_IF_ID])
                if_id = nla_get_u32(attrs[XFRMA_IF_ID]);

        if (attrs[XFRMA_OFFLOAD_DEV]) {
                xuo = kmemdup(nla_data(attrs[XFRMA_OFFLOAD_DEV]),
                              sizeof(*xuo), GFP_KERNEL);
                if (!xuo) {
                        err = -ENOMEM;
                        goto free_copies;
                }
        }

        err = xfrm_migrate(&pi->sel, pi->dir, type, entries, nr, kmp, net,
                           encap, if_id, extack, xuo);

free_copies:
        /* kfree(NULL) is a no-op, so both pointers can be freed blindly. */
        kfree(encap);
        kfree(xuo);
        return err;
}
#else
/*
 * XFRM_MSG_MIGRATE handler used when CONFIG_XFRM_MIGRATE is not enabled:
 * always fails with -ENOPROTOOPT.
 */
static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
                           struct nlattr **attrs, struct netlink_ext_ack *extack)
{
        return -ENOPROTOOPT;
}
#endif

#ifdef CONFIG_XFRM_MIGRATE
static int copy_to_user_migrate(const struct xfrm_migrate *m, struct sk_buff *skb)
{
        struct xfrm_user_migrate um;

        memset(&um, 0, sizeof(um));
        um.proto = m->proto;
        um.mode = m->mode;
        um.reqid = m->reqid;
        um.old_family = m->old_family;
        memcpy(&um.old_daddr, &m->old_daddr, sizeof(um.old_daddr));
        memcpy(&um.old_saddr, &m->old_saddr, sizeof(um.old_saddr));
        um.new_family = m->new_family;
        memcpy(&um.new_daddr, &m->new_daddr, sizeof(um.new_daddr));
        memcpy(&um.new_saddr, &m->new_saddr, sizeof(um.new_saddr));

        return nla_put(skb, XFRMA_MIGRATE, sizeof(um), &um);
}

static int copy_to_user_kmaddress(const struct xfrm_kmaddress *k, struct sk_buff *skb)
{
        struct xfrm_user_kmaddress uk;

        memset(&uk, 0, sizeof(uk));
        uk.family = k->family;
        uk.reserved = k->reserved;
        memcpy(&uk.local, &k->local, sizeof(uk.local));
        memcpy(&uk.remote, &k->remote, sizeof(uk.remote));

        return nla_put(skb, XFRMA_KMADDRESS, sizeof(uk), &uk);
}

/*
 * Size to reserve for an XFRM_MSG_MIGRATE notification.
 *
 * @num_migrate: number of migrate entries that will be emitted
 * @with_kma:    non-zero if an XFRMA_KMADDRESS attribute is included
 * @with_encp:   non-zero if an XFRMA_ENCAP attribute is included
 *
 * build_migrate() emits a separate XFRMA_MIGRATE attribute for every entry,
 * so each entry costs a full nla_total_size() (header + padded payload).
 * Sizing all entries as one attribute would count a single header only and
 * under-allocate the skb when num_migrate > 1.
 */
static inline unsigned int xfrm_migrate_msgsize(int num_migrate, int with_kma,
                                                int with_encp)
{
        return NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_id))
              + (with_kma ? nla_total_size(sizeof(struct xfrm_kmaddress)) : 0)
              + (with_encp ? nla_total_size(sizeof(struct xfrm_encap_tmpl)) : 0)
              + nla_total_size(sizeof(struct xfrm_user_migrate)) * num_migrate
              + userpolicy_type_attrsize();
}

/*
 * Fill @skb with an XFRM_MSG_MIGRATE message.
 *
 * The header is a zeroed struct xfrm_userpolicy_id carrying @sel and @dir.
 * Attributes follow in this order: XFRMA_KMADDRESS (if @k), XFRMA_ENCAP
 * (if @encap), the policy type, then one XFRMA_MIGRATE per entry of @m.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the error
 * of the failing attribute put after cancelling the partial message.
 */
static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m,
                         int num_migrate, const struct xfrm_kmaddress *k,
                         const struct xfrm_selector *sel,
                         const struct xfrm_encap_tmpl *encap, u8 dir, u8 type)
{
        struct xfrm_userpolicy_id *hdr;
        struct nlmsghdr *nlh;
        int idx;
        int rc;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MIGRATE, sizeof(*hdr), 0);
        if (!nlh)
                return -EMSGSIZE;

        /* copy data from selector, dir, and type to the pol_id */
        hdr = nlmsg_data(nlh);
        memset(hdr, 0, sizeof(*hdr));
        memcpy(&hdr->sel, sel, sizeof(hdr->sel));
        hdr->dir = dir;

        if (k) {
                rc = copy_to_user_kmaddress(k, skb);
                if (rc)
                        goto cancel;
        }

        if (encap) {
                rc = nla_put(skb, XFRMA_ENCAP, sizeof(*encap), encap);
                if (rc)
                        goto cancel;
        }

        rc = copy_to_user_policy_type(type, skb);
        if (rc)
                goto cancel;

        for (idx = 0; idx < num_migrate; idx++) {
                rc = copy_to_user_migrate(&m[idx], skb);
                if (rc)
                        goto cancel;
        }

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return rc;
}

/*
 * Build an XFRM_MSG_MIGRATE message and multicast it to XFRMNLGRP_MIGRATE
 * in init_net (CONFIG_XFRM_MIGRATE build).
 *
 * Returns -ENOMEM if the skb cannot be allocated, otherwise the multicast
 * result. A build failure triggers BUG_ON().
 */
static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                             const struct xfrm_migrate *m, int num_migrate,
                             const struct xfrm_kmaddress *k,
                             const struct xfrm_encap_tmpl *encap)
{
        unsigned int size = xfrm_migrate_msgsize(num_migrate, !!k, !!encap);
        struct sk_buff *skb;
        int rc;

        skb = nlmsg_new(size, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        /* build migrate */
        rc = build_migrate(skb, m, num_migrate, k, sel, encap, dir, type);
        BUG_ON(rc < 0);

        return xfrm_nlmsg_multicast(&init_net, skb, 0, XFRMNLGRP_MIGRATE);
}
#else
/*
 * Migrate notification used when CONFIG_XFRM_MIGRATE is not enabled:
 * always fails with -ENOPROTOOPT.
 */
static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                             const struct xfrm_migrate *m, int num_migrate,
                             const struct xfrm_kmaddress *k,
                             const struct xfrm_encap_tmpl *encap)
{
        return -ENOPROTOOPT;
}
#endif
#endif

#define XMSGSIZE(type) sizeof(struct type)

/*
 * Fixed header length for each XFRM message type, indexed by
 * (message type - XFRM_MSG_BASE). xfrm_user_rcv_msg() passes the entry to
 * nlmsg_parse_deprecated() as the header length that precedes the
 * attributes. Types not listed here default to 0.
 */
const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
        [XFRM_MSG_NEWSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
        [XFRM_MSG_DELSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
        [XFRM_MSG_GETSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
        [XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info),
        [XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userspi_info),
        [XFRM_MSG_ACQUIRE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_acquire),
        [XFRM_MSG_EXPIRE      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_expire),
        [XFRM_MSG_UPDPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info),
        [XFRM_MSG_UPDSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
        [XFRM_MSG_POLEXPIRE   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_polexpire),
        [XFRM_MSG_FLUSHSA     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush),
        /* FLUSHPOLICY has no fixed header; its input arrives as attributes. */
        [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0,
        [XFRM_MSG_NEWAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
        [XFRM_MSG_GETAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
        [XFRM_MSG_REPORT      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
        [XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_NEWSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_SETDEFAULT  - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
        [XFRM_MSG_GETDEFAULT  - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
};
EXPORT_SYMBOL_GPL(xfrm_msg_min);

#undef XMSGSIZE

/*
 * Default netlink attribute policy for XFRM messages, indexed by attribute
 * type. xfrm_user_rcv_msg() uses it whenever the dispatch entry for a
 * message does not provide its own nla_pol.
 */
const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
        [XFRMA_UNSPEC]                = { .strict_start_type = XFRMA_SA_DIR },
        [XFRMA_SA]                = { .len = sizeof(struct xfrm_usersa_info)},
        [XFRMA_POLICY]                = { .len = sizeof(struct xfrm_userpolicy_info)},
        [XFRMA_LASTUSED]        = { .type = NLA_U64},
        [XFRMA_ALG_AUTH_TRUNC]        = { .len = sizeof(struct xfrm_algo_auth)},
        [XFRMA_ALG_AEAD]        = { .len = sizeof(struct xfrm_algo_aead) },
        [XFRMA_ALG_AUTH]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ALG_CRYPT]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ALG_COMP]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ENCAP]                = { .len = sizeof(struct xfrm_encap_tmpl) },
        [XFRMA_TMPL]                = { .len = sizeof(struct xfrm_user_tmpl) },
        [XFRMA_SEC_CTX]                = { .len = sizeof(struct xfrm_user_sec_ctx) },
        [XFRMA_LTIME_VAL]        = { .len = sizeof(struct xfrm_lifetime_cur) },
        [XFRMA_REPLAY_VAL]        = { .len = sizeof(struct xfrm_replay_state) },
        [XFRMA_REPLAY_THRESH]        = { .type = NLA_U32 },
        [XFRMA_ETIMER_THRESH]        = { .type = NLA_U32 },
        [XFRMA_SRCADDR]                = { .len = sizeof(xfrm_address_t) },
        [XFRMA_COADDR]                = { .len = sizeof(xfrm_address_t) },
        [XFRMA_POLICY_TYPE]        = { .len = sizeof(struct xfrm_userpolicy_type)},
        [XFRMA_MIGRATE]                = { .len = sizeof(struct xfrm_user_migrate) },
        [XFRMA_KMADDRESS]        = { .len = sizeof(struct xfrm_user_kmaddress) },
        [XFRMA_MARK]                = { .len = sizeof(struct xfrm_mark) },
        [XFRMA_TFCPAD]                = { .type = NLA_U32 },
        [XFRMA_REPLAY_ESN_VAL]        = { .len = sizeof(struct xfrm_replay_state_esn) },
        [XFRMA_SA_EXTRA_FLAGS]        = { .type = NLA_U32 },
        [XFRMA_PROTO]                = { .type = NLA_U8 },
        [XFRMA_ADDRESS_FILTER]        = { .len = sizeof(struct xfrm_address_filter) },
        [XFRMA_OFFLOAD_DEV]        = { .len = sizeof(struct xfrm_user_offload) },
        [XFRMA_SET_MARK]        = { .type = NLA_U32 },
        [XFRMA_SET_MARK_MASK]        = { .type = NLA_U32 },
        [XFRMA_IF_ID]                = { .type = NLA_U32 },
        [XFRMA_MTIMER_THRESH]   = { .type = NLA_U32 },
        /* SA direction is range-checked: XFRM_SA_DIR_IN..XFRM_SA_DIR_OUT. */
        [XFRMA_SA_DIR]          = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
        [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 },
        [XFRMA_SA_PCPU]                = { .type = NLA_U32 },
        [XFRMA_IPTFS_DROP_TIME]                = { .type = NLA_U32 },
        [XFRMA_IPTFS_REORDER_WINDOW]        = { .type = NLA_U16 },
        [XFRMA_IPTFS_DONT_FRAG]                = { .type = NLA_FLAG },
        [XFRMA_IPTFS_INIT_DELAY]        = { .type = NLA_U32 },
        [XFRMA_IPTFS_MAX_QSIZE]                = { .type = NLA_U32 },
        [XFRMA_IPTFS_PKT_SIZE]        = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(xfrma_policy);

/*
 * Attribute policy for SPD info messages; referenced by the
 * XFRM_MSG_NEWSPDINFO entry of xfrm_dispatch[] instead of xfrma_policy.
 */
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
        [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
        [XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
};

/*
 * Per-message-type dispatch table, indexed by (message type - XFRM_MSG_BASE)
 * and consulted by xfrm_user_rcv_msg().
 *
 *   doit              - handler for a regular (non-dump) request
 *   start/dump/done   - callbacks handed to netlink_dump_start() for
 *                       NLM_F_DUMP requests
 *   nla_pol, nla_max  - attribute policy override; when unset the receive
 *                       path falls back to xfrma_policy / XFRMA_MAX
 */
static const struct xfrm_link {
        int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **,
                    struct netlink_ext_ack *);
        int (*start)(struct netlink_callback *);
        int (*dump)(struct sk_buff *, struct netlink_callback *);
        int (*done)(struct netlink_callback *);
        const struct nla_policy *nla_pol;
        int nla_max;
} xfrm_dispatch[XFRM_NR_MSGTYPES] = {
        [XFRM_MSG_NEWSA       - XFRM_MSG_BASE] = { .doit = xfrm_add_sa        },
        [XFRM_MSG_DELSA       - XFRM_MSG_BASE] = { .doit = xfrm_del_sa        },
        [XFRM_MSG_GETSA       - XFRM_MSG_BASE] = { .doit = xfrm_get_sa,
                                                   .dump = xfrm_dump_sa,
                                                   .done = xfrm_dump_sa_done  },
        [XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_add_policy    },
        /*
         * NOTE(review): DELPOLICY shares xfrm_get_policy() with GETPOLICY;
         * presumably the handler tells them apart by nlmsg_type - confirm.
         */
        [XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy    },
        [XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy,
                                                   .start = xfrm_dump_policy_start,
                                                   .dump = xfrm_dump_policy,
                                                   .done = xfrm_dump_policy_done },
        [XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi },
        [XFRM_MSG_ACQUIRE     - XFRM_MSG_BASE] = { .doit = xfrm_add_acquire   },
        [XFRM_MSG_EXPIRE      - XFRM_MSG_BASE] = { .doit = xfrm_add_sa_expire },
        [XFRM_MSG_UPDPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_add_policy    },
        [XFRM_MSG_UPDSA       - XFRM_MSG_BASE] = { .doit = xfrm_add_sa        },
        [XFRM_MSG_POLEXPIRE   - XFRM_MSG_BASE] = { .doit = xfrm_add_pol_expire},
        [XFRM_MSG_FLUSHSA     - XFRM_MSG_BASE] = { .doit = xfrm_flush_sa      },
        [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy  },
        [XFRM_MSG_NEWAE       - XFRM_MSG_BASE] = { .doit = xfrm_new_ae  },
        [XFRM_MSG_GETAE       - XFRM_MSG_BASE] = { .doit = xfrm_get_ae  },
        [XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate    },
        [XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo   },
        [XFRM_MSG_NEWSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo,
                                                   .nla_pol = xfrma_spd_policy,
                                                   .nla_max = XFRMA_SPD_MAX },
        [XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo   },
        [XFRM_MSG_SETDEFAULT  - XFRM_MSG_BASE] = { .doit = xfrm_set_default   },
        [XFRM_MSG_GETDEFAULT  - XFRM_MSG_BASE] = { .doit = xfrm_get_default   },
};

static int xfrm_reject_unused_attr(int type, struct nlattr **attrs,
                                   struct netlink_ext_ack *extack)
{
        if (attrs[XFRMA_SA_DIR]) {
                switch (type) {
                case XFRM_MSG_NEWSA:
                case XFRM_MSG_UPDSA:
                case XFRM_MSG_ALLOCSPI:
                        break;
                default:
                        NL_SET_ERR_MSG(extack, "Invalid attribute SA_DIR");
                        return -EINVAL;
                }
        }

        if (attrs[XFRMA_SA_PCPU]) {
                switch (type) {
                case XFRM_MSG_NEWSA:
                case XFRM_MSG_UPDSA:
                case XFRM_MSG_ALLOCSPI:
                case XFRM_MSG_ACQUIRE:

                        break;
                default:
                        NL_SET_ERR_MSG(extack, "Invalid attribute SA_PCPU");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Per-message receive callback passed to netlink_rcv_skb().
 *
 * Validates the message type and the sender's CAP_NET_ADMIN capability,
 * translates compat messages when called from a compat syscall, then
 * either starts a dump (GETSA/GETPOLICY with NLM_F_DUMP) or parses the
 * attributes and calls the matching doit handler from xfrm_dispatch[].
 *
 * Returns -EINVAL for an out-of-range type or a missing handler, -EPERM
 * without the capability, -EOPNOTSUPP when no compat translator is
 * available, otherwise the parse, dump-start or handler result.
 */
static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *attrs[XFRMA_MAX+1];
        const struct xfrm_link *link;
        struct nlmsghdr *nlh64 = NULL;
        int type, err;

        type = nlh->nlmsg_type;
        if (type > XFRM_MSG_MAX)
                return -EINVAL;

        /* From here on, type is the index into the per-message tables. */
        type -= XFRM_MSG_BASE;
        link = &xfrm_dispatch[type];

        /* All operations require privileges, even GET */
        if (!netlink_net_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        if (in_compat_syscall()) {
                struct xfrm_translator *xtr = xfrm_get_translator();

                if (!xtr)
                        return -EOPNOTSUPP;

                nlh64 = xtr->rcv_msg_compat(nlh, link->nla_max,
                                            link->nla_pol, extack);
                xfrm_put_translator(xtr);
                if (IS_ERR(nlh64))
                        return PTR_ERR(nlh64);
                /* A non-NULL result replaces the header; freed at "err:". */
                if (nlh64)
                        nlh = nlh64;
        }

        if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
             type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
            (nlh->nlmsg_flags & NLM_F_DUMP)) {
                struct netlink_dump_control c = {
                        .start = link->start,
                        .dump = link->dump,
                        .done = link->done,
                };

                if (link->dump == NULL) {
                        err = -EINVAL;
                        goto err;
                }

                err = netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c);
                goto err;
        }

        /* Fall back to the default policy when the link has no override. */
        err = nlmsg_parse_deprecated(nlh, xfrm_msg_min[type], attrs,
                                     link->nla_max ? : XFRMA_MAX,
                                     link->nla_pol ? : xfrma_policy, extack);
        if (err < 0)
                goto err;

        /* The per-type attribute check only applies to the default policy. */
        if (!link->nla_pol || link->nla_pol == xfrma_policy) {
                err = xfrm_reject_unused_attr((type + XFRM_MSG_BASE), attrs, extack);
                if (err < 0)
                        goto err;
        }

        if (link->doit == NULL) {
                err = -EINVAL;
                goto err;
        }

        err = link->doit(skb, nlh, attrs, extack);

        /* We need to free skb allocated in xfrm_alloc_compat() before
         * returning from this function, because consume_skb() won't take
         * care of frag_list since netlink destructor sets
         * skb->head to NULL. (see netlink_skb_destructor())
         */
        if (skb_has_frag_list(skb)) {
                kfree_skb(skb_shinfo(skb)->frag_list);
                skb_shinfo(skb)->frag_list = NULL;
        }

err:
        /* nlh64 is NULL unless the compat path produced a translated copy. */
        kvfree(nlh64);
        return err;
}

/*
 * Process every message in @skb through xfrm_user_rcv_msg() while holding
 * the xfrm_cfg_mutex of the socket's network namespace.
 */
static void xfrm_netlink_rcv(struct sk_buff *skb)
{
        struct mutex *cfg_mutex = &sock_net(skb->sk)->xfrm.xfrm_cfg_mutex;

        mutex_lock(cfg_mutex);
        netlink_rcv_skb(skb, &xfrm_user_rcv_msg);
        mutex_unlock(cfg_mutex);
}

/*
 * Size to reserve for an XFRM_MSG_EXPIRE message: the expire header plus
 * the mark, SA direction and per-CPU number attributes.
 */
static inline unsigned int xfrm_expire_msgsize(void)
{
        unsigned int total = NLMSG_ALIGN(sizeof(struct xfrm_user_expire));

        total += nla_total_size(sizeof(struct xfrm_mark));
        total += nla_total_size(sizeof_field(struct xfrm_state, dir));
        total += nla_total_size(4);        /* XFRMA_SA_PCPU */

        return total;
}

/*
 * Fill @skb with an XFRM_MSG_EXPIRE message describing state @x.
 *
 * The fixed payload is a struct xfrm_user_expire; mark, interface id and,
 * when set, the per-CPU number and direction follow as attributes.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the error
 * reported by the first attribute helper that fails.
 */
static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
{
        struct xfrm_user_expire *ue;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, c->portid, 0, XFRM_MSG_EXPIRE, sizeof(*ue), 0);
        if (!nlh)
                return -EMSGSIZE;

        ue = nlmsg_data(nlh);
        copy_to_user_state(x, &ue->state);
        ue->hard = c->data.hard ? 1 : 0;
        /* zero whatever follows 'hard' inside *ue (the padding bytes) */
        memset_after(ue, 0, hard);

        err = xfrm_mark_put(skb, &x->mark);
        if (!err)
                err = xfrm_if_id_put(skb, x->if_id);
        /* UINT_MAX means no per-CPU number is set on this state */
        if (!err && x->pcpu_num != UINT_MAX)
                err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num);
        if (!err && x->dir)
                err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
        if (err)
                return err;

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Build an expire message for @x and multicast it to XFRMNLGRP_EXPIRE.
 *
 * Returns -ENOMEM if no skb could be allocated, -EMSGSIZE if
 * build_expire() failed, otherwise the result of xfrm_nlmsg_multicast().
 */
static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c)
{
        struct net *net = xs_net(x);
        struct sk_buff *msg;

        msg = nlmsg_new(xfrm_expire_msgsize(), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        if (build_expire(msg, x, c) >= 0)
                return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_EXPIRE);

        kfree_skb(msg);
        return -EMSGSIZE;
}

/*
 * Build an async-event message for @x and multicast it to
 * XFRMNLGRP_AEVENTS.
 *
 * Returns -ENOMEM if no skb could be allocated, otherwise the result of
 * xfrm_nlmsg_multicast().  A failing build_aevent() trips BUG_ON().
 */
static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event *c)
{
        struct net *net = xs_net(x);
        struct sk_buff *msg;
        int rc;

        msg = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        rc = build_aevent(msg, x, c);
        BUG_ON(rc < 0);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_AEVENTS);
}

/*
 * Multicast an XFRM_MSG_FLUSHSA message to XFRMNLGRP_SA in c->net.  The
 * payload is a struct xfrm_usersa_flush whose proto is c->data.proto.
 *
 * Returns -ENOMEM or -EMSGSIZE on allocation/header failure, otherwise the
 * result of xfrm_nlmsg_multicast().
 */
static int xfrm_notify_sa_flush(const struct km_event *c)
{
        struct net *net = c->net;
        struct xfrm_usersa_flush *flush;
        struct nlmsghdr *hdr;
        struct sk_buff *msg;

        msg = nlmsg_new(NLMSG_ALIGN(sizeof(struct xfrm_usersa_flush)), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        hdr = nlmsg_put(msg, c->portid, c->seq, XFRM_MSG_FLUSHSA, sizeof(*flush), 0);
        if (!hdr) {
                kfree_skb(msg);
                return -EMSGSIZE;
        }

        flush = nlmsg_data(hdr);
        flush->proto = c->data.proto;
        nlmsg_end(msg, hdr);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_SA);
}

static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
{
        unsigned int l = 0;
        if (x->aead)
                l += nla_total_size(aead_len(x->aead));
        if (x->aalg) {
                l += nla_total_size(sizeof(struct xfrm_algo) +
                                    (x->aalg->alg_key_len + 7) / 8);
                l += nla_total_size(xfrm_alg_auth_len(x->aalg));
        }
        if (x->ealg)
                l += nla_total_size(xfrm_alg_len(x->ealg));
        if (x->calg)
                l += nla_total_size(sizeof(*x->calg));
        if (x->encap)
                l += nla_total_size(sizeof(*x->encap));
        if (x->tfcpad)
                l += nla_total_size(sizeof(x->tfcpad));
        if (x->replay_esn)
                l += nla_total_size(xfrm_replay_state_esn_len(x->replay_esn));
        else
                l += nla_total_size(sizeof(struct xfrm_replay_state));
        if (x->security)
                l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) +
                                    x->security->ctx_len);
        if (x->coaddr)
                l += nla_total_size(sizeof(*x->coaddr));
        if (x->props.extra_flags)
                l += nla_total_size(sizeof(x->props.extra_flags));
        if (x->xso.dev)
                 l += nla_total_size(sizeof(struct xfrm_user_offload));
        if (x->props.smark.v | x->props.smark.m) {
                l += nla_total_size(sizeof(x->props.smark.v));
                l += nla_total_size(sizeof(x->props.smark.m));
        }
        if (x->if_id)
                l += nla_total_size(sizeof(x->if_id));
        if (x->pcpu_num)
                l += nla_total_size(sizeof(x->pcpu_num));

        /* Must count x->lastused as it may become non-zero behind our back. */
        l += nla_total_size_64bit(sizeof(u64));

        if (x->mapping_maxage)
                l += nla_total_size(sizeof(x->mapping_maxage));

        if (x->dir)
                l += nla_total_size(sizeof(x->dir));

        if (x->nat_keepalive_interval)
                l += nla_total_size(sizeof(x->nat_keepalive_interval));

        if (x->mode_cbs && x->mode_cbs->sa_len)
                l += x->mode_cbs->sa_len(x);

        return l;
}

static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c)
{
        struct net *net = xs_net(x);
        struct xfrm_usersa_info *p;
        struct xfrm_usersa_id *id;
        struct nlmsghdr *nlh;
        struct sk_buff *skb;
        unsigned int len = xfrm_sa_len(x);
        unsigned int headlen;
        int err;

        headlen = sizeof(*p);
        if (c->event == XFRM_MSG_DELSA) {
                len += nla_total_size(headlen);
                headlen = sizeof(*id);
                len += nla_total_size(sizeof(struct xfrm_mark));
        }
        len += NLMSG_ALIGN(headlen);

        skb = nlmsg_new(len, GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        nlh = nlmsg_put(skb, c->portid, c->seq, c->event, headlen, 0);
        err = -EMSGSIZE;
        if (nlh == NULL)
                goto out_free_skb;

        p = nlmsg_data(nlh);
        if (c->event == XFRM_MSG_DELSA) {
                struct nlattr *attr;

                id = nlmsg_data(nlh);
                memset(id, 0, sizeof(*id));
                memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
                id->spi = x->id.spi;
                id->family = x->props.family;
                id->proto = x->id.proto;

                attr = nla_reserve(skb, XFRMA_SA, sizeof(*p));
                err = -EMSGSIZE;
                if (attr == NULL)
                        goto out_free_skb;

                p = nla_data(attr);
        }
        err = copy_to_user_state_extra(x, p, skb);
        if (err)
                goto out_free_skb;

        nlmsg_end(skb, nlh);

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA);

out_free_skb:
        kfree_skb(skb);
        return err;
}

/*
 * State notification entry point: route c->event to the matching
 * notification helper.  An event with no helper is logged and reported as
 * success (0).
 */
static int xfrm_send_state_notify(struct xfrm_state *x, const struct km_event *c)
{
        switch (c->event) {
        case XFRM_MSG_NEWSA:
        case XFRM_MSG_UPDSA:
        case XFRM_MSG_DELSA:
                return xfrm_notify_sa(x, c);
        case XFRM_MSG_FLUSHSA:
                return xfrm_notify_sa_flush(c);
        case XFRM_MSG_NEWAE:
                return xfrm_aevent_state_notify(x, c);
        case XFRM_MSG_EXPIRE:
                return xfrm_exp_state_notify(x, c);
        default:
                break;
        }

        printk(KERN_NOTICE "xfrm_user: Unknown SA event %d\n",
               c->event);
        return 0;
}

static inline unsigned int xfrm_acquire_msgsize(struct xfrm_state *x,
                                                struct xfrm_policy *xp)
{
        return NLMSG_ALIGN(sizeof(struct xfrm_user_acquire))
               + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr)
               + nla_total_size(sizeof(struct xfrm_mark))
               + nla_total_size(xfrm_user_sec_ctx_size(x->security))
               + nla_total_size(4) /* XFRMA_SA_PCPU */
               + userpolicy_type_attrsize();
}

/*
 * Fill @skb with an XFRM_MSG_ACQUIRE message for state @x, template @xt and
 * policy @xp.
 *
 * A fresh acquire sequence number is drawn first and stored both in the
 * message and in x->km.seq.  If any attribute cannot be added the partially
 * built message is cancelled and that error is returned.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit.
 */
static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
                         struct xfrm_tmpl *xt, struct xfrm_policy *xp)
{
        __u32 seq = xfrm_get_acqseq();
        struct xfrm_user_acquire *ua;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_ACQUIRE, sizeof(*ua), 0);
        if (!nlh)
                return -EMSGSIZE;

        ua = nlmsg_data(nlh);
        memcpy(&ua->id, &x->id, sizeof(ua->id));
        memcpy(&ua->saddr, &x->props.saddr, sizeof(ua->saddr));
        memcpy(&ua->sel, &x->sel, sizeof(ua->sel));
        copy_to_user_policy(xp, &ua->policy, XFRM_POLICY_OUT);
        ua->aalgos = xt->aalgos;
        ua->ealgos = xt->ealgos;
        ua->calgos = xt->calgos;
        x->km.seq = seq;
        ua->seq = seq;

        err = copy_to_user_tmpl(xp, skb);
        if (err)
                goto cancel;
        err = copy_to_user_state_sec_ctx(x, skb);
        if (err)
                goto cancel;
        err = copy_to_user_policy_type(xp->type, skb);
        if (err)
                goto cancel;
        err = xfrm_mark_put(skb, &xp->mark);
        if (err)
                goto cancel;
        err = xfrm_if_id_put(skb, xp->if_id);
        if (err)
                goto cancel;
        if (xp->xdo.dev) {
                err = copy_user_offload(&xp->xdo, skb);
                if (err)
                        goto cancel;
        }
        /* UINT_MAX means no per-CPU number is set on this state */
        if (x->pcpu_num != UINT_MAX) {
                err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num);
                if (err)
                        goto cancel;
        }

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return err;
}

/*
 * Build an acquire message for @x/@xt/@xp and multicast it to
 * XFRMNLGRP_ACQUIRE.
 *
 * Returns -ENOMEM if no skb could be allocated, otherwise the result of
 * xfrm_nlmsg_multicast().  A failing build_acquire() trips BUG_ON().
 */
static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
                             struct xfrm_policy *xp)
{
        struct net *net = xs_net(x);
        struct sk_buff *msg;
        int rc;

        msg = nlmsg_new(xfrm_acquire_msgsize(x, xp), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        rc = build_acquire(msg, x, xt, xp);
        BUG_ON(rc < 0);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_ACQUIRE);
}

/* User gives us a struct xfrm_userpolicy_info followed by an array of 0
 * or more struct xfrm_user_tmpl entries.
 *
 * Returns a newly allocated main-type policy and stores its direction in
 * *dir.  On failure returns NULL and stores a negative errno in *dir:
 * -EOPNOTSUPP when @opt is not the policy option of the socket's family,
 * -ENOBUFS when allocation fails, -EINVAL for everything else.
 */
static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
                                               u8 *data, int len, int *dir)
{
        struct xfrm_userpolicy_info *info = (struct xfrm_userpolicy_info *)data;
        struct xfrm_user_tmpl *tmpls = (struct xfrm_user_tmpl *)(info + 1);
        struct net *net = sock_net(sk);
        struct xfrm_policy *xp;
        int expected_opt;
        int nr;

        /* pick the one socket option that is valid for this address family */
        switch (sk->sk_family) {
        case AF_INET:
                expected_opt = IP_XFRM_POLICY;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                expected_opt = IPV6_XFRM_POLICY;
                break;
#endif
        default:
                *dir = -EINVAL;
                return NULL;
        }

        if (opt != expected_opt) {
                *dir = -EOPNOTSUPP;
                return NULL;
        }

        /* every validation failure below reports -EINVAL */
        *dir = -EINVAL;

        if (len < sizeof(*info))
                return NULL;
        if (verify_newpolicy_info(info, NULL))
                return NULL;

        /* whatever follows the info struct is interpreted as templates */
        nr = ((len - sizeof(*info)) / sizeof(*tmpls));
        if (validate_tmpl(nr, tmpls, info->sel.family, info->dir, NULL))
                return NULL;

        if (info->dir > XFRM_POLICY_OUT)
                return NULL;

        xp = xfrm_policy_alloc(net, GFP_ATOMIC);
        if (!xp) {
                *dir = -ENOBUFS;
                return NULL;
        }

        copy_from_user_policy(xp, info);
        xp->type = XFRM_POLICY_TYPE_MAIN;
        copy_templates(xp, tmpls, nr);

        *dir = info->dir;

        return xp;
}

static inline unsigned int xfrm_polexpire_msgsize(struct xfrm_policy *xp)
{
        return NLMSG_ALIGN(sizeof(struct xfrm_user_polexpire))
               + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr)
               + nla_total_size(xfrm_user_sec_ctx_size(xp->security))
               + nla_total_size(sizeof(struct xfrm_mark))
               + userpolicy_type_attrsize();
}

/*
 * Fill @skb with an XFRM_MSG_POLEXPIRE message for policy @xp in direction
 * @dir.  The 'hard' flag of the message is c->data.hard normalised to 0/1.
 *
 * If any attribute cannot be added the partially built message is cancelled
 * and that error is returned.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit.
 */
static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
                           int dir, const struct km_event *c)
{
        struct xfrm_user_polexpire *upe;
        int hard = c->data.hard;
        struct nlmsghdr *nlh;
        int err;

        nlh = nlmsg_put(skb, c->portid, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe), 0);
        if (nlh == NULL)
                return -EMSGSIZE;

        upe = nlmsg_data(nlh);
        copy_to_user_policy(xp, &upe->pol, dir);
        err = copy_to_user_tmpl(xp, skb);
        if (!err)
                err = copy_to_user_sec_ctx(xp, skb);
        if (!err)
                err = copy_to_user_policy_type(xp->type, skb);
        if (!err)
                err = xfrm_mark_put(skb, &xp->mark);
        if (!err)
                err = xfrm_if_id_put(skb, xp->if_id);
        if (!err && xp->xdo.dev)
                err = copy_user_offload(&xp->xdo, skb);
        if (err) {
                nlmsg_cancel(skb, nlh);
                return err;
        }
        upe->hard = !!hard;
        /*
         * Clear the bytes of *upe that follow 'hard', as build_expire()
         * does for its payload, so that no uninitialised skb memory is
         * sent to user space.
         */
        memset_after(upe, 0, hard);

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Build a policy-expire message for @xp/@dir and multicast it to
 * XFRMNLGRP_EXPIRE.
 *
 * Returns -ENOMEM if no skb could be allocated, otherwise the result of
 * xfrm_nlmsg_multicast().  A failing build_polexpire() trips BUG_ON().
 */
static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
        struct net *net = xp_net(xp);
        struct sk_buff *msg;
        int rc;

        msg = nlmsg_new(xfrm_polexpire_msgsize(xp), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        rc = build_polexpire(msg, xp, dir, c);
        BUG_ON(rc < 0);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_EXPIRE);
}

static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
        unsigned int len = nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
        struct net *net = xp_net(xp);
        struct xfrm_userpolicy_info *p;
        struct xfrm_userpolicy_id *id;
        struct nlmsghdr *nlh;
        struct sk_buff *skb;
        unsigned int headlen;
        int err;

        headlen = sizeof(*p);
        if (c->event == XFRM_MSG_DELPOLICY) {
                len += nla_total_size(headlen);
                headlen = sizeof(*id);
        }
        len += userpolicy_type_attrsize();
        len += nla_total_size(sizeof(struct xfrm_mark));
        len += NLMSG_ALIGN(headlen);

        skb = nlmsg_new(len, GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        nlh = nlmsg_put(skb, c->portid, c->seq, c->event, headlen, 0);
        err = -EMSGSIZE;
        if (nlh == NULL)
                goto out_free_skb;

        p = nlmsg_data(nlh);
        if (c->event == XFRM_MSG_DELPOLICY) {
                struct nlattr *attr;

                id = nlmsg_data(nlh);
                memset(id, 0, sizeof(*id));
                id->dir = dir;
                if (c->data.byid)
                        id->index = xp->index;
                else
                        memcpy(&id->sel, &xp->selector, sizeof(id->sel));

                attr = nla_reserve(skb, XFRMA_POLICY, sizeof(*p));
                err = -EMSGSIZE;
                if (attr == NULL)
                        goto out_free_skb;

                p = nla_data(attr);
        }

        copy_to_user_policy(xp, p, dir);
        err = copy_to_user_tmpl(xp, skb);
        if (!err)
                err = copy_to_user_policy_type(xp->type, skb);
        if (!err)
                err = xfrm_mark_put(skb, &xp->mark);
        if (!err)
                err = xfrm_if_id_put(skb, xp->if_id);
        if (!err && xp->xdo.dev)
                err = copy_user_offload(&xp->xdo, skb);
        if (err)
                goto out_free_skb;

        nlmsg_end(skb, nlh);

        return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY);

out_free_skb:
        kfree_skb(skb);
        return err;
}

/*
 * Multicast an XFRM_MSG_FLUSHPOLICY message to XFRMNLGRP_POLICY in c->net.
 * The message has no fixed payload; the flushed policy type (c->data.type)
 * is added through copy_to_user_policy_type().
 *
 * Returns -ENOMEM/-EMSGSIZE on allocation or header failure, the helper's
 * error if it fails, otherwise the result of xfrm_nlmsg_multicast().
 */
static int xfrm_notify_policy_flush(const struct km_event *c)
{
        struct net *net = c->net;
        struct nlmsghdr *hdr;
        struct sk_buff *msg;
        int err;

        msg = nlmsg_new(userpolicy_type_attrsize(), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        hdr = nlmsg_put(msg, c->portid, c->seq, XFRM_MSG_FLUSHPOLICY, 0, 0);
        if (!hdr) {
                err = -EMSGSIZE;
                goto drop;
        }

        err = copy_to_user_policy_type(c->data.type, msg);
        if (err)
                goto drop;

        nlmsg_end(msg, hdr);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_POLICY);

drop:
        kfree_skb(msg);
        return err;
}

/*
 * Policy notification entry point: route c->event to the matching
 * notification helper.  An event with no helper is logged and reported as
 * success (0).
 */
static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
        switch (c->event) {
        case XFRM_MSG_POLEXPIRE:
                return xfrm_exp_policy_notify(xp, dir, c);
        case XFRM_MSG_FLUSHPOLICY:
                return xfrm_notify_policy_flush(c);
        case XFRM_MSG_NEWPOLICY:
        case XFRM_MSG_UPDPOLICY:
        case XFRM_MSG_DELPOLICY:
                return xfrm_notify_policy(xp, dir, c);
        default:
                break;
        }

        printk(KERN_NOTICE "xfrm_user: Unknown Policy event %d\n",
               c->event);
        return 0;
}

static inline unsigned int xfrm_report_msgsize(void)
{
        return NLMSG_ALIGN(sizeof(struct xfrm_user_report));
}

/*
 * Fill @skb with an XFRM_MSG_REPORT message carrying @proto and a copy of
 * @sel.  When @addr is non-NULL it is appended as an XFRMA_COADDR
 * attribute; if that fails the message is cancelled.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit, or the
 * nla_put() error.
 */
static int build_report(struct sk_buff *skb, u8 proto,
                        struct xfrm_selector *sel, xfrm_address_t *addr)
{
        struct xfrm_user_report *ur;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_REPORT, sizeof(*ur), 0);
        if (nlh == NULL)
                return -EMSGSIZE;

        ur = nlmsg_data(nlh);
        /*
         * Zero the payload before filling it in so that any padding the
         * compiler places inside struct xfrm_user_report does not carry
         * uninitialised skb memory to user space.
         */
        memset(ur, 0, sizeof(*ur));
        ur->proto = proto;
        memcpy(&ur->sel, sel, sizeof(ur->sel));

        if (addr) {
                int err = nla_put(skb, XFRMA_COADDR, sizeof(*addr), addr);
                if (err) {
                        nlmsg_cancel(skb, nlh);
                        return err;
                }
        }
        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Build a report message from @proto/@sel/@addr and multicast it to
 * XFRMNLGRP_REPORT in @net.
 *
 * Returns -ENOMEM if no skb could be allocated, otherwise the result of
 * xfrm_nlmsg_multicast().  A failing build_report() trips BUG_ON().
 */
static int xfrm_send_report(struct net *net, u8 proto,
                            struct xfrm_selector *sel, xfrm_address_t *addr)
{
        struct sk_buff *msg = nlmsg_new(xfrm_report_msgsize(), GFP_ATOMIC);
        int rc;

        if (!msg)
                return -ENOMEM;

        rc = build_report(msg, proto, sel, addr);
        BUG_ON(rc < 0);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_REPORT);
}

/*
 * Payload size to allocate for an XFRM_MSG_MAPPING message: just the
 * netlink-aligned struct xfrm_user_mapping, no attributes.
 */
static inline unsigned int xfrm_mapping_msgsize(void)
{
        const unsigned int payload = NLMSG_ALIGN(sizeof(struct xfrm_user_mapping));

        return payload;
}

/*
 * Fill @skb with an XFRM_MSG_MAPPING message for state @x: the state's
 * identity and reqid, its current source address and encapsulation source
 * port, and the new address/port pair (@new_saddr, @new_sport).
 *
 * x->encap is dereferenced unconditionally, so the caller must have
 * checked that it is non-NULL.
 *
 * Returns 0 on success, -EMSGSIZE if the header does not fit.
 */
static int build_mapping(struct sk_buff *skb, struct xfrm_state *x,
                         xfrm_address_t *new_saddr, __be16 new_sport)
{
        struct xfrm_user_mapping *um;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MAPPING, sizeof(*um), 0);
        if (nlh == NULL)
                return -EMSGSIZE;

        um = nlmsg_data(nlh);

        /*
         * Clear the id before filling it field by field, as
         * xfrm_notify_sa() does for the same structure, so that padding
         * inside it does not carry uninitialised skb memory to user space.
         */
        memset(&um->id, 0, sizeof(um->id));
        memcpy(&um->id.daddr, &x->id.daddr, sizeof(um->id.daddr));
        um->id.spi = x->id.spi;
        um->id.family = x->props.family;
        um->id.proto = x->id.proto;
        memcpy(&um->new_saddr, new_saddr, sizeof(um->new_saddr));
        memcpy(&um->old_saddr, &x->props.saddr, sizeof(um->old_saddr));
        um->new_sport = new_sport;
        um->old_sport = x->encap->encap_sport;
        um->reqid = x->props.reqid;

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Build a mapping-change message for @x with the new address @ipaddr and
 * port @sport, and multicast it to XFRMNLGRP_MAPPING.
 *
 * Returns -EINVAL unless @x is an ESP state with encapsulation data,
 * -ENOMEM if no skb could be allocated, otherwise the result of
 * xfrm_nlmsg_multicast().  A failing build_mapping() trips BUG_ON().
 */
static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
                             __be16 sport)
{
        struct net *net = xs_net(x);
        struct sk_buff *msg;
        int rc;

        if (x->id.proto != IPPROTO_ESP || !x->encap)
                return -EINVAL;

        msg = nlmsg_new(xfrm_mapping_msgsize(), GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        rc = build_mapping(msg, x, ipaddr, sport);
        BUG_ON(rc < 0);

        return xfrm_nlmsg_multicast(net, msg, 0, XFRMNLGRP_MAPPING);
}

static bool xfrm_is_alive(const struct km_event *c)
{
        return (bool)xfrm_acquire_is_on(c->net);
}

/*
 * Key-manager operations implemented by this netlink interface.  The table
 * is handed to xfrm_register_km() at module init and to
 * xfrm_unregister_km() at module exit.
 */
static struct xfrm_mgr netlink_mgr = {
        .notify                = xfrm_send_state_notify,
        .acquire        = xfrm_send_acquire,
        .compile_policy        = xfrm_compile_policy,
        .notify_policy        = xfrm_send_policy_notify,
        .report                = xfrm_send_report,
        .migrate        = xfrm_send_migrate,
        .new_mapping        = xfrm_send_mapping,
        .is_alive        = xfrm_is_alive,
};

/*
 * Per-namespace init: create the NETLINK_XFRM kernel socket for @net, with
 * xfrm_netlink_rcv as its input handler and XFRMNLGRP_MAX groups.
 *
 * The socket is stored twice: in nlsk_stash, which is never cleared, and
 * published through the RCU-assigned nlsk pointer.
 *
 * Returns 0 on success, -ENOMEM if the socket could not be created.
 */
static int __net_init xfrm_user_net_init(struct net *net)
{
        struct netlink_kernel_cfg cfg = {
                .groups        = XFRMNLGRP_MAX,
                .input        = xfrm_netlink_rcv,
        };
        struct sock *sk = netlink_kernel_create(net, NETLINK_XFRM, &cfg);

        if (!sk)
                return -ENOMEM;

        net->xfrm.nlsk_stash = sk; /* Don't set to NULL */
        rcu_assign_pointer(net->xfrm.nlsk, sk);
        return 0;
}

/*
 * Per-namespace pre-exit: clear the published nlsk pointer.  The socket
 * itself remains reachable through nlsk_stash until the exit_batch hook
 * releases it.
 */
static void __net_exit xfrm_user_net_pre_exit(struct net *net)
{
        struct netns_xfrm *xfrm = &net->xfrm;

        RCU_INIT_POINTER(xfrm->nlsk, NULL);
}

/*
 * Batched per-namespace exit: release the stashed netlink socket of every
 * namespace on @net_exit_list.
 */
static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
{
        struct net *ns;

        list_for_each_entry(ns, net_exit_list, exit_list) {
                netlink_kernel_release(ns->xfrm.nlsk_stash);
        }
}

/*
 * Per-network-namespace hooks of this module: socket creation on init,
 * unpublishing of the socket pointer on pre_exit, and socket release for a
 * whole batch of namespaces on exit_batch.
 */
static struct pernet_operations xfrm_user_net_ops = {
        .init            = xfrm_user_net_init,
        .pre_exit   = xfrm_user_net_pre_exit,
        .exit_batch = xfrm_user_net_exit,
};

/*
 * Module init: register the per-namespace operations and, only if that
 * succeeded, register netlink_mgr as a key manager.
 *
 * Returns 0 on success or the negative value from
 * register_pernet_subsys().
 */
static int __init xfrm_user_init(void)
{
        int rv;

        printk(KERN_INFO "Initializing XFRM netlink socket\n");

        rv = register_pernet_subsys(&xfrm_user_net_ops);
        if (rv >= 0) {
                xfrm_register_km(&netlink_mgr);
                rv = 0;
        }

        return rv;
}

/*
 * Module exit: undo xfrm_user_init() in reverse order, dropping the key
 * manager first and the per-namespace operations second.
 */
static void __exit xfrm_user_exit(void)
{
        struct xfrm_mgr *mgr = &netlink_mgr;

        xfrm_unregister_km(mgr);
        unregister_pernet_subsys(&xfrm_user_net_ops);
}

/* Module entry/exit points and metadata for the XFRM netlink interface. */
module_init(xfrm_user_init);
module_exit(xfrm_user_exit);
MODULE_DESCRIPTION("XFRM User interface");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);















































  148 
































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM x86_fpu

#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FPU_H

#include <linux/tracepoint.h>

/*
 * Event class shared by the x86_fpu tracepoints defined with DEFINE_EVENT()
 * in this header.
 *
 * Each record stores the fpu pointer, the result of
 * test_thread_flag(TIF_NEED_FPU_LOAD), and - only when the CPU reports
 * X86_FEATURE_OSXSAVE - the xfeatures and xcomp_bv words of the xsave
 * header in fpu->fpstate.
 *
 * NOTE(review): without OSXSAVE the two u64 fields are not assigned here
 * but are still printed; confirm what they contain in that case.
 */
DECLARE_EVENT_CLASS(x86_fpu,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu),

        TP_STRUCT__entry(
                __field(struct fpu *, fpu)
                __field(bool, load_fpu)
                __field(u64, xfeatures)
                __field(u64, xcomp_bv)
                ),

        TP_fast_assign(
                __entry->fpu                = fpu;
                __entry->load_fpu        = test_thread_flag(TIF_NEED_FPU_LOAD);
                if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
                        __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures;
                        __entry->xcomp_bv  = fpu->fpstate->regs.xsave.header.xcomp_bv;
                }
        ),
        TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
                        __entry->fpu,
                        __entry->load_fpu,
                        __entry->xfeatures,
                        __entry->xcomp_bv
        )
);

/* Tracepoint x86_fpu_before_save: an instance of the x86_fpu event class. */
DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Tracepoint x86_fpu_after_save: an instance of the x86_fpu event class. */
DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Tracepoint x86_fpu_regs_activated: an instance of the x86_fpu event class. */
DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Tracepoint x86_fpu_regs_deactivated: an instance of the x86_fpu event class. */
DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Tracepoint x86_fpu_dropped: an instance of the x86_fpu event class. */
DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Tracepoint x86_fpu_copy_dst: an instance of the x86_fpu event class. */
DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Tracepoint x86_fpu_xstate_check_failed: an instance of the x86_fpu event class. */
DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH asm/trace/
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE fpu
#endif /* _TRACE_FPU_H */

/* This part must be outside protection */
#include <trace/define_trace.h>































































































































































































































































































































































    1 




  308 







































  310 
   15 































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SPINLOCK_H
#define __LINUX_SPINLOCK_H
#define __LINUX_INSIDE_SPINLOCK_H

/*
 * include/linux/spinlock.h - generic spinlock/rwlock declarations
 *
 * here's the role of the various spinlock/rwlock related include files:
 *
 * on SMP builds:
 *
 *  asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the
 *                        initializers
 *
 *  linux/spinlock_types_raw.h:
 *                          The raw types and initializers
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  asm/spinlock.h:       contains the arch_spin_*()/etc. lowlevel
 *                        implementations, mostly inline assembly code
 *
 *   (also included on UP-debug builds:)
 *
 *  linux/spinlock_api_smp.h:
 *                        contains the prototypes for the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 *
 * on UP builds:
 *
 *  linux/spinlock_types_up.h:
 *                        contains the generic, simplified UP spinlock type.
 *                        (which is an empty structure on non-debug builds)
 *
 *  linux/spinlock_types_raw.h:
 *                          The raw RT types and initializers
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  linux/spinlock_up.h:
 *                        contains the arch_spin_*()/etc. version of UP
 *                        builds. (which are NOPs on non-debug, non-preempt
 *                        builds)
 *
 *   (included on UP-non-debug builds:)
 *
 *  linux/spinlock_api_up.h:
 *                        builds the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 */

#include <linux/typecheck.h>
#include <linux/preempt.h>
#include <linux/linkage.h>
#include <linux/compiler.h>
#include <linux/irqflags.h>
#include <linux/thread_info.h>
#include <linux/stringify.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/barrier.h>
#include <asm/mmiowb.h>


/*
 * Must define these before including other files, inline functions need them
 */
/*
 * Assembler-string helpers for placing code in a separate subsection.
 *
 * LOCK_SECTION_NAME is a label name derived from KBUILD_BASENAME.
 * LOCK_SECTION_START(extra) switches to ".subsection 1", emits @extra, and
 * defines the label unless it already exists (".ifndef").
 * LOCK_SECTION_END switches back with ".previous".
 */
#define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME

#define LOCK_SECTION_START(extra)               \
        ".subsection 1\n\t"                     \
        extra                                   \
        ".ifndef " LOCK_SECTION_NAME "\n\t"     \
        LOCK_SECTION_NAME ":\n\t"               \
        ".endif\n"

#define LOCK_SECTION_END                        \
        ".previous\n\t"

#define __lockfunc __section(".spinlock.text")

/*
 * Pull the arch_spinlock_t and arch_rwlock_t definitions:
 */
#include <linux/spinlock_types.h>

/*
 * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them):
 */
#ifdef CONFIG_SMP
# include <asm/spinlock.h>
#else
# include <linux/spinlock_up.h>
#endif

/*
 * raw_spin_lock_init(lock) - initialise a raw spinlock.
 *
 * With CONFIG_DEBUG_SPINLOCK every expansion site declares its own static
 * lock_class_key and calls __raw_spin_lock_init() with the stringified lock
 * expression as the name and LD_WAIT_SPIN as the last argument.  Otherwise
 * the lock is simply assigned __RAW_SPIN_LOCK_UNLOCKED(lock).
 */
#ifdef CONFIG_DEBUG_SPINLOCK
  extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
                                   struct lock_class_key *key, short inner);

# define raw_spin_lock_init(lock)                                        \
do {                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN);        \
} while (0)

#else
# define raw_spin_lock_init(lock)                                \
        do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)
#endif

/* Forward to arch_spin_is_locked() on the lock's embedded raw_lock. */
#define raw_spin_is_locked(lock)        arch_spin_is_locked(&(lock)->raw_lock)

/*
 * raw_spin_is_contended(lock): forwards to arch_spin_is_contended() when
 * the architecture defines it; otherwise evaluates @lock for its side
 * effects only and yields 0.
 */
#ifdef arch_spin_is_contended
#define raw_spin_is_contended(lock)        arch_spin_is_contended(&(lock)->raw_lock)
#else
#define raw_spin_is_contended(lock)        (((void)(lock), 0))
#endif /*arch_spin_is_contended*/

/*
 * smp_mb__after_spinlock() provides the equivalent of a full memory barrier
 * between program-order earlier lock acquisitions and program-order later
 * memory accesses.
 *
 * This guarantees that the following two properties hold:
 *
 *   1) Given the snippet:
 *
 *          { X = 0;  Y = 0; }
 *
 *          CPU0                                CPU1
 *
 *          WRITE_ONCE(X, 1);                WRITE_ONCE(Y, 1);
 *          spin_lock(S);                        smp_mb();
 *          smp_mb__after_spinlock();        r1 = READ_ONCE(X);
 *          r0 = READ_ONCE(Y);
 *          spin_unlock(S);
 *
 *      it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0)
 *      and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments
 *      preceding the call to smp_mb__after_spinlock() in __schedule() and in
 *      try_to_wake_up().
 *
 *   2) Given the snippet:
 *
 *  { X = 0;  Y = 0; }
 *
 *  CPU0                CPU1                                CPU2
 *
 *  spin_lock(S);        spin_lock(S);                        r1 = READ_ONCE(Y);
 *  WRITE_ONCE(X, 1);        smp_mb__after_spinlock();        smp_rmb();
 *  spin_unlock(S);        r0 = READ_ONCE(X);                r2 = READ_ONCE(X);
 *                        WRITE_ONCE(Y, 1);
 *                        spin_unlock(S);
 *
 *      it is forbidden that CPU0's critical section executes before CPU1's
 *      critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1)
 *      and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments
 *      preceding the calls to smp_rmb() in try_to_wake_up() for similar
 *      snippets but "projected" onto two CPUs.
 *
 * Property (2) upgrades the lock to an RCsc lock.
 *
 * Since most load-store architectures implement ACQUIRE with an smp_mb() after
 * the LL/SC loop, they need no further barriers. Similarly all our TSO
 * architectures imply an smp_mb() for each atomic instruction and equally don't
 * need more.
 *
 * Architectures that can implement ACQUIRE better need to take care.
 */
/* Fallback definition, used only when nothing earlier defined the macro. */
#ifndef smp_mb__after_spinlock
#define smp_mb__after_spinlock()        kcsan_mb()
#endif

#ifdef CONFIG_DEBUG_SPINLOCK
 /*
  * CONFIG_DEBUG_SPINLOCK=y: the low-level lock/trylock/unlock operations are
  * only declared here; their definitions live outside this header.
  */
 extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
 extern int do_raw_spin_trylock(raw_spinlock_t *lock);
 extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
#else
/*
 * Non-debug lock path: annotate the acquisition with __acquire(), take the
 * architecture lock, then call mmiowb_spin_lock().
 */
static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
{
        __acquire(lock);
        arch_spin_lock(&lock->raw_lock);
        mmiowb_spin_lock();
}

/*
 * Non-debug trylock path. Returns exactly what arch_spin_trylock() returned;
 * mmiowb_spin_lock() is called only when that value is non-zero.
 */
static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
{
        int acquired = arch_spin_trylock(&lock->raw_lock);

        /* Lock not taken: nothing further to do. */
        if (!acquired)
                return acquired;

        mmiowb_spin_lock();
        return acquired;
}

/*
 * Non-debug unlock path, mirroring do_raw_spin_lock() in reverse order:
 * mmiowb_spin_unlock(), release the architecture lock, then annotate the
 * release with __release().
 */
static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
{
        mmiowb_spin_unlock();
        arch_spin_unlock(&lock->raw_lock);
        __release(lock);
}
#endif

/*
 * Define the various spin_lock methods.  Note we define these
 * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The
 * various methods are defined as nops in the case they are not
 * required.
 */
/* Trylock, wrapped in __cond_lock() to annotate the conditional acquisition. */
#define raw_spin_trylock(lock)        __cond_lock(lock, _raw_spin_trylock(lock))

/* Unconditional lock: direct mapping onto _raw_spin_lock(). */
#define raw_spin_lock(lock)        _raw_spin_lock(lock)

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * With CONFIG_DEBUG_LOCK_ALLOC the nesting information is passed through:
 * @subclass unchanged, @nest_lock as a pointer to its dep_map (which is
 * type-checked to be a struct lockdep_map).
 */
# define raw_spin_lock_nested(lock, subclass) \
        _raw_spin_lock_nested(lock, subclass)

# define raw_spin_lock_nest_lock(lock, nest_lock)                        \
         do {                                                                \
                 typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\
                 _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map);        \
         } while (0)
#else
/*
 * Always evaluate the 'subclass' argument to avoid that the compiler
 * warns about set-but-not-used variables when building with
 * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1.
 */
# define raw_spin_lock_nested(lock, subclass)                \
        _raw_spin_lock(((void)(subclass), (lock)))
/* Here @nest_lock is dropped without being evaluated. */
# define raw_spin_lock_nest_lock(lock, nest_lock)        _raw_spin_lock(lock)
#endif

/*
 * raw_spin_lock_irqsave() / raw_spin_lock_irqsave_nested()
 *
 * In every variant @flags must be an lvalue of type unsigned long; this is
 * enforced at compile time by typecheck().
 *
 * SMP or CONFIG_DEBUG_SPINLOCK: _raw_spin_lock_irqsave() returns the flags
 * value, which is assigned to @flags. The _nested form forwards @subclass
 * only with CONFIG_DEBUG_LOCK_ALLOC; otherwise @subclass is ignored.
 *
 * Otherwise: _raw_spin_lock_irqsave() is invoked with @flags as its second
 * argument, and the _nested form is an alias of the plain one.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)

#define raw_spin_lock_irqsave(lock, flags)                        \
        do {                                                \
                typecheck(unsigned long, flags);        \
                flags = _raw_spin_lock_irqsave(lock);        \
        } while (0)

#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave_nested(lock, subclass);        \
        } while (0)
#else
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave(lock);                        \
        } while (0)
#endif

#else

#define raw_spin_lock_irqsave(lock, flags)                \
        do {                                                \
                typecheck(unsigned long, flags);        \
                _raw_spin_lock_irqsave(lock, flags);        \
        } while (0)

#define raw_spin_lock_irqsave_nested(lock, flags, subclass)        \
        raw_spin_lock_irqsave(lock, flags)

#endif

/* Direct mappings onto the corresponding _raw_*() implementations. */
#define raw_spin_lock_irq(lock)                _raw_spin_lock_irq(lock)
#define raw_spin_lock_bh(lock)                _raw_spin_lock_bh(lock)
#define raw_spin_unlock(lock)                _raw_spin_unlock(lock)
#define raw_spin_unlock_irq(lock)        _raw_spin_unlock_irq(lock)

/* Unlock and pass @flags back; @flags must be an unsigned long (typecheck). */
#define raw_spin_unlock_irqrestore(lock, flags)                \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                _raw_spin_unlock_irqrestore(lock, flags);        \
        } while (0)
#define raw_spin_unlock_bh(lock)        _raw_spin_unlock_bh(lock)

/* Trylock (bh variant), wrapped in __cond_lock() like raw_spin_trylock(). */
#define raw_spin_trylock_bh(lock) \
        __cond_lock(lock, _raw_spin_trylock_bh(lock))

/*
 * Disable local interrupts, then try to take @lock. Evaluates to 1 on
 * success (interrupts stay disabled); on failure interrupts are re-enabled
 * and the expression evaluates to 0.
 */
#define raw_spin_trylock_irq(lock) \
({ \
        local_irq_disable(); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_enable(); 0;  }); \
})

/*
 * As raw_spin_trylock_irq(), but the interrupt state is saved in @flags
 * first and restored from @flags when the trylock fails.
 */
#define raw_spin_trylock_irqsave(lock, flags) \
({ \
        local_irq_save(flags); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_restore(flags); 0; }); \
})

#ifndef CONFIG_PREEMPT_RT
/* Include rwlock functions for !RT */
#include <linux/rwlock.h>
#endif

/*
 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
# include <linux/spinlock_api_smp.h>
#else
# include <linux/spinlock_api_up.h>
#endif

/* Non PREEMPT_RT kernel, map to raw spinlocks: */
#ifndef CONFIG_PREEMPT_RT

/*
 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
 */

/*
 * Type-checked accessor: takes a spinlock_t pointer and hands back the
 * address of its embedded raw spinlock (the ->rlock member).
 */
static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
{
        raw_spinlock_t *raw = &lock->rlock;

        return raw;
}

/*
 * spin_lock_init() - runtime initializer for a spinlock_t
 *
 * CONFIG_DEBUG_SPINLOCK=y: initializes the embedded raw lock through
 * __raw_spin_lock_init(), using the stringified @lock expression as the name,
 * a static lock_class_key private to each expansion site, and LD_WAIT_CONFIG
 * as the "inner" argument.
 *
 * CONFIG_DEBUG_SPINLOCK=n: spinlock_check() is called only so that the
 * argument is type-checked (its result is discarded), then the lock is
 * assigned __SPIN_LOCK_UNLOCKED().
 */
#ifdef CONFIG_DEBUG_SPINLOCK

# define spin_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __raw_spin_lock_init(spinlock_check(lock),                \
                             #lock, &__key, LD_WAIT_CONFIG);        \
} while (0)

#else

# define spin_lock_init(_lock)                        \
do {                                                \
        spinlock_check(_lock);                        \
        *(_lock) = __SPIN_LOCK_UNLOCKED(_lock);        \
} while (0)

#endif

/* Acquire @lock by taking its embedded raw spinlock. */
static __always_inline void spin_lock(spinlock_t *lock)
{
        raw_spin_lock(&lock->rlock);
}

/* As spin_lock(), using the _bh variant of the raw lock operation. */
static __always_inline void spin_lock_bh(spinlock_t *lock)
{
        raw_spin_lock_bh(&lock->rlock);
}

/* Try to acquire @lock; returns the result of raw_spin_trylock(). */
static __always_inline int spin_trylock(spinlock_t *lock)
{
        return raw_spin_trylock(&lock->rlock);
}

/*
 * Nested-locking variants. These are macros rather than inline functions;
 * spinlock_check() converts @lock to its raw spinlock and type-checks it.
 */
#define spin_lock_nested(lock, subclass)                        \
do {                                                                \
        raw_spin_lock_nested(spinlock_check(lock), subclass);        \
} while (0)

#define spin_lock_nest_lock(lock, nest_lock)                                \
do {                                                                        \
        raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);        \
} while (0)

/* Acquire @lock using the _irq variant of the raw lock operation. */
static __always_inline void spin_lock_irq(spinlock_t *lock)
{
        raw_spin_lock_irq(&lock->rlock);
}

/*
 * irqsave variants. These are macros because @flags is written by the
 * underlying raw_spin_lock_irqsave*() macros and so must be an lvalue in the
 * caller's scope.
 */
#define spin_lock_irqsave(lock, flags)                                \
do {                                                                \
        raw_spin_lock_irqsave(spinlock_check(lock), flags);        \
} while (0)

#define spin_lock_irqsave_nested(lock, flags, subclass)                        \
do {                                                                        \
        raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
} while (0)

/* Release @lock; counterpart of spin_lock(). */
static __always_inline void spin_unlock(spinlock_t *lock)
{
        raw_spin_unlock(&lock->rlock);
}

/* Release @lock; counterpart of spin_lock_bh(). */
static __always_inline void spin_unlock_bh(spinlock_t *lock)
{
        raw_spin_unlock_bh(&lock->rlock);
}

/* Release @lock; counterpart of spin_lock_irq(). */
static __always_inline void spin_unlock_irq(spinlock_t *lock)
{
        raw_spin_unlock_irq(&lock->rlock);
}

/*
 * Release @lock; counterpart of spin_lock_irqsave(). @flags is passed by
 * value to raw_spin_unlock_irqrestore().
 */
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
        raw_spin_unlock_irqrestore(&lock->rlock, flags);
}

/* Trylock, _bh variant; returns the result of raw_spin_trylock_bh(). */
static __always_inline int spin_trylock_bh(spinlock_t *lock)
{
        return raw_spin_trylock_bh(&lock->rlock);
}

/* Trylock, _irq variant; returns the result of raw_spin_trylock_irq(). */
static __always_inline int spin_trylock_irq(spinlock_t *lock)
{
        return raw_spin_trylock_irq(&lock->rlock);
}

/*
 * Trylock, irqsave variant. A macro because @flags is written by
 * raw_spin_trylock_irqsave(); evaluates to that macro's result.
 */
#define spin_trylock_irqsave(lock, flags)                        \
({                                                                \
        raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})

/**
 * spin_is_locked() - Check whether a spinlock is locked.
 * @lock: Pointer to the spinlock.
 *
 * This function is NOT required to provide any memory ordering
 * guarantees; it could be used for debugging purposes or, when
 * additional synchronization is needed, accompanied with other
 * constructs (memory barriers) enforcing the synchronization.
 *
 * Returns: 1 if @lock is locked, 0 otherwise.
 *
 * Note that the function only tells you that the spinlock is
 * seen to be locked, not that it is locked on your CPU.
 *
 * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n,
 * the return value is always 0 (see include/linux/spinlock_up.h).
 * Therefore you should not rely heavily on the return value.
 */
static __always_inline int spin_is_locked(spinlock_t *lock)
{
        /* The query is answered by the embedded raw spinlock. */
        raw_spinlock_t *raw = &lock->rlock;

        return raw_spin_is_locked(raw);
}

/* Report the result of raw_spin_is_contended() on the embedded raw lock. */
static __always_inline int spin_is_contended(spinlock_t *lock)
{
        raw_spinlock_t *raw = &lock->rlock;

        return raw_spin_is_contended(raw);
}

#define assert_spin_locked(lock)        assert_raw_spin_locked(&(lock)->rlock)

#else  /* !CONFIG_PREEMPT_RT */
# include <linux/spinlock_rt.h>
#endif /* CONFIG_PREEMPT_RT */

/*
 * Should the holder of @lock break its critical section because another
 * task is waiting for the lock?
 *
 * Always 0 when the preemption model is not preemptible; otherwise the
 * result of spin_is_contended(). (Technically this does not depend on
 * CONFIG_PREEMPTION, but on a general need for low latency.)
 */
static inline int spin_needbreak(spinlock_t *lock)
{
        return preempt_model_preemptible() ? spin_is_contended(lock) : 0;
}

/*
 * Should the holder of the rwlock @lock break its critical section because
 * another task is waiting for it?
 *
 * Returns non-zero if there is another task waiting on the rwlock. Returns
 * zero if the lock is not contended, if the preemption model is not
 * preemptible, or if the system / underlying rwlock implementation does not
 * support contention detection. (Technically this does not depend on
 * CONFIG_PREEMPTION, but on a general need for low latency.)
 */
static inline int rwlock_needbreak(rwlock_t *lock)
{
        return preempt_model_preemptible() ? rwlock_is_contended(lock) : 0;
}

/*
 * Pull the atomic_t declaration:
 * (asm-mips/atomic.h needs above definitions)
 */
#include <linux/atomic.h>
/**
 * atomic_dec_and_lock - lock on reaching reference count zero
 * @atomic: the atomic counter
 * @lock: the spinlock in question
 *
 * Decrements @atomic by 1.  If the result is 0, returns true and locks
 * @lock.  Returns false for all other cases.
 */
extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
#define atomic_dec_and_lock(atomic, lock) \
                __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))

/*
 * irqsave flavour: @flags is an lvalue whose address is handed to the
 * out-of-line helper.
 * NOTE(review): presumably the helper stores the saved interrupt state there
 * when it returns with the lock held - confirm against its definition.
 */
extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                        unsigned long *flags);
#define atomic_dec_and_lock_irqsave(atomic, lock, flags) \
                __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags)))

/* Same pair of interfaces for a raw_spinlock_t instead of a spinlock_t. */
extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock);
#define atomic_dec_and_raw_lock(atomic, lock) \
                __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock))

extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
                                        unsigned long *flags);
#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \
                __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags)))

/*
 * Out-of-line worker behind alloc_bucket_spinlocks(); @name and @key are
 * supplied by the wrapper macro below.
 */
int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask,
                             size_t max_size, unsigned int cpu_mult,
                             gfp_t gfp, const char *name,
                             struct lock_class_key *key);

/*
 * alloc_bucket_spinlocks() - call __alloc_bucket_spinlocks() with a static
 * lock_class_key that is private to the expansion site and with the
 * stringified @locks expression as the name. Evaluates to the worker's int
 * return value.
 */
#define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp)    \
        ({                                                                     \
                static struct lock_class_key key;                             \
                int ret;                                                     \
                                                                             \
                ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size,   \
                                               cpu_mult, gfp, #locks, &key); \
                ret;                                                             \
        })

/*
 * NOTE(review): presumably releases an array obtained through
 * alloc_bucket_spinlocks() - confirm against the definition.
 */
void free_bucket_spinlocks(spinlock_t *locks);

/*
 * Lock guards for raw_spinlock_t (DEFINE_LOCK_GUARD_1*() come from
 * <linux/cleanup.h>).
 *
 * Each DEFINE_LOCK_GUARD_1() names a guard, the lock type, the expression
 * that takes the lock and the expression that drops it again; _T->lock is
 * the guarded lock pointer. The irqsave guard passes an extra
 * "unsigned long flags" argument and refers to it as _T->flags. Each
 * DEFINE_LOCK_GUARD_1_COND() adds a "_try" flavour built on the matching
 * trylock operation.
 */
DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t,
                    raw_spin_lock(_T->lock),
                    raw_spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
                    raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
                    raw_spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t,
                    raw_spin_lock_irq(_T->lock),
                    raw_spin_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_bh, raw_spinlock_t,
                    raw_spin_lock_bh(_T->lock),
                    raw_spin_unlock_bh(_T->lock))

DEFINE_LOCK_GUARD_1_COND(raw_spinlock_bh, _try, raw_spin_trylock_bh(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
                    raw_spin_lock_irqsave(_T->lock, _T->flags),
                    raw_spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try,
                         raw_spin_trylock_irqsave(_T->lock, _T->flags))

/*
 * Lock guards for spinlock_t: plain, _irq, _bh and _irqsave, each with a
 * "_try" flavour built on the matching trylock operation. The irqsave guard
 * passes an extra "unsigned long flags" argument and refers to it as
 * _T->flags.
 */
DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
                    spin_lock(_T->lock),
                    spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
                    spin_lock_irq(_T->lock),
                    spin_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
                         spin_trylock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_bh, spinlock_t,
                    spin_lock_bh(_T->lock),
                    spin_unlock_bh(_T->lock))

DEFINE_LOCK_GUARD_1_COND(spinlock_bh, _try,
                         spin_trylock_bh(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
                    spin_lock_irqsave(_T->lock, _T->flags),
                    spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
                         spin_trylock_irqsave(_T->lock, _T->flags))

/*
 * Lock guards for rwlock_t, separately for the read side and the write side:
 * plain, _irq and _irqsave. No "_try" flavours are defined here. The irqsave
 * guards pass an extra "unsigned long flags" argument and refer to it as
 * _T->flags.
 */
DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
                    read_lock(_T->lock),
                    read_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
                    read_lock_irq(_T->lock),
                    read_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
                    read_lock_irqsave(_T->lock, _T->flags),
                    read_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
                    write_lock(_T->lock),
                    write_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
                    write_lock_irq(_T->lock),
                    write_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
                    write_lock_irqsave(_T->lock, _T->flags),
                    write_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

#undef __LINUX_INSIDE_SPINLOCK_H
#endif /* __LINUX_SPINLOCK_H */


































































   92 




   76 
   93 

   30 
























































































































































    9 






















































































































































































   27 
































































   13 



























   13 

















    3 


























































































































  316 
















  316 












  315 














  315 
















































































































































    2 

    2 





































    3 














    3 



































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SEQLOCK_H
#define __LINUX_SEQLOCK_H

/*
 * seqcount_t / seqlock_t - a reader-writer consistency mechanism with
 * lockless readers (read-only retry loops), and no writer starvation.
 *
 * See Documentation/locking/seqlock.rst
 *
 * Copyrights:
 * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli
 * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH
 */

#include <linux/compiler.h>
#include <linux/kcsan-checks.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/preempt.h>
#include <linux/seqlock_types.h>
#include <linux/spinlock.h>

#include <asm/processor.h>

/*
 * The seqlock seqcount_t interface does not prescribe a precise sequence of
 * read begin/retry/end. For readers, typically there is a call to
 * read_seqcount_begin() and read_seqcount_retry(), however, there are more
 * esoteric cases which do not follow this pattern.
 *
 * As a consequence, we take the following best-effort approach for raw usage
 * via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
 * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
 * atomics; if there is a matching read_seqcount_retry() call, no following
 * memory operations are considered atomic. Usage of the seqlock_t interface
 * is not affected.
 */
/* Number of memory accesses pessimistically marked atomic (see above). */
#define KCSAN_SEQLOCK_REGION_MAX 1000

/*
 * __seqcount_init() - common runtime initializer behind seqcount_init()
 * @s:    Pointer to the seqcount_t instance
 * @name: Name handed to lockdep_init_map()
 * @key:  Lock class key handed to lockdep_init_map()
 *
 * Sets up the lockdep map of @s and resets the sequence number to 0.
 */
static inline void __seqcount_init(seqcount_t *s, const char *name,
                                          struct lock_class_key *key)
{
        /*
         * Make sure we are not reinitializing a held lock:
         */
        lockdep_init_map(&s->dep_map, name, key, 0);
        s->sequence = 0;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/* Designated initializer for the dep_map member, named after @lockname. */
# define SEQCOUNT_DEP_MAP_INIT(lockname)                                \
                .dep_map = { .name = #lockname }

/**
 * seqcount_init() - runtime initializer for seqcount_t
 * @s: Pointer to the seqcount_t instance
 *
 * Every expansion site gets its own static lock_class_key; the stringified
 * @s expression is used as the name.
 */
# define seqcount_init(s)                                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                __seqcount_init((s), #s, &__key);                        \
        } while (0)

/*
 * Tell lockdep about a read-side access to @s: with local interrupts saved
 * and disabled, acquire the counter's dep_map for reading and release it
 * again straight away, then restore the interrupt state.
 */
static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
{
        unsigned long irq_state;
        /* The lockdep calls below take a non-const dep_map. */
        seqcount_t *sc = (seqcount_t *)s;

        local_irq_save(irq_state);
        seqcount_acquire_read(&sc->dep_map, 0, 0, _RET_IP_);
        seqcount_release(&sc->dep_map, _RET_IP_);
        local_irq_restore(irq_state);
}

#else
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC: no dep_map initializer, seqcount_init()
 * passes NULL for both name and key, and the reader-access hook expands to
 * nothing.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)
# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
# define seqcount_lockdep_reader_access(x)
#endif

/**
 * SEQCNT_ZERO() - static initializer for seqcount_t
 * @name: Name of the seqcount_t instance
 *
 * Expands to a brace initializer that sets the sequence number to 0,
 * followed by SEQCOUNT_DEP_MAP_INIT(@name).
 */
#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }

/*
 * Sequence counters with associated locks (seqcount_LOCKNAME_t)
 *
 * A sequence counter which associates the lock used for writer
 * serialization at initialization time. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * For associated locks which do not implicitly disable preemption,
 * preemption protection is enforced in the write side function.
 *
 * Lockdep is never used in any of the raw write variants.
 *
 * See Documentation/locking/seqlock.rst
 */

/*
 * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
 * @seqcount:        The real sequence counter
 * @lock:        Pointer to the associated lock
 *
 * A plain sequence counter with external writer synchronization by
 * LOCKNAME @lock. The lock is associated to the sequence counter in the
 * static initializer or init function. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * LOCKNAME:        raw_spinlock, spinlock, rwlock or mutex
 */

/*
 * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t
 * @s:                Pointer to the seqcount_LOCKNAME_t instance
 * @lock:        Pointer to the associated lock
 */

/*
 * Generic implementation behind the four typed init macros below.
 * @lockname selects the seqcount_<lockname>_t type: @s is first assigned to
 * a pointer of that type (so a mismatching argument is diagnosed at compile
 * time), the embedded seqcount is initialized, and the lock association is
 * recorded inside __SEQ_LOCK().
 * NOTE(review): __SEQ_LOCK() is defined outside this view; presumably it
 * drops its argument in configurations that keep no lock pointer - confirm.
 */
#define seqcount_LOCKNAME_init(s, _lock, lockname)                        \
        do {                                                                \
                seqcount_##lockname##_t *____s = (s);                        \
                seqcount_init(&____s->seqcount);                        \
                __SEQ_LOCK(____s->lock = (_lock));                        \
        } while (0)

#define seqcount_raw_spinlock_init(s, lock)        seqcount_LOCKNAME_init(s, lock, raw_spinlock)
#define seqcount_spinlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, spinlock)
#define seqcount_rwlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock)                seqcount_LOCKNAME_init(s, lock, mutex)

/*
 * SEQCOUNT_LOCKNAME()        - Instantiate the helpers for seqcount_LOCKNAME_t
 * seqprop_LOCKNAME_*()        - Property accessors for seqcount_LOCKNAME_t
 *
 * @lockname:                "LOCKNAME" part of seqcount_LOCKNAME_t
 * @locktype:                LOCKNAME canonical C data type (not referenced in
 *                        the macro body below)
 * @preemptible:        preemptibility of above locktype
 * @lockbase:                prefix for associated lock/unlock
 *
 * Generated helpers:
 *
 *   __seqprop_LOCKNAME_ptr()           - pointer to the embedded seqcount_t
 *   __seqprop_LOCKNAME_const_ptr()     - same, const-qualified
 *   __seqprop_LOCKNAME_sequence()      - acquire-load of the sequence number.
 *                                        On PREEMPT_RT, when the lock type is
 *                                        preemptible and the value is odd,
 *                                        the associated lock is taken and
 *                                        dropped once and the sequence number
 *                                        is then read again.
 *   __seqprop_LOCKNAME_preemptible()   - @preemptible, except on PREEMPT_RT
 *                                        where it is always false
 *   __seqprop_LOCKNAME_assert()        - lockdep_assert_held() on the
 *                                        associated lock, inside __SEQ_LOCK()
 */
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)        \
static __always_inline seqcount_t *                                        \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)                        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline const seqcount_t *                                \
__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s)        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline unsigned                                                \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)        \
{                                                                        \
        unsigned seq = smp_load_acquire(&s->seqcount.sequence);                \
                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return seq;                                                \
                                                                        \
        if (preemptible && unlikely(seq & 1)) {                                \
                __SEQ_LOCK(lockbase##_lock(s->lock));                        \
                __SEQ_LOCK(lockbase##_unlock(s->lock));                        \
                                                                        \
                /*                                                        \
                 * Re-read the sequence counter since the (possibly        \
                 * preempted) writer made progress.                        \
                 */                                                        \
                seq = smp_load_acquire(&s->seqcount.sequence);                \
        }                                                                \
                                                                        \
        return seq;                                                        \
}                                                                        \
                                                                        \
static __always_inline bool                                                \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)        \
{                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return preemptible;                                        \
                                                                        \
        /* PREEMPT_RT relies on the above LOCK+UNLOCK */                \
        return false;                                                        \
}                                                                        \
                                                                        \
static __always_inline void                                                \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)                \
{                                                                        \
        __SEQ_LOCK(lockdep_assert_held(s->lock));                        \
}

/*
 * __seqprop() for seqcount_t
 */

/* "ptr" property for plain seqcount_t: the object is its own sequence counter. */
static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
{
        return s;
}

/* "const_ptr" property for plain seqcount_t: const-qualified twin of __seqprop_ptr(). */
static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
{
        return s;
}

static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
        /* "sequence" property for plain seqcount_t: acquire-load of the counter. */
        unsigned seq = smp_load_acquire(&s->sequence);

        return seq;
}

/*
 * "preemptible" property for plain seqcount_t: always false, so the write-side
 * macros never call preempt_disable()/preempt_enable() on its behalf.
 */
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
        return false;
}

/*
 * "assert" property for plain seqcount_t: there is no associated lock to
 * check, so assert that the caller runs with preemption disabled instead.
 */
static inline void __seqprop_assert(const seqcount_t *s)
{
        lockdep_assert_preemption_disabled();
}

/*
 * __SEQ_RT is true only when CONFIG_PREEMPT_RT is enabled; it is passed below
 * for the spinlock_t and rwlock_t variants.
 * NOTE(review): the third SEQCOUNT_LOCKNAME() argument looks like the
 * "preemptible" value returned by __seqprop_LOCKNAME_preemptible() on
 * non-PREEMPT_RT kernels -- confirm against the macro's parameter list.
 */
#define __SEQ_RT        IS_ENABLED(CONFIG_PREEMPT_RT)

/* Instantiate SEQCOUNT_LOCKNAME() once per supported lock type, then retire it. */
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
#undef SEQCOUNT_LOCKNAME

/*
 * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
 * @name:        Name of the seqcount_LOCKNAME_t instance
 * @lock:        Pointer to the associated LOCKNAME
 *
 * SEQCOUNT_LOCKNAME_ZERO() is the shared implementation behind the
 * SEQCNT_<LOCKNAME>_ZERO() wrappers: @seq_name (the @name above) initializes
 * the embedded seqcount_t through SEQCNT_ZERO(), and @assoc_lock (the @lock
 * above) is handed to __SEQ_LOCK() as the .lock initializer.
 */

#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) {                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
        __SEQ_LOCK(.lock        = (assoc_lock))                                \
}

/*
 * Per-lock-type static initializers; each is a thin alias of
 * SEQCOUNT_LOCKNAME_ZERO().
 * NOTE(review): no SEQCOUNT_LOCKNAME(ww_mutex, ...) instantiation is visible
 * alongside the other lock types, so SEQCNT_WW_MUTEX_ZERO() may be a
 * leftover -- confirm before relying on it.
 */
#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock)         SEQCOUNT_LOCKNAME_ZERO(name, lock)

/*
 * One _Generic association: maps seqcount_<lockname>_t to the matching
 * __seqprop_<lockname>_<prop>() accessor. The @s argument is not used.
 */
#define __seqprop_case(s, lockname, prop)                                \
        seqcount_##lockname##_t: __seqprop_##lockname##_##prop

/*
 * Pick, at compile time, the @prop accessor for the type @s points to:
 * plain seqcount_t, or the raw_spinlock, spinlock, rwlock or mutex variant.
 * Expands to the accessor itself, not to a call. There is no default
 * association, so any other pointee type fails to compile.
 */
#define __seqprop(s, prop) _Generic(*(s),                                \
        seqcount_t:                __seqprop_##prop,                        \
        __seqprop_case((s),        raw_spinlock,        prop),                        \
        __seqprop_case((s),        spinlock,        prop),                        \
        __seqprop_case((s),        rwlock,                prop),                        \
        __seqprop_case((s),        mutex,                prop))

/*
 * Property accessors usable on seqcount_t and every seqcount_LOCKNAME_t:
 * each selects the type-specific helper via __seqprop() and calls it on @s.
 */
#define seqprop_ptr(s)                        __seqprop(s, ptr)(s)
#define seqprop_const_ptr(s)                __seqprop(s, const_ptr)(s)
#define seqprop_sequence(s)                __seqprop(s, sequence)(s)
#define seqprop_preemptible(s)                __seqprop(s, preemptible)(s)
#define seqprop_assert(s)                __seqprop(s, assert)(s)

/**
 * __read_seqcount_begin() - begin a seqcount_t read section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define __read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned __seq;                                                        \
                                                                        \
        while (unlikely((__seq = seqprop_sequence(s)) & 1))                \
                cpu_relax();                                                \
                                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Plain alias of __read_seqcount_begin(). Unlike read_seqcount_begin(), it
 * does not call seqcount_lockdep_reader_access().
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount_begin(s) __read_seqcount_begin(s)

/**
 * read_seqcount_begin() - begin a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define read_seqcount_begin(s)                                                \
({                                                                        \
        seqcount_lockdep_reader_access(seqprop_const_ptr(s));                \
        raw_read_seqcount_begin(s);                                        \
})

/**
 * raw_read_seqcount() - read the raw seqcount_t counter value
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_read_seqcount opens a read critical section of the given
 * seqcount_t, without any lockdep checking, and without checking or
 * masking the sequence counter LSB. Calling code is responsible for
 * handling that.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount(s)                                                \
({                                                                        \
        unsigned __seq = seqprop_sequence(s);                                \
                                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_seqcount_try_begin() - begin a seqcount_t read critical section
 *                            w/o lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count to be passed to read_seqcount_retry()
 *
 * Similar to raw_seqcount_begin(), except it enables eliding the critical
 * section entirely if odd, instead of doing the speculation knowing it will
 * fail.
 *
 * Useful when counter stabilization is more or less equivalent to taking
 * the lock and there is a slowpath that does that.
 *
 * If true, start will be set to the (even) sequence count read.
 *
 * Return: true when a read critical section is started.
 */
#define raw_seqcount_try_begin(s, start)                                \
({                                                                        \
        start = raw_read_seqcount(s);                                        \
        !(start & 1);                                                        \
})

/**
 * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
 *                        lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_seqcount_begin opens a read critical section of the given
 * seqcount_t. Unlike read_seqcount_begin(), this function will not wait
 * for the count to stabilize. If a writer is active when it begins, it
 * will fail the read_seqcount_retry() at the end of the read critical
 * section instead of stabilizing at the beginning of it.
 *
 * Use this only in special kernel hot paths where the read section is
 * small and has a high probability of success through other external
 * means. It will save a single branching instruction.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_seqcount_begin(s)                                                \
({                                                                        \
        /*                                                                \
         * Clear the LSB of the sampled count. If the counter was odd,        \
         * this decrements it, so the value returned no longer matches        \
         * the counter and read_seqcount_retry() reports a retry.        \
         */                                                                \
        raw_read_seqcount(s) & ~1;                                        \
})

/**
 * __read_seqcount_retry() - end a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided before actually loading any of the variables that are to be
 * protected in this critical section.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: true if a read section retry is required, else false
 */
#define __read_seqcount_retry(s, start)                                        \
        do___read_seqcount_retry(seqprop_const_ptr(s), start)

static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        unsigned now;

        kcsan_atomic_next(0);

        /* One marked load of the counter; no read barrier is issued here. */
        now = READ_ONCE(s->sequence);

        /* A retry is needed whenever the counter moved since @start. */
        return unlikely(now != start);
}

/**
 * read_seqcount_retry() - end a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * read_seqcount_retry closes the read critical section of given
 * seqcount_t.  If the critical section was invalid, it must be ignored
 * (and typically retried).
 *
 * Return: true if a read section retry is required, else false
 */
#define read_seqcount_retry(s, start)                                        \
        do_read_seqcount_retry(seqprop_const_ptr(s), start)

static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        int retry;

        /* Read barrier first, then the barrier-less counter comparison. */
        smp_rmb();
        retry = do___read_seqcount_retry(s, start);

        return retry;
}

/**
 * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: check write_seqcount_begin()
 */
#define raw_write_seqcount_begin(s)                                        \
do {                                                                        \
        /* Only counter types reporting "preemptible" need this. */        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_raw_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/* Open a write section on @s: bump the counter, then issue a write barrier. */
static inline void do_raw_write_seqcount_begin(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        /* Order the counter update before the stores that follow. */
        smp_wmb();
}

/**
 * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: check write_seqcount_end()
 */
#define raw_write_seqcount_end(s)                                        \
do {                                                                        \
        do_raw_write_seqcount_end(seqprop_ptr(s));                        \
                                                                        \
        /* Mirror of the conditional preempt_disable() in the begin macro. */ \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/* Close a write section on @s: write barrier, then bump the counter again. */
static inline void do_raw_write_seqcount_end(seqcount_t *s)
{
        /* Order the preceding stores before the counter update. */
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_begin_nested() - start a seqcount_t write section with
 *                                 custom lockdep nesting level
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @subclass: lockdep nesting level
 *
 * See Documentation/locking/lockdep-design.rst
 * Context: check write_seqcount_begin()
 */
#define write_seqcount_begin_nested(s, subclass)                        \
do {                                                                        \
        /* Type-specific precondition check (lock held / preemption off). */ \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin_nested(seqprop_ptr(s), subclass);        \
} while (0)

/* Lockdep acquire annotation with @subclass, followed by the raw begin. */
static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
{
        seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
        do_raw_write_seqcount_begin(s);
}

/**
 * write_seqcount_begin() - start a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: sequence counter write side sections must be serialized and
 * non-preemptible. Preemption will be automatically disabled if and
 * only if the seqcount write serialization lock is associated, and
 * preemptible.  If readers can be invoked from hardirq or softirq
 * context, interrupts or bottom halves must be respectively disabled.
 */
#define write_seqcount_begin(s)                                                \
do {                                                                        \
        /* Type-specific precondition check (lock held / preemption off). */ \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/* Same as do_write_seqcount_begin_nested() with lockdep subclass 0. */
static inline void do_write_seqcount_begin(seqcount_t *s)
{
        do_write_seqcount_begin_nested(s, 0);
}

/**
 * write_seqcount_end() - end a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: Preemption will be automatically re-enabled if and only if
 * the seqcount write serialization lock is associated, and preemptible.
 */
#define write_seqcount_end(s)                                                \
do {                                                                        \
        do_write_seqcount_end(seqprop_ptr(s));                                \
                                                                        \
        /* Mirror of the conditional preempt_disable() in the begin macro. */ \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/* Lockdep release annotation, followed by the raw end of the write section. */
static inline void do_write_seqcount_end(seqcount_t *s)
{
        seqcount_release(&s->dep_map, _RET_IP_);
        do_raw_write_seqcount_end(s);
}

/**
 * raw_write_seqcount_barrier() - do a seqcount_t write barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * This can be used to provide an ordering guarantee instead of the usual
 * consistency guarantee. It is one wmb cheaper, because it can collapse
 * the two back-to-back wmb()s.
 *
 * Note that writes surrounding the barrier should be declared atomic (e.g.
 * via WRITE_ONCE): a) to ensure the writes become visible to other threads
 * atomically, avoiding compiler optimizations; b) to document which writes are
 * meant to propagate to the reader critical section. This is necessary because
 * neither writes before nor after the barrier are enclosed in a seq-writer
 * critical section that would ensure readers are aware of ongoing writes::
 *
 *        seqcount_t seq;
 *        bool X = true, Y = false;
 *
 *        void read(void)
 *        {
 *                bool x, y;
 *
 *                do {
 *                        unsigned s = read_seqcount_begin(&seq);
 *
 *                        x = X; y = Y;
 *
 *                } while (read_seqcount_retry(&seq, s));
 *
 *                BUG_ON(!x && !y);
 *      }
 *
 *      void write(void)
 *      {
 *                WRITE_ONCE(Y, true);
 *
 *                raw_write_seqcount_barrier(&seq);
 *
 *                WRITE_ONCE(X, false);
 *      }
 */
#define raw_write_seqcount_barrier(s)                                        \
        do_raw_write_seqcount_barrier(seqprop_ptr(s))

/*
 * Two counter increments around a single write barrier: a begin/end pair
 * with nothing in between, using one smp_wmb() instead of two.
 */
static inline void do_raw_write_seqcount_barrier(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_invalidate() - invalidate in-progress seqcount_t read
 *                               side operations
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * After write_seqcount_invalidate, no seqcount_t read side operations
 * will complete successfully and see data older than this.
 */
#define write_seqcount_invalidate(s)                                        \
        do_write_seqcount_invalidate(seqprop_ptr(s))

/*
 * Write barrier, then advance the counter by two: the parity is unchanged,
 * but any count sampled earlier no longer matches.
 */
static inline void do_write_seqcount_invalidate(seqcount_t *s)
{
        smp_wmb();
        kcsan_nestable_atomic_begin();
        s->sequence+=2;
        kcsan_nestable_atomic_end();
}

/*
 * Latch sequence counters (seqcount_latch_t)
 *
 * A sequence counter variant where the counter even/odd value is used to
 * switch between two copies of protected data. This allows the read path,
 * typically NMIs, to safely interrupt the write side critical section.
 *
 * As the write sections are fully preemptible, no special handling for
 * PREEMPT_RT is needed.
 */
typedef struct {
        /* Underlying counter; the latch helpers access it directly. */
        seqcount_t seqcount;
} seqcount_latch_t;

/**
 * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t
 * @seq_name: Name of the seqcount_latch_t instance
 */
#define SEQCNT_LATCH_ZERO(seq_name) {                                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
}

/**
 * seqcount_latch_init() - runtime initializer for seqcount_latch_t
 * @s: Pointer to the seqcount_latch_t instance
 *
 * Initializes the embedded seqcount_t with seqcount_init().
 */
#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)

/**
 * raw_read_seqcount_latch() - pick even/odd latch data copy
 * @s: Pointer to seqcount_latch_t
 *
 * See raw_write_seqcount_latch() for details and a full reader/writer
 * usage example.
 *
 * Return: sequence counter raw value. Use the lowest bit as an index for
 * picking which data copy to read. The full counter must then be checked
 * with raw_read_seqcount_latch_retry().
 */
static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
        unsigned seq;

        /*
         * No smp_rmb() here: the caller's data access depends on the value
         * loaded, and this load pairs with the first smp_wmb() in
         * raw_write_seqcount_latch().
         */
        seq = READ_ONCE(s->seqcount.sequence);

        return seq;
}

/**
 * read_seqcount_latch() - pick even/odd latch data copy
 * @s: Pointer to seqcount_latch_t
 *
 * See write_seqcount_latch() for details and a full reader/writer usage
 * example.
 *
 * Return: sequence counter raw value. Use the lowest bit as an index for
 * picking which data copy to read. The full counter must then be checked
 * with read_seqcount_latch_retry().
 */
static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s)
{
        unsigned seq;

        /* KCSAN annotation, then the same raw counter sample. */
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
        seq = raw_read_seqcount_latch(s);

        return seq;
}

/**
 * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
 * @s:                Pointer to seqcount_latch_t
 * @start:        count, from raw_read_seqcount_latch()
 *
 * Return: true if a read section retry is required, else false
 */
static __always_inline int
raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
        unsigned now;

        /* Read barrier before the counter is sampled again. */
        smp_rmb();
        now = READ_ONCE(s->seqcount.sequence);

        /* Any change since @start means the read section must be redone. */
        return unlikely(now != start);
}

/**
 * read_seqcount_latch_retry() - end a seqcount_latch_t read section
 * @s:                Pointer to seqcount_latch_t
 * @start:        count, from read_seqcount_latch()
 *
 * Return: true if a read section retry is required, else false
 */
static __always_inline int
read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
        int retry;

        /* KCSAN annotation, then the raw retry check. */
        kcsan_atomic_next(0);
        retry = raw_read_seqcount_latch_retry(s, start);

        return retry;
}

/**
 * raw_write_seqcount_latch() - redirect latch readers to even/odd copy
 * @s: Pointer to seqcount_latch_t
 */
static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s)
{
        /* A single increment, fenced on both sides, flips the counter's LSB. */
        smp_wmb();        /* prior stores before incrementing "sequence" */
        s->seqcount.sequence++;
        smp_wmb();      /* increment "sequence" before following stores */
}

/**
 * write_seqcount_latch_begin() - redirect latch readers to odd copy
 * @s: Pointer to seqcount_latch_t
 *
 * The latch technique is a multiversion concurrency control method that allows
 * queries during non-atomic modifications. If you can guarantee queries never
 * interrupt the modification -- e.g. the concurrency is strictly between CPUs
 * -- you most likely do not need this.
 *
 * Where the traditional RCU/lockless data structures rely on atomic
 * modifications to ensure queries observe either the old or the new state the
 * latch allows the same for non-atomic updates. The trade-off is doubling the
 * cost of storage; we have to maintain two copies of the entire data
 * structure.
 *
 * Very simply put: we first modify one copy and then the other. This ensures
 * there is always one copy in a stable state, ready to give us an answer.
 *
 * The basic form is a data structure like::
 *
 *        struct latch_struct {
 *                seqcount_latch_t        seq;
 *                struct data_struct        data[2];
 *        };
 *
 * Where a modification, which is assumed to be externally serialized, does the
 * following::
 *
 *        void latch_modify(struct latch_struct *latch, ...)
 *        {
 *                write_seqcount_latch_begin(&latch->seq);
 *                modify(latch->data[0], ...);
 *                write_seqcount_latch(&latch->seq);
 *                modify(latch->data[1], ...);
 *                write_seqcount_latch_end(&latch->seq);
 *        }
 *
 * The query will have a form like::
 *
 *        struct entry *latch_query(struct latch_struct *latch, ...)
 *        {
 *                struct entry *entry;
 *                unsigned seq, idx;
 *
 *                do {
 *                        seq = read_seqcount_latch(&latch->seq);
 *
 *                        idx = seq & 0x01;
 *                        entry = data_query(latch->data[idx], ...);
 *
 *                // This includes needed smp_rmb()
 *                } while (read_seqcount_latch_retry(&latch->seq, seq));
 *
 *                return entry;
 *        }
 *
 * So during the modification, queries are first redirected to data[1]. Then we
 * modify data[0]. When that is complete, we redirect queries back to data[0]
 * and we can modify data[1].
 *
 * NOTE:
 *
 *        The non-requirement for atomic modifications does _NOT_ include
 *        the publishing of new entries in the case where data is a dynamic
 *        data structure.
 *
 *        An iteration might start in data[0] and get suspended long enough
 *        to miss an entire modification sequence, once it resumes it might
 *        observe the new entry.
 *
 * NOTE2:
 *
 *        When data is a dynamic data structure; one should use regular RCU
 *        patterns to manage the lifetimes of the objects within.
 */
static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s)
{
        /* Open the KCSAN region; write_seqcount_latch_end() closes it. */
        kcsan_nestable_atomic_begin();
        raw_write_seqcount_latch(s);
}

/**
 * write_seqcount_latch() - redirect latch readers to even copy
 * @s: Pointer to seqcount_latch_t
 */
static __always_inline void write_seqcount_latch(seqcount_latch_t *s)
{
        /* Counter flip only; no KCSAN annotation is added at this step. */
        raw_write_seqcount_latch(s);
}

/**
 * write_seqcount_latch_end() - end a seqcount_latch_t write section
 * @s:                Pointer to seqcount_latch_t
 *
 * Marks the end of a seqcount_latch_t writer section, after all copies of the
 * latch-protected data have been updated.
 */
static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s)
{
        /* The counter is not touched here; only the KCSAN region is closed. */
        kcsan_nestable_atomic_end();
}

/*
 * Static initializer for a seqlock_t named @lockname: the sequence counter is
 * associated with the seqlock's own .lock member, which starts out unlocked.
 */
#define __SEQLOCK_UNLOCKED(lockname)                                        \
        {                                                                \
                .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
                .lock =        __SPIN_LOCK_UNLOCKED(lockname)                        \
        }

/**
 * seqlock_init() - dynamic initializer for seqlock_t
 * @sl: Pointer to the seqlock_t instance
 */
#define seqlock_init(sl)                                                \
        do {                                                                \
                spin_lock_init(&(sl)->lock);                                \
                seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock);        \
        } while (0)

/**
 * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t
 * @sl: Name of the seqlock_t instance
 *
 * The instance is initialized with __SEQLOCK_UNLOCKED(). No storage class is
 * added by the macro itself.
 */
#define DEFINE_SEQLOCK(sl) \
                seqlock_t sl = __SEQLOCK_UNLOCKED(sl)

/**
 * read_seqbegin() - start a seqlock_t read side critical section
 * @sl: Pointer to seqlock_t
 *
 * Return: count, to be passed to read_seqretry()
 */
static inline unsigned read_seqbegin(const seqlock_t *sl)
{
        unsigned seq;

        /* Sample the embedded sequence counter through the seqcount API. */
        seq = read_seqcount_begin(&sl->seqcount);

        return seq;
}

/**
 * read_seqretry() - end a seqlock_t read side section
 * @sl: Pointer to seqlock_t
 * @start: count, from read_seqbegin()
 *
 * read_seqretry closes the read side critical section of given seqlock_t.
 * If the critical section was invalid, it must be ignored (and typically
 * retried).
 *
 * Return: true if a read section retry is required, else false
 */
static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
{
        unsigned retry;

        /* Delegate to the seqcount retry check on the embedded counter. */
        retry = read_seqcount_retry(&sl->seqcount, start);

        return retry;
}

/*
 * For all seqlock_t write side functions, use the internal
 * do_write_seqcount_begin() instead of generic write_seqcount_begin().
 * This way, no redundant lockdep_assert_held() checks are added.
 */

/**
 * write_seqlock() - start a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_seqlock opens a write side critical section for the given
 * seqlock_t.  It also implicitly acquires the spinlock_t embedded inside
 * that sequential lock. All seqlock_t write side sections are thus
 * automatically serialized and non-preemptible.
 *
 * Context: if the seqlock_t read section, or other write side critical
 * sections, can be invoked from hardirq or softirq contexts, use the
 * _irqsave or _bh variants of this function instead.
 */
static inline void write_seqlock(seqlock_t *sl)
{
        /* Take the embedded lock, then open the counter's write section. */
        spin_lock(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock() - end a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock closes the (serialized and non-preemptible) write side
 * critical section of given seqlock_t.
 */
static inline void write_sequnlock(seqlock_t *sl)
{
        /* Reverse order of write_seqlock(): close the write section, then unlock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock(&sl->lock);
}

/**
 * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of write_seqlock(). Use only if the read side section, or
 * other write side sections, can be invoked from softirq contexts.
 */
static inline void write_seqlock_bh(seqlock_t *sl)
{
        /* Same sequence as write_seqlock(), using the _bh lock primitive. */
        spin_lock_bh(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_bh closes the serialized, non-preemptible, and
 * softirqs-disabled, seqlock_t write side critical section opened with
 * write_seqlock_bh().
 */
static inline void write_sequnlock_bh(seqlock_t *sl)
{
        /* Close the write section before releasing the lock with the _bh unlock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_bh(&sl->lock);
}

/**
 * write_seqlock_irq() - start a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of write_seqlock(). Use only if the read side section, or
 * other write sections, can be invoked from hardirq contexts.
 */
static inline void write_seqlock_irq(seqlock_t *sl)
{
        /* Same sequence as write_seqlock(), using the _irq lock primitive. */
        spin_lock_irq(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_irq() - end a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_irq closes the serialized and non-interruptible
 * seqlock_t write side section opened with write_seqlock_irq().
 */
static inline void write_sequnlock_irq(seqlock_t *sl)
{
        /* Close the write section before releasing the lock with the _irq unlock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irq(&sl->lock);
}

static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
{
        unsigned long irqflags;

        /* Lock with the irqsave primitive, then open the write section. */
        spin_lock_irqsave(&sl->lock, irqflags);
        do_write_seqcount_begin(&sl->seqcount.seqcount);

        /* The saved state goes back to the caller for the later restore. */
        return irqflags;
}

/**
 * write_seqlock_irqsave() - start a non-interruptible seqlock_t write
 *                           section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to write_sequnlock_irqrestore().
 *
 * _irqsave variant of write_seqlock(). Use it only if the read side
 * section, or other write sections, can be invoked from hardirq context.
 *
 * This is a macro so that @flags can be assigned by name; the locking itself
 * is done by __write_seqlock_irqsave().
 */
#define write_seqlock_irqsave(lock, flags)                                \
        do { flags = __write_seqlock_irqsave(lock); } while (0)

/**
 * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write
 *                                section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller's saved interrupt state, from write_seqlock_irqsave()
 *
 * write_sequnlock_irqrestore closes the serialized and non-interruptible
 * seqlock_t write section previously opened with write_seqlock_irqsave().
 */
static inline void
write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Close the write section, then unlock and restore the state in @flags. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqlock_excl() - begin a seqlock_t locking reader section
 * @sl:        Pointer to seqlock_t
 *
 * read_seqlock_excl opens a seqlock_t locking reader critical section.  A
 * locking reader exclusively locks out *both* other writers *and* other
 * locking readers, but it does not update the embedded sequence number.
 *
 * Locking readers act like a normal spin_lock()/spin_unlock().
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * The opened read section must be closed with read_sequnlock_excl().
 */
static inline void read_seqlock_excl(seqlock_t *sl)
{
        /* Only the embedded lock is taken; the sequence counter is not touched. */
        spin_lock(&sl->lock);
}

/**
 * read_sequnlock_excl() - end a seqlock_t locking reader critical section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl(seqlock_t *sl)
{
        /* Release the embedded lock; the sequence counter is not touched. */
        spin_unlock(&sl->lock);
}

/**
 * read_seqlock_excl_bh() - start a seqlock_t locking reader section with
 *                            softirqs disabled
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of read_seqlock_excl(). Use this variant only if the
 * seqlock_t write side section, *or other read sections*, can be invoked
 * from softirq contexts.
 */
static inline void read_seqlock_excl_bh(seqlock_t *sl)
{
        /* Lock-only, like read_seqlock_excl(), using the _bh lock primitive. */
        spin_lock_bh(&sl->lock);
}

/**
 * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking
 *                              reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_bh(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl_bh(). */
        spin_unlock_bh(&sl->lock);
}

/**
 * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking
 *                             reader section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 */
static inline void read_seqlock_excl_irq(seqlock_t *sl)
{
        /* Take the embedded spinlock through the _irq spinlock variant. */
        spin_lock_irq(&sl->lock);
}

/**
 * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t
 *                             locking reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_irq(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl_irq(). */
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind the read_seqlock_excl_irqsave() macro: takes the seqlock_t
 * spinlock with spin_lock_irqsave() and hands the saved interrupt state
 * back to the caller as the return value.
 */
static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        return flags;
}

/**
 * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t
 *                                 locking reader section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to read_sequnlock_excl_irqrestore().
 *
 * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 *
 * This is a macro: @flags is named directly (not passed by address) and is
 * assigned the value returned by __read_seqlock_excl_irqsave().
 */
#define read_seqlock_excl_irqsave(lock, flags)                                \
        do { flags = __read_seqlock_excl_irqsave(lock); } while (0)

/**
 * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t
 *                                      locking reader section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave()
 */
static inline void
read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Drop the spinlock and restore the interrupt state saved in @flags. */
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader
 * @lock: Pointer to seqlock_t
 * @seq : Marker and return parameter. If the passed value is even, the
 * reader will become a *lockless* seqlock_t reader as in read_seqbegin().
 * If the passed value is odd, the reader will become a *locking* reader
 * as in read_seqlock_excl().  In the first call to this function, the
 * caller *must* initialize and pass an even value to @seq; this way, a
 * lockless read can be optimistically tried first.
 *
 * read_seqbegin_or_lock is an API designed to optimistically try a normal
 * lockless seqlock_t read section first.  If an odd counter is found, the
 * lockless read trial has failed, and the next read iteration transforms
 * itself into a full seqlock_t locking reader.
 *
 * This is typically used to avoid starvation of lockless seqlock_t
 * readers (too many retry loops) in the case of a sharp spike in write
 * side activity.
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * Check Documentation/locking/seqlock.rst for template example code.
 *
 * Return: the encountered sequence counter value, through the @seq
 * parameter, which is overloaded as a return parameter. This returned
 * value must be checked with need_seqretry(). If the read section needs
 * to be retried, this returned value must also be passed as the @seq
 * parameter of the next read_seqbegin_or_lock() iteration.
 */
static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
{
        /* Odd marker: become a locking reader; *seq is left untouched. */
        if (*seq & 1) {
                read_seqlock_excl(lock);
                return;
        }

        /* Even marker: lockless attempt, hand the sampled count back. */
        *seq = read_seqbegin(lock);
}

/**
 * need_seqretry() - validate seqlock_t "locking or lockless" read section
 * @lock: Pointer to seqlock_t
 * @seq: sequence count, from read_seqbegin_or_lock()
 *
 * Return: true if a read section retry is required, false otherwise
 */
static inline int need_seqretry(seqlock_t *lock, int seq)
{
        /* An odd @seq marks a locking reader, which never has to retry. */
        if (seq & 1)
                return 0;

        /* Lockless reader: retry iff the sequence count has moved on. */
        return read_seqretry(lock, seq) ? 1 : 0;
}

/**
 * done_seqretry() - end seqlock_t "locking or lockless" reader section
 * @lock: Pointer to seqlock_t
 * @seq: count, from read_seqbegin_or_lock()
 *
 * done_seqretry finishes the seqlock_t read side critical section started
 * with read_seqbegin_or_lock() and validated by need_seqretry().
 */
static inline void done_seqretry(seqlock_t *lock, int seq)
{
        /* Even @seq: the reader was lockless, nothing to release. */
        if (!(seq & 1))
                return;

        read_sequnlock_excl(lock);
}

/**
 * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or
 *                                   a non-interruptible locking reader
 * @lock: Pointer to seqlock_t
 * @seq:  Marker and return parameter. Check read_seqbegin_or_lock().
 *
 * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if
 * the seqlock_t write section, *or other read sections*, can be invoked
 * from hardirq context.
 *
 * Note: Interrupts will be disabled only for "locking reader" mode.
 *
 * Return:
 *
 *   1. The saved local interrupts state in case of a locking reader, to
 *      be passed to done_seqretry_irqrestore().
 *
 *   2. The encountered sequence counter value, returned through @seq
 *      overloaded as a return parameter. Check read_seqbegin_or_lock().
 */
static inline unsigned long
read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
{
        /* Stays 0 for the lockless case, where no IRQ state is saved. */
        unsigned long irq_state = 0;

        if (*seq & 1) {
                /* Odd marker: locking reader with interrupts disabled. */
                read_seqlock_excl_irqsave(lock, irq_state);
        } else {
                /* Even marker: lockless attempt, report the sampled count. */
                *seq = read_seqbegin(lock);
        }

        return irq_state;
}

/**
 * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a
 *                                non-interruptible locking reader section
 * @lock:  Pointer to seqlock_t
 * @seq:   Count, from read_seqbegin_or_lock_irqsave()
 * @flags: Caller's saved local interrupt state in case of a locking
 *           reader, also from read_seqbegin_or_lock_irqsave()
 *
 * This is the _irqrestore variant of done_seqretry(). The read section
 * must've been opened with read_seqbegin_or_lock_irqsave(), and validated
 * by need_seqretry().
 */
static inline void
done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
{
        /* Even @seq: lockless reader, neither lock nor IRQ state to undo. */
        if (!(seq & 1))
                return;

        read_sequnlock_excl_irqrestore(lock, flags);
}
#endif /* __LINUX_SEQLOCK_H */



















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_RCUREF_H
#define _LINUX_RCUREF_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/limits.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>

/*
 * Boundary values of the raw counter. The counter is stored biased by one
 * (rcuref_init() writes cnt - 1), so a raw value of RCUREF_ONEREF (0) means
 * one reference is held and RCUREF_NOREF (0xFFFFFFFF, i.e. -1) means none.
 * Raw values at or above RCUREF_RELEASED are reported as zero references by
 * rcuref_read(). See lib/rcuref.c for how the individual zones are used.
 */
#define RCUREF_ONEREF                0x00000000U
#define RCUREF_MAXREF                0x7FFFFFFFU
#define RCUREF_SATURATED        0xA0000000U
#define RCUREF_RELEASED                0xC0000000U
#define RCUREF_DEAD                0xE0000000U
#define RCUREF_NOREF                0xFFFFFFFFU

/**
 * rcuref_init - Initialize a rcuref reference count with the given reference count
 * @ref:        Pointer to the reference count
 * @cnt:        The initial reference count typically '1'
 */
static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
{
        /* Stored biased by one: a count of 1 is kept as RCUREF_ONEREF (0). */
        atomic_set(&ref->refcnt, cnt - 1);
}

/**
 * rcuref_read - Read the number of held reference counts of a rcuref
 * @ref:        Pointer to the reference count
 *
 * Return: The number of held references (0 ... N). The value 0 does not
 * indicate that it is safe to schedule the object, protected by this reference
 * counter, for deconstruction.
 * If you want to know if the reference counter has been marked DEAD (as
 * signaled by rcuref_put()) please use rcuref_is_dead().
 */
static inline unsigned int rcuref_read(rcuref_t *ref)
{
        unsigned int raw = atomic_read(&ref->refcnt);

        /* Anything at or above RCUREF_RELEASED is reported as zero. */
        if (raw >= RCUREF_RELEASED)
                return 0;

        /* Undo the bias of one applied to the stored counter. */
        return raw + 1;
}

/**
 * rcuref_is_dead -        Check if the rcuref has been already marked dead
 * @ref:                Pointer to the reference count
 *
 * Return: True if the object has been marked DEAD. This signals that a previous
 * invocation of rcuref_put() returned true on this reference counter meaning
 * the protected object can safely be scheduled for deconstruction.
 * Otherwise, returns false.
 */
static inline bool rcuref_is_dead(rcuref_t *ref)
{
        unsigned int raw = atomic_read(&ref->refcnt);

        /* Below the RELEASED boundary the counter is not dead. */
        if (raw < RCUREF_RELEASED)
                return false;

        /* RCUREF_NOREF itself is excluded from the dead range. */
        return raw < RCUREF_NOREF;
}

extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);

/**
 * rcuref_get - Acquire one reference on a rcuref reference count
 * @ref:        Pointer to the reference count
 *
 * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See documentation in lib/rcuref.c
 *
 * Return:
 *        False if the attempt to acquire a reference failed. This happens
 *        when the last reference has been put already
 *
 *        True if a reference was successfully acquired
 */
static inline __must_check bool rcuref_get(rcuref_t *ref)
{
        /*
         * The increment is done unconditionally; the saturation and dead
         * zones leave enough headroom for that.
         */
        bool went_negative = atomic_add_negative_relaxed(1, &ref->refcnt);

        /* A negative result lands in the saturation or dead zones. */
        if (unlikely(went_negative))
                return rcuref_get_slowpath(ref);

        return true;
}

extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt);

/*
 * Internal helper. Do not invoke directly.
 */
static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
{
        int remaining;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
                         "suspicious rcuref_put_rcusafe() usage");

        /*
         * The decrement is done unconditionally; the saturation and dead
         * zones leave enough headroom for that.
         */
        remaining = atomic_sub_return_release(1, &ref->refcnt);

        /*
         * A negative result means either the last reference was dropped
         * or the counter sits in the saturation or dead zones; the
         * slowpath sorts that out.
         */
        if (unlikely(remaining < 0))
                return rcuref_put_slowpath(ref, remaining);

        return false;
}

/**
 * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe
 * @ref:        Pointer to the reference count
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * Can be invoked from contexts, which guarantee that no grace period can
 * happen which would free the object concurrently if the decrement drops
 * the last reference and the slowpath races against a concurrent get() and
 * put() pair. rcu_read_lock()'ed and atomic contexts qualify.
 *
 * Return:
 *        True if this was the last reference with no future references
 *        possible. This signals the caller that it can safely release the
 *        object which is protected by the reference counter.
 *
 *        False if there are still active references or the put() raced
 *        with a concurrent get()/put() pair. Caller is not allowed to
 *        release the protected object.
 */
static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref)
{
        /* No extra protection is added here; the caller's context provides it. */
        return __rcuref_put(ref);
}

/**
 * rcuref_put -- Release one reference for a rcuref reference count
 * @ref:        Pointer to the reference count
 *
 * Can be invoked from any context.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * Return:
 *
 *        True if this was the last reference with no future references
 *        possible. This signals the caller that it can safely schedule the
 *        object, which is protected by the reference counter, for
 *        deconstruction.
 *
 *        False if there are still active references or the put() raced
 *        with a concurrent get()/put() pair. Caller is not allowed to
 *        deconstruct the protected object.
 */
static inline __must_check bool rcuref_put(rcuref_t *ref)
{
        bool released;

        /*
         * Run the put with preemption disabled: __rcuref_put() flags (via
         * RCU_LOCKDEP_WARN) a call that is preemptible and not under
         * rcu_read_lock().
         */
        preempt_disable();
        released = __rcuref_put(ref);
        preempt_enable();
        return released;
}

#endif










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions of the Internet Protocol.
 *
 * Version:        @(#)in.h        1.0.1        04/21/93
 *
 * Authors:        Original taken from the GNU Project <netinet/in.h> file.
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_IN_H
#define _LINUX_IN_H


#include <linux/errno.h>
#include <uapi/linux/in.h>

/*
 * proto_ports_offset - byte offset of the ports (or SPI) field for @proto
 *
 * Returns 4 for AH, 0 for TCP, UDP, DCCP, ESP, SCTP and UDP-Lite, and
 * -EINVAL for every other protocol number.
 */
static inline int proto_ports_offset(int proto)
{
        /* AH: the SPI sits four bytes into the header. */
        if (proto == IPPROTO_AH)
                return 4;

        /* Ports (or the ESP SPI) start right at the header. */
        if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
            proto == IPPROTO_DCCP || proto == IPPROTO_ESP ||
            proto == IPPROTO_SCTP || proto == IPPROTO_UDPLITE)
                return 0;

        return -EINVAL;
}

/* True if @addr (network byte order) lies in 127.0.0.0/8. */
static inline bool ipv4_is_loopback(__be32 addr)
{
        const __be32 mask = htonl(0xff000000);
        const __be32 net = htonl(0x7f000000);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) lies in 224.0.0.0/4. */
static inline bool ipv4_is_multicast(__be32 addr)
{
        const __be32 mask = htonl(0xf0000000);
        const __be32 net = htonl(0xe0000000);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) lies in 224.0.0.0/24. */
static inline bool ipv4_is_local_multicast(__be32 addr)
{
        const __be32 mask = htonl(0xffffff00);
        const __be32 net = htonl(0xe0000000);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) is the limited broadcast address. */
static inline bool ipv4_is_lbcast(__be32 addr)
{
        const __be32 bcast = htonl(INADDR_BROADCAST);

        return addr == bcast;
}

/* True if @addr (network byte order) equals INADDR_ALLSNOOPERS_GROUP. */
static inline bool ipv4_is_all_snoopers(__be32 addr)
{
        const __be32 group = htonl(INADDR_ALLSNOOPERS_GROUP);

        return addr == group;
}

/* True only for the all-zero address 0.0.0.0. */
static inline bool ipv4_is_zeronet(__be32 addr)
{
        return !addr;
}

/* Special-Use IPv4 Addresses (RFC3330) */

/* True if @addr (network byte order) lies in 10.0.0.0/8. */
static inline bool ipv4_is_private_10(__be32 addr)
{
        const __be32 mask = htonl(0xff000000);
        const __be32 net = htonl(0x0a000000);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) lies in 172.16.0.0/12. */
static inline bool ipv4_is_private_172(__be32 addr)
{
        const __be32 mask = htonl(0xfff00000);
        const __be32 net = htonl(0xac100000);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) lies in 192.168.0.0/16. */
static inline bool ipv4_is_private_192(__be32 addr)
{
        const __be32 mask = htonl(0xffff0000);
        const __be32 net = htonl(0xc0a80000);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) lies in 169.254.0.0/16. */
static inline bool ipv4_is_linklocal_169(__be32 addr)
{
        const __be32 mask = htonl(0xffff0000);
        const __be32 net = htonl(0xa9fe0000);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) lies in 192.88.99.0/24. */
static inline bool ipv4_is_anycast_6to4(__be32 addr)
{
        const __be32 mask = htonl(0xffffff00);
        const __be32 net = htonl(0xc0586300);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) lies in 192.0.2.0/24. */
static inline bool ipv4_is_test_192(__be32 addr)
{
        const __be32 mask = htonl(0xffffff00);
        const __be32 net = htonl(0xc0000200);

        return (addr & mask) == net;
}

/* True if @addr (network byte order) lies in 198.18.0.0/15. */
static inline bool ipv4_is_test_198(__be32 addr)
{
        const __be32 mask = htonl(0xfffe0000);
        const __be32 net = htonl(0xc6120000);

        return (addr & mask) == net;
}
#endif        /* _LINUX_IN_H */
















































  315 
  316 
  314 




  302 

  304 

  304 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 */

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
#include <linux/export.h>

#define HASH_DEFAULT_SIZE        64UL
#define HASH_MIN_SIZE                4U

/*
 * One slot of a nested-table page. The same storage is interpreted either
 * as a link to the next-level page or as a bucket head, depending on the
 * level the page sits at.
 */
union nested_table {
        /* Interior level: pointer to the next-level page of slots. */
        union nested_table __rcu *table;
        /* Leaf level: bucket head, set up with INIT_RHT_NULLS_HEAD(). */
        struct rhash_lock_head __rcu *bucket;
};

/*
 * Hash of entry @he for bucket table @tbl, computed by rht_head_hashfn()
 * with the hash table's own parameters (ht->p).
 */
static u32 head_hashfn(struct rhashtable *ht,
                       const struct bucket_table *tbl,
                       const struct rhash_head *he)
{
        return rht_head_hashfn(ht, tbl, he, ht->p);
}

#ifdef CONFIG_PROVE_LOCKING
#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))

/*
 * Report whether ht->mutex is held according to lockdep. When debug_locks
 * is clear the answer is always 1.
 */
int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
        if (!debug_locks)
                return 1;

        return lockdep_is_held(&ht->mutex);
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);

/*
 * Report whether the bit lock (bit 0) of bucket @hash in @tbl is taken.
 * Always answers 1 when debug_locks is clear or when @tbl is nested.
 */
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
        unsigned long *lock_word;

        if (!debug_locks || unlikely(tbl->nest))
                return 1;

        lock_word = (unsigned long *)&tbl->buckets[hash];
        return bit_spin_is_locked(0, lock_word);
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#else
#define ASSERT_RHT_MUTEX(HT)
#endif

/*
 * Return the top-level nested page of @tbl, which is stored in
 * tbl->buckets[0].
 */
static inline union nested_table *nested_table_top(
        const struct bucket_table *tbl)
{
        /* The top-level bucket entry does not need RCU protection
         * because it's set at the same time as tbl->nest.
         */
        return (void *)rcu_dereference_protected(tbl->buckets[0], 1);
}

/*
 * Free the page that slot @ntbl points to, first descending into its
 * children while @size still exceeds the number of slots in one page.
 * A slot whose page pointer is NULL is skipped.
 */
static void nested_table_free(union nested_table *ntbl, unsigned int size)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        const unsigned int slots = 1 << shift;
        union nested_table *page;
        unsigned int idx;

        page = rcu_dereference_protected(ntbl->table, 1);
        if (!page)
                return;

        if (size > slots) {
                /* Each child covers a 1/slots share of the range. */
                const unsigned int child_size = size >> shift;

                for (idx = 0; idx < slots; idx++)
                        nested_table_free(&page[idx], child_size);
        }

        kfree(page);
}

/*
 * Release every nested page hanging off @tbl: walk the 1 << tbl->nest
 * slots of the top-level page, free each subtree, then free the top-level
 * page itself. @tbl is not freed here.
 */
static void nested_bucket_table_free(const struct bucket_table *tbl)
{
        const unsigned int top_slots = 1 << tbl->nest;
        unsigned int sub_size = tbl->size >> tbl->nest;
        union nested_table *top = nested_table_top(tbl);
        unsigned int idx = 0;

        while (idx < top_slots) {
                nested_table_free(&top[idx], sub_size);
                idx++;
        }

        kfree(top);
}

/*
 * Free a bucket table, including its nested pages when the table is a
 * nested one (tbl->nest != 0).
 */
static void bucket_table_free(const struct bucket_table *tbl)
{
        /* Nested pages are separate allocations and must go first. */
        if (tbl->nest)
                nested_bucket_table_free(tbl);

        kvfree(tbl);
}

/*
 * RCU callback: map @head back to the bucket_table that embeds it and
 * free that table.
 */
static void bucket_table_free_rcu(struct rcu_head *head)
{
        struct bucket_table *tbl;

        tbl = container_of(head, struct bucket_table, rcu);
        bucket_table_free(tbl);
}

/*
 * Return the nested-table page that *@prev points to, allocating and
 * installing a new zeroed page when the slot is still empty.
 *
 * @ht:   hash table; supplies the allocation tag
 * @prev: slot that holds (or will hold) the page pointer
 * @leaf: when true, every entry of a freshly allocated page is set up as
 *        an empty bucket via INIT_RHT_NULLS_HEAD()
 *
 * The page is allocated with GFP_ATOMIC and installed with cmpxchg(), so
 * concurrent callers agree on a single page. Returns NULL only if the
 * allocation failed and the slot was still empty at the cmpxchg().
 */
static union nested_table *nested_table_alloc(struct rhashtable *ht,
                                              union nested_table __rcu **prev,
                                              bool leaf)
{
        union nested_table *ntbl;
        int i;

        /* Fast path: the page already exists. */
        ntbl = rcu_dereference(*prev);
        if (ntbl)
                return ntbl;

        ntbl = alloc_hooks_tag(ht->alloc_tag,
                        kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO));

        if (ntbl && leaf) {
                for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
                        INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
        }

        /*
         * Install the page only if the slot is still NULL. With a failed
         * allocation this swaps NULL for NULL and NULL is returned.
         */
        if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
                return ntbl;
        /* Raced with another thread. */
        kfree(ntbl);
        return rcu_dereference(*prev);
}

/*
 * Allocate a nested bucket table: a bucket_table header with room for one
 * bucket slot, which holds the pointer to the top-level nested page.
 *
 * Returns NULL when @nbuckets is below 2^(shift + 1), where 2^shift is the
 * number of pointers that fit in one page, or when an allocation fails.
 * tbl->size is not set here.
 */
static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
                                                      size_t nbuckets,
                                                      gfp_t gfp)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        struct bucket_table *tbl;
        size_t size;

        if (nbuckets < (1 << (shift + 1)))
                return NULL;

        /* Header plus one bucket slot for the top-level page pointer. */
        size = sizeof(*tbl) + sizeof(tbl->buckets[0]);

        tbl = alloc_hooks_tag(ht->alloc_tag,
                        kmalloc_noprof(size, gfp|__GFP_ZERO));
        if (!tbl)
                return NULL;

        /* Allocate the top-level page as a non-leaf (interior) page. */
        if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
                                false)) {
                kfree(tbl);
                return NULL;
        }

        /*
         * Result is in the range 1..shift. nested_bucket_table_free()
         * treats 1 << tbl->nest as the number of top-level slots in use.
         */
        tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;

        return tbl;
}

/*
 * Allocate and initialise a bucket table with @nbuckets buckets.
 *
 * A flat table is tried first. If that allocation fails and @gfp does not
 * allow blocking, a nested table is tried instead. Returns the new table,
 * or NULL if no table could be allocated.
 */
static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
                                               size_t nbuckets,
                                               gfp_t gfp)
{
        struct bucket_table *tbl = NULL;
        size_t size;
        int i;
        static struct lock_class_key __key;

        tbl = alloc_hooks_tag(ht->alloc_tag,
                        kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets),
                                             1, gfp|__GFP_ZERO, NUMA_NO_NODE));

        /* Remember the requested size; nbuckets may be zeroed below. */
        size = nbuckets;

        if (tbl == NULL && !gfpflags_allow_blocking(gfp)) {
                tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
                /* A nested table has no flat bucket array to initialise. */
                nbuckets = 0;
        }

        if (tbl == NULL)
                return NULL;

        lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0);

        tbl->size = size;

        rcu_head_init(&tbl->rcu);
        INIT_LIST_HEAD(&tbl->walkers);

        tbl->hash_rnd = get_random_u32();

        /* Skipped entirely for a nested table (nbuckets == 0). */
        for (i = 0; i < nbuckets; i++)
                INIT_RHT_NULLS_HEAD(tbl->buckets[i]);

        return tbl;
}

/*
 * Follow the future_tbl chain starting at @tbl and return the last table
 * in it, i.e. the one whose future_tbl is NULL. @tbl must not be NULL.
 */
static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
                                                  struct bucket_table *tbl)
{
        struct bucket_table *last = tbl;
        struct bucket_table *next;

        for (;;) {
                next = rht_dereference_rcu(last->future_tbl, ht);
                if (!next)
                        return last;
                last = next;
        }
}

/*
 * Move one entry, the last one in the chain of old bucket @bkt, to the
 * head of its bucket in the newest table.
 *
 * @ht:       hash table being rehashed
 * @bkt:      bucket in the current table (ht->tbl); the only visible caller,
 *            rhashtable_rehash_chain(), holds its lock
 * @old_hash: index of @bkt in the current table
 *
 * Returns 0 when an entry was moved, -ENOENT when the chain is empty and
 * -EAGAIN when the newest table is a nested one.
 */
static int rhashtable_rehash_one(struct rhashtable *ht,
                                 struct rhash_lock_head __rcu **bkt,
                                 unsigned int old_hash)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
        int err = -EAGAIN;
        struct rhash_head *head, *next, *entry;
        struct rhash_head __rcu **pprev = NULL;
        unsigned int new_hash;
        unsigned long flags;

        if (new_tbl->nest)
                goto out;

        err = -ENOENT;

        /*
         * Walk to the tail of the chain. On exit @entry is the last
         * element, @next is the nulls marker that follows it and @pprev
         * points at the predecessor's next field (NULL when @entry is the
         * first element of the chain).
         */
        rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
                          old_tbl, old_hash) {
                err = 0;
                next = rht_dereference_bucket(entry->next, old_tbl, old_hash);

                if (rht_is_a_nulls(next))
                        break;

                pprev = &entry->next;
        }

        /* Still -ENOENT: the loop body never ran, the chain is empty. */
        if (err)
                goto out;

        new_hash = head_hashfn(ht, new_tbl, entry);

        flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash],
                                SINGLE_DEPTH_NESTING);

        head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);

        /* Link the entry in front of the new bucket's current chain. */
        RCU_INIT_POINTER(entry->next, head);

        rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags);

        /* Unlink the entry from the old chain. */
        if (pprev)
                rcu_assign_pointer(*pprev, next);
        else
                /* Need to preserve the bit lock. */
                rht_assign_locked(bkt, next);

out:
        return err;
}

/*
 * Move every entry of bucket @old_hash in the current table to the newest
 * table, holding the old bucket's lock for the whole operation.
 *
 * Returns 0 when the bucket has been emptied or when rht_bucket_var()
 * yields no bucket for @old_hash. Any error from rhashtable_rehash_one()
 * other than -ENOENT is passed on.
 */
static int rhashtable_rehash_chain(struct rhashtable *ht,
                                    unsigned int old_hash)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
        unsigned long flags;
        int err;

        if (!bkt)
                return 0;
        flags = rht_lock(old_tbl, bkt);

        /* Move entries one at a time until something other than 0 comes back. */
        while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
                ;

        /* -ENOENT just means the chain is empty now, which is success. */
        if (err == -ENOENT)
                err = 0;
        rht_unlock(old_tbl, bkt, flags);

        return err;
}

/*
 * Chain @new_tbl behind @old_tbl as its future table.
 *
 * Returns 0 on success, or -EEXIST when @old_tbl already has a future
 * table, in which case nothing is changed.
 */
static int rhashtable_rehash_attach(struct rhashtable *ht,
                                    struct bucket_table *old_tbl,
                                    struct bucket_table *new_tbl)
{
        struct bucket_table *already;

        /* Make insertions go into the new, empty table right away. Deletions
         * and lookups will be attempted in both tables until we synchronize.
         * As cmpxchg() provides strong barriers, we do not need
         * rcu_assign_pointer().
         */
        already = cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
                          new_tbl);

        return already ? -EEXIST : 0;
}

/*
 * Move every bucket of the current table into its future table and then
 * make the future table the current one.
 *
 * Returns 0 when there is no future table attached, or when the move
 * completed and the new table has no future table of its own.  Returns
 * -EAGAIN when the move completed but the new table already has a future
 * table attached.  The first error reported by rhashtable_rehash_chain()
 * stops the move and is returned unchanged; ht->tbl is not switched in
 * that case.
 */
static int rhashtable_rehash_table(struct rhashtable *ht)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct bucket_table *new_tbl;
        struct rhashtable_walker *walker;
        unsigned int old_hash;
        int err;

        new_tbl = rht_dereference(old_tbl->future_tbl, ht);
        if (!new_tbl)
                return 0;

        /* Rehash bucket by bucket, yielding the CPU between buckets. */
        for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
                err = rhashtable_rehash_chain(ht, old_hash);
                if (err)
                        return err;
                cond_resched();
        }

        /* Publish the new table pointer. */
        rcu_assign_pointer(ht->tbl, new_tbl);

        /* Detach all walkers still registered on the old table. */
        spin_lock(&ht->lock);
        list_for_each_entry(walker, &old_tbl->walkers, list)
                walker->tbl = NULL;

        /* Wait for readers. All new readers will see the new
         * table, and thus no references to the old table will
         * remain.
         * We do this inside the locked region so that
         * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
         * to check if it should not re-link the table.
         */
        call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
        spin_unlock(&ht->lock);

        /* Tell the caller whether another table is already chained on. */
        return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}

/*
 * Allocate a bucket table of @size buckets (GFP_KERNEL) and attach it as
 * the future table of @old_tbl.  Asserts that ht->mutex is held.
 *
 * Returns 0 on success, -ENOMEM if the allocation failed, or the error
 * from rhashtable_rehash_attach(); on an attach failure the freshly
 * allocated table is freed again.
 */
static int rhashtable_rehash_alloc(struct rhashtable *ht,
                                   struct bucket_table *old_tbl,
                                   unsigned int size)
{
        struct bucket_table *fresh;
        int rc;

        ASSERT_RHT_MUTEX(ht);

        fresh = bucket_table_alloc(ht, size, GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        rc = rhashtable_rehash_attach(ht, old_tbl, fresh);
        if (!rc)
                return 0;

        bucket_table_free(fresh);
        return rc;
}

/**
 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 * @ht:         the hash table to shrink
 *
 * Computes the target size as 1.5 times the current element count rounded
 * up to a power of two, but never below the configured minimum size, and
 * attaches a table of that size as the future table.  The entries
 * themselves are not moved by this function.
 *
 * The caller must hold ht->mutex so that no other resize runs
 * concurrently.
 *
 * RCU protected lookups and traversals, as well as insertions and
 * deletions protected by the per bucket locks, may run concurrently.
 *
 * Returns 0 if the current table is not larger than the target size or
 * the new table was attached, -EEXIST if a future table is already
 * attached, or the error from rhashtable_rehash_alloc().
 */
static int rhashtable_shrink(struct rhashtable *ht)
{
        struct bucket_table *cur = rht_dereference(ht->tbl, ht);
        unsigned int count = atomic_read(&ht->nelems);
        unsigned int target;

        target = count ? roundup_pow_of_two(count * 3 / 2) : 0;
        if (target < ht->p.min_size)
                target = ht->p.min_size;

        /* Already small enough: nothing to do. */
        if (cur->size <= target)
                return 0;

        /* A resize is already pending on this table. */
        if (rht_dereference(cur->future_tbl, ht))
                return -EEXIST;

        return rhashtable_rehash_alloc(ht, cur, target);
}

/*
 * Work item that performs resizes under ht->mutex.
 *
 * Looks at the last table in the chain and, depending on its load, attaches
 * a table of double the size, asks rhashtable_shrink() for a smaller one
 * (only with automatic_shrinking), or attaches a same-size replacement when
 * the table is nested.  If that step succeeded or reported -EEXIST, one
 * rehash pass is run.  The work is rescheduled whenever the resulting
 * error code is non-zero.
 */
static void rht_deferred_worker(struct work_struct *work)
{
        struct rhashtable *ht = container_of(work, struct rhashtable, run_work);
        struct bucket_table *last;
        int err = 0;

        mutex_lock(&ht->mutex);

        last = rht_dereference(ht->tbl, ht);
        last = rhashtable_last_table(ht, last);

        if (rht_grow_above_75(ht, last)) {
                err = rhashtable_rehash_alloc(ht, last, last->size * 2);
        } else if (ht->p.automatic_shrinking &&
                   rht_shrink_below_30(ht, last)) {
                err = rhashtable_shrink(ht);
        } else if (last->nest) {
                err = rhashtable_rehash_alloc(ht, last, last->size);
        }

        if (!err || err == -EEXIST) {
                int rehash_err = rhashtable_rehash_table(ht);

                /* An earlier -EEXIST takes precedence over the rehash result. */
                if (!err)
                        err = rehash_err;
        }

        mutex_unlock(&ht->mutex);

        if (err)
                schedule_work(&ht->run_work);
}

/*
 * Try to attach a future table to @tbl from the insertion path, using an
 * atomic, non-warning allocation.
 *
 * The new table is twice the size of @tbl when @tbl is more than 75% full,
 * otherwise the same size; a same-size rehash is only started when @tbl is
 * the table currently published in ht->tbl.
 *
 * Returns 0 if a future table is attached to @tbl afterwards (by us or by
 * somebody else), -EBUSY or -ENOMEM otherwise, or the error from
 * rhashtable_rehash_attach().  On -ENOMEM the deferred worker is scheduled
 * so the allocation can be retried in process context.
 */
static int rhashtable_insert_rehash(struct rhashtable *ht,
                                    struct bucket_table *tbl)
{
        struct bucket_table *published = rht_dereference_rcu(ht->tbl, ht);
        struct bucket_table *fresh;
        unsigned int want = tbl->size;
        int rc;

        if (rht_grow_above_75(ht, tbl)) {
                want *= 2;
        } else if (published != tbl) {
                /* Do not schedule more than one rehash */
                rc = -EBUSY;
                goto fail;
        }

        fresh = bucket_table_alloc(ht, want, GFP_ATOMIC | __GFP_NOWARN);
        if (!fresh) {
                rc = -ENOMEM;
                goto fail;
        }

        rc = rhashtable_rehash_attach(ht, tbl, fresh);
        if (!rc) {
                schedule_work(&ht->run_work);
                return 0;
        }

        /* Somebody else attached a table first; ours is not needed. */
        bucket_table_free(fresh);
        return rc == -EEXIST ? 0 : rc;

fail:
        /* Do not fail the insert if someone else did a rehash. */
        if (likely(rcu_access_pointer(tbl->future_tbl)))
                return 0;

        /* Schedule async rehash to retry allocation in process context. */
        if (rc == -ENOMEM)
                schedule_work(&ht->run_work);

        return rc;
}

/*
 * Search one bucket chain of @tbl for an entry matching @key.
 *
 * The visible caller holds the bucket lock of @bkt around this call.
 *
 * Return value:
 *  - pointer to the matching object, for a plain (non-rhlist) table;
 *  - NULL, for an rhlist table, after @obj has been linked in front of the
 *    matching entry's list and put into the chain in that entry's place;
 *  - ERR_PTR(-EAGAIN) if nothing matched and the walk used up the
 *    RHT_ELASTICITY budget;
 *  - ERR_PTR(-ENOENT) if nothing matched otherwise.
 *
 * A NULL @key never matches, so the whole chain is walked.
 */
static void *rhashtable_lookup_one(struct rhashtable *ht,
                                   struct rhash_lock_head __rcu **bkt,
                                   struct bucket_table *tbl, unsigned int hash,
                                   const void *key, struct rhash_head *obj)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        /* Link that points at the current entry; NULL while at the head. */
        struct rhash_head __rcu **pprev = NULL;
        struct rhash_head *head;
        int elasticity;

        elasticity = RHT_ELASTICITY;
        rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *list;
                struct rhlist_head *plist;

                elasticity--;
                /* A non-zero compare result means "no match": move on. */
                if (!key ||
                    (ht->p.obj_cmpfn ?
                     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
                        pprev = &head->next;
                        continue;
                }

                if (!ht->rhlist)
                        return rht_obj(ht, head);

                list = container_of(obj, struct rhlist_head, rhead);
                plist = container_of(head, struct rhlist_head, rhead);

                /* Fully initialise @obj before making it reachable:
                 * first its duplicate-list link, then its chain link.
                 */
                RCU_INIT_POINTER(list->next, plist);
                head = rht_dereference_bucket(head->next, tbl, hash);
                RCU_INIT_POINTER(list->rhead.next, head);
                if (pprev)
                        rcu_assign_pointer(*pprev, obj);
                else
                        /* Need to preserve the bit lock */
                        rht_assign_locked(bkt, obj);

                return NULL;
        }

        if (elasticity <= 0)
                return ERR_PTR(-EAGAIN);

        return ERR_PTR(-ENOENT);
}

/*
 * Act on the result @data of rhashtable_lookup_one() for bucket @bkt.
 *
 * Return value:
 *  - ERR_PTR(-EEXIST) if @data is an existing object;
 *  - @data cast to the return type if it is NULL or an error other than
 *    -EAGAIN / -ENOENT;
 *  - the future table of @tbl if there is one (the caller should retry
 *    there);
 *  - ERR_PTR(-EAGAIN) if the lookup asked for a retry, or if the table is
 *    above 100% load;
 *  - ERR_PTR(-E2BIG) if the table is above its maximum;
 *  - NULL after @obj has been inserted at the head of the bucket.
 */
static struct bucket_table *rhashtable_insert_one(
        struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
        struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
        void *data)
{
        struct bucket_table *future;
        struct rhash_head *first;
        long status = PTR_ERR(data);

        if (!IS_ERR_OR_NULL(data))
                return ERR_PTR(-EEXIST);

        if (status != -EAGAIN && status != -ENOENT)
                return ERR_CAST(data);

        future = rht_dereference_rcu(tbl->future_tbl, ht);
        if (future)
                return future;

        /* Only -EAGAIN and -ENOENT can reach this point. */
        if (status == -EAGAIN)
                return ERR_CAST(data);

        if (unlikely(rht_grow_above_max(ht, tbl)))
                return ERR_PTR(-E2BIG);

        if (unlikely(rht_grow_above_100(ht, tbl)))
                return ERR_PTR(-EAGAIN);

        first = rht_ptr(bkt, tbl, hash);
        RCU_INIT_POINTER(obj->next, first);

        if (ht->rhlist) {
                struct rhlist_head *entry;

                entry = container_of(obj, struct rhlist_head, rhead);
                RCU_INIT_POINTER(entry->next, NULL);
        }

        /* bkt is always the head of the list, so it holds
         * the lock, which we need to preserve
         */
        rht_assign_locked(bkt, obj);

        return NULL;
}

/*
 * One insertion attempt for @obj, following the chain of future tables.
 *
 * Starting at ht->tbl, each iteration locks the bucket for @obj in the
 * current table, runs rhashtable_lookup_one() and rhashtable_insert_one(),
 * and moves on to the next table for as long as rhashtable_insert_one()
 * hands back a table pointer.
 *
 * Returns NULL on a successful insertion, the already existing object if
 * the lookup found one, or an ERR_PTR().  If the attempt ends with
 * -EAGAIN, rhashtable_insert_rehash() is called: a non-zero error from it
 * replaces the result, otherwise -EAGAIN is returned so the caller retries.
 *
 * The visible caller wraps this function in rcu_read_lock() /
 * rcu_read_unlock().
 */
static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
                                   struct rhash_head *obj)
{
        struct bucket_table *new_tbl;
        struct bucket_table *tbl;
        struct rhash_lock_head __rcu **bkt;
        unsigned long flags;
        unsigned int hash;
        void *data;

        new_tbl = rcu_dereference(ht->tbl);

        do {
                tbl = new_tbl;
                hash = rht_head_hashfn(ht, tbl, obj, ht->p);
                /* A table that already has a successor is only searched,
                 * so a missing bucket is acceptable there.
                 */
                if (rcu_access_pointer(tbl->future_tbl))
                        /* Failure is OK */
                        bkt = rht_bucket_var(tbl, hash);
                else
                        bkt = rht_bucket_insert(ht, tbl, hash);
                if (bkt == NULL) {
                        /* No bucket: continue in the future table, if any. */
                        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
                        data = ERR_PTR(-EAGAIN);
                } else {
                        bool inserted;

                        flags = rht_lock(tbl, bkt);
                        data = rhashtable_lookup_one(ht, bkt, tbl,
                                                     hash, key, obj);
                        new_tbl = rhashtable_insert_one(ht, bkt, tbl,
                                                        hash, obj, data);
                        /* Count the element only when insert_one() linked
                         * it in (NULL result) and the lookup had not already
                         * consumed @obj (NULL lookup result).
                         */
                        inserted = data && !new_tbl;
                        if (inserted)
                                atomic_inc(&ht->nelems);
                        /* On -EEXIST keep the found object in @data;
                         * otherwise report the insert_one() result.
                         */
                        if (PTR_ERR(new_tbl) != -EEXIST)
                                data = ERR_CAST(new_tbl);

                        rht_unlock(tbl, bkt, flags);

                        if (inserted && rht_grow_above_75(ht, tbl))
                                schedule_work(&ht->run_work);
                }
        } while (!IS_ERR_OR_NULL(new_tbl));

        if (PTR_ERR(data) == -EAGAIN)
                data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
                               -EAGAIN);

        return data;
}

/*
 * Slow-path insertion: repeat rhashtable_try_insert(), each attempt inside
 * its own RCU read-side critical section, until the result is something
 * other than -EAGAIN, and return that result.
 */
void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
                             struct rhash_head *obj)
{
        void *result;

        for (;;) {
                rcu_read_lock();
                result = rhashtable_try_insert(ht, key, obj);
                rcu_read_unlock();

                if (PTR_ERR(result) != -EAGAIN)
                        return result;
        }
}
EXPORT_SYMBOL_GPL(rhashtable_insert_slow);

/**
 * rhashtable_walk_enter - Initialise an iterator
 * @ht:                Table to walk over
 * @iter:        Hash table Iterator
 *
 * This function prepares a hash table walk.
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice.  Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptible context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
{
        iter->ht = ht;
        iter->p = NULL;
        iter->slot = 0;
        iter->skip = 0;
        iter->end_of_table = 0;

        spin_lock(&ht->lock);
        iter->walker.tbl =
                rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
        list_add(&iter->walker.list, &iter->walker.tbl->walkers);
        spin_unlock(&ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_enter);

/**
 * rhashtable_walk_exit - Free an iterator
 * @iter:        Hash table Iterator
 *
 * This function frees resources allocated by rhashtable_walk_enter.
 */
void rhashtable_walk_exit(struct rhashtable_iter *iter)
{
        spin_lock(&iter->ht->lock);
        if (iter->walker.tbl)
                list_del(&iter->walker.list);
        spin_unlock(&iter->ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_exit);

/**
 * rhashtable_walk_start_check - Start a hash table walk
 * @iter:        Hash table iterator
 *
 * Start a hash table walk at the current iterator position.  Note that we take
 * the RCU lock in all cases including when we return an error.  So you must
 * always call rhashtable_walk_stop to clean up.
 *
 * While the walk is running the iterator is taken off the table's walker
 * list; rhashtable_walk_stop puts it back.
 *
 * Returns zero if successful.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may use it immediately
 * by calling rhashtable_walk_next.
 *
 * rhashtable_walk_start is defined as an inline variant that returns
 * void. This is preferred in cases where the caller would ignore
 * resize events and always continue.
 */
int rhashtable_walk_start_check(struct rhashtable_iter *iter)
        __acquires(RCU)
{
        struct rhashtable *ht = iter->ht;
        bool rhlist = ht->rhlist;

        rcu_read_lock();

        /* Unregister from the walker list for the duration of the walk. */
        spin_lock(&ht->lock);
        if (iter->walker.tbl)
                list_del(&iter->walker.list);
        spin_unlock(&ht->lock);

        /* A finished walk stays finished. */
        if (iter->end_of_table)
                return 0;
        /* The table we were walking is gone: restart on the current one. */
        if (!iter->walker.tbl) {
                iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
                iter->slot = 0;
                iter->skip = 0;
                return -EAGAIN;
        }

        if (iter->p && !rhlist) {
                /*
                 * We need to validate that 'p' is still in the table, and
                 * if so, update 'skip'
                 */
                struct rhash_head *p;
                int skip = 0;
                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
                        skip++;
                        if (p == iter->p) {
                                iter->skip = skip;
                                goto found;
                        }
                }
                /* Not found: fall back to the saved slot/skip position. */
                iter->p = NULL;
        } else if (iter->p && rhlist) {
                /* Need to validate that 'list' is still in the table, and
                 * if so, update 'skip' and 'p'.
                 */
                struct rhash_head *p;
                struct rhlist_head *list;
                int skip = 0;
                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
                        for (list = container_of(p, struct rhlist_head, rhead);
                             list;
                             list = rcu_dereference(list->next)) {
                                skip++;
                                if (list == iter->list) {
                                        iter->p = p;
                                        iter->skip = skip;
                                        goto found;
                                }
                        }
                }
                /* Not found: fall back to the saved slot/skip position. */
                iter->p = NULL;
        }
found:
        return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);

/**
 * __rhashtable_walk_find_next - Find the next element in a table (or the first
 * one in case of a new walk).
 *
 * @iter:        Hash table iterator
 *
 * Scans forward from iter->slot, skipping the first iter->skip objects of
 * the starting slot, and records the object found in the iterator.
 *
 * Returns the found object or NULL when the end of the table is reached.
 * NULL is also returned when the iterator has no table.
 *
 * Returns -EAGAIN if resize event occurred.
 */
static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
{
        struct bucket_table *tbl = iter->walker.tbl;
        struct rhlist_head *list = iter->list;
        struct rhashtable *ht = iter->ht;
        struct rhash_head *p = iter->p;
        bool rhlist = ht->rhlist;

        if (!tbl)
                return NULL;

        for (; iter->slot < tbl->size; iter->slot++) {
                /* Number of objects in this slot that were already visited. */
                int skip = iter->skip;

                rht_for_each_rcu(p, tbl, iter->slot) {
                        if (rhlist) {
                                /* Every member of a duplicate list counts
                                 * as one object.
                                 */
                                list = container_of(p, struct rhlist_head,
                                                    rhead);
                                do {
                                        if (!skip)
                                                goto next;
                                        skip--;
                                        list = rcu_dereference(list->next);
                                } while (list);

                                continue;
                        }
                        if (!skip)
                                break;
                        skip--;
                }

next:
                /* p is a nulls marker when the slot has been exhausted. */
                if (!rht_is_a_nulls(p)) {
                        iter->skip++;
                        iter->p = p;
                        iter->list = list;
                        return rht_obj(ht, rhlist ? &list->rhead : p);
                }

                iter->skip = 0;
        }

        iter->p = NULL;

        /* Ensure we see any new tables. */
        smp_rmb();

        /* End of this table: continue in its future table, if there is one. */
        iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (iter->walker.tbl) {
                iter->slot = 0;
                iter->skip = 0;
                return ERR_PTR(-EAGAIN);
        } else {
                iter->end_of_table = true;
        }

        return NULL;
}

/**
 * rhashtable_walk_next - Return the next object and advance the iterator
 * @iter:       Hash table iterator
 *
 * Note that you must call rhashtable_walk_stop when you are finished
 * with the walk.
 *
 * When the iterator still holds a current object, the successor is taken
 * directly from it; otherwise, or when the current slot is exhausted,
 * the search continues via __rhashtable_walk_find_next().
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_next(struct rhashtable_iter *iter)
{
        struct rhashtable *ht = iter->ht;
        struct rhlist_head *cur_list = iter->list;
        struct rhash_head *cur = iter->p;
        bool rhlist = ht->rhlist;

        if (!cur)
                return __rhashtable_walk_find_next(iter);

        /* In an rhlist table, first try the next duplicate of this key. */
        if (rhlist)
                cur_list = rcu_dereference(cur_list->next);

        /* Otherwise step to the next entry of the bucket chain. */
        if (!rhlist || !cur_list) {
                cur = rcu_dereference(cur->next);
                cur_list = container_of(cur, struct rhlist_head, rhead);
        }

        if (rht_is_a_nulls(cur)) {
                /* At the end of this slot, switch to next one and then find
                 * next entry from that point.
                 */
                iter->skip = 0;
                iter->slot++;
                return __rhashtable_walk_find_next(iter);
        }

        iter->skip++;
        iter->p = cur;
        iter->list = cur_list;

        return rht_obj(ht, rhlist ? &cur_list->rhead : cur);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_next);

/**
 * rhashtable_walk_peek - Return the next object but don't advance the iterator
 * @iter:       Hash table iterator
 *
 * If the iterator currently holds an object, that object is returned.
 * Otherwise the table is searched from the iterator's position.
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_peek(struct rhashtable_iter *iter)
{
        struct rhashtable *ht = iter->ht;
        struct rhash_head *cur = iter->p;

        if (cur) {
                if (ht->rhlist)
                        return rht_obj(ht, &iter->list->rhead);

                return rht_obj(ht, cur);
        }

        /* No object found in current iter, find next one in the table. */

        /* A nonzero skip value points to the next entry in the table
         * beyond that last one that was found. Decrement skip so
         * we find the current value. __rhashtable_walk_find_next
         * will restore the original value of skip assuming that
         * the table hasn't changed.
         */
        if (iter->skip)
                iter->skip--;

        return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_peek);

/**
 * rhashtable_walk_stop - Finish a hash table walk
 * @iter:       Hash table iterator
 *
 * Finish a hash table walk.  Does not reset the iterator to the start of the
 * hash table.
 *
 * If the iterator still has a table, it is put back on that table's walker
 * list, unless the table has already been handed to call_rcu() for freeing,
 * in which case the iterator drops its reference to it.  The RCU read lock
 * is released in all cases.
 */
void rhashtable_walk_stop(struct rhashtable_iter *iter)
        __releases(RCU)
{
        struct bucket_table *tbl = iter->walker.tbl;

        if (tbl) {
                struct rhashtable *ht = iter->ht;

                spin_lock(&ht->lock);
                if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) {
                        /* This bucket table is being freed, don't re-link it. */
                        iter->walker.tbl = NULL;
                } else {
                        list_add(&iter->walker.list, &tbl->walkers);
                }
                spin_unlock(&ht->lock);
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rhashtable_walk_stop);

/*
 * Pick the initial number of buckets for a table described by @params.
 *
 * With an element hint, the size is 4/3 of the hint rounded up to a power
 * of two; without one it is HASH_DEFAULT_SIZE.  In both cases the result
 * is at least params->min_size.
 */
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
        unsigned long min_size = params->min_size;

        if (!params->nelem_hint)
                return max(HASH_DEFAULT_SIZE, min_size);

        return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), min_size);
}

/*
 * Adapter that gives jhash2() the generic (const void *, u32, u32) hash
 * function signature.  @length is passed to jhash2() unchanged.
 */
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
{
        const u32 *words = key;

        return jhash2(words, length, seed);
}

/**
 * rhashtable_init - initialize a new hash table
 * @ht:                hash table to be initialized
 * @params:        configuration parameters
 *
 * Initializes a new hash table based on the provided configuration
 * parameters. A table can be configured either with a variable or
 * fixed length key:
 *
 * Configuration Example 1: Fixed length keys
 * struct test_obj {
 *        int                        key;
 *        void *                        my_member;
 *        struct rhash_head        node;
 * };
 *
 * struct rhashtable_params params = {
 *        .head_offset = offsetof(struct test_obj, node),
 *        .key_offset = offsetof(struct test_obj, key),
 *        .key_len = sizeof(int),
 *        .hashfn = jhash,
 * };
 *
 * Configuration Example 2: Variable length keys
 * struct test_obj {
 *        [...]
 *        struct rhash_head        node;
 * };
 *
 * u32 my_hash_fn(const void *data, u32 len, u32 seed)
 * {
 *        const struct test_obj *obj = data;
 *
 *        return [... hash ...];
 * }
 *
 * int my_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj)
 * {
 *        return [... 0 if obj matches arg->key, non-zero otherwise ...];
 * }
 *
 * struct rhashtable_params params = {
 *        .head_offset = offsetof(struct test_obj, node),
 *        .hashfn = jhash,
 *        .obj_hashfn = my_hash_fn,
 *        .obj_cmpfn = my_cmp_fn,
 * };
 *
 * Note that .obj_cmpfn is mandatory whenever .obj_hashfn is set.
 *
 * Returns 0 on success.  Returns -EINVAL, without touching @ht, if neither
 * a key length nor an object hash function is given, or if an object hash
 * function is given without an object compare function.
 */
int rhashtable_init_noprof(struct rhashtable *ht,
                    const struct rhashtable_params *params)
{
        struct bucket_table *tbl;
        size_t size;

        if ((!params->key_len && !params->obj_hashfn) ||
            (params->obj_hashfn && !params->obj_cmpfn))
                return -EINVAL;

        memset(ht, 0, sizeof(*ht));
        mutex_init(&ht->mutex);
        spin_lock_init(&ht->lock);
        memcpy(&ht->p, params, sizeof(*params));

        alloc_tag_record(ht->alloc_tag);

        /* Table sizes are kept as powers of two: round min up, max down. */
        if (params->min_size)
                ht->p.min_size = roundup_pow_of_two(params->min_size);

        /* Cap total entries at 2^31 to avoid nelems overflow. */
        ht->max_elems = 1u << 31;

        if (params->max_size) {
                ht->p.max_size = rounddown_pow_of_two(params->max_size);
                if (ht->p.max_size < ht->max_elems / 2)
                        ht->max_elems = ht->p.max_size * 2;
        }

        ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);

        size = rounded_hashtable_size(&ht->p);

        /* Default hash: jhash, or jhash2 with the length converted to
         * u32 words when the key length is a multiple of sizeof(u32).
         */
        ht->key_len = ht->p.key_len;
        if (!params->hashfn) {
                ht->p.hashfn = jhash;

                if (!(ht->key_len & (sizeof(u32) - 1))) {
                        ht->key_len /= sizeof(u32);
                        ht->p.hashfn = rhashtable_jhash2;
                }
        }

        /*
         * This is api initialization and thus we need to guarantee the
         * initial rhashtable allocation. Upon failure, retry with the
         * smallest possible size with __GFP_NOFAIL semantics.
         */
        tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
        if (unlikely(tbl == NULL)) {
                size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
                tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
        }

        atomic_set(&ht->nelems, 0);

        RCU_INIT_POINTER(ht->tbl, tbl);

        INIT_WORK(&ht->run_work, rht_deferred_worker);

        return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_init_noprof);

/**
 * rhltable_init - initialize a new hash list table
 * @hlt:        hash list table to be initialized
 * @params:     configuration parameters
 *
 * Initializes the embedded hash table with rhashtable_init and then marks
 * it as a hash list table.  The mark is set regardless of the outcome.
 *
 * See documentation for rhashtable_init.
 *
 * Returns the result of initializing the embedded hash table.
 */
int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params)
{
        int rc = rhashtable_init_noprof(&hlt->ht, params);

        hlt->ht.rhlist = true;

        return rc;
}
EXPORT_SYMBOL_GPL(rhltable_init_noprof);

/*
 * Hand one chain entry to @free_fn.
 *
 * For a plain table that is the single object containing @obj.  For an
 * rhlist table every object on the duplicate list headed by @obj is passed
 * to @free_fn in list order; the link to the following object is read
 * before the current one is handed over.
 */
static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
                                void (*free_fn)(void *ptr, void *arg),
                                void *arg)
{
        struct rhlist_head *cur, *following;

        if (!ht->rhlist) {
                free_fn(rht_obj(ht, obj), arg);
                return;
        }

        cur = container_of(obj, struct rhlist_head, rhead);
        do {
                following = rht_dereference(cur->next, ht);
                free_fn(rht_obj(ht, &cur->rhead), arg);
                cur = following;
        } while (cur);
}

/**
 * rhashtable_free_and_destroy - free elements and destroy hash table
 * @ht:                the hash table to destroy
 * @free_fn:        callback to release resources of element
 * @arg:        pointer passed to free_fn
 *
 * Stops a possibly pending async resize. If defined, invokes free_fn for
 * each element to release its resources. Please note that RCU protected
 * readers may still be accessing the elements. Releasing of resources
 * must occur in a compatible manner. Then frees the bucket array, and
 * does the same for every future table chained to it.
 *
 * This function may sleep while waiting for an async resize
 * to complete. The caller is responsible for ensuring that no further
 * write operations occur in parallel.
 */
void rhashtable_free_and_destroy(struct rhashtable *ht,
                                 void (*free_fn)(void *ptr, void *arg),
                                 void *arg)
{
        struct bucket_table *tbl, *next_tbl;
        unsigned int i;

        cancel_work_sync(&ht->run_work);

        mutex_lock(&ht->mutex);
        tbl = rht_dereference(ht->tbl, ht);
restart:
        if (free_fn) {
                for (i = 0; i < tbl->size; i++) {
                        struct rhash_head *pos, *next;

                        cond_resched();
                        /* Walk the chain, always fetching the successor
                         * before the current entry is handed to free_fn.
                         */
                        for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
                             next = !rht_is_a_nulls(pos) ?
                                        rht_dereference(pos->next, ht) : NULL;
                             !rht_is_a_nulls(pos);
                             pos = next,
                             next = !rht_is_a_nulls(pos) ?
                                        rht_dereference(pos->next, ht) : NULL)
                                rhashtable_free_one(ht, pos, free_fn, arg);
                }
        }

        /* Read the successor before this table is freed. */
        next_tbl = rht_dereference(tbl->future_tbl, ht);
        bucket_table_free(tbl);
        if (next_tbl) {
                tbl = next_tbl;
                goto restart;
        }
        mutex_unlock(&ht->mutex);
}
EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);

/**
 * rhashtable_destroy - destroy hash table without freeing its elements
 * @ht:         the hash table to destroy
 *
 * Calls rhashtable_free_and_destroy() with no element callback, so only
 * the table itself is torn down; the elements are left untouched.
 */
void rhashtable_destroy(struct rhashtable *ht)
{
        /* No "return <expr>;" here: ISO C does not allow a return
         * statement with an expression in a function returning void.
         */
        rhashtable_free_and_destroy(ht, NULL, NULL);
}
EXPORT_SYMBOL_GPL(rhashtable_destroy);

/*
 * Find the bucket for @hash in a nested bucket table.
 *
 * The low tbl->nest bits of @hash index the top level; each further level
 * consumes another group of bits, where one level holds a page worth of
 * pointers.  Returns NULL if any level on the way has not been allocated,
 * otherwise a pointer to the bucket slot.
 */
struct rhash_lock_head __rcu **__rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        const unsigned int fanout = 1 << shift;
        unsigned int remaining = tbl->size >> tbl->nest;
        unsigned int subhash = hash >> tbl->nest;
        unsigned int idx = hash & ((1 << tbl->nest) - 1);
        union nested_table *level;

        level = nested_table_top(tbl);
        level = rht_dereference_bucket_rcu(level[idx].table, tbl, hash);

        for (; level && remaining > fanout;
             remaining >>= shift, subhash >>= shift) {
                idx = subhash & (fanout - 1);
                level = rht_dereference_bucket_rcu(level[idx].table,
                                                   tbl, hash);
        }

        return level ? &level[subhash].bucket : NULL;
}
EXPORT_SYMBOL_GPL(__rht_bucket_nested);

/*
 * Like __rht_bucket_nested(), but never returns NULL: when the nested
 * bucket does not exist, a pointer to a shared static bucket initialised
 * with INIT_RHT_NULLS_HEAD() is returned instead.
 */
struct rhash_lock_head __rcu **rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash)
{
        static struct rhash_lock_head __rcu *rhnull;
        struct rhash_lock_head __rcu **bucket;

        if (!rhnull)
                INIT_RHT_NULLS_HEAD(rhnull);

        bucket = __rht_bucket_nested(tbl, hash);
        if (bucket)
                return bucket;

        return &rhnull;
}
EXPORT_SYMBOL_GPL(rht_bucket_nested);

/*
 * Find the bucket for @hash in a nested bucket table, allocating missing
 * levels on the way via nested_table_alloc().
 *
 * Returns a pointer to the bucket slot, or NULL if a level could not be
 * obtained.
 */
struct rhash_lock_head __rcu **rht_bucket_nested_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        const unsigned int fanout = 1 << shift;
        unsigned int remaining = tbl->size >> tbl->nest;
        unsigned int idx = hash & ((1 << tbl->nest) - 1);
        union nested_table *level = nested_table_top(tbl);

        hash >>= tbl->nest;
        /* The last argument tells the allocator whether this is a leaf. */
        level = nested_table_alloc(ht, &level[idx].table,
                                   remaining <= fanout);

        while (level && remaining > fanout) {
                idx = hash & (fanout - 1);
                remaining >>= shift;
                hash >>= shift;
                level = nested_table_alloc(ht, &level[idx].table,
                                           remaining <= fanout);
        }

        return level ? &level[hash].bucket : NULL;
}
EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);














































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_PGALLOC_H
#define __ASM_GENERIC_PGALLOC_H

#ifdef CONFIG_MMU

#define GFP_PGTABLE_KERNEL        (GFP_KERNEL | __GFP_ZERO)
#define GFP_PGTABLE_USER        (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

/**
 * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
        struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL &
                        ~__GFP_HIGHMEM, 0);

        if (!ptdesc)
                return NULL;
        if (!pagetable_pte_ctor(mm, ptdesc)) {
                pagetable_free(ptdesc);
                return NULL;
        }

        return ptdesc_address(ptdesc);
}
#define __pte_alloc_one_kernel(...)        alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
 * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * Generic version, used unless the architecture defines
 * %__HAVE_ARCH_PTE_ALLOC_ONE_KERNEL.  It simply defers to
 * __pte_alloc_one_kernel_noprof().
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
        pte_t *table = __pte_alloc_one_kernel_noprof(mm);

        return table;
}
#define pte_alloc_one_kernel(...)        alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
#endif

/**
 * pte_free_kernel - free PTE-level kernel page table memory
 * @mm: the mm_struct of the current context (not used by this generic version)
 * @pte: pointer to the memory containing the page table
 *
 * Looks up the ptdesc backing @pte and passes it to pagetable_dtor_free().
 */
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
        struct ptdesc *pt = virt_to_ptdesc(pte);

        pagetable_dtor_free(pt);
}

/**
 * __pte_alloc_one - allocate memory for a PTE-level user page table
 * @mm: the mm_struct of the current context
 * @gfp: GFP flags to use for the allocation
 *
 * Allocates an order-0 page table with the caller's @gfp flags and runs
 * pagetable_pte_ctor() on it.  If the constructor fails the table is freed
 * again and the allocation is reported as failed.
 *
 * Meant for architectures that need more than a simple page allocation or
 * that have to supply their own GFP flags.
 *
 * Return: `struct page` referencing the ptdesc or %NULL on error
 */
static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp)
{
        struct ptdesc *pt = pagetable_alloc_noprof(gfp, 0);

        if (!pt)
                return NULL;

        if (pagetable_pte_ctor(mm, pt))
                return ptdesc_page(pt);

        /* Constructor failed: give the freshly allocated table back. */
        pagetable_free(pt);
        return NULL;
}
#define __pte_alloc_one(...)        alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
/**
 * pte_alloc_one - allocate a page for PTE-level user page table
 * @mm: the mm_struct of the current context
 *
 * Generic version, used unless the architecture defines
 * %__HAVE_ARCH_PTE_ALLOC_ONE.  Calls __pte_alloc_one_noprof() with
 * %GFP_PGTABLE_USER, which allocates the page table and runs
 * pagetable_pte_ctor() on it.
 *
 * Return: `struct page` referencing the ptdesc or %NULL on error
 */
static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm)
{
        pgtable_t table = __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER);

        return table;
}
#define pte_alloc_one(...)        alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__))
#endif

/*
 * Should really implement gc for free page table pages. This could be
 * done with a reference count in struct page.
 */

/**
 * pte_free - free PTE-level user page table memory
 * @mm: the mm_struct of the current context (not used by this generic version)
 * @pte_page: the `struct page` referencing the ptdesc
 *
 * Converts @pte_page to its ptdesc and passes it to pagetable_dtor_free().
 */
static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
        pagetable_dtor_free(page_ptdesc(pte_page));
}


#if CONFIG_PGTABLE_LEVELS > 2

#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
/**
 * pmd_alloc_one - allocate memory for a PMD-level page table
 * @mm: the mm_struct of the current context
 * @addr: address argument; not used by this generic implementation
 *
 * Allocates an order-0 page table and runs pagetable_pmd_ctor() on it,
 * freeing the table again if the constructor fails.
 *
 * %GFP_PGTABLE_KERNEL is used when @mm is &init_mm, %GFP_PGTABLE_USER
 * for every other mm.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        const gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL
                                           : GFP_PGTABLE_USER;
        struct ptdesc *pt = pagetable_alloc_noprof(gfp, 0);

        if (!pt)
                return NULL;

        if (pagetable_pmd_ctor(mm, pt))
                return ptdesc_address(pt);

        /* Constructor failed: give the freshly allocated table back. */
        pagetable_free(pt);
        return NULL;
}
#define pmd_alloc_one(...)        alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
#endif

#ifndef __HAVE_ARCH_PMD_FREE
/*
 * Generic PMD-level free: BUG if @pmd is not page aligned, then pass its
 * ptdesc to pagetable_dtor_free().  @mm is not used here.
 */
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *pt = virt_to_ptdesc(pmd);

        BUG_ON(((unsigned long)pmd & (PAGE_SIZE - 1)) != 0);
        pagetable_dtor_free(pt);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3

/*
 * Allocate an order-0 PUD-level page table and run pagetable_pud_ctor() on
 * it.  %GFP_PGTABLE_KERNEL is used when @mm is &init_mm, %GFP_PGTABLE_USER
 * otherwise; %__GFP_HIGHMEM is always masked off.  @addr is not used.
 * Returns the table's address, or NULL if the allocation failed.
 */
static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        const gfp_t base = (mm == &init_mm) ? GFP_PGTABLE_KERNEL
                                            : GFP_PGTABLE_USER;
        struct ptdesc *pt = pagetable_alloc_noprof(base & ~__GFP_HIGHMEM, 0);

        if (!pt)
                return NULL;

        pagetable_pud_ctor(pt);

        return ptdesc_address(pt);
}
#define __pud_alloc_one(...)        alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
 * pud_alloc_one - allocate memory for a PUD-level page table
 * @mm: the mm_struct of the current context
 * @addr: address argument, passed through to __pud_alloc_one_noprof()
 *
 * Generic version, used unless the architecture defines
 * %__HAVE_ARCH_PUD_ALLOC_ONE.  The allocation uses %GFP_PGTABLE_KERNEL
 * when @mm is &init_mm and %GFP_PGTABLE_USER otherwise.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        pud_t *table = __pud_alloc_one_noprof(mm, addr);

        return table;
}
#define pud_alloc_one(...)        alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__))
#endif

/*
 * Free a PUD-level page table: BUG if @pud is not page aligned, then pass
 * its ptdesc to pagetable_dtor_free().  @mm is not used here.
 */
static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
{
        struct ptdesc *pt = virt_to_ptdesc(pud);

        BUG_ON(((unsigned long)pud & (PAGE_SIZE - 1)) != 0);
        pagetable_dtor_free(pt);
}

#ifndef __HAVE_ARCH_PUD_FREE
/*
 * Generic pud_free(), used unless the architecture defines
 * __HAVE_ARCH_PUD_FREE: forwards to __pud_free().
 */
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
        __pud_free(mm, pud);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 3 */

#if CONFIG_PGTABLE_LEVELS > 4

/*
 * Allocate an order-0 P4D-level page table and run pagetable_p4d_ctor() on
 * it.  %GFP_PGTABLE_KERNEL is used when @mm is &init_mm, %GFP_PGTABLE_USER
 * otherwise; %__GFP_HIGHMEM is always masked off.  @addr is not used.
 * Returns the table's address, or NULL if the allocation failed.
 */
static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        const gfp_t base = (mm == &init_mm) ? GFP_PGTABLE_KERNEL
                                            : GFP_PGTABLE_USER;
        struct ptdesc *pt = pagetable_alloc_noprof(base & ~__GFP_HIGHMEM, 0);

        if (!pt)
                return NULL;

        pagetable_p4d_ctor(pt);

        return ptdesc_address(pt);
}
#define __p4d_alloc_one(...)        alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_P4D_ALLOC_ONE
/*
 * Generic p4d_alloc_one(), used unless the architecture defines
 * __HAVE_ARCH_P4D_ALLOC_ONE: defers to __p4d_alloc_one_noprof() and
 * returns its result (NULL on allocation failure).
 */
static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        p4d_t *table = __p4d_alloc_one_noprof(mm, addr);

        return table;
}
#define p4d_alloc_one(...)        alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__))
#endif

/*
 * Free a P4D-level page table: BUG if @p4d is not page aligned, then pass
 * its ptdesc to pagetable_dtor_free().  @mm is not used here.
 */
static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
        struct ptdesc *pt = virt_to_ptdesc(p4d);

        BUG_ON(((unsigned long)p4d & (PAGE_SIZE - 1)) != 0);
        pagetable_dtor_free(pt);
}

#ifndef __HAVE_ARCH_P4D_FREE
/*
 * Generic p4d_free(), used unless the architecture defines
 * __HAVE_ARCH_P4D_FREE.  Nothing is freed when mm_p4d_folded() reports the
 * P4D level as folded for @mm; otherwise the table goes to __p4d_free().
 */
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
        if (mm_p4d_folded(mm))
                return;

        __p4d_free(mm, p4d);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 4 */

/*
 * Allocate a PGD of the given allocation @order and run
 * pagetable_pgd_ctor() on it.  %GFP_PGTABLE_KERNEL is used when @mm is
 * &init_mm, %GFP_PGTABLE_USER otherwise; %__GFP_HIGHMEM is always masked
 * off.  Returns the table's address, or NULL if the allocation failed.
 */
static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order)
{
        const gfp_t base = (mm == &init_mm) ? GFP_PGTABLE_KERNEL
                                            : GFP_PGTABLE_USER;
        struct ptdesc *pt = pagetable_alloc_noprof(base & ~__GFP_HIGHMEM, order);

        if (!pt)
                return NULL;

        pagetable_pgd_ctor(pt);

        return ptdesc_address(pt);
}
#define __pgd_alloc(...)        alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__))

/*
 * Free a PGD: BUG if @pgd is not page aligned, then pass its ptdesc to
 * pagetable_dtor_free().  @mm is not used here.
 */
static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        struct ptdesc *pt = virt_to_ptdesc(pgd);

        BUG_ON(((unsigned long)pgd & (PAGE_SIZE - 1)) != 0);
        pagetable_dtor_free(pt);
}

#ifndef __HAVE_ARCH_PGD_FREE
/*
 * Generic pgd_free(), used unless the architecture defines
 * __HAVE_ARCH_PGD_FREE: forwards to __pgd_free().
 */
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        __pgd_free(mm, pgd);
}
#endif

#endif /* CONFIG_MMU */

#endif /* __ASM_GENERIC_PGALLOC_H */





























































































































































































































































































































































































    1 











































































































































































    4 













































































































































































































































































































































































































































    4 




    4 


    4 
    4 
    4 

    4 
    4 
















































































































































































































































































































































































































































































































































































































































































































































































    1 













































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* memcontrol.h - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
#include <linux/shrinker.h>

struct mem_cgroup;
struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;

/*
 * Cgroup-specific page state, on top of universal node page state.
 *
 * Numbering starts at NR_VM_NODE_STAT_ITEMS, so these values continue
 * after the node stat items instead of overlapping them.  MEMCG_NR_STAT
 * is the terminating entry, one past the last real item.
 */
enum memcg_stat_item {
        MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
        MEMCG_SOCK,
        MEMCG_PERCPU_B,
        MEMCG_VMALLOC,
        MEMCG_KMEM,
        MEMCG_ZSWAP_B,
        MEMCG_ZSWAPPED,
        MEMCG_NR_STAT,
};

/*
 * Memory event types.  The values index the memory_events[] and
 * memory_events_local[] arrays in struct mem_cgroup, which are sized by
 * MEMCG_NR_MEMORY_EVENTS (the terminating entry).
 */
enum memcg_memory_event {
        MEMCG_LOW,
        MEMCG_HIGH,
        MEMCG_MAX,
        MEMCG_OOM,
        MEMCG_OOM_KILL,
        MEMCG_OOM_GROUP_KILL,
        MEMCG_SWAP_HIGH,
        MEMCG_SWAP_MAX,
        MEMCG_SWAP_FAIL,
        MEMCG_NR_MEMORY_EVENTS,
};

/*
 * Pairs a node (pgdat) with a generation number.
 * NOTE(review): presumably handed to the memcg hierarchy iterator during
 * reclaim and matched against mem_cgroup_reclaim_iter.generation; the
 * users are not visible in this header chunk, so confirm.
 */
struct mem_cgroup_reclaim_cookie {
        pg_data_t *pgdat;
        int generation;
};

#ifdef CONFIG_MEMCG

#define MEM_CGROUP_ID_SHIFT        16

/*
 * Private memcg ID together with a reference count on it.  Embedded in
 * struct mem_cgroup, where it is described as identifying objects that
 * outlive the cgroup.
 */
struct mem_cgroup_id {
        int id;
        refcount_t ref;
};

struct memcg_vmstats_percpu;
struct memcg1_events_percpu;
struct memcg_vmstats;
struct lruvec_stats_percpu;
struct lruvec_stats;

/*
 * Iteration state embedded in each mem_cgroup_per_node: a memcg position
 * plus a generation counter.
 */
struct mem_cgroup_reclaim_iter {
        /* memcg the iteration currently stands at */
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        atomic_t generation;
};

/*
 * Per-node information in the memory controller.
 *
 * The layout is deliberate: read-only fields come first, frequently
 * updated fields last, and either the memcg-v1 fields or explicit
 * cacheline padding sit in between to avoid false sharing.
 */
struct mem_cgroup_per_node {
        /* Keep the read-only fields at the start */
        struct mem_cgroup        *memcg;                /* Back pointer, we cannot */
                                                /* use container_of           */

        struct lruvec_stats_percpu __percpu        *lruvec_stats_percpu;
        struct lruvec_stats                        *lruvec_stats;
        struct shrinker_info __rcu        *shrinker_info;

#ifdef CONFIG_MEMCG_V1
        /*
         * Memcg-v1-only fields are placed in the middle as a buffer between
         * the read-mostly fields and the frequently updated fields, to avoid
         * false sharing. If the v1 fields are not present, explicit padding
         * is needed instead.
         */

        struct rb_node                tree_node;        /* RB tree node */
        unsigned long                usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        bool                        on_tree;
#else
        CACHELINE_PADDING(_pad1_);
#endif

        /* Fields which get updated often are kept at the end. */
        struct lruvec                lruvec;
        CACHELINE_PADDING(_pad2_);
        /* indexed by zone, then by LRU list */
        unsigned long                lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
        struct mem_cgroup_reclaim_iter        iter;

#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
        /* slab stats for nmi context */
        atomic_t                slab_reclaimable;
        atomic_t                slab_unreclaimable;
#endif
};

/*
 * One threshold entry: a threshold value and the eventfd context paired
 * with it.
 * NOTE(review): presumably the eventfd is signalled when usage crosses
 * the threshold - confirm against the users of this struct.
 */
struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        unsigned long threshold;
};

/*
 * Variable-length array of thresholds; entries[] is a flexible array
 * member whose element count is given by @size.
 */
struct mem_cgroup_threshold_ary {
        /* An array index points to threshold just below or equal to usage. */
        int current_threshold;
        /* Size of entries[] */
        unsigned int size;
        /* Array of thresholds */
        struct mem_cgroup_threshold entries[] __counted_by(size);
};

/* A primary threshold array together with its pre-allocated spare. */
struct mem_cgroup_thresholds {
        /* Primary thresholds array */
        struct mem_cgroup_threshold_ary *primary;
        /*
         * Spare threshold array.
         * This is needed to make mem_cgroup_unregister_event() "never fail".
         * It must be able to store at least primary->size - 1 entries.
         */
        struct mem_cgroup_threshold_ary *spare;
};

/*
 * Remember four most recent foreign writebacks with dirty pages in this
 * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
 * one in a given round, we're likely to catch it later if it keeps
 * foreign-dirtying, so a fairly low count should be enough.
 *
 * See mem_cgroup_track_foreign_dirty_slowpath() for details.
 */
#define MEMCG_CGWB_FRN_CNT        4

/*
 * Record of one foreign writeback; struct mem_cgroup keeps
 * MEMCG_CGWB_FRN_CNT of these in its cgwb_frn[] array.
 */
struct memcg_cgwb_frn {
        u64 bdi_id;                        /* bdi->id of the foreign inode */
        int memcg_id;                        /* memcg->css.id of foreign inode */
        u64 at;                                /* jiffies_64 at the time of dirtying */
        struct wb_completion done;        /* tracks in-flight foreign writebacks */
};

/*
 * Bucket for arbitrarily byte-sized objects charged to a memory
 * cgroup. The bucket can be reparented in one piece when the cgroup
 * is destroyed, without having to round up the individual references
 * of all live memory objects in the wild.
 */
struct obj_cgroup {
        struct percpu_ref refcnt;
        /* owning memcg; read via obj_cgroup_memcg() with READ_ONCE() */
        struct mem_cgroup *memcg;
        atomic_t nr_charged_bytes;
        /* list linkage and RCU head share storage */
        union {
                struct list_head list; /* protected by objcg_lock */
                struct rcu_head rcu;
        };
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * The structure ends in a flexible array of per-node info pointers, so it
 * cannot be embedded in the middle of another structure.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;

        /* Private memcg ID. Used to ID objects that outlive the cgroup */
        struct mem_cgroup_id id;

        /* Accounted resources */
        struct page_counter memory;                /* Both v1 & v2 */

        union {
                struct page_counter swap;        /* v2 only */
                struct page_counter memsw;        /* v1 only */
        };

        /* registered local peak watchers */
        struct list_head memory_peaks;
        struct list_head swap_peaks;
        spinlock_t         peaks_lock;

        /* Range enforcement for interrupt charges */
        struct work_struct high_work;

#ifdef CONFIG_ZSWAP
        unsigned long zswap_max;

        /*
         * Prevent pages from this memcg from being written back from zswap to
         * swap, and from being swapped out on zswap store failures.
         */
        bool zswap_writeback;
#endif

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /*
         * Should the OOM killer kill all tasks belonging to this memcg
         * once it has killed one of them?
         */
        bool oom_group;

        int swappiness;

        /* memory.events and memory.events.local */
        struct cgroup_file events_file;
        struct cgroup_file events_local_file;

        /* handle for "memory.swap.events" */
        struct cgroup_file swap_events_file;

        /* memory.stat */
        struct memcg_vmstats        *vmstats;

        /* memory.events; both arrays are indexed by enum memcg_memory_event */
        atomic_long_t                memory_events[MEMCG_NR_MEMORY_EVENTS];
        atomic_long_t                memory_events_local[MEMCG_NR_MEMORY_EVENTS];

#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
        /* MEMCG_KMEM for nmi context */
        atomic_t                kmem_stat;
#endif
        /*
         * Hint of reclaim pressure for socket memory management. Note
         * that this indicator should NOT be used in legacy cgroup mode
         * where socket memory is accounted/charged separately.
         */
        u64                        socket_pressure;
#if BITS_PER_LONG < 64
        seqlock_t                socket_pressure_seqlock;
#endif
        int kmemcg_id;
        /*
         * memcg->objcg is wiped out as a part of the objcg reparenting
         * process. memcg->orig_objcg preserves a pointer (and a reference)
         * to the original objcg until the end of life of the memcg.
         */
        struct obj_cgroup __rcu        *objcg;
        struct obj_cgroup        *orig_objcg;
        /* list of inherited objcgs, protected by objcg_lock */
        struct list_head objcg_list;

        struct memcg_vmstats_percpu __percpu *vmstats_percpu;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head cgwb_list;
        struct wb_domain cgwb_domain;
        struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* per-memcg mm_struct list */
        struct lru_gen_mm_list mm_list;
#endif

#ifdef CONFIG_MEMCG_V1
        /* Legacy consumer-oriented counters */
        struct page_counter kmem;                /* v1 only */
        struct page_counter tcpmem;                /* v1 only */

        struct memcg1_events_percpu __percpu *events_percpu;

        unsigned long soft_limit;

        /* protected by memcg_oom_lock */
        bool oom_lock;
        int under_oom;

        /* OOM-Killer disable */
        int oom_kill_disable;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /* Legacy tcp memory accounting */
        bool tcpmem_active;
        int tcpmem_pressure;

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;
#endif /* CONFIG_MEMCG_V1 */

        /* per-node info pointers; flexible array, must stay the last member */
        struct mem_cgroup_per_node *nodeinfo[];
};

/*
 * Size of the first charge attempt.
 * TODO: it may be necessary to use bigger numbers on big iron, or to size
 * this dynamically based on the workload.
 */
#define MEMCG_CHARGE_BATCH 64U

extern struct mem_cgroup *root_mem_cgroup;

/*
 * Flag bits carried in the low bits of page->memcg_data, alongside the
 * pointer stored there.
 */
enum page_memcg_data_flags {
        /* page->memcg_data is a pointer to an slabobj_ext vector */
        MEMCG_DATA_OBJEXTS = (1UL << 0),
        /* page has been accounted as a non-slab kernel page */
        MEMCG_DATA_KMEM = (1UL << 1),
        /* the next bit after the last actual flag */
        __NR_MEMCG_DATA_FLAGS  = (1UL << 2),
};

#define __OBJEXTS_ALLOC_FAIL        MEMCG_DATA_OBJEXTS
#define __FIRST_OBJEXT_FLAG        __NR_MEMCG_DATA_FLAGS

#else /* CONFIG_MEMCG */

#define __OBJEXTS_ALLOC_FAIL        (1UL << 0)
#define __FIRST_OBJEXT_FLAG        (1UL << 0)

#endif /* CONFIG_MEMCG */

/*
 * Object-extension flag bits.  The first flag sits at __FIRST_OBJEXT_FLAG:
 * directly after the memcg data flags when CONFIG_MEMCG is enabled, and at
 * bit 0 otherwise.  OBJEXTS_FLAGS_MASK, derived from __NR_OBJEXTS_FLAGS,
 * covers every flag bit below it.
 */
enum objext_flags {
        /*
         * Use bit 0 with zero other bits to signal that slabobj_ext vector
         * failed to allocate. The same bit 0 with valid upper bits means
         * MEMCG_DATA_OBJEXTS.
         */
        OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL,
        /* slabobj_ext vector allocated with kmalloc_nolock() */
        OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG,
        /* the next bit after the last actual flag */
        __NR_OBJEXTS_FLAGS  = (__FIRST_OBJEXT_FLAG << 1),
};

#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)

#ifdef CONFIG_MEMCG

static inline bool folio_memcg_kmem(struct folio *folio);

/*
 * Return the memcg an object cgroup currently points at.
 *
 * Once initialized, objcg->memcg always refers to a valid memcg, but it
 * can be atomically switched over to the parent memcg, which is why it
 * is read with READ_ONCE().
 *
 * Must be called under rcu_read_lock() or with cgroup_mutex held; the
 * caller is responsible for making sure the returned memcg is not
 * released.
 */
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
        struct mem_cgroup *memcg;

        lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex));
        memcg = READ_ONCE(objcg->memcg);

        return memcg;
}

/*
 * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
 * @folio: Pointer to the folio.
 *
 * Strips the flag bits from folio->memcg_data and returns the remainder
 * as a memory cgroup pointer, which may be NULL.  The folio must be known
 * to carry a proper memory cgroup pointer: calling this on slab folios,
 * ex-slab folios or kmem folios is not safe (debug builds assert this).
 */
static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
{
        unsigned long data = folio->memcg_data;
        unsigned long ptr_bits;

        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        VM_BUG_ON_FOLIO(data & MEMCG_DATA_OBJEXTS, folio);
        VM_BUG_ON_FOLIO(data & MEMCG_DATA_KMEM, folio);

        ptr_bits = data & ~OBJEXTS_FLAGS_MASK;

        return (struct mem_cgroup *)ptr_bits;
}

/*
 * __folio_objcg - get the object cgroup associated with a kmem folio.
 * @folio: Pointer to the folio.
 *
 * Strips the flag bits from folio->memcg_data and returns the remainder
 * as an object cgroup pointer, which may be NULL.  The folio must be known
 * to carry a proper object cgroup pointer: calling this on slab folios,
 * ex-slab folios or LRU folios is not safe (debug builds assert this).
 */
static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
{
        unsigned long data = folio->memcg_data;
        unsigned long ptr_bits;

        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        VM_BUG_ON_FOLIO(data & MEMCG_DATA_OBJEXTS, folio);
        VM_BUG_ON_FOLIO(!(data & MEMCG_DATA_KMEM), folio);

        ptr_bits = data & ~OBJEXTS_FLAGS_MASK;

        return (struct obj_cgroup *)ptr_bits;
}

/*
 * folio_memcg - Get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * Returns the memory cgroup associated with @folio, or NULL. The caller must
 * know that the folio has a proper memory cgroup pointer; slab folios and
 * ex-slab folios, for example, must not be passed in.
 *
 * For a non-kmem folio, the folio <-> memcg binding is stable under any of:
 *
 * - the folio lock
 * - LRU isolation
 * - exclusive reference
 *
 * For a kmem folio, hold an rcu read lock so that the memcg associated with
 * the folio cannot be released.
 */
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
        /* Non-kmem folios store the memcg pointer directly. */
        if (!folio_memcg_kmem(folio))
                return __folio_memcg(folio);

        /* Kmem folios store an objcg; go through it to reach the memcg. */
        return obj_cgroup_memcg(__folio_objcg(folio));
}

/*
 * folio_memcg_charged - If a folio is charged to a memory cgroup.
 * @folio: Pointer to the folio.
 *
 * Returns true when folio->memcg_data is non-zero, false otherwise.
 */
static inline bool folio_memcg_charged(struct folio *folio)
{
        return !!folio->memcg_data;
}

/*
 * folio_memcg_check - Get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * Returns the memory cgroup associated with @folio, or NULL. In contrast to
 * folio_memcg(), any folio may be passed: use this when it is unknown
 * whether the folio holds a memory cgroup pointer, an object cgroups vector
 * or an object cgroup. NULL is returned for the object-extensions case.
 *
 * For a non-kmem folio, the folio <-> memcg binding is stable under any of:
 *
 * - the folio lock
 * - LRU isolation
 * - exclusive reference
 *
 * For a kmem folio, hold an rcu read lock so that the memcg associated with
 * the folio cannot be released.
 */
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
        /*
         * folio->memcg_data can change asynchronously for slabs, so take a
         * single READ_ONCE() snapshot and decode only that.
         */
        unsigned long snapshot = READ_ONCE(folio->memcg_data);
        void *ptr = (void *)(snapshot & ~OBJEXTS_FLAGS_MASK);

        if (snapshot & MEMCG_DATA_OBJEXTS)
                return NULL;

        /* With the KMEM flag the pointer is an obj_cgroup, not a memcg. */
        if (snapshot & MEMCG_DATA_KMEM)
                return obj_cgroup_memcg(ptr);

        return ptr;
}

/*
 * page_memcg_check - page flavour of folio_memcg_check().
 *
 * Tail pages yield NULL; any other page is reinterpreted as a folio and
 * handed to folio_memcg_check().
 */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        return PageTail(page) ? NULL : folio_memcg_check((struct folio *)page);
}

/*
 * get_mem_cgroup_from_objcg - look up objcg's memcg and take a css reference.
 *
 * Under the RCU read lock, re-reads the objcg's memcg until css_tryget()
 * on it succeeds, then returns that memcg with the reference held.
 */
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
        struct mem_cgroup *memcg;

        rcu_read_lock();
        do {
                /* Re-read on every attempt: the objcg may have been repointed. */
                memcg = obj_cgroup_memcg(objcg);
        } while (unlikely(!css_tryget(&memcg->css)));
        rcu_read_unlock();

        return memcg;
}

/*
 * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
 * @folio: Pointer to the folio.
 *
 * Tests the MemcgKmem flag of @folio. The caller must make sure that the
 * folio has an associated memory cgroup; some folio types, e.g. slab folios,
 * must not be passed to this function.
 */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
        VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);

        return (folio->memcg_data & MEMCG_DATA_KMEM) != 0;
}

/* Page flavour of folio_memcg_kmem(): tests the folio that @page belongs to. */
static inline bool PageMemcgKmem(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_memcg_kmem(folio);
}

/* True when @memcg is the root memory cgroup (pointer comparison). */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return memcg == root_mem_cgroup;
}

/* True when the memory cgroup subsystem is not enabled. */
static inline bool mem_cgroup_disabled(void)
{
        if (cgroup_subsys_enabled(memory_cgrp_subsys))
                return false;

        return true;
}

/*
 * mem_cgroup_protection - report the effective min/low protection of a memcg.
 * @root: memcg that is the target of the reclaim
 * @memcg: memcg whose protection is queried
 * @min: output, effective memory.min (0 if unprotected)
 * @low: output, effective memory.low (0 if unprotected)
 *
 * Both outputs are zero when the memory controller is disabled or when
 * @memcg is the reclaim target itself.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;

        /*
         * There is no reclaim protection applied to a targeted reclaim.
         * We are special casing this specific case here because
         * mem_cgroup_calculate_protection is not robust enough to keep
         * the protection invariant for calculated effective values for
         * parallel reclaimers with different reclaim target. This is
         * especially a problem for tail memcgs (as they have pages on LRU)
         * which would want to have effective values 0 for targeted reclaim
         * but a different value for external reclaim.
         *
         * Example
         * Let's have global and A's reclaim in parallel:
         *  |
         *  A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
         *  |\
         *  | C (low = 1G, usage = 2.5G)
         *  B (low = 1G, usage = 0.5G)
         *
         * For the global reclaim
         * A.elow = A.low
         * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
         * C.elow = min(C.usage, C.low)
         *
         * With the effective values resetting we have A reclaim
         * A.elow = 0
         * B.elow = B.low
         * C.elow = C.low
         *
         * If the global reclaim races with A's reclaim then
         * B.elow = C.elow = 0 (because children_low_usage > A.elow)
         * is possible and reclaiming B would be violating the protection.
         */
        if (mem_cgroup_disabled() || root == memcg)
                return;

        *min = READ_ONCE(memcg->memory.emin);
        *low = READ_ONCE(memcg->memory.elow);
}

void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                     struct mem_cgroup *memcg);

/*
 * mem_cgroup_unprotected - true if @memcg gets no reclaim protection.
 *
 * The root memcg doesn't account charges, and doesn't support
 * protection. The target memcg's protection is ignored, see
 * mem_cgroup_calculate_protection() and mem_cgroup_protection()
 */
static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
                                          struct mem_cgroup *memcg)
{
        if (mem_cgroup_disabled())
                return true;

        if (mem_cgroup_is_root(memcg))
                return true;

        return memcg == target;
}

/*
 * True when @memcg is protected (see mem_cgroup_unprotected()) and its
 * effective low value is at least its current memory page counter reading.
 */
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return !mem_cgroup_unprotected(target, memcg) &&
               READ_ONCE(memcg->memory.elow) >=
                        page_counter_read(&memcg->memory);
}

/*
 * True when @memcg is protected (see mem_cgroup_unprotected()) and its
 * effective min value is at least its current memory page counter reading.
 */
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return !mem_cgroup_unprotected(target, memcg) &&
               READ_ONCE(memcg->memory.emin) >=
                        page_counter_read(&memcg->memory);
}

int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);

/**
 * mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
 * @folio: Folio to charge.
 * @mm: mm context of the allocating task.
 * @gfp: Reclaim mode.
 *
 * Try to charge @folio to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp if necessary.  If @mm is NULL, try to
 * charge to the active memcg.
 *
 * Nothing is charged, and 0 is returned, when the memory controller is
 * disabled.
 *
 * Do not use this for folios allocated for swapin.
 *
 * Return: 0 on success. Otherwise, an error code is returned.
 */
static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
                                    gfp_t gfp)
{
        return mem_cgroup_disabled() ? 0 : __mem_cgroup_charge(folio, mm, gfp);
}

int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp);

int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry);

void __mem_cgroup_uncharge(struct folio *folio);

/**
 * mem_cgroup_uncharge - Uncharge a folio.
 * @folio: Folio to uncharge.
 *
 * Uncharge a folio previously charged with mem_cgroup_charge().
 * Does nothing when the memory controller is disabled.
 */
static inline void mem_cgroup_uncharge(struct folio *folio)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge(folio);
}

void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
/*
 * Batch counterpart of mem_cgroup_uncharge(): forwards @folios to
 * __mem_cgroup_uncharge_folios() unless the memory controller is disabled.
 */
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge_folios(folios);
}

void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
void mem_cgroup_migrate(struct folio *old, struct folio *new);

/**
 * mem_cgroup_lruvec - get the lru list vector for a memcg & node
 * @memcg: memcg of the wanted lruvec
 * @pgdat: pglist_data
 *
 * Returns the lru list vector holding pages for a given @memcg &
 * @pgdat combination. This can be the node lruvec, if the memory
 * controller is disabled. A NULL @memcg selects root_mem_cgroup.
 */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        struct lruvec *lruvec;

        if (mem_cgroup_disabled()) {
                lruvec = &pgdat->__lruvec;
        } else {
                struct mem_cgroup_per_node *mz;

                if (!memcg)
                        memcg = root_mem_cgroup;

                mz = memcg->nodeinfo[pgdat->node_id];
                lruvec = &mz->lruvec;
        }

        /*
         * Since a node can be onlined after the mem_cgroup was created,
         * we have to be prepared to initialize lruvec->pgdat here;
         * and if offlined then reonlined, we need to reinitialize it.
         */
        if (unlikely(lruvec->pgdat != pgdat))
                lruvec->pgdat = pgdat;

        return lruvec;
}

/**
 * folio_lruvec - return lruvec for isolating/putting an LRU folio
 * @folio: Pointer to the folio.
 *
 * This function relies on folio->mem_cgroup being stable. A debug warning
 * fires if the folio has no memcg while the memory controller is enabled.
 */
static inline struct lruvec *folio_lruvec(struct folio *folio)
{
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat;

        VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio);

        pgdat = folio_pgdat(folio);
        return mem_cgroup_lruvec(memcg, pgdat);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);

struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);

struct mem_cgroup *get_mem_cgroup_from_current(void);

struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio);

struct lruvec *folio_lruvec_lock(struct folio *folio);
struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                                                unsigned long *flags);

#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
#else
/* Without CONFIG_DEBUG_VM the lruvec/memcg debug check is a no-op. */
static inline
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
}
#endif

/*
 * Map a cgroup_subsys_state to the mem_cgroup that embeds it.
 * A NULL @css yields NULL.
 */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct mem_cgroup, css);
}

/*
 * Try to take a reference on @objcg; returns whatever percpu_ref_tryget()
 * reports for objcg->refcnt.
 */
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
{
        return percpu_ref_tryget(&objcg->refcnt);
}

/* Take one reference on @objcg (percpu_ref_get() on objcg->refcnt). */
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
        percpu_ref_get(&objcg->refcnt);
}

/* Take @nr references on @objcg in one call (percpu_ref_get_many()). */
static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
                                       unsigned long nr)
{
        percpu_ref_get_many(&objcg->refcnt, nr);
}

/* Drop one reference on @objcg. A NULL @objcg is accepted and ignored. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
        if (!objcg)
                return;

        percpu_ref_put(&objcg->refcnt);
}

/*
 * Try to take a css reference on @memcg. A NULL @memcg counts as success;
 * otherwise the result of css_tryget() is returned.
 */
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
        if (!memcg)
                return true;

        return css_tryget(&memcg->css);
}

/*
 * Like mem_cgroup_tryget() but uses css_tryget_online(). A NULL @memcg
 * counts as success.
 */
static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
{
        if (!memcg)
                return true;

        return css_tryget_online(&memcg->css);
}

/* Drop a css reference on @memcg. A NULL @memcg is accepted and ignored. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
        if (!memcg)
                return;

        css_put(&memcg->css);
}

/*
 * Given a pointer @counter to the field named @member inside a
 * struct mem_cgroup, return a pointer to that enclosing mem_cgroup.
 */
#define mem_cgroup_from_counter(counter, member)        \
        container_of(counter, struct mem_cgroup, member)

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                           int (*)(struct task_struct *, void *), void *arg);

/* Return the id of @memcg, or 0 when the memory controller is disabled. */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() ? 0 : memcg->id.id;
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);

#ifdef CONFIG_SHRINKER_DEBUG
/* Inode number of the cgroup behind @memcg, or 0 for a NULL @memcg. */
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
{
        if (!memcg)
                return 0;

        return cgroup_ino(memcg->css.cgroup);
}

struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
#endif

/* Resolve the mem_cgroup behind a seq_file via its css (seq_css()). */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        struct cgroup_subsys_state *css = seq_css(m);

        return mem_cgroup_from_css(css);
}

/*
 * Return the memcg owning @lruvec, found through the mem_cgroup_per_node
 * that embeds the lruvec. Returns NULL when the memory controller is
 * disabled.
 */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        if (mem_cgroup_disabled())
                return NULL;

        return container_of(lruvec, struct mem_cgroup_per_node, lruvec)->memcg;
}

/**
 * parent_mem_cgroup - find the accounting parent of a memcg
 * @memcg: memcg whose parent to find
 *
 * Returns the parent memcg, or NULL if this is the root.
 */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        struct cgroup_subsys_state *parent_css = memcg->css.parent;

        return mem_cgroup_from_css(parent_css);
}

/*
 * True when @memcg is @root itself or when @memcg's cgroup is a descendant
 * of @root's cgroup.
 */
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
                              struct mem_cgroup *root)
{
        return root == memcg ||
               cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}

/*
 * True when the memcg of @mm's owner task is @memcg or one of its
 * descendants. The lookup runs under the RCU read lock; false is returned
 * when the owner has no memcg.
 */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                                   struct mem_cgroup *memcg)
{
        struct mem_cgroup *owner_memcg;
        bool matched;

        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        matched = owner_memcg && mem_cgroup_is_descendant(owner_memcg, memcg);
        rcu_read_unlock();

        return matched;
}

struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
ino_t page_cgroup_ino(struct page *page);

/*
 * True when the memory controller is disabled, or when @memcg's css has
 * CSS_ONLINE set.
 */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() || (memcg->css.flags & CSS_ONLINE);
}

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                int zid, int nr_pages);

/*
 * Read the recorded size of LRU list @lru in zone @zone_idx from the
 * mem_cgroup_per_node that embeds @lruvec.
 */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        struct mem_cgroup_per_node *pn =
                container_of(lruvec, struct mem_cgroup_per_node, lruvec);

        return READ_ONCE(pn->lru_zone_size[zone_idx][lru]);
}

void __mem_cgroup_handle_over_high(gfp_t gfp_mask);

/*
 * Fast-path wrapper: only enter __mem_cgroup_handle_over_high() when the
 * current task has a non-zero memcg_nr_pages_over_high.
 */
static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
        if (likely(!current->memcg_nr_pages_over_high))
                return;

        __mem_cgroup_handle_over_high(gfp_mask);
}

unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);

unsigned long mem_cgroup_size(struct mem_cgroup *memcg);

void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
                                struct task_struct *p);

void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);

struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
                                            struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);

/* idx can be of type enum memcg_stat_item or node_stat_item */
void mod_memcg_state(struct mem_cgroup *memcg,
                     enum memcg_stat_item idx, int val);

/*
 * Add @val to stat item @idx of the memcg that @page's folio belongs to.
 * Does nothing when the memory controller is disabled or the folio has no
 * memcg. The memcg lookup and update run under the RCU read lock.
 */
static inline void mod_memcg_page_state(struct page *page,
                                        enum memcg_stat_item idx, int val)
{
        struct mem_cgroup *owner;

        if (mem_cgroup_disabled())
                return;

        rcu_read_lock();
        owner = folio_memcg(page_folio(page));
        if (owner)
                mod_memcg_state(owner, idx, val);
        rcu_read_unlock();
}

unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                      enum node_stat_item idx);

void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);

void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);

/*
 * Variant of __mod_lruvec_kmem_state() that disables local interrupts
 * around the update and restores them afterwards.
 */
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                         int val)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __mod_lruvec_kmem_state(p, idx, val);
        local_irq_restore(irq_flags);
}

void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                        unsigned long count);

/*
 * Account @nr events of type @idx to the memcg of @folio. Folios without
 * a memcg are ignored.
 */
static inline void count_memcg_folio_events(struct folio *folio,
                enum vm_event_item idx, unsigned long nr)
{
        struct mem_cgroup *memcg = folio_memcg(folio);

        if (!memcg)
                return;

        count_memcg_events(memcg, idx, nr);
}

/*
 * Account @count events of type @idx to the memcg of @mm's owner task.
 * Does nothing when the memory controller is disabled or the owner has no
 * memcg. The lookup and update run under the RCU read lock.
 */
static inline void count_memcg_events_mm(struct mm_struct *mm,
                                        enum vm_event_item idx, unsigned long count)
{
        struct task_struct *owner;
        struct mem_cgroup *memcg;

        if (mem_cgroup_disabled())
                return;

        rcu_read_lock();
        owner = rcu_dereference(mm->owner);
        memcg = mem_cgroup_from_task(owner);
        if (likely(memcg))
                count_memcg_events(memcg, idx, count);
        rcu_read_unlock();
}

/* Single-event convenience wrapper: count_memcg_events_mm() with a count of 1. */
static inline void count_memcg_event_mm(struct mm_struct *mm,
                                        enum vm_event_item idx)
{
        count_memcg_events_mm(mm, idx, 1);
}

/*
 * __memcg_memory_event - record a memory event on @memcg and its ancestors.
 * @memcg: memcg the event happened in
 * @event: which event to record
 * @allow_spinning: when false, the counters are still bumped but no
 *                  cgroup_file_notify() call is made
 *
 * The local counter of @memcg is incremented once. The hierarchical
 * counter is then incremented on @memcg and on each ancestor up to, but
 * excluding, the root. The walk stops after the first level when the
 * memory controller is not on the default hierarchy or when
 * CGRP_ROOT_MEMORY_LOCAL_EVENTS is set on the default root.
 */
static inline void __memcg_memory_event(struct mem_cgroup *memcg,
                                        enum memcg_memory_event event,
                                        bool allow_spinning)
{
        bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
                          event == MEMCG_SWAP_FAIL;

        /* For now only MEMCG_MAX can happen with !allow_spinning context. */
        VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX);

        atomic_long_inc(&memcg->memory_events_local[event]);
        if (!swap_event && allow_spinning)
                cgroup_file_notify(&memcg->events_local_file);

        for (;;) {
                atomic_long_inc(&memcg->memory_events[event]);
                if (allow_spinning) {
                        if (swap_event)
                                cgroup_file_notify(&memcg->swap_events_file);
                        else
                                cgroup_file_notify(&memcg->events_file);
                }

                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                        break;
                if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                        break;

                /* Step to the parent; stop at the top or at the root memcg. */
                memcg = parent_mem_cgroup(memcg);
                if (!memcg || mem_cgroup_is_root(memcg))
                        break;
        }
}

/* Record @event on @memcg with allow_spinning set to true. */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
        __memcg_memory_event(memcg, event, true);
}

/*
 * Record memory event @event on the memcg of @mm's owner task. Does nothing
 * when the memory controller is disabled or the owner has no memcg. The
 * lookup and the event recording run under the RCU read lock.
 */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
        struct task_struct *owner;
        struct mem_cgroup *memcg;

        if (mem_cgroup_disabled())
                return;

        rcu_read_lock();
        owner = rcu_dereference(mm->owner);
        memcg = mem_cgroup_from_task(owner);
        if (likely(memcg))
                memcg_memory_event(memcg, event);
        rcu_read_unlock();
}

void split_page_memcg(struct page *first, unsigned order);
void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
                unsigned new_order);

/*
 * Return the cgroup id of the memcg that @mm's owner task belongs to,
 * using root_mem_cgroup when the owner has no memcg. Returns 0 when the
 * memory controller is disabled.
 */
static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
{
        struct task_struct *owner;
        struct mem_cgroup *memcg;
        u64 id;

        if (mem_cgroup_disabled())
                return 0;

        rcu_read_lock();
        owner = rcu_dereference(mm->owner);
        memcg = mem_cgroup_from_task(owner);
        if (!memcg)
                memcg = root_mem_cgroup;
        id = cgroup_id(memcg->css.cgroup);
        rcu_read_unlock();

        return id;
}

extern int mem_cgroup_init(void);
#else /* CONFIG_MEMCG */

/* !CONFIG_MEMCG values: the id shift is 0 ... */
#define MEM_CGROUP_ID_SHIFT        0

/* ... and root_mem_cgroup expands to NULL. */
#define root_mem_cgroup                (NULL)

/* !CONFIG_MEMCG stub: a folio never has a memcg, so always NULL. */
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always reports the folio as not charged. */
static inline bool folio_memcg_charged(struct folio *folio)
{
        return false;
}

/* !CONFIG_MEMCG stub: always NULL, for any folio. */
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always NULL, for any page. */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: returns NULL and takes no reference. */
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no folio is ever reported as memcg-kmem. */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        return false;
}

/* !CONFIG_MEMCG stub: no page is ever reported as memcg-kmem. */
static inline bool PageMemcgKmem(struct page *page)
{
        return false;
}

/* !CONFIG_MEMCG stub: every argument is reported as the root memcg. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: the memory controller is always reported as disabled. */
static inline bool mem_cgroup_disabled(void)
{
        return true;
}

/* !CONFIG_MEMCG stub: memory events are not recorded. */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
}

/* !CONFIG_MEMCG stub: memory events are not recorded. */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
}

/*
 * !CONFIG_MEMCG stub: there is no reclaim protection, so both outputs are
 * set to zero. @root and @memcg are not looked at.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;
}

/* !CONFIG_MEMCG stub: nothing to calculate. */
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                                   struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: every memcg is reported as unprotected. */
static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
                                          struct mem_cgroup *memcg)
{
        return true;
}
/* !CONFIG_MEMCG stub: never below the low protection. */
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: never below the min protection. */
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_charge(struct folio *folio,
                struct mm_struct *mm, gfp_t gfp)
{
        return 0;
}

/* !CONFIG_MEMCG stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
{
        return 0;
}

/* !CONFIG_MEMCG stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
                        struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_MEMCG stub: nothing to uncharge. */
static inline void mem_cgroup_uncharge(struct folio *folio)
{
}

/* !CONFIG_MEMCG stub: nothing to uncharge. */
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_replace_folio(struct folio *old,
                struct folio *new)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
}

/* !CONFIG_MEMCG stub: @memcg is ignored; always the node's own lruvec. */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        return &pgdat->__lruvec;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
}

/* !CONFIG_MEMCG stub: there is no hierarchy; always NULL. */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: every mm matches every memcg. */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: always NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always NULL, whatever @css is. */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no reference counting is done. */
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
}

/* !CONFIG_MEMCG stub: no reference counting is done. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}

/* !CONFIG_MEMCG stub: always reports success. */
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: always reports success. */
static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: no reference to drop. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock_irq(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                unsigned long *flagsp)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
        return &pgdat->__lruvec;
}

/* !CONFIG_MEMCG stub: there is nothing to iterate over; always NULL. */
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
                struct mem_cgroup *prev,
                struct mem_cgroup_reclaim_cookie *reclaim)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
                                         struct mem_cgroup *prev)
{
}

/* !CONFIG_MEMCG stub: no tasks are scanned and @fn is never called. */
static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                int (*fn)(struct task_struct *, void *), void *arg)
{
}

/* !CONFIG_MEMCG stub: the id is always 0. */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return 0;
}

/*
 * !CONFIG_MEMCG stub: always NULL. Warns once if a non-zero @id is passed,
 * since mem_cgroup_id() only ever returns 0 in this configuration.
 */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
        WARN_ON_ONCE(id);
        /* XXX: This should always return root_mem_cgroup */
        return NULL;
}

#ifdef CONFIG_SHRINKER_DEBUG
/* !CONFIG_MEMCG stub: the inode number is always 0. */
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always NULL. */
static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
{
        return NULL;
}
#endif

/* !CONFIG_MEMCG stub: always NULL. */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: an lruvec has no owning memcg; always NULL. */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always reported as online. */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: always reports a size of 0. */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always returns 0. */
static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always returns 0. */
static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void
mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
}

/* !CONFIG_MEMCG stub: there is no OOM group; always NULL. */
static inline struct mem_cgroup *mem_cgroup_get_oom_group(
        struct task_struct *victim, struct mem_cgroup *oom_domain)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: no per-memcg statistics are kept. */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   enum memcg_stat_item idx,
                                   int nr)
{
}

/* !CONFIG_MEMCG stub: no per-memcg statistics are kept. */
static inline void mod_memcg_page_state(struct page *page,
                                        enum memcg_stat_item idx, int val)
{
}

/* !CONFIG_MEMCG stub: always reads as 0. */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
        return 0;
}

/*
 * !CONFIG_MEMCG variant: report the node-wide value of @idx for the node
 * that @lruvec belongs to.
 */
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        return node_page_state(pgdat, idx);
}

/*
 * !CONFIG_MEMCG variant: identical to lruvec_page_state() here; reports
 * the node-wide value of @idx for the node that @lruvec belongs to.
 */
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                                    enum node_stat_item idx)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        return node_page_state(pgdat, idx);
}

/* !CONFIG_MEMCG stub: nothing to flush. */
static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: nothing to flush. */
static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}

/*
 * !CONFIG_MEMCG variant: apply @val to node stat @idx of the node holding
 * the head page of the object at @p, via __mod_node_page_state().
 */
static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                           int val)
{
        __mod_node_page_state(page_pgdat(virt_to_head_page(p)), idx, val);
}

/*
 * !CONFIG_MEMCG variant: apply @val to node stat @idx of the node holding
 * the head page of the object at @p, via mod_node_page_state().
 */
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                         int val)
{
        mod_node_page_state(page_pgdat(virt_to_head_page(p)), idx, val);
}

/* !CONFIG_MEMCG stub: no per-memcg events are counted. */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                        enum vm_event_item idx,
                                        unsigned long count)
{
}

/* !CONFIG_MEMCG stub: no per-memcg events are counted. */
static inline void count_memcg_folio_events(struct folio *folio,
                enum vm_event_item idx, unsigned long nr)
{
}

/* !CONFIG_MEMCG stub: no per-memcg events are counted. */
static inline void count_memcg_events_mm(struct mm_struct *mm,
                                        enum vm_event_item idx, unsigned long count)
{
}

/* !CONFIG_MEMCG stub: no per-memcg events are counted. */
static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void split_page_memcg(struct page *first, unsigned order)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void folio_split_memcg_refs(struct folio *folio,
                unsigned old_order, unsigned new_order)
{
}

/* !CONFIG_MEMCG stub: the cgroup id is always 0. */
static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_MEMCG stub: nothing to initialize; always returns 0. */
static inline int mem_cgroup_init(void)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

/*
 * Extended information for slab objects stored as an array in page->memcg_data
 * if MEMCG_DATA_OBJEXTS is set.
 *
 * Each member exists only when its config option is enabled; the struct is
 * aligned to 8 bytes.
 */
struct slabobj_ext {
#ifdef CONFIG_MEMCG
        /* Object cgroup pointer for this slab object. */
        struct obj_cgroup *objcg;
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        /* Code tag reference for this slab object. */
        union codetag_ref ref;
#endif
} __aligned(8);

/* Add 1 to stat @idx for the object at @p via __mod_lruvec_kmem_state(). */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_kmem_state(p, idx, 1);
}

/* Subtract 1 from stat @idx for the object at @p via __mod_lruvec_kmem_state(). */
static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_kmem_state(p, idx, -1);
}

/*
 * Return the lruvec of the parent memcg on the same node as @lruvec.
 * Returns NULL when @lruvec has no memcg or that memcg has no parent.
 */
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        struct mem_cgroup *parent;

        if (!memcg)
                return NULL;

        parent = parent_mem_cgroup(memcg);
        if (!parent)
                return NULL;

        return mem_cgroup_lruvec(parent, lruvec_pgdat(lruvec));
}

/* Release @lruvec's lru_lock with spin_unlock(). */
static inline void unlock_page_lruvec(struct lruvec *lruvec)
{
        spin_unlock(&lruvec->lru_lock);
}

/* Release @lruvec's lru_lock with spin_unlock_irq(). */
static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
{
        spin_unlock_irq(&lruvec->lru_lock);
}

/* Release @lruvec's lru_lock with spin_unlock_irqrestore(), restoring @flags. */
static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
                unsigned long flags)
{
        spin_unlock_irqrestore(&lruvec->lru_lock, flags);
}

/*
 * True when @lruvec is on the same node as @folio and belongs to the same
 * memcg as @folio.
 *
 * Test requires a stable folio->memcg binding, see folio_memcg()
 */
static inline bool folio_matches_lruvec(struct folio *folio,
                struct lruvec *lruvec)
{
        if (lruvec_pgdat(lruvec) != folio_pgdat(folio))
                return false;

        return lruvec_memcg(lruvec) == folio_memcg(folio);
}

/*
 * Return a locked lruvec for @folio, reusing @locked_lruvec when it already
 * matches the folio (no relocking in that case).
 *
 * A non-matching @locked_lruvec is released with unlock_page_lruvec_irq()
 * before the folio's lruvec is taken with folio_lruvec_lock_irq().
 * @locked_lruvec may be NULL.
 */
static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
                struct lruvec *locked_lruvec)
{
        if (!locked_lruvec)
                return folio_lruvec_lock_irq(folio);

        if (folio_matches_lruvec(folio, locked_lruvec))
                return locked_lruvec;

        unlock_page_lruvec_irq(locked_lruvec);
        return folio_lruvec_lock_irq(folio);
}

/*
 * irqsave flavour of the relock helper: on return *@lruvecp is a locked
 * lruvec for @folio.
 *
 * If *@lruvecp already matches the folio nothing is done. Otherwise a held
 * lruvec is released with the saved *@flags, and the folio's lruvec is
 * locked with folio_lruvec_lock_irqsave(), which receives @flags.
 */
static inline void folio_lruvec_relock_irqsave(struct folio *folio,
                struct lruvec **lruvecp, unsigned long *flags)
{
        struct lruvec *held = *lruvecp;

        if (held) {
                if (folio_matches_lruvec(folio, held))
                        return;

                unlock_page_lruvec_irqrestore(held, *flags);
        }

        *lruvecp = folio_lruvec_lock_irqsave(folio, flags);
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Prototypes for the CONFIG_CGROUP_WRITEBACK case; the definitions are not in
 * this header. mem_cgroup_wb_stats() takes four unsigned long pointers
 * (presumably output parameters -- confirm against the definition).
 */
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback);

/* Out-of-line path invoked by mem_cgroup_track_foreign_dirty(). */
void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb);

/*
 * Inline fast path: hand @folio to the slowpath only when it has a memcg
 * whose css differs from @wb->memcg_css.
 *
 * Does nothing when mem_cgroup_disabled() is true or the folio has no memcg.
 */
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
{
        struct mem_cgroup *memcg;

        if (mem_cgroup_disabled())
                return;

        memcg = folio_memcg(folio);
        if (!memcg)
                return;

        /* A mismatching css is the unexpected case. */
        if (unlikely(&memcg->css != wb->memcg_css))
                mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
}

/* Prototype only (CONFIG_CGROUP_WRITEBACK); defined outside this header. */
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: always returns NULL. */
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
{
        return NULL;
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: does nothing; none of the pointer arguments
 * is written.
 */
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
                                       unsigned long *pfilepages,
                                       unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: no-op. */
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: no-op. */
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

struct sock;
#ifdef CONFIG_MEMCG
/*
 * CONFIG_MEMCG socket interface.
 *
 * mem_cgroup_sockets_enabled expands to a static_branch_unlikely() test of
 * memcg_sockets_enabled_key. The functions below are prototypes only; their
 * definitions are not in this header.
 */
extern struct static_key_false memcg_sockets_enabled_key;
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)

void mem_cgroup_sk_alloc(struct sock *sk);
void mem_cgroup_sk_free(struct sock *sk);
void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk);
bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
                          gfp_t gfp_mask);
void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages);

#if BITS_PER_LONG < 64
/*
 * BITS_PER_LONG < 64 variant: store get_jiffies_64() + HZ into
 * @memcg->socket_pressure.
 *
 * The value is computed before the lock is taken; the store itself happens
 * inside the write side of socket_pressure_seqlock with IRQ state saved and
 * restored. NOTE(review): presumably needed because a u64 store is not a
 * single access on these targets -- confirm.
 */
static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg)
{
        u64 val = get_jiffies_64() + HZ;
        unsigned long flags;

        write_seqlock_irqsave(&memcg->socket_pressure_seqlock, flags);
        memcg->socket_pressure = val;
        write_sequnlock_irqrestore(&memcg->socket_pressure_seqlock, flags);
}

/*
 * BITS_PER_LONG < 64 variant: read @memcg->socket_pressure under the read
 * side of socket_pressure_seqlock, repeating the read until
 * read_seqretry() reports that no retry is needed.
 */
static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqbegin(&memcg->socket_pressure_seqlock);
                val = memcg->socket_pressure;
        } while (read_seqretry(&memcg->socket_pressure_seqlock, seq));

        return val;
}
#else
/*
 * BITS_PER_LONG >= 64 variant: store jiffies + HZ into
 * @memcg->socket_pressure with WRITE_ONCE(); no lock is taken.
 */
static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg)
{
        WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
}

/*
 * BITS_PER_LONG >= 64 variant: return @memcg->socket_pressure read with
 * READ_ONCE(); no lock is taken.
 */
static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg)
{
        return READ_ONCE(memcg->socket_pressure);
}
#endif

/* CONFIG_MEMCG shrinker helpers: prototypes only, defined outside this header. */
int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
#else
/* !CONFIG_MEMCG: the test is the constant 0. */
#define mem_cgroup_sockets_enabled 0

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_sk_alloc(struct sock *sk)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_sk_free(struct sock *sk)
{
}

/* !CONFIG_MEMCG stub: no-op; @newsk is not modified. */
static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk)
{
}

/* !CONFIG_MEMCG stub: ignores its arguments and always returns false. */
static inline bool mem_cgroup_sk_charge(const struct sock *sk,
                                        unsigned int nr_pages,
                                        gfp_t gfp_mask)
{
        return false;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_sk_uncharge(const struct sock *sk,
                                          unsigned int nr_pages)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void set_shrinker_bit(struct mem_cgroup *memcg,
                                    int nid, int shrinker_id)
{
}
#endif

#ifdef CONFIG_MEMCG
/*
 * CONFIG_MEMCG kmem prototypes, defined outside this header. The
 * double-underscore page helpers are what memcg_kmem_charge_page() and
 * memcg_kmem_uncharge_page() call once memcg_kmem_online() is true.
 */
bool mem_cgroup_kmem_disabled(void);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);

/*
 * The returned objcg pointer is safe to use without additional
 * protection within a scope. The scope is defined either by
 * the current task (similar to the "current" global variable)
 * or by a set_active_memcg() pair.
 * Use obj_cgroup_get() to take a reference if the pointer
 * needs to be used outside of the local scope.
 */
struct obj_cgroup *current_obj_cgroup(void);
/* Prototype only; defined outside this header. */
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);

/*
 * Like current_obj_cgroup(), but calls obj_cgroup_get() on a non-NULL
 * result before returning it.
 *
 * Returns NULL when current_obj_cgroup() returns NULL.
 */
static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
        struct obj_cgroup *cur = current_obj_cgroup();

        if (!cur)
                return NULL;

        obj_cgroup_get(cur);
        return cur;
}

/* Prototypes only; defined outside this header. @size is a size_t. */
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

extern struct static_key_false memcg_bpf_enabled_key;
/* Result of static_branch_likely() on memcg_bpf_enabled_key. */
static inline bool memcg_bpf_enabled(void)
{
        return static_branch_likely(&memcg_bpf_enabled_key);
}

extern struct static_key_false memcg_kmem_online_key;

/*
 * Result of static_branch_likely() on memcg_kmem_online_key. The inline kmem
 * helpers in this header test this before calling their out-of-line parts.
 */
static inline bool memcg_kmem_online(void)
{
        return static_branch_likely(&memcg_kmem_online_key);
}

/*
 * Inline gate for __memcg_kmem_charge_page().
 *
 * Returns 0 without doing anything when memcg_kmem_online() is false,
 * otherwise whatever __memcg_kmem_charge_page() returns.
 */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        if (!memcg_kmem_online())
                return 0;

        return __memcg_kmem_charge_page(page, gfp, order);
}

/*
 * Inline gate for __memcg_kmem_uncharge_page(): a no-op unless
 * memcg_kmem_online() is true.
 */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
        if (!memcg_kmem_online())
                return;

        __memcg_kmem_uncharge_page(page, order);
}

/*
 * Accessor for a memcg's kmemcg_id, which is used to find the
 * corresponding LRU lists. A NULL @memcg yields -1.
 */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
        if (!memcg)
                return -1;

        return memcg->kmemcg_id;
}

/* Prototype only; defined outside this header. */
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);

/*
 * Pass @idx and @count to count_memcg_events() for the memcg that
 * obj_cgroup_memcg() returns for @objcg.
 *
 * Nothing happens unless memcg_kmem_online() is true. Both the lookup and
 * the counting run inside one rcu_read_lock() section.
 */
static inline void count_objcg_events(struct obj_cgroup *objcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
        struct mem_cgroup *owner;

        if (memcg_kmem_online()) {
                rcu_read_lock();
                owner = obj_cgroup_memcg(objcg);
                count_memcg_events(owner, idx, count);
                rcu_read_unlock();
        }
}

/* Prototype only; defined outside this header. */
bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);

#else
/* !CONFIG_MEMCG stub: always returns true. */
static inline bool mem_cgroup_kmem_disabled(void)
{
        return true;
}

/* !CONFIG_MEMCG stub: does nothing and always returns 0. */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/* !CONFIG_MEMCG stub: does nothing and always returns 0. */
static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                           int order)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool memcg_bpf_enabled(void)
{
        return false;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool memcg_kmem_online(void)
{
        return false;
}

/* !CONFIG_MEMCG stub: always returns -1; @memcg is not dereferenced. */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
        return -1;
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void count_objcg_events(struct obj_cgroup *objcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
}

/* !CONFIG_MEMCG stub: always returns 0. */
static inline ino_t page_cgroup_ino(struct page *page)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always returns true, i.e. no node is refused here. */
static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
{
        return true;
}
#endif /* CONFIG_MEMCG */

#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
/*
 * Declared only when both CONFIG_MEMCG and CONFIG_ZSWAP are defined;
 * the definitions are outside this header.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
#else
/* Stub used unless CONFIG_MEMCG and CONFIG_ZSWAP are both set: always true. */
static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
        return true;
}
/* Stub used unless CONFIG_MEMCG and CONFIG_ZSWAP are both set: no-op. */
static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg,
                                           size_t size)
{
}
/* Stub used unless CONFIG_MEMCG and CONFIG_ZSWAP are both set: no-op. */
static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
                                             size_t size)
{
}
/* Stub used unless CONFIG_MEMCG and CONFIG_ZSWAP are both set: always true. */
static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
        /* if zswap is disabled, do not block pages going to the swapping device */
        return true;
}
#endif


/* Cgroup v1-related declarations */

#ifdef CONFIG_MEMCG_V1
/* CONFIG_MEMCG_V1 prototypes; the definitions are outside this header. */
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                        gfp_t gfp_mask,
                                        unsigned long *total_scanned);

bool mem_cgroup_oom_synchronize(bool wait);

/* Returns @p->memcg_in_oom converted to bool. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return p->memcg_in_oom;
}

/*
 * Set current->in_user_fault to 1. Triggers WARN_ON() if the flag was
 * already set on entry.
 */
static inline void mem_cgroup_enter_user_fault(void)
{
        WARN_ON(current->in_user_fault);
        current->in_user_fault = 1;
}

/*
 * Clear current->in_user_fault. Triggers WARN_ON() if the flag was not set
 * on entry.
 */
static inline void mem_cgroup_exit_user_fault(void)
{
        WARN_ON(!current->in_user_fault);
        current->in_user_fault = 0;
}

/* CONFIG_MEMCG_V1 prototypes; the definitions are outside this header. */
void memcg1_swapout(struct folio *folio, swp_entry_t entry);
void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages);

#else /* CONFIG_MEMCG_V1 */
/* !CONFIG_MEMCG_V1 stub: returns 0 and does not write *@total_scanned. */
static inline
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                        gfp_t gfp_mask,
                                        unsigned long *total_scanned)
{
        return 0;
}

/* !CONFIG_MEMCG_V1 stub: always returns false; @p is not dereferenced. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return false;
}

/* !CONFIG_MEMCG_V1 stub: always returns false. */
static inline bool mem_cgroup_oom_synchronize(bool wait)
{
        return false;
}

/* !CONFIG_MEMCG_V1 stub: no-op. */
static inline void mem_cgroup_enter_user_fault(void)
{
}

/* !CONFIG_MEMCG_V1 stub: no-op. */
static inline void mem_cgroup_exit_user_fault(void)
{
}

/* !CONFIG_MEMCG_V1 stub: no-op. */
static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
}

/* !CONFIG_MEMCG_V1 stub: no-op. */
static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
{
}

#endif /* CONFIG_MEMCG_V1 */

#endif /* _LINUX_MEMCONTROL_H */

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_H
#define _LINUX_WAIT_H
/*
 * Linux wait queue related types and methods
 */
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>

#include <asm/current.h>

/* Short name for struct wait_queue_entry (defined below). */
typedef struct wait_queue_entry wait_queue_entry_t;

/*
 * Signature of the callback stored in wait_queue_entry::func.
 * default_wake_function() has this signature and is what
 * init_waitqueue_entry() and __WAITQUEUE_INITIALIZER() install.
 */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);

/*
 * wait_queue_entry::flags
 *
 * Single-bit values that can be OR-ed together. In this header,
 * WQ_FLAG_EXCLUSIVE is set by the *_exclusive() add helpers and
 * WQ_FLAG_PRIORITY is what __add_wait_queue() tests to keep priority entries
 * at the front of the list.
 */
#define WQ_FLAG_EXCLUSIVE        0x01
#define WQ_FLAG_WOKEN                0x02
#define WQ_FLAG_CUSTOM                0x04
#define WQ_FLAG_DONE                0x08
#define WQ_FLAG_PRIORITY        0x10

/*
 * A single wait-queue entry structure:
 */
struct wait_queue_entry {
        /* WQ_FLAG_* bits */
        unsigned int                flags;
        /* task pointer from init_waitqueue_entry(), NULL from init_waitqueue_func_entry() */
        void                        *private;
        /* callback of type wait_queue_func_t */
        wait_queue_func_t        func;
        /* links this entry into wait_queue_head::head */
        struct list_head        entry;
};

/* Head of a wait queue: a spinlock plus the list of queued entries. */
struct wait_queue_head {
        /* see waitqueue_active() for the rules on lockless checks of @head */
        spinlock_t                lock;
        /* list of struct wait_queue_entry, linked through their ->entry */
        struct list_head        head;
};
typedef struct wait_queue_head wait_queue_head_t;

struct task_struct;

/*
 * Macros for declaration and initialisation of the datatypes
 */

/*
 * Static initializer for a struct wait_queue_entry: ->private is @tsk, ->func
 * is default_wake_function and both ->entry pointers are NULL. ->flags is not
 * named, so it is zero-initialized. @name is unused by the expansion.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) {                                        \
        .private        = tsk,                                                        \
        .func                = default_wake_function,                                \
        .entry                = { NULL, NULL } }

/* Define a struct wait_queue_entry called @name, initialized for task @tsk. */
#define DECLARE_WAITQUEUE(name, tsk)                                                \
        struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)

/*
 * Static initializer for a struct wait_queue_head called @name: an unlocked
 * spinlock and a list head initialized with LIST_HEAD_INIT(). @name must be
 * the variable being initialized, because the expansion refers to its
 * members.
 */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                        \
        .lock                = __SPIN_LOCK_UNLOCKED(name.lock),                        \
        .head                = LIST_HEAD_INIT(name.head) }

/* Define and statically initialize a struct wait_queue_head called @name. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

/* Out-of-line initializer used by init_waitqueue_head(); defined elsewhere. */
extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);

/*
 * Runtime initialization of @wq_head (a pointer expression).
 *
 * Each expansion defines its own static struct lock_class_key and passes its
 * address, together with the stringified @wq_head expression as the name, to
 * __init_waitqueue_head().
 */
#define init_waitqueue_head(wq_head)                                                \
        do {                                                                        \
                static struct lock_class_key __key;                                \
                                                                                \
                __init_waitqueue_head((wq_head), #wq_head, &__key);                \
        } while (0)

/*
 * DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name): define a wait queue head variable
 * @name.
 *
 * With CONFIG_LOCKDEP the initializer is a statement expression that runs
 * init_waitqueue_head(&name) and then yields name. Without it the macro is
 * the same as DECLARE_WAIT_QUEUE_HEAD(), and
 * __WAIT_QUEUE_HEAD_INIT_ONSTACK() is not defined.
 */
#ifdef CONFIG_LOCKDEP
# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
        ({ init_waitqueue_head(&name); name; })
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif

/*
 * Prepare @wq_entry for task @p: default_wake_function() as the callback,
 * @p as the private pointer and no flags set.
 *
 * The ->entry list linkage is left untouched.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
        wq_entry->func = default_wake_function;
        wq_entry->private = p;
        wq_entry->flags = 0;
}

/*
 * Prepare @wq_entry with a caller-supplied callback @func, a NULL private
 * pointer and no flags set.
 *
 * The ->entry list linkage is left untouched.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
        wq_entry->func = func;
        wq_entry->private = NULL;
        wq_entry->flags = 0;
}

/**
 * waitqueue_active -- locklessly test for waiters on the queue
 * @wq_head: the waitqueue to test for waiters
 *
 * returns true if the wait list is not empty
 *
 * NOTE: this function is lockless and requires care, incorrect usage _will_
 * lead to sporadic and non-obvious failure.
 *
 * Use either while holding wait_queue_head::lock or when used for wakeups
 * with an extra smp_mb() like::
 *
 *      CPU0 - waker                    CPU1 - waiter
 *
 *                                      for (;;) {
 *      @cond = true;                     prepare_to_wait(&wq_head, &wait, state);
 *      smp_mb();                         // smp_mb() from set_current_state()
 *      if (waitqueue_active(wq_head))         if (@cond)
 *        wake_up(wq_head);                      break;
 *                                        schedule();
 *                                      }
 *                                      finish_wait(&wq_head, &wait);
 *
 * Because without the explicit smp_mb() it's possible for the
 * waitqueue_active() load to get hoisted over the @cond store such that we'll
 * observe an empty wait list while the waiter might not observe @cond.
 *
 * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
 * which (when the lock is uncontended) are of roughly equal cost.
 */
static inline int waitqueue_active(struct wait_queue_head *wq_head)
{
        /* 1 when at least one entry is queued, 0 for an empty list. */
        return list_empty(&wq_head->head) ? 0 : 1;
}

/**
 * wq_has_single_sleeper - check if there is only one sleeper
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has exactly one sleeper on the list.
 *
 * The list is inspected without taking wq_head->lock; please refer to the
 * comment for waitqueue_active.
 */
static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
{
        return list_is_singular(&wq_head->head);
}

/**
 * wq_has_sleeper - check if there are any waiting processes
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has waiting processes
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
{
        bool has_waiters;

        /*
         * Full barrier before looking at the list, so that the check is
         * in sync with add_wait_queue modifications to the wait queue.
         * It should be paired with a barrier on the waiting side.
         */
        smp_mb();
        has_waiters = waitqueue_active(wq_head);

        return has_waiters;
}

/*
 * Out-of-line add/remove operations; the definitions are not in this header.
 * Note that add_wait_queue_priority_exclusive() is the only one that returns
 * a value (int).
 */
extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
                                             struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);

/*
 * Link @wq_entry into @wq_head behind the leading run of WQ_FLAG_PRIORITY
 * entries, or at the very front when the first entry (if any) is not a
 * priority one.
 *
 * No locking is done here.
 */
static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *insert_after = &wq_head->head;
        struct wait_queue_entry *pos;

        /* Walk past the priority entries, remembering the last one seen. */
        list_for_each_entry(pos, &wq_head->head, entry) {
                if (pos->flags & WQ_FLAG_PRIORITY)
                        insert_after = &pos->entry;
                else
                        break;
        }

        list_add(&wq_entry->entry, insert_after);
}

/*
 * Used for wake-one threads:
 *
 * Sets WQ_FLAG_EXCLUSIVE on @wq_entry and then queues it with
 * __add_wait_queue().
 */
static inline void
__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        /* flag first, so the entry is already marked when it is linked */
        wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq_head, wq_entry);
}

/*
 * Append @wq_entry at the tail of @wq_head's list with list_add_tail().
 * Unlike __add_wait_queue(), no WQ_FLAG_PRIORITY ordering is applied.
 */
static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        list_add_tail(&wq_entry->entry, &wq_head->head);
}

/* Set WQ_FLAG_EXCLUSIVE on @wq_entry, then append it at the tail. */
static inline void
__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue_entry_tail(wq_head, wq_entry);
}

/*
 * Unlink @wq_entry from whatever list it is on with list_del().
 * @wq_head is not used by this implementation.
 */
static inline void
__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        list_del(&wq_entry->entry);
}

/*
 * Out-of-line wake-up primitives behind the wake_up*() macros below; the
 * definitions are not in this header. Only __wake_up() returns a value.
 */
int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);

/*
 * TASK_NORMAL wake-ups of wait queue head pointer @x, all with a NULL key
 * where a key is taken.
 *
 * The plain form passes nr = 1, the _nr form passes the caller's @nr and the
 * _all forms pass nr = 0 (presumably "no limit" -- see the __wake_up()
 * definition). The _locked forms go through __wake_up_locked(), the _sync
 * form through __wake_up_sync().
 */
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)                __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x)                        __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x)                __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x)                __wake_up_locked((x), TASK_NORMAL, 0)
#define wake_up_sync(x)                        __wake_up_sync(x, TASK_NORMAL)

/* Same family as above, but with mode TASK_INTERRUPTIBLE instead of TASK_NORMAL. */
#define wake_up_interruptible(x)        __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr)        __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x)        __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x)        __wake_up_sync((x), TASK_INTERRUPTIBLE)

/*
 * Wakeup macros to be used to report events to the targets.
 *
 * poll_to_key() turns a __poll_t mask @m into the void * key argument by
 * casting through uintptr_t; key_to_poll() is the reverse conversion. The
 * wake_up*_poll() macros pass poll_to_key(m) as the key to the matching
 * __wake_up*() primitive.
 */
#define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m))
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m)                                                        \
        __wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
#define wake_up_poll_on_current_cpu(x, m)                                        \
        __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
#define wake_up_locked_poll(x, m)                                                \
        __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m)                                        \
        __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
#define wake_up_interruptible_sync_poll(x, m)                                        \
        __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
#define wake_up_interruptible_sync_poll_locked(x, m)                                \
        __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

/**
 * wake_up_pollfree - signal that a polled waitqueue is going away
 * @wq_head: the wait queue head
 *
 * In the very rare cases where a ->poll() implementation uses a waitqueue whose
 * lifetime is tied to a task rather than to the 'struct file' being polled,
 * this function must be called before the waitqueue is freed so that
 * non-blocking polls (e.g. epoll) are notified that the queue is going away.
 *
 * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
 * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
 */
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
        /*
         * The queue lock is deliberately not always taken here, for
         * performance. This check can therefore race with someone removing
         * the last entry, and we may carry on while they still hold the
         * queue lock. rcu_read_lock() is required to be held in such cases,
         * so proceeding with an RCU-delayed free remains safe.
         */
        if (!waitqueue_active(wq_head))
                return;

        __wake_up_pollfree(wq_head);
}

/*
 * Wrap @condition for the timeout-style waits.
 *
 * Evaluates @condition exactly once. It relies on a variable called __ret in
 * the expanding scope: when the condition is true and __ret is 0, __ret is
 * set to 1. The value of the expression is true when the condition holds or
 * __ret is 0. NOTE(review): __ret is presumably the remaining timeout, so 0
 * would mean "timed out" -- confirm against the users of this macro.
 */
#define ___wait_cond_timeout(condition)                                                \
({                                                                                \
        bool __cond = (condition);                                                \
        if (__cond && !__ret)                                                        \
                __ret = 1;                                                        \
        __cond || !__ret;                                                        \
})

/*
 * Can a wait in @state be cut short by prepare_to_wait_event() reporting a
 * non-zero result?
 *
 * True when @state is not a compile-time constant (it has to be assumed
 * interruptible), or when it contains TASK_INTERRUPTIBLE or TASK_WAKEKILL.
 * For a constant state without those bits the result is a constant 0.
 *
 * @state is parenthesized in the expansion: '&' binds tighter than '|', so
 * an argument such as TASK_UNINTERRUPTIBLE | TASK_FREEZABLE would otherwise
 * be parsed as A | (B & mask) and wrongly test as interruptible.
 */
#define ___wait_is_interruptible(state)                                                \
        (!__builtin_constant_p(state) ||                                        \
         ((state) & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))

/*
 * Out-of-line initializer; ___wait_event() calls it on its on-stack entry
 * with either WQ_FLAG_EXCLUSIVE or 0 as @flags. Defined elsewhere.
 */
extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);

/*
 * The below macro ___wait_event() has an explicit shadow of the __ret
 * variable when used from the wait_event_*() macros.
 *
 * This is so that both can use the ___wait_cond_timeout() construct
 * to wrap the condition.
 *
 * The type inconsistency of the wait_event_*() __ret variable is also
 * on purpose; we use long where we can return timeout values and int
 * otherwise.
 */

/*
 * ___wait_event() - common body of the wait_event*() family.
 * @wq_head:   wait queue head, passed by name (the macro takes its address)
 * @condition: expression to wait for; may be evaluated several times
 * @state:     task state handed to prepare_to_wait_event()
 * @exclusive: non-zero to initialize the entry with WQ_FLAG_EXCLUSIVE
 * @ret:       initial value of the local __ret, which is the macro's value
 * @cmd:       statement run on each iteration to actually wait, e.g. schedule()
 *
 * Each iteration calls prepare_to_wait_event(), then leaves the loop if
 * @condition holds. If the state is interruptible and
 * prepare_to_wait_event() returned non-zero, that value becomes the result
 * and control jumps to __out, skipping finish_wait(). Otherwise @cmd runs and
 * @condition is checked once more before looping. A normal loop exit calls
 * finish_wait() and yields __ret.
 */
#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)                \
({                                                                                \
        __label__ __out;                                                        \
        struct wait_queue_entry __wq_entry;                                        \
        long __ret = ret;        /* explicit shadow */                                \
                                                                                \
        init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);        \
        for (;;) {                                                                \
                long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
                                                                                \
                if (condition)                                                        \
                        break;                                                        \
                                                                                \
                if (___wait_is_interruptible(state) && __int) {                        \
                        __ret = __int;                                                \
                        goto __out;                                                \
                }                                                                \
                                                                                \
                cmd;                                                                \
                                                                                \
                if (condition)                                                        \
                        break;                                                        \
        }                                                                        \
        finish_wait(&wq_head, &__wq_entry);                                        \
__out:        __ret;                                                                        \
})

/*
 * Slow path of wait_event(): expands the ___wait_event() loop in
 * TASK_UNINTERRUPTIBLE state, non-exclusive, with an initial result of 0 and
 * schedule() as the sleep command.  The loop's value is discarded by the
 * (void) cast.
 */
#define __wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            schedule())

/**
 * wait_event - block uninterruptibly until a condition becomes true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The caller sleeps in TASK_UNINTERRUPTIBLE state until @condition
 * evaluates to true.  @condition is tested once up front, so the wait
 * loop is not entered when it already holds, and again each time the
 * waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event(wq_head, condition)                                                \
do {                                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __wait_event(wq_head, condition);                                \
} while (0)

/*
 * Slow path of io_wait_event(): the ___wait_event() loop in
 * TASK_UNINTERRUPTIBLE state, non-exclusive, with io_schedule() as the sleep
 * command.  The loop's value is discarded by the (void) cast.
 */
#define __io_wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            io_schedule())

/*
 * io_wait_event() -- uninterruptible wait for @condition that sleeps via
 * io_schedule() rather than schedule(); otherwise shaped like wait_event().
 * @condition is tested once before the wait loop is entered.
 */
#define io_wait_event(wq_head, condition)                                        \
do {                                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __io_wait_event(wq_head, condition);                                \
} while (0)

/*
 * Slow path of wait_event_freezable(): the ___wait_event() loop in
 * (TASK_INTERRUPTIBLE|TASK_FREEZABLE) state, non-exclusive, sleeping via
 * schedule().  Evaluates to the loop's result.
 */
#define __wait_event_freezable(wq_head, condition)                                \
        ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE),        \
                        0, 0, schedule())

/**
 * wait_event_freezable - sleep (or freeze) until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
 * to system load) until the @condition evaluates to true. The
 * @condition is checked each time the waitqueue @wq_head is woken up.
 * The @condition is also tested once before the wait loop is entered.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to true; otherwise the nonzero value
 * reported by the wait loop when the sleep was interrupted
 * (NOTE(review): presumably -ERESTARTSYS, as documented for
 * wait_event_interruptible() -- confirm).
 */
#define wait_event_freezable(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_timeout(): the ___wait_event() loop in
 * TASK_UNINTERRUPTIBLE state, non-exclusive.  The loop's own __ret starts at
 * @timeout and is overwritten with schedule_timeout()'s return value on each
 * pass; the "__ret" in the command text refers to that inner variable.
 * NOTE(review): ___wait_cond_timeout() is defined outside this chunk; it
 * presumably also inspects __ret to end the wait on expiry -- confirm.
 */
#define __wait_event_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_UNINTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies (expanded more than once by this macro, so
 *           it should be free of side effects)
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 *
 * NOTE(review): the local must stay named __ret; ___wait_cond_timeout()
 * (defined outside this chunk) presumably refers to it by name -- confirm.
 */
#define wait_event_timeout(wq_head, condition, timeout)                                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_freezable_timeout(): the ___wait_event() loop in
 * (TASK_INTERRUPTIBLE|TASK_FREEZABLE) state, non-exclusive.  The loop's own
 * __ret starts at @timeout and is overwritten with schedule_timeout()'s
 * return value on each pass.
 */
#define __wait_event_freezable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout,                \
                      __ret = schedule_timeout(__ret))

/*
 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
 * increasing load and is freezable.
 *
 * @timeout (in jiffies) is expanded more than once, so it should be free of
 * side effects.  The result starts out as @timeout and is replaced by the
 * slow path's value only when the initial ___wait_cond_timeout() test fails.
 * NOTE(review): because the sleep state includes TASK_INTERRUPTIBLE, a
 * pending signal presumably ends the wait with a negative value, as
 * documented for wait_event_interruptible_timeout() -- confirm.
 */
#define wait_event_freezable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_exclusive_cmd(): the ___wait_event() loop in
 * TASK_UNINTERRUPTIBLE state with the exclusive argument set to 1.  The sleep
 * command is "cmd1; schedule(); cmd2", so @cmd1 runs right before and @cmd2
 * right after every schedule() call.  The loop's value is discarded.
 */
#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0,        \
                            cmd1; schedule(); cmd2)
/*
 * wait_event_exclusive_cmd() -- same as wait_event_cmd(), but the waiter is
 * queued with the exclusive flag set.  @cmd1 runs before each sleep and
 * @cmd2 after it; nothing is executed when @condition already holds.
 */
#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
do {                                                                                \
        if (!(condition))                                                        \
                __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2);        \
} while (0)

/*
 * Slow path of wait_event_cmd(): the ___wait_event() loop in
 * TASK_UNINTERRUPTIBLE state, non-exclusive.  The sleep command is
 * "cmd1; schedule(); cmd2", so @cmd1 runs right before and @cmd2 right after
 * every schedule() call.  The loop's value is discarded.
 */
#define __wait_event_cmd(wq_head, condition, cmd1, cmd2)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            cmd1; schedule(); cmd2)

/**
 * wait_event_cmd - block uninterruptibly until a condition becomes true,
 *                  running caller-supplied commands around each sleep
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @cmd1: command executed immediately before each sleep
 * @cmd2: command executed immediately after each sleep
 *
 * The caller sleeps in TASK_UNINTERRUPTIBLE state until @condition
 * evaluates to true.  @condition is tested once up front -- neither
 * command runs if it already holds -- and again each time the waitqueue
 * @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_cmd(wq_head, condition, cmd1, cmd2)                                \
do {                                                                                \
        if (!(condition))                                                        \
                __wait_event_cmd(wq_head, condition, cmd1, cmd2);                \
} while (0)

/*
 * Slow path of wait_event_interruptible(): the ___wait_event() loop in
 * TASK_INTERRUPTIBLE state, non-exclusive, sleeping via schedule().
 * Evaluates to the loop's result.
 */
#define __wait_event_interruptible(wq_head, condition)                                \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      schedule())

/**
 * wait_event_interruptible - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 * The @condition is also tested once up front; when it already holds the
 * wait loop is not entered and 0 is returned.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_timeout(): the ___wait_event() loop
 * in TASK_INTERRUPTIBLE state, non-exclusive.  The loop's own __ret starts at
 * @timeout and is overwritten with schedule_timeout()'s return value on each
 * pass.
 */
#define __wait_event_interruptible_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_INTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies (expanded more than once by this macro, so
 *           it should be free of side effects)
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a signal.
 *
 * NOTE(review): the local must stay named __ret; ___wait_cond_timeout()
 * (defined outside this chunk) presumably refers to it by name -- confirm.
 */
#define wait_event_interruptible_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_interruptible_timeout(wq_head,                \
                                                condition, timeout);                \
        __ret;                                                                        \
})

/*
 * Common slow path of wait_event_hrtimeout() and
 * wait_event_interruptible_hrtimeout().
 *
 * Sets up an on-stack hrtimer sleeper on CLOCK_MONOTONIC in relative mode.
 * Unless @timeout is KTIME_MAX, the timer is programmed with @timeout plus
 * the current task's timer slack and started; with KTIME_MAX no timer is
 * armed.  @timeout is expanded twice, so it should be free of side effects.
 *
 * The command handed to ___wait_event() checks __t.task before every
 * schedule(): when it is NULL the command stores -ETIME in the wait loop's
 * own (inner) __ret and breaks out of that loop, so -ETIME becomes the
 * loop's value.  NOTE(review): __t.task is presumably cleared by the
 * sleeper's timer callback on expiry -- confirm.
 *
 * The timer is cancelled and the on-stack object destroyed on every exit
 * path before the result is returned as an int.
 */
#define __wait_event_hrtimeout(wq_head, condition, timeout, state)                \
({                                                                                \
        int __ret = 0;                                                                \
        struct hrtimer_sleeper __t;                                                \
                                                                                \
        hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC,                        \
                                       HRTIMER_MODE_REL);                        \
        if ((timeout) != KTIME_MAX) {                                                \
                hrtimer_set_expires_range_ns(&__t.timer, timeout,                \
                                        current->timer_slack_ns);                \
                hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL);                \
        }                                                                        \
                                                                                \
        __ret = ___wait_event(wq_head, condition, state, 0, 0,                        \
                if (!__t.task) {                                                \
                        __ret = -ETIME;                                                \
                        break;                                                        \
                }                                                                \
                schedule());                                                        \
                                                                                \
        hrtimer_cancel(&__t.timer);                                                \
        destroy_hrtimer_on_stack(&__t.timer);                                        \
        __ret;                                                                        \
})

/**
 * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t; KTIME_MAX means no timer is armed
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true or the @timeout elapses.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 * The @condition is also tested once up front; when it already holds no
 * timer is set up and 0 is returned.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, or -ETIME if the timeout
 * elapsed.
 */
#define wait_event_hrtimeout(wq_head, condition, timeout)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_hrtimeout(wq_head, condition, timeout,        \
                                               TASK_UNINTERRUPTIBLE);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t; KTIME_MAX means no timer is armed
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true, a signal is received, or the @timeout
 * elapses.
 * The @condition is checked each time the waitqueue @wq is woken up.
 * The @condition is also tested once up front; when it already holds no
 * timer is set up and 0 is returned.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, -ERESTARTSYS if it was
 * interrupted by a signal, or -ETIME if the timeout elapsed.
 *
 * NOTE(review): the result is a long here but an int in
 * wait_event_hrtimeout(); the shared helper yields an int in both cases.
 */
#define wait_event_interruptible_hrtimeout(wq, condition, timeout)                \
({                                                                                \
        long __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_hrtimeout(wq, condition, timeout,                \
                                               TASK_INTERRUPTIBLE);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_exclusive(): the ___wait_event()
 * loop in TASK_INTERRUPTIBLE state with the exclusive argument set to 1,
 * sleeping via schedule().  Evaluates to the loop's result.
 */
#define __wait_event_interruptible_exclusive(wq, condition)                        \
        ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,                        \
                      schedule())

/*
 * wait_event_interruptible_exclusive() -- like wait_event_interruptible(),
 * but the waiter is queued with the exclusive flag set.
 *
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for; tested once up
 *             front and then by the wait loop
 *
 * Evaluates to 0 when @condition became true, otherwise to the nonzero value
 * produced by the wait loop when the sleep was interrupted
 * (NOTE(review): presumably -ERESTARTSYS, as documented for
 * wait_event_interruptible() -- confirm).
 */
#define wait_event_interruptible_exclusive(wq, condition)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_exclusive(wq, condition);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_killable_exclusive(): the ___wait_event() loop in
 * TASK_KILLABLE state with the exclusive argument set to 1, sleeping via
 * schedule().  Evaluates to the loop's result.
 */
#define __wait_event_killable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, TASK_KILLABLE, 1, 0,                        \
                      schedule())

/*
 * wait_event_killable_exclusive() -- sleep in TASK_KILLABLE state until
 * @condition becomes true, with the waiter queued with the exclusive flag
 * set.
 *
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for; tested once up
 *             front and then by the wait loop
 *
 * Evaluates to 0 when @condition became true, otherwise to the nonzero value
 * produced by the wait loop when the sleep was interrupted
 * (NOTE(review): presumably -ERESTARTSYS, as documented for
 * wait_event_killable() -- confirm).
 */
#define wait_event_killable_exclusive(wq, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable_exclusive(wq, condition);                \
        __ret;                                                                        \
})


/*
 * Slow path of wait_event_freezable_exclusive(): the ___wait_event() loop in
 * (TASK_INTERRUPTIBLE|TASK_FREEZABLE) state with the exclusive argument set
 * to 1, sleeping via schedule().  Evaluates to the loop's result.
 */
#define __wait_event_freezable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\
                        schedule())

/*
 * wait_event_freezable_exclusive() -- like wait_event_freezable(), but the
 * waiter is queued with the exclusive flag set.
 *
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for; tested once up
 *             front and then by the wait loop
 *
 * Evaluates to 0 when @condition became true, otherwise to the nonzero value
 * produced by the wait loop when the sleep was interrupted
 * (NOTE(review): presumably -ERESTARTSYS, as documented for
 * wait_event_interruptible() -- confirm).
 */
#define wait_event_freezable_exclusive(wq, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable_exclusive(wq, condition);        \
        __ret;                                                                        \
})

/**
 * wait_event_idle - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The caller sleeps in TASK_IDLE state until @condition evaluates to
 * true.  @condition is tested once up front, so the wait loop is not
 * entered when it already holds, and again each time the waitqueue
 * @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 */
#define wait_event_idle(wq_head, condition)                                        \
do {                                                                                \
        might_sleep();                                                                \
        if (condition)                                                                \
                break;                                                                \
        ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule());                \
} while (0)

/**
 * wait_event_idle_exclusive - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The caller sleeps in TASK_IDLE state until @condition evaluates to
 * true.  @condition is tested once up front, so the wait loop is not
 * entered when it already holds, and again each time the waitqueue
 * @wq_head is woken up.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, so if other processes wait on the same list, further processes
 * are not considered once this process is woken.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 */
#define wait_event_idle_exclusive(wq_head, condition)                                \
do {                                                                                \
        might_sleep();                                                                \
        if (condition)                                                                \
                break;                                                                \
        ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule());                \
} while (0)

/*
 * Slow path of wait_event_idle_timeout(): the ___wait_event() loop in
 * TASK_IDLE state, non-exclusive.  The loop's own __ret starts at @timeout
 * and is overwritten with schedule_timeout()'s return value on each pass.
 */
#define __wait_event_idle_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 0, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies (expanded more than once by this macro, so
 *           it should be free of side effects)
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 *
 * NOTE(review): the local must stay named __ret; ___wait_cond_timeout()
 * (defined outside this chunk) presumably refers to it by name -- confirm.
 */
#define wait_event_idle_timeout(wq_head, condition, timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_idle_exclusive_timeout(): the ___wait_event() loop
 * in TASK_IDLE state with the exclusive argument set to 1.  The loop's own
 * __ret starts at @timeout and is overwritten with schedule_timeout()'s
 * return value on each pass.
 */
#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 1, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies (expanded more than once by this macro, so
 *           it should be free of side effects)
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, so if other processes wait on the same list, further processes
 * are not considered once this process is woken.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 *
 * NOTE(review): the local must stay named __ret; ___wait_cond_timeout()
 * (defined outside this chunk) presumably refers to it by name -- confirm.
 */
#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\
        __ret;                                                                        \
})

/*
 * Sleep helpers handed as @fn to __wait_event_interruptible_locked() by the
 * wait_event_interruptible*_locked() and *_locked_irq() macros.  Each takes
 * the wait-queue head and the caller's wait-queue entry; a nonzero return
 * value aborts the wait and is passed back to the caller.  Both are defined
 * outside this header.
 */
extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *);
extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);

/*
 * Shared body of the wait_event_interruptible*_locked*() macros.
 *
 * @wq: the waitqueue (an lvalue; its address is taken)
 * @condition: a C expression for the event to wait for
 * @exclusive: nonzero to mark the on-stack entry WQ_FLAG_EXCLUSIVE
 * @fn: sleep helper called as fn(&wq, &entry); a nonzero result aborts
 *
 * Calls @fn repeatedly; the loop ends as soon as @fn reports a nonzero value
 * or, failing that, @condition evaluates to true.  @fn therefore always runs
 * at least once before @condition is looked at.  On the way out the entry is
 * removed from the queue and the task state is set back to TASK_RUNNING.
 * Evaluates to the last value returned by @fn (0 when @condition ended the
 * wait).
 */
#define __wait_event_interruptible_locked(wq, condition, exclusive, fn)                \
({                                                                                \
        int __ret;                                                                \
        DEFINE_WAIT(__wait);                                                        \
                                                                                \
        if (exclusive)                                                                \
                __wait.flags |= WQ_FLAG_EXCLUSIVE;                                \
        for (;;) {                                                                \
                __ret = fn(&(wq), &__wait);                                        \
                if (__ret || (condition))                                        \
                        break;                                                        \
        }                                                                        \
        __remove_wait_queue(&(wq), &__wait);                                        \
        __set_current_state(TASK_RUNNING);                                        \
        __ret;                                                                        \
})


/**
 * wait_event_interruptible_locked - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The caller sleeps in TASK_INTERRUPTIBLE state until @condition
 * evaluates to true or a signal is received.  @condition is tested once
 * up front and again each time the waitqueue @wq is woken up.
 *
 * Must be called with wq.lock held.  The spinlock is dropped while
 * sleeping, but @condition is always tested with the lock held, and the
 * lock is held again when this macro exits.
 *
 * The lock is taken and released with spin_lock()/spin_unlock(), which
 * must match the way the caller locks and unlocks it outside of this
 * macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked(wq, condition)                                \
        (!(condition)                                                                \
         ? __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr) : 0)

/**
 * wait_event_interruptible_locked_irq - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The caller sleeps in TASK_INTERRUPTIBLE state until @condition
 * evaluates to true or a signal is received.  @condition is tested once
 * up front and again each time the waitqueue @wq is woken up.
 *
 * Must be called with wq.lock held.  The spinlock is dropped while
 * sleeping, but @condition is always tested with the lock held, and the
 * lock is held again when this macro exits.
 *
 * The lock is taken and released with spin_lock_irq()/spin_unlock_irq(),
 * which must match the way the caller locks and unlocks it outside of
 * this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked_irq(wq, condition)                        \
        (!(condition)                                                                \
         ? __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq) : 0)

/**
 * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The caller sleeps in TASK_INTERRUPTIBLE state until @condition
 * evaluates to true or a signal is received.  @condition is tested once
 * up front and again each time the waitqueue @wq is woken up.
 *
 * Must be called with wq.lock held.  The spinlock is dropped while
 * sleeping, but @condition is always tested with the lock held, and the
 * lock is held again when this macro exits.
 *
 * The lock is taken and released with spin_lock()/spin_unlock(), which
 * must match the way the caller locks and unlocks it outside of this
 * macro.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, so if other processes wait on the same list, further processes
 * are not considered once this process is woken.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked(wq, condition)                \
        (!(condition)                                                                \
         ? __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr) : 0)

/**
 * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The caller sleeps in TASK_INTERRUPTIBLE state until @condition
 * evaluates to true or a signal is received.  @condition is tested once
 * up front and again each time the waitqueue @wq is woken up.
 *
 * Must be called with wq.lock held.  The spinlock is dropped while
 * sleeping, but @condition is always tested with the lock held, and the
 * lock is held again when this macro exits.
 *
 * The lock is taken and released with spin_lock_irq()/spin_unlock_irq(),
 * which must match the way the caller locks and unlocks it outside of
 * this macro.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, so if other processes wait on the same list, further processes
 * are not considered once this process is woken.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked_irq(wq, condition)                \
        (!(condition)                                                                \
         ? __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq) : 0)


/*
 * Slow path of wait_event_killable(): the ___wait_event() loop in
 * TASK_KILLABLE state, non-exclusive, sleeping via schedule().  Evaluates to
 * the loop's result.
 */
#define __wait_event_killable(wq, condition)                                        \
        ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())

/**
 * wait_event_killable - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 * The @condition is also tested once up front; when it already holds the
 * wait loop is not entered and 0 is returned.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_killable(wq_head, condition)                                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_state(): forwards @wq and @condition to
 * ___wait_event() with the caller-chosen task @state and schedule() as the
 * command to run while waiting.  Apart from the state argument the
 * expansion is the same as __wait_event_killable().
 * NOTE(review): the meaning of the two literal 0 arguments is defined by
 * ___wait_event(), which is outside this excerpt.
 */
#define __wait_event_state(wq, condition, state)                                \
        ___wait_event(wq, condition, state, 0, 0, schedule())

/**
 * wait_event_state - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @state: state to sleep in
 *
 * The process is put to sleep (@state) until the @condition evaluates to true
 * or a signal is received (when allowed by @state).  The @condition is checked
 * each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * might_sleep() is invoked unconditionally.  @condition is tested once up
 * front and __wait_event_state() is only entered when that test is false,
 * so the expression is expanded more than once and should be free of side
 * effects.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a signal
 * (when allowed by @state) and 0 if @condition evaluated to true.
 */
#define wait_event_state(wq_head, condition, state)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_state(wq_head, condition, state);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_state_exclusive().  Identical to
 * __wait_event_state() except that the fourth ___wait_event() argument is
 * 1 instead of 0.
 * NOTE(review): that argument presumably requests an exclusive wait --
 * confirm against ___wait_event(), which is outside this excerpt.
 */
#define __wait_event_state_exclusive(wq, condition, state)                        \
        ___wait_event(wq, condition, state, 1, 0, schedule())

/*
 * wait_event_state_exclusive - variant of wait_event_state() that goes
 * through __wait_event_state_exclusive() on the slow path.
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @state: state to sleep in
 *
 * might_sleep() is invoked unconditionally, then @condition is tested; the
 * slow path is entered only if it is false.  @condition is therefore
 * expanded more than once and should be free of side effects.
 *
 * Evaluates to 0 when @condition was already true, otherwise to whatever
 * the slow path yields, converted to int.
 */
#define wait_event_state_exclusive(wq, condition, state)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_state_exclusive(wq, condition, state);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_killable_timeout().  @condition is wrapped in
 * ___wait_cond_timeout() before being handed to ___wait_event(), the sleep
 * state is TASK_KILLABLE, @timeout is passed as the fifth argument and the
 * wait command is "__ret = schedule_timeout(__ret)".
 * NOTE(review): the __ret named in the command is presumably the variable
 * that ___wait_event() declares and seeds from @timeout -- confirm against
 * its definition, which is outside this excerpt.
 */
#define __wait_event_killable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_KILLABLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The local __ret starts out as @timeout and is what the macro evaluates
 * to when the first ___wait_cond_timeout(@condition) test already
 * succeeds; otherwise it takes the value of the slow path.  @condition is
 * expanded more than once and should be free of side effects.
 * NOTE(review): ___wait_cond_timeout() is defined outside this excerpt and
 * appears to rely on a variable named __ret being in scope -- do not
 * rename the local without checking it.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a kill signal.
 *
 * Only kill signals interrupt this process.
 */
#define wait_event_killable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_killable_timeout(wq_head,                        \
                                                condition, timeout);                \
        __ret;                                                                        \
})


/*
 * Slow path shared by wait_event_lock_irq() and wait_event_lock_irq_cmd().
 * Sleeps in TASK_UNINTERRUPTIBLE via ___wait_event(); the wait command is,
 * in this order: spin_unlock_irq(&lock), @cmd, schedule(),
 * spin_lock_irq(&lock).  @lock is therefore passed as an lvalue, not as a
 * pointer.  The result of ___wait_event() is discarded by the (void) cast,
 * so this macro yields no value.
 */
#define __wait_event_lock_irq(wq_head, condition, lock, cmd)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            spin_unlock_irq(&lock);                                \
                            cmd;                                                \
                            schedule();                                                \
                            spin_lock_irq(&lock))

/**
 * wait_event_lock_irq_cmd - sleep until a condition gets true. The
 *                             condition is checked under the lock. This
 *                             is expected to be called with the lock
 *                             taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd
 *          and schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * This is a do { } while (0) statement and produces no value.  If
 * @condition is already true nothing else happens: the lock is not
 * dropped and @cmd is not run.  @condition is expanded more than once and
 * should be free of side effects.
 */
#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd)                        \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_lock_irq(wq_head, condition, lock, cmd);                        \
} while (0)

/**
 * wait_event_lock_irq - sleep until a condition gets true. The
 *                         condition is checked under the lock. This
 *                         is expected to be called with the lock
 *                         taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * Same expansion as wait_event_lock_irq_cmd() with an empty @cmd argument.
 * This is a do { } while (0) statement and produces no value; @condition
 * is expanded more than once and should be free of side effects.
 */
#define wait_event_lock_irq(wq_head, condition, lock)                                \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_lock_irq(wq_head, condition, lock, );                        \
} while (0)


/*
 * Slow path shared by wait_event_interruptible_lock_irq() and
 * wait_event_interruptible_lock_irq_cmd().  Sleeps in TASK_INTERRUPTIBLE
 * via ___wait_event(); the wait command is, in this order:
 * spin_unlock_irq(&lock), @cmd, schedule(), spin_lock_irq(&lock).
 * Unlike __wait_event_lock_irq() the result of ___wait_event() is not
 * discarded, so callers can pick up the return code.
 */
#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd)        \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      spin_unlock_irq(&lock);                                        \
                      cmd;                                                        \
                      schedule();                                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected to
 *                be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd and
 *          schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * If @condition is already true the macro evaluates to 0 without dropping
 * the lock or running @cmd.  @condition is expanded more than once and
 * should be free of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd)        \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock, cmd);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_lock_irq - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected
 *                to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * Same expansion as wait_event_interruptible_lock_irq_cmd() with an empty
 * cmd argument.  @condition is expanded more than once and should be free
 * of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq(wq_head, condition, lock)                \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock,);                \
        __ret;                                                                        \
})

/*
 * Slow path shared by wait_event_lock_irq_timeout() and
 * wait_event_interruptible_lock_irq_timeout().
 *
 * @condition is wrapped in ___wait_cond_timeout() and handed to
 * ___wait_event() together with the caller-chosen task @state and
 * @timeout.  The wait command is, in this order: spin_unlock_irq(&lock),
 * "__ret = schedule_timeout(__ret)", spin_lock_irq(&lock), so @lock is
 * passed as an lvalue, not as a pointer.
 *
 * The macro expands to a plain expression with no trailing semicolon, like
 * the other __wait_event_*() helpers, so it can sit on the right-hand side
 * of an assignment or inside a larger expression without leaving a stray
 * empty statement behind.
 *
 * NOTE(review): the __ret named in the command is presumably the variable
 * declared inside ___wait_event(), which is not part of this excerpt.
 */
#define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      state, 0, timeout,                                        \
                      spin_unlock_irq(&lock);                                        \
                      __ret = schedule_timeout(__ret);                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
 *                true or a timeout elapses. The condition is checked under
 *                the lock. This is expected to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The local __ret starts out as @timeout and is the result when the first
 * ___wait_cond_timeout(@condition) test already succeeds; in that case the
 * lock is never dropped.  @condition is expanded more than once and should
 * be free of side effects.
 *
 * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it
 * was interrupted by a signal, and the remaining jiffies otherwise
 * if the condition evaluated to true before the timeout elapsed.
 */
#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock,        \
                                                  timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_INTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * wait_event_lock_irq_timeout - like
 * wait_event_interruptible_lock_irq_timeout(), but the slow path sleeps in
 * TASK_UNINTERRUPTIBLE instead of TASK_INTERRUPTIBLE.  The rest of the
 * expansion is identical.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, released around the sleep by the slow path
 * @timeout: timeout, in jiffies
 *
 * Evaluates to @timeout when the first ___wait_cond_timeout(@condition)
 * test already succeeds, otherwise to the value of
 * __wait_event_lock_irq_timeout().  @condition is expanded more than once
 * and should be free of side effects.
 */
#define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_UNINTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * Waitqueues which are removed from the waitqueue_head at wakeup time
 */
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

/*
 * DEFINE_WAIT_FUNC - define a struct wait_queue_entry variable called @name.
 * Its .private is set to current, .func to @function and .entry to a list
 * head that refers to itself.  No other member is named in the
 * initializer.
 */
#define DEFINE_WAIT_FUNC(name, function)                                        \
        struct wait_queue_entry name = {                                        \
                .private        = current,                                        \
                .func                = function,                                        \
                .entry                = LIST_HEAD_INIT((name).entry),                        \
        }

/* DEFINE_WAIT - DEFINE_WAIT_FUNC() with autoremove_wake_function as .func. */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

/*
 * init_wait_func - initialise an existing wait queue entry in place.
 * @wait: pointer to the struct wait_queue_entry to set up
 * @function: wake function to store in ->func
 *
 * Sets ->private to current, ->func to @function, resets ->entry with
 * INIT_LIST_HEAD() and clears ->flags.  @wait is expanded once per
 * assignment, so it should be an expression without side effects.
 */
#define init_wait_func(wait, function)                                                \
        do {                                                                        \
                (wait)->private = current;                                        \
                (wait)->func = function;                                        \
                INIT_LIST_HEAD(&(wait)->entry);                                        \
                (wait)->flags = 0;                                                \
        } while (0)

/* init_wait - init_wait_func() with autoremove_wake_function as ->func. */
#define init_wait(wait)        init_wait_func(wait, autoremove_wake_function)

typedef int (*task_call_f)(struct task_struct *p, void *arg);
extern int task_call_func(struct task_struct *p, task_call_f func, void *arg);

#endif /* _LINUX_WAIT_H */


















































    8 

    8 








    8 




    8 




    8 






    8 





    6 






    6 







    1 






    6 

    5 











































































































































































    6 








    6 







    6 












    6 























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Authenc: Simple AEAD wrapper for IPsec
 *
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/aead.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <crypto/authenc.h>
#include <crypto/scatterwalk.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/*
 * Per-instance state of one "authenc(auth,enc)" template instantiation,
 * stored behind the aead_instance (see aead_instance_ctx() users below).
 */
struct authenc_instance_ctx {
        /* Spawn of the hash algorithm; grabbed in crypto_authenc_create(). */
        struct crypto_ahash_spawn auth;
        /* Spawn of the cipher algorithm; grabbed in crypto_authenc_create(). */
        struct crypto_skcipher_spawn enc;
        /*
         * Byte offset into authenc_request_ctx::tail at which the ahash or
         * skcipher sub-request is placed.  crypto_authenc_create() sets it
         * to twice the digest size of the hash.
         */
        unsigned int reqoff;
};

/*
 * Per-transform state: the two child transforms allocated in
 * crypto_authenc_init_tfm() and released in crypto_authenc_exit_tfm().
 */
struct crypto_authenc_ctx {
        /* Hash transform used to compute and verify the ICV. */
        struct crypto_ahash *auth;
        /* Cipher transform used for the payload. */
        struct crypto_skcipher *enc;
};

/*
 * Per-request scratch area, obtained with aead_request_ctx().
 */
struct authenc_request_ctx {
        /* Scratch scatterlist handed to scatterwalk_ffwd() for req->src. */
        struct scatterlist src[2];
        /* Scratch scatterlist handed to scatterwalk_ffwd() for req->dst. */
        struct scatterlist dst[2];
        /*
         * Variable-size tail.  Its start is used as the hash result buffer;
         * the ahash/skcipher sub-request lives at offset
         * authenc_instance_ctx::reqoff.  The total size is established by
         * crypto_aead_set_reqsize() in crypto_authenc_init_tfm().
         */
        char tail[];
};

/*
 * Complete @req with status @err, except when @err is -EINPROGRESS, in
 * which case nothing is reported.
 */
static void authenc_request_complete(struct aead_request *req, int err)
{
        if (err == -EINPROGRESS)
                return;

        aead_request_complete(req, err);
}

/**
 * crypto_authenc_extractkeys - split an authenc key blob into its two keys
 * @keys: receives pointers into @key and the two key lengths
 * @key: blob made of one rtattr followed by the raw key material
 * @keylen: total length of @key in bytes
 *
 * The blob has to start with an rtattr of type CRYPTO_AUTHENC_KEYA_PARAM
 * whose payload is exactly one struct crypto_authenc_key_param.  The
 * big-endian enckeylen found there is the size of the encryption key, which
 * sits at the very end of the blob; the bytes between the rtattr and the
 * encryption key form the authentication key.
 *
 * Return: 0 on success, -EINVAL if the blob is malformed or too short.
 */
int crypto_authenc_extractkeys(struct crypto_authenc_keys *keys, const u8 *key,
                               unsigned int keylen)
{
        struct rtattr *attr = (struct rtattr *)key;
        struct crypto_authenc_key_param *kp;
        unsigned int remaining;
        const u8 *material;

        if (!RTA_OK(attr, keylen) ||
            attr->rta_type != CRYPTO_AUTHENC_KEYA_PARAM)
                return -EINVAL;

        /*
         * RTA_OK() validated the attribute against the buffer without
         * aligning its payload, while the keys are expected to begin on the
         * following 4-byte boundary.  Sidestep the ambiguity by insisting
         * that the payload is precisely the param struct, whose size is a
         * multiple of the rtattr alignment.
         */
        BUILD_BUG_ON(sizeof(*kp) % RTA_ALIGNTO);
        if (RTA_PAYLOAD(attr) != sizeof(*kp))
                return -EINVAL;

        kp = RTA_DATA(attr);
        keys->enckeylen = be32_to_cpu(kp->enckeylen);

        /* Everything after the attribute is key material. */
        material = key + attr->rta_len;
        remaining = keylen - attr->rta_len;

        if (remaining < keys->enckeylen)
                return -EINVAL;

        keys->authkeylen = remaining - keys->enckeylen;
        keys->authkey = material;
        keys->enckey = material + keys->authkeylen;

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_authenc_extractkeys);

static int crypto_authenc_setkey(struct crypto_aead *authenc, const u8 *key,
                                 unsigned int keylen)
{
        struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
        struct crypto_ahash *auth = ctx->auth;
        struct crypto_skcipher *enc = ctx->enc;
        struct crypto_authenc_keys keys;
        int err = -EINVAL;

        if (crypto_authenc_extractkeys(&keys, key, keylen) != 0)
                goto out;

        crypto_ahash_clear_flags(auth, CRYPTO_TFM_REQ_MASK);
        crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc) &
                                    CRYPTO_TFM_REQ_MASK);
        err = crypto_ahash_setkey(auth, keys.authkey, keys.authkeylen);
        if (err)
                goto out;

        crypto_skcipher_clear_flags(enc, CRYPTO_TFM_REQ_MASK);
        crypto_skcipher_set_flags(enc, crypto_aead_get_flags(authenc) &
                                       CRYPTO_TFM_REQ_MASK);
        err = crypto_skcipher_setkey(enc, keys.enckey, keys.enckeylen);
out:
        memzero_explicit(&keys, sizeof(keys));
        return err;
}

/*
 * Completion callback of the hash started by crypto_authenc_genicv().
 * @data is the originating aead_request.  On success the digest is written
 * into req->dst right behind the associated data and ciphertext; the AEAD
 * request is then completed with @err in either case.
 */
static void authenc_geniv_ahash_done(void *data, int err)
{
        struct aead_request *req = data;

        if (!err) {
                struct crypto_aead *tfm = crypto_aead_reqtfm(req);
                struct authenc_instance_ctx *ictx =
                        aead_instance_ctx(aead_alg_instance(tfm));
                struct authenc_request_ctx *rctx = aead_request_ctx(req);
                struct ahash_request *hreq =
                        (void *)(rctx->tail + ictx->reqoff);

                scatterwalk_map_and_copy(hreq->result, req->dst,
                                         req->assoclen + req->cryptlen,
                                         crypto_aead_authsize(tfm), 1);
        }

        aead_request_complete(req, err);
}

/*
 * Hash the associated data plus ciphertext held in req->dst and append the
 * resulting ICV to req->dst.
 *
 * @flags are the request flags given to the hash sub-request.  The digest
 * is produced at the start of the request context tail.  If the hash does
 * not finish synchronously with 0, its return code is passed straight back
 * and authenc_geniv_ahash_done() is the registered completion callback.
 */
static int crypto_authenc_genicv(struct aead_request *req, unsigned int flags)
{
        struct crypto_aead *tfm = crypto_aead_reqtfm(req);
        struct crypto_authenc_ctx *tctx = crypto_aead_ctx(tfm);
        struct authenc_instance_ctx *ictx =
                aead_instance_ctx(aead_alg_instance(tfm));
        struct authenc_request_ctx *rctx = aead_request_ctx(req);
        struct ahash_request *hreq = (void *)(rctx->tail + ictx->reqoff);
        u8 *digest = rctx->tail;
        int err;

        ahash_request_set_tfm(hreq, tctx->auth);
        ahash_request_set_crypt(hreq, req->dst, digest,
                                req->assoclen + req->cryptlen);
        ahash_request_set_callback(hreq, flags, authenc_geniv_ahash_done,
                                   req);

        err = crypto_ahash_digest(hreq);
        if (!err)
                scatterwalk_map_and_copy(digest, req->dst,
                                         req->assoclen + req->cryptlen,
                                         crypto_aead_authsize(tfm), 1);

        return err;
}

/*
 * Completion callback of the cipher started by crypto_authenc_encrypt().
 * @data is the originating aead_request.  When encryption succeeded the ICV
 * is generated (with request flags 0); the final status is then reported
 * through authenc_request_complete().
 */
static void crypto_authenc_encrypt_done(void *data, int err)
{
        struct aead_request *req = data;

        if (!err)
                err = crypto_authenc_genicv(req, 0);

        authenc_request_complete(req, err);
}

/*
 * AEAD encrypt entry point: encrypt the payload, then generate the ICV.
 *
 * The cipher operates on the scatterlists advanced past the associated
 * data.  For out-of-place requests the associated data is first copied
 * from req->src to req->dst.  A non-zero return from the cipher is passed
 * straight back; crypto_authenc_encrypt_done() is the registered completion
 * callback of the cipher sub-request.
 */
static int crypto_authenc_encrypt(struct aead_request *req)
{
        struct crypto_aead *tfm = crypto_aead_reqtfm(req);
        struct crypto_authenc_ctx *tctx = crypto_aead_ctx(tfm);
        struct authenc_instance_ctx *ictx =
                aead_instance_ctx(aead_alg_instance(tfm));
        struct authenc_request_ctx *rctx = aead_request_ctx(req);
        struct skcipher_request *creq = (void *)(rctx->tail + ictx->reqoff);
        struct scatterlist *in, *out;
        int err;

        in = scatterwalk_ffwd(rctx->src, req->src, req->assoclen);

        if (req->src == req->dst) {
                out = in;
        } else {
                memcpy_sglist(req->dst, req->src, req->assoclen);
                out = scatterwalk_ffwd(rctx->dst, req->dst, req->assoclen);
        }

        skcipher_request_set_tfm(creq, tctx->enc);
        skcipher_request_set_callback(creq, aead_request_flags(req),
                                      crypto_authenc_encrypt_done, req);
        skcipher_request_set_crypt(creq, in, out, req->cryptlen, req->iv);

        err = crypto_skcipher_encrypt(creq);

        return err ?: crypto_authenc_genicv(req, aead_request_flags(req));
}

/*
 * Second half of decryption, run once the hash over the associated data and
 * ciphertext is available: compare the computed digest with the ICV carried
 * in req->src and, if they match, start the cipher on the payload.
 *
 * @flags are the request flags handed to the cipher sub-request.
 *
 * Returns -EBADMSG if the ICV does not match, otherwise the return value of
 * crypto_skcipher_decrypt().
 */
static int crypto_authenc_decrypt_tail(struct aead_request *req,
                                       unsigned int flags)
{
        struct crypto_aead *authenc = crypto_aead_reqtfm(req);
        struct aead_instance *inst = aead_alg_instance(authenc);
        struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
        struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);
        struct authenc_request_ctx *areq_ctx = aead_request_ctx(req);
        /*
         * ahreq and skreq are computed from the same expression and thus
         * refer to the same memory: the finished hash request is read
         * first, and only afterwards is the area set up as the cipher
         * request.  Do not reorder the statements below.
         */
        struct ahash_request *ahreq = (void *)(areq_ctx->tail + ictx->reqoff);
        struct skcipher_request *skreq = (void *)(areq_ctx->tail +
                                                  ictx->reqoff);
        unsigned int authsize = crypto_aead_authsize(authenc);
        /* Scratch for the received ICV, directly behind the computed digest. */
        u8 *ihash = ahreq->result + authsize;
        struct scatterlist *src, *dst;

        /*
         * Fetch authsize bytes from req->src at offset ahreq->nbytes.
         * NOTE(review): nbytes is presumably the length that
         * crypto_authenc_decrypt() passed to ahash_request_set_crypt(),
         * i.e. assoclen + cryptlen - authsize -- confirm.
         */
        scatterwalk_map_and_copy(ihash, req->src, ahreq->nbytes, authsize, 0);

        if (crypto_memneq(ihash, ahreq->result, authsize))
                return -EBADMSG;

        /* The cipher works on the data following the associated data. */
        src = scatterwalk_ffwd(areq_ctx->src, req->src, req->assoclen);
        dst = src;

        if (req->src != req->dst)
                dst = scatterwalk_ffwd(areq_ctx->dst, req->dst, req->assoclen);

        skcipher_request_set_tfm(skreq, ctx->enc);
        /* The cipher completes directly into the AEAD caller's callback. */
        skcipher_request_set_callback(skreq, flags,
                                      req->base.complete, req->base.data);
        /* The trailing ICV is not part of the ciphertext to decrypt. */
        skcipher_request_set_crypt(skreq, src, dst,
                                   req->cryptlen - authsize, req->iv);

        return crypto_skcipher_decrypt(skreq);
}

/*
 * Completion callback of the hash started by crypto_authenc_decrypt().
 * @data is the originating aead_request.  When hashing succeeded the ICV
 * check and payload decryption are run (with request flags 0); the final
 * status is then reported through authenc_request_complete().
 */
static void authenc_verify_ahash_done(void *data, int err)
{
        struct aead_request *areq = data;

        if (!err)
                err = crypto_authenc_decrypt_tail(areq, 0);

        authenc_request_complete(areq, err);
}

/*
 * AEAD decrypt entry point: hash the associated data and ciphertext in
 * req->src (everything but the trailing ICV), then hand over to
 * crypto_authenc_decrypt_tail() for verification and decryption.
 *
 * The digest is produced at the start of the request context tail.  A
 * non-zero return from the hash is passed straight back;
 * authenc_verify_ahash_done() is the registered completion callback of the
 * hash sub-request.
 */
static int crypto_authenc_decrypt(struct aead_request *req)
{
        struct crypto_aead *tfm = crypto_aead_reqtfm(req);
        struct crypto_authenc_ctx *tctx = crypto_aead_ctx(tfm);
        struct authenc_instance_ctx *ictx =
                aead_instance_ctx(aead_alg_instance(tfm));
        struct authenc_request_ctx *rctx = aead_request_ctx(req);
        struct ahash_request *hreq = (void *)(rctx->tail + ictx->reqoff);
        unsigned int icv_len = crypto_aead_authsize(tfm);
        int err;

        ahash_request_set_tfm(hreq, tctx->auth);
        ahash_request_set_crypt(hreq, req->src, rctx->tail,
                                req->assoclen + req->cryptlen - icv_len);
        ahash_request_set_callback(hreq, aead_request_flags(req),
                                   authenc_verify_ahash_done, req);

        err = crypto_ahash_digest(hreq);

        return err ?: crypto_authenc_decrypt_tail(req,
                                                  aead_request_flags(req));
}

static int crypto_authenc_init_tfm(struct crypto_aead *tfm)
{
        struct aead_instance *inst = aead_alg_instance(tfm);
        struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);
        struct crypto_authenc_ctx *ctx = crypto_aead_ctx(tfm);
        struct crypto_ahash *auth;
        struct crypto_skcipher *enc;
        int err;

        auth = crypto_spawn_ahash(&ictx->auth);
        if (IS_ERR(auth))
                return PTR_ERR(auth);

        enc = crypto_spawn_skcipher(&ictx->enc);
        err = PTR_ERR(enc);
        if (IS_ERR(enc))
                goto err_free_ahash;

        ctx->auth = auth;
        ctx->enc = enc;

        crypto_aead_set_reqsize(
                tfm,
                sizeof(struct authenc_request_ctx) +
                ictx->reqoff +
                max_t(unsigned int,
                      crypto_ahash_reqsize(auth) +
                      sizeof(struct ahash_request),
                      sizeof(struct skcipher_request) +
                      crypto_skcipher_reqsize(enc)));

        return 0;

err_free_ahash:
        crypto_free_ahash(auth);
        return err;
}

/*
 * Transform destructor: release the hash and cipher child transforms that
 * crypto_authenc_init_tfm() stored in the transform context.
 */
static void crypto_authenc_exit_tfm(struct crypto_aead *tfm)
{
        struct crypto_authenc_ctx *tctx = crypto_aead_ctx(tfm);
        struct crypto_ahash *hash = tctx->auth;
        struct crypto_skcipher *cipher = tctx->enc;

        crypto_free_ahash(hash);
        crypto_free_skcipher(cipher);
}

/*
 * Instance destructor: drop the cipher spawn, then the hash spawn, then
 * free the instance memory.  Also used by crypto_authenc_create() to unwind
 * after a failure.
 */
static void crypto_authenc_free(struct aead_instance *inst)
{
        struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);

        crypto_drop_skcipher(&ictx->enc);
        crypto_drop_ahash(&ictx->auth);

        kfree(inst);
}

static int crypto_authenc_create(struct crypto_template *tmpl,
                                 struct rtattr **tb)
{
        u32 mask;
        struct aead_instance *inst;
        struct authenc_instance_ctx *ctx;
        struct skcipher_alg_common *enc;
        struct hash_alg_common *auth;
        struct crypto_alg *auth_base;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
        if (err)
                return err;

        inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;
        ctx = aead_instance_ctx(inst);

        err = crypto_grab_ahash(&ctx->auth, aead_crypto_instance(inst),
                                crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;
        auth = crypto_spawn_ahash_alg(&ctx->auth);
        auth_base = &auth->base;

        err = crypto_grab_skcipher(&ctx->enc, aead_crypto_instance(inst),
                                   crypto_attr_alg_name(tb[2]), 0, mask);
        if (err)
                goto err_free_inst;
        enc = crypto_spawn_skcipher_alg_common(&ctx->enc);

        ctx->reqoff = 2 * auth->digestsize;

        err = -ENAMETOOLONG;
        if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
                     "authenc(%s,%s)", auth_base->cra_name,
                     enc->base.cra_name) >=
            CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                     "authenc(%s,%s)", auth_base->cra_driver_name,
                     enc->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        inst->alg.base.cra_priority = enc->base.cra_priority * 10 +
                                      auth_base->cra_priority;
        inst->alg.base.cra_blocksize = enc->base.cra_blocksize;
        inst->alg.base.cra_alignmask = enc->base.cra_alignmask;
        inst->alg.base.cra_ctxsize = sizeof(struct crypto_authenc_ctx);

        inst->alg.ivsize = enc->ivsize;
        inst->alg.chunksize = enc->chunksize;
        inst->alg.maxauthsize = auth->digestsize;

        inst->alg.init = crypto_authenc_init_tfm;
        inst->alg.exit = crypto_authenc_exit_tfm;

        inst->alg.setkey = crypto_authenc_setkey;
        inst->alg.encrypt = crypto_authenc_encrypt;
        inst->alg.decrypt = crypto_authenc_decrypt;

        inst->free = crypto_authenc_free;

        err = aead_register_instance(tmpl, inst);
        if (err) {
err_free_inst:
                crypto_authenc_free(inst);
        }
        return err;
}

/*
 * Template descriptor for the "authenc" algorithm name. It names
 * crypto_authenc_create() as its .create hook and records this module
 * (THIS_MODULE) in .module.
 */
static struct crypto_template crypto_authenc_tmpl = {
        .name = "authenc",
        .create = crypto_authenc_create,
        .module = THIS_MODULE,
};

/*
 * Module entry point: register the "authenc" template.
 *
 * Returns whatever crypto_register_template() returns, unmodified.
 */
static int __init crypto_authenc_module_init(void)
{
        int err = crypto_register_template(&crypto_authenc_tmpl);

        return err;
}

/*
 * Module exit hook: unregister the "authenc" template that
 * crypto_authenc_module_init() registered.
 */
static void __exit crypto_authenc_module_exit(void)
{
        crypto_unregister_template(&crypto_authenc_tmpl);
}

/* Hook the init/exit functions above into module load and unload. */
module_init(crypto_authenc_module_init);
module_exit(crypto_authenc_module_exit);

/*
 * Module metadata.
 * NOTE(review): MODULE_ALIAS_CRYPTO("authenc") presumably lets the module be
 * located by its algorithm name -- confirm against the crypto API docs.
 */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Simple AEAD wrapper for IPsec");
MODULE_ALIAS_CRYPTO("authenc");


























  317 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM capability

#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CAPABILITY_H

#include <linux/cred.h>
#include <linux/tracepoint.h>
#include <linux/user_namespace.h>

/**
 * cap_capable - called after it's determined if a task has a particular
 * effective capability
 *
 * @cred: The credentials used
 * @target_ns: The user namespace of the resource being accessed
 * @capable_ns: The user namespace in which the credential provides the
 *              capability to access the targeted resource.
 *              This is recorded as NULL if ret is not 0.
 * @cap: The capability to check for
 * @ret: The return value of the check: 0 if it does, -ve if it does not
 *
 * Allows tracing of calls to cap_capable in commoncap.c.
 */
TRACE_EVENT(cap_capable,

        TP_PROTO(const struct cred *cred, struct user_namespace *target_ns,
                const struct user_namespace *capable_ns, int cap, int ret),

        TP_ARGS(cred, target_ns, capable_ns, cap, ret),

        TP_STRUCT__entry(
                __field(const struct cred *, cred)
                __field(struct user_namespace *, target_ns)
                __field(const struct user_namespace *, capable_ns)
                __field(int, cap)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->cred       = cred;
                __entry->target_ns    = target_ns;
                /* Only keep capable_ns when the check succeeded (ret == 0). */
                __entry->capable_ns = ret == 0 ? capable_ns : NULL;
                __entry->cap        = cap;
                __entry->ret        = ret;
        ),

        TP_printk("cred %p, target_ns %p, capable_ns %p, cap %d, ret %d",
                __entry->cred, __entry->target_ns, __entry->capable_ns, __entry->cap,
                __entry->ret)
);

#endif /* _TRACE_CAPABILITY_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#ifndef _LINUX_BPF_H
#define _LINUX_BPF_H 1

#include <uapi/linux/bpf.h>
#include <uapi/linux/filter.h>

#include <crypto/sha2.h>
#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/percpu.h>
#include <linux/err.h>
#include <linux/rbtree_latch.h>
#include <linux/numa.h>
#include <linux/mm_types.h>
#include <linux/wait.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/capability.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/stddef.h>
#include <linux/bpfptr.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/static_call.h>
#include <linux/memcontrol.h>
#include <linux/cfi.h>
#include <asm/rqspinlock.h>

struct bpf_verifier_env;
struct bpf_verifier_log;
struct perf_event;
struct bpf_prog;
struct bpf_prog_aux;
struct bpf_map;
struct bpf_arena;
struct sock;
struct seq_file;
struct btf;
struct btf_type;
struct exception_table_entry;
struct seq_operations;
struct bpf_iter_aux_info;
struct bpf_local_storage;
struct bpf_local_storage_map;
struct kobject;
struct mem_cgroup;
struct module;
struct bpf_func_state;
struct ftrace_ops;
struct cgroup;
struct bpf_token;
struct user_namespace;
struct super_block;
struct inode;

extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
extern struct kobject *btf_kobj;
extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
extern bool bpf_global_ma_set;

/*
 * Callback taking five u64 arguments and returning u64; it is the callback
 * type accepted by bpf_map_ops::map_for_each_callback.
 */
typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
/*
 * Init/fini hooks for a seq_file iterator's private data; they are the types
 * of bpf_iter_seq_info::init_seq_private and ::fini_seq_private.
 */
typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
                                        struct bpf_iter_aux_info *aux);
typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
/*
 * Function taking an opaque const pointer and a const bpf_insn pointer and
 * returning unsigned int.
 * NOTE(review): presumably the entry-point signature of a BPF program
 * (context, instructions) -- confirm against users of bpf_func_t.
 */
typedef unsigned int (*bpf_func_t)(const void *,
                                   const struct bpf_insn *);
/*
 * seq_file description for a bpf iterator: the seq_operations to use, optional
 * hooks to set up and tear down the private data, and a size for that data.
 */
struct bpf_iter_seq_info {
        const struct seq_operations *seq_ops;
        bpf_iter_init_seq_priv_t init_seq_private;
        bpf_iter_fini_seq_priv_t fini_seq_private;
        /* presumably the size in bytes of the private data -- TODO confirm */
        u32 seq_priv_size;
};

/* map is generic key/value storage optionally accessible by eBPF programs.
 *
 * struct bpf_map_ops is the table of function pointers (plus two data
 * pointers at the end) through which a map implementation is driven. The
 * members are grouped below by who is expected to call them.
 */
struct bpf_map_ops {
        /* funcs callable from userspace (via syscall) */
        int (*map_alloc_check)(union bpf_attr *attr);
        struct bpf_map *(*map_alloc)(union bpf_attr *attr);
        void (*map_release)(struct bpf_map *map, struct file *map_file);
        void (*map_free)(struct bpf_map *map);
        int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
        void (*map_release_uref)(struct bpf_map *map);
        void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
        int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
                                union bpf_attr __user *uattr);
        int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key,
                                          void *value, u64 flags);
        int (*map_lookup_and_delete_batch)(struct bpf_map *map,
                                           const union bpf_attr *attr,
                                           union bpf_attr __user *uattr);
        int (*map_update_batch)(struct bpf_map *map, struct file *map_file,
                                const union bpf_attr *attr,
                                union bpf_attr __user *uattr);
        int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
                                union bpf_attr __user *uattr);

        /* funcs callable from userspace and from eBPF programs */
        void *(*map_lookup_elem)(struct bpf_map *map, void *key);
        long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
        long (*map_delete_elem)(struct bpf_map *map, void *key);
        long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
        long (*map_pop_elem)(struct bpf_map *map, void *value);
        long (*map_peek_elem)(struct bpf_map *map, void *value);
        void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);
        int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf);

        /* funcs called by prog_array and perf_event_array map */
        void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
                                int fd);
        /* If need_defer is true, the implementation should guarantee that
         * the to-be-put element is still alive before the bpf program, which
         * may manipulate it, exits.
         */
        void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer);
        int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
        u32 (*map_fd_sys_lookup_elem)(void *ptr);
        void (*map_seq_show_elem)(struct bpf_map *map, void *key,
                                  struct seq_file *m);
        int (*map_check_btf)(const struct bpf_map *map,
                             const struct btf *btf,
                             const struct btf_type *key_type,
                             const struct btf_type *value_type);

        /* Prog poke tracking helpers. */
        int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
        void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
        void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
                             struct bpf_prog *new);

        /* Direct value access helpers. */
        int (*map_direct_value_addr)(const struct bpf_map *map,
                                     u64 *imm, u32 off);
        int (*map_direct_value_meta)(const struct bpf_map *map,
                                     u64 imm, u32 *off);
        int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
        __poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
                             struct poll_table_struct *pts);
        unsigned long (*map_get_unmapped_area)(struct file *filep, unsigned long addr,
                                               unsigned long len, unsigned long pgoff,
                                               unsigned long flags);

        /* Functions called by bpf_local_storage maps */
        int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
                                        void *owner, u32 size);
        void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap,
                                           void *owner, u32 size);
        struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);

        /* Misc helpers. */
        long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);

        /* map_meta_equal must be implemented for maps that can be
         * used as an inner map.  It is a runtime check to ensure
         * an inner map can be inserted to an outer map.
         *
         * Some properties of the inner map have been used during
         * verification time.  When inserting an inner map at runtime,
         * map_meta_equal has to ensure the inserting map has the same
         * properties that the verifier has used earlier.
         */
        bool (*map_meta_equal)(const struct bpf_map *meta0,
                               const struct bpf_map *meta1);


        /* Callback-iteration hooks: the first receives the verifier env and
         * the caller/callee function states, the second receives the
         * bpf_callback_t to call together with its context and flags.
         */
        int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
                                              struct bpf_func_state *caller,
                                              struct bpf_func_state *callee);
        long (*map_for_each_callback)(struct bpf_map *map,
                                     bpf_callback_t callback_fn,
                                     void *callback_ctx, u64 flags);

        /* presumably reports the map's memory footprint -- TODO confirm units */
        u64 (*map_mem_usage)(const struct bpf_map *map);

        /* BTF id of struct allocated by map_alloc */
        int *map_btf_id;

        /* bpf_iter info used to open a seq_file */
        const struct bpf_iter_seq_info *iter_seq_info;
};

/* Anonymous enum used to define an integer constant without a #define. */
enum {
        /* Support at most 11 fields in a BTF type */
        BTF_FIELDS_MAX           = 11,
};

/* Kinds of special fields that can be described by a struct btf_field.
 *
 * Each individual kind occupies its own bit so that kinds can be OR-ed
 * together into a mask (see btf_record::field_mask, which is tested with
 * '&' in btf_record_has_field()). BPF_KPTR, BPF_GRAPH_NODE and
 * BPF_GRAPH_ROOT are such composite masks, not individual kinds.
 */
enum btf_field_type {
        BPF_SPIN_LOCK  = (1 << 0),
        BPF_TIMER      = (1 << 1),
        BPF_KPTR_UNREF = (1 << 2),
        BPF_KPTR_REF   = (1 << 3),
        BPF_KPTR_PERCPU = (1 << 4),
        /* Mask covering all three kptr flavours above. */
        BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU,
        BPF_LIST_HEAD  = (1 << 5),
        BPF_LIST_NODE  = (1 << 6),
        BPF_RB_ROOT    = (1 << 7),
        BPF_RB_NODE    = (1 << 8),
        /* Masks grouping the list/rbtree node kinds and root kinds. */
        BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
        BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
        BPF_REFCOUNT   = (1 << 9),
        BPF_WORKQUEUE  = (1 << 10),
        BPF_UPTR       = (1 << 11),
        BPF_RES_SPIN_LOCK = (1 << 12),
        BPF_TASK_WORK  = (1 << 13),
};

/* Flavours of cgroup storage. The trailing __BPF_CGROUP_STORAGE_MAX
 * enumerator equals the number of real types and is exported under the
 * name MAX_BPF_CGROUP_STORAGE_TYPE (used e.g. to size per-type arrays).
 */
enum bpf_cgroup_storage_type {
        BPF_CGROUP_STORAGE_SHARED,
        BPF_CGROUP_STORAGE_PERCPU,
        __BPF_CGROUP_STORAGE_MAX
#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
};

/* Loop header that walks 'stype' over every cgroup storage type, from 0 up
 * to (but not including) MAX_BPF_CGROUP_STORAGE_TYPE. The caller supplies
 * the loop variable. Without CONFIG_CGROUP_BPF the loop condition is the
 * constant 'false', so the body never runs and 'stype' is left untouched.
 */
#ifdef CONFIG_CGROUP_BPF
# define for_each_cgroup_storage_type(stype) \
        for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
#else
# define for_each_cgroup_storage_type(stype) for (; false; )
#endif /* CONFIG_CGROUP_BPF */

/* Destructor callback: takes one untyped object pointer, returns nothing.
 * Used as the type of btf_field_kptr::dtor.
 */
typedef void (*btf_dtor_kfunc_t)(void *);

/* Extra metadata for a kptr-style field; stored in the 'kptr' member of
 * the anonymous union inside struct btf_field.
 */
struct btf_field_kptr {
        struct btf *btf;
        struct module *module;
        /* dtor used if btf_is_kernel(btf), otherwise the type is
         * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used
         */
        btf_dtor_kfunc_t dtor;
        /* presumably the BTF type id of the pointee within 'btf' -- confirm */
        u32 btf_id;
};

/* Extra metadata for a graph-root field; stored in the 'graph_root' member
 * of the anonymous union inside struct btf_field.
 * NOTE(review): judging by the names, value_btf_id/value_rec describe the
 * element type held by the collection and node_offset locates the node
 * inside that element -- confirm against the code that fills this in.
 */
struct btf_field_graph_root {
        struct btf *btf;
        u32 value_btf_id;
        u32 node_offset;
        struct btf_record *value_rec;
};

/* Describes one special field inside an object.
 *
 * @offset: byte offset of the field from the start of the object (the
 *          helpers below compute 'obj + offset').
 * @size:   number of bytes the field occupies (used as the memset length
 *          in bpf_obj_init_field() and as the skip length in
 *          bpf_obj_memcpy()/bpf_obj_memzero()).
 * @type:   which kind of field this is.
 * The anonymous union carries kind-specific metadata; only one of its
 * members is meaningful for a given field.
 */
struct btf_field {
        u32 offset;
        u32 size;
        enum btf_field_type type;
        union {
                struct btf_field_kptr kptr;
                struct btf_field_graph_root graph_root;
        };
};

/* Collection of the special fields found in one object type.
 *
 * @cnt:        number of valid entries in @fields (loop bound in the
 *              bpf_obj_*() helpers below).
 * @field_mask: OR of btf_field_type bits; tested by btf_record_has_field().
 * @*_off:      presumably the byte offset of the corresponding field kind,
 *              with a negative value meaning "not present" -- confirm
 *              against the code that builds the record.
 * @fields:     flexible array of per-field descriptors.
 */
struct btf_record {
        u32 cnt;
        u32 field_mask;
        int spin_lock_off;
        int res_spin_lock_off;
        int timer_off;
        int wq_off;
        int refcount_off;
        int task_work_off;
        struct btf_field fields[];
};

/* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h.
 * Wraps a kernel rb_node together with an 'owner' pointer; the whole
 * struct is forced to 8-byte alignment.
 */
struct bpf_rb_node_kern {
        struct rb_node rb_node;
        void *owner;
} __attribute__((aligned(8)));

/* Non-opaque version of bpf_list_node in uapi/linux/bpf.h.
 * Wraps a kernel list_head together with an 'owner' pointer; the whole
 * struct is forced to 8-byte alignment.
 */
struct bpf_list_node_kern {
        struct list_head list_head;
        void *owner;
} __attribute__((aligned(8)));

/* 'Ownership' of a program-containing map is claimed by the first program
 * that is going to use this map, or by the first program whose FD is
 * stored in the map, to make sure that all callers and callees have the
 * same prog type, JITed flag and xdp_has_frags flag.
 */
struct bpf_map_owner {
        enum bpf_prog_type type;
        bool jited;
        bool xdp_has_frags;
        /* one slot per cgroup storage type */
        u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE];
        const struct btf_type *attach_func_proto;
        enum bpf_attach_type expected_attach_type;
};

/* Generic, map-type independent part of a BPF map.
 *
 * Type-specific map structures embed this one (see struct
 * bpf_offloaded_map below, which recovers itself via container_of()).
 * Field order is part of the layout; do not reorder casually.
 */
struct bpf_map {
        u8 sha[SHA256_DIGEST_SIZE];
        /* per-map-type operations table */
        const struct bpf_map_ops *ops;
        struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
        void *security;
#endif
        enum bpf_map_type map_type;
        u32 key_size;
        u32 value_size;
        u32 max_entries;
        u64 map_extra; /* any per-map-type extra fields */
        u32 map_flags;
        u32 id;
        /* special fields of the value; consumed by copy_map_value(),
         * zero_map_value() and check_and_init_map_value()
         */
        struct btf_record *record;
        int numa_node;
        u32 btf_key_type_id;
        u32 btf_value_type_id;
        u32 btf_vmlinux_value_type_id;
        struct btf *btf;
#ifdef CONFIG_MEMCG
        struct obj_cgroup *objcg;
#endif
        char name[BPF_OBJ_NAME_LEN];
        struct mutex freeze_mutex;
        atomic64_t refcnt;
        atomic64_t usercnt;
        /* rcu is used before freeing and work is only used during freeing */
        union {
                struct work_struct work;
                struct rcu_head rcu;
        };
        atomic64_t writecnt;
        /* NOTE(review): presumably owner_lock guards 'owner' -- confirm */
        spinlock_t owner_lock;
        struct bpf_map_owner *owner;
        bool bypass_spec_v1;
        bool frozen; /* write-once; write-protected by freeze_mutex */
        bool free_after_mult_rcu_gp;
        bool free_after_rcu_gp;
        atomic64_t sleepable_refcnt;
        s64 __percpu *elem_count;
        u64 cookie; /* write-once */
        char *excl_prog_sha;
};

/* Map a single btf_field_type kind to a printable name.
 *
 * BPF_KPTR_UNREF and BPF_KPTR_REF share the name "kptr". Any value that is
 * not one of the individual kinds listed here (composite masks such as
 * BPF_KPTR included) triggers WARN_ON_ONCE() and yields "unknown".
 */
static inline const char *btf_field_type_name(enum btf_field_type type)
{
        const char *name;

        switch (type) {
        case BPF_SPIN_LOCK:
                name = "bpf_spin_lock";
                break;
        case BPF_TIMER:
                name = "bpf_timer";
                break;
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
                name = "kptr";
                break;
        case BPF_KPTR_PERCPU:
                name = "percpu_kptr";
                break;
        case BPF_LIST_HEAD:
                name = "bpf_list_head";
                break;
        case BPF_LIST_NODE:
                name = "bpf_list_node";
                break;
        case BPF_RB_ROOT:
                name = "bpf_rb_root";
                break;
        case BPF_RB_NODE:
                name = "bpf_rb_node";
                break;
        case BPF_REFCOUNT:
                name = "bpf_refcount";
                break;
        case BPF_WORKQUEUE:
                name = "bpf_wq";
                break;
        case BPF_UPTR:
                name = "uptr";
                break;
        case BPF_RES_SPIN_LOCK:
                name = "bpf_res_spin_lock";
                break;
        case BPF_TASK_WORK:
                name = "bpf_task_work";
                break;
        default:
                WARN_ON_ONCE(1);
                name = "unknown";
                break;
        }

        return name;
}

/* BPF_WARN_ONCE(cond, fmt...): with CONFIG_DEBUG_KERNEL it is WARN_ONCE().
 * Otherwise it expands to BUILD_BUG_ON_INVALID(cond) and the format
 * arguments are dropped.
 * NOTE(review): BUILD_BUG_ON_INVALID presumably only type-checks 'cond'
 * without evaluating it, so callers should not rely on the macro's value
 * or on side effects of 'cond' -- confirm against its definition.
 */
#if IS_ENABLED(CONFIG_DEBUG_KERNEL)
#define BPF_WARN_ONCE(cond, format...) WARN_ONCE(cond, format)
#else
#define BPF_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
#endif

/* Return the size in bytes of the object backing a single field kind.
 *
 * All kptr flavours and BPF_UPTR are sizeof(u64). Values that are not one
 * of the individual kinds handled here trigger WARN_ON_ONCE() and return 0.
 */
static inline u32 btf_field_type_size(enum btf_field_type type)
{
        u32 bytes;

        switch (type) {
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
        case BPF_KPTR_PERCPU:
        case BPF_UPTR:
                bytes = sizeof(u64);
                break;
        case BPF_SPIN_LOCK:
                bytes = sizeof(struct bpf_spin_lock);
                break;
        case BPF_RES_SPIN_LOCK:
                bytes = sizeof(struct bpf_res_spin_lock);
                break;
        case BPF_TIMER:
                bytes = sizeof(struct bpf_timer);
                break;
        case BPF_WORKQUEUE:
                bytes = sizeof(struct bpf_wq);
                break;
        case BPF_TASK_WORK:
                bytes = sizeof(struct bpf_task_work);
                break;
        case BPF_LIST_HEAD:
                bytes = sizeof(struct bpf_list_head);
                break;
        case BPF_LIST_NODE:
                bytes = sizeof(struct bpf_list_node);
                break;
        case BPF_RB_ROOT:
                bytes = sizeof(struct bpf_rb_root);
                break;
        case BPF_RB_NODE:
                bytes = sizeof(struct bpf_rb_node);
                break;
        case BPF_REFCOUNT:
                bytes = sizeof(struct bpf_refcount);
                break;
        default:
                WARN_ON_ONCE(1);
                bytes = 0;
                break;
        }

        return bytes;
}

/* Return the required alignment of the object backing a single field kind.
 *
 * All kptr flavours and BPF_UPTR use the alignment of u64. Values that are
 * not one of the individual kinds handled here trigger WARN_ON_ONCE() and
 * return 0.
 */
static inline u32 btf_field_type_align(enum btf_field_type type)
{
        u32 align;

        switch (type) {
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
        case BPF_KPTR_PERCPU:
        case BPF_UPTR:
                align = __alignof__(u64);
                break;
        case BPF_SPIN_LOCK:
                align = __alignof__(struct bpf_spin_lock);
                break;
        case BPF_RES_SPIN_LOCK:
                align = __alignof__(struct bpf_res_spin_lock);
                break;
        case BPF_TIMER:
                align = __alignof__(struct bpf_timer);
                break;
        case BPF_WORKQUEUE:
                align = __alignof__(struct bpf_wq);
                break;
        case BPF_TASK_WORK:
                align = __alignof__(struct bpf_task_work);
                break;
        case BPF_LIST_HEAD:
                align = __alignof__(struct bpf_list_head);
                break;
        case BPF_LIST_NODE:
                align = __alignof__(struct bpf_list_node);
                break;
        case BPF_RB_ROOT:
                align = __alignof__(struct bpf_rb_root);
                break;
        case BPF_RB_NODE:
                align = __alignof__(struct bpf_rb_node);
                break;
        case BPF_REFCOUNT:
                align = __alignof__(struct bpf_refcount);
                break;
        default:
                WARN_ON_ONCE(1);
                align = 0;
                break;
        }

        return align;
}

/* Put one special field into its initial state.
 *
 * @field: descriptor giving the kind and size of the field.
 * @addr:  address of the field itself (not of the enclosing object).
 *
 * The field is always zeroed first. Refcounts are then set to 1, rb nodes
 * are cleared with RB_CLEAR_NODE() and list heads/nodes are set up with
 * INIT_LIST_HEAD(); for every other known kind the zeroed bytes are left
 * as they are. An unknown kind triggers WARN_ON_ONCE() (the memory has
 * already been zeroed at that point).
 */
static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
{
        memset(addr, 0, field->size);

        switch (field->type) {
        case BPF_SPIN_LOCK:
        case BPF_RES_SPIN_LOCK:
        case BPF_TIMER:
        case BPF_WORKQUEUE:
        case BPF_TASK_WORK:
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
        case BPF_KPTR_PERCPU:
        case BPF_UPTR:
        case BPF_RB_ROOT:
                /* Nothing beyond the memset is needed; for BPF_RB_ROOT in
                 * particular, RB_ROOT_CACHED is a zero-initialisation.
                 */
                return;
        case BPF_LIST_HEAD:
        case BPF_LIST_NODE:
                INIT_LIST_HEAD((struct list_head *)addr);
                return;
        case BPF_RB_NODE:
                RB_CLEAR_NODE((struct rb_node *)addr);
                return;
        case BPF_REFCOUNT:
                refcount_set((refcount_t *)addr, 1);
                return;
        default:
                WARN_ON_ONCE(1);
                return;
        }
}

/* Tell whether @rec contains at least one field whose kind is in @type.
 *
 * @type may be a single kind or an OR of several kinds; any overlap with
 * rec->field_mask counts. A NULL or ERR_PTR record has no fields.
 */
static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type)
{
        return !IS_ERR_OR_NULL(rec) && (rec->field_mask & type) != 0;
}

static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
{
        int i;

        if (IS_ERR_OR_NULL(rec))
                return;
        for (i = 0; i < rec->cnt; i++)
                bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset);
}

/* Re-initialise the special fields of a map value, as described by
 * map->record (no-op when the map has no record).
 *
 * 'dst' must be a temporary buffer and should not point to memory that is being
 * used in parallel by a bpf program or bpf syscall, otherwise the access from
 * the bpf program or bpf syscall may be corrupted by the reinitialization,
 * leading to weird problems. Even if 'dst' is newly allocated from the bpf
 * memory allocator, it is still possible for 'dst' to be used in parallel by
 * a bpf program or bpf syscall.
 */
static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
{
        bpf_obj_init(map->record, dst);
}

/* memcpy variant that moves data one 'long' at a time, to try to copy long
 * counters atomically. Intended for 8-byte aligned pointers. @size is given
 * in bytes and is divided by sizeof(long), so any trailing bytes that do
 * not make up a whole 'long' are not copied.
 * Best-effort only.  No barriers here, since it _will_ race with concurrent
 * updates from BPF programs (each element copy is wrapped in data_race()).
 * Called from bpf syscall and mostly used with size 8 or 16 bytes, so ask
 * compiler to inline it.
 */
static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
{
        const long *from = src;
        long *to = dst;
        u32 nr_longs = size / sizeof(long);
        u32 i;

        for (i = 0; i < nr_longs; i++)
                data_race(to[i] = from[i]);
}

/* Copy @size bytes from @src to @dst, skipping over every special field
 * listed in @rec (the bytes of those fields in @dst are left untouched).
 *
 * Without a record (NULL or ERR_PTR) this is a plain copy: memcpy(), or
 * bpf_long_memcpy() over round_up(size, 8) bytes when @long_memcpy is set.
 * With a record, @long_memcpy is not consulted and the gaps between fields
 * are copied with memcpy().
 *
 * NOTE(review): the gap computation presumably relies on rec->fields being
 * sorted by ascending, non-overlapping offset -- confirm with the code that
 * builds the record.
 */
static inline void bpf_obj_memcpy(struct btf_record *rec,
                                  void *dst, void *src, u32 size,
                                  bool long_memcpy)
{
        const struct btf_field *field;
        u32 pos = 0;
        int idx;

        if (IS_ERR_OR_NULL(rec)) {
                if (!long_memcpy)
                        memcpy(dst, src, size);
                else
                        bpf_long_memcpy(dst, src, round_up(size, 8));
                return;
        }

        for (idx = 0, field = rec->fields; idx < rec->cnt; idx++, field++) {
                /* plain bytes in front of this field */
                memcpy(dst + pos, src + pos, field->offset - pos);
                /* resume right after the field */
                pos = field->offset + field->size;
        }

        /* tail after the last field */
        memcpy(dst + pos, src + pos, size - pos);
}

/* Copy one map value (map->value_size bytes) from @src to @dst, skipping the
 * special fields described by map->record. Uses the plain memcpy() path of
 * bpf_obj_memcpy().
 */
static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
{
        bpf_obj_memcpy(map->record, dst, src, map->value_size, false);
}

/* Same as copy_map_value() but asks bpf_obj_memcpy() for the 'long'-wise
 * copy. Note that bpf_obj_memcpy() only honours that request when the map
 * has no record; in that case round_up(value_size, 8) bytes are copied.
 */
static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src)
{
        bpf_obj_memcpy(map->record, dst, src, map->value_size, true);
}

/* Exchange the contents of every BPF_UPTR field between @src and @dst.
 *
 * Both objects are laid out according to @rec; each uptr slot is accessed
 * as an unsigned long at its field offset. Returns early when @rec has no
 * BPF_UPTR field (which also covers a NULL or ERR_PTR record).
 */
static inline void bpf_obj_swap_uptrs(const struct btf_record *rec, void *dst, void *src)
{
        int idx;

        if (!btf_record_has_field(rec, BPF_UPTR))
                return;

        for (idx = 0; idx < rec->cnt; idx++) {
                const struct btf_field *f = &rec->fields[idx];
                unsigned long *from, *to;

                if (f->type == BPF_UPTR) {
                        from = src + f->offset;
                        to = dst + f->offset;
                        swap(*from, *to);
                }
        }
}

/* Zero @size bytes at @dst, except for the special fields listed in @rec,
 * whose bytes are left untouched. Without a record (NULL or ERR_PTR) the
 * whole range is cleared.
 *
 * NOTE(review): like bpf_obj_memcpy(), this presumably relies on
 * rec->fields being sorted by ascending, non-overlapping offset -- confirm.
 */
static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size)
{
        const struct btf_field *field;
        u32 pos = 0;
        int idx;

        if (IS_ERR_OR_NULL(rec)) {
                memset(dst, 0, size);
                return;
        }

        for (idx = 0, field = rec->fields; idx < rec->cnt; idx++, field++) {
                /* clear the plain bytes in front of this field */
                memset(dst + pos, 0, field->offset - pos);
                /* resume right after the field */
                pos = field->offset + field->size;
        }

        /* tail after the last field */
        memset(dst + pos, 0, size - pos);
}

/* Zero one map value (map->value_size bytes) at @dst while leaving the
 * special fields described by map->record untouched.
 */
static inline void zero_map_value(struct bpf_map *map, void *dst)
{
        bpf_obj_memzero(map->record, dst, map->value_size);
}

/* Out-of-line helpers implemented elsewhere; only their prototypes are
 * visible here.
 * NOTE(review): bpf_wq_cancel_and_free() and bpf_task_work_cancel_and_free()
 * name their parameter 'timer', which looks like a leftover from
 * bpf_timer_cancel_and_free(); presumably they take a pointer to the wq /
 * task_work field respectively -- confirm against the definitions.
 */
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
                           bool lock_src);
void bpf_timer_cancel_and_free(void *timer);
void bpf_wq_cancel_and_free(void *timer);
void bpf_task_work_cancel_and_free(void *timer);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
                        struct bpf_spin_lock *spin_lock);
void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
                      struct bpf_spin_lock *spin_lock);
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);

struct bpf_offload_dev;
struct bpf_offloaded_map;

/* Element-level operations for an offloaded map; referenced through
 * bpf_offloaded_map::dev_ops. Every callback receives the offloaded map
 * and returns an int.
 * NOTE(review): presumably 0 on success and a negative errno on failure --
 * confirm with the implementers.
 */
struct bpf_map_dev_ops {
        int (*map_get_next_key)(struct bpf_offloaded_map *map,
                                void *key, void *next_key);
        int (*map_lookup_elem)(struct bpf_offloaded_map *map,
                               void *key, void *value);
        int (*map_update_elem)(struct bpf_offloaded_map *map,
                               void *key, void *value, u64 flags);
        int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key);
};

/* A map that is offloaded to a device. Embeds the generic struct bpf_map
 * (map_to_offmap() converts back from a pointer to that member) and adds
 * the owning net_device, the device operations and a private pointer.
 */
struct bpf_offloaded_map {
        struct bpf_map map;
        struct net_device *netdev;
        const struct bpf_map_dev_ops *dev_ops;
        void *dev_priv;
        /* list linkage; NOTE(review): which list this is on is not visible here */
        struct list_head offloads;
};

/* Convert a pointer to the embedded 'map' member back to its enclosing
 * struct bpf_offloaded_map. Only meaningful when @map really is the 'map'
 * member of a bpf_offloaded_map; no check is performed here.
 */
static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
{
        return container_of(map, struct bpf_offloaded_map, map);
}

/* Return true iff @map is a BPF_MAP_TYPE_PERF_EVENT_ARRAY map, the only
 * map type this helper treats as "offload neutral".
 */
static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
{
        return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY;
}

/* Return true when @map can print its elements through seq_file: the map
 * must carry a BTF value type id (regular or vmlinux) and its ops must
 * provide a map_seq_show_elem callback. map->ops is only dereferenced when
 * one of the type ids is set.
 */
static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
{
        if (!map->btf_value_type_id && !map->btf_vmlinux_value_type_id)
                return false;

        return map->ops->map_seq_show_elem != NULL;
}

/* Out-of-line map helpers implemented elsewhere; only prototypes are
 * visible here.
 * NOTE(review): by their names and signatures, map_check_no_btf() looks
 * like a map_check_btf-style callback for maps that do not support BTF,
 * and bpf_map_meta_equal() like a generic map_meta_equal implementation
 * -- confirm against the definitions.
 */
int map_check_no_btf(const struct bpf_map *map,
                     const struct btf *btf,
                     const struct btf_type *key_type,
                     const struct btf_type *value_type);

bool bpf_map_meta_equal(const struct bpf_map *meta0,
                        const struct bpf_map *meta1);

/* bpf_map_ops instance for offloaded maps (name-based; defined elsewhere) */
extern const struct bpf_map_ops bpf_map_offload_ops;

/* bpf_type_flag contains a set of flags that are applicable to the values of
 * arg_type, ret_type and reg_type. For example, a pointer value may be null,
 * or a memory is read-only. We classify types into two categories: base types
 * and extended types. Extended types are base types combined with a type flag.
 *
 * Currently there are no more than 32 base types in arg_type, ret_type and
 * reg_types.
 *
 * BPF_BASE_TYPE_BITS is the number of low bits reserved for the base type;
 * every bpf_type_flag is defined as BIT(n + BPF_BASE_TYPE_BITS), i.e. above
 * that range, so 8 bits leave room for up to 256 base types.
 */
#define BPF_BASE_TYPE_BITS        8

/* Modifier flags that are OR-ed onto a base arg/ret/reg type. Each flag is
 * a single bit located above the low BPF_BASE_TYPE_BITS bits.
 */
enum bpf_type_flag {
        /* PTR may be NULL. */
        PTR_MAYBE_NULL                = BIT(0 + BPF_BASE_TYPE_BITS),

        /* MEM is read-only. When applied on bpf_arg, it indicates the arg is
         * compatible with both mutable and immutable memory.
         */
        MEM_RDONLY                = BIT(1 + BPF_BASE_TYPE_BITS),

        /* MEM points to BPF ring buffer reservation. */
        MEM_RINGBUF                = BIT(2 + BPF_BASE_TYPE_BITS),

        /* MEM is in user address space. */
        MEM_USER                = BIT(3 + BPF_BASE_TYPE_BITS),

        /* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged
         * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In
         * order to drop this tag, it must be passed into bpf_per_cpu_ptr()
         * or bpf_this_cpu_ptr(), which will return the pointer corresponding
         * to the specified cpu.
         */
        MEM_PERCPU                = BIT(4 + BPF_BASE_TYPE_BITS),

        /* Indicates that the argument will be released. */
        OBJ_RELEASE                = BIT(5 + BPF_BASE_TYPE_BITS),

        /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark
         * unreferenced and referenced kptr loaded from map value using a load
         * instruction, so that they can only be dereferenced but not escape the
         * BPF program into the kernel (i.e. cannot be passed as arguments to
         * kfunc or bpf helpers).
         */
        PTR_UNTRUSTED                = BIT(6 + BPF_BASE_TYPE_BITS),

        /* MEM can be uninitialized. */
        MEM_UNINIT                = BIT(7 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to memory local to the bpf program. */
        DYNPTR_TYPE_LOCAL        = BIT(8 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to a kernel-produced ringbuf record. */
        DYNPTR_TYPE_RINGBUF        = BIT(9 + BPF_BASE_TYPE_BITS),

        /* Size is known at compile time. */
        MEM_FIXED_SIZE                = BIT(10 + BPF_BASE_TYPE_BITS),

        /* MEM is of an allocated object of type in program BTF. This is used to
         * tag PTR_TO_BTF_ID allocated using bpf_obj_new.
         */
        MEM_ALLOC                = BIT(11 + BPF_BASE_TYPE_BITS),

        /* PTR was passed from the kernel in a trusted context, and may be
         * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions.
         * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above.
         * PTR_UNTRUSTED refers to a kptr that was read directly from a map
         * without invoking bpf_kptr_xchg(). What we really need to know is
         * whether a pointer is safe to pass to a kfunc or BPF helper function.
         * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF
         * helpers, they do not cover all possible instances of unsafe
         * pointers. For example, a pointer that was obtained from walking a
         * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the
         * fact that it may be NULL, invalid, etc. This is due to backwards
         * compatibility requirements, as this was the behavior that was first
         * introduced when kptrs were added. The behavior is now considered
         * deprecated, and PTR_UNTRUSTED will eventually be removed.
         *
         * PTR_TRUSTED, on the other hand, is a pointer that the kernel
         * guarantees to be valid and safe to pass to kfuncs and BPF helpers.
         * For example, pointers passed to tracepoint arguments are considered
         * PTR_TRUSTED, as are pointers that are passed to struct_ops
         * callbacks. As alluded to above, pointers that are obtained from
         * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a
         * struct task_struct *task is PTR_TRUSTED, then accessing
         * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored
         * in a BPF register. Similarly, pointers passed to certain program
         * types such as kretprobes are not guaranteed to be valid, as they may
         * for example contain an object that was recently freed.
         */
        PTR_TRUSTED                = BIT(12 + BPF_BASE_TYPE_BITS),

        /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */
        MEM_RCU                        = BIT(13 + BPF_BASE_TYPE_BITS),

        /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
         * Currently only valid for linked-list and rbtree nodes. If the nodes
         * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
         */
        NON_OWN_REF                = BIT(14 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to sk_buff */
        DYNPTR_TYPE_SKB                = BIT(15 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to xdp_buff */
        DYNPTR_TYPE_XDP                = BIT(16 + BPF_BASE_TYPE_BITS),

        /* Memory must be aligned on some architectures, used in combination with
         * MEM_FIXED_SIZE.
         */
        MEM_ALIGNED                = BIT(17 + BPF_BASE_TYPE_BITS),

        /* MEM is being written to, often combined with MEM_UNINIT. Non-presence
         * of MEM_WRITE means that MEM is only being read. MEM_WRITE without the
         * MEM_UNINIT means that memory needs to be initialized since it is also
         * read.
         */
        MEM_WRITE                = BIT(18 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */
        DYNPTR_TYPE_SKB_META        = BIT(19 + BPF_BASE_TYPE_BITS),

        /* __BPF_TYPE_FLAG_MAX has no explicit value, so it is the previous
         * enumerator plus one; subtracting one again makes
         * __BPF_TYPE_LAST_FLAG equal to the highest flag defined above.
         * New flags must therefore be added before these two lines.
         */
        __BPF_TYPE_FLAG_MAX,
        __BPF_TYPE_LAST_FLAG        = __BPF_TYPE_FLAG_MAX - 1,
};

/* Union of all DYNPTR_TYPE_* flags. */
#define DYNPTR_TYPE_FLAG_MASK        (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
                                 | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)

/* Max number of base types. */
#define BPF_BASE_TYPE_LIMIT        (1UL << BPF_BASE_TYPE_BITS)

/* Max number of all types. Since __BPF_TYPE_LAST_FLAG is a single bit, this
 * evaluates to a value with that bit and every lower bit set.
 */
#define BPF_TYPE_LIMIT                (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1))

/* function argument constraints
 *
 * Base types come first and are numbered sequentially from 0 up to
 * __BPF_ARG_TYPE_MAX; extended types are a base type OR-ed with one or more
 * bpf_type_flag bits.
 */
enum bpf_arg_type {
        ARG_DONTCARE = 0,        /* unused argument in helper function */

        /* the following constraints used to prototype
         * bpf_map_lookup/update/delete_elem() functions
         */
        ARG_CONST_MAP_PTR,        /* const argument used as pointer to bpf_map */
        ARG_PTR_TO_MAP_KEY,        /* pointer to stack used as map key */
        ARG_PTR_TO_MAP_VALUE,        /* pointer to stack used as map value */

        /* Used to prototype bpf_memcmp() and other functions that access data
         * on eBPF program stack
         */
        ARG_PTR_TO_MEM,                /* pointer to valid memory (stack, packet, map value) */
        ARG_PTR_TO_ARENA,

        ARG_CONST_SIZE,                /* number of bytes accessed from memory */
        ARG_CONST_SIZE_OR_ZERO,        /* number of bytes accessed from memory or 0 */

        ARG_PTR_TO_CTX,                /* pointer to context */
        ARG_ANYTHING,                /* any (initialized) argument is ok */
        ARG_PTR_TO_SPIN_LOCK,        /* pointer to bpf_spin_lock */
        ARG_PTR_TO_SOCK_COMMON,        /* pointer to sock_common */
        ARG_PTR_TO_SOCKET,        /* pointer to bpf_sock (fullsock) */
        ARG_PTR_TO_BTF_ID,        /* pointer to in-kernel struct */
        ARG_PTR_TO_RINGBUF_MEM,        /* pointer to dynamically reserved ringbuf memory */
        ARG_CONST_ALLOC_SIZE_OR_ZERO,        /* number of allocated bytes requested */
        ARG_PTR_TO_BTF_ID_SOCK_COMMON,        /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
        ARG_PTR_TO_PERCPU_BTF_ID,        /* pointer to in-kernel percpu type */
        ARG_PTR_TO_FUNC,        /* pointer to a bpf program function */
        ARG_PTR_TO_STACK,        /* pointer to stack */
        ARG_PTR_TO_CONST_STR,        /* pointer to a null terminated read-only string */
        ARG_PTR_TO_TIMER,        /* pointer to bpf_timer */
        ARG_KPTR_XCHG_DEST,        /* pointer to destination that kptrs are bpf_kptr_xchg'd into */
        ARG_PTR_TO_DYNPTR,      /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
        /* Number of base arg types; must stay after the last base type. */
        __BPF_ARG_TYPE_MAX,

        /* Extended arg_types. */
        ARG_PTR_TO_MAP_VALUE_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE,
        ARG_PTR_TO_MEM_OR_NULL                = PTR_MAYBE_NULL | ARG_PTR_TO_MEM,
        ARG_PTR_TO_CTX_OR_NULL                = PTR_MAYBE_NULL | ARG_PTR_TO_CTX,
        ARG_PTR_TO_SOCKET_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET,
        ARG_PTR_TO_STACK_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_STACK,
        ARG_PTR_TO_BTF_ID_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID,
        /* Pointer to memory does not need to be initialized, since helper function
         * fills all bytes or clears them in error case.
         */
        ARG_PTR_TO_UNINIT_MEM                = MEM_UNINIT | MEM_WRITE | ARG_PTR_TO_MEM,
        /* Pointer to valid memory of size known at compile time. */
        ARG_PTR_TO_FIXED_SIZE_MEM        = MEM_FIXED_SIZE | ARG_PTR_TO_MEM,

        /* This must be the last entry. Its purpose is to ensure the enum is
         * wide enough to hold the higher bits reserved for bpf_type_flag.
         */
        __BPF_ARG_TYPE_LIMIT        = BPF_TYPE_LIMIT,
};
/* Base arg types must fit below the first bpf_type_flag bit. */
static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);

/* type of values returned from helper functions
 *
 * Base types are numbered sequentially from 0 up to __BPF_RET_TYPE_MAX;
 * extended types are a base type OR-ed with one or more bpf_type_flag bits.
 */
enum bpf_return_type {
        RET_INTEGER,                        /* function returns integer */
        RET_VOID,                        /* function doesn't return anything */
        RET_PTR_TO_MAP_VALUE,                /* returns a pointer to map elem value */
        RET_PTR_TO_SOCKET,                /* returns a pointer to a socket */
        RET_PTR_TO_TCP_SOCK,                /* returns a pointer to a tcp_sock */
        RET_PTR_TO_SOCK_COMMON,                /* returns a pointer to a sock_common */
        RET_PTR_TO_MEM,                        /* returns a pointer to memory */
        RET_PTR_TO_MEM_OR_BTF_ID,        /* returns a pointer to a valid memory or a btf_id */
        RET_PTR_TO_BTF_ID,                /* returns a pointer to a btf_id */
        /* Number of base return types; must stay after the last base type. */
        __BPF_RET_TYPE_MAX,

        /* Extended ret_types. */
        RET_PTR_TO_MAP_VALUE_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE,
        RET_PTR_TO_SOCKET_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET,
        RET_PTR_TO_TCP_SOCK_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK,
        RET_PTR_TO_SOCK_COMMON_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON,
        RET_PTR_TO_RINGBUF_MEM_OR_NULL        = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM,
        /* Note: same value as a plain "PTR_TO_MEM or NULL"; no dynptr flag is set. */
        RET_PTR_TO_DYNPTR_MEM_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_MEM,
        RET_PTR_TO_BTF_ID_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
        RET_PTR_TO_BTF_ID_TRUSTED        = PTR_TRUSTED         | RET_PTR_TO_BTF_ID,

        /* This must be the last entry. Its purpose is to ensure the enum is
         * wide enough to hold the higher bits reserved for bpf_type_flag.
         */
        __BPF_RET_TYPE_LIMIT        = BPF_TYPE_LIMIT,
};
/* Base return types must fit below the first bpf_type_flag bit. */
static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);

/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
 * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
 * instructions after verifying
 */
struct bpf_func_proto {
        /* the helper itself: five u64 arguments in, u64 out */
        u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
        bool gpl_only;
        bool pkt_access;
        bool might_sleep;
        /* set to true if helper follows contract for llvm
         * attribute bpf_fastcall:
         * - void functions do not scratch r0
         * - functions taking N arguments scratch only registers r1-rN
         */
        bool allow_fastcall;
        enum bpf_return_type ret_type;
        /* argN_type and arg_type[N-1] are two names for the same storage */
        union {
                struct {
                        enum bpf_arg_type arg1_type;
                        enum bpf_arg_type arg2_type;
                        enum bpf_arg_type arg3_type;
                        enum bpf_arg_type arg4_type;
                        enum bpf_arg_type arg5_type;
                };
                enum bpf_arg_type arg_type[5];
        };
        /* Per-argument extra info. The btf_id pointers and the sizes share
         * the same union storage, so only one interpretation is valid for a
         * union as a whole at any time.
         * NOTE(review): which one applies presumably depends on the
         * argument's arg_type -- confirm against the verifier.
         */
        union {
                struct {
                        u32 *arg1_btf_id;
                        u32 *arg2_btf_id;
                        u32 *arg3_btf_id;
                        u32 *arg4_btf_id;
                        u32 *arg5_btf_id;
                };
                u32 *arg_btf_id[5];
                struct {
                        size_t arg1_size;
                        size_t arg2_size;
                        size_t arg3_size;
                        size_t arg4_size;
                        size_t arg5_size;
                };
                size_t arg_size[5];
        };
        int *ret_btf_id; /* return value btf_id */
        /* optional-looking per-program filter; NOTE(review): semantics of a
         * NULL pointer here are not visible in this file chunk
         */
        bool (*allowed)(const struct bpf_prog *prog);
};

/* bpf_context is an intentionally undefined structure. A pointer to
 * bpf_context is the first argument to eBPF programs.
 * For socket filters: 'struct bpf_context *' == 'struct sk_buff *'
 */
struct bpf_context;

/* Direction of a memory access; passed e.g. to
 * bpf_verifier_ops::is_valid_access() as 'type'.
 */
enum bpf_access_type {
        BPF_READ = 1,
        BPF_WRITE = 2
};

/* types of values stored in eBPF registers */
/* Pointer types represent:
 * pointer
 * pointer + imm
 * pointer + (u16) var
 * pointer + (u16) var + imm
 * if (range > 0) then [ptr, ptr + range - off) is safe to access
 * if (id > 0) means that some 'var' was added
 * if (off > 0) means that 'imm' was added
 *
 * Base types are numbered sequentially from 0 up to __BPF_REG_TYPE_MAX;
 * extended types are a base type OR-ed with one or more bpf_type_flag bits.
 */
enum bpf_reg_type {
        NOT_INIT = 0,                 /* nothing was written into register */
        SCALAR_VALUE,                 /* reg doesn't contain a valid pointer */
        PTR_TO_CTX,                 /* reg points to bpf_context */
        CONST_PTR_TO_MAP,         /* reg points to struct bpf_map */
        PTR_TO_MAP_VALUE,         /* reg points to map element value */
        PTR_TO_MAP_KEY,                 /* reg points to a map element key */
        PTR_TO_STACK,                 /* reg == frame_pointer + offset */
        PTR_TO_PACKET_META,         /* skb->data - meta_len */
        PTR_TO_PACKET,                 /* reg points to skb->data */
        PTR_TO_PACKET_END,         /* skb->data + headlen */
        PTR_TO_FLOW_KEYS,         /* reg points to bpf_flow_keys */
        PTR_TO_SOCKET,                 /* reg points to struct bpf_sock */
        PTR_TO_SOCK_COMMON,         /* reg points to sock_common */
        PTR_TO_TCP_SOCK,         /* reg points to struct tcp_sock */
        PTR_TO_TP_BUFFER,         /* reg points to a writable raw tp's buffer */
        PTR_TO_XDP_SOCK,         /* reg points to struct xdp_sock */
        /* PTR_TO_BTF_ID points to a kernel struct that does not need
         * to be null checked by the BPF program. This does not imply the
         * pointer is _not_ null and in practice this can easily be a null
         * pointer when reading pointer chains. The assumption is program
         * context will handle null pointer dereference typically via fault
         * handling. The verifier must keep this in mind and can make no
         * assumptions about null or non-null when doing branch analysis.
         * Further, when passed into helpers the helpers can not, without
         * additional context, assume the value is non-null.
         */
        PTR_TO_BTF_ID,
        PTR_TO_MEM,                 /* reg points to valid memory region */
        PTR_TO_ARENA,
        PTR_TO_BUF,                 /* reg points to a read/write buffer */
        PTR_TO_FUNC,                 /* reg points to a bpf program function */
        CONST_PTR_TO_DYNPTR,         /* reg points to a const struct bpf_dynptr */
        /* Number of base reg types; must stay after the last base type. */
        __BPF_REG_TYPE_MAX,

        /* Extended reg_types. */
        PTR_TO_MAP_VALUE_OR_NULL        = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE,
        PTR_TO_SOCKET_OR_NULL                = PTR_MAYBE_NULL | PTR_TO_SOCKET,
        PTR_TO_SOCK_COMMON_OR_NULL        = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
        PTR_TO_TCP_SOCK_OR_NULL                = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
        /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not
         * been checked for null. Used primarily to inform the verifier
         * an explicit null check is required for this struct.
         */
        PTR_TO_BTF_ID_OR_NULL                = PTR_MAYBE_NULL | PTR_TO_BTF_ID,

        /* This must be the last entry. Its purpose is to ensure the enum is
         * wide enough to hold the higher bits reserved for bpf_type_flag.
         */
        __BPF_REG_TYPE_LIMIT        = BPF_TYPE_LIMIT,
};
/* Base reg types must fit below the first bpf_type_flag bit. */
static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);

/* The information passed from prog-specific *_is_valid_access
 * back to the verifier.
 */
struct bpf_insn_access_aux {
        enum bpf_reg_type reg_type;
        bool is_ldsx;
        /* ctx_field_size shares storage with the btf/btf_id/ref_obj_id
         * triple: writing one interpretation overwrites the other.
         * NOTE(review): which one is live presumably depends on reg_type --
         * confirm against the verifier.
         */
        union {
                int ctx_field_size;
                struct {
                        struct btf *btf;
                        u32 btf_id;
                        u32 ref_obj_id;
                };
        };
        struct bpf_verifier_log *log; /* for verbose logs */
        bool is_retval; /* is accessing function return value ? */
};

/* Store @size in @aux->ctx_field_size.
 *
 * @size is a u32 while the destination field is declared int, so the value
 * is implicitly converted on assignment.
 */
static inline void
bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
{
        aux->ctx_field_size = size;
}

/* Return true if @insn's opcode is exactly BPF_LD | BPF_IMM | BPF_DW.
 *
 * Declared inline, like the other helpers in this header, so that files
 * which include the header without calling this function do not trigger an
 * unused-function warning.
 */
static inline bool bpf_is_ldimm64(const struct bpf_insn *insn)
{
        return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
}

static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
{
        return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
}

/* Classify a BPF_ATOMIC instruction by its imm field.
 *
 * Returns true when @atomic_insn->imm is BPF_LOAD_ACQ or BPF_STORE_REL
 * (an atomic load or store), and false for every other value (a
 * read-modify-write instruction).
 */
static inline bool
bpf_atomic_is_load_store(const struct bpf_insn *atomic_insn)
{
        return atomic_insn->imm == BPF_LOAD_ACQ ||
               atomic_insn->imm == BPF_STORE_REL;
}

/* Program operations table. The only hook declared here is test_run, which
 * receives the program together with a kernel-side (@kattr) and a user-space
 * (@uattr) pointer to a bpf_attr and returns an int.
 * NOTE(review): presumably backs the program test-run command — confirm
 * against the syscall code.
 */
struct bpf_prog_ops {
        int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
                        union bpf_attr __user *uattr);
};

struct bpf_reg_state;
/* Callbacks a program type supplies to the verifier.
 *
 * Only get_func_proto and is_valid_access are documented in place; the
 * remaining hooks operate on instruction buffers or BTF struct accesses.
 * NOTE(review): gen_prologue/gen_epilogue/gen_ld_abs/convert_ctx_access
 * presumably emit replacement instructions and return how many were
 * written — confirm against the verifier before relying on it.
 */
struct bpf_verifier_ops {
        /* return eBPF function prototype for verification */
        const struct bpf_func_proto *
        (*get_func_proto)(enum bpf_func_id func_id,
                          const struct bpf_prog *prog);

        /* return true if 'size' wide access at offset 'off' within bpf_context
         * with 'type' (read or write) is allowed
         */
        bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info);
        int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
                            const struct bpf_prog *prog);
        int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog,
                            s16 ctx_stack_off);
        int (*gen_ld_abs)(const struct bpf_insn *orig,
                          struct bpf_insn *insn_buf);
        u32 (*convert_ctx_access)(enum bpf_access_type type,
                                  const struct bpf_insn *src,
                                  struct bpf_insn *dst,
                                  struct bpf_prog *prog, u32 *target_size);
        /* access of @size bytes at @off through the BTF-typed register @reg;
         * @log is available for diagnostics
         */
        int (*btf_struct_access)(struct bpf_verifier_log *log,
                                 const struct bpf_reg_state *reg,
                                 int off, int size);
};

/* Callbacks for program offload, grouped (per the inline comments) into
 * basic verifier hooks, verifier optimization hooks that run after
 * .finalize, and program management hooks.
 */
struct bpf_prog_offload_ops {
        /* verifier basic callbacks */
        int (*insn_hook)(struct bpf_verifier_env *env,
                         int insn_idx, int prev_insn_idx);
        int (*finalize)(struct bpf_verifier_env *env);
        /* verifier optimization callbacks (called after .finalize) */
        int (*replace_insn)(struct bpf_verifier_env *env, u32 off,
                            struct bpf_insn *insn);
        int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt);
        /* program management callbacks */
        int (*prepare)(struct bpf_prog *prog);
        int (*translate)(struct bpf_prog *prog);
        void (*destroy)(struct bpf_prog *prog);
};

/* Offload bookkeeping tying a bpf_prog to a net_device and a
 * bpf_offload_dev, with an opaque dev_priv pointer and a list node.
 * NOTE(review): jited_image/jited_len presumably describe the
 * device-translated image and its size — confirm against the offload code.
 */
struct bpf_prog_offload {
        struct bpf_prog                *prog;
        struct net_device        *netdev;
        struct bpf_offload_dev        *offdev;
        void                        *dev_priv;
        struct list_head        offloads;
        bool                        dev_state;
        bool                        opt_failed;
        void                        *jited_image;
        u32                        jited_len;
};

/* The longest tracepoint has 12 args.
 * See include/trace/bpf_probe.h
 */
#define MAX_BPF_FUNC_ARGS 12

/* The maximum number of arguments passed through registers
 * a single function may have.
 */
#define MAX_BPF_FUNC_REG_ARGS 5

/* The argument is a structure or a union. */
#define BTF_FMODEL_STRUCT_ARG                BIT(0)

/* The argument is signed. */
#define BTF_FMODEL_SIGNED_ARG                BIT(1)

/* Compact description of a function signature: a size and flags byte for
 * the return value, the argument count, and per-argument size and flags
 * arrays with room for MAX_BPF_FUNC_ARGS entries.
 * NOTE(review): the flags bytes presumably carry the BTF_FMODEL_* bits
 * defined above — confirm against the code that fills the model.
 */
struct btf_func_model {
        u8 ret_size;
        u8 ret_flags;
        u8 nr_args;
        u8 arg_size[MAX_BPF_FUNC_ARGS];
        u8 arg_flags[MAX_BPF_FUNC_ARGS];
};

/* Restore arguments before returning from trampoline to let original function
 * continue executing. This flag is used for fentry progs when there are no
 * fexit progs.
 */
#define BPF_TRAMP_F_RESTORE_REGS        BIT(0)
/* Call original function after fentry progs, but before fexit progs.
 * Makes sense for fentry/fexit, normal calls and indirect calls.
 */
#define BPF_TRAMP_F_CALL_ORIG                BIT(1)
/* Skip current frame and return to parent.  Makes sense for fentry/fexit
 * programs only. Should not be used with normal calls and indirect calls.
 */
#define BPF_TRAMP_F_SKIP_FRAME                BIT(2)
/* Store IP address of the caller on the trampoline stack,
 * so it's available for trampoline's programs.
 */
#define BPF_TRAMP_F_IP_ARG                BIT(3)
/* Return the return value of fentry prog. Only used by bpf_struct_ops. */
#define BPF_TRAMP_F_RET_FENTRY_RET        BIT(4)

/* Get original function from stack instead of from provided direct address.
 * Makes sense for trampolines with fexit or fmod_ret programs.
 */
#define BPF_TRAMP_F_ORIG_STACK                BIT(5)

/* This trampoline is on a function with another ftrace_ops with IPMODIFY,
 * e.g., a live patch. This flag is set and cleared by ftrace callbacks.
 */
#define BPF_TRAMP_F_SHARE_IPMODIFY        BIT(6)

/* Indicate that current trampoline is in a tail call context. Then, it has to
 * cache and restore tail_call_cnt to avoid infinite tail call loop.
 */
#define BPF_TRAMP_F_TAIL_CALL_CTX        BIT(7)

/*
 * Indicate the trampoline should be suitable to receive indirect calls;
 * without this indirectly calling the generated code can result in #UD/#CP,
 * depending on the CFI options.
 *
 * Used by bpf_struct_ops.
 *
 * Incompatible with FENTRY usage, overloads @func_addr argument.
 */
#define BPF_TRAMP_F_INDIRECT                BIT(8)

/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
 * bytes on x86.
 *
 * BPF_MAX_TRAMP_LINKS is 27 when __s390x__ is defined and 38 otherwise; it
 * sizes the links[] array in struct bpf_tramp_links.
 */
enum {
#if defined(__s390x__)
        BPF_MAX_TRAMP_LINKS = 27,
#else
        BPF_MAX_TRAMP_LINKS = 38,
#endif
};

/* Fixed-capacity array of trampoline link pointers with room for
 * BPF_MAX_TRAMP_LINKS entries.
 * NOTE(review): nr_links presumably counts the populated slots — confirm.
 */
struct bpf_tramp_links {
        struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
        int nr_links;
};

struct bpf_tramp_run_ctx;

/* Different use cases for BPF trampoline:
 * 1. replace nop at the function entry (kprobe equivalent)
 *    flags = BPF_TRAMP_F_RESTORE_REGS
 *    fentry = a set of programs to run before returning from trampoline
 *
 * 2. replace nop at the function entry (kprobe + kretprobe equivalent)
 *    flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME
 *    orig_call = fentry_ip + MCOUNT_INSN_SIZE
 *    fentry = a set of programs to run before calling original function
 *    fexit = a set of programs to run after original function
 *
 * 3. replace direct call instruction anywhere in the function body
 *    or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid)
 *    With flags = 0
 *      fentry = a set of programs to run before returning from trampoline
 *    With flags = BPF_TRAMP_F_CALL_ORIG
 *      orig_call = original callback addr or direct function addr
 *      fentry = a set of programs to run before calling original function
 *      fexit = a set of programs to run after original function
 */
struct bpf_tramp_image;
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
                                const struct btf_func_model *m, u32 flags,
                                struct bpf_tramp_links *tlinks,
                                void *func_addr);
/* Architecture hooks operating on a trampoline image buffer of @size bytes.
 * The result of arch_protect_bpf_trampoline() must be checked (__must_check).
 */
void *arch_alloc_bpf_trampoline(unsigned int size);
void arch_free_bpf_trampoline(void *image, unsigned int size);
int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size);
int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
                             struct bpf_tramp_links *tlinks, void *func_addr);

/* Enter/exit pairs; the exit functions take the u64 @start value.
 * NOTE(review): @start is presumably the value returned by the matching
 * enter call — confirm against the trampoline implementation.
 */
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
                                             struct bpf_tramp_run_ctx *run_ctx);
void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
                                             struct bpf_tramp_run_ctx *run_ctx);
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
/* Function-pointer types for the enter/exit hooks, and the selectors that
 * return the hook to use for a given @prog.
 */
typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog,
                                      struct bpf_tramp_run_ctx *run_ctx);
typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start,
                                      struct bpf_tramp_run_ctx *run_ctx);
bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog);
bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog);

/* Symbol record: a start and end address, a name buffer of KSYM_NAME_LEN
 * bytes, and linkage nodes for both a list and a latch tree.
 * NOTE(review): prog presumably marks symbols that belong to BPF programs
 * rather than other generated images — confirm.
 */
struct bpf_ksym {
        unsigned long                 start;
        unsigned long                 end;
        char                         name[KSYM_NAME_LEN];
        struct list_head         lnode;
        struct latch_tree_node         tnode;
        bool                         prog;
};

/* Kinds of programs attached through a trampoline.
 *
 * BPF_TRAMP_MAX counts the kinds that precede it and sizes the per-kind
 * arrays in struct bpf_trampoline; BPF_TRAMP_REPLACE is deliberately placed
 * after it, so it has no slot in those arrays.
 */
enum bpf_tramp_prog_type {
        BPF_TRAMP_FENTRY,
        BPF_TRAMP_FEXIT,
        BPF_TRAMP_MODIFY_RETURN,
        BPF_TRAMP_MAX,
        BPF_TRAMP_REPLACE, /* more than MAX */
};

/* One trampoline image: the image pointer and its size, its symbol record,
 * a percpu reference, and two instruction addresses inside the image.
 * rcu and work share storage, so only one of them is in use at a time.
 */
struct bpf_tramp_image {
        void *image;
        int size;
        struct bpf_ksym ksym;
        struct percpu_ref pcref;
        void *ip_after_call;
        void *ip_epilogue;
        union {
                struct rcu_head rcu;
                struct work_struct work;
        };
};

/* A trampoline, identified by @key, for one target function (@func).
 * It is reference counted (@refcnt), its fields are serialized by @mutex,
 * and it tracks the attached programs per kind plus the current image.
 */
struct bpf_trampoline {
        /* hlist for trampoline_table */
        struct hlist_node hlist;
        struct ftrace_ops *fops;
        /* serializes access to fields of this trampoline */
        struct mutex mutex;
        refcount_t refcnt;
        u32 flags;
        u64 key;
        struct {
                struct btf_func_model model;
                void *addr;
                bool ftrace_managed;
        } func;
        /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF
         * program by replacing one of its functions. func.addr is the address
         * of the function it replaced.
         */
        struct bpf_prog *extension_prog;
        /* list of BPF programs using this trampoline */
        struct hlist_head progs_hlist[BPF_TRAMP_MAX];
        /* Number of attached programs. A counter per kind. */
        int progs_cnt[BPF_TRAMP_MAX];
        /* Executable image of trampoline */
        struct bpf_tramp_image *cur_image;
};

/* Description of an attach target: its function model, address, owning
 * module, name and BTF type.
 * NOTE(review): tgt_mod is presumably NULL for targets that are not in a
 * module — confirm against the code that fills this in.
 */
struct bpf_attach_target_info {
        struct btf_func_model fmodel;
        long tgt_addr;
        struct module *tgt_mod;
        const char *tgt_name;
        const struct btf_type *tgt_type;
};

/* Capacity of the progs[] array in struct bpf_dispatcher. */
#define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */

/* One dispatcher slot: a program pointer and a reference count of users. */
struct bpf_dispatcher_prog {
        struct bpf_prog *prog;
        refcount_t users;
};

/* Dispatcher state: up to BPF_DISPATCHER_MAX program slots, the dispatcher
 * function pointer, image pointers and offset, and a symbol record.
 * The static-call key and trampoline pointers exist only when
 * CONFIG_HAVE_STATIC_CALL is set.
 * NOTE(review): rw_image is presumably a writable staging copy of image —
 * confirm against the dispatcher implementation.
 */
struct bpf_dispatcher {
        /* dispatcher mutex */
        struct mutex mutex;
        void *func;
        struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX];
        int num_progs;
        void *image;
        void *rw_image;
        u32 image_off;
        struct bpf_ksym ksym;
#ifdef CONFIG_HAVE_STATIC_CALL
        struct static_call_key *sc_key;
        void *sc_tramp;
#endif
};

/* Default __bpfcall to __nocfi unless it has already been defined. */
#ifndef __bpfcall
#define __bpfcall __nocfi
#endif

/* Pass-through dispatcher: invoke @bpf_func with @ctx and @insnsi and return
 * its result unchanged.
 */
static __always_inline __bpfcall unsigned int
bpf_dispatcher_nop_func(const void *ctx, const struct bpf_insn *insnsi,
                        bpf_func_t bpf_func)
{
        unsigned int ret = bpf_func(ctx, insnsi);

        return ret;
}

/* The kernel-side implementation of the opaque uapi struct bpf_dynptr.
 * The struct is aligned to 8 bytes.
 */
struct bpf_dynptr_kern {
        void *data;
        /* Size represents the number of usable bytes of dynptr data.
         * If for example the offset is at 4 for a local dynptr whose data is
         * of type u64, the number of usable bytes is 4.
         *
         * The upper 8 bits of this field are reserved for metadata; the
         * layout is:
         * Bits 0 - 23 = size
         * Bits 24 - 30 = dynptr type
         * Bit 31 = whether dynptr is read-only
         */
        u32 size;
        u32 offset;
} __aligned(8);

/* Kinds of dynptr. BPF_DYNPTR_TYPE_INVALID is the first enumerator and
 * therefore has value 0.
 */
enum bpf_dynptr_type {
        BPF_DYNPTR_TYPE_INVALID,
        /* Points to memory that is local to the bpf program */
        BPF_DYNPTR_TYPE_LOCAL,
        /* Underlying data is a ringbuf record */
        BPF_DYNPTR_TYPE_RINGBUF,
        /* Underlying data is a sk_buff */
        BPF_DYNPTR_TYPE_SKB,
        /* Underlying data is a xdp_buff */
        BPF_DYNPTR_TYPE_XDP,
        /* Points to skb_metadata_end()-skb_metadata_len() */
        BPF_DYNPTR_TYPE_SKB_META,
};

/* Dynptr helpers implemented elsewhere. The __bpf_dynptr_* variants take the
 * kernel-side struct bpf_dynptr_kern; bpf_dynptr_slice_rdwr() takes the
 * opaque struct bpf_dynptr.
 * NOTE(review): the data accessors presumably return NULL when @len bytes
 * are not available — confirm against the implementation.
 */
int bpf_dynptr_check_size(u32 size);
u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr);
int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset,
                       void *src, u32 len, u64 flags);
void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
                            void *buffer__opt, u32 buffer__szk);

/* Check that the range [@offset, @offset + @len) fits inside the size
 * reported by __bpf_dynptr_size() for @ptr.
 *
 * Returns 0 when the range fits and -E2BIG otherwise.
 */
static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
{
        const u32 avail = __bpf_dynptr_size(ptr);

        /* Reject an oversized @len first so that avail - len cannot wrap. */
        if (len > avail)
                return -E2BIG;

        if (offset > avail - len)
                return -E2BIG;

        return 0;
}

#ifdef CONFIG_BPF_JIT
/* Trampoline and dispatcher entry points available when CONFIG_BPF_JIT is
 * set. The trampoline ones have inline stubs in the #else branch below.
 * NOTE(review): bpf_trampoline_get() and bpf_trampoline_put() presumably
 * take and drop a reference on the trampoline — confirm.
 */
int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
                             struct bpf_trampoline *tr,
                             struct bpf_prog *tgt_prog);
int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
                               struct bpf_trampoline *tr,
                               struct bpf_prog *tgt_prog);
struct bpf_trampoline *bpf_trampoline_get(u64 key,
                                          struct bpf_attach_target_info *tgt_info);
void bpf_trampoline_put(struct bpf_trampoline *tr);
int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs);

/*
 * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn
 * indirection with a direct call to the bpf program. If the architecture does
 * not have STATIC_CALL, avoid a double-indirection.
 *
 * Without CONFIG_HAVE_STATIC_CALL the _SC_INIT, _SC and _UPDATE helpers
 * expand to nothing and _CALL invokes bpf_func(ctx, insnsi) directly.
 * __BPF_DISPATCHER_CALL relies on ctx, insnsi and bpf_func being in scope
 * at the expansion site.
 */
#ifdef CONFIG_HAVE_STATIC_CALL

#define __BPF_DISPATCHER_SC_INIT(_name)                                \
        .sc_key = &STATIC_CALL_KEY(_name),                        \
        .sc_tramp = STATIC_CALL_TRAMP_ADDR(_name),

#define __BPF_DISPATCHER_SC(name)                                \
        DEFINE_STATIC_CALL(bpf_dispatcher_##name##_call, bpf_dispatcher_nop_func)

#define __BPF_DISPATCHER_CALL(name)                                \
        static_call(bpf_dispatcher_##name##_call)(ctx, insnsi, bpf_func)

#define __BPF_DISPATCHER_UPDATE(_d, _new)                        \
        __static_call_update((_d)->sc_key, (_d)->sc_tramp, (_new))

#else
#define __BPF_DISPATCHER_SC_INIT(name)
#define __BPF_DISPATCHER_SC(name)
#define __BPF_DISPATCHER_CALL(name)                bpf_func(ctx, insnsi)
#define __BPF_DISPATCHER_UPDATE(_d, _new)
#endif

/* Static initializer for the struct bpf_dispatcher variable called @_name.
 *
 * Expects a function named <_name>_func to exist (its address becomes
 * .func) and uses the stringified @_name as the ksym name. The static-call
 * fields are filled in through __BPF_DISPATCHER_SC_INIT(<_name>_call).
 */
#define BPF_DISPATCHER_INIT(_name) {                                \
        .mutex = __MUTEX_INITIALIZER(_name.mutex),                \
        .func = &_name##_func,                                        \
        .progs = {},                                                \
        .num_progs = 0,                                                \
        .image = NULL,                                                \
        .image_off = 0,                                                \
        .ksym = {                                                \
                .name  = #_name,                                \
                .lnode = LIST_HEAD_INIT(_name.ksym.lnode),        \
        },                                                        \
        __BPF_DISPATCHER_SC_INIT(_name##_call)                        \
}

/* Define dispatcher @name. This emits, in order:
 *  - the static call for the dispatcher (when static calls are available),
 *  - the noinline, exported function bpf_dispatcher_<name>_func(), whose
 *    body is __BPF_DISPATCHER_CALL(name),
 *  - the struct bpf_dispatcher object bpf_dispatcher_<name>, initialized
 *    with BPF_DISPATCHER_INIT().
 */
#define DEFINE_BPF_DISPATCHER(name)                                        \
        __BPF_DISPATCHER_SC(name);                                        \
        noinline __bpfcall unsigned int bpf_dispatcher_##name##_func(        \
                const void *ctx,                                        \
                const struct bpf_insn *insnsi,                                \
                bpf_func_t bpf_func)                                        \
        {                                                                \
                return __BPF_DISPATCHER_CALL(name);                        \
        }                                                                \
        EXPORT_SYMBOL(bpf_dispatcher_##name##_func);                        \
        struct bpf_dispatcher bpf_dispatcher_##name =                        \
                BPF_DISPATCHER_INIT(bpf_dispatcher_##name);

/* Declare (without defining) the function bpf_dispatcher_<name>_func() and
 * the extern object bpf_dispatcher_<name> that DEFINE_BPF_DISPATCHER(name)
 * provides.
 */
#define DECLARE_BPF_DISPATCHER(name)                                        \
        unsigned int bpf_dispatcher_##name##_func(                        \
                const void *ctx,                                        \
                const struct bpf_insn *insnsi,                                \
                bpf_func_t bpf_func);                                        \
        extern struct bpf_dispatcher bpf_dispatcher_##name;

/* Accessors for the function and the object belonging to dispatcher @name. */
#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func
#define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name)
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
                                struct bpf_prog *to);
/* Called only from JIT-enabled code, so there's no need for stubs. */
void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym);
void bpf_image_ksym_add(struct bpf_ksym *ksym);
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
int bpf_jit_charge_modmem(u32 size);
void bpf_jit_uncharge_modmem(u32 size);
/* Unlike the functions above, this one does have a stub in the #else branch. */
bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
#else
/* Stubs used when CONFIG_BPF_JIT is not set: linking and unlinking fail with
 * -ENOTSUPP, bpf_trampoline_get() returns NULL, the dispatcher macros
 * collapse to nothing / bpf_dispatcher_nop_func / NULL, and the predicates
 * return false.
 */
static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
                                           struct bpf_trampoline *tr,
                                           struct bpf_prog *tgt_prog)
{
        return -ENOTSUPP;
}
static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
                                             struct bpf_trampoline *tr,
                                             struct bpf_prog *tgt_prog)
{
        return -ENOTSUPP;
}
static inline struct bpf_trampoline *bpf_trampoline_get(u64 key,
                                                        struct bpf_attach_target_info *tgt_info)
{
        return NULL;
}
static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
#define DEFINE_BPF_DISPATCHER(name)
#define DECLARE_BPF_DISPATCHER(name)
#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func
#define BPF_DISPATCHER_PTR(name) NULL
static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d,
                                              struct bpf_prog *from,
                                              struct bpf_prog *to) {}
static inline bool is_bpf_image_address(unsigned long address)
{
        return false;
}
static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
{
        return false;
}
#endif

/* Auxiliary per-function information: a linkage value, an unreliable flag,
 * and two single-bit flags (called, verified).
 * NOTE(review): presumably one entry per bpf_func_info record — confirm.
 */
struct bpf_func_info_aux {
        u16 linkage;
        bool unreliable;
        bool called : 1;
        bool verified : 1;
};

/* Reason recorded in a poke descriptor; tail calls are the only one defined. */
enum bpf_jit_poke_reason {
        BPF_POKE_REASON_TAIL_CALL,
};

/* Descriptor of pokes pointing /into/ the JITed image.
 *
 * The union currently has a single member, tail_call, holding a map pointer
 * and a key.
 * NOTE(review): reason presumably holds an enum bpf_jit_poke_reason value —
 * confirm against the code that fills the descriptor.
 */
struct bpf_jit_poke_descriptor {
        void *tailcall_target;
        void *tailcall_bypass;
        void *bypass_addr;
        void *aux;
        union {
                struct {
                        struct bpf_map *map;
                        u32 key;
                } tail_call;
        };
        bool tailcall_target_stable;
        u8 adj_off;
        u16 reason;
        u32 insn_idx;
};

/* reg_type info for ctx arguments: for the ctx argument at @offset, the
 * register type to use plus the BTF object and type id describing it.
 * NOTE(review): ref_obj_id/refcounted presumably describe reference
 * tracking for the argument — confirm against the verifier.
 */
struct bpf_ctx_arg_aux {
        u32 offset;
        enum bpf_reg_type reg_type;
        struct btf *btf;
        u32 btf_id;
        u32 ref_obj_id;
        bool refcounted;
};

/* Pairs a BTF object with a module pointer. */
struct btf_mod_pair {
        struct btf *btf;
        struct module *module;
};

struct bpf_kfunc_desc_tab;

/* Stream identifiers. Numbering starts at 1, so 0 is not a valid id. */
enum bpf_stream_id {
        BPF_STDOUT = 1,
        BPF_STDERR = 2,
};

/* One stream element: a lock-less list node, two length counters and the
 * text itself stored in the trailing flexible array.
 * NOTE(review): consumed_len presumably tracks how much of total_len has
 * already been read out — confirm against the stream implementation.
 */
struct bpf_stream_elem {
        struct llist_node node;
        int total_len;
        int consumed_len;
        char str[];
};

/* Stream capacity limit, in bytes. */
enum {
        /* 100k bytes */
        BPF_STREAM_MAX_CAPACITY = 100000ULL,
};

/* A stream: an atomic capacity counter, a lock-less LIFO list of in-flight
 * elements, and a mutex-protected backlog kept in FIFO order.
 * NOTE(review): capacity is presumably bounded by BPF_STREAM_MAX_CAPACITY —
 * confirm against the stream implementation.
 */
struct bpf_stream {
        atomic_t capacity;
        struct llist_head log;        /* list of in-flight stream elements in LIFO order */

        struct mutex lock;  /* lock protecting backlog_{head,tail} */
        struct llist_node *backlog_head; /* list of in-flight stream elements in FIFO order */
        struct llist_node *backlog_tail; /* tail of the list above */
};

/* Staging area for stream elements: a lock-less list head plus a length.
 * NOTE(review): len is presumably the combined length of the staged
 * elements — confirm.
 */
struct bpf_stream_stage {
        struct llist_head log;
        int len;
};

/* Auxiliary state of a BPF program, reached through struct bpf_prog's aux
 * pointer.
 *
 * Layout notes visible from the definition:
 *  - ksym_prefix exists only with CONFIG_FINEIBT, security only with
 *    CONFIG_SECURITY;
 *  - work and rcu share storage, so only one of them is in use at a time;
 *  - three mutexes guard distinct field groups: dst_mutex (dst_* pointers),
 *    ext_mutex (is_extended, prog_array_member_cnt) and used_maps_mutex
 *    (used_maps, used_map_cnt);
 *  - stream[] holds two struct bpf_stream entries.
 *    NOTE(review): presumably one each for BPF_STDOUT and BPF_STDERR —
 *    confirm how stream ids map to indices.
 */
struct bpf_prog_aux {
        atomic64_t refcnt;
        u32 used_map_cnt;
        u32 used_btf_cnt;
        u32 max_ctx_offset;
        u32 max_pkt_offset;
        u32 max_tp_access;
        u32 stack_depth;
        u32 id;
        u32 func_cnt; /* used by non-func prog as the number of func progs */
        u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */
        u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
        u32 attach_btf_id; /* in-kernel BTF type id to attach to */
        u32 attach_st_ops_member_off;
        u32 ctx_arg_info_size;
        u32 max_rdonly_access;
        u32 max_rdwr_access;
        struct btf *attach_btf;
        struct bpf_ctx_arg_aux *ctx_arg_info;
        void __percpu *priv_stack_ptr;
        struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
        struct bpf_prog *dst_prog;
        struct bpf_trampoline *dst_trampoline;
        enum bpf_prog_type saved_dst_prog_type;
        enum bpf_attach_type saved_dst_attach_type;
        bool verifier_zext; /* Zero extensions has been inserted by verifier. */
        bool dev_bound; /* Program is bound to the netdev. */
        bool offload_requested; /* Program is bound and offloaded to the netdev. */
        bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
        bool attach_tracing_prog; /* true if tracing another tracing program */
        bool func_proto_unreliable;
        bool tail_call_reachable;
        bool xdp_has_frags;
        bool exception_cb;
        bool exception_boundary;
        bool is_extended; /* true if extended by freplace program */
        bool jits_use_priv_stack;
        bool priv_stack_requested;
        bool changes_pkt_data;
        bool might_sleep;
        bool kprobe_write_ctx;
        u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
        struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
        struct bpf_arena *arena;
        void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */
        /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
        const struct btf_type *attach_func_proto;
        /* function name for valid attach_btf_id */
        const char *attach_func_name;
        struct bpf_prog **func;
        struct bpf_prog_aux *main_prog_aux;
        void *jit_data; /* JIT specific data. arch dependent */
        struct bpf_jit_poke_descriptor *poke_tab;
        struct bpf_kfunc_desc_tab *kfunc_tab;
        struct bpf_kfunc_btf_tab *kfunc_btf_tab;
        u32 size_poke_tab;
#ifdef CONFIG_FINEIBT
        struct bpf_ksym ksym_prefix;
#endif
        struct bpf_ksym ksym;
        const struct bpf_prog_ops *ops;
        const struct bpf_struct_ops *st_ops;
        struct bpf_map **used_maps;
        struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */
        struct btf_mod_pair *used_btfs;
        struct bpf_prog *prog;
        struct user_struct *user;
        u64 load_time; /* ns since boottime */
        u32 verified_insns;
        int cgroup_atype; /* enum cgroup_bpf_attach_type */
        struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
        char name[BPF_OBJ_NAME_LEN];
        u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64);
#ifdef CONFIG_SECURITY
        void *security;
#endif
        struct bpf_token *token;
        struct bpf_prog_offload *offload;
        struct btf *btf;
        struct bpf_func_info *func_info;
        struct bpf_func_info_aux *func_info_aux;
        /* bpf_line_info loaded from userspace.  linfo->insn_off
         * has the xlated insn offset.
         * Both the main and sub prog share the same linfo.
         * The subprog can access its first linfo by
         * using the linfo_idx.
         */
        struct bpf_line_info *linfo;
        /* jited_linfo is the jited addr of the linfo.  It has a
         * one to one mapping to linfo:
         * jited_linfo[i] is the jited addr for the linfo[i]->insn_off.
         * Both the main and sub prog share the same jited_linfo.
         * The subprog can access its first jited_linfo by
         * using the linfo_idx.
         */
        void **jited_linfo;
        u32 func_info_cnt;
        u32 nr_linfo;
        /* subprog can use linfo_idx to access its first linfo and
         * jited_linfo.
         * main prog always has linfo_idx == 0
         */
        u32 linfo_idx;
        struct module *mod;
        u32 num_exentries;
        struct exception_table_entry *extable;
        union {
                struct work_struct work;
                struct rcu_head        rcu;
        };
        struct bpf_stream stream[2];
};

/* A BPF program.
 *
 * Layout notes visible from the definition:
 *  - the fifteen single-bit flags share one u16 bitfield, leaving one bit
 *    unused;
 *  - digest and tag share storage;
 *  - the instructions are stored inline as a trailing flexible array that
 *    can be viewed either as struct sock_filter (insns) or as
 *    struct bpf_insn (insnsi).
 */
struct bpf_prog {
        u16                        pages;                /* Number of allocated pages */
        u16                        jited:1,        /* Is our filter JIT'ed? */
                                jit_requested:1,/* archs need to JIT the prog */
                                gpl_compatible:1, /* Is filter GPL compatible? */
                                cb_access:1,        /* Is control block accessed? */
                                dst_needed:1,        /* Do we need dst entry? */
                                blinding_requested:1, /* needs constant blinding */
                                blinded:1,        /* Was blinded */
                                is_func:1,        /* program is a bpf function */
                                kprobe_override:1, /* Do we override a kprobe? */
                                has_callchain_buf:1, /* callchain buffer allocated? */
                                enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
                                call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
                                call_get_func_ip:1, /* Do we call get_func_ip() */
                                tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
                                sleepable:1;        /* BPF program is sleepable */
        enum bpf_prog_type        type;                /* Type of BPF program */
        enum bpf_attach_type        expected_attach_type; /* For some prog types */
        u32                        len;                /* Number of filter blocks */
        u32                        jited_len;        /* Size of jited insns in bytes */
        union {
                u8 digest[SHA256_DIGEST_SIZE];
                u8 tag[BPF_TAG_SIZE];
        };
        struct bpf_prog_stats __percpu *stats;
        int __percpu                *active;
        unsigned int                (*bpf_func)(const void *ctx,
                                            const struct bpf_insn *insn);
        struct bpf_prog_aux        *aux;                /* Auxiliary fields */
        struct sock_fprog_kern        *orig_prog;        /* Original BPF program */
        /* Instructions for interpreter */
        union {
                DECLARE_FLEX_ARRAY(struct sock_filter, insns);
                DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi);
        };
};

/* Auxiliary data for an array map: the list of poking programs, a pointer
 * to the map, a mutex and a work item.
 * NOTE(review): poke_mutex presumably serializes updates to poke_progs —
 * confirm against the users.
 */
struct bpf_array_aux {
        /* Programs with direct jumps into programs part of this array. */
        struct list_head poke_progs;
        struct bpf_map *map;
        struct mutex poke_mutex;
        struct work_struct work;
};

/* Base object for BPF links: a reference count, an id, the link type, its
 * operations table and the attached program, plus flags and attach type.
 * Specialized link structs in this header embed it as their first member.
 */
struct bpf_link {
        atomic64_t refcnt;
        u32 id;
        enum bpf_link_type type;
        const struct bpf_link_ops *ops;
        struct bpf_prog *prog;

        u32 flags;
        enum bpf_attach_type attach_type;

        /* rcu is used before freeing, work can be used to schedule that
         * RCU-based freeing before that, so they never overlap
         */
        union {
                struct rcu_head rcu;
                struct work_struct work;
        };
        /* whether BPF link itself has "sleepable" semantics, which can differ
         * from underlying BPF program having a "sleepable" semantics, as BPF
         * link's semantics is determined by target attach hook
         */
        bool sleepable;
};

/* Operations a link type provides. Two deallocation hooks are declared:
 * dealloc (no RCU grace period wait) and dealloc_deferred (after RCU grace
 * period), as described on each member.
 * NOTE(review): which callbacks are mandatory is not visible here — check
 * the link core before leaving any of them NULL.
 */
struct bpf_link_ops {
        void (*release)(struct bpf_link *link);
        /* deallocate link resources callback, called without RCU grace period
         * waiting
         */
        void (*dealloc)(struct bpf_link *link);
        /* deallocate link resources callback, called after RCU grace period;
         * if either the underlying BPF program is sleepable or BPF link's
         * target hook is sleepable, we'll go through tasks trace RCU GP and
         * then "classic" RCU GP; this need for chaining tasks trace and
         * classic RCU GPs is designated by setting bpf_link->sleepable flag
         */
        void (*dealloc_deferred)(struct bpf_link *link);
        int (*detach)(struct bpf_link *link);
        int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
                           struct bpf_prog *old_prog);
        void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
        int (*fill_link_info)(const struct bpf_link *link,
                              struct bpf_link_info *info);
        int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
                          struct bpf_map *old_map);
        __poll_t (*poll)(struct file *file, struct poll_table_struct *pts);
};

/* A link attached through a trampoline: embeds struct bpf_link and adds an
 * hlist node and a u64 cookie.
 */
struct bpf_tramp_link {
        struct bpf_link link;
        struct hlist_node tramp_hlist;
        u64 cookie;
};

/* A trampoline link that also records the trampoline it belongs to. */
struct bpf_shim_tramp_link {
        struct bpf_tramp_link link;
        struct bpf_trampoline *trampoline;
};

/* A trampoline link that records its trampoline and a target program. */
struct bpf_tracing_link {
        struct bpf_tramp_link link;
        struct bpf_trampoline *trampoline;
        struct bpf_prog *tgt_prog;
};

/* A link for a raw tracepoint: embeds struct bpf_link and adds the
 * bpf_raw_event_map pointer and a u64 cookie.
 */
struct bpf_raw_tp_link {
        struct bpf_link link;
        struct bpf_raw_event_map *btp;
        u64 cookie;
};

/* Bundles a link with a file, a file descriptor number and an id.
 * NOTE(review): presumably holds resources reserved while a link is being
 * set up, before it is exposed to user space — confirm against the link
 * core.
 */
struct bpf_link_primer {
        struct bpf_link *link;
        struct file *file;
        int fd;
        u32 id;
};

/* Mount options: owner uid/gid and mode, followed by four u64 delegation
 * fields for commands, maps, programs and attach types.
 * NOTE(review): the delegate_* fields are presumably bitmasks — confirm
 * against the mount option parser.
 */
struct bpf_mount_opts {
        kuid_t uid;
        kgid_t gid;
        umode_t mode;

        /* BPF token-related delegation options */
        u64 delegate_cmds;
        u64 delegate_maps;
        u64 delegate_progs;
        u64 delegate_attachs;
};

/* BPF token object: a work item, a 64-bit atomic reference count, the
 * associated user namespace and four 64-bit "allowed" values (commands,
 * map types, program types, attach types).
 * The security pointer exists only when CONFIG_SECURITY is enabled.
 */
struct bpf_token {
        struct work_struct work;
        atomic64_t refcnt;
        struct user_namespace *userns;
        u64 allowed_cmds;
        u64 allowed_maps;
        u64 allowed_progs;
        u64 allowed_attachs;
#ifdef CONFIG_SECURITY
        void *security;
#endif
};

struct bpf_struct_ops_value;
struct btf_member;

#define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64
/**
 * struct bpf_struct_ops - A structure of callbacks allowing a subsystem to
 *                           define a BPF_MAP_TYPE_STRUCT_OPS map type composed
 *                           of BPF_PROG_TYPE_STRUCT_OPS progs.
 * @verifier_ops: A structure of callbacks that are invoked by the verifier
 *                  when determining whether the struct_ops progs in the
 *                  struct_ops map are valid.
 * @init: A callback that is invoked a single time, and before any other
 *          callback, to initialize the structure. A nonzero return value means
 *          the subsystem could not be initialized.
 * @check_member: When defined, a callback invoked by the verifier to allow
 *                  the subsystem to determine if an entry in the struct_ops map
 *                  is valid. A nonzero return value means that the map is
 *                  invalid and should be rejected by the verifier.
 * @init_member: A callback that is invoked for each member of the struct_ops
 *                 map to allow the subsystem to initialize the member. A nonzero
 *                 value means the member could not be initialized.
 *                 NOTE(review): exclusivity with the type, type_id, value_type
 *                 and value_id fields was documented here, but those fields
 *                 are members of struct bpf_struct_ops_desc, not of this
 *                 struct - confirm what the constraint means today.
 * @reg: A callback that is invoked when the struct_ops map has been
 *         initialized and is being attached to. Zero means the struct_ops map
 *         has been successfully registered and is live. A nonzero return value
 *         means the struct_ops map could not be registered.
 * @unreg: A callback that is invoked when the struct_ops map should be
 *           unregistered.
 * @update: A callback that is invoked when the live struct_ops map is being
 *            updated to contain new values. This callback is only invoked when
 *            the struct_ops map is loaded with BPF_F_LINK. If not defined, it
 *            is assumed that the struct_ops map cannot be updated.
 * @validate: A callback that is invoked after all of the members have been
 *              initialized. This callback should perform static checks on the
 *              map, meaning that it should either fail or succeed
 *              deterministically. A struct_ops map that has been validated may
 *              not necessarily succeed in being registered if the call to @reg
 *              fails. For example, a valid struct_ops map may be loaded, but
 *              then fail to be registered due to there being another active
 *              struct_ops map on the system in the subsystem already. For this
 *              reason, if this callback is not defined, the check is skipped as
 *              the struct_ops map will have final verification performed in
 *              @reg.
 * @cfi_stubs: Opaque pointer supplied by the subsystem.
 *               NOTE(review): presumably an instance of the struct_ops type
 *               filled with CFI stub functions - confirm.
 * @owner: The module associated with this struct_ops type.
 * @name: The name of the struct bpf_struct_ops object.
 * @func_models: Function models; the array holds
 *                 BPF_STRUCT_OPS_MAX_NR_MEMBERS entries.
 */
struct bpf_struct_ops {
        const struct bpf_verifier_ops *verifier_ops;
        int (*init)(struct btf *btf);
        int (*check_member)(const struct btf_type *t,
                            const struct btf_member *member,
                            const struct bpf_prog *prog);
        int (*init_member)(const struct btf_type *t,
                           const struct btf_member *member,
                           void *kdata, const void *udata);
        int (*reg)(void *kdata, struct bpf_link *link);
        void (*unreg)(void *kdata, struct bpf_link *link);
        int (*update)(void *kdata, void *old_kdata, struct bpf_link *link);
        int (*validate)(void *kdata);
        void *cfi_stubs;
        struct module *owner;
        const char *name;
        struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
};

/* Every member of a struct_ops type has an instance, even if the member is
 * not an operator (function pointer). The "info" field will be assigned to
 * prog->aux->ctx_arg_info of BPF struct_ops programs to provide the
 * argument information required by the verifier to verify the program.
 *
 * btf_ctx_access() will look up prog->aux->ctx_arg_info to find the
 * corresponding entry for a given argument.
 *
 * "cnt" accompanies "info"; presumably it is the number of entries in the
 * "info" array - confirm against the code that fills it in.
 */
struct bpf_struct_ops_arg_info {
        struct bpf_ctx_arg_aux *info;
        u32 cnt;
};

/* Descriptor tying a struct bpf_struct_ops to BTF type information
 * (type/value_type and their ids) and to per-member argument information.
 * Initialized by bpf_struct_ops_desc_init() and released by
 * bpf_struct_ops_desc_release().
 */
struct bpf_struct_ops_desc {
        struct bpf_struct_ops *st_ops;

        /* BTF type and value type, followed by their BTF ids */
        const struct btf_type *type;
        const struct btf_type *value_type;
        u32 type_id;
        u32 value_id;

        /* Collection of argument information for each member */
        struct bpf_struct_ops_arg_info *arg_info;
};

/* States stored in bpf_struct_ops_common_value.state.
 * The numeric values follow declaration order (INIT == 0 ... READY == 3).
 */
enum bpf_struct_ops_state {
        BPF_STRUCT_OPS_STATE_INIT,
        BPF_STRUCT_OPS_STATE_INUSE,
        BPF_STRUCT_OPS_STATE_TOBEFREE,
        BPF_STRUCT_OPS_STATE_READY,
};

/* Common header (reference count plus state) that register_bpf_struct_ops()
 * places as the first member of every generated struct bpf_struct_ops_<type>.
 */
struct bpf_struct_ops_common_value {
        refcount_t refcnt;
        enum bpf_struct_ops_state state;
};

#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
/* This macro helps developers register a struct_ops type and generate its
 * type information correctly. Developers should use this macro to register
 * a struct_ops type instead of calling __register_bpf_struct_ops() directly.
 *
 * It declares a local struct bpf_struct_ops_<type> made of the common value
 * header followed by a cacheline-aligned "struct <type> data", emits BTF for
 * that struct via BTF_TYPE_EMIT(), and evaluates to the result of
 * __register_bpf_struct_ops(st_ops).
 */
#define register_bpf_struct_ops(st_ops, type)                                \
        ({                                                                \
                struct bpf_struct_ops_##type {                                \
                        struct bpf_struct_ops_common_value common;        \
                        struct type data ____cacheline_aligned_in_smp;        \
                };                                                        \
                BTF_TYPE_EMIT(struct bpf_struct_ops_##type);                \
                __register_bpf_struct_ops(st_ops);                        \
        })
/* Sentinel "owner" pointer built from a constant plus POISON_POINTER_DELTA.
 * bpf_try_module_get() and bpf_module_put() compare the owner against it
 * and, on a match, operate on the struct_ops data instead of a module.
 */
#define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
bool bpf_struct_ops_get(const void *kdata);
void bpf_struct_ops_put(const void *kdata);
int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff);
int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
                                       void *value);
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
                                      struct bpf_tramp_link *link,
                                      const struct btf_func_model *model,
                                      void *stub_func,
                                      void **image, u32 *image_off,
                                      bool allow_alloc);
void bpf_struct_ops_image_free(void *image);
/* Try to take a reference on whatever backs @data.
 *
 * A regular @owner is handed to try_module_get(). When @owner is the
 * BPF_MODULE_OWNER sentinel, the reference is taken on the struct_ops
 * @data itself through bpf_struct_ops_get().
 *
 * Returns the result of the call that was made.
 */
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
        if (owner != BPF_MODULE_OWNER)
                return try_module_get(owner);

        return bpf_struct_ops_get(data);
}
/* Counterpart of bpf_try_module_get(): a regular @owner goes to
 * module_put(); the BPF_MODULE_OWNER sentinel routes the put to
 * bpf_struct_ops_put() on @data.
 */
static inline void bpf_module_put(const void *data, struct module *owner)
{
        if (owner != BPF_MODULE_OWNER) {
                module_put(owner);
                return;
        }

        bpf_struct_ops_put(data);
}
int bpf_struct_ops_link_create(union bpf_attr *attr);
u32 bpf_struct_ops_id(const void *kdata);

#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration.
 * This single-int state is the first argument of every bpf_dummy_ops
 * callback.
 */
struct bpf_dummy_ops_state {
        int val;
};

/* Table of three callbacks, each taking a struct bpf_dummy_ops_state
 * pointer; test_2 additionally takes four scalar arguments of different
 * widths. Only available under CONFIG_NET; bpf_struct_ops_test_run() is
 * declared alongside it.
 */
struct bpf_dummy_ops {
        int (*test_1)(struct bpf_dummy_ops_state *cb);
        int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2,
                      char a3, unsigned long a4);
        int (*test_sleepable)(struct bpf_dummy_ops_state *cb);
};

int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
                            union bpf_attr __user *uattr);
#endif
int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
                             struct btf *btf,
                             struct bpf_verifier_log *log);
void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map);
void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc);
#else
/* Stub used unless both CONFIG_BPF_JIT and CONFIG_BPF_SYSCALL are enabled:
 * evaluates st_ops (cast to void * and discarded), ignores type, yields 0.
 */
#define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; })
/* Fallback used unless both CONFIG_BPF_JIT and CONFIG_BPF_SYSCALL are
 * enabled: @data is ignored and @owner goes straight to try_module_get().
 */
static inline bool
bpf_try_module_get(const void *data, struct module *owner)
{
        return try_module_get(owner);
}
/* Fallback used unless both CONFIG_BPF_JIT and CONFIG_BPF_SYSCALL are
 * enabled: @data is ignored and @owner goes straight to module_put().
 */
static inline void
bpf_module_put(const void *data, struct module *owner)
{
        module_put(owner);
}
/* Stub for builds without struct_ops support: always -ENOTSUPP. */
static inline int
bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
{
        return -ENOTSUPP;
}
/* Stub for builds without struct_ops support: always -EINVAL. */
static inline int
bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value)
{
        return -EINVAL;
}
/* Stub for builds without struct_ops support: always -EOPNOTSUPP. */
static inline int
bpf_struct_ops_link_create(union bpf_attr *attr)
{
        return -EOPNOTSUPP;
}
/* Stub for builds without struct_ops support: leaves @info untouched. */
static inline void
bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
{
}

/* Stub for builds without struct_ops support: nothing to release. */
static inline void
bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
{
}

#endif

int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
                               const struct bpf_ctx_arg_aux *info, u32 cnt);

#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
                                    int cgroup_atype,
                                    enum bpf_attach_type attach_type);
void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog);
#else
/* Stub used unless both CONFIG_CGROUP_BPF and CONFIG_BPF_LSM are enabled:
 * always -EOPNOTSUPP.
 */
static inline int
bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype,
                                enum bpf_attach_type attach_type)
{
        return -EOPNOTSUPP;
}
/* Stub used unless both CONFIG_CGROUP_BPF and CONFIG_BPF_LSM are enabled:
 * does nothing.
 */
static inline void
bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
{
}
#endif

/* Array map: the generic bpf_map header, the element size, an index mask,
 * an aux pointer, and trailing flexible storage.
 * The union offers three 8-byte-aligned views of that same trailing
 * storage: raw bytes (value), plain pointers (ptrs) and per-CPU pointers
 * (pptrs).
 */
struct bpf_array {
        struct bpf_map map;
        u32 elem_size;
        u32 index_mask;
        struct bpf_array_aux *aux;
        union {
                DECLARE_FLEX_ARRAY(char, value) __aligned(8);
                DECLARE_FLEX_ARRAY(void *, ptrs) __aligned(8);
                DECLARE_FLEX_ARRAY(void __percpu *, pptrs) __aligned(8);
        };
};

#define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
#define MAX_TAIL_CALL_CNT 33

/* Maximum number of loops for bpf_loop and bpf_iter_num.
 * It's enum to expose it (and thus make it discoverable) through BTF.
 *
 * BPF_MAX_LOOPS is 8 * 1024 * 1024 (8388608); BPF_MAX_TIMED_LOOPS is
 * 0xffff (65535).
 * NOTE(review): where BPF_MAX_TIMED_LOOPS applies is not visible in this
 * header - confirm against its users.
 */
enum {
        BPF_MAX_LOOPS = 8 * 1024 * 1024,
        BPF_MAX_TIMED_LOOPS = 0xffff,
};

/* All four map access restriction flags (syscall-side and program-side). */
#define BPF_F_ACCESS_MASK        (BPF_F_RDONLY |                \
                                 BPF_F_RDONLY_PROG |        \
                                 BPF_F_WRONLY |                \
                                 BPF_F_WRONLY_PROG)

/* Capability bits returned by bpf_map_flags_to_cap(). */
#define BPF_MAP_CAN_READ        BIT(0)
#define BPF_MAP_CAN_WRITE        BIT(1)

/* Maximum number of user-producer ring buffer samples that can be drained in
 * a call to bpf_user_ringbuf_drain().
 */
#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024)

/* Translate a map's program-side access flags into BPF_MAP_CAN_* bits.
 *
 * BPF_F_RDONLY_PROG yields BPF_MAP_CAN_READ, BPF_F_WRONLY_PROG yields
 * BPF_MAP_CAN_WRITE, and a map with neither flag gets both capabilities.
 */
static inline u32 bpf_map_flags_to_cap(struct bpf_map *map)
{
        u32 prog_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);

        /* Both flags being set at once is not possible, so the order of
         * the two checks below does not matter in practice.
         */
        if (prog_flags & BPF_F_RDONLY_PROG)
                return BPF_MAP_CAN_READ;

        if (prog_flags & BPF_F_WRONLY_PROG)
                return BPF_MAP_CAN_WRITE;

        return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE;
}

/* Return false only when @access_flags has BPF_F_RDONLY_PROG and
 * BPF_F_WRONLY_PROG set at the same time; every other combination is ok.
 */
static inline bool bpf_map_flags_access_ok(u32 access_flags)
{
        const u32 both = BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG;

        return (access_flags & both) != both;
}

/* Allocate a zeroed object sized like *map->owner using GFP_ATOMIC.
 *
 * @map is only used for the size computation; the result is returned to
 * the caller and is NOT stored in map->owner here. Returns whatever
 * kzalloc() returns.
 */
static inline struct bpf_map_owner *bpf_map_owner_alloc(struct bpf_map *map)
{
        struct bpf_map_owner *owner;

        owner = kzalloc(sizeof(*map->owner), GFP_ATOMIC);
        return owner;
}

/* kfree() the object map->owner points to. The pointer stored in the map
 * is left as is (it is not reset here).
 */
static inline void bpf_map_owner_free(struct bpf_map *map)
{
        struct bpf_map_owner *owner = map->owner;

        kfree(owner);
}

/* Entry bundling a perf_event pointer with two file pointers (perf_file,
 * map_file) and an rcu_head.
 * NOTE(review): the rcu_head suggests entries are freed after an RCU grace
 * period - confirm in the code that releases them.
 */
struct bpf_event_entry {
        struct perf_event *event;
        struct file *perf_file;
        struct file *map_file;
        struct rcu_head rcu;
};

static inline bool map_type_contains_progs(struct bpf_map *map)
{
        return map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
               map->map_type == BPF_MAP_TYPE_DEVMAP ||
               map->map_type == BPF_MAP_TYPE_CPUMAP;
}

bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp);
int bpf_prog_calc_tag(struct bpf_prog *fp);

const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void);

const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void);

typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
                                        unsigned long off, unsigned long len);
typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type,
                                        const struct bpf_insn *src,
                                        struct bpf_insn *dst,
                                        struct bpf_prog *prog,
                                        u32 *target_size);

u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
                     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);

/* an array of programs to be executed under rcu_lock.
 *
 * Typical usage:
 * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run);
 *
 * the structure returned by bpf_prog_array_alloc() should be populated
 * with program pointers and the last pointer must be NULL.
 * The user has to keep refcnt on the program and make sure the program
 * is removed from the array before bpf_prog_put().
 * The 'struct bpf_prog_array *' should only be replaced with xchg()
 * since other cpus are walking the array of pointers in parallel.
 */
/* One slot of a bpf_prog_array: the program plus per-slot data.
 * The union overlays the cgroup storage pointers with a 64-bit cookie, so
 * a slot carries one or the other; bpf_prog_run_array() and
 * bpf_prog_run_array_uprobe() read the bpf_cookie view.
 */
struct bpf_prog_array_item {
        struct bpf_prog *prog;
        union {
                struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
                u64 bpf_cookie;
        };
};

/* An rcu_head followed by a flexible array of items. The run helpers in
 * this header walk items[] until they find an item whose prog is NULL.
 */
struct bpf_prog_array {
        struct rcu_head rcu;
        struct bpf_prog_array_item items[];
};

/* A bpf_prog_array header followed by a single bpf_prog pointer.
 * NOTE(review): null_prog presumably provides the storage for the
 * terminating NULL of hdr.items[] - confirm at the definition of the
 * global bpf_empty_prog_array.
 */
struct bpf_empty_prog_array {
        struct bpf_prog_array hdr;
        struct bpf_prog *null_prog;
};

/* To avoid allocating an empty bpf_prog_array for cgroups that don't have
 * a bpf program attached, use one global 'bpf_empty_prog_array'.
 * It will not be modified by the caller of bpf_prog_array_alloc()
 * (since the caller requested prog_cnt == 0).
 * That pointer should still be 'freed' by bpf_prog_array_free().
 */
extern struct bpf_empty_prog_array bpf_empty_prog_array;

struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
void bpf_prog_array_free(struct bpf_prog_array *progs);
/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
int bpf_prog_array_length(struct bpf_prog_array *progs);
bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
                                __u32 __user *prog_ids, u32 cnt);

void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
                                struct bpf_prog *old_prog);
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index);
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
                             struct bpf_prog *prog);
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
                             u32 *prog_ids, u32 request_cnt,
                             u32 *prog_cnt);
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
                        u64 bpf_cookie,
                        struct bpf_prog_array **new_array);

/* Empty base type; the specific run contexts below embed it as their
 * first member and current->bpf_ctx points at it.
 */
struct bpf_run_ctx {};

/* Run context holding a pointer to a bpf_prog_array_item and an int
 * retval.
 * NOTE(review): presumably used while running cgroup program arrays -
 * confirm against its users.
 */
struct bpf_cg_run_ctx {
        struct bpf_run_ctx run_ctx;
        const struct bpf_prog_array_item *prog_item;
        int retval;
};

/* Run context used by bpf_prog_run_array() and bpf_prog_run_array_uprobe():
 * bpf_cookie is refreshed from each array item before its program runs,
 * and is_uprobe is false for the former and true for the latter.
 */
struct bpf_trace_run_ctx {
        struct bpf_run_ctx run_ctx;
        u64 bpf_cookie;
        bool is_uprobe;
};

/* Run context carrying a 64-bit cookie and a pointer to another run
 * context.
 * NOTE(review): saved_run_ctx presumably keeps the previous
 * current->bpf_ctx for later restoration - confirm against its users.
 */
struct bpf_tramp_run_ctx {
        struct bpf_run_ctx run_ctx;
        u64 bpf_cookie;
        struct bpf_run_ctx *saved_run_ctx;
};

/* Install @new_ctx as current->bpf_ctx and return the context it replaced.
 * Without CONFIG_BPF_SYSCALL nothing is stored and NULL is returned.
 */
static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
{
#ifdef CONFIG_BPF_SYSCALL
        struct bpf_run_ctx *prev = current->bpf_ctx;

        current->bpf_ctx = new_ctx;
        return prev;
#else
        return NULL;
#endif
}

/* Put @old_ctx back into current->bpf_ctx (no-op without
 * CONFIG_BPF_SYSCALL). This is the same store bpf_set_run_ctx() performs;
 * the context being replaced is simply dropped.
 */
static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
{
        (void)bpf_set_run_ctx(old_ctx);
}

/* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */
#define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE                        (1 << 0)
/* BPF program asks to set CN on the packet.
 * NOTE(review): this shares bit 0 with the flag above; presumably the two
 * are consumed by different hooks and never meet - confirm.
 */
#define BPF_RET_SET_CN                                                (1 << 0)

typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx);

/* Run every program of @array on @ctx through @run_prog.
 *
 * Expects the RCU read lock to be held (lockdep warns otherwise). A
 * bpf_trace_run_ctx with is_uprobe == false is installed for the duration
 * of the walk, with migration disabled, and each item's bpf_cookie is
 * published in it before that item's program runs. The walk ends at the
 * first item whose prog is NULL.
 *
 * Returns 1 ANDed with every program's return value, i.e. 1 only if bit 0
 * was set in all of them; a NULL or empty @array yields 1.
 */
static __always_inline u32
bpf_prog_run_array(const struct bpf_prog_array *array,
                   const void *ctx, bpf_prog_run_fn run_prog)
{
        struct bpf_trace_run_ctx trace_ctx;
        struct bpf_run_ctx *saved_ctx;
        const struct bpf_prog_array_item *cur;
        const struct bpf_prog *prog;
        u32 verdict = 1;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");

        if (unlikely(!array))
                return verdict;

        trace_ctx.is_uprobe = false;

        migrate_disable();
        saved_ctx = bpf_set_run_ctx(&trace_ctx.run_ctx);
        for (cur = &array->items[0]; (prog = READ_ONCE(cur->prog)); cur++) {
                trace_ctx.bpf_cookie = cur->bpf_cookie;
                verdict &= run_prog(prog, ctx);
        }
        bpf_reset_run_ctx(saved_ctx);
        migrate_enable();

        return verdict;
}

/* Notes on RCU design for bpf_prog_arrays containing sleepable programs:
 *
 * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
 * overall. As a result, we must use the bpf_prog_array_free_sleepable
 * in order to use the tasks_trace rcu grace period.
 *
 * When a non-sleepable program is inside the array, we take the rcu read
 * section and disable preemption for that program alone, so it can access
 * rcu-protected dynamically sized maps.
 */
/* Uprobe variant of bpf_prog_run_array(): runs every program of @array on
 * @ctx through @run_prog with is_uprobe set in the run context.
 *
 * Expects the tasks-trace RCU read lock to be held (lockdep warns
 * otherwise). Non-sleepable programs are additionally wrapped in a classic
 * rcu_read_lock()/rcu_read_unlock() pair.
 *
 * Returns 1 ANDed with every program's return value; a NULL or empty
 * @array yields 1.
 */
static __always_inline u32
bpf_prog_run_array_uprobe(const struct bpf_prog_array *array,
                          const void *ctx, bpf_prog_run_fn run_prog)
{
        const struct bpf_prog_array_item *item;
        const struct bpf_prog *prog;
        struct bpf_run_ctx *old_run_ctx;
        struct bpf_trace_run_ctx run_ctx;
        u32 ret = 1;

        /* Annotate that the code below may fault. */
        might_fault();
        RCU_LOCKDEP_WARN(!rcu_read_lock_trace_held(), "no rcu lock held");

        if (unlikely(!array))
                return ret;

        migrate_disable();

        run_ctx.is_uprobe = true;

        old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
        item = &array->items[0];
        /* The array is terminated by an item whose prog is NULL. */
        while ((prog = READ_ONCE(item->prog))) {
                if (!prog->sleepable)
                        rcu_read_lock();

                /* Publish this item's cookie before running its program. */
                run_ctx.bpf_cookie = item->bpf_cookie;
                ret &= run_prog(prog, ctx);
                item++;

                /* prog still refers to the program that just ran, so this
                 * pairs with the rcu_read_lock() taken above.
                 */
                if (!prog->sleepable)
                        rcu_read_unlock();
        }
        bpf_reset_run_ctx(old_run_ctx);
        migrate_enable();
        return ret;
}

bool bpf_jit_bypass_spec_v1(void);
bool bpf_jit_bypass_spec_v4(void);

#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
extern struct mutex bpf_stats_enabled_mutex;

/*
 * Block execution of BPF programs attached to instrumentation (perf,
 * kprobes, tracepoints) to prevent deadlocks on map operations as any of
 * these events can happen inside a region which holds a map bucket lock
 * and can deadlock on it.
 *
 * Migration is disabled first so that the per-CPU bpf_prog_active counter
 * incremented here is the one bpf_enable_instrumentation() decrements.
 */
static inline void bpf_disable_instrumentation(void)
{
        migrate_disable();
        this_cpu_inc(bpf_prog_active);
}

/* Undo bpf_disable_instrumentation() in reverse order: drop this CPU's
 * bpf_prog_active count, then re-enable migration.
 */
static inline void bpf_enable_instrumentation(void)
{
        this_cpu_dec(bpf_prog_active);
        migrate_enable();
}

extern const struct super_operations bpf_super_ops;
extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops;
extern const struct file_operations bpf_iter_fops;
extern const struct file_operations bpf_token_fops;

/* Expand <linux/bpf_types.h> into extern declarations: every BPF_PROG_TYPE
 * entry declares <_name>_prog_ops and <_name>_verifier_ops, every
 * BPF_MAP_TYPE entry declares its bpf_map_ops object, and BPF_LINK_TYPE
 * entries expand to nothing. The helper macros are undefined afterwards.
 */
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
        extern const struct bpf_prog_ops _name ## _prog_ops; \
        extern const struct bpf_verifier_ops _name ## _verifier_ops;
#define BPF_MAP_TYPE(_id, _ops) \
        extern const struct bpf_map_ops _ops;
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE

extern const struct bpf_prog_ops bpf_offload_prog_ops;
extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
extern const struct bpf_verifier_ops xdp_analyzer_ops;

struct bpf_prog *bpf_prog_get(u32 ufd);
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
                                       bool attach_drv);
void bpf_prog_add(struct bpf_prog *prog, int i);
void bpf_prog_sub(struct bpf_prog *prog, int i);
void bpf_prog_inc(struct bpf_prog *prog);
struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog);
void bpf_prog_put(struct bpf_prog *prog);

void bpf_prog_free_id(struct bpf_prog *prog);
void bpf_map_free_id(struct bpf_map *map);

struct btf_field *btf_record_find(const struct btf_record *rec,
                                  u32 offset, u32 field_mask);
void btf_record_free(struct btf_record *rec);
void bpf_map_free_record(struct bpf_map *map);
struct btf_record *btf_record_dup(const struct btf_record *rec);
bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
void bpf_obj_free_task_work(const struct btf_record *rec, void *obj);
void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);

struct bpf_map *bpf_map_get(u32 ufd);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);

/*
 * The __bpf_map_get() and __btf_get_by_fd() functions parse a file
 * descriptor and return a corresponding map or btf object.
 * Their names are double underscored to emphasize the fact that they
 * do not increase refcnt. To also increase refcnt use corresponding
 * bpf_map_get() and btf_get_by_fd() functions.
 */

static inline struct bpf_map *__bpf_map_get(struct fd f)
{
        if (fd_empty(f))
                return ERR_PTR(-EBADF);
        if (unlikely(fd_file(f)->f_op != &bpf_map_fops))
                return ERR_PTR(-EINVAL);
        return fd_file(f)->private_data;
}

static inline struct btf *__btf_get_by_fd(struct fd f)
{
        if (fd_empty(f))
                return ERR_PTR(-EBADF);
        if (unlikely(fd_file(f)->f_op != &btf_fops))
                return ERR_PTR(-EINVAL);
        return fd_file(f)->private_data;
}

void bpf_map_inc(struct bpf_map *map);
void bpf_map_inc_with_uref(struct bpf_map *map);
struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
void *bpf_map_area_alloc(u64 size, int numa_node);
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
void bpf_map_area_free(void *base);
bool bpf_map_write_active(const struct bpf_map *map);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
int  generic_map_lookup_batch(struct bpf_map *map,
                              const union bpf_attr *attr,
                              union bpf_attr __user *uattr);
int  generic_map_update_batch(struct bpf_map *map, struct file *map_file,
                              const union bpf_attr *attr,
                              union bpf_attr __user *uattr);
int  generic_map_delete_batch(struct bpf_map *map,
                              const union bpf_attr *attr,
                              union bpf_attr __user *uattr);
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);


int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
                        unsigned long nr_pages, struct page **page_array);
#ifdef CONFIG_MEMCG
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
                           int node);
void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
                             int node);
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags);
void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
                       gfp_t flags);
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
                                    size_t align, gfp_t flags);
#else
/*
 * These specialized allocators have to be macros for their allocations to be
 * accounted separately (to have separate alloc_tag).
 *
 * Used when CONFIG_MEMCG is off: each macro ignores _map and forwards the
 * remaining arguments to the matching generic allocator.
 */
#define bpf_map_kmalloc_node(_map, _size, _flags, _node)        \
                kmalloc_node(_size, _flags, _node)
#define bpf_map_kmalloc_nolock(_map, _size, _flags, _node)        \
                kmalloc_nolock(_size, _flags, _node)
#define bpf_map_kzalloc(_map, _size, _flags)                        \
                kzalloc(_size, _flags)
#define bpf_map_kvcalloc(_map, _n, _size, _flags)                \
                kvcalloc(_n, _size, _flags)
#define bpf_map_alloc_percpu(_map, _size, _align, _flags)        \
                __alloc_percpu_gfp(_size, _align, _flags)
#endif

/* Allocate the per-CPU element counter of @map.
 *
 * The counter is sized and aligned to sizeof(*map->elem_count) and is
 * allocated with GFP_USER | __GFP_NOWARN.
 *
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
static inline int bpf_map_init_elem_count(struct bpf_map *map)
{
        const size_t sz = sizeof(*map->elem_count);
        gfp_t gfp = GFP_USER | __GFP_NOWARN;

        map->elem_count = bpf_map_alloc_percpu(map, sz, sz, gfp);

        return map->elem_count ? 0 : -ENOMEM;
}

/* Release the per-CPU element counter of @map via free_percpu(). */
static inline void bpf_map_free_elem_count(struct bpf_map *map)
{
        free_percpu(map->elem_count);
}

/* Bump the current CPU's slot of @map's per-CPU element counter. */
static inline void
bpf_map_inc_elem_count(struct bpf_map *map)
{
        this_cpu_inc(*map->elem_count);
}

/* Drop the current CPU's slot of @map's per-CPU element counter by one. */
static inline void
bpf_map_dec_elem_count(struct bpf_map *map)
{
        this_cpu_dec(*map->elem_count);
}

extern int sysctl_unprivileged_bpf_disabled;

bool bpf_token_capable(const struct bpf_token *token, int cap);

/* True when @token is capable of CAP_PERFMON per bpf_token_capable(). */
static inline bool
bpf_allow_ptr_leaks(const struct bpf_token *token)
{
        return bpf_token_capable(token, CAP_PERFMON);
}

/* True when @token is capable of CAP_PERFMON per bpf_token_capable(). */
static inline bool
bpf_allow_uninit_stack(const struct bpf_token *token)
{
        return bpf_token_capable(token, CAP_PERFMON);
}

static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
{
        return bpf_jit_bypass_spec_v1() ||
                cpu_mitigations_off() ||
                bpf_token_capable(token, CAP_PERFMON);
}

static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
{
        return bpf_jit_bypass_spec_v4() ||
                cpu_mitigations_off() ||
                bpf_token_capable(token, CAP_PERFMON);
}

int bpf_map_new_fd(struct bpf_map *map, int flags);
int bpf_prog_new_fd(struct bpf_prog *prog);

void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
                   const struct bpf_link_ops *ops, struct bpf_prog *prog,
                   enum bpf_attach_type attach_type);
void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
                             const struct bpf_link_ops *ops, struct bpf_prog *prog,
                             enum bpf_attach_type attach_type, bool sleepable);
int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
int bpf_link_settle(struct bpf_link_primer *primer);
void bpf_link_cleanup(struct bpf_link_primer *primer);
void bpf_link_inc(struct bpf_link *link);
struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link);
void bpf_link_put(struct bpf_link *link);
int bpf_link_new_fd(struct bpf_link *link);
struct bpf_link *bpf_link_get_from_fd(u32 ufd);
struct bpf_link *bpf_link_get_curr_or_next(u32 *id);

void bpf_token_inc(struct bpf_token *token);
void bpf_token_put(struct bpf_token *token);
int bpf_token_create(union bpf_attr *attr);
struct bpf_token *bpf_token_get_from_fd(u32 ufd);
int bpf_token_get_info_by_fd(struct bpf_token *token,
                             const union bpf_attr *attr,
                             union bpf_attr __user *uattr);

bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
bool bpf_token_allow_prog_type(const struct bpf_token *token,
                               enum bpf_prog_type prog_type,
                               enum bpf_attach_type attach_type);

int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
                            umode_t mode);

/* Name prefix shared by the functions DEFINE_BPF_ITER_FUNC() generates. */
#define BPF_ITER_FUNC_PREFIX "bpf_iter_"
/* Declare and define an __init function bpf_iter_<target>(args) whose body
 * just returns 0.
 */
#define DEFINE_BPF_ITER_FUNC(target, args...)                        \
        extern int bpf_iter_ ## target(args);                        \
        int __init bpf_iter_ ## target(args) { return 0; }

/*
 * The task type of iterators.
 *
 * For BPF task iterators, they can be parameterized with various
 * parameters to visit only some of tasks.
 *
 * BPF_TASK_ITER_ALL (default)
 *        Iterate over resources of every task.
 *
 * BPF_TASK_ITER_TID
 *        Iterate over resources of a task/tid.
 *
 * BPF_TASK_ITER_TGID
 *        Iterate over resources of every task of a process / task group.
 */
/* Selects which tasks a BPF task iterator visits; BPF_TASK_ITER_ALL is 0
 * and therefore the value of a zero-initialized field.
 */
enum bpf_iter_task_type {
        BPF_TASK_ITER_ALL = 0,
        BPF_TASK_ITER_TID,
        BPF_TASK_ITER_TGID,
};

/* Auxiliary iterator parameters passed to the attach/detach/show_fdinfo/
 * fill_link_info callbacks declared in this header. The members are not a
 * union: map, cgroup and task parameters each have their own storage.
 */
struct bpf_iter_aux_info {
        /* for map_elem iter */
        struct bpf_map *map;

        /* for cgroup iter */
        struct {
                struct cgroup *start; /* starting cgroup */
                enum bpf_cgroup_iter_order order;
        } cgroup;
        /* for task iter: which tasks to visit, plus a pid
         * (presumably the tid/tgid selected by type - confirm)
         */
        struct {
                enum bpf_iter_task_type        type;
                u32 pid;
        } task;
};

typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
                                        union bpf_iter_link_info *linfo,
                                        struct bpf_iter_aux_info *aux);
typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux);
typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux,
                                        struct seq_file *seq);
typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux,
                                         struct bpf_link_info *info);
typedef const struct bpf_func_proto *
(*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id,
                             const struct bpf_prog *prog);

/* Iterator feature bits.
 * NOTE(review): presumably stored in bpf_iter_reg.feature - confirm.
 */
enum bpf_iter_feature {
        BPF_ITER_RESCHED        = BIT(0),
};

/* Capacity of bpf_iter_reg.ctx_arg_info[]. */
#define BPF_ITER_CTX_ARG_MAX 2
/* Registration record for an iterator target, passed to
 * bpf_iter_reg_target() and bpf_iter_unreg_target(): the target name, its
 * callbacks, context argument information and a seq_info pointer.
 * NOTE(review): ctx_arg_info_size is presumably the number of used
 * ctx_arg_info[] entries, and feature a mask of enum bpf_iter_feature
 * bits - confirm.
 */
struct bpf_iter_reg {
        const char *target;
        bpf_iter_attach_target_t attach_target;
        bpf_iter_detach_target_t detach_target;
        bpf_iter_show_fdinfo_t show_fdinfo;
        bpf_iter_fill_link_info_t fill_link_info;
        bpf_iter_get_func_proto_t get_func_proto;
        u32 ctx_arg_info_size;
        u32 feature;
        struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
        const struct bpf_iter_seq_info *seq_info;
};

/*
 * Iterator metadata; struct bpf_iter__bpf_map_elem carries a pointer to it
 * in its @meta member. The seq_file pointer is wrapped in __bpf_md_ptr().
 */
struct bpf_iter_meta {
        __bpf_md_ptr(struct seq_file *, seq);
        u64 session_id;
        u64 seq_num;
};

/*
 * Context layout for the bpf_map_elem iterator: iterator metadata, the map,
 * and key/value pointers. Every member is declared through __bpf_md_ptr().
 */
struct bpf_iter__bpf_map_elem {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct bpf_map *, map);
        __bpf_md_ptr(void *, key);
        __bpf_md_ptr(void *, value);
};

/*
 * Core bpf_iter entry points; only the prototypes are visible here.
 * bpf_iter_reg_target()/bpf_iter_unreg_target() take the same
 * struct bpf_iter_reg descriptor (by name: register/unregister a target --
 * confirm against the implementation).
 */
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info);
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info);
int bpf_iter_prog_supported(struct bpf_prog *prog);
const struct bpf_func_proto *
bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog);
int bpf_iter_new_fd(struct bpf_link *link);
bool bpf_link_is_iter(struct bpf_link *link);
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop);
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
/*
 * The two functions below have the signatures of bpf_iter_show_fdinfo_t and
 * bpf_iter_fill_link_info_t respectively.
 */
void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux,
                              struct seq_file *seq);
int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
                                struct bpf_link_info *info);

/* Verifier-side helper operating on caller/callee function states. */
int map_set_for_each_callback_args(struct bpf_verifier_env *env,
                                   struct bpf_func_state *caller,
                                   struct bpf_func_state *callee);

/*
 * Per-CPU hash/array map copy and update entry points; the update variants
 * additionally take a u64 @flags argument.
 */
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
                           u64 flags);
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
                            u64 flags);

/* NOTE(review): @delete presumably removes the entry after extraction -- confirm */
int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete);

/*
 * fd-based array/hash-table map accessors. The update variants take the
 * map's struct file and u64 @map_flags; the lookup variants write their
 * result through a u32 pointer.
 */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
                                 void *key, void *value, u64 map_flags);
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
                                void *key, void *value, u64 map_flags);
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);

/* Syscall-side utility prototypes; definitions are not in this section. */
int bpf_get_file_flag(int flags);
/*
 * NOTE(review): by name, checks that the bytes of a user buffer beyond
 * @expected_size (up to @actual_size) are zero -- confirm.
 */
int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
                             size_t actual_size);

/*
 * Verify correctness of an eBPF program. The program is passed by
 * double pointer (@fp).
 */
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size);

/* Only declared when CONFIG_BPF_JIT_ALWAYS_ON is not set. */
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
#endif

struct btf *bpf_get_btf_vmlinux(void);

/*
 * Map specifics.
 * The forward declarations are sufficient because the prototypes below only
 * use pointers to these types.
 */
struct xdp_frame;
struct sk_buff;
struct bpf_dtab_netdev;
struct bpf_cpu_map_entry;

/* devmap: xdp_frame enqueue paths and sk_buff (generic) redirect paths. */
void __dev_flush(struct list_head *flush_list);
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                    struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
                    struct net_device *dev_rx);
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
                          struct bpf_map *map, bool exclude_ingress);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                             const struct bpf_prog *xdp_prog);
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
                           const struct bpf_prog *xdp_prog,
                           struct bpf_map *map, bool exclude_ingress);

/* cpumap counterparts: flush, xdp_frame enqueue, and sk_buff redirect. */
void __cpu_map_flush(struct list_head *flush_list);
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
                    struct net_device *dev_rx);
int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
                             struct sk_buff *skb);

/*
 * NUMA node requested for a map: attr->numa_node when BPF_F_NUMA_NODE is
 * set in attr->map_flags, NUMA_NO_NODE otherwise.
 */
static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
{
        if (attr->map_flags & BPF_F_NUMA_NODE)
                return attr->numa_node;

        return NUMA_NO_NODE;
}

/* Prototypes only; a stub of bpf_prog_get_type_path() exists for !CONFIG_BPF_SYSCALL. */
struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
int array_map_alloc_check(union bpf_attr *attr);

/*
 * Per-program-type test-run entry points. All share one signature:
 * the program, a kernel-side attr (@kattr) and the user-space attr (@uattr).
 */
int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
                          union bpf_attr __user *uattr);
int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
                          union bpf_attr __user *uattr);
int bpf_prog_test_run_tracing(struct bpf_prog *prog,
                              const union bpf_attr *kattr,
                              union bpf_attr __user *uattr);
int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
                                     const union bpf_attr *kattr,
                                     union bpf_attr __user *uattr);
int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
                             const union bpf_attr *kattr,
                             union bpf_attr __user *uattr);
int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
                                const union bpf_attr *kattr,
                                union bpf_attr __user *uattr);
int bpf_prog_test_run_nf(struct bpf_prog *prog,
                         const union bpf_attr *kattr,
                         union bpf_attr __user *uattr);
/*
 * BTF-based context access check; called by bpf_tracing_btf_ctx_access()
 * after the generic offset/size/type checks have passed.
 */
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info);

/*
 * Generic tracing context access check. Access is allowed only when
 *  - @off lies in [0, sizeof(__u64) * MAX_BPF_FUNC_ARGS),
 *  - the access is a read (BPF_READ), and
 *  - @off is a multiple of @size.
 * The conditions are evaluated in that order.
 */
static inline bool bpf_tracing_ctx_access(int off, int size,
                                          enum bpf_access_type type)
{
        return off >= 0 &&
               off < sizeof(__u64) * MAX_BPF_FUNC_ARGS &&
               type == BPF_READ &&
               off % size == 0;
}

/*
 * Tracing context access check with BTF validation: the access must first
 * pass bpf_tracing_ctx_access(); only then is btf_ctx_access() consulted.
 */
static inline bool bpf_tracing_btf_ctx_access(int off, int size,
                                              enum bpf_access_type type,
                                              const struct bpf_prog *prog,
                                              struct bpf_insn_access_aux *info)
{
        return bpf_tracing_ctx_access(off, size, type) &&
               btf_ctx_access(off, size, type, prog, info);
}

/*
 * BTF struct access and type matching helpers (prototypes only).
 * btf_struct_access() reports through the @next_btf_id, @flag and
 * @field_name out-pointers; a stub returning -EACCES exists for
 * !CONFIG_BPF_SYSCALL.
 */
int btf_struct_access(struct bpf_verifier_log *log,
                      const struct bpf_reg_state *reg,
                      int off, int size, enum bpf_access_type atype,
                      u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name);
bool btf_struct_ids_match(struct bpf_verifier_log *log,
                          const struct btf *btf, u32 id, int off,
                          const struct btf *need_btf, u32 need_type_id,
                          bool strict);

/* Fills the caller-provided btf_func_model @m from a BTF function prototype. */
int btf_distill_func_proto(struct bpf_verifier_log *log,
                           struct btf *btf,
                           const struct btf_type *func_proto,
                           const char *func_name,
                           struct btf_func_model *m);

/* Forward declaration; only pointers to it are used in this header section. */
struct bpf_reg_state;
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
                         struct btf *btf, const struct btf_type *t);
/*
 * Declaration-tag lookups keyed by @tag_key on type @pt, component @comp_idx.
 * NOTE(review): @last_id presumably lets callers resume a previous search -- confirm.
 */
const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
                                    int comp_idx, const char *tag_key);
int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
                           int comp_idx, const char *tag_key, int last_id);

/* Lookup of programs and links by numeric id. */
struct bpf_prog *bpf_prog_by_id(u32 id);
struct bpf_link *bpf_link_by_id(u32 id);

/*
 * Helper-proto lookup, local-storage teardown and kfunc query prototypes.
 * Most of these have inline stubs in the !CONFIG_BPF_SYSCALL branch.
 */
const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
                                                 const struct bpf_prog *prog);
void bpf_task_storage_free(struct task_struct *task);
void bpf_cgrp_storage_free(struct cgroup *cgroup);
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
const struct btf_func_model *
bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
                         const struct bpf_insn *insn);
/* The resolved address is returned through @func_addr. */
int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
                       u16 btf_fd_idx, u8 **func_addr);

/* Context passed to bpf_core_apply(): a verifier log and the BTF to use. */
struct bpf_core_ctx {
        struct bpf_verifier_log *log;
        const struct btf *btf;
};

/* Prototypes only; semantics are defined by the implementations elsewhere. */
bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
                                const struct bpf_reg_state *reg,
                                const char *field_name, u32 btf_id, const char *suffix);

bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
                               const struct btf *reg_btf, u32 reg_id,
                               const struct btf *arg_btf, u32 arg_id);

/* Operates on relocation @relo (index @relo_idx) and instruction @insn. */
int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
                   int relo_idx, void *insn);

/* True exactly when sysctl_unprivileged_bpf_disabled is zero. */
static inline bool unprivileged_ebpf_enabled(void)
{
        return sysctl_unprivileged_bpf_disabled == 0;
}

/* Not all bpf prog type has the bpf_ctx.
 * For the bpf prog type that has initialized the bpf_ctx,
 * this function can be used to decide if a kernel function
 * is called by a bpf program.
 */
static inline bool has_current_bpf_ctx(void)
{
        return !!current->bpf_ctx;
}

/* Declared notrace; called per program by bpf_prog_inc_misses_counters(). */
void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog);

/* Kernel-side dynptr setup helpers operating on struct bpf_dynptr_kern. */
void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
                     enum bpf_dynptr_type type, u32 offset, u32 size);
void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
/* @write tells whether the faulting access was a write. */
void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip);

#else /* !CONFIG_BPF_SYSCALL */
/* CONFIG_BPF_SYSCALL=n stub: always fails with ERR_PTR(-EOPNOTSUPP). */
static inline struct bpf_prog *
bpf_prog_get(u32 ufd)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * CONFIG_BPF_SYSCALL=n stub: ignores all arguments and returns
 * ERR_PTR(-EOPNOTSUPP).
 */
static inline struct bpf_prog *
bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_prog_add(struct bpf_prog *prog, int i)
{
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_prog_sub(struct bpf_prog *prog, int i)
{
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_prog_put(struct bpf_prog *prog)
{
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_prog_inc(struct bpf_prog *prog)
{
}

/*
 * CONFIG_BPF_SYSCALL=n stub: never hands out a reference; the
 * __must_check result is always ERR_PTR(-EOPNOTSUPP).
 */
static inline struct bpf_prog *__must_check
bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* CONFIG_BPF_SYSCALL=n stub: leaves @link untouched. */
static inline void
bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
              const struct bpf_link_ops *ops, struct bpf_prog *prog,
              enum bpf_attach_type attach_type)
{
}

/* CONFIG_BPF_SYSCALL=n stub: leaves @link untouched. */
static inline void
bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
                        const struct bpf_link_ops *ops, struct bpf_prog *prog,
                        enum bpf_attach_type attach_type, bool sleepable)
{
}

/* CONFIG_BPF_SYSCALL=n stub: always returns -EOPNOTSUPP. */
static inline int
bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
{
        return -EOPNOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: always returns -EOPNOTSUPP. */
static inline int
bpf_link_settle(struct bpf_link_primer *primer)
{
        return -EOPNOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_link_cleanup(struct bpf_link_primer *primer)
{
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_link_inc(struct bpf_link *link)
{
}

/*
 * CONFIG_BPF_SYSCALL=n stub: always returns NULL.
 * NOTE(review): the bpf_prog_inc_not_zero() stub returns an ERR_PTR instead;
 * confirm that callers of this one cope with NULL.
 */
static inline struct bpf_link *
bpf_link_inc_not_zero(struct bpf_link *link)
{
        return NULL;
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_link_put(struct bpf_link *link)
{
}

/* CONFIG_BPF_SYSCALL=n stub: always returns -EOPNOTSUPP. */
static inline int
bpf_obj_get_user(const char __user *pathname, int flags)
{
        return -EOPNOTSUPP;
}

static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
{
        return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_token_inc(struct bpf_token *token)
{
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_token_put(struct bpf_token *token)
{
}

/* CONFIG_BPF_SYSCALL=n stub: always fails with ERR_PTR(-EOPNOTSUPP). */
static inline struct bpf_token *
bpf_token_get_from_fd(u32 ufd)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* CONFIG_BPF_SYSCALL=n stub: always returns -EOPNOTSUPP. */
static inline int
bpf_token_get_info_by_fd(struct bpf_token *token,
                         const union bpf_attr *attr,
                         union bpf_attr __user *uattr)
{
        return -EOPNOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: nothing to flush. */
static inline void
__dev_flush(struct list_head *flush_list)
{
}

/* Forward declarations for the pointer parameters of the stubs below. */
struct xdp_frame;
struct bpf_dtab_netdev;
struct bpf_cpu_map_entry;

/* CONFIG_BPF_SYSCALL=n stub: ignores the frame and returns 0. */
static inline int
dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                struct net_device *dev_rx)
{
        return 0;
}

/* CONFIG_BPF_SYSCALL=n stub: ignores the frame and returns 0. */
static inline int
dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
                struct net_device *dev_rx)
{
        return 0;
}

/* CONFIG_BPF_SYSCALL=n stub: ignores the frame and returns 0. */
static inline int
dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
                      struct bpf_map *map, bool exclude_ingress)
{
        return 0;
}

struct sk_buff;

/* CONFIG_BPF_SYSCALL=n stub: ignores the skb and returns 0. */
static inline int
dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                         const struct bpf_prog *xdp_prog)
{
        return 0;
}

/* CONFIG_BPF_SYSCALL=n stub: ignores the skb and returns 0. */
static inline int
dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
                       const struct bpf_prog *xdp_prog, struct bpf_map *map,
                       bool exclude_ingress)
{
        return 0;
}

/* CONFIG_BPF_SYSCALL=n stub: nothing to flush. */
static inline void
__cpu_map_flush(struct list_head *flush_list)
{
}

/* CONFIG_BPF_SYSCALL=n stub: ignores the frame and returns 0. */
static inline int
cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
                struct net_device *dev_rx)
{
        return 0;
}

/*
 * CONFIG_BPF_SYSCALL=n stub: unlike the neighbouring enqueue stubs, which
 * return 0, this one returns -EOPNOTSUPP.
 */
static inline int
cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb)
{
        return -EOPNOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: always fails with ERR_PTR(-EOPNOTSUPP). */
static inline struct bpf_prog *
bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* CONFIG_BPF_SYSCALL=n stub: test runs are unavailable, returns -ENOTSUPP. */
static inline int
bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
                      union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: test runs are unavailable, returns -ENOTSUPP. */
static inline int
bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
                      union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: test runs are unavailable, returns -ENOTSUPP. */
static inline int
bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr,
                          union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: test runs are unavailable, returns -ENOTSUPP. */
static inline int
bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
                                 const union bpf_attr *kattr,
                                 union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: test runs are unavailable, returns -ENOTSUPP. */
static inline int
bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
                            const union bpf_attr *kattr,
                            union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_map_put(struct bpf_map *map)
{
}

/* CONFIG_BPF_SYSCALL=n stub: no lookup is done, returns ERR_PTR(-ENOTSUPP). */
static inline struct bpf_prog *
bpf_prog_by_id(u32 id)
{
        return ERR_PTR(-ENOTSUPP);
}

/*
 * CONFIG_BPF_SYSCALL=n stub: every access is refused with -EACCES and none
 * of the out-pointers (@next_btf_id, @flag, @field_name) is written.
 */
static inline int
btf_struct_access(struct bpf_verifier_log *log,
                  const struct bpf_reg_state *reg,
                  int off, int size, enum bpf_access_type atype,
                  u32 *next_btf_id, enum bpf_type_flag *flag,
                  const char **field_name)
{
        return -EACCES;
}

/* CONFIG_BPF_SYSCALL=n stub: no helper prototype is available, returns NULL. */
static inline const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id,
                    const struct bpf_prog *prog)
{
        return NULL;
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_task_storage_free(struct task_struct *task)
{
}

/* CONFIG_BPF_SYSCALL=n stub: always false. */
static inline bool
bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
{
        return false;
}

/* CONFIG_BPF_SYSCALL=n stub: no model can be found, returns NULL. */
static inline const struct btf_func_model *
bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn)
{
        return NULL;
}

/* CONFIG_BPF_SYSCALL=n stub: returns -ENOTSUPP and never writes *func_addr. */
static inline int
bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx,
                   u8 **func_addr)
{
        return -ENOTSUPP;
}

/* CONFIG_BPF_SYSCALL=n stub: always false. */
static inline bool
unprivileged_ebpf_enabled(void)
{
        return false;
}

/* CONFIG_BPF_SYSCALL=n stub: always false. */
static inline bool
has_current_bpf_ctx(void)
{
        return false;
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_prog_inc_misses_counter(struct bpf_prog *prog)
{
}

/* CONFIG_BPF_SYSCALL=n stub: does nothing. */
static inline void
bpf_cgrp_storage_free(struct cgroup *cgroup)
{
}

/* CONFIG_BPF_SYSCALL=n stub: leaves @ptr untouched. */
static inline void
bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
                enum bpf_dynptr_type type, u32 offset, u32 size)
{
}

/* CONFIG_BPF_SYSCALL=n stub: leaves @ptr untouched. */
static inline void
bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
{
}

/* CONFIG_BPF_SYSCALL=n stub: leaves @ptr untouched. */
static inline void
bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
{
}

/* CONFIG_BPF_SYSCALL=n stub: nothing is reported. */
static inline void
bpf_prog_report_arena_violation(bool write, unsigned long addr,
                                unsigned long fault_ip)
{
}
#endif /* CONFIG_BPF_SYSCALL */

/*
 * Copy @size bytes from @unsafe_ptr to @dst via copy_from_kernel_nofault().
 *
 * When CONFIG_BPF_EVENTS is not enabled no copy is attempted and the result
 * is -EFAULT. On any negative result @dst is zero-filled for @size bytes.
 *
 * Returns the copy_from_kernel_nofault() result, or -EFAULT as above.
 */
static __always_inline int
bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
{
        int err;

        if (IS_ENABLED(CONFIG_BPF_EVENTS))
                err = copy_from_kernel_nofault(dst, unsafe_ptr, size);
        else
                err = -EFAULT;

        /* never leave stale bytes in the destination on failure */
        if (unlikely(err < 0))
                memset(dst, 0, size);

        return err;
}

/* Takes an array of btf_mod_pair entries together with its length @len. */
void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len);

/*
 * Convenience wrapper around bpf_prog_get_type_dev() with its attach_drv
 * argument fixed to false.
 */
static inline struct bpf_prog *
bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
{
        const bool attach_drv = false;

        return bpf_prog_get_type_dev(ufd, type, attach_drv);
}

/* Takes an array of @len map pointers belonging to @aux. */
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
                          struct bpf_map **used_maps, u32 len);

/* NOTE(review): parameters are unnamed in this prototype; see the definition for their meaning. */
bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool);

/* Program offload / device-bound prototypes (declared unconditionally). */
int bpf_prog_offload_compile(struct bpf_prog *prog);
void bpf_prog_dev_bound_destroy(struct bpf_prog *prog);
int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
                               struct bpf_prog *prog);

int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map);

/* Map element operations for offloaded maps: lookup, update, delete, get_next_key. */
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
int bpf_map_offload_update_elem(struct bpf_map *map,
                                void *key, void *value, u64 flags);
int bpf_map_offload_delete_elem(struct bpf_map *map, void *key);
int bpf_map_offload_get_next_key(struct bpf_map *map,
                                 void *key, void *next_key);

bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map);

/*
 * bpf_offload_dev API: create/destroy, access to the @priv pointer passed at
 * creation, and netdev (un)registration against an offload device.
 */
struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv);
void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev);
void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev);
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
                                    struct net_device *netdev);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
                                       struct net_device *netdev);
bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev);

/* NOTE(review): @new_state presumably mirrors the unprivileged-bpf sysctl value -- confirm */
void unpriv_ebpf_notify(int new_state);

#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
/*
 * Device-bound program support; real implementations are only declared when
 * both CONFIG_NET and CONFIG_BPF_SYSCALL are set (stubs follow in the #else).
 */
int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
                              struct bpf_prog_aux *prog_aux);
void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id);
int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr);
int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog);
void bpf_dev_bound_netdev_unregister(struct net_device *dev);

/* Reports the dev_bound flag stored in @aux. */
static inline bool
bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
{
        return aux->dev_bound;
}

/* Reports the offload_requested flag stored in @aux. */
static inline bool
bpf_prog_is_offloaded(const struct bpf_prog_aux *aux)
{
        return aux->offload_requested;
}

/* Compares two programs; the CONFIG-disabled stub always returns false. */
bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs);

/*
 * A map counts as offloaded when its ops pointer is &bpf_map_offload_ops.
 * The result is annotated as unlikely for the compiler.
 */
static inline bool bpf_map_is_offloaded(struct bpf_map *map)
{
        const bool offloaded = map->ops == &bpf_map_offload_ops;

        return unlikely(offloaded);
}

/* Offloaded-map lifecycle (alloc/free/mem usage) and the syscall test-run entry. */
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
void bpf_map_offload_map_free(struct bpf_map *map);
u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map);
int bpf_prog_test_run_syscall(struct bpf_prog *prog,
                              const union bpf_attr *kattr,
                              union bpf_attr __user *uattr);

/*
 * sockmap syscall-side entry points. The first five have inline stubs in the
 * #else branch for CONFIG_BPF_SYSCALL without CONFIG_NET.
 */
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
int sock_map_bpf_prog_query(const union bpf_attr *attr,
                            union bpf_attr __user *uattr);
int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);

/* Socket-side hooks taking the struct sock directly. */
void sock_map_unhash(struct sock *sk);
void sock_map_destroy(struct sock *sk);
void sock_map_close(struct sock *sk, long timeout);
#else
/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: -EOPNOTSUPP. */
static inline int
bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
                          struct bpf_prog_aux *prog_aux)
{
        return -EOPNOTSUPP;
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: resolves to NULL. */
static inline void *
bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
{
        return NULL;
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: -EOPNOTSUPP. */
static inline int
bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
{
        return -EOPNOTSUPP;
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: -EOPNOTSUPP. */
static inline int
bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
{
        return -EOPNOTSUPP;
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: no-op. */
static inline void
bpf_dev_bound_netdev_unregister(struct net_device *dev)
{
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: always false. */
static inline bool
bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
{
        return false;
}

/*
 * Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: a program
 * is never offloaded.
 *
 * @aux is const-qualified to match the real bpf_prog_is_offloaded() and the
 * bpf_prog_is_dev_bound() stub, so callers holding a const pointer build
 * identically in both configurations.
 */
static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux)
{
        return false;
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: never matches. */
static inline bool
bpf_prog_dev_bound_match(const struct bpf_prog *lhs,
                         const struct bpf_prog *rhs)
{
        return false;
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: always false. */
static inline bool
bpf_map_is_offloaded(struct bpf_map *map)
{
        return false;
}

/*
 * Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set:
 * allocation always fails with ERR_PTR(-EOPNOTSUPP).
 */
static inline struct bpf_map *
bpf_map_offload_map_alloc(union bpf_attr *attr)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: no-op. */
static inline void
bpf_map_offload_map_free(struct bpf_map *map)
{
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: reports 0. */
static inline u64
bpf_map_offload_map_mem_usage(const struct bpf_map *map)
{
        return 0;
}

/* Stub used unless CONFIG_NET and CONFIG_BPF_SYSCALL are both set: -ENOTSUPP. */
static inline int
bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr,
                          union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * sockmap stub for CONFIG_BPF_SYSCALL builds that do not also have CONFIG_NET:
 * returns -EINVAL.
 */
static inline int
sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
{
        return -EINVAL;
}

/*
 * sockmap stub for CONFIG_BPF_SYSCALL builds that do not also have CONFIG_NET:
 * returns -EOPNOTSUPP.
 */
static inline int
sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
        return -EOPNOTSUPP;
}

/*
 * sockmap stub for CONFIG_BPF_SYSCALL builds that do not also have CONFIG_NET:
 * returns -EOPNOTSUPP.
 */
static inline int
sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags)
{
        return -EOPNOTSUPP;
}

/*
 * sockmap stub for CONFIG_BPF_SYSCALL builds that do not also have CONFIG_NET:
 * returns -EINVAL.
 */
static inline int
sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
{
        return -EINVAL;
}

/*
 * sockmap stub for CONFIG_BPF_SYSCALL builds that do not also have CONFIG_NET:
 * returns -EOPNOTSUPP.
 */
static inline int
sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
{
        return -EOPNOTSUPP;
}
#endif /* CONFIG_BPF_SYSCALL */
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */

/*
 * Call bpf_prog_inc_misses_counter() on every program in @array.
 *
 * A NULL @array is accepted and ignored. The walk starts at the first item
 * and stops at the first item whose prog pointer reads as NULL; each prog
 * pointer is fetched with READ_ONCE().
 */
static __always_inline void
bpf_prog_inc_misses_counters(const struct bpf_prog_array *array)
{
        const struct bpf_prog_array_item *cur;
        struct bpf_prog *p;

        if (unlikely(!array))
                return;

        for (cur = &array->items[0]; (p = READ_ONCE(cur->prog)); cur++)
                bpf_prog_inc_misses_counter(p);
}

#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
/*
 * reuseport support; real implementations are declared only when both
 * CONFIG_INET and CONFIG_BPF_SYSCALL are set (stubs follow in the #else).
 */
void bpf_sk_reuseport_detach(struct sock *sk);
int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
                                       void *value);
int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
                                       void *value, u64 map_flags);
#else
/* Stub used unless CONFIG_INET and CONFIG_BPF_SYSCALL are both set: no-op. */
static inline void
bpf_sk_reuseport_detach(struct sock *sk)
{
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Stub for CONFIG_BPF_SYSCALL builds that do not also have CONFIG_INET:
 * returns -EOPNOTSUPP and never writes @value.
 */
static inline int
bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for CONFIG_BPF_SYSCALL builds that do not also have CONFIG_INET:
 * returns -EOPNOTSUPP.
 */
static inline int
bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value,
                                   u64 map_flags)
{
        return -EOPNOTSUPP;
}
#endif /* CONFIG_BPF_SYSCALL */
#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */

#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL)

/*
 * Key lookup and PKCS#7 verification entry points; declared only when both
 * CONFIG_KEYS and CONFIG_BPF_SYSCALL are set (stubs follow in the #else).
 * Note that the user-key serial is a signed 32-bit value.
 */
struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags);
struct bpf_key *bpf_lookup_system_key(u64 id);
void bpf_key_put(struct bpf_key *bkey);
int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
                               struct bpf_dynptr *sig_p,
                               struct bpf_key *trusted_keyring);

#else
static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
{
        return NULL;
}

/* Stub used unless CONFIG_KEYS and CONFIG_BPF_SYSCALL are both set: returns NULL. */
static inline struct bpf_key *
bpf_lookup_system_key(u64 id)
{
        return NULL;
}

/* Stub used unless CONFIG_KEYS and CONFIG_BPF_SYSCALL are both set: no-op. */
static inline void
bpf_key_put(struct bpf_key *bkey)
{
}

/*
 * Stub used unless CONFIG_KEYS and CONFIG_BPF_SYSCALL are both set:
 * no verification is performed, returns -EOPNOTSUPP.
 */
static inline int
bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p,
                           struct bpf_key *trusted_keyring)
{
        return -EOPNOTSUPP;
}
#endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */

/*
 * Verifier prototypes for helper functions called from eBPF programs.
 * Each object below is only declared here (extern); the definitions are
 * not part of this header section.
 */
extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
extern const struct bpf_func_proto bpf_map_delete_elem_proto;
extern const struct bpf_func_proto bpf_map_push_elem_proto;
extern const struct bpf_func_proto bpf_map_pop_elem_proto;
extern const struct bpf_func_proto bpf_map_peek_elem_proto;
extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto;

extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto;
extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_get_stack_sleepable_proto;
extern const struct bpf_func_proto bpf_get_task_stack_proto;
extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
extern const struct bpf_func_proto bpf_get_stack_proto_pe;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto;
extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto;
extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto;
extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
extern const struct bpf_func_proto bpf_spin_lock_proto;
extern const struct bpf_func_proto bpf_spin_unlock_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto;
extern const struct bpf_func_proto bpf_strtol_proto;
extern const struct bpf_func_proto bpf_strtoul_proto;
extern const struct bpf_func_proto bpf_tcp_sock_proto;
extern const struct bpf_func_proto bpf_jiffies64_proto;
extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_event_output_data_proto;
extern const struct bpf_func_proto bpf_ringbuf_output_proto;
extern const struct bpf_func_proto bpf_ringbuf_reserve_proto;
extern const struct bpf_func_proto bpf_ringbuf_submit_proto;
extern const struct bpf_func_proto bpf_ringbuf_discard_proto;
extern const struct bpf_func_proto bpf_ringbuf_query_proto;
extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto;
extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto;
extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto;
extern const struct bpf_func_proto bpf_copy_from_user_proto;
extern const struct bpf_func_proto bpf_snprintf_btf_proto;
extern const struct bpf_func_proto bpf_snprintf_proto;
extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
extern const struct bpf_func_proto bpf_sock_from_file_proto;
extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
extern const struct bpf_func_proto bpf_task_storage_get_recur_proto;
extern const struct bpf_func_proto bpf_task_storage_get_proto;
extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto;
extern const struct bpf_func_proto bpf_task_storage_delete_proto;
extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto;
extern const struct bpf_func_proto bpf_find_vma_proto;
extern const struct bpf_func_proto bpf_loop_proto;
extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
extern const struct bpf_func_proto bpf_set_retval_proto;
extern const struct bpf_func_proto bpf_get_retval_proto;
extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto;
extern const struct bpf_func_proto bpf_cgrp_storage_get_proto;
extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto;

/* Same parameter list and return type as bpf_base_func_proto(). */
const struct bpf_func_proto *tracing_prog_func_proto(
  enum bpf_func_id func_id, const struct bpf_prog *prog);

/*
 * Shared helpers among cBPF and eBPF.
 * The two u64-returning helpers take five u64 arguments (r1..r5).
 */
void bpf_user_rnd_init_once(void);
u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

/*
 * Socket context-access helpers.
 *
 * With CONFIG_NET they are only declared here.  Without CONFIG_NET the inline
 * stubs in the #else branch stand in for them: both *_is_valid_access()
 * checks return false, bpf_sock_convert_ctx_access() returns 0 and
 * bpf_dynptr_from_skb_rdonly() returns -EOPNOTSUPP.
 */
#if defined(CONFIG_NET)
bool bpf_sock_common_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     struct bpf_insn_access_aux *info);
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                              struct bpf_insn_access_aux *info);
u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
                                const struct bpf_insn *si,
                                struct bpf_insn *insn_buf,
                                struct bpf_prog *prog,
                                u32 *target_size);
int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
                               struct bpf_dynptr *ptr);
#else
/* Stub: no access is ever reported as valid. */
static inline bool bpf_sock_common_is_valid_access(int off, int size,
                                                   enum bpf_access_type type,
                                                   struct bpf_insn_access_aux *info)
{
        return false;
}
/* Stub: no access is ever reported as valid. */
static inline bool bpf_sock_is_valid_access(int off, int size,
                                            enum bpf_access_type type,
                                            struct bpf_insn_access_aux *info)
{
        return false;
}
/* Stub: always returns 0 and leaves insn_buf and target_size untouched. */
static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
                                              const struct bpf_insn *si,
                                              struct bpf_insn *insn_buf,
                                              struct bpf_prog *prog,
                                              u32 *target_size)
{
        return 0;
}
/* Stub: always fails with -EOPNOTSUPP and does not touch @ptr. */
static inline int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
                                             struct bpf_dynptr *ptr)
{
        return -EOPNOTSUPP;
}
#endif

#ifdef CONFIG_INET
/*
 * Only defined when CONFIG_INET is set.
 * NOTE(review): presumably the kernel-side context handed to reuseport
 * socket-selection BPF programs; the field meanings are not visible in this
 * header - confirm against the users of this struct.
 */
struct sk_reuseport_kern {
        struct sk_buff *skb;
        struct sock *sk;
        struct sock *selected_sk;
        struct sock *migrating_sk;
        void *data_end;
        u32 hash;
        u32 reuseport_id;
        bool bind_inany;
};
/*
 * CONFIG_INET declarations for the tcp_sock and xdp_sock context-access
 * helpers.  Each pair has the same shape: an *_is_valid_access() check
 * returning bool and a *_convert_ctx_access() function returning u32.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info);

u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog,
                                    u32 *target_size);

bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info);

u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog,
                                    u32 *target_size);
#else
/*
 * !CONFIG_INET stubs: the validity checks always return false and the
 * conversion functions always return 0; no argument is read or written.
 */
static inline bool bpf_tcp_sock_is_valid_access(int off, int size,
                                                enum bpf_access_type type,
                                                struct bpf_insn_access_aux *info)
{
        return false;
}

static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
                                                  const struct bpf_insn *si,
                                                  struct bpf_insn *insn_buf,
                                                  struct bpf_prog *prog,
                                                  u32 *target_size)
{
        return 0;
}
static inline bool bpf_xdp_sock_is_valid_access(int off, int size,
                                                enum bpf_access_type type,
                                                struct bpf_insn_access_aux *info)
{
        return false;
}

static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
                                                  const struct bpf_insn *si,
                                                  struct bpf_insn *insn_buf,
                                                  struct bpf_prog *prog,
                                                  u32 *target_size)
{
        return 0;
}
#endif /* CONFIG_INET */

/*
 * Selector passed as the @t argument of bpf_arch_text_poke().
 * NOTE(review): presumably tells the arch code whether the patched
 * instruction is a call or a jump - confirm against an arch implementation.
 */
enum bpf_text_poke_type {
        BPF_MOD_CALL,
        BPF_MOD_JUMP,
};

/*
 * Architecture text-modification hooks.  Only the prototypes are given here.
 * NOTE(review): addr1/addr2 of bpf_arch_text_poke() are presumably the old
 * and new targets at @ip - confirm against an arch implementation.
 */
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                       void *addr1, void *addr2);

void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
                               struct bpf_prog *new, struct bpf_prog *old);

/* Copy/invalidate @len bytes of text at @dst; bpf_arch_text_copy() returns a pointer. */
void *bpf_arch_text_copy(void *dst, void *src, size_t len);
int bpf_arch_text_invalidate(void *dst, size_t len);

/* struct btf_id_set is only forward-declared here; it is used by pointer. */
struct btf_id_set;
bool btf_id_set_contains(const struct btf_id_set *set, u32 id);

/*
 * Limits for the bprintf helpers.  MAX_BPRINTF_BUF is the size in bytes of
 * bpf_bprintf_buffers::buf.
 * NOTE(review): MAX_BPRINTF_VARARGS is presumably the maximum number of
 * format arguments accepted - confirm against bpf_bprintf_prepare().
 */
#define MAX_BPRINTF_VARARGS                12
#define MAX_BPRINTF_BUF                        1024

/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
 * arguments representation.
 */
#define MAX_BPRINTF_BIN_ARGS        512

/* One scratch buffer set: binary-argument area plus a character buffer. */
struct bpf_bprintf_buffers {
        char bin_args[MAX_BPRINTF_BIN_ARGS];        /* MAX_BPRINTF_BIN_ARGS bytes */
        char buf[MAX_BPRINTF_BUF];                /* MAX_BPRINTF_BUF bytes */
};

/*
 * Argument/result bundle for bpf_bprintf_prepare() and bpf_bprintf_cleanup().
 * NOTE(review): get_bin_args/get_buf presumably are request flags set by the
 * caller, with bin_args/buf filled in by prepare - confirm against the
 * implementation.
 */
struct bpf_bprintf_data {
        u32 *bin_args;
        char *buf;
        bool get_bin_args;
        bool get_buf;
};

/* Prototypes only; the int-returning functions report status via the return value. */
int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
                        u32 num_args, struct bpf_bprintf_data *data);
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data);
int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs);
void bpf_put_buffers(void);

/*
 * BPF program stream API (prototypes only).
 *
 * The bpf_prog_stream_*() functions operate on a program; the
 * bpf_stream_stage_*() functions operate on a caller-provided
 * struct bpf_stream_stage, which bpf_stream_stage_commit() hands to stream
 * @stream_id of @prog.
 */
void bpf_prog_stream_init(struct bpf_prog *prog);
void bpf_prog_stream_free(struct bpf_prog *prog);
int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len);
void bpf_stream_stage_init(struct bpf_stream_stage *ss);
void bpf_stream_stage_free(struct bpf_stream_stage *ss);
/* printf-style: @fmt is argument 2, the variadic arguments start at argument 3. */
__printf(2, 3)
int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...);
int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
                            enum bpf_stream_id stream_id);
int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss);

/*
 * Convenience wrappers that take the stage object itself (an lvalue of type
 * struct bpf_stream_stage) and pass its address to the bpf_stream_stage_*()
 * function of the same purpose.
 */
#define bpf_stream_printk(ss, ...) bpf_stream_stage_printk(&(ss), __VA_ARGS__)
#define bpf_stream_dump_stack(ss) bpf_stream_stage_dump_stack(&(ss))

/*
 * bpf_stream_stage - run @expr between stage setup and teardown.
 * @ss:        stage object (lvalue); its address is taken four times, so the
 *             argument must be free of side effects
 * @prog:      program passed to bpf_stream_stage_commit()
 * @stream_id: stream passed to bpf_stream_stage_commit()
 * @expr:      expression evaluated once, after init and before commit
 *
 * Order of operations: init, @expr, commit, free.  The return value of
 * bpf_stream_stage_commit() is discarded, and because the last statement is
 * the void bpf_stream_stage_free() call, the statement expression itself
 * yields no value.
 */
#define bpf_stream_stage(ss, prog, stream_id, expr)                      \
        ({                                                               \
                bpf_stream_stage_init(&(ss));                            \
                (expr);                                                  \
                bpf_stream_stage_commit(&(ss), (prog), (stream_id));     \
                bpf_stream_stage_free(&(ss));                            \
        })

/*
 * cgroup attach-type get/put pair.  With CONFIG_BPF_LSM the functions are
 * only declared here; without it both are empty inline no-ops.
 */
#ifdef CONFIG_BPF_LSM
void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
void bpf_cgroup_atype_put(int cgroup_atype);
#else
static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {}
static inline void bpf_cgroup_atype_put(int cgroup_atype) {}
#endif /* CONFIG_BPF_LSM */

/* struct key is only forward-declared; this header uses it by pointer. */
struct key;

#ifdef CONFIG_KEYS
/*
 * Pairs a key pointer with a flag; exists only when CONFIG_KEYS is set.
 * NOTE(review): has_ref presumably records whether a reference on @key is
 * held and must be dropped later - confirm against the users.
 */
struct bpf_key {
        struct key *key;
        bool has_ref;
};
#endif /* CONFIG_KEYS */

/* Return true when @type has any bit of MEM_ALLOC set, false otherwise. */
static inline bool type_is_alloc(u32 type)
{
        const bool has_alloc_flag = (type & MEM_ALLOC) != 0;

        return has_alloc_flag;
}

/*
 * Return @flags with __GFP_ACCOUNT OR-ed in when memcg_bpf_enabled() is true;
 * otherwise return @flags unchanged.
 */
static inline gfp_t bpf_memcg_flags(gfp_t flags)
{
        return memcg_bpf_enabled() ? (flags | __GFP_ACCOUNT) : flags;
}

static inline bool bpf_is_subprog(const struct bpf_prog *prog)
{
        return prog->aux->func_idx != 0;
}

/*
 * Prototypes only.  bpf_prog_get_file_line() takes a program and an
 * instruction pointer and has three output parameters (@filep, @linep, @nump).
 * NOTE(review): presumably these receive the source file name, the source
 * line text and the line number for @ip, with a negative return on failure -
 * confirm against the definition.
 */
int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
                           const char **linep, int *nump);
/* NOTE(review): presumably returns NULL when no BPF program is found - confirm. */
struct bpf_prog *bpf_prog_find_from_stack(void);

#endif /* _LINUX_BPF_H */



























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __IPC_NAMESPACE_H__
#define __IPC_NAMESPACE_H__

#include <linux/err.h>
#include <linux/idr.h>
#include <linux/rwsem.h>
#include <linux/notifier.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>

struct user_namespace;

/*
 * Bookkeeping for one set of IPC identifiers: counters, an idr, a key hash
 * table and an rw_semaphore.
 * NOTE(review): rwsem presumably serialises updates to the other members -
 * confirm against the users in ipc/.
 */
struct ipc_ids {
        int in_use;
        unsigned short seq;
        struct rw_semaphore rwsem;
        struct idr ipcs_idr;
        int max_idx;
        int last_idx;        /* For wrap around detection */
#ifdef CONFIG_CHECKPOINT_RESTORE
        /* Present only with CONFIG_CHECKPOINT_RESTORE. */
        int next_id;
#endif
        struct rhashtable key_ht;
};

/*
 * Per-namespace IPC state.  The member prefixes group the fields: sem_*,
 * msg_* and shm_* for the three ipc_ids-backed facilities, mq_* for POSIX
 * message queues, then sysctl tables, ownership and the embedded ns_common.
 * The layout is subject to __randomize_layout, so code must not rely on
 * member order.
 */
struct ipc_namespace {
        /* NOTE(review): presumably one ipc_ids per SysV IPC kind - confirm. */
        struct ipc_ids        ids[3];

        int                sem_ctls[4];
        int                used_sems;

        unsigned int        msg_ctlmax;
        unsigned int        msg_ctlmnb;
        unsigned int        msg_ctlmni;
        struct percpu_counter percpu_msg_bytes;
        struct percpu_counter percpu_msg_hdrs;

        size_t                shm_ctlmax;
        size_t                shm_ctlall;
        unsigned long        shm_tot;
        int                shm_ctlmni;
        /*
         * Defines whether IPC_RMID is forced for _all_ shm segments regardless
         * of shmctl()
         */
        int                shm_rmid_forced;

        struct notifier_block ipcns_nb;

        /* The kern_mount of the mqueuefs sb.  We take a ref on it */
        struct vfsmount        *mq_mnt;

        /* # queues in this ns, protected by mq_lock */
        unsigned int    mq_queues_count;

        /* next fields are set through sysctl */
        unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
        unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
        unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
        unsigned int    mq_msg_default;
        unsigned int    mq_msgsize_default;

        struct ctl_table_set        mq_set;
        struct ctl_table_header        *mq_sysctls;

        struct ctl_table_set        ipc_set;
        struct ctl_table_header        *ipc_sysctls;

        /* user_ns which owns the ipc ns */
        struct user_namespace *user_ns;
        struct ucounts *ucounts;

        struct llist_node mnt_llist;

        /* Embedded common part; to_ipc_ns() maps a pointer to it back to this struct. */
        struct ns_common ns;
} __randomize_layout;

/* Declared here, defined elsewhere. */
extern struct ipc_namespace init_ipc_ns;
extern spinlock_t mq_lock;

/*
 * shm_destroy_orphaned() is a real function only with CONFIG_SYSVIPC;
 * otherwise it is an empty inline no-op.
 */
#ifdef CONFIG_SYSVIPC
extern void shm_destroy_orphaned(struct ipc_namespace *ns);
#else /* CONFIG_SYSVIPC */
static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
#endif /* CONFIG_SYSVIPC */

#ifdef CONFIG_POSIX_MQUEUE
/*
 * Declared only; returns an int status.
 * NOTE(review): presumably 0 on success, as the !CONFIG_POSIX_MQUEUE stub
 * returns - confirm against the definition.
 */
extern int mq_init_ns(struct ipc_namespace *ns);
/*
 * POSIX Message Queue default values:
 *
 * MIN_*: Lowest value an admin can set the maximum unprivileged limit to
 * DFLT_*MAX: Default values for the maximum unprivileged limits
 * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply
 *   an attribute to the open call and the queue must be created
 * HARD_*: Highest value the maximums can be set to.  These are enforced
 *   on CAP_SYS_RESOURCE apps as well making them inviolate (so make them
 *   suitably high)
 *
 * POSIX Requirements:
 *   Per app minimum openable message queues - 8.  This does not map well
 *     to the fact that we limit the number of queues on a per namespace
 *     basis instead of a per app basis.  So, make the default high enough
 *     that no given app should have a hard time opening 8 queues.
 *   Minimum maximum for HARD_MSGMAX - 32767.  I bumped this to 65536.
 *   Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this.  However,
 *     we have run into a situation where running applications in the wild
 *     require this to be at least 5MB, and preferably 10MB, so I set the
 *     value to 16MB in hopes that this user is the worst of the bunch and
 *     the new maximum will handle anyone else.  I may have to revisit this
 *     in the future.
 */
/*
 * DFLT_MSG and DFLT_MSGSIZE carry a U suffix and are therefore unsigned; the
 * other limits are plain int constants.  HARD_MSGSIZEMAX is 16 MiB.
 */
#define DFLT_QUEUESMAX                      256
#define MIN_MSGMAX                        1
#define DFLT_MSG                       10U
#define DFLT_MSGMAX                       10
#define HARD_MSGMAX                    65536
#define MIN_MSGSIZEMAX                      128
#define DFLT_MSGSIZE                     8192U
#define DFLT_MSGSIZEMAX                     8192
#define HARD_MSGSIZEMAX            (16*1024*1024)
#else
/* Without CONFIG_POSIX_MQUEUE the stub does nothing and returns 0. */
static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
#endif

#if defined(CONFIG_IPC_NS)
/*
 * Map a pointer to the ns_common embedded in a struct ipc_namespace (its "ns"
 * member) back to the enclosing ipc_namespace via container_of().
 */
static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
{
        struct ipc_namespace *ipc_ns = container_of(ns, struct ipc_namespace, ns);

        return ipc_ns;
}

/*
 * Declared only (CONFIG_IPC_NS).  Takes the clone flags, a user namespace and
 * an existing ipc namespace, and returns an ipc_namespace pointer.
 * NOTE(review): presumably returns an ERR_PTR() on failure, like the
 * !CONFIG_IPC_NS stub does for CLONE_NEWIPC - confirm against the definition.
 */
extern struct ipc_namespace *copy_ipcs(u64 flags,
        struct user_namespace *user_ns, struct ipc_namespace *ns);

/*
 * Call ns_ref_inc() on @ns when it is non-NULL and hand the same pointer
 * back.  A NULL @ns is returned as is, without touching any counter.
 */
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
        if (!ns)
                return ns;

        ns_ref_inc(ns);
        return ns;
}

/*
 * Return @ns when it is non-NULL and ns_ref_get() on it yields a nonzero
 * result; return NULL for a NULL @ns or when ns_ref_get() yields zero.
 * ns_ref_get() is not called for a NULL @ns.
 */
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
        if (!ns || !ns_ref_get(ns))
                return NULL;

        return ns;
}

/*
 * Declared only (CONFIG_IPC_NS).
 * NOTE(review): presumably drops a reference obtained through get_ipc_ns() or
 * get_ipc_ns_not_zero() - confirm against the definition.
 */
extern void put_ipc_ns(struct ipc_namespace *ns);
#else
/*
 * !CONFIG_IPC_NS stub: a request carrying CLONE_NEWIPC is rejected with
 * ERR_PTR(-EINVAL); any other request gets @ns back unchanged.  @user_ns is
 * not used.
 */
static inline struct ipc_namespace *copy_ipcs(u64 flags,
        struct user_namespace *user_ns, struct ipc_namespace *ns)
{
        if (!(flags & CLONE_NEWIPC))
                return ns;

        return ERR_PTR(-EINVAL);
}

/*
 * !CONFIG_IPC_NS stubs: no reference counting is performed.  Both get helpers
 * return @ns unchanged (including NULL) and put_ipc_ns() does nothing.
 */
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
        return ns;
}

static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
        return ns;
}

static inline void put_ipc_ns(struct ipc_namespace *ns)
{
}
#endif

/*
 * POSIX mqueue sysctl setup/teardown for a namespace.  With
 * CONFIG_POSIX_MQUEUE_SYSCTL the functions are only declared here; without it
 * retire_mq_sysctls() is an empty stub and setup_mq_sysctls() always
 * returns true.
 */
#ifdef CONFIG_POSIX_MQUEUE_SYSCTL

void retire_mq_sysctls(struct ipc_namespace *ns);
bool setup_mq_sysctls(struct ipc_namespace *ns);

#else /* CONFIG_POSIX_MQUEUE_SYSCTL */

static inline void retire_mq_sysctls(struct ipc_namespace *ns)
{
}

static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
{
        return true;
}

#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */

/*
 * SysV IPC sysctl setup/teardown for a namespace.  With CONFIG_SYSVIPC_SYSCTL
 * the functions are only declared here; without it retire_ipc_sysctls() is an
 * empty stub and setup_ipc_sysctls() always returns true.
 */
#ifdef CONFIG_SYSVIPC_SYSCTL

bool setup_ipc_sysctls(struct ipc_namespace *ns);
void retire_ipc_sysctls(struct ipc_namespace *ns);

#else /* CONFIG_SYSVIPC_SYSCTL */

static inline void retire_ipc_sysctls(struct ipc_namespace *ns)
{
}

static inline bool setup_ipc_sysctls(struct ipc_namespace *ns)
{
        return true;
}

#endif /* CONFIG_SYSVIPC_SYSCTL */
#endif








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions of this file
 * Copyright(c) 2016-2017 Intel Deutschland GmbH
 * Copyright (C) 2018, 2020-2025 Intel Corporation
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cfg80211

#if !defined(__RDEV_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
#define __RDEV_OPS_TRACE

#include <linux/tracepoint.h>

#include <linux/rtnetlink.h>
#include <linux/etherdevice.h>
#include <net/cfg80211.h>
#include "core.h"

/* Record slot for one MAC address: an ETH_ALEN-byte u8 array named @entry_mac. */
#define MAC_ENTRY(entry_mac) __array(u8, entry_mac, ETH_ALEN)
/*
 * Fill the @entry_mac slot of the record. A NULL @given_mac clears the slot
 * with eth_zero_addr(); otherwise ETH_ALEN bytes are copied from @given_mac.
 */
#define MAC_ASSIGN(entry_mac, given_mac)                                 \
        do {                                                             \
                if (!(given_mac))                                        \
                        eth_zero_addr(__entry->entry_mac);               \
                else                                                     \
                        memcpy(__entry->entry_mac, given_mac, ETH_ALEN); \
        } while (0)

/* Size in bytes of the wiphy-name buffer stored in a trace record. */
#define MAXNAME                32
/*
 * Record slot for the wiphy name. Sized with MAXNAME (not a literal) so the
 * buffer and the bound used by WIPHY_ASSIGN can never disagree.
 */
#define WIPHY_ENTRY        __array(char, wiphy_name, MAXNAME)
/* Copy wiphy_name(wiphy) into the record; expects a `wiphy` in scope. */
#define WIPHY_ASSIGN        strscpy(__entry->wiphy_name, wiphy_name(wiphy), MAXNAME)
#define WIPHY_PR_FMT        "%s"
#define WIPHY_PR_ARG        __entry->wiphy_name

/* Record slot for the wireless device identifier. */
#define WDEV_ENTRY        __field(u32, id)
/*
 * Store wdev->identifier in the record, or 0 when the in-scope `wdev` is
 * NULL or an error pointer (so it is never dereferenced in that case).
 */
#define WDEV_ASSIGN        (__entry->id) = (IS_ERR_OR_NULL(wdev)         \
                                         ? 0 : wdev->identifier)
#define WDEV_PR_FMT        "wdev(%u)"
#define WDEV_PR_ARG        (__entry->id)

/* Record slots for a network device: its name buffer and interface index. */
#define NETDEV_ENTRY        __array(char, name, IFNAMSIZ) \
                        __field(int, ifindex)
/*
 * Snapshot the in-scope `netdev` into the record: the interface index and
 * the full IFNAMSIZ-byte name buffer.
 */
#define NETDEV_ASSIGN                                           \
        do {                                                    \
                __entry->ifindex = netdev->ifindex;             \
                memcpy(__entry->name, netdev->name, IFNAMSIZ);  \
        } while (0)
#define NETDEV_PR_FMT        "netdev:%s(%d)"
#define NETDEV_PR_ARG        __entry->name, __entry->ifindex

/*
 * Record slots mirroring the mesh configuration fields that get traced.
 * Every slot carries the same name as the `conf` member it is filled from.
 */
#define MESH_CFG_ENTRY                                          \
        __field(u16, dot11MeshRetryTimeout)                     \
        __field(u16, dot11MeshConfirmTimeout)                   \
        __field(u16, dot11MeshHoldingTimeout)                   \
        __field(u16, dot11MeshMaxPeerLinks)                     \
        __field(u8, dot11MeshMaxRetries)                        \
        __field(u8, dot11MeshTTL)                               \
        __field(u8, element_ttl)                                \
        __field(bool, auto_open_plinks)                         \
        __field(u32, dot11MeshNbrOffsetMaxNeighbor)             \
        __field(u8, dot11MeshHWMPmaxPREQretries)                \
        __field(u32, path_refresh_time)                         \
        __field(u32, dot11MeshHWMPactivePathTimeout)            \
        __field(u16, min_discovery_timeout)                     \
        __field(u16, dot11MeshHWMPpreqMinInterval)              \
        __field(u16, dot11MeshHWMPperrMinInterval)              \
        __field(u16, dot11MeshHWMPnetDiameterTraversalTime)     \
        __field(u8, dot11MeshHWMPRootMode)                      \
        __field(u16, dot11MeshHWMPRannInterval)                 \
        __field(bool, dot11MeshGateAnnouncementProtocol)        \
        __field(bool, dot11MeshForwarding)                      \
        __field(s32, rssi_threshold)                            \
        __field(u16, ht_opmode)                                 \
        __field(u32, dot11MeshHWMPactivePathToRootTimeout)      \
        __field(u16, dot11MeshHWMProotInterval)                 \
        __field(u16, dot11MeshHWMPconfirmationInterval)         \
        __field(bool, dot11MeshNolearn)

/* Copy one same-named member from the in-scope `conf` into the record. */
#define MESH_CFG_COPY(field)        __entry->field = conf->field

/* Copy every traced mesh configuration member from `conf` into the record. */
#define MESH_CFG_ASSIGN                                                 \
        do {                                                            \
                MESH_CFG_COPY(dot11MeshRetryTimeout);                   \
                MESH_CFG_COPY(dot11MeshConfirmTimeout);                 \
                MESH_CFG_COPY(dot11MeshHoldingTimeout);                 \
                MESH_CFG_COPY(dot11MeshMaxPeerLinks);                   \
                MESH_CFG_COPY(dot11MeshMaxRetries);                     \
                MESH_CFG_COPY(dot11MeshTTL);                            \
                MESH_CFG_COPY(element_ttl);                             \
                MESH_CFG_COPY(auto_open_plinks);                        \
                MESH_CFG_COPY(dot11MeshNbrOffsetMaxNeighbor);           \
                MESH_CFG_COPY(dot11MeshHWMPmaxPREQretries);             \
                MESH_CFG_COPY(path_refresh_time);                       \
                MESH_CFG_COPY(dot11MeshHWMPactivePathTimeout);          \
                MESH_CFG_COPY(min_discovery_timeout);                   \
                MESH_CFG_COPY(dot11MeshHWMPpreqMinInterval);            \
                MESH_CFG_COPY(dot11MeshHWMPperrMinInterval);            \
                MESH_CFG_COPY(dot11MeshHWMPnetDiameterTraversalTime);   \
                MESH_CFG_COPY(dot11MeshHWMPRootMode);                   \
                MESH_CFG_COPY(dot11MeshHWMPRannInterval);               \
                MESH_CFG_COPY(dot11MeshGateAnnouncementProtocol);       \
                MESH_CFG_COPY(dot11MeshForwarding);                     \
                MESH_CFG_COPY(rssi_threshold);                          \
                MESH_CFG_COPY(ht_opmode);                               \
                MESH_CFG_COPY(dot11MeshHWMPactivePathToRootTimeout);    \
                MESH_CFG_COPY(dot11MeshHWMProotInterval);               \
                MESH_CFG_COPY(dot11MeshHWMPconfirmationInterval);       \
                MESH_CFG_COPY(dot11MeshNolearn);                        \
        } while (0)

/* Record slots describing one channel: band, center frequency and offset. */
#define CHAN_ENTRY __field(enum nl80211_band, band) \
                   __field(u32, center_freq)                \
                   __field(u16, freq_offset)
/*
 * Copy band, center_freq and freq_offset from @chan into the record; all
 * three are stored as 0 when @chan is NULL. @chan is parenthesized on every
 * use so that any pointer expression (e.g. &obj) can be passed safely.
 */
#define CHAN_ASSIGN(chan)                                           \
        do {                                                        \
                if (chan) {                                         \
                        __entry->band = (chan)->band;               \
                        __entry->center_freq = (chan)->center_freq; \
                        __entry->freq_offset = (chan)->freq_offset; \
                } else {                                            \
                        __entry->band = 0;                          \
                        __entry->center_freq = 0;                   \
                        __entry->freq_offset = 0;                   \
                }                                                   \
        } while (0)
/* Printed as "<center_freq>.<freq_offset as three digits>". */
#define CHAN_PR_FMT "band: %d, freq: %u.%03u"
#define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset

/* Record slots describing a channel definition. */
#define CHAN_DEF_ENTRY __field(enum nl80211_band, band)         \
                       __field(u32, control_freq)               \
                       __field(u32, freq_offset)                \
                       __field(u32, width)                      \
                       __field(u32, center_freq1)               \
                       __field(u32, freq1_offset)               \
                       __field(u32, center_freq2)               \
                       __field(u16, punctured)
/*
 * Fill the channel-definition slots from @chandef. When @chandef is NULL or
 * has no channel attached, every slot is stored as 0; otherwise band,
 * control frequency and its offset come from (chandef)->chan and the
 * remaining slots from @chandef itself.
 */
#define CHAN_DEF_ASSIGN(chandef)                                         \
        do {                                                             \
                if (!(chandef) || !(chandef)->chan) {                    \
                        __entry->band = 0;                               \
                        __entry->control_freq = 0;                       \
                        __entry->freq_offset = 0;                        \
                        __entry->width = 0;                              \
                        __entry->center_freq1 = 0;                       \
                        __entry->freq1_offset = 0;                       \
                        __entry->center_freq2 = 0;                       \
                        __entry->punctured = 0;                          \
                } else {                                                 \
                        __entry->band = (chandef)->chan->band;           \
                        __entry->control_freq =                          \
                                (chandef)->chan->center_freq;            \
                        __entry->freq_offset =                           \
                                (chandef)->chan->freq_offset;            \
                        __entry->width = (chandef)->width;               \
                        __entry->center_freq1 = (chandef)->center_freq1; \
                        __entry->freq1_offset = (chandef)->freq1_offset; \
                        __entry->center_freq2 = (chandef)->center_freq2; \
                        __entry->punctured = (chandef)->punctured;       \
                }                                                        \
        } while (0)
/* Argument order in CHAN_DEF_PR_ARG must match the conversions below. */
#define CHAN_DEF_PR_FMT                                                  \
        "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u, punct: 0x%x"
#define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq,            \
                        __entry->freq_offset, __entry->width,            \
                        __entry->center_freq1, __entry->freq1_offset,    \
                        __entry->center_freq2, __entry->punctured

/*
 * Copy the MAC address and kek_len from @fa into the record's macaddr and
 * kek_len slots; a NULL @fa stores a zeroed address and a kek_len of 0.
 * @fa is parenthesized on every use so that any pointer expression
 * (e.g. &obj) can be passed safely.
 */
#define FILS_AAD_ASSIGN(fa)                                               \
        do {                                                              \
                if (fa) {                                                 \
                        ether_addr_copy(__entry->macaddr, (fa)->macaddr); \
                        __entry->kek_len = (fa)->kek_len;                 \
                } else {                                                  \
                        eth_zero_addr(__entry->macaddr);                  \
                        __entry->kek_len = 0;                             \
                }                                                         \
        } while (0)
#define FILS_AAD_PR_FMT                                                   \
        "macaddr: %pM, kek_len: %d"

/*
 * Record slots for the station-info members that get traced. Every slot
 * carries the same name as the `sinfo` member it is filled from.
 */
#define SINFO_ENTRY                                     \
        __field(int, generation)                        \
        __field(u32, connected_time)                    \
        __field(u32, inactive_time)                     \
        __field(u32, rx_bytes)                          \
        __field(u32, tx_bytes)                          \
        __field(u32, rx_packets)                        \
        __field(u32, tx_packets)                        \
        __field(u32, tx_retries)                        \
        __field(u32, tx_failed)                         \
        __field(u32, rx_dropped_misc)                   \
        __field(u32, beacon_loss_count)                 \
        __field(u16, llid)                              \
        __field(u16, plid)                              \
        __field(u8, plink_state)

/* Copy one same-named member from the in-scope `sinfo` into the record. */
#define SINFO_COPY(field)        __entry->field = sinfo->field

/* Copy every traced station-info member from `sinfo` into the record. */
#define SINFO_ASSIGN                                    \
        do {                                            \
                SINFO_COPY(generation);                 \
                SINFO_COPY(connected_time);             \
                SINFO_COPY(inactive_time);              \
                SINFO_COPY(rx_bytes);                   \
                SINFO_COPY(tx_bytes);                   \
                SINFO_COPY(rx_packets);                 \
                SINFO_COPY(tx_packets);                 \
                SINFO_COPY(tx_retries);                 \
                SINFO_COPY(tx_failed);                  \
                SINFO_COPY(rx_dropped_misc);            \
                SINFO_COPY(beacon_loss_count);          \
                SINFO_COPY(llid);                       \
                SINFO_COPY(plid);                       \
                SINFO_COPY(plink_state);                \
        } while (0)

/*
 * Map a truth value to the literal "true" or "false". The whole expansion
 * is parenthesized so that operators next to the macro invocation cannot
 * bind into the conditional expression.
 */
#define BOOL_TO_STR(bo) ((bo) ? "true" : "false")

#define QOS_MAP_ENTRY __field(u8, num_des)                        \
                      __array(u8, dscp_exception,                \
                              2 * IEEE80211_QOS_MAP_MAX_EX)        \
                      __array(u8, up, IEEE80211_QOS_MAP_LEN_MIN)
/*
 * QOS_MAP_ASSIGN - populate the QOS_MAP_ENTRY fields of __entry.
 *
 * @qos_map: pointer to the source map, may be NULL.
 *
 * Without a map the record is filled with zeroes; otherwise num_des is
 * copied and the dscp_exception / up arrays are copied byte-wise using
 * the same sizes the record fields were declared with.
 */
#define QOS_MAP_ASSIGN(qos_map)                                        \
        do {                                                        \
                if (!(qos_map)) {                                \
                        /* nothing supplied: record an empty map */ \
                        __entry->num_des = 0;                        \
                        memset(__entry->dscp_exception, 0,        \
                               2 * IEEE80211_QOS_MAP_MAX_EX);        \
                        memset(__entry->up, 0,                        \
                               IEEE80211_QOS_MAP_LEN_MIN);        \
                } else {                                        \
                        __entry->num_des = (qos_map)->num_des;        \
                        memcpy(__entry->dscp_exception,                \
                               &(qos_map)->dscp_exception,        \
                               2 * IEEE80211_QOS_MAP_MAX_EX);        \
                        memcpy(__entry->up, &(qos_map)->up,        \
                               IEEE80211_QOS_MAP_LEN_MIN);        \
                }                                                \
        } while (0)

/*************************************************************
 *                        wiphy work traces                     *
 *************************************************************/

/*
 * wiphy_work_event - event class for tracepoints that take a wiphy and
 * a wiphy_work item.
 *
 * Records the wiphy (WIPHY_ENTRY/WIPHY_ASSIGN), the work pointer as
 * "instance" and the work's callback as "func".  A NULL work is
 * accepted: func is then recorded as NULL instead of dereferencing it.
 */
DECLARE_EVENT_CLASS(wiphy_work_event,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(void *, instance)
                __field(void *, func)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->instance = work;
                __entry->func = work ? work->func : NULL;
        ),
        TP_printk(WIPHY_PR_FMT " instance=%p func=%pS",
                  WIPHY_PR_ARG, __entry->instance, __entry->func)
);

/* wiphy_work_queue: tracepoint instance of the wiphy_work_event class. */
DEFINE_EVENT(wiphy_work_event, wiphy_work_queue,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work)
);

/* wiphy_work_run: tracepoint instance of the wiphy_work_event class. */
DEFINE_EVENT(wiphy_work_event, wiphy_work_run,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work)
);

/* wiphy_work_cancel: tracepoint instance of the wiphy_work_event class. */
DEFINE_EVENT(wiphy_work_event, wiphy_work_cancel,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work)
);

/* wiphy_work_flush: tracepoint instance of the wiphy_work_event class. */
DEFINE_EVENT(wiphy_work_event, wiphy_work_flush,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work)
);

/*
 * wiphy_delayed_work_queue - records a wiphy work item together with
 * the delay it was queued with.
 *
 * Stores the wiphy, the work pointer ("instance"), the work callback
 * ("func") and the delay.  Unlike the wiphy_work_event class, @work is
 * dereferenced unconditionally, so it must not be NULL.
 *
 * The delay is an unsigned long and is therefore printed with %lu.
 */
TRACE_EVENT(wiphy_delayed_work_queue,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work,
                 unsigned long delay),
        TP_ARGS(wiphy, work, delay),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(void *, instance)
                __field(void *, func)
                __field(unsigned long, delay)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->instance = work;
                __entry->func = work->func;
                __entry->delay = delay;
        ),
        TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%lu",
                  WIPHY_PR_ARG, __entry->instance, __entry->func,
                  __entry->delay)
);

/*
 * wiphy_hrtimer_work_queue - records a wiphy work item together with
 * the ktime_t delay it was queued with.
 *
 * Stores the wiphy, the work pointer ("instance"), the work callback
 * ("func") and the delay.  @work is dereferenced unconditionally, so it
 * must not be NULL.
 *
 * NOTE(review): ktime_t is a signed 64-bit type in the kernel, so the
 * delay is printed with %lld; a negative value would otherwise show up
 * as a huge unsigned number.
 */
TRACE_EVENT(wiphy_hrtimer_work_queue,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work,
                 ktime_t delay),
        TP_ARGS(wiphy, work, delay),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(void *, instance)
                __field(void *, func)
                __field(ktime_t, delay)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->instance = work;
                __entry->func = work->func;
                __entry->delay = delay;
        ),
        TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%lld",
                  WIPHY_PR_ARG, __entry->instance, __entry->func,
                  __entry->delay)
);

/*
 * wiphy_work_worker_start - tracepoint that records only the wiphy it
 * is called with.
 */
TRACE_EVENT(wiphy_work_worker_start,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/*************************************************************
 *                        rdev->ops traces                     *
 *************************************************************/

/*
 * rdev_suspend - records the wiphy and the WoWLAN trigger flags passed
 * to the suspend operation.
 *
 * @wow may be NULL.  valid_wow tells whether a configuration was
 * supplied; when it was not, "(Not configured!)" is printed after
 * "wow".  The individual trigger flags are always printed, so they are
 * explicitly cleared in the NULL case instead of being left with
 * whatever the trace buffer previously contained.
 */
TRACE_EVENT(rdev_suspend,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_wowlan *wow),
        TP_ARGS(wiphy, wow),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(bool, any)
                __field(bool, disconnect)
                __field(bool, magic_pkt)
                __field(bool, gtk_rekey_failure)
                __field(bool, eap_identity_req)
                __field(bool, four_way_handshake)
                __field(bool, rfkill_release)
                __field(bool, valid_wow)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                if (wow) {
                        __entry->any = wow->any;
                        __entry->disconnect = wow->disconnect;
                        __entry->magic_pkt = wow->magic_pkt;
                        __entry->gtk_rekey_failure = wow->gtk_rekey_failure;
                        __entry->eap_identity_req = wow->eap_identity_req;
                        __entry->four_way_handshake = wow->four_way_handshake;
                        __entry->rfkill_release = wow->rfkill_release;
                        __entry->valid_wow = true;
                } else {
                        /* no config: never print unassigned record bytes */
                        __entry->any = false;
                        __entry->disconnect = false;
                        __entry->magic_pkt = false;
                        __entry->gtk_rekey_failure = false;
                        __entry->eap_identity_req = false;
                        __entry->four_way_handshake = false;
                        __entry->rfkill_release = false;
                        __entry->valid_wow = false;
                }
        ),
        TP_printk(WIPHY_PR_FMT ", wow%s - any: %d, disconnect: %d, "
                  "magic pkt: %d, gtk rekey failure: %d, eap identify req: %d, "
                  "four way handshake: %d, rfkill release: %d.",
                  WIPHY_PR_ARG, __entry->valid_wow ? "" : "(Not configured!)",
                  __entry->any, __entry->disconnect, __entry->magic_pkt,
                  __entry->gtk_rekey_failure, __entry->eap_identity_req,
                  __entry->four_way_handshake, __entry->rfkill_release)
);

/*
 * rdev_return_int - records the wiphy and an integer return value,
 * printed as "returned: <ret>".
 */
TRACE_EVENT(rdev_return_int,
        TP_PROTO(struct wiphy *wiphy, int ret),
        TP_ARGS(wiphy, ret),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d", WIPHY_PR_ARG, __entry->ret)
);

/*
 * rdev_scan - records only the wiphy.  The scan request is part of the
 * tracepoint prototype but none of its contents are stored or printed.
 */
TRACE_EVENT(rdev_scan,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_scan_request_int *request),
        TP_ARGS(wiphy, request),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/*
 * wiphy_only_evt - event class for tracepoints whose only argument is
 * the wiphy; nothing else is recorded or printed.
 */
DECLARE_EVENT_CLASS(wiphy_only_evt,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/* rdev_resume: tracepoint instance of the wiphy_only_evt class. */
DEFINE_EVENT(wiphy_only_evt, rdev_resume,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

/* rdev_return_void: tracepoint instance of the wiphy_only_evt class. */
DEFINE_EVENT(wiphy_only_evt, rdev_return_void,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

/*
 * rdev_get_antenna - records the wiphy and the radio index the call was
 * made with, printed as "radio_idx: <n>".
 */
TRACE_EVENT(rdev_get_antenna,
        TP_PROTO(struct wiphy *wiphy, int radio_idx),
        TP_ARGS(wiphy, radio_idx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, radio_idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->radio_idx = radio_idx;
        ),
        TP_printk(WIPHY_PR_FMT ", radio_idx: %d",
                  WIPHY_PR_ARG, __entry->radio_idx)
);

/* rdev_rfkill_poll: tracepoint instance of the wiphy_only_evt class. */
DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

/*
 * wiphy_enabled_evt - event class recording the wiphy and a boolean
 * "enabled" flag, printed as "enabled " or "not enabled ".
 */
DECLARE_EVENT_CLASS(wiphy_enabled_evt,
        TP_PROTO(struct wiphy *wiphy, bool enabled),
        TP_ARGS(wiphy, enabled),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(bool, enabled)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->enabled = enabled;
        ),
        TP_printk(WIPHY_PR_FMT ", %senabled ",
                  WIPHY_PR_ARG, __entry->enabled ? "" : "not ")
);

/* rdev_set_wakeup: tracepoint instance of the wiphy_enabled_evt class. */
DEFINE_EVENT(wiphy_enabled_evt, rdev_set_wakeup,
        TP_PROTO(struct wiphy *wiphy, bool enabled),
        TP_ARGS(wiphy, enabled)
);

/*
 * rdev_add_virtual_intf - records the wiphy, the requested interface
 * name and the interface type.
 *
 * A NULL @name is recorded as the string "<noname>".  The type is
 * printed as its numeric value.
 */
TRACE_EVENT(rdev_add_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, char *name, enum nl80211_iftype type),
        TP_ARGS(wiphy, name, type),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __string(vir_intf_name, name ? name : "<noname>")
                __field(enum nl80211_iftype, type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __assign_str(vir_intf_name);
                __entry->type = type;
        ),
        TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d",
                  WIPHY_PR_ARG, __get_str(vir_intf_name), __entry->type)
);

/*
 * wiphy_wdev_evt - event class for tracepoints taking a wiphy and a
 * wireless_dev; both are recorded through the WIPHY_ and WDEV_ helper
 * macros and printed in that order.
 */
DECLARE_EVENT_CLASS(wiphy_wdev_evt,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * wiphy_wdev_cookie_evt - event class recording a wiphy, a wireless_dev
 * and a 64-bit cookie.
 *
 * The cookie is an unsigned value (u64) and is cast to unsigned long
 * long for printing, so the matching conversion is %llu; a signed
 * conversion would render cookies with the top bit set as negative.
 */
DECLARE_EVENT_CLASS(wiphy_wdev_cookie_evt,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie)
);

/* rdev_return_wdev: tracepoint instance of the wiphy_wdev_evt class. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/* rdev_del_virtual_intf: tracepoint instance of the wiphy_wdev_evt class. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_del_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_change_virtual_intf - records the wiphy, the net_device and the
 * interface type passed to the call; the type is printed numerically.
 */
TRACE_EVENT(rdev_change_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 enum nl80211_iftype type),
        TP_ARGS(wiphy, netdev, type),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(enum nl80211_iftype, type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->type = type;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", type: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->type)
);

/*
 * key_handle - event class for key operations identified by link id,
 * key index, pairwise flag and a MAC address.
 *
 * Records wiphy, netdev, the MAC address (via MAC_ENTRY/MAC_ASSIGN),
 * link_id, key_index and pairwise.  The pairwise flag is printed as
 * "true"/"false" through BOOL_TO_STR and the address with %pM.
 */
DECLARE_EVENT_CLASS(key_handle,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                __field(int, link_id)
                __field(u8, key_index)
                __field(bool, pairwise)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                __entry->link_id = link_id;
                __entry->key_index = key_index;
                __entry->pairwise = pairwise;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key_index: %u, pairwise: %s, mac addr: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
                  __entry->key_index, BOOL_TO_STR(__entry->pairwise),
                  __entry->mac_addr)
);

/* rdev_get_key: tracepoint instance of the key_handle class. */
DEFINE_EVENT(key_handle, rdev_get_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr)
);

/* rdev_del_key: tracepoint instance of the key_handle class. */
DEFINE_EVENT(key_handle, rdev_del_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr)
);

/*
 * rdev_add_key - records the same data as the key_handle class (wiphy,
 * netdev, MAC address, link_id, key_index, pairwise) plus a u8 mode
 * value, which is printed numerically between key_index and pairwise.
 */
TRACE_EVENT(rdev_add_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool pairwise, const u8 *mac_addr, u8 mode),
        TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr, mode),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                __field(int, link_id)
                __field(u8, key_index)
                __field(bool, pairwise)
                __field(u8, mode)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                __entry->link_id = link_id;
                __entry->key_index = key_index;
                __entry->pairwise = pairwise;
                __entry->mode = mode;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key_index: %u, mode: %u, pairwise: %s, "
                  "mac addr: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
                  __entry->key_index, __entry->mode,
                  BOOL_TO_STR(__entry->pairwise), __entry->mac_addr)
);

/*
 * rdev_set_default_key - records wiphy, netdev, link_id, key_index and
 * the unicast/multicast flags; both flags are printed as
 * "true"/"false" through BOOL_TO_STR.
 */
TRACE_EVENT(rdev_set_default_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool unicast, bool multicast),
        TP_ARGS(wiphy, netdev, link_id, key_index, unicast, multicast),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, link_id)
                __field(u8, key_index)
                __field(bool, unicast)
                __field(bool, multicast)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                __entry->key_index = key_index;
                __entry->unicast = unicast;
                __entry->multicast = multicast;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key index: %u, unicast: %s, multicast: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
                  __entry->key_index, BOOL_TO_STR(__entry->unicast),
                  BOOL_TO_STR(__entry->multicast))
);

/*
 * rdev_set_default_mgmt_key - records wiphy, netdev, link_id and the
 * key index passed to the call.
 */
TRACE_EVENT(rdev_set_default_mgmt_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index),
        TP_ARGS(wiphy, netdev, link_id, key_index),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, link_id)
                __field(u8, key_index)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                __entry->key_index = key_index;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->link_id, __entry->key_index)
);

/*
 * rdev_set_default_beacon_key - records wiphy, netdev, link_id and the
 * key index passed to the call; same record layout and output format
 * as rdev_set_default_mgmt_key.
 */
TRACE_EVENT(rdev_set_default_beacon_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index),
        TP_ARGS(wiphy, netdev, link_id, key_index),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, link_id)
                __field(u8, key_index)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                __entry->key_index = key_index;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->link_id, __entry->key_index)
);

/*
 * rdev_start_ap - records the AP settings handed to the start_ap call.
 *
 * Stores wiphy, netdev, the channel definition from settings->chandef,
 * beacon interval, DTIM period, SSID, hidden-SSID mode, WPA versions,
 * privacy flag, auth type, inactivity timeout and the link id taken
 * from settings->beacon.link_id.
 *
 * The ssid buffer is one byte larger than IEEE80211_MAX_SSID_LEN and is
 * zeroed before settings->ssid_len bytes are copied in, so it is
 * NUL-terminated for %s as long as ssid_len does not exceed
 * IEEE80211_MAX_SSID_LEN (not checked here).
 *
 * link_id is stored as unsigned int and therefore printed with %u.
 */
TRACE_EVENT(rdev_start_ap,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ap_settings *settings),
        TP_ARGS(wiphy, netdev, settings),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(int, beacon_interval)
                __field(int, dtim_period)
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
                __field(enum nl80211_hidden_ssid, hidden_ssid)
                __field(u32, wpa_ver)
                __field(bool, privacy)
                __field(enum nl80211_auth_type, auth_type)
                __field(int, inactivity_timeout)
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(&settings->chandef);
                __entry->beacon_interval = settings->beacon_interval;
                __entry->dtim_period = settings->dtim_period;
                __entry->hidden_ssid = settings->hidden_ssid;
                __entry->wpa_ver = settings->crypto.wpa_versions;
                __entry->privacy = settings->privacy;
                __entry->auth_type = settings->auth_type;
                __entry->inactivity_timeout = settings->inactivity_timeout;
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, settings->ssid, settings->ssid_len);
                __entry->link_id = settings->beacon.link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, "
                  CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, "
                  "hidden ssid: %d, wpa versions: %u, privacy: %s, "
                  "auth type: %d, inactivity timeout: %d, link_id: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG,
                  __entry->beacon_interval, __entry->dtim_period,
                  __entry->hidden_ssid, __entry->wpa_ver,
                  BOOL_TO_STR(__entry->privacy), __entry->auth_type,
                  __entry->inactivity_timeout, __entry->link_id)
);

/*
 * rdev_change_beacon - records the beacon update passed to the call.
 *
 * Besides wiphy, netdev and info->beacon.link_id, the record reserves
 * one dynamic byte array per beacon buffer (head, tail, beacon_ies,
 * proberesp_ies, assocresp_ies, probe_resp), each sized by the
 * matching *_len member.  A buffer is copied only when its source
 * pointer is non-NULL; otherwise the reserved array is left unwritten.
 *
 * Only wiphy, netdev and link_id appear in the printed output; the
 * buffers are available in the raw record only.
 */
TRACE_EVENT(rdev_change_beacon,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ap_update *info),
        TP_ARGS(wiphy, netdev, info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, link_id)
                __dynamic_array(u8, head, info->beacon.head_len)
                __dynamic_array(u8, tail, info->beacon.tail_len)
                __dynamic_array(u8, beacon_ies, info->beacon.beacon_ies_len)
                __dynamic_array(u8, proberesp_ies, info->beacon.proberesp_ies_len)
                __dynamic_array(u8, assocresp_ies, info->beacon.assocresp_ies_len)
                __dynamic_array(u8, probe_resp, info->beacon.probe_resp_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = info->beacon.link_id;
                if (info->beacon.head)
                        memcpy(__get_dynamic_array(head),
                               info->beacon.head,
                               info->beacon.head_len);
                if (info->beacon.tail)
                        memcpy(__get_dynamic_array(tail),
                               info->beacon.tail,
                               info->beacon.tail_len);
                if (info->beacon.beacon_ies)
                        memcpy(__get_dynamic_array(beacon_ies),
                               info->beacon.beacon_ies,
                               info->beacon.beacon_ies_len);
                if (info->beacon.proberesp_ies)
                        memcpy(__get_dynamic_array(proberesp_ies),
                               info->beacon.proberesp_ies,
                               info->beacon.proberesp_ies_len);
                if (info->beacon.assocresp_ies)
                        memcpy(__get_dynamic_array(assocresp_ies),
                               info->beacon.assocresp_ies,
                               info->beacon.assocresp_ies_len);
                if (info->beacon.probe_resp)
                        memcpy(__get_dynamic_array(probe_resp),
                               info->beacon.probe_resp,
                               info->beacon.probe_resp_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id:%d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
);

/*
 * rdev_stop_ap - records wiphy, netdev and the link id passed to the
 * call.
 *
 * link_id is an unsigned int in both the prototype and the record, so
 * it is printed with %u.
 */
TRACE_EVENT(rdev_stop_ap,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, netdev, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
);

/*
 * wiphy_netdev_evt - event class for tracepoints taking only a wiphy
 * and a net_device; both are recorded and printed, nothing else.
 */
DECLARE_EVENT_CLASS(wiphy_netdev_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/* rdev_set_rekey_data: tracepoint instance of the wiphy_netdev_evt class. */
DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_get_mesh_config: tracepoint instance of the wiphy_netdev_evt class. */
DEFINE_EVENT(wiphy_netdev_evt, rdev_get_mesh_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_leave_mesh: tracepoint instance of the wiphy_netdev_evt class. */
DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_mesh,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_leave_ibss: tracepoint instance of the wiphy_netdev_evt class. */
DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ibss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_leave_ocb: tracepoint instance of the wiphy_netdev_evt class. */
DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ocb,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_flush_pmksa: tracepoint instance of the wiphy_netdev_evt class. */
DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/*
 * rdev_end_cac - records wiphy, netdev and the link id passed to the
 * call.
 *
 * link_id is an unsigned int in both the prototype and the record, so
 * it is printed with %u.
 */
TRACE_EVENT(rdev_end_cac,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, netdev, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
);

/*
 * station_add_change - event class recording the station parameters
 * passed when a station is added or changed.
 *
 * Fixed-size data: the station MAC, flag masks, listen interval,
 * capability, aid, plink action/state, uapsd queues, max_sp, the
 * opmode notification value and its "used" flag, plus byte copies of
 * the HT and VHT capability structures and the VLAN device name.
 *  - ht_capa, vht_capa and vlan are zeroed first and then overwritten
 *    only when the corresponding pointer in @params is set.
 *
 * Variable-size data: supported_rates, ext_capab, supported_channels
 * and supported_oper_classes are reserved with the length given in
 * @params and copied only when both the pointer and the length are
 * non-zero.
 *
 * Only a subset is printed (MAC, flag masks, listen interval, aid,
 * plink action/state, uapsd queues, vlan); the remaining fields are
 * available in the raw record only.
 */
DECLARE_EVENT_CLASS(station_add_change,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(u32, sta_flags_mask)
                __field(u32, sta_flags_set)
                __field(u32, sta_modify_mask)
                __field(int, listen_interval)
                __field(u16, capability)
                __field(u16, aid)
                __field(u8, plink_action)
                __field(u8, plink_state)
                __field(u8, uapsd_queues)
                __field(u8, max_sp)
                __field(u8, opmode_notif)
                __field(bool, opmode_notif_used)
                __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
                __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
                __array(char, vlan, IFNAMSIZ)
                __dynamic_array(u8, supported_rates,
                                params->link_sta_params.supported_rates_len)
                __dynamic_array(u8, ext_capab, params->ext_capab_len)
                __dynamic_array(u8, supported_channels,
                                params->supported_channels_len)
                __dynamic_array(u8, supported_oper_classes,
                                params->supported_oper_classes_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
                __entry->sta_flags_mask = params->sta_flags_mask;
                __entry->sta_flags_set = params->sta_flags_set;
                __entry->sta_modify_mask = params->sta_modify_mask;
                __entry->listen_interval = params->listen_interval;
                __entry->aid = params->aid;
                __entry->plink_action = params->plink_action;
                __entry->plink_state = params->plink_state;
                __entry->uapsd_queues = params->uapsd_queues;
                memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap));
                if (params->link_sta_params.ht_capa)
                        memcpy(__entry->ht_capa,
                               params->link_sta_params.ht_capa,
                               sizeof(struct ieee80211_ht_cap));
                memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap));
                if (params->link_sta_params.vht_capa)
                        memcpy(__entry->vht_capa,
                               params->link_sta_params.vht_capa,
                               sizeof(struct ieee80211_vht_cap));
                memset(__entry->vlan, 0, sizeof(__entry->vlan));
                if (params->vlan)
                        memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ);
                if (params->link_sta_params.supported_rates &&
                    params->link_sta_params.supported_rates_len)
                        memcpy(__get_dynamic_array(supported_rates),
                               params->link_sta_params.supported_rates,
                               params->link_sta_params.supported_rates_len);
                if (params->ext_capab && params->ext_capab_len)
                        memcpy(__get_dynamic_array(ext_capab),
                               params->ext_capab,
                               params->ext_capab_len);
                if (params->supported_channels &&
                    params->supported_channels_len)
                        memcpy(__get_dynamic_array(supported_channels),
                               params->supported_channels,
                               params->supported_channels_len);
                if (params->supported_oper_classes &&
                    params->supported_oper_classes_len)
                        memcpy(__get_dynamic_array(supported_oper_classes),
                               params->supported_oper_classes,
                               params->supported_oper_classes_len);
                __entry->max_sp = params->max_sp;
                __entry->capability = params->capability;
                __entry->opmode_notif = params->link_sta_params.opmode_notif;
                __entry->opmode_notif_used =
                        params->link_sta_params.opmode_notif_used;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
                  ", station flags mask: 0x%x, station flags set: 0x%x, "
                  "station modify mask: 0x%x, listen interval: %d, aid: %u, "
                  "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac,
                  __entry->sta_flags_mask, __entry->sta_flags_set,
                  __entry->sta_modify_mask, __entry->listen_interval,
                  __entry->aid, __entry->plink_action, __entry->plink_state,
                  __entry->uapsd_queues, __entry->vlan)
);

/* rdev_add_station: tracepoint instance of the station_add_change class. */
DEFINE_EVENT(station_add_change, rdev_add_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params)
);

/* rdev_change_station: tracepoint instance of the station_add_change class. */
DEFINE_EVENT(station_add_change, rdev_change_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params)
);

/*
 * wiphy_netdev_mac_evt - event class for tracepoints taking a wiphy, a
 * net_device and a MAC address; the address is stored as sta_mac and
 * printed with %pM.
 */
DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac)
);

/*
 * station_del - event class recording a station removal request.
 *
 * Stores wiphy, netdev and, from @params, the station MAC, subtype,
 * reason code and link id.  @params is dereferenced unconditionally,
 * so it must not be NULL.
 */
DECLARE_EVENT_CLASS(station_del,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct station_del_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(u8, subtype)
                __field(u16, reason_code)
                __field(int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, params->mac);
                __entry->subtype = params->subtype;
                __entry->reason_code = params->reason_code;
                __entry->link_id = params->link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
                  ", subtype: %u, reason_code: %u, link_id: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac,
                  __entry->subtype, __entry->reason_code,
                  __entry->link_id)
);

/* rdev_del_station: tracepoint instance of the station_del class. */
DEFINE_EVENT(station_del, rdev_del_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct station_del_parameters *params),
        TP_ARGS(wiphy, netdev, params)
);

/* rdev_get_station: tracepoint instance of the wiphy_netdev_mac_evt class. */
DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_get_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac)
);

/* rdev_del_mpath: tracepoint instance of the wiphy_netdev_mac_evt class. */
DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac)
);

/*
 * rdev_dump_station - records wiphy, netdev, a station MAC address and
 * the index passed as @_idx (stored in the record as idx).
 */
TRACE_EVENT(rdev_dump_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *mac),
        TP_ARGS(wiphy, netdev, _idx, mac),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM, idx: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac,
                  __entry->idx)
);

/*
 * rdev_return_int_station_info - records the wiphy, an integer return
 * value and the station statistics copied by SINFO_ASSIGN.
 *
 * SINFO_ASSIGN dereferences @sinfo unconditionally, so it must not be
 * NULL.  Only the wiphy and the return value are printed; the
 * statistics are available in the raw record only.
 */
TRACE_EVENT(rdev_return_int_station_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct station_info *sinfo),
        TP_ARGS(wiphy, ret, sinfo),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                SINFO_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                SINFO_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d" ,
                  WIPHY_PR_ARG, __entry->ret)
);

/*
 * mpath_evt: event class that records the wiphy, the net device and two MAC
 * addresses (dst and next_hop), printed as "destination" and "next hop".
 *
 * Used by the rdev_add_mpath, rdev_change_mpath and rdev_get_mpath events.
 */
DECLARE_EVENT_CLASS(mpath_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(next_hop)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(next_hop, next_hop);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: %pM, next hop: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dst,
                  __entry->next_hop)
);

/* rdev_add_mpath: mpath_evt instance (wiphy, netdev, dst, next_hop). */
DEFINE_EVENT(mpath_evt, rdev_add_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/* rdev_change_mpath: mpath_evt instance (wiphy, netdev, dst, next_hop). */
DEFINE_EVENT(mpath_evt, rdev_change_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/* rdev_get_mpath: mpath_evt instance (wiphy, netdev, dst, next_hop). */
DEFINE_EVENT(mpath_evt, rdev_get_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/*
 * rdev_dump_mpath: records the wiphy, the net device, the index passed as
 * _idx and two MAC addresses (dst and next_hop), and prints all of them.
 *
 * Same fields as the mpath_evt class plus the index, so it is declared as a
 * standalone event.
 */
TRACE_EVENT(rdev_dump_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *dst, u8 *next_hop),
        TP_ARGS(wiphy, netdev, _idx, dst, next_hop),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(next_hop)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(next_hop, next_hop);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: %pM, next hop: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, __entry->dst,
                  __entry->next_hop)
);

/*
 * rdev_get_mpp: records the wiphy, the net device and two MAC addresses
 * (dst and mpp), printed as "destination" and "mpp".
 */
TRACE_EVENT(rdev_get_mpp,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *dst, u8 *mpp),
        TP_ARGS(wiphy, netdev, dst, mpp),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(mpp)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(mpp, mpp);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: %pM"
                  ", mpp: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->dst, __entry->mpp)
);

/*
 * rdev_dump_mpp: records the wiphy, the net device, the index passed as
 * _idx and two MAC addresses (dst and mpp), and prints all of them.
 */
TRACE_EVENT(rdev_dump_mpp,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *dst, u8 *mpp),
        TP_ARGS(wiphy, netdev, _idx, dst, mpp),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(mpp)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(mpp, mpp);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: %pM, mpp: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, __entry->dst,
                  __entry->mpp)
);

/*
 * rdev_return_int_mpath_info: records the wiphy, an integer return value and
 * a field-by-field copy of the mpath_info structure, and prints all of them.
 *
 * pinfo is dereferenced unconditionally in TP_fast_assign, so callers must
 * pass a valid pointer whenever the tracepoint is enabled.
 */
TRACE_EVENT(rdev_return_int_mpath_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct mpath_info *pinfo),
        TP_ARGS(wiphy, ret, pinfo),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(int, generation)
                __field(u32, filled)
                __field(u32, frame_qlen)
                __field(u32, sn)
                __field(u32, metric)
                __field(u32, exptime)
                __field(u32, discovery_timeout)
                __field(u8, discovery_retries)
                __field(u8, flags)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->generation = pinfo->generation;
                __entry->filled = pinfo->filled;
                __entry->frame_qlen = pinfo->frame_qlen;
                __entry->sn = pinfo->sn;
                __entry->metric = pinfo->metric;
                __entry->exptime = pinfo->exptime;
                __entry->discovery_timeout = pinfo->discovery_timeout;
                __entry->discovery_retries = pinfo->discovery_retries;
                __entry->flags = pinfo->flags;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d. mpath info - generation: %d, "
                  "filled: %u, frame qlen: %u, sn: %u, metric: %u, exptime: %u,"
                  " discovery timeout: %u, discovery retries: %u, flags: 0x%x",
                  WIPHY_PR_ARG, __entry->ret, __entry->generation,
                  __entry->filled, __entry->frame_qlen, __entry->sn,
                  __entry->metric, __entry->exptime, __entry->discovery_timeout,
                  __entry->discovery_retries, __entry->flags)
);

/*
 * rdev_return_int_mesh_config: records the wiphy, an integer return value
 * and the mesh configuration captured through the MESH_CFG_ENTRY /
 * MESH_CFG_ASSIGN helper macros (defined outside this excerpt).
 *
 * Only the wiphy and the return value appear in the printed text.
 */
TRACE_EVENT(rdev_return_int_mesh_config,
        TP_PROTO(struct wiphy *wiphy, int ret, struct mesh_config *conf),
        TP_ARGS(wiphy, ret, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                MESH_CFG_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                MESH_CFG_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d",
                  WIPHY_PR_ARG, __entry->ret)
);

/*
 * rdev_update_mesh_config: records the wiphy, the net device, the mesh
 * configuration (via MESH_CFG_ENTRY/MESH_CFG_ASSIGN, defined outside this
 * excerpt) and the u32 mask argument.
 *
 * The printed text contains the wiphy, the net device and the mask; the mesh
 * configuration is stored in the entry but not formatted here.
 */
TRACE_EVENT(rdev_update_mesh_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 mask,
                 const struct mesh_config *conf),
        TP_ARGS(wiphy, netdev, mask, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MESH_CFG_ENTRY
                __field(u32, mask)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MESH_CFG_ASSIGN;
                __entry->mask = mask;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mask: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mask)
);

/*
 * rdev_join_mesh: records the wiphy, the net device and the mesh
 * configuration (via MESH_CFG_ENTRY/MESH_CFG_ASSIGN, defined outside this
 * excerpt).
 *
 * The setup argument is part of the prototype but is not referenced by name
 * in this definition. Only the wiphy and the net device are printed.
 */
TRACE_EVENT(rdev_join_mesh,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const struct mesh_config *conf,
                 const struct mesh_setup *setup),
        TP_ARGS(wiphy, netdev, conf, setup),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MESH_CFG_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MESH_CFG_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * rdev_change_bss: records the wiphy, the net device and five members of
 * the bss_parameters structure (use_cts_prot, use_short_preamble,
 * use_short_slot_time, ap_isolate, ht_opmode), each stored and printed as
 * an int.
 */
TRACE_EVENT(rdev_change_bss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct bss_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, use_cts_prot)
                __field(int, use_short_preamble)
                __field(int, use_short_slot_time)
                __field(int, ap_isolate)
                __field(int, ht_opmode)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->use_cts_prot = params->use_cts_prot;
                __entry->use_short_preamble = params->use_short_preamble;
                __entry->use_short_slot_time = params->use_short_slot_time;
                __entry->ap_isolate = params->ap_isolate;
                __entry->ht_opmode = params->ht_opmode;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", use cts prot: %d, "
                  "use short preamble: %d, use short slot time: %d, "
                  "ap isolate: %d, ht opmode: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->use_cts_prot,
                  __entry->use_short_preamble, __entry->use_short_slot_time,
                  __entry->ap_isolate, __entry->ht_opmode)
);

/*
 * rdev_inform_bss: records the wiphy, the BSSID taken from bss->bssid and
 * the channel taken from bss->channel (via the CHAN_* helper macros defined
 * outside this excerpt), and prints all three.
 */
TRACE_EVENT(rdev_inform_bss,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_bss *bss),
        TP_ARGS(wiphy, bss),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                MAC_ASSIGN(bssid, bss->bssid);
                CHAN_ASSIGN(bss->channel);
        ),
        TP_printk(WIPHY_PR_FMT ", %pM, " CHAN_PR_FMT,
                  WIPHY_PR_ARG, __entry->bssid, CHAN_PR_ARG)
);

/*
 * rdev_set_txq_params: records the wiphy, the net device and the ac, txop,
 * cwmin, cwmax and aifs members of the ieee80211_txq_params structure, and
 * prints all of them. The ac enum value is printed numerically (%d).
 */
TRACE_EVENT(rdev_set_txq_params,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct ieee80211_txq_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(enum nl80211_ac, ac)
                __field(u16, txop)
                __field(u16, cwmin)
                __field(u16, cwmax)
                __field(u8, aifs)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->ac = params->ac;
                __entry->txop = params->txop;
                __entry->cwmin = params->cwmin;
                __entry->cwmax = params->cwmax;
                __entry->aifs = params->aifs;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", ac: %d, txop: %u, cwmin: %u, cwmax: %u, aifs: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ac, __entry->txop,
                  __entry->cwmin, __entry->cwmax, __entry->aifs)
);

/*
 * rdev_libertas_set_mesh_channel: records the wiphy, the net device and the
 * channel passed as chan (via the CHAN_* helper macros defined outside this
 * excerpt), and prints all three.
 */
TRACE_EVENT(rdev_libertas_set_mesh_channel,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct ieee80211_channel *chan),
        TP_ARGS(wiphy, netdev, chan),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_ASSIGN(chan);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_PR_FMT, WIPHY_PR_ARG,
                  NETDEV_PR_ARG, CHAN_PR_ARG)
);

/*
 * rdev_set_monitor_channel: records the wiphy, the net device and the
 * channel definition passed as chandef (via the CHAN_DEF_* helper macros
 * defined outside this excerpt), and prints all three.
 */
TRACE_EVENT(rdev_set_monitor_channel,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, netdev, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * rdev_auth: records the wiphy, the net device, the BSSID of the
 * authentication request and its auth_type, and prints all of them.
 * The auth_type enum value is printed numerically (%d).
 */
TRACE_EVENT(rdev_auth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_auth_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(enum nl80211_auth_type, auth_type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* req->bss may be NULL; record an all-zero BSSID then. */
                if (req->bss)
                        MAC_ASSIGN(bssid, req->bss->bssid);
                else
                        eth_zero_addr(__entry->bssid);
                __entry->auth_type = req->auth_type;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", auth type: %d, bssid: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->auth_type,
                  __entry->bssid)
);

/*
 * rdev_assoc: records the wiphy, the net device and a snapshot of the
 * association request: BSSID, previous BSSID, use_mfp, flags, the request
 * elements, HT/VHT capabilities and their masks, the FILS KEK and nonces,
 * and ext_mld_capa_ops.
 *
 * Only the wiphy, net device, BSSID, previous BSSID, use_mfp and flags are
 * part of the printed text; the remaining fields are stored in the entry
 * but not formatted here.
 */
TRACE_EVENT(rdev_assoc,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_assoc_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                MAC_ENTRY(prev_bssid)
                __field(bool, use_mfp)
                __field(u32, flags)
                /* Variable-length copy of req->ie, req->ie_len bytes. */
                __dynamic_array(u8, elements, req->ie_len)
                __array(u8, ht_capa, sizeof(struct ieee80211_ht_cap))
                __array(u8, ht_capa_mask, sizeof(struct ieee80211_ht_cap))
                __array(u8, vht_capa, sizeof(struct ieee80211_vht_cap))
                __array(u8, vht_capa_mask, sizeof(struct ieee80211_vht_cap))
                __dynamic_array(u8, fils_kek, req->fils_kek_len)
                /*
                 * 2 * FILS_NONCE_LEN bytes when req->fils_nonces is set,
                 * otherwise an empty array.
                 */
                __dynamic_array(u8, fils_nonces,
                                req->fils_nonces ? 2 * FILS_NONCE_LEN : 0)
                __field(u16, ext_mld_capa_ops)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* req->bss may be NULL; record an all-zero BSSID then. */
                if (req->bss)
                        MAC_ASSIGN(bssid, req->bss->bssid);
                else
                        eth_zero_addr(__entry->bssid);
                MAC_ASSIGN(prev_bssid, req->prev_bssid);
                __entry->use_mfp = req->use_mfp;
                __entry->flags = req->flags;
                /* Optional buffers are copied only when the pointer is set. */
                if (req->ie)
                        memcpy(__get_dynamic_array(elements),
                               req->ie, req->ie_len);
                memcpy(__entry->ht_capa, &req->ht_capa, sizeof(req->ht_capa));
                memcpy(__entry->ht_capa_mask, &req->ht_capa_mask,
                       sizeof(req->ht_capa_mask));
                memcpy(__entry->vht_capa, &req->vht_capa, sizeof(req->vht_capa));
                memcpy(__entry->vht_capa_mask, &req->vht_capa_mask,
                       sizeof(req->vht_capa_mask));
                if (req->fils_kek)
                        memcpy(__get_dynamic_array(fils_kek),
                               req->fils_kek, req->fils_kek_len);
                if (req->fils_nonces)
                        memcpy(__get_dynamic_array(fils_nonces),
                               req->fils_nonces, 2 * FILS_NONCE_LEN);
                __entry->ext_mld_capa_ops = req->ext_mld_capa_ops;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
                  ", previous bssid: %pM, use mfp: %s, flags: 0x%x",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
                  __entry->prev_bssid, BOOL_TO_STR(__entry->use_mfp),
                  __entry->flags)
);

/*
 * rdev_deauth: records the wiphy, the net device, the BSSID from
 * req->bssid, the reason code and the local_state_change flag, and prints
 * all of them. local_state_change is printed as an integer (%d).
 */
TRACE_EVENT(rdev_deauth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_deauth_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(u16, reason_code)
                __field(bool, local_state_change)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, req->bssid);
                __entry->reason_code = req->reason_code;
                __entry->local_state_change = req->local_state_change;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM, reason: %u, local_state_change:%d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
                  __entry->reason_code, __entry->local_state_change)
);

/*
 * rdev_disassoc: records the wiphy, the net device, the address from
 * req->ap_addr (stored in the "bssid" field), the reason code and the
 * local_state_change flag, and prints all of them. local_state_change is
 * printed as a string through BOOL_TO_STR.
 */
TRACE_EVENT(rdev_disassoc,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_disassoc_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(u16, reason_code)
                __field(bool, local_state_change)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, req->ap_addr);
                __entry->reason_code = req->reason_code;
                __entry->local_state_change = req->local_state_change;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
                  ", reason: %u, local state change: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
                  __entry->reason_code,
                  BOOL_TO_STR(__entry->local_state_change))
);

/*
 * rdev_mgmt_tx_cancel_wait: records the wiphy, the wireless device (via the
 * WDEV_* helper macros defined outside this excerpt) and a 64-bit cookie,
 * and prints all three.
 */
TRACE_EVENT(rdev_mgmt_tx_cancel_wait,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu ",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_set_power_mgmt: records the wiphy, the net device, the enabled flag
 * and the timeout value.
 *
 * The flag is printed as "enabled" or "not enabled": the format contains
 * "%senabled" and the argument is "" or "not ".
 */
TRACE_EVENT(rdev_set_power_mgmt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 bool enabled, int timeout),
        TP_ARGS(wiphy, netdev, enabled, timeout),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(bool, enabled)
                __field(int, timeout)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->enabled = enabled;
                __entry->timeout = timeout;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %senabled, timeout: %d ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->enabled ? "" : "not ", __entry->timeout)
);

/*
 * rdev_connect: records the wiphy, the net device and these members of the
 * connect parameters: bssid, ssid, auth_type, privacy, crypto.wpa_versions,
 * flags and prev_bssid. All of them are printed.
 */
TRACE_EVENT(rdev_connect,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_connect_params *sme),
        TP_ARGS(wiphy, netdev, sme),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                /* One extra byte so the SSID can be printed with %s. */
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
                __field(enum nl80211_auth_type, auth_type)
                __field(bool, privacy)
                __field(u32, wpa_versions)
                __field(u32, flags)
                MAC_ENTRY(prev_bssid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, sme->bssid);
                /*
                 * Zero the whole buffer first so the copied SSID stays
                 * NUL-terminated.
                 * NOTE(review): assumes sme->ssid_len never exceeds
                 * IEEE80211_MAX_SSID_LEN - confirm callers validate it.
                 */
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, sme->ssid, sme->ssid_len);
                __entry->auth_type = sme->auth_type;
                __entry->privacy = sme->privacy;
                __entry->wpa_versions = sme->crypto.wpa_versions;
                __entry->flags = sme->flags;
                MAC_ASSIGN(prev_bssid, sme->prev_bssid);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
                  ", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, "
                  "flags: 0x%x, previous bssid: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid,
                  __entry->auth_type, BOOL_TO_STR(__entry->privacy),
                  __entry->wpa_versions, __entry->flags, __entry->prev_bssid)
);

/*
 * rdev_update_connect_params: records the wiphy, the net device and the
 * u32 "changed" argument, and prints all three.
 *
 * The sme argument is part of the prototype but none of its members are
 * recorded by this definition.
 */
TRACE_EVENT(rdev_update_connect_params,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_connect_params *sme, u32 changed),
        TP_ARGS(wiphy, netdev, sme, changed),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, changed)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->changed = changed;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", parameters changed: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,  __entry->changed)
);

/*
 * rdev_set_cqm_rssi_config: records the wiphy, the net device, the signed
 * rssi_thold value and the unsigned rssi_hyst value, and prints all four.
 */
TRACE_EVENT(rdev_set_cqm_rssi_config,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev, s32 rssi_thold,
                 u32 rssi_hyst),
        TP_ARGS(wiphy, netdev, rssi_thold, rssi_hyst),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(s32, rssi_thold)
                __field(u32, rssi_hyst)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rssi_thold = rssi_thold;
                __entry->rssi_hyst = rssi_hyst;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                  ", rssi_thold: %d, rssi_hyst: %u ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                 __entry->rssi_thold, __entry->rssi_hyst)
);

/*
 * rdev_set_cqm_rssi_range_config: records the wiphy, the net device and the
 * low/high arguments (stored as rssi_low and rssi_high), and prints them as
 * "range: <low> - <high>".
 */
TRACE_EVENT(rdev_set_cqm_rssi_range_config,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev, s32 low, s32 high),
        TP_ARGS(wiphy, netdev, low, high),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(s32, rssi_low)
                __field(s32, rssi_high)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rssi_low = low;
                __entry->rssi_high = high;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                  ", range: %d - %d ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->rssi_low, __entry->rssi_high)
);

/*
 * rdev_set_cqm_txe_config: records the wiphy, the net device and the rate,
 * pkts and intvl arguments, printed as "rate", "packets" and "interval".
 */
TRACE_EVENT(rdev_set_cqm_txe_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 rate,
                 u32 pkts, u32 intvl),
        TP_ARGS(wiphy, netdev, rate, pkts, intvl),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, rate)
                __field(u32, pkts)
                __field(u32, intvl)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rate = rate;
                __entry->pkts = pkts;
                __entry->intvl = intvl;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", rate: %u, packets: %u, interval: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->rate, __entry->pkts,
                  __entry->intvl)
);

/*
 * rdev_disconnect: records the wiphy, the net device and the 16-bit reason
 * code, and prints all three.
 */
TRACE_EVENT(rdev_disconnect,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u16 reason_code),
        TP_ARGS(wiphy, netdev, reason_code),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, reason_code)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->reason_code = reason_code;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", reason code: %u", WIPHY_PR_ARG,
                  NETDEV_PR_ARG, __entry->reason_code)
);

/*
 * rdev_join_ibss: records the wiphy, the net device and the bssid and ssid
 * members of the IBSS parameters, and prints all of them.
 */
TRACE_EVENT(rdev_join_ibss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ibss_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                /* One extra byte so the SSID can be printed with %s. */
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, params->bssid);
                /*
                 * Zero the whole buffer first so the copied SSID stays
                 * NUL-terminated.
                 * NOTE(review): assumes params->ssid_len never exceeds
                 * IEEE80211_MAX_SSID_LEN - confirm callers validate it.
                 */
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, params->ssid, params->ssid_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM, ssid: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid)
);

/*
 * rdev_join_ocb: records and prints only the wiphy and the net device.
 *
 * The setup argument is part of the prototype but none of its members are
 * recorded by this definition.
 */
TRACE_EVENT(rdev_join_ocb,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const struct ocb_setup *setup),
        TP_ARGS(wiphy, netdev, setup),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * rdev_set_wiphy_params: records the wiphy, the signed radio index and the
 * u32 "changed" argument, and prints all three.
 */
TRACE_EVENT(rdev_set_wiphy_params,
        TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 changed),
        TP_ARGS(wiphy, radio_idx, changed),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, radio_idx)
                __field(u32, changed)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->radio_idx = radio_idx;
                __entry->changed = changed;
        ),
        TP_printk(WIPHY_PR_FMT ", radio_idx: %d, changed: %u",
                  WIPHY_PR_ARG, __entry->radio_idx, __entry->changed)
);

/*
 * wiphy_wdev_link_evt: event class that records the wiphy, the wireless
 * device (via the WDEV_* helper macros defined outside this excerpt) and an
 * unsigned link id, and prints all three.
 */
DECLARE_EVENT_CLASS(wiphy_wdev_link_evt,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, wdev, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
);

/*
 * rdev_get_tx_power: records the wiphy, the wireless device, the signed
 * radio index and the unsigned link id, and prints all four.
 */
TRACE_EVENT(rdev_get_tx_power,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 int radio_idx, unsigned int link_id),
        TP_ARGS(wiphy, wdev, radio_idx, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(int, radio_idx)
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->radio_idx = radio_idx;
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
                  ", radio_idx: %d, link_id: %u",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  __entry->radio_idx, __entry->link_id)
);

/*
 * rdev_set_tx_power: records the wiphy, the wireless device, the signed
 * radio index, the tx power setting type and the mbm value, and prints all
 * of them. The type enum value is printed numerically (%u).
 */
TRACE_EVENT(rdev_set_tx_power,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 int radio_idx, enum nl80211_tx_power_setting type,
                 int mbm),
        TP_ARGS(wiphy, wdev, radio_idx, type, mbm),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(int, radio_idx)
                __field(enum nl80211_tx_power_setting, type)
                __field(int, mbm)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->radio_idx = radio_idx;
                __entry->type = type;
                __entry->mbm = mbm;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
                  ", radio_idx: %d, type: %u, mbm: %d",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  __entry->radio_idx, __entry->type, __entry->mbm)
);

/*
 * rdev_return_int_int: records the wiphy and two integers, func_ret and
 * func_fill, printed as "function returns" and "function filled".
 */
TRACE_EVENT(rdev_return_int_int,
        TP_PROTO(struct wiphy *wiphy, int func_ret, int func_fill),
        TP_ARGS(wiphy, func_ret, func_fill),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, func_ret)
                __field(int, func_fill)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->func_ret = func_ret;
                __entry->func_fill = func_fill;
        ),
        TP_printk(WIPHY_PR_FMT ", function returns: %d, function filled: %d",
                  WIPHY_PR_ARG, __entry->func_ret, __entry->func_fill)
);

#ifdef CONFIG_NL80211_TESTMODE
/*
 * rdev_testmode_cmd: records and prints the wiphy and the wireless device
 * (via the WIPHY_* and WDEV_* helper macros defined outside this excerpt).
 *
 * The two identifiers are separated by ", " in the printed text, as in the
 * other wiphy + wdev events of this file.
 */
TRACE_EVENT(rdev_testmode_cmd,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/* rdev_testmode_dump: records and prints only the wiphy. */
TRACE_EVENT(rdev_testmode_dump,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);
#endif /* CONFIG_NL80211_TESTMODE */

/*
 * Trace event for setting a bitrate mask.
 *
 * Records the wiphy, the netdev, the link id and the peer MAC address.
 * The mask argument is part of the prototype but its contents are not
 * stored in the record.
 */
TRACE_EVENT(rdev_set_bitrate_mask,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 unsigned int link_id,
                 const u8 *peer, const struct cfg80211_bitrate_mask *mask),
        TP_ARGS(wiphy, netdev, link_id, peer, mask),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(unsigned int, link_id)
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                MAC_ASSIGN(peer, peer);
        ),
        /* link_id is unsigned int, so print it with %u rather than %d. */
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %u, peer: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
                  __entry->peer)
);

/*
 * Trace event for an update of management frame registrations.
 *
 * Copies upd->global_stypes and upd->interface_stypes into 16-bit record
 * fields and prints both in hex.
 * NOTE(review): if the source members are wider than 16 bits the upper bits
 * are dropped here - confirm against struct mgmt_frame_regs.
 */
TRACE_EVENT(rdev_update_mgmt_frame_registrations,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct mgmt_frame_regs *upd),
        TP_ARGS(wiphy, wdev, upd),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u16, global_stypes)
                __field(u16, interface_stypes)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->global_stypes = upd->global_stypes;
                __entry->interface_stypes = upd->interface_stypes;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", global: 0x%.2x, intf: 0x%.2x",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  __entry->global_stypes, __entry->interface_stypes)
);

/*
 * Trace event carrying an integer return value together with a tx and an
 * rx value for a wiphy; all three are stored verbatim and printed as
 * decimal numbers.
 */
TRACE_EVENT(rdev_return_int_tx_rx,
        TP_PROTO(struct wiphy *wiphy, int ret, u32 tx, u32 rx),
        TP_ARGS(wiphy, ret, tx, rx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(u32, tx)
                __field(u32, rx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->tx = tx;
                __entry->rx = rx;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d, tx: %u, rx: %u",
                  WIPHY_PR_ARG, __entry->ret, __entry->tx, __entry->rx)
);

/*
 * Trace event carrying four u32 values for a wiphy: tx, tx_max, rx and
 * rx_max. All are stored verbatim and printed as unsigned decimals.
 */
TRACE_EVENT(rdev_return_void_tx_rx,
        TP_PROTO(struct wiphy *wiphy, u32 tx, u32 tx_max,
                 u32 rx, u32 rx_max),
        TP_ARGS(wiphy, tx, tx_max, rx, rx_max),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u32, tx)
                __field(u32, tx_max)
                __field(u32, rx)
                __field(u32, rx_max)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->tx = tx;
                __entry->tx_max = tx_max;
                __entry->rx = rx;
                __entry->rx_max = rx_max;
        ),
        /* No trailing blank in the format, matching the sibling events. */
        TP_printk(WIPHY_PR_FMT ", tx: %u, tx_max: %u, rx: %u, rx_max: %u",
                  WIPHY_PR_ARG, __entry->tx, __entry->tx_max, __entry->rx,
                  __entry->rx_max)
);

/*
 * Trace event for setting the antenna configuration.
 *
 * Records the wiphy, the radio index and the tx/rx values exactly as
 * passed in.
 */
TRACE_EVENT(rdev_set_antenna,
        TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 tx, u32 rx),
        TP_ARGS(wiphy, radio_idx, tx, rx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, radio_idx)
                __field(u32, tx)
                __field(u32, rx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->radio_idx = radio_idx;
                __entry->tx = tx;
                __entry->rx = rx;
        ),
        /* No trailing blank in the format, matching the sibling events. */
        TP_printk(WIPHY_PR_FMT ", radio_idx: %d, tx: %u, rx: %u",
                  WIPHY_PR_ARG, __entry->radio_idx,
                  __entry->tx, __entry->rx)
);

/*
 * Event class for tracepoints that take a wiphy, a netdev and a 64-bit id.
 * The id is stored verbatim and printed as an unsigned decimal.
 */
DECLARE_EVENT_CLASS(wiphy_netdev_id_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u64, id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->id = id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", id: %llu",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->id)
);

/* rdev_sched_scan_start: instance of the wiphy_netdev_id_evt class. */
DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_start,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id)
);

/* rdev_sched_scan_stop: instance of the wiphy_netdev_id_evt class. */
DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_stop,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id)
);

/*
 * Trace event for a TDLS management frame request.
 *
 * Records the wiphy, netdev, peer MAC address, link id, action code, dialog
 * token, status code, peer capability and initiator flag, plus a copy of the
 * len bytes at buf in a dynamic array. Only the first byte of that copy is
 * printed.
 */
TRACE_EVENT(rdev_tdls_mgmt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *peer, int link_id, u8 action_code, u8 dialog_token,
                 u16 status_code, u32 peer_capability,
                 bool initiator, const u8 *buf, size_t len),
        TP_ARGS(wiphy, netdev, peer, link_id, action_code, dialog_token,
                status_code, peer_capability, initiator, buf, len),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(int, link_id)
                __field(u8, action_code)
                __field(u8, dialog_token)
                __field(u16, status_code)
                __field(u32, peer_capability)
                __field(bool, initiator)
                __dynamic_array(u8, buf, len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->link_id = link_id;
                __entry->action_code = action_code;
                __entry->dialog_token = dialog_token;
                __entry->status_code = status_code;
                __entry->peer_capability = peer_capability;
                __entry->initiator = initiator;
                memcpy(__get_dynamic_array(buf), buf, len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM"
                  ", link_id: %d, action_code: %u "
                  "dialog_token: %u, status_code: %u, peer_capability: %u "
                  "initiator: %s buf: %#.2x ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer,
                  __entry->link_id, __entry->action_code, __entry->dialog_token,
                  __entry->status_code, __entry->peer_capability,
                  BOOL_TO_STR(__entry->initiator),
                  /*
                   * The dynamic array is empty when len was 0; do not read
                   * past it, print 0 instead.
                   */
                  __get_dynamic_array_len(buf) ?
                  ((u8 *)__get_dynamic_array(buf))[0] : 0)
);

/*
 * Trace event for a survey dump request; records the wiphy, the netdev and
 * the requested index (_idx, stored in the record as "idx").
 */
TRACE_EVENT(rdev_dump_survey,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx),
        TP_ARGS(wiphy, netdev, _idx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx)
);

/*
 * Trace event carrying an integer return value and the contents of a
 * struct survey_info.
 *
 * Records the channel (via CHAN_ASSIGN), the six time counters, the filled
 * bitmap and the noise value. info is dereferenced unconditionally, i.e.
 * independent of the value of ret.
 * NOTE(review): noise is stored in an s8 field; confirm this matches the
 * width of info->noise.
 */
TRACE_EVENT(rdev_return_int_survey_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct survey_info *info),
        TP_ARGS(wiphy, ret, info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                __field(int, ret)
                __field(u64, time)
                __field(u64, time_busy)
                __field(u64, time_ext_busy)
                __field(u64, time_rx)
                __field(u64, time_tx)
                __field(u64, time_scan)
                __field(u32, filled)
                __field(s8, noise)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(info->channel);
                __entry->ret = ret;
                __entry->time = info->time;
                __entry->time_busy = info->time_busy;
                __entry->time_ext_busy = info->time_ext_busy;
                __entry->time_rx = info->time_rx;
                __entry->time_tx = info->time_tx;
                __entry->time_scan = info->time_scan;
                __entry->filled = info->filled;
                __entry->noise = info->noise;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d, " CHAN_PR_FMT
                  ", channel time: %llu, channel time busy: %llu, "
                  "channel time extension busy: %llu, channel time rx: %llu, "
                  "channel time tx: %llu, scan time: %llu, filled: %u, noise: %d",
                  WIPHY_PR_ARG, __entry->ret, CHAN_PR_ARG,
                  __entry->time, __entry->time_busy,
                  __entry->time_ext_busy, __entry->time_rx,
                  __entry->time_tx, __entry->time_scan,
                  __entry->filled, __entry->noise)
);

/*
 * Trace event for a TDLS operation; records the wiphy, netdev, peer MAC
 * address and the nl80211_tdls_operation value (printed as an integer).
 */
TRACE_EVENT(rdev_tdls_oper,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *peer, enum nl80211_tdls_operation oper),
        TP_ARGS(wiphy, netdev, peer, oper),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(enum nl80211_tdls_operation, oper)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->oper = oper;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, oper: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->oper)
);

/*
 * Event class for PMKSA tracepoints.
 *
 * Of the cfg80211_pmksa argument only the bssid is recorded; it is printed
 * as a MAC address next to the wiphy and netdev.
 */
DECLARE_EVENT_CLASS(rdev_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, pmksa->bssid);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid)
);

/*
 * Trace event for probing a client; records the wiphy, netdev and the peer
 * MAC address.
 */
TRACE_EVENT(rdev_probe_client,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *peer),
        TP_ARGS(wiphy, netdev, peer),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer)
);

/* rdev_set_pmksa: instance of the rdev_pmksa event class. */
DEFINE_EVENT(rdev_pmksa, rdev_set_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa)
);

/* rdev_del_pmksa: instance of the rdev_pmksa event class. */
DEFINE_EVENT(rdev_pmksa, rdev_del_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa)
);

/*
 * Trace event for a remain-on-channel request; records the wiphy, wdev,
 * the channel (via CHAN_ASSIGN) and the requested duration.
 * NOTE(review): the unit of duration is not visible here - confirm with
 * the caller.
 */
TRACE_EVENT(rdev_remain_on_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct ieee80211_channel *chan,
                 unsigned int duration),
        TP_ARGS(wiphy, wdev, chan, duration),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                CHAN_ENTRY
                __field(unsigned int, duration)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                CHAN_ASSIGN(chan);
                __entry->duration = duration;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", duration: %u",
                  WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG, __entry->duration)
);

/*
 * Trace event carrying an integer return value and a 64-bit cookie for a
 * wiphy; both are stored verbatim.
 */
TRACE_EVENT(rdev_return_int_cookie,
        TP_PROTO(struct wiphy *wiphy, int ret, u64 cookie),
        TP_ARGS(wiphy, ret, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d, cookie: %llu",
                  WIPHY_PR_ARG, __entry->ret, __entry->cookie)
);

/*
 * Trace event for cancelling a remain-on-channel request; records the
 * wiphy, wdev and the 64-bit cookie passed in.
 */
TRACE_EVENT(rdev_cancel_remain_on_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * Trace event for a management frame transmission request.
 *
 * From params it records the channel (via CHAN_ASSIGN), the offchan flag,
 * the wait value, and the no_cck and dont_wait_for_ack flags. The frame
 * contents themselves are not stored.
 */
TRACE_EVENT(rdev_mgmt_tx,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_mgmt_tx_params *params),
        TP_ARGS(wiphy, wdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                CHAN_ENTRY
                __field(bool, offchan)
                __field(unsigned int, wait)
                __field(bool, no_cck)
                __field(bool, dont_wait_for_ack)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                CHAN_ASSIGN(params->chan);
                __entry->offchan = params->offchan;
                __entry->wait = params->wait;
                __entry->no_cck = params->no_cck;
                __entry->dont_wait_for_ack = params->dont_wait_for_ack;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", offchan: %s,"
                  " wait: %u, no cck: %s, dont wait for ack: %s",
                  WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG,
                  BOOL_TO_STR(__entry->offchan), __entry->wait,
                  BOOL_TO_STR(__entry->no_cck),
                  BOOL_TO_STR(__entry->dont_wait_for_ack))
);

/*
 * Trace event for transmitting a control port frame.
 *
 * Records the destination MAC address, the protocol, the unencrypted flag
 * and the link id. buf and len are part of the prototype but are not stored.
 * proto is kept as __be16 in the record and converted with be16_to_cpu()
 * only when the event is printed.
 */
TRACE_EVENT(rdev_tx_control_port,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *buf, size_t len, const u8 *dest, __be16 proto,
                 bool unencrypted, int link_id),
        TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dest)
                __field(__be16, proto)
                __field(bool, unencrypted)
                __field(int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dest, dest);
                __entry->proto = proto;
                __entry->unencrypted = unencrypted;
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM,"
                  " proto: 0x%x, unencrypted: %s, link: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dest,
                  be16_to_cpu(__entry->proto),
                  BOOL_TO_STR(__entry->unencrypted),
                  __entry->link_id)
);

/*
 * Trace event for setting the no-ack map; records the wiphy, netdev and the
 * 16-bit noack_map value, printed as an unsigned decimal.
 */
TRACE_EVENT(rdev_set_noack_map,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u16 noack_map),
        TP_ARGS(wiphy, netdev, noack_map),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, noack_map)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->noack_map = noack_map;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", noack_map: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map)
);
/*
 * rdev_get_channel: instance of the wiphy_wdev_link_evt class (declared
 * elsewhere in this file), taking a wiphy, a wdev and a link id.
 */
DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, wdev, link_id)
);

/*
 * Trace event carrying an integer return value and a channel definition.
 *
 * The chandef is only captured when ret is 0; for any other ret,
 * CHAN_DEF_ASSIGN is invoked with a NULL chandef instead, so the passed
 * pointer is never handed to the macro on failure.
 * NOTE(review): presumably CHAN_DEF_ASSIGN(NULL) records an empty/zeroed
 * definition - confirm against the macro.
 */
TRACE_EVENT(rdev_return_chandef,
        TP_PROTO(struct wiphy *wiphy, int ret,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, ret, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                if (ret == 0)
                        CHAN_DEF_ASSIGN(chandef);
                else
                        CHAN_DEF_ASSIGN((struct cfg80211_chan_def *)NULL);
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", ret: %d",
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->ret)
);

/*
 * rdev_start_p2p_device: instance of the wiphy_wdev_evt class (declared
 * elsewhere in this file).
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_start_p2p_device,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_stop_p2p_device: instance of the wiphy_wdev_evt class (declared
 * elsewhere in this file).
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * Trace event for starting NAN; of the cfg80211_nan_conf argument only
 * master_pref and bands are recorded, each in an 8-bit field.
 */
TRACE_EVENT(rdev_start_nan,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_nan_conf *conf),
        TP_ARGS(wiphy, wdev, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
                  ", master preference: %u, bands: 0x%0x",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
                  __entry->bands)
);

/*
 * Trace event for a NAN configuration change; records master_pref and bands
 * from conf (8-bit fields) plus the changes value, which is printed in hex.
 */
TRACE_EVENT(rdev_nan_change_conf,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_nan_conf *conf, u32 changes),
        TP_ARGS(wiphy, wdev, conf, changes),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
                __field(u32, changes)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
                __entry->changes = changes;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
                  ", master preference: %u, bands: 0x%0x, changes: %x",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
                  __entry->bands, __entry->changes)
);

/*
 * rdev_stop_nan: instance of the wiphy_wdev_evt class (declared elsewhere
 * in this file).
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * Trace event for adding a NAN function; records the wiphy, wdev, and the
 * type (stored as u8) and cookie of the function.
 */
TRACE_EVENT(rdev_add_nan_func,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 const struct cfg80211_nan_func *func),
        TP_ARGS(wiphy, wdev, func),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, func_type)
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->func_type = func->type;
                /*
                 * Terminate the statement explicitly like every other event
                 * here instead of relying on the macro expansion to add it.
                 */
                __entry->cookie = func->cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type,
                  __entry->cookie)
);

/*
 * Trace event for deleting a NAN function; records the wiphy, wdev and the
 * 64-bit cookie passed in.
 */
TRACE_EVENT(rdev_del_nan_func,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * Trace event for setting the MAC ACL; of the cfg80211_acl_data argument
 * only acl_policy is recorded (as u32). The address list is not stored.
 */
TRACE_EVENT(rdev_set_mac_acl,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_acl_data *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, acl_policy)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->acl_policy = params->acl_policy;
        ),
        /* acl_policy is stored as u32, so print it with %u rather than %d. */
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", acl policy: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->acl_policy)
);

/*
 * Trace event for updating fast-transition IEs.
 *
 * Records ftie->md and a copy of the ftie->ie_len bytes at ftie->ie in a
 * dynamic array. Only md is printed; the IE copy is available in the raw
 * record only.
 */
TRACE_EVENT(rdev_update_ft_ies,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_update_ft_ies_params *ftie),
        TP_ARGS(wiphy, netdev, ftie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, md)
                __dynamic_array(u8, ie, ftie->ie_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->md = ftie->md;
                memcpy(__get_dynamic_array(ie), ftie->ie, ftie->ie_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", md: 0x%x",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->md)
);

/*
 * Trace event for starting a critical protocol period; records the
 * protocol id (the enum value narrowed to u16, printed in hex) and the
 * duration.
 */
TRACE_EVENT(rdev_crit_proto_start,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 enum nl80211_crit_proto_id protocol, u16 duration),
        TP_ARGS(wiphy, wdev, protocol, duration),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u16, proto)
                __field(u16, duration)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->proto = protocol;
                __entry->duration = duration;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", proto=%x, duration=%u",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->proto, __entry->duration)
);

/*
 * Trace event for stopping a critical protocol period; records only the
 * wiphy and wdev identification.
 */
TRACE_EVENT(rdev_crit_proto_stop,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * Trace event for a channel switch request.
 *
 * Records the target chandef, the radar_required and block_tx flags, the
 * count, the link id, and copies of the beacon and probe-response counter
 * offset arrays (u16 elements, sized by the respective n_counter_offsets_*
 * members). The beacon offsets are copied unconditionally; the probe
 * response offsets only when their count is non-zero. Neither offset array
 * is printed.
 */
TRACE_EVENT(rdev_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_csa_settings *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(bool, radar_required)
                __field(bool, block_tx)
                __field(u8, count)
                __dynamic_array(u16, bcn_ofs, params->n_counter_offsets_beacon)
                __dynamic_array(u16, pres_ofs, params->n_counter_offsets_presp)
                __field(u8, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(&params->chandef);
                __entry->radar_required = params->radar_required;
                __entry->block_tx = params->block_tx;
                __entry->count = params->count;
                memcpy(__get_dynamic_array(bcn_ofs),
                       params->counter_offsets_beacon,
                       params->n_counter_offsets_beacon * sizeof(u16));

                /* probe response offsets are optional */
                if (params->n_counter_offsets_presp)
                        memcpy(__get_dynamic_array(pres_ofs),
                               params->counter_offsets_presp,
                               params->n_counter_offsets_presp * sizeof(u16));
                __entry->link_id = params->link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
                  ", block_tx: %d, count: %u, radar_required: %d, link_id: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
                  __entry->block_tx, __entry->count, __entry->radar_required,
                  __entry->link_id)
);

/*
 * Trace event for setting the QoS map.
 *
 * The record layout and the copy of qos_map are delegated to the
 * QOS_MAP_ENTRY / QOS_MAP_ASSIGN helper macros defined elsewhere in this
 * file. Only the num_des field (presumably declared by QOS_MAP_ENTRY) is
 * printed.
 */
TRACE_EVENT(rdev_set_qos_map,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_qos_map *qos_map),
        TP_ARGS(wiphy, netdev, qos_map),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                QOS_MAP_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                QOS_MAP_ASSIGN(qos_map);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", num_des: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->num_des)
);

/*
 * Trace event for setting the AP channel width; records the wiphy, netdev,
 * the channel definition (via CHAN_DEF_ASSIGN) and the link id.
 */
TRACE_EVENT(rdev_set_ap_chanwidth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 unsigned int link_id,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, netdev, link_id, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->link_id = link_id;
        ),
        /* link_id is unsigned int, so print it with %u rather than %d. */
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
                  __entry->link_id)
);

/*
 * Trace event for adding a TX traffic stream; records the peer MAC address,
 * the TSID, the user priority and the admitted time, all stored verbatim.
 */
TRACE_EVENT(rdev_add_tx_ts,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time),
        TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tsid)
                __field(u8, user_prio)
                __field(u16, admitted_time)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->tsid = tsid;
                __entry->user_prio = user_prio;
                __entry->admitted_time = admitted_time;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, TSID %d, UP %d, time %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer,
                  __entry->tsid, __entry->user_prio, __entry->admitted_time)
);

/*
 * Trace event for deleting a TX traffic stream; records the peer MAC
 * address and the TSID.
 */
TRACE_EVENT(rdev_del_tx_ts,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 tsid, const u8 *peer),
        TP_ARGS(wiphy, netdev, tsid, peer),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tsid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->tsid = tsid;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, TSID %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->tsid)
);

/*
 * Trace event for a TDLS channel switch request; records the peer address,
 * the operating class and the target channel definition.
 */
TRACE_EVENT(rdev_tdls_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *addr, u8 oper_class,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, netdev, addr, oper_class, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(u8, oper_class)
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                /*
                 * oper_class is declared in the record and printed below, so
                 * it must be stored; otherwise the output shows whatever
                 * stale byte was left in the record.
                 */
                __entry->oper_class = oper_class;
                CHAN_DEF_ASSIGN(chandef);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM"
                  " oper class %d, " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->addr,
                  __entry->oper_class, CHAN_DEF_PR_ARG)
);

/*
 * Trace event for cancelling a TDLS channel switch; records the wiphy,
 * netdev and the peer address.
 */
TRACE_EVENT(rdev_tdls_cancel_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *addr),
        TP_ARGS(wiphy, netdev, addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->addr)
);

/*
 * Trace event for setting a PMK.
 *
 * Records the authenticator address (aa), the PMK length, a copy of the PMK
 * bytes and, when pmk_conf->pmk_r0_name is non-NULL, a copy of the
 * WLAN_PMK_NAME_LEN bytes of the PMK-R0 name. The pmk_r0_name dynamic array
 * is always reserved at WLAN_PMK_NAME_LEN; pmk_r0_name_len (0 or
 * WLAN_PMK_NAME_LEN) tells the printer whether it holds valid data.
 */
TRACE_EVENT(rdev_set_pmk,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmk_conf *pmk_conf),

        TP_ARGS(wiphy, netdev, pmk_conf),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(aa)
                __field(u8, pmk_len)
                __field(u8, pmk_r0_name_len)
                __dynamic_array(u8, pmk, pmk_conf->pmk_len)
                __dynamic_array(u8, pmk_r0_name, WLAN_PMK_NAME_LEN)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(aa, pmk_conf->aa);
                __entry->pmk_len = pmk_conf->pmk_len;
                __entry->pmk_r0_name_len =
                pmk_conf->pmk_r0_name ? WLAN_PMK_NAME_LEN : 0;
                memcpy(__get_dynamic_array(pmk), pmk_conf->pmk,
                       pmk_conf->pmk_len);
                /* Copy length is 0 when no PMK-R0 name was supplied. */
                memcpy(__get_dynamic_array(pmk_r0_name), pmk_conf->pmk_r0_name,
                       pmk_conf->pmk_r0_name ? WLAN_PMK_NAME_LEN : 0);
        ),

        /*
         * "%pM" must be followed by a non-alphanumeric character: the
         * kernel's %p handling skips all alphanumerics after the specifier,
         * so "%pM" "pmk_len" would print as "<mac>_len". Separate the two
         * with ", " like the rest of the line.
         */
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM"
                  ", pmk_len=%u, pmk: %s pmk_r0_name: %s", WIPHY_PR_ARG,
                  NETDEV_PR_ARG, __entry->aa, __entry->pmk_len,
                  __print_array(__get_dynamic_array(pmk),
                                __get_dynamic_array_len(pmk), 1),
                  __entry->pmk_r0_name_len ?
                  __print_array(__get_dynamic_array(pmk_r0_name),
                                __get_dynamic_array_len(pmk_r0_name), 1) : "")
);

/*
 * Trace event for deleting a PMK; records the wiphy, netdev and the aa
 * address passed in.
 */
TRACE_EVENT(rdev_del_pmk,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *aa),

        TP_ARGS(wiphy, netdev, aa),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(aa)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(aa, aa);
        ),

        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->aa)
);

/*
 * Trace event for an external authentication request.
 *
 * Records the bssid, the SSID, the status and the MLD address from params.
 * The ssid record buffer is IEEE80211_MAX_SSID_LEN + 1 bytes and is zeroed
 * before the copy, so it stays NUL-terminated for the %s print as long as
 * ssid_len does not exceed IEEE80211_MAX_SSID_LEN.
 * NOTE(review): the copy length is not bounded here; this assumes the caller
 * has already validated params->ssid.ssid_len - confirm.
 */
TRACE_EVENT(rdev_external_auth,
            TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                     struct cfg80211_external_auth_params *params),
            TP_ARGS(wiphy, netdev, params),
            TP_STRUCT__entry(WIPHY_ENTRY
                             NETDEV_ENTRY
                             MAC_ENTRY(bssid)
                             __array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1)
                             __field(u16, status)
                             MAC_ENTRY(mld_addr)
            ),
            TP_fast_assign(WIPHY_ASSIGN;
                           NETDEV_ASSIGN;
                           MAC_ASSIGN(bssid, params->bssid);
                           memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                           memcpy(__entry->ssid, params->ssid.ssid,
                                  params->ssid.ssid_len);
                           __entry->status = params->status;
                           MAC_ASSIGN(mld_addr, params->mld_addr);
            ),
            TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
                      ", ssid: %s, status: %u, mld_addr: %pM",
                      WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
                      __entry->ssid, __entry->status, __entry->mld_addr)
);

/*
 * rdev_start_radar_detection - records the channel definition together with
 * the cac_time_ms and link_id arguments, next to the wiphy and net_device.
 */
TRACE_EVENT(rdev_start_radar_detection,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct cfg80211_chan_def *chandef,
                 u32 cac_time_ms,
                 int link_id),
        TP_ARGS(wiphy, netdev, chandef, cac_time_ms, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(u32, cac_time_ms)
                __field(int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->link_id = link_id;
                __entry->cac_time_ms = cac_time_ms;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
                  ", cac_time_ms=%u, link_id=%d",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  CHAN_DEF_PR_ARG,
                  __entry->cac_time_ms,
                  __entry->link_id)
);

/*
 * rdev_set_mcast_rate - snapshots the caller's per-band mcast_rate table.
 *
 * All NUM_NL80211_BANDS entries are copied into the record; only the
 * 2.4 GHz, 5 GHz, 6 GHz and 60 GHz slots are shown by TP_printk.
 */
TRACE_EVENT(rdev_set_mcast_rate,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 int *mcast_rate),
        TP_ARGS(wiphy, netdev, mcast_rate),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(int, mcast_rate, NUM_NL80211_BANDS)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* the record array has exactly NUM_NL80211_BANDS ints */
                memcpy(__entry->mcast_rate, mcast_rate,
                       sizeof(__entry->mcast_rate));
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
                  "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 6GHz=0x%x, 60GHz=0x%x]",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->mcast_rate[NL80211_BAND_2GHZ],
                  __entry->mcast_rate[NL80211_BAND_5GHZ],
                  __entry->mcast_rate[NL80211_BAND_6GHZ],
                  __entry->mcast_rate[NL80211_BAND_60GHZ])
);

/*
 * rdev_set_coalesce - records how many coalesce rules were supplied.
 *
 * @coalesce may be NULL, in which case n_rules is logged as 0.
 */
TRACE_EVENT(rdev_set_coalesce,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_coalesce *coalesce),
        TP_ARGS(wiphy, coalesce),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, n_rules)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                if (coalesce)
                        __entry->n_rules = coalesce->n_rules;
                else
                        __entry->n_rules = 0;
        ),
        TP_printk(WIPHY_PR_FMT ", n_rules=%d",
                  WIPHY_PR_ARG,
                  __entry->n_rules)
);

/* rdev_abort_scan: an instance of the wiphy_wdev_evt event class. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_set_multicast_to_unicast - logs the @enabled flag for a net_device;
 * the flag is rendered through BOOL_TO_STR() under the "unicast" label.
 */
TRACE_EVENT(rdev_set_multicast_to_unicast,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 const bool enabled),
        TP_ARGS(wiphy, netdev, enabled),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(bool, enabled)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->enabled = enabled;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  BOOL_TO_STR(__entry->enabled))
);

/* rdev_get_txq_stats: an instance of the wiphy_wdev_evt event class. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_get_ftm_responder_stats - snapshots the counters found in @ftm_stats.
 *
 * Each counter of @ftm_stats is copied into a same-purpose field of the
 * record and printed.  The net_device identification is recorded as well
 * but is not part of the printed line.
 *
 * The record also declares a 'timestamp' field for which this event has no
 * source value; it is stored as 0 so the record content is deterministic
 * instead of being left unwritten.
 */
TRACE_EVENT(rdev_get_ftm_responder_stats,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ftm_responder_stats *ftm_stats),

        TP_ARGS(wiphy, netdev, ftm_stats),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u64, timestamp)
                __field(u32, success_num)
                __field(u32, partial_num)
                __field(u32, failed_num)
                __field(u32, asap_num)
                __field(u32, non_asap_num)
                __field(u64, duration)
                __field(u32, unknown_triggers)
                __field(u32, reschedule)
                __field(u32, out_of_window)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->timestamp = 0;
                __entry->success_num = ftm_stats->success_num;
                __entry->partial_num = ftm_stats->partial_num;
                __entry->failed_num = ftm_stats->failed_num;
                __entry->asap_num = ftm_stats->asap_num;
                __entry->non_asap_num = ftm_stats->non_asap_num;
                __entry->duration = ftm_stats->total_duration_ms;
                __entry->unknown_triggers = ftm_stats->unknown_triggers_num;
                __entry->reschedule = ftm_stats->reschedule_requests_num;
                __entry->out_of_window = ftm_stats->out_of_window_triggers_num;
        ),

        /* ", " after the wiphy name, as in every other event of this file */
        TP_printk(WIPHY_PR_FMT ", Ftm responder stats: success %u, partial %u, "
                "failed %u, asap %u, non asap %u, total duration %llu, unknown "
                "triggers %u, rescheduled %u, out of window %u", WIPHY_PR_ARG,
                __entry->success_num, __entry->partial_num, __entry->failed_num,
                __entry->asap_num, __entry->non_asap_num, __entry->duration,
                __entry->unknown_triggers, __entry->reschedule,
                __entry->out_of_window)
);

/* rdev_start_pmsr: an instance of the wiphy_wdev_cookie_evt event class. */
DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_start_pmsr,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev,
                 u64 cookie),
        TP_ARGS(wiphy, wdev, cookie)
);

/* rdev_abort_pmsr: an instance of the wiphy_wdev_cookie_evt event class. */
DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev,
                 u64 cookie),
        TP_ARGS(wiphy, wdev, cookie)
);

/*
 * rdev_set_fils_aad - records a MAC address and a kek_len for @fils_aad.
 *
 * The two record fields are filled by FILS_AAD_ASSIGN() and printed via
 * FILS_AAD_PR_FMT; both helpers are defined outside this excerpt.
 */
TRACE_EVENT(rdev_set_fils_aad,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct cfg80211_fils_aad *fils_aad),
        TP_ARGS(wiphy, netdev, fils_aad),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(u8, macaddr, ETH_ALEN)
                __field(u8, kek_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                FILS_AAD_ASSIGN(fils_aad);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " FILS_AAD_PR_FMT,
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->macaddr,
                  __entry->kek_len)
);

/*
 * rdev_update_owe_info - records peer address, status and the IE blob of
 * @owe_info.
 *
 * The IEs are copied into a dynamic array sized by owe_info->ie_len; they
 * are kept in the record but only peer and status are printed.
 */
TRACE_EVENT(rdev_update_owe_info,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct cfg80211_update_owe_info *owe_info),
        TP_ARGS(wiphy, netdev, owe_info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u16, status)
                __dynamic_array(u8, ie, owe_info->ie_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, owe_info->peer);
                __entry->status = owe_info->status;
                memcpy(__get_dynamic_array(ie), owe_info->ie,
                       owe_info->ie_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM"
                  " status %d",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->peer,
                  __entry->status)
);

/*
 * rdev_probe_mesh_link - records the destination address @dest.
 *
 * @buf and @len are part of the prototype but are not stored in the record.
 */
TRACE_EVENT(rdev_probe_mesh_link,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 const u8 *dest,
                 const u8 *buf,
                 size_t len),
        TP_ARGS(wiphy, netdev, dest, buf, len),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dest)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dest, dest);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->dest)
);

/*
 * rdev_set_tid_config - records the peer address taken from @tid_conf.
 * No other member of @tid_conf is captured.
 */
TRACE_EVENT(rdev_set_tid_config,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct cfg80211_tid_config *tid_conf),
        TP_ARGS(wiphy, netdev, tid_conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, tid_conf->peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->peer)
);

/*
 * rdev_reset_tid_config - records the @peer address and the @tids value,
 * the latter printed in hexadecimal.
 */
TRACE_EVENT(rdev_reset_tid_config,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 const u8 *peer,
                 u8 tids),
        TP_ARGS(wiphy, netdev, peer, tids),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tids)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->tids = tids;
                MAC_ASSIGN(peer, peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM, tids: 0x%x",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->peer,
                  __entry->tids)
);

/*
 * rdev_set_sar_specs - records the type and the number of sub-specs
 * (num_sub_specs) of @sar, both stored as u16.
 */
TRACE_EVENT(rdev_set_sar_specs,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_sar_specs *sar),
        TP_ARGS(wiphy, sar),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u16, type)
                __field(u16, num)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->num = sar->num_sub_specs;
                __entry->type = sar->type;
        ),
        TP_printk(WIPHY_PR_FMT ", Set type:%d, num_specs:%d",
                  WIPHY_PR_ARG,
                  __entry->type,
                  __entry->num)
);

/*
 * rdev_color_change - records count, the beacon and probe-response counter
 * offsets, and link_id from @params.
 *
 * The two offsets are stored in the record only; TP_printk shows count and
 * link_id.
 */
TRACE_EVENT(rdev_color_change,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct cfg80211_color_change_settings *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u8, count)
                __field(u16, bcn_ofs)
                __field(u16, pres_ofs)
                __field(u8, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = params->link_id;
                __entry->count = params->count;
                __entry->bcn_ofs = params->counter_offset_beacon;
                __entry->pres_ofs = params->counter_offset_presp;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                  ", count: %u, link_id: %d",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->count,
                  __entry->link_id)
);

/*
 * rdev_set_radar_background - records the channel definition @chandef for a
 * wiphy.
 *
 * CHAN_DEF_ASSIGN() is terminated with an explicit semicolon like every
 * other use in this file, rather than depending on how TP_fast_assign()
 * happens to close its argument.
 */
TRACE_EVENT(rdev_set_radar_background,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef),

        TP_ARGS(wiphy, chandef),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),

        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
);

/* rdev_add_intf_link: an instance of the wiphy_wdev_link_evt event class. */
DEFINE_EVENT(wiphy_wdev_link_evt, rdev_add_intf_link,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, wdev, link_id)
);

/* rdev_del_intf_link: an instance of the wiphy_wdev_link_evt event class. */
DEFINE_EVENT(wiphy_wdev_link_evt, rdev_del_intf_link,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, wdev, link_id)
);

/*
 * rdev_del_link_station - records the MLD MAC address and link id of
 * @params.
 *
 * params->mld_mac may be NULL; the record field is zeroed first so it then
 * prints as an all-zero address.
 */
TRACE_EVENT(rdev_del_link_station,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct link_station_del_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(u8, mld_mac, 6)
                __field(u32, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = params->link_id;
                memset(__entry->mld_mac, 0, sizeof(__entry->mld_mac));
                if (params->mld_mac)
                        memcpy(__entry->mld_mac, params->mld_mac,
                               sizeof(__entry->mld_mac));
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
                  ", link id: %u",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->mld_mac,
                  __entry->link_id)
);

/*
 * rdev_set_hw_timestamp - records the macaddr and enable members of @hwts.
 * The boolean is printed numerically (%u).
 */
TRACE_EVENT(rdev_set_hw_timestamp,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct cfg80211_set_hw_timestamp *hwts),
        TP_ARGS(wiphy, netdev, hwts),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(macaddr)
                __field(bool, enable)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->enable = hwts->enable;
                MAC_ASSIGN(macaddr, hwts->macaddr);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac %pM, enable: %u",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->macaddr,
                  __entry->enable)
);

/*
 * rdev_set_ttlm - snapshots the dlink and ulink arrays of @params.
 *
 * Both arrays are copied byte-wise into the record (sized as eight u16
 * values each); they are stored only, TP_printk shows just wiphy and netdev.
 */
TRACE_EVENT(rdev_set_ttlm,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 struct cfg80211_ttlm_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(u8, dlink, sizeof(u16) * 8)
                __array(u8, ulink, sizeof(u16) * 8)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                memcpy(__entry->ulink, params->ulink,
                       sizeof(params->ulink));
                memcpy(__entry->dlink, params->dlink,
                       sizeof(params->dlink));
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG)
);

/*
 * rdev_set_epcs - records the boolean @val, printed numerically under the
 * "config" label.
 */
TRACE_EVENT(rdev_set_epcs,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev,
                 bool val),
        TP_ARGS(wiphy, netdev, val),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(bool, val)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->val = val;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", config=%u",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->val)
);

/*************************************************************
 *             cfg80211 exported functions traces                     *
 *************************************************************/

/*
 * cfg80211_return_bool - records a single boolean @ret and prints it via
 * BOOL_TO_STR() as "returned <value>".
 */
TRACE_EVENT(cfg80211_return_bool,
        TP_PROTO(bool ret),
        TP_ARGS(ret),
        TP_STRUCT__entry(__field(bool, ret)),
        TP_fast_assign(__entry->ret = ret;),
        TP_printk("returned %s",
                  BOOL_TO_STR(__entry->ret))
);

/*
 * cfg80211_netdev_mac_evt - event class for tracepoints that carry a
 * net_device plus one MAC address (@macaddr).
 */
DECLARE_EVENT_CLASS(cfg80211_netdev_mac_evt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *macaddr),
        TP_ARGS(netdev, macaddr),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(macaddr)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(macaddr, macaddr);
        ),
        TP_printk(NETDEV_PR_FMT ", mac: %pM",
                  NETDEV_PR_ARG,
                  __entry->macaddr)
);

/*
 * cfg80211_notify_new_peer_candidate: an instance of the
 * cfg80211_netdev_mac_evt event class.
 */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate,
        TP_PROTO(struct net_device *netdev,
                 const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/*
 * cfg80211_send_rx_assoc - records the AP address for an association
 * response.
 *
 * data->ap_mld_addr is used when it is non-NULL; otherwise the BSSID of
 * the first link's bss (data->links[0].bss) is recorded.
 */
TRACE_EVENT(cfg80211_send_rx_assoc,
        TP_PROTO(struct net_device *netdev,
                 const struct cfg80211_rx_assoc_resp_data *data),
        TP_ARGS(netdev, data),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(ap_addr)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(ap_addr,
                           data->ap_mld_addr ?: data->links[0].bss->bssid);
        ),
        TP_printk(NETDEV_PR_FMT ", %pM",
                  NETDEV_PR_ARG,
                  __entry->ap_addr)
);

/*
 * netdev_frame_event - event class that copies a whole frame into the
 * record.
 *
 * @len bytes of @buf are stored in a dynamic array.  When printing, the
 * first two bytes of that copy are read as a little-endian 16-bit value
 * and shown as "ftype".
 * NOTE(review): the print path reads two bytes regardless of @len;
 * presumably callers always pass at least that much - confirm.
 */
DECLARE_EVENT_CLASS(netdev_frame_event,
        TP_PROTO(struct net_device *netdev,
                 const u8 *buf,
                 int len),
        TP_ARGS(netdev, buf, len),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __dynamic_array(u8, frame, len)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                memcpy(__get_dynamic_array(frame), buf, len);
        ),
        TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x",
                  NETDEV_PR_ARG,
                  le16_to_cpup((__le16 *)__get_dynamic_array(frame)))
);

/* cfg80211_rx_unprot_mlme_mgmt: an instance of the netdev_frame_event class. */
DEFINE_EVENT(netdev_frame_event, cfg80211_rx_unprot_mlme_mgmt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *buf,
                 int len),
        TP_ARGS(netdev, buf, len)
);

/* cfg80211_rx_mlme_mgmt: an instance of the netdev_frame_event class. */
DEFINE_EVENT(netdev_frame_event, cfg80211_rx_mlme_mgmt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *buf,
                 int len),
        TP_ARGS(netdev, buf, len)
);

/*
 * cfg80211_tx_mlme_mgmt - copies a transmitted frame into the record along
 * with the @reconnect flag (stored as int).
 *
 * As with netdev_frame_event, the first two bytes of the copied frame are
 * read as a little-endian 16-bit value and printed as "ftype".
 * NOTE(review): that read does not depend on @len; presumably @len is
 * always at least two - confirm.
 */
TRACE_EVENT(cfg80211_tx_mlme_mgmt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *buf,
                 int len,
                 bool reconnect),
        TP_ARGS(netdev, buf, len, reconnect),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __dynamic_array(u8, frame, len)
                __field(int, reconnect)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->reconnect = reconnect;
                memcpy(__get_dynamic_array(frame), buf, len);
        ),
        TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x reconnect:%d",
                  NETDEV_PR_ARG,
                  le16_to_cpup((__le16 *)__get_dynamic_array(frame)),
                  __entry->reconnect)
);

/*
 * netdev_mac_evt - event class for tracepoints that carry a net_device plus
 * one MAC address (@mac).
 *
 * MAC_ASSIGN() is terminated with an explicit semicolon like every other
 * use in this file, rather than depending on how TP_fast_assign() happens
 * to close its argument.
 */
DECLARE_EVENT_CLASS(netdev_mac_evt,
        TP_PROTO(struct net_device *netdev, const u8 *mac),
        TP_ARGS(netdev, mac),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(mac)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac, mac);
        ),
        TP_printk(NETDEV_PR_FMT ", mac: %pM",
                  NETDEV_PR_ARG, __entry->mac)
);

/* cfg80211_send_auth_timeout: an instance of the netdev_mac_evt class. */
DEFINE_EVENT(netdev_mac_evt, cfg80211_send_auth_timeout,
        TP_PROTO(struct net_device *netdev,
                 const u8 *mac),
        TP_ARGS(netdev, mac)
);

/*
 * cfg80211_send_assoc_failure - records the AP address and the timeout flag
 * of @data.
 *
 * data->ap_mld_addr is used when it is non-NULL; otherwise the BSSID of
 * data->bss[0] is recorded.
 */
TRACE_EVENT(cfg80211_send_assoc_failure,
        TP_PROTO(struct net_device *netdev,
                 struct cfg80211_assoc_failure *data),
        TP_ARGS(netdev, data),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(ap_addr)
                __field(bool, timeout)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->timeout = data->timeout;
                MAC_ASSIGN(ap_addr,
                           data->ap_mld_addr ?: data->bss[0]->bssid);
        ),
        TP_printk(NETDEV_PR_FMT ", mac: %pM, timeout: %d",
                  NETDEV_PR_ARG,
                  __entry->ap_addr,
                  __entry->timeout)
);

/*
 * cfg80211_michael_mic_failure - records address, key type, key id and the
 * 6-byte @tsc value.
 *
 * @tsc may be NULL.  The record's tsc array is printed unconditionally with
 * %pm, so it is zeroed before the optional copy; a NULL @tsc therefore
 * prints as all zeroes instead of showing bytes that were never written.
 */
TRACE_EVENT(cfg80211_michael_mic_failure,
        TP_PROTO(struct net_device *netdev, const u8 *addr,
                 enum nl80211_key_type key_type, int key_id, const u8 *tsc),
        TP_ARGS(netdev, addr, key_type, key_id, tsc),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(enum nl80211_key_type, key_type)
                __field(int, key_id)
                __array(u8, tsc, 6)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                __entry->key_type = key_type;
                __entry->key_id = key_id;
                memset(__entry->tsc, 0, sizeof(__entry->tsc));
                if (tsc)
                        memcpy(__entry->tsc, tsc, sizeof(__entry->tsc));
        ),
        TP_printk(NETDEV_PR_FMT ", %pM, key type: %d, key id: %d, tsc: %pm",
                  NETDEV_PR_ARG, __entry->addr, __entry->key_type,
                  __entry->key_id, __entry->tsc)
);

/*
 * cfg80211_ready_on_channel - records the cookie, the channel @chan and the
 * duration for a wireless_dev.
 */
TRACE_EVENT(cfg80211_ready_on_channel,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 struct ieee80211_channel *chan,
                 unsigned int duration),
        TP_ARGS(wdev, cookie, chan, duration),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
                __field(unsigned int, duration)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                CHAN_ASSIGN(chan);
                __entry->cookie = cookie;
                __entry->duration = duration;
        ),
        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT ", duration: %u",
                  WDEV_PR_ARG,
                  __entry->cookie,
                  CHAN_PR_ARG,
                  __entry->duration)
);

/*
 * cfg80211_ready_on_channel_expired - records the cookie and the channel
 * @chan for a wireless_dev.
 */
TRACE_EVENT(cfg80211_ready_on_channel_expired,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 struct ieee80211_channel *chan),
        TP_ARGS(wdev, cookie, chan),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                CHAN_ASSIGN(chan);
                __entry->cookie = cookie;
        ),
        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
                  WDEV_PR_ARG,
                  __entry->cookie,
                  CHAN_PR_ARG)
);

/*
 * cfg80211_tx_mgmt_expired - records the cookie and the channel @chan for a
 * wireless_dev; same record layout as cfg80211_ready_on_channel_expired.
 */
TRACE_EVENT(cfg80211_tx_mgmt_expired,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 struct ieee80211_channel *chan),
        TP_ARGS(wdev, cookie, chan),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                CHAN_ASSIGN(chan);
                __entry->cookie = cookie;
        ),
        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
                  WDEV_PR_ARG,
                  __entry->cookie,
                  CHAN_PR_ARG)
);

/*
 * cfg80211_new_sta - records the station address @mac_addr and station
 * information.
 *
 * The station-info fields come from SINFO_ENTRY/SINFO_ASSIGN (defined
 * outside this excerpt; presumably they read @sinfo - confirm).  Only the
 * netdev and the MAC address are printed.
 */
TRACE_EVENT(cfg80211_new_sta,
        TP_PROTO(struct net_device *netdev,
                 const u8 *mac_addr,
                 struct station_info *sinfo),
        TP_ARGS(netdev, mac_addr, sinfo),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                SINFO_ENTRY
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                SINFO_ASSIGN;
        ),
        TP_printk(NETDEV_PR_FMT ", %pM",
                  NETDEV_PR_ARG,
                  __entry->mac_addr)
);

/* cfg80211_del_sta: an instance of the cfg80211_netdev_mac_evt class. */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta,
        TP_PROTO(struct net_device *netdev,
                 const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/*
 * cfg80211_rx_mgmt - records the freq and sig_dbm members of @info for a
 * wireless_dev.  The frequency is formatted through KHZ_F/PR_KHZ().
 */
TRACE_EVENT(cfg80211_rx_mgmt,
        TP_PROTO(struct wireless_dev *wdev,
                 struct cfg80211_rx_info *info),
        TP_ARGS(wdev, info),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(int, freq)
                __field(int, sig_dbm)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->sig_dbm = info->sig_dbm;
                __entry->freq = info->freq;
        ),
        TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d",
                  WDEV_PR_ARG,
                  PR_KHZ(__entry->freq),
                  __entry->sig_dbm)
);

/*
 * cfg80211_mgmt_tx_status - records the cookie and the @ack flag for a
 * wireless_dev; the flag is printed through BOOL_TO_STR().
 */
TRACE_EVENT(cfg80211_mgmt_tx_status,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 bool ack),
        TP_ARGS(wdev, cookie, ack),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                __field(bool, ack)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->ack = ack;
                __entry->cookie = cookie;
        ),
        TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s",
                  WDEV_PR_ARG,
                  __entry->cookie,
                  BOOL_TO_STR(__entry->ack))
);

/*
 * cfg80211_control_port_tx_status - records the cookie and the @ack flag
 * for a wireless_dev; same record layout as cfg80211_mgmt_tx_status.
 */
TRACE_EVENT(cfg80211_control_port_tx_status,
        TP_PROTO(struct wireless_dev *wdev,
                 u64 cookie,
                 bool ack),
        TP_ARGS(wdev, cookie, ack),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                __field(bool, ack)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->ack = ack;
                __entry->cookie = cookie;
        ),
        TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s",
                  WDEV_PR_ARG,
                  __entry->cookie,
                  BOOL_TO_STR(__entry->ack))
);

/*
 * cfg80211_rx_control_port - records metadata of a received control-port
 * skb.
 *
 * Stored: skb->len, the Ethernet source address taken from eth_hdr(skb),
 * skb->protocol converted from big-endian to CPU order, the @unencrypted
 * flag and @link_id.
 */
TRACE_EVENT(cfg80211_rx_control_port,
        TP_PROTO(struct net_device *netdev,
                 struct sk_buff *skb,
                 bool unencrypted,
                 int link_id),
        TP_ARGS(netdev, skb, unencrypted, link_id),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(int, len)
                MAC_ENTRY(from)
                __field(u16, proto)
                __field(bool, unencrypted)
                __field(int, link_id)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(from, eth_hdr(skb)->h_source);
                __entry->len = skb->len;
                __entry->proto = be16_to_cpu(skb->protocol);
                __entry->link_id = link_id;
                __entry->unencrypted = unencrypted;
        ),
        TP_printk(NETDEV_PR_FMT ", len=%d, %pM, proto: 0x%x, unencrypted: %s, link: %d",
                  NETDEV_PR_ARG,
                  __entry->len,
                  __entry->from,
                  __entry->proto,
                  BOOL_TO_STR(__entry->unencrypted),
                  __entry->link_id)
);

/*
 * cfg80211_cqm_rssi_notify - records the RSSI threshold event and the RSSI
 * level for a net_device; both are printed as signed integers.
 */
TRACE_EVENT(cfg80211_cqm_rssi_notify,
        TP_PROTO(struct net_device *netdev,
                 enum nl80211_cqm_rssi_threshold_event rssi_event,
                 s32 rssi_level),
        TP_ARGS(netdev, rssi_event, rssi_level),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
                __field(s32, rssi_level)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->rssi_level = rssi_level;
                __entry->rssi_event = rssi_event;
        ),
        TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
                  NETDEV_PR_ARG,
                  __entry->rssi_event,
                  __entry->rssi_level)
);

/*
 * cfg80211_reg_can_beacon - records the channel definition, interface type
 * and the prohibited/permitting flag words passed in.
 */
TRACE_EVENT(cfg80211_reg_can_beacon,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_chan_def *chandef,
                 enum nl80211_iftype iftype,
                 u32 prohibited_flags,
                 u32 permitting_flags),
        TP_ARGS(wiphy, chandef, iftype, prohibited_flags, permitting_flags),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
                __field(enum nl80211_iftype, iftype)
                __field(u32, prohibited_flags)
                __field(u32, permitting_flags)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->permitting_flags = permitting_flags;
                __entry->prohibited_flags = prohibited_flags;
                __entry->iftype = iftype;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT
                  ", iftype=%d prohibited_flags=0x%x "
                  "permitting_flags=0x%x",
                  WIPHY_PR_ARG,
                  CHAN_DEF_PR_ARG,
                  __entry->iftype,
                  __entry->prohibited_flags,
                  __entry->permitting_flags)
);

/*
 * cfg80211_ch_switch_notify - records the channel definition and link id
 * for a net_device.
 *
 * link_id is an unsigned int and is therefore printed with %u (matching
 * the conversion used by cfg80211_cac_event), not %d.
 */
TRACE_EVENT(cfg80211_ch_switch_notify,
        TP_PROTO(struct net_device *netdev,
                 struct cfg80211_chan_def *chandef,
                 unsigned int link_id),
        TP_ARGS(netdev, chandef, link_id),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->link_id = link_id;
        ),
        TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%u",
                  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id)
);

/*
 * cfg80211_ch_switch_started_notify - records the channel definition and
 * link id for a net_device.
 *
 * link_id is an unsigned int and is therefore printed with %u (matching
 * the conversion used by cfg80211_cac_event), not %d.
 */
TRACE_EVENT(cfg80211_ch_switch_started_notify,
        TP_PROTO(struct net_device *netdev,
                 struct cfg80211_chan_def *chandef,
                 unsigned int link_id),
        TP_ARGS(netdev, chandef, link_id),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->link_id = link_id;
        ),
        TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%u",
                  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id)
);

/*
 * cfg80211_radar_event - records the channel definition and the @offchan
 * flag for a wiphy; the flag is printed numerically.
 */
TRACE_EVENT(cfg80211_radar_event,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_chan_def *chandef,
                 bool offchan),
        TP_ARGS(wiphy, chandef, offchan),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
                __field(bool, offchan)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->offchan = offchan;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", offchan %d",
                  WIPHY_PR_ARG,
                  CHAN_DEF_PR_ARG,
                  __entry->offchan)
);

/*
 * cfg80211_cac_event - records the radar event code @evt and the link id
 * for a net_device.
 */
TRACE_EVENT(cfg80211_cac_event,
        TP_PROTO(struct net_device *netdev,
                 enum nl80211_radar_event evt,
                 unsigned int link_id),
        TP_ARGS(netdev, evt, link_id),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(enum nl80211_radar_event, evt)
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                __entry->evt = evt;
        ),
        TP_printk(NETDEV_PR_FMT ",  event: %d, link_id=%u",
                  NETDEV_PR_ARG,
                  __entry->evt,
                  __entry->link_id)
);

/*
 * cfg80211_rx_evt - event class for tracepoints that carry a net_device, an
 * address @addr and a link id.
 */
DECLARE_EVENT_CLASS(cfg80211_rx_evt,
        TP_PROTO(struct net_device *netdev,
                 const u8 *addr,
                 int link_id),
        TP_ARGS(netdev, addr, link_id),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(int, link_id)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                MAC_ASSIGN(addr, addr);
        ),
        TP_printk(NETDEV_PR_FMT ", %pM, link_id:%d",
                  NETDEV_PR_ARG,
                  __entry->addr,
                  __entry->link_id)
);

/* cfg80211_rx_spurious_frame: an instance of the cfg80211_rx_evt class. */
DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame,
        TP_PROTO(struct net_device *netdev,
                 const u8 *addr,
                 int link_id),
        TP_ARGS(netdev, addr, link_id)
);

/*
 * cfg80211_rx_unexpected_4addr_frame: an instance of the cfg80211_rx_evt
 * class.
 */
DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame,
        TP_PROTO(struct net_device *netdev,
                 const u8 *addr,
                 int link_id),
        TP_ARGS(netdev, addr, link_id)
);

/*
 * cfg80211_ibss_joined - records the @bssid and the channel @channel for a
 * net_device.
 */
TRACE_EVENT(cfg80211_ibss_joined,
        TP_PROTO(struct net_device *netdev,
                 const u8 *bssid,
                 struct ieee80211_channel *channel),
        TP_ARGS(netdev, bssid, channel),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, bssid);
                CHAN_ASSIGN(channel);
        ),
        TP_printk(NETDEV_PR_FMT ", bssid: %pM, " CHAN_PR_FMT,
                  NETDEV_PR_ARG,
                  __entry->bssid,
                  CHAN_PR_ARG)
);

/*
 * cfg80211_probe_status - records the address, cookie and @acked flag for a
 * net_device; the flag is printed through BOOL_TO_STR().
 */
TRACE_EVENT(cfg80211_probe_status,
        TP_PROTO(struct net_device *netdev,
                 const u8 *addr,
                 u64 cookie,
                 bool acked),
        TP_ARGS(netdev, addr, cookie, acked),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(u64, cookie)
                __field(bool, acked)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                __entry->acked = acked;
                __entry->cookie = cookie;
        ),
        TP_printk(NETDEV_PR_FMT " addr:%pM, cookie: %llu, acked: %s",
                  NETDEV_PR_ARG,
                  __entry->addr,
                  __entry->cookie,
                  BOOL_TO_STR(__entry->acked))
);

/*
 * cfg80211_cqm_pktloss_notify - records the @peer address and @num_packets,
 * the latter printed as the number of lost packets.
 */
TRACE_EVENT(cfg80211_cqm_pktloss_notify,
        TP_PROTO(struct net_device *netdev,
                 const u8 *peer,
                 u32 num_packets),
        TP_ARGS(netdev, peer, num_packets),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u32, num_packets)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->num_packets = num_packets;
                MAC_ASSIGN(peer, peer);
        ),
        TP_printk(NETDEV_PR_FMT ", peer: %pM, num of lost packets: %u",
                  NETDEV_PR_ARG,
                  __entry->peer,
                  __entry->num_packets)
);

/*
 * cfg80211_gtk_rekey_notify: instance of the cfg80211_netdev_mac_evt event
 * class. Takes the netdev and a MAC address.
 */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_gtk_rekey_notify,
        TP_PROTO(struct net_device *netdev, const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/*
 * cfg80211_pmksa_candidate_notify: records and prints the netdev, the
 * candidate index, the BSSID and the preauth flag (printed via BOOL_TO_STR).
 */
TRACE_EVENT(cfg80211_pmksa_candidate_notify,
        TP_PROTO(struct net_device *netdev, int index, const u8 *bssid,
                 bool preauth),
        TP_ARGS(netdev, index, bssid, preauth),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(int, index)
                MAC_ENTRY(bssid)
                __field(bool, preauth)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->index = index;
                MAC_ASSIGN(bssid, bssid);
                __entry->preauth = preauth;
        ),
        TP_printk(NETDEV_PR_FMT ", index:%d, bssid: %pM, pre auth: %s",
                  NETDEV_PR_ARG, __entry->index, __entry->bssid,
                  BOOL_TO_STR(__entry->preauth))
);

/*
 * cfg80211_report_obss_beacon: records and prints the wiphy, the frequency
 * (formatted with KHZ_F / PR_KHZ) and the signal strength in dBm.
 *
 * The @frame and @len arguments are accepted but are not stored in the
 * record.
 */
TRACE_EVENT(cfg80211_report_obss_beacon,
        TP_PROTO(struct wiphy *wiphy, const u8 *frame, size_t len,
                 int freq, int sig_dbm),
        TP_ARGS(wiphy, frame, len, freq, sig_dbm),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, freq)
                __field(int, sig_dbm)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->freq = freq;
                __entry->sig_dbm = sig_dbm;
        ),
        TP_printk(WIPHY_PR_FMT ", freq: "KHZ_F", sig_dbm: %d",
                  WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm)
);

/*
 * cfg80211_tdls_oper_request: records and prints the wiphy, the netdev, the
 * peer MAC address, the requested TDLS operation (printed as an integer)
 * and the reason code.
 */
TRACE_EVENT(cfg80211_tdls_oper_request,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *peer,
                 enum nl80211_tdls_operation oper, u16 reason_code),
        TP_ARGS(wiphy, netdev, peer, oper, reason_code),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(enum nl80211_tdls_operation, oper)
                __field(u16, reason_code)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->oper = oper;
                __entry->reason_code = reason_code;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM, oper: %d, reason_code %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->oper,
                  __entry->reason_code)
        );

/*
 * cfg80211_scan_done: records parameters of a scan request together with
 * its completion info.
 *
 * @request: scan request; may be NULL, in which case nothing is copied
 *           from it and the ie dynamic array has length 0
 * @info:    completion info; may be NULL, in which case nothing is copied
 *           from it
 *
 * Only aborted, scan_start_tsf and tsf_bssid are printed; the other fields
 * are stored in the record only.
 *
 * NOTE(review): fields whose source pointer is NULL, and n_channels in all
 * cases, are left unassigned by this event - confirm whether they should
 * be zeroed or filled in.
 */
TRACE_EVENT(cfg80211_scan_done,
        TP_PROTO(struct cfg80211_scan_request_int *request,
                 struct cfg80211_scan_info *info),
        TP_ARGS(request, info),
        TP_STRUCT__entry(
                __field(u32, n_channels)
                __dynamic_array(u8, ie, request ? request->req.ie_len : 0)
                __array(u32, rates, NUM_NL80211_BANDS)
                __field(u32, wdev_id)
                MAC_ENTRY(wiphy_mac)
                __field(bool, no_cck)
                __field(bool, aborted)
                __field(u64, scan_start_tsf)
                MAC_ENTRY(tsf_bssid)
        ),
        TP_fast_assign(
                if (request) {
                        memcpy(__get_dynamic_array(ie), request->req.ie,
                               request->req.ie_len);
                        /*
                         * rates is an array of u32: copy the whole array,
                         * not just NUM_NL80211_BANDS bytes of it.
                         */
                        memcpy(__entry->rates, request->req.rates,
                               sizeof(__entry->rates));
                        __entry->wdev_id = request->req.wdev ?
                                        request->req.wdev->identifier : 0;
                        if (request->req.wiphy)
                                MAC_ASSIGN(wiphy_mac,
                                           request->req.wiphy->perm_addr);
                        __entry->no_cck = request->req.no_cck;
                }
                if (info) {
                        __entry->aborted = info->aborted;
                        __entry->scan_start_tsf = info->scan_start_tsf;
                        MAC_ASSIGN(tsf_bssid, info->tsf_bssid);
                }
        ),
        TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: %pM",
                  BOOL_TO_STR(__entry->aborted),
                  (unsigned long long)__entry->scan_start_tsf,
                  __entry->tsf_bssid)
);

/*
 * wiphy_id_evt: event class for events that carry a wiphy and a 64-bit id;
 * both are recorded and printed.
 */
DECLARE_EVENT_CLASS(wiphy_id_evt,
        TP_PROTO(struct wiphy *wiphy, u64 id),
        TP_ARGS(wiphy, id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u64, id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->id = id;
        ),
        TP_printk(WIPHY_PR_FMT ", id: %llu", WIPHY_PR_ARG, __entry->id)
);

/* cfg80211_sched_scan_stopped: instance of the wiphy_id_evt class. */
DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_stopped,
        TP_PROTO(struct wiphy *wiphy, u64 id),
        TP_ARGS(wiphy, id)
);

/* cfg80211_sched_scan_results: instance of the wiphy_id_evt class. */
DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_results,
        TP_PROTO(struct wiphy *wiphy, u64 id),
        TP_ARGS(wiphy, id)
);

/*
 * cfg80211_get_bss: records the wiphy, channel, BSSID, a copy of the SSID
 * (ssid_len bytes), the BSS type and the privacy setting.
 *
 * The printed "buf" value is only the first byte of the stored SSID copy.
 *
 * NOTE(review): the first SSID byte is read unconditionally when printing,
 * and the SSID is copied unconditionally when recording; this looks unsafe
 * for ssid_len == 0 or ssid == NULL - confirm that callers never pass that.
 */
TRACE_EVENT(cfg80211_get_bss,
        TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel,
                 const u8 *bssid, const u8 *ssid, size_t ssid_len,
                 enum ieee80211_bss_type bss_type,
                 enum ieee80211_privacy privacy),
        TP_ARGS(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                MAC_ENTRY(bssid)
                __dynamic_array(u8, ssid, ssid_len)
                __field(enum ieee80211_bss_type, bss_type)
                __field(enum ieee80211_privacy, privacy)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(channel);
                MAC_ASSIGN(bssid, bssid);
                memcpy(__get_dynamic_array(ssid), ssid, ssid_len);
                __entry->bss_type = bss_type;
                __entry->privacy = privacy;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", %pM"
                  ", buf: %#.2x, bss_type: %d, privacy: %d",
                  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->bssid,
                  ((u8 *)__get_dynamic_array(ssid))[0], __entry->bss_type,
                  __entry->privacy)
);

/*
 * cfg80211_inform_bss_frame: records the wiphy, the channel from @data, a
 * copy of the management frame (@len bytes, copied only when @mgmt is
 * non-NULL), the signal value, the boottime timestamp, the parent TSF and
 * the parent BSSID.
 *
 * The frame copy is stored in the record but is not printed.
 */
TRACE_EVENT(cfg80211_inform_bss_frame,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_inform_bss *data,
                 struct ieee80211_mgmt *mgmt, size_t len),
        TP_ARGS(wiphy, data, mgmt, len),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                __dynamic_array(u8, mgmt, len)
                __field(s32, signal)
                __field(u64, ts_boottime)
                __field(u64, parent_tsf)
                MAC_ENTRY(parent_bssid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(data->chan);
                if (mgmt)
                        memcpy(__get_dynamic_array(mgmt), mgmt, len);
                __entry->signal = data->signal;
                __entry->ts_boottime = data->boottime_ns;
                __entry->parent_tsf = data->parent_tsf;
                MAC_ASSIGN(parent_bssid, data->parent_bssid);
        ),
        /* Separate the channel from "signal:" like the other events do. */
        TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT
                  ", signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: %pM",
                  WIPHY_PR_ARG, CHAN_PR_ARG,
                  __entry->signal, (unsigned long long)__entry->ts_boottime,
                  (unsigned long long)__entry->parent_tsf,
                  __entry->parent_bssid)
);

/*
 * cfg80211_bss_evt: event class for events that carry a struct cfg80211_bss;
 * records and prints its BSSID and channel. @pub is dereferenced
 * unconditionally.
 */
DECLARE_EVENT_CLASS(cfg80211_bss_evt,
        TP_PROTO(struct cfg80211_bss *pub),
        TP_ARGS(pub),
        TP_STRUCT__entry(
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                MAC_ASSIGN(bssid, pub->bssid);
                CHAN_ASSIGN(pub->channel);
        ),
        TP_printk("%pM, " CHAN_PR_FMT, __entry->bssid, CHAN_PR_ARG)
);

/* cfg80211_return_bss: instance of the cfg80211_bss_evt class. */
DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss,
        TP_PROTO(struct cfg80211_bss *pub),
        TP_ARGS(pub)
);

/*
 * cfg80211_report_wowlan_wakeup: records the wiphy, the wdev and the
 * contents of the wakeup report.
 *
 * @wakeup may be NULL: non_wireless is then set, every other flag and
 * number is stored as false/0 and the packet array has length 0. Otherwise
 * the flags, pattern index and packet length are copied from @wakeup, and
 * packet_present_len bytes of the packet are copied when a packet is
 * attached.
 *
 * Only the wiphy and wdev are printed.
 */
TRACE_EVENT(cfg80211_report_wowlan_wakeup,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_wowlan_wakeup *wakeup),
        TP_ARGS(wiphy, wdev, wakeup),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(bool, non_wireless)
                __field(bool, disconnect)
                __field(bool, magic_pkt)
                __field(bool, gtk_rekey_failure)
                __field(bool, eap_identity_req)
                __field(bool, four_way_handshake)
                __field(bool, rfkill_release)
                __field(s32, pattern_idx)
                __field(u32, packet_len)
                __dynamic_array(u8, packet,
                                wakeup ? wakeup->packet_present_len : 0)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                if (wakeup) {
                        __entry->non_wireless = false;
                        __entry->disconnect = wakeup->disconnect;
                        __entry->magic_pkt = wakeup->magic_pkt;
                        __entry->gtk_rekey_failure = wakeup->gtk_rekey_failure;
                        __entry->eap_identity_req = wakeup->eap_identity_req;
                        __entry->four_way_handshake = wakeup->four_way_handshake;
                        __entry->rfkill_release = wakeup->rfkill_release;
                        __entry->pattern_idx = wakeup->pattern_idx;
                        __entry->packet_len = wakeup->packet_len;
                        if (wakeup->packet && wakeup->packet_present_len)
                                memcpy(__get_dynamic_array(packet),
                                       wakeup->packet,
                                       wakeup->packet_present_len);
                } else {
                        __entry->non_wireless = true;
                        __entry->disconnect = false;
                        __entry->magic_pkt = false;
                        __entry->gtk_rekey_failure = false;
                        __entry->eap_identity_req = false;
                        __entry->four_way_handshake = false;
                        __entry->rfkill_release = false;
                        __entry->pattern_idx = 0;
                        __entry->packet_len = 0;
                }
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * cfg80211_ft_event: records the wiphy, the netdev, the target AP address
 * and copies of the ies and ric_ies buffers from @ft_event.
 *
 * Each buffer is copied only when its pointer is non-NULL; the dynamic
 * arrays are sized from ies_len / ric_ies_len regardless. Only the wiphy,
 * netdev and target AP are printed. @ft_event is dereferenced
 * unconditionally.
 */
TRACE_EVENT(cfg80211_ft_event,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ft_event_params *ft_event),
        TP_ARGS(wiphy, netdev, ft_event),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __dynamic_array(u8, ies, ft_event->ies_len)
                MAC_ENTRY(target_ap)
                __dynamic_array(u8, ric_ies, ft_event->ric_ies_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                if (ft_event->ies)
                        memcpy(__get_dynamic_array(ies), ft_event->ies,
                               ft_event->ies_len);
                MAC_ASSIGN(target_ap, ft_event->target_ap);
                if (ft_event->ric_ies)
                        memcpy(__get_dynamic_array(ric_ies), ft_event->ric_ies,
                               ft_event->ric_ies_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", target_ap: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->target_ap)
);

/* cfg80211_stop_iface: records and prints the wiphy and the wdev. */
TRACE_EVENT(cfg80211_stop_iface,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * cfg80211_pmsr_report: records and prints the wiphy, the wdev, the 64-bit
 * cookie and a MAC address.
 *
 * The cookie is a u64 and is printed as an unsigned value.
 */
TRACE_EVENT(cfg80211_pmsr_report,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 u64 cookie, const u8 *addr),
        TP_ARGS(wiphy, wdev, cookie, addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
                MAC_ENTRY(addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                MAC_ASSIGN(addr, addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%llu, %pM",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie,
                  __entry->addr)
);

/*
 * cfg80211_pmsr_complete: records and prints the wiphy, the wdev and the
 * 64-bit cookie.
 *
 * The cookie is a u64 and is printed as an unsigned value.
 */
TRACE_EVENT(cfg80211_pmsr_complete,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie)
);

/*
 * cfg80211_update_owe_info_event: records the wiphy, the netdev, the peer
 * address, a copy of the IE buffer (ie_len bytes), the association link id
 * and the peer MLD address, all taken from @owe_info.
 *
 * The IE copy is stored but not printed. @owe_info is dereferenced
 * unconditionally.
 */
TRACE_EVENT(cfg80211_update_owe_info_event,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_update_owe_info *owe_info),
        TP_ARGS(wiphy, netdev, owe_info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __dynamic_array(u8, ie, owe_info->ie_len)
                __field(int, assoc_link_id)
                MAC_ENTRY(peer_mld_addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, owe_info->peer);
                memcpy(__get_dynamic_array(ie), owe_info->ie,
                       owe_info->ie_len);
                __entry->assoc_link_id = owe_info->assoc_link_id;
                MAC_ASSIGN(peer_mld_addr, owe_info->peer_mld_addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM,"
                  " assoc_link_id: %d, peer_mld_addr: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer,
                  __entry->assoc_link_id, __entry->peer_mld_addr)
);

/*
 * cfg80211_bss_color_notify: records and prints the netdev, the nl80211
 * command (stored as u32, printed in hex), a count and a 64-bit color
 * bitmap (printed in hex).
 */
TRACE_EVENT(cfg80211_bss_color_notify,
        TP_PROTO(struct net_device *netdev,
                 enum nl80211_commands cmd,
                 u8 count, u64 color_bitmap),
        TP_ARGS(netdev, cmd, count, color_bitmap),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(u32, cmd)
                __field(u8, count)
                __field(u64, color_bitmap)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->cmd = cmd;
                __entry->count = count;
                __entry->color_bitmap = color_bitmap;
        ),
        TP_printk(NETDEV_PR_FMT ", cmd: %x, count: %u, bitmap: %llx",
                  NETDEV_PR_ARG, __entry->cmd, __entry->count,
                  __entry->color_bitmap)
);

/*
 * cfg80211_assoc_comeback: records and prints the wdev, the AP address and
 * a timeout (printed with a "TUs" suffix).
 */
TRACE_EVENT(cfg80211_assoc_comeback,
        TP_PROTO(struct wireless_dev *wdev, const u8 *ap_addr, u32 timeout),
        TP_ARGS(wdev, ap_addr, timeout),
        TP_STRUCT__entry(
                WDEV_ENTRY
                MAC_ENTRY(ap_addr)
                __field(u32, timeout)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                MAC_ASSIGN(ap_addr, ap_addr);
                __entry->timeout = timeout;
        ),
        TP_printk(WDEV_PR_FMT ", %pM, timeout: %u TUs",
                  WDEV_PR_ARG, __entry->ap_addr, __entry->timeout)
);

/*
 * link_station_add_mod: event class for the link station add/modify
 * events; records the wiphy, the netdev and the link station parameters.
 *
 * Fixed-size fields (mld_mac, link_mac, ht_capa, vht_capa, he_6ghz_capa)
 * are zeroed first and then overwritten only when the matching pointer in
 * @params is non-NULL. Variable-size fields (supported_rates, he_capa,
 * eht_capa) are sized from the matching *_len member and copied only when
 * the pointer is non-NULL and the length is non-zero.
 *
 * Only the wiphy, netdev, MLD MAC (printed as "station mac"), link MAC and
 * link id are printed. @params is dereferenced unconditionally.
 */
DECLARE_EVENT_CLASS(link_station_add_mod,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct link_station_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(u8, mld_mac, 6)
                __array(u8, link_mac, 6)
                __field(u32, link_id)
                __dynamic_array(u8, supported_rates,
                                params->supported_rates_len)
                __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
                __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
                __field(u8, opmode_notif)
                __field(bool, opmode_notif_used)
                __dynamic_array(u8, he_capa, params->he_capa_len)
                __array(u8, he_6ghz_capa, (int)sizeof(struct ieee80211_he_6ghz_capa))
                __dynamic_array(u8, eht_capa, params->eht_capa_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                memset(__entry->mld_mac, 0, 6);
                memset(__entry->link_mac, 0, 6);
                if (params->mld_mac)
                        memcpy(__entry->mld_mac, params->mld_mac, 6);
                if (params->link_mac)
                        memcpy(__entry->link_mac, params->link_mac, 6);
                __entry->link_id = params->link_id;
                if (params->supported_rates && params->supported_rates_len)
                        memcpy(__get_dynamic_array(supported_rates),
                               params->supported_rates,
                               params->supported_rates_len);
                memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap));
                if (params->ht_capa)
                        memcpy(__entry->ht_capa, params->ht_capa,
                               sizeof(struct ieee80211_ht_cap));
                memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap));
                if (params->vht_capa)
                        memcpy(__entry->vht_capa, params->vht_capa,
                               sizeof(struct ieee80211_vht_cap));
                __entry->opmode_notif = params->opmode_notif;
                __entry->opmode_notif_used = params->opmode_notif_used;
                if (params->he_capa && params->he_capa_len)
                        memcpy(__get_dynamic_array(he_capa), params->he_capa,
                               params->he_capa_len);
                memset(__entry->he_6ghz_capa, 0, sizeof(struct ieee80211_he_6ghz_capa));
                if (params->he_6ghz_capa)
                        memcpy(__entry->he_6ghz_capa, params->he_6ghz_capa,
                               sizeof(struct ieee80211_he_6ghz_capa));
                if (params->eht_capa && params->eht_capa_len)
                        memcpy(__get_dynamic_array(eht_capa), params->eht_capa,
                               params->eht_capa_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
                  ", link mac: %pM, link id: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mld_mac,
                  __entry->link_mac, __entry->link_id)
);

/* rdev_add_link_station: instance of the link_station_add_mod class. */
DEFINE_EVENT(link_station_add_mod, rdev_add_link_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct link_station_parameters *params),
        TP_ARGS(wiphy, netdev, params)
);

/* rdev_mod_link_station: instance of the link_station_add_mod class. */
DEFINE_EVENT(link_station_add_mod, rdev_mod_link_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct link_station_parameters *params),
        TP_ARGS(wiphy, netdev, params)
);

/*
 * cfg80211_links_removed: records and prints the netdev and a 16-bit link
 * mask (printed in hex).
 */
TRACE_EVENT(cfg80211_links_removed,
        TP_PROTO(struct net_device *netdev, u16 link_mask),
        TP_ARGS(netdev, link_mask),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(u16, link_mask)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->link_mask = link_mask;
        ),
        TP_printk(NETDEV_PR_FMT ", link_mask:0x%x", NETDEV_PR_ARG,
                  __entry->link_mask)
);

/*
 * cfg80211_mlo_reconf_add_done: records the netdev, a 16-bit link mask, a
 * copy of @buf (@len bytes, copied unconditionally) and the
 * driver_initiated flag.
 *
 * The buffer copy is stored but not printed.
 */
TRACE_EVENT(cfg80211_mlo_reconf_add_done,
        TP_PROTO(struct net_device *netdev, u16 link_mask,
                 const u8 *buf, size_t len, bool driver_initiated),
        TP_ARGS(netdev, link_mask, buf, len, driver_initiated),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(u16, link_mask)
                __dynamic_array(u8, buf, len)
                __field(bool, driver_initiated)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->link_mask = link_mask;
                memcpy(__get_dynamic_array(buf), buf, len);
                __entry->driver_initiated = driver_initiated;
        ),
        TP_printk(NETDEV_PR_FMT ", link_mask:0x%x, driver_initiated:%d",
                  NETDEV_PR_ARG, __entry->link_mask, __entry->driver_initiated)
);

/*
 * rdev_assoc_ml_reconf: records the wiphy, the netdev and a summary of the
 * ML reconfiguration request.
 *
 * add_links is built as a bitmask: bit i is set when req->add_links[i].bss
 * is non-NULL, for i below IEEE80211_MLD_MAX_NUM_LINKS. rem_links and
 * ext_mld_capa_ops are copied from @req; ext_mld_capa_ops is stored but
 * not printed.
 */
TRACE_EVENT(rdev_assoc_ml_reconf,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ml_reconf_req *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, add_links)
                __field(u16, rem_links)
                __field(u16, ext_mld_capa_ops)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                u32 i;

                __entry->add_links = 0;
                __entry->rem_links = req->rem_links;
                for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++)
                        if (req->add_links[i].bss)
                                __entry->add_links |= BIT(i);
                __entry->ext_mld_capa_ops = req->ext_mld_capa_ops;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", add_links=0x%x, rem_links=0x%x",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->add_links, __entry->rem_links)
);

/*
 * cfg80211_epcs_changed: records and prints the wdev and the enabled flag
 * (stored as u32, printed as an unsigned integer).
 */
TRACE_EVENT(cfg80211_epcs_changed,
        TP_PROTO(struct wireless_dev *wdev, bool enabled),
        TP_ARGS(wdev, enabled),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u32, enabled)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->enabled = enabled;
        ),
        TP_printk(WDEV_PR_FMT ", enabled=%u",
                  WDEV_PR_ARG, __entry->enabled)
);

/* cfg80211_next_nan_dw_notif: records and prints the wdev and a channel. */
TRACE_EVENT(cfg80211_next_nan_dw_notif,
        TP_PROTO(struct wireless_dev *wdev,
                 struct ieee80211_channel *chan),
        TP_ARGS(wdev, chan),
        TP_STRUCT__entry(
                WDEV_ENTRY
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                CHAN_ASSIGN(chan);
        ),
        TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT,
                  WDEV_PR_ARG, CHAN_PR_ARG)
);

/*
 * cfg80211_nan_cluster_joined: records and prints the wdev and the cluster
 * id (stored like a MAC address, printed with %pMF); " [new]" is appended
 * to the output when new_cluster is set.
 */
TRACE_EVENT(cfg80211_nan_cluster_joined,
        TP_PROTO(struct wireless_dev *wdev,
                 const u8 *cluster_id,
                 bool new_cluster),
        TP_ARGS(wdev, cluster_id, new_cluster),
        TP_STRUCT__entry(
                WDEV_ENTRY
                MAC_ENTRY(cluster_id)
                __field(bool, new_cluster)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                MAC_ASSIGN(cluster_id, cluster_id);
                __entry->new_cluster = new_cluster;
        ),
        TP_printk(WDEV_PR_FMT " cluster_id %pMF%s",
                  WDEV_PR_ARG, __entry->cluster_id,
                  __entry->new_cluster ? " [new]" : "")
);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */

/*
 * Redefine TRACE_INCLUDE_PATH to "." and TRACE_INCLUDE_FILE to "trace"
 * before pulling in <trace/define_trace.h>, which must come last.
 */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>




































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Descending-priority-sorted double-linked list
 *
 * (C) 2002-2003 Intel Corp
 * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
 *
 * 2001-2005 (c) MontaVista Software, Inc.
 * Daniel Walker <dwalker@mvista.com>
 *
 * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
 *
 * Simplifications of the original code by
 * Oleg Nesterov <oleg@tv-sign.ru>
 *
 * Based on simple lists (include/linux/list.h).
 *
 * This is a priority-sorted list of nodes; each node has a
 * priority from INT_MIN (highest) to INT_MAX (lowest).
 *
 * Addition is O(K), removal is O(1), change of priority of a node is
 * O(K) and K is the number of RT priority levels used in the system.
 * (1 <= K <= 99)
 *
 * This list is really a list of lists:
 *
 *  - The tier 1 list is the prio_list, different priority nodes.
 *
 *  - The tier 2 list is the node_list, serialized nodes.
 *
 * Simple ASCII art explanation:
 *
 * pl:prio_list (only for plist_node)
 * nl:node_list
 *   HEAD|             NODE(S)
 *       |
 *       ||------------------------------------|
 *       ||->|pl|<->|pl|<--------------->|pl|<-|
 *       |   |10|   |21|   |21|   |21|   |40|   (prio)
 *       |   |  |   |  |   |  |   |  |   |  |
 *       |   |  |   |  |   |  |   |  |   |  |
 * |->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
 * |-------------------------------------------|
 *
 * The nodes on the prio_list list are sorted by priority to simplify
 * the insertion of new nodes. There are no nodes with duplicate
 * priorities on the list.
 *
 * The nodes on the node_list are ordered by priority and can contain
 * entries which have the same priority. Those entries are ordered
 * FIFO.
 *
 * Addition means: look for the prio_list node in the prio_list
 * for the priority of the node and insert it before the node_list
 * entry of the next prio_list node. If it is the first node of
 * that priority, add it to the prio_list in the right position and
 * insert it into the serialized node_list list.
 *
 * Removal means remove it from the node_list and remove it from
 * the prio_list if the node_list list_head is non empty. In case
 * of removal from the prio_list it must be checked whether other
 * entries of the same priority are on the list or not. If there
 * is another entry of the same priority then this entry has to
 * replace the removed entry on the prio_list. If the entry which
 * is removed is the only entry of this priority then a simple
 * remove from both list is sufficient.
 *
 * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX
 * is lowest priority.
 *
 * No locking is done, up to the caller.
 */
#ifndef _LINUX_PLIST_H_
#define _LINUX_PLIST_H_

#include <linux/container_of.h>
#include <linux/list.h>
#include <linux/plist_types.h>

#include <asm/bug.h>

/**
 * PLIST_HEAD_INIT - static struct plist_head initializer
 * @head:        struct plist_head variable name
 *
 * Expands to a brace initializer that sets up @head's node_list with
 * LIST_HEAD_INIT(). @head must be the variable itself, not a pointer.
 */
#define PLIST_HEAD_INIT(head)                                \
{                                                        \
        .node_list = LIST_HEAD_INIT((head).node_list)        \
}

/**
 * PLIST_HEAD - declare and init plist_head
 * @head:        name for struct plist_head variable
 *
 * Defines a struct plist_head named @head, initialized with
 * PLIST_HEAD_INIT().
 */
#define PLIST_HEAD(head) \
        struct plist_head head = PLIST_HEAD_INIT(head)

/**
 * PLIST_NODE_INIT - static struct plist_node initializer
 * @node:        struct plist_node variable name
 * @__prio:        initial node priority
 *
 * Expands to a brace initializer that stores @__prio and sets up both the
 * prio_list and node_list heads of @node with LIST_HEAD_INIT(). @node must
 * be the variable itself, not a pointer.
 */
#define PLIST_NODE_INIT(node, __prio)                        \
{                                                        \
        .prio  = (__prio),                                \
        .prio_list = LIST_HEAD_INIT((node).prio_list),        \
        .node_list = LIST_HEAD_INIT((node).node_list),        \
}

/**
 * plist_head_init - initialize a struct plist_head at run time
 * @head:        &struct plist_head pointer
 *
 * Sets up the head's node_list with INIT_LIST_HEAD().
 */
static inline void plist_head_init(struct plist_head *head)
{
        struct list_head *nodes = &head->node_list;

        INIT_LIST_HEAD(nodes);
}

/**
 * plist_node_init - initialize a struct plist_node at run time
 * @node:        &struct plist_node pointer
 * @prio:        initial node priority
 *
 * Stores @prio and sets up both list heads of the node (prio_list and
 * node_list) with INIT_LIST_HEAD().
 */
static inline void plist_node_init(struct plist_node *node, int prio)
{
        struct list_head *by_prio = &node->prio_list;
        struct list_head *by_node = &node->node_list;

        node->prio = prio;
        INIT_LIST_HEAD(by_prio);
        INIT_LIST_HEAD(by_node);
}

/*
 * Out-of-line plist operations; their definitions are not part of this
 * header. Each takes the node to operate on and the head of its list.
 * The ordering rules for addition and removal are described in the
 * comment at the top of this file.
 */
extern void plist_add(struct plist_node *node, struct plist_head *head);
extern void plist_del(struct plist_node *node, struct plist_head *head);

extern void plist_requeue(struct plist_node *node, struct plist_head *head);

/**
 * plist_for_each - iterate over the plist
 * @pos:        the type * to use as a loop counter
 * @head:        the head for your list
 *
 * Walks @head's node_list with list_for_each_entry(), so @pos must point
 * to a type that has a node_list member (such as struct plist_node).
 */
#define plist_for_each(pos, head)        \
         list_for_each_entry(pos, &(head)->node_list, node_list)

/**
 * plist_for_each_continue - continue iteration over the plist
 * @pos:        the type * to use as a loop cursor
 * @head:        the head for your list
 *
 * Continue to iterate over plist, continuing after the current position.
 * Implemented with list_for_each_entry_continue() on @head's node_list.
 */
#define plist_for_each_continue(pos, head)        \
         list_for_each_entry_continue(pos, &(head)->node_list, node_list)

/**
 * plist_for_each_safe - iterate safely over a plist of given type
 * @pos:        the type * to use as a loop counter
 * @n:        another type * to use as temporary storage
 * @head:        the head for your list
 *
 * Iterate over a plist of given type, safe against removal of list entry.
 * Implemented with list_for_each_entry_safe() on @head's node_list.
 */
#define plist_for_each_safe(pos, n, head)        \
         list_for_each_entry_safe(pos, n, &(head)->node_list, node_list)

/**
 * plist_for_each_entry        - iterate over list of given type
 * @pos:        the type * to use as a loop counter
 * @head:        the head for your list
 * @mem:        the name of the plist_node member within the struct
 *
 * The iteration follows @mem.node_list, so @mem must name an embedded
 * member that has a node_list field.
 */
#define plist_for_each_entry(pos, head, mem)        \
         list_for_each_entry(pos, &(head)->node_list, mem.node_list)

/**
 * plist_for_each_entry_continue - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor
 * @head:        the head for your list
 * @m:                the name of the plist_node member within the struct
 *
 * Continue to iterate over list of given type, continuing after
 * the current position. The iteration follows @m.node_list.
 */
#define plist_for_each_entry_continue(pos, head, m)        \
        list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)

/**
 * plist_for_each_entry_safe - iterate safely over list of given type
 * @pos:        the type * to use as a loop counter
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list
 * @m:                the name of the plist_node member within the struct
 *
 * Iterate over list of given type, safe against removal of list entry.
 * The iteration follows @m.node_list.
 */
#define plist_for_each_entry_safe(pos, n, head, m)        \
        list_for_each_entry_safe(pos, n, &(head)->node_list, m.node_list)

/**
 * plist_head_empty - test whether a plist_head has no nodes
 * @head:        &struct plist_head pointer
 *
 * Return: the result of list_empty() on the head's node_list, i.e. !0 if
 * the plist_head is empty.
 */
static inline int plist_head_empty(const struct plist_head *head)
{
        const struct list_head *nodes = &head->node_list;

        return list_empty(nodes);
}

/**
 * plist_node_empty - test whether a plist_node is off-list
 * @node:        &struct plist_node pointer
 *
 * Return: the result of list_empty() on @node's node_list, i.e. !0 when
 * the node is not on a list.
 */
static inline int plist_node_empty(const struct plist_node *node)
{
        const struct list_head *link = &node->node_list;

        return list_empty(link);
}

/* All functions below assume the plist_head is not empty. */

/**
 * plist_first_entry - container of the first node on a plist
 * @plist:      the &struct plist_head pointer
 * @entry_type: type of the structure the node is embedded in
 * @field:      name of the embedded node member within @entry_type
 *
 * The result is container_of() applied to plist_first(@plist). When
 * CONFIG_DEBUG_PLIST is set, an empty @plist additionally triggers WARN_ON().
 */
#ifndef CONFIG_DEBUG_PLIST
# define plist_first_entry(plist, entry_type, field) \
        container_of(plist_first(plist), entry_type, field)
#else
# define plist_first_entry(plist, entry_type, field) \
({ \
        WARN_ON(plist_head_empty(plist)); \
        container_of(plist_first(plist), entry_type, field); \
})
#endif

/**
 * plist_last_entry - container of the last node on a plist
 * @plist:      the &struct plist_head pointer
 * @entry_type: type of the structure the node is embedded in
 * @field:      name of the embedded node member within @entry_type
 *
 * The result is container_of() applied to plist_last(@plist). When
 * CONFIG_DEBUG_PLIST is set, an empty @plist additionally triggers WARN_ON().
 */
#ifndef CONFIG_DEBUG_PLIST
# define plist_last_entry(plist, entry_type, field) \
        container_of(plist_last(plist), entry_type, field)
#else
# define plist_last_entry(plist, entry_type, field) \
({ \
        WARN_ON(plist_head_empty(plist)); \
        container_of(plist_last(plist), entry_type, field); \
})
#endif

/**
 * plist_next - step to the entry that follows @cursor
 * @cursor:     pointer to the current entry
 *
 * Expands to list_next_entry() on the entry's node_list linkage.
 */
#define plist_next(cursor) list_next_entry(cursor, node_list)

/**
 * plist_prev - step to the entry that precedes @cursor
 * @cursor:     pointer to the current entry
 *
 * Expands to list_prev_entry() on the entry's node_list linkage.
 */
#define plist_prev(cursor) list_prev_entry(cursor, node_list)

/**
 * plist_first - first node of a plist (and thus, highest priority)
 * @head:        the &struct plist_head pointer
 *
 * The plist must _not_ be empty.
 *
 * Return: the &struct plist_node that embeds @head->node_list.next.
 */
static inline struct plist_node *plist_first(const struct plist_head *head)
{
        struct list_head *front = head->node_list.next;

        return list_entry(front, struct plist_node, node_list);
}

/**
 * plist_last - last node of a plist (and thus, lowest priority)
 * @head:        the &struct plist_head pointer
 *
 * The plist must _not_ be empty.
 *
 * Return: the &struct plist_node that embeds @head->node_list.prev.
 */
static inline struct plist_node *plist_last(const struct plist_head *head)
{
        struct list_head *back = head->node_list.prev;

        return list_entry(back, struct plist_node, node_list);
}

#endif





























































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
#include <linux/sched/coredump.h>

/*
 * Routines that operate on a struct mm_struct.
 */
struct mm_struct *mm_alloc(void);

/**
 * mmgrab() - Pin a &struct mm_struct.
 * @mm: The &struct mm_struct to pin.
 *
 * Make sure that @mm will not get freed even after the owning task
 * exits. This doesn't guarantee that the associated address space
 * will still exist later on and mmget_not_zero() has to be used before
 * accessing it.
 *
 * This is a preferred way to pin @mm for a longer/unbounded amount
 * of time.
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmgrab(struct mm_struct *mm)
{
        atomic_inc(&mm->mm_count);
}

/*
 * Barrier to issue after mmgrab(): mmgrab() is an atomic_inc(), so this
 * simply forwards to smp_mb__after_atomic().
 */
static inline void smp_mb__after_mmgrab(void)
{
        smp_mb__after_atomic();
}

void __mmdrop(struct mm_struct *mm);

/*
 * Drop one &mm_struct.mm_count reference; the caller that drops the last
 * one hands @mm to __mmdrop().
 */
static inline void mmdrop(struct mm_struct *mm)
{
        /*
         * atomic_dec_and_test() implies a full barrier. The membarrier
         * system call requires that barrier before returning to
         * user-space, after storing to rq->curr.
         */
        if (likely(!atomic_dec_and_test(&mm->mm_count)))
                return;

        __mmdrop(mm);
}

#ifdef CONFIG_PREEMPT_RT
/*
 * call_rcu() callback performing the delayed mm drop. This is not strictly
 * an RCU use case, but call_rcu() is by far the least expensive way to
 * defer the work.
 */
static inline void __mmdrop_delayed(struct rcu_head *rhp)
{
        __mmdrop(container_of(rhp, struct mm_struct, delayed_drop));
}

/*
 * Invoked from finish_task_switch(). On RT kernels the heavy lifting is
 * delegated via RCU instead of being done in place.
 */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        /* Provides a full memory barrier. See mmdrop() */
        if (!atomic_dec_and_test(&mm->mm_count))
                return;

        call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}
#else
/* Without CONFIG_PREEMPT_RT this is a plain mmdrop(). */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        mmdrop(mm);
}
#endif

/*
 * Helpers for lazy TLB mm refcounting.
 *
 * mmgrab_lazy_tlb() only takes the mmgrab() reference when
 * CONFIG_MMU_LAZY_TLB_REFCOUNT is enabled.
 */
static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
                return;

        mmgrab(mm);
}

/*
 * Counterpart of mmgrab_lazy_tlb(): mmdrop() when lazy TLB references are
 * counted, otherwise only the memory barrier that mmdrop() would imply.
 */
static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                /*
                 * mmdrop_lazy_tlb must provide a full memory barrier, see
                 * the membarrier comment in finish_task_switch which
                 * relies on this.
                 */
                smp_mb();
                return;
        }

        mmdrop(mm);
}

/*
 * mmdrop_sched() flavour of mmdrop_lazy_tlb(): drops the reference via
 * mmdrop_sched() when lazy TLB references are counted, otherwise only
 * issues the full barrier.
 */
static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                smp_mb(); /* see mmdrop_lazy_tlb() above */
                return;
        }

        mmdrop_sched(mm);
}

/**
 * mmget() - Pin the address space associated with a &struct mm_struct.
 * @mm: The address space to pin.
 *
 * Make sure that the address space of the given &struct mm_struct doesn't
 * go away. This does not protect against parts of the address space being
 * modified or freed, however.
 *
 * Never use this function to pin this address space for an
 * unbounded/indefinite amount of time.
 *
 * Use mmput() to release the reference acquired by mmget().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmget(struct mm_struct *mm)
{
        atomic_inc(&mm->mm_users);
}

static inline bool mmget_not_zero(struct mm_struct *mm)
{
        return atomic_inc_not_zero(&mm->mm_users);
}

/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *mm);
#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
/*
 * Same as mmput() but performs the slow path from the async context. Can
 * be called from the atomic context as well.
 */
void mmput_async(struct mm_struct *mm);
#endif

/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
 * Grab a reference to a task's mm, if it is not already going away
 * and ptrace_may_access with the mode parameter passed to it
 * succeeds.
 */
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current task's stale references to the old mm_struct on exit() */
extern void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm);
/* Remove the current task's stale references to the old mm_struct on exec() */
extern void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm);

#ifdef CONFIG_MEMCG
void mm_update_next_owner(struct mm_struct *mm);
#else
/* Nothing to update when CONFIG_MEMCG is off. */
static inline void mm_update_next_owner(struct mm_struct *mm) { }
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MMU
/* Generic fallbacks, used unless the architecture already defined its own. */
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr, len, flags)        (TASK_SIZE)
#endif

#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

extern void arch_pick_mmap_layout(struct mm_struct *mm,
                                  const struct rlimit *rlim_stack);

unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                       unsigned long len, unsigned long pgoff,
                       unsigned long flags, vm_flags_t vm_flags);
/* Last parameter named like in every sibling prototype of this block. */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                               unsigned long len, unsigned long pgoff,
                               unsigned long flags, vm_flags_t vm_flags);

unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
                                   unsigned long addr, unsigned long len,
                                   unsigned long pgoff, unsigned long flags);

unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
                                           struct file *filp,
                                           unsigned long addr,
                                           unsigned long len,
                                           unsigned long pgoff,
                                           unsigned long flags,
                                           vm_flags_t vm_flags);

unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags, vm_flags_t vm_flags);
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags, vm_flags_t vm_flags);
#else
/* Without CONFIG_MMU picking an mmap layout is a no-op. */
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
                                         const struct rlimit *rlim_stack) {}
#endif

static inline bool in_vfork(struct task_struct *tsk)
{
        bool ret;

        /*
         * need RCU to access ->real_parent if CLONE_VM was used along with
         * CLONE_PARENT.
         *
         * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
         * imply CLONE_VM
         *
         * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
         * ->real_parent is not necessarily the task doing vfork(), so in
         * theory we can't rely on task_lock() if we want to dereference it.
         *
         * And in this case we can't trust the real_parent->mm == tsk->mm
         * check, it can be false negative. But we do not care, if init or
         * another oom-unkillable task does this it should blame itself.
         */
        rcu_read_lock();
        ret = tsk->vfork_done &&
                        rcu_dereference(tsk->real_parent)->mm == tsk->mm;
        rcu_read_unlock();

        return ret;
}

/*
 * Narrow @flags according to the allocation scopes the current task is in:
 * PF_MEMALLOC_NOIO implies GFP_NOIO
 * PF_MEMALLOC_NOFS implies GFP_NOFS
 * PF_MEMALLOC_PIN  implies !GFP_MOVABLE
 *
 * Returns @flags unchanged when none of these task flags is set.
 */
static inline gfp_t current_gfp_context(gfp_t flags)
{
        unsigned int pflags = READ_ONCE(current->flags);
        const unsigned int scopes = PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS |
                                    PF_MEMALLOC_PIN;

        /* Common case: the task is in no allocation scope at all. */
        if (likely(!(pflags & scopes)))
                return flags;

        /*
         * NOIO implies both NOIO and NOFS and it is a weaker context
         * so always make sure it takes precedence.
         */
        if (pflags & PF_MEMALLOC_NOIO)
                flags &= ~(__GFP_IO | __GFP_FS);
        else if (pflags & PF_MEMALLOC_NOFS)
                flags &= ~__GFP_FS;

        if (pflags & PF_MEMALLOC_PIN)
                flags &= ~__GFP_MOVABLE;

        return flags;
}

#ifdef CONFIG_LOCKDEP
void __fs_reclaim_acquire(unsigned long ip);
void __fs_reclaim_release(unsigned long ip);
void fs_reclaim_acquire(gfp_t gfp_mask);
void fs_reclaim_release(gfp_t gfp_mask);
#else
/* Without CONFIG_LOCKDEP all four annotations are empty. */
static inline void __fs_reclaim_acquire(unsigned long ip)
{
}

static inline void __fs_reclaim_release(unsigned long ip)
{
}

static inline void fs_reclaim_acquire(gfp_t gfp_mask)
{
}

static inline void fs_reclaim_release(gfp_t gfp_mask)
{
}
#endif

/*
 * Any memory-allocation retry loop should call memalloc_retry_wait() and
 * pass the flags of the most constrained allocation attempt that might
 * have failed. This documents where such loops are, and gives a central
 * place to fine tune the waiting as the MM implementation changes.
 */
static inline void memalloc_retry_wait(gfp_t gfp_flags)
{
        bool waited;

        /*
         * io_schedule_timeout() is used because waiting for memory
         * typically includes waiting for dirty pages to be written out,
         * which requires IO.
         */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        gfp_flags = current_gfp_context(gfp_flags);
        waited = gfpflags_allow_blocking(gfp_flags) &&
                 !(gfp_flags & __GFP_NORETRY);

        /*
         * If the allocation probably waited already there is no need for
         * much more. Otherwise it probably didn't wait and has now released
         * a lock, so now is a good time to wait.
         */
        io_schedule_timeout(waited ? 1 : HZ/50);
}

/**
 * might_alloc - Mark possible allocation sites
 * @gfp_mask: gfp_t flags that would be used to allocate
 *
 * Like might_sleep() and similar annotations, this is meant for functions
 * that might allocate, but often don't. Compiles to nothing without
 * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp_mask allows
 * blocking.
 */
static inline void might_alloc(gfp_t gfp_mask)
{
        bool may_block;

        /* Acquire and immediately release the fs_reclaim annotation. */
        fs_reclaim_acquire(gfp_mask);
        fs_reclaim_release(gfp_mask);

        may_block = gfpflags_allow_blocking(gfp_mask);
        might_sleep_if(may_block);
}

/**
 * memalloc_flags_save - Add a PF_* flag to current->flags, save old value
 * @flags: PF_* bits to set in current->flags.
 *
 * PF_* flags can be added this way no matter whether they are already set;
 * the previous state is brought back with memalloc_flags_restore().
 *
 * Return: the bits of @flags that were not set before the call, to be
 * passed to memalloc_flags_restore().
 */
static inline unsigned memalloc_flags_save(unsigned flags)
{
        unsigned int newly_set = flags & ~current->flags;

        current->flags |= flags;

        return newly_set;
}

/**
 * memalloc_flags_restore - Clear PF_* flags added by memalloc_flags_save()
 * @flags: bits to clear from current->flags, normally the return value of
 *         the pairing memalloc_flags_save() call.
 */
static inline void memalloc_flags_restore(unsigned flags)
{
        current->flags = current->flags & ~flags;
}

/**
 * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
 *
 * Starts a GFP_NOIO allocation scope by setting PF_MEMALLOC_NOIO on the
 * current task. Every allocation made inside the scope implicitly drops
 * the __GFP_IO flag, which makes it safe for the IO critical section from
 * the allocation recursion point of view. End the scope by passing the
 * value returned here to memalloc_noio_restore().
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_noio_restore.
 */
static inline unsigned int memalloc_noio_save(void)
{
        unsigned int saved = memalloc_flags_save(PF_MEMALLOC_NOIO);

        return saved;
}

/**
 * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
 * @flags: Value returned by the pairing memalloc_noio_save() call.
 *
 * Ends the implicit GFP_NOIO scope started by memalloc_noio_save().
 * Always pass the return value of the pairing memalloc_noio_save() call:
 * @flags is handed to memalloc_flags_restore(), which clears exactly those
 * bits from current->flags.
 */
static inline void memalloc_noio_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
 *
 * Starts a GFP_NOFS allocation scope by setting PF_MEMALLOC_NOFS on the
 * current task. Every allocation made inside the scope implicitly drops
 * the __GFP_FS flag, which makes it safe for the FS critical section from
 * the allocation recursion point of view. End the scope by passing the
 * value returned here to memalloc_nofs_restore().
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_nofs_restore.
 */
static inline unsigned int memalloc_nofs_save(void)
{
        unsigned int saved = memalloc_flags_save(PF_MEMALLOC_NOFS);

        return saved;
}

/**
 * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
 * @flags: Value returned by the pairing memalloc_nofs_save() call.
 *
 * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save().
 * Always pass the return value of the pairing memalloc_nofs_save() call:
 * @flags is handed to memalloc_flags_restore(), which clears exactly those
 * bits from current->flags.
 */
static inline void memalloc_nofs_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
 *
 * Starts a __GFP_MEMALLOC allocation scope by setting PF_MEMALLOC on the
 * current task. Every allocation made inside the scope implicitly gains
 * the __GFP_MEMALLOC flag, which prevents entering reclaim and allows
 * access to all memory reserves. Only use this when the caller guarantees
 * that the allocation will allow more memory to be freed very shortly,
 * i.e. memory has to be allocated in the process of freeing memory and
 * reclaim is not possible due to potential recursion.
 *
 * Users of this scope have to be extremely careful not to deplete the
 * reserves completely, and have to implement a throttling mechanism which
 * controls the consumption of the reserve based on the amount of freed
 * memory. A pre-allocated pool (e.g. mempool) should always be considered
 * before using this scope.
 *
 * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC
 *
 * Context: This function should not be used in an interrupt context as that one
 *          does not give PF_MEMALLOC access to reserves.
 *          See __gfp_pfmemalloc_flags().
 * Return: The saved flags to be passed to memalloc_noreclaim_restore.
 */
static inline unsigned int memalloc_noreclaim_save(void)
{
        unsigned int saved = memalloc_flags_save(PF_MEMALLOC);

        return saved;
}

/**
 * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
 * @flags: Value returned by the pairing memalloc_noreclaim_save() call.
 *
 * Ends the implicit __GFP_MEMALLOC scope started by
 * memalloc_noreclaim_save(). Always pass the return value of the pairing
 * memalloc_noreclaim_save() call: @flags is handed to
 * memalloc_flags_restore(), which clears exactly those bits from
 * current->flags.
 */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope.
 *
 * Starts a ~__GFP_MOVABLE allocation scope by setting PF_MEMALLOC_PIN on
 * the current task. Every allocation made inside the scope implicitly
 * loses the __GFP_MOVABLE flag, which constrains the allocations to zones
 * that allow long term pinning, i.e. not ZONE_MOVABLE zones.
 *
 * Return: The saved flags to be passed to memalloc_pin_restore.
 */
static inline unsigned int memalloc_pin_save(void)
{
        unsigned int saved = memalloc_flags_save(PF_MEMALLOC_PIN);

        return saved;
}

/**
 * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
 * @flags: Value returned by the pairing memalloc_pin_save() call.
 *
 * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save().
 * Always pass the return value of the pairing memalloc_pin_save() call:
 * @flags is handed to memalloc_flags_restore(), which clears exactly those
 * bits from current->flags.
 */
static inline void memalloc_pin_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
 * set_active_memcg - Starts the remote memcg charging scope.
 * @memcg: memcg to charge.
 *
 * Opens a remote memcg charging scope: until the scope ends, all
 * __GFP_ACCOUNT allocations are charged to the given memcg.
 *
 * The caller must hold a reference to the passed memcg structure, so that
 * its lifetime is guaranteed to exceed the scope between two
 * set_active_memcg() calls.
 *
 * In task context the memcg is recorded in current->active_memcg, in any
 * other context in the per-CPU int_active_memcg.
 *
 * NOTE: This function can nest. Users must save the return value and
 * reset the previous value after their own charging scope is over.
 *
 * Return: the memcg that was active before the call.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        struct mem_cgroup *prev;

        if (in_task()) {
                prev = current->active_memcg;
                current->active_memcg = memcg;
        } else {
                prev = this_cpu_read(int_active_memcg);
                this_cpu_write(int_active_memcg, memcg);
        }

        return prev;
}
#else
/* Without CONFIG_MEMCG nothing is recorded and NULL is returned. */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        return NULL;
}
#endif

#ifdef CONFIG_MEMBARRIER
/*
 * Bit flags, one bit per constant. MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE
 * is tested against mm->membarrier_state in
 * membarrier_mm_sync_core_before_usermode(); presumably the other
 * MEMBARRIER_STATE_* bits live in the same field - TODO confirm.
 */
enum {
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY                = (1U << 0),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED                        = (1U << 1),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY                        = (1U << 2),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                        = (1U << 3),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY        = (1U << 4),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE                = (1U << 5),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY                = (1U << 6),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ                        = (1U << 7),
};

/*
 * Separate set of bit flags.
 * NOTE(review): no user of MEMBARRIER_FLAG_* is visible in this header.
 */
enum {
        MEMBARRIER_FLAG_SYNC_CORE        = (1U << 0),
        MEMBARRIER_FLAG_RSEQ                = (1U << 1),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

/*
 * Call sync_core_before_usermode() when @mm is the current task's mm and
 * has MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE set in its
 * membarrier_state.
 */
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
        /*
         * The atomic_read() prevents CSE, so the IS_ENABLED() test is
         * evaluated first. That should help the compiler generate more
         * efficient code on architectures where
         * sync_core_before_usermode() is a no-op.
         */
        if (IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE) &&
            current->mm == mm &&
            unlikely(atomic_read(&mm->membarrier_state) &
                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))
                sync_core_before_usermode();
}

/* Defined outside this header when CONFIG_MEMBARRIER is set. */
void membarrier_exec_mmap(struct mm_struct *mm);
void membarrier_update_current_mm(struct mm_struct *next_mm);

#else
/* Empty stand-ins used when CONFIG_MEMBARRIER is not set. */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
                                             struct mm_struct *next,
                                             struct task_struct *tsk) { }
#endif
static inline void membarrier_exec_mmap(struct mm_struct *mm) { }
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { }
static inline void membarrier_update_current_mm(struct mm_struct *next_mm) { }
#endif

#endif /* _LINUX_SCHED_MM_H */




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Filesystem access notification for Linux
 *
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#ifndef __LINUX_FSNOTIFY_BACKEND_H
#define __LINUX_FSNOTIFY_BACKEND_H

#ifdef __KERNEL__

#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */
#include <linux/list.h>
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/sched/mm.h>

/*
 * The IN_* values from inotify.h line up EXACTLY with FS_*, so that we can
 * easily convert between them.  dnotify only needs conversion at watch
 * creation so no perf loss there.  fanotify isn't defined yet, so it can
 * use the holes if it needs more events.
 * NOTE(review): FS_ERROR below is already fanotify-specific, so the
 * "isn't defined yet" remark looks stale.
 */
#define FS_ACCESS                0x00000001        /* File was accessed */
#define FS_MODIFY                0x00000002        /* File was modified */
#define FS_ATTRIB                0x00000004        /* Metadata changed */
#define FS_CLOSE_WRITE                0x00000008        /* Writable file was closed */
#define FS_CLOSE_NOWRITE        0x00000010        /* Unwritable file was closed */
#define FS_OPEN                        0x00000020        /* File was opened */
#define FS_MOVED_FROM                0x00000040        /* File was moved from X */
#define FS_MOVED_TO                0x00000080        /* File was moved to Y */
#define FS_CREATE                0x00000100        /* Subfile was created */
#define FS_DELETE                0x00000200        /* Subfile was deleted */
#define FS_DELETE_SELF                0x00000400        /* Self was deleted */
#define FS_MOVE_SELF                0x00000800        /* Self was moved */
#define FS_OPEN_EXEC                0x00001000        /* File was opened for exec */

#define FS_UNMOUNT                0x00002000        /* inode on umount fs */
#define FS_Q_OVERFLOW                0x00004000        /* Event queue overflowed */
#define FS_ERROR                0x00008000        /* Filesystem Error (fanotify) */

/*
 * FS_IN_IGNORED overloads FS_ERROR: both are 0x00008000.  It is only used
 * internally by inotify, which does not support FS_ERROR.
 */
#define FS_IN_IGNORED                0x00008000        /* last inotify event here */

#define FS_OPEN_PERM                0x00010000        /* open event in a permission hook */
#define FS_ACCESS_PERM                0x00020000        /* access event in a permission hook */
#define FS_OPEN_EXEC_PERM        0x00040000        /* open/exec event in a permission hook */
/* #define FS_DIR_MODIFY        0x00080000 */        /* Deprecated (reserved) */

#define FS_PRE_ACCESS                0x00100000        /* Pre-content access hook */

#define FS_MNT_ATTACH                0x01000000        /* Mount was attached */
#define FS_MNT_DETACH                0x02000000        /* Mount was detached */
/* Both mount bits combined */
#define FS_MNT_MOVE                (FS_MNT_ATTACH | FS_MNT_DETACH)

/*
 * Set on inode mark that cares about things that happen to its children.
 * Always set for dnotify and inotify.
 * Set on inode/sb/mount marks that care about parent/name info.
 */
#define FS_EVENT_ON_CHILD        0x08000000

#define FS_RENAME                0x10000000        /* File was renamed */
#define FS_DN_MULTISHOT                0x20000000        /* dnotify multishot */
#define FS_ISDIR                0x40000000        /* event occurred against dir */

/* Both halves of a move: moved-from and moved-to */
#define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)

/*
 * Directory entry modification events - reported only to directory
 * where entry is modified and not to a watching parent.
 * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
 * when a directory entry inside a child subdir changes.
 */
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)

/* Mount namespace events */
#define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH)

/* Content events can be used to inspect file content */
#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \
                                      FS_ACCESS_PERM)
/* Pre-content events can be used to fill file content */
#define FSNOTIFY_PRE_CONTENT_EVENTS  (FS_PRE_ACCESS)

/* Every permission-hook event: the content and the pre-content sets */
#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \
                                  FSNOTIFY_PRE_CONTENT_EVENTS)

/*
 * This is a list of all events that may get sent to a parent that is watching
 * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory.
 * It consists of all permission events plus the access, modify, attrib,
 * close and open events.
 */
#define FS_EVENTS_POSS_ON_CHILD   (ALL_FSNOTIFY_PERM_EVENTS | \
                                   FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
                                   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
                                   FS_OPEN | FS_OPEN_EXEC)

/*
 * This is a list of all events that may get sent with the parent inode as the
 * @to_tell argument of fsnotify().
 * It may include events that can be sent to an inode/sb/mount mark, but cannot
 * be sent to a parent watching children.
 * As defined here it is exactly FS_EVENTS_POSS_ON_CHILD, with no extra events.
 */
#define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD)

/*
 * Events that can be reported to backends.
 * FS_IN_IGNORED and FS_ERROR are both listed although they share the same
 * bit value (0x00008000).
 */
#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
                             FSNOTIFY_MNT_EVENTS | \
                             FS_EVENTS_POSS_ON_CHILD | \
                             FS_DELETE_SELF | FS_MOVE_SELF | \
                             FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
                             FS_ERROR)

/* Extra flags that may be reported with event or control handling of events */
#define ALL_FSNOTIFY_FLAGS  (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT)

/* Union of the event bits and the flag bits above */
#define ALL_FSNOTIFY_BITS   (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS)

struct fsnotify_group;
struct fsnotify_event;
struct fsnotify_mark;
struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;

struct mem_cgroup;

/*
 * Each group must define these ops.  The fsnotify infrastructure will call
 * these operations for each relevant group.
 *
 * handle_event - main call for a group to handle an fs event
 * @group:        group to notify
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 * @iter_info:        array of marks from this group that are interested in the event
 *
 * handle_inode_event - simple variant of handle_event() for groups that only
 *                have inode marks and don't have ignore mask
 * @mark:        mark to notify
 * @mask:        event type and flags
 * @inode:        inode that event happened on
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to.
 *                Either @inode or @dir must be non-NULL.
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 *
 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
 * freeing_mark - called when a mark is being destroyed for some reason.  The group
 *                MUST be holding a reference on each mark and that reference must be
 *                dropped in this function.  inotify uses this function to send
 *                userspace messages that marks have been removed.
 */
struct fsnotify_ops {
        /* Main event callback; arguments are described in the comment above */
        int (*handle_event)(struct fsnotify_group *group, u32 mask,
                            const void *data, int data_type, struct inode *dir,
                            const struct qstr *file_name, u32 cookie,
                            struct fsnotify_iter_info *iter_info);
        /* Simple per-mark variant of handle_event(); see the comment above */
        int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
                            struct inode *inode, struct inode *dir,
                            const struct qstr *file_name, u32 cookie);
        /* called when the group refcnt hits 0 */
        void (*free_group_priv)(struct fsnotify_group *group);
        /* called when a mark is being destroyed */
        void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
        /*
         * NOTE(review): not described in the comment above; presumably
         * releases @event on behalf of @group - confirm against callers.
         */
        void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event);
        /* called on final put+free to free memory */
        void (*free_mark)(struct fsnotify_mark *mark);
};

/*
 * all of the information about the original object we want to now send to
 * a group.  If you want to carry more info from the accessing task to the
 * listener this structure is where you need to be adding fields.
 */
struct fsnotify_event {
        /*
         * List linkage, the only member of the generic event.
         * NOTE(review): presumably chains the event on the group's
         * notification_list - confirm.
         */
        struct list_head list;
};

/*
 * fsnotify group priorities.
 * Events are sent in order from highest priority to lowest priority.
 */
enum fsnotify_group_prio {
        FSNOTIFY_PRIO_NORMAL = 0,        /* normal notifiers, no permissions */
        FSNOTIFY_PRIO_CONTENT,                /* fanotify permission events */
        FSNOTIFY_PRIO_PRE_CONTENT,        /* fanotify pre-content events */
        /* Number of priorities; sizes watched_objects[] in fsnotify_sb_info */
        __FSNOTIFY_PRIO_NUM
};

/*
 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
 */
struct fsnotify_group {
        const struct fsnotify_ops *ops;        /* how this group handles things */

        /*
         * How the refcnt is used is up to each group.  When the refcnt hits 0
         * fsnotify will clean up all of the resources associated with this group.
         * As an example, the dnotify group will always have a refcnt=1 and that
         * will never change.  Inotify, on the other hand, has a group per
         * inotify_init() and the refcnt will hit 0 only when that fd has been
         * closed.
         */
        refcount_t refcnt;                /* things with interest in this group */

        /* needed to send notification to userspace */
        spinlock_t notification_lock;                /* protect the notification_list */
        struct list_head notification_list;        /* list of event_holder this group needs to send to userspace */
        wait_queue_head_t notification_waitq;        /* read() on the notification file blocks on this waitq */
        unsigned int q_len;                        /* events on the queue */
        unsigned int max_events;                /* maximum events allowed on the list */
        enum fsnotify_group_prio priority;        /* priority for sending events */
        bool shutdown;                /* group is being shut down, don't queue more events */

#define FSNOTIFY_GROUP_USER        0x01 /* user allocated group */
#define FSNOTIFY_GROUP_DUPS        0x02 /* allow multiple marks per object */
        int flags;                        /* presumably FSNOTIFY_GROUP_* bits above - TODO confirm */
        /*
         * Value returned by memalloc_nofs_save() in fsnotify_group_lock();
         * passed to memalloc_nofs_restore() in fsnotify_group_unlock().
         */
        unsigned int owner_flags;        /* stored flags of mark_mutex owner */

        /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
        struct mutex mark_mutex;        /* protect marks_list */
        atomic_t user_waits;                /* Number of tasks waiting for user
                                         * response */
        struct list_head marks_list;        /* all inode marks for this group */

        struct fasync_struct *fsn_fa;    /* async notification */

        struct fsnotify_event *overflow_event;        /* Event we queue when the
                                                 * notification list is too
                                                 * full */

        struct mem_cgroup *memcg;        /* memcg to charge allocations */
        struct user_namespace *user_ns;        /* user ns where group was created */

        /* groups can define private fields here or use the void *private */
        union {
                void *private;
#ifdef CONFIG_INOTIFY_USER
                /* inotify-only state, present when CONFIG_INOTIFY_USER is set */
                struct inotify_group_private_data {
                        spinlock_t        idr_lock;
                        struct idr      idr;
                        struct ucounts *ucounts;
                } inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
                /* fanotify-only state, present when CONFIG_FANOTIFY is set */
                struct fanotify_group_private_data {
                        /* Hash table of events for merge */
                        struct hlist_head *merge_hash;
                        /* allows a group to block waiting for a userspace response */
                        struct list_head access_list;
                        wait_queue_head_t access_waitq;
                        int flags;           /* flags from fanotify_init() */
                        int f_flags; /* event_f_flags from fanotify_init() */
                        struct ucounts *ucounts;
                        mempool_t error_events_pool;
                        /* chained on perm_group_list */
                        struct list_head perm_grp_list;
                } fanotify_data;
#endif /* CONFIG_FANOTIFY */
        };
};

/*
 * The group lock/unlock/assert helpers exist to prevent a deadlock when
 * reclaiming inodes with evictable marks of the same group that is
 * allocating a new mark.
 *
 * Locking takes group->mark_mutex and then records the value returned by
 * memalloc_nofs_save() in group->owner_flags for the matching unlock.
 */
static inline void fsnotify_group_lock(struct fsnotify_group *group)
{
        unsigned int nofs_flags;

        mutex_lock(&group->mark_mutex);
        nofs_flags = memalloc_nofs_save();
        group->owner_flags = nofs_flags;
}

static inline void fsnotify_group_unlock(struct fsnotify_group *group)
{
        /* Hand back what fsnotify_group_lock() stored, then drop the mutex */
        unsigned int saved_flags = group->owner_flags;

        memalloc_nofs_restore(saved_flags);
        mutex_unlock(&group->mark_mutex);
}

static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
{
        /* Warn once if mark_mutex is not locked at all ... */
        WARN_ON_ONCE(mutex_is_locked(&group->mark_mutex) == 0);
        /* ... and once if the current task lacks PF_MEMALLOC_NOFS */
        WARN_ON_ONCE((current->flags & PF_MEMALLOC_NOFS) == 0);
}

/*
 * When calling fsnotify tell it what the opaque data pointer refers to.
 * The fsnotify_data_*() accessors below decode the pointer by this type.
 */
enum fsnotify_data_type {
        FSNOTIFY_EVENT_NONE,
        FSNOTIFY_EVENT_FILE_RANGE,        /* data is a struct file_range */
        FSNOTIFY_EVENT_PATH,                /* data is a struct path */
        FSNOTIFY_EVENT_INODE,                /* data is a struct inode */
        FSNOTIFY_EVENT_DENTRY,                /* data is a struct dentry */
        FSNOTIFY_EVENT_MNT,                /* data is a struct fsnotify_mnt */
        FSNOTIFY_EVENT_ERROR,                /* data is a struct fs_error_report */
};

/* Event data passed with data type FSNOTIFY_EVENT_ERROR */
struct fs_error_report {
        int error;
        struct inode *inode;        /* returned by fsnotify_data_inode() */
        struct super_block *sb;        /* returned by fsnotify_data_sb() */
};

/* Event data passed with data type FSNOTIFY_EVENT_FILE_RANGE */
struct file_range {
        const struct path *path;        /* read through file_range_path() */
        /* NOTE(review): presumably start offset and length of the range - confirm */
        loff_t pos;
        size_t count;
};

/* Accessor for the path stored in @range */
static inline const struct path *file_range_path(const struct file_range *range)
{
        const struct path *range_path = range->path;

        return range_path;
}

/* Event data passed with data type FSNOTIFY_EVENT_MNT */
struct fsnotify_mnt {
        const struct mnt_namespace *ns;
        u64 mnt_id;        /* returned by fsnotify_data_mnt_id() */
};

static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
        switch (data_type) {
        case FSNOTIFY_EVENT_INODE:
                return (struct inode *)data;
        case FSNOTIFY_EVENT_DENTRY:
                return d_inode(data);
        case FSNOTIFY_EVENT_PATH:
                return d_inode(((const struct path *)data)->dentry);
        case FSNOTIFY_EVENT_FILE_RANGE:
                return d_inode(file_range_path(data)->dentry);
        case FSNOTIFY_EVENT_ERROR:
                return ((struct fs_error_report *)data)->inode;
        default:
                return NULL;
        }
}

/*
 * Resolve the event @data to a dentry according to @data_type.
 * Returns NULL for data types that carry no dentry.
 */
static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type)
{
        const struct path *path;

        if (data_type == FSNOTIFY_EVENT_DENTRY) {
                /* Non const is needed for dget() */
                return (struct dentry *)data;
        }

        if (data_type == FSNOTIFY_EVENT_PATH)
                path = data;
        else if (data_type == FSNOTIFY_EVENT_FILE_RANGE)
                path = file_range_path(data);
        else
                return NULL;

        return path->dentry;
}

/*
 * Resolve the event @data to a path according to @data_type.
 * Only path and file-range events carry one; everything else yields NULL.
 */
static inline const struct path *fsnotify_data_path(const void *data,
                                                    int data_type)
{
        if (data_type == FSNOTIFY_EVENT_PATH)
                return data;

        if (data_type == FSNOTIFY_EVENT_FILE_RANGE)
                return file_range_path(data);

        return NULL;
}

static inline struct super_block *fsnotify_data_sb(const void *data,
                                                   int data_type)
{
        switch (data_type) {
        case FSNOTIFY_EVENT_INODE:
                return ((struct inode *)data)->i_sb;
        case FSNOTIFY_EVENT_DENTRY:
                return ((struct dentry *)data)->d_sb;
        case FSNOTIFY_EVENT_PATH:
                return ((const struct path *)data)->dentry->d_sb;
        case FSNOTIFY_EVENT_FILE_RANGE:
                return file_range_path(data)->dentry->d_sb;
        case FSNOTIFY_EVENT_ERROR:
                return ((struct fs_error_report *) data)->sb;
        default:
                return NULL;
        }
}

/* Return @data as mount event data, or NULL if it is not a mount event */
static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data,
                                                           int data_type)
{
        if (data_type != FSNOTIFY_EVENT_MNT)
                return NULL;

        return data;
}

/* Return the mount id carried by a mount event, or 0 for any other event */
static inline u64 fsnotify_data_mnt_id(const void *data, int data_type)
{
        const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type);

        if (!mnt_data)
                return 0;

        return mnt_data->mnt_id;
}

static inline struct fs_error_report *fsnotify_data_error_report(
                                                        const void *data,
                                                        int data_type)
{
        switch (data_type) {
        case FSNOTIFY_EVENT_ERROR:
                return (struct fs_error_report *) data;
        default:
                return NULL;
        }
}

/* Return @data as a file range, or NULL if it is not a file-range event */
static inline const struct file_range *fsnotify_data_file_range(
                                                        const void *data,
                                                        int data_type)
{
        if (data_type != FSNOTIFY_EVENT_FILE_RANGE)
                return NULL;

        return data;
}

/*
 * Index to merged marks iterator array that correlates to a type of watch.
 * The type of watched object can be deduced from the iterator type, but not
 * the other way around, because an event can match different watched objects
 * of the same object type.
 * For example, both parent and child are watching an object of type inode.
 */
enum fsnotify_iter_type {
        FSNOTIFY_ITER_TYPE_INODE,
        FSNOTIFY_ITER_TYPE_VFSMOUNT,
        FSNOTIFY_ITER_TYPE_SB,
        FSNOTIFY_ITER_TYPE_PARENT,
        FSNOTIFY_ITER_TYPE_INODE2,
        FSNOTIFY_ITER_TYPE_MNTNS,
        /* Number of iterator types; sizes marks[] in fsnotify_iter_info */
        FSNOTIFY_ITER_TYPE_COUNT
};

/* The type of object that a mark is attached to */
enum fsnotify_obj_type {
        FSNOTIFY_OBJ_TYPE_ANY = -1,
        FSNOTIFY_OBJ_TYPE_INODE,
        FSNOTIFY_OBJ_TYPE_VFSMOUNT,
        FSNOTIFY_OBJ_TYPE_SB,
        FSNOTIFY_OBJ_TYPE_MNTNS,
        /* Number of real object types; values below this are valid types */
        FSNOTIFY_OBJ_TYPE_COUNT,
        /* Shares the value of FSNOTIFY_OBJ_TYPE_COUNT, so it is not a valid type */
        FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
};

static inline bool fsnotify_valid_obj_type(unsigned int obj_type)
{
        return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT);
}

struct fsnotify_iter_info {
        /* One mark slot per iterator type, indexed by enum fsnotify_iter_type */
        struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT];
        struct fsnotify_group *current_group;
        /* Bit (1U << iter_type) set means marks[iter_type] is to be reported */
        unsigned int report_mask;
        /* NOTE(review): presumably the SRCU read-side index of the walk - confirm */
        int srcu_idx;
};

/* Test whether the report bit for @iter_type is set in @iter_info */
static inline bool fsnotify_iter_should_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        unsigned int type_bit = 1U << iter_type;

        return (iter_info->report_mask & type_bit) != 0;
}

/* Set the report bit for @iter_type in @iter_info */
static inline void fsnotify_iter_set_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        unsigned int type_bit = 1U << iter_type;

        iter_info->report_mask = iter_info->report_mask | type_bit;
}

/*
 * Return the mark stored for @iter_type, but only if that type is flagged
 * for reporting; otherwise return NULL.
 */
static inline struct fsnotify_mark *fsnotify_iter_mark(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        if (!fsnotify_iter_should_report_type(iter_info, iter_type))
                return NULL;

        return iter_info->marks[iter_type];
}

/*
 * Starting at @type, find the first iterator type that has a reportable
 * mark.  *@markp is written on every type examined, so it holds the found
 * mark on success and NULL when the search runs off the end.  Returns the
 * type found, or FSNOTIFY_ITER_TYPE_COUNT (or the unchanged @type if it was
 * already past the end) when there is none.
 */
static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type,
                                     struct fsnotify_mark **markp)
{
        for (; type < FSNOTIFY_ITER_TYPE_COUNT; type++) {
                struct fsnotify_mark *found = fsnotify_iter_mark(iter, type);

                *markp = found;
                if (found)
                        return type;
        }

        return type;
}

/*
 * Generate fsnotify_iter_<name>_mark(), a shorthand for
 * fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_<NAME>).
 */
#define FSNOTIFY_ITER_FUNCS(name, NAME) \
static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
                struct fsnotify_iter_info *iter_info) \
{ \
        return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \
}

/* Instantiated for the inode, parent, vfsmount and sb iterator types */
FSNOTIFY_ITER_FUNCS(inode, INODE)
FSNOTIFY_ITER_FUNCS(parent, PARENT)
FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)
FSNOTIFY_ITER_FUNCS(sb, SB)

/* Walk @type over every iterator type, 0 .. FSNOTIFY_ITER_TYPE_COUNT - 1 */
#define fsnotify_foreach_iter_type(type) \
        for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++)
/*
 * Walk only the iterator types for which @iter has a mark to report.
 * Before each pass fsnotify_iter_step() advances @type to the next such type
 * and stores its mark in @mark.  @type and @mark are expanded more than
 * once, so pass plain variables without side effects.
 */
#define fsnotify_foreach_iter_mark_type(iter, mark, type) \
        for (type = 0; \
             type = fsnotify_iter_step(iter, type, &mark), \
             type < FSNOTIFY_ITER_TYPE_COUNT; \
             type++)

/*
 * Inode/vfsmount/sb point to this structure which tracks all marks attached to
 * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this
 * structure. We destroy this structure when there are no more marks attached
 * to it. The structure is protected by fsnotify_mark_srcu.
 */
struct fsnotify_mark_connector {
        spinlock_t lock;
        unsigned char type;        /* Type of object [lock] */
        unsigned char prio;        /* Highest priority group */
#define FSNOTIFY_CONN_FLAG_IS_WATCHED        0x01
#define FSNOTIFY_CONN_FLAG_HAS_IREF        0x02
        unsigned short flags;        /* flags [lock] */
        union {
                /* Object pointer [lock] */
                void *obj;
                /* Links connectors waiting to be freed after the srcu period expires */
                struct fsnotify_mark_connector *destroy_next;
        };
        /* Head of the list of marks attached to the object */
        struct hlist_head list;
};

/*
 * Container for per-sb fsnotify state (sb marks and more).
 * Attached lazily on first marked object on the sb and freed when killing sb.
 */
struct fsnotify_sb_info {
        struct fsnotify_mark_connector __rcu *sb_marks;
        /*
         * Number of inode/mount/sb objects that are being watched in this sb.
         * Note that inode objects are currently double-accounted.
         *
         * The value in watched_objects[prio] is the number of objects that are
         * watched by groups of priority >= prio, so watched_objects[0] is the
         * total number of watched objects in this sb.
         */
        atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM];
};

/*
 * Read the per-sb fsnotify state pointer of @sb with READ_ONCE().
 * Always NULL when CONFIG_FSNOTIFY is not set.
 */
static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
{
        struct fsnotify_sb_info *sbinfo = NULL;

#ifdef CONFIG_FSNOTIFY
        sbinfo = READ_ONCE(sb->s_fsnotify_info);
#endif
        return sbinfo;
}

static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb)
{
        return &fsnotify_sb_info(sb)->watched_objects[0];
}

/*
 * A mark is simply an object attached to an in core inode which allows an
 * fsnotify listener to indicate they are either no longer interested in events
 * of a type matching mask or only interested in those events.
 *
 * These are flushed when an inode is evicted from core and may be flushed
 * when the inode is modified (as seen by fsnotify_access).  Some fsnotify
 * users (such as dnotify) will flush these when the open fd is closed and not
 * at inode eviction or modification.
 *
 * Text in brackets is showing the lock(s) protecting modifications of a
 * particular entry. obj_lock means either inode->i_lock or
 * mnt->mnt_root->d_lock depending on the mark type.
 */
struct fsnotify_mark {
        /* Mask this mark is for [mark->lock, group->mark_mutex] */
        __u32 mask;
        /* We hold one for presence in g_list. Also one ref for each 'thing'
         * in kernel that found and may be using this mark. */
        refcount_t refcnt;
        /* Group this mark is for. Set on mark creation, stable until last ref
         * is dropped */
        struct fsnotify_group *group;
        /* List of marks by group->marks_list. Also reused for queueing
         * mark into destroy_list when it's waiting for the end of SRCU period
         * before it can be freed. [group->mark_mutex] */
        struct list_head g_list;
        /* Protects inode / mnt pointers, flags, masks */
        spinlock_t lock;
        /* List of marks for inode / vfsmount [connector->lock, mark ref] */
        struct hlist_node obj_list;
        /* Head of list of marks for an object [mark ref] */
        struct fsnotify_mark_connector *connector;
        /* Event types and flags to ignore [mark->lock, group->mark_mutex] */
        __u32 ignore_mask;
        /* General fsnotify mark flags */
#define FSNOTIFY_MARK_FLAG_ALIVE                0x0001
#define FSNOTIFY_MARK_FLAG_ATTACHED                0x0002
        /* inotify mark flags */
#define FSNOTIFY_MARK_FLAG_EXCL_UNLINK                0x0010
#define FSNOTIFY_MARK_FLAG_IN_ONESHOT                0x0020
        /* fanotify mark flags */
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY        0x0100
#define FSNOTIFY_MARK_FLAG_NO_IREF                0x0200
#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS        0x0400
#define FSNOTIFY_MARK_FLAG_HAS_FSID                0x0800
#define FSNOTIFY_MARK_FLAG_WEAK_FSID                0x1000
        /* Holds the FSNOTIFY_MARK_FLAG_* bits defined above */
        unsigned int flags;                /* flags [mark->lock] */
};

#ifdef CONFIG_FSNOTIFY

/* called from the vfs helpers */

/* main fsnotify call to send events */
extern int fsnotify(__u32 mask, const void *data, int data_type,
                    struct inode *dir, const struct qstr *name,
                    struct inode *inode, u32 cookie);
extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                           int data_type);
extern void __fsnotify_inode_delete(struct inode *inode);
extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
extern void fsnotify_sb_delete(struct super_block *sb);
extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns);
extern void fsnotify_sb_free(struct super_block *sb);
extern u32 fsnotify_get_cookie(void);
extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt);

/*
 * Reduce @mask to the events that can be reported with parent/name info.
 * Yields 0 unless FS_EVENT_ON_CHILD is set, since that flag is what marks
 * wanting parent/name info carry.
 */
static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
{
        /*
         * Some mark may care about parent/name info; keep only the events
         * that can actually be reported that way.
         */
        if (mask & FS_EVENT_ON_CHILD)
                return mask & FS_EVENTS_POSS_TO_PARENT;

        return 0;
}

/*
 * Non-zero when @inode's fsnotify mask has FS_EVENT_ON_CHILD set together
 * with at least one event that can happen on a child.
 */
static inline int fsnotify_inode_watches_children(struct inode *inode)
{
        __u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask);

        /*
         * With FS_EVENT_ON_CHILD set the inode may care about its children;
         * what remains is whether it wants any event possible on a child.
         */
        if (parent_mask & FS_EVENT_ON_CHILD)
                return parent_mask & FS_EVENTS_POSS_ON_CHILD;

        return 0;
}

/*
 * Update the dentry with a flag indicating the interest of its parent to receive
 * filesystem events when those events happen to this dentry->d_inode.
 * The caller must hold dentry->d_lock.
 */
static inline void fsnotify_update_flags(struct dentry *dentry)
{
        struct inode *parent_inode;

        assert_spin_locked(&dentry->d_lock);

        /*
         * d_lock serialises setting PARENT_WATCHED on the dentries.  If
         * inotify_inode_watched changes after we have taken d_lock, the
         * following fsnotify_set_children_dentry_flags call will find our
         * entry, so it will spin until we complete here, and update us with
         * the new state.
         */
        parent_inode = dentry->d_parent->d_inode;
        if (!fsnotify_inode_watches_children(parent_inode)) {
                dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                return;
        }

        dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
}

/* called from fsnotify listeners, such as fanotify or dnotify */

/* create a new group */
extern struct fsnotify_group *fsnotify_alloc_group(
                                const struct fsnotify_ops *ops,
                                int flags);
/* get reference to a group */
extern void fsnotify_get_group(struct fsnotify_group *group);
/* drop reference on a group from fsnotify_alloc_group */
extern void fsnotify_put_group(struct fsnotify_group *group);
/* group destruction begins, stop queuing new events */
extern void fsnotify_group_stop_queueing(struct fsnotify_group *group);
/* destroy group */
extern void fsnotify_destroy_group(struct fsnotify_group *group);
/* fasync handler function */
extern int fsnotify_fasync(int fd, struct file *file, int on);
/* Free event from memory */
extern void fsnotify_destroy_event(struct fsnotify_group *group,
                                   struct fsnotify_event *event);
/* attach the event to the group notification queue */
extern int fsnotify_insert_event(struct fsnotify_group *group,
                                 struct fsnotify_event *event,
                                 int (*merge)(struct fsnotify_group *,
                                              struct fsnotify_event *),
                                 void (*insert)(struct fsnotify_group *,
                                                struct fsnotify_event *));

/*
 * Convenience form of fsnotify_insert_event() that passes no insert
 * callback (NULL).  Returns whatever fsnotify_insert_event() returns.
 */
static inline int fsnotify_add_event(struct fsnotify_group *group,
                                     struct fsnotify_event *event,
                                     int (*merge)(struct fsnotify_group *,
                                                  struct fsnotify_event *))
{
        int ret = fsnotify_insert_event(group, event, merge, NULL);

        return ret;
}

/*
 * Queue the group's own overflow event on its notification queue.
 * No merge callback is used and the result of the add is not reported.
 */
static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
{
        struct fsnotify_event *overflow = group->overflow_event;

        fsnotify_add_event(group, overflow, NULL);
}

/* True when @mask has the FS_Q_OVERFLOW bit set */
static inline bool fsnotify_is_overflow_event(u32 mask)
{
        return (mask & FS_Q_OVERFLOW) != 0;
}

/*
 * Report whether @group has no events on its notification list.
 * The caller must hold group->notification_lock, which is asserted here.
 *
 * The function is defined static inline in this header, so it needs no
 * separate extern prototype.
 */
static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
{
        assert_spin_locked(&group->notification_lock);

        return list_empty(&group->notification_list);
}

/* return, but do not dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
/* return AND dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
/* Remove event queued in the notification list */
extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
                                         struct fsnotify_event *event);

/* functions used to manipulate the marks attached to inodes */

/*
 * Canonical "ignore mask" including event flags.
 *
 * Note the subtle semantic difference from the legacy ->ignored_mask.
 * ->ignored_mask traditionally only meant which events should be ignored,
 * while ->ignore_mask also includes flags regarding the type of objects on
 * which events should be ignored.
 */
static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark)
{
        __u32 raw_mask = mark->ignore_mask;
        __u32 legacy_mask;

        /* Marks with explicit ignore flags use their ignore mask verbatim */
        if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
                return raw_mask;

        /*
         * Legacy behavior:
         * - FS_ISDIR is forced on, so events on directories are always ignored
         * - FS_EVENT_ON_CHILD is copied from the mark's mask, so events on
         *   children are ignored exactly when the parent watches children
         */
        legacy_mask = (raw_mask | FS_ISDIR) & ~FS_EVENT_ON_CHILD;

        return legacy_mask | (mark->mask & FS_EVENT_ON_CHILD);
}

/* Legacy ignored_mask - the ignore mask restricted to event type bits */
static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark)
{
        __u32 events = mark->ignore_mask;

        events &= ALL_FSNOTIFY_EVENTS;
        return events;
}

/*
 * Decide whether a mask (or ignore mask) applies to an event, given whether
 * the victim is a directory and whether the event is being reported to a
 * watching parent.
 */
static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir,
                                            int iter_type)
{
        /* A directory victim requires FS_ISDIR in the mask */
        bool dir_ok = !is_dir || (mask & FS_ISDIR);
        /* Reporting to a parent requires FS_EVENT_ON_CHILD in the mask */
        bool child_ok = iter_type != FSNOTIFY_ITER_TYPE_PARENT ||
                        (mask & FS_EVENT_ON_CHILD);

        return dir_ok && child_ok;
}

/*
 * Compute the events that @mark effectively ignores for one event, taking
 * into account whether the victim is a directory and whether the event is
 * reported to a watching parent.
 */
static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark,
                                                   bool is_dir, int iter_type)
{
        __u32 ignored_events = fsnotify_ignored_events(mark);
        __u32 flagged_mask;

        if (!ignored_events)
                return 0;

        /* The event flags only matter for a directory or a child event */
        if (is_dir || iter_type == FSNOTIFY_ITER_TYPE_PARENT) {
                flagged_mask = fsnotify_ignore_mask(mark);
                if (fsnotify_mask_applicable(flagged_mask, is_dir, iter_type))
                        return flagged_mask & ALL_FSNOTIFY_EVENTS;

                return 0;
        }

        return ignored_events;
}

/* Get mask for calculating object interest taking ignore mask into account */
static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
{
        __u32 interest = mark->mask;

        if (fsnotify_ignored_events(mark)) {
                /* Interest in FS_MODIFY may be needed for clearing ignore mask */
                if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                        interest |= FS_MODIFY;

                /*
                 * A mark that ignores events on children must make the object
                 * show interest in those events, otherwise fsnotify_parent()
                 * would not notice it.
                 */
                interest |= mark->ignore_mask;
        }

        return interest;
}

/* Get mask of events for a list of marks */
extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn);
/* Calculate mask of events for a list of marks */
extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
extern void fsnotify_init_mark(struct fsnotify_mark *mark,
                               struct fsnotify_group *group);
/* Find mark belonging to given group in the list of marks */
struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
                                         struct fsnotify_group *group);
/* attach the mark to the object */
int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
                      unsigned int obj_type, int add_flags);
int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj,
                             unsigned int obj_type, int add_flags);

/*
 * Attach @mark to @inode: fsnotify_add_mark() with the object type fixed to
 * FSNOTIFY_OBJ_TYPE_INODE. The result of fsnotify_add_mark() is passed through.
 */
static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                                          struct inode *inode,
                                          int add_flags)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_INODE;

        return fsnotify_add_mark(mark, inode, obj_type, add_flags);
}
/*
 * Attach @mark to @inode via fsnotify_add_mark_locked(), with the object type
 * fixed to FSNOTIFY_OBJ_TYPE_INODE. The callee's result is passed through.
 */
static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
                                                 struct inode *inode,
                                                 int add_flags)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_INODE;

        return fsnotify_add_mark_locked(mark, inode, obj_type, add_flags);
}

/*
 * Look up the mark belonging to @group on @inode: fsnotify_find_mark() with
 * the object type fixed to FSNOTIFY_OBJ_TYPE_INODE.
 */
static inline struct fsnotify_mark *fsnotify_find_inode_mark(
                                                struct inode *inode,
                                                struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_INODE;

        return fsnotify_find_mark(inode, obj_type, group);
}

/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                                  struct fsnotify_group *group);
/* detach mark from inode / mount list, group list, drop inode reference */
extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
/* free mark */
extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
/* Clear all of the marks of a group attached to a given object type */
extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                          unsigned int obj_type);
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);

/* Prepare @event for use by initialising its list linkage */
static inline void fsnotify_init_event(struct fsnotify_event *event)
{
        struct list_head *link = &event->list;

        INIT_LIST_HEAD(link);
}
int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
                         size_t count);

#else

/*
 * Stub compiled in the #else branch of the CONFIG_FSNOTIFY conditional
 * (presumably CONFIG_FSNOTIFY disabled): ignores its arguments and always
 * returns 0.
 */
static inline int fsnotify_pre_content(const struct path *path,
                                       const loff_t *ppos, size_t count)
{
        return 0;
}

/*
 * Stub compiled in the #else branch of the CONFIG_FSNOTIFY conditional
 * (presumably CONFIG_FSNOTIFY disabled): no event is generated and 0 is
 * always returned.
 */
static inline int fsnotify(__u32 mask, const void *data, int data_type,
                           struct inode *dir, const struct qstr *name,
                           struct inode *inode, u32 cookie)
{
        return 0;
}

/*
 * Stub compiled in the #else branch of the CONFIG_FSNOTIFY conditional
 * (presumably CONFIG_FSNOTIFY disabled): does nothing and always returns 0.
 */
static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        return 0;
}

/* No-op stub from the #else branch of the CONFIG_FSNOTIFY conditional */
static inline void __fsnotify_inode_delete(struct inode *inode)
{}

/* No-op stub from the #else branch of the CONFIG_FSNOTIFY conditional */
static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{}

/* No-op stub from the #else branch of the CONFIG_FSNOTIFY conditional */
static inline void fsnotify_sb_delete(struct super_block *sb)
{}

/* No-op stub from the #else branch of the CONFIG_FSNOTIFY conditional */
static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
{}

/* No-op stub from the #else branch of the CONFIG_FSNOTIFY conditional */
static inline void fsnotify_sb_free(struct super_block *sb)
{}

/* No-op stub from the #else branch of the CONFIG_FSNOTIFY conditional */
static inline void fsnotify_update_flags(struct dentry *dentry)
{}

/*
 * Stub from the #else branch of the CONFIG_FSNOTIFY conditional: every call
 * yields the same cookie value, 0.
 */
static inline u32 fsnotify_get_cookie(void)
{
        return 0;
}

/* No-op stub from the #else branch of the CONFIG_FSNOTIFY conditional */
static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}

/* No-op stub from the #else branch of the CONFIG_FSNOTIFY conditional */
static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt)
{}

#endif        /* CONFIG_FSNOTIFY */

#endif        /* __KERNEL__ */

#endif        /* __LINUX_FSNOTIFY_BACKEND_H */



























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_INETDEVICE_H
#define _LINUX_INETDEVICE_H

#ifdef __KERNEL__

#include <linux/bitmap.h>
#include <linux/if.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/timer.h>
#include <linux/sysctl.h>
#include <linux/rtnetlink.h>
#include <linux/refcount.h>

/*
 * One set of IPv4 device configuration values.
 *
 * The value of attribute IPV4_DEVCONF_<attr> lives in data[] at index
 * IPV4_DEVCONF_<attr> - 1 (see IPV4_DEVCONF() and ipv4_devconf_get()).
 */
struct ipv4_devconf {
        /* Opaque pointer; NOTE(review): presumably the sysctl table state - confirm */
        void        *sysctl;
        /* Attribute values, indexed by (attribute id - 1) */
        int        data[IPV4_DEVCONF_MAX];
        /* One bit per attribute; set by ipv4_devconf_set()/ipv4_devconf_setall() */
        DECLARE_BITMAP(state, IPV4_DEVCONF_MAX);
};

#define MC_HASH_SZ_LOG 9

/*
 * IPv4 state attached to a network device. It is reached from the
 * net_device through dev->ip_ptr (see __in_dev_get_rcu() and
 * __in_dev_get_rtnl()) and is reference counted through refcnt
 * (in_dev_hold(), in_dev_put()).
 */
struct in_device {
        /* Device this state belongs to; used e.g. for dev_net(in_dev->dev) */
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;

        /* Reference count; in_dev_put() calls in_dev_finish_destroy() at zero */
        refcount_t                refcnt;
        int                        dead;
        struct in_ifaddr        __rcu *ifa_list;/* IP ifaddr chain                */

        struct ip_mc_list __rcu        *mc_list;        /* IP multicast filter chain    */
        struct ip_mc_list __rcu        * __rcu *mc_hash;

        int                        mc_count;        /* Number of installed mcasts        */
        spinlock_t                mc_tomb_lock;
        struct ip_mc_list        *mc_tomb;
        unsigned long                mr_v1_seen;
        unsigned long                mr_v2_seen;
        unsigned long                mr_maxdelay;
        unsigned long                mr_qi;                /* Query Interval */
        unsigned long                mr_qri;                /* Query Response Interval */
        unsigned char                mr_qrv;                /* Query Robustness Variable */
        unsigned char                mr_gq_running;
        u32                        mr_ifc_count;
        struct timer_list        mr_gq_timer;        /* general query timer */
        struct timer_list        mr_ifc_timer;        /* interface change timer */

        /* Handed out by __in_dev_arp_parms_get_rcu() */
        struct neigh_parms        *arp_parms;
        /* Per-device configuration; accessed via ipv4_devconf_get()/_set() */
        struct ipv4_devconf        cnf;
        struct rcu_head                rcu_head;
};

/*
 * Raw accessors for a struct ipv4_devconf.
 *
 * IPV4_DEVCONF(cnf, attr)     - lvalue for attribute IPV4_DEVCONF_<attr>;
 *                               ids are 1-based, hence the "- 1".
 * IPV4_DEVCONF_RO(cnf, attr)  - same slot, read through READ_ONCE().
 * IPV4_DEVCONF_ALL(net, attr) - slot in the namespace-wide
 *                               net->ipv4.devconf_all settings.
 * IPV4_DEVCONF_ALL_RO(net, attr) - READ_ONCE() variant of the above.
 */
#define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1])
#define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr))
#define IPV4_DEVCONF_ALL(net, attr) \
        IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr)
#define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr))

/*
 * Read one per-device configuration value.
 * @in_dev: device state to read from
 * @index:  1-based attribute id (an IPV4_DEVCONF_* value)
 *
 * Returns the stored value, fetched with READ_ONCE().
 */
static inline int ipv4_devconf_get(const struct in_device *in_dev, int index)
{
        /* Attribute ids start at 1, the data[] slots at 0 */
        const int slot = index - 1;

        return READ_ONCE(in_dev->cnf.data[slot]);
}

/*
 * Store one per-device configuration value and flag it in the state bitmap.
 * @in_dev: device state to update
 * @index:  1-based attribute id (an IPV4_DEVCONF_* value)
 * @val:    new value, written with WRITE_ONCE()
 */
static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
                                    int val)
{
        /* Attribute ids start at 1, the data[]/state slots at 0 */
        const int slot = index - 1;

        set_bit(slot, in_dev->cnf.state);
        WRITE_ONCE(in_dev->cnf.data[slot], val);
}

/* Set every attribute's bit in the state bitmap of @in_dev's configuration */
static inline void ipv4_devconf_setall(struct in_device *in_dev)
{
        unsigned long *state_bits = in_dev->cnf.state;

        bitmap_fill(state_bits, IPV4_DEVCONF_MAX);
}

#define IN_DEV_CONF_GET(in_dev, attr) \
        ipv4_devconf_get((in_dev), IPV4_DEVCONF_ ## attr)
#define IN_DEV_CONF_SET(in_dev, attr, val) \
        ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val))

#define IN_DEV_ANDCONF(in_dev, attr) \
        (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \
         IN_DEV_CONF_GET((in_dev), attr))

#define IN_DEV_NET_ORCONF(in_dev, net, attr) \
        (IPV4_DEVCONF_ALL_RO(net, attr) || \
         IN_DEV_CONF_GET((in_dev), attr))

#define IN_DEV_ORCONF(in_dev, attr) \
        IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr)

#define IN_DEV_MAXCONF(in_dev, attr) \
        (max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \
             IN_DEV_CONF_GET((in_dev), attr)))

/*
 * Named accessors for individual attributes. Each one states how the
 * per-device value is combined with the namespace-wide "all" value:
 * IN_DEV_CONF_GET (device value only), IN_DEV_ANDCONF (both must be set),
 * IN_DEV_ORCONF (either may be set) or IN_DEV_MAXCONF (larger value wins).
 */
#define IN_DEV_FORWARD(in_dev)                IN_DEV_CONF_GET((in_dev), FORWARDING)
#define IN_DEV_MFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
#define IN_DEV_BFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
#define IN_DEV_RPFILTER(in_dev)                IN_DEV_MAXCONF((in_dev), RP_FILTER)
#define IN_DEV_SRC_VMARK(in_dev)            IN_DEV_ORCONF((in_dev), SRC_VMARK)
#define IN_DEV_SOURCE_ROUTE(in_dev)        IN_DEV_ANDCONF((in_dev), \
                                                       ACCEPT_SOURCE_ROUTE)
#define IN_DEV_ACCEPT_LOCAL(in_dev)        IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL)
#define IN_DEV_BOOTP_RELAY(in_dev)        IN_DEV_ANDCONF((in_dev), BOOTP_RELAY)

#define IN_DEV_LOG_MARTIANS(in_dev)        IN_DEV_ORCONF((in_dev), LOG_MARTIANS)
#define IN_DEV_PROXY_ARP(in_dev)        IN_DEV_ORCONF((in_dev), PROXY_ARP)
#define IN_DEV_PROXY_ARP_PVLAN(in_dev)        IN_DEV_ORCONF((in_dev), PROXY_ARP_PVLAN)
#define IN_DEV_SHARED_MEDIA(in_dev)        IN_DEV_ORCONF((in_dev), SHARED_MEDIA)
#define IN_DEV_TX_REDIRECTS(in_dev)        IN_DEV_ORCONF((in_dev), SEND_REDIRECTS)
#define IN_DEV_SEC_REDIRECTS(in_dev)        IN_DEV_ORCONF((in_dev), \
                                                      SECURE_REDIRECTS)
#define IN_DEV_IDTAG(in_dev)                IN_DEV_CONF_GET(in_dev, TAG)
#define IN_DEV_MEDIUM_ID(in_dev)        IN_DEV_CONF_GET(in_dev, MEDIUM_ID)
#define IN_DEV_PROMOTE_SECONDARIES(in_dev) \
                                        IN_DEV_ORCONF((in_dev), \
                                                      PROMOTE_SECONDARIES)
#define IN_DEV_ROUTE_LOCALNET(in_dev)        IN_DEV_ORCONF(in_dev, ROUTE_LOCALNET)
#define IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)        \
        IN_DEV_NET_ORCONF(in_dev, net, ROUTE_LOCALNET)

/*
 * Accepting redirects depends on forwarding: with forwarding on, both the
 * "all" and the device ACCEPT_REDIRECTS must be set; with forwarding off,
 * either one is enough.
 */
#define IN_DEV_RX_REDIRECTS(in_dev) \
        ((IN_DEV_FORWARD(in_dev) && \
          IN_DEV_ANDCONF((in_dev), ACCEPT_REDIRECTS)) \
         || (!IN_DEV_FORWARD(in_dev) && \
          IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS)))

#define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \
        IN_DEV_ORCONF((in_dev), IGNORE_ROUTES_WITH_LINKDOWN)

#define IN_DEV_ARPFILTER(in_dev)        IN_DEV_ORCONF((in_dev), ARPFILTER)
#define IN_DEV_ARP_ACCEPT(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_ACCEPT)
#define IN_DEV_ARP_ANNOUNCE(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
#define IN_DEV_ARP_IGNORE(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
#define IN_DEV_ARP_NOTIFY(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
#define IN_DEV_ARP_EVICT_NOCARRIER(in_dev) IN_DEV_ANDCONF((in_dev), \
                                                          ARP_EVICT_NOCARRIER)

/*
 * One IPv4 address configured on a device. Entries are chained from
 * in_device::ifa_list through ifa_next (see the in_dev_for_each_ifa_*()
 * iterators).
 */
struct in_ifaddr {
        struct hlist_node        addr_lst;
        /* Next address on the same device's chain */
        struct in_ifaddr        __rcu *ifa_next;
        struct in_device        *ifa_dev;
        struct rcu_head                rcu_head;
        __be32                        ifa_local;
        /* Compared under ifa_mask by inet_ifa_match() */
        __be32                        ifa_address;
        __be32                        ifa_mask;
        __u32                        ifa_rt_priority;
        __be32                        ifa_broadcast;
        unsigned char                ifa_scope;
        unsigned char                ifa_prefixlen;
        unsigned char                ifa_proto;
        __u32                        ifa_flags;
        char                        ifa_label[IFNAMSIZ];

        /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
        __u32                        ifa_valid_lft;
        __u32                        ifa_preferred_lft;
        unsigned long                ifa_cstamp; /* created timestamp */
        unsigned long                ifa_tstamp; /* updated timestamp */
};

/*
 * Bundle of an address, a device and an extended-ack pointer.
 * NOTE(review): presumably the payload handed to the notifiers registered
 * with register_inetaddr_validator_notifier() - confirm against the callers.
 */
struct in_validator_info {
        __be32                        ivi_addr;
        struct in_device        *ivi_dev;
        struct netlink_ext_ack        *extack;
};

int register_inetaddr_notifier(struct notifier_block *nb);
int unregister_inetaddr_notifier(struct notifier_block *nb);
int register_inetaddr_validator_notifier(struct notifier_block *nb);
int unregister_inetaddr_validator_notifier(struct notifier_block *nb);

void inet_netconf_notify_devconf(struct net *net, int event, int type,
                                 int ifindex, struct ipv4_devconf *devconf);

struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref);
/*
 * Convenience form of __ip_dev_find() that always passes devref = true.
 * Returns whatever __ip_dev_find() returns for @net and @addr.
 */
static inline struct net_device *ip_dev_find(struct net *net, __be32 addr)
{
        const bool devref = true;

        return __ip_dev_find(net, addr, devref);
}

int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b);
int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *);
#ifdef CONFIG_INET
int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size);
#else
/* Stub used when CONFIG_INET is not defined: ignores its arguments, returns 0 */
static inline int inet_gifconf(struct net_device *dev, char __user *buf,
                               int len, int size)
{
        return 0;
}
#endif
void devinet_init(void);
struct in_device *inetdev_by_index(struct net *, int);
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope);
__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst,
                         __be32 local, int scope);
struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
                                    __be32 mask);
struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
/*
 * Does @addr agree with @ifa's address on every bit selected by @ifa's mask?
 * Returns true when it does.
 */
static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa)
{
        /* Bits in which the two addresses differ */
        const __be32 differing = addr ^ ifa->ifa_address;

        return (differing & ifa->ifa_mask) == 0;
}

/*
 *        Check if a mask is acceptable.
 *
 *        Returns true (mask is bad) when @addr has bits set outside @mask,
 *        or when the bits not covered by @mask do not form a contiguous
 *        run of low-order bits in host byte order.
 */

static __inline__ bool bad_mask(__be32 mask, __be32 addr)
{
        const __be32 host_part = ~mask;
        const __u32 host_bits = ntohl(host_part);

        /* The address must not reach into the part the mask leaves out */
        if (addr & host_part)
                return true;

        /* A contiguous low run has the form 2^n - 1, so x & (x + 1) == 0 */
        return (host_bits & (host_bits + 1)) != 0;
}

/*
 * Iterators over the address chain of @in_dev; @ifa is the cursor and is
 * NULL once the loop ends. The three variants differ only in the helper
 * used to load each pointer.
 */

/* Loads pointers with rtnl_dereference() */
#define in_dev_for_each_ifa_rtnl(ifa, in_dev)                        \
        for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa;        \
             ifa = rtnl_dereference(ifa->ifa_next))

/* Loads pointers with rtnl_net_dereference() for namespace @net */
#define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev)                        \
        for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa;        \
             ifa = rtnl_net_dereference(net, ifa->ifa_next))

/* Loads pointers with rcu_dereference() */
#define in_dev_for_each_ifa_rcu(ifa, in_dev)                        \
        for (ifa = rcu_dereference((in_dev)->ifa_list); ifa;        \
             ifa = rcu_dereference(ifa->ifa_next))

/*
 * Fetch the IPv4 state of @dev with rcu_dereference(). No reference is
 * taken here; the result may be NULL (callers in this file check for it).
 */
static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
{
        struct in_device *in_dev = rcu_dereference(dev->ip_ptr);

        return in_dev;
}

static inline struct in_device *in_dev_get(const struct net_device *dev)
{
        struct in_device *in_dev;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(dev);
        if (in_dev)
                refcount_inc(&in_dev->refcnt);
        rcu_read_unlock();
        return in_dev;
}

/* Fetch the IPv4 state of @dev with rtnl_dereference(); no reference is taken */
static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev)
{
        struct in_device *in_dev = rtnl_dereference(dev->ip_ptr);

        return in_dev;
}

/*
 * Fetch the IPv4 state of @dev with rtnl_net_dereference(), using the
 * device's own namespace (dev_net(dev)); no reference is taken.
 */
static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev)
{
        struct in_device *in_dev;

        in_dev = rtnl_net_dereference(dev_net(dev), dev->ip_ptr);
        return in_dev;
}

/*
 * Should routes through @dev be ignored while its link is down?
 *
 * Returns true only if @dev has IPv4 state and
 * IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN() holds for it.
 *
 * Called with rcu_read_lock or rtnl held.
 */
static inline bool ip_ignore_linkdown(const struct net_device *dev)
{
        struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);

        if (!in_dev)
                return false;

        return IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) ? true : false;
}

/*
 * Return the arp_parms of @dev's IPv4 state, looked up with
 * __in_dev_get_rcu(), or NULL when @dev has no IPv4 state.
 */
static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev)
{
        struct in_device *idev = __in_dev_get_rcu(dev);

        if (!idev)
                return NULL;

        return idev->arp_parms;
}

void in_dev_finish_destroy(struct in_device *idev);

/*
 * Drop one reference on @idev. When this was the last one,
 * in_dev_finish_destroy() is called on it.
 */
static inline void in_dev_put(struct in_device *idev)
{
        /* Other references remain: nothing more to do */
        if (!refcount_dec_and_test(&idev->refcnt))
                return;

        in_dev_finish_destroy(idev);
}

/*
 * __in_dev_put() only decrements the refcount and, unlike in_dev_put(),
 * never calls in_dev_finish_destroy(). in_dev_hold() takes a reference.
 */
#define __in_dev_put(idev)  refcount_dec(&(idev)->refcnt)
#define in_dev_hold(idev)   refcount_inc(&(idev)->refcnt)

#endif /* __KERNEL__ */

/*
 * Build the network-byte-order netmask for a prefix length.
 * @logmask: prefix length; 0 yields an all-zero mask
 *
 * Returns the mask with the top @logmask bits set.
 */
static __inline__ __be32 inet_make_mask(int logmask)
{
        /* Handled apart: the shift below would be by 32 bits for a /0 */
        if (!logmask)
                return 0;

        /* Low (32 - logmask) bits are the host part; the mask is its complement */
        return htonl(~((1U << (32 - logmask)) - 1));
}

/*
 * Prefix length of a network-byte-order netmask.
 *
 * Returns 0 for an all-zero mask, otherwise 32 minus the position of the
 * lowest set bit of the mask in host byte order (found as the first zero
 * bit of its complement).
 */
static __inline__ int inet_mask_len(__be32 mask)
{
        const __u32 host_mask = ntohl(mask);

        if (host_mask == 0)
                return 0;

        return 32 - ffz(~host_mask);
}


#endif /* _LINUX_INETDEVICE_H */


































































   13 


















    8 


   13 



















   12 













































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal schedule timeout and sleeping functions
 */

#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>

#include "tick-internal.h"

/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 */
struct process_timer {
        /* Timer armed by schedule_timeout(); its callback is process_timeout() */
        struct timer_list timer;
        /* Task that process_timeout() wakes up; set to current by schedule_timeout() */
        struct task_struct *task;
};

static void process_timeout(struct timer_list *t)
{
        struct process_timer *timeout = timer_container_of(timeout, t, timer);

        wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns: 0 when the timer has expired otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
        struct process_timer timer;
        unsigned long deadline;

        /*
         * Unbounded sleep: no timer is armed, the task just waits to be
         * woken. This is a convenience for callers; @timeout is handed back
         * through the common exit so the result stays a valid offset (>= 0).
         */
        if (timeout == MAX_SCHEDULE_TIMEOUT) {
                schedule();
                goto out;
        }

        /*
         * Paranoia: a negative timeout should never be passed. Nobody is
         * expected to test for a negative return value, so report the
         * problem, leave the task runnable and return 0.
         */
        if (timeout < 0) {
                pr_err("%s: wrong timeout value %lx\n", __func__, timeout);
                dump_stack();
                __set_current_state(TASK_RUNNING);
                goto out;
        }

        deadline = timeout + jiffies;

        /* Arm an on-stack timer that wakes this task when the deadline hits */
        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
        timer.timer.expires = deadline;
        add_timer(&timer.timer);

        schedule();

        /* Make sure the timer is gone before its stack storage is */
        timer_delete_sync(&timer.timer);

        /* Remove the timer from the object tracker */
        timer_destroy_on_stack(&timer.timer);

        /* Jiffies left until the deadline; negative once it has passed */
        timeout = deadline - jiffies;

 out:
        if (timeout < 0)
                return 0;
        return timeout;
}
EXPORT_SYMBOL(schedule_timeout);

/*
 * __set_current_state() can be used in schedule_timeout_*() functions, because
 * schedule_timeout() calls schedule() unconditionally.
 */

/**
 * schedule_timeout_interruptible - sleep until timeout (interruptible)
 * @timeout: timeout value in jiffies
 *
 * See schedule_timeout() for details.
 *
 * Task state is set to TASK_INTERRUPTIBLE before starting the timeout.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
        signed long remaining;

        /* Enter TASK_INTERRUPTIBLE, then let schedule_timeout() do the sleep */
        __set_current_state(TASK_INTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

/**
 * schedule_timeout_killable - sleep until timeout (killable)
 * @timeout: timeout value in jiffies
 *
 * See schedule_timeout() for details.
 *
 * Task state is set to TASK_KILLABLE before starting the timeout.
 */
signed long __sched schedule_timeout_killable(signed long timeout)
{
        signed long remaining;

        /* Enter TASK_KILLABLE, then let schedule_timeout() do the sleep */
        __set_current_state(TASK_KILLABLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_killable);

/**
 * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible)
 * @timeout: timeout value in jiffies
 *
 * See schedule_timeout() for details.
 *
 * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout.
 */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
        signed long remaining;

        /* Enter TASK_UNINTERRUPTIBLE, then let schedule_timeout() do the sleep */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/**
 * schedule_timeout_idle - sleep until timeout (idle)
 * @timeout: timeout value in jiffies
 *
 * See schedule_timeout() for details.
 *
 * Task state is set to TASK_IDLE before starting the timeout. It is similar to
 * schedule_timeout_uninterruptible(), except this task will not contribute to
 * load average.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
        signed long remaining;

        /* Enter TASK_IDLE, then let schedule_timeout() do the sleep */
        __set_current_state(TASK_IDLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_idle);

/**
 * schedule_hrtimeout_range_clock - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @delta:        slack in expires timeout (ktime_t)
 * @mode:        timer mode
 * @clock_id:        timer clock to be used
 *
 * Details are explained in schedule_hrtimeout_range() function description as
 * this function is commonly used.
 */
int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
                                           const enum hrtimer_mode mode, clockid_t clock_id)
{
        struct hrtimer_sleeper sleeper;

        /* No expiry at all means "infinite": sleep without arming a timer */
        if (!expires) {
                schedule();
                return -EINTR;
        }

        /*
         * A zero timeout needs no timer either, and it makes no difference
         * whether it was meant as absolute or relative time: just leave the
         * task runnable.
         */
        if (*expires == 0) {
                __set_current_state(TASK_RUNNING);
                return 0;
        }

        hrtimer_setup_sleeper_on_stack(&sleeper, clock_id, mode);
        hrtimer_set_expires_range_ns(&sleeper.timer, *expires, delta);
        hrtimer_sleeper_start_expires(&sleeper, mode);

        /*
         * Only sleep while the sleeper still has a task attached.
         * NOTE(review): presumably the sleeper's expiry callback clears
         * ->task - not visible in this file, confirm.
         */
        if (likely(sleeper.task))
                schedule();

        hrtimer_cancel(&sleeper.timer);
        destroy_hrtimer_on_stack(&sleeper.timer);

        __set_current_state(TASK_RUNNING);

        /* A cleared task pointer is reported as "timer expired" (0) */
        return sleeper.task ? -EINTR : 0;
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);

/**
 * schedule_hrtimeout_range - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @delta:        slack in expires timeout (ktime_t)
 * @mode:        timer mode
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * The @delta argument gives the kernel the freedom to schedule the
 * actual wakeup to a time that is both power and performance friendly
 * for regular (non RT/DL) tasks.
 * The kernel gives the normal best-effort behavior for "@expires+@delta",
 * and may decide to fire the timer earlier, but no earlier than @expires.
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * Returns: 0 when the timer has expired. If the task was woken before the
 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
 * by an explicit wakeup, it returns -EINTR.
 */
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
                                     const enum hrtimer_mode mode)
{
        /* Same as the _clock variant, with the clock fixed to CLOCK_MONOTONIC */
        const clockid_t clock_id = CLOCK_MONOTONIC;

        return schedule_hrtimeout_range_clock(expires, delta, mode, clock_id);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);

/**
 * schedule_hrtimeout - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @mode:        timer mode
 *
 * See schedule_hrtimeout_range() for details. @delta argument of
 * schedule_hrtimeout_range() is set to 0 and has therefore no impact.
 */
int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode)
{
        /* No slack: the timer range collapses to the exact expiry time */
        const u64 no_slack = 0;

        return schedule_hrtimeout_range(expires, no_slack, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs:        Requested sleep duration in milliseconds
 *
 * msleep() uses jiffy based timeouts for the sleep duration. Because of the
 * design of the timer wheel, the maximum additional percentage delay (slack) is
 * 12.5%. This is only valid for timers which will end up in level 1 or a higher
 * level of the timer wheel. For explanation of those 12.5% please check the
 * detailed description about the basics of the timer wheel.
 *
 * The slack of timers which will end up in level 0 depends on sleep duration
 * (msecs) and HZ configuration and can be calculated in the following way (with
 * the timer wheel design restriction that the slack is not less than 12.5%):
 *
 *   ``slack = MSECS_PER_TICK / msecs``
 *
 * When the allowed slack of the callsite is known, the calculation could be
 * turned around to find the minimal allowed sleep duration to meet the
 * constraints. For example:
 *
 * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``:
 *   all sleep durations greater or equal 4ms will meet the constraints.
 * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``:
 *   all sleep durations greater or equal 8ms will meet the constraints.
 * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``:
 *   all sleep durations greater or equal 16ms will meet the constraints.
 * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``:
 *   all sleep durations greater or equal 32ms will meet the constraints.
 *
 * See also the signal aware variant msleep_interruptible().
 */
void msleep(unsigned int msecs)
{
        unsigned long remaining;

        /*
         * An early wakeup makes schedule_timeout_uninterruptible() return
         * the jiffies still left; keep sleeping until none remain.
         */
        for (remaining = msecs_to_jiffies(msecs); remaining != 0; )
                remaining = schedule_timeout_uninterruptible(remaining);
}
EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs:        Requested sleep duration in milliseconds
 *
 * See msleep() for some basic information.
 *
 * The difference between msleep() and msleep_interruptible() is that the sleep
 * could be interrupted by a signal delivery and then returns early.
 *
 * Returns: The remaining time of the sleep duration transformed to msecs (see
 * schedule_timeout() for details).
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
        unsigned long remaining = msecs_to_jiffies(msecs);

        for (;;) {
                /* Done when the time is used up or a signal is pending */
                if (!remaining || signal_pending(current))
                        break;
                remaining = schedule_timeout_interruptible(remaining);
        }

        /* Report what is left of the requested sleep, back in milliseconds */
        return jiffies_to_msecs(remaining);
}
EXPORT_SYMBOL(msleep_interruptible);

/**
 * usleep_range_state - Sleep for an approximate time in a given state
 * @min:        Minimum time in usecs to sleep
 * @max:        Maximum time in usecs to sleep
 * @state:        State the current task will be in while sleeping
 *
 * usleep_range_state() sleeps at least for the minimum specified time but not
 * longer than the maximum specified amount of time. The range might reduce
 * power usage by allowing hrtimers to coalesce an already scheduled interrupt
 * with this hrtimer. In the worst case, an interrupt is scheduled for the upper
 * bound.
 *
 * The sleeping task is set to the specified state before starting the sleep.
 *
 * In non-atomic context where the exact wakeup time is flexible, use
 * usleep_range() or its variants instead of udelay(). The sleep improves
 * responsiveness by avoiding the CPU-hogging busy-wait of udelay().
 */
void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state)
{
        ktime_t exp = ktime_add_us(ktime_get(), min);
        u64 delta = (u64)(max - min) * NSEC_PER_USEC;

        if (WARN_ON_ONCE(max < min))
                delta = 0;

        for (;;) {
                __set_current_state(state);
                /* Do not return before the requested sleep time has elapsed */
                if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
                        break;
        }
}
EXPORT_SYMBOL(usleep_range_state);




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2020 ARM Ltd.
 */
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H

#ifndef __ASSEMBLER__

/*
 * PAUSE is a good thing to insert into busy-wait loops.
 *
 * Emits a single "pause" instruction. The "memory" clobber tells the compiler
 * that memory may have changed, so values are not kept cached in registers
 * across the call.
 */
static __always_inline void native_pause(void)
{
        asm volatile("pause" ::: "memory");
}

/* cpu_relax() is implemented here as a plain call to native_pause(). */
static __always_inline void cpu_relax(void)
{
        native_pause();
}

struct getcpu_cache;

notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);

#endif /* __ASSEMBLER__ */

#endif /* __ASM_VDSO_PROCESSOR_H */















































































































































































    6 







    6 






    6 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: low-level thread information
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds and Dave Miller
 */

#ifndef _ASM_X86_THREAD_INFO_H
#define _ASM_X86_THREAD_INFO_H

#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/percpu.h>
#include <asm/types.h>

/*
 * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
 * reserve at the top of the kernel stack.  We do it because of a nasty
 * 32-bit corner case.  On x86_32, the hardware stack frame is
 * variable-length.  Except for vm86 mode, struct pt_regs assumes a
 * maximum-length frame.  If we enter from CPL 0, the top 8 bytes of
 * pt_regs don't actually exist.  Ordinarily this doesn't matter, but it
 * does in at least one case:
 *
 * If we take an NMI early enough in SYSENTER, then we can end up with
 * pt_regs that extends above sp0.  On the way out, in the espfix code,
 * we can read the saved SS value, but that value will be above sp0.
 * Without this offset, that can result in a page fault.  (We are
 * careful that, in this case, the value we read doesn't matter.)
 *
 * In vm86 mode, the hardware frame is much longer still, so add 16
 * bytes to make room for the real-mode segments.
 *
 * x86-64 has a fixed-length stack frame, but it depends on whether
 * or not FRED is enabled. Future versions of FRED might make this
 * dynamic, but for now it is always 2 words longer.
 */
#ifdef CONFIG_X86_32
# ifdef CONFIG_VM86
#  define TOP_OF_KERNEL_STACK_PADDING 16
# else
#  define TOP_OF_KERNEL_STACK_PADDING 8
# endif
#else /* x86-64 */
# ifdef CONFIG_X86_FRED
#  define TOP_OF_KERNEL_STACK_PADDING (2 * 8)
# else
#  define TOP_OF_KERNEL_STACK_PADDING 0
# endif
#endif

/*
 * low level task data that entry.S needs immediate access to
 * - this struct should fit entirely inside of one cache line
 * - this struct shares the supervisor stack pages
 */
#ifndef __ASSEMBLER__
struct task_struct;
#include <asm/cpufeature.h>
#include <linux/atomic.h>

/*
 * Per-thread low-level state: two flag words, a status word and, on
 * CONFIG_SMP builds only, a CPU number.
 */
struct thread_info {
        unsigned long                flags;                /* low level flags */
        unsigned long                syscall_work;        /* SYSCALL_WORK_ flags */
        u32                        status;                /* thread synchronous flags */
#ifdef CONFIG_SMP
        u32                        cpu;                /* current CPU */
#endif
};

/*
 * Static initializer for a struct thread_info. Only .flags is set explicitly
 * (to 0); the @tsk argument is not used by the expansion.
 */
#define INIT_THREAD_INFO(tsk)                        \
{                                                \
        .flags                = 0,                        \
}

#else /* !__ASSEMBLER__ */

#include <asm/asm-offsets.h>

#endif

/*
 * Tell the generic TIF infrastructure which bits x86 supports
 */
#define HAVE_TIF_NEED_RESCHED_LAZY
#define HAVE_TIF_POLLING_NRFLAG
#define HAVE_TIF_SINGLESTEP

#include <asm-generic/thread_info_tif.h>

/* Architecture specific TIF space starts at 16 */
#define TIF_SSBD                16        /* Speculative store bypass disable */
#define TIF_SPEC_IB                17        /* Indirect branch speculation mitigation */
#define TIF_SPEC_L1D_FLUSH        18        /* Flush L1D on mm switches (processes) */
#define TIF_NEED_FPU_LOAD        19        /* load FPU on return to userspace */
#define TIF_NOCPUID                20        /* CPUID is not accessible in userland */
#define TIF_NOTSC                21        /* TSC is not accessible in userland */
#define TIF_IO_BITMAP                22        /* uses I/O bitmap */
#define TIF_SPEC_FORCE_UPDATE        23        /* Force speculation MSR update in context switch */
#define TIF_FORCED_TF                24        /* true if TF in eflags artificially */
#define TIF_SINGLESTEP                25        /* reenable singlestep on user return*/
#define TIF_BLOCKSTEP                26        /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES        27        /* task is updating the mmu lazily */
#define TIF_ADDR32                28        /* 32-bit address space on 64 bits */

#define _TIF_SSBD                BIT(TIF_SSBD)
#define _TIF_SPEC_IB                BIT(TIF_SPEC_IB)
#define _TIF_SPEC_L1D_FLUSH        BIT(TIF_SPEC_L1D_FLUSH)
#define _TIF_NEED_FPU_LOAD        BIT(TIF_NEED_FPU_LOAD)
#define _TIF_NOCPUID                BIT(TIF_NOCPUID)
#define _TIF_NOTSC                BIT(TIF_NOTSC)
#define _TIF_IO_BITMAP                BIT(TIF_IO_BITMAP)
#define _TIF_SPEC_FORCE_UPDATE        BIT(TIF_SPEC_FORCE_UPDATE)
#define _TIF_FORCED_TF                BIT(TIF_FORCED_TF)
#define _TIF_BLOCKSTEP                BIT(TIF_BLOCKSTEP)
#define _TIF_SINGLESTEP                BIT(TIF_SINGLESTEP)
#define _TIF_LAZY_MMU_UPDATES        BIT(TIF_LAZY_MMU_UPDATES)
#define _TIF_ADDR32                BIT(TIF_ADDR32)

/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE                                        \
        (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP |                \
         _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)

/*
 * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
 */
#ifdef CONFIG_SMP
# define _TIF_WORK_CTXSW        (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
#else
# define _TIF_WORK_CTXSW        (_TIF_WORK_CTXSW_BASE)
#endif

#ifdef CONFIG_X86_IOPL_IOPERM
# define _TIF_WORK_CTXSW_PREV        (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY | \
                                 _TIF_IO_BITMAP)
#else
# define _TIF_WORK_CTXSW_PREV        (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY)
#endif

#define _TIF_WORK_CTXSW_NEXT        (_TIF_WORK_CTXSW)

#define STACK_WARN                (THREAD_SIZE/8)

/*
 * macros/functions for gaining access to the thread information structure
 *
 * preempt_count needs to be 1 initially, until the scheduler is functional.
 */
#ifndef __ASSEMBLER__

/*
 * Follow the chain of saved frame pointers to decide whether the object
 * [obj, obj + len) is entirely contained by a single stack frame.
 *
 * Returns:
 *        GOOD_FRAME        if within a frame
 *        BAD_STACK        if placed across a frame boundary (or outside stack)
 *        NOT_STACK        unable to determine (no frame pointers, etc)
 *
 * The walk reads pointers from the stack and dereferences them. Those
 * pointers may not have their KMSAN shadow set up properly, which may result
 * in false positive reports, so instrumentation is disabled here.
 */
__no_kmsan_checks
static inline int arch_within_stack_frames(const void * const stack,
                                           const void * const stackend,
                                           const void *obj, unsigned long len)
{
#if defined(CONFIG_FRAME_POINTER)
        const void *prev = __builtin_frame_address(1);
        const void *cur = prev ? __builtin_frame_address(2) : NULL;

        /*
         * low ----------------------------------------------> high
         * [saved bp][saved ip][args][local vars][saved bp][saved ip]
         *                     ^----------------^
         *               allow copies only within here
         */
        for (; stack <= cur && cur < stackend;
             prev = cur, cur = *(const void * const *)cur) {
                /*
                 * An object reaching beyond this frame pointer is retried
                 * against the next frame. If it extends past the last frame
                 * the next frame will be 0, the loop ends and the copy is
                 * correctly reported as invalid.
                 */
                if (obj + len > cur)
                        continue;

                /* Skip the saved bp and saved ip at the bottom of the frame. */
                if (obj >= prev + 2 * sizeof(void *))
                        return GOOD_FRAME;
                return BAD_STACK;
        }
        return BAD_STACK;
#else
        return NOT_STACK;
#endif
}

#endif  /* !__ASSEMBLER__ */

/*
 * Thread-synchronous status.
 *
 * This is different from the flags in that nobody else
 * ever touches our thread-synchronous status, so we don't
 * have to worry about atomic accesses.
 */
#define TS_COMPAT                0x0002        /* 32bit syscall active (64BIT)*/

#ifndef __ASSEMBLER__
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED        0x0004        /* regs poked by 32-bit ptracer */

/*
 * Store the current thread's status word in @restart->arch_data.
 *
 * @restart is parenthesized in the expansion so that any pointer expression
 * (for example "base + 1") can be passed without operator-precedence
 * surprises.
 */
#define arch_set_restart_data(restart)        \
        do { (restart)->arch_data = current_thread_info()->status; } while (0)

#endif

#ifdef CONFIG_X86_32
#define in_ia32_syscall() true
#else
#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
                           current_thread_info()->status & TS_COMPAT)
#endif

extern void arch_setup_new_exec(void);
#define arch_setup_new_exec arch_setup_new_exec
#endif        /* !__ASSEMBLER__ */

#endif /* _ASM_X86_THREAD_INFO_H */






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints,  longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
 * were actively bad for hashing), but the name remains.
 */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#else
#error Wordsize not 32 or 64
#endif

/*
 * This hash multiplies the input by a large odd number and takes the
 * high bits.  Since multiplication propagates changes to the most
 * significant end only, it is essential that the high bits of the
 * product be used for the hash value.
 *
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
 *
 * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
 * which is very slightly easier to multiply by and makes no
 * difference to the hash distribution.
 */
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif

/*
 * The _generic versions exist only so lib/test_hash.c can compare
 * the arch-optimized versions with the generic.
 *
 * Note that if you change these, any <asm/hash.h> that aren't updated
 * to match need to have their HAVE_ARCH_* define values updated so the
 * self-test will not false-positive.
 */
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
/* Multiply @val by GOLDEN_RATIO_32; the result is truncated to 32 bits. */
static inline u32 __hash_32_generic(u32 val)
{
        return val * GOLDEN_RATIO_32;
}

/* Hash @val with __hash_32() and keep only the top @bits bits of the result. */
static inline u32 hash_32(u32 val, unsigned int bits)
{
        /* High bits are more random, so shift the low ones out. */
        const unsigned int discard = 32 - bits;

        return __hash_32(val) >> discard;
}

#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
/* Reduce the 64-bit @val to a hash value of @bits bits. */
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
        /* 64x64-bit multiply is efficient on all 64-bit processors */
        u64 product = val * GOLDEN_RATIO_64;

        return product >> (64 - bits);
#else
        /* Hash 64 bits using only 32x32-bit multiply. */
        u32 folded = (u32)val ^ __hash_32(val >> 32);

        return hash_32(folded, bits);
#endif
}

/* Hash a pointer: cast it to unsigned long and pass it to hash_long(). */
static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
        return hash_long((unsigned long)ptr, bits);
}

/* This really should be called fold32_ptr; it does no hashing to speak of. */
static inline u32 hash32_ptr(const void *ptr)
{
        unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
        val ^= (val >> 32);
#endif
        return (u32)val;
}

#endif /* _LINUX_HASH_H */




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_COOKIE_H
#define __LINUX_COOKIE_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <asm/local.h>

/*
 * Per-CPU cookie state used by gen_cookie_next():
 * - nesting is incremented on entry and decremented on exit of
 *   gen_cookie_next(), so a value other than 1 after the increment means the
 *   call is nested on this CPU;
 * - last is the most recent value handed out from this CPU's local sequence.
 */
struct pcpu_gen_cookie {
        local_t nesting;
        u64 last;
} __aligned(16);

/*
 * Cookie generator state:
 * - local points at the per-CPU struct pcpu_gen_cookie instances;
 * - forward_last is advanced by COOKIE_LOCAL_BATCH in gen_cookie_next() when
 *   a CPU needs a new batch of values;
 * - reverse_last is decremented by gen_cookie_next() for nested calls.
 */
struct gen_cookie {
        struct pcpu_gen_cookie __percpu *local;
        atomic64_t forward_last ____cacheline_aligned_in_smp;
        atomic64_t reverse_last;
};

#define COOKIE_LOCAL_BATCH        4096

/*
 * Define a static generator called @name together with its static per-CPU
 * state (named __<name>). Both atomic counters start at 0.
 */
#define DEFINE_COOKIE(name)                                                \
        static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name);        \
        static struct gen_cookie name = {                                \
                .local                = &__##name,                                \
                .forward_last        = ATOMIC64_INIT(0),                        \
                .reverse_last        = ATOMIC64_INIT(0),                        \
        }

/*
 * Return the next cookie value from @gc.
 *
 * A nested call on this CPU (nesting count other than 1 after the increment)
 * takes its value by decrementing gc->reverse_last. Otherwise the value comes
 * from the CPU-local sequence: when CONFIG_SMP is defined and the last local
 * value is a multiple of COOKIE_LOCAL_BATCH, a fresh batch is first reserved
 * by adding COOKIE_LOCAL_BATCH to gc->forward_last.
 */
static __always_inline u64 gen_cookie_next(struct gen_cookie *gc)
{
        struct pcpu_gen_cookie *pcpu = this_cpu_ptr(gc->local);
        u64 cookie;

        if (unlikely(local_inc_return(&pcpu->nesting) != 1)) {
                cookie = atomic64_dec_return(&gc->reverse_last);
        } else {
                cookie = pcpu->last;
                if (__is_defined(CONFIG_SMP) &&
                    unlikely((cookie & (COOKIE_LOCAL_BATCH - 1)) == 0)) {
                        /* Local batch used up: reserve the next one. */
                        s64 batch_end = atomic64_add_return(COOKIE_LOCAL_BATCH,
                                                            &gc->forward_last);

                        cookie = batch_end - COOKIE_LOCAL_BATCH;
                }
                cookie++;
                pcpu->last = cookie;
        }

        local_dec(&pcpu->nesting);
        return cookie;
}

#endif /* __LINUX_COOKIE_H */






















































































  316 


















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
/* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * SipHash: a fast short-input PRF
 * https://131002.net/siphash/
 *
 * This implementation is specifically for SipHash2-4 for a secure PRF
 * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
 * hashtables.
 */

#ifndef _LINUX_SIPHASH_H
#define _LINUX_SIPHASH_H

#include <linux/types.h>
#include <linux/kernel.h>

#define SIPHASH_ALIGNMENT __alignof__(u64)
typedef struct {
        u64 key[2];
} siphash_key_t;

#define siphash_aligned_key_t siphash_key_t __aligned(16)

static inline bool siphash_key_is_zero(const siphash_key_t *key)
{
        return !(key->key[0] | key->key[1]);
}

u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);

u64 siphash_1u64(const u64 a, const siphash_key_t *key);
u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
u64 siphash_3u64(const u64 a, const u64 b, const u64 c,
                 const siphash_key_t *key);
u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d,
                 const siphash_key_t *key);
u64 siphash_1u32(const u32 a, const siphash_key_t *key);
u64 siphash_3u32(const u32 a, const u32 b, const u32 c,
                 const siphash_key_t *key);

/* Pack @a (low half) and @b (high half) into one u64 and hash that word. */
static inline u64 siphash_2u32(const u32 a, const u32 b,
                               const siphash_key_t *key)
{
        const u64 packed = ((u64)b << 32) | a;

        return siphash_1u64(packed, key);
}
static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c,
                               const u32 d, const siphash_key_t *key)
{
        return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key);
}


/*
 * Hash @len bytes at @data. When @len is a compile-time constant of 4, 8, 16,
 * 24 or 32 the matching fixed-size helper is called on the little-endian
 * words converted to CPU order; every other case goes to
 * __siphash_aligned().
 */
static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
                                     const siphash_key_t *key)
{
        if (__builtin_constant_p(len)) {
                switch (len) {
                case 4:
                        return siphash_1u32(le32_to_cpup((const __le32 *)data),
                                            key);
                case 8:
                        return siphash_1u64(le64_to_cpu(data[0]), key);
                case 16:
                        return siphash_2u64(le64_to_cpu(data[0]),
                                            le64_to_cpu(data[1]), key);
                case 24:
                        return siphash_3u64(le64_to_cpu(data[0]),
                                            le64_to_cpu(data[1]),
                                            le64_to_cpu(data[2]), key);
                case 32:
                        return siphash_4u64(le64_to_cpu(data[0]),
                                            le64_to_cpu(data[1]),
                                            le64_to_cpu(data[2]),
                                            le64_to_cpu(data[3]), key);
                default:
                        break;
                }
        }
        return __siphash_aligned(data, len, key);
}

/**
 * siphash - compute 64-bit siphash PRF value
 * @data: buffer to hash
 * @len: size of @data
 * @key: the siphash key
 *
 * Uses __siphash_unaligned() when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
 * enabled or when @data is not aligned to SIPHASH_ALIGNMENT, and
 * ___siphash_aligned() otherwise.
 */
static inline u64 siphash(const void *data, size_t len,
                          const siphash_key_t *key)
{
        const bool take_unaligned =
                IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
                !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT);

        if (!take_unaligned)
                return ___siphash_aligned(data, len, key);
        return __siphash_unaligned(data, len, key);
}

#define HSIPHASH_ALIGNMENT __alignof__(unsigned long)
typedef struct {
        unsigned long key[2];
} hsiphash_key_t;

u32 __hsiphash_aligned(const void *data, size_t len,
                       const hsiphash_key_t *key);
u32 __hsiphash_unaligned(const void *data, size_t len,
                         const hsiphash_key_t *key);

u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c,
                  const hsiphash_key_t *key);
u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
                  const hsiphash_key_t *key);

/*
 * Hash @len bytes at @data. When @len is a compile-time constant of 4, 8, 12
 * or 16 the matching fixed-size helper is called on the little-endian words
 * converted to CPU order; every other case goes to __hsiphash_aligned().
 */
static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
                                      const hsiphash_key_t *key)
{
        if (__builtin_constant_p(len)) {
                switch (len) {
                case 4:
                        return hsiphash_1u32(le32_to_cpu(data[0]), key);
                case 8:
                        return hsiphash_2u32(le32_to_cpu(data[0]),
                                             le32_to_cpu(data[1]), key);
                case 12:
                        return hsiphash_3u32(le32_to_cpu(data[0]),
                                             le32_to_cpu(data[1]),
                                             le32_to_cpu(data[2]), key);
                case 16:
                        return hsiphash_4u32(le32_to_cpu(data[0]),
                                             le32_to_cpu(data[1]),
                                             le32_to_cpu(data[2]),
                                             le32_to_cpu(data[3]), key);
                default:
                        break;
                }
        }
        return __hsiphash_aligned(data, len, key);
}

/**
 * hsiphash - compute 32-bit hsiphash PRF value
 * @data: buffer to hash
 * @len: size of @data
 * @key: the hsiphash key
 *
 * Uses __hsiphash_unaligned() when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
 * enabled or when @data is not aligned to HSIPHASH_ALIGNMENT, and
 * ___hsiphash_aligned() otherwise.
 */
static inline u32 hsiphash(const void *data, size_t len,
                           const hsiphash_key_t *key)
{
        const bool take_unaligned =
                IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
                !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT);

        if (!take_unaligned)
                return ___hsiphash_aligned(data, len, key);
        return __hsiphash_unaligned(data, len, key);
}

/*
 * These macros expose the raw SipHash and HalfSipHash permutations.
 * Do not use them directly! If you think you have a use for them,
 * be sure to CC the maintainer of this file explaining why.
 */

#define SIPHASH_PERMUTATION(a, b, c, d) ( \
        (a) += (b), (b) = rol64((b), 13), (b) ^= (a), (a) = rol64((a), 32), \
        (c) += (d), (d) = rol64((d), 16), (d) ^= (c), \
        (a) += (d), (d) = rol64((d), 21), (d) ^= (a), \
        (c) += (b), (b) = rol64((b), 17), (b) ^= (c), (c) = rol64((c), 32))

#define SIPHASH_CONST_0 0x736f6d6570736575ULL
#define SIPHASH_CONST_1 0x646f72616e646f6dULL
#define SIPHASH_CONST_2 0x6c7967656e657261ULL
#define SIPHASH_CONST_3 0x7465646279746573ULL

#define HSIPHASH_PERMUTATION(a, b, c, d) ( \
        (a) += (b), (b) = rol32((b), 5), (b) ^= (a), (a) = rol32((a), 16), \
        (c) += (d), (d) = rol32((d), 8), (d) ^= (c), \
        (a) += (d), (d) = rol32((d), 7), (d) ^= (a), \
        (c) += (b), (b) = rol32((b), 13), (b) ^= (c), (c) = rol32((c), 16))

#define HSIPHASH_CONST_0 0U
#define HSIPHASH_CONST_1 0U
#define HSIPHASH_CONST_2 0x6c796765U
#define HSIPHASH_CONST_3 0x74656462U

#endif /* _LINUX_SIPHASH_H */


























































































































































































































































































    4 





    4 


    4 

    4 



    4 



    4 



    4 
















    4 



    4 

    4 
    4 
    4 





























































    4 




    4 
    4 

    4 


    4 
    4 



    4 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/pgtable-generic.c
 *
 *  Generic pgtable methods declared in linux/pgtable.h
 *
 *  Copyright (C) 2010  Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

/* Report the bad entry *@pgd through pgd_ERROR(), then clear it. */
void pgd_clear_bad(pgd_t *pgd)
{
        pgd_ERROR(*pgd);
        pgd_clear(pgd);
}

#ifndef __PAGETABLE_P4D_FOLDED
/* Report the bad entry *@p4d through p4d_ERROR(), then clear it. */
void p4d_clear_bad(p4d_t *p4d)
{
        p4d_ERROR(*p4d);
        p4d_clear(p4d);
}
#endif

#ifndef __PAGETABLE_PUD_FOLDED
/* Report the bad entry *@pud through pud_ERROR(), then clear it. */
void pud_clear_bad(pud_t *pud)
{
        pud_ERROR(*pud);
        pud_clear(pud);
}
#endif

/*
 * Note that the pmd variant below can't be stub'ed out just as for p4d/pud
 * above. pmd folding is special and typically pmd_* macros refer to upper
 * level even when folded
 */
/* Report the bad entry *@pmd through pmd_ERROR(), then clear it. */
void pmd_clear_bad(pmd_t *pmd)
{
        pmd_ERROR(*pmd);
        pmd_clear(pmd);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this.
 *
 * Returns 1 if the PTE at @ptep differed from @entry and was rewritten,
 * 0 if it was already the same. The result instructs the caller to do
 * things like update_mmu_cache(). This used to be done in the caller, but
 * sparc needs minor faults to force that call on sun4c, hence this form.
 *
 * @dirty is not used by this generic implementation.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        if (pte_same(ptep_get(ptep), entry))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, entry);
        flush_tlb_fix_spurious_fault(vma, address, ptep);
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Test-and-clear the young state of the PTE at @ptep and, if it was set,
 * flush the TLB entry for @address. Returns the result of
 * ptep_test_and_clear_young().
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        int young = ptep_test_and_clear_young(vma, address, ptep);

        if (!young)
                return 0;

        flush_tlb_page(vma, address);
        return young;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
/*
 * Clear the PTE at @ptep and return its previous value. The TLB entry for
 * @address is flushed only when pte_accessible() says the old PTE was
 * accessible.
 */
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
                       pte_t *ptep)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t old = ptep_get_and_clear(mm, address, ptep);

        if (pte_accessible(mm, old))
                flush_tlb_page(vma, address);

        return old;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
/*
 * PMD counterpart of ptep_set_access_flags(): install @entry at @pmdp when it
 * differs from the current value and flush the huge-page range starting at
 * @address. @address must be HPAGE_PMD_SIZE aligned. Returns 1 when the entry
 * was changed, 0 otherwise. @dirty is not used here.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        const int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (!changed)
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, entry);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
/*
 * Test-and-clear the young state of the PMD at @pmdp and, if it was set,
 * flush the huge-page range starting at @address. @address must be
 * HPAGE_PMD_SIZE aligned. Returns the result of pmdp_test_and_clear_young().
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (!young)
                return 0;

        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
/*
 * Clear the PMD at @pmdp, flush the huge-page range starting at @address and
 * return the previous PMD value. @address must be HPAGE_PMD_SIZE aligned, and
 * a present PMD is expected to be a transparent huge one.
 */
pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp)
{
        pmd_t old;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp));

        old = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Clear the PUD at @pudp, flush the huge-page range starting at @address and
 * return the previous PUD value. @address must be HPAGE_PUD_SIZE aligned and
 * the PUD is expected to be a transparent huge one.
 */
pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pud_t *pudp)
{
        pud_t old;

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);
        VM_BUG_ON(!pud_trans_huge(*pudp));

        old = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);

        return old;
}
#endif
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
/*
 * Deposit @pgtable for later withdrawal. The lock returned by pmd_lockptr()
 * must be held. If a page table is already deposited, @pgtable is linked
 * right after it on its lru list; otherwise @pgtable starts a new list. In
 * both cases @pgtable becomes the new pmd_huge_pte() value.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        assert_spin_locked(pmd_lockptr(mm, pmdp));

        if (pmd_huge_pte(mm, pmdp))
                list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
        else
                INIT_LIST_HEAD(&pgtable->lru);

        pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/*
 * Take back the page table currently stored in pmd_huge_pte(). The lock
 * returned by pmd_lockptr() must be held. The next entry on the withdrawn
 * table's lru list (or NULL if there is none) becomes the new pmd_huge_pte()
 * value; when one exists, the withdrawn table is unlinked from the list.
 *
 * There is no "address" argument, so this destroys page coloring on some
 * architectures.
 */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t withdrawn;
        pgtable_t next;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        withdrawn = pmd_huge_pte(mm, pmdp);
        next = list_first_entry_or_null(&withdrawn->lru, struct page, lru);
        pmd_huge_pte(mm, pmdp) = next;
        if (next)
                list_del(&withdrawn->lru);

        return withdrawn;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
/*
 * Replace the PMD at @pmdp with its pmd_mkinvalid() form, flush the huge-page
 * range starting at @address and return the value pmdp_establish() hands
 * back. Warns once if the PMD is not present on entry.
 */
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
{
        pmd_t previous;

        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        previous = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return previous;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
/*
 * Generic fallback: warns once if *@pmdp is not pmd_present() and then
 * forwards to pmdp_invalidate(), returning its result unchanged.
 */
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        pmd_t old;

        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        old = pmdp_invalidate(vma, address, pmdp);
        return old;
}
#endif

#ifndef pmdp_collapse_flush
/*
 * Clear the pmd entry at @pmdp and flush the TLB for the HPAGE_PMD_SIZE
 * range starting at @address.
 *
 * @address must have no bits set outside HPAGE_PMD_MASK and *@pmdp must NOT
 * satisfy pmd_trans_huge(); both are checked with VM_BUG_ON().
 *
 * Returns the value handed back by pmdp_huge_get_and_clear().
 */
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        pmd_t old_pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));

        /*
         * pmd and hugepage pte format are same. So we could
         * use the same function.
         */
        old_pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

        /* collapse entails shooting down ptes not pmd */
        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old_pmd;
}
#endif

/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
#ifndef pte_free_defer
/*
 * RCU callback: recover the struct page that embeds @head as its rcu_head
 * member and hand it to pte_free().
 */
static void pte_free_now(struct rcu_head *head)
{
        struct page *page = container_of(head, struct page, rcu_head);

        pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
}

/*
 * Queue @pgtable for freeing through call_rcu(), with pte_free_now() as the
 * callback.  @mm is not used by this generic implementation.
 */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
        struct page *page = pgtable;

        call_rcu(&page->rcu_head, pte_free_now);
}
#endif /* pte_free_defer */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
        (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
/*
 * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
 * the barriers in pmdp_get_lockless() cannot guarantee that the value in
 * pmd_high actually belongs with the value in pmd_low; but holding interrupts
 * off blocks the TLB flush between present updates, which guarantees that a
 * successful __pte_offset_map() points to a page from matched halves.
 */
/* Disable local interrupts and return the saved flags for the _end() call. */
static unsigned long pmdp_get_lockless_start(void)
{
        unsigned long flags;

        local_irq_save(flags);
        return flags;
}

/* Restore the interrupt state saved by pmdp_get_lockless_start(). */
static void pmdp_get_lockless_end(unsigned long irqflags)
{
        local_irq_restore(irqflags);
}
#else
/* No-op variants: nothing is saved, so the returned flags value is 0. */
static unsigned long pmdp_get_lockless_start(void)
{
        return 0;
}

static void pmdp_get_lockless_end(unsigned long irqflags)
{
}
#endif

/*
 * Read *@pmd locklessly and, if it refers to a page table, return the pte
 * pointer for @addr obtained from __pte_map().
 *
 * rcu_read_lock() is taken first.  On success it is still held when the
 * function returns; on every failure path it is dropped before returning
 * NULL.  When @pmdvalp is non-NULL the pmd value that was read is stored
 * there, on success and on failure alike.
 */
pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
        unsigned long flags;
        pmd_t entry;

        rcu_read_lock();

        flags = pmdp_get_lockless_start();
        entry = pmdp_get_lockless(pmd);
        pmdp_get_lockless_end(flags);

        if (pmdvalp)
                *pmdvalp = entry;

        /* Empty, migration-entry and huge pmds have no page table to map. */
        if (unlikely(pmd_none(entry) || is_pmd_migration_entry(entry) ||
                     pmd_trans_huge(entry)))
                goto nomap;

        if (unlikely(pmd_bad(entry))) {
                pmd_clear_bad(pmd);
                goto nomap;
        }

        return __pte_map(&entry, addr);

nomap:
        rcu_read_unlock();
        return NULL;
}

pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, spinlock_t **ptlp)
{
        pmd_t pmdval;
        pte_t *pte;

        pte = __pte_offset_map(pmd, addr, &pmdval);
        if (unlikely(!pte))
                return NULL;    /* *ptlp is left untouched on failure */

        /* Lock pointer derived from the pmd value that produced @pte. */
        *ptlp = pte_lockptr(mm, &pmdval);
        return pte;
}

pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, pmd_t *pmdvalp,
                                spinlock_t **ptlp)
{
        pte_t *pte;

        /* Callers are expected to always supply somewhere to store the pmd. */
        VM_WARN_ON_ONCE(!pmdvalp);

        pte = __pte_offset_map(pmd, addr, pmdvalp);
        if (unlikely(!pte))
                return NULL;    /* *ptlp is left untouched on failure */

        /* Lock pointer derived from the pmd value reported in *pmdvalp. */
        *ptlp = pte_lockptr(mm, pmdvalp);
        return pte;
}

/*
 * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
 * __pte_offset_map_lock() below, is usually called with the pmd pointer for
 * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
 * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
 * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
 * write). In a few cases, it may be used with pmd pointing to a pmd_t already
 * copied to or constructed on the stack.
 *
 * When successful, it returns the pte pointer for addr, with its page table
 * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
 * modification by software, with a pointer to that spinlock in ptlp (in some
 * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
 * struct page).  pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
 *
 * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
 * page table at *pmd: if, for example, the page table has just been removed,
 * or replaced by the huge pmd of a THP.  (When successful, *pmd is rechecked
 * after acquiring the ptlock, and retried internally if it changed: so that a
 * page table can be safely removed or replaced by THP while holding its lock.)
 *
 * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
 * just returns the pte pointer for addr, its page table kmapped if necessary;
 * or NULL if there is no page table at *pmd.  It does not attempt to lock the
 * page table, so cannot normally be used when the page table is to be updated,
 * or when entries read must be stable.  But it does take rcu_read_lock(): so
 * that even when page table is racily removed, it remains a valid though empty
 * and disconnected table.  Until pte_unmap(pte) unmaps and rcu_read_unlock()s
 * afterwards.
 *
 * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
 * but when successful, it also outputs a pointer to the spinlock in ptlp - as
 * pte_offset_map_lock() does, but in this case without locking it.  This helps
 * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
 * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock
 * pointer for the page table that it returns. Even after grabbing the spinlock,
 * we might be looking either at a page table that is still mapped or one that
 * was unmapped and is about to get freed. But for R/O access this is sufficient.
 * So it is only applicable for read-only cases where any modification operations
 * to the page table are not allowed even if the corresponding spinlock is held
 * afterwards.
 *
 * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
 * pte_offset_map_ro_nolock(); but when successful, it also outputs the pmdval.
 * It is applicable for may-write cases where any modification operations to the
 * page table may happen after the corresponding spinlock is held afterwards.
 * But the users should make sure the page table is stable like checking pte_same()
 * or checking pmd_same() by using the output pmdval before performing the write
 * operations.
 *
 * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will
 * be read-only/read-write protected.
 *
 * Note that free_pgtables(), used after unmapping detached vmas, or when
 * exiting the whole mm, does not take page table lock before freeing a page
 * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
 * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
 */
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr, spinlock_t **ptlp)
{
        spinlock_t *ptl;
        pmd_t pmdval;
        pte_t *pte;

        for (;;) {
                pte = __pte_offset_map(pmd, addr, &pmdval);
                if (unlikely(!pte))
                        return NULL;    /* *ptlp is left untouched */

                ptl = pte_lockptr(mm, &pmdval);
                spin_lock(ptl);

                /* Recheck *pmd under the lock; done if it did not change. */
                if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd))))
                        break;

                /* *pmd changed since it was read: drop everything, retry. */
                pte_unmap_unlock(pte, ptl);
        }

        *ptlp = ptl;
        return pte;
}









































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 *  cgroup interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <linux/notifier.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>

#include <linux/cgroup-defs.h>
#include <linux/cgroup_namespace.h>

struct kernel_clone_args;

/*
 * All weight knobs on the default hierarchy should use the following min,
 * default and max values.  The default value is the logarithmic center of
 * MIN and MAX and allows 100x to be expressed in both directions.
 */
#define CGROUP_WEIGHT_MIN                1
#define CGROUP_WEIGHT_DFL                100
#define CGROUP_WEIGHT_MAX                10000

#ifdef CONFIG_CGROUPS

/*
 * Bit flags for a css task iteration -- presumably the values passed as
 * @flags to css_task_iter_start() and kept in css_task_iter::flags (confirm
 * against the implementation).  CSS_TASK_ITER_SKIPPED sits at bit 16, apart
 * from the two low bits, and is marked as internal.
 */
enum css_task_iter_flags {
        CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
        CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
        CSS_TASK_ITER_SKIPPED  = (1U << 16), /* internal flags */
};

/*
 * Iteration state for walking the tasks of a css.  A css_task_iter should be
 * treated as an opaque object: callers hand it to css_task_iter_start(),
 * css_task_iter_next() and css_task_iter_end() (declared below) and should
 * not access the fields directly.
 */
struct css_task_iter {
        struct cgroup_subsys                *ss;
        unsigned int                        flags;

        struct list_head                *cset_pos;
        struct list_head                *cset_head;

        struct list_head                *tcset_pos;
        struct list_head                *tcset_head;

        struct list_head                *task_pos;

        struct list_head                *cur_tasks_head;
        struct css_set                        *cur_cset;
        struct css_set                        *cur_dcset;
        struct task_struct                *cur_task;
        struct list_head                iters_node;        /* css_set->task_iters */
};

/*
 * Event codes for cgroup lifetime changes -- presumably the values delivered
 * through cgroup_lifetime_notifier, declared below; confirm against the
 * notifier's callers.
 */
enum cgroup_lifetime_events {
        CGROUP_LIFETIME_ONLINE,
        CGROUP_LIFETIME_OFFLINE,
};

extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
extern spinlock_t css_set_lock;
extern struct blocking_notifier_head cgroup_lifetime_notifier;

/*
 * X-macro expansion: SUBSYS() is defined before each inclusion of
 * <linux/cgroup_subsys.h> and undefined right after.  This first pass turns
 * every SUBSYS(<name>) invocation in that header (its contents are not
 * visible here) into an extern declaration of <name>_cgrp_subsys.
 */
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/*
 * Second pass: declare two static keys per subsystem,
 * <name>_cgrp_subsys_enabled_key and <name>_cgrp_subsys_on_dfl_key.  These
 * are the keys that cgroup_subsys_enabled() and cgroup_subsys_on_dfl() below
 * reach by token pasting.
 */
#define SUBSYS(_x)                                                                \
        extern struct static_key_true _x ## _cgrp_subsys_enabled_key;                \
        extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/**
 * cgroup_subsys_enabled - fast test on whether a subsys is enabled
 * @ss: subsystem in question
 *
 * @ss is token-pasted with "_enabled_key", so it must be the bare identifier
 * of a subsystem variable (<name>_cgrp_subsys), not a pointer or any other
 * expression.  The test is a static_branch_likely() on the resulting key.
 */
#define cgroup_subsys_enabled(ss)                                                \
        static_branch_likely(&ss ## _enabled_key)

/**
 * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
 * @ss: subsystem in question
 *
 * @ss is token-pasted with "_on_dfl_key", so it must be the bare identifier
 * of a subsystem variable (<name>_cgrp_subsys), not a pointer or any other
 * expression.  The test is a static_branch_likely() on the resulting key.
 */
#define cgroup_subsys_on_dfl(ss)                                                \
        static_branch_likely(&ss ## _on_dfl_key)

bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
                                         struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
                                             struct cgroup_subsys *ss);
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss);

struct cgroup *cgroup_get_from_path(const char *path);
struct cgroup *cgroup_get_from_fd(int fd);
struct cgroup *cgroup_v1v2_get_from_fd(int fd);

/*
 * Declaration only; the definition is not in this header.  Presumably
 * attaches @tsk to the cgroups that @from belongs to -- confirm against the
 * definition.
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);

int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);
void cgroup_file_show(struct cgroup_file *cfile, bool show);

int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk);

void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
                           struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
                               struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
                             struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);

int cgroup_init_early(void);
int cgroup_init(void);

int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);

/*
 * Iteration helpers and macros.
 */

struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent);
struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
                                                    struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
                                                     struct cgroup_subsys_state *css);

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp);

void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);

/**
 * css_for_each_child - iterate through children of a css
 * @pos: the css * to use as the loop cursor
 * @parent: css whose children to walk
 *
 * Walk @parent's children.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 *
 * Both @pos and @parent are expanded more than once by the macro, so
 * neither argument should have side effects.
 */
#define css_for_each_child(pos, parent)                                        \
        for ((pos) = css_next_child(NULL, (parent)); (pos);                \
             (pos) = css_next_child((pos), (parent)))

/**
 * css_for_each_descendant_pre - pre-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Walk @css's descendants.  @css is included in the iteration and the
 * first node to be visited.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * For example, the following guarantees that a descendant can't escape
 * state updates of its ancestors.
 *
 * my_online(@css)
 * {
 *        Lock @css's parent and @css;
 *        Inherit state from the parent;
 *        Unlock both.
 * }
 *
 * my_update_state(@css)
 * {
 *        css_for_each_descendant_pre(@pos, @css) {
 *                Lock @pos;
 *                if (@pos == @css)
 *                        Update @css's state;
 *                else
 *                        Verify @pos is alive and inherit state from its parent;
 *                Unlock @pos;
 *        }
 * }
 *
 * As long as the inheriting step, including checking the parent state, is
 * enclosed inside @pos locking, double-locking the parent isn't necessary
 * while inheriting.  The state update to the parent is guaranteed to be
 * visible by walking order and, as long as inheriting operations to the
 * same @pos are atomic to each other, multiple updates racing each other
 * still result in the correct state.  It's guaranteed that at least one
 * inheritance happens for any css after the latest update to its parent.
 *
 * If checking parent's state requires locking the parent, each inheriting
 * iteration should lock and unlock both @pos->parent and @pos.
 *
 * Alternatively, a subsystem may choose to use a single global lock to
 * synchronize ->css_online() and ->css_offline() against tree-walking
 * operations.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 */
#define css_for_each_descendant_pre(pos, css)                                \
        for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_pre((pos), (css)))

/**
 * css_for_each_descendant_post - post-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Similar to css_for_each_descendant_pre() but performs post-order
 * traversal instead.  @css is included in the iteration and the last
 * node to be visited.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Note that the walk visibility guarantee example described in pre-order
 * walk doesn't apply the same to post-order walks.
 */
#define css_for_each_descendant_post(pos, css)                                \
        for ((pos) = css_next_descendant_post(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_post((pos), (css)))

/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.
 *
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 * Iteration is not in any specific order.
 *
 * The macro takes the address of @dst_css and passes it to
 * cgroup_taskset_first() / cgroup_taskset_next(), so @dst_css must be an
 * lvalue of type struct cgroup_subsys_state *.
 */
#define cgroup_taskset_for_each(task, dst_css, tset)                        \
        for ((task) = cgroup_taskset_first((tset), &(dst_css));                \
             (task);                                                        \
             (task) = cgroup_taskset_next((tset), &(dst_css)))

/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 *
 * The macro ends in a dangling "if (...) ; else", so the statement that
 * follows it becomes the else branch: it runs only for entries that are
 * their own ->group_leader, and every other entry is stepped over.
 */
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)                \
        for ((leader) = cgroup_taskset_first((tset), &(dst_css));        \
             (leader);                                                        \
             (leader) = cgroup_taskset_next((tset), &(dst_css)))        \
                if ((leader) != (leader)->group_leader)                        \
                        ;                                                \
                else

/*
 * Inline functions.
 */

/*
 * css reference helpers.  With CONFIG_DEBUG_CGROUP_REF they are only
 * declared here; the definitions are not in this header.  Otherwise
 * <linux/cgroup_refcnt.h> is included with CGROUP_REF_FN_ATTRS set to
 * "static inline" and CGROUP_REF_EXPORT() expanding to nothing --
 * presumably that header defines the same six helpers in terms of these two
 * macros; confirm against cgroup_refcnt.h.
 */
#ifdef CONFIG_DEBUG_CGROUP_REF
void css_get(struct cgroup_subsys_state *css);
void css_get_many(struct cgroup_subsys_state *css, unsigned int n);
bool css_tryget(struct cgroup_subsys_state *css);
bool css_tryget_online(struct cgroup_subsys_state *css);
void css_put(struct cgroup_subsys_state *css);
void css_put_many(struct cgroup_subsys_state *css, unsigned int n);
#else
#define CGROUP_REF_FN_ATTRS        static inline
#define CGROUP_REF_EXPORT(fn)
#include <linux/cgroup_refcnt.h>
#endif

/* Return the id stored in @cgrp's ->kn node, as a u64. */
static inline u64 cgroup_id(const struct cgroup *cgrp)
{
        u64 id = cgrp->kn->id;

        return id;
}

/**
 * css_is_dying - test whether the specified css is dying
 * @css: target css
 *
 * Test whether @css is in the process of offlining or already offline.  In
 * most cases, ->css_online() and ->css_offline() callbacks should be
 * enough; however, the actual offline operations are RCU delayed and this
 * test returns %true also when @css is scheduled to be offlined.
 *
 * This is useful, for example, when the use case requires synchronous
 * behavior with respect to cgroup removal.  cgroup removal schedules css
 * offlining but the css can seem alive while the operation is being
 * delayed.  If the delay affects user visible semantics, this test can be
 * used to resolve the situation.
 */
static inline bool css_is_dying(struct cgroup_subsys_state *css)
{
        /* True exactly when the CSS_DYING bit is set in @css->flags. */
        return (css->flags & CSS_DYING) != 0;
}

/* Test whether the CSS_ONLINE bit is set in @css->flags. */
static inline bool css_is_online(struct cgroup_subsys_state *css)
{
        return (css->flags & CSS_ONLINE) != 0;
}

/*
 * Test whether @css is the ->self css embedded in its own cgroup.  Warns if
 * such a css has a non-NULL ->ss.
 */
static inline bool css_is_self(struct cgroup_subsys_state *css)
{
        if (css != &css->cgroup->self)
                return false;

        /* cgroup::self should not have subsystem association */
        WARN_ON(css->ss != NULL);
        return true;
}

/* Call css_get() on the ->self css embedded in @cgrp. */
static inline void cgroup_get(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        css_get(self);
}

/* Call css_tryget() on the ->self css embedded in @cgrp; returns its result. */
static inline bool cgroup_tryget(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        return css_tryget(self);
}

/* Call css_put() on the ->self css embedded in @cgrp. */
static inline void cgroup_put(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        css_put(self);
}

extern struct mutex cgroup_mutex;

/* Acquire the global cgroup_mutex; pair with cgroup_unlock(). */
static inline void cgroup_lock(void)
{
        mutex_lock(&cgroup_mutex);
}

/* Release the global cgroup_mutex taken by cgroup_lock(). */
static inline void cgroup_unlock(void)
{
        mutex_unlock(&cgroup_mutex);
}

/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * A task's css_set is RCU protected, initialized and exited while holding
 * task_lock(), and can only be modified while holding both cgroup_mutex
 * and task_lock() while the task is alive.  This macro verifies that the
 * caller is inside proper critical section and returns @task's css_set.
 *
 * The caller can also specify additional allowed conditions via @__c, such
 * as locks used during the cgroup_subsys::attach() methods.
 */
#ifdef CONFIG_PROVE_RCU
/*
 * Checked variant: rcu_dereference_check() is given the OR of the listed
 * conditions, the last of which is the caller-supplied @__c.
 */
#define task_css_set_check(task, __c)                                        \
        rcu_dereference_check((task)->cgroups,                                \
                rcu_read_lock_sched_held() ||                                \
                lockdep_is_held(&cgroup_mutex) ||                        \
                lockdep_is_held(&css_set_lock) ||                        \
                ((task)->flags & PF_EXITING) || (__c))
#else
/* Unchecked variant: plain rcu_dereference(); @__c is not evaluated. */
#define task_css_set_check(task, __c)                                        \
        rcu_dereference((task)->cgroups)
#endif

/**
 * task_css_check - obtain css for (task, subsys) w/ extra access conds
 * @task: the target task
 * @subsys_id: the target subsystem ID
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
 * synchronization rules are the same as task_css_set_check().
 *
 * The macro expands to task_css_set_check(@task, @__c)->subsys[@subsys_id],
 * i.e. it indexes the subsys[] array of the css_set that
 * task_css_set_check() yields.
 */
#define task_css_check(task, subsys_id, __c)                                \
        task_css_set_check((task), (__c))->subsys[(subsys_id)]

/**
 * task_css_set - obtain a task's css_set
 * @task: the task to obtain css_set for
 *
 * See task_css_set_check().
 */
static inline struct css_set *task_css_set(struct task_struct *task)
{
        /* No extra access condition beyond the ones the macro checks. */
        struct css_set *cset = task_css_set_check(task, false);

        return cset;
}

/**
 * task_css - obtain css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * See task_css_check().
 */
static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
                                                   int subsys_id)
{
        struct cgroup_subsys_state *css;

        /* No extra access condition beyond the ones the macro checks. */
        css = task_css_check(task, subsys_id, false);
        return css;
}

/**
 * task_get_css - find and get the css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Find the css for the (@task, @subsys_id) combination, increment a
 * reference on and return it.  This function is guaranteed to return a
 * valid css.  The returned css may already have been offlined.
 */
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();

        /*
         * Can't use css_tryget_online() here.  A task which has
         * PF_EXITING set may stay associated with an offline css.
         * If such task calls this function, css_tryget_online()
         * will keep failing.
         */
        css = task_css(task, subsys_id);
        while (unlikely(!css_tryget(css))) {
                /* Look the css up again before every retry. */
                cpu_relax();
                css = task_css(task, subsys_id);
        }

        rcu_read_unlock();
        return css;
}

/**
 * task_css_is_root - test whether a task belongs to the root css
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Test whether @task belongs to the root css on the specified subsystem.
 * May be invoked in any context.
 */
static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
{
        /* The css that init_css_set holds for this subsystem. */
        struct cgroup_subsys_state *root_css = init_css_set.subsys[subsys_id];

        /* "true" as the extra condition accepts any calling context. */
        return task_css_check(task, subsys_id, true) == root_css;
}

static inline struct cgroup *task_cgroup(struct task_struct *task,
                                         int subsys_id)
{
        return task_css(task, subsys_id)->cgroup;
}

static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
        return task_css_set(task)->dfl_cgrp;
}

/*
 * Return the cgroup that embeds @cgrp->self.parent as its ->self css, or
 * NULL when @cgrp->self.parent is NULL.
 */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *parent_css = cgrp->self.parent;

        if (!parent_css)
                return NULL;

        return container_of(parent_css, struct cgroup, self);
}

/**
 * cgroup_is_descendant - test ancestry
 * @cgrp: the cgroup to be tested
 * @ancestor: possible ancestor of @cgrp
 *
 * Returns %true if @ancestor is @cgrp itself or one of its ancestors, and
 * %false otherwise.  Safe to call as long as both @cgrp and @ancestor are
 * accessible.
 */
static inline bool cgroup_is_descendant(struct cgroup *cgrp,
                                        struct cgroup *ancestor)
{
        /*
         * Only a cgroup in the same hierarchy and at the same or a deeper
         * level can be a descendant; then the ancestors[] slot at
         * @ancestor's level settles the question.
         */
        if (cgrp->root == ancestor->root && cgrp->level >= ancestor->level)
                return cgrp->ancestors[ancestor->level] == ancestor;

        return false;
}

/**
 * cgroup_ancestor - find ancestor of cgroup
 * @cgrp: cgroup to find ancestor of
 * @ancestor_level: level of ancestor to find starting from root
 *
 * Find ancestor of cgroup at specified level starting from root if it exists
 * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at
 * @ancestor_level.
 *
 * This function is safe to call as long as @cgrp is accessible.
 */
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                             int ancestor_level)
{
        if (ancestor_level < 0 || ancestor_level > cgrp->level)
                return NULL;
        return cgrp->ancestors[ancestor_level];
}

/**
 * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
 * @task: the task to be tested
 * @ancestor: possible ancestor of @task's cgroup
 *
 * Checks whether the default-hierarchy cgroup of @task is @ancestor or one
 * of its descendants.  The rules of cgroup_is_descendant() apply, and only
 * the default hierarchy is considered.
 */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        return cgroup_is_descendant(task_dfl_cgroup(task), ancestor);
}

/*
 * True when any of the three populated counters of @cgrp is non-zero.
 * No synchronization is done, so the result can only be used as a hint.
 */
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
        return (cgrp->nr_populated_csets +
                cgrp->nr_populated_domain_children +
                cgrp->nr_populated_threaded_children) != 0;
}

/* Return the inode number of the kernfs node backing @cgrp. */
static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_ino(kn);
}

/* cft/css accessors for cftype->write() operation */
static inline struct cftype *of_cft(struct kernfs_open_file *of)
{
        return of->kn->priv;
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);

/*
 * cft/css accessors for cftype->seq_*() operations.
 * @seq->private is handed to the of_*() accessors as the open file.
 */
static inline struct cftype *seq_cft(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_cft(of);
}

/* Return of_css() of the open file stored in @seq->private. */
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_css(of);
}

/*
 * Name / path handling functions.  All are thin wrappers around the kernfs
 * counterparts and can be called under any context.
 */

/* Forward to kernfs_name() on @cgrp's kernfs node; returns its result. */
static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_name(kn, buf, buflen);
}

/* Forward to kernfs_path() on @cgrp's kernfs node; returns its result. */
static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_path(kn, buf, buflen);
}

/* Forward to pr_cont_kernfs_name() on @cgrp's kernfs node. */
static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_name(kn);
}

/* Forward to pr_cont_kernfs_path() on @cgrp's kernfs node. */
static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_path(kn);
}

bool cgroup_psi_enabled(void);

static inline void cgroup_init_kthreadd(void)
{
        /*
         * kthreadd is inherited by all kthreads, keep it in the root so
         * that the new kthreads are guaranteed to stay in the root until
         * initialization is finished.
         */
        current->no_cgroup_migration = 1;
}

static inline void cgroup_kthread_ready(void)
{
        /*
         * This kthread finished initialization.  The creator should have
         * set PF_NO_SETAFFINITY if this kthread should stay in the root.
         */
        current->no_cgroup_migration = 0;
}

void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
struct cgroup *__cgroup_get_from_id(u64 id);
struct cgroup *cgroup_get_from_id(u64 id);
#else /* !CONFIG_CGROUPS */

struct cgroup_subsys_state;
struct cgroup;

/*
 * !CONFIG_CGROUPS stubs: reference counting and locking are no-ops,
 * cgroup_id() reports the constant 1, attaching tasks reports success,
 * and cgroupstats_build() fails with -EINVAL.
 */
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
                                         struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
                                    struct dentry *dentry) { return -EINVAL; }

/*
 * !CONFIG_CGROUPS stubs for the fork/exit hooks: all of them do nothing,
 * and cgroup_can_fork() always reports success (0).
 */
static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p,
                                  struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
                                      struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
                                    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}

/*
 * !CONFIG_CGROUPS stubs for initialization: both init calls report
 * success (0) and the kthread helpers do nothing.
 */
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {}

/* !CONFIG_CGROUPS stub: there is never a parent cgroup to report. */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; }

/* !CONFIG_CGROUPS stub: always reports false. */
static inline bool cgroup_psi_enabled(void) { return false; }

/* !CONFIG_CGROUPS stub: every task is reported as being under @ancestor. */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor) { return true; }

/* !CONFIG_CGROUPS stub: does nothing and leaves @buf untouched. */
static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
}
#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUPS
/*
 * cgroup scalable recursive statistics.
 */
void css_rstat_updated(struct cgroup_subsys_state *css, int cpu);
void css_rstat_flush(struct cgroup_subsys_state *css);

/*
 * Basic resource stats.
 */
#ifdef CONFIG_CGROUP_CPUACCT
void cpuacct_charge(struct task_struct *tsk, u64 cputime);
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_account_field(struct task_struct *tsk, int index,
                                         u64 val) {}
#endif

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec);

/*
 * Charge @delta_exec to @task: always through cpuacct_charge(), and also
 * to the task's default-hierarchy cgroup when that cgroup has a parent.
 */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec)
{
        struct cgroup *dfl;

        cpuacct_charge(task, delta_exec);

        dfl = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl))
                return;

        __cgroup_account_cputime(dfl, delta_exec);
}

/*
 * Account @delta_exec under the @index statistic for @task: always through
 * cpuacct_account_field(), and also on the task's default-hierarchy cgroup
 * when that cgroup has a parent.
 */
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec)
{
        struct cgroup *dfl;

        cpuacct_account_field(task, index, delta_exec);

        dfl = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl))
                return;

        __cgroup_account_cputime_field(dfl, index, delta_exec);
}

#else        /* !CONFIG_CGROUPS */

/* Without CONFIG_CGROUPS, CPU time accounting to cgroups is a no-op. */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec) {}
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec) {}

#endif        /* !CONFIG_CGROUPS */

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);

/* Return the cgroup pointer stored in @skcd. */
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp = skcd->cgroup;

        return cgrp;
}

#else        /* !CONFIG_SOCK_CGROUP_DATA */

/* Without CONFIG_SOCK_CGROUP_DATA the socket cgroup hooks do nothing. */
static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}

#endif        /* !CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_CGROUPS

void cgroup_enter_frozen(void);
void cgroup_leave_frozen(bool always_leave);
void cgroup_update_frozen(struct cgroup *cgrp);
void cgroup_freeze(struct cgroup *cgrp, bool freeze);
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
                                 struct cgroup *dst);

/* Report whether @task's ->frozen flag is set. */
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return task->frozen != 0;
}

#else /* !CONFIG_CGROUPS */

/*
 * !CONFIG_CGROUPS stubs: entering and leaving the frozen state does
 * nothing, and no task is ever reported as frozen.
 */
static inline void cgroup_enter_frozen(void) {}
static inline void cgroup_leave_frozen(bool always_leave) {}
static inline bool cgroup_task_frozen(struct task_struct *task) { return false; }

#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUP_BPF
/* Take a reference on @cgrp's bpf percpu refcount. */
static inline void cgroup_bpf_get(struct cgroup *cgrp)
{
        struct percpu_ref *ref = &cgrp->bpf.refcnt;

        percpu_ref_get(ref);
}

/* Drop a reference on @cgrp's bpf percpu refcount. */
static inline void cgroup_bpf_put(struct cgroup *cgrp)
{
        struct percpu_ref *ref = &cgrp->bpf.refcnt;

        percpu_ref_put(ref);
}

#else /* !CONFIG_CGROUP_BPF */

/* Without CONFIG_CGROUP_BPF there is no bpf refcount; both are no-ops. */
static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}

#endif /* CONFIG_CGROUP_BPF */

struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);

struct cgroup_of_peak *of_peak(struct kernfs_open_file *of);

#endif /* _LINUX_CGROUP_H */



























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A hash table (hashtab) maintains associations between
 * key values and datum values.  The type of the key values
 * and the type of the datum values is arbitrary.  The
 * functions for hash computation and key comparison are
 * provided by the creator of the table.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

#ifndef _SS_HASHTAB_H_
#define _SS_HASHTAB_H_

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/sched.h>

#define HASHTAB_MAX_NODES U32_MAX

/*
 * Key operations supplied by the caller of hashtab_insert() and
 * hashtab_search().
 */
struct hashtab_key_params {
        u32 (*hash)(const void *key); /* hash func; callers mask the result with (size - 1) */
        int (*cmp)(const void *key1, const void *key2); /* comparison func; 0 means equal, < 0 ends a chain walk */
};

/* One (key, datum) entry in the singly linked chain of a hash slot. */
struct hashtab_node {
        void *key; /* compared via hashtab_key_params.cmp */
        void *datum; /* value returned by hashtab_search() */
        struct hashtab_node *next; /* next entry in the same chain, or NULL */
};

/*
 * The table itself.  The inline helpers select a slot with
 * hash & (size - 1) and treat size == 0 as "no table".
 * NOTE(review): the masking presumably relies on size being a power of
 * two -- confirm against hashtab_init().
 */
struct hashtab {
        struct hashtab_node **htable; /* hash table */
        u32 size; /* number of slots in hash table */
        u32 nel; /* number of elements in hash table */
};

/* Statistics reported through hashtab_stat(). */
struct hashtab_info {
        u32 slots_used; /* presumably the number of non-empty slots -- confirm */
        u32 max_chain_len; /* presumably the longest chain length -- confirm */
        u64 chain2_len_sum; /* presumably the sum of squared chain lengths -- confirm */
};

/*
 * Initializes a new hash table with the specified characteristics.
 *
 * Returns -ENOMEM if insufficient space is available or 0 otherwise.
 */
int hashtab_init(struct hashtab *h, u32 nel_hint);

int __hashtab_insert(struct hashtab *h, struct hashtab_node **dst, void *key,
                     void *datum);

/*
 * Inserts the specified (key, datum) pair into the specified hash table.
 *
 * The chain of the selected slot is walked until an entry is found that
 * the new key compares lower than (or the chain ends); the new node is
 * linked in at that position.
 *
 * Returns -ENOMEM on memory allocation error,
 * -EEXIST if there is already an entry with the same key,
 * -EINVAL for general errors or
 * 0 otherwise.
 */
static inline int hashtab_insert(struct hashtab *h, void *key, void *datum,
                                 struct hashtab_key_params key_params)
{
        struct hashtab_node **link;
        u32 slot;

        cond_resched();

        /* No table allocated, or the element counter is saturated. */
        if (!h->size || h->nel == HASHTAB_MAX_NODES)
                return -EINVAL;

        slot = key_params.hash(key) & (h->size - 1);

        /* @link always points at the pointer that refers to the current node. */
        for (link = &h->htable[slot]; *link; link = &(*link)->next) {
                int order = key_params.cmp(key, (*link)->key);

                if (order == 0)
                        return -EEXIST;
                if (order < 0)
                        break;
        }

        return __hashtab_insert(h, link, key, datum);
}

/*
 * Looks up the entry with the specified key in the hash table.
 *
 * Returns the datum of the matching entry, or NULL if the table has no
 * slots or no entry has the specified key.
 */
static inline void *hashtab_search(struct hashtab *h, const void *key,
                                   struct hashtab_key_params key_params)
{
        struct hashtab_node *node;
        u32 slot;

        if (!h->size)
                return NULL;

        slot = key_params.hash(key) & (h->size - 1);

        for (node = h->htable[slot]; node; node = node->next) {
                int order = key_params.cmp(key, node->key);

                if (order == 0)
                        return node->datum;
                /* The walk stops at the first entry the key compares lower than. */
                if (order < 0)
                        break;
        }

        return NULL;
}

/*
 * Destroys the specified hash table.
 */
void hashtab_destroy(struct hashtab *h);

/*
 * Applies the specified apply function to (key,datum,args)
 * for each entry in the specified hash table.
 *
 * The order in which the function is applied to the entries
 * is dependent upon the internal structure of the hash table.
 *
 * If apply returns a non-zero status, then hashtab_map will cease
 * iterating through the hash table and will propagate the error
 * return to its caller.
 */
int hashtab_map(struct hashtab *h, int (*apply)(void *k, void *d, void *args),
                void *args);

int hashtab_duplicate(struct hashtab *new, const struct hashtab *orig,
                      int (*copy)(struct hashtab_node *new,
                                  const struct hashtab_node *orig, void *args),
                      int (*destroy)(void *k, void *d, void *args), void *args);

#ifdef CONFIG_SECURITY_SELINUX_DEBUG
/* Fill info with some hash table statistics */
void hashtab_stat(struct hashtab *h, struct hashtab_info *info);
#else
/* Without CONFIG_SECURITY_SELINUX_DEBUG no statistics are gathered; @info is left untouched. */
static inline void hashtab_stat(struct hashtab *h, struct hashtab_info *info)
{
}
#endif

#endif /* _SS_HASHTAB_H_ */
























































































































































































































































































































   39 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion,
 *        tree variant.
 *
 * Copyright (C) IBM Corporation, 2017
 *
 * Author: Paul McKenney <paulmck@linux.ibm.com>
 */

#ifndef _LINUX_SRCU_TREE_H
#define _LINUX_SRCU_TREE_H

#include <linux/rcu_node_tree.h>
#include <linux/completion.h>

struct srcu_node;
struct srcu_struct;

/*
 * One element of the srcu_data srcu_ctrs array.  The fast-path readers
 * increment ->srcu_locks on entry and ->srcu_unlocks on exit.
 */
struct srcu_ctr {
        atomic_long_t srcu_locks;        /* Locks per CPU. */
        atomic_long_t srcu_unlocks;        /* Unlocks per CPU. */
};

/*
 * Per-CPU structure feeding into leaf srcu_node, similar in function
 * to rcu_node.
 */
struct srcu_data {
        /* Read-side state. */
        struct srcu_ctr srcu_ctrs[2];                /* Locks and unlocks per CPU. */
        int srcu_reader_flavor;                        /* Reader flavor for srcu_struct structure? */
                                                /* Values: SRCU_READ_FLAVOR_.*  */

        /* Update-side state. */
        spinlock_t __private lock ____cacheline_internodealigned_in_smp;
        struct rcu_segcblist srcu_cblist;        /* List of callbacks.*/
        unsigned long srcu_gp_seq_needed;        /* Furthest future GP needed. */
        unsigned long srcu_gp_seq_needed_exp;        /* Furthest future exp GP. */
        bool srcu_cblist_invoking;                /* Invoking these CBs? */
        struct timer_list delay_work;                /* Delay for CB invoking */
        struct work_struct work;                /* Context for CB invoking. */
        struct rcu_head srcu_barrier_head;        /* For srcu_barrier() use. */
        struct srcu_node *mynode;                /* Leaf srcu_node. */
        unsigned long grpmask;                        /* Mask for leaf srcu_node */
                                                /*  ->srcu_data_have_cbs[]. */
        int cpu;                                /* Presumably the CPU owning this element -- confirm. */
        struct srcu_struct *ssp;                /* Presumably the srcu_struct this belongs to -- confirm. */
};

/*
 * Node in SRCU combining tree, similar in function to rcu_data.
 * Each node records its parent and the [grplo, grphi] range of CPUs
 * it covers.
 */
struct srcu_node {
        spinlock_t __private lock;
        unsigned long srcu_have_cbs[4];                /* GP seq for children having CBs, but only */
                                                /*  if greater than ->srcu_gp_seq. */
        unsigned long srcu_data_have_cbs[4];        /* Which srcu_data structs have CBs for given GP? */
        unsigned long srcu_gp_seq_needed_exp;        /* Furthest future exp GP. */
        struct srcu_node *srcu_parent;                /* Next up in tree. */
        int grplo;                                /* Least CPU for node. */
        int grphi;                                /* Biggest CPU for node. */
};

/*
 * Per-SRCU-domain structure, update-side data linked from srcu_struct.
 * Static instances are set up with __SRCU_USAGE_INIT().
 */
struct srcu_usage {
        struct srcu_node *node;                        /* Combining tree. */
        struct srcu_node *level[RCU_NUM_LVLS + 1];
                                                /* First node at each level. */
        int srcu_size_state;                        /* Small-to-big transition state. */
        struct mutex srcu_cb_mutex;                /* Serialize CB preparation. */
        spinlock_t __private lock;                /* Protect counters and size state. */
        struct mutex srcu_gp_mutex;                /* Serialize GP work. */
        unsigned long srcu_gp_seq;                /* Grace-period seq #. */
        unsigned long srcu_gp_seq_needed;        /* Latest gp_seq needed. */
        unsigned long srcu_gp_seq_needed_exp;        /* Furthest future exp GP. */
        unsigned long srcu_gp_start;                /* Last GP start timestamp (jiffies) */
        unsigned long srcu_last_gp_end;                /* Last GP end timestamp (ns) */
        unsigned long srcu_size_jiffies;        /* Current contention-measurement interval. */
        unsigned long srcu_n_lock_retries;        /* Contention events in current interval. */
        unsigned long srcu_n_exp_nodelay;        /* # expedited no-delays in current GP phase. */
        bool sda_is_static;                        /* May ->sda be passed to free_percpu()? */
        unsigned long srcu_barrier_seq;                /* srcu_barrier seq #. */
        struct mutex srcu_barrier_mutex;        /* Serialize barrier ops. */
        struct completion srcu_barrier_completion;
                                                /* Awaken barrier rq at end. */
        atomic_t srcu_barrier_cpu_cnt;                /* # CPUs not yet posting a */
                                                /*  callback for the barrier */
                                                /*  operation. */
        unsigned long reschedule_jiffies;        /* NOTE(review): use not visible in this header. */
        unsigned long reschedule_count;                /* NOTE(review): use not visible in this header. */
        struct delayed_work work;                /* Set up by __SRCU_USAGE_INIT() for static instances. */
        struct srcu_struct *srcu_ssp;                /* Presumably the owning srcu_struct -- confirm. */
};

/*
 * Per-SRCU-domain structure, similar in function to rcu_state.
 * The update-side state lives in the separate srcu_usage structure
 * reached through ->srcu_sup.
 */
struct srcu_struct {
        struct srcu_ctr __percpu *srcu_ctrp;        /* Counter pair read by __srcu_read_lock_fast(). */
        struct srcu_data __percpu *sda;                /* Per-CPU srcu_data array. */
        struct lockdep_map dep_map;                /* Set up by __SRCU_DEP_MAP_INIT() for static instances. */
        struct srcu_usage *srcu_sup;                /* Update-side data. */
};

// Values for size state variable (->srcu_size_state).  Once the state
// has been set to SRCU_SIZE_ALLOC, the grace-period code advances through
// this state machine one step per grace period until the SRCU_SIZE_BIG state
// is reached.  Otherwise, the state machine remains in the SRCU_SIZE_SMALL
// state indefinitely.
#define SRCU_SIZE_SMALL                0        // No srcu_node combining tree, ->node == NULL
#define SRCU_SIZE_ALLOC                1        // An srcu_node tree is being allocated, initialized,
                                        //  and then referenced by ->node.  It will not be used.
#define SRCU_SIZE_WAIT_BARRIER        2        // The srcu_node tree starts being used by everything
                                        //  except call_srcu(), especially by srcu_barrier().
                                        //  By the end of this state, all CPUs and threads
                                        //  are aware of this tree's existence.
#define SRCU_SIZE_WAIT_CALL        3        // The srcu_node tree starts being used by call_srcu().
                                        //  By the end of this state, all of the call_srcu()
                                        //  invocations that were running on a non-boot CPU
                                        //  and using the boot CPU's callback queue will have
                                        //  completed.
#define SRCU_SIZE_WAIT_CBS1        4        // Don't trust the ->srcu_have_cbs[] grace-period
#define SRCU_SIZE_WAIT_CBS2        5        //  sequence elements or the ->srcu_data_have_cbs[]
#define SRCU_SIZE_WAIT_CBS3        6        //  CPU-bitmask elements until all four elements of
#define SRCU_SIZE_WAIT_CBS4        7        //  each array have been initialized.
#define SRCU_SIZE_BIG                8        // The srcu_node combining tree is fully initialized
                                        //  and all aspects of it are being put to use.

/* Values for state variable (bottom bits of ->srcu_gp_seq). */
#define SRCU_STATE_IDLE                0
#define SRCU_STATE_SCAN1        1
#define SRCU_STATE_SCAN2        2

/*
 * Values for initializing gp sequence fields. Higher values allow wrap arounds to
 * occur earlier.
 * The second value with state is useful in the case of static initialization of
 * srcu_usage where srcu_gp_seq_needed is expected to have some state value in its
 * lower bits (or else it will appear to be already initialized within
 * the call check_init_srcu_struct()).
 */
#define SRCU_GP_SEQ_INITIAL_VAL ((0UL - 100UL) << RCU_SEQ_CTR_SHIFT)
#define SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE (SRCU_GP_SEQ_INITIAL_VAL - 1)

/*
 * Static initializer for the struct srcu_usage variable called @name.
 * It sets up ->lock and ->work and seeds the three grace-period sequence
 * fields from SRCU_GP_SEQ_INITIAL_VAL; ->srcu_gp_seq_needed uses the
 * _WITH_STATE variant so that it carries state in its low bits.
 */
#define __SRCU_USAGE_INIT(name)                                                                        \
{                                                                                                \
        .lock = __SPIN_LOCK_UNLOCKED(name.lock),                                                \
        .srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL,                                                        \
        .srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE,                                \
        .srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL,                                        \
        .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),                                        \
}

/*
 * Static initializers for struct srcu_struct.
 *
 * __SRCU_STRUCT_INIT_COMMON() holds the members shared by both variants:
 * ->srcu_sup points at @usage_name and the lockdep map is initialized.
 * __SRCU_STRUCT_INIT_MODULE() consists of the common part only, so it
 * leaves ->sda and ->srcu_ctrp unset.  __SRCU_STRUCT_INIT() additionally
 * points ->sda at the per-CPU variable @pcpu_name and ->srcu_ctrp at that
 * variable's first ->srcu_ctrs[] element.
 */
#define __SRCU_STRUCT_INIT_COMMON(name, usage_name)                                                \
        .srcu_sup = &usage_name,                                                                \
        __SRCU_DEP_MAP_INIT(name)

#define __SRCU_STRUCT_INIT_MODULE(name, usage_name)                                                \
{                                                                                                \
        __SRCU_STRUCT_INIT_COMMON(name, usage_name)                                                \
}

#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name)                                                \
{                                                                                                \
        .sda = &pcpu_name,                                                                        \
        .srcu_ctrp = &pcpu_name.srcu_ctrs[0],                                                        \
        __SRCU_STRUCT_INIT_COMMON(name, usage_name)                                                \
}

/*
 * Define and initialize a srcu struct at build time.
 * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
 *
 * Note that although DEFINE_STATIC_SRCU() hides the name from other
 * files, the per-CPU variable rules nevertheless require that the
 * chosen name be globally unique.  These rules also prohibit use of
 * DEFINE_STATIC_SRCU() within a function.  If these rules are too
 * restrictive, declare the srcu_struct manually.  For example, in
 * each file:
 *
 *        static struct srcu_struct my_srcu;
 *
 * Then, before the first use of each my_srcu, manually initialize it:
 *
 *        init_srcu_struct(&my_srcu);
 *
 * See include/linux/percpu-defs.h for the rules on per-CPU variables.
 */
/*
 * __DEFINE_SRCU(name, is_static) emits the definitions behind
 * DEFINE_SRCU() and DEFINE_STATIC_SRCU().
 *
 * When built as a module it defines the srcu_usage and the srcu_struct,
 * plus a const pointer to the srcu_struct placed in the
 * "___srcu_struct_ptrs" section.  Otherwise it also defines the per-CPU
 * srcu_data variable and wires it into the srcu_struct at build time.
 */
#ifdef MODULE
# define __DEFINE_SRCU(name, is_static)                                                                \
        static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage);        \
        is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage);        \
        extern struct srcu_struct * const __srcu_struct_##name;                                        \
        struct srcu_struct * const __srcu_struct_##name                                                \
                __section("___srcu_struct_ptrs") = &name
#else
# define __DEFINE_SRCU(name, is_static)                                                                \
        static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);                                \
        static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage);        \
        is_static struct srcu_struct name =                                                        \
                __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data)
#endif
#define DEFINE_SRCU(name)                __DEFINE_SRCU(name, /* not static */)
#define DEFINE_STATIC_SRCU(name)        __DEFINE_SRCU(name, static)

int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
void synchronize_srcu_expedited(struct srcu_struct *ssp);
void srcu_barrier(struct srcu_struct *ssp);
void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf);

// Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that
// element's index.  The array has two elements and the result is returned
// as a bool, so the index comes back as 0 or 1.
static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp)
{
        struct srcu_ctr __percpu *first = &ssp->sda->srcu_ctrs[0];

        return scpp - first;
}

// Converts the index @idx into a per-CPU pointer to the matching
// ->srcu_ctrs[] array element of @ssp's srcu_data.
static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx)
{
        struct srcu_data __percpu *sda = ssp->sda;

        return &sda->srcu_ctrs[idx];
}

/*
 * Counts the new reader in the appropriate per-CPU element of the
 * srcu_struct.  Returns a pointer that must be passed to the matching
 * srcu_read_unlock_fast().
 *
 * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side
 * critical sections either because they disable interrupts, because
 * they are a single instruction, or because they are read-modify-write
 * atomic operations, depending on the whims of the architecture.
 * This matters because the SRCU-fast grace-period mechanism uses either
 * synchronize_rcu() or synchronize_rcu_expedited(), that is, RCU,
 * *not* SRCU, in order to eliminate the need for the read-side smp_mb()
 * invocations that are used by srcu_read_lock() and srcu_read_unlock().
 * The __srcu_read_unlock_fast() function also relies on this same RCU
 * (again, *not* SRCU) trick to eliminate the need for smp_mb().
 *
 * The key point behind this RCU trick is that if any part of a given
 * RCU reader precedes the beginning of a given RCU grace period, then
 * the entirety of that RCU reader and everything preceding it happens
 * before the end of that same RCU grace period.  Similarly, if any part
 * of a given RCU reader follows the end of a given RCU grace period,
 * then the entirety of that RCU reader and everything following it
 * happens after the beginning of that same RCU grace period.  Therefore,
 * the operations labeled Y in __srcu_read_lock_fast() and those labeled Z
 * in __srcu_read_unlock_fast() are ordered against the corresponding SRCU
 * read-side critical section from the viewpoint of the SRCU grace period.
 * This is all the ordering that is required, hence no calls to smp_mb().
 *
 * This means that __srcu_read_lock_fast() is not all that fast
 * on architectures that support NMIs but do not supply NMI-safe
 * implementations of this_cpu_inc().
 */
static inline struct srcu_ctr __percpu notrace *__srcu_read_lock_fast(struct srcu_struct *ssp)
{
        /* Read the current counter pointer exactly once; it is also the return value. */
        struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);

        /* Plain per-CPU increment unless the NMI-safe atomic form is configured. */
        if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
                this_cpu_inc(scp->srcu_locks.counter); // Y, and implicit RCU reader.
        else
                atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks));  // Y, and implicit RCU reader.
        barrier(); /* Avoid leaking the critical section. */
        return scp;
}

/*
 * Removes the count for the old reader from the appropriate
 * per-CPU element of the srcu_struct.  Note that this may well be a
 * different CPU than that which was incremented by the corresponding
 * srcu_read_lock_fast(), but it must be within the same task.
 *
 * Please see the __srcu_read_lock_fast() function's header comment for
 * information on implicit RCU readers and NMI safety.
 */
static inline void notrace
__srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
{
        barrier();  /* Avoid leaking the critical section. */
        /* Count the unlock on @scp, the pointer handed out by the matching lock; @ssp is not used here. */
        if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
                this_cpu_inc(scp->srcu_unlocks.counter);  // Z, and implicit RCU reader.
        else
                atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks));  // Z, and implicit RCU reader.
}

void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);

// Record reader usage even for CONFIG_PROVE_RCU=n kernels.  This is
// needed only for flavors that require grace-period smp_mb() calls to be
// promoted to synchronize_rcu().
static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor)
{
        struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
        int recorded = READ_ONCE(sdp->srcu_reader_flavor);

        // Only call out when @read_flavor is not yet set in this CPU's
        // ->srcu_reader_flavor.  Note that the cmpxchg() in
        // __srcu_check_read_flavor() is fully ordered.
        if (unlikely(!(recorded & read_flavor)))
                __srcu_check_read_flavor(ssp, read_flavor);
}

// Track non-_lite() reader usage, but only on CONFIG_PROVE_RCU=y kernels;
// on other configurations the check is skipped.
static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
{
        if (!IS_ENABLED(CONFIG_PROVE_RCU))
                return;

        __srcu_check_read_flavor(ssp, read_flavor);
}

#endif























  320 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H

#include <linux/build_bug.h>
#include <linux/compiler.h>

#ifndef __ASSEMBLER__

#include <linux/cache.h>
#include <asm/percpu.h>

struct task_struct;

/* Per-CPU task_struct pointer; read by get_current() below. */
DECLARE_PER_CPU_CACHE_HOT(struct task_struct *, current_task);
/* const-qualified alias provided by the linker. */
DECLARE_PER_CPU_CACHE_HOT(struct task_struct * const __percpu_seg_override,
                          const_current_task);

/*
 * Return the task pointer stored in this CPU's current_task.  When
 * CONFIG_USE_X86_SEG_SUPPORT is enabled the value is read through the
 * const-qualified alias const_current_task instead.
 */
static __always_inline struct task_struct *get_current(void)
{
        if (!IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
                return this_cpu_read_stable(current_task);

        return this_cpu_read_const(const_current_task);
}

#define current get_current()

#endif /* __ASSEMBLER__ */

#endif /* _ASM_X86_CURRENT_H */




































    4 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM exceptions

#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGE_FAULT_H

#include <linux/tracepoint.h>

/*
 * Event class shared by the page-fault events defined below.  Each record
 * stores the @address argument, the instruction pointer obtained from
 * @regs via instruction_pointer(), and @error_code.  Both address and ip
 * are printed with %ps; error_code is printed in hex.
 */
DECLARE_EVENT_CLASS(exceptions,

        TP_PROTO(unsigned long address, struct pt_regs *regs,
                 unsigned long error_code),

        TP_ARGS(address, regs, error_code),

        TP_STRUCT__entry(
                __field(                unsigned long, address        )
                __field(                unsigned long, ip        )
                __field(                unsigned long, error_code )
        ),

        TP_fast_assign(
                __entry->address = address;
                __entry->ip = instruction_pointer(regs);
                __entry->error_code = error_code;
        ),

        TP_printk("address=%ps ip=%ps error_code=0x%lx",
                  (void *)__entry->address, (void *)__entry->ip,
                  __entry->error_code) );

/*
 * Two events of the "exceptions" class with identical prototypes.
 * NOTE(review): presumably emitted for faults taken in user mode and in
 * kernel mode respectively -- confirm at the call sites.
 */
DEFINE_EVENT(exceptions, page_fault_user,
        TP_PROTO(unsigned long address,        struct pt_regs *regs, unsigned long error_code),
        TP_ARGS(address, regs, error_code));
DEFINE_EVENT(exceptions, page_fault_kernel,
        TP_PROTO(unsigned long address,        struct pt_regs *regs, unsigned long error_code),
        TP_ARGS(address, regs, error_code));

#endif /*  _TRACE_PAGE_FAULT_H */

/* This part must be outside protection */
#include <trace/define_trace.h>























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * netprio_cgroup.h                        Control Group Priority set
 *
 * Authors:        Neil Horman <nhorman@tuxdriver.com>
 */

#ifndef _NETPRIO_CGROUP_H
#define _NETPRIO_CGROUP_H

#include <linux/cgroup.h>
#include <linux/hardirq.h>
#include <linux/rcupdate.h>

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
/*
 * Priority map ending in a flexible array of u32 entries.
 * NOTE(review): presumably indexed by the cgroup priority index returned
 * by task_netprioidx() -- confirm against the users of this struct.
 */
struct netprio_map {
        struct rcu_head rcu;    /* NOTE(review): presumably for RCU-deferred freeing -- confirm */
        u32 priomap_len;        /* NOTE(review): presumably the number of entries in priomap[] -- confirm */
        u32 priomap[];          /* flexible array member; storage is sized by the allocator */
};

/*
 * Return the id of the net_prio css that task @p is attached to.  The
 * css is looked up and its id copied inside an RCU read-side critical
 * section, so only the plain integer leaves the protected region.
 */
static inline u32 task_netprioidx(struct task_struct *p)
{
        u32 prioidx;

        rcu_read_lock();
        prioidx = task_css(p, net_prio_cgrp_id)->id;
        rcu_read_unlock();

        return prioidx;
}

/*
 * Store the current task's net_prio index in @skcd.  Nothing is done when
 * called in interrupt context (in_interrupt() is true).
 */
static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
{
        if (!in_interrupt())
                sock_cgroup_set_prioidx(skcd, task_netprioidx(current));
}

#else /* !CONFIG_CGROUP_NET_PRIO */

/* Stub for kernels built without CONFIG_CGROUP_NET_PRIO: always index 0. */
static inline u32 task_netprioidx(struct task_struct *p)
{
        return 0;
}

/* Stub for kernels built without CONFIG_CGROUP_NET_PRIO: does nothing. */
static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
{
}

#endif /* CONFIG_CGROUP_NET_PRIO */
#endif  /* _NETPRIO_CGROUP_H */















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CGROUP_INTERNAL_H
#define __CGROUP_INTERNAL_H

#include <linux/cgroup.h>
#include <linux/kernfs.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/fs_parser.h>

/*
 * Buffer that TRACE_CGROUP_PATH() renders cgroup paths into, its size in
 * bytes, and the spinlock TRACE_CGROUP_PATH() holds while using the buffer.
 */
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
extern void __init enable_debug_cgroup(void);

/*
 * cgroup_path() takes a spin lock. It is good practice not to take
 * spin locks within trace point handlers, as they are mostly hidden
 * from normal view. As cgroup_path() can take the kernfs_rename_lock
 * spin lock, it is best to not call that function from the trace event
 * handler.
 *
 * TRACE_CGROUP_PATH() therefore resolves the path at the call site: when
 * the cgroup_<type> trace event is enabled, it takes
 * trace_cgroup_path_lock with spin_lock_irqsave(), renders @cgrp's path
 * into the shared trace_cgroup_path buffer, emits the event with that
 * buffer plus any extra arguments, and then releases the lock.
 *
 * Note: trace_cgroup_##type##_enabled() is a static branch that will only
 *       be set when the trace event is enabled.
 */
#define TRACE_CGROUP_PATH(type, cgrp, ...)                                \
        do {                                                                \
                if (trace_cgroup_##type##_enabled()) {                        \
                        unsigned long flags;                                \
                        spin_lock_irqsave(&trace_cgroup_path_lock,        \
                                          flags);                        \
                        cgroup_path(cgrp, trace_cgroup_path,                \
                                    TRACE_CGROUP_PATH_LEN);                \
                        trace_cgroup_##type(cgrp, trace_cgroup_path,        \
                                            ##__VA_ARGS__);                \
                        spin_unlock_irqrestore(&trace_cgroup_path_lock, \
                                               flags);                        \
                }                                                        \
        } while (0)

/*
 * The cgroup filesystem superblock creation/mount context.
 *
 * The kernfs context is embedded as ->kfc; cgroup_fc2context() recovers
 * the enclosing cgroup_fs_context from it with container_of().
 */
struct cgroup_fs_context {
        struct kernfs_fs_context kfc;
        struct cgroup_root        *root;
        struct cgroup_namespace        *ns;
        unsigned int        flags;                        /* CGRP_ROOT_* flags */

        /* cgroup1 bits */
        bool                cpuset_clone_children;
        bool                none;                        /* User explicitly requested empty subsystem */
        bool                all_ss;                        /* Seen 'all' option */
        u16                subsys_mask;                /* Selected subsystems */
        char                *name;                        /* Hierarchy name */
        char                *release_agent;                /* Path for release notifications */
};

/*
 * Map a generic fs_context to the cgroup_fs_context that embeds the
 * kernfs_fs_context stored in @fc->fs_private.
 */
static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
{
        struct kernfs_fs_context *kernfs_ctx;

        kernfs_ctx = fc->fs_private;
        return container_of(kernfs_ctx, struct cgroup_fs_context, kfc);
}

struct cgroup_pidlist;

/*
 * Context bundling the state used by the different kinds of cgroup files.
 * NOTE(review): presumably one instance per open cgroup file -- confirm
 * against the code that allocates it.
 */
struct cgroup_file_ctx {
        struct cgroup_namespace        *ns;

        /* NOTE(review): presumably the trigger of a pressure (PSI) file -- confirm */
        struct {
                void                        *trigger;
        } psi;

        /*
         * Task iterator state.
         * NOTE(review): "started" presumably records whether iter has been
         * initialised -- confirm.
         */
        struct {
                bool                        started;
                struct css_task_iter        iter;
        } procs;

        /* NOTE(review): the "1" suffix presumably means cgroup v1 -- confirm */
        struct {
                struct cgroup_pidlist        *pidlist;
        } procs1;

        struct cgroup_of_peak peak;
};

/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 *
 * Each link is therefore a member of two lists at once: one anchored at
 * the cgroup and one anchored at the css_set.
 */
struct cgrp_cset_link {
        /* the cgroup and css_set this link associates */
        struct cgroup                *cgrp;
        struct css_set                *cset;

        /* list of cgrp_cset_links anchored at cgrp->cset_links */
        struct list_head        cset_link;

        /* list of cgrp_cset_links anchored at css_set->cgrp_links */
        struct list_head        cgrp_link;
};

/* used to track tasks and csets during migration */
struct cgroup_taskset {
        /* the src and dst cset list running through cset->mg_node */
        struct list_head        src_csets;
        struct list_head        dst_csets;

        /* the number of tasks in the set */
        int                        nr_tasks;

        /* the subsys currently being processed */
        int                        ssid;

        /*
         * Fields for cgroup_taskset_*() iteration.
         *
         * Before migration is committed, the target migration tasks are on
         * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
         * the csets on ->dst_csets.  ->csets point to either ->src_csets
         * or ->dst_csets depending on whether migration is committed.
         *
         * ->cur_cset and ->cur_task point to the current task position
         * during iteration.
         */
        struct list_head        *csets;
        struct css_set                *cur_cset;
        struct task_struct        *cur_task;
};

/*
 * migration context also tracks preloading
 *
 * Note that CGROUP_MGCTX_INIT() initializes the first three members
 * positionally, so their order here must match that macro.
 */
struct cgroup_mgctx {
        /*
         * Preloaded source and destination csets.  Used to guarantee
         * atomic success or failure on actual migration.
         */
        struct list_head        preloaded_src_csets;
        struct list_head        preloaded_dst_csets;

        /* tasks and csets to migrate */
        struct cgroup_taskset        tset;

        /* subsystems affected by migration */
        u16                        ss_mask;
};

/*
 * Static initializer for the struct cgroup_taskset named @tset: both cset
 * list heads are set up with LIST_HEAD_INIT() and ->csets starts out
 * pointing at ->src_csets.  Members not listed are left to the implicit
 * zero initialization of the remaining fields.
 */
#define CGROUP_TASKSET_INIT(tset)                                                \
{                                                                                \
        .src_csets                = LIST_HEAD_INIT(tset.src_csets),                \
        .dst_csets                = LIST_HEAD_INIT(tset.dst_csets),                \
        .csets                        = &tset.src_csets,                                \
}

/*
 * Static initializer for the struct cgroup_mgctx named @name.  The
 * initializers are positional (no designators), so they rely on
 * preloaded_src_csets, preloaded_dst_csets and tset being the first three
 * members of struct cgroup_mgctx, in that order.
 */
#define CGROUP_MGCTX_INIT(name)                                                        \
{                                                                                \
        LIST_HEAD_INIT(name.preloaded_src_csets),                                \
        LIST_HEAD_INIT(name.preloaded_dst_csets),                                \
        CGROUP_TASKSET_INIT(name.tset),                                                \
}

/* Define a struct cgroup_mgctx variable @name initialized with CGROUP_MGCTX_INIT(). */
#define DEFINE_CGROUP_MGCTX(name)                                                \
        struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)

extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
extern bool cgrp_dfl_visible;

/*
 * iterate across the hierarchies
 *
 * Walks cgroup_roots via ->root_list using list_for_each_entry_rcu();
 * holding cgroup_mutex is passed as the lockdep condition.
 */
#define for_each_root(root)                                                \
        list_for_each_entry_rcu((root), &cgroup_roots, root_list,        \
                                lockdep_is_held(&cgroup_mutex))

/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * The "|| true" makes the assignment to @ss part of the loop condition
 * without letting its value end the loop: termination depends only on
 * @ssid reaching CGROUP_SUBSYS_COUNT.
 */
#define for_each_subsys(ss, ssid)                                        \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&                \
             (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)

/* A cgroup counts as dead once CSS_ONLINE is clear in its self css flags. */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
        return (cgrp->self.flags & CSS_ONLINE) == 0;
}

/* Report whether the CGRP_NOTIFY_ON_RELEASE bit is set in @cgrp->flags. */
static inline bool notify_on_release(const struct cgroup *cgrp)
{
        bool notify = test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

        return notify;
}

void put_css_set_locked(struct css_set *cset);

/*
 * Drop a reference on @cset.
 *
 * Ensure that the refcount doesn't hit zero while any readers can see it:
 * refcount_dec_not_one() only drops the count when it is not one, so a
 * put that may be the final one is handed to put_css_set_locked() with
 * css_set_lock held and interrupts saved/disabled.  This is similar in
 * spirit to atomic_dec_and_lock().
 */
static inline void put_css_set(struct css_set *cset)
{
        unsigned long irq_flags;

        if (!refcount_dec_not_one(&cset->refcount)) {
                spin_lock_irqsave(&css_set_lock, irq_flags);
                put_css_set_locked(cset);
                spin_unlock_irqrestore(&css_set_lock, irq_flags);
        }
}

/*
 * refcounted get/put for css_set objects
 *
 * get_css_set() takes one additional reference on @cset via
 * refcount_inc(); the reference is dropped again with put_css_set().
 */
static inline void get_css_set(struct css_set *cset)
{
        refcount_inc(&cset->refcount);
}

bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                     struct cgroup_root *root);
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
                          struct cgroup_namespace *ns);

void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
int cgroup_do_get_tree(struct fs_context *fc);

int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
                            struct cgroup_mgctx *mgctx);
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
                   struct cgroup_mgctx *mgctx);

int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup);
void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
                        struct task_struct *tsk);
void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
                          struct task_struct *tsk);
/*
 * cgroup_procs_write_start() and cgroup_procs_write_finish() are annotated
 * as acquiring and releasing cgroup_threadgroup_rwsem respectively, so the
 * two are meant to be used as a pair.
 */
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                                             enum cgroup_attach_lock_mode *lock_mode)
        __acquires(&cgroup_threadgroup_rwsem);
void cgroup_procs_write_finish(struct task_struct *task,
                               enum cgroup_attach_lock_mode lock_mode)
        __releases(&cgroup_threadgroup_rwsem);

void cgroup_lock_and_drain_offline(struct cgroup *cgrp);

int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
                     struct kernfs_root *kf_root);

int __cgroup_task_count(const struct cgroup *cgrp);
int cgroup_task_count(const struct cgroup *cgrp);

/*
 * rstat.c
 */
int css_rstat_init(struct cgroup_subsys_state *css);
void css_rstat_exit(struct cgroup_subsys_state *css);
int ss_rstat_init(struct cgroup_subsys *ss);
void cgroup_base_stat_cputime_show(struct seq_file *seq);

/*
 * namespace.c
 */
extern const struct proc_ns_operations cgroupns_operations;

/*
 * cgroup-v1.c
 */
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
extern const struct fs_parameter_spec cgroup1_fs_parameters[];

int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
void cgroup1_check_for_release(struct cgroup *cgrp);
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
int cgroup1_get_tree(struct fs_context *fc);
int cgroup1_reconfigure(struct fs_context *ctx);

#endif /* __CGROUP_INTERNAL_H */

























































































































































  318 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  319 











































































  318 





  316 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  318 

















































































































































































































































  317 










































































































































































































































































































































































































































































































  318 








  319 














  318 











  319 



















































































































































































































































































































































































































































































































































































































































































































































































































































  315 


  318 
































































































  317 


  315 























  314 






















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Interfaces handler.
 *
 * Version:        @(#)dev.h        1.0.10        08/12/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Donald J. Becker, <becker@cesdis.gsfc.nasa.gov>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Bjorn Ekwall. <bj0rn@blox.se>
 *              Pekka Riikonen <priikone@poseidon.pspt.fi>
 *
 *                Moved to /usr/include/linux for NET3
 */
#ifndef _LINUX_NETDEVICE_H
#define _LINUX_NETDEVICE_H

#include <linux/timer.h>
#include <linux/bug.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
#include <asm/cache.h>
#include <asm/byteorder.h>
#include <asm/local.h>

#include <linux/percpu.h>
#include <linux/rculist.h>
#include <linux/workqueue.h>
#include <linux/dynamic_queue_limits.h>

#include <net/net_namespace.h>
#ifdef CONFIG_DCB
#include <net/dcbnl.h>
#endif
#include <net/netprio_cgroup.h>
#include <linux/netdev_features.h>
#include <linux/neighbour.h>
#include <linux/netdevice_xmit.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
#include <uapi/linux/netdev.h>
#include <linux/hashtable.h>
#include <linux/rbtree.h>
#include <net/net_trackers.h>
#include <net/net_debug.h>
#include <net/dropreason-core.h>
#include <net/neighbour_tables.h>

/*
 * Forward declarations. These struct types are only named here, not
 * defined, so the rest of this header can refer to them by pointer
 * without needing their full definitions at this point.
 */
struct netpoll_info;
struct device;
struct ethtool_ops;
struct kernel_hwtstamp_config;
struct phy_device;
struct dsa_port;
struct ip_tunnel_parm_kern;
struct macsec_context;
struct macsec_ops;
struct netdev_config;
struct netdev_name_node;
struct sd_flow_limit;
struct sfp_bus;
/* 802.11 specific */
struct wireless_dev;
/* 802.15.4 specific */
struct wpan_dev;
struct mpls_dev;
/* UDP Tunnel offloads */
struct udp_tunnel_info;
struct udp_tunnel_nic_info;
struct udp_tunnel_nic;
struct bpf_prog;
struct xdp_buff;
struct xdp_frame;
struct xdp_metadata_ops;
struct xdp_md;
struct ethtool_netdev_state;
struct phy_link_topology;
struct hwtstamp_provider;

/*
 * XDP feature set, carried as a 32-bit value.
 * NOTE(review): presumably a bitmask of the XDP feature flags from
 * uapi/linux/netdev.h - confirm against the users of this type.
 */
typedef u32 xdp_features_t;

/* Prototypes only; the implementations are not part of this header. */
void synchronize_net(void);
void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops);
void netdev_sw_irq_coalesce_default_on(struct net_device *dev);

/* Backlog congestion levels */
#define NET_RX_SUCCESS                0        /* keep 'em coming, baby */
#define NET_RX_DROP                1        /* packet dropped */

/*
 * NOTE(review): presumably the maximum nesting depth of stacked network
 * devices - the users of this limit are not visible here, confirm.
 */
#define MAX_NEST_DEV 8

/*
 * Transmit return codes: transmit return codes originate from three different
 * namespaces:
 *
 * - qdisc return codes
 * - driver transmit return codes
 * - errno values
 *
 * Drivers are allowed to return any one of those in their hard_start_xmit()
 * function. Real network devices commonly used with qdiscs should only return
 * the driver transmit return codes though - when qdiscs are used, the actual
 * transmission happens asynchronously, so the value is not propagated to
 * higher layers. Virtual network devices transmit synchronously; in this case
 * the driver transmit return codes are consumed by dev_queue_xmit(), and all
 * others are propagated to higher layers.
 */

/* Return codes of a qdisc's ->enqueue() operation. */
#define NET_XMIT_SUCCESS        0x00
#define NET_XMIT_DROP           0x01        /* skb dropped */
#define NET_XMIT_CN             0x02        /* congestion notification */
#define NET_XMIT_MASK           0x0f        /* qdisc flags in net/sch_generic.h */

/*
 * NET_XMIT_CN needs special treatment: it does not guarantee that this
 * packet was lost. It only signals that the device will soon start
 * dropping packets, or is already dropping some packets of the same
 * priority, which is a prompt for us to send less aggressively.
 *
 * net_xmit_eval(e):  NET_XMIT_CN becomes 0, any other value is passed
 *                    through unchanged.
 * net_xmit_errno(e): NET_XMIT_CN becomes 0, any other value becomes
 *                    -ENOBUFS.
 */
#define net_xmit_eval(e)        ((e) != NET_XMIT_CN ? (e) : 0)
#define net_xmit_errno(e)       ((e) == NET_XMIT_CN ? 0 : -ENOBUFS)

/*
 * Driver transmit return codes.
 *
 * NETDEV_TX_MASK (0xf0) covers the nibble directly above NET_XMIT_MASK
 * (0x0f); NETDEV_TX_BUSY (0x10) lies inside that nibble.
 */
#define NETDEV_TX_MASK                0xf0

/*
 * The INT_MIN enumerator carries no meaning of its own; it is listed only
 * so that the enum gets a signed underlying type.
 */
enum netdev_tx {
        __NETDEV_TX_MIN         = INT_MIN,        /* make sure enum is signed */
        NETDEV_TX_OK         = 0x00,        /* driver took care of packet */
        NETDEV_TX_BUSY         = 0x10,        /* driver tx path was busy */
};
typedef enum netdev_tx netdev_tx_t;

/*
 * The numeric ordering NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is relied upon
 * here: any hard_start_xmit() return value below NET_XMIT_MASK means the
 * skb was consumed.
 *
 * Returns true when @rc is below NET_XMIT_MASK, false otherwise.
 */
static inline bool dev_xmit_complete(int rc)
{
        /*
         * rc < NET_XMIT_MASK covers every case in which a driver has
         * consumed the skb:
         * - successful transmission (rc == NETDEV_TX_OK)
         * - error while transmitting (rc < 0)
         * - error while queueing to a different device
         *   (rc & NET_XMIT_MASK)
         * This is the expected outcome, hence the likely() hint.
         */
        return likely(rc < NET_XMIT_MASK);
}

/*
 * Compute the worst-case header length according to the protocols
 * used.
 *
 * LL_MAX_HEADER:
 *   - 128 when CONFIG_HYPERV_NET is defined;
 *   - otherwise, when CONFIG_WLAN is defined or CONFIG_AX25 is enabled:
 *     128 with CONFIG_MAC80211_MESH, 96 without;
 *   - otherwise 32.
 *
 * MAX_HEADER equals LL_MAX_HEADER unless at least one of CONFIG_NET_IPIP,
 * CONFIG_NET_IPGRE, CONFIG_IPV6_SIT or CONFIG_IPV6_TUNNEL is enabled, in
 * which case 48 more bytes are added on top.
 */

#if defined(CONFIG_HYPERV_NET)
# define LL_MAX_HEADER 128
#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25)
# if defined(CONFIG_MAC80211_MESH)
#  define LL_MAX_HEADER 128
# else
#  define LL_MAX_HEADER 96
# endif
#else
# define LL_MAX_HEADER 32
#endif

#if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \
    !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL)
#define MAX_HEADER LL_MAX_HEADER
#else
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif

/*
 * Old network device statistics. Fields are native words
 * (unsigned long) so they can be read and written atomically.
 */

/*
 * NET_DEV_STAT(FIELD) declares one counter as an anonymous union, so the
 * same storage is reachable both as a plain unsigned long named FIELD and
 * as an atomic_long_t named __FIELD.
 */
#define NET_DEV_STAT(FIELD)                        \
        union {                                        \
                unsigned long FIELD;                \
                atomic_long_t __##FIELD;        \
        }

struct net_device_stats {
        NET_DEV_STAT(rx_packets);
        NET_DEV_STAT(tx_packets);
        NET_DEV_STAT(rx_bytes);
        NET_DEV_STAT(tx_bytes);
        NET_DEV_STAT(rx_errors);
        NET_DEV_STAT(tx_errors);
        NET_DEV_STAT(rx_dropped);
        NET_DEV_STAT(tx_dropped);
        NET_DEV_STAT(multicast);
        NET_DEV_STAT(collisions);
        NET_DEV_STAT(rx_length_errors);
        NET_DEV_STAT(rx_over_errors);
        NET_DEV_STAT(rx_crc_errors);
        NET_DEV_STAT(rx_frame_errors);
        NET_DEV_STAT(rx_fifo_errors);
        NET_DEV_STAT(rx_missed_errors);
        NET_DEV_STAT(tx_aborted_errors);
        NET_DEV_STAT(tx_carrier_errors);
        NET_DEV_STAT(tx_fifo_errors);
        NET_DEV_STAT(tx_heartbeat_errors);
        NET_DEV_STAT(tx_window_errors);
        NET_DEV_STAT(rx_compressed);
        NET_DEV_STAT(tx_compressed);
};
/* The helper is only meant for the struct above; do not leak it to includers. */
#undef NET_DEV_STAT

/* per-cpu stats, allocated on demand.
 * Try to fit them in a single cache line, for dev_get_stats() sake.
 *
 * The struct consists of exactly four unsigned long counters and is
 * aligned to 4 * sizeof(unsigned long), i.e. to its own size.
 */
struct net_device_core_stats {
        unsigned long        rx_dropped;
        unsigned long        tx_dropped;
        unsigned long        rx_nohandler;
        unsigned long        rx_otherhost_dropped;
} __aligned(4 * sizeof(unsigned long));

#include <linux/cache.h>
#include <linux/skbuff.h>

/* Forward declarations for types used by pointer in the definitions below. */
struct neighbour;
struct neigh_parms;
struct sk_buff;

/*
 * One hardware address entry. It embeds both a list_head and an rb_node
 * linkage, plus an rcu_head.
 */
struct netdev_hw_addr {
        struct list_head        list;
        struct rb_node                node;
        /* Raw address bytes, in a buffer of MAX_ADDR_LEN bytes. */
        unsigned char                addr[MAX_ADDR_LEN];
        /* NOTE(review): presumably one of NETDEV_HW_ADDR_T_* below - confirm. */
        unsigned char                type;
#define NETDEV_HW_ADDR_T_LAN                1
#define NETDEV_HW_ADDR_T_SAN                2
#define NETDEV_HW_ADDR_T_UNICAST        3
#define NETDEV_HW_ADDR_T_MULTICAST        4
        bool                        global_use;
        /*
         * A non-zero value makes netdev_for_each_synced_uc_addr() and
         * netdev_for_each_synced_mc_addr() visit this entry.
         */
        int                        sync_cnt;
        int                        refcount;
        int                        synced;
        struct rcu_head                rcu_head;
};

/*
 * A collection of hardware addresses: a list head, an entry counter and
 * an rb-tree root.
 */
struct netdev_hw_addr_list {
        /* Head walked by netdev_hw_addr_list_for_each(). */
        struct list_head        list;
        /* Value reported by netdev_hw_addr_list_count(). */
        int                        count;

        /* Auxiliary tree for faster lookup on addition and deletion */
        struct rb_root                tree;
};

/*
 * Generic helpers; @l points to an address list:
 * read its counter, test it for emptiness, and walk its entries.
 */
#define netdev_hw_addr_list_count(l)        ((l)->count)
#define netdev_hw_addr_list_empty(l)        (!netdev_hw_addr_list_count(l))
#define netdev_hw_addr_list_for_each(ha, l)        \
        list_for_each_entry(ha, &(l)->list, list)

/*
 * The same helpers applied to the ->uc list of @dev. The _synced_
 * iterator runs the loop body only for entries whose sync_cnt is non-zero.
 */
#define netdev_uc_count(dev)        netdev_hw_addr_list_count(&(dev)->uc)
#define netdev_uc_empty(dev)        netdev_hw_addr_list_empty(&(dev)->uc)
#define netdev_for_each_uc_addr(ha, dev)        \
        netdev_hw_addr_list_for_each(ha, &(dev)->uc)
#define netdev_for_each_synced_uc_addr(_ha, _dev)        \
        netdev_for_each_uc_addr((_ha), (_dev))                \
                if ((_ha)->sync_cnt)

/*
 * The same helpers applied to the ->mc list of @dev. The _synced_
 * iterator runs the loop body only for entries whose sync_cnt is non-zero.
 */
#define netdev_mc_count(dev)        netdev_hw_addr_list_count(&(dev)->mc)
#define netdev_mc_empty(dev)        netdev_hw_addr_list_empty(&(dev)->mc)
#define netdev_for_each_mc_addr(ha, dev)        \
        netdev_hw_addr_list_for_each(ha, &(dev)->mc)
#define netdev_for_each_synced_mc_addr(_ha, _dev)        \
        netdev_for_each_mc_addr((_ha), (_dev))                \
                if ((_ha)->sync_cnt)

/*
 * Cached hardware header: a length, a seqlock and the header storage.
 *
 * HH_DATA_MOD        alignment unit, in bytes, of the cached header data.
 * HH_DATA_OFF(len)   padding that brings @len up to the next multiple of
 *                    HH_DATA_MOD (0 when @len already is a multiple).
 * HH_DATA_ALIGN(len) @len rounded up to a multiple of HH_DATA_MOD.
 *
 * hh_data is an array of unsigned long that is sized to hold
 * HH_DATA_ALIGN(LL_MAX_HEADER) bytes.
 */
struct hh_cache {
        unsigned int        hh_len;
        seqlock_t        hh_lock;

        /* cached hardware header; allow for machine alignment needs.        */
#define HH_DATA_MOD        16
        /*
         * The argument is parenthesized so that an expression argument
         * (e.g. "a | b" or "c ? x : y") is evaluated as a whole before
         * the "- 1", matching what HH_DATA_ALIGN() already does.
         */
#define HH_DATA_OFF(__len) \
        (HH_DATA_MOD - ((((__len) - 1) & (HH_DATA_MOD - 1)) + 1))
#define HH_DATA_ALIGN(__len) \
        (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1))
        unsigned long        hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
};

/*
 * Space to reserve for the link-layer header.
 *
 * The sum of dev->hard_header_len, dev->needed_headroom (read through
 * READ_ONCE()) and, for the _EXTRA variant, @extra is rounded down to a
 * multiple of HH_DATA_MOD, and one further HH_DATA_MOD is then added.
 * The result is thus always a multiple of HH_DATA_MOD that is strictly
 * greater than that sum, so at least HH_DATA_MOD bytes get reserved.
 *
 * An alternative would be:
 *   dev->hard_header_len ? (dev->hard_header_len +
 *                           (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0
 *
 * Other alignment values could be used, but the relationship
 * HH alignment <= LL alignment must be maintained.
 */
#define LL_RESERVED_SPACE_EXTRA(dev, extra) \
        ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \
          & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
/* Same computation with no extra bytes requested. */
#define LL_RESERVED_SPACE(dev) \
        LL_RESERVED_SPACE_EXTRA(dev, 0)

/*
 * Table of link-layer header callbacks.
 *
 * NOTE(review): the per-hook remarks below are derived from the names
 * and signatures only; the callers, and whether a hook may be left NULL,
 * are not visible here - confirm before relying on them.
 */
struct header_ops {
        /* presumably builds the link-layer header of @skb for @dev */
        int        (*create) (struct sk_buff *skb, struct net_device *dev,
                           unsigned short type, const void *daddr,
                           const void *saddr, unsigned int len);
        /* presumably extracts a hardware address from @skb into @haddr */
        int        (*parse)(const struct sk_buff *skb, unsigned char *haddr);
        /* presumably fills the cached header @hh for @neigh */
        int        (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type);
        /* presumably refreshes @hh with the new address @haddr */
        void        (*cache_update)(struct hh_cache *hh,
                                const struct net_device *dev,
                                const unsigned char *haddr);
        /* presumably checks a caller-supplied header of @len bytes */
        bool        (*validate)(const char *ll_header, unsigned int len);
        /* returns a big-endian 16-bit value; presumably the protocol of @skb */
        __be16        (*parse_protocol)(const struct sk_buff *skb);
};

/* These flag bits are private to the generic network queueing
 * layer; they may not be explicitly referenced by any other
 * code.
 *
 * The enumerators take consecutive values starting at 0.
 * NOTE(review): presumably they are bit numbers into a state word
 * rather than masks - confirm against the code that uses them.
 */

enum netdev_state_t {
        __LINK_STATE_START,
        __LINK_STATE_PRESENT,
        __LINK_STATE_NOCARRIER,
        __LINK_STATE_LINKWATCH_PENDING,
        __LINK_STATE_DORMANT,
        __LINK_STATE_TESTING,
};

/*
 * A list head paired with an int counter; used as the element type of
 * gro_node::hash[].
 * NOTE(review): count presumably tracks the number of entries on list -
 * confirm against the code that updates it.
 */
struct gro_list {
        struct list_head        list;
        int                        count;
};

/*
 * Number of GRO hash buckets, i.e. the size of gro_node::hash[]; must be
 * <= the number of bits in gro_node::bitmask (an unsigned long).
 */
#define GRO_HASH_BUCKETS        8

/**
 * struct gro_node - structure to support Generic Receive Offload
 * @bitmask: bitmask to indicate used buckets in @hash
 * @hash: hashtable of pending aggregated skbs, separated by flows;
 *        holds %GRO_HASH_BUCKETS buckets
 * @rx_list: list of pending ``GRO_NORMAL`` skbs
 * @rx_count: cached current length of @rx_list
 * @cached_napi_id: napi_struct::napi_id cached for hotpath, 0 for standalone
 */
struct gro_node {
        unsigned long                bitmask;
        struct gro_list                hash[GRO_HASH_BUCKETS];
        struct list_head        rx_list;
        u32                        rx_count;
        u32                        cached_napi_id;
};

/*
 * Structure for per-NAPI config
 *
 * gro_flush_timeout, irq_suspend_timeout, defer_hard_irqs and napi_id
 * share their names with members of struct napi_struct. Note that the
 * two timeouts are u64 here but unsigned long in struct napi_struct.
 * NOTE(review): presumably these are the stored settings that get
 * applied to a napi_struct through its ->config pointer - confirm.
 */
struct napi_config {
        u64 gro_flush_timeout;
        u64 irq_suspend_timeout;
        u32 defer_hard_irqs;
        cpumask_t affinity_mask;
        u8 threaded;
        unsigned int napi_id;
};

/*
 * Structure for NAPI scheduling similar to tasklet but with weighting
 *
 * The members are grouped; the in-line comments below mark where the
 * fields write-protected by netdev_lock begin and where the
 * control-path-only fields begin.
 */
struct napi_struct {
        /* The poll_list must only be managed by the entity which
         * changes the state of the NAPI_STATE_SCHED bit.  This means
         * whoever atomically sets that bit can add this napi_struct
         * to the per-CPU poll_list, and whoever clears that bit
         * can remove from the list right before clearing the bit.
         */
        struct list_head        poll_list;

        /* State word; NAPI_STATE_SCHED (see above) is one of its bits. */
        unsigned long                state;
        int                        weight;
        u32                        defer_hard_irqs_count;
        /*
         * Poll callback.
         * NOTE(review): the int argument is presumably the poll budget
         * and the return value the amount of work done - confirm.
         */
        int                        (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
        /* CPU actively polling if netpoll is configured */
        int                        poll_owner;
#endif
        /* CPU on which NAPI has been scheduled for processing */
        int                        list_owner;
        struct net_device        *dev;
        struct sk_buff                *skb;
        struct gro_node                gro;
        struct hrtimer                timer;
        /* all fields past this point are write-protected by netdev_lock */
        struct task_struct        *thread;
        unsigned long                gro_flush_timeout;
        unsigned long                irq_suspend_timeout;
        u32                        defer_hard_irqs;
        /* control-path-only fields follow */
        u32                        napi_id;
        struct list_head        dev_list;
        struct hlist_node        napi_hash_node;
        int                        irq;
        struct irq_affinity_notify notify;
        int                        napi_rmap_idx;
        int                        index;
        /* Points to a struct napi_config (the per-NAPI config structure). */
        struct napi_config        *config;
};

/*
 * Bit numbers within napi_struct::state. They are used with test_bit() by the
 * inline helpers in this file; the NAPIF_STATE_* constants are the
 * corresponding BIT() masks.
 */
enum {
        NAPI_STATE_SCHED,                /* Poll is scheduled */
        NAPI_STATE_MISSED,                /* reschedule a napi */
        NAPI_STATE_DISABLE,                /* Disable pending */
        NAPI_STATE_NPSVC,                /* Netpoll - don't dequeue from poll_list */
        NAPI_STATE_LISTED,                /* NAPI added to system lists */
        NAPI_STATE_NO_BUSY_POLL,        /* Do not add in napi_hash, no busy polling */
        NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_PREFER_BUSY_POLL,        /* prefer busy-polling over softirq processing */
        NAPI_STATE_THREADED,                /* The poll is performed inside its own thread */
        NAPI_STATE_SCHED_THREADED,        /* Napi is currently scheduled in threaded mode */
        NAPI_STATE_HAS_NOTIFIER,        /* Napi has an IRQ notifier */
};

/*
 * Mask form of the NAPI_STATE_* bit numbers: each NAPIF_STATE_x is
 * BIT(NAPI_STATE_x), for code that operates on a whole napi_struct::state
 * word (e.g. napi_if_scheduled_mark_missed()).
 */
enum {
        NAPIF_STATE_SCHED                = BIT(NAPI_STATE_SCHED),
        NAPIF_STATE_MISSED                = BIT(NAPI_STATE_MISSED),
        NAPIF_STATE_DISABLE                = BIT(NAPI_STATE_DISABLE),
        NAPIF_STATE_NPSVC                = BIT(NAPI_STATE_NPSVC),
        NAPIF_STATE_LISTED                = BIT(NAPI_STATE_LISTED),
        NAPIF_STATE_NO_BUSY_POLL        = BIT(NAPI_STATE_NO_BUSY_POLL),
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL        = BIT(NAPI_STATE_PREFER_BUSY_POLL),
        NAPIF_STATE_THREADED                = BIT(NAPI_STATE_THREADED),
        NAPIF_STATE_SCHED_THREADED        = BIT(NAPI_STATE_SCHED_THREADED),
        NAPIF_STATE_HAS_NOTIFIER        = BIT(NAPI_STATE_HAS_NOTIFIER),
};

/*
 * GRO result codes, also available through the gro_result_t typedef.
 * NOTE(review): the meaning of the individual values is not visible in this
 * header; consult the GRO implementation before relying on them.
 */
enum gro_result {
        GRO_MERGED,
        GRO_MERGED_FREE,
        GRO_HELD,
        GRO_NORMAL,
        GRO_CONSUMED,
};
typedef enum gro_result gro_result_t;

/*
 * enum rx_handler_result - Possible return values for rx_handlers.
 * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it
 * further.
 * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in
 * case skb->dev was changed by rx_handler.
 * @RX_HANDLER_EXACT: Force exact delivery, no wildcard.
 * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called.
 *
 * rx_handlers are functions called from inside __netif_receive_skb(), to do
 * special processing of the skb, prior to delivery to protocol handlers.
 *
 * Currently, a net_device can only have a single rx_handler registered. Trying
 * to register a second rx_handler will return -EBUSY.
 *
 * To register a rx_handler on a net_device, use netdev_rx_handler_register().
 * To unregister a rx_handler on a net_device, use
 * netdev_rx_handler_unregister().
 *
 * Upon return, rx_handler is expected to tell __netif_receive_skb() what to
 * do with the skb.
 *
 * If the rx_handler consumed the skb in some way, it should return
 * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for
 * the skb to be delivered in some other way.
 *
 * If the rx_handler changed skb->dev, to divert the skb to another
 * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the
 * new device will be called if it exists.
 *
 * If the rx_handler decides the skb should be ignored, it should return
 * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that
 * are registered on exact device (ptype->dev == skb->dev).
 *
 * If the rx_handler didn't change skb->dev, but wants the skb to be normally
 * delivered, it should return RX_HANDLER_PASS.
 *
 * A device without a registered rx_handler will behave as if rx_handler
 * returned RX_HANDLER_PASS.
 */

enum rx_handler_result {
        /* skb was consumed by the rx_handler, do not process it further */
        RX_HANDLER_CONSUMED,
        /* skb->dev was changed, do another round in the receive path */
        RX_HANDLER_ANOTHER,
        /* force exact delivery (ptype->dev == skb->dev), no wildcard */
        RX_HANDLER_EXACT,
        /* do nothing, pass the skb as if no rx_handler was called */
        RX_HANDLER_PASS,
};
typedef enum rx_handler_result rx_handler_result_t;
/*
 * Signature of an rx_handler: receives a pointer to the skb pointer and
 * returns one of the RX_HANDLER_* values above.
 */
typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);

/*
 * Out-of-line scheduling primitives. The inline wrappers napi_schedule() and
 * napi_schedule_irqoff() call these only after napi_schedule_prep() has
 * returned true; the _irqoff variant is the one used when hard irqs are
 * assumed to be masked.
 */
void __napi_schedule(struct napi_struct *n);
void __napi_schedule_irqoff(struct napi_struct *n);

/* Tell whether the NAPI_STATE_DISABLE ("Disable pending") bit is set on @n. */
static inline bool napi_disable_pending(struct napi_struct *n)
{
        const unsigned long *state_word = &n->state;

        return test_bit(NAPI_STATE_DISABLE, state_word);
}

/* Tell whether the NAPI_STATE_PREFER_BUSY_POLL bit is set on @n. */
static inline bool napi_prefer_busy_poll(struct napi_struct *n)
{
        const unsigned long *state_word = &n->state;

        return test_bit(NAPI_STATE_PREFER_BUSY_POLL, state_word);
}

/**
 * napi_is_scheduled - test if NAPI is scheduled
 * @n: NAPI context
 *
 * Best-effort check: no locking is involved, so the NAPI may become
 * scheduled or finish immediately after the test, and the answer may
 * already be stale when the caller acts on it.
 *
 * NAPI_STATE_SCHED is internal state. Drivers should normally just call
 * napi_schedule() instead of testing the bit themselves.
 *
 * Only use this when a driver genuinely has to know whether a NAPI is
 * scheduled, for example from a delayed timer that may be skipped when
 * the NAPI is already scheduled.
 *
 * Return: True if NAPI is scheduled, False otherwise.
 */
static inline bool napi_is_scheduled(struct napi_struct *n)
{
        const unsigned long *state_word = &n->state;

        return test_bit(NAPI_STATE_SCHED, state_word);
}

/*
 * Gatekeeper for scheduling, defined out of line: napi_schedule() and
 * napi_schedule_irqoff() only queue the NAPI when this returns true.
 */
bool napi_schedule_prep(struct napi_struct *n);

/**
 *        napi_schedule - schedule NAPI poll
 *        @n: NAPI context
 *
 * Ask for the NAPI poll routine to be called, unless it is already
 * running. The request is only queued with __napi_schedule() when
 * napi_schedule_prep() allows it; see that function for the reasons
 * a NAPI might not be scheduled.
 * Return: true if we schedule a NAPI or false if not.
 */
static inline bool napi_schedule(struct napi_struct *n)
{
        if (!napi_schedule_prep(n))
                return false;

        __napi_schedule(n);
        return true;
}

/**
 *        napi_schedule_irqoff - schedule NAPI poll
 *        @n: NAPI context
 *
 * Same as napi_schedule(), but for callers that run with hard irqs
 * masked. Nothing is returned.
 */
static inline void napi_schedule_irqoff(struct napi_struct *n)
{
        if (!napi_schedule_prep(n))
                return;

        __napi_schedule_irqoff(n);
}

/**
 * napi_complete_done - NAPI processing complete
 * @n: NAPI context
 * @work_done: number of packets processed
 *
 * Mark NAPI processing as complete. Should only be called if poll budget
 * has not been completely consumed.
 * Preferred over napi_complete(), which is a wrapper that calls this
 * function with @work_done set to 0.
 * Return: false if device should avoid rearming interrupts.
 */
bool napi_complete_done(struct napi_struct *n, int work_done);

/*
 * napi_complete - napi_complete_done() with a packet count of zero
 *
 * Returns whatever napi_complete_done() returns for @n.
 */
static inline bool napi_complete(struct napi_struct *n)
{
        const int no_work_done = 0;

        return napi_complete_done(n, no_work_done);
}

/*
 * Per-device threaded-NAPI control, defined out of line.
 * NOTE(review): presumably these drive the NAPI_STATE_THREADED mode ("the
 * poll is performed inside its own thread") for the NAPIs of @dev - confirm
 * against the implementation, including the meaning of the int returned by
 * dev_set_threaded().
 */
void netif_threaded_enable(struct net_device *dev);
int dev_set_threaded(struct net_device *dev,
                     enum netdev_napi_threaded threaded);

/*
 * NAPI enable/disable entry points, defined out of line.
 * NOTE(review): the *_locked variants presumably expect the caller to
 * already hold the lock that the plain variants take themselves (likely the
 * netdev lock) - confirm against the implementation.
 */
void napi_disable(struct napi_struct *n);
void napi_disable_locked(struct napi_struct *n);

void napi_enable(struct napi_struct *n);
void napi_enable_locked(struct napi_struct *n);

/**
 *        napi_synchronize - wait until NAPI is not running
 *        @n: NAPI context
 *
 * On SMP builds, keep sleeping via msleep(1) for as long as
 * NAPI_STATE_SCHED is set on @n, so that any outstanding processing
 * has completed on return. Future activations are not disabled.
 * On non-SMP builds this reduces to a compiler barrier.
 */
static inline void napi_synchronize(const struct napi_struct *n)
{
        if (!IS_ENABLED(CONFIG_SMP)) {
                barrier();
                return;
        }

        while (test_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
}

/**
 *        napi_if_scheduled_mark_missed - if napi is running, set the
 *        NAPIF_STATE_MISSED
 *        @n: NAPI context
 *
 * The checks are made on a snapshot of @n->state, in this order:
 *
 * - NAPIF_STATE_DISABLE set: return true without modifying the state.
 * - NAPIF_STATE_SCHED clear: return false without modifying the state.
 * - otherwise: set NAPIF_STATE_MISSED with a compare-and-exchange and
 *   return true.
 *
 * Return: true if NAPI is scheduled (MISSED has been set) or a disable is
 * pending, false otherwise.
 */
static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
{
        unsigned long val, new;

        val = READ_ONCE(n->state);
        do {
                /* Disable pending: report true, leave the state untouched */
                if (val & NAPIF_STATE_DISABLE)
                        return true;

                /* Not scheduled: nothing to mark */
                if (!(val & NAPIF_STATE_SCHED))
                        return false;

                new = val | NAPIF_STATE_MISSED;
                /* Retry (re-running the checks above) if the exchange fails */
        } while (!try_cmpxchg(&n->state, &val, new));

        return true;
}

/*
 * Bit numbers for the TX queue state word; the QUEUE_STATE_* macros provide
 * the corresponding masks.
 */
enum netdev_queue_state_t {
        /* used by drivers to stop the transmit queue */
        __QUEUE_STATE_DRV_XOFF,
        /* used by the stack to stop the transmit queue independently */
        __QUEUE_STATE_STACK_XOFF,
        __QUEUE_STATE_FROZEN,
};

/* Single-bit masks built from the __QUEUE_STATE_* bit numbers. */
#define QUEUE_STATE_DRV_XOFF        (1 << __QUEUE_STATE_DRV_XOFF)
#define QUEUE_STATE_STACK_XOFF        (1 << __QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_FROZEN        (1 << __QUEUE_STATE_FROZEN)

/* Combined masks: either XOFF bit, optionally together with FROZEN. */
#define QUEUE_STATE_ANY_XOFF        (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \
                                        QUEUE_STATE_FROZEN)
#define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \
                                        QUEUE_STATE_FROZEN)

/*
 * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue.  The
 * netif_tx_* functions below are used to manipulate this flag.  The
 * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit
 * queue independently.  The netif_xmit_*stopped functions below are called
 * to check if the queue has been stopped by the driver or stack (either
 * of the XOFF bits is set in the state).  Drivers should not need to call
 * the netif_xmit_*stopped functions; they should only be using netif_tx_*.
 */

/*
 * Per-transmit-queue state. The members are grouped into a read-mostly
 * part, a write-mostly part and a slow-/control-path part; the structure
 * and its _xmit_lock member are cacheline aligned on SMP.
 */
struct netdev_queue {
/*
 * read-mostly part
 */
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;

        struct Qdisc __rcu        *qdisc;
        struct Qdisc __rcu        *qdisc_sleeping;
#ifdef CONFIG_SYSFS
        struct kobject                kobj;
        const struct attribute_group        **groups;
#endif
        unsigned long                tx_maxrate;
        /*
         * Number of TX timeouts for this queue
         * (/sys/class/net/DEV/Q/trans_timeout)
         */
        atomic_long_t                trans_timeout;

        /* Subordinate device that the queue has been assigned to */
        struct net_device        *sb_dev;
#ifdef CONFIG_XDP_SOCKETS
        /* "ops protected", see comment about net_device::lock */
        struct xsk_buff_pool    *pool;
#endif

/*
 * write-mostly part
 */
#ifdef CONFIG_BQL
        struct dql                dql;
#endif
        spinlock_t                _xmit_lock ____cacheline_aligned_in_smp;
        /* NOTE(review): presumably the CPU holding _xmit_lock - confirm */
        int                        xmit_lock_owner;
        /*
         * Time (in jiffies) of last Tx
         */
        unsigned long                trans_start;

        /* Queue state word; holds the __QUEUE_STATE_* bits */
        unsigned long                state;

/*
 * slow- / control-path part
 */
        /* NAPI instance for the queue
         * "ops protected", see comment about net_device::lock
         */
        struct napi_struct        *napi;

#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        /* Accessed through netdev_queue_numa_node_read()/_write() */
        int                        numa_node;
#endif
} ____cacheline_aligned_in_smp;

/*
 * Integer knobs defined elsewhere; read with READ_ONCE() by
 * net_has_fallback_tunnels() and net_inherit_devconf() respectively.
 */
extern int sysctl_fb_tunnels_only_for_init_net;
extern int sysctl_devconf_inherit_init_net;

/*
 * Tell whether @net gets fallback tunnels, depending on the value of
 * sysctl_fb_tunnels_only_for_init_net:
 *
 *   0               : every netns
 *   1               : init_net only
 *   anything else   : no netns (2 is the documented "none" value)
 *
 * When CONFIG_SYSCTL is not enabled the answer is always true.
 */
static inline bool net_has_fallback_tunnels(const struct net *net)
{
#if IS_ENABLED(CONFIG_SYSCTL)
        int mode = READ_ONCE(sysctl_fb_tunnels_only_for_init_net);

        if (mode == 0)
                return true;
        if (mode != 1)
                return false;

        return net_eq(net, &init_net);
#else
        return true;
#endif
}

/*
 * Current value of sysctl_devconf_inherit_init_net (sampled with
 * READ_ONCE()), or 0 when CONFIG_SYSCTL is not enabled.
 */
static inline int net_inherit_devconf(void)
{
        int inherit = 0;

#if IS_ENABLED(CONFIG_SYSCTL)
        inherit = READ_ONCE(sysctl_devconf_inherit_init_net);
#endif
        return inherit;
}

/*
 * NUMA node recorded in @q. Only builds with both CONFIG_XPS and
 * CONFIG_NUMA have the numa_node member; all others report NUMA_NO_NODE.
 */
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
{
        int node = NUMA_NO_NODE;

#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        node = q->numa_node;
#endif
        return node;
}

/*
 * Record @node as the NUMA node of @q. This is a no-op unless both
 * CONFIG_XPS and CONFIG_NUMA are defined, because the numa_node member only
 * exists in that configuration.
 */
static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node)
{
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        q->numa_node = node;
#endif
}

#ifdef CONFIG_RFS_ACCEL
/*
 * Only declared with CONFIG_RFS_ACCEL. @flow_id is the flow ID previously
 * handed to the driver's ndo_rx_flow_steer() hook.
 * NOTE(review): @filter_id is presumably the filter ID that
 * ndo_rx_flow_steer() returned, and a true result presumably means the
 * filter may be expired - confirm against the implementation.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
                         u16 filter_id);
#endif

/* XPS map type and offset of the xps map within net_device->xps_maps[]. */
enum xps_map_type {
        /* CPUs map */
        XPS_CPUS = 0,
        /* RXQs map */
        XPS_RXQS,
        /* number of map types, not a map type itself */
        XPS_MAPS_MAX,
};

#ifdef CONFIG_XPS
/*
 * This structure holds an XPS map which can be of variable length.  The
 * map is an array of queues.
 *
 * NOTE(review): @len is presumably the number of valid entries in @queues
 * and @alloc_len the number of entries allocated - confirm against the code
 * that builds the maps.
 */
struct xps_map {
        unsigned int len;
        unsigned int alloc_len;
        struct rcu_head rcu;
        /* flexible array of u16 queue entries */
        u16 queues[];
};
/* Bytes needed for a struct xps_map with _num entries in queues[]. */
#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16)))
/*
 * Number of u16 entries obtained by rounding the offset of queues[1] up with
 * L1_CACHE_ALIGN() and subtracting sizeof(struct xps_map).
 * NOTE(review): presumably the minimum number of entries to allocate so that
 * a map fills whole cache lines - confirm against the users.
 */
#define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \
       - sizeof(struct xps_map)) / sizeof(u16))

/*
 * This structure holds all XPS maps for device.  Maps are indexed by CPU
 * for the CPUs map and, judging by XPS_RXQ_DEV_MAPS_SIZE(), by RX queue for
 * the RXQs map.
 *
 * We keep track of the number of cpus/rxqs used when the struct is allocated,
 * in nr_ids. This will help not accessing out-of-bound memory.
 *
 * We keep track of the number of traffic classes used when the struct is
 * allocated, in num_tc. This will be used to navigate the maps, to ensure we're
 * not crossing its upper bound, as the original dev->num_tc can be updated in
 * the meantime.
 */
struct xps_dev_maps {
        struct rcu_head rcu;
        unsigned int nr_ids;
        s16 num_tc;
        struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
};

/*
 * Allocation size, in bytes, of a struct xps_dev_maps followed by its
 * attr_map[] array of struct xps_map pointers:
 *
 * - XPS_CPU_DEV_MAPS_SIZE(_tcs): nr_cpu_ids * _tcs pointers
 * - XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs): _rxqs * _tcs pointers
 *
 * Every macro argument is parenthesized so that expression arguments such as
 * "a + b" are multiplied as a whole.
 */
#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +        \
        (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))

#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
        ((_rxqs) * (_tcs) * sizeof(struct xps_map *)))

#endif /* CONFIG_XPS */

/*
 * NOTE(review): presumably the maximum number of traffic classes and the
 * mask used to reduce a value to a valid traffic-class index (15 is
 * TC_MAX_QUEUE - 1) - confirm against the users.
 */
#define TC_MAX_QUEUE        16
#define TC_BITMASK        15
/* HW offloaded queuing disciplines txq count and offset maps */
struct netdev_tc_txq {
        /* NOTE(review): presumably the number of TX queues in the range - confirm */
        u16 count;
        /* NOTE(review): presumably the index of the first TX queue in the range - confirm */
        u16 offset;
};

#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/*
 * This structure is to hold information about the device
 * configured to run FCoE protocol stack.
 *
 * All members are fixed-size char arrays (64 bytes each, 256 bytes for the
 * two model fields). It is the type of the hbainfo argument of the
 * ndo_fcoe_get_hbainfo() hook.
 */
struct netdev_fcoe_hbainfo {
        char        manufacturer[64];
        char        serial_number[64];
        char        hardware_version[64];
        char        driver_version[64];
        char        optionrom_version[64];
        char        firmware_version[64];
        char        model[256];
        char        model_description[256];
};
#endif

/* Capacity, in bytes, of netdev_phys_item_id::id. */
#define MAX_PHYS_ITEM_ID_LEN 32

/* This structure holds a unique identifier to identify some
 * physical item (port for example) used by a netdevice.
 */
struct netdev_phys_item_id {
        unsigned char id[MAX_PHYS_ITEM_ID_LEN];
        /* Number of leading bytes of @id that netdev_phys_item_id_same() compares */
        unsigned char id_len;
};

/*
 * netdev_phys_item_id_same - compare two physical item IDs
 * @a: first ID
 * @b: second ID
 *
 * Neither argument is modified, hence the const qualifiers; callers holding
 * non-const pointers are unaffected.
 *
 * Return: true when both IDs have the same id_len and their first id_len
 * bytes are equal, false otherwise.
 */
static inline bool netdev_phys_item_id_same(const struct netdev_phys_item_id *a,
                                            const struct netdev_phys_item_id *b)
{
        /* Lengths are compared first, so memcmp() only runs on equal lengths */
        return a->id_len == b->id_len &&
               memcmp(a->id, b->id, a->id_len) == 0;
}

/*
 * Function pointer type with the same parameters and u16 return type as the
 * ndo_select_queue() hook.
 * NOTE(review): presumably the type of a fallback TX queue selector - confirm
 * against the users.
 */
typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct net_device *sb_dev);

/* Discriminator stored in net_device_path::type. */
enum net_device_path_type {
        DEV_PATH_ETHERNET = 0,
        DEV_PATH_VLAN,
        DEV_PATH_BRIDGE,
        DEV_PATH_PPPOE,
        DEV_PATH_DSA,
        DEV_PATH_MTK_WDMA,
};

/*
 * One element of a device path: a type, the device it refers to and
 * type-specific data in an anonymous union.
 * NOTE(review): the union member names (bridge, dsa, mtk_wdma) suggest a
 * pairing with the like-named enum net_device_path_type values, with encap
 * presumably serving the remaining encapsulation types - confirm against the
 * code that fills these in.
 */
struct net_device_path {
        enum net_device_path_type        type;
        const struct net_device                *dev;
        union {
                struct {
                        u16                id;
                        __be16                proto;
                        u8                h_dest[ETH_ALEN];
                } encap;
                struct {
                        enum {
                                DEV_PATH_BR_VLAN_KEEP,
                                DEV_PATH_BR_VLAN_TAG,
                                DEV_PATH_BR_VLAN_UNTAG,
                                DEV_PATH_BR_VLAN_UNTAG_HW,
                        }                vlan_mode;
                        u16                vlan_id;
                        __be16                vlan_proto;
                } bridge;
                struct {
                        int port;
                        u16 proto;
                } dsa;
                struct {
                        u8 wdma_idx;
                        u8 queue;
                        u16 wcid;
                        u8 bss;
                        u8 amsdu;
                } mtk_wdma;
        };
};

/* Capacity of net_device_path_stack::path[] */
#define NET_DEVICE_PATH_STACK_MAX        5
/* Capacity of net_device_path_ctx::vlan[] */
#define NET_DEVICE_PATH_VLAN_MAX        2

/*
 * Fixed-capacity array of path elements.
 * NOTE(review): num_paths is presumably the number of valid entries in
 * path[] - confirm.
 */
struct net_device_path_stack {
        int                        num_paths;
        struct net_device_path        path[NET_DEVICE_PATH_STACK_MAX];
};

/*
 * Device-path context: a device, an ETH_ALEN-byte address and up to
 * NET_DEVICE_PATH_VLAN_MAX VLAN (id, proto) pairs.
 * NOTE(review): presumably daddr is the destination address being resolved
 * and num_vlans the number of valid entries in vlan[] - confirm.
 */
struct net_device_path_ctx {
        const struct net_device *dev;
        u8                        daddr[ETH_ALEN];

        int                        num_vlans;
        struct {
                u16                id;
                __be16                proto;
        } vlan[NET_DEVICE_PATH_VLAN_MAX];
};

/*
 * Type of the @type argument of the ndo_setup_tc() hook, which is called to
 * set up a 'tc' scheduler, classifier or action on a device.
 * NOTE(review): each value presumably implies its own layout for the hook's
 * void *type_data argument - confirm against the users.
 */
enum tc_setup_type {
        TC_QUERY_CAPS,
        TC_SETUP_QDISC_MQPRIO,
        TC_SETUP_CLSU32,
        TC_SETUP_CLSFLOWER,
        TC_SETUP_CLSMATCHALL,
        TC_SETUP_CLSBPF,
        TC_SETUP_BLOCK,
        TC_SETUP_QDISC_CBS,
        TC_SETUP_QDISC_RED,
        TC_SETUP_QDISC_PRIO,
        TC_SETUP_QDISC_MQ,
        TC_SETUP_QDISC_ETF,
        TC_SETUP_ROOT_QDISC,
        TC_SETUP_QDISC_GRED,
        TC_SETUP_QDISC_TAPRIO,
        TC_SETUP_FT,
        TC_SETUP_QDISC_ETS,
        TC_SETUP_QDISC_TBF,
        TC_SETUP_QDISC_FIFO,
        TC_SETUP_QDISC_HTB,
        TC_SETUP_ACT,
};

/* These structures hold the attributes of bpf state that are being passed
 * to the netdevice through the bpf op.
 *
 * enum bpf_netdev_command is the type of netdev_bpf::command; the comments
 * inside struct netdev_bpf name the union member used with each command.
 */
enum bpf_netdev_command {
        /* Set or clear a bpf program used in the earliest stages of packet
         * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee
         * is responsible for calling bpf_prog_put on any old progs that are
         * stored. In case of error, the callee need not release the new prog
         * reference, but on success it takes ownership and must bpf_prog_put
         * when it is no longer used.
         */
        XDP_SETUP_PROG,
        XDP_SETUP_PROG_HW,
        /* BPF program for offload callbacks, invoked at program load time. */
        /* The two map commands use netdev_bpf::offmap */
        BPF_OFFLOAD_MAP_ALLOC,
        BPF_OFFLOAD_MAP_FREE,
        /* Uses netdev_bpf::xsk */
        XDP_SETUP_XSK_POOL,
};

struct bpf_prog_offload_ops;
struct netlink_ext_ack;
struct xdp_umem;
struct xdp_dev_bulk_queue;
struct bpf_xdp_link;

/*
 * XDP modes; __MAX_XDP_MODE is the number of modes, not a mode itself.
 * NOTE(review): the names suggest skb-based (generic), driver-level and
 * hardware-offloaded XDP - confirm against the users.
 */
enum bpf_xdp_mode {
        XDP_MODE_SKB = 0,
        XDP_MODE_DRV = 1,
        XDP_MODE_HW = 2,
        __MAX_XDP_MODE
};

/*
 * A BPF program pointer paired with a bpf_xdp_link pointer.
 * NOTE(review): presumably one such entity exists per enum bpf_xdp_mode
 * value - confirm against struct net_device.
 */
struct bpf_xdp_entity {
        struct bpf_prog *prog;
        struct bpf_xdp_link *link;
};

/*
 * Argument block for the netdevice bpf op: @command selects the operation
 * and, per the comments below, which member of the union carries its
 * arguments.
 */
struct netdev_bpf {
        enum bpf_netdev_command command;
        union {
                /* XDP_SETUP_PROG */
                struct {
                        u32 flags;
                        struct bpf_prog *prog;
                        struct netlink_ext_ack *extack;
                };
                /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
                struct {
                        struct bpf_offloaded_map *offmap;
                };
                /* XDP_SETUP_XSK_POOL */
                struct {
                        struct xsk_buff_pool *pool;
                        u16 queue_id;
                } xsk;
        };
};

/* Flags for ndo_xsk_wakeup. Each flag is a distinct bit, so they can be
 * OR-ed together.
 */
#define XDP_WAKEUP_RX (1 << 0)
#define XDP_WAKEUP_TX (1 << 1)

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Table of xfrm device callbacks, only defined with CONFIG_XFRM_OFFLOAD.
 * The hooks fall into two groups by argument type: those operating on a
 * struct xfrm_state (xdo_dev_state_*, xdo_dev_offload_ok) and those operating
 * on a struct xfrm_policy (xdo_dev_policy_*). The two *_add hooks return int
 * and receive a netlink_ext_ack.
 * NOTE(review): the *_add return value is presumably 0 or a negative errno,
 * with extack used for error reporting - confirm against the callers.
 */
struct xfrmdev_ops {
        int        (*xdo_dev_state_add)(struct net_device *dev,
                                     struct xfrm_state *x,
                                     struct netlink_ext_ack *extack);
        void        (*xdo_dev_state_delete)(struct net_device *dev,
                                        struct xfrm_state *x);
        void        (*xdo_dev_state_free)(struct net_device *dev,
                                      struct xfrm_state *x);
        bool        (*xdo_dev_offload_ok) (struct sk_buff *skb,
                                       struct xfrm_state *x);
        void        (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
        void        (*xdo_dev_state_update_stats) (struct xfrm_state *x);
        int        (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack);
        void        (*xdo_dev_policy_delete) (struct xfrm_policy *x);
        void        (*xdo_dev_policy_free) (struct xfrm_policy *x);
};
#endif

/*
 * Variable-length interface alias: an rcu_head followed by the alias
 * characters in a flexible array.
 * NOTE(review): the rcu_head suggests the object is freed after an RCU grace
 * period - confirm against the code that replaces the alias.
 */
struct dev_ifalias {
        struct rcu_head rcuhead;
        char ifalias[];
};

struct devlink;
struct tlsdev_ops;

/*
 * List node that refers to a notifier block.
 * NOTE(review): presumably used to track per-netns netdevice notifiers
 * registered on behalf of a device - confirm against the users.
 */
struct netdev_net_notifier {
        struct list_head list;
        struct notifier_block *nb;
};

/*
 * This structure defines the management hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*ndo_init)(struct net_device *dev);
 *     This function is called once when a network device is registered.
 *     The network device can use this for any late stage initialization
 *     or semantic validation. It can fail with an error code which will
 *     be propagated back to register_netdev.
 *
 * void (*ndo_uninit)(struct net_device *dev);
 *     This function is called when device is unregistered or when registration
 *     fails. It is not called if init fails.
 *
 * int (*ndo_open)(struct net_device *dev);
 *     This function is called when a network device transitions to the up
 *     state.
 *
 * int (*ndo_stop)(struct net_device *dev);
 *     This function is called when a network device transitions to the down
 *     state.
 *
 * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
 *                               struct net_device *dev);
 *        Called when a packet needs to be transmitted.
 *        Returns NETDEV_TX_OK.  Can return NETDEV_TX_BUSY, but you should stop
 *        the queue before that can happen; it's for obsolete devices and weird
 *        corner cases, but the stack really does a non-trivial amount
 *        of useless work if you return NETDEV_TX_BUSY.
 *        Required; cannot be NULL.
 *
 * netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
 *                                           struct net_device *dev,
 *                                           netdev_features_t features);
 *        Called by core transmit path to determine if device is capable of
 *        performing offload operations on a given packet. This is to give
 *        the device an opportunity to implement any restrictions that cannot
 *        be otherwise expressed by feature flags. The check is called with
 *        the set of features that the stack has calculated and it returns
 *        those the driver believes to be appropriate.
 *
 * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
 *                         struct net_device *sb_dev);
 *        Called to decide which queue to use when device supports multiple
 *        transmit queues.
 *
 * void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
 *        This function is called to allow device receiver to make
 *        changes to configuration when multicast or promiscuous is enabled.
 *
 * void (*ndo_set_rx_mode)(struct net_device *dev);
 *        This function is called when the device changes address list filtering.
 *        If driver handles unicast address filtering, it should set
 *        IFF_UNICAST_FLT in its priv_flags.
 *
 * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
 *        This function is called when the Media Access Control address
 *        needs to be changed. If this interface is not defined, the
 *        MAC address can not be changed.
 *
 * int (*ndo_validate_addr)(struct net_device *dev);
 *        Test if Media Access Control address is valid for the device.
 *
 * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Old-style ioctl entry point. This is used internally by the
 *        ieee802154 subsystem but is no longer called by the device
 *        ioctl handler.
 *
 * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Used by the bonding driver for its device specific ioctls:
 *        SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE,
 *        SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY
 *
 * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG,
 *        SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP.
 *
 * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
 *        Used to set network devices bus interface parameters. This interface
 *        is retained for legacy reasons; new devices should use the bus
 *        interface (PCI) for low level management.
 *
 * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
 *        Called when a user wants to change the Maximum Transfer Unit
 *        of a device.
 *
 * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);
 *        Callback used when the transmitter has not made any progress
 *        for dev->watchdog ticks.
 *
 * void (*ndo_get_stats64)(struct net_device *dev,
 *                         struct rtnl_link_stats64 *storage);
 * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 *        Called when a user wants to get the network device usage
 *        statistics. Drivers must do one of the following:
 *        1. Define @ndo_get_stats64 to fill in a zero-initialised
 *           rtnl_link_stats64 structure passed by the caller.
 *        2. Define @ndo_get_stats to update a net_device_stats structure
 *           (which should normally be dev->stats) and return a pointer to
 *           it. The structure may be changed asynchronously only if each
 *           field is written atomically.
 *        3. Update dev->stats asynchronously and atomically, and define
 *           neither operation.
 *
 * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
 *        Return true if this device supports offload stats of this attr_id.
 *
 * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
 *        void *attr_data)
 *        Get statistics for offload operations by attr_id. Write it into the
 *        attr_data pointer.
 *
 * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is registered.
 *
 * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is unregistered.
 *
 * void (*ndo_poll_controller)(struct net_device *dev);
 *
 *        SR-IOV management functions.
 * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
 * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
 *                          u8 qos, __be16 proto);
 * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
 *                          int max_tx_rate);
 * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_config)(struct net_device *dev,
 *                            int vf, struct ifla_vf_info *ivf);
 * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
 * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
 *                          struct nlattr *port[]);
 *
 *      Enable or disable the VF ability to query its RSS Redirection Table and
 *      Hash Key. This is needed since on some devices VF share this information
 *      with PF and querying it may introduce a theoretical security risk.
 * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
 * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
 *                       void *type_data);
 *        Called to setup any 'tc' scheduler, classifier or action on @dev.
 *        This is always called from the stack with the rtnl lock held and netif
 *        tx queues stopped. This allows the netdevice to perform queue
 *        management safely.
 *
 *        Fiber Channel over Ethernet (FCoE) offload functions.
 * int (*ndo_fcoe_enable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to start using LLD for FCoE
 *        so the underlying device can perform whatever needed configuration or
 *        initialization to support acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_disable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to stop using LLD for FCoE
 *        so the underlying device can perform whatever needed clean-ups to
 *        stop supporting acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
 *                             struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Initiator wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_ddp_done)(struct net_device *dev,  u16 xid);
 *        Called when the FCoE Initiator/Target is done with the DDPed I/O as
 *        indicated by the FC exchange id 'xid', so the underlying device can
 *        clean up and reuse resources for later DDP requests.
 *
 * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
 *                              struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Target wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
 *                               struct netdev_fcoe_hbainfo *hbainfo);
 *        Called when the FCoE Protocol stack wants information on the underlying
 *        device. This information is utilized by the FCoE protocol stack to
 *        register attributes with Fiber Channel management service as per the
 *        FC-GS Fabric Device Management Information(FDMI) specification.
 *
 * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
 *        Called when the underlying device wants to override default World Wide
 *        Name (WWN) generation mechanism in FCoE protocol stack to pass its own
 *        World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE
 *        protocol stack to use.
 *
 *        RFS acceleration.
 * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
 *                            u16 rxq_index, u32 flow_id);
 *        Set hardware filter for RFS.  rxq_index is the target queue index;
 *        flow_id is a flow ID to be passed to rps_may_expire_flow() later.
 *        Return the filter ID on success, or a negative error code.
 *
 *        Slave management functions (for bridge, bonding, etc).
 * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev,
 *                        struct netlink_ext_ack *extack);
 *        Called to make another netdev an underling.
 *
 * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
 *        Called to release previously enslaved netdev.
 *
 * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
 *                                            struct sk_buff *skb,
 *                                            bool all_slaves);
 *        Get the xmit slave of the master device. If all_slaves is true, the
 *        function assumes all the slaves can transmit.
 *
 *      Feature/offload setting functions.
 * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
 *                netdev_features_t features);
 *        Adjusts the requested feature flags according to device-specific
 *        constraints, and returns the resulting flags. Must not modify
 *        the device state.
 *
 * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
 *        Called to update device configuration to new features. Passed
 *        feature set might be less than what was returned by ndo_fix_features().
 *        Must return >0 or -errno if it changed dev->features itself.
 *
 * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid, u16 flags,
 *                      bool *notified, struct netlink_ext_ack *extack);
 *        Adds an FDB entry to dev for addr.
 *        Callee shall set *notified to true if it sent any appropriate
 *        notification(s). Otherwise core will send a generic one.
 * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid,
 *                      bool *notified, struct netlink_ext_ack *extack);
 *        Deletes the FDB entry from dev corresponding to addr.
 *        Callee shall set *notified to true if it sent any appropriate
 *        notification(s). Otherwise core will send a generic one.
 * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev,
 *                           struct netlink_ext_ack *extack);
 * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
 *                       struct net_device *dev, struct net_device *filter_dev,
 *                       int *idx)
 *        Used to add FDB entries to dump requests. Implementers should add
 *        entries to skb and update idx with the number of entries.
 *
 * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[],
 *                      u16 nlmsg_flags, struct netlink_ext_ack *extack);
 *        Adds an MDB entry to dev.
 * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[],
 *                      struct netlink_ext_ack *extack);
 *        Deletes the MDB entry from dev.
 * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[],
 *                           struct netlink_ext_ack *extack);
 *        Bulk deletes MDB entries from dev.
 * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb,
 *                       struct netlink_callback *cb);
 *        Dumps MDB entries from dev. The first argument (marker) in the netlink
 *        callback is used by core rtnetlink code.
 *
 * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags, struct netlink_ext_ack *extack)
 * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
 *                             struct net_device *dev, u32 filter_mask,
 *                             int nlflags)
 * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags);
 *
 * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
 *        Called to change device carrier. Soft-devices (like dummy, team, etc)
 *        which do not represent real hardware may define this to allow their
 *        userspace components to manage their virtual carrier state. Devices
 *        that determine carrier state from physical hardware properties (eg
 *        network cables) or protocol-dependent mechanisms (eg
 *        USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
 *
 * int (*ndo_get_phys_port_id)(struct net_device *dev,
 *                               struct netdev_phys_item_id *ppid);
 *        Called to get ID of physical port of this device. If driver does
 *        not implement this, it is assumed that the hw is not able to have
 *        multiple net devices on single physical port.
 *
 * int (*ndo_get_port_parent_id)(struct net_device *dev,
 *                                 struct netdev_phys_item_id *ppid)
 *        Called to get the parent ID of the physical port of this device.
 *
 * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
 *                                 struct net_device *dev)
 *        Called by upper layer devices to accelerate switching or other
 *        station functionality into hardware. 'pdev' is the lowerdev
 *        to use for the offload and 'dev' is the net device that will
 *        back the offload. Returns a pointer to the private structure
 *        the upper layer will maintain.
 * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
 *        Called by upper layer device to delete the station created
 *        by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
 *        the station and priv is the structure returned by the add
 *        operation.
 * int (*ndo_set_tx_maxrate)(struct net_device *dev,
 *                             int queue_index, u32 maxrate);
 *        Called when a user wants to set a max-rate limitation of specific
 *        TX queue.
 * int (*ndo_get_iflink)(const struct net_device *dev);
 *        Called to get the iflink value of this device.
 * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
 *        This function is used to get egress tunnel information for given skb.
 *        This is useful for retrieving outer tunnel header parameters while
 *        sampling packet.
 * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
 *        This function is used to specify the headroom that the skb must
 *        consider when allocating an skb during packet reception. Setting
 *        appropriate rx headroom value allows avoiding skb head copy on
 *        forward. Setting a negative value resets the rx headroom to the
 *        default value.
 * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
 *        This function is used to set or query state related to XDP on the
 *        netdevice and manage BPF offload. See definition of
 *        enum bpf_netdev_command for details.
 * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
 *                        u32 flags);
 *        This function is used to submit @n XDP packets for transmit on a
 *        netdevice. Returns number of frames successfully transmitted, frames
 *        that got dropped are freed/returned via xdp_return_frame().
 *        A negative return value means a general error invoking the ndo,
 *        meaning no frames were xmit'ed and core-caller will free all frames.
 * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
 *                                                struct xdp_buff *xdp);
 *      Get the xmit slave of master device based on the xdp_buff.
 * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
 *      This function is used to wake up the softirq, ksoftirqd or kthread
 *        responsible for sending and/or receiving packets on a specific
 *        queue id bound to an AF_XDP socket. The flags field specifies if
 *        only RX, only Tx, or both should be woken up using the flags
 *        XDP_WAKEUP_RX and XDP_WAKEUP_TX.
 * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p,
 *                         int cmd);
 *        Add, change, delete or get information on an IPv4 tunnel.
 * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
 *        If a device is paired with a peer device, return the peer instance.
 *        The caller must be under RCU read context.
 * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
 *     Get the forwarding path to reach the real device from the HW destination address
 * ktime_t (*ndo_get_tstamp)(struct net_device *dev,
 *                             const struct skb_shared_hwtstamps *hwtstamps,
 *                             bool cycles);
 *        Get hardware timestamp based on normal/adjustable time or free running
 *        cycle counter. This function is required if physical clock supports a
 *        free running cycle counter.
 *
 * int (*ndo_hwtstamp_get)(struct net_device *dev,
 *                           struct kernel_hwtstamp_config *kernel_config);
 *        Get the currently configured hardware timestamping parameters for the
 *        NIC device.
 *
 * int (*ndo_hwtstamp_set)(struct net_device *dev,
 *                           struct kernel_hwtstamp_config *kernel_config,
 *                           struct netlink_ext_ack *extack);
 *        Change the hardware timestamping parameters for NIC device.
 */
struct net_device_ops {
        /*
         * Table of ndo_* function pointers for a network device. The comment
         * block preceding this structure describes the individual callbacks.
         * Member order and the preprocessor guards below determine the
         * structure layout, so keep them as they are.
         */
        int (*ndo_init)(struct net_device *dev);
        void (*ndo_uninit)(struct net_device *dev);
        int (*ndo_open)(struct net_device *dev);
        int (*ndo_stop)(struct net_device *dev);
        netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
                        struct net_device *dev);
        netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
                        struct net_device *dev,
                        netdev_features_t features);
        u16 (*ndo_select_queue)(struct net_device *dev,
                        struct sk_buff *skb,
                        struct net_device *sb_dev);
        void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
        void (*ndo_set_rx_mode)(struct net_device *dev);
        int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
        int (*ndo_validate_addr)(struct net_device *dev);

        /* ioctl-style entry points */
        int (*ndo_do_ioctl)(struct net_device *dev,
                        struct ifreq *ifr, int cmd);
        int (*ndo_eth_ioctl)(struct net_device *dev,
                        struct ifreq *ifr, int cmd);
        int (*ndo_siocbond)(struct net_device *dev,
                        struct ifreq *ifr, int cmd);
        int (*ndo_siocwandev)(struct net_device *dev,
                        struct if_settings *ifs);
        int (*ndo_siocdevprivate)(struct net_device *dev,
                        struct ifreq *ifr,
                        void __user *data, int cmd);

        int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
        int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
        int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *);
        void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);

        /* Statistics */
        void (*ndo_get_stats64)(struct net_device *dev,
                        struct rtnl_link_stats64 *storage);
        bool (*ndo_has_offload_stats)(const struct net_device *dev,
                        int attr_id);
        int (*ndo_get_offload_stats)(int attr_id,
                        const struct net_device *dev,
                        void *attr_data);
        struct net_device_stats *(*ndo_get_stats)(struct net_device *dev);

        /* VLAN vid add/kill */
        int (*ndo_vlan_rx_add_vid)(struct net_device *dev,
                        __be16 proto, u16 vid);
        int (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
                        __be16 proto, u16 vid);

#ifdef CONFIG_NET_POLL_CONTROLLER
        void (*ndo_poll_controller)(struct net_device *dev);
        int (*ndo_netpoll_setup)(struct net_device *dev);
        void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif

        /* ndo_{set,get}_vf_* callbacks */
        int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac);
        int (*ndo_set_vf_vlan)(struct net_device *dev, int queue, u16 vlan,
                        u8 qos, __be16 proto);
        int (*ndo_set_vf_rate)(struct net_device *dev, int vf,
                        int min_tx_rate, int max_tx_rate);
        int (*ndo_set_vf_spoofchk)(struct net_device *dev,
                        int vf, bool setting);
        int (*ndo_set_vf_trust)(struct net_device *dev,
                        int vf, bool setting);
        int (*ndo_get_vf_config)(struct net_device *dev, int vf,
                        struct ifla_vf_info *ivf);
        int (*ndo_set_vf_link_state)(struct net_device *dev,
                        int vf, int link_state);
        int (*ndo_get_vf_stats)(struct net_device *dev, int vf,
                        struct ifla_vf_stats *vf_stats);
        int (*ndo_set_vf_port)(struct net_device *dev, int vf,
                        struct nlattr *port[]);
        int (*ndo_get_vf_port)(struct net_device *dev, int vf,
                        struct sk_buff *skb);
        int (*ndo_get_vf_guid)(struct net_device *dev, int vf,
                        struct ifla_vf_guid *node_guid,
                        struct ifla_vf_guid *port_guid);
        int (*ndo_set_vf_guid)(struct net_device *dev, int vf,
                        u64 guid, int guid_type);
        int (*ndo_set_vf_rss_query_en)(struct net_device *dev,
                        int vf, bool setting);

        int (*ndo_setup_tc)(struct net_device *dev,
                        enum tc_setup_type type,
                        void *type_data);

        /* FCoE */
#if IS_ENABLED(CONFIG_FCOE)
        int (*ndo_fcoe_enable)(struct net_device *dev);
        int (*ndo_fcoe_disable)(struct net_device *dev);
        int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
                        struct scatterlist *sgl,
                        unsigned int sgc);
        int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid);
        int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
                        struct scatterlist *sgl,
                        unsigned int sgc);
        int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
                        struct netdev_fcoe_hbainfo *hbainfo);
#endif

#if IS_ENABLED(CONFIG_LIBFCOE)
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
        int (*ndo_fcoe_get_wwn)(struct net_device *dev,
                        u64 *wwn, int type);
#endif

        /* RFS acceleration */
#ifdef CONFIG_RFS_ACCEL
        int (*ndo_rx_flow_steer)(struct net_device *dev,
                        const struct sk_buff *skb,
                        u16 rxq_index, u32 flow_id);
#endif

        /* Slave management */
        int (*ndo_add_slave)(struct net_device *dev,
                        struct net_device *slave_dev,
                        struct netlink_ext_ack *extack);
        int (*ndo_del_slave)(struct net_device *dev,
                        struct net_device *slave_dev);
        struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
                        struct sk_buff *skb,
                        bool all_slaves);
        struct net_device *(*ndo_sk_get_lower_dev)(struct net_device *dev,
                        struct sock *sk);

        /* Feature/offload setting */
        netdev_features_t (*ndo_fix_features)(struct net_device *dev,
                        netdev_features_t features);
        int (*ndo_set_features)(struct net_device *dev,
                        netdev_features_t features);

        int (*ndo_neigh_construct)(struct net_device *dev,
                        struct neighbour *n);
        void (*ndo_neigh_destroy)(struct net_device *dev,
                        struct neighbour *n);

        /* FDB */
        int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid, u16 flags, bool *notified,
                        struct netlink_ext_ack *extack);
        int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid, bool *notified,
                        struct netlink_ext_ack *extack);
        int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh,
                        struct net_device *dev,
                        struct netlink_ext_ack *extack);
        int (*ndo_fdb_dump)(struct sk_buff *skb,
                        struct netlink_callback *cb,
                        struct net_device *dev,
                        struct net_device *filter_dev,
                        int *idx);
        int (*ndo_fdb_get)(struct sk_buff *skb, struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid, u32 portid, u32 seq,
                        struct netlink_ext_ack *extack);

        /* MDB */
        int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[],
                        u16 nlmsg_flags,
                        struct netlink_ext_ack *extack);
        int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[],
                        struct netlink_ext_ack *extack);
        int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[],
                        struct netlink_ext_ack *extack);
        int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb,
                        struct netlink_callback *cb);
        int (*ndo_mdb_get)(struct net_device *dev, struct nlattr *tb[],
                        u32 portid, u32 seq,
                        struct netlink_ext_ack *extack);

        /* Bridge link */
        int (*ndo_bridge_setlink)(struct net_device *dev,
                        struct nlmsghdr *nlh, u16 flags,
                        struct netlink_ext_ack *extack);
        int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
                        struct net_device *dev,
                        u32 filter_mask, int nlflags);
        int (*ndo_bridge_dellink)(struct net_device *dev,
                        struct nlmsghdr *nlh, u16 flags);

        int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
        int (*ndo_get_phys_port_id)(struct net_device *dev,
                        struct netdev_phys_item_id *ppid);
        int (*ndo_get_port_parent_id)(struct net_device *dev,
                        struct netdev_phys_item_id *ppid);
        int (*ndo_get_phys_port_name)(struct net_device *dev,
                        char *name, size_t len);
        void *(*ndo_dfwd_add_station)(struct net_device *pdev,
                        struct net_device *dev);
        void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv);

        int (*ndo_set_tx_maxrate)(struct net_device *dev,
                        int queue_index, u32 maxrate);
        int (*ndo_get_iflink)(const struct net_device *dev);
        int (*ndo_fill_metadata_dst)(struct net_device *dev,
                        struct sk_buff *skb);
        void (*ndo_set_rx_headroom)(struct net_device *dev,
                        int needed_headroom);

        /* XDP / AF_XDP */
        int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
        int (*ndo_xdp_xmit)(struct net_device *dev, int n,
                        struct xdp_frame **xdp, u32 flags);
        struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
                        struct xdp_buff *xdp);
        int (*ndo_xsk_wakeup)(struct net_device *dev,
                        u32 queue_id, u32 flags);

        int (*ndo_tunnel_ctl)(struct net_device *dev,
                        struct ip_tunnel_parm_kern *p, int cmd);
        struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
        int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
                        struct net_device_path *path);

        /* Hardware timestamping */
        ktime_t (*ndo_get_tstamp)(struct net_device *dev,
                        const struct skb_shared_hwtstamps *hwtstamps,
                        bool cycles);
        int (*ndo_hwtstamp_get)(struct net_device *dev,
                        struct kernel_hwtstamp_config *kernel_config);
        int (*ndo_hwtstamp_set)(struct net_device *dev,
                        struct kernel_hwtstamp_config *kernel_config,
                        struct netlink_ext_ack *extack);

#if IS_ENABLED(CONFIG_NET_SHAPER)
        /**
         * @net_shaper_ops: Device shaping offload operations
         * see include/net/net_shapers.h
         */
        const struct net_shaper_ops *net_shaper_ops;
#endif
};

/**
 * enum netdev_priv_flags - &struct net_device priv_flags
 *
 * These are the &struct net_device private flags; they are only set internally
 * by drivers and used in the kernel. These flags are invisible to
 * userspace; this means that the order of these flags can change
 * during any kernel release.
 *
 * You should add bitfield booleans after either net_device::priv_flags
 * (hotpath) or ::threaded (slowpath) instead of extending these flags.
 *
 * @IFF_802_1Q_VLAN: 802.1Q VLAN device
 * @IFF_EBRIDGE: Ethernet bridging device
 * @IFF_BONDING: bonding master or slave
 * @IFF_ISATAP: ISATAP interface (RFC4214)
 * @IFF_WAN_HDLC: WAN HDLC device
 * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
 *        release skb->dst
 * @IFF_DONT_BRIDGE: disallow bridging this ether dev
 * @IFF_DISABLE_NETPOLL: disable netpoll at run-time
 * @IFF_MACVLAN_PORT: device used as macvlan port
 * @IFF_BRIDGE_PORT: device used as bridge port
 * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port
 * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit
 * @IFF_UNICAST_FLT: Supports unicast filtering
 * @IFF_TEAM_PORT: device used as team port
 * @IFF_SUPP_NOFCS: device supports sending custom FCS
 * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
 *        change when it's running
 * @IFF_MACVLAN: Macvlan device
 * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account
 *        underlying stacked devices
 * @IFF_L3MDEV_MASTER: device is an L3 master device
 * @IFF_NO_QUEUE: device can run without qdisc attached
 * @IFF_OPENVSWITCH: device is an Open vSwitch master
 * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
 * @IFF_TEAM: device is a team device
 * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
 * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
 *        entity (i.e. the master device for bridged veth)
 * @IFF_MACSEC: device is a MACsec device
 * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
 * @IFF_FAILOVER: device is a failover master device
 * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
 * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
 * @IFF_NO_ADDRCONF: prevent ipv6 addrconf
 * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
 *        skb_headlen(skb) == 0 (data starts from frag0)
 */
enum netdev_priv_flags {
        /* One bit per flag; bits 0..29 are spelled as plain int shifts. */
        IFF_802_1Q_VLAN           = 1 << 0,
        IFF_EBRIDGE               = 1 << 1,
        IFF_BONDING               = 1 << 2,
        IFF_ISATAP                = 1 << 3,
        IFF_WAN_HDLC              = 1 << 4,
        IFF_XMIT_DST_RELEASE      = 1 << 5,
        IFF_DONT_BRIDGE           = 1 << 6,
        IFF_DISABLE_NETPOLL       = 1 << 7,
        IFF_MACVLAN_PORT          = 1 << 8,
        IFF_BRIDGE_PORT           = 1 << 9,
        IFF_OVS_DATAPATH          = 1 << 10,
        IFF_TX_SKB_SHARING        = 1 << 11,
        IFF_UNICAST_FLT           = 1 << 12,
        IFF_TEAM_PORT             = 1 << 13,
        IFF_SUPP_NOFCS            = 1 << 14,
        IFF_LIVE_ADDR_CHANGE      = 1 << 15,
        IFF_MACVLAN               = 1 << 16,
        IFF_XMIT_DST_RELEASE_PERM = 1 << 17,
        IFF_L3MDEV_MASTER         = 1 << 18,
        IFF_NO_QUEUE              = 1 << 19,
        IFF_OPENVSWITCH           = 1 << 20,
        IFF_L3MDEV_SLAVE          = 1 << 21,
        IFF_TEAM                  = 1 << 22,
        IFF_RXFH_CONFIGURED       = 1 << 23,
        IFF_PHONY_HEADROOM        = 1 << 24,
        IFF_MACSEC                = 1 << 25,
        IFF_NO_RX_HANDLER         = 1 << 26,
        IFF_FAILOVER              = 1 << 27,
        IFF_FAILOVER_SLAVE        = 1 << 28,
        IFF_L3MDEV_RX_HANDLER     = 1 << 29,
        /*
         * NOTE(review): presumably BIT_ULL() is used for the top bits
         * because 1 << 31 does not fit in a signed int - confirm.
         */
        IFF_NO_ADDRCONF           = BIT_ULL(30),
        IFF_TX_SKB_NO_LINEAR      = BIT_ULL(31),
};

/*
 * Discriminator for the struct net_device::ml_priv pointer: tells which
 * type of object that pointer refers to.
 */
enum netdev_ml_priv_type {
        ML_PRIV_NONE = 0,
        ML_PRIV_CAN  = 1,
};

enum netdev_stat_type {
        /* Each non-NONE value is annotated with the structure it names. */
        NETDEV_PCPU_STAT_NONE   = 0,
        NETDEV_PCPU_STAT_LSTATS = 1,    /* struct pcpu_lstats */
        NETDEV_PCPU_STAT_TSTATS = 2,    /* struct pcpu_sw_netstats */
        NETDEV_PCPU_STAT_DSTATS = 3,    /* struct pcpu_dstats */
};

/* Device registration lifecycle states */
enum netdev_reg_state {
        NETREG_UNINITIALIZED    = 0,
        NETREG_REGISTERED       = 1,    /* register_netdevice has completed */
        NETREG_UNREGISTERING    = 2,    /* unregister_netdevice was called */
        NETREG_UNREGISTERED     = 3,    /* unregister todo has completed */
        NETREG_RELEASED         = 4,    /* free_netdev was called */
        NETREG_DUMMY            = 5,    /* dummy device used for NAPI poll */
};

/**
 *        struct net_device - The DEVICE structure.
 *
 *        Actually, this whole structure is a big mistake.  It mixes I/O
 *        data with strictly "high-level" data, and it has to know about
 *        almost every data structure used in the INET module.
 *
 *        @priv_flags:        flags invisible to userspace defined as bits, see
 *                        enum netdev_priv_flags for the definitions
 *        @lltx:                device supports lockless Tx. Deprecated for real HW
 *                        drivers. Mainly used by logical interfaces, such as
 *                        bonding and tunnels
 *        @netmem_tx:        device supports netmem_tx.
 *
 *        @name:        This is the first field of the "visible" part of this structure
 *                (i.e. as seen by users in the "Space.c" file).  It is the name
 *                of the interface.
 *
 *        @name_node:        Name hashlist node
 *        @ifalias:        SNMP alias
 *        @mem_end:        Shared memory end
 *        @mem_start:        Shared memory start
 *        @base_addr:        Device I/O address
 *        @irq:                Device IRQ number
 *
 *        @state:                Generic network queuing layer state, see netdev_state_t
 *        @dev_list:        The global list of network devices
 *        @napi_list:        List entry used for polling NAPI devices
 *        @unreg_list:        List entry when we are unregistering the
 *                        device; see the function unregister_netdev
 *        @close_list:        List entry used when we are closing the device
 *        @ptype_all:     Device-specific packet handlers for all protocols
 *        @ptype_specific: Device-specific, protocol-specific packet handlers
 *
 *        @adj_list:        Directly linked devices, like slaves for bonding
 *        @features:        Currently active device features
 *        @hw_features:        User-changeable features
 *
 *        @wanted_features:        User-requested features
 *        @vlan_features:                Mask of features inheritable by VLAN devices
 *
 *        @hw_enc_features:        Mask of features inherited by encapsulating devices
 *                                This field indicates what encapsulation
 *                                offloads the hardware is capable of doing,
 *                                and drivers will need to set them appropriately.
 *
 *        @mpls_features:        Mask of features inheritable by MPLS
 *        @gso_partial_features: value(s) from NETIF_F_GSO\*
 *
 *        @ifindex:        interface index
 *        @group:                The group the device belongs to
 *
 *        @stats:                Statistics struct, which was left as a legacy, use
 *                        rtnl_link_stats64 instead
 *
 *        @core_stats:        core networking counters,
 *                        do not use this in drivers
 *        @carrier_up_count:        Number of times the carrier has been up
 *        @carrier_down_count:        Number of times the carrier has been down
 *
 *        @wireless_handlers:        List of functions to handle Wireless Extensions,
 *                                instead of ioctl,
 *                                see <net/iw_handler.h> for details.
 *
 *        @netdev_ops:        Includes several pointers to callbacks,
 *                        if one wants to override the ndo_*() functions
 *        @xdp_metadata_ops:        Includes pointers to XDP metadata callbacks.
 *        @xsk_tx_metadata_ops:        Includes pointers to AF_XDP TX metadata callbacks.
 *        @ethtool_ops:        Management operations
 *        @l3mdev_ops:        Layer 3 master device operations
 *        @ndisc_ops:        Includes callbacks for different IPv6 neighbour
 *                        discovery handling. Necessary for e.g. 6LoWPAN.
 *        @xfrmdev_ops:        Transformation offload operations
 *        @tlsdev_ops:        Transport Layer Security offload operations
 *        @header_ops:        Includes callbacks for creating, parsing, caching, etc.
 *                        of Layer 2 headers.
 *
 *        @flags:                Interface flags (a la BSD)
 *        @xdp_features:        XDP capability supported by the device
 *        @gflags:        Global flags (kept as legacy)
 *        @priv_len:        Size of the ->priv flexible array
 *        @priv:                Flexible array containing private data
 *        @operstate:        RFC2863 operstate
 *        @link_mode:        Mapping policy to operstate
 *        @if_port:        Selectable AUI, TP, ...
 *        @dma:                DMA channel
 *        @mtu:                Interface MTU value
 *        @min_mtu:        Interface Minimum MTU value
 *        @max_mtu:        Interface Maximum MTU value
 *        @type:                Interface hardware type
 *        @hard_header_len: Maximum hardware header length.
 *        @min_header_len:  Minimum hardware header length
 *
 *        @needed_headroom: Extra headroom the hardware may need, but not in all
 *                          cases can this be guaranteed
 *        @needed_tailroom: Extra tailroom the hardware may need, but not in all
 *                          cases can this be guaranteed. Some cases also use
 *                          LL_MAX_HEADER instead to allocate the skb
 *
 *        interface address info:
 *
 *         @perm_addr:                Permanent hw address
 *         @addr_assign_type:        Hw address assignment type
 *         @addr_len:                Hardware address length
 *        @upper_level:                Maximum depth level of upper devices.
 *        @lower_level:                Maximum depth level of lower devices.
 *        @threaded:                napi threaded state.
 *        @neigh_priv_len:        Used in neigh_alloc()
 *         @dev_id:                Used to differentiate devices that share
 *                                 the same link layer address
 *         @dev_port:                Used to differentiate devices that share
 *                                 the same function
 *        @addr_list_lock:        XXX: need comments on this one
 *        @name_assign_type:        network interface name assignment type
 *        @uc_promisc:                Counter that indicates promiscuous mode
 *                                has been enabled due to the need to listen to
 *                                additional unicast addresses in a device that
 *                                does not implement ndo_set_rx_mode()
 *        @uc:                        unicast mac addresses
 *        @mc:                        multicast mac addresses
 *        @dev_addrs:                list of device hw addresses
 *        @queues_kset:                Group of all Kobjects in the Tx and RX queues
 *        @promiscuity:                Number of times the NIC is told to work in
 *                                promiscuous mode; if it becomes 0 the NIC will
 *                                exit promiscuous mode
 *        @allmulti:                Counter, enables or disables allmulticast mode
 *
 *        @vlan_info:        VLAN info
 *        @dsa_ptr:        dsa specific data
 *        @tipc_ptr:        TIPC specific data
 *        @atalk_ptr:        AppleTalk link
 *        @ip_ptr:        IPv4 specific data
 *        @ip6_ptr:        IPv6 specific data
 *        @ax25_ptr:        AX.25 specific data
 *        @ieee80211_ptr:        IEEE 802.11 specific data, assign before registering
 *        @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network
 *                         device struct
 *        @mpls_ptr:        mpls_dev struct pointer
 *        @mctp_ptr:        MCTP specific data
 *        @psp_dev:        PSP crypto device registered for this netdev
 *
 *        @dev_addr:        Hw address (before bcast,
 *                        because most packets are unicast)
 *
 *        @_rx:                        Array of RX queues
 *        @num_rx_queues:                Number of RX queues
 *                                allocated at register_netdev() time
 *        @real_num_rx_queues:         Number of RX queues currently active in device
 *        @xdp_prog:                XDP sockets filter program pointer
 *
 *        @rx_handler:                handler for received packets
 *        @rx_handler_data:         XXX: need comments on this one
 *        @tcx_ingress:                BPF & clsact qdisc specific data for ingress processing
 *        @ingress_queue:                XXX: need comments on this one
 *        @nf_hooks_ingress:        netfilter hooks executed for ingress packets
 *        @broadcast:                hw bcast address
 *
 *        @rx_cpu_rmap:        CPU reverse-mapping for RX completion interrupts,
 *                        indexed by RX queue number. Assigned by driver.
 *                        This must only be set if the ndo_rx_flow_steer
 *                        operation is defined
 *        @index_hlist:                Device index hash chain
 *
 *        @_tx:                        Array of TX queues
 *        @num_tx_queues:                Number of TX queues allocated at alloc_netdev_mq() time
 *        @real_num_tx_queues:         Number of TX queues currently active in device
 *        @qdisc:                        Root qdisc from userspace point of view
 *        @tx_queue_len:                Max frames per queue allowed
 *        @tx_global_lock:         XXX: need comments on this one
 *        @xdp_bulkq:                XDP device bulk queue
 *        @xps_maps:                all CPUs/RXQs maps for XPS device
 *
 *        @tcx_egress:                BPF & clsact qdisc specific data for egress processing
 *        @nf_hooks_egress:        netfilter hooks executed for egress packets
 *        @qdisc_hash:                qdisc hash table
 *        @watchdog_timeo:        Represents the timeout that is used by
 *                                the watchdog (see dev_watchdog())
 *        @watchdog_timer:        Watchdog timer (a struct timer_list)
 *
 *        @proto_down_reason:        reason a netdev interface is held down
 *        @pcpu_refcnt:                Number of references to this device
 *        @dev_refcnt:                Number of references to this device
 *        @refcnt_tracker:        Tracker directory for tracked references to this device
 *        @todo_list:                Delayed register/unregister
 *        @link_watch_list:        XXX: need comments on this one
 *
 *        @reg_state:                Register/unregister state machine
 *        @dismantle:                Device is going to be freed
 *        @needs_free_netdev:        Should unregister perform free_netdev?
 *        @priv_destructor:        Called from unregister
 *        @npinfo:                XXX: need comments on this one
 *         @nd_net:                Network namespace this network device is inside
 *                                protected by @lock
 *
 *         @ml_priv:        Mid-layer private
 *        @ml_priv_type:  Mid-layer private type
 *
 *        @pcpu_stat_type:        Type of device statistics which the core should
 *                                allocate/free: none, lstats, tstats, dstats. none
 *                                means the driver is handling statistics allocation/
 *                                freeing internally.
 *        @lstats:                Loopback statistics: packets, bytes
 *        @tstats:                Tunnel statistics: RX/TX packets, RX/TX bytes
 *        @dstats:                Dummy statistics: RX/TX/drop packets, RX/TX bytes
 *
 *        @garp_port:        GARP
 *        @mrp_port:        MRP
 *
 *        @dm_private:        Drop monitor private
 *
 *        @dev:                Class/net/name entry
 *        @sysfs_groups:        Space for optional device, statistics and wireless
 *                        sysfs groups
 *
 *        @sysfs_rx_queue_group:        Space for optional per-rx queue attributes
 *        @rtnl_link_ops:        Rtnl_link_ops
 *        @stat_ops:        Optional ops for queue-aware statistics
 *        @queue_mgmt_ops:        Optional ops for queue management
 *
 *        @gso_max_size:        Maximum size of generic segmentation offload
 *        @tso_max_size:        Device (as in HW) limit on the max TSO request size
 *        @gso_max_segs:        Maximum number of segments that can be passed to the
 *                        NIC for GSO
 *        @tso_max_segs:        Device (as in HW) limit on the max TSO segment count
 *         @gso_ipv4_max_size:        Maximum size of generic segmentation offload,
 *                                 for IPv4.
 *
 *        @dcbnl_ops:        Data Center Bridging netlink ops
 *        @num_tc:        Number of traffic classes in the net device
 *        @tc_to_txq:        XXX: need comments on this one
 *        @prio_tc_map:        XXX: need comments on this one
 *
 *        @fcoe_ddp_xid:        Max exchange id for FCoE LRO by ddp
 *
 *        @priomap:        XXX: need comments on this one
 *        @link_topo:        Physical link topology tracking attached PHYs
 *        @phydev:        Physical device may attach itself
 *                        for hardware timestamping
 *        @sfp_bus:        attached &struct sfp_bus structure.
 *
 *        @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
 *
 *        @proto_down:        protocol port state information can be sent to the
 *                        switch driver and used to set the phys state of the
 *                        switch port.
 *
 *        @irq_affinity_auto: driver wants the core to store and re-assign the IRQ
 *                            affinity. Set by netif_enable_irq_affinity(), then
 *                            the driver must create a persistent napi by
 *                            netif_napi_add_config() and finally bind the napi to
 *                            IRQ (via netif_napi_set_irq()).
 *
 *        @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap.
 *                           Set by calling netif_enable_cpu_rmap().
 *
 *        @see_all_hwtstamp_requests: device wants to see calls to
 *                        ndo_hwtstamp_set() for all timestamp requests
 *                        regardless of source, even if those aren't
 *                        HWTSTAMP_SOURCE_NETDEV
 *        @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN
 *        @netns_immutable: interface can't change network namespaces
 *        @fcoe_mtu:        device supports maximum FCoE MTU, 2158 bytes
 *
 *        @net_notifier_list:        List of per-net netdev notifier block
 *                                that follow this device when it is moved
 *                                to another network namespace.
 *
 *        @macsec_ops:    MACsec offloading ops
 *
 *        @udp_tunnel_nic_info:        static structure describing the UDP tunnel
 *                                offload capabilities of the device
 *        @udp_tunnel_nic:        UDP tunnel offload state
 *        @ethtool:        ethtool related state
 *        @xdp_state:                stores info on attached XDP BPF programs
 *
 *        @nested_level:        Used as a parameter of spin_lock_nested() of
 *                        dev->addr_list_lock.
 *        @unlink_list:        As netif_addr_lock() can be called recursively,
 *                        keep a list of interfaces to be deleted.
 *        @gro_max_size:        Maximum size of aggregated packet in generic
 *                        receive offload (GRO)
 *         @gro_ipv4_max_size:        Maximum size of aggregated packet in generic
 *                                 receive offload (GRO), for IPv4.
 *        @xdp_zc_max_segs:        Maximum number of segments supported by AF_XDP
 *                                zero copy driver
 *
 *        @dev_addr_shadow:        Copy of @dev_addr to catch direct writes.
 *        @linkwatch_dev_tracker:        refcount tracker used by linkwatch.
 *        @watchdog_dev_tracker:        refcount tracker used by watchdog.
 *        @dev_registered_tracker:        tracker for reference held while
 *                                        registered
 *        @offload_xstats_l3:        L3 HW stats for this netdevice.
 *
 *        @devlink_port:        Pointer to related devlink port structure.
 *                        Assigned by a driver before netdev registration using
 *                        SET_NETDEV_DEVLINK_PORT macro. This pointer is static
 *                        during the time netdevice is registered.
 *
 *        @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
 *                   where the clock is recovered.
 *
 *        @max_pacing_offload_horizon: max EDT offload horizon in nsec.
 *        @napi_config: An array of napi_config structures containing per-NAPI
 *                      settings.
 *        @num_napi_configs:        number of allocated NAPI config structs,
 *                always >= max(num_rx_queues, num_tx_queues).
 *        @gro_flush_timeout:        timeout for GRO layer in NAPI
 *        @napi_defer_hard_irqs:        If not zero, provides a counter that would
 *                                allow to avoid NIC hard IRQ, on busy queues.
 *
 *        @neighbours:        List heads pointing to this device's neighbours'
 *                        dev_list, one per address-family.
 *        @hwprov: Tracks which PTP performs hardware packet time stamping.
 *
 *        FIXME: cleanup struct net_device such that network protocol info
 *        moves out.
 */

struct net_device {
        /* Cacheline organization can be found documented in
         * Documentation/networking/net_cachelines/net_device.rst.
         * Please update the document when adding new fields.
         */

        /* TX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_tx);
        struct_group(priv_flags_fast,
                unsigned long                priv_flags:32;
                unsigned long                lltx:1;
                unsigned long                netmem_tx:1;
        );
        const struct net_device_ops *netdev_ops;
        const struct header_ops *header_ops;
        struct netdev_queue        *_tx;
        netdev_features_t        gso_partial_features;
        unsigned int                real_num_tx_queues;
        unsigned int                gso_max_size;
        unsigned int                gso_ipv4_max_size;
        u16                        gso_max_segs;
        s16                        num_tc;
        /* Note : dev->mtu is often read without holding a lock.
         * Writers usually hold RTNL.
         * It is recommended to use READ_ONCE() to annotate the reads,
         * and to use WRITE_ONCE() to annotate the writes.
         */
        unsigned int                mtu;
        unsigned short                needed_headroom;
        struct netdev_tc_txq        tc_to_txq[TC_MAX_QUEUE];
#ifdef CONFIG_XPS
        struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        struct nf_hook_entries __rcu *nf_hooks_egress;
#endif
#ifdef CONFIG_NET_XGRESS
        struct bpf_mprog_entry __rcu *tcx_egress;
#endif
        __cacheline_group_end(net_device_read_tx);

        /* TXRX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_txrx);
        union {
                struct pcpu_lstats __percpu                *lstats;
                struct pcpu_sw_netstats __percpu        *tstats;
                struct pcpu_dstats __percpu                *dstats;
        };
        unsigned long                state;
        unsigned int                flags;
        unsigned short                hard_header_len;
        netdev_features_t        features;
        struct inet6_dev __rcu        *ip6_ptr;
        __cacheline_group_end(net_device_read_txrx);

        /* RX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_rx);
        struct bpf_prog __rcu        *xdp_prog;
        struct list_head        ptype_specific;
        int                        ifindex;
        unsigned int                real_num_rx_queues;
        struct netdev_rx_queue        *_rx;
        unsigned int                gro_max_size;
        unsigned int                gro_ipv4_max_size;
        rx_handler_func_t __rcu        *rx_handler;
        void __rcu                *rx_handler_data;
        possible_net_t                        nd_net;
#ifdef CONFIG_NETPOLL
        struct netpoll_info __rcu        *npinfo;
#endif
#ifdef CONFIG_NET_XGRESS
        struct bpf_mprog_entry __rcu *tcx_ingress;
#endif
        __cacheline_group_end(net_device_read_rx);

        char                        name[IFNAMSIZ];
        struct netdev_name_node        *name_node;
        struct dev_ifalias        __rcu *ifalias;
        /*
         *        I/O specific fields
         *        FIXME: Merge these and struct ifmap into one
         */
        unsigned long                mem_end;
        unsigned long                mem_start;
        unsigned long                base_addr;

        /*
         *        Some hardware also needs these fields (state,dev_list,
         *        napi_list,unreg_list,close_list) but they are not
         *        part of the usual set specified in Space.c.
         */


        struct list_head        dev_list;
        struct list_head        napi_list;
        struct list_head        unreg_list;
        struct list_head        close_list;
        struct list_head        ptype_all;

        struct {
                struct list_head upper;
                struct list_head lower;
        } adj_list;

        /* Read-mostly cache-line for fast-path access */
        xdp_features_t                xdp_features;
        const struct xdp_metadata_ops *xdp_metadata_ops;
        const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
        unsigned short                gflags;

        unsigned short                needed_tailroom;

        netdev_features_t        hw_features;
        netdev_features_t        wanted_features;
        netdev_features_t        vlan_features;
        netdev_features_t        hw_enc_features;
        netdev_features_t        mpls_features;

        unsigned int                min_mtu;
        unsigned int                max_mtu;
        unsigned short                type;
        unsigned char                min_header_len;
        unsigned char                name_assign_type;

        int                        group;

        struct net_device_stats        stats; /* not used by modern drivers */

        struct net_device_core_stats __percpu *core_stats;

        /* Stats to monitor link on/off, flapping */
        atomic_t                carrier_up_count;
        atomic_t                carrier_down_count;

#ifdef CONFIG_WIRELESS_EXT
        const struct iw_handler_def *wireless_handlers;
#endif
        const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_L3_MASTER_DEV
        const struct l3mdev_ops        *l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        const struct ndisc_ops *ndisc_ops;
#endif

#ifdef CONFIG_XFRM_OFFLOAD
        const struct xfrmdev_ops *xfrmdev_ops;
#endif

#if IS_ENABLED(CONFIG_TLS_DEVICE)
        const struct tlsdev_ops *tlsdev_ops;
#endif

        unsigned int                operstate;
        unsigned char                link_mode;

        unsigned char                if_port;
        unsigned char                dma;

        /* Interface address info. */
        unsigned char                perm_addr[MAX_ADDR_LEN];
        unsigned char                addr_assign_type;
        unsigned char                addr_len;
        unsigned char                upper_level;
        unsigned char                lower_level;
        u8                        threaded;

        unsigned short                neigh_priv_len;
        unsigned short          dev_id;
        unsigned short          dev_port;
        int                        irq;
        u32                        priv_len;

        spinlock_t                addr_list_lock;

        struct netdev_hw_addr_list        uc;
        struct netdev_hw_addr_list        mc;
        struct netdev_hw_addr_list        dev_addrs;

#ifdef CONFIG_SYSFS
        struct kset                *queues_kset;
#endif
#ifdef CONFIG_LOCKDEP
        struct list_head        unlink_list;
#endif
        unsigned int                promiscuity;
        unsigned int                allmulti;
        bool                        uc_promisc;
#ifdef CONFIG_LOCKDEP
        unsigned char                nested_level;
#endif


        /* Protocol-specific pointers */
        struct in_device __rcu        *ip_ptr;
        /** @fib_nh_head: nexthops associated with this netdev */
        struct hlist_head        fib_nh_head;

#if IS_ENABLED(CONFIG_VLAN_8021Q)
        struct vlan_info __rcu        *vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
        struct dsa_port                *dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
        struct tipc_bearer __rcu *tipc_ptr;
#endif
#if IS_ENABLED(CONFIG_ATALK)
        void                         *atalk_ptr;
#endif
#if IS_ENABLED(CONFIG_AX25)
        struct ax25_dev        __rcu        *ax25_ptr;
#endif
#if IS_ENABLED(CONFIG_CFG80211)
        struct wireless_dev        *ieee80211_ptr;
#endif
#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
        struct wpan_dev                *ieee802154_ptr;
#endif
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
        struct mpls_dev __rcu        *mpls_ptr;
#endif
#if IS_ENABLED(CONFIG_MCTP)
        struct mctp_dev __rcu        *mctp_ptr;
#endif
#if IS_ENABLED(CONFIG_INET_PSP)
        struct psp_dev __rcu        *psp_dev;
#endif

/*
 * Cache lines mostly used on receive path (including eth_type_trans())
 */
        /* Interface address info used in eth_type_trans() */
        const unsigned char        *dev_addr;

        unsigned int                num_rx_queues;
#define GRO_LEGACY_MAX_SIZE        65536u
/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
 * and shinfo->gso_segs is a 16bit field.
 */
#define GRO_MAX_SIZE                (8 * 65535u)
        unsigned int                xdp_zc_max_segs;
        struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
        struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif

        unsigned char                broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap                *rx_cpu_rmap;
#endif
        struct hlist_node        index_hlist;

/*
 * Cache lines mostly used on transmit path
 */
        unsigned int                num_tx_queues;
        struct Qdisc __rcu        *qdisc;
        unsigned int                tx_queue_len;
        spinlock_t                tx_global_lock;

        struct xdp_dev_bulk_queue __percpu *xdp_bulkq;

#ifdef CONFIG_NET_SCHED
        DECLARE_HASHTABLE        (qdisc_hash, 4);
#endif
        /* These may be needed for future network-power-down code. */
        struct timer_list        watchdog_timer;
        int                        watchdog_timeo;

        u32                     proto_down_reason;

        struct list_head        todo_list;

#ifdef CONFIG_PCPU_DEV_REFCNT
        int __percpu                *pcpu_refcnt;
#else
        refcount_t                dev_refcnt;
#endif
        struct ref_tracker_dir        refcnt_tracker;

        struct list_head        link_watch_list;

        u8 reg_state;

        bool dismantle;

        /** @moving_ns: device is changing netns, protected by @lock */
        bool moving_ns;
        /** @rtnl_link_initializing: Device being created, suppress events */
        bool rtnl_link_initializing;

        bool needs_free_netdev;
        void (*priv_destructor)(struct net_device *dev);

        /* mid-layer private */
        void                                *ml_priv;
        enum netdev_ml_priv_type        ml_priv_type;

        enum netdev_stat_type                pcpu_stat_type:8;

#if IS_ENABLED(CONFIG_GARP)
        struct garp_port __rcu        *garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
        struct mrp_port __rcu        *mrp_port;
#endif
#if IS_ENABLED(CONFIG_NET_DROP_MONITOR)
        struct dm_hw_stat_delta __rcu *dm_private;
#endif
        struct device                dev;
        const struct attribute_group *sysfs_groups[5];
        const struct attribute_group *sysfs_rx_queue_group;

        const struct rtnl_link_ops *rtnl_link_ops;

        const struct netdev_stat_ops *stat_ops;

        const struct netdev_queue_mgmt_ops *queue_mgmt_ops;

        /* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SEGS                65535u
#define GSO_LEGACY_MAX_SIZE        65536u
/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
 * and shinfo->gso_segs is a 16bit field.
 */
#define GSO_MAX_SIZE                (8 * GSO_MAX_SEGS)

#define TSO_LEGACY_MAX_SIZE        65536
#define TSO_MAX_SIZE                UINT_MAX
        unsigned int                tso_max_size;
#define TSO_MAX_SEGS                U16_MAX
        u16                        tso_max_segs;

#ifdef CONFIG_DCB
        const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
        u8                        prio_tc_map[TC_BITMASK + 1];

#if IS_ENABLED(CONFIG_FCOE)
        unsigned int                fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
        struct netprio_map __rcu *priomap;
#endif
        struct phy_link_topology        *link_topo;
        struct phy_device        *phydev;
        struct sfp_bus                *sfp_bus;
        struct lock_class_key        *qdisc_tx_busylock;
        bool                        proto_down;
        bool                        irq_affinity_auto;
        bool                        rx_cpu_rmap_auto;

        /* priv_flags_slow, ungrouped to save space */
        unsigned long                see_all_hwtstamp_requests:1;
        unsigned long                change_proto_down:1;
        unsigned long                netns_immutable:1;
        unsigned long                fcoe_mtu:1;

        struct list_head        net_notifier_list;

#if IS_ENABLED(CONFIG_MACSEC)
        /* MACsec management functions */
        const struct macsec_ops *macsec_ops;
#endif
        const struct udp_tunnel_nic_info        *udp_tunnel_nic_info;
        struct udp_tunnel_nic        *udp_tunnel_nic;

        /** @cfg: net_device queue-related configuration */
        struct netdev_config        *cfg;
        /**
         * @cfg_pending: same as @cfg but when device is being actively
         *        reconfigured includes any changes to the configuration
         *        requested by the user, but which may or may not be rejected.
         */
        struct netdev_config        *cfg_pending;
        struct ethtool_netdev_state *ethtool;

        /* protected by rtnl_lock */
        struct bpf_xdp_entity        xdp_state[__MAX_XDP_MODE];

        u8 dev_addr_shadow[MAX_ADDR_LEN];
        netdevice_tracker        linkwatch_dev_tracker;
        netdevice_tracker        watchdog_dev_tracker;
        netdevice_tracker        dev_registered_tracker;
        struct rtnl_hw_stats64        *offload_xstats_l3;

        struct devlink_port        *devlink_port;

#if IS_ENABLED(CONFIG_DPLL)
        struct dpll_pin        __rcu        *dpll_pin;
#endif
#if IS_ENABLED(CONFIG_PAGE_POOL)
        /** @page_pools: page pools created for this netdevice */
        struct hlist_head        page_pools;
#endif

        /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
        struct dim_irq_moder        *irq_moder;

        u64                        max_pacing_offload_horizon;
        struct napi_config        *napi_config;
        u32                        num_napi_configs;
        u32                        napi_defer_hard_irqs;
        unsigned long                gro_flush_timeout;

        /**
         * @up: copy of @state's IFF_UP, but safe to read with just @lock.
         *        May report false negatives while the device is being opened
         *        or closed (@lock does not protect .ndo_open, or .ndo_close).
         */
        bool                        up;

        /**
         * @request_ops_lock: request the core to run all @netdev_ops and
         * @ethtool_ops under the @lock.
         */
        bool                        request_ops_lock;

        /**
         * @lock: netdev-scope lock, protects a small selection of fields.
         * Should always be taken using netdev_lock() / netdev_unlock() helpers.
         * Drivers are free to use it for other protection.
         *
         * For the drivers that implement shaper or queue API, the scope
         * of this lock is expanded to cover most ndo/queue/ethtool/sysfs
         * operations. Drivers may opt-in to this behavior by setting
         * @request_ops_lock.
         *
         * @lock protection mixes with rtnl_lock in multiple ways, fields are
         * either:
         *
         * - simply protected by the instance @lock;
         *
         * - double protected - writers hold both locks, readers hold either;
         *
         * - ops protected - protected by the lock held around the NDOs
         *   and other callbacks, that is the instance lock on devices for
         *   which netdev_need_ops_lock() returns true, otherwise by rtnl_lock;
         *
         * - double ops protected - always protected by rtnl_lock but for
         *   devices for which netdev_need_ops_lock() returns true - also
         *   the instance lock.
         *
         * Simply protects:
         *        @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list,
         *        @net_shaper_hierarchy, @reg_state, @threaded
         *
         * Double protects:
         *        @up, @moving_ns, @nd_net, @xdp_features
         *
         * Double ops protects:
         *        @real_num_rx_queues, @real_num_tx_queues
         *
         * Also protects some fields in:
         *        struct napi_struct, struct netdev_queue, struct netdev_rx_queue
         *
         * Ordering: take after rtnl_lock.
         */
        struct mutex                lock;

#if IS_ENABLED(CONFIG_NET_SHAPER)
        /**
         * @net_shaper_hierarchy: data tracking the current shaper status
         *  see include/net/net_shapers.h
         */
        struct net_shaper_hierarchy *net_shaper_hierarchy;
#endif

        struct hlist_head neighbours[NEIGH_NR_TABLES];

        struct hwtstamp_provider __rcu        *hwprov;

        u8                        priv[] ____cacheline_aligned
                                       __counted_by(priv_len);
} ____cacheline_aligned;
/* Map a pointer to the 'dev' member back to its containing struct net_device. */
#define to_net_dev(d) container_of(d, struct net_device, dev)

/*
 * Drivers should use this to assign a devlink port instance to a netdevice
 * before registering the netdevice. The devlink_port is therefore static
 * for the lifetime of the netdev once it is registered.
 *
 * Triggers WARN_ON() when (dev)->reg_state is not NETREG_UNINITIALIZED; the
 * assignment is performed either way. The expression evaluates to the value
 * assigned to (dev)->devlink_port. Note that @dev is evaluated twice.
 */
#define SET_NETDEV_DEVLINK_PORT(dev, port)                        \
({                                                                \
        WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED);        \
        ((dev)->devlink_port = (port));                                \
})

/**
 * netif_elide_gro - report whether GRO should be skipped for a device
 * @dev: network device
 *
 * Return: true when NETIF_F_GRO is not set in @dev->features or when
 * @dev->xdp_prog is set; false otherwise.
 */
static inline bool netif_elide_gro(const struct net_device *dev)
{
        return !(dev->features & NETIF_F_GRO) || dev->xdp_prog;
}

/*
 * NOTE(review): presumably a byte alignment used for net_device related
 * allocations; no user is visible in this part of the file - confirm.
 */
#define        NETDEV_ALIGN                32

/**
 * netdev_get_prio_tc_map - read the prio_tc_map entry for a priority
 * @dev: network device
 * @prio: priority; only the bits selected by TC_BITMASK are used
 *
 * Return: the @dev->prio_tc_map element at index (@prio & TC_BITMASK).
 */
static inline
int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
{
        const u32 slot = prio & TC_BITMASK;

        return dev->prio_tc_map[slot];
}

/**
 * netdev_set_prio_tc_map - store a prio_tc_map entry for a priority
 * @dev: network device
 * @prio: priority; only the bits selected by TC_BITMASK are used as index
 * @tc: traffic class to store (masked with TC_BITMASK)
 *
 * Return: 0 after storing the entry, or -EINVAL without touching the map
 * when @tc is not below @dev->num_tc.
 */
static inline
int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
{
        if (tc < dev->num_tc) {
                dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK;
                return 0;
        }

        return -EINVAL;
}

/*
 * Traffic class (tc) configuration helpers. Only declared here; the
 * implementations live outside this header.
 * NOTE(review): locking requirements and error codes are not visible from
 * these prototypes - check the definitions before relying on them.
 */
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq);
void netdev_reset_tc(struct net_device *dev);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
int netdev_set_num_tc(struct net_device *dev, u8 num_tc);

/**
 * netdev_get_num_tc - read the num_tc field of a device
 * @dev: network device
 *
 * Return: the current value of @dev->num_tc.
 */
static inline int netdev_get_num_tc(struct net_device *dev)
{
        return dev->num_tc;
}

static inline void net_prefetch(void *p)
{
        prefetch(p);
#if L1_CACHE_BYTES < 128
        prefetch((u8 *)p + L1_CACHE_BYTES);
#endif
}

static inline void net_prefetchw(void *p)
{
        prefetchw(p);
#if L1_CACHE_BYTES < 128
        prefetchw((u8 *)p + L1_CACHE_BYTES);
#endif
}

/*
 * Subordinate ("sb") channel helpers operating on a device and, for the
 * bind/unbind calls, a second device @sb_dev. Declarations only.
 * NOTE(review): the relationship between @dev and @sb_dev and the required
 * locking are not visible here - consult the definitions.
 */
void netdev_unbind_sb_channel(struct net_device *dev,
                              struct net_device *sb_dev);
int netdev_bind_sb_channel_queue(struct net_device *dev,
                                 struct net_device *sb_dev,
                                 u8 tc, u16 count, u16 offset);
int netdev_set_sb_channel(struct net_device *dev, u16 channel);
static inline int netdev_get_sb_channel(struct net_device *dev)
{
        return max_t(int, -dev->num_tc, 0);
}

/**
 * netdev_get_tx_queue - get a pointer to one element of the device's _tx array
 * @dev: network device
 * @index: position in @dev->_tx
 *
 * DEBUG_NET_WARN_ON_ONCE() is evaluated with the condition
 * @index >= @dev->num_tx_queues; the element address is returned regardless.
 *
 * Return: &@dev->_tx[@index].
 */
static inline struct netdev_queue *
netdev_get_tx_queue(const struct net_device *dev, unsigned int index)
{
        struct netdev_queue *txq = &dev->_tx[index];

        DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues);
        return txq;
}

/**
 * skb_get_tx_queue - get the tx queue selected by an skb's queue mapping
 * @dev: network device
 * @skb: buffer whose skb_get_queue_mapping() value is used as the index
 *
 * Return: the result of netdev_get_tx_queue() for that index.
 */
static inline struct netdev_queue *
skb_get_tx_queue(const struct net_device *dev, const struct sk_buff *skb)
{
        unsigned int queue = skb_get_queue_mapping(skb);

        return netdev_get_tx_queue(dev, queue);
}

/**
 * netdev_for_each_tx_queue - invoke a callback on every tx queue of a device
 * @dev: network device
 * @f: callback, called as f(@dev, queue, @arg)
 * @arg: opaque pointer handed to @f unchanged
 *
 * Visits @dev->_tx[0] up to @dev->_tx[@dev->num_tx_queues - 1] in ascending
 * order. The queue count is re-read before every iteration.
 */
static inline void netdev_for_each_tx_queue(struct net_device *dev,
                                            void (*f)(struct net_device *,
                                                      struct netdev_queue *,
                                                      void *),
                                            void *arg)
{
        unsigned int idx = 0;

        while (idx < dev->num_tx_queues) {
                f(dev, &dev->_tx[idx], arg);
                idx++;
        }
}

/*
 * Tx queue selection entry points; declarations only. netdev_pick_tx()
 * returns a u16, netdev_core_pick_tx() returns a struct netdev_queue pointer.
 * NOTE(review): the selection policy and the role of @sb_dev are not
 * visible from here - see the definitions.
 */
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb,
                                         struct net_device *sb_dev);

/*
 * Headroom that the master device needs to take into account when
 * forwarding to @dev: 0 when IFF_PHONY_HEADROOM is set in dev->priv_flags,
 * otherwise dev->needed_headroom.
 */
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
{
        if (dev->priv_flags & IFF_PHONY_HEADROOM)
                return 0;

        return dev->needed_headroom;
}

/**
 * netdev_set_rx_headroom - forward an rx headroom request to the driver
 * @dev: network device
 * @new_hr: value handed to the driver callback
 *
 * Calls @dev->netdev_ops->ndo_set_rx_headroom(@dev, @new_hr) when the driver
 * provides that callback; does nothing otherwise.
 */
static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
{
        if (!dev->netdev_ops->ndo_set_rx_headroom)
                return;

        dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
}

/*
 * Set the device rx headroom to the dev's default.
 * Implemented by passing -1 as the new headroom to netdev_set_rx_headroom().
 */
static inline void netdev_reset_rx_headroom(struct net_device *dev)
{
        netdev_set_rx_headroom(dev, -1);
}

/**
 * netdev_get_ml_priv - fetch ml_priv if it has the expected type
 * @dev: network device
 * @type: type the caller expects @dev->ml_priv to have
 *
 * Return: @dev->ml_priv when @dev->ml_priv_type equals @type, NULL otherwise.
 */
static inline void *netdev_get_ml_priv(struct net_device *dev,
                                       enum netdev_ml_priv_type type)
{
        return dev->ml_priv_type == type ? dev->ml_priv : NULL;
}

/**
 * netdev_set_ml_priv - store ml_priv together with its type tag
 * @dev: network device
 * @ml_priv: pointer to store in @dev->ml_priv
 * @type: value to store in @dev->ml_priv_type
 *
 * Emits a WARN() when a non-zero @dev->ml_priv_type differs from @type, and
 * another when @dev->ml_priv_type is zero while @dev->ml_priv is already
 * set. Both fields are overwritten regardless of the warnings.
 */
static inline void netdev_set_ml_priv(struct net_device *dev,
                                      void *ml_priv,
                                      enum netdev_ml_priv_type type)
{
        WARN(dev->ml_priv_type && dev->ml_priv_type != type,
             "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n",
             dev->ml_priv_type, type);
        WARN(!dev->ml_priv_type && dev->ml_priv,
             "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n");

        dev->ml_priv = ml_priv;
        dev->ml_priv_type = type;
}

/*
 * Net namespace inlines
 */
/**
 * dev_net - get the network namespace recorded in a device
 * @dev: network device
 *
 * Return: the result of read_pnet() on @dev->nd_net.
 */
static inline
struct net *dev_net(const struct net_device *dev)
{
        return read_pnet(&dev->nd_net);
}

/**
 * dev_net_rcu - get the network namespace recorded in a device (RCU flavour)
 * @dev: network device
 *
 * NOTE(review): presumably intended for callers inside an RCU read-side
 * section, going by read_pnet_rcu() - confirm against that helper.
 *
 * Return: the result of read_pnet_rcu() on @dev->nd_net.
 */
static inline
struct net *dev_net_rcu(const struct net_device *dev)
{
        return read_pnet_rcu(&dev->nd_net);
}

/**
 * dev_net_set - record a network namespace in a device
 * @dev: network device
 * @net: namespace to store
 *
 * Stores @net into @dev->nd_net through write_pnet().
 */
static inline
void dev_net_set(struct net_device *dev, struct net *net)
{
        write_pnet(&dev->nd_net, net);
}

/**
 *        netdev_priv - access network device private data
 *        @dev: network device
 *
 * Get network device private data.
 *
 * Return: the address of @dev's trailing priv[] array as a void pointer.
 * The const qualifier of @dev is not carried over to the returned pointer.
 */
static inline void *netdev_priv(const struct net_device *dev)
{
        return (void *)dev->priv;
}

/* Set the sysfs physical device reference for the network logical device.
 * If set prior to registration, it will cause a symlink to be created
 * during initialization.
 * Expands to an assignment of @pdev to (net)->dev.parent.
 */
#define SET_NETDEV_DEV(net, pdev)        ((net)->dev.parent = (pdev))

/* Set the sysfs device type for the network logical device to allow
 * fine-grained identification of different network device types, for
 * example Ethernet, Wireless LAN, Bluetooth, WiMAX etc.
 * Expands to an assignment of @devtype to (net)->dev.type.
 */
#define SET_NETDEV_DEVTYPE(net, devtype)        ((net)->dev.type = (devtype))

/*
 * Declaration only: takes a device, a queue index, a queue type and a
 * NAPI context.
 * NOTE(review): presumably associates @napi with the given queue; locking
 * rules are not visible here - see the definition.
 */
void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
                          enum netdev_queue_type type,
                          struct napi_struct *napi);

/**
 * netdev_lock - acquire the per-device instance lock
 * @dev: network device
 *
 * Takes the @dev->lock mutex with mutex_lock(); pair with netdev_unlock().
 */
static inline void netdev_lock(struct net_device *dev)
{
        mutex_lock(&dev->lock);
}

/**
 * netdev_unlock - release the per-device instance lock
 * @dev: network device
 *
 * Releases the @dev->lock mutex with mutex_unlock().
 */
static inline void netdev_unlock(struct net_device *dev)
{
        mutex_unlock(&dev->lock);
}
/* Additional netdev_lock()-related helpers are in net/netdev_lock.h */

/* Implemented outside this header; called below with napi->dev's lock held. */
void netif_napi_set_irq_locked(struct napi_struct *napi, int irq);

/**
 * netif_napi_set_irq - locking wrapper around netif_napi_set_irq_locked()
 * @napi: NAPI context
 * @irq: value passed through to netif_napi_set_irq_locked()
 *
 * Takes the instance lock of @napi->dev with netdev_lock(), calls
 * netif_napi_set_irq_locked() and drops the lock again.
 */
static inline void netif_napi_set_irq(struct napi_struct *napi, int irq)
{
        netdev_lock(napi->dev);
        netif_napi_set_irq_locked(napi, irq);
        netdev_unlock(napi->dev);
}

/* Default NAPI poll() weight.
 * Device drivers are strongly advised not to use a bigger value.
 */
#define NAPI_POLL_WEIGHT 64

/* Implemented outside this header; called below with dev's lock held. */
void netif_napi_add_weight_locked(struct net_device *dev,
                                  struct napi_struct *napi,
                                  int (*poll)(struct napi_struct *, int),
                                  int weight);

/**
 * netif_napi_add_weight() - initialize a NAPI context with an explicit weight
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 * @weight: weight passed through to netif_napi_add_weight_locked()
 *
 * Takes the instance lock of @dev with netdev_lock(), calls
 * netif_napi_add_weight_locked() and drops the lock again.
 */
static inline void
netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
                      int (*poll)(struct napi_struct *, int), int weight)
{
        netdev_lock(dev);
        netif_napi_add_weight_locked(dev, napi, poll, weight);
        netdev_unlock(dev);
}

/**
 * netif_napi_add() - initialize a NAPI context
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to calling
 * *any* of the other NAPI-related functions.
 *
 * Equivalent to netif_napi_add_weight() with a weight of NAPI_POLL_WEIGHT,
 * which takes and releases the instance lock of @dev around the work.
 */
static inline void
netif_napi_add(struct net_device *dev, struct napi_struct *napi,
               int (*poll)(struct napi_struct *, int))
{
        netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/**
 * netif_napi_add_locked() - initialize a NAPI context without taking the lock
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * Calls netif_napi_add_weight_locked() with a weight of NAPI_POLL_WEIGHT and
 * does not touch the instance lock itself.
 * NOTE(review): presumably the caller must already hold the instance lock of
 * @dev, as the unlocked wrappers do - confirm against the definition.
 */
static inline void
netif_napi_add_locked(struct net_device *dev, struct napi_struct *napi,
                      int (*poll)(struct napi_struct *, int))
{
        netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/**
 * netif_napi_add_tx_weight() - initialize a Tx NAPI context with a weight
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 * @weight: weight passed through to netif_napi_add_weight()
 *
 * Sets the NAPI_STATE_NO_BUSY_POLL bit in @napi->state first, then registers
 * the context through netif_napi_add_weight().
 */
static inline void
netif_napi_add_tx_weight(struct net_device *dev,
                         struct napi_struct *napi,
                         int (*poll)(struct napi_struct *, int),
                         int weight)
{
        set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state);
        netif_napi_add_weight(dev, napi, poll, weight);
}

/**
 * netif_napi_add_config_locked() - initialize a NAPI context bound to a config slot
 * @dev: network device
 * @napi: NAPI context
 * @poll: polling function
 * @index: the NAPI index, also used to select the @dev->napi_config element
 *
 * Records @index in @napi->index, points @napi->config at
 * @dev->napi_config[@index] and then calls netif_napi_add_weight_locked()
 * with a weight of NAPI_POLL_WEIGHT.
 *
 * NOTE(review): @index is not range-checked here (e.g. against
 * @dev->num_napi_configs); callers are presumably expected to pass a valid
 * index - confirm.
 */
static inline void
netif_napi_add_config_locked(struct net_device *dev, struct napi_struct *napi,
                             int (*poll)(struct napi_struct *, int), int index)
{
        napi->index = index;
        napi->config = &dev->napi_config[index];
        netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/**
 * netif_napi_add_config - initialize a NAPI context with persistent config
 * @dev: network device
 * @napi: NAPI context
 * @poll: polling function
 * @index: the NAPI index
 *
 * Takes the instance lock of @dev with netdev_lock(), calls
 * netif_napi_add_config_locked() and drops the lock again.
 */
static inline void
netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
                      int (*poll)(struct napi_struct *, int), int index)
{
        netdev_lock(dev);
        netif_napi_add_config_locked(dev, napi, poll, index);
        netdev_unlock(dev);
}

/**
 * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * This variant of netif_napi_add() should be used by drivers that use NAPI
 * exclusively to poll a TX queue.
 * It keeps the context out of napi_hash[], so that hash table is not
 * polluted with Tx-only entries.
 *
 * Equivalent to netif_napi_add_tx_weight() with a weight of NAPI_POLL_WEIGHT.
 */
static inline void netif_napi_add_tx(struct net_device *dev,
                                     struct napi_struct *napi,
                                     int (*poll)(struct napi_struct *, int))
{
        netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/* Implemented outside this header; called below with napi->dev's lock held. */
void __netif_napi_del_locked(struct napi_struct *napi);

/**
 *  __netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 * Takes the instance lock of @napi->dev, calls __netif_napi_del_locked() and
 * drops the lock again. No RCU grace period is waited for here.
 *
 * Warning: caller must observe RCU grace period before freeing memory
 * containing @napi. Drivers might want to call this helper to combine
 * all the needed RCU grace periods into a single one.
 */
static inline void __netif_napi_del(struct napi_struct *napi)
{
        netdev_lock(napi->dev);
        __netif_napi_del_locked(napi);
        netdev_unlock(napi->dev);
}

/**
 *  netif_napi_del_locked - remove a NAPI context without taking the lock
 *  @napi: NAPI context
 *
 * Calls __netif_napi_del_locked() followed by synchronize_net(). The
 * instance lock is not touched by this helper.
 * NOTE(review): presumably the caller must already hold the instance lock
 * of @napi->dev - confirm against __netif_napi_del_locked().
 */
static inline void netif_napi_del_locked(struct napi_struct *napi)
{
        __netif_napi_del_locked(napi);
        synchronize_net();
}

/**
 *  netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 *  netif_napi_del() removes a NAPI context from the network device NAPI list.
 *  It calls __netif_napi_del(), which takes the instance lock of @napi->dev
 *  for the removal, and then calls synchronize_net() after the lock has
 *  been released.
 */
static inline void netif_napi_del(struct napi_struct *napi)
{
        __netif_napi_del(napi);
        synchronize_net();
}

/*
 * Declarations only.
 * NOTE(review): presumably related to the rx_cpu_rmap_auto and
 * irq_affinity_auto flags of struct net_device - confirm in the definitions.
 */
int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs);
void netif_set_affinity_auto(struct net_device *dev);

/*
 * struct packet_type - descriptor passed to dev_add_pack()/dev_remove_pack().
 *
 * @type is kept in network byte order (htons(ether_type)); a NULL @dev acts
 * as a wildcard. @func, @list_func and @id_match are callbacks supplied by
 * the owner of the structure, and @list links the descriptor into a list.
 * NOTE(review): when each callback is invoked, and the exact meaning of
 * @ignore_outgoing, @af_packet_net and @af_packet_priv, are not visible
 * here - see the users of this structure.
 */
struct packet_type {
        __be16                        type;        /* This is really htons(ether_type). */
        bool                        ignore_outgoing;
        struct net_device        *dev;        /* NULL is wildcarded here             */
        netdevice_tracker        dev_tracker;
        int                        (*func) (struct sk_buff *,
                                         struct net_device *,
                                         struct packet_type *,
                                         struct net_device *);
        void                        (*list_func) (struct list_head *,
                                              struct packet_type *,
                                              struct net_device *);
        bool                        (*id_match)(struct packet_type *ptype,
                                            struct sock *sk);
        struct net                *af_packet_net;
        void                        *af_packet_priv;
        struct list_head        list;
};

/*
 * struct offload_callbacks - callback set embedded in struct packet_offload.
 *
 * @gso_segment takes an skb and a feature mask and returns an skb pointer;
 * @gro_receive takes a list head and an skb and returns an skb pointer;
 * @gro_complete takes an skb and an offset (nhoff) and returns an int.
 * NOTE(review): the return-value conventions are not visible here - see
 * the callers of these hooks.
 */
struct offload_callbacks {
        struct sk_buff                *(*gso_segment)(struct sk_buff *skb,
                                                netdev_features_t features);
        struct sk_buff                *(*gro_receive)(struct list_head *head,
                                                struct sk_buff *skb);
        int                        (*gro_complete)(struct sk_buff *skb, int nhoff);
};

/*
 * struct packet_offload - descriptor passed to dev_add_offload() and
 * dev_remove_offload(). Bundles a network-byte-order ethertype, a priority,
 * the offload callbacks and a list linkage.
 * NOTE(review): how @priority orders entries is not visible here.
 */
struct packet_offload {
        __be16                         type;        /* This is really htons(ether_type). */
        u16                         priority;
        struct offload_callbacks callbacks;
        struct list_head         list;
};

/*
 * Often modified stats are per-CPU, others are shared (netdev->stats).
 *
 * Packet and byte counters for both directions plus the u64_stats_sync
 * object used around updates. The structure is aligned to
 * 4 * sizeof(u64) bytes.
 */
struct pcpu_sw_netstats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        struct u64_stats_sync   syncp;
} __aligned(4 * sizeof(u64));

/*
 * Per-CPU counters with drop accounting: packets, bytes and drops for rx
 * and tx, plus the u64_stats_sync object used around updates. The structure
 * is aligned to 8 * sizeof(u64) bytes.
 */
struct pcpu_dstats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        u64_stats_t                rx_drops;
        u64_stats_t                tx_drops;
        struct u64_stats_sync        syncp;
} __aligned(8 * sizeof(u64));

/*
 * Per-CPU counters with a single packets/bytes pair, plus the
 * u64_stats_sync object used around updates. The structure is aligned to
 * 2 * sizeof(u64) bytes.
 */
struct pcpu_lstats {
        u64_stats_t packets;
        u64_stats_t bytes;
        struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/*
 * Declaration only; results are returned through @packets and @bytes.
 * NOTE(review): presumably sums the per-CPU lstats of @dev - confirm.
 */
void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);

/**
 * dev_sw_netstats_rx_add - account one received packet in per-CPU tstats
 * @dev: network device whose tstats counters are updated
 * @len: amount added to rx_bytes
 *
 * On the current CPU's instance of @dev->tstats, adds @len to rx_bytes and
 * increments rx_packets inside a u64_stats update section.
 */
static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_sw_netstats *stats;

        stats = this_cpu_ptr(dev->tstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_add(&stats->rx_bytes, len);
        u64_stats_inc(&stats->rx_packets);
        u64_stats_update_end(&stats->syncp);
}

/**
 * dev_sw_netstats_tx_add - account transmitted packets in per-CPU tstats
 * @dev: network device whose tstats counters are updated
 * @packets: amount added to tx_packets
 * @len: amount added to tx_bytes
 *
 * On the current CPU's instance of @dev->tstats, adds @len to tx_bytes and
 * @packets to tx_packets inside a u64_stats update section.
 */
static inline void dev_sw_netstats_tx_add(struct net_device *dev,
                                          unsigned int packets,
                                          unsigned int len)
{
        struct pcpu_sw_netstats *stats;

        stats = this_cpu_ptr(dev->tstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_add(&stats->tx_bytes, len);
        u64_stats_add(&stats->tx_packets, packets);
        u64_stats_update_end(&stats->syncp);
}

/**
 * dev_lstats_add - account one packet in per-CPU lstats
 * @dev: network device whose lstats counters are updated
 * @len: amount added to bytes
 *
 * On the current CPU's instance of @dev->lstats, adds @len to bytes and
 * increments packets inside a u64_stats update section.
 */
static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_lstats *stats;

        stats = this_cpu_ptr(dev->lstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_add(&stats->bytes, len);
        u64_stats_inc(&stats->packets);
        u64_stats_update_end(&stats->syncp);
}

/**
 * dev_dstats_rx_add - account one received packet in per-CPU dstats
 * @dev: network device whose dstats counters are updated
 * @len: amount added to rx_bytes
 *
 * On the current CPU's instance of @dev->dstats, increments rx_packets and
 * adds @len to rx_bytes inside a u64_stats update section.
 */
static inline void dev_dstats_rx_add(struct net_device *dev,
                                     unsigned int len)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->rx_packets);
        u64_stats_add(&stats->rx_bytes, len);
        u64_stats_update_end(&stats->syncp);
}

/**
 * dev_dstats_rx_dropped - account one rx drop in per-CPU dstats
 * @dev: network device whose dstats counters are updated
 *
 * Increments rx_drops on the current CPU's instance of @dev->dstats inside
 * a u64_stats update section.
 */
static inline void dev_dstats_rx_dropped(struct net_device *dev)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->rx_drops);
        u64_stats_update_end(&stats->syncp);
}

/**
 * dev_dstats_rx_dropped_add - account several rx drops in per-CPU dstats
 * @dev: network device whose dstats counters are updated
 * @packets: amount added to rx_drops
 *
 * Adds @packets to rx_drops on the current CPU's instance of @dev->dstats
 * inside a u64_stats update section.
 */
static inline void dev_dstats_rx_dropped_add(struct net_device *dev,
                                             unsigned int packets)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_add(&stats->rx_drops, packets);
        u64_stats_update_end(&stats->syncp);
}

/**
 * dev_dstats_tx_add - account one transmitted packet in per-CPU dstats
 * @dev: network device whose dstats counters are updated
 * @len: amount added to tx_bytes
 *
 * On the current CPU's instance of @dev->dstats, increments tx_packets and
 * adds @len to tx_bytes inside a u64_stats update section.
 */
static inline void dev_dstats_tx_add(struct net_device *dev,
                                     unsigned int len)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->tx_packets);
        u64_stats_add(&stats->tx_bytes, len);
        u64_stats_update_end(&stats->syncp);
}

/**
 * dev_dstats_tx_dropped - account one tx drop in per-CPU dstats
 * @dev: network device whose dstats counters are updated
 *
 * Increments tx_drops on the current CPU's instance of @dev->dstats inside
 * a u64_stats update section.
 */
static inline void dev_dstats_tx_dropped(struct net_device *dev)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->tx_drops);
        u64_stats_update_end(&stats->syncp);
}

/*
 * Allocate per-CPU statistics of @type with alloc_percpu_gfp(@type, @gfp)
 * and, on success, run u64_stats_init() on the syncp member of every
 * possible CPU's instance. @type must therefore have a member named syncp.
 *
 * Evaluates to the __percpu pointer; a failed (NULL) allocation is passed
 * through without any initialisation.
 */
#define __netdev_alloc_pcpu_stats(type, gfp)                                \
({                                                                        \
        typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
        if (pcpu_stats)        {                                                \
                int __cpu;                                                \
                for_each_possible_cpu(__cpu) {                                \
                        typeof(type) *stat;                                \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);                \
                        u64_stats_init(&stat->syncp);                        \
                }                                                        \
        }                                                                \
        pcpu_stats;                                                        \
})

/* Same as __netdev_alloc_pcpu_stats() with the gfp argument fixed to GFP_KERNEL. */
#define netdev_alloc_pcpu_stats(type)                                        \
        __netdev_alloc_pcpu_stats(type, GFP_KERNEL)

/*
 * Variant that allocates through devm_alloc_percpu(@dev, @type). On success,
 * u64_stats_init() is run on the syncp member of every possible CPU's
 * instance, so @type must have a member named syncp.
 *
 * Evaluates to the __percpu pointer; a failed (NULL) allocation is passed
 * through without any initialisation.
 */
#define devm_netdev_alloc_pcpu_stats(dev, type)                                \
({                                                                        \
        typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\
        if (pcpu_stats) {                                                \
                int __cpu;                                                \
                for_each_possible_cpu(__cpu) {                                \
                        typeof(type) *stat;                                \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);                \
                        u64_stats_init(&stat->syncp);                        \
                }                                                        \
        }                                                                \
        pcpu_stats;                                                        \
})

/*
 * Values for netdev_lag_upper_info.tx_type. Enumerators are numbered
 * implicitly, starting at 0 with NETDEV_LAG_TX_TYPE_UNKNOWN.
 */
enum netdev_lag_tx_type {
        NETDEV_LAG_TX_TYPE_UNKNOWN,
        NETDEV_LAG_TX_TYPE_RANDOM,
        NETDEV_LAG_TX_TYPE_BROADCAST,
        NETDEV_LAG_TX_TYPE_ROUNDROBIN,
        NETDEV_LAG_TX_TYPE_ACTIVEBACKUP,
        NETDEV_LAG_TX_TYPE_HASH,
};

/*
 * Values for netdev_lag_upper_info.hash_type. Enumerators are numbered
 * implicitly, starting at 0 with NETDEV_LAG_HASH_NONE; note that
 * NETDEV_LAG_HASH_UNKNOWN is the last value, not the first.
 */
enum netdev_lag_hash {
        NETDEV_LAG_HASH_NONE,
        NETDEV_LAG_HASH_L2,
        NETDEV_LAG_HASH_L34,
        NETDEV_LAG_HASH_L23,
        NETDEV_LAG_HASH_E23,
        NETDEV_LAG_HASH_E34,
        NETDEV_LAG_HASH_VLAN_SRCMAC,
        NETDEV_LAG_HASH_UNKNOWN,
};

/*
 * Pairs a LAG transmit type with a hash type.
 * NOTE(review): presumably carried in the upper_info pointer of
 * netdev_notifier_changeupper_info - confirm at the users.
 */
struct netdev_lag_upper_info {
        enum netdev_lag_tx_type tx_type;
        enum netdev_lag_hash hash_type;
};

/*
 * Two single-bit flags, link_up and tx_enabled, packed into a u8.
 * NOTE(review): presumably carried in the lower_state_info pointer of
 * netdev_notifier_changelowerstate_info - confirm at the users.
 */
struct netdev_lag_lower_state_info {
        u8 link_up : 1,
           tx_enabled : 1;
};

#include <linux/notifier.h>

/* netdevice notifier chain event identifiers.
 *
 * Please remember to update netdev_cmd_to_name() and the rtnetlink
 * notification exclusion list in rtnetlink_event() when adding new types.
 *
 * Values start at 1 (NETDEV_UP) and increase by one per enumerator.
 */
enum netdev_cmd {
        NETDEV_UP        = 1,        /* For now you can't veto a device up/down */
        NETDEV_DOWN,
        NETDEV_REBOOT,                /* Tell a protocol stack a network interface
                                   detected a hardware crash and restarted
                                   - we can use this eg to kick tcp sessions
                                   once done */
        NETDEV_CHANGE,                /* Notify device state change */
        NETDEV_REGISTER,
        NETDEV_UNREGISTER,
        NETDEV_CHANGEMTU,        /* notify after mtu change happened */
        NETDEV_CHANGEADDR,        /* notify after the address change */
        NETDEV_PRE_CHANGEADDR,        /* notify before the address change */
        NETDEV_GOING_DOWN,
        NETDEV_CHANGENAME,
        NETDEV_FEAT_CHANGE,
        NETDEV_BONDING_FAILOVER,
        NETDEV_PRE_UP,
        NETDEV_PRE_TYPE_CHANGE,
        NETDEV_POST_TYPE_CHANGE,
        NETDEV_POST_INIT,
        NETDEV_PRE_UNINIT,
        NETDEV_RELEASE,
        NETDEV_NOTIFY_PEERS,
        NETDEV_JOIN,
        NETDEV_CHANGEUPPER,
        NETDEV_RESEND_IGMP,
        NETDEV_PRECHANGEMTU,        /* notify before mtu change happened */
        NETDEV_CHANGEINFODATA,
        NETDEV_BONDING_INFO,
        NETDEV_PRECHANGEUPPER,
        NETDEV_CHANGELOWERSTATE,
        NETDEV_UDP_TUNNEL_PUSH_INFO,
        NETDEV_UDP_TUNNEL_DROP_INFO,
        NETDEV_CHANGE_TX_QUEUE_LEN,
        NETDEV_CVLAN_FILTER_PUSH_INFO,
        NETDEV_CVLAN_FILTER_DROP_INFO,
        NETDEV_SVLAN_FILTER_PUSH_INFO,
        NETDEV_SVLAN_FILTER_DROP_INFO,
        NETDEV_OFFLOAD_XSTATS_ENABLE,
        NETDEV_OFFLOAD_XSTATS_DISABLE,
        NETDEV_OFFLOAD_XSTATS_REPORT_USED,
        NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
        NETDEV_XDP_FEAT_CHANGE,
};
/* Returns a string for @cmd; implemented outside this header. */
const char *netdev_cmd_to_name(enum netdev_cmd cmd);

/*
 * Notifier (un)registration, in three flavours: without a namespace
 * argument, with an explicit struct net, and with a device plus a
 * struct netdev_net_notifier. Declarations only.
 * NOTE(review): which events are replayed on registration, and the locking
 * taken, are not visible here - see the definitions.
 */
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb);
int unregister_netdevice_notifier_net(struct net *net,
                                      struct notifier_block *nb);
int register_netdevice_notifier_dev_net(struct net_device *dev,
                                        struct notifier_block *nb,
                                        struct netdev_net_notifier *nn);
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
                                          struct notifier_block *nb,
                                          struct netdev_net_notifier *nn);

/*
 * Base notifier payload: the device an event refers to and an optional
 * extack pointer. Embedded as the first member of the extended
 * netdev_notifier_*_info structures that follow.
 */
struct netdev_notifier_info {
        struct net_device        *dev;
        struct netlink_ext_ack        *extack;
};

/*
 * Base info followed by a union of event-specific extras; the union
 * currently has a single member, mtu.
 * NOTE(review): @info is presumably kept first so a pointer to this
 * structure can be handled as a struct netdev_notifier_info pointer.
 */
struct netdev_notifier_info_ext {
        struct netdev_notifier_info info; /* must be first */
        union {
                u32 mtu;
        } ext;
};

/*
 * Base info plus a flags_changed word.
 * NOTE(review): presumably the payload of NETDEV_CHANGE, with
 * flags_changed holding the device flags that changed - confirm at the
 * users.
 */
struct netdev_notifier_change_info {
        struct netdev_notifier_info info; /* must be first */
        unsigned int flags_changed;
};

/*
 * Base info plus a description of an upper-device link or unlink.
 * NOTE(review): presumably the payload of NETDEV_PRECHANGEUPPER and
 * NETDEV_CHANGEUPPER - confirm at the users.
 */
struct netdev_notifier_changeupper_info {
        struct netdev_notifier_info info; /* must be first */
        struct net_device *upper_dev; /* new upper dev */
        bool master; /* is upper dev master */
        bool linking; /* is the notification for link or unlink */
        void *upper_info; /* upper dev info */
};

/*
 * Base info plus an untyped pointer to the lower device's state.
 * NOTE(review): presumably the payload of NETDEV_CHANGELOWERSTATE; the
 * concrete type behind lower_state_info is chosen by the sender.
 */
struct netdev_notifier_changelowerstate_info {
        struct netdev_notifier_info info; /* must be first */
        void *lower_state_info; /* is lower dev state */
};

/*
 * Base info plus a read-only pointer to address bytes.
 * NOTE(review): presumably the payload of NETDEV_PRE_CHANGEADDR, with
 * dev_addr being the address about to be set - confirm at the users.
 */
struct netdev_notifier_pre_changeaddr_info {
        struct netdev_notifier_info info; /* must be first */
        const unsigned char *dev_addr;
};

/*
 * Selector passed to the netdev_offload_xstats_*() functions. L3 is the
 * only type defined here and has the value 1.
 */
enum netdev_offload_xstats_type {
        NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
};

/*
 * Base info, the xstats type concerned and an anonymous union whose active
 * member depends on the event, as noted on each member.
 */
struct netdev_notifier_offload_xstats_info {
        struct netdev_notifier_info info; /* must be first */
        enum netdev_offload_xstats_type type;

        union {
                /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */
                struct netdev_notifier_offload_xstats_rd *report_delta;
                /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */
                struct netdev_notifier_offload_xstats_ru *report_used;
        };
};

/*
 * Offload extended-statistics API, keyed by enum netdev_offload_xstats_type.
 * Declarations only. The report_delta()/report_used() functions take the
 * structures referenced from struct netdev_notifier_offload_xstats_info.
 * NOTE(review): locking requirements and return codes are not visible
 * here - see the definitions.
 */
int netdev_offload_xstats_enable(struct net_device *dev,
                                 enum netdev_offload_xstats_type type,
                                 struct netlink_ext_ack *extack);
int netdev_offload_xstats_disable(struct net_device *dev,
                                  enum netdev_offload_xstats_type type);
bool netdev_offload_xstats_enabled(const struct net_device *dev,
                                   enum netdev_offload_xstats_type type);
int netdev_offload_xstats_get(struct net_device *dev,
                              enum netdev_offload_xstats_type type,
                              struct rtnl_hw_stats64 *stats, bool *used,
                              struct netlink_ext_ack *extack);
void
netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd,
                                   const struct rtnl_hw_stats64 *stats);
void
netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru);
void netdev_offload_xstats_push_delta(struct net_device *dev,
                                      enum netdev_offload_xstats_type type,
                                      const struct rtnl_hw_stats64 *stats);

/**
 * netdev_notifier_info_init - fill in a base notifier info structure
 * @info: structure to initialise
 * @dev: device stored in @info->dev
 *
 * Sets @info->dev to @dev and @info->extack to NULL.
 */
static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
                                             struct net_device *dev)
{
        info->dev = dev;
        info->extack = NULL;
}

/**
 * netdev_notifier_info_to_dev - get the device from a notifier info
 * @info: notifier info
 *
 * Return: @info->dev.
 */
static inline struct net_device *
netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
{
        return info->dev;
}

/**
 * netdev_notifier_info_to_extack - get the extack from a notifier info
 * @info: notifier info
 *
 * Return: @info->extack, which is NULL after netdev_notifier_info_init().
 */
static inline struct netlink_ext_ack *
netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
{
        return info->extack;
}

/*
 * Notifier chain invocation; declarations only. The _info variant takes a
 * caller-prepared struct netdev_notifier_info instead of a bare device.
 * NOTE(review): @val is presumably an enum netdev_cmd value and the int
 * result a notifier return code - confirm in the definitions.
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
int call_netdevice_notifiers_info(unsigned long val,
                                  struct netdev_notifier_info *info);

/*
 * Iterators over the devices of a namespace: they walk the list anchored at
 * (net)->dev_base_head, linked through net_device.dev_list, using the
 * list_for_each_entry*() helper of the same flavour (plain, reverse, rcu,
 * safe, continue, ...).
 *
 * for_each_netdev_in_bond_rcu() walks the namespace obtained from
 * dev_net_rcu(bond) and only runs the loop body for devices whose
 * netdev_master_upper_dev_get_rcu() equals @bond. Its expansion ends in a
 * bare 'if', so an 'else' placed directly after the loop body would bind to
 * that 'if'.
 *
 * net_device_entry() converts a dev_list list_head pointer back to its
 * struct net_device.
 */
#define for_each_netdev(net, d)                \
                list_for_each_entry(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_reverse(net, d)        \
                list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_rcu(net, d)                \
                list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_safe(net, d, n)        \
                list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue(net, d)                \
                list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue_reverse(net, d)                \
                list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \
                                                     dev_list)
#define for_each_netdev_continue_rcu(net, d)                \
        list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_in_bond_rcu(bond, slave)        \
                for_each_netdev_rcu(dev_net_rcu(bond), slave)        \
                        if (netdev_master_upper_dev_get_rcu(slave) == (bond))
#define net_device_entry(lh)        list_entry(lh, struct net_device, dev_list)

/*
 * Walk the devices stored in (net)->dev_by_index using xa_find() with
 * XA_PRESENT, starting at the caller-initialised @ifindex. The loop has no
 * init clause: @ifindex must be an lvalue, is updated by xa_find() and is
 * incremented after every iteration. The loop ends when xa_find() returns
 * NULL.
 */
#define for_each_netdev_dump(net, d, ifindex)                                \
        for (; (d = xa_find(&(net)->dev_by_index, &ifindex,                \
                            ULONG_MAX, XA_PRESENT)); ifindex++)

/**
 * next_net_device - get the device following @dev in its namespace list
 * @dev: current device
 *
 * Return: the device whose dev_list entry follows @dev's, or NULL when the
 * next entry is the dev_base_head of @dev's namespace (end of list).
 */
static inline struct net_device *next_net_device(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct list_head *next = dev->dev_list.next;

        if (next == &net->dev_base_head)
                return NULL;

        return net_device_entry(next);
}

/**
 * next_net_device_rcu - RCU flavour of next_net_device()
 * @dev: current device
 *
 * Reads the next pointer with rcu_dereference(list_next_rcu()).
 *
 * Return: the device whose dev_list entry follows @dev's, or NULL when the
 * next entry is the dev_base_head of @dev's namespace (end of list).
 */
static inline struct net_device *next_net_device_rcu(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct list_head *next = rcu_dereference(list_next_rcu(&dev->dev_list));

        if (next == &net->dev_base_head)
                return NULL;

        return net_device_entry(next);
}

/**
 * first_net_device - get the first device on a namespace's device list
 * @net: network namespace
 *
 * Return: the device for the first entry after @net->dev_base_head, or NULL
 * when that list is empty.
 */
static inline struct net_device *first_net_device(struct net *net)
{
        if (list_empty(&net->dev_base_head))
                return NULL;

        return net_device_entry(net->dev_base_head.next);
}

/*
 * Device lookup, packet-type/offload registration, naming, open/close and
 * transmit helpers. Declarations only.
 * NOTE(review): the reference-counting and locking contracts of the lookup
 * variants (plain, _rcu and double-underscore forms) are not visible in
 * this part of the header - check the definitions before relying on them.
 */
int netdev_boot_setup_check(struct net_device *dev);
struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
                                   const char *hwaddr);
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *hwaddr);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
void dev_add_pack(struct packet_type *pt);
void dev_remove_pack(struct packet_type *pt);
void __dev_remove_pack(struct packet_type *pt);
void dev_add_offload(struct packet_offload *po);
void dev_remove_offload(struct packet_offload *po);

int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
                          struct net_device_path_stack *stack);
struct net_device *dev_get_by_name(struct net *net, const char *name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
struct net_device *__dev_get_by_name(struct net *net, const char *name);
bool netdev_name_in_use(struct net *net, const char *name);
int dev_alloc_name(struct net_device *dev, const char *name);
int netif_open(struct net_device *dev, struct netlink_ext_ack *extack);
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
void netif_close(struct net_device *dev);
void dev_close(struct net_device *dev);
void netif_close_many(struct list_head *head, bool unlink);
void netif_disable_lro(struct net_device *dev);
void dev_disable_lro(struct net_device *dev);
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);

/*
 * Out-of-line transmit entry points. The inline wrappers dev_queue_xmit(),
 * dev_queue_xmit_accel() and dev_direct_xmit() in this header forward to them.
 */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev);
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);

/*
 * dev_queue_xmit - queue @skb for transmission
 * @skb: buffer to transmit
 *
 * Thin wrapper: calls __dev_queue_xmit() with a NULL @sb_dev and returns its
 * result unchanged.
 */
static inline int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *sb_dev = NULL;

        return __dev_queue_xmit(skb, sb_dev);
}

/*
 * dev_queue_xmit_accel - queue @skb for transmission with an explicit @sb_dev
 * @skb: buffer to transmit
 * @sb_dev: device handed through to __dev_queue_xmit()
 *
 * Returns whatever __dev_queue_xmit() returns.
 */
static inline int dev_queue_xmit_accel(struct sk_buff *skb,
                                       struct net_device *sb_dev)
{
        int rc = __dev_queue_xmit(skb, sb_dev);

        return rc;
}

/*
 * dev_direct_xmit - transmit @skb on queue @queue_id via __dev_direct_xmit()
 * @skb: buffer to transmit
 * @queue_id: TX queue index passed through to __dev_direct_xmit()
 *
 * When dev_xmit_complete() reports that the return code is not a completed
 * transmit, the skb is released here with kfree_skb(). The return code of
 * __dev_direct_xmit() is returned in either case.
 */
static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
        int ret = __dev_direct_xmit(skb, queue_id);

        if (dev_xmit_complete(ret))
                return ret;

        kfree_skb(skb);
        return ret;
}

/*
 * Out-of-line (un)registration entry points. unregister_netdevice() below is
 * an inline wrapper around unregister_netdevice_queue() with a NULL @head.
 */
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
/*
 * unregister_netdevice - unregister a single device
 * @dev: device to unregister
 *
 * Calls unregister_netdevice_queue() with a NULL list head.
 */
static inline void unregister_netdevice(struct net_device *dev)
{
        struct list_head *no_list = NULL;

        unregister_netdevice_queue(dev, no_list);
}

/*
 * Out-of-line declarations: refcount read-out and freeing of a net_device,
 * followed by lookup helpers keyed by ifindex, name or flags.
 * The netdev_get_by_*() variants additionally take a netdevice_tracker;
 * NOTE(review): the _rcu suffixed lookups presumably require an RCU read-side
 * critical section - confirm in the implementation.
 */
int netdev_refcnt_read(const struct net_device *dev);
void free_netdev(struct net_device *dev);

struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
                                         bool all_slaves);
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
                                            struct sock *sk);
struct net_device *dev_get_by_index(struct net *net, int ifindex);
struct net_device *__dev_get_by_index(struct net *net, int ifindex);
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
                                       netdevice_tracker *tracker, gfp_t gfp);
struct net_device *netdev_get_by_name(struct net *net, const char *name,
                                      netdevice_tracker *tracker, gfp_t gfp);
struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
                                           unsigned short flags, unsigned short mask);
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
void netdev_copy_name(struct net_device *dev, char *name);

static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                                  unsigned short type,
                                  const void *daddr, const void *saddr,
                                  unsigned int len)
{
        if (!dev->header_ops || !dev->header_ops->create)
                return 0;

        return dev->header_ops->create(skb, dev, type, daddr, saddr, len);
}

static inline int dev_parse_header(const struct sk_buff *skb,
                                   unsigned char *haddr)
{
        const struct net_device *dev = skb->dev;

        if (!dev->header_ops || !dev->header_ops->parse)
                return 0;
        return dev->header_ops->parse(skb, haddr);
}

static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb)
{
        const struct net_device *dev = skb->dev;

        if (!dev->header_ops || !dev->header_ops->parse_protocol)
                return 0;
        return dev->header_ops->parse_protocol(skb);
}

/*
 * dev_validate_header - decide whether a caller-supplied link-layer header
 * of @len bytes is acceptable for @dev
 * @dev: target device
 * @ll_header: header buffer; must have at least hard_header_len allocated
 * @len: number of header bytes actually provided
 *
 * - @len >= dev->hard_header_len: accepted.
 * - @len <  dev->min_header_len: rejected.
 * - otherwise, with CAP_SYS_RAWIO: the tail of @ll_header up to
 *   hard_header_len is zeroed and the header is accepted.
 * - otherwise: ->header_ops->validate decides if present, else rejected.
 */
static inline bool dev_validate_header(const struct net_device *dev,
                                       char *ll_header, int len)
{
        if (likely(len >= dev->hard_header_len))
                return true;
        if (len < dev->min_header_len)
                return false;

        if (!capable(CAP_SYS_RAWIO)) {
                if (dev->header_ops && dev->header_ops->validate)
                        return dev->header_ops->validate(ll_header, len);

                return false;
        }

        /* Privileged caller: pad the short header with zeroes. */
        memset(ll_header + len, 0, dev->hard_header_len - len);
        return true;
}

static inline bool dev_has_header(const struct net_device *dev)
{
        return dev->header_ops && dev->header_ops->create;
}

/*
 * Pair of drop counters, each annotated ____cacheline_aligned_in_smp.
 * numa_drop_add() picks drops0 or drops1 from numa_node_id() % 2 and
 * numa_drop_read() reports the sum of both.
 */
struct numa_drop_counters {
        atomic_t        drops0 ____cacheline_aligned_in_smp;
        atomic_t        drops1 ____cacheline_aligned_in_smp;
};

/*
 * numa_drop_read - total of both drop counters
 * @ndc: counters to read
 *
 * The two atomic_read()s are independent, so the sum is not a single
 * atomic snapshot.
 */
static inline int numa_drop_read(const struct numa_drop_counters *ndc)
{
        int d0 = atomic_read(&ndc->drops0);
        int d1 = atomic_read(&ndc->drops1);

        return d0 + d1;
}

static inline void numa_drop_add(struct numa_drop_counters *ndc, int val)
{
        int n = numa_node_id() % 2;

        if (n)
                atomic_add(val, &ndc->drops1);
        else
                atomic_add(val, &ndc->drops0);
}

/*
 * numa_drop_reset - set both drop counters back to zero
 * @ndc: counters to reset
 *
 * Two separate atomic_set() calls; the pair is not reset as one atomic unit.
 */
static inline void numa_drop_reset(struct numa_drop_counters *ndc)
{
        atomic_set(&ndc->drops0, 0);
        atomic_set(&ndc->drops1, 0);
}

/*
 * Incoming packets are placed on per-CPU queues.
 *
 * One instance of this structure exists per CPU (see the
 * DECLARE_PER_CPU_ALIGNED() right after the definition). Field order and the
 * ____cacheline_aligned_in_smp annotations determine the layout; keep them
 * as they are when editing.
 */
struct softnet_data {
        struct list_head        poll_list;
        struct sk_buff_head        process_queue;
        local_lock_t                process_queue_bh_lock;

        /* stats */
        unsigned int                processed;
        unsigned int                time_squeeze;
#ifdef CONFIG_RPS
        struct softnet_data        *rps_ipi_list;
#endif

        unsigned int                received_rps;
        bool                        in_net_rx_action;
        bool                        in_napi_threaded_poll;

#ifdef CONFIG_NET_FLOW_LIMIT
        struct sd_flow_limit __rcu *flow_limit;
#endif
        struct Qdisc                *output_queue;
        struct Qdisc                **output_queue_tailp;
        struct sk_buff                *completion_queue;
#ifdef CONFIG_XFRM_OFFLOAD
        struct sk_buff_head        xfrm_backlog;
#endif
        /* written and read only by owning cpu: */
        /* xmit.recursion is what dev_recursion_level() reads on !PREEMPT_RT */
        struct netdev_xmit xmit;
#ifdef CONFIG_RPS
        /* input_queue_head should be written by cpu owning this struct,
         * and only read by other cpus. Worth using a cache line.
         */
        unsigned int                input_queue_head ____cacheline_aligned_in_smp;

        /* Elements below can be accessed between CPUs for RPS/RFS */
        call_single_data_t        csd ____cacheline_aligned_in_smp;
        struct softnet_data        *rps_ipi_next;
        unsigned int                cpu;
        unsigned int                input_queue_tail;
#endif
        struct sk_buff_head        input_pkt_queue;
        struct napi_struct        backlog;

        /* updated/read through the numa_drop_*() helpers */
        struct numa_drop_counters drop_counters;

        int                        defer_ipi_scheduled ____cacheline_aligned_in_smp;
        call_single_data_t        defer_csd;
};

/* Per-CPU instance of the structure above. */
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);

/*
 * A page_pool pointer bundled with a local_lock_t; one instance per CPU is
 * declared below as system_page_pool.
 * NOTE(review): bh_lock presumably serialises access to @pool - confirm
 * against the users of system_page_pool.
 */
struct page_pool_bh {
        struct page_pool *pool;
        local_lock_t bh_lock;
};
DECLARE_PER_CPU(struct page_pool_bh, system_page_pool);

/*
 * dev_recursion_level - current transmit recursion counter
 *
 * With CONFIG_PREEMPT_RT the counter is read from current->net_xmit,
 * otherwise from this CPU's softnet_data.xmit.
 */
static inline int dev_recursion_level(void)
{
#ifdef CONFIG_PREEMPT_RT
        return current->net_xmit.recursion;
#else
        return this_cpu_read(softnet_data.xmit.recursion);
#endif
}


/*
 * Out-of-line scheduling entry points. netif_schedule_queue() is used by
 * netif_tx_schedule_all() and netdev_tx_completed_queue() in this header.
 */
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);

/*
 * netif_tx_schedule_all - call netif_schedule_queue() on every TX queue
 * @dev: network device
 *
 * Iterates over all dev->num_tx_queues queues in index order.
 */
static inline void netif_tx_schedule_all(struct net_device *dev)
{
        unsigned int qidx = 0;

        while (qidx < dev->num_tx_queues) {
                netif_schedule_queue(netdev_get_tx_queue(dev, qidx));
                qidx++;
        }
}

/*
 * netif_tx_start_queue - clear the driver XOFF bit of one TX queue
 * @dev_queue: transmit queue
 *
 * Only clears __QUEUE_STATE_DRV_XOFF in @dev_queue->state; nothing else is
 * done here.
 */
static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
        unsigned long *state = &dev_queue->state;

        clear_bit(__QUEUE_STATE_DRV_XOFF, state);
}

/**
 * netif_start_queue - allow transmit
 * @dev: network device
 *
 * Allow upper layers to call the device hard_start_xmit routine.
 * Operates on TX queue 0 of @dev only.
 */
static inline void netif_start_queue(struct net_device *dev)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netif_tx_start_queue(txq0);
}

/*
 * netif_tx_start_all_queues - netif_tx_start_queue() on every TX queue
 * @dev: network device
 *
 * Walks all dev->num_tx_queues queues in index order.
 */
static inline void netif_tx_start_all_queues(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++)
                netif_tx_start_queue(netdev_get_tx_queue(dev, qidx));
}

/*
 * Out-of-line; used by netif_wake_queue(), netif_tx_wake_all_queues() and
 * netif_wake_subqueue() in this header.
 */
void netif_tx_wake_queue(struct netdev_queue *dev_queue);

/**
 * netif_wake_queue - restart transmit
 * @dev: network device
 *
 * Allow upper layers to call the device hard_start_xmit routine.
 * Used for flow control when transmit resources are available.
 * Operates on TX queue 0 of @dev only.
 */
static inline void netif_wake_queue(struct net_device *dev)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netif_tx_wake_queue(txq0);
}

/*
 * netif_tx_wake_all_queues - netif_tx_wake_queue() on every TX queue
 * @dev: network device
 *
 * Walks all dev->num_tx_queues queues in index order.
 */
static inline void netif_tx_wake_all_queues(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++)
                netif_tx_wake_queue(netdev_get_tx_queue(dev, qidx));
}

/*
 * netif_tx_stop_queue - mark one TX queue as stopped by the driver
 * @dev_queue: transmit queue to stop
 *
 * Stores the current jiffies in ->trans_start and then sets
 * __QUEUE_STATE_DRV_XOFF in ->state. The order of the two stores and the
 * barrier between them are deliberate (see the inline comments); do not
 * reorder.
 */
static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
        /* Paired with READ_ONCE() from dev_watchdog() */
        WRITE_ONCE(dev_queue->trans_start, jiffies);

        /* This barrier is paired with smp_mb() from dev_watchdog() */
        smp_mb__before_atomic();

        /* Must be an atomic op see netif_txq_try_stop() */
        set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 * netif_stop_queue - stop transmitted packets
 * @dev: network device
 *
 * Stop upper layers calling the device hard_start_xmit routine.
 * Used for flow control when transmit resources are unavailable.
 * Operates on TX queue 0 of @dev only.
 */
static inline void netif_stop_queue(struct net_device *dev)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netif_tx_stop_queue(txq0);
}

/*
 * Out-of-line, unlike netif_tx_start_all_queues() and
 * netif_tx_wake_all_queues(), which are inline loops in this header.
 */
void netif_tx_stop_all_queues(struct net_device *dev);

/*
 * netif_tx_queue_stopped - test the driver XOFF bit of one TX queue
 * @dev_queue: transmit queue
 *
 * Returns true when __QUEUE_STATE_DRV_XOFF is set in @dev_queue->state.
 */
static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
{
        const unsigned long *state = &dev_queue->state;

        return test_bit(__QUEUE_STATE_DRV_XOFF, state);
}

/**
 * netif_queue_stopped - test if transmit queue is flowblocked
 * @dev: network device
 *
 * Test if transmit queue on device is currently unable to send.
 * Only TX queue 0 of @dev is examined.
 */
static inline bool netif_queue_stopped(const struct net_device *dev)
{
        const struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        return netif_tx_queue_stopped(txq0);
}

/*
 * netif_xmit_stopped - true when any bit of QUEUE_STATE_ANY_XOFF is set
 * @dev_queue: transmit queue
 *
 * Plain (non-atomic) read of @dev_queue->state.
 */
static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_ANY_XOFF) != 0;
}

/*
 * netif_xmit_frozen_or_stopped - true when any bit of
 * QUEUE_STATE_ANY_XOFF_OR_FROZEN is set
 * @dev_queue: transmit queue
 *
 * Plain (non-atomic) read of @dev_queue->state.
 */
static inline bool
netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN) != 0;
}

/*
 * netif_xmit_frozen_or_drv_stopped - true when any bit of
 * QUEUE_STATE_DRV_XOFF_OR_FROZEN is set
 * @dev_queue: transmit queue
 *
 * Plain (non-atomic) read of @dev_queue->state.
 */
static inline bool
netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN) != 0;
}

/**
 * netdev_queue_set_dql_min_limit - set dql minimum limit
 * @dev_queue: pointer to transmit queue
 * @min_limit: dql minimum limit
 *
 * Forces xmit_more() to return true until the minimum threshold
 * defined by @min_limit is reached (or until the tx queue is
 * empty). Warning: to be used with care, misuse will impact the
 * latency.
 *
 * Does nothing when CONFIG_BQL is not enabled.
 */
static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue,
                                                  unsigned int min_limit)
{
#ifdef CONFIG_BQL
        struct dql *dql = &dev_queue->dql;

        dql->min_limit = min_limit;
#endif
}

/*
 * netdev_queue_dql_avail - dql_avail() of a TX queue
 * @txq: transmit queue
 *
 * Returns dql_avail(&txq->dql) with CONFIG_BQL, and 0 without it.
 * Non-BQL migrated drivers will return 0, too.
 */
static inline int netdev_queue_dql_avail(const struct netdev_queue *txq)
{
        int avail = 0;

#ifdef CONFIG_BQL
        avail = dql_avail(&txq->dql);
#endif
        return avail;
}

/**
 * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
 * @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their ndo_start_xmit(),
 * to give appropriate hint to the CPU. Prefetches dql.num_queued;
 * does nothing without CONFIG_BQL.
 */
static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        struct dql *dql = &dev_queue->dql;

        prefetchw(&dql->num_queued);
#endif
}

/**
 * netdev_txq_bql_complete_prefetchw - prefetch bql data for write
 * @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their TX completion path,
 * to give appropriate hint to the CPU. Prefetches dql.limit;
 * does nothing without CONFIG_BQL.
 */
static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        struct dql *dql = &dev_queue->dql;

        prefetchw(&dql->limit);
#endif
}

/**
 * netdev_tx_sent_queue - report the number of bytes queued to a given tx queue
 * @dev_queue: network device queue
 * @bytes: number of bytes queued to the device queue
 *
 * Report the number of bytes queued for sending/completion to the network
 * device hardware queue. @bytes should be a good approximation and should
 * exactly match netdev_completed_queue() @bytes.
 * This is typically called once per packet, from ndo_start_xmit().
 *
 * Compiles to an empty function without CONFIG_BQL. With BQL, the bytes are
 * accounted via dql_queued(); if dql_avail() then goes negative the queue is
 * flagged __QUEUE_STATE_STACK_XOFF, and dql_avail() is re-checked afterwards
 * so that a concurrent completion cannot leave the queue stopped.
 */
static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                        unsigned int bytes)
{
#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);

        /* Fast path: still room available, nothing more to do. */
        if (likely(dql_avail(&dev_queue->dql) >= 0))
                return;

        /* Paired with READ_ONCE() from dev_watchdog() */
        WRITE_ONCE(dev_queue->trans_start, jiffies);

        /* This barrier is paired with smp_mb() from dev_watchdog() */
        smp_mb__before_atomic();

        set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);

        /*
         * The XOFF flag must be set before checking the dql_avail below,
         * because in netdev_tx_completed_queue we update the dql_completed
         * before checking the XOFF flag.
         */
        smp_mb__after_atomic();

        /* check again in case another CPU has just made room avail */
        if (unlikely(dql_avail(&dev_queue->dql) >= 0))
                clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);
#endif
}

/*
 * __netdev_tx_sent_queue - batching-aware variant of netdev_tx_sent_queue()
 * @dev_queue: network device queue
 * @bytes: number of bytes queued to the device queue
 * @xmit_more: true when more skbs of the same batch will follow
 *
 * For drivers that are aware that they should not test BQL status
 * themselves. __QUEUE_STATE_STACK_XOFF is only changed for the last skb of
 * a batch: while @xmit_more is set the bytes are merely accounted (with
 * CONFIG_BQL) and the driver-stopped state of the queue is returned.
 *
 * Returns true if the doorbell must be used to kick the NIC.
 */
static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                          unsigned int bytes,
                                          bool xmit_more)
{
        if (!xmit_more) {
                /* Last skb of the batch: full BQL accounting, always kick. */
                netdev_tx_sent_queue(dev_queue, bytes);
                return true;
        }

#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);
#endif
        return netif_tx_queue_stopped(dev_queue);
}

/**
 * netdev_sent_queue - report the number of bytes queued to hardware
 * @dev: network device
 * @bytes: number of bytes queued to the hardware device queue
 *
 * Report the number of bytes queued for sending/completion to the network
 * device hardware queue#0. @bytes should be a good approximation and should
 * exactly match netdev_completed_queue() @bytes.
 * This is typically called once per packet, from ndo_start_xmit().
 */
static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netdev_tx_sent_queue(txq0, bytes);
}

/*
 * __netdev_sent_queue - __netdev_tx_sent_queue() applied to TX queue 0
 * @dev: network device
 * @bytes: number of bytes queued to the hardware device queue
 * @xmit_more: forwarded to __netdev_tx_sent_queue()
 *
 * Returns the result of __netdev_tx_sent_queue().
 */
static inline bool __netdev_sent_queue(struct net_device *dev,
                                       unsigned int bytes,
                                       bool xmit_more)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        return __netdev_tx_sent_queue(txq0, bytes, xmit_more);
}

/**
 * netdev_tx_completed_queue - report number of packets/bytes at TX completion.
 * @dev_queue: network device queue
 * @pkts: number of packets (currently ignored)
 * @bytes: number of bytes dequeued from the device queue
 *
 * Must be called at most once per TX completion round (and not per
 * individual packet), so that BQL can adjust its limits appropriately.
 *
 * Compiles to an empty function without CONFIG_BQL, and returns early when
 * @bytes is 0. Otherwise the completion is accounted via dql_completed();
 * if room is available afterwards and __QUEUE_STATE_STACK_XOFF was set, the
 * bit is cleared and the queue is rescheduled.
 */
static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
                                             unsigned int pkts, unsigned int bytes)
{
#ifdef CONFIG_BQL
        if (unlikely(!bytes))
                return;

        dql_completed(&dev_queue->dql, bytes);

        /*
         * Without the memory barrier there is a small possibility that
         * netdev_tx_sent_queue will miss the update and cause the queue to
         * be stopped forever
         */
        smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */

        /* Still over the limit: leave the queue stopped. */
        if (unlikely(dql_avail(&dev_queue->dql) < 0))
                return;

        /* Only reschedule if this call is the one that cleared XOFF. */
        if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state))
                netif_schedule_queue(dev_queue);
#endif
}

/**
 * netdev_completed_queue - report bytes and packets completed by device
 * @dev: network device
 * @pkts: actual number of packets sent over the medium
 * @bytes: actual number of bytes sent over the medium
 *
 * Report the number of bytes and packets transmitted by the network device
 * hardware queue over the physical medium, @bytes must exactly match the
 * @bytes amount passed to netdev_sent_queue(). Operates on TX queue 0.
 */
static inline void netdev_completed_queue(struct net_device *dev,
                                          unsigned int pkts, unsigned int bytes)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netdev_tx_completed_queue(txq0, pkts, bytes);
}

/*
 * netdev_tx_reset_queue - reset the BQL state of one TX queue
 * @q: transmit queue
 *
 * Clears __QUEUE_STATE_STACK_XOFF and then calls dql_reset() on the queue's
 * dql. Compiles to an empty function without CONFIG_BQL.
 */
static inline void netdev_tx_reset_queue(struct netdev_queue *q)
{
#ifdef CONFIG_BQL
        clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state);
        dql_reset(&q->dql);
#endif
}

/**
 * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue
 * @dev: network device
 * @qid: stack index of the queue to reset
 *
 * Looks up queue @qid of @dev and hands it to netdev_tx_reset_queue().
 */
static inline void netdev_tx_reset_subqueue(const struct net_device *dev,
                                            u32 qid)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, qid);

        netdev_tx_reset_queue(txq);
}

/**
 * netdev_reset_queue - reset the packets and bytes count of a network device
 * @dev_queue: network device (a struct net_device, despite the parameter name)
 *
 * Reset the bytes and packet count of a network device and clear the
 * software flow control OFF bit for this network device.
 * Only TX queue 0 is reset.
 */
static inline void netdev_reset_queue(struct net_device *dev_queue)
{
        netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0));
}

/**
 * netdev_cap_txqueue - check if selected tx queue exceeds device queues
 * @dev: network device
 * @queue_index: given tx queue index
 *
 * Returns 0 if given tx queue index >= number of device tx queues
 * (dev->real_num_tx_queues), after emitting a rate-limited warning;
 * otherwise returns the originally passed tx queue index.
 */
static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
        u16 capped = queue_index;

        if (unlikely(queue_index >= dev->real_num_tx_queues)) {
                net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
                                     dev->name, queue_index,
                                     dev->real_num_tx_queues);
                capped = 0;
        }

        return capped;
}

/**
 * netif_running - test if up
 * @dev: network device
 *
 * Test if the device has been brought up, i.e. whether __LINK_STATE_START
 * is set in @dev->state.
 */
static inline bool netif_running(const struct net_device *dev)
{
        const unsigned long *state = &dev->state;

        return test_bit(__LINK_STATE_START, state);
}

/*
 * Routines to manage the subqueues on a device.  We only need start,
 * stop, and a check if it's stopped.  All other device management is
 * done at the overall netdevice level.
 * Also test the device if we're multiqueue.
 */

/**
 * netif_start_subqueue - allow sending packets on subqueue
 * @dev: network device
 * @queue_index: sub queue index
 *
 * Start individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_start_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 * netif_stop_subqueue - stop sending packets on subqueue
 * @dev: network device
 * @queue_index: sub queue index
 *
 * Stop individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_stop_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 * __netif_subqueue_stopped - test status of subqueue
 * @dev: network device
 * @queue_index: sub queue index
 *
 * Check individual transmit queue of a device with multiple transmit queues.
 * Returns the netif_tx_queue_stopped() state of queue @queue_index.
 */
static inline bool __netif_subqueue_stopped(const struct net_device *dev,
                                            u16 queue_index)
{
        return netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        netif_subqueue_stopped - test status of subqueue
 *        @dev: network device
 *        @skb: sub queue buffer pointer
 *
 * Check individual transmit queue of a device with multiple transmit queues.
 */
static inline bool netif_subqueue_stopped(const struct net_device *dev,
                                          struct sk_buff *skb)
{
        return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
}

/**
 * netif_wake_subqueue - allow sending packets on subqueue
 * @dev: network device
 * @queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_wake_queue(netdev_get_tx_queue(dev, queue_index));
}

#ifdef CONFIG_XPS
/*
 * Out-of-line XPS setters, available with CONFIG_XPS. Without CONFIG_XPS the
 * #else branch further down provides inline stubs that return 0.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index);
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
                          u16 index, enum xps_map_type type);

/**
 * netif_attr_test_mask - Test a CPU or Rx queue set in a mask
 * @j: CPU/Rx queue index
 * @mask: bitmask of all cpus/rx queues
 * @nr_bits: number of bits in the bitmask
 *
 * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
 * @j is first passed to cpu_max_bits_warn() together with @nr_bits.
 */
static inline bool netif_attr_test_mask(unsigned long j,
                                        const unsigned long *mask,
                                        unsigned int nr_bits)
{
        bool is_set;

        cpu_max_bits_warn(j, nr_bits);
        is_set = test_bit(j, mask);

        return is_set;
}

/**
 * netif_attr_test_online - Test for online CPU/Rx queue
 * @j: CPU/Rx queue index
 * @online_mask: bitmask for CPUs/Rx queues that are online
 * @nr_bits: number of bits in the bitmask
 *
 * With a NULL @online_mask every index below @nr_bits counts as online.
 *
 * Returns: true if a CPU/Rx queue is online.
 */
static inline bool netif_attr_test_online(unsigned long j,
                                          const unsigned long *online_mask,
                                          unsigned int nr_bits)
{
        cpu_max_bits_warn(j, nr_bits);

        if (!online_mask)
                return j < nr_bits;

        return test_bit(j, online_mask);
}

/**
 * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
 * @n: CPU/Rx queue index
 * @srcp: the cpumask/Rx queue mask pointer
 * @nr_bits: number of bits in the bitmask
 *
 * With a NULL @srcp the result is simply @n + 1.
 *
 * Returns: next (after n) CPU/Rx queue index in the mask;
 * >= nr_bits if no further CPUs/Rx queues set.
 */
static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
                                               unsigned int nr_bits)
{
        int start;

        /* -1 is a legal arg here. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        start = n + 1;
        if (!srcp)
                return start;

        return find_next_bit(srcp, nr_bits, start);
}

/**
 * netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p
 * @n: CPU/Rx queue index
 * @src1p: the first CPUs/Rx queues mask pointer
 * @src2p: the second CPUs/Rx queues mask pointer
 * @nr_bits: number of bits in the bitmask
 *
 * A NULL mask is ignored: with one mask only that mask is searched, with
 * none the result is simply @n + 1.
 *
 * Returns: next (after n) CPU/Rx queue index set in both masks;
 * >= nr_bits if no further CPUs/Rx queues set in both.
 */
static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
                                          const unsigned long *src2p,
                                          unsigned int nr_bits)
{
        const unsigned long *single;
        int start;

        /* -1 is a legal arg here. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        start = n + 1;
        if (src1p && src2p)
                return find_next_and_bit(src1p, src2p, nr_bits, start);

        /* At most one mask is present from here on. */
        single = src1p ? src1p : src2p;
        if (single)
                return find_next_bit(single, nr_bits, start);

        return start;
}
#else
/* !CONFIG_XPS stub: ignores its arguments and reports success (0). */
static inline int netif_set_xps_queue(struct net_device *dev,
                                      const struct cpumask *mask,
                                      u16 index)
{
        return 0;
}

/* !CONFIG_XPS stub: ignores its arguments and reports success (0). */
static inline int __netif_set_xps_queue(struct net_device *dev,
                                        const unsigned long *mask,
                                        u16 index, enum xps_map_type type)
{
        return 0;
}
#endif

/**
 * netif_is_multiqueue - test if device has multiple transmit queues
 * @dev: network device
 *
 * Check if device has multiple transmit queues, i.e. dev->num_tx_queues is
 * at least 2.
 */
static inline bool netif_is_multiqueue(const struct net_device *dev)
{
        return dev->num_tx_queues >= 2;
}

/*
 * Out-of-line declarations: setters for the real number of TX/RX queues and
 * the default RSS queue count.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
int netif_set_real_num_queues(struct net_device *dev,
                              unsigned int txq, unsigned int rxq);

int netif_get_num_default_rss_queues(void);

/*
 * Reason-carrying free routines; the dev_kfree_skb_*() / dev_consume_skb_*()
 * inline helpers below are thin wrappers around these two.
 */
void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason);
void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason);

/*
 * It is not allowed to call kfree_skb() or consume_skb() from hardware
 * interrupt context or with hardware interrupts being disabled.
 * (in_hardirq() || irqs_disabled())
 *
 * We provide four helpers that can be used in the following contexts:
 *
 * dev_kfree_skb_irq(skb) when caller drops a packet from irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_irq(skb) when caller consumes a packet from irq context.
 *  Typically used in place of consume_skb(skb) in TX completion path
 *
 * dev_kfree_skb_any(skb) when caller doesn't know its current irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_any(skb) when caller doesn't know its current irq context,
 *  and consumed a packet. Used in place of consume_skb(skb)
 */
static inline void dev_kfree_skb_irq(struct sk_buff *skb)
{
        dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

static inline void dev_consume_skb_irq(struct sk_buff *skb)
{
        dev_kfree_skb_irq_reason(skb, SKB_CONSUMED);
}

static inline void dev_kfree_skb_any(struct sk_buff *skb)
{
        dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

static inline void dev_consume_skb_any(struct sk_buff *skb)
{
        dev_kfree_skb_any_reason(skb, SKB_CONSUMED);
}

/*
 * Out-of-line declarations: generic XDP entry points, followed by the
 * receive-side entry points (netif_rx*, netif_receive_skb*) and
 * gro_receive_skb(), which napi_gro_receive() below wraps.
 */
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                             const struct bpf_prog *xdp_prog);
void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog);
int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb);
int netif_rx(struct sk_buff *skb);
int __netif_rx(struct sk_buff *skb);

int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
void netif_receive_skb_list_internal(struct list_head *head);
void netif_receive_skb_list(struct list_head *head);
gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb);

/*
 * napi_gro_receive - feed @skb to the GRO node embedded in @napi
 * @napi: NAPI instance whose ->gro is used
 * @skb: received buffer
 *
 * Returns the result of gro_receive_skb().
 */
static inline gro_result_t napi_gro_receive(struct napi_struct *napi,
                                            struct sk_buff *skb)
{
        struct gro_node *gro = &napi->gro;

        return gro_receive_skb(gro, skb);
}

/* Out-of-line frag-based GRO helpers; napi_free_frags() below is inline. */
struct sk_buff *napi_get_frags(struct napi_struct *napi);
gro_result_t napi_gro_frags(struct napi_struct *napi);

/*
 * napi_free_frags - release the skb cached in @napi->skb
 * @napi: NAPI instance
 *
 * Frees napi->skb with kfree_skb() and then resets the pointer to NULL.
 */
static inline void napi_free_frags(struct napi_struct *napi)
{
        struct sk_buff *cached = napi->skb;

        kfree_skb(cached);
        napi->skb = NULL;
}

/* Out-of-line rx_handler management for a device. */
bool netdev_is_rx_handler_busy(struct net_device *dev);
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data);
void netdev_rx_handler_unregister(struct net_device *dev);

/* Out-of-line check of a device name string. */
bool dev_valid_name(const char *name);
static inline bool is_socket_ioctl_cmd(unsigned int cmd)
{
        return _IOC_TYPE(cmd) == SOCK_IOC_TYPE;
}
/*
 * Out-of-line declarations: ifreq copy helpers and ioctl entry points,
 * followed by setters/getters for flags, alias, namespace, MTU, MAC address
 * and port parent id.
 * NOTE(review): several operations are declared as a netif_*() / dev_*()
 * pair with the same signature (change_flags, set_alias, set_mtu,
 * set_mac_address); the pairs presumably differ in locking expectations -
 * confirm in the implementation.
 */
int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg);
int put_user_ifreq(struct ifreq *ifr, void __user *arg);
int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
                void __user *data, bool *need_copyout);
int dev_ifconf(struct net *net, struct ifconf __user *ifc);
int dev_eth_ioctl(struct net_device *dev,
                  struct ifreq *ifr, unsigned int cmd);
int generic_hwtstamp_get_lower(struct net_device *dev,
                               struct kernel_hwtstamp_config *kernel_cfg);
int generic_hwtstamp_set_lower(struct net_device *dev,
                               struct kernel_hwtstamp_config *kernel_cfg,
                               struct netlink_ext_ack *extack);
int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
unsigned int netif_get_flags(const struct net_device *dev);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack);
int netif_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack);
int dev_change_flags(struct net_device *dev, unsigned int flags,
                     struct netlink_ext_ack *extack);
int netif_set_alias(struct net_device *dev, const char *alias, size_t len);
int dev_set_alias(struct net_device *, const char *, size_t);
int dev_get_alias(const struct net_device *, char *, size_t);
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
                               const char *pat, int new_ifindex,
                               struct netlink_ext_ack *extack);
int dev_change_net_namespace(struct net_device *dev, struct net *net,
                             const char *pat);
int __netif_set_mtu(struct net_device *dev, int new_mtu);
int netif_set_mtu(struct net_device *dev, int new_mtu);
int dev_set_mtu(struct net_device *, int);
int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
                                struct netlink_ext_ack *extack);
int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
                          struct netlink_ext_ack *extack);
int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
                        struct netlink_ext_ack *extack);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss,
                             struct netlink_ext_ack *extack);
int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
int netif_get_port_parent_id(struct net_device *dev,
                             struct netdev_phys_item_id *ppid, bool recurse);
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);

/*
 * Out-of-line declarations: transmit validation / hard start, XDP program
 * attachment and queries, and skb forwarding between devices.
 * is_skb_forwardable() is the out-of-line counterpart of the inline
 * __is_skb_forwardable() defined below.
 */
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret);

int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
u8 dev_xdp_prog_count(struct net_device *dev);
int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf);
int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf);
u8 dev_xdp_sb_prog_count(struct net_device *dev);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);

u32 dev_get_min_mp_channel_count(const struct net_device *dev);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
                        const struct sk_buff *skb);

/*
 * __is_skb_forwardable - can @skb be forwarded to @dev?
 * @dev: candidate output device
 * @skb: packet to check
 * @check_mtu: when false, only the IFF_UP test is applied
 *
 * Return: false if @dev is not IFF_UP. Otherwise true when @check_mtu is
 * false, when @skb->len fits in mtu + hard_header_len + one VLAN header,
 * or when @skb is GSO.
 */
static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
                                                 const struct sk_buff *skb,
                                                 const bool check_mtu)
{
        const u32 vlan_tag_bytes = 4; /* VLAN_HLEN */
        unsigned int max_len;

        if (!(dev->flags & IFF_UP))
                return false;
        if (!check_mtu)
                return true;

        max_len = dev->mtu + dev->hard_header_len + vlan_tag_bytes;

        /* With TSO enabled the length does not matter: the packet could be
         * forwarded without having been segmented first.
         */
        return skb->len <= max_len || skb_is_gso(skb);
}

/* Increment the core-stats counter located @offset bytes into
 * struct net_device_core_stats (see the offsetof() use below).
 */
void netdev_core_stats_inc(struct net_device *dev, u32 offset);

/*
 * DEV_CORE_STATS_INC(FIELD) generates
 *   static inline void dev_core_stats_<FIELD>_inc(struct net_device *dev)
 * which calls netdev_core_stats_inc() with the offset of FIELD inside
 * struct net_device_core_stats. The generator is #undef'd once the four
 * wrappers below have been instantiated.
 */
#define DEV_CORE_STATS_INC(FIELD)                                                \
static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)                \
{                                                                                \
        netdev_core_stats_inc(dev,                                                \
                        offsetof(struct net_device_core_stats, FIELD));                \
}
DEV_CORE_STATS_INC(rx_dropped)
DEV_CORE_STATS_INC(tx_dropped)
DEV_CORE_STATS_INC(rx_nohandler)
DEV_CORE_STATS_INC(rx_otherhost_dropped)
#undef DEV_CORE_STATS_INC

/*
 * ____dev_forward_skb - prepare @skb for forwarding to @dev
 * @dev: destination device
 * @skb: packet being forwarded
 * @check_mtu: passed through to __is_skb_forwardable()
 *
 * Return: 0 after scrubbing the packet and clearing its priority, or
 * NET_RX_DROP after counting an rx drop on @dev and freeing @skb when
 * skb_orphan_frags() fails or the packet is not forwardable.
 */
static __always_inline int ____dev_forward_skb(struct net_device *dev,
                                               struct sk_buff *skb,
                                               const bool check_mtu)
{
        bool crosses_netns;

        if (skb_orphan_frags(skb, GFP_ATOMIC))
                goto drop;
        if (unlikely(!__is_skb_forwardable(dev, skb, check_mtu)))
                goto drop;

        /* scrub more aggressively when source and destination netns differ */
        crosses_netns = !net_eq(dev_net(dev), dev_net(skb->dev));
        skb_scrub_packet(skb, crosses_netns);
        skb->priority = 0;
        return 0;

drop:
        dev_core_stats_rx_dropped_inc(dev);
        kfree_skb(skb);
        return NET_RX_DROP;
}

bool dev_nit_active_rcu(const struct net_device *dev);

/*
 * dev_nit_active - dev_nit_active_rcu() for callers outside an RCU section
 * @dev: network device
 *
 * Return: the value of dev_nit_active_rcu(@dev), evaluated between
 * rcu_read_lock() and rcu_read_unlock().
 */
static inline bool dev_nit_active(const struct net_device *dev)
{
        bool active;

        rcu_read_lock();
        active = dev_nit_active_rcu(dev);
        rcu_read_unlock();

        return active;
}

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);

/*
 * __dev_put - drop one reference on @dev without touching the tracker
 * @dev: network device, may be NULL (then nothing happens)
 *
 * Decrements the per-CPU counter when CONFIG_PCPU_DEV_REFCNT is defined,
 * the refcount_t dev_refcnt otherwise.
 */
static inline void __dev_put(struct net_device *dev)
{
        if (!dev)
                return;

#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_dec(*dev->pcpu_refcnt);
#else
        refcount_dec(&dev->dev_refcnt);
#endif
}

/*
 * __dev_hold - take one reference on @dev without touching the tracker
 * @dev: network device, may be NULL (then nothing happens)
 *
 * Increments the per-CPU counter when CONFIG_PCPU_DEV_REFCNT is defined,
 * the refcount_t dev_refcnt otherwise.
 */
static inline void __dev_hold(struct net_device *dev)
{
        if (!dev)
                return;

#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_inc(*dev->pcpu_refcnt);
#else
        refcount_inc(&dev->dev_refcnt);
#endif
}

/*
 * Register @tracker in @dev's refcnt_tracker using allocation flags @gfp.
 * The body is empty unless CONFIG_NET_DEV_REFCNT_TRACKER is defined.
 * No device reference is taken here; @dev is dereferenced unconditionally
 * in the tracker build, so it must not be NULL.
 */
static inline void __netdev_tracker_alloc(struct net_device *dev,
                                          netdevice_tracker *tracker,
                                          gfp_t gfp)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp);
#endif
}

/* netdev_tracker_alloc() can upgrade a prior untracked reference
 * taken by dev_get_by_name()/dev_get_by_index() to a tracked one.
 *
 * The body is empty unless CONFIG_NET_DEV_REFCNT_TRACKER is defined.
 * No device reference is taken here.
 */
static inline void netdev_tracker_alloc(struct net_device *dev,
                                        netdevice_tracker *tracker, gfp_t gfp)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        /* NOTE(review): no_tracker presumably counts references held
         * without a tracker, hence the decrement before registering
         * @tracker - confirm against the ref_tracker implementation.
         */
        refcount_dec(&dev->refcnt_tracker.no_tracker);
        __netdev_tracker_alloc(dev, tracker, gfp);
#endif
}

/*
 * Release @tracker from @dev's refcnt_tracker.
 * The body is empty unless CONFIG_NET_DEV_REFCNT_TRACKER is defined.
 * The device reference itself is not dropped here.
 */
static inline void netdev_tracker_free(struct net_device *dev,
                                       netdevice_tracker *tracker)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        ref_tracker_free(&dev->refcnt_tracker, tracker);
#endif
}

/*
 * netdev_hold - take a tracked reference on @dev
 * @dev: network device, may be NULL (then nothing happens)
 * @tracker: tracker handed to __netdev_tracker_alloc()
 * @gfp: allocation flags handed to __netdev_tracker_alloc()
 *
 * The reference is taken first, then the tracker is registered.
 */
static inline void netdev_hold(struct net_device *dev,
                               netdevice_tracker *tracker, gfp_t gfp)
{
        if (!dev)
                return;

        __dev_hold(dev);
        __netdev_tracker_alloc(dev, tracker, gfp);
}

/*
 * netdev_put - release a tracked reference on @dev
 * @dev: network device, may be NULL (then nothing happens)
 * @tracker: tracker handed to netdev_tracker_free()
 *
 * The tracker is released first, then the reference is dropped.
 */
static inline void netdev_put(struct net_device *dev,
                              netdevice_tracker *tracker)
{
        if (!dev)
                return;

        netdev_tracker_free(dev, tracker);
        __dev_put(dev);
}

/**
 *        dev_hold - get reference to device
 *        @dev: network device
 *
 * Hold reference to device to keep it from being freed.
 * Try using netdev_hold() instead.
 *
 * Implemented as netdev_hold() with a NULL tracker and GFP_ATOMIC.
 */
static inline void dev_hold(struct net_device *dev)
{
        netdev_hold(dev, NULL, GFP_ATOMIC);
}

/**
 *        dev_put - release reference to device
 *        @dev: network device
 *
 * Release reference to device to allow it to be freed.
 * Try using netdev_put() instead.
 *
 * Implemented as netdev_put() with a NULL tracker.
 */
static inline void dev_put(struct net_device *dev)
{
        netdev_put(dev, NULL);
}

/* Cleanup definition named dev_put for struct net_device pointers: the
 * supplied expression calls dev_put() on a non-NULL pointer (_T).
 * NOTE(review): presumably consumed through __free(dev_put) - confirm
 * against the DEFINE_FREE() definition.
 */
DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T))

/*
 * netdev_ref_replace - move a tracked reference from @odev to @ndev
 * @odev: device currently referenced through @tracker, may be NULL
 * @ndev: device to reference through @tracker instead, may be NULL
 * @tracker: tracker released for @odev and registered for @ndev
 * @gfp: allocation flags for registering the tracker on @ndev
 */
static inline void netdev_ref_replace(struct net_device *odev,
                                      struct net_device *ndev,
                                      netdevice_tracker *tracker,
                                      gfp_t gfp)
{
        /* netdev_tracker_free() dereferences the device, so guard NULL */
        if (odev)
                netdev_tracker_free(odev, tracker);

        /* Both helpers ignore NULL. The new reference is taken before the
         * old one is dropped.
         */
        __dev_hold(ndev);
        __dev_put(odev);

        /* __netdev_tracker_alloc() dereferences the device, so guard NULL */
        if (ndev)
                __netdev_tracker_alloc(ndev, tracker, gfp);
}

/* Carrier loss detection, dial on demand. The functions netif_carrier_on()
 * and netif_carrier_off() may be called from IRQ context, but the caller
 * is responsible for serializing these calls.
 *
 * The name "carrier" is inappropriate: these functions should really be
 * called netif_lowerlayer_*() because they represent the state of any
 * kind of lower layer, not just hardware media.
 */
void linkwatch_fire_event(struct net_device *dev);

/**
 * linkwatch_sync_dev - sync linkwatch for the given device
 * @dev: network device to sync linkwatch for
 *
 * Sync linkwatch for the given device, removing it from the
 * pending work list (if queued).
 */
void linkwatch_sync_dev(struct net_device *dev);
/* NOTE(review): presumably the variant of linkwatch_sync_dev() with
 * different locking expectations - confirm against the definition.
 */
void __linkwatch_sync_dev(struct net_device *dev);

/**
 *        netif_carrier_ok - test if carrier present
 *        @dev: network device
 *
 * Check if carrier is present on device
 *
 * Return: true when __LINK_STATE_NOCARRIER is clear in @dev->state,
 * false when it is set.
 */
static inline bool netif_carrier_ok(const struct net_device *dev)
{
        return !test_bit(__LINK_STATE_NOCARRIER, &dev->state);
}

/* Declarations only; bodies are not in this section. */
unsigned long dev_trans_start(struct net_device *dev);

void netdev_watchdog_up(struct net_device *dev);

/* Carrier state changes; see the serialization note on
 * linkwatch_fire_event() above.
 */
void netif_carrier_on(struct net_device *dev);
void netif_carrier_off(struct net_device *dev);
void netif_carrier_event(struct net_device *dev);

/**
 * netif_dormant_on - mark device as dormant.
 * @dev: network device
 *
 * Mark device as dormant (as per RFC2863).
 *
 * A dormant interface is not actually in a condition to pass packets
 * (i.e., it is not 'up') but is in a "pending" state, waiting for some
 * external event. For "on-demand" interfaces, this state identifies the
 * situation where the interface is waiting for events to place it in the
 * up state.
 *
 * A linkwatch event is fired only when the bit was not already set.
 */
static inline void netif_dormant_on(struct net_device *dev)
{
        if (test_and_set_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 * netif_dormant_off - set device as not dormant.
 * @dev: network device
 *
 * Clear the dormant state. A linkwatch event is fired only when the bit
 * was previously set.
 */
static inline void netif_dormant_off(struct net_device *dev)
{
        if (!test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant - test if device is dormant
 *        @dev: network device
 *
 * Check if device is dormant.
 *
 * Return: true when __LINK_STATE_DORMANT is set in @dev->state.
 */
static inline bool netif_dormant(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_DORMANT, &dev->state);
}


/**
 * netif_testing_on - mark device as under test.
 * @dev: network device
 *
 * Mark device as under test (as per RFC2863).
 *
 * The testing state indicates that some test(s) must be performed on
 * the interface. After completion of the test, the interface state
 * will change to up, dormant, or down, as appropriate.
 *
 * A linkwatch event is fired only when the bit was not already set.
 */
static inline void netif_testing_on(struct net_device *dev)
{
        if (test_and_set_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 * netif_testing_off - set device as not under test.
 * @dev: network device
 *
 * Clear the testing state. A linkwatch event is fired only when the bit
 * was previously set.
 */
static inline void netif_testing_off(struct net_device *dev)
{
        if (!test_and_clear_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing - test if device is under test
 *        @dev: network device
 *
 * Check if device is under test
 *
 * Return: true when __LINK_STATE_TESTING is set in @dev->state.
 */
static inline bool netif_testing(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_TESTING, &dev->state);
}


/**
 *        netif_oper_up - test if device is operational
 *        @dev: network device
 *
 * Check if carrier is operational
 */
static inline bool netif_oper_up(const struct net_device *dev)
{
        unsigned int operstate = READ_ONCE(dev->operstate);

        return        operstate == IF_OPER_UP ||
                operstate == IF_OPER_UNKNOWN /* backward compat */;
}

/**
 *        netif_device_present - is device available or removed
 *        @dev: network device
 *
 * Check if device has not been removed from system.
 *
 * Return: true when __LINK_STATE_PRESENT is set in @dev->state.
 */
static inline bool netif_device_present(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_PRESENT, &dev->state);
}

/* NOTE(review): presumably these clear/set __LINK_STATE_PRESENT, the bit
 * tested by netif_device_present() - confirm against the definitions.
 */
void netif_device_detach(struct net_device *dev);

void netif_device_attach(struct net_device *dev);

/*
 * Network interface message level settings
 *
 * The enumerators are numbered implicitly from 0, so each value is a bit
 * position; NETIF_MSG_CLASS_COUNT ends up equal to the number of classes.
 * Do not reorder them.
 */

enum {
        NETIF_MSG_DRV_BIT,
        NETIF_MSG_PROBE_BIT,
        NETIF_MSG_LINK_BIT,
        NETIF_MSG_TIMER_BIT,
        NETIF_MSG_IFDOWN_BIT,
        NETIF_MSG_IFUP_BIT,
        NETIF_MSG_RX_ERR_BIT,
        NETIF_MSG_TX_ERR_BIT,
        NETIF_MSG_TX_QUEUED_BIT,
        NETIF_MSG_INTR_BIT,
        NETIF_MSG_TX_DONE_BIT,
        NETIF_MSG_RX_STATUS_BIT,
        NETIF_MSG_PKTDATA_BIT,
        NETIF_MSG_HW_BIT,
        NETIF_MSG_WOL_BIT,

        /* When you add a new bit above, update netif_msg_class_names array
         * in net/ethtool/common.c
         */
        NETIF_MSG_CLASS_COUNT,
};
/* Both ethtool_ops interface and internal driver implementation use u32 */
static_assert(NETIF_MSG_CLASS_COUNT <= 32);

/* __NETIF_MSG_BIT(bit): u32 mask with only bit number @bit set. */
#define __NETIF_MSG_BIT(bit)        ((u32)1 << (bit))
/* __NETIF_MSG(name): mask for the NETIF_MSG_<name>_BIT enumerator. */
#define __NETIF_MSG(name)        __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT)

/* One mask per message class, derived from the enumerators. */
#define NETIF_MSG_DRV                __NETIF_MSG(DRV)
#define NETIF_MSG_PROBE                __NETIF_MSG(PROBE)
#define NETIF_MSG_LINK                __NETIF_MSG(LINK)
#define NETIF_MSG_TIMER                __NETIF_MSG(TIMER)
#define NETIF_MSG_IFDOWN        __NETIF_MSG(IFDOWN)
#define NETIF_MSG_IFUP                __NETIF_MSG(IFUP)
#define NETIF_MSG_RX_ERR        __NETIF_MSG(RX_ERR)
#define NETIF_MSG_TX_ERR        __NETIF_MSG(TX_ERR)
#define NETIF_MSG_TX_QUEUED        __NETIF_MSG(TX_QUEUED)
#define NETIF_MSG_INTR                __NETIF_MSG(INTR)
#define NETIF_MSG_TX_DONE        __NETIF_MSG(TX_DONE)
#define NETIF_MSG_RX_STATUS        __NETIF_MSG(RX_STATUS)
#define NETIF_MSG_PKTDATA        __NETIF_MSG(PKTDATA)
#define NETIF_MSG_HW                __NETIF_MSG(HW)
#define NETIF_MSG_WOL                __NETIF_MSG(WOL)

/*
 * netif_msg_<class>(p): non-zero when the class bit is set in
 * (p)->msg_enable. @p is any pointer to a structure with a msg_enable
 * member. The result is the masked value, not a normalized 0/1.
 */
#define netif_msg_drv(p)        ((p)->msg_enable & NETIF_MSG_DRV)
#define netif_msg_probe(p)        ((p)->msg_enable & NETIF_MSG_PROBE)
#define netif_msg_link(p)        ((p)->msg_enable & NETIF_MSG_LINK)
#define netif_msg_timer(p)        ((p)->msg_enable & NETIF_MSG_TIMER)
#define netif_msg_ifdown(p)        ((p)->msg_enable & NETIF_MSG_IFDOWN)
#define netif_msg_ifup(p)        ((p)->msg_enable & NETIF_MSG_IFUP)
#define netif_msg_rx_err(p)        ((p)->msg_enable & NETIF_MSG_RX_ERR)
#define netif_msg_tx_err(p)        ((p)->msg_enable & NETIF_MSG_TX_ERR)
#define netif_msg_tx_queued(p)        ((p)->msg_enable & NETIF_MSG_TX_QUEUED)
#define netif_msg_intr(p)        ((p)->msg_enable & NETIF_MSG_INTR)
#define netif_msg_tx_done(p)        ((p)->msg_enable & NETIF_MSG_TX_DONE)
#define netif_msg_rx_status(p)        ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p)        ((p)->msg_enable & NETIF_MSG_PKTDATA)
#define netif_msg_hw(p)                ((p)->msg_enable & NETIF_MSG_HW)
#define netif_msg_wol(p)        ((p)->msg_enable & NETIF_MSG_WOL)

/**
 * netif_msg_init - translate a debug level into a message-enable bitmap
 * @debug_value: number of low-order message bits to enable
 * @default_msg_enable_bits: bitmap used when @debug_value is out of range
 *
 * Return: @default_msg_enable_bits when @debug_value is negative or not
 * smaller than the width of a u32 in bits, 0 when @debug_value is 0,
 * otherwise a mask with the low @debug_value bits set.
 */
static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
{
        const int u32_bits = sizeof(u32) * 8;

        /* out of range: use default */
        if (debug_value < 0 || debug_value >= u32_bits)
                return default_msg_enable_bits;

        /* 0 means no output, anything else sets the low N bits */
        return debug_value ? (1U << debug_value) - 1 : 0;
}

/*
 * Take @txq's _xmit_lock and record @cpu as the lock owner.
 * The owner is published only once the lock is held.
 */
static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
{
        spin_lock(&txq->_xmit_lock);
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, cpu);
}

/*
 * Counterpart of __netif_tx_lock() used by HARD_TX_LOCK() and
 * HARD_TX_TRYLOCK() for devices with lltx set. Always returns true.
 * NOTE(review): __acquire() is presumably a static-checker annotation
 * that takes no real lock - confirm.
 */
static inline bool __netif_tx_acquire(struct netdev_queue *txq)
{
        __acquire(&txq->_xmit_lock);
        return true;
}

/*
 * Counterpart of __netif_tx_unlock() used by HARD_TX_UNLOCK() for devices
 * with lltx set.
 * NOTE(review): __release() is presumably a static-checker annotation
 * that drops no real lock - confirm.
 */
static inline void __netif_tx_release(struct netdev_queue *txq)
{
        __release(&txq->_xmit_lock);
}

/*
 * Take @txq's _xmit_lock with spin_lock_bh() and record the current CPU
 * (smp_processor_id()) as the lock owner.
 */
static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
{
        spin_lock_bh(&txq->_xmit_lock);
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}

static inline bool __netif_tx_trylock(struct netdev_queue *txq)
{
        bool ok = spin_trylock(&txq->_xmit_lock);

        if (likely(ok)) {
                /* Pairs with READ_ONCE() in __dev_queue_xmit() */
                WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
        }
        return ok;
}

/*
 * Release @txq's _xmit_lock. The recorded owner is reset to -1 while the
 * lock is still held, before spin_unlock().
 */
static inline void __netif_tx_unlock(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);
        spin_unlock(&txq->_xmit_lock);
}

/*
 * Release @txq's _xmit_lock with spin_unlock_bh(). The recorded owner is
 * reset to -1 while the lock is still held.
 */
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);
        spin_unlock_bh(&txq->_xmit_lock);
}

/*
 * txq_trans_update - stamp @txq->trans_start with the current jiffies
 * @dev: device owning @txq; nothing is written when @dev->lltx is set
 * @txq: transmit queue to stamp
 *
 * txq->trans_start can be read locklessly from dev_watchdog(), hence the
 * WRITE_ONCE().
 */
static inline void txq_trans_update(const struct net_device *dev,
                                    struct netdev_queue *txq)
{
        if (dev->lltx)
                return;

        WRITE_ONCE(txq->trans_start, jiffies);
}

/*
 * txq_trans_cond_update - stamp @txq->trans_start only when it is stale
 * @txq: transmit queue to stamp
 *
 * jiffies is sampled once; the store is skipped when trans_start already
 * holds that value.
 */
static inline void txq_trans_cond_update(struct netdev_queue *txq)
{
        const unsigned long stamp = jiffies;

        if (READ_ONCE(txq->trans_start) == stamp)
                return;

        WRITE_ONCE(txq->trans_start, stamp);
}

/*
 * netif_trans_update - refresh trans_start of @dev's TX queue 0
 * @dev: network device
 *
 * Legacy drivers only; netdev_start_xmit() sets txq->trans_start.
 */
static inline void netif_trans_update(struct net_device *dev)
{
        txq_trans_cond_update(netdev_get_tx_queue(dev, 0));
}

/**
 *        netif_tx_lock - grab network device transmit lock
 *        @dev: network device
 *
 * Get network device transmit lock
 */
void netif_tx_lock(struct net_device *dev);

/*
 * netif_tx_lock() with bottom halves disabled first. Pair with
 * netif_tx_unlock_bh(), which undoes both steps in reverse order.
 */
static inline void netif_tx_lock_bh(struct net_device *dev)
{
        local_bh_disable();
        netif_tx_lock(dev);
}

/* Counterpart of netif_tx_lock() (declaration only). */
void netif_tx_unlock(struct net_device *dev);

/*
 * Counterpart of netif_tx_lock_bh(): drop the transmit lock, then
 * re-enable bottom halves.
 */
static inline void netif_tx_unlock_bh(struct net_device *dev)
{
        netif_tx_unlock(dev);
        local_bh_enable();
}

/*
 * HARD_TX_LOCK - lock @txq for transmission on behalf of @cpu
 * @dev: device owning @txq; decides whether a real lock is taken
 * @txq: transmit queue
 * @cpu: CPU recorded as lock owner
 *
 * Calls __netif_tx_lock() unless @dev->lltx is set, in which case only
 * __netif_tx_acquire() is called.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. as the body of an unbraced if/else. Invoke it with a trailing
 * semicolon.
 */
#define HARD_TX_LOCK(dev, txq, cpu) do {                \
        if (!(dev)->lltx) {                                \
                __netif_tx_lock(txq, cpu);                \
        } else {                                        \
                __netif_tx_acquire(txq);                \
        }                                                \
} while (0)

/*
 * HARD_TX_TRYLOCK - expression form of the TX lock attempt.
 * Evaluates to __netif_tx_trylock(txq) unless (dev)->lltx is set, in
 * which case it evaluates to __netif_tx_acquire(txq), which always
 * returns true.
 */
#define HARD_TX_TRYLOCK(dev, txq)                        \
        (!(dev)->lltx ?                                        \
                __netif_tx_trylock(txq) :                \
                __netif_tx_acquire(txq))

/*
 * HARD_TX_UNLOCK - release a queue locked with HARD_TX_LOCK()/HARD_TX_TRYLOCK()
 * @dev: device owning @txq; decides whether a real lock is released
 * @txq: transmit queue
 *
 * Calls __netif_tx_unlock() unless @dev->lltx is set, in which case only
 * __netif_tx_release() is called.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. as the body of an unbraced if/else. Invoke it with a trailing
 * semicolon.
 */
#define HARD_TX_UNLOCK(dev, txq) do {                        \
        if (!(dev)->lltx) {                                \
                __netif_tx_unlock(txq);                        \
        } else {                                        \
                __netif_tx_release(txq);                \
        }                                                \
} while (0)

/*
 * netif_tx_disable - stop every TX queue of @dev
 * @dev: network device
 *
 * Runs with bottom halves disabled and @dev->tx_global_lock held. Each
 * queue is stopped while its own xmit lock is held.
 */
static inline void netif_tx_disable(struct net_device *dev)
{
        unsigned int qidx;
        int this_cpu;

        local_bh_disable();
        this_cpu = smp_processor_id();
        spin_lock(&dev->tx_global_lock);

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++) {
                struct netdev_queue *queue = netdev_get_tx_queue(dev, qidx);

                __netif_tx_lock(queue, this_cpu);
                netif_tx_stop_queue(queue);
                __netif_tx_unlock(queue);
        }

        spin_unlock(&dev->tx_global_lock);
        local_bh_enable();
}

/*
 * netif_addr_lock - take @dev->addr_list_lock
 * @dev: network device
 *
 * The lock subclass is @dev->nested_level when CONFIG_LOCKDEP is defined
 * and 0 otherwise.
 */
static inline void netif_addr_lock(struct net_device *dev)
{
        unsigned char subclass = 0;

#ifdef CONFIG_LOCKDEP
        subclass = dev->nested_level;
#endif
        spin_lock_nested(&dev->addr_list_lock, subclass);
}

/*
 * netif_addr_lock_bh - disable bottom halves, then take @dev->addr_list_lock
 * @dev: network device
 *
 * The lock subclass is @dev->nested_level when CONFIG_LOCKDEP is defined
 * and 0 otherwise. Release with netif_addr_unlock_bh().
 */
static inline void netif_addr_lock_bh(struct net_device *dev)
{
        unsigned char subclass = 0;

#ifdef CONFIG_LOCKDEP
        subclass = dev->nested_level;
#endif
        local_bh_disable();
        spin_lock_nested(&dev->addr_list_lock, subclass);
}

/* Release @dev->addr_list_lock taken by netif_addr_lock(). */
static inline void netif_addr_unlock(struct net_device *dev)
{
        spin_unlock(&dev->addr_list_lock);
}

/* Release @dev->addr_list_lock taken by netif_addr_lock_bh(), using
 * spin_unlock_bh().
 */
static inline void netif_addr_unlock_bh(struct net_device *dev)
{
        spin_unlock_bh(&dev->addr_list_lock);
}

/*
 * dev_addrs walker. Should be used only for read access. Call with
 * rcu_read_lock held.
 *
 * @dev: pointer expression to the device; parenthesized in the expansion
 *       so that non-trivial expressions (e.g. casts or conditionals) are
 *       dereferenced as a whole
 * @ha:  loop cursor, passed to list_for_each_entry_rcu()
 */
#define for_each_dev_addr(dev, ha) \
                list_for_each_entry_rcu(ha, &(dev)->dev_addrs.list, list)

/* These functions live elsewhere (drivers/net/net_init.c, but related) */

void ether_setup(struct net_device *dev);

/* Allocate dummy net_device */
struct net_device *alloc_netdev_dummy(int sizeof_priv);

/* Support for loadable net-drivers */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                                    unsigned char name_assign_type,
                                    void (*setup)(struct net_device *),
                                    unsigned int txqs, unsigned int rxqs);
/* alloc_netdev(): alloc_netdev_mqs() with txqs = 1 and rxqs = 1 */
#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)

/* alloc_netdev_mq(): alloc_netdev_mqs() with @count used for both txqs
 * and rxqs. @count is expanded twice.
 */
#define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \
                         count)

int register_netdev(struct net_device *dev);
void unregister_netdev(struct net_device *dev);

/* NOTE(review): presumably a device-managed register_netdev() tied to
 * @dev's lifetime - confirm against the definition.
 */
int devm_register_netdev(struct device *dev, struct net_device *ndev);

/* General hardware address lists handling functions */
/* List-to-list synchronization; @addr_len is the address length used. */
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
                   struct netdev_hw_addr_list *from_list, int addr_len);
int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
                            struct netdev_hw_addr_list *from_list,
                            int addr_len);
void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
                      struct netdev_hw_addr_list *from_list, int addr_len);
/* List-to-device synchronization through @sync/@unsync callbacks that
 * receive the device and one address.
 */
int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
                       struct net_device *dev,
                       int (*sync)(struct net_device *, const unsigned char *),
                       int (*unsync)(struct net_device *,
                                     const unsigned char *));
/* The _ref_ variants use callbacks with an additional int argument. */
int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
                           struct net_device *dev,
                           int (*sync)(struct net_device *,
                                       const unsigned char *, int),
                           int (*unsync)(struct net_device *,
                                         const unsigned char *, int));
void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
                              struct net_device *dev,
                              int (*unsync)(struct net_device *,
                                            const unsigned char *, int));
void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
                          struct net_device *dev,
                          int (*unsync)(struct net_device *,
                                        const unsigned char *));
void __hw_addr_init(struct netdev_hw_addr_list *list);

/* Functions used for device addresses handling */
void dev_addr_mod(struct net_device *dev, unsigned int offset,
                  const void *addr, size_t len);

/* Set @len bytes of the device address from @addr: dev_addr_mod() with
 * offset 0.
 */
static inline void
__dev_addr_set(struct net_device *dev, const void *addr, size_t len)
{
        dev_addr_mod(dev, 0, addr, len);
}

/* Set the device address from @addr using @dev->addr_len as the length;
 * @addr must therefore provide at least that many bytes.
 */
static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
{
        __dev_addr_set(dev, addr, dev->addr_len);
}

/* Add/remove a device address of the given @addr_type (declarations only). */
int dev_addr_add(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);

/* Functions used for unicast addresses handling (declarations only; the
 * multicast counterparts are the dev_mc_*() functions below).
 */
int dev_uc_add(struct net_device *dev, const unsigned char *addr);
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_uc_del(struct net_device *dev, const unsigned char *addr);
int dev_uc_sync(struct net_device *to, struct net_device *from);
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_uc_unsync(struct net_device *to, struct net_device *from);
void dev_uc_flush(struct net_device *dev);
void dev_uc_init(struct net_device *dev);

/**
 *  __dev_uc_sync - Synchronize device's unicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Return: the value returned by __hw_addr_sync_dev() for @dev->uc.
 */
static inline int __dev_uc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        return __hw_addr_sync_dev(&dev->uc, dev, sync, unsync);
}

/**
 *  __dev_uc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_uc_sync().
 *  Implemented as __hw_addr_unsync_dev() on @dev->uc.
 */
static inline void __dev_uc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        __hw_addr_unsync_dev(&dev->uc, dev, unsync);
}

/* Functions used for multicast addresses handling (declarations only; the
 * unicast counterparts are the dev_uc_*() functions above).
 */
int dev_mc_add(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_mc_del(struct net_device *dev, const unsigned char *addr);
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_sync(struct net_device *to, struct net_device *from);
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_mc_unsync(struct net_device *to, struct net_device *from);
void dev_mc_flush(struct net_device *dev);
void dev_mc_init(struct net_device *dev);

/**
 *  __dev_mc_sync - Synchronize device's multicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Return: the value returned by __hw_addr_sync_dev() for @dev->mc.
 */
static inline int __dev_mc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        return __hw_addr_sync_dev(&dev->mc, dev, sync, unsync);
}

/**
 *  __dev_mc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_mc_sync().
 *  Implemented as __hw_addr_unsync_dev() on @dev->mc.
 */
static inline void __dev_mc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        __hw_addr_unsync_dev(&dev->mc, dev, unsync);
}

/* Functions used for secondary unicast and multicast support */
void dev_set_rx_mode(struct net_device *dev);
/* @inc is a signed adjustment; netif_set_allmulti() additionally takes
 * a @notify flag that dev_set_allmulti() does not expose.
 */
int netif_set_promiscuity(struct net_device *dev, int inc);
int dev_set_promiscuity(struct net_device *dev, int inc);
int netif_set_allmulti(struct net_device *dev, int inc, bool notify);
int dev_set_allmulti(struct net_device *dev, int inc);
/* State and feature change notifications (declarations only). */
void netif_state_change(struct net_device *dev);
void netdev_state_change(struct net_device *dev);
void __netdev_notify_peers(struct net_device *dev);
void netdev_notify_peers(struct net_device *dev);
void netdev_features_change(struct net_device *dev);
/* Load a device via the kmod */
void dev_load(struct net *net, const char *name);
/* Statistics helpers; the rtnl_link_stats64 argument is caller-provided
 * storage.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage);
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
                             const struct net_device_stats *netdev_stats);
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
                           const struct pcpu_sw_netstats __percpu *netstats);
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);

/* Bit positions, numbered implicitly from 0. */
enum {
        NESTED_SYNC_IMM_BIT,
        NESTED_SYNC_TODO_BIT,
};

/* __NESTED_SYNC_BIT(bit): u32 mask with only bit number @bit set. */
#define __NESTED_SYNC_BIT(bit)        ((u32)1 << (bit))
/* __NESTED_SYNC(name): mask for the NESTED_SYNC_<name>_BIT enumerator. */
#define __NESTED_SYNC(name)        __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT)

#define NESTED_SYNC_IMM                __NESTED_SYNC(IMM)
#define NESTED_SYNC_TODO        __NESTED_SYNC(TODO)

/*
 * Private argument handed to the netdev_walk_all_*_dev*() callbacks.
 * NOTE(review): @flags presumably carries NESTED_SYNC_* bits (both defined
 * masks fit in an unsigned char) and @data is an opaque caller pointer -
 * confirm against the users.
 */
struct netdev_nested_priv {
        unsigned char flags;
        void *data;
};

bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
                                                     struct list_head **iter);

/* iterate through upper list, must be called under RCU read lock
 *
 * @iter is a struct list_head * cursor, seeded with the address of
 * (dev)->adj_list.upper; its address is passed to
 * netdev_upper_get_next_dev_rcu(). The loop ends when that function
 * returns NULL. @dev is expanded several times.
 */
#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \
        for (iter = &(dev)->adj_list.upper, \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \
             updev; \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))

/* Calls @fn for upper devices with the caller's @priv (declaration only). */
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *upper_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev);

bool netdev_has_any_upper_dev(struct net_device *dev);

void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
                                        struct list_head **iter);

/* Iterate over the private pointers of @dev's lower list. @iter is a
 * struct list_head * cursor seeded with (dev)->adj_list.lower.next; the
 * loop ends when netdev_lower_get_next_private() returns NULL.
 */
#define netdev_for_each_lower_private(dev, priv, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             priv = netdev_lower_get_next_private(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private(dev, &(iter)))

/* RCU flavour of the iterator above. Note the different seed: @iter
 * starts at the list head itself, &(dev)->adj_list.lower.
 */
#define netdev_for_each_lower_private_rcu(dev, priv, iter) \
        for (iter = &(dev)->adj_list.lower, \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)))

void *netdev_lower_get_next(struct net_device *dev,
                                struct list_head **iter);

/* Iterate over @dev's lower list. @iter is a struct list_head * cursor
 * seeded with (dev)->adj_list.lower.next; the loop ends when
 * netdev_lower_get_next() returns NULL.
 */
#define netdev_for_each_lower_dev(dev, ldev, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             ldev = netdev_lower_get_next(dev, &(iter)); \
             ldev; \
             ldev = netdev_lower_get_next(dev, &(iter)))

/* Lower-device iteration and walkers (declarations only). The walkers
 * call @fn with the caller's @priv.
 */
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
                                             struct list_head **iter);
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *lower_dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv);
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *lower_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

void *netdev_adjacent_get_private(struct list_head *adj_list);
void *netdev_lower_get_first_private_rcu(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
/* Upper-device link management (declarations only). */
int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
                          struct netlink_ext_ack *extack);
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack);
void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev);
/* Adjacency change in three steps: prepare, then commit or abort. Only
 * the prepare step returns a status and takes @extack.
 */
int netdev_adjacent_change_prepare(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev,
                                   struct netlink_ext_ack *extack);
void netdev_adjacent_change_commit(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev);
void netdev_adjacent_change_abort(struct net_device *old_dev,
                                  struct net_device *new_dev,
                                  struct net_device *dev);
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
void *netdev_lower_dev_get_private(struct net_device *dev,
                                   struct net_device *lower_dev);
void netdev_lower_state_changed(struct net_device *lower_dev,
                                void *lower_state_info);

/*
 * RSS keys are 40 or 52 bytes long; the shared key buffer is sized for the
 * longer of the two.
 */
#define NETDEV_RSS_KEY_LEN 52
extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
/* Fills @buffer with @len bytes (implementation not visible in this header). */
void netdev_rss_key_fill(void *buffer, size_t len);

/*
 * Checksum helper prototypes operating on an skb. Each returns int;
 * presumably 0 on success and a negative errno on failure -- confirm against
 * the implementation.
 */
int skb_checksum_help(struct sk_buff *skb);
int skb_crc32c_csum_help(struct sk_buff *skb);
int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features);

/* Pairs one ifslave record with one ifbond record. */
struct netdev_bonding_info {
        ifslave        slave;
        ifbond        master;
};

/*
 * Notifier payload carrying a netdev_bonding_info. The generic
 * netdev_notifier_info header has to stay the first member.
 */
struct netdev_notifier_bonding_info {
        struct netdev_notifier_info info; /* must be first */
        struct netdev_bonding_info  bonding_info;
};

/* Declared here only; the implementation is not visible in this header. */
void netdev_bonding_info_change(struct net_device *dev,
                                struct netdev_bonding_info *bonding_info);

/*
 * ethtool_notify() is a real function only when CONFIG_ETHTOOL_NETLINK is
 * enabled; otherwise it is an empty inline stub so callers need no #ifdef.
 */
#if !IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
static inline void ethtool_notify(struct net_device *dev, unsigned int cmd)
{
}
#else
void ethtool_notify(struct net_device *dev, unsigned int cmd);
#endif

/*
 * Returns a big-endian (__be16) protocol value for @skb; @depth is an output
 * pointer (its exact meaning is defined by the implementation, not shown here).
 */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth);

/*
 * Report whether @features allows checksum offload for @protocol.
 *
 * FCoE is decided solely by NETIF_F_FCOE_CRC. Everything else is treated as
 * an IP-style checksum (not SCTP CRC): NETIF_F_HW_CSUM covers every protocol,
 * otherwise IPv4 and IPv6 need their own feature bit and any other protocol
 * is rejected.
 */
static inline bool can_checksum_protocol(netdev_features_t features,
                                         __be16 protocol)
{
        if (protocol == htons(ETH_P_FCOE))
                return (features & NETIF_F_FCOE_CRC) != 0;

        /* Generic hardware checksumming handles any remaining protocol. */
        if (features & NETIF_F_HW_CSUM)
                return true;

        if (protocol == htons(ETH_P_IP))
                return (features & NETIF_F_IP_CSUM) != 0;

        if (protocol == htons(ETH_P_IPV6))
                return (features & NETIF_F_IPV6_CSUM) != 0;

        return false;
}

/*
 * netdev_rx_csum_fault() is only implemented when CONFIG_BUG is set; without
 * it the call compiles down to an empty inline stub.
 */
#ifndef CONFIG_BUG
static inline void netdev_rx_csum_fault(struct net_device *dev,
                                        struct sk_buff *skb)
{
}
#else
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb);
#endif
/*
 * rx skb timestamps: enable/disable entry points. Whether calls must be
 * balanced is decided by the implementation, which is not visible here.
 */
void net_enable_timestamp(void);
void net_disable_timestamp(void);

/*
 * Return the timestamp for @hwtstamps on @dev.
 *
 * When the driver supplies ndo_get_tstamp() the decision (including how
 * @cycles is honoured) is delegated to it; otherwise the raw
 * hwtstamps->hwtstamp value is returned and @cycles is ignored.
 */
static inline ktime_t netdev_get_tstamp(struct net_device *dev,
                                        const struct skb_shared_hwtstamps *hwtstamps,
                                        bool cycles)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_get_tstamp)
                return hwtstamps->hwtstamp;

        return ops->ndo_get_tstamp(dev, hwtstamps, cycles);
}

/*
 * Accessors for the transmit "more" flag. CONFIG_PREEMPT_RT builds keep the
 * flag in current->net_xmit; all other builds keep it in the per-CPU
 * softnet_data.xmit.
 */
#ifdef CONFIG_PREEMPT_RT
static inline void netdev_xmit_set_more(bool more)
{
        current->net_xmit.more = more;
}

static inline bool netdev_xmit_more(void)
{
        return current->net_xmit.more;
}
#else
static inline void netdev_xmit_set_more(bool more)
{
        __this_cpu_write(softnet_data.xmit.more, more);
}

static inline bool netdev_xmit_more(void)
{
        return __this_cpu_read(softnet_data.xmit.more);
}
#endif

/*
 * Record the @more hint via netdev_xmit_set_more() and then hand @skb to the
 * driver's ndo_start_xmit(). The driver's return code is passed through
 * unchanged.
 */
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
                                              struct sk_buff *skb, struct net_device *dev,
                                              bool more)
{
        netdev_tx_t ret;

        netdev_xmit_set_more(more);
        ret = ops->ndo_start_xmit(skb, dev);

        return ret;
}

/*
 * Transmit @skb on @dev through the device's own netdev_ops and, only when
 * the driver reports NETDEV_TX_OK, call txq_trans_update() for @txq.
 * Returns the driver's return code.
 */
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                            struct netdev_queue *txq, bool more)
{
        netdev_tx_t rc = __netdev_start_xmit(dev->netdev_ops, skb, dev, more);

        if (rc != NETDEV_TX_OK)
                return rc;

        txq_trans_update(dev, txq);
        return rc;
}

/*
 * Create/remove a class attribute file for a given namespace tag @ns.
 * Declarations only; implementations are not visible in this header.
 */
int netdev_class_create_file_ns(const struct class_attribute *class_attr,
                                const void *ns);
void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
                                 const void *ns);

extern const struct kobj_ns_type_operations net_ns_type_operations;

/* Returns a string for @dev; ownership/lifetime is defined by the implementation. */
const char *netdev_drivername(const struct net_device *dev);

/*
 * Intersect two feature sets.
 *
 * When exactly one side advertises NETIF_F_HW_CSUM, that side is first
 * widened with NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM so the per-protocol
 * checksum bits of the other side survive the AND.
 */
static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
                                                          netdev_features_t f2)
{
        /* Only act when the HW_CSUM bit differs between the two sets. */
        if ((f1 & NETIF_F_HW_CSUM) != (f2 & NETIF_F_HW_CSUM)) {
                if (f1 & NETIF_F_HW_CSUM)
                        f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
                else
                        f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
        }

        return f2 & f1;
}

/*
 * Combine dev->wanted_features with every currently set feature that is not
 * covered by dev->hw_features.
 */
static inline netdev_features_t
netdev_get_wanted_features(struct net_device *dev)
{
        return dev->wanted_features | (dev->features & ~dev->hw_features);
}
/* Declared here only; combination rules for @all/@one/@mask live in the implementation. */
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask);

/*
 * Allow TSO to be used on a stacked device: performing the GSO segmentation
 * before the last device is a performance improvement.
 *
 * Thin wrapper that calls netdev_increment_features() with NETIF_F_ALL_TSO
 * as the feature set being added.
 */
static inline netdev_features_t netdev_add_tso_features(netdev_features_t features,
                                                        netdev_features_t mask)
{
        netdev_features_t combined;

        combined = netdev_increment_features(features, NETIF_F_ALL_TSO, mask);

        return combined;
}

/*
 * Feature recomputation entry points (declarations only). The difference
 * between the three variants is defined by their implementations, which are
 * not visible in this header.
 */
int __netdev_update_features(struct net_device *dev);
void netdev_update_features(struct net_device *dev);
void netdev_change_features(struct net_device *dev);

void netif_stacked_transfer_operstate(const struct net_device *rootdev,
                                        struct net_device *dev);

/* Per-skb feature helpers (declarations only). */
netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features);
netdev_features_t netif_skb_features(struct sk_buff *skb);
void skb_warn_bad_offload(const struct sk_buff *skb);

/*
 * Check whether every GSO type bit in @gso_type is backed by the matching
 * NETIF_F_GSO_* bit in @features.
 *
 * The SKB_GSO_* values are shifted left by NETIF_F_GSO_SHIFT and masked with
 * NETIF_F_GSO_MASK to obtain the feature bits; the BUILD_BUG_ON()s pin that
 * one-to-one correspondence at compile time. Both TCP fixed-ID variants are
 * folded into __SKB_GSO_TCP_FIXEDID before the translation.
 */
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
        netdev_features_t wanted;

        /* check flags correspondence */
        BUILD_BUG_ON(SKB_GSO_TCPV4   != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(__SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_FCOE    != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_GRE     != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_IPXIP4  != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_IPXIP6  != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_ACCECN !=
                     (NETIF_F_GSO_ACCECN >> NETIF_F_GSO_SHIFT));

        /* Either fixed-ID flavour maps onto the single MANGLEID feature bit. */
        if (gso_type & (SKB_GSO_TCP_FIXEDID | SKB_GSO_TCP_FIXEDID_INNER))
                gso_type |= __SKB_GSO_TCP_FIXEDID;

        wanted = ((netdev_features_t)gso_type << NETIF_F_GSO_SHIFT) & NETIF_F_GSO_MASK;

        return wanted == (features & wanted);
}

/*
 * True when @features covers the skb's gso_type (per net_gso_ok()) and,
 * if the skb carries a frag list, NETIF_F_FRAGLIST is present as well.
 */
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
        if (!net_gso_ok(features, skb_shinfo(skb)->gso_type))
                return false;

        if (!skb_has_frag_list(skb))
                return true;

        return !!(features & NETIF_F_FRAGLIST);
}

/*
 * Decide whether a GSO skb needs further GSO handling for @features.
 *
 * Non-GSO skbs never do. A GSO skb does when skb_gso_ok() rejects it, or
 * when its ip_summed is neither CHECKSUM_PARTIAL nor CHECKSUM_UNNECESSARY
 * (the unlikely case).
 */
static inline bool netif_needs_gso(struct sk_buff *skb,
                                   netdev_features_t features)
{
        if (!skb_is_gso(skb))
                return false;

        if (!skb_gso_ok(skb, features))
                return true;

        return unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
                        (skb->ip_summed != CHECKSUM_UNNECESSARY));
}

/* TSO limit setters (declarations only; implementations are not visible here). */
void netif_set_tso_max_size(struct net_device *dev, unsigned int size);
void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs);
void netif_inherit_tso_max(struct net_device *to,
                           const struct net_device *from);

/*
 * Pick the GRO size limit for @skb: dev->gro_max_size when skb->protocol is
 * IPv6, dev->gro_ipv4_max_size for everything else.
 */
static inline unsigned int
netif_get_gro_max_size(const struct net_device *dev, const struct sk_buff *skb)
{
        /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
        if (skb->protocol == htons(ETH_P_IPV6))
                return READ_ONCE(dev->gro_max_size);

        return READ_ONCE(dev->gro_ipv4_max_size);
}

/*
 * Pick the GSO size limit for @skb: dev->gso_max_size when skb->protocol is
 * IPv6, dev->gso_ipv4_max_size for everything else.
 */
static inline unsigned int
netif_get_gso_max_size(const struct net_device *dev, const struct sk_buff *skb)
{
        /* pairs with WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
        if (skb->protocol == htons(ETH_P_IPV6))
                return READ_ONCE(dev->gso_max_size);

        return READ_ONCE(dev->gso_ipv4_max_size);
}

/* True when IFF_MACSEC is set in dev->priv_flags. */
static inline bool netif_is_macsec(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACSEC);
}

/* True when IFF_MACVLAN is set in dev->priv_flags. */
static inline bool netif_is_macvlan(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACVLAN);
}

/* True when IFF_MACVLAN_PORT is set in dev->priv_flags. */
static inline bool netif_is_macvlan_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACVLAN_PORT);
}

static inline bool netif_is_bond_master(const struct net_device *dev)
{
        return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
}

static inline bool netif_is_bond_slave(const struct net_device *dev)
{
        return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING;
}

static inline bool netif_supports_nofcs(struct net_device *dev)
{
        return dev->priv_flags & IFF_SUPP_NOFCS;
}

/* True when IFF_L3MDEV_RX_HANDLER is set in dev->priv_flags. */
static inline bool netif_has_l3_rx_handler(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_RX_HANDLER);
}

/* True when IFF_L3MDEV_MASTER is set in dev->priv_flags. */
static inline bool netif_is_l3_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_MASTER);
}

/* True when IFF_L3MDEV_SLAVE is set in dev->priv_flags. */
static inline bool netif_is_l3_slave(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_SLAVE);
}

/*
 * Return dev->ifindex when CONFIG_NET_L3_MASTER_DEV is enabled and the
 * device is an L3 slave (netif_is_l3_slave()); return 0 otherwise.
 */
static inline int dev_sdif(const struct net_device *dev)
{
        int sdif = 0;

#ifdef CONFIG_NET_L3_MASTER_DEV
        if (netif_is_l3_slave(dev))
                sdif = dev->ifindex;
#endif
        return sdif;
}

/* True when IFF_EBRIDGE is set in dev->priv_flags. */
static inline bool netif_is_bridge_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_EBRIDGE);
}

/* True when IFF_BRIDGE_PORT is set in dev->priv_flags. */
static inline bool netif_is_bridge_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_BRIDGE_PORT);
}

/* True when IFF_OPENVSWITCH is set in dev->priv_flags. */
static inline bool netif_is_ovs_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_OPENVSWITCH);
}

/* True when IFF_OVS_DATAPATH is set in dev->priv_flags. */
static inline bool netif_is_ovs_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_OVS_DATAPATH);
}

static inline bool netif_is_any_bridge_master(const struct net_device *dev)
{
        return netif_is_bridge_master(dev) || netif_is_ovs_master(dev);
}

static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
        return netif_is_bridge_port(dev) || netif_is_ovs_port(dev);
}

/* True when IFF_TEAM is set in dev->priv_flags. */
static inline bool netif_is_team_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_TEAM);
}

/* True when IFF_TEAM_PORT is set in dev->priv_flags. */
static inline bool netif_is_team_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_TEAM_PORT);
}

static inline bool netif_is_lag_master(const struct net_device *dev)
{
        return netif_is_bond_master(dev) || netif_is_team_master(dev);
}

static inline bool netif_is_lag_port(const struct net_device *dev)
{
        return netif_is_bond_slave(dev) || netif_is_team_port(dev);
}

/* True when IFF_RXFH_CONFIGURED is set in dev->priv_flags. */
static inline bool netif_is_rxfh_configured(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_RXFH_CONFIGURED);
}

/* True when IFF_FAILOVER is set in dev->priv_flags. */
static inline bool netif_is_failover(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_FAILOVER);
}

/* True when IFF_FAILOVER_SLAVE is set in dev->priv_flags. */
static inline bool netif_is_failover_slave(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_FAILOVER_SLAVE);
}

/*
 * This device needs to keep the skb dst for qdisc enqueue or
 * ndo_start_xmit(): clear both IFF_XMIT_DST_RELEASE and
 * IFF_XMIT_DST_RELEASE_PERM in dev->priv_flags.
 */
static inline void netif_keep_dst(struct net_device *dev)
{
        dev->priv_flags = dev->priv_flags &
                          ~(IFF_XMIT_DST_RELEASE_PERM | IFF_XMIT_DST_RELEASE);
}

/*
 * Return true if dev can't cope with mtu frames that need vlan tag insertion.
 *
 * Currently this is exactly the set of MACsec devices. The device is only
 * inspected, never modified, so it is taken as a const pointer to match
 * netif_is_macsec() and the other predicates; existing callers are unaffected.
 */
static inline bool netif_reduces_vlan_mtu(const struct net_device *dev)
{
        /* TODO: reserve and use an additional IFF bit, if we get more users */
        return netif_is_macsec(dev);
}

/* Per-network-namespace operations object; defined outside this header. */
extern struct pernet_operations __net_initdata loopback_net_ops;

/* Logging, debugging and troubleshooting/diagnostic helpers. */

/* netdev_printk helpers, similar to dev_printk */

/*
 * Printable name for @dev. An empty name, or one that still contains a '%'
 * character, is reported as "(unnamed net_device)".
 */
static inline const char *netdev_name(const struct net_device *dev)
{
        const char *name = dev->name;

        if (name[0] && !strchr(name, '%'))
                return name;

        return "(unnamed net_device)";
}

/*
 * Map dev->reg_state (sampled once with READ_ONCE()) to a log suffix.
 * NETREG_REGISTERED yields an empty string; an unrecognised value triggers a
 * one-time warning and yields " (unknown)".
 */
static inline const char *netdev_reg_state(const struct net_device *dev)
{
        u8 state = READ_ONCE(dev->reg_state);
        const char *suffix;

        switch (state) {
        case NETREG_UNINITIALIZED:
                suffix = " (uninitialized)";
                break;
        case NETREG_REGISTERED:
                suffix = "";
                break;
        case NETREG_UNREGISTERING:
                suffix = " (unregistering)";
                break;
        case NETREG_UNREGISTERED:
                suffix = " (unregistered)";
                break;
        case NETREG_RELEASED:
                suffix = " (released)";
                break;
        case NETREG_DUMMY:
                suffix = " (dummy)";
                break;
        default:
                WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, state);
                suffix = " (unknown)";
                break;
        }

        return suffix;
}

/*
 * Emit a MODULE_ALIAS of the form "netdev-<device>". @device must be a string
 * literal because it is joined by compile-time string concatenation.
 */
#define MODULE_ALIAS_NETDEV(device) \
        MODULE_ALIAS("netdev-" device)

/*
 * netdev_WARN() acts like dev_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * The message is prefixed with "netdevice: <name><reg-state>: ", built from
 * netdev_name() and netdev_reg_state(). @format must be a string literal
 * since it is concatenated with that prefix at compile time.
 */
#define netdev_WARN(dev, format, args...)                        \
        WARN(1, "netdevice: %s%s: " format, netdev_name(dev),        \
             netdev_reg_state(dev), ##args)

/* Same as netdev_WARN(), but routed through WARN_ONCE(). */
#define netdev_WARN_ONCE(dev, format, args...)                                \
        WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev),        \
                  netdev_reg_state(dev), ##args)

/*
 *        The list of packet types we will receive (as opposed to discard)
 *        and the routines to invoke.
 *
 *        Why 16. Because with 16 the only overlap we get on a hash of the
 *        low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *                0800        IP
 *                0001        802.3
 *                0002        AX.25
 *                0004        802.2
 *                8035        RARP
 *                0005        SNAP
 *                0805        X.25
 *                0806        ARP
 *                8137        IPX
 *                0009        Localtalk
 *                86DD        IPv6
 */
/* Number of buckets in ptype_base[]; the mask relies on this being a power of two. */
#define PTYPE_HASH_SIZE        (16)
#define PTYPE_HASH_MASK        (PTYPE_HASH_SIZE - 1)

/* One list head per hash bucket; defined outside this header. */
extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

extern struct net_device *blackhole_netdev;

/*
 * Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters.
 *
 * FIELD is token-pasted after a "__" prefix, so DEV_STATS_INC(dev, foo)
 * operates on dev->stats.__foo through the atomic_long_*() helpers.
 */
#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
#define DEV_STATS_ADD(DEV, FIELD, VAL)         \
                atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)

#endif        /* _LINUX_NETDEVICE_H */










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_DST_OPS_H
#define _NET_DST_OPS_H
#include <linux/types.h>
#include <linux/percpu_counter.h>
#include <linux/cache.h>

/*
 * Forward declarations for the types that struct dst_ops only refers to by
 * pointer. The slab cache tag is "kmem_cache" (the member that uses it is
 * named kmem_cachep).
 */
struct dst_entry;
struct kmem_cache;
struct net_device;
struct sk_buff;
struct sock;
struct net;

/*
 * Per-family table of destination-cache callbacks plus the bookkeeping that
 * goes with it (family id, GC threshold, slab cache pointer and a per-CPU
 * entry counter). The semantics of each callback are defined by the
 * protocols that fill in this table; they are not visible in this header.
 */
struct dst_ops {
        unsigned short                family;
        unsigned int                gc_thresh;

        void                        (*gc)(struct dst_ops *ops);
        struct dst_entry *        (*check)(struct dst_entry *, __u32 cookie);
        unsigned int                (*default_advmss)(const struct dst_entry *);
        unsigned int                (*mtu)(const struct dst_entry *);
        u32 *                        (*cow_metrics)(struct dst_entry *, unsigned long);
        void                        (*destroy)(struct dst_entry *);
        void                        (*ifdown)(struct dst_entry *,
                                          struct net_device *dev);
        void                        (*negative_advice)(struct sock *sk, struct dst_entry *);
        void                        (*link_failure)(struct sk_buff *);
        void                        (*update_pmtu)(struct dst_entry *dst, struct sock *sk,
                                               struct sk_buff *skb, u32 mtu,
                                               bool confirm_neigh);
        void                        (*redirect)(struct dst_entry *dst, struct sock *sk,
                                            struct sk_buff *skb);
        int                        (*local_out)(struct net *net, struct sock *sk, struct sk_buff *skb);
        struct neighbour *        (*neigh_lookup)(const struct dst_entry *dst,
                                                struct sk_buff *skb,
                                                const void *daddr);
        void                        (*confirm_neigh)(const struct dst_entry *dst,
                                                 const void *daddr);

        struct kmem_cache        *kmem_cachep;

        /* Entry counter, aligned to its own cache line on SMP builds. */
        struct percpu_counter        pcpuc_entries ____cacheline_aligned_in_smp;
};

/*
 * Read the entry counter with percpu_counter_read_positive() and return it
 * as an int.
 */
static inline int dst_entries_get_fast(struct dst_ops *dst)
{
        int nr_entries = percpu_counter_read_positive(&dst->pcpuc_entries);

        return nr_entries;
}

/*
 * Read the entry counter with percpu_counter_sum_positive() and return it
 * as an int.
 */
static inline int dst_entries_get_slow(struct dst_ops *dst)
{
        int nr_entries = percpu_counter_sum_positive(&dst->pcpuc_entries);

        return nr_entries;
}

/* Batch size handed to percpu_counter_add_batch() by dst_entries_add(). */
#define DST_PERCPU_COUNTER_BATCH 32

/* Add @val (which may be negative) to the per-CPU entry counter of @dst. */
static inline void dst_entries_add(struct dst_ops *dst, int val)
{
        struct percpu_counter *counter = &dst->pcpuc_entries;

        percpu_counter_add_batch(counter, val, DST_PERCPU_COUNTER_BATCH);
}

/*
 * Initialise the entry counter to 0 using a GFP_KERNEL allocation.
 * Returns whatever percpu_counter_init() returns.
 */
static inline int dst_entries_init(struct dst_ops *dst)
{
        int err = percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL);

        return err;
}

/* Tear down the entry counter set up by dst_entries_init(). */
static inline void dst_entries_destroy(struct dst_ops *dst)
{
        struct percpu_counter *counter = &dst->pcpuc_entries;

        percpu_counter_destroy(counter);
}

#endif

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/processor.h>
#include <asm/simd.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/sizes.h>

/*
 * Assembly implementations (asmlinkage) of the BLAKE2s compression function.
 * Both take the same arguments as blake2s_compress() and are only called
 * between kernel_fpu_begin() and kernel_fpu_end().
 */
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
                                       const u8 *block, const size_t nblocks,
                                       const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
                                        const u8 *block, const size_t nblocks,
                                        const u32 inc);

/*
 * Static keys selecting the SIMD implementation. Both start out false and
 * are switched on by blake2s_mod_init_arch() according to CPU features.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);

/*
 * Compress @nblocks BLAKE2s blocks starting at @block into @state.
 *
 * Falls back to blake2s_compress_generic() when SSSE3 support is not enabled
 * or SIMD cannot be used in the current context. Otherwise the input is fed
 * to the AVX-512 or SSSE3 routine in chunks of at most 4 KiB, with the FPU
 * section opened and closed around every chunk.
 */
static void blake2s_compress(struct blake2s_state *state, const u8 *block,
                             size_t nblocks, const u32 inc)
{
        /* SIMD disables preemption, so relax after processing each page. */
        BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);

        if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
                blake2s_compress_generic(state, block, nblocks, inc);
                return;
        }

        for (;;) {
                const size_t chunk = min_t(size_t, nblocks,
                                           SZ_4K / BLAKE2S_BLOCK_SIZE);

                kernel_fpu_begin();
                if (static_branch_likely(&blake2s_use_avx512))
                        blake2s_compress_avx512(state, block, chunk, inc);
                else
                        blake2s_compress_ssse3(state, block, chunk, inc);
                kernel_fpu_end();

                block += chunk * BLAKE2S_BLOCK_SIZE;
                nblocks -= chunk;
                if (!nblocks)
                        break;
        }
}

#define blake2s_mod_init_arch blake2s_mod_init_arch
/*
 * Enable the SIMD static keys that match the boot CPU.
 *
 * SSSE3 alone turns on blake2s_use_ssse3. The AVX-512 key additionally
 * requires AVX, AVX2, AVX512F and AVX512VL plus the SSE, YMM and AVX-512
 * xfeatures.
 */
static void blake2s_mod_init_arch(void)
{
        if (boot_cpu_has(X86_FEATURE_SSSE3))
                static_branch_enable(&blake2s_use_ssse3);

        if (!boot_cpu_has(X86_FEATURE_AVX) ||
            !boot_cpu_has(X86_FEATURE_AVX2) ||
            !boot_cpu_has(X86_FEATURE_AVX512F) ||
            !boot_cpu_has(X86_FEATURE_AVX512VL) ||
            !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
                               XFEATURE_MASK_AVX512, NULL))
                return;

        static_branch_enable(&blake2s_use_avx512);
}








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * acpi.h - ACPI Interface
 *
 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
 */

#ifndef _LINUX_ACPI_H
#define _LINUX_ACPI_H

#include <linux/errno.h>
#include <linux/ioport.h>        /* for struct resource */
#include <linux/resource_ext.h>
#include <linux/device.h>
#include <linux/mod_devicetable.h>
#include <linux/property.h>
#include <linux/uuid.h>
#include <linux/node.h>

struct irq_domain;
struct irq_domain_ops;

#ifndef _LINUX
#define _LINUX
#endif
#include <acpi/acpi.h>
#include <acpi/acpi_numa.h>

#ifdef        CONFIG_ACPI

#include <linux/list.h>
#include <linux/dynamic_debug.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/fw_table.h>

#include <acpi/acpi_bus.h>
#include <acpi/acpi_drivers.h>
#include <acpi/acpi_io.h>
#include <asm/acpi.h>

/*
 * With CONFIG_ACPI_TABLE_LIB the table helpers are exported (GPL-only, in
 * the "ACPI" symbol namespace) and the __init/__initdata annotations are
 * dropped. Without it nothing is exported and the annotations expand to
 * __init and __initdata.
 */
#ifdef CONFIG_ACPI_TABLE_LIB
#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, "ACPI")
#define __init_or_acpilib
#define __initdata_or_acpilib
#else
#define EXPORT_SYMBOL_ACPI_LIB(x)
#define __init_or_acpilib __init
#define __initdata_or_acpilib __initdata
#endif

/* Return adev->handle, or NULL when @adev itself is NULL. */
static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
{
        if (!adev)
                return NULL;

        return adev->handle;
}

/* ACPI device node behind dev->fwnode, as returned by to_acpi_device_node(). */
#define ACPI_COMPANION(dev)                to_acpi_device_node((dev)->fwnode)
/*
 * Make @adev's fwnode the primary fwnode of @dev, or clear it when @adev is
 * NULL. NOTE: @adev is expanded twice, so pass an expression that is cheap
 * and free of side effects.
 */
#define ACPI_COMPANION_SET(dev, adev)        set_primary_fwnode(dev, (adev) ? \
        acpi_fwnode_handle(adev) : NULL)
/* ACPI handle of @dev's companion, or NULL when the companion is NULL. */
#define ACPI_HANDLE(dev)                acpi_device_handle(ACPI_COMPANION(dev))
#define ACPI_HANDLE_FWNODE(fwnode)        \
                                acpi_device_handle(to_acpi_device_node(fwnode))

/*
 * Allocate a zeroed fwnode_handle with GFP_KERNEL and initialise it with
 * acpi_static_fwnode_ops. Returns NULL when the allocation fails; a node
 * obtained here is released with acpi_free_fwnode_static().
 */
static inline struct fwnode_handle *acpi_alloc_fwnode_static(void)
{
        struct fwnode_handle *node = kzalloc(sizeof(*node), GFP_KERNEL);

        if (node)
                fwnode_init(node, &acpi_static_fwnode_ops);

        return node;
}

/*
 * Free @fwnode with kfree().  If @fwnode is not an ACPI static node a warning
 * is raised and nothing is freed.
 */
static inline void acpi_free_fwnode_static(struct fwnode_handle *fwnode)
{
        if (!WARN_ON(!is_acpi_static_node(fwnode)))
                kfree(fwnode);
}

/* True when the fwnode attached to @dev is an ACPI device node. */
static inline bool has_acpi_companion(struct device *dev)
{
        struct fwnode_handle *node = dev->fwnode;

        return is_acpi_device_node(node);
}

/*
 * Look up the child of @parent at address @addr and make it the ACPI
 * companion of @dev; when no child is found the primary fwnode is set to
 * NULL instead.
 *
 * The lookup result is kept in a local because ACPI_COMPANION_SET() expands
 * its second argument twice; passing the call expression directly would run
 * acpi_find_child_device() a second time whenever the first call succeeds.
 */
static inline void acpi_preset_companion(struct device *dev,
                                         struct acpi_device *parent, u64 addr)
{
        struct acpi_device *adev = acpi_find_child_device(parent, addr, false);

        ACPI_COMPANION_SET(dev, adev);
}

/* Return dev_name() of the struct device embedded in @adev. */
static inline const char *acpi_dev_name(struct acpi_device *adev)
{
        struct device *embedded = &adev->dev;

        return dev_name(embedded);
}

struct device *acpi_get_first_physical_node(struct acpi_device *adev);

/*
 * Interrupt models.  Numbering starts at 0, so ACPI_IRQ_MODEL_COUNT (the last
 * enumerator) equals the number of models listed before it.
 */
enum acpi_irq_model_id {
        ACPI_IRQ_MODEL_PIC = 0,
        ACPI_IRQ_MODEL_IOAPIC,
        ACPI_IRQ_MODEL_IOSAPIC,
        ACPI_IRQ_MODEL_PLATFORM,
        ACPI_IRQ_MODEL_GIC,
        ACPI_IRQ_MODEL_LPIC,
        ACPI_IRQ_MODEL_RINTC,
        ACPI_IRQ_MODEL_COUNT
};

/* Global holding one of the acpi_irq_model_id values; defined elsewhere. */
extern enum acpi_irq_model_id        acpi_irq_model;

/*
 * Interrupt identifiers.  Numbering starts at 1, so ACPI_INTERRUPT_COUNT is
 * one greater than the number of identifiers listed before it.
 */
enum acpi_interrupt_id {
        ACPI_INTERRUPT_PMI        = 1,
        ACPI_INTERRUPT_INIT,
        ACPI_INTERRUPT_CPEI,
        ACPI_INTERRUPT_COUNT
};

#define        ACPI_SPACE_MEM                0

/*
 * Address range identifiers.  Numbering starts at 1, so
 * ACPI_ADDRESS_RANGE_COUNT is one greater than the number of range types.
 */
enum acpi_address_range_id {
        ACPI_ADDRESS_RANGE_MEMORY = 1,
        ACPI_ADDRESS_RANGE_RESERVED = 2,
        ACPI_ADDRESS_RANGE_ACPI = 3,
        ACPI_ADDRESS_RANGE_NVS        = 4,
        ACPI_ADDRESS_RANGE_COUNT
};


/* Table Handlers: callback invoked with a pointer to a table header. */
typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table);

/* Debugger support */

/* Callbacks a debugger implementation supplies when it registers. */
struct acpi_debugger_ops {
        int (*create_thread)(acpi_osd_exec_callback function, void *context);
        ssize_t (*write_log)(const char *msg);
        ssize_t (*read_cmd)(char *buffer, size_t length);
        int (*wait_command_ready)(bool single_step, char *buffer, size_t length);
        int (*notify_command_complete)(void);
};

/*
 * Debugger registration record: the callback table, the module that owns it
 * and a mutex (presumably serialising access to the other fields -- confirm
 * against the implementation).
 */
struct acpi_debugger {
        const struct acpi_debugger_ops *ops;
        struct module *owner;
        struct mutex lock;
};

#ifdef CONFIG_ACPI_DEBUGGER
/*
 * Debugger entry points, only declared when CONFIG_ACPI_DEBUGGER is enabled;
 * the #else branch provides inline stubs.
 */
int __init acpi_debugger_init(void);
int acpi_register_debugger(struct module *owner,
                           const struct acpi_debugger_ops *ops);
void acpi_unregister_debugger(const struct acpi_debugger_ops *ops);
int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context);
ssize_t acpi_debugger_write_log(const char *msg);
ssize_t acpi_debugger_read_cmd(char *buffer, size_t buffer_length);
int acpi_debugger_wait_command_ready(void);
int acpi_debugger_notify_command_complete(void);
#else
/*
 * !CONFIG_ACPI_DEBUGGER stubs: there is no debugger, so each operation
 * reports -ENODEV and unregistering is a no-op.
 */
static inline int acpi_debugger_init(void)
{
        return -ENODEV;
}

static inline int acpi_register_debugger(struct module *owner,
                                         const struct acpi_debugger_ops *ops)
{
        return -ENODEV;
}

static inline void acpi_unregister_debugger(const struct acpi_debugger_ops *ops)
{
}

static inline int acpi_debugger_create_thread(acpi_osd_exec_callback function,
                                              void *context)
{
        return -ENODEV;
}

/*
 * !CONFIG_ACPI_DEBUGGER stub: no debugger is available, so writing a log
 * message always fails with -ENODEV.  The return type is ssize_t so that the
 * stub has the same signature as the prototype used with
 * CONFIG_ACPI_DEBUGGER.
 */
static inline ssize_t acpi_debugger_write_log(const char *msg)
{
        return -ENODEV;
}

/*
 * !CONFIG_ACPI_DEBUGGER stub: no debugger is available, so reading a command
 * always fails with -ENODEV and @buffer is left untouched.  The parameter and
 * return types (size_t / ssize_t) match the prototype used with
 * CONFIG_ACPI_DEBUGGER.
 */
static inline ssize_t acpi_debugger_read_cmd(char *buffer, size_t buffer_length)
{
        return -ENODEV;
}

/* !CONFIG_ACPI_DEBUGGER stub: always reports -ENODEV. */
static inline int acpi_debugger_wait_command_ready(void)
{
        return -ENODEV;
}

/* !CONFIG_ACPI_DEBUGGER stub: always reports -ENODEV. */
static inline int acpi_debugger_notify_command_complete(void)
{
        return -ENODEV;
}
#endif

/*
 * BAD_MADT_ENTRY - sanity-check a subtable entry before it is used.
 * @entry: typed pointer to the entry; sizeof(*entry) is the expected size
 * @end:   upper address bound the entry must not cross, as an unsigned long
 *
 * Evaluates to true when @entry is NULL, when the structure would extend past
 * @end, or when the length recorded in the subtable header is smaller than
 * the structure.  Every argument use is parenthesised so that expression
 * arguments (e.g. "tbl + 1", "limit - 1") are handled correctly.  @entry is
 * evaluated more than once and therefore must not have side effects.
 */
#define BAD_MADT_ENTRY(entry, end) (                                            \
                (!(entry)) || (unsigned long)(entry) + sizeof(*(entry)) > (end) ||  \
                ((struct acpi_subtable_header *)(entry))->length < sizeof(*(entry)))

/* Table mapping, boot-time initialisation and table parsing entry points. */
void __iomem *__acpi_map_table(unsigned long phys, unsigned long size);
void __acpi_unmap_table(void __iomem *map, unsigned long size);
int early_acpi_boot_init(void);
int acpi_boot_init (void);
void acpi_boot_table_prepare (void);
void acpi_boot_table_init (void);
int acpi_mps_check (void);
int acpi_numa_init (void);

int acpi_locate_initial_tables (void);
void acpi_reserve_initial_tables (void);
void acpi_table_init_complete (void);
int acpi_table_init (void);

/*
 * Parsers taking a table id and a handler.  The ones marked
 * __init_or_acpilib follow the section annotation chosen by
 * CONFIG_ACPI_TABLE_LIB.
 */
int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
int __init_or_acpilib acpi_table_parse_entries(char *id,
                unsigned long table_size, int entry_id,
                acpi_tbl_entry_handler handler, unsigned int max_entries);
int __init_or_acpilib acpi_table_parse_entries_array(char *id,
                unsigned long table_size, struct acpi_subtable_proc *proc,
                int proc_num, unsigned int max_entries);
int acpi_table_parse_madt(enum acpi_madt_type id,
                          acpi_tbl_entry_handler handler,
                          unsigned int max_entries);
int __init_or_acpilib
acpi_table_parse_cedt(enum acpi_cedt_type id,
                      acpi_tbl_entry_handler_arg handler_arg, void *arg);

int acpi_parse_mcfg (struct acpi_table_header *header);
void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);

/*
 * Architecture hooks.  Each one is a real function only on the listed
 * configurations and an empty inline stub everywhere else.
 */
#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa);
#else
static inline void
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { }
#endif

/* Declared unconditionally; no inline stub is provided here. */
void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa);

#if defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH)
void acpi_arch_dma_setup(struct device *dev);
#else
static inline void acpi_arch_dma_setup(struct device *dev) { }
#endif

#ifdef CONFIG_ARM64
void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa);
#else
static inline void
acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
#endif

#ifdef CONFIG_RISCV
void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa);
#else
static inline void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) { }
#endif

/*
 * Default physical CPU id type and its "invalid" marker (all bits set), used
 * unless PHYS_CPUID_INVALID has already been defined before this point.
 * The expansion is fully parenthesised so the cast cannot combine with
 * operators at the use site.
 */
#ifndef PHYS_CPUID_INVALID
typedef u32 phys_cpuid_t;
#define PHYS_CPUID_INVALID ((phys_cpuid_t)(-1))
#endif

/* A logical CPU id is invalid when it is negative once reinterpreted as int. */
static inline bool invalid_logical_cpuid(u32 cpuid)
{
        int as_signed = (int)cpuid;

        return as_signed < 0;
}

/* A physical CPU id is invalid when it equals PHYS_CPUID_INVALID. */
static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id)
{
        return !(phys_id != PHYS_CPUID_INVALID);
}


int __init acpi_get_madt_revision(void);

/* Validate the processor object's proc_id */
bool acpi_duplicate_processor_id(int proc_id);
/* Processor _CST control */
struct acpi_processor_power;

/*
 * _CST helpers.  Without CONFIG_ACPI_PROCESSOR_CSTATE the inline stubs report
 * that control was not claimed (false) and that evaluation failed (-ENODEV).
 */
#ifdef CONFIG_ACPI_PROCESSOR_CSTATE
bool acpi_processor_claim_cst_control(void);
int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu,
                                struct acpi_processor_power *info);
#else
static inline bool acpi_processor_claim_cst_control(void) { return false; }
static inline int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu,
                                              struct acpi_processor_power *info)
{
        return -ENODEV;
}
#endif

#ifdef CONFIG_ACPI_HOTPLUG_CPU
/* Arch dependent functions for cpu hotplug support */
int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
                 int *pcpu);
int acpi_unmap_cpu(int cpu);
#endif /* CONFIG_ACPI_HOTPLUG_CPU */

acpi_handle acpi_get_processor_handle(int cpu);

/* Only declared with CONFIG_ACPI_HOTPLUG_IOAPIC; no stub is provided here. */
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr);
#endif

int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base);
int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base);
int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base);
void acpi_irq_stats_init(void);
/* Globals defined elsewhere. */
extern u32 acpi_irq_handled;
extern u32 acpi_irq_not_handled;
extern unsigned int acpi_sci_irq;
extern bool acpi_no_s5;
#define INVALID_ACPI_IRQ        ((unsigned)-1)
/* True when acpi_sci_irq holds anything other than INVALID_ACPI_IRQ. */
static inline bool acpi_sci_irq_valid(void)
{
        return !(acpi_sci_irq == INVALID_ACPI_IRQ);
}

extern int sbf_port;

/* GSI registration and GSI <-> IRQ translation. */
int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);

/* Callback type: takes a u32 (a GSI, judging by the name) and returns a fwnode handle. */
typedef struct fwnode_handle *(*acpi_gsi_domain_disp_fn)(u32);

void acpi_set_irq_model(enum acpi_irq_model_id model,
                        acpi_gsi_domain_disp_fn fn);
acpi_gsi_domain_disp_fn acpi_get_gsi_dispatcher(void);
void acpi_set_gsi_to_irq_fallback(u32 (*)(u32));

struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags,
                                             unsigned int size,
                                             struct fwnode_handle *fwnode,
                                             const struct irq_domain_ops *ops,
                                             void *host_data);

/*
 * Only implemented with CONFIG_X86_IO_APIC; the stub used otherwise returns
 * -1 and does not touch @trigger or @polarity.
 */
#ifdef CONFIG_X86_IO_APIC
extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
#else
static inline int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
{
        return -1;
}
#endif
/*
 * This function undoes the effect of one call to acpi_register_gsi().
 * If this matches the last registration, any IRQ resources for gsi
 * are freed.
 */
void acpi_unregister_gsi (u32 gsi);

struct pci_dev;

/* PCI interrupt routing helpers. */
struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin);
int acpi_pci_irq_enable (struct pci_dev *dev);
void acpi_penalize_isa_irq(int irq, int active);
bool acpi_isa_irq_available(int irq);
/* Without CONFIG_PCI, acpi_penalize_sci_irq() is an empty stub. */
#ifdef CONFIG_PCI
void acpi_penalize_sci_irq(int irq, int trigger, int polarity);
#else
static inline void acpi_penalize_sci_irq(int irq, int trigger,
                                        int polarity)
{
}
#endif
void acpi_pci_irq_disable (struct pci_dev *dev);

/* EC access helpers (presumably "embedded controller" -- confirm). */
extern int ec_read(u8 addr, u8 *val);
extern int ec_write(u8 addr, u8 val);
extern int ec_transaction(u8 command,
                          const u8 *wdata, unsigned wdata_len,
                          u8 *rdata, unsigned rdata_len);
extern acpi_handle ec_get_handle(void);

extern bool acpi_is_pnp_device(struct acpi_device *);

/*
 * GUID-addressed WMI interface, declared when the WMI driver is built in or
 * as a module.  No stubs are provided here for the disabled case.
 */
#if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE)

/* Notification callback: receives the event data and the caller's context. */
typedef void (*wmi_notify_handler) (union acpi_object *data, void *context);

int wmi_instance_count(const char *guid);

extern acpi_status wmi_evaluate_method(const char *guid, u8 instance,
                                        u32 method_id,
                                        const struct acpi_buffer *in,
                                        struct acpi_buffer *out);
extern acpi_status wmi_query_block(const char *guid, u8 instance,
                                        struct acpi_buffer *out);
extern acpi_status wmi_set_block(const char *guid, u8 instance,
                                        const struct acpi_buffer *in);
extern acpi_status wmi_install_notify_handler(const char *guid,
                                        wmi_notify_handler handler, void *data);
extern acpi_status wmi_remove_notify_handler(const char *guid);
extern bool wmi_has_guid(const char *guid);
extern char *wmi_get_acpi_device_uid(const char *guid);

#endif        /* CONFIG_ACPI_WMI */

/* ACPI video flags; each value is a distinct single bit (0x0001 .. 0x0800). */
#define ACPI_VIDEO_OUTPUT_SWITCHING                        0x0001
#define ACPI_VIDEO_DEVICE_POSTING                        0x0002
#define ACPI_VIDEO_ROM_AVAILABLE                        0x0004
#define ACPI_VIDEO_BACKLIGHT                                0x0008
#define ACPI_VIDEO_BACKLIGHT_FORCE_VENDOR                0x0010
#define ACPI_VIDEO_BACKLIGHT_FORCE_VIDEO                0x0020
#define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VENDOR        0x0040
#define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VIDEO                0x0080
#define ACPI_VIDEO_BACKLIGHT_DMI_VENDOR                        0x0100
#define ACPI_VIDEO_BACKLIGHT_DMI_VIDEO                        0x0200
#define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR                0x0400
#define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO                0x0800

extern char acpi_video_backlight_string[];
extern long acpi_is_video_device(acpi_handle handle);

extern void acpi_osi_setup(char *str);
extern bool acpi_osi_is_win8(void);

/*
 * Trip temperature getters; each stores its result through @ret_temp.
 * Only declared with CONFIG_ACPI_THERMAL_LIB.
 */
#ifdef CONFIG_ACPI_THERMAL_LIB
int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp);
int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp);
int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp);
int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
#endif

/* Without CONFIG_ACPI_HMAT the stub reports -EOPNOTSUPP and leaves @coord untouched. */
#ifdef CONFIG_ACPI_HMAT
int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord);
#else
static inline int acpi_get_genport_coordinates(u32 uid,
                                               struct access_coordinate *coord)
{
        return -EOPNOTSUPP;
}
#endif

#ifdef CONFIG_ACPI_NUMA
/* Real implementations with CONFIG_ACPI_NUMA; the #else branch has stubs returning 0. */
int acpi_map_pxm_to_node(int pxm);
int acpi_get_node(acpi_handle handle);

/**
 * pxm_to_online_node - Map proximity ID to online node
 * @pxm: ACPI proximity ID
 *
 * Works like pxm_to_node(), except that the result is always an online node:
 * the node obtained from pxm_to_node() is passed through
 * numa_map_to_online_node(), so that when the mapped node is offline the
 * nearest online node (per the node distance table) is returned instead.
 *
 * Intended for ACPI device drivers that run after NUMA initialization has
 * completed and want their device NUMA topology from the ACPI tables without
 * having to handle offline nodes.  A node may be offline when its SRAT memory
 * entry does not exist, or when NUMA is disabled, ex. "numa=off" on x86.
 */
static inline int pxm_to_online_node(int pxm)
{
        return numa_map_to_online_node(pxm_to_node(pxm));
}
#else
/* !CONFIG_ACPI_NUMA stubs: every lookup resolves to node 0. */
static inline int pxm_to_online_node(int pxm)
{
        return 0;
}
static inline int acpi_map_pxm_to_node(int pxm)
{
        return 0;
}
static inline int acpi_get_node(acpi_handle handle)
{
        return 0;
}
#endif
extern int pnpacpi_disabled;

#define PXM_INVAL        (-1)

/*
 * Converters from a struct acpi_resource to a struct resource or
 * struct resource_win; each returns a bool.
 */
bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res);
bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res);
bool acpi_dev_resource_address_space(struct acpi_resource *ares,
                                     struct resource_win *win);
bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares,
                                         struct resource_win *win);
unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable, u8 wake_capable);
unsigned int acpi_dev_get_irq_type(int triggering, int polarity);
bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index,
                                 struct resource *res);

/* Resource list helpers; @preproc is an optional-looking per-resource callback (confirm). */
void acpi_dev_free_resource_list(struct list_head *list);
int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list,
                           int (*preproc)(struct acpi_resource *, void *),
                           void *preproc_data);
int acpi_dev_get_dma_resources(struct acpi_device *adev,
                               struct list_head *list);
int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list);
int acpi_dev_filter_resource_type(struct acpi_resource *ares,
                                  unsigned long types);

/*
 * Callback-shaped wrapper around acpi_dev_filter_resource_type(): @arg carries
 * the "types" argument as a pointer-sized value, which is converted back to
 * unsigned long before the call.
 */
static inline int acpi_dev_filter_resource_type_cb(struct acpi_resource *ares,
                                                   void *arg)
{
        unsigned long types = (unsigned long)arg;

        return acpi_dev_filter_resource_type(ares, types);
}

struct acpi_device *acpi_resource_consumer(struct resource *res);

int acpi_check_resource_conflict(const struct resource *res);

int acpi_check_region(resource_size_t start, resource_size_t n,
                      const char *name);

int acpi_resources_are_enforced(void);

#ifdef CONFIG_HIBERNATION
extern int acpi_check_s4_hw_signature;
#endif

/* Init-time sleep configuration hooks, only declared with CONFIG_PM_SLEEP. */
#ifdef CONFIG_PM_SLEEP
void __init acpi_old_suspend_ordering(void);
void __init acpi_nvs_nosave(void);
void __init acpi_nvs_nosave_s3(void);
void __init acpi_sleep_no_blacklist(void);
#endif /* CONFIG_PM_SLEEP */

/* Wakeup handlers are identified by the (@wakeup, @context) pair on unregister. */
int acpi_register_wakeup_handler(
        int wake_irq, bool (*wakeup)(void *context), void *context);
void acpi_unregister_wakeup_handler(
        bool (*wakeup)(void *context), void *context);

/* Argument/result bundle passed to acpi_run_osc(). */
struct acpi_osc_context {
        char *uuid_str;                        /* UUID string */
        int rev;
        struct acpi_buffer cap;                /* list of DWORD capabilities */
        struct acpi_buffer ret;                /* free by caller if success */
};

acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);

/* Number of _OSC capability DWORDS depends on bridge type */
#define OSC_PCI_CAPABILITY_DWORDS                3
#define OSC_CXL_CAPABILITY_DWORDS                5

/*
 * Indexes into _OSC Capabilities Buffer (DWORDs 2 to 5 are device-specific).
 * The macro values are zero-based array indexes; the trailing comments give
 * the corresponding one-based DWORD number.
 */
#define OSC_QUERY_DWORD                                0        /* DWORD 1 */
#define OSC_SUPPORT_DWORD                        1        /* DWORD 2 */
#define OSC_CONTROL_DWORD                        2        /* DWORD 3 */
#define OSC_EXT_SUPPORT_DWORD                        3        /* DWORD 4 */
#define OSC_EXT_CONTROL_DWORD                        4        /* DWORD 5 */

/* _OSC Capabilities DWORD 1: Query/Control and Error Returns (generic) */
#define OSC_QUERY_ENABLE                        0x00000001  /* input */
#define OSC_REQUEST_ERROR                        0x00000002  /* return */
#define OSC_INVALID_UUID_ERROR                        0x00000004  /* return */
#define OSC_INVALID_REVISION_ERROR                0x00000008  /* return */
#define OSC_CAPABILITIES_MASK_ERROR                0x00000010  /* return */

/*
 * Platform-Wide Capabilities _OSC: Capabilities DWORD 2: Support Field
 *
 * Bits 0x00008000, 0x00010000 and 0x00100000 have no definition in this list.
 */
#define OSC_SB_PAD_SUPPORT                        0x00000001
#define OSC_SB_PPC_OST_SUPPORT                        0x00000002
#define OSC_SB_PR3_SUPPORT                        0x00000004
#define OSC_SB_HOTPLUG_OST_SUPPORT                0x00000008
#define OSC_SB_APEI_SUPPORT                        0x00000010
#define OSC_SB_CPC_SUPPORT                        0x00000020
#define OSC_SB_CPCV2_SUPPORT                        0x00000040
#define OSC_SB_PCLPI_SUPPORT                        0x00000080
#define OSC_SB_OSLPI_SUPPORT                        0x00000100
#define OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT        0x00000200
#define OSC_SB_OVER_16_PSTATES_SUPPORT                0x00000400
#define OSC_SB_GED_SUPPORT                        0x00000800
#define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT                0x00001000
#define OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT        0x00002000
#define OSC_SB_CPC_FLEXIBLE_ADR_SPACE                0x00004000
#define OSC_SB_GENERIC_INITIATOR_SUPPORT        0x00020000
#define OSC_SB_NATIVE_USB4_SUPPORT                0x00040000
#define OSC_SB_BATTERY_CHARGE_LIMITING_SUPPORT        0x00080000
#define OSC_SB_PRM_SUPPORT                        0x00200000
#define OSC_SB_FFH_OPR_SUPPORT                        0x00400000

/* Global flags defined elsewhere. */
extern bool osc_sb_apei_support_acked;
extern bool osc_pc_lpi_support_confirmed;
extern bool osc_sb_native_usb4_support_confirmed;
extern bool osc_sb_cppc2_support_acked;
extern bool osc_cpc_flexible_adr_space_confirmed;

/* USB4 Capabilities */
#define OSC_USB_USB3_TUNNELING                        0x00000001
#define OSC_USB_DP_TUNNELING                        0x00000002
#define OSC_USB_PCIE_TUNNELING                        0x00000004
#define OSC_USB_XDOMAIN                                0x00000008

extern u32 osc_sb_native_usb4_control;

/*
 * PCI Host Bridge _OSC: Capabilities DWORD 2: Support Field
 * (bits 0x00000020 and 0x00000040 have no definition in this list)
 */
#define OSC_PCI_EXT_CONFIG_SUPPORT                0x00000001
#define OSC_PCI_ASPM_SUPPORT                        0x00000002
#define OSC_PCI_CLOCK_PM_SUPPORT                0x00000004
#define OSC_PCI_SEGMENT_GROUPS_SUPPORT                0x00000008
#define OSC_PCI_MSI_SUPPORT                        0x00000010
#define OSC_PCI_EDR_SUPPORT                        0x00000080
#define OSC_PCI_HPX_TYPE_3_SUPPORT                0x00000100

/*
 * PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field
 * (bit 0x00000040 has no definition in this list)
 */
#define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL        0x00000001
#define OSC_PCI_SHPC_NATIVE_HP_CONTROL                0x00000002
#define OSC_PCI_EXPRESS_PME_CONTROL                0x00000004
#define OSC_PCI_EXPRESS_AER_CONTROL                0x00000008
#define OSC_PCI_EXPRESS_CAPABILITY_CONTROL        0x00000010
#define OSC_PCI_EXPRESS_LTR_CONTROL                0x00000020
#define OSC_PCI_EXPRESS_DPC_CONTROL                0x00000080

/* CXL _OSC: Capabilities DWORD 4: Support Field */
#define OSC_CXL_1_1_PORT_REG_ACCESS_SUPPORT        0x00000001
#define OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT        0x00000002
#define OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT        0x00000004
#define OSC_CXL_NATIVE_HP_SUPPORT                0x00000008

/* CXL _OSC: Capabilities DWORD 5: Control Field */
#define OSC_CXL_ERROR_REPORTING_CONTROL                0x00000001

static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context)
{
        u32 *ret = context->ret.pointer;

        return ret[OSC_CONTROL_DWORD];
}

static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context)
{
        u32 *ret = context->ret.pointer;

        return ret[OSC_EXT_CONTROL_DWORD];
}

/*
 * Access attribute codes.  NOTE(review): "GSB" presumably stands for
 * GenericSerialBus -- confirm against the ACPI specification.
 */
#define ACPI_GSB_ACCESS_ATTRIB_QUICK                0x00000002
#define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV         0x00000004
#define ACPI_GSB_ACCESS_ATTRIB_BYTE                0x00000006
#define ACPI_GSB_ACCESS_ATTRIB_WORD                0x00000008
#define ACPI_GSB_ACCESS_ATTRIB_BLOCK                0x0000000A
#define ACPI_GSB_ACCESS_ATTRIB_MULTIBYTE        0x0000000B
#define ACPI_GSB_ACCESS_ATTRIB_WORD_CALL        0x0000000C
#define ACPI_GSB_ACCESS_ATTRIB_BLOCK_CALL        0x0000000D
#define ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES        0x0000000E
#define ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS        0x0000000F

/* Enable _OST when all relevant hotplug operations are enabled */
#if defined(CONFIG_ACPI_HOTPLUG_CPU) &&                        \
        defined(CONFIG_ACPI_HOTPLUG_MEMORY) &&                \
        defined(CONFIG_ACPI_CONTAINER)
#define ACPI_HOTPLUG_OST
#endif

/* _OST Source Event Code (OSPM Action) */
#define ACPI_OST_EC_OSPM_SHUTDOWN                0x100
#define ACPI_OST_EC_OSPM_EJECT                        0x103
#define ACPI_OST_EC_OSPM_INSERTION                0x200

/* _OST General Processing Status Code */
#define ACPI_OST_SC_SUCCESS                        0x0
#define ACPI_OST_SC_NON_SPECIFIC_FAILURE        0x1
#define ACPI_OST_SC_UNRECOGNIZED_NOTIFY                0x2

/*
 * The event-specific status codes below all start at 0x80, so the same
 * numeric value has a different meaning depending on the source event.
 */

/* _OST OS Shutdown Processing (0x100) Status Code */
#define ACPI_OST_SC_OS_SHUTDOWN_DENIED                0x80
#define ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS        0x81
#define ACPI_OST_SC_OS_SHUTDOWN_COMPLETED        0x82
#define ACPI_OST_SC_OS_SHUTDOWN_NOT_SUPPORTED        0x83

/* _OST Ejection Request (0x3, 0x103) Status Code */
#define ACPI_OST_SC_EJECT_NOT_SUPPORTED                0x80
#define ACPI_OST_SC_DEVICE_IN_USE                0x81
#define ACPI_OST_SC_DEVICE_BUSY                        0x82
#define ACPI_OST_SC_EJECT_DEPENDENCY_BUSY        0x83
#define ACPI_OST_SC_EJECT_IN_PROGRESS                0x84

/* _OST Insertion Request (0x200) Status Code */
#define ACPI_OST_SC_INSERT_IN_PROGRESS                0x80
#define ACPI_OST_SC_DRIVER_LOAD_FAILURE                0x81
#define ACPI_OST_SC_INSERT_NOT_SUPPORTED        0x82

/* Comparison selector stored in struct acpi_platform_list::pred. */
enum acpi_predicate {
        all_versions,
        less_than_or_equal,
        equal,
        greater_than_or_equal,
};

/* Table must be terminated by a NULL entry */
struct acpi_platform_list {
        char        oem_id[ACPI_OEM_ID_SIZE+1];
        char        oem_table_id[ACPI_OEM_TABLE_ID_SIZE+1];
        u32        oem_revision;
        char        *table;
        enum acpi_predicate pred;
        char        *reason;
        u32        data;
};
int acpi_match_platform_list(const struct acpi_platform_list *plat);

extern void acpi_early_init(void);
extern void acpi_subsystem_init(void);

extern int acpi_nvs_register(__u64 start, __u64 size);

/* Calls @func with two __u64 values and @data for each region. */
extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
                                    void *data);

/* Device ID matching; both matchers return a pointer into @ids or a NULL-able result. */
const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids,
                                                    const struct acpi_device *adev);

const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
                                               const struct device *dev);

const void *acpi_device_get_match_data(const struct device *dev);
extern bool acpi_driver_match_device(struct device *dev,
                                     const struct device_driver *drv);
int acpi_device_uevent_modalias(const struct device *, struct kobj_uevent_env *);
int acpi_device_modalias(struct device *, char *, int);

struct platform_device *acpi_create_platform_device(struct acpi_device *,
                                                    const struct property_entry *);
/* With CONFIG_ACPI the pointer is kept as is; the !CONFIG_ACPI variant yields NULL. */
#define ACPI_PTR(_ptr)        (_ptr)

/* Mark @adev as enumerated by setting its "visited" flag. */
static inline void acpi_device_set_enumerated(struct acpi_device *adev)
{
        adev->flags.visited = 1;
}

/* Mark @adev as not enumerated by clearing its "visited" flag. */
static inline void acpi_device_clear_enumerated(struct acpi_device *adev)
{
        adev->flags.visited = 0;
}

/* Event codes for the reconfiguration notifier chain registered below. */
enum acpi_reconfig_event  {
        ACPI_RECONFIG_DEVICE_ADD = 0,
        ACPI_RECONFIG_DEVICE_REMOVE,
};

int acpi_reconfig_notifier_register(struct notifier_block *nb);
int acpi_reconfig_notifier_unregister(struct notifier_block *nb);

/* GTDT helpers, only declared with CONFIG_ACPI_GTDT (no stubs here). */
#ifdef CONFIG_ACPI_GTDT
int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count);
int acpi_gtdt_map_ppi(int type);
bool acpi_gtdt_c3stop(int type);
int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
#endif

/*
 * Default root pointer hooks, used unless the corresponding
 * ACPI_HAVE_ARCH_*_ROOT_POINTER macro is defined: setting is a no-op and
 * getting returns 0.
 */
#ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER
static __always_inline void acpi_arch_set_root_pointer(u64 addr)
{
}
#endif

#ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER
static __always_inline u64 acpi_arch_get_root_pointer(void)
{
        return 0;
}
#endif

/* The address is returned through @addr (u64 and u32 variants). */
int acpi_get_local_u64_address(acpi_handle handle, u64 *addr);
int acpi_get_local_address(acpi_handle handle, u32 *addr);
const char *acpi_get_subsystem_id(acpi_handle handle);

/* Only declared with CONFIG_ACPI_MRRM; the !CONFIG_ACPI branch has a stub returning 1. */
#ifdef CONFIG_ACPI_MRRM
int acpi_mrrm_max_mem_region(void);
#endif

#else        /* !CONFIG_ACPI */

/* Without CONFIG_ACPI, ACPI is reported as disabled at compile time. */
#define acpi_disabled 1

/* Companion accessors collapse to NULL / no-op when CONFIG_ACPI is off. */
#define ACPI_COMPANION(dev)                (NULL)
#define ACPI_COMPANION_SET(dev, adev)        do { } while (0)
#define ACPI_HANDLE(dev)                (NULL)
#define ACPI_HANDLE_FWNODE(fwnode)        (NULL)

/*
 * !CONFIG_ACPI variants: always evaluate to false.  @adev is still referenced
 * to get rid of the -Wunused-variable for adev, and it is parenthesised so
 * that an expression argument (e.g. a conditional expression) cannot absorb
 * the "&& false" into one of its own operands.
 */
#define acpi_dev_uid_match(adev, uid2)                        ((adev) && false)
#define acpi_dev_hid_uid_match(adev, hid2, uid2)        ((adev) && false)

struct fwnode_handle;

/*
 * !CONFIG_ACPI stubs: no ACPI devices exist, so lookups find nothing
 * (false / NULL / -ENODEV) and releasing a device is a no-op.
 */
static inline bool acpi_dev_found(const char *hid)
{
        return false;
}

static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
{
        return false;
}

struct acpi_device;

static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer)
{
        return -ENODEV;
}

static inline struct acpi_device *
acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv)
{
        return NULL;
}

static inline bool acpi_reduced_hardware(void)
{
        return false;
}

static inline void acpi_dev_put(struct acpi_device *adev) {}

/*
 * !CONFIG_ACPI stubs for fwnode classification and conversion: no fwnode is
 * ever an ACPI node, and every conversion yields NULL.
 */
static inline bool is_acpi_node(const struct fwnode_handle *fwnode)
{
        return false;
}

static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode)
{
        return false;
}

static inline struct acpi_device *to_acpi_device_node(const struct fwnode_handle *fwnode)
{
        return NULL;
}

static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode)
{
        return false;
}

static inline struct acpi_data_node *to_acpi_data_node(const struct fwnode_handle *fwnode)
{
        return NULL;
}

static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode,
                                        const char *name)
{
        return false;
}

static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev)
{
        return NULL;
}

/*
 * !CONFIG_ACPI counterparts of the companion helpers: there is never a
 * handle, companion, name or physical node.
 */
static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
{
        return NULL;
}

static inline bool has_acpi_companion(struct device *dev)
{
        return false;
}

static inline void acpi_preset_companion(struct device *dev,
                                         struct acpi_device *parent, u64 addr)
{
}

static inline const char *acpi_dev_name(struct acpi_device *adev)
{
        return NULL;
}

static inline struct device *acpi_get_first_physical_node(struct acpi_device *adev)
{
        return NULL;
}

/* !CONFIG_ACPI stubs: initialisation steps do nothing and report 0. */
static inline void acpi_early_init(void) { }
static inline void acpi_subsystem_init(void) { }

static inline int early_acpi_boot_init(void)
{
        return 0;
}
static inline int acpi_boot_init(void)
{
        return 0;
}

static inline void acpi_boot_table_prepare(void)
{
}

static inline void acpi_boot_table_init(void)
{
}

static inline int acpi_mps_check(void)
{
        return 0;
}

/*
 * !CONFIG_ACPI stub: without ACPI there is nothing to conflict with, so 0 is
 * always returned.  @res is const-qualified to match the CONFIG_ACPI
 * prototype, so callers holding a const resource compile without a
 * discarded-qualifier warning in either configuration.
 */
static inline int acpi_check_resource_conflict(const struct resource *res)
{
        return 0;
}

/* !CONFIG_ACPI stub: always returns 0. */
static inline int acpi_check_region(resource_size_t start, resource_size_t n,
                                    const char *name)
{
        return 0;
}

struct acpi_table_header;
/* !CONFIG_ACPI stub: no tables exist, so @handler is never called (-ENODEV). */
static inline int acpi_table_parse(char *id,
                                int (*handler)(struct acpi_table_header *))
{
        return -ENODEV;
}

/* !CONFIG_ACPI stub: nothing is registered; returns 0. */
static inline int acpi_nvs_register(__u64 start, __u64 size)
{
        return 0;
}

/* !CONFIG_ACPI stub: @func is never called; returns 0. */
static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
                                           void *data)
{
        return 0;
}

struct acpi_device_id;

/* !CONFIG_ACPI stubs: nothing ever matches (NULL / false). */
static inline const struct acpi_device_id *acpi_match_acpi_device(
        const struct acpi_device_id *ids, const struct acpi_device *adev)
{
        return NULL;
}

static inline const struct acpi_device_id *acpi_match_device(
        const struct acpi_device_id *ids, const struct device *dev)
{
        return NULL;
}

static inline const void *acpi_device_get_match_data(const struct device *dev)
{
        return NULL;
}

static inline bool acpi_driver_match_device(struct device *dev,
                                            const struct device_driver *drv)
{
        return false;
}

/*
 * !CONFIG_ACPI stubs for the _DSM helpers: no function is ever reported as
 * available and evaluation returns no object.
 */
static inline bool acpi_check_dsm(acpi_handle handle, const guid_t *guid,
                                  u64 rev, u64 funcs)
{
        return false;
}

static inline union acpi_object *acpi_evaluate_dsm(acpi_handle handle,
                                                   const guid_t *guid,
                                                   u64 rev, u64 func,
                                                   union acpi_object *argv4)
{
        return NULL;
}

static inline union acpi_object *acpi_evaluate_dsm_typed(acpi_handle handle,
                                                         const guid_t *guid,
                                                         u64 rev, u64 func,
                                                         union acpi_object *argv4,
                                                         acpi_object_type type)
{
        return NULL;
}

/* !CONFIG_ACPI stub: reports -ENODEV without touching @env. */
static inline int acpi_device_uevent_modalias(const struct device *dev,
                                struct kobj_uevent_env *env)
{
        return -ENODEV;
}

/* !CONFIG_ACPI stub: reports -ENODEV without writing to @buf. */
static inline int acpi_device_modalias(struct device *dev,
                                char *buf, int size)
{
        return -ENODEV;
}

/* !CONFIG_ACPI stub: no platform device is created; returns NULL. */
static inline struct platform_device *
acpi_create_platform_device(struct acpi_device *adev,
                            const struct property_entry *properties)
{
        return NULL;
}

static inline bool acpi_dma_supported(const struct acpi_device *adev)
{
        return false;
}

static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev)
{
        return DEV_DMA_NOT_SUPPORTED;
}

static inline int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map)
{
        return -ENODEV;
}

static inline int acpi_dma_configure(struct device *dev,
                                     enum dev_dma_attr attr)
{
        return 0;
}

/* !CONFIG_ACPI stub: does nothing and reports success (0); @input_id is ignored. */
static inline int
acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
                      const u32 *input_id)
{
        return 0;
}

/* !CONFIG_ACPI: ACPI_PTR() drops its argument and expands to NULL. */
#define ACPI_PTR(_ptr)        (NULL)

/* !CONFIG_ACPI stub: intentionally a no-op. */
static inline void acpi_device_set_enumerated(struct acpi_device *adev) { }

/* !CONFIG_ACPI stub: intentionally a no-op. */
static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { }

/* !CONFIG_ACPI stub: registration is rejected with -EINVAL. */
static inline int
acpi_reconfig_notifier_register(struct notifier_block *nb)
{
        return -EINVAL;
}

/* !CONFIG_ACPI stub: unregistration is rejected with -EINVAL. */
static inline int
acpi_reconfig_notifier_unregister(struct notifier_block *nb)
{
        return -EINVAL;
}

/* !CONFIG_ACPI stub: no consumer can be found, so the result is always NULL. */
static inline struct acpi_device *
acpi_resource_consumer(struct resource *res)
{
        return NULL;
}

/* !CONFIG_ACPI stub: always fails with -ENODEV; *@addr is not written. */
static inline int
acpi_get_local_address(acpi_handle handle, u32 *addr)
{
        return -ENODEV;
}

/*
 * !CONFIG_ACPI stub: returns ERR_PTR(-ENODEV), i.e. an encoded error rather
 * than NULL, so callers have to test the result as an error pointer.
 */
static inline const char *
acpi_get_subsystem_id(acpi_handle handle)
{
        return ERR_PTR(-ENODEV);
}

/* !CONFIG_ACPI stub: the handler is never installed; always fails with -ENXIO. */
static inline int
acpi_register_wakeup_handler(int wake_irq, bool (*wakeup)(void *context),
                             void *context)
{
        return -ENXIO;
}

/* !CONFIG_ACPI stub: intentionally a no-op. */
static inline void
acpi_unregister_wakeup_handler(bool (*wakeup)(void *context), void *context)
{
}

/* Only pointers to the context are passed around here, so a forward declaration suffices. */
struct acpi_osc_context;

/* !CONFIG_ACPI stub: always returns 0. */
static inline u32
acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context)
{
        return 0;
}

/* !CONFIG_ACPI stub: always returns 0. */
static inline u32
acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context)
{
        return 0;
}

/* !CONFIG_ACPI stub: every @sleep_state is reported as unsupported. */
static inline bool
acpi_sleep_state_supported(u8 sleep_state)
{
        return false;
}

/* !CONFIG_ACPI stub: there is no handle for any @cpu; always NULL. */
static inline acpi_handle
acpi_get_processor_handle(int cpu)
{
        return NULL;
}

/* !CONFIG_ACPI stub: always returns 1. */
static inline int
acpi_mrrm_max_mem_region(void)
{
        return 1;
}

#endif        /* !CONFIG_ACPI */

/*
 * hmat_get_extended_linear_cache_size() has a real implementation only when
 * CONFIG_ACPI_HMAT is set; otherwise the stub below fails with -EOPNOTSUPP
 * and leaves *@size unwritten.
 */
#ifdef CONFIG_ACPI_HMAT
int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid,
                                        resource_size_t *size);
#else
static inline int
hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid,
                                    resource_size_t *size)
{
        return -EOPNOTSUPP;
}
#endif

/*
 * Declared unconditionally; the definition is outside this header.
 * NOTE(review): presumably an arch hook run after ACPI subsystem init --
 * confirm against the callers.
 */
extern void arch_post_acpi_subsys_init(void);

#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
int acpi_ioapic_add(acpi_handle root);
#else
/* Without CONFIG_ACPI_HOTPLUG_IOAPIC this does nothing and reports success (0). */
static inline int acpi_ioapic_add(acpi_handle root)
{
        return 0;
}
#endif

#ifdef CONFIG_ACPI
/*
 * Sleep-preparation interface (declarations only; defined outside this
 * header).  Each "set" function takes a single argument: a callback that
 * receives a sleep state and two u32 values and returns int.
 * NOTE(review): presumably the callback installed by a "set" call is what
 * the matching acpi_os_prepare_*sleep() invokes -- confirm in the
 * implementation.
 */
void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
                               u32 pm1a_ctrl,  u32 pm1b_ctrl));

acpi_status acpi_os_prepare_sleep(u8 sleep_state,
                                  u32 pm1a_control, u32 pm1b_control);

void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state,
                                        u32 val_a,  u32 val_b));

acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state,
                                           u32 val_a, u32 val_b);
/*
 * Callback set handed to acpi_register_lps0_dev() and
 * acpi_unregister_lps0_dev().
 *
 * @list_node: list linkage (presumably owned by the registrar; the list
 *             head is not visible in this header)
 * @prepare, @check, @restore: hooks taking no arguments and returning
 *             nothing.  NOTE(review): when each hook runs is not visible
 *             here -- confirm against the s2idle implementation.
 */
struct acpi_s2idle_dev_ops {
        struct list_head list_node;
        void (*prepare)(void);
        void (*check)(void);
        void (*restore)(void);
};
#if defined(CONFIG_SUSPEND) && defined(CONFIG_X86)
int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg);
void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg);
int acpi_get_lps0_constraint(struct acpi_device *adev);
#else /* CONFIG_SUSPEND && CONFIG_X86 */
/*
 * Stubs for builds lacking CONFIG_SUSPEND or CONFIG_X86.  Each one takes
 * exactly the argument types of the real prototype above, so a caller
 * compiles the same way in both configurations.
 */

/* No constraint data is available: always ACPI_STATE_UNKNOWN. */
static inline int acpi_get_lps0_constraint(struct acpi_device *adev)
{
        return ACPI_STATE_UNKNOWN;
}

/* Registration is not possible: always fails with -ENODEV. */
static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg)
{
        return -ENODEV;
}

/* Nothing was registered, so there is nothing to undo. */
static inline void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg)
{
}
#endif /* CONFIG_SUSPEND && CONFIG_X86 */
/*
 * Declaration only (CONFIG_ACPI builds); takes a physical address and a size
 * in bytes.  NOTE(review): presumably an arch hook for reserving that
 * range -- confirm against the arch implementation.
 */
void arch_reserve_mem_area(acpi_physical_address addr, size_t size);
#else
/*
 * !CONFIG_ACPI: no-op stand-in for acpi_os_set_prepare_sleep().  The real
 * function takes exactly one argument (the callback), so the macro takes
 * one as well; the argument is dropped without being evaluated.
 */
#define acpi_os_set_prepare_sleep(func) do { } while (0)
#endif

/*
 * ACPI device power-management helpers.  Real implementations exist only
 * when both CONFIG_ACPI and CONFIG_PM are set.  Note that the fallback
 * branch provides no stand-ins for acpi_dev_suspend()/acpi_dev_resume().
 */
#if defined(CONFIG_ACPI) && defined(CONFIG_PM)
int acpi_dev_suspend(struct device *dev, bool wakeup);
int acpi_dev_resume(struct device *dev);
int acpi_subsys_runtime_suspend(struct device *dev);
int acpi_subsys_runtime_resume(struct device *dev);
int acpi_dev_pm_attach(struct device *dev, bool power_on);
bool acpi_storage_d3(struct device *dev);
bool acpi_dev_state_d0(struct device *dev);
#else
/* Fallback: nothing to suspend, report success. */
static inline int acpi_subsys_runtime_suspend(struct device *dev)
{
        return 0;
}

/* Fallback: nothing to resume, report success. */
static inline int acpi_subsys_runtime_resume(struct device *dev)
{
        return 0;
}

/* Fallback: returns 0 without attaching anything. */
static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) { return 0; }

/* Fallback: always false. */
static inline bool acpi_storage_d3(struct device *dev) { return false; }

/* Fallback: always true. */
static inline bool acpi_dev_state_d0(struct device *dev) { return true; }
#endif

/*
 * System-sleep callbacks of the ACPI subsystem.  Without both CONFIG_ACPI
 * and CONFIG_PM_SLEEP every callback collapses to a stub that does nothing
 * and (where it returns a value) reports 0.
 */
#if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP)
int acpi_subsys_prepare(struct device *dev);
void acpi_subsys_complete(struct device *dev);
int acpi_subsys_suspend_late(struct device *dev);
int acpi_subsys_suspend_noirq(struct device *dev);
int acpi_subsys_suspend(struct device *dev);
int acpi_subsys_freeze(struct device *dev);
int acpi_subsys_poweroff(struct device *dev);
int acpi_subsys_restore_early(struct device *dev);
#else
static inline int acpi_subsys_prepare(struct device *dev)
{
        return 0;
}

static inline void acpi_subsys_complete(struct device *dev)
{
}

static inline int acpi_subsys_suspend_late(struct device *dev)
{
        return 0;
}

static inline int acpi_subsys_suspend_noirq(struct device *dev)
{
        return 0;
}

static inline int acpi_subsys_suspend(struct device *dev)
{
        return 0;
}

static inline int acpi_subsys_freeze(struct device *dev)
{
        return 0;
}

static inline int acpi_subsys_poweroff(struct device *dev)
{
        return 0;
}

static inline int acpi_subsys_restore_early(struct device *dev)
{
        return 0;
}
#endif

/* EC GPE wake helpers; both degrade to no-ops unless CONFIG_ACPI_EC and CONFIG_PM_SLEEP are set. */
#if defined(CONFIG_ACPI_EC) && defined(CONFIG_PM_SLEEP)
void acpi_ec_mark_gpe_for_wake(void);
void acpi_ec_set_gpe_wake_mask(u8 action);
#else
static inline void acpi_ec_mark_gpe_for_wake(void)
{
}

static inline void acpi_ec_set_gpe_wake_mask(u8 action)
{
}
#endif

/*
 * Handle-aware logging helpers.
 *
 * With CONFIG_ACPI these are real functions defined outside this header.
 * Without it, acpi_handle_printk() and acpi_evaluation_failure_warn() are
 * empty inline stubs (acpi_handle_path() has no stub).  The stub keeps the
 * __printf(3, 4) annotation, so format/argument checking of callers is
 * presumably unchanged by the configuration.
 */
#ifdef CONFIG_ACPI
char *acpi_handle_path(acpi_handle handle);
__printf(3, 4)
void acpi_handle_printk(const char *level, acpi_handle handle,
                        const char *fmt, ...);
void acpi_evaluation_failure_warn(acpi_handle handle, const char *name,
                                  acpi_status status);
#else        /* !CONFIG_ACPI */
static inline __printf(3, 4) void
acpi_handle_printk(const char *level, void *handle, const char *fmt, ...) {}
static inline void acpi_evaluation_failure_warn(acpi_handle handle,
                                                const char *name,
                                                acpi_status status) {}
#endif        /* !CONFIG_ACPI */

/*
 * Dynamic-debug backend for acpi_handle_debug(); declared only when both
 * CONFIG_ACPI and CONFIG_DYNAMIC_DEBUG are enabled.  The format string is
 * argument 3 and the variadic arguments start at 4, as annotated.
 */
#if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG)
__printf(3, 4)
void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const char *fmt, ...);
#endif

/*
 * acpi_handle_<level>: Print message with ACPI prefix and object path
 *
 * Each macro forwards to acpi_handle_printk() with the matching KERN_<LEVEL>
 * prefix, the handle, the format string and any further arguments.
 *
 * These interfaces acquire the global namespace mutex to obtain an object
 * path.  In interrupt context, they show the object path as <n/a>.
 */
#define acpi_handle_emerg(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_EMERG, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_alert(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_ALERT, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_crit(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_CRIT, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_err(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_ERR, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_warn(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_WARNING, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_notice(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_NOTICE, handle, fmt, ##__VA_ARGS__)
#define acpi_handle_info(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_INFO, handle, fmt, ##__VA_ARGS__)

/*
 * acpi_handle_debug() comes in three flavours:
 *  - DEBUG defined: a plain acpi_handle_printk() at KERN_DEBUG;
 *  - else CONFIG_DYNAMIC_DEBUG: routed through _dynamic_func_call() to
 *    __acpi_handle_debug(), with the format wrapped in pr_fmt();
 *  - otherwise: the acpi_handle_printk() call sits under "if (0)", so it is
 *    still compiled but never executed, and the whole expression
 *    evaluates to 0.
 */
#if defined(DEBUG)
#define acpi_handle_debug(handle, fmt, ...)                                \
        acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__)
#else
#if defined(CONFIG_DYNAMIC_DEBUG)
#define acpi_handle_debug(handle, fmt, ...)                                \
        _dynamic_func_call(fmt, __acpi_handle_debug,                        \
                           handle, pr_fmt(fmt), ##__VA_ARGS__)
#else
#define acpi_handle_debug(handle, fmt, ...)                                \
({                                                                        \
        if (0)                                                                \
                acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__); \
        0;                                                                \
})
#endif
#endif

/*
 * ACPI GPIO helpers.  Real implementations require both CONFIG_ACPI and
 * CONFIG_GPIOLIB; otherwise the resource checks answer false and the IRQ
 * lookup fails with -ENXIO.
 */
#if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB)
bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
                                struct acpi_resource_gpio **agpio);
bool acpi_gpio_get_io_resource(struct acpi_resource *ares,
                               struct acpi_resource_gpio **agpio);
int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index,
                                  bool *wake_capable);
#else
/* Stub: always false; *@agpio is not written. */
static inline bool
acpi_gpio_get_irq_resource(struct acpi_resource *ares,
                           struct acpi_resource_gpio **agpio)
{
        return false;
}

/* Stub: always false; *@agpio is not written. */
static inline bool
acpi_gpio_get_io_resource(struct acpi_resource *ares,
                          struct acpi_resource_gpio **agpio)
{
        return false;
}

/* Stub: always -ENXIO; *@wake_capable is not written. */
static inline int
acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id,
                              int index, bool *wake_capable)
{
        return -ENXIO;
}
#endif

/* Forwards to acpi_dev_gpio_irq_wake_get_by() with a NULL con_id. */
static inline int
acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index,
                           bool *wake_capable)
{
        const char *con_id = NULL;

        return acpi_dev_gpio_irq_wake_get_by(adev, con_id, index, wake_capable);
}

/* Forwards to acpi_dev_gpio_irq_wake_get_by() with a NULL wake_capable pointer. */
static inline int
acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *con_id, int index)
{
        bool *wake_capable = NULL;

        return acpi_dev_gpio_irq_wake_get_by(adev, con_id, index, wake_capable);
}

/* Forwards to acpi_dev_gpio_irq_wake_get_by() with neither con_id nor wake_capable. */
static inline int
acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
{
        const char *con_id = NULL;
        bool *wake_capable = NULL;

        return acpi_dev_gpio_irq_wake_get_by(adev, con_id, index, wake_capable);
}

/* Device properties */

#ifdef CONFIG_ACPI
/*
 * Property lookup entry points (CONFIG_ACPI builds; defined outside this
 * header).  acpi_dev_get_property() takes a const device pointer and hands
 * its result back through @obj.  __acpi_node_get_property_reference()
 * takes an explicit @num_args; the inline wrapper
 * acpi_node_get_property_reference() supplies NR_FWNODE_REFERENCE_ARGS.
 */
int acpi_dev_get_property(const struct acpi_device *adev, const char *name,
                          acpi_object_type type, const union acpi_object **obj);
int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
                                const char *name, size_t index, size_t num_args,
                                struct fwnode_reference_args *args);

/*
 * Convenience wrapper around __acpi_node_get_property_reference() that
 * passes NR_FWNODE_REFERENCE_ARGS as the argument count and returns the
 * callee's result unchanged.
 */
static inline int
acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
                                 const char *name, size_t index,
                                 struct fwnode_reference_args *args)
{
        const size_t num_args = NR_FWNODE_REFERENCE_ARGS;

        return __acpi_node_get_property_reference(fwnode, name, index,
                                                  num_args, args);
}

/* True when @adev's data.properties list is non-empty.  @adev is dereferenced. */
static inline bool acpi_dev_has_props(const struct acpi_device *adev)
{
        if (list_empty(&adev->data.properties))
                return false;

        return true;
}

/*
 * Further property/subnode helpers (declarations only; defined outside this
 * header).  acpi_node_prop_get() hands its result back through @valptr;
 * acpi_get_next_subnode() takes the previously returned @child.
 * NOTE(review): iteration and ownership rules are not visible here --
 * confirm against the implementation.
 */
struct acpi_device_properties *
acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
                    union acpi_object *properties);

int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
                       void **valptr);

struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
                                            struct fwnode_handle *child);

struct acpi_probe_entry;
/* Signature of the optional subtable validity check stored in acpi_probe_entry::subtable_valid. */
typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
                                                 struct acpi_probe_entry *);

/* Size in bytes of acpi_probe_entry::id. */
#define ACPI_TABLE_ID_LEN        5

/**
 * struct acpi_probe_entry - boot-time probing entry
 * @id:                        ACPI table name
 * @type:                Optional subtable type to match
 *                        (if @id contains subtables)
 * @subtable_valid:        Optional callback to check the validity of
 *                        the subtable
 * @probe_table:        Callback to the driver being probed when table
 *                        match is successful
 * @probe_subtbl:        Callback to the driver being probed when table and
 *                        subtable match (and optional callback is successful)
 * @driver_data:        Sideband data provided back to the driver
 *
 * @probe_table and @probe_subtbl share storage (anonymous union), so an
 * entry carries exactly one of the two callbacks.
 */
struct acpi_probe_entry {
        __u8 id[ACPI_TABLE_ID_LEN];
        __u8 type;
        acpi_probe_entry_validate_subtbl subtable_valid;
        union {
                acpi_tbl_table_handler probe_table;
                acpi_tbl_entry_handler probe_subtbl;
        };
        kernel_ulong_t driver_data;
};

/*
 * Declaration only; takes the first probe entry and an entry count.
 * NOTE(review): presumably lets the architecture reorder irqchip probe
 * entries in place -- confirm against the implementation.
 */
void arch_sort_irqchip_probe(struct acpi_probe_entry *ap_head, int nr);

/*
 * Define a static const struct acpi_probe_entry named __acpi_probe_<name>,
 * marked __used and placed in the section "__<table>_acpi_probe_table".
 * @fn is stored as the whole-table callback (.probe_table).
 */
#define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable,        \
                                 valid, data, fn)                        \
        static const struct acpi_probe_entry __acpi_probe_##name        \
                __used __section("__" #table "_acpi_probe_table") = {        \
                        .id = table_id,                                        \
                        .type = subtable,                                \
                        .subtable_valid = valid,                        \
                        .probe_table = fn,                                \
                        .driver_data = data,                                \
                }

/*
 * Same as ACPI_DECLARE_PROBE_ENTRY(), except that @fn is stored as the
 * per-subtable callback (.probe_subtbl) instead of .probe_table.
 */
#define ACPI_DECLARE_SUBTABLE_PROBE_ENTRY(table, name, table_id,        \
                                          subtable, valid, data, fn)        \
        static const struct acpi_probe_entry __acpi_probe_##name        \
                __used __section("__" #table "_acpi_probe_table") = {        \
                        .id = table_id,                                        \
                        .type = subtable,                                \
                        .subtable_valid = valid,                        \
                        .probe_subtbl = fn,                                \
                        .driver_data = data,                                \
                }

/* Names of the symbols delimiting the probe table for @name. */
#define ACPI_PROBE_TABLE(name)                __##name##_acpi_probe_table
#define ACPI_PROBE_TABLE_END(name)        __##name##_acpi_probe_table_end

int __acpi_probe_device_table(struct acpi_probe_entry *start, int nr);

/*
 * Call __acpi_probe_device_table() on table @t: the entry count is the
 * pointer difference between the end and start symbols, both declared
 * extern inside the statement expression.  Evaluates to the callee's
 * return value.
 * NOTE(review): the two symbols are presumably emitted by the linker
 * script -- confirm.
 */
#define acpi_probe_device_table(t)                                        \
        ({                                                                 \
                extern struct acpi_probe_entry ACPI_PROBE_TABLE(t),        \
                                               ACPI_PROBE_TABLE_END(t);        \
                __acpi_probe_device_table(&ACPI_PROBE_TABLE(t),                \
                                          (&ACPI_PROBE_TABLE_END(t) -        \
                                           &ACPI_PROBE_TABLE(t)));        \
        })
#else
/*
 * !CONFIG_ACPI stub: every lookup fails with -ENXIO and *@obj is left
 * unwritten.  @adev is const-qualified exactly like the CONFIG_ACPI
 * prototype, so callers holding a const pointer build in both
 * configurations.
 */
static inline int acpi_dev_get_property(const struct acpi_device *adev,
                                        const char *name, acpi_object_type type,
                                        const union acpi_object **obj)
{
        return -ENXIO;
}

/* !CONFIG_ACPI stub: always fails with -ENXIO; @args is not written. */
static inline int __acpi_node_get_property_reference(
                const struct fwnode_handle *fwnode,
                const char *name, size_t index, size_t num_args,
                struct fwnode_reference_args *args)
{
        return -ENXIO;
}

/* !CONFIG_ACPI stub: always fails with -ENXIO; @args is not written. */
static inline int acpi_node_get_property_reference(
                const struct fwnode_handle *fwnode,
                const char *name, size_t index,
                struct fwnode_reference_args *args)
{
        return -ENXIO;
}

/* !CONFIG_ACPI stub: always fails with -ENXIO; *@valptr is not written. */
static inline int
acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
                   void **valptr)
{
        return -ENXIO;
}

/* !CONFIG_ACPI stub: there are no subnodes, so the result is always NULL. */
static inline struct fwnode_handle *acpi_get_next_subnode(
                const struct fwnode_handle *fwnode,
                struct fwnode_handle *child)
{
        return NULL;
}

/*
 * !CONFIG_ACPI stub: returns ERR_PTR(-ENXIO), i.e. an encoded error rather
 * than NULL, so the result must be tested as an error pointer.
 */
static inline struct fwnode_handle *acpi_graph_get_next_endpoint(
                const struct fwnode_handle *fwnode,
                struct fwnode_handle *prev)
{
        return ERR_PTR(-ENXIO);
}

/* !CONFIG_ACPI stub: always fails with -ENXIO; none of the out-pointers is written. */
static inline int acpi_graph_get_remote_endpoint(
                const struct fwnode_handle *fwnode,
                struct fwnode_handle **remote,
                struct fwnode_handle **port,
                struct fwnode_handle **endpoint)
{
        return -ENXIO;
}

/*
 * !CONFIG_ACPI variant: instead of a probe entry, emit an unused static
 * array of void pointers named __acpi_table_<name> that holds the macro
 * arguments (@table itself is not used).
 * NOTE(review): presumably this exists only to keep the arguments
 * referenced so they do not trigger unused warnings -- confirm.
 */
#define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \
        static const void * __acpi_table_##name[]                        \
                __attribute__((unused))                                        \
                 = { (void *) table_id,                                        \
                     (void *) subtable,                                        \
                     (void *) valid,                                        \
                     (void *) fn,                                        \
                     (void *) data }

/* !CONFIG_ACPI variant: ignores @t and evaluates to 0. */
#define acpi_probe_device_table(t)        ({ int __r = 0; __r;})
#endif

#ifdef CONFIG_ACPI_TABLE_UPGRADE
void acpi_table_upgrade(void);
#else
/* Without CONFIG_ACPI_TABLE_UPGRADE this is a no-op. */
static inline void acpi_table_upgrade(void)
{
}
#endif

#if defined(CONFIG_ACPI) && defined(CONFIG_ACPI_WATCHDOG)
extern bool acpi_has_watchdog(void);
#else
/* Without CONFIG_ACPI and CONFIG_ACPI_WATCHDOG the answer is always false. */
static inline bool acpi_has_watchdog(void)
{
        return false;
}
#endif

#ifdef CONFIG_ACPI_SPCR_TABLE
extern bool qdf2400_e44_present;
int acpi_parse_spcr(bool enable_earlycon, bool enable_console);
#else
/* Without CONFIG_ACPI_SPCR_TABLE parsing always fails with -ENODEV. */
static inline int
acpi_parse_spcr(bool enable_earlycon, bool enable_console)
{
        return -ENODEV;
}
#endif

#if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI)
int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res);
#else
/* CONFIG_ACPI_GENERIC_GSI is not enabled: always fails with -EINVAL; @res is not written. */
static inline int
acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res)
{
        return -EINVAL;
}
#endif

#ifdef CONFIG_ACPI_LPIT
int lpit_read_residency_count_address(u64 *address);
#else
/* Without CONFIG_ACPI_LPIT this always fails with -EINVAL; *@address is not written. */
static inline int
lpit_read_residency_count_address(u64 *address)
{
        return -EINVAL;
}
#endif

#ifdef CONFIG_ACPI_PROCESSOR_IDLE
#ifndef arch_get_idle_state_flags
/*
 * Fallback used unless arch_get_idle_state_flags has already been defined
 * as a macro: returns 0 for any @arch_flags.
 */
static inline unsigned int
arch_get_idle_state_flags(u32 arch_flags)
{
        return 0;
}
#endif
#endif /* CONFIG_ACPI_PROCESSOR_IDLE */

/*
 * PPTT-based CPU topology queries.  Without CONFIG_ACPI_PPTT each query is
 * a stub that fails with -EINVAL.
 */
#ifdef CONFIG_ACPI_PPTT
int acpi_pptt_cpu_is_thread(unsigned int cpu);
int find_acpi_cpu_topology(unsigned int cpu, int level);
int find_acpi_cpu_topology_cluster(unsigned int cpu);
int find_acpi_cpu_topology_package(unsigned int cpu);
int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
#else
static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { return -EINVAL; }

static inline int find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; }

static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) { return -EINVAL; }

static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; }

static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; }
#endif

/*
 * Declared unconditionally; the definition is outside this header.
 * NOTE(review): presumably the architecture's ACPI init hook -- confirm.
 */
void acpi_arch_init(void);

#ifdef CONFIG_ACPI_PCC
void acpi_init_pcc(void);
#else
/* Without CONFIG_ACPI_PCC this is a no-op. */
static inline void acpi_init_pcc(void)
{
}
#endif

#ifdef CONFIG_ACPI_FFH
void acpi_init_ffh(void);
extern int acpi_ffh_address_space_arch_setup(void *handler_ctxt,
                                             void **region_ctxt);
extern int acpi_ffh_address_space_arch_handler(acpi_integer *value,
                                               void *region_context);
#else
/*
 * Without CONFIG_ACPI_FFH initialisation is a no-op; the two
 * acpi_ffh_address_space_arch_*() hooks have no fallback here.
 */
static inline void acpi_init_ffh(void)
{
}
#endif

#ifdef CONFIG_ACPI
extern void acpi_device_notify(struct device *dev);
extern void acpi_device_notify_remove(struct device *dev);
#else
/* !CONFIG_ACPI: both notifications are no-ops. */
static inline void acpi_device_notify(struct device *dev)
{
}

static inline void acpi_device_notify_remove(struct device *dev)
{
}
#endif

/*
 * Give @dev the same ACPI companion as its parent: the value of
 * ACPI_COMPANION(dev->parent) is passed to ACPI_COMPANION_SET() for @dev.
 * Both are macros defined elsewhere in this header, so what actually
 * happens depends on the configuration.
 */
static inline void acpi_use_parent_companion(struct device *dev)
{
        ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent));
}

#ifdef CONFIG_ACPI_NUMA
bool acpi_node_backed_by_real_pxm(int nid);
#else
/* Without CONFIG_ACPI_NUMA the answer is false for every @nid. */
static inline bool
acpi_node_backed_by_real_pxm(int nid)
{
        return false;
}
#endif

#endif        /*_LINUX_ACPI_H*/





























































































































































































































































  318 



















































































































































  316 



































    1 















    4 




















































































































































































































































































































































































































































































































   39 



































































































































































































































































































































   13 

   12 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   39 

   39 









































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-instrumented.sh
// DO NOT MODIFY THIS FILE DIRECTLY

/*
 * This file provides atomic operations with explicit instrumentation (e.g.
 * KASAN, KCSAN), which should be used unless it is necessary to avoid
 * instrumentation. Where it is necessary to avoid instrumentation, the
 * raw_atomic*() operations should be used.
 */
#ifndef _LINUX_ATOMIC_INSTRUMENTED_H
#define _LINUX_ATOMIC_INSTRUMENTED_H

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/instrumented.h>

/**
 * atomic_read() - instrumented atomic load, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Reports a read of @v through instrument_atomic_read() and then loads the
 * value of @v atomically, with relaxed ordering, via raw_atomic_read().
 *
 * Not safe in noinstr code; call raw_atomic_read() there instead.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int
atomic_read(const atomic_t *v)
{
        int val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_read(v);

        return val;
}

/**
 * atomic_read_acquire() - instrumented atomic load, acquire ordering
 * @v: pointer to atomic_t
 *
 * Reports a read of @v through instrument_atomic_read() and then loads the
 * value of @v atomically, with acquire ordering, via
 * raw_atomic_read_acquire().
 *
 * Not safe in noinstr code; call raw_atomic_read_acquire() there instead.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int
atomic_read_acquire(const atomic_t *v)
{
        int val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_read_acquire(v);

        return val;
}

/**
 * atomic_set() - instrumented atomic store, relaxed ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Reports a write to @v through instrument_atomic_write() and then
 * atomically sets @v to @i, with relaxed ordering, via raw_atomic_set().
 *
 * Not safe in noinstr code; call raw_atomic_set() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_set(atomic_t *v, int i)
{
        const size_t size = sizeof(*v);

        instrument_atomic_write(v, size);
        raw_atomic_set(v, i);
}

/**
 * atomic_set_release() - instrumented atomic store, release ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Calls kcsan_release(), reports a write to @v through
 * instrument_atomic_write(), and then atomically sets @v to @i, with
 * release ordering, via raw_atomic_set_release().
 *
 * Not safe in noinstr code; call raw_atomic_set_release() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_set_release(atomic_t *v, int i)
{
        const size_t size = sizeof(*v);

        kcsan_release();
        instrument_atomic_write(v, size);
        raw_atomic_set_release(v, i);
}

/**
 * atomic_add() - instrumented atomic add, relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + @i), with relaxed ordering, via
 * raw_atomic_add().
 *
 * Not safe in noinstr code; call raw_atomic_add() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_add(int i, atomic_t *v)
{
        const size_t size = sizeof(*v);

        instrument_atomic_read_write(v, size);
        raw_atomic_add(i, v);
}

/**
 * atomic_add_return() - instrumented atomic add, full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v + @i), with full ordering, via raw_atomic_add_return().
 *
 * Not safe in noinstr code; call raw_atomic_add_return() there instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_add_return(int i, atomic_t *v)
{
        int ret;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_add_return(i, v);

        return ret;
}

/**
 * atomic_add_return_acquire() - instrumented atomic add, acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + @i), with acquire ordering, via
 * raw_atomic_add_return_acquire().
 *
 * Not safe in noinstr code; call raw_atomic_add_return_acquire() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_add_return_acquire(int i, atomic_t *v)
{
        int ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_add_return_acquire(i, v);

        return ret;
}

/**
 * atomic_add_return_release() - instrumented atomic add, release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v + @i), with release ordering, via raw_atomic_add_return_release().
 *
 * Not safe in noinstr code; call raw_atomic_add_return_release() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_add_return_release(int i, atomic_t *v)
{
        int ret;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_add_return_release(i, v);

        return ret;
}

/**
 * atomic_add_return_relaxed() - instrumented atomic add, relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + @i), with relaxed ordering, via
 * raw_atomic_add_return_relaxed().
 *
 * Not safe in noinstr code; call raw_atomic_add_return_relaxed() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_add_return_relaxed(int i, atomic_t *v)
{
        int ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_add_return_relaxed(i, v);

        return ret;
}

/**
 * atomic_fetch_add() - instrumented atomic fetch-and-add, full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v + @i), with full ordering, via raw_atomic_fetch_add().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_add() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_add(int i, atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add(i, v);

        return old;
}

/**
 * atomic_fetch_add_acquire() - instrumented atomic fetch-and-add, acquire
 *                              ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + @i), with acquire ordering, via
 * raw_atomic_fetch_add_acquire().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_add_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_add_acquire(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_add_release() - instrumented atomic fetch-and-add, release
 *                              ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v + @i), with release ordering, via raw_atomic_fetch_add_release().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_add_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_add_release(int i, atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add_release(i, v);

        return old;
}

/**
 * atomic_fetch_add_relaxed() - instrumented atomic fetch-and-add, relaxed
 *                              ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + @i), with relaxed ordering, via
 * raw_atomic_fetch_add_relaxed().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_add_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_add_relaxed(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add_relaxed(i, v);

        return old;
}

/**
 * atomic_sub() - instrumented atomic subtract, relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v - @i), with relaxed ordering, via
 * raw_atomic_sub().
 *
 * Not safe in noinstr code; call raw_atomic_sub() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_sub(int i, atomic_t *v)
{
        const size_t size = sizeof(*v);

        instrument_atomic_read_write(v, size);
        raw_atomic_sub(i, v);
}

/**
 * atomic_sub_return() - instrumented atomic subtract, full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v - @i), with full ordering, via raw_atomic_sub_return().
 *
 * Not safe in noinstr code; call raw_atomic_sub_return() there instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_sub_return(int i, atomic_t *v)
{
        int ret;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_sub_return(i, v);

        return ret;
}

/**
 * atomic_sub_return_acquire() - instrumented atomic subtract, acquire
 *                               ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v - @i), with acquire ordering, via
 * raw_atomic_sub_return_acquire().
 *
 * Not safe in noinstr code; call raw_atomic_sub_return_acquire() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_sub_return_acquire(int i, atomic_t *v)
{
        int ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_sub_return_acquire(i, v);

        return ret;
}

/**
 * atomic_sub_return_release() - instrumented atomic subtract, release
 *                               ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v - @i), with release ordering, via raw_atomic_sub_return_release().
 *
 * Not safe in noinstr code; call raw_atomic_sub_return_release() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_sub_return_release(int i, atomic_t *v)
{
        int ret;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_sub_return_release(i, v);

        return ret;
}

/**
 * atomic_sub_return_relaxed() - instrumented atomic subtract, relaxed
 *                               ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v - @i), with relaxed ordering, via
 * raw_atomic_sub_return_relaxed().
 *
 * Not safe in noinstr code; call raw_atomic_sub_return_relaxed() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_sub_return_relaxed(int i, atomic_t *v)
{
        int ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_sub_return_relaxed(i, v);

        return ret;
}

/**
 * atomic_fetch_sub() - instrumented atomic fetch-and-subtract, full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v - @i), with full ordering, via raw_atomic_fetch_sub().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_sub() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_sub(int i, atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_sub(i, v);

        return old;
}

/**
 * atomic_fetch_sub_acquire() - instrumented atomic fetch-and-subtract,
 *                              acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v - @i), with acquire ordering, via
 * raw_atomic_fetch_sub_acquire().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_sub_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_sub_acquire(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_sub_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_sub_release() - instrumented atomic fetch-and-subtract,
 *                              release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v - @i), with release ordering, via raw_atomic_fetch_sub_release().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_sub_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_sub_release(int i, atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_sub_release(i, v);

        return old;
}

/**
 * atomic_fetch_sub_relaxed() - instrumented atomic fetch-and-subtract,
 *                              relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v - @i), with relaxed ordering, via
 * raw_atomic_fetch_sub_relaxed().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_sub_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_sub_relaxed(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_sub_relaxed(i, v);

        return old;
}

/**
 * atomic_inc() - instrumented atomic increment, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + 1), with relaxed ordering, via
 * raw_atomic_inc().
 *
 * Not safe in noinstr code; call raw_atomic_inc() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_inc(atomic_t *v)
{
        const size_t size = sizeof(*v);

        instrument_atomic_read_write(v, size);
        raw_atomic_inc(v);
}

/**
 * atomic_inc_return() - instrumented atomic increment, full ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v + 1), with full ordering, via raw_atomic_inc_return().
 *
 * Not safe in noinstr code; call raw_atomic_inc_return() there instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_inc_return(atomic_t *v)
{
        int ret;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_inc_return(v);

        return ret;
}

/**
 * atomic_inc_return_acquire() - instrumented atomic increment, acquire
 *                               ordering
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + 1), with acquire ordering, via
 * raw_atomic_inc_return_acquire().
 *
 * Not safe in noinstr code; call raw_atomic_inc_return_acquire() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_inc_return_acquire(atomic_t *v)
{
        int ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_inc_return_acquire(v);

        return ret;
}

/**
 * atomic_inc_return_release() - instrumented atomic increment, release
 *                               ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v + 1), with release ordering, via raw_atomic_inc_return_release().
 *
 * Not safe in noinstr code; call raw_atomic_inc_return_release() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_inc_return_release(atomic_t *v)
{
        int ret;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_inc_return_release(v);

        return ret;
}

/**
 * atomic_inc_return_relaxed() - instrumented atomic increment, relaxed
 *                               ordering
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + 1), with relaxed ordering, via
 * raw_atomic_inc_return_relaxed().
 *
 * Not safe in noinstr code; call raw_atomic_inc_return_relaxed() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_inc_return_relaxed(atomic_t *v)
{
        int ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_inc_return_relaxed(v);

        return ret;
}

/**
 * atomic_fetch_inc() - instrumented atomic fetch-and-increment, full
 *                      ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), reports a read-write access to @v through
 * instrument_atomic_read_write(), and then atomically updates @v to
 * (@v + 1), with full ordering, via raw_atomic_fetch_inc().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_inc() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_inc(atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_inc(v);

        return old;
}

/**
 * atomic_fetch_inc_acquire() - instrumented atomic fetch-and-increment,
 *                              acquire ordering
 * @v: pointer to atomic_t
 *
 * Reports a read-write access to @v through instrument_atomic_read_write()
 * and then atomically updates @v to (@v + 1), with acquire ordering, via
 * raw_atomic_fetch_inc_acquire().
 *
 * Not safe in noinstr code; call raw_atomic_fetch_inc_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_inc_acquire(atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_inc_acquire(v);

        return old;
}

/**
 * atomic_fetch_inc_release() - fetch then increment an atomic_t, release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_fetch_inc_release().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_inc_release() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_inc_release(atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_inc_release(v);

        return prev;
}

/**
 * atomic_fetch_inc_relaxed() - fetch then increment an atomic_t, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_inc_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_inc_relaxed() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_inc_relaxed(atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_inc_relaxed(v);

        return prev;
}

/**
 * atomic_dec() - decrement an atomic_t, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_dec().
 *
 * Not safe for noinstr code; call raw_atomic_dec() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_dec(atomic_t *v)
{
        /* Flag the read-modify-write of @v, then perform it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_dec(v);
}

/**
 * atomic_dec_return() - decrement an atomic_t, full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_dec_return().
 *
 * Not safe for noinstr code; call raw_atomic_dec_return() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
atomic_dec_return(atomic_t *v)
{
        int ret;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_dec_return(v);

        return ret;
}

/**
 * atomic_dec_return_acquire() - decrement an atomic_t, acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_dec_return_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_dec_return_acquire() there
 * instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
atomic_dec_return_acquire(atomic_t *v)
{
        int ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_dec_return_acquire(v);

        return ret;
}

/**
 * atomic_dec_return_release() - decrement an atomic_t, release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_dec_return_release().
 *
 * Not safe for noinstr code; call raw_atomic_dec_return_release() there
 * instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
atomic_dec_return_release(atomic_t *v)
{
        int ret;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_dec_return_release(v);

        return ret;
}

/**
 * atomic_dec_return_relaxed() - decrement an atomic_t, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_dec_return_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_dec_return_relaxed() there
 * instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
atomic_dec_return_relaxed(atomic_t *v)
{
        int ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_dec_return_relaxed(v);

        return ret;
}

/**
 * atomic_fetch_dec() - fetch then decrement an atomic_t, full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_fetch_dec().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_dec() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_dec(atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_dec(v);

        return prev;
}

/**
 * atomic_fetch_dec_acquire() - fetch then decrement an atomic_t, acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_dec_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_dec_acquire() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_dec_acquire(atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_dec_acquire(v);

        return prev;
}

/**
 * atomic_fetch_dec_release() - fetch then decrement an atomic_t, release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_fetch_dec_release().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_dec_release() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_dec_release(atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_dec_release(v);

        return prev;
}

/**
 * atomic_fetch_dec_relaxed() - fetch then decrement an atomic_t, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_dec_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_dec_relaxed() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_dec_relaxed(atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_dec_relaxed(v);

        return prev;
}

/**
 * atomic_and() - bitwise AND into an atomic_t, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_and().
 *
 * Not safe for noinstr code; call raw_atomic_and() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_and(int i, atomic_t *v)
{
        /* Flag the read-modify-write of @v, then perform it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_and(i, v);
}

/**
 * atomic_fetch_and() - fetch then bitwise AND an atomic_t, full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_fetch_and().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_and() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_and(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_and(i, v);

        return prev;
}

/**
 * atomic_fetch_and_acquire() - fetch then bitwise AND an atomic_t, acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_and_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_and_acquire() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_and_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_and_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_and_release() - fetch then bitwise AND an atomic_t, release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_fetch_and_release().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_and_release() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_and_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_and_release(i, v);

        return prev;
}

/**
 * atomic_fetch_and_relaxed() - fetch then bitwise AND an atomic_t, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_and_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_and_relaxed() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_and_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_and_relaxed(i, v);

        return prev;
}

/**
 * atomic_andnot() - bitwise AND NOT into an atomic_t, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_andnot().
 *
 * Not safe for noinstr code; call raw_atomic_andnot() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_andnot(int i, atomic_t *v)
{
        /* Flag the read-modify-write of @v, then perform it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_andnot(i, v);
}

/**
 * atomic_fetch_andnot() - fetch then bitwise AND NOT an atomic_t, full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_fetch_andnot().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_andnot() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_andnot(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_andnot(i, v);

        return prev;
}

/**
 * atomic_fetch_andnot_acquire() - fetch then bitwise AND NOT an atomic_t, acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_andnot_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_andnot_acquire() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_andnot_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_andnot_release() - fetch then bitwise AND NOT an atomic_t, release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_fetch_andnot_release().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_andnot_release() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_andnot_release(i, v);

        return prev;
}

/**
 * atomic_fetch_andnot_relaxed() - fetch then bitwise AND NOT an atomic_t, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_andnot_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_andnot_relaxed() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_andnot_relaxed(i, v);

        return prev;
}

/**
 * atomic_or() - bitwise OR into an atomic_t, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_or().
 *
 * Not safe for noinstr code; call raw_atomic_or() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_or(int i, atomic_t *v)
{
        /* Flag the read-modify-write of @v, then perform it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_or(i, v);
}

/**
 * atomic_fetch_or() - fetch then bitwise OR an atomic_t, full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_fetch_or().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_or() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_or(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_or(i, v);

        return prev;
}

/**
 * atomic_fetch_or_acquire() - fetch then bitwise OR an atomic_t, acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i) with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_or_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_or_acquire() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_or_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_or_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_or_release() - fetch then bitwise OR an atomic_t, release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i) with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_fetch_or_release().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_or_release() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_or_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_or_release(i, v);

        return prev;
}

/**
 * atomic_fetch_or_relaxed() - fetch then bitwise OR an atomic_t, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_or_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_or_relaxed() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_or_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_or_relaxed(i, v);

        return prev;
}

/**
 * atomic_xor() - bitwise XOR into an atomic_t, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_xor().
 *
 * Not safe for noinstr code; call raw_atomic_xor() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_xor(int i, atomic_t *v)
{
        /* Flag the read-modify-write of @v, then perform it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_xor(i, v);
}

/**
 * atomic_fetch_xor() - fetch then bitwise XOR an atomic_t, full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_fetch_xor().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_xor() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_xor(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_xor(i, v);

        return prev;
}

/**
 * atomic_fetch_xor_acquire() - fetch then bitwise XOR an atomic_t, acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i) with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_xor_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_xor_acquire() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_xor_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_xor_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_xor_release() - fetch then bitwise XOR an atomic_t, release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i) with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_fetch_xor_release().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_xor_release() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_xor_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_xor_release(i, v);

        return prev;
}

/**
 * atomic_fetch_xor_relaxed() - fetch then bitwise XOR an atomic_t, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_fetch_xor_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_xor_relaxed() there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_xor_relaxed(i, v);

        return prev;
}

/**
 * atomic_xchg() - swap a new value into an atomic_t, full ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically stores @new in @v with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_xchg().
 *
 * Not safe for noinstr code; call raw_atomic_xchg() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_xchg(atomic_t *v, int new)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg(v, new);

        return prev;
}

/**
 * atomic_xchg_acquire() - swap a new value into an atomic_t, acquire ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically stores @new in @v with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_xchg_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_xchg_acquire() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_xchg_acquire(atomic_t *v, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic_xchg_release() - swap a new value into an atomic_t, release ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically stores @new in @v with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_xchg_release().
 *
 * Not safe for noinstr code; call raw_atomic_xchg_release() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_xchg_release(atomic_t *v, int new)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_release(v, new);

        return prev;
}

/**
 * atomic_xchg_relaxed() - swap a new value into an atomic_t, relaxed ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically stores @new in @v with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_xchg_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_xchg_relaxed() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
atomic_xchg_relaxed(atomic_t *v, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic_cmpxchg() - compare-and-exchange on an atomic_t, full ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with full ordering.
 * When it is not, @v is left untouched and only relaxed ordering is provided.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_cmpxchg().
 *
 * Not safe for noinstr code; call raw_atomic_cmpxchg() there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_acquire() - compare-and-exchange on an atomic_t, acquire ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with acquire ordering.
 * When it is not, @v is left untouched and only relaxed ordering is provided.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_cmpxchg_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_cmpxchg_acquire() there
 * instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_acquire(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_release() - compare-and-exchange on an atomic_t, release ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with release ordering.
 * When it is not, @v is left untouched and only relaxed ordering is provided.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_cmpxchg_release().
 *
 * Not safe for noinstr code; call raw_atomic_cmpxchg_release() there
 * instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_release(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_relaxed() - compare-and-exchange on an atomic_t, relaxed ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with relaxed ordering.
 * When it is not, @v is left untouched and only relaxed ordering is provided.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_cmpxchg_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_cmpxchg_relaxed() there
 * instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_relaxed(v, old, new);

        return prev;
}

/**
 * atomic_try_cmpxchg() - try to compare-and-exchange an atomic_t, full ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with full ordering.
 * When it is not, @v is left untouched, @old is overwritten with the current
 * value of @v, and only relaxed ordering is provided.
 * The wrapper issues kcsan_mb() and flags both @v and @old through
 * instrument_atomic_read_write() before calling raw_atomic_try_cmpxchg().
 *
 * Not safe for noinstr code; call raw_atomic_try_cmpxchg() there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool swapped;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_acquire() - try to compare-and-exchange an atomic_t, acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with acquire ordering.
 * When it is not, @v is left untouched, @old is overwritten with the current
 * value of @v, and only relaxed ordering is provided.
 * The wrapper flags both @v and @old through instrument_atomic_read_write()
 * before calling raw_atomic_try_cmpxchg_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_try_cmpxchg_acquire() there
 * instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
        bool swapped;

        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_release() - try to compare-and-exchange an atomic_t, release ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with release ordering.
 * When it is not, @v is left untouched, @old is overwritten with the current
 * value of @v, and only relaxed ordering is provided.
 * The wrapper issues kcsan_release() and flags both @v and @old through
 * instrument_atomic_read_write() before calling
 * raw_atomic_try_cmpxchg_release().
 *
 * Not safe for noinstr code; call raw_atomic_try_cmpxchg_release() there
 * instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
        bool swapped;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_relaxed() - try to compare-and-exchange an atomic_t, relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with relaxed ordering.
 * When it is not, @v is left untouched, @old is overwritten with the current
 * value of @v, and only relaxed ordering is provided.
 * The wrapper flags both @v and @old through instrument_atomic_read_write()
 * before calling raw_atomic_try_cmpxchg_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_try_cmpxchg_relaxed() there
 * instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
        bool swapped;

        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic_sub_and_test() - subtract from an atomic_t and test for zero, full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_sub_and_test().
 *
 * Not safe for noinstr code; call raw_atomic_sub_and_test() there instead.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_sub_and_test(int i, atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic_dec_and_test() - decrement an atomic_t and test for zero, full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_dec_and_test().
 *
 * Not safe for noinstr code; call raw_atomic_dec_and_test() there instead.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_dec_and_test(atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_dec_and_test(v);

        return is_zero;
}

/**
 * atomic_inc_and_test() - increment an atomic_t and test for zero, full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_inc_and_test().
 *
 * Not safe for noinstr code; call raw_atomic_inc_and_test() there instead.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_inc_and_test(atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_inc_and_test(v);

        return is_zero;
}

/**
 * atomic_add_negative() - add to an atomic_t and test for negative, full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with full ordering.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_add_negative().
 *
 * Not safe for noinstr code; call raw_atomic_add_negative() there instead.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative(int i, atomic_t *v)
{
        bool negative;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic_add_negative(i, v);

        return negative;
}

/**
 * atomic_add_negative_acquire() - add to an atomic_t and test for negative, acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_add_negative_acquire().
 *
 * Not safe for noinstr code; call raw_atomic_add_negative_acquire() there
 * instead.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_acquire(int i, atomic_t *v)
{
        bool negative;

        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic_add_negative_acquire(i, v);

        return negative;
}

/**
 * atomic_add_negative_release() - add to an atomic_t and test for negative, release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_add_negative_release().
 *
 * Not safe for noinstr code; call raw_atomic_add_negative_release() there
 * instead.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_release(int i, atomic_t *v)
{
        bool negative;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic_add_negative_release(i, v);

        return negative;
}

/**
 * atomic_add_negative_relaxed() - add to an atomic_t and test for negative, relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read_write()
 * before calling raw_atomic_add_negative_relaxed().
 *
 * Not safe for noinstr code; call raw_atomic_add_negative_relaxed() there
 * instead.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_relaxed(int i, atomic_t *v)
{
        bool negative;

        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic_add_negative_relaxed(i, v);

        return negative;
}

/**
 * atomic_fetch_add_unless() - fetch then add to an atomic_t unless it equals a value, full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * When (@v != @u), @v is atomically replaced by (@v + @a) with full ordering.
 * When it is equal, @v is left untouched and only relaxed ordering is provided.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_fetch_add_unless().
 *
 * Not safe for noinstr code; call raw_atomic_fetch_add_unless() there
 * instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_add_unless(v, a, u);

        return prev;
}

/**
 * atomic_add_unless() - add to an atomic_t unless it equals a value, full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * When (@v != @u), @v is atomically replaced by (@v + @a) with full ordering.
 * When it is equal, @v is left untouched and only relaxed ordering is provided.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_add_unless().
 *
 * Not safe for noinstr code; call raw_atomic_add_unless() there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_add_unless(atomic_t *v, int a, int u)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_add_unless(v, a, u);

        return updated;
}

/**
 * atomic_inc_not_zero() - increment an atomic_t unless it is zero, full ordering
 * @v: pointer to atomic_t
 *
 * When (@v != 0), @v is atomically replaced by (@v + 1) with full ordering.
 * When it is zero, @v is left untouched and only relaxed ordering is provided.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_inc_not_zero().
 *
 * Not safe for noinstr code; call raw_atomic_inc_not_zero() there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_inc_not_zero(atomic_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_not_zero(v);

        return updated;
}

/**
 * atomic_inc_unless_negative() - increment an atomic_t unless it is negative, full ordering
 * @v: pointer to atomic_t
 *
 * When (@v >= 0), @v is atomically replaced by (@v + 1) with full ordering.
 * When it is negative, @v is left untouched and only relaxed ordering is
 * provided.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_inc_unless_negative().
 *
 * Not safe for noinstr code; call raw_atomic_inc_unless_negative() there
 * instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_inc_unless_negative(atomic_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_unless_negative(v);

        return updated;
}

/**
 * atomic_dec_unless_positive() - decrement an atomic_t unless it is positive, full ordering
 * @v: pointer to atomic_t
 *
 * When (@v <= 0), @v is atomically replaced by (@v - 1) with full ordering.
 * When it is positive, @v is left untouched and only relaxed ordering is
 * provided.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling
 * raw_atomic_dec_unless_positive().
 *
 * Not safe for noinstr code; call raw_atomic_dec_unless_positive() there
 * instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_dec_unless_positive(atomic_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_unless_positive(v);

        return updated;
}

/**
 * atomic_dec_if_positive() - decrement an atomic_t only if it is positive, full ordering
 * @v: pointer to atomic_t
 *
 * When (@v > 0), @v is atomically replaced by (@v - 1) with full ordering.
 * When it is not, @v is left untouched and only relaxed ordering is provided.
 * The wrapper issues kcsan_mb() and flags the access through
 * instrument_atomic_read_write() before calling raw_atomic_dec_if_positive().
 *
 * Not safe for noinstr code; call raw_atomic_dec_if_positive() there
 * instead.
 *
 * Return: The old value of (@v - 1), whether or not @v was updated.
 */
static __always_inline int
atomic_dec_if_positive(atomic_t *v)
{
        int ret;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_dec_if_positive(v);

        return ret;
}

/**
 * atomic64_read() - load an atomic64_t, relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_read() before
 * calling raw_atomic64_read().
 *
 * Not safe for noinstr code; call raw_atomic64_read() there instead.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
atomic64_read(const atomic64_t *v)
{
        s64 val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic64_read(v);

        return val;
}

/**
 * atomic64_read_acquire() - load an atomic64_t, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with acquire ordering.
 * The wrapper flags the access through instrument_atomic_read() before
 * calling raw_atomic64_read_acquire().
 *
 * Not safe for noinstr code; call raw_atomic64_read_acquire() there
 * instead.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
atomic64_read_acquire(const atomic64_t *v)
{
        s64 val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic64_read_acquire(v);

        return val;
}

/**
 * atomic64_set() - store to an atomic64_t, relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 * The wrapper flags the access through instrument_atomic_write() before
 * calling raw_atomic64_set().
 *
 * Not safe for noinstr code; call raw_atomic64_set() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_set(atomic64_t *v, s64 i)
{
        /* Flag the write to @v, then perform it. */
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic64_set(v, i);
}

/**
 * atomic64_set_release() - store to an atomic64_t, release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with release ordering.
 * The wrapper issues kcsan_release() and flags the access through
 * instrument_atomic_write() before calling raw_atomic64_set_release().
 *
 * Not safe for noinstr code; call raw_atomic64_set_release() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_set_release(atomic64_t *v, s64 i)
{
        /* kcsan_release() first, then flag the write, then perform it. */
        kcsan_release();
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic64_set_release(v, i);
}

/**
 * atomic64_add() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_add(s64 i, atomic64_t *v)
{
        /* Instrument the access to the whole of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_add(i, v);
}

/**
 * atomic64_add_return() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return(i, v);

        return updated;
}

/**
 * atomic64_add_return_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_acquire(i, v);

        return updated;
}

/**
 * atomic64_add_return_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_release(i, v);

        return updated;
}

/**
 * atomic64_add_return_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_relaxed(i, v);

        return updated;
}

/**
 * atomic64_fetch_add() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_add(i, v);

        return prior;
}

/**
 * atomic64_fetch_add_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_add_acquire(i, v);

        return prior;
}

/**
 * atomic64_fetch_add_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_add_release(i, v);

        return prior;
}

/**
 * atomic64_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_add_relaxed(i, v);

        return prior;
}

/**
 * atomic64_sub() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_sub(s64 i, atomic64_t *v)
{
        /* Instrument the access to the whole of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_sub(i, v);
}

/**
 * atomic64_sub_return() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return(i, v);

        return updated;
}

/**
 * atomic64_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_acquire(i, v);

        return updated;
}

/**
 * atomic64_sub_return_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_release(i, v);

        return updated;
}

/**
 * atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_relaxed(i, v);

        return updated;
}

/**
 * atomic64_fetch_sub() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_sub(i, v);

        return prior;
}

/**
 * atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_sub_acquire(i, v);

        return prior;
}

/**
 * atomic64_fetch_sub_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_sub_release(i, v);

        return prior;
}

/**
 * atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_sub_relaxed(i, v);

        return prior;
}

/**
 * atomic64_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_inc(atomic64_t *v)
{
        /* Instrument the access to the whole of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_inc(v);
}

/**
 * atomic64_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_inc_return(atomic64_t *v)
{
        s64 updated;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_return(v);

        return updated;
}

/**
 * atomic64_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v)
{
        s64 updated;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_return_acquire(v);

        return updated;
}

/**
 * atomic64_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_inc_return_release(atomic64_t *v)
{
        s64 updated;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_return_release(v);

        return updated;
}

/**
 * atomic64_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v)
{
        s64 updated;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_return_relaxed(v);

        return updated;
}

/**
 * atomic64_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_inc(atomic64_t *v)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_inc(v);

        return prior;
}

/**
 * atomic64_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_inc_acquire(v);

        return prior;
}

/**
 * atomic64_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_inc_release(v);

        return prior;
}

/**
 * atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_inc_relaxed(v);

        return prior;
}

/**
 * atomic64_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_dec(atomic64_t *v)
{
        /* Instrument the access to the whole of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_dec(v);
}

/**
 * atomic64_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_dec_return(atomic64_t *v)
{
        s64 updated;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_return(v);

        return updated;
}

/**
 * atomic64_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 updated;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_return_acquire(v);

        return updated;
}

/**
 * atomic64_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_dec_return_release(atomic64_t *v)
{
        s64 updated;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_return_release(v);

        return updated;
}

/**
 * atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v)
{
        s64 updated;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_return_relaxed(v);

        return updated;
}

/**
 * atomic64_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_dec(atomic64_t *v)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_dec(v);

        return prior;
}

/**
 * atomic64_fetch_dec_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_dec_acquire(v);

        return prior;
}

/**
 * atomic64_fetch_dec_release() - atomic decrement with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_dec_release(v);

        return prior;
}

/**
 * atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_dec_relaxed(v);

        return prior;
}

/**
 * atomic64_and() - atomic bitwise AND with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_and() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_and(s64 i, atomic64_t *v)
{
        /* Instrument the access to the whole of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_and(i, v);
}

/**
 * atomic64_fetch_and() - atomic bitwise AND with full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_and() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_and(i, v);

        return prior;
}

/**
 * atomic64_fetch_and_acquire() - atomic bitwise AND with acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_and_acquire(i, v);

        return prior;
}

/**
 * atomic64_fetch_and_release() - atomic bitwise AND with release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_and_release(i, v);

        return prior;
}

/**
 * atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_and_relaxed(i, v);

        return prior;
}

/**
 * atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_andnot() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_andnot(s64 i, atomic64_t *v)
{
        /* Instrument the access to the whole of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_andnot(i, v);
}

/**
 * atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_andnot(i, v);

        return prior;
}

/**
 * atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_andnot_acquire(i, v);

        return prior;
}

/**
 * atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_andnot_release(i, v);

        return prior;
}

/**
 * atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_andnot_relaxed(i, v);

        return prior;
}

/**
 * atomic64_or() - atomic bitwise OR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_or() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_or(s64 i, atomic64_t *v)
{
        /* Instrument the access to the whole of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_or(i, v);
}

/**
 * atomic64_fetch_or() - atomic bitwise OR with full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_or() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_or(i, v);

        return prior;
}

/**
 * atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_or_acquire(i, v);

        return prior;
}

/**
 * atomic64_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_or_release(i, v);

        return prior;
}

/**
 * atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_or_relaxed(i, v);

        return prior;
}

/**
 * atomic64_xor() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xor() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_xor(s64 i, atomic64_t *v)
{
        /* Instrument the access to the whole of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_xor(i, v);
}

/**
 * atomic64_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_xor(i, v);

        return prior;
}

/**
 * atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_xor_acquire(i, v);

        return prior;
}

/**
 * atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_xor_release(i, v);

        return prior;
}

/**
 * atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_fetch_xor_relaxed(i, v);

        return prior;
}

/**
 * atomic64_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 new)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_xchg(v, new);

        return prior;
}

/**
 * atomic64_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 new)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_xchg_acquire(v, new);

        return prior;
}

/**
 * atomic64_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 new)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_xchg_release(v, new);

        return prior;
}

/**
 * atomic64_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_xchg_relaxed(atomic64_t *v, s64 new)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_xchg_relaxed(v, new);

        return prior;
}

/**
 * atomic64_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 prior;

        /* Fully ordered variant: kcsan_mb() runs ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_cmpxchg(v, old, new);

        return prior;
}

/**
 * atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
        s64 prior;

        /* Acquire variant: only the access hook runs; no kcsan_*() call here. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_cmpxchg_acquire(v, old, new);

        return prior;
}

/**
 * atomic64_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
        s64 prior;

        /* Release variant: kcsan_release() runs ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_cmpxchg_release(v, old, new);

        return prior;
}

/**
 * atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
{
        s64 prior;

        /* Relaxed variant: only the access hook runs before the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prior = raw_atomic64_cmpxchg_relaxed(v, old, new);

        return prior;
}

/**
 * atomic64_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Full ordering: kcsan_mb() comes ahead of the access hooks. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        /* *old is hooked too: on failure it receives the current value of @v. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_acquire() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Only the access hooks here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* *old is hooked too: on failure it receives the current value of @v. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_release() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Release ordering: kcsan_release() comes ahead of the access hooks. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        /* *old is hooked too: on failure it receives the current value of @v. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_relaxed() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Only the access hooks here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* *old is hooked too: on failure it receives the current value of @v. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        bool hit_zero;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        hit_zero = raw_atomic64_sub_and_test(i, v);

        return hit_zero;
}

/**
 * atomic64_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic64_dec_and_test(atomic64_t *v)
{
        bool hit_zero;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        hit_zero = raw_atomic64_dec_and_test(v);

        return hit_zero;
}

/**
 * atomic64_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic64_inc_and_test(atomic64_t *v)
{
        bool hit_zero;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        hit_zero = raw_atomic64_inc_and_test(v);

        return hit_zero;
}

/**
 * atomic64_add_negative() - atomic add and test if negative with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_negative() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative(s64 i, atomic64_t *v)
{
        bool negative;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative(i, v);

        return negative;
}

/**
 * atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_negative_acquire() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_acquire(s64 i, atomic64_t *v)
{
        bool negative;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_acquire(i, v);

        return negative;
}

/**
 * atomic64_add_negative_release() - atomic add and test if negative with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_negative_release() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_release(s64 i, atomic64_t *v)
{
        bool negative;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_release(i, v);

        return negative;
}

/**
 * atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_negative_relaxed() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
{
        bool negative;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_relaxed(i, v);

        return negative;
}

/**
 * atomic64_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_unless() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
        s64 before;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        /* Hooked as read-write unconditionally, even if @v ends up unmodified. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic64_fetch_add_unless(v, a, u);

        return before;
}

/**
 * atomic64_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_unless() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
        bool updated;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        /* Hooked as read-write unconditionally, even if @v ends up unmodified. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_unless(v, a, u);

        return updated;
}

/**
 * atomic64_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_not_zero() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_inc_not_zero(atomic64_t *v)
{
        bool updated;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        /* Hooked as read-write unconditionally, even if @v ends up unmodified. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_not_zero(v);

        return updated;
}

/**
 * atomic64_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_unless_negative() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_inc_unless_negative(atomic64_t *v)
{
        bool updated;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        /* Hooked as read-write unconditionally, even if @v ends up unmodified. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_unless_negative(v);

        return updated;
}

/**
 * atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_unless_positive() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_dec_unless_positive(atomic64_t *v)
{
        bool updated;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        /* Hooked as read-write unconditionally, even if @v ends up unmodified. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_unless_positive(v);

        return updated;
}

/**
 * atomic64_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline s64
atomic64_dec_if_positive(atomic64_t *v)
{
        s64 ret;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        /* Hooked as read-write unconditionally, even if @v ends up unmodified. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* The raw_ op's return value is handed back untouched. */
        ret = raw_atomic64_dec_if_positive(v);

        return ret;
}

/**
 * atomic_long_read() - atomic load with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
atomic_long_read(const atomic_long_t *v)
{
        long val;

        /* Pure load: hooked as a read, not a read-write. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_long_read(v);

        return val;
}

/**
 * atomic_long_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
atomic_long_read_acquire(const atomic_long_t *v)
{
        long val;

        /* Pure load: hooked as a read; no kcsan barrier call is made. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_long_read_acquire(v);

        return val;
}

/**
 * atomic_long_set() - atomic set with relaxed ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_set(atomic_long_t *v, long i)
{
        /* Pure store: hooked as a write before the raw_ op runs. */
        instrument_atomic_write(v, sizeof(*v));
        /* raw_ variant is the one the kernel-doc directs noinstr code to use. */
        raw_atomic_long_set(v, i);
}

/**
 * atomic_long_set_release() - atomic set with release ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_set_release(atomic_long_t *v, long i)
{
        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        /* Pure store: hooked as a write before the raw_ op runs. */
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_long_set_release(v, i);
}

/**
 * atomic_long_add() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_add(long i, atomic_long_t *v)
{
        /* Hook the read-write access to *v; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* raw_ variant is the one the kernel-doc directs noinstr code to use. */
        raw_atomic_long_add(i, v);
}

/**
 * atomic_long_add_return() - atomic add with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return(long i, atomic_long_t *v)
{
        long result;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_add_return(i, v);

        return result;
}

/**
 * atomic_long_add_return_acquire() - atomic add with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_acquire(long i, atomic_long_t *v)
{
        long result;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_add_return_acquire(i, v);

        return result;
}

/**
 * atomic_long_add_return_release() - atomic add with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_release(long i, atomic_long_t *v)
{
        long result;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_add_return_release(i, v);

        return result;
}

/**
 * atomic_long_add_return_relaxed() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_relaxed(long i, atomic_long_t *v)
{
        long result;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_add_return_relaxed(i, v);

        return result;
}

/**
 * atomic_long_fetch_add() - atomic add with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add(long i, atomic_long_t *v)
{
        long before;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_add(i, v);

        return before;
}

/**
 * atomic_long_fetch_add_acquire() - atomic add with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_add_acquire(i, v);

        return before;
}

/**
 * atomic_long_fetch_add_release() - atomic add with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_release(long i, atomic_long_t *v)
{
        long before;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_add_release(i, v);

        return before;
}

/**
 * atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_relaxed(long i, atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_add_relaxed(i, v);

        return before;
}

/**
 * atomic_long_sub() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_sub(long i, atomic_long_t *v)
{
        /* Hook the read-write access to *v; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* raw_ variant is the one the kernel-doc directs noinstr code to use. */
        raw_atomic_long_sub(i, v);
}

/**
 * atomic_long_sub_return() - atomic subtract with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return(long i, atomic_long_t *v)
{
        long result;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_sub_return(i, v);

        return result;
}

/**
 * atomic_long_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_acquire(long i, atomic_long_t *v)
{
        long result;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_sub_return_acquire(i, v);

        return result;
}

/**
 * atomic_long_sub_return_release() - atomic subtract with release ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_release(long i, atomic_long_t *v)
{
        long result;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_sub_return_release(i, v);

        return result;
}

/**
 * atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
{
        long result;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_sub_return_relaxed(i, v);

        return result;
}

/**
 * atomic_long_fetch_sub() - atomic subtract with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub(long i, atomic_long_t *v)
{
        long before;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_sub(i, v);

        return before;
}

/**
 * atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_sub_acquire(i, v);

        return before;
}

/**
 * atomic_long_fetch_sub_release() - atomic subtract with release ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_release(long i, atomic_long_t *v)
{
        long before;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_sub_release(i, v);

        return before;
}

/**
 * atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_sub_relaxed(i, v);

        return before;
}

/**
 * atomic_long_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_inc(atomic_long_t *v)
{
        /* Hook the read-write access to *v; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* raw_ variant is the one the kernel-doc directs noinstr code to use. */
        raw_atomic_long_inc(v);
}

/**
 * atomic_long_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return(atomic_long_t *v)
{
        long result;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return(v);

        return result;
}

/**
 * atomic_long_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_acquire(atomic_long_t *v)
{
        long result;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_acquire(v);

        return result;
}

/**
 * atomic_long_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_release(atomic_long_t *v)
{
        long result;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_release(v);

        return result;
}

/**
 * atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_relaxed(atomic_long_t *v)
{
        long result;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_relaxed(v);

        return result;
}

/**
 * atomic_long_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc(atomic_long_t *v)
{
        long before;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_inc(v);

        return before;
}

/**
 * atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_acquire(atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_inc_acquire(v);

        return before;
}

/**
 * atomic_long_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_release(atomic_long_t *v)
{
        long before;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_inc_release(v);

        return before;
}

/**
 * atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_relaxed(atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_inc_relaxed(v);

        return before;
}

/**
 * atomic_long_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_dec(atomic_long_t *v)
{
        /* Hook the read-write access to *v; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* raw_ variant is the one the kernel-doc directs noinstr code to use. */
        raw_atomic_long_dec(v);
}

/**
 * atomic_long_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return(atomic_long_t *v)
{
        long result;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return(v);

        return result;
}

/**
 * atomic_long_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_acquire(atomic_long_t *v)
{
        long result;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_acquire(v);

        return result;
}

/**
 * atomic_long_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_release(atomic_long_t *v)
{
        long result;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_release(v);

        return result;
}

/**
 * atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_relaxed(atomic_long_t *v)
{
        long result;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_relaxed(v);

        return result;
}

/**
 * atomic_long_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec(atomic_long_t *v)
{
        long before;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_dec(v);

        return before;
}

/**
 * atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_acquire(atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_dec_acquire(v);

        return before;
}

/**
 * atomic_long_fetch_dec_release() - atomic decrement with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_release(atomic_long_t *v)
{
        long before;

        /* Release ordering: kcsan_release() comes ahead of the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_dec_release(v);

        return before;
}

/**
 * atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_relaxed(atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_dec_relaxed(v);

        return before;
}

/**
 * atomic_long_and() - atomic bitwise AND with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_and() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_and(long i, atomic_long_t *v)
{
        /* Hook the read-write access to *v; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* raw_ variant is the one the kernel-doc directs noinstr code to use. */
        raw_atomic_long_and(i, v);
}

/**
 * atomic_long_fetch_and() - atomic bitwise AND with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and(long i, atomic_long_t *v)
{
        long before;

        /* Full ordering: kcsan_mb() comes ahead of the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_and(i, v);

        return before;
}

/**
 * atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
{
        long before;

        /* Only the access hook here; no kcsan barrier call is made. */
        instrument_atomic_read_write(v, sizeof(*v));
        before = raw_atomic_long_fetch_and_acquire(i, v);

        return before;
}

/**
 * atomic_long_fetch_and_release() - release-ordered atomic bitwise AND, returning the old value
 * @i: long value to AND into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i), with release ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_and_release() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_and_relaxed() - relaxed atomic bitwise AND, returning the old value
 * @i: long value to AND into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i), with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_and_relaxed() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_and_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_andnot() - relaxed atomic bitwise AND NOT, no return value
 * @i: long value whose bits are cleared from @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_andnot() from there instead.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_long_andnot(long i, atomic_long_t *v)
{
        /* Report the access to the instrumentation hooks before performing it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_andnot(i, v);
}

/**
 * atomic_long_fetch_andnot() - fully ordered atomic bitwise AND NOT, returning the old value
 * @i: long value whose bits are cleared from @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with full ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_andnot() from there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_acquire() - acquire-ordered atomic bitwise AND NOT, returning the old value
 * @i: long value whose bits are cleared from @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with acquire ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_andnot_acquire() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_release() - release-ordered atomic bitwise AND NOT, returning the old value
 * @i: long value whose bits are cleared from @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with release ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_andnot_release() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_relaxed() - relaxed atomic bitwise AND NOT, returning the old value
 * @i: long value whose bits are cleared from @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_andnot_relaxed() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_or() - relaxed atomic bitwise OR, no return value
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_or() from there instead.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_long_or(long i, atomic_long_t *v)
{
        /* Report the access to the instrumentation hooks before performing it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_or(i, v);
}

/**
 * atomic_long_fetch_or() - fully ordered atomic bitwise OR, returning the old value
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with full ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_or() from there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_acquire() - acquire-ordered atomic bitwise OR, returning the old value
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with acquire ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_or_acquire() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_release() - release-ordered atomic bitwise OR, returning the old value
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with release ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_or_release() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_or_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_relaxed() - relaxed atomic bitwise OR, returning the old value
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_or_relaxed() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_or_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_xor() - relaxed atomic bitwise XOR, no return value
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_xor() from there instead.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_long_xor(long i, atomic_long_t *v)
{
        /* Report the access to the instrumentation hooks before performing it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_xor(i, v);
}

/**
 * atomic_long_fetch_xor() - fully ordered atomic bitwise XOR, returning the old value
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with full ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_xor() from there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_acquire() - acquire-ordered atomic bitwise XOR, returning the old value
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with acquire ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_xor_acquire() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_release() - release-ordered atomic bitwise XOR, returning the old value
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with release ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_xor_release() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_relaxed() - relaxed atomic bitwise XOR, returning the old value
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_xor_relaxed() from
 * there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_xchg() - fully ordered atomic exchange
 * @v: pointer to atomic_long_t
 * @new: long value to store in @v
 *
 * Atomically replaces @v with @new, with full ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_xchg() from there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_xchg(atomic_long_t *v, long new)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg(v, new);

        return prev;
}

/**
 * atomic_long_xchg_acquire() - acquire-ordered atomic exchange
 * @v: pointer to atomic_long_t
 * @new: long value to store in @v
 *
 * Atomically replaces @v with @new, with acquire ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_xchg_acquire() from there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_xchg_acquire(atomic_long_t *v, long new)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic_long_xchg_release() - release-ordered atomic exchange
 * @v: pointer to atomic_long_t
 * @new: long value to store in @v
 *
 * Atomically replaces @v with @new, with release ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_xchg_release() from there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long new)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_release(v, new);

        return prev;
}

/**
 * atomic_long_xchg_relaxed() - relaxed atomic exchange
 * @v: pointer to atomic_long_t
 * @new: long value to store in @v
 *
 * Atomically replaces @v with @new, with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_xchg_relaxed() from there
 * instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long atomic_long_xchg_relaxed(atomic_long_t *v, long new)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic_long_cmpxchg() - fully ordered atomic compare-and-exchange
 * @v: pointer to atomic_long_t
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new, with full ordering.
 * When they differ, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_cmpxchg() from there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_acquire() - acquire-ordered atomic compare-and-exchange
 * @v: pointer to atomic_long_t
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new, with acquire ordering.
 * When they differ, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_cmpxchg_acquire() from there
 * instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_acquire(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_release() - release-ordered atomic compare-and-exchange
 * @v: pointer to atomic_long_t
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new, with release ordering.
 * When they differ, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_cmpxchg_release() from there
 * instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_release(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_relaxed() - relaxed atomic compare-and-exchange
 * @v: pointer to atomic_long_t
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new, with relaxed ordering.
 * When they differ, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_cmpxchg_relaxed() from there
 * instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_relaxed(v, old, new);

        return prev;
}

/**
 * atomic_long_try_cmpxchg() - fully ordered atomic compare-and-exchange, boolean result
 * @v: pointer to atomic_long_t
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new, with full ordering.
 * When they differ, @v is left untouched, @old receives the current value of
 * @v, and only relaxed ordering is provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_try_cmpxchg() from there
 * instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        kcsan_mb();
        /* Both the atomic variable and the caller's expected-value slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_acquire() - acquire-ordered atomic compare-and-exchange, boolean result
 * @v: pointer to atomic_long_t
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new, with acquire ordering.
 * When they differ, @v is left untouched, @old receives the current value of
 * @v, and only relaxed ordering is provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_try_cmpxchg_acquire() from
 * there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Both the atomic variable and the caller's expected-value slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_release() - release-ordered atomic compare-and-exchange, boolean result
 * @v: pointer to atomic_long_t
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new, with release ordering.
 * When they differ, @v is left untouched, @old receives the current value of
 * @v, and only relaxed ordering is provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_try_cmpxchg_release() from
 * there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        kcsan_release();
        /* Both the atomic variable and the caller's expected-value slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_relaxed() - relaxed atomic compare-and-exchange, boolean result
 * @v: pointer to atomic_long_t
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new, with relaxed ordering.
 * When they differ, @v is left untouched, @old receives the current value of
 * @v, and only relaxed ordering is provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_try_cmpxchg_relaxed() from
 * there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Both the atomic variable and the caller's expected-value slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic_long_sub_and_test() - fully ordered atomic subtract, then test for zero
 * @i: long value to subtract from @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with full ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_sub_and_test() from there
 * instead.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic_long_dec_and_test() - fully ordered atomic decrement, then test for zero
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with full ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_dec_and_test() from there
 * instead.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_dec_and_test(v);

        return is_zero;
}

/**
 * atomic_long_inc_and_test() - fully ordered atomic increment, then test for zero
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with full ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_inc_and_test() from there
 * instead.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_inc_and_test(v);

        return is_zero;
}

/**
 * atomic_long_add_negative() - fully ordered atomic add, then test for negative
 * @i: long value to add to @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with full ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_add_negative() from there
 * instead.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v)
{
        bool is_negative;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_acquire() - acquire-ordered atomic add, then test for negative
 * @i: long value to add to @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with acquire ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_add_negative_acquire() from
 * there instead.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool atomic_long_add_negative_acquire(long i, atomic_long_t *v)
{
        bool is_negative;

        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_acquire(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_release() - release-ordered atomic add, then test for negative
 * @i: long value to add to @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with release ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_add_negative_release() from
 * there instead.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool atomic_long_add_negative_release(long i, atomic_long_t *v)
{
        bool is_negative;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_release(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_relaxed() - relaxed atomic add, then test for negative
 * @i: long value to add to @v
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with relaxed ordering.
 *
 * Not safe in noinstr code; call raw_atomic_long_add_negative_relaxed() from
 * there instead.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
{
        bool is_negative;

        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_relaxed(i, v);

        return is_negative;
}

/**
 * atomic_long_fetch_add_unless() - fully ordered conditional atomic add, returning the old value
 * @v: pointer to atomic_long_t
 * @a: long value to add to @v
 * @u: long value that suppresses the add when @v equals it
 *
 * When (@v != @u), atomically replaces @v with (@v + @a), with full ordering.
 * When they are equal, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_fetch_add_unless() from there
 * instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_add_unless(v, a, u);

        return prev;
}

/**
 * atomic_long_add_unless() - fully ordered conditional atomic add, boolean result
 * @v: pointer to atomic_long_t
 * @a: long value to add to @v
 * @u: long value that suppresses the add when @v equals it
 *
 * When (@v != @u), atomically replaces @v with (@v + @a), with full ordering.
 * When they are equal, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_add_unless() from there
 * instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_add_unless(v, a, u);

        return updated;
}

/**
 * atomic_long_inc_not_zero() - fully ordered atomic increment, skipped when zero
 * @v: pointer to atomic_long_t
 *
 * When (@v != 0), atomically replaces @v with (@v + 1), with full ordering.
 * When @v is zero, it is left untouched and only relaxed ordering is provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_inc_not_zero() from there
 * instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_not_zero(v);

        return updated;
}

/**
 * atomic_long_inc_unless_negative() - fully ordered atomic increment, skipped when negative
 * @v: pointer to atomic_long_t
 *
 * When (@v >= 0), atomically replaces @v with (@v + 1), with full ordering.
 * When @v is negative, it is left untouched and only relaxed ordering is
 * provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_inc_unless_negative() from
 * there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_unless_negative(v);

        return updated;
}

/**
 * atomic_long_dec_unless_positive() - fully ordered atomic decrement, skipped when positive
 * @v: pointer to atomic_long_t
 *
 * When (@v <= 0), atomically replaces @v with (@v - 1), with full ordering.
 * When @v is positive, it is left untouched and only relaxed ordering is
 * provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_dec_unless_positive() from
 * there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_dec_unless_positive(v);

        return updated;
}

/**
 * atomic_long_dec_if_positive() - fully ordered atomic decrement, only when positive
 * @v: pointer to atomic_long_t
 *
 * When (@v > 0), atomically replaces @v with (@v - 1), with full ordering.
 * When @v is zero or negative, it is left untouched and only relaxed ordering
 * is provided.
 *
 * Not safe in noinstr code; call raw_atomic_long_dec_if_positive() from there
 * instead.
 *
 * Return: The original value of @v minus one, irrespective of whether @v was
 * updated.
 */
static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v)
{
        long dec;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        dec = raw_atomic_long_dec_if_positive(v);

        return dec;
}

/*
 * xchg() - instrumented wrapper around raw_xchg().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_xchg().
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @ptr with
 * sizeof(*ptr), and finally raw_xchg(). The statement expression evaluates
 * to the result of raw_xchg().
 */
#define xchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_acquire() - instrumented wrapper around raw_xchg_acquire().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_xchg_acquire().
 *
 * Calls instrument_atomic_read_write() on @ptr with sizeof(*ptr), then
 * raw_xchg_acquire(). No kcsan_*() call is made. The statement expression
 * evaluates to the result of raw_xchg_acquire().
 */
#define xchg_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_release() - instrumented wrapper around raw_xchg_release().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_xchg_release().
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @ptr with
 * sizeof(*ptr), and finally raw_xchg_release(). The statement expression
 * evaluates to the result of raw_xchg_release().
 */
#define xchg_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_relaxed() - instrumented wrapper around raw_xchg_relaxed().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_xchg_relaxed().
 *
 * Calls instrument_atomic_read_write() on @ptr with sizeof(*ptr), then
 * raw_xchg_relaxed(). No kcsan_*() call is made. The statement expression
 * evaluates to the result of raw_xchg_relaxed().
 */
#define xchg_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg() - instrumented wrapper around raw_cmpxchg().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg().
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @ptr with
 * sizeof(*ptr), and finally raw_cmpxchg(). The statement expression
 * evaluates to the result of raw_cmpxchg().
 */
#define cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_acquire() - instrumented wrapper around raw_cmpxchg_acquire().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg_acquire().
 *
 * Calls instrument_atomic_read_write() on @ptr with sizeof(*ptr), then
 * raw_cmpxchg_acquire(). No kcsan_*() call is made. The statement expression
 * evaluates to the result of raw_cmpxchg_acquire().
 */
#define cmpxchg_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_release() - instrumented wrapper around raw_cmpxchg_release().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg_release().
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @ptr with
 * sizeof(*ptr), and finally raw_cmpxchg_release(). The statement expression
 * evaluates to the result of raw_cmpxchg_release().
 */
#define cmpxchg_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_relaxed() - instrumented wrapper around raw_cmpxchg_relaxed().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg_relaxed().
 *
 * Calls instrument_atomic_read_write() on @ptr with sizeof(*ptr), then
 * raw_cmpxchg_relaxed(). No kcsan_*() call is made. The statement expression
 * evaluates to the result of raw_cmpxchg_relaxed().
 */
#define cmpxchg_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64() - instrumented wrapper around raw_cmpxchg64().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg64().
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @ptr with
 * sizeof(*ptr), and finally raw_cmpxchg64(). The statement expression
 * evaluates to the result of raw_cmpxchg64().
 */
#define cmpxchg64(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_acquire() - instrumented wrapper around raw_cmpxchg64_acquire().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg64_acquire().
 *
 * Calls instrument_atomic_read_write() on @ptr with sizeof(*ptr), then
 * raw_cmpxchg64_acquire(). No kcsan_*() call is made. The statement
 * expression evaluates to the result of raw_cmpxchg64_acquire().
 */
#define cmpxchg64_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_release() - instrumented wrapper around raw_cmpxchg64_release().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg64_release().
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @ptr with
 * sizeof(*ptr), and finally raw_cmpxchg64_release(). The statement
 * expression evaluates to the result of raw_cmpxchg64_release().
 */
#define cmpxchg64_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_relaxed() - instrumented wrapper around raw_cmpxchg64_relaxed().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg64_relaxed().
 *
 * Calls instrument_atomic_read_write() on @ptr with sizeof(*ptr), then
 * raw_cmpxchg64_relaxed(). No kcsan_*() call is made. The statement
 * expression evaluates to the result of raw_cmpxchg64_relaxed().
 */
#define cmpxchg64_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128() - instrumented wrapper around raw_cmpxchg128().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg128().
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @ptr with
 * sizeof(*ptr), and finally raw_cmpxchg128(). The statement expression
 * evaluates to the result of raw_cmpxchg128().
 */
#define cmpxchg128(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_acquire() - instrumented wrapper around raw_cmpxchg128_acquire().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg128_acquire().
 *
 * Calls instrument_atomic_read_write() on @ptr with sizeof(*ptr), then
 * raw_cmpxchg128_acquire(). No kcsan_*() call is made. The statement
 * expression evaluates to the result of raw_cmpxchg128_acquire().
 */
#define cmpxchg128_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_release() - instrumented wrapper around raw_cmpxchg128_release().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg128_release().
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @ptr with
 * sizeof(*ptr), and finally raw_cmpxchg128_release(). The statement
 * expression evaluates to the result of raw_cmpxchg128_release().
 */
#define cmpxchg128_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_relaxed() - instrumented wrapper around raw_cmpxchg128_relaxed().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_cmpxchg128_relaxed().
 *
 * Calls instrument_atomic_read_write() on @ptr with sizeof(*ptr), then
 * raw_cmpxchg128_relaxed(). No kcsan_*() call is made. The statement
 * expression evaluates to the result of raw_cmpxchg128_relaxed().
 */
#define cmpxchg128_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * try_cmpxchg() - instrumented wrapper around raw_try_cmpxchg().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_try_cmpxchg().
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @ptr and the
 * plain instrument_read_write() on @oldp (each with the size of the
 * pointed-to object), and finally raw_try_cmpxchg(). The statement
 * expression evaluates to the result of raw_try_cmpxchg().
 */
#define try_cmpxchg(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_acquire() - instrumented wrapper around raw_try_cmpxchg_acquire().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_try_cmpxchg_acquire().
 *
 * Calls instrument_atomic_read_write() on @ptr and the plain
 * instrument_read_write() on @oldp (each with the size of the pointed-to
 * object), then raw_try_cmpxchg_acquire(). No kcsan_*() call is made. The
 * statement expression evaluates to the result of raw_try_cmpxchg_acquire().
 */
#define try_cmpxchg_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_release() - instrumented wrapper around raw_try_cmpxchg_release().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_try_cmpxchg_release().
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @ptr and the
 * plain instrument_read_write() on @oldp (each with the size of the
 * pointed-to object), and finally raw_try_cmpxchg_release(). The statement
 * expression evaluates to the result of raw_try_cmpxchg_release().
 */
#define try_cmpxchg_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_relaxed() - instrumented wrapper around raw_try_cmpxchg_relaxed().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_try_cmpxchg_relaxed().
 *
 * Calls instrument_atomic_read_write() on @ptr and the plain
 * instrument_read_write() on @oldp (each with the size of the pointed-to
 * object), then raw_try_cmpxchg_relaxed(). No kcsan_*() call is made. The
 * statement expression evaluates to the result of raw_try_cmpxchg_relaxed().
 */
#define try_cmpxchg_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64() - instrumented wrapper around raw_try_cmpxchg64().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_try_cmpxchg64().
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @ptr and the
 * plain instrument_read_write() on @oldp (each with the size of the
 * pointed-to object), and finally raw_try_cmpxchg64(). The statement
 * expression evaluates to the result of raw_try_cmpxchg64().
 */
#define try_cmpxchg64(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_acquire() - instrumented wrapper around
 * raw_try_cmpxchg64_acquire().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to the raw macro.
 *
 * Calls instrument_atomic_read_write() on @ptr and the plain
 * instrument_read_write() on @oldp (each with the size of the pointed-to
 * object), then raw_try_cmpxchg64_acquire(). No kcsan_*() call is made. The
 * statement expression evaluates to the raw macro's result.
 */
#define try_cmpxchg64_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_release() - instrumented wrapper around
 * raw_try_cmpxchg64_release().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to the raw macro.
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @ptr and the
 * plain instrument_read_write() on @oldp (each with the size of the
 * pointed-to object), and finally raw_try_cmpxchg64_release(). The statement
 * expression evaluates to the raw macro's result.
 */
#define try_cmpxchg64_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_relaxed() - instrumented wrapper around
 * raw_try_cmpxchg64_relaxed().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to the raw macro.
 *
 * Calls instrument_atomic_read_write() on @ptr and the plain
 * instrument_read_write() on @oldp (each with the size of the pointed-to
 * object), then raw_try_cmpxchg64_relaxed(). No kcsan_*() call is made. The
 * statement expression evaluates to the raw macro's result.
 */
#define try_cmpxchg64_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128() - instrumented wrapper around raw_try_cmpxchg128().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to raw_try_cmpxchg128().
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @ptr and the
 * plain instrument_read_write() on @oldp (each with the size of the
 * pointed-to object), and finally raw_try_cmpxchg128(). The statement
 * expression evaluates to the result of raw_try_cmpxchg128().
 */
#define try_cmpxchg128(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_acquire() - instrumented wrapper around
 * raw_try_cmpxchg128_acquire().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to the raw macro.
 *
 * Calls instrument_atomic_read_write() on @ptr and the plain
 * instrument_read_write() on @oldp (each with the size of the pointed-to
 * object), then raw_try_cmpxchg128_acquire(). No kcsan_*() call is made. The
 * statement expression evaluates to the raw macro's result.
 */
#define try_cmpxchg128_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_release() - instrumented wrapper around
 * raw_try_cmpxchg128_release().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to the raw macro.
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @ptr and the
 * plain instrument_read_write() on @oldp (each with the size of the
 * pointed-to object), and finally raw_try_cmpxchg128_release(). The
 * statement expression evaluates to the raw macro's result.
 */
#define try_cmpxchg128_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_relaxed() - instrumented wrapper around
 * raw_try_cmpxchg128_relaxed().
 * @ptr: pointer to the object operated on; evaluated exactly once.
 * @oldp: pointer passed as the second argument; evaluated exactly once.
 * @...: remaining arguments, forwarded unchanged to the raw macro.
 *
 * Calls instrument_atomic_read_write() on @ptr and the plain
 * instrument_read_write() on @oldp (each with the size of the pointed-to
 * object), then raw_try_cmpxchg128_relaxed(). No kcsan_*() call is made. The
 * statement expression evaluates to the raw macro's result.
 */
#define try_cmpxchg128_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * cmpxchg_local() - instrumented wrapper for raw_cmpxchg_local().
 *
 * @ptr is evaluated once into a local and the object it points to is reported
 * with instrument_atomic_read_write() before the raw operation. The statement
 * expression evaluates to the result of raw_cmpxchg_local(); remaining
 * arguments are forwarded as-is.
 */
#define cmpxchg_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_local() - instrumented wrapper for raw_cmpxchg64_local().
 *
 * @ptr is evaluated once into a local and the object it points to is reported
 * with instrument_atomic_read_write() before the raw operation. The statement
 * expression evaluates to the result of raw_cmpxchg64_local().
 */
#define cmpxchg64_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_local() - instrumented wrapper for raw_cmpxchg128_local().
 *
 * @ptr is evaluated once into a local and the object it points to is reported
 * with instrument_atomic_read_write() before the raw operation. The statement
 * expression evaluates to the result of raw_cmpxchg128_local().
 */
#define cmpxchg128_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * sync_cmpxchg() - instrumented wrapper for raw_sync_cmpxchg().
 *
 * @ptr is evaluated once into a local. kcsan_mb() is issued first, then the
 * object behind @ptr is reported with instrument_atomic_read_write(). The
 * statement expression evaluates to the result of raw_sync_cmpxchg().
 */
#define sync_cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * try_cmpxchg_local() - instrumented wrapper for raw_try_cmpxchg_local().
 *
 * @ptr and @oldp are each evaluated once into locals; the objects they point
 * to are reported with instrument_atomic_read_write() and
 * instrument_read_write() respectively before the raw operation. The
 * statement expression evaluates to the result of raw_try_cmpxchg_local().
 */
#define try_cmpxchg_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_local() - instrumented wrapper for raw_try_cmpxchg64_local().
 *
 * @ptr and @oldp are each evaluated once into locals; the objects they point
 * to are reported with instrument_atomic_read_write() and
 * instrument_read_write() respectively before the raw operation. The
 * statement expression evaluates to the result of raw_try_cmpxchg64_local().
 */
#define try_cmpxchg64_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_local() - instrumented wrapper for raw_try_cmpxchg128_local().
 *
 * @ptr and @oldp are each evaluated once into locals; the objects they point
 * to are reported with instrument_atomic_read_write() and
 * instrument_read_write() respectively before the raw operation. The
 * statement expression evaluates to the result of raw_try_cmpxchg128_local().
 */
#define try_cmpxchg128_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * sync_try_cmpxchg() - instrumented wrapper for raw_sync_try_cmpxchg().
 *
 * @ptr is evaluated once into a local. kcsan_mb() is issued first, then the
 * object behind @ptr is reported with instrument_atomic_read_write(). Unlike
 * the other try_cmpxchg*() wrappers in this file, the "old" pointer is not
 * named separately here and is not instrumented; it is forwarded as part of
 * the variadic arguments. The statement expression evaluates to the result of
 * raw_sync_try_cmpxchg().
 */
#define sync_try_cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
})


#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
// 8829b337928e9508259079d32581775ececd415b





















































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Linux INET6 implementation 
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>        
 */

#ifndef _IP6_FIB_H
#define _IP6_FIB_H

#include <linux/ipv6_route.h>
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
#include <linux/notifier.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/inetpeer.h>
#include <net/fib_notifier.h>
#include <linux/indirect_call_wrapper.h>
#include <uapi/linux/bpf.h>

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
#define FIB6_TABLE_HASHSZ 256
#else
#define FIB6_TABLE_HASHSZ 1
#endif

#define RT6_DEBUG 2

struct rt6_info;
struct fib6_info;

/*
 * Route configuration parameters (all members carry the fc_ prefix).
 * A pointer to this structure is what fib6_nh_init() takes as its @cfg
 * argument.
 * NOTE(review): presumably filled from a userspace route request before
 * being handed to the route add/delete paths - confirm against callers.
 */
struct fib6_config {
        u32                fc_table;
        u32                fc_metric;
        int                fc_dst_len;
        int                fc_src_len;
        int                fc_ifindex;
        u32                fc_flags;
        u32                fc_protocol;
        u16                fc_type;        /* only 8 bits are used */
        /* single-bit flags packed into one u16; 14 bits are spare */
        u16                fc_delete_all_nh : 1,
                        fc_ignore_dev_down:1,
                        __unused : 14;
        u32                fc_nh_id;

        struct in6_addr        fc_dst;
        struct in6_addr        fc_src;
        struct in6_addr        fc_prefsrc;
        struct in6_addr        fc_gateway;

        unsigned long        fc_expires;
        /* netlink attribute pointers, each paired with a length where one exists */
        struct nlattr        *fc_mx;
        int                fc_mx_len;
        int                fc_mp_len;
        struct nlattr        *fc_mp;

        struct nl_info        fc_nlinfo;
        struct nlattr        *fc_encap;
        u16                fc_encap_type;
        bool                fc_is_fdb;
};

/*
 * One node of the IPv6 FIB tree. All links to other nodes and to routes are
 * __rcu annotated pointers.
 */
struct fib6_node {
        struct fib6_node __rcu        *parent;
        struct fib6_node __rcu        *left;
        struct fib6_node __rcu        *right;
#ifdef CONFIG_IPV6_SUBTREES
        /* only present with CONFIG_IPV6_SUBTREES; see FIB6_SUBTREE() */
        struct fib6_node __rcu        *subtree;
#endif
        /* head of the fib6_info chain walked by for_each_fib6_node_rt_rcu() */
        struct fib6_info __rcu        *leaf;

        __u16                        fn_bit;                /* bit key */
        __u16                        fn_flags;
        /* read with READ_ONCE() by fib6_get_cookie_safe() */
        int                        fn_sernum;
        struct fib6_info __rcu        *rr_ptr;
        struct rcu_head                rcu;
};

/*
 * Pair of integers named for garbage collection.
 * NOTE(review): presumably the arguments/state of a GC pass started by
 * fib6_run_gc() - no user is visible in this header; confirm.
 */
struct fib6_gc_args {
        int                        timeout;
        int                        more;
};

#ifndef CONFIG_IPV6_SUBTREES
#define FIB6_SUBTREE(fn)        NULL

/*
 * Stub used when CONFIG_IPV6_SUBTREES is not set: always reports that no
 * route requires a source match. @net is ignored.
 */
static inline bool
fib6_routes_require_src(const struct net *net)
{
        return false;
}

/* Stub used when CONFIG_IPV6_SUBTREES is not set: does nothing. */
static inline void fib6_routes_require_src_inc(struct net *net)
{
}
/* Stub used when CONFIG_IPV6_SUBTREES is not set: does nothing. */
static inline void fib6_routes_require_src_dec(struct net *net)
{
}

#else

/*
 * Report whether the per-netns fib6_routes_require_src counter is positive.
 */
static inline bool fib6_routes_require_src(const struct net *net)
{
        if (net->ipv6.fib6_routes_require_src > 0)
                return true;

        return false;
}

/* Bump the per-netns fib6_routes_require_src counter by one (plain, non-atomic update). */
static inline void fib6_routes_require_src_inc(struct net *net)
{
        net->ipv6.fib6_routes_require_src += 1;
}

/* Drop the per-netns fib6_routes_require_src counter by one (plain, non-atomic update). */
static inline void fib6_routes_require_src_dec(struct net *net)
{
        net->ipv6.fib6_routes_require_src -= 1;
}

#define FIB6_SUBTREE(fn)        (rcu_dereference_protected((fn)->subtree, 1))
#endif

/*
 *        routing information
 *
 */

/*
 * An IPv6 address together with an integer "plen".
 * fib6_requires_src() treats a fib6_src key with plen > 0 as meaning the
 * route needs a source match.
 * NOTE(review): plen is presumably the prefix length in bits - confirm.
 */
struct rt6key {
        struct in6_addr        addr;
        int                plen;
};

struct fib6_table;

/*
 * One hash bucket of route exceptions: an hlist head plus a depth counter.
 * struct fib6_nh points at these through rt6i_exception_bucket.
 * NOTE(review): depth is presumably the chain length, bounded by
 * FIB6_MAX_DEPTH - confirm against the exception table code.
 */
struct rt6_exception_bucket {
        struct hlist_head        chain;
        int                        depth;
};

/*
 * One route exception entry: an hlist node, the rt6_info it refers to, a
 * timestamp and an rcu_head.
 * NOTE(review): presumably linked on rt6_exception_bucket.chain and freed
 * through the rcu head - confirm against the exception table code.
 */
struct rt6_exception {
        struct hlist_node        hlist;
        struct rt6_info                *rt6i;
        unsigned long                stamp;
        struct rcu_head                rcu;
};

#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
#define FIB6_MAX_DEPTH 5

/*
 * IPv6 nexthop: the family-independent fib_nh_common followed by IPv6
 * specific state. Initialised by fib6_nh_init() and torn down by
 * fib6_nh_release() (both declared in this header).
 */
struct fib6_nh {
        struct fib_nh_common        nh_common;

#ifdef CONFIG_IPV6_ROUTER_PREF
        /* only present with CONFIG_IPV6_ROUTER_PREF */
        unsigned long                last_probe;
#endif

        /* per-cpu slot holding a struct rt6_info pointer */
        struct rt6_info * __percpu *rt6i_pcpu;
        /* RCU-annotated pointer to the exception bucket(s) */
        struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
};

/*
 * A route entry in the IPv6 FIB.
 *
 * Entries are chained per node through fib6_next, reference counted through
 * fib6_ref (fib6_info_hold() / fib6_info_release()) and destroyed via
 * call_rcu_hurry() with fib6_info_destroy_rcu() once the last reference is
 * dropped.
 */
struct fib6_info {
        struct fib6_table                *fib6_table;
        struct fib6_info __rcu                *fib6_next;
        /* NULL means "not in a table"; see fib6_add_gc_list() */
        struct fib6_node __rcu                *fib6_node;

        /* Multipath routes:
         * siblings is a list of fib6_info that have the same metric/weight,
         * destination, but not the same gateway. nsiblings is just a cache
         * to speed up lookup.
         */
        union {
                struct list_head        fib6_siblings;
                struct list_head        nh_list;
        };
        unsigned int                        fib6_nsiblings;

        refcount_t                        fib6_ref;
        /* compared against jiffies when RTF_EXPIRES is set in fib6_flags */
        unsigned long                        expires;

        /* linkage on fib6_table->tb6_gc_hlist; unhashed when not on it */
        struct hlist_node                gc_link;

        struct dst_metrics                *fib6_metrics;
#define fib6_pmtu                fib6_metrics->metrics[RTAX_MTU-1]

        struct rt6key                        fib6_dst;
        u32                                fib6_flags;
        struct rt6key                        fib6_src;
        struct rt6key                        fib6_prefsrc;

        u32                                fib6_metric;
        u8                                fib6_protocol;
        u8                                fib6_type;

        u8                                offload;
        u8                                trap;
        u8                                offload_failed;

        u8                                should_flush:1,
                                        dst_nocount:1,
                                        dst_nopolicy:1,
                                        fib6_destroying:1,
                                        unused:4;

        struct list_head                purge_link;
        /* handed to call_rcu_hurry() by fib6_info_release() */
        struct rcu_head                        rcu;
        struct nexthop                        *nh;
        /* flexible array member; must stay last */
        struct fib6_nh                        fib6_nh[];
};

/*
 * IPv6 routing cache entry built around a dst_entry.
 *
 * dst must remain the first member: ip6_rt_put() enforces offset 0 with a
 * BUILD_BUG_ON() so that a NULL rt6_info yields a NULL dst for dst_release().
 */
struct rt6_info {
        struct dst_entry                dst;
        /* RCU-protected pointer to the fib6_info this entry refers to */
        struct fib6_info __rcu                *from;
        /* when non-zero, returned directly by rt6_get_cookie() */
        int                                sernum;

        struct rt6key                        rt6i_dst;
        struct rt6key                        rt6i_src;
        struct in6_addr                        rt6i_gateway;
        struct inet6_dev                *rt6i_idev;
        u32                                rt6i_flags;

        /* more non-fragment space at head required */
        unsigned short                        rt6i_nfheader_len;
};

/*
 * Lookup result bundle: pointers to a nexthop, a fib6_info and an rt6_info,
 * plus flag and type fields. fib6_lookup(), fib6_table_lookup() and
 * fib6_select_path() in this header all take a pointer to it as @res.
 */
struct fib6_result {
        struct fib6_nh                *nh;
        struct fib6_info        *f6i;
        u32                        fib6_flags;
        u8                        fib6_type;
        struct rt6_info                *rt6;
};

/*
 * Iterate over the fib6_info chain of node @fn, starting at fn->leaf and
 * following fib6_next, using rcu_dereference() for every step.
 * The loop cursor is a variable named "rt" that the caller must already have
 * in scope; it is not declared by the macro.
 */
#define for_each_fib6_node_rt_rcu(fn)                                        \
        for (rt = rcu_dereference((fn)->leaf); rt;                        \
             rt = rcu_dereference(rt->fib6_next))

/*
 * Iterate over the fib6_info chain starting at walker @w's leaf pointer and
 * following fib6_next via rcu_dereference_protected(..., 1), i.e. with the
 * lockdep condition hard-wired to true.
 * The loop cursor is a variable named "rt" that the caller must already have
 * in scope; it is not declared by the macro.
 */
#define for_each_fib6_walker_rt(w)                                        \
        for (rt = (w)->leaf; rt;                                        \
             rt = rcu_dereference_protected(rt->fib6_next, 1))

#define dst_rt6_info(_ptr) container_of_const(_ptr, struct rt6_info, dst)

static inline struct inet6_dev *ip6_dst_idev(const struct dst_entry *dst)
{
        return dst_rt6_info(dst)->rt6i_idev;
}

/* True when the route's source key carries a positive plen. */
static inline bool fib6_requires_src(const struct fib6_info *rt)
{
        int src_plen = rt->fib6_src.plen;

        return src_plen > 0;
}

/* Clear RTF_EXPIRES from the route's flags and reset its expiry time to 0.
 *
 * If the route has ever been added to a table, callers should hold
 * f6i->fib6_table->tb6_lock.
 */
static inline void fib6_clean_expires(struct fib6_info *f6i)
{
        f6i->fib6_flags = f6i->fib6_flags & ~RTF_EXPIRES;
        f6i->expires = 0;
}

/* Record @expires as the route's expiry time, then set RTF_EXPIRES in its
 * flags.
 *
 * If the route has ever been added to a table, callers should hold
 * f6i->fib6_table->tb6_lock.
 */
static inline void fib6_set_expires(struct fib6_info *f6i,
                                    unsigned long expires)
{
        f6i->expires = expires;
        f6i->fib6_flags = f6i->fib6_flags | RTF_EXPIRES;
}

/* A route without RTF_EXPIRES never expires; otherwise it has expired once
 * jiffies is after f6i->expires.
 */
static inline bool fib6_check_expired(const struct fib6_info *f6i)
{
        if (!(f6i->fib6_flags & RTF_EXPIRES))
                return false;

        return time_after(jiffies, f6i->expires);
}

/* Fetch fn->fn_sernum of the node @f6i is attached to and store it in
 * *@cookie.
 *
 * Returns true when @f6i has a node and *@cookie was written, false when
 * f6i->fib6_node is NULL (in which case *@cookie is left untouched).
 */
static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i,
                                        u32 *cookie)
{
        struct fib6_node *node = rcu_dereference(f6i->fib6_node);

        if (!node)
                return false;

        *cookie = READ_ONCE(node->fn_sernum);
        /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */
        smp_rmb();

        return true;
}

/* Return the cookie for @rt: its own sernum when that is non-zero, otherwise
 * the serial number looked up through rt->from under rcu_read_lock(), or 0
 * when that lookup yields nothing.
 */
static inline u32 rt6_get_cookie(const struct rt6_info *rt)
{
        u32 cookie = 0;
        struct fib6_info *from;

        if (!rt->sernum) {
                rcu_read_lock();

                from = rcu_dereference(rt->from);
                if (from)
                        fib6_get_cookie_safe(from, &cookie);

                rcu_read_unlock();

                return cookie;
        }

        return rt->sernum;
}

/* Release the dst embedded in @rt. */
static inline void ip6_rt_put(struct rt6_info *rt)
{
        struct dst_entry *dst;

        /* dst_release() accepts a NULL parameter, and with dst at offset 0
         * in struct rt6_info (checked below) a NULL @rt maps to a NULL dst.
         */
        BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0);

        dst = &rt->dst;
        dst_release(dst);
}

struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh);
void fib6_info_destroy_rcu(struct rcu_head *head);

static inline void fib6_info_hold(struct fib6_info *f6i)
{
        refcount_inc(&f6i->fib6_ref);
}

static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
{
        return refcount_inc_not_zero(&f6i->fib6_ref);
}

/* Drop one reference on @f6i; a NULL @f6i is ignored.
 *
 * When the count reaches zero the entry is queued for fib6_info_destroy_rcu()
 * through call_rcu_hurry(). A debug warning fires if it is still hashed on a
 * gc list at that point.
 */
static inline void fib6_info_release(struct fib6_info *f6i)
{
        if (!f6i)
                return;

        if (!refcount_dec_and_test(&f6i->fib6_ref))
                return;

        DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link));
        call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu);
}

/*
 * States stored in struct fib6_walker.state.
 * FWS_S exists only with CONFIG_IPV6_SUBTREES, so the numeric values of the
 * remaining enumerators depend on that option.
 * NOTE(review): the letters presumably stand for Subtree, Left, Right,
 * Current and Up - confirm against the walker implementation.
 */
enum fib6_walk_state {
#ifdef CONFIG_IPV6_SUBTREES
        FWS_S,
#endif
        FWS_L,
        FWS_R,
        FWS_C,
        FWS_U
};

/*
 * State of one FIB tree walk: list linkage, root/current node pointers, the
 * current leaf (the start point of for_each_fib6_walker_rt()), the walk
 * state, skip/count bookkeeping and a per-walk callback with its opaque
 * argument.
 */
struct fib6_walker {
        struct list_head lh;
        struct fib6_node *root, *node;
        struct fib6_info *leaf;
        enum fib6_walk_state state;
        unsigned int skip;
        unsigned int count;
        unsigned int skip_in_node;
        /* callback; receives the walker itself */
        int (*func)(struct fib6_walker *);
        void *args;
};

/* Counters describing the IPv6 FIB. */
struct rt6_statistics {
        __u32                fib_nodes;                /* all fib6 nodes */
        __u32                fib_route_nodes;        /* intermediate nodes */
        __u32                fib_rt_entries;                /* rt entries in fib table */
        __u32                fib_rt_cache;                /* cached rt entries in exception table */
        __u32                fib_discarded_routes;        /* total number of routes deleted */

        /* The following stat is not protected by any lock */
        atomic_t        fib_rt_alloc;                /* total number of routes allocated */
};

#define RTN_TL_ROOT        0x0001
#define RTN_ROOT        0x0002                /* tree root node                */
#define RTN_RTINFO        0x0004                /* node with valid routing info        */

/*
 *        priority levels (or metrics)
 *
 */


/*
 * One IPv6 routing table: hash linkage, table id, the lock that callers of
 * fib6_add_gc_list()/fib6_remove_gc_list() are expected to hold, the embedded
 * root node of the tree, and the list of GC candidate routes.
 */
struct fib6_table {
        struct hlist_node        tb6_hlist;
        u32                        tb6_id;
        spinlock_t                tb6_lock;
        struct fib6_node        tb6_root;
        struct inet_peer_base        tb6_peers;
        unsigned int                flags;
        unsigned int                fib_seq; /* writes protected by rtnl_mutex */
        struct hlist_head       tb6_gc_hlist;        /* GC candidates */
#define RT6_TABLE_HAS_DFLT_ROUTER        BIT(0)
};

#define RT6_TABLE_UNSPEC        RT_TABLE_UNSPEC
#define RT6_TABLE_MAIN                RT_TABLE_MAIN
#define RT6_TABLE_DFLT                RT6_TABLE_MAIN
#define RT6_TABLE_INFO                RT6_TABLE_MAIN
#define RT6_TABLE_PREFIX        RT6_TABLE_MAIN

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
#define FIB6_TABLE_MIN                1
#define FIB6_TABLE_MAX                RT_TABLE_MAX
#define RT6_TABLE_LOCAL                RT_TABLE_LOCAL
#else
#define FIB6_TABLE_MIN                RT_TABLE_MAIN
#define FIB6_TABLE_MAX                FIB6_TABLE_MIN
#define RT6_TABLE_LOCAL                RT6_TABLE_MAIN
#endif

typedef struct rt6_info *(*pol_lookup_t)(struct net *,
                                         struct fib6_table *,
                                         struct flowi6 *,
                                         const struct sk_buff *, int);

/*
 * FIB notifier payload for an IPv6 route entry: the generic
 * fib_notifier_info header followed by the route and a sibling count.
 */
struct fib6_entry_notifier_info {
        struct fib_notifier_info info; /* must be first */
        struct fib6_info *rt;
        unsigned int nsiblings;
};

/*
 *        exported functions
 */

struct fib6_table *fib6_get_table(struct net *net, u32 id);
struct fib6_table *fib6_new_table(struct net *net, u32 id);
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb,
                                   int flags, pol_lookup_t lookup);

/* called with rcu lock held; returns an int (not an error pointer)
 * caller needs to select path
 */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
                struct fib6_result *res, int flags);

/* called with rcu lock held; caller needs to select path */
int fib6_table_lookup(struct net *net, struct fib6_table *table,
                      int oif, struct flowi6 *fl6, struct fib6_result *res,
                      int strict);

void fib6_select_path(const struct net *net, struct fib6_result *res,
                      struct flowi6 *fl6, int oif, bool have_oif_match,
                      const struct sk_buff *skb, int strict);
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
                                   const struct in6_addr *daddr,
                                   const struct in6_addr *saddr);

struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
                              const struct in6_addr *saddr, int src_len,
                              bool exact_match);

void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg),
                    void *arg);
void fib6_clean_all_skip_notify(struct net *net,
                                int (*func)(struct fib6_info *, void *arg),
                                void *arg);

int fib6_add(struct fib6_node *root, struct fib6_info *rt,
             struct nl_info *info, struct netlink_ext_ack *extack);
int fib6_del(struct fib6_info *rt, struct nl_info *info);

/* Copy the preferred source address of @rt into *@addr.
 *
 * The address is taken from the fib6_info that rt->from points to, read
 * under rcu_read_lock(); when rt->from is NULL, in6addr_any is stored
 * instead.
 */
static inline
void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
{
        const struct fib6_info *origin;

        rcu_read_lock();

        origin = rcu_dereference(rt->from);
        *addr = origin ? origin->fib6_prefsrc.addr : in6addr_any;

        rcu_read_unlock();
}

int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                 struct fib6_config *cfg, gfp_t gfp_flags,
                 struct netlink_ext_ack *extack);
void fib6_nh_release(struct fib6_nh *fib6_nh);
void fib6_nh_release_dsts(struct fib6_nh *fib6_nh);

int call_fib6_entry_notifiers(struct net *net,
                              enum fib_event_type event_type,
                              struct fib6_info *rt,
                              struct netlink_ext_ack *extack);
int call_fib6_multipath_entry_notifiers(struct net *net,
                                        enum fib_event_type event_type,
                                        struct fib6_info *rt,
                                        unsigned int nsiblings,
                                        struct netlink_ext_ack *extack);
int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt);
void fib6_rt_update(struct net *net, struct fib6_info *rt,
                    struct nl_info *info);
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
                     unsigned int flags);

void fib6_run_gc(unsigned long expires, struct net *net, bool force);

void fib6_gc_cleanup(void);

int fib6_init(void);

/* Add the route to the gc list if it is not already there
 *
 * The callers should hold f6i->fib6_table->tb6_lock.
 */
static inline void fib6_add_gc_list(struct fib6_info *f6i)
{
        /* If fib6_node is null, the f6i is not in (or removed from) the
         * table.
         *
         * There is a gap between finding the f6i from the table and
         * calling this function without the protection of the tb6_lock.
         * This check makes sure the f6i is not added to the gc list when
         * it is not on the table.
         */
        if (!rcu_dereference_protected(f6i->fib6_node,
                                       lockdep_is_held(&f6i->fib6_table->tb6_lock)))
                return;

        if (hlist_unhashed(&f6i->gc_link))
                hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist);
}

/* Take @f6i off the GC candidate list; a route that is not on the list is
 * left alone.
 *
 * The callers should hold f6i->fib6_table->tb6_lock.
 */
static inline void fib6_remove_gc_list(struct fib6_info *f6i)
{
        if (hlist_unhashed(&f6i->gc_link))
                return;

        hlist_del_init(&f6i->gc_link);
}

/*
 * Iterator state embedding a seq_net_private and a fib6_walker, plus a skip
 * offset, the current table and a serial number.
 * NOTE(review): presumably the private data of ipv6_route_seq_ops declared
 * just below - confirm against the seq_file implementation.
 */
struct ipv6_route_iter {
        struct seq_net_private p;
        struct fib6_walker w;
        loff_t skip;
        struct fib6_table *tbl;
        int sernum;
};

extern const struct seq_operations ipv6_route_seq_ops;

int call_fib6_notifier(struct notifier_block *nb,
                       enum fib_event_type event_type,
                       struct fib_notifier_info *info);
int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
                        struct fib_notifier_info *info);

int __net_init fib6_notifier_init(struct net *net);
void __net_exit fib6_notifier_exit(struct net *net);

unsigned int fib6_tables_seq_read(const struct net *net);
int fib6_tables_dump(struct net *net, struct notifier_block *nb,
                     struct netlink_ext_ack *extack);

void fib6_update_sernum(struct net *net, struct fib6_info *rt);
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt);
void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i);

void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val);
/* True when bit @metric is set in the RTAX_LOCK slot of the route's metrics
 * array.
 */
static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
{
        int lock_bit = 1 << metric;

        return (f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & lock_bit) != 0;
}
void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
                            bool offload, bool trap, bool offload_failed);

#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
/*
 * Context for the BPF ipv6_route iterator: a bpf_iter_meta pointer and the
 * fib6_info being visited, both wrapped in __bpf_md_ptr(). Only defined when
 * IPv6 is built in and CONFIG_BPF_SYSCALL is set.
 */
struct bpf_iter__ipv6_route {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct fib6_info *, rt);
};
#endif

INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_output(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_input(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
/* Invoke @lookup(net, table, fl6, skb, flags) through INDIRECT_CALL_4(),
 * listing ip6_pol_route_output, ip6_pol_route_input, ip6_pol_route_lookup and
 * __ip6_route_redirect as the candidate targets, and return its result.
 */
static inline struct rt6_info *pol_lookup_func(pol_lookup_t lookup,
                                                struct net *net,
                                                struct fib6_table *table,
                                                struct flowi6 *fl6,
                                                const struct sk_buff *skb,
                                                int flags)
{
        struct rt6_info *rt;

        rt = INDIRECT_CALL_4(lookup,
                             ip6_pol_route_output,
                             ip6_pol_route_input,
                             ip6_pol_route_lookup,
                             __ip6_route_redirect,
                             net, table, fl6, skb, flags);

        return rt;
}

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Report the per-netns fib6_has_custom_rules flag as a bool. */
static inline bool fib6_has_custom_rules(const struct net *net)
{
        return net->ipv6.fib6_has_custom_rules ? true : false;
}

int fib6_rules_init(void);
void fib6_rules_cleanup(void);
bool fib6_rule_default(const struct fib_rule *rule);
int fib6_rules_dump(struct net *net, struct notifier_block *nb,
                    struct netlink_ext_ack *extack);
unsigned int fib6_rules_seq_read(const struct net *net);

/* Dissect @skb early when the netns' rules ask for it.
 *
 * Returns false without touching @flkeys or @fl6 when
 * net->ipv6.fib6_rules_require_fldissect is not set. Otherwise @flkeys is
 * zeroed and filled by __skb_flow_dissect() (with
 * FLOW_DISSECTOR_F_STOP_AT_ENCAP), the ports and IP protocol are copied into
 * @fl6, and true is returned.
 */
static inline bool fib6_rules_early_flow_dissect(struct net *net,
                                                 struct sk_buff *skb,
                                                 struct flowi6 *fl6,
                                                 struct flow_keys *flkeys)
{
        const unsigned int dissect_flags = FLOW_DISSECTOR_F_STOP_AT_ENCAP;

        if (net->ipv6.fib6_rules_require_fldissect) {
                memset(flkeys, 0, sizeof(*flkeys));
                __skb_flow_dissect(net, skb, &flow_keys_dissector,
                                   flkeys, NULL, 0, 0, 0, dissect_flags);

                fl6->fl6_sport = flkeys->ports.src;
                fl6->fl6_dport = flkeys->ports.dst;
                fl6->flowi6_proto = flkeys->basic.ip_proto;

                return true;
        }

        return false;
}
#else
/* Stub used when CONFIG_IPV6_MULTIPLE_TABLES is not set: always false.
 * @net is ignored.
 */
static inline bool
fib6_has_custom_rules(const struct net *net)
{
        return false;
}
/* Stub used when CONFIG_IPV6_MULTIPLE_TABLES is not set: nothing to set up,
 * always returns 0.
 */
static inline int fib6_rules_init(void)
{
        return 0;
}
/* Stub used when CONFIG_IPV6_MULTIPLE_TABLES is not set: does nothing. */
static inline void fib6_rules_cleanup(void)
{
}
/* Stub used when CONFIG_IPV6_MULTIPLE_TABLES is not set: always true.
 * @rule is ignored.
 */
static inline bool
fib6_rule_default(const struct fib_rule *rule)
{
        return true;
}
/* Stub used when CONFIG_IPV6_MULTIPLE_TABLES is not set: nothing is dumped,
 * always returns 0. All arguments are ignored.
 */
static inline int
fib6_rules_dump(struct net *net, struct notifier_block *nb,
                struct netlink_ext_ack *extack)
{
        return 0;
}
/* Stub used when CONFIG_IPV6_MULTIPLE_TABLES is not set: always returns 0.
 * @net is ignored.
 */
static inline unsigned int
fib6_rules_seq_read(const struct net *net)
{
        return 0;
}
/* Stub used when CONFIG_IPV6_MULTIPLE_TABLES is not set: no dissection is
 * performed and false is always returned. All arguments are ignored.
 */
static inline bool
fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb,
                              struct flowi6 *fl6, struct flow_keys *flkeys)
{
        return false;
}
#endif
#endif





















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _INET_COMMON_H
#define _INET_COMMON_H

#include <linux/indirect_call_wrapper.h>
#include <linux/net.h>
#include <linux/netdev_features.h>
#include <linux/types.h>
#include <net/sock.h>

extern const struct proto_ops inet_stream_ops;
extern const struct proto_ops inet_dgram_ops;

/*
 *        INET4 prototypes used by INET6
 */

struct msghdr;
struct net;
struct page;
struct sock;
struct sockaddr;
struct socket;

int inet_release(struct socket *sock);
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags);
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                          int addr_len, int flags, int is_sendmsg);
int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
                       int addr_len, int flags);
int inet_accept(struct socket *sock, struct socket *newsock,
                struct proto_accept_arg *arg);
void __inet_accept(struct socket *sock, struct socket *newsock,
                   struct sock *newsk);
int inet_send_prepare(struct sock *sk);
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
void inet_splice_eof(struct socket *sock);
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                 int flags);
int inet_shutdown(struct socket *sock, int how);
int inet_listen(struct socket *sock, int backlog);
int __inet_listen_sk(struct sock *sk, int backlog);
void inet_sock_destruct(struct sock *sk);
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
/* Don't allocate port at this moment, defer to connect. */
#define BIND_FORCE_ADDRESS_NO_PORT        (1 << 0)
/* Grab and release socket lock. */
#define BIND_WITH_LOCK                        (1 << 1)
/* Called from BPF program. */
#define BIND_FROM_BPF                        (1 << 2)
/* Skip CAP_NET_BIND_SERVICE check. */
#define BIND_NO_CAP_NET_BIND_SERVICE        (1 << 3)
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
                u32 flags);
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
                 int peer);
int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int inet_ctl_sock_create(struct sock **sk, unsigned short family,
                         unsigned short type, unsigned char protocol,
                         struct net *net);
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len,
                    int *addr_len);

struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb);
int inet_gro_complete(struct sk_buff *skb, int nhoff);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                                 netdev_features_t features);

static inline void inet_ctl_sock_destroy(struct sock *sk)
{
        /* A NULL @sk is accepted and silently ignored. */
        if (!sk)
                return;

        /* Release the struct socket that @sk is attached to. */
        sock_release(sk->sk_socket);
}

/*
 * Invoke the GRO receive callback @cb for @skb through INDIRECT_CALL_2(),
 * passing @f2 and @f1 as the two candidate targets.
 *
 * If gro_recursion_inc_test(@skb) reports true (treated as the unlikely
 * case), the callback is not invoked: bit 0 of NAPI_GRO_CB(@skb)->flush is
 * set and the whole expression evaluates to NULL. Otherwise the expression
 * evaluates to whatever the callback returns for (@head, @skb).
 */
#define indirect_call_gro_receive(f2, f1, cb, head, skb)        \
({                                                                \
        unlikely(gro_recursion_inc_test(skb)) ?                        \
                NAPI_GRO_CB(skb)->flush |= 1, NULL :                \
                INDIRECT_CALL_2(cb, f2, f1, head, skb);                \
})

#endif





























































































































































































































































































































































































































































    1 







































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H

#ifndef __ASSEMBLY__

#include <linux/align.h>
#include <linux/bitops.h>
#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/find.h>
#include <linux/limits.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bitmap-str.h>

struct device;

/*
 * bitmaps provide bit arrays that consume one or more unsigned
 * longs.  The bitmap interface and available operations are listed
 * here, in bitmap.h
 *
 * Function implementations generic to all architectures are in
 * lib/bitmap.c.  Function implementations that are architecture
 * specific are in various arch/<arch>/include/asm/bitops.h headers
 * and other arch/<arch> specific files.
 *
 * See lib/bitmap.c for more details.
 */

/**
 * DOC: bitmap overview
 *
 * The available bitmap operations and their rough meaning in the
 * case that the bitmap is a single unsigned long are thus:
 *
 * The generated code is more efficient when nbits is known at
 * compile-time and at most BITS_PER_LONG.
 *
 * ::
 *
 *  bitmap_zero(dst, nbits)                     *dst = 0UL
 *  bitmap_fill(dst, nbits)                     *dst = ~0UL
 *  bitmap_copy(dst, src, nbits)                *dst = *src
 *  bitmap_and(dst, src1, src2, nbits)          *dst = *src1 & *src2
 *  bitmap_or(dst, src1, src2, nbits)           *dst = *src1 | *src2
 *  bitmap_xor(dst, src1, src2, nbits)          *dst = *src1 ^ *src2
 *  bitmap_andnot(dst, src1, src2, nbits)       *dst = *src1 & ~(*src2)
 *  bitmap_complement(dst, src, nbits)          *dst = ~(*src)
 *  bitmap_equal(src1, src2, nbits)             Are *src1 and *src2 equal?
 *  bitmap_intersects(src1, src2, nbits)        Do *src1 and *src2 overlap?
 *  bitmap_subset(src1, src2, nbits)            Is *src1 a subset of *src2?
 *  bitmap_empty(src, nbits)                    Are all bits zero in *src?
 *  bitmap_full(src, nbits)                     Are all bits set in *src?
 *  bitmap_weight(src, nbits)                   Hamming Weight: number of set bits
 *  bitmap_weight_and(src1, src2, nbits)        Hamming Weight of and'ed bitmap
 *  bitmap_weight_andnot(src1, src2, nbits)     Hamming Weight of andnot'ed bitmap
 *  bitmap_set(dst, pos, nbits)                 Set specified bit area
 *  bitmap_clear(dst, pos, nbits)               Clear specified bit area
 *  bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
 *  bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off)  as above
 *  bitmap_shift_right(dst, src, n, nbits)      *dst = *src >> n
 *  bitmap_shift_left(dst, src, n, nbits)       *dst = *src << n
 *  bitmap_cut(dst, src, first, n, nbits)       Cut n bits from first, copy rest
 *  bitmap_replace(dst, old, new, mask, nbits)  *dst = (*old & ~(*mask)) | (*new & *mask)
 *  bitmap_scatter(dst, src, mask, nbits)        *dst = map(dense, sparse)(src)
 *  bitmap_gather(dst, src, mask, nbits)        *dst = map(sparse, dense)(src)
 *  bitmap_remap(dst, src, old, new, nbits)     *dst = map(old, new)(src)
 *  bitmap_bitremap(oldbit, old, new, nbits)    newbit = map(old, new)(oldbit)
 *  bitmap_onto(dst, orig, relmap, nbits)       *dst = orig relative to relmap
 *  bitmap_fold(dst, orig, sz, nbits)           dst bits = orig bits mod sz
 *  bitmap_parse(buf, buflen, dst, nbits)       Parse bitmap dst from kernel buf
 *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user buf
 *  bitmap_parselist(buf, dst, nbits)           Parse bitmap dst from kernel buf
 *  bitmap_parselist_user(buf, dst, nbits)      Parse bitmap dst from user buf
 *  bitmap_find_free_region(bitmap, bits, order)  Find and allocate bit region
 *  bitmap_release_region(bitmap, pos, order)   Free specified bit region
 *  bitmap_allocate_region(bitmap, pos, order)  Allocate specified bit region
 *  bitmap_from_arr32(dst, buf, nbits)          Copy nbits from u32[] buf to dst
 *  bitmap_from_arr64(dst, buf, nbits)          Copy nbits from u64[] buf to dst
 *  bitmap_to_arr32(buf, src, nbits)            Copy nbits from src to u32[] buf
 *  bitmap_to_arr64(buf, src, nbits)            Copy nbits from src to u64[] buf
 *  bitmap_get_value8(map, start)               Get 8bit value from map at start
 *  bitmap_set_value8(map, value, start)        Set 8bit value to map at start
 *  bitmap_read(map, start, nbits)              Read an nbits-sized value from
 *                                              map at start
 *  bitmap_write(map, value, start, nbits)      Write an nbits-sized value to
 *                                              map at start
 *
 * Note that bitmap_zero() and bitmap_fill() operate on whole unsigned
 * longs; that is, bits past the end of the bitmap, up to the next
 * unsigned long boundary, are zeroed or filled as well. Consider using
 * bitmap_clear() or bitmap_set() to zero or fill exactly the requested
 * bits.
 */

/**
 * DOC: bitmap bitops
 *
 * Also the following operations in asm/bitops.h apply to bitmaps.::
 *
 *  set_bit(bit, addr)                  *addr |= bit
 *  clear_bit(bit, addr)                *addr &= ~bit
 *  change_bit(bit, addr)               *addr ^= bit
 *  test_bit(bit, addr)                 Is bit set in *addr?
 *  test_and_set_bit(bit, addr)         Set bit and return old value
 *  test_and_clear_bit(bit, addr)       Clear bit and return old value
 *  test_and_change_bit(bit, addr)      Change bit and return old value
 *  find_first_zero_bit(addr, nbits)    Position first zero bit in *addr
 *  find_first_bit(addr, nbits)         Position first set bit in *addr
 *  find_next_zero_bit(addr, nbits, bit)
 *                                      Position next zero bit in *addr >= bit
 *  find_next_bit(addr, nbits, bit)     Position next set bit in *addr >= bit
 *  find_next_and_bit(addr1, addr2, nbits, bit)
 *                                      Same as find_next_bit, but in
 *                                      (*addr1 & *addr2)
 *
 */

/**
 * DOC: declare bitmap
 * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
 * to declare an array named 'name' of just enough unsigned longs to
 * contain all bit positions from 0 to 'bits' - 1.
 */

/*
 * Allocation and deallocation of bitmap.
 * Provided in lib/bitmap.c to avoid circular dependency.
 */
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node);
unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node);
void bitmap_free(const unsigned long *bitmap);

DEFINE_FREE(bitmap, unsigned long *, if (_T) bitmap_free(_T))

/* Managed variants of the above. */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags);
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags);

/*
 * lib/bitmap.c provides these functions:
 */

bool __bitmap_equal(const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int nbits);
bool __pure __bitmap_or_equal(const unsigned long *src1,
                              const unsigned long *src2,
                              const unsigned long *src3,
                              unsigned int nbits);
void __bitmap_complement(unsigned long *dst, const unsigned long *src,
                         unsigned int nbits);
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                          unsigned int shift, unsigned int nbits);
void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                         unsigned int shift, unsigned int nbits);
void bitmap_cut(unsigned long *dst, const unsigned long *src,
                unsigned int first, unsigned int cut, unsigned int nbits);
bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                 const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                 const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                  const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_replace(unsigned long *dst,
                      const unsigned long *old, const unsigned long *new,
                      const unsigned long *mask, unsigned int nbits);
bool __bitmap_intersects(const unsigned long *bitmap1,
                         const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_subset(const unsigned long *bitmap1,
                     const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
                                 const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
                                    const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_set(unsigned long *map, unsigned int start, int len);
void __bitmap_clear(unsigned long *map, unsigned int start, int len);

unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                             unsigned long size,
                                             unsigned long start,
                                             unsigned int nr,
                                             unsigned long align_mask,
                                             unsigned long align_offset);

/**
 * bitmap_find_next_zero_area - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of all zero areas this function finds is multiples of that
 * power of 2. A @align_mask of 0 means no alignment is required.
 */
static __always_inline
unsigned long bitmap_find_next_zero_area(unsigned long *map,
                                         unsigned long size,
                                         unsigned long start,
                                         unsigned int nr,
                                         unsigned long align_mask)
{
        /* Same search as the _off() variant, with no alignment offset. */
        const unsigned long no_align_offset = 0;

        return bitmap_find_next_zero_area_off(map, size, start, nr,
                                              align_mask, no_align_offset);
}

void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new, unsigned int nbits);
int bitmap_bitremap(int oldbit,
                const unsigned long *old, const unsigned long *new, int bits);
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                const unsigned long *relmap, unsigned int bits);
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                unsigned int sz, unsigned int nbits);

/*
 * BITMAP_FIRST_WORD_MASK(start): word mask with every bit from position
 * (start % BITS_PER_LONG) upwards set and all lower bits clear.
 *
 * BITMAP_LAST_WORD_MASK(nbits): word mask with the low
 * (nbits % BITS_PER_LONG) bits set; when @nbits is a multiple of
 * BITS_PER_LONG the shift count is 0 and the mask is all ones.
 */
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

/* Storage size in bytes of an @nbits bitmap, rounded up to whole longs. */
#define bitmap_size(nbits)        (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)

/*
 * Clear a bitmap of @nbits bits. Whole unsigned longs are written, so bits
 * between @nbits and the next long boundary are cleared as well.
 */
static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
        /* Small compile-time-constant size: a single word store suffices. */
        if (small_const_nbits(nbits)) {
                *dst = 0;
                return;
        }

        memset(dst, 0, bitmap_size(nbits));
}

/*
 * Set every bit of a bitmap of @nbits bits. Whole unsigned longs are
 * written, so bits between @nbits and the next long boundary are set too.
 */
static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
{
        /* Small compile-time-constant size: a single word store suffices. */
        if (small_const_nbits(nbits)) {
                *dst = ~0UL;
                return;
        }

        memset(dst, 0xff, bitmap_size(nbits));
}

/*
 * Copy an @nbits bitmap from @src to @dst. Whole unsigned longs are copied,
 * including any bits between @nbits and the next long boundary.
 */
static __always_inline
void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
        /* Small compile-time-constant size: copy the single word directly. */
        if (small_const_nbits(nbits)) {
                *dst = *src;
                return;
        }

        memcpy(dst, src, bitmap_size(nbits));
}

/*
 * Copy an @nbits bitmap from @src to @dst, then clear the bits of the last
 * word that lie beyond @nbits.
 */
static __always_inline
void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
        const unsigned int tail_bits = nbits % BITS_PER_LONG;

        bitmap_copy(dst, src, nbits);

        /* A partially used last word exists only when nbits is unaligned. */
        if (tail_bits != 0)
                dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits);
}

/*
 * Copy the first @count bits of @from into @to and zero the rest of @to,
 * which is treated as a bitmap of @size bits. Bits of the last copied word
 * that lie beyond @count are cleared.
 */
static inline void bitmap_copy_and_extend(unsigned long *to,
                                          const unsigned long *from,
                                          unsigned int count, unsigned int size)
{
        const unsigned int nwords = BITS_TO_LONGS(count);
        const size_t copied_bytes = nwords * sizeof(long);

        memcpy(to, from, copied_bytes);

        /* Drop stray bits copied along with the partially used last word. */
        if (count % BITS_PER_LONG)
                to[nwords - 1] &= BITMAP_LAST_WORD_MASK(count);

        /* Zero whatever remains of the destination after the copied words. */
        memset(to + nwords, 0, bitmap_size(size) - copied_bytes);
}

/*
 * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64
 * machines the order of the hi and lo parts of numbers matches the bitmap
 * structure. In both cases no conversion is needed when copying data from/to
 * arrays of u32. But in the LE64 case, the typecast in
 * bitmap_copy_clear_tail() may lead to out-of-bounds access. To avoid that,
 * neither the LE nor the BE variant of 64-bit architectures uses
 * bitmap_copy_clear_tail().
 */
#if BITS_PER_LONG == 64
/* 64-bit longs: real functions, only declared here. */
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
                                                        unsigned int nbits);
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
                                                        unsigned int nbits);
#else
/*
 * Otherwise the u32 array is reinterpreted as an array of unsigned long and
 * copied with bitmap_copy_clear_tail(), which also clears the bits beyond
 * @nbits in the last destination word.
 */
#define bitmap_from_arr32(bitmap, buf, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *) (bitmap),        \
                        (const unsigned long *) (buf), (nbits))
#define bitmap_to_arr32(buf, bitmap, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *) (buf),                \
                        (const unsigned long *) (bitmap), (nbits))
#endif

/*
 * On 64-bit systems bitmaps are represented as u64 arrays internally. So,
 * the conversion is not needed when copying data from/to arrays of u64.
 */
#if BITS_PER_LONG == 32
/* 32-bit longs: real functions, only declared here. */
void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits);
void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits);
#else
/*
 * Otherwise the u64 array is reinterpreted as an array of unsigned long and
 * copied with bitmap_copy_clear_tail(), which also clears the bits beyond
 * @nbits in the last destination word.
 */
#define bitmap_from_arr64(bitmap, buf, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *)(bitmap), (const unsigned long *)(buf), (nbits))
#define bitmap_to_arr64(buf, bitmap, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits))
#endif

/*
 * *dst = *src1 & *src2 over @nbits bits.
 *
 * In the single-word case the return value is true when the result,
 * limited to @nbits bits, is non-zero; otherwise it is whatever
 * __bitmap_and() returns.
 */
static __always_inline
bool bitmap_and(unsigned long *dst, const unsigned long *src1,
                const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_and(dst, src1, src2, nbits);

        /* Single word: mask off the bits beyond nbits before storing. */
        *dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits);
        return *dst != 0;
}

/* *dst = *src1 | *src2 over @nbits bits. */
static __always_inline
void bitmap_or(unsigned long *dst, const unsigned long *src1,
               const unsigned long *src2, unsigned int nbits)
{
        /* Single word: the whole word is OR'ed, tail bits are not masked. */
        if (small_const_nbits(nbits)) {
                *dst = *src1 | *src2;
                return;
        }

        __bitmap_or(dst, src1, src2, nbits);
}

/* *dst = *src1 ^ *src2 over @nbits bits. */
static __always_inline
void bitmap_xor(unsigned long *dst, const unsigned long *src1,
                const unsigned long *src2, unsigned int nbits)
{
        /* Single word: the whole word is XOR'ed, tail bits are not masked. */
        if (small_const_nbits(nbits)) {
                *dst = *src1 ^ *src2;
                return;
        }

        __bitmap_xor(dst, src1, src2, nbits);
}

/*
 * *dst = *src1 & ~(*src2) over @nbits bits.
 *
 * In the single-word case the return value is true when the result,
 * limited to @nbits bits, is non-zero; otherwise it is whatever
 * __bitmap_andnot() returns.
 */
static __always_inline
bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
                   const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_andnot(dst, src1, src2, nbits);

        /* Single word: mask off the bits beyond nbits before storing. */
        *dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits);
        return *dst != 0;
}

/* *dst = ~(*src) over @nbits bits. */
static __always_inline
void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
        /* Single word: the whole word is inverted, tail bits included. */
        if (small_const_nbits(nbits)) {
                *dst = ~(*src);
                return;
        }

        __bitmap_complement(dst, src, nbits);
}

/*
 * Bit alignment that bitmap_equal(), bitmap_set() and bitmap_clear() require
 * before they use their byte-wise memcmp()/memset() paths: 8 bits when
 * __LITTLE_ENDIAN is defined, a full unsigned long otherwise.
 */
#ifdef __LITTLE_ENDIAN
#define BITMAP_MEM_ALIGNMENT 8
#else
#define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long))
#endif
/* Mask selecting the bits of a bit count/offset below that alignment. */
#define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1)

/* Return true when the first @nbits bits of @src1 and @src2 are identical. */
static __always_inline
bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                const unsigned long diff = *src1 ^ *src2;

                /* Only differences inside the first nbits bits count. */
                return (diff & BITMAP_LAST_WORD_MASK(nbits)) == 0;
        }

        /* Alignment of nbits known at compile time: compare whole bytes. */
        if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
                return memcmp(src1, src2, nbits / 8) == 0;

        return __bitmap_equal(src1, src2, nbits);
}

/**
 * bitmap_or_equal - Check whether the or of two bitmaps is equal to a third
 * @src1:        Pointer to bitmap 1
 * @src2:        Pointer to bitmap 2 will be or'ed with bitmap 1
 * @src3:        Pointer to bitmap 3. Compare to the result of *@src1 | *@src2
 * @nbits:        number of bits in each of these bitmaps
 *
 * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise
 */
static __always_inline
bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2,
                     const unsigned long *src3, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                /* Bits where (*src1 | *src2) disagrees with *src3. */
                const unsigned long mismatch = (*src1 | *src2) ^ *src3;

                return (mismatch & BITMAP_LAST_WORD_MASK(nbits)) == 0;
        }

        return __bitmap_or_equal(src1, src2, src3, nbits);
}

/* Return true when @src1 and @src2 share a set bit within @nbits bits. */
static __always_inline
bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_intersects(src1, src2, nbits);

        /* Single word: look for a common bit among the first nbits bits. */
        return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
}

/* Return true when, within @nbits bits, each bit set in @src1 is set in @src2. */
static __always_inline
bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_subset(src1, src2, nbits);

        /* Single word: no bit of *src1 may be missing from *src2. */
        return ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)) == 0;
}

/*
 * Return true when none of the first @nbits bits of @src is set.
 *
 * @nbits is spelled 'unsigned int' like every other helper in this header;
 * the type is the same as the former bare 'unsigned'.
 */
static __always_inline
bool bitmap_empty(const unsigned long *src, unsigned int nbits)
{
        /* Single word: test the first nbits bits with one mask. */
        if (small_const_nbits(nbits))
                return !(*src & BITMAP_LAST_WORD_MASK(nbits));

        /* find_first_bit() yields nbits when it finds no set bit. */
        return find_first_bit(src, nbits) == nbits;
}

/* Return true when all of the first @nbits bits of @src are set. */
static __always_inline
bool bitmap_full(const unsigned long *src, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                /* Bits that are clear in *src, limited to the first nbits. */
                const unsigned long missing = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);

                return missing == 0;
        }

        return find_first_zero_bit(src, nbits) == nbits;
}

/* Count the set bits among the first @nbits bits of @src. */
static __always_inline
unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_weight(src, nbits);

        /* Single word: count bits after masking off those beyond nbits. */
        return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
}

/* Count the bits set in both @src1 and @src2 within the first @nbits bits. */
static __always_inline
unsigned long bitmap_weight_and(const unsigned long *src1,
                                const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_weight_and(src1, src2, nbits);

        /* Single word: AND, mask off bits beyond nbits, then count. */
        return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits));
}

/* Count the bits set in @src1 but not in @src2 within the first @nbits bits. */
static __always_inline
unsigned long bitmap_weight_andnot(const unsigned long *src1,
                                   const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_weight_andnot(src1, src2, nbits);

        /* Single word: AND-NOT, mask off bits beyond nbits, then count. */
        return hweight_long(*src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits));
}

/* Set @nbits bits of @map, starting at bit @start. */
static __always_inline
void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits)
{
        /* Exactly one bit, known at compile time: one __set_bit() call. */
        if (__builtin_constant_p(nbits) && nbits == 1) {
                __set_bit(start, map);
                return;
        }

        /* Small compile-time-constant end position: update the first word. */
        if (small_const_nbits(start + nbits)) {
                *map |= GENMASK(start + nbits - 1, start);
                return;
        }

        /* Alignment of start and nbits known at compile time: fill bytes. */
        if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
            IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
            __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) {
                memset((char *)map + start / 8, 0xff, nbits / 8);
                return;
        }

        __bitmap_set(map, start, nbits);
}

/* Clear @nbits bits of @map, starting at bit @start. */
static __always_inline
void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits)
{
        /* Exactly one bit, known at compile time: one __clear_bit() call. */
        if (__builtin_constant_p(nbits) && nbits == 1) {
                __clear_bit(start, map);
                return;
        }

        /* Small compile-time-constant end position: update the first word. */
        if (small_const_nbits(start + nbits)) {
                *map &= ~GENMASK(start + nbits - 1, start);
                return;
        }

        /* Alignment of start and nbits known at compile time: zero bytes. */
        if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
            IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
            __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) {
                memset((char *)map + start / 8, 0, nbits / 8);
                return;
        }

        __bitmap_clear(map, start, nbits);
}

/* *dst = *src >> @shift, treating both as bitmaps of @nbits bits. */
static __always_inline
void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                        unsigned int shift, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                /* Drop source bits beyond nbits so they cannot shift in. */
                const unsigned long valid = *src & BITMAP_LAST_WORD_MASK(nbits);

                *dst = valid >> shift;
                return;
        }

        __bitmap_shift_right(dst, src, shift, nbits);
}

/* *dst = *src << @shift, treating both as bitmaps of @nbits bits. */
static __always_inline
void bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                       unsigned int shift, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                /* Bits shifted past position nbits - 1 are discarded. */
                const unsigned long shifted = *src << shift;

                *dst = shifted & BITMAP_LAST_WORD_MASK(nbits);
                return;
        }

        __bitmap_shift_left(dst, src, shift, nbits);
}

/*
 * *dst = (*old & ~(*mask)) | (*new & *mask): take bits from @new where
 * @mask is set and from @old everywhere else.
 */
static __always_inline
void bitmap_replace(unsigned long *dst,
                    const unsigned long *old,
                    const unsigned long *new,
                    const unsigned long *mask,
                    unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_replace(dst, old, new, mask, nbits);
                return;
        }

        /* Single word: merge the two sources in place. */
        *dst = (*old & ~(*mask)) | (*new & *mask);
}

/**
 * bitmap_scatter - Scatter a bitmap according to the given mask
 * @dst: scattered bitmap
 * @src: gathered bitmap
 * @mask: mask representing bits to assign to in the scattered bitmap
 * @nbits: number of bits in each of these bitmaps
 *
 * Scatters bitmap with sequential bits according to the given @mask.
 *
 * Example:
 * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302.
 *
 * Or in binary form
 * @src                        @mask                        @dst
 * 0000000001011010        0001001100010011        0000001100000010
 *
 * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12)
 *
 * A more 'visual' description of the operation::
 *
 *        src:  0000000001011010
 *                        ||||||
 *                 +------+|||||
 *                 |  +----+||||
 *                 |  |+----+|||
 *                 |  ||   +-+||
 *                 |  ||   |  ||
 *        mask: ...v..vv...v..vv
 *              ...0..11...0..10
 *        dst:  0000001100000010
 *
 * A relationship exists between bitmap_scatter() and bitmap_gather(). See
 * bitmap_gather() for the bitmap gather detailed operations. TL;DR:
 * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation.
 */
static __always_inline
void bitmap_scatter(unsigned long *dst, const unsigned long *src,
                    const unsigned long *mask, unsigned int nbits)
{
        unsigned int n = 0;
        unsigned int bit;

        bitmap_zero(dst, nbits);

        for_each_set_bit(bit, mask, nbits)
                __assign_bit(bit, dst, test_bit(n++, src));
}

/**
 * bitmap_gather - Gather a bitmap according to given mask
 * @dst: gathered bitmap
 * @src: scattered bitmap
 * @mask: mask representing bits to extract from in the scattered bitmap
 * @nbits: number of bits in each of these bitmaps
 *
 * Gathers bitmap with sparse bits according to the given @mask.
 *
 * Example:
 * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a.
 *
 * Or in binary form
 * @src                        @mask                        @dst
 * 0000001100000010        0001001100010011        0000000000011010
 *
 * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5)
 *
 * A more 'visual' description of the operation::
 *
 *        mask: ...v..vv...v..vv
 *        src:  0000001100000010
 *                 ^  ^^   ^   0
 *                 |  ||   |  10
 *                 |  ||   > 010
 *                 |  |+--> 1010
 *                 |  +--> 11010
 *                 +----> 011010
 *        dst:  0000000000011010
 *
 * A relationship exists between bitmap_gather() and bitmap_scatter(). See
 * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR:
 * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation.
 *
 * Suppose scattered is computed using bitmap_scatter(scattered, src, mask, n).
 * The operation bitmap_gather(result, scattered, mask, n) leads to a result
 * equal or equivalent to src.
 *
 * The result can be 'equivalent' because bitmap_scatter() and bitmap_gather()
 * are not bijective.
 * The result and src values are equivalent in the sense that a call to
 * bitmap_scatter(res, src, mask, n) and a call to
 * bitmap_scatter(res, result, mask, n) will lead to the same res value.
 */
static __always_inline
void bitmap_gather(unsigned long *dst, const unsigned long *src,
                   const unsigned long *mask, unsigned int nbits)
{
        unsigned int n = 0;
        unsigned int bit;

        bitmap_zero(dst, nbits);

        for_each_set_bit(bit, mask, nbits)
                __assign_bit(n++, dst, test_bit(bit, src));
}

/*
 * Locate the next run of set bits at or after *@rs, searching up to @end.
 * On return *@rs is the result of find_next_bit() from the old *@rs and
 * *@re is the result of find_next_zero_bit() from *@rs + 1.
 */
static __always_inline
void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs,
                            unsigned int *re, unsigned int end)
{
        const unsigned int region_start = find_next_bit(bitmap, end, *rs);

        *rs = region_start;
        *re = find_next_zero_bit(bitmap, end, region_start + 1);
}

/**
 * bitmap_release_region - release allocated bitmap region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @pos: beginning of bit region to release
 *        @order: region size (log base 2 of number of bits) to release
 *
 * This is the complement to bitmap_find_free_region() and releases
 * the found region (by clearing it in the bitmap).
 */
static __always_inline
void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
{
        /* A region of the given order spans BIT(order) bits. */
        const unsigned int region_bits = BIT(order);

        bitmap_clear(bitmap, pos, region_bits);
}

/**
 * bitmap_allocate_region - allocate bitmap region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @pos: beginning of bit region to allocate
 *        @order: region size (log base 2 of number of bits) to allocate
 *
 * Allocate (set bits in) a specified region of a bitmap.
 *
 * Returns: 0 on success, or %-EBUSY if specified region wasn't
 * free (not all bits were zero).
 */
static __always_inline
int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
{
        const unsigned int region_bits = BIT(order);
        const unsigned int region_end = pos + region_bits;

        /* Any set bit inside [pos, region_end) means the region is taken. */
        if (find_next_bit(bitmap, region_end, pos) < region_end)
                return -EBUSY;

        bitmap_set(bitmap, pos, region_bits);
        return 0;
}

/**
 * bitmap_find_free_region - find a contiguous aligned mem region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @bits: number of bits in the bitmap
 *        @order: region size (log base 2 of number of bits) to find
 *
 * Scan @bitmap in steps of 2^@order bits, starting at bit 0, and claim
 * the first step-sized region that is entirely free (all zero) by setting
 * its bits.  Only regions that fit completely below @bits are tried, so
 * every candidate is naturally aligned to its own size.
 *
 * Returns: the bit offset in bitmap of the allocated region,
 * or %-ENOMEM if no free region was found.
 */
static __always_inline
int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
{
        const unsigned int step = BIT(order);        /* region size in bits */
        unsigned int pos = 0;

        while (pos + step <= bits) {
                if (bitmap_allocate_region(bitmap, pos, order) == 0)
                        return pos;
                pos += step;
        }

        return -ENOMEM;
}

/**
 * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
 * @n: u64 value
 *
 * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit
 * integers in 32-bit environment, and 64-bit integers in 64-bit one.
 *
 * There are four combinations of endianness and length of the word in linux
 * ABIs: LE64, BE64, LE32 and BE32.
 *
 * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in
 * bitmaps and therefore don't require any special handling.
 *
 * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory
 * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the
 * other hand is represented as an array of 32-bit words and the position of
 * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that
 * word.  For example, bit #42 is located at 10th position of 2nd word.
 * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit
 * values in memory as it usually does. But for BE we need to swap hi and lo
 * words manually.
 *
 * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and
 * lo parts of u64.  For LE32 it does nothing, and for BE environment it swaps
 * hi and lo words, as is expected by bitmap.
 *
 * Note that when __BITS_PER_LONG is not 64 the macro expands to TWO
 * comma-separated unsigned long expressions (low 32 bits first, then the
 * high 32 bits), so it is meant for contexts where such a list is valid,
 * e.g. inside an array initializer.  When __BITS_PER_LONG is 64 it expands
 * to the single expression (n).
 */
#if __BITS_PER_LONG == 64
#define BITMAP_FROM_U64(n) (n)
#else
#define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \
                                ((unsigned long) ((u64)(n) >> 32))
#endif

/**
 * bitmap_from_u64 - Check and swap words within u64.
 *  @dst:  destination bitmap
 *  @mask: source u64 value
 *
 * Stores the 64 bits of @mask into @dst via bitmap_from_arr64().
 *
 * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]``
 * to read u64 mask, we will get the wrong word.
 * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits,
 * but we expect the lower 32-bits of u64.
 */
static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask)
{
        const u64 *src = &mask;

        bitmap_from_arr64(dst, src, 64);
}

/**
 * bitmap_read - read a value of n-bits from the memory region
 * @map: address to the bitmap memory region
 * @start: bit offset of the n-bit value
 * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG
 *
 * The value may straddle two consecutive words of @map; in that case the
 * low part is taken from the word containing @start and the high part from
 * the following word.
 *
 * Returns: value of @nbits bits located at the @start bit offset within the
 * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return
 * value is not meaningful (this implementation returns 0).
 */
static __always_inline
unsigned long bitmap_read(const unsigned long *map, unsigned long start, unsigned long nbits)
{
        const size_t word = BIT_WORD(start);
        const unsigned long shift = start % BITS_PER_LONG;
        const unsigned long room = BITS_PER_LONG - shift; /* bits left in word */
        unsigned long lo, hi;

        if (unlikely(nbits == 0 || nbits > BITS_PER_LONG))
                return 0;

        /* The whole value lives inside a single word. */
        if (nbits <= room)
                return BITMAP_LAST_WORD_MASK(nbits) & (map[word] >> shift);

        /* Value spans two words: stitch the two halves together. */
        lo = (map[word] & BITMAP_FIRST_WORD_MASK(start)) >> shift;
        hi = (map[word + 1] & BITMAP_LAST_WORD_MASK(start + nbits)) << room;

        return lo | hi;
}

/**
 * bitmap_write - write n-bit value within a memory region
 * @map: address to the bitmap memory region
 * @value: value to write, clamped to nbits
 * @start: bit offset of the n-bit value
 * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG.
 *
 * bitmap_write() behaves as-if implemented as @nbits calls of __assign_bit(),
 * i.e. bits beyond @nbits are ignored:
 *
 *   for (bit = 0; bit < nbits; bit++)
 *           __assign_bit(start + bit, map, value & BIT(bit));
 *
 * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed.
 */
static __always_inline
void bitmap_write(unsigned long *map, unsigned long value,
                  unsigned long start, unsigned long nbits)
{
        unsigned long field_mask;
        unsigned long shift;
        unsigned long room;
        size_t word;

        if (unlikely(nbits == 0 || nbits > BITS_PER_LONG))
                return;

        field_mask = BITMAP_LAST_WORD_MASK(nbits);
        value &= field_mask;
        word = BIT_WORD(start);
        shift = start % BITS_PER_LONG;
        room = BITS_PER_LONG - shift;        /* bits left in the first word */

        if (nbits <= room) {
                /* Value fits in one word: clear the field, then fill it. */
                map[word] &= ~(field_mask << shift);
                map[word] |= value << shift;
                return;
        }

        /* Low part goes into the tail of the first word ... */
        map[word] &= ~BITMAP_FIRST_WORD_MASK(start);
        map[word] |= value << shift;

        /* ... and the remaining high part into the head of the next one. */
        map[word + 1] &= BITMAP_FIRST_WORD_MASK(start + nbits);
        map[word + 1] |= value >> room;
}

/*
 * Byte-sized convenience wrappers: read or write a BITS_PER_BYTE wide
 * value at bit offset @start using bitmap_read() / bitmap_write().
 */
#define bitmap_get_value8(map, start)                        \
        bitmap_read(map, start, BITS_PER_BYTE)
#define bitmap_set_value8(map, value, start)                \
        bitmap_write(map, value, start, BITS_PER_BYTE)

#endif /* __ASSEMBLY__ */

#endif /* __LINUX_BITMAP_H */














  319 

























































  319 







































































  319 















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM net

#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NET_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/tracepoint.h>

/*
 * net_dev_start_xmit - snapshot of an skb together with the name of @dev.
 *
 * Records the device name, queue mapping, skb address, VLAN state
 * (tagged flag, proto and tci), protocol, ip_summed, len/data_len, the
 * network header offset, the transport header offset (stored as 0 when
 * the transport header was not set, see transport_offset_valid) and the
 * tx_flags / gso_size / gso_segs / gso_type taken from skb_shinfo().
 * vlan_proto and protocol are converted with ntohs() before being stored.
 */
TRACE_EVENT(net_dev_start_xmit,

        TP_PROTO(const struct sk_buff *skb, const struct net_device *dev),

        TP_ARGS(skb, dev),

        TP_STRUCT__entry(
                __string(        name,                        dev->name        )
                __field(        u16,                        queue_mapping        )
                __field(        const void *,                skbaddr                )
                __field(        bool,                        vlan_tagged        )
                __field(        u16,                        vlan_proto        )
                __field(        u16,                        vlan_tci        )
                __field(        u16,                        protocol        )
                __field(        u8,                        ip_summed        )
                __field(        unsigned int,                len                )
                __field(        unsigned int,                data_len        )
                __field(        int,                        network_offset        )
                __field(        bool,                        transport_offset_valid)
                __field(        int,                        transport_offset)
                __field(        u8,                        tx_flags        )
                __field(        u16,                        gso_size        )
                __field(        u16,                        gso_segs        )
                __field(        u16,                        gso_type        )
        ),

        TP_fast_assign(
                __assign_str(name);
                __entry->queue_mapping = skb->queue_mapping;
                __entry->skbaddr = skb;
                __entry->vlan_tagged = skb_vlan_tag_present(skb);
                __entry->vlan_proto = ntohs(skb->vlan_proto);
                __entry->vlan_tci = skb_vlan_tag_get(skb);
                __entry->protocol = ntohs(skb->protocol);
                __entry->ip_summed = skb->ip_summed;
                __entry->len = skb->len;
                __entry->data_len = skb->data_len;
                __entry->network_offset = skb_network_offset(skb);
                __entry->transport_offset_valid =
                        skb_transport_header_was_set(skb);
                __entry->transport_offset = skb_transport_header_was_set(skb) ?
                        skb_transport_offset(skb) : 0;
                __entry->tx_flags = skb_shinfo(skb)->tx_flags;
                __entry->gso_size = skb_shinfo(skb)->gso_size;
                __entry->gso_segs = skb_shinfo(skb)->gso_segs;
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),

        TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
                  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
                  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
                  __entry->protocol, __entry->ip_summed, __entry->len,
                  __entry->data_len,
                  __entry->network_offset, __entry->transport_offset_valid,
                  __entry->transport_offset, __entry->tx_flags,
                  __entry->gso_size, __entry->gso_segs, __entry->gso_type)
);

/*
 * net_dev_xmit - records the skb address, a length, a return code and
 * the device name.
 *
 * The length is the caller-supplied @skb_len argument; the skb itself is
 * not dereferenced by this event, only its address is stored.
 */
TRACE_EVENT(net_dev_xmit,

        TP_PROTO(struct sk_buff *skb,
                 int rc,
                 struct net_device *dev,
                 unsigned int skb_len),

        TP_ARGS(skb, rc, dev, skb_len),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        unsigned int,        len                )
                __field(        int,                rc                )
                __string(        name,                dev->name        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = skb_len;
                __entry->rc = rc;
                __assign_str(name);
        ),

        TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
                __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
);

/*
 * net_dev_xmit_timeout - records the device name, the driver name as
 * returned by netdev_drivername(dev), and the given queue index.
 */
TRACE_EVENT(net_dev_xmit_timeout,

        TP_PROTO(struct net_device *dev,
                 int queue_index),

        TP_ARGS(dev, queue_index),

        TP_STRUCT__entry(
                __string(        name,                dev->name        )
                __string(        driver,                netdev_drivername(dev))
                __field(        int,                queue_index        )
        ),

        TP_fast_assign(
                __assign_str(name);
                __assign_str(driver);
                __entry->queue_index = queue_index;
        ),

        TP_printk("dev=%s driver=%s queue=%d",
                __get_str(name), __get_str(driver), __entry->queue_index)
);

/*
 * net_dev_template - minimal per-skb event class.
 *
 * Records the skb address, skb->len and the name of skb->dev.  Note that
 * skb->dev is dereferenced to obtain the name.
 */
DECLARE_EVENT_CLASS(net_dev_template,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        unsigned int,        len                )
                __string(        name,                skb->dev->name        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = skb->len;
                __assign_str(name);
        ),

        TP_printk("dev=%s skbaddr=%p len=%u",
                __get_str(name), __entry->skbaddr, __entry->len)
)

/*
 * Events built on the net_dev_template class: net_dev_queue,
 * netif_receive_skb and netif_rx.  They share the class's prototype,
 * recorded fields and output format and differ only by name.
 */
DEFINE_EVENT(net_dev_template, net_dev_queue,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

DEFINE_EVENT(net_dev_template, netif_receive_skb,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

DEFINE_EVENT(net_dev_template, netif_rx,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * net_dev_rx_verbose_template - detailed per-skb event class.
 *
 * Records the name of skb->dev plus a wide set of skb fields: napi_id,
 * queue mapping, skb address, VLAN state, protocol, ip_summed, hash and
 * l4_hash, len/data_len/truesize, MAC header information, and nr_frags /
 * gso_size / gso_type taken from skb_shinfo().
 *
 * napi_id is stored as 0 when CONFIG_NET_RX_BUSY_POLL is not enabled.
 * mac_header is stored as skb_mac_header(skb) - skb->data, i.e. relative
 * to skb->data; mac_header_valid tells whether the MAC header was set.
 */
DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __string(        name,                        skb->dev->name        )
                __field(        unsigned int,                napi_id                )
                __field(        u16,                        queue_mapping        )
                __field(        const void *,                skbaddr                )
                __field(        bool,                        vlan_tagged        )
                __field(        u16,                        vlan_proto        )
                __field(        u16,                        vlan_tci        )
                __field(        u16,                        protocol        )
                __field(        u8,                        ip_summed        )
                __field(        u32,                        hash                )
                __field(        bool,                        l4_hash                )
                __field(        unsigned int,                len                )
                __field(        unsigned int,                data_len        )
                __field(        unsigned int,                truesize        )
                __field(        bool,                        mac_header_valid)
                __field(        int,                        mac_header        )
                __field(        unsigned char,                nr_frags        )
                __field(        u16,                        gso_size        )
                __field(        u16,                        gso_type        )
        ),

        TP_fast_assign(
                __assign_str(name);
#ifdef CONFIG_NET_RX_BUSY_POLL
                __entry->napi_id = skb->napi_id;
#else
                __entry->napi_id = 0;
#endif
                __entry->queue_mapping = skb->queue_mapping;
                __entry->skbaddr = skb;
                __entry->vlan_tagged = skb_vlan_tag_present(skb);
                __entry->vlan_proto = ntohs(skb->vlan_proto);
                __entry->vlan_tci = skb_vlan_tag_get(skb);
                __entry->protocol = ntohs(skb->protocol);
                __entry->ip_summed = skb->ip_summed;
                __entry->hash = skb->hash;
                __entry->l4_hash = skb->l4_hash;
                __entry->len = skb->len;
                __entry->data_len = skb->data_len;
                __entry->truesize = skb->truesize;
                __entry->mac_header_valid = skb_mac_header_was_set(skb);
                __entry->mac_header = skb_mac_header(skb) - skb->data;
                __entry->nr_frags = skb_shinfo(skb)->nr_frags;
                __entry->gso_size = skb_shinfo(skb)->gso_size;
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),

        TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
                  __get_str(name), __entry->napi_id, __entry->queue_mapping,
                  __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
                  __entry->vlan_tci, __entry->protocol, __entry->ip_summed,
                  __entry->hash, __entry->l4_hash, __entry->len,
                  __entry->data_len, __entry->truesize,
                  __entry->mac_header_valid, __entry->mac_header,
                  __entry->nr_frags, __entry->gso_size, __entry->gso_type)
);

/*
 * "*_entry" events built on the net_dev_rx_verbose_template class:
 * napi_gro_frags_entry, napi_gro_receive_entry, netif_receive_skb_entry,
 * netif_receive_skb_list_entry and netif_rx_entry.  All take a const skb
 * and record the same verbose field set.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * net_dev_rx_exit_template - event class that records a single integer
 * return value and prints it as "ret=<value>".
 */
DECLARE_EVENT_CLASS(net_dev_rx_exit_template,

        TP_PROTO(int ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(int,        ret)
        ),

        TP_fast_assign(
                __entry->ret = ret;
        ),

        TP_printk("ret=%d", __entry->ret)
);

/*
 * "*_exit" events built on the net_dev_rx_exit_template class:
 * napi_gro_frags_exit, napi_gro_receive_exit, netif_receive_skb_exit,
 * netif_rx_exit and netif_receive_skb_list_exit.  Each records only the
 * integer @ret it is given.
 */
DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

#endif /* _TRACE_NET_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * fs/kernfs/kernfs-internal.h - kernfs internal header file
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
 */

#ifndef __KERNFS_INTERNAL_H
#define __KERNFS_INTERNAL_H

#include <linux/lockdep.h>
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>

#include <linux/kernfs.h>
#include <linux/fs_context.h>

/*
 * Attribute storage used by kernfs: a uid/gid pair, three timestamps and
 * an xattr container with two atomic counters for user xattrs.
 * NOTE(review): how this is attached to a kernfs_node and when it is
 * allocated is not visible from this header - see the users in inode.c.
 */
struct kernfs_iattrs {
        kuid_t                        ia_uid;
        kgid_t                        ia_gid;
        struct timespec64        ia_atime;
        struct timespec64        ia_mtime;
        struct timespec64        ia_ctime;

        struct simple_xattrs        xattrs;
        /* presumably the number / total size of user.* xattrs - confirm in inode.c */
        atomic_t                nr_user_xattrs;
        atomic_t                user_xattr_size;
};

/*
 * Per-hierarchy root object.  A kernfs_node reaches it through dir.root
 * of itself or of its parent (see kernfs_root() below).  Besides the
 * published fields it carries the inode-number IDR and the locks used by
 * kernfs proper.
 */
struct kernfs_root {
        /* published fields */
        struct kernfs_node        *kn;
        unsigned int                flags;        /* KERNFS_ROOT_* flags */

        /* private fields, do not use outside kernfs proper */
        struct idr                ino_idr;
        spinlock_t                kernfs_idr_lock;        /* root->ino_idr */
        u32                        last_id_lowbits;
        u32                        id_highbits;
        struct kernfs_syscall_ops *syscall_ops;

        /* list of kernfs_super_info of this root, protected by kernfs_rwsem */
        struct list_head        supers;

        wait_queue_head_t        deactivate_waitq;
        /* checked by kernfs_root_is_locked() */
        struct rw_semaphore        kernfs_rwsem;
        struct rw_semaphore        kernfs_iattr_rwsem;
        struct rw_semaphore        kernfs_supers_rwsem;

        /* kn->parent and kn->name */
        rwlock_t                kernfs_rename_lock;

        struct rcu_head                rcu;
};

/*
 * +1 to avoid triggering overflow warning when negating it:
 * -(INT_MIN + 1) == INT_MAX is representable, whereas -INT_MIN is not.
 */
#define KN_DEACTIVATED_BIAS                (INT_MIN + 1)

/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */

/**
 * kernfs_root - find out the kernfs_root a kernfs_node belongs to
 * @kn: kernfs_node of interest
 *
 * The root pointer is stored in the directory part of a node.  If @kn has
 * a parent, the parent is always a directory and is used for the lookup;
 * otherwise @kn itself is a directory.  The parent pointer is read under
 * an RCU read-side guard.
 *
 * Return: the kernfs_root @kn belongs to.
 */
static inline struct kernfs_root *kernfs_root(const struct kernfs_node *kn)
{
        const struct kernfs_node *parent;

        guard(rcu)();
        parent = rcu_dereference(kn->__parent);

        return parent ? parent->dir.root : kn->dir.root;
}

/*
 * mount.c
 */
/*
 * Per-super_block kernfs data; retrieved from a super_block with
 * kernfs_info().
 */
struct kernfs_super_info {
        struct super_block        *sb;

        /*
         * The root associated with this super_block.  Each super_block is
         * identified by the root and ns it's associated with.
         */
        struct kernfs_root        *root;

        /*
         * Each sb is associated with one namespace tag, currently the
         * network namespace of the task which mounted this kernfs
         * instance.  If multiple tags become necessary, make the following
         * an array and compare kernfs_node tag against every entry.
         */
        const void                *ns;

        /*
         * anchored at kernfs_root->supers, protected by kernfs_rwsem
         * NOTE(review): struct kernfs_root also has kernfs_supers_rwsem;
         * confirm against mount.c which lock actually guards this list.
         */
        struct list_head        node;
};
/*
 * kernfs_info - return the struct kernfs_super_info stored in the
 * s_fs_info field of super_block pointer @SB.
 *
 * @SB is parenthesized inside the expansion so that any pointer-valued
 * expression (e.g. "sb + 1" or "&sb_obj") can be passed, not just a plain
 * identifier.
 */
#define kernfs_info(SB) ((struct kernfs_super_info *)((SB)->s_fs_info))

static inline bool kernfs_root_is_locked(const struct kernfs_node *kn)
{
        return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem);
}

static inline bool kernfs_rename_is_locked(const struct kernfs_node *kn)
{
        return lockdep_is_held(&kernfs_root(kn)->kernfs_rename_lock);
}

/*
 * Fetch kn->name through rcu_dereference_check(); holding the root's
 * kernfs_rwsem is accepted as an alternative to an RCU read-side section.
 */
static inline const char *kernfs_rcu_name(const struct kernfs_node *kn)
{
        const char *name;

        name = rcu_dereference_check(kn->name, kernfs_root_is_locked(kn));
        return name;
}

/*
 * Fetch the parent of @kn.
 *
 * kernfs_node::__parent stays valid inside an RCU section, but @kn may be
 * reparented (and renamed), which changes the pointer.  Holding either
 * kernfs_root::kernfs_rwsem or kernfs_root::kernfs_rename_lock prevents
 * that, and both may be used to take a reference on the parent.  A node
 * whose reference count has dropped to 0 is about to be freed and can no
 * longer be renamed or moved under a different parent, so that case is
 * accepted by the check as well.
 */
static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn)
{
        struct kernfs_node *parent;

        parent = rcu_dereference_check(kn->__parent,
                                       kernfs_root_is_locked(kn) ||
                                       kernfs_rename_is_locked(kn) ||
                                       !atomic_read(&kn->count));
        return parent;
}

/*
 * Map a dentry to its kernfs_node, which is kept in the inode's
 * i_private.  A negative dentry has no inode, so NULL is returned.
 */
static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
{
        return d_really_is_negative(dentry) ? NULL : d_inode(dentry)->i_private;
}

/*
 * Stamp @dentry with the current revision of directory @parent by
 * copying parent->dir.rev into dentry->d_time.
 */
static inline void kernfs_set_rev(struct kernfs_node *parent,
                                  struct dentry *dentry)
{
        dentry->d_time = parent->dir.rev;
}

/* Advance the revision counter of directory @parent by one. */
static inline void kernfs_inc_rev(struct kernfs_node *parent)
{
        parent->dir.rev += 1;
}

/*
 * Report whether directory @parent has a different revision than the one
 * recorded in @dentry (see kernfs_set_rev()).
 */
static inline bool kernfs_dir_changed(struct kernfs_node *parent,
                                      struct dentry *dentry)
{
        return parent->dir.rev != dentry->d_time;
}

extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;

/*
 * inode.c
 */
extern const struct xattr_handler * const kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct mnt_idmap *idmap,
                          struct inode *inode, int mask);
int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                       struct iattr *iattr);
int kernfs_iop_getattr(struct mnt_idmap *idmap,
                       const struct path *path, struct kstat *stat,
                       u32 request_mask, unsigned int query_flags);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);

/*
 * dir.c
 */
extern const struct dentry_operations kernfs_dops;
extern const struct file_operations kernfs_dir_fops;
extern const struct inode_operations kernfs_dir_iops;

struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
void kernfs_put_active(struct kernfs_node *kn);
int kernfs_add_one(struct kernfs_node *kn);
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                                    const char *name, umode_t mode,
                                    kuid_t uid, kgid_t gid,
                                    unsigned flags);

/*
 * file.c
 */
extern const struct file_operations kernfs_file_fops;

bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);

/*
 * symlink.c
 */
extern const struct inode_operations kernfs_symlink_iops;

/*
 * kernfs locks
 */
extern struct kernfs_global_locks *kernfs_locks;
#endif        /* __KERNFS_INTERNAL_H */













































































































































































   61 























    1 




    1 



    1 






    1 





































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/xarray.h>

/**
 * idr_alloc_u32() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @nextid: Pointer to an ID.
 * @max: The maximum ID to allocate (inclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range specified by @nextid and @max.
 * Note that @max is inclusive whereas the @end parameter to idr_alloc()
 * is exclusive.  The new ID is assigned to @nextid before the pointer
 * is inserted into the IDR, so if @nextid points into the object pointed
 * to by @ptr, a concurrent lookup will not find an uninitialised ID.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found (this includes the case where
 * @max is below the base of the IDR).  If an error occurred, @nextid is
 * unchanged.
 */
int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
                        unsigned long max, gfp_t gfp)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int base = idr->idr_base;
        unsigned int id = *nextid;

        /* Warn once and mark the tree if the root lacks the IDR flag. */
        if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
                idr->idr_rt.xa_flags |= IDR_RT_MARKER;

        /*
         * The tree is indexed relative to @base.  If the whole requested
         * range lies below the base, "max - base" would wrap around to a
         * huge value and an ID above @max could be handed out, so there is
         * nothing to allocate.
         */
        if (max < base)
                return -ENOSPC;

        /* Translate the starting ID into a tree index, clamping at 0. */
        id = (id < base) ? 0 : id - base;
        radix_tree_iter_init(&iter, id);
        slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base);
        if (IS_ERR(slot))
                return PTR_ERR(slot);

        /* Publish the ID before the pointer becomes visible to lookups. */
        *nextid = iter.index + base;
        /* there is a memory barrier inside radix_tree_iter_replace() */
        radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
        radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);

        return 0;
}
EXPORT_SYMBOL_GPL(idr_alloc_u32);

/**
 * idr_alloc() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Picks an unused ID from the half-open range [@start, @end) and binds
 * @ptr to it.  An @end that is <= 0 is interpreted as one past %INT_MAX,
 * which lets callers pass @start + N as @end as long as N is within
 * integer range.  A negative @start triggers a one-time warning and is
 * rejected with -EINVAL.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.
 */
int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        unsigned long max;
        u32 id = start;
        int err;

        if (WARN_ON_ONCE(start < 0))
                return -EINVAL;

        /* idr_alloc_u32() takes an inclusive upper bound. */
        if (end > 0)
                max = end - 1;
        else
                max = INT_MAX;

        err = idr_alloc_u32(idr, ptr, &id, max, gfp);
        if (!err)
                return id;

        return err;
}
EXPORT_SYMBOL_GPL(idr_alloc);

/**
 * idr_alloc_cyclic() - Allocate an ID cyclically.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Picks an unused ID from the half-open range [@start, @end).  An @end
 * that is <= 0 is interpreted as one past %INT_MAX, which lets callers
 * pass @start + N as @end as long as N is within integer range.
 * The search begins just after the most recently allocated ID and, if
 * nothing is free between there and @end, wraps around to @start.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.
 */
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        u32 id = idr->idr_next;
        int max = INT_MAX;
        int err;

        if (end > 0)
                max = end - 1;

        /* Resume from the saved cursor, but never from below @start. */
        if ((int)id < start)
                id = start;

        err = idr_alloc_u32(idr, ptr, &id, max, gfp);
        if ((err == -ENOSPC) && (id > start)) {
                /* Nothing free above the cursor: wrap and retry from @start. */
                id = start;
                err = idr_alloc_u32(idr, ptr, &id, max, gfp);
        }

        if (!err) {
                /* Remember where the next cyclic search should begin. */
                idr->idr_next = id + 1;
                return id;
        }

        return err;
}
EXPORT_SYMBOL(idr_alloc_cyclic);

/**
 * idr_remove() - Remove an ID from the IDR.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Drops @id from the IDR.  If @id was not present in the IDR, %NULL is
 * returned.
 *
 * Since this function modifies the IDR, the caller should provide their
 * own locking to ensure that concurrent modification of the same IDR is
 * not possible.
 *
 * Return: The pointer formerly associated with this ID.
 */
void *idr_remove(struct idr *idr, unsigned long id)
{
        /* The underlying tree is indexed relative to the IDR's base. */
        unsigned long index = id - idr->idr_base;

        return radix_tree_delete_item(&idr->idr_rt, index, NULL);
}
EXPORT_SYMBOL_GPL(idr_remove);

/**
 * idr_find() - Return pointer for given ID.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Fetches the pointer bound to @id.  A %NULL result is ambiguous: either
 * @id is not allocated, or a %NULL pointer was stored for it.
 *
 * This function can be called under rcu_read_lock(), given that the leaf
 * pointers lifetimes are correctly managed.
 *
 * Return: The pointer associated with this ID.
 */
void *idr_find(const struct idr *idr, unsigned long id)
{
        /* The underlying tree is indexed relative to the IDR's base. */
        unsigned long index = id - idr->idr_base;

        return radix_tree_lookup(&idr->idr_rt, index);
}
EXPORT_SYMBOL_GPL(idr_find);

/**
 * idr_for_each() - Iterate through all stored pointers.
 * @idr: IDR handle.
 * @fn: Function to be called for each pointer.
 * @data: Data passed to callback function.
 *
 * Invokes @fn once per entry in @idr, handing it the ID, the stored
 * entry and @data.
 *
 * A non-zero return from @fn ends the walk immediately and becomes the
 * return value of this function.  The walk also ends (with a one-time
 * warning) as soon as an ID larger than %INT_MAX is reached, since such
 * an ID cannot be passed to @fn.
 *
 * idr_for_each() can be called concurrently with idr_alloc() and
 * idr_remove() if protected by RCU.  Newly added entries may not be
 * seen and deleted entries may be seen, but adding and removing entries
 * will not cause other entries to be skipped, nor spurious ones to be seen.
 *
 * Return: 0 if the walk was not stopped by @fn, otherwise the first
 * non-zero value returned by @fn.
 */
int idr_for_each(const struct idr *idr,
                int (*fn)(int id, void *p, void *data), void *data)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        int base = idr->idr_base;

        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
                unsigned long id = iter.index + base;
                int err;

                /* The callback takes an int ID; give up once it no longer fits. */
                if (WARN_ON_ONCE(id > INT_MAX))
                        return 0;

                err = fn(id, rcu_dereference_raw(*slot), data);
                if (err)
                        return err;
        }

        return 0;
}
EXPORT_SYMBOL(idr_for_each);

/**
 * idr_get_next_ul() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Looks for the first populated entry whose ID is greater than or equal
 * to the value pointed to by @nextid.  When one is found, @nextid is
 * overwritten with its ID; when none is found, %NULL is returned and
 * @nextid is left alone.  To use in a loop, the value pointed to by
 * nextid must be incremented by the user.
 */
void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        void *entry = NULL;
        unsigned long base = idr->idr_base;
        unsigned long wanted = *nextid;
        unsigned long first = 0;

        /* Translate the requested ID into a tree index, clamping at 0. */
        if (wanted >= base)
                first = wanted - base;

        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, first) {
                entry = rcu_dereference_raw(*slot);
                if (!entry)
                        continue;

                /*
                 * An internal entry seen in the head slot, or a retry
                 * entry seen anywhere, is not a stored pointer: restart
                 * the iterator at this index instead of returning it.
                 */
                if (xa_is_internal(entry) &&
                    (slot == &idr->idr_rt.xa_head || xa_is_retry(entry))) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }

                break;
        }
        if (!slot)
                return NULL;

        *nextid = iter.index + base;
        return entry;
}
EXPORT_SYMBOL(idr_get_next_ul);

/**
 * idr_get_next() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Looks for the first populated entry whose ID is greater than or equal
 * to the value pointed to by @nextid, and stores that ID back through
 * @nextid.  If the resulting ID does not fit in an int, a one-time
 * warning is issued, %NULL is returned and @nextid is not written.
 * To use in a loop, the value pointed to by nextid must be incremented
 * by the user.
 */
void *idr_get_next(struct idr *idr, int *nextid)
{
        unsigned long wide_id = *nextid;
        void *found;

        /* Delegate to the unsigned long variant, then narrow the result. */
        found = idr_get_next_ul(idr, &wide_id);

        if (WARN_ON_ONCE(wide_id > INT_MAX))
                return NULL;

        *nextid = wide_id;
        return found;
}
EXPORT_SYMBOL(idr_get_next);

/**
 * idr_replace() - replace pointer for given ID.
 * @idr: IDR handle.
 * @ptr: New pointer to associate with the ID.
 * @id: ID to change.
 *
 * Swap the pointer registered for @id with @ptr and hand back the
 * previous one.  This function can be called under the RCU read lock
 * concurrently with idr_alloc() and idr_remove() (as long as the ID
 * being removed is not the one being replaced!).
 *
 * Returns: the old value on success.  An ERR_PTR() carrying %-ENOENT
 * indicates that @id was not found (no slot exists for it, or the slot
 * is still tagged as free).
 */
void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
{
        struct radix_tree_node *node;
        void __rcu **slot = NULL;
        void *old;
        /* The underlying tree is indexed relative to the IDR's base. */
        unsigned long index = id - idr->idr_base;

        old = __radix_tree_lookup(&idr->idr_rt, index, &node, &slot);
        if (!slot)
                return ERR_PTR(-ENOENT);
        /* A slot that is still marked free was never allocated. */
        if (radix_tree_tag_get(&idr->idr_rt, index, IDR_FREE))
                return ERR_PTR(-ENOENT);

        __radix_tree_replace(&idr->idr_rt, node, slot, ptr);

        return old;
}
EXPORT_SYMBOL(idr_replace);

/**
 * DOC: IDA description
 *
 * The IDA is an ID allocator which does not provide the ability to
 * associate an ID with a pointer.  As such, it only needs to store one
 * bit per ID, and so is more space efficient than an IDR.  To use an IDA,
 * define it using DEFINE_IDA() (or embed a &struct ida in a data structure,
 * then initialise it using ida_init()).  To allocate a new ID, call
 * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range().
 * To free an ID, call ida_free().
 *
 * ida_destroy() can be used to dispose of an IDA without needing to
 * free the individual IDs in it.  You can use ida_is_empty() to find
 * out whether the IDA has any IDs currently allocated.
 *
 * The IDA handles its own locking.  It is safe to call any of the IDA
 * functions without synchronisation in your code.
 *
 * IDs are currently limited to the range [0-INT_MAX].  If this is an awkward
 * limitation, it should be quite straightforward to raise the maximum.
 */

/*
 * Developer's notes:
 *
 * The IDA uses the functionality provided by the XArray to store bitmaps in
 * each entry.  The XA_FREE_MARK is only cleared when all bits in the bitmap
 * have been set.
 *
 * I considered telling the XArray that each slot is an order-10 node
 * and indexing by bit number, but the XArray can't allow a single multi-index
 * entry in the head, which would significantly increase memory consumption
 * for the IDA.  So instead we divide the index by the number of bits in the
 * leaf bitmap before doing a radix tree lookup.
 *
 * As an optimisation, if there are only a few low bits set in any given
 * leaf, instead of allocating a 128-byte bitmap, we store the bits
 * as a value entry.  Value entries never have the XA_FREE_MARK cleared
 * because we can always convert them into a bitmap entry.
 *
 * It would be possible to optimise further; once we've run out of a
 * single 128-byte bitmap, we currently switch to a 576-byte node, put
 * the 128-byte bitmap in the first entry and then start allocating extra
 * 128-byte entries.  We could instead use the 512 bytes of the node's
 * data as a bitmap before moving to that scheme.  I do not believe this
 * is a worthwhile optimisation; Rasmus Villemoes surveyed the current
 * users of the IDA and almost none of them use more than 1024 entries.
 * Those that do use more than the 8192 IDs that the 512 bytes would
 * provide.
 *
 * The IDA always uses a lock to alloc/free.  If we add a 'test_bit'
 * equivalent, it will still need locking.  Going to RCU lookup would require
 * using RCU to free bitmaps, and that's not trivial without embedding an
 * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
 * bitmap, which is excessive.
 */

/**
 * ida_alloc_range() - Allocate an unused ID.
 * @ida: IDA handle.
 * @min: Lowest ID to allocate.
 * @max: Highest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between @min and @max, inclusive.  The allocated ID will
 * not exceed %INT_MAX, even if @max is larger.  A @min that does not fit
 * in a non-negative int is rejected with %-ENOSPC.
 *
 * IDs are grouped into chunks of IDA_BITMAP_BITS; each chunk is one entry
 * in the XArray, stored either as a value entry (a few low bits) or as a
 * pointer to a struct ida_bitmap.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
                        gfp_t gfp)
{
        /* Start the search at the chunk, and the bit within it, holding @min. */
        XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS);
        unsigned bit = min % IDA_BITMAP_BITS;
        unsigned long flags;
        /* @alloc holds a bitmap allocated outside the lock on an earlier pass. */
        struct ida_bitmap *bitmap, *alloc = NULL;

        if ((int)min < 0)
                return -ENOSPC;

        /* Clamp @max so that no ID above INT_MAX is ever handed out. */
        if ((int)max < 0)
                max = INT_MAX;

retry:
        xas_lock_irqsave(&xas, flags);
next:
        /* Find the next chunk, up to the one holding @max, marked as having room. */
        bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK);
        /* Once past the chunk holding @min, every bit of the chunk is eligible. */
        if (xas.xa_index > min / IDA_BITMAP_BITS)
                bit = 0;
        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                goto nospc;

        if (xa_is_value(bitmap)) {
                /* The chunk is stored inline as a value entry. */
                unsigned long tmp = xa_to_value(bitmap);

                if (bit < BITS_PER_XA_VALUE) {
                        bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit);
                        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                                goto nospc;
                        if (bit < BITS_PER_XA_VALUE) {
                                /* Room in the value entry: set the bit in place. */
                                tmp |= 1UL << bit;
                                xas_store(&xas, xa_mk_value(tmp));
                                goto out;
                        }
                }
                /*
                 * The value entry cannot hold the wanted bit, so convert
                 * it to a full bitmap.  Prefer a bitmap left over from a
                 * previous pass; otherwise try a non-sleeping allocation
                 * while the lock is held, and fall back to the "alloc"
                 * path (drop the lock, allocate with @gfp) if that fails.
                 */
                bitmap = alloc;
                if (!bitmap)
                        bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                if (!bitmap)
                        goto alloc;
                /* Carry the bits of the value entry over into the bitmap. */
                bitmap->bitmap[0] = tmp;
                xas_store(&xas, bitmap);
                if (xas_error(&xas)) {
                        /* Store failed: wipe the copied bits before bailing out. */
                        bitmap->bitmap[0] = 0;
                        goto out;
                }
        }

        if (bitmap) {
                /* Chunk is backed by a bitmap: take its first clear bit. */
                bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit);
                if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                        goto nospc;
                /* No clear bit at or after the start position: try the next chunk. */
                if (bit == IDA_BITMAP_BITS)
                        goto next;

                __set_bit(bit, bitmap->bitmap);
                /* A full bitmap must no longer be found by the marked search. */
                if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } else {
                /* No entry for this chunk yet: create one holding just @bit. */
                if (bit < BITS_PER_XA_VALUE) {
                        bitmap = xa_mk_value(1UL << bit);
                } else {
                        bitmap = alloc;
                        if (!bitmap)
                                bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                        if (!bitmap)
                                goto alloc;
                        __set_bit(bit, bitmap->bitmap);
                }
                xas_store(&xas, bitmap);
        }
out:
        xas_unlock_irqrestore(&xas, flags);
        /*
         * NOTE(review): xas_nomem() is expected to return true when the
         * store ran out of memory and memory could be obtained with @gfp;
         * in that case the whole search restarts from @min.
         */
        if (xas_nomem(&xas, gfp)) {
                xas.xa_index = min / IDA_BITMAP_BITS;
                bit = min % IDA_BITMAP_BITS;
                goto retry;
        }
        /* Release the pre-allocated bitmap unless it is the one now in use. */
        if (bitmap != alloc)
                kfree(alloc);
        if (xas_error(&xas))
                return xas_error(&xas);
        return xas.xa_index * IDA_BITMAP_BITS + bit;
alloc:
        /* Allocate a bitmap with the caller's @gfp outside the lock, then redo the search. */
        xas_unlock_irqrestore(&xas, flags);
        alloc = kzalloc(sizeof(*bitmap), gfp);
        if (!alloc)
                return -ENOMEM;
        xas_set(&xas, min / IDA_BITMAP_BITS);
        bit = min % IDA_BITMAP_BITS;
        goto retry;
nospc:
        xas_unlock_irqrestore(&xas, flags);
        kfree(alloc);
        return -ENOSPC;
}
EXPORT_SYMBOL(ida_alloc_range);

/**
 * ida_find_first_range - Get the lowest used ID.
 * @ida: IDA handle.
 * @min: Lowest ID to get.
 * @max: Highest ID to get.
 *
 * Get the lowest used ID between @min and @max, inclusive.  The returned
 * ID will not exceed %INT_MAX, even if @max is larger.
 *
 * Context: Any context. Takes and releases the xa_lock.
 * Return: The lowest used ID, %-EINVAL if @min is larger than %INT_MAX,
 * or %-ENOENT if no used ID is found in the range.
 */
int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max)
{
        unsigned long index = min / IDA_BITMAP_BITS;
        unsigned int offset = min % IDA_BITMAP_BITS;
        unsigned long *addr, size, bit;
        unsigned long tmp = 0;
        unsigned long flags;
        void *entry;
        int ret = -ENOENT;

        if ((int)min < 0)
                return -EINVAL;
        if ((int)max < 0)
                max = INT_MAX;

        xa_lock_irqsave(&ida->xa, flags);

        for (;;) {
                /* Locate the next populated chunk at or after @index. */
                entry = xa_find(&ida->xa, &index, max / IDA_BITMAP_BITS,
                                XA_PRESENT);
                if (!entry)
                        break;

                /* Past the chunk holding @min, the whole chunk is in range. */
                if (index > min / IDA_BITMAP_BITS)
                        offset = 0;
                if (index * IDA_BITMAP_BITS + offset > max)
                        break;

                if (xa_is_value(entry)) {
                        /* Bits are stored inline; scan a local copy of them. */
                        tmp = xa_to_value(entry);
                        addr = &tmp;
                        size = BITS_PER_XA_VALUE;
                } else {
                        addr = ((struct ida_bitmap *)entry)->bitmap;
                        size = IDA_BITMAP_BITS;
                }

                bit = find_next_bit(addr, size, offset);
                if (bit < size) {
                        if (index * IDA_BITMAP_BITS + bit <= max)
                                ret = index * IDA_BITMAP_BITS + bit;
                        break;
                }

                /*
                 * This chunk has no used ID at or above the offset (its
                 * set bits all lie below @min).  A later chunk may still
                 * hold a used ID inside the range, so keep searching
                 * instead of reporting -ENOENT straight away.
                 */
                if (index >= max / IDA_BITMAP_BITS)
                        break;
                index++;
        }

        xa_unlock_irqrestore(&ida->xa, flags);
        return ret;
}
EXPORT_SYMBOL(ida_find_first_range);

/**
 * ida_free() - Release an allocated ID.
 * @ida: IDA handle.
 * @id: Previously allocated ID.
 *
 * Clears the bit that records @id as allocated and drops the chunk's
 * entry once no ID in it remains in use.  An @id above %INT_MAX is
 * silently ignored; an @id that is not currently allocated produces a
 * warning.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_free(struct ida *ida, unsigned int id)
{
        XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS);
        unsigned bit = id % IDA_BITMAP_BITS;
        struct ida_bitmap *bitmap;
        unsigned long flags;
        int released = 0;

        if ((int)id < 0)
                return;

        xas_lock_irqsave(&xas, flags);
        bitmap = xas_load(&xas);

        if (xa_is_value(bitmap)) {
                /* Chunk stored inline: the bits live in the entry itself. */
                unsigned long v = xa_to_value(bitmap);

                if (bit < BITS_PER_XA_VALUE && (v & (1UL << bit))) {
                        v &= ~(1UL << bit);
                        /* Remove the entry entirely once its last bit is gone. */
                        if (v)
                                xas_store(&xas, xa_mk_value(v));
                        else
                                xas_store(&xas, NULL);
                        released = 1;
                }
        } else if (bitmap && test_bit(bit, bitmap->bitmap)) {
                __clear_bit(bit, bitmap->bitmap);
                /* The chunk has room again, so make it findable by allocators. */
                xas_set_mark(&xas, XA_FREE_MARK);
                if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
                        kfree(bitmap);
                        xas_store(&xas, NULL);
                }
                released = 1;
        }

        xas_unlock_irqrestore(&xas, flags);

        if (!released)
                WARN(1, "ida_free called for id=%d which is not allocated.\n", id);
}
EXPORT_SYMBOL(ida_free);

/**
 * ida_destroy() - Free all IDs.
 * @ida: IDA handle.
 *
 * Releases every ID together with all resources held by the IDA.  On
 * return the IDA is empty and may be reused or freed.  There is no need
 * to call this function on an IDA that is already empty.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_destroy(struct ida *ida)
{
        XA_STATE(xas, &ida->xa, 0);
        struct ida_bitmap *chunk;
        unsigned long flags;

        xas_lock_irqsave(&xas, flags);
        xas_for_each(&xas, chunk, ULONG_MAX) {
                /* Value entries hold their bits inline; only bitmaps are freed. */
                if (!xa_is_value(chunk))
                        kfree(chunk);
                /* Either way, erase the entry from the array. */
                xas_store(&xas, NULL);
        }
        xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(ida_destroy);

#ifndef __KERNEL__
extern void xa_dump_index(unsigned long index, unsigned int shift);
#define IDA_CHUNK_SHIFT                ilog2(IDA_BITMAP_BITS)

/*
 * Print one IDA entry found at chunk @index.  Interior nodes are printed
 * and then descended into recursively; leaves are printed either as an
 * inline value or as the words of a bitmap.  Empty entries print nothing.
 */
static void ida_dump_entry(void *entry, unsigned long index)
{
        unsigned long i;

        if (!entry)
                return;

        if (xa_is_node(entry)) {
                struct xa_node *node = xa_to_node(entry);
                unsigned int shift = node->shift + IDA_CHUNK_SHIFT +
                        XA_CHUNK_SHIFT;

                xa_dump_index(index * IDA_BITMAP_BITS, shift);
                xa_dump_node(node);
                /* Visit each child slot with the chunk index it covers. */
                for (i = 0; i < XA_CHUNK_SIZE; i++) {
                        unsigned long child = index | (i << node->shift);

                        ida_dump_entry(node->slots[i], child);
                }
                return;
        }

        if (xa_is_value(entry)) {
                xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG));
                pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry);
                return;
        }

        /* Anything else is a pointer to a full bitmap. */
        {
                struct ida_bitmap *bitmap = entry;

                xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT);
                pr_cont("bitmap: %p data", bitmap);
                for (i = 0; i < IDA_BITMAP_LONGS; i++)
                        pr_cont(" %lx", bitmap->bitmap[i]);
                pr_cont("\n");
        }
}

/*
 * Print a one-line summary of @ida (its address, head entry and the root
 * flag bits above ROOT_TAG_SHIFT), then dump every entry below the head.
 */
static void ida_dump(struct ida *ida)
{
        void *head = ida->xa.xa_head;

        pr_debug("ida: %p node %p free %d\n", ida, head,
                                ida->xa.xa_flags >> ROOT_TAG_SHIFT);
        ida_dump_entry(head, 0);
}
#endif































































































































































































































































































































































































































































































































   42 

























   39 
   38 





   39 


    1 
   39 
   38 

   39 

    8 

    8 




























































   40 



   40 
















   40 


   39 




































































































































































































  120 






















  118 


















  119 
  120 
  118 



   13 





    8 
   13 
    8 
   13 





   42 

   13 



    8 
   40 


   13 
   43 
   13 



   41 






   43 

   43 




   37 








   40 




   43 
















   27 




   39 






   40 
















   43 









   43 

   43 
   43 




























   40 































   33 




















   40 





   39 
   40 

   40 


   39 
   38 
   39 

   40 
























   38 









   40 


   38 

   38 
   31 



















































   33 




















































    8 

    8 















































































































   15 


















    5 



    5 


   15 
   15 
















   14 














































    8 








    7 





    8 




























































































































    8 


















    8 









    8 





    8 















































    8 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal timers
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 *  2002-05-31        Move sys_sysinfo here and make its locking sane, Robert Love
 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

#include "tick-internal.h"
#include "timer_migration.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

/*
 * 64-bit jiffies counter. It is initialised to INITIAL_JIFFIES and exported
 * so that modules can reference it.
 */
__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:                LVL_CLK_DIV ^ level
 * The level clock frequency is:        HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther away the expiry time is, the higher the array level
 * and therefore the coarser the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity. Those fit into the first wheel level,
 * which has HZ granularity.
 *
 * We don't have cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constants values are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0           0         3 ms                0 ms -        210 ms
 *  1          64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2         128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3         192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4         256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5         320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6         384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7         448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0           0         4 ms                0 ms -        255 ms
 *  1          64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2         128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3         192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4         256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5         320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6         384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7         448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0           0         10 ms               0 ms -        630 ms
 *  1          64         80 ms             640 ms -       5110 ms (640ms - ~5s)
 *  2         128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
 *  3         192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
 *  4         256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
 *  5         320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
 *  6         384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
 *  7         448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
 */

/*
 * Clock divisor for the next level.
 *
 * LVL_CLK_SHIFT: log2 of the divisor between two adjacent levels
 * LVL_CLK_DIV:   the divisor itself (1 << 3 == 8)
 * LVL_CLK_MASK:  mask covering the bits below one divisor step
 * LVL_SHIFT(n):  number of low bits of the expiry value dropped at level n
 * LVL_GRAN(n):   granularity of level n in units of the level 0 clock
 */
#define LVL_CLK_SHIFT        3
#define LVL_CLK_DIV        (1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK        (LVL_CLK_DIV - 1)
#define LVL_SHIFT(n)        ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)        (1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 *
 * With LVL_SIZE == 64 and LVL_CLK_SHIFT == 3 this evaluates to 63 for
 * n == 1, 63 << 3 for n == 2, 63 << 6 for n == 3 and so on.
 * calc_wheel_index() compares the expiry delta against these values.
 */
#define LVL_START(n)        ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/*
 * Size of each clock level.
 *
 * LVL_BITS:    log2 of the number of buckets per level
 * LVL_SIZE:    number of buckets per level (1 << 6 == 64)
 * LVL_MASK:    mask to wrap a bucket number into one level
 * LVL_OFFS(n): index of the first bucket of level n in the flat wheel array
 */
#define LVL_BITS        6
#define LVL_SIZE        (1UL << LVL_BITS)
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_OFFS(n)        ((n) * LVL_SIZE)

/*
 * Level depth: the number of wheel levels. Nine levels are used when
 * HZ > 100 and eight otherwise (compare the HZ 100 table above, which
 * ends at level 7).
 */
#if HZ > 100
# define LVL_DEPTH        9
# else
# define LVL_DEPTH        8
#endif

/*
 * The cutoff (max. capacity of the wheel).
 *
 * calc_wheel_index() clamps any delta at or beyond WHEEL_TIMEOUT_CUTOFF to
 * WHEEL_TIMEOUT_MAX, which is one granule of the last level below the
 * cutoff.
 */
#define WHEEL_TIMEOUT_CUTOFF        (LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX        (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting size of a single wheel, in buckets. Every CPU gets
 * NR_BASES such wheels (see timer_bases[]): one if NOHZ is not configured,
 * three with CONFIG_NO_HZ_COMMON, where BASE_DEF provides separate storage
 * for the deferrable timers.
 */
#define WHEEL_SIZE        (LVL_SIZE * LVL_DEPTH)

/*
 * Number of timer bases per CPU and the index of each base within
 * timer_bases[]. With CONFIG_NO_HZ_COMMON there are three distinct bases;
 * without it all three names refer to the one and only base 0.
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * If multiple bases need to be locked, use the base ordering for lock
 * nesting, i.e. lowest number first.
 */
# define NR_BASES        3
# define BASE_LOCAL        0
# define BASE_GLOBAL        1
# define BASE_DEF        2
#else
# define NR_BASES        1
# define BASE_LOCAL        0
# define BASE_GLOBAL        0
# define BASE_DEF        0
#endif

/**
 * struct timer_base - Per CPU timer base (number of base depends on config)
 * @lock:                Lock protecting the timer_base
 * @running_timer:        When expiring timers, the lock is dropped. To make
 *                        sure not to race against deleting/modifying a
 *                        currently running timer, the pointer is set to the
 *                        timer, which expires at the moment. If no timer is
 *                        running, the pointer is NULL.
 * @expiry_lock:        PREEMPT_RT only: Lock is taken in softirq around
 *                        timer expiry callback execution and when trying to
 *                        delete a running timer and it wasn't successful at
 *                        first glance. It prevents priority inversion
 *                        when the callback was preempted on a remote CPU and
 *                        a caller tries to delete the running timer. It also
 *                        prevents a live lock, when the task which tries to
 *                        delete a timer preempted the softirq thread which
 *                        is running the timer callback function.
 * @timer_waiters:        PREEMPT_RT only: Tells, if there is a waiter
 *                        waiting for the end of the timer callback function
 *                        execution.
 * @clk:                clock of the timer base; is updated before enqueue
 *                        of a timer; during expiry, it is 1 offset ahead of
 *                        jiffies to avoid endless requeuing to current
 *                        jiffies
 * @next_expiry:        expiry value of the first timer; it is updated when
 *                        finding the next timer and during enqueue; the
 *                        value is not valid, when next_expiry_recalc is set
 * @cpu:                Number of CPU the timer base belongs to
 * @next_expiry_recalc: States, whether a recalculation of next_expiry is
 *                        required. Value is set true, when a timer was
 *                        deleted.
 * @is_idle:                Is set, when timer_base is idle. It is triggered by NOHZ
 *                        code. This state is only used in the standard
 *                        base. Deferrable timers which are enqueued remotely
 *                        never wake up an idle CPU, so there is no need to
 *                        support it for the deferrable base.
 * @timers_pending:        Is set, when a timer is pending in the base. It is only
 *                        reliable when next_expiry_recalc is not set.
 * @pending_map:        bitmap of the timer wheel; each bit reflects a
 *                        bucket of the wheel. When a bit is set, at least a
 *                        single timer is enqueued in the related bucket.
 * @vectors:                Array of lists; Each array member reflects a bucket
 *                        of the timer wheel. The list contains all timers
 *                        which are enqueued into a specific bucket.
 */
struct timer_base {
        raw_spinlock_t                lock;
        struct timer_list        *running_timer;
        /* Only present on PREEMPT_RT, see @expiry_lock and @timer_waiters. */
#ifdef CONFIG_PREEMPT_RT
        spinlock_t                expiry_lock;
        atomic_t                timer_waiters;
#endif
        unsigned long                clk;
        unsigned long                next_expiry;
        unsigned int                cpu;
        bool                        next_expiry_recalc;
        bool                        is_idle;
        bool                        timers_pending;
        /* One bit and one list head per wheel bucket, WHEEL_SIZE in total. */
        DECLARE_BITMAP(pending_map, WHEEL_SIZE);
        struct hlist_head        vectors[WHEEL_SIZE];
} ____cacheline_aligned;

/*
 * Every CPU owns NR_BASES timer bases, indexed by BASE_LOCAL, BASE_GLOBAL
 * and BASE_DEF.
 */
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

#ifdef CONFIG_NO_HZ_COMMON

/* Static key, initially false; switched on by timer_update_keys(). */
static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
/* Serialises the static key updates done in this file. */
static DEFINE_MUTEX(timer_keys_mutex);

/* Work item queued by timers_update_nohz(); runs timer_update_keys(). */
static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);

#ifdef CONFIG_SMP
static unsigned int sysctl_timer_migration = 1;

DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

static void timers_update_migration(void)
{
        /*
         * The migration key is on only when both the sysctl knob is set
         * and NOHZ is active. In every other case it is switched off.
         */
        if (!sysctl_timer_migration || !tick_nohz_active) {
                static_branch_disable(&timers_migration_enabled);
                return;
        }

        static_branch_enable(&timers_migration_enabled);
}

#ifdef CONFIG_SYSCTL
static int timer_migration_handler(const struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        /*
         * Parse the value and refresh the static key under the same mutex
         * that timer_update_keys() takes, so the two cannot interleave.
         */
        mutex_lock(&timer_keys_mutex);

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        /* Only a successful write can have changed the knob. */
        if (write && err == 0)
                timers_update_migration();

        mutex_unlock(&timer_keys_mutex);

        return err;
}

/*
 * sysctl table with a single entry, "timer_migration", backed by
 * sysctl_timer_migration. Accesses go through timer_migration_handler()
 * and the accepted range is given by extra1/extra2 (SYSCTL_ZERO to
 * SYSCTL_ONE).
 */
static const struct ctl_table timer_sysctl[] = {
        {
                .procname        = "timer_migration",
                .data                = &sysctl_timer_migration,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = timer_migration_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/*
 * Register the timer sysctl table under "kernel". Runs as a device
 * initcall. The return value of register_sysctl() is not checked and the
 * function always reports success.
 */
static int __init timer_sysctl_init(void)
{
        register_sysctl("kernel", timer_sysctl);
        return 0;
}
device_initcall(timer_sysctl_init);
#endif /* CONFIG_SYSCTL */
#else /* CONFIG_SMP */
/* Without CONFIG_SMP there is no migration key to update. */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */

/*
 * Work handler behind timers_update_nohz(). Under timer_keys_mutex it
 * re-evaluates the migration key and switches the timers_nohz_active key
 * on. @work is not used.
 */
static void timer_update_keys(struct work_struct *work)
{
        mutex_lock(&timer_keys_mutex);
        timers_update_migration();
        static_branch_enable(&timers_nohz_active);
        mutex_unlock(&timer_keys_mutex);
}

/*
 * Request an update of the timer static keys. The update itself is not
 * done here: timer_update_work is queued and its handler,
 * timer_update_keys(), performs it under timer_keys_mutex.
 */
void timers_update_nohz(void)
{
        schedule_work(&timer_update_work);
}

/*
 * Report whether the timers_nohz_active static key has been enabled. The
 * branch is annotated as unlikely.
 */
static inline bool is_timers_nohz_active(void)
{
        return static_branch_unlikely(&timers_nohz_active);
}
#else
/* Without CONFIG_NO_HZ_COMMON the NOHZ timer handling is never active. */
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */

static unsigned long round_jiffies_common(unsigned long j, int cpu,
                bool force_up)
{
        const unsigned long unrounded = j;
        int rem;

        /*
         * Shift the value by a skew of 3 jiffies per CPU number before
         * rounding, so that the CPUs do not all fire their timers at once
         * and hit the same lock or cachelines. The 3 jiffies came
         * originally from the mm/ code which already did this. The skew
         * is taken out again after rounding.
         */
        j += cpu * 3;

        /* Distance past the last whole second. */
        rem = j % HZ;

        /*
         * Start from the whole second below. Stay there only when the
         * value was less than a quarter of a second past it (which can
         * happen due to delays of the timer irq, long irq off times etc.)
         * and the caller did not insist on rounding up. Otherwise move on
         * to the next whole second.
         */
        j -= rem;
        if (force_up || rem >= HZ/4)
                j += HZ;

        /* Take the per-CPU skew out again. */
        j -= cpu * 3;

        /*
         * Hand back the rounded value only if it is still in the future.
         * Otherwise the caller gets the value it passed in.
         */
        if (time_is_after_jiffies(j))
                return j;

        return unrounded;
}

/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta  in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
        /* Sample jiffies once, because it might change while we run. */
        const unsigned long now = jiffies;
        unsigned long rounded;

        /* Round in absolute time, then convert back to a delta. */
        rounded = round_jiffies_common(now + j, cpu, false);

        return rounded - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies(unsigned long j)
{
        /* The per-CPU skew is taken from the CPU we are running on. */
        const int this_cpu = raw_smp_processor_id();

        return round_jiffies_common(j, this_cpu, false);
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta  in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
        /* The per-CPU skew is taken from the CPU we are running on. */
        const int this_cpu = raw_smp_processor_id();

        return __round_jiffies_relative(j, this_cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);

/**
 * __round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies_relative() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
        /* Sample jiffies once, because it might change while we run. */
        const unsigned long now = jiffies;
        unsigned long rounded;

        /* Round up in absolute time, then convert back to a delta. */
        rounded = round_jiffies_common(now + j, cpu, true);

        return rounded - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);

/**
 * round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * This is the same as round_jiffies() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up(unsigned long j)
{
        /* The per-CPU skew is taken from the CPU we are running on. */
        const int this_cpu = raw_smp_processor_id();

        return round_jiffies_common(j, this_cpu, true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);

/**
 * round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * This is the same as round_jiffies_relative() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
        /* The per-CPU skew is taken from the CPU we are running on. */
        const int this_cpu = raw_smp_processor_id();

        return __round_jiffies_up_relative(j, this_cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);


/*
 * Extract the wheel bucket index which timer_set_idx() stored in the
 * TIMER_ARRAYMASK bits of timer->flags.
 */
static inline unsigned int timer_get_idx(struct timer_list *timer)
{
        return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
}

static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
        /* Bucket index moved into the position of the array bits. */
        const unsigned int idx_bits = idx << TIMER_ARRAYSHIFT;

        /* Replace the old index in one store, all other flag bits stay. */
        timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | idx_bits;
}

/*
 * Helper function to calculate the array index for a given expiry
 * time.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
                                  unsigned long *bucket_expiry)
{
        const unsigned int shift = LVL_SHIFT(lvl);
        unsigned long slot;

        /*
         * A timer must never fire early. That could happen when it is
         * armed at the edge of a tick, or because the outer wheel levels
         * truncate the expiry time. Moving on to the next slot of this
         * level rounds up with level granularity and prevents both.
         */
        slot = (expires >> shift) + 1;

        /* Report the effective expiry time of the chosen slot. */
        *bucket_expiry = slot << shift;

        /* Wrap the slot into this level and add the level's array offset. */
        return LVL_OFFS(lvl) + (slot & LVL_MASK);
}

static int calc_wheel_index(unsigned long expires, unsigned long clk,
                            unsigned long *bucket_expiry)
{
        const unsigned long delta = expires - clk;

        /* Pick the first level whose range still covers the delta. */
        if (delta < LVL_START(1))
                return calc_index(expires, 0, bucket_expiry);
        if (delta < LVL_START(2))
                return calc_index(expires, 1, bucket_expiry);
        if (delta < LVL_START(3))
                return calc_index(expires, 2, bucket_expiry);
        if (delta < LVL_START(4))
                return calc_index(expires, 3, bucket_expiry);
        if (delta < LVL_START(5))
                return calc_index(expires, 4, bucket_expiry);
        if (delta < LVL_START(6))
                return calc_index(expires, 5, bucket_expiry);
        if (delta < LVL_START(7))
                return calc_index(expires, 6, bucket_expiry);

        /* Level 7 has an upper bound only if a further level follows it. */
        if (LVL_DEPTH > 8 && delta < LVL_START(8))
                return calc_index(expires, 7, bucket_expiry);

        /*
         * The expiry time is already behind clk: use the level 0 bucket
         * of clk itself and report clk as the effective expiry.
         */
        if ((long) delta < 0) {
                *bucket_expiry = clk;
                return clk & LVL_MASK;
        }

        /*
         * Obscenely large timeouts are forced to expire at the capacity
         * limit of the wheel.
         */
        if (delta >= WHEEL_TIMEOUT_CUTOFF)
                expires = clk + WHEEL_TIMEOUT_MAX;

        return calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}

static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
        /* Nothing to kick as long as the NOHZ key is not enabled. */
        if (!is_timers_nohz_active())
                return;

        /*
         * Deferrable timers do not keep a CPU out of dynticks and are not
         * looked at on the idle/nohz_full path. An IPI for a newly
         * enqueued deferrable timer would wake the remote CPU without
         * anything being done with the deferrable base, so skip it.
         */
        if (timer->flags & TIMER_DEFERRABLE)
                return;

        /*
         * Only an idle base may need an IPI. The other CPU cannot set
         * base->is_idle on its way to idle while we hold the base lock.
         */
        if (!base->is_idle)
                return;

        /*
         * Reaching this point is expected only for pinned timers or
         * nohz_full CPUs. A non pinned timer is queued on a remote CPU
         * only when the timer was running during queueing, and then the
         * remote CPU handles everything anyway.
         */
        WARN_ON_ONCE(!(timer->flags & TIMER_PINNED) &&
                     !tick_nohz_full_cpu(base->cpu));
        wake_up_nohz_cpu(base->cpu);
}

/*
 * Enqueue the timer into the hash bucket, mark it pending in
 * the bitmap, store the index in the timer flags then wake up
 * the target CPU if needed.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
                          unsigned int idx, unsigned long bucket_expiry)
{
        struct hlist_head *bucket = &base->vectors[idx];

        /* Link the timer into its bucket and record where it went. */
        hlist_add_head(&timer->entry, bucket);
        __set_bit(idx, base->pending_map);
        timer_set_idx(timer, idx);

        trace_timer_start(timer, bucket_expiry);

        /*
         * Done unless this timer is the new first expiring one. The
         * comparison needs the effective expiry time (bucket_expiry)
         * and not timer->expires.
         */
        if (!time_before(bucket_expiry, base->next_expiry))
                return;

        /*
         * Publish the new next expiry time and kick the CPU so it can
         * reevaluate the wheel.
         */
        WRITE_ONCE(base->next_expiry, bucket_expiry);
        base->timers_pending = true;
        base->next_expiry_recalc = false;
        trigger_dyntick_cpu(base, timer);
}

static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
        unsigned long effective_expiry;
        unsigned int bucket;

        /* Map the expiry time to a bucket relative to the base clock ... */
        bucket = calc_wheel_index(timer->expires, base->clk, &effective_expiry);
        /* ... and queue the timer there. */
        enqueue_timer(base, timer, bucket, effective_expiry);
}

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

/*
 * Forward declaration: the fixup callbacks below refer to the descriptor
 * before it is defined.
 */
static const struct debug_obj_descr timer_debug_descr;

/*
 * Describes a timer callback for which timer_debug_hint() should report a
 * different function.
 * @function: the timer callback to match against timer->function
 * @offset:   byte offset from the timer_list to the member which holds the
 *            function pointer to report instead (see TIMER_HINT())
 */
struct timer_hint {
        void        (*function)(struct timer_list *t);
        long        offset;
};

/*
 * Initialiser for one struct timer_hint entry. @fn is the timer callback
 * to match, @container the structure type which embeds the timer, @timr
 * the name of its timer_list member and @hintfn the member holding the
 * function pointer to report. The offset is the distance between those
 * two members.
 */
#define TIMER_HINT(fn, container, timr, hintfn)                        \
        {                                                        \
                .function = fn,                                        \
                .offset          = offsetof(container, hintfn) -        \
                            offsetof(container, timr)                \
        }

/*
 * Timer callbacks known to timer_debug_hint(): the timers embedded in
 * struct delayed_work and struct kthread_delayed_work. For both, the
 * hint is taken from the work.func member of the containing structure.
 */
static const struct timer_hint timer_hints[] = {
        TIMER_HINT(delayed_work_timer_fn,
                   struct delayed_work, timer, work.func),
        TIMER_HINT(kthread_delayed_work_timer_fn,
                   struct kthread_delayed_work, timer, work.func),
};

static void *timer_debug_hint(void *addr)
{
        struct timer_list *timer = addr;
        int i;

        for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
                if (timer_hints[i].function == timer->function) {
                        void (**fn)(void) = addr + timer_hints[i].offset;

                        return *fn;
                }
        }

        return timer->function;
}

static bool timer_is_static_object(void *addr)
{
        struct timer_list *timer = addr;

        return (timer->entry.pprev == NULL &&
                timer->entry.next == TIMER_ENTRY_STATIC);
}

/*
 * timer_fixup_init is called when:
 * - an active object is initialized
 */
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        /* Only the init of a still active timer can be fixed up. */
        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Shut the active timer down first, then redo the init tracking. */
        timer_delete_sync(timer);
        debug_object_init(timer, &timer_debug_descr);

        return true;
}

/*
 * Stub timer callback for improperly used timers. It is installed by
 * timer_fixup_activate() and timer_fixup_assert_init() for timers found in
 * ODEBUG_STATE_NOTAVAILABLE and warns whenever it is invoked.
 */
static void stub_timer(struct timer_list *unused)
{
        WARN_ON(1);
}

/*
 * timer_fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 *
 * An unknown timer is set up with stub_timer() as its callback and the
 * fixup is reported as done. Activating an already active timer only
 * triggers a warning; no fixup is performed.
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state == ODEBUG_STATE_NOTAVAILABLE) {
                timer_setup(timer, stub_timer, 0);
                return true;
        }

        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * timer_fixup_free is called when:
 * - an active object is freed
 *
 * The active timer is deleted synchronously before it is handed back to
 * debug_object_free(). Returns true if a fixup was performed.
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        timer_delete_sync(timer);
        debug_object_free(timer, &timer_debug_descr);
        return true;
}

/*
 * timer_fixup_assert_init is called when:
 * - an untracked/uninit-ed object is found
 *
 * Such a timer is set up with stub_timer() as its callback. Returns true
 * if a fixup was performed.
 */
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_NOTAVAILABLE)
                return false;

        timer_setup(timer, stub_timer, 0);
        return true;
}

/*
 * Debug objects descriptor for timer_list. Wires up the hint, static
 * object detection and fixup callbacks defined above.
 */
static const struct debug_obj_descr timer_debug_descr = {
        .name                        = "timer_list",
        .debug_hint                = timer_debug_hint,
        .is_static_object        = timer_is_static_object,
        .fixup_init                = timer_fixup_init,
        .fixup_activate                = timer_fixup_activate,
        .fixup_free                = timer_fixup_free,
        .fixup_assert_init        = timer_fixup_assert_init,
};

/* Report the initialization of @timer to debug objects. */
static inline void debug_timer_init(struct timer_list *timer)
{
        debug_object_init(timer, &timer_debug_descr);
}

/* Report the activation (enqueueing) of @timer to debug objects. */
static inline void debug_timer_activate(struct timer_list *timer)
{
        debug_object_activate(timer, &timer_debug_descr);
}

/* Report the deactivation (dequeueing) of @timer to debug objects. */
static inline void debug_timer_deactivate(struct timer_list *timer)
{
        debug_object_deactivate(timer, &timer_debug_descr);
}

/* Let debug objects verify that @timer has been initialized. */
static inline void debug_timer_assert_init(struct timer_list *timer)
{
        debug_object_assert_init(timer, &timer_debug_descr);
}

/*
 * Forward declaration: do_init_timer() is defined further down but is
 * already needed by timer_init_key_on_stack() below.
 */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key);

/*
 * timer_init_key_on_stack - initialize a timer and register it with debug
 *                             objects as an on-stack object
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key used for the timer's lockdep map
 *
 * Registers @timer via debug_object_init_on_stack() and then performs the
 * common initialization in do_init_timer().
 */
void timer_init_key_on_stack(struct timer_list *timer,
                             void (*func)(struct timer_list *),
                             unsigned int flags,
                             const char *name, struct lock_class_key *key)
{
        debug_object_init_on_stack(timer, &timer_debug_descr);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(timer_init_key_on_stack);

/*
 * Remove @timer from debug objects tracking via debug_object_free().
 *
 * NOTE(review): judging by the name this is the teardown counterpart of
 * timer_init_key_on_stack() - confirm against the callers.
 */
void timer_destroy_on_stack(struct timer_list *timer)
{
        debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(timer_destroy_on_stack);

#else
/* CONFIG_DEBUG_OBJECTS_TIMERS is not set: the debug objects hooks are no-ops. */
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif

/* Initialization hook: notify debug objects and the timer_init tracepoint. */
static inline void debug_init(struct timer_list *timer)
{
        debug_timer_init(timer);
        trace_timer_init(timer);
}

/* Dequeue hook: notify debug objects and the timer_cancel tracepoint. */
static inline void debug_deactivate(struct timer_list *timer)
{
        debug_timer_deactivate(timer);
        trace_timer_cancel(timer);
}

/*
 * Entry check used by the arm/delete paths: let debug objects verify that
 * @timer has been initialized. There is no tracepoint for this hook.
 */
static inline void debug_assert_init(struct timer_list *timer)
{
        debug_timer_assert_init(timer);
}

/*
 * Common timer initialization shared by timer_init_key() and
 * timer_init_key_on_stack(). Debug objects registration and tracing are
 * done by those callers, not here.
 */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key)
{
        /*
         * Start out not enqueued; compare detach_timer(), which sets pprev
         * to NULL in order to clear the pending state.
         */
        timer->entry.pprev = NULL;
        timer->function = func;
        /* Warn once about flags outside of TIMER_INIT_FLAGS and drop them. */
        if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
                flags &= TIMER_INIT_FLAGS;
        /* Store the current CPU number in the flags next to the caller's bits. */
        timer->flags = flags | raw_smp_processor_id();
        lockdep_init_map(&timer->lockdep_map, name, key, 0);
}

/**
 * timer_init_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags; bits outside of TIMER_INIT_FLAGS trigger a one-time
 *         warning and are ignored
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * timer_init_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 */
void timer_init_key(struct timer_list *timer,
                    void (*func)(struct timer_list *), unsigned int flags,
                    const char *name, struct lock_class_key *key)
{
        /* Debug objects and tracing first, then the actual field setup. */
        debug_init(timer);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(timer_init_key);

/*
 * Unlink @timer from the list it is enqueued on.
 *
 * When @clear_pending is set, pprev is set to NULL as well, which clears
 * the pending state. The next pointer is poisoned in either case.
 */
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
        debug_deactivate(timer);

        __hlist_del(&timer->entry);
        if (clear_pending)
                timer->entry.pprev = NULL;
        timer->entry.next = LIST_POISON2;
}

/*
 * Dequeue @timer from @base if it is pending.
 *
 * If @timer is the only entry in its bucket, the bucket's bit in
 * base->pending_map is cleared and a recalculation of the next expiry is
 * requested. @clear_pending is handed through to detach_timer().
 *
 * Returns 1 if the timer was pending and has been dequeued, 0 otherwise.
 */
static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
                             bool clear_pending)
{
        unsigned int bucket = timer_get_idx(timer);

        if (timer_pending(timer)) {
                if (hlist_is_singular_node(&timer->entry, &base->vectors[bucket])) {
                        __clear_bit(bucket, base->pending_map);
                        base->next_expiry_recalc = true;
                }

                detach_timer(timer, clear_pending);
                return 1;
        }

        return 0;
}

/*
 * Select the timer base of @cpu which matches the timer flags @tflags.
 *
 * Deferrable timers use BASE_DEF when NO_HZ_COMMON is enabled. All other
 * timers use BASE_LOCAL when TIMER_PINNED is set and BASE_GLOBAL if not.
 */
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
        int index;

        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                index = BASE_DEF;
        else if (tflags & TIMER_PINNED)
                index = BASE_LOCAL;
        else
                index = BASE_GLOBAL;

        return per_cpu_ptr(&timer_bases[index], cpu);
}

/*
 * Select the timer base of the current CPU which matches the timer flags
 * @tflags.
 *
 * Deferrable timers use BASE_DEF when NO_HZ_COMMON is enabled. All other
 * timers use BASE_LOCAL when TIMER_PINNED is set and BASE_GLOBAL if not.
 */
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
        int index;

        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                index = BASE_DEF;
        else if (tflags & TIMER_PINNED)
                index = BASE_LOCAL;
        else
                index = BASE_GLOBAL;

        return this_cpu_ptr(&timer_bases[index]);
}

/*
 * Look up the base a timer is associated with from its flags alone: the
 * CPU number is taken from the TIMER_CPUMASK bits of @tflags.
 */
static inline struct timer_base *get_timer_base(u32 tflags)
{
        return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}

/*
 * Move base->clk forward towards @basej without ever moving it backwards
 * and without jumping over the next expiry of @base.
 */
static inline void __forward_timer_base(struct timer_base *base,
                                        unsigned long basej)
{
        /*
         * Forwarding is only possible when @basej is past base->clk.
         * Anything else would rewind base->clk.
         */
        if (time_before_eq(basej, base->clk))
                return;

        /* Nothing expires up to @basej: jump straight to @basej. */
        if (time_after(base->next_expiry, basej)) {
                base->clk = basej;
                return;
        }

        /*
         * Otherwise stop at the next expiry value, unless that would move
         * the clock backwards, which is warned about.
         */
        if (!WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
                base->clk = base->next_expiry;
}

/* Forward @base against a single READ_ONCE() snapshot of jiffies. */
static inline void forward_timer_base(struct timer_base *base)
{
        __forward_timer_base(base, READ_ONCE(jiffies));
}

/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 *
 * Returns the base of @timer with base->lock held. Interrupts are disabled
 * and the previous interrupt state is saved in @flags. Callers in this file
 * release it with raw_spin_unlock_irqrestore(&base->lock, flags).
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
                                          unsigned long *flags)
        __acquires(timer->base->lock)
{
        for (;;) {
                struct timer_base *base;
                u32 tf;

                /*
                 * We need to use READ_ONCE() here, otherwise the compiler
                 * might re-read @tf between the check for TIMER_MIGRATING
                 * and spin_lock().
                 */
                tf = READ_ONCE(timer->flags);

                if (!(tf & TIMER_MIGRATING)) {
                        base = get_timer_base(tf);
                        raw_spin_lock_irqsave(&base->lock, *flags);
                        /*
                         * Recheck with the lock held: if the flags changed
                         * while the lock was being acquired, the locked
                         * base may not be the timer's base anymore. Drop
                         * the lock and retry in that case.
                         */
                        if (timer->flags == tf)
                                return base;
                        raw_spin_unlock_irqrestore(&base->lock, *flags);
                }
                cpu_relax();
        }
}

/*
 * Option bits for __mod_timer():
 *
 * MOD_TIMER_PENDING_ONLY - do not start a timer which is not pending
 * MOD_TIMER_REDUCE       - modify a pending timer only if it would then
 *                          expire earlier
 * MOD_TIMER_NOTPENDING   - skip the fast path for pending timers; passed by
 *                          the add_timer*() variants
 */
#define MOD_TIMER_PENDING_ONLY                0x01
#define MOD_TIMER_REDUCE                0x02
#define MOD_TIMER_NOTPENDING                0x04

/*
 * __mod_timer - Common implementation for arming and modifying a timer
 * @timer:        The timer to be armed or modified
 * @expires:        New absolute timeout in jiffies
 * @options:        Bitmask of MOD_TIMER_* option bits
 *
 * Return: 1 if @timer was pending, 0 if it was not pending or if it has
 * been shut down (@timer->function == NULL) and the operation was
 * discarded.
 */
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
        unsigned long clk = 0, flags, bucket_expiry;
        struct timer_base *base, *new_base;
        /* UINT_MAX means that no wheel index has been precalculated. */
        unsigned int idx = UINT_MAX;
        int ret = 0;

        debug_assert_init(timer);

        /*
         * This is a common optimization triggered by the networking code - if
         * the timer is re-modified to have the same timeout or ends up in the
         * same array bucket then just return:
         */
        if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
                /*
                 * The downside of this optimization is that it can result in
                 * larger granularity than you would get from adding a new
                 * timer with this expiry.
                 */
                long diff = timer->expires - expires;

                /* Lockless checks first: same expiry or nothing to reduce. */
                if (!diff)
                        return 1;
                if (options & MOD_TIMER_REDUCE && diff <= 0)
                        return 1;

                /*
                 * We lock timer base and calculate the bucket index right
                 * here. If the timer ends up in the same bucket, then we
                 * just update the expiry time and avoid the whole
                 * dequeue/enqueue dance.
                 */
                base = lock_timer_base(timer, &flags);
                /*
                 * Has @timer been shutdown? This needs to be evaluated
                 * while holding base lock to prevent a race against the
                 * shutdown code.
                 */
                if (!timer->function)
                        goto out_unlock;

                forward_timer_base(base);

                /* Repeat the reduce check now that the base lock is held. */
                if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
                    time_before_eq(timer->expires, expires)) {
                        ret = 1;
                        goto out_unlock;
                }

                clk = base->clk;
                idx = calc_wheel_index(expires, clk, &bucket_expiry);

                /*
                 * Retrieve and compare the array index of the pending
                 * timer. If it matches set the expiry to the new value so a
                 * subsequent call will exit in the expires check above.
                 */
                if (idx == timer_get_idx(timer)) {
                        if (!(options & MOD_TIMER_REDUCE))
                                timer->expires = expires;
                        else if (time_after(timer->expires, expires))
                                timer->expires = expires;
                        ret = 1;
                        goto out_unlock;
                }
        } else {
                base = lock_timer_base(timer, &flags);
                /*
                 * Has @timer been shutdown? This needs to be evaluated
                 * while holding base lock to prevent a race against the
                 * shutdown code.
                 */
                if (!timer->function)
                        goto out_unlock;

                forward_timer_base(base);
        }

        /* Dequeue @timer if it is pending. @ret records whether it was. */
        ret = detach_if_pending(timer, base, false);
        /* MOD_TIMER_PENDING_ONLY: never start a timer which was not pending. */
        if (!ret && (options & MOD_TIMER_PENDING_ONLY))
                goto out_unlock;

        new_base = get_timer_this_cpu_base(timer->flags);

        if (base != new_base) {
                /*
                 * We are trying to schedule the timer on the new base.
                 * However we can't change the timer's base while it is
                 * running, otherwise timer_delete_sync() can't detect that
                 * the timer's handler has not yet finished. This also
                 * guarantees that the timer is serialized wrt itself.
                 */
                if (likely(base->running_timer != timer)) {
                        /* See the comment in lock_timer_base() */
                        timer->flags |= TIMER_MIGRATING;

                        raw_spin_unlock(&base->lock);
                        base = new_base;
                        raw_spin_lock(&base->lock);
                        WRITE_ONCE(timer->flags,
                                   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
                        forward_timer_base(base);
                }
        }

        debug_timer_activate(timer);

        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
         * between calculating 'idx' and possibly switching the base, only
         * enqueue_timer() is required. Otherwise we need to (re)calculate
         * the wheel index via internal_add_timer().
         */
        if (idx != UINT_MAX && clk == base->clk)
                enqueue_timer(base, timer, idx, bucket_expiry);
        else
                internal_add_timer(base, timer);

out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}

/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:        The pending timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer_pending() behaves like mod_timer() for a pending timer, but it
 * never activates a timer which is not pending.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and not modified or was in
 *          shutdown state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires did
 *          not change the effective expiry time
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
        /* MOD_TIMER_PENDING_ONLY: bail out instead of starting an inactive timer. */
        return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);

/**
 * mod_timer - Modify a timer's timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     timer_delete(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the timer_delete() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already active timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *          state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires did
 *          not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
        /* No option bits: (re)arm the timer whether it is pending or not. */
        return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);

/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * timer_reduce() is very similar to mod_timer(), except that it will only
 * modify an enqueued timer if that would reduce the expiration time. If
 * @timer is not enqueued it starts the timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *          state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires
 *          would not make it expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
        /* MOD_TIMER_REDUCE: a pending timer is only ever moved to an earlier expiry. */
        return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
}
EXPORT_SYMBOL(timer_reduce);

/**
 * add_timer - Start a timer
 * @timer:        The timer to be started
 *
 * Arm @timer so that it expires at the absolute time @timer->expires,
 * which is measured in 'jiffies'. On expiry timer->function(timer) is
 * invoked from soft interrupt context.
 *
 * Both @timer->expires and @timer->function have to be set before this
 * function is called.
 *
 * A timer with @timer->function == NULL is not started; the request is
 * silently discarded.
 *
 * A @timer->expires value which is already in the past results in the
 * timer being queued to expire at the next timer tick.
 *
 * Only inactive timers can be started with this function. Invoking it on
 * an active timer is rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
        if (!WARN_ON_ONCE(timer_pending(timer)))
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);

/**
 * add_timer_local() - Start a timer on the local CPU
 * @timer:        The timer to be started
 *
 * Works like add_timer(), but sets the TIMER_PINNED flag of @timer before
 * starting it.
 *
 * See add_timer() for further details.
 */
void add_timer_local(struct timer_list *timer)
{
        if (!WARN_ON_ONCE(timer_pending(timer))) {
                timer->flags |= TIMER_PINNED;
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
        }
}
EXPORT_SYMBOL(add_timer_local);

/**
 * add_timer_global() - Start a timer without TIMER_PINNED flag set
 * @timer:        The timer to be started
 *
 * Works like add_timer(), but clears the TIMER_PINNED flag of @timer
 * before starting it.
 *
 * See add_timer() for further details.
 */
void add_timer_global(struct timer_list *timer)
{
        if (!WARN_ON_ONCE(timer_pending(timer))) {
                timer->flags &= ~TIMER_PINNED;
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
        }
}
EXPORT_SYMBOL(add_timer_global);

/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:        The timer to be started
 * @cpu:        The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU and
 * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in
 * the next round, add_timer_global() should be used instead as it unsets
 * the TIMER_PINNED flag.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
        struct timer_base *new_base, *base;
        unsigned long flags;

        debug_assert_init(timer);

        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        /* Make sure timer flags have TIMER_PINNED flag set */
        timer->flags |= TIMER_PINNED;

        new_base = get_timer_cpu_base(timer->flags, cpu);

        /*
         * If @timer was on a different CPU, it should be migrated with the
         * old base locked to prevent other operations proceeding with the
         * wrong base locked.  See lock_timer_base().
         */
        base = lock_timer_base(timer, &flags);
        /*
         * Has @timer been shutdown? This needs to be evaluated while
         * holding base lock to prevent a race against the shutdown code.
         */
        if (!timer->function)
                goto out_unlock;

        if (base != new_base) {
                /*
                 * Flag the migration before the old base lock is dropped:
                 * lock_timer_base() does not take a base lock while
                 * TIMER_MIGRATING is set. The WRITE_ONCE() below replaces
                 * the TIMER_BASEMASK bits with @cpu (presumably that mask
                 * includes TIMER_MIGRATING - TODO confirm against the flag
                 * definitions).
                 */
                timer->flags |= TIMER_MIGRATING;

                raw_spin_unlock(&base->lock);
                base = new_base;
                raw_spin_lock(&base->lock);
                WRITE_ONCE(timer->flags,
                           (timer->flags & ~TIMER_BASEMASK) | cpu);
        }
        forward_timer_base(base);

        debug_timer_activate(timer);
        internal_add_timer(base, timer);
out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);

/**
 * __timer_delete - Internal function: Deactivate a timer
 * @timer:        The timer to be deactivated
 * @shutdown:        If true, this indicates that the timer is about to be
 *                shutdown permanently.
 *
 * With @shutdown set, @timer->function is set to NULL while the timer base
 * lock is held, which prevents further rearming of the timer. Any attempt
 * to rearm @timer after this function returned is silently ignored in that
 * case.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete(struct timer_list *timer, bool shutdown)
{
        struct timer_base *base;
        unsigned long flags;
        int was_pending;

        debug_assert_init(timer);

        /*
         * Without @shutdown a timer which is not pending needs no work.
         *
         * With @shutdown the lock must be taken in any case: a concurrent
         * rearm could otherwise slip in between the lockless pending check
         * and the lock acquisition. Taking the lock ensures that such a
         * freshly enqueued timer is dequeued again and cannot reach the
         * expiry code with timer->function == NULL.
         *
         * If timer->function is executing right now, this also ensures
         * that the callback cannot requeue the timer.
         */
        if (!timer_pending(timer) && !shutdown)
                return 0;

        base = lock_timer_base(timer, &flags);
        was_pending = detach_if_pending(timer, base, true);
        if (shutdown)
                timer->function = NULL;
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return was_pending;
}

/**
 * timer_delete - Deactivate a timer
 * @timer:        The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * Nor does it prevent rearming of the timer.  If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete(struct timer_list *timer)
{
        /* shutdown == false: @timer->function stays intact, rearming remains possible. */
        return __timer_delete(timer, false);
}
EXPORT_SYMBOL(timer_delete);

/**
 * timer_shutdown - Deactivate a timer and prevent rearming
 * @timer:        The timer to be deactivated
 *
 * The function does not wait for an eventually running timer callback on a
 * different CPU but it prevents rearming of the timer. Any attempt to arm
 * @timer after this function returns will be silently ignored.
 *
 * This function is useful for teardown code and should only be used when
 * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown(struct timer_list *timer)
{
        /* shutdown == true: @timer->function is set to NULL under the base lock. */
        return __timer_delete(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown);

/**
 * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
 * @timer:        Timer to deactivate
 * @shutdown:        If true, this indicates that the timer is about to be
 *                shutdown permanently.
 *
 * With @shutdown set, @timer->function is set to NULL while the timer base
 * lock is held, which prevents further rearming of the timer. Any attempt
 * to rearm @timer after this function returned is silently ignored.
 *
 * Without @shutdown nothing prevents the timer from being rearmed right
 * after the base lock has been dropped. If that matters, the calling code
 * has to prevent it.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
        struct timer_base *base;
        unsigned long flags;
        int ret;

        debug_assert_init(timer);

        base = lock_timer_base(timer, &flags);

        if (base->running_timer == timer) {
                /* The callback is executing: leave the timer alone. */
                ret = -1;
        } else {
                ret = detach_if_pending(timer, base, true);
                if (shutdown)
                        timer->function = NULL;
        }

        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}

/**
 * timer_delete_sync_try - Try to deactivate a timer
 * @timer:        Timer to deactivate
 *
 * This function tries to deactivate a timer. On success the timer is not
 * queued and the timer callback function is not running on any CPU.
 *
 * This function does not guarantee that the timer cannot be rearmed right
 * after dropping the base lock. That needs to be prevented by the calling
 * code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int timer_delete_sync_try(struct timer_list *timer)
{
        /* shutdown == false: @timer->function is left untouched. */
        return __try_to_del_timer_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync_try);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT only: initialize the expiry lock of @base. */
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
        spin_lock_init(&base->expiry_lock);
}

/* PREEMPT_RT only: acquire the expiry lock of @base. */
static inline void timer_base_lock_expiry(struct timer_base *base)
{
        spin_lock(&base->expiry_lock);
}

/* PREEMPT_RT only: release the expiry lock of @base. */
static inline void timer_base_unlock_expiry(struct timer_base *base)
{
        spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 *
 * As the __releases()/__acquires() annotations state, base->lock and
 * base->expiry_lock are expected to be held on entry and both are held
 * again on return.
 */
static void timer_sync_wait_running(struct timer_base *base)
        __releases(&base->lock) __releases(&base->expiry_lock)
        __acquires(&base->expiry_lock) __acquires(&base->lock)
{
        /* del_timer_wait_running() raises timer_waiters while it waits. */
        if (atomic_read(&base->timer_waiters)) {
                /* Release base->lock first, then expiry_lock ... */
                raw_spin_unlock_irq(&base->lock);
                spin_unlock(&base->expiry_lock);
                /* ... and take them back in the reverse order. */
                spin_lock(&base->expiry_lock);
                raw_spin_lock_irq(&base->lock);
        }
}

/*
 * Slow path on PREEMPT_RT kernels, used when the fast path deletion of a
 * timer failed because the timer callback function was running.
 *
 * It prevents priority inversion if the softirq thread on a remote CPU
 * got preempted, and it prevents a live lock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 *
 * Nothing is done for timers which have TIMER_MIGRATING or TIMER_IRQSAFE
 * set.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
        u32 tf = READ_ONCE(timer->flags);
        struct timer_base *base;

        if (tf & (TIMER_MIGRATING | TIMER_IRQSAFE))
                return;

        base = get_timer_base(tf);

        /*
         * Flag the base as contended and take the expiry lock, which the
         * softirq holds across the timer callback. Release it right away
         * so the softirq can expire the next timer. The timer might in
         * theory be running again already, but that is more than unlikely
         * and merely results in another wait loop.
         */
        atomic_inc(&base->timer_waiters);
        spin_lock_bh(&base->expiry_lock);
        atomic_dec(&base->timer_waiters);
        spin_unlock_bh(&base->expiry_lock);
}
#else
/* CONFIG_PREEMPT_RT is not set: the expiry lock helpers are no-ops. */
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif

/**
 * __timer_delete_sync - Internal function: Deactivate a timer and wait
 *                         for the handler to finish.
 * @timer:        The timer to be deactivated
 * @shutdown:        If true, @timer->function will be set to NULL under the
 *                timer base lock which prevents rearming of @timer
 *
 * Without @shutdown the timer can be rearmed later on. If a concurrent
 * rearm is possible, i.e. one which happens after the base lock has been
 * dropped, then the return value is meaningless.
 *
 * With @shutdown, @timer->function is set to NULL while the timer base
 * lock is held, which prevents rearming of the timer. Attempts to rearm a
 * shutdown timer are silently ignored.
 *
 * A timer which should be used again after shutdown has to be initialized
 * again.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
        int ret;

#ifdef CONFIG_LOCKDEP
        unsigned long flags;

        /*
         * If lockdep gives a backtrace here, please reference the
         * synchronization rules documented at timer_delete_sync().
         */
        local_irq_save(flags);
        lock_map_acquire(&timer->lockdep_map);
        lock_map_release(&timer->lockdep_map);
        local_irq_restore(flags);
#endif
        /*
         * Calling this from hardirq context could lead to a deadlock,
         * unless the timer is an irqsafe one.
         */
        WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));

        /*
         * On PREEMPT_RT the slowpath in del_timer_wait_running() requires
         * that the caller is able to sleep.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
                lockdep_assert_preemption_enabled();

        /* Retry for as long as the callback is running. */
        for (;;) {
                ret = __try_to_del_timer_sync(timer, shutdown);
                if (likely(ret >= 0))
                        return ret;

                del_timer_wait_running(timer);
                cpu_relax();
        }
}

/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:        The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. Even if the lock has nothing to do with the timer in
 * question.  Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now timer_delete_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but it has
 * interrupted the softirq that CPU0 is waiting to finish.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * If such a guarantee is needed, e.g. for teardown situations then use
 * timer_shutdown_sync() instead.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
        /*
         * Thin wrapper: the second argument is false here, which is the only
         * difference to timer_shutdown_sync() at this level.
         */
        return __timer_delete_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync);

/**
 * timer_shutdown_sync - Shutdown a timer and prevent rearming
 * @timer: The timer to be shutdown
 *
 * When the function returns it is guaranteed that:
 *   - @timer is not queued
 *   - The callback function of @timer is not running
 *   - @timer cannot be enqueued again. Any attempt to rearm
 *     @timer is silently ignored.
 *
 * See timer_delete_sync() for synchronization rules.
 *
 * This function is useful for final teardown of an infrastructure where
 * the timer is subject to a circular dependency problem.
 *
 * A common pattern for this is a timer and a workqueue where the timer can
 * schedule work and work can arm the timer. On shutdown the workqueue must
 * be destroyed and the timer must be prevented from rearming. Unless the
 * code has conditionals like 'if (mything->in_shutdown)' to prevent that
 * there is no way to get this correct with timer_delete_sync().
 *
 * timer_shutdown_sync() is solving the problem. The correct ordering of
 * calls in this case is:
 *
 *        timer_shutdown_sync(&mything->timer);
 *        destroy_workqueue(mything->workqueue);
 *
 * After this 'mything' can be safely freed.
 *
 * This obviously implies that the timer is not required to be functional
 * for the rest of the shutdown operation.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown_sync(struct timer_list *timer)
{
        /*
         * Thin wrapper: same helper as timer_delete_sync(), but the second
         * argument is true here.
         */
        return __timer_delete_sync(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown_sync);

/*
 * Run a single timer callback.
 *
 * The call to @fn is bracketed by the expire entry/exit tracepoints and by
 * the lockdep map acquire/release. Afterwards the preempt count is compared
 * against the value sampled on entry. @baseclk is only handed to the entry
 * tracepoint.
 */
static void call_timer_fn(struct timer_list *timer,
                          void (*fn)(struct timer_list *),
                          unsigned long baseclk)
{
        int preempt_before = preempt_count();

#ifdef CONFIG_LOCKDEP
        /*
         * A callback is allowed to free its own timer, and lockdep has to
         * cope with that as well. Operate on a private copy of the map so
         * that no bogus "held lock freed" warning is raised and
         * timer->lockdep_map does not have to be looked at later on.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
        /*
         * Acquiring the lock map around fn(), here as well as in
         * timer_delete_sync(), couples the two lock chains.
         */
        lock_map_acquire(&lockdep_map);

        trace_timer_expire_entry(timer, baseclk);
        fn(timer);
        trace_timer_expire_exit(timer);

        lock_map_release(&lockdep_map);

        /* Nothing more to do if the callback left the preempt count balanced */
        if (preempt_count() == preempt_before)
                return;

        WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
                  fn, preempt_before, preempt_count());
        /*
         * Put the sampled preempt count back. This gives a decent chance
         * to survive and to extract information. If the callback kept a
         * lock held, bad luck, but that is not worse than the BUG() which
         * used to be here.
         */
        preempt_count_set(preempt_before);
}

/*
 * Run the callbacks of all timers queued on @head.
 *
 * Each timer is recorded in base->running_timer, taken off the list and its
 * callback is invoked via call_timer_fn() with base->lock dropped. The lock
 * is taken again before the next list entry is looked at, so the function
 * is entered and left with base->lock held.
 */
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
        /*
         * This value is required only for tracing. base->clk was
         * incremented directly before expire_timers was called. But expiry
         * is related to the old base->clk value.
         */
        unsigned long baseclk = base->clk - 1;

        while (!hlist_empty(head)) {
                struct timer_list *timer;
                void (*fn)(struct timer_list *);

                timer = hlist_entry(head->first, struct timer_list, entry);

                /* Mark the timer as running before it is taken off the list */
                base->running_timer = timer;
                detach_timer(timer, true);

                /* Sample the callback while base->lock is still held */
                fn = timer->function;

                if (WARN_ON_ONCE(!fn)) {
                        /* Should never happen. Emphasis on should! */
                        base->running_timer = NULL;
                        continue;
                }

                if (timer->flags & TIMER_IRQSAFE) {
                        /* Only the lock is dropped, no _irq variants here */
                        raw_spin_unlock(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock(&base->lock);
                        base->running_timer = NULL;
                } else {
                        /* Lock is dropped with the _irq unlock/lock variants */
                        raw_spin_unlock_irq(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock_irq(&base->lock);
                        base->running_timer = NULL;
                        /* Only done in the !TIMER_IRQSAFE case */
                        timer_sync_wait_running(base);
                }
        }
}

/*
 * Gather the buckets which expire at base->next_expiry.
 *
 * base->clk is set to base->next_expiry. Then, level by level, the bucket
 * matching that clock is checked: if its bit in base->pending_map is set,
 * the bit is cleared and the bucket list is moved into the next free entry
 * of @heads.
 *
 * Returns the number of entries of @heads which have been filled.
 */
static int collect_expired_timers(struct timer_base *base,
                                  struct hlist_head *heads)
{
        unsigned long level_clk = base->next_expiry;
        int lvl, filled = 0;

        base->clk = level_clk;

        for (lvl = 0; lvl < LVL_DEPTH; lvl++) {
                unsigned int bucket = (level_clk & LVL_MASK) + lvl * LVL_SIZE;

                if (__test_and_clear_bit(bucket, base->pending_map)) {
                        hlist_move_list(base->vectors + bucket, heads++);
                        filled++;
                }

                /* Stop unless the clock is aligned for the next level */
                if (level_clk & LVL_CLK_MASK)
                        break;

                /* Scale the clock to the granularity of the next level */
                level_clk >>= LVL_CLK_SHIFT;
        }

        return filled;
}

/*
 * Locate the next pending bucket within one level.
 *
 * The search starts at @offset + @clk and runs up to the end of the level.
 * If nothing is pending there, it wraps around and scans from the start of
 * the level (@offset) up to @offset + @clk.
 *
 * Returns the distance in buckets from @offset + @clk to the pending bucket
 * found, or -1 if no bucket of the level is pending.
 */
static int next_pending_bucket(struct timer_base *base, unsigned offset,
                               unsigned clk)
{
        unsigned first = offset + clk;
        unsigned limit = offset + LVL_SIZE;
        unsigned bit;

        /* Pass 1: current position up to the end of the level */
        bit = find_next_bit(base->pending_map, limit, first);
        if (bit < limit)
                return bit - first;

        /* Pass 2: wrapped around, start of the level up to the position */
        bit = find_next_bit(base->pending_map, first, offset);
        if (bit < first)
                return bit + LVL_SIZE - first;

        return -1;
}

/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 *
 * Store next expiry time in base->next_expiry. Also clears
 * base->next_expiry_recalc and updates base->timers_pending.
 */
static void timer_recalc_next_expiry(struct timer_base *base)
{
        unsigned long clk, next, adj;
        unsigned lvl, offset = 0;

        /*
         * Start with the furthest possible value. If it is still unchanged
         * after the loop, no pending bucket was found (see timers_pending
         * below).
         */
        next = base->clk + TIMER_NEXT_MAX_DELTA;
        clk = base->clk;
        for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
                int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
                unsigned long lvl_clk = clk & LVL_CLK_MASK;

                if (pos >= 0) {
                        /* Candidate expiry in units of this level ... */
                        unsigned long tmp = clk + (unsigned long) pos;

                        /* ... shifted back by LVL_SHIFT(lvl) */
                        tmp <<= LVL_SHIFT(lvl);
                        if (time_before(tmp, next))
                                next = tmp;

                        /*
                         * If the next expiration happens before we reach
                         * the next level, no need to check further.
                         */
                        if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
                                break;
                }
                /*
                 * Clock for the next level. If the current level clock lower
                 * bits are zero, we look at the next level as is. If not we
                 * need to advance it by one because that's going to be the
                 * next expiring bucket in that level. base->clk is the next
                 * expiring jiffy. So in case of:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    0
                 *
                 * we have to look at all levels @index 0. With
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    2
                 *
                 * LVL0 has the next expiring bucket @index 2. The upper
                 * levels have the next expiring bucket @index 1.
                 *
                 * In case that the propagation wraps the next level the same
                 * rules apply:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    F    2
                 *
                 * So after looking at LVL0 we get:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1
                 *  0    0    0    1    0
                 *
                 * So no propagation from LVL1 to LVL2 because that happened
                 * with the add already, but then we need to propagate further
                 * from LVL2 to LVL3.
                 *
                 * So the simple check whether the lower bits of the current
                 * level are 0 or not is sufficient for all cases.
                 */
                adj = lvl_clk ? 1 : 0;
                clk >>= LVL_CLK_SHIFT;
                clk += adj;
        }

        /* WRITE_ONCE() pairs with the READ_ONCE() users of next_expiry */
        WRITE_ONCE(base->next_expiry, next);
        base->next_expiry_recalc = false;
        /* Pending only if some level produced an earlier expiry */
        base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA);
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Compare the timer wheel expiry @expires with the next hrtimer event and
 * return whichever comes first. An hrtimer event which comes first is
 * clamped to @basem when it is already due and rounded up to a tick
 * boundary otherwise.
 */
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
        const u64 hr_next = hrtimer_get_next_event();
        u64 ticks;

        /*
         * The timer wheel event is first (or on par). This is always the
         * case with high resolution timers enabled, because
         * hrtimer_get_next_event() returns KTIME_MAX then.
         */
        if (hr_next >= expires)
                return expires;

        /*
         * The next hrtimer is already expired. Hand back the tick base
         * time so the tick is fired immediately.
         */
        if (basem >= hr_next)
                return basem;

        /*
         * High resolution timers are off, so hrtimers are expired in the
         * tick. Round up to the next jiffy to make sure that this tick
         * really expires the timer, which avoids a ping pong of the nohz
         * stop code.
         *
         * DIV_ROUND_UP_ULL is used to prevent gcc calling __divdi3
         */
        ticks = DIV_ROUND_UP_ULL(hr_next, TICK_NSEC);

        return ticks * TICK_NSEC;
}

/*
 * Return the next expiry (in jiffies) of @base, recalculating it first when
 * base->next_expiry_recalc is set. For a base without pending timers the
 * stored expiry is moved to @basej + TIMER_NEXT_MAX_DELTA.
 */
static unsigned long next_timer_interrupt(struct timer_base *base,
                                          unsigned long basej)
{
        if (base->next_expiry_recalc)
                timer_recalc_next_expiry(base);

        if (!base->timers_pending) {
                /*
                 * Empty base: push next_expiry into the future. Otherwise
                 * the timer softirq would be raised needlessly once the
                 * stale value is reached although no timer is pending.
                 *
                 * Keeping the value up to date also makes the next_expiry
                 * values of different bases easy to compare when looking
                 * for the base which holds the first pending timer.
                 */
                unsigned long far_future = basej + TIMER_NEXT_MAX_DELTA;

                WRITE_ONCE(base->next_expiry, far_future);
        }

        return base->next_expiry;
}

/*
 * Evaluate the next expiry of the local and the global base and store the
 * results, converted to clock monotonic relative to @basem, in @tevt.
 * Fields of @tevt which do not apply are left untouched, so callers preset
 * them.
 *
 * Returns the earlier of both expiry values in jiffies, clamped to @basej
 * when it is already in the past.
 */
static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
                                                struct timer_base *base_local,
                                                struct timer_base *base_global,
                                                struct timer_events *tevt)
{
        unsigned long exp_local, exp_global, first;
        bool local_wins;

        exp_local = next_timer_interrupt(base_local, basej);
        exp_global = next_timer_interrupt(base_global, basej);

        local_wins = time_before_eq(exp_local, exp_global);
        if (local_wins)
                first = exp_local;
        else
                first = exp_global;

        if (time_after(first, basej + 1)) {
                /*
                 * The first event is more than one tick away. Update the
                 * tevt.* values:
                 *
                 * The global event can be ignored if the local queue
                 * expires first. An empty queue needs no update either.
                 */
                if (!local_wins && base_global->timers_pending)
                        tevt->global = basem + (u64)(exp_global - basej) * TICK_NSEC;

                if (base_local->timers_pending)
                        tevt->local = basem + (u64)(exp_local - basej) * TICK_NSEC;

                return first;
        }

        /*
         * The first event is at max. one tick away. Use it and store it in
         * the local expiry value. The next global event is irrelevant in
         * this case and can be left as KTIME_MAX.
         */

        /* A tick which was missed already results in a 0 delta */
        if (time_before(first, basej))
                first = basej;
        tevt->local = basem + (u64)(first - basej) * TICK_NSEC;

        /*
         * Only the remote check requires the following, but doing it for
         * both call sites does not hurt:
         *
         * * Remote callers only take care of the global timers, because
         *   the local timers are handled by the CPU itself. If tevt->global
         *   is not updated with the already missed first global timer, that
         *   timer could be missed completely.
         *
         * * Local callers ignore tevt->global anyway when the first event
         *   is at max. one tick away.
         */
        if (!local_wins)
                tevt->global = tevt->local;

        return first;
}

# ifdef CONFIG_SMP
/**
 * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @tevt:        Pointer to the storage for the expiry values
 * @cpu:        Remote CPU
 *
 * The expiry values of the next pending local and global timer of @cpu are
 * stored in the struct @tevt points to. The field of an empty queue is set
 * to KTIME_MAX. The global field is set to KTIME_MAX as well when the local
 * event expires before the global event.
 *
 * The timer base locks of @cpu have to be held by the caller (use
 * timer_lock_remote_bases() for this purpose).
 */
void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
                                       struct timer_events *tevt,
                                       unsigned int cpu)
{
        struct timer_base *lbase = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *gbase = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        lockdep_assert_held(&lbase->lock);
        lockdep_assert_held(&gbase->lock);

        /* Both events start out as "nothing pending" */
        tevt->global = KTIME_MAX;
        tevt->local = KTIME_MAX;

        fetch_next_timer_interrupt(basej, basem, lbase, gbase, tevt);
}

/**
 * timer_unlock_remote_bases - unlock timer bases of cpu
 * @cpu:        Remote CPU
 *
 * Releases the lock of the BASE_GLOBAL base of @cpu first and the lock of
 * its BASE_LOCAL base afterwards.
 */
void timer_unlock_remote_bases(unsigned int cpu)
        __releases(timer_bases[BASE_LOCAL]->lock)
        __releases(timer_bases[BASE_GLOBAL]->lock)
{
        struct timer_base *lbase = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *gbase = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        raw_spin_unlock(&gbase->lock);
        raw_spin_unlock(&lbase->lock);
}

/**
 * timer_lock_remote_bases - lock timer bases of cpu
 * @cpu:        Remote CPU
 *
 * Takes the lock of the BASE_LOCAL base of @cpu first and the lock of its
 * BASE_GLOBAL base (nested) afterwards. Interrupts have to be disabled by
 * the caller.
 */
void timer_lock_remote_bases(unsigned int cpu)
        __acquires(timer_bases[BASE_LOCAL]->lock)
        __acquires(timer_bases[BASE_GLOBAL]->lock)
{
        struct timer_base *lbase = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *gbase = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        lockdep_assert_irqs_disabled();

        raw_spin_lock(&lbase->lock);
        raw_spin_lock_nested(&gbase->lock, SINGLE_DEPTH_NESTING);
}

/**
 * timer_base_is_idle() - Return whether timer base is set idle
 *
 * Return: the is_idle value of the BASE_LOCAL timer base of the current CPU.
 */
bool timer_base_is_idle(void)
{
        return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);
}

static void __run_timer_base(struct timer_base *base);

/**
 * timer_expire_remote() - expire global timers of cpu
 * @cpu:        Remote CPU
 *
 * Expire timers of global base of remote CPU.
 */
void timer_expire_remote(unsigned int cpu)
{
        struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        __run_timer_base(base);
}

/*
 * Hand the first global event over to the timer migration hierarchy and
 * pull the local expiry forward when the hierarchy reports an earlier
 * event.
 *
 * The tmigr function depends on the state of the caller:
 *  - @timer_base_idle is set:        tmigr_cpu_new_timer()
 *  - @tick_stop_path is non-NULL:    tmigr_cpu_deactivate()
 *  - otherwise:                      tmigr_quick_check()
 *
 * @tick_stop_path is only tested against NULL, it is never dereferenced
 * here.
 */
static void timer_use_tmigr(unsigned long basej, u64 basem,
                            unsigned long *nextevt, bool *tick_stop_path,
                            bool timer_base_idle, struct timer_events *tevt)
{
        u64 tmigr_next, delta_ticks;

        if (timer_base_idle)
                tmigr_next = tmigr_cpu_new_timer(tevt->global);
        else if (tick_stop_path)
                tmigr_next = tmigr_cpu_deactivate(tevt->global);
        else
                tmigr_next = tmigr_quick_check(tevt->global);

        /*
         * When the CPU is the last one going idle in the timer migration
         * hierarchy, it has to wake up in time to handle remote timers.
         * As long as other CPUs are still active, the value is KTIME_MAX
         * and nothing has to be adjusted.
         */
        if (tmigr_next >= tevt->local)
                return;

        /* A tick which was missed already results in a 0 delta */
        if (tmigr_next < basem)
                tmigr_next = basem;

        delta_ticks = div_u64(tmigr_next - basem, TICK_NSEC);

        *nextevt = basej + (unsigned long)delta_ticks;
        tevt->local = tmigr_next;
}
# else
/*
 * !SMP variant: there is no timer migration hierarchy to consult. Only
 * @tevt is touched, all other arguments are unused.
 */
static void timer_use_tmigr(unsigned long basej, u64 basem,
                            unsigned long *nextevt, bool *tick_stop_path,
                            bool timer_base_idle, struct timer_events *tevt)
{
        /*
         * The first event, no matter whether local or global, has to end
         * up in tevt->local. Otherwise a timer could be missed on !SMP
         * systems.
         */
        u64 first = min_t(u64, tevt->local, tevt->global);

        tevt->local = first;
}
# endif /* CONFIG_SMP */

/*
 * Common worker for get_next_timer_interrupt() and
 * timer_base_try_to_set_idle().
 *
 * @idle is NULL when called via get_next_timer_interrupt(); in that case the
 * is_idle state of the bases is not modified. Otherwise the resulting
 * is_idle value of the local base is stored in *@idle.
 *
 * Takes the locks of the local and the global base of the current CPU for
 * the evaluation and drops them before returning.
 *
 * Returns the result of cmp_next_hrtimer_event() for the computed local
 * expiry. On an offline CPU KTIME_MAX is returned and *@idle (if provided)
 * is set to true.
 */
static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
                                             bool *idle)
{
        struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
        struct timer_base *base_local, *base_global;
        unsigned long nextevt;
        bool idle_is_possible;

        /*
         * When the CPU is offline, the tick is cancelled and nothing is supposed
         * to try to stop it.
         */
        if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
                if (idle)
                        *idle = true;
                /* Still the KTIME_MAX preset from the initializer above */
                return tevt.local;
        }

        base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
        base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);

        /* Lock order: local base first, global base nested */
        raw_spin_lock(&base_local->lock);
        raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);

        nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
                                             base_global, &tevt);

        /*
         * If the next event is only one jiffy ahead there is no need to call
         * timer migration hierarchy related functions. The value for the next
         * global timer in @tevt struct equals then KTIME_MAX. This is also
         * true, when the timer base is idle.
         *
         * The proper timer migration hierarchy function depends on the callsite
         * and whether timer base is idle or not. @nextevt will be updated when
         * this CPU needs to handle the first timer migration hierarchy
         * event. See timer_use_tmigr() for detailed information.
         */
        idle_is_possible = time_after(nextevt, basej + 1);
        if (idle_is_possible)
                timer_use_tmigr(basej, basem, &nextevt, idle,
                                base_local->is_idle, &tevt);

        /*
         * We have a fresh next event. Check whether we can forward the
         * base.
         */
        __forward_timer_base(base_local, basej);
        __forward_timer_base(base_global, basej);

        /*
         * Set base->is_idle only when caller is timer_base_try_to_set_idle()
         */
        if (idle) {
                /*
                 * Bases are idle if the next event is more than a tick
                 * away. Caution: @nextevt could have changed by enqueueing a
                 * global timer into timer migration hierarchy. Therefore a new
                 * check is required here.
                 *
                 * If the base is marked idle then any timer add operation must
                 * forward the base clk itself to keep granularity small. This
                 * idle logic is only maintained for the BASE_LOCAL and
                 * BASE_GLOBAL base, deferrable timers may still see large
                 * granularity skew (by design).
                 */
                if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
                        base_local->is_idle = true;
                        /*
                         * Global timers queued locally while running in a task
                         * in nohz_full mode need a self-IPI to kick reprogramming
                         * in IRQ tail.
                         */
                        if (tick_nohz_full_cpu(base_local->cpu))
                                base_global->is_idle = true;
                        trace_timer_base_idle(true, base_local->cpu);
                }
                *idle = base_local->is_idle;

                /*
                 * When timer base is not set idle, undo the effect of
                 * tmigr_cpu_deactivate() to prevent inconsistent states - active
                 * timer base but inactive timer migration hierarchy.
                 *
                 * When timer base was already marked idle, nothing will be
                 * changed here.
                 */
                if (!base_local->is_idle && idle_is_possible)
                        tmigr_cpu_activate();
        }

        /* Unlock in reverse order of locking */
        raw_spin_unlock(&base_global->lock);
        raw_spin_unlock(&base_local->lock);

        return cmp_next_hrtimer_event(basem, tevt.local);
}

/**
 * get_next_timer_interrupt() - return the time (clock mono) of the next timer
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 *
 * If timer of global base was queued into timer migration hierarchy, first
 * global timer is not taken into account. If it was the last CPU of timer
 * migration hierarchy going idle, first global event is taken into account.
 *
 * Return: the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
        /* NULL @idle: query only, the is_idle state of the bases is not set */
        return __get_next_timer_interrupt(basej, basem, NULL);
}

/**
 * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @idle:        on entry *idle contains the information whether tick was
 *                already stopped; on return it holds the value of
 *                timer_base->is_idle
 *
 * Return: KTIME_MAX when the tick was already stopped (*@idle set on entry).
 * Otherwise the tick aligned clock monotonic time of the next pending timer,
 * or KTIME_MAX if no timer is pending.
 */
u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
{
        /* Only evaluate the bases while the tick is not stopped yet */
        if (!*idle)
                return __get_next_timer_interrupt(basej, basem, idle);

        return KTIME_MAX;
}

/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Clears is_idle of the BASE_LOCAL base of the current CPU (and of its
 * BASE_GLOBAL base on nohz_full CPUs), emits the timer_base_idle tracepoint
 * and calls tmigr_cpu_activate().
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
        /*
         * We do this unlocked. The worst outcome is a remote pinned timer
         * enqueue sending a pointless IPI, but taking the lock would just
         * make the window for sending the IPI a few instructions smaller
         * for the cost of taking the lock in the exit from idle
         * path. Required for BASE_LOCAL only.
         */
        __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
        /*
         * Counterpart of __get_next_timer_interrupt(), which sets is_idle of
         * the global base only for nohz_full CPUs.
         */
        if (tick_nohz_full_cpu(smp_processor_id()))
                __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
        trace_timer_base_idle(false, smp_processor_id());

        /* Activate without holding the timer_base->lock */
        tmigr_cpu_activate();
}
#endif

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 *
 * Must be called with base->lock held. The lock is dropped and re-taken by
 * expire_timers() around each callback invocation.
 */
static inline void __run_timers(struct timer_base *base)
{
        struct hlist_head heads[LVL_DEPTH];
        int levels;

        lockdep_assert_held(&base->lock);

        /*
         * running_timer is set by expire_timers() while a callback is
         * invoked with the lock dropped. Do not start another expiry run on
         * this base in that case.
         */
        if (base->running_timer)
                return;

        while (time_after_eq(jiffies, base->clk) &&
               time_after_eq(jiffies, base->next_expiry)) {
                levels = collect_expired_timers(base, heads);
                /*
                 * The two possible reasons for not finding any expired
                 * timer at this clk are that all matching timers have been
                 * dequeued or no timer has been queued since
                 * base::next_expiry was set to base::clk +
                 * TIMER_NEXT_MAX_DELTA.
                 */
                WARN_ON_ONCE(!levels && !base->next_expiry_recalc
                             && base->timers_pending);
                /*
                 * While executing timers, base->clk is set 1 offset ahead of
                 * jiffies to avoid endless requeuing to current jiffies.
                 */
                base->clk++;
                timer_recalc_next_expiry(base);

                /* Expire the collected lists, last filled entry first */
                while (levels--)
                        expire_timers(base, heads + levels);
        }
}

/*
 * Expire the timers of @base if its next expiry has been reached. The
 * expiry run is done with the base expiry lock and base->lock held.
 */
static void __run_timer_base(struct timer_base *base)
{
        /*
         * Lockless check. A remote CPU can update next_expiry under the
         * lock at the same time, so this can race.
         */
        if (time_after_eq(jiffies, READ_ONCE(base->next_expiry))) {
                timer_base_lock_expiry(base);
                raw_spin_lock_irq(&base->lock);
                __run_timers(base);
                raw_spin_unlock_irq(&base->lock);
                timer_base_unlock_expiry(base);
        }
}

static void run_timer_base(int index)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[index]);

        __run_timer_base(base);
}

/*
 * This function runs timers and the timer-tq in bottom half context.
 *
 * BASE_LOCAL is always processed. BASE_GLOBAL, BASE_DEF and the remote
 * handling of the timer migration hierarchy are only dealt with when
 * CONFIG_NO_HZ_COMMON is enabled.
 */
static __latent_entropy void run_timer_softirq(void)
{
        run_timer_base(BASE_LOCAL);

        if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
                return;

        run_timer_base(BASE_GLOBAL);
        run_timer_base(BASE_DEF);

        if (is_timers_nohz_active())
                tmigr_handle_remote();
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 *
 * Runs the hrtimer queues and raises TIMER_SOFTIRQ as soon as one of the
 * timer bases of this CPU has reached its next expiry or, while looking at
 * BASE_DEF, tmigr_requires_handle_remote() asks for it.
 */
static void run_local_timers(void)
{
        struct timer_base *bases = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
        int idx;

        hrtimer_run_queues();

        for (idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *base = &bases[idx];

                /*
                 * The softirq is only raised when it is required.
                 *
                 * A remote CPU can write timer_base::next_expiry while it
                 * holds the lock. When such a write happens at the same
                 * time as the lockless local read, a sanity checker could
                 * complain about data corruption.
                 *
                 * A remote CPU writes timer_base::next_expiry in two
                 * situations:
                 *
                 * 1. The remote CPU expires global timers of this CPU and
                 * afterwards updates timer_base::next_expiry of BASE_GLOBAL
                 * in next_timer_interrupt() or timer_recalc_next_expiry().
                 * Reading the not yet updated value results at worst in a
                 * superfluous raise of the timer softirq.
                 *
                 * 2. The remote CPU enqueues a new first pinned timer and
                 * therefore updates timer_base::next_expiry of BASE_LOCAL.
                 * Missing this update is not a problem when the CPU was
                 * idle before, because an IPI is executed nevertheless.
                 * When the CPU was not idle and the update is missed, the
                 * timer expires one jiffy late - bad luck.
                 *
                 * These unlikely corner cases, with a one jiffy delay or a
                 * superfluous raise of the softirq as worst outcome, are
                 * cheaper than always doing the check with the lock held.
                 *
                 * The possible remote writers use WRITE_ONCE(), which is
                 * why the local reader uses READ_ONCE().
                 */
                if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
                    (idx == BASE_DEF && tmigr_requires_handle_remote())) {
                        raise_timer_softirq(TIMER_SOFTIRQ);
                        return;
                }
        }
}

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 *
 * Besides the accounting this also runs the local timers, the RCU and
 * scheduler tick hooks, the irq_work tick (CONFIG_IRQ_WORK, in_irq() only)
 * and the posix CPU timers (CONFIG_POSIX_TIMERS).
 */
void update_process_times(int user_tick)
{
        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(current, user_tick);
        run_local_timers();
        rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_tick();
#endif
        sched_tick();

        if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
                return;

        run_posix_cpu_timers();
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Move every timer queued on @head over to @new_base: detach it, replace
 * the TIMER_BASEMASK bits of its flags with the CPU of @new_base and
 * enqueue it there.
 */
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
        int target_cpu = new_base->cpu;

        for (;;) {
                struct timer_list *t;

                if (hlist_empty(head))
                        break;

                t = hlist_entry(head->first, struct timer_list, entry);
                detach_timer(t, false);
                /* Single store: swap the base bits for the new CPU */
                t->flags = (t->flags & ~TIMER_BASEMASK) | target_cpu;
                internal_add_timer(new_base, t);
        }
}

/*
 * Reset the bookkeeping of all timer bases of @cpu: the clock is set to the
 * current jiffies, the next expiry to the maximum delta from there, and the
 * recalc/pending/idle state is cleared.
 *
 * Always returns 0.
 */
int timers_prepare_cpu(unsigned int cpu)
{
        int idx;

        for (idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *tb = per_cpu_ptr(&timer_bases[idx], cpu);

                tb->clk = jiffies;
                tb->next_expiry = tb->clk + TIMER_NEXT_MAX_DELTA;

                /* Nothing queued, nothing to recalculate, not idle */
                tb->next_expiry_recalc = false;
                tb->timers_pending = false;
                tb->is_idle = false;
        }

        return 0;
}

/*
 * Move all timers which are still queued on the timer bases of @cpu over to
 * the matching bases of the CPU this function runs on.
 *
 * Always returns 0.
 */
int timers_dead_cpu(unsigned int cpu)
{
        struct timer_base *old_base;
        struct timer_base *new_base;
        int b, i;

        for (b = 0; b < NR_BASES; b++) {
                old_base = per_cpu_ptr(&timer_bases[b], cpu);
                /* Paired with put_cpu_ptr() at the end of the iteration */
                new_base = get_cpu_ptr(&timer_bases[b]);
                /*
                 * The caller is globally serialized and nobody else
                 * takes two locks at once, deadlock is not possible.
                 *
                 * NOTE(review): other code in this file does hold the
                 * BASE_LOCAL and BASE_GLOBAL locks of one CPU together (e.g.
                 * timer_lock_remote_bases()). Here the two locks belong to
                 * the same base index of two different CPUs - confirm that
                 * the statement above is meant for that case only.
                 */
                raw_spin_lock_irq(&new_base->lock);
                raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

                /*
                 * The current CPUs base clock might be stale. Update it
                 * before moving the timers over.
                 */
                forward_timer_base(new_base);

                WARN_ON_ONCE(old_base->running_timer);
                old_base->running_timer = NULL;

                /* Walk every bucket of the old wheel */
                for (i = 0; i < WHEEL_SIZE; i++)
                        migrate_timer_list(new_base, old_base->vectors + i);

                /* Unlock in reverse order of locking */
                raw_spin_unlock(&old_base->lock);
                raw_spin_unlock_irq(&new_base->lock);
                put_cpu_ptr(&timer_bases);
        }
        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Boot-time initialisation of all NR_BASES timer bases belonging to @cpu:
 * record the owning CPU, initialise the base lock, anchor the base clock at
 * the current jiffies value and set next_expiry TIMER_NEXT_MAX_DELTA ahead.
 */
static void __init init_timer_cpu(int cpu)
{
        struct timer_base *base;
        int i;

        for (i = 0; i < NR_BASES; i++) {
                base = per_cpu_ptr(&timer_bases[i], cpu);
                base->cpu = cpu;
                raw_spin_lock_init(&base->lock);
                base->clk = jiffies;
                /* Nothing queued yet: park next_expiry at the maximum delta. */
                base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
                timer_base_init_expiry_lock(base);
        }
}

/* Run init_timer_cpu() once for every possible CPU. */
static void __init init_timer_cpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                init_timer_cpu(cpu);
        }
}

/*
 * Boot-time entry point of the timer subsystem: initialise the per-CPU
 * timer bases, set up the posix CPU timer work, and register
 * run_timer_softirq() as the handler for TIMER_SOFTIRQ.
 */
void __init timers_init(void)
{
        init_timer_cpus();
        posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}







































































































































































































































































































































































































   39 



















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
// SPDX-License-Identifier: GPL-2.0+
/*
 *  Universal/legacy driver for 8250/16550-type serial ports
 *
 *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
 *
 *  Copyright (C) 2001 Russell King.
 *
 *  Supports:
 *              early_serial_setup() ports
 *              userspace-configurable "phantom" ports
 *              serial8250_register_8250_port() ports
 */

#include <linux/acpi.h>
#include <linux/hashtable.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/sysrq.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/tty.h>
#include <linux/ratelimit.h>
#include <linux/tty_flip.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
#include <linux/nmi.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/string_helpers.h>
#include <linux/uaccess.h>
#include <linux/io.h>

#include <asm/irq.h>

#include "8250.h"

#define PASS_LIMIT        512

/*
 * Per-IRQ bookkeeping: one instance exists for each IRQ number that has at
 * least one 8250 port linked to it. It is passed as dev_id to
 * serial8250_interrupt().
 */
struct irq_info {
        struct                        hlist_node node; /* Entry in the irq_lists hash table */
        int                        irq;        /* IRQ number, used as the hash key */
        spinlock_t                lock;        /* Protects list not the hash */
        struct list_head        *head;        /* uart_8250_port.list of one linked port; NULL if none */
};

/*
 * Hash table of struct irq_info keyed by IRQ number, plus the mutex taken
 * by the code that looks up, inserts into or walks that table.
 */
#define IRQ_HASH_BITS                5        /* Can be adjusted later */
static DEFINE_HASHTABLE(irq_lists, IRQ_HASH_BITS);
static DEFINE_MUTEX(hash_mutex);        /* Used to walk the hash */

/*
 * This is the serial driver's interrupt routine.
 *
 * Arjan thinks the old way was overly complex, so it got simplified.
 * Alan disagrees, saying that we need the complexity to handle the weird
 * nature of ISA shared interrupts.  (This is a special exception.)
 *
 * In order to handle ISA shared interrupts properly, we need to check
 * that all ports have been serviced, and therefore the ISA interrupt
 * line has been de-asserted.
 *
 * This means we need to loop through all ports, checking that they
 * don't have an interrupt pending.
 */
/*
 * Shared interrupt handler. @dev_id is the struct irq_info of the IRQ.
 *
 * With i->lock held, the circular list of ports starting at i->head is
 * walked and each port's handle_irq() callback is invoked. The walk stops
 * once a complete round has been made in which no port reported work, or
 * when the PASS_LIMIT safety bound on full passes is exceeded.
 *
 * Returns IRQ_RETVAL(handled), where handled is set if any port's
 * handle_irq() returned non-zero.
 */
static irqreturn_t serial8250_interrupt(int irq, void *dev_id)
{
        struct irq_info *i = dev_id;
        struct list_head *l, *end = NULL;
        int pass_counter = 0, handled = 0;

        guard(spinlock)(&i->lock);

        l = i->head;
        do {
                struct uart_8250_port *up = list_entry(l, struct uart_8250_port, list);
                struct uart_port *port = &up->port;

                /*
                 * 'end' remembers the first port of the current run of ports
                 * that had nothing to do. Any port that did handle something
                 * resets it, forcing another full round.
                 */
                if (port->handle_irq(port)) {
                        handled = 1;
                        end = NULL;
                } else if (end == NULL)
                        end = l;

                l = l->next;

                /* Count wrap-arounds to the list head and bail out if stuck. */
                if (l == i->head && pass_counter++ > PASS_LIMIT)
                        break;
        } while (l != end);

        return IRQ_RETVAL(handled);
}

/*
 * To support ISA shared interrupts, we need to have one interrupt
 * handler that ensures that the IRQ line has been deasserted
 * before returning.  Failing to do this will result in the IRQ
 * line being stuck active, and, since ISA irqs are edge triggered,
 * no more IRQs will be seen.
 */
/*
 * Remove @up from the port list of @i and, if that was the last port,
 * remove @i from the hash table and free it.
 *
 * NOTE(review): the hash removal below is done without taking hash_mutex in
 * this function, and i->head is re-read after i->lock has been dropped;
 * callers are presumably expected to hold hash_mutex - confirm at all call
 * sites.
 */
static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up)
{
        spin_lock_irq(&i->lock);

        if (!list_empty(i->head)) {
                /* Other ports remain: move the head off @up before deleting it. */
                if (i->head == &up->list)
                        i->head = i->head->next;
                list_del(&up->list);
        } else {
                /* Single-entry list: that entry must be @up itself. */
                BUG_ON(i->head != &up->list);
                i->head = NULL;
        }
        spin_unlock_irq(&i->lock);
        /* List empty so throw away the hash node */
        if (i->head == NULL) {
                hlist_del(&i->node);
                kfree(i);
        }
}

/*
 * Either:
 * - find the corresponding info in the hashtable and return it, or
 * - allocate a new one, add it to the hashtable and return it.
 */
static struct irq_info *serial_get_or_create_irq_info(const struct uart_8250_port *up)
{
        struct irq_info *i;

        guard(mutex)(&hash_mutex);

        hash_for_each_possible(irq_lists, i, node, up->port.irq)
                if (i->irq == up->port.irq)
                        return i;

        i = kzalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                return ERR_PTR(-ENOMEM);

        spin_lock_init(&i->lock);
        i->irq = up->port.irq;
        hash_add(irq_lists, &i->node, i->irq);

        return i;
}

static int serial_link_irq_chain(struct uart_8250_port *up)
{
        struct irq_info *i;
        int ret;

        i = serial_get_or_create_irq_info(up);
        if (IS_ERR(i))
                return PTR_ERR(i);

        scoped_guard(spinlock_irq, &i->lock) {
                if (i->head) {
                        list_add(&up->list, i->head);

                        return 0;
                }

                INIT_LIST_HEAD(&up->list);
                i->head = &up->list;
        }

        ret = request_irq(up->port.irq, serial8250_interrupt, up->port.irqflags, up->port.name, i);
        if (ret < 0)
                serial_do_unlink(i, up);

        return ret;
}

static void serial_unlink_irq_chain(struct uart_8250_port *up)
{
        struct irq_info *i;

        guard(mutex)(&hash_mutex);

        hash_for_each_possible(irq_lists, i, node, up->port.irq)
                if (i->irq == up->port.irq) {
                        if (WARN_ON(i->head == NULL))
                                return;

                        if (list_empty(i->head))
                                free_irq(up->port.irq, i);

                        serial_do_unlink(i, up);

                        return;
                }

        WARN_ON(1);
}

/*
 * This function is used to handle ports that do not have an
 * interrupt.  This doesn't work very well for 16450's, but gives
 * barely passable results for a 16550A.  (Although at the expense
 * of much CPU overhead).
 */
static void serial8250_timeout(struct timer_list *t)
{
        struct uart_8250_port *up = timer_container_of(up, t, timer);

        up->port.handle_irq(&up->port);
        mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port));
}

/*
 * Backup timer callback (installed by univ8250_setup_timer() for ports
 * flagged with UART_BUG_THRE).
 *
 * Under the port lock it checks whether a transmit interrupt appears to have
 * been missed and, if so, pushes characters out via serial8250_tx_chars().
 * The timer is then re-armed for the poll timeout plus 0.2s.
 */
static void serial8250_backup_timeout(struct timer_list *t)
{
        struct uart_8250_port *up = timer_container_of(up, t, timer);
        unsigned int iir, ier = 0, lsr;
        unsigned long flags;

        uart_port_lock_irqsave(&up->port, &flags);

        /*
         * Must disable interrupts or else we risk racing with the interrupt
         * based handler.
         */
        if (up->port.irq) {
                /* Save the current IER so it can be restored below. */
                ier = serial_in(up, UART_IER);
                serial_out(up, UART_IER, 0);
        }

        iir = serial_in(up, UART_IIR);

        /*
         * This should be a safe test for anyone who doesn't trust the
         * IIR bits on their UART, but it's specifically designed for
         * the "Diva" UART used on the management processor on many HP
         * ia64 and parisc boxes.
         */
        lsr = serial_lsr_in(up);
        /*
         * IIR reports nothing pending, yet THRI is enabled, there is data
         * (or an x_char) to send and the transmitter holding register is
         * empty: treat it as a pending THRI in the local copy of IIR.
         */
        if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) &&
            (!kfifo_is_empty(&up->port.state->port.xmit_fifo) ||
             up->port.x_char) &&
            (lsr & UART_LSR_THRE)) {
                iir &= ~(UART_IIR_ID | UART_IIR_NO_INT);
                iir |= UART_IIR_THRI;
        }

        if (!(iir & UART_IIR_NO_INT))
                serial8250_tx_chars(up);

        /* Restore the interrupt enables saved above. */
        if (up->port.irq)
                serial_out(up, UART_IER, ier);

        uart_port_unlock_irqrestore(&up->port, flags);

        /* Standard timer interval plus 0.2s to keep the port running */
        mod_timer(&up->timer,
                jiffies + uart_poll_timeout(&up->port) + HZ / 5);
}

/*
 * Arm the per-port timer where one is needed: the backup timer for ports
 * flagged with UART_BUG_THRE, and/or the polling timer for ports without an
 * IRQ.
 */
static void univ8250_setup_timer(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;

        /*
         * Ports with UART_BUG_THRE set in up->bugs get the backup timeout
         * handler, armed for the poll timeout plus 0.2s.
         *
         * NOTE(review): the previous wording here referred to a check
         * "above" that only gives an accurate result the first time the
         * port is opened; no such check exists in this function, so the
         * flag is presumably detected elsewhere and preserved - confirm.
         */
        if (up->bugs & UART_BUG_THRE) {
                pr_debug("%s - using backup timer\n", port->name);

                up->timer.function = serial8250_backup_timeout;
                mod_timer(&up->timer, jiffies +
                          uart_poll_timeout(port) + HZ / 5);
        }

        /*
         * If the "interrupt" for this port doesn't correspond with any
         * hardware interrupt, we use a timer-based system.  The original
         * driver used to do this with IRQ0.
         */
        if (!port->irq)
                mod_timer(&up->timer, jiffies + uart_poll_timeout(port));
}

/*
 * Link @up into the shared-IRQ chain of its IRQ. Ports with no IRQ have
 * nothing to link and report success.
 */
static int univ8250_setup_irq(struct uart_8250_port *up)
{
        if (!up->port.irq)
                return 0;

        return serial_link_irq_chain(up);
}

/*
 * Stop the port timer, restore the default timer callback and, for ports
 * that have an IRQ, remove @up from the shared-IRQ chain.
 */
static void univ8250_release_irq(struct uart_8250_port *up)
{
        timer_delete_sync(&up->timer);
        up->timer.function = serial8250_timeout;

        if (!up->port.irq)
                return;

        serial_unlink_irq_chain(up);
}

/*
 * univ8250_port_base_ops is set once by serial8250_setup_port() to the ops
 * that serial8250_init_port() installed; univ8250_port_ops is the table each
 * port set up by serial8250_setup_port() is pointed at afterwards.
 */
const struct uart_ops *univ8250_port_base_ops;
struct uart_ops univ8250_port_ops;

/* IRQ/timer hooks implemented in this file, installed as up->ops. */
static const struct uart_8250_ops univ8250_driver_ops = {
        .setup_irq        = univ8250_setup_irq,
        .release_irq        = univ8250_release_irq,
        .setup_timer        = univ8250_setup_timer,
};

/* Static table of port slots, indexed by serial line number. */
static struct uart_8250_port serial8250_ports[UART_NR];

/**
 * serial8250_get_port - retrieve struct uart_8250_port
 * @line: serial line number
 *
 * This function retrieves struct uart_8250_port for the specific line.
 * This struct *must* *not* be used to perform a 8250 or serial core operation
 * which is not accessible otherwise. Its only purpose is to make the struct
 * accessible to the runtime-pm callbacks for context suspend/restore.
 * The lock assumption made here is none because runtime-pm suspend/resume
 * callbacks should not be invoked if there is any operation performed on the
 * port.
 */
struct uart_8250_port *serial8250_get_port(int line)
{
        /* @line indexes serial8250_ports[] directly; it is not range-checked here. */
        return &serial8250_ports[line];
}
EXPORT_SYMBOL_GPL(serial8250_get_port);

/* Set UPQ_NO_TXEN_TEST on the port when skip_txen_test is enabled. */
static inline void serial8250_apply_quirks(struct uart_8250_port *up)
{
        if (skip_txen_test)
                up->port.quirks |= UPQ_NO_TXEN_TEST;
}

/*
 * Initialise slot @index of serial8250_ports[] and return it.
 *
 * The slot gets its line and port_id set to @index, is run through
 * serial8250_init_port(), is switched to univ8250_port_ops (remembering the
 * original ops in univ8250_port_base_ops the first time), gets its polling
 * timer and driver ops set up, and finally has the defaults applied.
 *
 * Returns the port, or NULL if @index is outside [0, UART_NR).
 */
struct uart_8250_port *serial8250_setup_port(int index)
{
        struct uart_8250_port *up;

        /* @index is signed: reject negative values as well as too-large ones. */
        if (index < 0 || index >= UART_NR)
                return NULL;

        up = &serial8250_ports[index];
        up->port.line = index;
        up->port.port_id = index;

        serial8250_init_port(up);
        /* Remember the ops installed by serial8250_init_port() once. */
        if (!univ8250_port_base_ops)
                univ8250_port_base_ops = up->port.ops;
        up->port.ops = &univ8250_port_ops;

        timer_setup(&up->timer, serial8250_timeout, 0);

        up->ops = &univ8250_driver_ops;

        serial8250_set_defaults(up);

        return up;
}

/*
 * Register the first nr_uarts port slots with @drv, binding each one to
 * @dev. Slots of type PORT_8250_CIR and slots that already have a device
 * are left alone.
 */
void __init serial8250_register_ports(struct uart_driver *drv, struct device *dev)
{
        int i;

        for (i = 0; i < nr_uarts; i++) {
                struct uart_8250_port *up = &serial8250_ports[i];
                struct uart_port *port = &up->port;

                if (port->type == PORT_8250_CIR || port->dev)
                        continue;

                port->dev = dev;

                if (uart_console_registered(port))
                        pm_runtime_get_sync(port->dev);

                serial8250_apply_quirks(up);
                uart_add_one_port(drv, port);
        }
}

#ifdef CONFIG_SERIAL_8250_CONSOLE

static void univ8250_console_write(struct console *co, const char *s,
                                   unsigned int count)
{
        struct uart_8250_port *up = &serial8250_ports[co->index];

        serial8250_console_write(up, s, count);
}

static int univ8250_console_setup(struct console *co, char *options)
{
        struct uart_8250_port *up;
        struct uart_port *port;
        int retval, i;

        /*
         * Check whether an invalid uart number has been specified, and
         * if so, search for the first available port that does have
         * console support.
         */
        if (co->index < 0 || co->index >= UART_NR)
                co->index = 0;

        /*
         * If the console is past the initial isa ports, init more ports up to
         * co->index as needed and increment nr_uarts accordingly.
         */
        for (i = nr_uarts; i <= co->index; i++) {
                up = serial8250_setup_port(i);
                if (!up)
                        return -ENODEV;
                nr_uarts++;
        }

        port = &serial8250_ports[co->index].port;
        /* link port to console */
        uart_port_set_cons(port, co);

        retval = serial8250_console_setup(port, options, false);
        if (retval != 0)
                uart_port_set_cons(port, NULL);
        return retval;
}

/* Console exit hook: run the 8250 console exit for the port slot co->index. */
static int univ8250_console_exit(struct console *co)
{
        return serial8250_console_exit(&serial8250_ports[co->index].port);
}

/**
 *        univ8250_console_match - non-standard console matching
 *        @co:          registering console
 *        @name:          name from console command line
 *        @idx:          index from console command line
 *        @options: ptr to option string from console command line
 *
 *        Only attempts to match console command lines of the form:
 *            console=uart[8250],io|mmio|mmio16|mmio32,<addr>[,<options>]
 *            console=uart[8250],0x<addr>[,<options>]
 *        This form is used to register an initial earlycon boot console and
 *        replace it with the serial8250_console at 8250 driver init.
 *
 *        Performs console setup for a match (as required by interface)
 *        If no <options> are specified, then assume the h/w is already setup.
 *
 *        Returns 0 if console matches; otherwise non-zero to use default matching
 */
static int univ8250_console_match(struct console *co, char *name, int idx,
                                  char *options)
{
        enum uart_iotype iotype;
        resource_size_t addr;
        int i;

        /* "uart" is the 8250-specific earlycon name; compare its 4 characters. */
        if (strncmp(name, "uart", 4) != 0)
                return -ENODEV;

        if (uart_parse_earlycon(options, &iotype, &addr, &options))
                return -ENODEV;

        /* try to match the port specified on the command line */
        for (i = 0; i < nr_uarts; i++) {
                struct uart_port *port = &serial8250_ports[i].port;

                if (port->iotype != iotype)
                        continue;

                /* Memory-mapped types compare mapbase, port I/O compares iobase. */
                switch (iotype) {
                case UPIO_MEM:
                case UPIO_MEM16:
                case UPIO_MEM32:
                case UPIO_MEM32BE:
                        if (port->mapbase != addr)
                                continue;
                        break;
                case UPIO_PORT:
                        if (port->iobase != addr)
                                continue;
                        break;
                default:
                        break;
                }

                co->index = i;
                uart_port_set_cons(port, co);
                return serial8250_console_setup(port, options, true);
        }

        return -ENODEV;
}

/*
 * The "ttyS" console object. Its hooks are the univ8250_console_*()
 * functions above; .index starts at -1 and .data points at the uart driver.
 */
static struct console univ8250_console = {
        .name                = "ttyS",
        .write                = univ8250_console_write,
        .device                = uart_console_device,
        .setup                = univ8250_console_setup,
        .exit                = univ8250_console_exit,
        .match                = univ8250_console_match,
        .flags                = CON_PRINTBUFFER | CON_ANYTIME,
        .index                = -1,
        .data                = &serial8250_reg,
};

/*
 * Console initcall: with at least one uart configured, initialise the ISA
 * port slots and register the ttyS console. Returns -ENODEV when nr_uarts
 * is zero, 0 otherwise.
 */
static int __init univ8250_console_init(void)
{
        if (!nr_uarts)
                return -ENODEV;

        serial8250_isa_init_ports();
        register_console(&univ8250_console);

        return 0;
}
console_initcall(univ8250_console_init);

/* Console pointer used for serial8250_reg.cons; NULL without console support. */
#define SERIAL8250_CONSOLE        (&univ8250_console)
#else /* !CONFIG_SERIAL_8250_CONSOLE */
#define SERIAL8250_CONSOLE        NULL
#endif /* CONFIG_SERIAL_8250_CONSOLE */

/*
 * The uart driver object for the "ttyS" devices. .cons is the console
 * defined above when CONFIG_SERIAL_8250_CONSOLE is set, NULL otherwise.
 */
struct uart_driver serial8250_reg = {
        .owner                        = THIS_MODULE,
        .driver_name                = "serial",
        .dev_name                = "ttyS",
        .major                        = TTY_MAJOR,
        .minor                        = 64,
        .cons                        = SERIAL8250_CONSOLE,
};

/*
 * early_serial_setup - early registration for 8250 ports
 *
 * Setup an 8250 port structure prior to console initialisation.  Use
 * after console initialisation will cause undefined behaviour.
 */
/*
 * Returns -ENODEV if port->line is outside serial8250_ports[] or nr_uarts
 * is zero, 0 otherwise.
 */
int __init early_serial_setup(struct uart_port *port)
{
        struct uart_port *p;

        if (port->line >= ARRAY_SIZE(serial8250_ports) || nr_uarts == 0)
                return -ENODEV;

        serial8250_isa_init_ports();
        /* Copy the hardware description from the template into the slot. */
        p = &serial8250_ports[port->line].port;
        p->iobase       = port->iobase;
        p->membase      = port->membase;
        p->irq          = port->irq;
        p->irqflags     = port->irqflags;
        p->uartclk      = port->uartclk;
        p->fifosize     = port->fifosize;
        p->regshift     = port->regshift;
        p->iotype       = port->iotype;
        p->flags        = port->flags;
        p->mapbase      = port->mapbase;
        p->mapsize      = port->mapsize;
        p->private_data = port->private_data;
        p->type                = port->type;
        p->line                = port->line;

        serial8250_set_defaults(up_to_u8250p(p));

        /*
         * Callbacks supplied by the template are applied after
         * serial8250_set_defaults(), so they take precedence over whatever
         * it installed.
         */
        if (port->serial_in)
                p->serial_in = port->serial_in;
        if (port->serial_out)
                p->serial_out = port->serial_out;
        if (port->handle_irq)
                p->handle_irq = port->handle_irq;

        return 0;
}

/**
 *        serial8250_suspend_port - suspend one serial port
 *        @line:  serial line number
 *
 *        Suspend one serial port.
 */
void serial8250_suspend_port(int line)
{
        struct uart_8250_port *up = &serial8250_ports[line];
        struct uart_port *port = &up->port;

        if (!console_suspend_enabled && uart_console(port) &&
            port->type != PORT_8250) {
                unsigned char canary = 0xa5;

                serial_out(up, UART_SCR, canary);
                if (serial_in(up, UART_SCR) == canary)
                        up->canary = canary;
        }

        uart_suspend_port(&serial8250_reg, port);
}
EXPORT_SYMBOL(serial8250_suspend_port);

/**
 *        serial8250_resume_port - resume one serial port
 *        @line:  serial line number
 *
 *        Resume one serial port.
 */
void serial8250_resume_port(int line)
{
        struct uart_8250_port *up = &serial8250_ports[line];
        struct uart_port *port = &up->port;

        /* Clear the marker that serial8250_suspend_port() may have recorded. */
        up->canary = 0;

        if (up->capabilities & UART_NATSEMI) {
                /* Ensure it's still in high speed mode */
                serial_port_out(port, UART_LCR, 0xE0);

                ns16550a_goto_highspeed(up);

                serial_port_out(port, UART_LCR, 0);
                /* Record the clock rate that goes with high speed mode. */
                port->uartclk = 921600*16;
        }
        uart_resume_port(&serial8250_reg, port);
}
EXPORT_SYMBOL(serial8250_resume_port);

/*
 * serial8250_register_8250_port and serial8250_unregister_port allow for
 * 16x50 serial ports to be configured at run-time, to support PCMCIA
 * modems and PCI multiport cards.
 */
/* Held by serial8250_register_8250_port() and serial8250_unregister_port(). */
static DEFINE_MUTEX(serial_mutex);

static struct uart_8250_port *serial8250_find_match_or_unused(const struct uart_port *port)
{
        int i;

        /*
         * First, find a port entry which matches.
         */
        for (i = 0; i < nr_uarts; i++)
                if (uart_match_port(&serial8250_ports[i].port, port))
                        return &serial8250_ports[i];

        /* try line number first if still available */
        i = port->line;
        if (i < nr_uarts && serial8250_ports[i].port.type == PORT_UNKNOWN &&
                        serial8250_ports[i].port.iobase == 0)
                return &serial8250_ports[i];
        /*
         * We didn't find a matching entry, so look for the first
         * free entry.  We look for one which hasn't been previously
         * used (indicated by zero iobase).
         */
        for (i = 0; i < nr_uarts; i++)
                if (serial8250_ports[i].port.type == PORT_UNKNOWN &&
                    serial8250_ports[i].port.iobase == 0)
                        return &serial8250_ports[i];

        /*
         * That also failed.  Last resort is to find any entry which
         * doesn't have a real port associated with it.
         */
        for (i = 0; i < nr_uarts; i++)
                if (serial8250_ports[i].port.type == PORT_UNKNOWN)
                        return &serial8250_ports[i];

        return NULL;
}

/*
 * Delayed work handler: with the port lock held, set UART_IER_RLSI and
 * UART_IER_RDI in the cached IER and write it to the UART.
 */
static void serial_8250_overrun_backoff_work(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct uart_8250_port *up = container_of(dwork, struct uart_8250_port,
                                                 overrun_backoff);

        guard(uart_port_lock_irqsave)(&up->port);
        up->ier |= UART_IER_RLSI | UART_IER_RDI;
        serial_out(up, UART_IER, up->ier);
}

/**
 *        serial8250_register_8250_port - register a serial port
 *        @up: serial port template
 *
 *        Configure the serial port specified by the request. If the
 *        port exists and is in use, it is hung up and unregistered
 *        first.
 *
 *        The port is then probed and if necessary the IRQ is autodetected
 *        If this fails an error is returned.
 *
 *        On success the port is ready to use and the line number is returned.
 */
/*
 * Return values visible in this function: -EINVAL for a zero uartclk,
 * -ENOSPC when no slot can be found or set up, -ENODEV when the chosen slot
 * is a CIR port, an error from uart_get_rs485_mode(), mctrl_gpio_init() or
 * uart_add_one_port(), 0 for a port skipped as CIR after the type update,
 * otherwise the line number of the registered port.
 */
int serial8250_register_8250_port(const struct uart_8250_port *up)
{
        struct uart_8250_port *uart;
        int ret;

        if (up->port.uartclk == 0)
                return -EINVAL;

        /* Held until return; serial8250_unregister_port() takes the same mutex. */
        guard(mutex)(&serial_mutex);

        uart = serial8250_find_match_or_unused(&up->port);
        if (!uart) {
                /*
                 * If the port is past the initial isa ports, initialize a new
                 * port and increment nr_uarts accordingly.
                 */
                uart = serial8250_setup_port(nr_uarts);
                if (!uart)
                        return -ENOSPC;
                nr_uarts++;
        }

        /* Check if it is CIR already. We check this below again, see there why. */
        if (uart->port.type == PORT_8250_CIR)
                return -ENODEV;

        /* A slot that is currently bound to a device is unregistered first. */
        if (uart->port.dev)
                uart_remove_one_port(&serial8250_reg, &uart->port);

        /* Copy the description from the template @up into the slot. */
        uart->port.ctrl_id        = up->port.ctrl_id;
        uart->port.port_id        = up->port.port_id;
        uart->port.iobase       = up->port.iobase;
        uart->port.membase      = up->port.membase;
        uart->port.irq          = up->port.irq;
        uart->port.irqflags     = up->port.irqflags;
        uart->port.uartclk      = up->port.uartclk;
        uart->port.fifosize     = up->port.fifosize;
        uart->port.regshift     = up->port.regshift;
        uart->port.iotype       = up->port.iotype;
        uart->port.flags        = up->port.flags | UPF_BOOT_AUTOCONF;
        uart->bugs                = up->bugs;
        uart->port.mapbase      = up->port.mapbase;
        uart->port.mapsize      = up->port.mapsize;
        uart->port.private_data = up->port.private_data;
        uart->tx_loadsz                = up->tx_loadsz;
        uart->capabilities        = up->capabilities;
        uart->port.throttle        = up->port.throttle;
        uart->port.unthrottle        = up->port.unthrottle;
        uart->port.rs485_config        = up->port.rs485_config;
        uart->port.rs485_supported = up->port.rs485_supported;
        uart->port.rs485        = up->port.rs485;
        uart->rs485_start_tx        = up->rs485_start_tx;
        uart->rs485_stop_tx        = up->rs485_stop_tx;
        uart->lsr_save_mask        = up->lsr_save_mask;
        uart->dma                = up->dma;

        /* Take tx_loadsz from fifosize if it wasn't set separately */
        if (uart->port.fifosize && !uart->tx_loadsz)
                uart->tx_loadsz = uart->port.fifosize;

        if (up->port.dev) {
                uart->port.dev = up->port.dev;
                ret = uart_get_rs485_mode(&uart->port);
                if (ret)
                        goto err;
        }

        if (up->port.flags & UPF_FIXED_TYPE)
                uart->port.type = up->port.type;

        /*
         * Only call mctrl_gpio_init(), if the device has no ACPI
         * companion device
         */
        if (!has_acpi_companion(uart->port.dev)) {
                struct mctrl_gpios *gpios = mctrl_gpio_init(&uart->port, 0);
                if (IS_ERR(gpios)) {
                        ret = PTR_ERR(gpios);
                        goto err;
                } else {
                        uart->gpios = gpios;
                }
        }

        /*
         * Defaults go in first; the template callbacks below are applied
         * afterwards and only when the template actually provides them.
         */
        serial8250_set_defaults(uart);

        /* Possibly override default I/O functions.  */
        if (up->port.serial_in)
                uart->port.serial_in = up->port.serial_in;
        if (up->port.serial_out)
                uart->port.serial_out = up->port.serial_out;
        if (up->port.handle_irq)
                uart->port.handle_irq = up->port.handle_irq;
        /*  Possibly override set_termios call */
        if (up->port.set_termios)
                uart->port.set_termios = up->port.set_termios;
        if (up->port.set_ldisc)
                uart->port.set_ldisc = up->port.set_ldisc;
        if (up->port.get_mctrl)
                uart->port.get_mctrl = up->port.get_mctrl;
        if (up->port.set_mctrl)
                uart->port.set_mctrl = up->port.set_mctrl;
        if (up->port.get_divisor)
                uart->port.get_divisor = up->port.get_divisor;
        if (up->port.set_divisor)
                uart->port.set_divisor = up->port.set_divisor;
        if (up->port.startup)
                uart->port.startup = up->port.startup;
        if (up->port.shutdown)
                uart->port.shutdown = up->port.shutdown;
        if (up->port.pm)
                uart->port.pm = up->port.pm;
        if (up->port.handle_break)
                uart->port.handle_break = up->port.handle_break;
        if (up->dl_read)
                uart->dl_read = up->dl_read;
        if (up->dl_write)
                uart->dl_write = up->dl_write;

        /* Check the type (again)! It might have changed by the port.type assignment above. */
        if (uart->port.type != PORT_8250_CIR) {
                if (uart_console_registered(&uart->port))
                        pm_runtime_get_sync(uart->port.dev);

                if (serial8250_isa_config != NULL)
                        serial8250_isa_config(0, &uart->port,
                                        &uart->capabilities);

                serial8250_apply_quirks(uart);
                ret = uart_add_one_port(&serial8250_reg,
                                        &uart->port);
                if (ret)
                        goto err;

                /* Success: report the line number to the caller. */
                ret = uart->port.line;
        } else {
                dev_info(uart->port.dev,
                        "skipping CIR port at 0x%lx / 0x%llx, IRQ %d\n",
                        uart->port.iobase,
                        (unsigned long long)uart->port.mapbase,
                        uart->port.irq);

                ret = 0;
        }

        if (!uart->lsr_save_mask)
                uart->lsr_save_mask = LSR_SAVE_FLAGS;        /* Use default LSR mask */

        /* Initialise interrupt backoff work if required */
        if (up->overrun_backoff_time_ms > 0) {
                uart->overrun_backoff_time_ms =
                        up->overrun_backoff_time_ms;
                INIT_DELAYED_WORK(&uart->overrun_backoff,
                                serial_8250_overrun_backoff_work);
        } else {
                uart->overrun_backoff_time_ms = 0;
        }

        return ret;

err:
        /* Failure: leave the slot without a device. */
        uart->port.dev = NULL;
        return ret;
}
EXPORT_SYMBOL(serial8250_register_8250_port);

/**
 *        serial8250_unregister_port - remove a 16x50 serial port at runtime
 *        @line: serial line number
 *
 *        Remove one serial port.  This may not be called from interrupt
 *        context.  We hand the port back to our control.
 */
void serial8250_unregister_port(int line)
{
        struct uart_8250_port *uart = &serial8250_ports[line];

        guard(mutex)(&serial_mutex);

        if (uart->em485) {
                guard(uart_port_lock_irqsave)(&uart->port);
                serial8250_em485_destroy(uart);
        }

        uart_remove_one_port(&serial8250_reg, &uart->port);
        if (serial8250_isa_devs) {
                uart->port.flags &= ~UPF_BOOT_AUTOCONF;
                uart->port.type = PORT_UNKNOWN;
                uart->port.dev = &serial8250_isa_devs->dev;
                uart->port.port_id = line;
                uart->capabilities = 0;
                serial8250_init_port(uart);
                serial8250_apply_quirks(uart);
                uart_add_one_port(&serial8250_reg, &uart->port);
        } else {
                uart->port.dev = NULL;
        }
}
EXPORT_SYMBOL(serial8250_unregister_port);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Generic 8250/16x50 serial driver");




































































    9 
    9 




    8 
    8 








    9 



    8 

    8 


    1 

    9 


























































































































































































































































































    9 




    9 



























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/notifier.h>

/*
 *        Notifier chain core routines.  The exported routines below
 *        are layered on top of these, with appropriate locking added.
 */

/*
 * Link @n into the chain whose head pointer is @nl.
 *
 * The chain is walked until an entry with a strictly lower priority than
 * @n is found (or the end is reached) and @n is inserted in front of it.
 *
 * Returns 0 on success, -EEXIST (with a warning) if @n is already on the
 * chain, or -EBUSY if @unique_priority is set and an entry with the same
 * priority is encountered.
 */
static int notifier_chain_register(struct notifier_block **nl,
                                   struct notifier_block *n,
                                   bool unique_priority)
{
        struct notifier_block *cur;

        for (; (cur = *nl) != NULL; nl = &cur->next) {
                if (unlikely(cur == n)) {
                        WARN(1, "notifier callback %ps already registered",
                             n->notifier_call);
                        return -EEXIST;
                }

                /* Found the first entry that must come after @n. */
                if (n->priority > cur->priority)
                        break;

                if (unique_priority && n->priority == cur->priority)
                        return -EBUSY;
        }

        /* Set up @n's successor before publishing @n itself. */
        n->next = *nl;
        rcu_assign_pointer(*nl, n);
        trace_notifier_register((void *)n->notifier_call);

        return 0;
}

/*
 * Unlink @n from the chain whose head pointer is @nl.
 *
 * Returns 0 if @n was found and removed, -ENOENT if it is not on the chain.
 */
static int notifier_chain_unregister(struct notifier_block **nl,
                struct notifier_block *n)
{
        for (; (*nl) != NULL; nl = &((*nl)->next)) {
                if ((*nl) != n)
                        continue;

                /* Bypass @n by pointing its predecessor at its successor. */
                rcu_assign_pointer(*nl, n->next);
                trace_notifier_unregister((void *)n->notifier_call);
                return 0;
        }

        return -ENOENT;
}

/**
 * notifier_call_chain - Informs the registered notifiers about an event.
 *        @nl:                Pointer to head of the notifier chain
 *        @val:                Value passed unmodified to notifier function
 *        @v:                Pointer passed unmodified to notifier function
 *        @nr_to_call:        Number of notifier functions to be called. Don't care
 *                        value of this parameter is -1.
 *        @nr_calls:        Records the number of notifications sent. Don't care
 *                        value of this field is NULL.
 *        Return:                notifier_call_chain returns the value returned by the
 *                        last notifier function called, or %NOTIFY_DONE if no
 *                        notifier function was called.
 */
static int notifier_call_chain(struct notifier_block **nl,
                               unsigned long val, void *v,
                               int nr_to_call, int *nr_calls)
{
        int ret = NOTIFY_DONE;
        struct notifier_block *nb, *next_nb;

        nb = rcu_dereference_raw(*nl);

        while (nb && nr_to_call) {
                /* Fetch the successor before the callback is invoked. */
                next_nb = rcu_dereference_raw(nb->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
                /*
                 * A skipped entry neither consumes @nr_to_call nor is
                 * counted in @nr_calls.
                 */
                if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
                        WARN(1, "Invalid notifier called!");
                        nb = next_nb;
                        continue;
                }
#endif
                trace_notifier_run((void *)nb->notifier_call);
                ret = nb->notifier_call(nb, val, v);

                /* The callback that stops the chain is counted as well. */
                if (nr_calls)
                        (*nr_calls)++;

                if (ret & NOTIFY_STOP_MASK)
                        break;
                nb = next_nb;
                nr_to_call--;
        }
        return ret;
}
NOKPROBE_SYMBOL(notifier_call_chain);

/**
 * notifier_call_chain_robust - Inform the registered notifiers about an event
 *                              and rollback on error.
 * @nl:                Pointer to head of the blocking notifier chain
 * @val_up:        Value passed unmodified to the notifier function
 * @val_down:        Value passed unmodified to the notifier function when recovering
 *              from an error on @val_up
 * @v:                Pointer passed unmodified to the notifier function
 *
 * The chain is first run with @val_up.  If that run is stopped, the
 * callbacks that ran before the stopping one are run again with @val_down;
 * the stopping callback itself is not.
 *
 * NOTE:        It is important the @nl chain doesn't change between the two
 *                invocations of notifier_call_chain() such that we visit the
 *                exact same notifier callbacks; this rules out any RCU usage.
 *
 * Return:        the return value of the @val_up call.
 */
static int notifier_call_chain_robust(struct notifier_block **nl,
                                     unsigned long val_up, unsigned long val_down,
                                     void *v)
{
        int nr_called = 0;
        int ret;

        ret = notifier_call_chain(nl, val_up, v, -1, &nr_called);
        if (!(ret & NOTIFY_STOP_MASK))
                return ret;

        /* Roll back every callback except the one that stopped the chain. */
        notifier_call_chain(nl, val_down, v, nr_called - 1, NULL);

        return ret;
}

/*
 *        Atomic notifier chain routines.  Registration and unregistration
 *        use a spinlock, and call_chain is synchronized by RCU (no locks).
 */

/**
 *        atomic_notifier_chain_register - Add notifier to an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: New entry in notifier chain
 *
 *        Inserts @n into the chain while holding the head's spinlock with
 *        interrupts disabled.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
                struct notifier_block *n)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&nh->lock, irqflags);
        err = notifier_chain_register(&nh->head, n, false);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);

/**
 *        atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: New entry in notifier chain
 *
 *        Inserts @n into the chain, under the head's spinlock with interrupts
 *        disabled, unless another notifier with the same priority is
 *        encountered while searching for the insertion point.
 *
 *        Returns 0 on success, %-EEXIST or %-EBUSY on error.
 */
int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
                                               struct notifier_block *n)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&nh->lock, irqflags);
        err = notifier_chain_register(&nh->head, n, true);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);

/**
 *        atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Unlinks @n under the head's spinlock and then calls synchronize_rcu()
 *        before returning, whether or not @n was found.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
                struct notifier_block *n)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&nh->lock, irqflags);
        err = notifier_chain_unregister(&nh->head, n);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        /* Done outside the spinlock, after the chain has been updated. */
        synchronize_rcu();

        return err;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);

/**
 *        atomic_notifier_call_chain - Call functions in an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Calls each function in a notifier chain in turn.  The functions
 *        run in an atomic context, so they must not block.
 *        The chain is walked inside an RCU read-side critical section.
 *
 *        If the return value of a notifier has a bit of
 *        %NOTIFY_STOP_MASK set, the walk stops there and that
 *        notifier's return value is returned.
 *        Otherwise the return value is the return value
 *        of the last notifier function called.
 */
int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                               unsigned long val, void *v)
{
        int result;

        rcu_read_lock();
        result = notifier_call_chain(&nh->head, val, v, -1, NULL);
        rcu_read_unlock();

        return result;
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);

/**
 *        atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
 *        @nh: Pointer to head of the atomic notifier chain
 *
 *        Checks whether notifier chain is empty.
 *
 *        Returns true is notifier chain is empty, false otherwise.
 */
bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
{
        return !rcu_access_pointer(nh->head);
}

/*
 *        Blocking notifier chain routines.  All access to the chain is
 *        synchronized by an rwsem.
 */

/*
 * Common helper for the blocking register variants: insert @n into @nh,
 * taking the head's rwsem for writing unless the system is still booting.
 */
static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                                              struct notifier_block *n,
                                              bool unique_priority)
{
        int err;

        if (likely(system_state != SYSTEM_BOOTING)) {
                down_write(&nh->rwsem);
                err = notifier_chain_register(&nh->head, n, unique_priority);
                up_write(&nh->rwsem);
        } else {
                /*
                 * Early boot: task switching does not work yet and
                 * interrupts must stay disabled, so down_write() must
                 * not be called.
                 */
                err = notifier_chain_register(&nh->head, n, unique_priority);
        }

        return err;
}

/**
 *        blocking_notifier_chain_register - Add notifier to a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to a blocking notifier chain.
 *        Must be called in process context.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        return __blocking_notifier_chain_register(nh, n, false);
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);

/**
 *        blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to an blocking notifier chain if there is no other
 *        notifier registered using the same priority.
 *
 *        Returns 0 on success, %-EEXIST or %-EBUSY on error.
 */
int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
                                                 struct notifier_block *n)
{
        return __blocking_notifier_chain_register(nh, n, true);
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);

/**
 *        blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Removes a notifier from a blocking notifier chain, taking the
 *        head's rwsem for writing unless the system is still booting.
 *        Must be called from process context.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        if (likely(system_state != SYSTEM_BOOTING)) {
                down_write(&nh->rwsem);
                err = notifier_chain_unregister(&nh->head, n);
                up_write(&nh->rwsem);
        } else {
                /*
                 * Early boot: task switching does not work yet and
                 * interrupts must stay disabled, so down_write() must
                 * not be called.
                 */
                err = notifier_chain_unregister(&nh->head, n);
        }

        return err;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);

/*
 * Run a blocking notifier chain with @val_up and, if a callback stops the
 * chain, re-run the callbacks that preceded it with @val_down.  Returns
 * NOTIFY_DONE without taking the rwsem when the chain looks empty,
 * otherwise the result of the @val_up run.
 */
int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v)
{
        int result;

        /*
         * Unlocked peek at the head.  A racy answer is harmless: when the
         * chain looks non-empty it is examined again under the lock.
         */
        if (!rcu_access_pointer(nh->head))
                return NOTIFY_DONE;

        down_read(&nh->rwsem);
        result = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
        up_read(&nh->rwsem);

        return result;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);

/**
 *        blocking_notifier_call_chain - Call functions in a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Calls each function in a notifier chain in turn.  The functions
 *        run in a process context, so they are allowed to block.
 *
 *        If the return value of a notifier has a bit of
 *        %NOTIFY_STOP_MASK set, the walk stops there and that
 *        notifier's return value is returned.
 *        Otherwise the return value is the return value
 *        of the last notifier function called, or %NOTIFY_DONE
 *        when the chain is empty.
 */
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v)
{
        int result;

        /*
         * Unlocked peek at the head.  A racy answer is harmless: when the
         * chain looks non-empty it is examined again under the lock.
         */
        if (!rcu_access_pointer(nh->head))
                return NOTIFY_DONE;

        down_read(&nh->rwsem);
        result = notifier_call_chain(&nh->head, val, v, -1, NULL);
        up_read(&nh->rwsem);

        return result;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);

/*
 *        Raw notifier chain routines.  There is no protection;
 *        the caller must provide it.  Use at your own risk!
 */

/**
 *        raw_notifier_chain_register - Add notifier to a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to a raw notifier chain.  No lock is taken
 *        here; all locking must be provided by the caller.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        int err = notifier_chain_register(&nh->head, n, false);

        return err;
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);

/**
 *        raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Removes a notifier from a raw notifier chain.  No lock is taken
 *        here; all locking must be provided by the caller.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        int err = notifier_chain_unregister(&nh->head, n);

        return err;
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);

/*
 * Run a raw notifier chain with @val_up and, if a callback stops the
 * chain, re-run the callbacks that preceded it with @val_down.  No lock
 * is taken here.  Returns the result of the @val_up run.
 */
int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v)
{
        int result = notifier_call_chain_robust(&nh->head, val_up, val_down, v);

        return result;
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);

/**
 *        raw_notifier_call_chain - Call functions in a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Calls each function in a notifier chain in turn.  The functions
 *        run in an undefined context.
 *        All locking must be provided by the caller.
 *
 *        If the return value of a notifier has a bit of
 *        %NOTIFY_STOP_MASK set, the walk stops there and that
 *        notifier's return value is returned.
 *        Otherwise the return value is the return value
 *        of the last notifier function called.
 */
int raw_notifier_call_chain(struct raw_notifier_head *nh,
                unsigned long val, void *v)
{
        int result = notifier_call_chain(&nh->head, val, v, -1, NULL);

        return result;
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);

/*
 *        SRCU notifier chain routines.    Registration and unregistration
 *        use a mutex, and call_chain is synchronized by SRCU (no locks).
 */

/**
 *        srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to an SRCU notifier chain, holding the head's
 *        mutex unless the system is still booting.
 *        Must be called in process context.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        if (likely(system_state != SYSTEM_BOOTING)) {
                mutex_lock(&nh->mutex);
                err = notifier_chain_register(&nh->head, n, false);
                mutex_unlock(&nh->mutex);
        } else {
                /*
                 * Early boot: task switching does not work yet and
                 * interrupts must stay disabled, so mutex_lock() must
                 * not be called.
                 */
                err = notifier_chain_register(&nh->head, n, false);
        }

        return err;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);

/**
 *        srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Removes a notifier from an SRCU notifier chain.  Outside of boot
 *        the removal is done under the head's mutex and followed by
 *        synchronize_srcu(); during boot neither happens.
 *        Must be called from process context.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        /*
         * Early boot: task switching does not work yet and interrupts
         * must stay disabled, so mutex_lock() must not be called.
         */
        if (unlikely(system_state == SYSTEM_BOOTING)) {
                err = notifier_chain_unregister(&nh->head, n);
                return err;
        }

        mutex_lock(&nh->mutex);
        err = notifier_chain_unregister(&nh->head, n);
        mutex_unlock(&nh->mutex);

        /* Done after the mutex has been dropped. */
        synchronize_srcu(&nh->srcu);

        return err;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);

/**
 *        srcu_notifier_call_chain - Call functions in an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Calls each function in a notifier chain in turn.  The functions
 *        run in a process context, so they are allowed to block.
 *        The chain is walked inside an SRCU read-side critical section.
 *
 *        If the return value of a notifier has a bit of
 *        %NOTIFY_STOP_MASK set, the walk stops there and that
 *        notifier's return value is returned.
 *        Otherwise the return value is the return value
 *        of the last notifier function called.
 */
int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                unsigned long val, void *v)
{
        int srcu_idx;
        int result;

        srcu_idx = srcu_read_lock(&nh->srcu);
        result = notifier_call_chain(&nh->head, val, v, -1, NULL);
        srcu_read_unlock(&nh->srcu, srcu_idx);

        return result;
}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);

/**
 *        srcu_init_notifier_head - Initialize an SRCU notifier head
 *        @nh: Pointer to head of the srcu notifier chain
 *
 *        Unlike other sorts of notifier heads, SRCU notifier heads require
 *        dynamic initialization.  Be sure to call this routine before
 *        calling any of the other SRCU notifier routines for this head.
 *
 *        Initializes the mutex and the SRCU state and empties the chain.
 *        A failure of init_srcu_struct() is treated as fatal (BUG()).
 *
 *        If an SRCU notifier head is deallocated, it must first be cleaned
 *        up by calling srcu_cleanup_notifier_head().  Otherwise the head's
 *        per-cpu data (used by the SRCU mechanism) will leak.
 */
void srcu_init_notifier_head(struct srcu_notifier_head *nh)
{
        int err;

        mutex_init(&nh->mutex);

        err = init_srcu_struct(&nh->srcu);
        if (err < 0)
                BUG();

        nh->head = NULL;
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);

/* Atomic notifier chain run by notify_die(). */
static ATOMIC_NOTIFIER_HEAD(die_chain);

/*
 * Bundle the arguments into a struct die_args and run the die chain with
 * @val as the event.  Emits an RCU lockdep warning if RCU is not watching.
 * Returns the result of atomic_notifier_call_chain().
 */
int notrace notify_die(enum die_val val, const char *str,
               struct pt_regs *regs, long err, int trap, int sig)
{
        struct die_args die_info = {
                .signr        = sig,
                .trapnr        = trap,
                .err        = err,
                .str        = str,
                .regs        = regs,
        };

        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                           "notify_die called but RCU thinks we're quiescent");

        return atomic_notifier_call_chain(&die_chain, val, &die_info);
}
NOKPROBE_SYMBOL(notify_die);

/*
 * Add @nb to the die notifier chain.
 * Returns the result of atomic_notifier_chain_register().
 */
int register_die_notifier(struct notifier_block *nb)
{
        int err = atomic_notifier_chain_register(&die_chain, nb);

        return err;
}
EXPORT_SYMBOL_GPL(register_die_notifier);

/*
 * Remove @nb from the die notifier chain.
 * Returns the result of atomic_notifier_chain_unregister().
 */
int unregister_die_notifier(struct notifier_block *nb)
{
        int err = atomic_notifier_chain_unregister(&die_chain, nb);

        return err;
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);











    1 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM percpu

#if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PERCPU_H

#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * percpu_alloc_percpu - trace event describing one percpu allocation.
 *
 * Records the call site (printed symbolically via %pS), the reserved and
 * is_atomic flags, the requested size and alignment, the base address and
 * offset, the resulting percpu pointer, the number of bytes allocated and
 * the GFP flags.  The GFP flags are stored as an unsigned long and decoded
 * with show_gfp_flags() when printed.
 * NOTE(review): presumably emitted on the successful allocation path; the
 * call sites are outside this header.
 */
TRACE_EVENT(percpu_alloc_percpu,

        TP_PROTO(unsigned long call_site,
                 bool reserved, bool is_atomic, size_t size,
                 size_t align, void *base_addr, int off,
                 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),

        TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off,
                ptr, bytes_alloc, gfp_flags),

        TP_STRUCT__entry(
                __field(        unsigned long,                call_site        )
                __field(        bool,                        reserved        )
                __field(        bool,                        is_atomic        )
                __field(        size_t,                        size                )
                __field(        size_t,                        align                )
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
                __field(        size_t,                        bytes_alloc        )
                __field(        unsigned long,                gfp_flags        )
        ),
        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
        ),

        TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s",
                  (void *)__entry->call_site,
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align,
                  __entry->base_addr, __entry->off, __entry->ptr,
                  __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags))
);

/*
 * percpu_free_percpu - trace event describing the release of a percpu
 * allocation.
 *
 * Records the base address, the offset and the percpu pointer.
 */
TRACE_EVENT(percpu_free_percpu,

        TP_PROTO(void *base_addr, int off, void __percpu *ptr),

        TP_ARGS(base_addr, off, ptr),

        TP_STRUCT__entry(
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
        ),

        TP_printk("base_addr=%p off=%d ptr=%p",
                __entry->base_addr, __entry->off, __entry->ptr)
);

/*
 * percpu_alloc_percpu_fail - trace event describing a percpu allocation
 * request that failed.
 *
 * Records only the request parameters: the reserved and is_atomic flags,
 * the size and the alignment.
 */
TRACE_EVENT(percpu_alloc_percpu_fail,

        TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),

        TP_ARGS(reserved, is_atomic, size, align),

        TP_STRUCT__entry(
                __field(        bool,        reserved        )
                __field(        bool,        is_atomic        )
                __field(        size_t,        size                )
                __field(        size_t, align                )
        ),

        TP_fast_assign(
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
        ),

        TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align)
);

/*
 * percpu_create_chunk - trace event describing the creation of a percpu
 * chunk.  Records the chunk's base address only.
 */
TRACE_EVENT(percpu_create_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *, base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

/*
 * percpu_destroy_chunk - trace event describing the destruction of a
 * percpu chunk.  Records the chunk's base address only.
 */
TRACE_EVENT(percpu_destroy_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *,        base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

#endif /* _TRACE_PERCPU_H */

#include <trace/define_trace.h>



















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_NULLS_H
#define _LINUX_LIST_NULLS_H

#include <linux/poison.h>
#include <linux/const.h>

/*
 * Special version of lists, where end of list is not a NULL pointer,
 * but a 'nulls' marker, which can have many different values.
 * (up to 2^31 different values guaranteed on all platforms)
 *
 * In the standard hlist, termination of a list is the NULL pointer.
 * In this special 'nulls' variant, we use the fact that objects stored in
 * a list are aligned on a word (4 or 8 bytes alignment).
 * We therefore use the least significant bit of 'ptr' :
 * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
 * Set to 0 : This is a pointer to some object (ptr)
 */

/*
 * Head of a 'nulls' hlist.  @first is either a pointer to the first node or,
 * for an empty list, a nulls marker (see INIT_HLIST_NULLS_HEAD()).
 */
struct hlist_nulls_head {
        struct hlist_nulls_node *first;
};

/*
 * Node linked into a 'nulls' hlist.
 * @next:  the following node, or the nulls marker that terminates the list.
 * @pprev: address of the pointer that points at this node, i.e. the previous
 *         node's @next or the head's @first (see hlist_nulls_add_head()).
 */
struct hlist_nulls_node {
        struct hlist_nulls_node *next, **pprev;
};
/*
 * NULLS_MARKER - encode @value as an end-of-list marker: bit 0 is set and the
 * value occupies the remaining bits.  The argument is parenthesized so that a
 * compound expression is converted to long as a whole before it is shifted.
 */
#define NULLS_MARKER(value) (1UL | (((long)(value)) << 1))
/* Make @ptr an empty list terminated by the marker built from @nulls. */
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
        ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
/* Static-initializer counterpart of INIT_HLIST_NULLS_HEAD(). */
#define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)}

/* Map a node pointer back to the @type object that embeds it as @member. */
#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)

/*
 * Like hlist_nulls_entry(), but yields NULL when @ptr is a nulls marker.
 * @ptr is evaluated exactly once.
 */
#define hlist_nulls_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
        })
/**
 * is_a_nulls - Test if a pointer is a 'nulls' end-of-list marker
 * @ptr: pointer to be tested
 *
 * Return: 1 when bit 0 of @ptr is set (nulls marker), 0 when it is clear
 * (pointer to an object).
 */
static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        return (int)(raw & 1UL);
}

/**
 * get_nulls_value - Decode the 'nulls' value stored in an end-of-chain marker
 * @ptr: end-of-chain marker
 *
 * Only meaningful when is_a_nulls(@ptr) is true: the marker bit is shifted
 * out and the remaining bits are returned.
 */
static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        return raw >> 1;
}

/**
 * hlist_nulls_unhashed - Has node been removed and reinitialized?
 * @h: Node to be checked
 *
 * Return: non-zero when @h->pprev is NULL.
 *
 * Note that not all removal functions leave a node in the unhashed state:
 * hlist_nulls_del(), for instance, poisons ->pprev instead of clearing it.
 */
static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
{
        return h->pprev == NULL;
}

/**
 * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
 * @h: Node to be checked
 *
 * Same test as hlist_nulls_unhashed(), but ->pprev is sampled with
 * READ_ONCE() so the function may be used locklessly.
 *
 * Note that not all removal functions leave a node in the unhashed state:
 * hlist_nulls_del(), for instance, poisons ->pprev instead of clearing it.
 */
static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
{
        struct hlist_nulls_node **link = READ_ONCE(h->pprev);

        return link == NULL;
}

/* Return non-zero when the list is empty, i.e. @h->first is a nulls marker. */
static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
{
        const struct hlist_nulls_node *first = READ_ONCE(h->first);

        return is_a_nulls(first);
}

/* Insert @n at the front of the list headed by @h. */
static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
                                        struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *old_first = h->first;

        /* Hook @n in front of the previous first entry (or the nulls marker). */
        n->next = old_first;
        WRITE_ONCE(n->pprev, &h->first);
        h->first = n;

        /* A nulls marker is not a node: there is no back-pointer to fix up. */
        if (is_a_nulls(old_first))
                return;

        WRITE_ONCE(old_first->pprev, &n->next);
}

/*
 * Unlink @n from its list: the pointer that referenced @n is redirected to
 * @n's successor.  @n's own ->next and ->pprev are left unchanged.
 */
static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
{
        struct hlist_nulls_node **link = n->pprev;
        struct hlist_nulls_node *successor = n->next;

        WRITE_ONCE(*link, successor);

        /* Only a real node has a back-pointer to update. */
        if (is_a_nulls(successor))
                return;

        WRITE_ONCE(successor->pprev, link);
}

/*
 * Unlink @n from its list and poison ->pprev with LIST_POISON2.  ->pprev is
 * poisoned rather than cleared, so the node is not left in the unhashed
 * state; ->next is not modified.
 */
static inline void hlist_nulls_del(struct hlist_nulls_node *n)
{
        __hlist_nulls_del(n);
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_nulls_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * The loop ends when @pos becomes a nulls marker.  @pos->next is read after
 * each pass through the body, so @pos must still be valid at that point.
 */
#define hlist_nulls_for_each_entry(tpos, pos, head, member)                       \
        for (pos = (head)->first;                                               \
             (!is_a_nulls(pos)) &&                                               \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * @pos is not initialized by the macro: iteration starts at the node it
 * already points to and ends when @pos becomes a nulls marker.
 */
#define hlist_nulls_for_each_entry_from(tpos, pos, member)        \
        for (; (!is_a_nulls(pos)) &&                                 \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

#endif

























































   61 































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 */
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H

#include <linux/bitops.h>
#include <linux/gfp_types.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/math.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <linux/local_lock.h>

/*
 * Keep unconverted code working: the radix tree type names are aliases for
 * the corresponding XArray types.
 */
#define radix_tree_root                xarray
#define radix_tree_node                xa_node

/*
 * Preload state, declared per CPU below as radix_tree_preloads.
 * @lock is the local lock released by radix_tree_preload_end().
 */
struct radix_tree_preload {
        local_lock_t lock;
        /* presumably the number of nodes chained on @nodes - TODO confirm */
        unsigned nr;
        /* nodes->parent points to next preallocated node */
        struct radix_tree_node *nodes;
};
DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads);

/*
 * Slot encoding: the two low-order bits of a slot select how the rest of
 * the slot is interpreted:
 *
 * 00 - data pointer
 * 10 - internal entry
 * x1 - value entry
 *
 * An internal entry may be a pointer to the next level of the tree, a
 * sibling entry, or a marker telling that the entry in this slot has moved
 * elsewhere in the tree and the lookup should be restarted.  NULL matches
 * the 'data pointer' pattern, but it means that the tree holds no entry for
 * this index (whatever level it is found at); storing NULL is therefore the
 * same as deleting the entry.
 */
#define RADIX_TREE_ENTRY_MASK                3UL
#define RADIX_TREE_INTERNAL_NODE        2UL

/* Return true when @ptr carries the 'internal entry' tag (low bits 10). */
static inline bool radix_tree_is_internal_node(void *ptr)
{
        unsigned long tag = (unsigned long)ptr & RADIX_TREE_ENTRY_MASK;

        return tag == RADIX_TREE_INTERNAL_NODE;
}

/*** radix-tree API starts here ***/

/* Chunk geometry taken from the XArray: size is 1 << shift, mask is size - 1. */
#define RADIX_TREE_MAP_SHIFT        XA_CHUNK_SHIFT
#define RADIX_TREE_MAP_SIZE        (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK        (RADIX_TREE_MAP_SIZE-1)

/* Tag limits are the XArray mark limits. */
#define RADIX_TREE_MAX_TAGS        XA_MAX_MARKS
#define RADIX_TREE_TAG_LONGS        XA_MARK_LONGS

/*
 * Number of bits in an index, and the number of RADIX_TREE_MAP_SHIFT-bit
 * steps needed to cover them (rounded up).
 */
#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
                                          RADIX_TREE_MAP_SHIFT))

/* The IDR tag is stored in the low bits of xa_flags */
#define ROOT_IS_IDR        ((__force gfp_t)4)
/* The top bits of xa_flags are used to store the root tags */
#define ROOT_TAG_SHIFT        (__GFP_BITS_SHIFT)

/* Initializers: all three forward to the XArray equivalents. */
#define RADIX_TREE_INIT(name, mask)        XARRAY_INIT(name, mask)

#define RADIX_TREE(name, mask) \
        struct radix_tree_root name = RADIX_TREE_INIT(name, mask)

#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)

/* Return true when @root has no head entry (xa_head is NULL). */
static inline bool radix_tree_empty(const struct radix_tree_root *root)
{
        return !root->xa_head;
}

/**
 * struct radix_tree_iter - radix tree iterator state
 *
 * @index:        index of current slot
 * @next_index:        one beyond the last index for this chunk
 * @tags:        bit-mask for tag-iterating
 * @node:        node that contains current slot
 *
 * This radix tree iterator works in terms of "chunks" of slots.  A chunk is a
 * subinterval of slots contained within one radix tree leaf node.  It is
 * described by a pointer to its first slot and a struct radix_tree_iter
 * which holds the chunk's position in the tree and its size.  For tagged
 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
 * radix tree tag.
 *
 * The chunk size is not stored separately: radix_tree_chunk_size() computes
 * it as @next_index - @index.
 */
struct radix_tree_iter {
        unsigned long        index;
        unsigned long        next_index;
        unsigned long        tags;
        struct radix_tree_node *node;
};

/**
 * Radix-tree synchronization
 *
 * The radix-tree API requires that users provide all synchronisation (with
 * specific exceptions, noted below).
 *
 * Synchronization of access to the data items being stored in the tree, and
 * management of their lifetimes must be completely managed by API users.
 *
 * For API usage, in general,
 * - any function _modifying_ the tree or tags (inserting or deleting
 *   items, setting or clearing tags) must exclude other modifications, and
 *   exclude any functions reading the tree.
 * - any function _reading_ the tree or tags (looking up items or tags,
 *   gang lookups) must exclude modifications to the tree, but may occur
 *   concurrently with other readers.
 *
 * The notable exceptions to this rule are the following functions:
 * __radix_tree_lookup
 * radix_tree_lookup
 * radix_tree_lookup_slot
 * radix_tree_tag_get
 * radix_tree_gang_lookup
 * radix_tree_gang_lookup_tag
 * radix_tree_gang_lookup_tag_slot
 * radix_tree_tagged
 *
 * The first 7 functions are able to be called locklessly, using RCU. The
 * caller must ensure calls to these functions are made within rcu_read_lock()
 * regions. Other readers (lock-free or otherwise) and modifications may be
 * running concurrently.
 *
 * It is still required that the caller manage the synchronization and lifetimes
 * of the items. So if RCU lock-free lookups are used, typically this would mean
 * that the items have their own locks, or are amenable to lock-free access; and
 * that the items are freed by RCU (or only freed after having been deleted from
 * the radix tree *and* a synchronize_rcu() grace period).
 *
 * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
 * access to data items when inserting into or looking up from the radix tree)
 *
 * Note that the value returned by radix_tree_tag_get() may not be relied upon
 * if only the RCU read lock is held.  Functions to set/clear tags and to
 * delete nodes running concurrently with it may affect its result such that
 * two consecutive reads in the same locked section may return different
 * values.  If reliability is required, modification functions must also be
 * excluded from concurrency.
 *
 * radix_tree_tagged is able to be called without locking or RCU.
 */

/**
 * radix_tree_deref_slot - dereference a slot
 * @slot: slot pointer, returned by radix_tree_lookup_slot
 *
 * Companion of radix_tree_lookup_slot().  The caller must keep the tree at
 * least read locked across both the slot lookup and this dereference; that
 * is not required when the write lock is held (ie. items cannot be
 * concurrently inserted).
 *
 * When only the read lock is held, the result must be checked with
 * radix_tree_deref_retry() before it is trusted.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot(void __rcu **slot)
{
        void *entry = rcu_dereference(*slot);

        return entry;
}

/**
 * radix_tree_deref_slot_protected - dereference a slot with tree lock held
 * @slot: slot pointer, returned by radix_tree_lookup_slot
 * @treelock: the tree lock; passed to lockdep_is_held() as the protection
 *            condition for the dereference
 *
 * Variant of radix_tree_deref_slot() for callers that do not hold the RCU
 * read lock but hold the tree lock to prevent parallel updates.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
                                                        spinlock_t *treelock)
{
        void *entry = rcu_dereference_protected(*slot,
                                                lockdep_is_held(treelock));

        return entry;
}

/**
 * radix_tree_deref_retry        - check radix_tree_deref_slot
 * @arg:        pointer returned by radix_tree_deref_slot
 *
 * To be used together with radix_tree_deref_slot(): a retry is needed when
 * the dereferenced value is tagged as an internal entry.
 *
 * Returns:        0 if retry is not required, otherwise retry is required
 */
static inline int radix_tree_deref_retry(void *arg)
{
        bool internal = radix_tree_is_internal_node(arg);

        return unlikely(internal);
}

/**
 * radix_tree_exception        - radix_tree_deref_slot returned either exception?
 * @arg:        value returned by radix_tree_deref_slot
 *
 * Tests the bits covered by RADIX_TREE_ENTRY_MASK.
 *
 * Returns:        0 if well-aligned pointer, non-0 if either kind of exception.
 */
static inline int radix_tree_exception(void *arg)
{
        unsigned long tag_bits = (unsigned long)arg & RADIX_TREE_ENTRY_MASK;

        return unlikely(tag_bits);
}

/*
 * Out-of-line radix tree API.  Only the declarations live in this header.
 * Which of these may be called under RCU alone is listed in the
 * "Radix-tree synchronization" comment above.
 */
int radix_tree_insert(struct radix_tree_root *, unsigned long index,
                        void *);
void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
                          struct radix_tree_node **nodep, void __rcu ***slotp);
void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
                                        unsigned long index);
void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
                          void __rcu **slot, void *entry);
void radix_tree_iter_replace(struct radix_tree_root *,
                const struct radix_tree_iter *, void __rcu **slot, void *entry);
void radix_tree_replace_slot(struct radix_tree_root *,
                             void __rcu **slot, void *entry);
void radix_tree_iter_delete(struct radix_tree_root *,
                        struct radix_tree_iter *iter, void __rcu **slot);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
                        void **results, unsigned long first_index,
                        unsigned int max_items);
/* Preloading; radix_tree_preload_end() below drops the per-CPU preload lock. */
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
void radix_tree_init(void);
/* Tag manipulation and tagged lookups. */
void *radix_tree_tag_set(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void *radix_tree_tag_clear(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
int radix_tree_tag_get(const struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void radix_tree_iter_tag_clear(struct radix_tree_root *,
                const struct radix_tree_iter *iter, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
                void **results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);

/*
 * Release the local lock embedded in the per-CPU radix_tree_preloads state.
 * NOTE(review): presumably pairs with a successful radix_tree_preload() or
 * radix_tree_maybe_preload(), which would take that lock - confirm in the
 * implementation.
 */
static inline void radix_tree_preload_end(void)
{
        local_unlock(&radix_tree_preloads.lock);
}

/*
 * Declared here, implemented out of line.
 * NOTE(review): judging by the name and arguments this looks up a free slot
 * for the IDR with indices bounded by @max - confirm against the
 * implementation before relying on it.
 */
void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max);

/*
 * Values for the @flags argument of radix_tree_next_chunk() and
 * radix_tree_next_slot(): a tag index combined with mode bits.
 */
enum {
        RADIX_TREE_ITER_TAG_MASK = 0x0f,        /* tag index in lower nybble */
        RADIX_TREE_ITER_TAGGED   = 0x10,        /* lookup tagged slots */
        RADIX_TREE_ITER_CONTIG   = 0x20,        /* stop at first hole */
};

/**
 * radix_tree_iter_init - initialize radix tree iterator
 *
 * @iter:        pointer to iterator state
 * @start:        iteration starting index
 * Returns:        NULL
 */
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{
        /*
         * ->tags is deliberately left uninitialized: radix_tree_next_chunk()
         * fills it in after a successful tagged chunk lookup, and nobody
         * looks at it after an unsuccessful or non-tagged one.
         */
        iter->next_index = start;

        /*
         * A zero index bypasses the next_index overflow protection; the
         * comment in radix_tree_next_chunk() has the details.
         */
        iter->index = 0;

        return NULL;
}

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to the chunk's first slot, or NULL if there are no
 *                more chunks left
 *
 * This function looks up the next chunk in the radix tree starting from
 * @iter->next_index.  It returns a pointer to the chunk's first slot.
 * It also fills @iter with data about the chunk: its position in the tree
 * (index), its end (next_index), and a bit mask for tagged iteration (tags).
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
                             struct radix_tree_iter *iter, unsigned flags);

/**
 * radix_tree_iter_lookup - look up an index in the radix tree
 * @root: radix tree root
 * @iter: iterator state
 * @index: key to look up
 *
 * If @index is present in the radix tree, this function returns the slot
 * containing it and updates @iter to describe the entry.  If @index is not
 * present, it returns NULL.
 */
static inline void __rcu **
radix_tree_iter_lookup(const struct radix_tree_root *root,
                        struct radix_tree_iter *iter, unsigned long index)
{
        /* Start the iterator at @index; the NULL it returns is not needed. */
        radix_tree_iter_init(iter, index);
        /*
         * CONTIG means "stop at first hole" - presumably what makes an absent
         * @index yield NULL rather than a later populated slot; confirm in
         * radix_tree_next_chunk().
         */
        return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);
}

/**
 * radix_tree_iter_retry - retry this chunk of the iteration
 * @iter:        iterator state
 *
 * When a tree is iterated under nothing but the RCU lock, a race with a
 * deletion or creation can expose a slot for which
 * radix_tree_deref_retry() returns true.  In that case call this function
 * and continue the iteration: the next chunk lookup restarts at the current
 * index and any pending tag bits are dropped.
 *
 * Returns NULL, to be used as the new slot pointer.
 */
static inline __must_check
void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
{
        iter->tags = 0;
        iter->next_index = iter->index;

        return NULL;
}

/* Return the index that lies @slots positions after @iter->index. */
static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
        unsigned long advanced = iter->index;

        advanced += slots;
        return advanced;
}

/**
 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
 * @slot: pointer to current slot
 * @iter: iterator state
 * Returns: New slot pointer
 *
 * If the iterator needs to release then reacquire a lock, the chunk may
 * have been invalidated by an insertion or deletion.  Call this function
 * before releasing the lock to continue the iteration from the next index.
 *
 * The declaration is __must_check: the returned pointer replaces @slot and
 * must not be ignored.
 */
void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter);

/**
 * radix_tree_chunk_size - get current chunk size
 *
 * @iter:        pointer to radix tree iterator
 * Returns:        current chunk size, i.e. @iter->next_index - @iter->index
 */
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
        unsigned long first = iter->index;
        unsigned long end = iter->next_index;

        return end - first;
}

/**
 * radix_tree_next_slot - find next slot in chunk
 *
 * @slot:        pointer to current slot
 * @iter:        pointer to iterator state
 * @flags:        RADIX_TREE_ITER_*, should be constant
 * Returns:        pointer to next slot, or NULL if there are no more left
 *
 * On a successful lookup @iter->index is advanced to the returned slot.
 * A tagged lookup also consumes bits of @iter->tags.
 *
 * 'slot' may legitimately be NULL on entry after radix_tree_iter_resume()
 * or radix_tree_iter_retry().  It is never dereferenced in those cases
 * because either:
 * a) we are doing tagged iteration and iter->tags has been set to 0, or
 * b) we are doing non-tagged iteration, and iter->index and iter->next_index
 *    have been set up so that radix_tree_chunk_size() returns 1 or 0.
 */
static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
                                struct radix_tree_iter *iter, unsigned flags)
{
        if (flags & RADIX_TREE_ITER_TAGGED) {
                /* Drop the bit of the slot we are leaving. */
                iter->tags >>= 1;
                if (unlikely(!iter->tags))
                        return NULL;

                /* Common case: the very next slot is tagged. */
                if (likely(iter->tags & 1ul)) {
                        iter->index = __radix_tree_iter_add(iter, 1);
                        return slot + 1;
                }

                /* Otherwise jump to the next set bit, unless holes end the walk. */
                if (!(flags & RADIX_TREE_ITER_CONTIG)) {
                        unsigned skip = __ffs(iter->tags);

                        iter->tags >>= skip;
                        skip++;
                        iter->index = __radix_tree_iter_add(iter, skip);
                        return slot + skip;
                }
        } else {
                long remaining = radix_tree_chunk_size(iter);

                for (remaining--; remaining > 0; remaining--) {
                        slot++;
                        iter->index = __radix_tree_iter_add(iter, 1);

                        if (likely(*slot))
                                return slot;
                        if (flags & RADIX_TREE_ITER_CONTIG) {
                                /* forbid switching to the next chunk */
                                iter->next_index = 0;
                                break;
                        }
                }
        }

        return NULL;
}

/**
 * radix_tree_for_each_slot - iterate over non-empty slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * Expands to a for loop.  radix_tree_iter_init() yields a NULL @slot, so the
 * first test already fetches a chunk with radix_tree_next_chunk(); whenever
 * radix_tree_next_slot() runs out of slots the next chunk is fetched, and the
 * loop ends once radix_tree_next_chunk() returns NULL.
 */
#define radix_tree_for_each_slot(slot, root, iter, start)                \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter, 0)) ;        \
             slot = radix_tree_next_slot(slot, iter, 0))

/**
 * radix_tree_for_each_tagged - iterate over tagged slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 * @tag:        tag index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * @tag is parenthesized in the expansion, so any expression (including one
 * built from operators that bind less tightly than '|') can be passed.
 */
#define radix_tree_for_each_tagged(slot, root, iter, start, tag)        \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter,                \
                              RADIX_TREE_ITER_TAGGED | (tag))) ;        \
             slot = radix_tree_next_slot(slot, iter,                        \
                                RADIX_TREE_ITER_TAGGED | (tag)))

#endif /* _LINUX_RADIX_TREE_H */





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  317 




















































  319 
  320 








  319 














  317 


  315 
  320 
  317 


  318 






























































































  316 
  315 




  318 



  315 
  320 






































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <linux/file_ref.h>
#include <net/sock.h>
#include <linux/init_task.h>

#include "internal.h"

/*
 * Out-of-line tail of __file_ref_put() for a counter value that is not
 * FILE_REF_NOREF.  Repairs the counter where needed and always reports
 * that the object must not be deconstructed.
 */
static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
{
        bool imbalanced;

        /*
         * A value at or beyond FILE_REF_RELEASED means the count had already
         * reached the dead zone, i.e. this put() has no matching get().
         * Complain once and park the counter at DEAD.
         */
        imbalanced = WARN_ONCE(cnt >= FILE_REF_RELEASED,
                               "imbalanced put on file reference count");
        if (imbalanced)
                atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
        /*
         * Otherwise a value above FILE_REF_MAXREF is a put() on a saturated
         * count: move it back to the mean saturation value.
         */
        else if (cnt > FILE_REF_MAXREF)
                atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);

        return false;
}

/**
 * __file_ref_put - Slowpath of file_ref_put()
 * @ref:        Pointer to the reference count
 * @cnt:        Counter value observed by the put() operation
 *
 * Called once the reference count has left the valid zone.
 *
 * Return:
 *        True when the last reference was dropped and the count could be
 *        marked dead, so no further reference can be obtained. The caller
 *        may then schedule the object guarded by the reference count for
 *        deconstruction.
 *
 *        False when references remain, when the put() raced with a
 *        concurrent get()/put() pair, or when @cnt is not a valid value.
 *        The caller must not deconstruct the protected object.
 */
bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
{
        /* Anything but "the last reference just went away" is a bad value. */
        if (unlikely(cnt != FILE_REF_NOREF))
                return __file_ref_put_badval(ref, cnt);

        /*
         * Try, exactly once, to move the count from NOREF to FILE_REF_DEAD.
         *
         * Losing the exchange is legitimate: either a concurrent get()
         * raised the count again, or the put() matching that get() has
         * already marked it dead. No retry is needed in either case, but
         * the caller must then leave the object alone.
         */
        if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
                return false;

        /*
         * The object may now be scheduled for deconstruction; give the
         * caller acquire ordering for that.
         */
        smp_acquire__after_ctrl_dep();
        return true;
}
EXPORT_SYMBOL_GPL(__file_ref_put);

/*
 * Ceiling on descriptor numbers: expand_files() refuses any fd that is
 * >= sysctl_nr_open and alloc_fdtable() caps new tables at this value.
 */
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
/*
 * NOTE(review): presumably the lower bound accepted when sysctl_nr_open is
 * changed - confirm against the sysctl table, which is not in this file.
 */
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
/*
 * The smaller of INT_MAX and the number of pointers that fit into a size_t
 * worth of bytes, with the low bits masked off so that the value is a
 * multiple of BITS_PER_LONG.
 */
unsigned int sysctl_nr_open_max =
        __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

/*
 * Release a separately allocated fdtable: the descriptor array, the block
 * that starts at ->open_fds, and finally the fdtable structure itself.
 */
static void __free_fdtable(struct fdtable *fdt)
{
        void *fd_array = fdt->fd;
        void *bitmaps = fdt->open_fds;

        kvfree(fd_array);
        kvfree(bitmaps);
        kfree(fdt);
}

/* RCU callback: free the fdtable that embeds @rcu. */
static void free_fdtable_rcu(struct rcu_head *rcu)
{
        struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);

        __free_fdtable(fdt);
}

/*
 * Sizing of the second-level ->full_fds_bits bitmap, which carries one bit
 * per word of ->open_fds: BITBIT_NR() is its length in longs for a table of
 * nr slots, BITBIT_SIZE() is that length in bytes.
 */
#define BITBIT_NR(nr)        BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)        (BITBIT_NR(nr) * sizeof(long))

#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
 * Copy the first 'copy_words' words worth of fd bits from the old table's
 * bitmaps into the new table's bitmaps and clear whatever space is left in
 * the new ones.  The file pointers themselves are not copied.  Called with
 * the files spinlock held for write.
 */
static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
                            unsigned int copy_words)
{
        unsigned int total_words = fdt_words(nfdt);
        unsigned int copy_bits = copy_words * BITS_PER_LONG;
        unsigned int total_bits = total_words * BITS_PER_LONG;

        /* first-level bitmaps: one bit per descriptor */
        bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
                        copy_bits, total_bits);
        bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
                        copy_bits, total_bits);

        /* second-level bitmap: one bit per word of ->open_fds */
        bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
                        copy_words, total_words);
}

/*
 * Populate the new, larger table from the old one: every file pointer and
 * all bitmap contents are carried over and the additional space is zeroed.
 * Called with the files spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
        size_t copied_bytes, zeroed_bytes;

        /* the new table must never be smaller than the one it replaces */
        BUG_ON(nfdt->max_fds < ofdt->max_fds);

        copied_bytes = ofdt->max_fds * sizeof(struct file *);
        zeroed_bytes = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);

        memcpy(nfdt->fd, ofdt->fd, copied_bytes);
        memset((char *)nfdt->fd + copied_bytes, 0, zeroed_bytes);

        copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}

/*
 * Allocate a new fdtable with room for at least @slots_wanted descriptors.
 *
 * Returns the new table on success, ERR_PTR(-EMFILE) if a table of that size
 * does not fit under sysctl_nr_open or would need an fd array of more than
 * INT_MAX bytes, and ERR_PTR(-ENOMEM) if one of the allocations fails.
 * Neither the fd array nor the bitmaps are initialized here; that is left to
 * the callers.
 *
 * Note how the fdtable bitmap allocations very much have to be a multiple of
 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 * 'unsigned long' in some places, but simply because that is how the Linux
 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 * they are very much "bits in an array of unsigned long".
 */
static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
        struct fdtable *fdt;
        unsigned int nr;
        void *data;

        /*
         * Figure out how many fds we actually want to support in this fdtable.
         * Allocation steps are keyed to the size of the fdarray, since it
         * grows far faster than any of the other dynamic data. We try to fit
         * the fdarray into comfortable page-tuned chunks: starting at 1024B
         * and growing in powers of two from there on.  Since we are called
         * only with slots_wanted > BITS_PER_LONG (embedded instance in
         * files->fdtab already gives BITS_PER_LONG slots), the above boils
         * down to
         * 1.  use the smallest power of two large enough to give us that many
         * slots.
         * 2.  on 32bit skip 64 and 128 - the minimal capacity we want there is
         * 256 slots (i.e. 1Kb fd array).
         * 3.  on 64bit don't skip anything, 1Kb fd array means 128 slots there
         * and we are never going to be asked for 64 or less.
         */
        if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
                nr = 256;
        else
                nr = roundup_pow_of_two(slots_wanted);
        /*
         * Note that this can drive nr *below* what we had passed if sysctl_nr_open
         * had been set lower between the check in expand_files() and here.
         *
         * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
         * bitmaps handling below becomes unpleasant, to put it mildly...
         */
        if (unlikely(nr > sysctl_nr_open)) {
                nr = round_down(sysctl_nr_open, BITS_PER_LONG);
                if (nr < slots_wanted)
                        return ERR_PTR(-EMFILE);
        }

        /*
         * Check if the allocation size would exceed INT_MAX. kvmalloc_array()
         * and kvmalloc() will warn if the allocation size is greater than
         * INT_MAX, as filp_cachep objects are not __GFP_NOWARN.
         *
         * This can happen when sysctl_nr_open is set to a very high value and
         * a process tries to use a file descriptor near that limit. For example,
         * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what
         * systemd typically sets it to - then trying to use a file descriptor
         * close to that value will require allocating a file descriptor table
         * that exceeds 8GB in size.
         */
        if (unlikely(nr > INT_MAX / sizeof(struct file *)))
                return ERR_PTR(-EMFILE);

        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
        if (!fdt)
                goto out;
        fdt->max_fds = nr;
        /* the array of file pointers, one slot per descriptor */
        data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_fdt;
        fdt->fd = data;

        /*
         * A single allocation holds all three bitmaps back to back:
         * ->open_fds and ->close_on_exec take nr / BITS_PER_BYTE bytes each
         * and ->full_fds_bits takes BITBIT_SIZE(nr) bytes.  The allocation is
         * never smaller than L1_CACHE_BYTES.
         */
        data = kvmalloc(max_t(size_t,
                                 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
                                 GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_arr;
        fdt->open_fds = data;
        data += nr / BITS_PER_BYTE;
        fdt->close_on_exec = data;
        data += nr / BITS_PER_BYTE;
        fdt->full_fds_bits = data;

        return fdt;

        /* unwind in reverse order of allocation */
out_arr:
        kvfree(fdt->fd);
out_fdt:
        kfree(fdt);
out:
        return ERR_PTR(-ENOMEM);
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable, with both fd array and fdset
 * large enough to hold descriptor @nr, copy the current table into it and
 * publish it in files->fdt.
 * Return <0 error code on error; 0 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit;
 * it is dropped while the new table is allocated.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *new_fdt, *cur_fdt;

        /* the allocation below is done without the spinlock held */
        spin_unlock(&files->file_lock);
        /* nr is a descriptor number, so nr + 1 slots are needed to hold it */
        new_fdt = alloc_fdtable(nr + 1);

        /* make sure all fd_install() have seen resize_in_progress
         * or have finished their rcu_read_lock_sched() section.
         * Skipped when files->count shows a single reference.
         */
        if (atomic_read(&files->count) > 1)
                synchronize_rcu();

        /* the lock is retaken before the error check: callers expect it held */
        spin_lock(&files->file_lock);
        if (IS_ERR(new_fdt))
                return PTR_ERR(new_fdt);
        cur_fdt = files_fdtable(files);
        BUG_ON(nr < cur_fdt->max_fds);
        copy_fdtable(new_fdt, cur_fdt);
        rcu_assign_pointer(files->fdt, new_fdt);
        /* the table embedded in files_struct is never freed on its own */
        if (cur_fdt != &files->fdtab)
                call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
        /* coupled with smp_rmb() in fd_install() */
        smp_wmb();
        return 0;
}

/*
 * Expand files.
 * Grow the descriptor table of @files so that descriptor @nr fits, provided
 * it does not fit already and growing is still permitted.
 * Return <0 error code on error; 0 on success.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        int error;

        for (;;) {
                struct fdtable *fdt = files_fdtable(files);

                /* Nothing to do if the table is large enough already. */
                if (nr < fdt->max_fds)
                        return 0;

                if (likely(!files->resize_in_progress))
                        break;

                /*
                 * A resize is already under way: wait for it to finish
                 * without holding the lock, then look at the table again.
                 */
                spin_unlock(&files->file_lock);
                wait_event(files->resize_wait, !files->resize_in_progress);
                spin_lock(&files->file_lock);
        }

        /* Is there any room left to grow into? */
        if (unlikely(nr >= sysctl_nr_open))
                return -EMFILE;

        /* Do the resize and let any waiters have another look. */
        files->resize_in_progress = true;
        error = expand_fdtable(files, nr);
        files->resize_in_progress = false;

        wake_up_all(&files->resize_wait);
        return error;
}

/*
 * Make the close-on-exec bit of @fd match @set.  The bit is cleared only
 * when it is currently set.
 */
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
                                       bool set)
{
        if (set)
                __set_bit(fd, fdt->close_on_exec);
        else if (test_bit(fd, fdt->close_on_exec))
                __clear_bit(fd, fdt->close_on_exec);
}

/*
 * Mark @fd as in use, set its close-on-exec state to @set and, if that
 * filled up the ->open_fds word containing @fd, flag the word as full in
 * ->full_fds_bits.
 */
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
{
        unsigned int word = fd / BITS_PER_LONG;

        __set_bit(fd, fdt->open_fds);
        __set_close_on_exec(fd, fdt, set);

        if (fdt->open_fds[word] == ~0UL)
                __set_bit(word, fdt->full_fds_bits);
}

/*
 * Mark @fd as free.  The ->open_fds word containing it can no longer be
 * full, so drop its bit from ->full_fds_bits if that bit was set.
 */
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
        unsigned int word = fd / BITS_PER_LONG;

        __clear_bit(fd, fdt->open_fds);

        if (test_bit(word, fdt->full_fds_bits))
                __clear_bit(word, fdt->full_fds_bits);
}

/* Report whether @fd is marked as in use in the ->open_fds bitmap of @fdt. */
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
        const unsigned long *open_map = fdt->open_fds;

        return test_bit(fd, open_map);
}

/*
 * Work out how many slots a copy of @fdt needs: enough for the highest
 * open descriptor, rounded up to a multiple of BITS_PER_LONG (our bitmaps
 * are sized in longs, so a sane size is always such a multiple), or
 * NR_OPEN_DEFAULT when no descriptor needs copying.
 *
 * punch_hole is optional - it is the range close_range() has been asked to
 * unshare and close.  Descriptors in it need not be copied, so if the
 * highest open descriptor lies inside the range, only descriptors below
 * the start of the range determine the size.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
{
        unsigned int limit = fdt->max_fds;
        unsigned int last = find_last_bit(fdt->open_fds, limit);

        /* nothing open at all */
        if (last == limit)
                return NR_OPEN_DEFAULT;

        if (punch_hole && punch_hole->from <= last && last <= punch_hole->to) {
                unsigned int hole_start = punch_hole->from;

                last = find_last_bit(fdt->open_fds, hole_start);
                /* nothing open below the hole either */
                if (last == hole_start)
                        return NR_OPEN_DEFAULT;
        }

        return ALIGN(last + 1, BITS_PER_LONG);
}

/*
 * Allocate a new descriptor table and copy contents from the passed in
 * instance.  Returns a pointer to cloned table on success, ERR_PTR()
 * on failure.  For 'punch_hole' see sane_fdtable_size().
 *
 * @oldf:       table to clone; its ->file_lock is taken here and released
 *              again before returning
 * @punch_hole: optional range of descriptors that need not be copied, or NULL
 *
 * Failure is ERR_PTR(-ENOMEM) if the files_struct cannot be allocated, or
 * whatever error alloc_fdtable() reported.  Every file that is copied gets
 * an extra reference via get_file().
 */
struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
{
        struct files_struct *newf;
        struct file **old_fds, **new_fds;
        unsigned int open_files, i;
        struct fdtable *old_fdt, *new_fdt;

        newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
        if (!newf)
                return ERR_PTR(-ENOMEM);

        atomic_set(&newf->count, 1);

        spin_lock_init(&newf->file_lock);
        newf->resize_in_progress = false;
        init_waitqueue_head(&newf->resize_wait);
        newf->next_fd = 0;
        /* start out with the fdtable embedded in the new files_struct */
        new_fdt = &newf->fdtab;
        new_fdt->max_fds = NR_OPEN_DEFAULT;
        new_fdt->close_on_exec = newf->close_on_exec_init;
        new_fdt->open_fds = newf->open_fds_init;
        new_fdt->full_fds_bits = newf->full_fds_bits_init;
        new_fdt->fd = &newf->fd_array[0];

        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = sane_fdtable_size(old_fdt, punch_hole);

        /*
         * Check whether we need to allocate a larger fd array and fd set.
         * This is a loop because the lock is dropped for the allocation and
         * the required size is recomputed once the lock has been retaken.
         */
        while (unlikely(open_files > new_fdt->max_fds)) {
                spin_unlock(&oldf->file_lock);

                /* discard a table allocated by a previous iteration */
                if (new_fdt != &newf->fdtab)
                        __free_fdtable(new_fdt);

                new_fdt = alloc_fdtable(open_files);
                if (IS_ERR(new_fdt)) {
                        kmem_cache_free(files_cachep, newf);
                        return ERR_CAST(new_fdt);
                }

                /*
                 * Reacquire the oldf lock and a pointer to its fd table
                 * who knows it may have a new bigger fd table. We need
                 * the latest pointer.
                 */
                spin_lock(&oldf->file_lock);
                old_fdt = files_fdtable(oldf);
                open_files = sane_fdtable_size(old_fdt, punch_hole);
        }

        /* open_files is a multiple of BITS_PER_LONG, see sane_fdtable_size() */
        copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);

        old_fds = old_fdt->fd;
        new_fds = new_fdt->fd;

        /*
         * We may be racing against fd allocation from other threads using this
         * files_struct, despite holding ->file_lock.
         *
         * alloc_fd() might have already claimed a slot, while fd_install()
         * did not populate it yet. Note the latter operates locklessly, so
         * the file can show up as we are walking the array below.
         *
         * At the same time we know no files will disappear as all other
         * operations take the lock.
         *
         * Instead of trying to placate userspace racing with itself, we
         * ref the file if we see it and mark the fd slot as unused otherwise.
         */
        for (i = open_files; i != 0; i--) {
                struct file *f = rcu_dereference_raw(*old_fds++);
                if (f) {
                        get_file(f);
                } else {
                        /* i counts down, so open_files - i is the current fd */
                        __clear_open_fd(open_files - i, new_fdt);
                }
                rcu_assign_pointer(*new_fds++, f);
        }
        spin_unlock(&oldf->file_lock);

        /* clear the remainder */
        memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

        rcu_assign_pointer(newf->fdt, new_fdt);

        return newf;
}

/*
 * Close every file still installed in @files, walking ->open_fds one word
 * at a time, and hand back the fdtable that was walked so that the caller
 * can dispose of it.
 */
static struct fdtable *close_files(struct files_struct * files)
{
        /*
         * No RCU and no ->file_lock are needed to look at the fd table:
         * we are holding the last reference to the files structure.
         */
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        unsigned int word;

        for (word = 0; word * BITS_PER_LONG < fdt->max_fds; word++) {
                unsigned int fd = word * BITS_PER_LONG;
                unsigned long pending = fdt->open_fds[word];

                /* stop as soon as no set bits remain in this word */
                for (; pending; fd++, pending >>= 1) {
                        struct file *file;

                        if (!(pending & 1))
                                continue;

                        file = fdt->fd[fd];
                        if (!file)
                                continue;

                        filp_close(file, files);
                        cond_resched();
                }
        }

        return fdt;
}

/*
 * Drop a reference on @files.  Whoever drops the last one closes all
 * remaining files and frees the structure.
 */
void put_files_struct(struct files_struct *files)
{
        struct fdtable *fdt;

        if (!atomic_dec_and_test(&files->count))
                return;

        fdt = close_files(files);

        /* only a table that is not the embedded one needs freeing */
        if (fdt != &files->fdtab)
                __free_fdtable(fdt);
        kmem_cache_free(files_cachep, files);
}

/*
 * Detach the files_struct from @tsk, clearing tsk->files under task_lock(),
 * and drop the task's reference to it.  Does nothing if the task has none.
 */
void exit_files(struct task_struct *tsk)
{
        struct files_struct *files = tsk->files;

        if (!files)
                return;

        task_lock(tsk);
        tsk->files = NULL;
        task_unlock(tsk);

        put_files_struct(files);
}

/*
 * Statically initialized files_struct: one reference, ->fdt pointing at the
 * embedded fdtable, and that fdtable using the embedded fd array and bitmaps
 * for NR_OPEN_DEFAULT slots.  Members not named here are zero-initialized,
 * so no descriptor is marked open.
 * NOTE(review): presumably this is the table of the initial task - confirm
 * against the users of init_files, which are outside this file.
 */
struct files_struct init_files = {
        .count                = ATOMIC_INIT(1),
        .fdt                = &init_files.fdtab,
        .fdtab                = {
                .max_fds        = NR_OPEN_DEFAULT,
                .fd                = &init_files.fd_array[0],
                .close_on_exec        = init_files.close_on_exec_init,
                .open_fds        = init_files.open_fds_init,
                .full_fds_bits        = init_files.full_fds_bits_init,
        },
        .file_lock        = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
        .resize_wait        = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

/*
 * Find the lowest free descriptor in @fdt that is not below @start.
 * Returns fdt->max_fds when the second-level bitmap shows no word with a
 * free bit at or after the one containing @start.
 */
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
        unsigned int limit = fdt->max_fds; /* always multiple of BITS_PER_LONG */
        unsigned int nwords = limit / BITS_PER_LONG;
        unsigned int word = start / BITS_PER_LONG;
        unsigned int offset = start & (BITS_PER_LONG - 1);
        unsigned int hit, word_base;

        /*
         * Fast path: look only at the ->open_fds word that contains @start,
         * so that the second level bitmap need not be consulted.
         */
        hit = find_next_zero_bit(&fdt->open_fds[word], BITS_PER_LONG, offset);
        if (hit < BITS_PER_LONG)
                return word * BITS_PER_LONG + hit;

        /* Use ->full_fds_bits to skip over words that are entirely in use. */
        word_base = find_next_zero_bit(fdt->full_fds_bits, nwords, word) * BITS_PER_LONG;
        if (word_base >= limit)
                return limit;

        return find_next_zero_bit(fdt->open_fds, limit,
                                  word_base > start ? word_base : start);
}

/*
 * allocate a file descriptor, mark it busy.
 *
 * @start: lowest descriptor number to consider
 * @end:   exclusive upper bound; a candidate >= @end fails with -EMFILE
 * @flags: only O_CLOEXEC is looked at; it sets the close-on-exec bit
 *
 * Operates on current->files under its ->file_lock.  Returns the new
 * descriptor number, or a negative error code (-EMFILE, or whatever
 * expand_files() returned).
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
        struct files_struct *files = current->files;
        unsigned int fd;
        int error;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
repeat:
        fdt = files_fdtable(files);
        /* begin the search at start or at the next_fd hint, whichever is higher */
        fd = start;
        if (fd < files->next_fd)
                fd = files->next_fd;

        if (likely(fd < fdt->max_fds))
                fd = find_next_fd(fdt, fd);

        /*
         * N.B. For clone tasks sharing a files structure, this test
         * will limit the total number of files that can be opened.
         */
        error = -EMFILE;
        if (unlikely(fd >= end))
                goto out;

        /*
         * The candidate lies beyond the current table: grow it and search
         * again from the top, as expand_files() may have dropped the lock.
         */
        if (unlikely(fd >= fdt->max_fds)) {
                error = expand_files(files, fd);
                if (error < 0)
                        goto out;

                goto repeat;
        }

        /* the hint is only advanced when the search began at or below it */
        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt, flags & O_CLOEXEC);
        error = fd;
        /* a slot that was just allocated must not hold a file yet */
        VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);

out:
        spin_unlock(&files->file_lock);
        return error;
}

/*
 * Allocate the lowest available descriptor below @nofile for the current
 * task; @flags is passed on to alloc_fd().  Returns the descriptor or a
 * negative error code.
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
        int fd = alloc_fd(0, nofile, flags);

        return fd;
}

/*
 * Same as __get_unused_fd_flags(), with the limit taken from the
 * RLIMIT_NOFILE resource limit.
 */
int get_unused_fd_flags(unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);

        return __get_unused_fd_flags(flags, nofile);
}
EXPORT_SYMBOL(get_unused_fd_flags);

/*
 * Mark @fd as free again in the current table of @files and lower the
 * next_fd allocation hint if @fd is below it.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
        struct fdtable *fdt = files_fdtable(files);

        __clear_open_fd(fd, fdt);

        if (files->next_fd > fd)
                files->next_fd = fd;
}

/*
 * Give back descriptor @fd of the current task: takes ->file_lock around
 * __put_unused_fd().
 */
void put_unused_fd(unsigned int fd)
{
        struct files_struct *cur = current->files;

        spin_lock(&cur->file_lock);
        __put_unused_fd(cur, fd);
        spin_unlock(&cur->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/**
 * fd_install - install a file pointer in the fd array
 * @fd: file descriptor to install the file in
 * @file: the file to install
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 *
 * Operates on current->files.  A file with FMODE_BACKING set is refused
 * with a one-time warning and is not installed.  The slot for @fd is
 * expected to be empty.
 */
void fd_install(unsigned int fd, struct file *file)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;

        if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
                return;

        rcu_read_lock_sched();

        /*
         * Slow path: the table is being resized, so store the pointer under
         * ->file_lock into whichever table is current once we hold the lock.
         */
        if (unlikely(files->resize_in_progress)) {
                rcu_read_unlock_sched();
                spin_lock(&files->file_lock);
                fdt = files_fdtable(files);
                VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
                rcu_assign_pointer(fdt->fd[fd], file);
                spin_unlock(&files->file_lock);
                return;
        }
        /* coupled with smp_wmb() in expand_fdtable() */
        smp_rmb();
        /* fast path: lockless store inside the rcu_read_lock_sched() section */
        fdt = rcu_dereference_sched(files->fdt);
        VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);

/**
 * file_close_fd_locked - detach and return the file installed at fd
 * @files: file struct to retrieve file from
 * @fd: file descriptor to retrieve file for
 *
 * If a file is installed at @fd, the slot is cleared and the descriptor is
 * marked unused before the file is returned.
 *
 * Doesn't take a separate reference count.
 *
 * Context: files->file_lock must be held.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
{
        struct fdtable *fdt = files_fdtable(files);
        struct file *file;

        lockdep_assert_held(&files->file_lock);

        /* descriptors beyond the table cannot be open */
        if (fd >= fdt->max_fds)
                return NULL;

        fd = array_index_nospec(fd, fdt->max_fds);
        file = rcu_dereference_raw(fdt->fd[fd]);
        if (!file)
                return NULL;

        rcu_assign_pointer(fdt->fd[fd], NULL);
        __put_unused_fd(files, fd);
        return file;
}

/*
 * Close descriptor @fd of the current task.  Returns -EBADF if no file is
 * installed there, otherwise the result of filp_close().
 */
int close_fd(unsigned fd)
{
        struct files_struct *files = current->files;
        struct file *victim;

        spin_lock(&files->file_lock);
        victim = file_close_fd_locked(files, fd);
        spin_unlock(&files->file_lock);

        return victim ? filp_close(victim, files) : -EBADF;
}
EXPORT_SYMBOL(close_fd);

/**
 * last_fd - highest index that is valid for an fd table
 * @fdt: File descriptor table.
 *
 * Context: Either rcu read lock or files->file_lock must be held.
 *
 * Returns: Last valid index into fdtable, i.e. @fdt->max_fds - 1.
 */
static inline unsigned last_fd(struct fdtable *fdt)
{
        unsigned int nr_slots = fdt->max_fds;

        return nr_slots - 1;
}

/*
 * Set the close-on-exec bit for every descriptor from @fd up to and
 * including @max_fd, with the upper end clamped to the last slot of the
 * current table.  Takes ->file_lock.
 */
static inline void __range_cloexec(struct files_struct *cur_fds,
                                   unsigned int fd, unsigned int max_fd)
{
        struct fdtable *fdt;
        unsigned int last;

        spin_lock(&cur_fds->file_lock);
        fdt = files_fdtable(cur_fds);

        /* the table current right now decides how far we may go */
        last = min(last_fd(fdt), max_fd);
        if (last >= fd)
                bitmap_set(fdt->close_on_exec, fd, last - fd + 1);

        spin_unlock(&cur_fds->file_lock);
}

/*
 * Close every descriptor from @fd up to and including @max_fd, with the
 * upper end clamped to the last slot of the table that is current on
 * entry.  ->file_lock is dropped around each filp_close() and whenever a
 * reschedule is due.
 */
static inline void __range_close(struct files_struct *files, unsigned int fd,
                                 unsigned int max_fd)
{
        unsigned int last;

        spin_lock(&files->file_lock);
        last = last_fd(files_fdtable(files));
        max_fd = min(max_fd, last);

        for (; fd <= max_fd; fd++) {
                struct file *file = file_close_fd_locked(files, fd);

                /* nothing to close here and no need to yield: keep going */
                if (!file && !need_resched())
                        continue;

                spin_unlock(&files->file_lock);
                if (file)
                        filp_close(file, files);
                cond_resched();
                spin_lock(&files->file_lock);
        }
        spin_unlock(&files->file_lock);
}

/**
 * sys_close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  CLOSE_RANGE flags.
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 * Currently, errors to close a given file descriptor are ignored.
 *
 * With CLOSE_RANGE_CLOEXEC the descriptors in the range are marked
 * close-on-exec instead of being closed.  With CLOSE_RANGE_UNSHARE a
 * descriptor table that is currently shared is first replaced by a private
 * copy, and the operation is applied to that copy.
 *
 * Return: 0 on success, -EINVAL if @flags contains unknown bits or
 * @fd > @max_fd, or the error returned by dup_fd() when unsharing fails.
 */
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
                unsigned int, flags)
{
        struct task_struct *me = current;
        struct files_struct *cur_fds = me->files, *fds = NULL;

        if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
                return -EINVAL;

        if (fd > max_fd)
                return -EINVAL;

        /* unsharing is only needed while somebody else uses the table too */
        if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
                struct fd_range range = {fd, max_fd}, *punch_hole = &range;

                /*
                 * If the caller requested all fds to be made cloexec we always
                 * copy all of the file descriptors since they still want to
                 * use them.
                 */
                if (flags & CLOSE_RANGE_CLOEXEC)
                        punch_hole = NULL;

                fds = dup_fd(cur_fds, punch_hole);
                if (IS_ERR(fds))
                        return PTR_ERR(fds);
                /*
                 * We used to share our file descriptor table, and have now
                 * created a private one, make sure we're using it below.
                 * After the swap cur_fds is the private copy and fds is the
                 * table that is still shared.
                 */
                swap(cur_fds, fds);
        }

        if (flags & CLOSE_RANGE_CLOEXEC)
                __range_cloexec(cur_fds, fd, max_fd);
        else
                __range_close(cur_fds, fd, max_fd);

        if (fds) {
                /*
                 * We're done closing the files we were supposed to. Time to install
                 * the new file descriptor table and drop the old one.
                 */
                task_lock(me);
                me->files = cur_fds;
                task_unlock(me);
                put_files_struct(fds);
        }

        return 0;
}

/**
 * file_close_fd - detach and return the file installed at fd
 * @fd: file descriptor to retrieve file for
 *
 * Looks @fd up in the current task's descriptor table under ->file_lock
 * by way of file_close_fd_locked().
 *
 * Doesn't take a separate reference count.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd(unsigned int fd)
{
        struct files_struct *cur = current->files;
        struct file *detached;

        spin_lock(&cur->file_lock);
        detached = file_close_fd_locked(cur, fd);
        spin_unlock(&cur->file_lock);

        return detached;
}

/*
 * Close every descriptor in @files whose close-on-exec bit is set and
 * clear those bits.  ->file_lock is taken here and dropped around each
 * filp_close().
 */
void do_close_on_exec(struct files_struct *files)
{
        unsigned i;
        struct fdtable *fdt;

        /* exec unshares first */
        spin_lock(&files->file_lock);
        /* i indexes the words of ->close_on_exec, fd the first bit of word i */
        for (i = 0; ; i++) {
                unsigned long set;
                unsigned fd = i * BITS_PER_LONG;
                /* the table is looked up afresh for every word */
                fdt = files_fdtable(files);
                if (fd >= fdt->max_fds)
                        break;
                set = fdt->close_on_exec[i];
                if (!set)
                        continue;
                /* clear the whole word at once; 'set' keeps the bits to act on */
                fdt->close_on_exec[i] = 0;
                for ( ; set ; fd++, set >>= 1) {
                        struct file *file;
                        if (!(set & 1))
                                continue;
                        file = fdt->fd[fd];
                        if (!file)
                                continue;
                        rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                        /* filp_close() is called without the spinlock held */
                        spin_unlock(&files->file_lock);
                        filp_close(file, files);
                        cond_resched();
                        spin_lock(&files->file_lock);
                }

        }
        spin_unlock(&files->file_lock);
}

/*
 * Make one attempt at taking a reference on the file that *@f points to.
 *
 * Returns NULL if *@f is NULL, the file with its reference count raised
 * if *@f still pointed at the same file after the reference was taken, and
 * ERR_PTR(-EAGAIN) if no reference could be obtained or the pointer changed
 * in the meantime (a reference taken on the stale file is dropped again).
 */
static struct file *__get_file_rcu(struct file __rcu **f)
{
        struct file __rcu *file;
        struct file __rcu *file_reloaded;
        struct file __rcu *file_reloaded_cmp;

        file = rcu_dereference_raw(*f);
        if (!file)
                return NULL;

        if (unlikely(!file_ref_get(&file->f_ref)))
                return ERR_PTR(-EAGAIN);

        /* read the pointer a second time now that we hold a reference */
        file_reloaded = rcu_dereference_raw(*f);

        /*
         * Ensure that all accesses have a dependency on the load from
         * rcu_dereference_raw() above so we get correct ordering
         * between reuse/allocation and the pointer check below.
         */
        file_reloaded_cmp = file_reloaded;
        OPTIMIZER_HIDE_VAR(file_reloaded_cmp);

        /*
         * file_ref_get() above provided a full memory barrier when we
         * acquired a reference.
         *
         * This is paired with the write barrier from assigning to the
         * __rcu protected file pointer so that if that pointer still
         * matches the current file, we know we have successfully
         * acquired a reference to the right file.
         *
         * If the pointers don't match the file has been reallocated by
         * SLAB_TYPESAFE_BY_RCU.
         */
        if (file == file_reloaded_cmp)
                return file_reloaded;

        /* wrong file: give the reference back and have the caller retry */
        fput(file);
        return ERR_PTR(-EAGAIN);
}

/**
 * get_file_rcu - try to get a reference to a file under rcu
 * @f: location of the file pointer to get a reference on
 *
 * This function tries to get a reference on the file @f points to,
 * carefully verifying that it hasn't been reused, and keeps trying for as
 * long as an attempt fails.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: The file @f points to with its reference count increased, or
 * NULL if @f points to a NULL file pointer.
 */
struct file *get_file_rcu(struct file __rcu **f)
{
        struct file __rcu *file;

        /* any error pointer from the helper means "try again" */
        do {
                file = __get_file_rcu(f);
        } while (IS_ERR(file));

        return file;
}
EXPORT_SYMBOL_GPL(get_file_rcu);

/**
 * get_file_active - try go get a reference to a file
 * @f: the file to get a reference on
 *
 * In contast to get_file_rcu() the pointer itself isn't part of the
 * reference counting.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: Returns @f with the reference count increased or NULL.
 */
struct file *get_file_active(struct file **f)
{
        struct file __rcu *file;

        rcu_read_lock();
        file = __get_file_rcu(f);
        rcu_read_unlock();
        if (IS_ERR(file))
                file = NULL;
        return file;
}
EXPORT_SYMBOL_GPL(get_file_active);

/*
 * __fget_files_rcu - look up @fd in @files and take a reference on its file
 * @files: descriptor table owner to search
 * @fd:    descriptor number; out-of-range values are masked rather than
 *         branched on, via array_index_mask_nospec()
 * @mask:  f_mode bits that disqualify the file
 *
 * Retries until it either observes an empty slot or manages to take a
 * reference on a file that is still installed at @fd in the same fdtable.
 * The callers in this file invoke it between rcu_read_lock() and
 * rcu_read_unlock().
 *
 * Return: the file with a reference held, or NULL if the slot is empty,
 * @fd is out of range, or the file has any of the @mask bits set in f_mode
 * (the reference is dropped again in that case).
 */
static inline struct file *__fget_files_rcu(struct files_struct *files,
       unsigned int fd, fmode_t mask)
{
        for (;;) {
                struct file *file;
                struct fdtable *fdt = rcu_dereference_raw(files->fdt);
                struct file __rcu **fdentry;
                unsigned long nospec_mask;

                /* Mask is a 0 for invalid fd's, ~0 for valid ones */
                nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);

                /*
                 * fdentry points to the 'fd' offset, or fdt->fd[0].
                 * Loading from fdt->fd[0] is always safe, because the
                 * array always exists.
                 */
                fdentry = fdt->fd + (fd & nospec_mask);

                /* Do the load, then mask any invalid result */
                file = rcu_dereference_raw(*fdentry);
                file = (void *)(nospec_mask & (unsigned long)file);
                if (unlikely(!file))
                        return NULL;

                /*
                 * Ok, we have a file pointer that was valid at
                 * some point, but it might have become stale since.
                 *
                 * We need to confirm it by incrementing the refcount
                 * and then check the lookup again.
                 *
                 * file_ref_get() gives us a full memory barrier. We
                 * only really need an 'acquire' one to protect the
                 * loads below, but we don't have that.
                 */
                if (unlikely(!file_ref_get(&file->f_ref)))
                        continue;

                /*
                 * Such a race can take two forms:
                 *
                 *  (a) the file ref already went down to zero and the
                 *      file hasn't been reused yet or the file count
                 *      isn't zero but the file has already been reused.
                 *
                 *  (b) the file table entry has changed under us.
                 *       Note that we don't need to re-check the 'fdt->fd'
                 *       pointer having changed, because it always goes
                 *       hand-in-hand with 'fdt'.
                 *
                 * If so, we need to put our ref and try again.
                 */
                if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
                    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
                        fput(file);
                        continue;
                }

                /*
                 * This isn't the file we're looking for or we're not
                 * allowed to get a reference to it.
                 */
                if (unlikely(file->f_mode & mask)) {
                        fput(file);
                        return NULL;
                }

                /*
                 * Ok, we have a ref to the file, and checked that it
                 * still exists.
                 */
                return file;
        }
}

/*
 * Referenced lookup of @fd in @files: wraps __fget_files_rcu() in its own
 * RCU read-side section. Returns the file with a reference held, or NULL.
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
                                 fmode_t mask)
{
        struct file *found;

        rcu_read_lock();
        found = __fget_files_rcu(files, fd, mask);
        rcu_read_unlock();

        return found;
}

/* Referenced lookup of @fd in the calling task's own descriptor table. */
static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
        struct files_struct *own_files = current->files;

        return __fget_files(own_files, fd, mask);
}

/*
 * fget - get a referenced file for @fd from the current task's table.
 *
 * FMODE_PATH is passed as the reject mask, so a file whose f_mode has
 * FMODE_PATH set is reported as NULL, as is an empty slot. A non-NULL
 * result carries a reference taken on the lookup path.
 */
struct file *fget(unsigned int fd)
{
        return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

/*
 * fget_raw - like fget(), but with an empty reject mask: no f_mode bit
 * disqualifies the file, so only an empty slot yields NULL.
 */
struct file *fget_raw(unsigned int fd)
{
        return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);

/*
 * fget_task - referenced lookup of @fd in another task's descriptor table.
 *
 * task->files is sampled and used under task_lock(). Returns NULL when the
 * task has no files_struct or the slot is empty; no f_mode bits are
 * filtered (mask 0).
 */
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
        struct files_struct *files;
        struct file *file = NULL;

        task_lock(task);
        files = task->files;
        if (files)
                file = __fget_files(files, fd, 0);
        task_unlock(task);

        return file;
}

/*
 * fget_task_next - find the first open file of @task at or after *@ret_fd
 * @task:   task whose descriptor table is scanned
 * @ret_fd: in: first descriptor to try; out: descriptor of the returned
 *          file, or the value at which the scan stopped
 *
 * The scan runs under task_lock() and inside an rcu_read_lock() section
 * taken here.
 * NOTE(review): the previous comment said this must be called with
 * rcu_read_lock held; since the lock is taken below, confirm whether
 * callers still need to hold it.
 *
 * Return: the file with a reference held, or NULL if none was found or the
 * task has no files_struct (in which case *@ret_fd is written back
 * unchanged).
 */
struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
{
        struct files_struct *files;
        struct file *file = NULL;
        unsigned int fd = *ret_fd;

        task_lock(task);
        files = task->files;
        if (files) {
                rcu_read_lock();
                /* max_fds is re-read every round. */
                while (fd < files_fdtable(files)->max_fds) {
                        file = __fget_files_rcu(files, fd, 0);
                        if (file)
                                break;
                        fd++;
                }
                rcu_read_unlock();
        }
        task_unlock(task);

        *ret_fd = fd;
        return file;
}
EXPORT_SYMBOL(fget_task_next);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 *
 * (As an exception to rule 2, you can call filp_close between fget_light and
 * fput_light provided that you capture a real refcount with get_file before
 * the call to filp_close, and ensure that this real refcount is fput *after*
 * the fput_light call.)
 *
 * See also the documentation in rust/kernel/file.rs.
 */
static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
        struct files_struct *files = current->files;
        struct file *file;

        /*
         * The acquire read of files->count pairs with atomic_dec_and_test()
         * in put_files_struct(): if another thread does close_fd() followed
         * by put_files_struct(), we must never combine the old table entry
         * with the new refcount, or we could hand out a file that is
         * concurrently being freed.
         */
        if (unlikely(atomic_read_acquire(&files->count) != 1)) {
                /* Table is shared: take a real reference on the file. */
                file = __fget_files(files, fd, mask);
                if (!file)
                        return EMPTY_FD;
                return CLONED_FD(file);
        }

        /* Sole user of the table: borrow the file, no refcount change. */
        file = files_lookup_fd_raw(files, fd);
        if (!file || unlikely(file->f_mode & mask))
                return EMPTY_FD;
        return BORROWED_FD(file);
}
/*
 * fdget - lightweight lookup of @fd via __fget_light().
 *
 * FMODE_PATH is the reject mask: a file with FMODE_PATH set in f_mode, or
 * an empty slot, yields EMPTY_FD. The rules in the comment above
 * __fget_light() apply to the returned struct fd.
 */
struct fd fdget(unsigned int fd)
{
        return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fdget);

/*
 * fdget_raw - like fdget(), but with an empty reject mask, so no f_mode
 * bit disqualifies the file.
 */
struct fd fdget_raw(unsigned int fd)
{
        return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
        /* Files without FMODE_ATOMIC_POS never take f_pos_lock here. */
        if (!(file->f_mode & FMODE_ATOMIC_POS))
                return false;

        /*
         * Lock when the raw reference count is anything other than
         * FILE_REF_ONEREF, or when the file implements ->iterate_shared.
         */
        return __file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF ||
               file->f_op->iterate_shared;
}

/*
 * Returns true when @file has FMODE_ATOMIC_POS set or implements
 * ->iterate_shared, false otherwise.
 */
bool file_seek_cur_needs_f_lock(struct file *file)
{
        /*
         * Note that we are not guaranteed to be called after fdget_pos() on
         * this file obj, in which case the caller is expected to provide the
         * appropriate locking.
         */
        return (file->f_mode & FMODE_ATOMIC_POS) ||
               file->f_op->iterate_shared;
}

/*
 * fdget_pos - fdget() plus f_pos_lock when the file needs it.
 *
 * If file_needs_f_pos_lock() says so, FDPUT_POS_UNLOCK is recorded in the
 * returned struct fd and file->f_pos_lock is taken before returning.
 */
struct fd fdget_pos(unsigned int fd)
{
        struct fd f = fdget(fd);
        struct file *file = fd_file(f);

        if (unlikely(!file) || !file_needs_f_pos_lock(file))
                return f;

        f.word |= FDPUT_POS_UNLOCK;
        mutex_lock(&file->f_pos_lock);
        return f;
}

/*
 * Release @f->f_pos_lock, the mutex fdget_pos() takes when it tags the
 * struct fd with FDPUT_POS_UNLOCK.
 * NOTE(review): presumably called from the fdput path when that flag is
 * set; the caller is not visible here - confirm.
 */
void __f_unlock_pos(struct file *f)
{
        mutex_unlock(&f->f_pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

/*
 * Update the close-on-exec state of @fd in the current task's descriptor
 * table according to @flag, under files->file_lock.
 */
void set_close_on_exec(unsigned int fd, int flag)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        __set_close_on_exec(fd, fdt, flag);
        spin_unlock(&files->file_lock);
}

/*
 * Report the close-on-exec state of @fd in the current task's descriptor
 * table; the table is read inside an RCU read-side section.
 */
bool get_close_on_exec(unsigned int fd)
{
        bool cloexec;

        rcu_read_lock();
        cloexec = close_on_exec(fd, current->files);
        rcu_read_unlock();

        return cloexec;
}

/*
 * do_dup2 - install @file at descriptor @fd, closing whatever was there
 * @files: descriptor table to modify; entered with files->file_lock held
 * @file:  file to install; an extra reference is taken with get_file()
 * @fd:    target descriptor, clamped with array_index_nospec() against
 *         the current table size
 * @flags: only O_CLOEXEC is looked at
 *
 * Always returns with files->file_lock released. The previous occupant of
 * the slot, if any, is closed with filp_close() only after the lock has
 * been dropped.
 *
 * Return: @fd on success, -EBUSY if the slot is reserved but not yet
 * populated (see the comment in the body).
 */
static int do_dup2(struct files_struct *files,
        struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
        struct file *tofree;
        struct fdtable *fdt;

        /*
         * dup2() is expected to close the file installed in the target fd slot
         * (if any). However, userspace hand-picking a fd may be racing against
         * its own threads which happened to allocate it in open() et al but did
         * not populate it yet.
         *
         * Broadly speaking we may be racing against the following:
         * fd = get_unused_fd_flags();     // fd slot reserved, ->fd[fd] == NULL
         * file = hard_work_goes_here();
         * fd_install(fd, file);           // only now ->fd[fd] == file
         *
         * It is an invariant that a successfully allocated fd has a NULL entry
         * in the array until the matching fd_install().
         *
         * If we fit the window, we have the fd to populate, yet no target file
         * to close. Trying to ignore it and install our new file would violate
         * the invariant and make fd_install() overwrite our file.
         *
         * Things can be done(tm) to handle this. However, the issue does not
         * concern legitimate programs and we only need to make sure the kernel
         * does not trip over it.
         *
         * The simplest way out is to return an error if we find ourselves here.
         *
         * POSIX is silent on the issue, we return -EBUSY.
         */
        fdt = files_fdtable(files);
        fd = array_index_nospec(fd, fdt->max_fds);
        tofree = rcu_dereference_raw(fdt->fd[fd]);
        /* Reserved (marked open) but still NULL: the race described above. */
        if (!tofree && fd_is_open(fd, fdt))
                goto Ebusy;
        get_file(file);
        rcu_assign_pointer(fdt->fd[fd], file);
        __set_open_fd(fd, fdt, flags & O_CLOEXEC);
        spin_unlock(&files->file_lock);

        /* Close the displaced file outside the spinlock. */
        if (tofree)
                filp_close(tofree, files);

        return fd;

Ebusy:
        spin_unlock(&files->file_lock);
        return -EBUSY;
}

/*
 * replace_fd - make descriptor @fd of the current task refer to @file
 *
 * A NULL @file turns the call into close_fd(@fd). Otherwise @fd must be
 * below RLIMIT_NOFILE (-EBADF if not), the table is grown as needed and
 * do_dup2() installs the file, honouring O_CLOEXEC in @flags.
 *
 * Return: 0 on success, a negative errno otherwise.
 */
int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
        struct files_struct *files = current->files;
        int err;

        if (!file)
                return close_fd(fd);

        if (fd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, fd);
        if (unlikely(err < 0)) {
                spin_unlock(&files->file_lock);
                return err;
        }

        /* do_dup2() drops file_lock on every path. */
        err = do_dup2(files, file, fd, flags);
        if (err < 0)
                return err;

        /* do_dup2() returned the fd number; callers get plain success. */
        return 0;
}

/**
 * receive_fd() - Install received file into file descriptor table
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Runs the security_file_receive() check, reserves a new descriptor,
 * optionally writes its number to userspace (when @ufd is non-NULL) and
 * only then installs @file in the descriptor table.
 *
 * This helper handles its own reference counting of the incoming
 * struct file: the installed reference is taken with get_file().
 *
 * Returns the newly installed fd, or a negative errno on error.
 */
int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
        int fd;
        int err;

        err = security_file_receive(file);
        if (err)
                return err;

        fd = get_unused_fd_flags(o_flags);
        if (fd < 0)
                return fd;

        if (ufd) {
                err = put_user(fd, ufd);
                if (err)
                        goto out_put_fd;
        }

        fd_install(fd, get_file(file));
        __receive_sock(file);
        return fd;

out_put_fd:
        /* Nothing was installed yet: give the reserved slot back. */
        put_unused_fd(fd);
        return err;
}
EXPORT_SYMBOL_GPL(receive_fd);

/*
 * receive_fd_replace - like receive_fd(), but install @file at the
 * caller-chosen descriptor @new_fd through replace_fd().
 *
 * Return: @new_fd on success, otherwise the error reported by
 * security_file_receive() or replace_fd().
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
        int error = security_file_receive(file);

        if (!error)
                error = replace_fd(new_fd, file, o_flags);
        if (error)
                return error;

        __receive_sock(file);
        return new_fd;
}

/*
 * ksys_dup3 - common implementation behind the dup3() and dup2() syscalls
 * @oldfd: descriptor to duplicate
 * @newfd: descriptor number to install the duplicate at
 * @flags: 0 or O_CLOEXEC; anything else is -EINVAL
 *
 * Return: @newfd on success; -EINVAL for bad @flags or @oldfd == @newfd;
 * -EBADF if @newfd is at or above RLIMIT_NOFILE, if @oldfd is not open, or
 * if growing the table failed with -EMFILE; any other expand_files() or
 * do_dup2() error unchanged.
 */
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
        struct files_struct *files = current->files;
        struct file *file;
        int err;

        if (flags & ~O_CLOEXEC)
                return -EINVAL;
        if (unlikely(oldfd == newfd))
                return -EINVAL;
        if (newfd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        /* Grow the table first; @oldfd is looked up only afterwards. */
        err = expand_files(files, newfd);
        file = files_lookup_fd_locked(files, oldfd);
        if (unlikely(!file)) {
                err = -EBADF;
        } else if (unlikely(err < 0)) {
                if (err == -EMFILE)
                        err = -EBADF;
        } else {
                /* do_dup2() releases file_lock itself. */
                return do_dup2(files, file, newfd, flags);
        }

        spin_unlock(&files->file_lock);
        return err;
}

/* dup3() syscall entry point: argument checks and the work are in ksys_dup3(). */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
        return ksys_dup3(oldfd, newfd, flags);
}

/*
 * dup2() syscall entry point.
 *
 * Unlike ksys_dup3(), @oldfd == @newfd is not an error here: the call
 * then only checks that the descriptor is open and returns it, or -EBADF.
 * Everything else is ksys_dup3() with no flags.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
        struct files_struct *files;
        struct file *f;
        int retval = oldfd;

        if (likely(newfd != oldfd))
                return ksys_dup3(oldfd, newfd, 0);

        /* corner case: probe the descriptor with a temporary reference */
        files = current->files;
        rcu_read_lock();
        f = __fget_files_rcu(files, oldfd, 0);
        rcu_read_unlock();

        if (f)
                fput(f);
        else
                retval = -EBADF;

        return retval;
}

/*
 * dup() syscall entry point: take a reference on the file behind @fildes
 * (no f_mode filtering) and install it at a newly allocated descriptor.
 * The reference is handed to fd_install() on success and dropped when no
 * descriptor could be allocated.
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
        struct file *file = fget_raw(fildes);
        int newfd;

        if (!file)
                return -EBADF;

        newfd = get_unused_fd_flags(0);
        if (newfd < 0) {
                fput(file);
                return newfd;
        }

        fd_install(newfd, file);
        return newfd;
}

/*
 * f_dupfd - duplicate @file onto a descriptor allocated at or above @from
 *
 * @from must be below RLIMIT_NOFILE (-EINVAL otherwise). On success an
 * extra reference is taken on @file for the new table entry.
 *
 * Return: the new descriptor, or a negative errno.
 */
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);
        int newfd;

        if (from >= nofile)
                return -EINVAL;

        newfd = alloc_fd(from, nofile, flags);
        if (newfd < 0)
                return newfd;

        get_file(file);
        fd_install(newfd, file);
        return newfd;
}

/*
 * iterate_fd - call @f for every installed file from descriptor @n upward
 * @files: descriptor table owner; NULL is accepted and yields 0
 * @n:     first descriptor to visit
 * @f:     callback, invoked as f(@p, file, fd) with files->file_lock held
 * @p:     opaque cookie handed to @f
 *
 * Empty slots are skipped. The walk stops at the first non-zero return
 * from @f.
 *
 * Return: that non-zero value, or 0 if every callback returned 0.
 */
int iterate_fd(struct files_struct *files, unsigned n,
                int (*f)(const void *, struct file *, unsigned),
                const void *p)
{
        struct fdtable *fdt;
        int res = 0;

        if (!files)
                return 0;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        while (n < fdt->max_fds) {
                struct file *file;

                file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
                if (file) {
                        res = f(p, file, n);
                        if (res)
                                break;
                }
                n++;
        }
        spin_unlock(&files->file_lock);

        return res;
}
EXPORT_SYMBOL(iterate_fd);


































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sysfs.h - definitions for the device driver filesystem
 *
 * Copyright (c) 2001,2002 Patrick Mochel
 * Copyright (c) 2004 Silicon Graphics, Inc.
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * Please see Documentation/filesystems/sysfs.rst for more information.
 */

#ifndef _SYSFS_H_
#define _SYSFS_H_

#include <linux/kernfs.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/kobject_ns.h>
#include <linux/stat.h>
#include <linux/atomic.h>

struct kobject;
struct module;
struct bin_attribute;
enum kobj_ns_type;

/**
 * struct attribute - name and mode of a sysfs attribute
 * @name: attribute name; the __ATTR*() helpers fill it with the
 *        stringified identifier.
 * @mode: permission bits; may additionally carry SYSFS_PREALLOC
 *        (see __ATTR_PREALLOC()).
 * @ignore_lockdep: only with CONFIG_DEBUG_LOCK_ALLOC; set by
 *        __ATTR_IGNORE_LOCKDEP().
 * @key:  only with CONFIG_DEBUG_LOCK_ALLOC; lock class key pointer,
 *        assigned by sysfs_attr_init().
 * @skey: only with CONFIG_DEBUG_LOCK_ALLOC; embedded lock class key
 *        (its user is not visible in this header).
 */
struct attribute {
        const char                *name;
        umode_t                        mode;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        bool                        ignore_lockdep:1;
        struct lock_class_key        *key;
        struct lock_class_key        skey;
#endif
};

/**
 *        sysfs_attr_init - initialize a dynamically allocated sysfs attribute
 *        @attr: struct attribute to initialize
 *
 *        Initialize a dynamically allocated struct attribute so we can
 *        make lockdep happy.  This is a new requirement for attributes
 *        and initially this is only needed when lockdep is enabled.
 *        Lockdep gives a nice error when your attribute is added to
 *        sysfs if you don't have this.
 *
 *        With CONFIG_DEBUG_LOCK_ALLOC each expansion declares its own
 *        static lock_class_key and points @attr->key at it; without it
 *        the macro expands to an empty statement.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define sysfs_attr_init(attr)                                \
do {                                                        \
        static struct lock_class_key __key;                \
                                                        \
        (attr)->key = &__key;                                \
} while (0)
#else
#define sysfs_attr_init(attr) do {} while (0)
#endif

/**
 * struct attribute_group - data structure used to declare an attribute group.
 * @name:        Optional: Attribute group name
 *                If specified, the attribute group will be created in a
 *                new subdirectory with this name. Additionally when a
 *                group is named, @is_visible and @is_bin_visible may
 *                return SYSFS_GROUP_INVISIBLE to control visibility of
 *                the directory itself.
 * @is_visible:        Optional: Function to return permissions associated with an
 *                attribute of the group. Will be called repeatedly for
 *                each non-binary attribute in the group. Only read/write
 *                permissions as well as SYSFS_PREALLOC are accepted. Must
 *                return 0 if an attribute is not visible. The returned
 *                value will replace static permissions defined in struct
 *                attribute. Use SYSFS_GROUP_VISIBLE() when assigning this
 *                callback to specify separate _group_visible() and
 *                _attr_visible() handlers.
 * @is_bin_visible:
 *                Optional: Function to return permissions associated with a
 *                binary attribute of the group. Will be called repeatedly
 *                for each binary attribute in the group. Only read/write
 *                permissions as well as SYSFS_PREALLOC (and the
 *                visibility flags for named groups) are accepted. Must
 *                return 0 if a binary attribute is not visible. The
 *                returned value will replace static permissions defined
 *                in struct bin_attribute. If @is_visible is not set, use
 *                SYSFS_GROUP_VISIBLE() when assigning this callback to
 *                specify separate _group_visible() and _attr_visible()
 *                handlers.
 * @bin_size:
 *                Optional: Function to return the size of a binary attribute
 *                of the group. Will be called repeatedly for each binary
 *                attribute in the group. Overwrites the size field embedded
 *                inside the attribute itself.
 * @attrs:        Pointer to NULL terminated list of attributes.
 * @bin_attrs:        Pointer to NULL terminated list of binary attributes.
 *                Either attrs or bin_attrs or both must be provided.
 */
struct attribute_group {
        const char                *name;
        umode_t                        (*is_visible)(struct kobject *,
                                              struct attribute *, int);
        umode_t                        (*is_bin_visible)(struct kobject *,
                                                  const struct bin_attribute *, int);
        size_t                        (*bin_size)(struct kobject *,
                                            const struct bin_attribute *,
                                            int);
        struct attribute        **attrs;
        const struct bin_attribute        *const *bin_attrs;
};

/*
 * Flag bits (octal) that are used together with permission bits in
 * attribute modes and in the values returned by the attribute_group
 * visibility callbacks.
 */
/* ORed into an attribute's mode, e.g. by __ATTR_PREALLOC(). */
#define SYSFS_PREALLOC                010000
/* Returned by a visibility callback to hide a named group's directory. */
#define SYSFS_GROUP_INVISIBLE        020000

/*
 * DEFINE_SYSFS_GROUP_VISIBLE(name):
 *        A helper macro to pair with the assignment of ".is_visible =
 *        SYSFS_GROUP_VISIBLE(name)", that arranges for the directory
 *        associated with a named attribute_group to optionally be hidden.
 *        This allows for static declaration of attribute_groups, and the
 *        simplification of attribute visibility lifetime that implies,
 *        without polluting sysfs with empty attribute directories.
 * Ex.
 *
 * static umode_t example_attr_visible(struct kobject *kobj,
 *                                   struct attribute *attr, int n)
 * {
 *       if (example_attr_condition)
 *               return 0;
 *       else if (ro_attr_condition)
 *               return 0444;
 *       return attr->mode;
 * }
 *
 * static bool example_group_visible(struct kobject *kobj)
 * {
 *       if (example_group_condition)
 *               return false;
 *       return true;
 * }
 *
 * DEFINE_SYSFS_GROUP_VISIBLE(example);
 *
 * static struct attribute_group example_group = {
 *       .name = "example",
 *       .is_visible = SYSFS_GROUP_VISIBLE(example),
 *       .attrs = example_attrs,
 * };
 *
 * Note that it expects <name>_attr_visible and <name>_group_visible to
 * be defined. For cases where individual attributes do not need
 * separate visibility consideration, only entire group visibility at
 * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE().
 *
 * The generated function consults <name>_group_visible() only when it is
 * called with n == 0; for every other index it goes straight to
 * <name>_attr_visible().
 */
#define DEFINE_SYSFS_GROUP_VISIBLE(name)                             \
        static inline umode_t sysfs_group_visible_##name(            \
                struct kobject *kobj, struct attribute *attr, int n) \
        {                                                            \
                if (n == 0 && !name##_group_visible(kobj))           \
                        return SYSFS_GROUP_INVISIBLE;                \
                return name##_attr_visible(kobj, attr, n);           \
        }

/*
 * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name):
 *        A helper macro to pair with SYSFS_GROUP_VISIBLE() that like
 *        DEFINE_SYSFS_GROUP_VISIBLE() controls group visibility, but does
 *        not require the implementation of a per-attribute visibility
 *        callback. Attributes keep their static mode (a->mode).
 * Ex.
 *
 * static bool example_group_visible(struct kobject *kobj)
 * {
 *       if (example_group_condition)
 *               return false;
 *       return true;
 * }
 *
 * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example);
 *
 * static struct attribute_group example_group = {
 *       .name = "example",
 *       .is_visible = SYSFS_GROUP_VISIBLE(example),
 *       .attrs = example_attrs,
 * };
 */
#define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name)                   \
        static inline umode_t sysfs_group_visible_##name(         \
                struct kobject *kobj, struct attribute *a, int n) \
        {                                                         \
                if (n == 0 && !name##_group_visible(kobj))        \
                        return SYSFS_GROUP_INVISIBLE;             \
                return a->mode;                                   \
        }

/*
 * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary
 * attributes: the generated function takes a const struct bin_attribute *
 * and is meant to be assigned to .is_bin_visible. If an attribute_group
 * defines both text and binary attributes, the group visibility is
 * determined by the function specified to is_visible(), not
 * is_bin_visible().
 */
#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name)                                   \
        static inline umode_t sysfs_group_visible_##name(                      \
                struct kobject *kobj, const struct bin_attribute *attr, int n) \
        {                                                                      \
                if (n == 0 && !name##_group_visible(kobj))                     \
                        return SYSFS_GROUP_INVISIBLE;                          \
                return name##_attr_visible(kobj, attr, n);                     \
        }

/*
 * Binary-attribute counterpart of DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE():
 * only <name>_group_visible() is required, and each binary attribute
 * keeps its static mode (a->mode).
 */
#define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name)                         \
        static inline umode_t sysfs_group_visible_##name(                   \
                struct kobject *kobj, const struct bin_attribute *a, int n) \
        {                                                                   \
                if (n == 0 && !name##_group_visible(kobj))                  \
                        return SYSFS_GROUP_INVISIBLE;                       \
                return a->mode;                                             \
        }

/*
 * Name of the function generated by one of the DEFINE_*_GROUP_VISIBLE(fn)
 * macros above; assign it to .is_visible or .is_bin_visible.
 */
#define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn

/*
 * Use these macros to make defining attributes easier.
 * See include/linux/device.h for examples..
 */

#define __ATTR(_name, _mode, _show, _store) {                                \
        .attr = {.name = __stringify(_name),                                \
                 .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _show,                                                \
        .store        = _store,                                                \
}

#define __ATTR_PREALLOC(_name, _mode, _show, _store) {                        \
        .attr = {.name = __stringify(_name),                                \
                 .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\
        .show        = _show,                                                \
        .store        = _store,                                                \
}

#define __ATTR_RO(_name) {                                                \
        .attr        = { .name = __stringify(_name), .mode = 0444 },                \
        .show        = _name##_show,                                                \
}

#define __ATTR_RO_MODE(_name, _mode) {                                        \
        .attr        = { .name = __stringify(_name),                                \
                    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _name##_show,                                                \
}

/*
 * __ATTR_RW_MODE - initializer with a caller-supplied, verified @_mode that
 * wires .show to <_name>_show and .store to <_name>_store.
 */
#define __ATTR_RW_MODE(_name, _mode) {                                        \
        .attr        = { .name = __stringify(_name),                                \
                    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _name##_show,                                                \
        .store        = _name##_store,                                        \
}

/*
 * __ATTR_WO - initializer with fixed mode 0200 whose .store callback is the
 * identifier <_name>_store; .show is left unset.
 */
#define __ATTR_WO(_name) {                                                \
        .attr        = { .name = __stringify(_name), .mode = 0200 },                \
        .store        = _name##_store,                                        \
}

/* __ATTR_RW - __ATTR() with mode 0644 and the <_name>_show / <_name>_store callbacks. */
#define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store)

/* __ATTR_NULL - initializer whose .attr.name is NULL (presumably an array terminator; confirm with users). */
#define __ATTR_NULL { .attr = { .name = NULL } }

/*
 * __ATTR_IGNORE_LOCKDEP - with CONFIG_DEBUG_LOCK_ALLOC this is an initializer
 * like __ATTR() that additionally sets .attr.ignore_lockdep = true; note that
 * @_mode is stored as given, without VERIFY_OCTAL_PERMISSIONS().  Without
 * CONFIG_DEBUG_LOCK_ALLOC it is simply an alias for __ATTR.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) {        \
        .attr = {.name = __stringify(_name), .mode = _mode,        \
                        .ignore_lockdep = true },                \
        .show                = _show,                                \
        .store                = _store,                                \
}
#else
#define __ATTR_IGNORE_LOCKDEP        __ATTR
#endif

/*
 * __ATTRIBUTE_GROUPS - define the static array <_name>_groups holding the
 * address of <_name>_group followed by a NULL terminator.  <_name>_group
 * must already be declared where the macro is used.
 */
#define __ATTRIBUTE_GROUPS(_name)                                \
static const struct attribute_group *_name##_groups[] = {        \
        &_name##_group,                                                \
        NULL,                                                        \
}

/*
 * ATTRIBUTE_GROUPS - define <_name>_group with .attrs = <_name>_attrs and,
 * via __ATTRIBUTE_GROUPS(), the NULL-terminated <_name>_groups array.
 * The expansion ends without a semicolon; the user supplies it.
 */
#define ATTRIBUTE_GROUPS(_name)                                        \
static const struct attribute_group _name##_group = {                \
        .attrs = _name##_attrs,                                        \
};                                                                \
__ATTRIBUTE_GROUPS(_name)

/*
 * BIN_ATTRIBUTE_GROUPS - same as ATTRIBUTE_GROUPS() except that
 * <_name>_attrs is assigned to the group's .bin_attrs member.
 */
#define BIN_ATTRIBUTE_GROUPS(_name)                                \
static const struct attribute_group _name##_group = {                \
        .bin_attrs = _name##_attrs,                                \
};                                                                \
__ATTRIBUTE_GROUPS(_name)

struct file;
struct vm_area_struct;
struct address_space;

/*
 * struct bin_attribute - an attribute together with a size, a private
 * pointer and a set of callbacks.
 *
 * @attr:      embedded struct attribute (name and mode are filled in by the
 *             __BIN_ATTR() initializer)
 * @size:      size value supplied through the _size initializer argument
 * @private:   opaque pointer; not interpreted in this header
 * @f_mapping: callback returning a struct address_space pointer
 * @read:      read callback; same signature as sysfs_bin_attr_simple_read()
 * @write:     write callback; note the buffer parameter is a non-const char *
 * @llseek:    seek callback taking an loff_t and an int, returning loff_t
 * @mmap:      mmap callback receiving the struct vm_area_struct to set up
 *
 * The __BIN_ATTR_RO()/__BIN_ATTR_WO() initializers pass NULL for the
 * callback they do not provide.
 */
struct bin_attribute {
        struct attribute        attr;
        size_t                        size;
        void                        *private;
        struct address_space *(*f_mapping)(void);
        ssize_t (*read)(struct file *, struct kobject *, const struct bin_attribute *,
                        char *, loff_t, size_t);
        ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *,
                         char *, loff_t, size_t);
        loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *,
                         loff_t, int);
        int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr,
                    struct vm_area_struct *vma);
};

/**
 *        sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute
 *        @bin_attr: pointer to the struct bin_attribute to initialize
 *
 *        Initialize a dynamically allocated struct bin_attribute so we
 *        can make lockdep happy.  This is a new requirement for
 *        attributes and initially this is only needed when lockdep is
 *        enabled.  Lockdep gives a nice error when your attribute is
 *        added to sysfs if you don't have this.
 *
 *        Expands to sysfs_attr_init() applied to the embedded ->attr member.
 */
#define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr)

/*
 * Macros that make it easier to create static binary attributes.
 *
 * __BIN_ATTR - brace initializer for struct bin_attribute.
 * @_name:  bare identifier; stringified into .attr.name
 * @_mode:  stored in .attr.mode as given (no VERIFY_OCTAL_PERMISSIONS())
 * @_read:  value for the .read callback
 * @_write: value for the .write callback
 * @_size:  value for the .size member
 */
#define __BIN_ATTR(_name, _mode, _read, _write, _size) {                \
        .attr = { .name = __stringify(_name), .mode = _mode },                \
        .read = _read,                                                        \
        .write = _write,                                                \
        .size        = _size,                                                \
}

/* __BIN_ATTR_RO - mode 0444, .read = <_name>_read, .write = NULL. */
#define __BIN_ATTR_RO(_name, _size)                                        \
        __BIN_ATTR(_name, 0444, _name##_read, NULL, _size)

/* __BIN_ATTR_WO - mode 0200, .read = NULL, .write = <_name>_write. */
#define __BIN_ATTR_WO(_name, _size)                                        \
        __BIN_ATTR(_name, 0200, NULL, _name##_write, _size)

/* __BIN_ATTR_RW - mode 0644, .read = <_name>_read, .write = <_name>_write. */
#define __BIN_ATTR_RW(_name, _size)                                        \
        __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size)

/* __BIN_ATTR_NULL - alias for __ATTR_NULL (.attr.name = NULL). */
#define __BIN_ATTR_NULL __ATTR_NULL

/*
 * The BIN_ATTR*() macros define a variable named bin_attr_<_name> of type
 * struct bin_attribute, initialized with the matching __BIN_ATTR*() macro.
 * No storage class or trailing semicolon is emitted; the user supplies both.
 */
#define BIN_ATTR(_name, _mode, _read, _write, _size)                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read,        \
                                        _write, _size)

/* Read-only variant: see __BIN_ATTR_RO(). */
#define BIN_ATTR_RO(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size)

/* Write-only variant: see __BIN_ATTR_WO(). */
#define BIN_ATTR_WO(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size)

/* Read-write variant: see __BIN_ATTR_RW(). */
#define BIN_ATTR_RW(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)


/* __BIN_ATTR_ADMIN_RO - mode 0400 (owner read only), .read = <_name>_read. */
#define __BIN_ATTR_ADMIN_RO(_name, _size)                                \
        __BIN_ATTR(_name, 0400, _name##_read, NULL, _size)

/* __BIN_ATTR_ADMIN_RW - mode 0600 (owner read/write), <_name>_read and <_name>_write. */
#define __BIN_ATTR_ADMIN_RW(_name, _size)                                        \
        __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size)

/* Define bin_attr_<_name> using __BIN_ATTR_ADMIN_RO(). */
#define BIN_ATTR_ADMIN_RO(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size)

/* Define bin_attr_<_name> using __BIN_ATTR_ADMIN_RW(). */
#define BIN_ATTR_ADMIN_RW(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size)

/*
 * __BIN_ATTR_SIMPLE_RO - initializer that uses sysfs_bin_attr_simple_read()
 * as the .read callback, NULL for .write and 0 for .size; @_mode is supplied
 * by the caller.
 */
#define __BIN_ATTR_SIMPLE_RO(_name, _mode)                                \
        __BIN_ATTR(_name, _mode, sysfs_bin_attr_simple_read, NULL, 0)

/* Define bin_attr_<_name> as a simple read-only attribute with mode 0444. */
#define BIN_ATTR_SIMPLE_RO(_name)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444)

/* Define bin_attr_<_name> as a simple read-only attribute with mode 0400. */
#define BIN_ATTR_SIMPLE_ADMIN_RO(_name)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0400)

/*
 * struct sysfs_ops - pair of callbacks operating on a (kobject, attribute).
 * @show:  receives a writable char buffer and returns ssize_t
 * @store: receives a const char buffer plus a size_t length and returns
 *         ssize_t
 */
struct sysfs_ops {
        ssize_t        (*show)(struct kobject *, struct attribute *, char *);
        ssize_t        (*store)(struct kobject *, struct attribute *, const char *, size_t);
};

#ifdef CONFIG_SYSFS

/*
 * Directory and mount-point interfaces.  Only declared here; the
 * definitions live outside this header when CONFIG_SYSFS is enabled.
 * Functions marked __must_check return an int that callers may not ignore.
 */
int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns);
void sysfs_remove_dir(struct kobject *kobj);
int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
                                     const void *new_ns);
int __must_check sysfs_move_dir_ns(struct kobject *kobj,
                                   struct kobject *new_parent_kobj,
                                   const void *new_ns);
int __must_check sysfs_create_mount_point(struct kobject *parent_kobj,
                                          const char *name);
void sysfs_remove_mount_point(struct kobject *parent_kobj,
                              const char *name);

/*
 * Attribute-file and binary-file interfaces.  The *_files() variants take a
 * pointer to an array of attribute pointers (const struct attribute * const *).
 */
int __must_check sysfs_create_file_ns(struct kobject *kobj,
                                      const struct attribute *attr,
                                      const void *ns);
int __must_check sysfs_create_files(struct kobject *kobj,
                                   const struct attribute * const *attr);
int __must_check sysfs_chmod_file(struct kobject *kobj,
                                  const struct attribute *attr, umode_t mode);
struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj,
                                                  const struct attribute *attr);
void sysfs_unbreak_active_protection(struct kernfs_node *kn);
void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
                          const void *ns);
bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr);
void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr);

int __must_check sysfs_create_bin_file(struct kobject *kobj,
                                       const struct bin_attribute *attr);
void sysfs_remove_bin_file(struct kobject *kobj,
                           const struct bin_attribute *attr);

/*
 * Symbolic-link interfaces: create, remove, rename and delete a link named
 * @name under a kobject, pointing at a target kobject.
 */
int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target,
                                   const char *name);
int __must_check sysfs_create_link_nowarn(struct kobject *kobj,
                                          struct kobject *target,
                                          const char *name);
void sysfs_remove_link(struct kobject *kobj, const char *name);

int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target,
                         const char *old_name, const char *new_name,
                         const void *new_ns);

void sysfs_delete_link(struct kobject *dir, struct kobject *targ,
                        const char *name);

/*
 * Attribute-group interfaces.  The *_groups() variants take an array of
 * group pointers (const struct attribute_group **); the others take a
 * single group or identify a group by name.
 */
int __must_check sysfs_create_group(struct kobject *kobj,
                                    const struct attribute_group *grp);
int __must_check sysfs_create_groups(struct kobject *kobj,
                                     const struct attribute_group **groups);
int __must_check sysfs_update_groups(struct kobject *kobj,
                                     const struct attribute_group **groups);
int sysfs_update_group(struct kobject *kobj,
                       const struct attribute_group *grp);
void sysfs_remove_group(struct kobject *kobj,
                        const struct attribute_group *grp);
void sysfs_remove_groups(struct kobject *kobj,
                         const struct attribute_group **groups);
int sysfs_add_file_to_group(struct kobject *kobj,
                        const struct attribute *attr, const char *group);
void sysfs_remove_file_from_group(struct kobject *kobj,
                        const struct attribute *attr, const char *group);
int sysfs_merge_group(struct kobject *kobj,
                       const struct attribute_group *grp);
void sysfs_unmerge_group(struct kobject *kobj,
                       const struct attribute_group *grp);
int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
                            struct kobject *target, const char *link_name);
void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
                                  const char *link_name);
int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
                                         struct kobject *target_kobj,
                                         const char *target_name,
                                         const char *symlink_name);

/* Notification for the attribute @attr, optionally qualified by @dir, under @kobj. */
void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr);

/* Initialization entry point; the int result must be checked by the caller. */
int __must_check sysfs_init(void);

/**
 * sysfs_enable_ns - forward a namespace-enable request to kernfs
 * @kn: kernfs node handed unchanged to kernfs_enable_ns()
 *
 * Thin wrapper around kernfs_enable_ns().  The helper is invoked as a plain
 * statement: ISO C does not permit a return statement carrying an expression
 * in a function whose return type is void.
 */
static inline void sysfs_enable_ns(struct kernfs_node *kn)
{
        kernfs_enable_ns(kn);
}

/*
 * Ownership-change interfaces: each takes the new owner as a kuid_t/kgid_t
 * pair and returns an int status.
 */
int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid,
                            kgid_t kgid);
int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid);
int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ,
                            const char *name, kuid_t kuid, kgid_t kgid);
int sysfs_groups_change_owner(struct kobject *kobj,
                              const struct attribute_group **groups,
                              kuid_t kuid, kgid_t kgid);
int sysfs_group_change_owner(struct kobject *kobj,
                             const struct attribute_group *groups, kuid_t kuid,
                             kgid_t kgid);
/*
 * printf-style formatters writing into @buf; the __printf() annotations let
 * the compiler type-check the format string against the variadic arguments.
 * sysfs_emit_at() additionally takes the position @at within @buf.
 */
__printf(2, 3)
int sysfs_emit(char *buf, const char *fmt, ...);
__printf(3, 4)
int sysfs_emit_at(char *buf, int at, const char *fmt, ...);

/* Ready-made .read callback used by the __BIN_ATTR_SIMPLE_RO() initializer. */
ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
                                   const struct bin_attribute *attr, char *buf,
                                   loff_t off, size_t count);

#else /* CONFIG_SYSFS */

/*
 * CONFIG_SYSFS=n stubs for the directory and mount-point interfaces.
 * The int-returning stubs unconditionally report success (0); the void
 * stubs do nothing.  All arguments are ignored.
 */
static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
        return 0;
}

static inline void sysfs_remove_dir(struct kobject *kobj)
{
}

static inline int sysfs_rename_dir_ns(struct kobject *kobj,
                                      const char *new_name, const void *new_ns)
{
        return 0;
}

static inline int sysfs_move_dir_ns(struct kobject *kobj,
                                    struct kobject *new_parent_kobj,
                                    const void *new_ns)
{
        return 0;
}

static inline int sysfs_create_mount_point(struct kobject *parent_kobj,
                                           const char *name)
{
        return 0;
}

static inline void sysfs_remove_mount_point(struct kobject *parent_kobj,
                                            const char *name)
{
}

/*
 * CONFIG_SYSFS=n stubs for the attribute-file and binary-file interfaces.
 * Creation/chmod stubs return 0, removal stubs are empty, and all
 * arguments are ignored.
 */
static inline int sysfs_create_file_ns(struct kobject *kobj,
                                       const struct attribute *attr,
                                       const void *ns)
{
        return 0;
}

static inline int sysfs_create_files(struct kobject *kobj,
                                    const struct attribute * const *attr)
{
        return 0;
}

static inline int sysfs_chmod_file(struct kobject *kobj,
                                   const struct attribute *attr, umode_t mode)
{
        return 0;
}

/* Stub: there is no kernfs node to hand back, so this returns NULL. */
static inline struct kernfs_node *
sysfs_break_active_protection(struct kobject *kobj,
                              const struct attribute *attr)
{
        return NULL;
}

static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn)
{
}

static inline void sysfs_remove_file_ns(struct kobject *kobj,
                                        const struct attribute *attr,
                                        const void *ns)
{
}

/* Stub: always returns false. */
static inline bool sysfs_remove_file_self(struct kobject *kobj,
                                          const struct attribute *attr)
{
        return false;
}

static inline void sysfs_remove_files(struct kobject *kobj,
                                     const struct attribute * const *attr)
{
}

static inline int sysfs_create_bin_file(struct kobject *kobj,
                                        const struct bin_attribute *attr)
{
        return 0;
}

static inline void sysfs_remove_bin_file(struct kobject *kobj,
                                         const struct bin_attribute *attr)
{
}

/*
 * CONFIG_SYSFS=n stubs for the symbolic-link interfaces: the int variants
 * return 0 and the void variants do nothing.
 */
static inline int sysfs_create_link(struct kobject *kobj,
                                    struct kobject *target, const char *name)
{
        return 0;
}

static inline int sysfs_create_link_nowarn(struct kobject *kobj,
                                           struct kobject *target,
                                           const char *name)
{
        return 0;
}

static inline void sysfs_remove_link(struct kobject *kobj, const char *name)
{
}

static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t,
                                       const char *old_name,
                                       const char *new_name, const void *ns)
{
        return 0;
}

static inline void sysfs_delete_link(struct kobject *k, struct kobject *t,
                                     const char *name)
{
}

/*
 * CONFIG_SYSFS=n stubs for the attribute-group interfaces: every
 * int-returning stub yields 0 and every void stub is empty.
 */
static inline int sysfs_create_group(struct kobject *kobj,
                                     const struct attribute_group *grp)
{
        return 0;
}

static inline int sysfs_create_groups(struct kobject *kobj,
                                      const struct attribute_group **groups)
{
        return 0;
}

static inline int sysfs_update_groups(struct kobject *kobj,
                                      const struct attribute_group **groups)
{
        return 0;
}

static inline int sysfs_update_group(struct kobject *kobj,
                                const struct attribute_group *grp)
{
        return 0;
}

static inline void sysfs_remove_group(struct kobject *kobj,
                                      const struct attribute_group *grp)
{
}

static inline void sysfs_remove_groups(struct kobject *kobj,
                                       const struct attribute_group **groups)
{
}

static inline int sysfs_add_file_to_group(struct kobject *kobj,
                const struct attribute *attr, const char *group)
{
        return 0;
}

static inline void sysfs_remove_file_from_group(struct kobject *kobj,
                const struct attribute *attr, const char *group)
{
}

static inline int sysfs_merge_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
        return 0;
}

static inline void sysfs_unmerge_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
}

static inline int sysfs_add_link_to_group(struct kobject *kobj,
                const char *group_name, struct kobject *target,
                const char *link_name)
{
        return 0;
}

static inline void sysfs_remove_link_from_group(struct kobject *kobj,
                const char *group_name, const char *link_name)
{
}

static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
                                                       struct kobject *target_kobj,
                                                       const char *target_name,
                                                       const char *symlink_name)
{
        return 0;
}

/* CONFIG_SYSFS=n stub: notification is a no-op. */
static inline void sysfs_notify(struct kobject *kobj, const char *dir,
                                const char *attr)
{
}

/* CONFIG_SYSFS=n stub: nothing to initialize, reports success (0). */
static inline int __must_check sysfs_init(void)
{
        return 0;
}

/* CONFIG_SYSFS=n stub: does nothing and does not call into kernfs. */
static inline void sysfs_enable_ns(struct kernfs_node *kn)
{
}

/*
 * CONFIG_SYSFS=n stubs for the ownership-change interfaces: all of them
 * ignore their arguments and return 0.
 */
static inline int sysfs_file_change_owner(struct kobject *kobj,
                                          const char *name, kuid_t kuid,
                                          kgid_t kgid)
{
        return 0;
}

static inline int sysfs_link_change_owner(struct kobject *kobj,
                                          struct kobject *targ,
                                          const char *name, kuid_t kuid,
                                          kgid_t kgid)
{
        return 0;
}

static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
{
        return 0;
}

static inline int sysfs_groups_change_owner(struct kobject *kobj,
                          const struct attribute_group **groups,
                          kuid_t kuid, kgid_t kgid)
{
        return 0;
}

static inline int sysfs_group_change_owner(struct kobject *kobj,
                                           const struct attribute_group *groups,
                                           kuid_t kuid, kgid_t kgid)
{
        return 0;
}

/*
 * CONFIG_SYSFS=n stubs for the formatting and simple-read helpers.  They
 * return 0 without touching @buf; the __printf() annotations are kept so
 * format strings are still checked at compile time.
 */
__printf(2, 3)
static inline int sysfs_emit(char *buf, const char *fmt, ...)
{
        return 0;
}

__printf(3, 4)
static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
{
        return 0;
}

static inline ssize_t sysfs_bin_attr_simple_read(struct file *file,
                                                 struct kobject *kobj,
                                                 const struct bin_attribute *attr,
                                                 char *buf, loff_t off,
                                                 size_t count)
{
        return 0;
}
#endif /* CONFIG_SYSFS */

/*
 * sysfs_create_file - sysfs_create_file_ns() with NULL as the ns argument.
 * Returns whatever sysfs_create_file_ns() returns; the result must be checked.
 */
static inline int __must_check sysfs_create_file(struct kobject *kobj,
                                                 const struct attribute *attr)
{
        return sysfs_create_file_ns(kobj, attr, NULL);
}

/* sysfs_remove_file - sysfs_remove_file_ns() with NULL as the ns argument. */
static inline void sysfs_remove_file(struct kobject *kobj,
                                     const struct attribute *attr)
{
        sysfs_remove_file_ns(kobj, attr, NULL);
}

/*
 * sysfs_rename_link - sysfs_rename_link_ns() with NULL as the new_ns
 * argument.  Returns the result of sysfs_rename_link_ns() unchanged.
 */
static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target,
                                    const char *old_name, const char *new_name)
{
        return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL);
}

/* sysfs_notify_dirent - forwards @kn to kernfs_notify(). */
static inline void sysfs_notify_dirent(struct kernfs_node *kn)
{
        kernfs_notify(kn);
}

/*
 * sysfs_get_dirent - returns the result of kernfs_find_and_get(@parent, @name).
 * NOTE(review): the helper's name suggests the returned node carries a
 * reference that must later be dropped (e.g. with sysfs_put()) - confirm.
 */
static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent,
                                                   const char *name)
{
        return kernfs_find_and_get(parent, name);
}

/* sysfs_get - calls kernfs_get() on @kn and returns @kn so calls can be chained. */
static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn)
{
        kernfs_get(kn);
        return kn;
}

/* sysfs_put - forwards @kn to kernfs_put(). */
static inline void sysfs_put(struct kernfs_node *kn)
{
        kernfs_put(kn);
}

#endif /* _SYSFS_H_ */










































    7 
   57 











   55 
   57 















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LOCAL_LOCK_H
# error "Do not include directly, include linux/local_lock.h"
#endif

#include <linux/percpu-defs.h>
#include <linux/lockdep.h>

#ifndef CONFIG_PREEMPT_RT

/*
 * local_lock_t (!PREEMPT_RT): with CONFIG_DEBUG_LOCK_ALLOC it carries a
 * lockdep map and a pointer to the owning task; without it the structure
 * has no members at all.
 */
typedef struct {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
        struct task_struct        *owner;
#endif
} local_lock_t;

/*
 * local_trylock() and local_trylock_irqsave() only work with local_trylock_t.
 *
 * The debug members come first and mirror local_lock_t, which is what lets
 * the macros below cast between the two pointer types.  @acquired is written
 * as 1 by the acquire paths and as 0 by the release path.
 */
typedef struct {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
        struct task_struct        *owner;
#endif
        u8                acquired;
} local_trylock_t;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Designated-initializer fragment for the debug members: names the lockdep
 * map after @lockname, sets its wait and lock types, and clears the owner.
 * The expansion ends with a trailing comma so it can sit inside a brace
 * initializer.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)                \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_CONFIG,        \
                .lock_type = LD_LOCK_PERCPU,                \
        },                                                \
        .owner = NULL,

/* The trylock variant uses the same debug initializer. */
# define LOCAL_TRYLOCK_DEBUG_INIT(lockname)                \
        LOCAL_LOCK_DEBUG_INIT(lockname)

/*
 * Debug bookkeeping for taking a local lock: report the acquisition to
 * lockdep, warn if an owner is already recorded, then record the current
 * task as owner.
 */
static inline void local_lock_acquire(local_lock_t *l)
{
        lock_map_acquire(&l->dep_map);
        DEBUG_LOCKS_WARN_ON(l->owner);
        l->owner = current;
}

/*
 * Same bookkeeping as local_lock_acquire(), except that lockdep is told
 * about the acquisition through lock_map_acquire_try().
 */
static inline void local_trylock_acquire(local_lock_t *l)
{
        lock_map_acquire_try(&l->dep_map);
        DEBUG_LOCKS_WARN_ON(l->owner);
        l->owner = current;
}

/*
 * Debug bookkeeping for dropping a local lock: warn if the recorded owner
 * is not the current task, clear the owner, then report the release to
 * lockdep.
 */
static inline void local_lock_release(local_lock_t *l)
{
        DEBUG_LOCKS_WARN_ON(l->owner != current);
        l->owner = NULL;
        lock_map_release(&l->dep_map);
}

/* Reset the debug owner field; used by the runtime init macros. */
static inline void local_lock_debug_init(local_lock_t *l)
{
        l->owner = NULL;
}
#else /* CONFIG_DEBUG_LOCK_ALLOC */
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC there are no debug members: the
 * initializer fragments expand to nothing and the bookkeeping helpers are
 * empty functions.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
# define LOCAL_TRYLOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_trylock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Static initializers: a brace initializer containing only the debug
 * fragment (empty braces when lock debugging is off).  For the trylock
 * type the @acquired member is not named and is therefore zero-initialized.
 */
#define INIT_LOCAL_LOCK(lockname)        { LOCAL_LOCK_DEBUG_INIT(lockname) }
#define INIT_LOCAL_TRYLOCK(lockname)        { LOCAL_TRYLOCK_DEBUG_INIT(lockname) }

/*
 * __local_lock_init - runtime initialization of a local lock
 * @lock: pointer expression designating the lock
 *
 * Each expansion gets its own static lock_class_key, which is registered
 * with lockdep together with the stringified @lock expression as the name
 * and LD_LOCK_PERCPU as the lock type.  The debug owner is then reset via
 * local_lock_debug_init().
 *
 * @lock is parenthesized wherever it is used as an operand so that compound
 * pointer expressions expand as intended.  The #lock stringification is
 * deliberately left untouched so the lockdep name stays the same.
 */
#define __local_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)(lock), sizeof(*(lock)));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_PERCPU);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * Initialize a local_trylock_t by casting it to local_lock_t * and reusing
 * __local_lock_init().  The cast expression as written here is what
 * __local_lock_init() stringifies for the lockdep name.  @acquired is not
 * touched by this macro.
 */
#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock)

/*
 * __spinlock_nested_bh_init - runtime initialization for the nested-BH case
 * @lock: pointer expression designating the lock
 *
 * Identical to __local_lock_init() except that the lock is registered with
 * lockdep as LD_LOCK_NORMAL instead of LD_LOCK_PERCPU.
 *
 * @lock is parenthesized wherever it is used as an operand so that compound
 * pointer expressions expand as intended.  The #lock stringification is
 * deliberately left untouched so the lockdep name stays the same.
 */
#define __spinlock_nested_bh_init(lock)                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)(lock), sizeof(*(lock)));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_NORMAL);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * Common acquire step, used after the caller has disabled preemption or
 * interrupts.  _Generic dispatches on the static type of @lock:
 *  - local_trylock_t *: assert (under lockdep) that @acquired is 0, then
 *    set it to 1;
 *  - local_lock_t *: nothing extra.
 * Afterwards the debug bookkeeping in local_lock_acquire() runs for both
 * types.  Any other pointer type fails to compile because _Generic has no
 * default association.
 */
#define __local_lock_acquire(lock)                                        \
        do {                                                                \
                local_trylock_t *tl;                                        \
                local_lock_t *l;                                        \
                                                                        \
                l = (local_lock_t *)(lock);                                \
                tl = (local_trylock_t *)l;                                \
                _Generic((lock),                                        \
                        local_trylock_t *: ({                                \
                                lockdep_assert(tl->acquired == 0);        \
                                WRITE_ONCE(tl->acquired, 1);                \
                        }),                                                \
                        local_lock_t *: (void)0);                        \
                local_lock_acquire(l);                                        \
        } while (0)

/* Disable preemption, then run the common acquire step. */
#define __local_lock(lock)                                        \
        do {                                                        \
                preempt_disable();                                \
                __local_lock_acquire(lock);                        \
        } while (0)

/* Disable interrupts, then run the common acquire step. */
#define __local_lock_irq(lock)                                        \
        do {                                                        \
                local_irq_disable();                                \
                __local_lock_acquire(lock);                        \
        } while (0)

/* Save the interrupt state into @flags and disable interrupts, then acquire. */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                local_irq_save(flags);                                \
                __local_lock_acquire(lock);                        \
        } while (0)

/*
 * Try to take a local_trylock_t without blocking.  Preemption is disabled
 * first; if @acquired is already set, preemption is re-enabled and the
 * statement expression evaluates to 0.  Otherwise @acquired is set to 1,
 * the trylock debug bookkeeping runs, and the expression evaluates to 1
 * with preemption still disabled.  The local pointer doubles as the result:
 * it is set to NULL on failure and converted with !! at the end.
 */
#define __local_trylock(lock)                                        \
        ({                                                        \
                local_trylock_t *tl;                                \
                                                                \
                preempt_disable();                                \
                tl = (lock);                                        \
                if (READ_ONCE(tl->acquired)) {                        \
                        preempt_enable();                        \
                        tl = NULL;                                \
                } else {                                        \
                        WRITE_ONCE(tl->acquired, 1);                \
                        local_trylock_acquire(                        \
                                (local_lock_t *)tl);                \
                }                                                \
                !!tl;                                                \
        })

/*
 * Interrupt-saving variant of __local_trylock(): the interrupt state is
 * saved into @flags and interrupts are disabled before @acquired is tested.
 * On failure the saved state is restored and the expression evaluates to 0;
 * on success @acquired is set, the debug bookkeeping runs and the expression
 * evaluates to 1 with interrupts still disabled.
 */
#define __local_trylock_irqsave(lock, flags)                        \
        ({                                                        \
                local_trylock_t *tl;                                \
                                                                \
                local_irq_save(flags);                                \
                tl = (lock);                                        \
                if (READ_ONCE(tl->acquired)) {                        \
                        local_irq_restore(flags);                \
                        tl = NULL;                                \
                } else {                                        \
                        WRITE_ONCE(tl->acquired, 1);                \
                        local_trylock_acquire(                        \
                                (local_lock_t *)tl);                \
                }                                                \
                !!tl;                                                \
        })

/*
 * Evaluates to the @acquired flag of this CPU's instance of @lock (looked
 * up with this_cpu_ptr()), so @lock must be a per-CPU local_trylock_t.
 * Preemption or migration must be disabled before calling
 * __local_lock_is_locked().
 */
#define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired)

/*
 * Common release step, the mirror image of __local_lock_acquire(): the
 * debug bookkeeping in local_lock_release() runs first, then _Generic
 * dispatches on the static type of @lock:
 *  - local_trylock_t *: assert (under lockdep) that @acquired is 1, then
 *    clear it;
 *  - local_lock_t *: nothing extra.
 * The caller re-enables preemption or interrupts afterwards.
 */
#define __local_lock_release(lock)                                        \
        do {                                                                \
                local_trylock_t *tl;                                        \
                local_lock_t *l;                                        \
                                                                        \
                l = (local_lock_t *)(lock);                                \
                tl = (local_trylock_t *)l;                                \
                local_lock_release(l);                                        \
                _Generic((lock),                                        \
                        local_trylock_t *: ({                                \
                                lockdep_assert(tl->acquired == 1);        \
                                WRITE_ONCE(tl->acquired, 0);                \
                        }),                                                \
                        local_lock_t *: (void)0);                        \
        } while (0)

/* Run the common release step, then re-enable preemption. */
#define __local_unlock(lock)                                        \
        do {                                                        \
                __local_lock_release(lock);                        \
                preempt_enable();                                \
        } while (0)

/* Run the common release step, then re-enable interrupts. */
#define __local_unlock_irq(lock)                                \
        do {                                                        \
                __local_lock_release(lock);                        \
                local_irq_enable();                                \
        } while (0)

/* Run the common release step, then restore the interrupt state from @flags. */
#define __local_unlock_irqrestore(lock, flags)                        \
        do {                                                        \
                __local_lock_release(lock);                        \
                local_irq_restore(flags);                        \
        } while (0)

#define __local_lock_nested_bh(lock)                                \
        do {                                                        \
                lockdep_assert_in_softirq();                        \
                local_lock_acquire((lock));                        \
        } while (0)

/*
 * Non-PREEMPT_RT nested-BH unlock: only undoes the bookkeeping done at lock
 * time by calling local_lock_release().
 */
#define __local_unlock_nested_bh(lock)                                \
        local_lock_release((lock))

#else /* !CONFIG_PREEMPT_RT */

/*
 * On PREEMPT_RT local_lock maps to a per CPU spinlock, which protects the
 * critical section while staying preemptible.
 *
 * Both the plain and the trylock flavour are the very same spinlock_t type
 * in this configuration.
 */
typedef spinlock_t local_lock_t;
typedef spinlock_t local_trylock_t;

/*
 * Static initializers. Both lock flavours expand to the same
 * __LOCAL_SPIN_LOCK_UNLOCKED() initializer here.
 */
#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
#define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))

/* Runtime initialization: hands @l to local_spin_lock_init(). */
#define __local_lock_init(l)                                        \
        do {                                                        \
                local_spin_lock_init((l));                        \
        } while (0)

/* Trylock flavour is initialized exactly like the plain lock. */
#define __local_trylock_init(l)                        __local_lock_init(l)

/*
 * PREEMPT_RT lock: disable migration first, then take the spinlock.
 * @__lock is handed to spin_lock() as is.
 * NOTE(review): presumably the caller passes the already resolved per-CPU
 * lock pointer - confirm against the public wrappers.
 */
#define __local_lock(__lock)                                        \
        do {                                                        \
                migrate_disable();                                \
                spin_lock((__lock));                                \
        } while (0)

/* On PREEMPT_RT the _irq variant is a plain alias of __local_lock(). */
#define __local_lock_irq(lock)                        __local_lock(lock)

/*
 * PREEMPT_RT irqsave variant: @flags is type-checked to be unsigned long and
 * set to 0 (no interrupt state is saved here), then the lock is taken
 * through __local_lock().
 */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                flags = 0;                                        \
                __local_lock(lock);                                \
        } while (0)

/*
 * PREEMPT_RT unlock: mirror image of __local_lock() - drop the spinlock
 * first, then re-enable migration.
 */
#define __local_unlock(__lock)                                        \
        do {                                                        \
                spin_unlock((__lock));                                \
                migrate_enable();                                \
        } while (0)

/* On PREEMPT_RT the _irq unlock is a plain alias of __local_unlock(). */
#define __local_unlock_irq(lock)                __local_unlock(lock)

/* @flags is ignored: the irqsave lock side stored nothing meaningful in it. */
#define __local_unlock_irqrestore(lock, flags)        __local_unlock(lock)

/*
 * PREEMPT_RT nested-BH lock: assert the expected softirq context via
 * lockdep_assert_in_softirq_func(), then take the spinlock. Unlike
 * __local_lock() this does not call migrate_disable().
 */
#define __local_lock_nested_bh(lock)                                \
do {                                                                \
        lockdep_assert_in_softirq_func();                        \
        spin_lock((lock));                                        \
} while (0)

/* PREEMPT_RT nested-BH unlock: just drops the spinlock. */
#define __local_unlock_nested_bh(lock)                                \
do {                                                                \
        spin_unlock((lock));                                        \
} while (0)

/*
 * PREEMPT_RT trylock. Evaluates to non-zero when the lock was taken and to 0
 * otherwise.
 *
 * The attempt is refused outright in NMI and hard interrupt context. In any
 * other context migration is disabled before spin_trylock() is tried, and
 * re-enabled again if the attempt fails, so a failed trylock leaves the
 * migration state untouched.
 */
#define __local_trylock(lock)                                        \
        ({                                                        \
                int __locked = 0;                                \
                                                                \
                if (!(in_nmi() | in_hardirq())) {                \
                        migrate_disable();                        \
                        __locked = spin_trylock((lock));        \
                        if (!__locked)                                \
                                migrate_enable();                \
                }                                                \
                __locked;                                        \
        })

/*
 * PREEMPT_RT trylock, irqsave variant: @flags is type-checked to be
 * unsigned long and set to 0, then the result of __local_trylock() is the
 * value of the expression.
 */
#define __local_trylock_irqsave(lock, flags)                        \
        ({                                                        \
                typecheck(unsigned long, flags);                \
                flags = 0;                                        \
                __local_trylock(lock);                                \
        })

/*
 * Evaluates to true when the owner of the rt_mutex embedded in this CPU's
 * instance of @__lock is the current task.
 *
 * Migration must be disabled before calling __local_lock_is_locked.
 */
#define __local_lock_is_locked(__lock)                                        \
        (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current)

#endif /* CONFIG_PREEMPT_RT */

































  120 


















   40 





















































































   13 






















































   95 




























   42 






























































































    2 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timer

#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMER_H

#include <linux/tracepoint.h>
#include <linux/hrtimer.h>
#include <linux/timer.h>

/*
 * timer_class - event class for timer events which only track the timer
 * pointer.
 *
 * The pointer is stored as an opaque value and is never dereferenced by the
 * class itself. Output format: "timer=%p".
 */
DECLARE_EVENT_CLASS(timer_class,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer),

        TP_STRUCT__entry(
                __field( void *,        timer        )
        ),

        TP_fast_assign(
                __entry->timer        = timer;
        ),

        TP_printk("timer=%p", __entry->timer)
);

/**
 * timer_init - called when the timer is initialized
 * @timer:        pointer to struct timer_list
 *
 * Defined on timer_class, so only the pointer value is recorded.
 */
DEFINE_EVENT(timer_class, timer_init,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * Render timer flag bits as a "|" separated list of one letter codes:
 * M = TIMER_MIGRATING, D = TIMER_DEFERRABLE, P = TIMER_PINNED,
 * I = TIMER_IRQSAFE.
 */
#define decode_timer_flags(flags)                        \
        __print_flags(flags, "|",                        \
                {  TIMER_MIGRATING,        "M" },                \
                {  TIMER_DEFERRABLE,        "D" },                \
                {  TIMER_PINNED,        "P" },                \
                {  TIMER_IRQSAFE,        "I" })

/**
 * timer_start - called when the timer is started
 * @timer:                pointer to struct timer_list
 * @bucket_expiry:        the bucket expiry time
 *
 * Besides the timer pointer, the callback, the expiry time and the flags of
 * @timer are recorded together with the value of jiffies at trace time.
 * In the output, "timeout" is expires minus the recorded jiffies value,
 * "cpu" is flags masked with TIMER_CPUMASK and "idx" is flags shifted right
 * by TIMER_ARRAYSHIFT.
 */
TRACE_EVENT(timer_start,

        TP_PROTO(struct timer_list *timer,
                unsigned long bucket_expiry),

        TP_ARGS(timer, bucket_expiry),

        TP_STRUCT__entry(
                __field( void *,        timer                )
                __field( void *,        function        )
                __field( unsigned long,        expires                )
                __field( unsigned long,        bucket_expiry        )
                __field( unsigned long,        now                )
                __field( unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->function        = timer->function;
                __entry->expires        = timer->expires;
                __entry->bucket_expiry        = bucket_expiry;
                __entry->now                = jiffies;
                __entry->flags                = timer->flags;
        ),

        TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
                  __entry->timer, __entry->function, __entry->expires,
                  (long)__entry->expires - __entry->now,
                  __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK,
                  __entry->flags >> TIMER_ARRAYSHIFT,
                  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
);

/**
 * timer_expire_entry - called immediately before the timer callback
 * @timer:        pointer to struct timer_list
 * @baseclk:        value of timer_base::clk when timer expires
 *
 * Allows to determine the timer latency.
 *
 * The value of jiffies at trace time is recorded as "now" next to @baseclk
 * and the callback of @timer.
 */
TRACE_EVENT(timer_expire_entry,

        TP_PROTO(struct timer_list *timer, unsigned long baseclk),

        TP_ARGS(timer, baseclk),

        TP_STRUCT__entry(
                __field( void *,        timer        )
                __field( unsigned long,        now        )
                __field( void *,        function)
                __field( unsigned long,        baseclk        )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->now                = jiffies;
                __entry->function        = timer->function;
                __entry->baseclk        = baseclk;
        ),

        TP_printk("timer=%p function=%ps now=%lu baseclk=%lu",
                  __entry->timer, __entry->function, __entry->now,
                  __entry->baseclk)
);

/**
 * timer_expire_exit - called immediately after the timer callback returns
 * @timer:        pointer to struct timer_list
 *
 * When used in combination with the timer_expire_entry tracepoint we can
 * determine the runtime of the timer callback function.
 *
 * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might
 * be invalid. We solely track the pointer.
 *
 * This is why the event is defined on timer_class, which stores the pointer
 * value only.
 */
DEFINE_EVENT(timer_class, timer_expire_exit,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_cancel - called when the timer is canceled
 * @timer:        pointer to struct timer_list
 *
 * Defined on timer_class, so only the pointer value is recorded.
 */
DEFINE_EVENT(timer_class, timer_cancel,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_base_idle - reports the idle state of a timer base
 * @is_idle:        the idle state being reported
 * @cpu:        number of the CPU the state belongs to
 *
 * NOTE(review): presumably emitted whenever a CPU's timer base enters or
 * leaves the idle state - confirm against the call sites.
 *
 * @cpu is an unsigned int and is therefore printed with %u, matching how
 * timer_start prints its CPU number.
 */
TRACE_EVENT(timer_base_idle,

        TP_PROTO(bool is_idle, unsigned int cpu),

        TP_ARGS(is_idle, cpu),

        TP_STRUCT__entry(
                __field( bool,                is_idle        )
                __field( unsigned int,        cpu        )
        ),

        TP_fast_assign(
                __entry->is_idle        = is_idle;
                __entry->cpu                = cpu;
        ),

        TP_printk("is_idle=%d cpu=%u",
                  __entry->is_idle, __entry->cpu)
);

/*
 * Map a clock id to its name for trace output. Only CLOCK_REALTIME,
 * CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI have an entry.
 */
#define decode_clockid(type)                                                \
        __print_symbolic(type,                                                \
                { CLOCK_REALTIME,        "CLOCK_REALTIME"        },        \
                { CLOCK_MONOTONIC,        "CLOCK_MONOTONIC"        },        \
                { CLOCK_BOOTTIME,        "CLOCK_BOOTTIME"        },        \
                { CLOCK_TAI,                "CLOCK_TAI"                })

/*
 * Map an hrtimer mode to a readable string for trace output. Every supported
 * combination of ABS/REL with PINNED, SOFT and HARD is listed as an entry of
 * its own.
 */
#define decode_hrtimer_mode(mode)                                        \
        __print_symbolic(mode,                                                \
                { HRTIMER_MODE_ABS,                "ABS"                },        \
                { HRTIMER_MODE_REL,                "REL"                },        \
                { HRTIMER_MODE_ABS_PINNED,        "ABS|PINNED"        },        \
                { HRTIMER_MODE_REL_PINNED,        "REL|PINNED"        },        \
                { HRTIMER_MODE_ABS_SOFT,        "ABS|SOFT"        },        \
                { HRTIMER_MODE_REL_SOFT,        "REL|SOFT"        },        \
                { HRTIMER_MODE_ABS_PINNED_SOFT,        "ABS|PINNED|SOFT" },        \
                { HRTIMER_MODE_REL_PINNED_SOFT,        "REL|PINNED|SOFT" },        \
                { HRTIMER_MODE_ABS_HARD,        "ABS|HARD" },                \
                { HRTIMER_MODE_REL_HARD,        "REL|HARD" },                \
                { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" },        \
                { HRTIMER_MODE_REL_PINNED_HARD,        "REL|PINNED|HARD" })

/**
 * hrtimer_setup - called when the hrtimer is initialized
 * @hrtimer:        pointer to struct hrtimer
 * @clockid:        the hrtimers clock
 * @mode:        the hrtimers mode
 *
 * @hrtimer is recorded as a pointer value only. @clockid and @mode are
 * printed by name through decode_clockid() and decode_hrtimer_mode().
 */
TRACE_EVENT(hrtimer_setup,

        TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid,
                 enum hrtimer_mode mode),

        TP_ARGS(hrtimer, clockid, mode),

        TP_STRUCT__entry(
                __field( void *,                hrtimer                )
                __field( clockid_t,                clockid                )
                __field( enum hrtimer_mode,        mode                )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->clockid        = clockid;
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer,
                  decode_clockid(__entry->clockid),
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_start - called when the hrtimer is started
 * @hrtimer:        pointer to struct hrtimer
 * @mode:        the hrtimers mode
 *
 * Records the callback of @hrtimer together with the values returned by
 * hrtimer_get_expires() and hrtimer_get_softexpires(). Both expiry values
 * are stored as s64 and printed as unsigned long long.
 */
TRACE_EVENT(hrtimer_start,

        TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),

        TP_ARGS(hrtimer, mode),

        TP_STRUCT__entry(
                __field( void *,        hrtimer                )
                __field( void *,        function        )
                __field( s64,                expires                )
                __field( s64,                softexpires        )
                __field( enum hrtimer_mode,        mode        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->function        = ACCESS_PRIVATE(hrtimer, function);
                __entry->expires        = hrtimer_get_expires(hrtimer);
                __entry->softexpires        = hrtimer_get_softexpires(hrtimer);
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu "
                  "mode=%s", __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->expires,
                  (unsigned long long) __entry->softexpires,
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_expire_entry - called immediately before the hrtimer callback
 * @hrtimer:        pointer to struct hrtimer
 * @now:        pointer to variable which contains current time of the
 *                timers base.
 *
 * Allows to determine the timer latency.
 *
 * @now is dereferenced at trace time and the time is stored by value, so
 * the pointer only has to be valid while the tracepoint runs.
 */
TRACE_EVENT(hrtimer_expire_entry,

        TP_PROTO(struct hrtimer *hrtimer, ktime_t *now),

        TP_ARGS(hrtimer, now),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
                __field( s64,                now        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->now                = *now;
                __entry->function        = ACCESS_PRIVATE(hrtimer, function);
        ),

        TP_printk("hrtimer=%p function=%ps now=%llu",
                  __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->now)
);

/*
 * hrtimer_class - event class for hrtimer events which only track the
 * hrtimer pointer.
 *
 * The pointer is stored as an opaque value and is never dereferenced by the
 * class itself. Output format: "hrtimer=%p".
 */
DECLARE_EVENT_CLASS(hrtimer_class,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
        ),

        TP_printk("hrtimer=%p", __entry->hrtimer)
);

/**
 * hrtimer_expire_exit - called immediately after the hrtimer callback returns
 * @hrtimer:        pointer to struct hrtimer
 *
 * When used in combination with the hrtimer_expire_entry tracepoint we can
 * determine the runtime of the callback function.
 *
 * Defined on hrtimer_class, so only the pointer value is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * hrtimer_cancel - called when the hrtimer is canceled
 * @hrtimer:        pointer to struct hrtimer
 *
 * Defined on hrtimer_class, so only the pointer value is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_cancel,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * itimer_state - called when itimer is started or canceled
 * @which:        name of the interval timer
 * @value:        the itimers value, itimer is canceled if value->it_value is
 *                zero, otherwise it is started
 * @expires:        the itimers expiry time
 *
 * The seconds and nanoseconds of value->it_value and value->it_interval are
 * copied into fields of type long. For output the nanosecond parts are
 * divided by NSEC_PER_USEC, i.e. they are printed as six digit microseconds.
 *
 * NOTE(review): tv_sec of struct itimerspec64 is presumably a 64 bit type,
 * in which case storing it in a long truncates on 32 bit builds - confirm.
 */
TRACE_EVENT(itimer_state,

        TP_PROTO(int which, const struct itimerspec64 *const value,
                 unsigned long long expires),

        TP_ARGS(which, value, expires),

        TP_STRUCT__entry(
                __field(        int,                        which                )
                __field(        unsigned long long,        expires                )
                __field(        long,                        value_sec        )
                __field(        long,                        value_nsec        )
                __field(        long,                        interval_sec        )
                __field(        long,                        interval_nsec        )
        ),

        TP_fast_assign(
                __entry->which                = which;
                __entry->expires        = expires;
                __entry->value_sec        = value->it_value.tv_sec;
                __entry->value_nsec        = value->it_value.tv_nsec;
                __entry->interval_sec        = value->it_interval.tv_sec;
                __entry->interval_nsec        = value->it_interval.tv_nsec;
        ),

        TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld",
                  __entry->which, __entry->expires,
                  __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC,
                  __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC)
);

/**
 * itimer_expire - called when itimer expires
 * @which:        type of the interval timer
 * @pid:        pid of the process which owns the timer
 * @now:        current time, used to calculate the latency of itimer
 *
 * @pid is converted to a number with pid_nr() at trace time; the struct pid
 * pointer itself is not stored.
 */
TRACE_EVENT(itimer_expire,

        TP_PROTO(int which, struct pid *pid, unsigned long long now),

        TP_ARGS(which, pid, now),

        TP_STRUCT__entry(
                __field( int ,                        which        )
                __field( pid_t,                        pid        )
                __field( unsigned long long,        now        )
        ),

        TP_fast_assign(
                __entry->which        = which;
                __entry->now        = now;
                __entry->pid        = pid_nr(pid);
        ),

        TP_printk("which=%d pid=%d now=%llu", __entry->which,
                  (int) __entry->pid, __entry->now)
);

#ifdef CONFIG_NO_HZ_COMMON

/*
 * List of all tick dependencies. The list is expanded twice below, each
 * time with a different definition of the tick_dep_*name() helper macros:
 * first to pass the TICK_DEP_BIT_ and TICK_DEP_MASK_ constants to
 * TRACE_DEFINE_ENUM(), then to build the { mask, "name" } table used by
 * show_tick_dep_name().
 */
#define TICK_DEP_NAMES                                        \
                tick_dep_mask_name(NONE)                \
                tick_dep_name(POSIX_TIMER)                \
                tick_dep_name(PERF_EVENTS)                \
                tick_dep_name(SCHED)                        \
                tick_dep_name(CLOCK_UNSTABLE)                \
                tick_dep_name(RCU)                        \
                tick_dep_name_end(RCU_EXP)

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/* The MASK will convert to their bits and they need to be processed too */
#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
/* NONE only has a mask defined for it */
#define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);

/* First expansion: emit the TRACE_DEFINE_ENUM() statements. */
TICK_DEP_NAMES

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/*
 * Second set of definitions: each entry becomes a { mask, "name" } pair.
 * The _end variant omits the trailing comma so that the list can be used
 * as the final arguments of __print_symbolic().
 */
#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }

/* Print a TICK_DEP_MASK_ value by the name of its dependency. */
#define show_tick_dep_name(val)                                \
        __print_symbolic(val, TICK_DEP_NAMES)

/**
 * tick_stop - reports the outcome of an attempt to stop the tick
 * @success:        outcome flag, recorded and printed as an integer
 * @dependency:        TICK_DEP_MASK_ value, printed by name through
 *                show_tick_dep_name()
 *
 * NOTE(review): presumably @dependency names what prevented the tick from
 * being stopped when @success is zero - confirm against the call sites.
 */
TRACE_EVENT(tick_stop,

        TP_PROTO(int success, int dependency),

        TP_ARGS(success, dependency),

        TP_STRUCT__entry(
                __field( int ,                success        )
                __field( int ,                dependency )
        ),

        TP_fast_assign(
                __entry->success        = success;
                __entry->dependency        = dependency;
        ),

        /* No line continuation needed: this is a macro argument, not a #define. */
        TP_printk("success=%d dependency=%s", __entry->success,
                  show_tick_dep_name(__entry->dependency))
);
#endif

#endif /*  _TRACE_TIMER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>









































































































































































































    1 


























































































































































































































































































































































































































































    9 












    9 








    9 





    9 

    9 
    9 
    9 
    9 





    9 


    9 















    9 


    9 





    9 




    9 

    9 
    9 



    9 




    9 


    9 
    9 















































































































































































































































































































































    1 






































    9 






























    1 

    1 









    1 













    1 

    1 





    1 



    1 




























































    1 


























































    1 




    1 
    1 
    1 






















    1 















    1 




































    9 



    9 


























    9 


    9 








    9 



    9 
    9 


    9 


    9 


    9 



    9 





























































    9 








    9 
    9 

    9 



    9 

    9 





    1 



    1 




























    1 


























    1 



    1 














































































    1 





    1 
    1 






    1 


    1 
















    9 

    9 


    9 






    9 
    9 























































































































































































    1 
    1 
    1 


    1 





    1 
























    9 










    9 







    1 





    1 

    1 






    1 
























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
/*
 * DRBG: Deterministic Random Bits Generator
 *       Based on NIST Recommended DRBG from NIST SP800-90A with the following
 *       properties:
 *                * CTR DRBG with DF with AES-128, AES-192, AES-256 cores
 *                * Hash DRBG with DF with SHA-256, SHA-384, SHA-512 cores
 *                * HMAC DRBG with DF with SHA-256, SHA-384, SHA-512 cores
 *                * with and without prediction resistance
 *
 * Copyright Stephan Mueller <smueller@chronox.de>, 2014
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, and the entire permission notice in its entirety,
 *    including the disclaimer of warranties.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * ALTERNATIVELY, this product may be distributed under the terms of
 * the GNU General Public License, in which case the provisions of the GPL are
 * required INSTEAD OF the above restrictions.  (This clause is
 * necessary due to a potential bad interaction between the GPL and
 * the restrictions contained in a BSD-style copyright.)
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
 * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 * DRBG Usage
 * ==========
 * The SP 800-90A DRBG allows the user to specify a personalization string
 * for initialization as well as an additional information string for each
 * random number request. The following code fragments show how a caller
 * uses the kernel crypto API to use the full functionality of the DRBG.
 *
 * Usage without any additional data
 * ---------------------------------
 * struct crypto_rng *drng;
 * int err;
 * char data[DATALEN];
 *
 * drng = crypto_alloc_rng(drng_name, 0, 0);
 * err = crypto_rng_get_bytes(drng, &data, DATALEN);
 * crypto_free_rng(drng);
 *
 *
 * Usage with personalization string during initialization
 * -------------------------------------------------------
 * struct crypto_rng *drng;
 * int err;
 * char data[DATALEN];
 * struct drbg_string pers;
 * char personalization[] = "some-string";
 *
 * drbg_string_fill(&pers, personalization, strlen(personalization));
 * drng = crypto_alloc_rng(drng_name, 0, 0);
 * // The reset completely re-initializes the DRBG with the provided
 * // personalization string
 * err = crypto_rng_reset(drng, &personalization, strlen(personalization));
 * err = crypto_rng_get_bytes(drng, &data, DATALEN);
 * crypto_free_rng(drng);
 *
 *
 * Usage with additional information string during random number request
 * ---------------------------------------------------------------------
 * struct crypto_rng *drng;
 * int err;
 * char data[DATALEN];
 * char addtl_string[] = "some-string";
 * struct drbg_string addtl;
 *
 * drbg_string_fill(&addtl, addtl_string, strlen(addtl_string));
 * drng = crypto_alloc_rng(drng_name, 0, 0);
 * // The following call is a wrapper to crypto_rng_get_bytes() and returns
 * // the same error codes.
 * err = crypto_drbg_get_bytes_addtl(drng, &data, DATALEN, &addtl);
 * crypto_free_rng(drng);
 *
 *
 * Usage with personalization and additional information strings
 * -------------------------------------------------------------
 * Just mix both scenarios above.
 */

#include <crypto/drbg.h>
#include <crypto/internal/cipher.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string_choices.h>

/***************************************************************
 * Backend cipher definitions available to DRBG
 ***************************************************************/

/*
 * The order of the DRBG definitions here matters: every DRBG is registered
 * as stdrng. Each DRBG receives an increasing cra_priority value the later
 * it is defined in this array (see drbg_fill_array).
 *
 * HMAC DRBGs are favored over Hash DRBGs over CTR DRBGs, and the
 * HMAC-SHA512 / SHA256 / AES 256 over other ciphers. Thus, the
 * favored DRBGs are the latest entries in this array.
 */
static const struct drbg_core drbg_cores[] = {
#ifdef CONFIG_CRYPTO_DRBG_CTR
        /* CTR DRBG cores: one entry per AES key size, all backed by "aes" */
        {
                .flags = DRBG_CTR | DRBG_STRENGTH128,
                .statelen = 32, /* 256 bits as defined in 10.2.1 */
                .blocklen_bytes = 16,
                .cra_name = "ctr_aes128",
                .backend_cra_name = "aes",
        }, {
                .flags = DRBG_CTR | DRBG_STRENGTH192,
                .statelen = 40, /* 320 bits as defined in 10.2.1 */
                .blocklen_bytes = 16,
                .cra_name = "ctr_aes192",
                .backend_cra_name = "aes",
        }, {
                .flags = DRBG_CTR | DRBG_STRENGTH256,
                .statelen = 48, /* 384 bits as defined in 10.2.1 */
                .blocklen_bytes = 16,
                .cra_name = "ctr_aes256",
                .backend_cra_name = "aes",
        },
#endif /* CONFIG_CRYPTO_DRBG_CTR */
#ifdef CONFIG_CRYPTO_DRBG_HASH
        /*
         * Hash DRBG cores: all flagged DRBG_STRENGTH256; blocklen_bytes
         * equals the digest size of the named hash (48 / 64 / 32 bytes).
         */
        {
                .flags = DRBG_HASH | DRBG_STRENGTH256,
                .statelen = 111, /* 888 bits */
                .blocklen_bytes = 48,
                .cra_name = "sha384",
                .backend_cra_name = "sha384",
        }, {
                .flags = DRBG_HASH | DRBG_STRENGTH256,
                .statelen = 111, /* 888 bits */
                .blocklen_bytes = 64,
                .cra_name = "sha512",
                .backend_cra_name = "sha512",
        }, {
                .flags = DRBG_HASH | DRBG_STRENGTH256,
                .statelen = 55, /* 440 bits */
                .blocklen_bytes = 32,
                .cra_name = "sha256",
                .backend_cra_name = "sha256",
        },
#endif /* CONFIG_CRYPTO_DRBG_HASH */
#ifdef CONFIG_CRYPTO_DRBG_HMAC
        /*
         * HMAC DRBG cores: statelen and blocklen_bytes are identical for
         * each entry. Being the last group, these entries come last in
         * the array.
         */
        {
                .flags = DRBG_HMAC | DRBG_STRENGTH256,
                .statelen = 48, /* block length of cipher */
                .blocklen_bytes = 48,
                .cra_name = "hmac_sha384",
                .backend_cra_name = "hmac(sha384)",
        }, {
                .flags = DRBG_HMAC | DRBG_STRENGTH256,
                .statelen = 32, /* block length of cipher */
                .blocklen_bytes = 32,
                .cra_name = "hmac_sha256",
                .backend_cra_name = "hmac(sha256)",
        }, {
                .flags = DRBG_HMAC | DRBG_STRENGTH256,
                .statelen = 64, /* block length of cipher */
                .blocklen_bytes = 64,
                .cra_name = "hmac_sha512",
                .backend_cra_name = "hmac(sha512)",
        },
#endif /* CONFIG_CRYPTO_DRBG_HMAC */
};

/* Forward declaration; as a static function it is defined later in this file */
static int drbg_uninstantiate(struct drbg_state *drbg);

/******************************************************************
 * Generic helper functions
 ******************************************************************/

/*
 * Map the strength bits of the DRBG flags to the security strength
 * according to SP800-90A section 8.4
 *
 * @flags DRBG flags; only the bits selected by DRBG_STRENGTH_MASK are used
 *
 * Return: strength in *bytes*: 16 for DRBG_STRENGTH128, 24 for
 *           DRBG_STRENGTH192 and 32 for DRBG_STRENGTH256. Any other
 *           value also yields 32 to counter programming errors.
 */
static inline unsigned short drbg_sec_strength(drbg_flag_t flags)
{
        if ((flags & DRBG_STRENGTH_MASK) == DRBG_STRENGTH128)
                return 16;
        if ((flags & DRBG_STRENGTH_MASK) == DRBG_STRENGTH192)
                return 24;

        /* DRBG_STRENGTH256 as well as every unexpected encoding */
        return 32;
}

/*
 * Continuous self test of the noise source (FIPS 140-2)
 *
 * Compares the first drbg_sec_strength() bytes of the new seed data with
 * the copy kept in drbg->prev by the previous call. The length of the
 * checked buffer is therefore implied by the security strength of the DRBG;
 * a nonce trailing the entropy data during initial seeding is not looked at.
 *
 * drbg->drbg_mutex must have been taken.
 *
 * @drbg DRBG handle
 * @entropy buffer of seed data to be checked
 *
 * return:
 *        0 when the test passed or was skipped
 *        -EAGAIN when this call only primed the test and another round of
 *                seed data is needed
 *
 * Two identical consecutive seed buffers do not return: panic() is called.
 */
static int drbg_fips_continuous_test(struct drbg_state *drbg,
                                     const unsigned char *entropy)
{
        unsigned short cmplen = drbg_sec_strength(drbg->core->flags);

        if (!IS_ENABLED(CONFIG_CRYPTO_FIPS))
                return 0;

        /*
         * Skip when the test_data list is empty (original rationale:
         * "we test the overall system") and when FIPS mode is not enabled.
         */
        if (list_empty(&drbg->test_data.list) || !fips_enabled)
                return 0;

        if (drbg->fips_primed) {
                /* the test passes only when old and new data differ */
                if (memcmp(drbg->prev, entropy, cmplen) == 0)
                        panic("DRBG continuous self test failed\n");
                memcpy(drbg->prev, entropy, cmplen);
                return 0;
        }

        /* first invocation: remember the data and ask for another round */
        memcpy(drbg->prev, entropy, cmplen);
        drbg->fips_primed = true;
        return -EAGAIN;
}

/*
 * Store a 32 bit integer in big-endian byte order into a byte buffer.
 *
 * @val value to be converted
 * @buf destination of the converted integer -- the caller must provide
 *      at least 4 bytes
 *
 * The value is written through a pointer to a struct that wraps a single
 * __be32, i.e. the store is performed as one 32 bit assignment.
 */
#if (defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_CTR))
static inline void drbg_cpu_to_be32(__u32 val, unsigned char *buf)
{
        struct be32_slot {
                __be32 word;
        };

        ((struct be32_slot *)buf)->word = cpu_to_be32(val);
}
#endif /* defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_CTR) */

/******************************************************************
 * CTR DRBG callback functions
 ******************************************************************/

#ifdef CONFIG_CRYPTO_DRBG_CTR
/*
 * String tag for the CTR DRBG type.
 * NOTE(review): its users are not in this part of the file -- confirm usage.
 */
#define CRYPTO_DRBG_CTR_STRING "CTR "
/*
 * Alias names for the CTR cores listed in drbg_cores[], each in a "pr" and
 * a "nopr" flavour (presumably with / without prediction resistance).
 */
MODULE_ALIAS_CRYPTO("drbg_pr_ctr_aes256");
MODULE_ALIAS_CRYPTO("drbg_nopr_ctr_aes256");
MODULE_ALIAS_CRYPTO("drbg_pr_ctr_aes192");
MODULE_ALIAS_CRYPTO("drbg_nopr_ctr_aes192");
MODULE_ALIAS_CRYPTO("drbg_pr_ctr_aes128");
MODULE_ALIAS_CRYPTO("drbg_nopr_ctr_aes128");

/*
 * Forward declarations of the symmetric cipher helpers that the CTR DRBG
 * callbacks and drbg_ctr_ops below refer to.
 */
static void drbg_kcapi_symsetkey(struct drbg_state *drbg,
                                 const unsigned char *key);
static int drbg_kcapi_sym(struct drbg_state *drbg, unsigned char *outval,
                          const struct drbg_string *in);
static int drbg_init_sym_kernel(struct drbg_state *drbg);
static int drbg_fini_sym_kernel(struct drbg_state *drbg);
static int drbg_kcapi_sym_ctr(struct drbg_state *drbg,
                              u8 *inbuf, u32 inbuflen,
                              u8 *outbuf, u32 outlen);
/* NOTE(review): not referenced by the CTR callbacks below -- confirm user */
#define DRBG_OUTSCRATCHLEN 256

/* BCC function for CTR DRBG as defined in 10.4.3 */
static int drbg_ctr_bcc(struct drbg_state *drbg,
                        unsigned char *out, const unsigned char *key,
                        struct list_head *in)
{
        int ret = 0;
        struct drbg_string *curr = NULL;
        struct drbg_string data;
        short cnt = 0;

        drbg_string_fill(&data, out, drbg_blocklen(drbg));

        /* 10.4.3 step 2 / 4 */
        drbg_kcapi_symsetkey(drbg, key);
        list_for_each_entry(curr, in, list) {
                const unsigned char *pos = curr->buf;
                size_t len = curr->len;
                /* 10.4.3 step 4.1 */
                while (len) {
                        /* 10.4.3 step 4.2 */
                        if (drbg_blocklen(drbg) == cnt) {
                                cnt = 0;
                                ret = drbg_kcapi_sym(drbg, out, &data);
                                if (ret)
                                        return ret;
                        }
                        out[cnt] ^= *pos;
                        pos++;
                        cnt++;
                        len--;
                }
        }
        /* 10.4.3 step 4.2 for last block */
        if (cnt)
                ret = drbg_kcapi_sym(drbg, out, &data);

        return ret;
}

/*
 * scratchpad usage: drbg_ctr_update is interlinked with drbg_ctr_df
 * (and drbg_ctr_bcc, but this function does not need any temporary buffers),
 * the scratchpad is used as follows:
 * drbg_ctr_update:
 *        temp
 *                start: drbg->scratchpad
 *                length: drbg_statelen(drbg) + drbg_blocklen(drbg)
 *                        note: the cipher writing into this variable works
 *                        blocklen-wise. Now, when the statelen is not a multiple
 *                        of blocklen, the generation loop below "spills over"
 *                        by at most blocklen. Thus, we need to give sufficient
 *                        memory.
 *        df_data
 *                start: drbg->scratchpad +
 *                                drbg_statelen(drbg) + drbg_blocklen(drbg)
 *                length: drbg_statelen(drbg)
 *
 * drbg_ctr_df:
 *        pad
 *                start: df_data + drbg_statelen(drbg)
 *                length: drbg_blocklen(drbg)
 *        iv
 *                start: pad + drbg_blocklen(drbg)
 *                length: drbg_blocklen(drbg)
 *        temp
 *                start: iv + drbg_blocklen(drbg)
 *                length: drbg_statelen(drbg) + drbg_blocklen(drbg)
 *                        note: temp is the buffer that the BCC function operates
 *                        on. BCC operates blockwise. drbg_statelen(drbg)
 *                        is sufficient when the DRBG state length is a multiple
 *                        of the block size. For AES192 (and maybe other ciphers)
 *                        this is not correct and the length for temp is
 *                        insufficient (yes, that also means for such ciphers,
 *                        the final output of all BCC rounds are truncated).
 *                        Therefore, add drbg_blocklen(drbg) to cover all
 *                        possibilities.
 */

/*
 * Derivation Function for CTR DRBG as defined in 10.4.2
 *
 * @drbg DRBG handle
 * @df_data output buffer receiving @bytes_to_return bytes. The work areas
 *          pad, iv and temp are placed directly behind
 *          df_data + drbg_statelen(drbg), so the caller has to provide
 *          that memory as well (see the scratchpad description above).
 * @bytes_to_return number of bytes to derive, at most 512/8 = 64
 * @seedlist list of struct drbg_string input data; it is spliced into a
 *           local list, i.e. its entries are relinked by this function
 *
 * return:
 *        0 on success
 *        -EINVAL when more than 64 bytes are requested
 *        otherwise the error of drbg_ctr_bcc() / drbg_kcapi_sym()
 */
static int drbg_ctr_df(struct drbg_state *drbg,
                       unsigned char *df_data, size_t bytes_to_return,
                       struct list_head *seedlist)
{
        int ret = -EFAULT;
        /* L (input length) and N (requested length), 4 bytes each */
        unsigned char L_N[8];
        /* S3 is input */
        struct drbg_string S1, S2, S4, cipherin;
        LIST_HEAD(bcc_list);
        unsigned char *pad = df_data + drbg_statelen(drbg);
        unsigned char *iv = pad + drbg_blocklen(drbg);
        unsigned char *temp = iv + drbg_blocklen(drbg);
        size_t padlen = 0;
        unsigned int templen = 0;
        /* 10.4.2 step 7 */
        unsigned int i = 0;
        /* 10.4.2 step 8 */
        const unsigned char *K = (unsigned char *)
                           "\x00\x01\x02\x03\x04\x05\x06\x07"
                           "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
                           "\x10\x11\x12\x13\x14\x15\x16\x17"
                           "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
        unsigned char *X;
        size_t generated_len = 0;
        size_t inputlen = 0;
        struct drbg_string *seed = NULL;

        memset(pad, 0, drbg_blocklen(drbg));
        memset(iv, 0, drbg_blocklen(drbg));

        /* 10.4.2 step 1 is implicit as we work byte-wise */

        /* 10.4.2 step 2 */
        if ((512/8) < bytes_to_return)
                return -EINVAL;

        /* 10.4.2 step 2 -- calculate the entire length of all input data */
        list_for_each_entry(seed, seedlist, list)
                inputlen += seed->len;
        drbg_cpu_to_be32(inputlen, &L_N[0]);

        /* 10.4.2 step 3 */
        drbg_cpu_to_be32(bytes_to_return, &L_N[4]);

        /* 10.4.2 step 5: length is L_N, input_string, one byte, padding */
        padlen = (inputlen + sizeof(L_N) + 1) % (drbg_blocklen(drbg));
        /* wrap the padlen appropriately */
        if (padlen)
                padlen = drbg_blocklen(drbg) - padlen;
        /*
         * pad / padlen contains the 0x80 byte and the following zero bytes.
         * As the calculated padlen value only covers the number of zero
         * bytes, this value has to be incremented by one for the 0x80 byte.
         */
        padlen++;
        pad[0] = 0x80;

        /*
         * 10.4.2 step 4 -- first fill the linked list and then order it;
         * the resulting BCC input is: iv, L_N, seed strings, pad
         */
        drbg_string_fill(&S1, iv, drbg_blocklen(drbg));
        list_add_tail(&S1.list, &bcc_list);
        drbg_string_fill(&S2, L_N, sizeof(L_N));
        list_add_tail(&S2.list, &bcc_list);
        list_splice_tail(seedlist, &bcc_list);
        drbg_string_fill(&S4, pad, padlen);
        list_add_tail(&S4.list, &bcc_list);

        /* 10.4.2 step 9 */
        while (templen < (drbg_keylen(drbg) + (drbg_blocklen(drbg)))) {
                /*
                 * 10.4.2 step 9.1 - the padding is implicit as the buffer
                 * holds zeros after allocation -- even the increment of i
                 * is irrelevant as the increment remains within length of i
                 */
                drbg_cpu_to_be32(i, iv);
                /* 10.4.2 step 9.2 -- BCC and concatenation with temp */
                ret = drbg_ctr_bcc(drbg, temp + templen, K, &bcc_list);
                if (ret)
                        goto out;
                /* 10.4.2 step 9.3 */
                i++;
                templen += drbg_blocklen(drbg);
        }

        /* 10.4.2 step 11 -- X follows the key material at the start of temp */
        X = temp + (drbg_keylen(drbg));
        drbg_string_fill(&cipherin, X, drbg_blocklen(drbg));

        /* 10.4.2 step 12: overwriting of outval is implemented in next step */

        /* 10.4.2 step 13 */
        drbg_kcapi_symsetkey(drbg, temp);
        while (generated_len < bytes_to_return) {
                short blocklen = 0;
                /*
                 * 10.4.2 step 13.1: the truncation of the key length is
                 * implicit as the key is only drbg_blocklen in size based on
                 * the implementation of the cipher function callback
                 */
                ret = drbg_kcapi_sym(drbg, X, &cipherin);
                if (ret)
                        goto out;
                /* copy a full block, or only what is still missing */
                blocklen = (drbg_blocklen(drbg) <
                                (bytes_to_return - generated_len)) ?
                            drbg_blocklen(drbg) :
                                (bytes_to_return - generated_len);
                /* 10.4.2 step 13.2 and 14 */
                memcpy(df_data + generated_len, X, blocklen);
                generated_len += blocklen;
        }

        ret = 0;

out:
        /* wipe all work areas behind df_data; df_data itself is the result */
        memset(iv, 0, drbg_blocklen(drbg));
        memset(temp, 0, drbg_statelen(drbg) + drbg_blocklen(drbg));
        memset(pad, 0, drbg_blocklen(drbg));
        return ret;
}

/*
 * update function of CTR DRBG as defined in 10.2.1.2
 *
 * The reseed variable has an enhanced meaning compared to the update
 * functions of the other DRBGs as follows:
 * 0 => initial seed from initialization
 * 1 => reseed via drbg_seed
 * 2 => first invocation from drbg_ctr_update when addtl is present. In
 *      this case, the df_data scratchpad is not deleted so that it is
 *      available for another calls to prevent calling the DF function
 *      again.
 * 3 => second invocation from drbg_ctr_update. When the update function
 *      was called with addtl, the df_data memory already contains the
 *      DFed addtl information and we do not need to call DF again.
 *
 * @drbg DRBG handle
 * @seed list of struct drbg_string to be run through drbg_ctr_df(); may be
 *       NULL, in which case the current content of df_data is used
 * @reseed see above
 *
 * return: 0 on success, < 0 on error
 *
 * Every exit goes through the "out" label so that the temp scratchpad
 * (and df_data, unless reseed is 2) is wiped on success and on any failure.
 */
static int drbg_ctr_update(struct drbg_state *drbg, struct list_head *seed,
                           int reseed)
{
        int ret = -EFAULT;
        /* 10.2.1.2 step 1 */
        unsigned char *temp = drbg->scratchpad;
        unsigned char *df_data = drbg->scratchpad + drbg_statelen(drbg) +
                                 drbg_blocklen(drbg);

        /* reseed == 3 reuses the df_data left behind by the reseed == 2 call */
        if (reseed < 3)
                memset(df_data, 0, drbg_statelen(drbg));

        if (!reseed) {
                /*
                 * The DRBG uses the CTR mode of the underlying AES cipher. The
                 * CTR mode increments the counter value after the AES operation
                 * but SP800-90A requires that the counter is incremented before
                 * the AES operation. Hence, we increment it at the time we set
                 * it by one.
                 */
                crypto_inc(drbg->V, drbg_blocklen(drbg));

                ret = crypto_skcipher_setkey(drbg->ctr_handle, drbg->C,
                                             drbg_keylen(drbg));
                if (ret)
                        goto out;
        }

        /* 10.2.1.3.2 step 2 and 10.2.1.4.2 step 2 */
        if (seed) {
                ret = drbg_ctr_df(drbg, df_data, drbg_statelen(drbg), seed);
                if (ret)
                        goto out;
        }

        ret = drbg_kcapi_sym_ctr(drbg, df_data, drbg_statelen(drbg),
                                 temp, drbg_statelen(drbg));
        /*
         * Do not return directly: temp may already hold parts of the new
         * key / V material and df_data the derived seed, so both have to
         * be wiped like on every other error path.
         */
        if (ret)
                goto out;

        /* 10.2.1.2 step 5 */
        ret = crypto_skcipher_setkey(drbg->ctr_handle, temp,
                                     drbg_keylen(drbg));
        if (ret)
                goto out;
        /* 10.2.1.2 step 6 */
        memcpy(drbg->V, temp + drbg_keylen(drbg), drbg_blocklen(drbg));
        /* See above: increment counter by one to compensate timing of CTR op */
        crypto_inc(drbg->V, drbg_blocklen(drbg));
        ret = 0;

out:
        memset(temp, 0, drbg_statelen(drbg) + drbg_blocklen(drbg));
        /* reseed == 2 keeps df_data for the following reseed == 3 call */
        if (2 != reseed)
                memset(df_data, 0, drbg_statelen(drbg));
        return ret;
}

/*
 * scratchpad use: drbg_ctr_update is called independently from
 * drbg_ctr_extract_bytes. Therefore, the scratchpad is reused
 */
/* Generate function of CTR DRBG as defined in 10.2.1.5.2 */
static int drbg_ctr_generate(struct drbg_state *drbg,
                             unsigned char *buf, unsigned int buflen,
                             struct list_head *addtl)
{
        int ret;
        int len = min_t(int, buflen, INT_MAX);

        /* 10.2.1.5.2 step 2 */
        if (addtl && !list_empty(addtl)) {
                ret = drbg_ctr_update(drbg, addtl, 2);
                if (ret)
                        return 0;
        }

        /* 10.2.1.5.2 step 4.1 */
        ret = drbg_kcapi_sym_ctr(drbg, NULL, 0, buf, len);
        if (ret)
                return ret;

        /* 10.2.1.5.2 step 6 */
        ret = drbg_ctr_update(drbg, NULL, 3);
        if (ret)
                len = ret;

        return len;
}

/* Callback table of the CTR DRBG: cipher setup / teardown and DRBG steps */
static const struct drbg_state_ops drbg_ctr_ops = {
        .crypto_init = drbg_init_sym_kernel,
        .crypto_fini = drbg_fini_sym_kernel,
        .update = drbg_ctr_update,
        .generate = drbg_ctr_generate,
};
#endif /* CONFIG_CRYPTO_DRBG_CTR */

/******************************************************************
 * HMAC DRBG callback functions
 ******************************************************************/

#if defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_HMAC)
/*
 * Forward declarations of the hash / HMAC helpers that the HMAC and Hash
 * DRBG callbacks and their ops tables below refer to.
 */
static int drbg_kcapi_hash(struct drbg_state *drbg, unsigned char *outval,
                           const struct list_head *in);
static void drbg_kcapi_hmacsetkey(struct drbg_state *drbg,
                                  const unsigned char *key);
static int drbg_init_hash_kernel(struct drbg_state *drbg);
static int drbg_fini_hash_kernel(struct drbg_state *drbg);
#endif /* (CONFIG_CRYPTO_DRBG_HASH || CONFIG_CRYPTO_DRBG_HMAC) */

#ifdef CONFIG_CRYPTO_DRBG_HMAC
/*
 * String tag for the HMAC DRBG type.
 * NOTE(review): its users are not in this part of the file -- confirm usage.
 */
#define CRYPTO_DRBG_HMAC_STRING "HMAC "
/*
 * Alias names for the HMAC cores listed in drbg_cores[], each in a "pr" and
 * a "nopr" flavour (presumably with / without prediction resistance).
 */
MODULE_ALIAS_CRYPTO("drbg_pr_hmac_sha512");
MODULE_ALIAS_CRYPTO("drbg_nopr_hmac_sha512");
MODULE_ALIAS_CRYPTO("drbg_pr_hmac_sha384");
MODULE_ALIAS_CRYPTO("drbg_nopr_hmac_sha384");
MODULE_ALIAS_CRYPTO("drbg_pr_hmac_sha256");
MODULE_ALIAS_CRYPTO("drbg_nopr_hmac_sha256");

/* update function of HMAC DRBG as defined in 10.1.2.2 */
static int drbg_hmac_update(struct drbg_state *drbg, struct list_head *seed,
                            int reseed)
{
        int ret = -EFAULT;
        int i = 0;
        struct drbg_string seed1, seed2, vdata;
        LIST_HEAD(seedlist);
        LIST_HEAD(vdatalist);

        if (!reseed) {
                /* 10.1.2.3 step 2 -- memset(0) of C is implicit with kzalloc */
                memset(drbg->V, 1, drbg_statelen(drbg));
                drbg_kcapi_hmacsetkey(drbg, drbg->C);
        }

        drbg_string_fill(&seed1, drbg->V, drbg_statelen(drbg));
        list_add_tail(&seed1.list, &seedlist);
        /* buffer of seed2 will be filled in for loop below with one byte */
        drbg_string_fill(&seed2, NULL, 1);
        list_add_tail(&seed2.list, &seedlist);
        /* input data of seed is allowed to be NULL at this point */
        if (seed)
                list_splice_tail(seed, &seedlist);

        drbg_string_fill(&vdata, drbg->V, drbg_statelen(drbg));
        list_add_tail(&vdata.list, &vdatalist);
        for (i = 2; 0 < i; i--) {
                /* first round uses 0x0, second 0x1 */
                unsigned char prefix = DRBG_PREFIX0;
                if (1 == i)
                        prefix = DRBG_PREFIX1;
                /* 10.1.2.2 step 1 and 4 -- concatenation and HMAC for key */
                seed2.buf = &prefix;
                ret = drbg_kcapi_hash(drbg, drbg->C, &seedlist);
                if (ret)
                        return ret;
                drbg_kcapi_hmacsetkey(drbg, drbg->C);

                /* 10.1.2.2 step 2 and 5 -- HMAC for V */
                ret = drbg_kcapi_hash(drbg, drbg->V, &vdatalist);
                if (ret)
                        return ret;

                /* 10.1.2.2 step 3 */
                if (!seed)
                        return ret;
        }

        return 0;
}

/* generate function of HMAC DRBG as defined in 10.1.2.5 */
static int drbg_hmac_generate(struct drbg_state *drbg,
                              unsigned char *buf,
                              unsigned int buflen,
                              struct list_head *addtl)
{
        int len = 0;
        int ret = 0;
        struct drbg_string data;
        LIST_HEAD(datalist);

        /* 10.1.2.5 step 2 */
        if (addtl && !list_empty(addtl)) {
                ret = drbg_hmac_update(drbg, addtl, 1);
                if (ret)
                        return ret;
        }

        drbg_string_fill(&data, drbg->V, drbg_statelen(drbg));
        list_add_tail(&data.list, &datalist);
        while (len < buflen) {
                unsigned int outlen = 0;
                /* 10.1.2.5 step 4.1 */
                ret = drbg_kcapi_hash(drbg, drbg->V, &datalist);
                if (ret)
                        return ret;
                outlen = (drbg_blocklen(drbg) < (buflen - len)) ?
                          drbg_blocklen(drbg) : (buflen - len);

                /* 10.1.2.5 step 4.2 */
                memcpy(buf + len, drbg->V, outlen);
                len += outlen;
        }

        /* 10.1.2.5 step 6 */
        if (addtl && !list_empty(addtl))
                ret = drbg_hmac_update(drbg, addtl, 1);
        else
                ret = drbg_hmac_update(drbg, NULL, 1);
        if (ret)
                return ret;

        return len;
}

/* Callback table of the HMAC DRBG: hash setup / teardown and DRBG steps */
static const struct drbg_state_ops drbg_hmac_ops = {
        .crypto_init = drbg_init_hash_kernel,
        .crypto_fini = drbg_fini_hash_kernel,
        .update = drbg_hmac_update,
        .generate = drbg_hmac_generate,
};
#endif /* CONFIG_CRYPTO_DRBG_HMAC */

/******************************************************************
 * Hash DRBG callback functions
 ******************************************************************/

#ifdef CONFIG_CRYPTO_DRBG_HASH
/*
 * String tag for the Hash DRBG type.
 * NOTE(review): its users are not in this part of the file -- confirm usage.
 */
#define CRYPTO_DRBG_HASH_STRING "HASH "
/*
 * Alias names for the Hash cores listed in drbg_cores[], each in a "pr" and
 * a "nopr" flavour (presumably with / without prediction resistance).
 */
MODULE_ALIAS_CRYPTO("drbg_pr_sha512");
MODULE_ALIAS_CRYPTO("drbg_nopr_sha512");
MODULE_ALIAS_CRYPTO("drbg_pr_sha384");
MODULE_ALIAS_CRYPTO("drbg_nopr_sha384");
MODULE_ALIAS_CRYPTO("drbg_pr_sha256");
MODULE_ALIAS_CRYPTO("drbg_nopr_sha256");

/*
 * Add one big-endian byte string to another, in place
 *
 * Both buffers are treated as big-endian unsigned integers; @add is added
 * to @dst modulo 2^(8 * dstlen), i.e. a carry out of the most significant
 * byte of @dst is dropped.
 *
 * @dst buffer to increment
 * @dstlen length of @dst in bytes; must not be smaller than @addlen
 * @add value to add
 * @addlen length of @add in bytes
 */
static inline void drbg_add_buf(unsigned char *dst, size_t dstlen,
                                const unsigned char *add, size_t addlen)
{
        unsigned int carry = 0;
        size_t pos;

        /* sum up the overlapping bytes, least significant byte first */
        for (pos = 1; pos <= addlen; pos++) {
                carry += dst[dstlen - pos] + add[addlen - pos];
                dst[dstlen - pos] = carry & 0xff;
                carry >>= 8;
        }

        /* ripple a pending carry through the remaining upper bytes of dst */
        for (; pos <= dstlen && carry > 0; pos++) {
                carry = dst[dstlen - pos] + 1;
                dst[dstlen - pos] = carry & 0xff;
                carry >>= 8;
        }
}

/*
 * scratchpad usage: as drbg_hash_update and drbg_hash_df are used
 * interlinked, the scratchpad is used as follows:
 * drbg_hash_update
 *        start: drbg->scratchpad
 *        length: drbg_statelen(drbg)
 * drbg_hash_df:
 *        start: drbg->scratchpad + drbg_statelen(drbg)
 *        length: drbg_blocklen(drbg)
 *
 * drbg_hash_process_addtl uses the scratchpad, but fully completes
 * before either of the functions mentioned before are invoked. Therefore,
 * drbg_hash_process_addtl does not need to be specifically considered.
 */

/*
 * Derivation Function for Hash DRBG as defined in 10.4.1
 *
 * @drbg DRBG state handle
 * @outval buffer receiving the derived data
 * @outlen number of bytes to derive into outval
 * @entropylist list of drbg_string input data; a 5-byte header (counter
 *              byte plus the requested length in bits) is prepended to it
 *
 * The header string is a stack object of this function and stays linked
 * at the front of entropylist on return, so the list must not be walked
 * after this function has returned.
 *
 * return: 0 on success, otherwise the error of drbg_kcapi_hash
 */
static int drbg_hash_df(struct drbg_state *drbg,
                        unsigned char *outval, size_t outlen,
                        struct list_head *entropylist)
{
        unsigned char hdr[5];
        unsigned char *digest = drbg->scratchpad + drbg_statelen(drbg);
        struct drbg_string hdr_str;
        size_t produced;
        int ret = 0;

        /* 10.4.1 step 3 */
        hdr[0] = 1;
        drbg_cpu_to_be32((outlen * 8), &hdr[1]);

        /* 10.4.1 step 4.1 -- concatenation of data for input into hash */
        drbg_string_fill(&hdr_str, hdr, 5);
        list_add(&hdr_str.list, entropylist);

        /* 10.4.1 step 4 */
        for (produced = 0; produced < outlen; ) {
                short take;

                /* 10.4.1 step 4.1 */
                ret = drbg_kcapi_hash(drbg, digest, entropylist);
                if (ret)
                        break;

                /* 10.4.1 step 4.2 */
                hdr[0]++;
                if (drbg_blocklen(drbg) < (outlen - produced))
                        take = drbg_blocklen(drbg);
                else
                        take = outlen - produced;
                memcpy(outval + produced, digest, take);
                produced += take;
        }

        /* wipe the intermediate digest on success and on error alike */
        memset(digest, 0, drbg_blocklen(drbg));
        return ret;
}

/* update function for Hash DRBG as defined in 10.1.1.2 / 10.1.1.3 */
static int drbg_hash_update(struct drbg_state *drbg, struct list_head *seed,
                            int reseed)
{
        int ret = 0;
        struct drbg_string data1, data2;
        LIST_HEAD(datalist);
        LIST_HEAD(datalist2);
        unsigned char *V = drbg->scratchpad;
        unsigned char prefix = DRBG_PREFIX1;

        if (!seed)
                return -EINVAL;

        if (reseed) {
                /* 10.1.1.3 step 1 */
                memcpy(V, drbg->V, drbg_statelen(drbg));
                drbg_string_fill(&data1, &prefix, 1);
                list_add_tail(&data1.list, &datalist);
                drbg_string_fill(&data2, V, drbg_statelen(drbg));
                list_add_tail(&data2.list, &datalist);
        }
        list_splice_tail(seed, &datalist);

        /* 10.1.1.2 / 10.1.1.3 step 2 and 3 */
        ret = drbg_hash_df(drbg, drbg->V, drbg_statelen(drbg), &datalist);
        if (ret)
                goto out;

        /* 10.1.1.2 / 10.1.1.3 step 4  */
        prefix = DRBG_PREFIX0;
        drbg_string_fill(&data1, &prefix, 1);
        list_add_tail(&data1.list, &datalist2);
        drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg));
        list_add_tail(&data2.list, &datalist2);
        /* 10.1.1.2 / 10.1.1.3 step 4 */
        ret = drbg_hash_df(drbg, drbg->C, drbg_statelen(drbg), &datalist2);

out:
        memset(drbg->scratchpad, 0, drbg_statelen(drbg));
        return ret;
}

/* processing of additional information string for Hash DRBG */
static int drbg_hash_process_addtl(struct drbg_state *drbg,
                                   struct list_head *addtl)
{
        int ret = 0;
        struct drbg_string data1, data2;
        LIST_HEAD(datalist);
        unsigned char prefix = DRBG_PREFIX2;

        /* 10.1.1.4 step 2 */
        if (!addtl || list_empty(addtl))
                return 0;

        /* 10.1.1.4 step 2a */
        drbg_string_fill(&data1, &prefix, 1);
        drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg));
        list_add_tail(&data1.list, &datalist);
        list_add_tail(&data2.list, &datalist);
        list_splice_tail(addtl, &datalist);
        ret = drbg_kcapi_hash(drbg, drbg->scratchpad, &datalist);
        if (ret)
                goto out;

        /* 10.1.1.4 step 2b */
        drbg_add_buf(drbg->V, drbg_statelen(drbg),
                     drbg->scratchpad, drbg_blocklen(drbg));

out:
        memset(drbg->scratchpad, 0, drbg_blocklen(drbg));
        return ret;
}

/* Hashgen defined in 10.1.1.4 */
static int drbg_hash_hashgen(struct drbg_state *drbg,
                             unsigned char *buf,
                             unsigned int buflen)
{
        int len = 0;
        int ret = 0;
        unsigned char *src = drbg->scratchpad;
        unsigned char *dst = drbg->scratchpad + drbg_statelen(drbg);
        struct drbg_string data;
        LIST_HEAD(datalist);

        /* 10.1.1.4 step hashgen 2 */
        memcpy(src, drbg->V, drbg_statelen(drbg));

        drbg_string_fill(&data, src, drbg_statelen(drbg));
        list_add_tail(&data.list, &datalist);
        while (len < buflen) {
                unsigned int outlen = 0;
                /* 10.1.1.4 step hashgen 4.1 */
                ret = drbg_kcapi_hash(drbg, dst, &datalist);
                if (ret) {
                        len = ret;
                        goto out;
                }
                outlen = (drbg_blocklen(drbg) < (buflen - len)) ?
                          drbg_blocklen(drbg) : (buflen - len);
                /* 10.1.1.4 step hashgen 4.2 */
                memcpy(buf + len, dst, outlen);
                len += outlen;
                /* 10.1.1.4 hashgen step 4.3 */
                if (len < buflen)
                        crypto_inc(src, drbg_statelen(drbg));
        }

out:
        memset(drbg->scratchpad, 0,
               (drbg_statelen(drbg) + drbg_blocklen(drbg)));
        return len;
}

/* generate function for Hash DRBG as defined in  10.1.1.4 */
static int drbg_hash_generate(struct drbg_state *drbg,
                              unsigned char *buf, unsigned int buflen,
                              struct list_head *addtl)
{
        int len = 0;
        int ret = 0;
        union {
                unsigned char req[8];
                __be64 req_int;
        } u;
        unsigned char prefix = DRBG_PREFIX3;
        struct drbg_string data1, data2;
        LIST_HEAD(datalist);

        /* 10.1.1.4 step 2 */
        ret = drbg_hash_process_addtl(drbg, addtl);
        if (ret)
                return ret;
        /* 10.1.1.4 step 3 */
        len = drbg_hash_hashgen(drbg, buf, buflen);

        /* this is the value H as documented in 10.1.1.4 */
        /* 10.1.1.4 step 4 */
        drbg_string_fill(&data1, &prefix, 1);
        list_add_tail(&data1.list, &datalist);
        drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg));
        list_add_tail(&data2.list, &datalist);
        ret = drbg_kcapi_hash(drbg, drbg->scratchpad, &datalist);
        if (ret) {
                len = ret;
                goto out;
        }

        /* 10.1.1.4 step 5 */
        drbg_add_buf(drbg->V, drbg_statelen(drbg),
                     drbg->scratchpad, drbg_blocklen(drbg));
        drbg_add_buf(drbg->V, drbg_statelen(drbg),
                     drbg->C, drbg_statelen(drbg));
        u.req_int = cpu_to_be64(drbg->reseed_ctr);
        drbg_add_buf(drbg->V, drbg_statelen(drbg), u.req, 8);

out:
        memset(drbg->scratchpad, 0, drbg_blocklen(drbg));
        return len;
}

/*
 * Callback table for the Hash DRBG; drbg_alloc_state installs it as
 * drbg->d_ops for cores of type DRBG_HASH.
 *
 * scratchpad usage: update and generate are never used at the same
 * time, so both can use the scratchpad
 */
static const struct drbg_state_ops drbg_hash_ops = {
        .update                = drbg_hash_update,
        .generate        = drbg_hash_generate,
        .crypto_init        = drbg_init_hash_kernel,
        .crypto_fini        = drbg_fini_hash_kernel,
};
#endif /* CONFIG_CRYPTO_DRBG_HASH */

/******************************************************************
 * Functions common for DRBG implementations
 ******************************************************************/

/*
 * Run the type specific update operation and record the new seed state
 *
 * @drbg DRBG state handle
 * @seed list of drbg_string seed material
 * @reseed handed through to the update callback
 * @new_seed_state seed state to record once the update succeeded
 *
 * On success the seed time is set to the current jiffies, the reseed
 * counter restarts at 1 and the reseed threshold is chosen according to
 * the new seed state. On failure the state is left untouched.
 *
 * return: 0 on success, otherwise the error of the update callback
 */
static inline int __drbg_seed(struct drbg_state *drbg, struct list_head *seed,
                              int reseed, enum drbg_seed_state new_seed_state)
{
        int err;

        err = drbg->d_ops->update(drbg, seed, reseed);
        if (err)
                return err;

        drbg->seeded = new_seed_state;
        drbg->last_seed_time = jiffies;
        /* 10.1.1.2 / 10.1.1.3 step 5 */
        drbg->reseed_ctr = 1;

        switch (drbg->seeded) {
        case DRBG_SEED_STATE_FULL:
                /*
                 * Seed source has become fully initialized, frequent
                 * reseeds no longer required.
                 */
                drbg->reseed_threshold = drbg_max_requests(drbg);
                break;

        case DRBG_SEED_STATE_UNSEEDED:
                /* Impossible, but handle it to silence compiler warnings. */
                fallthrough;
        case DRBG_SEED_STATE_PARTIAL:
                /*
                 * Require frequent reseeds until the seed source is
                 * fully initialized.
                 */
                drbg->reseed_threshold = 50;
                break;
        }

        return 0;
}

/*
 * Fill a buffer from get_random_bytes() until the FIPS continuous test
 * accepts it
 *
 * @drbg DRBG state handle
 * @entropy buffer to fill
 * @entropylen number of bytes to fetch
 *
 * A test result of -EAGAIN causes fresh bytes to be drawn; any other
 * non-zero result aborts.
 *
 * return: 0 on success, otherwise the error of drbg_fips_continuous_test
 */
static inline int drbg_get_random_bytes(struct drbg_state *drbg,
                                        unsigned char *entropy,
                                        unsigned int entropylen)
{
        for (;;) {
                int rc;

                get_random_bytes(entropy, entropylen);
                rc = drbg_fips_continuous_test(drbg, entropy);
                if (!rc)
                        return 0;
                if (rc != -EAGAIN)
                        return rc;
        }
}

static int drbg_seed_from_random(struct drbg_state *drbg)
{
        struct drbg_string data;
        LIST_HEAD(seedlist);
        unsigned int entropylen = drbg_sec_strength(drbg->core->flags);
        unsigned char entropy[32];
        int ret;

        BUG_ON(!entropylen);
        BUG_ON(entropylen > sizeof(entropy));

        drbg_string_fill(&data, entropy, entropylen);
        list_add_tail(&data.list, &seedlist);

        ret = drbg_get_random_bytes(drbg, entropy, entropylen);
        if (ret)
                goto out;

        ret = __drbg_seed(drbg, &seedlist, true, DRBG_SEED_STATE_FULL);

out:
        memzero_explicit(entropy, entropylen);
        return ret;
}

/*
 * Check whether more than 300s have passed since the last (re)seed
 *
 * @drbg DRBG state handle
 *
 * return: true if the reseed interval has elapsed, false otherwise and
 *         always false in test mode
 */
static bool drbg_nopr_reseed_interval_elapsed(struct drbg_state *drbg)
{
        /*
         * Obtain fresh entropy for the nopr DRBGs after 300s have
         * elapsed in order to still achieve sort of partial
         * prediction resistance over the time domain at least. Note
         * that the period of 300s has been chosen to match the
         * CRNG_RESEED_INTERVAL of the get_random_bytes()' chacha
         * rngs.
         */
        if (!list_empty(&drbg->test_data.list)) {
                unsigned long deadline = drbg->last_seed_time + 300 * HZ;

                return time_after(jiffies, deadline);
        }

        /* Don't ever reseed from get_random_bytes() in test mode. */
        return false;
}

/*
 * Seeding or reseeding of the DRBG
 *
 * Collects the seed material (test entropy, or get_random_bytes() output
 * optionally followed by Jitter RNG output), appends the personalization
 * string / additional input and hands everything to __drbg_seed.
 *
 * @drbg: DRBG state struct
 * @pers: personalization / additional information buffer, may be NULL
 * @reseed: false for initial seed process, true for reseeding
 *
 * return:
 *        0 on success
 *        error value otherwise
 */
static int drbg_seed(struct drbg_state *drbg, struct drbg_string *pers,
                     bool reseed)
{
        int ret;
        /*
         * Room for two entropylen-sized halves: get_random_bytes() output
         * followed by the Jitter RNG output (size enforced by BUG_ON below).
         */
        unsigned char entropy[((32 + 16) * 2)];
        unsigned int entropylen = drbg_sec_strength(drbg->core->flags);
        struct drbg_string data1;
        LIST_HEAD(seedlist);
        enum drbg_seed_state new_seed_state = DRBG_SEED_STATE_FULL;

        /* 9.1 / 9.2 / 9.3.1 step 3 */
        if (pers && pers->len > (drbg_max_addtl(drbg))) {
                pr_devel("DRBG: personalization string too long %zu\n",
                         pers->len);
                return -EINVAL;
        }

        /*
         * NOTE(review): test mode is detected by list_empty() on
         * test_data.list -- presumably drbg_string_fill() initializes the
         * list head so that it only reads as empty once test data has been
         * set; confirm against drbg_string_fill().
         */
        if (list_empty(&drbg->test_data.list)) {
                drbg_string_fill(&data1, drbg->test_data.buf,
                                 drbg->test_data.len);
                pr_devel("DRBG: using test entropy\n");
        } else {
                /*
                 * Gather entropy equal to the security strength of the DRBG.
                 * With a derivation function, a nonce is required in addition
                 * to the entropy. A nonce must be at least 1/2 of the security
                 * strength of the DRBG in size. Thus, entropy + nonce is 3/2
                 * of the strength. The consideration of a nonce is only
                 * applicable during initial seeding.
                 */
                BUG_ON(!entropylen);
                if (!reseed)
                        entropylen = ((entropylen + 1) / 2) * 3;
                BUG_ON((entropylen * 2) > sizeof(entropy));

                /* Get seed from in-kernel /dev/urandom */
                if (!rng_is_initialized())
                        new_seed_state = DRBG_SEED_STATE_PARTIAL;

                ret = drbg_get_random_bytes(drbg, entropy, entropylen);
                if (ret)
                        goto out;

                if (!drbg->jent) {
                        /* no Jitter RNG: only the first half is used */
                        drbg_string_fill(&data1, entropy, entropylen);
                        pr_devel("DRBG: (re)seeding with %u bytes of entropy\n",
                                 entropylen);
                } else {
                        /*
                         * Get seed from Jitter RNG, failures are
                         * fatal only in FIPS mode.
                         */
                        ret = crypto_rng_get_bytes(drbg->jent,
                                                   entropy + entropylen,
                                                   entropylen);
                        if (fips_enabled && ret) {
                                pr_devel("DRBG: jent failed with %d\n", ret);

                                /*
                                 * Do not treat the transient failure of the
                                 * Jitter RNG as an error that needs to be
                                 * reported. The combined number of the
                                 * maximum reseed threshold times the maximum
                                 * number of Jitter RNG transient errors is
                                 * less than the reseed threshold required by
                                 * SP800-90A allowing us to treat the
                                 * transient errors as such.
                                 *
                                 * However, we mandate that at least the first
                                 * seeding operation must succeed with the
                                 * Jitter RNG.
                                 */
                                if (!reseed || ret != -EAGAIN)
                                        goto out;
                        }

                        /*
                         * Both halves of the buffer are passed on, also
                         * when a Jitter RNG failure was tolerated above.
                         */
                        drbg_string_fill(&data1, entropy, entropylen * 2);
                        pr_devel("DRBG: (re)seeding with %u bytes of entropy\n",
                                 entropylen * 2);
                }
        }
        list_add_tail(&data1.list, &seedlist);

        /*
         * Concatenation of entropy with personalization string / additional
         * input: the variable pers is directly handed in by the caller, so
         * check whether its contents are appropriate.
         */
        if (pers && pers->buf && 0 < pers->len) {
                list_add_tail(&pers->list, &seedlist);
                pr_devel("DRBG: using personalization string\n");
        }

        /* initial seeding starts from an all-zero V and C */
        if (!reseed) {
                memset(drbg->V, 0, drbg_statelen(drbg));
                memset(drbg->C, 0, drbg_statelen(drbg));
        }

        ret = __drbg_seed(drbg, &seedlist, reseed, new_seed_state);

out:
        /* wipe both halves of the local entropy buffer on every path */
        memzero_explicit(entropy, entropylen * 2);

        return ret;
}

/*
 * Free all substructures in a DRBG state without the DRBG state structure
 *
 * @drbg DRBG state handle, may be NULL (no-op)
 *
 * All buffers are released with kfree_sensitive and every pointer derived
 * from them is reset, so no stale reference into freed memory survives.
 * The reseed counter, the ops table and the core reference are cleared
 * as well.
 */
static inline void drbg_dealloc_state(struct drbg_state *drbg)
{
        if (!drbg)
                return;
        kfree_sensitive(drbg->Vbuf);
        drbg->Vbuf = NULL;
        drbg->V = NULL;
        kfree_sensitive(drbg->Cbuf);
        drbg->Cbuf = NULL;
        drbg->C = NULL;
        kfree_sensitive(drbg->scratchpadbuf);
        drbg->scratchpadbuf = NULL;
        /* scratchpad points into scratchpadbuf, do not leave it dangling */
        drbg->scratchpad = NULL;
        drbg->reseed_ctr = 0;
        drbg->d_ops = NULL;
        drbg->core = NULL;
        if (IS_ENABLED(CONFIG_CRYPTO_FIPS)) {
                kfree_sensitive(drbg->prev);
                drbg->prev = NULL;
                drbg->fips_primed = false;
        }
}

/*
 * Allocate all sub-structures for a DRBG state.
 * The DRBG state structure must already be allocated.
 *
 * Selects the ops table matching the DRBG type in drbg->core->flags,
 * initializes the cipher backend and allocates the V, C, scratchpad and
 * (with CONFIG_CRYPTO_FIPS) continuous-test buffers.
 *
 * return: 0 on success, -EOPNOTSUPP for a DRBG type that is not compiled
 *         in, -ENOMEM on allocation failure, or the negative error of the
 *         crypto_init callback. On failure everything allocated so far is
 *         released again.
 */
static inline int drbg_alloc_state(struct drbg_state *drbg)
{
        int ret = -ENOMEM;
        unsigned int sb_size = 0;

        switch (drbg->core->flags & DRBG_TYPE_MASK) {
#ifdef CONFIG_CRYPTO_DRBG_HMAC
        case DRBG_HMAC:
                drbg->d_ops = &drbg_hmac_ops;
                break;
#endif /* CONFIG_CRYPTO_DRBG_HMAC */
#ifdef CONFIG_CRYPTO_DRBG_HASH
        case DRBG_HASH:
                drbg->d_ops = &drbg_hash_ops;
                break;
#endif /* CONFIG_CRYPTO_DRBG_HASH */
#ifdef CONFIG_CRYPTO_DRBG_CTR
        case DRBG_CTR:
                drbg->d_ops = &drbg_ctr_ops;
                break;
#endif /* CONFIG_CRYPTO_DRBG_CTR */
        default:
                ret = -EOPNOTSUPP;
                goto err;
        }

        /*
         * A non-negative return value is used below as additional
         * allocation size and as PTR_ALIGN() mask -- presumably the
         * alignment mask of the backend cipher; confirm against the
         * crypto_init implementations.
         */
        ret = drbg->d_ops->crypto_init(drbg);
        if (ret < 0)
                goto err;

        /* over-allocate by ret bytes so the aligned pointer still fits */
        drbg->Vbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL);
        if (!drbg->Vbuf) {
                ret = -ENOMEM;
                goto fini;
        }
        drbg->V = PTR_ALIGN(drbg->Vbuf, ret + 1);
        drbg->Cbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL);
        if (!drbg->Cbuf) {
                ret = -ENOMEM;
                goto fini;
        }
        drbg->C = PTR_ALIGN(drbg->Cbuf, ret + 1);
        /* scratchpad is only generated for CTR and Hash */
        if (drbg->core->flags & DRBG_HMAC)
                sb_size = 0;
        else if (drbg->core->flags & DRBG_CTR)
                sb_size = drbg_statelen(drbg) + drbg_blocklen(drbg) + /* temp */
                          drbg_statelen(drbg) +        /* df_data */
                          drbg_blocklen(drbg) +        /* pad */
                          drbg_blocklen(drbg) +        /* iv */
                          drbg_statelen(drbg) + drbg_blocklen(drbg); /* temp */
        else
                sb_size = drbg_statelen(drbg) + drbg_blocklen(drbg);

        if (0 < sb_size) {
                drbg->scratchpadbuf = kzalloc(sb_size + ret, GFP_KERNEL);
                if (!drbg->scratchpadbuf) {
                        ret = -ENOMEM;
                        goto fini;
                }
                drbg->scratchpad = PTR_ALIGN(drbg->scratchpadbuf, ret + 1);
        }

        if (IS_ENABLED(CONFIG_CRYPTO_FIPS)) {
                /* buffer sized to the security strength of the DRBG */
                drbg->prev = kzalloc(drbg_sec_strength(drbg->core->flags),
                                     GFP_KERNEL);
                if (!drbg->prev) {
                        ret = -ENOMEM;
                        goto fini;
                }
                drbg->fips_primed = false;
        }

        return 0;

fini:
        /* undo crypto_init before the buffers and d_ops are dropped */
        drbg->d_ops->crypto_fini(drbg);
err:
        drbg_dealloc_state(drbg);
        return ret;
}

/*************************************************************************
 * DRBG interface functions
 *************************************************************************/

/*
 * DRBG generate function as required by SP800-90A - this function
 * generates random numbers
 *
 * @drbg DRBG state handle
 * @buf Buffer where to store the random numbers -- the buffer must already
 *      be pre-allocated by caller
 * @buflen Length of output buffer - this value defines the number of random
 *           bytes pulled from DRBG
 * @addtl Additional input that is mixed into state, may be NULL -- note
 *          the entropy is pulled by the DRBG internally unconditionally
 *          as defined in SP800-90A. The additional input is mixed into
 *          the state in addition to the pulled entropy.
 *
 * return: 0 when all bytes are generated; < 0 in case of an error
 */
static int drbg_generate(struct drbg_state *drbg,
                         unsigned char *buf, unsigned int buflen,
                         struct drbg_string *addtl)
{
        int len = 0;
        LIST_HEAD(addtllist);

        /* drbg->core is only set once the DRBG has been instantiated */
        if (!drbg->core) {
                pr_devel("DRBG: not yet seeded\n");
                return -EINVAL;
        }
        if (0 == buflen || !buf) {
                pr_devel("DRBG: no output buffer provided\n");
                return -EINVAL;
        }
        if (addtl && NULL == addtl->buf && 0 < addtl->len) {
                pr_devel("DRBG: wrong format of additional information\n");
                return -EINVAL;
        }

        /* 9.3.1 step 2 */
        len = -EINVAL;
        if (buflen > (drbg_max_request_bytes(drbg))) {
                pr_devel("DRBG: requested random numbers too large %u\n",
                         buflen);
                goto err;
        }

        /* 9.3.1 step 3 is implicit with the chosen DRBG */

        /* 9.3.1 step 4 */
        if (addtl && addtl->len > (drbg_max_addtl(drbg))) {
                pr_devel("DRBG: additional information string too long %zu\n",
                         addtl->len);
                goto err;
        }
        /* 9.3.1 step 5 is implicit with the chosen DRBG */

        /*
         * 9.3.1 step 6 and 9 supplemented by 9.3.2 step c is implemented
         * here. The spec is a bit convoluted here, we make it simpler.
         */
        if (drbg->reseed_threshold < drbg->reseed_ctr)
                drbg->seeded = DRBG_SEED_STATE_UNSEEDED;

        if (drbg->pr || drbg->seeded == DRBG_SEED_STATE_UNSEEDED) {
                pr_devel("DRBG: reseeding before generation (prediction "
                         "resistance: %s, state %s)\n",
                         str_true_false(drbg->pr),
                         (drbg->seeded ==  DRBG_SEED_STATE_FULL ?
                          "seeded" : "unseeded"));
                /* 9.3.1 steps 7.1 through 7.3 */
                len = drbg_seed(drbg, addtl, true);
                if (len)
                        goto err;
                /* 9.3.1 step 7.4 */
                addtl = NULL;
        } else if (rng_is_initialized() &&
                   (drbg->seeded == DRBG_SEED_STATE_PARTIAL ||
                    drbg_nopr_reseed_interval_elapsed(drbg))) {
                /*
                 * The seed source is initialized by now: upgrade a partial
                 * seed or refresh a seed whose reseed interval has elapsed.
                 */
                len = drbg_seed_from_random(drbg);
                if (len)
                        goto err;
        }

        if (addtl && 0 < addtl->len)
                list_add_tail(&addtl->list, &addtllist);
        /* 9.3.1 step 8 and 10 */
        len = drbg->d_ops->generate(drbg, buf, buflen, &addtllist);

        /* 10.1.1.4 step 6, 10.1.2.5 step 7, 10.2.1.5.2 step 7 */
        drbg->reseed_ctr++;
        /* the counter is advanced even if the generate callback failed */
        if (0 >= len)
                goto err;

        /*
         * Section 11.3.3 requires to re-perform self tests after some
         * generated random numbers. The chosen value after which self
         * test is performed is arbitrary, but it should be reasonable.
         * However, we do not perform the self tests because of the following
         * reasons: it is mathematically impossible that the initial self tests
         * were successful and the following ones are not. If the initial would
         * pass and the following would not, the kernel integrity is violated.
         * In this case, the entire kernel operation is questionable and it
         * is unlikely that the integrity violation only affects the
         * correct operation of the DRBG.
         *
         * Although the following code is compiled out, it is provided in
         * case somebody has a need to implement the test of 11.3.3.
         */
#if 0
        if (drbg->reseed_ctr && !(drbg->reseed_ctr % 4096)) {
                int err = 0;
                pr_devel("DRBG: start to perform self test\n");
                if (drbg->core->flags & DRBG_HMAC)
                        err = alg_test("drbg_pr_hmac_sha512",
                                       "drbg_pr_hmac_sha512", 0, 0);
                else if (drbg->core->flags & DRBG_CTR)
                        err = alg_test("drbg_pr_ctr_aes256",
                                       "drbg_pr_ctr_aes256", 0, 0);
                else
                        err = alg_test("drbg_pr_sha256",
                                       "drbg_pr_sha256", 0, 0);
                if (err) {
                        pr_err("DRBG: periodical self test failed\n");
                        /*
                         * uninstantiate implies that from now on, only errors
                         * are returned when reusing this DRBG cipher handle
                         */
                        drbg_uninstantiate(drbg);
                        return 0;
                } else {
                        pr_devel("DRBG: self test successful\n");
                }
        }
#endif

        /*
         * All operations were successful, return 0 as mandated by
         * the kernel crypto API interface.
         */
        len = 0;
err:
        return len;
}

/*
 * Wrapper around drbg_generate which can pull arbitrarily long strings
 * from the DRBG without hitting the maximum request limitation.
 *
 * The request is split into chunks of at most drbg_max_request_bytes();
 * drbg_mutex is taken and released around every single chunk.
 *
 * Parameters: see drbg_generate
 * Return codes: see drbg_generate -- if one drbg_generate request fails,
 *                 the entire drbg_generate_long request fails
 */
static int drbg_generate_long(struct drbg_state *drbg,
                              unsigned char *buf, unsigned int buflen,
                              struct drbg_string *addtl)
{
        unsigned int done = 0;

        for (;;) {
                unsigned int full_slices;
                unsigned int chunk;
                int err;

                /* number of maximum-sized requests still outstanding */
                full_slices = ((buflen - done) / drbg_max_request_bytes(drbg));
                if (full_slices)
                        chunk = drbg_max_request_bytes(drbg);
                else
                        chunk = (buflen - done);

                mutex_lock(&drbg->drbg_mutex);
                err = drbg_generate(drbg, buf + done, chunk, addtl);
                mutex_unlock(&drbg->drbg_mutex);
                if (err < 0)
                        return err;

                done += chunk;
                /* a short chunk is always the last one */
                if (!full_slices || done >= buflen)
                        break;
        }

        return 0;
}

/*
 * Allocate the Jitter RNG used as additional seed source
 *
 * @drbg DRBG state handle
 *
 * On allocation failure drbg->jent is reset to NULL; this is an error
 * only if fips_enabled is set.
 *
 * return: 0 on success or when continuing without Jitter RNG, otherwise
 *         the error of crypto_alloc_rng
 */
static int drbg_prepare_hrng(struct drbg_state *drbg)
{
        int err;

        /* We do not need an HRNG in test mode. */
        if (list_empty(&drbg->test_data.list))
                return 0;

        drbg->jent = crypto_alloc_rng("jitterentropy_rng", 0, 0);
        if (!IS_ERR(drbg->jent))
                return 0;

        err = PTR_ERR(drbg->jent);
        drbg->jent = NULL;
        if (fips_enabled)
                return err;

        pr_info("DRBG: Continuing without Jitter RNG\n");
        return 0;
}

/*
 * DRBG instantiation function as required by SP800-90A - this function
 * sets up the DRBG handle, performs the initial seeding and all sanity
 * checks required by SP800-90A
 *
 * If the handle already has a core assigned, no state is allocated and
 * the call only reseeds the DRBG.
 *
 * @drbg DRBG state handle
 * @pers Personalization string that is mixed into state, may be NULL -- note
 *         the entropy is pulled by the DRBG internally unconditionally
 *         as defined in SP800-90A. The additional input is mixed into
 *         the state in addition to the pulled entropy.
 * @coreref index of the core in drbg_cores
 * @pr prediction resistance enabled
 *
 * return
 *        0 on success
 *        error value otherwise
 */
static int drbg_instantiate(struct drbg_state *drbg, struct drbg_string *pers,
                            int coreref, bool pr)
{
        bool fresh = false;
        int ret;

        pr_devel("DRBG: Initializing DRBG core %d with prediction resistance "
                 "%s\n", coreref, str_enabled_disabled(pr));
        mutex_lock(&drbg->drbg_mutex);

        /*
         * 9.1 step 1 is implicit with the selected DRBG type
         *
         * 9.1 step 2 is implicit as caller can select prediction resistance
         * and the flag is copied into drbg->flags --
         * all DRBG types support prediction resistance
         *
         * 9.1 step 4 is implicit in  drbg_sec_strength
         */

        if (!drbg->core) {
                drbg->core = &drbg_cores[coreref];
                drbg->pr = pr;
                drbg->seeded = DRBG_SEED_STATE_UNSEEDED;
                drbg->last_seed_time = 0;
                drbg->reseed_threshold = drbg_max_requests(drbg);

                /* drbg_alloc_state cleans up after itself on failure */
                ret = drbg_alloc_state(drbg);
                if (ret)
                        goto unlock;

                ret = drbg_prepare_hrng(drbg);
                if (ret)
                        goto free_everything;

                fresh = true;
        }

        ret = drbg_seed(drbg, pers, !fresh);

        /* a failed reseed keeps the existing instance alive */
        if (ret && fresh)
                goto free_everything;

unlock:
        mutex_unlock(&drbg->drbg_mutex);
        return ret;

free_everything:
        mutex_unlock(&drbg->drbg_mutex);
        drbg_uninstantiate(drbg);
        return ret;
}

/*
 * DRBG uninstantiate function as required by SP800-90A - this function
 * frees all buffers and the DRBG handle
 *
 * @drbg DRBG state handle
 *
 * return
 *        0 on success
 */
static int drbg_uninstantiate(struct drbg_state *drbg)
{
        if (!IS_ERR_OR_NULL(drbg->jent))
                crypto_free_rng(drbg->jent);
        drbg->jent = NULL;

        if (drbg->d_ops)
                drbg->d_ops->crypto_fini(drbg);
        drbg_dealloc_state(drbg);
        /* no scrubbing of test_data -- this shall survive an uninstantiate */
        return 0;
}

/*
 * Helper function for setting the test data in the DRBG
 *
 * @tfm RNG cipher handle whose context is the DRBG state
 * @data test data
 * @len test data length
 *
 * The test data string is replaced while holding drbg_mutex.
 */
static void drbg_kcapi_set_entropy(struct crypto_rng *tfm,
                                   const u8 *data, unsigned int len)
{
        struct drbg_state *state;

        state = crypto_rng_ctx(tfm);

        mutex_lock(&state->drbg_mutex);
        drbg_string_fill(&state->test_data, data, len);
        mutex_unlock(&state->drbg_mutex);
}

/***************************************************************
 * Kernel crypto API cipher invocations requested by DRBG
 ***************************************************************/

#if defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_HMAC)
/*
 * Hash descriptor kept in drbg->priv_data by drbg_init_hash_kernel: the
 * shash descriptor followed by crypto_shash_descsize() bytes of context.
 */
struct sdesc {
        struct shash_desc shash;
        char ctx[];
};

/*
 * Allocate the digest handle and its descriptor for Hash / HMAC DRBG
 *
 * @drbg DRBG state handle; on success drbg->priv_data holds the
 *       descriptor, to be released with drbg_fini_hash_kernel
 *
 * return: 0 on success, the error of crypto_alloc_shash, or -ENOMEM
 */
static int drbg_init_hash_kernel(struct drbg_state *drbg)
{
        struct crypto_shash *hash_tfm;
        struct sdesc *desc;

        hash_tfm = crypto_alloc_shash(drbg->core->backend_cra_name, 0, 0);
        if (IS_ERR(hash_tfm)) {
                pr_info("DRBG: could not allocate digest TFM handle: %s\n",
                                drbg->core->backend_cra_name);
                return PTR_ERR(hash_tfm);
        }

        /* the DRBG block length must equal the digest size of the backend */
        BUG_ON(drbg_blocklen(drbg) != crypto_shash_digestsize(hash_tfm));

        desc = kzalloc(sizeof(struct shash_desc) +
                       crypto_shash_descsize(hash_tfm), GFP_KERNEL);
        if (!desc) {
                crypto_free_shash(hash_tfm);
                return -ENOMEM;
        }

        desc->shash.tfm = hash_tfm;
        drbg->priv_data = desc;

        return 0;
}

/*
 * Release the digest handle and descriptor set up by drbg_init_hash_kernel
 *
 * @drbg DRBG state handle; drbg->priv_data may be NULL and is NULL
 *       afterwards
 *
 * return: always 0
 */
static int drbg_fini_hash_kernel(struct drbg_state *drbg)
{
        struct sdesc *desc = drbg->priv_data;

        drbg->priv_data = NULL;
        if (!desc)
                return 0;

        crypto_free_shash(desc->shash.tfm);
        kfree_sensitive(desc);
        return 0;
}

/*
 * Set the key of the digest handle stored in drbg->priv_data
 *
 * @drbg DRBG state handle
 * @key key buffer of drbg_statelen(drbg) bytes
 *
 * The result of crypto_shash_setkey is not checked.
 */
static void drbg_kcapi_hmacsetkey(struct drbg_state *drbg,
                                  const unsigned char *key)
{
        struct sdesc *desc = drbg->priv_data;
        struct crypto_shash *hash_tfm = desc->shash.tfm;

        crypto_shash_setkey(hash_tfm, key, drbg_statelen(drbg));
}

static int drbg_kcapi_hash(struct drbg_state *drbg, unsigned char *outval,
                           const struct list_head *in)
{
        struct sdesc *sdesc = drbg->priv_data;
        struct drbg_string *input = NULL;

        crypto_shash_init(&sdesc->shash);
        list_for_each_entry(input, in, list)
                crypto_shash_update(&sdesc->shash, input->buf, input->len);
        return crypto_shash_final(&sdesc->shash, outval);
}
#endif /* (CONFIG_CRYPTO_DRBG_HASH || CONFIG_CRYPTO_DRBG_HMAC) */

#ifdef CONFIG_CRYPTO_DRBG_CTR
/*
 * Tear down everything drbg_init_sym_kernel() may have set up: the block
 * cipher in priv_data, the CTR skcipher handle, its request and the
 * output scratchpad. Every pointer is reset to NULL, so the function can
 * be called on a partially initialised state. Always returns 0.
 */
static int drbg_fini_sym_kernel(struct drbg_state *drbg)
{
        struct crypto_cipher *cipher = drbg->priv_data;

        if (cipher != NULL)
                crypto_free_cipher(cipher);
        drbg->priv_data = NULL;

        if (drbg->ctr_handle != NULL)
                crypto_free_skcipher(drbg->ctr_handle);
        drbg->ctr_handle = NULL;

        if (drbg->ctr_req != NULL)
                skcipher_request_free(drbg->ctr_req);
        drbg->ctr_req = NULL;

        kfree(drbg->outscratchpadbuf);
        drbg->outscratchpadbuf = NULL;

        return 0;
}

static int drbg_init_sym_kernel(struct drbg_state *drbg)
{
        struct crypto_cipher *tfm;
        struct crypto_skcipher *sk_tfm;
        struct skcipher_request *req;
        unsigned int alignmask;
        char ctr_name[CRYPTO_MAX_ALG_NAME];

        tfm = crypto_alloc_cipher(drbg->core->backend_cra_name, 0, 0);
        if (IS_ERR(tfm)) {
                pr_info("DRBG: could not allocate cipher TFM handle: %s\n",
                                drbg->core->backend_cra_name);
                return PTR_ERR(tfm);
        }
        BUG_ON(drbg_blocklen(drbg) != crypto_cipher_blocksize(tfm));
        drbg->priv_data = tfm;

        if (snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)",
            drbg->core->backend_cra_name) >= CRYPTO_MAX_ALG_NAME) {
                drbg_fini_sym_kernel(drbg);
                return -EINVAL;
        }
        sk_tfm = crypto_alloc_skcipher(ctr_name, 0, 0);
        if (IS_ERR(sk_tfm)) {
                pr_info("DRBG: could not allocate CTR cipher TFM handle: %s\n",
                                ctr_name);
                drbg_fini_sym_kernel(drbg);
                return PTR_ERR(sk_tfm);
        }
        drbg->ctr_handle = sk_tfm;
        crypto_init_wait(&drbg->ctr_wait);

        req = skcipher_request_alloc(sk_tfm, GFP_KERNEL);
        if (!req) {
                pr_info("DRBG: could not allocate request queue\n");
                drbg_fini_sym_kernel(drbg);
                return -ENOMEM;
        }
        drbg->ctr_req = req;
        skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
                                                CRYPTO_TFM_REQ_MAY_SLEEP,
                                        crypto_req_done, &drbg->ctr_wait);

        alignmask = crypto_skcipher_alignmask(sk_tfm);
        drbg->outscratchpadbuf = kmalloc(DRBG_OUTSCRATCHLEN + alignmask,
                                         GFP_KERNEL);
        if (!drbg->outscratchpadbuf) {
                drbg_fini_sym_kernel(drbg);
                return -ENOMEM;
        }
        drbg->outscratchpad = (u8 *)PTR_ALIGN(drbg->outscratchpadbuf,
                                              alignmask + 1);

        sg_init_table(&drbg->sg_in, 1);
        sg_init_one(&drbg->sg_out, drbg->outscratchpad, DRBG_OUTSCRATCHLEN);

        return alignmask;
}

/*
 * Set @key, drbg_keylen() bytes long, on the block cipher held in
 * drbg->priv_data. The result of crypto_cipher_setkey() is not reported
 * to the caller.
 */
static void drbg_kcapi_symsetkey(struct drbg_state *drbg,
                                 const unsigned char *key)
{
        struct crypto_cipher *cipher = drbg->priv_data;
        unsigned int keylen = drbg_keylen(drbg);

        crypto_cipher_setkey(cipher, key, keylen);
}

/*
 * Encrypt one cipher block from in->buf into @outval with the block
 * cipher held in drbg->priv_data. Always returns 0.
 */
static int drbg_kcapi_sym(struct drbg_state *drbg, unsigned char *outval,
                          const struct drbg_string *in)
{
        struct crypto_cipher *cipher = drbg->priv_data;

        /* *in holds a single component, which must cover a full block */
        BUG_ON(in->len < drbg_blocklen(drbg));

        crypto_cipher_encrypt_one(cipher, outval, in->buf);

        return 0;
}

/*
 * Produce @outlen bytes into @outbuf with the CTR skcipher, using drbg->V
 * as the IV argument of each request.
 *
 * With a non-NULL @inbuf, @inlen bytes of it are the cipher input.
 * Otherwise the zeroed scratchpad is used as input and the operation runs
 * in place. Output is generated in chunks of at most DRBG_OUTSCRATCHLEN
 * bytes through the scratchpad, copied out, and the scratchpad is wiped
 * after every chunk.
 *
 * Returns 0 on success or the error from the cipher request.
 */
static int drbg_kcapi_sym_ctr(struct drbg_state *drbg,
                              u8 *inbuf, u32 inlen,
                              u8 *outbuf, u32 outlen)
{
        struct scatterlist *src = &drbg->sg_in;
        struct scatterlist *dst = &drbg->sg_out;
        u32 pad_len = min_t(u32, outlen, DRBG_OUTSCRATCHLEN);
        int err;

        if (!inbuf) {
                /* No caller input: encrypt zeroes in place in the scratchpad */
                inlen = pad_len;
                memset(drbg->outscratchpad, 0, pad_len);
                sg_set_buf(src, drbg->outscratchpad, pad_len);
        } else {
                /* Caller supplied the input buffer */
                sg_set_buf(src, inbuf, inlen);
        }

        while (outlen) {
                u32 chunk = min3(inlen, outlen, (u32)DRBG_OUTSCRATCHLEN);

                /* The output buffer may be unusable in an SGL; go via scratchpad */
                skcipher_request_set_crypt(drbg->ctr_req, src, dst,
                                           chunk, drbg->V);
                err = crypto_wait_req(crypto_skcipher_encrypt(drbg->ctr_req),
                                      &drbg->ctr_wait);
                if (err)
                        return err;

                /* Re-arm the wait object for the next request */
                crypto_init_wait(&drbg->ctr_wait);

                memcpy(outbuf, drbg->outscratchpad, chunk);
                memzero_explicit(drbg->outscratchpad, chunk);

                outbuf += chunk;
                outlen -= chunk;
        }

        return 0;
}
#endif /* CONFIG_CRYPTO_DRBG_CTR */

/***************************************************************
 * Kernel crypto API interface to register DRBG
 ***************************************************************/

/*
 * Translate a kernel crypto API driver name into the matching entry of
 * drbg_cores[] and the prediction resistance setting.
 *
 * @cra_driver_name: NUL-terminated driver name, expected to look like
 *  "drbg_pr_<core>" or "drbg_nopr_<core>"
 * @coreref: set to the index into drbg_cores[] of the core whose cra_name
 *  equals <core>; left untouched when the prefix is unknown or no core
 *  matches, so callers must pre-initialise it
 * @pr: set to false for the "drbg_nopr_" prefix and to true otherwise
 *
 * String comparisons are used so that neither a short driver name nor a
 * short core name is read beyond its terminator, and so that only an exact
 * core name matches.
 */
static inline void drbg_convert_tfm_core(const char *cra_driver_name,
                                         int *coreref, bool *pr)
{
        static const char nopr_prefix[] = "drbg_nopr_";
        static const char pr_prefix[] = "drbg_pr_";
        const char *core_name;
        unsigned int i;

        *pr = true;
        /* split off the prefix to find the core name */
        if (!strncmp(cra_driver_name, nopr_prefix, sizeof(nopr_prefix) - 1)) {
                core_name = cra_driver_name + sizeof(nopr_prefix) - 1;
                *pr = false;
        } else if (!strncmp(cra_driver_name, pr_prefix,
                            sizeof(pr_prefix) - 1)) {
                core_name = cra_driver_name + sizeof(pr_prefix) - 1;
        } else {
                return;
        }

        for (i = 0; i < ARRAY_SIZE(drbg_cores); i++) {
                if (!strcmp(core_name, drbg_cores[i].cra_name)) {
                        *coreref = i;
                        return;
                }
        }
}

/*
 * Transform init hook: initialise the mutex of the drbg_state that lives
 * in the transform context. Always returns 0.
 */
static int drbg_kcapi_init(struct crypto_tfm *tfm)
{
        struct drbg_state *drbg = crypto_tfm_ctx(tfm);

        mutex_init(&drbg->drbg_mutex);

        return 0;
}

/*
 * Transform exit hook: hand the drbg_state stored in the transform context
 * to drbg_uninstantiate().
 */
static void drbg_kcapi_cleanup(struct crypto_tfm *tfm)
{
        drbg_uninstantiate(crypto_tfm_ctx(tfm));
}

/*
 * Generate random numbers invoked by the kernel crypto API:
 * The API of the kernel crypto API is extended as follows:
 *
 * src is additional input supplied to the RNG.
 * slen is the length of src.
 * dst is the output buffer where random data is to be stored.
 * dlen is the length of dst.
 */
/*
 * .generate hook: wrap the optional additional input (@src, @slen) in a
 * drbg_string and forward the request to drbg_generate_long(). With
 * @slen == 0 no additional input is passed.
 */
static int drbg_kcapi_random(struct crypto_rng *tfm,
                             const u8 *src, unsigned int slen,
                             u8 *dst, unsigned int dlen)
{
        struct drbg_state *drbg = crypto_rng_ctx(tfm);
        struct drbg_string addtl_input;
        struct drbg_string *addtl = NULL;

        if (slen != 0) {
                /* a local drbg_string, so the list linkage may be modified */
                drbg_string_fill(&addtl_input, src, slen);
                addtl = &addtl_input;
        }

        return drbg_generate_long(drbg, dst, dlen, addtl);
}

/*
 * Seed the DRBG invoked by the kernel crypto API
 */
static int drbg_kcapi_seed(struct crypto_rng *tfm,
                           const u8 *seed, unsigned int slen)
{
        struct drbg_state *drbg = crypto_rng_ctx(tfm);
        struct crypto_tfm *tfm_base = crypto_rng_tfm(tfm);
        bool pr = false;
        struct drbg_string string;
        struct drbg_string *seed_string = NULL;
        int coreref = 0;

        drbg_convert_tfm_core(crypto_tfm_alg_driver_name(tfm_base), &coreref,
                              &pr);
        if (0 < slen) {
                drbg_string_fill(&string, seed, slen);
                seed_string = &string;
        }

        return drbg_instantiate(drbg, seed_string, coreref, pr);
}

/***************************************************************
 * Kernel module: code to load the module
 ***************************************************************/

/*
 * Tests as defined in 11.3.2 in addition to the cipher tests: testing
 * of the error handling.
 *
 * Note: testing of failing seed source as defined in 11.3.2 is not applicable
 * as seed source of get_random_bytes does not fail.
 *
 * Note 2: There is no sensible way of testing the reseed counter
 * enforcement, so skip it.
 */
static inline int __init drbg_healthcheck_sanity(void)
{
        int len = 0;
#define OUTBUFLEN 16
        unsigned char buf[OUTBUFLEN];
        struct drbg_state *drbg = NULL;
        int ret;
        /* stays -EFAULT until every check below has run */
        int rc = -EFAULT;
        bool pr = false;
        int coreref = 0;
        struct drbg_string addtl;
        size_t max_addtllen, max_request_bytes;

        /* only perform test in FIPS mode */
        if (!fips_enabled)
                return 0;

        /*
         * Select the core to test. Each enabled option overwrites coreref,
         * so the last enabled block below decides which core is used.
         */
#ifdef CONFIG_CRYPTO_DRBG_CTR
        drbg_convert_tfm_core("drbg_nopr_ctr_aes256", &coreref, &pr);
#endif
#ifdef CONFIG_CRYPTO_DRBG_HASH
        drbg_convert_tfm_core("drbg_nopr_sha256", &coreref, &pr);
#endif
#ifdef CONFIG_CRYPTO_DRBG_HMAC
        drbg_convert_tfm_core("drbg_nopr_hmac_sha512", &coreref, &pr);
#endif

        drbg = kzalloc(sizeof(struct drbg_state), GFP_KERNEL);
        if (!drbg)
                return -ENOMEM;

        /* minimal state: only the mutex, core and reseed threshold are set */
        mutex_init(&drbg->drbg_mutex);
        drbg->core = &drbg_cores[coreref];
        drbg->reseed_threshold = drbg_max_requests(drbg);

        /*
         * if the following tests fail, it is likely that there is a buffer
         * overflow as buf is much smaller than the requested or provided
         * string lengths -- in case the error handling does not succeed
         * we may get an OOPS. And we want to get an OOPS as this is a
         * grave bug.
         */

        max_addtllen = drbg_max_addtl(drbg);
        max_request_bytes = drbg_max_request_bytes(drbg);
        /* addtl claims one byte more than the maximum; buf is only OUTBUFLEN */
        drbg_string_fill(&addtl, buf, max_addtllen + 1);
        /* overflow addtllen with additional info string */
        len = drbg_generate(drbg, buf, OUTBUFLEN, &addtl);
        BUG_ON(0 < len);
        /* overflow max_bits */
        len = drbg_generate(drbg, buf, (max_request_bytes + 1), NULL);
        BUG_ON(0 < len);

        /* overflow max addtllen with personalization string */
        ret = drbg_seed(drbg, &addtl, false);
        /* seeding with the oversized string must be rejected */
        BUG_ON(0 == ret);
        /* all tests passed */
        rc = 0;

        pr_devel("DRBG: Sanity tests for failure code paths successfully "
                 "completed\n");

        kfree(drbg);
        return rc;
}

/*
 * Registration slots, two per drbg_cores[] entry (with and without
 * prediction resistance). drbg_init() refuses to load when drbg_cores[]
 * needs more slots than are available here.
 */
static struct rng_alg drbg_algs[22];

/*
 * Fill the array drbg_algs used to register the different DRBGs
 * with the kernel crypto API. To fill the array, the information
 * from drbg_cores[] is used.
 */
/*
 * Fill one rng_alg registration slot for @core.
 *
 * @alg: slot to initialise
 * @core: DRBG core whose cra_name forms the tail of the driver name
 * @pr: non-zero to build "drbg_pr_<core>", zero for "drbg_nopr_<core>"
 *
 * Every call hands out a priority one higher than the previous call,
 * starting at 200, so slots filled later win. Both names are written with
 * their terminating NUL and the driver name is bounded by
 * CRYPTO_MAX_ALG_NAME, so the result does not depend on @alg having been
 * zeroed beforehand.
 */
static inline void __init drbg_fill_array(struct rng_alg *alg,
                                          const struct drbg_core *core, int pr)
{
        static int priority = 200;

        /* sizeof() includes the terminating NUL of the literal */
        memcpy(alg->base.cra_name, "stdrng", sizeof("stdrng"));
        snprintf(alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s%s",
                 pr ? "drbg_pr_" : "drbg_nopr_", core->cra_name);

        alg->base.cra_priority = priority;
        priority++;
        /*
         * If FIPS mode enabled, the selected DRBG shall have the
         * highest cra_priority over other stdrng instances to ensure
         * it is selected.
         */
        if (fips_enabled)
                alg->base.cra_priority += 200;

        alg->base.cra_ctxsize   = sizeof(struct drbg_state);
        alg->base.cra_module    = THIS_MODULE;
        alg->base.cra_init      = drbg_kcapi_init;
        alg->base.cra_exit      = drbg_kcapi_cleanup;
        alg->generate           = drbg_kcapi_random;
        alg->seed               = drbg_kcapi_seed;
        alg->set_ent            = drbg_kcapi_set_entropy;
        alg->seedsize           = 0;
}

/*
 * Module init: run the failure-path sanity check, then fill and register
 * two rng_alg entries per DRBG core.
 *
 * Returns 0 on success, the sanity check's error, -EFAULT when drbg_algs[]
 * is too small, or the error from crypto_register_rngs().
 */
static int __init drbg_init(void)
{
        const size_t slots_needed = ARRAY_SIZE(drbg_cores) * 2;
        struct rng_alg *slot = drbg_algs;
        size_t core;
        int err;

        err = drbg_healthcheck_sanity();
        if (err)
                return err;

        if (slots_needed > ARRAY_SIZE(drbg_algs)) {
                pr_info("DRBG: Cannot register all DRBG types"
                        "(slots needed: %zu, slots available: %zu)\n",
                        slots_needed, ARRAY_SIZE(drbg_algs));
                return -EFAULT;
        }

        /*
         * Every core is registered twice: once with and once without
         * prediction resistance. Slots filled later get a higher
         * cra_priority, so the prediction resistance variants, which
         * should be of less interest, are filled first.
         */
        for (core = 0; core < ARRAY_SIZE(drbg_cores); core++)
                drbg_fill_array(slot++, &drbg_cores[core], 1);
        for (core = 0; core < ARRAY_SIZE(drbg_cores); core++)
                drbg_fill_array(slot++, &drbg_cores[core], 0);

        return crypto_register_rngs(drbg_algs, slots_needed);
}

/*
 * Module exit: unregister the same number of drbg_algs[] entries that
 * drbg_init() registered (two per drbg_cores[] entry).
 */
static void __exit drbg_exit(void)
{
        crypto_unregister_rngs(drbg_algs, (ARRAY_SIZE(drbg_cores) * 2));
}

module_init(drbg_init);
module_exit(drbg_exit);
#ifndef CRYPTO_DRBG_HASH_STRING
#define CRYPTO_DRBG_HASH_STRING ""
#endif
#ifndef CRYPTO_DRBG_HMAC_STRING
#define CRYPTO_DRBG_HMAC_STRING ""
#endif
#ifndef CRYPTO_DRBG_CTR_STRING
#define CRYPTO_DRBG_CTR_STRING ""
#endif
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
MODULE_DESCRIPTION("NIST SP800-90A Deterministic Random Bit Generator (DRBG) "
                   "using following cores: "
                   CRYPTO_DRBG_HASH_STRING
                   CRYPTO_DRBG_HMAC_STRING
                   CRYPTO_DRBG_CTR_STRING);
MODULE_ALIAS_CRYPTO("stdrng");
MODULE_IMPORT_NS("CRYPTO_INTERNAL");
































    1 














    1 



    1 



    1 

















































    1 





    9 







    1 











    8 



    9 


    1 



    9 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API.
 *
 * RNG operations.
 *
 * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com>
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/rng.h>
#include <linux/atomic.h>
#include <linux/cryptouser.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <net/netlink.h>

#include "internal.h"

static DEFINE_MUTEX(crypto_default_rng_lock);
struct crypto_rng *crypto_default_rng;
EXPORT_SYMBOL_GPL(crypto_default_rng);
static int crypto_default_rng_refcnt;

/*
 * (Re)seed @tfm through the algorithm's ->seed() hook.
 *
 * When @seed is NULL but @slen is non-zero, @slen random bytes are
 * obtained with get_random_bytes_wait() into a temporary buffer and used
 * as the seed; that buffer is wiped and freed before returning.
 *
 * Returns the ->seed() result, -ENOMEM if the temporary buffer cannot be
 * allocated, or the error from get_random_bytes_wait().
 */
int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
{
        u8 *rand_seed = NULL;
        int ret;

        if (slen && !seed) {
                rand_seed = kmalloc(slen, GFP_KERNEL);
                if (rand_seed == NULL)
                        return -ENOMEM;

                ret = get_random_bytes_wait(rand_seed, slen);
                if (ret)
                        goto free_seed;

                seed = rand_seed;
        }

        ret = crypto_rng_alg(tfm)->seed(tfm, seed, slen);

free_seed:
        /* a no-op when no temporary seed was allocated */
        kfree_sensitive(rand_seed);
        return ret;
}
EXPORT_SYMBOL_GPL(crypto_rng_reset);

/* .init_tfm hook of crypto_rng_type: nothing to set up, always returns 0. */
static int crypto_rng_init_tfm(struct crypto_tfm *tfm)
{
        return 0;
}

/* Return the seedsize of the rng_alg that embeds @alg as its base. */
static unsigned int seedsize(struct crypto_alg *alg)
{
        return container_of(alg, struct rng_alg, base)->seedsize;
}

/*
 * .report hook: emit a CRYPTOCFGA_REPORT_RNG netlink attribute holding the
 * type string "rng" and the algorithm's seed size. The report structure is
 * cleared in full before being filled. Returns the nla_put() result.
 */
static int __maybe_unused crypto_rng_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct crypto_report_rng report;

        memset(&report, 0, sizeof(report));

        report.seedsize = seedsize(alg);
        strscpy(report.type, "rng", sizeof(report.type));

        return nla_put(skb, CRYPTOCFGA_REPORT_RNG, sizeof(report), &report);
}

static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
{
        seq_printf(m, "type         : rng\n");
        seq_printf(m, "seedsize     : %u\n", seedsize(alg));
}

/*
 * Type descriptor for RNG algorithms; passed to crypto_alloc_tfm() by
 * crypto_alloc_rng() and installed as cra_type by crypto_register_rng().
 */
static const struct crypto_type crypto_rng_type = {
        .extsize = crypto_alg_extsize,
        .init_tfm = crypto_rng_init_tfm,
        /* the show hook is only wired up when procfs is configured */
#ifdef CONFIG_PROC_FS
        .show = crypto_rng_show,
#endif
        /* the report hook is only wired up when CRYPTO_USER is enabled */
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_rng_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_MASK,
        .type = CRYPTO_ALG_TYPE_RNG,
        /* offsets of the generic base inside the RNG-specific structures */
        .tfmsize = offsetof(struct crypto_rng, base),
        .algsize = offsetof(struct rng_alg, base),
};

/*
 * Allocate an RNG transform for @alg_name by calling crypto_alloc_tfm()
 * with crypto_rng_type; @type and @mask are passed through unchanged.
 * NOTE(review): crypto_get_default_rng() checks the result with IS_ERR(),
 * so failures appear to be reported as ERR_PTR values - confirm.
 */
struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask)
{
        return crypto_alloc_tfm(alg_name, &crypto_rng_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_alloc_rng);

/*
 * Take a reference on the shared default RNG, creating it on first use.
 *
 * If no default RNG exists, a "stdrng" instance is allocated and seeded
 * via crypto_rng_reset() with a NULL seed of crypto_rng_seedsize() bytes.
 * The reference count is only incremented when an RNG is available.
 *
 * Returns 0 on success, or the allocation or seeding error.
 */
int crypto_get_default_rng(void)
{
        struct crypto_rng *rng;
        int err = 0;

        mutex_lock(&crypto_default_rng_lock);

        if (crypto_default_rng)
                goto take_ref;

        rng = crypto_alloc_rng("stdrng", 0, 0);
        if (IS_ERR(rng)) {
                err = PTR_ERR(rng);
                goto unlock;
        }

        err = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng));
        if (err) {
                crypto_free_rng(rng);
                goto unlock;
        }

        crypto_default_rng = rng;

take_ref:
        crypto_default_rng_refcnt++;

unlock:
        mutex_unlock(&crypto_default_rng_lock);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_get_default_rng);

/*
 * Drop one reference on the default RNG under crypto_default_rng_lock.
 * The RNG itself is not freed here; crypto_del_default_rng() does that
 * once the count is zero.
 */
void crypto_put_default_rng(void)
{
        mutex_lock(&crypto_default_rng_lock);
        crypto_default_rng_refcnt--;
        mutex_unlock(&crypto_default_rng_lock);
}
EXPORT_SYMBOL_GPL(crypto_put_default_rng);

#if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE)
/*
 * Free the default RNG if nobody holds a reference on it.
 *
 * Returns 0 when the RNG was released and the pointer cleared, or -EBUSY
 * when the reference count is non-zero.
 */
int crypto_del_default_rng(void)
{
        int err = 0;

        mutex_lock(&crypto_default_rng_lock);

        if (crypto_default_rng_refcnt) {
                err = -EBUSY;
        } else {
                crypto_free_rng(crypto_default_rng);
                crypto_default_rng = NULL;
        }

        mutex_unlock(&crypto_default_rng_lock);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_del_default_rng);
#endif

/*
 * No-op ->set_ent() implementation; crypto_register_rng() installs it for
 * algorithms that do not provide their own.
 */
static void rng_default_set_ent(struct crypto_rng *tfm, const u8 *data,
                                unsigned int len)
{
}

/*
 * Register one RNG algorithm.
 *
 * Rejects a seedsize above PAGE_SIZE / 8 with -EINVAL, marks the base
 * algorithm as an RNG type, supplies the no-op ->set_ent() when none is
 * set, and returns the crypto_register_alg() result.
 */
int crypto_register_rng(struct rng_alg *alg)
{
        struct crypto_alg *base = &alg->base;

        if (alg->seedsize > PAGE_SIZE / 8)
                return -EINVAL;

        if (!alg->set_ent)
                alg->set_ent = rng_default_set_ent;

        base->cra_type = &crypto_rng_type;
        base->cra_flags = (base->cra_flags & ~CRYPTO_ALG_TYPE_MASK) |
                          CRYPTO_ALG_TYPE_RNG;

        return crypto_register_alg(base);
}
EXPORT_SYMBOL_GPL(crypto_register_rng);

/* Unregister one RNG algorithm by unregistering its base crypto_alg. */
void crypto_unregister_rng(struct rng_alg *alg)
{
        crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_rng);

/*
 * Register @count RNG algorithms from the array @algs, in order.
 *
 * On the first failure the entries registered so far are unregistered in
 * reverse order and that error is returned. Returns 0 when all succeed.
 */
int crypto_register_rngs(struct rng_alg *algs, int count)
{
        int i, ret;

        for (i = 0; i < count; i++) {
                ret = crypto_register_rng(&algs[i]);
                if (!ret)
                        continue;

                /* roll back entries i-1 .. 0 */
                while (i-- > 0)
                        crypto_unregister_rng(&algs[i]);
                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_rngs);

/* Unregister @count RNG algorithms from @algs, last entry first. */
void crypto_unregister_rngs(struct rng_alg *algs, int count)
{
        while (count-- > 0)
                crypto_unregister_rng(&algs[count]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_rngs);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Random Number Generator");













































    3 
    3 







    3 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API.
 *
 * Single-block cipher operations.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/algapi.h>
#include <crypto/internal/cipher.h>
#include <linux/kernel.h>
#include <linux/crypto.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include "internal.h"

static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key,
                            unsigned int keylen)
{
        struct cipher_alg *cia = crypto_cipher_alg(tfm);
        unsigned long alignmask = crypto_cipher_alignmask(tfm);
        int ret;
        u8 *buffer, *alignbuffer;
        unsigned long absize;

        absize = keylen + alignmask;
        buffer = kmalloc(absize, GFP_ATOMIC);
        if (!buffer)
                return -ENOMEM;

        alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
        memcpy(alignbuffer, key, keylen);
        ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen);
        kfree_sensitive(buffer);
        return ret;

}

/*
 * Set the key of a single-block cipher.
 *
 * Returns -EINVAL when @keylen lies outside the algorithm's
 * [cia_min_keysize, cia_max_keysize] range. A key pointer that violates
 * the alignmask is handled through setkey_unaligned(); otherwise the
 * algorithm's cia_setkey() is called on @key directly.
 */
int crypto_cipher_setkey(struct crypto_cipher *tfm,
                         const u8 *key, unsigned int keylen)
{
        unsigned long alignmask = crypto_cipher_alignmask(tfm);
        struct cipher_alg *cia = crypto_cipher_alg(tfm);

        if (keylen > cia->cia_max_keysize || keylen < cia->cia_min_keysize)
                return -EINVAL;

        if (!((unsigned long)key & alignmask))
                return cia->cia_setkey(crypto_cipher_tfm(tfm), key, keylen);

        return setkey_unaligned(tfm, key, keylen);
}
EXPORT_SYMBOL_NS_GPL(crypto_cipher_setkey, "CRYPTO_INTERNAL");

/*
 * Encrypt (@enc true) or decrypt (@enc false) one block from @src to @dst.
 *
 * If either pointer violates the cipher's alignmask, the block is copied
 * into an aligned on-stack bounce buffer, processed in place there and
 * copied out to @dst; otherwise the algorithm works on the buffers
 * directly.
 */
static inline void cipher_crypt_one(struct crypto_cipher *tfm,
                                    u8 *dst, const u8 *src, bool enc)
{
        struct cipher_alg *cia = crypto_cipher_alg(tfm);
        unsigned long alignmask = crypto_cipher_alignmask(tfm);
        void (*crypt)(struct crypto_tfm *, u8 *, const u8 *);

        if (enc)
                crypt = cia->cia_encrypt;
        else
                crypt = cia->cia_decrypt;

        if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) {
                u8 bounce[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK];
                u8 *aligned = (u8 *)ALIGN((unsigned long)bounce,
                                          alignmask + 1);
                unsigned int blocksize = crypto_cipher_blocksize(tfm);

                memcpy(aligned, src, blocksize);
                crypt(crypto_cipher_tfm(tfm), aligned, aligned);
                memcpy(dst, aligned, blocksize);
        } else {
                crypt(crypto_cipher_tfm(tfm), dst, src);
        }
}

/*
 * Encrypt a single cipher block from @src into @dst; a thin wrapper that
 * calls cipher_crypt_one() with enc == true.
 */
void crypto_cipher_encrypt_one(struct crypto_cipher *tfm,
                               u8 *dst, const u8 *src)
{
        cipher_crypt_one(tfm, dst, src, true);
}
EXPORT_SYMBOL_NS_GPL(crypto_cipher_encrypt_one, "CRYPTO_INTERNAL");

/*
 * Decrypt a single cipher block from @src into @dst; a thin wrapper that
 * calls cipher_crypt_one() with enc == false.
 */
void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
                               u8 *dst, const u8 *src)
{
        cipher_crypt_one(tfm, dst, src, false);
}
EXPORT_SYMBOL_NS_GPL(crypto_cipher_decrypt_one, "CRYPTO_INTERNAL");

/*
 * Create a new cipher transform for the same algorithm as @cipher and
 * copy the source transform's crt_flags to it.
 *
 * Returns the new cipher, ERR_PTR(-ENOSYS) when the algorithm has a
 * cra_init hook, ERR_PTR(-ESTALE) when no algorithm reference can be
 * taken, or the allocation error. The reference taken with
 * crypto_mod_get() is dropped again when the allocation fails.
 */
struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher)
{
        struct crypto_tfm *src_tfm = crypto_cipher_tfm(cipher);
        struct crypto_alg *alg = src_tfm->__crt_alg;
        struct crypto_tfm *new_tfm;

        if (alg->cra_init)
                return ERR_PTR(-ENOSYS);

        if (unlikely(!crypto_mod_get(alg)))
                return ERR_PTR(-ESTALE);

        new_tfm = __crypto_alloc_tfmgfp(alg, CRYPTO_ALG_TYPE_CIPHER,
                                        CRYPTO_ALG_TYPE_MASK, GFP_ATOMIC);
        if (IS_ERR(new_tfm)) {
                crypto_mod_put(alg);
                return ERR_CAST(new_tfm);
        }

        new_tfm->crt_flags = src_tfm->crt_flags;

        return __crypto_cipher_cast(new_tfm);
}
EXPORT_SYMBOL_GPL(crypto_clone_cipher);












































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLBFLUSH_H
#define _ASM_X86_TLBFLUSH_H

#include <linux/mm_types.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>

#include <asm/barrier.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
#include <asm/pgtable.h>

DECLARE_PER_CPU(u64, tlbstate_untag_mask);

void __flush_tlb_all(void);

#define TLB_FLUSH_ALL        -1UL
#define TLB_GENERATION_INVALID        0

void cr4_update_irqsoff(unsigned long set, unsigned long clear);
unsigned long cr4_read_shadow(void);

/*
 * Set the bits in @mask in this cpu's CR4 by calling cr4_update_irqsoff()
 * with nothing to clear.
 * NOTE(review): the _irqsoff suffix suggests the caller must already have
 * interrupts disabled - confirm.
 */
static inline void cr4_set_bits_irqsoff(unsigned long mask)
{
        cr4_update_irqsoff(mask, 0);
}

/*
 * Clear the bits in @mask in this cpu's CR4 by calling cr4_update_irqsoff()
 * with nothing to set.
 * NOTE(review): the _irqsoff suffix suggests the caller must already have
 * interrupts disabled - confirm.
 */
static inline void cr4_clear_bits_irqsoff(unsigned long mask)
{
        cr4_update_irqsoff(0, mask);
}

/*
 * Set the bits in @mask in this cpu's CR4. Local interrupts are saved and
 * disabled around the update and restored afterwards.
 */
static inline void cr4_set_bits(unsigned long mask)
{
        unsigned long flags;

        local_irq_save(flags);
        cr4_set_bits_irqsoff(mask);
        local_irq_restore(flags);
}

/*
 * Clear the bits in @mask in this cpu's CR4. Local interrupts are saved
 * and disabled around the update and restored afterwards.
 */
static inline void cr4_clear_bits(unsigned long mask)
{
        unsigned long flags;

        local_irq_save(flags);
        cr4_clear_bits_irqsoff(mask);
        local_irq_restore(flags);
}

#ifndef MODULE
/*
 * 6 because 6 should be plenty and struct tlb_state will fit in two cache
 * lines.
 */
#define TLB_NR_DYN_ASIDS        6

/* One entry of tlb_state.ctxs[]; the array is indexed by ASID. */
struct tlb_context {
        u64 ctx_id;     /* which mm the TLB's user entries came from */
        u64 tlb_gen;    /* generation of that mm the TLB is current with */
};

/*
 * TLB bookkeeping kept once per CPU (declared per-CPU as cpu_tlbstate):
 * the loaded mm and its ASID, the CR4 shadow, and the per-ASID contexts.
 */
struct tlb_state {
        /*
         * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
         * are on.  This means that it may not match current->active_mm,
         * which will contain the previous user mm when we're in lazy TLB
         * mode even if we've already switched back to swapper_pg_dir.
         *
         * During switch_mm_irqs_off(), loaded_mm will be set to
         * LOADED_MM_SWITCHING during the brief interrupts-off window
         * when CR3 and loaded_mm would otherwise be inconsistent.  This
         * is for nmi_uaccess_okay()'s benefit.
         */
        struct mm_struct *loaded_mm;

#define LOADED_MM_SWITCHING ((struct mm_struct *)1UL)

        /* Last user mm for optimizing IBPB */
        union {
                struct mm_struct        *last_user_mm;
                unsigned long                last_user_mm_spec;
        };

        u16 loaded_mm_asid;
        u16 next_asid;

        /*
         * If set we changed the page tables in such a way that we
         * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
         * This tells us to go invalidate all the non-loaded ctxs[]
         * on the next context switch.
         *
         * The current ctx was kept up-to-date as it ran and does not
         * need to be invalidated.
         */
        bool invalidate_other;

#ifdef CONFIG_ADDRESS_MASKING
        /*
         * Active LAM mode.
         *
         * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM
         * disabled.
         */
        u8 lam;
#endif

        /*
         * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
         * the corresponding user PCID needs a flush next time we
         * switch to it; see SWITCH_TO_USER_CR3.
         */
        unsigned short user_pcid_flush_mask;

        /*
         * Access to this CR4 shadow and to H/W CR4 is protected by
         * disabling interrupts when modifying either one.
         */
        unsigned long cr4;

        /*
         * This is a list of all contexts that might exist in the TLB.
         * There is one per ASID that we use, and the ASID (what the
         * CPU calls PCID) is the index into ctxs.
         *
         * For each context, ctx_id indicates which mm the TLB's user
         * entries came from.  As an invariant, the TLB will never
         * contain entries that are out-of-date as when that mm reached
         * the tlb_gen in the list.
         *
         * To be clear, this means that it's legal for the TLB code to
         * flush the TLB without updating tlb_gen.  This can happen
         * (for now, at least) due to paravirt remote flushes.
         *
         * NB: context 0 is a bit special, since it's also used by
         * various bits of init code.  This is fine -- code that
         * isn't aware of PCID will end up harmlessly flushing
         * context 0.
         */
        struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
};
DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);

/*
 * Lazy-TLB flag for one CPU, kept in its own per-CPU variable
 * (cpu_tlbstate_shared) rather than inside struct tlb_state.
 */
struct tlb_state_shared {
        /*
         * We can be in one of several states:
         *
         *  - Actively using an mm.  Our CPU's bit will be set in
         *    mm_cpumask(loaded_mm) and is_lazy == false;
         *
         *  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
         *    will not be set in mm_cpumask(&init_mm) and is_lazy == false.
         *
         *  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
         *    is set in mm_cpumask(loaded_mm), but is_lazy == true.
         *    We're heuristically guessing that the CR3 load we
         *    skipped more than makes up for the overhead added by
         *    lazy mode.
         */
        bool is_lazy;
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);

bool nmi_uaccess_okay(void);
#define nmi_uaccess_okay nmi_uaccess_okay

/*
 * Initialize this CPU's CR4 shadow (cpu_tlbstate.cr4) with the value
 * returned by __read_cr4().
 */
static inline void cr4_init_shadow(void)
{
        unsigned long current_cr4 = __read_cr4();

        this_cpu_write(cpu_tlbstate.cr4, current_cr4);
}

extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;

/* How many pages can be invalidated with one INVLPGB. */
extern u16 invlpgb_count_max;

extern void initialize_tlbstate_and_flush(void);

/*
 * TLB flushing:
 *
 *  - flush_tlb_all() flushes all processes TLBs
 *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
 *  - flush_tlb_page(vma, vmaddr) flushes one page
 *  - flush_tlb_range(vma, start, end) flushes a range of pages
 *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
 *  - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus
 *
 * ..but the i386 has somewhat limited tlb flushing capabilities,
 * and page-granular flushes are available only on i486 and up.
 */
/* Describes one TLB flush request, e.g. as handed to flush_tlb_multi(). */
struct flush_tlb_info {
        /*
         * We support several kinds of flushes.
         *
         * - Fully flush a single mm.  .mm will be set, .end will be
         *   TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
         *   which the IPI sender is trying to catch us up.
         *
         * - Partially flush a single mm.  .mm will be set, .start and
         *   .end will indicate the range, and .new_tlb_gen will be set
         *   such that the changes between generation .new_tlb_gen-1 and
         *   .new_tlb_gen are entirely contained in the indicated range.
         *
         * - Fully flush all mms whose tlb_gens have been updated.  .mm
         *   will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
         *   will be zero.
         */
        struct mm_struct        *mm;
        unsigned long                start;
        unsigned long                end;
        u64                        new_tlb_gen;
        unsigned int                initiating_cpu;
        /* NOTE(review): presumably log2 of the flush stride, as passed to flush_tlb_mm_range() - confirm */
        u8                        stride_shift;
        u8                        freed_tables;
        u8                        trim_cpumask;
};

void flush_tlb_local(void);
void flush_tlb_one_user(unsigned long addr);
void flush_tlb_one_kernel(unsigned long addr);
void flush_tlb_multi(const struct cpumask *cpumask,
                      const struct flush_tlb_info *info);

/* Return true if @asid is below TLB_NR_DYN_ASIDS, i.e. a dynamic ASID. */
static inline bool is_dyn_asid(u16 asid)
{
        return asid < TLB_NR_DYN_ASIDS;
}

/* Return true for every ASID that is_dyn_asid() rejects. */
static inline bool is_global_asid(u16 asid)
{
        return !is_dyn_asid(asid);
}

#ifdef CONFIG_BROADCAST_TLB_FLUSH
/*
 * Return the global ASID recorded in @mm's context, or 0 when the CPU
 * feature X86_FEATURE_INVLPGB is not enabled.
 */
static inline u16 mm_global_asid(struct mm_struct *mm)
{
        u16 global_asid = 0;

        if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
                global_asid = smp_load_acquire(&mm->context.global_asid);

                /* mm->context.global_asid is either 0, or a global ASID */
                VM_WARN_ON_ONCE(global_asid && is_dyn_asid(global_asid));
        }

        return global_asid;
}

/*
 * Reset @mm's global ASID state (no global ASID, no transition pending).
 * Does nothing unless X86_FEATURE_INVLPGB is enabled.
 */
static inline void mm_init_global_asid(struct mm_struct *mm)
{
        if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
                return;

        mm->context.global_asid = 0;
        mm->context.asid_transition = false;
}

/*
 * Publish @asid as @mm's global ASID.  asid_transition is set first and
 * global_asid is then written with a store-release, which pairs with the
 * smp_load_acquire() in mm_global_asid().
 */
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
{
        /*
         * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
         * finish_asid_transition() needs to observe asid_transition = true
         * once it observes global_asid.
         */
        mm->context.asid_transition = true;
        smp_store_release(&mm->context.global_asid, asid);
}

/* Clear @mm's asid_transition flag; the store is done with WRITE_ONCE(). */
static inline void mm_clear_asid_transition(struct mm_struct *mm)
{
        WRITE_ONCE(mm->context.asid_transition, false);
}

/*
 * Return true if @mm is non-NULL and its asid_transition flag is set.
 * Always false when X86_FEATURE_INVLPGB is not enabled.
 */
static inline bool mm_in_asid_transition(struct mm_struct *mm)
{
        if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
                return false;

        if (!mm)
                return false;

        return READ_ONCE(mm->context.asid_transition);
}
#else
/*
 * !CONFIG_BROADCAST_TLB_FLUSH stubs: there is never a global ASID (0) or a
 * transition in progress (false), and the setters do nothing.
 */
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
static inline void mm_init_global_asid(struct mm_struct *mm) { }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

/*
 * Flush all of @mm: start 0, end TLB_FLUSH_ALL, stride_shift 0 and
 * freed_tables == true.
 */
#define flush_tlb_mm(mm)                                                \
                flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)

/*
 * Flush [@start, @end) in @vma's mm.  The stride is the huge page shift of
 * the VMA's hstate for VM_HUGETLB mappings and PAGE_SHIFT otherwise;
 * freed_tables is always passed as true.  @vma is evaluated more than once.
 */
#define flush_tlb_range(vma, start, end)                                \
        flush_tlb_mm_range((vma)->vm_mm, start, end,                        \
                           ((vma)->vm_flags & VM_HUGETLB)                \
                                ? huge_page_shift(hstate_vma(vma))        \
                                : PAGE_SHIFT, true)

extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, unsigned int stride_shift,
                                bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);

/*
 * Flush the single page of @vma's mm that starts at address @a, with a
 * PAGE_SHIFT stride and freed_tables == false.
 */
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
        unsigned long end = a + PAGE_SIZE;

        flush_tlb_mm_range(vma->vm_mm, a, end, PAGE_SHIFT, false);
}

/*
 * Return true if any CPU other than the current one is set in
 * mm_cpumask(@mm), i.e. the flush would involve remote CPUs and is worth
 * deferring into a batch.
 */
static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
{
        int this_cpu = get_cpu();
        bool has_remote_cpu;

        has_remote_cpu = cpumask_any_but(mm_cpumask(mm), this_cpu) < nr_cpu_ids;
        put_cpu();

        return has_remote_cpu;
}

/* Atomically increment @mm's context.tlb_gen and return the result. */
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
        /*
         * Bump the generation count.  This also serves as a full barrier
         * that synchronizes with switch_mm(): callers are required to order
         * their read of mm_cpumask after their writes to the paging
         * structures.
         */
        return atomic64_inc_return(&mm->context.tlb_gen);
}

/*
 * Record a pending flush of @mm in @batch: bump the mm's TLB generation,
 * OR mm_cpumask(@mm) into the batch's cpumask and mark the batch as having
 * unmapped pages.
 *
 * @start and @end are not used here; the secondary-TLB notifier is called
 * for the whole range 0 .. -1UL.
 */
static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
                struct mm_struct *mm, unsigned long start, unsigned long end)
{
        inc_mm_tlb_gen(mm);
        cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
        batch->unmapped_pages = true;
        mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}

extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);

/*
 * pte_flags_need_flush() decides whether changing page-table entry flags
 * from @oldflags to @newflags requires a TLB flush.
 *
 * @oldflags:      flags before the change
 * @newflags:      flags after the change
 * @ignore_access: if true, differences in _PAGE_ACCESSED are ignored
 *
 * Returns true if a flush_on_clear flag that was set in @oldflags differs
 * in @newflags, or if any flush_on_change flag differs.  Differences in
 * software flags are ignored.  With CONFIG_DEBUG_VM, a differing flag that
 * is in none of the three sets warns once and also returns true.
 */
static inline bool pte_flags_need_flush(unsigned long oldflags,
                                        unsigned long newflags,
                                        bool ignore_access)
{
        /*
         * Flags that require a flush when cleared but not when they are set.
         * Only include flags that would not trigger spurious page-faults.
         * Non-present entries are not cached. Hardware would set the
         * dirty/access bit if needed without a fault.
         */
        const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
                                        _PAGE_ACCESSED;
        const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
                                        _PAGE_SOFTW3 | _PAGE_SOFTW4 |
                                        _PAGE_SAVED_DIRTY;
        const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
                          _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
                          _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
                          _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX;
        unsigned long diff = oldflags ^ newflags;

        /* The three flag sets must not overlap. */
        BUILD_BUG_ON(flush_on_clear & software_flags);
        BUILD_BUG_ON(flush_on_clear & flush_on_change);
        BUILD_BUG_ON(flush_on_change & software_flags);

        /* Ignore software flags */
        diff &= ~software_flags;

        if (ignore_access)
                diff &= ~_PAGE_ACCESSED;

        /*
         * Was any of the 'flush_on_clear' flags cleared between
         * 'oldflags' and 'newflags'?
         */
        if (diff & oldflags & flush_on_clear)
                return true;

        /* Flush on modified flags. */
        if (diff & flush_on_change)
                return true;

        /* Ensure there are no flags that were left behind */
        if (IS_ENABLED(CONFIG_DEBUG_VM) &&
            (diff & ~(flush_on_clear | software_flags | flush_on_change))) {
                VM_WARN_ON_ONCE(1);
                return true;
        }

        return false;
}

/*
 * pte_needs_flush() reports whether replacing @oldpte with @newpte demotes
 * permissions in a way that requires a TLB flush.  It should only be used
 * for userspace PTEs.
 */
static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
{
        /* A PTE that was not present needs no flush, whatever replaces it. */
        if (!(pte_flags(oldpte) & _PAGE_PRESENT))
                return false;

        /*
         * A changed PFN always needs a flush.  Otherwise compare the flags,
         * ignoring the access bit; see comment in ptep_clear_flush_young().
         */
        return pte_pfn(oldpte) != pte_pfn(newpte) ||
               pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte),
                                    true);
}
#define pte_needs_flush pte_needs_flush

/*
 * huge_pmd_needs_flush() reports whether replacing @oldpmd with @newpmd
 * demotes permissions in a way that requires a TLB flush.  It should only
 * be used for userspace huge PMDs.
 */
static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
{
        /* A PMD that was not present needs no flush, whatever replaces it. */
        if (!(pmd_flags(oldpmd) & _PAGE_PRESENT))
                return false;

        /*
         * A changed PFN always needs a flush.  Otherwise compare the flags
         * without ignoring the access bit; see pmdp_clear_flush_young().
         */
        return pmd_pfn(oldpmd) != pmd_pfn(newpmd) ||
               pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd),
                                    false);
}
#define huge_pmd_needs_flush huge_pmd_needs_flush

#ifdef CONFIG_ADDRESS_MASKING
static inline  u64 tlbstate_lam_cr3_mask(void)
{
        u64 lam = this_cpu_read(cpu_tlbstate.lam);

        return lam << X86_CR3_LAM_U57_BIT;
}

/*
 * Update this CPU's cached LAM state: store @lam shifted down by
 * X86_CR3_LAM_U57_BIT in cpu_tlbstate.lam, and @untag_mask in the per-CPU
 * tlbstate_untag_mask.
 */
static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
        this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
        this_cpu_write(tlbstate_untag_mask, untag_mask);
}

#else

/* !CONFIG_ADDRESS_MASKING: no LAM bits are ever set in CR3. */
static inline u64 tlbstate_lam_cr3_mask(void)
{
        return 0;
}

/* !CONFIG_ADDRESS_MASKING: there is no LAM state to update. */
static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
}
#endif
#endif /* !MODULE */

/*
 * Write CR4 twice: first @cr4 with X86_CR4_PGE toggled, then @cr4 itself,
 * so CR4 ends up holding @cr4 again.  Going by the function name, the PGE
 * toggle is what flushes global TLB entries.
 */
static inline void __native_tlb_flush_global(unsigned long cr4)
{
        native_write_cr4(cr4 ^ X86_CR4_PGE);
        native_write_cr4(cr4);
}
#endif /* _ASM_X86_TLBFLUSH_H */






















































































   60 
















































































   43 














































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
/*
 * include/linux/topology.h
 *
 * Written by: Matthew Dobson, IBM Corporation
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <colpatch@us.ibm.com>
 */
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H

#include <linux/arch_topology.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/topology.h>

#ifndef nr_cpus_node
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif

int arch_update_cpu_topology(void);

/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE                10
#define REMOTE_DISTANCE                20
#define DISTANCE_BITS           8
#ifndef node_distance
#define node_distance(from,to)        ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
#ifndef RECLAIM_DISTANCE
/*
 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
 * (in whatever arch specific measurement units returned by node_distance())
 * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
 * on nodes within this distance.
 */
#define RECLAIM_DISTANCE 30
#endif

/*
 * The following tunable allows platforms to override the default node
 * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
 * sufficiently fast that the default value actually hurts
 * performance.
 *
 * AMD EPYC machines use this because even though the 2-hop distance
 * is 32 (3.2x slower than a local memory access) performance actually
 * *improves* if allowed to reclaim memory and load balance tasks
 * between NUMA nodes 2-hops apart.
 */
extern int __read_mostly node_reclaim_distance;

#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS        (1)
#endif

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DECLARE_PER_CPU(int, numa_node);

#ifndef numa_node_id
/*
 * Returns the number of the current node, read from the per-CPU numa_node
 * variable with raw_cpu_read().
 */
static inline int numa_node_id(void)
{
        return raw_cpu_read(numa_node);
}
#endif

#ifndef cpu_to_node
/* Returns the value of the per-CPU numa_node variable belonging to @cpu. */
static inline int cpu_to_node(int cpu)
{
        return per_cpu(numa_node, cpu);
}
#endif

#ifndef set_numa_node
/* Sets the current CPU's per-CPU numa_node variable to @node. */
static inline void set_numa_node(int node)
{
        this_cpu_write(numa_node, node);
}
#endif

#ifndef set_cpu_numa_node
/* Sets the per-CPU numa_node variable belonging to @cpu to @node. */
static inline void set_cpu_numa_node(int cpu, int node)
{
        per_cpu(numa_node, cpu) = node;
}
#endif

#else        /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */

/* Returns the number of the current Node. */
#ifndef numa_node_id
/* Look the current node up through cpu_to_node() on the running CPU. */
static inline int numa_node_id(void)
{
        int this_cpu = raw_smp_processor_id();

        return cpu_to_node(this_cpu);
}
#endif

#endif        /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */

#ifdef CONFIG_HAVE_MEMORYLESS_NODES

/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
 */
DECLARE_PER_CPU(int, _numa_mem_);

#ifndef set_numa_mem
/* Sets the current CPU's per-CPU _numa_mem_ variable to @node. */
static inline void set_numa_mem(int node)
{
        this_cpu_write(_numa_mem_, node);
}
#endif

#ifndef numa_mem_id
/*
 * Returns the number of the nearest node with memory, read from the
 * per-CPU _numa_mem_ variable with raw_cpu_read().
 */
static inline int numa_mem_id(void)
{
        return raw_cpu_read(_numa_mem_);
}
#endif

#ifndef cpu_to_mem
/* Returns the value of the per-CPU _numa_mem_ variable belonging to @cpu. */
static inline int cpu_to_mem(int cpu)
{
        return per_cpu(_numa_mem_, cpu);
}
#endif

#ifndef set_cpu_numa_mem
/* Sets the per-CPU _numa_mem_ variable belonging to @cpu to @node. */
static inline void set_cpu_numa_mem(int cpu, int node)
{
        per_cpu(_numa_mem_, cpu) = node;
}
#endif

#else        /* !CONFIG_HAVE_MEMORYLESS_NODES */

#ifndef numa_mem_id
/*
 * Returns the number of the nearest node with memory.  Without
 * CONFIG_HAVE_MEMORYLESS_NODES this is simply numa_node_id().
 */
static inline int numa_mem_id(void)
{
        return numa_node_id();
}
#endif

#ifndef cpu_to_mem
/* Without CONFIG_HAVE_MEMORYLESS_NODES this is simply cpu_to_node(@cpu). */
static inline int cpu_to_mem(int cpu)
{
        return cpu_to_node(cpu);
}
#endif

#endif        /* [!]CONFIG_HAVE_MEMORYLESS_NODES */

#if defined(topology_die_id) && defined(topology_die_cpumask)
#define TOPOLOGY_DIE_SYSFS
#endif
#if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
#define TOPOLOGY_CLUSTER_SYSFS
#endif
#if defined(topology_book_id) && defined(topology_book_cpumask)
#define TOPOLOGY_BOOK_SYSFS
#endif
#if defined(topology_drawer_id) && defined(topology_drawer_cpumask)
#define TOPOLOGY_DRAWER_SYSFS
#endif

#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu)        ((void)(cpu), -1)
#endif
#ifndef topology_die_id
#define topology_die_id(cpu)                        ((void)(cpu), -1)
#endif
#ifndef topology_cluster_id
#define topology_cluster_id(cpu)                ((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu)                        ((void)(cpu), 0)
#endif
#ifndef topology_book_id
#define topology_book_id(cpu)                        ((void)(cpu), -1)
#endif
#ifndef topology_drawer_id
#define topology_drawer_id(cpu)                        ((void)(cpu), -1)
#endif
#ifndef topology_ppin
#define topology_ppin(cpu)                        ((void)(cpu), 0ull)
#endif
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_cluster_cpumask
#define topology_cluster_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_book_cpumask
#define topology_book_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_drawer_cpumask
#define topology_drawer_cpumask(cpu)                cpumask_of(cpu)
#endif

#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
/* Default cpu_smt_mask(): the topology sibling cpumask of @cpu. */
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
        return topology_sibling_cpumask(cpu);
}
#endif

#ifndef topology_is_primary_thread

/*
 * Default topology_is_primary_thread(): @cpu is the primary thread if it
 * is the first CPU in its own sibling cpumask.
 *
 * When disabling SMT, the primary thread of the SMT will remain
 * enabled/active. Architectures that have a special primary thread
 * (e.g. x86) need to override this function. Otherwise the first
 * thread in the SMT can be made the primary thread.
 *
 * The sibling cpumask of an offline CPU always contains the CPU
 * itself on architectures using the implementation of
 * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology.
 * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for
 * building their topology have to check whether to use this default
 * implementation or to override it.
 */
static inline bool topology_is_primary_thread(unsigned int cpu)
{
        return cpumask_first(topology_sibling_cpumask(cpu)) == cpu;
}
#define topology_is_primary_thread topology_is_primary_thread

#endif

/* Return the cpumask of the node that @cpu belongs to. */
static inline const struct cpumask *cpu_node_mask(int cpu)
{
        int nid = cpu_to_node(cpu);

        return cpumask_of_node(nid);
}

#ifdef CONFIG_NUMA
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
#else
/*
 * !CONFIG_NUMA fallback: @node is ignored and the result is
 * cpumask_nth_and(@cpu, @cpus, cpu_online_mask).
 */
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
        return cpumask_nth_and(cpu, cpus, cpu_online_mask);
}

/* !CONFIG_NUMA fallback: always returns ERR_PTR(-EOPNOTSUPP). */
static inline const struct cpumask *
sched_numa_hop_mask(unsigned int node, unsigned int hops)
{
        return ERR_PTR(-EOPNOTSUPP);
}
#endif        /* CONFIG_NUMA */

/**
 * for_each_node_numadist() - iterate over nodes in increasing distance
 *                              order, starting from a given node
 * @node: the iteration variable and the starting node.
 * @unvisited: a nodemask to keep track of the unvisited nodes.
 *
 * This macro iterates over NUMA node IDs in increasing distance from the
 * starting @node and yields MAX_NUMNODES when all the nodes have been
 * visited.
 *
 * Note that by the time the loop completes, the @unvisited nodemask will
 * be fully cleared, unless the loop exits early.
 *
 * The difference between for_each_node() and for_each_node_numadist() is
 * that the former allows to iterate over nodes in numerical order, whereas
 * the latter iterates over nodes in increasing order of distance.
 *
 * The complexity of this iterator is O(N^2), where N represents the
 * number of nodes, as each iteration involves scanning all nodes to
 * find the one with the shortest distance.
 *
 * NOTE(review): the for-init is a declaration, so "(node) = ..." there
 * declares a loop-local int named after @node.  @node must therefore be a
 * plain identifier, and the caller's own variable appears to keep its
 * starting value after the loop - confirm this is intended.
 *
 * Requires rcu_lock to be held.
 */
#define for_each_node_numadist(node, unvisited)                                        \
        for (int __start = (node),                                                \
             (node) = nearest_node_nodemask((__start), &(unvisited));                \
             (node) < MAX_NUMNODES;                                                \
             node_clear((node), (unvisited)),                                        \
             (node) = nearest_node_nodemask((__start), &(unvisited)))

/**
 * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
 *                          from a given node.
 * @mask: the iteration variable.
 * @node: the NUMA node to start the search from.
 *
 * Requires rcu_lock to be held.
 *
 * Yields cpu_online_mask for @node == NUMA_NO_NODE.
 *
 * The loop ends when sched_numa_hop_mask() returns NULL or an error pointer.
 * @mask and @node are parenthesized in the expansion so that expression
 * arguments (e.g. "x & 1") keep their meaning; note that @node may be
 * evaluated more than once per iteration.
 */
#define for_each_numa_hop_mask(mask, node)                                       \
        for (unsigned int __hops = 0;                                               \
             (mask) = ((node) != NUMA_NO_NODE || __hops) ?                       \
                     sched_numa_hop_mask((node), __hops) :                       \
                     cpu_online_mask,                                               \
             !IS_ERR_OR_NULL(mask);                                               \
             __hops++)

DECLARE_PER_CPU(unsigned long, cpu_scale);

/* Returns the value of the per-CPU cpu_scale variable belonging to @cpu. */
static inline unsigned long topology_get_cpu_scale(int cpu)
{
        return per_cpu(cpu_scale, cpu);
}

void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);

#endif /* _LINUX_TOPOLOGY_H */






















    6 

    7 
    7 







































    6 













    7 







    7 

    7 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <crypto/chacha.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/unaligned.h>

/* One ChaCha quarter round on the state words at indices @a, @b, @c, @d. */
static void chacha_quarter_round(u32 *x, int a, int b, int c, int d)
{
        x[a] += x[b];   x[d] = rol32(x[d] ^ x[a], 16);
        x[c] += x[d];   x[b] = rol32(x[b] ^ x[c], 12);
        x[a] += x[b];   x[d] = rol32(x[d] ^ x[a],  8);
        x[c] += x[d];   x[b] = rol32(x[b] ^ x[c],  7);
}

/*
 * Permute @state in place with @nrounds ChaCha rounds.  Each loop iteration
 * performs two rounds: one over the four columns of the 4x4 word matrix,
 * then one over its four diagonals.  The four quarter rounds within a round
 * operate on disjoint words, so their relative order does not matter.
 */
static void chacha_permute(struct chacha_state *state, int nrounds)
{
        u32 *x = state->x;
        int round;

        /* whitelist the allowed round counts */
        WARN_ON_ONCE(nrounds != 20 && nrounds != 12);

        for (round = 0; round < nrounds; round += 2) {
                /* column round */
                chacha_quarter_round(x, 0, 4,  8, 12);
                chacha_quarter_round(x, 1, 5,  9, 13);
                chacha_quarter_round(x, 2, 6, 10, 14);
                chacha_quarter_round(x, 3, 7, 11, 15);

                /* diagonal round */
                chacha_quarter_round(x, 0, 5, 10, 15);
                chacha_quarter_round(x, 1, 6, 11, 12);
                chacha_quarter_round(x, 2, 7,  8, 13);
                chacha_quarter_round(x, 3, 4,  9, 14);
        }
}

/**
 * chacha_block_generic - generate one keystream block and increment block counter
 * @state: input state matrix
 * @out: output keystream block
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
 * The caller has already converted the endianness of the input.  The output
 * is the permuted state added word-wise to the input state, stored as
 * little-endian 32-bit words.  This function also increments the block
 * counter (word 12) in the input matrix.
 */
void chacha_block_generic(struct chacha_state *state,
                          u8 out[CHACHA_BLOCK_SIZE], int nrounds)
{
        struct chacha_state work = *state;
        int word;

        chacha_permute(&work, nrounds);

        for (word = 0; word < ARRAY_SIZE(state->x); word++) {
                u32 sum = work.x[word] + state->x[word];

                put_unaligned_le32(sum, &out[word * sizeof(u32)]);
        }

        state->x[12]++;
}
EXPORT_SYMBOL(chacha_block_generic);

/**
 * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
 * @state: input state matrix
 * @out: the output words
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
 * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
 * skips the final addition of the initial state, and outputs only certain words
 * of the state: words 0..3 followed by words 12..15 of the permuted matrix.
 * It should not be used for streaming directly.
 */
void hchacha_block_generic(const struct chacha_state *state,
                           u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
        struct chacha_state work = *state;

        chacha_permute(&work, nrounds);

        /* Two runs of four 32-bit words, 16 bytes each. */
        memcpy(out, work.x, 16);
        memcpy(out + 4, work.x + 12, 16);
}
EXPORT_SYMBOL(hchacha_block_generic);


































    4 






    4 





    4 























    4 


























































































































































































































































































































    4 

    4 





    4 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

/* Avoid a dependency loop by declaring here. */
extern int rcuwait_wake_up(struct rcuwait *w);

#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>
#include <linux/cleanup.h>
#include <linux/sched/mm.h>

/*
 * Designated-initializer fragment for the mmap_lock member of the object
 * called @name. The expansion ends with a comma so that it can be dropped
 * directly into an initializer list.
 */
#define MMAP_LOCK_INITIALIZER(name) \
        .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);

#ifdef CONFIG_TRACING

/*
 * Out-of-line tracepoint emitters. The inline wrappers in this header call
 * them only after tracepoint_enabled() reports the matching tracepoint as on.
 */
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);

/*
 * Emit the mmap_lock_start_locking trace event for @mm, but only when that
 * tracepoint is currently enabled. @write is forwarded unchanged.
 */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
        if (!tracepoint_enabled(mmap_lock_start_locking))
                return;

        __mmap_lock_do_trace_start_locking(mm, write);
}

/*
 * Emit the mmap_lock_acquire_returned trace event for @mm, but only when that
 * tracepoint is currently enabled. @write and @success are forwarded unchanged.
 */
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
        if (!tracepoint_enabled(mmap_lock_acquire_returned))
                return;

        __mmap_lock_do_trace_acquire_returned(mm, write, success);
}

/*
 * Emit the mmap_lock_released trace event for @mm, but only when that
 * tracepoint is currently enabled. @write is forwarded unchanged.
 */
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
        if (!tracepoint_enabled(mmap_lock_released))
                return;

        __mmap_lock_do_trace_released(mm, write);
}

#else /* !CONFIG_TRACING */

/*
 * CONFIG_TRACING is off: the three trace hooks are empty functions, so the
 * locking helpers below can call them unconditionally.
 */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}

#endif /* CONFIG_TRACING */

/* Assert, via rwsem_assert_held(), that @mm's mmap_lock is held. */
static inline void mmap_assert_locked(const struct mm_struct *mm)
{
        rwsem_assert_held(&mm->mmap_lock);
}

/* Assert, via rwsem_assert_held_write(), that @mm's mmap_lock is write-held. */
static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
        rwsem_assert_held_write(&mm->mmap_lock);
}

#ifdef CONFIG_PER_VMA_LOCK

/* Initialise the mm_lock_seq sequence counter embedded in @mm. */
static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
        seqcount_init(&mm->mm_lock_seq);
}

/*
 * Open a write section on @mm's mm_lock_seq. In this header it is called
 * right after the mmap_lock has been taken for writing.
 */
static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
{
        do_raw_write_seqcount_begin(&mm->mm_lock_seq);
}

/*
 * Close the write section on @mm's mm_lock_seq. The ASSERT_EXCLUSIVE_WRITER()
 * annotation is applied to the counter before the section is ended.
 */
static inline void mm_lock_seqcount_end(struct mm_struct *mm)
{
        ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
        do_raw_write_seqcount_end(&mm->mm_lock_seq);
}

/*
 * Try to start a speculative (lockless) read section against @mm.
 * @seq: handed to raw_seqcount_try_begin() as *seq; the same value is later
 *       passed to mmap_lock_speculate_retry().
 * Returns whatever raw_seqcount_try_begin() returns; a false result means the
 * caller should fall back to taking the lock (see the comment below).
 */
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
        /*
         * Since mmap_lock is a sleeping lock, and waiting for it to become
         * unlocked is more or less equivalent with taking it ourselves, don't
         * bother with the speculative path if mmap_lock is already write-locked
         * and take the slow path, which takes the lock.
         */
        return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
}

/*
 * Check a speculative section started with sequence value @seq.
 * Returns the result of read_seqcount_retry() on @mm's mm_lock_seq; true
 * means the speculative work has to be redone.
 */
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
        return read_seqcount_retry(&mm->mm_lock_seq, seq);
}

/*
 * Initialise the per-VMA lock state of @vma.
 * @reset_refcnt: when true, vm_refcnt is also set to 0.
 *
 * vm_lock_seq is always set to UINT_MAX. With CONFIG_DEBUG_LOCK_ALLOC the
 * lockdep map is initialised too; all VMAs share one static lock class key.
 */
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        static struct lock_class_key lockdep_key;

        lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
#endif
        if (reset_refcnt)
                refcount_set(&vma->vm_refcnt, 0);
        vma->vm_lock_seq = UINT_MAX;
}

static inline bool is_vma_writer_only(int refcnt)
{
        /*
         * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
         * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
         * a detached vma happens only in vma_mark_detached() and is a rare
         * case, therefore most of the time there will be no unnecessary wakeup.
         */
        return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
}

/*
 * Drop one reference on @vma->vm_refcnt and release the lockdep annotation.
 * If the count did not reach zero and the remaining value indicates that only
 * a writer is left, wake the waiter on mm->vma_writer_wait.
 */
static inline void vma_refcount_put(struct vm_area_struct *vma)
{
        /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
        struct mm_struct *mm = vma->vm_mm;
        int oldcnt;

        rwsem_release(&vma->vmlock_dep_map, _RET_IP_);

        /* Count reached zero: nothing further to do here. */
        if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt))
                return;

        /* oldcnt is the pre-decrement value, so test the new count. */
        if (is_vma_writer_only(oldcnt - 1))
                rcuwait_wake_up(&mm->vma_writer_wait);
}

/*
 * Use only while holding mmap read lock which guarantees that locking will not
 * fail (nobody can concurrently write-lock the vma). vma_start_read() should
 * not be used in such cases because it might fail due to mm_lock_seq overflow.
 * This functionality is used to obtain vma read lock and drop the mmap read lock.
 *
 * Returns true once a reference on vma->vm_refcnt has been taken, false if the
 * limited increment (bounded by VMA_REF_LIMIT) did not succeed.
 *
 * NOTE(review): @subclass is not referenced in this body; rwsem_acquire_read()
 * is called with a literal 0 subclass. Confirm whether that is intentional.
 */
static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
        int oldcnt;

        mmap_assert_locked(vma->vm_mm);
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VMA_REF_LIMIT)))
                return false;

        rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
        return true;
}

/*
 * Use only while holding mmap read lock which guarantees that locking will not
 * fail (nobody can concurrently write-lock the vma). vma_start_read() should
 * not be used in such cases because it might fail due to mm_lock_seq overflow.
 * This functionality is used to obtain vma read lock and drop the mmap read lock.
 *
 * Equivalent to vma_start_read_locked_nested(vma, 0); returns its result.
 */
static inline bool vma_start_read_locked(struct vm_area_struct *vma)
{
        return vma_start_read_locked_nested(vma, 0);
}

/* End a per-VMA read section by dropping the reference via vma_refcount_put(). */
static inline void vma_end_read(struct vm_area_struct *vma)
{
        vma_refcount_put(vma);
}

/*
 * Report whether @vma is already write-locked in the current mmap_lock write
 * section, i.e. whether vma->vm_lock_seq equals the mm's current sequence.
 * The sequence value read from the mm is also stored in *@mm_lock_seq.
 *
 * WARNING! Can only be used if mmap_lock is expected to be write-locked
 *
 * Declared inline like every other helper in this header, so that a plain
 * static definition is not instantiated in each including translation unit.
 */
static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{
        mmap_assert_write_locked(vma->vm_mm);

        /*
         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
         * mm->mm_lock_seq can't be concurrently modified.
         */
        *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
        return (vma->vm_lock_seq == *mm_lock_seq);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);

/*
 * Begin writing to a VMA.
 * Exclude concurrent readers under the per-VMA lock until the currently
 * write-locked mmap_lock is dropped or downgraded.
 *
 * Does nothing when the VMA is already write-locked in this section.
 */
static inline void vma_start_write(struct vm_area_struct *vma)
{
        unsigned int mm_lock_seq;

        /* The helper also fills mm_lock_seq for the slow path below. */
        if (!__is_vma_write_locked(vma, &mm_lock_seq))
                __vma_start_write(vma, mm_lock_seq);
}

/* VM_BUG_ON_VMA() unless __is_vma_write_locked() reports @vma as write-locked. */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
        unsigned int mm_lock_seq;

        VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

/*
 * Assert that @vma is locked in some way: the check trips only when
 * vm_refcnt is <= 1 and the vma is also not write-locked.
 */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        unsigned int mm_lock_seq;

        VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
                      !__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

/*
 * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
 * assertions should be made either under mmap_write_lock or when the object
 * has been isolated under mmap_write_lock, ensuring no competing writers.
 */
/* Warn once if @vma's vm_refcnt reads as zero (i.e. it looks detached). */
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
        WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
}

/* Warn once if @vma's vm_refcnt reads as non-zero (i.e. it looks attached). */
static inline void vma_assert_detached(struct vm_area_struct *vma)
{
        WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
}

/*
 * Mark @vma as attached by setting vm_refcnt to 1 with release semantics.
 * The vma must be write-locked and currently detached; both are asserted.
 */
static inline void vma_mark_attached(struct vm_area_struct *vma)
{
        vma_assert_write_locked(vma);
        vma_assert_detached(vma);
        refcount_set_release(&vma->vm_refcnt, 1);
}

void vma_mark_detached(struct vm_area_struct *vma);

struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address);

/*
 * Locks next vma pointed by the iterator. Confirms the locked vma has not
 * been modified and will retry under mmap_lock protection if modification
 * was detected. Should be called from read RCU section.
 * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the
 * process was interrupted.
 */
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
                                     struct vma_iterator *iter,
                                     unsigned long address);

#else /* CONFIG_PER_VMA_LOCK */

/* Without CONFIG_PER_VMA_LOCK the mm_lock_seq helpers are no-ops. */
static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}

/* Stub: speculation never starts, so this always returns false. */
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
        return false;
}

/* Stub: always returns true, i.e. always asks the caller to retry. */
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
        return true;
}
/*
 * Per-VMA lock stubs for !CONFIG_PER_VMA_LOCK. All are empty except:
 *  - vma_start_read() always returns NULL;
 *  - vma_assert_write_locked() asserts the mmap_lock is write-held instead.
 */
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
                { return NULL; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
                { mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_assert_attached(struct vm_area_struct *vma) {}
static inline void vma_assert_detached(struct vm_area_struct *vma) {}
static inline void vma_mark_attached(struct vm_area_struct *vma) {}
static inline void vma_mark_detached(struct vm_area_struct *vma) {}

/* Stub: no per-VMA locking available, so no VMA is ever returned (NULL). */
static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                unsigned long address)
{
        return NULL;
}

/* Without per-VMA locks, "vma locked" means the owning mm's mmap_lock is held. */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        mmap_assert_locked(vma->vm_mm);
}

#endif /* CONFIG_PER_VMA_LOCK */

/*
 * Take @mm's mmap_lock for writing.
 * Order: trace "start locking", down_write(), open the mm_lock_seq write
 * section, then trace a successful write acquisition.
 */
static inline void mmap_write_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write(&mm->mmap_lock);
        mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

/*
 * Same sequence as mmap_write_lock(), but acquires the rwsem with
 * down_write_nested() and the given @subclass.
 */
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write_nested(&mm->mmap_lock, subclass);
        mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
        int ret;

        __mmap_lock_trace_start_locking(mm, true);
        ret = down_write_killable(&mm->mmap_lock);
        if (!ret)
                mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, ret == 0);
        return ret;
}

/*
 * Drop all currently-held per-VMA locks.
 * This is called from the mmap_lock implementation directly before releasing
 * a write-locked mmap_lock (or downgrading it to read-locked).
 * This should normally NOT be called manually from other places.
 * If you want to call this manually anyway, keep in mind that this will release
 * *all* VMA write locks, including ones from further up the stack.
 *
 * Implementation: asserts the mmap_lock is write-held, then closes the
 * mm_lock_seq write section.
 */
static inline void vma_end_write_all(struct mm_struct *mm)
{
        mmap_assert_write_locked(mm);
        mm_lock_seqcount_end(mm);
}

/*
 * Release a write-held mmap_lock: trace the release, drop all VMA write
 * locks via vma_end_write_all(), then up_write().
 */
static inline void mmap_write_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, true);
        vma_end_write_all(mm);
        up_write(&mm->mmap_lock);
}

/*
 * Downgrade a write-held mmap_lock to read-held.
 * The trace event emitted is "acquire returned" with write=false and
 * success=true; VMA write locks are dropped before downgrade_write().
 */
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
        __mmap_lock_trace_acquire_returned(mm, false, true);
        vma_end_write_all(mm);
        downgrade_write(&mm->mmap_lock);
}

/* Take @mm's mmap_lock for reading with down_read(), with trace events around it. */
static inline void mmap_read_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, false);
        down_read(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, true);
}

/*
 * Take @mm's mmap_lock for reading using down_read_killable().
 * Returns the value of down_read_killable(); the acquire-returned trace
 * event reports success exactly when that value is 0.
 */
static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
        int err;

        __mmap_lock_trace_start_locking(mm, false);
        err = down_read_killable(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, !err);

        return err;
}

/*
 * Try to take @mm's mmap_lock for reading without blocking.
 * Returns true when down_read_trylock() returned non-zero, false otherwise;
 * the same outcome is reported through the acquire-returned trace event.
 */
static inline bool mmap_read_trylock(struct mm_struct *mm)
{
        bool locked;

        __mmap_lock_trace_start_locking(mm, false);
        locked = !!down_read_trylock(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, locked);

        return locked;
}

/* Release a read-held mmap_lock: trace the release, then up_read(). */
static inline void mmap_read_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read(&mm->mmap_lock);
}

/*
 * Guard definition named mmap_read_lock for a struct mm_struct pointer: the
 * acquire expression is mmap_read_lock(_T) and the release expression is
 * mmap_read_unlock(_T). See DEFINE_GUARD() in <linux/cleanup.h> for usage.
 */
DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
             mmap_read_lock(_T), mmap_read_unlock(_T))

/*
 * Like mmap_read_unlock(), but releases the rwsem with up_read_non_owner()
 * instead of up_read().
 */
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read_non_owner(&mm->mmap_lock);
}

/* Return rwsem_is_contended() for @mm's mmap_lock. */
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
        return rwsem_is_contended(&mm->mmap_lock);
}

#endif /* _LINUX_MMAP_LOCK_H */





























































































































































































































































   57 
































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NODEMASK_H
#define __LINUX_NODEMASK_H

/*
 * Nodemasks provide a bitmap suitable for representing the
 * set of Node's in a system, one bit position per Node number.
 *
 * See detailed comments in the file linux/bitmap.h describing the
 * data type on which these nodemasks are based.
 *
 * For details of nodemask_parse_user(), see bitmap_parse_user() in
 * lib/bitmap.c.  For details of nodelist_parse(), see bitmap_parselist(),
 * also in bitmap.c.  For details of node_remap(), see bitmap_bitremap in
 * lib/bitmap.c.  For details of nodes_remap(), see bitmap_remap in
 * lib/bitmap.c.  For details of nodes_onto(), see bitmap_onto in
 * lib/bitmap.c.  For details of nodes_fold(), see bitmap_fold in
 * lib/bitmap.c.
 *
 * The available nodemask operations are:
 *
 * void node_set(node, mask)                turn on bit 'node' in mask
 * void node_clear(node, mask)                turn off bit 'node' in mask
 * void nodes_setall(mask)                set all bits
 * void nodes_clear(mask)                clear all bits
 * int node_isset(node, mask)                true iff bit 'node' set in mask
 * int node_test_and_set(node, mask)        test and set bit 'node' in mask
 *
 * void nodes_and(dst, src1, src2)        dst = src1 & src2  [intersection]
 * void nodes_or(dst, src1, src2)        dst = src1 | src2  [union]
 * void nodes_xor(dst, src1, src2)        dst = src1 ^ src2
 * void nodes_andnot(dst, src1, src2)        dst = src1 & ~src2
 * void nodes_complement(dst, src)        dst = ~src
 *
 * int nodes_equal(mask1, mask2)        Does mask1 == mask2?
 * int nodes_intersects(mask1, mask2)        Do mask1 and mask2 intersect?
 * int nodes_subset(mask1, mask2)        Is mask1 a subset of mask2?
 * int nodes_empty(mask)                Is mask empty (no bits set)?
 * int nodes_full(mask)                        Is mask full (all bits set)?
 * int nodes_weight(mask)                Hamming weight - number of set bits
 *
 * unsigned int first_node(mask)        Number lowest set bit, or MAX_NUMNODES
 * unsigned int next_node(node, mask)        Next node past 'node', or MAX_NUMNODES
 * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first,
 *                                        or MAX_NUMNODES
 * unsigned int first_unset_node(mask)        First node not set in mask, or
 *                                        MAX_NUMNODES
 *
 * nodemask_t nodemask_of_node(node)        Return nodemask with bit 'node' set
 * NODE_MASK_ALL                        Initializer - all bits set
 * NODE_MASK_NONE                        Initializer - no bits set
 * unsigned long *nodes_addr(mask)        Array of unsigned long's in mask
 *
 * int nodemask_parse_user(ubuf, ulen, mask)        Parse ascii string as nodemask
 * int nodelist_parse(buf, map)                Parse ascii string as nodelist
 * int node_remap(oldbit, old, new)        newbit = map(old, new)(oldbit)
 * void nodes_remap(dst, src, old, new)        *dst = map(old, new)(src)
 * void nodes_onto(dst, orig, relmap)        *dst = orig relative to relmap
 * void nodes_fold(dst, orig, sz)        dst bits = orig bits mod sz
 *
 * for_each_node_mask(node, mask)        for-loop node over mask
 *
 * int num_online_nodes()                Number of online Nodes
 * int num_possible_nodes()                Number of all possible Nodes
 *
 * int node_random(mask)                Random node with set bit in mask
 *
 * int node_online(node)                Is some node online?
 * int node_possible(node)                Is some node possible?
 *
 * node_set_online(node)                set bit 'node' in node_online_map
 * node_set_offline(node)                clear bit 'node' in node_online_map
 *
 * for_each_node(node)                        for-loop node over node_possible_map
 * for_each_online_node(node)                for-loop node over node_online_map
 *
 * Subtlety:
 * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
 *    to generate slightly worse code.  So use a simple one-line #define
 *    for node_isset(), instead of wrapping an inline inside a macro, the
 *    way we do the other calls.
 *
 * NODEMASK_SCRATCH
 * When doing above logical AND, OR, XOR, Remap operations the callers tend to
 * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large,
 * nodemask_t's consume too much stack space.  NODEMASK_SCRATCH is a helper
 * for such situations. See below and CPUMASK_ALLOC also.
 */

#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/minmax.h>
#include <linux/nodemask_types.h>
#include <linux/random.h>

extern nodemask_t _unused_nodemask_arg_;

/**
 * nodemask_pr_args - printf args to output a nodemask
 * @maskp: nodemask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a nodemask.
 * Expands to two comma-separated arguments: the number of bits followed by
 * the pointer to the bitmap words. @maskp is evaluated twice.
 */
#define nodemask_pr_args(maskp)        __nodemask_pr_numnodes(maskp), \
                                __nodemask_pr_bits(maskp)
/* Bit count to print for @m: MAX_NUMNODES, or 0 when @m is NULL. */
static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
{
        if (!m)
                return 0;

        return MAX_NUMNODES;
}
/* Bitmap words to print for @m: m->bits, or NULL when @m is NULL. */
static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
{
        if (!m)
                return NULL;

        return m->bits;
}

/*
 * The inline keyword gives the compiler room to decide to inline, or
 * not inline a function as it sees best.  However, as these functions
 * are called in both __init and non-__init functions, if they are not
 * inlined we will end up with a section mismatch error (of the type of
 * freeable items not being freed).  So we must use __always_inline here
 * to fix the problem.  If other functions in the future also end up in
 * this situation they will also need to be annotated as __always_inline
 */
/* node_set(node, dst): set bit 'node' in nodemask 'dst' using set_bit(). */
#define node_set(node, dst) __node_set((node), &(dst))
static __always_inline void __node_set(int node, volatile nodemask_t *dstp)
{
        set_bit(node, dstp->bits);
}

/* node_clear(node, dst): clear bit 'node' in nodemask 'dst' using clear_bit(). */
#define node_clear(node, dst) __node_clear((node), &(dst))
static __always_inline void __node_clear(int node, volatile nodemask_t *dstp)
{
        clear_bit(node, dstp->bits);
}

/* nodes_setall(dst): set the first MAX_NUMNODES bits of 'dst' (bitmap_fill). */
#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
{
        bitmap_fill(dstp->bits, nbits);
}

/* nodes_clear(dst): clear the first MAX_NUMNODES bits of 'dst' (bitmap_zero). */
#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
{
        bitmap_zero(dstp->bits, nbits);
}

/*
 * node_isset(node, nodemask): test bit 'node' in 'nodemask' with test_bit().
 * No static inline type checking - see Subtlety (1) above.
 */
#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)

/*
 * node_test_and_set(node, nodemask): set bit 'node' and return the result of
 * test_and_set_bit() for it.
 */
#define node_test_and_set(node, nodemask) \
                        __node_test_and_set((node), &(nodemask))
static __always_inline bool __node_test_and_set(int node, nodemask_t *addr)
{
        return test_and_set_bit(node, addr->bits);
}

/* nodes_and(dst, src1, src2): dst = src1 & src2 over MAX_NUMNODES bits. */
#define nodes_and(dst, src1, src2) \
                        __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* nodes_or(dst, src1, src2): dst = src1 | src2 over MAX_NUMNODES bits. */
#define nodes_or(dst, src1, src2) \
                        __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* nodes_xor(dst, src1, src2): dst = src1 ^ src2 over MAX_NUMNODES bits. */
#define nodes_xor(dst, src1, src2) \
                        __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* nodes_andnot(dst, src1, src2): dst = src1 & ~src2 over MAX_NUMNODES bits. */
#define nodes_andnot(dst, src1, src2) \
                        __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* nodes_copy(dst, src): copy MAX_NUMNODES bits from 'src' into 'dst'. */
#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES)
static __always_inline void __nodes_copy(nodemask_t *dstp,
                                        const nodemask_t *srcp, unsigned int nbits)
{
        bitmap_copy(dstp->bits, srcp->bits, nbits);
}

/* nodes_complement(dst, src): dst = ~src over MAX_NUMNODES bits. */
#define nodes_complement(dst, src) \
                        __nodes_complement(&(dst), &(src), MAX_NUMNODES)
static __always_inline void __nodes_complement(nodemask_t *dstp,
                                        const nodemask_t *srcp, unsigned int nbits)
{
        bitmap_complement(dstp->bits, srcp->bits, nbits);
}

/* nodes_equal(src1, src2): result of bitmap_equal() over MAX_NUMNODES bits. */
#define nodes_equal(src1, src2) \
                        __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
static __always_inline bool __nodes_equal(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_equal(src1p->bits, src2p->bits, nbits);
}

/* nodes_intersects(src1, src2): result of bitmap_intersects() over MAX_NUMNODES bits. */
#define nodes_intersects(src1, src2) \
                        __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
static __always_inline bool __nodes_intersects(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_intersects(src1p->bits, src2p->bits, nbits);
}

/* nodes_subset(src1, src2): is src1 a subset of src2 (bitmap_subset())? */
#define nodes_subset(src1, src2) \
                        __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
static __always_inline bool __nodes_subset(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_subset(src1p->bits, src2p->bits, nbits);
}

/* nodes_empty(src): result of bitmap_empty() over MAX_NUMNODES bits. */
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_empty(srcp->bits, nbits);
}

/* nodes_full(nodemask): result of bitmap_full() over MAX_NUMNODES bits. */
#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_full(srcp->bits, nbits);
}

/* nodes_weight(nodemask): result of bitmap_weight() over MAX_NUMNODES bits, as int. */
#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_weight(srcp->bits, nbits);
}

/* FIXME: better would be to fix all architectures to never return
          > MAX_NUMNODES, then the silly min_ts could be dropped. */

/*
 * first_node(src): index returned by find_first_bit() on 'src', clamped to
 * at most MAX_NUMNODES by min_t().
 */
#define first_node(src) __first_node(&(src))
static __always_inline unsigned int __first_node(const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

/*
 * next_node(n, src): index returned by find_next_bit() on 'src' when the
 * search starts at bit n+1, clamped to at most MAX_NUMNODES by min_t().
 */
#define next_node(n, src) __next_node((n), &(src))
static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

/*
 * Find the next present node in src, starting after node n, wrapping around to
 * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
 */
#define next_node_in(n, src) __next_node_in((n), &(src))
static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
{
        unsigned int following = __next_node(node, srcp);

        /* Nothing set past 'node': wrap around to the first set node. */
        return following != MAX_NUMNODES ? following : __first_node(srcp);
}

/* Make *mask contain exactly one set bit, 'node': clear everything, then set it. */
static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
        __nodes_clear(mask, MAX_NUMNODES);
        __node_set(node, mask);
}

/*
 * nodemask_of_node(node): evaluate to a nodemask_t with only bit 'node' set.
 * When the mask is exactly one unsigned long wide the word is written
 * directly; otherwise init_nodemask_of_node() clears the mask and sets the bit.
 * The temporary has a macro-private name so that a 'node' expression which
 * mentions a caller variable called 'm' is not captured by the expansion.
 */
#define nodemask_of_node(node)                                                \
({                                                                        \
        typeof(_unused_nodemask_arg_) __nodemask_tmp;                        \
        if (sizeof(__nodemask_tmp) == sizeof(unsigned long)) {                \
                __nodemask_tmp.bits[0] = 1UL << (node);                        \
        } else {                                                        \
                init_nodemask_of_node(&__nodemask_tmp, (node));                \
        }                                                                \
        __nodemask_tmp;                                                        \
})

/*
 * first_unset_node(mask): index returned by find_first_zero_bit() on 'mask',
 * clamped to at most MAX_NUMNODES by min_t().
 */
#define first_unset_node(mask) __first_unset_node(&(mask))
static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp)
{
        return min_t(unsigned int, MAX_NUMNODES,
                        find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

/* Value of the last bitmap word of a nodemask with all MAX_NUMNODES bits set. */
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

/*
 * NODE_MASK_ALL: compound literal with all node bits set. When the mask fits
 * in a single long only the last-word value is needed; otherwise every word
 * before the last is ~0UL and the last word is NODE_MASK_LAST_WORD.
 */
#if MAX_NUMNODES <= BITS_PER_LONG

#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#else

#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL,                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#endif

/* NODE_MASK_NONE: compound literal with every bitmap word set to 0UL. */
#define NODE_MASK_NONE                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] =  0UL                        \
} })

/* nodes_addr(src): the underlying array of unsigned longs of nodemask 'src'. */
#define nodes_addr(src) ((src).bits)

/*
 * nodemask_parse_user(ubuf, ulen, dst): parse a user-space buffer into 'dst'
 * with bitmap_parse_user(); its return value is passed through.
 */
#define nodemask_parse_user(ubuf, ulen, dst) \
                __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
static __always_inline int __nodemask_parse_user(const char __user *buf, int len,
                                        nodemask_t *dstp, int nbits)
{
        return bitmap_parse_user(buf, len, dstp->bits, nbits);
}

/*
 * nodelist_parse(buf, dst): parse a list string into 'dst' with
 * bitmap_parselist(); its return value is passed through.
 */
#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
{
        return bitmap_parselist(buf, dstp->bits, nbits);
}

/* node_remap(oldbit, old, new): result of bitmap_bitremap() for 'oldbit'. */
#define node_remap(oldbit, old, new) \
                __node_remap((oldbit), &(old), &(new), MAX_NUMNODES)
static __always_inline int __node_remap(int oldbit,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits);
}

/* nodes_remap(dst, src, old, new): fill 'dst' via bitmap_remap() of 'src'. */
#define nodes_remap(dst, src, old, new) \
                __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES)
static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
}

/* nodes_onto(dst, orig, relmap): fill 'dst' via bitmap_onto() of 'orig' and 'relmap'. */
#define nodes_onto(dst, orig, relmap) \
                __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
                const nodemask_t *relmapp, int nbits)
{
        bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
}

/*
 * nodes_fold(dst, orig, sz): fill 'dst' via bitmap_fold() of 'orig' with size
 * 'sz'. The 'sz' argument is parenthesized like the scalar arguments of the
 * sibling macros.
 */
#define nodes_fold(dst, orig, sz) \
                __nodes_fold(&(dst), &(orig), (sz), MAX_NUMNODES)
static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
                int sz, int nbits)
{
        bitmap_fold(dstp->bits, origp->bits, sz, nbits);
}

/*
 * for_each_node_mask(node, mask): iterate 'node' over the set bits of 'mask'.
 * With more than one possible node it walks first_node()/next_node() until
 * MAX_NUMNODES is reached; with a single node the body runs at most once,
 * for node 0, and only if 'mask' is not empty.
 */
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)                                    \
        for ((node) = first_node(mask);                                    \
             (node) < MAX_NUMNODES;                                    \
             (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
#define for_each_node_mask(node, mask)                                  \
        for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
#endif /* MAX_NUMNODES */

/*
 * Bitmasks that are kept for all the nodes.
 *
 * Each enumerator indexes one nodemask in the node_states[] array.
 * Without CONFIG_HIGHMEM, N_HIGH_MEMORY is an alias of N_NORMAL_MEMORY and
 * does not consume a separate slot. NR_NODE_STATES is the array size.
 */
enum node_states {
        N_POSSIBLE,                /* The node could become online at some point */
        N_ONLINE,                /* The node is online */
        N_NORMAL_MEMORY,        /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,                /* The node has regular or high memory */
#else
        N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
        N_MEMORY,                /* The node has memory(regular, high, movable) */
        N_CPU,                /* The node has one or more cpus */
        N_GENERIC_INITIATOR,        /* The node has one or more Generic Initiators */
        NR_NODE_STATES
};

/*
 * The following particular system nodemasks and operations
 * on them manage all possible and online nodes.
 *
 * node_states[] holds one nodemask per enum node_states value and is
 * indexed by that enum.
 */

extern nodemask_t node_states[NR_NODE_STATES];

#if MAX_NUMNODES > 1
/* Report whether @node is set in the nodemask kept for @state. */
static __always_inline int node_state(int node, enum node_states state)
{
        int is_set = node_isset(node, node_states[state]);

        return is_set;
}

static __always_inline void node_set_state(int node, enum node_states state)
{
        __node_set(node, &node_states[state]);
}

static __always_inline void node_clear_state(int node, enum node_states state)
{
        __node_clear(node, &node_states[state]);
}

/* Return nodes_weight() of the nodemask kept for @state. */
static __always_inline int num_node_state(enum node_states state)
{
        int weight = nodes_weight(node_states[state]);

        return weight;
}

/* Run for_each_node_mask() over the nodemask kept for @__state. */
#define for_each_node_state(__node, __state) \
        for_each_node_mask((__node), node_states[__state])

/* first_node() of the N_ONLINE and N_MEMORY masks respectively. */
#define first_online_node        first_node(node_states[N_ONLINE])
#define first_memory_node        first_node(node_states[N_MEMORY])
/* Return next_node() of @nid within the N_ONLINE nodemask. */
static __always_inline unsigned int next_online_node(int nid)
{
        unsigned int next = next_node(nid, node_states[N_ONLINE]);

        return next;
}
/* Return next_node() of @nid within the N_MEMORY nodemask. */
static __always_inline unsigned int next_memory_node(int nid)
{
        unsigned int next = next_node(nid, node_states[N_MEMORY]);

        return next;
}

/*
 * nr_online_nodes is rewritten by node_set_online()/node_set_offline()
 * with num_node_state(N_ONLINE).  Nothing in this part of the header
 * writes nr_node_ids.
 */
extern unsigned int nr_node_ids;
extern unsigned int nr_online_nodes;

/*
 * Set @nid in the N_ONLINE mask, then refresh nr_online_nodes from the
 * updated mask.
 */
static __always_inline void node_set_online(int nid)
{
        unsigned int online;

        node_set_state(nid, N_ONLINE);

        /* Recount only after the mask has been changed. */
        online = num_node_state(N_ONLINE);
        nr_online_nodes = online;
}

/*
 * Clear @nid in the N_ONLINE mask, then refresh nr_online_nodes from the
 * updated mask.
 */
static __always_inline void node_set_offline(int nid)
{
        unsigned int online;

        node_clear_state(nid, N_ONLINE);

        /* Recount only after the mask has been changed. */
        online = num_node_state(N_ONLINE);
        nr_online_nodes = online;
}

#else

/*
 * Single-node build: node 0 is reported as being in every state and any
 * other node number as being in none; @state is ignored.
 */
static __always_inline int node_state(int node, enum node_states state)
{
        return !node;
}

/* Single-node build: deliberately a no-op; both arguments are ignored. */
static __always_inline void node_set_state(int node, enum node_states state)
{
}

/* Single-node build: deliberately a no-op; both arguments are ignored. */
static __always_inline void node_clear_state(int node, enum node_states state)
{
}

/* Single-node build: every state is reported as holding exactly one node. */
static __always_inline int num_node_state(enum node_states state)
{
        return 1;
}

/*
 * Single-node build: the loop body runs exactly once with @node == 0 and
 * leaves @node == 1 afterwards; @__state is not used.
 */
#define for_each_node_state(node, __state) \
        for ( (node) = 0; (node) == 0; (node) = 1)

/*
 * Constant stand-ins for the multi-node accessors: the first node is 0,
 * the "next" node is always MAX_NUMNODES, and both counters are fixed
 * at 1.  The argument of next_*_node() is not evaluated.
 */
#define first_online_node        0
#define first_memory_node        0
#define next_online_node(nid)        (MAX_NUMNODES)
#define next_memory_node(nid)        (MAX_NUMNODES)
#define nr_node_ids                1U
#define nr_online_nodes                1U

/* Forward to node_set_state()/node_clear_state() with N_ONLINE. */
#define node_set_online(node)           node_set_state((node), N_ONLINE)
#define node_set_offline(node)           node_clear_state((node), N_ONLINE)

#endif

/*
 * node_random() - pick a node from @maskp using find_random_bit().
 *
 * With CONFIG_NUMA and more than one possible node, returns the bit chosen
 * by find_random_bit(), or NUMA_NO_NODE when that value is not below
 * MAX_NUMNODES.  In every other configuration the result is always 0 and
 * @maskp is not examined.
 */
static __always_inline int node_random(const nodemask_t *maskp)
{
#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
        int picked = find_random_bit(maskp->bits, MAX_NUMNODES);

        if (picked >= MAX_NUMNODES)
                return NUMA_NO_NODE;

        return picked;
#else
        return 0;
#endif
}

/* Shorthand for the N_ONLINE and N_POSSIBLE entries of node_states[]. */
#define node_online_map         node_states[N_ONLINE]
#define node_possible_map         node_states[N_POSSIBLE]

/* Counts and membership tests built on num_node_state()/node_state(). */
#define num_online_nodes()        num_node_state(N_ONLINE)
#define num_possible_nodes()        num_node_state(N_POSSIBLE)
#define node_online(node)        node_state((node), N_ONLINE)
#define node_possible(node)        node_state((node), N_POSSIBLE)

/* Iterators over the N_POSSIBLE, N_ONLINE and N_CPU states. */
#define for_each_node(node)           for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
#define for_each_node_with_cpus(node)        for_each_node_state(node, N_CPU)

/*
 * For nodemask scratch area.
 * NODEMASK_ALLOC(type, name, gfp_flags) declares a pointer called @name to
 * an object of the given @type.
 *
 * When NODES_SHIFT > 8 the object comes from kmalloc() with @gfp_flags and
 * NODEMASK_FREE() releases it with kfree(); since the pointer is a
 * kmalloc() result, callers presumably need to check it for NULL.
 * Otherwise the macro declares an object named _<name> in the enclosing
 * scope, points @name at it, ignores @gfp_flags, and NODEMASK_FREE() does
 * nothing.
 */
#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */
#define NODEMASK_ALLOC(type, name, gfp_flags)        \
                        type *name = kmalloc(sizeof(*name), gfp_flags)
#define NODEMASK_FREE(m)                        kfree(m)
#else
#define NODEMASK_ALLOC(type, name, gfp_flags)        type _##name, *name = &_##name
#define NODEMASK_FREE(m)                        do {} while (0)
#endif

/*
 * Example structure for using NODEMASK_ALLOC, used in mempolicy.
 * Bundles two nodemasks so that a single NODEMASK_SCRATCH() provides both.
 */
struct nodemask_scratch {
        nodemask_t        mask1;
        nodemask_t        mask2;
};

/*
 * NODEMASK_SCRATCH(x) is NODEMASK_ALLOC() for a struct nodemask_scratch
 * named @x, using GFP_KERNEL | __GFP_NORETRY as the allocation flags.
 * NODEMASK_SCRATCH_FREE(x) is the matching NODEMASK_FREE().
 */
#define NODEMASK_SCRATCH(x)                                                \
                        NODEMASK_ALLOC(struct nodemask_scratch, x,        \
                                        GFP_KERNEL | __GFP_NORETRY)
#define NODEMASK_SCRATCH_FREE(x)        NODEMASK_FREE(x)


#endif /* __LINUX_NODEMASK_H */



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 









































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <uapi/linux/perf_event.h>
#include <uapi/linux/bpf_perf_event.h>

/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

#ifdef CONFIG_HAVE_HW_BREAKPOINT
# include <linux/rhashtable-types.h>
# include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
#include <linux/security.h>
#include <linux/static_call.h>
#include <linux/lockdep.h>

#include <asm/local.h>

/*
 * A call chain: a count followed by a flexible array of 64-bit entries.
 * NOTE(review): @nr is presumably the number of valid entries in @ip --
 * confirm against the callchain code.
 */
struct perf_callchain_entry {
        u64                                nr;
        u64                                ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};

/*
 * Bookkeeping that accompanies a struct perf_callchain_entry.
 * NOTE(review): the code that reads and updates these fields is not in
 * this header section; judging by the names, @max_stack and @nr bound and
 * count the stored entries while @contexts/@contexts_maxed track context
 * markers -- confirm against the users.
 */
struct perf_callchain_entry_ctx {
        struct perf_callchain_entry        *entry;
        u32                                max_stack;
        u32                                nr;
        short                                contexts;
        bool                                contexts_maxed;
};

/*
 * Copy callback stored in perf_raw_frag::copy.  It receives a destination,
 * a source, an offset and a length.
 * NOTE(review): the meaning of the unsigned long return value is not
 * visible here -- check the callers before relying on it.
 */
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
                                     unsigned long off, unsigned long len);

/*
 * One fragment of raw sample data.
 *
 * @next and @pad share storage, so the same word can be read either as a
 * pointer to another fragment or as a plain integer;
 * perf_raw_frag_last() inspects it through @pad.  @copy is an optional
 * perf_copy_f callback, and @data/@size describe the payload.  The struct
 * is __packed, so the layout has no padding.
 */
struct perf_raw_frag {
        union {
                struct perf_raw_frag        *next;
                unsigned long                pad;
        };
        perf_copy_f                        copy;
        void                                *data;
        u32                                size;
} __packed;

/*
 * A raw record: an embedded first fragment plus a size.
 * NOTE(review): @size presumably covers the whole record rather than just
 * @frag -- confirm against the code that fills it in.
 */
struct perf_raw_record {
        struct perf_raw_frag                frag;
        u32                                size;
};

/*
 * Return true when @frag is the final fragment.
 *
 * The word shared by @next and @pad is read as an integer; the fragment
 * counts as last when that value is smaller than sizeof(u64).
 */
static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
{
        const unsigned long link = frag->pad;

        if (link < sizeof(u64))
                return true;

        return false;
}

/*
 * branch stack layout:
 *  nr: number of taken branches stored in entries[]
 *  hw_idx: The low level index of raw branch records
 *          for the most recent branch.
 *          -1ULL means invalid/unknown.
 *
 * Note that nr can vary from sample to sample
 * branches (to, from) are stored from most recent
 * to least recent, i.e., entries[0] contains the most
 * recent branch.
 * The entries[] is an abstraction of raw branch records,
 * which may not be stored in age order in HW, e.g. Intel LBR.
 * The hw_idx is to expose the low level index of raw
 * branch record for the most recent branch aka entries[0].
 * The hw_idx index is between -1 (unknown) and max depth,
 * which can be retrieved in /sys/devices/cpu/caps/branches.
 * For the architectures whose raw branch records are
 * already stored in age order, the hw_idx should be 0.
 */
struct perf_branch_stack {
        u64                                nr;        /* number of entries[] stored */
        u64                                hw_idx;        /* -1ULL: invalid/unknown */
        struct perf_branch_entry        entries[]; /* entries[0] is the most recent branch */
};

struct task_struct;

/*
 * extra PMU register associated with an event
 *
 * struct hw_perf_event embeds two of these, as extra_reg and branch_reg.
 */
struct hw_perf_event_extra {
        u64                                config;        /* register value */
        unsigned int                        reg;        /* register address or index */
        int                                alloc;        /* extra register already allocated */
        int                                idx;        /* index in shared_regs->regs[] */
};

/**
 * hw_perf_event::flags values
 *
 * PERF_EVENT_FLAG_ARCH bits (the low 28 bits) are reserved for
 * architecture-specific usage.  PERF_EVENT_FLAG_USER_READ_CNT is bit 31;
 * the static_assert() below checks that it stays outside the
 * architecture range.
 */
#define PERF_EVENT_FLAG_ARCH                0x0fffffff
#define PERF_EVENT_FLAG_USER_READ_CNT        0x80000000

static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);

/**
 * struct hw_perf_event - performance event hardware details:
 *
 * Every member is compiled in only when CONFIG_PERF_EVENTS is set;
 * without it the structure has no members.
 *
 * The leading anonymous union overlays the private state of the different
 * event flavours (hardware, aux / Intel-PT, software, tracepoint,
 * amd_power, breakpoint, amd_iommu), so only one of those views is
 * meaningful for a given event.  The members after the union are common
 * to all flavours.
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
        union {
                struct { /* hardware */
                        u64                config;
                        u64                config1;
                        u64                last_tag;
                        u64                dyn_constraint;
                        unsigned long        config_base;
                        unsigned long        event_base;
                        int                event_base_rdpmc;
                        int                idx;
                        int                last_cpu;
                        int                flags;

                        struct hw_perf_event_extra extra_reg;
                        struct hw_perf_event_extra branch_reg;
                };
                struct { /* aux / Intel-PT */
                        u64                aux_config;
                        /*
                         * For AUX area events, aux_paused cannot be a state
                         * flag because it can be updated asynchronously to
                         * state.
                         */
                        unsigned int        aux_paused;
                };
                struct { /* software */
                        struct hrtimer        hrtimer;
                };
                struct { /* tracepoint */
                        /* for tp_event->class */
                        struct list_head        tp_list;
                };
                struct { /* amd_power */
                        u64        pwr_acc;
                        u64        ptsc;
                };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
                        /*
                         * Crufty hack to avoid the chicken and egg
                         * problem hw_breakpoint has with context
                         * creation and event initialization.
                         */
                        struct arch_hw_breakpoint        info;
                        struct rhlist_head                bp_list;
                };
#endif
                struct { /* amd_iommu */
                        u8        iommu_bank;
                        u8        iommu_cntr;
                        u16        padding;
                        u64        conf;
                        u64        conf1;
                };
        };
        /*
         * If the event is a per task event, this will point to the task in
         * question. See the comment in perf_event_alloc().
         */
        struct task_struct                *target;

        /*
         * PMU would store hardware filter configuration
         * here.
         */
        void                                *addr_filters;

        /* Last sync'ed generation of filters */
        unsigned long                        addr_filters_gen;

/*
 * hw_perf_event::state flags; used to track the PERF_EF_* state.
 */

/* the counter is stopped */
#define PERF_HES_STOPPED                0x01

/* event->count up-to-date */
#define PERF_HES_UPTODATE                0x02

/* NOTE(review): undocumented here; presumably for architecture code -- confirm */
#define PERF_HES_ARCH                        0x04

        int                                state;

        /*
         * The last observed hardware counter value, updated with a
         * local64_cmpxchg() such that pmu::read() can be called nested.
         */
        local64_t                        prev_count;

        /*
         * The period to start the next sample with.
         */
        u64                                sample_period;

        /* The two views below share storage. */
        union {
                struct { /* Sampling */
                        /*
                         * The period we started this sample with.
                         */
                        u64                                last_period;

                        /*
                         * However much is left of the current period;
                         * note that this is a full 64bit value and
                         * allows for generation of periods longer
                         * than hardware might allow.
                         */
                        local64_t                        period_left;
                };
                struct { /* Topdown events counting for context switch */
                        u64                                saved_metric;
                        u64                                saved_slots;
                };
        };

        /*
         * State for throttling the event, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                             interrupts_seq;
        u64                                interrupts;

        /*
         * State for freq target events, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                                freq_time_stamp;
        u64                                freq_count_stamp;
#endif /* CONFIG_PERF_EVENTS */
};

struct perf_event;
struct perf_event_pmu_context;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 *
 * The two transaction kinds below are distinct bits.
 */

/* txn to add/schedule event on PMU */
#define PERF_PMU_TXN_ADD                0x1

/* txn to read event group from PMU */
#define PERF_PMU_TXN_READ                0x2

/**
 * pmu::capabilities flags
 *
 * Each flag is a separate single bit, so they can be OR-ed together in
 * the int-sized pmu::capabilities field.  The highest bit currently in
 * use is 0x0400.
 */
#define PERF_PMU_CAP_NO_INTERRUPT        0x0001
#define PERF_PMU_CAP_NO_NMI                0x0002
#define PERF_PMU_CAP_AUX_NO_SG                0x0004
#define PERF_PMU_CAP_EXTENDED_REGS        0x0008
#define PERF_PMU_CAP_EXCLUSIVE                0x0010
#define PERF_PMU_CAP_ITRACE                0x0020
#define PERF_PMU_CAP_NO_EXCLUDE                0x0040
#define PERF_PMU_CAP_AUX_OUTPUT                0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE        0x0100
#define PERF_PMU_CAP_AUX_PAUSE                0x0200
#define PERF_PMU_CAP_AUX_PREFER_LARGE        0x0400

/**
 * pmu::scope
 *
 * Values start at 0 (PERF_PMU_SCOPE_NONE) and increase by one;
 * PERF_PMU_MAX_SCOPE comes last and therefore equals the number of
 * scopes defined before it.
 */
enum perf_pmu_scope {
        PERF_PMU_SCOPE_NONE = 0,
        PERF_PMU_SCOPE_CORE,
        PERF_PMU_SCOPE_DIE,
        PERF_PMU_SCOPE_CLUSTER,
        PERF_PMU_SCOPE_PKG,
        PERF_PMU_SCOPE_SYS_WIDE,
        PERF_PMU_MAX_SCOPE,
};

struct perf_output_handle;

/*
 * Pointer constant with every bit set.
 * NOTE(review): presumably a sentinel for pmu::dev that differs from
 * NULL -- confirm against the PMU registration code.
 */
#define PMU_NULL_DEV        ((void *)(~0UL))

/**
 * struct pmu - generic performance monitoring unit
 */
struct pmu {
        struct list_head                entry;

        spinlock_t                        events_lock;
        struct list_head                events;

        struct module                        *module;
        struct device                        *dev;
        struct device                        *parent;
        const struct attribute_group        **attr_groups;
        const struct attribute_group        **attr_update;
        const char                        *name;
        int                                type;

        /*
         * various common per-pmu feature flags
         */
        int                                capabilities;

        /*
         * PMU scope
         */
        unsigned int                        scope;

        struct perf_cpu_pmu_context * __percpu *cpu_pmu_context;
        atomic_t                        exclusive_cnt; /* < 0: cpu; > 0: tsk */
        int                                task_ctx_nr;
        int                                hrtimer_interval_ms;

        /* number of address filters this PMU can do */
        unsigned int                        nr_addr_filters;

        /*
         * Fully disable/enable this PMU, can be used to protect from the PMI
         * as well as for lazy/batch writing of the MSRs.
         */
        void (*pmu_enable)                (struct pmu *pmu); /* optional */
        void (*pmu_disable)                (struct pmu *pmu); /* optional */

        /*
         * Try and initialize the event for this PMU.
         *
         * Returns:
         *  -ENOENT        -- @event is not for this PMU
         *
         *  -ENODEV        -- @event is for this PMU but PMU not present
         *  -EBUSY        -- @event is for this PMU but PMU temporarily unavailable
         *  -EINVAL        -- @event is for this PMU but @event is not valid
         *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
         *  -EACCES        -- @event is for this PMU, @event is valid, but no privileges
         *
         *  0                -- @event is for this PMU and valid
         *
         * Other error return values are allowed.
         */
        int (*event_init)                (struct perf_event *event);

        /*
         * Notification that the event was mapped or unmapped.  Called
         * in the context of the mapping task.
         */
        void (*event_mapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */
        void (*event_unmapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */

        /*
         * Flags for ->add()/->del()/ ->start()/->stop(). There are
         * matching hw_perf_event::state flags.
         */

/* start the counter when adding    */
#define PERF_EF_START                        0x01

/* reload the counter when starting */
#define PERF_EF_RELOAD                        0x02

/* update the counter when stopping */
#define PERF_EF_UPDATE                        0x04

/* AUX area event, pause tracing */
#define PERF_EF_PAUSE                        0x08

/* AUX area event, resume tracing */
#define PERF_EF_RESUME                        0x10

        /*
         * Adds/Removes a counter to/from the PMU, can be done inside a
         * transaction, see the ->*_txn() methods.
         *
         * The add/del callbacks will reserve all hardware resources required
         * to service the event, this includes any counter constraint
         * scheduling etc.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on.
         *
         * ->add() called without PERF_EF_START should result in the same state
         *  as ->add() followed by ->stop().
         *
         * ->del() must always PERF_EF_UPDATE stop an event. If it calls
         *  ->stop() that must deal with already being stopped without
         *  PERF_EF_UPDATE.
         */
        int  (*add)                        (struct perf_event *event, int flags);
        void (*del)                        (struct perf_event *event, int flags);

        /*
         * Starts/Stops a counter present on the PMU.
         *
         * The PMI handler should stop the counter when perf_event_overflow()
         * returns !0. ->start() will be used to continue.
         *
         * Also used to change the sample period.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on -- will be called from NMI context when the PMU generates
         * NMIs.
         *
         * ->stop() with PERF_EF_UPDATE will read the counter and update
         *  period/count values like ->read() would.
         *
         * ->start() with PERF_EF_RELOAD will reprogram the counter
         *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
         *
         * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
         * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
         * PERF_EF_RESUME.
         *
         * ->start() with PERF_EF_RESUME will start as simply as possible but
         * only if the counter is not otherwise stopped. Will not overlap
         * another ->start() with PERF_EF_RESUME nor ->stop() with
         * PERF_EF_PAUSE.
         *
         * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other
         * ->stop()/->start() invocations, just not itself.
         */
        void (*start)                        (struct perf_event *event, int flags);
        void (*stop)                        (struct perf_event *event, int flags);

        /*
         * Updates the counter value of the event.
         *
         * For sampling capable PMUs this will also update the software period
         * hw_perf_event::period_left field.
         */
        void (*read)                        (struct perf_event *event);

        /*
         * Group events scheduling is treated as a transaction, add
         * group events as a whole and perform one schedulability test.
         * If the test fails, roll back the whole group
         *
         * Start the transaction, after this ->add() doesn't need to
         * do schedulability tests.
         *
         * Optional.
         */
        void (*start_txn)                (struct pmu *pmu, unsigned int txn_flags);
        /*
         * If ->start_txn() disabled the ->add() schedulability test
         * then ->commit_txn() is required to perform one. On success
         * the transaction is closed. On error the transaction is kept
         * open until ->cancel_txn() is called.
         *
         * Optional.
         */
        int  (*commit_txn)                (struct pmu *pmu);
        /*
         * Will cancel the transaction, assumes ->del() is called
         * for each successful ->add() during the transaction.
         *
         * Optional.
         */
        void (*cancel_txn)                (struct pmu *pmu);

        /*
         * Will return the value for perf_event_mmap_page::index for this event,
         * if no implementation is provided it will default to 0 (see
         * perf_event_idx_default).
         */
        int (*event_idx)                (struct perf_event *event); /*optional */

        /*
         * context-switches callback
         */
        void (*sched_task)                (struct perf_event_pmu_context *pmu_ctx,
                                         struct task_struct *task, bool sched_in);

        /*
         * Kmem cache of PMU specific data
         */
        struct kmem_cache                *task_ctx_cache;

        /*
         * Set up pmu-private data structures for an AUX area
         */
        void *(*setup_aux)                (struct perf_event *event, void **pages,
                                         int nr_pages, bool overwrite);
                                        /* optional */

        /*
         * Free pmu-private AUX data structures
         */
        void (*free_aux)                (void *aux); /* optional */

        /*
         * Take a snapshot of the AUX buffer without touching the event
         * state, so that preempting ->start()/->stop() callbacks does
         * not interfere with their logic. Called in PMI context.
         *
         * Returns the size of AUX data copied to the output handle.
         *
         * Optional.
         */
        long (*snapshot_aux)                (struct perf_event *event,
                                         struct perf_output_handle *handle,
                                         unsigned long size);

        /*
         * Validate address range filters: make sure the HW supports the
         * requested configuration and number of filters; return 0 if the
         * supplied filters are valid, -errno otherwise.
         *
         * Runs in the context of the ioctl()ing process and is not serialized
         * with the rest of the PMU callbacks.
         */
        int (*addr_filters_validate)        (struct list_head *filters);
                                        /* optional */

        /*
         * Synchronize address range filter configuration:
         * translate hw-agnostic filters into hardware configuration in
         * event::hw::addr_filters.
         *
         * Runs as a part of filter sync sequence that is done in ->start()
         * callback by calling perf_event_addr_filters_sync().
         *
         * May (and should) traverse event::addr_filters::list, for which its
         * caller provides necessary serialization.
         */
        void (*addr_filters_sync)        (struct perf_event *event);
                                        /* optional */

        /*
         * Check if event can be used for aux_output purposes for
         * events of this PMU.
         *
         * Runs from perf_event_open(). Should return 0 for "no match"
         * or non-zero for "match".
         */
        int (*aux_output_match)                (struct perf_event *event);
                                        /* optional */

        /*
         * Skip programming this PMU on the given CPU. Typically needed for
         * big.LITTLE things.
         */
        bool (*filter)                        (struct pmu *pmu, int cpu); /* optional */

        /*
         * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
         */
        int (*check_period)                (struct perf_event *event, u64 value); /* optional */
};

/*
 * Action carried by an address range filter; this is the type of
 * perf_addr_filter::action.
 */
enum perf_addr_filter_action_t {
        PERF_ADDR_FILTER_ACTION_STOP = 0,
        PERF_ADDR_FILTER_ACTION_START,
        PERF_ADDR_FILTER_ACTION_FILTER,
};

/**
 * struct perf_addr_filter - address range filter definition
 * @entry:        event's filter list linkage
 * @path:        object file's path for file-based filters
 * @offset:        filter range offset
 * @size:        filter range size (size==0 means single address trigger)
 * @action:        filter/start/stop, one of enum perf_addr_filter_action_t
 *
 * This is a hardware-agnostic filter configuration as specified by the user.
 */
struct perf_addr_filter {
        struct list_head                entry;
        struct path                        path;
        unsigned long                        offset;
        unsigned long                        size;
        enum perf_addr_filter_action_t        action;
};

/**
 * struct perf_addr_filters_head - container for address range filters
 * @list:        list of filters for this event
 * @lock:        raw spinlock that serializes accesses to the @list and event's
 *                (and its children's) filter generations.
 * @nr_file_filters:        number of file-based filters
 *
 * A child event will use parent's @list (and therefore @lock), so they are
 * bundled together; see perf_event_addr_filters().
 */
struct perf_addr_filters_head {
        struct list_head                list;
        raw_spinlock_t                        lock;
        unsigned int                        nr_file_filters;
};

/*
 * One address range, given as a start and a size.
 * perf_event::addr_filter_ranges points at these (described there as the
 * vma address array for file-based filters).
 */
struct perf_addr_filter_range {
        unsigned long                        start;
        unsigned long                        size;
};

/*
 * The normal states are:
 *
 *            ACTIVE    --.
 *               ^        |
 *               |        |
 *       sched_{in,out}() |
 *               |        |
 *               v        |
 *      ,---> INACTIVE  --+ <-.
 *      |                 |   |
 *      |                {dis,en}able()
 *   sched_in()           |   |
 *      |       OFF    <--' --+
 *      |                     |
 *      `--->  ERROR    ------'
 *
 * That is:
 *
 * sched_in:       INACTIVE          -> {ACTIVE,ERROR}
 * sched_out:      ACTIVE            -> INACTIVE
 * disable:        {ACTIVE,INACTIVE} -> OFF
 * enable:         {OFF,ERROR}       -> INACTIVE
 *
 * Where {OFF,ERROR} are disabled states.
 *
 * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of
 * defunct events:
 *
 *  - EXIT means the task that the event was assigned to died, but child
 *    events still live, and further children can still be created. But the
 *    event itself will never be active again. It can only transition to
 *    {REVOKED,DEAD};
 *
 *  - REVOKED means the PMU the event was associated with is gone; all
 *    functionality is stopped but the event is still alive. Can only
 *    transition to DEAD;
 *
 *  - DEAD event really is DYING: tearing down state and freeing bits.
 *
 * The values are ordered: every defunct or disabled state is negative,
 * INACTIVE is 0 and ACTIVE is the only positive value.
 */
enum perf_event_state {
        PERF_EVENT_STATE_DEAD                = -5,
        PERF_EVENT_STATE_REVOKED        = -4, /* pmu gone, must not touch */
        PERF_EVENT_STATE_EXIT                = -3, /* task died, still inherit */
        PERF_EVENT_STATE_ERROR                = -2, /* scheduling error, can enable */
        PERF_EVENT_STATE_OFF                = -1,
        PERF_EVENT_STATE_INACTIVE        =  0,
        PERF_EVENT_STATE_ACTIVE                =  1,
};

struct file;
struct perf_sample_data;

/*
 * Signature of an overflow handler: it receives the event, the sample data
 * and the register state. perf_event::overflow_handler has this type.
 */
typedef void (*perf_overflow_handler_t)(struct perf_event *,
                                        struct perf_sample_data *,
                                        struct pt_regs *regs);

/*
 * Event capabilities. Used for perf_event::event_caps and
 * perf_event::group_caps.
 *
 * PERF_EV_CAP_SOFTWARE: Is a software event.
 * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
 * from any CPU in the package where it is active.
 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
 * cannot be a group leader. If an event with this flag is detached from the
 * group it is scheduled out and moved into an unrecoverable ERROR state.
 * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
 * PMU scope where it is active.
 */
#define PERF_EV_CAP_SOFTWARE                BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG        BIT(1)
#define PERF_EV_CAP_SIBLING                BIT(2)
#define PERF_EV_CAP_READ_SCOPE                BIT(3)

/* Number of hlist heads in struct swevent_hlist: 1 << 8 == 256. */
#define SWEVENT_HLIST_BITS                8
#define SWEVENT_HLIST_SIZE                (1 << SWEVENT_HLIST_BITS)

/*
 * A fixed table of SWEVENT_HLIST_SIZE hlist heads plus an rcu_head.
 *
 * NOTE(review): going by the name this is the software-event hash table and
 * the rcu_head is there for RCU-deferred freeing -- confirm against the users.
 */
struct swevent_hlist {
        struct hlist_head                heads[SWEVENT_HLIST_SIZE];
        struct rcu_head                        rcu_head;
};

/*
 * PERF_ATTACH_*: distinct single-bit flags, so they can be OR-ed together.
 *
 * NOTE(review): presumably these are the bits kept in
 * perf_event::attach_state -- confirm against the users.
 */
#define PERF_ATTACH_CONTEXT                0x0001
#define PERF_ATTACH_GROUP                0x0002
#define PERF_ATTACH_TASK                0x0004
#define PERF_ATTACH_TASK_DATA                0x0008
#define PERF_ATTACH_GLOBAL_DATA                0x0010
#define PERF_ATTACH_SCHED_CB                0x0020
#define PERF_ATTACH_CHILD                0x0040
#define PERF_ATTACH_EXCLUSIVE                0x0080
#define PERF_ATTACH_CALLCHAIN                0x0100
#define PERF_ATTACH_ITRACE                0x0200

struct bpf_prog;
struct perf_cgroup;
struct perf_buffer;

/*
 * A list head bundled with a raw spinlock.
 *
 * NOTE(review): presumably @lock protects @list -- confirm against the users.
 */
struct pmu_event_list {
        raw_spinlock_t                        lock;
        struct list_head                list;
};

/*
 * event->sibling_list is modified while holding both ctx->lock and ctx->mutex
 * as such iteration must hold either lock. However, since ctx->lock is an IRQ
 * safe lock, and is only held by the CPU doing the modification, having IRQs
 * disabled is sufficient since it will hold-off the IPIs.
 *
 * With CONFIG_PROVE_LOCKING the assertion warns (once) when lockdep is
 * enabled, hardirqs are enabled on this CPU and ctx->mutex is not held.
 * Without CONFIG_PROVE_LOCKING it expands to nothing.
 */
#ifdef CONFIG_PROVE_LOCKING
# define lockdep_assert_event_ctx(event)                        \
        WARN_ON_ONCE(__lockdep_enabled &&                        \
                     (this_cpu_read(hardirqs_enabled) &&        \
                      lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD))
#else
# define lockdep_assert_event_ctx(event)
#endif

/*
 * Iterate over @event's sibling_list, binding each entry to @sibling. The
 * loop body only runs when @event is its own group leader.
 *
 * lockdep_assert_event_ctx() is evaluated inside the condition so that the
 * whole expansion is a single statement: the macro can be used as the
 * unbraced body of an enclosing if/else/for without the assertion being split
 * off from the loop. The "if (!cond) {} else" form keeps an "else" written
 * after the loop at the call site from binding to the macro's internal if.
 */
#define for_each_sibling_event(sibling, event)                                \
        if (!({                                                                \
                lockdep_assert_event_ctx(event);                        \
                (event)->group_leader == (event);                        \
        })) {                                                                \
        } else                                                                \
                list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)

/**
 * struct perf_event - performance event kernel representation:
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
        /*
         * entry onto perf_event_context::event_list;
         *   modifications require ctx->lock
         *   RCU safe iterations.
         */
        struct list_head                event_entry;

        /*
         * Locked for modification by both ctx->mutex and ctx->lock; holding
         * either suffices for read.
         */
        struct list_head                sibling_list;
        struct list_head                active_list;
        /*
         * Node on the pinned or flexible tree located at the event context;
         */
        struct rb_node                        group_node;
        u64                                group_index;
        /*
         * We need storage to track the entries in perf_pmu_migrate_context; we
         * cannot use the event_entry because of RCU and we want to keep the
         * group intact which avoids us using the other two entries.
         */
        struct list_head                migrate_entry;

        struct hlist_node                hlist_entry;
        struct list_head                active_entry;
        int                                nr_siblings;

        /*
         * PERF_EV_CAP_* bits.
         * Not serialized. Only written during event initialization.
         */
        int                                event_caps;
        /* The cumulative AND of all event_caps for events in this group. */
        int                                group_caps;

        unsigned int                        group_generation;
        struct perf_event                *group_leader;
        /*
         * event->pmu will always point to pmu in which this event belongs.
         * Whereas event->pmu_ctx->pmu may point to other pmu when group of
         * different pmu events is created.
         */
        struct pmu                        *pmu;
        void                                *pmu_private;

        enum perf_event_state                state;
        unsigned int                        attach_state;
        local64_t                        count;
        atomic64_t                        child_count;

        /*
         * These are the total time in nanoseconds that the event
         * has been enabled (i.e. eligible to run, and the task has
         * been scheduled in, if this is a per-task event)
         * and running (scheduled onto the CPU), respectively.
         */
        u64                                total_time_enabled;
        u64                                total_time_running;
        u64                                tstamp;

        struct perf_event_attr                attr;
        u16                                header_size;
        u16                                id_header_size;
        u16                                read_size;
        struct hw_perf_event                hw;

        struct perf_event_context        *ctx;
        /*
         * event->pmu_ctx points to perf_event_pmu_context in which the event
         * is added. This pmu_ctx can be of other pmu for sw event when that
         * sw event is part of a group which also contains non-sw events.
         */
        struct perf_event_pmu_context        *pmu_ctx;
        atomic_long_t                        refcount;

        /*
         * These accumulate total time (in nanoseconds) that children
         * events have been enabled and running, respectively.
         */
        atomic64_t                        child_total_time_enabled;
        atomic64_t                        child_total_time_running;

        /*
         * Protect attach/detach and child_list:
         */
        struct mutex                        child_mutex;
        struct list_head                child_list;
        struct perf_event                *parent;

        int                                oncpu;
        int                                cpu;

        struct list_head                owner_entry;
        struct task_struct                *owner;

        /* mmap bits */
        struct mutex                        mmap_mutex;
        refcount_t                        mmap_count;

        struct perf_buffer                *rb;
        struct list_head                rb_entry;
        unsigned long                        rcu_batches;
        int                                rcu_pending;

        /* poll related */
        wait_queue_head_t                waitq;
        struct fasync_struct                *fasync;

        /* delayed work for NMIs and such */
        unsigned int                        pending_wakeup;
        unsigned int                        pending_kill;
        unsigned int                        pending_disable;
        unsigned long                        pending_addr;        /* SIGTRAP */
        struct irq_work                        pending_irq;
        struct irq_work                        pending_disable_irq;
        struct callback_head                pending_task;
        unsigned int                        pending_work;

        atomic_t                        event_limit;

        /* address range filters */
        struct perf_addr_filters_head        addr_filters;
        /* vma address array for file-based filters */
        struct perf_addr_filter_range        *addr_filter_ranges;
        unsigned long                        addr_filters_gen;

        /* for aux_output events */
        struct perf_event                *aux_event;

        void (*destroy)(struct perf_event *);
        struct rcu_head                        rcu_head;

        struct pid_namespace                *ns;
        u64                                id;

        atomic64_t                        lost_samples;

        u64                                (*clock)(void);
        perf_overflow_handler_t                overflow_handler;
        void                                *overflow_handler_context;
        struct bpf_prog                        *prog;
        u64                                bpf_cookie;

#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call                *tp_event;
        struct event_filter                *filter;
# ifdef CONFIG_FUNCTION_TRACER
        struct ftrace_ops               ftrace_ops;
# endif
#endif

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp; /* cgroup the event is attached to */
#endif

#ifdef CONFIG_SECURITY
        void *security;
#endif
        struct list_head                sb_list;
        struct list_head                pmu_list;

        /*
         * Certain events gets forwarded to another pmu internally by over-
         * writing kernel copy of event->attr.type without user being aware
         * of it. event->orig_type contains original 'type' requested by
         * user.
         */
        u32                                orig_type;
#endif /* CONFIG_PERF_EVENTS */
};

/*
 *           ,-----------------------[1:n]------------------------.
 *           V                                                    V
 * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
 *                                        |                       |
 *                                        `--[n:1]-> pmu <-[1:n]--'
 *
 *
 * struct perf_event_pmu_context  lifetime is refcount based and RCU freed
 * (similar to perf_event_context). Locking is as if it were a member of
 * perf_event_context; specifically:
 *
 *   modification, both: ctx->mutex && ctx->lock
 *   reading, either:    ctx->mutex || ctx->lock
 *
 * There is one exception to this; namely put_pmu_ctx() isn't always called
 * with ctx->mutex held; this means that as long as we can guarantee the epc
 * has events the above rules hold.
 *
 * Specifically, sys_perf_event_open()'s group_leader case depends on
 * ctx->mutex pinning the configuration. Since we hold a reference on
 * group_leader (through the filedesc) it can't go away, therefore its
 * associated pmu_ctx must exist and cannot change due to ctx->mutex.
 *
 * perf_event holds a refcount on perf_event_context
 * perf_event holds a refcount on perf_event_pmu_context
 */
struct perf_event_pmu_context {
        struct pmu                        *pmu;
        struct perf_event_context       *ctx;

        struct list_head                pmu_ctx_entry;

        /* Non-empty when the epc is active, see perf_pmu_ctx_is_active(). */
        struct list_head                pinned_active;
        struct list_head                flexible_active;

        /* Used to identify the per-cpu perf_event_pmu_context */
        unsigned int                        embedded : 1;

        unsigned int                        nr_events;
        unsigned int                        nr_cgroups;
        unsigned int                        nr_freq;

        atomic_t                        refcount; /* event <-> epc */
        struct rcu_head                        rcu_head;

        /*
         * Set when one or more (plausibly active) event can't be scheduled
         * due to pmu overcommit or pmu constraints, except tolerant to
         * events not necessary to be active due to scheduling constraints,
         * such as cgroups.
         */
        int                                rotate_necessary;
};

/*
 * Report whether @epc has at least one entry on either of its active lists
 * (flexible_active or pinned_active).
 */
static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
{
        if (!list_empty(&epc->flexible_active))
                return true;

        return !list_empty(&epc->pinned_active);
}

/*
 * Root of an rb-tree of events; perf_event_context keeps one instance for
 * pinned groups and one for flexible groups, and events link into the tree
 * through perf_event::group_node.
 *
 * NOTE(review): @index is presumably the counter that perf_event::group_index
 * values are taken from -- confirm against the tree insertion code.
 */
struct perf_event_groups {
        struct rb_root                        tree;
        u64                                index;
};


/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for task events and CPU events as well:
 */
struct perf_event_context {
        /*
         * Protect the states of the events in the list,
         * nr_active, and the list:
         */
        raw_spinlock_t                        lock;
        /*
         * Protect the list of events.  Locking either mutex or lock
         * is sufficient to ensure the list doesn't change; to change
         * the list you need to lock both the mutex and the spinlock.
         */
        struct mutex                        mutex;

        /*
         * The perf_event_pmu_context instances of this context (1:n);
         * presumably linked through perf_event_pmu_context::pmu_ctx_entry.
         */
        struct list_head                pmu_ctx_list;
        /* Trees that events join through perf_event::group_node. */
        struct perf_event_groups        pinned_groups;
        struct perf_event_groups        flexible_groups;
        /* Events are linked here through perf_event::event_entry. */
        struct list_head                event_list;

        int                                nr_events;
        int                                nr_user;
        int                                is_active;

        int                                nr_stat;
        int                                nr_freq;
        int                                rotate_disable;

        refcount_t                        refcount; /* event <-> ctx */
        struct task_struct                *task;

        /*
         * Context clock, runs when context enabled.
         */
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;

        /*
         * These fields let us detect when two contexts have both
         * been cloned (inherited) from a common ancestor.
         */
        struct perf_event_context        *parent_ctx;
        u64                                parent_gen;
        u64                                generation;
        int                                pin_count;
#ifdef CONFIG_CGROUP_PERF
        int                                nr_cgroups;         /* cgroup evts */
#endif
        struct rcu_head                        rcu_head;

        /*
         * The count of events for which using the switch-out fast path
         * should be avoided.
         *
         * Sum (event->pending_work + events with
         *    (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)))
         *
         * The SIGTRAP is targeted at ctx->task, as such it won't do changing
         * that until the signal is delivered.
         */
        local_t                                nr_no_switch_fast;
};

/**
 * struct perf_ctx_data - PMU specific data for a task
 * @rcu_head:  To avoid the race on freeing the PMU specific data
 * @refcount:  To track users
 * @global:    To track system-wide users
 * @ctx_cache: Kmem cache of PMU specific data
 * @data:      PMU specific data
 *
 * Currently, the struct is only used in Intel LBR call stack mode to
 * save/restore the call stack of a task on context switches.
 *
 * The rcu_head is used to prevent the race on freeing the data.
 * The data is only allocated when Intel LBR call stack mode is enabled.
 * The data will be freed when the mode is disabled.
 * The content of the data will only be accessed in context switch, which
 * should be protected by rcu_read_lock().
 *
 * Because of the alignment requirement of Intel Arch LBR, the Kmem cache
 * is used to allocate the PMU specific data. The ctx_cache is to track
 * the Kmem cache.
 *
 * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct.
 * When system-wide Intel LBR call stack mode is enabled, a buffer with
 * constant size will be allocated for each task.
 * Also, system memory consumption can further grow when the size of
 * struct perf_ctx_data enlarges.
 */
struct perf_ctx_data {
        struct rcu_head                        rcu_head;
        refcount_t                        refcount;
        int                                global;
        struct kmem_cache                *ctx_cache;
        void                                *data;
};

/*
 * Embeds one perf_event_pmu_context (@epc) and points at a second one
 * (@task_epc), together with sched-callback bookkeeping, a pmu_disable
 * count and an hrtimer with its own raw spinlock.
 *
 * NOTE(review): presumably there is one instance per PMU per CPU, @epc is the
 * "embedded" per-cpu context that perf_event_pmu_context::embedded refers to,
 * and @task_epc is the context of the task currently scheduled in -- confirm
 * against the users.
 */
struct perf_cpu_pmu_context {
        struct perf_event_pmu_context        epc;
        struct perf_event_pmu_context        *task_epc;

        struct list_head                sched_cb_entry;
        int                                sched_cb_usage;

        int                                active_oncpu;
        int                                exclusive;
        int                                pmu_disable_count;

        raw_spinlock_t                        hrtimer_lock;
        struct hrtimer                        hrtimer;
        ktime_t                                hrtimer_interval;
        unsigned int                        hrtimer_active;
};

/**
 * struct perf_cpu_context - per cpu event context structure
 *
 * Embeds the CPU's own perf_event_context (@ctx) and points at a second
 * context through @task_ctx.
 */
struct perf_cpu_context {
        struct perf_event_context        ctx;
        struct perf_event_context        *task_ctx;
        int                                online;

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp;
#endif

        /*
         * Per-CPU storage for iterators used in visit_groups_merge. The default
         * storage is of size 2 to hold the CPU and any CPU event iterators.
         */
        int                                heap_size;
        struct perf_event                **heap;
        struct perf_event                *heap_default[2];
};

/*
 * Handle passed to the output helpers, e.g. perf_aux_output_begin() and
 * perf_aux_output_end().
 *
 * The first anonymous union overlays @flags, @aux_flags and the @skip_read
 * bit on the same u64; the second overlays @addr and @head.
 */
struct perf_output_handle {
        struct perf_event                *event;
        struct perf_buffer                *rb;
        unsigned long                        wakeup;
        unsigned long                        size;
        union {
                u64                        flags;                /* perf_output*() */
                u64                        aux_flags;        /* perf_aux_output*() */
                struct {
                        u64                skip_read : 1;
                };
        };
        union {
                void                        *addr;
                unsigned long                head;
        };
        int                                page;
};

/*
 * Bundles a register snapshot, the sample data and the event.
 *
 * NOTE(review): going by the name this is the kernel-side context handed to
 * BPF programs attached to perf events -- confirm against the users.
 */
struct bpf_perf_event_data_kern {
        bpf_user_pt_regs_t *regs;
        struct perf_sample_data *data;
        struct perf_event *event;
};

#ifdef CONFIG_CGROUP_PERF

/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu dynamically allocated data structure.
 *
 * @time, @timestamp and @timeoffset carry the same names as the context
 * clock fields in struct perf_event_context.
 */
struct perf_cgroup_info {
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;
        int                                active;
};

/*
 * perf's per-cgroup state. @css is the embedded cgroup_subsys_state;
 * perf_cgroup_from_task() gets back to the perf_cgroup from it with
 * container_of(). @info is the per-cpu perf_cgroup_info.
 */
struct perf_cgroup {
        struct cgroup_subsys_state        css;
        struct perf_cgroup_info        __percpu *info;
};

/*
 * Return the perf_cgroup that @task's perf_event css is embedded in.
 *
 * The cgroup must be pinned (css_get) before this is called. In other
 * words, this function cannot be called if there is no cgroup event for
 * the current CPU context.
 *
 * The check condition handed to task_css_check() is "ctx->lock is held"
 * when @ctx is non-NULL and plain true otherwise.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
{
        struct cgroup_subsys_state *task_css;

        task_css = task_css_check(task, perf_event_cgrp_id,
                                  ctx ? lockdep_is_held(&ctx->lock) : true);

        return container_of(task_css, struct perf_cgroup, css);
}
#endif /* CONFIG_CGROUP_PERF */

#ifdef CONFIG_PERF_EVENTS

/* Declared here, defined elsewhere (only with CONFIG_PERF_EVENTS). */
extern struct perf_event_context *perf_cpu_task_ctx(void);

/*
 * AUX output interface. All of these operate on a perf_output_handle;
 * perf_aux_output_flag() takes a u64 that matches the handle's aux_flags
 * view.
 */
extern void *perf_aux_output_begin(struct perf_output_handle *handle,
                                   struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
                                unsigned long size);
extern int perf_aux_output_skip(struct perf_output_handle *handle,
                                unsigned long size);
extern void *perf_get_aux(struct perf_output_handle *handle);
extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern void perf_event_itrace_started(struct perf_event *event);

/* PMU registration. */
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern int perf_pmu_unregister(struct pmu *pmu);

/*
 * Out-of-line halves of the scheduler hooks; the inline wrappers
 * perf_event_task_sched_in()/perf_event_task_sched_out() later in this
 * header call them only when the perf_sched_events static branch is on.
 */
extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *prev,
                                        struct task_struct *next);
/* Task lifetime hooks and assorted event/PMU controls. */
extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern void perf_sched_cb_dec(struct pmu *pmu);
extern void perf_sched_cb_inc(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);

extern void perf_pmu_resched(struct pmu *pmu);

extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);

/*
 * In-kernel counter creation: takes the attribute block, a cpu, a task, an
 * overflow callback and an opaque context pointer.
 */
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                 int cpu,
                                 struct task_struct *task,
                                 perf_overflow_handler_t callback,
                                 void *context);

extern void perf_pmu_migrate_context(struct pmu *pmu,
                                     int src_cpu, int dst_cpu);
/*
 * Two read variants: perf_event_read_local() returns an int status and
 * stores the count through @value, perf_event_read_value() returns the u64
 * directly. Both take @enabled/@running output pointers.
 */
extern int perf_event_read_local(struct perf_event *event, u64 *value,
                                 u64 *enabled, u64 *running);
extern u64 perf_event_read_value(struct perf_event *event,
                                 u64 *enabled, u64 *running);

/* Used by perf_sample_save_callchain() to obtain the callchain entry. */
extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);

/* True when PERF_SAMPLE_BRANCH_NO_FLAGS is set in attr.branch_sample_type. */
static inline bool branch_sample_no_flags(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_NO_FLAGS) != 0;
}

/* True when PERF_SAMPLE_BRANCH_NO_CYCLES is set in attr.branch_sample_type. */
static inline bool branch_sample_no_cycles(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_NO_CYCLES) != 0;
}

/* True when PERF_SAMPLE_BRANCH_TYPE_SAVE is set in attr.branch_sample_type. */
static inline bool branch_sample_type(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_TYPE_SAVE) != 0;
}

/* True when PERF_SAMPLE_BRANCH_HW_INDEX is set in attr.branch_sample_type. */
static inline bool branch_sample_hw_index(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_HW_INDEX) != 0;
}

/* True when PERF_SAMPLE_BRANCH_PRIV_SAVE is set in attr.branch_sample_type. */
static inline bool branch_sample_priv(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_PRIV_SAVE) != 0;
}

/* True when PERF_SAMPLE_BRANCH_COUNTERS is set in attr.branch_sample_type. */
static inline bool branch_sample_counters(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_COUNTERS) != 0;
}

/* True when PERF_SAMPLE_BRANCH_CALL_STACK is set in attr.branch_sample_type. */
static inline bool branch_sample_call_stack(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_CALL_STACK) != 0;
}

/*
 * Per-sample scratch data. sample_flags records which PERF_SAMPLE_* parts
 * have already been filled in (see the perf_sample_save_*() helpers), and
 * dyn_size accumulates the byte size of the variable-length parts.
 * The structure is cacheline aligned and its fields are ordered by how
 * often they are touched.
 */
struct perf_sample_data {
        /*
         * Fields set by perf_sample_data_init() unconditionally,
         * group so as to minimize the cachelines touched.
         */
        u64                                sample_flags;
        u64                                period;
        u64                                dyn_size;

        /*
         * Fields commonly set by __perf_event_header__init_id(),
         * group so as to minimize the cachelines touched.
         */
        u64                                type;
        struct {
                u32        pid;
                u32        tid;
        }                                tid_entry;
        u64                                time;
        u64                                id;
        struct {
                u32        cpu;
                u32        reserved;
        }                                cpu_entry;

        /*
         * The other fields, optionally {set,used} by
         * perf_{prepare,output}_sample().
         */
        u64                                ip;
        struct perf_callchain_entry        *callchain;
        struct perf_raw_record                *raw;
        struct perf_branch_stack        *br_stack;
        u64                                *br_stack_cntr;
        union perf_sample_weight        weight;
        union  perf_mem_data_src        data_src;
        u64                                txn;

        struct perf_regs                regs_user;
        struct perf_regs                regs_intr;
        u64                                stack_user_size;

        u64                                stream_id;
        u64                                cgroup;
        u64                                addr;
        u64                                phys_addr;
        u64                                data_page_size;
        u64                                code_page_size;
        u64                                aux_size;
} ____cacheline_aligned;

/*
 * Default value for data source: each PERF_MEM_S() field listed here
 * (OP, LVL, SNOOP, LOCK, TLB, LVLNUM) is set to its NA encoding and the
 * results are OR-ed together.
 */
#define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
                    PERF_MEM_S(LVL, NA)   |\
                    PERF_MEM_S(SNOOP, NA) |\
                    PERF_MEM_S(LOCK, NA)  |\
                    PERF_MEM_S(TLB, NA)   |\
                    PERF_MEM_S(LVLNUM, NA))

/*
 * Start a fresh sample: record @period, reset the dynamic size and mark
 * PERF_SAMPLE_PERIOD as present. A non-zero @addr is stored as well and
 * flagged with PERF_SAMPLE_ADDR; a zero @addr leaves data->addr untouched.
 *
 * The remaining struct members are initialized in perf_prepare_sample().
 */
static inline void perf_sample_data_init(struct perf_sample_data *data,
                                         u64 addr, u64 period)
{
        u64 present = PERF_SAMPLE_PERIOD;

        if (addr) {
                data->addr = addr;
                present |= PERF_SAMPLE_ADDR;
        }

        data->dyn_size = 0;
        data->period = period;
        data->sample_flags = present;
}

static inline void perf_sample_save_callchain(struct perf_sample_data *data,
                                              struct perf_event *event,
                                              struct pt_regs *regs)
{
        int size = 1;

        if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
                return;
        if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN))
                return;

        data->callchain = perf_callchain(event, regs);
        size += data->callchain->nr;

        data->dyn_size += size * sizeof(u64);
        data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
}

/*
 * Attach a raw record to @data if the event asked for PERF_SAMPLE_RAW.
 *
 * Does nothing when the event does not sample raw data, and warns once (then
 * bails out) if raw data was already saved. Otherwise the fragment chain
 * starting at raw->frag is walked to sum the fragment sizes; the total
 * (payload plus a u32) is rounded up to a multiple of sizeof(u64),
 * raw->size is set to that total minus the u32, and the difference to the
 * real payload is stored as padding in the last fragment.
 */
static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
                                             struct perf_event *event,
                                             struct perf_raw_record *raw)
{
        struct perf_raw_frag *cur = &raw->frag;
        u32 payload;
        int total;

        if (!(event->attr.sample_type & PERF_SAMPLE_RAW))
                return;
        if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW))
                return;

        /* Accumulate every fragment; cur ends up on the last one. */
        payload = cur->size;
        while (!perf_raw_frag_last(cur)) {
                cur = cur->next;
                payload += cur->size;
        }

        total = round_up(payload + sizeof(u32), sizeof(u64));
        raw->size = total - sizeof(u32);
        cur->pad = raw->size - payload;

        data->raw = raw;
        data->dyn_size += total;
        data->sample_flags |= PERF_SAMPLE_RAW;
}

/* True when the event's sample_type requests PERF_SAMPLE_BRANCH_STACK. */
static inline bool has_branch_stack(struct perf_event *event)
{
        return !!(event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK);
}

/*
 * Attach a branch stack (and optional per-branch counters) to @data.
 *
 * Does nothing when the event has no branch stack, and warns once (then
 * bails out) if one was already saved. brs->nr is clamped to the event's
 * sample_max_stack (compared as u16) before sizes are computed.
 *
 * The size added to dyn_size is: one u64 for nr, one more u64 when the
 * event samples the HW index, brs->nr branch entries, and - when @brs_cntr
 * is given - one u64 per entry for the counter extension space that is
 * appended after the struct perf_branch_stack.
 */
static inline void perf_sample_save_brstack(struct perf_sample_data *data,
                                            struct perf_event *event,
                                            struct perf_branch_stack *brs,
                                            u64 *brs_cntr)
{
        int bytes;

        if (!has_branch_stack(event))
                return;
        if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK))
                return;

        bytes = sizeof(u64); /* nr */
        if (branch_sample_hw_index(event))
                bytes += sizeof(u64);

        brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr);
        bytes += brs->nr * sizeof(struct perf_branch_entry);

        /* Counter occurrences of each branch, if the caller supplied them. */
        if (brs_cntr)
                bytes += brs->nr * sizeof(u64);

        data->br_stack_cntr = brs_cntr;
        data->br_stack = brs;
        data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
        data->dyn_size += bytes;
}

/*
 * Total record size for a sample: the perf_event_header, the event's fixed
 * header_size and id_header_size, plus the dynamic size accumulated in
 * @data. The result is computed in (and truncated to) a u32.
 */
static inline u32 perf_sample_data_size(struct perf_sample_data *data,
                                        struct perf_event *event)
{
        u32 total = event->header_size + event->id_header_size;

        total += sizeof(struct perf_event_header);
        total += data->dyn_size;

        return total;
}

/*
 * Reset the bitfields of a perf_branch_entry listed below: everything goes
 * to 0 except spec, which is set to PERF_BR_SPEC_NA.
 * The to and from fields are left alone because they are systematically
 * modified by the caller.
 */
static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br)
{
        br->spec        = PERF_BR_SPEC_NA;

        br->predicted        = 0;
        br->mispred        = 0;
        br->abort        = 0;
        br->in_tx        = 0;
        br->type        = 0;
        br->cycles        = 0;
        br->reserved        = 0;
}

/* Sample preparation and emission; defined outside this header. */
extern void perf_output_sample(struct perf_output_handle *handle,
                               struct perf_event_header *header,
                               struct perf_sample_data *data,
                               struct perf_event *event);
extern void perf_prepare_sample(struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);
extern void perf_prepare_header(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);

/*
 * perf_event_output_forward() and perf_event_output_backward() are the two
 * handlers that is_default_overflow_handler() recognises as defaults.
 */
extern void perf_event_output_forward(struct perf_event *event,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs);
extern void perf_event_output_backward(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs);
extern int perf_event_output(struct perf_event *event,
                             struct perf_sample_data *data,
                             struct pt_regs *regs);

/*
 * True when the event's overflow handler is one of the two stock output
 * routines. The forward handler is annotated as the expected case, the
 * backward handler as the unexpected one.
 */
static inline bool
is_default_overflow_handler(struct perf_event *event)
{
        perf_overflow_handler_t handler = event->overflow_handler;

        return likely(handler == perf_event_output_forward) ||
               unlikely(handler == perf_event_output_backward);
}

/* Sample-id header helpers and lost-sample logging; defined elsewhere. */
extern void
perf_event_header__init_id(struct perf_event_header *header,
                           struct perf_sample_data *data,
                           struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
                             struct perf_output_handle *handle,
                             struct perf_sample_data *sample);

extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        return attr->exclude_idle || attr->exclude_user ||
               attr->exclude_kernel || attr->exclude_hv ||
               attr->exclude_guest || attr->exclude_host;
}

/* An event samples when its attr.sample_period is non-zero. */
static inline bool is_sampling_event(struct perf_event *event)
{
        return !!event->attr.sample_period;
}

/*
 * Returns the PERF_EV_CAP_SOFTWARE bit of event->event_caps: non-zero for a
 * software event, 0 for a hardware event.
 */
static inline int is_software_event(struct perf_event *event)
{
        int sw_cap = event->event_caps & PERF_EV_CAP_SOFTWARE;

        return sw_cap;
}

/*
 * Return 1 for event in sw context, 0 for event in hw context
 */
static inline int in_software_context(struct perf_event *event)
{
        return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}

/*
 * Returns the PERF_PMU_CAP_EXCLUSIVE bit of pmu->capabilities: non-zero
 * when the PMU advertises the exclusive capability, 0 otherwise.
 */
static inline int is_exclusive_pmu(struct pmu *pmu)
{
        int exclusive = pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;

        return exclusive;
}

/* One static key per software event id; tested by perf_sw_event(). */
extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

/*
 * Out-of-line software event entry points. perf_sw_event() calls the
 * two-underscore variant, __perf_sw_event_sched() the three-underscore one.
 */
extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

/*
 * Fallback for architectures that do not define
 * perf_arch_fetch_caller_regs: an empty function that leaves @regs untouched.
 */
#ifndef perf_arch_fetch_caller_regs
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
#endif

/*
 * When generating a perf sample in-line, instead of from an interrupt /
 * exception, we lack a pt_regs. This is typically used from software events
 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
 *
 * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - sp for PERF_SAMPLE_CALLCHAIN
 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
 *
 * NOTE: assumes @regs is otherwise already 0 filled; this is important for
 * things like PERF_SAMPLE_REGS_INTR.
 *
 * The arch hook is given CALLER_ADDR0 as the instruction pointer. On
 * architectures without perf_arch_fetch_caller_regs the hook is an empty
 * stub, so @regs is not modified at all.
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
        perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

/*
 * Report a software event. The out-of-line __perf_sw_event() is only
 * reached when the static key for @event_id is enabled; otherwise this is
 * a no-op.
 */
static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        if (!static_key_false(&perf_swevent_enabled[event_id]))
                return;

        __perf_sw_event(event_id, nr, regs, addr);
}

/* Per-cpu array of four pt_regs; slot 0 is used by __perf_sw_event_sched(). */
DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * 'Special' version for the scheduler, it hard assumes no recursion,
 * which is guaranteed by us not actually scheduling inside other swevents
 * because those disable preemption.
 *
 * Unlike perf_sw_event() this performs no static-key check itself; callers
 * in this header test __perf_sw_enabled() first.
 */
static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
        struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

        /* Fill in caller registers before handing the event over. */
        perf_fetch_caller_regs(regs);
        ___perf_sw_event(event_id, nr, regs, addr);
}

/* Static branch guarding the __perf_event_task_sched_{in,out}() calls. */
extern struct static_key_false perf_sched_events;

/* True when the static key for software event @swevt is enabled. */
static __always_inline bool __perf_sw_enabled(int swevt)
{
        return static_key_false(&perf_swevent_enabled[swevt]);
}

/*
 * Remember that @task migrated, but only while the CPU-migrations software
 * event is enabled. The flag is consumed by perf_event_task_sched_in().
 */
static inline void perf_event_task_migrate(struct task_struct *task)
{
        if (!__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
                return;

        task->sched_migrated = 1;
}

/*
 * Scheduler hook run when @task is switched in after @prev.
 *
 * First forwards to __perf_event_task_sched_in() if perf_sched_events is
 * on. Then, if the CPU-migrations software event is enabled and the task
 * was marked as migrated, emits one PERF_COUNT_SW_CPU_MIGRATIONS event and
 * clears the mark.
 */
static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
{
        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_in(prev, task);

        if (!__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
                return;
        if (!task->sched_migrated)
                return;

        __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
        task->sched_migrated = 0;
}

/*
 * Scheduler hook run when @prev is switched out in favour of @next.
 *
 * In order: emits a context-switch software event if that event is enabled;
 * with CONFIG_CGROUP_PERF, emits a cgroup-switch software event when that
 * event is enabled and @prev and @next belong to different perf cgroups;
 * finally forwards to __perf_event_task_sched_out() if perf_sched_events
 * is on.
 */
static inline void perf_event_task_sched_out(struct task_struct *prev,
                                             struct task_struct *next)
{
        if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
                __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

#ifdef CONFIG_CGROUP_PERF
        /* NULL ctx: perf_cgroup_from_task() then uses a constant-true check. */
        if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
            perf_cgroup_from_task(prev, NULL) !=
            perf_cgroup_from_task(next, NULL))
                __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
#endif

        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_out(prev, next);
}

/*
 * Notification entry points for mmap, kernel symbol and BPF program
 * events; defined elsewhere. Empty stubs exist for !CONFIG_PERF_EVENTS.
 */
extern void perf_event_mmap(struct vm_area_struct *vma);

extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                               bool unregister, const char *sym);
extern void perf_event_bpf_event(struct bpf_prog *prog,
                                 enum perf_bpf_event_type type,
                                 u16 flags);

/*
 * Bits tested in the value returned by perf_guest_state(), see
 * perf_arch_guest_misc_flags().
 */
#define PERF_GUEST_ACTIVE                0x01
#define PERF_GUEST_USER                        0x02

/*
 * Callback table for guest information. Each member has a matching
 * perf_guest_*() wrapper in this header.
 */
struct perf_guest_info_callbacks {
        unsigned int                        (*state)(void);
        unsigned long                        (*get_ip)(void);
        unsigned int                        (*handle_intel_pt_intr)(void);
};

#ifdef CONFIG_GUEST_PERF_EVENTS

/* Currently registered callback table (RCU-annotated pointer). */
extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

/* One static call per callback, typed after the matching struct member. */
DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/* Thin wrappers that invoke the corresponding static call. */
static inline unsigned int perf_guest_state(void)
{
        return static_call(__perf_guest_state)();
}

static inline unsigned long perf_guest_get_ip(void)
{
        return static_call(__perf_guest_get_ip)();
}

static inline unsigned int perf_guest_handle_intel_pt_intr(void)
{
        return static_call(__perf_guest_handle_intel_pt_intr)();
}

extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);

#else /* !CONFIG_GUEST_PERF_EVENTS: */

/* Without guest support every query reports 0. */
static inline unsigned int perf_guest_state(void)                 { return 0; }
static inline unsigned long perf_guest_get_ip(void)                 { return 0; }
static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; }

#endif /* !CONFIG_GUEST_PERF_EVENTS */

/*
 * Notification entry points for exec, comm, namespaces, fork and text
 * poke events; defined elsewhere. Empty stubs exist for
 * !CONFIG_PERF_EVENTS.
 */
extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
extern void perf_event_text_poke(const void *addr,
                                 const void *old_bytes, size_t old_len,
                                 const void *new_bytes, size_t new_len);

/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

/* User and kernel stack walkers; both fill a perf_callchain_entry_ctx. */
extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
                   u32 max_stack, bool crosstask, bool add_mark);
/* Paired get/put helpers for callchain buffers and entries. */
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);

/*
 * Limits; sysctl_perf_event_max_contexts_per_stack bounds
 * perf_callchain_store_context().
 */
extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

/*
 * Append a context marker @ip to the callchain.
 *
 * Returns 0 on success. Once ctx->contexts has reached
 * sysctl_perf_event_max_contexts_per_stack, nothing is stored,
 * ctx->contexts_maxed is set and -1 is returned so the caller stops walking
 * the stack.
 */
static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *entry;

        if (!(ctx->contexts < sysctl_perf_event_max_contexts_per_stack)) {
                /* no more room, stop walking the stack */
                ctx->contexts_maxed = true;
                return -1;
        }

        entry = ctx->entry;
        entry->ip[entry->nr++] = ip;
        ++ctx->contexts;

        return 0;
}

/*
 * Append one address @ip to the callchain.
 *
 * Returns 0 on success, or -1 without storing anything when ctx->nr has
 * reached ctx->max_stack or the context-marker limit was hit earlier
 * (ctx->contexts_maxed); the caller should then stop walking the stack.
 */
static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *entry;

        /* no more room, stop walking the stack */
        if (!(ctx->nr < ctx->max_stack) || ctx->contexts_maxed)
                return -1;

        entry = ctx->entry;
        entry->ip[entry->nr++] = ip;
        ++ctx->nr;

        return 0;
}

/*
 * sysctl_perf_event_paranoid is the level compared by perf_is_paranoid(),
 * perf_allow_cpu() and perf_allow_tracepoint().
 */
extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_sample_rate;

extern void perf_sample_event_took(u64 sample_len_ns);

/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN                0

/*
 * Finer grained perf_event_open(2) access control; these values are what
 * the perf_allow_*() helpers pass to security_perf_event_open().
 */
#define PERF_SECURITY_CPU                1
#define PERF_SECURITY_KERNEL                2
#define PERF_SECURITY_TRACEPOINT        3

/* Returns 1 when sysctl_perf_event_paranoid is 0 or higher, else 0. */
static inline int perf_is_paranoid(void)
{
        return sysctl_perf_event_paranoid >= 0;
}

/*
 * Out-of-line permission check, defined elsewhere.
 * NOTE(review): presumably the kernel-profiling counterpart of
 * perf_allow_cpu()/perf_allow_tracepoint() - confirm at the definition.
 */
extern int perf_allow_kernel(void);

static inline int perf_allow_cpu(void)
{
        if (sysctl_perf_event_paranoid > 0 && !perfmon_capable())
                return -EACCES;

        return security_perf_event_open(PERF_SECURITY_CPU);
}

static inline int perf_allow_tracepoint(void)
{
        if (sysctl_perf_event_paranoid > -1 && !perfmon_capable())
                return -EPERM;

        return security_perf_event_open(PERF_SECURITY_TRACEPOINT);
}

/* Defined elsewhere; the !CONFIG_PERF_EVENTS stub returns 0. */
extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs);

/* Subsystem init plus tracepoint and breakpoint event entry points. */
extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
                          int entry_size, struct pt_regs *regs,
                          struct hlist_head *head, int rctx,
                          struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

/* Per-sample misc flags and instruction pointer for @event and @regs. */
extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs);
extern unsigned long perf_instruction_pointer(struct perf_event *event,
                                              struct pt_regs *regs);

/*
 * Generic fallbacks. Both perf_arch_misc_flags() and
 * perf_arch_instruction_pointer() hang off the same guard, so an
 * architecture that defines perf_arch_misc_flags must also supply its own
 * perf_arch_instruction_pointer().
 */
#ifndef perf_arch_misc_flags
# define perf_arch_misc_flags(regs) \
                (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_arch_instruction_pointer(regs)        instruction_pointer(regs)
#endif
/* Default: the BPF-visible user regs are the pt_regs themselves. */
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif

#ifndef perf_arch_guest_misc_flags
/*
 * Generic guest misc flags, used unless the architecture provides its own
 * perf_arch_guest_misc_flags. @regs is not consulted here.
 *
 * Returns 0 when no guest is active, otherwise
 * PERF_RECORD_MISC_GUEST_USER or PERF_RECORD_MISC_GUEST_KERNEL depending on
 * the PERF_GUEST_USER bit of perf_guest_state().
 */
static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
{
        unsigned long state = perf_guest_state();

        if (state & PERF_GUEST_ACTIVE)
                return (state & PERF_GUEST_USER) ? PERF_RECORD_MISC_GUEST_USER
                                                 : PERF_RECORD_MISC_GUEST_KERNEL;

        return 0;
}
# define perf_arch_guest_misc_flags(regs)        perf_arch_guest_misc_flags(regs)
#endif

/* True when any bit is set in the event's attr.branch_sample_type. */
static inline bool needs_branch_stack(struct perf_event *event)
{
        return !!event->attr.branch_sample_type;
}

static inline bool has_aux(struct perf_event *event)
{
        return event->pmu && event->pmu->setup_aux;
}

static inline bool has_aux_action(struct perf_event *event)
{
        return event->attr.aux_sample_size ||
               event->attr.aux_pause ||
               event->attr.aux_resume;
}

/* True when the event's attr.write_backward bit is set. */
static inline bool is_write_backward(struct perf_event *event)
{
        return event->attr.write_backward != 0;
}

/* True when the event's PMU reports a non-zero nr_addr_filters. */
static inline bool has_addr_filter(struct perf_event *event)
{
        return event->pmu->nr_addr_filters != 0;
}

/*
 * Return the address-filter list head to use for @event. An inherited
 * event (one with a parent) uses its parent's filters; any other event
 * uses its own.
 */
static inline struct perf_addr_filters_head *
perf_event_addr_filters(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;

        return &owner->addr_filters;
}

/*
 * Return the address of the fasync pointer for @event. Only the parent has
 * fasync state, so an inherited event resolves to its parent's field.
 */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;

        return &owner->fasync;
}

extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);

/*
 * Ring-buffer output interface built around struct perf_output_handle:
 * three begin variants (default, forward, backward) with identical
 * signatures, an end call, and copy/skip helpers. perf_output_put() is a
 * macro wrapper around perf_output_copy().
 */
extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_sample_data *data,
                             struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
                                     struct perf_sample_data *data,
                                     struct perf_event *event,
                                     unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
                                      struct perf_sample_data *data,
                                      struct perf_event *event,
                                      unsigned int size);

extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
                                     const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
                                     unsigned int len);
extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
                                 struct perf_output_handle *handle,
                                 unsigned long from, unsigned long to);
/* Software-event recursion context get/put pair and period helper. */
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
/* Event enable/disable variants, tick hook and period/pause controls. */
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);

#else /* !CONFIG_PERF_EVENTS: */

/*
 * !CONFIG_PERF_EVENTS stubs for the AUX output interface: pointer-returning
 * calls yield NULL, perf_aux_output_skip() fails with -EINVAL and
 * perf_aux_output_end() does nothing.
 */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event)                                { return NULL; }
static inline void
perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
                                                                        { }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
                     unsigned long size)                                { return -EINVAL; }
static inline void *
perf_get_aux(struct perf_output_handle *handle)                                { return NULL; }
/*
 * !CONFIG_PERF_EVENTS stubs for the scheduler and task lifetime hooks: all
 * are no-ops, and perf_event_init_task() reports success (0).
 */
static inline void
perf_event_task_migrate(struct task_struct *task)                        { }
static inline void
perf_event_task_sched_in(struct task_struct *prev,
                         struct task_struct *task)                        { }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
                          struct task_struct *next)                        { }
static inline int perf_event_init_task(struct task_struct *child,
                                       u64 clone_flags)                        { return 0; }
static inline void perf_event_exit_task(struct task_struct *child)        { }
static inline void perf_event_free_task(struct task_struct *task)        { }
static inline void perf_event_delayed_put(struct task_struct *task)        { }
/*
 * !CONFIG_PERF_EVENTS stubs for event lookup, read and control: lookups
 * return ERR_PTR(-EINVAL), int-returning operations return -EINVAL, and
 * perf_event_print_debug() does nothing.
 */
static inline struct file *perf_event_get(unsigned int fd)        { return ERR_PTR(-EINVAL); }
static inline const struct perf_event *perf_get_event(struct file *file)
{
        return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        return ERR_PTR(-EINVAL);
}
static inline int perf_event_read_local(struct perf_event *event, u64 *value,
                                        u64 *enabled, u64 *running)
{
        return -EINVAL;
}
static inline void perf_event_print_debug(void)                                { }
static inline int perf_event_task_disable(void)                                { return -EINVAL; }
static inline int perf_event_task_enable(void)                                { return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
        return -EINVAL;
}

/*
 * !CONFIG_PERF_EVENTS stubs for the event-reporting entry points: every one
 * of them is an empty function.
 */
static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)        { }
static inline void
perf_bp_event(struct perf_event *event, void *data)                        { }

static inline void perf_event_mmap(struct vm_area_struct *vma)                { }

/* Function type taking a name buffer, its length and an opaque pointer. */
typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                                      bool unregister, const char *sym)        { }
static inline void perf_event_bpf_event(struct bpf_prog *prog,
                                        enum perf_bpf_event_type type,
                                        u16 flags)                        { }
static inline void perf_event_exec(void)                                { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec)        { }
static inline void perf_event_namespaces(struct task_struct *tsk)        { }
static inline void perf_event_fork(struct task_struct *tsk)                { }
static inline void perf_event_text_poke(const void *addr,
                                        const void *old_bytes,
                                        size_t old_len,
                                        const void *new_bytes,
                                        size_t new_len)                        { }
/*
 * !CONFIG_PERF_EVENTS stubs for init, recursion context and event control.
 * Return values differ per stub: the recursion context getter and
 * __perf_event_disable() return -1, perf_event_period() returns -EINVAL,
 * and the remaining non-void stubs return 0.
 */
static inline void perf_event_init(void)                                { }
static inline int  perf_swevent_get_recursion_context(void)                { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)                { }
static inline u64 perf_swevent_set_period(struct perf_event *event)        { return 0; }
static inline void perf_event_enable(struct perf_event *event)                { }
static inline void perf_event_disable(struct perf_event *event)                { }
static inline int __perf_event_disable(void *info)                        { return -1; }
static inline void perf_event_task_tick(void)                                { }
static inline int perf_event_release_kernel(struct perf_event *event)        { return 0; }
static inline int
perf_event_period(struct perf_event *event, u64 value)                        { return -EINVAL; }
static inline u64
perf_event_pause(struct perf_event *event, bool reset)                        { return 0; }
static inline int
perf_exclude_event(struct perf_event *event, struct pt_regs *regs)        { return 0; }

#endif /* !CONFIG_PERF_EVENTS */

/*
 * perf_restore_debug_store() only has a real implementation when both
 * CONFIG_PERF_EVENTS and CONFIG_CPU_SUP_INTEL are set; otherwise it is an
 * empty stub.
 */
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern void perf_restore_debug_store(void);
#else
static inline void perf_restore_debug_store(void)                        { }
#endif

/*
 * Copy the object @x into the output via perf_output_copy(). @x must be an
 * lvalue because its address and sizeof are taken.
 */
#define perf_output_put(handle, x)        perf_output_copy((handle), &(x), sizeof(x))

/*
 * Device attribute carrying a numeric event id and an optional event
 * string. Instances are built by the PMU_EVENT_ATTR*() macros.
 */
struct perf_pmu_events_attr {
        struct device_attribute                attr;
        u64                                id;
        const char                        *event_str;
};

/*
 * Device attribute carrying an event id plus two alternative event strings.
 * NOTE(review): the _ht/_noht suffixes presumably select between
 * hyper-threading on and off - confirm against the users.
 */
struct perf_pmu_events_ht_attr {
        struct device_attribute                attr;
        u64                                id;
        const char                        *event_str_ht;
        const char                        *event_str_noht;
};

/*
 * Event attribute that additionally records a pmu_type value.
 * NOTE(review): presumably pmu_type selects which PMU(s) of a hybrid system
 * the event applies to - confirm against the users of this struct.
 */
struct perf_pmu_events_hybrid_attr {
        struct device_attribute                attr;
        u64                                id;
        const char                        *event_str;
        u64                                pmu_type;
};

/*
 * Format attribute paired with a pmu_type value.
 * NOTE(review): presumably the hybrid counterpart of the plain
 * device_attribute defined by PMU_FORMAT_ATTR() - confirm.
 */
struct perf_pmu_format_hybrid_attr {
        struct device_attribute                attr;
        u64                                pmu_type;
};

/*
 * sysfs show callback for PMU event attributes; PMU_EVENT_ATTR_STRING()
 * installs it as the attribute's show handler.
 */
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page);

/*
 * Define a static struct perf_pmu_events_attr named _var: a read-only (0444)
 * attribute called _name, shown by _show, carrying event id _id.
 * The expansion already ends with a semicolon.
 */
#define PMU_EVENT_ATTR(_name, _var, _id, _show)                                \
static struct perf_pmu_events_attr _var = {                                \
        .attr = __ATTR(_name, 0444, _show, NULL),                        \
        .id   =  _id,                                                        \
};

/*
 * Define a static struct perf_pmu_events_attr named _var: a read-only (0444)
 * attribute called _name, shown by perf_event_sysfs_show(), with id 0 and
 * event string _str. The expansion already ends with a semicolon.
 */
#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                            \
static struct perf_pmu_events_attr _var = {                                    \
        .attr                = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
        .id                = 0,                                                    \
        .event_str        = _str,                                                    \
};

/*
 * Expression form of PMU_EVENT_ATTR(): builds an unnamed one-element
 * struct perf_pmu_events_attr array (compound literal) with a read-only
 * (0444) attribute _name, show routine _show and id _id, and evaluates to
 * the address of that element's .attr.attr member.
 */
#define PMU_EVENT_ATTR_ID(_name, _show, _id)                                \
        (&((struct perf_pmu_events_attr[]) {                                \
                { .attr = __ATTR(_name, 0444, _show, NULL),                \
                  .id = _id, }                                                \
        })[0].attr.attr)

/*
 * Define _name##_show(), a sysfs show routine that writes the fixed string
 * _format followed by a newline into page and returns sprintf()'s result.
 * The BUILD_BUG_ON() rejects at build time any _format whose size is not
 * smaller than PAGE_SIZE.
 * NOTE(review): _format is pasted directly into the sprintf() format string,
 * so it must not contain '%' conversions.
 */
#define PMU_FORMAT_ATTR_SHOW(_name, _format)                                \
static ssize_t                                                                \
_name##_show(struct device *dev,                                        \
                               struct device_attribute *attr,                \
                               char *page)                                \
{                                                                        \
        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                        \
        return sprintf(page, _format "\n");                                \
}                                                                        \

/*
 * Define the show routine through PMU_FORMAT_ATTR_SHOW() plus a static
 * struct device_attribute named format_attr_<_name>, initialised with
 * __ATTR_RO(_name). The expansion has no trailing semicolon; the user of the
 * macro supplies it.
 */
#define PMU_FORMAT_ATTR(_name, _format)                                        \
        PMU_FORMAT_ATTR_SHOW(_name, _format)                                \
                                                                        \
static struct device_attribute format_attr_##_name = __ATTR_RO(_name)

/*
 * Performance counter hotplug functions. Without CONFIG_PERF_EVENTS both
 * names expand to NULL.
 */
#ifndef CONFIG_PERF_EVENTS
# define perf_event_init_cpu                NULL
# define perf_event_exit_cpu                NULL
#else
extern int perf_event_init_cpu(unsigned int cpu);
extern int perf_event_exit_cpu(unsigned int cpu);
#endif

extern void arch_perf_update_userpage(struct perf_event *event,
                                      struct perf_event_mmap_page *userpg,
                                      u64 now);

/*
 * Snapshot branch stack on software events.
 *
 * Branch stack can be very useful in understanding software events. For
 * example, when a long function, e.g. sys_perf_event_open, returns an
 * errno, it is not obvious why the function failed. Branch stack could
 * provide very helpful information in this type of scenarios.
 *
 * On software event, it is necessary to stop the hardware branch recorder
 * fast. Otherwise, the hardware register/buffer will be flushed with
 * entries of the triggering event. Therefore, static call is used to
 * stop the hardware recorder.
 */

/*
 * Function type of the perf_snapshot_branch_stack static call.
 *
 * cnt is the number of entries allocated for entries.
 * Return number of entries copied to entries.
 */
typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
                                           unsigned int cnt);
DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);

#ifndef PERF_NEEDS_LOPWR_CB
/*
 * Default low-power callback: an empty inline, compiled only when
 * PERF_NEEDS_LOPWR_CB is not defined.
 */
static inline void perf_lopwr_cb(bool mode)
{
        (void)mode;        /* deliberately unused */
}
#endif

#endif /* _LINUX_PERF_EVENT_H */

















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Definitions for mount interface. This describes the linked list of
 * mounted filesystems that is built inside the kernel.
 *
 * Author:  Marco van Wieringen <mvw@planets.elm.net>
 *
 */
#ifndef _LINUX_MOUNT_H
#define _LINUX_MOUNT_H

#include <linux/types.h>
#include <asm/barrier.h>

struct super_block;
struct dentry;
struct user_namespace;
struct mnt_idmap;
struct file_system_type;
struct fs_context;
struct file;
struct path;

/*
 * Mount flag bits, written as single-bit shifts, followed by the masks that
 * are composed from them.
 */
enum mount_flags {
        MNT_NOSUID                = 1 << 0,
        MNT_NODEV                = 1 << 1,
        MNT_NOEXEC                = 1 << 2,
        MNT_NOATIME                = 1 << 3,
        MNT_NODIRATIME                = 1 << 4,
        MNT_RELATIME                = 1 << 5,
        MNT_READONLY                = 1 << 6, /* does the user want this to be r/o? */
        MNT_NOSYMFOLLOW                = 1 << 7,

        MNT_SHRINKABLE                = 1 << 8,

        MNT_INTERNAL                = 1 << 14,

        MNT_LOCK_ATIME                = 1 << 18,
        MNT_LOCK_NOEXEC                = 1 << 19,
        MNT_LOCK_NOSUID                = 1 << 20,
        MNT_LOCK_NODEV                = 1 << 21,
        MNT_LOCK_READONLY        = 1 << 22,
        MNT_LOCKED                = 1 << 23,
        MNT_DOOMED                = 1 << 24,
        MNT_SYNC_UMOUNT                = 1 << 25,
        MNT_UMOUNT                = 1 << 27,

        /* The eight low bits above. */
        MNT_USER_SETTABLE_MASK        = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC |
                                  MNT_NOATIME | MNT_NODIRATIME |
                                  MNT_RELATIME | MNT_READONLY |
                                  MNT_NOSYMFOLLOW,

        /* The three atime-related bits. */
        MNT_ATIME_MASK                = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,

        MNT_INTERNAL_FLAGS        = MNT_INTERNAL | MNT_LOCKED |
                                  MNT_DOOMED | MNT_SYNC_UMOUNT
};

/*
 * Per-mount data handed around by the declarations in this header.
 * The layout is randomized (__randomize_layout), so nothing may depend on
 * the order of the members.
 */
struct vfsmount {
        struct dentry *mnt_root;        /* root of the mounted tree */
        struct super_block *mnt_sb;        /* pointer to superblock */
        /* NOTE(review): presumably a mask of enum mount_flags bits - confirm */
        int mnt_flags;
        /* read through mnt_idmap(), which uses READ_ONCE() */
        struct mnt_idmap *mnt_idmap;
} __randomize_layout;

/*
 * Return the idmap pointer stored in @mnt, loaded exactly once.
 */
static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt)
{
        struct mnt_idmap *idmap;

        /* Pairs with smp_store_release() in do_idmap_mount(). */
        idmap = READ_ONCE(mnt->mnt_idmap);

        return idmap;
}

extern int mnt_want_write(struct vfsmount *mnt);
extern int mnt_want_write_file(struct file *file);
extern void mnt_drop_write(struct vfsmount *mnt);
extern void mnt_drop_write_file(struct file *file);
extern void mntput(struct vfsmount *mnt);
extern struct vfsmount *mntget(struct vfsmount *mnt);
extern void mnt_make_shortterm(struct vfsmount *mnt);
extern struct vfsmount *mnt_clone_internal(const struct path *path);
extern bool __mnt_is_readonly(const struct vfsmount *mnt);
extern bool mnt_may_suid(struct vfsmount *mnt);

extern struct vfsmount *clone_private_mount(const struct path *path);
int mnt_get_write_access(struct vfsmount *mnt);
void mnt_put_write_access(struct vfsmount *mnt);

extern struct vfsmount *fc_mount(struct fs_context *fc);
extern struct vfsmount *fc_mount_longterm(struct fs_context *fc);
extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
                                      int flags, const char *name,
                                      void *data);

extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
extern void mark_mounts_for_expiry(struct list_head *mounts);

extern bool path_is_mountpoint(const struct path *path);

extern bool our_mnt(struct vfsmount *mnt);

extern struct vfsmount *kern_mount(struct file_system_type *);
extern void kern_unmount(struct vfsmount *mnt);
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
int do_mount(const char *, const char __user *,
                     const char *, unsigned long, void *);
extern const struct path *collect_paths(const struct path *, struct path *, unsigned);
extern void drop_collected_paths(const struct path *, const struct path *);
extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num);

extern int cifs_root_data(char **dev, char **opts);

#endif /* _LINUX_MOUNT_H */
































































































































































































































































































































































































































































































































































  320 

  316 










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>

#include <linux/atomic.h>

#include "internal.h"

/*
 * sysctl tunables...
 *
 * max_files starts at NR_FILE and may be raised by files_maxfiles_init();
 * nr_files is refreshed by proc_nr_files() before the "file-nr" sysctl is
 * processed.
 */
static struct files_stat_struct files_stat = {
        .max_files = NR_FILE
};

/* SLAB cache for file structures (plain struct file); created in files_init() */
static struct kmem_cache *filp_cachep __ro_after_init;
/* SLAB cache for struct backing_file; created in files_init() */
static struct kmem_cache *bfilp_cachep __ro_after_init;

/*
 * Number of accounted files: incremented by alloc_empty_file() and
 * decremented by file_free() unless the file carries FMODE_NOACCOUNT.
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/*
 * Container for backing file with optional user path.
 *
 * The embedded struct file is what callers see; backing_file() converts a
 * struct file pointer back to its container. The union overlays the user
 * path with bf_freeptr, which files_init() registers as the slab free
 * pointer offset for the "bfilp" cache.
 */
struct backing_file {
        struct file file;
        union {
                struct path user_path;
                freeptr_t bf_freeptr;
        };
};

/* Map a struct file embedded in a struct backing_file to its container. */
#define backing_file(f) container_of(f, struct backing_file, file)

const struct path *backing_file_user_path(const struct file *f)
{
        return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

void backing_file_set_user_path(struct file *f, const struct path *path)
{
        backing_file(f)->user_path = *path;
}
EXPORT_SYMBOL_GPL(backing_file_set_user_path);

/*
 * Release what is attached to @f (security blob, accounting, credentials)
 * and return the memory to the slab cache it was allocated from.
 */
static inline void file_free(struct file *f)
{
        security_file_free(f);

        /* Files flagged FMODE_NOACCOUNT are not counted in nr_files. */
        if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
                percpu_counter_dec(&nr_files);

        put_cred(f->f_cred);

        if (likely(!(f->f_mode & FMODE_BACKING))) {
                kmem_cache_free(filp_cachep, f);
                return;
        }

        /* Backing files: drop the user path, then free the container. */
        path_put(backing_file_user_path(f));
        kmem_cache_free(bfilp_cachep, backing_file(f));
}

/*
 * Return the total number of open files in the system, as given by the
 * cheap (non-summing) read of the nr_files per-cpu counter.
 */
static long get_nr_files(void)
{
        long total = percpu_counter_read_positive(&nr_files);

        return total;
}

/*
 * Return the maximum number of open files in the system, i.e. the current
 * value of files_stat.max_files.
 */
unsigned long get_max_files(void)
{
        unsigned long limit = files_stat.max_files;

        return limit;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Handle nr_files sysctl
 *
 * Refreshes files_stat.nr_files from the exact (summed) per-cpu counter and
 * then lets proc_doulongvec_minmax() do the actual sysctl processing; its
 * return value is passed through unchanged.
 */
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
                         size_t *lenp, loff_t *ppos)
{
        files_stat.nr_files = percpu_counter_sum_positive(&nr_files);
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * sysctl entries registered under "fs" by init_fs_stat_sysctls().
 */
static const struct ctl_table fs_stat_sysctls[] = {
        {
                /* read-only view of the whole files_stat structure */
                .procname        = "file-nr",
                .data                = &files_stat,
                .maxlen                = sizeof(files_stat),
                .mode                = 0444,
                .proc_handler        = proc_nr_files,
        },
        {
                /* writable limit, bounded by SYSCTL_LONG_ZERO..SYSCTL_LONG_MAX */
                .procname        = "file-max",
                .data                = &files_stat.max_files,
                .maxlen                = sizeof(files_stat.max_files),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
                .extra1                = SYSCTL_LONG_ZERO,
                .extra2                = SYSCTL_LONG_MAX,
        },
        {
                /* writable, bounded by sysctl_nr_open_min..sysctl_nr_open_max */
                .procname        = "nr_open",
                .data                = &sysctl_nr_open,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = proc_douintvec_minmax,
                .extra1                = &sysctl_nr_open_min,
                .extra2                = &sysctl_nr_open_max,
        },
};

/*
 * Register the "fs" sysctl table and, when CONFIG_BINFMT_MISC is enabled,
 * the "fs/binfmt_misc" sysctl mount point. Always returns 0.
 */
static int __init init_fs_stat_sysctls(void)
{
        struct ctl_table_header *binfmt_hdr;

        register_sysctl_init("fs", fs_stat_sysctls);

        if (!IS_ENABLED(CONFIG_BINFMT_MISC))
                return 0;

        /* The header is not stored anywhere, so tell kmemleak about it. */
        binfmt_hdr = register_sysctl_mount_point("fs/binfmt_misc");
        kmemleak_not_leak(binfmt_hdr);

        return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

/*
 * Initialise a freshly allocated struct file.
 *
 * @f:     the file to set up
 * @flags: open flags; stored in f_flags and used to derive f_mode
 * @cred:  credentials to reference from f_cred
 *
 * Takes a reference on @cred and allocates the security blob. Returns 0 on
 * success or the error from security_file_alloc(), in which case the cred
 * reference has already been dropped and the caller only has to free @f.
 */
static int init_file(struct file *f, int flags, const struct cred *cred)
{
        int error;

        f->f_cred = get_cred(cred);
        error = security_file_alloc(f);
        if (unlikely(error)) {
                put_cred(f->f_cred);
                return error;
        }

        spin_lock_init(&f->f_lock);
        /*
         * Note that f_pos_lock is only used for files raising
         * FMODE_ATOMIC_POS and directories. Other files such as pipes
         * don't need it and since f_pos_lock is in a union may reuse
         * the space for other purposes. They are expected to initialize
         * the respective member when opening the file.
         */
        mutex_init(&f->f_pos_lock);
        /* NOTE(review): assumes __f_path and f_path have the same size - confirm */
        memset(&f->__f_path, 0, sizeof(f->f_path));
        memset(&f->f_ra, 0, sizeof(f->f_ra));

        f->f_flags        = flags;
        f->f_mode        = OPEN_FMODE(flags);

        f->f_op                = NULL;
        f->f_mapping        = NULL;
        f->private_data = NULL;
        f->f_inode        = NULL;
        f->f_owner        = NULL;
#ifdef CONFIG_EPOLL
        f->f_ep                = NULL;
#endif

        f->f_iocb_flags = 0;
        f->f_pos        = 0;
        f->f_wb_err        = 0;
        f->f_sb_err        = 0;

        /*
         * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While
         * fget-rcu pattern users need to be able to handle spurious
         * refcount bumps we should reinitialize the reused file first.
         */
        file_ref_init(&f->f_ref, 1);
        /*
         * Disable permission and pre-content events for all files by default.
         * They may be enabled later by fsnotify_open_perm_and_set_mode().
         *
         * NOTE(review): this runs after file_ref_init() although the comment
         * above says f_ref is initialized last - confirm that is intended.
         */
        file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);
        return 0;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happend e.g. we over file
 * structures limit, run out of memory or operation is not permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance int the mount's writer count
 * and a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
        static long old_max;
        struct file *f;
        int error;

        /*
         * Privileged users can go above max_files
         */
        if (unlikely(get_nr_files() >= files_stat.max_files) &&
            !capable(CAP_SYS_ADMIN)) {
                /*
                 * percpu_counters are inaccurate.  Do an expensive check before
                 * we go and fail.
                 */
                if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
                        goto over;
        }

        f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        error = init_file(f, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(filp_cachep, f);
                return ERR_PTR(error);
        }

        percpu_counter_inc(&nr_files);

        return f;

over:
        /* Ran out of filps - report that */
        if (get_nr_files() > old_max) {
                pr_info("VFS: file-max limit %lu reached\n", get_max_files());
                old_max = get_nr_files();
        }
        return ERR_PTR(-ENFILE);
}

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocate file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
        struct file *f;
        int error;

        f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        error = init_file(f, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(filp_cachep, f);
                return ERR_PTR(error);
        }

        f->f_mode |= FMODE_NOACCOUNT;

        return f;
}

/*
 * Variant of alloc_empty_file() that allocates a backing_file container
 * and doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocate file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
        struct backing_file *ff;
        int error;

        ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
        if (unlikely(!ff))
                return ERR_PTR(-ENOMEM);

        error = init_file(&ff->file, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(bfilp_cachep, ff);
                return ERR_PTR(error);
        }

        ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
        return &ff->file;
}

/**
 * file_init_path - initialize a 'struct file' based on path
 *
 * @file: the file to set up
 * @path: the (dentry, vfsmount) pair for the new file
 * @fop: the 'struct file_operations' for the new file
 */
static void file_init_path(struct file *file, const struct path *path,
                           const struct file_operations *fop)
{
        file->__f_path = *path;
        file->f_inode = path->dentry->d_inode;
        file->f_mapping = path->dentry->d_inode->i_mapping;
        file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
        file->f_sb_err = file_sample_sb_err(file);
        if (fop->llseek)
                file->f_mode |= FMODE_LSEEK;
        if ((file->f_mode & FMODE_READ) &&
             likely(fop->read || fop->read_iter))
                file->f_mode |= FMODE_CAN_READ;
        if ((file->f_mode & FMODE_WRITE) &&
             likely(fop->write || fop->write_iter))
                file->f_mode |= FMODE_CAN_WRITE;
        file->f_iocb_flags = iocb_flags(file);
        file->f_mode |= FMODE_OPENED;
        file->f_op = fop;
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_inc(path->dentry->d_inode);
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * Returns the new file, or the error pointer from alloc_empty_file().
 */
static struct file *alloc_file(const struct path *path, int flags,
                const struct file_operations *fop)
{
        struct file *file = alloc_empty_file(flags, current_cred());

        if (IS_ERR(file))
                return file;

        file_init_path(file, path, fop);
        return file;
}

/*
 * Fill @path with a new pseudo dentry named @name on @mnt's superblock,
 * instantiated with @inode, plus a reference on @mnt.
 *
 * Returns 0 on success or -ENOMEM if the dentry cannot be allocated; in
 * that case path->dentry is NULL and path->mnt is left untouched.
 */
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
                                    struct vfsmount *mnt, struct path *path)
{
        struct dentry *dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name));

        path->dentry = dentry;
        if (!dentry)
                return -ENOMEM;

        path->mnt = mntget(mnt);
        d_instantiate(dentry, inode);
        return 0;
}

/*
 * Allocate an accounted file for @inode on a freshly created pseudo dentry
 * named @name on @mnt. Returns the file or an error pointer.
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
                               const char *name, int flags,
                               const struct file_operations *fops)
{
        struct path path;
        struct file *file;
        int ret = alloc_path_pseudo(name, inode, mnt, &path);

        if (ret)
                return ERR_PTR(ret);

        file = alloc_file(&path, flags, fops);
        if (!IS_ERR(file)) {
                /*
                 * Disable all fsnotify events for pseudo files by default.
                 * They may be enabled by caller with file_set_fsnotify_mode().
                 */
                file_set_fsnotify_mode(file, FMODE_NONOTIFY);
                return file;
        }

        /*
         * NOTE(review): presumably the extra inode reference balances the
         * one dropped when path_put() releases the instantiated dentry, so
         * the caller still owns @inode on failure - confirm.
         */
        ihold(inode);
        path_put(&path);
        return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Like alloc_file_pseudo(), but the file comes from
 * alloc_empty_file_noaccount() and is therefore not counted in nr_files.
 * Returns the file or an error pointer.
 */
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
                                         struct vfsmount *mnt, const char *name,
                                         int flags,
                                         const struct file_operations *fops)
{
        struct path path;
        struct file *file;
        int ret = alloc_path_pseudo(name, inode, mnt, &path);

        if (ret)
                return ERR_PTR(ret);

        file = alloc_empty_file_noaccount(flags, current_cred());
        if (!IS_ERR(file)) {
                file_init_path(file, &path, fops);
                /*
                 * Disable all fsnotify events for pseudo files by default.
                 * They may be enabled by caller with file_set_fsnotify_mode().
                 */
                file_set_fsnotify_mode(file, FMODE_NONOTIFY);
                return file;
        }

        /*
         * NOTE(review): presumably the extra inode reference balances the
         * one dropped when path_put() releases the instantiated dentry, so
         * the caller still owns @inode on failure - confirm.
         */
        ihold(inode);
        path_put(&path);
        return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);

/*
 * Allocate a new file on the same path as @base, opened with @flags and
 * @fops. On success the new file holds its own path reference and shares
 * @base's f_mapping. Returns the file or an error pointer.
 */
struct file *alloc_file_clone(struct file *base, int flags,
                                const struct file_operations *fops)
{
        struct file *clone = alloc_file(&base->f_path, flags, fops);

        if (IS_ERR(clone))
                return clone;

        /* file_init_path() copied the path without taking a reference. */
        path_get(&clone->f_path);
        clone->f_mapping = base->f_mapping;
        return clone;
}

/* the real guts of fput() - releasing the last reference to file
 *
 * For a file that was never marked FMODE_OPENED only file_free() runs.
 * Otherwise the file is torn down step by step (notification, epoll, locks,
 * security hook, ->fasync/->release, cdev, fops, owner, access counts,
 * dentry and mount) before file_free(). May sleep in that case.
 */
static void __fput(struct file *file)
{
        /* Snapshot what is still needed after the file's state is torn down. */
        struct dentry *dentry = file->f_path.dentry;
        struct vfsmount *mnt = file->f_path.mnt;
        struct inode *inode = file->f_inode;
        fmode_t mode = file->f_mode;

        if (unlikely(!(file->f_mode & FMODE_OPENED)))
                goto out;

        might_sleep();

        fsnotify_close(file);
        /*
         * The function eventpoll_release() should be the first called
         * in the file cleanup chain.
         */
        eventpoll_release(file);
        locks_remove_file(file);

        security_file_release(file);
        /* Only call ->fasync() if the file still has FASYNC set. */
        if (unlikely(file->f_flags & FASYNC)) {
                if (file->f_op->fasync)
                        file->f_op->fasync(-1, file, 0);
        }
        if (file->f_op->release)
                file->f_op->release(inode, file);
        /* Character devices opened without FMODE_PATH drop their cdev here. */
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
                     !(mode & FMODE_PATH))) {
                cdev_put(inode->i_cdev);
        }
        fops_put(file->f_op);
        file_f_owner_release(file);
        put_file_access(file);
        dput(dentry);
        if (unlikely(mode & FMODE_NEED_UNMOUNT))
                dissolve_on_fput(mnt);
        mntput(mnt);
out:
        file_free(file);
}

/* Files whose final __fput() has been handed to the delayed work item. */
static LLIST_HEAD(delayed_fput_list);

/*
 * Work function: detach everything queued on delayed_fput_list in one go
 * and run __fput() on each file.
 */
static void delayed_fput(struct work_struct *unused)
{
        struct llist_node *pending = llist_del_all(&delayed_fput_list);
        struct file *file, *next;

        /* The _safe variant is needed because __fput() frees the entry. */
        llist_for_each_entry_safe(file, next, pending, f_llist)
                __fput(file);
}

/* task_work callback: recover the file from its f_task_work and __fput() it. */
static void ____fput(struct callback_head *work)
{
        struct file *file = container_of(work, struct file, f_task_work);

        __fput(file);
}

static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * Call this if a kernel thread really needs the final fput() it has done
 * to have completed.  The only user right now is the boot code: writes to
 * binaries on initramfs must not leave an opened struct file waiting for
 * __fput(), because execve() won't work without that.
 *
 * Please don't add more callers without very good reasons.  In particular,
 * never call this with locks held, and never call it from a thread that
 * might need to do some work on any kind of umount.
 */
void flush_delayed_fput(void)
{
        /* Process whatever is queued right now ... */
        delayed_fput(NULL);
        /* ... and flush the delayed work item as well. */
        flush_delayed_work(&delayed_fput_work);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

static void __fput_deferred(struct file *file)
{
        struct task_struct *task = current;

        if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
                file_free(file);
                return;
        }

        if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                init_task_work(&file->f_task_work, ____fput);
                if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
                        return;
                /*
                 * After this task has run exit_task_work(),
                 * task_work_add() will fail.  Fall through to delayed
                 * fput to avoid leaking *file.
                 */
        }

        if (llist_add(&file->f_llist, &delayed_fput_list))
                schedule_delayed_work(&delayed_fput_work, 1);
}

/*
 * Drop one reference on @file; when file_ref_put() reports that it was the
 * last one, hand the file to __fput_deferred().
 */
void fput(struct file *file)
{
        if (likely(!file_ref_put(&file->f_ref)))
                return;

        __fput_deferred(file);
}
EXPORT_SYMBOL(fput);

/*
 * Synchronous analog of fput(), for kernel threads that might be needed
 * in some umount() (and thus can't use flush_delayed_fput() without
 * risking deadlocks), that need to wait for completion of __fput() and
 * know that for this specific struct file it won't involve anything that
 * would need them.  Use only if you really need it - at the very least,
 * don't blindly convert fput() by kernel thread to that.
 */
void __fput_sync(struct file *file)
{
        if (!file_ref_put(&file->f_ref))
                return;

        __fput(file);
}
EXPORT_SYMBOL(__fput_sync);

/*
 * Same job as __fput_sync(), but tuned for callers that expect to hold the
 * last reference.
 *
 * See file_ref_put_close() for details.
 */
void fput_close_sync(struct file *file)
{
        if (unlikely(!file_ref_put_close(&file->f_ref)))
                return;

        __fput(file);
}

/*
 * Same job as fput(), but tuned for callers that expect to hold the last
 * reference.
 *
 * See file_ref_put_close() for details.
 */
void fput_close(struct file *file)
{
        if (!file_ref_put_close(&file->f_ref))
                return;

        __fput_deferred(file);
}

/*
 * Create the "filp" and "bfilp" slab caches and initialise the nr_files
 * counter. Both caches are SLAB_TYPESAFE_BY_RCU and use an explicit free
 * pointer offset inside the object.
 */
void __init files_init(void)
{
        struct kmem_cache_args filp_args = {
                .use_freeptr_offset = true,
                .freeptr_offset = offsetof(struct file, f_freeptr),
        };
        struct kmem_cache_args bfilp_args = {
                .use_freeptr_offset = true,
                .freeptr_offset = offsetof(struct backing_file, bf_freeptr),
        };

        filp_cachep = kmem_cache_create("filp", sizeof(struct file),
                                &filp_args,
                                SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);

        bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
                                &bfilp_args,
                                SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);

        percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 *
 * The pages counted are total RAM minus a reserve of 1.5 times the pages
 * already in use (capped so at least one page remains); the result never
 * drops below NR_FILE.
 */
void __init files_maxfiles_init(void)
{
        unsigned long total = totalram_pages();
        unsigned long reserve = (total - nr_free_pages()) * 3/2;
        unsigned long limit;

        reserve = min(reserve, total - 1);
        limit = ((total - reserve) * (PAGE_SIZE / 1024)) / 10;

        files_stat.max_files = max_t(unsigned long, limit, NR_FILE);
}
















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MSR_H
#define _ASM_X86_MSR_H

#include "msr-index.h"

#ifndef __ASSEMBLER__

#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/cpumask.h>
#include <uapi/asm/msr.h>
#include <asm/shared/msr.h>

#include <linux/types.h>
#include <linux/percpu.h>

/*
 * Bundle describing one MSR access: the MSR number, a single register
 * value, a per-CPU set of register values and an error code.
 * NOTE(review): presumably the argument block for the cross-CPU MSR
 * helpers; its users are not visible in this header.
 */
struct msr_info {
        u32                        msr_no;
        struct msr                reg;
        struct msr __percpu        *msrs;
        int                        err;
};

/*
 * Pointer to a block of 32-bit register values plus an error code.
 * NOTE(review): presumably pairs with the *_safe_regs() helpers, which take
 * a u32[8] array; confirm against the users.
 */
struct msr_regs_info {
        u32 *regs;
        int err;
};

/* One saved MSR: a validity flag together with its struct msr_info. */
struct saved_msr {
        bool valid;
        struct msr_info info;
};

/*
 * A counted array of struct saved_msr entries.
 * NOTE(review): @num is presumably the number of elements in @array.
 */
struct saved_msrs {
        unsigned int num;
        struct saved_msr *array;
};

/*
 * Be very careful with includes. This header is prone to include loops.
 */
#include <asm/atomic.h>
#include <linux/tracepoint-defs.h>

/*
 * Tracing hooks for MSR and PMC accesses.  With CONFIG_TRACEPOINTS the
 * do_trace_*() helpers are declared here and defined out of line; without
 * it they collapse to empty inline stubs so callers need no #ifdef.
 */
#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(read_msr);
DECLARE_TRACEPOINT(write_msr);
DECLARE_TRACEPOINT(rdpmc);
extern void do_trace_write_msr(u32 msr, u64 val, int failed);
extern void do_trace_read_msr(u32 msr, u64 val, int failed);
extern void do_trace_rdpmc(u32 msr, u64 val, int failed);
#else
static inline void do_trace_write_msr(u32 msr, u64 val, int failed) {}
static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {}
static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
#endif

/*
 * __rdmsr() and __wrmsrq() are the two primitives which are the bare minimum MSR
 * accessors and should not have any tracing or other functionality piggybacking
 * on them - those are *purely* for accessing MSRs and nothing more. So don't even
 * think of extending them - you will be slapped with a stinking trout or a frozen
 * shark will reach you, wherever you are! You've been warned.
 *
 * __rdmsr() executes RDMSR with @msr passed through the "c" register
 * constraint and returns the 64-bit result assembled by EAX_EDX_VAL().
 * The instruction at label 1 has an exception-table entry of type
 * EX_TYPE_RDMSR whose fixup target is label 2, i.e. the instruction right
 * after it.  What the handler does beyond resuming there is not visible in
 * this header.
 */
static __always_inline u64 __rdmsr(u32 msr)
{
        EAX_EDX_DECLARE_ARGS(val, low, high);

        asm volatile("1: rdmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
                     : EAX_EDX_RET(val, low, high) : "c" (msr));

        return EAX_EDX_VAL(val, low, high);
}

/*
 * Bare WRMSR primitive: writes the 64-bit @val to @msr.
 *
 * @msr goes in via the "c" constraint, the low 32 bits of @val via "a" and
 * the high 32 bits via "d".  The asm carries a "memory" clobber, so the
 * compiler will not move memory accesses across it.  A fault on the
 * instruction at label 1 is covered by an EX_TYPE_WRMSR exception-table
 * entry that resumes at label 2; no status is returned to the caller.
 */
static __always_inline void __wrmsrq(u32 msr, u64 val)
{
        asm volatile("1: wrmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
                     : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)) : "memory");
}

/*
 * Read @msr with __rdmsr() (no tracing) and split the result: the low 32
 * bits are assigned to the lvalue @val1, the high 32 bits to the lvalue
 * @val2.  Both are assigned directly, not through pointers.
 */
#define native_rdmsr(msr, val1, val2)                        \
do {                                                        \
        u64 __val = __rdmsr((msr));                        \
        (void)((val1) = (u32)__val);                        \
        (void)((val2) = (u32)(__val >> 32));                \
} while (0)

/* 64-bit read of @msr; a plain alias for __rdmsr(), so no tracing is done. */
static __always_inline u64 native_rdmsrq(u32 msr)
{
        return __rdmsr(msr);
}

/*
 * Write an MSR from two 32-bit halves: @high is widened to u64 and shifted
 * into the upper half, @low is OR-ed into the lower half, and the result is
 * handed to __wrmsrq() (no tracing).
 */
#define native_wrmsr(msr, low, high)                        \
        __wrmsrq((msr), (u64)(high) << 32 | (low))

/* 64-bit write of @val to @msr; expands directly to __wrmsrq() (no tracing). */
#define native_wrmsrq(msr, val)                                \
        __wrmsrq((msr), (val))

/*
 * Read @msr through the bare __rdmsr() primitive and, when the read_msr
 * tracepoint is enabled, report the value with a "failed" argument of 0.
 * Returns the 64-bit MSR value.
 */
static inline u64 native_read_msr(u32 msr)
{
        const u64 val = __rdmsr(msr);

        if (tracepoint_enabled(read_msr))
                do_trace_read_msr(msr, val, 0);

        return val;
}

/*
 * Read @msr and report failure through the return value.
 *
 * On the normal path RDMSR is followed by "xor err,err", so the function
 * returns 0.  A fault on the instruction at label 1 is redirected to label
 * 2 by an EX_TYPE_RDMSR_SAFE exception-table entry that names the err
 * register, skipping the xor.  Presumably the handler stores a non-zero
 * error code in that register; the handler is not visible in this header.
 *
 * *@p is written unconditionally, so on error its content is whatever the
 * result registers held; callers must check the return value first.  The
 * read_msr tracepoint, when enabled, receives err as its "failed" argument.
 */
static inline int native_read_msr_safe(u32 msr, u64 *p)
{
        int err;
        EAX_EDX_DECLARE_ARGS(val, low, high);

        asm volatile("1: rdmsr ; xor %[err],%[err]\n"
                     "2:\n\t"
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
                     : [err] "=r" (err), EAX_EDX_RET(val, low, high)
                     : "c" (msr));
        if (tracepoint_enabled(read_msr))
                do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), err);

        *p = EAX_EDX_VAL(val, low, high);

        return err;
}

/*
 * Write @val to @msr and, if the write_msr tracepoint is enabled, emit a
 * trace record with a "failed" argument of 0.
 *
 * Can be uninlined because referenced by paravirt.
 */
static inline void notrace native_write_msr(u32 msr, u64 val)
{
        /* Same primitive that native_wrmsrq() expands to. */
        __wrmsrq(msr, val);

        if (tracepoint_enabled(write_msr))
                do_trace_write_msr(msr, val, 0);
}

/*
 * Write @val to @msr and report failure through the return value.
 *
 * The low half of @val enters through the "0" matching constraint, i.e. in
 * the same "a" register that carries err back out; the high half uses "d"
 * and @msr uses "c".  After a successful WRMSR the following xor clears
 * err, so 0 is returned.  A fault at label 1 is redirected to label 2 by an
 * EX_TYPE_WRMSR_SAFE exception-table entry that names the err register.
 * Presumably the handler stores a non-zero error code there; the handler is
 * not visible in this header.
 *
 * The write_msr tracepoint, when enabled, receives err as "failed".
 *
 * Can be uninlined because referenced by paravirt.
 */
static inline int notrace native_write_msr_safe(u32 msr, u64 val)
{
        int err;

        asm volatile("1: wrmsr ; xor %[err],%[err]\n"
                     "2:\n\t"
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
                     : [err] "=a" (err)
                     : "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32))
                     : "memory");
        if (tracepoint_enabled(write_msr))
                do_trace_write_msr(msr, val, err);
        return err;
}

/*
 * Out-of-line helpers operating on an array of eight 32-bit register values.
 * NOTE(review): the register-to-index mapping and the return convention are
 * defined by the implementation, which is not visible in this header.
 */
extern int rdmsr_safe_regs(u32 regs[8]);
extern int wrmsr_safe_regs(u32 regs[8]);

/*
 * Execute RDPMC with @counter passed through the "c" constraint and return
 * the 64-bit result.  Unlike the MSR accessors there is no exception-table
 * entry on this instruction.  When the rdpmc tracepoint is enabled, the
 * counter index and value are traced with a "failed" argument of 0.
 */
static inline u64 native_read_pmc(int counter)
{
        EAX_EDX_DECLARE_ARGS(val, low, high);

        asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
        if (tracepoint_enabled(rdpmc))
                do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
        return EAX_EDX_VAL(val, low, high);
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#include <linux/errno.h>
/*
 * Access to machine-specific registers (available on 586 and better only).
 * Note: the rd* operations modify their parameters directly (without using
 * pointer indirection); this allows gcc to optimize better.
 */

/*
 * Read @msr via native_read_msr() (which may trace) and split the value:
 * the lvalue @low receives bits 31:0 and the lvalue @high bits 63:32.
 * Both are assigned directly, so pass variables, not pointers.
 */
#define rdmsr(msr, low, high)                                        \
do {                                                                \
        u64 __val = native_read_msr((msr));                        \
        (void)((low) = (u32)__val);                                \
        (void)((high) = (u32)(__val >> 32));                        \
} while (0)

/*
 * Write an MSR from two 32-bit halves: @high supplies bits 63:32 and @low
 * bits 31:0 of the value passed to native_write_msr().
 */
static inline void wrmsr(u32 msr, u32 low, u32 high)
{
        u64 val = (u64)high << 32 | low;

        native_write_msr(msr, val);
}

/*
 * 64-bit MSR read: assigns the result of native_read_msr(@msr) to the
 * lvalue @val.  The macro is an assignment expression.
 */
#define rdmsrq(msr, val)                        \
        ((val) = native_read_msr((msr)))

/* 64-bit MSR write: forwards @msr and @val to native_write_msr(). */
static inline void wrmsrq(u32 msr, u64 val)
{
        native_write_msr(msr, val);
}

/*
 * wrmsr with exception handling: forwards to native_write_msr_safe() and
 * returns its error code (0 when the write did not fault).
 */
static inline int wrmsrq_safe(u32 msr, u64 val)
{
        return native_write_msr_safe(msr, val);
}

/*
 * rdmsr with exception handling.
 *
 * @low and @high are pointers to u32.  The value read by
 * native_read_msr_safe() is split into them (bits 31:0 and 63:32), and the
 * macro evaluates to that function's error code.  Both pointers are written
 * even when an error is returned, so check the result before using them.
 *
 * The pointer arguments are parenthesized before being dereferenced so that
 * expressions such as "p + 1" behave as expected.
 */
#define rdmsr_safe(msr, low, high)                                \
({                                                                \
        u64 __val;                                                \
        int __err = native_read_msr_safe((msr), &__val);        \
        *(low) = (u32)__val;                                        \
        *(high) = (u32)(__val >> 32);                                \
        __err;                                                        \
})

/*
 * 64-bit rdmsr with exception handling: forwards to native_read_msr_safe(),
 * storing the value in *@p and returning its error code.
 */
static inline int rdmsrq_safe(u32 msr, u64 *p)
{
        return native_read_msr_safe(msr, p);
}

/* Read performance counter @counter; forwards to native_read_pmc(). */
static __always_inline u64 rdpmc(int counter)
{
        return native_read_pmc(counter);
}

#endif        /* !CONFIG_PARAVIRT_XXL */

/*
 * Instruction opcode for WRMSRNS, whose mnemonic is supported in
 * binutils >= 2.40.  It is spelled as raw bytes (0f 01 c6), presumably so
 * that older assemblers can still build this header.
 */
#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)

/*
 * Non-serializing WRMSR, when available.  Falls back to a serializing WRMSR.
 *
 * ALTERNATIVE() selects between "ds wrmsr" and the WRMSRNS byte sequence
 * keyed on X86_FEATURE_WRMSRNS.  A fault at label 1 is covered by an
 * EX_TYPE_WRMSR exception-table entry resuming at label 2; nothing is
 * returned to the caller.
 *
 * NOTE(review): unlike __wrmsrq(), this asm statement lists no "memory"
 * clobber.
 */
static __always_inline void wrmsrns(u32 msr, u64 val)
{
        /*
         * WRMSR is 2 bytes.  WRMSRNS is 3 bytes.  Pad WRMSR with a redundant
         * DS prefix to avoid a trailing NOP.
         */
        asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS)
                     "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
                     : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
}

/*
 * Dual u32 version of wrmsrq_safe(): @high supplies bits 63:32 and @low
 * bits 31:0 of the value written.  Returns wrmsrq_safe()'s error code.
 */
static inline int wrmsr_safe(u32 msr, u32 low, u32 high)
{
        u64 val = (u64)high << 32 | low;

        return wrmsrq_safe(msr, val);
}

/*
 * Out-of-line MSR helpers.
 * NOTE(review): judging by the signatures, msrs_alloc()/msrs_free() manage a
 * per-CPU struct msr allocation and msr_set_bit()/msr_clear_bit() modify a
 * single bit of @msr; return conventions are not visible in this header.
 */
struct msr __percpu *msrs_alloc(void);
void msrs_free(struct msr __percpu *msrs);
int msr_set_bit(u32 msr, u8 bit);
int msr_clear_bit(u32 msr, u8 bit);

#ifdef CONFIG_SMP
/*
 * CONFIG_SMP: MSR accessors targeting a given @cpu (or the CPUs in @mask),
 * implemented out of line.  Naming scheme: "q" variants take or return a
 * single u64, the others a low/high u32 pair; the "_safe" variants mirror
 * the exception-handling accessors defined earlier in this header.
 */
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
#else  /*  CONFIG_SMP  */
/* !CONFIG_SMP stub: @cpu is ignored; reads locally into *@l / *@h, returns 0. */
static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
        rdmsr(msr_no, *l, *h);
        return 0;
}
/* !CONFIG_SMP stub: @cpu is ignored; writes @l / @h locally, returns 0. */
static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
        wrmsr(msr_no, l, h);
        return 0;
}
/* !CONFIG_SMP stub: @cpu is ignored; reads the MSR locally into *@q, returns 0. */
static inline int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
        rdmsrq(msr_no, *q);
        return 0;
}
/* !CONFIG_SMP stub: @cpu is ignored; writes @q locally, returns 0. */
static inline int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
        wrmsrq(msr_no, q);
        return 0;
}
/*
 * !CONFIG_SMP stub: the mask @m is ignored.  Reads the MSR once and stores
 * the halves in the current CPU's slot of @msrs (located via raw_cpu_ptr()).
 */
static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
                                struct msr __percpu *msrs)
{
        rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h));
}
/*
 * !CONFIG_SMP stub: the mask @m is ignored.  Writes the MSR once using the
 * halves read from the current CPU's slot of @msrs (via raw_cpu_read()).
 */
static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
                                struct msr __percpu *msrs)
{
        wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h));
}
/* !CONFIG_SMP stub: @cpu is ignored; returns the result of a local rdmsr_safe(). */
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
                                    u32 *l, u32 *h)
{
        return rdmsr_safe(msr_no, l, h);
}
/* !CONFIG_SMP stub: @cpu is ignored; returns the result of a local wrmsr_safe(). */
static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
        return wrmsr_safe(msr_no, l, h);
}
/* !CONFIG_SMP stub: @cpu is ignored; returns the result of a local rdmsrq_safe(). */
static inline int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
        return rdmsrq_safe(msr_no, q);
}
/* !CONFIG_SMP stub: @cpu is ignored; returns the result of a local wrmsrq_safe(). */
static inline int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
        return wrmsrq_safe(msr_no, q);
}
/* !CONFIG_SMP stub: @cpu is ignored; forwards @regs to rdmsr_safe_regs(). */
static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
        return rdmsr_safe_regs(regs);
}
/* !CONFIG_SMP stub: @cpu is ignored; forwards @regs to wrmsr_safe_regs(). */
static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
        return wrmsr_safe_regs(regs);
}
#endif  /* CONFIG_SMP */

/*
 * Compatibility wrappers: the "l"-suffixed names map one-to-one onto the
 * "q"-suffixed accessors.
 */
#define rdmsrl(msr, val) rdmsrq(msr, val)
#define wrmsrl(msr, val) wrmsrq(msr, val)
#define rdmsrl_on_cpu(cpu, msr, q) rdmsrq_on_cpu(cpu, msr, q)

#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_MSR_H */

































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/**
 * css_get - take an additional reference on a css
 * @css: css to pin
 *
 * The caller must already hold a reference on @css.  Nothing is done for
 * a css flagged %CSS_NO_REF, which is not reference counted.
 */
CGROUP_REF_FN_ATTRS
void css_get(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get(&css->refcnt);
}
CGROUP_REF_EXPORT(css_get)

/**
 * css_get_many - take several references on a css at once
 * @css: css to pin
 * @n: how many references to take
 *
 * The caller must already hold a reference on @css.  Nothing is done for
 * a css flagged %CSS_NO_REF, which is not reference counted.
 */
CGROUP_REF_FN_ATTRS
void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get_many(&css->refcnt, n);
}
CGROUP_REF_EXPORT(css_get_many)

/**
 * css_tryget - attempt to take a reference on a css
 * @css: css to pin
 *
 * Takes a reference on @css unless its count has already dropped to zero
 * and it is being released.  Whether @css is online or offline does not
 * matter here.  The caller has to make sure @css is accessible, but need
 * not hold a reference itself - RCU protected access is sufficient.
 *
 * A css flagged %CSS_NO_REF is not reference counted and always succeeds.
 *
 * Returns %true if a reference was obtained, %false otherwise.
 */
CGROUP_REF_FN_ATTRS
bool css_tryget(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget(&css->refcnt);
}
CGROUP_REF_EXPORT(css_tryget)

/**
 * css_tryget_online - attempt to take a reference on a css that is online
 * @css: css to pin
 *
 * Takes a reference on @css only if it is online.  The caller has to make
 * sure @css is accessible, but need not hold a reference itself - RCU
 * protected access is sufficient.
 *
 * A css flagged %CSS_NO_REF is not reference counted and always succeeds.
 *
 * Returns %true if a reference was obtained, %false otherwise.
 */
CGROUP_REF_FN_ATTRS
bool css_tryget_online(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget_live(&css->refcnt);
}
CGROUP_REF_EXPORT(css_tryget_online)

/**
 * css_put - drop a reference on a css
 * @css: css to release
 *
 * Drops one reference previously obtained via css_get(), css_tryget() or
 * css_tryget_online().  Nothing is done for a css flagged %CSS_NO_REF.
 */
CGROUP_REF_FN_ATTRS
void css_put(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put(&css->refcnt);
}
CGROUP_REF_EXPORT(css_put)

/**
 * css_put_many - drop several references on a css at once
 * @css: css to release
 * @n: how many references to drop
 *
 * Drops @n references previously obtained via css_get(), css_get_many(),
 * css_tryget() or css_tryget_online().  Nothing is done for a css flagged
 * %CSS_NO_REF.
 */
CGROUP_REF_FN_ATTRS
void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put_many(&css->refcnt, n);
}
CGROUP_REF_EXPORT(css_put_many)








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * linux/cgroup-defs.h - basic definitions for cgroup
 *
 * This file provides basic type and interface.  Include this file directly
 * only if necessary to avoid cyclic dependencies.
 */
#ifndef _LINUX_CGROUP_DEFS_H
#define _LINUX_CGROUP_DEFS_H

#include <linux/limits.h>
#include <linux/list.h>
#include <linux/idr.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup-defs.h>
#include <linux/psi_types.h>

#ifdef CONFIG_CGROUPS

struct cgroup;
struct cgroup_root;
struct cgroup_subsys;
struct cgroup_taskset;
struct kernfs_node;
struct kernfs_ops;
struct kernfs_open_file;
struct seq_file;
struct poll_table_struct;

/*
 * Size limits for cgroup-related names.
 * NOTE(review): whether these counts include the terminating NUL is not
 * visible here; check the users before relying on either interpretation.
 */
#define MAX_CGROUP_TYPE_NAMELEN 32
#define MAX_CGROUP_ROOT_NAMELEN 64
#define MAX_CFTYPE_NAME                64

/*
 * define the enumeration of all cgroup subsystems
 *
 * SUBSYS(x) is defined only for the duration of the #include below, so that
 * each SUBSYS() entry in <linux/cgroup_subsys.h> expands to an "x_cgrp_id"
 * enumerator.  CGROUP_SUBSYS_COUNT follows the last entry and so equals
 * the number of entries (assuming that header consists solely of SUBSYS()
 * invocations).
 */
#define SUBSYS(_x) _x ## _cgrp_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
        CGROUP_SUBSYS_COUNT,
};
#undef SUBSYS

/*
 * bits in struct cgroup_subsys_state flags field
 *
 * Each value is a distinct single-bit mask, so several can be set in the
 * flags word at the same time and tested with '&'.
 */
enum {
        CSS_NO_REF        = (1 << 0), /* no reference counting for this css */
        CSS_ONLINE        = (1 << 1), /* between ->css_online() and ->css_offline() */
        CSS_RELEASED        = (1 << 2), /* refcnt reached zero, released */
        CSS_VISIBLE        = (1 << 3), /* css is visible to userland */
        CSS_DYING        = (1 << 4), /* css is dying */
};

/*
 * bits in struct cgroup flags field
 *
 * Unlike the css flags, these are consecutive enumerators (0, 1, 2, ...)
 * rather than masks.
 * NOTE(review): presumably they are bit numbers for the bitops API
 * (set_bit()/test_bit()); confirm against the users.
 */
enum {
        /* Control Group requires release notifications to userspace */
        CGRP_NOTIFY_ON_RELEASE,
        /*
         * Clone the parent's configuration when creating a new child
         * cpuset cgroup.  For historical reasons, this option can be
         * specified at mount time and thus is implemented here.
         */
        CGRP_CPUSET_CLONE_CHILDREN,

        /* Control group has to be frozen. */
        CGRP_FREEZE,

        /* Cgroup is frozen. */
        CGRP_FROZEN,
};

/*
 * cgroup_root->flags
 *
 * Single-bit masks.  Bit 0 and bits 5-15 are not assigned in this
 * enumeration.
 */
enum {
        CGRP_ROOT_NOPREFIX        = (1 << 1), /* mounted subsystems have no named prefix */
        CGRP_ROOT_XATTR                = (1 << 2), /* supports extended attributes */

        /*
         * Consider namespaces as delegation boundaries.  If this flag is
         * set, controller specific interface files in a namespace root
         * aren't writeable from inside the namespace.
         */
        CGRP_ROOT_NS_DELEGATE        = (1 << 3),

        /*
         * Reduce latencies on dynamic cgroup modifications such as task
         * migrations and controller on/offs by disabling percpu operation on
         * cgroup_threadgroup_rwsem. This makes hot path operations such as
         * forks and exits into the slow path and more expensive.
         *
         * Alleviate the contention between fork, exec, exit operations and
         * writing to cgroup.procs by taking a per threadgroup rwsem instead of
         * the global cgroup_threadgroup_rwsem. Fork and other operations
         * from threads in different thread groups no longer contend with
         * writing to cgroup.procs.
         *
         * The static usage pattern of creating a cgroup, enabling controllers,
         * and then seeding it with CLONE_INTO_CGROUP doesn't require write
         * locking cgroup_threadgroup_rwsem and thus doesn't benefit from
         * favordynmod.
         */
        CGRP_ROOT_FAVOR_DYNMODS = (1 << 4),

        /*
         * Enable cpuset controller in v1 cgroup to use v2 behavior.
         */
        CGRP_ROOT_CPUSET_V2_MODE = (1 << 16),

        /*
         * Enable legacy local memory.events.
         */
        CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17),

        /*
         * Enable recursive subtree protection
         */
        CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),

        /*
         * Enable hugetlb accounting for the memory controller.
         */
        CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),

        /*
         * Enable legacy local pids.events.
         */
        CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20),
};

/* cftype->flags */
enum {
        CFTYPE_ONLY_ON_ROOT        = (1 << 0),        /* only create on root cgrp */
        CFTYPE_NOT_ON_ROOT        = (1 << 1),        /* don't create on root cgrp */
        CFTYPE_NS_DELEGATABLE        = (1 << 2),        /* writeable beyond delegation boundaries */

        CFTYPE_NO_PREFIX        = (1 << 3),        /* (DON'T USE FOR NEW FILES) no subsys prefix */
        CFTYPE_WORLD_WRITABLE        = (1 << 4),        /* (DON'T USE FOR NEW FILES) S_IWUGO */
        CFTYPE_DEBUG                = (1 << 5),        /* create when cgroup_debug */

        /* internal flags, do not use outside cgroup core proper */
        __CFTYPE_ONLY_ON_DFL        = (1 << 16),        /* only on default hierarchy */
        __CFTYPE_NOT_ON_DFL        = (1 << 17),        /* not on default hierarchy */
        __CFTYPE_ADDED                = (1 << 18),
};

/*
 * Locking mode selector for cgroup attach operations.
 * NOTE(review): going by the names and the comments below, this chooses
 * between the global cgroup_threadgroup_rwsem, no lock, and a
 * per-threadgroup rwsem; the code acting on it is not visible here.
 */
enum cgroup_attach_lock_mode {
        /* Default */
        CGRP_ATTACH_LOCK_GLOBAL,

        /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */
        CGRP_ATTACH_LOCK_NONE,

        /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */
        CGRP_ATTACH_LOCK_PER_THREADGROUP,
};

/*
 * cgroup_file is the handle for a file instance created in a cgroup which
 * is used, for example, to generate file changed notifications.  This can
 * be obtained by setting cftype->file_offset.
 */
struct cgroup_file {
        /* do not access any fields from outside cgroup core */
        struct kernfs_node *kn;                /* kernfs node of the file */
        /*
         * NOTE(review): the two fields below presumably record the time of
         * the last notification and a timer for a deferred one; confirm
         * against cgroup core.
         */
        unsigned long notified_at;
        struct timer_list notify_timer;
};

/*
 * Per-subsystem/per-cgroup state maintained by the system.  This is the
 * fundamental structural building block that controllers deal with.
 *
 * Fields marked with "PI:" are public and immutable and may be accessed
 * directly without synchronization.
 */
struct cgroup_subsys_state {
        /* PI: the cgroup that this css is attached to */
        struct cgroup *cgroup;

        /* PI: the cgroup subsystem that this css is attached to */
        struct cgroup_subsys *ss;

        /* reference count - access via css_[try]get() and css_put() */
        struct percpu_ref refcnt;

        /*
         * Depending on the context, this field is initialized
         * via css_rstat_init() at different places:
         *
         * when css is associated with cgroup::self
         *   when css->cgroup is the root cgroup
         *     performed in cgroup_init()
         *   when css->cgroup is not the root cgroup
         *     performed in cgroup_create()
         * when css is associated with a subsystem
         *   when css->cgroup is the root cgroup
         *     performed in cgroup_init_subsys() in the non-early path
         *   when css->cgroup is not the root cgroup
         *     performed in css_create()
         */
        struct css_rstat_cpu __percpu *rstat_cpu;

        /*
         * siblings list anchored at the parent's ->children
         *
         * linkage is protected by cgroup_mutex or RCU
         */
        struct list_head sibling;
        struct list_head children;

        /*
         * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
         * matching css can be looked up using css_from_id().
         */
        int id;

        /* bitmask of the CSS_* flags (CSS_NO_REF, CSS_ONLINE, ...) */
        unsigned int flags;

        /*
         * Monotonically increasing unique serial number which defines a
         * uniform order among all csses.  It's guaranteed that all
         * ->children lists are in the ascending order of ->serial_nr and
         * used to allow interrupting and resuming iterations.
         */
        u64 serial_nr;

        /*
         * Incremented by online self and children.  Used to guarantee that
         * parents are not offlined before their children.
         */
        atomic_t online_cnt;

        /* percpu_ref killing and RCU release */
        struct work_struct destroy_work;
        struct rcu_work destroy_rwork;

        /*
         * PI: the parent css.  Placed here for cache proximity to following
         * fields of the containing structure.
         */
        struct cgroup_subsys_state *parent;

        /*
         * Keep track of total numbers of visible descendant CSSes.
         * The total number of dying CSSes is tracked in
         * css->cgroup->nr_dying_subsys[ssid].
         * Protected by cgroup_mutex.
         */
        int nr_descendants;

        /*
         * A singly-linked list of css structures to be rstat flushed.
         * This is a scratch field to be used exclusively by
         * css_rstat_flush().
         *
         * Protected by rstat_base_lock when css is cgroup::self.
         * Protected by css->ss->rstat_ss_lock otherwise.
         */
        struct cgroup_subsys_state *rstat_flush_next;
};

/*
 * A css_set is a structure holding pointers to a set of
 * cgroup_subsys_state objects. This saves space in the task struct
 * object and speeds up fork()/exit(), since a single inc/dec and a
 * list_add()/del() can bump the reference count on the entire cgroup
 * set for a task.
 */
struct css_set {
        /*
         * Set of subsystem states, one for each subsystem. This array is
         * immutable after creation apart from the init_css_set during
         * subsystem registration (at boot time).
         */
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

        /* reference count */
        refcount_t refcount;

        /*
         * For a domain cgroup, the following points to self.  If threaded,
         * to the matching cset of the nearest domain ancestor.  The
         * dom_cset provides access to the domain cgroup and its csses to
         * which domain level resource consumptions should be charged.
         */
        struct css_set *dom_cset;

        /* the default cgroup associated with this css_set */
        struct cgroup *dfl_cgrp;

        /* internal task count, protected by css_set_lock */
        int nr_tasks;

        /*
         * Lists running through all tasks using this cgroup group.
         * mg_tasks lists tasks which belong to this cset but are in the
         * process of being migrated out or in.  Protected by
         * css_set_lock, but, during migration, once tasks are moved to
         * mg_tasks, it can be read safely while holding cgroup_mutex.
         *
         * NOTE(review): dying_tasks is not described above; presumably it
         * holds tasks of this cset that are exiting - confirm.
         */
        struct list_head tasks;
        struct list_head mg_tasks;
        struct list_head dying_tasks;

        /* all css_task_iters currently walking this cset */
        struct list_head task_iters;

        /*
         * On the default hierarchy, ->subsys[ssid] may point to a css
         * attached to an ancestor instead of the cgroup this css_set is
         * associated with.  The following node is anchored at
         * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
         * iterate through all css's attached to a given cgroup.
         */
        struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];

        /* all threaded csets whose ->dom_cset points to this cset */
        struct list_head threaded_csets;
        struct list_head threaded_csets_node;

        /*
         * List running through all cgroup groups in the same hash
         * slot. Protected by css_set_lock
         */
        struct hlist_node hlist;

        /*
         * List of cgrp_cset_links pointing at cgroups referenced from this
         * css_set.  Protected by css_set_lock.
         */
        struct list_head cgrp_links;

        /*
         * List of csets participating in the on-going migration either as
         * source or destination.  Protected by cgroup_mutex.
         */
        struct list_head mg_src_preload_node;
        struct list_head mg_dst_preload_node;
        struct list_head mg_node;

        /*
         * If this cset is acting as the source of migration the following
         * two fields are set.  mg_src_cgrp and mg_dst_cgrp are
         * respectively the source and destination cgroups of the on-going
         * migration.  mg_dst_cset is the destination cset the target tasks
         * on this cset should be migrated to.  Protected by cgroup_mutex.
         */
        struct cgroup *mg_src_cgrp;
        struct cgroup *mg_dst_cgrp;
        struct css_set *mg_dst_cset;

        /* dead and being drained, ignore for migration */
        bool dead;

        /* For RCU-protected deletion */
        struct rcu_head rcu_head;
};

/*
 * Basic statistics record used by the rstat per-CPU and snapshot fields.
 * NOTE(review): units and exact meaning of forceidle_sum and ntime are not
 * visible here; confirm against the accounting code.
 */
struct cgroup_base_stat {
        struct task_cputime cputime;

#ifdef CONFIG_SCHED_CORE
        /* only present when core scheduling is configured in */
        u64 forceidle_sum;
#endif
        u64 ntime;
};

/*
 * rstat - cgroup scalable recursive statistics.  Accounting is done
 * per-cpu in css_rstat_cpu which is then lazily propagated up the
 * hierarchy on reads.
 *
 * When a stat gets updated, the css_rstat_cpu and its ancestors are
 * linked into the updated tree.  On the following read, propagation only
 * considers and consumes the updated tree.  This makes reading O(the
 * number of descendants which have been active since last read) instead of
 * O(the total number of descendants).
 *
 * This is important because there can be a lot of (draining) cgroups which
 * aren't active and stat may be read frequently.  The combination can
 * become very expensive.  By propagating selectively, increasing reading
 * frequency decreases the cost of each read.
 *
 * This struct hosts both the fields which implement the above -
 * updated_children and updated_next.
 */
struct css_rstat_cpu {
        /*
         * Child cgroups with stat updates on this cpu since the last read
         * are linked on the parent's ->updated_children through
         * ->updated_next.  The ->updated_children list is terminated by its
         * container css rather than by NULL.
         */
        struct cgroup_subsys_state *updated_children;
        struct cgroup_subsys_state *updated_next;        /* NULL if not on the list */

        struct llist_node lnode;                /* lockless list for update */
        /* back pointer, presumably to the css this per-cpu state belongs to */
        struct cgroup_subsys_state *owner;        /* back pointer */
};

/*
 * This struct hosts the fields which track basic resource statistics on
 * top of it - bsync, bstat and last_bstat.
 */
struct cgroup_rstat_base_cpu {
        /*
         * ->bsync protects ->bstat.  These are the only fields which get
         * updated in the hot path.
         */
        struct u64_stats_sync bsync;
        struct cgroup_base_stat bstat;

        /*
         * Snapshot (presumably of ->bstat) taken at the last reading.  It is
         * used to calculate the deltas to propagate to the global counters.
         */
        struct cgroup_base_stat last_bstat;

        /*
         * This field is used to record the cumulative per-cpu time of
         * the cgroup and its descendants. Currently it can be read via
         * eBPF/drgn etc, and we are still trying to determine how to
         * expose it in the cgroupfs interface.
         */
        struct cgroup_base_stat subtree_bstat;

        /*
         * Snapshot (presumably of ->subtree_bstat) taken at the last
         * reading.  It is used to calculate the deltas to propagate to the
         * per-cpu subtree_bstat.
         */
        struct cgroup_base_stat last_subtree_bstat;
};

/*
 * Freezer bookkeeping for one cgroup; embedded in struct cgroup as
 * ->freezer.  Holds the requested and effective freeze flags, counters of
 * frozen descendants/tasks and freeze timing data.
 */
struct cgroup_freezer_state {
        /* Should the cgroup and its descendants be frozen. */
        bool freeze;

        /* Should the cgroup actually be frozen? */
        bool e_freeze;

        /* Fields below are protected by css_set_lock */

        /* Number of frozen descendant cgroups */
        int nr_frozen_descendants;

        /*
         * Number of tasks, which are counted as frozen:
         * frozen, SIGSTOPped, and PTRACEd.
         */
        int nr_frozen_tasks;

        /* Sequence counter protecting the consistency of the freeze time data below */
        seqcount_spinlock_t freeze_seq;

        /*
         * Most recent time the cgroup was requested to freeze.
         * Accesses guarded by freeze_seq counter. Writes serialized
         * by css_set_lock.
         */
        u64 freeze_start_nsec;

        /*
         * Total duration the cgroup has spent freezing.
         * Accesses guarded by freeze_seq counter. Writes serialized
         * by css_set_lock.
         */
        u64 frozen_nsec;
};

/*
 * Per-cgroup state: position in the hierarchy, descendant and population
 * counters, kernfs file handles, per-subsystem css pointers, rstat and
 * basic statistics, pidlists, and the psi, bpf and freezer state.  The
 * struct ends with a flexible array of ancestor pointers.
 */
struct cgroup {
        /* self css with NULL ->ss, points back to this cgroup */
        struct cgroup_subsys_state self;

        unsigned long flags;                /* "unsigned long" so bitops work */

        /*
         * The depth this cgroup is at.  The root is at depth zero and each
         * step down the hierarchy increments the level.  This along with
         * ancestors[] can determine whether a given cgroup is a
         * descendant of another without traversing the hierarchy.
         */
        int level;

        /* Maximum allowed descendant tree depth */
        int max_depth;

        /*
         * Keep track of total numbers of visible and dying descendant
         * cgroups.  Dying cgroups are cgroups which were deleted by a user,
         * but are still existing because someone else is holding a reference.
         * max_descendants is a maximum allowed number of descendant cgroups.
         *
         * nr_descendants and nr_dying_descendants are protected
         * by cgroup_mutex and css_set_lock. It's fine to read them holding
         * any of cgroup_mutex and css_set_lock; for writing both locks
         * should be held.
         */
        int nr_descendants;
        int nr_dying_descendants;
        int max_descendants;

        /*
         * Each non-empty css_set associated with this cgroup contributes
         * one to nr_populated_csets.  The counter is zero iff this cgroup
         * doesn't have any tasks.
         *
         * All children which have non-zero nr_populated_csets and/or
         * nr_populated_children of their own contribute one to either
         * nr_populated_domain_children or nr_populated_threaded_children
         * depending on their type.  Each counter is zero iff all cgroups
         * of the type in the subtree proper don't have any tasks.
         */
        int nr_populated_csets;
        int nr_populated_domain_children;
        int nr_populated_threaded_children;

        int nr_threaded_children;        /* # of live threaded child cgroups */

        /* sequence number for cgroup.kill, serialized by css_set_lock. */
        unsigned int kill_seq;

        struct kernfs_node *kn;                /* cgroup kernfs entry */
        struct cgroup_file procs_file;        /* handle for "cgroup.procs" */
        struct cgroup_file events_file;        /* handle for "cgroup.events" */

        /* handles for "{cpu,memory,io,irq}.pressure" */
        struct cgroup_file psi_files[NR_PSI_RESOURCES];

        /*
         * The bitmask of subsystems enabled on the child cgroups.
         * ->subtree_control is the one configured through
         * "cgroup.subtree_control" while ->subtree_ss_mask is the effective
         * one which may have more subsystems enabled.  Controller knobs
         * are made available iff it's enabled in ->subtree_control.
         *
         * NOTE(review): the old_* variants presumably hold the previous
         * masks while a change is being applied - confirm against users.
         */
        u16 subtree_control;
        u16 subtree_ss_mask;
        u16 old_subtree_control;
        u16 old_subtree_ss_mask;

        /* Private pointers for each registered subsystem */
        struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];

        /*
         * Keep track of total number of dying CSSes at and below this cgroup.
         * Protected by cgroup_mutex.
         */
        int nr_dying_subsys[CGROUP_SUBSYS_COUNT];

        /* The hierarchy root this cgroup belongs to */
        struct cgroup_root *root;

        /*
         * List of cgrp_cset_links pointing at css_sets with tasks in this
         * cgroup.  Protected by css_set_lock.
         */
        struct list_head cset_links;

        /*
         * On the default hierarchy, a css_set for a cgroup with some
         * subsys disabled will point to css's which are associated with
         * the closest ancestor which has the subsys enabled.  The
         * following lists all css_sets which point to this cgroup's css
         * for the given subsystem.
         */
        struct list_head e_csets[CGROUP_SUBSYS_COUNT];

        /*
         * If !threaded, self.  If threaded, it points to the nearest
         * domain ancestor.  Inside a threaded subtree, cgroups are exempt
         * from process granularity and no-internal-task constraint.
         * Domain level resource consumptions which aren't tied to a
         * specific task are charged to the dom_cgrp.
         */
        struct cgroup *dom_cgrp;
        struct cgroup *old_dom_cgrp;                /* used while enabling threaded */

        /*
         * Depending on the context, this field is initialized via
         * css_rstat_init() at different places:
         *
         * when cgroup is the root cgroup
         *   performed in cgroup_setup_root()
         * otherwise
         *   performed in cgroup_create()
         */
        struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu;

        /*
         * Add padding to keep the read mostly rstat per-cpu pointer on a
         * different cacheline than the following *bstat fields which can have
         * frequent updates.
         */
        CACHELINE_PADDING(_pad_);

        /* cgroup basic resource statistics */
        struct cgroup_base_stat last_bstat;
        struct cgroup_base_stat bstat;
        struct prev_cputime prev_cputime;        /* for printing out cputime */

        /*
         * list of pidlists, up to two for each namespace (one for procs, one
         * for tasks); created on demand.
         */
        struct list_head pidlists;
        struct mutex pidlist_mutex;

        /* used to wait for offlining of csses */
        wait_queue_head_t offline_waitq;

        /* used to schedule release agent */
        struct work_struct release_agent_work;

        /* used to track pressure stalls */
        struct psi_group *psi;

        /* used to store eBPF programs */
        struct cgroup_bpf bpf;

        /* Used to store internal freezer state */
        struct cgroup_freezer_state freezer;

#ifdef CONFIG_BPF_SYSCALL
        /* Only present with CONFIG_BPF_SYSCALL; RCU-annotated pointer */
        struct bpf_local_storage __rcu  *bpf_cgrp_storage;
#endif

        /*
         * All ancestors including self.  This is a flexible array member
         * and therefore must stay the last field of the struct.
         */
        struct cgroup *ancestors[];
};

/*
 * A cgroup_root represents the root of a cgroup hierarchy, and may be
 * associated with a kernfs_root to form an active hierarchy.  This is
 * internal to cgroup core.  Don't access directly from controllers.
 */
struct cgroup_root {
        /* kernfs_root this hierarchy is associated with when it is active */
        struct kernfs_root *kf_root;

        /* The bitmask of subsystems attached to this hierarchy */
        unsigned int subsys_mask;

        /* Unique id for this hierarchy. */
        int hierarchy_id;

        /* A list running through the active hierarchies */
        struct list_head root_list;
        struct rcu_head rcu;        /* Must be near the top */

        /*
         * The root cgroup. The containing cgroup_root will be destroyed on its
         * release. cgrp->ancestors[0] will be used overflowing into the
         * following field. cgrp_ancestor_storage must immediately follow.
         */
        struct cgroup cgrp;

        /*
         * Must follow cgrp: it provides the storage for cgrp->ancestors[0],
         * because ancestors[] is a flexible array at the end of struct cgroup.
         */
        struct cgroup *cgrp_ancestor_storage;

        /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
        atomic_t nr_cgrps;

        /* Hierarchy-specific flags */
        unsigned int flags;

        /* The path to use for release notifications. */
        char release_agent_path[PATH_MAX];

        /* The name for this hierarchy - may be empty */
        char name[MAX_CGROUP_ROOT_NAMELEN];
};

/*
 * struct cftype: handler definitions for cgroup control files
 *
 * When reading/writing to a file:
 *        - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
 *        - the 'cftype' of the file is file->f_path.dentry->d_fsdata
 */
struct cftype {
        /*
         * Name of the subsystem is prepended in cgroup_file_name().
         * Zero length string indicates end of cftype array.
         */
        char name[MAX_CFTYPE_NAME];
        /* NOTE(review): opaque per-file value; nothing visible here interprets it */
        unsigned long private;

        /*
         * The maximum length of string, excluding trailing nul, that can
         * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
         */
        size_t max_write_len;

        /* CFTYPE_* flags */
        unsigned int flags;

        /*
         * If non-zero, should contain the offset from the start of css to
         * a struct cgroup_file field.  cgroup will record the handle of
         * the created file into it.  The recorded handle can be used as
         * long as the containing css remains accessible.
         */
        unsigned int file_offset;

        /*
         * Fields used for internal bookkeeping.  Initialized automatically
         * during registration.
         */
        struct cgroup_subsys *ss;        /* NULL for cgroup core files */
        struct list_head node;                /* anchored at ss->cfts */
        struct kernfs_ops *kf_ops;

        /*
         * Hooks taking the kernfs open file.  Presumably called when the
         * file is opened / released - confirm against the kernfs glue.
         */
        int (*open)(struct kernfs_open_file *of);
        void (*release)(struct kernfs_open_file *of);

        /*
         * read_u64() is a shortcut for the common case of returning a
         * single integer. Use it in place of read()
         */
        u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
        /*
         * read_s64() is a signed version of read_u64()
         */
        s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);

        /* generic seq_file read interface */
        int (*seq_show)(struct seq_file *sf, void *v);

        /* optional ops, implement all or none */
        void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
        void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
        void (*seq_stop)(struct seq_file *sf, void *v);

        /*
         * write_u64() is a shortcut for the common case of accepting
         * a single integer (as parsed by simple_strtoull) from
         * userspace. Use in place of write(); return 0 or error.
         */
        int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
                         u64 val);
        /*
         * write_s64() is a signed version of write_u64()
         */
        int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
                         s64 val);

        /*
         * write() is the generic write callback which maps directly to
         * kernfs write operation and overrides all other operations.
         * Maximum write size is determined by ->max_write_len.  Use
         * of_css/cft() to access the associated css and cft.
         */
        ssize_t (*write)(struct kernfs_open_file *of,
                         char *buf, size_t nbytes, loff_t off);

        /* Poll hook; receives the kernfs open file and the poll table */
        __poll_t (*poll)(struct kernfs_open_file *of,
                         struct poll_table_struct *pt);

        /* NOTE(review): presumably a per-cftype lockdep class key - confirm usage */
        struct lock_class_key        lockdep_key;
};

/*
 * Control Group subsystem type.
 * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details
 */
struct cgroup_subsys {
        /*
         * Callbacks operating on a css: allocation from a parent css and
         * the online/offline/released/free/reset/killed notifications,
         * plus per-cpu rstat flushing and stat output into a seq_file.
         * NOTE(review): which of these are optional is not visible here.
         */
        struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
        int (*css_online)(struct cgroup_subsys_state *css);
        void (*css_offline)(struct cgroup_subsys_state *css);
        void (*css_released)(struct cgroup_subsys_state *css);
        void (*css_free)(struct cgroup_subsys_state *css);
        void (*css_reset)(struct cgroup_subsys_state *css);
        void (*css_killed)(struct cgroup_subsys_state *css);
        void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
        int (*css_extra_stat_show)(struct seq_file *seq,
                                   struct cgroup_subsys_state *css);
        int (*css_local_stat_show)(struct seq_file *seq,
                                   struct cgroup_subsys_state *css);

        /*
         * Callbacks operating on tasks: attach of a taskset (with a
         * can_/cancel_ pair around it), fork (likewise), exit and release.
         * The can_* variants return int and so can report an error.
         */
        int (*can_attach)(struct cgroup_taskset *tset);
        void (*cancel_attach)(struct cgroup_taskset *tset);
        void (*attach)(struct cgroup_taskset *tset);
        int (*can_fork)(struct task_struct *task,
                        struct css_set *cset);
        void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
        void (*fork)(struct task_struct *task);
        void (*exit)(struct task_struct *task);
        void (*release)(struct task_struct *task);
        /* Receives the root css; presumably called when bound to a hierarchy - confirm */
        void (*bind)(struct cgroup_subsys_state *root_css);

        /* NOTE(review): presumably requests initialization early during boot - confirm */
        bool early_init:1;

        /*
         * If %true, the controller, on the default hierarchy, doesn't show
         * up in "cgroup.controllers" or "cgroup.subtree_control", is
         * implicitly enabled on all cgroups on the default hierarchy, and
         * bypasses the "no internal process" constraint.  This is for
         * utility type controllers which is transparent to userland.
         *
         * An implicit controller can be stolen from the default hierarchy
         * anytime and thus must be okay with offline csses from previous
         * hierarchies coexisting with csses for the current one.
         */
        bool implicit_on_dfl:1;

        /*
         * If %true, the controller, supports threaded mode on the default
         * hierarchy.  In a threaded subtree, both process granularity and
         * no-internal-process constraint are ignored and a threaded
         * controllers should be able to handle that.
         *
         * Note that as an implicit controller is automatically enabled on
         * all cgroups on the default hierarchy, it should also be
         * threaded.  implicit && !threaded is not supported.
         */
        bool threaded:1;

        /* the following two fields are initialized automatically during boot */
        int id;
        const char *name;

        /* optional, initialized automatically during boot if not set */
        const char *legacy_name;

        /* link to parent, protected by cgroup_lock() */
        struct cgroup_root *root;

        /* idr for css->id */
        struct idr css_idr;

        /*
         * List of cftypes.  Each entry is the first entry of an array
         * terminated by zero length name.
         */
        struct list_head cfts;

        /*
         * Base cftypes which are automatically registered.  The two can
         * point to the same array.
         */
        struct cftype *dfl_cftypes;        /* for the default hierarchy */
        struct cftype *legacy_cftypes;        /* for the legacy hierarchies */

        /*
         * A subsystem may depend on other subsystems.  When such subsystem
         * is enabled on a cgroup, the depended-upon subsystems are enabled
         * together if available.  Subsystems enabled due to dependency are
         * not visible to userland until explicitly enabled.  The following
         * specifies the mask of subsystems that this one depends on.
         */
        unsigned int depends_on;

        /* NOTE(review): presumably serializes rstat work for this subsystem - confirm */
        spinlock_t rstat_ss_lock;
        struct llist_head __percpu *lhead; /* lockless update list head */
};

extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
extern bool cgroup_enable_per_threadgroup_rwsem;

/*
 * NOTE(review): the name suggests a peak value tracked per open file
 * ("of"); the users are not visible here - confirm before relying on it.
 */
struct cgroup_of_peak {
        unsigned long                value;        /* the tracked value */
        struct list_head        list;        /* list linkage; the list head is defined elsewhere */
};

/**
 * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
 * @tsk: target task
 *
 * Allows cgroup operations to synchronize against threadgroup changes
 * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when
 * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
 */
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
        /* The global percpu rwsem is always read-locked first. */
        percpu_down_read(&cgroup_threadgroup_rwsem);

        /* Per-threadgroup mode is off: the global lock is all there is. */
        if (!cgroup_enable_per_threadgroup_rwsem)
                return;

        /* Additionally read-lock the rwsem of @tsk's own thread group. */
        down_read(&tsk->signal->cgroup_threadgroup_rwsem);
}

/**
 * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
 * @tsk: target task
 *
 * Counterpart of cgroup_threadgroup_change_begin().
 */
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
{
        /*
         * Release in the reverse order of cgroup_threadgroup_change_begin():
         * first the per-threadgroup rwsem (only when that mode is enabled),
         * then the global percpu rwsem.
         */
        if (cgroup_enable_per_threadgroup_rwsem)
                up_read(&tsk->signal->cgroup_threadgroup_rwsem);
        percpu_up_read(&cgroup_threadgroup_rwsem);
}

#else        /* CONFIG_CGROUPS */

#define CGROUP_SUBSYS_COUNT 0

/*
 * !CONFIG_CGROUPS variant: there is no lock to take.  might_sleep() is
 * still called, presumably so that callers get the same sleeping-context
 * check as with the real implementation.
 */
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
        might_sleep();
}

/* !CONFIG_CGROUPS variant: nothing was taken in _begin(), so this is a no-op. */
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}

#endif        /* CONFIG_CGROUPS */

#ifdef CONFIG_SOCK_CGROUP_DATA

/*
 * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
 * per-socket cgroup information except for memcg association.
 *
 * On legacy hierarchies, net_prio and net_cls controllers directly
 * set attributes on each sock which can then be tested by the network
 * layer. On the default hierarchy, each sock is associated with the
 * cgroup it was created in and the networking layer can match the
 * cgroup directly.
 */
struct sock_cgroup_data {
        /* v2 (default hierarchy): the cgroup the sock is associated with */
        struct cgroup        *cgroup; /* v2 */
#ifdef CONFIG_CGROUP_NET_CLASSID
        /* v1 (legacy hierarchies): attribute for net_cls, only with CONFIG_CGROUP_NET_CLASSID */
        u32                classid; /* v1 */
#endif
#ifdef CONFIG_CGROUP_NET_PRIO
        /* v1 (legacy hierarchies): attribute for net_prio, only with CONFIG_CGROUP_NET_PRIO */
        u16                prioidx; /* v1 */
#endif
};

#ifdef CONFIG_CGROUP_NET_PRIO
/* net_prio is built in: return the priority index stored in @skcd. */
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
        return READ_ONCE(skcd->prioidx);
}
#else
/* net_prio is compiled out: @skcd has no ->prioidx, always return 1. */
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
        return 1;
}
#endif

#ifdef CONFIG_CGROUP_NET_CLASSID
/*
 * Return the classid stored in @skcd.  The field is read with READ_ONCE().
 * Only defined when CONFIG_CGROUP_NET_CLASSID is set.
 */
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
        return READ_ONCE(skcd->classid);
}
#endif

#ifdef CONFIG_CGROUP_NET_PRIO
/* net_prio is built in: store @prioidx into @skcd. */
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
                                           u16 prioidx)
{
        WRITE_ONCE(skcd->prioidx, prioidx);
}
#else
/* net_prio is compiled out: there is no field to update, do nothing. */
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
                                           u16 prioidx)
{
}
#endif

#ifdef CONFIG_CGROUP_NET_CLASSID
/*
 * Store @classid into @skcd.  The field is written with WRITE_ONCE().
 * Only defined when CONFIG_CGROUP_NET_CLASSID is set.
 */
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
                                           u32 classid)
{
        WRITE_ONCE(skcd->classid, classid);
}
#endif

#else        /* CONFIG_SOCK_CGROUP_DATA */

/* !CONFIG_SOCK_CGROUP_DATA: empty placeholder so the type name still exists. */
struct sock_cgroup_data {
};

#endif        /* CONFIG_SOCK_CGROUP_DATA */

#endif        /* _LINUX_CGROUP_DEFS_H */



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __INCLUDE_LINUX_OOM_H
#define __INCLUDE_LINUX_OOM_H


#include <linux/sched/signal.h>
#include <linux/types.h>
#include <linux/nodemask.h>
#include <uapi/linux/oom.h>
#include <linux/mm.h> /* VM_FAULT* */

struct zonelist;
struct notifier_block;
struct mem_cgroup;
struct task_struct;

/*
 * Kind of constraint recorded in oom_control.constraint, where it is used
 * to print the constraint info.
 * NOTE(review): the names suggest "none", cpuset, memory policy and memory
 * cgroup constrained OOMs - confirm against the implementation.
 */
enum oom_constraint {
        CONSTRAINT_NONE,
        CONSTRAINT_CPUSET,
        CONSTRAINT_MEMORY_POLICY,
        CONSTRAINT_MEMCG,
};

/*
 * Details of the page allocation that triggered the oom killer that are used to
 * determine what should be killed.
 */
struct oom_control {
        /* Used to determine cpuset */
        struct zonelist *zonelist;

        /* Used to determine mempolicy */
        nodemask_t *nodemask;

        /* Memory cgroup in which oom is invoked, or NULL for global oom */
        struct mem_cgroup *memcg;

        /*
         * Used to determine cpuset and node locality requirement.
         * const: can only be set when the struct is initialized.
         */
        const gfp_t gfp_mask;

        /*
         * order == -1 means the oom kill is required by sysrq, otherwise only
         * for display purposes.
         * const: can only be set when the struct is initialized.
         */
        const int order;

        /* Used by oom implementation, do not set */
        unsigned long totalpages;
        struct task_struct *chosen;
        long chosen_points;

        /* Used to print the constraint info. */
        enum oom_constraint constraint;
};

extern struct mutex oom_lock;
extern struct mutex oom_adj_mutex;

/* Set the oom_flag_origin flag in the signal struct of the current task. */
static inline void set_current_oom_origin(void)
{
        current->signal->oom_flag_origin = true;
}

/* Clear the oom_flag_origin flag in the signal struct of the current task. */
static inline void clear_current_oom_origin(void)
{
        current->signal->oom_flag_origin = false;
}

/*
 * Return whether the oom_flag_origin flag is set in @p's signal struct,
 * i.e. the flag toggled by set/clear_current_oom_origin().
 */
static inline bool oom_task_origin(const struct task_struct *p)
{
        return p->signal->oom_flag_origin;
}

static inline bool tsk_is_oom_victim(struct task_struct * tsk)
{
        return tsk->signal->oom_mm;
}

/*
 * Checks whether a page fault on the given mm is still reliable.
 * This is no longer true if the oom reaper started to reap the
 * address space which is reflected by MMF_UNSTABLE flag set in
 * the mm. At that moment any !shared mapping would lose the content
 * and could cause a memory corruption (zero pages instead of the
 * original content).
 *
 * User should call this before establishing a page table entry for
 * a !shared mapping and under the proper page table lock.
 *
 * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
 */
static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
{
        /* MMF_UNSTABLE set on @mm: fail with SIGBUS; otherwise return 0. */
        return unlikely(mm_flags_test(MMF_UNSTABLE, mm)) ? VM_FAULT_SIGBUS : 0;
}

long oom_badness(struct task_struct *p,
                unsigned long totalpages);

extern bool out_of_memory(struct oom_control *oc);

extern void exit_oom_victim(void);

extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);

extern bool oom_killer_disable(signed long timeout);
extern void oom_killer_enable(void);

extern struct task_struct *find_lock_task_mm(struct task_struct *p);

#endif /* __INCLUDE_LINUX_OOM_H */




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
// SPDX-License-Identifier: GPL-2.0
/*
 * device.h - generic, centralized driver model
 *
 * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2008-2009 Novell Inc.
 *
 * See Documentation/driver-api/driver-model/ for more information.
 */

#ifndef _DEVICE_H_
#define _DEVICE_H_

#include <linux/dev_printk.h>
#include <linux/energy_model.h>
#include <linux/ioport.h>
#include <linux/kobject.h>
#include <linux/klist.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/gfp.h>
#include <linux/device/bus.h>
#include <linux/device/class.h>
#include <linux/device/devres.h>
#include <linux/device/driver.h>
#include <linux/cleanup.h>
#include <asm/device.h>

struct device;
struct device_private;
struct device_driver;
struct driver_private;
struct module;
struct class;
struct subsys_private;
struct device_node;
struct fwnode_handle;
struct iommu_group;
struct dev_pin_info;
struct dev_iommu;
struct msi_device_data;

/**
 * struct subsys_interface - interfaces to device functions
 * @name:       name of the device function
 * @subsys:     subsystem of the devices to attach to
 * @node:       the list of functions registered at the subsystem
 * @add_dev:    device hookup to device function handler; receives the device
 *              and this interface and returns an int status
 * @remove_dev: counterpart of @add_dev; receives the device and this interface
 *              and returns nothing (presumably called when the device or the
 *              interface goes away - confirm against the bus core)
 *
 * Simple interfaces attached to a subsystem. Multiple interfaces can
 * attach to a subsystem and its devices. Unlike drivers, they do not
 * exclusively claim or control devices. Interfaces usually represent
 * a specific functionality of a subsystem/class of devices.
 */
struct subsys_interface {
        const char *name;
        const struct bus_type *subsys;
        struct list_head node;
        int (*add_dev)(struct device *dev, struct subsys_interface *sif);
        void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
};

/*
 * Registration entry points for subsystem interfaces and for subsystems
 * themselves. They are only declared here; the definitions live outside
 * this header. The register variants report their status as an int.
 */
int subsys_interface_register(struct subsys_interface *sif);
void subsys_interface_unregister(struct subsys_interface *sif);

int subsys_system_register(const struct bus_type *subsys,
                           const struct attribute_group **groups);
int subsys_virtual_register(const struct bus_type *subsys,
                            const struct attribute_group **groups);

/*
 * The type of device that a "struct device" is embedded in. A class
 * or bus can contain devices of different types
 * like "partitions" and "disks", "mouse" and "event".
 * This identifies the device type and carries type-specific
 * information, equivalent to the kobj_type of a kobject.
 * If "name" is specified, the uevent will contain it in
 * the DEVTYPE variable.
 *
 * Members:
 *   name    - type name (see DEVTYPE above).
 *   groups  - array of attribute group pointers for this type.
 *   uevent  - callback given the device and a kobj_uevent_env; returns int.
 *   devnode - callback returning a char * for the device; it is also handed
 *             non-const mode/uid/gid pointers.
 *   release - callback given the device; returns nothing.
 *   pm      - power management operations for this type.
 */
struct device_type {
        const char *name;
        const struct attribute_group **groups;
        int (*uevent)(const struct device *dev, struct kobj_uevent_env *env);
        char *(*devnode)(const struct device *dev, umode_t *mode,
                         kuid_t *uid, kgid_t *gid);
        void (*release)(struct device *dev);

        const struct dev_pm_ops *pm;
};

/**
 * struct device_attribute - Interface for exporting device attributes.
 * @attr: sysfs attribute definition.
 * @show: Show handler; receives the device, this attribute and an output
 *        buffer, and returns an ssize_t.
 * @store: Store handler; receives the device, this attribute, an input
 *         buffer and a size_t count, and returns an ssize_t.
 */
struct device_attribute {
        struct attribute        attr;
        ssize_t (*show)(struct device *dev, struct device_attribute *attr,
                        char *buf);
        ssize_t (*store)(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
};

/**
 * struct dev_ext_attribute - Exported device attribute with extra context.
 * @attr: Exported device attribute. It is the first member, so positional
 *        initializers list it before @var.
 * @var: Pointer to context (untyped; its meaning is up to the handlers).
 */
struct dev_ext_attribute {
        struct device_attribute attr;
        void *var;
};

/*
 * Generic show/store handlers. They have the signatures required by
 * struct device_attribute and are the handlers plugged in by
 * DEVICE_ULONG_ATTR(), DEVICE_INT_ATTR(), DEVICE_BOOL_ATTR() and
 * DEVICE_STRING_ATTR_RO() in this header. There is no store handler for
 * strings.
 */
ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr,
                          char *buf);
ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count);
ssize_t device_show_int(struct device *dev, struct device_attribute *attr,
                        char *buf);
ssize_t device_store_int(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
                        char *buf);
ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
ssize_t device_show_string(struct device *dev, struct device_attribute *attr,
                           char *buf);

/**
 * DEVICE_ATTR - Define a device attribute.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_show: Show handler. Optional, but mandatory if attribute is readable.
 * @_store: Store handler. Optional, but mandatory if attribute is writable.
 *
 * Convenience macro for defining a struct device_attribute. The variable is
 * named ``dev_attr_<_name>`` and the macro adds no storage-class specifier,
 * so one may be written in front of it.
 *
 * For example, ``DEVICE_ATTR(foo, 0644, foo_show, foo_store);`` expands to:
 *
 * .. code-block:: c
 *
 *        struct device_attribute dev_attr_foo = {
 *                .attr        = { .name = "foo", .mode = 0644 },
 *                .show        = foo_show,
 *                .store        = foo_store,
 *        };
 */
#define DEVICE_ATTR(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)

/**
 * DEVICE_ATTR_PREALLOC - Define a preallocated device attribute.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_show: Show handler. Optional, but mandatory if attribute is readable.
 * @_store: Store handler. Optional, but mandatory if attribute is writable.
 *
 * Like DEVICE_ATTR(), but ``SYSFS_PREALLOC`` is set on @_mode. The
 * initializer comes from __ATTR_PREALLOC() and the variable is named
 * ``dev_attr_<_name>``.
 */
#define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name = \
                __ATTR_PREALLOC(_name, _mode, _show, _store)

/**
 * DEVICE_ATTR_RW - Define a read-write device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0644, @_show is <_name>_show,
 * and @_store is <_name>_store. The variable is named ``dev_attr_<_name>``.
 */
#define DEVICE_ATTR_RW(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RW(_name)

/**
 * DEVICE_ATTR_ADMIN_RW - Define an admin-only read-write device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR_RW(), but @_mode is 0600. The variable is named
 * ``dev_attr_<_name>``.
 */
#define DEVICE_ATTR_ADMIN_RW(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600)

/**
 * DEVICE_ATTR_RO - Define a readable device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0444 and @_show is <_name>_show.
 * The variable is named ``dev_attr_<_name>``.
 */
#define DEVICE_ATTR_RO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RO(_name)

/**
 * DEVICE_ATTR_ADMIN_RO - Define an admin-only readable device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR_RO(), but @_mode is 0400. The variable is named
 * ``dev_attr_<_name>``.
 */
#define DEVICE_ATTR_ADMIN_RO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400)

/**
 * DEVICE_ATTR_WO - Define an admin-only writable device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0200 and @_store is <_name>_store.
 * The variable is named ``dev_attr_<_name>``.
 */
#define DEVICE_ATTR_WO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_WO(_name)

/**
 * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of unsigned long.
 *
 * Like DEVICE_ATTR(), but @_show and @_store are automatically provided
 * such that reads and writes to the attribute from userspace affect @_var.
 *
 * The result is a struct dev_ext_attribute named ``dev_attr_<_name>`` whose
 * handlers are device_show_ulong()/device_store_ulong() and whose context
 * pointer is the address of @_var.
 */
#define DEVICE_ULONG_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) }

/**
 * DEVICE_INT_ATTR - Define a device attribute backed by an int.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of int.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is an int and the handlers are
 * device_show_int()/device_store_int().
 */
#define DEVICE_INT_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) }

/**
 * DEVICE_BOOL_ATTR - Define a device attribute backed by a bool.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of bool.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is a bool and the handlers are
 * device_show_bool()/device_store_bool().
 */
#define DEVICE_BOOL_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) }

/**
 * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string.
 * @_name: Attribute name.
 * @_mode: File mode. The write permission bits (0222) are masked off.
 * @_var: Identifier of string. It is stored as the context pointer as-is,
 *        without taking its address.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is a string. Because the length of the
 * string allocation is unknown, the attribute must be read-only: the show
 * handler is device_show_string() and the store handler is NULL.
 */
#define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) }

/**
 * DEVICE_ATTR_IGNORE_LOCKDEP - Define a device attribute via
 *                              __ATTR_IGNORE_LOCKDEP().
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_show: Show handler.
 * @_store: Store handler.
 *
 * Takes the same arguments as DEVICE_ATTR() and also defines
 * ``dev_attr_<_name>``, but builds the initializer with
 * __ATTR_IGNORE_LOCKDEP() instead of __ATTR().
 */
#define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name =                \
                __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)

/*
 * Creation and removal of attribute files for a device, in regular
 * (struct device_attribute) and binary (struct bin_attribute) flavours.
 * Declarations only. device_create_bin_file() is __must_check, so its
 * return value has to be consumed by the caller.
 */
int device_create_file(struct device *device,
                       const struct device_attribute *entry);
void device_remove_file(struct device *dev,
                        const struct device_attribute *attr);
bool device_remove_file_self(struct device *dev,
                             const struct device_attribute *attr);
int __must_check device_create_bin_file(struct device *dev,
                                        const struct bin_attribute *attr);
void device_remove_bin_file(struct device *dev,
                            const struct bin_attribute *attr);

/**
 * devm_alloc_percpu - Resource-managed alloc_percpu
 * @dev: Device to allocate per-cpu memory for
 * @type: Type to allocate per-cpu memory for
 *
 * Managed alloc_percpu. Per-cpu memory allocated with this function is
 * automatically freed on driver detach.
 *
 * Wrapper around __devm_alloc_percpu(): the size is sizeof(@type), the
 * alignment is __alignof__(@type), and the result is cast to
 * ``typeof(type) __percpu *``.
 *
 * RETURNS:
 * Pointer to allocated memory on success, NULL on failure.
 */
#define devm_alloc_percpu(dev, type)      \
        ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \
                                                      __alignof__(type)))

/*
 * Backend of devm_alloc_percpu(), taking an explicit size and alignment,
 * and the matching free routine. Declarations only.
 */
void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
                                   size_t align);
void devm_free_percpu(struct device *dev, void __percpu *pdata);

/*
 * DMA constraints of a device. struct device refers to one of these through
 * its dma_parms pointer.
 */
struct device_dma_parameters {
        /*
         * A low level driver may set these to teach IOMMU code about
         * sg limitations.
         */
        unsigned int max_segment_size;
        unsigned int min_align_mask;
        unsigned long segment_boundary_mask;
};

/**
 * enum device_link_state - Device link states.
 * @DL_STATE_NONE: The presence of the drivers is not being tracked.
 * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present.
 * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not.
 * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present).
 * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present.
 * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding.
 *
 * DL_STATE_NONE is the only negative value (-1); the remaining states are
 * numbered consecutively starting at 0.
 */
enum device_link_state {
        DL_STATE_NONE = -1,
        DL_STATE_DORMANT = 0,
        DL_STATE_AVAILABLE,
        DL_STATE_CONSUMER_PROBE,
        DL_STATE_ACTIVE,
        DL_STATE_SUPPLIER_UNBIND,
};

/*
 * Device link flags. Each flag is a distinct single bit, BIT(0)..BIT(9).
 *
 * STATELESS: The core will not remove this link automatically.
 * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
 * PM_RUNTIME: If set, the runtime PM framework will use this link.
 * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
 * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind.
 * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds.
 * MANAGED: The core tracks presence of supplier/consumer drivers (internal).
 * SYNC_STATE_ONLY: Link only affects sync_state() behavior.
 * INFERRED: Inferred from data (eg: firmware) and not from driver actions.
 * CYCLE: NOTE(review): not described here originally; presumably marks a link
 *        that is part of a dependency cycle - confirm in drivers/base/core.c.
 */
#define DL_FLAG_STATELESS                BIT(0)
#define DL_FLAG_AUTOREMOVE_CONSUMER        BIT(1)
#define DL_FLAG_PM_RUNTIME                BIT(2)
#define DL_FLAG_RPM_ACTIVE                BIT(3)
#define DL_FLAG_AUTOREMOVE_SUPPLIER        BIT(4)
#define DL_FLAG_AUTOPROBE_CONSUMER        BIT(5)
#define DL_FLAG_MANAGED                        BIT(6)
#define DL_FLAG_SYNC_STATE_ONLY                BIT(7)
#define DL_FLAG_INFERRED                BIT(8)
#define DL_FLAG_CYCLE                        BIT(9)

/**
 * enum dl_dev_state - Device driver presence tracking information.
 * @DL_DEV_NO_DRIVER: There is no driver attached to the device.
 * @DL_DEV_PROBING: A driver is probing.
 * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device.
 * @DL_DEV_UNBINDING: The driver is unbinding from the device.
 *
 * DL_DEV_NO_DRIVER is 0, so a zero-initialized status means "no driver".
 */
enum dl_dev_state {
        DL_DEV_NO_DRIVER = 0,
        DL_DEV_PROBING,
        DL_DEV_DRIVER_BOUND,
        DL_DEV_UNBINDING,
};

/**
 * enum device_removable - Whether the device is removable. The criteria for a
 * device to be classified as removable is determined by its subsystem or bus.
 * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this
 *                                    device (default).
 * @DEVICE_REMOVABLE_UNKNOWN:  Device location is unknown.
 * @DEVICE_FIXED: Device is not removable by the user.
 * @DEVICE_REMOVABLE: Device is removable by the user.
 */
enum device_removable {
        DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0: the zero-init default */
        DEVICE_REMOVABLE_UNKNOWN,
        DEVICE_FIXED,
        DEVICE_REMOVABLE,
};

/**
 * struct dev_links_info - Device data related to device links.
 * @suppliers: List of links to supplier devices.
 * @consumers: List of links to consumer devices.
 * @defer_sync: Hook to global list of devices that have deferred sync_state.
 * @status: Driver status information (one of enum dl_dev_state).
 *
 * Embedded in struct device as its ``links`` member.
 */
struct dev_links_info {
        struct list_head suppliers;
        struct list_head consumers;
        struct list_head defer_sync;
        enum dl_dev_state status;
};

/**
 * struct dev_msi_info - Device data related to MSI
 * @domain:        The MSI interrupt domain associated to the device
 * @data:        Pointer to MSI device data
 *
 * Both members exist only when CONFIG_GENERIC_MSI_IRQ is enabled; without
 * it the structure has no members.
 */
struct dev_msi_info {
#ifdef CONFIG_GENERIC_MSI_IRQ
        struct irq_domain        *domain;
        struct msi_device_data        *data;
#endif
};

/**
 * enum device_physical_location_panel - Describes which panel surface of the
 * system's housing the device connection point resides on.
 * @DEVICE_PANEL_TOP: Device connection point is on the top panel.
 * @DEVICE_PANEL_BOTTOM: Device connection point is on the bottom panel.
 * @DEVICE_PANEL_LEFT: Device connection point is on the left panel.
 * @DEVICE_PANEL_RIGHT: Device connection point is on the right panel.
 * @DEVICE_PANEL_FRONT: Device connection point is on the front panel.
 * @DEVICE_PANEL_BACK: Device connection point is on the back panel.
 * @DEVICE_PANEL_UNKNOWN: The panel with device connection point is unknown.
 *
 * No explicit values are assigned, so the enumerators count up from 0 in the
 * order listed.
 */
enum device_physical_location_panel {
        DEVICE_PANEL_TOP,
        DEVICE_PANEL_BOTTOM,
        DEVICE_PANEL_LEFT,
        DEVICE_PANEL_RIGHT,
        DEVICE_PANEL_FRONT,
        DEVICE_PANEL_BACK,
        DEVICE_PANEL_UNKNOWN,
};

/**
 * enum device_physical_location_vertical_position - Describes vertical
 * position of the device connection point on the panel surface.
 * @DEVICE_VERT_POS_UPPER: Device connection point is at upper part of panel.
 * @DEVICE_VERT_POS_CENTER: Device connection point is at center part of panel.
 * @DEVICE_VERT_POS_LOWER: Device connection point is at lower part of panel.
 *
 * Enumerators count up from 0 in the order listed.
 */
enum device_physical_location_vertical_position {
        DEVICE_VERT_POS_UPPER,
        DEVICE_VERT_POS_CENTER,
        DEVICE_VERT_POS_LOWER,
};

/**
 * enum device_physical_location_horizontal_position - Describes horizontal
 * position of the device connection point on the panel surface.
 * @DEVICE_HORI_POS_LEFT: Device connection point is at left part of panel.
 * @DEVICE_HORI_POS_CENTER: Device connection point is at center part of panel.
 * @DEVICE_HORI_POS_RIGHT: Device connection point is at right part of panel.
 *
 * Enumerators count up from 0 in the order listed.
 */
enum device_physical_location_horizontal_position {
        DEVICE_HORI_POS_LEFT,
        DEVICE_HORI_POS_CENTER,
        DEVICE_HORI_POS_RIGHT,
};

/**
 * struct device_physical_location - Device data related to physical location
 * of the device connection point.
 * @panel: Panel surface of the system's housing that the device connection
 *         point resides on.
 * @vertical_position: Vertical position of the device connection point within
 *                     the panel.
 * @horizontal_position: Horizontal position of the device connection point
 *                       within the panel.
 * @dock: Set if the device connection point resides in a docking station or
 *        port replicator.
 * @lid: Set if this device connection point resides on the lid of a laptop
 *       system.
 *
 * struct device refers to one of these through its ``physical_location``
 * pointer.
 */
struct device_physical_location {
        enum device_physical_location_panel panel;
        enum device_physical_location_vertical_position vertical_position;
        enum device_physical_location_horizontal_position horizontal_position;
        bool dock;
        bool lid;
};

/**
 * struct device - The basic device structure
 * @parent:        The device's "parent" device, the device to which it is attached.
 *                 In most cases, a parent device is some sort of bus or host
 *                 controller. If parent is NULL, the device is a top-level device,
 *                 which is not usually what you want.
 * @p:                Holds the private data of the driver core portions of the device.
 *                 See the comment of the struct device_private for detail.
 * @kobj:        A top-level, abstract class from which other classes are derived.
 * @init_name:        Initial name of the device.
 * @type:        The type of device.
 *                 This identifies the device type and carries type-specific
 *                 information.
 * @mutex:        Mutex to synchronize calls to its driver.
 * @bus:        Type of bus device is on.
 * @driver:        Which driver has allocated this device.
 * @platform_data: Platform data specific to the device.
 *                 Example: For devices on custom boards, as typical of embedded
 *                 and SOC based hardware, Linux often uses platform_data to point
 *                 to board-specific structures describing devices and how they
 *                 are wired.  That can include what ports are available, chip
 *                 variants, which GPIO pins act in what additional roles, and so
 *                 on.  This shrinks the "Board Support Packages" (BSPs) and
 *                 minimizes board-specific #ifdefs in drivers.
 * @driver_data: Private pointer for driver specific info.
 * @links:        Links to suppliers and consumers of this device.
 * @power:        For device power management.
 *                See Documentation/driver-api/pm/devices.rst for details.
 * @pm_domain:        Provide callbacks that are executed during system suspend,
 *                 hibernation, system resume and during runtime PM transitions
 *                 along with subsystem-level and driver-level callbacks.
 * @em_pd:        device's energy model performance domain
 * @pins:        For device pin management.
 *                See Documentation/driver-api/pin-control.rst for details.
 * @msi:        MSI related data
 * @numa_node:        NUMA node this device is close to.
 * @dma_ops:    DMA mapping operations for this device.
 * @dma_mask:        Dma mask (if dma'ble device).
 * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mappings, as not
 *                 all hardware supports 64-bit addresses for consistent
 *                 allocations such as descriptors.
 * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller
 *                DMA limit than the device itself supports.
 * @dma_range_map: map for DMA memory ranges relative to that of RAM
 * @dma_parms:        A low level driver may set these to teach IOMMU code about
 *                 segment limitations.
 * @dma_pools:        Dma pools (if dma'ble device).
 * @dma_mem:        Internal for coherent mem override.
 * @cma_area:        Contiguous memory area for dma allocations
 * @dma_io_tlb_mem: Software IO TLB allocator.  Not for driver use.
 * @dma_io_tlb_pools:        List of transient swiotlb memory pools.
 * @dma_io_tlb_lock:        Protects changes to the list of active pools.
 * @dma_uses_io_tlb: %true if device has used the software IO TLB.
 * @archdata:        For arch-specific additions.
 * @of_node:        Associated device tree node.
 * @fwnode:        Associated device node supplied by platform firmware.
 * @devt:        For creating the sysfs "dev".
 * @id:                device instance
 * @devres_lock: Spinlock to protect the resource of the device.
 * @devres_head: The resources list of the device.
 * @class:        The class of the device.
 * @groups:        Optional attribute groups.
 * @release:        Callback to free the device after all references have
 *                 gone away. This should be set by the allocator of the
 *                 device (i.e. the bus driver that discovered the device).
 * @iommu_group: IOMMU group the device belongs to.
 * @iommu:        Per device generic IOMMU runtime data
 * @physical_location: Describes physical location of the device connection
 *                point in the system housing.
 * @removable:  Whether the device can be removed from the system. This
 *              should be set by the subsystem / bus driver that discovered
 *              the device.
 *
 * @offline_disabled: If set, the device is permanently online.
 * @offline:        Set after successful invocation of bus type's .offline().
 * @of_node_reused: Set if the device-tree node is shared with an ancestor
 *              device.
 * @state_synced: The hardware state of this device has been synced to match
 *                  the software state of this device by calling the driver/bus
 *                  sync_state() callback.
 * @can_match:        The device has matched with a driver at least once or it is in
 *                a bus (like AMBA) which can't check for matching drivers until
 *                other devices probe successfully.
 * @dma_coherent: this particular device is dma coherent, even if the
 *                architecture supports non-coherent devices.
 * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
 *                streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
 *                and optionally (if the coherent mask is large enough) also
 *                for dma allocations.  This flag is managed by the dma ops
 *                instance from ->dma_supported.
 * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers.
 * @dma_iommu: Device is using default IOMMU implementation for DMA and
 *                doesn't rely on dma_ops structure.
 *
 * At the lowest level, every device in a Linux system is represented by an
 * instance of struct device. The device structure contains the information
 * that the device model core needs to model the system. Most subsystems,
 * however, track additional information about the devices they host. As a
 * result, it is rare for devices to be represented by bare device structures;
 * instead, that structure, like kobject structures, is usually embedded within
 * a higher-level representation of the device.
 */
struct device {
        struct kobject kobj;
        struct device                *parent;

        struct device_private        *p;

        const char                *init_name; /* initial name of the device */
        const struct device_type *type;

        const struct bus_type        *bus;        /* type of bus device is on */
        struct device_driver *driver;        /* which driver has allocated this
                                           device */
        void                *platform_data;        /* Platform specific data, device
                                           core doesn't touch it */
        void                *driver_data;        /* Driver data, set and get with
                                           dev_set_drvdata/dev_get_drvdata */
        struct mutex                mutex;        /* mutex to synchronize calls to
                                         * its driver.
                                         */

        struct dev_links_info        links;        /* supplier/consumer links */
        struct dev_pm_info        power;
        struct dev_pm_domain        *pm_domain;

#ifdef CONFIG_ENERGY_MODEL
        struct em_perf_domain        *em_pd;
#endif

#ifdef CONFIG_PINCTRL
        struct dev_pin_info        *pins;
#endif
        struct dev_msi_info        msi;
#ifdef CONFIG_ARCH_HAS_DMA_OPS
        const struct dma_map_ops *dma_ops;
#endif
        u64                *dma_mask;        /* dma mask (if dma'able device) */
        u64                coherent_dma_mask;/* Like dma_mask, but for
                                             alloc_coherent mappings as
                                             not all hardware supports
                                             64 bit addresses for consistent
                                             allocations such as descriptors. */
        u64                bus_dma_limit;        /* upstream dma constraint */
        const struct bus_dma_region *dma_range_map;

        struct device_dma_parameters *dma_parms;

        struct list_head        dma_pools;        /* dma pools (if dma'ble) */

#ifdef CONFIG_DMA_DECLARE_COHERENT
        struct dma_coherent_mem        *dma_mem; /* internal for coherent mem
                                             override */
#endif
#ifdef CONFIG_DMA_CMA
        struct cma *cma_area;                /* contiguous memory area for dma
                                           allocations */
#endif
#ifdef CONFIG_SWIOTLB
        struct io_tlb_mem *dma_io_tlb_mem;
#endif
#ifdef CONFIG_SWIOTLB_DYNAMIC
        struct list_head dma_io_tlb_pools;
        spinlock_t dma_io_tlb_lock;
        bool dma_uses_io_tlb;
#endif
        /* arch specific additions */
        struct dev_archdata        archdata;

        struct device_node        *of_node; /* associated device tree node */
        struct fwnode_handle        *fwnode; /* firmware device node */

#ifdef CONFIG_NUMA
        int                numa_node;        /* NUMA node this device is close to */
#endif
        dev_t                        devt;        /* dev_t, creates the sysfs "dev" */
        u32                        id;        /* device instance */

        spinlock_t                devres_lock;
        struct list_head        devres_head;

        const struct class        *class;
        const struct attribute_group **groups;        /* optional groups */

        void        (*release)(struct device *dev);
        struct iommu_group        *iommu_group;
        struct dev_iommu        *iommu;

        struct device_physical_location *physical_location;

        enum device_removable        removable;

        /*
         * Single-bit flags. The DMA-related ones are only present when the
         * corresponding CONFIG_ options are enabled.
         */
        bool                        offline_disabled:1;
        bool                        offline:1;
        bool                        of_node_reused:1;
        bool                        state_synced:1;
        bool                        can_match:1;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
        bool                        dma_coherent:1;
#endif
#ifdef CONFIG_DMA_OPS_BYPASS
        bool                        dma_ops_bypass : 1;
#endif
#ifdef CONFIG_DMA_NEED_SYNC
        bool                        dma_skip_sync:1;
#endif
#ifdef CONFIG_IOMMU_DMA
        bool                        dma_iommu:1;
#endif
};

/**
 * struct device_link - Device link representation.
 * @supplier: The device on the supplier end of the link.
 * @s_node: Hook to the supplier device's list of links to consumers.
 * @consumer: The device on the consumer end of the link.
 * @c_node: Hook to the consumer device's list of links to suppliers.
 * @link_dev: Device used to expose link details in sysfs. It is embedded in
 *            the link, not pointed to.
 * @status: The state of the link (with respect to the presence of drivers).
 * @flags: Link flags (DL_FLAG_* bits).
 * @rpm_active: Whether or not the consumer device is runtime-PM-active.
 * @kref: Count repeated addition of the same link.
 * @rm_work: Work structure used for removing the link.
 * @supplier_preactivated: Supplier has been made active before consumer probe.
 */
struct device_link {
        struct device *supplier;
        struct list_head s_node;
        struct device *consumer;
        struct list_head c_node;
        struct device link_dev;
        enum device_link_state status;
        u32 flags;
        refcount_t rpm_active;
        struct kref kref;
        struct work_struct rm_work;
        bool supplier_preactivated; /* Owned by consumer probe. */
};

/*
 * Map a pointer to the kobj member embedded in a struct device back to the
 * containing struct device, using container_of_const().
 */
#define kobj_to_dev(__kobj)        container_of_const(__kobj, struct device, kobj)

/**
 * device_iommu_mapped - Check whether the device DMA is translated by an IOMMU
 * @dev: Device to perform the check on
 *
 * Return: true when @dev has a non-NULL iommu_group, false otherwise.
 */
static inline bool device_iommu_mapped(struct device *dev)
{
        if (!dev->iommu_group)
                return false;

        return true;
}

/* Get the wakeup routines, which depend on struct device */
#include <linux/pm_wakeup.h>

/**
 * dev_name - Return a device's name.
 * @dev: Device with name to get.
 *
 * Return: @dev->init_name while it is set; otherwise the name of the
 * device's embedded kobject.
 */
static inline const char *dev_name(const struct device *dev)
{
        const char *early_name = dev->init_name;

        /* The init name takes precedence for as long as it is non-NULL. */
        return early_name ? early_name : kobject_name(&dev->kobj);
}

/**
 * dev_bus_name - Return a device's bus/class name, if at all possible
 * @dev: struct device to get the bus/class name of
 *
 * The bus is consulted first, then the class.
 *
 * Return: the name of the bus the device is attached to, else the name of
 * its class, else an empty string when it has neither.
 */
static inline const char *dev_bus_name(const struct device *dev)
{
        if (dev->bus)
                return dev->bus->name;

        if (dev->class)
                return dev->class->name;

        return "";
}

/*
 * Set a device's name from a printf-style format: @name is the format
 * string (argument 2) and the variadic arguments start at position 3, as
 * declared by __printf(2, 3). Returns an int status.
 */
__printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...);

#ifndef CONFIG_NUMA
/* Without CONFIG_NUMA no node is stored: every device reports NUMA_NO_NODE. */
static inline int dev_to_node(struct device *dev)
{
        return NUMA_NO_NODE;
}

/* Without CONFIG_NUMA there is nowhere to record @node; this does nothing. */
static inline void set_dev_node(struct device *dev, int node)
{
}
#else
/* Return the NUMA node stored in @dev->numa_node. */
static inline int dev_to_node(struct device *dev)
{
        int nid = dev->numa_node;

        return nid;
}

/* Store @node in @dev->numa_node. */
static inline void set_dev_node(struct device *dev, int node)
{
        dev->numa_node = node;
}
#endif /* CONFIG_NUMA */

/*
 * Return the MSI interrupt domain stored in @dev->msi.domain, or NULL when
 * the kernel is built without CONFIG_GENERIC_MSI_IRQ.
 */
static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
{
#ifndef CONFIG_GENERIC_MSI_IRQ
        return NULL;
#else
        return dev->msi.domain;
#endif
}

/*
 * Store @d as the MSI interrupt domain of @dev. Compiles to nothing when
 * CONFIG_GENERIC_MSI_IRQ is disabled.
 */
static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d)
{
#if defined(CONFIG_GENERIC_MSI_IRQ)
        dev->msi.domain = d;
#endif
}

/* Return the driver-private pointer stored in @dev->driver_data. */
static inline void *dev_get_drvdata(const struct device *dev)
{
        void *data = dev->driver_data;

        return data;
}

/*
 * Store @data as the driver-private pointer of @dev; dev_get_drvdata()
 * reads it back.
 */
static inline void dev_set_drvdata(struct device *dev, void *data)
{
        dev->driver_data = data;
}

/*
 * Return @dev->power.subsys_data. A NULL @dev is tolerated and yields NULL.
 */
static inline struct pm_subsys_data *dev_to_psd(struct device *dev)
{
        if (!dev)
                return NULL;

        return dev->power.subsys_data;
}

/* Return the uevent_suppress setting of the device's embedded kobject. */
static inline unsigned int dev_get_uevent_suppress(const struct device *dev)
{
        unsigned int suppressed = dev->kobj.uevent_suppress;

        return suppressed;
}

/* Store @val in the uevent_suppress field of the device's embedded kobject. */
static inline void dev_set_uevent_suppress(struct device *dev, int val)
{
        dev->kobj.uevent_suppress = val;
}

/*
 * Report whether the device is registered, as indicated by the
 * state_in_sysfs field of its embedded kobject.
 */
static inline int device_is_registered(struct device *dev)
{
        int in_sysfs = dev->kobj.state_in_sysfs;

        return in_sysfs;
}

/*
 * Set power.async_suspend for @dev. The request is ignored while
 * power.is_prepared is set.
 */
static inline void device_enable_async_suspend(struct device *dev)
{
        if (dev->power.is_prepared)
                return;

        dev->power.async_suspend = true;
}

/*
 * Clear power.async_suspend for @dev. The request is ignored while
 * power.is_prepared is set.
 */
static inline void device_disable_async_suspend(struct device *dev)
{
        if (dev->power.is_prepared)
                return;

        dev->power.async_suspend = false;
}

/* Return true when power.async_suspend is set for @dev, false otherwise. */
static inline bool device_async_suspend_enabled(struct device *dev)
{
        if (dev->power.async_suspend)
                return true;

        return false;
}

/* Return the power.no_pm flag of @dev. */
static inline bool device_pm_not_required(struct device *dev)
{
        bool no_pm = dev->power.no_pm;

        return no_pm;
}

/*
 * Mark @dev as not needing power management: sets power.no_pm and, when
 * CONFIG_PM is enabled, power.no_callbacks as well.
 */
static inline void device_set_pm_not_required(struct device *dev)
{
        dev->power.no_pm = true;
#if defined(CONFIG_PM)
        dev->power.no_callbacks = true;
#endif
}

/*
 * Store @val in power.syscore of @dev. Compiles to nothing when
 * CONFIG_PM_SLEEP is disabled.
 */
static inline void dev_pm_syscore_device(struct device *dev, bool val)
{
#if defined(CONFIG_PM_SLEEP)
        dev->power.syscore = val;
#endif
}

/*
 * Replace power.driver_flags of @dev with @flags. The previous value is
 * overwritten, not OR-ed in.
 */
static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags)
{
        dev->power.driver_flags = flags;
}

/*
 * Return true when at least one of the bits in @flags is set in
 * power.driver_flags of @dev.
 */
static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags)
{
        return (dev->power.driver_flags & flags) != 0;
}

/*
 * Return power.smart_suspend of @dev, or false when the kernel is built
 * without CONFIG_PM_SLEEP.
 */
static inline bool dev_pm_smart_suspend(struct device *dev)
{
#ifndef CONFIG_PM_SLEEP
        return false;
#else
        return dev->power.smart_suspend;
#endif
}

/**
 * dev_pm_set_strict_midlayer - Update the device's power.strict_midlayer flag
 * @dev: Target device.
 * @val: New flag value.
 *
 * A set power.strict_midlayer flag tells the PM core that the middle layer
 * power management code (typically, a bus type or a PM domain) does not
 * expect its runtime PM suspend callback to be invoked at all during
 * system-wide PM transitions, and does not expect its runtime PM resume
 * callback to be invoked at any point when runtime PM is disabled for the
 * device during system-wide PM transitions.
 *
 * Without CONFIG_PM_SLEEP this function does nothing.
 */
static inline void dev_pm_set_strict_midlayer(struct device *dev, bool val)
{
#if defined(CONFIG_PM_SLEEP)
        dev->power.strict_midlayer = val;
#endif
}

/*
 * Return @dev's power.strict_midlayer flag.  Builds without CONFIG_PM_SLEEP
 * never read the field and always report false.
 */
static inline bool dev_pm_strict_midlayer_is_set(struct device *dev)
{
        bool strict = false;

#ifdef CONFIG_PM_SLEEP
        strict = dev->power.strict_midlayer;
#endif
        return strict;
}

/* Acquire @dev's mutex with mutex_lock(). */
static inline void device_lock(struct device *dev)
{
        mutex_lock(&dev->mutex);
}

/*
 * Acquire @dev's mutex with mutex_lock_interruptible() and hand its return
 * value back to the caller unchanged.
 */
static inline int device_lock_interruptible(struct device *dev)
{
        int ret = mutex_lock_interruptible(&dev->mutex);

        return ret;
}

/*
 * Try to take @dev's mutex with mutex_trylock() and hand its return value
 * back to the caller unchanged.
 */
static inline int device_trylock(struct device *dev)
{
        int acquired = mutex_trylock(&dev->mutex);

        return acquired;
}

/* Release @dev's mutex with mutex_unlock(). */
static inline void device_unlock(struct device *dev)
{
        mutex_unlock(&dev->mutex);
}

/*
 * Declare a guard named "device" for struct device pointers: its lock
 * expression is device_lock(_T) and its unlock expression is
 * device_unlock(_T).
 */
DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T))

/* Run lockdep_assert_held() on @dev's mutex. */
static inline void device_lock_assert(struct device *dev)
{
        lockdep_assert_held(&dev->mutex);
}

/*
 * Return true when a sync_state callback is available for @dev, either on its
 * driver or on its bus.  A NULL @dev yields false.
 */
static inline bool dev_has_sync_state(struct device *dev)
{
        if (!dev)
                return false;

        return (dev->driver && dev->driver->sync_state) ||
               (dev->bus && dev->bus->sync_state);
}

/*
 * Install @fn as the sync_state callback of @dev's driver if none is set yet.
 *
 * Returns 0 when @dev or its driver is NULL, when the callback was installed,
 * or when the driver already uses @fn.  Returns -EBUSY when the driver already
 * has a different sync_state callback.
 */
static inline int dev_set_drv_sync_state(struct device *dev,
                                         void (*fn)(struct device *dev))
{
        if (!dev || !dev->driver)
                return 0;

        if (!dev->driver->sync_state) {
                dev->driver->sync_state = fn;
                return 0;
        }

        return dev->driver->sync_state == fn ? 0 : -EBUSY;
}

/* Store @removable in @dev's removable field. */
static inline void dev_set_removable(struct device *dev,
                                     enum device_removable removable)
{
        dev->removable = removable;
}

/* Return true only when @dev's removable field equals DEVICE_REMOVABLE. */
static inline bool dev_is_removable(struct device *dev)
{
        if (dev->removable != DEVICE_REMOVABLE)
                return false;

        return true;
}

/*
 * Return true unless @dev's removable field is
 * DEVICE_REMOVABLE_NOT_SUPPORTED.
 */
static inline bool dev_removable_is_valid(struct device *dev)
{
        if (dev->removable == DEVICE_REMOVABLE_NOT_SUPPORTED)
                return false;

        return true;
}

/*
 * High level routines for use by the bus drivers
 *
 * device_register() and device_add() are marked __must_check, so callers
 * are expected to look at their return value.
 */
int __must_check device_register(struct device *dev);
void device_unregister(struct device *dev);
void device_initialize(struct device *dev);
int __must_check device_add(struct device *dev);
void device_del(struct device *dev);

/* Cleanup helper "device_del": calls device_del() on a non-NULL pointer. */
DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T))

/*
 * Child iteration and lookup.  The iterators take an opaque @data pointer
 * and a device_iter_t callback; device_find_child() takes a device_match_t
 * callback instead.
 */
int device_for_each_child(struct device *parent, void *data,
                          device_iter_t fn);
int device_for_each_child_reverse(struct device *parent, void *data,
                                  device_iter_t fn);
int device_for_each_child_reverse_from(struct device *parent,
                                       struct device *from, void *data,
                                       device_iter_t fn);
struct device *device_find_child(struct device *parent, const void *data,
                                 device_match_t match);
/**
 * device_find_child_by_name - look up a child of @parent by its name.
 * @parent: parent struct device
 * @name: name of the child device
 *
 * Thin wrapper around device_find_child() that passes @name as the match
 * data and device_match_name as the match callback.
 *
 * NOTE: the returned device carries a reference; drop it with put_device()
 * once you are done with it.
 */
static inline struct device *device_find_child_by_name(struct device *parent,
                                                       const char *name)
{
        struct device *child;

        child = device_find_child(parent, name, device_match_name);
        return child;
}

/**
 * device_find_any_child - fetch some child of @parent, if it has one.
 * @parent: parent struct device
 *
 * Thin wrapper around device_find_child() that passes NULL match data and
 * device_match_any as the match callback.
 *
 * NOTE: the returned device carries a reference; drop it with put_device()
 * once you are done with it.
 */
static inline struct device *device_find_any_child(struct device *parent)
{
        struct device *child;

        child = device_find_child(parent, NULL, device_match_any);
        return child;
}

/* Renaming, re-parenting and ownership changes for an existing device. */
int device_rename(struct device *dev, const char *new_name);
int device_move(struct device *dev, struct device *new_parent,
                enum dpm_order dpm_order);
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);

/*
 * Return true when @dev has a bus and that bus provides both the offline and
 * the online callback.
 */
static inline bool device_supports_offline(struct device *dev)
{
        if (!dev->bus)
                return false;

        return dev->bus->offline && dev->bus->online;
}

/*
 * Helper behind device_lock_set_class(): evaluates @dev once into a local
 * and calls lock_set_class() on the dep_map of that device's mutex with
 * @name, @key, a subclass of 0 and _THIS_IP_.
 */
#define __device_lock_set_class(dev, name, key)                        \
do {                                                                   \
        struct device *__d2 __maybe_unused = dev;                      \
        lock_set_class(&__d2->mutex.dep_map, name, key, 0, _THIS_IP_); \
} while (0)

/**
 * device_lock_set_class - Specify a temporary lock class while a device
 *                           is attached to a driver
 * @dev: device to modify
 * @key: lock class key data
 *
 * This must be called with the device_lock() already held, for example
 * from driver ->probe(). Take care to only override the default
 * lockdep_no_validate class.
 *
 * The class name handed to __device_lock_set_class() is the stringified
 * @key expression. With CONFIG_LOCKDEP, a one-time warning is emitted via
 * dev_WARN_ONCE() when the mutex does not currently match
 * __lockdep_no_validate__; the class is set either way. Without
 * CONFIG_LOCKDEP the macro forwards directly to __device_lock_set_class().
 */
#ifdef CONFIG_LOCKDEP
#define device_lock_set_class(dev, key)                                    \
do {                                                                       \
        struct device *__d = dev;                                          \
        dev_WARN_ONCE(__d, !lockdep_match_class(&__d->mutex,               \
                                                &__lockdep_no_validate__), \
                 "overriding existing custom lock class\n");               \
        __device_lock_set_class(__d, #key, key);                           \
} while (0)
#else
#define device_lock_set_class(dev, key) __device_lock_set_class(dev, #key, key)
#endif

/**
 * device_lock_reset_class - Return a device to the default lockdep novalidate state
 * @dev: device to modify
 *
 * This must be called with the device_lock() already held, for example
 * from driver ->remove().
 *
 * @dev is evaluated once; the class is reset through
 * lock_set_novalidate_class() using the name "&dev->mutex".
 */
#define device_lock_reset_class(dev) \
do { \
        struct device *__d __maybe_unused = dev;                       \
        lock_set_novalidate_class(&__d->mutex.dep_map, "&dev->mutex",  \
                                  _THIS_IP_);                          \
} while (0)

/* Device hotplug locking and online/offline transitions. */
void lock_device_hotplug(void);
void unlock_device_hotplug(void);
int lock_device_hotplug_sysfs(void);
int device_offline(struct device *dev);
int device_online(struct device *dev);

/* Firmware-node and device-tree-node association helpers. */
void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void device_set_node(struct device *dev, struct fwnode_handle *fwnode);
int device_add_of_node(struct device *dev, struct device_node *of_node);
void device_remove_of_node(struct device *dev);
void device_set_of_node_from_dev(struct device *dev, const struct device *dev2);
struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode);

/*
 * Return @dev's of_node pointer.  NULL is returned instead when CONFIG_OF is
 * not enabled or when @dev itself is NULL.
 */
static inline struct device_node *dev_of_node(struct device *dev)
{
        return (IS_ENABLED(CONFIG_OF) && dev) ? dev->of_node : NULL;
}

/*
 * Return the result of the bus's num_vf() callback for @dev, or 0 when @dev
 * has no bus or the bus does not provide that callback.
 */
static inline int dev_num_vf(struct device *dev)
{
        if (!dev->bus || !dev->bus->num_vf)
                return 0;

        return dev->bus->num_vf(dev);
}

/*
 * Root device objects for grouping under /sys/devices
 */
struct device *__root_device_register(const char *name, struct module *owner);

/*
 * This is a macro to avoid include problems with THIS_MODULE.
 * It calls __root_device_register() with THIS_MODULE as the owner.
 */
#define root_device_register(name) \
        __root_device_register(name, THIS_MODULE)

void root_device_unregister(struct device *root);

/* Return the platform_data pointer stored in @dev. */
static inline void *dev_get_platdata(const struct device *dev)
{
        void *pdata = dev->platform_data;

        return pdata;
}

/*
 * Manual binding of a device to driver. See drivers/base/bus.c
 * for information on use.
 *
 * Every int-returning function in this group is __must_check.
 */
int __must_check device_driver_attach(const struct device_driver *drv,
                                      struct device *dev);
int __must_check device_bind_driver(struct device *dev);
void device_release_driver(struct device *dev);
int  __must_check device_attach(struct device *dev);
int __must_check driver_attach(const struct device_driver *drv);
void device_initial_probe(struct device *dev);
int __must_check device_reprobe(struct device *dev);

bool device_is_bound(struct device *dev);

/*
 * Easy functions for dynamically creating devices on the fly
 *
 * The creation functions take a printf-style format (@fmt plus varargs),
 * as declared by their __printf() annotations.
 */
__printf(5, 6) struct device *
device_create(const struct class *cls, struct device *parent, dev_t devt,
              void *drvdata, const char *fmt, ...);
__printf(6, 7) struct device *
device_create_with_groups(const struct class *cls, struct device *parent, dev_t devt,
                          void *drvdata, const struct attribute_group **groups,
                          const char *fmt, ...);
void device_destroy(const struct class *cls, dev_t devt);

/* Add or remove an array of attribute groups on a device. */
int __must_check device_add_groups(struct device *dev,
                                   const struct attribute_group **groups);
void device_remove_groups(struct device *dev,
                          const struct attribute_group **groups);

/*
 * Add a single attribute group to @dev.  @grp is wrapped in a two-entry,
 * NULL-terminated array and passed to device_add_groups(), whose return
 * value is handed back unchanged.
 */
static inline int __must_check device_add_group(struct device *dev,
                                        const struct attribute_group *grp)
{
        const struct attribute_group *single[2] = {
                [0] = grp,
                [1] = NULL,
        };

        return device_add_groups(dev, single);
}

/*
 * Remove a single attribute group from @dev.  @grp is wrapped in a two-entry,
 * NULL-terminated array and passed to device_remove_groups().
 */
static inline void device_remove_group(struct device *dev,
                                       const struct attribute_group *grp)
{
        const struct attribute_group *single[2] = {
                [0] = grp,
                [1] = NULL,
        };

        device_remove_groups(dev, single);
}

/* devm_ variant of device_add_group(); the result must be checked. */
int __must_check devm_device_add_group(struct device *dev,
                                       const struct attribute_group *grp);

/*
 * get_device - atomically increment the reference count for the device.
 *
 * NOTE(review): put_device() is presumably the matching release; confirm
 * against the implementation.
 */
struct device *get_device(struct device *dev);
void put_device(struct device *dev);

/* Cleanup helper "put_device": calls put_device() on a non-NULL pointer. */
DEFINE_FREE(put_device, struct device *, if (_T) put_device(_T))

bool kill_device(struct device *dev);

#ifndef CONFIG_DEVTMPFS
/* Stub for builds without CONFIG_DEVTMPFS: does nothing and returns 0. */
static inline int devtmpfs_mount(void)
{
        return 0;
}
#else
/* Real implementation is provided elsewhere when CONFIG_DEVTMPFS is set. */
int devtmpfs_mount(void);
#endif

/* drivers/base/power/shutdown.c */
void device_shutdown(void);

/* debugging and troubleshooting/diagnostic helpers. */
const char *dev_driver_string(const struct device *dev);

/*
 * Device links interface.
 *
 * Note that device_link_remove() takes its consumer as a void pointer,
 * unlike device_link_add(), which takes a struct device pointer.
 */
struct device_link *device_link_add(struct device *consumer,
                                    struct device *supplier, u32 flags);
void device_link_del(struct device_link *link);
void device_link_remove(void *consumer, struct device *supplier);
void device_links_supplier_sync_state_pause(void);
void device_links_supplier_sync_state_resume(void);
void device_link_wait_removal(void);

/* Return true if at least one bit of @flags is set in @link's flags field. */
static inline bool device_link_test(const struct device_link *link, u32 flags)
{
        return (link->flags & flags) != 0;
}

/*
 * Create a module alias so that the module can be autoloaded.
 * MODULE_ALIAS_CHARDEV() yields "char-major-<major>-<minor>";
 * MODULE_ALIAS_CHARDEV_MAJOR() yields "char-major-<major>-*".
 */
#define MODULE_ALIAS_CHARDEV(major,minor) \
        MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_CHARDEV_MAJOR(major) \
        MODULE_ALIAS("char-major-" __stringify(major) "-*")

#endif /* _DEVICE_H_ */


















































































































































































































































































































































































































































































































































































































































































































  313 





























































































































  319 




















































































































































































































  264 






  315 














































































































































































































































































































































































































































































































































































  312 





















  312 













































































































































































  302 

















































































































































































    1 

  319 
  316 















































































































































































































































































































































































































  268 













































































































































































  264 

  267 




























































  317 


























































  273 














  251 







































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the AF_INET socket handler.
 *
 * Version:        @(#)sock.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche <flla@stud.uni-sb.de>
 *
 * Fixes:
 *                Alan Cox        :        Volatiles in skbuff pointers. See
 *                                        skbuff comments. May be overdone,
 *                                        better to prove they can be removed
 *                                        than the reverse.
 *                Alan Cox        :        Added a zapped field for tcp to note
 *                                        a socket is reset and must stay shut up
 *                Alan Cox        :        New fields for options
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Eliminate low level recv/recvfrom
 *                David S. Miller        :        New socket lookup architecture.
 *              Steve Whitehouse:       Default routines for sock_ops
 *              Arnaldo C. Melo :        removed net_pinfo, tp_pinfo and made
 *                                      protinfo be just a void pointer, as the
 *                                      protocol specific parts were moved to
 *                                      respective headers and ipv4/v6, etc now
 *                                      use private slabcaches for its socks
 *              Pedro Hortas        :        New flags field for socket options
 */
#ifndef _SOCK_H
#define _SOCK_H

#include <linux/hardirq.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/timer.h>
#include <linux/cache.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>        /* struct sk_buff */
#include <linux/mm.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/static_key.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>
#include <linux/rbtree.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
#include <linux/sockptr.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/llist.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
#include <net/l3mdev.h>
#include <uapi/linux/socket.h>

/*
 * This structure really needs to be cleaned up.
 * Most of it is for TCP, and not used by any of
 * the other protocols.
 */

/* This is the per-socket lock.  The spinlock provides a synchronization
 * between user contexts and software interrupt processing, whereas the
 * mini-semaphore synchronizes multiple users amongst themselves.
 */
typedef struct {
        /* The spinlock described above. */
        spinlock_t                slock;
        /*
         * NOTE(review): owned and wq presumably implement the
         * "mini-semaphore" described above (ownership flag plus wait
         * queue for contending users) - confirm against the lock helpers.
         */
        int                        owned;
        wait_queue_head_t        wq;
        /*
         * We express the mutex-alike socket_lock semantics
         * to the lock validator by explicitly managing
         * the slock as a lock variant (in addition to
         * the slock itself):
         */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Only present when CONFIG_DEBUG_LOCK_ALLOC is enabled. */
        struct lockdep_map dep_map;
#endif
} socket_lock_t;

struct sock;
struct proto;
struct net;

/*
 * Combined-field types used by struct sock_common:
 * __portpair overlays skc_dport and skc_num (see skc_portpair),
 * __addrpair overlays skc_daddr and skc_rcv_saddr (see skc_addrpair).
 * Both are __bitwise so that sparse flags mixing them with plain integers.
 */
typedef __u32 __bitwise __portpair;
typedef __u64 __bitwise __addrpair;

/**
 *        struct sock_common - minimal network layer representation of sockets
 *        @skc_daddr: Foreign IPv4 addr
 *        @skc_rcv_saddr: Bound local IPv4 addr
 *        @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
 *        @skc_hash: hash value used with various protocol lookup tables
 *        @skc_u16hashes: two u16 hash values used by UDP lookup tables
 *        @skc_dport: placeholder for inet_dport/tw_dport
 *        @skc_num: placeholder for inet_num/tw_num
 *        @skc_portpair: __u32 union of @skc_dport & @skc_num
 *        @skc_family: network address family
 *        @skc_state: Connection state
 *        @skc_reuse: %SO_REUSEADDR setting
 *        @skc_reuseport: %SO_REUSEPORT setting
 *        @skc_ipv6only: socket is IPV6 only
 *        @skc_net_refcnt: socket is using net ref counting
 *        @skc_bound_dev_if: bound device index if != 0
 *        @skc_bind_node: bind hash linkage for various protocol lookup tables
 *        @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
 *        @skc_prot: protocol handlers inside a network family
 *        @skc_net: reference to the network namespace of this socket
 *        @skc_v6_daddr: IPV6 destination address
 *        @skc_v6_rcv_saddr: IPV6 source address
 *        @skc_cookie: socket's cookie value
 *        @skc_node: main hash linkage for various protocol lookup tables
 *        @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
 *        @skc_tx_queue_mapping: tx queue number for this connection
 *        @skc_rx_queue_mapping: rx queue number for this connection
 *        @skc_flags: place holder for sk_flags
 *                %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 *                %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 *        @skc_listener: connection request listener socket (aka rsk_listener)
 *                [union with @skc_flags]
 *        @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
 *                [union with @skc_flags]
 *        @skc_incoming_cpu: record/match cpu processing incoming packets
 *        @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
 *                [union with @skc_incoming_cpu]
 *        @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
 *                [union with @skc_incoming_cpu]
 *        @skc_refcnt: reference count
 *
 *        This is the minimal network layer representation of sockets, the header
 *        for struct sock and struct inet_timewait_sock.
 */
struct sock_common {
        /*
         * skc_daddr && skc_rcv_saddr are grouped so that both can be
         * accessed in one go through skc_addrpair
         */
        union {
                __addrpair        skc_addrpair;
                struct {
                        __be32        skc_daddr;
                        __be32        skc_rcv_saddr;
                };
        };
        /* one unsigned int hash, or two __u16 hashes sharing the same storage */
        union  {
                unsigned int        skc_hash;
                __u16                skc_u16hashes[2];
        };
        /* skc_dport && skc_num must be grouped as well */
        union {
                __portpair        skc_portpair;
                struct {
                        __be16        skc_dport;
                        __u16        skc_num;
                };
        };

        unsigned short                skc_family;
        volatile unsigned char        skc_state;
        /* bit-fields: 4 + 1 + 1 + 1 bits */
        unsigned char                skc_reuse:4;
        unsigned char                skc_reuseport:1;
        unsigned char                skc_ipv6only:1;
        unsigned char                skc_net_refcnt:1;
        int                        skc_bound_dev_if;
        /* the two hash linkages below share storage */
        union {
                struct hlist_node        skc_bind_node;
                struct hlist_node        skc_portaddr_node;
        };
        struct proto                *skc_prot;
        possible_net_t                skc_net;

#if IS_ENABLED(CONFIG_IPV6)
        struct in6_addr                skc_v6_daddr;
        struct in6_addr                skc_v6_rcv_saddr;
#endif

        atomic64_t                skc_cookie;

        /* The following fields are padding to force
         * offsetof(struct sock, sk_refcnt) == 128 on 64bit arches
         * assuming IPV6 is enabled. We use this padding differently
         * for different kinds of 'sockets'.
         */
        union {
                unsigned long        skc_flags;
                struct sock        *skc_listener; /* request_sock */
                struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
        };
        /*
         * Fields between dontcopy_begin/dontcopy_end
         * are not copied in sock_copy().
         */
        /* private: */
        /* zero-length marker: start of the not-copied region */
        int                        skc_dontcopy_begin[0];
        /* public: */
        /* main hash linkage; plain and nulls variants share storage */
        union {
                struct hlist_node        skc_node;
                struct hlist_nulls_node skc_nulls_node;
        };
        unsigned short                skc_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        unsigned short                skc_rx_queue_mapping;
#endif
        union {
                int                skc_incoming_cpu;
                u32                skc_rcv_wnd;
                u32                skc_tw_rcv_nxt; /* struct tcp_timewait_sock  */
        };

        refcount_t                skc_refcnt;
        /* private: */
        /* zero-length marker: end of the not-copied region */
        int                     skc_dontcopy_end[0];
        /* three u32 views of the same storage; which one applies depends on the socket kind */
        union {
                u32                skc_rxhash;
                u32                skc_window_clamp;
                u32                skc_tw_snd_nxt; /* struct tcp_timewait_sock */
        };
        /* public: */
};

struct bpf_local_storage;
struct sk_filter;

/**
  *        struct sock - network layer representation of sockets
  *        @__sk_common: shared layout with inet_timewait_sock
  *        @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *        @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *        @sk_lock:        synchronizer
  *        @sk_kern_sock: True if sock is using kernel lock classes
  *        @sk_rcvbuf: size of receive buffer in bytes
  *        @sk_wq: sock wait queue and async head
  *        @sk_rx_dst: receive input route used by early demux
  *        @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
  *        @sk_rx_dst_cookie: cookie for @sk_rx_dst
  *        @sk_dst_cache: destination cache
  *        @sk_dst_pending_confirm: need to confirm neighbour
  *        @sk_policy: flow policy
  *        @psp_assoc: PSP association, if socket is PSP-secured
  *        @sk_receive_queue: incoming packets
  *        @sk_wmem_alloc: transmit queue bytes committed
  *        @sk_tsq_flags: TCP Small Queues flags
  *        @sk_write_queue: Packet sending queue
  *        @sk_omem_alloc: "o" is "option" or "other"
  *        @sk_wmem_queued: persistent queue size
  *        @sk_forward_alloc: space allocated forward
  *        @sk_reserved_mem: space reserved and non-reclaimable for the socket
  *        @sk_napi_id: id of the last napi context to receive data for sk
  *        @sk_ll_usec: usecs to busypoll when there is no data
  *        @sk_allocation: allocation mode
  *        @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *        @sk_pacing_status: Pacing status (requested, handled by sch_fq)
  *        @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
  *        @sk_sndbuf: size of send buffer in bytes
  *        @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
  *        @sk_no_check_rx: allow zero checksum in RX packets
  *        @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *        @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden.
  *        @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *        @sk_gso_max_size: Maximum GSO segment size to build
  *        @sk_gso_max_segs: Maximum number of GSO segments
  *        @sk_pacing_shift: scaling factor for TCP Small Queues
  *        @sk_lingertime: %SO_LINGER l_linger setting
  *        @sk_backlog: always used with the per-socket spinlock held
  *        @sk_callback_lock: used with the callbacks in the end of this struct
  *        @sk_error_queue: rarely used
  *        @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *                          IPV6_ADDRFORM for instance)
  *        @sk_err: last error
  *        @sk_err_soft: errors that don't cause failure but are the cause of a
  *                      persistent failure not just 'timed out'
  *        @sk_drops: raw/udp drops counter
  *        @sk_drop_counters: optional pointer to numa_drop_counters
  *        @sk_ack_backlog: current listen backlog
  *        @sk_max_ack_backlog: listen backlog set in listen()
  *        @sk_uid: user id of owner
  *        @sk_ino: inode number (zero if orphaned)
  *        @sk_prefer_busy_poll: prefer busypolling over softirq processing
  *        @sk_busy_poll_budget: napi processing budget when busypolling
  *        @sk_priority: %SO_PRIORITY setting
  *        @sk_type: socket type (%SOCK_STREAM, etc)
  *        @sk_protocol: which protocol this socket belongs in this network family
  *        @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred
  *        @sk_peer_pid: &struct pid for this socket's peer
  *        @sk_peer_cred: %SO_PEERCRED setting
  *        @sk_rcvlowat: %SO_RCVLOWAT setting
  *        @sk_rcvtimeo: %SO_RCVTIMEO setting
  *        @sk_sndtimeo: %SO_SNDTIMEO setting
  *        @sk_txhash: computed flow hash for use on transmit
  *        @sk_txrehash: enable TX hash rethink
  *        @sk_filter: socket filtering instructions
  *        @sk_timer: sock cleanup timer
  *        @sk_stamp: time stamp of last packet received
  *        @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
  *        @sk_tsflags: SO_TIMESTAMPING flags
  *        @sk_bpf_cb_flags: used in bpf_setsockopt()
  *        @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
  *                           Sockets that can be used under memory reclaim should
  *                           set this to false.
  *        @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
  *                      for timestamping
  *        @sk_tskey: counter to disambiguate concurrent tstamp requests
  *        @sk_zckey: counter to order MSG_ZEROCOPY notifications
  *        @sk_socket: Identd and reporting IO signals
  *        @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
  *        @sk_frag: cached page frag
  *        @sk_peek_off: current peek_offset value
  *        @sk_send_head: front of stuff to transmit
  *        @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
  *        @sk_security: used by security modules
  *        @sk_mark: generic packet mark
  *        @sk_cgrp_data: cgroup data for this cgroup
  *        @sk_memcg: this socket's memory cgroup association
  *        @sk_write_pending: a write to stream socket waits to start
  *        @sk_disconnects: number of disconnect operations performed on this sock
  *        @sk_state_change: callback to indicate change in the state of the sock
  *        @sk_data_ready: callback to indicate there is data to be processed
  *        @sk_write_space: callback to indicate there is bf sending space available
  *        @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *        @sk_backlog_rcv: callback to process the backlog
  *        @sk_validate_xmit_skb: ptr to an optional validate function
  *        @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
  *        @sk_reuseport_cb: reuseport group container
  *        @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
  *        @sk_rcu: used during RCU grace period
  *        @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
  *        @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
  *        @sk_txtime_report_errors: set report errors mode for SO_TXTIME
  *        @sk_txtime_unused: unused txtime flags
  *        @sk_scm_recv_flags: all flags used by scm_recv()
  *        @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS
  *        @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY
  *        @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD
  *        @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS
  *        @sk_scm_unused: unused flags for scm_recv()
  *        @ns_tracker: tracker for netns reference
  *        @sk_user_frags: xarray of pages the user is holding a reference on.
  *        @sk_owner: reference to the real owner of the socket that calls
  *                   sock_lock_init_class_and_name().
  */
struct sock {
        /*
         * struct inet_timewait_sock also uses sock_common, so please do
         * not add anything before this first member (__sk_common) --acme
         */
        struct sock_common        __sk_common;
        /* sk_* shorthands for the fields embedded in __sk_common */
#define sk_node                        __sk_common.skc_node
#define sk_nulls_node                __sk_common.skc_nulls_node
#define sk_refcnt                __sk_common.skc_refcnt
#define sk_tx_queue_mapping        __sk_common.skc_tx_queue_mapping
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
#define sk_rx_queue_mapping        __sk_common.skc_rx_queue_mapping
#endif

#define sk_dontcopy_begin        __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end                __sk_common.skc_dontcopy_end
#define sk_hash                        __sk_common.skc_hash
#define sk_portpair                __sk_common.skc_portpair
#define sk_num                        __sk_common.skc_num
#define sk_dport                __sk_common.skc_dport
#define sk_addrpair                __sk_common.skc_addrpair
#define sk_daddr                __sk_common.skc_daddr
#define sk_rcv_saddr                __sk_common.skc_rcv_saddr
#define sk_family                __sk_common.skc_family
#define sk_state                __sk_common.skc_state
#define sk_reuse                __sk_common.skc_reuse
#define sk_reuseport                __sk_common.skc_reuseport
#define sk_ipv6only                __sk_common.skc_ipv6only
#define sk_net_refcnt                __sk_common.skc_net_refcnt
#define sk_bound_dev_if                __sk_common.skc_bound_dev_if
#define sk_bind_node                __sk_common.skc_bind_node
#define sk_prot                        __sk_common.skc_prot
#define sk_net                        __sk_common.skc_net
#define sk_v6_daddr                __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
#define sk_cookie                __sk_common.skc_cookie
#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
#define sk_flags                __sk_common.skc_flags
#define sk_rxhash                __sk_common.skc_rxhash

        __cacheline_group_begin(sock_write_rx);

        atomic_t                sk_drops;
        __s32                        sk_peek_off;
        struct sk_buff_head        sk_error_queue;
        struct sk_buff_head        sk_receive_queue;
        /*
         * The backlog queue is special, it is always used with
         * the per-socket spinlock held and requires low latency
         * access. Therefore we special case its implementation.
         * Note : rmem_alloc is in this structure to fill a hole
         * on 64bit arches, not because it's logically part of
         * backlog.
         */
        struct {
                atomic_t        rmem_alloc;
                int                len;
                struct sk_buff        *head;
                struct sk_buff        *tail;
        } sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc

        __cacheline_group_end(sock_write_rx);

        __cacheline_group_begin(sock_read_rx);
        /* early demux fields */
        struct dst_entry __rcu        *sk_rx_dst;
        int                        sk_rx_dst_ifindex;
        u32                        sk_rx_dst_cookie;

#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int                sk_ll_usec;
        unsigned int                sk_napi_id;
        u16                        sk_busy_poll_budget;
        u8                        sk_prefer_busy_poll;
#endif
        u8                        sk_userlocks;
        int                        sk_rcvbuf;

        struct sk_filter __rcu        *sk_filter;
        /* sk_wq_raw is the same pointer as sk_wq without the __rcu annotation */
        union {
                struct socket_wq __rcu        *sk_wq;
                /* private: */
                struct socket_wq        *sk_wq_raw;
                /* public: */
        };

        void                        (*sk_data_ready)(struct sock *sk);
        long                        sk_rcvtimeo;
        int                        sk_rcvlowat;
        __cacheline_group_end(sock_read_rx);

        __cacheline_group_begin(sock_read_rxtx);
        int                        sk_err;
        struct socket                *sk_socket;
#ifdef CONFIG_MEMCG
        struct mem_cgroup        *sk_memcg;
#endif
#ifdef CONFIG_XFRM
        struct xfrm_policy __rcu *sk_policy[2];
#endif
#if IS_ENABLED(CONFIG_INET_PSP)
        struct psp_assoc __rcu        *psp_assoc;
#endif
        __cacheline_group_end(sock_read_rxtx);

        __cacheline_group_begin(sock_write_rxtx);
        socket_lock_t                sk_lock;
        u32                        sk_reserved_mem;
        int                        sk_forward_alloc;
        u32                        sk_tsflags;
        __cacheline_group_end(sock_write_rxtx);

        __cacheline_group_begin(sock_write_tx);
        int                        sk_write_pending;
        atomic_t                sk_omem_alloc;
        int                        sk_err_soft;

        int                        sk_wmem_queued;
        refcount_t                sk_wmem_alloc;
        unsigned long                sk_tsq_flags;
        union {
                struct sk_buff        *sk_send_head;
                struct rb_root        tcp_rtx_queue;
        };
        struct sk_buff_head        sk_write_queue;
        u32                        sk_dst_pending_confirm;
        u32                        sk_pacing_status; /* see enum sk_pacing */
        struct page_frag        sk_frag;
        struct timer_list        sk_timer;

        unsigned long                sk_pacing_rate; /* bytes per second */
        atomic_t                sk_zckey;
        atomic_t                sk_tskey;
        __cacheline_group_end(sock_write_tx);

        __cacheline_group_begin(sock_read_tx);
        unsigned long                sk_max_pacing_rate;
        long                        sk_sndtimeo;
        u32                        sk_priority;
        u32                        sk_mark;
        kuid_t                        sk_uid;
        u16                        sk_protocol;
        u16                        sk_type;
        struct dst_entry __rcu        *sk_dst_cache;
        netdev_features_t        sk_route_caps;
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sk_buff*                (*sk_validate_xmit_skb)(struct sock *sk,
                                                        struct net_device *dev,
                                                        struct sk_buff *skb);
#endif
        u16                        sk_gso_type;
        u16                        sk_gso_max_segs;
        unsigned int                sk_gso_max_size;
        gfp_t                        sk_allocation;
        u32                        sk_txhash;
        int                        sk_sndbuf;
        u8                        sk_pacing_shift;
        bool                        sk_use_task_frag;
        __cacheline_group_end(sock_read_tx);

        /*
         * Because of non atomicity rules, all
         * changes are protected by socket lock.
         */
        u8                        sk_gso_disabled : 1,
                                sk_kern_sock : 1,
                                sk_no_check_tx : 1,
                                sk_no_check_rx : 1;
        u8                        sk_shutdown;
        unsigned long                sk_lingertime;
        struct proto                *sk_prot_creator;
        rwlock_t                sk_callback_lock;
        u32                        sk_ack_backlog;
        u32                        sk_max_ack_backlog;
        unsigned long                sk_ino;
        spinlock_t                sk_peer_lock;
        int                        sk_bind_phc;
        struct pid                *sk_peer_pid;
        const struct cred        *sk_peer_cred;

        ktime_t                        sk_stamp;
#if BITS_PER_LONG==32
        seqlock_t                sk_stamp_seq;
#endif
        int                        sk_disconnects;

        /*
         * One byte with three views: sk_txrehash, the whole
         * sk_scm_recv_flags value, or the individual sk_scm_* bits.
         */
        union {
                u8                sk_txrehash;
                u8                sk_scm_recv_flags;
                struct {
                        u8        sk_scm_credentials : 1,
                                sk_scm_security : 1,
                                sk_scm_pidfd : 1,
                                sk_scm_rights : 1,
                                sk_scm_unused : 4;
                };
        };
        u8                        sk_clockid;
        u8                        sk_txtime_deadline_mode : 1,
                                sk_txtime_report_errors : 1,
                                sk_txtime_unused : 6;
        /* evaluates to non-zero when any bit of FLAG is set in SK's sk_bpf_cb_flags */
#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
        u8                        sk_bpf_cb_flags;

        /* low bits carry SK_USER_DATA_* flags; see the accessors below the struct */
        void                        *sk_user_data;
#ifdef CONFIG_SECURITY
        void                        *sk_security;
#endif
        struct sock_cgroup_data        sk_cgrp_data;
        void                        (*sk_state_change)(struct sock *sk);
        void                        (*sk_write_space)(struct sock *sk);
        void                        (*sk_error_report)(struct sock *sk);
        int                        (*sk_backlog_rcv)(struct sock *sk,
                                                  struct sk_buff *skb);
        void                    (*sk_destruct)(struct sock *sk);
        struct sock_reuseport __rcu        *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
        struct bpf_local_storage __rcu        *sk_bpf_storage;
#endif
        struct numa_drop_counters *sk_drop_counters;
        struct rcu_head                sk_rcu;
        netns_tracker                ns_tracker;
        struct xarray                sk_user_frags;

#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
        struct module                *sk_owner;
#endif
};

/*
 * Pairs a socket pointer with a local_lock_t.
 * NOTE(review): presumably used for per-CPU sockets accessed from BH
 * context under @bh_lock -- the users are not visible here, confirm.
 */
struct sock_bh_locked {
        struct sock *sock;
        local_lock_t bh_lock;
};

/*
 * Values stored in sk->sk_pacing_status.
 * NOTE(review): per the struct sock kernel-doc ("requested, handled by
 * sch_fq"), SK_PACING_NEEDED presumably means pacing was requested and
 * SK_PACING_FQ that the fq packet scheduler handles it -- confirm.
 */
enum sk_pacing {
        SK_PACING_NONE                = 0,
        SK_PACING_NEEDED        = 1,
        SK_PACING_FQ                = 2,
};

/* flag bits in sk_user_data
 *
 * The three lowest bits of the sk_user_data value are used as flags;
 * the remaining bits hold the pointer itself.
 *
 * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data might
 *   not be suitable for copying when cloning the socket. For instance,
 *   it can point to a reference counted object. sk_user_data bottom
 *   bit is set if pointer must not be copied.
 *
 * - SK_USER_DATA_BPF:         Mark whether sk_user_data field is
 *   managed/owned by a BPF reuseport array. This bit should be set
 *   when sk_user_data's sk is added to the bpf's reuseport_array.
 *
 * - SK_USER_DATA_PSOCK:       Mark whether pointer stored in
 *   sk_user_data points to psock type. This bit should be set
 *   when sk_user_data is assigned to a psock object.
 *
 * - SK_USER_DATA_PTRMASK:     Mask with all of the flag bits above
 *   cleared; AND-ing sk_user_data with it yields the bare pointer.
 */
#define SK_USER_DATA_NOCOPY        1UL
#define SK_USER_DATA_BPF        2UL
#define SK_USER_DATA_PSOCK        4UL
#define SK_USER_DATA_PTRMASK        ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
                                  SK_USER_DATA_PSOCK)

/**
 * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
 * @sk: socket
 */
static inline bool sk_user_data_is_nocopy(const struct sock *sk)
{
        return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY);
}

/* sk->sk_user_data viewed as a 'void __rcu *' lvalue, for use with the RCU accessors */
#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))

/**
 * __locked_read_sk_user_data_with_flags - read sk_user_data under
 * sk_callback_lock and return its pointer part, provided every bit of
 * @flags is set in the stored value
 *
 * @sk: socket
 * @flags: flag bits that must all be present (SK_USER_DATA_* only)
 *
 * The caller must be holding sk->sk_callback_lock.
 *
 * Warns once if @flags contains bits outside the flag area.
 *
 * Return: the stored pointer with the flag bits masked off, or NULL when
 * at least one requested flag is missing.
 */
static inline void *
__locked_read_sk_user_data_with_flags(const struct sock *sk,
                                      uintptr_t flags)
{
        uintptr_t raw;

        raw = (uintptr_t)rcu_dereference_check(__sk_user_data(sk),
                                               lockdep_is_held(&sk->sk_callback_lock));

        WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

        if ((raw & flags) != flags)
                return NULL;

        return (void *)(raw & SK_USER_DATA_PTRMASK);
}

/**
 * __rcu_dereference_sk_user_data_with_flags - rcu_dereference()
 * sk_user_data and return its pointer part, provided every bit of
 * @flags is set in the stored value
 *
 * @sk: socket
 * @flags: flag bits that must all be present (SK_USER_DATA_* only)
 *
 * Warns once if @flags contains bits outside the flag area.
 *
 * Return: the stored pointer with the flag bits masked off, or NULL when
 * at least one requested flag is missing.
 */
static inline void *
__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
                                          uintptr_t flags)
{
        uintptr_t raw;

        raw = (uintptr_t)rcu_dereference(__sk_user_data(sk));

        WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

        if ((raw & flags) != flags)
                return NULL;

        return (void *)(raw & SK_USER_DATA_PTRMASK);
}

/*
 * rcu_dereference_sk_user_data - fetch the pointer part of sk_user_data.
 * With flags == 0 the flag test always passes, so the result is simply
 * the stored value with the SK_USER_DATA_* flag bits masked off.
 */
#define rcu_dereference_sk_user_data(sk)                                \
        __rcu_dereference_sk_user_data_with_flags(sk, 0)
/*
 * __rcu_assign_sk_user_data_with_flags - publish (ptr | flags) into
 * sk_user_data via rcu_assign_pointer().
 * Warns once if @ptr has any of the flag bits set, and once if @flags
 * has bits outside the flag area. @ptr and @flags are each evaluated
 * exactly once.
 */
#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)                \
({                                                                        \
        uintptr_t __tmp1 = (uintptr_t)(ptr),                                \
                  __tmp2 = (uintptr_t)(flags);                                \
        WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);                        \
        WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);                        \
        rcu_assign_pointer(__sk_user_data((sk)),                        \
                           __tmp1 | __tmp2);                                \
})
/* rcu_assign_sk_user_data - publish @ptr into sk_user_data with no flag bits set */
#define rcu_assign_sk_user_data(sk, ptr)                                \
        __rcu_assign_sk_user_data_with_flags(sk, ptr, 0)

static inline
struct net *sock_net(const struct sock *sk)
{
        return read_pnet(&sk->sk_net);
}

static inline
void sock_net_set(struct sock *sk, struct net *net)
{
        write_pnet(&sk->sk_net, net);
}

/*
 * Values for sk_reuse.
 *
 * SK_NO_REUSE and SK_CAN_REUSE state whether the socket does not, or
 * does, allow its port to be reused by someone else. SK_FORCE_REUSE
 * means that the socket will reuse everybody else's port without
 * looking at the other sockets' sk_reuse values.
 */

#define SK_NO_REUSE        0
#define SK_CAN_REUSE        1
#define SK_FORCE_REUSE        2

/*
 * Declaration only; the implementation is not part of this header.
 * NOTE(review): presumably stores @val into sk->sk_peek_off -- confirm.
 */
int sk_set_peek_off(struct sock *sk, int val);

/*
 * Return the socket's current peek offset when MSG_PEEK is set in
 * @flags, and 0 for any other receive. The MSG_PEEK case is hinted
 * as the unlikely one.
 */
static inline int sk_peek_offset(const struct sock *sk, int flags)
{
        return unlikely(flags & MSG_PEEK) ? READ_ONCE(sk->sk_peek_off) : 0;
}

/*
 * Move the peek offset back by @val, never letting it drop below zero.
 * A negative stored offset is left untouched.
 */
static inline void sk_peek_offset_bwd(struct sock *sk, int val)
{
        s32 cur = READ_ONCE(sk->sk_peek_off);
        s32 next;

        if (likely(cur < 0))
                return;

        next = max_t(s32, cur - val, 0);
        WRITE_ONCE(sk->sk_peek_off, next);
}

/* Advance the peek offset by @val: a backward move by the negated amount. */
static inline void sk_peek_offset_fwd(struct sock *sk, int val)
{
        const int delta = -val;

        sk_peek_offset_bwd(sk, delta);
}

/*
 * Hashed lists helper routines
 */
/* Map a pointer to a socket's sk_node member back to the enclosing struct sock. */
static inline struct sock *sk_entry(const struct hlist_node *node)
{
        struct sock *sk = hlist_entry(node, struct sock, sk_node);

        return sk;
}

/*
 * Socket owning the first node of @head. No emptiness check is made
 * here; sk_head() is the checked variant.
 */
static inline struct sock *__sk_head(const struct hlist_head *head)
{
        struct hlist_node *first = head->first;

        return hlist_entry(first, struct sock, sk_node);
}

/* First socket on @head, or NULL when the list is empty. */
static inline struct sock *sk_head(const struct hlist_head *head)
{
        if (hlist_empty(head))
                return NULL;

        return __sk_head(head);
}

/*
 * Socket owning the first node of the nulls list @head. No emptiness
 * check is made here; sk_nulls_head() is the checked variant.
 */
static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
{
        struct hlist_nulls_node *first = head->first;

        return hlist_nulls_entry(first, struct sock, sk_nulls_node);
}

/* First socket on the nulls list @head, or NULL when the list is empty. */
static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
{
        if (hlist_nulls_empty(head))
                return NULL;

        return __sk_nulls_head(head);
}

/* Socket following @sk on its sk_node list, or NULL when @sk is the last one. */
static inline struct sock *sk_next(const struct sock *sk)
{
        struct hlist_node *next = sk->sk_node.next;

        return hlist_entry_safe(next, struct sock, sk_node);
}

/*
 * Socket following @sk on its nulls list, or NULL when the next pointer
 * is a nulls end-of-list marker.
 */
static inline struct sock *sk_nulls_next(const struct sock *sk)
{
        if (is_a_nulls(sk->sk_nulls_node.next))
                return NULL;

        return hlist_nulls_entry(sk->sk_nulls_node.next,
                                 struct sock, sk_nulls_node);
}

/* True when the socket's sk_node is not linked into any hash list. */
static inline bool sk_unhashed(const struct sock *sk)
{
        const struct hlist_node *node = &sk->sk_node;

        return hlist_unhashed(node);
}

/* Logical inverse of sk_unhashed(). */
static inline bool sk_hashed(const struct sock *sk)
{
        const bool unhashed = sk_unhashed(sk);

        return !unhashed;
}

/*
 * Clear @node's pprev back-pointer. __sk_del_node_init() calls this
 * after unlinking, so that sk_unhashed() reports the socket as unhashed.
 */
static inline void sk_node_init(struct hlist_node *node)
{
        node->pprev = NULL;
}

/* Unlink the socket's sk_node from its list; the node is not re-initialised. */
static inline void __sk_del_node(struct sock *sk)
{
        struct hlist_node *node = &sk->sk_node;

        __hlist_del(node);
}

/*
 * Unlink @sk from its hash list if it is currently hashed and
 * re-initialise the node. Does not touch the reference count.
 * NB: equivalent to hlist_del_init_rcu
 *
 * Returns true when the socket was hashed and has been removed.
 */
static inline bool __sk_del_node_init(struct sock *sk)
{
        if (sk_unhashed(sk))
                return false;

        __sk_del_node(sk);
        sk_node_init(&sk->sk_node);
        return true;
}

/* Grab socket reference count. This operation is valid only
   when sk is ALREADY grabbed f.e. it is found in hash table
   or a list and the lookup is made under lock preventing hash table
   modifications.
 */

static __always_inline void sock_hold(struct sock *sk)
{
        refcount_inc(&sk->sk_refcnt);
}

/* Ungrab socket in the context, which assumes that socket refcnt
   cannot hit zero, f.e. it is true in context of any socketcall.
 */
static __always_inline void __sock_put(struct sock *sk)
{
        refcount_dec(&sk->sk_refcnt);
}

static inline bool sk_del_node_init(struct sock *sk)
{
        bool rc = __sk_del_node_init(sk);

        if (rc) {
                /* paranoid for a while -acme */
                WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
                __sock_put(sk);
        }
        return rc;
}
/* the _rcu name is a plain alias for sk_del_node_init() */
#define sk_del_node_init_rcu(sk)        sk_del_node_init(sk)

/*
 * Unlink @sk from its nulls hash list with hlist_nulls_del_init_rcu()
 * if it is currently hashed. Does not touch the reference count.
 *
 * Returns true when the socket was hashed and has been removed.
 */
static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk)
{
        if (sk_unhashed(sk))
                return false;

        hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
        return true;
}

static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
{
        bool rc = __sk_nulls_del_node_init_rcu(sk);

        if (rc) {
                /* paranoid for a while -acme */
                WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
                __sock_put(sk);
        }
        return rc;
}

/* Link the socket's sk_node at the head of @list; no reference is taken. */
static inline void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
        struct hlist_node *node = &sk->sk_node;

        hlist_add_head(node, list);
}

static inline void sk_add_node(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        __sk_add_node(sk, list);
}

static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
            sk->sk_family == AF_INET6)
                hlist_add_tail_rcu(&sk->sk_node, list);
        else
                hlist_add_head_rcu(&sk->sk_node, list);
}

/* Take a reference on @sk, then append its sk_node at the tail of @list (RCU variant). */
static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list)
{
        struct hlist_node *node = &sk->sk_node;

        sock_hold(sk);
        hlist_add_tail_rcu(node, list);
}

/* Link the socket's sk_nulls_node at the head of @list (RCU variant); no reference is taken. */
static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        struct hlist_nulls_node *node = &sk->sk_nulls_node;

        hlist_nulls_add_head_rcu(node, list);
}

/* Link the socket's sk_nulls_node at the tail of @list (RCU variant); no reference is taken. */
static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        struct hlist_nulls_node *node = &sk->sk_nulls_node;

        hlist_nulls_add_tail_rcu(node, list);
}

static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        sock_hold(sk);
        __sk_nulls_add_node_rcu(sk, list);
}

/*
 * Unlink @sk from its bind list by calling __hlist_del() on sk_bind_node.
 * NOTE(review): __hlist_del() presumably leaves the node's own pointers
 * as they were (no re-initialisation) - confirm before reusing the node.
 */
static inline void __sk_del_bind_node(struct sock *sk)
{
        __hlist_del(&sk->sk_bind_node);
}

/*
 * Link @sk at the head of @list through its sk_bind_node. Unlike
 * sk_add_node(), no socket reference is taken here.
 */
static inline void sk_add_bind_node(struct sock *sk,
                                        struct hlist_head *list)
{
        hlist_add_head(&sk->sk_bind_node, list);
}

/*
 * Socket list iterators: thin wrappers around the hlist / hlist_nulls
 * iteration macros with the member name filled in.
 */

/* Walk sockets linked through sk_node (plain and RCU flavours). */
#define sk_for_each(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
        hlist_for_each_entry_rcu(__sk, list, sk_node)
/* Walk sockets linked through sk_nulls_node; @node is the raw cursor. */
#define sk_nulls_for_each(__sk, node, list) \
        hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
#define sk_nulls_for_each_rcu(__sk, node, list) \
        hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
/* Continue a sk_node walk starting at the current @__sk. */
#define sk_for_each_from(__sk) \
        hlist_for_each_entry_from(__sk, sk_node)
/*
 * Continue a nulls walk from @__sk. Expands to an unbraced `if` (which
 * also seeds @node from @__sk) guarding the loop, so a NULL @__sk skips
 * the loop entirely. Because of that leading `if`, brace any enclosing
 * if/else that uses this macro as its body.
 */
#define sk_nulls_for_each_from(__sk, node) \
        if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
                hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
/* sk_node walk using the _safe iterator; @tmp holds the next entry. */
#define sk_for_each_safe(__sk, tmp, list) \
        hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
/* Walk sockets linked through sk_bind_node (plain and _safe flavours). */
#define sk_for_each_bound(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_bind_node)
#define sk_for_each_bound_safe(__sk, tmp, list) \
        hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node)

/**
 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 * @offset:        offset of hlist_node within the struct.
 *
 * Both the first node and each next node are fetched with
 * rcu_dereference(). On every iteration @tpos is set to the address of
 * @pos minus @offset bytes; the loop ends when @pos becomes NULL.
 *
 * Every macro argument is parenthesized in the expansion so that an
 * expression argument (for example an @offset written as "a + b") is
 * applied as a single operand rather than being split by operator
 * precedence.
 */
#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)                       \
        for ((pos) = rcu_dereference(hlist_first_rcu(head));                       \
             (pos) != NULL &&                                                       \
                ({ (tpos) = (typeof(*(tpos)) *)((void *)(pos) - (offset)); 1;}); \
             (pos) = rcu_dereference(hlist_next_rcu(pos)))

/*
 * Return the user namespace recorded in the credentials of the file that
 * backs @sk's socket (sk->sk_socket->file->f_cred->user_ns).
 *
 * None of the pointers in that chain is checked for NULL here.
 */
static inline struct user_namespace *sk_user_ns(const struct sock *sk)
{
        /* Careful only use this in a context where these parameters
         * can not change and must all be valid, such as recvmsg from
         * userspace.
         */
        return sk->sk_socket->file->f_cred->user_ns;
}

/*
 * Sock flags.
 *
 * Each enumerator is a bit number inside sk->sk_flags: the bits are
 * changed with sock_set_flag()/sock_reset_flag() and tested with
 * sock_flag(), and masks such as SK_FLAGS_TIMESTAMP are built as
 * (1UL << flag). Values are assigned implicitly by position, so
 * inserting or reordering entries renumbers every later flag.
 */
enum sock_flags {
        SOCK_DEAD,
        SOCK_DONE,
        SOCK_URGINLINE,
        SOCK_KEEPOPEN,
        SOCK_LINGER,
        SOCK_DESTROY,
        SOCK_BROADCAST,
        SOCK_TIMESTAMP,
        SOCK_ZAPPED,
        SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
        SOCK_DBG, /* %SO_DEBUG setting */
        SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
        SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
        SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
        SOCK_MEMALLOC, /* VM depends on this socket for swapping */
        SOCK_TIMESTAMPING_RX_SOFTWARE,  /* %SOF_TIMESTAMPING_RX_SOFTWARE */
        SOCK_FASYNC, /* fasync() active */
        SOCK_RXQ_OVFL,
        SOCK_ZEROCOPY, /* buffers from userspace */
        SOCK_WIFI_STATUS, /* push wifi status to userspace */
        SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS.
                     * Will use last 4 bytes of packet sent from
                     * user-space instead.
                     */
        SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
        SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
        SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
        SOCK_TXTIME,
        SOCK_XDP, /* XDP is attached */
        SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
        SOCK_RCVMARK, /* Receive SO_MARK  ancillary data with packet */
        SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */
        SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */
};

/* Mask of the SOCK_TIMESTAMP and SOCK_TIMESTAMPING_RX_SOFTWARE flag bits. */
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
/*
 * The highest bit of sk_tsflags is reserved for the kernel-internal
 * SOCKCM_FLAG_TS_OPT_ID. A check in core/sock.c ensures that
 * SOF_TIMESTAMPING* values do not reach this reserved bit.
 */
#define SOCKCM_FLAG_TS_OPT_ID        BIT(31)

/*
 * Overwrite @nsk's sk_flags with @osk's: a plain assignment of the whole
 * field, so any flags previously set on @nsk are lost.
 */
static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk)
{
        nsk->sk_flags = osk->sk_flags;
}

/*
 * Set bit @flag in sk->sk_flags.
 * NOTE(review): uses __set_bit(), the non-atomic bitop, so concurrent
 * writers presumably rely on external serialisation - confirm.
 */
static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
{
        __set_bit(flag, &sk->sk_flags);
}

/*
 * Clear bit @flag in sk->sk_flags.
 * NOTE(review): uses __clear_bit(), the non-atomic bitop, so concurrent
 * writers presumably rely on external serialisation - confirm.
 */
static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
        __clear_bit(flag, &sk->sk_flags);
}

/*
 * Set or clear flag @bit on @sk according to @valbool: any non-zero
 * value sets the flag, zero clears it.
 */
static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
                                     int valbool)
{
        if (!valbool) {
                sock_reset_flag(sk, bit);
                return;
        }

        sock_set_flag(sk, bit);
}

/* Return true when bit @flag is set in sk->sk_flags. */
static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
{
        return test_bit(flag, &sk->sk_flags);
}

#ifdef CONFIG_NET
DECLARE_STATIC_KEY_FALSE(memalloc_socks_key);
/*
 * Non-zero when the memalloc_socks_key static branch is enabled; the
 * branch is annotated as unlikely.
 * NOTE(review): presumably toggled by sk_set_memalloc() /
 * sk_clear_memalloc() - confirm in the .c file.
 */
static inline int sk_memalloc_socks(void)
{
        return static_branch_unlikely(&memalloc_socks_key);
}

void __receive_sock(struct file *file);
#else

/* !CONFIG_NET stub: always reports 0. */
static inline int sk_memalloc_socks(void)
{
        return 0;
}

/* !CONFIG_NET stub: intentionally does nothing. */
static inline void __receive_sock(struct file *file)
{ }
#endif

/*
 * Return @gfp_mask with __GFP_MEMALLOC added when that bit is set in the
 * socket's own sk_allocation. All other bits of @gfp_mask pass through
 * unchanged and no other bit of sk_allocation is inherited.
 */
static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
{
        gfp_t memalloc = sk->sk_allocation & __GFP_MEMALLOC;

        return gfp_mask | memalloc;
}

/*
 * Decrement sk_ack_backlog by one. The store uses WRITE_ONCE() (readers
 * such as sk_acceptq_is_full() use READ_ONCE()); the read-modify-write
 * as a whole is not atomic.
 */
static inline void sk_acceptq_removed(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
}

/*
 * Increment sk_ack_backlog by one. The store uses WRITE_ONCE() (readers
 * such as sk_acceptq_is_full() use READ_ONCE()); the read-modify-write
 * as a whole is not atomic.
 */
static inline void sk_acceptq_added(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}

/*
 * Return true when sk_ack_backlog is strictly greater than
 * sk_max_ack_backlog. Both fields are sampled with READ_ONCE().
 *
 * Note: If you think the test should be:
 *        return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.")
 */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
        return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}

/*
 * Compute minimal free write space needed to queue new packets.
 *
 * The value is half of the currently queued write memory
 * (sk_wmem_queued >> 1), sampled with READ_ONCE().
 */
static inline int sk_stream_min_wspace(const struct sock *sk)
{
        return READ_ONCE(sk->sk_wmem_queued) >> 1;
}

/*
 * Remaining write space: sk_sndbuf minus sk_wmem_queued, each sampled
 * with READ_ONCE(). The result is not clamped, so it is negative when
 * more is queued than sk_sndbuf allows.
 */
static inline int sk_stream_wspace(const struct sock *sk)
{
        return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued);
}

/*
 * Add @val (which may be negative) to sk_wmem_queued. The store uses
 * WRITE_ONCE(), pairing with the READ_ONCE() readers in this header; the
 * read-modify-write as a whole is not atomic.
 */
static inline void sk_wmem_queued_add(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}

/*
 * Add @val (which may be negative, see sk_mem_charge()) to
 * sk_forward_alloc. Only the store is marked; the read-modify-write as a
 * whole is not atomic.
 */
static inline void sk_forward_alloc_add(struct sock *sk, int val)
{
        /* Paired with lockless reads of sk->sk_forward_alloc */
        WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val);
}

void sk_stream_write_space(struct sock *sk);

/*
 * OOB backlog add.
 *
 * Append @skb to the singly linked backlog queue kept in
 * sk->sk_backlog.{head,tail}. sk_backlog.len is NOT updated here;
 * sk_add_backlog() does that after calling this helper.
 */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        /*
         * Do not leave the skb with a non-refcounted dst: we are going to
         * leave the RCU read-side section.
         */
        skb_dst_force(skb);

        /* Empty queue: skb becomes the head; otherwise chain after tail. */
        if (!sk->sk_backlog.tail)
                WRITE_ONCE(sk->sk_backlog.head, skb);
        else
                sk->sk_backlog.tail->next = skb;

        /* tail is read locklessly elsewhere (see sk_flush_backlog()). */
        WRITE_ONCE(sk->sk_backlog.tail, skb);
        skb->next = NULL;
}

/*
 * Take into account size of receive queue and backlog queue
 * Do not take into account this skb truesize,
 * to allow even a single big packet to come.
 *
 * Returns true when sk_backlog.len plus sk_rmem_alloc is strictly
 * greater than @limit. The sum is held in an unsigned int before the
 * comparison.
 */
static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit)
{
        unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);

        return qsize > limit;
}

/*
 * Queue @skb on the socket backlog.
 *
 * The per-socket spinlock must be held here.
 *
 * Returns 0 on success, in which case sk_backlog.len has grown by
 * skb->truesize. Returns -ENOBUFS when sk_rcvqueues_full() reports the
 * queues already exceed @limit, or -ENOMEM when @skb came from pfmemalloc
 * reserves and the socket is not SOCK_MEMALLOC. On failure the skb is not
 * queued and remains owned by the caller.
 */
static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
                                              unsigned int limit)
{
        int err = 0;

        if (sk_rcvqueues_full(sk, limit)) {
                err = -ENOBUFS;
        } else if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
                /*
                 * Only SOCK_MEMALLOC sockets may consume skbs taken from
                 * the pfmemalloc reserves, since those sockets are the
                 * ones helping to free memory.
                 */
                err = -ENOMEM;
        } else {
                __sk_add_backlog(sk, skb);
                sk->sk_backlog.len += skb->truesize;
        }

        return err;
}

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);

INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb));

/*
 * Deliver @skb to the socket's backlog receive handler.
 *
 * The common case calls sk->sk_backlog_rcv through INDIRECT_CALL_INET()
 * with tcp_v6_do_rcv / tcp_v4_do_rcv as the likely targets. Only when
 * memalloc sockets are in use and @skb is a pfmemalloc skb is the
 * out-of-line __sk_backlog_rcv() used instead.
 */
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
                return INDIRECT_CALL_INET(sk->sk_backlog_rcv,
                                          tcp_v6_do_rcv,
                                          tcp_v4_do_rcv,
                                          sk, skb);

        return __sk_backlog_rcv(sk, skb);
}

/*
 * Record the current CPU in sk->sk_incoming_cpu. The field is only
 * written when the value actually changes, so the common case is a
 * single lockless read.
 */
static inline void sk_incoming_cpu_update(struct sock *sk)
{
        int cpu = raw_smp_processor_id();

        if (likely(READ_ONCE(sk->sk_incoming_cpu) == cpu))
                return;

        WRITE_ONCE(sk->sk_incoming_cpu, cpu);
}


/*
 * Remember @skb's hash in sk->sk_rxhash. The store is skipped when the
 * stored value already matches. Compiles to nothing without CONFIG_RPS.
 */
static inline void sock_rps_save_rxhash(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
        /* The WRITE_ONCE() below is paired with the READ_ONCE() in this
         * function, and with another one in sock_rps_record_flow().
         */
        if (likely(READ_ONCE(sk->sk_rxhash) == skb->hash))
                return;

        WRITE_ONCE(sk->sk_rxhash, skb->hash);
#endif
}

/*
 * Reset sk->sk_rxhash to 0. Compiles to nothing without CONFIG_RPS.
 */
static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
        /* Paired with READ_ONCE() in sock_rps_record_flow() */
        WRITE_ONCE(sk->sk_rxhash, 0);
#endif
}

/*
 * sk_wait_event - drop the socket lock and wait for @__condition
 * @__sk:        socket, locked by the caller on entry and locked again on exit
 * @__timeo:        pointer to the remaining timeout; updated with the value
 *                returned by wait_woken() when a sleep happens
 * @__condition: expression to wait for; it is evaluated once with the
 *                socket lock released and, unless a disconnect was seen,
 *                once more after the lock has been re-taken
 * @__wait:        wait queue entry handed to wait_woken()
 *
 * The macro snapshots sk_disconnects, releases the socket lock, and
 * sleeps in TASK_INTERRUPTIBLE state only if @__condition is false. After
 * re-locking, the result is -EPIPE when sk_disconnects changed while the
 * lock was dropped, otherwise the freshly re-evaluated @__condition.
 *
 * @__sk and @__condition are parenthesized wherever they are expanded
 * inside a larger expression so that expression arguments bind as a
 * single operand.
 */
#define sk_wait_event(__sk, __timeo, __condition, __wait)                \
        ({        int __rc, __dis = (__sk)->sk_disconnects;                \
                release_sock(__sk);                                        \
                __rc = (__condition);                                        \
                if (!__rc) {                                                \
                        *(__timeo) = wait_woken(__wait,                        \
                                                TASK_INTERRUPTIBLE,        \
                                                *(__timeo));                \
                }                                                        \
                sched_annotate_sleep();                                        \
                lock_sock(__sk);                                        \
                __rc = __dis == (__sk)->sk_disconnects ? (__condition) : -EPIPE; \
                __rc;                                                        \
        })

int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
void sk_stream_wait_close(struct sock *sk, long timeo_p);
int sk_stream_error(struct sock *sk, int flags, int err);
void sk_stream_kill_queues(struct sock *sk);
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);

void __sk_flush_backlog(struct sock *sk);

/*
 * Flush the socket backlog if there is one.
 *
 * Peeks at sk_backlog.tail without locking; when it is non-NULL,
 * __sk_flush_backlog() is called and true is returned. Returns false
 * when the backlog looked empty and nothing was done.
 */
static inline bool sk_flush_backlog(struct sock *sk)
{
        if (likely(!READ_ONCE(sk->sk_backlog.tail)))
                return false;

        __sk_flush_backlog(sk);
        return true;
}

int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);

struct request_sock_ops;
struct timewait_sock_ops;
struct inet_hashinfo;
struct raw_hashinfo;
struct smc_hashinfo;
struct module;
struct sk_psock;

/*
 * Caches using SLAB_TYPESAFE_BY_RCU must leave the .next pointer of nulls
 * nodes unmodified, so zeroing a socket object has to step around it.
 *
 * Clears bytes [0, offsetof(sk_node.next)) and then everything from
 * sk_node.pprev up to @size; the bytes in between are left untouched.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
        const size_t next_off = offsetof(struct sock, sk_node.next);
        const size_t pprev_off = offsetof(struct sock, sk_node.pprev);

        /* Zero whatever precedes sk_node.next, if anything does. */
        if (next_off)
                memset(sk, 0, next_off);

        /* Zero from sk_node.pprev through the end of the object. */
        memset(&sk->sk_node.pprev, 0, size - pprev_off);
}

/*
 * Argument bundle passed by pointer to the ->accept() operation of
 * struct proto.
 * NOTE(review): the meaning of the individual fields is not visible in
 * this header - confirm against the accept() implementations before
 * relying on them.
 */
struct proto_accept_arg {
        int flags;
        int err;
        int is_empty;
        bool kern;
};

/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 *
 * A socket reaches its protocol block through sk->sk_prot; the inline
 * helpers in this header dereference it for hashing, memory accounting
 * and write-space checks. The layout is randomized (__randomize_layout),
 * so code must not depend on member order.
 */
struct proto {
        void                        (*close)(struct sock *sk,
                                        long timeout);
        int                        (*pre_connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*disconnect)(struct sock *sk, int flags);

        struct sock *                (*accept)(struct sock *sk,
                                          struct proto_accept_arg *arg);

        int                        (*ioctl)(struct sock *sk, int cmd,
                                         int *karg);
        int                        (*init)(struct sock *sk);
        void                        (*destroy)(struct sock *sk);
        void                        (*shutdown)(struct sock *sk, int how);
        int                        (*setsockopt)(struct sock *sk, int level,
                                        int optname, sockptr_t optval,
                                        unsigned int optlen);
        int                        (*getsockopt)(struct sock *sk, int level,
                                        int optname, char __user *optval,
                                        int __user *option);
        void                        (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
        int                        (*compat_ioctl)(struct sock *sk,
                                        unsigned int cmd, unsigned long arg);
#endif
        int                        (*sendmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len);
        int                        (*recvmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len, int flags, int *addr_len);
        void                        (*splice_eof)(struct socket *sock);
        int                        (*bind)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);
        int                        (*bind_add)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);

        int                        (*backlog_rcv) (struct sock *sk,
                                                struct sk_buff *skb);
        bool                        (*bpf_bypass_getsockopt)(int level,
                                                         int optname);

        void                (*release_cb)(struct sock *sk);

        /* Keeping track of sk's, looking them up, and port selection methods. */
        /* __sk_prot_rehash() calls unhash() and then hash() back to back. */
        int                        (*hash)(struct sock *sk);
        void                        (*unhash)(struct sock *sk);
        void                        (*rehash)(struct sock *sk);
        int                        (*get_port)(struct sock *sk, unsigned short snum);
        void                        (*put_port)(struct sock *sk);
#ifdef CONFIG_BPF_SYSCALL
        int                        (*psock_update_sk_prot)(struct sock *sk,
                                                        struct sk_psock *psock,
                                                        bool restore);
#endif

        /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
        /* Index into prot_inuse.val[] (see sock_prot_inuse_add()). */
        unsigned int                inuse_idx;
#endif

        /*
         * Optional: when NULL, __sk_stream_memory_free() reports the stream
         * as free once its own sndbuf check has passed.
         */
        bool                        (*stream_memory_free)(const struct sock *sk, int wake);
        bool                        (*sock_is_readable)(struct sock *sk);
        /* Memory pressure */
        void                        (*enter_memory_pressure)(struct sock *sk);
        void                        (*leave_memory_pressure)(struct sock *sk);
        /* NULL here means no memory accounting (see sk_has_account()). */
        atomic_long_t                *memory_allocated;        /* Current allocated memory. */
        int  __percpu                *per_cpu_fw_alloc;
        struct percpu_counter        *sockets_allocated;        /* Current number of sockets. */

        /*
         * Pressure flag: try to collapse.
         * Technical note: it is used by multiple contexts non atomically.
         * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
         * All the __sk_mem_schedule() is of this nature: accounting
         * is strict, actions are advisory and have some latency.
         */
        unsigned long                *memory_pressure;
        /* Limits in pages, read through sk_prot_mem_limits(). */
        long                        *sysctl_mem;

        int                        *sysctl_wmem;
        int                        *sysctl_rmem;
        u32                        sysctl_wmem_offset;
        u32                        sysctl_rmem_offset;

        int                        max_header;
        bool                        no_autobind;

        struct kmem_cache        *slab;
        unsigned int                obj_size;
        unsigned int                ipv6_pinfo_offset;
        slab_flags_t                slab_flags;
        unsigned int                useroffset;        /* Usercopy region offset */
        unsigned int                usersize;        /* Usercopy region size */

        struct request_sock_ops        *rsk_prot;
        struct timewait_sock_ops *twsk_prot;

        /* Protocol-specific hash table; only one member is meaningful. */
        union {
                struct inet_hashinfo        *hashinfo;
                struct udp_table        *udp_table;
                struct raw_hashinfo        *raw_hash;
                struct smc_hashinfo        *smc_hash;
        } h;

        struct module                *owner;

        char                        name[32];

        struct list_head        node;
        int                        (*diag_destroy)(struct sock *sk, int err);
} __randomize_layout;

int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
int sock_load_diag_module(int family, int protocol);

INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));

/*
 * Decide whether the stream socket @sk has memory available for writing.
 *
 * Returns false as soon as sk_wmem_queued has reached sk_sndbuf.
 * Otherwise the answer comes from the protocol's optional
 * ->stream_memory_free() hook (tcp_stream_memory_free being the expected
 * target of the indirect call); a protocol without the hook is always
 * considered free. @wake is passed through to the hook unchanged.
 */
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
        if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
                return false;

        if (!sk->sk_prot->stream_memory_free)
                return true;

        return INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free,
                                    tcp_stream_memory_free, sk, wake);
}

/* __sk_stream_memory_free() with @wake fixed to 0. */
static inline bool sk_stream_memory_free(const struct sock *sk)
{
        return __sk_stream_memory_free(sk, 0);
}

static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
{
        return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
               __sk_stream_memory_free(sk, wake);
}

/* __sk_stream_is_writeable() with @wake fixed to 0. */
static inline bool sk_stream_is_writeable(const struct sock *sk)
{
        return __sk_stream_is_writeable(sk, 0);
}

/*
 * Check whether the cgroup associated with @sk is a descendant of
 * @ancestor.
 *
 * With CONFIG_SOCK_CGROUP_DATA the result of cgroup_is_descendant() is
 * returned as-is; without it the function always returns -ENOTSUPP, so
 * callers must not treat the value as a plain boolean.
 */
static inline int sk_under_cgroup_hierarchy(struct sock *sk,
                                            struct cgroup *ancestor)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
        return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data),
                                    ancestor);
#else
        return -ENOTSUPP;
#endif
}

/* Batch size handed to percpu_counter_add_batch() for sockets_allocated. */
#define SK_ALLOC_PERCPU_COUNTER_BATCH 16

/* Subtract one from the protocol's sockets_allocated per-cpu counter. */
static inline void sk_sockets_allocated_dec(struct sock *sk)
{
        percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1,
                                 SK_ALLOC_PERCPU_COUNTER_BATCH);
}

/* Add one to the protocol's sockets_allocated per-cpu counter. */
static inline void sk_sockets_allocated_inc(struct sock *sk)
{
        percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1,
                                 SK_ALLOC_PERCPU_COUNTER_BATCH);
}

/*
 * Read the protocol's sockets_allocated counter through
 * percpu_counter_read_positive().
 * NOTE(review): presumably the cheap, approximate read clamped at zero -
 * confirm against the percpu_counter API.
 */
static inline u64
sk_sockets_allocated_read_positive(struct sock *sk)
{
        return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
}

/*
 * Sum @prot's sockets_allocated counter through
 * percpu_counter_sum_positive() and return it as an int.
 * NOTE(review): the percpu_counter result is presumably wider than int,
 * so very large sums would be truncated by the return type - confirm.
 */
static inline int
proto_sockets_allocated_sum_positive(struct proto *prot)
{
        return percpu_counter_sum_positive(prot->sockets_allocated);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR        64        /* should be enough for the first time */
/*
 * Per-cpu in-use counters kept in net->core.prot_inuse: @all is the
 * total adjusted by sock_inuse_add(), @val[] holds one slot per protocol
 * indexed by proto->inuse_idx.
 */
struct prot_inuse {
        int all;
        int val[PROTO_INUSE_NR];
};

/* Add @val (may be negative) to this CPU's in-use count for @prot in @net. */
static inline void sock_prot_inuse_add(const struct net *net,
                                       const struct proto *prot, int val)
{
        this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}

/* Add @val (may be negative) to this CPU's total in-use count for @net. */
static inline void sock_inuse_add(const struct net *net, int val)
{
        this_cpu_add(net->core.prot_inuse->all, val);
}

int sock_prot_inuse_get(struct net *net, struct proto *proto);
int sock_inuse_get(struct net *net);
#else
/* !CONFIG_PROC_FS stubs: in-use accounting is compiled out. */
static inline void sock_prot_inuse_add(const struct net *net,
                                       const struct proto *prot, int val)
{
}

static inline void sock_inuse_add(const struct net *net, int val)
{
}
#endif


/* Re-hash @sk by calling the protocol's ->unhash() followed by ->hash();
 * the return value is whatever ->hash() returns.
 *
 * With per-bucket locks this operation is not atomic anyway, so this
 * two-step version is no worse.
 */
static inline int __sk_prot_rehash(struct sock *sk)
{
        sk->sk_prot->unhash(sk);
        return sk->sk_prot->hash(sk);
}

/* About 10 seconds, expressed in jiffies */
#define SOCK_DESTROY_TIME (10*HZ)

/* Ports 0-1023 can't be bound to unless you are superuser */
#define PROT_SOCK        1024

/* Shutdown bits; SHUTDOWN_MASK is RCV_SHUTDOWN | SEND_SHUTDOWN. */
#define SHUTDOWN_MASK        3
#define RCV_SHUTDOWN        1
#define SEND_SHUTDOWN        2

/*
 * NOTE(review): these two are presumably sk_userlocks bits like
 * SOCK_CONNECT_BIND below (the values continue the same bit sequence) -
 * confirm against their users.
 */
#define SOCK_BINDADDR_LOCK        4
#define SOCK_BINDPORT_LOCK        8
/**
 * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time
 */
#define SOCK_CONNECT_BIND        16

/*
 * A struct socket and its VFS inode embedded in one object, so that
 * SOCKET_I() and SOCK_INODE() can convert between the two with
 * container_of().
 */
struct socket_alloc {
        struct socket socket;
        struct inode vfs_inode;
};

static inline struct socket *SOCKET_I(struct inode *inode)
{
        return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

static inline struct inode *SOCK_INODE(struct socket *socket)
{
        return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}

/*
 * Functions for memory accounting
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);

/* Values for the @kind argument of __sk_mem_schedule(). */
#define SK_MEM_SEND        0
#define SK_MEM_RECV        1

/*
 * Return entry @index of the protocol's sysctl_mem[] array, read with
 * READ_ONCE(). sysctl_mem values are in pages. @index is not
 * range-checked here.
 */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
        return READ_ONCE(sk->sk_prot->sysctl_mem[index]);
}

/*
 * Convert a byte amount into pages, rounding up to the next whole page:
 * PAGE_SIZE - 1 is added before shifting right by PAGE_SHIFT.
 */
static inline int sk_mem_pages(int amt)
{
        return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/*
 * Return true if the socket's protocol supports memory accounting, which
 * is signalled by a non-NULL ->memory_allocated pointer in its proto.
 */
static inline bool sk_has_account(struct sock *sk)
{
        return sk->sk_prot->memory_allocated != NULL;
}

/*
 * Make sure @size bytes of send memory are available to @sk.
 *
 * Succeeds immediately when the protocol does no accounting or when
 * sk_forward_alloc already covers @size. Otherwise only the shortfall is
 * requested from __sk_mem_schedule() with SK_MEM_SEND, and its verdict
 * is returned.
 */
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
        int shortfall;

        if (!sk_has_account(sk))
                return true;

        shortfall = size - sk->sk_forward_alloc;
        if (shortfall <= 0)
                return true;

        return __sk_mem_schedule(sk, shortfall, SK_MEM_SEND);
}

/*
 * Make sure @size bytes of receive memory are available to @sk.
 *
 * Succeeds immediately when the protocol does no accounting or when
 * sk_forward_alloc already covers @size. Otherwise the shortfall is
 * requested from __sk_mem_schedule() with SK_MEM_RECV; if that request
 * is refused, the call still succeeds when @pfmemalloc is true.
 */
static inline bool
__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
{
        int shortfall;

        if (!sk_has_account(sk))
                return true;

        shortfall = size - sk->sk_forward_alloc;
        if (shortfall <= 0)
                return true;

        if (__sk_mem_schedule(sk, shortfall, SK_MEM_RECV))
                return true;

        return pfmemalloc;
}

/*
 * __sk_rmem_schedule() for @size bytes, with the pfmemalloc override
 * taken from skb_pfmemalloc(@skb).
 */
static inline bool
sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size)
{
        return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
}

/*
 * Return how much of the socket's reserved memory is currently unused:
 * sk_reserved_mem minus sk_wmem_queued minus sk_rmem_alloc, floored at
 * zero. Returns 0 straight away when no memory is reserved.
 */
static inline int sk_unused_reserved_mem(const struct sock *sk)
{
        int unused_mem;

        if (likely(!sk->sk_reserved_mem))
                return 0;

        unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued -
                        atomic_read(&sk->sk_rmem_alloc);
        if (unused_mem <= 0)
                return 0;

        return unused_mem;
}

/*
 * Give forward-allocated memory back through __sk_mem_reclaim().
 *
 * Nothing happens for protocols without memory accounting. The
 * reclaimable amount is sk_forward_alloc minus the unused part of the
 * reservation, and it is only handed back once it reaches a full page.
 */
static inline void sk_mem_reclaim(struct sock *sk)
{
        if (sk_has_account(sk)) {
                int spare = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);

                if (spare >= (int)PAGE_SIZE)
                        __sk_mem_reclaim(sk, spare);
        }
}

/*
 * Drop the socket's memory reservation and then reclaim. Zeroing
 * sk_reserved_mem first makes sk_unused_reserved_mem() return 0, so
 * sk_mem_reclaim() no longer holds any forward allocation back for it.
 */
static inline void sk_mem_reclaim_final(struct sock *sk)
{
        sk->sk_reserved_mem = 0;
        sk_mem_reclaim(sk);
}

/*
 * Charge @size bytes to the socket by subtracting them from
 * sk_forward_alloc. No-op for protocols without memory accounting.
 */
static inline void sk_mem_charge(struct sock *sk, int size)
{
        if (sk_has_account(sk))
                sk_forward_alloc_add(sk, -size);
}

/*
 * Return @size bytes to the socket's sk_forward_alloc and then give
 * sk_mem_reclaim() a chance to release surplus. No-op for protocols
 * without memory accounting.
 */
static inline void sk_mem_uncharge(struct sock *sk, int size)
{
        if (sk_has_account(sk)) {
                sk_forward_alloc_add(sk, size);
                sk_mem_reclaim(sk);
        }
}

/*
 * Module ownership tracking for sockets. It is only compiled in when
 * both CONFIG_PROVE_LOCKING and CONFIG_MODULES are enabled; otherwise
 * all three helpers are empty stubs.
 */
#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
/* Take a module reference on @owner and record it in sk->sk_owner. */
static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
        __module_get(owner);
        sk->sk_owner = owner;
}

/* Forget the recorded owner without dropping any module reference. */
static inline void sk_owner_clear(struct sock *sk)
{
        sk->sk_owner = NULL;
}

/* Drop the module reference on sk->sk_owner; the field is left as is. */
static inline void sk_owner_put(struct sock *sk)
{
        module_put(sk->sk_owner);
}
#else
static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
}

static inline void sk_owner_clear(struct sock *sk)
{
}

static inline void sk_owner_put(struct sock *sk)
{
}
#endif
/*
 * Macro so as to not evaluate some arguments when
 * lockdep is not enabled.
 *
 * Mark both the sk_lock and the sk_lock.slock as a
 * per-address-family lock class.
 *
 * Besides the lockdep setup, this records THIS_MODULE as the socket
 * owner, marks the lock as not owned, and initialises its wait queue and
 * spinlock.
 *
 * @sk is expanded several times, so it must be free of side effects.
 * Every expansion is parenthesized so that an expression argument binds
 * as a single operand.
 */
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)        \
do {                                                                        \
        sk_owner_set((sk), THIS_MODULE);                                \
        (sk)->sk_lock.owned = 0;                                        \
        init_waitqueue_head(&(sk)->sk_lock.wq);                                \
        spin_lock_init(&(sk)->sk_lock.slock);                                \
        debug_check_no_locks_freed((void *)&(sk)->sk_lock,                \
                                   sizeof((sk)->sk_lock));                \
        lockdep_set_class_and_name(&(sk)->sk_lock.slock,                \
                                   (skey), (sname));                        \
        lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);        \
} while (0)

static inline bool lockdep_sock_is_held(const struct sock *sk)
{
        return lockdep_is_held(&sk->sk_lock) ||
               lockdep_is_held(&sk->sk_lock.slock);
}

void lock_sock_nested(struct sock *sk, int subclass);

/* Lock the socket: lock_sock_nested() with lockdep subclass 0. */
static inline void lock_sock(struct sock *sk)
{
        lock_sock_nested(sk, 0);
}

void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);

/*
 * BH context may only use the following locking interface.
 *
 * All three macros operate on the spinlock sk_lock.slock only.
 * bh_lock_sock_nested() is the same acquire annotated with
 * SINGLE_DEPTH_NESTING for lockdep.
 */
#define bh_lock_sock(__sk)        spin_lock(&((__sk)->sk_lock.slock))
#define bh_lock_sock_nested(__sk) \
                                spin_lock_nested(&((__sk)->sk_lock.slock), \
                                SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk)        spin_unlock(&((__sk)->sk_lock.slock))

bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return: false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 *
 * The returned value must be handed to unlock_sock_fast() so that the
 * matching release path is used.
 */
static inline bool lock_sock_fast(struct sock *sk)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);

        return __lock_sock_fast(sk);
}

/*
 * Fast socket lock variant for a caller already holding a [different]
 * socket lock. Identical to lock_sock_fast() except that the lockdep
 * acquire is annotated with SINGLE_DEPTH_NESTING.
 */
static inline bool lock_sock_fast_nested(struct sock *sk)
{
        mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);

        return __lock_sock_fast(sk);
}

/**
 * unlock_sock_fast - complement of lock_sock_fast
 * @sk: socket
 * @slow: value returned by the matching lock_sock_fast() call
 *
 * Fast unlock of a socket from user context.
 * When @slow is false the lockdep mutex annotation is released and the
 * spinlock is dropped with BH re-enabled. When @slow is true the regular
 * release_sock() is used instead.
 */
static inline void unlock_sock_fast(struct sock *sk, bool slow)
        __releases(&sk->sk_lock.slock)
{
        if (!slow) {
                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
                spin_unlock_bh(&sk->sk_lock.slock);
                return;
        }

        release_sock(sk);
        __release(&sk->sk_lock.slock);
}

void sockopt_lock_sock(struct sock *sk);
void sockopt_release_sock(struct sock *sk);
bool sockopt_ns_capable(struct user_namespace *ns, int cap);
bool sockopt_capable(int cap);

/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
 * packets, so that we won't get any new data or any
 * packets that change the state of the socket.
 *
 * While locked, BH processing will add new packets to
 * the backlog queue.  This queue is processed by the
 * owner of the socket lock right before it is released.
 *
 * Since ~2.3.5 it is also exclusive sleep lock serializing
 * accesses from user process context.
 */

/*
 * Debug assertion that the socket lock is held. With CONFIG_LOCKDEP it
 * warns once when lockdep_sock_is_held() is false while debug_locks is
 * set; without CONFIG_LOCKDEP it compiles to nothing.
 */
static inline void sock_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/*
 * Debug assertion that the socket lock is NOT held. With CONFIG_LOCKDEP
 * it warns once when lockdep_sock_is_held() is true while debug_locks is
 * set; without CONFIG_LOCKDEP it compiles to nothing.
 */
static inline void sock_not_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/*
 * Return sk_lock.owned after running the sock_owned_by_me() lockdep
 * assertion. Use sock_owned_by_user_nocheck() when that assertion does
 * not apply.
 */
static inline bool sock_owned_by_user(const struct sock *sk)
{
        sock_owned_by_me(sk);
        return sk->sk_lock.owned;
}

/* Return sk_lock.owned without any lockdep assertion. */
static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
{
        return sk->sk_lock.owned;
}

/*
 * Give up ownership of the socket lock: clear sk_lock.owned and release
 * the lockdep mutex annotation. A debug-only warning fires if the socket
 * was not marked as owned on entry.
 */
static inline void sock_release_ownership(struct sock *sk)
{
        DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
        sk->sk_lock.owned = 0;

        /* The sk_lock has mutex_unlock() semantics: */
        mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}

/*
 * No reclassification while locks are held: returns true only when the
 * socket is neither marked as owned nor has its sk_lock.slock spinlock
 * currently locked.
 */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
        struct sock *sk = (struct sock *)csk;
        bool busy;

        busy = sock_owned_by_user_nocheck(sk) ||
               spin_is_locked(&sk->sk_lock.slock);

        return !busy;
}

struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern);
void sk_free(struct sock *sk);
void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);

struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority);
void __sock_wfree(struct sk_buff *skb);
void sock_wfree(struct sk_buff *skb);
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority);
void skb_orphan_partial(struct sk_buff *skb);
void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);

/*
 * Orphan @skb and then try to attach it to @sk with sock_edemux as the
 * destructor. The attach only happens if a reference on @sk can be taken
 * (its refcount is not already zero); otherwise @skb is left orphaned.
 */
static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);

        if (!refcount_inc_not_zero(&sk->sk_refcnt))
                return;

        skb->sk = sk;
        skb->destructor = sock_edemux;
}
#else
/* Without CONFIG_INET, sock_edemux is just another name for sock_efree. */
#define sock_edemux sock_efree
#endif

/*
 * Socket option entry points.  The sk_*() variants take a struct sock,
 * the sock_*()/do_sock_*() variants take a struct socket.
 * Declarations only; the definitions are not part of this header section.
 */
int sk_setsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, unsigned int optlen);
int sock_setsockopt(struct socket *sock, int level, int op,
                    sockptr_t optval, unsigned int optlen);
int do_sock_setsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, int optlen);
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, sockptr_t optlen);

int sk_getsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, sockptr_t optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32);
/* Full-featured send skb allocator; sock_alloc_send_skb() wraps it. */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order);

/*
 * Shorthand for sock_alloc_send_pskb() where @size is passed as the
 * header length and both data_len and max_page_order are zero.
 */
static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
                                                  unsigned long size,
                                                  int noblock, int *errcode)
{
        const unsigned long data_len = 0;
        const int max_page_order = 0;

        return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
                                    max_page_order);
}

/*
 * Memory helpers that take the owning socket as their first argument.
 * NOTE(review): presumably the allocations are accounted against @sk and
 * the free variants must be given the same @size - confirm in the
 * definitions, which are not visible here.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void *sock_kmemdup(struct sock *sk, const void *src,
                   int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);

/*
 * Switch the socket to a different struct proto.
 *
 * If the sock is attached to a struct socket, SOCK_SUPPORT_ZC is cleared
 * from that socket's flags before the new proto is published.
 */
static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
{
        if (sk->sk_socket)
                clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
        /* WRITE_ONCE(): sk_prot may be read without the socket lock (presumably) */
        WRITE_ONCE(sk->sk_prot, proto);
}

/*
 * Per-send control message state.  sockcm_init() seeds mark, tsflags and
 * priority from the socket and zeroes everything else.
 */
struct sockcm_cookie {
        u64 transmit_time;
        u32 mark;               /* seeded from sk->sk_mark */
        u32 tsflags;            /* seeded from sk->sk_tsflags */
        u32 ts_opt_id;          /* used as tskey when SOCKCM_FLAG_TS_OPT_ID is set in tsflags */
        u32 priority;           /* seeded from sk->sk_priority */
        u32 dmabuf_id;
};

static inline void sockcm_init(struct sockcm_cookie *sockc,
                               const struct sock *sk)
{
        *sockc = (struct sockcm_cookie) {
                .mark = READ_ONCE(sk->sk_mark),
                .tsflags = READ_ONCE(sk->sk_tsflags),
                .priority = READ_ONCE(sk->sk_priority),
        };
}

/*
 * Control-message parsers filling a struct sockcm_cookie: one takes a
 * single cmsghdr, the other a whole msghdr.  Declarations only.
 */
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc);
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * does not implement a particular function.
 *
 * Each sock_no_<op>() below is named after the operation it stands in for.
 */
int sock_no_bind(struct socket *, struct sockaddr *, int);
int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * uses the inet style.
 *
 * Note that sock_common_getsockopt() takes __user pointers while
 * sock_common_setsockopt() takes a sockptr_t.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen);
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen);

void sk_common_release(struct sock *sk);

/*
 *        Default socket callbacks and setup code
 */

/* Initialise core socket variables using an explicit uid. */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);

/* Initialise core socket variables.
 * Assumes struct socket *sock is embedded in a struct socket_alloc.
 * Unlike sock_init_data_uid(), no uid is passed in by the caller.
 */
void sock_init_data(struct socket *sock, struct sock *sk);

/*
 * Socket reference counting postulates.
 *
 * * Each user of socket SHOULD hold a reference count.
 * * Each access point to socket (a hash table bucket, reference from a list,
 *   running timer, skb in flight) MUST hold a reference count.
 * * When reference count hits 0, it means it will never increase back.
 * * When reference count hits 0, it means that no references from
 *   outside exist to this socket and current process on current CPU
 *   is last user and may/should destroy this socket.
 * * sk_free is called from any context: process, BH, IRQ. When
 *   it is called, socket has no references from outside -> sk_free
 *   may release descendant resources allocated by the socket, but
 *   to the time when it is called, socket is NOT referenced by any
 *   hash tables, lists etc.
 * * Packets, delivered from outside (from network or from another process)
 *   and enqueued on receive/error queues SHOULD NOT grab reference count,
 *   when they sit in queue. Otherwise, packets will leak to hole, when
 *   socket is looked up by one cpu and unhashing is made by another CPU.
 *   It is true for udp/raw, netlink (leak to receive and error queues), tcp
 *   (leak to backlog). Packet socket does all the processing inside
 *   BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets
 *   use separate SMP lock, so that they are prone too.
 */

/*
 * Drop one reference on @sk.  Whoever drops the final reference (sk_refcnt
 * reaches zero) frees the socket through sk_free().
 */
static inline void sock_put(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_refcnt))
                return;

        sk_free(sk);
}
/* Generic version of sock_put(), dealing with all sockets
 * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...).
 * Declaration only; defined outside this header section.
 */
void sock_gen_put(struct sock *sk);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested,
                     unsigned int trim_cap, bool refcounted);

/*
 * Common-case front end for __sk_receive_skb(): trim_cap is fixed to 1 and
 * the socket is treated as refcounted.
 */
static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                                 const int nested)
{
        const unsigned int trim_cap = 1;
        const bool refcounted = true;

        return __sk_receive_skb(sk, skb, nested, trim_cap, refcounted);
}

/*
 * Record @tx_queue as the socket's TX queue mapping.
 *
 * sk_tx_queue_mapping accepts only up to a 16-bit value, and USHRT_MAX
 * itself is the "no mapping" marker used by sk_tx_queue_clear() and
 * sk_tx_queue_get().  Anything outside [0, USHRT_MAX) is therefore
 * rejected with a one-time warning and the mapping is left untouched.
 *
 * The range check is done on the int value itself: checking after a cast
 * to unsigned short would let values >= 65536, or negative values other
 * than -1, wrap into the valid range and be stored silently.
 */
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
{
        if (WARN_ON_ONCE(tx_queue < 0 || tx_queue >= USHRT_MAX))
                return;
        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}

/*
 * Sentinel meaning "no queue recorded".  It is what the *_queue_clear()
 * helpers store and what the *_queue_get() helpers translate to -1.
 */
#define NO_QUEUE_MAPPING        USHRT_MAX

/* Forget the socket's TX queue mapping by storing NO_QUEUE_MAPPING. */
static inline void sk_tx_queue_clear(struct sock *sk)
{
        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}

/*
 * Return the TX queue recorded for @sk, or -1 when @sk is NULL or no
 * queue is recorded (NO_QUEUE_MAPPING).
 */
static inline int sk_tx_queue_get(const struct sock *sk)
{
        int queue;

        if (!sk)
                return -1;

        /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
         * and sk_tx_queue_set().
         */
        queue = READ_ONCE(sk->sk_tx_queue_mapping);

        return queue != NO_QUEUE_MAPPING ? queue : -1;
}

/*
 * Copy the RX queue recorded in @skb into sk->sk_rx_queue_mapping.
 *
 * Nothing happens when CONFIG_SOCK_RX_QUEUE_MAPPING is off or when the skb
 * carries no recorded RX queue.  With @force_set the store is always done;
 * without it the store is skipped when the socket already holds that value.
 */
static inline void __sk_rx_queue_set(struct sock *sk,
                                     const struct sk_buff *skb,
                                     bool force_set)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        u16 queue;

        if (!skb_rx_queue_recorded(skb))
                return;

        queue = skb_get_rx_queue(skb);

        if (!force_set && likely(READ_ONCE(sk->sk_rx_queue_mapping) == queue))
                return;

        WRITE_ONCE(sk->sk_rx_queue_mapping, queue);
#endif
}

/* Record the skb's RX queue on the socket, storing unconditionally (force_set). */
static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
{
        __sk_rx_queue_set(sk, skb, true);
}

/* Record the skb's RX queue on the socket, storing only if the value differs. */
static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb)
{
        __sk_rx_queue_set(sk, skb, false);
}

/*
 * Forget the socket's RX queue mapping by storing NO_QUEUE_MAPPING.
 * Compiles to nothing without CONFIG_SOCK_RX_QUEUE_MAPPING.
 */
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif
}

/*
 * Return the RX queue recorded for @sk.  The result is -1 when @sk is
 * NULL, when no queue is recorded (NO_QUEUE_MAPPING), or when
 * CONFIG_SOCK_RX_QUEUE_MAPPING is disabled.
 */
static inline int sk_rx_queue_get(const struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        if (sk) {
                int queue = READ_ONCE(sk->sk_rx_queue_mapping);

                if (queue == NO_QUEUE_MAPPING)
                        return -1;

                return queue;
        }
#endif

        return -1;
}

/*
 * Point @sk at @sock (which may be NULL).
 *
 * When attaching, sk_uid and sk_ino are refreshed from the socket's inode.
 * When detaching, sk_ino is zeroed and sk_uid keeps its previous value.
 */
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
        sk->sk_socket = sock;

        if (!sock) {
                /* Note: sk_uid is unchanged. */
                WRITE_ONCE(sk->sk_ino, 0);
                return;
        }

        WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
        WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
}

/*
 * Return the address of the wait queue head of the socket_wq that
 * sk->sk_wq points to.
 *
 * The BUILD_BUG_ON() pins 'wait' to offset 0 of struct socket_wq.
 * NOTE(review): presumably so that a NULL sk_wq yields a NULL result
 * instead of a small bogus pointer - confirm against callers.
 */
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
        BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
        return &rcu_dereference_raw(sk->sk_wq)->wait;
}
/* Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
 * Note that parent inode held reference count on this struct sock,
 * we do not release it in this function, because protocol
 * probably wants some additional cleanups or even continuing
 * to work with this socket (TCP).
 *
 * All three updates happen under sk_callback_lock, taken for writing
 * with bottom halves disabled.
 */
static inline void sock_orphan(struct sock *sk)
{
        write_lock_bh(&sk->sk_callback_lock);
        sock_set_flag(sk, SOCK_DEAD);
        /* Clears sk_socket and sk_ino; sk_uid is left as it was. */
        sk_set_socket(sk, NULL);
        sk->sk_wq  = NULL;
        write_unlock_bh(&sk->sk_callback_lock);
}

/*
 * Attach @sk to the struct socket @parent: publish parent's wait queue in
 * sk->sk_wq, link the two objects in both directions and notify the
 * security layer.  Warns if @parent already has a sock attached.
 *
 * The linking is done under sk_callback_lock, taken for writing with
 * bottom halves disabled.
 */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
        WARN_ON(parent->sk);
        write_lock_bh(&sk->sk_callback_lock);
        rcu_assign_pointer(sk->sk_wq, &parent->wq);
        parent->sk = sk;
        /* Also refreshes sk_uid and sk_ino from parent's inode. */
        sk_set_socket(sk, parent);
        security_sock_graft(sk, parent);
        write_unlock_bh(&sk->sk_callback_lock);
}

/*
 * Return the inode number cached in sk->sk_ino.  sk_set_socket() stores 0
 * there when the sock has no struct socket attached.
 */
static inline unsigned long sock_i_ino(const struct sock *sk)
{
        /* Paired with WRITE_ONCE() in sk_set_socket(), which is what
         * sock_graft() and sock_orphan() call to update sk_ino.
         */
        return READ_ONCE(sk->sk_ino);
}

/* Return the owner uid cached in sk->sk_uid. */
static inline kuid_t sk_uid(const struct sock *sk)
{
        /* Paired with WRITE_ONCE() in sockfs_setattr() and sk_set_socket() */
        return READ_ONCE(sk->sk_uid);
}

/*
 * Return the uid to use for @sk: the socket's own uid when @sk is given,
 * otherwise uid 0 mapped through @net's user namespace.
 */
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
        if (!sk)
                return make_kuid(net->user_ns, 0);

        return sk_uid(sk);
}

/* Return a random 32-bit hash that is never zero (0 is replaced by 1). */
static inline u32 net_tx_rndhash(void)
{
        u32 hash = get_random_u32();

        if (!hash)
                hash = 1;

        return hash;
}

/* Give the socket a fresh, non-zero random sk_txhash. */
static inline void sk_set_txhash(struct sock *sk)
{
        /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
        WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}

/*
 * Pick a new sk_txhash, but only for sockets that already have one and
 * have sk_txrehash set to SOCK_TXREHASH_ENABLED.
 *
 * Return: true if a new hash was installed, false otherwise.
 */
static inline bool sk_rethink_txhash(struct sock *sk)
{
        if (!sk->sk_txhash || sk->sk_txrehash != SOCK_TXREHASH_ENABLED)
                return false;

        sk_set_txhash(sk);
        return true;
}

/*
 * Return the cached dst of @sk without taking a reference.
 * The lockdep condition passed to rcu_dereference_check() is that the
 * socket lock is held.
 */
static inline struct dst_entry *
__sk_dst_get(const struct sock *sk)
{
        return rcu_dereference_check(sk->sk_dst_cache,
                                     lockdep_sock_is_held(sk));
}

/*
 * Return the cached dst of @sk with a reference taken, or NULL.
 *
 * The lookup runs inside its own RCU read-side section.  NULL is returned
 * both when no dst is cached and when rcuref_get() fails to take a
 * reference on the cached one.
 */
static inline struct dst_entry *
sk_dst_get(const struct sock *sk)
{
        struct dst_entry *dst;

        rcu_read_lock();
        dst = rcu_dereference(sk->sk_dst_cache);
        if (dst && !rcuref_get(&dst->__rcuref))
                dst = NULL;
        rcu_read_unlock();
        return dst;
}

/*
 * Invoke the negative_advice() operation of the socket's cached dst, if
 * there is a cached dst and its ops provide that callback.
 */
static inline void __dst_negative_advice(struct sock *sk)
{
        struct dst_entry *cached = __sk_dst_get(sk);

        if (!cached)
                return;

        if (!cached->ops->negative_advice)
                return;

        cached->ops->negative_advice(sk, cached);
}

/*
 * Report trouble with the current route: first give the socket a chance
 * to pick a new txhash, then pass negative advice to the cached dst.
 */
static inline void dst_negative_advice(struct sock *sk)
{
        sk_rethink_txhash(sk);
        __dst_negative_advice(sk);
}

/*
 * Replace the socket's cached dst with @dst (may be NULL) and release the
 * previous one.  Also clears the TX queue mapping and the pending-confirm
 * flag.
 *
 * The lockdep condition passed to rcu_dereference_protected() is that the
 * socket lock is held; sk_dst_set() is the xchg()-based variant.
 */
static inline void
__sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = rcu_dereference_protected(sk->sk_dst_cache,
                                            lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_dst_cache, dst);
        dst_release(old_dst);
}

/*
 * Replace the socket's cached dst with @dst (may be NULL) and release the
 * previous one.  Also clears the TX queue mapping and the pending-confirm
 * flag.
 *
 * Unlike __sk_dst_set(), the pointer is swapped with xchg() and no
 * socket-lock lockdep condition is asserted.
 */
static inline void
sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst)));
        dst_release(old_dst);
}

/* Drop the socket's cached dst via __sk_dst_set(sk, NULL). */
static inline void
__sk_dst_reset(struct sock *sk)
{
        __sk_dst_set(sk, NULL);
}

/* Drop the socket's cached dst via sk_dst_set(sk, NULL). */
static inline void
sk_dst_reset(struct sock *sk)
{
        sk_dst_set(sk, NULL);
}

/*
 * dst validation helpers taking a cookie.  Declarations only.
 * NOTE(review): presumably the __ variant expects the socket lock to be
 * held, mirroring __sk_dst_get()/__sk_dst_set() - confirm in the definitions.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);

/*
 * Flag that the socket's dst has a confirmation pending.  The flag is
 * only written when it is not already set.
 */
static inline void sk_dst_confirm(struct sock *sk)
{
        if (READ_ONCE(sk->sk_dst_pending_confirm))
                return;

        WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
}

/*
 * If @skb carries a pending dst confirmation, confirm neighbour @n and
 * clear the pending flag on the skb's owning socket (when there is one
 * and the flag is set).
 */
static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
        struct sock *owner;

        if (!skb_get_dst_pending_confirm(skb))
                return;

        owner = skb->sk;
        if (owner && READ_ONCE(owner->sk_dst_pending_confirm))
                WRITE_ONCE(owner->sk_dst_pending_confirm, 0);

        neigh_confirm(n);
}

bool sk_mc_loop(const struct sock *sk);

/* Ask net_gso_ok() whether sk_route_caps cover the socket's sk_gso_type. */
static inline bool sk_can_gso(const struct sock *sk)
{
        return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
}

void sk_setup_caps(struct sock *sk, struct dst_entry *dst);

/*
 * Turn GSO off for this socket: set sk_gso_disabled and strip every
 * NETIF_F_GSO_MASK bit from sk_route_caps.
 */
static inline void sk_gso_disable(struct sock *sk)
{
        sk->sk_gso_disabled = 1;
        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
}

/*
 * Copy @copy bytes from the iterator @from to @to, choosing the copy
 * routine from the skb and socket state:
 *  - ip_summed == CHECKSUM_NONE: copy and checksum in one pass, folding
 *    the result into skb->csum at @offset;
 *  - NETIF_F_NOCACHE_COPY set in sk_route_caps: nocache copy;
 *  - otherwise: plain copy.
 *
 * Return: 0 on success, -EFAULT if the full copy could not be done.
 */
static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
                                           struct iov_iter *from, char *to,
                                           int copy, int offset)
{
        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum partial = 0;

                if (!csum_and_copy_from_iter_full(to, copy, &partial, from))
                        return -EFAULT;

                skb->csum = csum_block_add(skb->csum, partial, offset);
                return 0;
        }

        if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
                if (!copy_from_iter_full_nocache(to, copy, from))
                        return -EFAULT;

                return 0;
        }

        if (!copy_from_iter_full(to, copy, from))
                return -EFAULT;

        return 0;
}

/*
 * Append @copy bytes from @from to the skb's linear data area.  On a copy
 * failure the skb is trimmed back to the length it had on entry.
 *
 * Return: 0 on success, or the error from skb_do_copy_data_nocache().
 */
static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
                                       struct iov_iter *from, int copy)
{
        /* Length before skb_put() grows the skb; doubles as csum offset. */
        int old_len = skb->len;
        char *dst = skb_put(skb, copy);
        int rc;

        rc = skb_do_copy_data_nocache(sk, skb, from, dst, copy, old_len);
        if (rc)
                __skb_trim(skb, old_len);

        return rc;
}

/*
 * Copy @copy bytes from @from into @page at offset @off on behalf of @skb.
 * On success the skb length grows by @copy and the same amount is added to
 * the socket's queued write memory and charged to the socket.
 *
 * Return: 0 on success, or the error from skb_do_copy_data_nocache(), in
 * which case no accounting is touched.
 */
static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from,
                                           struct sk_buff *skb,
                                           struct page *page,
                                           int off, int copy)
{
        char *dst = page_address(page) + off;
        int rc = skb_do_copy_data_nocache(sk, skb, from, dst, copy, skb->len);

        if (!rc) {
                skb_len_add(skb, copy);
                sk_wmem_queued_add(sk, copy);
                sk_mem_charge(sk, copy);
        }

        return rc;
}

/**
 * sk_wmem_alloc_get - returns write allocations
 * @sk: socket
 *
 * sk_wmem_alloc is a refcount; the initial offset of one is subtracted
 * here.
 *
 * Return: sk_wmem_alloc minus initial offset of one
 */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
        return refcount_read(&sk->sk_wmem_alloc) - 1;
}

/**
 * sk_rmem_alloc_get - returns read allocations
 * @sk: socket
 *
 * Unlike sk_wmem_alloc_get(), no offset is subtracted from the counter.
 *
 * Return: sk_rmem_alloc
 */
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
        return atomic_read(&sk->sk_rmem_alloc);
}

/**
 * sk_has_allocations - check if allocations are outstanding
 * @sk: socket
 *
 * Return: true if socket has write or read allocations
 */
static inline bool sk_has_allocations(const struct sock *sk)
{
        return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
}

/**
 * skwq_has_sleeper - check if there are any waiting processes
 * @wq: struct socket_wq, may be NULL
 *
 * Return: true if socket_wq has waiting processes; false for a NULL @wq
 *
 * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
 * barrier call. They were added due to the race found within the tcp code.
 *
 * Consider the following tcp code paths::
 *
 *   CPU1                CPU2
 *   sys_select          receive packet
 *   ...                 ...
 *   __add_wait_queue    update tp->rcv_nxt
 *   ...                 ...
 *   tp->rcv_nxt check   sock_def_readable
 *   ...                 {
 *   schedule               rcu_read_lock();
 *                          wq = rcu_dereference(sk->sk_wq);
 *                          if (wq && waitqueue_active(&wq->wait))
 *                              wake_up_interruptible(&wq->wait)
 *                          ...
 *                       }
 *
 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
 * in its cache, and so does the tp->rcv_nxt update on CPU2 side.  The CPU1
 * could then end up calling schedule and sleep forever if there is no more
 * data on the socket.
 *
 */
static inline bool skwq_has_sleeper(struct socket_wq *wq)
{
        return wq && wq_has_sleeper(&wq->wait);
}

/**
 * sock_poll_wait - wrapper for the poll_wait call.
 * @filp:           file
 * @sock:           socket to wait on
 * @p:              poll_table
 *
 * Registers on the wait queue embedded in @sock (sock->wq.wait).
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
                                  poll_table *p)
{
        /* Provides a barrier we need to be sure we are in sync
         * with the socket flags modification.
         *
         * This memory barrier is paired with the one in wq_has_sleeper().
         */
        poll_wait(filp, &sock->wq.wait, p);
}

/*
 * Stamp @skb with the socket's txhash and mark it as an L4 hash.
 * A socket whose txhash is zero leaves the skb untouched.
 */
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
        /* This pairs with WRITE_ONCE() in sk_set_txhash() */
        u32 hash = READ_ONCE(sk->sk_txhash);

        if (!hash)
                return;

        skb->l4_hash = 1;
        skb->hash = hash;
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);

/*
 *        Queue a received datagram if it will fit. Stream and sequenced
 *        protocols can't normally use this as they need to fit buffers in
 *        and play with them.
 *
 *        Inlined as it's very short and called for pretty much every
 *        packet ever received.
 *
 *        NOTE(review): the paragraph above talks about queueing, which the
 *        function below does not do; it looks like a leftover describing a
 *        different helper - confirm before relying on it.
 *
 *        skb_set_owner_r(): make @sk the receive-side owner of @skb.
 *        The skb is orphaned, sock_rfree() becomes its destructor, and
 *        skb->truesize is added to sk_rmem_alloc and charged to the socket.
 *        No reference on @sk is taken here.
 */
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_rfree;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, skb->truesize);
}

/*
 * Try to make @sk the owner of @skb while holding a socket reference.
 *
 * Succeeds only if @sk is non-NULL and a reference could still be taken
 * (sk_refcnt not already zero).  On success the skb is orphaned and then
 * bound to @sk with sock_efree() as destructor.  On failure the skb is
 * not modified.
 *
 * Return: true if ownership was set, false otherwise.
 */
static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
{
        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
                return false;

        skb_orphan(skb);
        skb->destructor = sock_efree;
        skb->sk = sk;

        return true;
}

/*
 * Clone @skb and account the clone to @sk's receive memory.
 *
 * Return: the clone, owned by @sk via skb_set_owner_r(), or NULL if either
 * the clone or the receive-memory scheduling failed (the clone is freed in
 * the latter case).  The original skb is left to the caller.
 */
static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk)
{
        skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
        if (!skb)
                return NULL;

        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
                __kfree_skb(skb);
                return NULL;
        }

        skb_set_owner_r(skb, sk);
        return skb;
}

/*
 * Prepare @skb for GRO: an skb whose destructor is sock_wfree keeps its
 * owner and is flagged slow_gro; any other skb is orphaned.
 */
static inline void skb_prepare_for_gro(struct sk_buff *skb)
{
        if (skb->destructor == sock_wfree)
                skb->slow_gro = 1;
        else
                skb_orphan(skb);
}

/* Socket timer helpers.  Declarations only. */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires);

void sk_stop_timer(struct sock *sk, struct timer_list *timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);

/* Receive-queue helpers.  Declarations only. */
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
                        struct sk_buff *skb, unsigned int flags,
                        void (*destructor)(struct sock *sk,
                                           struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

/* @reason is optional: sock_queue_rcv_skb() passes NULL. */
int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
                              enum skb_drop_reason *reason);

/* sock_queue_rcv_skb_reason() for callers not interested in a drop reason. */
static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        return sock_queue_rcv_skb_reason(sk, skb, NULL);
}

/* Error-queue enqueue/dequeue helpers.  Declarations only. */
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);

/*
 *        Recover an error report and clear atomically
 *
 *        Returns 0 when no error is pending, otherwise the negated value
 *        that was in sk_err; sk_err is reset to 0 by the same xchg().
 */

static inline int sock_error(struct sock *sk)
{
        int err;

        /* Avoid an atomic operation for the common case.
         * This is racy since another cpu/thread can change sk_err under us.
         */
        if (likely(data_race(!sk->sk_err)))
                return 0;

        /* May still read 0 if someone else consumed the error first. */
        err = xchg(&sk->sk_err, 0);
        return -err;
}

void sk_error_report(struct sock *sk);

/*
 * Return the free send-buffer space: sk_sndbuf minus sk_wmem_alloc,
 * clamped at zero.  A socket with SEND_SHUTDOWN set reports no space.
 */
static inline unsigned long sock_wspace(struct sock *sk)
{
        int space;

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                return 0;

        space = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc);

        return space < 0 ? 0 : space;
}

/* Note:
 *  We use sk->sk_wq_raw, from contexts knowing this
 *  pointer is not NULL and cannot disappear/change.
 *
 *  sk_set_bit(): set bit @nr in the socket wait queue flags.  The two
 *  async bits (SOCKWQ_ASYNC_NOSPACE, SOCKWQ_ASYNC_WAITDATA) are only
 *  touched when SOCK_FASYNC is set on the socket.
 */
static inline void sk_set_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        set_bit(nr, &sk->sk_wq_raw->flags);
}

/*
 * Clear bit @nr in the socket wait queue flags (sk->sk_wq_raw->flags).
 * The two async bits (SOCKWQ_ASYNC_NOSPACE, SOCKWQ_ASYNC_WAITDATA) are
 * only touched when SOCK_FASYNC is set on the socket.
 */
static inline void sk_clear_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        clear_bit(nr, &sk->sk_wq_raw->flags);
}

/*
 * Send an async wakeup for @sk if SOCK_FASYNC is set.  The socket's wait
 * queue is looked up inside an RCU read-side section taken here; see
 * sk_wake_async_rcu() for the variant that does not take it.
 */
static inline void sk_wake_async(const struct sock *sk, int how, int band)
{
        if (!sock_flag(sk, SOCK_FASYNC))
                return;

        rcu_read_lock();
        sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
        rcu_read_unlock();
}

/*
 * Same as sk_wake_async(), but does not take rcu_read_lock() itself.
 * NOTE(review): presumably callers must already be inside an RCU
 * read-side section, since sk_wq is rcu_dereference()d - confirm.
 */
static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
{
        if (unlikely(sock_flag(sk, SOCK_FASYNC)))
                sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
}

/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
 * need sizeof(sk_buff) + MTU + padding, unless the net driver performs
 * copybreak.
 * Note: for send buffers, TCP works better if we can build two skbs at
 * minimum, which is why SOCK_MIN_SNDBUF is twice TCP_SKB_MIN_TRUESIZE.
 */
#define TCP_SKB_MIN_TRUESIZE        (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff)))

#define SOCK_MIN_SNDBUF                (TCP_SKB_MIN_TRUESIZE * 2)
#define SOCK_MIN_RCVBUF                 TCP_SKB_MIN_TRUESIZE

/*
 * Moderate sk_sndbuf, unless SOCK_SNDBUF_LOCK is set in sk_userlocks.
 *
 * The new value is the smaller of the current sk_sndbuf and half of
 * sk_wmem_queued, raised if needed to sk_unused_reserved_mem(sk) and then
 * to SOCK_MIN_SNDBUF.
 */
static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
        u32 val;

        if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
                return;

        val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
        val = max_t(u32, val, sk_unused_reserved_mem(sk));

        WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
}

/**
 * sk_page_frag - return an appropriate page_frag
 * @sk: socket
 *
 * The per task page_frag of %current is preferred over the per socket
 * one as an optimization, for contexts known to be process context that
 * own everything associated with %current.
 *
 * Direct reclaim and page faults can nest inside other socket operations
 * and recurse into sk_page_frag() while the task page_frag is already in
 * use; users avoid that by disabling sk_use_task_frag, which makes this
 * helper fall back to the socket's own page_frag.
 *
 * Return: a per task page_frag if sk_use_task_frag is set,
 * otherwise a per socket one.
 */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
        return sk->sk_use_task_frag ? &current->task_frag : &sk->sk_frag;
}

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);

/*
 *        Default write policy as shown to user space via poll/select/SIGIO
 *
 *        The socket counts as writeable while sk_wmem_alloc is below half
 *        of sk_sndbuf.
 */
static inline bool sock_writeable(const struct sock *sk)
{
        return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
}

/* Pick GFP_ATOMIC when called in softirq context, GFP_KERNEL otherwise. */
static inline gfp_t gfp_any(void)
{
        if (in_softirq())
                return GFP_ATOMIC;

        return GFP_KERNEL;
}

/*
 * GFP flags for memcg charging: GFP_ATOMIC in softirq context,
 * GFP_KERNEL otherwise.
 */
static inline gfp_t gfp_memcg_charge(void)
{
        if (in_softirq())
                return GFP_ATOMIC;

        return GFP_KERNEL;
}

#ifdef CONFIG_MEMCG
/* Return the memory cgroup recorded in sk->sk_memcg (may be NULL). */
static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
{
        return sk->sk_memcg;
}

/*
 * True when memcg socket accounting is enabled globally
 * (mem_cgroup_sockets_enabled) and this socket has a memcg attached.
 */
static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
{
        return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk);
}

/*
 * Tell whether the socket's memory cgroup is under memory pressure.
 *
 * With CONFIG_MEMCG_V1 and the memory controller not on the default
 * hierarchy, this is the memcg's tcpmem_pressure flag.  Otherwise the
 * memcg and each of its ancestors are checked in turn, and pressure is
 * reported as soon as one of them has a socket-pressure timestamp that is
 * still in the future.
 *
 * The memcg is dereferenced without a NULL check.
 * NOTE(review): presumably callers check mem_cgroup_sk_enabled() first -
 * confirm at the call sites.
 */
static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
{
        struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);

#ifdef CONFIG_MEMCG_V1
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return !!memcg->tcpmem_pressure;
#endif /* CONFIG_MEMCG_V1 */

        do {
                if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg)))
                        return true;
        } while ((memcg = parent_mem_cgroup(memcg)));

        return false;
}
#else
/* !CONFIG_MEMCG stub: a socket never has a memory cgroup. */
static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: memcg socket accounting is never enabled. */
static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
{
        return false;
}

/* !CONFIG_MEMCG stub: never reports memory pressure. */
static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
{
        return false;
}
#endif

/* Receive timeout to use: 0 for non-blocking calls, else sk_rcvtimeo. */
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return READ_ONCE(sk->sk_rcvtimeo);
}

/* Send timeout to use: 0 for non-blocking calls, else sk_sndtimeo. */
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return READ_ONCE(sk->sk_sndtimeo);
}

/*
 * Receive low-water mark for a read of @len bytes: @len itself when
 * @waitall is set, otherwise the smaller of sk_rcvlowat and @len.
 * A result of zero is bumped to 1.
 */
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
        int target;

        if (waitall)
                target = len;
        else
                target = min_t(int, READ_ONCE(sk->sk_rcvlowat), len);

        if (!target)
                target = 1;

        return target;
}

/* Alas, with timeout socket operations are not restartable.
 * Compare this to poll().
 *
 * Only an infinite timeout (MAX_SCHEDULE_TIMEOUT) yields -ERESTARTSYS;
 * any other timeout value yields -EINTR.
 */
static inline int sock_intr_errno(long timeo)
{
        if (timeo == MAX_SCHEDULE_TIMEOUT)
                return -ERESTARTSYS;

        return -EINTR;
}

/* Socket-layer private data kept in skb->cb[]. */
struct sock_skb_cb {
        u32 dropcount;  /* written by sock_skb_set_dropcount() */
};

/* Store sock_skb_cb at the end of skb->cb[] so protocol families
 * using skb->cb[] would keep using it directly and utilize its
 * alignment guarantee.
 */
#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \
                            sizeof(struct sock_skb_cb))

/* Pointer to the sock_skb_cb stored at the tail of @__skb's cb[] area. */
#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
                            SOCK_SKB_CB_OFFSET))

/* Build-time check that a cb[] user of @size bytes does not overlap sock_skb_cb. */
#define sock_skb_cb_check_size(size) \
        BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)

/*
 * Account @segs dropped segments for @sk, using the socket's NUMA drop
 * counters when it has them and the plain sk_drops atomic otherwise.
 */
static inline void sk_drops_add(struct sock *sk, int segs)
{
        struct numa_drop_counters *counters = sk->sk_drop_counters;

        if (!counters) {
                atomic_add(segs, &sk->sk_drops);
                return;
        }

        numa_drop_add(counters, segs);
}

/* Account a single drop for @sk. */
static inline void sk_drops_inc(struct sock *sk)
{
        sk_drops_add(sk, 1);
}

/*
 * Read the socket's drop count from whichever counter sk_drops_add()
 * feeds: the NUMA drop counters if present, else the sk_drops atomic.
 * With NUMA counters in use, sk_drops is expected to be zero (debug check).
 */
static inline int sk_drops_read(const struct sock *sk)
{
        const struct numa_drop_counters *counters = sk->sk_drop_counters;

        if (!counters)
                return atomic_read(&sk->sk_drops);

        DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops));
        return numa_drop_read(counters);
}

/*
 * Reset the socket's drop accounting: the NUMA drop counters when present,
 * and the sk_drops atomic in every case.
 */
static inline void sk_drops_reset(struct sock *sk)
{
        struct numa_drop_counters *ndc = sk->sk_drop_counters;

        if (ndc)
                numa_drop_reset(ndc);
        atomic_set(&sk->sk_drops, 0);
}

static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
        SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
                                                sk_drops_read(sk) : 0;
}

/*
 * Account a dropped skb: count its gso_segs, or 1 when gso_segs is zero.
 */
static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb)
{
        int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);

        sk_drops_add(sk, segs);
}

/*
 * Read sk->sk_stamp.
 *
 * On 32-bit builds the read is done under the sk_stamp_seq sequence
 * counter and retried if a writer interfered; elsewhere a single
 * READ_ONCE() is used.
 * NOTE(review): presumably because a ktime_t load is not a single access
 * on 32-bit - confirm.
 */
static inline ktime_t sock_read_timestamp(struct sock *sk)
{
#if BITS_PER_LONG==32
        unsigned int seq;
        ktime_t kt;

        do {
                seq = read_seqbegin(&sk->sk_stamp_seq);
                kt = sk->sk_stamp;
        } while (read_seqretry(&sk->sk_stamp_seq, seq));

        return kt;
#else
        return READ_ONCE(sk->sk_stamp);
#endif
}

/*
 * Store @kt in sk->sk_stamp.  Counterpart of sock_read_timestamp(): the
 * write is wrapped in the sk_stamp_seq seqlock on 32-bit builds and is a
 * single WRITE_ONCE() elsewhere.
 */
static inline void sock_write_timestamp(struct sock *sk, ktime_t kt)
{
#if BITS_PER_LONG==32
        write_seqlock(&sk->sk_stamp_seq);
        sk->sk_stamp = kt;
        write_sequnlock(&sk->sk_stamp_seq);
#else
        WRITE_ONCE(sk->sk_stamp, kt);
#endif
}

/*
 * Out-of-line halves of sock_recv_timestamp(), which decides inline
 * whether they need to be called.  Declarations only.
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
                           struct sk_buff *skb);
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb);

/* TX timestamp query helpers.  Declarations only. */
bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk);
int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
                         struct timespec64 *ts);

/*
 * Deliver receive timestamp information for @skb.
 *
 * If any of the timestamp conditions listed below holds, the work is
 * handed to __sock_recv_timestamp(); otherwise the skb's tstamp is only
 * remembered in the socket via sock_write_timestamp().  Independently of
 * that, wifi status is reported when SOCK_WIFI_STATUS is set and the skb
 * has a valid wifi ack state.
 */
static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
        struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
        u32 tsflags = READ_ONCE(sk->sk_tsflags);
        ktime_t kt = skb->tstamp;
        /*
         * generate control messages if
         * - receive time stamping in software requested
         * - software time stamp available and wanted
         * - hardware time stamps available and wanted
         */
        if (sock_flag(sk, SOCK_RCVTSTAMP) ||
            (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
            (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
            (hwtstamps->hwtstamp &&
             (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
                __sock_recv_timestamp(msg, sk, skb);
        else
                sock_write_timestamp(sk, kt);

        if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb))
                __sock_recv_wifi_status(msg, sk, skb);
}

void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
                       struct sk_buff *skb);

/* Value of sk_stamp that sock_recv_cmsgs() treats as "still the default". */
#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
/*
 * Generate receive-side control messages for @skb, cheaply skipping the
 * out-of-line call when none of the FLAGS_RECV_CMSGS socket flags is set.
 *
 * In that fast path, the skb's tstamp is stored in the socket if
 * SOCK_TIMESTAMP is set; failing that, a socket stamp still equal to
 * SK_DEFAULT_STAMP is replaced by 0.
 */
static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
                                   struct sk_buff *skb)
{
/* Socket flag bits that require the full __sock_recv_cmsgs() path. */
#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL)                        | \
                           (1UL << SOCK_RCVTSTAMP)                        | \
                           (1UL << SOCK_RCVMARK)                        | \
                           (1UL << SOCK_RCVPRIORITY)                        | \
                           (1UL << SOCK_TIMESTAMPING_ANY))
/* NOTE(review): TSFLAGS_ANY is not referenced in this function - confirm it is still needed. */
#define TSFLAGS_ANY          (SOF_TIMESTAMPING_SOFTWARE                        | \
                           SOF_TIMESTAMPING_RAW_HARDWARE)

        if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS)
                __sock_recv_cmsgs(msg, sk, skb);
        else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
                sock_write_timestamp(sk, skb->tstamp);
        else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP))
                sock_write_timestamp(sk, 0);
}

void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags);

/**
 * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
 * @sk:                socket sending this packet
 * @sockc:        pointer to socket cmsg cookie to get timestamping info
 * @tx_flags:        completed with instructions for time stamping
 * @tskey:      filled in with next sk_tskey (not for TCP, which uses seqno)
 *
 * Note: callers should take care of initial ``*tx_flags`` value (usually 0)
 */
static inline void _sock_tx_timestamp(struct sock *sk,
                                      const struct sockcm_cookie *sockc,
                                      __u8 *tx_flags, __u32 *tskey)
{
        const __u32 tsflags = sockc->tsflags;

        if (unlikely(tsflags)) {
                __sock_tx_timestamp(tsflags, tx_flags);

                /*
                 * A key is written only when the caller supplied storage
                 * for it, OPT_ID is requested and at least one TX record
                 * flag is set.
                 */
                if (!tskey)
                        return;
                if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
                        return;
                if (!(tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK))
                        return;

                /* Use the cookie's id when flagged, else the socket counter. */
                *tskey = (tsflags & SOCKCM_FLAG_TS_OPT_ID) ?
                         sockc->ts_opt_id :
                         atomic_inc_return(&sk->sk_tskey) - 1;
        }
}

/*
 * Variant of _sock_tx_timestamp() for callers that have no timestamp key
 * to fill in: the tskey output pointer is passed as NULL.
 */
static inline void sock_tx_timestamp(struct sock *sk,
                                     const struct sockcm_cookie *sockc,
                                     __u8 *tx_flags)
{
        __u32 *const no_tskey = NULL;

        _sock_tx_timestamp(sk, sockc, tx_flags, no_tskey);
}

static inline void skb_setup_tx_timestamp(struct sk_buff *skb,
                                          const struct sockcm_cookie *sockc)
{
        _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags,
                           &skb_shinfo(skb)->tskey);
}

static inline bool sk_is_inet(const struct sock *sk)
{
        int family = READ_ONCE(sk->sk_family);

        return family == AF_INET || family == AF_INET6;
}

static inline bool sk_is_tcp(const struct sock *sk)
{
        return sk_is_inet(sk) &&
               sk->sk_type == SOCK_STREAM &&
               sk->sk_protocol == IPPROTO_TCP;
}

static inline bool sk_is_udp(const struct sock *sk)
{
        return sk_is_inet(sk) &&
               sk->sk_type == SOCK_DGRAM &&
               sk->sk_protocol == IPPROTO_UDP;
}

/* Return true when @sk's family is AF_UNIX. */
static inline bool sk_is_unix(const struct sock *sk)
{
        const bool family_is_unix = (sk->sk_family == AF_UNIX);

        return family_is_unix;
}

static inline bool sk_is_stream_unix(const struct sock *sk)
{
        return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM;
}

/* Return true when @sk's family is AF_VSOCK. */
static inline bool sk_is_vsock(const struct sock *sk)
{
        const bool family_is_vsock = (sk->sk_family == AF_VSOCK);

        return family_is_vsock;
}

static inline bool sk_may_scm_recv(const struct sock *sk)
{
        return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) ||
                sk->sk_family == AF_NETLINK ||
                (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH);
}

/**
 * sk_eat_skb - Release a skb if it is no longer needed
 * @sk: socket to eat this skb from
 * @skb: socket buffer to eat
 *
 * This routine must be called with interrupts disabled or with the socket
 * locked so that the sk_buff queue operation is ok.
*/
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
        struct sk_buff_head *rcvq = &sk->sk_receive_queue;

        /* Take the skb off the socket's receive queue, then free it. */
        __skb_unlink(skb, rcvq);
        __kfree_skb(skb);
}

/*
 * Return true when @skb's destructor is sock_pfree.  Without CONFIG_INET
 * this is always false.
 */
static inline bool
skb_sk_is_prefetched(struct sk_buff *skb)
{
#ifndef CONFIG_INET
        return false;
#else
        return skb->destructor == sock_pfree;
#endif /* !CONFIG_INET */
}

/* This helper checks if a socket is a full socket,
 * ie _not_ a timewait or request socket.
 */
static inline bool sk_fullsock(const struct sock *sk)
{
        /* States that identify timewait and request sockets. */
        const int mini_states = TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV;
        const int state_bit = 1 << sk->sk_state;

        return (state_bit & ~mini_states) != 0;
}

/*
 * Return false only for a full socket that has SOCK_RCU_FREE set; every
 * other socket yields true.
 */
static inline bool
sk_is_refcounted(struct sock *sk)
{
        /* sk->sk_flags exists only on full sockets, so test that first. */
        return !(sk_fullsock(sk) && sock_flag(sk, SOCK_RCU_FREE));
}

static inline bool
sk_requests_wifi_status(struct sock *sk)
{
        return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS);
}

/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV socket.
 * SYNACK messages can be attached to either one (depending on SYNCOOKIE).
 */
static inline bool sk_listener(const struct sock *sk)
{
        const int listener_states = TCPF_LISTEN | TCPF_NEW_SYN_RECV;
        const int state_bit = 1 << sk->sk_state;

        return (state_bit & listener_states) != 0;
}

/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT
 * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE)
 * TCP RST and ACK can be attached to TIME_WAIT.
 */
static inline bool sk_listener_or_tw(const struct sock *sk)
{
        const int wanted_states = TCPF_LISTEN | TCPF_NEW_SYN_RECV |
                                  TCPF_TIME_WAIT;

        return ((1 << READ_ONCE(sk->sk_state)) & wanted_states) != 0;
}

void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
                       int type);

bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap);
bool sk_capable(const struct sock *sk, int cap);
bool sk_net_capable(const struct sock *sk, int cap);

void sk_get_meminfo(const struct sock *sk, u32 *meminfo);

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS                256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_DEFAULT                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_DEFAULT                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;

extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;

#define SKB_FRAG_PAGE_ORDER        get_order(32768)
DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/*
 * Read the wmem value for @proto: from the per-netns location at
 * proto->sysctl_wmem_offset inside sock_net(@sk) when that offset is
 * non-zero, otherwise from the address held in proto->sysctl_wmem.
 */
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
        int *wmem;

        if (proto->sysctl_wmem_offset)
                wmem = (int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
        else
                wmem = proto->sysctl_wmem;

        return READ_ONCE(*wmem);
}

/*
 * Read the rmem value for @proto: from the per-netns location at
 * proto->sysctl_rmem_offset inside sock_net(@sk) when that offset is
 * non-zero, otherwise from the address held in proto->sysctl_rmem.
 */
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
        int *rmem;

        if (proto->sysctl_rmem_offset)
                rmem = (int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
        else
                rmem = proto->sysctl_rmem;

        return READ_ONCE(*rmem);
}

/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
 * Some wifi drivers need to tweak it to get more chunks.
 * They can use this helper from their ndo_start_xmit()
 */
static inline void sk_pacing_shift_update(struct sock *sk, int val)
{
        /* Only a non-NULL full socket is updated. */
        if (!sk || !sk_fullsock(sk))
                return;

        /* Skip the store when the value would not change. */
        if (READ_ONCE(sk->sk_pacing_shift) != val)
                WRITE_ONCE(sk->sk_pacing_shift, val);
}

/* if a socket is bound to a device, check that the given device
 * index is either the same or that the socket is bound to an L3
 * master device and the given device index is also enslaved to
 * that L3 master
 */
static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
{
        const int bound = READ_ONCE(sk->sk_bound_dev_if);
        int master;

        /* Unbound socket, or bound to exactly this device index. */
        if (!bound || bound == dif)
                return true;

        /* Otherwise match only if @dif's L3 master is the bound device. */
        master = l3mdev_master_ifindex_by_index(sock_net(sk), dif);

        return master && master == bound;
}

void sock_def_readable(struct sock *sk);

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
int sock_set_timestamping(struct sock *sk, int optname,
                          struct so_timestamping timestamping);

#if defined(CONFIG_CGROUP_BPF)
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
#else
/* No-op stub used when CONFIG_CGROUP_BPF is not defined. */
static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
{
}
#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
void sock_set_rcvbuf(struct sock *sk, int val);
void sock_set_mark(struct sock *sk, u32 val);
void sock_set_reuseaddr(struct sock *sk);
void sock_set_reuseport(struct sock *sk);
void sock_set_sndtimeo(struct sock *sk, s64 secs);

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);

int sock_get_timeout(long timeo, void *optval, bool old_timeval);
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
                           sockptr_t optval, int optlen, bool old_timeval);

int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
                     void __user *arg, void *karg, size_t size);
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
/*
 * Ask the socket's protocol whether @sk is readable.  Returns false when
 * the protocol provides no sock_is_readable callback.
 */
static inline bool sk_is_readable(struct sock *sk)
{
        const struct proto *prot = READ_ONCE(sk->sk_prot);

        return prot->sock_is_readable ? prot->sock_is_readable(sk) : false;
}
#endif        /* _SOCK_H */
































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PTRACE_H
#define _LINUX_PTRACE_H

#include <linux/compiler.h>                /* For unlikely.  */
#include <linux/sched.h>                /* For struct task_struct.  */
#include <linux/sched/signal.h>                /* For send_sig(), same_thread_group(), etc. */
#include <linux/err.h>                        /* for IS_ERR_VALUE */
#include <linux/bug.h>                        /* For BUG_ON.  */
#include <linux/pid_namespace.h>        /* For task_active_pid_ns.  */
#include <uapi/linux/ptrace.h>
#include <linux/seccomp.h>

/*
 * Pairs a struct seccomp_data with a stack pointer value.  seccomp_data is
 * user API and must not be modified, so sp is carried alongside it here
 * instead of being added to it.
 */
struct syscall_info {
        __u64                        sp;
        struct seccomp_data        data;
};

extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
                            void *buf, int len, unsigned int gup_flags);

/*
 * Ptrace flags
 *
 * The ownership rules for task->ptrace, which holds the ptrace
 * flags, are simple.  When a task is running it owns its task->ptrace
 * flags.  When a task is stopped the ptracer owns task->ptrace.
 */

#define PT_SEIZED        0x00010000        /* SEIZE used, enable new behavior */
#define PT_PTRACED        0x00000001

#define PT_OPT_FLAG_SHIFT        3
/* PT_TRACE_* event enable flags */
#define PT_EVENT_FLAG(event)        (1 << (PT_OPT_FLAG_SHIFT + (event)))
#define PT_TRACESYSGOOD                PT_EVENT_FLAG(0)
#define PT_TRACE_FORK                PT_EVENT_FLAG(PTRACE_EVENT_FORK)
#define PT_TRACE_VFORK                PT_EVENT_FLAG(PTRACE_EVENT_VFORK)
#define PT_TRACE_CLONE                PT_EVENT_FLAG(PTRACE_EVENT_CLONE)
#define PT_TRACE_EXEC                PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
#define PT_TRACE_VFORK_DONE        PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
#define PT_TRACE_EXIT                PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
#define PT_TRACE_SECCOMP        PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)

#define PT_EXITKILL                (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
#define PT_SUSPEND_SECCOMP        (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)

extern long arch_ptrace(struct task_struct *child, long request,
                        unsigned long addr, unsigned long data);
extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
extern void ptrace_disable(struct task_struct *);
extern int ptrace_request(struct task_struct *child, long request,
                          unsigned long addr, unsigned long data);
extern int ptrace_notify(int exit_code, unsigned long message);
extern void __ptrace_link(struct task_struct *child,
                          struct task_struct *new_parent,
                          const struct cred *ptracer_cred);
extern void __ptrace_unlink(struct task_struct *child);
extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
#define PTRACE_MODE_READ        0x01
#define PTRACE_MODE_ATTACH        0x02
#define PTRACE_MODE_NOAUDIT        0x04
#define PTRACE_MODE_FSCREDS        0x08
#define PTRACE_MODE_REALCREDS        0x10

/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)

/**
 * ptrace_may_access - check whether the caller is permitted to access
 * a target task.
 * @task: target task
 * @mode: selects type of access and caller credentials
 *
 * Returns true on success, false on denial.
 *
 * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must
 * be set in @mode to specify whether the access was requested through
 * a filesystem syscall (should use effective capabilities and fsuid
 * of the caller) or through an explicit syscall such as
 * process_vm_writev or ptrace (and should use the real credentials).
 */
extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);

/*
 * Return 1 when @child's ->parent is not in the same thread group as its
 * ->real_parent, 0 otherwise.
 */
static inline int ptrace_reparented(struct task_struct *child)
{
        if (same_thread_group(child->real_parent, child->parent))
                return 0;

        return 1;
}

/* Call __ptrace_unlink() on @child, but only if its ->ptrace is non-zero. */
static inline void ptrace_unlink(struct task_struct *child)
{
        /* Common case: ->ptrace is zero, so there is nothing to unlink. */
        if (!unlikely(child->ptrace))
                return;

        __ptrace_unlink(child);
}

int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
                            unsigned long data);
int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
                            unsigned long data);

/**
 * ptrace_parent - return the task that is tracing the given task
 * @task: task to consider
 *
 * Returns %NULL if no one is tracing @task, or the &struct task_struct
 * pointer to its tracer.
 *
 * Must be called under rcu_read_lock().  The pointer returned might be kept
 * live only by RCU.  During exec, this may be called with task_lock() held
 * on @task, still held from when check_unsafe_exec() was called.
 */
static inline struct task_struct *ptrace_parent(struct task_struct *task)
{
        /* With ->ptrace clear (the expected case) there is no tracer. */
        return unlikely(task->ptrace) ? rcu_dereference(task->parent) : NULL;
}

/**
 * ptrace_event_enabled - test whether a ptrace event is enabled
 * @task: ptracee of interest
 * @event: %PTRACE_EVENT_* to test
 *
 * Test whether @event is enabled for ptracee @task.
 *
 * Returns %true if @event is enabled, %false otherwise.
 */
static inline bool ptrace_event_enabled(struct task_struct *task, int event)
{
        /* Each event has one enable bit in ->ptrace, see PT_EVENT_FLAG(). */
        return (task->ptrace & PT_EVENT_FLAG(event)) != 0;
}

/**
 * ptrace_event - possibly stop for a ptrace event notification
 * @event:        %PTRACE_EVENT_* value to report
 * @message:        value for %PTRACE_GETEVENTMSG to return
 *
 * Check whether @event is enabled and, if so, report @event and @message
 * to the ptrace parent.
 *
 * Called without locks.
 */
static inline void ptrace_event(int event, unsigned long message)
{
        if (unlikely(ptrace_event_enabled(current, event))) {
                ptrace_notify((event << 8) | SIGTRAP, message);
                return;
        }

        /* Only exec has a fallback when its event is not enabled. */
        if (event != PTRACE_EVENT_EXEC)
                return;

        /* legacy EXEC report via SIGTRAP: traced, but not via SEIZE */
        if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED)
                send_sig(SIGTRAP, current, 0);
}

/**
 * ptrace_event_pid - possibly stop for a ptrace event notification
 * @event:        %PTRACE_EVENT_* value to report
 * @pid:        process identifier for %PTRACE_GETEVENTMSG to return
 *
 * Check whether @event is enabled and, if so, report @event and @pid
 * to the ptrace parent.  @pid is reported as the pid_t seen from the
 * ptrace parent's pid namespace.
 *
 * Called without locks.
 */
static inline void ptrace_event_pid(int event, struct pid *pid)
{
        /*
         * FIXME: There's a potential race if a ptracer in a different pid
         * namespace than parent attaches between computing message below and
         * when we acquire tasklist_lock in ptrace_stop().  If this happens,
         * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG.
         */
        struct pid_namespace *parent_ns;
        unsigned long pid_msg = 0;

        /* Translate @pid into the pid namespace of current->parent. */
        rcu_read_lock();
        parent_ns = task_active_pid_ns(rcu_dereference(current->parent));
        if (parent_ns)
                pid_msg = pid_nr_ns(pid, parent_ns);
        rcu_read_unlock();

        /* pid_msg stays 0 when no namespace was found. */
        ptrace_event(event, pid_msg);
}

/**
 * ptrace_init_task - initialize ptrace state for a new child
 * @child:                new child task
 * @ptrace:                true if child should be ptrace'd by parent's tracer
 *
 * This is called immediately after adding @child to its parent's children
 * list.  @ptrace is false in the normal case, and true to ptrace @child.
 *
 * Called with current's siglock and write_lock_irq(&tasklist_lock) held.
 */
static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
{
        /* Start the child from a clean, untraced state. */
        INIT_LIST_HEAD(&child->ptrace_entry);
        INIT_LIST_HEAD(&child->ptraced);
        child->jobctl = 0;
        child->ptrace = 0;
        child->parent = child->real_parent;

        /* Tracing is set up only if requested and current has ->ptrace set. */
        if (unlikely(ptrace) && current->ptrace) {
                /* Copy current's ptrace flags and link to current->parent. */
                child->ptrace = current->ptrace;
                __ptrace_link(child, current->parent, current->ptracer_cred);

                /*
                 * PT_SEIZED: mark JOBCTL_TRAP_STOP pending.  Otherwise add
                 * SIGSTOP to the child's pending signal set.
                 */
                if (child->ptrace & PT_SEIZED)
                        task_set_jobctl_pending(child, JOBCTL_TRAP_STOP);
                else
                        sigaddset(&child->pending.signal, SIGSTOP);
        }
        else
                child->ptracer_cred = NULL;
}

/**
 * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped
 * @task:        task in %EXIT_DEAD state
 *
 * Called with write_lock(&tasklist_lock) held.
 */
static inline void ptrace_release_task(struct task_struct *task)
{
        /* The task's ->ptraced list must already be empty. */
        BUG_ON(!list_empty(&task->ptraced));
        ptrace_unlink(task);
        /* Whether or not anything was unlinked, ->ptrace_entry must be empty. */
        BUG_ON(!list_empty(&task->ptrace_entry));
}

#ifndef force_successful_syscall_return
/*
 * System call handlers that, upon successful completion, need to return a
 * negative value should call force_successful_syscall_return() right before
 * returning.  On architectures where the syscall convention provides for a
 * separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly
 * others), this macro can be used to ensure that the error flag will not get
 * set.  On architectures which do not support a separate error flag, the macro
 * is a no-op and the spurious error condition needs to be filtered out by some
 * other means (e.g., in user-level, by passing an extra argument to the
 * syscall handler, or something along those lines).
 */
#define force_successful_syscall_return() do { } while (0)
#endif

#ifndef is_syscall_success
/*
 * On most systems we can tell if a syscall is a success based on if the retval
 * is an error value.  On some systems like ia64 and powerpc they have different
 * indicators of success/failure and must define their own.
 */
#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs))))
#endif

/*
 * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
 *
 * These do-nothing inlines are used when the arch does not
 * implement single-step.  The kerneldoc comments are here
 * to document the interface for all arch definitions.
 */

#ifndef arch_has_single_step
/**
 * arch_has_single_step - does this CPU support user-mode single-step?
 *
 * If this is defined, then there must be function declarations or
 * inlines for user_enable_single_step() and user_disable_single_step().
 * arch_has_single_step() should evaluate to nonzero iff the machine
 * supports instruction single-step for user mode.
 * It can be a constant or it can test a CPU feature bit.
 */
#define arch_has_single_step()                (0)

/**
 * user_enable_single_step - single-step in user-mode task
 * @task: either current or a task stopped in %TASK_TRACED
 *
 * This can only be called when arch_has_single_step() has returned nonzero.
 * Set @task so that when it returns to user mode, it will trap after the
 * next single instruction executes.  If arch_has_block_step() is defined,
 * this must clear the effects of user_enable_block_step() too.
 */
static inline void user_enable_single_step(struct task_struct *task)
{
        /*
         * Fallback used when arch_has_single_step is not defined by the
         * arch; arch_has_single_step() is then the constant 0, so reaching
         * this function is treated as a bug.
         */
        BUG();                        /* This can never be called.  */
}

/**
 * user_disable_single_step - cancel user-mode single-step
 * @task: either current or a task stopped in %TASK_TRACED
 *
 * Clear @task of the effects of user_enable_single_step() and
 * user_enable_block_step().  This can be called whether or not either
 * of those was ever called on @task, and even if arch_has_single_step()
 * returned zero.
 */
static inline void user_disable_single_step(struct task_struct *task)
{
        /* Fallback when arch_has_single_step is not defined: does nothing. */
}
#else
extern void user_enable_single_step(struct task_struct *);
extern void user_disable_single_step(struct task_struct *);
#endif        /* arch_has_single_step */

#ifndef arch_has_block_step
/**
 * arch_has_block_step - does this CPU support user-mode block-step?
 *
 * If this is defined, then there must be a function declaration or inline
 * for user_enable_block_step(), and arch_has_single_step() must be defined
 * too.  arch_has_block_step() should evaluate to nonzero iff the machine
 * supports step-until-branch for user mode.  It can be a constant or it
 * can test a CPU feature bit.
 */
#define arch_has_block_step()                (0)

/**
 * user_enable_block_step - step until branch in user-mode task
 * @task: either current or a task stopped in %TASK_TRACED
 *
 * This can only be called when arch_has_block_step() has returned nonzero,
 * and will never be called when single-instruction stepping is being used.
 * Set @task so that when it returns to user mode, it will trap after the
 * next branch or trap taken.
 */
static inline void user_enable_block_step(struct task_struct *task)
{
        /*
         * Fallback used when arch_has_block_step is not defined by the
         * arch; arch_has_block_step() is then the constant 0, so reaching
         * this function is treated as a bug.
         */
        BUG();                        /* This can never be called.  */
}
#else
extern void user_enable_block_step(struct task_struct *);
#endif        /* arch_has_block_step */

#ifdef ARCH_HAS_USER_SINGLE_STEP_REPORT
extern void user_single_step_report(struct pt_regs *regs);
#else
/*
 * Default single-step report, used when ARCH_HAS_USER_SINGLE_STEP_REPORT
 * is not defined: force a SIGTRAP with si_code SI_USER and zeroed
 * pid/uid/errno fields.  @regs is not used.
 */
static inline void user_single_step_report(struct pt_regs *regs)
{
        kernel_siginfo_t info;

        /* Zero the whole siginfo before filling in individual fields. */
        clear_siginfo(&info);

        info.si_signo = SIGTRAP;
        info.si_code = SI_USER;
        info.si_errno = 0;
        info.si_uid = 0;
        info.si_pid = 0;

        force_sig_info(&info);
}
#endif

#ifndef arch_ptrace_stop_needed
/**
 * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called
 *
 * This is called with the siglock held, to decide whether or not it's
 * necessary to release the siglock and call arch_ptrace_stop().  It can be
 * defined to a constant if arch_ptrace_stop() is never required, or always
 * is.  On machines where this makes sense, it should be defined to a quick
 * test to optimize out calling arch_ptrace_stop() when it would be
 * superfluous.  For example, if the thread has not been back to user mode
 * since the last stop, the thread state might indicate that nothing needs
 * to be done.
 *
 * This is guaranteed to be invoked once before a task stops for ptrace and
 * may include arch-specific operations necessary prior to a ptrace stop.
 */
#define arch_ptrace_stop_needed()        (0)
#endif

#ifndef arch_ptrace_stop
/**
 * arch_ptrace_stop - Do machine-specific work before stopping for ptrace
 *
 * This is called with no locks held when arch_ptrace_stop_needed() has
 * just returned nonzero.  It is allowed to block, e.g. for user memory
 * access.  The arch can have machine-specific work to be done before
 * ptrace stops.  On ia64, register backing store gets written back to user
 * memory here.  Since this can be costly (requires dropping the siglock),
 * we only do it when the arch requires it for this particular stop, as
 * indicated by arch_ptrace_stop_needed().
 */
#define arch_ptrace_stop()                do { } while (0)
#endif

#ifndef current_pt_regs
#define current_pt_regs() task_pt_regs(current)
#endif

#ifndef current_user_stack_pointer
#define current_user_stack_pointer() user_stack_pointer(current_pt_regs())
#endif

#ifndef exception_ip
#define exception_ip(x) instruction_pointer(x)
#endif

extern int task_current_syscall(struct task_struct *target, struct syscall_info *info);

extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact);

/*
 * ptrace report for syscall entry and exit looks identical.
 */
static inline int ptrace_report_syscall(unsigned long message)
{
        const int pt_flags = current->ptrace;
        int exit_code = SIGTRAP;
        int sig;

        /* Nothing to report when current is not being traced. */
        if (!(pt_flags & PT_PTRACED))
                return 0;

        /* With PT_TRACESYSGOOD, bit 0x80 is set in the reported code. */
        if (pt_flags & PT_TRACESYSGOOD)
                exit_code |= 0x80;

        sig = ptrace_notify(exit_code, message);

        /*
         * this isn't the same as continuing with a signal, but it will do
         * for normal use.  strace only continues with a signal if the
         * stopping signal is not SIGTRAP.  -brl
         */
        if (sig)
                send_sig(sig, current, 1);

        return fatal_signal_pending(current);
}

/**
 * ptrace_report_syscall_entry - task is about to attempt a system call
 * @regs:                user register state of current task
 *
 * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or
 * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just
 * entered the kernel for a system call.  Full user register state is
 * available here.  Changing the values in @regs can affect the system
 * call number and arguments to be tried.  It is safe to block here,
 * preventing the system call from beginning.
 *
 * Returns zero normally, or nonzero if the calling arch code should abort
 * the system call.  That must prevent normal entry so no system call is
 * made.  If the task ever returns to user mode after this, its register state
 * is unspecified, but should be something harmless like an %ENOSYS error
 * return.  It should preserve enough information so that syscall_rollback()
 * can work (see asm-generic/syscall.h).
 *
 * Called without locks, just after entering kernel mode.
 */
static inline __must_check int ptrace_report_syscall_entry(
        struct pt_regs *regs)
{
        /* @regs is not read here; ptrace_report_syscall() acts on current. */
        return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY);
}

/**
 * ptrace_report_syscall_exit - task has just finished a system call
 * @regs:                user register state of current task
 * @step:                nonzero if simulating single-step or block-step
 *
 * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when
 * the current task has just finished an attempted system call.  Full
 * user register state is available here.  It is safe to block here,
 * preventing signals from being processed.
 *
 * If @step is nonzero, this report is also in lieu of the normal
 * trap that would follow the system call instruction because
 * user_enable_block_step() or user_enable_single_step() was used.
 * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set.
 *
 * Called without locks, just before checking for pending signals.
 */
static inline void ptrace_report_syscall_exit(struct pt_regs *regs, int step)
{
        if (!step) {
                /* Ordinary exit report; its return value is not used here. */
                ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT);
                return;
        }

        /* Stepping: issue the single-step report instead. */
        user_single_step_report(regs);
}
#endif














































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ADDRCONF_H
#define _ADDRCONF_H

#define MAX_RTR_SOLICITATIONS                -1                /* unlimited */
#define RTR_SOLICITATION_INTERVAL        (4*HZ)
#define RTR_SOLICITATION_MAX_INTERVAL        (3600*HZ)        /* 1 hour */

#define MIN_VALID_LIFETIME                (2*3600)        /* 2 hours */

#define TEMP_VALID_LIFETIME                (7*86400)       /* 1 week */
#define TEMP_PREFERRED_LIFETIME                (86400)         /* 24 hours */
#define REGEN_MIN_ADVANCE                (2)             /* 2 seconds */
#define REGEN_MAX_RETRY                        (3)
#define MAX_DESYNC_FACTOR                (600)

#define ADDR_CHECK_FREQUENCY                (120*HZ)

#define IPV6_MAX_ADDRESSES                16

#define ADDRCONF_TIMER_FUZZ_MINUS        (HZ > 50 ? HZ / 50 : 1)
#define ADDRCONF_TIMER_FUZZ                (HZ / 4)
#define ADDRCONF_TIMER_FUZZ_MAX                (HZ)

#define ADDRCONF_NOTIFY_PRIORITY        0

#include <linux/in.h>
#include <linux/in6.h>

/*
 * IPv6 Prefix Information option (PIO).
 *
 * The layout is fixed: the static_assert following this definition requires
 * the structure to be exactly 32 bytes, so fields must not be added,
 * removed or reordered.  The 32-bit fields are big-endian (__be32).
 */
struct prefix_info {
        __u8                        type;
        __u8                        length;
        __u8                        prefix_len;

        /*
         * One flag byte, accessible either as a whole (flags) or through
         * the individually named bits below.
         */
        union __packed {
                __u8                flags;
                struct __packed {
                        /*
                         * The bit-fields are declared in mirrored order for
                         * the two bitfield endiannesses so that each name
                         * refers to the same bit of the flag byte either way.
                         */
#if defined(__BIG_ENDIAN_BITFIELD)
                        __u8        onlink : 1,
                                autoconf : 1,
                                routeraddr : 1,
                                preferpd : 1,
                                reserved : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
                        __u8        reserved : 4,
                                preferpd : 1,
                                routeraddr : 1,
                                autoconf : 1,
                                onlink : 1;
#else
#error "Please fix <asm/byteorder.h>"
#endif
                };
        };
        /* NOTE(review): presumably the valid/preferred lifetimes in seconds - confirm. */
        __be32                        valid;
        __be32                        prefered;
        __be32                        reserved2;

        struct in6_addr                prefix;
};

/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */
static_assert(sizeof(struct prefix_info) == 32);

#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/if_inet6.h>
#include <net/ipv6.h>

/*
 * Bundle of an IPv6 address, the inet6_dev it relates to and a netlink
 * extended-ack pointer.
 * NOTE(review): presumably the payload passed through
 * inet6addr_validator_notifier_call_chain() - confirm against callers.
 */
struct in6_validator_info {
        struct in6_addr                i6vi_addr;
        struct inet6_dev        *i6vi_dev;
        struct netlink_ext_ack        *extack;
};

/*
 * Parameter bundle describing one IPv6 address configuration: the prefix
 * and its length, an optional peer prefix, a protocol tag, route priority,
 * address flags, the two lifetimes and the scope.
 * NOTE(review): lifetimes are presumably in seconds - confirm with users.
 */
struct ifa6_config {
        const struct in6_addr        *pfx;
        unsigned int                plen;

        u8                        ifa_proto;

        const struct in6_addr        *peer_pfx;

        u32                        rt_priority;
        u32                        ifa_flags;
        u32                        preferred_lft;
        u32                        valid_lft;
        u16                        scope;
};

/* Kinds of IPv6 address; carried in the type member of struct inet6_fill_args. */
enum addr_type_t {
        UNICAST_ADDR,
        MULTICAST_ADDR,
        ANYCAST_ADDR,
};

/*
 * Arguments handed to the inet6_fill_*() helpers declared in this header
 * (inet6_fill_ifmcaddr(), inet6_fill_ifacaddr()).
 * NOTE(review): portid/seq/event/flags look like the usual netlink message
 * header inputs - confirm against the fill implementations.
 */
struct inet6_fill_args {
        u32 portid;
        u32 seq;
        int event;
        unsigned int flags;
        int netnsid;
        int ifindex;
        enum addr_type_t type;        /* which kind of address is being filled */
        bool force_rt_scope_universe;
};

int addrconf_init(void);
void addrconf_cleanup(void);

int addrconf_add_ifaddr(struct net *net, void __user *arg);
int addrconf_del_ifaddr(struct net *net, void __user *arg);
int addrconf_set_dstaddr(struct net *net, void __user *arg);

int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
                  const struct net_device *dev, int strict);
int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
                            const struct net_device *dev, bool skip_dev_check,
                            int strict, u32 banned_flags);

#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
#endif

int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
                          unsigned char nsegs);

bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
                                   const unsigned int prefix_len,
                                   struct net_device *dev);

int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev);

struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
                                 struct net_device *dev);

struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev, int strict);

int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
                       const struct in6_addr *daddr, unsigned int srcprefs,
                       struct in6_addr *saddr);
int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
                    u32 banned_flags);
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
                          bool match_wildcard);
bool inet_rcv_saddr_any(const struct sock *sk);
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);

void addrconf_add_linklocal(struct inet6_dev *idev,
                            const struct in6_addr *addr, u32 flags);

int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
                                 const struct prefix_info *pinfo,
                                 struct inet6_dev *in6_dev,
                                 const struct in6_addr *addr, int addr_type,
                                 u32 addr_flags, bool sllao, bool tokenized,
                                 __u32 valid_lft, u32 prefered_lft);

/*
 * Expand the 6-byte address at @addr into the 8-byte buffer @eui:
 * bytes 0-2 of @addr, then 0xFF 0xFE, then bytes 3-5 of @addr.
 * No bit of the copied bytes is modified here.
 */
static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr)
{
        /* Leading three bytes go in unchanged. */
        memcpy(&eui[0], &addr[0], 3);

        /* Fixed two-byte filler between the halves. */
        eui[3] = 0xFF;
        eui[4] = 0xFE;

        /* Trailing three bytes follow the filler. */
        memcpy(&eui[5], &addr[3], 3);
}

/*
 * Build the 8-byte identifier from @addr via addrconf_addr_eui48_base()
 * and then invert bit 1 (value 2) of its first byte.
 */
static inline void addrconf_addr_eui48(u8 *eui, const char *const addr)
{
        addrconf_addr_eui48_base(eui, addr);
        eui[0] ^= 2;
}

/*
 * Derive an 8-byte interface identifier for @dev into @eui.
 *
 * Returns -1 if the device address is not ETH_ALEN bytes long, 0 otherwise.
 */
static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
{
        if (dev->addr_len != ETH_ALEN)
                return -1;

        addrconf_addr_eui48_base(eui, dev->dev_addr);

        /* Ordinary case: no per-instance id, so just invert the "u" bit. */
        if (!dev->dev_id) {
                eui[0] ^= 2;
                return 0;
        }

        /*
         * The zSeries OSA network cards can be shared among various
         * OS instances, but the OSA cards have only one MAC address.
         * That causes duplicate address conflicts with IPv6 when more
         * than one instance uses the same card.
         *
         * The driver for these cards can deliver a unique 16-bit
         * identifier for each instance sharing the card.  It replaces
         * the 0xFFFE filler in the interface identifier, and the "u"
         * bit is left uninverted, so the resulting identifier has
         * local scope according to RFC2373.
         */
        eui[3] = (dev->dev_id >> 8) & 0xFF;
        eui[4] = dev->dev_id & 0xFF;

        return 0;
}

#define INFINITY_LIFE_TIME 0xFFFFFFFF

/*
 * Normalise a 32-bit lifetime so it can later be scaled by @unit.
 *
 * INFINITY_LIFE_TIME maps to ~0UL.  Any other value is returned as is,
 * unless it exceeds LONG_MAX / @unit, in which case it is clamped to that
 * quotient.  @unit is assumed to be non-zero.
 */
static inline unsigned long addrconf_timeout_fixup(u32 timeout,
                                                   unsigned int unit)
{
        unsigned long cap;

        if (timeout == INFINITY_LIFE_TIME)
                return ~0UL;

        /*
         * Largest value that survives multiplication by unit.  With a
         * constant unit the clamp below is expected to fold away on
         * 64bit archs, where no u32 can exceed the cap.
         */
        cap = LONG_MAX / unit;
        if (0xfffffffe <= cap)
                return timeout;

        return timeout > cap ? cap : timeout;
}

/*
 * Report whether @timeout is finite, i.e. anything other than the all-ones
 * value that addrconf_timeout_fixup() returns for an infinite lifetime.
 *
 * The comparison is explicit rather than "return ~timeout": the return
 * type is int, so on 64-bit the upper half of ~timeout would be dropped
 * and a value whose low 32 bits are all ones (e.g. 0xFFFFFFFF) would be
 * misreported as infinite.
 *
 * Returns non-zero (1) if finite, 0 if infinite.
 */
static inline int addrconf_finite_timeout(unsigned long timeout)
{
        return timeout != ~0UL;
}

/*
 *        IPv6 Address Label subsystem (addrlabel.c)
 */
int ipv6_addr_label_init(void);
void ipv6_addr_label_cleanup(void);
int ipv6_addr_label_rtnl_register(void);
u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr,
                    int type, int ifindex);

/*
 *        multicast prototypes (mcast.c)
 */
/*
 * Decide whether @len bytes may be pulled from @skb.
 *
 * @len must not exceed the transport offset plus the IPv6 transport
 * length of @skb; if it fits, the answer is whatever pskb_may_pull()
 * returns for @len.
 */
static inline bool ipv6_mc_may_pull(struct sk_buff *skb,
                                    unsigned int len)
{
        if (skb_transport_offset(skb) + ipv6_transport_len(skb) >= len)
                return pskb_may_pull(skb, len);

        return false;
}

int ipv6_sock_mc_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
void __ipv6_sock_mc_close(struct sock *sk);
void ipv6_sock_mc_close(struct sock *sk);
bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
                    const struct in6_addr *src_addr);

int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr);
int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr);
int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr);
void ipv6_mc_up(struct inet6_dev *idev);
void ipv6_mc_down(struct inet6_dev *idev);
void ipv6_mc_unmap(struct inet6_dev *idev);
void ipv6_mc_remap(struct inet6_dev *idev);
void ipv6_mc_init_dev(struct inet6_dev *idev);
void ipv6_mc_destroy_dev(struct inet6_dev *idev);
int ipv6_mc_check_mld(struct sk_buff *skb);
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp);

bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
                         const struct in6_addr *src_addr);

void ipv6_mc_dad_complete(struct inet6_dev *idev);

/*
 * Identify MLD packets for MLD filter exceptions.
 *
 * Returns true when @nexthdr is IPPROTO_ICMPV6, an ICMPv6 header can be
 * pulled at @offset from the network header, and its type is one of the
 * MLD message types listed in the switch below; false otherwise.
 */
static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
{
        struct icmp6hdr *hdr;

        if (nexthdr != IPPROTO_ICMPV6)
                return false;

        /* The header must be reachable before its type field is read. */
        if (!pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr)))
                return false;

        hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset);

        switch (hdr->icmp6_type) {
        case ICMPV6_MGM_QUERY:
        case ICMPV6_MGM_REPORT:
        case ICMPV6_MGM_REDUCTION:
        case ICMPV6_MLD2_REPORT:
                return true;
        default:
                return false;
        }
}

void addrconf_prefix_rcv(struct net_device *dev,
                         u8 *opt, int len, bool sllao);

/*
 *        anycast prototypes (anycast.c)
 */
int ipv6_sock_ac_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_ac_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
void __ipv6_sock_ac_close(struct sock *sk);
void ipv6_sock_ac_close(struct sock *sk);

int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr);
int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr);
void ipv6_ac_destroy_dev(struct inet6_dev *idev);
bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
                         const struct in6_addr *addr);
bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
                             const struct in6_addr *addr);
int ipv6_anycast_init(void);
void ipv6_anycast_cleanup(void);

/* Device notifier */
int register_inet6addr_notifier(struct notifier_block *nb);
int unregister_inet6addr_notifier(struct notifier_block *nb);
int inet6addr_notifier_call_chain(unsigned long val, void *v);

int register_inet6addr_validator_notifier(struct notifier_block *nb);
int unregister_inet6addr_validator_notifier(struct notifier_block *nb);
int inet6addr_validator_notifier_call_chain(unsigned long val, void *v);

void inet6_netconf_notify_devconf(struct net *net, int event, int type,
                                  int ifindex, struct ipv6_devconf *devconf);

/**
 * __in6_dev_get - get inet6_dev pointer from netdevice
 * @dev: network device
 *
 * Caller must hold rcu_read_lock or RTNL, because this function
 * does not take a reference on the inet6_dev.
 *
 * Return: the pointer read from @dev->ip6_ptr via rcu_dereference_rtnl().
 * @dev itself is dereferenced without a NULL check.
 */
static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev)
{
        return rcu_dereference_rtnl(dev->ip6_ptr);
}

/*
 * Read @dev->ip6_ptr through rtnl_net_dereference() using the network
 * namespace of @dev.  No reference is taken on the returned inet6_dev.
 * NOTE(review): presumably requires the RTNL of that namespace to be
 * held - confirm against rtnl_net_dereference().
 */
static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev)
{
        return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr);
}

/**
 * __in6_dev_stats_get - get inet6_dev pointer for stats
 * @dev: network device
 * @skb: skb for original incoming interface if needed
 *
 * When @dev is an L3 master device, the device looked up by the index
 * inet6_iif(@skb) is used in place of @dev.
 *
 * Caller must hold rcu_read_lock or RTNL, because this function
 * does not take a reference on the inet6_dev.
 *
 * Return: the inet6_dev pointer, or NULL when the device has none or the
 * index lookup finds no device.
 */
static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev,
                                                    const struct sk_buff *skb)
{
        if (netif_is_l3_master(dev)) {
                dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb));
                /*
                 * The lookup can come back empty; __in6_dev_get() would
                 * dereference the pointer unconditionally.
                 */
                if (!dev)
                        return NULL;
        }
        return __in6_dev_get(dev);
}

/**
 * __in6_dev_get_safely - get inet6_dev pointer from netdevice
 * @dev: network device, may be NULL
 *
 * NULL-tolerant variant of __in6_dev_get(): a NULL @dev yields NULL
 * instead of being dereferenced.
 */
static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev)
{
        struct inet6_dev *idev = NULL;

        if (likely(dev))
                idev = rcu_dereference_rtnl(dev->ip6_ptr);

        return idev;
}

/**
 * in6_dev_get - get inet6_dev pointer from netdevice
 * @dev: network device
 *
 * This version can be used in any context, and takes a reference
 * on the inet6_dev. Callers must use in6_dev_put() later to
 * release this reference.
 *
 * Return: the inet6_dev with its refcnt incremented, or NULL when
 * @dev->ip6_ptr is NULL.
 */
static inline struct inet6_dev *in6_dev_get(const struct net_device *dev)
{
        struct inet6_dev *idev;

        /* Read the pointer and take the reference inside one RCU read section. */
        rcu_read_lock();
        idev = rcu_dereference(dev->ip6_ptr);
        if (idev)
                refcount_inc(&idev->refcnt);
        rcu_read_unlock();
        return idev;
}

/*
 * Return the nd_parms of @dev's inet6_dev, or NULL when the device has no
 * inet6_dev.  The lookup goes through __in6_dev_get(), so that helper's
 * locking requirement applies and no reference is taken.
 */
static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev)
{
        struct inet6_dev *idev = __in6_dev_get(dev);

        if (!idev)
                return NULL;

        return idev->nd_parms;
}

void in6_dev_finish_destroy(struct inet6_dev *idev);

/*
 * Drop one reference on @idev.  When refcount_dec_and_test() reports that
 * this was the last one, the object is passed to in6_dev_finish_destroy().
 */
static inline void in6_dev_put(struct inet6_dev *idev)
{
        if (refcount_dec_and_test(&idev->refcnt))
                in6_dev_finish_destroy(idev);
}

/*
 * Release the reference held in *@pidev, if any, and reset the slot to
 * NULL.  A slot that is already NULL is left untouched.
 */
static inline void in6_dev_put_clear(struct inet6_dev **pidev)
{
        struct inet6_dev *idev = *pidev;

        if (!idev)
                return;

        in6_dev_put(idev);
        *pidev = NULL;
}

/*
 * Decrement @idev's refcnt without any destroy step.
 * NOTE(review): presumably only for callers that know another reference
 * remains - confirm at call sites.
 */
static inline void __in6_dev_put(struct inet6_dev *idev)
{
        refcount_dec(&idev->refcnt);
}

/* Take an additional reference on @idev. */
static inline void in6_dev_hold(struct inet6_dev *idev)
{
        refcount_inc(&idev->refcnt);
}

/*
 * Called with rcu_read_lock held.
 *
 * Returns true when @dev has no inet6_dev, or when that inet6_dev's
 * cnf.ignore_routes_with_linkdown setting is non-zero.
 */
static inline bool ip6_ignore_linkdown(const struct net_device *dev)
{
        const struct inet6_dev *idev = __in6_dev_get(dev);

        /* The setting is only read once an inet6_dev is known to exist. */
        return unlikely(!idev) ||
               READ_ONCE(idev->cnf.ignore_routes_with_linkdown);
}

void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp);

/*
 * Drop one reference on @ifp.  When refcount_dec_and_test() reports that
 * this was the last one, the object is passed to inet6_ifa_finish_destroy().
 */
static inline void in6_ifa_put(struct inet6_ifaddr *ifp)
{
        if (refcount_dec_and_test(&ifp->refcnt))
                inet6_ifa_finish_destroy(ifp);
}

/*
 * Decrement @ifp's refcnt without any destroy step.
 * NOTE(review): presumably only for callers that know another reference
 * remains - confirm at call sites.
 */
static inline void __in6_ifa_put(struct inet6_ifaddr *ifp)
{
        refcount_dec(&ifp->refcnt);
}

/* Take an additional reference on @ifp. */
static inline void in6_ifa_hold(struct inet6_ifaddr *ifp)
{
        refcount_inc(&ifp->refcnt);
}

/*
 * Try to take a reference on @ifp using refcount_inc_not_zero() and
 * return its result, so the caller can tell whether a reference was
 * actually obtained.
 */
static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp)
{
        return refcount_inc_not_zero(&ifp->refcnt);
}

/*
 *        compute link-local solicited-node multicast address
 */

static inline void addrconf_addr_solict_mult(const struct in6_addr *addr,
                                             struct in6_addr *solicited)
{
        ipv6_addr_set(solicited,
                      htonl(0xFF020000), 0,
                      htonl(0x1),
                      htonl(0xFF000000) | addr->s6_addr32[3]);
}

/*
 * Return true when @addr equals ff02::1.
 *
 * Each word is XOR-ed with its expected value (or taken as is where zero
 * is expected) and the results are OR-ed together, so the total is zero
 * only if every word matches.  With efficient unaligned access on 64-bit
 * builds the address is compared as two 64-bit words, otherwise as four
 * 32-bit words.
 */
static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        __be64 *p = (__force __be64 *)addr;
        return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL;
#else
        return ((addr->s6_addr32[0] ^ htonl(0xff020000)) |
                addr->s6_addr32[1] | addr->s6_addr32[2] |
                (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0;
#endif
}

/*
 * Return true when @addr equals ff02::2.
 *
 * Same XOR/OR comparison as ipv6_addr_is_ll_all_nodes(), differing only in
 * the expected final word: two 64-bit words when unaligned access is
 * efficient on a 64-bit build, four 32-bit words otherwise.
 */
static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        __be64 *p = (__force __be64 *)addr;
        return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL;
#else
        return ((addr->s6_addr32[0] ^ htonl(0xff020000)) |
                addr->s6_addr32[1] | addr->s6_addr32[2] |
                (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0;
#endif
}

/*
 * Return true when the third 32-bit word of @addr is 0x00005EFE or
 * 0x02005EFE (network byte order).  The 0x02000000 bit is forced on before
 * the comparison, so its original value does not matter.
 */
static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr)
{
        return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE);
}

/*
 * Return true when @addr is of the form ff02:0:0:0:0:1:ffXX:XXXX, i.e. the
 * first 13 bytes are fixed and the last three bytes are ignored.
 *
 * The 64-bit variant masks the low 24 bits out of the second word before
 * testing; the 32-bit variant checks words 0-2 and then byte 12 on its own.
 */
static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        __be64 *p = (__force __be64 *)addr;
        return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) |
                ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) &
                 cpu_to_be64(0xffffffffff000000UL))) == 0UL;
#else
        return ((addr->s6_addr32[0] ^ htonl(0xff020000)) |
                addr->s6_addr32[1] |
                (addr->s6_addr32[2] ^ htonl(0x00000001)) |
                (addr->s6_addr[12] ^ 0xff)) == 0;
#endif
}

/*
 * Return true when @addr equals ff02::6a.
 *
 * Uses the same XOR/OR comparison as the other fixed-address checks in
 * this header: two 64-bit words when unaligned access is efficient on a
 * 64-bit build, four 32-bit words otherwise.
 */
static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        __be64 *p = (__force __be64 *)addr;

        return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) |
                (p[1] ^ cpu_to_be64(0x6a))) == 0UL;
#else
        return ((addr->s6_addr32[0] ^ htonl(0xff020000)) |
                addr->s6_addr32[1] | addr->s6_addr32[2] |
                (addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0;
#endif
}

#ifdef CONFIG_PROC_FS
int if6_proc_init(void);
void if6_proc_exit(void);
#endif

int inet6_fill_ifmcaddr(struct sk_buff *skb,
                        const struct ifmcaddr6 *ifmca,
                        struct inet6_fill_args *args);

int inet6_fill_ifacaddr(struct sk_buff *skb,
                        const struct ifacaddr6 *ifaca,
                        struct inet6_fill_args *args);
#endif












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  314 






























































































  314 


  319 


  317 


































































  318 




  314 






























































  319 












































































  312 


  317 

  319 

  318 

  316 





  318 
  319 

  311 



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <linux/zswap.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator.  */
/*
 * MAX_PAGE_ORDER defaults to 10; CONFIG_ARCH_FORCE_MAX_ORDER replaces that
 * default whenever the config symbol is defined.
 */
#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
#define MAX_PAGE_ORDER 10
#else
#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
/* Number of base pages in one block of order MAX_PAGE_ORDER. */
#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)

/* Checks @pfn against MAX_ORDER_NR_PAGES via IS_ALIGNED(). */
#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)

/* Orders run from 0 to MAX_PAGE_ORDER inclusive, hence the +1. */
#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)

/* Defines the order for the number of pages that have a migrate type. */
#ifndef CONFIG_PAGE_BLOCK_MAX_ORDER
#define PAGE_BLOCK_MAX_ORDER MAX_PAGE_ORDER
#else
#define PAGE_BLOCK_MAX_ORDER CONFIG_PAGE_BLOCK_MAX_ORDER
#endif /* CONFIG_PAGE_BLOCK_MAX_ORDER */

/*
 * MAX_PAGE_ORDER, which defines the maximum order of pages the buddy
 * allocator can allocate, has to be larger than or equal to
 * PAGE_BLOCK_MAX_ORDER, which defines the order for the number of pages
 * that can have a migrate type.
 */
#if (PAGE_BLOCK_MAX_ORDER > MAX_PAGE_ORDER)
#error MAX_PAGE_ORDER must be >= PAGE_BLOCK_MAX_ORDER
#endif

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

/*
 * Migrate types. The entries before MIGRATE_PCPTYPES are the types kept on
 * the pcp lists; MIGRATE_TYPES is the total number of types and depends on
 * CONFIG_CMA and CONFIG_MEMORY_ISOLATION.
 */
enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,        /* the number of types on the pcp lists */
        /* Shares its value with MIGRATE_PCPTYPES. */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        /*
         * MIGRATE_CMA migration type is designed to mimic the way
         * ZONE_MOVABLE works.  Only movable pages can be allocated
         * from MIGRATE_CMA pageblocks and the page allocator never
         * implicitly changes the migration type of a MIGRATE_CMA pageblock.
         *
         * The way to use it is to change migratetype of a range of
         * pageblocks to MIGRATE_CMA which can be done by
         * __free_pageblock_cma() function.
         */
        MIGRATE_CMA,
        /* Last type listed ahead of the optional MIGRATE_ISOLATE. */
        __MIGRATE_TYPE_END = MIGRATE_CMA,
#else
        /* Last type listed ahead of the optional MIGRATE_ISOLATE. */
        __MIGRATE_TYPE_END = MIGRATE_HIGHATOMIC,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES                /* number of migrate types; not a type itself */
};

/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];

#ifdef CONFIG_CMA
/* Tests a migratetype value against MIGRATE_CMA; hinted as the unlikely case. */
#  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
/* Tests whether the pageblock migratetype looked up for @_page is MIGRATE_CMA. */
#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
/*
 * __dump_folio() in mm/debug.c passes a folio pointer to on-stack struct folio,
 * so folio_pfn() cannot be used and pfn is needed.
 *
 * @folio is parenthesized so that any pointer expression (e.g. "f + 1" or a
 * conditional expression) expands correctly in front of "->page".
 */
#  define is_migrate_cma_folio(folio, pfn) \
        (get_pfnblock_migratetype(&(folio)->page, pfn) == MIGRATE_CMA)
#else
/* Without CONFIG_CMA no migratetype, page or folio is ever CMA. */
#  define is_migrate_cma(migratetype) false
#  define is_migrate_cma_page(_page) false
#  define is_migrate_cma_folio(folio, pfn) false
#endif

/* A migratetype counts as movable when it is MIGRATE_MOVABLE or a CMA type. */
static inline bool is_migrate_movable(int mt)
{
        if (mt == MIGRATE_MOVABLE)
                return true;

        return is_migrate_cma(mt);
}

/*
 * Tell whether migratetype @mt can be merged with another migratetype.
 *
 * A migratetype is mergeable only when it can fall back to other
 * migratetypes for allocation. See fallbacks[MIGRATE_TYPES][3] in
 * page_alloc.c.
 */
static inline bool migratetype_is_mergeable(int mt)
{
        /* Everything at or past MIGRATE_PCPTYPES is excluded. */
        if (mt >= MIGRATE_PCPTYPES)
                return false;

        return true;
}

/*
 * Nested loop header visiting every (order, type) pair: @order runs over
 * 0..NR_PAGE_ORDERS-1 and, for each order, @type runs over
 * 0..MIGRATE_TYPES-1. Both arguments must be assignable lvalues.
 */
#define for_each_migratetype_order(order, type) \
        for (order = 0; order < NR_PAGE_ORDERS; order++) \
                for (type = 0; type < MIGRATE_TYPES; type++)

extern int page_group_by_mobility_disabled;

/*
 * Wrapper around get_pfnblock_migratetype() that derives the pfn from @page
 * with page_to_pfn(). @page is expanded twice, so it must not have side
 * effects.
 */
#define get_pageblock_migratetype(page) \
        get_pfnblock_migratetype(page, page_to_pfn(page))

/*
 * Migratetype lookup for a folio, done through its ->page member.
 * @folio is parenthesized so that any pointer expression (e.g. "f + 1")
 * expands correctly in front of "->page".
 */
#define folio_migratetype(folio) \
        get_pageblock_migratetype(&(folio)->page)

/*
 * Free-list bookkeeping with one list head per migrate type.
 * NOTE(review): presumably one free_area exists per page order inside each
 * zone - confirm against struct zone.
 */
struct free_area {
        /* one list per migrate type (indexed by enum migratetype) */
        struct list_head        free_list[MIGRATE_TYPES];
        /* NOTE(review): presumably the number of free blocks across all lists - confirm */
        unsigned long                nr_free;
};

struct pglist_data;

#ifdef CONFIG_NUMA
/*
 * NUMA allocation event counters. NR_VM_NUMA_EVENT_ITEMS is the number of
 * events; it is defined as 0 when CONFIG_NUMA is off.
 */
enum numa_stat_item {
        NUMA_HIT,                /* allocated in intended node */
        NUMA_MISS,                /* allocated in non intended node */
        NUMA_FOREIGN,                /* was intended here, hit elsewhere */
        NUMA_INTERLEAVE_HIT,        /* interleaver preferred this zone */
        NUMA_LOCAL,                /* allocation from local node */
        NUMA_OTHER,                /* allocation from other node */
        NR_VM_NUMA_EVENT_ITEMS        /* number of events above; not an event */
};
#else
#define NR_VM_NUMA_EVENT_ITEMS 0
#endif

/*
 * Per-zone statistics items. NR_VM_ZONE_STAT_ITEMS is the number of items
 * and sizes per_cpu_zonestat.vm_stat_diff[]. The five NR_ZONE_*_ANON/_FILE/
 * _UNEVICTABLE entries are listed in the same order as enum lru_list.
 */
enum zone_stat_item {
        /* First 128 byte cacheline (assuming 64 bit words) */
        NR_FREE_PAGES,
        NR_FREE_PAGES_BLOCKS,
        NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
        NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
        NR_ZONE_ACTIVE_ANON,
        NR_ZONE_INACTIVE_FILE,
        NR_ZONE_ACTIVE_FILE,
        NR_ZONE_UNEVICTABLE,
        NR_ZONE_WRITE_PENDING,        /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,                /* mlock()ed pages found and moved off LRU */
        /* Second 128 byte cacheline */
#if IS_ENABLED(CONFIG_ZSMALLOC)
        NR_ZSPAGES,                /* allocated in zsmalloc */
#endif
        NR_FREE_CMA_PAGES,
#ifdef CONFIG_UNACCEPTED_MEMORY
        NR_UNACCEPTED,
#endif
        NR_VM_ZONE_STAT_ITEMS };

/*
 * Per-node statistics items. NR_VM_NODE_STAT_ITEMS is the number of items
 * and sizes per_cpu_nodestat.vm_node_stat_diff[]. The *_BASE entries alias
 * the first item of the anon/file pair that follows them.
 */
enum node_stat_item {
        NR_LRU_BASE,
        NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
        NR_ACTIVE_ANON,                /*  "     "     "   "       "         */
        NR_INACTIVE_FILE,        /*  "     "     "   "       "         */
        NR_ACTIVE_FILE,                /*  "     "     "   "       "         */
        NR_UNEVICTABLE,                /*  "     "     "   "       "         */
        NR_SLAB_RECLAIMABLE_B,
        NR_SLAB_UNRECLAIMABLE_B,
        NR_ISOLATED_ANON,        /* Temporary isolated pages from anon lru */
        NR_ISOLATED_FILE,        /* Temporary isolated pages from file lru */
        WORKINGSET_NODES,
        WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_FILE,
        WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_FILE,
        WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_FILE,
        WORKINGSET_NODERECLAIM,
        NR_ANON_MAPPED,        /* Mapped anonymous pages */
        NR_FILE_MAPPED,        /* pagecache pages mapped into pagetables.
                           only modified from process context */
        NR_FILE_PAGES,
        NR_FILE_DIRTY,
        NR_WRITEBACK,
        NR_SHMEM,                /* shmem pages (included tmpfs/GEM pages) */
        NR_SHMEM_THPS,
        NR_SHMEM_PMDMAPPED,
        NR_FILE_THPS,
        NR_FILE_PMDMAPPED,
        NR_ANON_THPS,
        NR_VMSCAN_WRITE,
        NR_VMSCAN_IMMEDIATE,        /* Prioritise for reclaim when writeback ends */
        NR_DIRTIED,                /* page dirtyings since bootup */
        NR_WRITTEN,                /* page writings since bootup */
        NR_THROTTLED_WRITTEN,        /* NR_WRITTEN while reclaim throttled */
        NR_KERNEL_MISC_RECLAIMABLE,        /* reclaimable non-slab kernel pages */
        NR_FOLL_PIN_ACQUIRED,        /* via: pin_user_page(), gup flag: FOLL_PIN */
        NR_FOLL_PIN_RELEASED,        /* pages returned via unpin_user_page() */
        NR_KERNEL_STACK_KB,        /* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
        NR_KERNEL_SCS_KB,        /* measured in KiB */
#endif
        NR_PAGETABLE,                /* used for pagetables */
        NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */
#ifdef CONFIG_IOMMU_SUPPORT
        NR_IOMMU_PAGES,                /* # of pages allocated by IOMMU */
#endif
#ifdef CONFIG_SWAP
        NR_SWAPCACHE,
#endif
#ifdef CONFIG_NUMA_BALANCING
        PGPROMOTE_SUCCESS,        /* promote successfully */
        /*
         * Candidate pages for promotion based on hint fault latency.  This
         * counter is used to control the promotion rate and adjust the hot
         * threshold.
         */
        PGPROMOTE_CANDIDATE,
        /*
         * Not rate-limited (NRL) candidate pages, i.e. those that can be
         * promoted without considering the hot threshold because there are
         * enough free pages in the fast-tier node.  These promotions bypass
         * the regular hotness checks and do NOT influence the promotion
         * rate-limiter or threshold-adjustment logic.
         * This is for statistics/monitoring purposes.
         */
        PGPROMOTE_CANDIDATE_NRL,
#endif
        /* PGDEMOTE_*: pages demoted */
        PGDEMOTE_KSWAPD,
        PGDEMOTE_DIRECT,
        PGDEMOTE_KHUGEPAGED,
        PGDEMOTE_PROACTIVE,
#ifdef CONFIG_HUGETLB_PAGE
        NR_HUGETLB,
#endif
        NR_BALLOON_PAGES,
        NR_KERNEL_FILE_PAGES,
        NR_VM_NODE_STAT_ITEMS        /* number of items above; not an item */
};

/*
 * Tells whether @item should be printed in units of THPs. /proc/vmstat
 * currently prints the number of anon, file and shmem THPs, although the
 * item itself is charged in pages.
 */
static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
{
        /* Without THP support no item is ever printed in THP units. */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return false;

        switch (item) {
        case NR_ANON_THPS:
        case NR_FILE_THPS:
        case NR_SHMEM_THPS:
        case NR_SHMEM_PMDMAPPED:
        case NR_FILE_PMDMAPPED:
                return true;
        default:
                return false;
        }
}

/*
 * Tells whether the value of item @idx is measured in bytes; most vmstat
 * values are measured in pages. This describes the API side only, the
 * internal representation might be different.
 */
static __always_inline bool vmstat_item_in_bytes(int idx)
{
        /*
         * Global and per-node slab counters track slab pages.
         * It's expected that changes are multiples of PAGE_SIZE.
         * Internally values are stored in pages.
         *
         * Per-memcg and per-lruvec counters track memory, consumed
         * by individual slab objects. These counters are actually
         * byte-precise.
         */
        switch (idx) {
        case NR_SLAB_RECLAIMABLE_B:
        case NR_SLAB_UNRECLAIMABLE_B:
                return true;
        default:
                return false;
        }
}

/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c
 */
#define LRU_BASE 0
#define LRU_ACTIVE 1        /* added to an inactive list index to get the active one */
#define LRU_FILE 2        /* added to an anon list index to get the file one */

enum lru_list {
        LRU_INACTIVE_ANON = LRU_BASE,
        LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
        LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
        LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
        LRU_UNEVICTABLE,        /* follows directly after LRU_ACTIVE_FILE */
        NR_LRU_LISTS                /* number of lists; not a list itself */
};

/*
 * Throttle states used by vmscan; NR_VMSCAN_THROTTLE is the number of states.
 * NOTE(review): the exact condition behind each state is not visible in this
 * header - presumably defined by the reclaim code in mm/vmscan.c; confirm there.
 */
enum vmscan_throttle_state {
        VMSCAN_THROTTLE_WRITEBACK,
        VMSCAN_THROTTLE_ISOLATED,
        VMSCAN_THROTTLE_NOPROGRESS,
        VMSCAN_THROTTLE_CONGESTED,
        NR_VMSCAN_THROTTLE,
};

/* Loop header over every LRU list index, 0..NR_LRU_LISTS-1; @lru must be an lvalue. */
#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

/*
 * Loop header over list indices 0..LRU_ACTIVE_FILE inclusive, i.e. every
 * list of enum lru_list except LRU_UNEVICTABLE; @lru must be an lvalue.
 */
#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)

/* True for the two file-backed lists (inactive and active), false otherwise. */
static inline bool is_file_lru(enum lru_list lru)
{
        switch (lru) {
        case LRU_INACTIVE_FILE:
        case LRU_ACTIVE_FILE:
                return true;
        default:
                return false;
        }
}

/* True for the two active lists (anon and file), false otherwise. */
static inline bool is_active_lru(enum lru_list lru)
{
        if (lru == LRU_ACTIVE_ANON)
                return true;

        return lru == LRU_ACTIVE_FILE;
}

/*
 * Indices for the anon and file halves of two-element arrays;
 * ANON_AND_FILE is the array length (e.g. lruvec.refaults[]).
 */
#define WORKINGSET_ANON 0
#define WORKINGSET_FILE 1
#define ANON_AND_FILE 2

/*
 * State flags kept in lruvec->flags.
 * NOTE(review): presumably used as bit numbers (set_bit()/test_bit()) rather
 * than masks - confirm at the users.
 */
enum lruvec_flags {
        /*
         * An lruvec has many dirty pages backed by a congested BDI:
         * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim.
         *    It can be cleared by cgroup reclaim or kswapd.
         * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim.
         *    It can only be cleared by kswapd.
         *
         * Essentially, kswapd can unthrottle an lruvec throttled by cgroup
         * reclaim, but not vice versa. This only applies to the root cgroup.
         * The goal is to prevent cgroup reclaim on the root cgroup (e.g.
         * memory.reclaim) to unthrottle an unbalanced node (that was throttled
         * by kswapd).
         */
        LRUVEC_CGROUP_CONGESTED,
        LRUVEC_NODE_CONGESTED,
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Evictable folios are divided into multiple generations. The youngest and the
 * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
 * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
 * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
 * corresponding generation. The gen counter in folio->flags stores gen+1 while
 * a folio is on one of lrugen->folios[]. Otherwise it stores 0.
 *
 * After a folio is faulted in, the aging needs to check the accessed bit at
 * least twice before handing this folio over to the eviction. The first check
 * clears the accessed bit from the initial fault; the second check makes sure
 * this folio hasn't been used since then. This process, AKA second chance,
 * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI
 * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two
 * generations are considered active; the rest of generations, if they exist,
 * are considered inactive. See lru_gen_is_active().
 *
 * PG_active is always cleared while a folio is on one of lrugen->folios[] so
 * that the sliding window needs not to worry about it. And it's set again when
 * a folio considered active is isolated for non-reclaiming purposes, e.g.,
 * migration. See lru_gen_add_folio() and lru_gen_del_folio().
 *
 * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
 * in folio->flags, masked by LRU_GEN_MASK.
 */
#define MIN_NR_GENS                2U
#define MAX_NR_GENS                4U

/*
 * Each generation is divided into multiple tiers. A folio accessed N times
 * through file descriptors is in tier order_base_2(N). A folio in the first
 * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page
 * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by
 * PG_workingset. A folio in any other tier (1<N<5) between the first and last
 * is marked by additional bits of LRU_REFS_WIDTH in folio->flags.
 *
 * In contrast to moving across generations which requires the LRU lock, moving
 * across tiers only involves atomic operations on folio->flags and therefore
 * has a negligible cost in the buffered access path. In the eviction path,
 * comparisons of refaulted/(evicted+protected) from the first tier and the rest
 * infer whether folios accessed multiple times through file descriptors are
 * statistically hot and thus worth protecting.
 *
 * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
 * folio->flags, masked by LRU_REFS_MASK.
 */
#define MAX_NR_TIERS                4U

#ifndef __GENERATING_BOUNDS_H

/*
 * Bit masks for folio->flags: LRU_GEN_MASK covers the generation counter and
 * LRU_REFS_MASK the additional reference bits. Each mask is LRU_*_WIDTH bits
 * wide and starts at bit LRU_*_PGOFF.
 */
#define LRU_GEN_MASK                ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK                ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

/*
 * For folios accessed multiple times through file descriptors,
 * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags
 * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its
 * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily
 * promoted into the second oldest generation in the eviction path. And when
 * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
 * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is
 * only valid when PG_referenced is set.
 *
 * For folios accessed multiple times through page tables, folio_update_gen()
 * from a page table walk or lru_gen_set_refs() from a rmap walk sets
 * PG_referenced after the accessed bit is cleared for the first time.
 * Thereafter, those two paths set PG_workingset and promote folios to the
 * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears
 * PG_referenced. Note that for this case, LRU_REFS_MASK is not used.
 *
 * For both cases above, after PG_workingset is set on a folio, it remains until
 * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It
 * can be set again if lru_gen_test_recent() returns true upon a refault.
 */
#define LRU_REFS_FLAGS                (LRU_REFS_MASK | BIT(PG_referenced))

struct lruvec;
struct page_vma_mapped_walk;

#ifdef CONFIG_LRU_GEN

/*
 * Anon/file type indices; numerically equal to WORKINGSET_ANON and
 * WORKINGSET_FILE. NOTE(review): presumably index the ANON_AND_FILE
 * dimension of the lru_gen arrays - confirm at the users.
 */
enum {
        LRU_GEN_ANON,
        LRU_GEN_FILE,
};

/*
 * Multi-gen LRU capability identifiers; NR_LRU_GEN_CAPS is their count.
 * NOTE(review): how each capability is enabled or queried is not visible in
 * this header - confirm in mm/vmscan.c.
 */
enum {
        LRU_GEN_CORE,
        LRU_GEN_MM_WALK,
        LRU_GEN_NONLEAF_YOUNG,
        NR_LRU_GEN_CAPS
};

/*
 * Batch size bounds: the minimum equals BITS_PER_LONG and the maximum is
 * 64 times that. NOTE(review): what is being batched is not visible in this
 * header - see the users in mm/vmscan.c.
 */
#define MIN_LRU_BATCH                BITS_PER_LONG
#define MAX_LRU_BATCH                (MIN_LRU_BATCH * 64)

/* whether to keep historical stats from evicted generations */
#ifdef CONFIG_LRU_GEN_STATS
#define NR_HIST_GENS                MAX_NR_GENS
#else
#define NR_HIST_GENS                1U
#endif

/*
 * The youngest generation number is stored in max_seq for both anon and file
 * types as they are aged on an equal footing. The oldest generation numbers are
 * stored in min_seq[] separately for anon and file types so that they can be
 * incremented independently. Ideally min_seq[] are kept in sync when both anon
 * and file types are evictable. However, to adapt to situations like extreme
 * swappiness, they are allowed to be out of sync by at most
 * MAX_NR_GENS-MIN_NR_GENS-1.
 *
 * The number of pages in each generation is eventually consistent and therefore
 * can be transiently negative when reset_batch_size() is pending.
 */
/*
 * Multi-gen LRU state embedded in struct lruvec as ->lrugen.
 * Array dimensions: [MAX_NR_GENS] is the generation, [ANON_AND_FILE] the
 * anon/file type, [MAX_NR_ZONES] the zone, [MAX_NR_TIERS] the tier and
 * [NR_HIST_GENS] the historical generation slot.
 */
struct lru_gen_folio {
        /* the aging increments the youngest generation number */
        unsigned long max_seq;
        /* the eviction increments the oldest generation numbers */
        unsigned long min_seq[ANON_AND_FILE];
        /* the birth time of each generation in jiffies */
        unsigned long timestamps[MAX_NR_GENS];
        /* the multi-gen LRU lists, lazily sorted on eviction */
        struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the multi-gen LRU sizes, eventually consistent */
        long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the exponential moving average of refaulted */
        unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
        /* the exponential moving average of evicted+protected */
        unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
        /* can only be modified under the LRU lock */
        unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        /* can be modified without holding the LRU lock */
        atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        /* whether the multi-gen LRU is enabled */
        bool enabled;
        /* the memcg generation this lru_gen_folio belongs to */
        u8 gen;
        /* the list segment this lru_gen_folio belongs to */
        u8 seg;
        /* per-node lru_gen_folio list for global reclaim */
        struct hlist_nulls_node list;
};

enum {
        MM_LEAF_TOTAL,                /* total leaf entries */
        MM_LEAF_YOUNG,                /* young leaf entries */
        MM_NONLEAF_FOUND,        /* non-leaf entries found in Bloom filters */
        MM_NONLEAF_ADDED,        /* non-leaf entries added to Bloom filters */
        NR_MM_STATS
};

/* double-buffering Bloom filters */
#define NR_BLOOM_FILTERS        2

struct lru_gen_mm_state {
        /* synced with max_seq after each iteration */
        unsigned long seq;
        /* where the current iteration continues after */
        struct list_head *head;
        /* where the last iteration ended before */
        struct list_head *tail;
        /* Bloom filters flip after each iteration */
        unsigned long *filters[NR_BLOOM_FILTERS];
        /* the mm stats for debugging */
        unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
};

/*
 * State carried by one mm walk; counters are batched here in plain ints.
 * NOTE(review): presumably flushed into lru_gen_folio / lru_gen_mm_state by
 * the walker - confirm in mm/vmscan.c.
 */
struct lru_gen_mm_walk {
        /* the lruvec under reclaim */
        struct lruvec *lruvec;
        /* max_seq from lru_gen_folio: can be out of date */
        unsigned long seq;
        /* the next address within an mm to scan */
        unsigned long next_addr;
        /* to batch promoted pages */
        int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* to batch the mm stats */
        int mm_stats[NR_MM_STATS];
        /* total batched items */
        int batched;
        /* NOTE(review): presumably the swappiness value in effect for this walk - confirm */
        int swappiness;
        /* NOTE(review): presumably forces a full scan regardless of heuristics - confirm */
        bool force_scan;
};

/*
 * For each node, memcgs are divided into two generations: the old and the
 * young. For each generation, memcgs are randomly sharded into multiple bins
 * to improve scalability. For each bin, the hlist_nulls is virtually divided
 * into three segments: the head, the tail and the default.
 *
 * An onlining memcg is added to the tail of a random bin in the old generation.
 * The eviction starts at the head of a random bin in the old generation. The
 * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
 * the old generation, is incremented when all its bins become empty.
 *
 * There are four operations:
 * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
 *    current generation (old or young) and updates its "seg" to "head";
 * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
 *    current generation (old or young) and updates its "seg" to "tail";
 * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
 *    generation, updates its "gen" to "old" and resets its "seg" to "default";
 * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
 *    young generation, updates its "gen" to "young" and resets its "seg" to
 *    "default".
 *
 * The events that trigger the above operations are:
 * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
 * 2. The first attempt to reclaim a memcg below low, which triggers
 *    MEMCG_LRU_TAIL;
 * 3. The first attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_TAIL;
 * 4. The second attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_YOUNG;
 * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
 * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
 * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
 *
 * Notes:
 * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing
 *    of their max_seq counters ensures the eventual fairness to all eligible
 *    memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
 * 2. There are only two valid generations: old (seq) and young (seq+1).
 *    MEMCG_NR_GENS is set to three so that when reading the generation counter
 *    locklessly, a stale value (seq-1) does not wraparound to young.
 */
#define MEMCG_NR_GENS        3
#define MEMCG_NR_BINS        8

/*
 * Per-node memcg LRU: MEMCG_NR_GENS generations, each sharded into
 * MEMCG_NR_BINS bins of hlist_nulls lists.
 */
struct lru_gen_memcg {
        /* the per-node memcg generation counter */
        unsigned long seq;
        /*
         * each memcg has one lru_gen_folio per node
         * NOTE(review): presumably the number of memcgs currently in each
         * generation - confirm at the users.
         */
        unsigned long nr_memcgs[MEMCG_NR_GENS];
        /* per-node lru_gen_folio list for global reclaim */
        struct hlist_nulls_head        fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
        /* protects the above */
        spinlock_t lock;
};

void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_lruvec(struct lruvec *lruvec);
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);

#else /* !CONFIG_LRU_GEN */

/*
 * CONFIG_LRU_GEN is off: every multi-gen LRU hook below is an empty inline
 * stub, and lru_gen_look_around() always reports false.
 */
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
}

static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}

/* Stub: always returns false. */
static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
        return false;
}

static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}

#endif /* CONFIG_LRU_GEN */

/*
 * A set of LRU lists plus the reclaim bookkeeping that goes with them.
 * The multi-gen LRU, memcg and zswap members are config dependent or
 * provided by other headers.
 */
struct lruvec {
        /* one list per enum lru_list entry */
        struct list_head                lists[NR_LRU_LISTS];
        /* per lruvec lru_lock for memcg */
        spinlock_t                        lru_lock;
        /*
         * These track the cost of reclaiming one LRU - file or anon -
         * over the other. As the observed cost of reclaiming one LRU
         * increases, the reclaim scan balance tips toward the other.
         */
        unsigned long                        anon_cost;
        unsigned long                        file_cost;
        /* Non-resident age, driven by LRU movement */
        atomic_long_t                        nonresident_age;
        /* Refaults at the time of last reclaim cycle */
        unsigned long                        refaults[ANON_AND_FILE];
        /* Various lruvec state flags (enum lruvec_flags) */
        unsigned long                        flags;
#ifdef CONFIG_LRU_GEN
        /* evictable pages divided into generations */
        struct lru_gen_folio                lrugen;
#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* to concurrently iterate lru_gen_mm_list */
        struct lru_gen_mm_state                mm_state;
#endif
#endif /* CONFIG_LRU_GEN */
#ifdef CONFIG_MEMCG
        /* NOTE(review): presumably the node this lruvec belongs to - confirm */
        struct pglist_data *pgdat;
#endif
        /* zswap per-lruvec state; the type comes from <linux/zswap.h> */
        struct zswap_lruvec_state zswap_lruvec_state;
};

/*
 * The two mode flags below cast to isolate_mode_t, which is only typedef'd
 * after them; that works because macros are expanded at their point of use.
 */
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE        ((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
#define ISOLATE_UNEVICTABLE        ((__force isolate_mode_t)0x8)

/* LRU Isolation modes. */
typedef unsigned __bitwise isolate_mode_t;

/*
 * Watermark identifiers; NR_WMARK is their count.
 * NOTE(review): presumably index a per-zone watermark array - confirm
 * against struct zone.
 */
enum zone_watermarks {
        WMARK_MIN,
        WMARK_LOW,
        WMARK_HIGH,
        WMARK_PROMO,
        NR_WMARK
};

/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists
 * are added for THP. One PCP list is used by GFP_MOVABLE, and the other PCP list
 * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE.
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 2
#else
#define NR_PCP_THP 0
#endif
/* MIGRATE_PCPTYPES lists for each of the orders 0..PAGE_ALLOC_COSTLY_ORDER. */
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
/* Total number of lists: the low-order lists plus the THP ones. */
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)

/*
 * Flags used in pcp->flags field.
 *
 * PCPF_PREV_FREE_HIGH_ORDER: a high-order page was freed by the previous
 * page free.  Used to avoid draining the PCP because of a single, accidental
 * high-order page free.
 *
 * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in the PCP before
 * draining it for consecutive high-order page frees without allocation,
 * if the data cache slice of the CPU is large enough.  This reduces zone
 * lock contention and keeps cache-hot pages in reuse.
 */
#define        PCPF_PREV_FREE_HIGH_ORDER        BIT(0)
#define        PCPF_FREE_HIGH_BATCH                BIT(1)

/*
 * Per-CPU page lists (PCP) with their watermarks and tuning state.
 * The structure is cacheline aligned on SMP builds.
 */
struct per_cpu_pages {
        spinlock_t lock;        /* Protects lists field */
        int count;                /* number of pages in the list */
        int high;                /* high watermark, emptying needed */
        int high_min;                /* min high watermark */
        int high_max;                /* max high watermark */
        int batch;                /* chunk size for buddy add/remove */
        u8 flags;                /* protected by pcp->lock */
        u8 alloc_factor;        /* batch scaling factor during allocate */
#ifdef CONFIG_NUMA
        u8 expire;                /* When 0, remote pagesets are drained */
#endif
        short free_count;        /* consecutive free count */

        /* Lists of pages, one per migrate type stored on the pcp-lists */
        struct list_head lists[NR_PCP_LISTS];
} ____cacheline_aligned_in_smp;

/*
 * Per-CPU zone statistics state. The s8 members exist only on SMP builds
 * and the NUMA event array only with CONFIG_NUMA.
 */
struct per_cpu_zonestat {
#ifdef CONFIG_SMP
        /* one signed 8-bit entry per enum zone_stat_item */
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
        /* NOTE(review): presumably the limit at which a diff is folded - confirm in mm/vmstat.c */
        s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
        /*
         * Low priority inaccurate counters that are only folded
         * on demand. Use a large type to avoid the overhead of
         * folding during refresh_cpu_vm_stats.
         */
        unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};

/* Per-CPU node statistics state, the node-level analogue of per_cpu_zonestat. */
struct per_cpu_nodestat {
        /* NOTE(review): presumably the limit at which a diff is folded - confirm in mm/vmstat.c */
        s8 stat_threshold;
        /* one signed 8-bit entry per enum node_stat_item */
        s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Zone indices. Which enumerators exist depends on the CONFIG_ZONE_* and
 * CONFIG_HIGHMEM options, so the numeric value of a given zone is
 * configuration dependent. __MAX_NR_ZONES is the number of zones configured.
 */
enum zone_type {
        /*
         * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
         * to DMA to all of the addressable memory (ZONE_NORMAL).
         * On architectures where this area covers the whole 32 bit address
         * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
         * DMA addressing constraints. This distinction is important as a 32bit
         * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
         * platforms may need both zones as they support peripherals with
         * different DMA addressing limitations.
         */
#ifdef CONFIG_ZONE_DMA
        ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
        ZONE_DMA32,
#endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,
#endif
        /*
         * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
         * movable pages with few exceptional cases described below. Main use
         * cases for ZONE_MOVABLE are to make memory offlining/unplug more
         * likely to succeed, and to locally limit unmovable allocations - e.g.,
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
         *    essentially turn such pages unmovable. Therefore, we do not allow
         *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
         *    faulted, they come from the right zone right away. However, it is
         *    still possible that address space already has pages in
         *    ZONE_MOVABLE at the time when pages are pinned (i.e. user has
         *    touched that memory before pinning). In such case we migrate them
         *    to a different zone. When migration fails - pinning fails.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
         * 3. Memory holes: kernelcore/movablecore setups might create very rare
         *    situations where ZONE_MOVABLE contains memory holes after boot,
         *    for example, if we have sections that are only partially
         *    populated. Memory offlining and allocations fail early.
         * 4. PG_hwpoison pages: while poisoned pages can be skipped during
         *    memory offlining, such pages cannot be allocated.
         * 5. Unmovable PG_offline pages: in paravirtualized environments,
         *    hotplugged memory blocks might only partially be managed by the
         *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
         *    parts not managed by the buddy are unmovable PG_offline pages. In
         *    some cases (virtio-mem), such pages can be skipped during
         *    memory offlining, however, cannot be moved/allocated. These
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
         * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
         *    situations where ZERO_PAGE(0) which is allocated differently
         *    on different platforms may end up in a movable zone. ZERO_PAGE(0)
         *    cannot be migrated.
         * 7. Memory-hotplug: when using memmap_on_memory and onlining the
         *    memory to the MOVABLE zone, the vmemmap pages are also placed in
         *    such zone. Such pages cannot be really moved around as they are
         *    self-stored in the range, but they are treated as movable when
         *    the range they describe is about to be offlined.
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
         * have to expect that migrating pages in ZONE_MOVABLE can fail (even
         * if has_unmovable_pages() states that there are no unmovable pages,
         * there can be false negatives).
         */
        ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,
#endif
        __MAX_NR_ZONES

};

#ifndef __GENERATING_BOUNDS_H

/*
 * Number of entries in zone->compact_cached_migrate_pfn[].
 * NOTE(review): presumably one slot each for async and sync compaction - confirm.
 */
#define ASYNC_AND_SYNC 2

/*
 * Per-zone state: watermarks and reserves, the span/present/managed page
 * accounting, the free lists and their lock, compaction bookkeeping and the
 * zone statistics counters. CACHELINE_PADDING() separates the read-mostly
 * fields from the write-intensive groups.
 */
struct zone {
        /* Read-mostly fields */

        /* zone watermarks, access with the *_wmark_pages(zone) inline helpers */
        unsigned long _watermark[NR_WMARK];
        unsigned long watermark_boost;

        unsigned long nr_reserved_highatomic;
        unsigned long nr_free_highatomic;

        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable or/and it will be released eventually, so to avoid totally
         * wasting several GB of ram we must reserve some of the lower zone
         * memory (otherwise we risk to run OOM on the lower zones despite
         * there being tons of freeable ram on the higher zones).  This array is
         * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
         * changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
        /* Node id; accessed through zone_to_nid()/zone_set_nid(). */
        int node;
#endif
        struct pglist_data        *zone_pgdat;
        struct per_cpu_pages        __percpu *per_cpu_pageset;
        struct per_cpu_zonestat        __percpu *per_cpu_zonestats;
        /*
         * the high and batch values are copied to individual pagesets for
         * faster access
         */
        int pageset_high_min;
        int pageset_high_max;
        int pageset_batch;

#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long                *pageblock_flags;
#endif /* !CONFIG_SPARSEMEM */

        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long                zone_start_pfn;

        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *         spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *        present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * present_early_pages is present pages existing within the zone
         * located on memory available since early boot, excluding hotplugged
         * memory.
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *        managed_pages = present_pages - reserved_pages;
         *
         * cma pages is present pages that are assigned for CMA use
         * (MIGRATE_CMA).
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path.  But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock.  It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/done(). Any reader who can't tolerate drift of
         * present_pages should use get_online_mems() to get a stable value.
         */
        atomic_long_t                managed_pages;
        unsigned long                spanned_pages;
        unsigned long                present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
        unsigned long                present_early_pages;
#endif
#ifdef CONFIG_CMA
        unsigned long                cma_pages;
#endif

        const char                *name;

#ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblock. It is used to solve incorrect
         * freepage counting problem due to racy retrieving migratetype
         * of pageblock. Protected by zone->lock.
         */
        unsigned long                nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t                span_seqlock;
#endif

        /* Read through zone_is_initialized() */
        int initialized;

        /* Write-intensive fields used from the page allocator */
        CACHELINE_PADDING(_pad1_);

        /* free areas of different sizes */
        struct free_area        free_area[NR_PAGE_ORDERS];

#ifdef CONFIG_UNACCEPTED_MEMORY
        /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
        struct list_head        unaccepted_pages;

        /* To be called once the last page in the zone is accepted */
        struct work_struct        unaccepted_cleanup;
#endif

        /* zone flags, see enum zone_flags below */
        unsigned long                flags;

        /* Primarily protects free_area */
        spinlock_t                lock;

        /* Pages to be freed when next trylock succeeds */
        struct llist_head        trylock_free_pages;

        /* Write-intensive fields used by compaction and vmstats. */
        CACHELINE_PADDING(_pad2_);

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long                compact_cached_free_pfn;
        /* pfn where compaction migration scanner should start */
        unsigned long                compact_cached_migrate_pfn[ASYNC_AND_SYNC];
        unsigned long                compact_init_migrate_pfn;
        unsigned long                compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         * compact_order_failed is the minimum compaction failed order.
         */
        unsigned int                compact_considered;
        unsigned int                compact_defer_shift;
        int                        compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool                        compact_blockskip_flush;
#endif

        bool                        contiguous;

        CACHELINE_PADDING(_pad3_);
        /* Zone statistics */
        atomic_long_t                vm_stat[NR_VM_ZONE_STAT_ITEMS];
        atomic_long_t                vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;

/*
 * Per-node state flags.
 * NOTE(review): presumably bit numbers within pglist_data->flags - confirm.
 */
enum pgdat_flags {
        PGDAT_DIRTY,                        /* reclaim scanning has recently found
                                         * many dirty file pages at the tail
                                         * of the LRU.
                                         */
        PGDAT_WRITEBACK,                /* reclaim scanning has recently found
                                         * many pages under writeback
                                         */
        PGDAT_RECLAIM_LOCKED,                /* prevents concurrent reclaim */
};

/*
 * Per-zone state flags, kept in zone->flags.
 * NOTE(review): presumably used as bit numbers within that word - confirm.
 */
enum zone_flags {
        ZONE_BOOSTED_WATERMARK,                /* zone recently boosted watermarks.
                                         * Cleared when kswapd is woken.
                                         */
        ZONE_RECLAIM_ACTIVE,                /* kswapd may be scanning the zone. */
        ZONE_BELOW_HIGH,                /* zone is below high watermark. */
};

/*
 * wmark_pages - effective watermark @w of zone @z, in pages.
 *
 * Return: the stored watermark plus the zone's current watermark_boost.
 */
static inline unsigned long wmark_pages(const struct zone *z,
                                        enum zone_watermarks w)
{
        const unsigned long base = z->_watermark[w];

        return base + z->watermark_boost;
}

/* Effective WMARK_MIN watermark of @z, as computed by wmark_pages(). */
static inline unsigned long min_wmark_pages(const struct zone *z)
{
        const unsigned long pages = wmark_pages(z, WMARK_MIN);

        return pages;
}

/* Effective WMARK_LOW watermark of @z, as computed by wmark_pages(). */
static inline unsigned long low_wmark_pages(const struct zone *z)
{
        const unsigned long pages = wmark_pages(z, WMARK_LOW);

        return pages;
}

/* Effective WMARK_HIGH watermark of @z, as computed by wmark_pages(). */
static inline unsigned long high_wmark_pages(const struct zone *z)
{
        const unsigned long pages = wmark_pages(z, WMARK_HIGH);

        return pages;
}

/* Effective WMARK_PROMO watermark of @z, as computed by wmark_pages(). */
static inline unsigned long promo_wmark_pages(const struct zone *z)
{
        const unsigned long pages = wmark_pages(z, WMARK_PROMO);

        return pages;
}

/*
 * zone_managed_pages - atomically read zone->managed_pages.
 *
 * Return: the counter value converted to unsigned long.
 */
static inline unsigned long zone_managed_pages(const struct zone *zone)
{
        const long nr_managed = atomic_long_read(&zone->managed_pages);

        return (unsigned long)nr_managed;
}

/*
 * zone_cma_pages - number of present pages in @zone assigned for CMA use.
 * @zone: zone to query. It is only read, so the pointer is const-qualified
 *        like the other zone accessors; callers holding a non-const pointer
 *        are unaffected.
 *
 * Return: zone->cma_pages, or 0 when CONFIG_CMA is disabled (the field does
 * not exist in that configuration).
 */
static inline unsigned long zone_cma_pages(const struct zone *zone)
{
#ifdef CONFIG_CMA
        return zone->cma_pages;
#else
        return 0;
#endif
}

/*
 * zone_end_pfn - first pfn past the range spanned by @zone.
 *
 * Return: zone_start_pfn + spanned_pages.
 */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        const unsigned long first_pfn = zone->zone_start_pfn;
        const unsigned long nr_spanned = zone->spanned_pages;

        return first_pfn + nr_spanned;
}

/*
 * zone_spans_pfn - test whether @pfn lies in [zone_start_pfn, zone_end_pfn).
 */
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
        if (pfn < zone->zone_start_pfn)
                return false;

        return pfn < zone_end_pfn(zone);
}

/* True when zone->initialized is non-zero. */
static inline bool zone_is_initialized(const struct zone *zone)
{
        return zone->initialized != 0;
}

/* True when @zone spans no pages at all (spanned_pages is zero). */
static inline bool zone_is_empty(const struct zone *zone)
{
        return !zone->spanned_pages;
}

#ifndef BUILD_VDSO32_64
/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */

/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
/*
 * *_PGOFF: bit offset of each field inside the flags word. The fields are
 * packed downwards from the most significant bit: the first offset is the
 * word size minus SECTIONS_WIDTH, and every following offset is the previous
 * one minus that field's own width.
 */
#define SECTIONS_PGOFF                ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF                (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF                (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF        (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF                (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
#define LRU_GEN_PGOFF                (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
#define LRU_REFS_PGOFF                (LRU_GEN_PGOFF - LRU_REFS_WIDTH)

/*
 * Define the bit shifts to access each section.  For non-existent
 * sections we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 */
#define SECTIONS_PGSHIFT        (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT                (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT                (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT        (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT        (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))

/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
/*
 * ZONEID_SHIFT is the combined width of the two fields; ZONEID_PGOFF is the
 * lower of their two offsets.
 */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT                (SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((SECTIONS_PGOFF < ZONES_PGOFF) ? \
                                                SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT                (NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((NODES_PGOFF < ZONES_PGOFF) ? \
                                                NODES_PGOFF : ZONES_PGOFF)
#endif

#define ZONEID_PGSHIFT                (ZONEID_PGOFF * (ZONEID_SHIFT != 0))

/*
 * Unshifted masks: apply after shifting the flags word right by the matching
 * *_PGSHIFT (see memdesc_zonenum() for the ZONES pair).
 * NOTE(review): LAST_CPUPID_MASK is built from LAST_CPUPID_SHIFT while its
 * offset uses LAST_CPUPID_WIDTH - presumably intentional, confirm.
 */
#define ZONES_MASK                ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK                ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK                ((1UL << SECTIONS_WIDTH) - 1)
#define LAST_CPUPID_MASK        ((1UL << LAST_CPUPID_SHIFT) - 1)
#define KASAN_TAG_MASK                ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK                ((1UL << ZONEID_SHIFT) - 1)

/*
 * memdesc_zonenum - extract the zone number encoded in a memdesc flags word.
 *
 * ASSERT_EXCLUSIVE_BITS() is applied to the zone bits before they are read;
 * the value is then shifted down by ZONES_PGSHIFT and masked with ZONES_MASK.
 */
static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags)
{
        unsigned long zone_bits;

        ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT);
        zone_bits = (flags.f >> ZONES_PGSHIFT) & ZONES_MASK;

        return zone_bits;
}

static inline enum zone_type page_zonenum(const struct page *page)
{
        return memdesc_zonenum(page->flags);
}

static inline enum zone_type folio_zonenum(const struct folio *folio)
{
        return memdesc_zonenum(folio->flags);
}

#ifdef CONFIG_ZONE_DEVICE
/* True when the zone number encoded in @mdf is ZONE_DEVICE. */
static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
{
        const enum zone_type zid = memdesc_zonenum(mdf);

        return zid == ZONE_DEVICE;
}

static inline struct dev_pagemap *page_pgmap(const struct page *page)
{
        VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page);
        return page_folio(page)->pgmap;
}

/*
 * Zone device pages must not be merged into one sgl or bvec segment together
 * with other kinds of pages, nor with zone device pages of a different
 * pgmap; otherwise the pgmap of a segment could only be found by scanning
 * the whole segment.
 *
 * Returns true when neither page is a zone device page, or when both are and
 * they share the same pgmap; false in every other case.
 */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
                                                     const struct page *b)
{
        const bool a_is_dev = memdesc_is_zone_device(a->flags);
        const bool b_is_dev = memdesc_is_zone_device(b->flags);

        /* One device page and one ordinary page never match. */
        if (a_is_dev != b_is_dev)
                return false;

        /* Two ordinary pages: there is no pgmap to compare. */
        if (!a_is_dev)
                return true;

        return page_pgmap(a) == page_pgmap(b);
}

extern void memmap_init_zone_device(struct zone *, unsigned long,
                                    unsigned long, struct dev_pagemap *);
#else
/* !CONFIG_ZONE_DEVICE stub: always false. */
static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return false; }
/* !CONFIG_ZONE_DEVICE stub: always true. */
static inline bool
zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b)
{
        return true;
}
/* !CONFIG_ZONE_DEVICE stub: always NULL. */
static inline struct dev_pagemap *
page_pgmap(const struct page *page)
{
        return NULL;
}
#endif

/* Whether @page is a zone device page, judged from page->flags. */
static inline bool is_zone_device_page(const struct page *page)
{
        const bool in_device_zone = memdesc_is_zone_device(page->flags);

        return in_device_zone;
}

/* Whether @folio is a zone device folio, judged from folio->flags. */
static inline bool folio_is_zone_device(const struct folio *folio)
{
        const bool in_device_zone = memdesc_is_zone_device(folio->flags);

        return in_device_zone;
}

static inline bool is_zone_movable_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_MOVABLE;
}

static inline bool folio_is_zone_movable(const struct folio *folio)
{
        return folio_zonenum(folio) == ZONE_MOVABLE;
}
#endif

/*
 * zone_intersects - does the pfn range overlap @zone?
 *
 * The range is [start_pfn, start_pfn + nr_pages). An empty zone intersects
 * nothing. Otherwise the range overlaps when it begins before the zone's end
 * pfn and ends after the zone's start pfn.
 */
static inline bool zone_intersects(const struct zone *zone,
                unsigned long start_pfn, unsigned long nr_pages)
{
        const unsigned long range_end = start_pfn + nr_pages;

        if (zone_is_empty(zone))
                return false;

        return start_pfn < zone_end_pfn(zone) &&
               range_end > zone->zone_start_pfn;
}

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. With DEF_PRIORITY at 12 an aging round scans 1/4096th of the queues
 * ("queue_length >> 12", and 1 << 12 == 4096).
 */
#define DEF_PRIORITY 12

/*
 * Maximum number of zones on a zonelist: MAX_NR_ZONES zones for each of
 * MAX_NUMNODES nodes.
 */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

/*
 * Indices into pglist_data->node_zonelists[]; MAX_ZONELISTS is the size of
 * that array.
 */
enum {
        ZONELIST_FALLBACK,        /* zonelist with fallback */
#ifdef CONFIG_NUMA
        /*
         * The NUMA zonelists are doubled because we need zonelists that
         * restrict the allocations to a single node for __GFP_THISNODE.
         */
        ZONELIST_NOFALLBACK,        /* zonelist without fallback (__GFP_THISNODE) */
#endif
        MAX_ZONELISTS
};

/*
 * This struct contains information about a zone in a zonelist. The zone index
 * is cached next to the zone pointer to avoid dereferences into large
 * structures and lookups of tables. Read the members through zonelist_zone()
 * and zonelist_zone_idx().
 */
struct zoneref {
        struct zone *zone;        /* Pointer to actual zone */
        int zone_idx;                /* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()        - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()        - Return the index of the zone for an entry
 * zonelist_node_idx()        - Return the index of the node for an entry
 */
struct zonelist {
        /*
         * NOTE(review): the extra slot beyond MAX_ZONES_PER_ZONELIST is
         * presumably a terminating entry - confirm.
         */
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

/*
 * The array of struct pages for flatmem.
 * It must be declared for SPARSEMEM as well because there are configurations
 * that rely on that.
 */
extern struct page *mem_map;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Deferred split queue state, embedded in pglist_data as
 * deferred_split_queue when CONFIG_TRANSPARENT_HUGEPAGE is enabled.
 */
struct deferred_split {
        /* NOTE(review): presumably protects the two members below - confirm. */
        spinlock_t split_queue_lock;
        struct list_head split_queue;
        /* NOTE(review): presumably the number of entries on split_queue - confirm. */
        unsigned long split_queue_len;
};
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Per NUMA node memory failure handling statistics, embedded in pglist_data
 * as mf_stats.
 */
struct memory_failure_stats {
        /*
         * Number of raw pages poisoned.
         * Cases not accounted: memory outside kernel control, offline page,
         * arch-specific memory_failure (SGX), hwpoison_filter() filtered
         * error events, and unpoison actions from hwpoison_unpoison.
         */
        unsigned long total;
        /*
         * Recovery results of poisoned raw pages handled by memory_failure,
         * in sync with mf_result. The four counters below partition total:
         * total = ignored + failed + delayed + recovered.
         * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
         */
        unsigned long ignored;
        unsigned long failed;
        unsigned long delayed;
        unsigned long recovered;
};
#endif

/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 *
 * The struct is also available under the typedef name pg_data_t.
 */
typedef struct pglist_data {
        /*
         * node_zones contains just the zones for THIS node. Not all of the
         * zones may be populated, but it is the full list. It is referenced by
         * this node's node_zonelists as well as other node's node_zonelists.
         */
        struct zone node_zones[MAX_NR_ZONES];

        /*
         * node_zonelists contains references to all zones in all nodes.
         * Generally the first zones will be references to this node's
         * node_zones.
         */
        struct zonelist node_zonelists[MAX_ZONELISTS];

        int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLATMEM        /* means !SPARSEMEM */
        struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
        struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
        /*
         * Must be held any time you expect node_start_pfn,
         * node_present_pages, node_spanned_pages or nr_zones to stay constant.
         * Also synchronizes pgdat->first_deferred_pfn during deferred page
         * init.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
         * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
#endif
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
                                             range, including holes */
        int node_id;
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;

        /* wait queues for throttling reclaim for different reasons. */
        wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];

        atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
        unsigned long nr_reclaim_start;        /* nr pages written while throttled
                                         * when throttling started. */
#ifdef CONFIG_MEMORY_HOTPLUG
        struct mutex kswapd_lock;
#endif
        struct task_struct *kswapd;        /* Protected by kswapd_lock */
        int kswapd_order;
        enum zone_type kswapd_highest_zoneidx;

        atomic_t kswapd_failures;        /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_highest_zoneidx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
        bool proactive_compact_trigger;
#endif
        /*
         * This is a per-node reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long                totalreserve_pages;

#ifdef CONFIG_NUMA
        /*
         * node reclaim becomes active if more unmapped pages exist.
         */
        unsigned long                min_unmapped_pages;
        unsigned long                min_slab_pages;
#endif /* CONFIG_NUMA */

        /* Write-intensive fields used by page reclaim */
        CACHELINE_PADDING(_pad1_);

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_NUMA_BALANCING
        /* start time in ms of current promote rate limit period */
        unsigned int nbp_rl_start;
        /* number of promote candidate pages at start time of current rate limit period */
        unsigned long nbp_rl_nr_cand;
        /* promote threshold in ms */
        unsigned int nbp_threshold;
        /* start time in ms of current promote threshold adjustment period */
        unsigned int nbp_th_start;
        /*
         * number of promote candidate pages at start time of current promote
         * threshold adjustment period
         */
        unsigned long nbp_th_nr_cand;
#endif
        /* Fields commonly accessed by the page reclaim scanner */

        /*
         * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
         *
         * Use mem_cgroup_lruvec() to look up lruvecs.
         */
        struct lruvec                __lruvec;

        unsigned long                flags;

#ifdef CONFIG_LRU_GEN
        /* kswap mm walk data */
        struct lru_gen_mm_walk mm_walk;
        /* lru_gen_folio list */
        struct lru_gen_memcg memcg_lru;
#endif

        CACHELINE_PADDING(_pad2_);

        /* Per-node vmstats */
        struct per_cpu_nodestat __percpu *per_cpu_nodestats;
        atomic_long_t                vm_stat[NR_VM_NODE_STAT_ITEMS];
#ifdef CONFIG_NUMA
        struct memory_tier __rcu *memtier;
#endif
#ifdef CONFIG_MEMORY_FAILURE
        struct memory_failure_stats mf_stats;
#endif
} pg_data_t;

/*
 * Per-node accessors keyed by node id; each one goes through NODE_DATA(nid).
 * node_end_pfn() yields node_start_pfn + node_spanned_pages via
 * pgdat_end_pfn().
 */
#define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)        (NODE_DATA(nid)->node_spanned_pages)

#define node_start_pfn(nid)        (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

/*
 * pgdat_end_pfn - first pfn past the range spanned by node @pgdat.
 *
 * Return: node_start_pfn + node_spanned_pages.
 */
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
        const unsigned long first_pfn = pgdat->node_start_pfn;
        const unsigned long nr_spanned = pgdat->node_spanned_pages;

        return first_pfn + nr_spanned;
}

#include <linux/memory_hotplug.h>

void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
                   enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int highest_zoneidx, unsigned int alloc_flags,
                         long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx,
                unsigned int alloc_flags);
/*
 * Memory initialization context, used to differentiate memory added
 * statically by the platform (MEMINIT_EARLY) from memory added via the
 * memory hotplug interface (MEMINIT_HOTPLUG).
 */
enum meminit_context {
        MEMINIT_EARLY,
        MEMINIT_HOTPLUG,
};

extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
                                     unsigned long size);

extern void lruvec_init(struct lruvec *lruvec);

/*
 * lruvec_pgdat - node that @lruvec belongs to.
 *
 * With CONFIG_MEMCG the lruvec stores a pgdat pointer. Without it the lruvec
 * is taken to be the __lruvec member of a pglist_data, and the enclosing
 * structure is recovered with container_of().
 */
static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
        struct pglist_data *pgdat;

#ifdef CONFIG_MEMCG
        pgdat = lruvec->pgdat;
#else
        pgdat = container_of(lruvec, struct pglist_data, __lruvec);
#endif
        return pgdat;
}

/*
 * local_memory_node - map a node id to the node id to use for memory.
 * @node_id: node id to translate
 *
 * With CONFIG_HAVE_MEMORYLESS_NODES the function is defined out of line.
 * Without it every node id maps to itself.
 *
 * Return: the translated node id (@node_id itself in the inline version).
 */
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
static inline int local_memory_node(int node_id)
{
        /* No trailing ';' after the body: ISO C forbids empty declarations. */
        return node_id;
}
#endif

/*
 * zone_idx() returns the position of @zone within its node's node_zones[]
 * array, i.e. its enum zone_type value: 0 for the first configured zone, 1
 * for the next, etc. Which zone that is depends on the CONFIG_ZONE_* options.
 * The macro evaluates its argument twice.
 */
#define zone_idx(zone)                ((zone) - (zone)->zone_pgdat->node_zones)

/*
 * zone_is_zone_device - is @zone the ZONE_DEVICE zone of its node?
 *
 * Always false when CONFIG_ZONE_DEVICE is disabled.
 */
static inline bool zone_is_zone_device(const struct zone *zone)
{
#ifdef CONFIG_ZONE_DEVICE
        return zone_idx(zone) == ZONE_DEVICE;
#else
        return false;
#endif
}

/*
 * managed_zone - does @zone have pages managed by the buddy allocator?
 *
 * Reclaim decisions must use this helper, not populated_zone(): a zone whose
 * memory is entirely reserved is populated but not managed.
 */
static inline bool managed_zone(const struct zone *zone)
{
        return zone_managed_pages(zone) != 0;
}

/* True when @zone has memory, i.e. its present_pages count is non-zero. */
static inline bool populated_zone(const struct zone *zone)
{
        return zone->present_pages != 0;
}

/*
 * zone_to_nid - node id recorded in @zone.
 *
 * Return: zone->node on CONFIG_NUMA builds, 0 otherwise.
 */
static inline int zone_to_nid(const struct zone *zone)
{
#ifdef CONFIG_NUMA
        return zone->node;
#else
        return 0;
#endif
}

/*
 * zone_set_nid - record node id @nid in @zone.
 *
 * Does nothing unless CONFIG_NUMA is enabled.
 */
static inline void zone_set_nid(struct zone *zone, int nid)
{
#ifdef CONFIG_NUMA
        zone->node = nid;
#endif
}

extern int movable_zone;

/*
 * is_highmem_idx - does zone index @idx denote highmem?
 *
 * With CONFIG_HIGHMEM this holds for ZONE_HIGHMEM itself, and for
 * ZONE_MOVABLE when movable_zone equals ZONE_HIGHMEM.
 *
 * Return: 1 for a highmem index, 0 otherwise (always 0 without
 * CONFIG_HIGHMEM).
 */
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
        if (idx == ZONE_HIGHMEM)
                return 1;

        return idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM;
#else
        return 0;
#endif
}

/**
 * is_highmem - helper function to quickly check if a struct zone is a
 *              highmem zone or not.  This is an attempt to keep references
 *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
 * @zone: pointer to struct zone variable
 * Return: 1 for a highmem zone, 0 otherwise
 */
static inline int is_highmem(const struct zone *zone)
{
        return is_highmem_idx(zone_idx(zone));
}

/*
 * has_managed_dma - defined out of line when CONFIG_ZONE_DMA is enabled;
 * without ZONE_DMA the answer is a constant false.
 */
#ifndef CONFIG_ZONE_DMA
static inline bool has_managed_dma(void) { return false; }
#else
bool has_managed_dma(void);
#endif


#ifdef CONFIG_NUMA

/* NUMA builds include the architecture header here instead. */
#include <asm/mmzone.h>

#else /* !CONFIG_NUMA */

extern struct pglist_data contig_page_data;

/* Without NUMA every node id resolves to the single contig_page_data. */
static inline struct pglist_data *NODE_DATA(int nid)
{
        struct pglist_data *only_node = &contig_page_data;

        return only_node;
}

#endif /* CONFIG_NUMA */

extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);

/**
 * for_each_online_pgdat - helper macro to iterate over all online nodes
 * @pgdat: pointer to a pg_data_t variable
 *
 * Starts at first_online_pgdat() and advances with next_online_pgdat() until
 * a NULL pointer is returned.
 */
#define for_each_online_pgdat(pgdat)                        \
        for (pgdat = first_online_pgdat();                \
             pgdat;                                        \
             pgdat = next_online_pgdat(pgdat))
/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone: pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in. Iteration starts at the node_zones array of
 * first_online_pgdat() and advances with next_zone() until it returns NULL.
 * The result of first_online_pgdat() is dereferenced without a NULL check.
 */
#define for_each_zone(zone)                                \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))

/**
 * for_each_populated_zone - iterate over all zones that have memory
 * @zone: pointer to struct zone variable
 *
 * Same walk as for_each_zone(), but zones for which populated_zone() is
 * false are skipped. The macro ends in an "if (...) ; else" so that the
 * statement supplied by the caller becomes the else branch.
 */
#define for_each_populated_zone(zone)                        \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))                        \
                if (!populated_zone(zone))                \
                        ; /* do nothing */                \
                else

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
        return zoneref->zone;
}

/* Return the zone index cached in @zoneref (its zone_idx field). */
static inline int zonelist_zone_idx(const struct zoneref *zoneref)
{
        return zoneref->zone_idx;
}

/*
 * Return the node id of the zone @zoneref points at, as computed by
 * zone_to_nid().  The zone pointer is passed on unchecked.
 */
static inline int zonelist_node_idx(const struct zoneref *zoneref)
{
        return zone_to_nid(zoneref->zone);
}

/*
 * Out-of-line search used by next_zones_zonelist() whenever its inline
 * fast path cannot return the cursor directly.
 */
struct zoneref *__next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes);

/**
 * next_zones_zonelist - find the next eligible zone starting at a zonelist cursor
 * @z: The cursor used as a starting point for the search
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * Looks for the next zone whose index is at or below @highest_zoneidx and
 * which lies within the allowed nodemask, beginning the search at @z.  The
 * returned zoneref is itself a cursor denoting the zone under examination;
 * advance it by one before passing it to next_zones_zonelist() again.
 *
 * Return: cursor for the next zone at or below @highest_zoneidx within the
 * allowed nodemask
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        /*
         * Only fall back to the out-of-line search when a nodemask filter
         * is supplied or the cursor's zone index is above the limit.
         */
        if (unlikely(nodes || zonelist_zone_idx(z) > highest_zoneidx))
                return __next_zones_zonelist(z, highest_zoneidx, nodes);

        return z;
}

/**
 * first_zones_zonelist - find the first eligible zone of a zonelist
 * @zonelist: The zonelist to search for a suitable zone
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * Searches @zonelist from its first entry for a zone at or below
 * @highest_zoneidx that is within the allowed nodemask.  The returned
 * zoneref is a cursor; to continue iterating, advance it by one and hand
 * it to next_zones_zonelist().
 *
 * If nothing is eligible the returned zoneref has a NULL ->zone (the
 * zoneref pointer itself is never NULL).  That can be a genuine result or
 * the effect of a concurrent nodemask update caused by cpuset modification.
 *
 * Return: Zoneref pointer for the first suitable zone found
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        struct zoneref *head = zonelist->_zonerefs;

        return next_zones_zonelist(head, highest_zoneidx, nodes);
}

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask.  The loop ends when the cursor's zone is NULL.
 * @z is advanced with ++z inside the macro, so it must be a modifiable
 * lvalue.
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
        for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))

/*
 * for_next_zone_zonelist_nodemask - resume a zonelist walk from an existing
 * cursor.
 *
 * Identical to for_each_zone_zonelist_nodemask() except that the walk starts
 * at the caller-provided cursor @z instead of the head of a zonelist; the
 * first zone visited is zonelist_zone(z).
 */
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
        for (zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))


/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 * It is for_each_zone_zonelist_nodemask() with no nodemask filter (NULL).
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

/*
 * movable_only_nodes - check whether all of @nodes are movable-only nodes.
 *
 * Returns false for an empty mask.  Otherwise returns true exactly when no
 * zone at or below ZONE_NORMAL can be found within @nodes.
 */
static inline bool movable_only_nodes(nodemask_t *nodes)
{
        struct zonelist *fallback;
        struct zoneref *first;

        if (nodes_empty(*nodes))
                return false;

        /*
         * The zonelists of all nodes are interlinked, so any node from the
         * mask can serve as the starting point.  All that matters is
         * whether at least one zone able to satisfy kernel allocations
         * shows up.
         */
        fallback = &NODE_DATA(first_node(*nodes))->node_zonelists[ZONELIST_FALLBACK];
        first = first_zones_zonelist(fallback, ZONE_NORMAL, nodes);

        return !zonelist_zone(first);
}


#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif

/* CONFIG_FLATMEM: every pfn is reported as belonging to node 0. */
#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn)                (0)
#endif

#ifdef CONFIG_SPARSEMEM

/*
 * PA_SECTION_SHIFT                physical address to/from section number
 * PFN_SECTION_SHIFT                pfn to/from section number
 *
 * NR_MEM_SECTIONS                number of sections, 2^SECTIONS_SHIFT
 * PAGES_PER_SECTION                pages covered by one section
 * PAGE_SECTION_MASK                clears the pfn bits that index within a section
 * SECTION_BLOCKFLAGS_BITS        NR_PAGEBLOCK_BITS for each pageblock in a section
 */
#define PA_SECTION_SHIFT        (SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT        (SECTION_SIZE_BITS - PAGE_SHIFT)

#define NR_MEM_SECTIONS                (1UL << SECTIONS_SHIFT)

#define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK        (~(PAGES_PER_SECTION-1))

#define SECTION_BLOCKFLAGS_BITS \
        ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)

/* A maximum-order allocation must not be larger than one section. */
#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
#endif

/* Convert a pfn to the number of the section containing it. */
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
        return pfn >> PFN_SECTION_SHIFT;
}
/* Convert a section number to the first pfn of that section. */
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
        return sec << PFN_SECTION_SHIFT;
}

/* Round a pfn up / down to a multiple of PAGES_PER_SECTION. */
#define SECTION_ALIGN_UP(pfn)        (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn)        ((pfn) & PAGE_SECTION_MASK)

/*
 * Subsections split a section into fixed 2^SUBSECTION_SHIFT byte
 * (2^21 = 2 MiB) pieces.
 */
#define SUBSECTION_SHIFT 21
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)

/* The same quantities expressed in pages rather than bytes. */
#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))

/* A subsection can never be larger than the section containing it. */
#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
#error Subsection size exceeds section size
#else
#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
#endif

/* Round a pfn up / down to a multiple of PAGES_PER_SUBSECTION. */
#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)

struct mem_section_usage {
        struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
        /* See declaration of similar field in struct zone */
        unsigned long pageblock_flags[0];
};

void subsection_map_init(unsigned long pfn, unsigned long nr_pages);

struct page;
struct page_ext;
/* Per-section bookkeeping; one entry per memory section. */
struct mem_section {
        /*
         * This is, logically, a pointer to an array of struct
         * pages.  However, it is stored with some other magic.
         * (see sparse.c::sparse_init_one_section())
         *
         * Additionally during early boot we encode node id of
         * the location of the section here to guide allocation.
         * (see sparse.c::memory_present())
         *
         * Making it a UL at least makes someone do a cast
         * before using it wrong.
         */
        unsigned long section_mem_map;

        /* Usage data: pageblock flags and, with VMEMMAP, the subsection map. */
        struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
        /*
         * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
         * section. (see page_ext.h about this.)
         */
        struct page_ext *page_ext;
        /* NOTE(review): presumably keeps the size a power of two - confirm. */
        unsigned long pad;
#endif
        /*
         * WARNING: mem_section must be a power-of-2 in size for the
         * calculation and use of SECTION_ROOT_MASK to make sense.
         */
};

/*
 * Sections are grouped into "roots".  With CONFIG_SPARSEMEM_EXTREME one
 * root holds a page worth of struct mem_section entries; otherwise every
 * root holds exactly one section.
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT       (PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT        1
#endif

/* Root index of a section, number of roots, and index mask within a root. */
#define SECTION_NR_TO_ROOT(sec)        ((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS        DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK        (SECTIONS_PER_ROOT - 1)

/*
 * The section table: an array of root pointers with SPARSEMEM_EXTREME,
 * a static two-dimensional array otherwise.
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif

/*
 * Return the pageblock flags array of section @ms.  Neither @ms nor
 * ms->usage is checked for NULL here.
 */
static inline unsigned long *section_to_usemap(struct mem_section *ms)
{
        return ms->usage->pageblock_flags;
}

/*
 * __nr_to_section - look up the mem_section entry for section number @nr.
 *
 * Returns NULL when @nr maps to a root beyond NR_SECTION_ROOTS or, with
 * CONFIG_SPARSEMEM_EXTREME, when the root table or the selected root is
 * not populated.
 */
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
        const unsigned long root_idx = SECTION_NR_TO_ROOT(nr);

        if (unlikely(root_idx >= NR_SECTION_ROOTS))
                return NULL;

#ifdef CONFIG_SPARSEMEM_EXTREME
        /* Either the table itself or this particular root may be NULL. */
        if (!mem_section)
                return NULL;
        if (!mem_section[root_idx])
                return NULL;
#endif
        return mem_section[root_idx] + (nr & SECTION_ROOT_MASK);
}
extern size_t mem_section_usage_size(void);

/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information.  The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum).  The result is
 * aligned to the minimum alignment of the two values:
 *   1. All mem_map arrays are page-aligned.
 *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
 *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 *      worst combination is powerpc with 256k pages,
 *      which results in PFN_SECTION_SHIFT equal 6.
 * To sum it up, at least 6 bits are available on all architectures.
 * However, we can exceed 6 bits on some other architectures except
 * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
 * with the worst case of 64K pages on arm64) if we make sure the
 * exceeded bit is not applicable to powerpc.
 */
/*
 * Bit positions of the flags kept in the low bits of
 * mem_section::section_mem_map.  Each one is turned into a mask by the
 * matching SECTION_* definition.
 */
enum {
        SECTION_MARKED_PRESENT_BIT,        /* tested by present_section() */
        SECTION_HAS_MEM_MAP_BIT,        /* tested by valid_section() */
        SECTION_IS_ONLINE_BIT,                /* tested by online_section() */
        SECTION_IS_EARLY_BIT,                /* tested by early_section() */
#ifdef CONFIG_ZONE_DEVICE
        SECTION_TAINT_ZONE_DEVICE_BIT,        /* tested by online_device_section() */
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
        SECTION_IS_VMEMMAP_PREINIT_BIT,        /* tested by preinited_vmemmap_section() */
#endif
        /* Number of flag bits in use; defines SECTION_MAP_MASK and SECTION_NID_SHIFT. */
        SECTION_MAP_LAST_BIT,
};

/* Flag masks for mem_section::section_mem_map, one per *_BIT position. */
#define SECTION_MARKED_PRESENT                BIT(SECTION_MARKED_PRESENT_BIT)
#define SECTION_HAS_MEM_MAP                BIT(SECTION_HAS_MEM_MAP_BIT)
#define SECTION_IS_ONLINE                BIT(SECTION_IS_ONLINE_BIT)
#define SECTION_IS_EARLY                BIT(SECTION_IS_EARLY_BIT)
#ifdef CONFIG_ZONE_DEVICE
#define SECTION_TAINT_ZONE_DEVICE        BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
#define SECTION_IS_VMEMMAP_PREINIT        BIT(SECTION_IS_VMEMMAP_PREINIT_BIT)
#endif
/* Everything above the flag bits: the encoded mem_map pointer. */
#define SECTION_MAP_MASK                (~(BIT(SECTION_MAP_LAST_BIT) - 1))
/* The early-boot node id is stored starting at the first bit above the flags. */
#define SECTION_NID_SHIFT                SECTION_MAP_LAST_BIT

/*
 * Strip the flag bits from section_mem_map and return what remains as a
 * struct page pointer (the encoded mem_map of the section).
 */
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
        return (struct page *)(section->section_mem_map & SECTION_MAP_MASK);
}

/* Non-zero iff @section is non-NULL and has SECTION_MARKED_PRESENT set. */
static inline int present_section(const struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_MARKED_PRESENT);
}

/* present_section() for the section with number @nr. */
static inline int present_section_nr(unsigned long nr)
{
        const struct mem_section *ms = __nr_to_section(nr);

        return present_section(ms);
}

/* Non-zero iff @section is non-NULL and has SECTION_HAS_MEM_MAP set. */
static inline int valid_section(const struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_HAS_MEM_MAP);
}

/* Non-zero iff @section is non-NULL and has SECTION_IS_EARLY set. */
static inline int early_section(const struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_IS_EARLY);
}

/* valid_section() for the section with number @nr. */
static inline int valid_section_nr(unsigned long nr)
{
        const struct mem_section *ms = __nr_to_section(nr);

        return valid_section(ms);
}

/* Non-zero iff @section is non-NULL and has SECTION_IS_ONLINE set. */
static inline int online_section(const struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_IS_ONLINE);
}

#ifdef CONFIG_ZONE_DEVICE
/*
 * Non-zero iff @section is non-NULL and has both SECTION_IS_ONLINE and
 * SECTION_TAINT_ZONE_DEVICE set.
 */
static inline int online_device_section(const struct mem_section *section)
{
        const unsigned long wanted = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;

        if (!section)
                return 0;

        return (section->section_mem_map & wanted) == wanted;
}
#else
/* Without CONFIG_ZONE_DEVICE the taint flag does not exist: always 0. */
static inline int online_device_section(const struct mem_section *section)
{
        return 0;
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
/* Non-zero iff @section is non-NULL and has SECTION_IS_VMEMMAP_PREINIT set. */
static inline int preinited_vmemmap_section(const struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT);
}

/* Per-node vmemmap pre-initialisation hooks, implemented out of line. */
void sparse_vmemmap_init_nid_early(int nid);
void sparse_vmemmap_init_nid_late(int nid);

#else
/* No preinit support configured: no section is preinitialised ... */
static inline int preinited_vmemmap_section(const struct mem_section *section)
{
        return 0;
}

/* ... and both per-node hooks do nothing. */
static inline void sparse_vmemmap_init_nid_early(int nid)
{
}

static inline void sparse_vmemmap_init_nid_late(int nid)
{
}
#endif

/* online_section() for the section with number @nr. */
static inline int online_section_nr(unsigned long nr)
{
        const struct mem_section *ms = __nr_to_section(nr);

        return online_section(ms);
}

#ifdef CONFIG_MEMORY_HOTPLUG
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif

/* Look up the mem_section covering @pfn; NULL if __nr_to_section() finds none. */
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
        unsigned long nr = pfn_to_section_nr(pfn);

        return __nr_to_section(nr);
}

extern unsigned long __highest_present_section_nr;

/* Index of the subsection containing @pfn, counted within its section. */
static inline int subsection_map_index(unsigned long pfn)
{
        /* Page offset of @pfn from the start of its section. */
        unsigned long section_offset = pfn & ~(PAGE_SECTION_MASK);

        return section_offset / PAGES_PER_SUBSECTION;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Test whether the subsection containing @pfn is set in the subsection map
 * of @ms.  Returns 0 when the section has no usage data.
 */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        int idx = subsection_map_index(pfn);
        struct mem_section_usage *usage = READ_ONCE(ms->usage);

        if (!usage)
                return 0;

        return test_bit(idx, usage->subsection_map);
}

/*
 * Find the first pfn at or after *@pfn, within section @ms, whose subsection
 * is set in the subsection map.
 *
 * Returns true and leaves *@pfn untouched when its own subsection is set;
 * returns true with *@pfn moved to the start of the next set subsection
 * otherwise; returns false (with *@pfn unchanged) when the section has no
 * usage data or no further subsection is set.
 */
static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn)
{
        struct mem_section_usage *usage = READ_ONCE(ms->usage);
        int start = subsection_map_index(*pfn);
        unsigned long next;

        if (!usage)
                return false;

        if (!test_bit(start, usage->subsection_map)) {
                /* Current subsection is absent: search for a later one. */
                next = find_next_bit(usage->subsection_map,
                                     SUBSECTIONS_PER_SECTION, start);
                if (next == SUBSECTIONS_PER_SECTION)
                        return false;

                *pfn = (*pfn & PAGE_SECTION_MASK) + (next * PAGES_PER_SUBSECTION);
        }

        return true;
}
#else
/*
 * !CONFIG_SPARSEMEM_VMEMMAP: there is no subsection map, so every pfn of a
 * section is reported as valid.
 */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        return 1;
}

/* Likewise, *@pfn itself is always the first valid pfn; it is not modified. */
static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn)
{
        return true;
}
#endif

void sparse_init_early_section(int nid, struct page *map, unsigned long pnum,
                               unsigned long flags);

#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/**
 * pfn_valid - check if there is a valid memory map entry for a PFN
 * @pfn: the page frame number to check
 *
 * Check if there is a valid memory map entry aka struct page for the @pfn.
 * Note, that availability of the memory map entry does not imply that
 * there is actual usable memory at that @pfn. The struct page may
 * represent a hole or an unusable page frame.
 *
 * The section flags and the subsection map are examined inside an
 * rcu_read_lock_sched() critical section.
 *
 * Return: 1 for PFNs that have memory map entries and 0 otherwise
 */
static inline int pfn_valid(unsigned long pfn)
{
        struct mem_section *ms;
        int ret;

        /*
         * Ensure the upper PAGE_SHIFT bits are clear in the
         * pfn. Else it might lead to false positives when
         * some of the upper bits are set, but the lower bits
         * match a valid pfn.
         */
        if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
                return 0;

        /* Reject pfns beyond the last possible section. */
        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;
        ms = __pfn_to_section(pfn);
        /*
         * NOTE(review): the RCU-sched section presumably keeps ms->usage
         * (which embeds an rcu_head) alive while it is read - confirm
         * against the code that frees it.
         */
        rcu_read_lock_sched();
        if (!valid_section(ms)) {
                rcu_read_unlock_sched();
                return 0;
        }
        /*
         * Traditionally early sections always returned pfn_valid() for
         * the entire section-sized span.
         */
        ret = early_section(ms) || pfn_section_valid(ms, pfn);
        rcu_read_unlock_sched();

        return ret;
}

/*
 * first_valid_pfn - find the first valid pfn in [@pfn, @end_pfn).
 *
 * Walks section by section, starting with the section containing @pfn and
 * stopping after __highest_present_section_nr or once the candidate pfn
 * reaches @end_pfn.  A section matches when it is valid and either is an
 * early section or pfn_section_first_valid() finds a subsection at or after
 * the candidate pfn.  The whole walk runs under rcu_read_lock_sched().
 *
 * Returns end_pfn or higher if no valid PFN remaining in range
 */
static inline unsigned long first_valid_pfn(unsigned long pfn, unsigned long end_pfn)
{
        unsigned long nr = pfn_to_section_nr(pfn);

        rcu_read_lock_sched();

        while (nr <= __highest_present_section_nr && pfn < end_pfn) {
                struct mem_section *ms = __pfn_to_section(pfn);

                /* pfn_section_first_valid() may advance pfn within the section. */
                if (valid_section(ms) &&
                    (early_section(ms) || pfn_section_first_valid(ms, &pfn))) {
                        rcu_read_unlock_sched();
                        return pfn;
                }

                /* Nothing left in this section? Skip to next section */
                nr++;
                pfn = section_nr_to_pfn(nr);
        }

        rcu_read_unlock_sched();
        return end_pfn;
}

/*
 * next_valid_pfn - step from a valid @pfn to the next valid pfn below @end_pfn.
 *
 * Returns @end_pfn when the range is exhausted.
 */
static inline unsigned long next_valid_pfn(unsigned long pfn, unsigned long end_pfn)
{
        /*
         * Validity is uniform across a section (or, for VMEMMAP, across a
         * subsection): all of its PFNs are valid or none is.  So the full
         * lookup only has to be repeated when the successor pfn is the
         * first one of a new (sub)section, i.e. has no bits set below
         * PAGE_{SUB,}SECTION_MASK.
         */
        const unsigned long boundary_mask = IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) ?
                                            PAGE_SUBSECTION_MASK : PAGE_SECTION_MASK;
        unsigned long next = pfn + 1;

        if (next >= end_pfn)
                return end_pfn;

        if (next & ~boundary_mask)
                return next;

        return first_valid_pfn(next, end_pfn);
}


/*
 * for_each_valid_pfn - iterate over the valid pfns in [_start_pfn, _end_pfn).
 *
 * @_pfn is set by first_valid_pfn() and advanced by next_valid_pfn(); the
 * loop ends once it is no longer below @_end_pfn.  @_end_pfn is expanded
 * on every iteration, so it should be free of side effects.
 */
#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn)                        \
        for ((_pfn) = first_valid_pfn((_start_pfn), (_end_pfn));        \
             (_pfn) < (_end_pfn);                                        \
             (_pfn) = next_valid_pfn((_pfn), (_end_pfn)))

#endif

/*
 * Non-zero iff @pfn lies in a section that is marked present.  Section
 * numbers at or beyond NR_MEM_SECTIONS are rejected up front.
 */
static inline int pfn_in_present_section(unsigned long pfn)
{
        unsigned long nr = pfn_to_section_nr(pfn);

        if (nr >= NR_MEM_SECTIONS)
                return 0;

        return present_section(__nr_to_section(nr));
}

/*
 * Return the number of the first present section strictly after
 * @section_nr, or -1 (as unsigned long) when there is none up to
 * __highest_present_section_nr.  Passing -1 starts the search at section 0.
 */
static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
        for (section_nr++; section_nr <= __highest_present_section_nr; section_nr++) {
                if (present_section_nr(section_nr))
                        return section_nr;
        }

        return -1;
}

/*
 * for_each_present_section_nr - iterate over present sections from @start on.
 *
 * @section_nr receives each present section number >= @start in turn; the
 * loop ends when next_present_section_nr() returns -1.  @start is expanded
 * unparenthesised as "start - 1", so pass a simple expression.
 */
#define for_each_present_section_nr(start, section_nr)                \
        for (section_nr = next_present_section_nr(start - 1);        \
             section_nr != -1;                                        \
             section_nr = next_present_section_nr(section_nr))

/*
 * These are _only_ used during initialisation, therefore they
 * can use __initdata ...  They could have names to indicate
 * this restriction.
 *
 * pfn_to_nid(): with CONFIG_NUMA the node id is read from the pfn's
 * struct page; the argument is copied into a local first so that it is
 * evaluated only once.  Without CONFIG_NUMA the result is always 0.
 */
#ifdef CONFIG_NUMA
#define pfn_to_nid(pfn)                                                        \
({                                                                        \
        unsigned long __pfn_to_nid_pfn = (pfn);                                \
        page_to_nid(pfn_to_page(__pfn_to_nid_pfn));                        \
})
#else
#define pfn_to_nid(pfn)                (0)
#endif

void sparse_init(void);
#else
/*
 * !CONFIG_SPARSEMEM: the sparse memory model is compiled out, so its setup
 * hooks collapse to empty statements and pfn_in_present_section() falls back
 * to pfn_valid().
 *
 * Each stub takes the same number of arguments as the real function it
 * replaces, so a call site compiles identically with and without SPARSEMEM.
 * In particular sparse_vmemmap_init_nid_early() takes the node id only.
 */
#define sparse_init()        do {} while (0)
#define sparse_index_init(_sec, _nid)  do {} while (0)
#define sparse_vmemmap_init_nid_early(_nid) do {} while (0)
#define sparse_vmemmap_init_nid_late(_nid) do {} while (0)
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */

/*
 * Fallback case for when the architecture provides its own pfn_valid() but
 * not a corresponding for_each_valid_pfn().
 *
 * Every pfn in [_start_pfn, _end_pfn) is visited and the loop body runs only
 * for those accepted by pfn_valid().  The filter is written as
 * "if (!...) ; else" so that the macro never captures an "else" belonging to
 * an enclosing if statement at the call site.
 */
#ifndef for_each_valid_pfn
#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn)                        \
        for ((_pfn) = (_start_pfn); (_pfn) < (_end_pfn); (_pfn)++)        \
                if (!pfn_valid(_pfn))                                        \
                        ; /* do nothing */                                \
                else
#endif

#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */























   65 



































   65 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM workqueue

#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WORKQUEUE_H

#include <linux/tracepoint.h>
#include <linux/workqueue.h>

struct pool_workqueue;

/**
 * workqueue_queue_work - called when a work gets queued
 * @req_cpu:        the requested cpu
 * @pwq:        pointer to struct pool_workqueue
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued on a workqueue (ie: once the delay
 * has been reached).
 *
 * The record stores the work pointer, its callback (work->func), the
 * name of the workqueue (pwq->wq->name), the requested cpu and the cpu
 * of the pool the work was placed on (pwq->pool->cpu).
 */
TRACE_EVENT(workqueue_queue_work,

        TP_PROTO(int req_cpu, struct pool_workqueue *pwq,
                 struct work_struct *work),

        TP_ARGS(req_cpu, pwq, work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
                __string( workqueue,        pwq->wq->name)
                __field( int,        req_cpu        )
                __field( int,        cpu        )
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
                __assign_str(workqueue);
                __entry->req_cpu        = req_cpu;
                __entry->cpu                = pwq->pool->cpu;
        ),

        TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d",
                  __entry->work, __entry->function, __get_str(workqueue),
                  __entry->req_cpu, __entry->cpu)
);

/**
 * workqueue_activate_work - called when a work gets activated
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a queued work is put on the active queue,
 * which happens immediately after queueing unless @max_active limit
 * is reached.
 *
 * The record stores the work pointer and its callback (work->func).
 */
TRACE_EVENT(workqueue_activate_work,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p function=%ps ", __entry->work, __entry->function)
);

/**
 * workqueue_execute_start - called immediately before the workqueue callback
 * @work:        pointer to struct work_struct
 *
 * Allows to track workqueue execution.
 *
 * The record stores the work pointer and the callback read from
 * work->func at the time the event fires.
 */
TRACE_EVENT(workqueue_execute_start,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * workqueue_execute_end - called immediately after the workqueue callback
 * @work:        pointer to struct work_struct
 * @function:   pointer to worker function
 *
 * Allows to track workqueue execution.
 *
 * Unlike workqueue_execute_start, the callback is taken from the
 * @function argument and @work is only recorded as a pointer value;
 * it is not dereferenced by this event.
 */
TRACE_EVENT(workqueue_execute_end,

        TP_PROTO(struct work_struct *work, work_func_t function),

        TP_ARGS(work, function),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = function;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

#endif /*  _TRACE_WORKQUEUE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOCONTEXT_H
#define IOCONTEXT_H

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

/*
 * io_cq flag bits.
 * NOTE(review): presumably stored in io_cq::flags - confirm against users.
 */
enum {
        ICQ_EXITED                = 1 << 2,
        ICQ_DESTROYED                = 1 << 3,
};

/*
 * An io_cq (icq) is an association between an io_context (ioc) and a
 * request_queue (q).  This is used by elevators which need to track
 * information per ioc - q pair.
 *
 * An elevator can request use of icq by setting elevator_type->icq_size
 * and ->icq_align.  Both size and align must be at least those of struct
 * io_cq, and the elevator can use the tail area for private information.
 * The recommended way to do this is defining a struct which contains io_cq
 * as the first member followed by private members and using its size and
 * align.  For example,
 *
 *        struct snail_io_cq {
 *                struct io_cq        icq;
 *                int                poke_snail;
 *                int                feed_snail;
 *        };
 *
 *        struct elevator_type snail_elv_type {
 *                .ops =                { ... },
 *                .icq_size =        sizeof(struct snail_io_cq),
 *                .icq_align =        __alignof__(struct snail_io_cq),
 *                ...
 *        };
 *
 * If icq_size is set, block core will manage icq's.  All requests will
 * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn()
 * is called and be holding a reference to the associated io_context.
 *
 * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is
 * called and, on destruction, ->elevator_exit_icq_fn().  Both functions
 * are called with both the associated io_context and queue locks held.
 *
 * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding
 * queue lock but the returned icq is valid only until the queue lock is
 * released.  Elevators can not and should not try to create or destroy
 * icq's.
 *
 * As icq's are linked from both ioc and q, the locking rules are a bit
 * complex.
 *
 * - ioc lock nests inside q lock.
 *
 * - ioc->icq_list and icq->ioc_node are protected by ioc lock.
 *   q->icq_list and icq->q_node by q lock.
 *
 * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq
 *   itself is protected by q lock.  However, both the indexes and icq
 *   itself are also RCU managed and lookup can be performed holding only
 *   the q lock.
 *
 * - icq's are not reference counted.  They are destroyed when either the
 *   ioc or q goes away.  Each request with icq set holds an extra
 *   reference to ioc to ensure it stays until the request is completed.
 *
 * - Linking and unlinking icq's are performed while holding both ioc and q
 *   locks.  Due to the lock ordering, q exit is simple but ioc exit
 *   requires reverse-order double lock dance.
 */
struct io_cq {
        /* The two objects this icq associates. */
        struct request_queue        *q;
        struct io_context        *ioc;

        /*
         * q_node and ioc_node link io_cq through icq_list of q and ioc
         * respectively.  Both fields are unused once ioc_exit_icq() is
         * called and shared with __rcu_icq_cache and __rcu_head which are
         * used for RCU free of io_cq.
         */
        union {
                struct list_head        q_node;
                struct kmem_cache        *__rcu_icq_cache;
        };
        union {
                struct hlist_node        ioc_node;
                struct rcu_head                __rcu_head;
        };

        /* NOTE(review): presumably a mask of ICQ_* bits - confirm. */
        unsigned int                flags;
};

/*
 * I/O subsystem state of the associated processes.  It is refcounted
 * and kmalloc'ed. These could be shared between processes.
 *
 * The icq bookkeeping (lock, tree, hint, list and release work) only
 * exists when CONFIG_BLK_ICQ is enabled.
 */
struct io_context {
        /* Reference counters; see the users of these fields for their exact roles. */
        atomic_long_t refcount;
        atomic_t active_ref;

        unsigned short ioprio;

#ifdef CONFIG_BLK_ICQ
        /* all the fields below are protected by this lock */
        spinlock_t lock;

        /* icq lookup structures: tree, last-hit hint and list of all icqs. */
        struct radix_tree_root        icq_tree;
        struct io_cq __rcu        *icq_hint;
        struct hlist_head        icq_list;

        struct work_struct release_work;
#endif /* CONFIG_BLK_ICQ */
};

struct task_struct;
#ifdef CONFIG_BLOCK
void put_io_context(struct io_context *ioc);
void exit_io_context(struct task_struct *task);
int __copy_io(u64 clone_flags, struct task_struct *tsk);
/*
 * copy_io - set up the io_context of new task @tsk.
 *
 * When the calling task has no io_context there is nothing to do and 0 is
 * returned; otherwise the work is delegated to __copy_io() and its result
 * is passed through.
 */
static inline int copy_io(u64 clone_flags, struct task_struct *tsk)
{
        return current->io_context ? __copy_io(clone_flags, tsk) : 0;
}
#else
/*
 * !CONFIG_BLOCK: the io_context helpers are empty stubs and copy_io()
 * always reports success.
 */
struct io_context;
static inline void put_io_context(struct io_context *ioc) { }
static inline void exit_io_context(struct task_struct *task) { }
static inline int copy_io(u64 clone_flags, struct task_struct *tsk)
{
        return 0;
}
#endif /* CONFIG_BLOCK */

#endif /* IOCONTEXT_H */

















































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the IP module.
 *
 * Version:        @(#)ip.h        1.0.2        05/07/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Changes:
 *                Mike McLagan    :       Routing by source
 */
#ifndef _IP_H
#define _IP_H

#include <linux/types.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/sockptr.h>
#include <linux/static_key.h>

#include <net/inet_sock.h>
#include <net/route.h>
#include <net/snmp.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/netns/hash.h>
#include <net/lwtunnel.h>
#include <net/inet_dscp.h>

#define IPV4_MAX_PMTU                65535U                /* RFC 2675, Section 5.1 */
#define IPV4_MIN_MTU                68                        /* RFC 791 */

extern unsigned int sysctl_fib_sync_mem;
extern unsigned int sysctl_fib_sync_mem_min;
extern unsigned int sysctl_fib_sync_mem_max;

struct sock;

/*
 * IPv4 per-packet state. IPCB() casts skb->cb to this type, so the
 * structure lives inside the skb control buffer.
 *
 * @iif:           returned by inet_sdif() when IPSKB_L3SLAVE is set in @flags
 * @opt:           compiled IP options
 * @flags:         bitmask of the IPSKB_* values defined below
 * @frag_max_size: NOTE(review): meaning not visible in this header
 */
struct inet_skb_parm {
        int                        iif;
        struct ip_options        opt;                /* Compiled IP options                */
        u16                        flags;

/* Bit values for inet_skb_parm::flags. */
#define IPSKB_FORWARDED                BIT(0)
#define IPSKB_XFRM_TUNNEL_SIZE        BIT(1)
#define IPSKB_XFRM_TRANSFORMED        BIT(2)
#define IPSKB_FRAG_COMPLETE        BIT(3)
#define IPSKB_REROUTED                BIT(4)
#define IPSKB_DOREDIRECT        BIT(5)
#define IPSKB_FRAG_PMTU                BIT(6)
#define IPSKB_L3SLAVE                BIT(7)
#define IPSKB_NOPOLICY                BIT(8)
#define IPSKB_MULTIPATH                BIT(9)
#define IPSKB_MCROUTE                BIT(10)

        u16                        frag_max_size;
};

/* Report whether the IPSKB_L3SLAVE bit is set in @flags. */
static inline bool ipv4_l3mdev_skb(u16 flags)
{
        if (flags & IPSKB_L3SLAVE)
                return true;

        return false;
}

static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
{
        return ip_hdr(skb)->ihl * 4;
}

/*
 * Per-send IPv4 parameters. ipcm_init() zeroes every member except @tos,
 * which it sets to -1. ipcm_init_sk() fills @sockc, @tos, @oif, @addr and
 * @protocol from a socket and leaves the rest zero.
 */
struct ipcm_cookie {
        struct sockcm_cookie        sockc;
        __be32                        addr;
        int                        oif;
        struct ip_options_rcu        *opt;
        __u8                        protocol;
        __u8                        ttl;
        __s16                        tos;
        __u16                        gso_size;
};

static inline void ipcm_init(struct ipcm_cookie *ipcm)
{
        *ipcm = (struct ipcm_cookie) { .tos = -1 };
}

static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
                                const struct inet_sock *inet)
{
        *ipcm = (struct ipcm_cookie) {
                .tos = READ_ONCE(inet->tos),
        };

        sockcm_init(&ipcm->sockc, &inet->sk);

        ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
        ipcm->addr = inet->inet_saddr;
        ipcm->protocol = inet->inet_num;
}

/*
 * Both macros reinterpret the skb control buffer (skb->cb): IPCB() as a
 * struct inet_skb_parm, PKTINFO_SKB_CB() as a struct in_pktinfo.
 */
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
#define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb))

/*
 * Return the enslaved device index if relevant: IPCB(skb)->iif when
 * CONFIG_NET_L3_MASTER_DEV is enabled, @skb is non-NULL and its
 * IPSKB_L3SLAVE flag is set; 0 in every other case.
 */
static inline int inet_sdif(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (!skb)
                return 0;
        if (ipv4_l3mdev_skb(IPCB(skb)->flags))
                return IPCB(skb)->iif;
#endif
        return 0;
}

/* Special input handler for packets caught by the router alert option.
   They are selected only by the protocol field, and are then processed like
   local ones; but only if someone wants them! Otherwise, a router
   not running rsvpd will kill RSVP.

   What to do with them is a user-level problem.
   I have no idea how it will masquerade or NAT them (it is a joke, joke :-)),
   but the receiver should be clever enough, e.g., to forward mtrace requests
   sent to a multicast group so that they reach the destination designated router.
 */

/*
 * One entry in an RCU-protected singly linked chain (@next is __rcu and
 * the entry embeds an rcu_head). @destructor and @saved_sk share storage,
 * so only one of them is meaningful at a time.
 */
struct ip_ra_chain {
        struct ip_ra_chain __rcu *next;
        struct sock                *sk;
        union {
                void                        (*destructor)(struct sock *);
                struct sock                *saved_sk;
        };
        struct rcu_head                rcu;
};

/*
 * IP flags. Host-order masks; ip_is_fragment() converts them with htons()
 * before testing iphdr::frag_off.
 */
#define IP_CE                0x8000                /* Flag: "Congestion"                */
#define IP_DF                0x4000                /* Flag: "Don't Fragment"        */
#define IP_MF                0x2000                /* Flag: "More Fragments"        */
#define IP_OFFSET        0x1FFF                /* "Fragment Offset" part        */

#define IP_FRAG_TIME        (30 * HZ)                /* fragment lifetime        */

struct msghdr;
struct net_device;
struct packet_type;
struct rtable;
struct sockaddr;

int igmp_mc_init(void);

/*
 *        Functions provided by ip.c
 */

int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
                          __be32 saddr, __be32 daddr,
                          struct ip_options_rcu *opt, u8 tos);
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev);
void ip_list_rcv(struct list_head *head, struct packet_type *pt,
                 struct net_device *orig_dev);
int ip_local_deliver(struct sk_buff *skb);
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
int ip_mr_input(struct sk_buff *skb);
int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                   int (*output)(struct net *, struct sock *, struct sk_buff *));

/*
 * Cursor over a fragment list. @frag is the skb that ip_fraglist_next()
 * will hand out next. NOTE(review): @iph, @offset and @hlen are maintained
 * by ip_fraglist_init()/ip_fraglist_prepare(), whose bodies are not in
 * this header.
 */
struct ip_fraglist_iter {
        struct sk_buff        *frag;
        struct iphdr        *iph;
        int                offset;
        unsigned int        hlen;
};

void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
                      unsigned int hlen, struct ip_fraglist_iter *iter);
void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter);

/*
 * Pop the current fragment from @iter: advance iter->frag to the following
 * skb, mark the popped skb as not on a list, and return it.
 * iter->frag is dereferenced unconditionally, so it must be non-NULL.
 */
static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter)
{
        struct sk_buff *cur = iter->frag;

        iter->frag = cur->next;
        skb_mark_not_on_list(cur);

        return cur;
}

/*
 * Fragmentation state passed to ip_frag_init() and ip_frag_next().
 * @hlen, @ll_rs, @mtu and @DF mirror ip_frag_init() parameters of the same
 * names. NOTE(review): how the remaining members are used is defined by
 * those functions, whose bodies are not in this header.
 */
struct ip_frag_state {
        bool                DF;
        unsigned int        hlen;
        unsigned int        ll_rs;
        unsigned int        mtu;
        unsigned int        left;
        int                offset;
        int                ptr;
        __be16                not_last_frag;
};

void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs,
                  unsigned int mtu, bool DF, struct ip_frag_state *state);
struct sk_buff *ip_frag_next(struct sk_buff *skb,
                             struct ip_frag_state *state);

void ip_send_check(struct iphdr *ip);
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);

int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                    __u8 tos);
void ip_init(void);
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int len, int protolen,
                   struct ipcm_cookie *ipc,
                   struct rtable **rt,
                   unsigned int flags);
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd,
                       struct sk_buff *skb);
struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4,
                              struct sk_buff_head *queue,
                              struct inet_cork *cork);
int ip_send_skb(struct net *net, struct sk_buff *skb);
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4);
void ip_flush_pending_frames(struct sock *sk);
struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            struct ipcm_cookie *ipc, struct rtable **rtp,
                            struct inet_cork *cork, unsigned int flags);

int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);

/*
 * Call __ip_make_skb() on the socket's own write queue
 * (sk->sk_write_queue) and its own cork state (inet_sk(sk)->cork.base).
 */
static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
{
        struct sk_buff_head *queue = &sk->sk_write_queue;

        return __ip_make_skb(sk, fl4, queue, &inet_sk(sk)->cork.base);
}

/*
 * Get the route scope that should be used when sending a packet.
 *
 * RT_SCOPE_LINK is returned when the socket has SOCK_LOCALROUTE set, the
 * message carries MSG_DONTROUTE, or the IP options in @ipc request strict
 * source routing; RT_SCOPE_UNIVERSE is returned otherwise.
 */
static inline u8 ip_sendmsg_scope(const struct inet_sock *inet,
                                  const struct ipcm_cookie *ipc,
                                  const struct msghdr *msg)
{
        if (sock_flag(&inet->sk, SOCK_LOCALROUTE))
                return RT_SCOPE_LINK;

        if (msg->msg_flags & MSG_DONTROUTE)
                return RT_SCOPE_LINK;

        if (ipc->opt && ipc->opt->opt.is_strictroute)
                return RT_SCOPE_LINK;

        return RT_SCOPE_UNIVERSE;
}

/* datagram.c */
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);

void ip4_datagram_release_cb(struct sock *sk);

/*
 * Argument bundle taken by ip_send_unicast_reply().
 * @flags is tested against IP_REPLY_ARG_NOSRCCHECK by
 * ip_reply_arg_flowi_flags().
 */
struct ip_reply_arg {
        struct kvec iov[1];
        int            flags;
        __wsum             csum;
        int            csumoffset; /* u16 offset of csum in iov[0].iov_base */
                                /* -1 if not needed */
        int            bound_dev_if;
        u8              tos;
        kuid_t            uid;
};

#define IP_REPLY_ARG_NOSRCCHECK 1

/*
 * Translate reply-argument flags into flowi flags: FLOWI_FLAG_ANYSRC when
 * IP_REPLY_ARG_NOSRCCHECK is set in arg->flags, 0 otherwise.
 */
static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
{
        if (arg->flags & IP_REPLY_ARG_NOSRCCHECK)
                return FLOWI_FLAG_ANYSRC;

        return 0;
}

void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
                           struct sk_buff *skb,
                           const struct ip_options *sopt,
                           __be32 daddr, __be32 saddr,
                           const struct ip_reply_arg *arg,
                           unsigned int len, u64 transmit_time, u32 txhash);

/*
 * Counter helpers. The IP_* macros forward to the SNMP_*_STATS64 helpers
 * on (net)->mib.ip_statistics; the NET_* macros forward to the SNMP_*_STATS
 * helpers on (net)->mib.net_statistics. Each double-underscore variant
 * maps to the matching double-underscore SNMP helper.
 */
#define IP_INC_STATS(net, field)        SNMP_INC_STATS64((net)->mib.ip_statistics, field)
#define __IP_INC_STATS(net, field)        __SNMP_INC_STATS64((net)->mib.ip_statistics, field)
#define IP_ADD_STATS(net, field, val)        SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val)
#define __IP_ADD_STATS(net, field, val) __SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val)
#define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val)
#define __IP_UPD_PO_STATS(net, field, val) __SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val)
#define NET_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.net_statistics, field)
#define __NET_INC_STATS(net, field)        __SNMP_INC_STATS((net)->mib.net_statistics, field)
#define NET_ADD_STATS(net, field, adnd)        SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd)
#define __NET_ADD_STATS(net, field, adnd) __SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd)

/*
 * Read one counter for one CPU: treat @cpu's instance of @mib as an array
 * of unsigned long and return element @offt.
 */
static inline u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt)
{
        unsigned long *counters = (unsigned long *)per_cpu_ptr(mib, cpu);

        return counters[offt];
}

unsigned long snmp_fold_field(void __percpu *mib, int offt);
#if BITS_PER_LONG==32
u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
                         size_t syncp_offset);
u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off);
#else
/*
 * Variant for BITS_PER_LONG != 32: @syncp_offset is ignored and the read
 * is forwarded to snmp_get_cpu_field().
 */
static inline u64  snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
                                        size_t syncp_offset)
{
        return snmp_get_cpu_field(mib, cpu, offct);

}

/*
 * Variant for BITS_PER_LONG != 32: @syncp_off is ignored and the call is
 * forwarded to snmp_fold_field().
 */
static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off)
{
        return snmp_fold_field(mib, offt);
}
#endif

/*
 * For every possible CPU, add snmp_get_cpu_field64() of
 * stats_list[i].entry to buff64[i] for i in [0, cnt). Values are
 * accumulated with +=, so buff64 keeps whatever it held before.
 *
 * NOTE(review): expands to a bare { } block that declares its own `i` and
 * `c`, and uses its arguments unparenthesized; pass simple identifiers and
 * avoid caller names `i`/`c`.
 */
#define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt,        \
                                       mib_statistic, offset)        \
{ \
        int i, c; \
        for_each_possible_cpu(c) { \
                for (i = 0; i < cnt; i++) \
                        buff64[i] += snmp_get_cpu_field64( \
                                        mib_statistic, \
                                        c, stats_list[i].entry, \
                                        offset); \
        } \
}

/*
 * For every possible CPU, add snmp_get_cpu_field() of stats_list[i].entry
 * to buff[i] for i in [0, cnt). Values are accumulated with +=, so buff
 * keeps whatever it held before.
 *
 * NOTE(review): expands to a bare { } block that declares its own `i` and
 * `c`, and uses its arguments unparenthesized; pass simple identifiers and
 * avoid caller names `i`/`c`.
 */
#define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \
{ \
        int i, c; \
        for_each_possible_cpu(c) { \
                for (i = 0; i < cnt; i++) \
                        buff[i] += snmp_get_cpu_field( \
                                                mib_statistic, \
                                                c, stats_list[i].entry); \
        } \
}

/*
 * Unpack the per-netns local port range. It is stored as one 32-bit word:
 * the low 16 bits are written to *@low and the high 16 bits to *@high.
 */
static inline void inet_get_local_port_range(const struct net *net, int *low, int *high)
{
        const u32 packed = READ_ONCE(net->ipv4.ip_local_ports.range);

        *low = packed & 0xffff;
        *high = packed >> 16;
}
bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);

#ifdef CONFIG_SYSCTL
/*
 * Test @port against the netns reserved-ports bitmap. Returns false when
 * no bitmap is installed.
 */
static inline bool inet_is_local_reserved_port(const struct net *net, unsigned short port)
{
        if (net->ipv4.sysctl_local_reserved_ports)
                return test_bit(port, net->ipv4.sysctl_local_reserved_ports);

        return false;
}

/*
 * A device name is allowed unless it is exactly "default" or "all".
 * The comparison is case-sensitive and matches the whole string.
 */
static inline bool sysctl_dev_name_is_allowed(const char *name)
{
        if (!strcmp(name, "default"))
                return false;
        if (!strcmp(name, "all"))
                return false;

        return true;
}

/*
 * True when @port is strictly below the netns sysctl_ip_prot_sock value
 * (read once with READ_ONCE()).
 */
static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port)
{
        return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock);
}

#else
/*
 * Variant for builds without CONFIG_SYSCTL: no port is ever reported as
 * reserved.
 *
 * @net is const-qualified so the prototype matches the CONFIG_SYSCTL
 * variant; callers that hold a const struct net * then build without
 * qualifier warnings in both configurations.
 */
static inline bool inet_is_local_reserved_port(const struct net *net, unsigned short port)
{
        return false;
}

/*
 * Variant for builds without CONFIG_SYSCTL: @net is unused and the
 * threshold is the compile-time PROT_SOCK constant.
 */
static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port)
{
        return port < PROT_SOCK;
}
#endif

__be32 inet_current_timestamp(void);

/* From inetpeer.c */
extern int inet_peer_threshold;
extern int inet_peer_minttl;
extern int inet_peer_maxttl;

void ipfrag_init(void);

void ip_static_sysctl_init(void);

/*
 * Mark to use on a reply: @mark when the netns sysctl_fwmark_reflect
 * setting is non-zero, 0 otherwise.
 */
#define IP4_REPLY_MARK(net, mark) \
        (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0)

static inline bool ip_is_fragment(const struct iphdr *iph)
{
        return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
}

#ifdef CONFIG_INET
#include <net/dst.h>

/* The function in 2.2 was invalid, producing wrong result for
 * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
static inline
int ip_decrease_ttl(struct iphdr *iph)
{
        u32 check = (__force u32)iph->check;
        check += (__force u32)htons(0x0100);
        iph->check = (__force __sum16)(check + (check>=0xFFFF));
        return --iph->ttl;
}

/* Convert the tos byte of @ip4h to a dscp_t via inet_dsfield_to_dscp(). */
static inline dscp_t ip4h_dscp(const struct iphdr *ip4h)
{
        return inet_dsfield_to_dscp(ip4h->tos);
}

static inline int ip_mtu_locked(const struct dst_entry *dst)
{
        const struct rtable *rt = dst_rtable(dst);

        return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU);
}

/*
 * Non-zero when the socket's PMTU discovery mode is IP_PMTUDISC_DO, or is
 * IP_PMTUDISC_WANT while the MTU of @dst is not locked.
 */
static inline
int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
{
        const u8 mode = READ_ONCE(inet_sk(sk)->pmtudisc);

        if (mode == IP_PMTUDISC_DO)
                return 1;
        if (mode != IP_PMTUDISC_WANT)
                return 0;

        return !ip_mtu_locked(dst);
}

static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
        u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);

        return pmtudisc != IP_PMTUDISC_INTERFACE &&
               pmtudisc != IP_PMTUDISC_OMIT;
}

static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
        return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE;
}

static inline bool ip_sk_ignore_df(const struct sock *sk)
{
        u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);

        return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT;
}

/*
 * Compute the MTU to use for @dst, less any lwtunnel headroom.
 *
 * Sources are tried in this order:
 *   1. the route's rt_pmtu, but only if it is non-zero and not yet expired,
 *      and only when sysctl_ip_fwd_use_pmtu is set, the MTU is locked, or
 *      the packet is not being forwarded;
 *   2. the raw RTAX_MTU metric, if non-zero;
 *   3. the device MTU, capped at 576 when the MTU is locked and the route
 *      uses a gateway.
 * The chosen value is clamped to IP_MAX_MTU before the headroom is
 * subtracted. The whole lookup runs inside an RCU read-side section.
 */
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
                                                    bool forwarding)
{
        const struct rtable *rt = dst_rtable(dst);
        const struct net_device *dev;
        unsigned int mtu, res;
        struct net *net;

        rcu_read_lock();

        dev = dst_dev_rcu(dst);
        net = dev_net_rcu(dev);
        if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
            ip_mtu_locked(dst) ||
            !forwarding) {
                mtu = rt->rt_pmtu;
                /* Use the learned path MTU only while it has not expired. */
                if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires)))
                        goto out;
        }

        /* 'forwarding = true' case should always honour route mtu */
        mtu = dst_metric_raw(dst, RTAX_MTU);
        if (mtu)
                goto out;

        mtu = READ_ONCE(dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

out:
        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        /* Computed before rcu_read_unlock(), inside the read-side section. */
        res = mtu - lwtunnel_headroom(dst->lwtstate, mtu);

        rcu_read_unlock();

        return res;
}

/*
 * MTU for sending @skb on behalf of @sk.
 *
 * A full socket whose PMTU mode makes ip_sk_use_pmtu() false gets the
 * device MTU (capped at IP_MAX_MTU) minus lwtunnel headroom. In every
 * other case (no socket, not a full socket, or PMTU in use) the result
 * comes from ip_dst_mtu_maybe_forward(), with "forwarding" taken from the
 * skb's IPSKB_FORWARDED flag.
 */
static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
                                          const struct sk_buff *skb)
{
        const struct dst_entry *dst = skb_dst(skb);
        unsigned int mtu;

        if (sk && sk_fullsock(sk) && !ip_sk_use_pmtu(sk)) {
                mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU);
                return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
        }

        return ip_dst_mtu_maybe_forward(dst,
                                        IPCB(skb)->flags & IPSKB_FORWARDED);
}

struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len,
                                        struct netlink_ext_ack *extack);
/*
 * Drop one reference on @fib_metrics and free it when that was the last
 * one. The shared dst_default_metrics object is never touched.
 */
static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics)
{
        if (fib_metrics == &dst_default_metrics)
                return;

        if (refcount_dec_and_test(&fib_metrics->refcnt))
                kfree(fib_metrics);
}

/*
 * ipv4 and ipv6 both use refcounted metrics if it is not the default.
 *
 * Point @dst at @fib_metrics. For anything other than dst_default_metrics,
 * also flag the dst as DST_METRICS_REFCOUNTED and take a reference.
 */
static inline
void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics)
{
        dst_init_metrics(dst, fib_metrics->metrics, true);

        if (fib_metrics == &dst_default_metrics)
                return;

        dst->_metrics |= DST_METRICS_REFCOUNTED;
        refcount_inc(&fib_metrics->refcnt);
}

/*
 * Drop the reference @dst holds on its metrics and free them when that
 * was the last one. The shared dst_default_metrics object is never
 * touched.
 */
static inline
void ip_dst_metrics_put(struct dst_entry *dst)
{
        struct dst_metrics *metrics = (struct dst_metrics *)DST_METRICS_PTR(dst);

        if (metrics == &dst_default_metrics)
                return;

        if (refcount_dec_and_test(&metrics->refcnt))
                kfree(metrics);
}

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);

/*
 * Choose the IP ID for the header of @skb, reserving @segs consecutive
 * IDs.
 *
 * With a socket that has a non-zero inet_daddr, the per-socket inet_id
 * counter is used and advanced by @segs. Note that the two branches differ
 * in which value is used: the TCP branch uses the counter value from
 * before the advance, the other branch uses the value returned by
 * atomic_add_return(), i.e. after the advance.
 *
 * Without such a socket, the ID is 0 when DF is set and skb->ignore_df is
 * clear; otherwise __ip_select_ident() picks it.
 */
static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb,
                                        struct sock *sk, int segs)
{
        struct iphdr *iph = ip_hdr(skb);

        /* We had many attacks based on IPID, use the private
         * generator as much as we can.
         */
        if (sk && inet_sk(sk)->inet_daddr) {
                int val;

                /* avoid atomic operations for TCP,
                 * as we hold socket lock at this point.
                 */
                if (sk_is_tcp(sk)) {
                        sock_owned_by_me(sk);
                        val = atomic_read(&inet_sk(sk)->inet_id);
                        atomic_set(&inet_sk(sk)->inet_id, val + segs);
                } else {
                        val = atomic_add_return(segs, &inet_sk(sk)->inet_id);
                }
                iph->id = htons(val);
                return;
        }
        if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) {
                iph->id = 0;
        } else {
                /* Unfortunately we need the big hammer to get a suitable IPID */
                __ip_select_ident(net, iph, segs);
        }
}

/* Single-segment form: calls ip_select_ident_segs() with segs == 1. */
static inline void ip_select_ident(struct net *net, struct sk_buff *skb,
                                   struct sock *sk)
{
        ip_select_ident_segs(net, skb, sk, 1);
}

static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto)
{
        return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                                  skb->len, proto, 0);
}

/* copy IPv4 saddr & daddr to flow_keys, possibly using 64bit load/store
 * Equivalent to :        flow->v4addrs.src = iph->saddr;
 *                        flow->v4addrs.dst = iph->daddr;
 *
 * Both addresses are copied with one memcpy() from iph->addrs. The
 * BUILD_BUG_ON() guarantees at compile time that v4addrs.dst sits directly
 * after v4addrs.src, which the single copy relies on. The address type is
 * also set to FLOW_DISSECTOR_KEY_IPV4_ADDRS.
 */
static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow,
                                            const struct iphdr *iph)
{
        BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) !=
                     offsetof(typeof(flow->addrs), v4addrs.src) +
                              sizeof(flow->addrs.v4addrs.src));
        memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs));
        flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
}

/*
 *        Map a multicast IP onto multicast MAC for type ethernet.
 *
 *        Writes six bytes to @buf: the fixed prefix 01:00:5e followed by
 *        the low 23 bits of the address (the top bit of buf[3] is
 *        always 0).
 */

static inline void ip_eth_mc_map(__be32 naddr, char *buf)
{
        const __u32 host = ntohl(naddr);

        buf[0] = 0x01;
        buf[1] = 0x00;
        buf[2] = 0x5e;
        buf[3] = (host >> 16) & 0x7F;
        buf[4] = (host >> 8) & 0xFF;
        buf[5] = host & 0xFF;
}

/*
 *        Map a multicast IP onto multicast MAC for type IP-over-InfiniBand.
 *
 *        Writes 20 bytes to @buf. The scope nibble comes from
 *        broadcast[5] and the P_Key bytes are copied from broadcast[8]
 *        and broadcast[9]. The low 28 bits of the IP address go into
 *        buf[16..19].
 */

static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
{
        const unsigned char scope = broadcast[5] & 0xF;
        const __u32 host = ntohl(naddr);
        int i;

        buf[0] = 0;                     /* Reserved */
        buf[1] = 0xff;                  /* Multicast QPN */
        buf[2] = 0xff;
        buf[3] = 0xff;
        buf[4] = 0xff;
        buf[5] = 0x10 | scope;          /* scope from broadcast address */
        buf[6] = 0x40;                  /* IPv4 signature */
        buf[7] = 0x1b;
        buf[8] = broadcast[8];          /* P_Key */
        buf[9] = broadcast[9];
        for (i = 10; i < 16; i++)
                buf[i] = 0;
        buf[16] = (host >> 24) & 0x0f;
        buf[17] = (host >> 16) & 0xff;
        buf[18] = (host >> 8) & 0xff;
        buf[19] = host & 0xff;
}

/*
 * Fill the first four bytes of @buf: with the first four bytes of
 * @broadcast when any of them is non-zero, otherwise with @naddr in
 * network byte order.
 */
static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
{
        const unsigned char any_set = broadcast[0] | broadcast[1] |
                                      broadcast[2] | broadcast[3];

        if (!any_set) {
                memcpy(buf, &naddr, sizeof(naddr));
                return;
        }

        memcpy(buf, broadcast, 4);
}

#if IS_ENABLED(CONFIG_IPV6)
#include <linux/ipv6.h>
#endif

static __inline__ void inet_reset_saddr(struct sock *sk)
{
        inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == PF_INET6) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                memset(&np->saddr, 0, sizeof(np->saddr));
                memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
        }
#endif
}

#endif

/*
 * Symbol exports that are only needed when IPv6 is built as a module:
 * they expand to EXPORT_SYMBOL()/EXPORT_SYMBOL_GPL() in that case and to
 * nothing otherwise.
 */
#if IS_MODULE(CONFIG_IPV6)
#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X)
#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X)
#else
#define EXPORT_IPV6_MOD(X)
#define EXPORT_IPV6_MOD_GPL(X)
#endif

/*
 * Identity "hash": returns the raw bits of @ip unchanged, with no byte
 * swap and no mixing.
 */
static inline unsigned int ipv4_addr_hash(__be32 ip)
{
        return (__force unsigned int) ip;
}

/* jhash_1word() of the raw bits of @ip, seeded with @initval. */
static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval)
{
        return jhash_1word((__force u32)ip, initval);
}

/*
 * Hash of an (address, port) pair: jhash_1word() of the raw bits of
 * @saddr, seeded with net_hash_mix(@net), then XORed with @port.
 */
static inline u32 ipv4_portaddr_hash(const struct net *net,
                                     __be32 saddr,
                                     unsigned int port)
{
        const u32 addr_hash = jhash_1word((__force u32)saddr,
                                          net_hash_mix(net));

        return addr_hash ^ port;
}

bool ip_call_ra_chain(struct sk_buff *skb);

/*
 *        Functions provided by ip_fragment.c
 */

/*
 * Values for the `user` argument of ip_defrag()/ip_check_defrag().
 *
 * Each of the three CONNTRACK entries is followed by a marker set to
 * "entry + USHRT_MAX", so each reserves a block of USHRT_MAX + 1
 * consecutive values and the next entry starts after that block.
 * ip_defrag_user_in_between() tests membership of such a range.
 *
 * NOTE(review): the bridge marker is named __IP_DEFRAG_CONNTRACK_BRIDGE_IN
 * without the _END suffix the other two markers carry.
 */
enum ip_defrag_users {
        IP_DEFRAG_LOCAL_DELIVER,
        IP_DEFRAG_CALL_RA_CHAIN,
        IP_DEFRAG_CONNTRACK_IN,
        __IP_DEFRAG_CONNTRACK_IN_END        = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX,
        IP_DEFRAG_CONNTRACK_OUT,
        __IP_DEFRAG_CONNTRACK_OUT_END        = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
        IP_DEFRAG_CONNTRACK_BRIDGE_IN,
        __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
        IP_DEFRAG_VS_IN,
        IP_DEFRAG_VS_OUT,
        IP_DEFRAG_VS_FWD,
        IP_DEFRAG_AF_PACKET,
        IP_DEFRAG_MACVLAN,
};

/* Return true if the value of 'user' lies in the closed range
 * ['lower_bond', 'upper_bond'], i.e. both bounds are included.
 */
static inline bool ip_defrag_user_in_between(u32 user,
                                             enum ip_defrag_users lower_bond,
                                             enum ip_defrag_users upper_bond)
{
        if (user < lower_bond)
                return false;

        return user <= upper_bond;
}

int ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
#ifdef CONFIG_INET
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user);
#else
/* Stub for builds without CONFIG_INET: hands @skb back unchanged. */
static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
        return skb;
}
#endif

/*
 *        Functions provided by ip_forward.c
 */

int ip_forward(struct sk_buff *skb);

/*
 *        Functions provided by ip_options.c
 */

void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
                      __be32 daddr, struct rtable *rt);

int __ip_options_echo(struct net *net, struct ip_options *dopt,
                      struct sk_buff *skb, const struct ip_options *sopt);
/*
 * Calls __ip_options_echo() using the options stored in the skb's own
 * control block (IPCB(skb)->opt) as the source options.
 */
static inline int ip_options_echo(struct net *net, struct ip_options *dopt,
                                  struct sk_buff *skb)
{
        return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt);
}

void ip_options_fragment(struct sk_buff *skb);
int __ip_options_compile(struct net *net, struct ip_options *opt,
                         struct sk_buff *skb, __be32 *info);
int ip_options_compile(struct net *net, struct ip_options *opt,
                       struct sk_buff *skb);
int ip_options_get(struct net *net, struct ip_options_rcu **optp,
                   sockptr_t data, int optlen);
void ip_options_undo(struct ip_options *opt);
void ip_forward_options(struct sk_buff *skb);
int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev);

/*
 *        Functions provided by ip_sockglue.c
 */

void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst);
void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
                         struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
                 struct ipcm_cookie *ipc, bool allow_ipv6);
DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                     unsigned int optlen);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                  unsigned int optlen);
int do_ip_getsockopt(struct sock *sk, int level, int optname,
                     sockptr_t optval, sockptr_t optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
                  int __user *optlen);
int ip_ra_control(struct sock *sk, unsigned char on,
                  void (*destructor)(struct sock *));

int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len);
void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
                   u32 info, u8 *payload);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
                    u32 info);

/*
 * Calls ip_cmsg_recv_offset() with the skb's own socket (skb->sk) and
 * with tlen and offset both 0.
 */
static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
        ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0);
}

bool icmp_global_allow(struct net *net);
void icmp_global_consume(struct net *net);

#ifdef CONFIG_PROC_FS
int ip_misc_proc_init(void);
#endif

int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
                                struct netlink_ext_ack *extack);

/*
 * True when @mtu is at least IPV4_MIN_MTU. likely() marks that as the
 * expected outcome.
 */
static inline bool inetdev_valid_mtu(unsigned int mtu)
{
        return likely(mtu >= IPV4_MIN_MTU);
}

void ip_sock_set_freebind(struct sock *sk);
int ip_sock_set_mtu_discover(struct sock *sk, int val);
void ip_sock_set_pktinfo(struct sock *sk);
void ip_sock_set_recverr(struct sock *sk);
void ip_sock_set_tos(struct sock *sk, int val);
void  __ip_sock_set_tos(struct sock *sk, int val);

#endif        /* _IP_H */




































   20 

    9 










   49 
   22 








    1 




    4 




   13 


   30 







   30 
   31 








   20 


   52 




   52 







   37 
   20 


   55 




   56 







   24 


   52 




   57 











   16 












    9 
    8 
    7 

   22 



   15 

   10 

   15 
    1 


   23 










   16 

   27 
   28 


   16 



   28 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFRM_HASH_H
#define _XFRM_HASH_H

#include <linux/xfrm.h>
#include <linux/socket.h>
#include <linux/jhash.h>

/* Hash of an IPv4 address: addr->a4 converted with ntohl(). */
static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr)
{
        return ntohl(addr->a4);
}

/*
 * Hash of an IPv6 address: jhash2() over the four 32-bit words of addr->a6,
 * with an initial value of 0.
 */
static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr)
{
        return jhash2((__force u32 *)addr->a6, 4, 0);
}

static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr)
{
        u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4;
        return ntohl((__force __be32)sum);
}

/*
 * Combined hash of an IPv6 destination/source pair: XOR of the two
 * per-address hashes.
 */
static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr)
{
        const unsigned int dst_hash = __xfrm6_addr_hash(daddr);
        const unsigned int src_hash = __xfrm6_addr_hash(saddr);

        return dst_hash ^ src_hash;
}

/*
 * Build a 32-bit mask whose top @bits bits are set.
 *
 * @bits == 0 yields 0 and @bits >= 32 yields all ones; both are handled
 * separately so the shift count below always stays in the range 1..31.
 */
static inline u32 __bits2mask32(__u8 bits)
{
        if (bits == 0)
                return 0;
        if (bits >= 32)
                return 0xffffffff;

        return (u32)0xffffffff << (32 - bits);
}

/*
 * Hash an IPv4 destination/source prefix pair.  Each address is converted
 * with ntohl(), masked down to its leading @dbits / @sbits bits, and the two
 * resulting words are fed to jhash_2words() with an initial value of 0.
 */
static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr,
                                                    __u8 dbits,
                                                    __u8 sbits)
{
        const unsigned int dst_prefix = ntohl(daddr->a4) & __bits2mask32(dbits);
        const unsigned int src_prefix = ntohl(saddr->a4) & __bits2mask32(sbits);

        return jhash_2words(dst_prefix, src_prefix, 0);
}

/*
 * Hash the leading @prefixlen bits of an IPv6 address.
 *
 * The whole 32-bit words covered by the prefix are hashed with jhash2().
 * If the prefix ends inside a word, the masked remainder of that word is
 * used as the jhash2() initial value; otherwise the initial value is 0.
 */
static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr,
                                             __u8 prefixlen)
{
        /* number of whole u32 words in the prefix */
        const unsigned int full_words = prefixlen >> 5;
        /* number of prefix bits in the trailing, incomplete u32 */
        const unsigned int tail_bits = prefixlen & 0x1f;
        u32 seed = 0;

        if (tail_bits != 0) {
                const __be32 tail_mask = htonl((0xffffffff) << (32 - tail_bits));

                seed = (__force u32)(addr->a6[full_words] & tail_mask);
        }

        return jhash2((__force u32 *)addr->a6, full_words, seed);
}

/*
 * Hash an IPv6 destination/source prefix pair: XOR of the two per-prefix
 * hashes.
 */
static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr,
                                                    __u8 dbits,
                                                    __u8 sbits)
{
        const unsigned int dst_hash = __xfrm6_pref_hash(daddr, dbits);
        const unsigned int src_hash = __xfrm6_pref_hash(saddr, sbits);

        return dst_hash ^ src_hash;
}

/*
 * Bucket index keyed on (daddr, saddr, reqid, family).
 *
 * The seed is family ^ reqid; for AF_INET / AF_INET6 the address-pair hash
 * is XORed in (other families contribute nothing more).  The upper 16 bits
 * are folded into the lower ones before masking with @hmask.
 */
static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr,
                                           const xfrm_address_t *saddr,
                                           u32 reqid, unsigned short family,
                                           unsigned int hmask)
{
        unsigned int mix = family ^ reqid;

        if (family == AF_INET)
                mix ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
        else if (family == AF_INET6)
                mix ^= __xfrm6_daddr_saddr_hash(daddr, saddr);

        mix ^= mix >> 16;
        return mix & hmask;
}

/*
 * Bucket index keyed on (daddr, saddr, family).
 *
 * The seed is @family; for AF_INET / AF_INET6 the address-pair hash is XORed
 * in.  The upper 16 bits are folded into the lower ones before masking with
 * @hmask.
 */
static inline unsigned int __xfrm_src_hash(const xfrm_address_t *daddr,
                                           const xfrm_address_t *saddr,
                                           unsigned short family,
                                           unsigned int hmask)
{
        unsigned int mix = family;

        if (family == AF_INET)
                mix ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
        else if (family == AF_INET6)
                mix ^= __xfrm6_daddr_saddr_hash(daddr, saddr);

        mix ^= mix >> 16;
        return mix & hmask;
}

/*
 * Bucket index keyed on (daddr, spi, proto, family).
 *
 * The seed is the raw SPI XORed with @proto; for AF_INET / AF_INET6 the
 * destination address hash is XORed in.  The value is then folded with
 * shifts of 10 and 20 bits and masked with @hmask.
 */
static inline unsigned int
__xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto,
                unsigned short family, unsigned int hmask)
{
        unsigned int mix = (__force u32)spi ^ proto;

        if (family == AF_INET)
                mix ^= __xfrm4_addr_hash(daddr);
        else if (family == AF_INET6)
                mix ^= __xfrm6_addr_hash(daddr);

        return (mix ^ (mix >> 10) ^ (mix >> 20)) & hmask;
}

/* Bucket index for a sequence number: fold with shifts of 10 and 20, then mask. */
static inline unsigned int
__xfrm_seq_hash(u32 seq, unsigned int hmask)
{
        const unsigned int folded = seq ^ (seq >> 10) ^ (seq >> 20);

        return folded & hmask;
}

/* Bucket index for an index value: XOR in the value shifted right by 8, then mask. */
static inline unsigned int __idx_hash(u32 index, unsigned int hmask)
{
        const unsigned int folded = index ^ (index >> 8);

        return folded & hmask;
}

/*
 * Bucket index for a selector, hashing only the leading @dbits / @sbits bits
 * of its destination / source address.
 *
 * For AF_INET and AF_INET6, a selector whose own prefix length is shorter
 * than the requested number of bits cannot be hashed this way; in that case
 * @hmask + 1 is returned, which is outside the 0..@hmask bucket range.
 * For any other family the hash input is 0.
 */
static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
                                      unsigned short family, unsigned int hmask,
                                      u8 dbits, u8 sbits)
{
        unsigned int hash = 0;

        if (family == AF_INET || family == AF_INET6) {
                if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits)
                        return hmask + 1;

                if (family == AF_INET)
                        hash = __xfrm4_dpref_spref_hash(&sel->daddr,
                                                        &sel->saddr,
                                                        dbits, sbits);
                else
                        hash = __xfrm6_dpref_spref_hash(&sel->daddr,
                                                        &sel->saddr,
                                                        dbits, sbits);
        }

        return (hash ^ (hash >> 16)) & hmask;
}

/*
 * Bucket index for an address pair, hashing only the leading @dbits / @sbits
 * bits of each address.  Families other than AF_INET / AF_INET6 hash to 0.
 */
static inline unsigned int __addr_hash(const xfrm_address_t *daddr,
                                       const xfrm_address_t *saddr,
                                       unsigned short family,
                                       unsigned int hmask,
                                       u8 dbits, u8 sbits)
{
        unsigned int hash = 0;

        if (family == AF_INET)
                hash = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits);
        else if (family == AF_INET6)
                hash = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits);

        return (hash ^ (hash >> 16)) & hmask;
}

struct hlist_head *xfrm_hash_alloc(unsigned int sz);
void xfrm_hash_free(struct hlist_head *n, unsigned int sz);

#endif /* _XFRM_HASH_H */




















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
/* SPDX-License-Identifier: GPL-2.0-or-later */

#ifndef __CPUSET_INTERNAL_H
#define __CPUSET_INTERNAL_H

#include <linux/cgroup.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/spinlock.h>
#include <linux/union_find.h>

/* See "Frequency meter" comments, below. */

/* State of one frequency meter; @lock guards all of the other fields. */
struct fmeter {
        int cnt;                /* unprocessed events count */
        int val;                /* most recent output value */
        time64_t time;                /* clock (secs) when val computed */
        spinlock_t lock;        /* guards read or write of above */
};

/*
 * Invalid partition error code
 *
 * Stored in the prs_err field of struct cpuset.  PERR_NONE (0) is the
 * "no error" value; the remaining values take consecutive numbers.
 */
enum prs_errcode {
        PERR_NONE = 0,
        PERR_INVCPUS,
        PERR_INVPARENT,
        PERR_NOTPART,
        PERR_NOTEXCL,
        PERR_NOCPUS,
        PERR_HOTPLUG,
        PERR_CPUSEMPTY,
        PERR_HKEEPING,
        PERR_ACCESS,
        PERR_REMOTE,
};

/*
 * bits in struct cpuset flags field
 *
 * These are bit numbers (not masks); the is_*() helpers in this header pass
 * them to test_bit() on cs->flags.
 */
typedef enum {
        CS_CPU_EXCLUSIVE,
        CS_MEM_EXCLUSIVE,
        CS_MEM_HARDWALL,
        CS_MEMORY_MIGRATE,
        CS_SCHED_LOAD_BALANCE,
        CS_SPREAD_PAGE,
        CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/*
 * The various types of files and directories in a cpuset file system.
 *
 * NOTE(review): presumably used as the per-file identifier in the cftype
 * tables so shared handlers can tell the files apart - confirm at the users.
 */

typedef enum {
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
        FILE_EFFECTIVE_CPULIST,
        FILE_EFFECTIVE_MEMLIST,
        FILE_SUBPARTS_CPULIST,
        FILE_EXCLUSIVE_CPULIST,
        FILE_EFFECTIVE_XCPULIST,
        FILE_ISOLATED_CPULIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
        FILE_SCHED_LOAD_BALANCE,
        FILE_PARTITION_ROOT,
        FILE_SCHED_RELAX_DOMAIN_LEVEL,
        FILE_MEMORY_PRESSURE_ENABLED,
        FILE_MEMORY_PRESSURE,
        FILE_SPREAD_PAGE,
        FILE_SPREAD_SLAB,
} cpuset_filetype_t;

/*
 * Per-cgroup cpuset state.  The cgroup_subsys_state is embedded as @css, so a
 * css pointer can be mapped back to its cpuset with container_of().
 */
struct cpuset {
        struct cgroup_subsys_state css;

        unsigned long flags;                /* "unsigned long" so bitops work */

        /*
         * On default hierarchy:
         *
         * The user-configured masks can only be changed by writing to
         * cpuset.cpus and cpuset.mems, and won't be limited by the
         * parent masks.
         *
         * The effective masks are the real masks that apply to the tasks
         * in the cpuset. They may be changed if the configured masks are
         * changed or hotplug happens.
         *
         * effective_mask == configured_mask & parent's effective_mask,
         * and if it ends up empty, it will inherit the parent's mask.
         *
         *
         * On legacy hierarchy:
         *
         * The user-configured masks are always the same as the effective
         * masks.
         */

        /* user-configured CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t cpus_allowed;
        nodemask_t mems_allowed;

        /* effective CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t effective_cpus;
        nodemask_t effective_mems;

        /*
         * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
         *
         * The effective_cpus of a valid partition root comes solely from its
         * effective_xcpus and some of the effective_xcpus may be distributed
         * to sub-partitions below & hence excluded from its effective_cpus.
         * For a valid partition root, its effective_cpus have no relationship
         * with cpus_allowed unless its exclusive_cpus isn't set.
         *
         * This value will only be set if either exclusive_cpus is set or
         * when this cpuset becomes a local partition root.
         */
        cpumask_var_t effective_xcpus;

        /*
         * Exclusive CPUs as requested by the user (default hierarchy only)
         *
         * Its value is independent of cpus_allowed and designates the set of
         * CPUs that can be granted to the current cpuset or its children when
         * it becomes a valid partition root. The effective set of exclusive
         * CPUs granted (effective_xcpus) depends on whether those exclusive
         * CPUs are passed down by its ancestors and not yet taken up by
         * another sibling partition root along the way.
         *
         * If its value isn't set, it defaults to cpus_allowed.
         */
        cpumask_var_t exclusive_cpus;

        /*
         * These are the old Memory Nodes that tasks took on.
         *
         * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
         * - A new cpuset's old_mems_allowed is initialized when some
         *   task is moved into it.
         * - old_mems_allowed is used in cpuset_migrate_mm() when we change
         *   cpuset.mems_allowed and have tasks' nodemask updated, and
         *   then old_mems_allowed is updated to mems_allowed.
         */
        nodemask_t old_mems_allowed;

        struct fmeter fmeter;                /* memory_pressure filter */

        /*
         * Tasks are being attached to this cpuset.  Used to prevent
         * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
         */
        int attach_in_progress;

        /* for custom sched domain */
        int relax_domain_level;

        /* number of valid local child partitions */
        int nr_subparts;

        /* partition root state */
        int partition_root_state;

        /*
         * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
         * know when to rebuild associated root domain bandwidth information.
         */
        int nr_deadline_tasks;
        int nr_migrate_dl_tasks;
        u64 sum_migrate_dl_bw;

        /* Invalid partition error code, not lock protected */
        enum prs_errcode prs_err;

        /* Handle for cpuset.cpus.partition */
        struct cgroup_file partition_file;

        /* Remote partition sibling list anchored at remote_children */
        struct list_head remote_sibling;

        /* Used to merge intersecting subsets for generate_sched_domains */
        struct uf_node node;
};

/* Map a css to the cpuset that embeds it; a NULL css maps to NULL. */
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct cpuset, css);
}

/* Look up the cpuset a task belongs to, via the task's cpuset css. */
static inline struct cpuset *task_cs(struct task_struct *task)
{
        struct cgroup_subsys_state *task_cpuset_css;

        task_cpuset_css = task_css(task, cpuset_cgrp_id);
        return css_cs(task_cpuset_css);
}

/* Return the parent cpuset of @cs, or NULL if its css has no parent. */
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
        struct cgroup_subsys_state *parent_css = cs->css.parent;

        return css_cs(parent_css);
}

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
        return css_is_online(&cs->css) && !css_is_dying(&cs->css);
}

/* Nonzero iff the CS_CPU_EXCLUSIVE bit is set in cs->flags. */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
        return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

/* Nonzero iff the CS_MEM_EXCLUSIVE bit is set in cs->flags. */
static inline int is_mem_exclusive(const struct cpuset *cs)
{
        return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

/* Nonzero iff the CS_MEM_HARDWALL bit is set in cs->flags. */
static inline int is_mem_hardwall(const struct cpuset *cs)
{
        return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

/* Nonzero iff the CS_SCHED_LOAD_BALANCE bit is set in cs->flags. */
static inline int is_sched_load_balance(const struct cpuset *cs)
{
        return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

/* Nonzero iff the CS_MEMORY_MIGRATE bit is set in cs->flags. */
static inline int is_memory_migrate(const struct cpuset *cs)
{
        return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

/* Nonzero iff the CS_SPREAD_PAGE bit is set in cs->flags. */
static inline int is_spread_page(const struct cpuset *cs)
{
        return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

/* Nonzero iff the CS_SPREAD_SLAB bit is set in cs->flags. */
static inline int is_spread_slab(const struct cpuset *cs)
{
        return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 *
 * The expansion ends in an "if" without an "else": children that are not
 * online are skipped by that test, and an "else" written directly after the
 * loop body would attach to it.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)                \
        css_for_each_child((pos_css), &(parent_cs)->css)                \
                if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset whose descendants are walked
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 *
 * The expansion ends in an "if" without an "else": descendants that are not
 * online are skipped by that test, and an "else" written directly after the
 * loop body would attach to it.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
        css_for_each_descendant_pre((pos_css), &(root_cs)->css)                \
                if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

void rebuild_sched_domains_locked(void);
void cpuset_callback_lock_irq(void);
void cpuset_callback_unlock_irq(void);
void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
void cpuset_update_tasks_nodemask(struct cpuset *cs);
int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off);
int cpuset_common_seq_show(struct seq_file *sf, void *v);
void cpuset_full_lock(void);
void cpuset_full_unlock(void);

/*
 * cpuset-v1.c
 *
 * With CONFIG_CPUSETS_V1 the functions below are only declared here.
 * Without it they are replaced by inline stubs that do nothing
 * (cpuset1_validate_change() returns 0), so callers need no #ifdefs.
 */
#ifdef CONFIG_CPUSETS_V1
extern struct cftype cpuset1_files[];
void fmeter_init(struct fmeter *fmp);
void cpuset1_update_task_spread_flags(struct cpuset *cs,
                                        struct task_struct *tsk);
void cpuset1_update_tasks_flags(struct cpuset *cs);
void cpuset1_hotplug_update_tasks(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated);
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
#else
static inline void fmeter_init(struct fmeter *fmp) {}
static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
                                        struct task_struct *tsk) {}
static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated) {}
static inline int cpuset1_validate_change(struct cpuset *cur,
                                struct cpuset *trial) { return 0; }
#endif /* CONFIG_CPUSETS_V1 */

#endif /* __CPUSET_INTERNAL_H */





























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_PKT_SCHED_H
#define __NET_PKT_SCHED_H

#include <linux/jiffies.h>
#include <linux/ktime.h>
#include <linux/if_vlan.h>
#include <linux/netdevice.h>
#include <net/sch_generic.h>
#include <net/net_namespace.h>
#include <uapi/linux/pkt_sched.h>

#define DEFAULT_TX_QUEUE_LEN        1000
#define STAB_SIZE_LOG_MAX        30

/*
 * Iteration state passed to a walk callback.  As used by
 * tc_qdisc_stats_dump():
 *   @stop:  set to 1 once @fn has returned a negative value
 *   @skip:  @fn is only invoked for entries whose position (@count) is at
 *           least @skip
 *   @count: incremented for every entry that did not stop the walk
 *   @fn:    callback; a negative return value stops the walk
 */
struct qdisc_walker {
        int        stop;
        int        skip;
        int        count;
        int        (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *);
};

/*
 * qdisc_priv - return a pointer to the privdata member of a qdisc
 * @q: pointer to a struct Qdisc (const or non-const)
 *
 * _Generic preserves const-ness: a const struct Qdisc * yields a
 * const void *, a struct Qdisc * yields a void *.  There is no default
 * association, so any other argument type fails to compile.
 *
 * @q is parenthesized in the expansion so that pointer expressions such as
 * "base + 1" bind correctly to the "->" operator.
 */
#define qdisc_priv(q)                                                        \
        _Generic((q),                                                        \
                 const struct Qdisc * : (const void *)&(q)->privdata,        \
                 struct Qdisc * : (void *)&(q)->privdata)

/*
 * Inverse of qdisc_priv(): map the address of a qdisc's privdata member back
 * to the enclosing struct Qdisc with container_of().
 */
static inline struct Qdisc *qdisc_from_priv(void *priv)
{
        return container_of(priv, struct Qdisc, privdata);
}

/*
 * Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth.
 *
 * A normal IP packet is ~512 bytes, hence:
 *
 * 0.5 Kbyte / 1 Mbyte/sec = 0.5 msec, so we need a 50 usec timer for
 * 10 Mbit ethernet.
 *
 * 10 msec resolution -> <50 Kbit/sec.
 *
 * The result: [34]86 is not a good choice for a QoS router :-(
 *
 * Things are not so bad, because we may use an artificial
 * clock evaluated by integration of network data flow
 * in the most critical places.
 */

/* Packet scheduler time, counted in psched ticks, and a signed difference. */
typedef u64        psched_time_t;
typedef long        psched_tdiff_t;

/*
 * Avoid doing 64 bit divide: one psched tick is 2^PSCHED_SHIFT (64) ns, so
 * converting between ticks and nanoseconds is a plain shift.
 */
#define PSCHED_SHIFT                        6
#define PSCHED_TICKS2NS(x)                ((s64)(x) << PSCHED_SHIFT)
#define PSCHED_NS2TICKS(x)                ((x) >> PSCHED_SHIFT)

/* Number of psched ticks in one second (NSEC_PER_SEC >> PSCHED_SHIFT). */
#define PSCHED_TICKS_PER_SEC                PSCHED_NS2TICKS(NSEC_PER_SEC)
#define PSCHED_PASTPERFECT                0

/* Current time in psched ticks: ktime_get_ns() scaled by PSCHED_NS2TICKS(). */
static inline psched_time_t psched_get_time(void)
{
        return PSCHED_NS2TICKS(ktime_get_ns());
}

/*
 * A high-resolution timer paired with the qdisc it belongs to; operated
 * through the qdisc_watchdog_*() functions declared below.
 */
struct qdisc_watchdog {
        struct hrtimer        timer;
        struct Qdisc        *qdisc;
};

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
                                 clockid_t clockid);
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc);

void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
                                      u64 delta_ns);

/*
 * Schedule the watchdog for @expires (in nanoseconds) with no slack: this is
 * qdisc_watchdog_schedule_range_ns() with a delta of 0.
 *
 * The callee is invoked as a plain statement rather than via "return expr;",
 * because ISO C does not allow a return statement with an expression in a
 * function returning void.
 */
static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd,
                                              u64 expires)
{
        qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL);
}

/*
 * Schedule the watchdog for @expires given in psched ticks; the value is
 * converted to nanoseconds with PSCHED_TICKS2NS() first.
 */
static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd,
                                           psched_time_t expires)
{
        qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires));
}

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd);

extern struct Qdisc_ops pfifo_qdisc_ops;
extern struct Qdisc_ops bfifo_qdisc_ops;
extern struct Qdisc_ops pfifo_head_drop_qdisc_ops;

int fifo_set_limit(struct Qdisc *q, unsigned int limit);
struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
                               unsigned int limit,
                               struct netlink_ext_ack *extack);

int register_qdisc(struct Qdisc_ops *qops);
void unregister_qdisc(struct Qdisc_ops *qops);
#define NET_SCH_ALIAS_PREFIX "net-sch-"
#define MODULE_ALIAS_NET_SCH(id)        MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id)
void qdisc_get_default(char *id, size_t len);
int qdisc_set_default(const char *id);

void qdisc_hash_add(struct Qdisc *q, bool invisible);
void qdisc_hash_del(struct Qdisc *q);
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle);
struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle);
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
                                        struct nlattr *tab,
                                        struct netlink_ext_ack *extack);
void qdisc_put_rtab(struct qdisc_rate_table *tab);
void qdisc_put_stab(struct qdisc_size_table *tab);
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
                     struct net_device *dev, struct netdev_queue *txq,
                     spinlock_t *root_lock, bool validate);

void __qdisc_run(struct Qdisc *q);

/*
 * Run the qdisc if qdisc_run_begin() grants it: __qdisc_run() is bracketed by
 * qdisc_run_begin()/qdisc_run_end().  Nothing happens when qdisc_run_begin()
 * returns false.
 */
static inline void qdisc_run(struct Qdisc *q)
{
        if (!qdisc_run_begin(q))
                return;

        __qdisc_run(q);
        qdisc_run_end(q);
}

extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];

/*
 * Largest packet size the device's hard_start_xmit routine can see:
 * the device MTU (read once) plus its hard header length.
 */
static inline unsigned int psched_mtu(const struct net_device *dev)
{
        const unsigned int mtu = READ_ONCE(dev->mtu);

        return mtu + dev->hard_header_len;
}

/* Network namespace of the device behind the qdisc's dev_queue. */
static inline struct net *qdisc_net(struct Qdisc *q)
{
        return dev_net(q->dev_queue->dev);
}

/*
 * Capability query: @type names the offload type being asked about and @caps
 * is an untyped pointer.
 * NOTE(review): @caps presumably points to the matching caps structure
 * (e.g. struct tc_mqprio_caps or struct tc_taprio_caps) - confirm at callers.
 */
struct tc_query_caps_base {
        enum tc_setup_type type;
        void *caps;
};

/*
 * Offload parameters for the cbs qdisc.
 * NOTE(review): field semantics are not visible in this header; presumably
 * they mirror the user-space credit-based shaper options - confirm against
 * the cbs qdisc implementation.
 */
struct tc_cbs_qopt_offload {
        u8 enable;
        s32 queue;
        s32 hicredit;
        s32 locredit;
        s32 idleslope;
        s32 sendslope;
};

/*
 * Offload parameters for the etf qdisc.
 * NOTE(review): presumably @enable switches offload on/off for TX queue
 * @queue - confirm against the etf qdisc implementation.
 */
struct tc_etf_qopt_offload {
        u8 enable;
        s32 queue;
};

/*
 * mqprio capability flags, one bit each.
 * NOTE(review): presumably filled in by the driver in answer to a
 * struct tc_query_caps_base query - confirm.
 */
struct tc_mqprio_caps {
        bool validate_queue_counts:1;
};

/*
 * mqprio offload request.  Also embedded as the first member of the
 * TAPRIO_CMD_REPLACE payload in struct tc_taprio_qopt_offload.
 */
struct tc_mqprio_qopt_offload {
        /* struct tc_mqprio_qopt must always be the first element */
        struct tc_mqprio_qopt qopt;
        struct netlink_ext_ack *extack;
        u16 mode;
        u16 shaper;
        u32 flags;
        /* per-queue rate arrays, TC_QOPT_MAX_QUEUE entries each */
        u64 min_rate[TC_QOPT_MAX_QUEUE];
        u64 max_rate[TC_QOPT_MAX_QUEUE];
        unsigned long preemptible_tcs;
};

/* taprio capability flags, one bit each. */
struct tc_taprio_caps {
        bool supports_queue_max_sdu:1;
        bool gate_mask_per_txq:1;
        /* Device expects lower TXQ numbers to have higher priority over higher
         * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS,
         * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE.
         */
        bool broken_mqprio:1;
};

/*
 * Command selector for struct tc_taprio_qopt_offload; it determines which
 * member of that structure's union is in use (REPLACE, STATS and QUEUE_STATS
 * each have one).
 */
enum tc_taprio_qopt_cmd {
        TAPRIO_CMD_REPLACE,
        TAPRIO_CMD_DESTROY,
        TAPRIO_CMD_STATS,
        TAPRIO_CMD_QUEUE_STATS,
};

/**
 * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics
 * @window_drops: Frames that were dropped because they were too large to be
 *        transmitted in any of the allotted time windows (open gates) for their
 *        traffic class.
 * @tx_overruns: Frames still being transmitted by the MAC after the
 *        transmission gate associated with their traffic class has closed.
 *        Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018.
 *
 * Carried in struct tc_taprio_qopt_offload for TAPRIO_CMD_STATS, and per
 * queue inside struct tc_taprio_qopt_queue_stats.
 */
struct tc_taprio_qopt_stats {
        u64 window_drops;
        u64 tx_overruns;
};

/* Statistics for one queue: the queue number plus its counters. */
struct tc_taprio_qopt_queue_stats {
        int queue;
        struct tc_taprio_qopt_stats stats;
};

/* One element of the entries[] array in struct tc_taprio_qopt_offload. */
struct tc_taprio_sched_entry {
        u8 command; /* TC_TAPRIO_CMD_* */

        /* The gate_mask in the offloading side refers to traffic classes */
        u32 gate_mask;
        u32 interval;
};

/*
 * taprio offload request.  @cmd selects which union member is in use, as
 * noted on each member.  The TAPRIO_CMD_REPLACE payload ends in a flexible
 * array of @num_entries schedule entries.
 */
struct tc_taprio_qopt_offload {
        enum tc_taprio_qopt_cmd cmd;

        union {
                /* TAPRIO_CMD_STATS */
                struct tc_taprio_qopt_stats stats;
                /* TAPRIO_CMD_QUEUE_STATS */
                struct tc_taprio_qopt_queue_stats queue_stats;
                /* TAPRIO_CMD_REPLACE */
                struct {
                        struct tc_mqprio_qopt_offload mqprio;
                        struct netlink_ext_ack *extack;
                        ktime_t base_time;
                        u64 cycle_time;
                        u64 cycle_time_extension;
                        u32 max_sdu[TC_MAX_QUEUE];

                        size_t num_entries;
                        struct tc_taprio_sched_entry entries[];
                };
        };
};

/*
 * With CONFIG_NET_SCH_TAPRIO enabled the two helpers below are only declared
 * here.  Otherwise inline stubs are used: taprio_offload_get() returns NULL
 * and taprio_offload_free() does nothing.
 */
#if IS_ENABLED(CONFIG_NET_SCH_TAPRIO)

/* Reference counting */
struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
                                                  *offload);
void taprio_offload_free(struct tc_taprio_qopt_offload *offload);

#else

/* Reference counting */
static inline struct tc_taprio_qopt_offload *
taprio_offload_get(struct tc_taprio_qopt_offload *offload)
{
        return NULL;
}

static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
{
}

#endif

/* Ensure skb_mstamp_ns, which might have been populated with the txtime, is
 * not mistaken for a software timestamp, because this will otherwise prevent
 * the dispatch of hardware timestamps to the socket.
 *
 * This is done by resetting skb->tstamp to ktime_set(0, 0).
 */
static inline void skb_txtime_consumed(struct sk_buff *skb)
{
        skb->tstamp = ktime_set(0, 0);
}

/*
 * Process one class during a walk.
 *
 * Entries before position arg->skip are only counted.  From that position
 * on, arg->fn() is called; if it returns a negative value the walk is marked
 * as stopped (arg->stop = 1) and false is returned without counting the
 * entry.  In every other case arg->count is incremented and true is returned.
 */
static inline bool tc_qdisc_stats_dump(struct Qdisc *sch,
                                       unsigned long cl,
                                       struct qdisc_walker *arg)
{
        if (arg->count >= arg->skip) {
                if (arg->fn(sch, cl, arg) < 0) {
                        arg->stop = 1;
                        return false;
                }
        }

        arg->count++;
        return true;
}

/*
 * Warn, at most once per qdisc, that the qdisc looks non-work-conserving.
 * TCQ_F_WARN_NONWC in qdisc->flags records that the warning was already
 * printed.
 */
static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_WARN_NONWC)
                return;

        pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
                txt, qdisc->ops->id, qdisc->handle >> 16);
        qdisc->flags |= TCQ_F_WARN_NONWC;
}

/*
 * Length of the packet that the qdisc's ->peek() operation returns.
 * If ->peek() returns NULL, the non-work-conserving warning is issued and 0
 * is returned.
 */
static inline unsigned int qdisc_peek_len(struct Qdisc *sch)
{
        struct sk_buff *head = sch->ops->peek(sch);

        if (unlikely(!head)) {
                qdisc_warn_nonwc("qdisc_peek_len", sch);
                return 0;
        }

        return qdisc_pkt_len(head);
}

#endif






























































































































   39 


   39 




















   38 























































































   39 




   39 







   39 
   39 
   39 



















































   39 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * internal.h - printk internal definitions
 */
#include <linux/console.h>
#include <linux/percpu.h>
#include <linux/types.h>

/*
 * The sysctl hooks are only declared when both CONFIG_PRINTK and
 * CONFIG_SYSCTL are set; otherwise printk_sysctl_init() expands to an empty
 * statement.
 */
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
struct ctl_table;
void __init printk_sysctl_init(void);
int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
                              void *buffer, size_t *lenp, loff_t *ppos);
#else
#define printk_sysctl_init() do { } while (0)
#endif

/*
 * con_printk - printk() a message prefixed with a description of a console
 * @lvl: log level string, pasted in front of the format
 * @con: pointer to the console being described
 * @fmt: format string, followed by its optional arguments
 *
 * The prefix is "[legacy ][boot]console [<name><index>] ": "legacy " is
 * printed unless CON_NBCON is set in con->flags, "boot" is printed when
 * CON_BOOT is set.
 *
 * @con is parenthesized in the expansion so that any pointer expression can
 * be passed; note that it is evaluated more than once.
 */
#define con_printk(lvl, con, fmt, ...)                                \
        printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),                \
                ((con)->flags & CON_NBCON) ? "" : "legacy ",        \
                ((con)->flags & CON_BOOT) ? "boot" : "",        \
                (con)->name, (con)->index, ##__VA_ARGS__)

/*
 * Identify if legacy printing is forced in a dedicated kthread. If
 * true, all printing via console lock occurs within a dedicated
 * legacy printer thread. The only exception is on panic, after the
 * nbcon consoles have had their chance to print the panic messages
 * first.
 *
 * This is a compile-time constant: true with CONFIG_PREEMPT_RT, false
 * otherwise.
 */
#ifdef CONFIG_PREEMPT_RT
# define force_legacy_kthread()        (true)
#else
# define force_legacy_kthread()        (false)
#endif

#ifdef CONFIG_PRINTK

#ifdef CONFIG_PRINTK_CALLER
#define PRINTK_PREFIX_MAX        48
#else
#define PRINTK_PREFIX_MAX        32
#endif

/*
 * the maximum size of a formatted record (i.e. with prefix added
 * per line and dropped messages or in extended message format)
 */
#define PRINTK_MESSAGE_MAX        2048

/* the maximum size allowed to be reserved for a record */
#define PRINTKRB_RECORD_MAX        1024

/*
 * Flags for a single printk record.
 *
 * Each flag is a distinct single bit (1, 2 and 8); the value 4 is not
 * assigned in this enum.
 */
enum printk_info_flags {
        /* always show on console, ignore console_loglevel */
        LOG_FORCE_CON        = 1,
        LOG_NEWLINE        = 2,        /* text ended with a newline */
        LOG_CONT        = 8,        /* text is a fragment of a continuation line */
};

struct printk_ringbuffer;
struct dev_printk_info;

extern struct printk_ringbuffer *prb;
extern bool printk_kthreads_running;
extern bool printk_kthreads_ready;
extern bool debug_non_panic_cpus;

__printf(4, 0)
int vprintk_store(int facility, int level,
                  const struct dev_printk_info *dev_info,
                  const char *fmt, va_list args);

__printf(1, 0) int vprintk_default(const char *fmt, va_list args);

void __printk_safe_enter(void);
void __printk_safe_exit(void);

bool printk_percpu_data_ready(void);

/*
 * Disable local interrupts, saving the previous state in @flags, and then
 * enter the printk-safe section via __printk_safe_enter().
 */
#define printk_safe_enter_irqsave(flags)        \
        do {                                        \
                local_irq_save(flags);                \
                __printk_safe_enter();                \
        } while (0)

/*
 * Counterpart of printk_safe_enter_irqsave(): leave the printk-safe section
 * first and only then restore the interrupt state saved in @flags, i.e. the
 * reverse order of the enter macro.
 */
#define printk_safe_exit_irqrestore(flags)        \
        do {                                        \
                __printk_safe_exit();                \
                local_irq_restore(flags);        \
        } while (0)

void defer_console_output(void);
bool is_printk_legacy_deferred(void);
bool is_printk_force_console(void);

u16 printk_parse_prefix(const char *text, int *level,
                        enum printk_info_flags *flags);
void console_lock_spinning_enable(void);
int console_lock_spinning_disable_and_check(int cookie);

u64 nbcon_seq_read(struct console *con);
void nbcon_seq_force(struct console *con, u64 seq);
bool nbcon_alloc(struct console *con);
void nbcon_free(struct console *con);
enum nbcon_prio nbcon_get_default_prio(void);
void nbcon_atomic_flush_pending(void);
bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
                                   int cookie, bool use_atomic);
bool nbcon_kthread_create(struct console *con);
void nbcon_kthread_stop(struct console *con);
void nbcon_kthreads_wake(void);

/*
 * Decide whether @con, described by the flag snapshot @flags, is currently
 * capable of and allowed to print records. @use_atomic selects whether the
 * caller intends to use the atomic write path of an nbcon console.
 *
 * The calling context is not taken into account here, although it can also
 * influence whether @con may be used to print records.
 */
static inline bool console_is_usable(struct console *con, short flags, bool use_atomic)
{
        /* The console must be enabled ... */
        if (!(flags & CON_ENABLED))
                return false;

        /* ... and must not be suspended. */
        if (flags & CON_SUSPENDED)
                return false;

        if (flags & CON_NBCON) {
                /*
                 * The write_atomic() callback is optional, so it is only
                 * required when the atomic path was requested.
                 *
                 * For the !use_atomic case, @printk_kthreads_running is not
                 * checked because the write_thread() callback is also used
                 * via the legacy loop when the printer threads are not
                 * available.
                 */
                if (use_atomic && !con->write_atomic)
                        return false;
        } else if (!con->write) {
                /* Legacy consoles need a write() callback. */
                return false;
        }

        /*
         * Console drivers may assume that per-cpu resources have been
         * allocated. Unless a driver is explicitly marked as being able to
         * cope (CON_ANYTIME), it is not used until this CPU is officially up.
         */
        return cpu_online(raw_smp_processor_id()) || (flags & CON_ANYTIME);
}

/**
 * nbcon_kthread_wake - Wake up a console printing thread
 * @con:        Console to operate on
 *
 * Wakes the waiter registered on @con->rcuwait via rcuwait_wake_up().
 */
static inline void nbcon_kthread_wake(struct console *con)
{
        /*
         * Guarantee any new records can be seen by tasks preparing to wait
         * before this context checks if the rcuwait is empty.
         *
         * The full memory barrier in rcuwait_wake_up() pairs with the full
         * memory barrier within set_current_state() of
         * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait()
         * adds the waiter but before it has checked the wait condition.
         *
         * This pairs with nbcon_kthread_func:A.
         */
        rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */
}

#else

#define PRINTK_PREFIX_MAX        0
#define PRINTK_MESSAGE_MAX        0
#define PRINTKRB_RECORD_MAX        0

#define printk_kthreads_running (false)
#define printk_kthreads_ready (false)

/*
 * In !PRINTK builds we still export the console_sem semaphore and some of
 * the console functions (console_unlock() etc.), so printk-safe must
 * preserve the existing local IRQ guarantees.
 */
#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)

static inline bool printk_percpu_data_ready(void) { return false; }
static inline void defer_console_output(void) { }
static inline bool is_printk_legacy_deferred(void) { return false; }
static inline u64 nbcon_seq_read(struct console *con) { return 0; }
static inline void nbcon_seq_force(struct console *con, u64 seq) { }
static inline bool nbcon_alloc(struct console *con) { return false; }
static inline void nbcon_free(struct console *con) { }
static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; }
static inline void nbcon_atomic_flush_pending(void) { }
static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
                                                 int cookie, bool use_atomic) { return false; }
static inline void nbcon_kthread_wake(struct console *con) { }
static inline void nbcon_kthreads_wake(void) { }

static inline bool console_is_usable(struct console *con, short flags,
                                     bool use_atomic) { return false; }

#endif /* CONFIG_PRINTK */

extern bool have_boot_console;
extern bool have_nbcon_console;
extern bool have_legacy_console;
extern bool legacy_allow_panic_sync;

/**
 * struct console_flush_type - Define available console flush methods
 * @nbcon_atomic:        Flush directly using nbcon_atomic() callback
 *                        (NOTE(review): presumably this refers to the
 *                        write_atomic() console callback - confirm)
 * @nbcon_offload:        Offload flush to printer thread
 * @legacy_direct:        Call the legacy loop in this context
 * @legacy_offload:        Offload the legacy loop into IRQ or legacy thread
 *
 * Note that the legacy loop also flushes the nbcon consoles.
 *
 * printk_get_console_flush_type() zeroes the whole structure and then sets
 * only the methods that apply to the calling context.
 */
struct console_flush_type {
        bool        nbcon_atomic;
        bool        nbcon_offload;
        bool        legacy_direct;
        bool        legacy_offload;
};

/*
 * Work out which console flushing methods apply in the context of the
 * caller. @ft is fully overwritten: it is cleared first and afterwards only
 * the applicable methods are set.
 */
static inline void printk_get_console_flush_type(struct console_flush_type *ft)
{
        memset(ft, 0, sizeof(*ft));

        switch (nbcon_get_default_prio()) {
        case NBCON_PRIO_NORMAL:
                if (have_nbcon_console && !have_boot_console) {
                        /* Use the printer threads once they are running. */
                        if (!printk_kthreads_running)
                                ft->nbcon_atomic = true;
                        else
                                ft->nbcon_offload = true;
                }

                /* Nothing more to decide without legacy or boot consoles. */
                if (!have_legacy_console && !have_boot_console)
                        break;

                /* Legacy consoles are flushed directly when possible. */
                if (is_printk_legacy_deferred())
                        ft->legacy_offload = true;
                else
                        ft->legacy_direct = true;
                break;

        case NBCON_PRIO_EMERGENCY:
                if (have_nbcon_console && !have_boot_console)
                        ft->nbcon_atomic = true;

                /* Nothing more to decide without legacy or boot consoles. */
                if (!have_legacy_console && !have_boot_console)
                        break;

                /* Legacy consoles are flushed directly when possible. */
                if (is_printk_legacy_deferred())
                        ft->legacy_offload = true;
                else
                        ft->legacy_direct = true;
                break;

        case NBCON_PRIO_PANIC:
                /*
                 * In panic, the nbcon consoles will directly print. But
                 * only allowed if there are no boot consoles.
                 */
                if (have_nbcon_console && !have_boot_console)
                        ft->nbcon_atomic = true;

                if (!have_legacy_console && !have_boot_console)
                        break;

                /*
                 * This is the same decision as NBCON_PRIO_NORMAL
                 * except that offloading never occurs in panic.
                 *
                 * Note that console_flush_on_panic() will flush
                 * legacy consoles anyway, even if unsafe.
                 */
                if (!is_printk_legacy_deferred())
                        ft->legacy_direct = true;

                /*
                 * In panic, if nbcon atomic printing occurs,
                 * the legacy consoles must remain silent until
                 * explicitly allowed.
                 */
                if (ft->nbcon_atomic && !legacy_allow_panic_sync)
                        ft->legacy_direct = false;
                break;

        default:
                /* Unknown priority: warn once and leave every method off. */
                WARN_ON_ONCE(1);
                break;
        }
}

extern struct printk_buffers printk_shared_pbufs;

/**
 * struct printk_buffers - Buffers to read/format/output printk messages.
 * @outbuf:        After formatting, contains text to output.
 * @scratchbuf:        Used as temporary ringbuffer reading and string-print space.
 *
 * The array sizes come from PRINTK_MESSAGE_MAX and PRINTKRB_RECORD_MAX,
 * which are both defined as 0 when CONFIG_PRINTK is disabled.
 */
struct printk_buffers {
        char        outbuf[PRINTK_MESSAGE_MAX];
        char        scratchbuf[PRINTKRB_RECORD_MAX];
};

/**
 * struct printk_message - Container for a prepared printk message.
 * @pbufs:        printk buffers used to prepare the message. This is a
 *                pointer; the buffers themselves are not embedded here.
 * @outbuf_len:        The length of prepared text in @pbufs->outbuf to output. This
 *                does not count the terminator. A value of 0 means there is
 *                nothing to output and this record should be skipped.
 * @seq:        The sequence number of the record used for @pbufs->outbuf.
 * @dropped:        The number of dropped records from reading @seq.
 */
struct printk_message {
        struct printk_buffers        *pbufs;
        unsigned int                outbuf_len;
        u64                        seq;
        unsigned long                dropped;
};

bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
                             bool is_extended, bool may_supress);

#ifdef CONFIG_PRINTK
void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
void console_prepend_replay(struct printk_message *pmsg);
#endif

#ifdef CONFIG_SMP
bool is_printk_cpu_sync_owner(void);
#else
static inline bool is_printk_cpu_sync_owner(void) { return false; }
#endif















































































































































































































    4 







































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
/* SPDX-License-Identifier: GPL-2.0 */
/* rwsem.h: R/W semaphores, public interface
 *
 * Written by David Howells (dhowells@redhat.com).
 * Derived from asm-i386/semaphore.h
 */

#ifndef _LINUX_RWSEM_H
#define _LINUX_RWSEM_H

#include <linux/linkage.h>

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/cleanup.h>

#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __RWSEM_DEP_MAP_INIT(lockname)                        \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_SLEEP,        \
        },
#else
# define __RWSEM_DEP_MAP_INIT(lockname)
#endif

#ifndef CONFIG_PREEMPT_RT

#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif

/*
 * For an uncontended rwsem, count and owner are the only fields a task
 * needs to touch when acquiring the rwsem. So they are put next to each
 * other to increase the chance that they will share the same cacheline.
 *
 * In a contended rwsem, the owner is likely the most frequently accessed
 * field in the structure as the optimistic waiter that holds the osq lock
 * will spin on owner. For an embedded rwsem, other hot fields in the
 * containing structure should be moved further away from the rwsem to
 * reduce the chance that they will share the same cacheline causing
 * cacheline bouncing problem.
 */
struct rw_semaphore {
        /*
         * Lock state word. The inline helpers in this header treat
         * RWSEM_UNLOCKED_VALUE as "not held" and test the
         * RWSEM_WRITER_LOCKED bit to detect a write holder.
         */
        atomic_long_t count;
        /*
         * Write owner or one of the read owners as well flags regarding
         * the current state of the rwsem. Can be used as a speculative
         * check to see if the write owner is running on the cpu.
         */
        atomic_long_t owner;
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
        struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
        /* NOTE(review): presumably protects wait_list - confirm in the implementation. */
        raw_spinlock_t wait_lock;
        /* A non-empty list is what rwsem_is_contended() reports as contention. */
        struct list_head wait_list;
#ifdef CONFIG_DEBUG_RWSEMS
        /* Statically initialized to the address of the rwsem itself (__RWSEM_DEBUG_INIT). */
        void *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Lockdep map; statically set up by __RWSEM_DEP_MAP_INIT(). */
        struct lockdep_map        dep_map;
#endif
};

#define RWSEM_UNLOCKED_VALUE                0UL
#define RWSEM_WRITER_LOCKED                (1UL << 0)
#define __RWSEM_COUNT_INIT(name)        .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)

static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
        return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE;
}

/* Warn if @sem is not held at all, judged from its count word alone. */
static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem)
{
        long cnt = atomic_long_read(&sem->count);

        WARN_ON(cnt == RWSEM_UNLOCKED_VALUE);
}

/* Warn if the RWSEM_WRITER_LOCKED bit is not set in the count word of @sem. */
static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem)
{
        long cnt = atomic_long_read(&sem->count);

        WARN_ON((cnt & RWSEM_WRITER_LOCKED) == 0);
}

/* Common initializer macros and functions */

#ifdef CONFIG_DEBUG_RWSEMS
# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname,
#else
# define __RWSEM_DEBUG_INIT(lockname)
#endif

#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED,
#else
#define __RWSEM_OPT_INIT(lockname)
#endif

#define __RWSEM_INITIALIZER(name)                                \
        { __RWSEM_COUNT_INIT(name),                                \
          .owner = ATOMIC_LONG_INIT(0),                                \
          __RWSEM_OPT_INIT(name)                                \
          .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
          .wait_list = LIST_HEAD_INIT((name).wait_list),        \
          __RWSEM_DEBUG_INIT(name)                                \
          __RWSEM_DEP_MAP_INIT(name) }

#define DECLARE_RWSEM(name) \
        struct rw_semaphore name = __RWSEM_INITIALIZER(name)

extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
                         struct lock_class_key *key);

#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/*
 * This is the same regardless of which rwsem implementation that is being used.
 * It is just a heuristic meant to be called by somebody already holding the
 * rwsem to see if somebody from an incompatible type is wanting access to the
 * lock.
 */
static inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        /* An empty wait list means nothing is queued on the lock. */
        if (list_empty(&sem->wait_list))
                return 0;

        return 1;
}

#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
/*
 * Return just the real task structure pointer of the owner
 */
extern struct task_struct *rwsem_owner(struct rw_semaphore *sem);

/*
 * Return true if the rwsem is owned by a reader.
 */
extern bool is_rwsem_reader_owned(struct rw_semaphore *sem);
#endif

#else /* !CONFIG_PREEMPT_RT */

#include <linux/rwbase_rt.h>

/*
 * PREEMPT_RT flavour of the rwsem: the lock state lives entirely in the
 * embedded rwbase_rt, which the rw_base_*() helpers used below operate on.
 */
struct rw_semaphore {
        struct rwbase_rt        rwbase;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Lockdep map; statically set up by __RWSEM_DEP_MAP_INIT(). */
        struct lockdep_map        dep_map;
#endif
};

#define __RWSEM_INITIALIZER(name)                                \
        {                                                        \
                .rwbase = __RWBASE_INITIALIZER(name),                \
                __RWSEM_DEP_MAP_INIT(name)                        \
        }

#define DECLARE_RWSEM(lockname) \
        struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)

extern void  __init_rwsem(struct rw_semaphore *rwsem, const char *name,
                          struct lock_class_key *key);

#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/* Report whether @sem is held, as determined by rw_base_is_locked(). */
static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem)
{
        const struct rwbase_rt *base = &sem->rwbase;

        return rw_base_is_locked(base);
}

/* Warn if rwsem_is_locked() reports that @sem is not held. */
static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem)
{
        int locked = rwsem_is_locked(sem);

        WARN_ON(!locked);
}

/* Warn if rw_base_is_write_locked() reports that @sem is not write-held. */
static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem)
{
        const struct rwbase_rt *base = &sem->rwbase;

        WARN_ON(!rw_base_is_write_locked(base));
}

/* Report contention on @sem, as determined by rw_base_is_contended(). */
static __always_inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        struct rwbase_rt *base = &sem->rwbase;

        return rw_base_is_contended(base);
}

#endif /* CONFIG_PREEMPT_RT */

/*
 * The functions below are the same for all rwsem implementations including
 * the RT specific variant.
 */

/*
 * Assert that @sem is held. Without CONFIG_LOCKDEP the check falls back to
 * the implementation-specific rwsem_assert_held_nolockdep().
 */
static inline void rwsem_assert_held(const struct rw_semaphore *sem)
{
        if (!IS_ENABLED(CONFIG_LOCKDEP)) {
                rwsem_assert_held_nolockdep(sem);
                return;
        }

        lockdep_assert_held(sem);
}

/*
 * Assert that @sem is held for writing. Without CONFIG_LOCKDEP the check
 * falls back to the implementation-specific
 * rwsem_assert_held_write_nolockdep().
 */
static inline void rwsem_assert_held_write(const struct rw_semaphore *sem)
{
        if (!IS_ENABLED(CONFIG_LOCKDEP)) {
                rwsem_assert_held_write_nolockdep(sem);
                return;
        }

        lockdep_assert_held_write(sem);
}

/*
 * lock for reading
 */
extern void down_read(struct rw_semaphore *sem);
extern int __must_check down_read_interruptible(struct rw_semaphore *sem);
extern int __must_check down_read_killable(struct rw_semaphore *sem);

/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
extern int down_read_trylock(struct rw_semaphore *sem);

/*
 * lock for writing
 */
extern void down_write(struct rw_semaphore *sem);
extern int __must_check down_write_killable(struct rw_semaphore *sem);

/*
 * trylock for writing -- returns 1 if successful, 0 if contention
 */
extern int down_write_trylock(struct rw_semaphore *sem);

/*
 * release a read lock
 */
extern void up_read(struct rw_semaphore *sem);

/*
 * release a write lock
 */
extern void up_write(struct rw_semaphore *sem);

DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T), _RET == 0)

DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
DEFINE_GUARD_COND(rwsem_write, _kill, down_write_killable(_T), _RET == 0)

/*
 * downgrade write lock to read lock
 */
extern void downgrade_write(struct rw_semaphore *sem);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * nested locking. NOTE: rwsems are not allowed to recurse
 * (which occurs if the same task tries to acquire the same
 * lock instance multiple times), but multiple locks of the
 * same lock class might be taken, if the order of the locks
 * is always the same. This ordering rule can be expressed
 * to lockdep via the _nested() APIs, but enumerating the
 * subclasses that are used. (If the nesting relationship is
 * static then another method for expressing nested locking is
 * the explicit definition of lock class keys and the use of
 * lockdep_set_class() at lock initialization time.
 * See Documentation/locking/lockdep-design.rst for more details.)
 */
extern void down_read_nested(struct rw_semaphore *sem, int subclass);
extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass);
extern void down_write_nested(struct rw_semaphore *sem, int subclass);
extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);

# define down_write_nest_lock(sem, nest_lock)                        \
do {                                                                \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map);        \
        _down_write_nest_lock(sem, &(nest_lock)->dep_map);        \
} while (0)

/*
 * Take/release a lock when not the owner will release it.
 *
 * [ This API should be avoided as much as possible - the
 *   proper abstraction for this case is completions. ]
 */
extern void down_read_non_owner(struct rw_semaphore *sem);
extern void up_read_non_owner(struct rw_semaphore *sem);
#else
# define down_read_nested(sem, subclass)                down_read(sem)
# define down_read_killable_nested(sem, subclass)        down_read_killable(sem)
# define down_write_nest_lock(sem, nest_lock)        down_write(sem)
# define down_write_nested(sem, subclass)        down_write(sem)
# define down_write_killable_nested(sem, subclass)        down_write_killable(sem)
# define down_read_non_owner(sem)                down_read(sem)
# define up_read_non_owner(sem)                        up_read(sem)
#endif

#endif /* _LINUX_RWSEM_H */

















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIME64_H
#define _LINUX_TIME64_H

#include <linux/math64.h>
#include <vdso/time64.h>

typedef __s64 time64_t;
typedef __u64 timeu64_t;

#include <uapi/linux/time.h>

/*
 * Time value split into whole seconds and nanoseconds. timespec64_valid()
 * accepts a value only when tv_sec is non-negative and tv_nsec is below
 * NSEC_PER_SEC.
 */
struct timespec64 {
        time64_t        tv_sec;                        /* seconds */
        long                tv_nsec;                /* nanoseconds */
};

/*
 * Pair of timespec64 values describing an interval timer.
 * NOTE(review): presumably it_interval is the reload period and it_value the
 * time until the next expiry, as in POSIX struct itimerspec - confirm.
 */
struct itimerspec64 {
        struct timespec64 it_interval;
        struct timespec64 it_value;
};

/* Parameters used to convert the timespec values: */
#define PSEC_PER_NSEC                        1000L

/* Located here for timespec[64]_valid_strict */
#define TIME64_MAX                        ((s64)~((u64)1 << 63))
#define TIME64_MIN                        (-TIME64_MAX - 1)

#define KTIME_MAX                        ((s64)~((u64)1 << 63))
#define KTIME_MIN                        (-KTIME_MAX - 1)
#define KTIME_SEC_MAX                        (KTIME_MAX / NSEC_PER_SEC)
#define KTIME_SEC_MIN                        (KTIME_MIN / NSEC_PER_SEC)

/*
 * Limits for settimeofday():
 *
 * To prevent setting the time close to the wraparound point, time setting
 * is limited so that a reasonable uptime can be accommodated. An uptime of
 * 30 years should be really sufficient, which means the cutoff is 2232. At
 * that point the cutoff is just a small part of the larger problem.
 */
#define TIME_UPTIME_SEC_MAX                (30LL * 365 * 24 *3600)
#define TIME_SETTOD_SEC_MAX                (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX)

/* Return 1 when @a and @b hold identical seconds and nanoseconds, else 0. */
static inline int timespec64_equal(const struct timespec64 *a,
                                   const struct timespec64 *b)
{
        if (a->tv_sec != b->tv_sec)
                return 0;

        return a->tv_nsec == b->tv_nsec;
}

/* Return true when both fields of @ts are zero. */
static inline bool timespec64_is_epoch(const struct timespec64 *ts)
{
        if (ts->tv_sec != 0)
                return false;

        return ts->tv_nsec == 0;
}

/*
 * Three-way comparison of two timespec64 values, seconds first and
 * nanoseconds second:
 *
 * lhs < rhs:  return <0
 * lhs == rhs: return 0
 * lhs > rhs:  return >0
 *
 * The nanoseconds are compared explicitly rather than subtracted: the
 * difference of two 'long' values does not necessarily fit into the 'int'
 * return type, so for non-normalized inputs a subtraction could be
 * truncated to 0 or to a value with the wrong sign.
 */
static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs)
{
        if (lhs->tv_sec != rhs->tv_sec)
                return lhs->tv_sec < rhs->tv_sec ? -1 : 1;
        if (lhs->tv_nsec != rhs->tv_nsec)
                return lhs->tv_nsec < rhs->tv_nsec ? -1 : 1;
        return 0;
}

extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);

static inline struct timespec64 timespec64_add(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec,
                                lhs.tv_nsec + rhs.tv_nsec);
        return ts_delta;
}

/*
 * sub = lhs - rhs, in normalized form
 */
static inline struct timespec64 timespec64_sub(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec,
                                lhs.tv_nsec - rhs.tv_nsec);
        return ts_delta;
}

/*
 * Returns true if the timespec64 is normalized and not before 1970,
 * false otherwise:
 */
static inline bool timespec64_valid(const struct timespec64 *ts)
{
        /*
         * Dates before 1970 are bogus, and there can't be more nanoseconds
         * than fit into a second. The unsigned cast makes a negative tv_nsec
         * fail the range check as well.
         */
        return ts->tv_sec >= 0 &&
               (unsigned long)ts->tv_nsec < NSEC_PER_SEC;
}

/*
 * Like timespec64_valid(), but additionally rejects second values that could
 * overflow ktime_t (tv_sec at or above KTIME_SEC_MAX).
 */
static inline bool timespec64_valid_strict(const struct timespec64 *ts)
{
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < KTIME_SEC_MAX;
}

/*
 * Like timespec64_valid(), but additionally rejects second values at or
 * above TIME_SETTOD_SEC_MAX, which cause overflow issues vs. CLOCK_REALTIME.
 */
static inline bool timespec64_valid_settod(const struct timespec64 *ts)
{
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < TIME_SETTOD_SEC_MAX;
}

/**
 * timespec64_to_ns - Convert timespec64 to nanoseconds
 * @ts:                pointer to the timespec64 variable to be converted
 *
 * Returns the scalar nanosecond representation of the timespec64
 * parameter.
 */
static inline s64 timespec64_to_ns(const struct timespec64 *ts)
{
        /* Prevent multiplication overflow / underflow */
        if (ts->tv_sec >= KTIME_SEC_MAX)
                return KTIME_MAX;

        if (ts->tv_sec <= KTIME_SEC_MIN)
                return KTIME_MIN;

        return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
}

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:        the nanoseconds value to be converted
 *
 * Returns the timespec64 representation of the nsec parameter.
 */
extern struct timespec64 ns_to_timespec64(s64 nsec);

/**
 * timespec64_add_ns - Adds nanoseconds to a timespec64
 * @a:                pointer to timespec64 to be incremented
 * @ns:                unsigned nanoseconds value to be added
 *
 * This must always be inlined because its used from the x86-64 vdso,
 * which cannot call other kernel functions.
 */
static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
{
        /* Total nanoseconds: the current sub-second part plus the addend. */
        u64 total = a->tv_nsec + ns;
        u64 rem;

        /* The quotient is whole seconds; the remainder is the new tv_nsec. */
        a->tv_sec += __iter_div_u64_rem(total, NSEC_PER_SEC, &rem);
        a->tv_nsec = rem;
}

/*
 * timespec64_add_safe assumes both values are positive and checks for
 * overflow. It will return TIME64_MAX in case of overflow.
 */
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                         const struct timespec64 rhs);

#endif /* _LINUX_TIME64_H */






























































































  313 





  313 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_COMPAT_H
#define _ASM_X86_COMPAT_H

/*
 * Architecture specific compatibility types
 */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <asm/processor.h>
#include <asm/user32.h>
#include <asm/unistd.h>

#define compat_mode_t        compat_mode_t
typedef u16                compat_mode_t;

#define __compat_uid_t        __compat_uid_t
typedef u16                __compat_uid_t;
typedef u16                __compat_gid_t;

#define compat_dev_t        compat_dev_t
typedef u16                compat_dev_t;

#define compat_ipc_pid_t compat_ipc_pid_t
typedef u16                 compat_ipc_pid_t;

#define compat_statfs        compat_statfs

#include <asm-generic/compat.h>

#define COMPAT_UTS_MACHINE        "i686\0\0"

typedef u16                compat_nlink_t;

/*
 * stat layout used for compat tasks.
 * NOTE(review): presumably this mirrors the i386 'struct stat' ABI, so field
 * order and widths must not be changed - confirm against the 32-bit headers.
 */
struct compat_stat {
        u32                st_dev;
        compat_ino_t        st_ino;
        compat_mode_t        st_mode;
        compat_nlink_t        st_nlink;
        __compat_uid_t        st_uid;
        __compat_gid_t        st_gid;
        u32                st_rdev;
        u32                st_size;
        u32                st_blksize;
        u32                st_blocks;
        /* Each timestamp is a seconds field followed by a nanoseconds field. */
        u32                st_atime;
        u32                st_atime_nsec;
        u32                st_mtime;
        u32                st_mtime_nsec;
        u32                st_ctime;
        u32                st_ctime_nsec;
        u32                __unused4;
        u32                __unused5;
};

/*
 * IA32 uses 4-byte alignment for 64-bit quantities, so we need to pack the
 * compat flock64 structure. Defining this macro requests that packing; the
 * structure itself is not defined in this header.
 */
#define __ARCH_NEED_COMPAT_FLOCK64_PACKED

/*
 * Architecture specific compat layout of the statfs structure (this header
 * announces it via "#define compat_statfs compat_statfs").
 *
 * All members are plain int except f_fsid, which is a compat_fsid_t, and
 * the four-element f_spare array at the end.
 */
struct compat_statfs {
        int                f_type;
        int                f_bsize;
        int                f_blocks;
        int                f_bfree;
        int                f_bavail;
        int                f_files;
        int                f_ffree;
        compat_fsid_t        f_fsid;
        int                f_namelen;        /* SunOS ignores this field. */
        int                f_frsize;
        int                f_flags;
        int                f_spare[4];
};

/*
 * Only defined when CONFIG_X86_X32_ABI is set. Evaluates to 1 when the
 * orig_ax saved in the current task's pt_regs has __X32_SYSCALL_BIT set,
 * and to 0 otherwise (the "!!" normalizes the masked value).
 */
#ifdef CONFIG_X86_X32_ABI
#define COMPAT_USE_64BIT_TIME \
        (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif

/*
 * Tell whether the current task is inside an x32 syscall.
 *
 * With CONFIG_X86_X32_ABI the answer is taken from __X32_SYSCALL_BIT in the
 * orig_ax saved in the current task's pt_regs. Without that option the
 * function always returns false.
 */
static inline bool in_x32_syscall(void)
{
        bool x32 = false;

#ifdef CONFIG_X86_X32_ABI
        x32 = !!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT);
#endif
        return x32;
}

/*
 * Tell whether the current syscall is either an ia32 or an x32 one.
 * in_x32_syscall() is only consulted when in_ia32_syscall() is false.
 */
static inline bool in_32bit_syscall(void)
{
        if (in_ia32_syscall())
                return true;

        return in_x32_syscall();
}

#ifdef CONFIG_COMPAT
/*
 * x86 implementation of in_compat_syscall(): simply forwards to
 * in_32bit_syscall(), i.e. true for ia32 and for x32 syscalls.
 */
static inline bool in_compat_syscall(void)
{
        return in_32bit_syscall();
}
#define in_compat_syscall in_compat_syscall        /* override the generic impl */
/* The alignment fixup predicate is in_ia32_syscall itself, so x32 is not included. */
#define compat_need_64bit_alignment_fixup in_ia32_syscall
#endif

/* Forward declaration; only a pointer to it is needed in this header. */
struct compat_siginfo;

/*
 * With CONFIG_X86_X32_ABI the architecture declares its own
 * copy_siginfo_to_user32() and announces it with the self-referential
 * #define below.
 * NOTE(review): presumably generic code checks that macro before providing
 * a default -- confirm.
 */
#ifdef CONFIG_X86_X32_ABI
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
                const kernel_siginfo_t *from);
#define copy_siginfo_to_user32 copy_siginfo_to_user32
#endif /* CONFIG_X86_X32_ABI */

#endif /* _ASM_X86_COMPAT_H */










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Linux ethernet bridge
 *
 *        Authors:
 *        Lennert Buytenhek                <buytenh@gnu.org>
 */

#ifndef _BR_PRIVATE_H
#define _BR_PRIVATE_H

#include <linux/netdevice.h>
#include <linux/if_bridge.h>
#include <linux/netpoll.h>
#include <linux/u64_stats_sync.h>
#include <net/route.h>
#include <net/ip6_fib.h>
#include <net/pkt_cls.h>
#include <linux/if_vlan.h>
#include <linux/rhashtable.h>
#include <linux/refcount.h>

/* Hash sizing: BR_HASH_SIZE is 2^BR_HASH_BITS, i.e. 256. */
#define BR_HASH_BITS 8
#define BR_HASH_SIZE (1 << BR_HASH_BITS)

/* Hold time expressed in jiffies (1 * HZ). */
#define BR_HOLD_TIME (1*HZ)

/* Port numbering: BR_MAX_PORTS is 2^BR_PORT_BITS, i.e. 1024. */
#define BR_PORT_BITS        10
#define BR_MAX_PORTS        (1<<BR_PORT_BITS)

/*
 * Multicast defaults and bounds. The query interval bounds are given in
 * milliseconds and converted to jiffies; the startup query interval reuses
 * the same minimum and maximum.
 */
#define BR_MULTICAST_DEFAULT_HASH_MAX 4096
#define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000)
#define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN
#define BR_MULTICAST_QUERY_INTVL_MAX msecs_to_jiffies(86400000) /* 24 hours */
#define BR_MULTICAST_STARTUP_QUERY_INTVL_MAX BR_MULTICAST_QUERY_INTVL_MAX

/*
 * Upper bound tied to the number of bits in a long.
 * NOTE(review): presumably because hardware domains are tracked in an
 * unsigned long bitmap -- confirm against the switchdev code.
 */
#define BR_HWDOM_MAX BITS_PER_LONG

#define BR_VERSION        "2.3"

/* Control of forwarding link local multicast */
#define BR_GROUPFWD_DEFAULT        0
/*
 * Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP.
 * Each protocol gets one bit; BR_GROUPFWD_RESTRICTED below is the OR of all
 * three (bits 0-2).
 */
enum {
        BR_GROUPFWD_STP                = BIT(0),
        BR_GROUPFWD_MACPAUSE        = BIT(1),
        BR_GROUPFWD_LACP        = BIT(2),
};

#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \
                                BR_GROUPFWD_LACP)
/*
 * The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F].
 * 0xB801 has bits 0, 11, 12, 13 and 15 set, one bit per listed last octet.
 */
#define BR_GROUPFWD_8021AD        0xB801u

/* Path to usermode spanning tree program */
#define BR_STP_PROG        "/sbin/bridge-stp"

/* Mask combining the two FDB notify bits, FDB_NOTIFY_BIT and FDB_NOTIFY_INACTIVE_BIT. */
#define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT)

/* Short aliases for the structures defined just below, plus a 16-bit port identifier. */
typedef struct bridge_id bridge_id;
typedef struct mac_addr mac_addr;
typedef __u16 port_id;

/* Bridge identifier: two priority bytes followed by an Ethernet address. */
struct bridge_id {
        unsigned char        prio[2];
        unsigned char        addr[ETH_ALEN];
};

/* Wrapper around a raw Ethernet address of ETH_ALEN bytes. */
struct mac_addr {
        unsigned char        addr[ETH_ALEN];
};

#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
/*
 * our own querier
 *
 * Holds one timer plus a counter.
 * NOTE(review): startup_sent presumably counts startup queries already
 * sent -- confirm against the multicast code.
 */
struct bridge_mcast_own_query {
        struct timer_list        timer;
        u32                        startup_sent;
};

/*
 * other querier
 *
 * Two timers are kept for it: the main one and a separate delay timer.
 */
struct bridge_mcast_other_query {
        struct timer_list                timer;
        struct timer_list                delay_timer;
};

/*
 * selected querier
 *
 * Records the querier's address and a port interface index, together with a
 * spinlock-associated sequence counter.
 * NOTE(review): seq presumably lets readers fetch addr/port_ifidx
 * consistently without taking the lock -- confirm against the users.
 */
struct bridge_mcast_querier {
        struct br_ip addr;
        int port_ifidx;
        seqcount_spinlock_t seq;
};

/* IGMP/MLD statistics: the counters paired with a u64_stats_sync object. */
struct bridge_mcast_stats {
        struct br_mcast_stats mstats;
        struct u64_stats_sync syncp;
};

/* One source entry of an MDB configuration (see br_mdb_config.src_entries): just an address. */
struct br_mdb_src_entry {
        struct br_ip                        addr;
};

/*
 * Bundle of parameters describing one MDB configuration request: the bridge
 * and port involved, the entry and group address, and an array of
 * num_src_entries source entries.
 * NOTE(review): the meaning of src_entry, filter_mode, nlflags and
 * rt_protocol is inferred from their names only -- confirm against the MDB
 * netlink code before relying on it.
 */
struct br_mdb_config {
        struct net_bridge                *br;
        struct net_bridge_port                *p;
        struct br_mdb_entry                *entry;
        struct br_ip                        group;
        bool                                src_entry;
        u8                                filter_mode;
        u16                                nlflags;
        struct br_mdb_src_entry                *src_entries;
        int                                num_src_entries;
        u8                                rt_protocol;
};
#endif

/*
 * net_bridge_mcast_port must be always defined due to forwarding stubs
 *
 * Without CONFIG_BRIDGE_IGMP_SNOOPING the structure has no members. With it,
 * the context points back to its port and VLAN and carries, per IP family,
 * an own-query state, a multicast router timer and a router list node; the
 * IPv6 set exists only when CONFIG_IPV6 is enabled.
 */
struct net_bridge_mcast_port {
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
        struct net_bridge_port                *port;
        struct net_bridge_vlan                *vlan;

        struct bridge_mcast_own_query        ip4_own_query;
        struct timer_list                ip4_mc_router_timer;
        struct hlist_node                ip4_rlist;
#if IS_ENABLED(CONFIG_IPV6)
        struct bridge_mcast_own_query        ip6_own_query;
        struct timer_list                ip6_mc_router_timer;
        struct hlist_node                ip6_rlist;
#endif /* IS_ENABLED(CONFIG_IPV6) */
        unsigned char                        multicast_router;
        /* NOTE(review): presumably current and maximum MDB entry counts -- confirm. */
        u32                                mdb_n_entries;
        u32                                mdb_max_entries;
#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
};

/*
 * net_bridge_mcast must be always defined due to forwarding stubs
 *
 * Without CONFIG_BRIDGE_IGMP_SNOOPING the structure has no members. With it,
 * the context points back to its bridge and VLAN and holds the multicast
 * tunables (counts, versions, intervals) followed by per-family state:
 * router list head, router timer, other/own query state and the selected
 * querier. The IPv6 members exist only when CONFIG_IPV6 is enabled.
 */
struct net_bridge_mcast {
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
        struct net_bridge                *br;
        struct net_bridge_vlan                *vlan;

        u32                                multicast_last_member_count;
        u32                                multicast_startup_query_count;

        u8                                multicast_querier;
        u8                                multicast_igmp_version;
        u8                                multicast_router;
#if IS_ENABLED(CONFIG_IPV6)
        u8                                multicast_mld_version;
#endif
        /* NOTE(review): intervals are presumably stored in jiffies -- confirm. */
        unsigned long                        multicast_last_member_interval;
        unsigned long                        multicast_membership_interval;
        unsigned long                        multicast_querier_interval;
        unsigned long                        multicast_query_interval;
        unsigned long                        multicast_query_response_interval;
        unsigned long                        multicast_startup_query_interval;
        struct hlist_head                ip4_mc_router_list;
        struct timer_list                ip4_mc_router_timer;
        struct bridge_mcast_other_query        ip4_other_query;
        struct bridge_mcast_own_query        ip4_own_query;
        struct bridge_mcast_querier        ip4_querier;
#if IS_ENABLED(CONFIG_IPV6)
        struct hlist_head                ip6_mc_router_list;
        struct timer_list                ip6_mc_router_timer;
        struct bridge_mcast_other_query        ip6_other_query;
        struct bridge_mcast_own_query        ip6_own_query;
        struct bridge_mcast_querier        ip6_querier;
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
};

/*
 * Tunnel information attached to a VLAN entry (net_bridge_vlan.tinfo): a
 * big-endian 64-bit tunnel id and an RCU-annotated pointer to the metadata
 * destination.
 */
struct br_tunnel_info {
        __be64                                tunnel_id;
        struct metadata_dst __rcu        *tunnel_dst;
};

/*
 * private vlan flags
 *
 * One bit per flag, bits 0-4.
 * NOTE(review): presumably stored in net_bridge_vlan.priv_flags, which is
 * documented as the private (in-kernel) flags word -- confirm.
 */
enum {
        BR_VLFLAG_PER_PORT_STATS = BIT(0),
        BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1),
        BR_VLFLAG_MCAST_ENABLED = BIT(2),
        BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3),
        BR_VLFLAG_NEIGH_SUPPRESS_ENABLED = BIT(4),
};

/**
 * struct net_bridge_vlan - per-vlan entry
 *
 * @vnode: rhashtable member
 * @tnode: rhashtable member
 * @vid: VLAN id
 * @flags: bridge vlan flags
 * @priv_flags: private (in-kernel) bridge vlan flags
 * @state: STP state (e.g. blocking, learning, forwarding)
 * @stats: per-cpu VLAN statistics
 * @br: if MASTER flag set, this points to a bridge struct
 * @port: if MASTER flag unset, this points to a port struct
 * @refcnt: if MASTER flag set, this is bumped for each port referencing it
 * @brvlan: if MASTER flag unset, this points to the global per-VLAN context
 *          for this VLAN entry
 * @tinfo: bridge tunnel info
 * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context
 * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast
 *                  context
 * @msti: if MASTER flag set, this holds the VLAN's MST instance
 * @vlist: sorted list of VLAN entries
 * @rcu: used for entry destruction
 *
 * This structure is shared between the global per-VLAN entries contained in
 * the bridge rhashtable and the local per-port per-VLAN entries contained in
 * the port's rhashtable. The union entries should be interpreted depending on
 * the entry flags that are set.
 */
struct net_bridge_vlan {
        struct rhash_head                vnode;
        struct rhash_head                tnode;
        u16                                vid;
        u16                                flags;
        u16                                priv_flags;
        u8                                state;
        struct pcpu_sw_netstats __percpu *stats;
        /* Owner: bridge for MASTER entries, port otherwise. */
        union {
                struct net_bridge        *br;
                struct net_bridge_port        *port;
        };
        /* Reference count for MASTER entries, link to the global entry otherwise. */
        union {
                refcount_t                refcnt;
                struct net_bridge_vlan        *brvlan;
        };

        struct br_tunnel_info                tinfo;

        /* Multicast context: global for MASTER entries, per-port/vlan otherwise. */
        union {
                struct net_bridge_mcast                br_mcast_ctx;
                struct net_bridge_mcast_port        port_mcast_ctx;
        };

        u16                                msti;

        struct list_head                vlist;

        struct rcu_head                        rcu;
};

/**
 * struct net_bridge_vlan_group - set of VLAN entries of a bridge or a port
 *
 * @vlan_hash: VLAN entry rhashtable
 * @tunnel_hash: second rhashtable kept next to @vlan_hash (presumably for
 *               tunnel lookups of VLAN entries - confirm against the tunnel
 *               code)
 * @vlan_list: sorted VLAN entry list
 * @num_vlans: number of total VLAN entries
 * @pvid: PVID VLAN id
 * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking)
 *
 * IMPORTANT: Be careful when checking if there are VLAN entries using list
 *            primitives because the bridge can have entries in its list which
 *            are just for global context but not for filtering, i.e. they have
 *            the master flag set but not the brentry flag. If you have to check
 *            if there are "real" entries in the bridge please test @num_vlans
 */
struct net_bridge_vlan_group {
        struct rhashtable                vlan_hash;
        struct rhashtable                tunnel_hash;
        struct list_head                vlan_list;
        u16                                num_vlans;
        u16                                pvid;
        u8                                pvid_state;
};

/*
 * bridge fdb flags
 *
 * Plain sequential enumerators starting at 0, not BIT() masks.
 * NOTE(review): presumably used as bit numbers in
 * net_bridge_fdb_entry.flags (an unsigned long) -- confirm against the fdb
 * code.
 */
enum {
        BR_FDB_LOCAL,
        BR_FDB_STATIC,
        BR_FDB_STICKY,
        BR_FDB_ADDED_BY_USER,
        BR_FDB_ADDED_BY_EXT_LEARN,
        BR_FDB_OFFLOADED,
        BR_FDB_NOTIFY,
        BR_FDB_NOTIFY_INACTIVE,
        BR_FDB_LOCKED,
        BR_FDB_DYNAMIC_LEARNED,
};

/* Key embedded in each FDB entry: a MAC address plus a VLAN id. */
struct net_bridge_fdb_key {
        mac_addr addr;
        u16 vlan_id;
};

/*
 * One forwarding database entry: hashed via rhnode, identified by key and
 * pointing at the destination port. The updated/used members are placed on
 * their own cache line on SMP builds so that writing them does not disturb
 * the members read during lookups.
 */
struct net_bridge_fdb_entry {
        struct rhash_head                rhnode;
        struct net_bridge_port                *dst;

        struct net_bridge_fdb_key        key;
        struct hlist_node                fdb_node;
        unsigned long                        flags;

        /* write-heavy members should not affect lookups */
        unsigned long                        updated ____cacheline_aligned_in_smp;
        unsigned long                        used;

        struct rcu_head                        rcu;
};

/*
 * Describes a set of FDB entries to flush: a flags value with its mask, a
 * port interface index and a VLAN id.
 * NOTE(review): presumably an entry matches when (entry flags & flags_mask)
 * equals flags -- confirm against the flush implementation.
 */
struct net_bridge_fdb_flush_desc {
        unsigned long                        flags;
        unsigned long                        flags_mask;
        int                                port_ifindex;
        u16                                vlan_id;
};

/*
 * Port group flag bits, bits 0-5.
 * NOTE(review): presumably for net_bridge_port_group.flags -- confirm.
 */
#define MDB_PG_FLAGS_PERMANENT                BIT(0)
#define MDB_PG_FLAGS_OFFLOAD                BIT(1)
#define MDB_PG_FLAGS_FAST_LEAVE                BIT(2)
#define MDB_PG_FLAGS_STAR_EXCL                BIT(3)
#define MDB_PG_FLAGS_BLOCKED                BIT(4)
#define MDB_PG_FLAGS_OFFLOAD_FAILED        BIT(5)

/*
 * Limit of 32 source entries.
 * NOTE(review): presumably per port group -- confirm.
 */
#define PG_SRC_ENT_LIMIT        32

/*
 * Group source flag bits, bits 0-3.
 * NOTE(review): presumably for net_bridge_group_src.flags -- confirm.
 */
#define BR_SGRP_F_DELETE        BIT(0)
#define BR_SGRP_F_SEND                BIT(1)
#define BR_SGRP_F_INSTALLED        BIT(2)
#define BR_SGRP_F_USER_ADDED        BIT(3)

/*
 * Garbage collection hook embedded in multicast objects: a list node plus a
 * destroy callback that receives a pointer to this embedded structure.
 */
struct net_bridge_mcast_gc {
        struct hlist_node                gc_node;
        void                                (*destroy)(struct net_bridge_mcast_gc *gc);
};

/*
 * One source address belonging to a port group (pg). Carries its own flags,
 * a retransmit counter and a timer, a back pointer to the bridge, and the
 * gc/rcu members.
 * NOTE(review): gc/rcu are presumably used for deferred destruction --
 * confirm.
 */
struct net_bridge_group_src {
        struct hlist_node                node;

        struct br_ip                        addr;
        struct net_bridge_port_group        *pg;
        u8                                flags;
        u8                                src_query_rexmit_cnt;
        struct timer_list                timer;

        struct net_bridge                *br;
        struct net_bridge_mcast_gc        mcast_gc;
        struct rcu_head                        rcu;
};

/* Key of a port group (net_bridge_port_group.key): the port plus a group address. */
struct net_bridge_port_group_sg_key {
        struct net_bridge_port                *port;
        struct br_ip                        addr;
};

/*
 * Per-port membership of a multicast group. Entries are chained through the
 * RCU-annotated next pointer and additionally hashed via rhnode. Each entry
 * owns a list of sources (src_list, counted by src_ents), two timers and
 * two rb-tree roots (eht_set_tree, eht_host_tree).
 */
struct net_bridge_port_group {
        struct net_bridge_port_group __rcu *next;
        struct net_bridge_port_group_sg_key key;
        /* 2-byte alignment is requested explicitly for the Ethernet address. */
        unsigned char                        eth_addr[ETH_ALEN] __aligned(2);
        unsigned char                        flags;
        unsigned char                        filter_mode;
        unsigned char                        grp_query_rexmit_cnt;
        unsigned char                        rt_protocol;

        struct hlist_head                src_list;
        unsigned int                        src_ents;
        struct timer_list                timer;
        struct timer_list                rexmit_timer;
        struct hlist_node                mglist;
        struct rb_root                        eht_set_tree;
        struct rb_root                        eht_host_tree;

        struct rhash_head                rhnode;
        struct net_bridge_mcast_gc        mcast_gc;
        struct rcu_head                        rcu;
};

/*
 * Multicast database entry for one group address: hashed via rhnode, with
 * an RCU-annotated pointer to the first port group (ports), a host_joined
 * flag, a timer, and the gc/rcu members.
 */
struct net_bridge_mdb_entry {
        struct rhash_head                rhnode;
        struct net_bridge                *br;
        struct net_bridge_port_group __rcu *ports;
        struct br_ip                        addr;
        bool                                host_joined;

        struct timer_list                timer;
        struct hlist_node                mdb_node;

        struct net_bridge_mcast_gc        mcast_gc;
        struct rcu_head                        rcu;
};

/*
 * Per-port state of a bridge: the owning bridge and the underlying net
 * device, STP state and timers, the embedded kobject (see kobj_to_brport()),
 * the always-present multicast context and a number of members that depend
 * on the kernel configuration.
 */
struct net_bridge_port {
        struct net_bridge                *br;
        struct net_device                *dev;
        netdevice_tracker                dev_tracker;
        struct list_head                list;

        /* Port flag word; br_auto_port() and br_promisc_port() test bits of it. */
        unsigned long                        flags;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
        struct net_bridge_vlan_group        __rcu *vlgrp;
#endif
        struct net_bridge_port                __rcu *backup_port;
        u32                                backup_nhid;

        /* STP */
        u8                                priority;
        u8                                state;
        u16                                port_no;
        unsigned char                        topology_change_ack;
        unsigned char                        config_pending;
        port_id                                port_id;
        port_id                                designated_port;
        bridge_id                        designated_root;
        bridge_id                        designated_bridge;
        u32                                path_cost;
        u32                                designated_cost;
        unsigned long                        designated_age;

        struct timer_list                forward_delay_timer;
        struct timer_list                hold_timer;
        struct timer_list                message_age_timer;
        struct kobject                        kobj;
        struct rcu_head                        rcu;

        /* Defined even without IGMP snooping (the struct is then empty). */
        struct net_bridge_mcast_port        multicast_ctx;

#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
        struct bridge_mcast_stats        __percpu *mcast_stats;

        u32                                multicast_eht_hosts_limit;
        u32                                multicast_eht_hosts_cnt;
        struct hlist_head                mglist;
#endif

#ifdef CONFIG_SYSFS
        char                                sysfs_name[IFNAMSIZ];
#endif

#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll                        *np;
#endif
#ifdef CONFIG_NET_SWITCHDEV
        /* Identifier used to group ports that share the same switchdev
         * hardware domain.
         */
        int                                hwdom;
        int                                offload_count;
        struct netdev_phys_item_id        ppid;
#endif
        u16                                group_fwd_mask;
        u16                                backup_redirected_cnt;

        struct bridge_stp_xstats        stp_xstats;
};

/* Map a pointer to the embedded kobj member back to its net_bridge_port. */
#define kobj_to_brport(obj)        container_of(obj, struct net_bridge_port, kobj)

/*
 * Flag tests on a port: both yield the masked flag bits (non-zero when
 * set), not a normalized 0/1.
 */
#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
#define br_promisc_port(p) ((p)->flags & BR_PROMISC)

/*
 * Return the bridge port stored in @dev->rx_handler_data, read with
 * rcu_dereference(). No check is made here that @dev actually is a bridge
 * port.
 */
static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev)
{
        struct net_bridge_port *port = rcu_dereference(dev->rx_handler_data);

        return port;
}

/*
 * Return the bridge port stored in @dev->rx_handler_data, read with
 * rtnl_dereference(), or NULL when netif_is_bridge_port() reports that
 * @dev is not a bridge port.
 */
static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *dev)
{
        if (!netif_is_bridge_port(dev))
                return NULL;

        return rtnl_dereference(dev->rx_handler_data);
}

/*
 * Return the bridge port stored in @dev->rx_handler_data, read with
 * rcu_dereference_rtnl(), or NULL when netif_is_bridge_port() reports that
 * @dev is not a bridge port.
 */
static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_device *dev)
{
        if (!netif_is_bridge_port(dev))
                return NULL;

        return rcu_dereference_rtnl(dev->rx_handler_data);
}

/*
 * Bridge boolean options. Plain sequential enumerators starting at 0, not
 * BIT() masks.
 * NOTE(review): presumably used as bit numbers in net_bridge.options (an
 * unsigned long) -- confirm against the option accessors.
 */
enum net_bridge_opts {
        BROPT_VLAN_ENABLED,
        BROPT_VLAN_STATS_ENABLED,
        BROPT_NF_CALL_IPTABLES,
        BROPT_NF_CALL_IP6TABLES,
        BROPT_NF_CALL_ARPTABLES,
        BROPT_GROUP_ADDR_SET,
        BROPT_MULTICAST_ENABLED,
        BROPT_MULTICAST_QUERY_USE_IFADDR,
        BROPT_MULTICAST_STATS_ENABLED,
        BROPT_HAS_IPV6_ADDR,
        BROPT_NEIGH_SUPPRESS_ENABLED,
        BROPT_MTU_SET_BY_USER,
        BROPT_VLAN_STATS_PER_PORT,
        BROPT_NO_LL_LEARN,
        BROPT_VLAN_BRIDGE_BINDING,
        BROPT_MCAST_VLAN_SNOOPING_ENABLED,
        BROPT_MST_ENABLED,
        BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION,
        BROPT_FDB_LOCAL_VLAN_0,
};

/* Per-bridge state.
 *
 * @options is a bitmap indexed by enum net_bridge_opts and is read through
 * br_opt_get(). @dev is the bridge's own net_device; its name is what the
 * br_printk() family prefixes log messages with. Several groups of members
 * only exist when the matching kernel config option is enabled, so the
 * layout differs between configurations.
 */
struct net_bridge {
        spinlock_t                        lock;
        spinlock_t                        hash_lock;
        struct hlist_head                frame_type_list;
        struct net_device                *dev;
        unsigned long                        options;
        /* These fields are accessed on each packet */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
        __be16                                vlan_proto;
        u16                                default_pvid;
        struct net_bridge_vlan_group        __rcu *vlgrp;
#endif

        struct rhashtable                fdb_hash_tbl;
        struct list_head                port_list;
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        union {
                struct rtable                fake_rtable;
                struct rt6_info                fake_rt6_info;
        };
        u32                                metrics[RTAX_MAX];
#endif
        u16                                group_fwd_mask;
        u16                                group_fwd_mask_required;

        /* STP */
        bridge_id                        designated_root;
        bridge_id                        bridge_id;
        unsigned char                        topology_change;
        unsigned char                        topology_change_detected;
        u16                                root_port;
        unsigned long                        max_age;
        unsigned long                        hello_time;
        unsigned long                        forward_delay;
        unsigned long                        ageing_time;
        unsigned long                        bridge_max_age;
        unsigned long                        bridge_hello_time;
        unsigned long                        bridge_forward_delay;
        unsigned long                        bridge_ageing_time;
        u32                                root_path_cost;

        u8                                group_addr[ETH_ALEN];

        enum {
                BR_NO_STP,                 /* no spanning tree */
                BR_KERNEL_STP,                /* old STP in kernel */
                BR_USER_STP,                /* new RSTP in userspace */
        } stp_enabled;

        /* Bridge-wide multicast context (always present, unlike the
         * snooping-only members below).
         */
        struct net_bridge_mcast                multicast_ctx;

#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
        struct bridge_mcast_stats        __percpu *mcast_stats;

        u32                                hash_max;

        /* Lock named by mlock_dereference() for RCU-protected multicast
         * pointers.
         */
        spinlock_t                        multicast_lock;

        struct rhashtable                mdb_hash_tbl;
        struct rhashtable                sg_port_tbl;

        struct hlist_head                mcast_gc_list;
        struct hlist_head                mdb_list;

        struct work_struct                mcast_gc_work;
#endif

        struct timer_list                hello_timer;
        struct timer_list                tcn_timer;
        struct timer_list                topology_change_timer;
        struct delayed_work                gc_work;
        struct kobject                        *ifobj;
        u32                                auto_cnt;

        atomic_t                        fdb_n_learned;
        u32                                fdb_max_learned;

#ifdef CONFIG_NET_SWITCHDEV
        /* Counter used to make sure that hardware domains get unique
         * identifiers in case a bridge spans multiple switchdev instances.
         */
        int                                last_hwdom;
        /* Bit mask of hardware domain numbers in use */
        unsigned long                        busy_hwdoms;
#endif
        struct hlist_head                fdb_list;

#if IS_ENABLED(CONFIG_BRIDGE_MRP)
        struct hlist_head                mrp_list;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_CFM)
        struct hlist_head                mep_list;
#endif
};

/* Bridge-private per-packet state. It is not allocated separately: the
 * BR_INPUT_SKB_CB() macro reinterprets the skb's cb[] area as this struct.
 * Which members exist depends on the kernel configuration.
 */
struct br_input_skb_cb {
        struct net_device *brdev;

        u16 frag_max_size;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
        u8 igmp;
        u8 mrouters_only:1;
#endif
        u8 proxyarp_replied:1;
        u8 src_port_isolated:1;
        u8 promisc:1;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
        u8 vlan_filtered:1;
#endif
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
        u8 br_netfilter_broute:1;
#endif

#ifdef CONFIG_NET_SWITCHDEV
        /* Set if TX data plane offloading is used towards at least one
         * hardware domain.
         */
        u8 tx_fwd_offload:1;
        /* The switchdev hardware domain from which this packet was received.
         * If skb->offload_fwd_mark was set, then this packet was already
         * forwarded by hardware to the other ports in the source hardware
         * domain, otherwise it wasn't.
         */
        int src_hwdom;
        /* Bit mask of hardware domains towards which this packet has already
         * been transmitted using the TX data plane offload.
         */
        unsigned long fwd_hwdoms;
#endif

        u32 backup_nhid;
};

/* View the control buffer (cb) of @__skb as a struct br_input_skb_cb. */
#define BR_INPUT_SKB_CB(__skb)        ((struct br_input_skb_cb *)(__skb)->cb)

/* Accessor for the mrouters_only bit. The bit only exists with
 * CONFIG_BRIDGE_IGMP_SNOOPING; otherwise the macro is the constant 0 and
 * cannot be assigned to.
 */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb)        (BR_INPUT_SKB_CB(__skb)->mrouters_only)
#else
# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb)        (0)
#endif

/* printk() wrapper that prefixes the message with the bridge device name
 * ("<name>: "). @format must be a string literal because it is pasted onto
 * the prefix at compile time.
 */
#define br_printk(level, br, format, args...)        \
        printk(level "%s: " format, (br)->dev->name, ##args)

/* Fixed-log-level shorthands for br_printk(). */
#define br_err(__br, format, args...)                        \
        br_printk(KERN_ERR, __br, format, ##args)
#define br_warn(__br, format, args...)                        \
        br_printk(KERN_WARNING, __br, format, ##args)
#define br_notice(__br, format, args...)                \
        br_printk(KERN_NOTICE, __br, format, ##args)
#define br_info(__br, format, args...)                        \
        br_printk(KERN_INFO, __br, format, ##args)

/* Same bridge-name prefix, but routed through pr_debug() instead of printk(). */
#define br_debug(br, format, args...)                        \
        pr_debug("%s: " format,  (br)->dev->name, ##args)

/* called under bridge lock
 *
 * Returns 1 when the first 8 bytes of br->bridge_id and br->designated_root
 * compare equal, 0 otherwise.
 */
static inline int br_is_root_bridge(const struct net_bridge *br)
{
        return memcmp(&br->bridge_id, &br->designated_root, 8) == 0;
}

/* A VLAN entry is global when BRIDGE_VLAN_INFO_MASTER is set in its flags. */
static inline bool br_vlan_is_master(const struct net_bridge_vlan *v)
{
        return (v->flags & BRIDGE_VLAN_INFO_MASTER) != 0;
}

/* A VLAN entry is used by the bridge itself when BRIDGE_VLAN_INFO_BRENTRY
 * is set in its flags.
 */
static inline bool br_vlan_is_brentry(const struct net_bridge_vlan *v)
{
        return (v->flags & BRIDGE_VLAN_INFO_BRENTRY) != 0;
}

/* Decide whether a vlan entry should be used; returns false if it is only
 * context. Non-master entries are always usable, master entries only when
 * they are also bridge entries.
 */
static inline bool br_vlan_should_use(const struct net_bridge_vlan *v)
{
        if (!br_vlan_is_master(v))
                return true;

        return br_vlan_is_brentry(v);
}

/* True when the port state is BR_STATE_LEARNING or BR_STATE_FORWARDING. */
static inline bool nbp_state_should_learn(const struct net_bridge_port *p)
{
        switch (p->state) {
        case BR_STATE_LEARNING:
        case BR_STATE_FORWARDING:
                return true;
        default:
                return false;
        }
}

/* Validate a VLAN id: it must be greater than 0 and less than VLAN_VID_MASK.
 * On failure an error message is recorded in @extack and false is returned.
 */
static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack)
{
        if (vid > 0 && vid < VLAN_VID_MASK)
                return true;

        NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid");
        return false;
}

/* Validate one element of a vlan range.
 *
 * @cur:    entry being processed
 * @last:   entry that opened the range, or NULL when no range is open
 * @extack: receives a message describing the first violation found
 *
 * Returns true when @cur is acceptable, false otherwise.
 */
static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur,
                                       const struct bridge_vlan_info *last,
                                       struct netlink_ext_ack *extack)
{
        /* a range element may never carry the pvid flag */
        if (cur->flags & BRIDGE_VLAN_INFO_PVID) {
                NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range");
                return false;
        }

        /* with a range already open, cur has to close it: it must not start
         * another range, must carry the end flag, and its id must be
         * strictly greater than the id that opened the range
         */
        if (last) {
                if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
                        NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one");
                        return false;
                }

                if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) {
                        NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing");
                        return false;
                }

                if (cur->vid <= last->vid) {
                        NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id");
                        return false;
                }
        }

        /* at least one of the range flags has to be present */
        if (cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN |
                          BRIDGE_VLAN_INFO_RANGE_END))
                return true;

        NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing");
        return false;
}

/* Return the multicast_router setting of a vlan entry: taken from the bridge
 * multicast context for master entries and from the port multicast context
 * otherwise. Without CONFIG_BRIDGE_IGMP_SNOOPING the result is always
 * MDB_RTR_TYPE_DISABLED.
 */
static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v)
{
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
        return br_vlan_is_master(v) ? v->br_mcast_ctx.multicast_router :
                                      v->port_mcast_ctx.multicast_router;
#else
        return MDB_RTR_TYPE_DISABLED;
#endif
}

/* Translate a link command into the matching vlan message type:
 * RTM_SETLINK -> RTM_NEWVLAN, RTM_DELLINK -> RTM_DELVLAN, anything else -> 0.
 */
static inline int br_afspec_cmd_to_rtm(int cmd)
{
        if (cmd == RTM_SETLINK)
                return RTM_NEWVLAN;

        if (cmd == RTM_DELLINK)
                return RTM_DELVLAN;

        return 0;
}

/* Test a single bridge option.
 *
 * @br:  bridge whose options bitmap is inspected
 * @opt: option to test; used as the bit index into br->options
 *
 * Returns the result of test_bit() on that bit (non-zero when set).
 */
static inline int br_opt_get(const struct net_bridge *br,
                             enum net_bridge_opts opt)
{
        return test_bit(opt, &br->options);
}

/* Bridge option helpers declared here and implemented outside this header.
 * The br_boolopt_* functions take enum br_boolopt_id / struct
 * br_boolopt_multi, while br_opt_toggle() takes the internal
 * enum net_bridge_opts bit number.
 */
int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
                      struct netlink_ext_ack *extack);
int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt);
int br_boolopt_multi_toggle(struct net_bridge *br,
                            struct br_boolopt_multi *bm,
                            struct netlink_ext_ack *extack);
void br_boolopt_multi_get(const struct net_bridge *br,
                          struct br_boolopt_multi *bm);
void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/* Record the L2 miss state of @skb in its TC skb extension.
 *
 * Does nothing when tc_skb_ext_tc_enabled() is false. An existing extension
 * is updated with @miss. When there is no extension, one is allocated only
 * if @miss is true; an allocation failure is silently ignored.
 */
static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss)
{
        struct tc_skb_ext *ext;

        if (!tc_skb_ext_tc_enabled())
                return;

        ext = skb_ext_find(skb, TC_SKB_EXT);
        if (!ext) {
                /* no extension and nothing to flag: do not allocate one */
                if (!miss)
                        return;

                ext = tc_skb_ext_alloc(skb);
                if (!ext)
                        return;
        }

        ext->l2_miss = miss;
}
#else
/* No TC skb extension support: nothing to record. */
static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss)
{
}
#endif

/* br_device.c
 *
 * Entry points for the bridge net_device itself; only declared here.
 */
void br_dev_setup(struct net_device *dev);
void br_dev_delete(struct net_device *dev, struct list_head *list);
netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev);
#ifdef CONFIG_NET_POLL_CONTROLLER
/* Hand @skb to netpoll_send_skb() using the port's netpoll instance (p->np). */
static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
                                       struct sk_buff *skb)
{
        netpoll_send_skb(p->np, skb);
}

int br_netpoll_enable(struct net_bridge_port *p);
void br_netpoll_disable(struct net_bridge_port *p);
#else
/* Without CONFIG_NET_POLL_CONTROLLER the netpoll helpers below are empty
 * stubs; br_netpoll_enable() always reports success (0).
 */
static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
                                       struct sk_buff *skb)
{
}

static inline int br_netpoll_enable(struct net_bridge_port *p)
{
        return 0;
}

static inline void br_netpoll_disable(struct net_bridge_port *p)
{
}
#endif

/* br_fdb.c
 *
 * Forwarding database (FDB) interface; the functions are only declared here.
 */

/* Masks over neighbour (ndm) flag and state bits for FDB flush requests.
 * NOTE(review): the ignored/allowed meaning is inferred from the macro
 * names; confirm against the flush implementation.
 */
#define FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF)
#define FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP)
#define FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_USE | NTF_EXT_LEARNED | \
                                     NTF_STICKY | NTF_OFFLOADED)

int br_fdb_init(void);
void br_fdb_fini(void);
int br_fdb_hash_init(struct net_bridge *br);
void br_fdb_hash_fini(struct net_bridge *br);
void br_fdb_flush(struct net_bridge *br,
                  const struct net_bridge_fdb_flush_desc *desc);
void br_fdb_find_delete_local(struct net_bridge *br,
                              const struct net_bridge_port *p,
                              const unsigned char *addr, u16 vid);
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
void br_fdb_cleanup(struct work_struct *work);
int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on,
                               struct netlink_ext_ack *extack);
void br_fdb_delete_by_port(struct net_bridge *br,
                           const struct net_bridge_port *p, u16 vid, int do_all);
struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
                                             const unsigned char *addr,
                                             __u16 vid);
int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
                   unsigned long off);
int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source,
                     const unsigned char *addr, u16 vid);
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
                   const unsigned char *addr, u16 vid, unsigned long flags);

/* Netlink-facing FDB operations (add/delete/dump/get). */
int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
                  struct net_device *dev, const unsigned char *addr, u16 vid,
                  bool *notified, struct netlink_ext_ack *extack);
int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
                       struct netlink_ext_ack *extack);
int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
               const unsigned char *addr, u16 vid, u16 nlh_flags,
               bool *notified, struct netlink_ext_ack *extack);
int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                struct net_device *dev, struct net_device *fdev, int *idx);
int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev,
               const unsigned char *addr, u16 vid, u32 portid, u32 seq,
               struct netlink_ext_ack *extack);
int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
                              const unsigned char *addr, u16 vid,
                              bool locked, bool swdev_notify);
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
                              const unsigned char *addr, u16 vid,
                              bool swdev_notify);
void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
                          const unsigned char *addr, u16 vid, bool offloaded);

/* br_forward.c */

/* Packet class passed to br_flood() as its pkt_type argument. */
enum br_pkt_type {
        BR_PKT_UNICAST,
        BR_PKT_MULTICAST,
        BR_PKT_BROADCAST
};
/* Forwarding entry points; only declared here. */
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_forward(const struct net_bridge_port *to, struct sk_buff *skb,
                bool local_rcv, bool local_orig);
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_flood(struct net_bridge *br, struct sk_buff *skb,
              enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
              u16 vid);

/* Return true only when the packet came in on an isolated port (recorded in
 * the skb control block) and the destination port @to has BR_ISOLATED set.
 */
static inline bool br_skb_isolated(const struct net_bridge_port *to,
                                   const struct sk_buff *skb)
{
        if (!BR_INPUT_SKB_CB(skb)->src_port_isolated)
                return false;

        return (to->flags & BR_ISOLATED) != 0;
}

/* br_if.c
 *
 * Bridge and bridge-port management entry points; only declared here.
 */
void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
int br_add_bridge(struct net *net, const char *name);
int br_del_bridge(struct net *net, const char *name);
int br_add_if(struct net_bridge *br, struct net_device *dev,
              struct netlink_ext_ack *extack);
int br_del_if(struct net_bridge *br, struct net_device *dev);
void br_mtu_auto_adjust(struct net_bridge *br);
netdev_features_t br_features_recompute(struct net_bridge *br,
                                        netdev_features_t features);
void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
void br_manage_promisc(struct net_bridge *br);
int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);

/* br_input.c */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
/* Returns the rx handler the bridge uses for @dev; the br_rx_handler_check_*()
 * helpers compare dev->rx_handler against it.
 */
rx_handler_func_t *br_get_rx_handler(const struct net_device *dev);

/* A per-protocol frame handler that can be linked into a bridge.
 *
 * @type:          protocol value, stored in network byte order (__be16)
 * @frame_handler: callback taking the ingress port and the skb
 * @list:          hlist linkage; struct net_bridge has a matching
 *                 frame_type_list head
 */
struct br_frame_type {
        __be16                        type;
        int                        (*frame_handler)(struct net_bridge_port *port,
                                                 struct sk_buff *skb);
        struct hlist_node        list;
};

/* Register / unregister a frame handler on @br. */
void br_add_frame(struct net_bridge *br, struct br_frame_type *ft);
void br_del_frame(struct net_bridge *br, struct br_frame_type *ft);

static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
{
        return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev);
}

static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
{
        return rcu_dereference_rtnl(dev->rx_handler) == br_get_rx_handler(dev);
}

/* Return the bridge port of @dev, or NULL when the device's rx handler is
 * not the bridge one (checked with br_rx_handler_check_rcu()).
 */
static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
{
        if (!br_rx_handler_check_rcu(dev))
                return NULL;

        return br_port_get_rcu(dev);
}

/* Return the bridge port of @dev via br_port_get_rtnl_rcu(), or NULL when
 * the device's rx handler is not the bridge one (checked with
 * br_rx_handler_check_rtnl()).
 */
static inline struct net_bridge_port *
br_port_get_check_rtnl(const struct net_device *dev)
{
        if (!br_rx_handler_check_rtnl(dev))
                return NULL;

        return br_port_get_rtnl_rcu(dev);
}

/* br_ioctl.c
 *
 * ioctl entry points; both receive a userspace pointer (__user) that must
 * not be dereferenced directly.
 */
int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq,
                          void __user *data, int cmd);
int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg);

/* br_multicast.c
 *
 * Everything declared from here on is only visible when
 * CONFIG_BRIDGE_IGMP_SNOOPING is enabled.
 */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
int br_multicast_rcv(struct net_bridge_mcast **brmctx,
                     struct net_bridge_mcast_port **pmctx,
                     struct net_bridge_vlan *vlan,
                     struct sk_buff *skb, u16 vid);
struct net_bridge_mdb_entry *
br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb,
                     u16 vid);
int br_multicast_add_port(struct net_bridge_port *port);
void br_multicast_del_port(struct net_bridge_port *port);
void br_multicast_enable_port(struct net_bridge_port *port);
void br_multicast_disable_port(struct net_bridge_port *port);
void br_multicast_init(struct net_bridge *br);
void br_multicast_join_snoopers(struct net_bridge *br);
void br_multicast_leave_snoopers(struct net_bridge *br);
void br_multicast_open(struct net_bridge *br);
void br_multicast_stop(struct net_bridge *br);
void br_multicast_dev_del(struct net_bridge *br);
void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb,
                        struct net_bridge_mcast *brmctx,
                        bool local_rcv, bool local_orig);
int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val);
int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx,
                                 unsigned long val);
int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router);
int br_multicast_toggle(struct net_bridge *br, unsigned long val,
                        struct netlink_ext_ack *extack);
int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val);
int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx,
                                  unsigned long val);
/* The MLD version setter is only declared when IPv6 is enabled. */
#if IS_ENABLED(CONFIG_IPV6)
int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx,
                                 unsigned long val);
#endif
struct net_bridge_mdb_entry *
br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst);
struct net_bridge_mdb_entry *
br_multicast_new_group(struct net_bridge *br, struct br_ip *group);
struct net_bridge_port_group *
br_multicast_new_port_group(struct net_bridge_port *port,
                            const struct br_ip *group,
                            struct net_bridge_port_group __rcu *next,
                            unsigned char flags, const unsigned char *src,
                            u8 filter_mode, u8 rt_protocol,
                            struct netlink_ext_ack *extack);
void br_multicast_del_port_group(struct net_bridge_port_group *p);
int br_mdb_hash_init(struct net_bridge *br);
void br_mdb_hash_fini(struct net_bridge *br);
void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
                   struct net_bridge_port_group *pg, int type);
void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
                               struct net_bridge_port_group *pg);
void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx,
                   int type);
void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
                         struct net_bridge_port_group *pg,
                         struct net_bridge_port_group __rcu **pp);
void br_multicast_count(struct net_bridge *br,
                        const struct net_bridge_port *p,
                        const struct sk_buff *skb, u8 type, u8 dir);
int br_multicast_init_stats(struct net_bridge *br);
void br_multicast_uninit_stats(struct net_bridge *br);
void br_multicast_get_stats(const struct net_bridge *br,
                            const struct net_bridge_port *p,
                            struct br_mcast_stats *dest);
u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx);
void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max);
u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx);
/* Netlink-facing MDB operations (add/delete/dump/get). */
int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
               struct netlink_ext_ack *extack);
int br_mdb_del(struct net_device *dev, struct nlattr *tb[],
               struct netlink_ext_ack *extack);
int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[],
                    struct netlink_ext_ack *extack);
int br_mdb_dump(struct net_device *dev, struct sk_buff *skb,
                struct netlink_callback *cb);
int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq,
               struct netlink_ext_ack *extack);
void br_multicast_host_join(const struct net_bridge_mcast *brmctx,
                            struct net_bridge_mdb_entry *mp, bool notify);
void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify);
void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg,
                                     u8 filter_mode);
void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp,
                                       struct net_bridge_port_group *sg);
struct net_bridge_group_src *
br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip);
struct net_bridge_group_src *
br_multicast_new_group_src(struct net_bridge_port_group *pg,
                           struct br_ip *src_ip);
void __br_multicast_del_group_src(struct net_bridge_group_src *src);
void br_multicast_del_group_src(struct net_bridge_group_src *src,
                                bool fastleave);
/* Set-up and tear-down of bridge-level and port-level multicast contexts. */
void br_multicast_ctx_init(struct net_bridge *br,
                           struct net_bridge_vlan *vlan,
                           struct net_bridge_mcast *brmctx);
void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx);
void br_multicast_port_ctx_init(struct net_bridge_port *port,
                                struct net_bridge_vlan *vlan,
                                struct net_bridge_mcast_port *pmctx);
void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx);
void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state);
void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on);
int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
                                      struct netlink_ext_ack *extack);
bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on);

int br_rports_fill_info(struct sk_buff *skb,
                        const struct net_bridge_mcast *brmctx);
int br_multicast_dump_querier_state(struct sk_buff *skb,
                                    const struct net_bridge_mcast *brmctx,
                                    int nest_attr);
size_t br_multicast_querier_state_size(void);
size_t br_rports_size(const struct net_bridge_mcast *brmctx);
void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx,
                                  unsigned long val);
void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx,
                                          unsigned long val);

/* A group address is treated as layer-2 when its proto field is zero. */
static inline bool br_group_is_l2(const struct br_ip *group)
{
        return !group->proto;
}

/* Dereference the RCU-protected pointer @X on the update side, naming
 * @br->multicast_lock as the lock whose ownership justifies the access
 * (passed to rcu_dereference_protected() through lockdep_is_held()).
 * @br is parenthesized so that any pointer-valued expression may be passed.
 */
#define mlock_dereference(X, br) \
        rcu_dereference_protected(X, lockdep_is_held(&(br)->multicast_lock))

/* Return the first node of the multicast router list that matches the
 * packet: the IPv6 list for ETH_P_IPV6 packets (when IPv6 is enabled),
 * the IPv4 list for everything else. The head is read with
 * rcu_dereference().
 */
static inline struct hlist_node *
br_multicast_get_first_rport_node(struct net_bridge_mcast *brmctx,
                                  struct sk_buff *skb)
{
        struct hlist_head *routers = &brmctx->ip4_mc_router_list;

#if IS_ENABLED(CONFIG_IPV6)
        if (skb->protocol == htons(ETH_P_IPV6))
                routers = &brmctx->ip6_mc_router_list;
#endif
        return rcu_dereference(hlist_first_rcu(routers));
}

/* Convert a router-list node back into its bridge port.
 *
 * @rp is interpreted as the ip6_rlist member for ETH_P_IPV6 packets (when
 * IPv6 is enabled) and as the ip4_rlist member otherwise. Returns NULL when
 * @rp is NULL.
 */
static inline struct net_bridge_port *
br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb)
{
        struct net_bridge_mcast_port *pmctx;

#if IS_ENABLED(CONFIG_IPV6)
        if (skb->protocol == htons(ETH_P_IPV6)) {
                pmctx = hlist_entry_safe(rp, struct net_bridge_mcast_port,
                                         ip6_rlist);
                return pmctx ? pmctx->port : NULL;
        }
#endif
        pmctx = hlist_entry_safe(rp, struct net_bridge_mcast_port,
                                 ip4_rlist);

        return pmctx ? pmctx->port : NULL;
}

/* True while the context's IPv4 multicast router timer is pending. */
static inline bool br_ip4_multicast_is_router(struct net_bridge_mcast *brmctx)
{
        return !!timer_pending(&brmctx->ip4_mc_router_timer);
}

static inline bool br_ip6_multicast_is_router(struct net_bridge_mcast *brmctx)
{
#if IS_ENABLED(CONFIG_IPV6)
        return timer_pending(&brmctx->ip6_mc_router_timer);
#else
        return false;
#endif
}

/* Tell whether the multicast context currently acts as a multicast router.
 *
 * MDB_RTR_TYPE_PERM: always true.
 * MDB_RTR_TYPE_TEMP_QUERY: depends on the router timers - with an @skb only
 * the timer of the packet's protocol family is consulted (false for
 * anything that is neither IPv4 nor IPv6); with a NULL @skb either timer
 * being pending is enough.
 * Any other setting: false.
 */
static inline bool
br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb)
{
        switch (brmctx->multicast_router) {
        case MDB_RTR_TYPE_PERM:
                return true;
        case MDB_RTR_TYPE_TEMP_QUERY:
                break;
        default:
                return false;
        }

        if (!skb)
                return br_ip4_multicast_is_router(brmctx) ||
                       br_ip6_multicast_is_router(brmctx);

        if (skb->protocol == htons(ETH_P_IP))
                return br_ip4_multicast_is_router(brmctx);

        if (skb->protocol == htons(ETH_P_IPV6))
                return br_ip6_multicast_is_router(brmctx);

        return false;
}

/* Decide whether a querier is considered present for one address family.
 *
 * The bridge's own querier counts when brmctx->multicast_querier is set and,
 * for IPv6, the bridge also has BROPT_HAS_IPV6_ADDR. The result is true when
 * @querier's delay timer is not pending and either the own querier counts
 * or @querier's timer is pending.
 */
static inline bool
__br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
                              struct bridge_mcast_other_query *querier,
                              const bool is_ipv6)
{
        bool own_querier_enabled =
                brmctx->multicast_querier &&
                (!is_ipv6 || br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR));

        if (timer_pending(&querier->delay_timer))
                return false;

        return own_querier_enabled || timer_pending(&querier->timer);
}

static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
                                               struct ethhdr *eth,
                                               const struct net_bridge_mdb_entry *mdb)
{
        switch (eth->h_proto) {
        case (htons(ETH_P_IP)):
                return __br_multicast_querier_exists(brmctx,
                        &brmctx->ip4_other_query, false);
#if IS_ENABLED(CONFIG_IPV6)
        case (htons(ETH_P_IPV6)):
                return __br_multicast_querier_exists(brmctx,
                        &brmctx->ip6_other_query, true);
#endif
        default:
                return !!mdb && br_group_is_l2(&mdb->addr);
        }
}

/* True when @ip has no specific source: an IPv4 source for which
 * ipv4_is_zeronet() holds, or (when IPv6 is enabled) an IPv6 source for
 * which ipv6_addr_any() holds. Other protocols always yield false.
 */
static inline bool br_multicast_is_star_g(const struct br_ip *ip)
{
        if (ip->proto == htons(ETH_P_IP))
                return ipv4_is_zeronet(ip->src.ip4);
#if IS_ENABLED(CONFIG_IPV6)
        if (ip->proto == htons(ETH_P_IPV6))
                return ipv6_addr_any(&ip->src.ip6);
#endif
        return false;
}

/* True when the configured protocol version for @proto is IGMP version 3
 * (IPv4) or, with IPv6 enabled, MLD version 2 (IPv6). Any other protocol
 * yields false.
 */
static inline bool
br_multicast_should_handle_mode(const struct net_bridge_mcast *brmctx,
                                __be16 proto)
{
        if (proto == htons(ETH_P_IP))
                return brmctx->multicast_igmp_version == 3;
#if IS_ENABLED(CONFIG_IPV6)
        if (proto == htons(ETH_P_IPV6))
                return brmctx->multicast_mld_version == 2;
#endif
        return false;
}

static inline int br_multicast_igmp_type(const struct sk_buff *skb)
{
        return BR_INPUT_SKB_CB(skb)->igmp;
}

/* Product of the context's last member count and last member interval. */
static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brmctx)
{
        return brmctx->multicast_last_member_count *
               brmctx->multicast_last_member_interval;
}

/* Return the context's configured multicast membership interval. */
static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx)
{
        return brmctx->multicast_membership_interval;
}

/* A bridge multicast context is a per-vlan one when its vlan pointer is set. */
static inline bool
br_multicast_ctx_is_vlan(const struct net_bridge_mcast *brmctx)
{
        return brmctx->vlan != NULL;
}

/* A port multicast context is a per-vlan one when its vlan pointer is set. */
static inline bool
br_multicast_port_ctx_is_vlan(const struct net_bridge_mcast_port *pmctx)
{
        return pmctx->vlan != NULL;
}

/* Return the bridge-level multicast context that corresponds to a port
 * context: the bridge vlan's context for per-vlan port contexts, otherwise
 * the context embedded in the port's bridge.
 */
static inline struct net_bridge_mcast *
br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx)
{
        if (br_multicast_port_ctx_is_vlan(pmctx))
                return &pmctx->vlan->brvlan->br_mcast_ctx;

        return &pmctx->port->br->multicast_ctx;
}

/* Only meaningful for per-vlan contexts (false otherwise): true when vlan
 * snooping is off on the bridge (BROPT_MCAST_VLAN_SNOOPING_ENABLED clear)
 * or the vlan lacks BR_VLFLAG_GLOBAL_MCAST_ENABLED.
 */
static inline bool
br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx)
{
        if (!br_multicast_ctx_is_vlan(brmctx))
                return false;

        if (!br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
                return true;

        return !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED);
}

/* True for a per-vlan bridge context whose vlan does not have
 * BR_VLFLAG_MCAST_ENABLED set; false for non-vlan contexts.
 */
static inline bool
br_multicast_ctx_vlan_disabled(const struct net_bridge_mcast *brmctx)
{
        if (!br_multicast_ctx_is_vlan(brmctx))
                return false;

        return !(brmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED);
}

/* True for a per-vlan port context whose vlan does not have
 * BR_VLFLAG_MCAST_ENABLED set; false for non-vlan contexts.
 */
static inline bool
br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx)
{
        if (!br_multicast_port_ctx_is_vlan(pmctx))
                return false;

        return !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED);
}

/* A port context counts as disabled when the port itself is in
 * BR_STATE_DISABLED, or - for per-vlan contexts - when multicast is
 * disabled on the vlan or the vlan is in BR_STATE_DISABLED.
 */
static inline bool
br_multicast_port_ctx_state_disabled(const struct net_bridge_mcast_port *pmctx)
{
        if (pmctx->port->state == BR_STATE_DISABLED)
                return true;

        if (!br_multicast_port_ctx_is_vlan(pmctx))
                return false;

        return br_multicast_port_ctx_vlan_disabled(pmctx) ||
               pmctx->vlan->state == BR_STATE_DISABLED;
}

/* A port context counts as stopped when it is disabled (see
 * br_multicast_port_ctx_state_disabled()), when the port is in
 * BR_STATE_BLOCKING, or - for per-vlan contexts - when the vlan is in
 * BR_STATE_BLOCKING.
 */
static inline bool
br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx)
{
        if (br_multicast_port_ctx_state_disabled(pmctx))
                return true;

        if (pmctx->port->state == BR_STATE_BLOCKING)
                return true;

        if (!br_multicast_port_ctx_is_vlan(pmctx))
                return false;

        return pmctx->vlan->state == BR_STATE_BLOCKING;
}

/* True when the context's IPv4 router list or (with IPv6 enabled) its IPv6
 * router list is non-empty.
 */
static inline bool
br_rports_have_mc_router(const struct net_bridge_mcast *brmctx)
{
        if (!hlist_empty(&brmctx->ip4_mc_router_list))
                return true;
#if IS_ENABLED(CONFIG_IPV6)
        if (!hlist_empty(&brmctx->ip6_mc_router_list))
                return true;
#endif
        return false;
}

static inline bool
br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
                               const struct net_bridge_mcast *brmctx2)
{
        return brmctx1->multicast_igmp_version ==
               brmctx2->multicast_igmp_version &&
               brmctx1->multicast_last_member_count ==
               brmctx2->multicast_last_member_count &&
               brmctx1->multicast_startup_query_count ==
               brmctx2->multicast_startup_query_count &&
               brmctx1->multicast_last_member_interval ==
               brmctx2->multicast_last_member_interval &&
               brmctx1->multicast_membership_interval ==
               brmctx2->multicast_membership_interval &&
               brmctx1->multicast_querier_interval ==
               brmctx2->multicast_querier_interval &&
               brmctx1->multicast_query_interval ==
               brmctx2->multicast_query_interval &&
               brmctx1->multicast_query_response_interval ==
               brmctx2->multicast_query_response_interval &&
               brmctx1->multicast_startup_query_interval ==
               brmctx2->multicast_startup_query_interval &&
               brmctx1->multicast_querier == brmctx2->multicast_querier &&
               brmctx1->multicast_router == brmctx2->multicast_router &&
               !br_rports_have_mc_router(brmctx1) &&
               !br_rports_have_mc_router(brmctx2) &&
#if IS_ENABLED(CONFIG_IPV6)
               brmctx1->multicast_mld_version ==
               brmctx2->multicast_mld_version &&
#endif
               true;
}

/* True when the kind of context agrees with the bridge's vlan snooping
 * option: a vlan context while BROPT_MCAST_VLAN_SNOOPING_ENABLED is set, or
 * a non-vlan context while it is clear.
 */
static inline bool
br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx)
{
        const bool snooping_on =
                br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) != 0;
        const bool is_vlan_ctx = br_multicast_ctx_is_vlan(brmctx);

        return snooping_on == is_vlan_ctx;
}

/* Record the offload result in @p->flags: both offload bits are cleared
 * first, then exactly one of MDB_PG_FLAGS_OFFLOAD (when @offloaded) or
 * MDB_PG_FLAGS_OFFLOAD_FAILED (otherwise) is set.
 */
static inline void
br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p,
                                  bool offloaded)
{
        p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED);

        if (offloaded)
                p->flags |= MDB_PG_FLAGS_OFFLOAD;
        else
                p->flags |= MDB_PG_FLAGS_OFFLOAD_FAILED;
}

/* True when BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION is set on @br and
 * @changed_flags contains MDB_PG_FLAGS_OFFLOAD_FAILED.
 */
static inline bool
br_mdb_should_notify(const struct net_bridge *br, u8 changed_flags)
{
        if (!br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION))
                return false;

        return (changed_flags & MDB_PG_FLAGS_OFFLOAD_FAILED) != 0;
}
#else
/* #else-branch stub: performs no multicast processing; always returns 0. */
static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx,
                                   struct net_bridge_mcast_port **pmctx,
                                   struct net_bridge_vlan *vlan,
                                   struct sk_buff *skb,
                                   u16 vid)
{
        return 0;
}

/* #else-branch stub: never finds an mdb entry; always returns NULL. */
static inline struct net_bridge_mdb_entry *
br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb,
                     u16 vid)
{
        return NULL;
}

/* #else-branch stub: nothing to set up for @port; always returns 0. */
static inline int br_multicast_add_port(struct net_bridge_port *port)
{
        return 0;
}

/* #else-branch stub: no-op. */
static inline void br_multicast_del_port(struct net_bridge_port *port)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_enable_port(struct net_bridge_port *port)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_disable_port(struct net_bridge_port *port)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_init(struct net_bridge *br)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_join_snoopers(struct net_bridge *br)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_leave_snoopers(struct net_bridge *br)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_open(struct net_bridge *br)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_stop(struct net_bridge *br)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_dev_del(struct net_bridge *br)
{
}

/* #else-branch stub: no-op; @skb is neither forwarded nor freed here. */
static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
                                      struct sk_buff *skb,
                                      struct net_bridge_mcast *brmctx,
                                      bool local_rcv, bool local_orig)
{
}

/* #else-branch stub: always returns false. */
static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx,
                                          struct sk_buff *skb)
{
        return false;
}

/* #else-branch stub: always returns false. */
static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
                                               struct ethhdr *eth,
                                               const struct net_bridge_mdb_entry *mdb)
{
        return false;
}

/* #else-branch stub: mdb entries cannot be added; returns -EOPNOTSUPP. */
static inline int br_mdb_add(struct net_device *dev, struct nlattr *tb[],
                             u16 nlmsg_flags, struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* #else-branch stub: mdb entries cannot be deleted; returns -EOPNOTSUPP. */
static inline int br_mdb_del(struct net_device *dev, struct nlattr *tb[],
                             struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* #else-branch stub: bulk mdb deletion is unavailable; returns -EOPNOTSUPP. */
static inline int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[],
                                  struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* #else-branch stub: writes nothing to @skb; always returns 0. */
static inline int br_mdb_dump(struct net_device *dev, struct sk_buff *skb,
                              struct netlink_callback *cb)
{
        return 0;
}

/* #else-branch stub: mdb lookups are unavailable; returns -EOPNOTSUPP. */
static inline int br_mdb_get(struct net_device *dev, struct nlattr *tb[],
                             u32 portid, u32 seq,
                             struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* #else-branch stub: nothing to initialise; always returns 0. */
static inline int br_mdb_hash_init(struct net_bridge *br)
{
        return 0;
}

/* #else-branch stub: no-op. */
static inline void br_mdb_hash_fini(struct net_bridge *br)
{
}

/* #else-branch stub: no statistics are recorded; no-op. */
static inline void br_multicast_count(struct net_bridge *br,
                                      const struct net_bridge_port *p,
                                      const struct sk_buff *skb,
                                      u8 type, u8 dir)
{
}

/* #else-branch stub: nothing to allocate; always returns 0. */
static inline int br_multicast_init_stats(struct net_bridge *br)
{
        return 0;
}

/* #else-branch stub: no-op. */
static inline void br_multicast_uninit_stats(struct net_bridge *br)
{
}

/* #else-branch stub: @skb is not inspected; always returns 0. */
static inline int br_multicast_igmp_type(const struct sk_buff *skb)
{
        return 0;
}

/* #else-branch stub: leaves @brmctx untouched; no-op. */
static inline void br_multicast_ctx_init(struct net_bridge *br,
                                         struct net_bridge_vlan *vlan,
                                         struct net_bridge_mcast *brmctx)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx)
{
}

/* #else-branch stub: leaves @pmctx untouched; no-op. */
static inline void br_multicast_port_ctx_init(struct net_bridge_port *port,
                                              struct net_bridge_vlan *vlan,
                                              struct net_bridge_mcast_port *pmctx)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx)
{
}

/* #else-branch stub: the vlan state change is ignored; no-op. */
static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v,
                                                      u8 state)
{
}

/* #else-branch stub: no-op. */
static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan,
                                                bool on)
{
}

/* #else-branch stub: vlan snooping cannot be toggled; returns -EOPNOTSUPP
 * without touching @extack.
 */
static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br,
                                                    bool on,
                                                    struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* #else-branch stub: changes nothing; always returns false. */
static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan,
                                                   bool on)
{
        return false;
}

/* #else-branch stub: no options are compared, so any two contexts are
 * reported as equal (always returns true).
 */
static inline bool
br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
                               const struct net_bridge_mcast *brmctx2)
{
        return true;
}
#endif

/* br_vlan.c */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
/* Declarations available with CONFIG_BRIDGE_VLAN_FILTERING (section
 * br_vlan.c). The #else branch further down supplies inline stubs for a
 * subset of these.
 */
bool br_allowed_ingress(const struct net_bridge *br,
                        struct net_bridge_vlan_group *vg, struct sk_buff *skb,
                        u16 *vid, u8 *state,
                        struct net_bridge_vlan **vlan);
bool br_allowed_egress(struct net_bridge_vlan_group *vg,
                       const struct sk_buff *skb);
bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
struct sk_buff *br_handle_vlan(struct net_bridge *br,
                               const struct net_bridge_port *port,
                               struct net_bridge_vlan_group *vg,
                               struct sk_buff *skb);
/* bridge-level vlan management */
int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
                bool *changed, struct netlink_ext_ack *extack);
int br_vlan_delete(struct net_bridge *br, u16 vid);
void br_vlan_flush(struct net_bridge *br);
struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid);
void br_recalculate_fwd_mask(struct net_bridge *br);
int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val,
                          struct netlink_ext_ack *extack);
int __br_vlan_set_proto(struct net_bridge *br, __be16 proto,
                        struct netlink_ext_ack *extack);
int br_vlan_set_proto(struct net_bridge *br, unsigned long val,
                      struct netlink_ext_ack *extack);
int br_vlan_set_stats(struct net_bridge *br, unsigned long val);
int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val);
int br_vlan_init(struct net_bridge *br);
int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val,
                             struct netlink_ext_ack *extack);
int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
                               struct netlink_ext_ack *extack);
/* per-port vlan management */
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
                 bool *changed, struct netlink_ext_ack *extack);
int nbp_vlan_delete(struct net_bridge_port *port, u16 vid);
void nbp_vlan_flush(struct net_bridge_port *port);
int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack);
int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask);
void br_vlan_get_stats(const struct net_bridge_vlan *v,
                       struct pcpu_sw_netstats *stats);
/* event handlers and rtnl registration */
void br_vlan_port_event(struct net_bridge_port *p, unsigned long event);
int br_vlan_bridge_event(struct net_device *dev, unsigned long event,
                         void *ptr);
void br_vlan_vlan_upper_event(struct net_device *br_dev,
                              struct net_device *vlan_dev,
                              unsigned long event);
int br_vlan_rtnl_init(void);
void br_vlan_rtnl_uninit(void);
void br_vlan_notify(const struct net_bridge *br,
                    const struct net_bridge_port *p,
                    u16 vid, u16 vid_range,
                    int cmd);
bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
                             const struct net_bridge_vlan *range_end);

/* forward-path helpers */
void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
                                    struct net_device_path_ctx *ctx,
                                    struct net_device_path *path);
int br_vlan_fill_forward_path_mode(struct net_bridge *br,
                                   struct net_bridge_port *dst,
                                   struct net_device_path *path);

/* Fetch the bridge's vlan group via rtnl_dereference(), i.e. for callers
 * that hold the RTNL lock.
 */
static inline struct net_bridge_vlan_group *br_vlan_group(
                                        const struct net_bridge *br)
{
        return rtnl_dereference(br->vlgrp);
}

/* Fetch the port's vlan group via rtnl_dereference(), i.e. for callers
 * that hold the RTNL lock.
 */
static inline struct net_bridge_vlan_group *nbp_vlan_group(
                                        const struct net_bridge_port *p)
{
        return rtnl_dereference(p->vlgrp);
}

/* Fetch the bridge's vlan group via rcu_dereference(), i.e. for callers
 * inside an RCU read-side critical section.
 */
static inline struct net_bridge_vlan_group *br_vlan_group_rcu(
                                        const struct net_bridge *br)
{
        return rcu_dereference(br->vlgrp);
}

/* Fetch the port's vlan group via rcu_dereference(), i.e. for callers
 * inside an RCU read-side critical section.
 */
static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
                                        const struct net_bridge_port *p)
{
        return rcu_dereference(p->vlgrp);
}

/* Since the bridge now depends on the 8021Q module, by the time the bridge
 * sees the skb the vlan tag will always be present if the frame was tagged.
 *
 * Stores the vlan id of @skb in @vid and returns 0 when a tag is present;
 * otherwise stores 0 in @vid and returns -EINVAL.
 */
static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
{
        if (!skb_vlan_tag_present(skb)) {
                *vid = 0;
                return -EINVAL;
        }

        *vid = skb_vlan_tag_get_id(skb);
        return 0;
}

/* Return the pvid stored in @vg, or 0 when @vg is NULL. */
static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
{
        u16 pvid = 0;

        if (vg) {
                /* read barrier ahead of the pvid load; presumably pairs with
                 * a write barrier on the update side - confirm in br_vlan.c
                 */
                smp_rmb();
                pvid = vg->pvid;
        }

        return pvid;
}

/* Return @v's flags, with BRIDGE_VLAN_INFO_PVID added when @v's vid equals
 * @pvid.
 */
static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
{
        u16 flags = v->flags;

        if (v->vid == pvid)
                flags |= BRIDGE_VLAN_INFO_PVID;

        return flags;
}
#else
/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: every frame is allowed in.
 * Sets *@vlan to NULL and returns true; @vid and @state are not written.
 */
static inline bool br_allowed_ingress(const struct net_bridge *br,
                                      struct net_bridge_vlan_group *vg,
                                      struct sk_buff *skb,
                                      u16 *vid, u8 *state,
                                      struct net_bridge_vlan **vlan)
{
        *vlan = NULL;

        return true;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: egress is always allowed. */
static inline bool br_allowed_egress(struct net_bridge_vlan_group *vg,
                                     const struct sk_buff *skb)
{
        return true;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: always returns true; @vid is not
 * written.
 */
static inline bool br_should_learn(struct net_bridge_port *p,
                                   struct sk_buff *skb, u16 *vid)
{
        return true;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: returns @skb unchanged. */
static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
                                             const struct net_bridge_port *port,
                                             struct net_bridge_vlan_group *vg,
                                             struct sk_buff *skb)
{
        return skb;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: reports no change through
 * @changed and returns -EOPNOTSUPP.
 */
static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
                              bool *changed, struct netlink_ext_ack *extack)
{
        *changed = false;
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: returns -EOPNOTSUPP. */
static inline int br_vlan_delete(struct net_bridge *br, u16 vid)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no-op. */
static inline void br_vlan_flush(struct net_bridge *br)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no-op. */
static inline void br_recalculate_fwd_mask(struct net_bridge *br)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: nothing to set up; returns 0. */
static inline int br_vlan_init(struct net_bridge *br)
{
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: reports no change through
 * @changed and returns -EOPNOTSUPP.
 */
static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
                               bool *changed, struct netlink_ext_ack *extack)
{
        *changed = false;
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: returns -EOPNOTSUPP. */
static inline int nbp_vlan_delete(struct net_bridge_port *port, u16 vid)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no-op. */
static inline void nbp_vlan_flush(struct net_bridge_port *port)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no vlan is ever found (NULL). */
static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg,
                                                   u16 vid)
{
        return NULL;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: nothing to set up; returns 0. */
static inline int nbp_vlan_init(struct net_bridge_port *port,
                                struct netlink_ext_ack *extack)
{
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no vlan tag is extracted.
 *
 * Returns int, matching the CONFIG_BRIDGE_VLAN_FILTERING variant (which
 * may return -EINVAL), and stores 0 in @tag so that a caller never reads an
 * uninitialised id after this function reports success (0).
 */
static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *tag)
{
        *tag = 0;
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: there is no pvid; returns 0. */
static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
{
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: returns -EOPNOTSUPP without
 * touching @extack.
 */
static inline int br_vlan_filter_toggle(struct net_bridge *br,
                                        unsigned long val,
                                        struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: there are no vlan infos (0). */
static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p,
                                         u32 filter_mask)
{
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: leaves @ctx and @path untouched. */
static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
                                                  struct net_device_path_ctx *ctx,
                                                  struct net_device_path *path)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: leaves @path untouched and
 * returns 0.
 */
static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br,
                                                 struct net_bridge_port *dst,
                                                 struct net_device_path *path)
{
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: there is no vlan group (NULL). */
static inline struct net_bridge_vlan_group *br_vlan_group(
                                        const struct net_bridge *br)
{
        return NULL;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: there is no vlan group (NULL). */
static inline struct net_bridge_vlan_group *nbp_vlan_group(
                                        const struct net_bridge_port *p)
{
        return NULL;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: there is no vlan group (NULL). */
static inline struct net_bridge_vlan_group *br_vlan_group_rcu(
                                        const struct net_bridge *br)
{
        return NULL;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: there is no vlan group (NULL). */
static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
                                        const struct net_bridge_port *p)
{
        return NULL;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: @stats is left unmodified. */
static inline void br_vlan_get_stats(const struct net_bridge_vlan *v,
                                     struct pcpu_sw_netstats *stats)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: the event is ignored. */
static inline void br_vlan_port_event(struct net_bridge_port *p,
                                      unsigned long event)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: the event is ignored; returns 0. */
static inline int br_vlan_bridge_event(struct net_device *dev,
                                       unsigned long event, void *ptr)
{
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: the event is ignored. */
static inline void br_vlan_vlan_upper_event(struct net_device *br_dev,
                                            struct net_device *vlan_dev,
                                            unsigned long event)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: nothing to register; returns 0. */
static inline int br_vlan_rtnl_init(void)
{
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no-op. */
static inline void br_vlan_rtnl_uninit(void)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no notification is sent. */
static inline void br_vlan_notify(const struct net_bridge *br,
                                  const struct net_bridge_port *p,
                                  u16 vid, u16 vid_range,
                                  int cmd)
{
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: always returns true. */
static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
                                           const struct net_bridge_vlan *range_end)
{
        return true;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no flags are reported (0). */
static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
{
        return 0;
}

#endif

/* br_vlan_options.c */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
/* Declarations available with CONFIG_BRIDGE_VLAN_FILTERING (section
 * br_vlan_options.c); no stubs are provided for the disabled case.
 */
/* per-vlan options */
bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
                           const struct net_bridge_vlan *range_end);
bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
                       const struct net_bridge_port *p);
size_t br_vlan_opts_nl_size(void);
int br_vlan_process_options(const struct net_bridge *br,
                            const struct net_bridge_port *p,
                            struct net_bridge_vlan *range_start,
                            struct net_bridge_vlan *range_end,
                            struct nlattr **tb,
                            struct netlink_ext_ack *extack);
/* global vlan options */
int br_vlan_rtm_process_global_options(struct net_device *dev,
                                       const struct nlattr *attr,
                                       int cmd,
                                       struct netlink_ext_ack *extack);
bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
                                         const struct net_bridge_vlan *r_end);
bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
                              const struct net_bridge_vlan *v_opts);

/* vlan state manipulation helpers using *_ONCE to annotate lock-free access,
 * while br_vlan_set_state() may access data protected by multicast_lock.
 */
/* Read @v's state with READ_ONCE(). */
static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v)
{
        return READ_ONCE(v->state);
}

/* Store @state in @v with WRITE_ONCE(), then let the multicast code update
 * the vlan's multicast context for the new state.
 */
static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state)
{
        WRITE_ONCE(v->state, state);
        br_multicast_update_vlan_mcast_ctx(v, state);
}

/* Read @vg's pvid_state with READ_ONCE(). */
static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg)
{
        return READ_ONCE(vg->pvid_state);
}

/* Store @state as @vg's pvid_state with WRITE_ONCE(). */
static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg,
                                          u8 state)
{
        WRITE_ONCE(vg->pvid_state, state);
}

/* Decide whether traffic may pass for a vlan in @state.
 *
 * BR_STATE_FORWARDING always passes, BR_STATE_LEARNING passes only when
 * @learn_allow is set (true at ingress, false at egress), and every other
 * state is refused.
 */
static inline bool br_vlan_state_allowed(u8 state, bool learn_allow)
{
        if (state == BR_STATE_FORWARDING)
                return true;

        if (state == BR_STATE_LEARNING)
                return learn_allow;

        return false;
}
#endif

/* br_mst.c */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
DECLARE_STATIC_KEY_FALSE(br_mst_used);
static inline bool br_mst_is_enabled(const struct net_bridge_port *p)
{
        /* check the port's vlan group to avoid racing with port deletion */
        return static_branch_unlikely(&br_mst_used) &&
               br_opt_get(p->br, BROPT_MST_ENABLED) &&
               rcu_access_pointer(p->vlgrp);
}

/* Declarations available with CONFIG_BRIDGE_VLAN_FILTERING (section
 * br_mst.c). The #else branch below stubs all of them except
 * br_mst_vlan_set_msti() and br_mst_vlan_init_state().
 */
int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state,
                     struct netlink_ext_ack *extack);
int br_mst_vlan_set_msti(struct net_bridge_vlan *v, u16 msti);
void br_mst_vlan_init_state(struct net_bridge_vlan *v);
int br_mst_set_enabled(struct net_bridge *br, bool on,
                       struct netlink_ext_ack *extack);
size_t br_mst_info_size(const struct net_bridge_vlan_group *vg);
int br_mst_fill_info(struct sk_buff *skb,
                     const struct net_bridge_vlan_group *vg);
int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr,
                   struct netlink_ext_ack *extack);
void br_mst_uninit(struct net_bridge *br);
#else
/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: MST is never enabled. */
static inline bool br_mst_is_enabled(const struct net_bridge_port *p)
{
        return false;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: returns -EOPNOTSUPP. */
static inline int br_mst_set_state(struct net_bridge_port *p, u16 msti,
                                   u8 state, struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: returns -EOPNOTSUPP. */
static inline int br_mst_set_enabled(struct net_bridge *br, bool on,
                                     struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no MST info, so size 0. */
static inline size_t br_mst_info_size(const struct net_bridge_vlan_group *vg)
{
        return 0;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: writes nothing to @skb and
 * returns -EOPNOTSUPP.
 */
static inline int br_mst_fill_info(struct sk_buff *skb,
                                   const struct net_bridge_vlan_group *vg)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: returns -EOPNOTSUPP. */
static inline int br_mst_process(struct net_bridge_port *p,
                                 const struct nlattr *mst_attr,
                                 struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_VLAN_FILTERING: no-op. */
static inline void br_mst_uninit(struct net_bridge *br)
{
}
#endif

/* Hook table for bridge netfilter integration. */
struct nf_br_ops {
        /* hook taking the skb on the bridge device xmit path (per its name;
         * return value semantics are defined by the caller - not shown here)
         */
        int (*br_dev_xmit_hook)(struct sk_buff *skb);
};
/* RCU-annotated pointer to the currently installed hook table */
extern const struct nf_br_ops __rcu *nf_br_ops;

/* br_netfilter.c */
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* Declarations available when CONFIG_BRIDGE_NETFILTER is enabled; the
 * #else branch supplies do-nothing replacements.
 */
int br_nf_core_init(void);
void br_nf_core_fini(void);
void br_netfilter_rtable_init(struct net_bridge *br);
#else
/* Stub for !CONFIG_BRIDGE_NETFILTER: nothing to initialise; returns 0. */
static inline int br_nf_core_init(void)
{
        return 0;
}
/* Stub for !CONFIG_BRIDGE_NETFILTER: no-op. */
static inline void br_nf_core_fini(void)
{
}
/* Stub for !CONFIG_BRIDGE_NETFILTER: no-op.
 *
 * Written as an inline function rather than an empty macro so that the
 * argument is still evaluated and type-checked in this configuration.
 */
static inline void br_netfilter_rtable_init(struct net_bridge *br)
{
}
#endif

/* br_stp.c */
/* Functions declared for the br_stp.c section. */
void br_set_state(struct net_bridge_port *p, unsigned int state);
struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no);
void br_init_port(struct net_bridge_port *p);
void br_become_designated_port(struct net_bridge_port *p);

/* bridge timing parameter setters */
void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
int br_set_forward_delay(struct net_bridge *br, unsigned long x);
int br_set_hello_time(struct net_bridge *br, unsigned long x);
int br_set_max_age(struct net_bridge *br, unsigned long x);
int __set_ageing_time(struct net_device *dev, unsigned long t);
int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time);


/* br_stp_if.c */
/* Functions declared for the br_stp_if.c section. */
void br_stp_enable_bridge(struct net_bridge *br);
void br_stp_disable_bridge(struct net_bridge *br);
int br_stp_set_enabled(struct net_bridge *br, unsigned long val,
                       struct netlink_ext_ack *extack);
void br_stp_enable_port(struct net_bridge_port *p);
void br_stp_disable_port(struct net_bridge_port *p);
bool br_stp_recalculate_bridge_id(struct net_bridge *br);
void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio);
int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio);
int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost);
/* formats @id into @buf; returns ssize_t */
ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id);

/* br_stp_bpdu.c */
/* forward declaration: only a pointer to struct stp_proto is needed here */
struct stp_proto;
void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
                struct net_device *dev);

/* br_stp_timer.c */
/* Functions declared for the br_stp_timer.c section. */
void br_stp_timer_init(struct net_bridge *br);
void br_stp_port_timer_init(struct net_bridge_port *p);
unsigned long br_timer_value(const struct timer_list *timer);

/* br.c */
#if IS_ENABLED(CONFIG_ATM_LANE)
/* function pointer hook, only declared when CONFIG_ATM_LANE is enabled */
extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr);
#endif

/* br_mrp.c */
#if IS_ENABLED(CONFIG_BRIDGE_MRP)
/* Declarations available with CONFIG_BRIDGE_MRP; the #else branch supplies
 * inline stubs for all of them.
 */
int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
                 struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
bool br_mrp_enabled(struct net_bridge *br);
void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p);
int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br);
#else
/* Stub for !CONFIG_BRIDGE_MRP: returns -EOPNOTSUPP. */
static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
                               struct nlattr *attr, int cmd,
                               struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_MRP: MRP is never enabled. */
static inline bool br_mrp_enabled(struct net_bridge *br)
{
        return false;
}

/* Stub for !CONFIG_BRIDGE_MRP: no-op. */
static inline void br_mrp_port_del(struct net_bridge *br,
                                   struct net_bridge_port *p)
{
}

/* Stub for !CONFIG_BRIDGE_MRP: writes nothing to @skb and returns 0. */
static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br)
{
        return 0;
}

#endif

/* br_cfm.c */
#if IS_ENABLED(CONFIG_BRIDGE_CFM)
/* Declarations available with CONFIG_BRIDGE_CFM; the #else branch supplies
 * inline stubs for all of them.
 */
int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
                 struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
bool br_cfm_created(struct net_bridge *br);
void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p);
int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br);
int br_cfm_status_fill_info(struct sk_buff *skb,
                            struct net_bridge *br,
                            bool getlink);
/* counters are returned through @count */
int br_cfm_mep_count(struct net_bridge *br, u32 *count);
int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count);
#else
/* Stub for !CONFIG_BRIDGE_CFM: returns -EOPNOTSUPP. */
static inline int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
                               struct nlattr *attr, int cmd,
                               struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_CFM: always returns false. */
static inline bool br_cfm_created(struct net_bridge *br)
{
        return false;
}

/* Stub for !CONFIG_BRIDGE_CFM: no-op. */
static inline void br_cfm_port_del(struct net_bridge *br,
                                   struct net_bridge_port *p)
{
}

/* Stub for !CONFIG_BRIDGE_CFM: writes nothing to @skb; returns -EOPNOTSUPP. */
static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_CFM: writes nothing to @skb; returns -EOPNOTSUPP. */
static inline int br_cfm_status_fill_info(struct sk_buff *skb,
                                          struct net_bridge *br,
                                          bool getlink)
{
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_CFM: stores 0 in @count and returns -EOPNOTSUPP. */
static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count)
{
        *count = 0;
        return -EOPNOTSUPP;
}

/* Stub for !CONFIG_BRIDGE_CFM: stores 0 in @count and returns -EOPNOTSUPP. */
static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count)
{
        *count = 0;
        return -EOPNOTSUPP;
}
#endif

/* br_netlink.c */
/* Objects and functions declared for the br_netlink.c section. */
extern struct rtnl_link_ops br_link_ops;
int br_netlink_init(void);
void br_netlink_fini(void);
/* notification helpers */
void br_ifinfo_notify(int event, const struct net_bridge *br,
                      const struct net_bridge_port *port);
void br_info_notify(int event, const struct net_bridge *br,
                    const struct net_bridge_port *port, u32 filter);
/* link message handlers */
int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags,
               struct netlink_ext_ack *extack);
int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
               u32 filter_mask, int nlflags);
/* @vinfo_last and @changed are output/in-out pointers */
int br_process_vlan_info(struct net_bridge *br,
                         struct net_bridge_port *p, int cmd,
                         struct bridge_vlan_info *vinfo_curr,
                         struct bridge_vlan_info **vinfo_last,
                         bool *changed,
                         struct netlink_ext_ack *extack);

#ifdef CONFIG_SYSFS
/* br_sysfs_if.c */
/* only declared with CONFIG_SYSFS; no stub exists for the ops table */
extern const struct sysfs_ops brport_sysfs_ops;
int br_sysfs_addif(struct net_bridge_port *p);
int br_sysfs_renameif(struct net_bridge_port *p);

/* br_sysfs_br.c */
int br_sysfs_addbr(struct net_device *dev);
void br_sysfs_delbr(struct net_device *dev);

#else

/* Stub for !CONFIG_SYSFS: nothing to create for @p; returns 0. */
static inline int br_sysfs_addif(struct net_bridge_port *p)
{
        return 0;
}
/* Stub for !CONFIG_SYSFS: nothing to rename for @p; returns 0. */
static inline int br_sysfs_renameif(struct net_bridge_port *p)
{
        return 0;
}
/* Stub for !CONFIG_SYSFS: nothing to create for @dev; returns 0. */
static inline int br_sysfs_addbr(struct net_device *dev)
{
        return 0;
}
/* Stub for !CONFIG_SYSFS: no-op. */
static inline void br_sysfs_delbr(struct net_device *dev)
{
}
#endif /* CONFIG_SYSFS */

/* br_switchdev.c */
#ifdef CONFIG_NET_SWITCHDEV
/* Declarations available with CONFIG_NET_SWITCHDEV (section
 * br_switchdev.c); the #else branch supplies inline stubs.
 */
/* port offload / unoffload / replay */
int br_switchdev_port_offload(struct net_bridge_port *p,
                              struct net_device *dev, const void *ctx,
                              struct notifier_block *atomic_nb,
                              struct notifier_block *blocking_nb,
                              bool tx_fwd_offload,
                              struct netlink_ext_ack *extack);

void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
                                 struct notifier_block *atomic_nb,
                                 struct notifier_block *blocking_nb);

int br_switchdev_port_replay(struct net_bridge_port *p,
                             struct net_device *dev, const void *ctx,
                             struct notifier_block *atomic_nb,
                             struct notifier_block *blocking_nb,
                             struct netlink_ext_ack *extack);

/* per-frame marking helpers */
bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb);

void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb);

void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
                                             struct sk_buff *skb);
void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
                                              struct sk_buff *skb);
void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
                              struct sk_buff *skb);
bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
                                  const struct sk_buff *skb);
/* port flags, fdb/mdb notification and vlan objects */
int br_switchdev_set_port_flag(struct net_bridge_port *p,
                               unsigned long flags,
                               unsigned long mask,
                               struct netlink_ext_ack *extack);
void br_switchdev_fdb_notify(struct net_bridge *br,
                             const struct net_bridge_fdb_entry *fdb, int type);
void br_switchdev_mdb_notify(struct net_device *dev,
                             struct net_bridge_mdb_entry *mp,
                             struct net_bridge_port_group *pg,
                             int type);
int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
                               bool changed, struct netlink_ext_ack *extack);
int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
void br_switchdev_init(struct net_bridge *br);

/* Clear @skb's offload_fwd_mark. */
static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
{
        skb->offload_fwd_mark = 0;
}
#else
/* Stub for !CONFIG_NET_SWITCHDEV: offloading is unavailable; returns
 * -EOPNOTSUPP.
 */
static inline int
br_switchdev_port_offload(struct net_bridge_port *p,
                          struct net_device *dev, const void *ctx,
                          struct notifier_block *atomic_nb,
                          struct notifier_block *blocking_nb,
                          bool tx_fwd_offload,
                          struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Fallback stub: there is nothing to undo, so this is a no-op. */
static inline void br_switchdev_port_unoffload(struct net_bridge_port *p,
                                               const void *ctx,
                                               struct notifier_block *atomic_nb,
                                               struct notifier_block *blocking_nb)
{
}

/*
 * Fallback stub for the #else branch of the CONFIG_NET_SWITCHDEV conditional.
 *
 * Return: always -EOPNOTSUPP; no argument is examined.
 */
static inline int br_switchdev_port_replay(struct net_bridge_port *p,
                                           struct net_device *dev,
                                           const void *ctx,
                                           struct notifier_block *atomic_nb,
                                           struct notifier_block *blocking_nb,
                                           struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/* Fallback stub: reports that no frame uses TX forwarding offload. */
static inline bool
br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb)
{
        return false;
}

/* Fallback stub: leaves @skb untouched. */
static inline void
br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb)
{
}

/* Fallback stub: no marking is performed on @skb. */
static inline void nbp_switchdev_frame_mark_tx_fwd_offload(
                const struct net_bridge_port *p, struct sk_buff *skb)
{
}

/* Fallback stub: no marking is performed on @skb. */
static inline void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(
                const struct net_bridge_port *p, struct sk_buff *skb)
{
}

/* Fallback stub: no marking is performed on @skb. */
static inline void
nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb)
{
}

/*
 * Fallback stub: egress is never vetoed here, so every (port, skb) pair is
 * reported as allowed.
 */
static inline bool
nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
                             const struct sk_buff *skb)
{
        return true;
}

/*
 * Fallback stub: there is nothing to program, so any flag/mask combination
 * is accepted.
 *
 * Return: always 0.
 */
static inline int
br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags,
                           unsigned long mask, struct netlink_ext_ack *extack)
{
        return 0;
}

/*
 * Fallback stub for the #else branch of the CONFIG_NET_SWITCHDEV conditional.
 *
 * Return: always -EOPNOTSUPP; no argument is examined.
 */
static inline int
br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
                           bool changed, struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

/*
 * Fallback stub for the #else branch of the CONFIG_NET_SWITCHDEV conditional.
 *
 * Return: always -EOPNOTSUPP; no argument is examined.
 */
static inline int
br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
{
        return -EOPNOTSUPP;
}

/* Fallback stub: no FDB notification is emitted. */
static inline void br_switchdev_fdb_notify(struct net_bridge *br,
                                           const struct net_bridge_fdb_entry *fdb,
                                           int type)
{
}

/* Fallback stub: no MDB notification is emitted. */
static inline void
br_switchdev_mdb_notify(struct net_device *dev,
                        struct net_bridge_mdb_entry *mp,
                        struct net_bridge_port_group *pg, int type)
{
}

/* Fallback stub: @skb is left untouched. */
static inline void
br_switchdev_frame_unmark(struct sk_buff *skb)
{
}

/* Fallback stub: nothing to initialise for @br. */
static inline void
br_switchdev_init(struct net_bridge *br)
{
}

#endif /* CONFIG_NET_SWITCHDEV */

/*
 * br_arp_nd_proxy.c
 *
 * Prototypes only; the implementations are not visible from this header.
 * NOTE(review): judging by the names these cover ARP/ND neighbour
 * suppression - confirm against br_arp_nd_proxy.c.
 */
void br_recalculate_neigh_suppress_enabled(struct net_bridge *br);
void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
                              u16 vid, struct net_bridge_port *p);
void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
                       u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *m);
bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid);
#endif



























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Queue of folios definitions
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * See:
 *
 *        Documentation/core-api/folio_queue.rst
 *
 * for a description of the API.
 */

#ifndef _LINUX_FOLIO_QUEUE_H
#define _LINUX_FOLIO_QUEUE_H

#include <linux/pagevec.h>
#include <linux/mm.h>

/*
 * Segment in a queue of running buffers.  Each segment can hold a number of
 * folios and a portion of the queue can be referenced with the ITER_FOLIOQ
 * iterator.  The possibility exists of inserting non-folio elements into the
 * queue (such as gaps).
 *
 * Explicit prev and next pointers are used instead of a list_head to make it
 * easier to add segments to tail and remove them from the head without the
 * need for a lock.
 *
 * Each set of marks is a per-slot bitmap held in a single unsigned long, which
 * is why the build is made to fail if PAGEVEC_SIZE exceeds BITS_PER_LONG.
 */
struct folio_queue {
        struct folio_batch        vec;                /* Folios in the queue segment */
        u8                        orders[PAGEVEC_SIZE]; /* Order of each folio */
        struct folio_queue        *next;                /* Next queue segment or NULL */
        struct folio_queue        *prev;                /* Previous queue segment or NULL */
        unsigned long                marks;                /* 1-bit mark per folio */
        unsigned long                marks2;                /* Second 1-bit mark per folio */
#if PAGEVEC_SIZE > BITS_PER_LONG
#error marks is not big enough
#endif
        unsigned int                rreq_id;        /* Request ID for tracelines, set by folioq_init() */
        unsigned int                debug_id;        /* Zeroed by folioq_init(); presumably a debug ID - TODO confirm */
};

/**
 * folioq_init - Initialise a folio queue segment
 * @folioq: The segment to initialise
 * @rreq_id: The request identifier to use in tracelines.
 *
 * Put a segment into its empty, unlinked state: both neighbour pointers are
 * NULL, both mark bitmaps are clear, the debug ID is zero and the embedded
 * folio batch is reset.  @rreq_id is recorded for use in traces.
 *
 * Note that the folio pointers are left uninitialised.
 */
static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id)
{
        /* Identification */
        folioq->rreq_id = rreq_id;
        folioq->debug_id = 0;

        /* Not yet linked into any queue */
        folioq->prev = NULL;
        folioq->next = NULL;

        /* No slot carries a mark */
        folioq->marks2 = 0;
        folioq->marks = 0;

        folio_batch_init(&folioq->vec);
}

/**
 * folioq_nr_slots - Query the capacity of a folio queue segment
 * @folioq: The segment to query
 *
 * Report how many folios the given segment is able to hold.
 * [!] NOTE: This must not be assumed to be the same for every segment!
 *
 * Return: The number of slots in the segment (currently PAGEVEC_SIZE).
 */
static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
{
        const unsigned int capacity = PAGEVEC_SIZE;

        return capacity;
}

/**
 * folioq_count - Query the occupancy of a folio queue segment
 * @folioq: The segment to query
 *
 * Report how many folios have been added to the segment, as counted by the
 * embedded folio batch.  Note that this is not decreased as folios are
 * removed from a segment.
 *
 * Return: The number of slots that have been filled.
 */
static inline unsigned int folioq_count(struct folio_queue *folioq)
{
        unsigned int occupied = folio_batch_count(&folioq->vec);

        return occupied;
}

/**
 * folioq_full - Query if a folio queue segment is full
 * @folioq: The segment to query
 *
 * Compare the occupancy of the segment against its capacity.  Note that this
 * does not change if folios are removed from a segment.
 *
 * Return: true if every slot of the segment has been filled.
 */
static inline bool folioq_full(struct folio_queue *folioq)
{
        unsigned int occupied = folioq_count(folioq);
        unsigned int capacity = folioq_nr_slots(folioq);

        return occupied >= capacity;
}

/**
 * folioq_is_marked - Check first folio mark in a folio queue segment
 * @folioq: The segment to query
 * @slot: The slot number of the folio to query
 *
 * Return: true if the bit for @slot is set in the segment's first mark
 * bitmap.
 */
static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot)
{
        const unsigned long *first_marks = &folioq->marks;

        return test_bit(slot, first_marks);
}

/**
 * folioq_mark - Set the first mark on a folio in a folio queue segment
 * @folioq: The segment to modify
 * @slot: The slot number of the folio to modify
 *
 * Set the bit for @slot in the segment's first mark bitmap.
 */
static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot)
{
        unsigned long *first_marks = &folioq->marks;

        set_bit(slot, first_marks);
}

/**
 * folioq_unmark - Clear the first mark on a folio in a folio queue segment
 * @folioq: The segment to modify
 * @slot: The slot number of the folio to modify
 *
 * Clear the bit for @slot in the segment's first mark bitmap.
 */
static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot)
{
        unsigned long *first_marks = &folioq->marks;

        clear_bit(slot, first_marks);
}

/**
 * folioq_is_marked2 - Check second folio mark in a folio queue segment
 * @folioq: The segment to query
 * @slot: The slot number of the folio to query
 *
 * Return: true if the bit for @slot is set in the segment's second mark
 * bitmap.
 */
static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot)
{
        const unsigned long *second_marks = &folioq->marks2;

        return test_bit(slot, second_marks);
}

/**
 * folioq_mark2 - Set the second mark on a folio in a folio queue segment
 * @folioq: The segment to modify
 * @slot: The slot number of the folio to modify
 *
 * Set the bit for @slot in the segment's second mark bitmap.
 */
static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot)
{
        unsigned long *second_marks = &folioq->marks2;

        set_bit(slot, second_marks);
}

/**
 * folioq_unmark2 - Clear the second mark on a folio in a folio queue segment
 * @folioq: The segment to modify
 * @slot: The slot number of the folio to modify
 *
 * Clear the bit for @slot in the segment's second mark bitmap.
 */
static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot)
{
        unsigned long *second_marks = &folioq->marks2;

        clear_bit(slot, second_marks);
}

/**
 * folioq_append - Add a folio to a folio queue segment
 * @folioq: The segment to add to
 * @folio: The folio to add
 *
 * Place @folio in the next unused slot of the segment and bump the occupancy
 * count.  The folio's order is recorded alongside it; neither set of marks is
 * modified.
 *
 * Note that it's left up to the caller to check that the segment capacity will
 * not be exceeded and to extend the queue.
 *
 * Return: The slot number that the folio was stored in.
 */
static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio)
{
        unsigned int slot;

        /* Claim the next free slot before filling it in. */
        slot = folioq->vec.nr;
        folioq->vec.nr++;

        folioq->vec.folios[slot] = folio;
        folioq->orders[slot] = folio_order(folio);
        return slot;
}

/**
 * folioq_append_mark - Add a folio to a folio queue segment and mark it
 * @folioq: The segment to add to
 * @folio: The folio to add
 *
 * Place @folio in the next unused slot of the segment and bump the occupancy
 * count.  The folio's order is recorded alongside it, the first mark is set
 * for the slot and the second mark is left unmodified.
 *
 * Note that it's left up to the caller to check that the segment capacity will
 * not be exceeded and to extend the queue.
 *
 * Return: The slot number that the folio was stored in.
 */
static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio)
{
        unsigned int slot;

        /* Claim the next free slot before filling it in. */
        slot = folioq->vec.nr;
        folioq->vec.nr++;

        folioq->vec.folios[slot] = folio;
        folioq->orders[slot] = folio_order(folio);

        folioq_mark(folioq, slot);
        return slot;
}

/**
 * folioq_folio - Get a folio from a folio queue segment
 * @folioq: The segment to access
 * @slot: The folio slot to access
 *
 * Read the folio pointer stored in @slot.  No bounds check is made, and a
 * slot that has not been added into yet yields an undefined pointer.
 *
 * Return: The stored folio pointer, or NULL if the slot has been cleared with
 * folioq_clear().
 */
static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot)
{
        struct folio *stored = folioq->vec.folios[slot];

        return stored;
}

/**
 * folioq_folio_order - Get the order of a folio from a folio queue segment
 * @folioq: The segment to access
 * @slot: The folio slot to access
 *
 * Read the order that was recorded for @slot when a folio was added.  No
 * bounds check is made.  For a slot that has not been added into yet the
 * value is whatever orders[] holds (0 if the segment started out zeroed).
 *
 * Return: The recorded folio order.
 */
static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot)
{
        unsigned int order = folioq->orders[slot];

        return order;
}

/**
 * folioq_folio_size - Get the size of a folio from a folio queue segment
 * @folioq: The segment to access
 * @slot: The folio slot to access
 *
 * Convert the order recorded for @slot into a size in bytes.  No bounds check
 * is made; a recorded order of 0 gives PAGE_SIZE.
 *
 * Return: PAGE_SIZE shifted left by the recorded order.
 */
static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot)
{
        unsigned int order = folioq_folio_order(folioq, slot);

        return PAGE_SIZE << order;
}

/**
 * folioq_clear - Clear a folio from a folio queue segment
 * @folioq: The segment to clear
 * @slot: The folio slot to clear
 *
 * Set the folio pointer in @slot to NULL and clear both of the slot's marks.
 * The recorded order and the occupancy count are left unchanged.
 */
static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot)
{
        struct folio **entry = &folioq->vec.folios[slot];

        *entry = NULL;
        folioq_unmark(folioq, slot);
        folioq_unmark2(folioq, slot);
}

#endif /* _LINUX_FOLIO_QUEUE_H */








































  318 















  318 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Skb ref helpers.
 *
 */

#ifndef _LINUX_SKBUFF_REF_H
#define _LINUX_SKBUFF_REF_H

#include <linux/skbuff.h>

/**
 * __skb_frag_ref - take an additional reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Looks up the netmem backing @frag and takes one more reference on it.
 */
static inline void
__skb_frag_ref(skb_frag_t *frag)
{
        get_netmem(skb_frag_netmem(frag));
}

/**
 * skb_frag_ref - take an addition reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset.
 *
 * Takes an additional reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_ref(struct sk_buff *skb, int f)
{
        __skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}

bool napi_pp_put_page(netmem_ref netmem);

/*
 * Drop one reference on @netmem.
 *
 * With CONFIG_PAGE_POOL enabled and @recycle set, napi_pp_put_page() is tried
 * first; if it returns true nothing more is done.  In every other case the
 * reference is released with put_netmem().
 */
static inline void skb_page_unref(netmem_ref netmem, bool recycle)
{
#ifdef CONFIG_PAGE_POOL
        if (recycle) {
                if (napi_pp_put_page(netmem))
                        return;
        }
#endif
        put_netmem(netmem);
}

/**
 * __skb_frag_unref - release a reference on a paged fragment.
 * @frag: the paged fragment
 * @recycle: recycle the page if allocated via page_pool
 *
 * Hands the netmem backing @frag to skb_page_unref(), which either recycles
 * it via the page_pool API or releases the reference.
 */
static inline void
__skb_frag_unref(skb_frag_t *frag, bool recycle)
{
        skb_page_unref(skb_frag_netmem(frag), recycle);
}

/**
 * skb_frag_unref - release a reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: index of the fragment in the buffer's frags[] array
 *
 * Releases a reference on the @f'th paged fragment of @skb, passing the skb's
 * pp_recycle setting along.  Nothing is released when skb_zcopy_managed()
 * reports true for @skb.
 */
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
        struct skb_shared_info *shinfo;

        if (skb_zcopy_managed(skb))
                return;

        shinfo = skb_shinfo(skb);
        __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}

#endif        /* _LINUX_SKBUFF_REF_H */





























































































































































































































































































































































































































































  318 












  316 






  319 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_FLOW_DISSECTOR_H
#define _NET_FLOW_DISSECTOR_H

#include <linux/types.h>
#include <linux/in6.h>
#include <linux/siphash.h>
#include <linux/string.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/pkt_cls.h>

struct bpf_prog;
struct net;
struct sk_buff;

/**
 * struct flow_dissector_key_control:
 * @thoff:     Transport header offset
 * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_*
 * @flags:     Key flags.
 *             Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAG|ENCAPSULATION|F_*),
 *             see enum flow_dissector_ctrl_flags.
 */
struct flow_dissector_key_control {
        u16        thoff;
        u16        addr_type;
        u32        flags;
};

/* The control flags are kept in sync with TCA_FLOWER_KEY_FLAGS_*, as those
 * flags are exposed to userspace in some error paths, i.e. unsupported flags.
 *
 * These are the values stored in flow_dissector_key_control.flags.
 */
enum flow_dissector_ctrl_flags {
        FLOW_DIS_IS_FRAGMENT                = TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT,
        FLOW_DIS_FIRST_FRAG                = TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
        FLOW_DIS_F_TUNNEL_CSUM                = TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM,
        FLOW_DIS_F_TUNNEL_DONT_FRAGMENT        = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT,
        FLOW_DIS_F_TUNNEL_OAM                = TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM,
        FLOW_DIS_F_TUNNEL_CRIT_OPT        = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT,

        /* These flags are internal to the kernel.  They start one bit above
         * TCA_FLOWER_KEY_FLAGS_MAX, presumably so that they can never collide
         * with a userspace-visible flag.
         */
        FLOW_DIS_ENCAPSULATION                = (TCA_FLOWER_KEY_FLAGS_MAX << 1),
};

/* Result codes of the flow dissector.
 *
 * NOTE(review): the consumers are not visible from this header; going by the
 * names, OUT_GOOD/OUT_BAD end the dissection, the *_AGAIN values request
 * another pass at the protocol / IP protocol level and CONTINUE carries on -
 * confirm against the dissector implementation.
 */
enum flow_dissect_ret {
        FLOW_DISSECT_RET_OUT_GOOD,
        FLOW_DISSECT_RET_OUT_BAD,
        FLOW_DISSECT_RET_PROTO_AGAIN,
        FLOW_DISSECT_RET_IPPROTO_AGAIN,
        FLOW_DISSECT_RET_CONTINUE,
};

/**
 * struct flow_dissector_key_basic:
 * @n_proto:  Network header protocol (eg. IPv4/IPv6)
 * @ip_proto: Transport header protocol (eg. TCP/UDP)
 * @padding:  Unused; explicit filler after the two protocol fields
 */
struct flow_dissector_key_basic {
        __be16        n_proto;
        u8        ip_proto;
        u8        padding;
};

/**
 * struct flow_dissector_key_tags:
 * @flow_label: flow label; a non-zero value makes flow_keys_have_l4() report
 *              true
 */
struct flow_dissector_key_tags {
        u32        flow_label;
};

/**
 * struct flow_dissector_key_vlan:
 * @vlan_id:       12-bit field
 * @vlan_dei:      1-bit field
 * @vlan_priority: 3-bit field
 * @vlan_tci:      16-bit value sharing storage (union) with the three
 *                 bitfields above
 * @vlan_tpid:     VLAN TPID, big-endian
 * @vlan_eth_type: VLAN Ethernet type, big-endian
 * @padding:       explicit filler
 */
struct flow_dissector_key_vlan {
        union {
                struct {
                        u16        vlan_id:12,
                                vlan_dei:1,
                                vlan_priority:3;
                };
                __be16        vlan_tci;
        };
        __be16        vlan_tpid;
        __be16        vlan_eth_type;
        u16        padding;
};

/**
 * struct flow_dissector_mpls_lse: one MPLS Label Stack Entry, packed into 32
 *                                 bits of bitfields
 * @mpls_ttl:   8-bit field
 * @mpls_bos:   1-bit field
 * @mpls_tc:    3-bit field
 * @mpls_label: 20-bit field
 */
struct flow_dissector_mpls_lse {
        u32        mpls_ttl:8,
                mpls_bos:1,
                mpls_tc:3,
                mpls_label:20;
};

/* Number of Label Stack Entries that can be recorded; small enough for the
 * u8 used_lses bitmask to hold one bit per entry.
 */
#define FLOW_DIS_MPLS_MAX 7
/**
 * struct flow_dissector_key_mpls:
 * @ls:        Label Stack, up to FLOW_DIS_MPLS_MAX entries
 * @used_lses: bitmask with one bit set for each Label Stack Entry in use;
 *             bits are set with dissector_set_mpls_lse()
 */
struct flow_dissector_key_mpls {
        struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */
        u8 used_lses; /* One bit set for each Label Stack Entry in use */
};

/**
 * dissector_set_mpls_lse - flag a Label Stack Entry as being in use
 * @mpls:      MPLS key whose used_lses bitmask is updated
 * @lse_index: index of the Label Stack Entry; selects the bit to set
 *
 * Sets bit @lse_index in @mpls->used_lses.  No range check is made on
 * @lse_index.
 */
static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls,
                                          int lse_index)
{
        const int lse_bit = 1 << lse_index;

        mpls->used_lses |= lse_bit;
}

/* Size of the tunnel option buffer; also the largest value the u8 @len field
 * of struct flow_dissector_key_enc_opts can express.
 */
#define FLOW_DIS_TUN_OPTS_MAX 255
/**
 * struct flow_dissector_key_enc_opts:
 * @data: tunnel option data
 * @len: length of tunnel option data
 * @dst_opt_type: tunnel option type
 */
struct flow_dissector_key_enc_opts {
        u8 data[FLOW_DIS_TUN_OPTS_MAX];        /* Using IP_TUNNEL_OPTS_MAX is desired
                                         * here but seems difficult to #include
                                         */
        u8 len;
        u32 dst_opt_type;
};

/**
 * struct flow_dissector_key_keyid:
 * @keyid: key identifier, big-endian
 *
 * According to the comments in enum flow_dissector_key_id this layout is
 * shared by FLOW_DISSECTOR_KEY_GRE_KEYID, FLOW_DISSECTOR_KEY_MPLS_ENTROPY and
 * FLOW_DISSECTOR_KEY_ENC_KEYID.
 */
struct flow_dissector_key_keyid {
        __be32        keyid;
};

/**
 * struct flow_dissector_key_ipv4_addrs:
 * @src: source ip address
 * @dst: destination ip address
 */
struct flow_dissector_key_ipv4_addrs {
        /* (src,dst) must be grouped, in the same way as in the IP header */
        __be32 src;
        __be32 dst;
};

/**
 * struct flow_dissector_key_ipv6_addrs:
 * @src: source ip address
 * @dst: destination ip address
 */
struct flow_dissector_key_ipv6_addrs {
        /* (src,dst) must be grouped, in the same way as in the IP header */
        struct in6_addr src;
        struct in6_addr dst;
};

/**
 * struct flow_dissector_key_tipc:
 * @key: source node address combined with selector (big-endian)
 */
struct flow_dissector_key_tipc {
        __be32 key;
};

/**
 * struct flow_dissector_key_addrs:
 * @v4addrs: IPv4 addresses
 * @v6addrs: IPv6 addresses
 * @tipckey: TIPC key
 *
 * The three members share storage in an anonymous union, so only one of them
 * is meaningful at a time.
 * NOTE(review): which one is presumably selected by
 * flow_dissector_key_control.addr_type - confirm against the dissector.
 */
struct flow_dissector_key_addrs {
        union {
                struct flow_dissector_key_ipv4_addrs v4addrs;
                struct flow_dissector_key_ipv6_addrs v6addrs;
                struct flow_dissector_key_tipc tipckey;
        };
};

/**
 * struct flow_dissector_key_arp:
 * @sip: Sender IP address
 * @tip: Target IP address
 * @op:  Operation
 * @sha: Sender hardware address (ETH_ALEN bytes)
 * @tha: Target hardware address (ETH_ALEN bytes)
 */
struct flow_dissector_key_arp {
        __u32 sip;
        __u32 tip;
        __u8 op;
        unsigned char sha[ETH_ALEN];
        unsigned char tha[ETH_ALEN];
};

/**
 * struct flow_dissector_key_ports:
 * @ports: port numbers of Transport header; 32-bit view sharing storage
 *         (union) with @src followed by @dst
 * @src: source port number
 * @dst: destination port number
 */
struct flow_dissector_key_ports {
        union {
                __be32 ports;
                struct {
                        __be16 src;
                        __be16 dst;
                };
        };
};

/**
 * struct flow_dissector_key_ports_range
 * @tp: port number from packet
 * @tp_min: min port number in range
 * @tp_max: max port number in range
 *
 * @tp shares storage (union) with the { @tp_min, @tp_max } pair, so it
 * occupies the same bytes as @tp_min.
 */
struct flow_dissector_key_ports_range {
        union {
                struct flow_dissector_key_ports tp;
                struct {
                        struct flow_dissector_key_ports tp_min;
                        struct flow_dissector_key_ports tp_max;
                };
        };
};

/**
 * struct flow_dissector_key_icmp:
 * @type: ICMP type
 * @code: ICMP code
 * @id:   Session identifier
 *
 * @type and @code are grouped in an anonymous struct ahead of @id.
 */
struct flow_dissector_key_icmp {
        struct {
                u8 type;
                u8 code;
        };
        u16 id;
};

/**
 * struct flow_dissector_key_eth_addrs:
 * @dst: destination Ethernet address
 * @src: source Ethernet address
 */
struct flow_dissector_key_eth_addrs {
        /* (dst,src) must be grouped, in the same way as in the ETH header */
        unsigned char dst[ETH_ALEN];
        unsigned char src[ETH_ALEN];
};

/**
 * struct flow_dissector_key_tcp:
 * @flags: flags, kept as a big-endian 16-bit value
 */
struct flow_dissector_key_tcp {
        __be16 flags;
};

/**
 * struct flow_dissector_key_ip:
 * @tos: tos
 * @ttl: ttl
 *
 * According to the comments in enum flow_dissector_key_id this layout is used
 * for both FLOW_DISSECTOR_KEY_IP and FLOW_DISSECTOR_KEY_ENC_IP.
 */
struct flow_dissector_key_ip {
        __u8        tos;
        __u8        ttl;
};

/**
 * struct flow_dissector_key_meta:
 * @ingress_ifindex: ingress ifindex
 * @ingress_iftype: ingress interface type
 * @l2_miss: packet did not match an L2 entry during forwarding
 *           (stored in a u8)
 */
struct flow_dissector_key_meta {
        int ingress_ifindex;
        u16 ingress_iftype;
        u8 l2_miss;
};

/**
 * struct flow_dissector_key_ct:
 * @ct_state: conntrack state after converting with map
 * @ct_zone: conntrack zone
 * @ct_mark: conntrack mark
 * @ct_labels: conntrack labels (four 32-bit words)
 */
struct flow_dissector_key_ct {
        u16        ct_state;
        u16        ct_zone;
        u32        ct_mark;
        u32        ct_labels[4];
};

/**
 * struct flow_dissector_key_hash:
 * @hash: hash value (32 bits)
 */
struct flow_dissector_key_hash {
        u32 hash;
};

/**
 * struct flow_dissector_key_num_of_vlans:
 * @num_of_vlans: num_of_vlans value (8 bits)
 */
struct flow_dissector_key_num_of_vlans {
        u8 num_of_vlans;
};

/**
 * struct flow_dissector_key_pppoe:
 * @session_id: pppoe session id
 * @ppp_proto: ppp protocol
 * @type: pppoe eth type
 *
 * All three fields are big-endian 16-bit values.
 */
struct flow_dissector_key_pppoe {
        __be16 session_id;
        __be16 ppp_proto;
        __be16 type;
};

/**
 * struct flow_dissector_key_l2tpv3:
 * @session_id: identifier for an l2tp session (big-endian)
 */
struct flow_dissector_key_l2tpv3 {
        __be32 session_id;
};

/**
 * struct flow_dissector_key_ipsec:
 * @spi: identifier for an ipsec connection (big-endian)
 */
struct flow_dissector_key_ipsec {
        __be32 spi;
};

/**
 * struct flow_dissector_key_cfm
 * @mdl_ver: maintenance domain level (mdl) and cfm protocol version
 * @opcode: code specifying a type of cfm protocol packet
 *
 * See 802.1ag, ITU-T G.8013/Y.1731
 *         1               2
 * |7 6 5 4 3 2 1 0|7 6 5 4 3 2 1 0|
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * | mdl | version |     opcode    |
 * +-----+---------+-+-+-+-+-+-+-+-+
 */
struct flow_dissector_key_cfm {
        u8        mdl_ver;
        u8        opcode;
};

/* The mdl occupies bits 7..5 of @mdl_ver (see the diagram above); being a
 * 3-bit quantity its largest value is 7.
 */
#define FLOW_DIS_CFM_MDL_MASK GENMASK(7, 5)
#define FLOW_DIS_CFM_MDL_MAX 7

/* Identifiers of the keys a flow dissector can fill in.
 *
 * Each value doubles as a bit number in flow_dissector.used_keys (tested by
 * dissector_uses_key()) and as an index into flow_dissector.offset[] (read by
 * skb_flow_dissector_target()).  The trailing comment on each line names the
 * structure associated with that key.
 */
enum flow_dissector_key_id {
        FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
        FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
        FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
        FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
        FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
        FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
        FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */
        FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */
        FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */
        FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */
        FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
        FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
        FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
        FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */
        FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */
        FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */
        FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */
        FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */
        FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */
        FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
        FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */
        FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */
        FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */
        FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */
        FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */
        FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */
        FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */

        /* Number of key ids; sizes flow_dissector.offset[].  Keep last. */
        FLOW_DISSECTOR_KEY_MAX,
};

/* Dissector behaviour flags, one distinct bit each.
 * NOTE(review): the consumer of these flags is not visible in this header;
 * confirm their exact effect against the dissector implementation.
 */
#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG                BIT(0)
#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL        BIT(1)
#define FLOW_DISSECTOR_F_STOP_AT_ENCAP                BIT(2)
#define FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP        BIT(3)

/* Pairs a key id with the location of that key's structure inside a target
 * container structure.
 */
struct flow_dissector_key {
        enum flow_dissector_key_id key_id;
        size_t offset; /* offset of struct flow_dissector_key_*
                          in the target struct */
};

/* Describes which keys a dissector fills in and where each one lives in the
 * target container (see dissector_uses_key() and
 * skb_flow_dissector_target()).
 */
struct flow_dissector {
        unsigned long long  used_keys;
                /* each bit represents presence of one key id */
        /* byte offset of each key within the target container, by key id */
        unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
};

/* Minimal key container holding only the control and basic keys. */
struct flow_keys_basic {
        struct flow_dissector_key_control control;
        struct flow_dissector_key_basic basic;
};

/* Full key container.
 *
 * FLOW_KEYS_HASH_START_FIELD names the first member covered by
 * FLOW_KEYS_HASH_OFFSET; that member (basic) is aligned to
 * SIPHASH_ALIGNMENT.  The control member precedes it and so lies before the
 * hash offset.
 */
struct flow_keys {
        struct flow_dissector_key_control control;
#define FLOW_KEYS_HASH_START_FIELD basic
        struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT);
        struct flow_dissector_key_tags tags;
        struct flow_dissector_key_vlan vlan;
        struct flow_dissector_key_vlan cvlan;
        struct flow_dissector_key_keyid keyid;
        struct flow_dissector_key_ports ports;
        struct flow_dissector_key_icmp icmp;
        /* 'addrs' must be the last member */
        struct flow_dissector_key_addrs addrs;
};

/* Byte offset of FLOW_KEYS_HASH_START_FIELD within struct flow_keys. */
#define FLOW_KEYS_HASH_OFFSET                \
        offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD)

/* Accessors returning a big-endian 32-bit value derived from @flow; defined
 * elsewhere.
 */
__be32 flow_get_u32_src(const struct flow_keys *flow);
__be32 flow_get_u32_dst(const struct flow_keys *flow);

/* Dissector instances defined elsewhere.
 * NOTE(review): presumably these target struct flow_keys and struct
 * flow_keys_basic respectively - confirm at the definition.
 */
extern struct flow_dissector flow_keys_dissector;
extern struct flow_dissector flow_keys_basic_dissector;

/* struct flow_keys_digest:
 *
 * This structure is used to hold a digest of the full flow keys. This is a
 * larger "hash" of a flow to allow definitively matching specific flows where
 * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
 * that it can be used in CB of skb (see sch_choke for an example).
 */
#define FLOW_KEYS_DIGEST_LEN        16
struct flow_keys_digest {
        u8        data[FLOW_KEYS_DIGEST_LEN];
};

/* Fill @digest from @flow; defined elsewhere. */
void make_flow_keys_digest(struct flow_keys_digest *digest,
                           const struct flow_keys *flow);

/**
 * flow_keys_have_l4 - test whether a set of flow keys carries L4 information
 * @keys: the flow keys to inspect
 *
 * Return: true if either the combined ports value or the flow label of @keys
 * is non-zero, false if both are zero.
 */
static inline bool flow_keys_have_l4(const struct flow_keys *keys)
{
        if (keys->ports.ports)
                return true;

        return keys->tags.flow_label != 0;
}

/* Hashing helpers and ICMP key extraction; prototypes only, the
 * implementations are not visible from this header.  The _seed variant takes
 * an explicit siphash key.
 */
u32 flow_hash_from_keys(struct flow_keys *keys);
u32 flow_hash_from_keys_seed(struct flow_keys *keys,
                             const siphash_key_t *keyval);
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
                           struct flow_dissector_key_icmp *key_icmp,
                           const void *data, int thoff, int hlen);

/*
 * Return true when the bit numbered @key_id is set in
 * @flow_dissector->used_keys.
 */
static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector,
                                      enum flow_dissector_key_id key_id)
{
        return (flow_dissector->used_keys & (1ULL << key_id)) != 0;
}

/*
 * Return the address inside @target_container that lies
 * @flow_dissector->offset[@key_id] bytes past its start.
 */
static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
                                              enum flow_dissector_key_id key_id,
                                              void *target_container)
{
        char *base = target_container;

        return base + flow_dissector->offset[key_id];
}

/*
 * struct bpf_flow_dissector - bundle of pointers for the BPF flow dissector.
 *
 * @flow_keys: writable pointer to a struct bpf_flow_keys
 * @skb:       read-only socket buffer pointer
 * @data:      read-only data pointer
 * @data_end:  read-only data pointer
 *
 * NOTE(review): presumably @data/@data_end delimit the packet bytes to
 * dissect and @flow_keys receives the result -- confirm against the users.
 */
struct bpf_flow_dissector {
        struct bpf_flow_keys        *flow_keys;
        const struct sk_buff        *skb;
        const void                *data;
        const void                *data_end;
};

/*
 * Zero-fill both key structures handed in by the caller.
 */
static inline void
flow_dissector_init_keys(struct flow_dissector_key_control *key_control,
                         struct flow_dissector_key_basic *key_basic)
{
        memset(key_basic, 0, sizeof(struct flow_dissector_key_basic));
        memset(key_control, 0, sizeof(struct flow_dissector_key_control));
}

#ifdef CONFIG_BPF_SYSCALL
int flow_dissector_bpf_prog_attach_check(struct net *net,
                                         struct bpf_prog *prog);
#endif /* CONFIG_BPF_SYSCALL */

#endif





























































































  319 













   61 







































   39 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC_H
#define _ASM_X86_ATOMIC_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

/*
 * Atomic operations that C can't guarantee us.  Useful for
 * resource counting etc.
 */

/*
 * Return the current value of @v->counter, read through __READ_ONCE().
 */
static __always_inline int arch_atomic_read(const atomic_t *v)
{
        /*
         * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here,
         * it's a non-inlined function that increases binary size and stack usage.
         */
        return __READ_ONCE((v)->counter);
}

/*
 * Store @i into @v->counter through __WRITE_ONCE().
 */
static __always_inline void arch_atomic_set(atomic_t *v, int i)
{
        __WRITE_ONCE(v->counter, i);
}

/*
 * Add @i to @v->counter with a LOCK_PREFIX-ed "addl"; no value is returned.
 */
static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
        /* "+m": counter is read and written in memory; "ir": @i as immediate or register. */
        asm_inline volatile(LOCK_PREFIX "addl %1, %0"
                     : "+m" (v->counter)
                     : "ir" (i) : "memory");
}

/*
 * Subtract @i from @v->counter with a LOCK_PREFIX-ed "subl"; no value is
 * returned.
 */
static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
        /* "+m": counter is read and written in memory; "ir": @i as immediate or register. */
        asm_inline volatile(LOCK_PREFIX "subl %1, %0"
                     : "+m" (v->counter)
                     : "ir" (i) : "memory");
}

/*
 * Subtract @i from @v->counter with a LOCK_PREFIX-ed "subl" and return the
 * boolean produced by GEN_BINARY_RMWcc() for condition code "e".
 *
 * NOTE(review): presumably "e" means the result was zero -- confirm against
 * the macro definition in <asm/rmwcc.h>.
 */
static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
}
/* Self-referential define; presumably advertises the arch-specific version to generic code. */
#define arch_atomic_sub_and_test arch_atomic_sub_and_test

/*
 * Increment @v->counter by one with a LOCK_PREFIX-ed "incl"; no value is
 * returned.
 */
static __always_inline void arch_atomic_inc(atomic_t *v)
{
        asm_inline volatile(LOCK_PREFIX "incl %0"
                     : "+m" (v->counter) :: "memory");
}
/* Self-referential define; presumably advertises the arch-specific version to generic code. */
#define arch_atomic_inc arch_atomic_inc

/*
 * Decrement @v->counter by one with a LOCK_PREFIX-ed "decl"; no value is
 * returned.
 */
static __always_inline void arch_atomic_dec(atomic_t *v)
{
        asm_inline volatile(LOCK_PREFIX "decl %0"
                     : "+m" (v->counter) :: "memory");
}
/* Self-referential define; presumably advertises the arch-specific version to generic code. */
#define arch_atomic_dec arch_atomic_dec

/*
 * Decrement @v->counter with a LOCK_PREFIX-ed "decl" and return the boolean
 * produced by GEN_UNARY_RMWcc() for condition code "e".
 *
 * NOTE(review): presumably true means the counter reached zero -- confirm
 * against <asm/rmwcc.h>.
 */
static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
}
/* Self-referential define; presumably advertises the arch-specific version to generic code. */
#define arch_atomic_dec_and_test arch_atomic_dec_and_test

/*
 * Increment @v->counter with a LOCK_PREFIX-ed "incl" and return the boolean
 * produced by GEN_UNARY_RMWcc() for condition code "e".
 *
 * NOTE(review): presumably true means the counter became zero -- confirm
 * against <asm/rmwcc.h>.
 */
static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
}
/* Self-referential define; presumably advertises the arch-specific version to generic code. */
#define arch_atomic_inc_and_test arch_atomic_inc_and_test

/*
 * Add @i to @v->counter with a LOCK_PREFIX-ed "addl" and return the boolean
 * produced by GEN_BINARY_RMWcc() for condition code "s".
 *
 * NOTE(review): presumably "s" is the sign condition, i.e. true when the
 * result is negative -- confirm against <asm/rmwcc.h>.
 */
static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
}
/* Self-referential define; presumably advertises the arch-specific version to generic code. */
#define arch_atomic_add_negative arch_atomic_add_negative

/*
 * Apply xadd() with @i to @v->counter and return @i plus whatever xadd()
 * handed back.
 */
static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
{
        int fetched = xadd(&v->counter, i);

        return fetched + i;
}
#define arch_atomic_add_return arch_atomic_add_return

/* Subtract-and-return is add-and-return with the operand negated. */
#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)

/*
 * Apply xadd() with @i to @v->counter and return xadd()'s result unchanged.
 */
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
        int fetched = xadd(&v->counter, i);

        return fetched;
}
#define arch_atomic_fetch_add arch_atomic_fetch_add

/* Fetch-and-subtract is fetch-and-add with the operand negated. */
#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)

/*
 * Forward a compare-and-exchange on @v->counter (@old -> @new) to
 * arch_cmpxchg() and return its result.
 */
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int observed = arch_cmpxchg(&v->counter, old, new);

        return observed;
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg

/*
 * Forward a try-compare-and-exchange on @v->counter to arch_try_cmpxchg(),
 * passing the caller's @old pointer straight through, and return its result.
 */
static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool swapped = arch_try_cmpxchg(&v->counter, old, new);

        return swapped;
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg

/*
 * Forward an exchange of @v->counter with @new to arch_xchg() and return
 * its result.
 */
static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
{
        int previous = arch_xchg(&v->counter, new);

        return previous;
}
#define arch_atomic_xchg arch_atomic_xchg

/*
 * AND @i into @v->counter with a LOCK_PREFIX-ed "andl"; no value is returned.
 */
static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
        /* "+m": counter is read and written in memory; "ir": @i as immediate or register. */
        asm_inline volatile(LOCK_PREFIX "andl %1, %0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * Retry arch_atomic_try_cmpxchg() until it installs "snapshot & @i" into @v,
 * then return the snapshot the successful attempt was based on.
 *
 * The helper receives &snapshot, so it can update the expected value between
 * attempts (presumably it does so on failure -- confirm against
 * arch_try_cmpxchg()).
 */
static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
{
        int snapshot = arch_atomic_read(v);

        while (!arch_atomic_try_cmpxchg(v, &snapshot, snapshot & i))
                ;

        return snapshot;
}
#define arch_atomic_fetch_and arch_atomic_fetch_and

/*
 * OR @i into @v->counter with a LOCK_PREFIX-ed "orl"; no value is returned.
 */
static __always_inline void arch_atomic_or(int i, atomic_t *v)
{
        /* "+m": counter is read and written in memory; "ir": @i as immediate or register. */
        asm_inline volatile(LOCK_PREFIX "orl %1, %0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * Retry arch_atomic_try_cmpxchg() until it installs "snapshot | @i" into @v,
 * then return the snapshot the successful attempt was based on.
 *
 * The helper receives &snapshot, so it can update the expected value between
 * attempts (presumably it does so on failure -- confirm against
 * arch_try_cmpxchg()).
 */
static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
{
        int snapshot = arch_atomic_read(v);

        while (!arch_atomic_try_cmpxchg(v, &snapshot, snapshot | i))
                ;

        return snapshot;
}
#define arch_atomic_fetch_or arch_atomic_fetch_or

/*
 * XOR @i into @v->counter with a LOCK_PREFIX-ed "xorl"; no value is returned.
 */
static __always_inline void arch_atomic_xor(int i, atomic_t *v)
{
        /* "+m": counter is read and written in memory; "ir": @i as immediate or register. */
        asm_inline volatile(LOCK_PREFIX "xorl %1, %0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * Retry arch_atomic_try_cmpxchg() until it installs "snapshot ^ @i" into @v,
 * then return the snapshot the successful attempt was based on.
 *
 * The helper receives &snapshot, so it can update the expected value between
 * attempts (presumably it does so on failure -- confirm against
 * arch_try_cmpxchg()).
 */
static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
{
        int snapshot = arch_atomic_read(v);

        while (!arch_atomic_try_cmpxchg(v, &snapshot, snapshot ^ i))
                ;

        return snapshot;
}
#define arch_atomic_fetch_xor arch_atomic_fetch_xor

#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
# include <asm/atomic64_64.h>
#endif

#endif /* _ASM_X86_ATOMIC_H */








































































   15 












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 1994 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 * x86-64 work by Andi Kleen 2002
 */

#ifndef _ASM_X86_FPU_API_H
#define _ASM_X86_FPU_API_H
#include <linux/bottom_half.h>

#include <asm/fpu/types.h>

/*
 * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
 * disables preemption and softirq processing, so be careful if you intend to
 * use it for long periods of time.  Kernel-mode FPU cannot be used in all
 * contexts -- see irq_fpu_usable() for details.
 */

/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */
#define KFPU_387        _BITUL(0)        /* 387 state will be initialized */
#define KFPU_MXCSR        _BITUL(1)        /* MXCSR will be initialized */

extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
extern void kernel_fpu_end(void);
extern bool irq_fpu_usable(void);
extern void fpregs_mark_activate(void);

/*
 * Convenience entry point for callers that do not pick a mask themselves:
 * forwards to kernel_fpu_begin_mask() with a default mask.
 *
 * MXCSR initialization is always requested.  On 64-bit kernels that is all:
 * any 64-bit code that uses 387 instructions must explicitly request
 * KFPU_387.  32-bit kernel code may use 387 operations as well as SSE2, etc,
 * as long as it checks that the CPU has the required capability, so KFPU_387
 * is added there.
 */
static inline void kernel_fpu_begin(void)
{
        unsigned int kfpu_mask = KFPU_MXCSR;

#ifndef CONFIG_X86_64
        kfpu_mask |= KFPU_387;
#endif
        kernel_fpu_begin_mask(kfpu_mask);
}

/*
 * Take fpregs_lock() while editing the CPU's FPU registers or fpu->fpstate,
 * or while using the FPU in kernel mode.  A context switch will (and a
 * softirq might) save the CPU's FPU registers to fpu->fpstate.regs and set
 * TIF_NEED_FPU_LOAD, leaving the CPU's FPU registers in a random state.
 *
 * !RT kernels: local_bh_disable() protects against both preemption and soft
 * interrupts, so it is used.
 *
 * RT kernels: local_bh_disable() is not sufficient because it only
 * serializes soft interrupt related sections via a local lock and stays
 * preemptible.  Preemption is disabled instead; bottom half processing is
 * always in thread context on RT kernels, so that implicitly prevents
 * bottom half processing as well.
 */
static inline void fpregs_lock(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                local_bh_disable();
}

/*
 * Counterpart of fpregs_lock(): re-enables preemption when
 * CONFIG_PREEMPT_RT is enabled, otherwise re-enables bottom halves.
 */
static inline void fpregs_unlock(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                local_bh_enable();
}

/*
 * FPU state gets lazily restored before returning to userspace. So when in the
 * kernel, the valid FPU state may be kept in the buffer. This function will force
 * restore all the fpu state to the registers early if needed, and lock them from
 * being automatically saved/restored. Then FPU state can be modified safely in the
 * registers, before unlocking with fpregs_unlock().
 */
void fpregs_lock_and_load(void);

/*
 * With CONFIG_X86_DEBUG_FPU the check is implemented out of line; without it
 * the call compiles to an empty inline stub.
 */
#ifdef CONFIG_X86_DEBUG_FPU
extern void fpregs_assert_state_consistent(void);
#else
static inline void fpregs_assert_state_consistent(void) { }
#endif

/*
 * Load the task FPU state before returning to userspace.
 */
extern void switch_fpu_return(void);

/*
 * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
 *
 * If 'feature_name' is set then put a human-readable description of
 * the feature there as well - this can be used to print error (or success)
 * messages.
 */
extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);

/* Trap handling */
extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
extern void fpu_sync_fpstate(struct fpu *fpu);
extern void fpu_reset_from_exception_fixup(void);

/* Boot, hotplug and resume */
extern void fpu__init_cpu(void);
extern void fpu__init_system(void);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);

/*
 * With CONFIG_MATH_EMULATION the function is implemented out of line;
 * without it the call compiles to an empty inline stub.
 */
#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif

/* State tracking */
DECLARE_PER_CPU(bool, kernel_fpu_allowed);
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * Process cleanup
 *
 * fpstate_free() is implemented out of line on CONFIG_X86_64 only; on other
 * configurations it is an empty inline stub.
 */
#ifdef CONFIG_X86_64
extern void fpstate_free(struct fpu *fpu);
#else
static inline void fpstate_free(struct fpu *fpu) { }
#endif

/* fpstate-related functions which are exported to KVM */
extern void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature);

extern u64 xstate_get_guest_group_perm(void);

extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);


/* KVM specific functions */
extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest);
extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures);

/*
 * The guest XFD helpers are implemented out of line on CONFIG_X86_64 only;
 * on other configurations both are empty inline stubs.
 */
#ifdef CONFIG_X86_64
extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd);
extern void fpu_sync_guest_vmexit_xfd_state(void);
#else
static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { }
static inline void fpu_sync_guest_vmexit_xfd_state(void) { }
#endif

extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
                                           unsigned int size, u64 xfeatures, u32 pkru);
extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru);

static inline void fpstate_set_confidential(struct fpu_guest *gfpu)
{
        gfpu->fpstate->is_confidential = true;
}

static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
{
        return gfpu->fpstate->is_confidential;
}

/* prctl */
extern long fpu_xstate_prctl(int option, unsigned long arg2);

extern void fpu_idle_fpregs(void);

#endif /* _ASM_X86_FPU_API_H */






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  145 

  148 



















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xsave/xrstor support.
 *
 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
 */
#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/coredump.h>
#include <linux/sort.h>

#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>

#include <asm/cpuid/api.h>
#include <asm/msr.h>
#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>

#include <uapi/asm/elf.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

/*
 * Iterate @bit over the bits set in @mask, starting at
 * FIRST_EXTENDED_XFEATURE.
 *
 * Caution: this expands to TWO statements (an assignment followed by the
 * loop), so it must not be used as the unbraced body of an if/else/loop.
 * @mask must be an lvalue: its address is taken and its sizeof is used as
 * the bit count.
 */
#define for_each_extended_xfeature(bit, mask)                                \
        (bit) = FIRST_EXTENDED_XFEATURE;                                \
        for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))

/*
 * Human-readable xfeature names, indexed by xfeature bit number
 * (cpu_has_xfeatures() looks entries up by bit index).  The final
 * "unknown xstate feature" entry is also what out-of-range indices are
 * clamped to by cpu_has_xfeatures().
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
        "x87 floating point registers",                        /*  0 */
        "SSE registers",                                /*  1 */
        "AVX registers",                                /*  2 */
        "MPX bounds registers",                                /*  3 */
        "MPX CSR",                                        /*  4 */
        "AVX-512 opmask",                                /*  5 */
        "AVX-512 Hi256",                                /*  6 */
        "AVX-512 ZMM_Hi256",                                /*  7 */
        "Processor Trace (unused)",                        /*  8 */
        "Protection Keys User registers",                /*  9 */
        "PASID state",                                        /* 10 */
        "Control-flow User registers",                        /* 11 */
        "Control-flow Kernel registers (KVM only)",        /* 12 */
        "unknown xstate feature",                        /* 13 */
        "unknown xstate feature",                        /* 14 */
        "unknown xstate feature",                        /* 15 */
        "unknown xstate feature",                        /* 16 */
        "AMX Tile config",                                /* 17 */
        "AMX Tile data",                                /* 18 */
        "APX registers",                                /* 19 */
        "unknown xstate feature",                        /* 20: catch-all */
};

/*
 * Table associating each xfeature number with an X86_FEATURE_* bit.
 * Marked __initdata.
 *
 * NOTE(review): presumably consulted at boot to drop xfeatures whose CPUID
 * feature bit is absent -- the consumer is not visible here; confirm.
 * Several xfeatures share one feature bit (MPX, AVX512F, SHSTK, AMX_TILE).
 */
static unsigned short xsave_cpuid_features[] __initdata = {
        [XFEATURE_FP]                                = X86_FEATURE_FPU,
        [XFEATURE_SSE]                                = X86_FEATURE_XMM,
        [XFEATURE_YMM]                                = X86_FEATURE_AVX,
        [XFEATURE_BNDREGS]                        = X86_FEATURE_MPX,
        [XFEATURE_BNDCSR]                        = X86_FEATURE_MPX,
        [XFEATURE_OPMASK]                        = X86_FEATURE_AVX512F,
        [XFEATURE_ZMM_Hi256]                        = X86_FEATURE_AVX512F,
        [XFEATURE_Hi16_ZMM]                        = X86_FEATURE_AVX512F,
        [XFEATURE_PT_UNIMPLEMENTED_SO_FAR]        = X86_FEATURE_INTEL_PT,
        [XFEATURE_PKRU]                                = X86_FEATURE_OSPKE,
        [XFEATURE_PASID]                        = X86_FEATURE_ENQCMD,
        [XFEATURE_CET_USER]                        = X86_FEATURE_SHSTK,
        [XFEATURE_CET_KERNEL]                        = X86_FEATURE_SHSTK,
        [XFEATURE_XTILE_CFG]                        = X86_FEATURE_AMX_TILE,
        [XFEATURE_XTILE_DATA]                        = X86_FEATURE_AMX_TILE,
        [XFEATURE_APX]                                = X86_FEATURE_APX,
};

/*
 * Per-xfeature layout caches, indexed by xfeature number.
 *
 * Offsets and sizes start out as -1 in every slot (which is UINT_MAX in
 * these unsigned arrays) until a real value is recorded; the flags array
 * starts out zeroed.  The flag bits are the XSTATE_FLAG_* values.
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/*
 * Ordering of xstate components in uncompacted format:  The xfeature
 * number does not necessarily indicate its position in the XSAVE buffer.
 * This array defines the traversal order of xstate features.
 *
 * Each used slot holds an xfeature number; every slot is initialized to -1,
 * and the first slot still holding -1 terminates the list for the iteration
 * helpers below.
 */
static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};

/*
 * Starting at position @i of xfeature_uncompact_order[], advance until the
 * slot either holds the -1 terminator or names an xfeature whose bit is set
 * in @mask, and return that position.
 */
static inline unsigned int next_xfeature_order(unsigned int i, u64 mask)
{
        while (xfeature_uncompact_order[i] != -1 &&
               !(mask & BIT_ULL(xfeature_uncompact_order[i])))
                i++;

        return i;
}

/*
 * Iterate xstate features in uncompacted order:
 *
 * @i is a position in xfeature_uncompact_order[], NOT an xfeature number;
 * the xfeature for the current iteration is xfeature_uncompact_order[i].
 * Positions whose xfeature is not set in @mask are skipped, and the loop ends
 * at the first slot holding -1.
 */
#define for_each_extended_xfeature_in_order(i, mask)        \
        for (i = 0;                                        \
             i = next_xfeature_order(i, mask),                \
             xfeature_uncompact_order[i] != -1;                \
             i++)

/*
 * Bits stored in xstate_flags[]; tested by xfeature_is_supervisor() and
 * xfeature_is_aligned64() respectively.
 */
#define XSTATE_FLAG_SUPERVISOR        BIT(0)
#define XSTATE_FLAG_ALIGNED64        BIT(1)

/*
 * Return whether the system supports a given set of xfeatures.
 *
 * @xfeatures_needed: bitmask of the xfeatures the caller requires
 * @feature_name:     optional; when non-NULL, receives a pointer to a
 *                    human-readable name from xfeature_names[]
 *
 * Returns 1 if every bit of @xfeatures_needed is present in
 * fpu_kernel_cfg.max_features, 0 otherwise.
 *
 * The name reported is that of the most advanced (highest-numbered) missing
 * feature, or, when nothing is missing, of the most advanced requested one.
 * An empty request (@xfeatures_needed == 0) reports the generic
 * "unknown xstate feature" entry.
 */
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
        u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;

        if (unlikely(feature_name)) {
                long max_idx = ARRAY_SIZE(xfeature_names)-1;
                long xfeature_idx = max_idx;
                u64 xfeatures_print;

                /*
                 * Use FLS to pick the most advanced feature that was
                 * requested but is missing. So if a driver asks about
                 * "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
                 * missing AVX feature - this is the most informative message
                 * to users:
                 */
                if (xfeatures_missing)
                        xfeatures_print = xfeatures_missing;
                else
                        xfeatures_print = xfeatures_needed;

                /*
                 * fls64(0) is 0, which would produce index -1 and an
                 * out-of-bounds read of xfeature_names[]; keep the catch-all
                 * last entry for an empty mask.  Bits beyond the table are
                 * clamped to that same entry.
                 */
                if (xfeatures_print) {
                        xfeature_idx = fls64(xfeatures_print)-1;
                        xfeature_idx = min(xfeature_idx, max_idx);
                }

                *feature_name = xfeature_names[xfeature_idx];
        }

        if (xfeatures_missing)
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);

/* True when XSTATE_FLAG_ALIGNED64 is set in xstate_flags[@xfeature_nr]. */
static bool xfeature_is_aligned64(int xfeature_nr)
{
        unsigned int flags = xstate_flags[xfeature_nr];

        return (flags & XSTATE_FLAG_ALIGNED64) != 0;
}

/* True when XSTATE_FLAG_SUPERVISOR is set in xstate_flags[@xfeature_nr]. */
static bool xfeature_is_supervisor(int xfeature_nr)
{
        unsigned int flags = xstate_flags[xfeature_nr];

        return (flags & XSTATE_FLAG_SUPERVISOR) != 0;
}

/*
 * Return the byte offset of @xfeature inside an XSAVE buffer whose compacted
 * layout is described by @xcomp_bv.
 *
 * When X86_FEATURE_XCOMPACTED is not enabled, or @xfeature is FP/SSE, the
 * cached value from xstate_offsets[] is returned and @xcomp_bv is not used.
 * Otherwise the offset is computed by walking the extended features present
 * in @xcomp_bv and summing their sizes.  If @xfeature is not set in
 * @xcomp_bv, the loop never breaks and the value returned is the offset past
 * the last component walked.
 */
static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
{
        unsigned int offs, i;

        /*
         * Non-compacted format and legacy features use the cached fixed
         * offsets.
         */
        if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
            xfeature <= XFEATURE_SSE)
                return xstate_offsets[xfeature];

        /*
         * Compacted format offsets depend on the actual content of the
         * compacted xsave area which is determined by the xcomp_bv header
         * field.
         */
        offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
        for_each_extended_xfeature(i, xcomp_bv) {
                /* Alignment is applied before the match check so the target's own padding counts. */
                if (xfeature_is_aligned64(i))
                        offs = ALIGN(offs, 64);
                if (i == xfeature)
                        break;
                offs += xstate_sizes[i];
        }
        return offs;
}

/*
 * Enable the extended processor state save/restore feature.
 * Called once per CPU onlining.
 *
 * Does nothing unless the boot CPU has XSAVE and
 * fpu_kernel_cfg.max_features is non-zero.  Otherwise, in this order:
 * sets CR4.OSXSAVE, writes the XFD state (if XFD is enabled), programs XCR0
 * with the user feature mask, and (if XSAVES is available) programs
 * MSR_IA32_XSS with the supervisor and independent feature masks.
 */
void fpu__init_cpu_xstate(void)
{
        if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
                return;

        cr4_set_bits(X86_CR4_OSXSAVE);

        /*
         * Must happen after CR4 setup and before xsetbv() to allow KVM
         * lazy passthrough.  Write independent of the dynamic state static
         * key as that does not work on the boot CPU. This also ensures
         * that any stale state is wiped out from XFD. Reset the per CPU
         * xfd cache too.
         */
        if (cpu_feature_enabled(X86_FEATURE_XFD))
                xfd_set_state(init_fpstate.xfd);

        /*
         * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
         * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
         * states can be set here.
         */
        xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

        /*
         * MSR_IA32_XSS sets supervisor states managed by XSAVES.
         */
        if (boot_cpu_has(X86_FEATURE_XSAVES)) {
                wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
                                     xfeatures_mask_independent());
        }
}

/* Tell whether @xfeature is part of fpu_kernel_cfg.max_features. */
static bool xfeature_enabled(enum xfeature xfeature)
{
        return (fpu_kernel_cfg.max_features & BIT_ULL(xfeature)) != 0;
}

/*
 * sort() comparison callback: both arguments point at xfeature numbers,
 * which are ordered by their cached xstate_offsets[] entries.
 */
static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2)
{
        const unsigned int *lhs = xfeature1;
        const unsigned int *rhs = xfeature2;

        return xstate_offsets[*lhs] - xstate_offsets[*rhs];
}

/*
 * Cache the offset, size and flags of every xstate component of the
 * XSAVE memory layout, and build a list of the non-supervisor xfeatures
 * ordered by offset so that out-of-order offsets can be handled.
 */
static void __init setup_xstate_cache(void)
{
        u32 eax, ebx, ecx, edx, xfeature;
        u32 nr_ordered = 0;

        /*
         * FP and SSE are legacy states: they live at fixed offsets in the
         * xsave area, no matter whether it is in compacted or standard
         * form.
         */
        xstate_offsets[XFEATURE_FP]        = 0;
        xstate_sizes[XFEATURE_FP]        = offsetof(struct fxregs_state,
                                                   xmm_space);

        xstate_offsets[XFEATURE_SSE]        = xstate_sizes[XFEATURE_FP];
        xstate_sizes[XFEATURE_SSE]        = sizeof_field(struct fxregs_state,
                                                       xmm_space);

        for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) {
                cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx);

                xstate_flags[xfeature] = ecx;
                xstate_sizes[xfeature] = eax;

                /*
                 * EBX holds no valid offset for supervisor state, so the
                 * offset entry of such a feature is left untouched and the
                 * feature is kept out of the ordered list.
                 */
                if (!xfeature_is_supervisor(xfeature)) {
                        xstate_offsets[xfeature] = ebx;

                        /* Collect the feature; the list is sorted below. */
                        xfeature_uncompact_order[nr_ordered] = xfeature;
                        nr_ordered++;
                }
        }

        /*
         * Order the collected xfeatures by offset to support out-of-order
         * offsets in the uncompacted format.
         */
        sort(xfeature_uncompact_order, nr_ordered, sizeof(unsigned int),
             compare_xstate_offsets, NULL);
}

/*
 * Log one line for every xstate feature number below XFEATURE_MAX that
 * cpu_has_xfeatures() reports as present.
 */
static void __init print_xstate_features(void)
{
        int nr;

        for (nr = 0; nr < XFEATURE_MAX; nr++) {
                const char *name;
                u64 mask = BIT_ULL(nr);

                if (!cpu_has_xfeatures(mask, &name))
                        continue;

                pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
        }
}

/*
 * Warn when @nr is not a valid extended xfeature number, i.e. when it is
 * outside [FIRST_EXTENDED_XFEATURE, XFEATURE_MAX).
 *
 * This check is important because it is easy to confuse a feature number
 * (XFEATURE_*) with a feature mask (XFEATURE_MASK_*).
 *
 * The argument is parenthesized so that an expression can be passed
 * safely; note that it is still evaluated twice.
 */
#define CHECK_XFEATURE(nr) do {                                \
        WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);        \
        WARN_ON((nr) >= XFEATURE_MAX);                        \
} while (0)

/*
 * Log the offset and the cached size of each extended xstate component
 * that is set in fpu_kernel_cfg.max_features.
 */
static void __init print_xstate_offset_size(void)
{
        int nr;

        for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features) {
                unsigned int offset;

                offset = xfeature_get_offset(fpu_kernel_cfg.max_features, nr);
                pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
                        nr, offset, nr, xstate_sizes[nr]);
        }
}

/*
 * Restore @xstate into the registers with XRSTORS when XSAVES is enabled
 * and with XRSTOR otherwise.
 *
 * This function is called only during boot time when x86 caps are not set
 * up and alternative can not be used yet.
 */
static __init void os_xrstor_booting(struct xregs_state *xstate)
{
        u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);

        /*
         * Copying from a kernel buffer should never fault, and the FPU
         * state set up at boot time should be valid.
         */
        WARN_ON_FPU(err);
}

/*
 * Every supported feature either has an all zeros init state or is
 * handled individually in setup_init_fpu(). The list is spelled out
 * explicitly instead of using XFEATURE_MASK*SUPPORTED, so that a newly
 * added supported feature is caught at build time and people actually
 * look at the init state of the new feature.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED                                \
        (XFEATURE_MASK_FP | XFEATURE_MASK_SSE |                        \
         XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK |                \
         XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM |        \
         XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR |                \
         XFEATURE_MASK_PKRU | XFEATURE_MASK_PASID |                \
         XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL |        \
         XFEATURE_MASK_XTILE | XFEATURE_MASK_APX)

/*
 * Set up the xstate image in init_fpstate which represents the init
 * state. Does nothing beyond the build time check when XSAVE is not
 * available.
 */
static void __init setup_init_fpu_buf(void)
{
        struct xregs_state *init_xsave = &init_fpstate.regs.xsave;

        /* Fails the build when a supported feature is missing from the list. */
        BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
                      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
                     XFEATURES_INIT_FPSTATE_HANDLED);

        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return;

        print_xstate_features();

        xstate_init_xcomp_bv(init_xsave, init_fpstate.xfeatures);

        /* Init all the features state with header.xfeatures being 0x0 */
        os_xrstor_booting(init_xsave);

        /*
         * Every component is in init state now. Read the state back so
         * that init_fpstate holds all non-zero init state. That only
         * works with XSAVE, not with XSAVEOPT and XSAVEC/S, because those
         * apply the init optimization which skips writing data for
         * components in init state.
         *
         * XSAVE could be used, but the data would then have to be
         * reshuffled when XSAVEC/S is available since XSAVEC/S uses xstate
         * compaction. That is a pointless exercise: apart from the legacy
         * components (FP and SSE) most components have an all zeros init
         * state, and the legacy ones can be saved with FXSAVE into the
         * legacy area. When a new feature is added, make sure that its
         * init state is all zeroes or add the necessary handling here.
         */
        fxsave(&init_fpstate.regs.fxsave);
}

/*
 * Return the EAX value which CPUID reports for sub-leaf @xfeature_nr of
 * CPUID_LEAF_XSTATE. A warning is raised when @xfeature_nr is not an
 * extended xfeature number, but the query is performed nevertheless.
 */
int xfeature_size(int xfeature_nr)
{
        u32 size, ebx, ecx, edx;

        CHECK_XFEATURE(xfeature_nr);
        cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &size, &ebx, &ecx, &edx);

        return size;
}

/*
 * Validate an xstate header supplied by userspace (ptrace or sigreturn).
 *
 * Returns 0 when the header is acceptable and -EINVAL when
 *  - a feature outside of @fpstate->user_xfeatures is set (no unknown or
 *    supervisor features are allowed),
 *  - xcomp_bv is non-zero (userspace must use the uncompacted format), or
 *  - any reserved byte is non-zero.
 */
static int validate_user_xstate_header(const struct xstate_header *hdr,
                                       struct fpstate *fpstate)
{
        /*
         * If 'reserved' is shrunken to add a new field, make sure to validate
         * that new field here!
         */
        BUILD_BUG_ON(sizeof(hdr->reserved) != 48);

        if ((hdr->xfeatures & ~fpstate->user_xfeatures) ||
            hdr->xcomp_bv ||
            memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
                return -EINVAL;

        return 0;
}

/*
 * Dump the raw register values of the CPUID_LEAF_XSTATE sub-leaves to the
 * log. Only the first invocation prints; later calls return immediately.
 */
static void __init __xstate_dump_leaves(void)
{
        static int already_dumped;
        u32 eax, ebx, ecx, edx;
        int subleaf;

        if (already_dumped)
                return;
        already_dumped = 1;

        /*
         * Go a few leaves beyond the supported ones, just in case there
         * are some goodies up there.
         */
        for (subleaf = 0; subleaf < XFEATURE_MAX + 10; subleaf++) {
                cpuid_count(CPUID_LEAF_XSTATE, subleaf, &eax, &ebx, &ecx, &edx);
                pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
                        CPUID_LEAF_XSTATE, subleaf, eax, ebx, ecx, edx);
        }
}

/*
 * WARN_ONCE() with an "XSAVE consistency problem: " prefix. When the
 * condition @x triggers the warning, the CPUID xstate leaves are dumped
 * as well.
 */
#define XSTATE_WARN_ON(x, fmt, ...) do {                                        \
        if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__))        \
                __xstate_dump_leaves();                                                \
} while (0)

/*
 * Compare the size @sz reported for xfeature @nr with sizeof(@__struct).
 * A mismatch triggers a one-time warning and a dump of the CPUID xstate
 * leaves. The expression always evaluates to true, so a mismatch is
 * reported but never turned into a failure.
 *
 * @sz and @nr are parenthesized so that expressions can be passed safely;
 * note that @sz is evaluated twice when the warning fires.
 */
#define XCHECK_SZ(sz, nr, __struct) ({                                        \
        if (WARN_ONCE((sz) != sizeof(__struct),                                \
            "[%s]: struct is %zu bytes, cpu state %d bytes\n",                \
            xfeature_names[(nr)], sizeof(__struct), (sz))) {                \
                __xstate_dump_leaves();                                        \
        }                                                                \
        true;                                                                \
})


/**
 * check_xtile_data_against_struct - Check tile data state size.
 *
 * Calculate the state size by multiplying the single tile size which is
 * recorded in a C struct, and the number of tiles that the CPU informs.
 * Compare the provided size with the calculation.
 *
 * On any mismatch an error is logged and the CPUID xstate leaves are
 * dumped.
 *
 * @size:        The tile data state size
 *
 * Returns:        0 on success, -EINVAL on mismatch.
 */
static int __init check_xtile_data_against_struct(int size)
{
        u32 max_palid, palid, state_size;
        u32 eax, ebx, ecx, edx;
        u16 max_tile;

        /*
         * Check the maximum palette id:
         *   eax: the highest numbered palette subleaf.
         */
        cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);

        /*
         * Cross-check each tile size and find the maximum number of
         * supported tiles.
         */
        for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
                u16 tile_size, max;

                /*
                 * Check the tile size info:
                 *   eax[31:16]:  bytes per tile
                 *   ebx[31:16]:  the max names (or max number of tiles)
                 *
                 * ECX and EDX are not used, but each output register
                 * gets its own variable instead of aliasing one of them.
                 */
                cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &ecx, &edx);
                tile_size = eax >> 16;
                max = ebx >> 16;

                if (tile_size != sizeof(struct xtile_data)) {
                        pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
                               __stringify(XFEATURE_XTILE_DATA),
                               sizeof(struct xtile_data), tile_size);
                        __xstate_dump_leaves();
                        return -EINVAL;
                }

                if (max > max_tile)
                        max_tile = max;
        }

        state_size = sizeof(struct xtile_data) * max_tile;
        if (size != state_size) {
                pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
                       __stringify(XFEATURE_XTILE_DATA), state_size, size);
                __xstate_dump_leaves();
                return -EINVAL;
        }
        return 0;
}

/*
 * There is a C struct for each 'xstate'. Make sure that this software
 * representation agrees with the size the CPU reports for state @nr.
 *
 * Returns false only when no structure is known for @nr; in that case a
 * consistency warning is raised.
 */
static bool __init check_xstate_against_struct(int nr)
{
        /* Ask the CPU for the size of the state. */
        int sz = xfeature_size(nr);

        /* Match the CPU state with the corresponding software structure. */
        switch (nr) {
        case XFEATURE_YMM:
                return XCHECK_SZ(sz, nr, struct ymmh_struct);
        case XFEATURE_BNDREGS:
                return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
        case XFEATURE_BNDCSR:
                return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
        case XFEATURE_OPMASK:
                return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
        case XFEATURE_ZMM_Hi256:
                return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
        case XFEATURE_Hi16_ZMM:
                return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
        case XFEATURE_PKRU:
                return XCHECK_SZ(sz, nr, struct pkru_state);
        case XFEATURE_PASID:
                return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
        case XFEATURE_XTILE_CFG:
                return XCHECK_SZ(sz, nr, struct xtile_cfg);
        case XFEATURE_CET_USER:
                return XCHECK_SZ(sz, nr, struct cet_user_state);
        case XFEATURE_CET_KERNEL:
                return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
        case XFEATURE_APX:
                return XCHECK_SZ(sz, nr, struct apx_state);
        case XFEATURE_XTILE_DATA:
                /* The result of the tile data check is not acted upon. */
                check_xtile_data_against_struct(sz);
                return true;
        default:
                XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
                return false;
        }
}

/*
 * Calculate the size of an XSAVE buffer which holds the features in
 * @xfeatures, either in @compacted or in standard format.
 *
 * A mask without any feature beyond XFEATURE_SSE - including an empty
 * mask - yields sizeof(struct xregs_state). Otherwise the result is the
 * offset of the last component plus its cached size.
 */
static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
{
        unsigned int topmost, offset, i;

        /*
         * fls64(0) is 0, so an empty mask would wrap 'topmost' around and
         * index the offset/size tables out of bounds further down.
         */
        if (!xfeatures)
                return sizeof(struct xregs_state);

        topmost = fls64(xfeatures) - 1;
        if (topmost <= XFEATURE_SSE)
                return sizeof(struct xregs_state);

        if (compacted) {
                offset = xfeature_get_offset(xfeatures, topmost);
        } else {
                /* Walk through the xfeature order to pick the last */
                for_each_extended_xfeature_in_order(i, xfeatures)
                        topmost = xfeature_uncompact_order[i];
                offset = xstate_offsets[topmost];
        }

        return offset + xstate_sizes[topmost];
}

/*
 * Double-check what the CPU told us about how large the XSAVE buffer
 * needs to be, by recalculating the size from the cached layout.
 *
 * Independent XSAVE features allocate their own buffers and are not
 * covered by these checks. Only the size of the buffer for task->fpu
 * is checked here.
 *
 * Returns true when every enabled extended feature passes its structure
 * check, no supervisor feature shows up without XSAVES, and the
 * recalculated size equals @kernel_size.
 */
static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
        unsigned int calculated;
        int nr;

        for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features) {
                if (!check_xstate_against_struct(nr))
                        return false;

                /* Supervisor state components can be managed only by XSAVES. */
                if (xfeature_is_supervisor(nr) && !xsaves) {
                        XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", nr);
                        return false;
                }
        }

        calculated = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
        XSTATE_WARN_ON(calculated != kernel_size,
                       "size %u != kernel_size %u\n", calculated, kernel_size);

        return calculated == kernel_size;
}

/*
 * Get total size of enabled xstates in XCR0 | IA32_XSS.
 *
 * Mind the SDM's wording: "sub-function 0" only enumerates the size of
 * the *user* states. A buffer sized that way could overflow when
 * 'XSAVES' is used on it, because 'XSAVES' saves system states too.
 *
 * Compaction is taken into account as well, so this also works for
 * XSAVEC.
 */
static unsigned int __init get_compacted_size(void)
{
        unsigned int eax, size, ecx, edx;

        /*
         * - CPUID function 0DH, sub-function 1:
         *    EBX enumerates the size (in bytes) required by
         *    the XSAVES instruction for an XSAVE area
         *    containing all the state components
         *    corresponding to bits currently set in
         *    XCR0 | IA32_XSS.
         *
         * If XSAVEC is available without XSAVES (virt), there are no
         * supervisor states, yet XSAVEC still uses the compacted format.
         */
        cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &size, &ecx, &edx);

        return size;
}

/*
 * Get the total size of the enabled xstates, leaving out the independent
 * supervisor features.
 */
static unsigned int __init get_xsave_compacted_size(void)
{
        u64 independent = xfeatures_mask_independent();
        unsigned int size;

        if (independent) {
                /* Take the independent features out of IA32_XSS for the query. */
                wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor());

                /*
                 * The hardware now reports the size which the task->fpu
                 * buffer requires.
                 */
                size = get_compacted_size();

                /* Put them back so XSAVES will work on them again. */
                wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | independent);
        } else {
                size = get_compacted_size();
        }

        return size;
}

/* Return the EBX value of CPUID_LEAF_XSTATE sub-leaf 0, see below. */
static unsigned int __init get_xsave_size_user(void)
{
        unsigned int eax, size, ecx, edx;

        /*
         * - CPUID function 0DH, sub-function 0:
         *    EBX enumerates the size (in bytes) required by
         *    the XSAVE instruction for an XSAVE area
         *    containing all the *user* state components
         *    corresponding to bits currently set in XCR0.
         */
        cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &size, &ecx, &edx);

        return size;
}

/*
 * Recompute the context sizes for the enabled features and store them in
 * fpu_kernel_cfg, fpu_user_cfg and guest_default_cfg.
 *
 * Returns 0 on success. When the paranoid size validation fails, -EINVAL
 * is returned and none of the configuration sizes is written.
 */
static int __init init_xstate_size(void)
{
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        unsigned int user_size, kernel_size, kernel_default_size;

        /* Uncompacted user space size */
        user_size = get_xsave_size_user();

        /*
         * The XSAVES kernel size covers supervisor states and uses the
         * compacted format. XSAVEC uses the compacted format as well, but
         * does not save supervisor states.
         *
         * XSAVE[OPT] do not support supervisor states, so kernel and user
         * size are identical in that case.
         */
        kernel_size = compacted ? get_xsave_compacted_size() : user_size;

        kernel_default_size =
                xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);

        if (!paranoid_xstate_size_valid(kernel_size))
                return -EINVAL;

        fpu_kernel_cfg.max_size = kernel_size;
        fpu_kernel_cfg.default_size = kernel_default_size;

        fpu_user_cfg.max_size = user_size;
        /* The user space default size is calculated for the standard format. */
        fpu_user_cfg.default_size =
                xstate_calculate_size(fpu_user_cfg.default_features, false);

        guest_default_cfg.size =
                xstate_calculate_size(guest_default_cfg.features, compacted);

        return 0;
}

/*
 * We enabled the XSAVE hardware, but something went wrong and
 * we can not use it.  Disable it.
 *
 * Clears the enabled feature mask, CR4.OSXSAVE and the XSAVE CPU
 * capability, and sets every kernel, user and guest buffer size back to
 * @legacy_size.
 *
 * @legacy_size:        The size to restore into all size fields
 */
static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
        pr_info("x86/fpu: XSAVE disabled\n");

        /* No xfeatures are enabled anymore. */
        fpu_kernel_cfg.max_features = 0;
        cr4_clear_bits(X86_CR4_OSXSAVE);
        setup_clear_cpu_cap(X86_FEATURE_XSAVE);

        /* Restore the legacy size.*/
        fpu_kernel_cfg.max_size = legacy_size;
        fpu_kernel_cfg.default_size = legacy_size;
        fpu_user_cfg.max_size = legacy_size;
        fpu_user_cfg.default_size = legacy_size;
        guest_default_cfg.size = legacy_size;

        /*
         * Prevent enabling the static branch which enables writes to the
         * XFD MSR.
         */
        init_fpstate.xfd = 0;

        /* Done last, after the configuration above has been reset. */
        fpstate_reset(x86_task_fpu(current));
}

/*
 * Mask which is applied to a maximum feature set in order to obtain the
 * host default feature set.
 */
static u64 __init host_default_mask(void)
{
        /*
         * Dynamic features (require userspace opt-in) and features that
         * are supported only for KVM guests are not part of the default.
         */
        u64 excluded = (u64)XFEATURE_MASK_USER_DYNAMIC |
                       XFEATURE_MASK_GUEST_SUPERVISOR;

        return ~excluded;
}

/*
 * Mask which is applied to a maximum feature set in order to obtain the
 * guest default feature set.
 */
static u64 __init guest_default_mask(void)
{
        /*
         * Dynamic features are not part of the default: they require
         * userspace opt-in even for KVM guests.
         */
        u64 excluded = XFEATURE_MASK_USER_DYNAMIC;

        return ~excluded;
}

/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 *
 * Enumerates the xstate features via CPUID, filters them down to the
 * set the kernel uses, derives the default feature sets and buffer
 * sizes and sets up init_fpstate. Any inconsistency found on the way
 * ends in fpu__init_disable_system_xstate().
 *
 * @legacy_size:        The buffer size which is restored when XSAVE has
 *                        to be disabled
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
        unsigned int eax, ebx, ecx, edx;
        u64 xfeatures;
        int err;
        int i;

        if (!boot_cpu_has(X86_FEATURE_FPU)) {
                pr_info("x86/fpu: No FPU detected\n");
                return;
        }

        if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
                pr_info("x86/fpu: x87 FPU will use %s\n",
                        boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
                return;
        }

        /*
         * Find user xstates supported by the processor.
         * EDX:EAX of sub-leaf 0 form the 64-bit feature mask.
         */
        cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
        fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

        /*
         * Find supervisor xstates supported by the processor.
         * EDX:ECX of sub-leaf 1 are merged into the same mask.
         */
        cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
        fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

        if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
                /*
                 * This indicates that something really unexpected happened
                 * with the enumeration.  Disable XSAVE and try to continue
                 * booting without it.  This is too early to BUG().
                 */
                pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
                       fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX &&
            fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) {
                /*
                 * This is a problematic CPU configuration where two
                 * conflicting state components are both enumerated.
                 */
                pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n",
                       fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        /* Taken from the enumerated mask, before the filtering below. */
        fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
                                              XFEATURE_MASK_INDEPENDENT;

        /*
         * Clear XSAVE features that are disabled in the normal CPUID.
         * The index into xsave_cpuid_features[] is the xfeature number.
         */
        for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
                unsigned short cid = xsave_cpuid_features[i];

                /* Careful: X86_FEATURE_FPU is 0! */
                if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
                        fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
        }

        /* Without XFD no dynamic user features are offered. */
        if (!cpu_feature_enabled(X86_FEATURE_XFD))
                fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        /* Supervisor features are kept only when XSAVES is enabled. */
        if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
                fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
        else
                fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
                                        XFEATURE_MASK_SUPERVISOR_SUPPORTED;

        /* The user configuration is the user-supported subset of the kernel one. */
        fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
        fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

        /*
         * Now, given maximum feature set, determine default values by
         * applying default masks.
         */
        fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask();
        fpu_user_cfg.default_features   = fpu_user_cfg.max_features & host_default_mask();
        guest_default_cfg.features      = fpu_kernel_cfg.max_features & guest_default_mask();

        /* Store it for paranoia check at the end */
        xfeatures = fpu_kernel_cfg.max_features;

        /*
         * Initialize the default XFD state in init_fpstate and enable the
         * dynamic sizing mechanism if dynamic states are available.  The
         * static key cannot be enabled here because this runs before
         * jump_label_init(). This is delayed to an initcall.
         */
        init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

        /* Set up compaction feature bit */
        if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
            cpu_feature_enabled(X86_FEATURE_XSAVES))
                setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

        /* Enable xstate instructions to be able to continue with initialization: */
        fpu__init_cpu_xstate();

        /* Cache size, offset and flags for initialization */
        setup_xstate_cache();

        /* Fails with an error code when the size validation does not pass. */
        err = init_xstate_size();
        if (err)
                goto out_disable;

        /*
         * Update info used for ptrace frames; use standard-format size and no
         * supervisor xstates:
         */
        update_regset_xstate_info(fpu_user_cfg.max_size,
                                  fpu_user_cfg.max_features);

        /*
         * init_fpstate excludes dynamic states as they are large but init
         * state is zero.
         */
        init_fpstate.size                = fpu_kernel_cfg.default_size;
        init_fpstate.xfeatures                = fpu_kernel_cfg.default_features;

        /* The default state has to fit into the init_fpstate register buffer. */
        if (init_fpstate.size > sizeof(init_fpstate.regs)) {
                pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n",
                        sizeof(init_fpstate.regs), init_fpstate.size);
                goto out_disable;
        }

        setup_init_fpu_buf();

        /*
         * Paranoia check whether something in the setup modified the
         * xfeatures mask.
         */
        if (xfeatures != fpu_kernel_cfg.max_features) {
                pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n",
                       xfeatures, fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        /*
         * CPU capabilities initialization runs before FPU init. So
         * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
         * functional, set the feature bit so depending code works.
         */
        setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

        print_xstate_offset_size();
        pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
                fpu_kernel_cfg.max_features,
                fpu_kernel_cfg.max_size,
                boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
        return;

out_disable:
        /* something went wrong, try to boot without any XSAVE support */
        fpu__init_disable_system_xstate(legacy_size);
}

/*
 * Restore minimal FPU state after suspend:
 */
void fpu__resume_cpu(void)
{
        /*
         * Restore XCR0 on xsave capable CPUs:
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVE))
                xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

        /*
         * Restore IA32_XSS. The same CPUID bit enumerates support
         * of XSAVES and MSR_IA32_XSS.
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
                wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
                                     xfeatures_mask_independent());
        }

        if (fpu_state_size_dynamic())
                wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd);
}

/*
 * Calculate where the state of xstate feature @xfeature_nr is located in
 * the buffer @xsave. Callers should ensure that the buffer is valid.
 *
 * Warns once and returns NULL when the feature is not enabled, or when
 * the compacted format is in use and the feature is not set in the
 * buffer's xcomp_bv.
 */
static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
        u64 xcomp_bv = xsave->header.xcomp_bv;
        unsigned int offset;

        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED) &&
            WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
                return NULL;

        offset = xfeature_get_offset(xcomp_bv, xfeature_nr);

        return (void *)xsave + offset;
}

/*
 * Look up the address of a state component inside an xsave area.
 *
 * This is the API to obtain an xstate address, no matter whether the
 * xsave area is in standard format or in compacted format.
 *
 * Inputs:
 *        xsave: the xsave area which holds the FPU data
 *        xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
 *        XFEATURE_SSE, etc...)
 * Output:
 *        address of the state in the xsave area, or NULL if the CPU has
 *        no XSAVE, the feature is not enabled, or the field is not
 *        present in the xsave buffer.
 */
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
        /* Without XSAVE there is no xsave state at all. */
        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return NULL;

        /* Features which have not been enabled should never be requested. */
        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        /*
         * The assumption is that the last 'xsave*' instruction requested
         * 'xfeature_nr' to be saved. Otherwise the buffer might hold an
         * old value of the field.
         *
         * The bit can be clear because the last 'xsave' did not request
         * this feature to be saved (unlikely), or because the "init
         * optimization" caused it to not be saved.
         */
        if ((xsave->header.xfeatures & BIT_ULL(xfeature_nr)) == 0)
                return NULL;

        return __raw_xsave_addr(xsave, xfeature_nr);
}
EXPORT_SYMBOL_GPL(get_xsave_addr);

/*
 * Calculate where the state of xstate feature @xfeature_nr is located in
 * the user space buffer @xsave. The buffer is expected to be in standard
 * format, not compacted (e.g. user mode signal frames), hence the cached
 * fixed offset is used.
 *
 * Warns once and returns NULL when the feature is not enabled.
 */
void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
{
        unsigned int offset;

        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        offset = xstate_offsets[xfeature_nr];

        return (void __user *)xsave + offset;
}

#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * Modify the PKRU register so that the access rights of @pkey become
 * @init_val (a combination of PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE).
 * @tsk is not used.
 *
 * Returns 0 on success and -EINVAL when OSPKE is not enabled or @pkey is
 * not below arch_max_pkey().
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                              unsigned long init_val)
{
        u32 pkru, new_bits = 0;
        int shift;

        /*
         * OSPKE implies XSAVE support: it only gets set if XSAVE is
         * enabled and PKU is enabled in XCR0.
         */
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return -EINVAL;

        /*
         * Only valid 'pkey' values originating from in-kernel users are
         * expected here. Complain if a bad value is observed.
         */
        if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
                return -EINVAL;

        /* Collect the requested restrictions ... */
        if (init_val & PKEY_DISABLE_ACCESS)
                new_bits |= PKRU_AD_BIT;
        if (init_val & PKEY_DISABLE_WRITE)
                new_bits |= PKRU_WD_BIT;

        /* ... and move them to the position of @pkey in PKRU. */
        shift = pkey * PKRU_BITS_PER_PKEY;
        new_bits <<= shift;

        /* Read PKRU and drop the bits currently in place for @pkey. */
        pkru = read_pkru();
        pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << shift);

        /* Write the untouched part back along with the new bits. */
        write_pkru(pkru | new_bits);

        return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */

static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
                         void *init_xstate, unsigned int size)
{
        /* Default to the init state image unless the caller selects @xstate */
        void *src = init_xstate;

        if (from_xstate)
                src = xstate;

        membuf_write(to, src, size);
}

/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:                membuf descriptor (passed by value)
 * @fpstate:        The fpstate buffer from which to copy
 * @xfeatures:        The mask of xfeatures to save (XSAVE mode only)
 * @pkru_val:        The PKRU value to store in the PKRU component
 * @copy_mode:        The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * For copy modes other than XSTATE_COPY_XSAVE only the legacy FP/SSE area
 * is written. In all modes whatever space is left in @to afterwards is
 * zero filled.
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
                               u64 xfeatures, u32 pkru_val,
                               enum xstate_copy_mode copy_mode)
{
        const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
        struct xregs_state *xinit = &init_fpstate.regs.xsave;
        struct xregs_state *xsave = &fpstate->regs.xsave;
        unsigned int zerofrom, i, xfeature;
        struct xstate_header header;
        u64 mask;

        /*
         * Build a private header: everything but xfeatures stays zero, and
         * xfeatures starts out as what the kernel buffer claims to hold.
         */
        memset(&header, 0, sizeof(header));
        header.xfeatures = xsave->header.xfeatures;

        /* Mask out the feature bits depending on copy mode */
        switch (copy_mode) {
        case XSTATE_COPY_FP:
                header.xfeatures &= XFEATURE_MASK_FP;
                break;

        case XSTATE_COPY_FX:
                header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
                break;

        case XSTATE_COPY_XSAVE:
                header.xfeatures &= fpstate->user_xfeatures & xfeatures;
                break;
        }

        /*
         * The legacy area is emitted piecewise. Each piece comes from
         * @fpstate when the governing feature bit is set in
         * header.xfeatures and from init_fpstate otherwise.
         */

        /* Copy FP state up to MXCSR */
        copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
                     &xinit->i387, off_mxcsr);

        /* Copy MXCSR when SSE or YMM are set in the feature mask */
        copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
                     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
                     MXCSR_AND_FLAGS_SIZE);

        /* Copy the remaining FP state */
        copy_feature(header.xfeatures & XFEATURE_MASK_FP,
                     &to, &xsave->i387.st_space, &xinit->i387.st_space,
                     sizeof(xsave->i387.st_space));

        /* Copy the SSE state - shared with YMM, but independently managed */
        copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
                     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
                     sizeof(xsave->i387.xmm_space));

        /* FP and FX modes end after the legacy area */
        if (copy_mode != XSTATE_COPY_XSAVE)
                goto out;

        /* Zero the padding area */
        membuf_zero(&to, sizeof(xsave->i387.padding));

        /* Copy xsave->i387.sw_reserved */
        membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

        /* Copy the user space relevant state of @xsave->header */
        membuf_write(&to, &header, sizeof(header));

        /* Gap tracking starts right behind the xstate header */
        zerofrom = offsetof(struct xregs_state, extended_state_area);

        /*
         * This 'mask' indicates which states to copy from fpstate. It is
         * the already filtered header.xfeatures, so extended states whose
         * bit is clear there are not copied at all:
         *
         * In non-compacted format such features still occupy state space
         * in the destination. That space is zeroed by the gap tracking
         * below, or by the final zero fill at 'out' when no later feature
         * follows.
         */
        mask = header.xfeatures;

        for_each_extended_xfeature_in_order(i, mask) {
                xfeature = xfeature_uncompact_order[i];
                /*
                 * If there was a feature or alignment gap, zero the space
                 * in the destination buffer.
                 */
                if (zerofrom < xstate_offsets[xfeature])
                        membuf_zero(&to, xstate_offsets[xfeature] - zerofrom);

                if (xfeature == XFEATURE_PKRU) {
                        struct pkru_state pkru = {0};
                        /*
                         * PKRU is not necessarily up to date in the
                         * XSAVE buffer. Use the provided value.
                         */
                        pkru.pkru = pkru_val;
                        membuf_write(&to, &pkru, sizeof(pkru));
                } else {
                        membuf_write(&to,
                                     __raw_xsave_addr(xsave, xfeature),
                                     xstate_sizes[xfeature]);
                }
                /*
                 * Keep track of the last copied state in the non-compacted
                 * target buffer for gap zeroing.
                 */
                zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature];
        }

out:
        /* Zero fill whatever the caller's buffer has left */
        if (to.left)
                membuf_zero(&to, to.left);
}

/**
 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:                membuf descriptor
 * @tsk:        The task from which to copy the saved xstate
 * @copy_mode:        The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
                             enum xstate_copy_mode copy_mode)
{
        __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate,
                                  x86_task_fpu(tsk)->fpstate->user_xfeatures,
                                  tsk->thread.pkru, copy_mode);
}

static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
                            const void *kbuf, const void __user *ubuf)
{
        /* A kernel buffer, when supplied, takes precedence over @ubuf. */
        if (kbuf) {
                memcpy(dst, kbuf + offset, size);
                return 0;
        }

        /* Otherwise fetch from user space; any fault is reported as -EFAULT. */
        return copy_from_user(dst, ubuf + offset, size) ? -EFAULT : 0;
}


/**
 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
 * @fpstate:        The fpstate buffer to copy to
 * @kbuf:        The UABI format buffer, if it comes from the kernel
 * @ubuf:        The UABI format buffer, if it comes from userspace
 * @pkru:        The location to write the PKRU value to
 *
 * Converts from the UABI format into the kernel internal hardware
 * dependent format.
 *
 * This function ultimately has three different callers with distinct PKRU
 * behavior.
 * 1.        When called from sigreturn the PKRU register will be restored from
 *        @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
 *        @fpstate is sufficient to cover this case, but the caller will also
 *        pass a pointer to the thread_struct's pkru field in @pkru and updating
 *        it is harmless.
 * 2.        When called from ptrace the PKRU register will be restored from the
 *        thread_struct's pkru field. A pointer to that is passed in @pkru.
 *        The kernel will restore it manually, so the XRSTOR behavior that resets
 *        the PKRU register to the hardware init value (0) if the corresponding
 *        xfeatures bit is not set is emulated here.
 * 3.        When called from KVM the PKRU register will be restored from the vcpu's
 *        pkru field. A pointer to that is passed in @pkru. KVM hasn't used
 *        XRSTOR and hasn't had the PKRU resetting behavior described above. To
 *        preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
 *        bit is not set.
 */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
                               const void __user *ubuf, u32 *pkru)
{
        struct xregs_state *xsave = &fpstate->regs.xsave;
        unsigned int offset, size;
        struct xstate_header hdr;
        u64 mask;
        int i;

        /* Fetch the xstate header first; it decides what gets copied. */
        offset = offsetof(struct xregs_state, header);
        if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
                return -EFAULT;

        if (validate_user_xstate_header(&hdr, fpstate))
                return -EINVAL;

        /* Validate MXCSR when any of the related features is in use */
        mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
        if (hdr.xfeatures & mask) {
                /* mxcsr[0] is MXCSR, mxcsr[1] is MXCSR_MASK */
                u32 mxcsr[2];

                offset = offsetof(struct fxregs_state, mxcsr);
                if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
                        return -EFAULT;

                /* Reserved bits in MXCSR must be zero. */
                if (mxcsr[0] & ~mxcsr_feature_mask)
                        return -EINVAL;

                /* SSE and YMM require MXCSR even when FP is not in use. */
                if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
                        xsave->i387.mxcsr = mxcsr[0];
                        xsave->i387.mxcsr_mask = mxcsr[1];
                }
        }

        /*
         * Copy every component flagged in the incoming header from its
         * xstate_offsets[] position in the source buffer to wherever
         * __raw_xsave_addr() places it in the kernel buffer.
         *
         * A failure in the middle of the loop returns immediately:
         * components copied so far stay in @fpstate and the header update
         * at the end of the function is not reached.
         */
        for (i = 0; i < XFEATURE_MAX; i++) {
                mask = BIT_ULL(i);

                if (hdr.xfeatures & mask) {
                        void *dst = __raw_xsave_addr(xsave, i);

                        offset = xstate_offsets[i];
                        size = xstate_sizes[i];

                        if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
                                return -EFAULT;
                }
        }

        if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
                struct pkru_state *xpkru;

                /*
                 * @pkru is dereferenced unconditionally here, so callers
                 * may only pass NULL when the PKRU bit is clear in the
                 * incoming header.
                 */
                xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
                *pkru = xpkru->pkru;
        } else {
                /*
                 * KVM may pass NULL here to indicate that it does not need
                 * PKRU updated.
                 */
                if (pkru)
                        *pkru = 0;
        }

        /*
         * The state that came in from userspace was user-state only.
         * Mask all the user states out of 'xfeatures':
         */
        xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;

        /*
         * Add back in the features that came in from userspace:
         */
        xsave->header.xfeatures |= hdr.xfeatures;

        return 0;
}

/*
 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
 * format and copy to the target thread. Used by ptrace and KVM.
 */
int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
        /* Kernel buffer variant: the user space pointer argument is NULL. */
        return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}

/*
 * Convert from a sigreturn standard-format user-space buffer to kernel
 * XSAVE[S] format and copy to the target thread. This is called from the
 * sigreturn() and rt_sigreturn() system calls.
 */
int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
                                      const void __user *ubuf)
{
        struct fpstate *fps = x86_task_fpu(tsk)->fpstate;

        /*
         * User space buffer variant: no kernel buffer is supplied and
         * the PKRU value is stored in the thread struct of @tsk.
         */
        return copy_uabi_to_xstate(fps, NULL, ubuf, &tsk->thread.pkru);
}

static bool validate_independent_components(u64 mask)
{
        u64 not_independent;

        /* Independent components are only handled with XSAVES enabled */
        if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
                return false;

        not_independent = ~xfeatures_mask_independent();

        /*
         * Valid masks are non-empty and contain nothing outside of the
         * independent feature set. Warn once about anything else.
         */
        return !WARN_ON_ONCE(!mask || (mask & not_independent));
}

/**
 * xsaves - Save selected components to a kernel xstate buffer
 * @xstate:        Pointer to the buffer
 * @mask:        Feature mask to select the components to save
 *
 * The @xstate buffer must be 64 byte aligned and correctly initialized as
 * XSAVES does not write the full xstate header. Before first use the
 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
 * can #GP.
 *
 * The feature mask must be a subset of the independent features.
 */
void xsaves(struct xregs_state *xstate, u64 mask)
{
        u32 lmask, hmask;
        int err;

        /* Anything but a valid independent feature mask is ignored */
        if (!validate_independent_components(mask))
                return;

        /* The instruction wrapper takes the mask as two 32-bit halves */
        lmask = (u32)mask;
        hmask = (u32)(mask >> 32);

        XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
        WARN_ON_ONCE(err);
}

/**
 * xrstors - Restore selected components from a kernel xstate buffer
 * @xstate:        Pointer to the buffer
 * @mask:        Feature mask to select the components to restore
 *
 * The @xstate buffer must be 64 byte aligned and correctly initialized
 * otherwise XRSTORS from that buffer can #GP.
 *
 * Proper usage is to restore the state which was saved with
 * xsaves() into @xstate.
 *
 * The feature mask must be a subset of the independent features.
 */
void xrstors(struct xregs_state *xstate, u64 mask)
{
        u32 lmask, hmask;
        int err;

        /* Anything but a valid independent feature mask is ignored */
        if (!validate_independent_components(mask))
                return;

        /* The instruction wrapper takes the mask as two 32-bit halves */
        lmask = (u32)mask;
        hmask = (u32)(mask >> 32);

        XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
        WARN_ON_ONCE(err);
}

#if IS_ENABLED(CONFIG_KVM)
void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature)
{
        void *component = get_xsave_addr(&fpstate->regs.xsave, xfeature);

        /* get_xsave_addr() returned NULL: there is nothing to clear */
        if (!component)
                return;

        memset(component, 0, xstate_sizes[xfeature]);
}
EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
#endif

#ifdef CONFIG_X86_64

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
 * can safely operate on the @fpstate buffer.
 */
static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
{
        u64 cpu_xfd = __this_cpu_read(xfd_state);

        /* fpstate->xfd agrees with this CPU's xfd_state: always fine */
        if (fpstate->xfd == cpu_xfd)
                return true;

        /*
         * From here on fpstate->xfd differs from the per-CPU xfd_state.
         * That is invalid when @fpstate carries the same XFD value as
         * current's fpstate.
         */
        if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd)
                return false;

        /*
         * XRSTOR(S) from init_fpstate are always correct as it will just
         * bring all components into init state and not read from the
         * buffer. XSAVE(S) raises #PF after init.
         */
        if (fpstate == &init_fpstate)
                return rstor;

        /*
         * Remaining users:
         * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
         * XRSTORS(S): fpu_swap_kvm_fpstate()
         *
         * Two groups of components are harmless and dropped from @mask:
         *
         *  - XFD-disabled components, as no XSAVE/XRSTOR instruction
         *    (except XSAVE itself) touches their buffer area.
         *
         *  - Components which are valid in @fpstate, as they have space
         *    allocated in fpstate.
         *
         * Anything left over might be written by XSAVE/XRSTOR, so
         * validation fails if any such bit is found.
         */
        return !(mask & ~cpu_xfd & ~fpstate->xfeatures);
}

void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
{
        bool valid = xstate_op_valid(fpstate, mask, rstor);

        /* Debug aid only: warn once, but do not alter any state */
        WARN_ON_ONCE(!valid);
}
#endif /* CONFIG_X86_DEBUG_FPU */

static int __init xfd_update_static_branch(void)
{
        /* No XFD bits in init_fpstate: no dynamic features, nothing to do */
        if (!init_fpstate.xfd)
                return 0;

        /* Dynamic features are available, so dynamic sizing must be enabled */
        static_branch_enable(&__fpu_state_size_dynamic);
        return 0;
}
arch_initcall(xfd_update_static_branch)

void fpstate_free(struct fpu *fpu)
{
        struct fpstate *fps = fpu->fpstate;

        /* Nothing attached, or the fpstate embedded in @fpu itself */
        if (!fps || fps == &fpu->__fpstate)
                return;

        vfree(fps);
}

/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures:        A bitmap of xstate features which extend the enabled features
 *                of that task
 * @ksize:        The required size for the kernel buffer
 * @usize:        The required size for user space buffers
 * @guest_fpu:        Pointer to a guest FPU container. NULL for host allocations
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
 */
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
                           unsigned int usize, struct fpu_guest *guest_fpu)
{
        struct fpu *fpu = x86_task_fpu(current);
        struct fpstate *curfps, *newfps = NULL;
        unsigned int fpsize;
        bool in_use;

        /*
         * Total allocation: the register buffer (@ksize) plus everything
         * in front of it, i.e. the offset of 'regs' in struct fpstate
         * rounded up to 64 bytes.
         */
        fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

        /* Allocate before taking fpregs_lock(); failure changes nothing */
        newfps = vzalloc(fpsize);
        if (!newfps)
                return -ENOMEM;
        newfps->size = ksize;
        newfps->user_size = usize;
        newfps->is_valloc = true;

        /*
         * When a guest FPU is supplied, use @guest_fpu->fpstate
         * as reference independent whether it is in use or not.
         */
        curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;

        /* Determine whether @curfps is the active fpstate */
        in_use = fpu->fpstate == curfps;

        if (guest_fpu) {
                /* Carry the guest attributes over and widen the guest's view */
                newfps->is_guest = true;
                newfps->is_confidential = curfps->is_confidential;
                newfps->in_use = curfps->in_use;
                guest_fpu->xfeatures |= xfeatures;
                guest_fpu->uabi_size = usize;
        }

        fpregs_lock();
        /*
         * If @curfps is in use, ensure that the current state is in the
         * registers before swapping fpstate as that might invalidate it
         * due to layout changes.
         */
        if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();

        /* The new features become enabled and are no longer XFD-disabled */
        newfps->xfeatures = curfps->xfeatures | xfeatures;
        newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
        newfps->xfd = curfps->xfd & ~xfeatures;

        /* Do the final updates within the locked region */
        xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);

        if (guest_fpu) {
                guest_fpu->fpstate = newfps;
                /* If curfps is active, update the FPU fpstate pointer */
                if (in_use)
                        fpu->fpstate = newfps;
        } else {
                fpu->fpstate = newfps;
        }

        if (in_use)
                xfd_update_state(fpu->fpstate);
        fpregs_unlock();

        /*
         * Only free valloc'ed state. This happens after the lock has been
         * dropped and after every pointer was switched over to @newfps.
         */
        if (curfps && curfps->is_valloc)
                vfree(curfps);

        return 0;
}

static int validate_sigaltstack(unsigned int usize)
{
        struct task_struct *leader = current->group_leader;
        struct task_struct *t;
        unsigned long needed;

        lockdep_assert_held(&current->sighand->siglock);

        /*
         * get_sigframe_size() is based on fpu_user_cfg.max_size. Swap
         * that part for the requested user state size.
         */
        needed = get_sigframe_size();
        needed -= fpu_user_cfg.max_size;
        needed += usize;

        /* Every thread with an alternate stack must be able to hold it */
        for_each_thread(leader, t) {
                if (!t->sas_ss_size)
                        continue;
                if (t->sas_ss_size < needed)
                        return -ENOSPC;
        }

        return 0;
}

static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
        /*
         * This deliberately does not exclude !XSAVES as we still might
         * decide to optionally context switch XCR0 or talk the silicon
         * vendors into extending XFD for the pre AMX states, especially
         * AVX512.
         */
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        struct fpu *leader_fpu = x86_task_fpu(current->group_leader);
        u64 new_perm = permitted | requested;
        struct fpu_state_perm *perm;
        unsigned int ksize, usize;
        int err;

        /* All requested features are permitted already: nothing to do */
        if (new_perm == permitted)
                return 0;

        /*
         * Size of the resulting kernel state.  Note, @permitted also
         * contains supervisor xfeatures even though supervisor are always
         * permitted for kernel and guest FPUs, and never permitted for user
         * FPUs.
         */
        ksize = xstate_calculate_size(new_perm, compacted);

        /*
         * Size of the resulting user state. Only a local copy is masked,
         * so the supervisor xfeatures in the new mask stay intact.
         */
        usize = xstate_calculate_size(new_perm & XFEATURE_MASK_USER_SUPPORTED, false);

        if (guest) {
                perm = &leader_fpu->guest_perm;
        } else {
                /* Host permissions depend on large enough signal stacks */
                err = validate_sigaltstack(usize);
                if (err)
                        return err;
                perm = &leader_fpu->perm;
        }

        /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
        WRITE_ONCE(perm->__state_perm, new_perm);
        /* Protected by sighand lock */
        perm->__state_size = ksize;
        perm->__user_state_size = usize;

        return 0;
}

/*
 * Permissions array to map facilities with more than one component
 */
/*
 * Indexed by the facility number passed to xstate_request_perm(); the value
 * is the xfeature mask requested for that facility. Entries left at zero
 * are answered with -EOPNOTSUPP there.
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
        [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};

static int xstate_request_perm(unsigned long idx, bool guest)
{
        u64 granted, wanted;
        int ret;

        if (idx >= XFEATURE_MAX)
                return -EINVAL;

        /*
         * Translate the facility number into its feature mask. A
         * facility can require more than one xstate component.
         */
        idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
        wanted = xstate_prctl_req[idx];

        /* Facility without a mask, or not fully supported on this system */
        if (!wanted || (fpu_user_cfg.max_features & wanted) != wanted)
                return -EOPNOTSUPP;

        /* Lockless quick check */
        granted = xstate_get_group_perm(guest);
        if ((granted & wanted) == wanted)
                return 0;

        /* Protect against concurrent modifications */
        spin_lock_irq(&current->sighand->siglock);

        /* Re-read under the lock, the lockless snapshot may be stale */
        granted = xstate_get_group_perm(guest);

        /* First vCPU allocation locks the permissions. */
        if (guest && (granted & FPU_GUEST_PERM_LOCKED))
                ret = -EBUSY;
        else
                ret = __xstate_request_perm(granted, wanted, guest);

        spin_unlock_irq(&current->sighand->siglock);

        return ret;
}

int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
{
        u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
        struct fpu_state_perm *perm;
        unsigned int ksize, usize;
        struct fpu *leader_fpu;
        u64 permitted;

        /* No dynamic user feature involved; only complain for the host */
        if (!xfd_event) {
                if (!guest_fpu)
                        pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
                return 0;
        }

        /* Protect against concurrent modifications */
        spin_lock_irq(&current->sighand->siglock);

        /* If not permitted let it die */
        permitted = xstate_get_group_perm(!!guest_fpu);
        if ((permitted & xfd_event) != xfd_event) {
                spin_unlock_irq(&current->sighand->siglock);
                return -EPERM;
        }

        /* Permissions and sizes are kept in the group leader's FPU */
        leader_fpu = x86_task_fpu(current->group_leader);
        if (guest_fpu)
                perm = &leader_fpu->guest_perm;
        else
                perm = &leader_fpu->perm;

        ksize = perm->__state_size;
        usize = perm->__user_state_size;

        /*
         * The feature is permitted. State size is sufficient.  Dropping
         * the lock is safe here even if more features are added from
         * another task, the retrieved buffer sizes are valid for the
         * currently requested feature(s).
         */
        spin_unlock_irq(&current->sighand->siglock);

        /*
         * Try to allocate a new fpstate. If that fails there is no way
         * out.
         */
        return fpstate_realloc(xfd_event, ksize, usize, guest_fpu) ? -EFAULT : 0;
}

int xfd_enable_feature(u64 xfd_err)
{
        /* Variant without a guest FPU container (NULL @guest_fpu). */
        return __xfd_enable_feature(xfd_err, NULL);
}

#else /* CONFIG_X86_64 */
/* Stub for builds without CONFIG_X86_64: every request is refused. */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
        return -EPERM;
}
#endif  /* !CONFIG_X86_64 */

/* Exported wrapper: xstate_get_group_perm() with the guest flag set. */
u64 xstate_get_guest_group_perm(void)
{
        return xstate_get_group_perm(true);
}
EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);

/**
 * fpu_xstate_prctl - xstate permission operations
 * @option:        A subfunction of arch_prctl()
 * @arg2:        option argument
 * Return:        0 if successful; otherwise, an error code. Unknown options
 *                yield -EINVAL, permission requests on kernels built without
 *                CONFIG_X86_64 yield -EOPNOTSUPP.
 *
 * Option arguments:
 *
 * ARCH_GET_XCOMP_SUPP:       Pointer to user space u64 to store the info
 * ARCH_GET_XCOMP_PERM:       Pointer to user space u64 to store the info
 * ARCH_GET_XCOMP_GUEST_PERM: Pointer to user space u64 to store the info
 * ARCH_REQ_XCOMP_PERM:       Facility number requested
 * ARCH_REQ_XCOMP_GUEST_PERM: Facility number requested
 *
 * For facilities which require more than one XSTATE component, the request
 * must be the highest state component number related to that facility,
 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
 */
long fpu_xstate_prctl(int option, unsigned long arg2)
{
        u64 __user *uptr = (u64 __user *)arg2;
        u64 features;

        switch (option) {
        case ARCH_GET_XCOMP_SUPP:
                features = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
                return put_user(features, uptr);

        case ARCH_GET_XCOMP_PERM:
                /*
                 * Lockless snapshot. Taking the lock would not help, as
                 * the permissions can change right after dropping it.
                 */
                features = xstate_get_host_group_perm() & XFEATURE_MASK_USER_SUPPORTED;
                return put_user(features, uptr);

        case ARCH_GET_XCOMP_GUEST_PERM:
                features = xstate_get_guest_group_perm() & XFEATURE_MASK_USER_SUPPORTED;
                return put_user(features, uptr);

        case ARCH_REQ_XCOMP_GUEST_PERM:
        case ARCH_REQ_XCOMP_PERM:
                if (!IS_ENABLED(CONFIG_X86_64))
                        return -EOPNOTSUPP;

                /* For requests @arg2 is a facility number, not a pointer */
                return xstate_request_perm(arg2, option == ARCH_REQ_XCOMP_GUEST_PERM);

        default:
                return -EINVAL;
        }
}

#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
 * Report the time in milliseconds which elapsed since the last AVX-512
 * use in the task. Report -1 if no AVX-512 usage was recorded.
 */
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
        unsigned long last_use;
        long elapsed_ms = -1;

        /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */
        if (task->flags & (PF_KTHREAD | PF_USER_WORKER))
                return;

        last_use = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);

        /* A zero timestamp keeps the "never used" value of -1 */
        if (last_use) {
                long ticks = (long)(jiffies - last_use);

                /* Cap to LONG_MAX if time difference > LONG_MAX */
                if (ticks < 0)
                        ticks = LONG_MAX;

                elapsed_ms = jiffies_to_msecs(ticks);
        }

        seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", elapsed_ms);
        seq_putc(m, '\n');
}

/*
 * Report architecture specific information
 */
int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
{
        /* Without AVX-512 support there is nothing to report. */
        if (!cpu_feature_enabled(X86_FEATURE_AVX512F))
                return 0;

        /* AVX-512 state is the only information emitted here. */
        avx512_status(m, task);

        return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */

#ifdef CONFIG_COREDUMP
/*
 * Name emitted with the NT_X86_XSAVE_LAYOUT note. sizeof(owner_name), which
 * includes the terminating NUL, is used as the note's n_namesz.
 */
static const char owner_name[] = "LINUX";

/*
 * Dump type, size, offset and flag values for every xfeature that is present.
 */
static int dump_xsave_layout_desc(struct coredump_params *cprm)
{
        int emitted = 0;
        int nr;

        /* One record per extended feature in fpu_user_cfg.max_features */
        for_each_extended_xfeature(nr, fpu_user_cfg.max_features) {
                struct x86_xfeat_component desc = {
                        .type   = nr,
                        .size   = xstate_sizes[nr],
                        .offset = xstate_offsets[nr],
                        /* reserved for future use */
                        .flags  = 0,
                };

                /* A failed write is reported as "no records written" */
                if (!dump_emit(cprm, &desc, sizeof(desc)))
                        return 0;

                emitted++;
        }

        return emitted;
}

static u32 get_xsave_desc_size(void)
{
        u32 cnt = 0;
        u32 i;

        for_each_extended_xfeature(i, fpu_user_cfg.max_features)
                cnt++;

        return cnt * (sizeof(struct x86_xfeat_component));
}

int elf_coredump_extra_notes_write(struct coredump_params *cprm)
{
        int num_records = 0;
        struct elf_note en;

        if (!fpu_user_cfg.max_features)
                return 0;

        en.n_namesz = sizeof(owner_name);
        en.n_descsz = get_xsave_desc_size();
        en.n_type = NT_X86_XSAVE_LAYOUT;

        if (!dump_emit(cprm, &en, sizeof(en)))
                return 1;
        if (!dump_emit(cprm, owner_name, en.n_namesz))
                return 1;
        if (!dump_align(cprm, 4))
                return 1;

        num_records = dump_xsave_layout_desc(cprm);
        if (!num_records)
                return 1;

        /* Total size should be equal to the number of records */
        if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
                return 1;

        return 0;
}

int elf_coredump_extra_notes_size(void)
{
        int total = 0;

        /* No user features configured means no extra note at all */
        if (!fpu_user_cfg.max_features)
                return total;

        /* .note header */
        total += sizeof(struct elf_note);
        /* Name, padded to a multiple of 4 bytes */
        total += roundup(sizeof(owner_name), 4);
        /* The layout records themselves */
        total += get_xsave_desc_size();

        return total;
}
#endif /* CONFIG_COREDUMP */








































































  309 
  314 



















   13 


  313 































  316 
  313 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Access vector cache interface for object managers.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

#ifndef _SELINUX_AVC_H_
#define _SELINUX_AVC_H_

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/audit.h>
#include <linux/lsm_audit.h>
#include <linux/in6.h>
#include "flask.h"
#include "av_permissions.h"
#include "security.h"

/*
 * An entry in the AVC.
 */
struct avc_entry;

struct task_struct;
struct inode;
struct sock;
struct sk_buff;

/*
 * AVC statistics
 */
struct avc_cache_stats {
        /*
         * Plain unsigned counters.  A per-CPU instance named avc_cache_stats
         * is declared at the end of this header when
         * CONFIG_SECURITY_SELINUX_AVC_STATS is enabled.
         *
         * NOTE(review): the meaning of each counter is inferred from its
         * name only (cache lookups, lookup misses, entry allocations,
         * reclaims and frees) -- confirm against the AVC implementation.
         */
        unsigned int lookups;
        unsigned int misses;
        unsigned int allocations;
        unsigned int reclaims;
        unsigned int frees;
};

/*
 * We only need this data after we have decided to send an audit message.
 */
struct selinux_audit_data {
        /*
         * The fields below have the same names and types as the leading
         * arguments of slow_avc_audit(); the field order is not significant
         * because the structure is marked __randomize_layout.
         */
        /* source security identifier */
        u32 ssid;
        /* target security identifier */
        u32 tsid;
        /* target security class */
        u16 tclass;
        /* requested permissions */
        u32 requested;
        /* permissions selected for auditing (see avc_audit_required()) */
        u32 audited;
        /* permissions that were denied (see avc_audit_required()) */
        u32 denied;
        /* result of the permission check that triggered the audit */
        int result;
} __randomize_layout;

/*
 * AVC operations
 */

void __init avc_init(void);

/*
 * Work out which of the @requested permissions need an audit record.
 *
 * @requested: requested permissions
 * @avd:       access vector decision to evaluate against
 * @result:    result of the permission check
 * @auditdeny: optional extra mask; 0 means "no extra filtering"
 * @deniedp:   receives the denied permission set
 *
 * Returns the set of permissions to audit (0 means nothing to audit).
 *
 * Note: when AVD_FLAGS_NEVERAUDIT is set in @avd->flags the function
 * returns 0 immediately and *@deniedp is NOT written.
 */
static inline u32 avc_audit_required(u32 requested, struct av_decision *avd,
                                     int result, u32 auditdeny, u32 *deniedp)
{
        u32 to_audit;
        u32 refused;

        if (avd->flags & AVD_FLAGS_NEVERAUDIT)
                return 0;

        refused = requested & ~avd->allowed;
        if (unlikely(refused)) {
                /*
                 * The caller-supplied auditdeny mask is TRICKY: it is not
                 * matched against the permissions that were actually
                 * denied.  If it is non-zero and none of its bits are set
                 * in avd->auditdeny (a clear bit there means the policy has
                 * an explicit dontaudit rule), then NO denial is audited.
                 * For example, let's assume:
                 *
                 * denied == READ
                 * avd.auditdeny & ACCESS == 0 (not set means explicit rule)
                 * auditdeny & ACCESS == 1
                 *
                 * The denial is NOT audited, even though the denied
                 * permission was READ and the auditdeny mask named ACCESS.
                 */
                if (auditdeny && !(auditdeny & avd->auditdeny))
                        to_audit = 0;
                else
                        to_audit = refused & avd->auditdeny;
        } else if (result) {
                /* Nothing masked out as denied, yet the check failed. */
                refused = requested;
                to_audit = requested;
        } else {
                /* Granted: audit only what auditallow selects. */
                to_audit = requested & avd->auditallow;
        }

        *deniedp = refused;
        return to_audit;
}

int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, u32 audited,
                   u32 denied, int result, struct common_audit_data *a);

/**
 * avc_audit - Audit the granting or denial of permissions.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions
 * @avd: access vector decisions
 * @result: result from avc_has_perm_noaudit
 * @a:  auxiliary audit data
 *
 * Emit an audit record for a granted or denied permission check, as
 * directed by the policy.  avc_has_perm() typically calls this after a
 * permission check.  Callers of avc_has_perm_noaudit() may also call it
 * directly when they want the check and the auditing to happen at
 * different times, for example when the check must run under a lock that
 * should be dropped before any auditing code is invoked.
 *
 * The common case, where avc_audit_required() selects nothing to audit,
 * returns 0 without calling slow_avc_audit().
 */
static inline int avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                            struct av_decision *avd, int result,
                            struct common_audit_data *a)
{
        u32 denied;
        u32 audited = avc_audit_required(requested, avd, result, 0, &denied);

        if (unlikely(audited))
                return slow_avc_audit(ssid, tsid, tclass, requested, audited,
                                      denied, result, a);
        return 0;
}

#define AVC_STRICT           1 /* Ignore permissive mode. */
#define AVC_EXTENDED_PERMS 2 /* update extended permissions */
int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                         unsigned int flags, struct av_decision *avd);

int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                 struct common_audit_data *auditdata);

#define AVC_EXT_IOCTL        (1 << 0) /* Cache entry for an ioctl extended permission */
#define AVC_EXT_NLMSG        (1 << 1) /* Cache entry for an nlmsg extended permission */
int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                           u8 driver, u8 base_perm, u8 perm,
                           struct common_audit_data *ad);

u32 avc_policy_seqno(void);

#define AVC_CALLBACK_GRANT                1
#define AVC_CALLBACK_TRY_REVOKE                2
#define AVC_CALLBACK_REVOKE                4
#define AVC_CALLBACK_RESET                8
#define AVC_CALLBACK_AUDITALLOW_ENABLE        16
#define AVC_CALLBACK_AUDITALLOW_DISABLE 32
#define AVC_CALLBACK_AUDITDENY_ENABLE        64
#define AVC_CALLBACK_AUDITDENY_DISABLE        128
#define AVC_CALLBACK_ADD_XPERMS                256

int avc_add_callback(int (*callback)(u32 event), u32 events);

/* Exported to selinuxfs */
int avc_get_hash_stats(char *page);
unsigned int avc_get_cache_threshold(void);
void avc_set_cache_threshold(unsigned int cache_threshold);

#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats);
#endif

#endif /* _SELINUX_AVC_H_ */















































































































































































    4 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 





































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_XARRAY_H
#define _LINUX_XARRAY_H
/*
 * eXtensible Arrays
 * Copyright (c) 2017 Microsoft Corporation
 * Author: Matthew Wilcox <willy@infradead.org>
 *
 * See Documentation/core-api/xarray.rst for how to use the XArray.
 */

#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kconfig.h>
#include <linux/limits.h>
#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct list_lru;

/*
 * The bottom two bits of the entry determine how the XArray interprets
 * the contents:
 *
 * 00: Pointer entry
 * 10: Internal entry
 * x1: Value entry or tagged pointer
 *
 * Attempting to store internal entries in the XArray is a bug.
 *
 * Most internal entries are pointers to the next node in the tree.
 * The following internal entries have a special meaning:
 *
 * 0-62: Sibling entries
 * 256: Retry entry
 * 257: Zero entry
 *
 * Errors are also represented as internal entries, but use the negative
 * space (-4094 to -2).  They're never stored in the slots array; only
 * returned by the normal API.
 */

#define BITS_PER_XA_VALUE        (BITS_PER_LONG - 1)

/**
 * xa_mk_value() - Create an XArray entry from an integer.
 * @v: Value to store in XArray.
 *
 * The value is shifted up by one bit and bit 0 is set, which marks the
 * entry as a value entry.  Only BITS_PER_XA_VALUE bits are available, so
 * a warning is raised if the top bit of @v is set.
 *
 * Context: Any context.
 * Return: An entry suitable for storing in the XArray.
 */
static inline void *xa_mk_value(unsigned long v)
{
        unsigned long encoded = (v << 1) | 1UL;

        WARN_ON((long)v < 0);
        return (void *)encoded;
}

/**
 * xa_to_value() - Get value stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Drops the low marker bit by shifting the entry right by one.
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return raw >> 1;
}

/**
 * xa_is_value() - Determine if an entry is a value.
 * @entry: XArray entry.
 *
 * An entry is treated as a value when its lowest bit is set.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
        return ((unsigned long)entry & 1UL) != 0;
}

/**
 * xa_tag_pointer() - Create an XArray entry for a tagged pointer.
 * @p: Plain pointer.
 * @tag: Tag value (0, 1 or 3).
 *
 * Instead of storing value entries, a user of the XArray may choose to
 * tag the pointers it stores.  Three tags are available (0, 1 and 3).
 * Unlike an xa_mark_t, a tag is not replicated up through the array and
 * cannot be searched for.
 *
 * The tag is OR-ed into the pointer; it is not range-checked here.
 *
 * Context: Any context.
 * Return: An XArray entry.
 */
static inline void *xa_tag_pointer(void *p, unsigned long tag)
{
        unsigned long tagged = (unsigned long)p;

        tagged |= tag;
        return (void *)tagged;
}

/**
 * xa_untag_pointer() - Turn an XArray entry into a plain pointer.
 * @entry: XArray entry.
 *
 * Call this on an entry that was stored as a tagged pointer to recover
 * the pointer without its tag.  The two lowest bits are cleared.
 *
 * Context: Any context.
 * Return: A pointer.
 */
static inline void *xa_untag_pointer(void *entry)
{
        const unsigned long tag_bits = 3UL;

        return (void *)((unsigned long)entry & ~tag_bits);
}

/**
 * xa_pointer_tag() - Get the tag stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Call this on an entry that was stored as a tagged pointer to read back
 * its tag, i.e. the two lowest bits of the entry.
 *
 * Context: Any context.
 * Return: A tag.
 */
static inline unsigned int xa_pointer_tag(void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return bits & 3UL;
}

/*
 * xa_mk_internal() - Create an internal entry.
 * @v: Value to turn into an internal entry.
 *
 * Internal entries serve several purposes.  Values 0-255 are set aside
 * for sibling entries (the current code only uses 0-62), 256 is the retry
 * entry and 257 is the reserved / zero entry.  Negative values encode
 * errnos.  In some situations node pointers are tagged as internal
 * entries too.
 *
 * The value is shifted up by two bits and the low bits are set to 10.
 *
 * Context: Any context.
 * Return: An XArray internal entry corresponding to this value.
 */
static inline void *xa_mk_internal(unsigned long v)
{
        unsigned long encoded = (v << 2) | 2UL;

        return (void *)encoded;
}

/*
 * xa_to_internal() - Extract the value from an internal entry.
 * @entry: XArray entry.
 *
 * The entry is treated as an unsigned long and shifted right by two, so
 * no sign extension takes place.
 *
 * Context: Any context.
 * Return: The value which was stored in the internal entry.
 */
static inline unsigned long xa_to_internal(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return raw >> 2;
}

/*
 * xa_is_internal() - Is the entry an internal entry?
 * @entry: XArray entry.
 *
 * An internal entry has its two lowest bits equal to 10.
 *
 * Context: Any context.
 * Return: %true if the entry is an internal entry.
 */
static inline bool xa_is_internal(const void *entry)
{
        unsigned long low_bits = (unsigned long)entry & 3;

        return low_bits == 2;
}

#define XA_ZERO_ENTRY                xa_mk_internal(257)

/**
 * xa_is_zero() - Is the entry a zero entry?
 * @entry: Entry retrieved from the XArray
 *
 * A slot holding a zero entry reads back as NULL through the normal API;
 * zero entries are only visible through the advanced API.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
        const void *zero_entry = XA_ZERO_ENTRY;

        return unlikely(entry == zero_entry);
}

/**
 * xa_is_err() - Report whether an XArray operation returned an error
 * @entry: Result from calling an XArray function
 *
 * When an XArray operation cannot be completed it returns a special value
 * that indicates an error.  Use this function to find out whether an
 * error occurred and xa_err() to find out which one.
 *
 * An error is an internal entry that compares greater than or equal to
 * xa_mk_internal(-MAX_ERRNO).
 *
 * Context: Any context.
 * Return: %true if the entry indicates an error.
 */
static inline bool xa_is_err(const void *entry)
{
        bool is_error = xa_is_internal(entry) &&
                        entry >= xa_mk_internal(-MAX_ERRNO);

        return unlikely(is_error);
}

/**
 * xa_err() - Turn an XArray result into an errno.
 * @entry: Result from calling an XArray function.
 *
 * When an XArray operation cannot be completed it returns a special
 * pointer value that encodes an errno.  This function pulls the errno out
 * of that pointer value; for any pointer that does not represent an errno
 * it returns 0.
 *
 * Context: Any context.
 * Return: A negative errno or 0.
 */
static inline int xa_err(void *entry)
{
        if (!xa_is_err(entry))
                return 0;

        /*
         * Shift as a signed long: xa_to_internal() shifts an unsigned long
         * and so would not do sign extension.
         */
        return (long)entry >> 2;
}

/**
 * struct xa_limit - Represents a range of IDs.
 * @min: The lowest ID to allocate (inclusive).
 * @max: The maximum ID to allocate (inclusive).
 *
 * This structure is used either directly or via the XA_LIMIT() macro
 * to communicate the range of IDs that are valid for allocation.
 * Three common ranges are predefined for you:
 * * xa_limit_32b        - [0 - UINT_MAX]
 * * xa_limit_31b        - [0 - INT_MAX]
 * * xa_limit_16b        - [0 - USHRT_MAX]
 */
struct xa_limit {
        /*
         * Note that @max is declared before @min.  XA_LIMIT() below fills
         * the structure with designated initialisers (.min, .max), so it
         * does not depend on the declaration order.
         */
        u32 max;
        u32 min;
};

#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max }

#define xa_limit_32b        XA_LIMIT(0, UINT_MAX)
#define xa_limit_31b        XA_LIMIT(0, INT_MAX)
#define xa_limit_16b        XA_LIMIT(0, USHRT_MAX)

typedef unsigned __bitwise xa_mark_t;
#define XA_MARK_0                ((__force xa_mark_t)0U)
#define XA_MARK_1                ((__force xa_mark_t)1U)
#define XA_MARK_2                ((__force xa_mark_t)2U)
#define XA_PRESENT                ((__force xa_mark_t)8U)
#define XA_MARK_MAX                XA_MARK_2
#define XA_FREE_MARK                XA_MARK_0

/*
 * Lock type selectors.  The numeric values are reused directly as the
 * XA_FLAGS_LOCK_IRQ and XA_FLAGS_LOCK_BH flag bits defined below.
 */
enum xa_lock_type {
        XA_LOCK_IRQ = 1,
        XA_LOCK_BH = 2,
};

/*
 * Values for xa_flags.  The radix tree stores its GFP flags in the xa_flags,
 * and we remain compatible with that.
 */
#define XA_FLAGS_LOCK_IRQ        ((__force gfp_t)XA_LOCK_IRQ)
#define XA_FLAGS_LOCK_BH        ((__force gfp_t)XA_LOCK_BH)
#define XA_FLAGS_TRACK_FREE        ((__force gfp_t)4U)
#define XA_FLAGS_ZERO_BUSY        ((__force gfp_t)8U)
#define XA_FLAGS_ALLOC_WRAPPED        ((__force gfp_t)16U)
#define XA_FLAGS_ACCOUNT        ((__force gfp_t)32U)
#define XA_FLAGS_MARK(mark)        ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
                                                (__force unsigned)(mark)))

/* ALLOC is for a normal 0-based alloc.  ALLOC1 is for a 1-based alloc */
#define XA_FLAGS_ALLOC        (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
#define XA_FLAGS_ALLOC1        (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY)

/**
 * struct xarray - The anchor of the XArray.
 * @xa_lock: Lock that protects the contents of the XArray.
 *
 * To use the xarray, define it statically or embed it in your data structure.
 * It is a very small data structure, so it does not usually make sense to
 * allocate it separately and keep a pointer to it in your data structure.
 *
 * You may use the xa_lock to protect your own data structures as well.
 */
/*
 * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
 * If the only non-NULL entry in the array is at index 0, @xa_head is that
 * entry.  If any other entry in the array is non-NULL, @xa_head points
 * to an @xa_node.
 */
struct xarray {
        /* Initialised by xa_init_flags() or XARRAY_INIT(). */
        spinlock_t        xa_lock;
/* private: The rest of the data structure is not to be used directly. */
        /* XA_FLAGS_* bits; xa_marked() tests the XA_FLAGS_MARK() bits here. */
        gfp_t                xa_flags;
        /* NULL while the array is empty; see xa_empty(). */
        void __rcu *        xa_head;
};

#define XARRAY_INIT(name, flags) {                                \
        .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock),                \
        .xa_flags = flags,                                        \
        .xa_head = NULL,                                        \
}

/**
 * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
 * @name: A string that names your XArray.
 * @flags: XA_FLAG values.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name and flags.  It is
 * equivalent to calling xa_init_flags() on the array, but it does the
 * initialisation at compile time instead of runtime.
 */
#define DEFINE_XARRAY_FLAGS(name, flags)                                \
        struct xarray name = XARRAY_INIT(name, flags)

/**
 * DEFINE_XARRAY() - Define an XArray.
 * @name: A string that names your XArray.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name.  It is equivalent
 * to calling xa_init() on the array, but it does the initialisation at
 * compile time instead of runtime.
 */
#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)

/**
 * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0.
 * @name: A string that names your XArray.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)

/**
 * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1.
 * @name: A string that names your XArray.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1)

void *xa_load(struct xarray *, unsigned long index);
void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *xa_erase(struct xarray *, unsigned long index);
void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
                        void *entry, gfp_t);
bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
void *xa_find(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
void *xa_find_after(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
                unsigned long max, unsigned int n, xa_mark_t);
void xa_destroy(struct xarray *);

/**
 * xa_init_flags() - Initialise an empty XArray with flags.
 * @xa: XArray.
 * @flags: XA_FLAG values.
 *
 * Use this function rather than xa_init() when the XArray needs special
 * flags, for example because its lock will be taken from interrupt
 * context.
 *
 * Context: Any context.
 */
static inline void xa_init_flags(struct xarray *xa, gfp_t flags)
{
        spin_lock_init(&xa->xa_lock);
        /* An empty array has no head entry. */
        xa->xa_head = NULL;
        xa->xa_flags = flags;
}

/**
 * xa_init() - Initialise an empty XArray.
 * @xa: XArray.
 *
 * An empty XArray is full of NULL entries.
 *
 * Context: Any context.
 */
static inline void xa_init(struct xarray *xa)
{
        xa_init_flags(xa, 0);
}

/**
 * xa_empty() - Determine if an array has any present entries.
 * @xa: XArray.
 *
 * The array is considered empty when its head pointer is NULL.
 *
 * Context: Any context.
 * Return: %true if the array contains only NULL pointers.
 */
static inline bool xa_empty(const struct xarray *xa)
{
        return !xa->xa_head;
}

/**
 * xa_marked() - Inquire whether any entry in this array has a mark set
 * @xa: Array
 * @mark: Mark value
 *
 * Tests the XA_FLAGS_MARK() bit for @mark in the array's flags.
 *
 * Context: Any context.
 * Return: %true if any entry has this mark set.
 */
static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
{
        return (xa->xa_flags & XA_FLAGS_MARK(mark)) != 0;
}

/**
 * xa_for_each_range() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 * @last: Last index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to @last.
 *
 * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_range() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_range().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_range(xa, index, entry, start, last)                \
        for (index = start,                                                \
             entry = xa_find(xa, &index, last, XA_PRESENT);                \
             entry;                                                        \
             entry = xa_find_after(xa, &index, last, XA_PRESENT))

/**
 * xa_for_each_start() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to max.
 *
 * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_start() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_start().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_start(xa, index, entry, start) \
        xa_for_each_range(xa, index, entry, start, ULONG_MAX)

/**
 * xa_for_each() - Iterate over present entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you want
 * to skip or reprocess indices.  It is safe to modify the array during the
 * iteration.  At the end of the iteration, @entry will be set to NULL and
 * @index will have a value less than or equal to max.
 *
 * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).  xa_for_each()
 * will spin if it hits a retry entry; if you intend to see retry entries,
 * you should use the xas_for_each() iterator instead.  The xas_for_each()
 * iterator will expand into more inline code than xa_for_each().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each(xa, index, entry) \
        xa_for_each_start(xa, index, entry, 0)

/**
 * xa_for_each_marked() - Iterate over marked entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @filter: Selection criterion.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  The iteration will skip all entries in the array
 * which do not match @filter.  You may modify @index during the iteration
 * if you want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set to
 * NULL and @index will have a value less than or equal to max.
 *
 * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n).
 * You have to handle your own locking with xas_for_each_marked(), and if you
 * have to unlock after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_marked() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each_marked() iterator
 * instead.  The xas_for_each_marked() iterator will expand into more inline
 * code than xa_for_each_marked().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_marked(xa, index, entry, filter) \
        for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \
             entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))

/*
 * Locking helpers.  Each macro applies the spin_*() primitive of the same
 * suffix to the xa_lock member of @xa; the _irqsave/_irqrestore variants
 * pass @flags through and the _nested variants pass @subclass through.
 */
#define xa_trylock(xa)                spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa)                spin_lock(&(xa)->xa_lock)
#define xa_unlock(xa)                spin_unlock(&(xa)->xa_lock)
#define xa_lock_bh(xa)                spin_lock_bh(&(xa)->xa_lock)
#define xa_unlock_bh(xa)        spin_unlock_bh(&(xa)->xa_lock)
#define xa_lock_irq(xa)                spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa)        spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
                                spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
                                spin_unlock_irqrestore(&(xa)->xa_lock, flags)
#define xa_lock_nested(xa, subclass) \
                                spin_lock_nested(&(xa)->xa_lock, subclass)
#define xa_lock_bh_nested(xa, subclass) \
                                spin_lock_bh_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irq_nested(xa, subclass) \
                                spin_lock_irq_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irqsave_nested(xa, flags, subclass) \
                spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass)

/*
 * Versions of the normal API which require the caller to hold the
 * xa_lock.  If the GFP flags allow it, they will drop the lock to
 * allocate memory, then reacquire it afterwards.  These functions
 * may also re-enable interrupts if the XArray flags indicate the
 * locking should be interrupt safe.
 */
void *__xa_erase(struct xarray *xa, unsigned long index);
void *__xa_store(struct xarray *xa, unsigned long index, void *entry,
                gfp_t gfp);
void *__xa_cmpxchg(struct xarray *xa, unsigned long index, void *old,
                void *entry, gfp_t gfp);
int __must_check __xa_insert(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp);
int __must_check __xa_alloc(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, gfp_t gfp);
int __must_check __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp);
void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark);
void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark);

/**
 * xa_store_bh() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * This function is like calling xa_store() except it disables softirqs
 * while holding the array lock.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_bh(struct xarray *xa, unsigned long index,
                                void *entry, gfp_t gfp)
{
        void *previous;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* The store itself runs under xa_lock with softirqs disabled. */
        xa_lock_bh(xa);
        previous = __xa_store(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return previous;
}

/**
 * xa_store_irq() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * This function is like calling xa_store() except it disables interrupts
 * while holding the array lock.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
                                 void *entry, gfp_t gfp)
{
        void *previous;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* The store itself runs under xa_lock with interrupts disabled. */
        xa_lock_irq(xa);
        previous = __xa_store(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return previous;
}

/**
 * xa_erase_bh() - Erase this entry from the XArray.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
{
        void *removed;

        /* Erase under xa_lock with softirqs disabled. */
        xa_lock_bh(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_bh(xa);

        return removed;
}

/**
 * xa_erase_irq() - Erase this entry from the XArray.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
{
        void *removed;

        /* Erase under xa_lock with interrupts disabled. */
        xa_lock_irq(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_irq(xa);

        return removed;
}

/**
 * xa_cmpxchg() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * If the entry at @index is the same as @old, replace it with @entry.
 * If the return value is equal to @old, then the exchange was successful.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
                               void *old, void *entry, gfp_t gfp)
{
        void *found;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* Compare-and-exchange under the plain xa_lock. */
        xa_lock(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock(xa);

        return found;
}

/**
 * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * This function is like calling xa_cmpxchg() except it disables softirqs
 * while holding the array lock.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index,
                                  void *old, void *entry, gfp_t gfp)
{
        void *found;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* Compare-and-exchange under xa_lock with softirqs disabled. */
        xa_lock_bh(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_bh(xa);

        return found;
}

/**
 * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * This function is like calling xa_cmpxchg() except it disables interrupts
 * while holding the array lock.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
                                   void *old, void *entry, gfp_t gfp)
{
        void *found;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* Compare-and-exchange under xa_lock with interrupts disabled. */
        xa_lock_irq(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_irq(xa);

        return found;
}

/**
 * xa_insert() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* Insert under the plain xa_lock. */
        xa_lock(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_insert_bh() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_bh(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* Insert under xa_lock with softirqs disabled. */
        xa_lock_bh(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_insert_irq() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_irq(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* Insert under xa_lock with interrupts disabled. */
        xa_lock_irq(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* ID allocation and store happen under the plain xa_lock. */
        xa_lock(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* ID allocation and store happen with softirqs disabled. */
        xa_lock_bh(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        /* ID allocation and store happen with interrupts disabled. */
        xa_lock_irq(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Note that callers interested in whether wrapping has occurred should
 * use __xa_alloc_cyclic() instead.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock(xa);

        /* Negative results are errors and are passed through unchanged. */
        if (ret < 0)
                return ret;

        /* Any non-negative result is collapsed to plain success. */
        return 0;
}

/**
 * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Note that callers interested in whether wrapping has occurred should
 * use __xa_alloc_cyclic() instead.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_bh(xa);

        /* Negative results are errors and are passed through unchanged. */
        if (ret < 0)
                return ret;

        /* Any non-negative result is collapsed to plain success. */
        return 0;
}

/**
 * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Note that callers interested in whether wrapping has occurred should
 * use __xa_alloc_cyclic() instead.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        /* Annotate the possible allocation before the lock is taken. */
        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_irq(xa);

        /* Negative results are errors and are passed through unchanged. */
        if (ret < 0)
                return ret;

        /* Any non-negative result is collapsed to plain success. */
        return 0;
}

/**
 * xa_reserve() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * Ensures there is somewhere to store an entry at @index in the array.
 * If there is already something stored at @index, this function does
 * nothing.  If there was nothing there, the entry is marked as reserved.
 * Loading from a reserved entry returns a %NULL pointer.
 *
 * If you do not use the entry that you have reserved, call xa_release()
 * or xa_erase() to free any unnecessary memory.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Only an empty (NULL) slot is swapped for the reserved marker. */
        void *outcome = xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        /* xa_err() turns the cmpxchg result into the int return value. */
        return xa_err(outcome);
}

/**
 * xa_reserve_bh() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * A softirq-disabling version of xa_reserve().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Only an empty (NULL) slot is swapped for the reserved marker. */
        void *outcome = xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        /* xa_err() turns the cmpxchg result into the int return value. */
        return xa_err(outcome);
}

/**
 * xa_reserve_irq() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * An interrupt-disabling version of xa_reserve().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Only an empty (NULL) slot is swapped for the reserved marker. */
        void *outcome = xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        /* xa_err() turns the cmpxchg result into the int return value. */
        return xa_err(outcome);
}

/**
 * xa_release() - Release a reserved entry.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After calling xa_reserve(), you can call this function to release the
 * reservation.  If the entry at @index has been stored to, this function
 * will do nothing.
 */
static inline void xa_release(struct xarray *xa, unsigned long index)
{
        /*
         * Swap a still-reserved slot (XA_ZERO_ENTRY) back to NULL.  The
         * result is intentionally discarded, and a gfp mask of 0 is passed.
         */
        (void)xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0);
}

/* Everything below here is the Advanced API.  Proceed with caution. */

/*
 * The xarray is constructed out of a set of 'chunks' of pointers.  Choosing
 * the best chunk size requires some tradeoffs.  A power of two recommends
 * itself so that we can walk the tree based purely on shifts and masks.
 * Generally, the larger the better; as the number of slots per level of the
 * tree increases, the less tall the tree needs to be.  But that needs to be
 * balanced against the memory consumption of each node.  On a 64-bit system,
 * xa_node is currently 576 bytes, and we get 7 of them per 4kB page.  If we
 * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
 */
#ifndef XA_CHUNK_SHIFT
/* log2 of the slots per node: 4 with CONFIG_BASE_SMALL, otherwise 6. */
#define XA_CHUNK_SHIFT                (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
#endif
/* Slots per node: 1 << XA_CHUNK_SHIFT (16 or 64 with the defaults above). */
#define XA_CHUNK_SIZE                (1UL << XA_CHUNK_SHIFT)
/* All-ones mask of XA_CHUNK_SHIFT bits, i.e. the range of a slot offset. */
#define XA_CHUNK_MASK                (XA_CHUNK_SIZE - 1)
/* First dimension of the per-node mark bitmaps in struct xa_node. */
#define XA_MAX_MARKS                3
/* Longs needed for a bitmap holding XA_CHUNK_SIZE bits. */
#define XA_MARK_LONGS                BITS_TO_LONGS(XA_CHUNK_SIZE)

/*
 * @count is the count of every non-NULL element in the ->slots array
 * whether that is a value entry, a retry entry, a user pointer,
 * a sibling entry or a pointer to the next level of the tree.
 * @nr_values is the count of every element in ->slots which is
 * either a value entry or a sibling of a value entry.
 */
/* One node ("chunk") of the tree: XA_CHUNK_SIZE slots plus bookkeeping. */
struct xa_node {
        unsigned char        shift;                /* Bits remaining in each slot */
        unsigned char        offset;                /* Slot offset in parent */
        unsigned char        count;                /* Total entry count */
        unsigned char        nr_values;        /* Value entry count */
        struct xa_node __rcu *parent;        /* NULL at top of tree */
        struct xarray        *array;                /* The array we belong to */
        /* The two members below share storage; only one is in use at a time. */
        union {
                struct list_head private_list;        /* For tree user */
                struct rcu_head        rcu_head;        /* Used when freeing node */
        };
        /* The slot array itself: XA_CHUNK_SIZE RCU-annotated pointers. */
        void __rcu        *slots[XA_CHUNK_SIZE];
        /* 'tags' and 'marks' are two names for the same bitmap storage. */
        union {
                unsigned long        tags[XA_MAX_MARKS][XA_MARK_LONGS];
                unsigned long        marks[XA_MAX_MARKS][XA_MARK_LONGS];
        };
};

/* Dump helpers, defined elsewhere; XA_BUG_ON()/XA_NODE_BUG_ON() call them. */
void xa_dump(const struct xarray *xa);
void xa_dump_node(const struct xa_node *node);

/*
 * Debug assertions.  With XA_DEBUG defined, a true @x dumps the array
 * (XA_BUG_ON) or the node when it is non-NULL (XA_NODE_BUG_ON) and then
 * calls BUG().  Without XA_DEBUG both macros expand to an empty statement,
 * so @x is not evaluated at all.
 */
#ifdef XA_DEBUG
#define XA_BUG_ON(xa, x) do {                                        \
                if (x) {                                        \
                        xa_dump(xa);                                \
                        BUG();                                        \
                }                                                \
        } while (0)
#define XA_NODE_BUG_ON(node, x) do {                                \
                if (x) {                                        \
                        if (node) xa_dump_node(node);                \
                        BUG();                                        \
                }                                                \
        } while (0)
#else
#define XA_BUG_ON(xa, x)        do { } while (0)
#define XA_NODE_BUG_ON(node, x)        do { } while (0)
#endif

/* Private */
static inline void *xa_head(const struct xarray *xa)
{
        /* Holding xa_lock is passed as the lockdep condition for the read. */
        void *head = rcu_dereference_check(xa->xa_head,
                                           lockdep_is_held(&xa->xa_lock));

        return head;
}

/* Private */
static inline void *xa_head_locked(const struct xarray *xa)
{
        /* "Protected" flavour: the read is justified by holding xa_lock. */
        void *head = rcu_dereference_protected(xa->xa_head,
                                               lockdep_is_held(&xa->xa_lock));

        return head;
}

/* Private */
static inline void *xa_entry(const struct xarray *xa,
                             const struct xa_node *node, unsigned int offset)
{
        void *slot;

        /* Debug builds trap an offset that lies outside the slot array. */
        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);

        /* Holding xa_lock is passed as the lockdep condition for the read. */
        slot = rcu_dereference_check(node->slots[offset],
                                     lockdep_is_held(&xa->xa_lock));
        return slot;
}

/* Private */
static inline void *xa_entry_locked(const struct xarray *xa,
                                    const struct xa_node *node,
                                    unsigned int offset)
{
        void *slot;

        /* Debug builds trap an offset that lies outside the slot array. */
        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);

        /* "Protected" flavour: the read is justified by holding xa_lock. */
        slot = rcu_dereference_protected(node->slots[offset],
                                         lockdep_is_held(&xa->xa_lock));
        return slot;
}

/* Private */
static inline struct xa_node *xa_parent(const struct xarray *xa,
                                        const struct xa_node *node)
{
        /* Holding xa_lock is passed as the lockdep condition for the read. */
        struct xa_node *up = rcu_dereference_check(node->parent,
                                        lockdep_is_held(&xa->xa_lock));

        return up;
}

/* Private */
static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
                                        const struct xa_node *node)
{
        /* "Protected" flavour: the read is justified by holding xa_lock. */
        struct xa_node *up = rcu_dereference_protected(node->parent,
                                        lockdep_is_held(&xa->xa_lock));

        return up;
}

/* Private */
static inline void *xa_mk_node(const struct xa_node *node)
{
        /* Tag the node address by setting bit 1 (value 2). */
        unsigned long tagged = (unsigned long)node | 2;

        return (void *)tagged;
}

/* Private */
static inline struct xa_node *xa_to_node(const void *entry)
{
        /* Undo xa_mk_node(): subtract the tag value of 2 from the address. */
        unsigned long address = (unsigned long)entry - 2;

        return (struct xa_node *)address;
}

/* Private */
static inline bool xa_is_node(const void *entry)
{
        /* Only internal entries can be node pointers. */
        if (!xa_is_internal(entry))
                return false;

        /* Internal entries at or below 4096 are not treated as nodes. */
        return (unsigned long)entry > 4096;
}

/* Private */
static inline void *xa_mk_sibling(unsigned int offset)
{
        /* A sibling entry is the internal entry whose value is @offset. */
        void *sibling = xa_mk_internal(offset);

        return sibling;
}

/* Private */
static inline unsigned long xa_to_sibling(const void *entry)
{
        /* Recover the value stored by xa_mk_sibling(). */
        unsigned long offset = xa_to_internal(entry);

        return offset;
}

/**
 * xa_is_sibling() - Is the entry a sibling entry?
 * @entry: Entry retrieved from the XArray
 *
 * Return: %true if the entry is a sibling entry.
 */
static inline bool xa_is_sibling(const void *entry)
{
        /* Without CONFIG_XARRAY_MULTI no entry is ever reported as a sibling. */
        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;

        if (!xa_is_internal(entry))
                return false;

        /* Strictly below the sibling entry for the last slot offset. */
        return entry < xa_mk_sibling(XA_CHUNK_SIZE - 1);
}

/* The internal entry with value 256; xa_is_retry() compares against it. */
#define XA_RETRY_ENTRY                xa_mk_internal(256)

/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
        const bool matches = (entry == XA_RETRY_ENTRY);

        /* Hint to the compiler that a match is the uncommon outcome. */
        return unlikely(matches);
}

/**
 * xa_is_advanced() - Is the entry only permitted for the advanced API?
 * @entry: Entry to be stored in the XArray.
 *
 * Return: %true if the entry cannot be stored by the normal API.
 */
static inline bool xa_is_advanced(const void *entry)
{
        /* Non-internal entries are always acceptable to the normal API. */
        if (!xa_is_internal(entry))
                return false;

        /* Internal entries up to and including the retry entry qualify. */
        return entry <= XA_RETRY_ENTRY;
}

/**
 * typedef xa_update_node_t - A callback function from the XArray.
 * @node: The node which is being processed
 *
 * This function is called every time the XArray updates the count of
 * present and value entries in a node.  It allows advanced users to
 * maintain the private_list in the node.
 *
 * Context: The xa_lock is held and interrupts may be disabled.
 *            Implementations should not drop the xa_lock, nor re-enable
 *            interrupts.
 */
typedef void (*xa_update_node_t)(struct xa_node *node);

/*
 * Declared here, defined elsewhere.  Takes a node and an update callback.
 * NOTE(review): presumably removes the node from its tree and invokes the
 * callback on nodes it touches - confirm against the implementation.
 */
void xa_delete_node(struct xa_node *, xa_update_node_t);

/*
 * The xa_state is opaque to its users.  It contains various different pieces
 * of state involved in the current operation on the XArray.  It should be
 * declared on the stack and passed between the various internal routines.
 * The various elements in it should not be accessed directly, but only
 * through the provided accessor functions.  The below documentation is for
 * the benefit of those working on the code, not for users of the XArray.
 *
 * @xa_node usually points to the xa_node containing the slot we're operating
 * on (and @xa_offset is the offset in the slots array).  If there is a
 * single entry in the array at index 0, there are no allocated xa_nodes to
 * point to, and so we store %NULL in @xa_node.  @xa_node is set to
 * the value %XAS_RESTART if the xa_state is not walked to the correct
 * position in the tree of nodes for this operation.  If an error occurs
 * during an operation, it is set to an %XAS_ERROR value.  If we run off the
 * end of the allocated nodes, it is set to %XAS_BOUNDS.
 */
struct xa_state {
        struct xarray *xa;                /* Array being operated on */
        unsigned long xa_index;                /* Index of interest */
        /* Set from the order by XA_STATE_ORDER(); 0 with plain XA_STATE() */
        unsigned char xa_shift;
        /* NOTE(review): presumably the number of sibling slots - confirm */
        unsigned char xa_sibs;
        unsigned char xa_offset;        /* Offset in xa_node's slots array */
        unsigned char xa_pad;                /* Helps gcc generate better code */
        /* Current node, NULL, or a sentinel (XAS_RESTART, XAS_BOUNDS, XA_ERROR()) */
        struct xa_node *xa_node;
        /* NOTE(review): presumably a preallocated node - confirm; NULL initially */
        struct xa_node *xa_alloc;
        xa_update_node_t xa_update;        /* Node-update callback; NULL initially */
        /* NOTE(review): purpose not visible in this chunk; NULL initially */
        struct list_lru *xa_lru;
};

/*
 * We encode errnos in the xas->xa_node.  If an error has happened, we need to
 * drop the lock to fix it, and once we've done so the xa_state is invalid.
 */
/*
 * XA_ERROR() packs an errno into a pseudo node pointer: the value is shifted
 * left by two and bit 1 is set.  The argument is parenthesised so that any
 * expression (e.g. a conditional) is converted as a whole before shifting.
 */
#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)(errno) << 2) | 2UL))
/* Sentinel for xa_node: the walk ran off the end of the allocated nodes. */
#define XAS_BOUNDS        ((struct xa_node *)1UL)
/* Sentinel for xa_node: the state has not been walked into position yet. */
#define XAS_RESTART        ((struct xa_node *)3UL)

/*
 * Initialiser for a struct xa_state.  Records the array, index, shift and
 * sibling count supplied by the caller, zeroes the offset and padding,
 * starts the walk state at %XAS_RESTART and clears xa_alloc, xa_update
 * and xa_lru.
 */
#define __XA_STATE(array, index, shift, sibs)  {        \
        .xa = array,                                        \
        .xa_index = index,                                \
        .xa_shift = shift,                                \
        .xa_sibs = sibs,                                \
        .xa_offset = 0,                                        \
        .xa_pad = 0,                                        \
        .xa_node = XAS_RESTART,                                \
        .xa_alloc = NULL,                                \
        .xa_update = NULL,                                \
        .xa_lru = NULL,                                        \
}

/**
 * XA_STATE() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 *
 * Declare and initialise an xa_state on the stack.  The state is set up
 * with a shift and sibling count of 0 and begins in the %XAS_RESTART
 * state.
 */
#define XA_STATE(name, array, index)                                \
        struct xa_state name = __XA_STATE(array, index, 0, 0)

/**
 * XA_STATE_ORDER() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 * @order: Order of entry.
 *
 * Declare and initialise an xa_state on the stack.  This variant of
 * XA_STATE() allows you to specify the 'order' of the element you
 * want to operate on.
 *
 * The index is rounded down to a multiple of 2^@order.  @index and @order
 * are parenthesised in the expansion so that arbitrary expressions may be
 * passed, but they are still expanded more than once and should therefore
 * be free of side effects.
 */
#define XA_STATE_ORDER(name, array, index, order)                \
        struct xa_state name = __XA_STATE(array,                \
                        ((index) >> (order)) << (order),        \
                        (order) - ((order) % XA_CHUNK_SHIFT),        \
                        (1U << ((order) % XA_CHUNK_SHIFT)) - 1)

/*
 * Convenience wrappers: each one forwards to the xa_*() helper of the
 * same name, applied to the array referenced by the xa_state.
 */
#define xas_marked(xas, mark)        xa_marked((xas)->xa, (mark))
#define xas_trylock(xas)        xa_trylock((xas)->xa)
#define xas_lock(xas)                xa_lock((xas)->xa)
#define xas_unlock(xas)                xa_unlock((xas)->xa)
#define xas_lock_bh(xas)        xa_lock_bh((xas)->xa)
#define xas_unlock_bh(xas)        xa_unlock_bh((xas)->xa)
#define xas_lock_irq(xas)        xa_lock_irq((xas)->xa)
#define xas_unlock_irq(xas)        xa_unlock_irq((xas)->xa)
#define xas_lock_irqsave(xas, flags) \
                                xa_lock_irqsave((xas)->xa, flags)
#define xas_unlock_irqrestore(xas, flags) \
                                xa_unlock_irqrestore((xas)->xa, flags)

/**
 * xas_error() - Fetch the errno recorded in an xa_state, if any.
 * @xas: XArray operation state.
 *
 * Return: 0 when no error has been recorded, otherwise the negative errno.
 */
static inline int xas_error(const struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        /* Errors are encoded in the node pointer; xa_err() decodes them. */
        return xa_err(node);
}

/**
 * xas_set_err() - Record an error in the xa_state.
 * @xas: XArray operation state.
 * @err: Negative error number.
 *
 * @err must be negative; zero or positive values will probably not behave
 * the way you expect.  To clear a recorded error from an xa_state, use
 * xas_reset().
 */
static inline void xas_set_err(struct xa_state *xas, long err)
{
        struct xa_node *encoded = XA_ERROR(err);

        xas->xa_node = encoded;
}

/**
 * xas_invalid() - Is the xas in a retry or error state?
 * @xas: XArray operation state.
 *
 * Return: %true if the xas cannot be used for operations.
 */
static inline bool xas_invalid(const struct xa_state *xas)
{
        unsigned long bits = (unsigned long)xas->xa_node;

        /* Either of the two low bits marks a special (non-node) value. */
        return (bits & 3) != 0;
}

/**
 * xas_valid() - Is the xas a valid cursor into the array?
 * @xas: XArray operation state.
 *
 * Return: %true if the xas can be used for operations.
 */
static inline bool xas_valid(const struct xa_state *xas)
{
        return !xas_invalid(xas);
}

/**
 * xas_is_node() - Does the xas point to a node?
 * @xas: XArray operation state.
 *
 * Return: %true if the xas currently references a node.
 */
static inline bool xas_is_node(const struct xa_state *xas)
{
        return xas_valid(xas) && xas->xa_node;
}

/* True if the pointer is something other than a node */
static inline bool xas_not_node(struct xa_node *node)
{
        return ((unsigned long)node & 3) || !node;
}

/* True if the node represents RESTART or an error (bit 1 is set) */
static inline bool xas_frozen(struct xa_node *node)
{
        unsigned long bits = (unsigned long)node;

        return (bits & 2) != 0;
}

/*
 * True if the node represents head-of-tree, RESTART or BOUNDS, i.e. the
 * pointer value does not exceed %XAS_RESTART.
 */
static inline bool xas_top(struct xa_node *node)
{
        return !(node > XAS_RESTART);
}

/**
 * xas_reset() - Reset an XArray operation state.
 * @xas: XArray operation state.
 *
 * Discards any recorded error or walk position in @xas so that the next
 * walk of the array starts again from the root.  Call this after dropping
 * the xarray lock if you intend to keep using the same xa_state.
 *
 * Context: Any context.
 */
static inline void xas_reset(struct xa_state *xas)
{
        struct xa_node *restart = XAS_RESTART;

        xas->xa_node = restart;
}

/**
 * xas_retry() - Retry the operation if appropriate.
 * @xas: XArray operation state.
 * @entry: Entry from xarray.
 *
 * The advanced functions may hand back an internal entry such as a retry
 * entry or a zero entry.  For a retry entry the @xas is reset so that the
 * walk restarts from the head of the array; for a zero entry the @xas is
 * left untouched.
 *
 * Context: Any context.
 * Return: true if the operation needs to be retried.
 */
static inline bool xas_retry(struct xa_state *xas, const void *entry)
{
        if (!xa_is_zero(entry)) {
                if (!xa_is_retry(entry))
                        return false;
                /* Retry entry: start the next walk from the top. */
                xas_reset(xas);
        }

        return true;
}

void *xas_load(struct xa_state *);
void *xas_store(struct xa_state *, void *entry);
void *xas_find(struct xa_state *, unsigned long max);
void *xas_find_conflict(struct xa_state *);

bool xas_get_mark(const struct xa_state *, xa_mark_t);
void xas_set_mark(const struct xa_state *, xa_mark_t);
void xas_clear_mark(const struct xa_state *, xa_mark_t);
void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
void xas_init_marks(const struct xa_state *);

bool xas_nomem(struct xa_state *, gfp_t);
void xas_destroy(struct xa_state *);
void xas_pause(struct xa_state *);

void xas_create_range(struct xa_state *);

/*
 * Multi-index entry helpers.  With CONFIG_XARRAY_MULTI they are provided
 * out of line; without it the inline stubs below stand in for them.
 */
#ifdef CONFIG_XARRAY_MULTI
int xa_get_order(struct xarray *, unsigned long index);
int xas_get_order(struct xa_state *xas);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
void xas_try_split(struct xa_state *xas, void *entry, unsigned int order);
unsigned int xas_try_split_min_order(unsigned int order);
#else
/* Stub: always reports order 0. */
static inline int xa_get_order(struct xarray *xa, unsigned long index)
{
        return 0;
}

/* Stub: always reports order 0. */
static inline int xas_get_order(struct xa_state *xas)
{
        return 0;
}

/* Stub: @order is ignored and @entry is simply stored at the xas position. */
static inline void xas_split(struct xa_state *xas, void *entry,
                unsigned int order)
{
        xas_store(xas, entry);
}

/* Stub: does nothing. */
static inline void xas_split_alloc(struct xa_state *xas, void *entry,
                unsigned int order, gfp_t gfp)
{
}

/* Stub: does nothing. */
static inline void xas_try_split(struct xa_state *xas, void *entry,
                unsigned int order)
{
}

/* Stub: always returns 0. */
static inline unsigned int xas_try_split_min_order(unsigned int order)
{
        return 0;
}

#endif

/**
 * xas_reload() - Refetch an entry from the xarray.
 * @xas: XArray operation state.
 *
 * Re-reads the slot that @xas refers to so the caller can confirm that a
 * previously loaded entry is still there.  The lockless pagecache lookup
 * relies on this: it walks the array under only the RCU lock, locks the
 * page, and then checks that the page has not moved in the meantime.
 *
 * The caller guarantees that @xas is still valid.  If it may be in an
 * error or restart state, call xas_load() instead.
 *
 * Return: The entry at this location in the xarray.
 */
static inline void *xas_reload(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        void *entry;
        char slot;

        /* No node: the entry of interest is the head of the array. */
        if (!node)
                return xa_head(xas->xa);

        if (!IS_ENABLED(CONFIG_XARRAY_MULTI)) {
                slot = xas->xa_offset;
                return xa_entry(xas->xa, node, slot);
        }

        /* Recompute the slot from the index rather than trusting xa_offset. */
        slot = (xas->xa_index >> node->shift) & XA_CHUNK_MASK;
        entry = xa_entry(xas->xa, node, slot);
        if (xa_is_sibling(entry)) {
                /* A sibling entry names the slot holding the real entry. */
                slot = xa_to_sibling(entry);
                entry = xa_entry(xas->xa, node, slot);
        }

        return entry;
}

/**
 * xas_set() - Set up XArray operation state for a different index.
 * @xas: XArray operation state.
 * @index: New index into the XArray.
 *
 * Point the operation state at a different index.  The walk state is
 * reset, so the next operation starts from the top; see xas_next() for
 * moving to an adjacent index instead.
 */
static inline void xas_set(struct xa_state *xas, unsigned long index)
{
        xas_reset(xas);
        xas->xa_index = index;
}

/**
 * xas_advance() - Skip over sibling entries.
 * @xas: XArray operation state.
 * @index: Index of last sibling entry.
 *
 * Move the operation state to refer to the last sibling entry.
 * This helps loops that usually want to see sibling entries but
 * occasionally want to skip them.  Use xas_set() if you want to
 * move to an index which is not part of this entry.
 */
static inline void xas_advance(struct xa_state *xas, unsigned long index)
{
        unsigned char shift = 0;

        /* Only a real node carries a shift; otherwise treat it as 0. */
        if (xas_is_node(xas))
                shift = xas->xa_node->shift;

        xas->xa_index = index;
        xas->xa_offset = (index >> shift) & XA_CHUNK_MASK;
}

/**
 * xas_set_order() - Set up XArray operation state for a multislot entry.
 * @xas: XArray operation state.
 * @index: Target of the operation.
 * @order: Entry occupies 2^@order indices.
 *
 * With CONFIG_XARRAY_MULTI the index is rounded down to a multiple of
 * 2^@order (or set to 0 when @order is at least BITS_PER_LONG) and the
 * walk state is reset.  Without it, only order 0 is accepted.
 */
static inline void xas_set_order(struct xa_state *xas, unsigned long index,
                                        unsigned int order)
{
#ifdef CONFIG_XARRAY_MULTI
        unsigned int rem = order % XA_CHUNK_SHIFT;

        /* Shifting by the full word width is undefined, so special-case it. */
        if (order < BITS_PER_LONG)
                xas->xa_index = (index >> order) << order;
        else
                xas->xa_index = 0;

        xas->xa_shift = order - rem;
        xas->xa_sibs = (1 << rem) - 1;
        xas->xa_node = XAS_RESTART;
#else
        BUG_ON(order > 0);
        xas_set(xas, index);
#endif
}

/**
 * xas_set_update() - Set up XArray operation state for a callback.
 * @xas: XArray operation state.
 * @update: Function to call when updating a node.
 *
 * The XArray can notify a caller once it has updated an xa_node.
 * This is advanced functionality, needed only by the page cache
 * and the swap cache.
 */
static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
{
        xa_update_node_t callback = update;

        xas->xa_update = callback;
}

/*
 * xas_set_lru() - Record @lru in the xa_lru field of the operation state.
 * What the XArray subsequently does with it is not visible here.
 */
static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru)
{
        xas->xa_lru = lru;
}

/**
 * xas_next_entry() - Advance iterator to next present entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * xas_next_entry() is an inline fast path for xarray traversal.  It is
 * equivalent to calling xas_find(), and hands every hard case over to
 * xas_find().
 *
 * Return: The next present entry after the one currently referred to by @xas.
 */
static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
{
        struct xa_node *node = xas->xa_node;
        void *entry;

        /* The fast path needs a bottom-level node and a consistent offset. */
        if (unlikely(xas_not_node(node) || node->shift ||
                        xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
                return xas_find(xas, max);

        for (;;) {
                if (unlikely(xas->xa_index >= max))
                        break;
                if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
                        break;
                /* Peek at the next slot before committing to the step. */
                entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
                if (unlikely(xa_is_internal(entry)))
                        break;
                xas->xa_offset++;
                xas->xa_index++;
                if (entry)
                        return entry;
        }

        /* Anything the loop could not handle goes to the slow path. */
        return xas_find(xas, max);
}

/*
 * Private.  Search the mark bitmap of the current node for the next set
 * bit, starting at the current offset (or one past it when @advance is
 * true).  Returns the offset found, or XA_CHUNK_SIZE if there is none.
 */
static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
                xa_mark_t mark)
{
        unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark];
        unsigned int offset = xas->xa_offset;
        unsigned long data;

        if (advance)
                offset++;

        /* Bitmap is not exactly one word: use the generic search. */
        if (XA_CHUNK_SIZE != BITS_PER_LONG)
                return find_next_bit(addr, XA_CHUNK_SIZE, offset);

        if (offset >= XA_CHUNK_SIZE)
                return XA_CHUNK_SIZE;

        /* Mask off the bits below @offset and find the lowest one left. */
        data = *addr & (~0UL << offset);
        if (!data)
                return XA_CHUNK_SIZE;

        return __ffs(data);
}

/**
 * xas_next_marked() - Advance iterator to next marked entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark to search for.
 *
 * xas_next_marked() is an inline fast path for xarray traversal.  It is
 * equivalent to calling xas_find_marked(), and hands every hard case
 * over to xas_find_marked().
 *
 * Return: The next marked entry after the one currently referred to by @xas.
 */
static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
                                                                xa_mark_t mark)
{
        struct xa_node *node = xas->xa_node;
        unsigned int offset;
        void *entry;

        /* The fast path only works on a bottom-level node. */
        if (unlikely(xas_not_node(node) || node->shift))
                return xas_find_marked(xas, max, mark);

        /* Move the state to the next marked slot in this node, if any. */
        offset = xas_find_chunk(xas, true, mark);
        xas->xa_offset = offset;
        xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset;
        if (xas->xa_index > max)
                return NULL;

        if (offset != XA_CHUNK_SIZE) {
                entry = xa_entry(xas->xa, node, offset);
                if (entry)
                        return entry;
        }

        /* Ran off the node, or the marked slot was empty: slow path. */
        return xas_find_marked(xas, max, mark);
}

/*
 * If iterating while holding a lock, drop the lock and reschedule
 * every %XA_CHECK_SCHED loops.  The value is an iteration count.
 */
enum {
        XA_CHECK_SCHED = 4096,
};

/**
 * xas_for_each() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 *
 * The loop body will be executed for each entry present in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry is fetched with xas_find() and each subsequent one with
 * xas_next_entry(); @xas and @max are expanded in both calls, so they
 * should be free of side effects.
 */
#define xas_for_each(xas, entry, max) \
        for (entry = xas_find(xas, max); entry; \
             entry = xas_next_entry(xas, max))

/**
 * xas_for_each_marked() - Iterate over marked entries in a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 * @mark: Mark to search for.
 *
 * The loop body will be executed for each marked entry in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry is fetched with xas_find_marked() and each subsequent
 * one with xas_next_marked(); @xas, @max and @mark are expanded in both
 * calls, so they should be free of side effects.
 */
#define xas_for_each_marked(xas, entry, max, mark) \
        for (entry = xas_find_marked(xas, max, mark); entry; \
             entry = xas_next_marked(xas, max, mark))

/**
 * xas_for_each_conflict() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 *
 * The loop body will be executed for each entry in the XArray that
 * lies within the range specified by @xas.  If the loop terminates
 * normally, @entry will be %NULL.  The user may break out of the loop,
 * which will leave @entry set to the conflicting entry.  The caller
 * may also call xas_set_err() to exit the loop while setting an error
 * to record the reason.
 */
#define xas_for_each_conflict(xas, entry) \
        while ((entry = xas_find_conflict(xas)))

void *__xas_next(struct xa_state *);
void *__xas_prev(struct xa_state *);

/**
 * xas_prev() - Move iterator to previous index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it stays in that error state and
 * this function returns %NULL.  If the @xas has never been walked, the
 * effect is that of calling xas_load().  Otherwise one is subtracted
 * from the index and the state is walked to the correct location in the
 * array for the next operation.
 *
 * If the iterator was referencing index 0, this function wraps
 * around to %ULONG_MAX.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_prev(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        /* Not positioned on a real node: let the slow path sort it out. */
        if (unlikely(xas_not_node(node)))
                return __xas_prev(xas);
        /* Not a bottom-level node, or already at its first slot. */
        if (unlikely(node->shift || xas->xa_offset == 0))
                return __xas_prev(xas);

        xas->xa_offset--;
        xas->xa_index--;
        return xa_entry(xas->xa, node, xas->xa_offset);
}

/**
 * xas_next() - Move state to next index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it stays in that error state and
 * this function returns %NULL.  If the @xas has never been walked, the
 * effect is that of calling xas_load().  Otherwise one is added to the
 * index and the state is walked to the correct location in the array
 * for the next operation.
 *
 * If the iterator was referencing index %ULONG_MAX, this function wraps
 * around to 0.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_next(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        /* Not positioned on a real node: let the slow path sort it out. */
        if (unlikely(xas_not_node(node)))
                return __xas_next(xas);
        /* Not a bottom-level node, or already at its last slot. */
        if (unlikely(node->shift || xas->xa_offset == XA_CHUNK_MASK))
                return __xas_next(xas);

        xas->xa_offset++;
        xas->xa_index++;
        return xa_entry(xas->xa, node, xas->xa_offset);
}

#endif /* _LINUX_XARRAY_H */















































    1 







































































































    1 


    1 
    1 









    1 






    1 







    1 
    1 
    1 



    1 






    1 




    1 







    1 
    1 
    1 



    1 
    1 




    1 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API.
 *
 * SHA-3, as specified in
 * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
 *
 * SHA-3 code by Jeff Garzik <jeff@garzik.org>
 *               Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */
#include <crypto/internal/hash.h>
#include <crypto/sha3.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>

/*
 * On some 32-bit architectures (h8300), GCC ends up using
 * over 1 KB of stack if we inline the round calculation into the loop
 * in keccakf(). On the other hand, on 64-bit architectures with plenty
 * of [64-bit wide] general purpose registers, not inlining it severely
 * hurts performance. So let's use 64-bitness as a heuristic to decide
 * whether to inline or not.
 */
#ifdef CONFIG_64BIT
#define SHA3_INLINE        inline
#else
#define SHA3_INLINE        noinline
#endif

#define KECCAK_ROUNDS 24

/*
 * Per-round constants.  keccakf() XORs entry [round] into st[0] after each
 * call to keccakf_round() (the Iota step).
 */
static const u64 keccakf_rndc[24] = {
        0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
        0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
        0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
        0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
        0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
        0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
        0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
        0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};

/*
 * Apply a single round to the 25-lane state, in place: the Theta, Rho Pi
 * and Chi steps.  The Iota step (XOR of the round constant into st[0]) is
 * not done here; keccakf() applies it after each call.
 */

static SHA3_INLINE void keccakf_round(u64 st[25])
{
        u64 t[5], tt, bc[5];

        /* Theta */
        /* bc[x] is the XOR of the five lanes in column x. */
        bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
        bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
        bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
        bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
        bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];

        /* t[x] combines the column before x with the rotated column after x. */
        t[0] = bc[4] ^ rol64(bc[1], 1);
        t[1] = bc[0] ^ rol64(bc[2], 1);
        t[2] = bc[1] ^ rol64(bc[3], 1);
        t[3] = bc[2] ^ rol64(bc[4], 1);
        t[4] = bc[3] ^ rol64(bc[0], 1);

        /*
         * Only st[0] is updated with its t[] term here; every other lane
         * has its term XORed in as it is read in the Rho Pi chain below.
         */
        st[0] ^= t[0];

        /* Rho Pi */
        /*
         * Each line overwrites a lane that the previous line has already
         * consumed, so the order of these statements must not change.
         * st[1] is overwritten first but needed last, hence the copy in tt.
         */
        tt = st[1];
        st[ 1] = rol64(st[ 6] ^ t[1], 44);
        st[ 6] = rol64(st[ 9] ^ t[4], 20);
        st[ 9] = rol64(st[22] ^ t[2], 61);
        st[22] = rol64(st[14] ^ t[4], 39);
        st[14] = rol64(st[20] ^ t[0], 18);
        st[20] = rol64(st[ 2] ^ t[2], 62);
        st[ 2] = rol64(st[12] ^ t[2], 43);
        st[12] = rol64(st[13] ^ t[3], 25);
        st[13] = rol64(st[19] ^ t[4],  8);
        st[19] = rol64(st[23] ^ t[3], 56);
        st[23] = rol64(st[15] ^ t[0], 41);
        st[15] = rol64(st[ 4] ^ t[4], 27);
        st[ 4] = rol64(st[24] ^ t[4], 14);
        st[24] = rol64(st[21] ^ t[1],  2);
        st[21] = rol64(st[ 8] ^ t[3], 55);
        st[ 8] = rol64(st[16] ^ t[1], 45);
        st[16] = rol64(st[ 5] ^ t[0], 36);
        st[ 5] = rol64(st[ 3] ^ t[3], 28);
        st[ 3] = rol64(st[18] ^ t[3], 21);
        st[18] = rol64(st[17] ^ t[2], 15);
        st[17] = rol64(st[11] ^ t[1], 10);
        st[11] = rol64(st[ 7] ^ t[2],  6);
        st[ 7] = rol64(st[10] ^ t[0],  3);
        st[10] = rol64(    tt ^ t[1],  1);

        /* Chi */
        /*
         * Row by row (five lanes each): every lane is XORed with
         * (~next lane & lane after next).  All five bc[] terms of a row are
         * computed before any lane of that row is modified.
         */
        bc[ 0] = ~st[ 1] & st[ 2];
        bc[ 1] = ~st[ 2] & st[ 3];
        bc[ 2] = ~st[ 3] & st[ 4];
        bc[ 3] = ~st[ 4] & st[ 0];
        bc[ 4] = ~st[ 0] & st[ 1];
        st[ 0] ^= bc[ 0];
        st[ 1] ^= bc[ 1];
        st[ 2] ^= bc[ 2];
        st[ 3] ^= bc[ 3];
        st[ 4] ^= bc[ 4];

        bc[ 0] = ~st[ 6] & st[ 7];
        bc[ 1] = ~st[ 7] & st[ 8];
        bc[ 2] = ~st[ 8] & st[ 9];
        bc[ 3] = ~st[ 9] & st[ 5];
        bc[ 4] = ~st[ 5] & st[ 6];
        st[ 5] ^= bc[ 0];
        st[ 6] ^= bc[ 1];
        st[ 7] ^= bc[ 2];
        st[ 8] ^= bc[ 3];
        st[ 9] ^= bc[ 4];

        bc[ 0] = ~st[11] & st[12];
        bc[ 1] = ~st[12] & st[13];
        bc[ 2] = ~st[13] & st[14];
        bc[ 3] = ~st[14] & st[10];
        bc[ 4] = ~st[10] & st[11];
        st[10] ^= bc[ 0];
        st[11] ^= bc[ 1];
        st[12] ^= bc[ 2];
        st[13] ^= bc[ 3];
        st[14] ^= bc[ 4];

        bc[ 0] = ~st[16] & st[17];
        bc[ 1] = ~st[17] & st[18];
        bc[ 2] = ~st[18] & st[19];
        bc[ 3] = ~st[19] & st[15];
        bc[ 4] = ~st[15] & st[16];
        st[15] ^= bc[ 0];
        st[16] ^= bc[ 1];
        st[17] ^= bc[ 2];
        st[18] ^= bc[ 3];
        st[19] ^= bc[ 4];

        bc[ 0] = ~st[21] & st[22];
        bc[ 1] = ~st[22] & st[23];
        bc[ 2] = ~st[23] & st[24];
        bc[ 3] = ~st[24] & st[20];
        bc[ 4] = ~st[20] & st[21];
        st[20] ^= bc[ 0];
        st[21] ^= bc[ 1];
        st[22] ^= bc[ 2];
        st[23] ^= bc[ 3];
        st[24] ^= bc[ 4];
}

/*
 * Run all KECCAK_ROUNDS rounds over the state in place.  Each round is
 * keccakf_round() followed by the Iota step, which XORs that round's
 * constant into lane 0.
 */
static void keccakf(u64 st[25])
{
        const u64 *rc = keccakf_rndc;
        const u64 *const end = keccakf_rndc + KECCAK_ROUNDS;

        while (rc != end) {
                keccakf_round(st);
                /* Iota */
                st[0] ^= *rc++;
        }
}

/* Begin a new hash computation by zeroing the whole state. Always returns 0. */
int crypto_sha3_init(struct shash_desc *desc)
{
        struct sha3_state *state = shash_desc_ctx(desc);

        memset(&state->st[0], 0, sizeof(state->st));

        return 0;
}
EXPORT_SYMBOL(crypto_sha3_init);

/*
 * Absorb whole blocks of input.  Each block of the transform's block size
 * is XORed into the state as little-endian 64-bit words and followed by a
 * permutation.  At least one block is always consumed.
 *
 * Returns the number of trailing bytes (less than one block) left
 * unprocessed.
 */
static int crypto_sha3_update(struct shash_desc *desc, const u8 *data,
                              unsigned int len)
{
        struct sha3_state *sctx = shash_desc_ctx(desc);
        unsigned int rsiz = crypto_shash_blocksize(desc->tfm);
        unsigned int nwords = rsiz / 8;

        do {
                const u8 *word = data;
                unsigned int w;

                for (w = 0; w < nwords; w++, word += 8)
                        sctx->st[w] ^= get_unaligned_le64(word);
                keccakf(sctx->st);

                len -= rsiz;
                data += rsiz;
        } while (len >= rsiz);

        return len;
}

/*
 * Absorb the final partial block and write out the digest.
 *
 * The remaining @len bytes of @src are copied into a zeroed block, a 0x06
 * byte is placed right after them and 0x80 is ORed into the last byte of
 * the block.  The block is XORed into the state and permuted once, then
 * the digest is emitted from the state as little-endian words.
 *
 * Always returns 0.
 */
static int crypto_sha3_finup(struct shash_desc *desc, const u8 *src,
                             unsigned int len, u8 *out)
{
        struct sha3_state *sctx = shash_desc_ctx(desc);
        unsigned int rsiz = crypto_shash_blocksize(desc->tfm);
        unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
        unsigned int nwords = rsiz / 8;
        unsigned int full = digest_size / 8;
        __le64 block[SHA3_224_BLOCK_SIZE / 8] = {};
        u8 *pad = (u8 *)block;
        unsigned int i;

        memcpy(block, src, len);
        pad[len] = 0x06;
        pad[rsiz - 1] |= 0x80;

        for (i = 0; i < nwords; i++)
                sctx->st[i] ^= le64_to_cpu(block[i]);
        /* The block held message bytes; wipe it before it goes out of scope. */
        memzero_explicit(block, sizeof(block));

        keccakf(sctx->st);

        for (i = 0; i < full; i++)
                put_unaligned_le64(sctx->st[i], out + 8 * i);

        /* A digest size that is not a multiple of 8 ends in one 32-bit word. */
        if (digest_size & 4)
                put_unaligned_le32(sctx->st[full], out + 8 * full);

        return 0;
}

/*
 * The four SHA-3 variants registered by this module.  They share the same
 * init/update/finup callbacks and state size, and differ only in digest
 * size, block size and algorithm/driver names.
 */
static struct shash_alg algs[] = { {
        .digestsize                = SHA3_224_DIGEST_SIZE,
        .init                        = crypto_sha3_init,
        .update                        = crypto_sha3_update,
        .finup                        = crypto_sha3_finup,
        .descsize                = SHA3_STATE_SIZE,
        .base.cra_name                = "sha3-224",
        .base.cra_driver_name        = "sha3-224-generic",
        .base.cra_flags                = CRYPTO_AHASH_ALG_BLOCK_ONLY,
        .base.cra_blocksize        = SHA3_224_BLOCK_SIZE,
        .base.cra_module        = THIS_MODULE,
}, {
        .digestsize                = SHA3_256_DIGEST_SIZE,
        .init                        = crypto_sha3_init,
        .update                        = crypto_sha3_update,
        .finup                        = crypto_sha3_finup,
        .descsize                = SHA3_STATE_SIZE,
        .base.cra_name                = "sha3-256",
        .base.cra_driver_name        = "sha3-256-generic",
        .base.cra_flags                = CRYPTO_AHASH_ALG_BLOCK_ONLY,
        .base.cra_blocksize        = SHA3_256_BLOCK_SIZE,
        .base.cra_module        = THIS_MODULE,
}, {
        .digestsize                = SHA3_384_DIGEST_SIZE,
        .init                        = crypto_sha3_init,
        .update                        = crypto_sha3_update,
        .finup                        = crypto_sha3_finup,
        .descsize                = SHA3_STATE_SIZE,
        .base.cra_name                = "sha3-384",
        .base.cra_driver_name        = "sha3-384-generic",
        .base.cra_flags                = CRYPTO_AHASH_ALG_BLOCK_ONLY,
        .base.cra_blocksize        = SHA3_384_BLOCK_SIZE,
        .base.cra_module        = THIS_MODULE,
}, {
        .digestsize                = SHA3_512_DIGEST_SIZE,
        .init                        = crypto_sha3_init,
        .update                        = crypto_sha3_update,
        .finup                        = crypto_sha3_finup,
        .descsize                = SHA3_STATE_SIZE,
        .base.cra_name                = "sha3-512",
        .base.cra_driver_name        = "sha3-512-generic",
        .base.cra_flags                = CRYPTO_AHASH_ALG_BLOCK_ONLY,
        .base.cra_blocksize        = SHA3_512_BLOCK_SIZE,
        .base.cra_module        = THIS_MODULE,
} };

/* Module entry point: register every entry of algs[]; returns the result of the registration call. */
static int __init sha3_generic_mod_init(void)
{
        return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}

/* Module exit point: unregister every entry of algs[]. */
static void __exit sha3_generic_mod_fini(void)
{
        crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}

module_init(sha3_generic_mod_init);
module_exit(sha3_generic_mod_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA-3 Secure Hash Algorithm");

MODULE_ALIAS_CRYPTO("sha3-224");
MODULE_ALIAS_CRYPTO("sha3-224-generic");
MODULE_ALIAS_CRYPTO("sha3-256");
MODULE_ALIAS_CRYPTO("sha3-256-generic");
MODULE_ALIAS_CRYPTO("sha3-384");
MODULE_ALIAS_CRYPTO("sha3-384-generic");
MODULE_ALIAS_CRYPTO("sha3-512");
MODULE_ALIAS_CRYPTO("sha3-512-generic");


























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Released under the GPLv2 only.
 */

#include <linux/pm.h>
#include <linux/acpi.h>

struct usb_hub_descriptor;
struct usb_dev_state;

/* Functions local to drivers/usb/core/ */

extern int usb_create_sysfs_dev_files(struct usb_device *dev);
extern void usb_remove_sysfs_dev_files(struct usb_device *dev);
extern void usb_create_sysfs_intf_files(struct usb_interface *intf);
extern void usb_remove_sysfs_intf_files(struct usb_interface *intf);
extern int usb_update_wireless_status_attr(struct usb_interface *intf);
extern int usb_create_ep_devs(struct device *parent,
                                struct usb_host_endpoint *endpoint,
                                struct usb_device *udev);
extern void usb_remove_ep_devs(struct usb_host_endpoint *endpoint);

extern void usb_enable_endpoint(struct usb_device *dev,
                struct usb_host_endpoint *ep, bool reset_toggle);
extern void usb_enable_interface(struct usb_device *dev,
                struct usb_interface *intf, bool reset_toggles);
extern void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr,
                bool reset_hardware);
extern void usb_disable_interface(struct usb_device *dev,
                struct usb_interface *intf, bool reset_hardware);
extern void usb_release_interface_cache(struct kref *ref);
extern void usb_disable_device(struct usb_device *dev, int skip_ep0);
extern int usb_deauthorize_device(struct usb_device *);
extern int usb_authorize_device(struct usb_device *);
extern void usb_deauthorize_interface(struct usb_interface *);
extern void usb_authorize_interface(struct usb_interface *);
extern void usb_detect_quirks(struct usb_device *udev);
extern void usb_detect_interface_quirks(struct usb_device *udev);
extern void usb_release_quirk_list(void);
extern bool usb_endpoint_is_ignored(struct usb_device *udev,
                struct usb_host_interface *intf,
                struct usb_endpoint_descriptor *epd);
extern int usb_remove_device(struct usb_device *udev);

extern struct usb_device_descriptor *usb_get_device_descriptor(
                struct usb_device *udev);
extern int usb_set_isoch_delay(struct usb_device *dev);
extern int usb_get_bos_descriptor(struct usb_device *dev);
extern void usb_release_bos_descriptor(struct usb_device *dev);
extern int usb_set_configuration(struct usb_device *dev, int configuration);
extern int usb_choose_configuration(struct usb_device *udev);
extern int usb_generic_driver_probe(struct usb_device *udev);
extern void usb_generic_driver_disconnect(struct usb_device *udev);
extern int usb_generic_driver_suspend(struct usb_device *udev,
                pm_message_t msg);
extern int usb_generic_driver_resume(struct usb_device *udev,
                pm_message_t msg);

/*
 * usb_get_max_power - scale a configuration's bMaxPower to its real value
 * @udev: device whose speed selects the bMaxPower unit
 * @c: configuration whose descriptor supplies bMaxPower
 *
 * bMaxPower is counted in 8 mA units at SuperSpeed and above, and in 2 mA
 * units at every lower speed.  Returns bMaxPower multiplied by that unit.
 */
static inline unsigned usb_get_max_power(struct usb_device *udev,
                struct usb_host_config *c)
{
        unsigned unit_ma = 2;

        if (udev->speed >= USB_SPEED_SUPER)
                unit_ma = 8;

        return c->desc.bMaxPower * unit_ma;
}

extern void usb_kick_hub_wq(struct usb_device *dev);
extern int usb_match_one_id_intf(struct usb_device *dev,
                                 struct usb_host_interface *intf,
                                 const struct usb_device_id *id);
extern int usb_match_device(struct usb_device *dev,
                            const struct usb_device_id *id);
extern const struct usb_device_id *usb_device_match_id(struct usb_device *udev,
                                const struct usb_device_id *id);
extern bool usb_driver_applicable(struct usb_device *udev,
                                  const struct usb_device_driver *udrv);
extern void usb_forced_unbind_intf(struct usb_interface *intf);
extern void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev);

extern void usb_hub_release_all_ports(struct usb_device *hdev,
                struct usb_dev_state *owner);
extern bool usb_device_is_owned(struct usb_device *udev);

extern int  usb_hub_init(void);
extern void usb_hub_cleanup(void);
extern int usb_major_init(void);
extern void usb_major_cleanup(void);
extern int usb_device_supports_lpm(struct usb_device *udev);
extern int usb_port_disable(struct usb_device *udev);

#ifdef        CONFIG_PM

extern int usb_suspend(struct device *dev, pm_message_t msg);
extern int usb_resume(struct device *dev, pm_message_t msg);
extern int usb_resume_complete(struct device *dev);

extern int usb_port_suspend(struct usb_device *dev, pm_message_t msg);
extern int usb_port_resume(struct usb_device *dev, pm_message_t msg);

extern void usb_autosuspend_device(struct usb_device *udev);
extern int usb_autoresume_device(struct usb_device *udev);
extern int usb_remote_wakeup(struct usb_device *dev);
extern int usb_runtime_suspend(struct device *dev);
extern int usb_runtime_resume(struct device *dev);
extern int usb_runtime_idle(struct device *dev);
extern int usb_enable_usb2_hardware_lpm(struct usb_device *udev);
extern int usb_disable_usb2_hardware_lpm(struct usb_device *udev);

extern void usbfs_notify_suspend(struct usb_device *udev);
extern void usbfs_notify_resume(struct usb_device *udev);

#else

/* !CONFIG_PM stub: ignores its arguments and reports success (0). */
static inline int usb_port_suspend(struct usb_device *udev, pm_message_t msg)
{
        return 0;
}

/* !CONFIG_PM stub: ignores its arguments and reports success (0). */
static inline int usb_port_resume(struct usb_device *udev, pm_message_t msg)
{
        return 0;
}

#define usb_autosuspend_device(udev)                do {} while (0)
/* !CONFIG_PM stub: there is nothing to resume, so always return 0. */
static inline int usb_autoresume_device(struct usb_device *udev)
{
        return 0;
}

/* !CONFIG_PM stub: does nothing and returns 0. */
static inline int usb_enable_usb2_hardware_lpm(struct usb_device *udev)
{
        return 0;
}

/* !CONFIG_PM stub: does nothing and returns 0. */
static inline int usb_disable_usb2_hardware_lpm(struct usb_device *udev)
{
        return 0;
}

#endif

extern const struct class usbmisc_class;
extern const struct bus_type usb_bus_type;
extern struct mutex usb_port_peer_mutex;
extern const struct device_type usb_device_type;
extern const struct device_type usb_if_device_type;
extern const struct device_type usb_ep_device_type;
extern const struct device_type usb_port_device_type;
extern struct usb_device_driver usb_generic_driver;

/* Returns nonzero when @dev's device type is usb_device_type. */
static inline int is_usb_device(const struct device *dev)
{
        return dev->type == &usb_device_type;
}

/* Returns nonzero when @dev's device type is usb_if_device_type. */
static inline int is_usb_interface(const struct device *dev)
{
        return dev->type == &usb_if_device_type;
}

/* Returns nonzero when @dev's device type is usb_ep_device_type. */
static inline int is_usb_endpoint(const struct device *dev)
{
        return dev->type == &usb_ep_device_type;
}

/* Returns nonzero when @dev's device type is usb_port_device_type. */
static inline int is_usb_port(const struct device *dev)
{
        return dev->type == &usb_port_device_type;
}

/* A device with no parent is a root hub: returns 1 then, 0 otherwise. */
static inline int is_root_hub(struct usb_device *udev)
{
        return !udev->parent;
}

extern bool is_usb_device_driver(const struct device_driver *drv);

/* for labeling diagnostics */
extern const char *usbcore_name;

/* sysfs stuff */
extern const struct attribute_group *usb_device_groups[];
extern const struct attribute_group *usb_interface_groups[];

/* usbfs stuff */
extern struct usb_driver usbfs_driver;
extern const struct file_operations usbfs_devices_fops;
extern const struct file_operations usbdev_file_operations;

extern int usb_devio_init(void);
extern void usb_devio_cleanup(void);

/*
 * Firmware specific cookie identifying a port's location. '0' == no location
 * data available
 */
typedef u32 usb_port_location_t;

/* internal notify stuff */
extern void usb_notify_add_device(struct usb_device *udev);
extern void usb_notify_remove_device(struct usb_device *udev);
extern void usb_notify_add_bus(struct usb_bus *ubus);
extern void usb_notify_remove_bus(struct usb_bus *ubus);
extern void usb_hub_adjust_deviceremovable(struct usb_device *hdev,
                struct usb_hub_descriptor *desc);

#ifdef CONFIG_ACPI
extern int usb_acpi_register(void);
extern void usb_acpi_unregister(void);
extern acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev,
        int port1);
#else
/*
 * !CONFIG_ACPI stubs: registration trivially succeeds (returns 0) and
 * unregistration is a no-op.  No ';' follows the function bodies, since
 * that would be an empty declaration at file scope.
 */
static inline int usb_acpi_register(void) { return 0; }
static inline void usb_acpi_unregister(void) { }
#endif









































































































































   39 









































































































































































































































































































































































































































































































































   39 





































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
// SPDX-License-Identifier: GPL-2.0
/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "watchdog: " fmt

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
#include <linux/kvm_para.h>
#include <linux/math64.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>

#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>

#include <asm/irq_regs.h>

static DEFINE_MUTEX(watchdog_mutex);

#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64)
# define WATCHDOG_HARDLOCKUP_DEFAULT        1
#else
# define WATCHDOG_HARDLOCKUP_DEFAULT        0
#endif

#define NUM_SAMPLE_PERIODS        5

unsigned long __read_mostly watchdog_enabled;
int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
static int __read_mostly watchdog_thresh_next;
static int __read_mostly watchdog_hardlockup_available;

struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);

#ifdef CONFIG_HARDLOCKUP_DETECTOR

# ifdef CONFIG_SMP
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
# endif /* CONFIG_SMP */

/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
unsigned int __read_mostly hardlockup_panic =
                        IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);

#ifdef CONFIG_SYSFS

static unsigned int hardlockup_count;

/*
 * sysfs show handler for the hardlockup_count attribute: writes the current
 * value of hardlockup_count into @page as "%u\n" and returns whatever
 * sysfs_emit() returns.  @kobj and @attr are unused.
 */
static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
                                     char *page)
{
        return sysfs_emit(page, "%u\n", hardlockup_count);
}

static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count);

/*
 * Add the hardlockup_count attribute to kernel_kobj (group name NULL).
 *
 * NOTE(review): the return value of sysfs_add_file_to_group() is ignored,
 * so this initcall always returns 0 - confirm that this is intended.
 */
static __init int kernel_hardlockup_sysfs_init(void)
{
        sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL);
        return 0;
}

late_initcall(kernel_hardlockup_sysfs_init);

#endif // CONFIG_SYSFS

/*
 * We may not want to enable hard lockup detection by default in all cases,
 * for example when running the kernel as a guest on a hypervisor. In these
 * cases this function can be called to disable hard lockup detection. This
 * function should only be executed once by the boot processor before the
 * kernel command line parameters are parsed, because otherwise it is not
 * possible to override this in hardlockup_panic_setup().
 *
 * The only effect is clearing watchdog_hardlockup_user_enabled.
 */
void __init hardlockup_detector_disable(void)
{
        watchdog_hardlockup_user_enabled = 0;
}

/*
 * Parse the value of "nmi_watchdog=", a comma-separated list of options.
 *
 * Each option is matched by prefix (strncmp), in this order:
 *   "panic"   - set hardlockup_panic
 *   "nopanic" - clear hardlockup_panic
 *   "0"       - clear watchdog_hardlockup_user_enabled
 *   "1"       - set watchdog_hardlockup_user_enabled
 *   "r"       - hand the text following the 'r' to
 *               hardlockup_config_perf_event()
 * An option that matches none of these is skipped without any diagnostic.
 *
 * Always returns 1.
 */
static int __init hardlockup_panic_setup(char *str)
{
next:
        if (!strncmp(str, "panic", 5))
                hardlockup_panic = 1;
        else if (!strncmp(str, "nopanic", 7))
                hardlockup_panic = 0;
        else if (!strncmp(str, "0", 1))
                watchdog_hardlockup_user_enabled = 0;
        else if (!strncmp(str, "1", 1))
                watchdog_hardlockup_user_enabled = 1;
        else if (!strncmp(str, "r", 1))
                hardlockup_config_perf_event(str + 1);
        /*
         * Scan forward: when a ',' is found past the current position,
         * step over it and parse the following option; stop at the NUL.
         */
        while (*(str++)) {
                if (*str == ',') {
                        str++;
                        goto next;
                }
        }
        return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);

#endif /* CONFIG_HARDLOCKUP_DETECTOR */

#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER)

static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
static unsigned long hard_lockup_nmi_warn;

/*
 * Set the watchdog_hardlockup_touched flag of the CPU this code runs on.
 * watchdog_hardlockup_check() clears the flag and skips that one check.
 */
notrace void arch_touch_nmi_watchdog(void)
{
        /*
         * Using __raw here because some code paths have
         * preemption enabled.  If preemption is enabled
         * then interrupts should be enabled too, in which
         * case we shouldn't have to worry about the watchdog
         * going off.
         */
        raw_cpu_write(watchdog_hardlockup_touched, true);
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

/*
 * Set @cpu's watchdog_hardlockup_touched flag, so that the next
 * watchdog_hardlockup_check() for that CPU clears it and returns early.
 */
void watchdog_hardlockup_touch_cpu(unsigned int cpu)
{
        per_cpu(watchdog_hardlockup_touched, cpu) = true;
}

/*
 * Report whether @cpu's hrtimer interrupt count has stood still since the
 * previous call.  When the count has moved, the new value is remembered for
 * the next comparison and false is returned.
 */
static bool is_hardlockup(unsigned int cpu)
{
        int *last_seen = &per_cpu(hrtimer_interrupts_saved, cpu);
        int current_count = atomic_read(&per_cpu(hrtimer_interrupts, cpu));

        if (*last_seen != current_count) {
                /*
                 * NOTE: hrtimer_interrupts_saved needs no atomic_t or
                 * READ_ONCE/WRITE_ONCE; it is written/read by a single CPU.
                 */
                *last_seen = current_count;
                return false;
        }

        return true;
}

static void watchdog_hardlockup_kick(void)
{
        int new_interrupts;

        new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts));
        watchdog_buddy_check_hardlockup(new_interrupts);
}

/*
 * watchdog_hardlockup_check - test @cpu for a hard lockup and report it
 * @cpu: CPU to examine; may differ from the CPU running this code
 * @regs: register state; when reporting on the local CPU, NULL selects
 *        dump_stack() instead of show_regs()
 *
 * A lockup is reported once per episode: watchdog_hardlockup_warned is set
 * after a report and cleared again as soon as @cpu is seen making progress.
 */
void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
{
        unsigned int this_cpu;
        unsigned long flags;

        /* A touch since the previous check cancels this check. */
        if (per_cpu(watchdog_hardlockup_touched, cpu)) {
                per_cpu(watchdog_hardlockup_touched, cpu) = false;
                return;
        }

        /*
         * The CPU's timer interrupt count should have advanced several
         * times since the last check.  If it did, the CPU is alive and a
         * later lockup may be reported again.
         */
        if (!is_hardlockup(cpu)) {
                per_cpu(watchdog_hardlockup_warned, cpu) = false;
                return;
        }

        /* The count did not move: a good indication the CPU is stuck. */
        this_cpu = smp_processor_id();

#ifdef CONFIG_SYSFS
        ++hardlockup_count;
#endif

        /* Only print hardlockups once. */
        if (per_cpu(watchdog_hardlockup_warned, cpu))
                return;

        /*
         * Prevent multiple hard-lockup reports if one cpu is already
         * engaged in dumping all cpu back traces.
         */
        if (sysctl_hardlockup_all_cpu_backtrace &&
            test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
                return;

        /*
         * NOTE: printk_cpu_sync_get_irqsave() is deliberately taken only
         * after the lockup message.  Serializing that line too would be
         * nice, but if another CPU locked up while holding the associated
         * lock we still want at least this message to get out.
         */
        pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu);
        printk_cpu_sync_get_irqsave(flags);

        print_modules();
        print_irqtrace_events(current);
        if (cpu != this_cpu) {
                printk_cpu_sync_put_irqrestore(flags);
                trigger_single_cpu_backtrace(cpu);
        } else {
                if (regs)
                        show_regs(regs);
                else
                        dump_stack();
                printk_cpu_sync_put_irqrestore(flags);
        }

        if (sysctl_hardlockup_all_cpu_backtrace) {
                trigger_allbutcpu_cpu_backtrace(cpu);
                if (!hardlockup_panic)
                        clear_bit_unlock(0, &hard_lockup_nmi_warn);
        }

        if (hardlockup_panic)
                nmi_panic(regs, "Hard LOCKUP");

        per_cpu(watchdog_hardlockup_warned, cpu) = true;
}

#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */

/* No-op when CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER is not set. */
static inline void watchdog_hardlockup_kick(void) { }

#endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */

/*
 * These functions can be overridden based on the configured hardlockup detector.
 *
 * watchdog_hardlockup_enable/disable can be implemented to start and stop when
 * softlockup watchdog start and stop. The detector must select the
 * SOFTLOCKUP_DETECTOR Kconfig.
 *
 * The __weak default below does nothing.
 */
void __weak watchdog_hardlockup_enable(unsigned int cpu) { }

/* __weak default: does nothing; a hardlockup detector may override it. */
void __weak watchdog_hardlockup_disable(unsigned int cpu) { }

/*
 * Watchdog-detector specific API.
 *
 * Return 0 when hardlockup watchdog is available, negative value otherwise.
 * Note that the negative value means that a delayed probe might
 * succeed later.
 *
 * The __weak default below always returns -ENODEV.
 */
int __weak __init watchdog_hardlockup_probe(void)
{
        return -ENODEV;
}

/**
 * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration
 *
 * The reconfiguration steps are:
 * watchdog_hardlockup_stop();
 * update_variables();
 * watchdog_hardlockup_start();
 *
 * This __weak default does nothing.
 */
void __weak watchdog_hardlockup_stop(void) { }

/**
 * watchdog_hardlockup_start - Start the watchdog after reconfiguration
 *
 * Counterpart to watchdog_hardlockup_stop().
 *
 * The following variables have been updated in update_variables() and
 * contain the currently valid configuration:
 * - watchdog_enabled
 * - watchdog_thresh
 * - watchdog_cpumask
 *
 * This __weak default does nothing.
 */
void __weak watchdog_hardlockup_start(void) { }

/**
 * lockup_detector_update_enable - Update the sysctl enable bit
 *
 * Rebuilds watchdog_enabled from scratch: it is cleared first, and the
 * hard/soft lockup bits are only set again when watchdog_user_enabled is
 * nonzero.  The hardlockup bit additionally requires the detector to be
 * available.
 *
 * Caller needs to make sure that the hard watchdogs are off, so this
 * can't race with watchdog_hardlockup_disable().
 */
static void lockup_detector_update_enable(void)
{
        watchdog_enabled = 0;

        if (watchdog_user_enabled) {
                if (watchdog_hardlockup_user_enabled && watchdog_hardlockup_available)
                        watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED;

                if (watchdog_softlockup_user_enabled)
                        watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED;
        }
}

#ifdef CONFIG_SOFTLOCKUP_DETECTOR

/*
 * Delay the softlockup report when running known slow code.
 * It does _not_ affect the timestamp of the last successful reschedule.
 */
#define SOFTLOCKUP_DELAY_REPORT        ULONG_MAX

#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#endif

static struct cpumask watchdog_allowed_mask __read_mostly;

/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
                        IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);

static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;

#ifdef CONFIG_SYSFS

static unsigned int softlockup_count;

/*
 * sysfs show handler for the softlockup_count attribute: writes the current
 * value of softlockup_count into @page as "%u\n" and returns whatever
 * sysfs_emit() returns.  @kobj and @attr are unused.
 */
static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
                                     char *page)
{
        return sysfs_emit(page, "%u\n", softlockup_count);
}

static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count);

/*
 * Add the softlockup_count attribute to kernel_kobj (group name NULL).
 *
 * NOTE(review): the return value of sysfs_add_file_to_group() is ignored,
 * so this initcall always returns 0 - confirm that this is intended.
 */
static __init int kernel_softlockup_sysfs_init(void)
{
        sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL);
        return 0;
}

late_initcall(kernel_softlockup_sysfs_init);

#endif // CONFIG_SYSFS

/* Timestamp taken after the last successful reschedule. */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
/* Timestamp of the last softlockup report. */
static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static unsigned long soft_lockup_nmi_warn;

/*
 * Handler for "softlockup_panic=": converts @str with simple_strtoul()
 * (base argument 0) and stores the result in softlockup_panic.
 * Always returns 1.
 */
static int __init softlockup_panic_setup(char *str)
{
        softlockup_panic = simple_strtoul(str, NULL, 0);
        return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

/*
 * Handler for "nowatchdog": clears watchdog_user_enabled.  @str is unused.
 * Always returns 1.
 */
static int __init nowatchdog_setup(char *str)
{
        watchdog_user_enabled = 0;
        return 1;
}
__setup("nowatchdog", nowatchdog_setup);

/*
 * Handler for "nosoftlockup": clears watchdog_softlockup_user_enabled.
 * @str is unused.  Always returns 1.
 */
static int __init nosoftlockup_setup(char *str)
{
        watchdog_softlockup_user_enabled = 0;
        return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);

/*
 * Handler for "watchdog_thresh=": lets get_option() parse @str into
 * watchdog_thresh.  The result of get_option() is not checked.
 * Always returns 1.
 */
static int __init watchdog_thresh_setup(char *str)
{
        get_option(&str, &watchdog_thresh);
        return 1;
}
__setup("watchdog_thresh=", watchdog_thresh_setup);

#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
enum stats_per_group {
        STATS_SYSTEM,
        STATS_SOFTIRQ,
        STATS_HARDIRQ,
        STATS_IDLE,
        NUM_STATS_PER_GROUP,
};

static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
        CPUTIME_SYSTEM,
        CPUTIME_SOFTIRQ,
        CPUTIME_IRQ,
        CPUTIME_IDLE,
};

static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
static DEFINE_PER_CPU(u8, cpustat_tail);

/*
 * Nanosecond resolution is not needed here.  Counting in units of 2^24 ns
 * (~16.8 ms) is precise enough and lets cpustats be stored in a u16, which
 * rolls over roughly every ~1000 seconds.
 */
static u16 get_16bit_precision(u64 data_ns)
{
        /* Add half a unit (2^23 ns) first so the shift rounds to nearest. */
        u64 biased_ns = data_ns + (1 << 23);

        return biased_ns >> 24;
}

/*
 * Record, for each tracked cpustat group, the percentage of the last sample
 * period this CPU spent in it.  The percentages go into the cpustat_util
 * ring slot selected by cpustat_tail, the raw 16-bit readings are remembered
 * in cpustat_old, and cpustat_tail then advances by one slot.
 */
static void update_cpustat(void)
{
        int i;
        u8 util;
        unsigned int percent;
        u16 old_stat, new_stat, delta;
        struct kernel_cpustat kcpustat;
        u64 *cpustat = kcpustat.cpustat;
        u8 tail = __this_cpu_read(cpustat_tail);
        u16 sample_period_16 = get_16bit_precision(sample_period);

        kcpustat_cpu_fetch(&kcpustat, smp_processor_id());

        for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
                old_stat = __this_cpu_read(cpustat_old[i]);
                new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
                /*
                 * The 16-bit readings wrap around, so take the difference
                 * modulo 2^16.  Subtracting after integer promotion would
                 * yield a negative delta whenever new_stat has rolled over.
                 */
                delta = new_stat - old_stat;
                percent = DIV_ROUND_UP(100U * delta, sample_period_16);
                /*
                 * Since we use 16-bit precision, the raw data will undergo
                 * integer division, which may sometimes result in data loss,
                 * and then result might exceed 100%. To avoid confusion,
                 * we enforce a 100% display cap when calculations exceed this threshold.
                 *
                 * The cap is applied before narrowing to u8, so that values
                 * of 256 and above cannot wrap to something below 100.
                 */
                if (percent > 100)
                        percent = 100;
                util = percent;
                __this_cpu_write(cpustat_util[tail][i], util);
                __this_cpu_write(cpustat_old[i], new_stat);
        }

        __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
}

static void print_cpustat(void)
{
        int i, group;
        u8 tail = __this_cpu_read(cpustat_tail);
        u64 sample_period_msecond = sample_period;

        do_div(sample_period_msecond, NSEC_PER_MSEC);

        /*
         * Outputting the "watchdog" prefix on every line is redundant and not
         * concise, and the original alarm information is sufficient for
         * positioning in logs, hence here printk() is used instead of pr_crit().
         */
        printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n",
               smp_processor_id(), sample_period_msecond);

        for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
                group = (tail + i) % NUM_SAMPLE_PERIODS;
                printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
                        "%3u%% hardirq,\t%3u%% idle\n", i + 1,
                        __this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
                        __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
                        __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
                        __this_cpu_read(cpustat_util[group][STATS_IDLE]));
        }
}

#define HARDIRQ_PERCENT_THRESH          50
#define NUM_HARDIRQ_REPORT              5
struct irq_counts {
        int irq;
        u32 counts;
};

static DEFINE_PER_CPU(bool, snapshot_taken);

/*
 * Tabulate the most frequent interrupts.
 *
 * Walks the first @rank entries of @irq_counts.  At every entry whose count
 * is smaller than @counts, the entry is swapped with the record being
 * carried, which starts out as {@irq, @counts}.  Whatever record is carried
 * at the end of the walk is discarded.
 */
static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
{
        struct irq_counts carried = {irq, counts};
        int pos;

        for (pos = 0; pos < rank; pos++) {
                if (irq_counts[pos].counts >= counts)
                        continue;

                swap(carried, irq_counts[pos]);
        }
}

/*
 * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
 * then the cause of softlockup might be interrupt storm. In this case, it
 * would be useful to start interrupt counting.
 */
static bool need_counting_irqs(void)
{
        u8 util;
        int tail = __this_cpu_read(cpustat_tail);

        tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT;
        util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
        return util > HARDIRQ_PERCENT_THRESH;
}

/*
 * Take an interrupt-count snapshot via kstat_snapshot_irqs() unless this
 * CPU's snapshot_taken flag is already set, then set the flag.
 */
static void start_counting_irqs(void)
{
        if (__this_cpu_read(snapshot_taken))
                return;

        kstat_snapshot_irqs();
        __this_cpu_write(snapshot_taken, true);
}

/*
 * Clear this CPU's snapshot_taken flag, so that the next
 * start_counting_irqs() takes a fresh snapshot.
 */
static void stop_counting_irqs(void)
{
        __this_cpu_write(snapshot_taken, false);
}

/*
 * Print the NUM_HARDIRQ_REPORT interrupts that fired most often since the
 * snapshot taken by start_counting_irqs().  Does nothing when this CPU has
 * no snapshot.
 */
static void print_irq_counts(void)
{
        unsigned int i, count;
        struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT];

        if (!__this_cpu_read(snapshot_taken))
                return;

        /*
         * Mark every slot unused (irq == -1).  A loop keeps the sentinels
         * in step with NUM_HARDIRQ_REPORT, unlike a hand-written list of
         * initializers.
         */
        for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
                irq_counts_sorted[i].irq = -1;
                irq_counts_sorted[i].counts = 0;
        }

        for_each_active_irq(i) {
                count = kstat_get_irq_since_snapshot(i);
                tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
        }

        /*
         * Outputting the "watchdog" prefix on every line is redundant and not
         * concise, and the original alarm information is sufficient for
         * positioning in logs, hence here printk() is used instead of pr_crit().
         */
        printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
               smp_processor_id(), HARDIRQ_PERCENT_THRESH);

        for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
                /* The first unused slot ends the list. */
                if (irq_counts_sorted[i].irq == -1)
                        break;

                printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
                       i + 1, irq_counts_sorted[i].counts,
                       irq_counts_sorted[i].irq);
        }

        /*
         * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
         * sample_period, then we suspect the interrupt storm might be subsiding.
         */
        if (!need_counting_irqs())
                stop_counting_irqs();
}

/* Print the CPU utilization samples followed by the interrupt counts. */
static void report_cpu_status(void)
{
        print_cpustat();
        print_irq_counts();
}
#else
/*
 * No-op stand-ins used when CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM is not
 * set; need_counting_irqs() then always reports false.
 */
static inline void update_cpustat(void) { }
static inline void report_cpu_status(void) { }
static inline bool need_counting_irqs(void) { return false; }
static inline void start_counting_irqs(void) { }
static inline void stop_counting_irqs(void) { }
#endif

/*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
 * lockups can have false positives under extreme conditions. So we generally
 * want a higher threshold for soft lockups than for hard lockups. So we couple
 * the thresholds with a factor: we make the soft threshold twice the amount of
 * time the hard threshold is.
 *
 * Returns watchdog_thresh * 2.
 */
static int get_softlockup_thresh(void)
{
        return watchdog_thresh * 2;
}

/*
 * Returns seconds, approximately.  We don't need nanosecond
 * resolution, and we don't need to waste time with a big divide when
 * 2^30ns == 1.074s.
 *
 * The value is running_clock() shifted right by 30 bits.
 */
static unsigned long get_timestamp(void)
{
        return running_clock() >> 30LL;  /* 2^30 ~= 10^9 */
}

static void set_sample_period(void)
{
        /*
         * convert watchdog_thresh from seconds to ns
         * the divide by 5 is to give hrtimer several chances (two
         * or three with the current relation between the soft
         * and hard thresholds) to increment before the
         * hardlockup detector generates a warning
         */
        sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
        watchdog_update_hrtimer_threshold(sample_period);
}

/* Restart this CPU's soft lockup report period at the current time. */
static void update_report_ts(void)
{
        unsigned long stamp = get_timestamp();

        __this_cpu_write(watchdog_report_ts, stamp);
}

/* Commands for resetting the watchdog */
/*
 * Record "now" as the last time this CPU's watchdog was fed and restart
 * the report period as well.
 */
static void update_touch_ts(void)
{
        unsigned long stamp = get_timestamp();

        __this_cpu_write(watchdog_touch_ts, stamp);
        update_report_ts();
}

/**
 * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
 *
 * Call when the scheduler may have stalled for legitimate reasons
 * preventing the watchdog task from executing - e.g. the scheduler
 * entering idle state.  This should only be used for scheduler events.
 * Use touch_softlockup_watchdog() for everything else.
 */
notrace void touch_softlockup_watchdog_sched(void)
{
        /*
         * Preemption can be enabled.  It doesn't matter which CPU's watchdog
         * report period gets restarted here, so use the raw_ operation.
         *
         * Storing SOFTLOCKUP_DELAY_REPORT makes the next watchdog_timer_fn()
         * run on that CPU restart the report period instead of checking
         * for a soft lockup.
         */
        raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}

/*
 * Restart the soft lockup report period on whichever CPU this runs on and
 * additionally call wq_watchdog_touch() for that CPU. Exported so that
 * modules can use it.
 */
notrace void touch_softlockup_watchdog(void)
{
        touch_softlockup_watchdog_sched();
        wq_watchdog_touch(raw_smp_processor_id());
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

/*
 * Restart the soft lockup report period on every CPU in
 * watchdog_allowed_mask and call wq_watchdog_touch() for each of them.
 */
void touch_all_softlockup_watchdogs(void)
{
        int cpu;

        /*
         * watchdog_mutex cannot be taken here, as this might be called
         * from (soft)interrupt context, so the access to
         * watchdog_allowed_mask might race with a concurrent update.
         *
         * The watchdog time stamp can race against a concurrent real
         * update as well, the only side effect might be a cycle delay for
         * the softlockup check.
         */
        for_each_cpu(cpu, &watchdog_allowed_mask) {
                per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT;
                wq_watchdog_touch(cpu);
        }
}

/*
 * Restart the soft lockup report period on the current CPU and also set
 * softlockup_touch_sync, which makes the next watchdog_timer_fn() run on
 * this CPU call sched_clock_tick() before restarting the report period.
 */
void touch_softlockup_watchdog_sync(void)
{
        __this_cpu_write(softlockup_touch_sync, true);
        __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}

/*
 * Decide whether the current CPU looks soft-locked.
 *
 * @touch_ts:  timestamp of the last time the watchdog was fed
 * @period_ts: start of the current report period
 * @now:       current timestamp, same units as the other two
 *
 * Returns 0 when the soft lockup detector is disabled or the report
 * period has not yet exceeded the soft-lockup threshold; otherwise
 * returns the time elapsed since @touch_ts.
 */
static int is_softlockup(unsigned long touch_ts,
                         unsigned long period_ts,
                         unsigned long now)
{
        /* Nothing to check while the detector is off or has no threshold. */
        if (!(watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) || !watchdog_thresh)
                return 0;

        /*
         * If period_ts has not been updated during a sample_period, then
         * in the subsequent few sample_periods, period_ts might also not
         * be updated, which could indicate a potential softlockup. In
         * this case, if we suspect the cause of the potential softlockup
         * might be interrupt storm, then we need to count the interrupts
         * to find which interrupt is storming.
         */
        if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS)) {
                if (need_counting_irqs())
                        start_counting_irqs();
        }

        /*
         * A poorly behaving BPF scheduler can live-lock the system into
         * soft lockups. Tell sched_ext to try ejecting the BPF
         * scheduler when close to a soft lockup.
         */
        if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
                scx_softlockup(now - touch_ts);

        /* Still within the threshold: no unreasonable delay to warn about. */
        if (!time_after(now, period_ts + get_softlockup_thresh()))
                return 0;

        return now - touch_ts;
}

/* watchdog detector functions */
/*
 * Per-CPU completion: completed by softlockup_fn(), checked by
 * watchdog_timer_fn() before queueing the next run, and waited on by
 * watchdog_disable().
 */
static DEFINE_PER_CPU(struct completion, softlockup_completion);
/* Per-CPU work item handed to stop_one_cpu_nowait() to run softlockup_fn(). */
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);

/*
 * The watchdog feed function - touches the timestamp.
 *
 * watchdog_timer_fn() queues it on the local CPU at most once per
 * sample_period (4 seconds by default) to reset the softlockup
 * timestamp. If this gets delayed for more than 2*watchdog_thresh
 * seconds then the debug-printout triggers in watchdog_timer_fn().
 *
 * @data is unused. Always returns 0.
 */
static int softlockup_fn(void *data)
{
        update_touch_ts();
        stop_counting_irqs();
        /* Tell watchdog_timer_fn() that the next run may be queued. */
        complete(this_cpu_ptr(&softlockup_completion));

        return 0;
}

/* watchdog kicker functions */
/*
 * Per-CPU hrtimer callback driving both detectors.
 *
 * On every run it kicks the hardlockup detector, queues softlockup_fn()
 * on the local CPU (unless the previous run has not completed yet),
 * re-arms itself one sample_period ahead and then checks whether the
 * report period has exceeded the soft-lockup threshold. If so, a
 * "BUG: soft lockup" report is printed and, when softlockup_panic is set,
 * the kernel panics.
 *
 * Returns HRTIMER_NORESTART when the watchdog is disabled or a panic is
 * in progress, HRTIMER_RESTART otherwise.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
        unsigned long touch_ts, period_ts, now;
        struct pt_regs *regs = get_irq_regs();
        int duration;
        /* Snapshot the sysctl so both checks below see the same value. */
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
        unsigned long flags;

        if (!watchdog_enabled)
                return HRTIMER_NORESTART;

        /*
         * pass the buddy check if a panic is in process
         */
        if (panic_in_progress())
                return HRTIMER_NORESTART;

        watchdog_hardlockup_kick();

        /* kick the softlockup detector */
        if (completion_done(this_cpu_ptr(&softlockup_completion))) {
                reinit_completion(this_cpu_ptr(&softlockup_completion));
                stop_one_cpu_nowait(smp_processor_id(),
                                softlockup_fn, NULL,
                                this_cpu_ptr(&softlockup_stop_work));
        }

        /* .. and repeat */
        hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));

        /*
         * Read the current timestamp first. It might become invalid anytime
         * when a virtual machine is stopped by the host or when the watchdog
         * is touched from NMI.
         */
        now = get_timestamp();
        /*
         * If a virtual machine is stopped by the host it can look to
         * the watchdog like a soft lockup. This function touches the watchdog.
         */
        kvm_check_and_clear_guest_paused();
        /*
         * The stored timestamp is comparable with @now only when not touched.
         * It might get touched anytime from NMI. Make sure that is_softlockup()
         * uses the same (valid) value.
         */
        period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));

        update_cpustat();

        /* Reset the interval when touched by known problematic code. */
        if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
                if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
                        /*
                         * If the time stamp was touched atomically
                         * make sure the scheduler tick is up to date.
                         */
                        __this_cpu_write(softlockup_touch_sync, false);
                        sched_clock_tick();
                }

                update_report_ts();
                return HRTIMER_RESTART;
        }

        /* Check for a softlockup. */
        touch_ts = __this_cpu_read(watchdog_touch_ts);
        duration = is_softlockup(touch_ts, period_ts, now);
        if (unlikely(duration)) {
#ifdef CONFIG_SYSFS
                ++softlockup_count;
#endif

                /*
                 * Prevent multiple soft-lockup reports if one cpu is already
                 * engaged in dumping all cpu back traces.
                 */
                if (softlockup_all_cpu_backtrace) {
                        if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn))
                                return HRTIMER_RESTART;
                }

                /* Start period for the next softlockup warning. */
                update_report_ts();

                /* Keep the whole report together in the log. */
                printk_cpu_sync_get_irqsave(flags);
                pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                        smp_processor_id(), duration,
                        current->comm, task_pid_nr(current));
                report_cpu_status();
                print_modules();
                print_irqtrace_events(current);
                /* Prefer the interrupted context's registers when available. */
                if (regs)
                        show_regs(regs);
                else
                        dump_stack();
                printk_cpu_sync_put_irqrestore(flags);

                if (softlockup_all_cpu_backtrace) {
                        trigger_allbutcpu_cpu_backtrace(smp_processor_id());
                        /* When about to panic, keep the bit set so no other CPU reports. */
                        if (!softlockup_panic)
                                clear_bit_unlock(0, &soft_lockup_nmi_warn);
                }

                add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
                if (softlockup_panic)
                        panic("softlockup: hung tasks");
        }

        return HRTIMER_RESTART;
}

/*
 * Start the watchdog on the current CPU.
 *
 * @cpu: must be the CPU this function runs on; a mismatch triggers a
 *       one-time warning.
 */
static void watchdog_enable(unsigned int cpu)
{
        struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
        struct completion *done = this_cpu_ptr(&softlockup_completion);

        WARN_ON_ONCE(cpu != smp_processor_id());

        /*
         * Start with the completion in the "done" state so that the first
         * watchdog_timer_fn() run is allowed to queue softlockup_fn().
         */
        init_completion(done);
        complete(done);

        /*
         * Start the timer first to prevent the hardlockup watchdog triggering
         * before the timer has a chance to fire.
         */
        hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hrtimer_start(hrtimer, ns_to_ktime(sample_period),
                      HRTIMER_MODE_REL_PINNED_HARD);

        /* Initialize timestamp */
        update_touch_ts();
        /* Enable the hardlockup detector */
        if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)
                watchdog_hardlockup_enable(cpu);
}

/*
 * Stop the watchdog on the current CPU.
 *
 * @cpu: must be the CPU this function runs on; a mismatch triggers a
 *       one-time warning.
 */
static void watchdog_disable(unsigned int cpu)
{
        struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

        WARN_ON_ONCE(cpu != smp_processor_id());

        /*
         * Disable the hardlockup detector first. That prevents that a large
         * delay between disabling the timer and disabling the hardlockup
         * detector causes a false positive.
         */
        watchdog_hardlockup_disable(cpu);
        hrtimer_cancel(hrtimer);
        /* Wait until an already queued softlockup_fn() has completed. */
        wait_for_completion(this_cpu_ptr(&softlockup_completion));
}

/*
 * smp_call_on_cpu() callback: stop the watchdog on the CPU it runs on.
 * @data is unused. Always returns 0.
 */
static int softlockup_stop_fn(void *data)
{
        unsigned int this_cpu = smp_processor_id();

        watchdog_disable(this_cpu);
        return 0;
}

static void softlockup_stop_all(void)
{
        int cpu;

        if (!softlockup_initialized)
                return;

        for_each_cpu(cpu, &watchdog_allowed_mask)
                smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);

        cpumask_clear(&watchdog_allowed_mask);
}

/*
 * smp_call_on_cpu() callback: start the watchdog on the CPU it runs on.
 * @data is unused. Always returns 0.
 */
static int softlockup_start_fn(void *data)
{
        unsigned int this_cpu = smp_processor_id();

        watchdog_enable(this_cpu);
        return 0;
}

static void softlockup_start_all(void)
{
        int cpu;

        cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
        for_each_cpu(cpu, &watchdog_allowed_mask)
                smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
}

/*
 * Start the watchdog on @cpu if that CPU is part of watchdog_allowed_mask.
 * Always returns 0.
 */
int lockup_detector_online_cpu(unsigned int cpu)
{
        if (!cpumask_test_cpu(cpu, &watchdog_allowed_mask))
                return 0;

        watchdog_enable(cpu);
        return 0;
}

/*
 * Stop the watchdog on @cpu if that CPU is part of watchdog_allowed_mask.
 * Always returns 0.
 */
int lockup_detector_offline_cpu(unsigned int cpu)
{
        if (!cpumask_test_cpu(cpu, &watchdog_allowed_mask))
                return 0;

        watchdog_disable(cpu);
        return 0;
}

/*
 * Stop both detectors, apply the pending configuration and restart them.
 *
 * @thresh_changed: when true, watchdog_thresh_next is copied into
 *                  watchdog_thresh before the sample period is recomputed.
 *
 * All callers in this file hold watchdog_mutex. The CPU hotplug read lock
 * is held for the whole stop/start sequence.
 */
static void __lockup_detector_reconfigure(bool thresh_changed)
{
        cpus_read_lock();
        watchdog_hardlockup_stop();

        softlockup_stop_all();
        /*
         * To prevent watchdog_timer_fn from using the old interval and
         * the new watchdog_thresh at the same time, which could lead to
         * false softlockup reports, it is necessary to update the
         * watchdog_thresh only after the softlockup detector has been
         * stopped on all CPUs.
         */
        if (thresh_changed)
                watchdog_thresh = READ_ONCE(watchdog_thresh_next);
        set_sample_period();
        lockup_detector_update_enable();
        /* Only restart the per-CPU timers when something is enabled. */
        if (watchdog_enabled && watchdog_thresh)
                softlockup_start_all();

        watchdog_hardlockup_start();
        cpus_read_unlock();
}

/*
 * Re-apply the current configuration to the detectors without changing
 * watchdog_thresh. Takes watchdog_mutex around the reconfiguration.
 */
void lockup_detector_reconfigure(void)
{
        mutex_lock(&watchdog_mutex);
        __lockup_detector_reconfigure(false);
        mutex_unlock(&watchdog_mutex);
}

/*
 * Create the watchdog infrastructure and configure the detector(s).
 */
static __init void lockup_detector_setup(void)
{
        lockup_detector_update_enable();

        /*
         * If sysctl is off and watchdog got disabled on the command line,
         * nothing to do here. Otherwise run the initial configuration and
         * mark the softlockup side as initialized.
         */
        if (IS_ENABLED(CONFIG_SYSCTL) || (watchdog_enabled && watchdog_thresh)) {
                mutex_lock(&watchdog_mutex);
                __lockup_detector_reconfigure(false);
                softlockup_initialized = true;
                mutex_unlock(&watchdog_mutex);
        }
}

#else /* CONFIG_SOFTLOCKUP_DETECTOR */
/*
 * Variant used without CONFIG_SOFTLOCKUP_DETECTOR: stop the hardlockup
 * detector, apply the pending configuration and start it again.
 *
 * @thresh_changed: when true, watchdog_thresh_next is copied into
 *                  watchdog_thresh while the detector is stopped.
 */
static void __lockup_detector_reconfigure(bool thresh_changed)
{
        cpus_read_lock();
        watchdog_hardlockup_stop();
        if (thresh_changed)
                watchdog_thresh = READ_ONCE(watchdog_thresh_next);
        lockup_detector_update_enable();
        watchdog_hardlockup_start();
        cpus_read_unlock();
}
/*
 * Re-apply the current configuration without changing watchdog_thresh.
 * Unlike the CONFIG_SOFTLOCKUP_DETECTOR variant, this one does not take
 * watchdog_mutex itself.
 */
void lockup_detector_reconfigure(void)
{
        __lockup_detector_reconfigure(false);
}
/* Initial configuration: just run one reconfiguration pass. */
static inline void lockup_detector_setup(void)
{
        __lockup_detector_reconfigure(false);
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */

/**
 * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
 *
 * Special interface for parisc. It prevents lockup detector warnings from
 * the default pm_poweroff() function which busy loops forever.
 */
void lockup_detector_soft_poweroff(void)
{
        /*
         * With CONFIG_SOFTLOCKUP_DETECTOR, watchdog_timer_fn() stops
         * re-arming itself once it sees watchdog_enabled == 0.
         */
        watchdog_enabled = 0;
}

#ifdef CONFIG_SYSCTL

/*
 * Propagate any changes to the watchdog infrastructure.
 *
 * @thresh_changed: passed through to __lockup_detector_reconfigure(); true
 *                  when watchdog_thresh_next holds a new threshold.
 *
 * Called by the sysctl handlers below with watchdog_mutex held.
 */
static void proc_watchdog_update(bool thresh_changed)
{
        /* Remove impossible cpus to keep sysctl output clean. */
        cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
        __lockup_detector_reconfigure(thresh_changed);
}

/*
 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
 *
 * caller             | table->data points to            | 'which'
 * -------------------|----------------------------------|-------------------------------
 * proc_watchdog      | watchdog_user_enabled            | WATCHDOG_HARDLOCKUP_ENABLED |
 *                    |                                  | WATCHDOG_SOFTOCKUP_ENABLED
 * -------------------|----------------------------------|-------------------------------
 * proc_nmi_watchdog  | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED
 * -------------------|----------------------------------|-------------------------------
 * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED
 */
static int proc_watchdog_common(int which, const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int *user_val = table->data;
        int saved, ret;

        mutex_lock(&watchdog_mutex);

        saved = *user_val;
        if (write) {
                ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
                /* Reconfigure only if the write succeeded and changed the value. */
                if (!ret && saved != READ_ONCE(*user_val))
                        proc_watchdog_update(false);
        } else {
                /*
                 * On read synchronize the userspace interface. This is a
                 * racy snapshot: temporarily expose the effective state
                 * selected by @which, then put the user setting back.
                 */
                *user_val = (watchdog_enabled & which) != 0;
                ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
                *user_val = saved;
        }

        mutex_unlock(&watchdog_mutex);
        return ret;
}

/*
 * /proc/sys/kernel/watchdog
 *
 * Covers both the hardlockup and the softlockup detector.
 */
static int proc_watchdog(const struct ctl_table *table, int write,
                         void *buffer, size_t *lenp, loff_t *ppos)
{
        int both = WATCHDOG_HARDLOCKUP_ENABLED | WATCHDOG_SOFTOCKUP_ENABLED;

        return proc_watchdog_common(both, table, write, buffer, lenp, ppos);
}

/*
 * /proc/sys/kernel/nmi_watchdog
 *
 * Reads are always served. Writes are rejected with -ENOTSUPP while no
 * hardlockup detector is available.
 *
 * NOTE(review): -ENOTSUPP is a kernel-internal error code; -EOPNOTSUPP is
 * the one userspace knows. Left unchanged here because switching it would
 * alter the errno seen by existing users - confirm before changing.
 */
static int proc_nmi_watchdog(const struct ctl_table *table, int write,
                             void *buffer, size_t *lenp, loff_t *ppos)
{
        if (write && !watchdog_hardlockup_available)
                return -ENOTSUPP;

        return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED,
                                    table, write, buffer, lenp, ppos);
}

#ifdef CONFIG_SOFTLOCKUP_DETECTOR
/*
 * /proc/sys/kernel/soft_watchdog
 *
 * Thin wrapper around proc_watchdog_common() that selects only the
 * softlockup detector.
 */
static int proc_soft_watchdog(const struct ctl_table *table, int write,
                              void *buffer, size_t *lenp, loff_t *ppos)
{
        return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
                                    table, write, buffer, lenp, ppos);
}
#endif

/*
 * /proc/sys/kernel/watchdog_thresh
 *
 * The sysctl entry operates on watchdog_thresh_next. It is seeded from the
 * live watchdog_thresh first, so a read reports the current threshold and
 * a write can be compared against it; the live value is only replaced
 * during the reconfiguration triggered below.
 */
static int proc_watchdog_thresh(const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int prev, ret;

        mutex_lock(&watchdog_mutex);

        watchdog_thresh_next = READ_ONCE(watchdog_thresh);
        prev = watchdog_thresh_next;

        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /* Reconfigure only after a successful write that changed the value. */
        if (!ret && write) {
                if (prev != READ_ONCE(watchdog_thresh_next))
                        proc_watchdog_update(true);
        }

        mutex_unlock(&watchdog_mutex);
        return ret;
}

/*
 * The cpumask is the mask of possible cpus that the watchdog can run
 * on, not the mask of cpus it is actually running on.  This allows the
 * user to specify a mask that will include cpus that have not yet
 * been brought online, if desired.
 */
static int proc_watchdog_cpumask(const struct ctl_table *table, int write,
                                 void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;

        mutex_lock(&watchdog_mutex);

        ret = proc_do_large_bitmap(table, write, buffer, lenp, ppos);

        /* Every successful write triggers a reconfiguration. */
        if (write && !ret)
                proc_watchdog_update(false);

        mutex_unlock(&watchdog_mutex);
        return ret;
}

/* Upper bound accepted for watchdog_thresh. */
static const int sixty = 60;

/*
 * Sysctl entries registered under "kernel" by watchdog_sysctl_init().
 * Entries that go through proc_dointvec_minmax() (directly or via the
 * handlers above) are limited to the range given by extra1..extra2.
 */
static const struct ctl_table watchdog_sysctls[] = {
        /* Master switch for both detectors, 0 or 1. */
        {
                .procname       = "watchdog",
                .data                = &watchdog_user_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler   = proc_watchdog,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        /* Threshold, 0..60; staged in watchdog_thresh_next. */
        {
                .procname        = "watchdog_thresh",
                .data                = &watchdog_thresh_next,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_watchdog_thresh,
                .extra1                = SYSCTL_ZERO,
                .extra2                = (void *)&sixty,
        },
        /* CPUs the watchdog may run on, handled as a large bitmap. */
        {
                .procname        = "watchdog_cpumask",
                .data                = &watchdog_cpumask_bits,
                .maxlen                = NR_CPUS,
                .mode                = 0644,
                .proc_handler        = proc_watchdog_cpumask,
        },
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
        /* Switch for the softlockup detector only, 0 or 1. */
        {
                .procname       = "soft_watchdog",
                .data                = &watchdog_softlockup_user_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler   = proc_soft_watchdog,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        /* When 1, watchdog_timer_fn() panics after reporting a soft lockup. */
        {
                .procname        = "softlockup_panic",
                .data                = &softlockup_panic,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#ifdef CONFIG_SMP
        /* When 1, a soft lockup report also backtraces all other CPUs. */
        {
                .procname        = "softlockup_all_cpu_backtrace",
                .data                = &sysctl_softlockup_all_cpu_backtrace,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#endif /* CONFIG_SMP */
#endif
#ifdef CONFIG_HARDLOCKUP_DETECTOR
        /* 0 or 1; consumed outside this part of the file. */
        {
                .procname        = "hardlockup_panic",
                .data                = &hardlockup_panic,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#ifdef CONFIG_SMP
        /* 0 or 1; consumed outside this part of the file. */
        {
                .procname        = "hardlockup_all_cpu_backtrace",
                .data                = &sysctl_hardlockup_all_cpu_backtrace,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#endif /* CONFIG_SMP */
#endif
};

/*
 * Kept separate from watchdog_sysctls and non-const because
 * watchdog_sysctl_init() raises the mode from read-only (0444) to 0644
 * when a hardlockup detector is available.
 */
static struct ctl_table watchdog_hardlockup_sysctl[] = {
        {
                .procname       = "nmi_watchdog",
                .data                = &watchdog_hardlockup_user_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler   = proc_nmi_watchdog,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/*
 * Register both sysctl tables under "kernel". The nmi_watchdog entry is
 * made writable first if a hardlockup detector is available.
 */
static void __init watchdog_sysctl_init(void)
{
        register_sysctl_init("kernel", watchdog_sysctls);

        /* Must happen before the table is registered below. */
        if (watchdog_hardlockup_available)
                watchdog_hardlockup_sysctl[0].mode = 0644;
        register_sysctl_init("kernel", watchdog_hardlockup_sysctl);
}

#else
/* Without CONFIG_SYSCTL there are no sysctl entries to register. */
#define watchdog_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static void __init lockup_detector_delay_init(struct work_struct *work);
/*
 * Set by lockup_detector_init() when the first hardlockup probe failed;
 * cleared once a delayed probe succeeded or by lockup_detector_check().
 */
static bool allow_lockup_detector_init_retry __initdata;

/* Work item that runs lockup_detector_delay_init(). */
static struct work_struct detector_work __initdata =
                __WORK_INITIALIZER(detector_work, lockup_detector_delay_init);

/*
 * Work function for detector_work: probe the hardlockup detector again.
 * On success the detector is marked available, further retries are
 * disabled and the lockup detector is set up; on failure the reason is
 * logged and the hard watchdog stays disabled.
 *
 * @work: unused.
 */
static void __init lockup_detector_delay_init(struct work_struct *work)
{
        int probe_err = watchdog_hardlockup_probe();

        if (!probe_err) {
                allow_lockup_detector_init_retry = false;

                watchdog_hardlockup_available = true;
                lockup_detector_setup();
                return;
        }

        if (probe_err != -ENODEV)
                pr_info("Delayed init of the lockup detector failed: %d\n", probe_err);
        else
                pr_info("NMI not fully supported\n");
        pr_info("Hard watchdog permanently disabled\n");
}

/*
 * lockup_detector_retry_init - retry init lockup detector if possible.
 *
 * Schedules another hardlockup detector probe. Useful on platforms where
 * the detector depends on functionality that only becomes available
 * later during boot. Does nothing unless a retry is still allowed.
 */
void __init lockup_detector_retry_init(void)
{
        /* Must be called before late init calls */
        if (allow_lockup_detector_init_retry)
                schedule_work(&detector_work);
}

/*
 * Ensure that the optional delayed hardlockup init has finished before
 * the init code and memory are freed. Also registers the sysctl entries.
 * Always returns 0.
 */
static int __init lockup_detector_check(void)
{
        /* Prevent any later retry. */
        allow_lockup_detector_init_retry = false;

        /* Make sure no work is pending. */
        flush_work(&detector_work);

        /* Register the sysctl entries (a no-op without CONFIG_SYSCTL). */
        watchdog_sysctl_init();

        return 0;

}
late_initcall_sync(lockup_detector_check);

/*
 * Boot-time entry point: seed watchdog_cpumask from the timer housekeeping
 * CPUs, probe the hardlockup detector (allowing a later retry if the probe
 * fails) and set up the lockup detector.
 */
void __init lockup_detector_init(void)
{
        int probe_err;

        if (tick_nohz_full_enabled())
                pr_info("Disabling watchdog on nohz_full cores by default\n");

        cpumask_copy(&watchdog_cpumask,
                     housekeeping_cpumask(HK_TYPE_TIMER));

        probe_err = watchdog_hardlockup_probe();
        if (probe_err)
                allow_lockup_detector_init_retry = true;
        else
                watchdog_hardlockup_available = true;

        lockup_detector_setup();
}














































































































































































































































































































































































































































































































































































































































































































   39 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  NOHZ implementation for low and high resolution timers
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 */
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/profile.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
#include <linux/sched/loadavg.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

#include <trace/events/timer.h>

/*
 * Per-CPU nohz control structure
 */
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

/*
 * Look up the nohz control structure that belongs to @cpu.
 */
struct tick_sched *tick_get_tick_sched(int cpu)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        return ts;
}

/*
 * The time when the last jiffy update happened. Write access must hold
 * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
 * consistent view of jiffies and last_jiffies_update.
 */
static ktime_t last_jiffies_update;

/*
 * Advance jiffies_64 to account for all tick periods that have elapsed
 * up to @now, and keep 'last_jiffies_update' and 'tick_next_period' in
 * sync with it. Afterwards the global load is recalculated and the wall
 * time is updated.
 *
 * @now: current time; nothing is done while it is still before
 *       'tick_next_period'.
 *
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
        /* At least one period has elapsed once the checks below pass */
        unsigned long ticks = 1;
        ktime_t delta, nextp;

        /*
         * 64-bit can do a quick check without holding the jiffies lock and
         * without looking at the sequence count. The smp_load_acquire()
         * pairs with the update done later in this function.
         *
         * 32-bit cannot do that because the store of 'tick_next_period'
         * consists of two 32-bit stores, and the first store could be
         * moved by the CPU to a random point in the future.
         */
        if (IS_ENABLED(CONFIG_64BIT)) {
                if (ktime_before(now, smp_load_acquire(&tick_next_period)))
                        return;
        } else {
                unsigned int seq;

                /*
                 * Avoid contention on 'jiffies_lock' and protect the quick
                 * check with the sequence count.
                 */
                do {
                        seq = read_seqcount_begin(&jiffies_seq);
                        nextp = tick_next_period;
                } while (read_seqcount_retry(&jiffies_seq, seq));

                if (ktime_before(now, nextp))
                        return;
        }

        /* Quick check failed, i.e. update is required. */
        raw_spin_lock(&jiffies_lock);
        /*
         * Re-evaluate with the lock held. Another CPU might have done the
         * update already.
         */
        if (ktime_before(now, tick_next_period)) {
                raw_spin_unlock(&jiffies_lock);
                return;
        }

        write_seqcount_begin(&jiffies_seq);

        /* How far @now is past the start of the next expected period */
        delta = ktime_sub(now, tick_next_period);
        if (unlikely(delta >= TICK_NSEC)) {
                /* Slow path for long idle sleep times */
                s64 incr = TICK_NSEC;

                /* Account the additional whole periods contained in 'delta' */
                ticks += ktime_divns(delta, incr);

                last_jiffies_update = ktime_add_ns(last_jiffies_update,
                                                   incr * ticks);
        } else {
                last_jiffies_update = ktime_add_ns(last_jiffies_update,
                                                   TICK_NSEC);
        }

        /* Advance jiffies to complete the 'jiffies_seq' protected job */
        jiffies_64 += ticks;

        /* Keep the tick_next_period variable up to date */
        nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);

        if (IS_ENABLED(CONFIG_64BIT)) {
                /*
                 * Pairs with smp_load_acquire() in the lockless quick
                 * check above, and ensures that the update to 'jiffies_64' is
                 * not reordered vs. the store to 'tick_next_period', neither
                 * by the compiler nor by the CPU.
                 */
                smp_store_release(&tick_next_period, nextp);
        } else {
                /*
                 * A plain store is good enough on 32-bit, as the quick check
                 * above is protected by the sequence count.
                 */
                tick_next_period = nextp;
        }

        /*
         * Release the sequence count. calc_global_load() below is not
         * protected by it, but 'jiffies_lock' needs to be held to prevent
         * concurrent invocations.
         */
        write_seqcount_end(&jiffies_seq);

        calc_global_load();

        raw_spin_unlock(&jiffies_lock);
        /* Invoked only after 'jiffies_lock' has been dropped */
        update_wall_time();
}

/*
 * Initialize and retrieve the jiffies update.
 */
static ktime_t tick_init_jiffy_update(void)
{
        ktime_t base;
        u32 misalign;

        raw_spin_lock(&jiffies_lock);
        write_seqcount_begin(&jiffies_seq);

        /* A zero 'last_jiffies_update' means the update was not started yet */
        if (!last_jiffies_update) {
                /*
                 * Round 'tick_next_period' up to the next multiple of
                 * TICK_NSEC so that the tick is aligned.
                 */
                div_u64_rem(tick_next_period, TICK_NSEC, &misalign);
                if (misalign != 0)
                        tick_next_period += TICK_NSEC - misalign;

                last_jiffies_update = tick_next_period;
        }
        base = last_jiffies_update;

        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);

        return base;
}

/*
 * Report whether any bit of @flag is set in @ts->flags.
 * Returns 1 if so, 0 otherwise.
 */
static inline int tick_sched_flag_test(struct tick_sched *ts,
                                       unsigned long flag)
{
        return (ts->flags & flag) ? 1 : 0;
}

/*
 * Set the bits of @flag in @ts->flags. Interrupts are expected to be
 * disabled, which lockdep verifies.
 */
static inline void tick_sched_flag_set(struct tick_sched *ts,
                                       unsigned long flag)
{
        lockdep_assert_irqs_disabled();
        ts->flags = ts->flags | flag;
}

/*
 * Clear the bits of @flag in @ts->flags. Interrupts are expected to be
 * disabled, which lockdep verifies.
 */
static inline void tick_sched_flag_clear(struct tick_sched *ts,
                                         unsigned long flag)
{
        lockdep_assert_irqs_disabled();
        ts->flags = ts->flags & ~flag;
}

/*
 * Number of consecutive tick_sched_do_timer() invocations on a CPU that may
 * observe an unchanged jiffies value before that CPU forces a jiffies update
 * itself.
 */
#define MAX_STALLED_JIFFIES 5

/*
 * Timekeeping part of the tick: take over the do_timer duty if nobody owns
 * it, update jiffies if this CPU is in charge, detect stalled jiffies and
 * note that a tick arrived while in idle.
 *
 * @ts:  tick_sched instance whose stall counters and flags are updated
 * @now: current time, handed on to tick_do_update_jiffies64()
 */
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
        int tick_cpu, cpu = smp_processor_id();

        /*
         * Check if the do_timer duty was dropped. We don't care about
         * concurrency: This happens only when the CPU in charge went
         * into a long sleep. If two CPUs happen to assign themselves to
         * this duty, then the jiffies update is still serialized by
         * 'jiffies_lock'.
         *
         * If nohz_full is enabled, this should not happen because the
         * 'tick_do_timer_cpu' CPU never relinquishes.
         */
        tick_cpu = READ_ONCE(tick_do_timer_cpu);

        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
                WARN_ON_ONCE(tick_nohz_full_running);
#endif
                /* Nobody owns the duty: claim it for this CPU */
                WRITE_ONCE(tick_do_timer_cpu, cpu);
                tick_cpu = cpu;
        }

        /* Check if jiffies need an update */
        if (tick_cpu == cpu)
                tick_do_update_jiffies64(now);

        /*
         * If the jiffies update stalled for too long (timekeeper in stop_machine()
         * or VMEXIT'ed for several msecs), force an update.
         */
        if (ts->last_tick_jiffies != jiffies) {
                /* jiffies moved since the previous tick: reset the stall counter */
                ts->stalled_jiffies = 0;
                ts->last_tick_jiffies = READ_ONCE(jiffies);
        } else {
                if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
                        tick_do_update_jiffies64(now);
                        ts->stalled_jiffies = 0;
                        ts->last_tick_jiffies = READ_ONCE(jiffies);
                }
        }

        /* Record that a tick was received while TS_FLAG_INIDLE is set */
        if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
                ts->got_idle_tick = 1;
}

static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
        /*
         * When we are idle and the tick is stopped, we have to touch
         * the watchdog as we might not schedule for a really long
         * time. This happens on completely idle SMP systems while
         * waiting on the login prompt. We also increment the "start of
         * idle" jiffy stamp so the idle accounting adjustment we do
         * when we go busy again does not account too many ticks.
         */
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
            tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                touch_softlockup_watchdog_sched();
                if (is_idle_task(current))
                        ts->idle_jiffies++;
                /*
                 * In case the current tick fired too early past its expected
                 * expiration, make sure we don't bypass the next clock reprogramming
                 * to the same deadline.
                 */
                ts->next_tick = 0;
        }

        update_process_times(user_mode(regs));
        profile_tick(CPU_PROFILING);
}

/*
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled.
 */
static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
{
        struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer);
        struct pt_regs *irq_regs = get_irq_regs();
        ktime_t now = ktime_get();

        tick_sched_do_timer(ts, now);

        /*
         * Without a valid register set we are not in IRQ context: skip the
         * tick handling and only reset the cached deadline.
         */
        if (!irq_regs)
                ts->next_tick = 0;
        else
                tick_sched_handle(ts, irq_regs);

        /*
         * With the tick stopped (dynticks mode) the reprogramming is
         * deferred:
         * - to the idle task if in dynticks-idle
         * - to IRQ exit if in full-dynticks.
         */
        if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
                return HRTIMER_NORESTART;

        /* Still ticking: move the expiry forward and ask for a rearm */
        hrtimer_forward(timer, now, TICK_NSEC);
        return HRTIMER_RESTART;
}

#ifdef CONFIG_NO_HZ_FULL
/* CPUs running in full dynticks mode; filled in by tick_nohz_full_setup() */
cpumask_var_t tick_nohz_full_mask;
EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
/*
 * Set by tick_nohz_full_setup(); cleared again by tick_nohz_init() when the
 * architecture lacks IRQ work self-interrupts.
 */
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
/*
 * Global tick dependency bits: set via tick_nohz_dep_set(), cleared via
 * tick_nohz_dep_clear() and evaluated in can_stop_full_tick().
 */
static atomic_t tick_dep_mask;

/*
 * Check whether any known tick dependency bit is set in @dep.
 *
 * The bits are tested in a fixed order and only the first one found is
 * reported through the tick_stop tracepoint.
 *
 * Returns true if a dependency is set, false otherwise.
 */
static bool check_tick_dependency(atomic_t *dep)
{
        int pending = atomic_read(dep);
        int reason;

        if (pending & TICK_DEP_MASK_POSIX_TIMER)
                reason = TICK_DEP_MASK_POSIX_TIMER;
        else if (pending & TICK_DEP_MASK_PERF_EVENTS)
                reason = TICK_DEP_MASK_PERF_EVENTS;
        else if (pending & TICK_DEP_MASK_SCHED)
                reason = TICK_DEP_MASK_SCHED;
        else if (pending & TICK_DEP_MASK_CLOCK_UNSTABLE)
                reason = TICK_DEP_MASK_CLOCK_UNSTABLE;
        else if (pending & TICK_DEP_MASK_RCU)
                reason = TICK_DEP_MASK_RCU;
        else if (pending & TICK_DEP_MASK_RCU_EXP)
                reason = TICK_DEP_MASK_RCU_EXP;
        else
                return false;

        trace_tick_stop(0, reason);
        return true;
}

/*
 * Decide whether the tick may be stopped on @cpu.
 *
 * @cpu: CPU to check, must be online for the tick to be stoppable
 * @ts:  tick_sched instance carrying the per-CPU dependency mask
 *
 * Returns false if @cpu is offline or any tick dependency is set.
 */
static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
        lockdep_assert_irqs_disabled();

        if (unlikely(!cpu_online(cpu)))
                return false;

        /*
         * The global, per-CPU, per-task and per-process dependency masks
         * are checked in that order; evaluation stops at the first mask
         * that has a dependency set.
         */
        return !(check_tick_dependency(&tick_dep_mask) ||
                 check_tick_dependency(&ts->tick_dep_mask) ||
                 check_tick_dependency(&current->tick_dep_mask) ||
                 check_tick_dependency(&current->signal->tick_dep_mask));
}

/*
 * IRQ work callback used for the nohz_full kick. The callback itself does
 * nothing; @work is unused.
 */
static void nohz_full_kick_func(struct irq_work *work)
{
        /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}

/*
 * Per-CPU IRQ work item that is queued by tick_nohz_full_kick() and
 * tick_nohz_full_kick_cpu().
 */
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
        IRQ_WORK_INIT_HARD(nohz_full_kick_func);

/*
 * Kick this CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
 * is NMI safe.
 */
static void tick_nohz_full_kick(void)
{
        /* Only full dynticks CPUs get the kick work queued */
        if (tick_nohz_full_cpu(smp_processor_id()))
                irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}

/*
 * Kick the CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 */
void tick_nohz_full_kick_cpu(int cpu)
{
        struct irq_work *kick;

        /* CPUs which are not full dynticks are left alone */
        if (!tick_nohz_full_cpu(cpu))
                return;

        kick = &per_cpu(nohz_full_kick_work, cpu);
        irq_work_queue_on(kick, cpu);
}

/*
 * Kick the CPU that @tsk is assigned to, so that it re-evaluates its tick
 * dependency. Nothing is done when @tsk is not on a runqueue or when its
 * CPU is not online.
 */
static void tick_nohz_kick_task(struct task_struct *tsk)
{
        int cpu;

        /*
         * If the task is not running, run_posix_cpu_timers()
         * has nothing to elapse, and an IPI can then be optimized out.
         *
         * activate_task()                      STORE p->tick_dep_mask
         *   STORE p->on_rq
         * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
         *   LOCK rq->lock                      LOAD p->on_rq
         *   smp_mb__after_spin_lock()
         *   tick_nohz_task_switch()
         *     LOAD p->tick_dep_mask
         *
         * XXX given a task picks up the dependency on schedule(), should we
         * only care about tasks that are currently on the CPU instead of all
         * that are on the runqueue?
         *
         * That is, does this want to be: task_on_cpu() / task_curr()?
         */
        if (!sched_task_on_rq(tsk))
                return;

        /*
         * If the task concurrently migrates to another CPU,
         * we guarantee it sees the new tick dependency upon
         * schedule.
         *
         * set_task_cpu(p, cpu);
         *   STORE p->cpu = @cpu
         * __schedule() (switch to task 'p')
         *   LOCK rq->lock
         *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
         *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
         *      LOAD p->tick_dep_mask           LOAD p->cpu
         */
        cpu = task_cpu(tsk);

        /* The online check and the kick are done with preemption disabled */
        preempt_disable();
        if (cpu_online(cpu))
                tick_nohz_full_kick_cpu(cpu);
        preempt_enable();
}

/*
 * Kick all full dynticks CPUs in order to force these to re-evaluate
 * their dependency on the tick and restart it if necessary.
 */
static void tick_nohz_full_kick_all(void)
{
        int cpu;

        if (!tick_nohz_full_running)
                return;

        preempt_disable();
        for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
                tick_nohz_full_kick_cpu(cpu);
        preempt_enable();
}

/*
 * Set dependency @bit in @dep and kick all full dynticks CPUs if @dep had
 * no dependency set before.
 */
static void tick_nohz_dep_set_all(atomic_t *dep,
                                  enum tick_dep_bits bit)
{
        if (atomic_fetch_or(BIT(bit), dep) == 0)
                tick_nohz_full_kick_all();
}

/*
 * Set a global tick dependency. Used by perf events that rely on freq and
 * unstable clocks.
 */
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
        tick_nohz_dep_set_all(&tick_dep_mask, bit);
}

void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &tick_dep_mask);
}

/*
 * Set per-CPU tick dependency. Used by scheduler and perf events in order to
 * manage event-throttling.
 */
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        /* A kick is only needed when the mask had no dependency set before */
        if (atomic_fetch_or(BIT(bit), &ts->tick_dep_mask))
                return;

        preempt_disable();
        if (cpu == smp_processor_id()) {
                /* Perf needs local kick that is NMI safe */
                tick_nohz_full_kick();
        } else if (!WARN_ON_ONCE(in_nmi())) {
                /* Remote IRQ work not NMI-safe */
                tick_nohz_full_kick_cpu(cpu);
        }
        preempt_enable();
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);

void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);

/*
 * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
 * in order to elapse per task timers.
 */
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
        int prev = atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask);

        /* The task is only kicked when it had no dependency set before */
        if (prev)
                return;

        tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);

void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);

/*
 * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
 * per process timers.
 */
void tick_nohz_dep_set_signal(struct task_struct *tsk,
                              enum tick_dep_bits bit)
{
        int prev;
        struct signal_struct *sig = tsk->signal;

        prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
        if (!prev) {
                struct task_struct *t;

                lockdep_assert_held(&tsk->sighand->siglock);
                __for_each_thread(sig, t)
                        tick_nohz_kick_task(t);
        }
}

void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}

/*
 * Re-evaluate the need for the tick as we switch the current task.
 * It might need the tick due to per task/process properties:
 * perf events, posix CPU timers, ...
 */
void __tick_nohz_task_switch(void)
{
        struct tick_sched *ts;

        if (!tick_nohz_full_cpu(smp_processor_id()))
                return;

        ts = this_cpu_ptr(&tick_cpu_sched);

        /* A tick that is still running needs no kick */
        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                return;

        /* Neither the task nor its process depends on the tick */
        if (!atomic_read(&current->tick_dep_mask) &&
            !atomic_read(&current->signal->tick_dep_mask))
                return;

        tick_nohz_full_kick();
}

/*
 * Get the boot-time nohz CPU list from the kernel parameters.
 *
 * @cpumask: CPUs to copy into 'tick_nohz_full_mask'
 *
 * Allocates 'tick_nohz_full_mask' with alloc_bootmem_cpumask_var(), copies
 * @cpumask into it and marks full dynticks as running.
 */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
        alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
        cpumask_copy(tick_nohz_full_mask, cpumask);
        tick_nohz_full_running = true;
}

/*
 * Tell whether @cpu may be taken offline.
 *
 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound timers,
 * workqueues, timekeeping, ...) on behalf of full dynticks CPUs, so it
 * must remain online when nohz full is enabled.
 *
 * Returns false only for that CPU while nohz full is running.
 */
bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
        return !tick_nohz_full_running || READ_ONCE(tick_do_timer_cpu) != cpu;
}

/*
 * CPU hotplug callback: refuse with -EBUSY to take down a CPU that is not
 * hotpluggable, return 0 otherwise.
 */
static int tick_nohz_cpu_down(unsigned int cpu)
{
        if (!tick_nohz_cpu_hotpluggable(cpu))
                return -EBUSY;

        return 0;
}

/*
 * Boot-time setup of full dynticks: validate the architecture support,
 * adjust the nohz_full mask, enable user context tracking on the remaining
 * CPUs and register the CPU hotplug callback.
 */
void __init tick_nohz_init(void)
{
        int cpu, err;

        /* Nothing to do unless tick_nohz_full_setup() enabled full dynticks */
        if (!tick_nohz_full_running)
                return;

        /*
         * Full dynticks uses IRQ work to drive the tick rescheduling on safe
         * locking contexts. But then we need IRQ work to raise its own
         * interrupts to avoid circular dependency on the tick.
         */
        if (!arch_irq_work_has_interrupt()) {
                pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
                cpumask_clear(tick_nohz_full_mask);
                tick_nohz_full_running = false;
                return;
        }

        /* In this configuration the current CPU is kept out of the range */
        if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
            !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
                int this_cpu = smp_processor_id();

                if (cpumask_test_cpu(this_cpu, tick_nohz_full_mask)) {
                        pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
                                this_cpu);
                        cpumask_clear_cpu(this_cpu, tick_nohz_full_mask);
                }
        }

        for_each_cpu(cpu, tick_nohz_full_mask) {
                ct_cpu_track_user(cpu);
        }

        err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                        "kernel/nohz:predown", NULL,
                                        tick_nohz_cpu_down);
        WARN_ON(err < 0);

        pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
                cpumask_pr_args(tick_nohz_full_mask));
}
#endif /* #ifdef CONFIG_NO_HZ_FULL */

/*
 * NOHZ - aka dynamic tick functionality
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * NO HZ enabled ? Defaults to true and can be overridden with the "nohz="
 * boot parameter.
 */
bool tick_nohz_enabled __read_mostly  = true;
/* While this is zero, get_cpu_sleep_time_us() reports -1 */
unsigned long tick_nohz_active  __read_mostly;
/*
 * Enable / Disable tickless mode
 *
 * Parses the boolean value of the "nohz=" boot parameter into
 * 'tick_nohz_enabled'. Returns 1 if @str was parsed successfully,
 * 0 otherwise.
 */
static int __init setup_tick_nohz(char *str)
{
        int err = kstrtobool(str, &tick_nohz_enabled);

        return !err;
}

__setup("nohz=", setup_tick_nohz);

bool tick_nohz_tick_stopped(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}

bool tick_nohz_tick_stopped_cpu(int cpu)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}

/**
 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 * @now: current ktime_t
 *
 * Called from interrupt entry when the CPU was idle
 *
 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 * value. We do this unconditionally on any CPU, as we don't know whether the
 * CPU, which has the update task assigned, is in a long sleep.
 */
static void tick_nohz_update_jiffies(ktime_t now)
{
        unsigned long irqflags;

        /* Stamp the idle wake time of this CPU */
        __this_cpu_write(tick_cpu_sched.idle_waketime, now);

        /* tick_do_update_jiffies64() must run with interrupts disabled */
        local_irq_save(irqflags);
        tick_do_update_jiffies64(now);
        local_irq_restore(irqflags);

        touch_softlockup_watchdog_sched();
}

/*
 * Close the current idle period of @ts at @now and add its length to
 * either the iowait or the idle sleep time.
 */
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
        ktime_t slept;
        ktime_t *acct;

        /* Stopping idle accounting which was never started is a bug */
        if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
                return;

        slept = ktime_sub(now, ts->idle_entrytime);

        write_seqcount_begin(&ts->idle_sleeptime_seq);

        /* Pending iowait tasks on this CPU turn the period into iowait time */
        if (nr_iowait_cpu(smp_processor_id()) > 0)
                acct = &ts->iowait_sleeptime;
        else
                acct = &ts->idle_sleeptime;
        *acct = ktime_add(*acct, slept);

        ts->idle_entrytime = now;
        tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);

        write_seqcount_end(&ts->idle_sleeptime_seq);

        sched_clock_idle_wakeup_event();
}

/*
 * Open an idle period for @ts: record the idle entry time and set
 * TS_FLAG_IDLE_ACTIVE, both inside the 'idle_sleeptime_seq' write section,
 * then call sched_clock_idle_sleep_event().
 */
static void tick_nohz_start_idle(struct tick_sched *ts)
{
        write_seqcount_begin(&ts->idle_sleeptime_seq);
        /* The timestamp is taken inside the write section */
        ts->idle_entrytime = ktime_get();
        tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
        write_seqcount_end(&ts->idle_sleeptime_seq);

        sched_clock_idle_sleep_event();
}

/*
 * Read one of the sleep time counters of @ts in microseconds.
 *
 * @ts:               tick_sched instance to read from
 * @sleeptime:        counter to read
 * @compute_delta:    add the still running idle period to the result
 * @last_update_time: if not NULL, receives the current time in microseconds
 *
 * Returns -1 when 'tick_nohz_active' is not set.
 */
static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
                                 bool compute_delta, u64 *last_update_time)
{
        ktime_t now, total;
        unsigned int seq;

        if (!tick_nohz_active)
                return -1;

        now = ktime_get();
        if (last_update_time)
                *last_update_time = ktime_to_us(now);

        /* Retry until a consistent snapshot of counter and flags is read */
        do {
                seq = read_seqcount_begin(&ts->idle_sleeptime_seq);

                total = *sleeptime;
                /* An idle period in progress is accounted up to 'now' */
                if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta)
                        total = ktime_add(total,
                                          ktime_sub(now, ts->idle_entrytime));
        } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));

        return ktime_to_us(total);
}

/**
 * get_cpu_idle_time_us - get the total idle time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative idle time (since boot) for a given
 * CPU, in microseconds. Note that this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
 */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
        /* A running idle period only counts as idle without pending iowait */
        bool count_current = !nr_iowait_cpu(cpu);

        return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
                                     count_current, last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

/**
 * get_cpu_iowait_time_us - get the total iowait time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds. Note this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
 */
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
        /* A running idle period only counts as iowait with pending iowait */
        bool count_current = nr_iowait_cpu(cpu) != 0;

        return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
                                     count_current, last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

/*
 * Rearm the tick of @ts so that it expires after @now, starting from the
 * last recorded tick.
 */
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
        struct hrtimer *tick = &ts->sched_timer;

        hrtimer_cancel(tick);
        hrtimer_set_expires(tick, ts->last_tick);

        /* Forward the time to expire in the future */
        hrtimer_forward(tick, now, TICK_NSEC);

        if (!tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
                tick_program_event(hrtimer_get_expires(tick), 1);
        else
                hrtimer_start_expires(tick, HRTIMER_MODE_ABS_PINNED_HARD);

        /*
         * Drop the cached clock deadline, so the next tick stop doesn't get
         * fooled by a stale value.
         */
        ts->next_tick = 0;
}

/*
 * Tell whether the TIMER_SOFTIRQ bit is set in local_timers_pending().
 */
static inline bool local_timer_softirq_pending(void)
{
        return (local_timers_pending() & BIT(TIMER_SOFTIRQ)) != 0;
}

/*
 * Read jiffies and the time when jiffies were updated last
 */
u64 get_jiffies_update(unsigned long *basej)
{
        unsigned long jiff_snap;
        u64 update_snap;
        unsigned int seq;

        /* Retry until both values stem from the same jiffies update */
        do {
                seq = read_seqcount_begin(&jiffies_seq);
                update_snap = last_jiffies_update;
                jiff_snap = jiffies;
        } while (read_seqcount_retry(&jiffies_seq, seq));

        *basej = jiff_snap;

        return update_snap;
}

/**
 * tick_nohz_next_event() - return the clock monotonic based next event
 * @ts:                pointer to tick_sched struct
 * @cpu:        CPU number
 *
 * Return:
 * *%0                - When the next event is a maximum of TICK_NSEC in the future
 *                  and the tick is not stopped yet
 * *%next_event        - Next event based on clock monotonic
 */
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
        u64 basemono, next_tick, delta, expires;
        unsigned long basejiff;
        int tick_cpu;

        /*
         * Take a consistent snapshot of jiffies and the last jiffies update
         * and cache it in @ts; tick_nohz_stop_tick() reads it back.
         */
        basemono = get_jiffies_update(&basejiff);
        ts->last_jiffies = basejiff;
        ts->timer_expires_base = basemono;

        /*
         * Keep the periodic tick, when RCU, architecture or irq_work
         * requests it.
         * Aside of that, check whether the local timer softirq is
         * pending. If so, it's a bad idea to call get_next_timer_interrupt(),
         * because there is an already expired timer, so it will request
         * immediate expiry, which rearms the hardware timer with a
         * minimal delta, which brings us back to this place
         * immediately. Lather, rinse and repeat...
         */
        if (rcu_needs_cpu() || arch_needs_cpu() ||
            irq_work_needs_cpu() || local_timer_softirq_pending()) {
                next_tick = basemono + TICK_NSEC;
        } else {
                /*
                 * Get the next pending timer. If high resolution
                 * timers are enabled this only takes the timer wheel
                 * timers into account. If high resolution timers are
                 * disabled this also looks at the next expiring
                 * hrtimer.
                 */
                next_tick = get_next_timer_interrupt(basejiff, basemono);
                ts->next_timer = next_tick;
        }

        /* Make sure next_tick is never before basemono! */
        if (WARN_ON_ONCE(basemono > next_tick))
                next_tick = basemono;

        /*
         * If the tick is due in the next period, keep it ticking or
         * force prod the timer.
         */
        delta = next_tick - basemono;
        if (delta <= (u64)TICK_NSEC) {
                /*
                 * We've not stopped the tick yet, and there's a timer in the
                 * next period, so no point in stopping it either, bail.
                 */
                if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                        ts->timer_expires = 0;
                        goto out;
                }
        }

        /*
         * If this CPU is the one which had the do_timer() duty last, we limit
         * the sleep time to the timekeeping 'max_deferment' value.
         * Otherwise we can sleep as long as we want.
         */
        delta = timekeeping_max_deferment();
        tick_cpu = READ_ONCE(tick_do_timer_cpu);
        if (tick_cpu != cpu &&
            (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
                delta = KTIME_MAX;

        /* Calculate the next expiry time, saturating at KTIME_MAX */
        if (delta < (KTIME_MAX - basemono))
                expires = basemono + delta;
        else
                expires = KTIME_MAX;

        /* The earlier of the deferment limit and the next event wins */
        ts->timer_expires = min_t(u64, expires, next_tick);

out:
        return ts->timer_expires;
}

/*
 * Stop the tick on the local CPU, or reprogram an already stopped tick,
 * using the expiry that was stored earlier in ts->timer_expires.
 *
 * @ts:  tick_sched state that is read and updated
 * @cpu: CPU number, compared against tick_do_timer_cpu
 *
 * The function returns without reprogramming anything when the timer base
 * could not be marked idle, or when the tick is already stopped and the
 * expiry is unchanged. ts->timer_expires_base is reset unconditionally.
 */
static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
        unsigned long basejiff = ts->last_jiffies;
        u64 basemono = ts->timer_expires_base;
        /* In/out for timer_base_try_to_set_idle(); starts as "tick already stopped" */
        bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
        int tick_cpu;
        u64 expires;

        /* Make sure we won't be trying to stop it twice in a row. */
        ts->timer_expires_base = 0;

        /*
         * Now the tick should be stopped definitely - so the timer base needs
         * to be marked idle as well to not miss a newly queued timer.
         */
        expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle);
        if (expires > ts->timer_expires) {
                /*
                 * This path could only happen when the first timer was removed
                 * between calculating the possible sleep length and now (when
                 * high resolution mode is not active, timer could also be a
                 * hrtimer).
                 *
                 * We have to stick to the original calculated expiry value to
                 * not stop the tick for too long with a shallow C-state (which
                 * was programmed by cpuidle because of an early next expiration
                 * value).
                 */
                expires = ts->timer_expires;
        }

        /* If the timer base is not idle, retain the not yet stopped tick. */
        if (!timer_idle)
                return;

        /*
         * If this CPU is the one which updates jiffies, then give up
         * the assignment and let it be taken by the CPU which runs
         * the tick timer next, which might be this CPU as well. If we
         * don't drop this here, the jiffies might be stale and
         * do_timer() never gets invoked. Keep track of the fact that it
         * was the one which had the do_timer() duty last.
         */
        tick_cpu = READ_ONCE(tick_do_timer_cpu);
        if (tick_cpu == cpu) {
                WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
                tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
        } else if (tick_cpu != TICK_DO_TIMER_NONE) {
                /* Another CPU owns the duty now, so this CPU was not the last one */
                tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
        }

        /* Skip reprogram of event if it's not changed */
        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
                /* Sanity check: make sure clockevent is actually programmed */
                if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
                        return;

                /* Bookkeeping and timer disagree: warn once, then fall through and reprogram */
                WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
                          "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
                          dev->next_event, hrtimer_active(&ts->sched_timer),
                          hrtimer_get_expires(&ts->sched_timer));
        }

        /*
         * tick_nohz_stop_tick() can be called several times before
         * tick_nohz_restart_sched_tick() is called. This happens when
         * interrupts arrive which do not cause a reschedule. In the first
         * call we save the current tick time, so we can restart the
         * scheduler tick in tick_nohz_restart_sched_tick().
         */
        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                calc_load_nohz_start();
                quiet_vmstat();

                ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
                tick_sched_flag_set(ts, TS_FLAG_STOPPED);
                trace_tick_stop(1, TICK_DEP_MASK_NONE);
        }

        /* Remember what was programmed so the "unchanged" check above can work */
        ts->next_tick = expires;

        /*
         * If the expiration time == KTIME_MAX, then we simply stop
         * the tick timer.
         */
        if (unlikely(expires == KTIME_MAX)) {
                if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
                        hrtimer_cancel(&ts->sched_timer);
                else
                        tick_program_event(KTIME_MAX, 1);
                return;
        }

        /* Highres: arm the hrtimer. Lowres: record the expiry and program the device. */
        if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
                hrtimer_start(&ts->sched_timer, expires,
                              HRTIMER_MODE_ABS_PINNED_HARD);
        } else {
                hrtimer_set_expires(&ts->sched_timer, expires);
                tick_program_event(expires, 1);
        }
}

/*
 * Leave the tick as it is and only drop the cached expiry base.
 *
 * A non-zero ts->timer_expires_base is what tick_nohz_idle_stop_tick() uses
 * to decide that ts->timer_expires is already known, so clearing it forces
 * the next attempt to evaluate the next event again.
 */
static void tick_nohz_retain_tick(struct tick_sched *ts)
{
        ts->timer_expires_base = 0;
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * nohz_full helper: evaluate the next event and either stop the tick or keep
 * it. A zero result from tick_nohz_next_event() selects the "retain" path.
 */
static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
{
        if (!tick_nohz_next_event(ts, cpu)) {
                tick_nohz_retain_tick(ts);
                return;
        }

        tick_nohz_stop_tick(ts, cpu);
}
#endif /* CONFIG_NO_HZ_FULL */

/*
 * Undo a stopped tick: bring jiffies up to date, leave the timer-idle state,
 * clear TS_FLAG_STOPPED and restart the tick relative to @now.
 *
 * @ts:  tick_sched state of the CPU whose tick is restarted
 * @now: current time, passed on to the jiffies update and to the restart
 */
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
        /* Update jiffies first */
        tick_do_update_jiffies64(now);

        /*
         * Clear the timer idle flag, so we avoid IPIs on remote queueing and
         * the clock forward checks in the enqueue path:
         */
        timer_clear_idle();

        /* Counterpart of the calc_load_nohz_start() done when the tick was stopped */
        calc_load_nohz_stop();
        touch_softlockup_watchdog_sched();

        /* Cancel the scheduled timer and restore the tick: */
        tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
        tick_nohz_restart(ts, now);
}

/*
 * Re-evaluate the tick on a nohz_full CPU: stop (or update) it when that is
 * allowed, otherwise restart it if it is currently stopped. Compiles to an
 * empty function without CONFIG_NO_HZ_FULL.
 */
static void __tick_nohz_full_update_tick(struct tick_sched *ts,
                                         ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
        int cpu = smp_processor_id();

        if (can_stop_full_tick(cpu, ts)) {
                tick_nohz_full_stop_tick(ts, cpu);
                return;
        }

        /* The tick is needed: only act if it is actually stopped */
        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                tick_nohz_restart_sched_tick(ts, now);
#endif
}

/*
 * Update the tick only when the current CPU is a nohz_full CPU and NOHZ has
 * been activated for @ts; otherwise do nothing.
 */
static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
        /* Both conditions are checked in the original order; ktime_get() runs only if they hold */
        if (tick_nohz_full_cpu(smp_processor_id()) &&
            tick_sched_flag_test(ts, TS_FLAG_NOHZ))
                __tick_nohz_full_update_tick(ts, ktime_get());
}

/*
 * A pending softirq outside an IRQ (or softirq disabled section) context
 * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
 * reach this code due to the need_resched() early check in can_stop_idle_tick().
 *
 * However, between CPUHP_AP_SMPBOOT_THREADS and CPUHP_TEARDOWN_CPU during
 * cpu_down(), softirqs can still be raised while ksoftirqd is parked. That
 * triggers the code below, since wakeup_softirqd() is ignored.
 *
 * Return: %true when softirq work is pending and must keep the tick alive
 * (a warning is printed for the first ten occurrences), %false otherwise.
 */
static bool report_idle_softirq(void)
{
        static int ratelimit;
        unsigned int softirqs = local_softirq_pending();

        if (likely(softirqs == 0))
                return false;

        /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
        if (!cpu_active(smp_processor_id()))
                softirqs &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;

        /* Only reachable with zero after the hotplug-safe bits were masked out */
        if (!softirqs)
                return false;

        /* On RT, softirq handling may be waiting on some lock */
        if (local_bh_blocked())
                return false;

        /* Still report the condition to the caller once the warning budget is used up */
        if (ratelimit >= 10)
                return true;

        pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
                softirqs);
        ratelimit++;

        return true;
}

/*
 * Decide whether the idle tick may be stopped on @cpu.
 *
 * Return: %false if NOHZ is not active for @ts, a reschedule is pending,
 * softirq work is pending, or (with nohz_full enabled) this CPU holds the
 * do_timer() duty or nobody holds it; %true otherwise.
 */
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
        int tick_cpu;

        WARN_ON_ONCE(cpu_is_offline(cpu));

        if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) || need_resched())
                return false;

        if (unlikely(report_idle_softirq()))
                return false;

        /* The remaining checks only matter when full dynticks CPUs exist */
        if (!tick_nohz_full_enabled())
                return true;

        tick_cpu = READ_ONCE(tick_do_timer_cpu);

        /*
         * Keep the tick alive to guarantee timekeeping progression
         * if there are full dynticks CPUs around
         */
        if (tick_cpu == cpu)
                return false;

        /* Should not happen for nohz-full */
        return !WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE);
}

/**
 * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 */
void tick_nohz_idle_stop_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        int cpu = smp_processor_id();
        int already_stopped;
        ktime_t expires;

        /*
         * A non-zero base means tick_nohz_get_sleep_length() already ran
         * tick_nohz_next_event(), so the expiry is known. Otherwise compute
         * it now, provided the tick may be stopped at all.
         */
        if (ts->timer_expires_base) {
                expires = ts->timer_expires;
        } else {
                if (!can_stop_idle_tick(cpu, ts))
                        return;
                expires = tick_nohz_next_event(ts, cpu);
        }

        ts->idle_calls++;

        /* No usable expiry: keep the tick and drop the cached base */
        if (expires <= 0LL) {
                tick_nohz_retain_tick(ts);
                return;
        }

        already_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

        tick_nohz_stop_tick(ts, cpu);

        ts->idle_sleeps++;
        ts->idle_expires = expires;

        /* Only the stopped -> not-stopped transition made by this call is recorded below */
        if (already_stopped || !tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                return;

        ts->idle_jiffies = ts->last_jiffies;
        nohz_balance_enter_idle(cpu);
}

void tick_nohz_idle_retain_tick(void)
{
        tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}

/**
 * tick_nohz_idle_enter - prepare for entering idle on the current CPU
 *
 * Called when we start the idle loop.
 */
void tick_nohz_idle_enter(void)
{
        struct tick_sched *ts;

        /* Callers must enter with interrupts enabled; they are re-enabled on exit */
        lockdep_assert_irqs_enabled();

        local_irq_disable();

        /* The per-CPU pointer is taken only after interrupts are off */
        ts = this_cpu_ptr(&tick_cpu_sched);

        /* A cached expiry base is not expected at idle entry */
        WARN_ON_ONCE(ts->timer_expires_base);

        /* Mark the CPU as inside the idle loop before idle accounting begins */
        tick_sched_flag_set(ts, TS_FLAG_INIDLE);
        tick_nohz_start_idle(ts);

        local_irq_enable();
}

/**
 * tick_nohz_irq_exit - Notify the tick about IRQ exit
 *
 * A timer may have been added/modified/deleted either by the current IRQ,
 * or by another place using this IRQ as a notification. This IRQ may have
 * also updated the RCU callback list. These events may require a
 * re-evaluation of the next tick. Depending on the context:
 *
 * 1) If the CPU is idle and no resched is pending, just proceed with idle
 *    time accounting. The next tick will be re-evaluated on the next idle
 *    loop iteration.
 *
 * 2) If the CPU is nohz_full:
 *
 *    2.1) If there is any tick dependency, restart the tick if stopped.
 *
 *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
 *         stop/update it accordingly.
 */
void tick_nohz_irq_exit(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /* Outside the idle loop only the nohz_full re-evaluation applies */
        if (!tick_sched_flag_test(ts, TS_FLAG_INIDLE)) {
                tick_nohz_full_update_tick(ts);
                return;
        }

        /* Inside the idle loop: resume idle time accounting */
        tick_nohz_start_idle(ts);
}

/**
 * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
 *
 * Return: %true if the tick handler has run, otherwise %false
 */
bool tick_nohz_idle_got_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        if (!ts->got_idle_tick)
                return false;

        /* The flag is consumed by reading it */
        ts->got_idle_tick = 0;
        return true;
}

/**
 * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
 * or the tick, whichever expires first. Note that, if the tick has been
 * stopped, it returns the next hrtimer.
 *
 * Called from power state control code with interrupts disabled
 *
 * Return: the next expiration time
 */
ktime_t tick_nohz_get_next_hrtimer(void)
{
        return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
}

/**
 * tick_nohz_get_sleep_length - return the expected length of the current sleep
 * @delta_next: duration until the next event if the tick cannot be stopped
 *
 * Called from power state control code with interrupts disabled.
 *
 * The return value of this function and/or the value returned by it through the
 * @delta_next pointer can be negative which must be taken into account by its
 * callers.
 *
 * Return: the expected length of the current sleep
 */
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        int cpu = smp_processor_id();
        /*
         * The idle entry time is expected to be a sufficient approximation of
         * the current time at this point.
         */
        ktime_t now = ts->idle_entrytime;
        ktime_t next_event = 0;
        u64 next_hrtimer;

        WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));

        /* Always report the distance to the currently programmed event */
        *delta_next = ktime_sub(dev->next_event, now);

        /* The next event is only evaluated when the tick may be stopped */
        if (can_stop_idle_tick(cpu, ts))
                next_event = tick_nohz_next_event(ts, cpu);

        /* Tick cannot or should not be stopped: the programmed event bounds the sleep */
        if (!next_event)
                return *delta_next;

        /*
         * If the next highres timer to expire is earlier than 'next_event', the
         * idle governor needs to know that.
         */
        next_hrtimer = hrtimer_next_event_without(&ts->sched_timer);
        next_event = min_t(u64, next_event, next_hrtimer);

        return ktime_sub(next_event, now);
}

/**
 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
 * for a particular CPU.
 * @cpu: target CPU number
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
 *
 * Return: the current idle calls counter value for @cpu
 */
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
        struct tick_sched *ts = tick_get_tick_sched(cpu);

        return ts->idle_calls;
}

/*
 * Record the idle exit time and, unless vtime accounting is enabled on this
 * CPU, account the jiffies that passed since ts->idle_jiffies as idle ticks.
 */
static void tick_nohz_account_idle_time(struct tick_sched *ts,
                                        ktime_t now)
{
        unsigned long elapsed;

        ts->idle_exittime = now;

        if (vtime_accounting_enabled_this_cpu())
                return;

        /*
         * We stopped the tick in idle. update_process_times() would miss the
         * time we slept, as it does only a 1 tick accounting.
         * Enforce that this is accounted to idle !
         */
        elapsed = jiffies - ts->idle_jiffies;

        /*
         * We might be one off. Do not randomly account a huge number of ticks!
         */
        if (elapsed == 0 || elapsed >= LONG_MAX)
                return;

        account_idle_ticks(elapsed);
}

/* Restart a stopped tick on the local CPU and account the idle time; no-op if it is running. */
void tick_nohz_idle_restart_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        ktime_t now;

        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                return;

        /* One timestamp is shared by the restart and the accounting */
        now = ktime_get();
        tick_nohz_restart_sched_tick(ts, now);
        tick_nohz_account_idle_time(ts, now);
}

/*
 * Idle-exit tick handling: ordinary CPUs get their tick restarted, nohz_full
 * CPUs get a re-evaluation instead. Idle time is accounted in both cases.
 */
static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
        if (!tick_nohz_full_cpu(smp_processor_id()))
                tick_nohz_restart_sched_tick(ts, now);
        else
                __tick_nohz_full_update_tick(ts, now);

        tick_nohz_account_idle_time(ts, now);
}

/**
 * tick_nohz_idle_exit - Update the tick upon idle task exit
 *
 * When the idle task exits, update the tick depending on the
 * following situations:
 *
 * 1) If the CPU is not in nohz_full mode (most cases), then
 *    restart the tick.
 *
 * 2) If the CPU is in nohz_full mode (corner case):
 *   2.1) If the tick can be kept stopped (no tick dependencies)
 *        then re-evaluate the next tick and try to keep it stopped
 *        as long as possible.
 *   2.2) If the tick has dependencies, restart the tick.
 *
 */
void tick_nohz_idle_exit(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        bool idle_active, tick_stopped;
        /* Only assigned when idle_active or tick_stopped is set; both uses below are guarded */
        ktime_t now;

        local_irq_disable();

        /* Idle exit without a matching idle enter, or with a stale expiry base, is unexpected */
        WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
        WARN_ON_ONCE(ts->timer_expires_base);

        /* Leave the idle state first, then snapshot the flags that drive the work below */
        tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
        idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
        tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

        /* Read the clock only if one of the consumers needs it */
        if (idle_active || tick_stopped)
                now = ktime_get();

        if (idle_active)
                tick_nohz_stop_idle(ts, now);

        if (tick_stopped)
                tick_nohz_idle_update_tick(ts, now);

        local_irq_enable();
}

/*
 * In low-resolution mode, the tick handler must be implemented directly
 * at the clockevent level. hrtimer can't be used instead, because its
 * infrastructure actually relies on the tick itself as a backend in
 * low-resolution mode (see hrtimer_run_queues()).
 */
static void tick_nohz_lowres_handler(struct clock_event_device *dev)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /* Mark the device as having nothing programmed until it is re-armed below */
        dev->next_event = KTIME_MAX;

        /* The shared tick handler decides whether the tick keeps running */
        if (unlikely(tick_nohz_handler(&ts->sched_timer) != HRTIMER_RESTART))
                return;

        tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}

/*
 * Mark NOHZ as active for @ts when NOHZ is enabled, and run the global
 * timers_update_nohz() notification the first time any CPU gets here.
 */
static inline void tick_nohz_activate(struct tick_sched *ts)
{
        if (tick_nohz_enabled) {
                tick_sched_flag_set(ts, TS_FLAG_NOHZ);

                /* One update is enough */
                if (!test_and_set_bit(0, &tick_nohz_active))
                        timers_update_nohz();
        }
}

/**
 * tick_nohz_switch_to_nohz - switch to NOHZ mode
 */
static void tick_nohz_switch_to_nohz(void)
{
        /* Nothing to do if NOHZ is disabled or the switch to oneshot mode fails */
        if (!tick_nohz_enabled ||
            tick_switch_to_oneshot(tick_nohz_lowres_handler))
                return;

        /*
         * Recycle the hrtimer in 'ts', so we can share the
         * highres code.
         */
        tick_setup_sched_timer(false);
}

/*
 * IRQ-entry hook for NOHZ: end idle time accounting if it is active and
 * refresh jiffies if the tick is stopped. Returns immediately, without
 * reading the clock, when neither flag is set.
 */
static inline void tick_nohz_irq_enter(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        ktime_t now;

        /* Fast path: the combined mask is tested before ktime_get() is called */
        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
                return;
        now = ktime_get();
        if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
                tick_nohz_stop_idle(ts, now);
        /*
         * If all CPUs are idle we may need to update a stale jiffies value.
         * Note nohz_full is a special case: a timekeeper is guaranteed to stay
         * alive but it might be busy looping with interrupts disabled in some
         * rare case (typically stop machine). So we must make sure we have a
         * last resort.
         */
        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                tick_nohz_update_jiffies(now);
}

#else

/* Empty stand-ins used when CONFIG_NO_HZ_COMMON is not set, so callers need no #ifdef */
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
static inline void tick_nohz_activate(struct tick_sched *ts) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from irq_enter() to notify about the possible interruption of idle().
 *
 * Runs the oneshot broadcast check for this CPU first and the NOHZ IRQ-entry
 * handling second; the latter is an empty stub without CONFIG_NO_HZ_COMMON.
 */
void tick_irq_enter(void)
{
        tick_check_oneshot_broadcast_this_cpu();
        tick_nohz_irq_enter();
}

/*
 * Non-zero requests a per-CPU offset of the tick in tick_setup_sched_timer().
 * Set from the "skew_tick" early boot parameter; defaults to 0.
 */
static int sched_skew_tick;

/*
 * Parse the "skew_tick" boot parameter into sched_skew_tick.
 *
 * The result of get_option() is not checked; the handler always returns 0.
 */
static int __init skew_tick(char *str)
{
        get_option(&str, &sched_skew_tick);

        return 0;
}
early_param("skew_tick", skew_tick);

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 * @hrtimer: whether to use the hrtimer or not
 */
void tick_setup_sched_timer(bool hrtimer)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        /* Highres operation needs both the build option and the caller's request */
        const bool highres = IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer;

        /* Emulate tick processing via per-CPU hrtimers: */
        hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);

        if (highres)
                tick_sched_flag_set(ts, TS_FLAG_HIGHRES);

        /* Get the next period (per-CPU) */
        hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

        /* Offset the tick to avert 'jiffies_lock' contention. */
        if (sched_skew_tick) {
                /* Spread the CPUs evenly over the first half of a tick period */
                u64 skew = TICK_NSEC >> 1;

                do_div(skew, num_possible_cpus());
                skew *= smp_processor_id();
                hrtimer_add_expires_ns(&ts->sched_timer, skew);
        }

        hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);

        /* Highres arms the hrtimer; lowres programs the clock event device directly */
        if (highres)
                hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
        else
                tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);

        tick_nohz_activate(ts);
}

/*
 * Shut down the tick and make sure the CPU won't try to retake the timekeeping
 * duty before disabling IRQs in idle for the last time.
 */
void tick_sched_timer_dying(int cpu)
{
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        unsigned long saved_calls, saved_sleeps;
        ktime_t saved_idle, saved_iowait;

        /* This must happen before hrtimers are migrated! */
        if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
                hrtimer_cancel(&ts->sched_timer);

        /* Preserve the idle statistics across the wipe of everything else */
        saved_idle = ts->idle_sleeptime;
        saved_iowait = ts->iowait_sleeptime;
        saved_calls = ts->idle_calls;
        saved_sleeps = ts->idle_sleeps;

        memset(ts, 0, sizeof(*ts));

        ts->idle_sleeptime = saved_idle;
        ts->iowait_sleeptime = saved_iowait;
        ts->idle_calls = saved_calls;
        ts->idle_sleeps = saved_sleeps;
}

/*
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        set_bit(0, &ts->check_clocks);
}

/*
 * Check if a change happened, which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
 * mode, because high resolution timers are disabled (either compile
 * or runtime). Called with interrupts disabled.
 */
int tick_check_oneshot_change(int allow_nohz)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /* No pending notification, or NOHZ already active: nothing to do */
        if (!test_and_clear_bit(0, &ts->check_clocks) ||
            tick_sched_flag_test(ts, TS_FLAG_NOHZ))
                return 0;

        /* Oneshot needs both a suitable timekeeper and a capable device */
        if (!(timekeeping_valid_for_hres() && tick_is_oneshot_available()))
                return 0;

        /* Low-res NOHZ is switched to here; otherwise report 1 to the caller */
        if (allow_nohz) {
                tick_nohz_switch_to_nohz();
                return 0;
        }

        return 1;
}


























































































































  318 


    5 
   34 








   30 

    7 
   11 






   30 
    5 



   20 

   27 
    4 


   22 











   49 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#ifndef _LINUX_JHASH_H
#define _LINUX_JHASH_H

/* jhash.h: Jenkins hash support.
 *
 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
 *
 * https://burtleburtle.net/bob/hash/
 *
 * These are the credits from Bob's sources:
 *
 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
 *
 * These are functions for producing 32-bit hashes for hash table lookup.
 * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
 * are externally useful functions.  Routines to test the hash are included
 * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
 * the public domain.  It has no warranty.
 *
 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org)
 *
 * I've modified Bob's hash to be useful in the Linux kernel, and
 * any bugs present are my fault.
 * Jozsef
 */
#include <linux/bitops.h>
#include <linux/unaligned.h>

/* Hash table sizes work best as powers of two: jhash_size(n) is 2^n as a u32 */
#define jhash_size(n)   ((u32)1<<(n))
/* Mask for a 2^n table, i.e. use (value & jhash_mask(n)) instead of (value % size) */
#define jhash_mask(n)   (jhash_size(n)-1)

/*
 * __jhash_mix - mix 3 32-bit values reversibly.
 *
 * @a, @b and @c are modified in place and are evaluated several times, so
 * they must be plain 32-bit lvalues without side effects.
 *
 * Wrapped in do { } while (0) so that the macro behaves like a single
 * statement and can be used safely in an unbraced if/else.
 */
#define __jhash_mix(a, b, c)                        \
do {                                                \
        a -= c;  a ^= rol32(c, 4);  c += b;        \
        b -= a;  b ^= rol32(a, 6);  a += c;        \
        c -= b;  c ^= rol32(b, 8);  b += a;        \
        a -= c;  a ^= rol32(c, 16); c += b;        \
        b -= a;  b ^= rol32(a, 19); a += c;        \
        c -= b;  c ^= rol32(b, 4);  b += a;        \
} while (0)

/*
 * __jhash_final - final mixing of 3 32-bit values (a,b,c) into c
 *
 * @a, @b and @c are modified in place and are evaluated several times, so
 * they must be plain 32-bit lvalues without side effects. The hash result
 * is left in @c.
 *
 * Wrapped in do { } while (0) so that the macro behaves like a single
 * statement and can be used safely in an unbraced if/else.
 */
#define __jhash_final(a, b, c)                        \
do {                                                \
        c ^= b; c -= rol32(b, 14);                \
        a ^= c; a -= rol32(c, 11);                \
        b ^= a; b -= rol32(a, 25);                \
        c ^= b; c -= rol32(b, 16);                \
        a ^= c; a -= rol32(c, 4);                \
        b ^= a; b -= rol32(a, 14);                \
        c ^= b; c -= rol32(b, 24);                \
} while (0)

/* Arbitrary constant added into the initial state of every hash in this file */
#define JHASH_INITVAL                0xdeadbeef

/* jhash - hash an arbitrary key
 * @key: sequence of bytes to hash
 * @length: number of bytes in @key
 * @initval: the previous hash, or an arbitrary value
 *
 * Generic byte-wise variant: the key may have any length and any alignment.
 * A zero-length key returns the initial state without a final mix.
 *
 * Returns the hash value of the key. The result depends on endianness.
 */
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
        const u8 *k = key;
        u32 a, b, c;

        /* All three lanes start from the same seed */
        a = b = c = JHASH_INITVAL + length + initval;

        /* Full 12-byte blocks; the loop stops with at most 12 bytes left */
        for (; length > 12; length -= 12, k += 12) {
                a += get_unaligned((u32 *)k);
                b += get_unaligned((u32 *)(k + 4));
                c += get_unaligned((u32 *)(k + 8));
                __jhash_mix(a, b, c);
        }

        /* Nothing left to add: skip the final mix */
        if (!length)
                return c;

        /* Tail of 1..12 bytes, folded in from the highest byte down */
        switch (length) {
        case 12: c += (u32)k[11]<<24;        fallthrough;
        case 11: c += (u32)k[10]<<16;        fallthrough;
        case 10: c += (u32)k[9]<<8;        fallthrough;
        case 9:  c += k[8];                fallthrough;
        case 8:  b += (u32)k[7]<<24;        fallthrough;
        case 7:  b += (u32)k[6]<<16;        fallthrough;
        case 6:  b += (u32)k[5]<<8;        fallthrough;
        case 5:  b += k[4];                fallthrough;
        case 4:  a += (u32)k[3]<<24;        fallthrough;
        case 3:  a += (u32)k[2]<<16;        fallthrough;
        case 2:  a += (u32)k[1]<<8;        fallthrough;
        case 1:  a += k[0];
                 break;
        }

        __jhash_final(a, b, c);

        return c;
}

/* jhash2 - hash an array of u32's
 * @k: the key which must be an array of u32's
 * @length: the number of u32's in the key
 * @initval: the previous hash, or an arbitrary value
 *
 * A zero-length key returns the initial state without a final mix.
 *
 * Returns the hash value of the key.
 */
static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
        u32 a, b, c;

        /* The seed uses the key length in bytes (words * 4) */
        a = b = c = JHASH_INITVAL + (length<<2) + initval;

        /* Full 3-word blocks; the loop stops with at most 3 words left */
        for (; length > 3; length -= 3, k += 3) {
                a += k[0];
                b += k[1];
                c += k[2];
                __jhash_mix(a, b, c);
        }

        /* Nothing left to add: skip the final mix */
        if (!length)
                return c;

        /* Tail of 1..3 words */
        switch (length) {
        case 3: c += k[2];        fallthrough;
        case 2: b += k[1];        fallthrough;
        case 1: a += k[0];
                break;
        }

        __jhash_final(a, b, c);

        return c;
}


/*
 * __jhash_nwords - hash exactly 3, 2 or 1 word(s)
 * @a, @b, @c: the words to hash; the jhash_Nwords() wrappers pass 0 for unused ones
 * @initval: seed added to each of the three words before the final mix
 *
 * Returns the value left in c by __jhash_final().
 */
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
        a += initval;
        b += initval;
        c += initval;

        __jhash_final(a, b, c);

        return c;
}

/* jhash_3words - hash three words; the seed includes the key size in bytes (3 << 2) */
static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
        u32 seed = initval + JHASH_INITVAL + (3 << 2);

        return __jhash_nwords(a, b, c, seed);
}

/* jhash_2words - hash two words; the seed includes the key size in bytes (2 << 2) */
static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
        u32 seed = initval + JHASH_INITVAL + (2 << 2);

        return __jhash_nwords(a, b, 0, seed);
}

/* jhash_1word - hash one word; the seed includes the key size in bytes (1 << 2) */
static inline u32 jhash_1word(u32 a, u32 initval)
{
        u32 seed = initval + JHASH_INITVAL + (1 << 2);

        return __jhash_nwords(a, 0, 0, seed);
}

#endif /* _LINUX_JHASH_H */




















    7 







    7 
















































































    9 








    9 













    9 






    9 
    9 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * geniv: Shared IV generator code
 *
 * This file provides common code to IV generators such as seqiv.
 *
 * Copyright (c) 2007-2019 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/geniv.h>
#include <crypto/internal/rng.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

/* Forward the key unchanged to the child AEAD held in the geniv context. */
static int aead_geniv_setkey(struct crypto_aead *tfm,
                             const u8 *key, unsigned int keylen)
{
        struct aead_geniv_ctx *geniv = crypto_aead_ctx(tfm);
        struct crypto_aead *child = geniv->child;

        return crypto_aead_setkey(child, key, keylen);
}

/* Forward the authentication tag size unchanged to the child AEAD. */
static int aead_geniv_setauthsize(struct crypto_aead *tfm,
                                  unsigned int authsize)
{
        struct aead_geniv_ctx *geniv = crypto_aead_ctx(tfm);
        struct crypto_aead *child = geniv->child;

        return crypto_aead_setauthsize(child, authsize);
}

/* Release the spawn stored in the instance context, then the instance itself. */
static void aead_geniv_free(struct aead_instance *inst)
{
        struct crypto_aead_spawn *spawn = aead_instance_ctx(inst);

        crypto_drop_aead(spawn);
        kfree(inst);
}

/*
 * aead_geniv_alloc - allocate an IV generator instance wrapping an AEAD
 * @tmpl: template whose name becomes the prefix of the instance names
 * @tb: template attributes; tb[1] names the underlying AEAD algorithm
 *
 * The underlying algorithm must have an IV of at least sizeof(u64) bytes.
 * Priority, block size, alignment mask, IV size and maximum auth size are
 * copied from it.
 *
 * Returns the new instance, or an ERR_PTR(): the error from attribute
 * checking or from grabbing the algorithm, -ENOMEM, -EINVAL for a too small
 * IV, or -ENAMETOOLONG when a composed name does not fit.
 */
struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl,
                                       struct rtattr **tb)
{
        struct crypto_aead_spawn *spawn;
        struct aead_instance *inst;
        struct aead_alg *alg;
        unsigned int maxauthsize;
        unsigned int ivsize;
        u32 mask;
        int len;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
        if (err)
                return ERR_PTR(err);

        /* The spawn lives in the instance context, directly behind the instance */
        inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
        if (!inst)
                return ERR_PTR(-ENOMEM);

        spawn = aead_instance_ctx(inst);

        err = crypto_grab_aead(spawn, aead_crypto_instance(inst),
                               crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;

        alg = crypto_spawn_aead_alg(spawn);

        ivsize = crypto_aead_alg_ivsize(alg);
        maxauthsize = crypto_aead_alg_maxauthsize(alg);

        if (ivsize < sizeof(u64)) {
                err = -EINVAL;
                goto err_free_inst;
        }

        /* Both names have the form "<template>(<underlying name>)" and must not be truncated */
        len = snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
                       "%s(%s)", tmpl->name, alg->base.cra_name);
        if (len >= CRYPTO_MAX_ALG_NAME) {
                err = -ENAMETOOLONG;
                goto err_free_inst;
        }

        len = snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                       "%s(%s)", tmpl->name, alg->base.cra_driver_name);
        if (len >= CRYPTO_MAX_ALG_NAME) {
                err = -ENAMETOOLONG;
                goto err_free_inst;
        }

        inst->alg.base.cra_priority = alg->base.cra_priority;
        inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
        inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
        inst->alg.base.cra_ctxsize = sizeof(struct aead_geniv_ctx);

        inst->alg.setkey = aead_geniv_setkey;
        inst->alg.setauthsize = aead_geniv_setauthsize;

        inst->alg.ivsize = ivsize;
        inst->alg.maxauthsize = maxauthsize;

        inst->free = aead_geniv_free;

        return inst;

err_free_inst:
        aead_geniv_free(inst);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(aead_geniv_alloc);

/*
 * aead_init_geniv - set up the per-transform state of an IV generator
 * @aead: the IV-generator AEAD transform being initialised
 *
 * Initialises the context spinlock, fills ctx->salt with
 * crypto_aead_ivsize(@aead) bytes from the default RNG, instantiates the
 * child AEAD from the instance's spawn and sizes the request context so
 * that it can hold the child's request context plus one aead_request.
 *
 * Returns 0 on success, otherwise the error reported by the failing step.
 */
int aead_init_geniv(struct crypto_aead *aead)
{
        struct aead_geniv_ctx *ctx = crypto_aead_ctx(aead);
        struct aead_instance *inst = aead_alg_instance(aead);
        struct crypto_aead *child;
        int err;

        spin_lock_init(&ctx->lock);

        err = crypto_get_default_rng();
        if (err)
                return err;

        /* The RNG reference is dropped whether or not the salt was read. */
        err = crypto_rng_get_bytes(crypto_default_rng, ctx->salt,
                                   crypto_aead_ivsize(aead));
        crypto_put_default_rng();
        if (err)
                return err;

        child = crypto_spawn_aead(aead_instance_ctx(inst));
        if (IS_ERR(child))
                return PTR_ERR(child);

        ctx->child = child;
        crypto_aead_set_reqsize(aead, crypto_aead_reqsize(child) +
                                      sizeof(struct aead_request));

        return 0;
}
EXPORT_SYMBOL_GPL(aead_init_geniv);

/*
 * aead_exit_geniv - release the child AEAD stored in the transform context
 * @tfm: the IV-generator AEAD transform being torn down
 */
void aead_exit_geniv(struct crypto_aead *tfm)
{
        struct aead_geniv_ctx *geniv = crypto_aead_ctx(tfm);
        struct crypto_aead *child = geniv->child;

        crypto_free_aead(child);
}
EXPORT_SYMBOL_GPL(aead_exit_geniv);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Shared IV generator code");

































   50 








   50 

   50 
   12 














   50 












   50 









   12 

    2 







   50 



   38 


   38 
   23 

   50 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// SPDX-License-Identifier: GPL-2.0-only
/*
 * ratelimit.c - Generic rate limiting of callbacks (see ___ratelimit()).
 *
 * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
 *
 * 2008-05-01 rewrite the function and use a ratelimit_state data struct as
 * parameter. Now every user can use their own standalone ratelimit_state.
 */

#include <linux/ratelimit.h>
#include <linux/jiffies.h>
#include <linux/export.h>

/*
 * ___ratelimit - rate limiting
 * @rs: ratelimit_state data
 * @func: name of calling function
 *
 * This enforces a rate limit: not more than @rs->burst callbacks
 * in every @rs->interval
 *
 * Every call that ends up returning 0 is passed to
 * ratelimit_state_inc_miss().  When a new interval starts, and unless
 * RATELIMIT_MSG_ON_RELEASE is set in @rs->flags, a non-zero value returned
 * by ratelimit_state_reset_miss() is reported through printk_deferred()
 * as "<func>: <n> callbacks suppressed".
 *
 * RETURNS:
 * 0 means callbacks will be suppressed.
 * 1 means go ahead and do it.
 */
int ___ratelimit(struct ratelimit_state *rs, const char *func)
{
        /* Paired with WRITE_ONCE() in .proc_handler().
         * Changing two values separately could be inconsistent
         * and some message could be lost.  (See: net_ratelimit_state).
         */
        int interval = READ_ONCE(rs->interval);
        int burst = READ_ONCE(rs->burst);
        unsigned long flags;
        int ret = 0;

        /*
         * Zero interval says never limit, otherwise, non-positive burst
         * says always limit.
         */
        if (interval <= 0 || burst <= 0) {
                WARN_ONCE(interval < 0 || burst < 0, "Negative interval (%d) or burst (%d): Uninitialized ratelimit_state structure?\n", interval, burst);
                /*
                 * Allow the call when interval is exactly zero, or when
                 * burst is positive (which in this branch implies a
                 * negative interval); otherwise suppress it.
                 */
                ret = interval == 0 || burst > 0;
                /*
                 * Nothing to reset if the state was never initialized or
                 * both values are zero; also give up rather than spin if
                 * the lock is contended.
                 */
                if (!(READ_ONCE(rs->flags) & RATELIMIT_INITIALIZED) || (!interval && !burst) ||
                    !raw_spin_trylock_irqsave(&rs->lock, flags))
                        goto nolock_ret;

                /* Force re-initialization once re-enabled. */
                rs->flags &= ~RATELIMIT_INITIALIZED;
                goto unlock_ret;
        }

        /*
         * If we contend on this state's lock then just check if
         * the current burst is used or not. It might cause
         * false positive when we are past the interval and
         * the current lock owner is just about to reset it.
         */
        if (!raw_spin_trylock_irqsave(&rs->lock, flags)) {
                if (READ_ONCE(rs->flags) & RATELIMIT_INITIALIZED &&
                    atomic_read(&rs->rs_n_left) > 0 && atomic_dec_return(&rs->rs_n_left) >= 0)
                        ret = 1;
                goto nolock_ret;
        }

        /* First locked use: start the interval now with a full burst budget. */
        if (!(rs->flags & RATELIMIT_INITIALIZED)) {
                rs->begin = jiffies;
                rs->flags |= RATELIMIT_INITIALIZED;
                atomic_set(&rs->rs_n_left, rs->burst);
        }

        /* The interval that began at rs->begin has elapsed: start a new one. */
        if (time_is_before_jiffies(rs->begin + interval)) {
                int m;

                /*
                 * Reset rs_n_left ASAP to reduce false positives
                 * in parallel calls, see above.
                 */
                atomic_set(&rs->rs_n_left, rs->burst);
                rs->begin = jiffies;

                if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
                        m = ratelimit_state_reset_miss(rs);
                        if (m) {
                                printk_deferred(KERN_WARNING
                                                "%s: %d callbacks suppressed\n", func, m);
                        }
                }
        }

        /* Note that the burst might be taken by a parallel call. */
        if (atomic_read(&rs->rs_n_left) > 0 && atomic_dec_return(&rs->rs_n_left) >= 0)
                ret = 1;

unlock_ret:
        raw_spin_unlock_irqrestore(&rs->lock, flags);

nolock_ret:
        /* Record the miss for every suppressed call, locked or not. */
        if (!ret)
                ratelimit_state_inc_miss(rs);

        return ret;
}
EXPORT_SYMBOL(___ratelimit);






















    4 









    4 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKRU_H
#define _ASM_X86_PKRU_H

#include <asm/cpufeature.h>

/*
 * Per-key permission bits in a PKRU value.  Each protection key owns
 * PKRU_BITS_PER_PKEY consecutive bits; the __pkru_allows_*() helpers shift
 * these masks left by pkey * PKRU_BITS_PER_PKEY before testing them.
 *
 * PKRU_AD_BIT: access-disable; when set neither reads nor writes are allowed.
 * PKRU_WD_BIT: presumably write-disable; only consulted for writes.
 */
#define PKRU_AD_BIT 0x1u
#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2

/*
 * With protection keys configured, the initial PKRU value is a variable
 * defined elsewhere and is sampled with READ_ONCE().  Without them, both
 * the variable name and the accessor collapse to the constant 0.
 */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
extern u32 init_pkru_value;
#define pkru_get_init_value()        READ_ONCE(init_pkru_value)
#else
#define init_pkru_value        0
#define pkru_get_init_value()        0
#endif

/*
 * Report whether @pkru permits reads for protection key @pkey, i.e. whether
 * the key's access-disable bit is clear.
 */
static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
{
        /* Offset of this key's bit group within the PKRU value. */
        int shift = pkey * PKRU_BITS_PER_PKEY;
        u32 ad_mask = PKRU_AD_BIT << shift;

        return (pkru & ad_mask) == 0;
}

/*
 * Report whether @pkru permits writes for protection key @pkey.  A write is
 * refused when either the access-disable or the write-disable bit of the
 * key is set, because access-disable covers writes as well.
 */
static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
{
        /* Offset of this key's bit group within the PKRU value. */
        int shift = pkey * PKRU_BITS_PER_PKEY;
        u32 deny_mask = (PKRU_AD_BIT|PKRU_WD_BIT) << shift;

        return (pkru & deny_mask) == 0;
}

/*
 * Return the value from rdpkru(), or 0 when X86_FEATURE_OSPKE is not
 * enabled.
 */
static inline u32 read_pkru(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return 0;

        return rdpkru();
}

/*
 * Set PKRU to @pkru.  Does nothing when X86_FEATURE_OSPKE is not enabled.
 */
static inline void write_pkru(u32 pkru)
{
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return;

        /*
         * RDPKRU is cheap relative to WRPKRU, so read first and skip the
         * write entirely when the register already holds the wanted value.
         */
        if (rdpkru() == pkru)
                return;

        wrpkru(pkru);
}

/*
 * Load PKRU with the value from pkru_get_init_value().  Does nothing when
 * X86_FEATURE_OSPKE is not enabled.  Unlike write_pkru(), the write is
 * unconditional once the feature check has passed.
 */
static inline void pkru_write_default(void)
{
        if (cpu_feature_enabled(X86_FEATURE_OSPKE))
                wrpkru(pkru_get_init_value());
}

#endif





















































































































































    9 






    9 
    9 

































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
/*
 * DRBG based on NIST SP800-90A
 *
 * Copyright Stephan Mueller <smueller@chronox.de>, 2014
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, and the entire permission notice in its entirety,
 *    including the disclaimer of warranties.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * ALTERNATIVELY, this product may be distributed under the terms of
 * the GNU General Public License, in which case the provisions of the GPL are
 * required INSTEAD OF the above restrictions.  (This clause is
 * necessary due to a potential bad interaction between the GPL and
 * the restrictions contained in a BSD-style copyright.)
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
 * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

#ifndef _DRBG_H
#define _DRBG_H


#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/module.h>
#include <linux/crypto.h>
#include <linux/slab.h>
#include <crypto/internal/rng.h>
#include <crypto/rng.h>
#include <linux/fips.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/workqueue.h>

/*
 * Concatenation Helper and string operation helper
 *
 * SP800-90A requires the concatenation of different data. To avoid copying
 * buffers around or allocating additional memory, the following data
 * structure is used to point to the original memory together with its size.
 * In addition, it is used to build a linked list. The linked list defines
 * the concatenation of the individual buffers: the order of the memory
 * blocks referenced in that linked list determines the order of concatenation.
 */
/* One element of a concatenation list: a reference to caller-owned memory. */
struct drbg_string {
        /* start of the referenced memory (not copied) */
        const unsigned char *buf;
        /* size of the memory at buf */
        size_t len;
        /* links this element into the list that defines concatenation order */
        struct list_head list;
};

/*
 * drbg_string_fill - initialise a drbg_string to reference a buffer
 * @string: element to initialise
 * @buf: memory to reference; only the pointer is stored
 * @len: size of @buf
 *
 * The list head is initialised as well, so the element starts out not
 * linked to any other element.
 */
static inline void drbg_string_fill(struct drbg_string *string,
                                    const unsigned char *buf, size_t len)
{
        INIT_LIST_HEAD(&string->list);
        string->len = len;
        string->buf = buf;
}

/* Forward declaration; the full definition follows further down. */
struct drbg_state;
/* Bit-flag type used by drbg_core.flags and the DRBG_* flag macros. */
typedef uint32_t drbg_flag_t;

/*
 * Static description of one DRBG variant.  drbg_statelen(), drbg_blocklen()
 * and drbg_keylen() read their values from here; the key length is derived
 * as statelen - blocklen_bytes.
 */
struct drbg_core {
        drbg_flag_t flags;        /* flags for the cipher */
        __u8 statelen;                /* maximum state length */
        __u8 blocklen_bytes;        /* block size of output in bytes */
        char cra_name[CRYPTO_MAX_ALG_NAME]; /* mapping to kernel crypto API */
         /* kernel crypto API backend cipher name */
        char backend_cra_name[CRYPTO_MAX_ALG_NAME];
};

/*
 * Operation table referenced from drbg_state.d_ops.  The implementations
 * are not part of this header.
 * NOTE(review): the per-callback descriptions below are inferred from the
 * names and signatures only -- confirm against the implementations.
 */
struct drbg_state_ops {
        /* presumably updates the DRBG state from the @seed list */
        int (*update)(struct drbg_state *drbg, struct list_head *seed,
                      int reseed);
        /* presumably produces @buflen bytes into @buf, with optional @addtl */
        int (*generate)(struct drbg_state *drbg,
                        unsigned char *buf, unsigned int buflen,
                        struct list_head *addtl);
        /* presumably set up / tear down the backing cipher for @drbg */
        int (*crypto_init)(struct drbg_state *drbg);
        int (*crypto_fini)(struct drbg_state *drbg);

};

/*
 * Test-only input.  The crypto_drbg_*_test() wrappers in this header pass
 * testentropy->buf and testentropy->len to crypto_rng_set_entropy().
 */
struct drbg_test_data {
        struct drbg_string *testentropy; /* TEST PARAMETER: test entropy */
};

/* Seeding progress of a DRBG instance; stored in drbg_state.seeded. */
enum drbg_seed_state {
        /* no seed applied yet */
        DRBG_SEED_STATE_UNSEEDED,
        DRBG_SEED_STATE_PARTIAL, /* Seeded with !rng_is_initialized() */
        /* NOTE(review): presumably seeded once rng_is_initialized() -- confirm */
        DRBG_SEED_STATE_FULL,
};

/*
 * Runtime state of one DRBG instance.
 *
 * NOTE(review): the *buf members (Vbuf, Cbuf, scratchpadbuf,
 * outscratchpadbuf) look like the raw allocations backing the
 * correspondingly named working pointers -- confirm in the implementation.
 */
struct drbg_state {
        struct mutex drbg_mutex;        /* lock around DRBG */
        unsigned char *V;        /* internal state 10.1.1.1 1a) */
        unsigned char *Vbuf;
        /* hash: static value 10.1.1.1 1b) hmac / ctr: key */
        unsigned char *C;
        unsigned char *Cbuf;
        /* Number of RNG requests since last reseed -- 10.1.1.1 1c) */
        size_t reseed_ctr;
        /* NOTE(review): presumably the reseed_ctr value that triggers a reseed */
        size_t reseed_threshold;
         /* some memory the DRBG can use for its operation */
        unsigned char *scratchpad;
        unsigned char *scratchpadbuf;
        void *priv_data;        /* Cipher handle */

        struct crypto_skcipher *ctr_handle;        /* CTR mode cipher handle */
        struct skcipher_request *ctr_req;        /* CTR mode request handle */
        __u8 *outscratchpadbuf;                        /* CTR mode output scratchpad */
        __u8 *outscratchpad;                        /* CTR mode aligned outbuf */
        struct crypto_wait ctr_wait;                /* CTR mode async wait obj */
        struct scatterlist sg_in, sg_out;        /* CTR mode SGLs */

        enum drbg_seed_state seeded;                /* DRBG fully seeded? */
        /* NOTE(review): units not visible here; presumably jiffies -- confirm */
        unsigned long last_seed_time;
        bool pr;                /* Prediction resistance enabled? */
        bool fips_primed;        /* Continuous test primed? */
        unsigned char *prev;        /* FIPS 140-2 continuous test value */
        /* NOTE(review): presumably a jitter-entropy RNG handle -- confirm */
        struct crypto_rng *jent;
        /* operation table for this instance */
        const struct drbg_state_ops *d_ops;
        /* static parameters read by drbg_statelen() and friends */
        const struct drbg_core *core;
        /* NOTE(review): presumably test entropy for this instance -- confirm */
        struct drbg_string test_data;
};

/*
 * Return the state length recorded in @drbg's core description, or 0 when
 * @drbg or its core pointer is NULL.
 */
static inline __u8 drbg_statelen(struct drbg_state *drbg)
{
        if (!drbg || !drbg->core)
                return 0;

        return drbg->core->statelen;
}

/*
 * Return the output block size in bytes recorded in @drbg's core
 * description, or 0 when @drbg or its core pointer is NULL.
 */
static inline __u8 drbg_blocklen(struct drbg_state *drbg)
{
        if (!drbg || !drbg->core)
                return 0;

        return drbg->core->blocklen_bytes;
}

static inline __u8 drbg_keylen(struct drbg_state *drbg)
{
        if (drbg && drbg->core)
                return (drbg->core->statelen - drbg->core->blocklen_bytes);
        return 0;
}

/*
 * Maximum number of bytes a single generate request may ask for.
 * @drbg is not consulted.
 */
static inline size_t drbg_max_request_bytes(struct drbg_state *drbg)
{
        /*
         * The SP800-90A limit is 2**19 bits per request; expressed in
         * bytes that is 2**16.
         */
        const size_t max_bytes = (size_t)1 << 16;

        return max_bytes;
}

/*
 * Maximum length of an additional-information or personalization string.
 * @drbg is not consulted.
 */
static inline size_t drbg_max_addtl(struct drbg_state *drbg)
{
#if (__BITS_PER_LONG != 32)
        /* SP800-90A requires 2**35 bytes additional info str / pers str */
        return (1UL<<35);
#else
        /*
         * 2**35 does not fit into a 32-bit size_t.  SP800-90A allows a
         * smaller maximum, so SIZE_MAX - 1 is returned; that leaves one
         * larger value with which drbg_healthcheck_sanity can verify that
         * the limit is enforced.
         */
        return (SIZE_MAX - 1);
#endif
}

/*
 * Maximum number of requests served before a reseed is required.
 * @drbg is not consulted.
 */
static inline size_t drbg_max_requests(struct drbg_state *drbg)
{
        /*
         * SP800-90A allows up to 2**48 requests between reseeds; the
         * limit returned here is the much lower 2**20.
         */
        const size_t max_requests = (size_t)1 << 20;

        return max_requests;
}

/*
 * Wrapper around the kernel crypto API function crypto_rng_generate() that
 * lets the caller hand in the additional data as a drbg_string.
 *
 * @drng DRBG handle -- see crypto_rng_get_bytes
 * @outbuf output buffer -- see crypto_rng_get_bytes
 * @outlen length of output buffer -- see crypto_rng_get_bytes
 * @addtl additional information string; its buf and len members are passed
 *        on to crypto_rng_generate() and it must not be NULL
 *
 * return
 *        the return value of crypto_rng_generate()
 */
static inline int crypto_drbg_get_bytes_addtl(struct crypto_rng *drng,
                        unsigned char *outbuf, unsigned int outlen,
                        struct drbg_string *addtl)
{
        const unsigned char *addtl_buf = addtl->buf;
        size_t addtl_len = addtl->len;

        return crypto_rng_generate(drng, addtl_buf, addtl_len,
                                   outbuf, outlen);
}

/*
 * TEST code
 *
 * Wrapper around the kernel crypto API function crypto_rng_generate() that
 * first installs caller-supplied test entropy with crypto_rng_set_entropy()
 * and then generates output with the given additional data.
 *
 * @drng DRBG handle -- see crypto_rng_get_bytes
 * @outbuf output buffer -- see crypto_rng_get_bytes
 * @outlen length of output buffer -- see crypto_rng_get_bytes
 * @addtl additional information string; its buf and len members are passed
 *        on to crypto_rng_generate() and it must not be NULL
 * @test_data filled test data; test_data->testentropy must not be NULL
 *
 * return
 *        the return value of crypto_rng_generate()
 */
static inline int crypto_drbg_get_bytes_addtl_test(struct crypto_rng *drng,
                        unsigned char *outbuf, unsigned int outlen,
                        struct drbg_string *addtl,
                        struct drbg_test_data *test_data)
{
        struct drbg_string *entropy = test_data->testentropy;

        crypto_rng_set_entropy(drng, entropy->buf, entropy->len);

        return crypto_rng_generate(drng, addtl->buf, addtl->len,
                                   outbuf, outlen);
}

/*
 * TEST code
 *
 * Wrapper around the kernel crypto API function crypto_rng_reset() that
 * first installs caller-supplied test entropy with crypto_rng_set_entropy().
 *
 * @drng DRBG handle -- see crypto_rng_reset
 * @pers personalization string; its buf and len members are passed on to
 *       crypto_rng_reset() and it must not be NULL
 * @test_data filled test data; test_data->testentropy must not be NULL
 *
 * return
 *        the return value of crypto_rng_reset()
 */
static inline int crypto_drbg_reset_test(struct crypto_rng *drng,
                                         struct drbg_string *pers,
                                         struct drbg_test_data *test_data)
{
        struct drbg_string *entropy = test_data->testentropy;

        crypto_rng_set_entropy(drng, entropy->buf, entropy->len);

        return crypto_rng_reset(drng, pers->buf, pers->len);
}

/*
 * Bit values of type drbg_flag_t (the type of drbg_core.flags).
 * Bits 0-2 select the DRBG type, bits 3-5 the strength; the two masks
 * below are the OR of the respective group.
 */
/* DRBG type flags */
#define DRBG_CTR        ((drbg_flag_t)1<<0)
#define DRBG_HMAC        ((drbg_flag_t)1<<1)
#define DRBG_HASH        ((drbg_flag_t)1<<2)
#define DRBG_TYPE_MASK        (DRBG_CTR | DRBG_HMAC | DRBG_HASH)
/* DRBG strength flags */
#define DRBG_STRENGTH128        ((drbg_flag_t)1<<3)
#define DRBG_STRENGTH192        ((drbg_flag_t)1<<4)
#define DRBG_STRENGTH256        ((drbg_flag_t)1<<5)
#define DRBG_STRENGTH_MASK        (DRBG_STRENGTH128 | DRBG_STRENGTH192 | \
                                 DRBG_STRENGTH256)

/*
 * Prefix constants with the consecutive values 0x00 through 0x03.
 * NOTE(review): their users are not part of this header; presumably the
 * prefix bytes of the SP800-90A constructions -- confirm.
 */
enum drbg_prefixes {
        DRBG_PREFIX0 = 0x00,
        DRBG_PREFIX1,
        DRBG_PREFIX2,
        DRBG_PREFIX3
};

#endif /* _DRBG_H */








































































































































  265 



















  263 


  266 


  267 
  268 
  266 




  265 

  266 



  267 


  265 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *        Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *        Jay Schulist <jschlst@samba.org>
 *        Alexei Starovoitov <ast@plumgrid.com>
 *        Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/atomic.h>
#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <linux/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>
#include <net/inet_dscp.h>

#include "dev.h"

/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");

static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);

/*
 * Fetch a classic BPF program descriptor into @dst.
 *
 * @dst: kernel-side sock_fprog to fill in
 * @src: source buffer, read with copy_from_sockptr()
 * @len: size the caller supplied for @src
 *
 * In a compat syscall the source is laid out as struct compat_sock_fprog
 * and its two fields are widened into a zeroed @dst; otherwise @src is
 * copied into @dst as-is. @len must equal the size of the layout being
 * read.
 *
 * Returns 0 on success, -EINVAL on a size mismatch and -EFAULT when the
 * copy fails.
 */
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
        struct compat_sock_fprog compat;

        if (!in_compat_syscall()) {
                if (len != sizeof(*dst))
                        return -EINVAL;
                if (copy_from_sockptr(dst, src, sizeof(*dst)))
                        return -EFAULT;
                return 0;
        }

        if (len != sizeof(compat))
                return -EINVAL;
        if (copy_from_sockptr(&compat, src, sizeof(compat)))
                return -EFAULT;

        /* Clear first so nothing but the two copied fields is ever set. */
        memset(dst, 0, sizeof(*dst));
        dst->len = compat.len;
        dst->filter = compat_ptr(compat.filter);

        return 0;
}
EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);

/**
 *        sk_filter_trim_cap - run a packet through a socket filter
 *        @sk: sock associated with &sk_buff
 *        @skb: buffer to filter
 *        @cap: lower bound on the length the filter may trim the packet to
 *        @reason: set to the drop reason whenever a non-zero value is returned
 *
 * Socket level wrapper around the attached filter program. Before the
 * filter runs, the skb is rejected with -ENOMEM if it came from pfmemalloc
 * reserves and @sk is not a SOCK_MEMALLOC socket, and the cgroup ingress
 * programs and the security hook are consulted; a non-zero result from
 * either of those is returned unchanged.
 *
 * If a filter is attached, it is run with skb->sk temporarily pointing at
 * @sk. A result of 0 means the packet is tossed (-EPERM). Otherwise the skb
 * is trimmed to max(@cap, result) and the outcome of that trim is returned.
 *
 * Returns 0 if the packet should be accepted, a negative error otherwise.
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb,
                       unsigned int cap, enum skb_drop_reason *reason)
{
        struct sk_filter *flt;
        struct sock *orig_sk;
        unsigned int keep_len;
        int rc;

        /*
         * Memory taken from the pfmemalloc reserves may only be consumed
         * by SOCK_MEMALLOC sockets, since those are the ones helping to
         * free memory.
         */
        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
                *reason = SKB_DROP_REASON_PFMEMALLOC;
                return -ENOMEM;
        }

        rc = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
        if (rc) {
                *reason = SKB_DROP_REASON_SOCKET_FILTER;
                return rc;
        }

        rc = security_sock_rcv_skb(sk, skb);
        if (rc) {
                *reason = SKB_DROP_REASON_SECURITY_HOOK;
                return rc;
        }

        rcu_read_lock();
        flt = rcu_dereference(sk->sk_filter);
        if (!flt)
                goto unlock;        /* no filter attached: rc is still 0 */

        /* The program sees @sk as the owner; restore the old one after. */
        orig_sk = skb->sk;
        skb->sk = sk;
        keep_len = bpf_prog_run_save_cb(flt->prog, skb);
        skb->sk = orig_sk;

        if (keep_len)
                rc = pskb_trim(skb, max(cap, keep_len));
        else
                rc = -EPERM;

        if (rc)
                *reason = SKB_DROP_REASON_SOCKET_FILTER;
unlock:
        rcu_read_unlock();

        return rc;
}
EXPORT_SYMBOL(sk_filter_trim_cap);

/* Helper emitted for the classic BPF SKF_AD_PAY_OFFSET extension (see
 * convert_bpf_extensions()): simply forwards to skb_get_poff() and returns
 * its result.
 */
BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
        return skb_get_poff(skb);
}

/* Helper emitted for the classic BPF SKF_AD_NLATTR extension.
 *
 * Searches for a netlink attribute of type @x, starting at byte offset @a
 * into skb->data and covering the rest of the skb. Returns the attribute's
 * byte offset from skb->data when found. Returns 0 when nothing is found,
 * when the skb is non-linear, or when @a leaves no room for an attribute
 * header.
 */
BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
        struct nlattr *attr;

        /* Order matters: the length test guards the unsigned subtraction. */
        if (skb_is_nonlinear(skb) ||
            skb->len < sizeof(struct nlattr) ||
            a > skb->len - sizeof(struct nlattr))
                return 0;

        attr = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
        if (!attr)
                return 0;

        return (void *) attr - (void *) skb->data;
}

/* Helper emitted for the classic BPF SKF_AD_NLATTR_NEST extension.
 *
 * Treats the bytes at offset @a in skb->data as a netlink attribute and
 * searches inside it for a nested attribute of type @x. Returns the nested
 * attribute's byte offset from skb->data when found. Returns 0 when nothing
 * is found, when the skb is non-linear, when @a leaves no room for an
 * attribute header, or when the outer attribute fails nla_ok().
 */
BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
        struct nlattr *outer, *inner;

        /* Order matters: the length test guards the unsigned subtraction. */
        if (skb_is_nonlinear(skb) ||
            skb->len < sizeof(struct nlattr) ||
            a > skb->len - sizeof(struct nlattr))
                return 0;

        outer = (struct nlattr *) &skb->data[a];
        if (!nla_ok(outer, skb->len - a))
                return 0;

        inner = nla_find_nested(outer, x);
        if (!inner)
                return 0;

        return (void *) inner - (void *) skb->data;
}

/* Map a classic BPF load offset to an offset usable by the load helpers.
 *
 * Non-negative offsets are returned unchanged. Negative offsets at or above
 * SKF_NET_OFF are rebased onto skb_network_offset(). Those below that but at
 * or above SKF_LL_OFF are rebased onto skb_mac_offset(), provided the MAC
 * header was set. Anything else yields INT_MIN, which callers treat as an
 * invalid offset.
 */
static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
{
        if (likely(offset >= 0))
                return offset;

        if (offset < SKF_NET_OFF) {
                /* Link-layer window, only usable with a MAC header. */
                if (offset < SKF_LL_OFF || !skb_mac_header_was_set(skb))
                        return INT_MIN;

                return offset - SKF_LL_OFF + skb_mac_offset(skb);
        }

        return offset - SKF_NET_OFF + skb_network_offset(skb);
}

/* Load one byte of packet data at @offset.
 *
 * @data and @headlen describe a directly readable region: when the byte
 * lies within the first @headlen bytes it is read straight from @data,
 * otherwise it is fetched with skb_copy_bits(). @offset is first passed
 * through bpf_skb_load_helper_convert_offset(). Returns the byte value, or
 * -EFAULT when the offset is invalid or the copy fails.
 */
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        u8 val;

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Signed comparison on purpose: headlen - offset may be negative. */
        if (headlen - offset >= (int)sizeof(val))
                return *(u8 *)(data + offset);

        if (skb_copy_bits(skb, offset, &val, sizeof(val)))
                return -EFAULT;

        return val;
}

/* Variant of bpf_skb_load_helper_8() that takes only @skb and @offset and
 * derives the data pointer and head length (skb->len - skb->data_len) from
 * @skb itself on each call.
 */
BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const void *data = skb->data;
        int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_8(skb, data, headlen, offset);
}

/* Load a big-endian 16-bit value of packet data at @offset.
 *
 * @data and @headlen describe a directly readable region: when both bytes
 * lie within the first @headlen bytes they are read (unaligned-safe) from
 * @data, otherwise they are fetched with skb_copy_bits(). @offset is first
 * passed through bpf_skb_load_helper_convert_offset(). Returns the value in
 * CPU byte order, or -EFAULT when the offset is invalid or the copy fails.
 */
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        __be16 raw;

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Signed comparison on purpose: headlen - offset may be negative. */
        if (headlen - offset >= (int)sizeof(raw))
                return get_unaligned_be16(data + offset);

        if (skb_copy_bits(skb, offset, &raw, sizeof(raw)))
                return -EFAULT;

        return be16_to_cpu(raw);
}

/* Variant of bpf_skb_load_helper_16() that takes only @skb and @offset and
 * derives the data pointer and head length (skb->len - skb->data_len) from
 * @skb itself on each call.
 */
BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const void *data = skb->data;
        int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_16(skb, data, headlen, offset);
}

/* Load a big-endian 32-bit value of packet data at @offset.
 *
 * @data and @headlen describe a directly readable region: when all four
 * bytes lie within the first @headlen bytes they are read (unaligned-safe)
 * from @data, otherwise they are fetched with skb_copy_bits(). @offset is
 * first passed through bpf_skb_load_helper_convert_offset(). Returns the
 * value in CPU byte order, or -EFAULT when the offset is invalid or the
 * copy fails.
 */
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        __be32 raw;

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Signed comparison on purpose: headlen - offset may be negative. */
        if (headlen - offset >= (int)sizeof(raw))
                return get_unaligned_be32(data + offset);

        if (skb_copy_bits(skb, offset, &raw, sizeof(raw)))
                return -EFAULT;

        return be32_to_cpu(raw);
}

/* Variant of bpf_skb_load_helper_32() that takes only @skb and @offset and
 * derives the data pointer and head length (skb->len - skb->data_len) from
 * @skb itself on each call.
 */
BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const void *data = skb->data;
        int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_32(skb, data, headlen, offset);
}

/* Emit the eBPF instructions that read one sk_buff field into a register.
 *
 * @skb_field: SKF_AD_* selector of the field to read
 * @dst_reg:   register receiving the value
 * @src_reg:   register holding the sk_buff pointer
 * @insn_buf:  output buffer; the widest case here writes three instructions
 *
 * Returns the number of instructions written, which is 0 for a selector
 * that is not handled here.
 */
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
                              struct bpf_insn *insn_buf)
{
        u32 n = 0;

        switch (skb_field) {
        case SKF_AD_MARK:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);

                insn_buf[n++] = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                            offsetof(struct sk_buff, mark));
                break;

        case SKF_AD_PKTTYPE:
                /* Load the byte holding pkt_type, then isolate the field. */
                insn_buf[n++] = BPF_LDX_MEM(BPF_B, dst_reg, src_reg,
                                            PKT_TYPE_OFFSET);
                insn_buf[n++] = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
                insn_buf[n++] = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
                break;

        case SKF_AD_QUEUE:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);

                insn_buf[n++] = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
                                            offsetof(struct sk_buff,
                                                     queue_mapping));
                break;

        case SKF_AD_VLAN_TAG:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);

                /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
                insn_buf[n++] = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
                                            offsetof(struct sk_buff, vlan_tci));
                break;

        case SKF_AD_VLAN_TAG_PRESENT:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);

                /* dst_reg = vlan_all; if it is non-zero, dst_reg = 1 */
                insn_buf[n++] = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                            offsetof(struct sk_buff, vlan_all));
                insn_buf[n++] = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
                insn_buf[n++] = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
                break;
        }

        return n;
}

/* Translate a classic BPF ancillary ("SKF_AD_*") load into eBPF.
 *
 * @fp:    classic instruction; only its k field is inspected here
 * @insnp: in: where to start emitting, out: the LAST instruction emitted
 *
 * Emits the instruction sequence for the extension selected by fp->k.
 * Returns true on success with *insnp pointing at the last instruction
 * written (not one past it). Returns false, leaving *insnp unchanged, when
 * fp->k is not one of the extensions handled here.
 */
static bool convert_bpf_extensions(struct sock_filter *fp,
                                   struct bpf_insn **insnp)
{
        struct bpf_insn *insn = *insnp;
        u32 cnt;

        switch (fp->k) {
        case SKF_AD_OFF + SKF_AD_PROTOCOL:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);

                /* A = *(u16 *) (CTX + offsetof(protocol)) */
                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
                                      offsetof(struct sk_buff, protocol));
                /* A = ntohs(A) [emitting a nop or swap16] */
                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
                break;

        case SKF_AD_OFF + SKF_AD_PKTTYPE:
                cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
                /* Step to the last of the cnt instructions just written. */
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_IFINDEX:
        case SKF_AD_OFF + SKF_AD_HATYPE:
                BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
                BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);

                /* tmp = skb->dev */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                                      BPF_REG_TMP, BPF_REG_CTX,
                                      offsetof(struct sk_buff, dev));
                /* if (tmp != 0) goto pc + 1 */
                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
                /* skb->dev is NULL: leave the program instead of loading. */
                *insn++ = BPF_EXIT_INSN();
                if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
                                            offsetof(struct net_device, ifindex));
                else
                        *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
                                            offsetof(struct net_device, type));
                break;

        case SKF_AD_OFF + SKF_AD_MARK:
                cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_RXHASH:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);

                /* A = *(u32 *) (CTX + offsetof(hash)) */
                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
                                    offsetof(struct sk_buff, hash));
                break;

        case SKF_AD_OFF + SKF_AD_QUEUE:
                cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TAG:
                cnt = convert_skb_access(SKF_AD_VLAN_TAG,
                                         BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
                cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
                                         BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TPID:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);

                /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
                                      offsetof(struct sk_buff, vlan_proto));
                /* A = ntohs(A) [emitting a nop or swap16] */
                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
                break;

        case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
        case SKF_AD_OFF + SKF_AD_NLATTR:
        case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
        case SKF_AD_OFF + SKF_AD_CPU:
        case SKF_AD_OFF + SKF_AD_RANDOM:
                /* The same three-argument setup is emitted for all five
                 * helpers, regardless of which one is called.
                 */
                /* arg1 = CTX */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
                /* arg2 = A */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
                /* arg3 = X */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
                /* Emit call(arg1=CTX, arg2=A, arg3=X) */
                switch (fp->k) {
                case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
                        break;
                case SKF_AD_OFF + SKF_AD_NLATTR:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
                        break;
                case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
                        break;
                case SKF_AD_OFF + SKF_AD_CPU:
                        *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
                        break;
                case SKF_AD_OFF + SKF_AD_RANDOM:
                        *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
                        /* Called here, at conversion time; it is not part
                         * of the emitted program.
                         */
                        bpf_user_rnd_init_once();
                        break;
                }
                break;

        case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
                /* A ^= X */
                *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
                break;

        default:
                /* This is just a dummy call to avoid letting the compiler
                 * evict __bpf_call_base() as an optimization. Placed here
                 * where no-one bothers.
                 */
                BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
                return false;
        }

        /* Hand back the last instruction written; the caller steps past it. */
        *insnp = insn;
        return true;
}

/* Translate a classic BPF packet load (LD ABS / LD IND) into eBPF.
 *
 * @fp:    classic load instruction; code selects size and mode, k the offset
 * @insnp: in: where to start emitting, out: the LAST instruction emitted
 *
 * The emitted code reads from BPF_REG_D and BPF_REG_H, which the prologue
 * in bpf_convert_filter() fills with skb->data and the head length
 * (skb->len - skb->data_len). It consists of an optional inline fast path
 * followed by a slow path that calls bpf_skb_load_helper_{8,16,32}. If the
 * helper result in A is negative, the program zeroes A and exits.
 *
 * Returns true with *insnp pointing at the last instruction written.
 * Returns false for an unsupported load size; in that case *insnp is left
 * unchanged, although instructions have already been written to the buffer.
 */
static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
{
        const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
        int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
        bool endian = BPF_SIZE(fp->code) == BPF_H ||
                      BPF_SIZE(fp->code) == BPF_W;
        bool indirect = BPF_MODE(fp->code) == BPF_IND;
        const int ip_align = NET_IP_ALIGN;
        struct bpf_insn *insn = *insnp;
        int offset = fp->k;

        /* The inline fast path is only emitted for absolute loads with a
         * non-negative offset.
         * NOTE(review): '%' binds tighter than '+', so the last test below
         * parses as offset + (ip_align % size) == 0 rather than
         * (offset + ip_align) % size == 0 -- confirm which was intended.
         */
        if (!indirect &&
            ((unaligned_ok && offset >= 0) ||
             (!unaligned_ok && offset >= 0 &&
              offset + ip_align >= 0 &&
              offset + ip_align % size == 0))) {
                /* The LDX offset field is 16 bits; larger offsets need an
                 * explicit add into a temporary pointer.
                 */
                bool ldx_off_ok = offset <= S16_MAX;

                /* tmp = headlen - offset */
                *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
                if (offset)
                        *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
                /* Fewer than 'size' bytes left: skip the rest of the fast
                 * path (load, optional byte swap, trailing jump) and fall
                 * into the slow path.
                 */
                *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
                                      size, 2 + endian + (!ldx_off_ok * 2));
                if (ldx_off_ok) {
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
                                              BPF_REG_D, offset);
                } else {
                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
                                              BPF_REG_TMP, 0);
                }
                if (endian)
                        *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
                /* Jump over the 8 slow-path instructions emitted below for
                 * the non-indirect case.
                 */
                *insn++ = BPF_JMP_A(8);
        }

        /* Slow path: helper(skb, data, headlen, offset). */
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
        if (!indirect) {
                *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
        } else {
                /* Indirect load: offset = X + k */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
                if (fp->k)
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
        }

        switch (BPF_SIZE(fp->code)) {
        case BPF_B:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
                break;
        case BPF_H:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
                break;
        case BPF_W:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
                break;
        default:
                return false;
        }

        /* if (A >= 0) skip the next two insns; else A = 0 and exit. */
        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
        *insn   = BPF_EXIT_INSN();

        *insnp = insn;
        return true;
}

/**
 *        bpf_convert_filter - convert filter program
 *        @prog: the user passed filter program
 *        @len: the length of the user passed filter program
 *        @new_prog: allocated 'struct bpf_prog' or NULL
 *        @new_len: pointer to store length of converted program
 *        @seen_ld_abs: bool whether we've seen ld_abs/ind
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 * style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remapping:
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
 *
 * Return: 0 on success. -EINVAL if @len is out of range, an instruction is
 * unknown, a jump target or offset is out of range, or the emitted length
 * does not settle. -ENOMEM if the jump address table cannot be allocated.
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
                              struct bpf_prog *new_prog, int *new_len,
                              bool *seen_ld_abs)
{
        int new_flen = 0, pass = 0, target, i, stack_off;
        struct bpf_insn *new_insn, *first_insn = NULL;
        struct sock_filter *fp;
        int *addrs = NULL;
        u8 bpf_src;

        BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

        /* Reject empty programs and anything above the classic BPF limit. */
        if (len <= 0 || len > BPF_MAXINSNS)
                return -EINVAL;

        if (new_prog) {
                first_insn = new_prog->insnsi;
                /* addrs[i] is the index in the new program of the first
                 * instruction emitted for classic instruction i; it is what
                 * jump offsets are computed from. Not needed when only the
                 * length is being calculated.
                 */
                addrs = kcalloc(len, sizeof(*addrs),
                                GFP_KERNEL | __GFP_NOWARN);
                if (!addrs)
                        return -ENOMEM;
        }

        /* Each pass re-emits the whole program from the start. */
do_pass:
        new_insn = first_insn;
        fp = prog;

        /* Classic BPF related prologue emission. */
        if (new_prog) {
                /* Classic BPF expects A and X to be reset first. These need
                 * to be guaranteed to be the first two instructions.
                 */
                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

                /* All programs must keep CTX in callee saved BPF_REG_CTX.
                 * In eBPF case it's done by the compiler, here we need to
                 * do this ourself. Initial CTX is present in BPF_REG_ARG1.
                 */
                *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
                if (*seen_ld_abs) {
                        /* For packet access in classic BPF, cache skb->data
                         * in callee-saved BPF R8 and skb->len - skb->data_len
                         * (headlen) in BPF R9. Since classic BPF is read-only
                         * on CTX, we only need to cache it once.
                         */
                        *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                                  BPF_REG_D, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, data));
                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, len));
                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, data_len));
                        *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
                }
        } else {
                /* Length-only run: nothing is written and new_insn serves
                 * purely as a counter. Account for the three unconditional
                 * prologue instructions here; the four packet-access ones
                 * are added at the end, once *seen_ld_abs is known.
                 */
                new_insn += 3;
        }

        for (i = 0; i < len; fp++, i++) {
                /* Scratch space for the expansion of one classic insn. Every
                 * case below leaves 'insn' on the last instruction it wrote;
                 * the common tail after the switch steps past it.
                 */
                struct bpf_insn tmp_insns[32] = { };
                struct bpf_insn *insn = tmp_insns;

                if (addrs)
                        addrs[i] = new_insn - first_insn;

                switch (fp->code) {
                /* All arithmetic insns and skb loads map as-is. */
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_MOD | BPF_X:
                case BPF_ALU | BPF_MOD | BPF_K:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_ABS | BPF_W:
                case BPF_LD | BPF_ABS | BPF_H:
                case BPF_LD | BPF_ABS | BPF_B:
                case BPF_LD | BPF_IND | BPF_W:
                case BPF_LD | BPF_IND | BPF_H:
                case BPF_LD | BPF_IND | BPF_B:
                        /* Check for overloaded BPF extension and
                         * directly convert it if found, otherwise
                         * just move on with mapping.
                         */
                        if (BPF_CLASS(fp->code) == BPF_LD &&
                            BPF_MODE(fp->code) == BPF_ABS &&
                            convert_bpf_extensions(fp, &insn))
                                break;
                        if (BPF_CLASS(fp->code) == BPF_LD &&
                            convert_bpf_ld_abs(fp, &insn)) {
                                *seen_ld_abs = true;
                                break;
                        }

                        if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
                            fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
                                *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
                                /* Error with exception code on div/mod by 0.
                                 * For cBPF programs, this was always return 0.
                                 */
                                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
                                *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
                                *insn++ = BPF_EXIT_INSN();
                        }

                        *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
                        break;

                /* Jump transformation cannot use BPF block macros
                 * everywhere as offset calculation and target updates
                 * require a bit more work than the rest, i.e. jump
                 * opcodes map as-is, but offsets need adjustment.
                 */

#define BPF_EMIT_JMP                                                        \
        do {                                                                \
                const s32 off_min = S16_MIN, off_max = S16_MAX;                \
                s32 off;                                                \
                                                                        \
                if (target >= len || target < 0)                        \
                        goto err;                                        \
                off = addrs ? addrs[target] - addrs[i] - 1 : 0;                \
                /* Adjust pc relative offset for 2nd or 3rd insn. */        \
                off -= insn - tmp_insns;                                \
                /* Reject anything not fitting into insn->off. */        \
                if (off < off_min || off > off_max)                        \
                        goto err;                                        \
                insn->off = off;                                        \
        } while (0)

                case BPF_JMP | BPF_JA:
                        target = i + fp->k + 1;
                        insn->code = fp->code;
                        BPF_EMIT_JMP;
                        break;

                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                        if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
                                /* BPF immediates are signed, zero extend
                                 * immediate into tmp register and use it
                                 * in compare insn.
                                 */
                                *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

                                insn->dst_reg = BPF_REG_A;
                                insn->src_reg = BPF_REG_TMP;
                                bpf_src = BPF_X;
                        } else {
                                insn->dst_reg = BPF_REG_A;
                                insn->imm = fp->k;
                                bpf_src = BPF_SRC(fp->code);
                                insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
                        }

                        /* Common case where 'jump_false' is next insn. */
                        if (fp->jf == 0) {
                                insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
                                target = i + fp->jt + 1;
                                BPF_EMIT_JMP;
                                break;
                        }

                        /* Convert some jumps when 'jump_true' is next insn. */
                        if (fp->jt == 0) {
                                /* Invert the condition so that a single
                                 * jump to the 'jump_false' target suffices.
                                 * JSET has no inverse here and takes the
                                 * two-insn form below.
                                 */
                                switch (BPF_OP(fp->code)) {
                                case BPF_JEQ:
                                        insn->code = BPF_JMP | BPF_JNE | bpf_src;
                                        break;
                                case BPF_JGT:
                                        insn->code = BPF_JMP | BPF_JLE | bpf_src;
                                        break;
                                case BPF_JGE:
                                        insn->code = BPF_JMP | BPF_JLT | bpf_src;
                                        break;
                                default:
                                        goto jmp_rest;
                                }

                                target = i + fp->jf + 1;
                                BPF_EMIT_JMP;
                                break;
                        }
jmp_rest:
                        /* Other jumps are mapped into two insns: Jxx and JA. */
                        target = i + fp->jt + 1;
                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
                        BPF_EMIT_JMP;
                        insn++;

                        insn->code = BPF_JMP | BPF_JA;
                        target = i + fp->jf + 1;
                        BPF_EMIT_JMP;
                        break;

                /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
                case BPF_LDX | BPF_MSH | BPF_B: {
                        struct sock_filter tmp = {
                                .code        = BPF_LD | BPF_ABS | BPF_B,
                                .k        = fp->k,
                        };

                        *seen_ld_abs = true;

                        /* X = A */
                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        /* A = BPF_R0 = *(u8 *) (skb->data + K) */
                        convert_bpf_ld_abs(&tmp, &insn);
                        /* Step past the last insn convert_bpf_ld_abs() wrote. */
                        insn++;
                        /* A &= 0xf */
                        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
                        /* A <<= 2 */
                        *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
                        /* tmp = X */
                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
                        /* X = A */
                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        /* A = tmp */
                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
                        break;
                }
                /* RET_K is remapped into 2 insns. RET_A case doesn't need an
                 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
                 */
                case BPF_RET | BPF_A:
                case BPF_RET | BPF_K:
                        if (BPF_RVAL(fp->code) == BPF_K)
                                *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
                                                        0, fp->k);
                        *insn = BPF_EXIT_INSN();
                        break;

                /* Store to stack. */
                case BPF_ST:
                case BPF_STX:
                        /* Scratch cell k lives at FP - 4 * (k + 1). */
                        stack_off = fp->k * 4  + 4;
                        *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
                                            BPF_ST ? BPF_REG_A : BPF_REG_X,
                                            -stack_off);
                        /* check_load_and_stores() verifies that classic BPF can
                         * load from stack only after write, so tracking
                         * stack_depth for ST|STX insns is enough
                         */
                        if (new_prog && new_prog->aux->stack_depth < stack_off)
                                new_prog->aux->stack_depth = stack_off;
                        break;

                /* Load from stack. */
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                        stack_off = fp->k * 4  + 4;
                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
                                            BPF_REG_A : BPF_REG_X, BPF_REG_FP,
                                            -stack_off);
                        break;

                /* A = K or X = K */
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                        *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
                                              BPF_REG_A : BPF_REG_X, fp->k);
                        break;

                /* X = A */
                case BPF_MISC | BPF_TAX:
                        *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        break;

                /* A = X */
                case BPF_MISC | BPF_TXA:
                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
                        break;

                /* A = skb->len or X = skb->len */
                case BPF_LD | BPF_W | BPF_LEN:
                case BPF_LDX | BPF_W | BPF_LEN:
                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
                                            BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
                                            offsetof(struct sk_buff, len));
                        break;

                /* Access seccomp_data fields. */
                case BPF_LDX | BPF_ABS | BPF_W:
                        /* A = *(u32 *) (ctx + K) */
                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
                        break;

                /* Unknown instruction. */
                default:
                        goto err;
                }

                /* Step past the last emitted insn; insn - tmp_insns is now
                 * the number of instructions this classic insn expanded to.
                 */
                insn++;
                if (new_prog)
                        memcpy(new_insn, tmp_insns,
                               sizeof(*insn) * (insn - tmp_insns));
                new_insn += insn - tmp_insns;
        }

        if (!new_prog) {
                /* Only calculating new length. */
                *new_len = new_insn - first_insn;
                if (*seen_ld_abs)
                        *new_len += 4; /* Prologue bits. */
                return 0;
        }

        /* Jump offsets are derived from the addrs[] values of the previous
         * pass, so repeat until a pass emits the same length as the one
         * before it. Give up if the length still changes on the third pass.
         */
        pass++;
        if (new_flen != new_insn - first_insn) {
                new_flen = new_insn - first_insn;
                if (pass > 2)
                        goto err;
                goto do_pass;
        }

        kfree(addrs);
        /* The caller-supplied *new_len (from the length-only run described
         * above) must match what was actually emitted.
         */
        BUG_ON(*new_len != new_flen);
        return 0;
err:
        kfree(addrs);
        return -EINVAL;
}

/* Security:
 *
 * The mem[] scratch array is not cleared for each packet going through
 * __bpf_prog_run(), so a filter loaded by the user must never read a cell
 * that it has not written beforehand. This walks the program once, tracking
 * one "written" bit per cell and propagating it along every branch, to make
 * sure a malicious user cannot abuse uninitialized cells.
 *
 * Returns 0 if every scratch load is preceded by a store on all paths
 * leading to it, -EINVAL if one is not, or -ENOMEM if the per-instruction
 * mask table cannot be allocated.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
        const struct sock_filter *cur;
        u16 *entry_mask;        /* cells known written on entry to each insn */
        u16 written = 0;        /* one bit per cell, 16 cells */
        int err = 0;
        int idx;

        BUILD_BUG_ON(BPF_MEMWORDS > 16);

        entry_mask = kmalloc_array(flen, sizeof(*entry_mask), GFP_KERNEL);
        if (!entry_mask)
                return -ENOMEM;

        /* Start from "everything written"; jumps narrow their targets. */
        memset(entry_mask, 0xff, flen * sizeof(*entry_mask));

        for (idx = 0; idx < flen; idx++) {
                cur = &filter[idx];
                written &= entry_mask[idx];

                switch (cur->code) {
                case BPF_ST:
                case BPF_STX:
                        written |= (1 << cur->k);
                        break;
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                        if (!(written & (1 << cur->k))) {
                                err = -EINVAL;
                                goto out;
                        }
                        break;
                case BPF_JMP | BPF_JA:
                        /* The target inherits at most what is valid here. */
                        entry_mask[idx + 1 + cur->k] &= written;
                        written = ~0;
                        break;
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        /* Both targets inherit at most what is valid here. */
                        entry_mask[idx + 1 + cur->jt] &= written;
                        entry_mask[idx + 1 + cur->jf] &= written;
                        written = ~0;
                        break;
                }
        }
out:
        kfree(entry_mask);
        return err;
}

/* Tell whether @code_to_probe is one of the classic BPF opcodes listed in
 * the table below. Opcodes beyond the end of the table, and slots that the
 * table leaves unset, are reported as not allowed.
 */
static bool chk_code_allowed(u16 code_to_probe)
{
        static const bool allowed[] = {
                /* 32 bit ALU operations */
                [BPF_ALU | BPF_ADD | BPF_K] = true,
                [BPF_ALU | BPF_ADD | BPF_X] = true,
                [BPF_ALU | BPF_SUB | BPF_K] = true,
                [BPF_ALU | BPF_SUB | BPF_X] = true,
                [BPF_ALU | BPF_MUL | BPF_K] = true,
                [BPF_ALU | BPF_MUL | BPF_X] = true,
                [BPF_ALU | BPF_DIV | BPF_K] = true,
                [BPF_ALU | BPF_DIV | BPF_X] = true,
                [BPF_ALU | BPF_MOD | BPF_K] = true,
                [BPF_ALU | BPF_MOD | BPF_X] = true,
                [BPF_ALU | BPF_AND | BPF_K] = true,
                [BPF_ALU | BPF_AND | BPF_X] = true,
                [BPF_ALU | BPF_OR | BPF_K] = true,
                [BPF_ALU | BPF_OR | BPF_X] = true,
                [BPF_ALU | BPF_XOR | BPF_K] = true,
                [BPF_ALU | BPF_XOR | BPF_X] = true,
                [BPF_ALU | BPF_LSH | BPF_K] = true,
                [BPF_ALU | BPF_LSH | BPF_X] = true,
                [BPF_ALU | BPF_RSH | BPF_K] = true,
                [BPF_ALU | BPF_RSH | BPF_X] = true,
                [BPF_ALU | BPF_NEG] = true,
                /* Load instructions */
                [BPF_LD | BPF_W | BPF_ABS] = true,
                [BPF_LD | BPF_H | BPF_ABS] = true,
                [BPF_LD | BPF_B | BPF_ABS] = true,
                [BPF_LD | BPF_W | BPF_LEN] = true,
                [BPF_LD | BPF_W | BPF_IND] = true,
                [BPF_LD | BPF_H | BPF_IND] = true,
                [BPF_LD | BPF_B | BPF_IND] = true,
                [BPF_LD | BPF_IMM] = true,
                [BPF_LD | BPF_MEM] = true,
                [BPF_LDX | BPF_W | BPF_LEN] = true,
                [BPF_LDX | BPF_B | BPF_MSH] = true,
                [BPF_LDX | BPF_IMM] = true,
                [BPF_LDX | BPF_MEM] = true,
                /* Store instructions */
                [BPF_ST] = true,
                [BPF_STX] = true,
                /* Misc instructions */
                [BPF_MISC | BPF_TAX] = true,
                [BPF_MISC | BPF_TXA] = true,
                /* Return instructions */
                [BPF_RET | BPF_K] = true,
                [BPF_RET | BPF_A] = true,
                /* Jump instructions */
                [BPF_JMP | BPF_JA] = true,
                [BPF_JMP | BPF_JEQ | BPF_K] = true,
                [BPF_JMP | BPF_JEQ | BPF_X] = true,
                [BPF_JMP | BPF_JGE | BPF_K] = true,
                [BPF_JMP | BPF_JGE | BPF_X] = true,
                [BPF_JMP | BPF_JGT | BPF_K] = true,
                [BPF_JMP | BPF_JGT | BPF_X] = true,
                [BPF_JMP | BPF_JSET | BPF_K] = true,
                [BPF_JMP | BPF_JSET | BPF_X] = true,
        };

        /* Anything past the last listed opcode cannot be in the table. */
        if (code_to_probe < ARRAY_SIZE(allowed))
                return allowed[code_to_probe];

        return false;
}

/* Cheap sanity check done before anything is allocated: the filter pointer
 * must be non-NULL and the instruction count must lie in
 * 1..BPF_MAXINSNS inclusive.
 */
static bool bpf_check_basics_ok(const struct sock_filter *filter,
                                unsigned int flen)
{
        return filter != NULL && flen != 0 && flen <= BPF_MAXINSNS;
}

/**
 *        bpf_check_classic - verify socket filter code
 *        @filter: filter to verify
 *        @flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int bpf_check_classic(const struct sock_filter *filter,
                             unsigned int flen)
{
        bool anc_found;
        int pc;

        /* Check the filter code now */
        for (pc = 0; pc < flen; pc++) {
                const struct sock_filter *ftest = &filter[pc];

                /* May we actually operate on this code? */
                if (!chk_code_allowed(ftest->code))
                        return -EINVAL;

                /* Some instructions need special checks */
                switch (ftest->code) {
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_MOD | BPF_K:
                        /* Check for division by zero */
                        if (ftest->k == 0)
                                return -EINVAL;
                        break;
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_K:
                        if (ftest->k >= 32)
                                return -EINVAL;
                        break;
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                        /* Check for invalid memory addresses */
                        if (ftest->k >= BPF_MEMWORDS)
                                return -EINVAL;
                        break;
                case BPF_JMP | BPF_JA:
                        /* Note, the large ftest->k might cause loops.
                         * Compare this with conditional jumps below,
                         * where offsets are limited. --ANK (981016)
                         */
                        if (ftest->k >= (unsigned int)(flen - pc - 1))
                                return -EINVAL;
                        break;
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        /* Both conditionals must be safe */
                        if (pc + ftest->jt + 1 >= flen ||
                            pc + ftest->jf + 1 >= flen)
                                return -EINVAL;
                        break;
                case BPF_LD | BPF_W | BPF_ABS:
                case BPF_LD | BPF_H | BPF_ABS:
                case BPF_LD | BPF_B | BPF_ABS:
                        anc_found = false;
                        if (bpf_anc_helper(ftest) & BPF_ANC)
                                anc_found = true;
                        /* Ancillary operation unknown or unsupported */
                        if (anc_found == false && ftest->k >= SKF_AD_OFF)
                                return -EINVAL;
                }
        }

        /* Last instruction must be a RET code */
        switch (filter[flen - 1].code) {
        case BPF_RET | BPF_K:
        case BPF_RET | BPF_A:
                return check_load_and_stores(filter, flen);
        }

        return -EINVAL;
}

static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
                                      const struct sock_fprog *fprog)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct sock_fprog_kern *fkprog;

        fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
        if (!fp->orig_prog)
                return -ENOMEM;

        fkprog = fp->orig_prog;
        fkprog->len = fprog->len;

        fkprog->filter = kmemdup(fp->insns, fsize,
                                 GFP_KERNEL | __GFP_NOWARN);
        if (!fkprog->filter) {
                kfree(fp->orig_prog);
                return -ENOMEM;
        }

        return 0;
}

/* Free the saved classic program hanging off fp->orig_prog, if there is
 * one: first the instruction copy, then the descriptor itself.
 */
static void bpf_release_orig_filter(struct bpf_prog *fp)
{
        struct sock_fprog_kern *saved = fp->orig_prog;

        if (!saved)
                return;

        kfree(saved->filter);
        kfree(saved);
}

/* Release @prog according to its type: BPF_PROG_TYPE_SOCKET_FILTER
 * programs are dropped with bpf_prog_put(); every other type has its
 * saved original filter released and is then freed with bpf_prog_free().
 */
static void __bpf_prog_release(struct bpf_prog *prog)
{
        if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
                bpf_release_orig_filter(prog);
                bpf_prog_free(prog);
                return;
        }

        bpf_prog_put(prog);
}

/* Release the program attached to @fp, then free the sk_filter itself. */
static void __sk_filter_release(struct sk_filter *fp)
{
        struct bpf_prog *prog = fp->prog;

        __bpf_prog_release(prog);
        kfree(fp);
}

/**
 *        sk_filter_release_rcu - Release a socket filter by rcu_head
 *        @rcu: rcu_head embedded in the sk_filter to free
 *
 *        Callback handed to call_rcu(); recovers the enclosing sk_filter
 *        from @rcu and releases it.
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
        __sk_filter_release(container_of(rcu, struct sk_filter, rcu));
}

/**
 *        sk_filter_release - release a socket filter
 *        @fp: filter to remove
 *
 *        Drop one reference on @fp. When that was the last reference, the
 *        actual release is queued through call_rcu().
 */
static void sk_filter_release(struct sk_filter *fp)
{
        if (!refcount_dec_and_test(&fp->refcnt))
                return;

        call_rcu(&fp->rcu, sk_filter_release_rcu);
}

/* Subtract the size of @fp's program from the socket's sk_omem_alloc
 * counter and drop a reference on the filter. The size is read before the
 * reference is dropped.
 */
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
        atomic_sub(bpf_prog_size(fp->prog->len), &sk->sk_omem_alloc);
        sk_filter_release(fp);
}

/* Try to charge the size of the filter's program to the socket's
 * sk_omem_alloc counter, bounded by the per-netns sysctl_optmem_max.
 * Returns true if the charge was applied, false if it does not fit.
 */
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
        int limit = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
        u32 size = bpf_prog_size(fp->prog->len);

        /* same check as in sock_kmalloc() */
        if (size > limit)
                return false;
        if (atomic_read(&sk->sk_omem_alloc) + size >= limit)
                return false;

        atomic_add(size, &sk->sk_omem_alloc);
        return true;
}

/* Take a reference on @fp and charge it to @sk. Returns false without
 * side effects left behind if the refcount had already reached zero or if
 * the charge does not fit; in the latter case the reference is dropped
 * again.
 */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
        if (!refcount_inc_not_zero(&fp->refcnt))
                return false;

        if (__sk_filter_charge(sk, fp))
                return true;

        sk_filter_release(fp);
        return false;
}

/* Convert the classic program stored in fp->insns into the bpf_insn
 * representation, in two passes of bpf_convert_filter(): the first only
 * computes the new length, the second writes the converted instructions
 * into the (reallocated) program. The result is then passed through
 * bpf_prog_select_runtime().
 *
 * Returns the resulting program, which may be a different pointer than
 * the one passed in because of the reallocation. On any failure the
 * program is released via __bpf_prog_release() and an ERR_PTR() carrying
 * the error is returned, so the caller must not touch @fp afterwards.
 */
static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
        struct sock_filter *old_prog;
        struct bpf_prog *old_fp;
        int err, new_len, old_len = fp->len;
        bool seen_ld_abs = false;

        /* We are free to overwrite insns et al right here as it won't be used at
         * this point in time anymore internally after the migration to the eBPF
         * instruction representation.
         */
        BUILD_BUG_ON(sizeof(struct sock_filter) !=
                     sizeof(struct bpf_insn));

        /* Conversion cannot happen on overlapping memory areas,
         * so we need to keep the user BPF around until the 2nd
         * pass. At this time, the user BPF is stored in fp->insns.
         */
        old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter),
                                 GFP_KERNEL | __GFP_NOWARN);
        if (!old_prog) {
                err = -ENOMEM;
                goto out_err;
        }

        /* 1st pass: calculate the new program length (no output program). */
        err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
                                 &seen_ld_abs);
        if (err)
                goto out_err_free;

        /* Expand fp for appending the new filter representation. */
        old_fp = fp;
        fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
        if (!fp) {
                /* The old_fp is still around in case we couldn't
                 * allocate new memory, so uncharge on that one.
                 */
                fp = old_fp;
                err = -ENOMEM;
                goto out_err_free;
        }

        fp->len = new_len;

        /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
        err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
                                 &seen_ld_abs);
        if (err)
                /* 2nd bpf_convert_filter() can fail only if it fails
                 * to allocate memory, remapping must succeed. Note,
                 * that at this time old_fp has already been released
                 * by krealloc().
                 */
                goto out_err_free;

        /* May replace fp; err reports whether runtime selection worked. */
        fp = bpf_prog_select_runtime(fp, &err);
        if (err)
                goto out_err_free;

        kfree(old_prog);
        return fp;

out_err_free:
        kfree(old_prog);
out_err:
        /* fp is whichever program pointer is current at the point of failure. */
        __bpf_prog_release(fp);
        return ERR_PTR(err);
}

/* Validate a freshly loaded classic program and get it ready to run.
 *
 * @fp:    program holding classic instructions in fp->insns
 * @trans: optional extra check/transformation run after the classic
 *         verifier, f.e. in case of seccomp; may be NULL
 *
 * Returns the program to use (possibly a different pointer than @fp), or
 * an ERR_PTR(). Whenever an error is returned the program has already
 * been released, so callers must not free it again.
 */
static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
                                           bpf_aux_classic_check_t trans)
{
        int err;

        fp->bpf_func = NULL;
        fp->jited = 0;

        /* Classic verifier first, then the caller's hook if it passed. */
        err = bpf_check_classic(fp->insns, fp->len);
        if (!err && trans)
                err = trans(fp->insns, fp->len);
        if (err) {
                __bpf_prog_release(fp);
                return ERR_PTR(err);
        }

        /* Probe if we can JIT compile the filter and if so, do
         * the compilation of the filter.
         */
        bpf_jit_compile(fp);
        if (fp->jited)
                return fp;

        /* JIT compiler couldn't process this filter, so do the eBPF
         * translation for the optimized interpreter.
         */
        return bpf_migrate_filter(fp);
}

/**
 *        bpf_prog_create - create an unattached filter
 *        @pfp: the unattached filter that is created
 *        @fprog: the filter program
 *
 * Build a filter that is not tied to any socket from the kernel-resident
 * program described by @fprog. The program is copied and sanity checked
 * before it is handed back through @pfp.
 *
 * Returns 0 on success. If the program fails the checks or memory runs
 * out, a negative errno is returned and @pfp is left untouched.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;

        /* Make sure new filter is there and in the right amounts. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return -EINVAL;

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return -ENOMEM;

        prog->len = fprog->len;
        /* Since unattached filters are not copied back to user
         * space through sk_get_filter(), we do not need to hold
         * a copy here, and can spare us the work.
         */
        prog->orig_prog = NULL;
        memcpy(prog->insns, fprog->filter, fsize);

        /* bpf_prepare_filter() releases the program itself on failure. */
        prog = bpf_prepare_filter(prog, NULL);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        *pfp = prog;
        return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);

/**
 *        bpf_prog_create_from_user - create an unattached filter from user buffer
 *        @pfp: the unattached filter that is created
 *        @fprog: the filter program
 *        @trans: post-classic verifier transformation handler
 *        @save_orig: save classic BPF program
 *
 * Same job as bpf_prog_create(), except that the instructions are copied
 * in from a user space buffer. A bpf_aux_classic_check_t handler may be
 * supplied via @trans, and with @save_orig set a copy of the classic
 * program is stored alongside the new one.
 *
 * Returns 0 on success or a negative errno; @pfp is only written on
 * success.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;
        int err;

        /* Make sure new filter is there and in the right amounts. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return -EINVAL;

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return -ENOMEM;

        err = -EFAULT;
        if (copy_from_user(prog->insns, fprog->filter, fsize))
                goto free_prog;

        prog->len = fprog->len;
        prog->orig_prog = NULL;

        err = -ENOMEM;
        if (save_orig && bpf_prog_store_orig_filter(prog, fprog))
                goto free_prog;

        /* From here on bpf_prepare_filter() releases the program itself
         * should anything go wrong.
         */
        prog = bpf_prepare_filter(prog, trans);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        *pfp = prog;
        return 0;

free_prog:
        __bpf_prog_free(prog);
        return err;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

/* Exported (GPL-only symbol) wrapper that hands @fp to
 * __bpf_prog_release().
 */
void bpf_prog_destroy(struct bpf_prog *fp)
{
        __bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

/* Wrap @prog in a new sk_filter, charge it to @sk and install it as the
 * socket's filter. A previously installed filter is uncharged afterwards.
 *
 * Returns 0 on success, or -ENOMEM if the sk_filter cannot be allocated or
 * the charge does not fit; in both failure cases @prog itself is left for
 * the caller to dispose of.
 */
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
        struct sk_filter *new_filter, *prev_filter;

        new_filter = kmalloc(sizeof(*new_filter), GFP_KERNEL);
        if (!new_filter)
                return -ENOMEM;

        new_filter->prog = prog;
        if (!__sk_filter_charge(sk, new_filter)) {
                kfree(new_filter);
                return -ENOMEM;
        }
        refcount_set(&new_filter->refcnt, 1);

        /* Swap the pointer first, release the old filter afterwards. */
        prev_filter = rcu_dereference_protected(sk->sk_filter,
                                                lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_filter, new_filter);

        if (prev_filter)
                sk_filter_uncharge(sk, prev_filter);

        return 0;
}

/* Build a prepared program from the user supplied classic filter @fprog
 * for use on @sk, keeping a copy of the original instructions.
 *
 * Returns the program or an ERR_PTR(): -EPERM if the socket has
 * SOCK_FILTER_LOCKED set, -EINVAL for a missing or badly sized filter,
 * -ENOMEM / -EFAULT for allocation and copy failures, or whatever
 * bpf_prepare_filter() reports.
 */
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;
        int err;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return ERR_PTR(-EPERM);

        /* Make sure new filter is there and in the right amounts. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return ERR_PTR(-EINVAL);

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return ERR_PTR(-ENOMEM);

        err = -EFAULT;
        if (copy_from_user(prog->insns, fprog->filter, fsize))
                goto free_prog;

        prog->len = fprog->len;

        err = -ENOMEM;
        if (bpf_prog_store_orig_filter(prog, fprog))
                goto free_prog;

        /* bpf_prepare_filter() releases the program itself on failure. */
        return bpf_prepare_filter(prog, NULL);

free_prog:
        __bpf_prog_free(prog);
        return ERR_PTR(err);
}

/**
 *        sk_attach_filter - attach a socket filter
 *        @fprog: the filter program
 *        @sk: the socket to use
 *
 * Turn the user's classic filter into a checked program and attach it to
 * @sk. If the filter is rejected, memory runs out, or attaching fails, a
 * negative errno is returned and the program is released. On success the
 * return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
        struct bpf_prog *prog;
        int err;

        prog = __get_filter(fprog, sk);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        err = __sk_attach_prog(prog, sk);
        if (err >= 0)
                return 0;

        __bpf_prog_release(prog);
        return err;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

/* Build a program from the user's classic filter and hand it to
 * reuseport_attach_prog() for @sk. Programs larger than the per-netns
 * sysctl_optmem_max are refused with -ENOMEM. On any failure after the
 * program was built it is released again. Returns 0 or a negative errno.
 */
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
        struct bpf_prog *prog = __get_filter(fprog, sk);
        int err, optmem_max;

        if (IS_ERR(prog))
                return PTR_ERR(prog);

        optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
        if (bpf_prog_size(prog->len) <= optmem_max) {
                err = reuseport_attach_prog(sk, prog);
                if (!err)
                        return 0;
        } else {
                err = -ENOMEM;
        }

        __bpf_prog_release(prog);
        return err;
}

/* Look up the BPF_PROG_TYPE_SOCKET_FILTER program behind descriptor @ufd,
 * unless @sk has SOCK_FILTER_LOCKED set, in which case ERR_PTR(-EPERM) is
 * returned without looking at @ufd.
 */
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
        return sock_flag(sk, SOCK_FILTER_LOCKED) ?
               ERR_PTR(-EPERM) :
               bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}

/* Attach the socket filter program referenced by descriptor @ufd to @sk.
 * Returns 0 on success or a negative errno; if attaching fails, the
 * program reference obtained from @ufd is put back.
 */
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog;
        int err;

        prog = __get_bpf(ufd, sk);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        err = __sk_attach_prog(prog, sk);
        if (err >= 0)
                return 0;

        bpf_prog_put(prog);
        return err;
}

/* Attach the program behind descriptor @ufd to @sk via
 * reuseport_attach_prog(). The descriptor is first looked up as a
 * BPF_PROG_TYPE_SOCKET_FILTER program and, if that lookup yields -EINVAL,
 * as a BPF_PROG_TYPE_SK_REUSEPORT program.
 *
 * Returns 0 on success. Returns -EPERM if the socket's filter is locked,
 * -ENOTSUPP if a SK_REUSEPORT program is used on a socket whose type,
 * protocol or family is not one of the combinations checked below,
 * -ENOMEM if a socket filter program exceeds sysctl_optmem_max, or the
 * error from the lookup / attach. The program reference is put back on
 * every failure after the lookup.
 */
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog;
        int err, optmem_max;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
        if (PTR_ERR(prog) == -EINVAL)
                prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
                /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
                 * bpf prog (e.g. sockmap).  It depends on the
                 * limitation imposed by bpf_prog_load().
                 * Hence, sysctl_optmem_max is not checked.
                 */
                bool type_ok = sk->sk_type == SOCK_STREAM ||
                               sk->sk_type == SOCK_DGRAM;
                bool proto_ok = sk->sk_protocol == IPPROTO_UDP ||
                                sk->sk_protocol == IPPROTO_TCP;
                bool family_ok = sk->sk_family == AF_INET ||
                                 sk->sk_family == AF_INET6;

                err = -ENOTSUPP;
                if (!type_ok || !proto_ok || !family_ok)
                        goto put_prog;
        } else {
                /* BPF_PROG_TYPE_SOCKET_FILTER */
                optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);

                err = -ENOMEM;
                if (bpf_prog_size(prog->len) > optmem_max)
                        goto put_prog;
        }

        err = reuseport_attach_prog(sk, prog);
        if (!err)
                return 0;

put_prog:
        bpf_prog_put(prog);
        return err;
}

/* Dispose of a reuseport program; NULL is accepted and ignored.
 * BPF_PROG_TYPE_SK_REUSEPORT programs are put with bpf_prog_put(), all
 * others go through bpf_prog_destroy().
 */
void sk_reuseport_prog_free(struct bpf_prog *prog)
{
        if (!prog)
                return;

        if (prog->type != BPF_PROG_TYPE_SK_REUSEPORT) {
                bpf_prog_destroy(prog);
                return;
        }

        bpf_prog_put(prog);
}

/* Ask skb_ensure_writable() for @write_len bytes of @skb and return its
 * result. When CONFIG_DEBUG_NET is enabled, a @write_len above INT_MAX is
 * rejected up front with -EINVAL instead of being passed on.
 */
static inline int __bpf_try_make_writable(struct sk_buff *skb,
                                          unsigned int write_len)
{
#ifdef CONFIG_DEBUG_NET
        /* Avoid a splat in pskb_may_pull_reason() */
        if (write_len > INT_MAX)
                return -EINVAL;
#endif
        return skb_ensure_writable(skb, write_len);
}

/* Same as __bpf_try_make_writable(), followed by a call to
 * bpf_compute_data_pointers() on @skb. That call is made whether or not
 * the writable request succeeded; the request's result is returned.
 */
static inline int bpf_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        int ret;

        ret = __bpf_try_make_writable(skb, write_len);
        bpf_compute_data_pointers(skb);

        return ret;
}

/* Request write access for skb_headlen(skb) bytes of @skb. */
static int bpf_try_make_head_writable(struct sk_buff *skb)
{
        unsigned int head_len = skb_headlen(skb);

        return bpf_try_make_writable(skb, head_len);
}

/* When @skb is at tc ingress, apply skb_postpush_rcsum() over the
 * skb->mac_len bytes starting at the MAC header; otherwise do nothing.
 */
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
        if (!skb_at_tc_ingress(skb))
                return;

        skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/* When @skb is at tc ingress, apply skb_postpull_rcsum() over the
 * skb->mac_len bytes starting at the MAC header; otherwise do nothing.
 */
static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
{
        if (!skb_at_tc_ingress(skb))
                return;

        skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/* BPF helper: copy @len bytes from @from into the packet at @offset.
 *
 * @flags may combine BPF_F_RECOMPUTE_CSUM (run the postpull/postpush
 * rcsum helpers around the copy) and BPF_F_INVALIDATE_HASH (clear the skb
 * hash afterwards). Any other flag bit yields -EINVAL. An @offset above
 * INT_MAX, or a failure to make the first @offset + @len bytes writable,
 * yields -EFAULT. Returns 0 on success.
 */
BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
           const void *, from, u32, len, u64, flags)
{
        const bool fix_csum = flags & BPF_F_RECOMPUTE_CSUM;
        void *dst;

        if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
                return -EINVAL;
        if (unlikely(offset > INT_MAX))
                return -EFAULT;
        if (unlikely(bpf_try_make_writable(skb, offset + len)))
                return -EFAULT;

        /* Only take the address once the data has been made writable. */
        dst = skb->data + offset;

        if (fix_csum) {
                __skb_postpull_rcsum(skb, dst, len, offset);
                memcpy(dst, from, len);
                __skb_postpush_rcsum(skb, dst, len, offset);
        } else {
                memcpy(dst, from, len);
        }

        if (flags & BPF_F_INVALIDATE_HASH)
                skb_clear_hash(skb);

        return 0;
}

/* Helper descriptor for bpf_skb_store_bytes(): not GPL-only, integer
 * return. Argument types in order: context pointer, anything, pointer to
 * read-only memory, constant size, anything -- lining up with the
 * helper's (skb, offset, from, len, flags) parameters.
 */
static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
        .func                = bpf_skb_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/* Non-static entry point with a plain C signature that forwards its
 * arguments unchanged to ____bpf_skb_store_bytes() (presumably the inner
 * function generated by the BPF_CALL_5() definition of
 * bpf_skb_store_bytes -- confirm against the macro).
 */
int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
                          u32 len, u64 flags)
{
        return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
}

/* BPF helper: copy @len bytes of packet data starting at @offset into
 * @to, using skb_header_pointer() to locate the data.
 *
 * Returns 0 on success. If @offset exceeds INT_MAX or the data cannot be
 * obtained, @to is zero-filled for @len bytes and -EFAULT is returned.
 */
BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
           void *, to, u32, len)
{
        void *src = NULL;

        if (likely(offset <= INT_MAX))
                src = skb_header_pointer(skb, offset, len, to);

        if (unlikely(!src)) {
                memset(to, 0, len);
                return -EFAULT;
        }

        /* skb_header_pointer() may already have filled @to itself. */
        if (src != to)
                memcpy(to, src, len);

        return 0;
}

/* Helper descriptor for bpf_skb_load_bytes(): not GPL-only, integer
 * return. Argument types in order: context pointer, anything, pointer to
 * uninitialized memory, constant size -- lining up with the helper's
 * (skb, offset, to, len) parameters.
 */
static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
        .func                = bpf_skb_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Non-static entry point with a plain C signature that forwards its
 * arguments unchanged to ____bpf_skb_load_bytes() (presumably the inner
 * function generated by the BPF_CALL_4() definition of
 * bpf_skb_load_bytes -- confirm against the macro).
 */
int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
{
        return ____bpf_skb_load_bytes(skb, offset, to, len);
}

/* BPF helper for flow dissector programs: copy @len bytes starting at
 * @offset from ctx->skb into @to.
 *
 * Returns 0 on success. If @offset exceeds 0xffff, the context carries no
 * skb, or the data cannot be obtained, @to is zero-filled for @len bytes
 * and -EFAULT is returned.
 */
BPF_CALL_4(bpf_flow_dissector_load_bytes,
           const struct bpf_flow_dissector *, ctx, u32, offset,
           void *, to, u32, len)
{
        void *src = NULL;

        if (likely(offset <= 0xffff) && likely(ctx->skb))
                src = skb_header_pointer(ctx->skb, offset, len, to);

        if (unlikely(!src)) {
                memset(to, 0, len);
                return -EFAULT;
        }

        /* skb_header_pointer() may already have filled @to itself. */
        if (src != to)
                memcpy(to, src, len);

        return 0;
}

/* Helper descriptor for bpf_flow_dissector_load_bytes(): not GPL-only,
 * integer return. Argument types in order: context pointer, anything,
 * pointer to uninitialized memory, constant size -- lining up with the
 * helper's (ctx, offset, to, len) parameters.
 */
static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
        .func                = bpf_flow_dissector_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* BPF helper: copy @len bytes into @to, reading at @offset relative to
 * the header selected by @start_header (BPF_HDR_START_MAC or
 * BPF_HDR_START_NET). The read must end at or before the skb tail
 * pointer.
 *
 * Returns 0 on success. On an @offset above 0xffff, an unknown
 * @start_header, an unset MAC header when the MAC base is requested, or a
 * read running past the tail, @to is zero-filled for @len bytes and
 * -EFAULT is returned.
 */
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
           u32, offset, void *, to, u32, len, u32, start_header)
{
        u8 *tail = skb_tail_pointer(skb);
        u8 *base, *src;

        if (unlikely(offset > 0xffff))
                goto fail;

        if (start_header == BPF_HDR_START_MAC) {
                if (unlikely(!skb_mac_header_was_set(skb)))
                        goto fail;
                base = skb_mac_header(skb);
        } else if (start_header == BPF_HDR_START_NET) {
                base = skb_network_header(skb);
        } else {
                goto fail;
        }

        src = base + offset;
        if (unlikely(src + len > tail))
                goto fail;

        memcpy(to, src, len);
        return 0;

fail:
        memset(to, 0, len);
        return -EFAULT;
}

/* Helper descriptor for bpf_skb_load_bytes_relative(): not GPL-only,
 * integer return. Argument types in order: context pointer, anything,
 * pointer to uninitialized memory, constant size, anything -- lining up
 * with the helper's (skb, offset, to, len, start_header) parameters.
 */
static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
        .func                = bpf_skb_load_bytes_relative,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: request that the first @len bytes of @skb be made
 * writable. A @len of zero selects skb_headlen(skb) instead.
 *
 * The intended use: if a direct read/write bounds test fails at run time,
 * the program can pull in more data with this helper and redo the test,
 * since the call implicitly invalidates earlier checks. Alternatively,
 * when the needed amount is known, it can be called once at the start of
 * the program, which lifts the limitation of only the current headroom
 * being accessible.
 */
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
        unsigned int want = len;

        if (!want)
                want = skb_headlen(skb);

        return bpf_try_make_writable(skb, want);
}

/* Helper descriptor for bpf_skb_pull_data(): not GPL-only, integer
 * return. Argument types in order: context pointer, anything -- lining
 * up with the helper's (skb, len) parameters.
 */
static const struct bpf_func_proto bpf_skb_pull_data_proto = {
        .func                = bpf_skb_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* BPF helper: return @sk itself (as an integer) when sk_fullsock() says
 * it is a full socket, and NULL otherwise.
 */
BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
{
        if (!sk_fullsock(sk))
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Helper descriptor for bpf_sk_fullsock(): not GPL-only; takes a pointer
 * to a sock_common and returns a socket pointer that may be NULL.
 */
static const struct bpf_func_proto bpf_sk_fullsock_proto = {
        .func                = bpf_sk_fullsock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* sk_skb flavour of the writable request: forwards straight to
 * __bpf_try_make_writable(). Unlike bpf_try_make_writable(), it does not
 * call bpf_compute_data_pointers() afterwards.
 */
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
                                           unsigned int write_len)
{
        return __bpf_try_make_writable(skb, write_len);
}

/* BPF helper for sk_skb programs: request that the first @len bytes of
 * @skb be made writable. A @len of zero selects skb_headlen(skb) instead.
 *
 * The intended use: if a direct read/write bounds test fails at run time,
 * the program can pull in more data with this helper and redo the test,
 * since the call implicitly invalidates earlier checks. Alternatively,
 * when the needed amount is known, it can be called once at the start of
 * the program, which lifts the limitation of only the current headroom
 * being accessible.
 */
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
{
        unsigned int want = len;

        if (!want)
                want = skb_headlen(skb);

        return sk_skb_try_make_writable(skb, want);
}

/* Helper descriptor binding sk_skb_pull_data(): not marked GPL-only,
 * integer return, arg1 is the program context and arg2 (len) is typed
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto sk_skb_pull_data_proto = {
        .func                = sk_skb_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Patch the 16-bit checksum stored at @offset in the packet.
 *
 * The BPF_F_HDR_FIELD_MASK bits of @flags select the width of the field
 * that changed: 2 or 4 bytes replace @from with @to, 0 applies @to as a
 * precomputed difference (and then @from must be 0).
 *
 * Returns 0 on success, -EINVAL for unknown flags or field widths (or a
 * non-zero @from in diff mode), -EFAULT for an odd or too large @offset or
 * when the checksum field cannot be made writable.
 */
BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
           u64, from, u64, to, u64, flags)
{
        const u64 field_sz = flags & BPF_F_HDR_FIELD_MASK;
        __sum16 *csum;

        if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
                return -EINVAL;
        /* Offset sanity first, then make the checksum field writable. */
        if (unlikely(offset > 0xffff || offset & 1) ||
            unlikely(bpf_try_make_writable(skb, offset + sizeof(*csum))))
                return -EFAULT;

        csum = (__sum16 *)(skb->data + offset);

        if (field_sz == 2) {
                csum_replace2(csum, from, to);
        } else if (field_sz == 4) {
                csum_replace4(csum, from, to);
        } else if (field_sz == 0) {
                if (unlikely(from != 0))
                        return -EINVAL;

                csum_replace_by_diff(csum, to);
        } else {
                return -EINVAL;
        }

        return 0;
}

/* Helper descriptor binding bpf_l3_csum_replace(): not marked GPL-only,
 * integer return, arg1 is the program context and args 2-5 (offset, from,
 * to, flags) are all typed ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
        .func                = bpf_l3_csum_replace,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Patch the 16-bit transport checksum stored at @offset in the packet.
 *
 * @flags carries the changed field width in its BPF_F_HDR_FIELD_MASK bits
 * (0 = @to is a precomputed diff and @from must be 0, 2 or 4 = replace
 * @from by @to) plus the optional BPF_F_PSEUDO_HDR, BPF_F_MARK_MANGLED_0,
 * BPF_F_MARK_ENFORCE and BPF_F_IPV6 modifiers.
 *
 * Returns 0 on success, -EINVAL for unknown flags/widths, -EFAULT for a bad
 * @offset or when the checksum field cannot be made writable.
 */
BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
           u64, from, u64, to, u64, flags)
{
        bool pseudo_hdr  = flags & BPF_F_PSEUDO_HDR;
        bool mangle_zero = flags & BPF_F_MARK_MANGLED_0;
        bool force_mark  = flags & BPF_F_MARK_ENFORCE;
        bool ipv6        = flags & BPF_F_IPV6;
        __sum16 *csum;

        if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
                               BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6)))
                return -EINVAL;
        if (unlikely(offset > 0xffff || offset & 1) ||
            unlikely(bpf_try_make_writable(skb, offset + sizeof(*csum))))
                return -EFAULT;

        csum = (__sum16 *)(skb->data + offset);

        /* With MARK_MANGLED_0 and without MARK_ENFORCE, a checksum that is
         * currently zero is left untouched.
         */
        if (mangle_zero && !force_mark && !*csum)
                return 0;

        switch (flags & BPF_F_HDR_FIELD_MASK) {
        case 4:
                inet_proto_csum_replace4(csum, skb, from, to, pseudo_hdr);
                break;
        case 2:
                inet_proto_csum_replace2(csum, skb, from, to, pseudo_hdr);
                break;
        case 0:
                if (unlikely(from != 0))
                        return -EINVAL;

                inet_proto_csum_replace_by_diff(csum, skb, to, pseudo_hdr,
                                                ipv6);
                break;
        default:
                return -EINVAL;
        }

        /* A result of zero is stored as CSUM_MANGLED_0 when requested. */
        if (mangle_zero && !*csum)
                *csum = CSUM_MANGLED_0;

        return 0;
}

/* Helper descriptor binding bpf_l4_csum_replace(): not marked GPL-only,
 * integer return, arg1 is the program context and args 2-5 (offset, from,
 * to, flags) are all typed ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
        .func                = bpf_l4_csum_replace,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
           __be32 *, to, u32, to_size, __wsum, seed)
{
        /* Three usage patterns, selected purely by which sizes are
         * non-zero:
         *
         *   only to_size          seed := csum --> data is being pushed
         *   only from_size        seed := csum --> data is being pulled
         *   both sizes            seed := 0    --> data is being diffed
         *
         * The two sizes need not match when diffing. With both sizes
         * zero the seed is folded and returned as is.
         */
        __wsum csum = seed;

        if (to_size) {
                csum = csum_partial(to, to_size, csum);
                if (from_size)
                        csum = csum_sub(csum,
                                        csum_partial(from, from_size, 0));
        } else if (from_size) {
                csum = ~csum_partial(from, from_size, ~csum);
        }

        return csum_from32to16((__force unsigned int)csum);
}

/* Helper descriptor binding bpf_csum_diff(): not marked GPL-only,
 * pkt_access set, integer return. Both buffers (arg1/arg3) are read-only
 * memory pointers that may be NULL, each paired with a size argument
 * (arg2/arg4) that may be zero; arg5 (seed) is typed ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_csum_diff_proto = {
        .func                = bpf_csum_diff,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg5_type        = ARG_ANYTHING,
};

BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
{
        /* Companion to bpf_csum_diff() for programs doing direct packet
         * writes: rotating the csum for alignment and emulating
         * csum_sub() are left to the eBPF program. Only skbs in
         * CHECKSUM_COMPLETE state carry a csum that can be updated.
         */
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return -ENOTSUPP;

        skb->csum = csum_add(skb->csum, csum);
        return skb->csum;
}

/* Helper descriptor binding bpf_csum_update(): not marked GPL-only,
 * integer return, arg1 is the program context and arg2 (csum) is typed
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_csum_update_proto = {
        .func                = bpf_csum_update,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
{
        /* Meant to be paired with bpf_skb_adjust_room() when headers are
         * encapsulated/decapsulated with BPF_F_ADJ_ROOM_NO_CSUM_RESET set,
         * for example.
         *
         * INC/DEC/RESET return 0, QUERY returns the current csum_level
         * (or -EACCES unless the skb is CHECKSUM_UNNECESSARY), anything
         * else is -EINVAL.
         */
        switch (level) {
        case BPF_CSUM_LEVEL_QUERY:
                if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                        return -EACCES;
                return skb->csum_level;
        case BPF_CSUM_LEVEL_INC:
                __skb_incr_checksum_unnecessary(skb);
                return 0;
        case BPF_CSUM_LEVEL_DEC:
                __skb_decr_checksum_unnecessary(skb);
                return 0;
        case BPF_CSUM_LEVEL_RESET:
                __skb_reset_checksum_unnecessary(skb);
                return 0;
        default:
                return -EINVAL;
        }
}

/* Helper descriptor binding bpf_csum_level(): not marked GPL-only,
 * integer return, arg1 is the program context and arg2 (level) is typed
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_csum_level_proto = {
        .func                = bpf_csum_level,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Receive-side redirect for skbs that carry a link layer header: hands
 * @skb to @dev via dev_forward_skb_nomtu() and returns its result.
 */
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
        return dev_forward_skb_nomtu(dev, skb);
}

/* Receive-side redirect for skbs without a link layer header: run the
 * ____dev_forward_skb() checks and, if they pass, retarget @skb to @dev and
 * queue it with netif_rx(). Returns the first non-zero result.
 */
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
                                      struct sk_buff *skb)
{
        int err = ____dev_forward_skb(dev, skb, false);

        if (unlikely(err))
                return err;

        skb->dev = dev;
        return netif_rx(skb);
}

/* Transmit-side redirect: queue @skb for transmission on @dev.
 *
 * The transmit is bracketed by the dev_xmit_recursion counter; when the
 * recursion limit is already reached the skb is freed and -ENETDOWN is
 * returned. Otherwise the result of dev_queue_xmit() is returned.
 */
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc;

        if (!dev_xmit_recursion()) {
                skb->dev = dev;
                skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
                skb_clear_tstamp(skb);

                dev_xmit_recursion_inc();
                rc = dev_queue_xmit(skb);
                dev_xmit_recursion_dec();

                return rc;
        }

        net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
        kfree_skb(skb);
        return -ENETDOWN;
}

/* Redirect @skb to a device that does not take a link layer header: strip
 * everything in front of the network header, then receive (BPF_F_INGRESS)
 * or transmit on @dev. An skb with no data beyond its link layer part is
 * freed and -ERANGE is returned.
 */
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
                                 u32 flags)
{
        unsigned int l2_len = skb_network_offset(skb);

        if (unlikely(skb->len <= l2_len)) {
                kfree_skb(skb);
                return -ERANGE;
        }

        if (l2_len) {
                __skb_pull(skb, l2_len);

                /* On ingress the mac header was already pulled once. On
                 * egress the skb may have originated from ingress (a
                 * forwarded skb), so skb_postpull_rcsum() is needed to
                 * make rcsum start at the network header.
                 */
                if (!skb_at_tc_ingress(skb))
                        skb_postpull_rcsum(skb, skb_mac_header(skb), l2_len);
        }
        skb_pop_mac_header(skb);
        skb_reset_mac_len(skb);

        if (flags & BPF_F_INGRESS)
                return __bpf_rx_skb_no_mac(dev, skb);

        return __bpf_tx_skb(dev, skb);
}

/* Redirect @skb to a device that takes a link layer header. The skb must
 * carry such a header (mac header in front of the network header) and be
 * non-empty; otherwise it is freed and -ERANGE is returned.
 */
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
                                 u32 flags)
{
        bool has_l2 = skb->mac_header < skb->network_header && skb->len != 0;

        if (unlikely(!has_l2)) {
                kfree_skb(skb);
                return -ERANGE;
        }

        bpf_push_mac_rcsum(skb);

        if (flags & BPF_F_INGRESS)
                return __bpf_rx_skb(dev, skb);

        return __bpf_tx_skb(dev, skb);
}

/* Dispatch a redirect according to dev_is_mac_header_xmit(@dev): the
 * "common" path when it is true, the no-mac path otherwise.
 */
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
                          u32 flags)
{
        if (!dev_is_mac_header_xmit(dev))
                return __bpf_redirect_no_mac(skb, dev, flags);

        return __bpf_redirect_common(skb, dev, flags);
}

#if IS_ENABLED(CONFIG_IPV6)
/* Send @skb out of @dev through the neighbour entry of its IPv6 next hop.
 *
 * The next hop is @nh->ipv6_nh when @nh is given, otherwise it is derived
 * from the skb's dst and the IPv6 destination address.
 *
 * Returns the neigh_output() result when a neighbour was found, -ENOMEM
 * when the headroom could not be expanded, and -ENETDOWN (skb freed) when
 * the recursion limit is hit or the neighbour lookup fails.
 */
static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
                            struct net_device *dev, struct bpf_nh_params *nh)
{
        u32 hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *nexthop;
        struct dst_entry *dst = NULL;
        struct neighbour *neigh;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                goto out_drop;
        }

        skb->dev = dev;
        skb_clear_tstamp(skb);

        /* Grow the headroom only for devices that have header_ops. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                skb = skb_expand_head(skb, hh_len);
                /* skb is NULL here, so there is nothing to free locally. */
                if (!skb)
                        return -ENOMEM;
        }

        rcu_read_lock();
        if (!nh) {
                dst = skb_dst(skb);
                nexthop = rt6_nexthop(dst_rt6_info(dst),
                                      &ipv6_hdr(skb)->daddr);
        } else {
                nexthop = &nh->ipv6_nh;
        }
        neigh = ip_neigh_gw6(dev, nexthop);
        if (likely(!IS_ERR(neigh))) {
                int ret;

                sock_confirm_neigh(skb, neigh);
                /* Transmit with BHs disabled and the recursion counter held. */
                local_bh_disable();
                dev_xmit_recursion_inc();
                ret = neigh_output(neigh, skb, false);
                dev_xmit_recursion_dec();
                local_bh_enable();
                rcu_read_unlock();
                return ret;
        }
        rcu_read_unlock();
        /* dst is only set on the !nh path, so the stat is counted only then. */
        if (dst)
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
}

/* IPv6 leg of the neighbour redirect.
 *
 * Without @nh a route is looked up from the packet's IPv6 header and
 * attached to the skb; with @nh the next hop must be AF_INET6. The skb is
 * then handed to bpf_out_neigh_v6().
 *
 * Returns NET_XMIT_SUCCESS or NET_XMIT_DROP; every failure bumps the
 * tx_errors counter of @dev.
 */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
        struct net *net = dev_net(dev);
        int err;

        if (nh) {
                if (nh->nh_family != AF_INET6)
                        goto out_drop;
        } else {
                struct flowi6 fl6 = {
                        .flowi6_flags = FLOWI_FLAG_ANYSRC,
                        .flowi6_mark  = skb->mark,
                        .flowlabel    = ip6_flowinfo(ip6h),
                        .flowi6_oif   = dev->ifindex,
                        .flowi6_proto = ip6h->nexthdr,
                        .daddr              = ip6h->daddr,
                        .saddr              = ip6h->saddr,
                };
                struct dst_entry *dst;

                dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
                if (IS_ERR(dst))
                        goto out_drop;

                /* Swap whatever dst the skb carried for the fresh one. */
                skb_dst_drop(skb);
                skb_dst_set(skb, dst);
        }

        err = bpf_out_neigh_v6(net, skb, dev, nh);
        if (likely(!net_xmit_eval(err)))
                return NET_XMIT_SUCCESS;

        /* The skb is not freed here on this path. */
        DEV_STATS_INC(dev, tx_errors);
        return NET_XMIT_DROP;

out_drop:
        DEV_STATS_INC(dev, tx_errors);
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
#else
/* Fallback used when CONFIG_IPV6 is not enabled: the skb is freed and
 * NET_XMIT_DROP is returned unconditionally.
 */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
#endif /* CONFIG_IPV6 */

#if IS_ENABLED(CONFIG_INET)
/* Send @skb out of @dev through the neighbour entry of its IPv4 next hop.
 *
 * Without @nh the neighbour comes from the skb's route. With @nh the
 * gateway may be either AF_INET6 or AF_INET; any other family drops the
 * skb.
 *
 * Returns the neigh_output() result when a neighbour was found, -ENOMEM
 * when the headroom could not be expanded, and -ENETDOWN (skb freed) when
 * the recursion limit is hit, the family is unsupported or the neighbour
 * lookup fails.
 */
static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
                            struct net_device *dev, struct bpf_nh_params *nh)
{
        u32 hh_len = LL_RESERVED_SPACE(dev);
        struct neighbour *neigh;
        bool is_v6gw = false;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                goto out_drop;
        }

        skb->dev = dev;
        skb_clear_tstamp(skb);

        /* Grow the headroom only for devices that have header_ops. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                skb = skb_expand_head(skb, hh_len);
                /* skb is NULL here, so there is nothing to free locally. */
                if (!skb)
                        return -ENOMEM;
        }

        rcu_read_lock();
        if (!nh) {
                struct rtable *rt = skb_rtable(skb);

                neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
        } else if (nh->nh_family == AF_INET6) {
                neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
                is_v6gw = true;
        } else if (nh->nh_family == AF_INET) {
                neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
        } else {
                /* Unsupported family: leave the RCU section before dropping. */
                rcu_read_unlock();
                goto out_drop;
        }

        if (likely(!IS_ERR(neigh))) {
                int ret;

                sock_confirm_neigh(skb, neigh);
                /* Transmit with BHs disabled and the recursion counter held. */
                local_bh_disable();
                dev_xmit_recursion_inc();
                ret = neigh_output(neigh, skb, is_v6gw);
                dev_xmit_recursion_dec();
                local_bh_enable();
                rcu_read_unlock();
                return ret;
        }
        rcu_read_unlock();
out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
}

/* IPv4 leg of the neighbour redirect.
 *
 * Without @nh an output route is looked up from the packet's IPv4 header;
 * only RTN_UNICAST and RTN_LOCAL routes are accepted and attached to the
 * skb. The skb is then handed to bpf_out_neigh_v4().
 *
 * Returns NET_XMIT_SUCCESS or NET_XMIT_DROP; every failure bumps the
 * tx_errors counter of @dev.
 */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net *net = dev_net(dev);
        int err;

        if (!nh) {
                struct flowi4 fl4 = {
                        .flowi4_flags = FLOWI_FLAG_ANYSRC,
                        .flowi4_mark  = skb->mark,
                        .flowi4_dscp  = ip4h_dscp(ip4h),
                        .flowi4_oif   = dev->ifindex,
                        .flowi4_proto = ip4h->protocol,
                        .daddr              = ip4h->daddr,
                        .saddr              = ip4h->saddr,
                };
                struct rtable *rt = ip_route_output_flow(net, &fl4, NULL);

                if (IS_ERR(rt))
                        goto out_drop;
                if (!(rt->rt_type == RTN_UNICAST ||
                      rt->rt_type == RTN_LOCAL)) {
                        ip_rt_put(rt);
                        goto out_drop;
                }

                /* Swap whatever dst the skb carried for the fresh route. */
                skb_dst_drop(skb);
                skb_dst_set(skb, &rt->dst);
        }

        err = bpf_out_neigh_v4(net, skb, dev, nh);
        if (likely(!net_xmit_eval(err)))
                return NET_XMIT_SUCCESS;

        /* The skb is not freed here on this path. */
        DEV_STATS_INC(dev, tx_errors);
        return NET_XMIT_DROP;

out_drop:
        DEV_STATS_INC(dev, tx_errors);
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
#else
/* Fallback used when CONFIG_INET is not enabled: the skb is freed and
 * NET_XMIT_DROP is returned unconditionally.
 */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
#endif /* CONFIG_INET */

/* Neighbour-based redirect entry point.
 *
 * The skb must carry a link layer header with a non-multicast destination.
 * That header is pulled and the skb is dispatched on skb->protocol to the
 * IPv4 or IPv6 handler. Anything else is freed and reported as -ENOTSUPP.
 */
static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
                                struct bpf_nh_params *nh)
{
        struct ethhdr *eth = eth_hdr(skb);

        if (unlikely(skb->mac_header >= skb->network_header))
                goto drop;
        bpf_push_mac_rcsum(skb);
        if (is_multicast_ether_addr(eth->h_dest))
                goto drop;

        /* Drop the Ethernet header; the network header now starts at data. */
        skb_pull(skb, sizeof(*eth));
        skb_unset_mac_header(skb);
        skb_reset_network_header(skb);

        if (skb->protocol == htons(ETH_P_IPV6))
                return __bpf_redirect_neigh_v6(skb, dev, nh);
        if (skb->protocol == htons(ETH_P_IP))
                return __bpf_redirect_neigh_v4(skb, dev, nh);
drop:
        kfree_skb(skb);
        return -ENOTSUPP;
}

/* Internal, non-exposed redirect flags. They occupy bits 16-18 and are
 * stored in the redirect info by the redirect helpers in this file;
 * BPF_F_REDIRECT_INTERNAL is the union of all three.
 */
enum {
        BPF_F_NEIGH        = (1ULL << 16),
        BPF_F_PEER        = (1ULL << 17),
        BPF_F_NEXTHOP        = (1ULL << 18),
#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
};

/* Clone @skb and redirect the clone to the device with index @ifindex.
 *
 * Only BPF_F_INGRESS is accepted in @flags. Returns -EINVAL for other
 * flags or an unknown ifindex, -ENOMEM when cloning or re-uncloning fails,
 * otherwise the result of __bpf_redirect() on the clone.
 */
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
        struct net_device *target;
        struct sk_buff *copy;

        BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);

        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                return -EINVAL;

        target = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
        if (unlikely(!target))
                return -EINVAL;

        copy = skb_clone(skb, GFP_ATOMIC);
        if (unlikely(!copy))
                return -ENOMEM;

        /* Direct writes rely on the program's skb being uncloned, and
         * cloning just broke that. Unclone the original again; if that
         * fails, release the fresh clone before bailing out.
         */
        if (unlikely(bpf_try_make_head_writable(skb))) {
                kfree_skb(copy);
                return -ENOMEM;
        }

        return __bpf_redirect(copy, target, flags);
}

/* Helper descriptor binding bpf_clone_redirect(): not marked GPL-only,
 * integer return, arg1 is the program context, arg2 (ifindex) and arg3
 * (flags) are typed ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_clone_redirect_proto = {
        .func           = bpf_clone_redirect,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* Return the peer of @dev as reported by its ndo_get_peer_dev callback, or
 * NULL when the device does not implement that callback.
 */
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (unlikely(!ops->ndo_get_peer_dev))
                return NULL;

        return INDIRECT_CALL_1(ops->ndo_get_peer_dev,
                               netkit_peer_dev, dev);
}

/* Carry out the redirect previously recorded in the redirect info.
 *
 * The recorded target index and flags are consumed (reset to 0) before
 * acting on them. Three modes exist:
 *   - BPF_F_PEER:  retarget the skb to the peer device and return -EAGAIN;
 *   - BPF_F_NEIGH: neighbour based redirect, optionally with a recorded
 *                  next hop (BPF_F_NEXTHOP);
 *   - otherwise:   plain __bpf_redirect().
 *
 * An unknown target or an unusable peer frees the skb and yields -EINVAL.
 */
int skb_do_redirect(struct sk_buff *skb)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct net *net = dev_net(skb->dev);
        struct bpf_nh_params *nh = NULL;
        struct net_device *dev;
        u32 flags = ri->flags;

        dev = dev_get_by_index_rcu(net, ri->tgt_index);
        ri->tgt_index = 0;
        ri->flags = 0;
        if (unlikely(!dev))
                goto out_drop;

        if (flags & BPF_F_PEER) {
                /* Peer redirect is only honoured at tc ingress. */
                if (unlikely(!skb_at_tc_ingress(skb)))
                        goto out_drop;

                dev = skb_get_peer_dev(dev);
                if (unlikely(!dev))
                        goto out_drop;
                /* The peer must be up and live in a different netns. */
                if (unlikely(!(dev->flags & IFF_UP) ||
                             net_eq(net, dev_net(dev))))
                        goto out_drop;

                skb->dev = dev;
                dev_sw_netstats_rx_add(dev, skb->len);
                skb_scrub_packet(skb, false);
                return -EAGAIN;
        }

        if (!(flags & BPF_F_NEIGH))
                return __bpf_redirect(skb, dev, flags);

        if (flags & BPF_F_NEXTHOP)
                nh = &ri->nh;

        return __bpf_redirect_neigh(skb, dev, nh);
out_drop:
        kfree_skb(skb);
        return -EINVAL;
}

/* Record a redirect to @ifindex in the redirect info and return
 * TC_ACT_REDIRECT. Only BPF_F_INGRESS is accepted in @flags; anything else
 * yields TC_ACT_SHOT and records nothing.
 */
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *info = bpf_net_ctx_get_ri();

        if (likely(!(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))) {
                info->flags = flags;
                info->tgt_index = ifindex;

                return TC_ACT_REDIRECT;
        }

        return TC_ACT_SHOT;
}

/* Helper descriptor binding bpf_redirect(): not marked GPL-only, integer
 * return, both arguments (ifindex, flags) typed ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_redirect_proto = {
        .func           = bpf_redirect,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
        .arg2_type      = ARG_ANYTHING,
};

/* Record a peer redirect (BPF_F_PEER) to @ifindex in the redirect info and
 * return TC_ACT_REDIRECT. @flags must be zero, otherwise TC_ACT_SHOT is
 * returned and nothing is recorded.
 */
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *info = bpf_net_ctx_get_ri();

        if (likely(!flags)) {
                info->flags = BPF_F_PEER;
                info->tgt_index = ifindex;

                return TC_ACT_REDIRECT;
        }

        return TC_ACT_SHOT;
}

/* Helper descriptor binding bpf_redirect_peer(): not marked GPL-only,
 * integer return, both arguments (ifindex, flags) typed ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_redirect_peer_proto = {
        .func           = bpf_redirect_peer,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
        .arg2_type      = ARG_ANYTHING,
};

/* Record a neighbour redirect (BPF_F_NEIGH) to @ifindex in the redirect
 * info and return TC_ACT_REDIRECT.
 *
 * A non-zero @plen means @params supplies the next hop: it must cover at
 * least sizeof(*params), the parameters are copied into the redirect info
 * and BPF_F_NEXTHOP is recorded as well. Non-zero @flags or a too short
 * @plen yield TC_ACT_SHOT.
 */
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
           int, plen, u64, flags)
{
        struct bpf_redirect_info *info = bpf_net_ctx_get_ri();

        /* The user-visible and internal layouts are copied 1:1 below. */
        BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));

        if (unlikely((plen && plen < sizeof(*params)) || flags))
                return TC_ACT_SHOT;

        info->tgt_index = ifindex;
        if (plen) {
                info->flags = BPF_F_NEIGH | BPF_F_NEXTHOP;
                memcpy(&info->nh, params, sizeof(info->nh));
        } else {
                info->flags = BPF_F_NEIGH;
        }

        return TC_ACT_REDIRECT;
}

/* Helper descriptor binding bpf_redirect_neigh(): not marked GPL-only,
 * integer return. arg1 (ifindex) and arg4 (flags) are typed ARG_ANYTHING;
 * arg2 (params) is a read-only memory pointer that may be NULL, sized by
 * arg3 (plen), which may be zero.
 */
static const struct bpf_func_proto bpf_redirect_neigh_proto = {
        .func                = bpf_redirect_neigh,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_ANYTHING,
        .arg2_type      = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
};

/* Store @bytes in msg->apply_bytes. Always returns 0. */
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
        msg->apply_bytes = bytes;
        return 0;
}

/* Helper descriptor binding bpf_msg_apply_bytes(): not marked GPL-only,
 * integer return, arg1 is the program context and arg2 (bytes) is typed
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
        .func           = bpf_msg_apply_bytes,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
};

/* Store @bytes in msg->cork_bytes. Always returns 0. */
BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{
        msg->cork_bytes = bytes;
        return 0;
}

/* Re-derive msg->sg.curr and msg->sg.copybreak after the scatterlist ring
 * was modified: an empty message points curr at the ring start with a zero
 * copybreak, otherwise curr is the element just before sg.end and copybreak
 * is that element's length.
 */
static void sk_msg_reset_curr(struct sk_msg *msg)
{
        u32 last;

        if (!msg->sg.size) {
                msg->sg.curr = msg->sg.start;
                msg->sg.copybreak = 0;
                return;
        }

        last = msg->sg.end;
        sk_msg_iter_var_prev(last);

        msg->sg.curr = last;
        msg->sg.copybreak = msg->sg.data[last].length;
}

/* Helper descriptor binding bpf_msg_cork_bytes(): not marked GPL-only,
 * integer return, arg1 is the program context and arg2 (bytes) is typed
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
        .func           = bpf_msg_cork_bytes,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
};

/* Make message bytes [@start, @end) available as one linear region and
 * point msg->data / msg->data_end at it.
 *
 * If the range already sits inside a single element whose copy bit is not
 * set, only the data pointers are updated. Otherwise every element touched
 * by the range is copied into a freshly allocated page that replaces them
 * in the ring, and the following entries are shifted down.
 *
 * Returns 0 on success, -EINVAL for non-zero @flags, @end <= @start or a
 * range not covered by the message, and -ENOMEM when the page allocation
 * fails.
 */
BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
           u32, end, u64, flags)
{
        u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
        u32 first_sge, last_sge, i, shift, bytes_sg_total;
        struct scatterlist *sge;
        u8 *raw, *to, *from;
        struct page *page;

        if (unlikely(flags || end <= start))
                return -EINVAL;

        /* First find the starting scatterlist element */
        i = msg->sg.start;
        do {
                offset += len;
                len = sk_msg_elem(msg, i)->length;
                if (start < offset + len)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* The walk ran off the ring without finding an element holding start */
        if (unlikely(start >= offset + len))
                return -EINVAL;

        first_sge = i;
        /* The start may point into the sg element so we need to also
         * account for the headroom.
         */
        bytes_sg_total = start - offset + bytes;
        if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
                goto out;

        /* At this point we need to linearize multiple scatterlist
         * elements or a single shared page. Either way we need to
         * copy into a linear buffer exclusively owned by BPF. Then
         * place the buffer in the scatterlist and fixup the original
         * entries by removing the entries now in the linear buffer
         * and shifting the remaining entries. For now we do not try
         * to copy partial entries to avoid complexity of running out
         * of sg_entry slots. The downside is reading a single byte
         * will copy the entire sg entry.
         */
        do {
                copy += sk_msg_elem(msg, i)->length;
                sk_msg_iter_var_next(i);
                if (bytes_sg_total <= copy)
                        break;
        } while (i != msg->sg.end);
        /* last_sge is one past the final element that gets linearized */
        last_sge = i;

        /* The requested range extends beyond the end of the message */
        if (unlikely(bytes_sg_total > copy))
                return -EINVAL;

        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
                           get_order(copy));
        if (unlikely(!page))
                return -ENOMEM;

        /* Copy each affected element into the new page back to back and
         * drop the reference on the page it used to occupy.
         */
        raw = page_address(page);
        i = first_sge;
        do {
                sge = sk_msg_elem(msg, i);
                from = sg_virt(sge);
                len = sge->length;
                to = raw + poffset;

                memcpy(to, from, len);
                poffset += len;
                sge->length = 0;
                put_page(sg_page(sge));

                sk_msg_iter_var_next(i);
        } while (i != last_sge);

        sg_set_page(&msg->sg.data[first_sge], page, copy, 0);

        /* To repair sg ring we need to shift entries. If we only
         * had a single entry though we can just replace it and
         * be done. Otherwise walk the ring and shift the entries.
         */
        WARN_ON_ONCE(last_sge == first_sge);
        /* Number of now-empty slots after first_sge, allowing for wrap */
        shift = last_sge > first_sge ?
                last_sge - first_sge - 1 :
                NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
        if (!shift)
                goto out;

        i = first_sge;
        sk_msg_iter_var_next(i);
        do {
                u32 move_from;

                if (i + shift >= NR_MSG_FRAG_IDS)
                        move_from = i + shift - NR_MSG_FRAG_IDS;
                else
                        move_from = i + shift;
                if (move_from == msg->sg.end)
                        break;

                /* Move the entry down and clear the slot it came from */
                msg->sg.data[i] = msg->sg.data[move_from];
                msg->sg.data[move_from].length = 0;
                msg->sg.data[move_from].page_link = 0;
                msg->sg.data[move_from].offset = 0;
                sk_msg_iter_var_next(i);
        } while (1);

        /* Pull sg.end back by shift; the unsigned comparison detects the
         * wrap below zero and re-adds the ring size in that case.
         */
        msg->sg.end = msg->sg.end - shift > msg->sg.end ?
                      msg->sg.end - shift + NR_MSG_FRAG_IDS :
                      msg->sg.end - shift;
out:
        sk_msg_reset_curr(msg);
        /* offset is the message offset of first_sge's first byte */
        msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
        msg->data_end = msg->data + bytes;
        return 0;
}

/* Helper descriptor binding bpf_msg_pull_data(): not marked GPL-only,
 * integer return, arg1 is the program context and args 2-4 (start, end,
 * flags) are typed ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_msg_pull_data_proto = {
        .func                = bpf_msg_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Insert @len bytes of room at message offset @start.
 *
 * A new page is allocated for the inserted bytes. When enough free ring
 * slots exist, the element containing @start is split around the new page
 * (front part stays in place, the remainder follows the new element) and
 * later entries are shifted up by one or two slots. When the ring is too
 * full for that, the whole element is instead copied into the new page
 * with a @len byte gap at the insertion point.
 *
 * Returns 0 on success (also for @len == 0, which is a no-op), -EINVAL for
 * non-zero @flags or a @start beyond the end of the message, and -ENOMEM
 * when the page allocation fails.
 */
BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
           u32, len, u64, flags)
{
        struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
        u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
        u8 *raw, *to, *from;
        struct page *page;

        if (unlikely(flags))
                return -EINVAL;

        if (unlikely(len == 0))
                return 0;

        /* First find the starting scatterlist element. Afterwards, offset
         * is the message offset of element i's first byte and l is the
         * length of that element.
         */
        i = msg->sg.start;
        do {
                offset += l;
                l = sk_msg_elem(msg, i)->length;

                if (start < offset + l)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* start == offset + l (append at the very end) is still allowed */
        if (start > offset + l)
                return -EINVAL;

        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

        /* If no space available will fallback to copy, we need at
         * least one scatterlist elem available to push data into
         * when start aligns to the beginning of an element or two
         * when it falls inside an element. We handle the start equals
         * offset case because its the common case for inserting a
         * header.
         */
        if (!space || (space == 1 && start != offset))
                copy = msg->sg.data[i].length;

        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
                           get_order(copy + len));
        if (unlikely(!page))
                return -ENOMEM;

        if (copy) {
                int front, back;

                raw = page_address(page);

                if (i == msg->sg.end)
                        sk_msg_iter_var_prev(i);
                psge = sk_msg_elem(msg, i);
                /* Bytes of the element before / after the insertion point */
                front = start - offset;
                back = psge->length - front;
                from = sg_virt(psge);

                if (front)
                        memcpy(raw, from, front);

                if (back) {
                        from += front;
                        /* Leave a len byte gap for the pushed data */
                        to = raw + front + len;

                        memcpy(to, from, back);
                }

                put_page(sg_page(psge));
                new = i;
                goto place_new;
        }

        if (start - offset) {
                if (i == msg->sg.end)
                        sk_msg_iter_var_prev(i);
                psge = sk_msg_elem(msg, i);
                rsge = sk_msg_elem_cpy(msg, i);

                /* Split the element: psge keeps the front part, rsge
                 * describes the remainder. The remainder begins
                 * start - offset bytes into this element; adding the
                 * absolute message offset (start) would overshoot for
                 * every element but the first one.
                 */
                psge->length = start - offset;
                rsge.length -= psge->length;
                rsge.offset += start - offset;

                sk_msg_iter_var_next(i);
                sg_unmark_end(psge);
                sg_unmark_end(&rsge);
        }

        /* Slot(s) to place newly allocated data */
        sk_msg_iter_next(msg, end);
        new = i;
        sk_msg_iter_var_next(i);

        if (i == msg->sg.end) {
                /* Nothing behind the new slot needs shifting; only grow the
                 * ring by one more slot when a remainder has to be placed.
                 */
                if (!rsge.length)
                        goto place_new;
                sk_msg_iter_next(msg, end);
                goto place_new;
        }

        /* Shift one or two slots as needed */
        sge = sk_msg_elem_cpy(msg, new);
        sg_unmark_end(&sge);

        nsge = sk_msg_elem_cpy(msg, i);
        if (rsge.length) {
                sk_msg_iter_var_next(i);
                nnsge = sk_msg_elem_cpy(msg, i);
                sk_msg_iter_next(msg, end);
        }

        while (i != msg->sg.end) {
                msg->sg.data[i] = sge;
                sge = nsge;
                sk_msg_iter_var_next(i);
                if (rsge.length) {
                        nsge = nnsge;
                        nnsge = sk_msg_elem_cpy(msg, i);
                } else {
                        nsge = sk_msg_elem_cpy(msg, i);
                }
        }

place_new:
        /* Place newly allocated data buffer */
        sk_mem_charge(msg->sk, len);
        msg->sg.size += len;
        __clear_bit(new, msg->sg.copy);
        sg_set_page(&msg->sg.data[new], page, len + copy, 0);
        if (rsge.length) {
                /* The remainder shares its page with the front part, so it
                 * needs a page reference of its own.
                 */
                get_page(sg_page(&rsge));
                sk_msg_iter_var_next(new);
                msg->sg.data[new] = rsge;
        }

        sk_msg_reset_curr(msg);
        sk_msg_compute_data_pointers(msg);
        return 0;
}

/* Helper descriptor for bpf_msg_push_data(): not marked GPL-only, integer
 * return, context pointer as argument 1, arguments 2-4 declared as
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_msg_push_data_proto = {
        .func                = bpf_msg_push_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Drop ring slot @i from @msg.
 *
 * The page reference held by slot @i is released, every following slot
 * (up to and including the one at msg->sg.end) is copied one position
 * back, and finally msg->sg.end is stepped back by one.
 */
static void sk_msg_shift_left(struct sk_msg *msg, int i)
{
        int dst;

        put_page(sg_page(sk_msg_elem(msg, i)));

        for (;;) {
                dst = i;
                sk_msg_iter_var_next(i);
                msg->sg.data[dst] = msg->sg.data[i];
                if (i == msg->sg.end)
                        break;
        }

        sk_msg_iter_prev(msg, end);
}

static void sk_msg_shift_right(struct sk_msg *msg, int i)
{
        struct scatterlist tmp, sge;

        sk_msg_iter_next(msg, end);
        sge = sk_msg_elem_cpy(msg, i);
        sk_msg_iter_var_next(i);
        tmp = sk_msg_elem_cpy(msg, i);

        while (i != msg->sg.end) {
                msg->sg.data[i] = sge;
                sk_msg_iter_var_next(i);
                sge = tmp;
                tmp = sk_msg_elem_cpy(msg, i);
        }
}

/* Remove @len bytes starting at byte offset @start from the sk_msg.
 *
 * @msg:   message whose scatterlist ring is edited in place
 * @start: offset of the first byte to remove
 * @len:   number of bytes to remove
 * @flags: must be zero
 *
 * Returns 0 on success (including the len == 0 no-op), -EINVAL when @flags
 * is set or the [start, start + len) range is not inside the message, and
 * -ENOMEM when the ring is full and the replacement page cannot be
 * allocated.
 */
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
           u32, len, u64, flags)
{
        u32 i = 0, l = 0, space, offset = 0;
        /* Widen before adding: start and len are both u32, so a plain
         * "start + len" would wrap in 32 bits and defeat the bounds check
         * against msg->sg.size below.
         */
        u64 last = (u64)start + len;
        int pop;

        if (unlikely(flags))
                return -EINVAL;

        if (unlikely(len == 0))
                return 0;

        /* First find the starting scatterlist element */
        i = msg->sg.start;
        do {
                offset += l;
                l = sk_msg_elem(msg, i)->length;

                if (start < offset + l)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* Bounds checks: start and pop must be inside message */
        if (start >= offset + l || last > msg->sg.size)
                return -EINVAL;

        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

        pop = len;
        /* --------------| offset
         * -| start      |-------- len -------|
         *
         *  |----- a ----|-------- pop -------|----- b ----|
         *  |______________________________________________| length
         *
         *
         * a:   region at front of scatter element to save
         * b:   region at back of scatter element to save when length > A + pop
         * pop: region to pop from element, same as input 'pop' here will be
         *      decremented below per iteration.
         *
         * Two top-level cases to handle when start != offset, first B is non
         * zero and second B is zero corresponding to when a pop includes more
         * than one element.
         *
         * Then if B is non-zero AND there is no space allocate space and
         * compact A, B regions into page. If there is space shift ring to
         * the right free'ing the next element in ring to place B, leaving
         * A untouched except to reduce length.
         */
        if (start != offset) {
                struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
                int a = start - offset;
                int b = sge->length - pop - a;

                sk_msg_iter_var_next(i);

                if (b > 0) {
                        if (space) {
                                /* Split in place: A stays in this slot, B
                                 * goes into the freshly opened next slot and
                                 * shares the same page (extra reference).
                                 */
                                sge->length = a;
                                sk_msg_shift_right(msg, i);
                                nsge = sk_msg_elem(msg, i);
                                get_page(sg_page(sge));
                                sg_set_page(nsge,
                                            sg_page(sge),
                                            b, sge->offset + pop + a);
                        } else {
                                struct page *page, *orig;
                                u8 *to, *from;

                                /* No free slot: copy A and B back to back
                                 * into a new page that replaces the old one.
                                 */
                                page = alloc_pages(__GFP_NOWARN |
                                                   __GFP_COMP   | GFP_ATOMIC,
                                                   get_order(a + b));
                                if (unlikely(!page))
                                        return -ENOMEM;

                                orig = sg_page(sge);
                                from = sg_virt(sge);
                                to = page_address(page);
                                memcpy(to, from, a);
                                memcpy(to + a, from + a + pop, b);
                                sg_set_page(sge, page, a + b, 0);
                                put_page(orig);
                        }
                        pop = 0;
                } else {
                        /* Pop runs to (or past) the end of this element:
                         * keep only A and carry the remainder forward.
                         */
                        pop -= (sge->length - a);
                        sge->length = a;
                }
        }

        /* From above the current layout _must_ be as follows,
         *
         * -| offset
         * -| start
         *
         *  |---- pop ---|---------------- b ------------|
         *  |____________________________________________| length
         *
         * Offset and start of the current msg elem are equal because in the
         * previous case we handled offset != start and either consumed the
         * entire element and advanced to the next element OR pop == 0.
         *
         * Two cases to handle here are first pop is less than the length
         * leaving some remainder b above. Simply adjust the element's layout
         * in this case. Or pop >= length of the element so that b = 0. In this
         * case advance to next element decrementing pop.
         */
        while (pop) {
                struct scatterlist *sge = sk_msg_elem(msg, i);

                if (pop < sge->length) {
                        sge->length -= pop;
                        sge->offset += pop;
                        pop = 0;
                } else {
                        pop -= sge->length;
                        /* The shift moves the following element into slot
                         * i, so i is deliberately not advanced here.
                         */
                        sk_msg_shift_left(msg, i);
                }
        }

        sk_mem_uncharge(msg->sk, len - pop);
        msg->sg.size -= (len - pop);
        sk_msg_reset_curr(msg);
        sk_msg_compute_data_pointers(msg);
        return 0;
}

/* Helper descriptor for bpf_msg_pop_data(): not marked GPL-only, integer
 * return, context pointer as argument 1, arguments 2-4 declared as
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_msg_pop_data_proto = {
        .func                = bpf_msg_pop_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

#ifdef CONFIG_CGROUP_NET_CLASSID
/* Argument-less helper: returns __task_get_classid() of the task that is
 * currently running (current).
 */
BPF_CALL_0(bpf_get_cgroup_classid_curr)
{
        return __task_get_classid(current);
}

/* Helper descriptor for bpf_get_cgroup_classid_curr(): not marked GPL-only,
 * integer return, no arguments. Unlike its neighbours this descriptor is
 * not static.
 */
const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
        .func                = bpf_get_cgroup_classid_curr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
};

/* Return the cgroup classid recorded in the socket that owns @skb.
 *
 * Yields 0 when skb_to_full_sk() finds no socket or the socket is not a
 * full socket.
 */
BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
{
        struct sock *sk = skb_to_full_sk(skb);

        if (sk && sk_fullsock(sk))
                return sock_cgroup_classid(&sk->sk_cgrp_data);

        return 0;
}

/* Helper descriptor for bpf_skb_cgroup_classid(): not marked GPL-only,
 * integer return, context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
        .func                = bpf_skb_cgroup_classid,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};
#endif

/* Returns task_get_classid() for @skb. Defined outside the
 * CONFIG_CGROUP_NET_CLASSID section above, so it is built regardless of
 * that option.
 */
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
        return task_get_classid(skb);
}

/* Helper descriptor for bpf_get_cgroup_classid(): not marked GPL-only,
 * integer return, context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
        .func           = bpf_get_cgroup_classid,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Returns whatever dst_tclassid() reports for @skb. */
BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
        return dst_tclassid(skb);
}

/* Helper descriptor for bpf_get_route_realm(): not marked GPL-only,
 * integer return, context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_get_route_realm_proto = {
        .func           = bpf_get_route_realm,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Returns skb_get_hash() for @skb. */
BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
        /* If skb_clear_hash() was called due to mangling, we can
         * trigger SW recalculation here. Later access to hash
         * can then use the inline skb->hash via context directly
         * instead of calling this helper again.
         */
        return skb_get_hash(skb);
}

/* Helper descriptor for bpf_get_hash_recalc(): not marked GPL-only,
 * integer return, context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
        .func                = bpf_get_hash_recalc,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Clears the hash stored in @skb via skb_clear_hash(); always returns 0. */
BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
{
        /* After all direct packet write, this can be used once for
         * triggering a lazy recalc on next skb_get_hash() invocation.
         */
        skb_clear_hash(skb);
        return 0;
}

/* Helper descriptor for bpf_set_hash_invalid(): not marked GPL-only,
 * integer return, context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
        .func                = bpf_set_hash_invalid,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Stores the caller-supplied @hash in @skb through __skb_set_sw_hash()
 * with its third argument set to true; always returns 0.
 */
BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
{
        /* Set user specified hash as L4(+), so that it gets returned
         * on skb_get_hash() call unless BPF prog later on triggers a
         * skb_clear_hash().
         */
        __skb_set_sw_hash(skb, hash, true);
        return 0;
}

/* Helper descriptor for bpf_set_hash(): not marked GPL-only, integer
 * return, context pointer as argument 1, argument 2 declared as
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_set_hash_proto = {
        .func                = bpf_set_hash,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Push a VLAN tag (@vlan_proto, @vlan_tci) onto @skb.
 *
 * A @vlan_proto other than 802.1Q or 802.1ad is silently replaced by
 * 802.1Q. The context data pointers are recomputed whether or not the
 * push succeeded; the result of skb_vlan_push() is returned.
 */
BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
           u16, vlan_tci)
{
        int err;

        switch (vlan_proto) {
        case htons(ETH_P_8021Q):
        case htons(ETH_P_8021AD):
                break;
        default:
                vlan_proto = htons(ETH_P_8021Q);
                break;
        }

        /* The push itself is bracketed by the mac rcsum push/pull pair. */
        bpf_push_mac_rcsum(skb);
        err = skb_vlan_push(skb, vlan_proto, vlan_tci);
        bpf_pull_mac_rcsum(skb);
        skb_reset_mac_len(skb);

        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_vlan_push(): not marked GPL-only, integer
 * return, context pointer as argument 1, arguments 2-3 declared as
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
        .func           = bpf_skb_vlan_push,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* Pop a VLAN tag from @skb.
 *
 * skb_vlan_pop() runs between the mac rcsum push/pull pair, and the
 * context data pointers are refreshed afterwards regardless of the
 * outcome. Returns the result of skb_vlan_pop().
 */
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
        int err;

        bpf_push_mac_rcsum(skb);
        err = skb_vlan_pop(skb);
        bpf_pull_mac_rcsum(skb);

        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_vlan_pop(): not marked GPL-only, integer
 * return, context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
        .func           = bpf_skb_vlan_pop,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Set skb->protocol to @proto (given in host byte order) and drop the
 * dst attached to @skb when skb_valid_dst() reports one.
 */
static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
{
        skb->protocol = htons(proto);

        if (!skb_valid_dst(skb))
                return;

        skb_dst_drop(skb);
}

/* Open a zero-filled gap of @len bytes at offset @off from the start of
 * the packet data.
 *
 * The data area is extended at the front by @len, the first @off bytes
 * are moved down to the new start, and the gap left behind is cleared.
 * The caller is expected to have done skb_cow() with @len of headroom
 * already. Always returns 0.
 */
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
        void *head;

        skb_push(skb, len);

        /* Read skb->data only after the push moved it. */
        head = skb->data;
        memmove(head, head + len, off);
        memset(head + off, 0, len);

        /* No skb_postpush_rcsum(skb, skb->data + off, len) is needed:
         * summing over zeroed bytes leaves the skb->csum result for
         * checksum complete unchanged.
         */
        return 0;
}

/* Remove @len bytes located at offset @off from the start of the packet
 * data.
 *
 * Returns 0 on success or -ENOMEM when pskb_may_pull() cannot make the
 * first @off + @len bytes available.
 */
static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
{
        void *prev_head;

        /* The skb is already uncloned at this point, hence no
         * skb_ensure_writable() here.
         */
        if (unlikely(!pskb_may_pull(skb, off + len)))
                return -ENOMEM;

        /* Sample skb->data after the pull check and before moving it. */
        prev_head = skb->data;

        __skb_pull(skb, len);
        skb_postpull_rcsum(skb, prev_head + off, len);

        /* Slide the leading @off bytes up against the remaining data. */
        memmove(skb->data, prev_head, off);

        return 0;
}

/* Insert @len zeroed bytes at offset @off and move the mac and network
 * header offsets back by @len to match.
 *
 * When the transport header offset was equal to the network header
 * offset on entry, it is kept equal afterwards. Returns the result of
 * bpf_skb_generic_push(); header offsets are untouched on failure.
 */
static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
{
        const bool trans_same = skb->transport_header == skb->network_header;
        int err;

        /* Under eBPF we always start at the mac header, so there is no
         * need for a __skb_push()/__skb_pull() pair to get there.
         */
        err = bpf_skb_generic_push(skb, off, len);
        if (unlikely(err))
                return err;

        skb->mac_header -= len;
        skb->network_header -= len;
        if (trans_same)
                skb->transport_header = skb->network_header;

        return 0;
}

/* Remove @len bytes at offset @off and move the mac and network header
 * offsets forward by @len to match.
 *
 * When the transport header offset was equal to the network header
 * offset on entry, it is kept equal afterwards. Returns the result of
 * bpf_skb_generic_pop(); header offsets are untouched on failure.
 */
static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
{
        const bool trans_same = skb->transport_header == skb->network_header;
        int err;

        /* As in the push variant, no __skb_push()/__skb_pull() pair. */
        err = bpf_skb_generic_pop(skb, off, len);
        if (unlikely(err))
                return err;

        skb->mac_header += len;
        skb->network_header += len;
        if (trans_same)
                skb->transport_header = skb->network_header;

        return 0;
}

static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        u32 off = skb_mac_header_len(skb);
        int ret;

        ret = skb_cow(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;

        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
                if (shinfo->gso_type & SKB_GSO_TCPV4) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV4;
                        shinfo->gso_type |=  SKB_GSO_TCPV6;
                }
        }

        bpf_skb_change_protocol(skb, ETH_P_IPV6);
        skb_clear_hash(skb);

        return 0;
}

static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        u32 off = skb_mac_header_len(skb);
        int ret;

        ret = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(ret < 0))
                return ret;

        ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
                if (shinfo->gso_type & SKB_GSO_TCPV6) {
                        shinfo->gso_type &= ~SKB_GSO_TCPV6;
                        shinfo->gso_type |=  SKB_GSO_TCPV4;
                }
        }

        bpf_skb_change_protocol(skb, ETH_P_IP);
        skb_clear_hash(skb);

        return 0;
}

/* Dispatch a protocol change of @skb to @to_proto.
 *
 * Only IPv4 -> IPv6 and IPv6 -> IPv4 are handled; every other
 * combination (including "no change") yields -ENOTSUPP.
 */
static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                if (to_proto == htons(ETH_P_IPV6))
                        return bpf_skb_proto_4_to_6(skb);
                break;
        case htons(ETH_P_IPV6):
                if (to_proto == htons(ETH_P_IP))
                        return bpf_skb_proto_6_to_4(skb);
                break;
        default:
                break;
        }

        return -ENOTSUPP;
}

/* Switch the L3 protocol of @skb to @proto.
 *
 * @flags is reserved and must be zero (-EINVAL otherwise). Returns the
 * result of bpf_skb_proto_xlat(); the context data pointers are
 * recomputed whether or not the translation succeeded.
 */
BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
           u64, flags)
{
        int err;

        if (unlikely(flags))
                return -EINVAL;

        /* This helper only lays the groundwork for a protocol change;
         * the eBPF program is expected to fill in the rest itself via
         * bpf_skb_store_bytes(), bpf_lX_csum_replace() and friends
         * instead of handing a raw buffer to this helper.
         *
         * Keeping it minimal avoids dealing with raw packet data here:
         * even with a buffer argument the program would still have to
         * call the bpf_lX_csum_replace() helpers, and stores stay the
         * sole business of bpf_skb_store_bytes().
         *
         * Additional options and extension header space are currently
         * not supported; the flags register is reserved for that.
         */
        err = bpf_skb_proto_xlat(skb, proto);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_change_proto(): not marked GPL-only,
 * integer return, context pointer as argument 1, arguments 2-3 declared
 * as ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_skb_change_proto_proto = {
        .func                = bpf_skb_change_proto,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Set skb->pkt_type to @pkt_type.
 *
 * Both the current and the requested packet type have to pass
 * skb_pkt_type_ok(); otherwise -EINVAL is returned and the skb is left
 * untouched. Returns 0 on success.
 */
BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
        /* Only a restricted subset may be changed for now. */
        if (unlikely(!skb_pkt_type_ok(skb->pkt_type)))
                return -EINVAL;
        if (unlikely(!skb_pkt_type_ok(pkt_type)))
                return -EINVAL;

        skb->pkt_type = pkt_type;

        return 0;
}

/* Helper descriptor for bpf_skb_change_type(): not marked GPL-only,
 * integer return, context pointer as argument 1, argument 2 declared as
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_skb_change_type_proto = {
        .func                = bpf_skb_change_type,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Size of the fixed L3 header implied by skb->protocol: sizeof(struct
 * iphdr) for IPv4, sizeof(struct ipv6hdr) for IPv6, and ~0U for anything
 * else.
 */
static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
{
        if (skb->protocol == htons(ETH_P_IP))
                return sizeof(struct iphdr);

        if (skb->protocol == htons(ETH_P_IPV6))
                return sizeof(struct ipv6hdr);

        return ~0U;
}

/* Both outer-L3 encapsulation selectors (IPv4 and IPv6). */
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK        (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
                                         BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)

/* Both L3 decapsulation selectors (IPv4 and IPv6). */
#define BPF_F_ADJ_ROOM_DECAP_L3_MASK        (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
                                         BPF_F_ADJ_ROOM_DECAP_L3_IPV6)

/* Union of the adjust-room flag bits listed here, including the whole
 * inner L2 length field. BPF_F_ADJ_ROOM_NO_CSUM_RESET is not part of this
 * mask; the users below OR it in separately.
 */
#define BPF_F_ADJ_ROOM_MASK                (BPF_F_ADJ_ROOM_FIXED_GSO | \
                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
                                          BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
                                         BPF_F_ADJ_ROOM_DECAP_L3_MASK)

/* Insert @len_diff zeroed bytes at offset @off of @skb and, when one of
 * the ENCAP_L3 flags is set, record the old headers as inner headers.
 *
 * @skb:      packet to grow
 * @off:      offset from the start of the packet data where room is made
 * @len_diff: number of bytes to insert
 * @flags:    BPF_F_ADJ_ROOM_* flags; the bits above
 *            BPF_ADJ_ROOM_ENCAP_L2_SHIFT carry the inner mac length
 *
 * Returns 0 on success, or:
 *   -ENOTSUPP  GSO skb that is not TCP, unless it is UDP_L4 and FIXED_GSO
 *              is set; or encap requested on a non-IPv4/IPv6 skb
 *   -EINVAL    both ENCAP_L3 flags, or both ENCAP_L4 flags, set at once;
 *              ENCAP_L2_ETH with an inner mac length below ETH_HLEN;
 *              inner mac length larger than @len_diff
 *   -EALREADY  skb->encapsulation is already set
 *   or the error from skb_cow_head(), bpf_skb_net_hdr_push() or
 *   skb_linearize().
 */
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
                            u64 flags)
{
        u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
        bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
        u16 mac_len = 0, inner_net = 0, inner_trans = 0;
        unsigned int gso_type = SKB_GSO_DODGY;
        int ret;

        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
                /* udp gso_size delineates datagrams, only allow if fixed */
                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
                        return -ENOTSUPP;
        }

        ret = skb_cow_head(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (encap) {
                if (skb->protocol != htons(ETH_P_IP) &&
                    skb->protocol != htons(ETH_P_IPV6))
                        return -ENOTSUPP;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        return -EINVAL;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        return -EINVAL;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
                    inner_mac_len < ETH_HLEN)
                        return -EINVAL;

                if (skb->encapsulation)
                        return -EALREADY;

                /* Snapshot the header offsets before the push below
                 * changes them; they become the inner header offsets.
                 */
                mac_len = skb->network_header - skb->mac_header;
                inner_net = skb->network_header;
                if (inner_mac_len > len_diff)
                        return -EINVAL;
                inner_trans = skb->transport_header;
        }

        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (encap) {
                skb->inner_mac_header = inner_net - inner_mac_len;
                skb->inner_network_header = inner_net;
                skb->inner_transport_header = inner_trans;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
                        skb_set_inner_protocol(skb, htons(ETH_P_TEB));
                else
                        skb_set_inner_protocol(skb, skb->protocol);

                skb->encapsulation = 1;
                skb_set_network_header(skb, mac_len);

                /* Pick exactly one tunnel GSO type; L4 flags take
                 * precedence over the L3-only ones.
                 */
                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        gso_type |= SKB_GSO_UDP_TUNNEL;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
                        gso_type |= SKB_GSO_GRE;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        gso_type |= SKB_GSO_IPXIP6;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
                        gso_type |= SKB_GSO_IPXIP4;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
                        /* Outer transport header follows the fixed-size
                         * outer network header.
                         */
                        int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
                                        sizeof(struct ipv6hdr) :
                                        sizeof(struct iphdr);

                        skb_set_transport_header(skb, mac_len + nh_len);
                }

                /* Match skb->protocol to new outer l3 protocol */
                if (skb->protocol == htons(ETH_P_IP) &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        bpf_skb_change_protocol(skb, ETH_P_IPV6);
                else if (skb->protocol == htons(ETH_P_IPV6) &&
                         flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
                        bpf_skb_change_protocol(skb, ETH_P_IP);
        }

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* Header must be checked, and gso_segs recomputed. */
                shinfo->gso_type |= gso_type;
                shinfo->gso_segs = 0;

                /* Due to header growth, MSS needs to be downgraded.
                 * There is a BUG_ON() when segmenting the frag_list with
                 * head_frag true, so linearize the skb after downgrading
                 * the MSS.
                 */
                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
                        skb_decrease_gso_size(shinfo, len_diff);
                        if (shinfo->frag_list)
                                return skb_linearize(skb);
                }
        }

        return 0;
}

/* Remove @len_diff bytes at offset @off of @skb.
 *
 * Only FIXED_GSO, the DECAP_L3 selectors and NO_CSUM_RESET are accepted
 * in @flags (-EINVAL otherwise). A GSO skb that is not TCP is refused
 * with -ENOTSUPP unless it is UDP_L4 and FIXED_GSO is set. Returns 0 on
 * success or the error from skb_unclone() / bpf_skb_net_hdr_pop().
 */
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
                              u64 flags)
{
        const u64 allowed = BPF_F_ADJ_ROOM_FIXED_GSO |
                            BPF_F_ADJ_ROOM_DECAP_L3_MASK |
                            BPF_F_ADJ_ROOM_NO_CSUM_RESET;
        const bool fixed_gso = flags & BPF_F_ADJ_ROOM_FIXED_GSO;
        struct skb_shared_info *shinfo;
        int err;

        if (unlikely(flags & ~allowed))
                return -EINVAL;

        /* udp gso_size delineates datagrams, only allow if fixed */
        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4))
                        return -ENOTSUPP;
                if (!fixed_gso)
                        return -ENOTSUPP;
        }

        err = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(err < 0))
                return err;

        err = bpf_skb_net_hdr_pop(skb, off, len_diff);
        if (unlikely(err < 0))
                return err;

        /* Match skb->protocol to new outer l3 protocol */
        if (skb->protocol == htons(ETH_P_IP)) {
                if (flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
                        bpf_skb_change_protocol(skb, ETH_P_IPV6);
        } else if (skb->protocol == htons(ETH_P_IPV6)) {
                if (flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
                        bpf_skb_change_protocol(skb, ETH_P_IP);
        }

        if (!skb_is_gso(skb))
                return 0;

        shinfo = skb_shinfo(skb);

        /* Due to header shrink, MSS can be upgraded. */
        if (!fixed_gso)
                skb_increase_gso_size(shinfo, len_diff);

        /* Header must be checked, and gso_segs recomputed. */
        shinfo->gso_type |= SKB_GSO_DODGY;
        shinfo->gso_segs = 0;

        return 0;
}

/* Upper bound on skb length used by the room/tail/head adjusting helpers
 * below; aliases SKB_MAX_ALLOC.
 */
#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC

/* Grow (@len_diff > 0) or shrink (@len_diff < 0) @skb at the front of its
 * data.
 *
 * @mode and @flags must both be zero (-EINVAL otherwise) and |@len_diff|
 * may not exceed 0xfff (-EFAULT). Growing zero-fills the new bytes.
 * Shrinking fails with -ENOMEM when pskb_may_pull() cannot provide the
 * bytes. When the owning socket has a TLS software RX context, the
 * strparser message length is adjusted by @len_diff as well.
 */
BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
{
        u32 len_diff_abs = abs(len_diff);
        bool shrink = len_diff < 0;
        int ret = 0;

        if (unlikely(flags || mode))
                return -EINVAL;
        if (unlikely(len_diff_abs > 0xfffU))
                return -EFAULT;

        if (shrink) {
                if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
                        return -ENOMEM;
                __skb_pull(skb, len_diff_abs);
        } else {
                ret = skb_cow(skb, len_diff);
                if (unlikely(ret < 0))
                        return ret;
                __skb_push(skb, len_diff_abs);
                memset(skb->data, 0, len_diff_abs);
        }

        if (tls_sw_has_ctx_rx(skb->sk))
                strp_msg(skb)->full_len += len_diff;

        return ret;
}

/* Helper descriptor for sk_skb_adjust_room(): not marked GPL-only,
 * integer return, context pointer as argument 1, arguments 2-4 declared
 * as ARG_ANYTHING.
 */
static const struct bpf_func_proto sk_skb_adjust_room_proto = {
        .func                = sk_skb_adjust_room,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Grow or shrink @skb by @len_diff bytes at the layer selected by @mode.
 *
 * @len_diff: signed size change; |len_diff| may not exceed 0xfff
 * @mode:     BPF_ADJ_ROOM_MAC (right after the mac header) or
 *            BPF_ADJ_ROOM_NET (after the fixed L3 header)
 * @flags:    subset of BPF_F_ADJ_ROOM_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET
 *
 * Returns 0 on success, -EINVAL for unknown flags or a bad DECAP_L3
 * combination (both selectors, or decap while growing), -EFAULT for an
 * oversized @len_diff, -ENOTSUPP for a non-IPv4/IPv6 skb, an unknown
 * @mode or a resulting length out of bounds, or the error from the
 * grow/shrink worker. The context data pointers are recomputed once the
 * worker has been called, whatever its result.
 */
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
{
        const u64 decap = flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK;
        u32 len_cur, len_diff_abs = abs(len_diff);
        u32 len_min = bpf_skb_net_base_len(skb);
        u32 len_max = BPF_SKB_MAX_LEN;
        __be16 proto = skb->protocol;
        bool shrink = len_diff < 0;
        u32 off;
        int ret;

        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
                return -EINVAL;
        if (unlikely(len_diff_abs > 0xfffU))
                return -EFAULT;
        if (unlikely(proto != htons(ETH_P_IP) &&
                     proto != htons(ETH_P_IPV6)))
                return -ENOTSUPP;

        /* Work out where the room is added or removed. */
        off = skb_mac_header_len(skb);
        if (mode == BPF_ADJ_ROOM_NET)
                off += bpf_skb_net_base_len(skb);
        else if (mode != BPF_ADJ_ROOM_MAC)
                return -ENOTSUPP;

        /* Decap is only meaningful when shrinking, and selects the
         * minimum length that must remain from the new outer protocol.
         */
        if (decap) {
                if (!shrink)
                        return -EINVAL;

                if (decap == BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
                        len_min = sizeof(struct iphdr);
                else if (decap == BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
                        len_min = sizeof(struct ipv6hdr);
                else
                        return -EINVAL;
        }

        len_cur = skb->len - skb_network_offset(skb);
        if (shrink) {
                if (len_diff_abs >= len_cur ||
                    len_cur - len_diff_abs < len_min)
                        return -ENOTSUPP;
        } else {
                if (skb->len + len_diff_abs > len_max &&
                    !skb_is_gso(skb))
                        return -ENOTSUPP;
        }

        if (shrink)
                ret = bpf_skb_net_shrink(skb, off, len_diff_abs, flags);
        else
                ret = bpf_skb_net_grow(skb, off, len_diff_abs, flags);

        if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
                __skb_reset_checksum_unnecessary(skb);

        bpf_compute_data_pointers(skb);

        return ret;
}

/* Helper descriptor for bpf_skb_adjust_room(): not marked GPL-only,
 * integer return, context pointer as argument 1, arguments 2-4 declared
 * as ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
        .func                = bpf_skb_adjust_room,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Smallest length @skb may be trimmed to.
 *
 * Starts at 0 and is replaced, in this order, by each of the following
 * that is positive: the network header offset, the transport header
 * offset (if that header was set), and for CHECKSUM_PARTIAL the end of
 * the checksum field. A later positive candidate overrides an earlier
 * one; non-positive candidates are ignored.
 */
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
        u32 floor = 0;
        int cand;

        cand = skb_network_offset(skb);
        if (cand > 0)
                floor = cand;

        if (skb_transport_header_was_set(skb)) {
                cand = skb_transport_offset(skb);
                if (cand > 0)
                        floor = cand;
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                cand = skb_checksum_start_offset(skb) +
                       skb->csum_offset + sizeof(__sum16);
                if (cand > 0)
                        floor = cand;
        }

        return floor;
}

/* Grow @skb to @new_len via __skb_grow_rcsum() and zero the bytes that
 * were added past the previous length. Returns the result of
 * __skb_grow_rcsum(); nothing is cleared on failure.
 */
static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
{
        const unsigned int prev_len = skb->len;
        int err;

        err = __skb_grow_rcsum(skb, new_len);
        if (err)
                return err;

        memset(skb->data + prev_len, 0, new_len - prev_len);

        return 0;
}

/* Trim @skb to @new_len; plain pass-through to __skb_trim_rcsum(), whose
 * result is returned.
 */
static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
{
        return __skb_trim_rcsum(skb, new_len);
}

/* Resize @skb to exactly @new_len by growing or trimming its tail.
 *
 * Returns -EINVAL when @flags is non-zero or @new_len lies outside
 * [__bpf_skb_min_len(), BPF_SKB_MAX_LEN], -ENOTSUPP for an encapsulated
 * skb, otherwise the result of making the skb writable and of the
 * grow/trim operation. On success a GSO skb gets its GSO state reset.
 */
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
                                        u64 flags)
{
        u32 max_len = BPF_SKB_MAX_LEN;
        u32 min_len = __bpf_skb_min_len(skb);
        int ret;

        if (unlikely(flags))
                return -EINVAL;
        if (unlikely(new_len > max_len || new_len < min_len))
                return -EINVAL;
        if (skb->encapsulation)
                return -ENOTSUPP;

        /* This helper only does the work needed to grow or trim the skb;
         * the eBPF program rewrites the contents through helpers such as
         * bpf_skb_store_bytes() and bpf_lX_csum_replace() rather than
         * passing a raw buffer in. It is a slow path helper intended for
         * replies with control messages.
         *
         * As with bpf_skb_change_proto(), it is kept minimal and free of
         * protocol specifics so that bpf_skb_store_bytes() remains the
         * only helper responsible for writing buffers.
         *
         * Since this is expected to be a slow path operation, the skb is
         * implicitly linearized and uncloned, and its offloads are
         * dropped.
         */
        ret = __bpf_try_make_writable(skb, skb->len);
        if (ret)
                return ret;

        if (new_len > skb->len)
                ret = bpf_skb_grow_rcsum(skb, new_len);
        else if (new_len < skb->len)
                ret = bpf_skb_trim_rcsum(skb, new_len);

        if (!ret && skb_is_gso(skb))
                skb_gso_reset(skb);

        return ret;
}

/* Resize @skb to @new_len through __bpf_skb_change_tail() and then
 * recompute the context data pointers, whether or not the resize
 * succeeded. Returns the result of __bpf_skb_change_tail().
 */
BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
{
        int ret = __bpf_skb_change_tail(skb, new_len, flags);

        bpf_compute_data_pointers(skb);
        return ret;
}

/* Prototype for bpf_skb_change_tail(): integer return, arg1 is the program
 * context (the skb), arg2 (new_len) and arg3 (flags) are ARG_ANYTHING
 * values. gpl_only is false.
 */
static const struct bpf_func_proto bpf_skb_change_tail_proto = {
        .func                = bpf_skb_change_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* sk_skb flavour of the change_tail helper: forwards straight to
 * __bpf_skb_change_tail() and, unlike bpf_skb_change_tail(), does not call
 * bpf_compute_data_pointers() afterwards.
 */
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
{
        return __bpf_skb_change_tail(skb, new_len, flags);
}

/* Prototype for sk_skb_change_tail(); same argument layout as
 * bpf_skb_change_tail_proto (context, new_len, flags) with an integer
 * return.
 */
static const struct bpf_func_proto sk_skb_change_tail_proto = {
        .func                = sk_skb_change_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Shared implementation of the *_change_head helpers: push @head_room
 * zeroed bytes in front of the current data and make them the mac header.
 *
 * Returns -EINVAL if @flags is non-zero, if @head_room is negative when
 * viewed as an int, if the resulting length wraps around, or if it exceeds
 * BPF_SKB_MAX_LEN on a non-GSO skb. Otherwise returns the skb_cow() result.
 */
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
                                        u64 flags)
{
        const u32 len_ceiling = BPF_SKB_MAX_LEN;
        const u32 grown_len = skb->len + head_room;
        int err;

        /* grown_len < skb->len detects u32 wrap-around of the sum above. */
        if (unlikely(flags || (int)head_room < 0 ||
                     (!skb_is_gso(skb) && grown_len > len_ceiling) ||
                     grown_len < skb->len))
                return -EINVAL;

        err = skb_cow(skb, head_room);
        if (unlikely(err))
                return err;

        /* Only expansion at the mac header is allowed for now, so
         * skb->protocol, the network header, etc. stay untouched. In
         * contrast to bpf_skb_change_tail() there is no need to linearize
         * or reset GSO here. The intended user is an L3 skb that has to
         * push a mac header before being redirected into an L2 device.
         */
        __skb_push(skb, head_room);
        memset(skb->data, 0, head_room);
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return 0;
}

/* bpf_skb_change_head() helper: run the shared head expansion, then call
 * bpf_compute_data_pointers() on @skb regardless of the outcome. The
 * expansion result is returned as-is.
 */
BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
{
        int err;

        err = __bpf_skb_change_head(skb, head_room, flags);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Prototype for bpf_skb_change_head(): integer return, arg1 is the program
 * context (the skb), arg2 (head_room) and arg3 (flags) are ARG_ANYTHING
 * values. gpl_only is false.
 */
static const struct bpf_func_proto bpf_skb_change_head_proto = {
        .func                = bpf_skb_change_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* sk_skb flavour of the change_head helper: forwards straight to
 * __bpf_skb_change_head() and, unlike bpf_skb_change_head(), does not call
 * bpf_compute_data_pointers() afterwards.
 */
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
{
        return __bpf_skb_change_head(skb, head_room, flags);
}

/* Prototype for sk_skb_change_head(); same argument layout as
 * bpf_skb_change_head_proto (context, head_room, flags) with an integer
 * return.
 */
static const struct bpf_func_proto sk_skb_change_head_proto = {
        .func                = sk_skb_change_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* bpf_xdp_get_buff_len() helper: return whatever xdp_get_buff_len() reports
 * for @xdp.
 */
BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
{
        return xdp_get_buff_len(xdp);
}

/* Prototype for bpf_xdp_get_buff_len() when called with the program
 * context as its only argument; integer return, gpl_only is false.
 */
static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
        .func                = bpf_xdp_get_buff_len,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* BTF id list holding struct xdp_buff, referenced by the prototype below. */
BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)

/* Non-static variant of the bpf_xdp_get_buff_len() prototype whose first
 * argument is a BTF-typed pointer (struct xdp_buff, via the id list above)
 * instead of the program context.
 * NOTE(review): .ret_type is not set explicitly here, unlike the other
 * prototypes nearby; it relies on zero-initialization - confirm that this
 * corresponds to RET_INTEGER.
 */
const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
        .func                = bpf_xdp_get_buff_len,
        .gpl_only        = false,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_xdp_get_buff_len_bpf_ids[0],
};

/* Number of bytes between data_meta and data, or 0 when
 * xdp_data_meta_unsupported() reports that @xdp has no usable metadata
 * area.
 */
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
        if (xdp_data_meta_unsupported(xdp))
                return 0;

        return xdp->data - xdp->data_meta;
}

/* bpf_xdp_adjust_head() helper: move xdp->data by @offset bytes (negative
 * grows the packet at the front, positive shrinks it) and carry any
 * metadata along so it stays directly in front of the data.
 *
 * Returns -EINVAL if the new data pointer would leave less than
 * sizeof(struct xdp_frame) plus the metadata length in front of it, or
 * fewer than ETH_HLEN bytes before data_end; 0 otherwise.
 */
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
        unsigned long meta_bytes = xdp_get_metalen(xdp);
        void *lowest_data = xdp->data_hard_start + sizeof(struct xdp_frame) +
                            meta_bytes;
        void *moved_data = xdp->data + offset;

        if (unlikely(moved_data < lowest_data))
                return -EINVAL;
        if (unlikely(moved_data > xdp->data_end - ETH_HLEN))
                return -EINVAL;

        /* Source and destination may overlap, hence memmove(). */
        if (meta_bytes)
                memmove(xdp->data_meta + offset, xdp->data_meta, meta_bytes);

        xdp->data_meta += offset;
        xdp->data = moved_data;

        return 0;
}

/* Prototype for bpf_xdp_adjust_head(): integer return, arg1 is the program
 * context (the xdp_buff), arg2 (offset) is an ARG_ANYTHING value.
 */
static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
        .func                = bpf_xdp_adjust_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Copy @len bytes between @buf and the packet held in @xdp, starting at
 * packet offset @off.
 *
 * @flush selects the direction: true writes @buf into the packet, false
 * reads the packet into @buf. If the whole range lies in the linear area a
 * single memcpy() is done; otherwise the linear area and then each fragment
 * are visited in order and the overlapping part of every segment is copied.
 * No bounds validation is performed here beyond stopping at the last
 * fragment.
 */
void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
                      void *buf, unsigned long len, bool flush)
{
        unsigned long seg_len, seg_start = 0;
        skb_frag_t *frag, *frag_end;
        struct skb_shared_info *sinfo;
        u8 *seg;

        if (likely(xdp->data_end - xdp->data >= off + len)) {
                if (flush)
                        memcpy(xdp->data + off, buf, len);
                else
                        memcpy(buf, xdp->data + off, len);
                return;
        }

        sinfo = xdp_get_shared_info_from_buff(xdp);
        frag = &sinfo->frags[0];
        frag_end = &sinfo->frags[sinfo->nr_frags];

        /* The first segment is the linear area. */
        seg = xdp->data;
        seg_len = xdp->data_end - xdp->data;

        for (;;) {
                if (off < seg_start + seg_len) {
                        unsigned long rel = off - seg_start;
                        unsigned long chunk = min(len, seg_len - rel);

                        if (flush)
                                memcpy(seg + rel, buf, chunk);
                        else
                                memcpy(buf, seg + rel, chunk);

                        off += chunk;
                        len -= chunk;
                        buf += chunk;
                }

                if (!len || frag == frag_end)
                        break;

                /* Advance to the next fragment. */
                seg_start += seg_len;
                seg = skb_frag_address(frag);
                seg_len = skb_frag_size(frag);
                frag++;
        }
}

/* Find a direct pointer to @len bytes at packet offset @offset of @xdp.
 *
 * Returns ERR_PTR(-EFAULT) if @offset or @len exceeds 0xffff,
 * ERR_PTR(-EINVAL) if the range runs past xdp_get_buff_len(), a pointer
 * into the linear area or a single fragment when the range is fully
 * contained in it, and NULL when the range straddles a segment boundary
 * (callers in this file then fall back to bpf_xdp_copy_buf()).
 */
void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
{
        u32 seg_size = xdp->data_end - xdp->data;
        struct skb_shared_info *sinfo;
        void *seg_base = xdp->data;
        int i;

        if (unlikely(offset > 0xffff || len > 0xffff))
                return ERR_PTR(-EFAULT);

        if (unlikely(offset + len > xdp_get_buff_len(xdp)))
                return ERR_PTR(-EINVAL);

        if (unlikely(offset >= seg_size)) {
                /* Not in the linear area: look for the fragment that holds
                 * @offset, making it relative to that fragment on the way.
                 */
                sinfo = xdp_get_shared_info_from_buff(xdp);
                offset -= seg_size;
                for (i = 0; i < sinfo->nr_frags; i++) {
                        u32 frag_size = skb_frag_size(&sinfo->frags[i]);

                        if (offset < frag_size) {
                                seg_base = skb_frag_address(&sinfo->frags[i]);
                                seg_size = frag_size;
                                break;
                        }
                        offset -= frag_size;
                }
        }

        if (offset + len <= seg_size)
                return seg_base + offset;

        return NULL;
}

/* bpf_xdp_load_bytes() helper: copy @len bytes at packet offset @offset of
 * @xdp into @buf.
 *
 * Returns the error encoded by bpf_xdp_pointer() if the range is rejected,
 * 0 otherwise. A direct memcpy() is used when the range is contiguous; the
 * segment-walking bpf_xdp_copy_buf() is used when it is not.
 */
BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
           void *, buf, u32, len)
{
        void *src = bpf_xdp_pointer(xdp, offset, len);

        if (IS_ERR(src))
                return PTR_ERR(src);

        if (src)
                memcpy(buf, src, len);
        else
                bpf_xdp_copy_buf(xdp, offset, buf, len, false);

        return 0;
}

/* Prototype for bpf_xdp_load_bytes(): arg1 is the program context, arg2 the
 * packet offset, arg3 the destination buffer (declared as uninitialized
 * memory because the helper writes into it) and arg4 its constant size.
 */
static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
        .func                = bpf_xdp_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Non-static entry point with a plain C signature that forwards to
 * ____bpf_xdp_load_bytes() and returns its result.
 * NOTE(review): ____bpf_xdp_load_bytes is presumably the typed inner
 * function emitted by BPF_CALL_4(bpf_xdp_load_bytes, ...) - confirm.
 */
int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
{
        return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
}

/* bpf_xdp_store_bytes() helper: copy @len bytes from @buf into the packet
 * of @xdp at offset @offset.
 *
 * Returns the error encoded by bpf_xdp_pointer() if the range is rejected,
 * 0 otherwise. A direct memcpy() is used when the range is contiguous; the
 * segment-walking bpf_xdp_copy_buf() is used when it is not.
 */
BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
           void *, buf, u32, len)
{
        void *dst = bpf_xdp_pointer(xdp, offset, len);

        if (IS_ERR(dst))
                return PTR_ERR(dst);

        if (dst)
                memcpy(dst, buf, len);
        else
                bpf_xdp_copy_buf(xdp, offset, buf, len, true);

        return 0;
}

static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
        .func                = bpf_xdp_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Non-static entry point with a plain C signature that forwards to
 * ____bpf_xdp_store_bytes() and returns its result.
 * NOTE(review): ____bpf_xdp_store_bytes is presumably the typed inner
 * function emitted by BPF_CALL_4(bpf_xdp_store_bytes, ...) - confirm.
 */
int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
{
        return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
}

/* Grow the last fragment of a multi-fragment @xdp by @offset bytes.
 *
 * Returns -EOPNOTSUPP if the rx queue advertises no fragment size or one
 * larger than xdp->frame_sz, -EINVAL if @offset exceeds the room left in
 * the last fragment, 0 on success. The newly exposed bytes are zeroed.
 */
static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
{
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        skb_frag_t *last = &sinfo->frags[sinfo->nr_frags - 1];
        struct xdp_rxq_info *rxq = xdp->rxq;
        unsigned int tailroom;

        if (!rxq->frag_size)
                return -EOPNOTSUPP;
        if (rxq->frag_size > xdp->frame_sz)
                return -EOPNOTSUPP;

        tailroom = rxq->frag_size - skb_frag_size(last) - skb_frag_off(last);
        if (unlikely(offset > tailroom))
                return -EINVAL;

        /* Clear the area being added before it becomes part of the frag. */
        memset(skb_frag_address(last) + skb_frag_size(last), 0, offset);
        skb_frag_size_add(last, offset);
        sinfo->xdp_frags_size += offset;

        /* For XSK pool memory the tail xdp_buff is extended as well. */
        if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
                xsk_buff_get_tail(xdp)->data_end += offset;

        return 0;
}

/* XSK buffer pool part of bpf_xdp_shrink_data(): pick the tail or head
 * xdp_buff of @xdp (per @tail) and either delete it from the frag list
 * (@release) or shrink it by @shrink bytes at the matching end.
 *
 * Returns the xdp_buff that was picked.
 */
static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
                                               bool tail, bool release)
{
        struct xdp_buff *zc_frag;

        if (tail)
                zc_frag = xsk_buff_get_tail(xdp);
        else
                zc_frag = xsk_buff_get_head(xdp);

        if (release) {
                xsk_buff_del_frag(zc_frag);
                return zc_frag;
        }

        if (tail)
                zc_frag->data_end -= shrink;
        else
                zc_frag->data += shrink;

        return zc_frag;
}

/* Remove @shrink bytes from @frag, at its end when @tail is true and at
 * its start otherwise.
 *
 * If @shrink equals the fragment size the fragment is handed to
 * __xdp_return() and true is returned; otherwise its size (and, for head
 * shrinks, its offset) is adjusted and false is returned. For XSK buffer
 * pool memory the matching zero-copy xdp_buff is updated first and the
 * netmem reference passed to __xdp_return() is 0.
 */
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
                                int shrink, bool tail)
{
        enum xdp_mem_type mem_type = xdp->rxq->mem.type;
        bool whole_frag = skb_frag_size(frag) == shrink;
        netmem_ref netmem = skb_frag_netmem(frag);
        struct xdp_buff *zc_frag = NULL;

        if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
                netmem = 0;
                zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, whole_frag);
        }

        if (whole_frag) {
                __xdp_return(netmem, mem_type, false, zc_frag);
                return true;
        }

        if (!tail)
                skb_frag_off_add(frag, shrink);
        skb_frag_size_sub(frag, shrink);

        return false;
}

/* Shrink a multi-fragment @xdp by @offset bytes from its end.
 *
 * Fragments are consumed from the last one backwards; fully consumed ones
 * are released. If no fragment remains, the frags flags are cleared and
 * whatever is left of @offset is taken from the linear area.
 *
 * Returns -EINVAL if fewer than ETH_HLEN bytes would remain, 0 otherwise.
 */
static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
{
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        int released_frags = 0, released_bytes = 0;
        int idx = sinfo->nr_frags - 1;

        if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
                return -EINVAL;

        while (idx >= 0 && offset > 0) {
                skb_frag_t *frag = &sinfo->frags[idx];
                int shrink = min_t(int, offset, skb_frag_size(frag));

                released_bytes += shrink;
                offset -= shrink;
                if (bpf_xdp_shrink_data(xdp, frag, shrink, true))
                        released_frags++;
                idx--;
        }
        sinfo->nr_frags -= released_frags;
        sinfo->xdp_frags_size -= released_bytes;

        if (unlikely(!sinfo->nr_frags)) {
                xdp_buff_clear_frags_flag(xdp);
                xdp_buff_clear_frag_pfmemalloc(xdp);
                xdp->data_end -= offset;
        }

        return 0;
}

/* bpf_xdp_adjust_tail() helper: move the end of the packet by @offset
 * bytes (positive grows, negative shrinks).
 *
 * Buffers with fragments are delegated to the frags shrink/increase
 * helpers. For linear buffers -EINVAL is returned if the new end would pass
 * xdp_data_hard_end() or leave fewer than ETH_HLEN bytes of data; on grow
 * the added bytes are zeroed.
 */
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
        /* xdp_data_hard_end() is derived from xdp->frame_sz and already
         * leaves some tailroom reserved.
         */
        void *hard_limit = xdp_data_hard_end(xdp);
        void *moved_end = xdp->data_end + offset;

        if (unlikely(xdp_buff_has_frags(xdp)))
                return offset < 0 ? bpf_xdp_frags_shrink_tail(xdp, -offset) :
                                    bpf_xdp_frags_increase_tail(xdp, offset);

        if (unlikely(moved_end > hard_limit ||
                     moved_end < xdp->data + ETH_HLEN))
                return -EINVAL;

        /* The grown area may hold uninitialized kernel memory; clear it. */
        if (offset > 0)
                memset(xdp->data_end, 0, offset);

        xdp->data_end = moved_end;

        return 0;
}

/* Prototype for bpf_xdp_adjust_tail(): integer return, arg1 is the program
 * context (the xdp_buff), arg2 (offset) is an ARG_ANYTHING value.
 */
static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
        .func                = bpf_xdp_adjust_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* bpf_xdp_adjust_meta() helper: move xdp->data_meta by @offset bytes.
 *
 * Returns -ENOTSUPP if xdp_data_meta_unsupported() says so, -EINVAL if the
 * new pointer would fall before the end of the struct xdp_frame area at
 * data_hard_start or after xdp->data, -EACCES if xdp_metalen_invalid()
 * rejects the resulting metadata length, 0 on success.
 */
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
        void *lowest_meta = xdp->data_hard_start + sizeof(struct xdp_frame);
        void *moved_meta = xdp->data_meta + offset;
        unsigned long meta_bytes = xdp->data - moved_meta;

        if (xdp_data_meta_unsupported(xdp))
                return -ENOTSUPP;
        if (unlikely(moved_meta < lowest_meta))
                return -EINVAL;
        if (unlikely(moved_meta > xdp->data))
                return -EINVAL;
        if (unlikely(xdp_metalen_invalid(meta_bytes)))
                return -EACCES;

        xdp->data_meta = moved_meta;

        return 0;
}

/* Prototype for bpf_xdp_adjust_meta(): integer return, arg1 is the program
 * context (the xdp_buff), arg2 (offset) is an ARG_ANYTHING value.
 */
static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
        .func                = bpf_xdp_adjust_meta,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/**
 * DOC: xdp redirect
 *
 * XDP_REDIRECT works by a three-step process, implemented in the functions
 * below:
 *
 * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
 *    of the redirect and store it (along with some other metadata) in a per-CPU
 *    struct bpf_redirect_info.
 *
 * 2. When the program returns the XDP_REDIRECT return code, the driver will
 *    call xdp_do_redirect() which will use the information in struct
 *    bpf_redirect_info to actually enqueue the frame into a map type-specific
 *    bulk queue structure.
 *
 * 3. Before exiting its NAPI poll loop, the driver will call
 *    xdp_do_flush(), which will flush all the different bulk queues,
 *    thus completing the redirect. Note that xdp_do_flush() must be
 *    called before napi_complete_done() in the driver, as the
 *    XDP_REDIRECT logic relies on being inside a single NAPI instance
 *    through to the xdp_do_flush() call for RCU protection of all
 *    in-kernel data structures.
 */
/*
 * Pointers to the map entries will be kept around for this whole sequence of
 * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
 * the core code; instead, the RCU protection relies on everything happening
 * inside a single NAPI poll sequence, which means it's between a pair of calls
 * to local_bh_disable()/local_bh_enable().
 *
 * The map entries are marked as __rcu and the map code makes sure to
 * dereference those pointers with rcu_dereference_check() in a way that works
 * for both sections that hold an rcu_read_lock() and sections that are
 * called from NAPI without a separate rcu_read_lock(). The code below does not
 * use RCU annotations, but relies on those in the map code.
 */
/* Flush every bulk queue list that bpf_net_ctx_get_all_used_flush_lists()
 * reports as in use: the devmap list first, then the cpumap list, then the
 * xskmap list. Lists reported as NULL are skipped.
 */
void xdp_do_flush(void)
{
        struct list_head *cpu_map_list;
        struct list_head *dev_list;
        struct list_head *xsk_list;

        bpf_net_ctx_get_all_used_flush_lists(&cpu_map_list, &dev_list,
                                             &xsk_list);

        if (dev_list)
                __dev_flush(dev_list);
        if (cpu_map_list)
                __cpu_map_flush(cpu_map_list);
        if (xsk_list)
                __xsk_map_flush(xsk_list);
}
EXPORT_SYMBOL_GPL(xdp_do_flush);

#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
/* Debug aid: flush any bulk queue list that is still in use and warn once,
 * naming @napi's poll function, if at least one such list was found.
 */
void xdp_do_check_flushed(struct napi_struct *napi)
{
        struct list_head *cpu_map_list, *dev_list, *xsk_list;
        bool missed;

        bpf_net_ctx_get_all_used_flush_lists(&cpu_map_list, &dev_list,
                                             &xsk_list);
        missed = dev_list || cpu_map_list || xsk_list;

        if (dev_list)
                __dev_flush(dev_list);
        if (cpu_map_list)
                __cpu_map_flush(cpu_map_list);
        if (xsk_list)
                __xsk_map_flush(xsk_list);

        WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
                  napi->poll);
}
#endif

/* Static key, initially false, exported to GPL modules.
 * NOTE(review): presumably enabled while some master device wants
 * xdp_master_redirect() handling - confirm against the users of this key.
 */
DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);

/* Ask the master of the receiving device which slave should transmit @xdp.
 *
 * Returns XDP_TX when no slave is reported or the slave is the receiving
 * device itself. Otherwise the slave's ifindex is recorded as an
 * ifindex-based redirect target (map type UNSPEC, map_id INT_MAX) and
 * XDP_REDIRECT is returned.
 */
u32 xdp_master_redirect(struct xdp_buff *xdp)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct net_device *master, *slave;

        master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
        slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
        if (!slave || slave == xdp->rxq->dev)
                return XDP_TX;

        /* The target differs from the receiving device, so redirect to it.
         * Going through XDP_REDIRECT makes XDP enabled drivers unmap the
         * packet from their rx ring correctly.
         */
        ri->tgt_index = slave->ifindex;
        ri->map_id = INT_MAX;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        return XDP_REDIRECT;
}
EXPORT_SYMBOL_GPL(xdp_master_redirect);

/* Redirect @xdp to the xskmap target recorded in @ri.
 *
 * The map type, target and map id are snapshotted from @ri and the map id
 * and map type in @ri are reset before the redirect is attempted. Returns
 * the __xsk_map_redirect() result; the matching redirect trace hook is
 * invoked for both success and failure.
 */
static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
                                        const struct net_device *dev,
                                        struct xdp_buff *xdp,
                                        const struct bpf_prog *xdp_prog)
{
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        err = __xsk_map_redirect(fwd, xdp);
        if (unlikely(err)) {
                _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type,
                                            map_id, ri->tgt_index, err);
                return err;
        }

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id,
                                ri->tgt_index);
        return 0;
}

/* Enqueue an already converted XDP frame to the redirect target recorded in
 * @ri.
 *
 * The map type, target value, map id and flags are snapshotted from @ri and
 * the corresponding @ri fields are reset before anything else happens. A
 * NULL @xdpf yields -EOVERFLOW. Depending on the snapshotted map type the
 * frame is enqueued to a devmap entry (or to the whole map when
 * BPF_F_BROADCAST is set), to a cpumap entry, or - for BPF_MAP_TYPE_UNSPEC
 * with map_id == INT_MAX - to the device looked up via ri->tgt_index. Every
 * other combination yields -EBADRQC.
 *
 * Returns 0 on success or a negative errno; the matching redirect trace
 * hook is invoked on both paths.
 */
static __always_inline int
__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
                        struct xdp_frame *xdpf,
                        const struct bpf_prog *xdp_prog)
{
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        u32 flags = ri->flags;
        struct bpf_map *map;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->flags = 0;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        /* Callers pass the result of a buff-to-frame conversion here, which
         * may be NULL.
         */
        if (unlikely(!xdpf)) {
                err = -EOVERFLOW;
                goto err;
        }

        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
                if (unlikely(flags & BPF_F_BROADCAST)) {
                        map = READ_ONCE(ri->map);

                        /* The map pointer is cleared when the map is being torn
                         * down by dev_map_free()
                         */
                        if (unlikely(!map)) {
                                err = -ENOENT;
                                break;
                        }

                        WRITE_ONCE(ri->map, NULL);
                        err = dev_map_enqueue_multi(xdpf, dev, map,
                                                    flags & BPF_F_EXCLUDE_INGRESS);
                } else {
                        err = dev_map_enqueue(fwd, xdpf, dev);
                }
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_enqueue(fwd, xdpf, dev);
                break;
        case BPF_MAP_TYPE_UNSPEC:
                /* map_id == INT_MAX marks an ifindex based redirect. */
                if (map_id == INT_MAX) {
                        fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
                        if (unlikely(!fwd)) {
                                err = -EINVAL;
                                break;
                        }
                        err = dev_xdp_enqueue(fwd, xdpf, dev);
                        break;
                }
                fallthrough;
        default:
                err = -EBADRQC;
        }

        if (unlikely(err))
                goto err;

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
err:
        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
}

/* Carry out the redirect recorded in the per-context redirect info for
 * @xdp. xskmap targets take the xdp_buff directly; every other target gets
 * the buffer converted to an xdp_frame first.
 *
 * Returns 0 on success or a negative errno from the chosen path.
 */
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    const struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct xdp_frame *xdpf;

        if (ri->map_type == BPF_MAP_TYPE_XSKMAP)
                return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);

        /* A failed conversion (NULL) is handled by the frame path. */
        xdpf = xdp_convert_buff_to_frame(xdp);

        return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);

/* Variant of xdp_do_redirect() for callers that already hold the converted
 * frame @xdpf. xskmap targets still use @xdp; all other targets use @xdpf.
 *
 * Returns 0 on success or a negative errno from the chosen path.
 */
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
                          struct xdp_frame *xdpf,
                          const struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        if (ri->map_type != BPF_MAP_TYPE_XSKMAP)
                return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);

        return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);

/* Map based part of the generic (skb) XDP redirect.
 *
 * @fwd, @map_type, @map_id and @flags are the values the caller snapshotted
 * from the redirect info. Devmap targets go through the devmap generic
 * redirect (or the multi variant for BPF_F_BROADCAST), xskmap targets
 * through xsk_generic_rcv() followed by consume_skb(), cpumap targets
 * through cpu_map_generic_redirect(). Any other map type yields -EBADRQC.
 *
 * Returns 0 on success or a negative errno; the matching redirect trace
 * hook is invoked on both paths.
 */
static int xdp_do_generic_redirect_map(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct xdp_buff *xdp,
                                       const struct bpf_prog *xdp_prog,
                                       void *fwd, enum bpf_map_type map_type,
                                       u32 map_id, u32 flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct bpf_map *map;
        int err;

        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
                if (unlikely(flags & BPF_F_BROADCAST)) {
                        map = READ_ONCE(ri->map);

                        /* The map pointer is cleared when the map is being torn
                         * down by dev_map_free()
                         */
                        if (unlikely(!map)) {
                                err = -ENOENT;
                                break;
                        }

                        WRITE_ONCE(ri->map, NULL);
                        err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
                                                     flags & BPF_F_EXCLUDE_INGRESS);
                } else {
                        err = dev_map_generic_redirect(fwd, skb, xdp_prog);
                }
                if (unlikely(err))
                        goto err;
                break;
        case BPF_MAP_TYPE_XSKMAP:
                err = xsk_generic_rcv(fwd, xdp);
                if (err)
                        goto err;
                /* The skb is consumed here once xsk_generic_rcv() succeeded. */
                consume_skb(skb);
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_generic_redirect(fwd, skb);
                if (unlikely(err))
                        goto err;
                break;
        default:
                err = -EBADRQC;
                goto err;
        }

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
err:
        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
}

/* Generic (skb based) XDP redirect.
 *
 * Snapshots and resets the redirect info, then either hands over to the map
 * based path or, for an ifindex based redirect (map type UNSPEC with
 * map_id INT_MAX), looks up the target device, checks it with
 * xdp_ok_fwd_dev() and transmits the skb through generic_xdp_tx().
 *
 * Returns 0 on success, -EINVAL if the target device does not exist, or
 * the error reported by the check or the map based path.
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp,
                            const struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        u32 flags = ri->flags;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->flags = 0;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        if (map_type != BPF_MAP_TYPE_UNSPEC || map_id != INT_MAX)
                return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
                                                   fwd, map_type, map_id,
                                                   flags);

        /* ifindex based redirect */
        fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
        if (unlikely(!fwd)) {
                err = -EINVAL;
                goto out_err;
        }

        err = xdp_ok_fwd_dev(fwd, skb->len);
        if (unlikely(err))
                goto out_err;

        skb->dev = fwd;
        _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
        generic_xdp_tx(skb, xdp_prog);
        return 0;

out_err:
        _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
        return err;
}

/* XDP redirect-by-ifindex helper: record @ifindex as the redirect target in
 * the redirect info returned by bpf_net_ctx_get_ri() and return
 * XDP_REDIRECT. Any non-zero @flags yields XDP_ABORTED and leaves the
 * target fields untouched.
 */
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        if (unlikely(flags))
                return XDP_ABORTED;

        /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
         * by map_idr) is used for ifindex based XDP redirect.
         */
        ri->tgt_index = ifindex;
        ri->map_id = INT_MAX;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        return XDP_REDIRECT;
}

/* Prototype for bpf_xdp_redirect(): integer return, both arguments (ifindex
 * and flags) are ARG_ANYTHING values. gpl_only is false.
 */
static const struct bpf_func_proto bpf_xdp_redirect_proto = {
        .func           = bpf_xdp_redirect,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
        .arg2_type      = ARG_ANYTHING,
};

/* XDP redirect-via-map helper: dispatch to the map_redirect operation of
 * @map with @key and @flags and return its result. The map type specific
 * callback does all the work.
 */
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
           u64, flags)
{
        return map->ops->map_redirect(map, key, flags);
}

/* Prototype for bpf_xdp_redirect_map(): integer return, arg1 is a constant
 * map pointer, arg2 (key) and arg3 (flags) are ARG_ANYTHING values.
 */
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
        .func           = bpf_xdp_redirect_map,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* Copy callback handed to bpf_event_output(): place @len bytes of @skb,
 * starting at offset @off, into @dst_buff.
 *
 * Returns 0 on success and @len if skb_header_pointer() could not provide
 * the requested range.
 */
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
                                  unsigned long off, unsigned long len)
{
        void *src = skb_header_pointer(skb, off, len, dst_buff);

        if (unlikely(!src))
                return len;

        /* skb_header_pointer() may already have filled @dst_buff itself. */
        if (src == dst_buff)
                return 0;

        memcpy(dst_buff, src, len);
        return 0;
}

/* Event output helper for skb contexts: emit @meta (of @meta_size bytes)
 * through @map, followed by the first bytes of @skb as requested in the
 * upper 32 bits of @flags (BPF_F_CTXLEN_MASK).
 *
 * Returns -EINVAL for flag bits outside BPF_F_CTXLEN_MASK and
 * BPF_F_INDEX_MASK, -EFAULT if @skb is NULL or the requested length exceeds
 * skb->len, otherwise the bpf_event_output() result.
 */
BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        u64 ctx_len = (flags & BPF_F_CTXLEN_MASK) >> 32;

        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;
        if (unlikely(!skb))
                return -EFAULT;
        if (unlikely(ctx_len > skb->len))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, skb, ctx_len,
                                bpf_skb_copy);
}

/* Prototype for bpf_skb_event_output() with the program context as the skb
 * argument. arg4 (meta) is read-only memory whose size, possibly zero, is
 * given by arg5. gpl_only is true.
 */
static const struct bpf_func_proto bpf_skb_event_output_proto = {
        .func                = bpf_skb_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* BTF id list holding struct sk_buff, referenced by the prototype below. */
BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)

/* Non-static variant of the bpf_skb_event_output() prototype whose first
 * argument is a BTF-typed pointer (struct sk_buff, via the id list above)
 * instead of the program context. The remaining arguments match
 * bpf_skb_event_output_proto.
 */
const struct bpf_func_proto bpf_skb_output_proto = {
        .func                = bpf_skb_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_skb_output_btf_ids[0],
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Address family selected by the tunnel helper @flags: AF_INET6 when
 * BPF_F_TUNINFO_IPV6 is set, AF_INET otherwise.
 */
static unsigned short bpf_tunnel_key_af(u64 flags)
{
        if (flags & BPF_F_TUNINFO_IPV6)
                return AF_INET6;

        return AF_INET;
}

/* Tunnel key getter: fill @to (of @size bytes) with the tunnel metadata
 * attached to @skb.
 *
 * Returns 0 on success, -EINVAL if the skb carries no tunnel info, if
 * @flags has bits other than BPF_F_TUNINFO_IPV6 / BPF_F_TUNINFO_FLAGS, or
 * if @size is not one of the accepted structure sizes, and -EPROTO if the
 * address family implied by @flags does not match the tunnel info. On every
 * error path the first @size bytes of @to are zeroed.
 */
BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
           u32, size, u64, flags)
{
        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
        u8 compat[sizeof(struct bpf_tunnel_key)];
        void *to_orig = to;
        int err;

        if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
                                         BPF_F_TUNINFO_FLAGS)))) {
                err = -EINVAL;
                goto err_clear;
        }
        if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
                err = -EPROTO;
                goto err_clear;
        }
        /* Shorter, older layouts of struct bpf_tunnel_key are served by
         * filling a full-size on-stack copy and copying only the first
         * @size bytes back to the caller at the end.
         */
        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                err = -EINVAL;
                switch (size) {
                case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
                case offsetof(struct bpf_tunnel_key, tunnel_label):
                case offsetof(struct bpf_tunnel_key, tunnel_ext):
                        goto set_compat;
                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
                        /* Fixup deprecated structure layouts here, so we have
                         * a common path later on.
                         */
                        if (ip_tunnel_info_af(info) != AF_INET)
                                goto err_clear;
set_compat:
                        to = (struct bpf_tunnel_key *)compat;
                        break;
                default:
                        goto err_clear;
                }
        }

        to->tunnel_id = be64_to_cpu(info->key.tun_id);
        to->tunnel_tos = info->key.tos;
        to->tunnel_ttl = info->key.ttl;
        /* Tunnel flags are only reported on request; otherwise tunnel_ext is
         * cleared.
         */
        if (flags & BPF_F_TUNINFO_FLAGS)
                to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
        else
                to->tunnel_ext = 0;

        if (flags & BPF_F_TUNINFO_IPV6) {
                memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
                       sizeof(to->remote_ipv6));
                memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
                       sizeof(to->local_ipv6));
                to->tunnel_label = be32_to_cpu(info->key.label);
        } else {
                /* IPv4: zero the remaining three words of each address
                 * field after storing the address.
                 */
                to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
                to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
                memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
                to->tunnel_label = 0;
        }

        /* Compat path: hand back only as much as the caller asked for. */
        if (unlikely(size != sizeof(struct bpf_tunnel_key)))
                memcpy(to_orig, to, size);

        return 0;
err_clear:
        memset(to_orig, 0, size);
        return err;
}

/* Helper prototype for bpf_skb_get_tunnel_key(): takes the program context,
 * an uninitialized output buffer together with its constant size, and an
 * unconstrained flags value. Returns an integer.
 */
static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
        .func                = bpf_skb_get_tunnel_key,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Copy the tunnel options attached to @skb into @to.
 *
 * Returns the option length on success. On failure the whole @size bytes of
 * @to are zeroed and a negative errno is returned: -ENOENT when there is no
 * tunnel info or no options are present, -ENOMEM when @to is too small.
 * On success any bytes of @to beyond the options are zeroed as well.
 */
BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
        const struct ip_tunnel_info *tun = skb_tunnel_info(skb);
        int ret = -ENOENT;

        if (unlikely(!tun))
                goto zero_out;
        if (unlikely(!ip_tunnel_is_options_present(tun->key.tun_flags)))
                goto zero_out;

        ret = -ENOMEM;
        if (unlikely(size < tun->options_len))
                goto zero_out;

        ip_tunnel_info_opts_get(to, tun);

        /* Never leave the unused tail of the caller's buffer uninitialized. */
        if (size > tun->options_len)
                memset(to + tun->options_len, 0, size - tun->options_len);

        return tun->options_len;

zero_out:
        memset(to, 0, size);
        return ret;
}

/* Helper prototype for bpf_skb_get_tunnel_opt(): program context plus an
 * uninitialized output buffer and its constant size. Returns an integer.
 */
static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
        .func                = bpf_skb_get_tunnel_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
};

/* Per-CPU metadata dst shared by the set-tunnel helpers. It is allocated on
 * first use by bpf_get_skb_set_tunnel_proto() and stays NULL until then.
 */
static struct metadata_dst __percpu *md_dst;

/* Attach TX tunnel metadata to @skb, built from the caller-supplied key.
 *
 * @from/@size: tunnel key and its size. Besides the current size of
 *              struct bpf_tunnel_key, the shorter legacy layouts listed in
 *              the switch below are accepted and zero-padded to full size.
 * @flags:      any bit outside the BPF_F_* set checked below is rejected.
 *
 * Returns 0 on success, or -EINVAL for unsupported flags or size, for a
 * non-zero tunnel_label without BPF_F_TUNINFO_IPV6, or for a non-zero
 * tunnel_ext.
 */
BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
           const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
        struct metadata_dst *md = this_cpu_ptr(md_dst);
        u8 compat[sizeof(struct bpf_tunnel_key)];
        struct ip_tunnel_info *info;

        if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
                               BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
                               BPF_F_NO_TUNNEL_KEY)))
                return -EINVAL;
        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                switch (size) {
                case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
                case offsetof(struct bpf_tunnel_key, tunnel_label):
                case offsetof(struct bpf_tunnel_key, tunnel_ext):
                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
                        /* Fix up deprecated structure layouts here, so we
                         * have a common path later on: copy what the caller
                         * provided and zero the fields it does not know.
                         */
                        memcpy(compat, from, size);
                        memset(compat + size, 0, sizeof(compat) - size);
                        from = (const struct bpf_tunnel_key *) compat;
                        break;
                default:
                        return -EINVAL;
                }
        }
        /* A flow label only makes sense for IPv6; tunnel_ext must be zero. */
        if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
                     from->tunnel_ext))
                return -EINVAL;

        /* Replace the skb's current dst with this CPU's metadata dst. */
        skb_dst_drop(skb);
        dst_hold((struct dst_entry *) md);
        skb_dst_set(skb, (struct dst_entry *) md);

        /* Start from a zeroed tunnel info in TX mode. */
        info = &md->u.tun_info;
        memset(info, 0, sizeof(*info));
        info->mode = IP_TUNNEL_INFO_TX;

        /* Translate the BPF_F_* request flags into tunnel flag bits. Note
         * that CSUM and KEY are set unless the caller opted out.
         */
        __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
        __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags,
                     flags & BPF_F_DONT_FRAGMENT);
        __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags,
                     !(flags & BPF_F_ZERO_CSUM_TX));
        __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags,
                     flags & BPF_F_SEQ_NUMBER);
        __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags,
                     !(flags & BPF_F_NO_TUNNEL_KEY));

        info->key.tun_id = cpu_to_be64(from->tunnel_id);
        info->key.tos = from->tunnel_tos;
        info->key.ttl = from->tunnel_ttl;

        /* The key's remote address becomes the tunnel destination and its
         * local address the tunnel source.
         */
        if (flags & BPF_F_TUNINFO_IPV6) {
                info->mode |= IP_TUNNEL_INFO_IPV6;
                memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
                       sizeof(from->remote_ipv6));
                memcpy(&info->key.u.ipv6.src, from->local_ipv6,
                       sizeof(from->local_ipv6));
                info->key.label = cpu_to_be32(from->tunnel_label) &
                                  IPV6_FLOWLABEL_MASK;
        } else {
                info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
                info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
                info->key.flow_flags = FLOWI_FLAG_ANYSRC;
        }

        return 0;
}

/* Helper prototype for bpf_skb_set_tunnel_key(): program context, a
 * read-only key buffer with its constant size, and an unconstrained flags
 * value. Returns an integer.
 */
static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
        .func                = bpf_skb_set_tunnel_key,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Store @size bytes of tunnel options from @from into the tunnel info
 * attached to @skb.
 *
 * Returns 0 on success, -EINVAL when the skb's tunnel info is not this CPU's
 * metadata dst or @size is not a multiple of four, and -ENOMEM when @size
 * exceeds IP_TUNNEL_OPTS_MAX.
 */
BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
           const u8 *, from, u32, size)
{
        const struct metadata_dst *percpu_md = this_cpu_ptr(md_dst);
        struct ip_tunnel_info *tun = skb_tunnel_info(skb);
        IP_TUNNEL_DECLARE_FLAGS(opt_flags) = { };

        if (unlikely(tun != &percpu_md->u.tun_info))
                return -EINVAL;
        if (unlikely(size % sizeof(u32)))
                return -EINVAL;
        if (unlikely(size > IP_TUNNEL_OPTS_MAX))
                return -ENOMEM;

        ip_tunnel_set_options_present(opt_flags);
        ip_tunnel_info_opts_set(tun, from, size, opt_flags);

        return 0;
}

/* Helper prototype for bpf_skb_set_tunnel_opt(): program context plus a
 * read-only options buffer and its constant size. Returns an integer.
 */
static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
        .func                = bpf_skb_set_tunnel_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
};

static const struct bpf_func_proto *
bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
        if (!md_dst) {
                struct metadata_dst __percpu *tmp;

                tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
                                                METADATA_IP_TUNNEL,
                                                GFP_KERNEL);
                if (!tmp)
                        return NULL;
                if (cmpxchg(&md_dst, NULL, tmp))
                        metadata_dst_free_percpu(tmp);
        }

        switch (which) {
        case BPF_FUNC_skb_set_tunnel_key:
                return &bpf_skb_set_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                return &bpf_skb_set_tunnel_opt_proto;
        default:
                return NULL;
        }
}

/* Test whether the socket owning @skb is under the cgroup stored at slot
 * @idx of the array @map.
 *
 * Returns the result of sk_under_cgroup_hierarchy(), or -ENOENT when there
 * is no full socket, -E2BIG when @idx is out of range, and -EAGAIN when the
 * slot is empty.
 */
BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
           u32, idx)
{
        struct bpf_array *cg_array = container_of(map, struct bpf_array, map);
        struct sock *owner = skb_to_full_sk(skb);
        struct cgroup *target;

        if (!owner)
                return -ENOENT;
        if (!sk_fullsock(owner))
                return -ENOENT;

        if (unlikely(idx >= cg_array->map.max_entries))
                return -E2BIG;

        target = READ_ONCE(cg_array->ptrs[idx]);
        if (unlikely(!target))
                return -EAGAIN;

        return sk_under_cgroup_hierarchy(owner, target);
}

/* Helper prototype for bpf_skb_under_cgroup(): program context, a constant
 * map pointer, and an unconstrained index. Returns an integer.
 */
static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
        .func                = bpf_skb_under_cgroup,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

#ifdef CONFIG_SOCK_CGROUP_DATA
/* Return the id of the cgroup recorded in @sk's cgroup data, or 0 when @sk
 * does not resolve to a full socket.
 */
static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
        struct sock *full = sk_to_full_sk(sk);

        if (full && sk_fullsock(full))
                return cgroup_id(sock_cgroup_ptr(&full->sk_cgrp_data));

        return 0;
}

/* Cgroup id of the socket attached to @skb; see __bpf_sk_cgroup_id(). */
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
        struct sock *owner = skb->sk;

        return __bpf_sk_cgroup_id(owner);
}

/* Helper prototype for bpf_skb_cgroup_id(): takes only the program context
 * and returns an integer.
 */
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
        .func           = bpf_skb_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Return the id of the ancestor at @ancestor_level of @sk's cgroup.
 *
 * Returns 0 when @sk does not resolve to a full socket or when
 * cgroup_ancestor() finds no ancestor at that level.
 */
static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
                                              int ancestor_level)
{
        struct sock *full = sk_to_full_sk(sk);
        struct cgroup *anc;

        if (!full || !sk_fullsock(full))
                return 0;

        anc = cgroup_ancestor(sock_cgroup_ptr(&full->sk_cgrp_data),
                              ancestor_level);

        return anc ? cgroup_id(anc) : 0;
}

/* Ancestor cgroup id for the socket attached to @skb; see
 * __bpf_sk_ancestor_cgroup_id().
 */
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
           ancestor_level)
{
        struct sock *owner = skb->sk;

        return __bpf_sk_ancestor_cgroup_id(owner, ancestor_level);
}

/* Helper prototype for bpf_skb_ancestor_cgroup_id(): program context plus
 * an unconstrained ancestor level. Returns an integer.
 */
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
        .func           = bpf_skb_ancestor_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
};

/* Cgroup id of @sk; see __bpf_sk_cgroup_id(). */
BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
{
        u64 id = __bpf_sk_cgroup_id(sk);

        return id;
}

/* Helper prototype for bpf_sk_cgroup_id(): takes a BTF sock_common pointer
 * and returns an integer.
 */
static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
        .func           = bpf_sk_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};

/* Ancestor cgroup id of @sk; see __bpf_sk_ancestor_cgroup_id(). */
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
{
        u64 id = __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);

        return id;
}

/* Helper prototype for bpf_sk_ancestor_cgroup_id(): a BTF sock_common
 * pointer plus an unconstrained ancestor level. Returns an integer.
 */
static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
        .func           = bpf_sk_ancestor_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type      = ARG_ANYTHING,
};
#endif

/* Copy callback handed to bpf_event_output(): copies @len bytes starting at
 * offset @off of the xdp_buff behind @ctx into @dst. Always returns 0.
 */
static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
                                  unsigned long off, unsigned long len)
{
        bpf_xdp_copy_buf((struct xdp_buff *)ctx, off, dst, len, false);

        return 0;
}

/* Emit @meta, followed by a prefix of the XDP buffer, through
 * bpf_event_output().
 *
 * The number of packet bytes to append is taken from the BPF_F_CTXLEN_MASK
 * field of @flags. Returns -EINVAL for flag bits outside the context-length
 * and index masks, -EFAULT when @xdp is NULL or the requested length exceeds
 * the buffer length, otherwise the result of bpf_event_output().
 */
BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        const u64 allowed = BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK;
        u64 ctx_len;

        if (unlikely(flags & ~allowed))
                return -EINVAL;

        ctx_len = (flags & BPF_F_CTXLEN_MASK) >> 32;

        if (unlikely(!xdp))
                return -EFAULT;
        if (unlikely(ctx_len > xdp_get_buff_len(xdp)))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, xdp,
                                ctx_len, bpf_xdp_copy);
}

/* Helper prototype for bpf_xdp_event_output() when called with the program
 * context: context, constant map pointer, unconstrained flags, and a
 * read-only metadata buffer whose size may be zero. Marked gpl_only.
 */
static const struct bpf_func_proto bpf_xdp_event_output_proto = {
        .func                = bpf_xdp_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* BTF id of struct xdp_buff, used to type the first argument below. */
BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)

/* Variant prototype for bpf_xdp_event_output() whose first argument is a
 * BTF-typed struct xdp_buff pointer instead of the program context. The
 * remaining arguments match bpf_xdp_event_output_proto. Marked gpl_only.
 */
const struct bpf_func_proto bpf_xdp_output_proto = {
        .func                = bpf_xdp_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_xdp_output_btf_ids[0],
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Cookie of the socket attached to @skb, or 0 when the skb has no socket. */
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
        struct sock *owner = skb->sk;

        if (!owner)
                return 0;

        return __sock_gen_cookie(owner);
}

/* Helper prototype for bpf_get_socket_cookie(): takes only the program
 * context and returns an integer.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
        .func           = bpf_get_socket_cookie,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Cookie of the socket referenced by a sock_addr context. */
BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
        struct sock *sk = ctx->sk;

        return __sock_gen_cookie(sk);
}

/* Helper prototype for bpf_get_socket_cookie_sock_addr(): takes only the
 * program context and returns an integer.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
        .func                = bpf_get_socket_cookie_sock_addr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Cookie of the socket that is itself the program context. */
BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
{
        u64 cookie = __sock_gen_cookie(ctx);

        return cookie;
}

/* Helper prototype for bpf_get_socket_cookie_sock(): takes only the program
 * context and returns an integer.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
        .func                = bpf_get_socket_cookie_sock,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Cookie of @sk via sock_gen_cookie(), or 0 when @sk is NULL. */
BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
{
        if (!sk)
                return 0;

        return sock_gen_cookie(sk);
}

/* Helper prototype for bpf_get_socket_ptr_cookie(): takes a BTF sock_common
 * pointer that may be NULL and returns an integer.
 */
const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
        .func                = bpf_get_socket_ptr_cookie,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
};

/* Cookie of the socket referenced by a sock_ops context. */
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
        struct sock *sk = ctx->sk;

        return __sock_gen_cookie(sk);
}

/* Helper prototype for bpf_get_socket_cookie_sock_ops(): takes only the
 * program context and returns an integer.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
        .func                = bpf_get_socket_cookie_sock_ops,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

static u64 __bpf_get_netns_cookie(struct sock *sk)
{
        const struct net *net = sk ? sock_net(sk) : &init_net;

        return net->net_cookie;
}

/* Netns cookie for the socket attached to @skb. A NULL @skb or an skb
 * without a socket falls back to the NULL-socket case of
 * __bpf_get_netns_cookie().
 */
BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb)
{
        struct sock *owner = skb ? skb->sk : NULL;

        return __bpf_get_netns_cookie(owner);
}

/* Helper prototype for bpf_get_netns_cookie(): takes the program context,
 * which may be NULL, and returns an integer. gpl_only is not set here.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_proto = {
        .func           = bpf_get_netns_cookie,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
};

/* Netns cookie for a socket that is itself the (possibly NULL) context. */
BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
{
        struct sock *sk = ctx;

        return __bpf_get_netns_cookie(sk);
}

/* Helper prototype for bpf_get_netns_cookie_sock(): takes the program
 * context, which may be NULL, and returns an integer.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
        .func                = bpf_get_netns_cookie_sock,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* Netns cookie for the socket of a sock_addr context; a NULL context is
 * treated like a NULL socket.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Helper prototype for bpf_get_netns_cookie_sock_addr(): takes the program
 * context, which may be NULL, and returns an integer.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
        .func                = bpf_get_netns_cookie_sock_addr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* Netns cookie for the socket of a sock_ops context; a NULL context is
 * treated like a NULL socket.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Helper prototype for bpf_get_netns_cookie_sock_ops(): takes the program
 * context, which may be NULL, and returns an integer.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
        .func                = bpf_get_netns_cookie_sock_ops,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* Netns cookie for the socket of an sk_msg context; a NULL context is
 * treated like a NULL socket.
 */
BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Helper prototype for bpf_get_netns_cookie_sk_msg(): takes the program
 * context, which may be NULL, and returns an integer.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
        .func                = bpf_get_netns_cookie_sk_msg,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* UID owning the socket attached to @skb, expressed in the user namespace
 * of the socket's network namespace. Returns overflowuid when the skb has
 * no full socket.
 */
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
        struct sock *owner = sk_to_full_sk(skb->sk);
        kuid_t owner_uid;

        if (!owner)
                return overflowuid;
        if (!sk_fullsock(owner))
                return overflowuid;

        owner_uid = sock_net_uid(sock_net(owner), owner);

        return from_kuid_munged(sock_net(owner)->user_ns, owner_uid);
}

/* Helper prototype for bpf_get_socket_uid(): takes only the program context
 * and returns an integer.
 */
static const struct bpf_func_proto bpf_get_socket_uid_proto = {
        .func           = bpf_get_socket_uid,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Read or write sk->sk_bpf_cb_flags through a u32 stored at @optval.
 *
 * With @getopt set the current flags are copied out. Otherwise the value at
 * @optval is installed, unless it contains bits outside SK_BPF_CB_MASK, in
 * which case -EINVAL is returned. Returns 0 on success.
 */
static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
{
        u32 *slot = (u32 *)optval;
        u32 requested;

        if (getopt) {
                *slot = sk->sk_bpf_cb_flags;
                return 0;
        }

        requested = *slot;
        if (requested & ~SK_BPF_CB_MASK)
                return -EINVAL;

        sk->sk_bpf_cb_flags = requested;
        return 0;
}

/* Get or set one of the SOL_SOCKET options that BPF programs may touch.
 *
 * Options not listed in the switch are rejected with -EINVAL. All listed
 * options except SO_BINDTODEVICE must be exactly sizeof(int) long.
 * SK_BPF_CB_FLAGS is handled locally; SO_BINDTODEVICE cannot be read.
 * Everything else is forwarded to sk_getsockopt()/sk_setsockopt().
 */
static int sol_socket_sockopt(struct sock *sk, int optname,
                              char *optval, int *optlen,
                              bool getopt)
{
        switch (optname) {
        case SO_BINDTODEVICE:
                /* No fixed-size requirement is enforced for this option. */
                break;
        case SK_BPF_CB_FLAGS:
        case SO_TXREHASH:
        case SO_BINDTOIFINDEX:
        case SO_MAX_PACING_RATE:
        case SO_MARK:
        case SO_RCVLOWAT:
        case SO_REUSEPORT:
        case SO_PRIORITY:
        case SO_KEEPALIVE:
        case SO_RCVBUF:
        case SO_SNDBUF:
        case SO_REUSEADDR:
                if (*optlen != sizeof(int))
                        return -EINVAL;
                break;
        default:
                return -EINVAL;
        }

        if (optname == SK_BPF_CB_FLAGS)
                return sk_bpf_set_get_cb_flags(sk, optval, getopt);

        if (!getopt)
                return sk_setsockopt(sk, SOL_SOCKET, optname,
                                     KERNEL_SOCKPTR(optval), *optlen);

        if (optname == SO_BINDTODEVICE)
                return -EINVAL;

        return sk_getsockopt(sk, SOL_SOCKET, optname,
                             KERNEL_SOCKPTR(optval),
                             KERNEL_SOCKPTR(optlen));
}

/* Read one of the BPF-specific TCP options into @optval as an int.
 *
 * @optlen must equal sizeof(int). The RTO-min and delack-max values are
 * converted from jiffies to microseconds. Returns 0 on success and -EINVAL
 * for a wrong length or an unsupported option.
 */
static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
                                  char *optval, int optlen)
{
        int value;

        if (optlen != sizeof(int))
                return -EINVAL;

        switch (optname) {
        case TCP_BPF_SOCK_OPS_CB_FLAGS:
                value = tcp_sk(sk)->bpf_sock_ops_cb_flags;
                break;
        case TCP_BPF_RTO_MIN:
                value = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min);
                break;
        case TCP_BPF_DELACK_MAX:
                value = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max);
                break;
        default:
                return -EINVAL;
        }

        memcpy(optval, &value, optlen);
        return 0;
}

/* Apply one of the BPF-specific TCP options from the int stored at @optval.
 *
 * @optlen must equal sizeof(int). Each option validates its value before
 * touching the socket. Returns 0 on success and -EINVAL for a wrong length,
 * an unsupported option, or an out-of-range value.
 */
static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
                                  char *optval, int optlen)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned long ticks;
        int requested;

        if (optlen != sizeof(int))
                return -EINVAL;

        requested = *(int *)optval;

        /* Only some options are supported */
        switch (optname) {
        case TCP_BPF_IW:
                /* Refused once data beyond the SYN payload has been sent. */
                if (requested <= 0 || tp->data_segs_out > tp->syn_data)
                        return -EINVAL;
                tcp_snd_cwnd_set(tp, requested);
                break;
        case TCP_BPF_SNDCWND_CLAMP:
                if (requested <= 0)
                        return -EINVAL;
                tp->snd_cwnd_clamp = requested;
                tp->snd_ssthresh = requested;
                break;
        case TCP_BPF_DELACK_MAX:
                ticks = usecs_to_jiffies(requested);
                if (ticks < TCP_TIMEOUT_MIN || ticks > TCP_DELACK_MAX)
                        return -EINVAL;
                inet_csk(sk)->icsk_delack_max = ticks;
                break;
        case TCP_BPF_RTO_MIN:
                ticks = usecs_to_jiffies(requested);
                if (ticks < TCP_TIMEOUT_MIN || ticks > TCP_RTO_MIN)
                        return -EINVAL;
                inet_csk(sk)->icsk_rto_min = ticks;
                break;
        case TCP_BPF_SOCK_OPS_CB_FLAGS:
                if (requested & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
                        return -EINVAL;
                tp->bpf_sock_ops_cb_flags = requested;
                break;
        default:
                return -EINVAL;
        }

        return 0;
}

/* Get or set TCP_CONGESTION on behalf of a BPF program.
 *
 * @optval/@optlen: congestion-control name buffer and its length; the
 *                  length must be at least 2.
 * @getopt:         true to read the current name, false to switch.
 *
 * Returns the result of do_tcp_getsockopt()/do_tcp_setsockopt(), or
 * -EINVAL for a too-short buffer or (on get) a socket without ca_ops,
 * -ENOTSUPP when "cdg" is requested, and -EBUSY when a switch triggered
 * from BPF is already in progress on this socket.
 */
static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
                                      int *optlen, bool getopt)
{
        struct tcp_sock *tp;
        int ret;

        if (*optlen < 2)
                return -EINVAL;

        if (getopt) {
                if (!inet_csk(sk)->icsk_ca_ops)
                        return -EINVAL;
                /* BPF expects a NUL-terminated tcp-cc string: reserve the
                 * last byte for the terminator and shrink the length that
                 * is passed on accordingly.
                 */
                optval[--(*optlen)] = '\0';
                return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
                                         KERNEL_SOCKPTR(optval),
                                         KERNEL_SOCKPTR(optlen));
        }

        /* "cdg" is the only cc that allocates a pointer in the
         * inet_csk_ca area.  A bpf-tcp-cc may overwrite this pointer
         * after switching to cdg, so refuse the switch.
         */
        if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
                return -ENOTSUPP;

        /* This stops the following loop:
         *
         * .init => bpf_setsockopt(tcp_cc) => .init =>
         * bpf_setsockopt(tcp_cc) => .init => ....
         *
         * The second bpf_setsockopt(tcp_cc) is not allowed, in order to
         * break the loop when both .init are the same bpf prog.
         *
         * This applies even if the second bpf_setsockopt(tcp_cc) would
         * not cause a loop.  As a result only the first '.init' can call
         * bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (e.g. when
         * the peer does not support ECN); the second '.init' cannot fall
         * back to yet another one.
         */
        tp = tcp_sk(sk);
        if (tp->bpf_chg_cc_inprogress)
                return -EBUSY;

        /* Mark the switch as in progress for the duration of the call. */
        tp->bpf_chg_cc_inprogress = 1;
        ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
                                KERNEL_SOCKPTR(optval), *optlen);
        tp->bpf_chg_cc_inprogress = 0;
        return ret;
}

/* Get or set a SOL_TCP option on behalf of a BPF program.
 *
 * Only TCP sockets are accepted. Integer options listed below must be
 * exactly sizeof(int) long and are forwarded to do_tcp_{get,set}sockopt().
 * TCP_CONGESTION has its own handler. Reading TCP_SAVED_SYN copies from
 * tp->saved_syn directly. Any other option is passed to the BPF-specific
 * get/set handlers.
 */
static int sol_tcp_sockopt(struct sock *sk, int optname,
                           char *optval, int *optlen,
                           bool getopt)
{
        struct tcp_sock *tp;

        if (sk->sk_protocol != IPPROTO_TCP)
                return -EINVAL;

        switch (optname) {
        case TCP_CONGESTION:
                return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
        case TCP_SAVED_SYN:
                if (*optlen < 1)
                        return -EINVAL;
                break;
        case TCP_RTO_MAX_MS:
        case TCP_SAVE_SYN:
        case TCP_NOTSENT_LOWAT:
        case TCP_USER_TIMEOUT:
        case TCP_THIN_LINEAR_TIMEOUTS:
        case TCP_WINDOW_CLAMP:
        case TCP_SYNCNT:
        case TCP_KEEPCNT:
        case TCP_KEEPINTVL:
        case TCP_KEEPIDLE:
        case TCP_MAXSEG:
        case TCP_NODELAY:
                if (*optlen != sizeof(int))
                        return -EINVAL;
                break;
        default:
                if (getopt)
                        return bpf_sol_tcp_getsockopt(sk, optname, optval,
                                                      *optlen);
                return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
        }

        if (!getopt)
                return do_tcp_setsockopt(sk, SOL_TCP, optname,
                                         KERNEL_SOCKPTR(optval), *optlen);

        if (optname != TCP_SAVED_SYN)
                return do_tcp_getsockopt(sk, SOL_TCP, optname,
                                         KERNEL_SOCKPTR(optval),
                                         KERNEL_SOCKPTR(optlen));

        tp = tcp_sk(sk);
        if (!tp->saved_syn || *optlen > tcp_saved_syn_len(tp->saved_syn))
                return -EINVAL;

        /* It cannot free tp->saved_syn here because it does not know if
         * the user space still needs it.
         */
        memcpy(optval, tp->saved_syn->data, *optlen);
        return 0;
}

/* Get or set a SOL_IP option on behalf of a BPF program.
 *
 * Only AF_INET sockets and the IP_TOS option (as an int) are accepted;
 * anything else returns -EINVAL. The request is forwarded to
 * do_ip_getsockopt()/do_ip_setsockopt().
 */
static int sol_ip_sockopt(struct sock *sk, int optname,
                          char *optval, int *optlen,
                          bool getopt)
{
        if (sk->sk_family != AF_INET)
                return -EINVAL;

        if (optname != IP_TOS)
                return -EINVAL;
        if (*optlen != sizeof(int))
                return -EINVAL;

        if (!getopt)
                return do_ip_setsockopt(sk, SOL_IP, optname,
                                        KERNEL_SOCKPTR(optval), *optlen);

        return do_ip_getsockopt(sk, SOL_IP, optname,
                                KERNEL_SOCKPTR(optval),
                                KERNEL_SOCKPTR(optlen));
}

/* Get or set a SOL_IPV6 option on behalf of a BPF program.
 *
 * Only AF_INET6 sockets and the IPV6_TCLASS / IPV6_AUTOFLOWLABEL options
 * (as an int) are accepted; anything else returns -EINVAL. The request is
 * forwarded through ipv6_bpf_stub.
 */
static int sol_ipv6_sockopt(struct sock *sk, int optname,
                            char *optval, int *optlen,
                            bool getopt)
{
        if (sk->sk_family != AF_INET6)
                return -EINVAL;

        if (optname != IPV6_TCLASS && optname != IPV6_AUTOFLOWLABEL)
                return -EINVAL;
        if (*optlen != sizeof(int))
                return -EINVAL;

        if (!getopt)
                return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
                                                      KERNEL_SOCKPTR(optval),
                                                      *optlen);

        return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
                                              KERNEL_SOCKPTR(optval),
                                              KERNEL_SOCKPTR(optlen));
}

/* Dispatch a BPF setsockopt request to the handler for @level.
 *
 * Returns -EINVAL when @sk is not a full socket or when @level is not
 * supported (including levels compiled out by the kernel configuration);
 * otherwise returns the handler's result.
 */
static int __bpf_setsockopt(struct sock *sk, int level, int optname,
                            char *optval, int optlen)
{
        if (!sk_fullsock(sk))
                return -EINVAL;

        if (level == SOL_SOCKET)
                return sol_socket_sockopt(sk, optname, optval, &optlen, false);

        if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
                return sol_ip_sockopt(sk, optname, optval, &optlen, false);

        if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
                return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);

        if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
                return sol_tcp_sockopt(sk, optname, optval, &optlen, false);

        return -EINVAL;
}

/* True when the sock_ops op code is not greater than
 * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
 */
static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
{
        return !(bpf_sock->op > BPF_SOCK_OPS_WRITE_HDR_OPT_CB);
}

static int _bpf_setsockopt(struct sock *sk, int level, int optname,
                           char *optval, int optlen)
{
        if (sk_fullsock(sk))
                sock_owned_by_me(sk);
        return __bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Dispatch a BPF getsockopt request to the handler for @level.
 *
 * The output buffer is never left partially uninitialized: on error all
 * @optlen bytes of @optval are zeroed, and on success any bytes beyond the
 * length reported back by the handler are zeroed.
 *
 * Returns -EINVAL when @sk is not a full socket or @level is unsupported;
 * otherwise returns the handler's result.
 */
static int __bpf_getsockopt(struct sock *sk, int level, int optname,
                            char *optval, int optlen)
{
        const int buf_len = optlen;
        int ret = -EINVAL;

        if (sk_fullsock(sk)) {
                if (level == SOL_SOCKET)
                        ret = sol_socket_sockopt(sk, optname, optval,
                                                 &optlen, true);
                else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
                        ret = sol_tcp_sockopt(sk, optname, optval,
                                              &optlen, true);
                else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
                        ret = sol_ip_sockopt(sk, optname, optval,
                                             &optlen, true);
                else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
                        ret = sol_ipv6_sockopt(sk, optname, optval,
                                               &optlen, true);
        }

        /* A failed lookup produced nothing usable: clear everything. */
        if (ret)
                optlen = 0;
        if (optlen < buf_len)
                memset(optval + optlen, 0, buf_len - optlen);

        return ret;
}

static int _bpf_getsockopt(struct sock *sk, int level, int optname,
                           char *optval, int optlen)
{
        if (sk_fullsock(sk))
                sock_owned_by_me(sk);
        return __bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* setsockopt helper taking a socket pointer; goes through the
 * ownership-checking _bpf_setsockopt().
 */
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        int ret = _bpf_setsockopt(sk, level, optname, optval, optlen);

        return ret;
}

/* Helper prototype for bpf_sk_setsockopt(): a BTF sock_common pointer,
 * unconstrained level and option name, and a read-only value buffer with
 * its constant size. Returns an integer.
 */
const struct bpf_func_proto bpf_sk_setsockopt_proto = {
        .func                = bpf_sk_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* getsockopt helper taking a socket pointer; goes through the
 * ownership-checking _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        int ret = _bpf_getsockopt(sk, level, optname, optval, optlen);

        return ret;
}

/* Helper prototype for bpf_sk_getsockopt(): a BTF sock_common pointer,
 * unconstrained level and option name, and an uninitialized output buffer
 * with its constant size. Returns an integer.
 */
const struct bpf_func_proto bpf_sk_getsockopt_proto = {
        .func                = bpf_sk_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* setsockopt helper that calls __bpf_setsockopt() directly, skipping the
 * sock_owned_by_me() check done by _bpf_setsockopt().
 */
BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        int ret = __bpf_setsockopt(sk, level, optname, optval, optlen);

        return ret;
}

/* Helper prototype for bpf_unlocked_sk_setsockopt(): a BTF sock_common
 * pointer, unconstrained level and option name, and a read-only value
 * buffer with its constant size. Returns an integer.
 */
const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
        .func                = bpf_unlocked_sk_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* getsockopt helper that calls __bpf_getsockopt() directly, skipping the
 * sock_owned_by_me() check done by _bpf_getsockopt().
 */
BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        int ret = __bpf_getsockopt(sk, level, optname, optval, optlen);

        return ret;
}

/* Helper prototype for bpf_unlocked_sk_getsockopt(): a BTF sock_common
 * pointer, unconstrained level and option name, and an uninitialized output
 * buffer with its constant size. Returns an integer.
 */
const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
        .func                = bpf_unlocked_sk_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* setsockopt helper for sock_addr programs: operates on the context's
 * socket through _bpf_setsockopt().
 */
BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
{
        struct sock *sk = ctx->sk;

        return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Helper prototype for bpf_sock_addr_setsockopt(): program context,
 * unconstrained level and option name, and a read-only value buffer with
 * its constant size. Returns an integer.
 */
static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
        .func                = bpf_sock_addr_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* getsockopt helper for sock_addr programs: operates on the context's
 * socket through _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
{
        struct sock *sk = ctx->sk;

        return _bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* Helper prototype for bpf_sock_addr_getsockopt(): program context, level,
 * optname, then an uninitialized output buffer followed by its size.
 */
static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
        .func      = bpf_sock_addr_getsockopt,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type = ARG_CONST_SIZE,
};

/* setsockopt helper for sock_ops programs.
 *
 * Returns -EOPNOTSUPP unless is_locked_tcp_sock_ops() accepts the context;
 * otherwise returns the result of _bpf_setsockopt() on bpf_sock->sk.
 */
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
           int, level, int, optname, char *, optval, int, optlen)
{
        if (is_locked_tcp_sock_ops(bpf_sock))
                return _bpf_setsockopt(bpf_sock->sk, level, optname, optval,
                                       optlen);

        return -EOPNOTSUPP;
}

/* Helper prototype for bpf_sock_ops_setsockopt(): program context, level,
 * optname, then a read-only option buffer followed by its size.
 */
static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
        .func      = bpf_sock_ops_setsockopt,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type = ARG_CONST_SIZE,
};

/* Locate the SYN headers selected by @optname for a sock_ops program.
 *
 * bpf_sock->syn_skb is used when it is set.  Otherwise the headers come from
 * the saved_syn of the request sock (sk_state == TCP_NEW_SYN_RECV) or of the
 * tcp sock.
 *
 * On success *@start points at the first requested header byte and the
 * number of header bytes from there is returned.  Returns -ENOENT when there
 * is no saved SYN, or when TCP_BPF_SYN_MAC is requested and the saved SYN
 * has no MAC header.  *@start is not written on error.
 */
static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
                                int optname, const u8 **start)
{
        struct sk_buff *skb = bpf_sock->syn_skb;
        const u8 *first;
        int len;

        if (skb) {
                /* sk is a request_sock here */
                switch (optname) {
                case TCP_BPF_SYN:
                        first = skb->data;
                        len = tcp_hdrlen(skb);
                        break;
                case TCP_BPF_SYN_IP:
                        first = skb_network_header(skb);
                        len = skb_network_header_len(skb) + tcp_hdrlen(skb);
                        break;
                default:
                        /* optname == TCP_BPF_SYN_MAC */
                        first = skb_mac_header(skb);
                        len = skb_mac_header_len(skb) +
                              skb_network_header_len(skb) +
                              tcp_hdrlen(skb);
                        break;
                }
        } else {
                struct sock *sk = bpf_sock->sk;
                struct saved_syn *saved;

                /* On a synack retransmit bpf_sock->syn_skb is not
                 * available, so fall back to the request sock's saved_syn
                 * (if one was saved).
                 */
                if (sk->sk_state == TCP_NEW_SYN_RECV)
                        saved = inet_reqsk(sk)->saved_syn;
                else
                        saved = tcp_sk(sk)->saved_syn;

                if (!saved)
                        return -ENOENT;

                switch (optname) {
                case TCP_BPF_SYN:
                        first = saved->data + saved->mac_hdrlen +
                                saved->network_hdrlen;
                        len = saved->tcp_hdrlen;
                        break;
                case TCP_BPF_SYN_IP:
                        first = saved->data + saved->mac_hdrlen;
                        len = saved->network_hdrlen + saved->tcp_hdrlen;
                        break;
                default:
                        /* optname == TCP_BPF_SYN_MAC */

                        /* TCP_SAVE_SYN may not have saved the mac hdr */
                        if (!saved->mac_hdrlen)
                                return -ENOENT;

                        first = saved->data;
                        len = saved->mac_hdrlen + saved->network_hdrlen +
                              saved->tcp_hdrlen;
                        break;
                }
        }

        *start = first;
        return len;
}

/* getsockopt helper for sock_ops programs.
 *
 * Returns -EOPNOTSUPP unless is_locked_tcp_sock_ops() accepts the context.
 * SOL_TCP options in the range TCP_BPF_SYN..TCP_BPF_SYN_MAC are served from
 * bpf_sock_ops_get_syn(): up to @optlen header bytes are copied into
 * @optval and the rest of the buffer is zeroed.  The return value is then
 * the header length, -ENOSPC if the headers were truncated, or the error
 * from bpf_sock_ops_get_syn().  Every other option is forwarded to
 * _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
           int, level, int, optname, char *, optval, int, optlen)
{
        const u8 *hdr;
        int copied = 0;
        int ret;

        if (!is_locked_tcp_sock_ops(bpf_sock))
                return -EOPNOTSUPP;

        /* Anything outside the TCP_BPF_SYN* family takes the generic path */
        if (!IS_ENABLED(CONFIG_INET) || level != SOL_TCP ||
            optname < TCP_BPF_SYN || optname > TCP_BPF_SYN_MAC)
                return _bpf_getsockopt(bpf_sock->sk, level, optname, optval,
                                       optlen);

        ret = bpf_sock_ops_get_syn(bpf_sock, optname, &hdr);
        if (ret > 0) {
                copied = ret;
                if (optlen < copied) {
                        /* Buffer too small: copy what fits, report it */
                        copied = optlen;
                        ret = -ENOSPC;
                }

                memcpy(optval, hdr, copied);
        }

        /* Zero out unused buffer at the end */
        memset(optval + copied, 0, optlen - copied);

        return ret;
}

/* Helper prototype for bpf_sock_ops_getsockopt(): program context, level,
 * optname, then an uninitialized output buffer followed by its size.
 */
static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
        .func      = bpf_sock_ops_getsockopt,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type = ARG_CONST_SIZE,
};

/* Store the sock_ops callback flags on the TCP socket of a sock_ops program.
 *
 * Only the bits of @argval inside BPF_SOCK_OPS_ALL_CB_FLAGS are stored.
 * Returns -EOPNOTSUPP if is_locked_tcp_sock_ops() rejects the context,
 * -EINVAL without CONFIG_INET or for a non-full socket, and otherwise the
 * bits of @argval that fall outside BPF_SOCK_OPS_ALL_CB_FLAGS.
 */
BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
           int, argval)
{
        struct sock *sk;

        if (!is_locked_tcp_sock_ops(bpf_sock))
                return -EOPNOTSUPP;

        sk = bpf_sock->sk;
        if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
                return -EINVAL;

        tcp_sk(sk)->bpf_sock_ops_cb_flags = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;

        return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
}

/* Helper prototype for bpf_sock_ops_cb_flags_set(): program context plus a
 * scalar flag word.
 */
static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
        .func      = bpf_sock_ops_cb_flags_set,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Table of IPv6 entry points called indirectly by BPF helpers in this file
 * (inet6_bind, ipv6_dev_get_saddr).  Exported for GPL modules.
 * NOTE(review): presumably populated by the IPv6 code when it is loaded;
 * the assignment is not visible here — confirm.
 */
const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
EXPORT_SYMBOL_GPL(ipv6_bpf_stub);

/* bind() helper for sock_addr programs.
 *
 * Binds ctx->sk to @addr with BIND_FROM_BPF set; a zero port additionally
 * sets BIND_FORCE_ADDRESS_NO_PORT.  AF_INET goes through __inet_bind() and,
 * when IPv6 is enabled, AF_INET6 goes through ipv6_bpf_stub->inet6_bind().
 *
 * Returns -EINVAL when @addr_len is too short for the address family,
 * -EAFNOSUPPORT for any other family (or without CONFIG_INET), otherwise
 * the result of the bind call.
 */
BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
           int, addr_len)
{
#ifdef CONFIG_INET
        struct sock *sk = ctx->sk;
        u32 flags = BIND_FROM_BPF;

        /* sa_family must be readable before dispatching on it */
        if (addr_len < offsetofend(struct sockaddr, sa_family))
                return -EINVAL;

        switch (addr->sa_family) {
        case AF_INET:
                if (addr_len < sizeof(struct sockaddr_in))
                        return -EINVAL;
                if (((struct sockaddr_in *)addr)->sin_port == htons(0))
                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
                return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                if (addr_len < SIN6_LEN_RFC2133)
                        return -EINVAL;
                if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
                /* ipv6_bpf_stub cannot be NULL, since it's called from
                 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
                 */
                return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
        default:
                break;
        }
#endif /* CONFIG_INET */

        return -EAFNOSUPPORT;
}

/* Helper prototype for bpf_bind(): program context, then a read-only
 * address buffer followed by its size.
 */
static const struct bpf_func_proto bpf_bind_proto = {
        .func      = bpf_bind,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE,
};

#ifdef CONFIG_XFRM

#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))

/* Per-CPU metadata_dst pointer, exported for GPL modules.  Only built when
 * XFRM_INTERFACE and the matching BTF option are enabled (see the
 * surrounding #if).  NOTE(review): its users are outside this part of the
 * file — presumably the xfrm interface BPF code; confirm.
 */
struct metadata_dst __percpu *xfrm_bpf_md_dst;
EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);

#endif

/* Copy selected fields of the @index-th xfrm state in @skb's sec_path
 * into @to.
 *
 * Fills reqid, spi, family and the remote address (taken from
 * props.saddr); ext is cleared.  For non-IPv6 states the three trailing
 * words of remote_ipv6 are zeroed.
 *
 * Returns 0 on success.  When the skb has no sec_path, @index is out of
 * range, @flags is non-zero or @size is not sizeof(struct bpf_xfrm_state),
 * @size bytes of @to are zeroed and -EINVAL is returned.
 */
BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
           struct bpf_xfrm_state *, to, u32, size, u64, flags)
{
        const struct sec_path *sp = skb_sec_path(skb);
        const struct xfrm_state *state;

        if (!sp)
                goto err_clear;
        if (unlikely(index >= sp->len || flags))
                goto err_clear;
        if (unlikely(size != sizeof(struct bpf_xfrm_state)))
                goto err_clear;

        state = sp->xvec[index];

        to->reqid = state->props.reqid;
        to->spi = state->id.spi;
        to->family = state->props.family;
        to->ext = 0;

        if (to->family != AF_INET6) {
                to->remote_ipv4 = state->props.saddr.a4;
                /* remote_ipv4 overlays the first word; clear the rest */
                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
        } else {
                memcpy(to->remote_ipv6, state->props.saddr.a6,
                       sizeof(to->remote_ipv6));
        }

        return 0;

err_clear:
        memset(to, 0, size);
        return -EINVAL;
}

/* Helper prototype for bpf_skb_get_xfrm_state(): program context, state
 * index, an uninitialized output buffer with its size, and a flags scalar.
 */
static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
        .func      = bpf_skb_get_xfrm_state,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type = ARG_CONST_SIZE,
        .arg5_type = ARG_ANYTHING,
};
#endif

#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
/* Final step of a successful FIB lookup: clear the VLAN output fields of
 * @params and, when @mtu is non-zero, report it through mtu_result.
 * Always returns 0.
 */
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
{
        params->h_vlan_proto = 0;
        params->h_vlan_TCI = 0;

        /* mtu_result is a union with tot_len; leave it alone when no MTU
         * was computed
         */
        if (mtu != 0)
                params->mtu_result = mtu;

        return 0;
}
#endif

#if IS_ENABLED(CONFIG_INET)
/* IPv4 FIB lookup used by the bpf_xdp_fib_lookup/bpf_skb_fib_lookup helpers.
 *
 * @net:       network namespace used for the device and route lookups
 * @params:    in/out lookup descriptor.  Read to build the flow key; on
 *             success updated with the egress ifindex, route metric,
 *             next-hop address and, unless the neighbour lookup is skipped,
 *             the source and destination MAC addresses
 * @flags:     BPF_FIB_LOOKUP_* flags
 * @check_mtu: when true, params->tot_len is checked against the route MTU
 *
 * Returns -ENODEV when params->ifindex does not resolve to a device, a
 * BPF_FIB_LKUP_RET_* code when the packet cannot be forwarded, or the
 * result of bpf_fib_set_fwd_params() on success.
 */
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
                               u32 flags, bool check_mtu)
{
        struct fib_nh_common *nhc;
        struct in_device *in_dev;
        struct neighbour *neigh;
        struct net_device *dev;
        struct fib_result res;
        struct flowi4 fl4;
        u32 mtu = 0;
        int err;

        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        /* verify forwarding is enabled on this interface */
        in_dev = __in_dev_get_rcu(dev);
        if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
                return BPF_FIB_LKUP_RET_FWD_DISABLED;

        /* With BPF_FIB_LOOKUP_OUTPUT params->ifindex is used as the output
         * interface; otherwise it is the input interface.
         */
        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
                fl4.flowi4_iif = 1;
                fl4.flowi4_oif = params->ifindex;
        } else {
                fl4.flowi4_iif = params->ifindex;
                fl4.flowi4_oif = 0;
        }
        fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos);
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_flags = 0;

        fl4.flowi4_proto = params->l4_protocol;
        fl4.daddr = params->ipv4_dst;
        fl4.saddr = params->ipv4_src;
        fl4.fl4_sport = params->sport;
        fl4.fl4_dport = params->dport;
        fl4.flowi4_multipath_hash = 0;

        if (flags & BPF_FIB_LOOKUP_DIRECT) {
                /* Direct lookup in one table: the l3mdev table of the
                 * device if it has one, else the main table, unless the
                 * caller supplied a table id via BPF_FIB_LOOKUP_TBID.
                 */
                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
                struct fib_table *tb;

                if (flags & BPF_FIB_LOOKUP_TBID) {
                        tbid = params->tbid;
                        /* zero out for vlan output */
                        params->tbid = 0;
                }

                tb = fib_get_table(net, tbid);
                if (unlikely(!tb))
                        return BPF_FIB_LKUP_RET_NOT_FWDED;

                err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
        } else {
                /* Full lookup through fib_lookup(); these flow fields are
                 * only initialised on this path.
                 */
                if (flags & BPF_FIB_LOOKUP_MARK)
                        fl4.flowi4_mark = params->mark;
                else
                        fl4.flowi4_mark = 0;
                fl4.flowi4_secid = 0;
                fl4.flowi4_tun_key.tun_id = 0;
                fl4.flowi4_uid = sock_net_uid(net, NULL);

                err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
        }

        if (err) {
                /* map fib lookup errors to RTN_ type */
                if (err == -EINVAL)
                        return BPF_FIB_LKUP_RET_BLACKHOLE;
                if (err == -EHOSTUNREACH)
                        return BPF_FIB_LKUP_RET_UNREACHABLE;
                if (err == -EACCES)
                        return BPF_FIB_LKUP_RET_PROHIBIT;

                return BPF_FIB_LKUP_RET_NOT_FWDED;
        }

        if (res.type != RTN_UNICAST)
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        if (fib_info_num_path(res.fi) > 1)
                fib_select_path(net, &res, &fl4, NULL);

        if (check_mtu) {
                mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
                if (params->tot_len > mtu) {
                        params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
                }
        }

        nhc = res.nhc;

        /* do not handle lwt encaps right now */
        if (nhc->nhc_lwtstate)
                return BPF_FIB_LKUP_RET_UNSUPP_LWT;

        /* From here on dev is the next hop's device, not the input device */
        dev = nhc->nhc_dev;

        params->rt_metric = res.fi->fib_priority;
        params->ifindex = dev->ifindex;

        if (flags & BPF_FIB_LOOKUP_SRC)
                params->ipv4_src = fib_result_prefsrc(net, &res);

        /* xdp and cls_bpf programs are run in RCU-bh so
         * rcu_read_lock_bh is not needed here
         */
        if (likely(nhc->nhc_gw_family != AF_INET6)) {
                if (nhc->nhc_gw_family)
                        params->ipv4_dst = nhc->nhc_gw.ipv4;
        } else {
                /* IPv6 gateway for an IPv4 route: report the gateway in
                 * ipv6_dst and switch params->family to AF_INET6.
                 */
                struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;

                params->family = AF_INET6;
                *dst = nhc->nhc_gw.ipv6;
        }

        if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
                goto set_fwd_params;

        if (likely(nhc->nhc_gw_family != AF_INET6))
                neigh = __ipv4_neigh_lookup_noref(dev,
                                                  (__force u32)params->ipv4_dst);
        else
                neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);

        /* A missing neighbour, or one not in a NUD_VALID state, is
         * reported to the caller instead of filling in the MAC addresses.
         */
        if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;
        memcpy(params->dmac, neigh->ha, ETH_ALEN);
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
        return bpf_fib_set_fwd_params(params, mtu);
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 FIB lookup used by the bpf_xdp_fib_lookup/bpf_skb_fib_lookup helpers.
 *
 * @net:       network namespace used for the device and route lookups
 * @params:    in/out lookup descriptor.  Read to build the flow key; on
 *             success updated with the egress ifindex, route metric,
 *             gateway address (if any) and, unless the neighbour lookup is
 *             skipped, the source and destination MAC addresses
 * @flags:     BPF_FIB_LOOKUP_* flags
 * @check_mtu: when true, params->tot_len is checked against the route MTU
 *
 * Returns -ENODEV when params->ifindex does not resolve to a device, a
 * BPF_FIB_LKUP_RET_* code when the packet cannot be forwarded, or the
 * result of bpf_fib_set_fwd_params() on success.
 */
static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
                               u32 flags, bool check_mtu)
{
        /* src and dst alias the address arrays inside params, so writes
         * through them update params in place.
         */
        struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
        struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
        struct fib6_result res = {};
        struct neighbour *neigh;
        struct net_device *dev;
        struct inet6_dev *idev;
        struct flowi6 fl6;
        int strict = 0;
        int oif, err;
        u32 mtu = 0;

        /* link local addresses are never forwarded */
        if (rt6_need_strict(dst) || rt6_need_strict(src))
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        /* Forwarding must be enabled on the device */
        idev = __in6_dev_get_safely(dev);
        if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
                return BPF_FIB_LKUP_RET_FWD_DISABLED;

        /* With BPF_FIB_LOOKUP_OUTPUT params->ifindex is used as the output
         * interface; otherwise it is the input interface and the lookup is
         * done with RT6_LOOKUP_F_HAS_SADDR.
         */
        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
                fl6.flowi6_iif = 1;
                oif = fl6.flowi6_oif = params->ifindex;
        } else {
                oif = fl6.flowi6_iif = params->ifindex;
                fl6.flowi6_oif = 0;
                strict = RT6_LOOKUP_F_HAS_SADDR;
        }
        fl6.flowlabel = params->flowinfo;
        fl6.flowi6_scope = 0;
        fl6.flowi6_flags = 0;
        fl6.mp_hash = 0;

        fl6.flowi6_proto = params->l4_protocol;
        fl6.daddr = *dst;
        fl6.saddr = *src;
        fl6.fl6_sport = params->sport;
        fl6.fl6_dport = params->dport;

        if (flags & BPF_FIB_LOOKUP_DIRECT) {
                /* Direct lookup in one table: the l3mdev table of the
                 * device if it has one, else the main table, unless the
                 * caller supplied a table id via BPF_FIB_LOOKUP_TBID.
                 */
                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
                struct fib6_table *tb;

                if (flags & BPF_FIB_LOOKUP_TBID) {
                        tbid = params->tbid;
                        /* zero out for vlan output */
                        params->tbid = 0;
                }

                tb = ipv6_stub->fib6_get_table(net, tbid);
                if (unlikely(!tb))
                        return BPF_FIB_LKUP_RET_NOT_FWDED;

                err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
                                                   strict);
        } else {
                /* Full lookup through fib6_lookup(); these flow fields are
                 * only initialised on this path.
                 */
                if (flags & BPF_FIB_LOOKUP_MARK)
                        fl6.flowi6_mark = params->mark;
                else
                        fl6.flowi6_mark = 0;
                fl6.flowi6_secid = 0;
                fl6.flowi6_tun_key.tun_id = 0;
                fl6.flowi6_uid = sock_net_uid(net, NULL);

                err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
        }

        /* A lookup error, a missing or error-encoded route entry, or the
         * namespace's null entry all mean "not forwarded".
         */
        if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
                     res.f6i == net->ipv6.fib6_null_entry))
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        switch (res.fib6_type) {
        /* only unicast is forwarded */
        case RTN_UNICAST:
                break;
        case RTN_BLACKHOLE:
                return BPF_FIB_LKUP_RET_BLACKHOLE;
        case RTN_UNREACHABLE:
                return BPF_FIB_LKUP_RET_UNREACHABLE;
        case RTN_PROHIBIT:
                return BPF_FIB_LKUP_RET_PROHIBIT;
        default:
                return BPF_FIB_LKUP_RET_NOT_FWDED;
        }

        ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
                                    fl6.flowi6_oif != 0, NULL, strict);

        if (check_mtu) {
                mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
                if (params->tot_len > mtu) {
                        params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
                }
        }

        /* Next hops with lightweight tunnel state are not supported */
        if (res.nh->fib_nh_lws)
                return BPF_FIB_LKUP_RET_UNSUPP_LWT;

        /* Overwrite params->ipv6_dst with the gateway when there is one */
        if (res.nh->fib_nh_gw_family)
                *dst = res.nh->fib_nh_gw6;

        /* From here on dev is the next hop's device, not the input device */
        dev = res.nh->fib_nh_dev;
        params->rt_metric = res.f6i->fib6_metric;
        params->ifindex = dev->ifindex;

        if (flags & BPF_FIB_LOOKUP_SRC) {
                /* Prefer the route's own prefsrc; otherwise ask for a
                 * source address on the egress device for the original
                 * destination (fl6.daddr, not the possibly rewritten *dst).
                 */
                if (res.f6i->fib6_prefsrc.plen) {
                        *src = res.f6i->fib6_prefsrc.addr;
                } else {
                        err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
                                                                &fl6.daddr, 0,
                                                                src);
                        if (err)
                                return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
                }
        }

        if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
                goto set_fwd_params;

        /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
         * not needed here.
         */
        neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
        if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;
        memcpy(params->dmac, neigh->ha, ETH_ALEN);
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
        return bpf_fib_set_fwd_params(params, mtu);
}
#endif

/* All flag bits accepted by the fib lookup helpers; bpf_xdp_fib_lookup()
 * and bpf_skb_fib_lookup() return -EINVAL for any bit outside this mask.
 */
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
                             BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
                             BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)

/* FIB lookup helper for XDP programs.
 *
 * Dispatches on params->family to the IPv4 or IPv6 lookup, using the
 * network namespace of the receive queue's device and always requesting
 * the MTU check.
 *
 * Returns -EINVAL when @plen is smaller than *@params or @flags has bits
 * outside BPF_FIB_LOOKUP_MASK, -EAFNOSUPPORT for an unhandled family,
 * otherwise the result of the per-family lookup.
 */
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
        if (plen < sizeof(*params) || (flags & ~BPF_FIB_LOOKUP_MASK))
                return -EINVAL;

        switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
        case AF_INET:
                return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
                                           flags, true);
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
                                           flags, true);
#endif
        default:
                break;
        }

        return -EAFNOSUPPORT;
}

/* Helper prototype for bpf_xdp_fib_lookup() (GPL-only): program context, a
 * memory buffer holding the lookup parameters with its size, and a flags
 * scalar.
 */
static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
        .func      = bpf_xdp_fib_lookup,
        .gpl_only  = true,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM,
        .arg3_type = ARG_CONST_SIZE,
        .arg4_type = ARG_ANYTHING,
};

/* FIB lookup helper for skb-based programs.
 *
 * Dispatches on params->family to the IPv4 or IPv6 lookup in the network
 * namespace of skb->dev.  A non-zero params->tot_len asks the lookup to
 * check that length against the route MTU.  With tot_len == 0 and a
 * successful lookup, the skb is instead checked against the resulting
 * device with is_skb_forwardable() and that device's MTU is reported in
 * params->mtu_result.
 *
 * Returns -EINVAL when @plen is smaller than *@params or @flags has bits
 * outside BPF_FIB_LOOKUP_MASK, -EAFNOSUPPORT for an unhandled family,
 * otherwise a BPF_FIB_LKUP_RET_* code or the per-family lookup's error.
 */
BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
        struct net *net = dev_net(skb->dev);
        struct net_device *out_dev;
        bool check_mtu;
        int rc;

        if (plen < sizeof(*params) || (flags & ~BPF_FIB_LOOKUP_MASK))
                return -EINVAL;

        check_mtu = params->tot_len != 0;

        switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
        case AF_INET:
                rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
                break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
                break;
#endif
        default:
                rc = -EAFNOSUPPORT;
                break;
        }

        if (rc != BPF_FIB_LKUP_RET_SUCCESS || check_mtu)
                return rc;

        /* When tot_len isn't provided by user, check skb
         * against MTU of FIB lookup resulting net_device
         */
        out_dev = dev_get_by_index_rcu(net, params->ifindex);
        if (!is_skb_forwardable(out_dev, skb))
                rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;

        params->mtu_result = out_dev->mtu; /* union with tot_len */

        return rc;
}

/* Helper prototype for bpf_skb_fib_lookup() (GPL-only): program context, a
 * memory buffer holding the lookup parameters with its size, and a flags
 * scalar.
 */
static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
        .func      = bpf_skb_fib_lookup,
        .gpl_only  = true,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM,
        .arg3_type = ARG_CONST_SIZE,
        .arg4_type = ARG_ANYTHING,
};

/* Resolve the device an MTU check should run against.
 *
 * @ifindex == 0 selects @dev_curr itself, which saves the index lookup for
 * non-redirect use-cases.  Any other value is looked up in @dev_curr's
 * network namespace with dev_get_by_index_rcu() and that lookup's result
 * is returned as-is.
 */
static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
                                            u32 ifindex)
{
        if (!ifindex)
                return dev_curr;

        return dev_get_by_index_rcu(dev_net(dev_curr), ifindex);
}

/* MTU check helper for skb-based programs.
 *
 * Compares a length against the MTU plus hard_header_len of the device
 * selected by @ifindex (0 means skb->dev).  The length is *@mtu_len plus
 * hard_header_len when *@mtu_len is non-zero, else skb->len; @len_diff is
 * added in either case.  On every path past the argument and device checks
 * *@mtu_len is overwritten with the device MTU.
 *
 * Returns -EINVAL for unknown flags, or when BPF_MTU_CHK_SEGS is combined
 * with a non-zero @len_diff or *@mtu_len; -ENODEV when the device lookup
 * fails; otherwise a BPF_MTU_CHK_RET_* code.
 */
BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
           u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
{
        struct net_device *dev;
        int mtu, dev_len, skb_len;
        int ret;

        if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
                return -EINVAL;
        if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
                return -EINVAL;

        dev = __dev_via_ifindex(skb->dev, ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        mtu = READ_ONCE(dev->mtu);
        dev_len = mtu + dev->hard_header_len;

        /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
        if (*mtu_len)
                skb_len = *mtu_len + dev->hard_header_len;
        else
                skb_len = skb->len;

        skb_len += len_diff; /* minus result pass check */

        if (skb_len <= dev_len) {
                ret = BPF_MTU_CHK_RET_SUCCESS;
        } else if (!skb_is_gso(skb)) {
                ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
        } else if (flags & BPF_MTU_CHK_SEGS &&
                   !skb_gso_validate_network_len(skb, mtu)) {
                ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
        } else {
                /* skb->len covers all GSO segments, so a GSO skb can exceed
                 * the MTU and still be fine: it can get re-segmented in the
                 * transmit path (see validate_xmit_skb).  Segments are only
                 * checked when the user asked for it with BPF_MTU_CHK_SEGS.
                 */
                ret = BPF_MTU_CHK_RET_SUCCESS;
        }

        *mtu_len = mtu;
        return ret;
}

/* MTU check helper for XDP programs.
 *
 * Compares a length against the MTU plus hard_header_len of the device
 * selected by @ifindex (0 means the receive queue's device).  The length is
 * *@mtu_len plus hard_header_len when *@mtu_len is non-zero, else
 * data_end - data; @len_diff is added in either case.  *@mtu_len is
 * overwritten with the device MTU.
 *
 * Returns -EINVAL when @flags is non-zero, -ENODEV when the device lookup
 * fails, otherwise BPF_MTU_CHK_RET_SUCCESS or BPF_MTU_CHK_RET_FRAG_NEEDED.
 */
BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
           u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
{
        struct net_device *dev;
        int mtu, dev_len, xdp_len;

        /* XDP variant doesn't support multi-buffer segment check (yet) */
        if (unlikely(flags))
                return -EINVAL;

        dev = __dev_via_ifindex(xdp->rxq->dev, ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        mtu = READ_ONCE(dev->mtu);
        dev_len = mtu + dev->hard_header_len;

        /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
        if (*mtu_len)
                xdp_len = *mtu_len + dev->hard_header_len;
        else
                xdp_len = xdp->data_end - xdp->data;

        xdp_len += len_diff; /* minus result pass check */

        *mtu_len = mtu;

        if (xdp_len > dev_len)
                return BPF_MTU_CHK_RET_FRAG_NEEDED;

        return BPF_MTU_CHK_RET_SUCCESS;
}

/* Helper prototype for bpf_skb_check_mtu() (GPL-only): program context,
 * ifindex, a writable aligned u32 for the in/out MTU length, then len_diff
 * and flags scalars.
 */
static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
        .func      = bpf_skb_check_mtu,
        .gpl_only  = true,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
        .arg3_size = sizeof(u32),
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

/* Helper prototype for bpf_xdp_check_mtu() (GPL-only): program context,
 * ifindex, a writable aligned u32 for the in/out MTU length, then len_diff
 * and flags scalars.
 */
static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
        .func      = bpf_xdp_check_mtu,
        .gpl_only  = true,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
        .arg3_size = sizeof(u32),
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* Push an SRv6 header onto @skb, inline or as an outer encapsulation.
 *
 * @hdr/@len describe the segment routing header, which must pass
 * seg6_validate_srh().  BPF_LWT_ENCAP_SEG6_INLINE requires an IPv6 skb.
 * The BPF data pointers are recomputed after the push attempt, whether or
 * not it succeeded.
 *
 * Returns -EINVAL for an invalid SRH or unknown @type, -EBADMSG for inline
 * mode on a non-IPv6 skb, the error from the seg6 push on failure,
 * otherwise the result of seg6_lookup_nexthop().
 */
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
        struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
        int err;

        if (!seg6_validate_srh(srh, len, false))
                return -EINVAL;

        if (type == BPF_LWT_ENCAP_SEG6_INLINE) {
                if (skb->protocol != htons(ETH_P_IPV6))
                        return -EBADMSG;

                err = seg6_do_srh_inline(skb, srh);
        } else if (type == BPF_LWT_ENCAP_SEG6) {
                skb_reset_inner_headers(skb);
                skb->encapsulation = 1;
                err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
        } else {
                return -EINVAL;
        }

        bpf_compute_data_pointers(skb);
        if (err)
                return err;

        skb_set_transport_header(skb, sizeof(struct ipv6hdr));

        return seg6_lookup_nexthop(skb, NULL, 0);
}
#endif /* CONFIG_IPV6_SEG6_BPF */

#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
/* Thin wrapper: push an IP encapsulation header via bpf_lwt_push_ip_encap()
 * and return its result.
 */
static int bpf_push_ip_encap(struct sk_buff *skb,
                             void *hdr,
                             u32 len,
                             bool ingress)
{
        return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
}
#endif

/* Encapsulation helper for the LWT input hook.
 *
 * Handles the SEG6 types when CONFIG_IPV6_SEG6_BPF is enabled and
 * BPF_LWT_ENCAP_IP (as ingress) when CONFIG_LWTUNNEL_BPF is enabled.
 * Returns -EINVAL for any other @type, otherwise the result of the
 * selected push routine.
 */
BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
           u32, len)
{
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        if (type == BPF_LWT_ENCAP_SEG6 || type == BPF_LWT_ENCAP_SEG6_INLINE)
                return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
        if (type == BPF_LWT_ENCAP_IP)
                return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
#endif
        return -EINVAL;
}

/* Encapsulation helper for the LWT xmit hook.
 *
 * Only BPF_LWT_ENCAP_IP (as egress) is handled, and only when
 * CONFIG_LWTUNNEL_BPF is enabled.  Returns -EINVAL for any other @type.
 */
BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
           void *, hdr, u32, len)
{
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
        if (type == BPF_LWT_ENCAP_IP)
                return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
#endif
        return -EINVAL;
}

/* Helper prototype for bpf_lwt_in_push_encap(): program context, encap
 * type, then a read-only header buffer followed by its size.
 */
static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
        .func      = bpf_lwt_in_push_encap,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type = ARG_CONST_SIZE,
};

/* Helper prototype for bpf_lwt_xmit_push_encap(): program context, encap
 * type, then a read-only header buffer followed by its size.
 */
static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
        .func      = bpf_lwt_xmit_push_encap,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type = ARG_CONST_SIZE,
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* Write @len bytes from @from into the SRH of @skb at @offset (relative to
 * skb->data), using this CPU's cached SRH state.
 *
 * Two write windows are accepted:
 *  - the span from the computed TLV start up to the end of the SRH, in
 *    which case the cached state is marked not valid;
 *  - the span from the SRH flags field up to (not including) the segments
 *    field.
 * Anything else is rejected with -EFAULT.  After the skb is made writable
 * the routing header is located again and the cached SRH pointer is
 * refreshed before the copy.
 *
 * Returns 0 on success, -EINVAL when no SRH is cached or the routing
 * header cannot be found again, -EFAULT for an out-of-window write or when
 * the skb cannot be made writable.
 */
BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
           const void *, from, u32, len)
{
        struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
        struct ipv6_sr_hdr *srh = state->srh;
        void *tlv_start, *tlv_end, *dst;
        int srhoff = 0;

        lockdep_assert_held(&state->bh_lock);
        if (!srh)
                return -EINVAL;

        tlv_start = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
        tlv_end = (void *)((char *)srh + sizeof(*srh) + state->hdrlen);
        dst = skb->data + offset;

        if (dst >= tlv_start && dst + len <= tlv_end) {
                /* TLV area touched: cached state is no longer valid */
                state->valid = false;
        } else if (dst < (void *)&srh->flags ||
                   dst + len > (void *)&srh->segments) {
                return -EFAULT;
        }

        if (unlikely(bpf_try_make_writable(skb, offset + len)))
                return -EFAULT;

        /* Look the header up again and refresh the cached pointer before
         * writing
         */
        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
                return -EINVAL;
        state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);

        memcpy(skb->data + offset, from, len);

        return 0;
}

/* Helper prototype for bpf_lwt_seg6_store_bytes(): program context, byte
 * offset, then a read-only source buffer followed by its size.
 */
static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
        .func      = bpf_lwt_seg6_store_bytes,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type = ARG_CONST_SIZE,
};

static void bpf_update_srh_state(struct sk_buff *skb)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        int srhoff = 0;

        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
                srh_state->srh = NULL;
        } else {
                srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
                srh_state->hdrlen = srh_state->srh->hdrlen << 3;
                srh_state->valid = true;
        }
}

/* bpf_lwt_seg6_action() - apply a seg6local action to @skb.
 * @skb:       packet being processed (program context)
 * @action:    one of the SEG6_LOCAL_ACTION_* values handled below
 * @param:     action parameter (an in6_addr for END.X, an int for END.T and
 *             END.DT6, the header passed to bpf_push_seg6_encap() for the
 *             END.B6 variants)
 * @param_len: size of @param in bytes
 *
 * Must be called with this CPU's srh_state->bh_lock held.
 *
 * Return: the result of the nexthop lookup or encapsulation; -EBADMSG when
 * the SRH check fails or the inner IPv6 header cannot be found or pulled;
 * -EINVAL for an unexpected @param_len or an unknown @action.
 */
BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
           u32, action, void *, param, u32, param_len)
{
        struct seg6_bpf_srh_state *state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        int inner_off = 0;
        int rc;

        lockdep_assert_held(&state->bh_lock);

        switch (action) {
        case SEG6_LOCAL_ACTION_END_X:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(struct in6_addr))
                        return -EINVAL;

                return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);

        case SEG6_LOCAL_ACTION_END_T:
        case SEG6_LOCAL_ACTION_END_DT6:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(int))
                        return -EINVAL;

                if (action == SEG6_LOCAL_ACTION_END_DT6) {
                        /* Strip everything in front of the inner IPv6
                         * header before doing the table lookup.
                         */
                        if (ipv6_find_hdr(skb, &inner_off, IPPROTO_IPV6,
                                          NULL, NULL) < 0)
                                return -EBADMSG;
                        if (!pskb_pull(skb, inner_off))
                                return -EBADMSG;

                        skb_postpull_rcsum(skb, skb_network_header(skb),
                                           inner_off);
                        skb_reset_network_header(skb);
                        skb_reset_transport_header(skb);
                        skb->encapsulation = 0;

                        bpf_compute_data_pointers(skb);
                        bpf_update_srh_state(skb);
                }

                return seg6_lookup_nexthop(skb, NULL, *(int *)param);

        case SEG6_LOCAL_ACTION_END_B6:
        case SEG6_LOCAL_ACTION_END_B6_ENCAP:
                /* The SRH is only checked when one is currently cached. */
                if (state->srh && !seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;

                if (action == SEG6_LOCAL_ACTION_END_B6)
                        rc = bpf_push_seg6_encap(skb,
                                                 BPF_LWT_ENCAP_SEG6_INLINE,
                                                 param, param_len);
                else
                        rc = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
                                                 param, param_len);
                if (rc)
                        return rc;

                bpf_update_srh_state(skb);
                return 0;

        default:
                return -EINVAL;
        }
}

/* Helper prototype table entry for bpf_lwt_seg6_action(skb, action, param,
 * param_len). Not GPL-only; typed as returning an integer. @skb is the
 * program context, @action is unconstrained, @param is a read-only memory
 * region and @param_len is typed ARG_CONST_SIZE.
 */
static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
        .func                = bpf_lwt_seg6_action,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE
};

/* bpf_lwt_seg6_adjust_srh() - grow or shrink the TLV area of the cached SRH.
 * @skb:    packet being processed (program context)
 * @offset: byte offset from skb->data where bytes are inserted or removed
 * @len:    number of bytes to add (positive) or remove (negative)
 *
 * @offset must fall inside the TLV area of the SRH cached in this CPU's
 * seg6_bpf_srh_states, and a removal must not run past the end of the
 * header. On success the IPv6 payload length is rewritten, the cached SRH
 * pointer and header length are updated, and the state is marked invalid.
 *
 * Must be called with srh_state->bh_lock held.
 *
 * Return: 0 on success; -EINVAL when no SRH is cached or the routing
 * header cannot be found afterwards; -EFAULT when the range is not
 * acceptable; otherwise the negative error from skb_cow_head() or the
 * push/pop helper.
 */
BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
           s32, len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        struct ipv6_sr_hdr *srh = srh_state->srh;
        void *srh_end, *srh_tlvs, *ptr;
        struct ipv6hdr *hdr;
        int srhoff = 0;
        int ret;

        lockdep_assert_held(&srh_state->bh_lock);
        if (unlikely(srh == NULL))
                return -EINVAL;

        /* TLVs begin after the fixed header and the (first_segment + 1)
         * 16-byte segment entries; the header ends hdrlen bytes after the
         * fixed part.
         */
        srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
                        ((srh->first_segment + 1) << 4));
        srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
                        srh_state->hdrlen);
        ptr = skb->data + offset;

        if (unlikely(ptr < srh_tlvs || ptr > srh_end))
                return -EFAULT;
        /* For a removal, ptr - len is the end of the removed span. */
        if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
                return -EFAULT;

        if (len > 0) {
                ret = skb_cow_head(skb, len);
                if (unlikely(ret < 0))
                        return ret;

                ret = bpf_skb_net_hdr_push(skb, offset, len);
        } else {
                /* len == 0 also takes this path, popping zero bytes. */
                ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
        }

        /* Data pointers are recomputed before the error check, so they are
         * refreshed even when the push/pop failed.
         */
        bpf_compute_data_pointers(skb);
        if (unlikely(ret < 0))
                return ret;

        /* The IPv6 header is taken to start at skb->data. */
        hdr = (struct ipv6hdr *)skb->data;
        hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));

        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
                return -EINVAL;
        srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
        srh_state->hdrlen += len;
        srh_state->valid = false;
        return 0;
}

/* Helper prototype table entry for bpf_lwt_seg6_adjust_srh(skb, offset,
 * len). Not GPL-only; typed as returning an integer. @skb is the program
 * context; @offset and @len are unconstrained values.
 */
static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
        .func                = bpf_lwt_seg6_adjust_srh,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};
#endif /* CONFIG_IPV6_SEG6_BPF */

#ifdef CONFIG_INET
/* sk_lookup() - find a TCP or UDP socket matching @tuple in @net.
 * @net:    network namespace to search
 * @tuple:  addresses and ports to match; the ipv4 or ipv6 member is read
 *          depending on @family
 * @dif:    interface index handed to the lookup functions
 * @sdif:   secondary interface index handed to the lookup functions
 * @family: AF_INET selects the IPv4 path; any other value takes the IPv6
 *          path when CONFIG_IPV6 is enabled
 * @proto:  IPPROTO_TCP selects the TCP lookup; any other value takes the
 *          UDP lookup
 *
 * Return: the socket found, or NULL when nothing matched or when the
 * result is neither reference-counted by the lookup nor flagged
 * SOCK_RCU_FREE (a one-time warning is emitted in that case).
 */
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
                              int dif, int sdif, u8 family, u8 proto)
{
        /* Only the TCP lookups are given &refcounted; it stays false for UDP. */
        bool refcounted = false;
        struct sock *sk = NULL;

        if (family == AF_INET) {
                __be32 src4 = tuple->ipv4.saddr;
                __be32 dst4 = tuple->ipv4.daddr;

                if (proto == IPPROTO_TCP)
                        sk = __inet_lookup(net, NULL, 0,
                                           src4, tuple->ipv4.sport,
                                           dst4, tuple->ipv4.dport,
                                           dif, sdif, &refcounted);
                else
                        sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
                                               dst4, tuple->ipv4.dport,
                                               dif, sdif, net->ipv4.udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
                struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;

                /* Unlike the other lookups here, __inet6_lookup() is given
                 * the destination port converted with ntohs().
                 */
                if (proto == IPPROTO_TCP)
                        sk = __inet6_lookup(net, NULL, 0,
                                            src6, tuple->ipv6.sport,
                                            dst6, ntohs(tuple->ipv6.dport),
                                            dif, sdif, &refcounted);
                /* UDPv6 goes through ipv6_bpf_stub; sk stays NULL when the
                 * stub is not set.
                 */
                else if (likely(ipv6_bpf_stub))
                        sk = ipv6_bpf_stub->udp6_lib_lookup(net,
                                                            src6, tuple->ipv6.sport,
                                                            dst6, tuple->ipv6.dport,
                                                            dif, sdif,
                                                            net->ipv4.udp_table, NULL);
#endif
        }

        /* A socket that is neither refcounted nor SOCK_RCU_FREE is not
         * handed back to the caller.
         */
        if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                sk = NULL;
        }
        return sk;
}

/* __bpf_skc_lookup() is the core lookup shared by the socket-lookup helpers.
 *
 * The address family is derived from @len (size of the ipv4 or ipv6 tuple
 * member); any other length, non-zero @flags, or a @netns_id that is neither
 * negative as an s32 nor <= S32_MAX yields NULL. A negative @sdif is replaced
 * by inet_sdif()/inet6_sdif() of @skb. A negative (s32) @netns_id searches
 * @caller_net; otherwise the namespace is resolved with get_net_ns_by_id()
 * and released again after the lookup.
 *
 * Whether the returned socket carries a reference is decided by sk_lookup().
 */
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
                 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
                 u64 flags, int sdif)
{
        struct net *target_net;
        struct sock *found;
        bool use_caller_net;
        u8 family;

        if (len == sizeof(tuple->ipv4))
                family = AF_INET;
        else if (len == sizeof(tuple->ipv6))
                family = AF_INET6;
        else
                return NULL;

        use_caller_net = (s32)netns_id < 0;

        if (unlikely(flags || (!use_caller_net && netns_id > S32_MAX)))
                return NULL;

        if (sdif < 0)
                sdif = (family == AF_INET) ? inet_sdif(skb) : inet6_sdif(skb);

        if (use_caller_net)
                return sk_lookup(caller_net, tuple, ifindex, sdif, family,
                                 proto);

        target_net = get_net_ns_by_id(caller_net, netns_id);
        if (unlikely(!target_net))
                return NULL;

        found = sk_lookup(target_net, tuple, ifindex, sdif, family, proto);
        put_net(target_net);

        return found;
}

/* Like __bpf_skc_lookup(), but the result is converted with
 * sk_to_full_sk(). When that yields a different socket, the reference on
 * the originally found one is dropped; the replacement is only returned if
 * it is NULL or flagged SOCK_RCU_FREE, otherwise a one-time warning is
 * emitted and NULL is returned.
 */
static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
                struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
                u64 flags, int sdif)
{
        struct sock *found, *full;

        found = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
                                 netns_id, flags, sdif);
        if (!found)
                return NULL;

        full = sk_to_full_sk(found);
        if (full == found)
                return found;

        /* sk_to_full_sk() may hand back (sk)->rsk_listener; put the socket
         * we originally found so that a request_sock is not leaked.
         */
        sock_gen_put(found);

        /* The replacement must not need a refcount bump. */
        if (unlikely(full && !sock_flag(full, SOCK_RCU_FREE))) {
                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                return NULL;
        }

        return full;
}

/* skb-based front end for __bpf_skc_lookup(): the caller's namespace and
 * interface index come from skb->dev when it is set, otherwise the
 * namespace comes from skb->sk and the interface index is 0. sdif is
 * passed as -1 so that the core lookup derives it from the skb.
 */
static struct sock *
bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
               u8 proto, u64 netns_id, u64 flags)
{
        struct net_device *dev = skb->dev;
        struct net *caller_net;
        int ifindex = 0;

        if (dev) {
                caller_net = dev_net(dev);
                ifindex = dev->ifindex;
        } else {
                caller_net = sock_net(skb->sk);
        }

        return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
                                netns_id, flags, -1);
}

/* Like bpf_skc_lookup(), but the result is converted with sk_to_full_sk().
 * When that yields a different socket, the reference on the originally
 * found one is dropped; the replacement is only returned if it is NULL or
 * flagged SOCK_RCU_FREE, otherwise a one-time warning is emitted and NULL
 * is returned.
 */
static struct sock *
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
              u8 proto, u64 netns_id, u64 flags)
{
        struct sock *found, *full;

        found = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags);
        if (!found)
                return NULL;

        full = sk_to_full_sk(found);
        if (full == found)
                return found;

        /* sk_to_full_sk() may hand back (sk)->rsk_listener; put the socket
         * we originally found so that a request_sock is not leaked.
         */
        sock_gen_put(found);

        /* The replacement must not need a refcount bump. */
        if (unlikely(full && !sock_flag(full, SOCK_RCU_FREE))) {
                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                return NULL;
        }

        return full;
}

/* BPF helper: TCP socket lookup for skb-based programs via bpf_skc_lookup()
 * (no sk_to_full_sk() conversion). Returns the socket pointer as an
 * unsigned long, 0 when nothing was found.
 */
BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_skc_lookup_tcp(skb, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * sock_common pointer or NULL. @skb is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
        .func                = bpf_skc_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: TCP socket lookup for skb-based programs via bpf_sk_lookup().
 * Returns the socket pointer as an unsigned long, 0 when nothing was found.
 */
BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_sk_lookup_tcp(skb, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * socket pointer or NULL. @skb is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
        .func                = bpf_sk_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: UDP socket lookup for skb-based programs via bpf_sk_lookup().
 * Returns the socket pointer as an unsigned long, 0 when nothing was found.
 */
BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_sk_lookup_udp(skb, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * socket pointer or NULL. @skb is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
        .func                = bpf_sk_lookup_udp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper (tc variant): TCP lookup through __bpf_skc_lookup(), with the
 * namespace, interface index and sdif all taken from skb->dev. skb->dev is
 * dereferenced without a NULL check.
 */
BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        struct net *caller_net = dev_net(dev);
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct sock *sk;

        sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex,
                              IPPROTO_TCP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_tc_skc_lookup_tcp(skb, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * sock_common pointer or NULL. @skb is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
        .func                = bpf_tc_skc_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper (tc variant): TCP lookup through __bpf_sk_lookup(), with the
 * namespace, interface index and sdif all taken from skb->dev. skb->dev is
 * dereferenced without a NULL check.
 */
BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        struct net *caller_net = dev_net(dev);
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct sock *sk;

        sk = __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
                             IPPROTO_TCP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_tc_sk_lookup_tcp(skb, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * socket pointer or NULL. @skb is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
        .func                = bpf_tc_sk_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper (tc variant): UDP lookup through __bpf_sk_lookup(), with the
 * namespace, interface index and sdif all taken from skb->dev. skb->dev is
 * dereferenced without a NULL check.
 */
BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        struct net *caller_net = dev_net(dev);
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct sock *sk;

        sk = __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
                             IPPROTO_UDP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_tc_sk_lookup_udp(skb, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * socket pointer or NULL. @skb is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
        .func                = bpf_tc_sk_lookup_udp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: release a socket obtained from a lookup helper. The reference
 * is only dropped when sk_is_refcounted() says one is held; a NULL @sk is
 * accepted. Always returns 0.
 */
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
        if (!sk)
                return 0;

        if (sk_is_refcounted(sk))
                sock_gen_put(sk);

        return 0;
}

/* Helper prototype table entry for bpf_sk_release(sk). Not GPL-only; typed
 * as returning an integer. @sk is a BTF sock_common pointer tagged
 * OBJ_RELEASE.
 */
static const struct bpf_func_proto bpf_sk_release_proto = {
        .func                = bpf_sk_release,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
};

/* BPF helper (XDP variant): UDP lookup through __bpf_sk_lookup(). There is
 * no skb, so NULL is passed; namespace, interface index and sdif come from
 * the receive queue's device. Note that @netns_id is declared u32 here and
 * widened when passed on.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct net *caller_net = dev_net(dev);
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
                             IPPROTO_UDP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_xdp_sk_lookup_udp(ctx, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * socket pointer or NULL. @ctx is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
        .func           = bpf_xdp_sk_lookup_udp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* BPF helper (XDP variant): TCP lookup through __bpf_skc_lookup(). There is
 * no skb, so NULL is passed; namespace, interface index and sdif come from
 * the receive queue's device. Note that @netns_id is declared u32 here and
 * widened when passed on.
 */
BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct net *caller_net = dev_net(dev);
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct sock *sk;

        sk = __bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex,
                              IPPROTO_TCP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_xdp_skc_lookup_tcp(ctx, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * sock_common pointer or NULL. @ctx is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
        .func           = bpf_xdp_skc_lookup_tcp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* BPF helper (XDP variant): TCP lookup through __bpf_sk_lookup(). There is
 * no skb, so NULL is passed; namespace, interface index and sdif come from
 * the receive queue's device. Note that @netns_id is declared u32 here and
 * widened when passed on.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct net *caller_net = dev_net(dev);
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
                             IPPROTO_TCP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_xdp_sk_lookup_tcp(ctx, tuple, len,
 * netns_id, flags). Not GPL-only; pkt_access is set; typed as returning a
 * socket pointer or NULL. @ctx is the program context, @tuple is a
 * read-only memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and
 * @netns_id and @flags are unconstrained values.
 */
static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
        .func           = bpf_xdp_sk_lookup_tcp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* BPF helper (sock_addr variant): TCP lookup through __bpf_skc_lookup().
 * The namespace is that of ctx->sk; no skb is passed, the interface index
 * is 0 and sdif is -1.
 */
BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_skc_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_TCP,
                              netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_sock_addr_skc_lookup_tcp(ctx, tuple,
 * len, netns_id, flags). Not GPL-only; typed as returning a sock_common
 * pointer or NULL. @ctx is the program context, @tuple is a read-only
 * memory region, @len is typed ARG_CONST_SIZE_OR_ZERO, and @netns_id and
 * @flags are unconstrained values. Unlike the skb/XDP variants, pkt_access
 * is not set here.
 */
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
        .func                = bpf_sock_addr_skc_lookup_tcp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper (sock_addr variant): TCP lookup through __bpf_sk_lookup().
 * The namespace is that of ctx->sk; no skb is passed, the interface index
 * is 0 and sdif is -1.
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_TCP,
                             netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_sock_addr_sk_lookup_tcp(ctx, tuple,
 * len, netns_id, flags). Not GPL-only; typed as returning a socket pointer
 * or NULL. @ctx is the program context, @tuple is a read-only memory
 * region, @len is typed ARG_CONST_SIZE_OR_ZERO, and @netns_id and @flags
 * are unconstrained values. pkt_access is not set here.
 */
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
        .func                = bpf_sock_addr_sk_lookup_tcp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper (sock_addr variant): UDP lookup through __bpf_sk_lookup().
 * The namespace is that of ctx->sk; no skb is passed, the interface index
 * is 0 and sdif is -1.
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_UDP,
                             netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_sock_addr_sk_lookup_udp(ctx, tuple,
 * len, netns_id, flags). Not GPL-only; typed as returning a socket pointer
 * or NULL. @ctx is the program context, @tuple is a read-only memory
 * region, @len is typed ARG_CONST_SIZE_OR_ZERO, and @netns_id and @flags
 * are unconstrained values. pkt_access is not set here.
 */
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
        .func                = bpf_sock_addr_sk_lookup_udp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Decide whether an access of @size bytes at offset @off into
 * struct bpf_tcp_sock is allowed.
 *
 * The offset must lie within the structure up to and including
 * icsk_retransmits and be a multiple of @size. bytes_received and
 * bytes_acked must be accessed as __u64; every other offset as __u32.
 * @type and @info are not consulted.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info)
{
        bool is_u64_field;

        if (off < 0)
                return false;
        if (off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits))
                return false;
        if (off % size)
                return false;

        is_u64_field = off == offsetof(struct bpf_tcp_sock, bytes_received) ||
                       off == offsetof(struct bpf_tcp_sock, bytes_acked);

        if (is_u64_field)
                return size == sizeof(__u64);

        return size == sizeof(__u32);
}

/* Rewrite a load from struct bpf_tcp_sock into a load from the kernel
 * structure that actually holds the value.
 *
 * @si->off selects the bpf_tcp_sock field; for each known field one
 * BPF_LDX_MEM instruction is written to @insn_buf that reads the field of
 * the same name from struct tcp_sock (struct inet_connection_sock for
 * icsk_retransmits) using si->src_reg as base and si->dst_reg as target.
 *
 * Return: number of instructions written to @insn_buf (0 when @si->off
 * matches no case below). @type, @prog and @target_size are not used in
 * this function body.
 */
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

/* Emit a load of tcp_sock.FIELD; fails the build if the kernel field is
 * wider than the bpf_tcp_sock field of the same name.
 */
#define BPF_TCP_SOCK_GET_COMMON(FIELD)                                        \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) >        \
                             sizeof_field(struct bpf_tcp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(struct tcp_sock, FIELD)); \
        } while (0)

/* Same as above, but the field lives in struct inet_connection_sock. */
#define BPF_INET_SOCK_GET_COMMON(FIELD)                                        \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct inet_connection_sock,        \
                                          FIELD) >                        \
                             sizeof_field(struct bpf_tcp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                        \
                                        struct inet_connection_sock,        \
                                        FIELD),                                \
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(                                \
                                        struct inet_connection_sock,        \
                                        FIELD));                        \
        } while (0)

        BTF_TYPE_EMIT(struct bpf_tcp_sock);

        switch (si->off) {
        case offsetof(struct bpf_tcp_sock, rtt_min):
                /* rtt_min is a struct minmax in tcp_sock; a 32-bit word is
                 * loaded from the 'v' member of a minmax_sample placed at
                 * the start of it.
                 */
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
                             sizeof(struct minmax));
                BUILD_BUG_ON(sizeof(struct minmax) <
                             sizeof(struct minmax_sample));

                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct tcp_sock, rtt_min) +
                                      offsetof(struct minmax_sample, v));
                break;
        case offsetof(struct bpf_tcp_sock, snd_cwnd):
                BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
                break;
        case offsetof(struct bpf_tcp_sock, srtt_us):
                BPF_TCP_SOCK_GET_COMMON(srtt_us);
                break;
        case offsetof(struct bpf_tcp_sock, snd_ssthresh):
                BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
                break;
        case offsetof(struct bpf_tcp_sock, rcv_nxt):
                BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
                break;
        case offsetof(struct bpf_tcp_sock, snd_nxt):
                BPF_TCP_SOCK_GET_COMMON(snd_nxt);
                break;
        case offsetof(struct bpf_tcp_sock, snd_una):
                BPF_TCP_SOCK_GET_COMMON(snd_una);
                break;
        case offsetof(struct bpf_tcp_sock, mss_cache):
                BPF_TCP_SOCK_GET_COMMON(mss_cache);
                break;
        case offsetof(struct bpf_tcp_sock, ecn_flags):
                BPF_TCP_SOCK_GET_COMMON(ecn_flags);
                break;
        case offsetof(struct bpf_tcp_sock, rate_delivered):
                BPF_TCP_SOCK_GET_COMMON(rate_delivered);
                break;
        case offsetof(struct bpf_tcp_sock, rate_interval_us):
                BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
                break;
        case offsetof(struct bpf_tcp_sock, packets_out):
                BPF_TCP_SOCK_GET_COMMON(packets_out);
                break;
        case offsetof(struct bpf_tcp_sock, retrans_out):
                BPF_TCP_SOCK_GET_COMMON(retrans_out);
                break;
        case offsetof(struct bpf_tcp_sock, total_retrans):
                BPF_TCP_SOCK_GET_COMMON(total_retrans);
                break;
        case offsetof(struct bpf_tcp_sock, segs_in):
                BPF_TCP_SOCK_GET_COMMON(segs_in);
                break;
        case offsetof(struct bpf_tcp_sock, data_segs_in):
                BPF_TCP_SOCK_GET_COMMON(data_segs_in);
                break;
        case offsetof(struct bpf_tcp_sock, segs_out):
                BPF_TCP_SOCK_GET_COMMON(segs_out);
                break;
        case offsetof(struct bpf_tcp_sock, data_segs_out):
                BPF_TCP_SOCK_GET_COMMON(data_segs_out);
                break;
        case offsetof(struct bpf_tcp_sock, lost_out):
                BPF_TCP_SOCK_GET_COMMON(lost_out);
                break;
        case offsetof(struct bpf_tcp_sock, sacked_out):
                BPF_TCP_SOCK_GET_COMMON(sacked_out);
                break;
        case offsetof(struct bpf_tcp_sock, bytes_received):
                BPF_TCP_SOCK_GET_COMMON(bytes_received);
                break;
        case offsetof(struct bpf_tcp_sock, bytes_acked):
                BPF_TCP_SOCK_GET_COMMON(bytes_acked);
                break;
        case offsetof(struct bpf_tcp_sock, dsack_dups):
                BPF_TCP_SOCK_GET_COMMON(dsack_dups);
                break;
        case offsetof(struct bpf_tcp_sock, delivered):
                BPF_TCP_SOCK_GET_COMMON(delivered);
                break;
        case offsetof(struct bpf_tcp_sock, delivered_ce):
                BPF_TCP_SOCK_GET_COMMON(delivered_ce);
                break;
        case offsetof(struct bpf_tcp_sock, icsk_retransmits):
                BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
                break;
        }

        /* Number of instructions emitted above. */
        return insn - insn_buf;
}

/* BPF helper: hand @sk back to the program only when it is a full socket
 * whose protocol is TCP; otherwise return NULL (as 0).
 */
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
        if (!sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_TCP)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Helper prototype table entry for bpf_tcp_sock(sk). Not GPL-only; typed
 * as returning a tcp_sock pointer or NULL; @sk is a sock_common pointer.
 * Unlike most prototypes in this file this one is not static.
 */
const struct bpf_func_proto bpf_tcp_sock_proto = {
        .func                = bpf_tcp_sock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_TCP_SOCK_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* BPF helper: convert @sk with sk_to_full_sk() and return the result only
 * when it is non-NULL, in TCP_LISTEN state and flagged SOCK_RCU_FREE;
 * otherwise return NULL (as 0).
 */
BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
        struct sock *full = sk_to_full_sk(sk);

        if (!full || full->sk_state != TCP_LISTEN)
                return (unsigned long)NULL;

        if (!sock_flag(full, SOCK_RCU_FREE))
                return (unsigned long)NULL;

        return (unsigned long)full;
}

/* Helper prototype table entry for bpf_get_listener_sock(sk). Not GPL-only;
 * typed as returning a socket pointer or NULL; @sk is a sock_common
 * pointer.
 */
static const struct bpf_func_proto bpf_get_listener_sock_proto = {
        .func                = bpf_get_listener_sock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* BPF helper: set the ECN CE mark on an IPv4 or IPv6 packet.
 *
 * Returns 0 without touching the packet when the protocol is neither IPv4
 * nor IPv6, when the linear part of @skb is shorter than the fixed IP
 * header, or when @skb is cloned and that header is not writable in the
 * clone. Otherwise returns whatever INET_ECN_set_ce() returns.
 */
BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
{
        __be16 proto = skb_protocol(skb, true);
        unsigned int min_hdr;

        if (proto == cpu_to_be16(ETH_P_IP))
                min_hdr = sizeof(struct iphdr);
        else if (proto == cpu_to_be16(ETH_P_IPV6))
                min_hdr = sizeof(struct ipv6hdr);
        else
                return 0;

        if (skb_headlen(skb) < min_hdr)
                return 0;

        if (skb_cloned(skb) && !skb_clone_writable(skb, min_hdr))
                return 0;

        return INET_ECN_set_ce(skb);
}

/* Decide whether an access of @size bytes at offset @off into
 * struct bpf_xdp_sock is allowed: the offset must lie within the structure
 * up to and including queue_id, be a multiple of @size, and the access
 * must be __u32-sized. @type and @info are not consulted.
 */
bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info)
{
        if (off < 0)
                return false;
        if (off >= offsetofend(struct bpf_xdp_sock, queue_id))
                return false;
        if (off % size)
                return false;

        /* Every offset is treated as a 32-bit field. */
        return size == sizeof(__u32);
}

/* Rewrite a load from struct bpf_xdp_sock into a load from struct xdp_sock.
 *
 * Only queue_id is handled: one BPF_LDX_MEM instruction reading
 * xdp_sock.queue_id (base si->src_reg, target si->dst_reg) is written to
 * @insn_buf.
 *
 * Return: number of instructions written to @insn_buf (0 when @si->off is
 * not the queue_id offset). @type, @prog and @target_size are not used in
 * this function body.
 */
u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

/* Emit a load of xdp_sock.FIELD; fails the build if the kernel field is
 * wider than the bpf_xdp_sock field of the same name.
 */
#define BPF_XDP_SOCK_GET(FIELD)                                                \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) >        \
                             sizeof_field(struct bpf_xdp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(struct xdp_sock, FIELD)); \
        } while (0)

        BTF_TYPE_EMIT(struct bpf_xdp_sock);

        switch (si->off) {
        case offsetof(struct bpf_xdp_sock, queue_id):
                BPF_XDP_SOCK_GET(queue_id);
                break;
        }

        /* Number of instructions emitted above. */
        return insn - insn_buf;
}

/* Prototype for bpf_skb_ecn_set_ce(): takes the program context
 * (ARG_PTR_TO_CTX), returns an integer, not gpl_only.
 */
static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
        .func           = bpf_skb_ecn_set_ce,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* bpf_tcp_check_syncookie() - check a SYN cookie against listener @sk.
 *
 * @iph/@iph_len describe an IPv4 or IPv6 header, @th/@th_len the TCP header.
 *
 * Return: 0 when the cookie check succeeds, otherwise
 *   -EINVAL   @sk is NULL, a header length is too small, @sk is not a
 *             listening TCP socket, the tcp_syncookies sysctl reads as 0,
 *             or the IP version does not suit the socket family;
 *   -ENOENT   ACK is clear or RST/SYN is set, tcp_synq_no_recent_overflow()
 *             is true, or the cookie check did not return a positive value;
 *   -EPROTONOSUPPORT  the IP version is not handled in this build;
 *   -ENOTSUPP built without CONFIG_SYN_COOKIES.
 */
BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
           struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
        struct iphdr *ip4h = iph;
        int cookie_ok;

        if (unlikely(!sk || th_len < sizeof(*th)))
                return -EINVAL;

        /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
        if (sk->sk_state != TCP_LISTEN || sk->sk_protocol != IPPROTO_TCP)
                return -EINVAL;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
                return -EINVAL;

        /* ACK must be set, RST and SYN must be clear. */
        if (th->syn || th->rst || !th->ack)
                return -ENOENT;

        if (unlikely(iph_len < sizeof(struct iphdr)))
                return -EINVAL;

        if (tcp_synq_no_recent_overflow(sk))
                return -ENOENT;

        /* struct iphdr and struct ipv6hdr keep the version field at the
         * same offset, so the shorter IPv4 view is enough to read it.
         */
        switch (ip4h->version) {
        case 4:
                if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
                        return -EINVAL;

                cookie_ok = __cookie_v4_check(ip4h, th);
                break;

#if IS_BUILTIN(CONFIG_IPV6)
        case 6:
                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
                        return -EINVAL;

                if (sk->sk_family != AF_INET6)
                        return -EINVAL;

                cookie_ok = __cookie_v6_check((struct ipv6hdr *)iph, th);
                break;
#endif /* CONFIG_IPV6 */

        default:
                return -EPROTONOSUPPORT;
        }

        return cookie_ok > 0 ? 0 : -ENOENT;
#else
        return -ENOTSUPP;
#endif
}

/* Prototype for bpf_tcp_check_syncookie(): gpl_only, pkt_access set.
 * arg1 is a socket (ARG_PTR_TO_BTF_ID_SOCK_COMMON); arg2/arg3 and arg4/arg5
 * are read-only memory + size pairs for iph/iph_len and th/th_len.
 */
static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
        .func                = bpf_tcp_check_syncookie,
        .gpl_only        = true,
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* bpf_tcp_gen_syncookie() - produce a SYN cookie for a SYN aimed at @sk.
 *
 * @iph/@iph_len describe an IPv4 or IPv6 header, @th/@th_len the TCP header;
 * @th_len must equal th->doff * 4.
 *
 * Return: the cookie in the low 32 bits with the MSS shifted into the bits
 * above it, otherwise
 *   -EINVAL   @sk is NULL, a length check fails, @sk is not a listening TCP
 *             socket, SYN is clear or ACK/FIN/RST is set, or the IP
 *             version does not suit the socket family;
 *   -ENOENT   the tcp_syncookies sysctl reads as 0, or the resulting MSS
 *             is 0;
 *   -EPROTONOSUPPORT  the IP version is not handled in this build;
 *   -EOPNOTSUPP built without CONFIG_SYN_COOKIES.
 */
BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
           struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
        u16 mss;
        u32 cookie;

        if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
                return -EINVAL;

        if (sk->sk_state != TCP_LISTEN || sk->sk_protocol != IPPROTO_TCP)
                return -EINVAL;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
                return -ENOENT;

        /* SYN must be set; ACK, FIN and RST must be clear. */
        if (th->ack || th->fin || th->rst || !th->syn)
                return -EINVAL;

        if (unlikely(iph_len < sizeof(struct iphdr)))
                return -EINVAL;

        /* struct iphdr and struct ipv6hdr keep the version field at the
         * same offset, so the shorter IPv4 view is enough to read it.
         */
        switch (((struct iphdr *)iph)->version) {
        case 4:
                if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
                        return -EINVAL;

                mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
                break;

#if IS_BUILTIN(CONFIG_IPV6)
        case 6:
                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
                        return -EINVAL;

                if (sk->sk_family != AF_INET6)
                        return -EINVAL;

                mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
                break;
#endif /* CONFIG_IPV6 */

        default:
                return -EPROTONOSUPPORT;
        }

        if (!mss)
                return -ENOENT;

        return ((u64)mss << 32) | cookie;
#else
        return -EOPNOTSUPP;
#endif /* CONFIG_SYN_COOKIES */
}

/* Prototype for bpf_tcp_gen_syncookie(): gpl_only, pkt_access set.
 * arg1 is a socket (ARG_PTR_TO_BTF_ID_SOCK_COMMON); arg2/arg3 and arg4/arg5
 * are read-only memory + size pairs for iph/iph_len and th/th_len.
 */
static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
        .func                = bpf_tcp_gen_syncookie,
        .gpl_only        = true, /* __cookie_v*_init_sequence() is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* bpf_sk_assign() - attach @sk to @skb with sock_pfree as destructor.
 *
 * Return: 0 on success, otherwise
 *   -EINVAL      @sk is NULL or @flags is non-zero;
 *   -EOPNOTSUPP  @skb is not at tc ingress, or @sk is unhashed;
 *   -ENETUNREACH the skb's device and @sk are in different net namespaces;
 *   -ENOENT      @sk is refcounted and its refcount could not be raised
 *                from a non-zero value.
 */
BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
{
        if (!sk)
                return -EINVAL;
        if (flags)
                return -EINVAL;

        if (!skb_at_tc_ingress(skb))
                return -EOPNOTSUPP;

        if (unlikely(dev_net(skb->dev) != sock_net(sk)))
                return -ENETUNREACH;

        if (sk_unhashed(sk))
                return -EOPNOTSUPP;

        /* Only refcounted sockets need a reference taken here. */
        if (sk_is_refcounted(sk)) {
                if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
                        return -ENOENT;
        }

        /* Drop whatever socket the skb had before installing the new one. */
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_pfree;

        return 0;
}

/* Prototype for bpf_sk_assign(): context, a socket
 * (ARG_PTR_TO_BTF_ID_SOCK_COMMON) and a flags word the verifier does not
 * constrain (ARG_ANYTHING); returns an integer, not gpl_only.
 */
static const struct bpf_func_proto bpf_sk_assign_proto = {
        .func                = bpf_sk_assign,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg3_type        = ARG_ANYTHING,
};

/* Scan the TCP option bytes in [@op, @opend) for an option of @search_kind.
 *
 * When @magic_len is non-zero the @magic_len bytes following the option's
 * kind and length bytes must also equal @magic.
 *
 * Return: pointer to the matching option's kind byte, or
 *   ERR_PTR(-ENOMSG) when nothing matched (*@eol tells whether the scan
 *                    stopped at TCPOPT_EOL) or when a same-kind option is
 *                    too short to hold the magic;
 *   ERR_PTR(-EFAULT) when an option's length byte is missing, below 2, or
 *                    runs past @opend.
 */
static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
                                    u8 search_kind, const u8 *magic,
                                    u8 magic_len, bool *eol)
{
        *eol = false;

        while (op < opend) {
                u8 kind = op[0];
                u8 opt_len;

                switch (kind) {
                case TCPOPT_EOL:
                        *eol = true;
                        return ERR_PTR(-ENOMSG);
                case TCPOPT_NOP:
                        /* Single-byte option: step over it. */
                        op++;
                        continue;
                default:
                        break;
                }

                /* Malformed header: bail out the same way the TCP stack's
                 * tcp_parse_options() does.
                 */
                if (opend - op < 2 || op[1] < 2 || opend - op < op[1])
                        return ERR_PTR(-EFAULT);

                opt_len = op[1];
                if (kind == search_kind) {
                        if (!magic_len)
                                return op;

                        if (magic_len > opt_len - 2)
                                return ERR_PTR(-ENOMSG);

                        if (memcmp(&op[2], magic, magic_len) == 0)
                                return op;
                }

                op += opt_len;
        }

        return ERR_PTR(-ENOMSG);
}

/* bpf_sock_ops_load_hdr_opt() - copy a TCP header option into @search_res.
 *
 * On entry @search_res[0] is the option kind to look for and @search_res[1]
 * the search length; for kinds TCPOPT_EXP and 253 the bytes after those two
 * are a 2- or 4-byte magic that must match as well.  With
 * BPF_LOAD_HDR_OPT_TCP_SYN in @flags the option bytes come from
 * bpf_sock_ops_get_syn(), otherwise from bpf_sock->skb.
 *
 * Return: the length byte of the option found, or
 *   -EOPNOTSUPP is_locked_tcp_sock_ops() is false;
 *   -EINVAL     bad @len, @flags, kind or search length;
 *   -EPERM      no skb, or called from BPF_SOCK_OPS_HDR_OPT_LEN_CB;
 *   -ENOSPC     option is longer than @len (the first @len bytes are still
 *               copied);
 *   any error from bpf_sock_ops_get_syn() or bpf_search_tcp_opt().
 */
BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           void *, search_res, u32, len, u64, flags)
{
        const u8 *search = search_res;
        const u8 *magic = NULL;
        const u8 *op, *opend;
        u8 search_kind, search_len, found_len;
        u8 magic_len = 0;
        bool eol;
        int ret;

        if (!is_locked_tcp_sock_ops(bpf_sock))
                return -EOPNOTSUPP;

        /* Apart from TCPOPT_NOP and TCPOPT_EOL, which this helper refuses
         * to load, no option is shorter than 2 bytes.
         */
        if (len < 2 || (flags & ~BPF_LOAD_HDR_OPT_TCP_SYN))
                return -EINVAL;

        search_kind = search[0];
        search_len = search[1];

        if (search_kind == TCPOPT_NOP || search_kind == TCPOPT_EOL ||
            search_len > len)
                return -EINVAL;

        if (search_kind == TCPOPT_EXP || search_kind == 253) {
                /* 16 or 32 bit magic.  +2 for kind and kind length */
                if (search_len != 4 && search_len != 6)
                        return -EINVAL;
                magic = &search[2];
                magic_len = search_len - 2;
        } else if (search_len) {
                /* Other kinds are matched by kind alone. */
                return -EINVAL;
        }

        if (flags & BPF_LOAD_HDR_OPT_TCP_SYN) {
                ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
                if (ret < 0)
                        return ret;

                opend = op + ret;
                op += sizeof(struct tcphdr);
        } else {
                /* This bpf_sock->op cannot call this helper */
                if (!bpf_sock->skb ||
                    bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
                        return -EPERM;

                opend = bpf_sock->skb_data_end;
                op = bpf_sock->skb->data + sizeof(struct tcphdr);
        }

        op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
                                &eol);
        if (IS_ERR(op))
                return PTR_ERR(op);

        found_len = op[1];
        if (found_len > len) {
                /* Truncated copy: hand over what fits and report it. */
                memcpy(search_res, op, len);
                return -ENOSPC;
        }

        memcpy(search_res, op, found_len);
        return found_len;
}

/* Prototype for bpf_sock_ops_load_hdr_opt(): context, a writable memory
 * buffer with its size (search_res/len), and a flags word the verifier does
 * not constrain; returns an integer, not gpl_only.
 */
static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
        .func                = bpf_sock_ops_load_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_WRITE,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* bpf_sock_ops_store_hdr_opt() - append one TCP header option from @from.
 *
 * @from[0] is the option kind and @from[1] its total length.  The option is
 * copied to bpf_sock->skb_data_end, which is then advanced, and
 * remaining_opt_len is reduced by the same amount.
 *
 * Return: 0 on success, otherwise
 *   -EPERM   not called from BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
 *   -EINVAL  bad @len or @flags, kind is NOP/EOL, the option length exceeds
 *            @len, or an experimental option is shorter than 4 bytes;
 *   -ENOSPC  not enough remaining option space, or the existing options
 *            already end with TCPOPT_EOL;
 *   -EEXIST  an option of the same kind (and 2-byte magic for experimental
 *            kinds) is already present;
 *   any other error from bpf_search_tcp_opt().
 */
BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           const void *, from, u32, len, u64, flags)
{
        const u8 *new_op = from;
        const u8 *magic = NULL;
        const u8 *op;
        u8 *opend;
        u8 new_kind, new_kind_len;
        u8 magic_len = 0;
        bool eol;

        if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
                return -EPERM;

        if (flags || len < 2)
                return -EINVAL;

        new_kind = new_op[0];
        new_kind_len = new_op[1];

        if (new_kind == TCPOPT_NOP || new_kind == TCPOPT_EOL ||
            new_kind_len > len)
                return -EINVAL;

        if (new_kind_len > bpf_sock->remaining_opt_len)
                return -ENOSPC;

        /* 253 is another experimental kind */
        if (new_kind == TCPOPT_EXP || new_kind == 253) {
                if (new_kind_len < 4)
                        return -EINVAL;
                /* RFC 6994 allows a 2 or 4 byte magic.  Comparing only the
                 * first 2 bytes is the conservative choice for the
                 * duplicate search below.
                 */
                magic = &new_op[2];
                magic_len = 2;
        }

        /* Check for duplication */
        opend = bpf_sock->skb_data_end;
        op = bpf_search_tcp_opt(bpf_sock->skb->data + sizeof(struct tcphdr),
                                opend, new_kind, magic, magic_len, &eol);
        if (!IS_ERR(op))
                return -EEXIST;

        if (PTR_ERR(op) != -ENOMSG)
                return PTR_ERR(op);

        /* The option list has been ended; treat it as no more header
         * option can be written.
         */
        if (eol)
                return -ENOSPC;

        /* No duplication found.  Store the header option. */
        memcpy(opend, from, new_kind_len);

        bpf_sock->skb_data_end += new_kind_len;
        bpf_sock->remaining_opt_len -= new_kind_len;

        return 0;
}

/* Prototype for bpf_sock_ops_store_hdr_opt(): context, a read-only memory
 * buffer with its size (from/len), and a flags word the verifier does not
 * constrain; returns an integer, not gpl_only.
 */
static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
        .func                = bpf_sock_ops_store_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* bpf_sock_ops_reserve_hdr_opt() - take @len bytes out of remaining_opt_len.
 *
 * Return: 0 on success, otherwise
 *   -EPERM   not called from BPF_SOCK_OPS_HDR_OPT_LEN_CB;
 *   -EINVAL  @len is below 2 or @flags is non-zero;
 *   -ENOSPC  @len exceeds bpf_sock->remaining_opt_len.
 */
BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           u32, len, u64, flags)
{
        if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
                return -EPERM;

        if (len < 2 || flags)
                return -EINVAL;

        if (bpf_sock->remaining_opt_len < len)
                return -ENOSPC;

        bpf_sock->remaining_opt_len -= len;
        return 0;
}

/* Prototype for bpf_sock_ops_reserve_hdr_opt(): context plus two scalars
 * (len, flags) the verifier does not constrain; returns an integer, not
 * gpl_only.
 */
static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
        .func                = bpf_sock_ops_reserve_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* bpf_skb_set_tstamp() - set skb->tstamp and the clock it is expressed in.
 *
 * Return: 0 on success, otherwise
 *   -EOPNOTSUPP  skb->protocol is neither IPv4 nor IPv6;
 *   -EINVAL      @tstamp_type is unknown, or @tstamp is 0 with
 *                BPF_SKB_CLOCK_MONOTONIC or BPF_SKB_CLOCK_TAI.
 * Nothing is written to the skb on failure.
 */
BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
           u64, tstamp, u32, tstamp_type)
{
        /* skb_clear_delivery_time() is done for inet protocol */
        if (!(skb->protocol == htons(ETH_P_IP) ||
              skb->protocol == htons(ETH_P_IPV6)))
                return -EOPNOTSUPP;

        switch (tstamp_type) {
        case BPF_SKB_CLOCK_REALTIME:
                /* A zero timestamp is accepted for this clock only. */
                skb->tstamp_type = SKB_CLOCK_REALTIME;
                break;
        case BPF_SKB_CLOCK_MONOTONIC:
                if (!tstamp)
                        return -EINVAL;
                skb->tstamp_type = SKB_CLOCK_MONOTONIC;
                break;
        case BPF_SKB_CLOCK_TAI:
                if (!tstamp)
                        return -EINVAL;
                skb->tstamp_type = SKB_CLOCK_TAI;
                break;
        default:
                return -EINVAL;
        }

        skb->tstamp = tstamp;
        return 0;
}

/* Prototype for bpf_skb_set_tstamp(): context plus two scalars
 * (tstamp, tstamp_type) the verifier does not constrain; returns an
 * integer, not gpl_only.
 */
static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
        .func           = bpf_skb_set_tstamp,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

#ifdef CONFIG_SYN_COOKIES
/* bpf_tcp_raw_gen_syncookie_ipv4() - build an IPv4 SYN cookie from raw
 * headers, without a socket.
 *
 * @th_len must be at least sizeof(struct tcphdr) and equal th->doff * 4,
 * else -EINVAL.  The MSS comes from tcp_parse_mss_option(), falling back to
 * TCP_MSS_DEFAULT when that yields 0.
 *
 * Return: cookie in the low 32 bits, MSS shifted into the bits above it.
 */
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
           struct tcphdr *, th, u32, th_len)
{
        u16 mss;
        u32 cookie;

        if (unlikely(th_len < sizeof(*th)))
                return -EINVAL;
        if (unlikely(th_len != th->doff * 4))
                return -EINVAL;

        mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT;
        cookie = __cookie_v4_init_sequence(iph, th, &mss);

        return ((u64)mss << 32) | cookie;
}

/* Prototype for bpf_tcp_raw_gen_syncookie_ipv4(): gpl_only, pkt_access set.
 * arg1 is fixed-size memory of sizeof(struct iphdr); arg2/arg3 are a memory
 * + size pair for th/th_len where the size may be zero
 * (ARG_CONST_SIZE_OR_ZERO).
 */
static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
        .func                = bpf_tcp_raw_gen_syncookie_ipv4,
        .gpl_only        = true, /* __cookie_v4_init_sequence() is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size        = sizeof(struct iphdr),
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* bpf_tcp_raw_gen_syncookie_ipv6() - build an IPv6 SYN cookie from raw
 * headers, without a socket.
 *
 * @th_len must be at least sizeof(struct tcphdr) and equal th->doff * 4,
 * else -EINVAL.  The MSS comes from tcp_parse_mss_option(), falling back to
 * IPV6_MIN_MTU minus the TCP and IPv6 header sizes when that yields 0.
 *
 * Return: cookie in the low 32 bits, MSS shifted into the bits above it;
 * -EPROTONOSUPPORT when IPv6 is not built in.
 */
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
           struct tcphdr *, th, u32, th_len)
{
#if IS_BUILTIN(CONFIG_IPV6)
        const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
                sizeof(struct ipv6hdr);
        u16 mss;
        u32 cookie;

        if (unlikely(th_len < sizeof(*th)))
                return -EINVAL;
        if (unlikely(th_len != th->doff * 4))
                return -EINVAL;

        mss = tcp_parse_mss_option(th, 0) ?: mss_clamp;
        cookie = __cookie_v6_init_sequence(iph, th, &mss);

        return ((u64)mss << 32) | cookie;
#else
        return -EPROTONOSUPPORT;
#endif
}

/* Prototype for bpf_tcp_raw_gen_syncookie_ipv6(): gpl_only, pkt_access set.
 * arg1 is fixed-size memory of sizeof(struct ipv6hdr); arg2/arg3 are a
 * memory + size pair for th/th_len where the size may be zero
 * (ARG_CONST_SIZE_OR_ZERO).
 */
static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
        .func                = bpf_tcp_raw_gen_syncookie_ipv6,
        .gpl_only        = true, /* __cookie_v6_init_sequence() is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size        = sizeof(struct ipv6hdr),
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* bpf_tcp_raw_check_syncookie_ipv4() - check an IPv4 SYN cookie from raw
 * headers.
 *
 * Return: 0 when __cookie_v4_check() gives a positive result, -EACCES
 * otherwise.
 */
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
           struct tcphdr *, th)
{
        return __cookie_v4_check(iph, th) > 0 ? 0 : -EACCES;
}

/* Prototype for bpf_tcp_raw_check_syncookie_ipv4(): gpl_only, pkt_access
 * set.  Both arguments are fixed-size memory: sizeof(struct iphdr) and
 * sizeof(struct tcphdr).
 */
static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
        .func                = bpf_tcp_raw_check_syncookie_ipv4,
        .gpl_only        = true, /* __cookie_v4_check is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size        = sizeof(struct iphdr),
        .arg2_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg2_size        = sizeof(struct tcphdr),
};

/* bpf_tcp_raw_check_syncookie_ipv6() - check an IPv6 SYN cookie from raw
 * headers.
 *
 * Return: 0 when __cookie_v6_check() gives a positive result, -EACCES
 * otherwise; -EPROTONOSUPPORT when IPv6 is not built in.
 */
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
           struct tcphdr *, th)
{
#if IS_BUILTIN(CONFIG_IPV6)
        return __cookie_v6_check(iph, th) > 0 ? 0 : -EACCES;
#else
        return -EPROTONOSUPPORT;
#endif
}

/* Prototype for bpf_tcp_raw_check_syncookie_ipv6(): gpl_only, pkt_access
 * set.  Both arguments are fixed-size memory: sizeof(struct ipv6hdr) and
 * sizeof(struct tcphdr).
 */
static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
        .func                = bpf_tcp_raw_check_syncookie_ipv6,
        .gpl_only        = true, /* __cookie_v6_check is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size        = sizeof(struct ipv6hdr),
        .arg2_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg2_size        = sizeof(struct tcphdr),
};
#endif /* CONFIG_SYN_COOKIES */

#endif /* CONFIG_INET */

/* Report whether calling helper @func_id is treated as changing packet
 * data.  Returns true for the ids listed below and false for all others.
 */
bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
        switch (func_id) {
        /* skb_* and checksum helpers */
        case BPF_FUNC_clone_redirect:
        case BPF_FUNC_l3_csum_replace:
        case BPF_FUNC_l4_csum_replace:
        case BPF_FUNC_skb_adjust_room:
        case BPF_FUNC_skb_change_head:
        case BPF_FUNC_skb_change_proto:
        case BPF_FUNC_skb_change_tail:
        case BPF_FUNC_skb_pull_data:
        case BPF_FUNC_skb_store_bytes:
        case BPF_FUNC_skb_vlan_pop:
        case BPF_FUNC_skb_vlan_push:
        /* lwt_* helpers */
        case BPF_FUNC_lwt_push_encap:
        case BPF_FUNC_lwt_seg6_action:
        case BPF_FUNC_lwt_seg6_adjust_srh:
        case BPF_FUNC_lwt_seg6_store_bytes:
        /* msg_* helpers */
        case BPF_FUNC_msg_pop_data:
        case BPF_FUNC_msg_pull_data:
        case BPF_FUNC_msg_push_data:
        /* TCP header option store */
        case BPF_FUNC_store_hdr_opt:
        /* xdp_* helpers */
        case BPF_FUNC_xdp_adjust_head:
        case BPF_FUNC_xdp_adjust_meta:
        case BPF_FUNC_xdp_adjust_tail:
        /* tail-called program could call any of the above */
        case BPF_FUNC_tail_call:
                return true;
        default:
                break;
        }

        return false;
}

/* Weak definitions so that the &proto references below always resolve.
 * NOTE(review): presumably a strong definition elsewhere overrides each of
 * these when the providing code is built - confirm.
 */
const struct bpf_func_proto bpf_event_output_data_proto __weak;
const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;

/* Resolve @func_id to a helper prototype for sock_filter_func_proto() users.
 *
 * cgroup_common_func_proto() is asked first; the ids listed here come next;
 * everything else is delegated to bpf_base_func_proto().
 */
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *common;

        common = cgroup_common_func_proto(func_id, prog);
        if (common)
                return common;

        switch (func_id) {
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_cg_sock_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_proto;
        default:
                break;
        }

        return bpf_base_func_proto(func_id, prog);
}

static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        func_proto = cgroup_common_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        switch (func_id) {
        case BPF_FUNC_bind:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                        return &bpf_bind_proto;
                default:
                        return NULL;
                }
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_addr_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_addr_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sock_addr_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sock_addr_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_setsockopt:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_UNIX_CONNECT:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UNIX_SENDMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return &bpf_sock_addr_setsockopt_proto;
                default:
                        return NULL;
                }
        case BPF_FUNC_getsockopt:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_UNIX_CONNECT:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UNIX_SENDMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return &bpf_sock_addr_getsockopt_proto;
                default:
                        return NULL;
                }
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Resolve @func_id to a helper prototype for sk_filter_func_proto() users.
 * Ids not listed here are delegated to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
        case BPF_FUNC_get_socket_uid:
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &bpf_skb_load_bytes_relative_proto;
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
        default:
                break;
        }

        return bpf_sk_base_func_proto(func_id, prog);
}

/* Weak definitions so that references to the sk_storage protos always
 * resolve.  NOTE(review): presumably a strong definition elsewhere
 * overrides each of these when the providing code is built - confirm.
 */
const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;

/* Resolve @func_id to a helper prototype for cg_skb_func_proto() users.
 *
 * cgroup_common_func_proto() is asked first; then the ids listed here
 * (some only under CONFIG_INET / CONFIG_SOCK_CGROUP_DATA); anything else is
 * delegated to sk_filter_func_proto().
 */
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *common;

        common = cgroup_common_func_proto(func_id, prog);
        if (common)
                return common;

        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sk_lookup_udp_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_skc_lookup_tcp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
        case BPF_FUNC_get_listener_sock:
                return &bpf_get_listener_sock_proto;
        case BPF_FUNC_skb_ecn_set_ce:
                return &bpf_skb_ecn_set_ce_proto;
#endif
#ifdef CONFIG_SOCK_CGROUP_DATA
        case BPF_FUNC_skb_cgroup_id:
                return &bpf_skb_cgroup_id_proto;
        case BPF_FUNC_skb_ancestor_cgroup_id:
                return &bpf_skb_ancestor_cgroup_id_proto;
        case BPF_FUNC_sk_cgroup_id:
                return &bpf_sk_cgroup_id_proto;
        case BPF_FUNC_sk_ancestor_cgroup_id:
                return &bpf_sk_ancestor_cgroup_id_proto;
#endif
        default:
                break;
        }

        return sk_filter_func_proto(func_id, prog);
}

/* Resolve @func_id to a helper prototype for tc_cls_act_func_proto() users.
 *
 * Each case returns the address of the matching proto, except the two
 * tunnel "set" helpers, which are resolved through
 * bpf_get_skb_set_tunnel_proto().  Several groups are only compiled in
 * under CONFIG_XFRM, CONFIG_CGROUP_NET_CLASSID, CONFIG_SOCK_CGROUP_DATA,
 * CONFIG_INET and (nested inside CONFIG_INET) CONFIG_SYN_COOKIES.  Ids that
 * are not listed, or whose group is compiled out, are delegated to
 * bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_skb_store_bytes:
                return &bpf_skb_store_bytes_proto;
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &bpf_skb_load_bytes_relative_proto;
        case BPF_FUNC_skb_pull_data:
                return &bpf_skb_pull_data_proto;
        case BPF_FUNC_csum_diff:
                return &bpf_csum_diff_proto;
        case BPF_FUNC_csum_update:
                return &bpf_csum_update_proto;
        case BPF_FUNC_csum_level:
                return &bpf_csum_level_proto;
        case BPF_FUNC_l3_csum_replace:
                return &bpf_l3_csum_replace_proto;
        case BPF_FUNC_l4_csum_replace:
                return &bpf_l4_csum_replace_proto;
        case BPF_FUNC_clone_redirect:
                return &bpf_clone_redirect_proto;
        case BPF_FUNC_get_cgroup_classid:
                return &bpf_get_cgroup_classid_proto;
        case BPF_FUNC_skb_vlan_push:
                return &bpf_skb_vlan_push_proto;
        case BPF_FUNC_skb_vlan_pop:
                return &bpf_skb_vlan_pop_proto;
        case BPF_FUNC_skb_change_proto:
                return &bpf_skb_change_proto_proto;
        case BPF_FUNC_skb_change_type:
                return &bpf_skb_change_type_proto;
        case BPF_FUNC_skb_adjust_room:
                return &bpf_skb_adjust_room_proto;
        case BPF_FUNC_skb_change_tail:
                return &bpf_skb_change_tail_proto;
        case BPF_FUNC_skb_change_head:
                return &bpf_skb_change_head_proto;
        case BPF_FUNC_skb_get_tunnel_key:
                return &bpf_skb_get_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_key:
                /* Proto is picked by a helper function, not a fixed address. */
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_skb_get_tunnel_opt:
                return &bpf_skb_get_tunnel_opt_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                /* Same resolver as skb_set_tunnel_key above. */
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_redirect:
                return &bpf_redirect_proto;
        case BPF_FUNC_redirect_neigh:
                return &bpf_redirect_neigh_proto;
        case BPF_FUNC_redirect_peer:
                return &bpf_redirect_peer_proto;
        case BPF_FUNC_get_route_realm:
                return &bpf_get_route_realm_proto;
        case BPF_FUNC_get_hash_recalc:
                return &bpf_get_hash_recalc_proto;
        case BPF_FUNC_set_hash_invalid:
                return &bpf_set_hash_invalid_proto;
        case BPF_FUNC_set_hash:
                return &bpf_set_hash_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_skb_under_cgroup:
                return &bpf_skb_under_cgroup_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_proto;
        case BPF_FUNC_get_socket_uid:
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_skb_fib_lookup_proto;
        case BPF_FUNC_check_mtu:
                return &bpf_skb_check_mtu_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_XFRM
        case BPF_FUNC_skb_get_xfrm_state:
                return &bpf_skb_get_xfrm_state_proto;
#endif
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_skb_cgroup_classid:
                return &bpf_skb_cgroup_classid_proto;
#endif
#ifdef CONFIG_SOCK_CGROUP_DATA
        case BPF_FUNC_skb_cgroup_id:
                return &bpf_skb_cgroup_id_proto;
        case BPF_FUNC_skb_ancestor_cgroup_id:
                return &bpf_skb_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
        /* Socket lookup uses the bpf_tc_* proto variants here. */
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_tc_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_tc_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
        case BPF_FUNC_get_listener_sock:
                return &bpf_get_listener_sock_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_tc_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_check_syncookie:
                return &bpf_tcp_check_syncookie_proto;
        case BPF_FUNC_skb_ecn_set_ce:
                return &bpf_skb_ecn_set_ce_proto;
        case BPF_FUNC_tcp_gen_syncookie:
                return &bpf_tcp_gen_syncookie_proto;
        case BPF_FUNC_sk_assign:
                return &bpf_sk_assign_proto;
        case BPF_FUNC_skb_set_tstamp:
                return &bpf_skb_set_tstamp_proto;
#ifdef CONFIG_SYN_COOKIES
        /* The raw syncookie protos are only defined under this option. */
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
                return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
                return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
                return &bpf_tcp_raw_check_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
                return &bpf_tcp_raw_check_syncookie_ipv6_proto;
#endif
#endif
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Resolve a BPF helper id to the helper prototype handed out by
 * xdp_func_proto().
 *
 * Ids that are not listed in the switch are passed on to
 * bpf_sk_base_func_proto(). The socket lookup and syncookie helpers are only
 * compiled in when CONFIG_INET is set; the raw syncookie helpers additionally
 * need CONFIG_SYN_COOKIES.
 */
static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                return &bpf_xdp_event_output_proto;
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_csum_diff:
                return &bpf_csum_diff_proto;
        case BPF_FUNC_xdp_adjust_head:
                return &bpf_xdp_adjust_head_proto;
        case BPF_FUNC_xdp_adjust_meta:
                return &bpf_xdp_adjust_meta_proto;
        case BPF_FUNC_redirect:
                return &bpf_xdp_redirect_proto;
        case BPF_FUNC_redirect_map:
                return &bpf_xdp_redirect_map_proto;
        case BPF_FUNC_xdp_adjust_tail:
                return &bpf_xdp_adjust_tail_proto;
        case BPF_FUNC_xdp_get_buff_len:
                return &bpf_xdp_get_buff_len_proto;
        case BPF_FUNC_xdp_load_bytes:
                return &bpf_xdp_load_bytes_proto;
        case BPF_FUNC_xdp_store_bytes:
                return &bpf_xdp_store_bytes_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_xdp_fib_lookup_proto;
        case BPF_FUNC_check_mtu:
                return &bpf_xdp_check_mtu_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_xdp_sk_lookup_udp_proto;
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_xdp_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_xdp_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_check_syncookie:
                return &bpf_tcp_check_syncookie_proto;
        case BPF_FUNC_tcp_gen_syncookie:
                return &bpf_tcp_gen_syncookie_proto;
#ifdef CONFIG_SYN_COOKIES
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
                return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
                return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
                return &bpf_tcp_raw_check_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
                return &bpf_tcp_raw_check_syncookie_ipv6_proto;
#endif
#endif
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }

        /* Every arm of the switch above returns, so control never reaches
         * this point; the statement below exists only for the type reference
         * it emits, as explained in the comment that follows.
         */
#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
        /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
         * kfuncs are defined in two different modules, and we want to be able
         * to use them interchangeably with the same BTF type ID. Because modules
         * can't de-duplicate BTF IDs between each other, we need the type to be
         * referenced in the vmlinux BTF or the verifier will get confused about
         * the different types. So we add this dummy type reference which will
         * be included in vmlinux BTF, allowing both modules to refer to the
         * same type ID.
         */
        BTF_TYPE_EMIT(struct nf_conn___init);
#endif
}

/* Weak definitions of the helper prototypes that sock_ops_func_proto()
 * returns for BPF_FUNC_sock_map_update and BPF_FUNC_sock_hash_update.
 * NOTE(review): presumably a strong definition is supplied elsewhere when
 * the sockmap code is built - confirm.
 */
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;

/* Helper lookup behind sock_ops_func_proto().
 *
 * cgroup_common_func_proto() is consulted first and wins whenever it knows
 * @func_id. Otherwise the id is matched against the list below, and anything
 * still unknown is left to bpf_sk_base_func_proto(). The TCP header option
 * helpers and bpf_tcp_sock are only available with CONFIG_INET.
 */
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        proto = cgroup_common_func_proto(func_id, prog);
        if (proto)
                return proto;

        switch (func_id) {
        case BPF_FUNC_setsockopt:
                proto = &bpf_sock_ops_setsockopt_proto;
                break;
        case BPF_FUNC_getsockopt:
                proto = &bpf_sock_ops_getsockopt_proto;
                break;
        case BPF_FUNC_sock_ops_cb_flags_set:
                proto = &bpf_sock_ops_cb_flags_set_proto;
                break;
        case BPF_FUNC_sock_map_update:
                proto = &bpf_sock_map_update_proto;
                break;
        case BPF_FUNC_sock_hash_update:
                proto = &bpf_sock_hash_update_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_cookie_sock_ops_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_event_output_data_proto;
                break;
        case BPF_FUNC_sk_storage_get:
                proto = &bpf_sk_storage_get_proto;
                break;
        case BPF_FUNC_sk_storage_delete:
                proto = &bpf_sk_storage_delete_proto;
                break;
        case BPF_FUNC_get_netns_cookie:
                proto = &bpf_get_netns_cookie_sock_ops_proto;
                break;
#ifdef CONFIG_INET
        case BPF_FUNC_load_hdr_opt:
                proto = &bpf_sock_ops_load_hdr_opt_proto;
                break;
        case BPF_FUNC_store_hdr_opt:
                proto = &bpf_sock_ops_store_hdr_opt_proto;
                break;
        case BPF_FUNC_reserve_hdr_opt:
                proto = &bpf_sock_ops_reserve_hdr_opt_proto;
                break;
        case BPF_FUNC_tcp_sock:
                proto = &bpf_tcp_sock_proto;
                break;
#endif /* CONFIG_INET */
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Weak definitions of the helper prototypes that sk_msg_func_proto()
 * returns for BPF_FUNC_msg_redirect_map and BPF_FUNC_msg_redirect_hash.
 * NOTE(review): presumably a strong definition is supplied elsewhere when
 * the sockmap code is built - confirm.
 */
const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;

/* Helper lookup behind sk_msg_func_proto(): translate @func_id into the
 * matching prototype, leaving every id that is not listed here to
 * bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_msg_redirect_map:
                proto = &bpf_msg_redirect_map_proto;
                break;
        case BPF_FUNC_msg_redirect_hash:
                proto = &bpf_msg_redirect_hash_proto;
                break;
        case BPF_FUNC_msg_apply_bytes:
                proto = &bpf_msg_apply_bytes_proto;
                break;
        case BPF_FUNC_msg_cork_bytes:
                proto = &bpf_msg_cork_bytes_proto;
                break;
        case BPF_FUNC_msg_pull_data:
                proto = &bpf_msg_pull_data_proto;
                break;
        case BPF_FUNC_msg_push_data:
                proto = &bpf_msg_push_data_proto;
                break;
        case BPF_FUNC_msg_pop_data:
                proto = &bpf_msg_pop_data_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_event_output_data_proto;
                break;
        case BPF_FUNC_sk_storage_get:
                proto = &bpf_sk_storage_get_proto;
                break;
        case BPF_FUNC_sk_storage_delete:
                proto = &bpf_sk_storage_delete_proto;
                break;
        case BPF_FUNC_get_netns_cookie:
                proto = &bpf_get_netns_cookie_sk_msg_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Weak definitions of the helper prototypes that sk_skb_func_proto()
 * returns for BPF_FUNC_sk_redirect_map and BPF_FUNC_sk_redirect_hash.
 * NOTE(review): presumably a strong definition is supplied elsewhere when
 * the sockmap code is built - confirm.
 */
const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;

/* Helper lookup behind sk_skb_func_proto(): translate @func_id into the
 * matching prototype. The socket lookup/release helpers are only compiled in
 * with CONFIG_INET; ids that are not listed fall through to
 * bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_store_bytes:
                proto = &bpf_skb_store_bytes_proto;
                break;
        case BPF_FUNC_skb_load_bytes:
                proto = &bpf_skb_load_bytes_proto;
                break;
        case BPF_FUNC_skb_pull_data:
                proto = &sk_skb_pull_data_proto;
                break;
        case BPF_FUNC_skb_change_tail:
                proto = &sk_skb_change_tail_proto;
                break;
        case BPF_FUNC_skb_change_head:
                proto = &sk_skb_change_head_proto;
                break;
        case BPF_FUNC_skb_adjust_room:
                proto = &sk_skb_adjust_room_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_cookie_proto;
                break;
        case BPF_FUNC_get_socket_uid:
                proto = &bpf_get_socket_uid_proto;
                break;
        case BPF_FUNC_sk_redirect_map:
                proto = &bpf_sk_redirect_map_proto;
                break;
        case BPF_FUNC_sk_redirect_hash:
                proto = &bpf_sk_redirect_hash_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_skb_event_output_proto;
                break;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                proto = &bpf_sk_lookup_tcp_proto;
                break;
        case BPF_FUNC_sk_lookup_udp:
                proto = &bpf_sk_lookup_udp_proto;
                break;
        case BPF_FUNC_sk_release:
                proto = &bpf_sk_release_proto;
                break;
        case BPF_FUNC_skc_lookup_tcp:
                proto = &bpf_skc_lookup_tcp_proto;
                break;
#endif
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Helper lookup behind flow_dissector_func_proto(): skb_load_bytes gets a
 * dedicated prototype, every other id is resolved by
 * bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_skb_load_bytes)
                return &bpf_flow_dissector_load_bytes_proto;

        return bpf_sk_base_func_proto(func_id, prog);
}

/* Helper lookup behind lwt_out_func_proto(). It is also the fallback used by
 * lwt_in_func_proto(), lwt_xmit_func_proto() and lwt_seg6local_func_proto()
 * for ids they do not handle themselves. Ids unknown here go to
 * bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_load_bytes:
                proto = &bpf_skb_load_bytes_proto;
                break;
        case BPF_FUNC_skb_pull_data:
                proto = &bpf_skb_pull_data_proto;
                break;
        case BPF_FUNC_csum_diff:
                proto = &bpf_csum_diff_proto;
                break;
        case BPF_FUNC_get_cgroup_classid:
                proto = &bpf_get_cgroup_classid_proto;
                break;
        case BPF_FUNC_get_route_realm:
                proto = &bpf_get_route_realm_proto;
                break;
        case BPF_FUNC_get_hash_recalc:
                proto = &bpf_get_hash_recalc_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_skb_event_output_proto;
                break;
        case BPF_FUNC_get_smp_processor_id:
                proto = &bpf_get_smp_processor_id_proto;
                break;
        case BPF_FUNC_skb_under_cgroup:
                proto = &bpf_skb_under_cgroup_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Helper lookup behind lwt_in_func_proto(): lwt_push_encap gets its own
 * prototype, every other id is resolved through lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_lwt_push_encap)
                return &bpf_lwt_in_push_encap_proto;

        return lwt_out_func_proto(func_id, prog);
}

/* Helper lookup behind lwt_xmit_func_proto(): the ids listed below are
 * handled here, every other id is resolved through lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_set_tunnel_key:
        case BPF_FUNC_skb_set_tunnel_opt:
                /* Both tunnel setters are resolved by the same function. */
                proto = bpf_get_skb_set_tunnel_proto(func_id);
                break;
        case BPF_FUNC_skb_get_tunnel_key:
                proto = &bpf_skb_get_tunnel_key_proto;
                break;
        case BPF_FUNC_skb_get_tunnel_opt:
                proto = &bpf_skb_get_tunnel_opt_proto;
                break;
        case BPF_FUNC_redirect:
                proto = &bpf_redirect_proto;
                break;
        case BPF_FUNC_clone_redirect:
                proto = &bpf_clone_redirect_proto;
                break;
        case BPF_FUNC_skb_change_tail:
                proto = &bpf_skb_change_tail_proto;
                break;
        case BPF_FUNC_skb_change_head:
                proto = &bpf_skb_change_head_proto;
                break;
        case BPF_FUNC_skb_store_bytes:
                proto = &bpf_skb_store_bytes_proto;
                break;
        case BPF_FUNC_csum_update:
                proto = &bpf_csum_update_proto;
                break;
        case BPF_FUNC_csum_level:
                proto = &bpf_csum_level_proto;
                break;
        case BPF_FUNC_l3_csum_replace:
                proto = &bpf_l3_csum_replace_proto;
                break;
        case BPF_FUNC_l4_csum_replace:
                proto = &bpf_l4_csum_replace_proto;
                break;
        case BPF_FUNC_set_hash_invalid:
                proto = &bpf_set_hash_invalid_proto;
                break;
        case BPF_FUNC_lwt_push_encap:
                proto = &bpf_lwt_xmit_push_encap_proto;
                break;
        default:
                proto = lwt_out_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Helper lookup behind lwt_seg6local_func_proto(): the three lwt_seg6
 * helpers are offered only when CONFIG_IPV6_SEG6_BPF is enabled; every other
 * id is resolved through lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        case BPF_FUNC_lwt_seg6_store_bytes:
                proto = &bpf_lwt_seg6_store_bytes_proto;
                break;
        case BPF_FUNC_lwt_seg6_action:
                proto = &bpf_lwt_seg6_action_proto;
                break;
        case BPF_FUNC_lwt_seg6_adjust_srh:
                proto = &bpf_lwt_seg6_adjust_srh_proto;
                break;
#endif
        default:
                proto = lwt_out_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Common validity check for a context access at byte offset @off of width
 * @size into struct __sk_buff.
 *
 * Returns false when the access is out of bounds, is not aligned to its own
 * size, or breaks one of the per-field rules in the switch below; returns
 * true otherwise. For the sk field, info->reg_type is set as a side effect.
 * The callers in this file apply their own additional restrictions before
 * delegating here.
 */
static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
                                    const struct bpf_prog *prog,
                                    struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct __sk_buff))
                return false;

        /* The verifier guarantees that size > 0. */
        if (off % size != 0)
                return false;

        switch (off) {
        case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                /* Any width is fine as long as it stays inside cb[]. */
                if (off + size > offsetofend(struct __sk_buff, cb[4]))
                        return false;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, data_end):
                /* Full 32-bit accesses only, and not when is_ldsx is set. */
                if (info->is_ldsx || size != size_default)
                        return false;
                break;
        case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
        case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
                /* Address fields: full 32-bit accesses only. */
                if (size != size_default)
                        return false;
                break;
        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
                /* flow_keys is never accessible through this function. */
                return false;
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                /* Read-only, full 64-bit accesses only. */
                if (type == BPF_WRITE || size != sizeof(__u64))
                        return false;
                break;
        case bpf_ctx_range(struct __sk_buff, tstamp):
                /* Full 64-bit accesses only. */
                if (size != sizeof(__u64))
                        return false;
                break;
        case bpf_ctx_range_ptr(struct __sk_buff, sk):
                /* Read-only 64-bit load; the result is typed as a socket
                 * pointer that may be NULL.
                 */
                if (type == BPF_WRITE || size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
                break;
        case offsetof(struct __sk_buff, tstamp_type):
                /* Rejected here; tc_cls_act_is_valid_access() handles this
                 * offset itself before delegating to this function.
                 */
                return false;
        case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
                /* Explicitly prohibit access to padding in __sk_buff. */
                return false;
        default:
                /* Only narrow read access allowed for now. */
                if (type == BPF_WRITE) {
                        /* Writes must cover the whole 32-bit field. */
                        if (size != size_default)
                                return false;
                } else {
                        bpf_ctx_record_field_size(info, size_default);
                        if (!bpf_ctx_narrow_access_ok(off, size, size_default))
                                return false;
                }
        }

        return true;
}

/* Context access check behind sk_filter_is_valid_access().
 *
 * On top of the common __sk_buff rules this rejects every store outside
 * cb[0]..cb[4] and every access, load or store, to the fields listed in the
 * second switch.
 */
static bool sk_filter_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        /* Stores are confined to the cb[] scratch area. */
        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                default:
                        return false;
                }
        }

        /* Fields that are hidden regardless of the access type. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, data_end):
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                return false;
        default:
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Context access check behind cg_skb_is_valid_access().
 *
 * Applies the restrictions below, in this order, and then defers to
 * bpf_skb_is_valid_access():
 *  - tc_classid, data_meta and wire_len are never accessible;
 *  - data and data_end require bpf_token_capable(..., CAP_BPF);
 *  - stores are limited to mark, priority, cb[0]..cb[4] and, with CAP_BPF,
 *    tstamp.
 * For data and data_end, info->reg_type is set to the packet pointer types.
 */
static bool cg_skb_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, wire_len):
                return false;
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_end):
                /* Packet pointers are gated on CAP_BPF. */
                if (!bpf_token_capable(prog->aux->token, CAP_BPF))
                        return false;
                break;
        }

        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                case bpf_ctx_range(struct __sk_buff, tstamp):
                        /* Writing tstamp is gated on CAP_BPF as well. */
                        if (!bpf_token_capable(prog->aux->token, CAP_BPF))
                                return false;
                        break;
                default:
                        return false;
                }
        }

        /* Reached only for accesses that passed the checks above. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Context access check behind lwt_is_valid_access().
 *
 * Stores are limited to mark, priority and cb[0]..cb[4]. A set of fields is
 * hidden for loads and stores alike. Loads of data and data_end are typed as
 * packet pointers through info->reg_type. Whatever passes these checks is
 * finally validated by bpf_skb_is_valid_access().
 */
static bool lwt_is_valid_access(int off, int size,
                                enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info)
{
        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                default:
                        return false;
                }
        }

        switch (off) {
        /* Hidden from this program type altogether. */
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                return false;
        /* Packet pointers get their register type recorded. */
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        default:
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Attach type specific accesses.
 *
 * Decide whether an access at @off into struct bpf_sock is permitted for
 * @attach_type:
 *  - bound_dev_if, mark, priority: read/write, but only for the
 *    INET_SOCK_CREATE and INET_SOCK_RELEASE attach types;
 *  - src_ip4: read-only, INET4_POST_BIND only;
 *  - src_ip6[]: read-only, INET6_POST_BIND only;
 *  - src_port: read-only, INET4_POST_BIND or INET6_POST_BIND only;
 *  - every other offset: read-only for any attach type.
 */
static bool __sock_filter_check_attach_type(int off,
                                            enum bpf_access_type access_type,
                                            enum bpf_attach_type attach_type)
{
        const bool is_read = access_type == BPF_READ;

        switch (off) {
        case offsetof(struct bpf_sock, bound_dev_if):
        case offsetof(struct bpf_sock, mark):
        case offsetof(struct bpf_sock, priority):
                return attach_type == BPF_CGROUP_INET_SOCK_CREATE ||
                       attach_type == BPF_CGROUP_INET_SOCK_RELEASE;
        case bpf_ctx_range(struct bpf_sock, src_ip4):
                return attach_type == BPF_CGROUP_INET4_POST_BIND && is_read;
        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
                return attach_type == BPF_CGROUP_INET6_POST_BIND && is_read;
        case bpf_ctx_range(struct bpf_sock, src_port):
                return (attach_type == BPF_CGROUP_INET4_POST_BIND ||
                        attach_type == BPF_CGROUP_INET6_POST_BIND) && is_read;
        default:
                return is_read;
        }
}

/* Like bpf_sock_is_valid_access(), except that every offset from the type
 * field through the priority field of struct bpf_sock is rejected outright.
 */
bool bpf_sock_common_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range_till(struct bpf_sock, type, priority):
                return false;
        }

        return bpf_sock_is_valid_access(off, size, type, info);
}

/* Check an access of @size bytes at offset @off into struct bpf_sock.
 *
 * The access has to lie inside the structure and be aligned to its own size.
 * The fields listed in the switch may be accessed with a narrower width, as
 * judged by bpf_ctx_narrow_access_ok(); the bytes between the end of
 * dst_port and dst_ip4 are never accessible; every remaining offset must be
 * accessed as a full 32-bit word.
 */
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                              struct bpf_insn_access_aux *info)
{
        const int word = sizeof(__u32);
        int width;

        if (off < 0 || off >= sizeof(struct bpf_sock) || off % size != 0)
                return false;

        switch (off) {
        case offsetofend(struct bpf_sock, dst_port) ...
             offsetof(struct bpf_sock, dst_ip4) - 1:
                return false;
        case bpf_ctx_range(struct bpf_sock, dst_port):
                /* A full-word access is checked as a 4-byte field, anything
                 * else against dst_port's own width.
                 */
                if (size == word)
                        width = word;
                else
                        width = sizeof_field(struct bpf_sock, dst_port);
                break;
        case offsetof(struct bpf_sock, state):
        case offsetof(struct bpf_sock, family):
        case offsetof(struct bpf_sock, type):
        case offsetof(struct bpf_sock, protocol):
        case offsetof(struct bpf_sock, src_port):
        case offsetof(struct bpf_sock, rx_queue_mapping):
        case bpf_ctx_range(struct bpf_sock, src_ip4):
        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
        case bpf_ctx_range(struct bpf_sock, dst_ip4):
        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
                width = word;
                break;
        default:
                return size == word;
        }

        bpf_ctx_record_field_size(info, width);
        return bpf_ctx_narrow_access_ok(off, size, width);
}

/* An access is valid when it passes the generic struct bpf_sock check and is
 * also permitted for the program's expected attach type. The attach type
 * check is only consulted once the generic check has succeeded.
 */
static bool sock_filter_is_valid_access(int off, int size,
                                        enum bpf_access_type type,
                                        const struct bpf_prog *prog,
                                        struct bpf_insn_access_aux *info)
{
        return bpf_sock_is_valid_access(off, size, type, info) &&
               __sock_filter_check_attach_type(off, type,
                                               prog->expected_attach_type);
}

/* Prologue generator that emits nothing: @insn_buf is left untouched and the
 * returned instruction count is always 0, whatever @direct_write and @prog
 * are.
 */
static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
                             const struct bpf_prog *prog)
{
        /* Neither direct read nor direct write requires any preliminary
         * action.
         */
        return 0;
}

/* Emit a prologue that un-clones the skb before a program that writes to
 * packet data directly gets to run.
 *
 * Nothing is emitted when @direct_write is false. Otherwise the generated
 * code tests the cloned bit and, when it is set, calls
 * bpf_skb_pull_data(skb, 0); if that call returns non-zero the program exits
 * with @drop_verdict. The program's original first instruction is appended
 * as the last instruction of the prologue.
 *
 * Returns the number of instructions written to @insn_buf (0 when no
 * prologue is needed).
 */
static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
                                const struct bpf_prog *prog, int drop_verdict)
{
        struct bpf_insn *insn = insn_buf;

        if (!direct_write)
                return 0;

        /* if (!skb->cloned)
         *       goto start;
         *
         * (Fast-path, otherwise approximation that we might be
         *  a clone, do the rest in helper.)
         */
        *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
        /* Skips the 7 instructions below, i.e. lands on "start". */
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);

        /* ret = bpf_skb_pull_data(skb, 0); */
        /* R6 keeps the context pointer across the call; R2 is zeroed. */
        *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
        *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
        *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
                               BPF_FUNC_skb_pull_data);
        /* if (!ret)
         *      goto restore;
         * return drop_verdict;
         */
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
        *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
        *insn++ = BPF_EXIT_INSN();

        /* restore: */
        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
        /* start: */
        *insn++ = prog->insnsi[0];

        return insn - insn_buf;
}

/* Expand the load instruction @orig (BPF_IND mode or not) into a call to one
 * of the bpf_skb_load_helper_*_no_cache() helpers.
 *
 * The generated sequence puts the offset in R2 (the immediate, or the source
 * register plus the immediate in BPF_IND mode), the context in R1, calls the
 * helper matching the access size, and exits the program with R0 = 0 when
 * the helper's result is negative.
 *
 * Returns the number of instructions written to @insn_buf.
 */
static int bpf_gen_ld_abs(const struct bpf_insn *orig,
                          struct bpf_insn *insn_buf)
{
        bool indirect = BPF_MODE(orig->code) == BPF_IND;
        struct bpf_insn *insn = insn_buf;

        if (!indirect) {
                *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
        } else {
                *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
                /* The add is only emitted for a non-zero immediate. */
                if (orig->imm)
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
        }
        /* We're guaranteed here that CTX is in R6. */
        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);

        /* Only B, H and W sizes emit a call; any other size emits none. */
        switch (BPF_SIZE(orig->code)) {
        case BPF_B:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
                break;
        case BPF_H:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
                break;
        case BPF_W:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
                break;
        }

        /* if (R0 >= 0) (signed compare)
         *      skip the next two instructions;
         * R0 = 0;
         * exit;
         */
        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
        *insn++ = BPF_EXIT_INSN();

        return insn - insn_buf;
}

/* Prologue generator that wraps bpf_unclone_prologue() with TC_ACT_SHOT as
 * the verdict returned when un-cloning the skb fails.
 */
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
                               const struct bpf_prog *prog)
{
        return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
}

/* Context access check behind tc_cls_act_is_valid_access().
 *
 * Stores are limited to mark, tc_index, priority, tc_classid, cb[0]..cb[4],
 * tstamp and queue_mapping. Loads of data, data_meta and data_end are typed
 * as packet pointers through info->reg_type; the family..local_port range is
 * hidden; tstamp_type may be read as a single byte. Everything else is
 * decided by bpf_skb_is_valid_access().
 */
static bool tc_cls_act_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, tc_index):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range(struct __sk_buff, tc_classid):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                case bpf_ctx_range(struct __sk_buff, tstamp):
                case bpf_ctx_range(struct __sk_buff, queue_mapping):
                        break;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_meta):
                info->reg_type = PTR_TO_PACKET_META;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
                return false;
        case offsetof(struct __sk_buff, tstamp_type):
                /* The convert_ctx_access() on reading and writing
                 * __sk_buff->tstamp depends on whether the bpf prog
                 * has used __sk_buff->tstamp_type or not.
                 * Thus, we need to set prog->tstamp_type_access
                 * earlier during is_valid_access() here.
                 */
                /* Only loads get here: tstamp_type is not in the store
                 * whitelist above. The flag is set before the size check,
                 * so it is recorded even if the access is then rejected.
                 */
                ((struct bpf_prog *)prog)->tstamp_type_access = 1;
                return size == sizeof(__u8);
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Mutex held by tc_cls_act_btf_struct_access() and xdp_btf_struct_access()
 * while they test and call nfct_btf_struct_access.
 */
DEFINE_MUTEX(nf_conn_btf_access_lock);
EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);

/* Optional BTF struct access hook; the users in this file return -EACCES
 * while it is NULL. NOTE(review): presumably installed and cleared by the
 * conntrack BPF code under nf_conn_btf_access_lock - confirm.
 */
int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
                              const struct bpf_reg_state *reg,
                              int off, int size);
EXPORT_SYMBOL_GPL(nfct_btf_struct_access);

/* Forward a BTF struct access check to the nfct_btf_struct_access hook.
 *
 * The hook is tested and called with nf_conn_btf_access_lock held. Returns
 * the hook's result, or -EACCES when no hook is installed.
 */
static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
                                        const struct bpf_reg_state *reg,
                                        int off, int size)
{
        int err;

        mutex_lock(&nf_conn_btf_access_lock);
        err = nfct_btf_struct_access ?
              nfct_btf_struct_access(log, reg, off, size) : -EACCES;
        mutex_unlock(&nf_conn_btf_access_lock);

        return err;
}

/* Basic shape check for an access into struct xdp_md: it has to start inside
 * the structure, be aligned to its own size and be exactly 32 bits wide.
 */
static bool __is_valid_xdp_access(int off, int size)
{
        return off >= 0 &&
               off < sizeof(struct xdp_md) &&
               off % size == 0 &&
               size == sizeof(__u32);
}

/* Context access check for struct xdp_md.
 *
 *  - egress_ifindex is only visible when the expected attach type is
 *    BPF_XDP_DEVMAP;
 *  - the only store that can be accepted is to rx_queue_index, and only for
 *    programs for which bpf_prog_is_offloaded() is true;
 *  - loads of data, data_meta and data_end are refused when info->is_ldsx is
 *    set and are otherwise typed as packet pointers through info->reg_type.
 * Accesses that get past these rules must still satisfy
 * __is_valid_xdp_access().
 */
static bool xdp_is_valid_access(int off, int size,
                                enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info)
{
        if (off == offsetof(struct xdp_md, egress_ifindex) &&
            prog->expected_attach_type != BPF_XDP_DEVMAP)
                return false;

        if (type == BPF_WRITE) {
                if (!bpf_prog_is_offloaded(prog->aux))
                        return false;
                if (off != offsetof(struct xdp_md, rx_queue_index))
                        return false;
                return __is_valid_xdp_access(off, size);
        }

        switch (off) {
        case offsetof(struct xdp_md, data):
                if (info->is_ldsx)
                        return false;
                info->reg_type = PTR_TO_PACKET;
                break;
        case offsetof(struct xdp_md, data_meta):
                if (info->is_ldsx)
                        return false;
                info->reg_type = PTR_TO_PACKET_META;
                break;
        case offsetof(struct xdp_md, data_end):
                if (info->is_ldsx)
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                break;
        default:
                break;
        }

        return __is_valid_xdp_access(off, size);
}

/* Emit a one-time warning about an XDP return value that will not be acted
 * upon.
 *
 * Values above XDP_REDIRECT are reported as "Illegal", all others as
 * "Driver unsupported". @dev may be NULL, in which case the device name is
 * printed as "N/A".
 */
void bpf_warn_invalid_xdp_action(const struct net_device *dev,
                                 const struct bpf_prog *prog, u32 act)
{
        const char *verdict = act > (u32)XDP_REDIRECT ?
                              "Illegal" : "Driver unsupported";
        const char *dev_name = dev ? dev->name : "N/A";

        pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n",
                     verdict, act, prog->aux->name, prog->aux->id, dev_name);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);

/* Forward a BTF struct access check to the nfct_btf_struct_access hook.
 *
 * The hook pointer is read and invoked with nf_conn_btf_access_lock held.
 * Returns the hook's result, or -EACCES when no hook is installed.
 */
static int xdp_btf_struct_access(struct bpf_verifier_log *log,
                                 const struct bpf_reg_state *reg,
                                 int off, int size)
{
        int err;

        mutex_lock(&nf_conn_btf_access_lock);
        err = nfct_btf_struct_access ?
              nfct_btf_struct_access(log, reg, off, size) : -EACCES;
        mutex_unlock(&nf_conn_btf_access_lock);

        return err;
}

/* Decide whether a program may access struct bpf_sock_addr at @off/@size.
 *
 * @off:  byte offset into the context
 * @size: access width in bytes (callers are expected to pass size > 0, since
 *        it is used as a divisor)
 * @type: BPF_READ or BPF_WRITE
 * @prog: program being verified; its expected_attach_type restricts which
 *        address fields are reachable
 * @info: receives the recorded field size and, for the sk field, the
 *        resulting register type
 *
 * Returns true when the access is permitted.
 */
static bool sock_addr_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);
        const bool is_read = type == BPF_READ;

        /* Must lie inside the context and be aligned to its own width. */
        if (off < 0 || off >= sizeof(struct bpf_sock_addr) || off % size != 0)
                return false;

        /* Disallow access to fields not belonging to the attach type's address
         * family.
         */
        switch (off) {
        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
                /* The source address is only exposed to UDP4 sendmsg. */
                if (prog->expected_attach_type != BPF_CGROUP_UDP4_SENDMSG)
                        return false;
                break;
        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
                /* The source address is only exposed to UDP6 sendmsg. */
                if (prog->expected_attach_type != BPF_CGROUP_UDP6_SENDMSG)
                        return false;
                break;
        }

        /* Per-field width and direction rules. */
        switch (off) {
        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
        case bpf_ctx_range(struct bpf_sock_addr, user_port):
                if (is_read)
                        bpf_ctx_record_field_size(info, size_default);

                /* A wide access into either IPv6 array is accepted as is,
                 * for reads and writes alike.
                 */
                if (bpf_ctx_wide_access_ok(off, size,
                                           struct bpf_sock_addr,
                                           user_ip6) ||
                    bpf_ctx_wide_access_ok(off, size,
                                           struct bpf_sock_addr,
                                           msg_src_ip6))
                        return true;

                /* Reads may be narrow; writes must be the default width. */
                if (is_read)
                        return bpf_ctx_narrow_access_ok(off, size,
                                                        size_default);
                return size == size_default;
        case bpf_ctx_range_ptr(struct bpf_sock_addr, sk):
                if (!is_read || size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                return true;
        case bpf_ctx_range(struct bpf_sock_addr, user_family):
        case bpf_ctx_range(struct bpf_sock_addr, family):
        case bpf_ctx_range(struct bpf_sock_addr, type):
        case bpf_ctx_range(struct bpf_sock_addr, protocol):
                /* Read-only, full-width fields. */
                return is_read && size == size_default;
        default:
                return false;
        }
}

/* Decide whether a program may access struct bpf_sock_ops at @off/@size.
 *
 * @off:  byte offset into the context
 * @size: access width in bytes
 * @type: BPF_READ or BPF_WRITE
 * @prog: program being verified (not consulted here)
 * @info: receives the register type for pointer fields and the recorded
 *        field size for skb_tcp_flags
 *
 * Returns true when the access is permitted.
 */
static bool sock_ops_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     const struct bpf_prog *prog,
                                     struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        /* The verifier guarantees that size > 0. */
        if (off < 0 || off >= sizeof(struct bpf_sock_ops) || off % size != 0)
                return false;

        if (type == BPF_WRITE) {
                /* Only reply and sk_txhash are writable, at default width. */
                switch (off) {
                case offsetof(struct bpf_sock_ops, reply):
                case offsetof(struct bpf_sock_ops, sk_txhash):
                        return size == size_default;
                default:
                        return false;
                }
        }

        /* Read path. */
        switch (off) {
        case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
                                bytes_acked):
        case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp):
                /* 64-bit scalars: full-width loads only. */
                return size == sizeof(__u64);
        case bpf_ctx_range_ptr(struct bpf_sock_ops, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET_OR_NULL;
                return true;
        case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET;
                return true;
        case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                return true;
        case offsetof(struct bpf_sock_ops, skb_tcp_flags):
                /* The one field here that allows narrow loads. */
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);
        default:
                return size == size_default;
        }
}

/* Prologue generator for sk_skb programs: delegates to
 * bpf_unclone_prologue(), passing SK_DROP as its final (verdict) argument,
 * and returns whatever that helper returns.
 */
static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
                           const struct bpf_prog *prog)
{
        const int drop_verdict = SK_DROP;

        return bpf_unclone_prologue(insn_buf, direct_write, prog,
                                    drop_verdict);
}

/* Decide whether an sk_skb program may access struct __sk_buff at @off/@size.
 *
 * A set of fields is rejected outright, writes are limited to tc_index and
 * priority, data/data_end are tagged as packet pointers, and everything that
 * survives is handed to bpf_skb_is_valid_access() for the final decision.
 */
static bool sk_skb_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        /* Fields that are off limits regardless of access direction. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
        case bpf_ctx_range(struct __sk_buff, mark):
                return false;
        }

        /* Stores are accepted for tc_index and priority only. */
        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, tc_index):
                case bpf_ctx_range(struct __sk_buff, priority):
                        break;
                default:
                        return false;
                }
        }

        /* Tag the packet pointers before the generic skb check runs. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Decide whether an sk_msg program may access struct sk_msg_md at @off/@size.
 *
 * The context is read-only. Pointer fields (data, data_end, sk) require
 * 8-byte loads and set info->reg_type; the listed scalar fields require
 * 4-byte loads; any other offset is rejected.
 * @size is used as a divisor, so callers are expected to pass size > 0.
 */
static bool sk_msg_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        if (type == BPF_WRITE || off % size != 0)
                return false;

        switch (off) {
        case bpf_ctx_range_ptr(struct sk_msg_md, data):
                /* reg_type is assigned before the width is checked. */
                info->reg_type = PTR_TO_PACKET;
                return size == sizeof(__u64);
        case bpf_ctx_range_ptr(struct sk_msg_md, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                return size == sizeof(__u64);
        case bpf_ctx_range_ptr(struct sk_msg_md, sk):
                /* Here reg_type is assigned only once the width is valid. */
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                return true;
        case bpf_ctx_range(struct sk_msg_md, family):
        case bpf_ctx_range(struct sk_msg_md, remote_ip4):
        case bpf_ctx_range(struct sk_msg_md, local_ip4):
        case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range(struct sk_msg_md, remote_port):
        case bpf_ctx_range(struct sk_msg_md, local_port):
        case bpf_ctx_range(struct sk_msg_md, size):
                return size == sizeof(__u32);
        default:
                return false;
        }
}

/* Decide whether a flow dissector program may access struct __sk_buff at
 * @off/@size.
 *
 * Only reads of data, data_end and flow_keys are accepted. data and data_end
 * must be 4-byte loads that are not flagged is_ldsx; flow_keys must be an
 * 8-byte load. Each accepted access sets info->reg_type.
 * @size is used as a divisor, so callers are expected to pass size > 0.
 */
static bool flow_dissector_is_valid_access(int off, int size,
                                           enum bpf_access_type type,
                                           const struct bpf_prog *prog,
                                           struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct __sk_buff) || off % size != 0 ||
            type == BPF_WRITE)
                return false;

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                if (!info->is_ldsx && size == size_default) {
                        info->reg_type = PTR_TO_PACKET;
                        return true;
                }
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                if (!info->is_ldsx && size == size_default) {
                        info->reg_type = PTR_TO_PACKET_END;
                        return true;
                }
                break;
        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
                if (size == sizeof(__u64)) {
                        info->reg_type = PTR_TO_FLOW_KEYS;
                        return true;
                }
                break;
        }

        return false;
}

/* Rewrite a flow dissector context load into a load from the matching field
 * of struct bpf_flow_dissector.
 *
 * @type:        access direction (not consulted here)
 * @si:          original instruction; supplies the context offset and the
 *               destination/source registers
 * @insn_buf:    output buffer for the replacement instruction
 * @prog:        program being rewritten (not consulted here)
 * @target_size: not written by this function
 *
 * Returns the number of instructions emitted: 1 for data, data_end and
 * flow_keys, 0 for any other offset.
 */
static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
                                             const struct bpf_insn *si,
                                             struct bpf_insn *insn_buf,
                                             struct bpf_prog *prog,
                                             u32 *target_size)
{
        int ld_size;
        int ld_off;

        /* Select the width and offset of the backing field. */
        switch (si->off) {
        case offsetof(struct __sk_buff, data):
                ld_size = BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data);
                ld_off = offsetof(struct bpf_flow_dissector, data);
                break;
        case offsetof(struct __sk_buff, data_end):
                ld_size = BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end);
                ld_off = offsetof(struct bpf_flow_dissector, data_end);
                break;
        case offsetof(struct __sk_buff, flow_keys):
                ld_size = BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys);
                ld_off = offsetof(struct bpf_flow_dissector, flow_keys);
                break;
        default:
                /* No rewrite for any other offset. */
                return 0;
        }

        insn_buf[0] = BPF_LDX_MEM(ld_size, si->dst_reg, si->src_reg, ld_off);

        return 1;
}

/* Emit instructions that load the skb timestamp-type bits into si->dst_reg.
 *
 * The byte at SKB_BF_MONO_TC_OFFSET is loaded from the skb (si->src_reg) and
 * masked with SKB_TSTAMP_TYPE_MASK; on big-endian bitfield layouts the result
 * is additionally shifted right by SKB_TSTAMP_TYPE_RSHIFT.
 *
 * Returns the slot following the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
                                                     struct bpf_insn *insn)
{
        const __u8 dst = si->dst_reg;
        const __u8 skb = si->src_reg;

        /* The kernel-internal and BPF-visible clock ids must stay in sync. */
        BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI);
        BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME);
        BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC);
        BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI);

        insn[0] = BPF_LDX_MEM(BPF_B, dst, skb, SKB_BF_MONO_TC_OFFSET);
        insn[1] = BPF_ALU32_IMM(BPF_AND, dst, SKB_TSTAMP_TYPE_MASK);
        insn += 2;
#ifdef __BIG_ENDIAN_BITFIELD
        *insn++ = BPF_ALU32_IMM(BPF_RSH, dst, SKB_TSTAMP_TYPE_RSHIFT);
#else
        /* No shift is emitted here, so the mask must include bit 0. */
        BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
#endif

        return insn;
}

/* Emit instructions computing dst_reg = skb_shinfo(skb), where the skb
 * pointer is held in skb_reg.
 *
 * With NET_SKBUFF_DATA_USES_OFFSET, skb->end is loaded into BPF_REG_AX,
 * skb->head into dst_reg, and the two are added. Otherwise skb->end is
 * loaded straight into dst_reg.
 *
 * Returns the slot following the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg,
                                                  struct bpf_insn *insn)
{
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        insn[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
                              BPF_REG_AX, skb_reg,
                              offsetof(struct sk_buff, end));
        insn[1] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
                              dst_reg, skb_reg,
                              offsetof(struct sk_buff, head));
        insn[2] = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);

        return insn + 3;
#else
        insn[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
                              dst_reg, skb_reg,
                              offsetof(struct sk_buff, end));

        return insn + 1;
#endif
}

/* Emit the instruction sequence that implements a read of __sk_buff->tstamp.
 *
 * @prog: program being rewritten; its tstamp_type_access flag selects which
 *        sequence is emitted
 * @si:   original load; dst_reg receives the value, src_reg holds the skb
 * @insn: next free slot in the output buffer
 *
 * Returns the slot following the last emitted instruction.
 *
 * With CONFIG_NET_XGRESS and !prog->tstamp_type_access the emitted code is
 * (indices relative to the first emitted instruction, jump targets already
 * resolved):
 *
 *   0: AX  = *(u8 *)(skb + SKB_BF_MONO_TC_OFFSET)
 *   1: if (AX & TC_AT_INGRESS_MASK) goto 3
 *   2: goto 7
 *   3: if (AX & SKB_TSTAMP_TYPE_MASK) goto 5
 *   4: goto 7
 *   5: dst = 0
 *   6: goto 8
 *   7: dst = skb->tstamp
 *   8: (first instruction after this sequence)
 *
 * In every other configuration only instruction 7 is emitted.
 */
static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
                                                const struct bpf_insn *si,
                                                struct bpf_insn *insn)
{
        __u8 value_reg = si->dst_reg;
        __u8 skb_reg = si->src_reg;

#ifdef CONFIG_NET_XGRESS
        /* If the tstamp_type is read,
         * the bpf prog is aware the tstamp could have delivery time.
         * Thus, read skb->tstamp as is if tstamp_type_access is true.
         */
        if (!prog->tstamp_type_access) {
                /* AX is needed because src_reg and dst_reg could be the same */
                __u8 tmp_reg = BPF_REG_AX;

                *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
                /* check if ingress mask bits is set */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
                /* not at ingress: skip ahead to the plain tstamp load */
                *insn++ = BPF_JMP_A(4);
                /* at ingress: check whether any tstamp_type bit is set */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
                /* no tstamp_type bit set: go to the plain tstamp load */
                *insn++ = BPF_JMP_A(2);
                /* skb->tc_at_ingress && skb->tstamp_type,
                 * read 0 as the (rcv) timestamp.
                 */
                *insn++ = BPF_MOV64_IMM(value_reg, 0);
                /* jump over the tstamp load emitted below */
                *insn++ = BPF_JMP_A(1);
        }
#endif

        *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
                              offsetof(struct sk_buff, tstamp));
        return insn;
}

/* Emit the instruction sequence that implements a write to
 * __sk_buff->tstamp.
 *
 * @prog: program being rewritten; its tstamp_type_access flag selects which
 *        sequence is emitted
 * @si:   original store; dst_reg holds the skb, src_reg the value register.
 *        The instruction class and immediate of @si are carried over into
 *        the final store.
 * @insn: next free slot in the output buffer
 *
 * Returns the slot following the last emitted instruction.
 *
 * With CONFIG_NET_XGRESS and !prog->tstamp_type_access the emitted code is
 * (indices relative to the first emitted instruction, jump targets already
 * resolved):
 *
 *   0: AX = *(u8 *)(skb + SKB_BF_MONO_TC_OFFSET)
 *   1: if (AX & TC_AT_INGRESS_MASK) goto 3
 *   2: goto 5
 *   3: AX &= ~SKB_TSTAMP_TYPE_MASK
 *   4: *(u8 *)(skb + SKB_BF_MONO_TC_OFFSET) = AX
 *   5: skb->tstamp = value
 *
 * In every other configuration only instruction 5 is emitted.
 */
static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
                                                 const struct bpf_insn *si,
                                                 struct bpf_insn *insn)
{
        __u8 value_reg = si->src_reg;
        __u8 skb_reg = si->dst_reg;

#ifdef CONFIG_NET_XGRESS
        /* If the tstamp_type is read,
         * the bpf prog is aware the tstamp could have delivery time.
         * Thus, write skb->tstamp as is if tstamp_type_access is true.
         * Otherwise, writing at ingress will have to clear the
         * skb->tstamp_type bit also.
         */
        if (!prog->tstamp_type_access) {
                /* AX serves as scratch for the flag byte */
                __u8 tmp_reg = BPF_REG_AX;

                *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
                /* Writing __sk_buff->tstamp as ingress, goto <clear> */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
                /* goto <store> */
                *insn++ = BPF_JMP_A(2);
                /* <clear>: skb->tstamp_type */
                *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
                /* write the flag byte back with the tstamp_type bits cleared */
                *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
        }
#endif

        /* <store>: skb->tstamp = tstamp */
        *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
                               skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
        return insn;
}

/* Build a store instruction of width @size to offset @off that keeps the
 * instruction class, destination register, source register and immediate
 * of the original instruction @si.
 */
#define BPF_EMIT_STORE(size, si, off)                                        \
        BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM,                \
                     (si)->dst_reg, (si)->src_reg, (off), (si)->imm)

static u32 bpf_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                                  struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        case offsetof(struct __sk_buff, len):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, len, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, protocol):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, protocol, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, vlan_proto):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, vlan_proto, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, priority):
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 bpf_target_off(struct sk_buff, priority, 4,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, priority, 4,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, ingress_ifindex):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, skb_iif, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, ifindex):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, dev));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct net_device, ifindex, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, hash):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, hash, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, mark):
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 bpf_target_off(struct sk_buff, mark, 4,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, mark, 4,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, pkt_type):
                *target_size = 1;
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
                                      PKT_TYPE_OFFSET);
                *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
                break;

        case offsetof(struct __sk_buff, queue_mapping):
                if (type == BPF_WRITE) {
                        u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);

                        if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
                                *insn++ = BPF_JMP_A(0); /* noop */
                                break;
                        }

                        if (BPF_CLASS(si->code) == BPF_STX)
                                *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
                        *insn++ = BPF_EMIT_STORE(BPF_H, si, offset);
                } else {
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff,
                                                             queue_mapping,
                                                             2, target_size));
                }
                break;

        case offsetof(struct __sk_buff, vlan_present):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff,
                                                     vlan_all, 4, target_size));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
                break;

        case offsetof(struct __sk_buff, vlan_tci):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, vlan_tci, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, cb[0]) ...
             offsetofend(struct __sk_buff, cb[4]) - 1:
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                              offsetof(struct qdisc_skb_cb, data)) %
                             sizeof(__u64));

                prog->cb_access = 1;
                off  = si->off;
                off -= offsetof(struct __sk_buff, cb[0]);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, data);
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                              si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, tc_classid):
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);

                off  = si->off;
                off -= offsetof(struct __sk_buff, tc_classid);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, tc_classid);
                *target_size = 2;
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
                                              si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, data));
                break;

        case offsetof(struct __sk_buff, data_meta):
                off  = si->off;
                off -= offsetof(struct __sk_buff, data_meta);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct bpf_skb_data_end, data_meta);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, data_end):
                off  = si->off;
                off -= offsetof(struct __sk_buff, data_end);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct bpf_skb_data_end, data_end);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_H, si,
                                                 bpf_target_off(struct sk_buff, tc_index, 2,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, tc_index, 2,
                                                             target_size));
#else
                *target_size = 2;
                if (type == BPF_WRITE)
                        *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
                else
                        *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct __sk_buff, napi_id):
#if defined(CONFIG_NET_RX_BUSY_POLL)
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, napi_id, 4,
                                                     target_size));
                *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
                *target_size = 4;
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
                break;
        case offsetof(struct __sk_buff, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_family,
                                                     2, target_size));
                break;
        case offsetof(struct __sk_buff, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_daddr,
                                                     4, target_size));
                break;
        case offsetof(struct __sk_buff, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_rcv_saddr,
                                                     4, target_size));
                break;
        case offsetof(struct __sk_buff, remote_ip6[0]) ...
             offsetof(struct __sk_buff, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct __sk_buff, remote_ip6[0]);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        case offsetof(struct __sk_buff, local_ip6[0]) ...
             offsetof(struct __sk_buff, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct __sk_buff, local_ip6[0]);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct __sk_buff, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_dport,
                                                     2, target_size));
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct __sk_buff, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_num, 2, target_size));
                break;

        case offsetof(struct __sk_buff, tstamp):
                BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);

                if (type == BPF_WRITE)
                        insn = bpf_convert_tstamp_write(prog, si, insn);
                else
                        insn = bpf_convert_tstamp_read(prog, si, insn);
                break;

        case offsetof(struct __sk_buff, tstamp_type):
                insn = bpf_convert_tstamp_type_read(si, insn);
                break;

        case offsetof(struct __sk_buff, gso_segs):
                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     gso_segs, 2,
                                                     target_size));
                break;
        case offsetof(struct __sk_buff, gso_size):
                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     gso_size, 2,
                                                     target_size));
                break;
        case offsetof(struct __sk_buff, wire_len):
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);

                off = si->off;
                off -= offsetof(struct __sk_buff, wire_len);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, pkt_len);
                *target_size = 4;
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, sk):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                break;
        case offsetof(struct __sk_buff, hwtstamp):
                BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
                BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);

                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_DW,
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     hwtstamps, 8,
                                                     target_size));
                break;
        }

        return insn - insn_buf;
}

/* Rewrite one access to a struct bpf_sock field into instructions that
 * operate on the underlying struct sock / struct sock_common.
 *
 * @type:        BPF_WRITE selects a store for the fields that support one
 *               here (bound_dev_if, mark, priority); any other value emits
 *               a load.
 * @si:          the original instruction; si->off selects the field and
 *               si->dst_reg / si->src_reg are reused by the emitted code.
 * @insn_buf:    output buffer receiving the replacement instructions.
 * @prog:        not used by this function.
 * @target_size: handed to bpf_target_off() for the cases that use it, and
 *               written directly where a constant is synthesized instead of
 *               loading a kernel field.
 *
 * Returns the number of instructions written to @insn_buf, which is 0 when
 * si->off matches none of the cases below.
 */
u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
                                const struct bpf_insn *si,
                                struct bpf_insn *insn_buf,
                                struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        /* The first three fields are read/write and map 1:1 onto a 4-byte
         * member of struct sock.
         */
        case offsetof(struct bpf_sock, bound_dev_if):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_bound_dev_if));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_bound_dev_if));
                break;

        case offsetof(struct bpf_sock, mark):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_mark));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_mark));
                break;

        case offsetof(struct bpf_sock, priority):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_priority));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_priority));
                break;

        /* The following fields are loaded with the width of the kernel
         * member they are backed by.
         */
        case offsetof(struct bpf_sock, family):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_family),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common,
                                       skc_family,
                                       sizeof_field(struct sock_common,
                                                    skc_family),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, type):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_type),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_type,
                                       sizeof_field(struct sock, sk_type),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, protocol):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_protocol),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_protocol,
                                       sizeof_field(struct sock, sk_protocol),
                                       target_size));
                break;

        /* Address fields keep the access size of the original instruction
         * (BPF_SIZE(si->code)).
         */
        case offsetof(struct bpf_sock, src_ip4):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_rcv_saddr,
                                       sizeof_field(struct sock_common,
                                                    skc_rcv_saddr),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, dst_ip4):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_daddr,
                                       sizeof_field(struct sock_common,
                                                    skc_daddr),
                                       target_size));
                break;

        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                /* Byte offset of the access within the src_ip6[] array. */
                off = si->off;
                off -= offsetof(struct bpf_sock, src_ip6[0]);
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(
                                struct sock_common,
                                skc_v6_rcv_saddr.s6_addr32[0],
                                sizeof_field(struct sock_common,
                                             skc_v6_rcv_saddr.s6_addr32[0]),
                                target_size) + off);
#else
                /* Without IPv6 the address reads as zero.  Report the 4-byte
                 * width of the synthesized value, exactly as the dst_ip6
                 * fallback in the next case does.
                 */
                (void)off;
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                *target_size = 4;
#endif
                break;

        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                /* Byte offset of the access within the dst_ip6[] array. */
                off = si->off;
                off -= offsetof(struct bpf_sock, dst_ip6[0]);
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common,
                                       skc_v6_daddr.s6_addr32[0],
                                       sizeof_field(struct sock_common,
                                                    skc_v6_daddr.s6_addr32[0]),
                                       target_size) + off);
#else
                /* Without IPv6 the address reads as zero. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                *target_size = 4;
#endif
                break;

        case offsetof(struct bpf_sock, src_port):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_num),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_num,
                                       sizeof_field(struct sock_common,
                                                    skc_num),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, dst_port):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_dport,
                                       sizeof_field(struct sock_common,
                                                    skc_dport),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, state):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_state),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_state,
                                       sizeof_field(struct sock_common,
                                                    skc_state),
                                       target_size));
                break;
        case offsetof(struct bpf_sock, rx_queue_mapping):
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_rx_queue_mapping,
                                       sizeof_field(struct sock,
                                                    sk_rx_queue_mapping),
                                       target_size));
                /* Keep the loaded value unless it is NO_QUEUE_MAPPING; in
                 * that case fall into the next instruction and return -1.
                 */
                *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
                                      1);
                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
#else
                /* No per-socket rx queue mapping in this build: always -1. */
                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
                *target_size = 2;
#endif
                break;
        }

        return insn - insn_buf;
}

/* Context-access rewriter that special-cases __sk_buff::ifindex and hands
 * every other offset to bpf_convert_ctx_access().
 *
 * For ifindex two loads are emitted: skb->dev into si->dst_reg, then
 * dev->ifindex through that pointer (no NULL test is generated here).
 *
 * Returns the number of instructions placed in @insn_buf, or whatever
 * bpf_convert_ctx_access() returns for the delegated offsets.
 */
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
                                         const struct bpf_insn *si,
                                         struct bpf_insn *insn_buf,
                                         struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *out = insn_buf;

        /* Anything but ifindex goes through the generic converter. */
        if (si->off != (int)offsetof(struct __sk_buff, ifindex))
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
                                              target_size);

        /* dst = skb->dev */
        *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                             si->dst_reg, si->src_reg,
                             offsetof(struct sk_buff, dev));
        /* dst = dev->ifindex */
        *out++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                             bpf_target_off(struct net_device, ifindex, 4,
                                            target_size));

        return out - insn_buf;
}

/* Rewrite an access to a struct xdp_md field into loads from the matching
 * struct xdp_buff member, following the rxq/txq pointers where the value
 * lives one or two dereferences away.
 *
 * The result always ends up in si->dst_reg; si->src_reg holds the context
 * pointer.  @type, @prog and @target_size are not used here.
 *
 * Returns the number of instructions written to @insn_buf (0 when si->off
 * matches no case).
 */
static u32 xdp_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                                  struct bpf_prog *prog, u32 *target_size)
{
        const int dst = si->dst_reg;
        const int ctx = si->src_reg;
        struct bpf_insn *out = insn_buf;

        switch (si->off) {
        /* Plain pointer members of xdp_buff: a single load each. */
        case offsetof(struct xdp_md, data):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, data));
                break;
        case offsetof(struct xdp_md, data_meta):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, data_meta));
                break;
        case offsetof(struct xdp_md, data_end):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, data_end));
                break;

        case offsetof(struct xdp_md, ingress_ifindex):
                /* dst = xdp->rxq */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, rxq));
                /* dst = rxq->dev */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
                                     dst, dst,
                                     offsetof(struct xdp_rxq_info, dev));
                /* dst = dev->ifindex */
                *out++ = BPF_LDX_MEM(BPF_W, dst, dst,
                                     offsetof(struct net_device, ifindex));
                break;

        case offsetof(struct xdp_md, rx_queue_index):
                /* dst = xdp->rxq */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, rxq));
                /* dst = rxq->queue_index */
                *out++ = BPF_LDX_MEM(BPF_W, dst, dst,
                                     offsetof(struct xdp_rxq_info,
                                              queue_index));
                break;

        case offsetof(struct xdp_md, egress_ifindex):
                /* dst = xdp->txq */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
                                     dst, ctx,
                                     offsetof(struct xdp_buff, txq));
                /* dst = txq->dev */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
                                     dst, dst,
                                     offsetof(struct xdp_txq_info, dev));
                /* dst = dev->ifindex */
                *out++ = BPF_LDX_MEM(BPF_W, dst, dst,
                                     offsetof(struct net_device, ifindex));
                break;
        }

        return out - insn_buf;
}

/* SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() loads Nested Field S.F.NF where S is
 * the type of the context Structure, F is the Field in the context structure
 * that contains a pointer to the Nested Structure of type NS, and NF is the
 * field of NS to read.
 *
 * Two instructions are emitted: the first loads the pointer S.F from the
 * context (si->src_reg) into si->dst_reg, the second loads NF through that
 * pointer, again into si->dst_reg.
 *
 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to the caller to
 * make sure that SIZE is not greater than the actual size of S.F.NF.
 *
 * OFF is added to the offset of NF, so the load happens from that offset
 * relative to NF; pass 0 to read from the start of NF.
 *
 * The macro expands in the caller's scope and relies on its 'insn', 'si' and
 * 'target_size' variables.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)               \
        do {                                                                       \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,     \
                                      si->src_reg, offsetof(S, F));               \
                *insn++ = BPF_LDX_MEM(                                               \
                        SIZE, si->dst_reg, si->dst_reg,                               \
                        bpf_target_off(NS, NF, sizeof_field(NS, NF),               \
                                       target_size)                               \
                                + OFF);                                               \
        } while (0)

/* Load the nested field S.F.NF in full: expands to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() with the load size derived from the
 * field itself (BPF_FIELD_SIZEOF(NS, NF)) and an extra offset of 0.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)                               \
        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,                       \
                                             BPF_FIELD_SIZEOF(NS, NF), 0)

/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for a store operation.
 *
 * In addition it uses Temporary Field TF (a member of struct S) as a spill
 * slot for a 3rd "register", since the two registers available in
 * convert_ctx_access are not enough: we can override neither SRC, since it
 * contains the value to store, nor DST, since it contains the pointer to the
 * context that may be used by later instructions. But we need a temporary
 * place to hold the pointer to the nested structure whose field we want to
 * store to.
 *
 * The scratch register starts as BPF_REG_9 and is stepped down (at most
 * twice) until it differs from both si->src_reg and si->dst_reg. The emitted
 * sequence is:
 *   1. save the scratch register into S.TF through DST,
 *   2. load the nested-structure pointer S.F into the scratch register,
 *   3. store to NF (+ OFF) through the scratch register, keeping the
 *      instruction class and immediate of the original instruction,
 *   4. reload the scratch register from S.TF.
 *
 * TF is only used as a member designator in offsetof(S, TF), so it does not
 * clash with the macro's local 'tmp_reg' variable even when the member has
 * the same name.
 */
#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF)               \
        do {                                                                       \
                int tmp_reg = BPF_REG_9;                                       \
                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)               \
                        --tmp_reg;                                               \
                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)               \
                        --tmp_reg;                                               \
                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg,               \
                                      offsetof(S, TF));                               \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,               \
                                      si->dst_reg, offsetof(S, F));               \
                *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code),   \
                                       tmp_reg, si->src_reg,                       \
                        bpf_target_off(NS, NF, sizeof_field(NS, NF),               \
                                       target_size)                               \
                                       + OFF,                                       \
                                       si->imm);                               \
                *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg,               \
                                      offsetof(S, TF));                               \
        } while (0)

/* Dispatch on the caller's 'type' variable: BPF_WRITE expands to
 * SOCK_ADDR_STORE_NESTED_FIELD_OFF() (which needs the temporary field TF),
 * anything else to SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() (TF is unused on
 * that path).
 */
#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
                                                      TF)                       \
        do {                                                                       \
                if (type == BPF_WRITE) {                                       \
                        SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE,   \
                                                         OFF, TF);               \
                } else {                                                       \
                        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(                       \
                                S, NS, F, NF, SIZE, OFF);  \
                }                                                               \
        } while (0)

/* Rewrite an access to a struct bpf_sock_addr field into instructions that
 * go through the pointers kept in struct bpf_sock_addr_kern (uaddr, sk,
 * t_ctx).
 *
 * Fields handled with the LOAD_OR_STORE macro honour @type (BPF_WRITE emits a
 * store); the remaining fields are always emitted as loads.
 *
 * The local names 'insn', 'si', 'type' and 'target_size' are referenced by
 * the SOCK_ADDR_* macros and must stay as they are.
 *
 * Returns the number of instructions written to @insn_buf (0 when si->off
 * matches no case).
 */
static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
                                        const struct bpf_insn *si,
                                        struct bpf_insn *insn_buf,
                                        struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int port_bytes = sizeof_field(struct sockaddr_in6, sin6_port);
        int delta;

        switch (si->off) {
        /* --- fields reached through the user-supplied sockaddr (uaddr) --- */
        case offsetof(struct bpf_sock_addr, user_family):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sockaddr, uaddr, sa_family);
                break;

        case offsetof(struct bpf_sock_addr, user_ip4):
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
                        sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
                break;

        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
                /* Byte offset of the access inside user_ip6[]. */
                delta = si->off - offsetof(struct bpf_sock_addr, user_ip6[0]);
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
                        sin6_addr.s6_addr32[0], BPF_SIZE(si->code), delta,
                        tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, user_port):
                /* Strictly speaking the port lives in either sockaddr_in or
                 * sockaddr_in6 depending on sa_family.  Both layouts keep it
                 * at the same offset with the same size, so after asserting
                 * that at build time the sockaddr_in6 view is used for both.
                 */
                BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
                             offsetof(struct sockaddr_in6, sin6_port));
                BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
                             sizeof_field(struct sockaddr_in6, sin6_port));
                /* sin6_port is narrower than user_port: never access more
                 * bytes than the kernel field holds.
                 */
                port_bytes = min(port_bytes, BPF_LDST_BYTES(si));
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
                        sin6_port, bytes_to_bpf_size(port_bytes), 0, tmp_reg);
                break;

        /* --- read-only fields taken from the socket (sk) --- */
        case offsetof(struct bpf_sock_addr, family):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_family);
                break;

        case offsetof(struct bpf_sock_addr, type):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_type);
                break;

        case offsetof(struct bpf_sock_addr, protocol):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_protocol);
                break;

        /* --- message source address, reached through t_ctx --- */
        case offsetof(struct bpf_sock_addr, msg_src_ip4):
                /* t_ctx is interpreted as a struct in_addr here. */
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct in_addr, t_ctx,
                        s_addr, BPF_SIZE(si->code), 0, tmp_reg);
                break;

        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
                /* Byte offset of the access inside msg_src_ip6[]. */
                delta = si->off - offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
                /* t_ctx is interpreted as a struct in6_addr here. */
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
                        s6_addr32[0], BPF_SIZE(si->code), delta, tmp_reg);
                break;

        /* --- the socket pointer itself --- */
        case offsetof(struct bpf_sock_addr, sk):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_addr_kern, sk));
                break;
        }

        return insn - insn_buf;
}

static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
                                       const struct bpf_insn *si,
                                       struct bpf_insn *insn_buf,
                                       struct bpf_prog *prog,
                                       u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                              \
        do {                                                                      \
                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                      \
                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == si->src_reg) {                              \
                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,              \
                                          offsetof(struct bpf_sock_ops_kern,  \
                                          temp));                              \
                        fullsock_reg = reg;                                      \
                        jmp += 2;                                              \
                }                                                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_locked_tcp_sock),              \
                                      fullsock_reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_locked_tcp_sock));              \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);              \
                if (si->dst_reg == si->src_reg)                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      si->dst_reg, si->src_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,                      \
                                                       OBJ_FIELD),              \
                                      si->dst_reg, si->dst_reg,                      \
                                      offsetof(OBJ, OBJ_FIELD));              \
                if (si->dst_reg == si->src_reg)        {                              \
                        *insn++ = BPF_JMP_A(1);                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                }                                                              \
        } while (0)

#define SOCK_OPS_GET_SK()                                                              \
        do {                                                                      \
                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1;     \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == si->src_reg) {                              \
                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,              \
                                          offsetof(struct bpf_sock_ops_kern,  \
                                          temp));                              \
                        fullsock_reg = reg;                                      \
                        jmp += 2;                                              \
                }                                                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_fullsock),                      \
                                      fullsock_reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_fullsock));                      \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);              \
                if (si->dst_reg == si->src_reg)                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      si->dst_reg, si->src_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                if (si->dst_reg == si->src_reg)        {                              \
                        *insn++ = BPF_JMP_A(1);                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                }                                                              \
        } while (0)

/* Shorthand for SOCK_OPS_GET_FIELD() with OBJ fixed to struct tcp_sock.
 * FIELD is passed twice, i.e. as both the first and the second argument
 * of SOCK_OPS_GET_FIELD(), so it only fits fields that carry the same
 * name on both sides.
 */
#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
                SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)

/* Helper macro for adding write access to tcp_sock or sock fields.
 * The macro is called with two registers, dst_reg which contains a pointer
 * to ctx (context) and src_reg which contains the value that should be
 * stored. However, we need an additional register since we cannot overwrite
 * dst_reg because it may be used later in the program.
 * Instead we "borrow" one of the other registers. We first save its value
 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
 * it at the end of the macro.
 *
 * The borrowed register starts out as BPF_REG_9 and is stepped down by one,
 * at most twice, while it collides with si->dst_reg or si->src_reg.
 *
 * Instructions written through the enclosing scope's "insn" cursor:
 *   1. ctx->temp = reg                    (save the borrowed register)
 *   2. reg = ctx->is_locked_tcp_sock
 *   3. if (reg == 0) skip the next two instructions (no store happens)
 *   4. reg = ctx->sk
 *   5. a store of the original instruction's class (BPF_CLASS(si->code))
 *      to reg + offsetof(OBJ, OBJ_FIELD), carrying si->src_reg and si->imm
 *   6. reg = ctx->temp                    (restore the borrowed register)
 *
 * The BUILD_BUG_ON() refuses an OBJ_FIELD that is wider than the matching
 * BPF_FIELD in struct bpf_sock_ops.
 */
#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                              \
        do {                                                                      \
                int reg = BPF_REG_9;                                              \
                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                      \
                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,                      \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               temp));                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_locked_tcp_sock),              \
                                      reg, si->dst_reg,                              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_locked_tcp_sock));              \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      reg, si->dst_reg,                              \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) |     \
                                       BPF_MEM | BPF_CLASS(si->code),              \
                                       reg, si->src_reg,                      \
                                       offsetof(OBJ, OBJ_FIELD),              \
                                       si->imm);                              \
                *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               temp));                              \
        } while (0)

/* Dispatch a field access to the write or the read helper.
 * When TYPE equals BPF_WRITE the access is expanded with
 * SOCK_OPS_SET_FIELD(), otherwise with SOCK_OPS_GET_FIELD(); the other
 * three arguments are forwarded unchanged.
 *
 * TYPE is parenthesized so that an expression argument (for example a
 * conditional expression) is compared as a whole against BPF_WRITE instead
 * of being re-associated by operator precedence.
 */
#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)              \
        do {                                                                      \
                if ((TYPE) == BPF_WRITE)                                      \
                        SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);              \
                else                                                              \
                        SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);              \
        } while (0)

        switch (si->off) {
        case offsetof(struct bpf_sock_ops, op):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       op),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, op));
                break;

        case offsetof(struct bpf_sock_ops, replylong[0]) ...
             offsetof(struct bpf_sock_ops, replylong[3]):
                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
                             sizeof_field(struct bpf_sock_ops_kern, reply));
                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
                             sizeof_field(struct bpf_sock_ops_kern, replylong));
                off = si->off;
                off -= offsetof(struct bpf_sock_ops, replylong[0]);
                off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              off);
                break;

        case offsetof(struct bpf_sock_ops, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_family));
                break;

        case offsetof(struct bpf_sock_ops, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_daddr));
                break;

        case offsetof(struct bpf_sock_ops, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_rcv_saddr));
                break;

        case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
             offsetof(struct bpf_sock_ops, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
             offsetof(struct bpf_sock_ops, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct bpf_sock_ops, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct bpf_sock_ops, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_num));
                break;

        case offsetof(struct bpf_sock_ops, is_fullsock):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern,
                                                is_fullsock),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               is_fullsock));
                break;

        case offsetof(struct bpf_sock_ops, state):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_state));
                break;

        case offsetof(struct bpf_sock_ops, rtt_min):
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
                             sizeof(struct minmax));
                BUILD_BUG_ON(sizeof(struct minmax) <
                             sizeof(struct minmax_sample));

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct tcp_sock, rtt_min) +
                                      sizeof_field(struct minmax_sample, t));
                break;

        case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
                SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
                                   struct tcp_sock);
                break;

        case offsetof(struct bpf_sock_ops, sk_txhash):
                SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
                                          struct sock, type);
                break;
        case offsetof(struct bpf_sock_ops, snd_cwnd):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
                break;
        case offsetof(struct bpf_sock_ops, srtt_us):
                SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
                break;
        case offsetof(struct bpf_sock_ops, snd_ssthresh):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
                break;
        case offsetof(struct bpf_sock_ops, rcv_nxt):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
                break;
        case offsetof(struct bpf_sock_ops, snd_nxt):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
                break;
        case offsetof(struct bpf_sock_ops, snd_una):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
                break;
        case offsetof(struct bpf_sock_ops, mss_cache):
                SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
                break;
        case offsetof(struct bpf_sock_ops, ecn_flags):
                SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
                break;
        case offsetof(struct bpf_sock_ops, rate_delivered):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
                break;
        case offsetof(struct bpf_sock_ops, rate_interval_us):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
                break;
        case offsetof(struct bpf_sock_ops, packets_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
                break;
        case offsetof(struct bpf_sock_ops, retrans_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
                break;
        case offsetof(struct bpf_sock_ops, total_retrans):
                SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
                break;
        case offsetof(struct bpf_sock_ops, segs_in):
                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
                break;
        case offsetof(struct bpf_sock_ops, data_segs_in):
                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
                break;
        case offsetof(struct bpf_sock_ops, segs_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
                break;
        case offsetof(struct bpf_sock_ops, data_segs_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
                break;
        case offsetof(struct bpf_sock_ops, lost_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
                break;
        case offsetof(struct bpf_sock_ops, sacked_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
                break;
        case offsetof(struct bpf_sock_ops, bytes_received):
                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
                break;
        case offsetof(struct bpf_sock_ops, bytes_acked):
                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
                break;
        case offsetof(struct bpf_sock_ops, sk):
                SOCK_OPS_GET_SK();
                break;
        case offsetof(struct bpf_sock_ops, skb_data_end):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb_data_end),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb_data_end));
                break;
        case offsetof(struct bpf_sock_ops, skb_data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                      si->dst_reg, si->dst_reg,
                                      offsetof(struct sk_buff, data));
                break;
        case offsetof(struct bpf_sock_ops, skb_len):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
                                      si->dst_reg, si->dst_reg,
                                      offsetof(struct sk_buff, len));
                break;
        case offsetof(struct bpf_sock_ops, skb_tcp_flags):
                off = offsetof(struct sk_buff, cb);
                off += offsetof(struct tcp_skb_cb, tcp_flags);
                *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
                                                       tcp_flags),
                                      si->dst_reg, si->dst_reg, off);
                break;
        case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
                struct bpf_insn *jmp_on_null_skb;

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                /* Reserve one insn to test skb == NULL */
                jmp_on_null_skb = insn++;
                insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     hwtstamps, 8,
                                                     target_size));
                *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
                                               insn - jmp_on_null_skb - 1);
                break;
        }
        }
        return insn - insn_buf;
}

/* data_end = skb->data + skb_headlen() */
static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
                                                    struct bpf_insn *insn)
{
        int reg;
        int temp_reg_off = offsetof(struct sk_buff, cb) +
                           offsetof(struct sk_skb_cb, temp_reg);

        if (si->src_reg == si->dst_reg) {
                /* We need an extra register, choose and save a register. */
                reg = BPF_REG_9;
                if (si->src_reg == reg || si->dst_reg == reg)
                        reg--;
                if (si->src_reg == reg || si->dst_reg == reg)
                        reg--;
                *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
        } else {
                reg = si->dst_reg;
        }

        /* reg = skb->data */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                              reg, si->src_reg,
                              offsetof(struct sk_buff, data));
        /* AX = skb->len */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
                              BPF_REG_AX, si->src_reg,
                              offsetof(struct sk_buff, len));
        /* reg = skb->data + skb->len */
        *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
        /* AX = skb->data_len */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
                              BPF_REG_AX, si->src_reg,
                              offsetof(struct sk_buff, data_len));

        /* reg = skb->data + skb->len - skb->data_len */
        *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);

        if (si->src_reg == si->dst_reg) {
                /* Restore the saved register */
                *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
                *insn++ = BPF_MOV64_REG(si->dst_reg, reg);
                *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
        }

        return insn;
}

/* Rewrite one struct __sk_buff context access into instructions stored in
 * @insn_buf and return how many instructions were written.
 *
 * Two offsets are handled here:
 *   - data_end: delegated to bpf_convert_data_end_access();
 *   - cb[0] .. cb[4]: redirected to the data area of struct sk_skb_cb that
 *     lives inside skb->cb, as a single load or store of the original
 *     access size; prog->cb_access is set as a side effect.
 * Every other offset is passed on to bpf_convert_ctx_access().
 */
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        int cb_off;

        switch (si->off) {
        case offsetof(struct __sk_buff, data_end):
                return bpf_convert_data_end_access(si, insn_buf) - insn_buf;

        case offsetof(struct __sk_buff, cb[0]) ...
             offsetofend(struct __sk_buff, cb[4]) - 1:
                /* The kernel-side area must hold all five 32-bit cb words
                 * and start on a __u64 boundary within the skb.
                 */
                BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                              offsetof(struct sk_skb_cb, data)) %
                             sizeof(__u64));

                prog->cb_access = 1;

                /* Start of the kernel-side area, plus the distance of the
                 * access from cb[0].
                 */
                cb_off = offsetof(struct sk_buff, cb) +
                         offsetof(struct sk_skb_cb, data);
                cb_off += si->off - offsetof(struct __sk_buff, cb[0]);

                if (type == BPF_WRITE)
                        insn_buf[0] = BPF_EMIT_STORE(BPF_SIZE(si->code), si,
                                                     cb_off);
                else
                        insn_buf[0] = BPF_LDX_MEM(BPF_SIZE(si->code),
                                                  si->dst_reg, si->src_reg,
                                                  cb_off);
                return 1;

        default:
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
                                              target_size);
        }
}

static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
#if IS_ENABLED(CONFIG_IPV6)
        int off;
#endif

        /* convert ctx uses the fact sg element is first in struct */
        BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);

        switch (si->off) {
        case offsetof(struct sk_msg_md, data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, data));
                break;
        case offsetof(struct sk_msg_md, data_end):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, data_end));
                break;
        case offsetof(struct sk_msg_md, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_family));
                break;

        case offsetof(struct sk_msg_md, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_daddr));
                break;

        case offsetof(struct sk_msg_md, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_rcv_saddr));
                break;

        case offsetof(struct sk_msg_md, remote_ip6[0]) ...
             offsetof(struct sk_msg_md, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct sk_msg_md, remote_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct sk_msg_md, local_ip6[0]) ...
             offsetof(struct sk_msg_md, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct sk_msg_md, local_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct sk_msg_md, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct sk_msg_md, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_num));
                break;

        case offsetof(struct sk_msg_md, size):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg_sg, size));
                break;

        case offsetof(struct sk_msg_md, sk):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                break;
        }

        return insn - insn_buf;
}

/* sk_filter verifier callbacks: helper prototype lookup, context access
 * validation, context access rewriting through bpf_convert_ctx_access(),
 * and gen_ld_abs served by bpf_gen_ld_abs().
 */
const struct bpf_verifier_ops sk_filter_verifier_ops = {
        .get_func_proto = sk_filter_func_proto,
        .is_valid_access = sk_filter_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
        .gen_ld_abs = bpf_gen_ld_abs,
};

/* sk_filter program ops: test runs go through bpf_prog_test_run_skb(). */
const struct bpf_prog_ops sk_filter_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/* tc_cls_act verifier callbacks. Besides helper lookup and context access
 * validation/rewriting, this table also supplies a prologue generator,
 * gen_ld_abs and a BTF struct access hook.
 */
const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
        .get_func_proto = tc_cls_act_func_proto,
        .is_valid_access = tc_cls_act_is_valid_access,
        .convert_ctx_access = tc_cls_act_convert_ctx_access,
        .gen_prologue = tc_cls_act_prologue,
        .gen_ld_abs = bpf_gen_ld_abs,
        .btf_struct_access = tc_cls_act_btf_struct_access,
};

/* tc_cls_act program ops: test runs go through bpf_prog_test_run_skb(). */
const struct bpf_prog_ops tc_cls_act_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/* xdp verifier callbacks: helper lookup, context access validation and
 * rewriting, bpf_noop_prologue() as the prologue generator, and a BTF
 * struct access hook.
 */
const struct bpf_verifier_ops xdp_verifier_ops = {
        .get_func_proto = xdp_func_proto,
        .is_valid_access = xdp_is_valid_access,
        .convert_ctx_access = xdp_convert_ctx_access,
        .gen_prologue = bpf_noop_prologue,
        .btf_struct_access = xdp_btf_struct_access,
};

/* xdp program ops: test runs go through bpf_prog_test_run_xdp(). */
const struct bpf_prog_ops xdp_prog_ops = {
        .test_run = bpf_prog_test_run_xdp,
};

/* cg_skb verifier callbacks: own helper lookup and access validation;
 * context accesses are rewritten by bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops cg_skb_verifier_ops = {
        .get_func_proto = cg_skb_func_proto,
        .is_valid_access = cg_skb_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* cg_skb program ops: test runs go through bpf_prog_test_run_skb(). */
const struct bpf_prog_ops cg_skb_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/* lwt_in verifier callbacks: own helper lookup, lwt_is_valid_access() for
 * access validation, bpf_convert_ctx_access() for context rewriting.
 */
const struct bpf_verifier_ops lwt_in_verifier_ops = {
        .get_func_proto = lwt_in_func_proto,
        .is_valid_access = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* lwt_in program ops: test runs go through bpf_prog_test_run_skb(). */
const struct bpf_prog_ops lwt_in_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_out_verifier_ops = {
        .get_func_proto                = lwt_out_func_proto,
        .is_valid_access        = lwt_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
};

/* Program ops for lwt_out: test runs go through the skb runner. */
const struct bpf_prog_ops lwt_out_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/*
 * Verifier callbacks for lwt_xmit. Unlike lwt_in/lwt_out this table also
 * sets a prologue hook, reusing tc_cls_act_prologue.
 */
const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
        .get_func_proto                = lwt_xmit_func_proto,
        .is_valid_access        = lwt_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
        .gen_prologue                = tc_cls_act_prologue,
};

/* Program ops for lwt_xmit: test runs go through the skb runner. */
const struct bpf_prog_ops lwt_xmit_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/*
 * Verifier callbacks for lwt_seg6local. Shares lwt_is_valid_access and the
 * generic context rewriter with the other lwt_* tables.
 */
const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
        .get_func_proto                = lwt_seg6local_func_proto,
        .is_valid_access        = lwt_is_valid_access,
        .convert_ctx_access        = bpf_convert_ctx_access,
};

/* Program ops for lwt_seg6local: intentionally empty, no test-run hook. */
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
};

/*
 * Verifier callbacks for cg_sock. All three hooks are wired to the
 * sock_filter_* / bpf_sock_* implementations.
 */
const struct bpf_verifier_ops cg_sock_verifier_ops = {
        .get_func_proto                = sock_filter_func_proto,
        .is_valid_access        = sock_filter_is_valid_access,
        .convert_ctx_access        = bpf_sock_convert_ctx_access,
};

/* Program ops for cg_sock: intentionally empty, no test-run hook. */
const struct bpf_prog_ops cg_sock_prog_ops = {
};

/* Verifier callbacks for cg_sock_addr, all served by sock_addr_* handlers. */
const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
        .get_func_proto                = sock_addr_func_proto,
        .is_valid_access        = sock_addr_is_valid_access,
        .convert_ctx_access        = sock_addr_convert_ctx_access,
};

/* Program ops for cg_sock_addr: intentionally empty, no test-run hook. */
const struct bpf_prog_ops cg_sock_addr_prog_ops = {
};

/* Verifier callbacks for sock_ops, all served by sock_ops_* handlers. */
const struct bpf_verifier_ops sock_ops_verifier_ops = {
        .get_func_proto                = sock_ops_func_proto,
        .is_valid_access        = sock_ops_is_valid_access,
        .convert_ctx_access        = sock_ops_convert_ctx_access,
};

/* Program ops for sock_ops: intentionally empty, no test-run hook. */
const struct bpf_prog_ops sock_ops_prog_ops = {
};

/*
 * Verifier callbacks for sk_skb, including a dedicated prologue generator
 * (sk_skb_prologue).
 */
const struct bpf_verifier_ops sk_skb_verifier_ops = {
        .get_func_proto                = sk_skb_func_proto,
        .is_valid_access        = sk_skb_is_valid_access,
        .convert_ctx_access        = sk_skb_convert_ctx_access,
        .gen_prologue                = sk_skb_prologue,
};

/* Program ops for sk_skb: intentionally empty, no test-run hook. */
const struct bpf_prog_ops sk_skb_prog_ops = {
};

/*
 * Verifier callbacks for sk_msg. The prologue hook is the shared no-op
 * implementation.
 */
const struct bpf_verifier_ops sk_msg_verifier_ops = {
        .get_func_proto                = sk_msg_func_proto,
        .is_valid_access        = sk_msg_is_valid_access,
        .convert_ctx_access        = sk_msg_convert_ctx_access,
        .gen_prologue                = bpf_noop_prologue,
};

/* Program ops for sk_msg: intentionally empty, no test-run hook. */
const struct bpf_prog_ops sk_msg_prog_ops = {
};

/* Verifier callbacks for flow_dissector, all served by flow_dissector_* handlers. */
const struct bpf_verifier_ops flow_dissector_verifier_ops = {
        .get_func_proto                = flow_dissector_func_proto,
        .is_valid_access        = flow_dissector_is_valid_access,
        .convert_ctx_access        = flow_dissector_convert_ctx_access,
};

/* Program ops for flow_dissector: test runs use the dedicated runner. */
const struct bpf_prog_ops flow_dissector_prog_ops = {
        .test_run                = bpf_prog_test_run_flow_dissector,
};

/*
 * sk_detach_filter - remove the filter currently attached to a socket
 * @sk: socket to operate on; the rcu_dereference_protected() below asserts
 *      via lockdep_sock_is_held() that the socket lock is held
 *
 * Return: 0 when a filter was detached, -EPERM when the socket has
 * SOCK_FILTER_LOCKED set, -ENOENT when no filter is attached.
 */
int sk_detach_filter(struct sock *sk)
{
        struct sk_filter *old;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        old = rcu_dereference_protected(sk->sk_filter,
                                        lockdep_sock_is_held(sk));
        if (!old)
                return -ENOENT;

        /* Unpublish the pointer first, then drop the socket's charge. */
        RCU_INIT_POINTER(sk->sk_filter, NULL);
        sk_filter_uncharge(sk, old);

        return 0;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

/*
 * sk_get_filter - copy the originally attached classic filter to the caller
 * @sk: socket whose filter is queried
 * @optval: destination for the filter instructions
 * @len: capacity of @optval in filter blocks; 0 only queries the count
 *
 * Runs with the sockopt socket lock held for its whole duration.
 *
 * Return: 0 when no filter is attached; the number of filter blocks on
 * success or when @len is 0; -EACCES when the attached program has no
 * original program to dump; -EINVAL when @len is too small; -EFAULT when
 * the copy fails.
 */
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
        struct sock_fprog_kern *orig;
        struct sk_filter *attached;
        int ret;

        sockopt_lock_sock(sk);

        attached = rcu_dereference_protected(sk->sk_filter,
                                             lockdep_sock_is_held(sk));
        if (!attached) {
                ret = 0;
                goto unlock;
        }

        /* Only the program as originally attached is dumped, so no
         * conversion/decoding is required. eBPF programs without an
         * original program cannot be retrieved this way.
         */
        orig = attached->prog->orig_prog;
        if (!orig) {
                ret = -EACCES;
                goto unlock;
        }

        if (!len) {
                /* Caller only asks how many filter blocks there are. */
                ret = orig->len;
                goto unlock;
        }

        if (len < orig->len) {
                ret = -EINVAL;
                goto unlock;
        }

        if (copy_to_sockptr(optval, orig->filter, bpf_classic_proglen(orig))) {
                ret = -EFAULT;
                goto unlock;
        }

        /* The API reports the number of filter blocks, not bytes. */
        ret = orig->len;
unlock:
        sockopt_release_sock(sk);
        return ret;
}

#ifdef CONFIG_INET
/*
 * Populate the kernel-side reuseport context handed to the BPF program.
 * selected_sk starts out NULL; data_end is set to the end of the skb's
 * linear area (skb->data plus the head length).
 */
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
                                    struct sock_reuseport *reuse,
                                    struct sock *sk, struct sk_buff *skb,
                                    struct sock *migrating_sk,
                                    u32 hash)
{
        /* Sockets involved in this selection. */
        reuse_kern->sk = sk;
        reuse_kern->migrating_sk = migrating_sk;
        reuse_kern->selected_sk = NULL;

        /* Packet being steered. */
        reuse_kern->skb = skb;
        reuse_kern->data_end = skb->data + skb_headlen(skb);
        reuse_kern->hash = hash;

        /* Properties copied from the reuseport group. */
        reuse_kern->reuseport_id = reuse->reuseport_id;
        reuse_kern->bind_inany = reuse->bind_inany;
}

/*
 * Run a reuseport BPF program and translate its verdict.
 *
 * Return: the socket stored in the context's selected_sk (NULL if the
 * program did not select one) when the program returns SK_PASS, otherwise
 * ERR_PTR(-ECONNREFUSED).
 */
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  struct sock *migrating_sk,
                                  u32 hash)
{
        struct sk_reuseport_kern ctx;
        enum sk_action verdict;

        bpf_init_reuseport_kern(&ctx, reuse, sk, skb, migrating_sk, hash);
        verdict = bpf_prog_run(prog, &ctx);

        if (verdict != SK_PASS)
                return ERR_PTR(-ECONNREFUSED);

        return ctx.selected_sk;
}

/*
 * Helper: look up a socket in @map by @key and record it in the reuseport
 * context as the selected socket, provided it belongs to the same
 * reuseport group as the context.
 *
 * Return: 0 on success; -ENOENT when the lookup fails (or, for a
 * reuseport sockarray, the socket has no reuseport group any more);
 * -EINVAL when a socket from another map type has no reuseport group;
 * -EPROTOTYPE / -EAFNOSUPPORT / -EBADFD when the group ids differ.
 */
BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
           struct bpf_map *, map, void *, key, u32, flags)
{
        struct sock_reuseport *group;
        struct sock *candidate;
        int err;

        candidate = map->ops->map_lookup_elem(map, key);
        if (!candidate)
                return -ENOENT;

        group = rcu_dereference(candidate->sk_reuseport_cb);
        if (!group) {
                /* A reuseport_array only holds sockets with a non-NULL
                 * sk_reuseport_cb, so a missing group there means the
                 * socket was already unhashed (e.g. by close()): report
                 * -ENOENT.
                 *
                 * Other maps (e.g. sock_map) give no such guarantee; the
                 * socket may never have been in a reuseport group at all.
                 */
                if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
                        err = -ENOENT;
                else
                        err = -EINVAL;
                goto release;
        }

        if (unlikely(group->reuseport_id != reuse_kern->reuseport_id)) {
                struct sock *self = reuse_kern->sk;

                if (self->sk_protocol != candidate->sk_protocol)
                        err = -EPROTOTYPE;
                else if (self->sk_family != candidate->sk_family)
                        err = -EAFNOSUPPORT;
                else
                        /* Catch all. Likely bound to a different sockaddr. */
                        err = -EBADFD;
                goto release;
        }

        reuse_kern->selected_sk = candidate;

        return 0;
release:
        /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
        if (sk_is_refcounted(candidate))
                sock_put(candidate);

        return err;
}

/*
 * Helper prototype for sk_select_reuseport(): context pointer, constant
 * map pointer, pointer to a map key and an unconstrained flags value;
 * returns an integer. Not restricted to GPL programs.
 */
static const struct bpf_func_proto sk_select_reuseport_proto = {
        .func           = sk_select_reuseport,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_PTR_TO_MAP_KEY,
        .arg4_type        = ARG_ANYTHING,
};

/*
 * Helper: skb_load_bytes for reuseport programs. Forwards to the common
 * implementation using the skb stored in the reuseport context.
 */
BPF_CALL_4(sk_reuseport_load_bytes,
           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
           void *, to, u32, len)
{
        struct sk_buff *pkt = reuse_kern->skb;

        return ____bpf_skb_load_bytes(pkt, offset, to, len);
}

/*
 * Helper prototype for sk_reuseport_load_bytes(): context pointer, an
 * unconstrained offset, a destination buffer that may be uninitialized
 * and its constant size; returns an integer.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
        .func                = sk_reuseport_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/*
 * Helper: skb_load_bytes_relative for reuseport programs. Forwards to the
 * common implementation using the skb stored in the reuseport context.
 */
BPF_CALL_5(sk_reuseport_load_bytes_relative,
           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
           void *, to, u32, len, u32, start_header)
{
        struct sk_buff *pkt = reuse_kern->skb;

        return ____bpf_skb_load_bytes_relative(pkt, offset, to, len,
                                               start_header);
}

/*
 * Helper prototype for sk_reuseport_load_bytes_relative(): same arguments
 * as the non-relative variant plus an unconstrained start_header selector.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
        .func                = sk_reuseport_load_bytes_relative,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/*
 * Map a helper id to its prototype for sk_reuseport programs. Ids without
 * a reuseport-specific entry are resolved by bpf_base_func_proto().
 */
static const struct bpf_func_proto *
sk_reuseport_func_proto(enum bpf_func_id func_id,
                        const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_sk_select_reuseport:
                proto = &sk_select_reuseport_proto;
                break;
        case BPF_FUNC_skb_load_bytes:
                proto = &sk_reuseport_load_bytes_proto;
                break;
        case BPF_FUNC_skb_load_bytes_relative:
                proto = &sk_reuseport_load_bytes_relative_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_ptr_cookie_proto;
                break;
        case BPF_FUNC_ktime_get_coarse_ns:
                proto = &bpf_ktime_get_coarse_ns_proto;
                break;
        default:
                proto = bpf_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/*
 * Validate a context access made by a sk_reuseport program.
 *
 * Only reads that fall inside struct sk_reuseport_md and are aligned to
 * their own size are accepted. Pointer-like fields (data, data_end, sk,
 * migrating_sk) must be read as 8 bytes and get their register type
 * recorded in @info; hash must be read as a full 32-bit word; the
 * remaining fields allow narrow loads.
 */
static bool
sk_reuseport_is_valid_access(int off, int size,
                             enum bpf_access_type type,
                             const struct bpf_prog *prog,
                             struct bpf_insn_access_aux *info)
{
        const u32 size_default = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct sk_reuseport_md))
                return false;
        if (off % size)
                return false;
        if (type != BPF_READ)
                return false;

        switch (off) {
        /* Pointer fields: tag the register type, size checked below. */
        case offsetof(struct sk_reuseport_md, data):
                info->reg_type = PTR_TO_PACKET;
                break;

        case offsetof(struct sk_reuseport_md, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;

        case offsetof(struct sk_reuseport_md, sk):
                info->reg_type = PTR_TO_SOCKET;
                break;

        case offsetof(struct sk_reuseport_md, migrating_sk):
                info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
                break;

        case offsetof(struct sk_reuseport_md, hash):
                return size == size_default;

        /* Fields that allow narrowing */
        case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
                if (size < sizeof_field(struct sk_buff, protocol))
                        return false;
                fallthrough;
        case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
        case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
        case bpf_ctx_range(struct sk_reuseport_md, len):
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);

        default:
                return false;
        }

        /* Only the pointer fields reach this point. */
        return size == sizeof(__u64);
}

/*
 * Emit one load of field F from struct sk_reuseport_kern into the
 * destination register. Relies on the locals "insn", "si" and
 * "target_size" being in scope at the expansion site.
 */
#define SK_REUSEPORT_LOAD_FIELD(F) ({                                        \
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
                              si->dst_reg, si->src_reg,                        \
                              bpf_target_off(struct sk_reuseport_kern, F, \
                                             sizeof_field(struct sk_reuseport_kern, F), \
                                             target_size));                \
        })

/*
 * Load SKB_FIELD of the sk_buff referenced by sk_reuseport_kern::skb, via
 * SOCK_ADDR_LOAD_NESTED_FIELD (defined elsewhere in this file).
 */
#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD)                                \
        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,                \
                                    struct sk_buff,                        \
                                    skb,                                \
                                    SKB_FIELD)

/*
 * Load SK_FIELD of the sock referenced by sk_reuseport_kern::sk, via
 * SOCK_ADDR_LOAD_NESTED_FIELD (defined elsewhere in this file).
 */
#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD)                                \
        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,                \
                                    struct sock,                        \
                                    sk,                                        \
                                    SK_FIELD)

/*
 * Rewrite a load from struct sk_reuseport_md (at si->off) into loads from
 * the kernel-side struct sk_reuseport_kern.
 *
 * Instructions are appended to @insn_buf; the return value is the number
 * of instructions emitted (0 for an offset with no case below).
 *
 * NOTE: SK_REUSEPORT_LOAD_FIELD references the locals "insn", "si" and
 * "target_size" by name, so these must not be renamed. The nested-field
 * macros presumably do the same - confirm against
 * SOCK_ADDR_LOAD_NESTED_FIELD.
 */
static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
                                           const struct bpf_insn *si,
                                           struct bpf_insn *insn_buf,
                                           struct bpf_prog *prog,
                                           u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        /* Fields read through the skb pointer held in the context. */
        case offsetof(struct sk_reuseport_md, data):
                SK_REUSEPORT_LOAD_SKB_FIELD(data);
                break;

        case offsetof(struct sk_reuseport_md, len):
                SK_REUSEPORT_LOAD_SKB_FIELD(len);
                break;

        case offsetof(struct sk_reuseport_md, eth_protocol):
                SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
                break;

        /* ip_protocol is read from the socket, not from the packet. */
        case offsetof(struct sk_reuseport_md, ip_protocol):
                SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
                break;

        /* Fields stored directly in struct sk_reuseport_kern. */
        case offsetof(struct sk_reuseport_md, data_end):
                SK_REUSEPORT_LOAD_FIELD(data_end);
                break;

        case offsetof(struct sk_reuseport_md, hash):
                SK_REUSEPORT_LOAD_FIELD(hash);
                break;

        case offsetof(struct sk_reuseport_md, bind_inany):
                SK_REUSEPORT_LOAD_FIELD(bind_inany);
                break;

        case offsetof(struct sk_reuseport_md, sk):
                SK_REUSEPORT_LOAD_FIELD(sk);
                break;

        case offsetof(struct sk_reuseport_md, migrating_sk):
                SK_REUSEPORT_LOAD_FIELD(migrating_sk);
                break;
        }

        return insn - insn_buf;
}

/* Verifier callbacks for sk_reuseport, all served by sk_reuseport_* handlers. */
const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
        .get_func_proto                = sk_reuseport_func_proto,
        .is_valid_access        = sk_reuseport_is_valid_access,
        .convert_ctx_access        = sk_reuseport_convert_ctx_access,
};

/* Program ops for sk_reuseport: intentionally empty, no test-run hook. */
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};

/*
 * Static key defined in its "false" state and exported to modules.
 * NOTE(review): presumably enabled while sk_lookup programs are attached;
 * the code that toggles it is not in this part of the file.
 */
DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
EXPORT_SYMBOL(bpf_sk_lookup_enabled);

/*
 * Helper: record @sk (which may be NULL) as the result of a socket lookup.
 *
 * Return: 0 on success; -EINVAL for unknown flag bits; -ESOCKTNOSUPPORT
 * for refcounted sockets, TCP sockets not in LISTEN or UDP sockets not in
 * CLOSE; -EPROTOTYPE / -EAFNOSUPPORT when the socket does not match the
 * packet's protocol or family; -EEXIST when a socket is already selected
 * and BPF_SK_LOOKUP_F_REPLACE was not passed.
 */
BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
           struct sock *, sk, u64, flags)
{
        if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
                               BPF_SK_LOOKUP_F_NO_REUSEPORT)))
                return -EINVAL;

        if (sk) {
                /* reject non-RCU freed sockets */
                if (unlikely(sk_is_refcounted(sk)))
                        return -ESOCKTNOSUPPORT;
                /* only accept TCP socket in LISTEN */
                if (unlikely(sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
                        return -ESOCKTNOSUPPORT;
                /* only accept UDP socket in CLOSE */
                if (unlikely(sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
                        return -ESOCKTNOSUPPORT;

                /* Check if socket is suitable for packet L3/L4 protocol */
                if (sk->sk_protocol != ctx->protocol)
                        return -EPROTOTYPE;
                if (sk->sk_family != ctx->family &&
                    (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
                        return -EAFNOSUPPORT;
        }

        /* An earlier selection may only be overridden on request. */
        if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
                return -EEXIST;

        /* Select socket as lookup result */
        ctx->selected_sk = sk;
        ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;

        return 0;
}

/*
 * Helper prototype for bpf_sk_lookup_assign(): context pointer, a socket
 * pointer that may be NULL, and an unconstrained flags value; returns an
 * integer.
 */
static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
        .func                = bpf_sk_lookup_assign,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_SOCKET_OR_NULL,
        .arg3_type        = ARG_ANYTHING,
};

/*
 * Map a helper id to its prototype for sk_lookup programs. Ids without an
 * entry here are resolved by bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                proto = &bpf_event_output_data_proto;
                break;
        case BPF_FUNC_sk_assign:
                proto = &bpf_sk_lookup_assign_proto;
                break;
        case BPF_FUNC_sk_release:
                proto = &bpf_sk_release_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/*
 * Validate a context access made by a sk_lookup program.
 *
 * Only size-aligned reads inside struct bpf_sk_lookup are accepted. The sk
 * field must be read as 8 bytes and yields PTR_TO_SOCKET_OR_NULL; the
 * other fields allow narrow loads, with special handling for remote_port
 * and the padding that follows it.
 */
static bool sk_lookup_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        if (off < 0 || off >= sizeof(struct bpf_sk_lookup) ||
            off % size != 0 || type != BPF_READ)
                return false;

        switch (off) {
        case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk):
                info->reg_type = PTR_TO_SOCKET_OR_NULL;
                return size == sizeof(__u64);

        case bpf_ctx_range(struct bpf_sk_lookup, family):
        case bpf_ctx_range(struct bpf_sk_lookup, protocol):
        case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
        case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
        case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range(struct bpf_sk_lookup, local_port):
        case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
                bpf_ctx_record_field_size(info, sizeof(__u32));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));

        case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
                if (size != sizeof(__u32)) {
                        bpf_ctx_record_field_size(info, sizeof(__be16));
                        return bpf_ctx_narrow_access_ok(off, size,
                                                        sizeof(__be16));
                }
                /* Allow 4-byte access to 2-byte field for backward compatibility */
                return true;

        case offsetofend(struct bpf_sk_lookup, remote_port) ...
             offsetof(struct bpf_sk_lookup, local_ip4) - 1:
                /* Allow access to zero padding for backward compatibility */
                bpf_ctx_record_field_size(info, sizeof(__u16));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));

        default:
                return false;
        }
}

/*
 * Rewrite a load from struct bpf_sk_lookup (at si->off) into instructions
 * that read the kernel-side struct bpf_sk_lookup_kern.
 *
 * Instructions are appended to @insn_buf; the return value is the number
 * of instructions emitted (0 for an offset with no case below).
 */
static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
                                        const struct bpf_insn *si,
                                        struct bpf_insn *insn_buf,
                                        struct bpf_prog *prog,
                                        u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sk_lookup, sk):
                /* ctx->sk maps to the currently selected socket pointer. */
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, selected_sk));
                break;

        case offsetof(struct bpf_sk_lookup, family):
                /* Loaded as a 2-byte field from the kernel struct. */
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     family, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, protocol):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     protocol, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, remote_ip4):
                /* "remote" is the v4 source address of the kernel struct. */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     v4.saddr, 4, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, local_ip4):
                /* "local" is the v4 destination address. */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     v4.daddr, 4, target_size));
                break;

        case bpf_ctx_range_till(struct bpf_sk_lookup,
                                remote_ip6[0], remote_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
                int off = si->off;

                /* Translate the ctx offset into an offset within in6_addr. */
                off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
                /* v6.saddr is a pointer: load it, and skip the word load
                 * when it is NULL so the destination register stays 0.
                 */
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, v6.saddr));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
                /* Without IPv6 support the field always reads as 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        }
        case bpf_ctx_range_till(struct bpf_sk_lookup,
                                local_ip6[0], local_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
                int off = si->off;

                /* Same scheme as remote_ip6, using the v6.daddr pointer. */
                off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, v6.daddr));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
                /* Without IPv6 support the field always reads as 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        }
        case offsetof(struct bpf_sk_lookup, remote_port):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     sport, 2, target_size));
                break;

        case offsetofend(struct bpf_sk_lookup, remote_port):
                /* The 2 bytes right after remote_port always read as 0. */
                *target_size = 2;
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                break;

        case offsetof(struct bpf_sk_lookup, local_port):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     dport, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, ingress_ifindex):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     ingress_ifindex, 4, target_size));
                break;
        }

        return insn - insn_buf;
}

/* Program ops for sk_lookup: test runs use the dedicated sk_lookup runner. */
const struct bpf_prog_ops sk_lookup_prog_ops = {
        .test_run = bpf_prog_test_run_sk_lookup,
};

/* Verifier callbacks for sk_lookup, all served by sk_lookup_* handlers. */
const struct bpf_verifier_ops sk_lookup_verifier_ops = {
        .get_func_proto                = sk_lookup_func_proto,
        .is_valid_access        = sk_lookup_is_valid_access,
        .convert_ctx_access        = sk_lookup_convert_ctx_access,
};

#endif /* CONFIG_INET */

/* Instantiate the BPF dispatcher named "xdp", used by bpf_prog_change_xdp(). */
DEFINE_BPF_DISPATCHER(xdp)

/*
 * Tell the xdp dispatcher that @prev_prog is being replaced by @prog.
 * Thin wrapper around bpf_dispatcher_change_prog().
 */
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
        bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
}

/*
 * Global BTF id list for socket struct types, with MAX_BTF_SOCK_TYPE
 * entries. BTF_SOCK_TYPE is temporarily defined so that expanding
 * BTF_SOCK_TYPE_xxx emits one BTF_ID(struct, ...) line per socket type.
 */
BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
BTF_SOCK_TYPE_xxx
#undef BTF_SOCK_TYPE

/*
 * Helper: return @sk if it is a full TCP socket of family AF_INET6,
 * otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
{
        /* tcp6_sock type is not generated in dwarf and hence btf,
         * trigger an explicit type generation here.
         */
        BTF_TYPE_EMIT(struct tcp6_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;
        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_family != AF_INET6)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/*
 * Helper prototype for bpf_skc_to_tcp6_sock(): takes a BTF sock_common
 * pointer and returns a possibly-NULL BTF pointer typed by the
 * BTF_SOCK_TYPE_TCP6 entry of btf_sock_ids.
 */
const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
        .func                        = bpf_skc_to_tcp6_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
};

/* Helper: return @sk if it is a full TCP socket, otherwise NULL. */
BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
{
        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;
        if (sk->sk_protocol != IPPROTO_TCP)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/*
 * Helper prototype for bpf_skc_to_tcp_sock(): takes a BTF sock_common
 * pointer and returns a possibly-NULL BTF pointer typed by the
 * BTF_SOCK_TYPE_TCP entry of btf_sock_ids.
 */
const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
        .func                        = bpf_skc_to_tcp_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
};

/*
 * Helper: return @sk if it is a TCP socket (IPv4, or IPv6 when IPv6 is
 * built in) in the TCP_TIME_WAIT state, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
{
        /* BTF types for tcp_timewait_sock and inet_timewait_sock are not
         * generated if CONFIG_INET=n. Trigger an explicit generation here.
         */
        BTF_TYPE_EMIT(struct inet_timewait_sock);
        BTF_TYPE_EMIT(struct tcp_timewait_sock);

        if (!sk)
                return (unsigned long)NULL;

#ifdef CONFIG_INET
        if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
                return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
        if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
                return (unsigned long)sk;
#endif

        return (unsigned long)NULL;
}

/*
 * Helper prototype for bpf_skc_to_tcp_timewait_sock(): takes a BTF
 * sock_common pointer and returns a possibly-NULL BTF pointer typed by
 * the BTF_SOCK_TYPE_TCP_TW entry of btf_sock_ids.
 */
const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
        .func                        = bpf_skc_to_tcp_timewait_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
};

/*
 * Helper: return @sk if it is a TCP socket (IPv4, or IPv6 when IPv6 is
 * built in) in the TCP_NEW_SYN_RECV state, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
{
        if (!sk)
                return (unsigned long)NULL;

#ifdef CONFIG_INET
        if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
                return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
        if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
                return (unsigned long)sk;
#endif

        return (unsigned long)NULL;
}

/*
 * Helper prototype for bpf_skc_to_tcp_request_sock(): takes a BTF
 * sock_common pointer and returns a possibly-NULL BTF pointer typed by
 * the BTF_SOCK_TYPE_TCP_REQ entry of btf_sock_ids.
 */
const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
        .func                        = bpf_skc_to_tcp_request_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
};

/*
 * Helper: return @sk if it is a full UDP datagram socket of family
 * AF_INET6, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
{
        /* udp6_sock type is not generated in dwarf and hence btf,
         * trigger an explicit type generation here.
         */
        BTF_TYPE_EMIT(struct udp6_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;
        if (sk->sk_protocol != IPPROTO_UDP)
                return (unsigned long)NULL;
        if (sk->sk_type != SOCK_DGRAM || sk->sk_family != AF_INET6)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/*
 * Helper prototype for bpf_skc_to_udp6_sock(): takes a BTF sock_common
 * pointer and returns a possibly-NULL BTF pointer typed by the
 * BTF_SOCK_TYPE_UDP6 entry of btf_sock_ids.
 */
const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
        .func                        = bpf_skc_to_udp6_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
};

/* Helper: return @sk if it is a full AF_UNIX socket, otherwise NULL. */
BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
{
        /* unix_sock type is not generated in dwarf and hence btf,
         * trigger an explicit type generation here.
         */
        BTF_TYPE_EMIT(struct unix_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;
        if (sk->sk_family != AF_UNIX)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/*
 * Helper prototype for bpf_skc_to_unix_sock(): takes a BTF sock_common
 * pointer and returns a possibly-NULL BTF pointer typed by the
 * BTF_SOCK_TYPE_UNIX entry of btf_sock_ids.
 */
const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
        .func                        = bpf_skc_to_unix_sock,
        .gpl_only                = false,
        .ret_type                = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type                = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id                = &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
};

/*
 * Helper: return whatever bpf_mptcp_sock_from_subflow() yields for @sk,
 * converted to the helper's integer return type.
 */
BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
{
        unsigned long msk;

        /* Make sure struct mptcp_sock is emitted into BTF. */
        BTF_TYPE_EMIT(struct mptcp_sock);

        msk = (unsigned long)bpf_mptcp_sock_from_subflow(sk);
        return msk;
}

/*
 * Helper prototype for bpf_skc_to_mptcp_sock(). Unlike the other skc_to_*
 * prototypes, the argument is ARG_PTR_TO_SOCK_COMMON rather than the
 * BTF-id variant; the return is a possibly-NULL BTF pointer typed by the
 * BTF_SOCK_TYPE_MPTCP entry of btf_sock_ids.
 */
const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
        .func                = bpf_skc_to_mptcp_sock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
        .ret_btf_id        = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
};

/*
 * Helper: return the result of sock_from_file() for @file, converted to
 * the helper's integer return type.
 */
BPF_CALL_1(bpf_sock_from_file, struct file *, file)
{
        unsigned long sock;

        sock = (unsigned long)sock_from_file(file);
        return sock;
}

/*
 * BTF ids used by bpf_sock_from_file_proto: index 0 is struct socket (the
 * return type), index 1 is struct file (the argument type).
 */
BTF_ID_LIST(bpf_sock_from_file_btf_ids)
BTF_ID(struct, socket)
BTF_ID(struct, file)

/*
 * Helper prototype for bpf_sock_from_file(): takes a BTF pointer typed by
 * bpf_sock_from_file_btf_ids[1] and returns a possibly-NULL BTF pointer
 * typed by bpf_sock_from_file_btf_ids[0].
 */
const struct bpf_func_proto bpf_sock_from_file_proto = {
        .func                = bpf_sock_from_file,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id        = &bpf_sock_from_file_btf_ids[0],
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_sock_from_file_btf_ids[1],
};

/*
 * Resolve helper ids shared by socket-related program types.
 *
 * The skc_to_*_sock cast helpers are only handed out when the program's
 * token is capable of CAP_PERFMON (NULL otherwise). ktime_get_coarse_ns
 * and everything resolved by bpf_base_func_proto() bypass that check.
 */
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *cast_proto;

        switch (func_id) {
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        case BPF_FUNC_skc_to_tcp6_sock:
                cast_proto = &bpf_skc_to_tcp6_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_sock:
                cast_proto = &bpf_skc_to_tcp_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_timewait_sock:
                cast_proto = &bpf_skc_to_tcp_timewait_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_request_sock:
                cast_proto = &bpf_skc_to_tcp_request_sock_proto;
                break;
        case BPF_FUNC_skc_to_udp6_sock:
                cast_proto = &bpf_skc_to_udp6_sock_proto;
                break;
        case BPF_FUNC_skc_to_unix_sock:
                cast_proto = &bpf_skc_to_unix_sock_proto;
                break;
        case BPF_FUNC_skc_to_mptcp_sock:
                cast_proto = &bpf_skc_to_mptcp_sock_proto;
                break;
        default:
                return bpf_base_func_proto(func_id, prog);
        }

        /* Only the cast helpers reach this point. */
        return bpf_token_capable(prog->aux->token, CAP_PERFMON) ?
               cast_proto : NULL;
}

/**
 * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area.
 * @skb: socket buffer carrying the metadata
 * @offset: offset into the metadata area, must be <= skb_metadata_len()
 *
 * The start of the area is computed as skb_metadata_end() minus
 * skb_metadata_len(); @offset is not range-checked here.
 */
void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
{
        void *meta_start = skb_metadata_end(skb) - skb_metadata_len(skb);

        return meta_start + offset;
}

__bpf_kfunc_start_defs();
/*
 * Initialize @ptr__uninit as a dynptr of type BPF_DYNPTR_TYPE_SKB covering
 * skb->len bytes of the skb behind @s.
 *
 * @flags is reserved and must be zero; otherwise the dynptr is set to null
 * and -EINVAL is returned. Returns 0 on success.
 */
__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
                                    struct bpf_dynptr *ptr__uninit)
{
        struct bpf_dynptr_kern *kptr = (struct bpf_dynptr_kern *)ptr__uninit;
        struct sk_buff *skb = (struct sk_buff *)s;

        if (!flags) {
                bpf_dynptr_init(kptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
                return 0;
        }

        bpf_dynptr_set_null(kptr);
        return -EINVAL;
}

/**
 * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area.
 * @skb_: socket buffer carrying the metadata
 * @flags: future use, must be zero
 * @ptr__uninit: dynptr to initialize
 *
 * Set up a dynptr for access to the metadata area earlier allocated from the
 * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
 * &__sk_buff->data_meta.
 *
 * The dynptr covers skb_metadata_len() bytes. If passed @skb_ is a clone
 * which shares the data with the original, the dynptr will be read-only.
 * This limitation may be lifted in the future.
 *
 * Return:
 * * %0         - dynptr ready to use
 * * %-EINVAL   - invalid flags, dynptr set to null
 */
__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
                                         struct bpf_dynptr *ptr__uninit)
{
        struct bpf_dynptr_kern *kptr = (struct bpf_dynptr_kern *)ptr__uninit;
        struct sk_buff *skb = (struct sk_buff *)skb_;

        if (!flags) {
                bpf_dynptr_init(kptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0,
                                skb_metadata_len(skb));

                /* Clones share their data, so do not allow writes through it. */
                if (skb_cloned(skb))
                        bpf_dynptr_set_rdonly(kptr);

                return 0;
        }

        bpf_dynptr_set_null(kptr);
        return -EINVAL;
}

/*
 * Initialize @ptr__uninit as a dynptr of type BPF_DYNPTR_TYPE_XDP covering
 * xdp_get_buff_len() bytes of the XDP buffer behind @x.
 *
 * @flags is reserved and must be zero; otherwise the dynptr is set to null
 * and -EINVAL is returned. Returns 0 on success.
 */
__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
                                    struct bpf_dynptr *ptr__uninit)
{
        struct bpf_dynptr_kern *kptr = (struct bpf_dynptr_kern *)ptr__uninit;
        struct xdp_buff *xdp = (struct xdp_buff *)x;

        if (!flags) {
                bpf_dynptr_init(kptr, xdp, BPF_DYNPTR_TYPE_XDP, 0,
                                xdp_get_buff_len(xdp));
                return 0;
        }

        bpf_dynptr_set_null(kptr);
        return -EINVAL;
}

/*
 * Overwrite the unix socket path in the address carried by @sa_kern.
 *
 * Copies @sun_path__sz bytes from @sun_path into the sockaddr_un and updates
 * the stored address length to match. Returns -EINVAL when the socket is not
 * AF_UNIX, or when the size is zero or exceeds UNIX_PATH_MAX; 0 on success.
 */
__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
                                           const u8 *sun_path, u32 sun_path__sz)
{
        struct sockaddr_un *addr;

        if (sa_kern->sk->sk_family != AF_UNIX)
                return -EINVAL;

        /* Neither an unnamed address (size 0) nor one that overflows
         * sun_path may be installed.
         */
        if (!sun_path__sz || sun_path__sz > UNIX_PATH_MAX)
                return -EINVAL;

        addr = (struct sockaddr_un *)sa_kern->uaddr;
        memcpy(addr->sun_path, sun_path, sun_path__sz);
        sa_kern->uaddrlen = sun_path__sz + offsetof(struct sockaddr_un, sun_path);

        return 0;
}

/*
 * Allocate a TCP request socket described by @attrs for listener @sk and
 * attach it to the skb behind @s.
 *
 * Return:
 * * 0             - request socket allocated and installed as skb->sk
 * * -EINVAL       - bad @attrs__sz or non-zero reserved fields, skb not at tc
 *                   ingress, unsupported skb->protocol, @sk not a listening
 *                   non-MPTCP SOCK_STREAM socket, mss below the per-family
 *                   minimum, or a requested TCP option that is disabled by
 *                   sysctl or out of range
 * * -ENETUNREACH  - skb device and @sk belong to different network namespaces
 * * -ENOMEM       - request socket allocation failed
 * * -EOPNOTSUPP   - kernel built without CONFIG_SYN_COOKIES
 */
__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
                                        struct bpf_tcp_req_attrs *attrs, int attrs__sz)
{
#if IS_ENABLED(CONFIG_SYN_COOKIES)
        struct sk_buff *skb = (struct sk_buff *)s;
        const struct request_sock_ops *ops;
        struct inet_request_sock *ireq;
        struct tcp_request_sock *treq;
        struct request_sock *req;
        struct net *net;
        __u16 min_mss;
        u32 tsoff = 0;

        /* The attrs layout must match exactly and reserved fields stay zero. */
        if (attrs__sz != sizeof(*attrs) ||
            attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
                return -EINVAL;

        if (!skb_at_tc_ingress(skb))
                return -EINVAL;

        net = dev_net(skb->dev);
        if (net != sock_net(sk))
                return -ENETUNREACH;

        /* Pick the request sock ops and the minimum acceptable MSS per family. */
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                ops = &tcp_request_sock_ops;
                min_mss = 536;
                break;
#if IS_BUILTIN(CONFIG_IPV6)
        case htons(ETH_P_IPV6):
                ops = &tcp6_request_sock_ops;
                min_mss = IPV6_MIN_MTU - 60;
                break;
#endif
        default:
                return -EINVAL;
        }

        if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
            sk_is_mptcp(sk))
                return -EINVAL;

        if (attrs->mss < min_mss)
                return -EINVAL;

        /* Each requested TCP option must be enabled in this netns. */
        if (attrs->wscale_ok) {
                if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
                        return -EINVAL;

                if (attrs->snd_wscale > TCP_MAX_WSCALE ||
                    attrs->rcv_wscale > TCP_MAX_WSCALE)
                        return -EINVAL;
        }

        if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
                return -EINVAL;

        if (attrs->tstamp_ok) {
                if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
                        return -EINVAL;

                /* Offset between the echoed timestamp and the current clock. */
                tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
        }

        req = inet_reqsk_alloc(ops, sk, false);
        if (!req)
                return -ENOMEM;

        ireq = inet_rsk(req);
        treq = tcp_rsk(req);

        req->rsk_listener = sk;
        req->syncookie = 1;
        req->mss = attrs->mss;
        req->ts_recent = attrs->rcv_tsval;

        ireq->snd_wscale = attrs->snd_wscale;
        ireq->rcv_wscale = attrs->rcv_wscale;
        ireq->tstamp_ok        = !!attrs->tstamp_ok;
        ireq->sack_ok = !!attrs->sack_ok;
        ireq->wscale_ok = !!attrs->wscale_ok;
        ireq->ecn_ok = !!attrs->ecn_ok;

        treq->req_usec_ts = !!attrs->usec_ts_ok;
        treq->ts_off = tsoff;

        /* Drop any previous owner, then make the request socket own the skb. */
        skb_orphan(skb);
        skb->sk = req_to_sk(req);
        skb->destructor = sock_pfree;

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Request BPF TX timestamping for the skb attached to a sock_ops context.
 *
 * Only valid from the BPF_SOCK_OPS_TSTAMP_SENDMSG_CB callback (-EOPNOTSUPP
 * otherwise). @flags is reserved and must be zero (-EINVAL otherwise).
 * Returns 0 on success.
 */
__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
                                              u64 flags)
{
        struct skb_shared_info *shinfo;
        struct sk_buff *skb;

        if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
                return -EOPNOTSUPP;
        if (flags)
                return -EINVAL;

        skb = skops->skb;
        shinfo = skb_shinfo(skb);

        shinfo->tx_flags |= SKBTX_BPF;
        TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF;
        /* The key is the sequence number of the last byte in this skb. */
        shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;

        return 0;
}

/**
 * bpf_xdp_pull_data() - Pull in non-linear xdp data.
 * @x: &xdp_md associated with the XDP buffer
 * @len: length of data to be made directly accessible in the linear part
 *
 * Pull in data in case the XDP buffer associated with @x is non-linear and
 * not all @len are in the linear data area.
 *
 * Direct packet access allows reading and writing linear XDP data through
 * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which
 * ends up in the linear part of the xdp_buff depends on the NIC and its
 * configuration. When a frag-capable XDP program wants to directly access
 * headers that may be in the non-linear area, call this kfunc to make sure
 * the data is available in the linear area. Alternatively, use dynptr or
 * bpf_xdp_{load,store}_bytes() to access data without pulling.
 *
 * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate
 * headers in the non-linear data area.
 *
 * A call to this kfunc may reduce headroom. If there is not enough tailroom
 * in the linear data area, metadata and data will be shifted down.
 *
 * A call to this kfunc is susceptible to change the buffer geometry.
 * Therefore, at load time, all checks on pointers previously done by the
 * verifier are invalidated and must be performed again, if the kfunc is used
 * in combination with direct packet access.
 *
 * Return:
 * * %0         - success
 * * %-EINVAL   - invalid len
 */
__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len)
{
        struct xdp_buff *xdp = (struct xdp_buff *)x;
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        int i, delta, shift, headroom, tailroom, n_frags_free = 0;
        void *data_hard_end = xdp_data_hard_end(xdp);
        int data_len = xdp->data_end - xdp->data;
        void *start;

        /* Nothing to do when the linear area already holds @len bytes. */
        if (len <= data_len)
                return 0;

        if (unlikely(len > xdp_get_buff_len(xdp)))
                return -EINVAL;

        /* The region that may need to move begins at the metadata, if any. */
        start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta;

        /* Usable headroom excludes a struct xdp_frame at the hard start. */
        headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame);
        tailroom = data_hard_end - xdp->data_end;

        /* delta is the number of bytes that must come out of the frags. */
        delta = len - data_len;
        if (unlikely(delta > tailroom + headroom))
                return -EINVAL;

        /* If tailroom alone is too small, slide metadata and data down. */
        shift = delta - tailroom;
        if (shift > 0) {
                memmove(start - shift, start, xdp->data_end - start);

                xdp->data_meta -= shift;
                xdp->data -= shift;
                xdp->data_end -= shift;
        }

        /* Append bytes from the leading frags to the end of the linear area. */
        for (i = 0; i < sinfo->nr_frags && delta; i++) {
                skb_frag_t *frag = &sinfo->frags[i];
                u32 shrink = min_t(u32, delta, skb_frag_size(frag));

                memcpy(xdp->data_end, skb_frag_address(frag), shrink);

                xdp->data_end += shrink;
                sinfo->xdp_frags_size -= shrink;
                delta -= shrink;
                /* NOTE(review): a true return presumably means the frag was
                 * fully consumed and released - confirm against
                 * bpf_xdp_shrink_data().
                 */
                if (bpf_xdp_shrink_data(xdp, frag, shrink, false))
                        n_frags_free++;
        }

        /* Close the gap left at the front of the frag array. */
        if (unlikely(n_frags_free)) {
                memmove(sinfo->frags, sinfo->frags + n_frags_free,
                        (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t));

                sinfo->nr_frags -= n_frags_free;

                /* With no frags left, clear the buffer's frag-related flags. */
                if (!sinfo->nr_frags) {
                        xdp_buff_clear_frags_flag(xdp);
                        xdp_buff_clear_frag_pfmemalloc(xdp);
                }
        }

        return 0;
}

__bpf_kfunc_end_defs();

/*
 * Same as bpf_dynptr_from_skb(), but the resulting dynptr is additionally
 * marked read-only. On failure the error from bpf_dynptr_from_skb() is
 * returned and the read-only flag is not applied.
 */
int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
                               struct bpf_dynptr *ptr__uninit)
{
        int err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);

        if (!err)
                bpf_dynptr_set_rdonly((struct bpf_dynptr_kern *)ptr__uninit);

        return err;
}

/* kfuncs for skb-based program types (registered in bpf_kfunc_init()). */
BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_skb)

/* skb metadata dynptr kfunc. */
BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)

/* kfuncs for XDP programs. */
BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
BTF_ID_FLAGS(func, bpf_xdp_pull_data)
BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)

/* kfuncs for cgroup sock_addr programs. */
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)

/* TCP request socket assignment kfunc. */
BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)

/* kfuncs for sock_ops programs. */
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)

/*
 * Registration descriptors: each one wraps the corresponding BTF kfunc ID
 * set defined above so it can be passed to register_btf_kfunc_id_set().
 */
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_skb,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_skb_meta,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_xdp,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_sock_addr,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_tcp_reqsk,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_sock_ops,
};

/*
 * Register every kfunc set of this file with the program types allowed to
 * call it. Runs as a late initcall.
 *
 * The "ret = ret ?: ..." chain stops registering at the first failure and
 * returns that error; 0 is returned only if all registrations succeed.
 */
static int __init bpf_kfunc_init(void)
{
        int ret;

        /* skb dynptr: all skb-based program types, netfilter and tracing. */
        ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
        /* skb metadata dynptr: tc classifier and action programs only. */
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
                                               &bpf_kfunc_set_sock_addr);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
        return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
}
late_initcall(bpf_kfunc_init);

__bpf_kfunc_start_defs();

/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
 *
 * The function expects a non-NULL pointer to a socket, and invokes the
 * protocol specific socket destroy handlers.
 *
 * The helper can only be called from BPF contexts that have acquired the socket
 * locks.
 *
 * Parameters:
 * @sock: Pointer to socket to be destroyed
 *
 * Return:
 * -EOPNOTSUPP if the protocol has no diag_destroy handler, or if the socket
 * is neither TCP nor UDP.
 * Otherwise the return value of the protocol specific diag_destroy handler
 * (0 on success).
 */
__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
{
        struct sock *sk = (struct sock *)sock;

        /* The locking semantics that allow for synchronous execution of the
         * destroy handlers are only supported for TCP and UDP.
         * Supporting protocols will need to acquire sock lock in the BPF context
         * prior to invoking this kfunc.
         */
        if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
                                           sk->sk_protocol != IPPROTO_UDP))
                return -EOPNOTSUPP;

        return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
}

__bpf_kfunc_end_defs();

/* kfuncs that tracing_iter_filter() restricts to BPF_TRACE_ITER programs. */
BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)

/*
 * kfunc filter for bpf_sk_iter_kfunc_set: a kfunc listed in
 * bpf_sk_iter_kfunc_ids may only be used by programs whose expected attach
 * type is BPF_TRACE_ITER (-EACCES otherwise). Other kfuncs pass through.
 */
static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
        if (!btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id))
                return 0;

        return prog->expected_attach_type == BPF_TRACE_ITER ? 0 : -EACCES;
}

/* Socket iterator kfunc set; access is gated by tracing_iter_filter(). */
static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
        .owner = THIS_MODULE,
        .set   = &bpf_sk_iter_kfunc_ids,
        .filter = tracing_iter_filter,
};

static int init_subsystem(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
}
late_initcall(init_subsystem);
















































   17 





































   17 






   17 












   17 





   17 
   17 


   17 










   17 





   17 
   17 



















   17 












































































































































































































































































   17 
   17 


    1 








    1 



    1 
    1 



    1 



    1 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Asynchronous Compression operations
 *
 * Copyright (c) 2016, Intel Corporation
 * Authors: Weigang Li <weigang.li@intel.com>
 *          Giovanni Cabiddu <giovanni.cabiddu@intel.com>
 */

#include <crypto/internal/acompress.h>
#include <crypto/scatterwalk.h>
#include <linux/cryptouser.h>
#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/scatterlist.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <net/netlink.h>

#include "compress.h"

struct crypto_scomp;

/* Bits stored in struct acomp_walk::flags. */
enum {
        /* The walk helpers may call cond_resched() between steps. */
        ACOMP_WALK_SLEEP = 1 << 0,
        /* Source is addressed as one flat buffer, not via scatterlist mapping. */
        ACOMP_WALK_SRC_LINEAR = 1 << 1,
        /* Destination is addressed as one flat buffer. */
        ACOMP_WALK_DST_LINEAR = 1 << 2,
};

static const struct crypto_type crypto_acomp_type;

static void acomp_reqchain_done(void *data, int err);

/* Map a generic crypto_alg back to the acomp_alg that embeds it. */
static inline struct acomp_alg *__crypto_acomp_alg(struct crypto_alg *alg)
{
        struct acomp_alg *acomp = container_of(alg, struct acomp_alg, calg.base);

        return acomp;
}

/* Return the acomp_alg backing an acomp transform. */
static inline struct acomp_alg *crypto_acomp_alg(struct crypto_acomp *tfm)
{
        struct crypto_alg *base = crypto_acomp_tfm(tfm)->__crt_alg;

        return __crypto_acomp_alg(base);
}

/*
 * Emit the CRYPTOCFGA_REPORT_ACOMP netlink attribute for an acomp algorithm.
 * Returns the result of nla_put().
 */
static int __maybe_unused crypto_acomp_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct crypto_report_acomp report;

        /* Zero the whole struct so the payload has no uninitialized bytes. */
        memset(&report, 0, sizeof(report));
        strscpy(report.type, "acomp", sizeof(report.type));

        return nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, sizeof(report), &report);
}

static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;

static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
{
        seq_puts(m, "type         : acomp\n");
}

/*
 * Transform teardown: run the algorithm's optional exit hook, then release
 * the fallback transform that async transforms carry.
 */
static void crypto_acomp_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_acomp *acomp = __crypto_acomp_tfm(tfm);
        struct acomp_alg *alg = crypto_acomp_alg(acomp);

        if (alg->exit)
                alg->exit(acomp);

        if (!acomp_is_async(acomp))
                return;

        crypto_free_acomp(crypto_acomp_fb(acomp));
}

/*
 * Transform setup for the acomp type.
 *
 * Algorithms that are not native acomp are handed to
 * crypto_init_scomp_ops_async(). For native async algorithms a fallback
 * transform is allocated first and stored in tfm->fb. Then the algorithm's
 * compress/decompress hooks and request size are copied into the transform
 * and the optional init hook is run.
 *
 * Returns 0 on success or a negative errno; on failure after the fallback
 * has been allocated, the fallback is freed again.
 */
static int crypto_acomp_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_acomp *acomp = __crypto_acomp_tfm(tfm);
        struct acomp_alg *alg = crypto_acomp_alg(acomp);
        struct crypto_acomp *fb = NULL;
        int err;

        if (tfm->__crt_alg->cra_type != &crypto_acomp_type)
                return crypto_init_scomp_ops_async(tfm);

        if (acomp_is_async(acomp)) {
                /* NOTE(review): type 0 with mask CRYPTO_ALG_ASYNC presumably
                 * selects a synchronous implementation of the same name.
                 */
                fb = crypto_alloc_acomp(crypto_acomp_alg_name(acomp), 0,
                                        CRYPTO_ALG_ASYNC);
                if (IS_ERR(fb))
                        return PTR_ERR(fb);

                /* The fallback's request must fit the sync request size cap. */
                err = -EINVAL;
                if (crypto_acomp_reqsize(fb) > MAX_SYNC_COMP_REQSIZE)
                        goto out_free_fb;

                tfm->fb = crypto_acomp_tfm(fb);
        }

        acomp->compress = alg->compress;
        acomp->decompress = alg->decompress;
        acomp->reqsize = alg->base.cra_reqsize;

        acomp->base.exit = crypto_acomp_exit_tfm;

        if (!alg->init)
                return 0;

        err = alg->init(acomp);
        if (err)
                goto out_free_fb;

        return 0;

out_free_fb:
        /* fb is still NULL here for non-async transforms; this relies on
         * crypto_free_acomp() accepting NULL - TODO confirm.
         */
        crypto_free_acomp(fb);
        return err;
}

/*
 * Extra context size for a transform of @alg. Algorithms that are not native
 * acomp need room for one additional struct crypto_scomp pointer.
 */
static unsigned int crypto_acomp_extsize(struct crypto_alg *alg)
{
        int size = crypto_alg_extsize(alg);

        if (alg->cra_type == &crypto_acomp_type)
                return size;

        size += sizeof(struct crypto_scomp *);
        return size;
}

/*
 * Type descriptor for asynchronous compression transforms: wires up the
 * size/init callbacks defined above. The show and report hooks are only
 * installed when procfs / the crypto user interface are configured.
 */
static const struct crypto_type crypto_acomp_type = {
        .extsize = crypto_acomp_extsize,
        .init_tfm = crypto_acomp_init_tfm,
#ifdef CONFIG_PROC_FS
        .show = crypto_acomp_show,
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_acomp_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_ACOMPRESS_MASK,
        .type = CRYPTO_ALG_TYPE_ACOMPRESS,
        /* Offsets of the embedded generic base within the acomp structures. */
        .tfmsize = offsetof(struct crypto_acomp, base),
        .algsize = offsetof(struct acomp_alg, base),
};

/*
 * Allocate an acomp transform for @alg_name, filtered by @type and @mask.
 * Returns whatever crypto_alloc_tfm() returns for the acomp type.
 */
struct crypto_acomp *crypto_alloc_acomp(const char *alg_name, u32 type,
                                        u32 mask)
{
        struct crypto_acomp *tfm;

        tfm = crypto_alloc_tfm(alg_name, &crypto_acomp_type, type, mask);
        return tfm;
}
EXPORT_SYMBOL_GPL(crypto_alloc_acomp);

/*
 * Variant of crypto_alloc_acomp() that forwards @node to
 * crypto_alloc_tfm_node().
 */
struct crypto_acomp *crypto_alloc_acomp_node(const char *alg_name, u32 type,
                                        u32 mask, int node)
{
        struct crypto_acomp *tfm;

        tfm = crypto_alloc_tfm_node(alg_name, &crypto_acomp_type, type, mask,
                                    node);
        return tfm;
}
EXPORT_SYMBOL_GPL(crypto_alloc_acomp_node);

/*
 * Stash the caller's completion callback and data in req->chain, then
 * install @cplt as the callback with &req->chain as its data pointer.
 */
static void acomp_save_req(struct acomp_req *req, crypto_completion_t cplt)
{
        struct acomp_req_chain *saved = &req->chain;

        saved->data = req->base.data;
        saved->compl = req->base.complete;

        req->base.data = saved;
        req->base.complete = cplt;
}

/*
 * Undo acomp_save_req(): req->base.data points at the saved chain state,
 * from which the original callback and data are put back.
 */
static void acomp_restore_req(struct acomp_req *req)
{
        struct acomp_req_chain *saved = req->base.data;

        req->base.data = saved->data;
        req->base.complete = saved->compl;
}

/*
 * Point the request back at the caller's virtual src/dst buffers recorded
 * in req->chain, for whichever side was flagged as virtual when saved.
 * The current slen/dlen of the request are carried over.
 */
static void acomp_reqchain_virt(struct acomp_req *req)
{
        struct acomp_req_chain *saved = &req->chain;
        unsigned int src_len = req->slen;
        unsigned int dst_len = req->dlen;

        if (saved->flags & CRYPTO_ACOMP_REQ_SRC_VIRT)
                acomp_request_set_src_dma(req, saved->src, src_len);

        if (saved->flags & CRYPTO_ACOMP_REQ_DST_VIRT)
                acomp_request_set_dst_dma(req, saved->dst, dst_len);
}

/*
 * Convert virtual src/dst buffers of a request into single-entry
 * scatterlists held in req->chain. The original pointers and the
 * "was virtual" flags are recorded so acomp_reqchain_virt() can undo it.
 */
static void acomp_virt_to_sg(struct acomp_req *req)
{
        struct acomp_req_chain *saved = &req->chain;
        unsigned int len;

        saved->flags = req->base.flags & (CRYPTO_ACOMP_REQ_SRC_VIRT |
                                          CRYPTO_ACOMP_REQ_DST_VIRT);

        if (acomp_request_src_isvirt(req)) {
                len = req->slen;
                saved->src = req->svirt;

                sg_init_one(&saved->ssg, saved->src, len);
                acomp_request_set_src_sg(req, &saved->ssg, len);
        }

        if (acomp_request_dst_isvirt(req)) {
                len = req->dlen;
                saved->dst = req->dvirt;

                sg_init_one(&saved->dsg, saved->dst, len);
                acomp_request_set_dst_sg(req, &saved->dsg, len);
        }
}

/*
 * Run a request through an on-stack fallback request and copy the
 * resulting dlen back into @req. @comp selects compress vs. decompress.
 */
static int acomp_do_nondma(struct acomp_req *req, bool comp)
{
        ACOMP_FBREQ_ON_STACK(fbreq, req);
        int err = comp ? crypto_acomp_compress(fbreq) :
                         crypto_acomp_decompress(fbreq);

        req->dlen = fbreq->dlen;
        return err;
}

/*
 * Execute one request: non-DMA requests go through the fallback path,
 * everything else has its virtual buffers turned into scatterlists and is
 * passed to the transform's compress or decompress hook.
 */
static int acomp_do_one_req(struct acomp_req *req, bool comp)
{
        struct crypto_acomp *tfm;

        if (acomp_request_isnondma(req))
                return acomp_do_nondma(req, comp);

        acomp_virt_to_sg(req);
        tfm = crypto_acomp_reqtfm(req);

        if (comp)
                return tfm->compress(req);

        return tfm->decompress(req);
}

/*
 * Finish a chained request: restore the caller's virtual buffers first,
 * then put back the original completion callback and data.
 * @err is passed through unchanged.
 */
static int acomp_reqchain_finish(struct acomp_req *req, int err)
{
        acomp_reqchain_virt(req);
        acomp_restore_req(req);
        return err;
}

static void acomp_reqchain_done(void *data, int err)
{
        struct acomp_req *req = data;
        crypto_completion_t compl;

        compl = req->chain.compl;
        data = req->chain.data;

        if (err == -EINPROGRESS)
                goto notify;

        err = acomp_reqchain_finish(req, err);

notify:
        compl(data, err);
}

/*
 * Run a request with acomp_reqchain_done() installed as its completion.
 * If the operation is queued or still in flight (-EBUSY / -EINPROGRESS)
 * the status is returned and cleanup is left to the callback; otherwise
 * the request is restored here before returning.
 */
static int acomp_do_req_chain(struct acomp_req *req, bool comp)
{
        int err;

        acomp_save_req(req, acomp_reqchain_done);
        err = acomp_do_one_req(req, comp);

        switch (err) {
        case -EBUSY:
        case -EINPROGRESS:
                return err;
        default:
                return acomp_reqchain_finish(req, err);
        }
}

int crypto_acomp_compress(struct acomp_req *req)
{
        struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);

        if (acomp_req_on_stack(req) && acomp_is_async(tfm))
                return -EAGAIN;
        if (crypto_acomp_req_virt(tfm) || acomp_request_issg(req))
                return crypto_acomp_reqtfm(req)->compress(req);
        return acomp_do_req_chain(req, true);
}
EXPORT_SYMBOL_GPL(crypto_acomp_compress);

int crypto_acomp_decompress(struct acomp_req *req)
{
        struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);

        if (acomp_req_on_stack(req) && acomp_is_async(tfm))
                return -EAGAIN;
        if (crypto_acomp_req_virt(tfm) || acomp_request_issg(req))
                return crypto_acomp_reqtfm(req)->decompress(req);
        return acomp_do_req_chain(req, false);
}
EXPORT_SYMBOL_GPL(crypto_acomp_decompress);

/* Clear the algorithm type bits in cra_flags so the caller can set its own. */
void comp_prepare_alg(struct comp_alg_common *alg)
{
        alg->base.cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
}

/*
 * Register one acomp algorithm: tag it with the acomp type descriptor and
 * type flag, then hand it to crypto_register_alg(), whose result is returned.
 */
int crypto_register_acomp(struct acomp_alg *alg)
{
        struct crypto_alg *base = &alg->calg.base;

        /* Type bits are cleared first, then set to ACOMPRESS. */
        comp_prepare_alg(&alg->calg);
        base->cra_flags |= CRYPTO_ALG_TYPE_ACOMPRESS;
        base->cra_type = &crypto_acomp_type;

        return crypto_register_alg(base);
}
EXPORT_SYMBOL_GPL(crypto_register_acomp);

/* Unregister an acomp algorithm by passing its base to crypto_unregister_alg(). */
void crypto_unregister_acomp(struct acomp_alg *alg)
{
        crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_acomp);

/*
 * Register @count algorithms from @algs in order. If one registration
 * fails, the ones already registered are unregistered in reverse order and
 * the error is returned. Returns 0 when all succeed.
 */
int crypto_register_acomps(struct acomp_alg *algs, int count)
{
        int i, ret;

        for (i = 0; i < count; i++) {
                ret = crypto_register_acomp(&algs[i]);
                if (!ret)
                        continue;

                /* Roll back entries [0, i) from the most recent one down. */
                while (--i >= 0)
                        crypto_unregister_acomp(&algs[i]);

                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_acomps);

/* Unregister @count algorithms from @algs, last entry first. */
void crypto_unregister_acomps(struct acomp_alg *algs, int count)
{
        while (count-- > 0)
                crypto_unregister_acomp(&algs[count]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_acomps);

/*
 * Work function: allocate a context for every CPU flagged in stream_want
 * that does not have one yet, and clear the CPU's bit once it is installed.
 * Stops at the first allocation failure, leaving the remaining bits set.
 */
static void acomp_stream_workfn(struct work_struct *work)
{
        struct crypto_acomp_streams *s =
                container_of(work, struct crypto_acomp_streams, stream_work);
        struct crypto_acomp_stream __percpu *streams = s->streams;
        int cpu;

        for_each_cpu(cpu, &s->stream_want) {
                struct crypto_acomp_stream *ps = per_cpu_ptr(streams, cpu);
                void *new_ctx;

                if (ps->ctx)
                        continue;

                new_ctx = s->alloc_ctx();
                if (IS_ERR(new_ctx))
                        break;

                /* Publish the context under the per-stream lock. */
                spin_lock_bh(&ps->lock);
                ps->ctx = new_ctx;
                spin_unlock_bh(&ps->lock);

                cpumask_clear_cpu(cpu, &s->stream_want);
        }
}

/*
 * Tear down the per-CPU streams of @s: detach them from @s, cancel the
 * pending allocation work, free every context that was allocated and
 * release the per-CPU memory. Does nothing if no streams are set up.
 */
void crypto_acomp_free_streams(struct crypto_acomp_streams *s)
{
        struct crypto_acomp_stream __percpu *streams = s->streams;
        void (*free_ctx)(void *);
        int cpu;

        s->streams = NULL;
        if (!streams)
                return;

        cancel_work_sync(&s->stream_work);
        free_ctx = s->free_ctx;

        for_each_possible_cpu(cpu) {
                struct crypto_acomp_stream *ps = per_cpu_ptr(streams, cpu);

                if (ps->ctx)
                        free_ctx(ps->ctx);
        }

        free_percpu(streams);
}
EXPORT_SYMBOL_GPL(crypto_acomp_free_streams);

int crypto_acomp_alloc_streams(struct crypto_acomp_streams *s)
{
        struct crypto_acomp_stream __percpu *streams;
        struct crypto_acomp_stream *ps;
        unsigned int i;
        void *ctx;

        if (s->streams)
                return 0;

        streams = alloc_percpu(struct crypto_acomp_stream);
        if (!streams)
                return -ENOMEM;

        ctx = s->alloc_ctx();
        if (IS_ERR(ctx)) {
                free_percpu(streams);
                return PTR_ERR(ctx);
        }

        i = cpumask_first(cpu_possible_mask);
        ps = per_cpu_ptr(streams, i);
        ps->ctx = ctx;

        for_each_possible_cpu(i) {
                ps = per_cpu_ptr(streams, i);
                spin_lock_init(&ps->lock);
        }

        s->streams = streams;

        INIT_WORK(&s->stream_work, acomp_stream_workfn);
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_acomp_alloc_streams);

/*
 * Lock and return a stream for the current CPU, with bottom halves disabled.
 *
 * If this CPU's stream has no context yet, the CPU is flagged in stream_want,
 * the allocation work is scheduled, and the stream of the first possible CPU
 * is locked and returned instead.
 */
struct crypto_acomp_stream *crypto_acomp_lock_stream_bh(
        struct crypto_acomp_streams *s) __acquires(stream)
{
        struct crypto_acomp_stream __percpu *streams = s->streams;
        int cpu = raw_smp_processor_id();
        struct crypto_acomp_stream *ps;

        ps = per_cpu_ptr(streams, cpu);
        spin_lock_bh(&ps->lock);
        if (likely(ps->ctx))
                return ps;
        /* Plain unlock: BHs stay disabled from spin_lock_bh() above, so the
         * fallback lock below is taken without re-disabling them.
         */
        spin_unlock(&ps->lock);

        cpumask_set_cpu(cpu, &s->stream_want);
        schedule_work(&s->stream_work);

        ps = per_cpu_ptr(streams, cpumask_first(cpu_possible_mask));
        spin_lock(&ps->lock);
        return ps;
}
EXPORT_SYMBOL_GPL(crypto_acomp_lock_stream_bh);

void acomp_walk_done_src(struct acomp_walk *walk, int used)
{
        walk->slen -= used;
        if ((walk->flags & ACOMP_WALK_SRC_LINEAR))
                scatterwalk_advance(&walk->in, used);
        else
                scatterwalk_done_src(&walk->in, used);

        if ((walk->flags & ACOMP_WALK_SLEEP))
                cond_resched();
}
EXPORT_SYMBOL_GPL(acomp_walk_done_src);

/*
 * Account for @used destination bytes having been produced: shrink the
 * remaining destination length and move the output walk forward.  Walks
 * flagged ACOMP_WALK_SLEEP call cond_resched() afterwards.
 */
void acomp_walk_done_dst(struct acomp_walk *walk, int used)
{
        walk->dlen -= used;

        if (!(walk->flags & ACOMP_WALK_DST_LINEAR))
                scatterwalk_done_dst(&walk->out, used);
        else
                scatterwalk_advance(&walk->out, used);

        if (walk->flags & ACOMP_WALK_SLEEP)
                cond_resched();
}
EXPORT_SYMBOL_GPL(acomp_walk_done_dst);

/*
 * Make the next piece of source data available and return its length.
 *
 * For a linear source the address is derived from walk->in.sg plus the
 * current offset and the length is the remaining source length, capped at
 * PAGE_SIZE when the walk may sleep and the preemption model is not
 * preemptible.  Otherwise the scatterlist walk supplies the next piece,
 * or 0 is returned when nothing is left.
 */
int acomp_walk_next_src(struct acomp_walk *walk)
{
        unsigned int remaining = walk->slen;
        unsigned int limit = UINT_MAX;

        if (!preempt_model_preemptible() && (walk->flags & ACOMP_WALK_SLEEP))
                limit = PAGE_SIZE;

        if (!(walk->flags & ACOMP_WALK_SRC_LINEAR)) {
                if (!remaining)
                        return 0;
                return scatterwalk_next(&walk->in, remaining);
        }

        walk->in.__addr = (void *)(((u8 *)walk->in.sg) + walk->in.offset);
        return min(remaining, limit);
}
EXPORT_SYMBOL_GPL(acomp_walk_next_src);

/*
 * Make the next piece of destination space available and return its
 * length.
 *
 * For a linear destination the address is derived from walk->out.sg plus
 * the current offset and the length is the remaining destination length,
 * capped at PAGE_SIZE when the walk may sleep and the preemption model is
 * not preemptible.  Otherwise the scatterlist walk supplies the next
 * piece, or 0 is returned when nothing is left.
 */
int acomp_walk_next_dst(struct acomp_walk *walk)
{
        unsigned int remaining = walk->dlen;
        unsigned int limit = UINT_MAX;

        if (!preempt_model_preemptible() && (walk->flags & ACOMP_WALK_SLEEP))
                limit = PAGE_SIZE;

        if (!(walk->flags & ACOMP_WALK_DST_LINEAR)) {
                if (!remaining)
                        return 0;
                return scatterwalk_next(&walk->out, remaining);
        }

        walk->out.__addr = (void *)(((u8 *)walk->out.sg) + walk->out.offset);
        return min(remaining, limit);
}
EXPORT_SYMBOL_GPL(acomp_walk_next_dst);

/*
 * Initialise @walk from @req.
 *
 * Copies the source and destination lengths, rejects a request where
 * either is zero, and derives the walk flags from the request flags:
 * sleeping is allowed only if the request permits it and @atomic is
 * false; a side is marked linear when the request carries a virtual
 * address for it.  Linear sides start at offset 0 of that address, the
 * others start a scatterlist walk.
 *
 * Returns 0 on success or -EINVAL for a zero-length source or destination.
 */
int acomp_walk_virt(struct acomp_walk *__restrict walk,
                    struct acomp_req *__restrict req, bool atomic)
{
        struct scatterlist *sg_in = req->src;
        struct scatterlist *sg_out = req->dst;

        walk->slen = req->slen;
        walk->dlen = req->dlen;
        if (!walk->slen || !walk->dlen)
                return -EINVAL;

        walk->flags = 0;
        if ((req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) && !atomic)
                walk->flags |= ACOMP_WALK_SLEEP;

        /* Source side. */
        if (req->base.flags & CRYPTO_ACOMP_REQ_SRC_VIRT) {
                walk->flags |= ACOMP_WALK_SRC_LINEAR;
                walk->in.sg = (void *)req->svirt;
                walk->in.offset = 0;
        } else {
                scatterwalk_start(&walk->in, sg_in);
        }

        /* Destination side. */
        if (req->base.flags & CRYPTO_ACOMP_REQ_DST_VIRT) {
                walk->flags |= ACOMP_WALK_DST_LINEAR;
                walk->out.sg = (void *)req->dvirt;
                walk->out.offset = 0;
        } else {
                scatterwalk_start(&walk->out, sg_out);
        }

        return 0;
}
EXPORT_SYMBOL_GPL(acomp_walk_virt);

/*
 * Clone @req through crypto_request_clone().
 *
 * If the generic helper hands back the original request it is returned
 * unchanged.  Otherwise any src/dst pointer that referred to the
 * original's embedded chain scatterlist is redirected to the clone's own
 * embedded scatterlist.
 */
struct acomp_req *acomp_request_clone(struct acomp_req *req,
                                      size_t total, gfp_t gfp)
{
        struct acomp_req *nreq;

        nreq = container_of(crypto_request_clone(&req->base, total, gfp),
                            struct acomp_req, base);

        if (nreq != req) {
                /* Embedded scatterlists must not point back into @req. */
                if (req->src == &req->chain.ssg)
                        nreq->src = &nreq->chain.ssg;
                if (req->dst == &req->chain.dsg)
                        nreq->dst = &nreq->chain.dsg;
        }

        return nreq;
}
EXPORT_SYMBOL_GPL(acomp_request_clone);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Asynchronous compression type");



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 
    2 






    2 







    1 








    2 

    1 















    4 
    1 



    1 





    3 



    3 






    4 






    4 

    1 
    4 







    4 







    1 



    1 


    1 








    1 




    3 
    3 




    4 


    4 
    3 







    7 

    2 

    5 
    1 

    1 



    6 
    6 

    3 

    1 

    2 



    4 


    1 

    1 
















    3 



    7 
    2 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "IPsec: " fmt

#include <crypto/aead.h>
#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
#include <linux/scatterlist.h>
#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/in6.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/udp.h>
#include <net/tcp.h>
#include <net/espintcp.h>
#include <linux/skbuff_ref.h>

#include <linux/highmem.h>

/*
 * ESP view of skb->cb (accessed through ESP_SKB_CB()).  The xfrm control
 * block comes first; @tmp carries the scratch buffer pointer that the
 * output path stores before starting encryption and that the completion
 * handler reads back and frees.
 */
struct esp_skb_cb {
        struct xfrm_skb_cb xfrm;
        void *tmp;
};

/*
 * Extra output state placed at the start of the scratch buffer when ESN
 * is enabled:
 * @seqhi:   the 32-bit word found where the shifted ESP header's spi
 *           field lands; saved so it can be written back afterwards.
 * @esphoff: offset of the ESP header from the transport header.
 */
struct esp_output_extra {
        __be32 seqhi;
        u32 esphoff;
};

/* Interpret the start of skb->cb as a struct esp_skb_cb. */
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))

/*
 * Allocate an AEAD request structure with extra space for SG and IV.
 *
 * For alignment considerations the IV is placed at the front, followed
 * by the request and finally the SG list.
 *
 * TODO: Use spare space in skb for this where possible.
 */
static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int extralen)
{
        /* Leading area: caller's extra bytes followed by the IV. */
        unsigned int size = extralen;

        size += crypto_aead_ivsize(aead);

        /* Leave slack so the IV and the request can each be aligned. */
        if (size) {
                size += crypto_aead_alignmask(aead) &
                        ~(crypto_tfm_ctx_alignment() - 1);
                size = ALIGN(size, crypto_tfm_ctx_alignment());
        }

        /* The AEAD request and its private context. */
        size += sizeof(struct aead_request) + crypto_aead_reqsize(aead);

        /* Finally @nfrags scatterlist entries, suitably aligned. */
        size = ALIGN(size, __alignof__(struct scatterlist));
        size += sizeof(struct scatterlist) * nfrags;

        return kmalloc(size, GFP_ATOMIC);
}

/* Locate the struct esp_output_extra area inside scratch buffer @tmp. */
static inline void *esp_tmp_extra(void *tmp)
{
        void *extra = PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));

        return extra;
}

/*
 * Locate the IV inside scratch buffer @tmp, @extralen bytes in.  The
 * address is aligned to the cipher's alignmask only when the cipher
 * actually has an IV.
 */
static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int extralen)
{
        u8 *base = (u8 *)tmp + extralen;

        if (!crypto_aead_ivsize(aead))
                return base;

        return PTR_ALIGN(base, crypto_aead_alignmask(aead) + 1);
}

/*
 * Locate the AEAD request that follows the IV in the scratch buffer and
 * bind it to @aead.  Note that this sets the request's tfm every time it
 * is called.
 */
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
{
        u8 *end_of_iv = iv + crypto_aead_ivsize(aead);
        struct aead_request *req;

        req = (void *)PTR_ALIGN(end_of_iv, crypto_tfm_ctx_alignment());
        aead_request_set_tfm(req, aead);

        return req;
}

/*
 * Locate the scatterlist array that follows the AEAD request and its
 * private context in the scratch buffer.
 */
static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
                                             struct aead_request *req)
{
        unsigned long addr = (unsigned long)(req + 1) +
                             crypto_aead_reqsize(aead);

        return (void *)ALIGN(addr, __alignof__(struct scatterlist));
}

/*
 * Drop the page references held through the source scatterlist of the
 * request stored in scratch buffer @tmp.  Nothing is done when source and
 * destination are the same list.
 */
static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
{
        struct crypto_aead *aead = x->data;
        struct aead_request *req;
        struct scatterlist *sg;
        int extralen;

        /* The extra area exists only for ESN states. */
        extralen = (x->props.flags & XFRM_STATE_ESN) ?
                   sizeof(struct esp_output_extra) : 0;

        req = esp_tmp_req(aead, esp_tmp_iv(aead, tmp, extralen));

        if (req->src == req->dst)
                return;

        /* The first entry comes from skb->data, so start at the second. */
        for (sg = sg_next(req->src); sg; sg = sg_next(sg))
                skb_page_unref(page_to_netmem(sg_page(sg)),
                               skb->pp_recycle);
}

#ifdef CONFIG_INET_ESPINTCP
/*
 * Look up the established TCP socket matching the state's encapsulation
 * ports and addresses.
 *
 * Returns the socket with a reference held, ERR_PTR(-ENOENT) if no such
 * socket exists, or ERR_PTR(-EINVAL) if the socket found is not an
 * ESP-in-TCP ULP socket (its reference is dropped in that case).
 */
static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
{
        struct xfrm_encap_tmpl *encap = x->encap;
        struct net *net = xs_net(x);
        __be16 sport, dport;
        struct sock *sk;

        /* Read both ports under the state lock. */
        spin_lock_bh(&x->lock);
        sport = encap->encap_sport;
        dport = encap->encap_dport;
        spin_unlock_bh(&x->lock);

        sk = inet_lookup_established(net, x->id.daddr.a4, dport,
                                     x->props.saddr.a4, sport, 0);
        if (!sk)
                return ERR_PTR(-ENOENT);

        if (tcp_is_ulp_esp(sk))
                return sk;

        sock_put(sk);
        return ERR_PTR(-EINVAL);
}

/*
 * Hand an encrypted skb to the ESP-in-TCP socket of state @x.
 *
 * If the socket cannot be found the skb is freed and the lookup error is
 * returned.  Otherwise the skb is queued when the socket is owned by user
 * context and pushed directly when it is not.
 */
static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
{
        struct sock *sk;
        int err;

        rcu_read_lock();

        sk = esp_find_tcp_sk(x);
        err = PTR_ERR_OR_ZERO(sk);
        if (!err) {
                bh_lock_sock(sk);
                err = sock_owned_by_user(sk) ? espintcp_queue_out(sk, skb) :
                                               espintcp_push_skb(sk, skb);
                bh_unlock_sock(sk);

                sock_put(sk);
        } else {
                kfree_skb(skb);
        }

        rcu_read_unlock();
        return err;
}

static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
                                   struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct xfrm_state *x = dst->xfrm;

        return esp_output_tcp_finish(x, skb);
}

/*
 * Queue @skb for ESP-in-TCP transmission with bottom halves disabled.
 *
 * Returns the queueing error if there is one, otherwise -EINPROGRESS.
 */
static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
{
        int err;

        local_bh_disable();
        err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
        local_bh_enable();

        if (err)
                return err;

        /* EINPROGRESS happens to do the right thing here: it tells the
         * caller that the skb has been consumed and will not come back.
         */
        return -EINPROGRESS;
}
#else
/*
 * Variant built when CONFIG_INET_ESPINTCP is not set: emits a warning and
 * returns -EOPNOTSUPP without touching @x or @skb.
 */
static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
{
        WARN_ON(1);
        return -EOPNOTSUPP;
}
#endif

static void esp_output_done(void *data, int err)
{
        struct sk_buff *skb = data;
        struct xfrm_offload *xo = xfrm_offload(skb);
        void *tmp;
        struct xfrm_state *x;

        if (xo && (xo->flags & XFRM_DEV_RESUME)) {
                struct sec_path *sp = skb_sec_path(skb);

                x = sp->xvec[sp->len - 1];
        } else {
                x = skb_dst(skb)->xfrm;
        }

        tmp = ESP_SKB_CB(skb)->tmp;
        esp_ssg_unref(x, tmp, skb);
        kfree(tmp);

        if (xo && (xo->flags & XFRM_DEV_RESUME)) {
                if (err) {
                        XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
                        kfree_skb(skb);
                        return;
                }

                skb_push(skb, skb->data - skb_mac_header(skb));
                secpath_reset(skb);
                xfrm_dev_resume(skb);
        } else {
                if (!err &&
                    x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
                        esp_output_tail_tcp(x, skb);
                else
                        xfrm_output_resume(skb_to_full_sk(skb), skb, err);
        }
}

/*
 * Put the ESP header back where it belongs: the word currently in the spi
 * field moves into seq_no, and spi is refilled from the word saved at the
 * start of the scratch buffer.  @offset is relative to skb->data.
 */
static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
{
        struct ip_esp_hdr *esph = (void *)(skb->data + offset);
        __be32 *saved = esp_tmp_extra(ESP_SKB_CB(skb)->tmp);

        esph->seq_no = esph->spi;
        esph->spi = *saved;
}

/*
 * Output-side wrapper for esp_restore_header(): the header to fix up sits
 * four bytes before the ESP header offset recorded in the extra area.
 */
static void esp_output_restore_header(struct sk_buff *skb)
{
        struct esp_output_extra *extra = esp_tmp_extra(ESP_SKB_CB(skb)->tmp);
        unsigned int offset;

        offset = skb_transport_offset(skb) + extra->esphoff - sizeof(__be32);
        esp_restore_header(skb, offset);
}

/*
 * Prepare the ESP header for encryption and fill in @extra for ESN states.
 *
 * Returns the header pointer to use from here on: @esph itself without
 * ESN, or @esph moved back by four bytes with ESN.  In both cases the spi
 * field of the returned header is set from the state.
 */
static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
                                               struct xfrm_state *x,
                                               struct ip_esp_hdr *esph,
                                               struct esp_output_extra *extra)
{
        /* With ESN the header is shifted by 4 bytes so the high sequence
         * bits fit in; it is shifted back once encryption is done.
         */
        if (x->props.flags & XFRM_STATE_ESN) {
                struct xfrm_offload *xo = xfrm_offload(skb);
                __u32 seqhi = xo ? xo->seq.hi :
                                   XFRM_SKB_CB(skb)->seq.output.hi;

                extra->esphoff = (unsigned char *)esph -
                                 skb_transport_header(skb);

                esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);

                /* Keep the word about to be overwritten by the spi. */
                extra->seqhi = esph->spi;
                esph->seq_no = htonl(seqhi);
        }

        esph->spi = x->id.spi;

        return esph;
}

/*
 * Completion handler used for ESN states: move the ESP header back into
 * place first, then run the common completion handler.
 */
static void esp_output_done_esn(void *data, int err)
{
        esp_output_restore_header(data);
        esp_output_done(data, err);
}

/*
 * Write the UDP encapsulation header at esp->esph.
 *
 * Returns a pointer to where the ESP header goes (right after the UDP
 * header), or ERR_PTR(-EMSGSIZE) if the datagram plus an IPv4 header
 * would exceed IP_MAX_MTU.
 */
static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
                                               int encap_type,
                                               struct esp_info *esp,
                                               __be16 sport,
                                               __be16 dport)
{
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct udphdr *uh = (struct udphdr *)esp->esph;
        unsigned int len;

        len = skb->len + esp->tailen - skb_transport_offset(skb);
        if (len + sizeof(struct iphdr) > IP_MAX_MTU)
                return ERR_PTR(-EMSGSIZE);

        uh->source = sport;
        uh->dest = dport;
        uh->len = htons(len);
        uh->check = 0;

        /* With IPv4 ESP-in-UDP and a non-NULL xo the skb is on the crypto
         * offload data path, i.e. this function runs outside of the XFRM
         * stack.  The mac header does not point at the IPv4 protocol field
         * then, so it must be left alone.
         */
        if (!(xo && encap_type == UDP_ENCAP_ESPINUDP))
                *skb_mac_header(skb) = IPPROTO_UDP;

        return (struct ip_esp_hdr *)(uh + 1);
}

#ifdef CONFIG_INET_ESPINTCP
/*
 * Write the 16-bit length prefix used for ESP-in-TCP at esp->esph.
 *
 * The matching TCP socket must exist; it is looked up only as a check and
 * its reference is dropped again straight away.  Returns the ESP header
 * position after the length prefix, ERR_PTR(-EMSGSIZE) if the length
 * exceeds IP_MAX_MTU, or the lookup error.
 */
static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
                                                    struct sk_buff *skb,
                                                    struct esp_info *esp)
{
        __be16 *lenp = (void *)esp->esph;
        unsigned int len;
        struct sock *sk;

        len = skb->len + esp->tailen - skb_transport_offset(skb);
        if (len > IP_MAX_MTU)
                return ERR_PTR(-EMSGSIZE);

        rcu_read_lock();
        sk = esp_find_tcp_sk(x);
        rcu_read_unlock();

        if (IS_ERR(sk))
                return ERR_CAST(sk);

        sock_put(sk);

        *lenp = htons(len);

        return (struct ip_esp_hdr *)(lenp + 1);
}
#else
/*
 * Variant built when CONFIG_INET_ESPINTCP is not set: TCP encapsulation is
 * unavailable, so always return ERR_PTR(-EOPNOTSUPP).
 */
static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
                                                    struct sk_buff *skb,
                                                    struct esp_info *esp)
{
        return ERR_PTR(-EOPNOTSUPP);
}
#endif

/*
 * Emit the encapsulation header selected by the state's encap template
 * and advance esp->esph past it.
 *
 * TCP_ENCAP_ESPINTCP uses the TCP variant; every other type, including
 * UDP_ENCAP_ESPINUDP, uses the UDP variant.  Returns 0 on success or the
 * negative error reported by the encapsulation helper.
 */
static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
                            struct esp_info *esp)
{
        struct xfrm_encap_tmpl *encap = x->encap;
        struct ip_esp_hdr *esph;
        __be16 sport, dport;
        int encap_type;

        /* Snapshot the template fields under the state lock. */
        spin_lock_bh(&x->lock);
        sport = encap->encap_sport;
        dport = encap->encap_dport;
        encap_type = encap->encap_type;
        spin_unlock_bh(&x->lock);

        if (encap_type == TCP_ENCAP_ESPINTCP)
                esph = esp_output_tcp_encap(x, skb, esp);
        else
                esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);

        if (IS_ERR(esph))
                return PTR_ERR(esph);

        esp->esph = esph;

        return 0;
}

/*
 * esp_output_head - make room for the ESP trailer and fill it in.
 * @x:   xfrm state; supplies the optional encap template, the page frag
 *       used for out-of-place trailers, and the lock protecting it.
 * @skb: packet being transformed.
 * @esp: per-packet ESP parameters; esp->esph may be updated and
 *       esp->inplace is cleared when the trailer goes into a page frag.
 *
 * Three strategies are tried in order:
 *  1. the trailer fits in the tailroom of an uncloned skb;
 *  2. the uncloned skb has a free frag slot and no frag list, so the
 *     trailer is written to a fresh page fragment appended to the skb;
 *  3. fall back to skb_cow_data().
 *
 * Return: the number of fragments reported for the scatterlist on
 * success, or a negative error from the encapsulation step or from
 * skb_cow_data().
 */
int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
        u8 *tail;
        int nfrags;
        int esph_offset;
        struct page *page;
        struct sk_buff *trailer;
        int tailen = esp->tailen;

        /* this is non-NULL only with TCP/UDP Encapsulation */
        if (x->encap) {
                int err = esp_output_encap(x, skb, esp);

                if (err < 0)
                        return err;
        }

        /* Trailer or paged data larger than a page (after cache-line
         * rounding): go straight to the copy-on-write path.
         */
        if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
            ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
                goto cow;

        if (!skb_cloned(skb)) {
                if (tailen <= skb_tailroom(skb)) {
                        /* Strategy 1: trailer goes into the tailroom. */
                        nfrags = 1;
                        trailer = skb;
                        tail = skb_tail_pointer(trailer);

                        goto skip_cow;
                } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
                           && !skb_has_frag_list(skb)) {
                        /* Strategy 2: trailer goes into a new page frag. */
                        int allocsize;
                        struct sock *sk = skb->sk;
                        struct page_frag *pfrag = &x->xfrag;

                        esp->inplace = false;

                        allocsize = ALIGN(tailen, L1_CACHE_BYTES);

                        /* x->xfrag is used under the state lock. */
                        spin_lock_bh(&x->lock);

                        if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
                                spin_unlock_bh(&x->lock);
                                goto cow;
                        }

                        page = pfrag->page;
                        get_page(page);

                        tail = page_address(page) + pfrag->offset;

                        esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);

                        nfrags = skb_shinfo(skb)->nr_frags;

                        __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
                                             tailen);
                        skb_shinfo(skb)->nr_frags = ++nfrags;

                        /* Consume the cache-line-rounded size from the frag. */
                        pfrag->offset = pfrag->offset + allocsize;

                        spin_unlock_bh(&x->lock);

                        /* One more than the skb's frag count is returned. */
                        nfrags++;

                        skb_len_add(skb, tailen);
                        if (sk && sk_fullsock(sk))
                                refcount_add(tailen, &sk->sk_wmem_alloc);

                        goto out;
                }
        }

cow:
        /* Strategy 3.  Remember the ESP header as an offset so that it can
         * be recomputed from the transport header after skb_cow_data().
         */
        esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);

        nfrags = skb_cow_data(skb, tailen, &trailer);
        if (nfrags < 0)
                goto out;
        tail = skb_tail_pointer(trailer);
        esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);

skip_cow:
        /* Shared by strategies 1 and 3: write the trailer at @tail and
         * extend the skb by its length.
         */
        esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
        pskb_put(skb, trailer, tailen);

out:
        return nfrags;
}
EXPORT_SYMBOL_GPL(esp_output_head);

/*
 * esp_output_tail - build the AEAD request for an outgoing packet and
 * encrypt it.
 * @x:   xfrm state; x->data is the AEAD transform.
 * @skb: packet with ESP header room and trailer already in place.
 * @esp: per-packet ESP parameters produced by the head step.
 *
 * A scratch buffer holding the optional ESN extra area, the IV, the AEAD
 * request and the scatterlists is allocated and its address stored in the
 * skb control block.  When esp->inplace is false the skb's page frags are
 * replaced by a fresh page fragment that receives the ciphertext, so a
 * separate destination scatterlist is built.
 *
 * Return: 0 on synchronous success; -EINPROGRESS when the skb has been
 * handed off (asynchronous crypto, in which case the scratch buffer is
 * not freed here, or successful ESP-in-TCP queueing); NET_XMIT_DROP when
 * the cipher reports -ENOSPC; otherwise a negative errno.
 */
int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
        u8 *iv;
        int alen;
        void *tmp;
        int ivlen;
        int assoclen;
        int extralen;
        struct page *page;
        struct ip_esp_hdr *esph;
        struct crypto_aead *aead;
        struct aead_request *req;
        struct scatterlist *sg, *dsg;
        struct esp_output_extra *extra;
        int err = -ENOMEM;

        assoclen = sizeof(struct ip_esp_hdr);
        extralen = 0;

        /* ESN adds the extra area and four more bytes of associated data. */
        if (x->props.flags & XFRM_STATE_ESN) {
                extralen += sizeof(*extra);
                assoclen += sizeof(__be32);
        }

        aead = x->data;
        alen = crypto_aead_authsize(aead);
        ivlen = crypto_aead_ivsize(aead);

        tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
        if (!tmp)
                goto error;

        extra = esp_tmp_extra(tmp);
        iv = esp_tmp_iv(aead, tmp, extralen);
        req = esp_tmp_req(aead, iv);
        sg = esp_req_sg(aead, req);

        /* Out-of-place: the destination list follows the source list. */
        if (esp->inplace)
                dsg = sg;
        else
                dsg = &sg[esp->nfrags];

        esph = esp_output_set_extra(skb, x, esp->esph, extra);
        esp->esph = esph;

        sg_init_table(sg, esp->nfrags);
        err = skb_to_sgvec(skb, sg,
                           (unsigned char *)esph - skb->data,
                           assoclen + ivlen + esp->clen + alen);
        if (unlikely(err < 0))
                goto error_free;

        if (!esp->inplace) {
                int allocsize;
                struct page_frag *pfrag = &x->xfrag;

                allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);

                spin_lock_bh(&x->lock);
                if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
                        spin_unlock_bh(&x->lock);
                        /* err still holds the positive entry count from
                         * skb_to_sgvec() above; report the allocation
                         * failure as a proper errno instead.
                         */
                        err = -ENOMEM;
                        goto error_free;
                }

                skb_shinfo(skb)->nr_frags = 1;

                page = pfrag->page;
                get_page(page);
                /* replace page frags in skb with new page */
                __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
                pfrag->offset = pfrag->offset + allocsize;
                spin_unlock_bh(&x->lock);

                sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
                err = skb_to_sgvec(skb, dsg,
                                   (unsigned char *)esph - skb->data,
                                   assoclen + ivlen + esp->clen + alen);
                if (unlikely(err < 0))
                        goto error_free;
        }

        /* The ESN completion handler also moves the header back. */
        if ((x->props.flags & XFRM_STATE_ESN))
                aead_request_set_callback(req, 0, esp_output_done_esn, skb);
        else
                aead_request_set_callback(req, 0, esp_output_done, skb);

        aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
        aead_request_set_ad(req, assoclen);

        /* IV: zero-filled, with the low-order bytes of the 64-bit
         * big-endian sequence number copied into its tail (at most 8).
         */
        memset(iv, 0, ivlen);
        memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
               min(ivlen, 8));

        ESP_SKB_CB(skb)->tmp = tmp;
        err = crypto_aead_encrypt(req);

        switch (err) {
        case -EINPROGRESS:
                /* The completion handler owns tmp now; do not free it. */
                goto error;

        case -ENOSPC:
                err = NET_XMIT_DROP;
                break;

        case 0:
                if ((x->props.flags & XFRM_STATE_ESN))
                        esp_output_restore_header(skb);
        }

        if (sg != dsg)
                esp_ssg_unref(x, tmp, skb);

        if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
                err = esp_output_tail_tcp(x, skb);

error_free:
        kfree(tmp);
error:
        return err;
}
EXPORT_SYMBOL_GPL(esp_output_tail);

/*
 * ESP output handler for software crypto.
 *
 * Saves the original protocol byte found at the mac header position and
 * replaces it with IPPROTO_ESP, computes TFC padding, ciphertext length,
 * pad length and trailer length, lets esp_output_head() make room for the
 * trailer, fills in SPI and sequence number, and hands the packet to
 * esp_output_tail() for encryption.
 *
 * Returns the result of esp_output_tail(), or the negative error from
 * esp_output_head().
 */
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
        struct crypto_aead *aead = x->data;
        struct esp_info esp;
        int blksize;
        int alen;

        esp.inplace = true;

        esp.proto = *skb_mac_header(skb);
        *skb_mac_header(skb) = IPPROTO_ESP;

        /* From here on the skb is pure payload to encrypt. */

        alen = crypto_aead_authsize(aead);

        /* Optional traffic flow confidentiality padding. */
        esp.tfclen = 0;
        if (x->tfcpad) {
                struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
                u32 padto = min(x->tfcpad,
                                xfrm_state_mtu(x, dst->child_mtu_cached));

                if (skb->len < padto)
                        esp.tfclen = padto - skb->len;
        }

        /* Payload + TFC + pad-length/next-header bytes, block aligned. */
        blksize = ALIGN(crypto_aead_blocksize(aead), 4);
        esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
        esp.plen = esp.clen - skb->len - esp.tfclen;
        esp.tailen = esp.tfclen + esp.plen + alen;

        esp.esph = ip_esp_hdr(skb);

        esp.nfrags = esp_output_head(x, skb, &esp);
        if (esp.nfrags < 0)
                return esp.nfrags;

        esp.esph->spi = x->id.spi;
        esp.esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);

        esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
                                 ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));

        skb_push(skb, -skb_network_offset(skb));

        return esp_output_tail(x, skb, &esp);
}

/*
 * Strip the ESP trailer (padding, pad length, next header and ICV) from a
 * decrypted packet.
 *
 * The pad length and next header bytes are read from just before the ICV.
 * If the claimed padding does not fit in the packet, -EINVAL is returned.
 * For CHECKSUM_COMPLETE packets the checksum of the removed bytes is
 * subtracted from skb->csum before trimming.
 *
 * Returns the next header value on success, -EINVAL for a bogus pad
 * length, or the error from pskb_trim().
 */
static inline int esp_remove_trailer(struct sk_buff *skb)
{
        struct xfrm_state *x = xfrm_input_state(skb);
        struct crypto_aead *aead = x->data;
        int alen = crypto_aead_authsize(aead);
        int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
        int elen = skb->len - hlen;
        int padlen, trimlen;
        u8 nexthdr[2];
        int ret;

        /* nexthdr[0] is the pad length, nexthdr[1] the next header. */
        if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
                BUG();

        padlen = nexthdr[0];
        if (padlen + 2 + alen >= elen) {
                net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
                                    padlen + 2, elen - alen);
                return -EINVAL;
        }

        trimlen = alen + padlen + 2;
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                __wsum csumdiff = skb_checksum(skb, skb->len - trimlen,
                                               trimlen, 0);

                skb->csum = csum_block_sub(skb->csum, csumdiff,
                                           skb->len - trimlen);
        }

        ret = pskb_trim(skb, skb->len - trimlen);
        if (unlikely(ret))
                return ret;

        return nexthdr[1];
}

/*
 * Second stage of inbound ESP processing, run once decryption has finished
 * (either synchronously from esp_input() or via the completion callback).
 *
 * @skb: packet being processed
 * @err: result of the decryption step; non-zero is passed straight back
 *
 * Frees the per-packet scratch buffer (unless the offload flags carry
 * CRYPTO_DONE), removes the ESP trailer, handles the NAT-T bookkeeping for
 * encapsulated SAs, pulls the ESP header + IV and positions the transport
 * header.
 *
 * Returns the non-negative value produced by esp_remove_trailer() on
 * success, or a negative errno.  A result equal to IPPROTO_NONE is turned
 * into -EINVAL so the packet is dropped.
 */
int esp_input_done2(struct sk_buff *skb, int err)
{
        const struct iphdr *iph;
        struct xfrm_state *x = xfrm_input_state(skb);
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct crypto_aead *aead = x->data;
        int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
        int ihl;

        /* The scratch buffer is released here on every path, success or not. */
        if (!xo || !(xo->flags & CRYPTO_DONE))
                kfree(ESP_SKB_CB(skb)->tmp);

        if (unlikely(err))
                goto out;

        /* On success err now holds the value returned by the trailer parser. */
        err = esp_remove_trailer(skb);
        if (unlikely(err < 0))
                goto out;

        iph = ip_hdr(skb);
        ihl = iph->ihl * 4;

        if (x->encap) {
                struct xfrm_encap_tmpl *encap = x->encap;
                /* Both views alias the bytes right after the IPv4 header. */
                struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);
                struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
                __be16 source;

                switch (x->encap->encap_type) {
                case TCP_ENCAP_ESPINTCP:
                        source = th->source;
                        break;
                case UDP_ENCAP_ESPINUDP:
                        source = uh->source;
                        break;
                default:
                        WARN_ON_ONCE(1);
                        err = -EINVAL;
                        goto out;
                }

                /*
                 * 1) if the NAT-T peer's IP or port changed then
                 *    advertise the change to the keying daemon.
                 *    This is an inbound SA, so just compare
                 *    SRC ports.
                 */
                if (iph->saddr != x->props.saddr.a4 ||
                    source != encap->encap_sport) {
                        xfrm_address_t ipaddr;

                        ipaddr.a4 = iph->saddr;
                        km_new_mapping(x, &ipaddr, source);

                        /* XXX: perhaps add an extra
                         * policy check here, to see
                         * if we should allow or
                         * reject a packet from a
                         * different source
                         * address/port.
                         */
                }

                /*
                 * 2) ignore UDP/TCP checksums in case
                 *    of NAT-T in Transport Mode, or
                 *    perform other post-processing fixes
                 *    as per draft-ietf-ipsec-udp-encaps-06,
                 *    section 3.1.2
                 */
                if (x->props.mode == XFRM_MODE_TRANSPORT)
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
        }

        /* Drop the ESP header and IV from the front of the packet. */
        skb_pull_rcsum(skb, hlen);
        if (x->props.mode == XFRM_MODE_TUNNEL ||
            x->props.mode == XFRM_MODE_IPTFS)
                skb_reset_transport_header(skb);
        else
                skb_set_transport_header(skb, -ihl);

        /* RFC4303: Drop dummy packets without any error */
        if (err == IPPROTO_NONE)
                err = -EINVAL;

out:
        return err;
}
EXPORT_SYMBOL_GPL(esp_input_done2);

/*
 * Completion callback for the decrypt request when ESN is not in use
 * (see esp_input()).  @data is the skb that was registered with the
 * request; the result of esp_input_done2() is handed to
 * xfrm_input_resume().
 */
static void esp_input_done(void *data, int err)
{
        struct sk_buff *skb = data;
        int status;

        status = esp_input_done2(skb, err);
        xfrm_input_resume(skb, status);
}

static void esp_input_restore_header(struct sk_buff *skb)
{
        esp_restore_header(skb, 0);
        __skb_pull(skb, 4);
}

/*
 * Prepare the ESP header for decryption when extended sequence numbers
 * are enabled on the SA; does nothing otherwise.
 *
 * With ESN the header is moved forward by 4 bytes to make room for the
 * high sequence bits.  The word that gets overwritten is saved through
 * @seqhi so it can be put back once decryption is done.
 */
static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
{
        struct xfrm_state *x = xfrm_input_state(skb);
        struct ip_esp_hdr *hdr;

        if (!(x->props.flags & XFRM_STATE_ESN))
                return;

        hdr = skb_push(skb, 4);

        /* Stash the displaced word, then shift SPI and sequence number up. */
        *seqhi = hdr->spi;
        hdr->spi = hdr->seq_no;
        hdr->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
}

/*
 * Completion callback for the decrypt request when ESN is in use
 * (see esp_input()): put the ESP header back the way it was before
 * continuing with the regular completion path.
 */
static void esp_input_done_esn(void *data, int err)
{
        struct sk_buff *skb = data;

        esp_input_restore_header(skb);
        esp_input_done(skb, err);
}

/*
 * Inbound handler of the ESP xfrm type: set up and submit the AEAD decrypt
 * request for @skb using the transform stored in x->data.
 *
 * Returns -EINPROGRESS when the request completes asynchronously (the
 * registered callback then finishes the packet), otherwise the result of
 * esp_input_done2() or a negative errno from the setup steps.
 *
 * Note: detecting truncated vs. non-truncated authentication data is very
 * expensive, so we only support truncated data, which is the recommended
 * and common case.
 */
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
        struct crypto_aead *aead = x->data;
        struct aead_request *req;
        struct sk_buff *trailer;
        int ivlen = crypto_aead_ivsize(aead);
        /* Bytes left after the ESP header and IV; validated below. */
        int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
        int nfrags;
        int assoclen;
        int seqhilen;
        __be32 *seqhi;
        void *tmp;
        u8 *iv;
        struct scatterlist *sg;
        int err = -EINVAL;

        if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
                goto out;

        if (elen <= 0)
                goto out;

        assoclen = sizeof(struct ip_esp_hdr);
        seqhilen = 0;

        /* With ESN the associated data grows by the high sequence word. */
        if (x->props.flags & XFRM_STATE_ESN) {
                seqhilen += sizeof(__be32);
                assoclen += seqhilen;
        }

        /*
         * An uncloned skb that is linear, or paged without a frag list, is
         * used as is: skb_cow_data() is skipped and the scatterlist size is
         * derived directly (1, or nr_frags + 1).
         */
        if (!skb_cloned(skb)) {
                if (!skb_is_nonlinear(skb)) {
                        nfrags = 1;

                        goto skip_cow;
                } else if (!skb_has_frag_list(skb)) {
                        nfrags = skb_shinfo(skb)->nr_frags;
                        nfrags++;

                        goto skip_cow;
                }
        }

        /* A non-negative return value is used as the scatterlist size. */
        err = skb_cow_data(skb, 0, &trailer);
        if (err < 0)
                goto out;

        nfrags = err;

skip_cow:
        err = -ENOMEM;
        tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
        if (!tmp)
                goto out;

        /*
         * One scratch allocation provides the saved seqhi word, the IV, the
         * request and the scatterlist.  It is remembered in the skb control
         * block; esp_input_done2() frees it from there.
         */
        ESP_SKB_CB(skb)->tmp = tmp;
        seqhi = esp_tmp_extra(tmp);
        iv = esp_tmp_iv(aead, tmp, seqhilen);
        req = esp_tmp_req(aead, iv);
        sg = esp_req_sg(aead, req);

        esp_input_set_header(skb, seqhi);

        sg_init_table(sg, nfrags);
        err = skb_to_sgvec(skb, sg, 0, skb->len);
        if (unlikely(err < 0)) {
                /* esp_input_done2() is not reached on this path; free here. */
                kfree(tmp);
                goto out;
        }

        skb->ip_summed = CHECKSUM_NONE;

        if ((x->props.flags & XFRM_STATE_ESN))
                aead_request_set_callback(req, 0, esp_input_done_esn, skb);
        else
                aead_request_set_callback(req, 0, esp_input_done, skb);

        /* Source and destination are the same scatterlist. */
        aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
        aead_request_set_ad(req, assoclen);

        err = crypto_aead_decrypt(req);
        if (err == -EINPROGRESS)
                goto out;

        /* Synchronous completion: do inline what the callback would do. */
        if ((x->props.flags & XFRM_STATE_ESN))
                esp_input_restore_header(skb);

        err = esp_input_done2(skb, err);

out:
        return err;
}

/*
 * ICMP error handler for ESP over IPv4.
 *
 * Only ICMP_DEST_UNREACH with code ICMP_FRAG_NEEDED and ICMP_REDIRECT are
 * acted upon, and only when an ESP state matching the destination address
 * and SPI found at skb->data exists.  The former updates the path MTU, the
 * latter applies the redirect.  Always returns 0.
 */
static int esp4_err(struct sk_buff *skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
        struct net *net = dev_net(skb->dev);
        int icmp_type = icmp_hdr(skb)->type;
        struct xfrm_state *x;

        if (icmp_type == ICMP_DEST_UNREACH) {
                if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
                        return 0;
        } else if (icmp_type != ICMP_REDIRECT) {
                return 0;
        }

        x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
                              esph->spi, IPPROTO_ESP, AF_INET);
        if (!x)
                return 0;

        if (icmp_type == ICMP_DEST_UNREACH)
                ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
        else
                ipv4_redirect(skb, net, 0, IPPROTO_ESP);

        /* Drop the reference taken by the lookup. */
        xfrm_state_put(x);

        return 0;
}

/*
 * Destructor of the ESP xfrm type: free the AEAD transform kept in
 * x->data, if one was ever set.
 */
static void esp_destroy(struct xfrm_state *x)
{
        struct crypto_aead *tfm = x->data;

        if (tfm)
                crypto_free_aead(tfm);
}

/*
 * Set up the transform for a state configured with a combined-mode
 * algorithm (x->aead).
 *
 * The algorithm name is built as "<geniv>(<alg_name>)".  The allocated
 * transform is stored in x->data before the key and ICV length are
 * applied, and it is left there if one of those later steps fails.
 *
 * Returns 0 on success, -ENAMETOOLONG if the composed name does not fit,
 * or the error from the crypto layer.  @extack receives a message on
 * failure.
 */
static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        char aead_name[CRYPTO_MAX_ALG_NAME];
        struct crypto_aead *tfm;
        int name_len;
        int err;

        name_len = snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
                            x->geniv, x->aead->alg_name);
        if (name_len >= CRYPTO_MAX_ALG_NAME) {
                NL_SET_ERR_MSG(extack, "Algorithm name is too long");
                return -ENAMETOOLONG;
        }

        tfm = crypto_alloc_aead(aead_name, 0, 0);
        if (IS_ERR(tfm)) {
                err = PTR_ERR(tfm);
                goto fail;
        }

        x->data = tfm;

        /* Key length is given in bits; round up to whole bytes. */
        err = crypto_aead_setkey(tfm, x->aead->alg_key,
                                 (x->aead->alg_key_len + 7) / 8);
        if (!err)
                err = crypto_aead_setauthsize(tfm, x->aead->alg_icv_len / 8);
        if (!err)
                return 0;

fail:
        NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
        return err;
}

/*
 * Set up the transform for a state configured with separate encryption
 * (x->ealg) and optional authentication (x->aalg) algorithms.
 *
 * The algorithm name has the form
 *   [<geniv>(]authenc(<auth>,<enc>)[)]      or, with XFRM_STATE_ESN,
 *   [<geniv>(]authencesn(<auth>,<enc>)[)]
 * where <auth> falls back to "digest_null" when no x->aalg is present.
 *
 * The allocated transform is stored in x->data and is not freed here when
 * a later step fails.
 * NOTE(review): presumably it is released through esp_destroy() - confirm
 * against the caller's error handling.
 *
 * Returns 0 on success or a negative errno; @extack receives a message on
 * most failure paths.
 */
static int esp_init_authenc(struct xfrm_state *x,
                            struct netlink_ext_ack *extack)
{
        struct crypto_aead *aead;
        struct crypto_authenc_key_param *param;
        struct rtattr *rta;
        char *key;
        char *p;
        char authenc_name[CRYPTO_MAX_ALG_NAME];
        unsigned int keylen;
        int err;

        err = -ENAMETOOLONG;

        if ((x->props.flags & XFRM_STATE_ESN)) {
                if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
                             "%s%sauthencesn(%s,%s)%s",
                             x->geniv ?: "", x->geniv ? "(" : "",
                             x->aalg ? x->aalg->alg_name : "digest_null",
                             x->ealg->alg_name,
                             x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
                        NL_SET_ERR_MSG(extack, "Algorithm name is too long");
                        goto error;
                }
        } else {
                if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
                             "%s%sauthenc(%s,%s)%s",
                             x->geniv ?: "", x->geniv ? "(" : "",
                             x->aalg ? x->aalg->alg_name : "digest_null",
                             x->ealg->alg_name,
                             x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
                        NL_SET_ERR_MSG(extack, "Algorithm name is too long");
                        goto error;
                }
        }

        aead = crypto_alloc_aead(authenc_name, 0, 0);
        err = PTR_ERR(aead);
        if (IS_ERR(aead)) {
                NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                goto error;
        }

        x->data = aead;

        /*
         * The key blob is: an rtattr carrying the key parameter, then the
         * authentication key (if any), then the encryption key.  Key
         * lengths are stored in bits and rounded up to bytes.
         */
        keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
                 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
        err = -ENOMEM;
        /* No extack message is set on this allocation failure. */
        key = kmalloc(keylen, GFP_KERNEL);
        if (!key)
                goto error;

        p = key;
        rta = (void *)p;
        rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
        rta->rta_len = RTA_LENGTH(sizeof(*param));
        param = RTA_DATA(rta);
        p += RTA_SPACE(sizeof(*param));

        if (x->aalg) {
                struct xfrm_algo_desc *aalg_desc;

                memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
                p += (x->aalg->alg_key_len + 7) / 8;

                aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
                BUG_ON(!aalg_desc);

                /*
                 * The transform's current auth size must match the full ICV
                 * size from the algorithm description before it is set to
                 * the configured truncation length.
                 */
                err = -EINVAL;
                if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
                    crypto_aead_authsize(aead)) {
                        NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                        goto free_key;
                }

                err = crypto_aead_setauthsize(
                        aead, x->aalg->alg_trunc_len / 8);
                if (err) {
                        NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                        goto free_key;
                }
        }

        param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
        memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);

        /* The setkey result is returned as is; no extack message here. */
        err = crypto_aead_setkey(aead, key, keylen);

free_key:
        /* The blob holds key material; free it with the sensitive variant. */
        kfree_sensitive(key);

error:
        return err;
}

/*
 * init_state hook of the ESP xfrm type.
 *
 * Creates the transform (combined-mode via esp_init_aead(), otherwise via
 * esp_init_authenc()) and then derives the header and trailer length
 * properties of the state from the transform, the mode and the optional
 * encapsulation.
 *
 * Returns 0 on success or a negative errno; @extack receives a message
 * when the configuration is rejected here.
 */
static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        struct crypto_aead *tfm;
        u32 blk_align;
        int rc;

        x->data = NULL;

        if (x->aead) {
                rc = esp_init_aead(x, extack);
        } else if (x->ealg) {
                rc = esp_init_authenc(x, extack);
        } else {
                NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
                return -EINVAL;
        }

        if (rc)
                return rc;

        tfm = x->data;

        /* ESP header plus IV, then whatever the mode adds on top. */
        x->props.header_len = sizeof(struct ip_esp_hdr) +
                              crypto_aead_ivsize(tfm);

        switch (x->props.mode) {
        case XFRM_MODE_TUNNEL:
                x->props.header_len += sizeof(struct iphdr);
                break;
        case XFRM_MODE_BEET:
                if (x->sel.family != AF_INET6)
                        x->props.header_len += IPV4_BEET_PHMAXLEN;
                break;
        default:
                break;
        }

        if (x->encap) {
                switch (x->encap->encap_type) {
                case UDP_ENCAP_ESPINUDP:
                        x->props.header_len += sizeof(struct udphdr);
                        break;
#ifdef CONFIG_INET_ESPINTCP
                case TCP_ENCAP_ESPINTCP:
                        /* only the length field, TCP encap is done by
                         * the socket
                         */
                        x->props.header_len += 2;
                        break;
#endif
                default:
                        NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
                        return -EINVAL;
                }
        }

        blk_align = ALIGN(crypto_aead_blocksize(tfm), 4);
        x->props.trailer_len = blk_align + 1 + crypto_aead_authsize(tfm);

        return 0;
}

/*
 * cb_handler of esp4_protocol.  Nothing to do for ESP: both arguments are
 * ignored and 0 is always returned.
 */
static int esp4_rcv_cb(struct sk_buff *skb, int err)
{
        return 0;
}

/*
 * xfrm type descriptor for IPPROTO_ESP; registered for AF_INET in
 * esp4_init().  Hooks up state setup/teardown and the per-packet input
 * and output handlers defined in this file.
 */
static const struct xfrm_type esp_type =
{
        .owner                = THIS_MODULE,
        .proto                     = IPPROTO_ESP,
        .flags                = XFRM_TYPE_REPLAY_PROT,
        .init_state        = esp_init_state,
        .destructor        = esp_destroy,
        .input                = esp_input,
        .output                = esp_output,
};

/*
 * IPv4 xfrm protocol hooks for ESP; registered for IPPROTO_ESP in
 * esp4_init() and removed again in esp4_fini().
 */
static struct xfrm4_protocol esp4_protocol = {
        .handler        =        xfrm4_rcv,
        .input_handler        =        xfrm_input,
        .cb_handler        =        esp4_rcv_cb,
        .err_handler        =        esp4_err,
        .priority        =        0,
};

/*
 * Module init: register the ESP xfrm type for AF_INET, then the IPv4
 * protocol hooks.  If the second step fails the type registration is
 * rolled back.  Returns 0 on success, -EAGAIN on either failure.
 */
static int __init esp4_init(void)
{
        if (xfrm_register_type(&esp_type, AF_INET) < 0) {
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }

        if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) >= 0)
                return 0;

        pr_info("%s: can't add protocol\n", __func__);
        xfrm_unregister_type(&esp_type, AF_INET);

        return -EAGAIN;
}

/*
 * Module exit: remove the IPv4 protocol hooks (a failure is only logged)
 * and then unregister the ESP xfrm type.
 */
static void __exit esp4_fini(void)
{
        int rc = xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP);

        if (rc < 0)
                pr_info("%s: can't remove protocol\n", __func__);

        xfrm_unregister_type(&esp_type, AF_INET);
}

/* Module entry/exit points and metadata. */
module_init(esp4_init);
module_exit(esp4_fini);
MODULE_DESCRIPTION("IPv4 ESP transformation library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP);




















































































































































































































































































































































   56 






   57 
   56 



   58 



   57 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   43 




   42 

































   44 

   44 






















































































   60 


   58 


















   60 













   60 







































































































































   58 






   60 






































   59 


























































   56 






   56 


   59 

   58 
















   19 


   18 

   19 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails.  Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
#include <linux/gcd.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
#include <linux/memory.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)        /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)        /* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)        /* Write-lock walked vmas */

/* Slab cache for struct mempolicy objects; see mpol_new() and __mpol_put(). */
static struct kmem_cache *policy_cache;
/*
 * NOTE(review): presumably the slab cache for shared-policy nodes ("sn");
 * it is not used in this part of the file - confirm against its users.
 */
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * weightiness balances the tradeoff between small weights (cycles through nodes
 * faster, more fair/even distribution) and large weights (smaller errors
 * between actual bandwidth ratios and weight ratios). 32 is a number that has
 * been found to perform at a reasonable compromise between the two goals.
 */
static const int weightiness = 32;

/*
 * Weight table used for weighted interleave, published to readers through
 * the RCU-protected wi_state pointer.
 *
 * A null weighted_interleave_state is interpreted as having .mode_auto = true,
 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
 */
struct weighted_interleave_state {
        bool mode_auto;                /* false means "manual mode": weights are not recomputed from bandwidth */
        u8 iw_table[];                /* per-node weight, indexed by node id */
};
static struct weighted_interleave_state __rcu *wi_state;
static unsigned int *node_bw_table;

/*
 * wi_state_lock protects both wi_state and node_bw_table.
 * node_bw_table is only used by writers to update wi_state.
 */
static DEFINE_MUTEX(wi_state_lock);

static u8 get_il_weight(int node)
{
        struct weighted_interleave_state *state;
        u8 weight = 1;

        rcu_read_lock();
        state = rcu_dereference(wi_state);
        if (state)
                weight = state->iw_table[node];
        rcu_read_unlock();
        return weight;
}

/*
 * Convert bandwidth values into weighted interleave weights.
 * Call with wi_state_lock.
 *
 * @bw:     per-node bandwidth values, indexed by node id.
 * @new_iw: output weight table, indexed by node id. Only the entries of
 *          N_MEMORY nodes are written by this function.
 */
static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
{
        u64 sum_bw = 0;
        unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
        int nid;

        /* Total bandwidth over all nodes that have memory. */
        for_each_node_state(nid, N_MEMORY)
                sum_bw += bw[nid];

        /* Scale bandwidths to whole numbers in the range [1, weightiness] */
        for_each_node_state(nid, N_MEMORY) {
                /*
                 * Try not to perform 64-bit division.
                 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
                 * If sum_bw >= scaling_factor, or the node has no bandwidth
                 * recorded, then round the weight up to 1.
                 *
                 * NOTE(review): the product below is evaluated in unsigned
                 * int and can wrap for very large bw[] values - confirm the
                 * expected range of the bandwidth inputs.
                 */
                scaling_factor = weightiness * bw[nid];
                if (bw[nid] && sum_bw < scaling_factor) {
                        cast_sum_bw = (unsigned int)sum_bw;
                        new_iw[nid] = scaling_factor / cast_sum_bw;
                } else {
                        new_iw[nid] = 1;
                }
                /* Seed the running GCD with the first weight, then fold in the rest. */
                if (!iw_gcd)
                        iw_gcd = new_iw[nid];
                iw_gcd = gcd(iw_gcd, new_iw[nid]);
        }

        /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
        for_each_node_state(nid, N_MEMORY)
                new_iw[nid] /= iw_gcd;
}

/*
 * mempolicy_set_node_perf - record a node's bandwidth and refresh auto weights
 * @node:   node id whose entry in the bandwidth table is updated
 * @coords: access coordinates; the smaller of read_bandwidth and
 *          write_bandwidth is the value stored for @node
 *
 * Replaces node_bw_table with an updated copy. Unless the current wi_state
 * is in manual mode, a new weight table is also derived from the bandwidth
 * data and published as wi_state.
 *
 * Return: 0 on success, -ENOMEM if either allocation fails.
 */
int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
{
        struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
        unsigned int *old_bw, *new_bw;
        unsigned int bw_val;
        int i;

        /* Both allocations are done up front, before wi_state_lock is taken. */
        bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
        new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
        if (!new_bw)
                return -ENOMEM;

        new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
                               GFP_KERNEL);
        if (!new_wi_state) {
                kfree(new_bw);
                return -ENOMEM;
        }
        /* Start from the defaults: auto mode, every weight 1. */
        new_wi_state->mode_auto = true;
        for (i = 0; i < nr_node_ids; i++)
                new_wi_state->iw_table[i] = 1;

        /*
         * Update bandwidth info, even in manual mode. That way, when switching
         * to auto mode in the future, iw_table can be overwritten using
         * accurate bw data.
         */
        mutex_lock(&wi_state_lock);

        /* Copy-and-replace the bandwidth table; the old one is freed at "out". */
        old_bw = node_bw_table;
        if (old_bw)
                memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
        /* NOTE(review): @node is not range-checked here; assumes node < nr_node_ids - confirm callers. */
        new_bw[node] = bw_val;
        node_bw_table = new_bw;

        old_wi_state = rcu_dereference_protected(wi_state,
                                        lockdep_is_held(&wi_state_lock));
        if (old_wi_state && !old_wi_state->mode_auto) {
                /* Manual mode; skip reducing weights and updating wi_state */
                mutex_unlock(&wi_state_lock);
                kfree(new_wi_state);
                goto out;
        }

        /* NULL wi_state assumes auto=true; reduce weights and update wi_state */
        reduce_interleave_weights(new_bw, new_wi_state->iw_table);
        rcu_assign_pointer(wi_state, new_wi_state);

        mutex_unlock(&wi_state_lock);
        /* Wait for RCU readers of the old state before freeing it. */
        if (old_wi_state) {
                synchronize_rcu();
                kfree(old_wi_state);
        }
out:
        kfree(old_bw);
        return 0;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Look up the closest node by distance if @node is not in @state.
 *
 * Return: this @node if it is in state, otherwise the closest node by distance
 */
int numa_nearest_node(int node, unsigned int state)
{
        int best_node, best_dist, cand, d;

        if (state >= NR_NODE_STATES)
                return -EINVAL;

        /* Nothing to search for: no node given, or @node already qualifies. */
        if (node == NUMA_NO_NODE)
                return node;
        if (node_state(node, state))
                return node;

        /* If no node is in @state at all, @node itself is returned. */
        best_node = node;
        best_dist = INT_MAX;
        for_each_node_state(cand, state) {
                d = node_distance(node, cand);
                if (d >= best_dist)
                        continue;
                best_dist = d;
                best_node = cand;
        }

        return best_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);

/**
 * nearest_node_nodemask - Find the node in @mask at the nearest distance
 *                           from @node.
 *
 * @node: a valid node ID to start the search from.
 * @mask: a pointer to a nodemask representing the allowed nodes.
 *
 * This function iterates over all nodes in @mask and calculates the
 * distance from the starting @node, then it returns the node ID that is
 * the closest to @node, or MAX_NUMNODES if no node is found.
 *
 * Note that @node must be a valid node ID usable with node_distance(),
 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
 * or unexpected behavior.
 */
int nearest_node_nodemask(int node, nodemask_t *mask)
{
        int closest = MAX_NUMNODES;
        int closest_dist = INT_MAX;
        int cand;

        for_each_node_mask(cand, *mask) {
                int d = node_distance(node, cand);

                /* Only a strictly smaller distance replaces the current pick. */
                if (d >= closest_dist)
                        continue;
                closest_dist = d;
                closest = cand;
        }

        return closest;
}
EXPORT_SYMBOL_GPL(nearest_node_nodemask);

/*
 * Return the memory policy to use for task @p: the task's own policy when
 * it has one, otherwise the preferred-node policy for the node reported by
 * numa_node_id() once that table has been set up, otherwise default_policy.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
        struct mempolicy *task_pol = p->mempolicy;
        int nid;

        if (task_pol)
                return task_pol;

        nid = numa_node_id();
        /* preferred_node_policy is not initialised early in boot */
        if (nid != NUMA_NO_NODE && preferred_node_policy[nid].mode)
                return &preferred_node_policy[nid];

        return &default_policy;
}

/*
 * Per-mode operations, indexed by policy mode.
 *
 * @create: install a nodemask into a newly allocated policy; returns 0 or a
 *          negative errno.
 * @rebind: adjust an existing policy to a new set of nodes.
 *
 * This is only a forward declaration of the table; its initializer appears
 * later in the file.
 */
static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/*
 * Non-zero when @pol has any of the MPOL_MODE_FLAGS bits set, in which case
 * mpol_set_nodemask() keeps the caller's nodemask in pol->w.user_nodemask.
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
        int mode_flags = pol->flags & MPOL_MODE_FLAGS;

        return mode_flags;
}

/*
 * Build *ret from *orig taken relative to *rel: *orig is first folded down
 * to the number of nodes set in *rel, then the folded mask is mapped onto
 * *rel.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                                   const nodemask_t *rel)
{
        nodemask_t folded;

        nodes_fold(folded, *orig, nodes_weight(*rel));
        nodes_onto(*ret, folded, *rel);
}

/*
 * create() operation for the nodemask-based modes: adopt @nodes as the
 * policy's node set. Returns -EINVAL for an empty mask, 0 otherwise.
 */
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        int err = -EINVAL;

        if (!nodes_empty(*nodes)) {
                pol->nodes = *nodes;
                err = 0;
        }

        return err;
}

/*
 * create() operation for MPOL_PREFERRED: the policy's node set becomes just
 * the first node of @nodes. Returns -EINVAL for an empty mask, 0 otherwise.
 */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
        int err = -EINVAL;

        if (!nodes_empty(*nodes)) {
                nodes_clear(pol->nodes);
                node_set(first_node(*nodes), pol->nodes);
                err = 0;
        }

        return err;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
        /*
         * A default (NULL) or local policy is never remapped and has no
         * constructor to run.
         */
        if (!pol)
                return 0;
        if (pol->mode == MPOL_LOCAL)
                return 0;

        /* mask1: nodes the current cpuset allows that are also in N_MEMORY. */
        nodes_and(nsc->mask1,
                  cpuset_current_mems_allowed, node_states[N_MEMORY]);

        VM_BUG_ON(!nodes);

        /* mask2: the requested nodes, restricted or made relative to mask1. */
        if (!(pol->flags & MPOL_F_RELATIVE_NODES))
                nodes_and(nsc->mask2, *nodes, nsc->mask1);
        else
                mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);

        /* pol->w remembers either the user's mask or the cpuset's mask. */
        if (!mpol_store_user_nodemask(pol))
                pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
        else
                pol->w.user_nodemask = *nodes;

        return mpol_ops[pol->mode].create(pol, &nsc->mask2);
}

/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                                  nodemask_t *nodes)
{
        const bool remap_flags = (flags & MPOL_F_STATIC_NODES) ||
                                 (flags & MPOL_F_RELATIVE_NODES);
        struct mempolicy *new;
        bool no_nodes;

        /* MPOL_DEFAULT is represented by a NULL policy and takes no nodes. */
        if (mode == MPOL_DEFAULT) {
                if (!nodes || nodes_empty(*nodes))
                        return NULL;
                return ERR_PTR(-EINVAL);
        }
        VM_BUG_ON(!nodes);

        no_nodes = nodes_empty(*nodes);
        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * An empty nodemask means local allocation, which cannot be
                 * combined with MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES.
                 */
                if (no_nodes) {
                        if (remap_flags)
                                return ERR_PTR(-EINVAL);
                        mode = MPOL_LOCAL;
                }
                break;
        case MPOL_LOCAL:
                /* Local allocation takes neither nodes nor those two flags. */
                if (!no_nodes || remap_flags)
                        return ERR_PTR(-EINVAL);
                break;
        default:
                /* All other modes require a non-empty nodemask. */
                if (no_nodes)
                        return ERR_PTR(-EINVAL);
                break;
        }

        new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);

        /* The caller receives the only reference. */
        atomic_set(&new->refcnt, 1);
        new->mode = mode;
        new->flags = flags;
        new->home_node = NUMA_NO_NODE;

        return new;
}

/*
 * Slow path of a mpol destructor: drop one reference and free the policy
 * back to policy_cache when that was the last one.
 */
void __mpol_put(struct mempolicy *pol)
{
        if (atomic_dec_and_test(&pol->refcnt))
                kmem_cache_free(policy_cache, pol);
}

/*
 * rebind() operation that intentionally does nothing; it is the entry used
 * for MPOL_DEFAULT and MPOL_LOCAL in mpol_ops.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

/*
 * rebind() operation for nodemask-based modes: recompute pol->nodes for the
 * new allowed set @nodes according to the policy's flags.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        nodemask_t rebound;

        if (pol->flags & MPOL_F_STATIC_NODES) {
                /* Static: the user's mask, limited to what is now allowed. */
                nodes_and(rebound, pol->w.user_nodemask, *nodes);
        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
                /* Relative: re-derive from the user's mask against @nodes. */
                mpol_relative_nodemask(&rebound, &pol->w.user_nodemask, nodes);
        } else {
                /* Neither: remap from the previous allowed set to the new one. */
                nodes_remap(rebound, pol->nodes, pol->w.cpuset_mems_allowed,
                            *nodes);
                pol->w.cpuset_mems_allowed = *nodes;
        }

        /* An empty result falls back to the whole of @nodes. */
        if (nodes_empty(rebound))
                pol->nodes = *nodes;
        else
                pol->nodes = rebound;
}

/*
 * rebind() operation used for MPOL_PREFERRED and MPOL_PREFERRED_MANY: only
 * the remembered cpuset mask is updated; pol->nodes is left untouched.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
                                  const nodemask_t *nodes)
{
        pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
        if (!pol || pol->mode == MPOL_LOCAL)
                return;
        if (!mpol_store_user_nodemask(pol) &&
            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
                return;

        mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        struct mempolicy *task_pol = tsk->mempolicy;

        mpol_rebind_policy(task_pol, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
        VMA_ITERATOR(iter, mm, 0);
        struct vm_area_struct *cur;

        mmap_write_lock(mm);
        for_each_vma(iter, cur) {
                /* Write-lock each VMA before its policy is rebound. */
                vma_start_write(cur);
                mpol_rebind_policy(cur->vm_policy, new);
        }
        mmap_write_unlock(mm);
}

/*
 * Per-mode create/rebind handlers, indexed by policy mode.
 *
 * MPOL_DEFAULT and MPOL_LOCAL have no .create handler: mpol_set_nodemask()
 * returns early for a NULL policy and for MPOL_LOCAL before .create would be
 * called.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_LOCAL] = {
                .rebind = mpol_rebind_default,
        },
        /* Keeps the full nodemask on create, but rebinds like MPOL_PREFERRED. */
        [MPOL_PREFERRED_MANY] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_WEIGHTED_INTERLEAVE] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
};

static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                                pgoff_t ilx, int *nid);

/*
 * True when @flags asks for MPOL_MF_STRICT with neither MPOL_MF_MOVE nor
 * MPOL_MF_MOVE_ALL.
 */
static bool strictly_unmovable(unsigned long flags)
{
        const unsigned long relevant =
                MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL;

        /*
         * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
         * if any misplaced page is found.
         */
        return (flags & relevant) == MPOL_MF_STRICT;
}

struct migration_mpol {                /* for alloc_migration_target_by_mpol() */
        struct mempolicy *pol;        /* policy to allocate the migration target under */
        pgoff_t ilx;                /* NOTE(review): presumably the interleave index handed to policy_nodemask() - confirm */
};

/*
 * State shared by the queue_folios_*() page-walk callbacks; they reach it
 * through walk->private.
 */
struct queue_pages {
        struct list_head *pagelist;        /* list passed to migrate_folio_add()/folio_isolate_hugetlb() */
        unsigned long flags;                /* MPOL_MF_* flags, including the internal ones */
        nodemask_t *nmask;                /* nodes tested by queue_folio_required() */
        unsigned long start;                /* NOTE(review): presumably start of the walked range - confirm */
        unsigned long end;                /* NOTE(review): presumably end of the walked range - confirm */
        struct vm_area_struct *first;        /* NOTE(review): presumably the first vma visited - confirm */
        struct folio *large;                /* note last large folio encountered */
        long nr_failed;                        /* could not be isolated at this time */
};

/*
 * Check if the folio's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
                                        struct queue_pages *qp)
{
        bool on_listed_node = node_isset(folio_nid(folio), *qp->nmask);
        bool want_listed = !(qp->flags & MPOL_MF_INVERT);

        /* With MPOL_MF_INVERT, folios on nodes outside the mask are wanted. */
        return on_listed_node == want_listed;
}

/*
 * Handle one huge PMD entry: count it as failed, skip it, or queue its
 * folio for migration. Called from queue_folios_pte_range() with the PMD
 * lock held.
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
        struct queue_pages *qp = walk->private;
        struct folio *folio;
        bool queued;

        /* An entry that is already a migration entry counts as a failure. */
        if (unlikely(is_pmd_migration_entry(*pmd))) {
                qp->nr_failed++;
                return;
        }

        folio = pmd_folio(*pmd);
        if (is_huge_zero_folio(folio)) {
                walk->action = ACTION_CONTINUE;
                return;
        }

        if (!queue_folio_required(folio, qp))
                return;

        /* It counts as failed unless a MOVE flag is set and it was queued. */
        queued = (qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
                 vma_migratable(walk->vma) &&
                 migrate_folio_add(folio, qp->pagelist, qp->flags);
        if (!queued)
                qp->nr_failed++;
}

/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
                        unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct folio *folio;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        pte_t *pte, *mapped_pte;
        pte_t ptent;
        spinlock_t *ptl;
        int max_nr, nr;

        /* A huge PMD is handled as a single entry while its lock is held. */
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                queue_folios_pmd(pmd, walk);
                spin_unlock(ptl);
                goto out;
        }

        /* No PTE table could be mapped: set ACTION_AGAIN for the walker. */
        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        if (!pte) {
                walk->action = ACTION_AGAIN;
                return 0;
        }
        /* nr is the number of PTEs consumed by the current iteration. */
        for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
                max_nr = (end - addr) >> PAGE_SHIFT;
                nr = 1;
                ptent = ptep_get(pte);
                if (pte_none(ptent))
                        continue;
                /* Non-present entries only count when they are migration entries. */
                if (!pte_present(ptent)) {
                        if (is_migration_entry(pte_to_swp_entry(ptent)))
                                qp->nr_failed++;
                        continue;
                }
                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;
                /* For a large folio, step over the whole batch of its PTEs. */
                if (folio_test_large(folio) && max_nr != 1)
                        nr = folio_pte_batch(folio, pte, ptent, max_nr);
                /*
                 * vm_normal_folio() filters out zero pages, but there might
                 * still be reserved folios to skip, perhaps in a VDSO.
                 */
                if (folio_test_reserved(folio))
                        continue;
                if (!queue_folio_required(folio, qp))
                        continue;
                if (folio_test_large(folio)) {
                        /*
                         * A large folio can only be isolated from LRU once,
                         * but may be mapped by many PTEs (and Copy-On-Write may
                         * intersperse PTEs of other, order 0, folios).  This is
                         * a common case, so don't mistake it for failure (but
                         * there can be other cases of multi-mapped pages which
                         * this quick check does not help to filter out - and a
                         * search of the pagelist might grow to be prohibitive).
                         *
                         * migrate_pages(&pagelist) returns nr_failed folios, so
                         * check "large" now so that queue_pages_range() returns
                         * a comparable nr_failed folios.  This does imply that
                         * if folio could not be isolated for some racy reason
                         * at its first PTE, later PTEs will not give it another
                         * chance of isolation; but keeps the accounting simple.
                         */
                        if (folio == qp->large)
                                continue;
                        qp->large = folio;
                }
                /*
                 * The folio counts as failed unless a MOVE flag is set, the
                 * vma is migratable and the folio was queued. nr_failed grows
                 * by nr, the number of PTEs covered by this iteration.
                 */
                if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
                    !vma_migratable(vma) ||
                    !migrate_folio_add(folio, qp->pagelist, flags)) {
                        qp->nr_failed += nr;
                        /* STRICT without MOVE: one failure is enough, stop scanning. */
                        if (strictly_unmovable(flags))
                                break;
                }
        }
        pte_unmap_unlock(mapped_pte, ptl);
        cond_resched();
out:
        /* Any recorded failure is fatal when only MPOL_MF_STRICT was requested. */
        if (qp->nr_failed && strictly_unmovable(flags))
                return -EIO;
        return 0;
}

/*
 * hugetlb_entry callback of the queue_pages walk.
 *
 * Looks at one hugetlb page table entry under its page table lock.  If the
 * folio it maps has to be queued (queue_folio_required()) and moving was
 * requested, try to isolate it onto qp->pagelist; every case that prevents
 * this, other than deliberately skipping a shared folio, bumps qp->nr_failed.
 *
 * Returns -EIO when qp->nr_failed is non-zero and strictly_unmovable(flags)
 * holds, otherwise 0.  Without CONFIG_HUGETLB_PAGE it always returns 0.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
                               unsigned long addr, unsigned long end,
                               struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        struct folio *folio;
        spinlock_t *lock;
        pte_t hpte;

        lock = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
        hpte = huge_ptep_get(walk->mm, addr, pte);

        if (!pte_present(hpte)) {
                /* A non-present migration entry is counted as a failure. */
                if (unlikely(is_hugetlb_entry_migration(hpte)))
                        qp->nr_failed++;
                goto unlock;
        }

        folio = pfn_folio(pte_pfn(hpte));
        if (!queue_folio_required(folio, qp))
                goto unlock;

        /* Needs queueing, but moving was not requested or is not possible. */
        if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
            !vma_migratable(walk->vma)) {
                qp->nr_failed++;
                goto unlock;
        }

        /*
         * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
         * Choosing not to migrate a shared folio is not counted as a failure.
         *
         * See folio_maybe_mapped_shared() on possible imprecision when we
         * cannot easily detect if a folio is shared.
         */
        if (!(flags & MPOL_MF_MOVE_ALL) &&
            (folio_maybe_mapped_shared(folio) || hugetlb_pmd_shared(pte)))
                goto unlock;

        if (!folio_isolate_hugetlb(folio, qp->pagelist))
                qp->nr_failed++;
unlock:
        spin_unlock(lock);
        if (qp->nr_failed && strictly_unmovable(flags))
                return -EIO;
#endif
        return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
{
        struct mmu_gather tlb;
        long nr_updated;

        tlb_gather_mmu(&tlb, vma->vm_mm);

        nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
        if (nr_updated > 0) {
                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
                count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
        }

        tlb_finish_mmu(&tlb);

        return nr_updated;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
                                struct mm_walk *walk)
{
        struct vm_area_struct *next, *vma = walk->vma;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;

        /* range check first */
        VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

        if (!qp->first) {
                qp->first = vma;
                if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                        (qp->start < vma->vm_start))
                        /* hole at head side of range */
                        return -EFAULT;
        }
        next = find_vma(vma->vm_mm, vma->vm_end);
        if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                ((vma->vm_end < qp->end) &&
                (!next || vma->vm_end < next->vm_start)))
                /* hole at middle or tail of range */
                return -EFAULT;

        /*
         * Need check MPOL_MF_STRICT to return -EIO if possible
         * regardless of vma_migratable
         */
        if (!vma_migratable(vma) &&
            !(flags & MPOL_MF_STRICT))
                return 1;

        /*
         * Check page nodes, and queue pages to move, in the current vma.
         * But if no moving, and no strict checking, the scan can be skipped.
         */
        if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                return 0;
        return 1;
}

/*
 * Page walk callbacks used by queue_pages_range() when MPOL_MF_WRLOCK is
 * not set; the walk is configured with PGWALK_RDLOCK.
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
        .test_walk      = queue_pages_test_walk,
        .pmd_entry      = queue_folios_pte_range,
        .hugetlb_entry  = queue_folios_hugetlb,
        .walk_lock      = PGWALK_RDLOCK,
};

/*
 * Page walk callbacks used by queue_pages_range() when MPOL_MF_WRLOCK is
 * set; same callbacks as the read-lock variant, but the walk is configured
 * with PGWALK_WRLOCK.
 */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
        .test_walk      = queue_pages_test_walk,
        .pmd_entry      = queue_folios_pte_range,
        .hugetlb_entry  = queue_folios_hugetlb,
        .walk_lock      = PGWALK_WRLOCK,
};

/*
 * Walk the page tables of the given range and collect the pages that are
 * to be migrated.
 *
 * Pages found in the range that are not on the required set of @nodes are,
 * if migration is allowed, isolated and queued to @pagelist.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                nodemask_t *nodes, unsigned long flags,
                struct list_head *pagelist)
{
        const struct mm_walk_ops *ops;
        struct queue_pages qp = {
                .pagelist = pagelist,
                .flags = flags,
                .nmask = nodes,
                .start = start,
                .end = end,
                .first = NULL,
        };
        int err;

        /* MPOL_MF_WRLOCK selects the walk variant using PGWALK_WRLOCK. */
        if (flags & MPOL_MF_WRLOCK)
                ops = &queue_pages_lock_vma_walk_ops;
        else
                ops = &queue_pages_walk_ops;

        err = walk_page_range(mm, start, end, ops, &qp);

        /* No vma was visited at all: whole range in hole */
        if (!qp.first)
                err = -EFAULT;

        if (err)
                return err;
        return qp.nr_failed;
}

/*
 * Apply policy to a single VMA.
 * This must be called with the mmap_lock held for writing.
 *
 * The policy installed in @vma is the one returned by mpol_dup(@pol).  If
 * the vma has a ->set_policy() operation it is called first and may veto
 * the change.
 *
 * Returns 0 on success, or the error from mpol_dup() or ->set_policy().
 */
static int vma_replace_policy(struct vm_area_struct *vma,
                                struct mempolicy *pol)
{
        struct mempolicy *replacement;
        struct mempolicy *previous;
        int err;

        vma_assert_write_locked(vma);

        replacement = mpol_dup(pol);
        if (IS_ERR(replacement))
                return PTR_ERR(replacement);

        if (vma->vm_ops && vma->vm_ops->set_policy) {
                err = vma->vm_ops->set_policy(vma, replacement);
                if (err) {
                        /* Rejected: drop the reference we just obtained. */
                        mpol_put(replacement);
                        return err;
                }
        }

        previous = vma->vm_policy;
        vma->vm_policy = replacement; /* protected by mmap_lock */
        mpol_put(previous);

        return 0;
}

/*
 * Split or merge the VMA (if required) and apply the new policy.
 *
 * The affected part of @vma is its intersection with the range @start..@end.
 * @prev is updated so that the caller can pass it on for the next vma.
 * Returns 0 on success (including when the policy already matches), or a
 * negative error from vma_modify_policy() or vma_replace_policy().
 */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
                struct vm_area_struct **prev, unsigned long start,
                unsigned long end, struct mempolicy *new_pol)
{
        unsigned long vmstart = vma->vm_start;
        unsigned long vmend = min(end, vma->vm_end);

        /* Range begins inside this vma: only its tail part is affected. */
        if (start > vmstart) {
                *prev = vma;
                vmstart = start;
        }

        /* Nothing to do when the vma already carries an equal policy. */
        if (mpol_equal(vma->vm_policy, new_pol)) {
                *prev = vma;
                return 0;
        }

        vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        *prev = vma;
        return vma_replace_policy(vma, new_pol);
}

/*
 * Set the process memory policy.
 *
 * Builds a policy from @mode, @flags and @nodes and installs it as
 * current->mempolicy under task_lock(); the previous policy's reference is
 * dropped afterwards.  For the interleave modes the task's interleave
 * state (il_prev, il_weight) is reset.
 *
 * Returns 0 on success, -ENOMEM if the scratch nodemask is unavailable, or
 * the error from mpol_new() / mpol_set_nodemask().
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                             nodemask_t *nodes)
{
        struct mempolicy *pol, *prev_pol;
        NODEMASK_SCRATCH(scratch);
        int ret;

        if (!scratch)
                return -ENOMEM;

        pol = mpol_new(mode, flags, nodes);
        if (IS_ERR(pol)) {
                ret = PTR_ERR(pol);
                goto free_scratch;
        }

        task_lock(current);
        ret = mpol_set_nodemask(pol, nodes, scratch);
        if (ret) {
                task_unlock(current);
                mpol_put(pol);
                goto free_scratch;
        }

        /* ret is 0 from here on. */
        prev_pol = current->mempolicy;
        current->mempolicy = pol;
        if (pol && (pol->mode == MPOL_INTERLEAVE ||
                    pol->mode == MPOL_WEIGHTED_INTERLEAVE)) {
                current->il_prev = MAX_NUMNODES-1;
                current->il_weight = 0;
        }
        task_unlock(current);
        mpol_put(prev_pol);

free_scratch:
        NODEMASK_SCRATCH_FREE(scratch);
        return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 *
 * @nodes is always cleared first; it stays empty for the default policy and
 * for MPOL_LOCAL, and receives pol->nodes for every other known mode.  An
 * unknown mode hits BUG().
 */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
        nodes_clear(*nodes);
        if (pol == &default_policy)
                return;

        switch (pol->mode) {
        case MPOL_LOCAL:
                /* return empty node mask for local allocation */
                return;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_WEIGHTED_INTERLEAVE:
                *nodes = pol->nodes;
                return;
        default:
                BUG();
        }
}

/*
 * Look up the node id of the page backing @addr.
 *
 * Pins the single page containing @addr with get_user_pages_fast(), reads
 * its node id and releases the page again.  Returns the node id, or the
 * (zero or negative) result of get_user_pages_fast() if no page was pinned.
 * @mm is not used by this function.
 */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *page = NULL;
        int pinned;
        int nid;

        pinned = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &page);
        if (pinned <= 0)
                return pinned;

        nid = page_to_nid(page);
        put_page(page);
        return nid;
}

/*
 * Retrieve NUMA policy
 *
 * @policy: output.  Receives the policy mode ORed with the MPOL_MODE_FLAGS
 *          bits of the policy's flags or, with MPOL_F_NODE, a node id.
 * @nmask:  output nodemask.  May be NULL, except with MPOL_F_MEMS_ALLOWED
 *          where it is written unconditionally.
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set.
 * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR and MPOL_F_MEMS_ALLOWED;
 *          MPOL_F_MEMS_ALLOWED cannot be combined with the other two.
 *
 * Returns 0 on success, -EINVAL for an invalid flag/address combination
 * (including MPOL_F_NODE without MPOL_F_ADDR when the task policy is not an
 * interleave policy), -EFAULT when no vma contains @addr, or the negative
 * result of lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                             unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

        /* Reject any flag bit outside the three supported ones. */
        if (flags &
                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
                return -EINVAL;

        /* Query of the cpuset's allowed memory nodes: no policy involved. */
        if (flags & MPOL_F_MEMS_ALLOWED) {
                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
                        return -EINVAL;
                *policy = 0;        /* just so it's initialized */
                task_lock(current);
                *nmask  = cpuset_current_mems_allowed;
                task_unlock(current);
                return 0;
        }

        if (flags & MPOL_F_ADDR) {
                pgoff_t ilx;                /* ignored here */
                /*
                 * Do NOT fall back to task policy if the
                 * vma/shared policy at addr is NULL.  We
                 * want to return MPOL_DEFAULT in this case.
                 */
                mmap_read_lock(mm);
                vma = vma_lookup(mm, addr);
                if (!vma) {
                        mmap_read_unlock(mm);
                        return -EFAULT;
                }
                /*
                 * From here on a non-NULL "vma" also means that the
                 * mmap_lock is still read-held and must be dropped at "out".
                 */
                pol = __get_vma_policy(vma, addr, &ilx);
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;        /* indicates default behavior */

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        /*
                         * Take a refcount on the mpol, because we are about to
                         * drop the mmap_lock, after which only "pol" remains
                         * valid, "vma" is stale.
                         */
                        pol_refcount = pol;
                        vma = NULL;
                        mpol_get(pol);
                        mmap_read_unlock(mm);
                        /* Report the node of the page backing addr. */
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_INTERLEAVE) {
                        /* Node following il_prev in the task's interleave set. */
                        *policy = next_node_in(current->il_prev, pol->nodes);
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
                        /*
                         * With a non-zero il_weight report il_prev itself,
                         * otherwise the node following it.
                         */
                        if (current->il_weight)
                                *policy = current->il_prev;
                        else
                                *policy = next_node_in(current->il_prev,
                                                       pol->nodes);
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else {
                *policy = pol == &default_policy ? MPOL_DEFAULT :
                                                pol->mode;
                /*
                 * Internal mempolicy flags must be masked off before exposing
                 * the policy to userspace.
                 */
                *policy |= (pol->flags & MPOL_MODE_FLAGS);
        }

        err = 0;
        if (nmask) {
                /*
                 * Prefer the nodemask stored in pol->w.user_nodemask when
                 * mpol_store_user_nodemask() says the policy keeps one.
                 */
                if (mpol_store_user_nodemask(pol)) {
                        *nmask = pol->w.user_nodemask;
                } else {
                        task_lock(current);
                        get_policy_nodemask(pol, nmask);
                        task_unlock(current);
                }
        }

 out:
        mpol_cond_put(pol);
        /* vma is only still set if the mmap_lock was not dropped above. */
        if (vma)
                mmap_read_unlock(mm);
        /* Drop the extra reference taken before lookup_node(). */
        if (pol_refcount)
                mpol_put(pol_refcount);
        return err;
}

#ifdef CONFIG_MIGRATION
/*
 * Try to isolate @folio from the LRU and queue it on @foliolist.
 *
 * Returns false only when isolation was attempted and failed; returns true
 * both when the folio was queued and when it was deliberately left alone
 * because it may be shared and MPOL_MF_MOVE_ALL is not set.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags)
{
        /*
         * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
         * Choosing not to migrate a shared folio is not counted as a failure.
         *
         * See folio_maybe_mapped_shared() on possible imprecision when we
         * cannot easily detect if a folio is shared.
         */
        if (!(flags & MPOL_MF_MOVE_ALL) && folio_maybe_mapped_shared(folio))
                return true;

        /*
         * Non-movable folio may reach here.  And, there may be
         * temporary off LRU folios or non-LRU movable folios.
         * Treat them as unmovable folios since they can't be
         * isolated, so they can't be moved at the moment.
         */
        if (!folio_isolate_lru(folio))
                return false;

        list_add_tail(&folio->lru, foliolist);
        node_stat_mod_folio(folio,
                NR_ISOLATED_ANON + folio_is_file_lru(folio),
                folio_nr_pages(folio));
        return true;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
                            int flags)
{
        nodemask_t nmask;
        struct vm_area_struct *vma;
        LIST_HEAD(pagelist);
        long nr_failed;
        long err = 0;
        struct migration_target_control mtc = {
                .nid = dest,
                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
                .reason = MR_SYSCALL,
        };

        nodes_clear(nmask);
        node_set(source, nmask);

        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

        mmap_read_lock(mm);
        vma = find_vma(mm, 0);
        if (unlikely(!vma)) {
                mmap_read_unlock(mm);
                return 0;
        }

        /*
         * This does not migrate the range, but isolates all pages that
         * need migration.  Between passing in the full user address
         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
         * but passes back the count of pages which could not be isolated.
         */
        nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
                                      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
        mmap_read_unlock(mm);

        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
                if (err)
                        putback_movable_pages(&pagelist);
        }

        if (err >= 0)
                err += nr_failed;
        return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved (capped at INT_MAX),
 * or a negative error code passed up from migrate_to_node().
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        long nr_failed = 0;
        long err = 0;
        nodemask_t tmp;

        lru_cache_disable();

        /*
         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
         * bit in 'tmp', and use that <source, dest> pair for migration.
         * The pair of nodemasks 'to' and 'from' define the map.
         *
         * If no pair of bits is found that way, fall back to picking some
         * pair of 'source' and 'dest' bits that are not the same.  If the
         * 'source' and 'dest' bits are the same, this represents a node
         * that will be migrating to itself, so no pages need move.
         *
         * If no bits are left in 'tmp', or if all remaining bits left
         * in 'tmp' correspond to the same bit in 'to', stop
         * (nothing left to migrate).
         *
         * This lets us pick a pair of nodes to migrate between, such that
         * if possible the dest node is not already occupied by some other
         * source node, minimizing the risk of overloading the memory on a
         * node that would happen if we migrated incoming memory to a node
         * before migrating outgoing memory from that same node.
         *
         * A single scan of tmp is sufficient.  As we go, we remember the
         * most recent <s, d> pair that moved (s != d).  If we find a pair
         * that not only moved, but what's better, moved to an empty slot
         * (d is not set in tmp), then we break out then, with that pair.
         * Otherwise when we finish scanning tmp, we at least have the
         * most recent <s, d> pair that moved.  If we get all the way through
         * the scan of tmp without finding any node that moved, much less
         * moved to an empty node, then there is nothing left worth migrating.
         */

        tmp = *from;
        while (!nodes_empty(tmp)) {
                int s, d;
                int source = NUMA_NO_NODE;
                int dest = 0;

                for_each_node_mask(s, tmp) {

                        /*
                         * do_migrate_pages() tries to maintain the relative
                         * node relationship of the pages established between
                         * threads and memory areas.
                         *
                         * However if the number of source nodes is not equal to
                         * the number of destination nodes we can not preserve
                         * this node relative relationship.  In that case, skip
                         * copying memory from a node that is in the destination
                         * mask.
                         *
                         * Example: [2,3,4] -> [3,4,5] moves everything.
                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
                         */

                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
                                                (node_isset(s, *to)))
                                continue;

                        d = node_remap(s, *from, *to);
                        if (s == d)
                                continue;

                        source = s;        /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                /* The scan found no node that would move: we are done. */
                if (source == NUMA_NO_NODE)
                        break;

                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                /* Positive results accumulate; a negative one aborts. */
                if (err > 0)
                        nr_failed += err;
                if (err < 0)
                        break;
        }

        lru_cache_enable();
        if (err < 0)
                return err;
        /* The return type is int, so clamp the long counter. */
        return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 *
 * @private carries a pointer to a struct migration_mpol holding the policy
 * and the base interleave index; the index actually used is that base plus
 * the source folio's index shifted right by its order.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
                                                    unsigned long private)
{
        struct migration_mpol *mmpol = (struct migration_mpol *)private;
        struct mempolicy *pol = mmpol->pol;
        unsigned int order = folio_order(src);
        pgoff_t ilx = mmpol->ilx + (src->index >> order);
        int nid = numa_node_id();
        gfp_t gfp;

        /* hugetlb folios are allocated through the hugetlb allocator. */
        if (folio_test_hugetlb(src)) {
                struct hstate *h = folio_hstate(src);
                nodemask_t *allowed;

                gfp = htlb_alloc_mask(h);
                allowed = policy_nodemask(gfp, pol, ilx, &nid);
                return alloc_hugetlb_folio_nodemask(h, nid, allowed, gfp,
                                htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
        }

        gfp = folio_test_large(src) ? GFP_TRANSHUGE :
                (GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP);

        return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
#else

/*
 * Stub for builds without CONFIG_MIGRATION: no folio can ever be queued,
 * so every call reports failure.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags)
{
        return false;
}

/* Stub for builds without CONFIG_MIGRATION: always fails with -ENOSYS. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_MIGRATION: never allocates a target folio. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
                                                    unsigned long private)
{
        return NULL;
}
#endif

/*
 * Apply a memory policy to a range of the current task's address space
 * and, if requested through @flags, migrate the folios that do not conform.
 *
 * @start:      start address; must be page aligned.
 * @len:        length in bytes; rounded up with PAGE_ALIGN().
 * @mode:       policy mode passed to mpol_new().
 * @mode_flags: policy mode flags passed to mpol_new().
 * @nmask:      nodemask for the new policy.
 * @flags:      MPOL_MF_* flags.  Bits outside MPOL_MF_VALID are rejected and
 *              MPOL_MF_MOVE_ALL requires CAP_SYS_NICE.
 *
 * Returns 0 on success (also for an empty range), -EINVAL for bad flags, an
 * unaligned @start or a wrapping range, -EPERM for MPOL_MF_MOVE_ALL without
 * the capability, -ENOMEM if the scratch nodemask is unavailable, -EIO when
 * MPOL_MF_STRICT is set and some folios could not be queued or migrated, or
 * an error from mpol_new(), mpol_set_nodemask(), queue_pages_range() or
 * mbind_range().
 */
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct vma_iterator vmi;
        struct migration_mpol mmpol;
        struct mempolicy *new;
        unsigned long end;
        long err;
        long nr_failed;
        LIST_HEAD(pagelist);

        if (flags & ~(unsigned long)MPOL_MF_VALID)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        /* Strict checking is dropped when the default policy is requested. */
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = PAGE_ALIGN(len);
        end = start + len;

        /* Address wrap-around, then the trivial empty range. */
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        new = mpol_new(mode, mode_flags, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        /* Paired with lru_cache_enable() at mpol_out. */
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                lru_cache_disable();
        {
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        /*
                         * On success the mmap_lock stays write-held until the
                         * mmap_write_unlock() further down; on failure it is
                         * released right here.
                         */
                        mmap_write_lock(mm);
                        err = mpol_set_nodemask(new, nmask, scratch);
                        if (err)
                                mmap_write_unlock(mm);
                } else
                        err = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
        }
        if (err)
                goto mpol_out;

        /*
         * Lock the VMAs before scanning for pages to migrate,
         * to ensure we don't miss a concurrently inserted page.
         */
        nr_failed = queue_pages_range(mm, start, end, nmask,
                        flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

        if (nr_failed < 0) {
                /* A negative result is an error code, not a folio count. */
                err = nr_failed;
                nr_failed = 0;
        } else {
                /* Apply the new policy to every vma in the range. */
                vma_iter_init(&vmi, mm, start);
                prev = vma_prev(&vmi);
                for_each_vma_range(vmi, vma, end) {
                        err = mbind_range(&vmi, vma, &prev, start, end, new);
                        if (err)
                                break;
                }
        }

        if (!err && !list_empty(&pagelist)) {
                /* Convert MPOL_DEFAULT's NULL to task or default policy */
                if (!new) {
                        new = get_task_policy(current);
                        mpol_get(new);
                }
                mmpol.pol = new;
                mmpol.ilx = 0;

                /*
                 * In the interleaved case, attempt to allocate on exactly the
                 * targeted nodes, for the first VMA to be migrated; for later
                 * VMAs, the nodes will still be interleaved from the targeted
                 * nodemask, but one by one may be selected differently.
                 */
                if (new->mode == MPOL_INTERLEAVE ||
                    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
                        struct folio *folio;
                        unsigned int order;
                        unsigned long addr = -EFAULT;

                        /* Pick the first queued folio that is not KSM. */
                        list_for_each_entry(folio, &pagelist, lru) {
                                if (!folio_test_ksm(folio))
                                        break;
                        }
                        /* Find a vma in the range in which that folio has an address. */
                        if (!list_entry_is_head(folio, &pagelist, lru)) {
                                vma_iter_init(&vmi, mm, start);
                                for_each_vma_range(vmi, vma, end) {
                                        addr = page_address_in_vma(folio,
                                                folio_page(folio, 0), vma);
                                        if (addr != -EFAULT)
                                                break;
                                }
                        }
                        if (addr != -EFAULT) {
                                order = folio_order(folio);
                                /* We already know the pol, but not the ilx */
                                mpol_cond_put(get_vma_policy(vma, addr, order,
                                                             &mmpol.ilx));
                                /* Set base from which to increment by index */
                                mmpol.ilx -= folio->index >> order;
                        }
                }
        }

        mmap_write_unlock(mm);

        if (!err && !list_empty(&pagelist)) {
                /*
                 * Only zero vs. non-zero of nr_failed is looked at below, so
                 * the result is ORed in rather than added.
                 */
                nr_failed |= migrate_pages(&pagelist,
                                alloc_migration_target_by_mpol, NULL,
                                (unsigned long)&mmpol, MIGRATE_SYNC,
                                MR_MEMPOLICY_MBIND, NULL);
        }

        if (nr_failed && (flags & MPOL_MF_STRICT))
                err = -EIO;
        /* Whatever is still queued is handed back. */
        if (!list_empty(&pagelist))
                putback_movable_pages(&pagelist);
mpol_out:
        mpol_put(new);
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                lru_cache_enable();
        return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
/*
 * Copy a bitmap of @maxnode bits from user space into @mask.
 *
 * Compat syscalls go through compat_get_bitmap(); otherwise whole longs are
 * copied with copy_from_user().  Bits at and above @maxnode in the last long
 * are cleared afterwards.  Returns 0 on success or -EFAULT if the copy failed.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
                      unsigned long maxnode)
{
        unsigned long nlongs = BITS_TO_LONGS(maxnode);
        unsigned long tail_bits = maxnode % BITS_PER_LONG;
        int ret;

        if (in_compat_syscall()) {
                ret = compat_get_bitmap(mask,
                                        (const compat_ulong_t __user *)nmask,
                                        maxnode);
        } else {
                ret = copy_from_user(mask, nmask,
                                     nlongs * sizeof(unsigned long));
        }
        if (ret)
                return -EFAULT;

        /* Mask off what lies beyond maxnode in a partially used last long. */
        if (tail_bits)
                mask[nlongs - 1] &= (1UL << tail_bits) - 1;

        return 0;
}

/*
 * Copy a node mask from user space.
 *
 * @maxnode is decremented by one before use.  @nodes ends up empty when the
 * decremented value is 0 or @nmask is NULL.  Returns 0 on success, -EINVAL
 * when the mask is larger than PAGE_SIZE*BITS_PER_BYTE bits or has a bit set
 * beyond MAX_NUMNODES, and -EFAULT when reading user memory fails.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        maxnode -= 1;
        nodes_clear(*nodes);
        if (!nmask || maxnode == 0)
                return 0;
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        /*
         * When the user specified more nodes than supported just check
         * if the non supported part is all zero, one word at a time,
         * starting at the end.
         */
        while (maxnode > MAX_NUMNODES) {
                unsigned long chunk = min_t(unsigned long, maxnode, BITS_PER_LONG);
                unsigned long word_idx = (maxnode - 1) / BITS_PER_LONG;
                unsigned long word;

                if (get_bitmap(&word, nmask + word_idx, chunk))
                        return -EFAULT;

                if (maxnode - chunk < MAX_NUMNODES) {
                        /* Word straddles the limit: ignore its supported bits. */
                        maxnode = MAX_NUMNODES;
                        word &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
                } else {
                        maxnode -= chunk;
                }
                if (word)
                        return -EINVAL;
        }

        return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/*
 * Copy a kernel node mask to user space.
 *
 * The user buffer size is derived from @maxnode - 1 rounded up to a multiple
 * of 64 bits.  If that is more than the kernel needs for nr_node_ids bits,
 * the excess is zeroed with clear_user() and only the kernel-sized part is
 * copied.  Returns 0 on success, -EINVAL when such an oversized buffer
 * exceeds PAGE_SIZE bytes, -EFAULT when clearing or copying fails, or the
 * result of compat_put_bitmap() for compat syscalls.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        bool compat = in_compat_syscall();
        unsigned long user_bytes = ALIGN(maxnode-1, 64) / 8;
        unsigned int kern_bytes;

        if (compat)
                kern_bytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
        else
                kern_bytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

        if (user_bytes > kern_bytes) {
                if (user_bytes > PAGE_SIZE)
                        return -EINVAL;
                /* Zero the part of the user buffer the kernel has no bits for. */
                if (clear_user((char __user *)mask + kern_bytes,
                               user_bytes - kern_bytes))
                        return -EFAULT;
                user_bytes = kern_bytes;
                maxnode = nr_node_ids;
        }

        if (compat)
                return compat_put_bitmap((compat_ulong_t __user *)mask,
                                         nodes_addr(*nodes), maxnode);

        if (copy_to_user(mask, nodes_addr(*nodes), user_bytes))
                return -EFAULT;
        return 0;
}

/*
 * Basic parameter sanity check used by both mbind() and set_mempolicy().
 *
 * Splits the combined value in *@mode into the mode proper (left in *@mode)
 * and the MPOL_MODE_FLAGS bits (stored in *@flags).  Returns -EINVAL for a
 * mode of MPOL_MAX or above, for MPOL_F_STATIC_NODES combined with
 * MPOL_F_RELATIVE_NODES, and for MPOL_F_NUMA_BALANCING with a mode other
 * than MPOL_BIND or MPOL_PREFERRED_MANY; otherwise returns 0.
 */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
        *flags = *mode & MPOL_MODE_FLAGS;
        *mode &= ~MPOL_MODE_FLAGS;

        if ((unsigned int)(*mode) >= MPOL_MAX)
                return -EINVAL;

        if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
                return -EINVAL;

        if (!(*flags & MPOL_F_NUMA_BALANCING))
                return 0;

        if (*mode != MPOL_BIND && *mode != MPOL_PREFERRED_MANY)
                return -EINVAL;

        /* NUMA balancing accepted: add the MOF and MORON flags as well. */
        *flags |= (MPOL_F_MOF | MPOL_F_MORON);
        return 0;
}

/*
 * Common implementation of mbind(): untag the start address, validate
 * the mode/flags word, fetch the user node mask and hand everything to
 * do_mbind().  Returns the first error encountered, else do_mbind()'s
 * result.
 */
static long kernel_mbind(unsigned long start, unsigned long len,
                         unsigned long mode, const unsigned long __user *nmask,
                         unsigned long maxnode, unsigned int flags)
{
        nodemask_t nodes;
        unsigned short mode_flags;
        int lmode = mode;
        int err;

        start = untagged_addr(start);

        err = sanitize_mpol_flags(&lmode, &mode_flags);
        if (!err)
                err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;

        return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

/*
 * set_mempolicy_home_node() system call.
 *
 * @start:     start of the address range; must be page aligned
 * @len:       length of the range; rounded up to a page multiple
 * @home_node: node to record as home node; must be an online node
 * @flags:     reserved for future extension, must be 0
 *
 * For every VMA in [start, start + len) that carries a policy, the policy
 * is duplicated, its home_node is set to @home_node and the copy is
 * applied through mbind_range().  VMAs without a policy are skipped.
 *
 * Returns 0 on success (or for an empty range), -EINVAL for bad
 * arguments, -EOPNOTSUPP if a VMA policy is neither MPOL_BIND nor
 * MPOL_PREFERRED_MANY, -ENOENT if no VMA in the range carried a policy,
 * or the error reported by mpol_dup()/mbind_range().
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
                unsigned long, home_node, unsigned long, flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct mempolicy *new, *old;
        unsigned long end;
        int err = -ENOENT;

        start = untagged_addr(start);
        if (start & ~PAGE_MASK)
                return -EINVAL;
        /*
         * flags is used for future extension if any.
         */
        if (flags != 0)
                return -EINVAL;

        /*
         * Check home_node is online to avoid accessing uninitialized
         * NODE_DATA.
         */
        if (home_node >= MAX_NUMNODES || !node_online(home_node))
                return -EINVAL;

        len = PAGE_ALIGN(len);
        end = start + len;

        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        /*
         * Initialise the iterator only after @start has been untagged and
         * validated, so that the VMA walk begins at the same address that
         * is used for the range checks and for mbind_range().
         */
        VMA_ITERATOR(vmi, mm, start);

        mmap_write_lock(mm);
        prev = vma_prev(&vmi);
        for_each_vma_range(vmi, vma, end) {
                /*
                 * If any vma in the range got policy other than MPOL_BIND
                 * or MPOL_PREFERRED_MANY we return error. We don't reset
                 * the home node for vmas we already updated before.
                 */
                old = vma_policy(vma);
                if (!old) {
                        prev = vma;
                        continue;
                }
                if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
                        err = -EOPNOTSUPP;
                        break;
                }
                new = mpol_dup(old);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        break;
                }

                vma_start_write(vma);
                new->home_node = home_node;
                err = mbind_range(&vmi, vma, &prev, start, end, new);
                /* Drop the reference obtained from mpol_dup(). */
                mpol_put(new);
                if (err)
                        break;
        }
        mmap_write_unlock(mm);
        return err;
}

/*
 * mbind() system call entry point; the arguments are passed unchanged
 * to kernel_mbind(), whose result is returned.
 */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
                unsigned long, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode, unsigned int, flags)
{
        long ret;

        ret = kernel_mbind(start, len, mode, nmask, maxnode, flags);
        return ret;
}

/*
 * Set the process memory policy.
 *
 * Validates the mode/flags word, reads the user node mask and passes the
 * result to do_set_mempolicy().  Returns the first error encountered,
 * else do_set_mempolicy()'s result.
 */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
                                 unsigned long maxnode)
{
        nodemask_t nodes;
        unsigned short mode_flags;
        int lmode = mode;
        int err;

        err = sanitize_mpol_flags(&lmode, &mode_flags);
        if (!err)
                err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;

        return do_set_mempolicy(lmode, mode_flags, &nodes);
}

/*
 * set_mempolicy() system call entry point; forwards its arguments to
 * kernel_set_mempolicy() and returns that function's result.
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode)
{
        long ret;

        ret = kernel_set_mempolicy(mode, nmask, maxnode);
        return ret;
}

/*
 * Common implementation of the migrate_pages() system call.
 *
 * @pid:       target task, looked up with find_task_by_vpid(); 0 selects
 *             the calling task
 * @maxnode:   size argument passed to get_nodes() for both user masks
 * @old_nodes: user node mask read into the "old" scratch mask
 * @new_nodes: user node mask read into the "new" scratch mask
 *
 * Returns the result of do_migrate_pages(), or a negative errno:
 * -ENOMEM if the scratch nodemasks are unavailable, -ESRCH if no task
 * matches @pid, -EPERM if a permission check fails, -EINVAL if the
 * destination mask is empty after being restricted or the task has no
 * mm, or the error reported by get_nodes()/security_task_movememory().
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
                                const unsigned long __user *old_nodes,
                                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        nodemask_t task_nodes;
        int err;
        nodemask_t *old;
        nodemask_t *new;
        NODEMASK_SCRATCH(scratch);

        if (!scratch)
                return -ENOMEM;

        old = &scratch->mask1;
        new = &scratch->mask2;

        err = get_nodes(old, old_nodes, maxnode);
        if (err)
                goto out;

        err = get_nodes(new, new_nodes, maxnode);
        if (err)
                goto out;

        /* Find the mm_struct */
        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                err = -ESRCH;
                goto out;
        }
        /* Hold a task reference; every exit from here on must drop it. */
        get_task_struct(task);

        /* Default error for the empty-destination-mask exit below. */
        err = -EINVAL;

        /*
         * Check if this process has the right to modify the specified process.
         * Use the regular "ptrace_may_access()" checks.
         */
        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                err = -EPERM;
                goto out_put;
        }
        rcu_read_unlock();

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out_put;
        }

        /* Restrict the destination mask to the caller's own allowed nodes. */
        task_nodes = cpuset_mems_allowed(current);
        nodes_and(*new, *new, task_nodes);
        if (nodes_empty(*new))
                goto out_put;

        err = security_task_movememory(task);
        if (err)
                goto out_put;

        /* The task reference is no longer needed once get_task_mm() was called. */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                err = -EINVAL;
                goto out;
        }

        err = do_migrate_pages(mm, old, new,
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

        mmput(mm);
out:
        NODEMASK_SCRATCH_FREE(scratch);

        return err;

out_put:
        /* Error exit taken while the task reference is still held. */
        put_task_struct(task);
        goto out;
}

/*
 * migrate_pages() system call entry point; forwards its arguments to
 * kernel_migrate_pages() and returns that function's result.
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                const unsigned long __user *, old_nodes,
                const unsigned long __user *, new_nodes)
{
        long ret;

        ret = kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
        return ret;
}

/*
 * Retrieve NUMA policy.
 *
 * @policy:  optional user pointer receiving the policy value
 * @nmask:   optional user bitmap receiving the node mask
 * @maxnode: size of @nmask; must be at least nr_node_ids when @nmask is set
 * @addr:    address argument for do_get_mempolicy(), untagged first
 * @flags:   flags argument for do_get_mempolicy()
 *
 * Returns 0 on success, -EINVAL if @nmask is too small, -EFAULT if
 * storing the policy value faults, or the error reported by
 * do_get_mempolicy()/copy_nodes_to_user().
 */
static int kernel_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr,
                                unsigned long flags)
{
        nodemask_t nodes;
        int pval;
        int err;

        if (nmask && maxnode < nr_node_ids)
                return -EINVAL;

        addr = untagged_addr(addr);

        err = do_get_mempolicy(&pval, &nodes, addr, flags);
        if (err)
                return err;

        if (policy) {
                if (put_user(pval, policy))
                        return -EFAULT;
        }

        if (!nmask)
                return 0;

        return copy_nodes_to_user(nmask, maxnode, &nodes);
}

/*
 * get_mempolicy() system call entry point; forwards its arguments to
 * kernel_get_mempolicy() and returns that function's result.
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        long ret;

        ret = kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
        return ret;
}

/*
 * Report whether pages of @vma may be migrated.
 *
 * Returns false for VM_IO/VM_PFNMAP mappings, DAX mappings, hugetlb
 * mappings whose hstate does not support migration, and file mappings
 * whose mapping gfp mask selects a zone below policy_zone; true otherwise.
 */
bool vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                return false;

        /*
         * DAX device mappings require predictable access latency, so
         * periodic faults are to be avoided for them.
         */
        if (vma_is_dax(vma))
                return false;

        if (is_vm_hugetlb_page(vma)) {
                if (!hugepage_migration_supported(hstate_vma(vma)))
                        return false;
        }

        /* Without a backing file the zone restriction below does not apply. */
        if (!vma->vm_file)
                return true;

        /*
         * Migration allocates pages in the highest zone.  If the mapping
         * cannot allocate there, migration (at least from node to node)
         * is not possible.
         */
        return gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) >=
                policy_zone;
}

/*
 * Look up the policy of @vma at @addr without any fallback.
 *
 * *@ilx is reset to 0 first.  If the VMA provides a get_policy() vm_op,
 * its result is returned (the op may also update *@ilx); otherwise
 * vma->vm_policy is returned.  The result may be NULL.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                                   unsigned long addr, pgoff_t *ilx)
{
        *ilx = 0;

        if (vma->vm_ops && vma->vm_ops->get_policy)
                return vma->vm_ops->get_policy(vma, addr, ilx);

        return vma->vm_policy;
}

/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns the effective policy for a VMA at the specified address: the
 * VMA's own policy if it has one, otherwise the policy obtained from
 * get_task_policy(current).  For the two interleave modes, *@ilx is
 * advanced by the VMA's page offset and by the offset of @addr within
 * the VMA, both scaled by @order.
 *
 * Shared policies [those marked as MPOL_F_SHARED] require an extra
 * reference count--added by the get_policy() vm_op, as appropriate--to
 * protect against freeing by another task.  It is the caller's
 * responsibility to free the extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                 unsigned long addr, int order, pgoff_t *ilx)
{
        struct mempolicy *pol = __get_vma_policy(vma, addr, ilx);

        if (!pol)
                pol = get_task_policy(current);

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                *ilx += (vma->vm_pgoff >> order) +
                        ((addr - vma->vm_start) >> (PAGE_SHIFT + order));
                break;
        default:
                break;
        }

        return pol;
}

/*
 * Report whether the policy that applies to @vma has MPOL_F_MOF set.
 *
 * If the VMA has a get_policy() vm_op, the policy at vma->vm_start is
 * queried and released again with mpol_cond_put().  Otherwise
 * vma->vm_policy is used, falling back to get_task_policy(current).
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
        struct mempolicy *pol;
        pgoff_t ilx;                /* required by get_policy(), unused */
        bool mof;

        if (!vma->vm_ops || !vma->vm_ops->get_policy) {
                pol = vma->vm_policy;
                if (!pol)
                        pol = get_task_policy(current);

                return pol->flags & MPOL_F_MOF;
        }

        pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
        mof = pol && (pol->flags & MPOL_F_MOF);
        mpol_cond_put(pol);

        return mof;
}

bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
        enum zone_type dynamic_policy_zone = policy_zone;

        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

        /*
         * if policy->nodes has movable memory only,
         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         *
         * policy->nodes is intersect with node_states[N_MEMORY].
         * so if the following test fails, it implies
         * policy->nodes has movable memory only.
         */
        if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
                dynamic_policy_zone = ZONE_MOVABLE;

        return zone >= dynamic_policy_zone;
}

/*
 * Pick the next node for weighted interleaving of the current task.
 *
 * While current->il_weight is non-zero and current->il_prev is still in
 * policy->nodes, the same node is reused.  Otherwise the task advances
 * to the next node in the mask and loads that node's weight.  One unit
 * of weight is consumed per call.  Returns MAX_NUMNODES, without
 * touching the task state, if no next node is found.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
        unsigned int cookie;
        unsigned int node;

        for (;;) {
                /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
                cookie = read_mems_allowed_begin();
                node = current->il_prev;

                /* Still weight left on a node that remains valid: reuse it. */
                if (current->il_weight && node_isset(node, policy->nodes))
                        break;

                node = next_node_in(node, policy->nodes);
                if (read_mems_allowed_retry(cookie))
                        continue;
                if (node == MAX_NUMNODES)
                        return node;

                current->il_prev = node;
                current->il_weight = get_il_weight(node);
                break;
        }

        current->il_weight--;
        return node;
}

/*
 * Do dynamic interleaving for a process: return the node following
 * current->il_prev in policy->nodes and, if it is a valid node id,
 * remember it in current->il_prev.
 */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
        unsigned int cookie;
        unsigned int nid;

        /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
        for (;;) {
                cookie = read_mems_allowed_begin();
                nid = next_node_in(current->il_prev, policy->nodes);
                if (!read_mems_allowed_retry(cookie))
                        break;
        }

        if (nid < MAX_NUMNODES)
                current->il_prev = nid;

        return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
        struct mempolicy *policy;
        int node = numa_mem_id();

        if (!in_task())
                return node;

        policy = current->mempolicy;
        if (!policy)
                return node;

        switch (policy->mode) {
        case MPOL_PREFERRED:
                return first_node(policy->nodes);

        case MPOL_INTERLEAVE:
                return interleave_nodes(policy);

        case MPOL_WEIGHTED_INTERLEAVE:
                return weighted_interleave_nodes(policy);

        case MPOL_BIND:
        case MPOL_PREFERRED_MANY:
        {
                struct zoneref *z;

                /*
                 * Follow bind policy behavior and start allocation at the
                 * first node.
                 */
                struct zonelist *zonelist;
                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
                zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
                z = first_zones_zonelist(zonelist, highest_zoneidx,
                                                        &policy->nodes);
                return zonelist_zone(z) ? zonelist_node_idx(z) : node;
        }
        case MPOL_LOCAL:
                return node;

        default:
                BUG();
        }
}

/*
 * Snapshot pol->nodes into @mask and return the number of nodes set in
 * the snapshot.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
                                              nodemask_t *mask)
{
        unsigned int weight;

        /*
         * The barriers stabilize the nodemask locally so that the copy can
         * be iterated over safely without concern for changes.  Allocators
         * validate that node selection does not violate mems_allowed, so
         * this is safe.
         */
        barrier();
        memcpy(mask, &pol->nodes, sizeof(*mask));
        barrier();

        weight = nodes_weight(*mask);
        return weight;
}

/*
 * Do static weighted interleaving for interleave index @ilx.
 *
 * @ilx is reduced modulo the total weight of the nodes in a snapshot of
 * pol->nodes; the nodes are then walked in mask order, subtracting each
 * node's weight, until the remainder falls inside a node's share.
 * Returns numa_node_id() if the snapshot is empty.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
        struct weighted_interleave_state *state;
        unsigned int weight_total = 0;
        unsigned int target, nr_nodes;
        nodemask_t nodemask;
        u8 *table;
        u8 weight;
        int nid = 0;

        nr_nodes = read_once_policy_nodemask(pol, &nodemask);
        if (!nr_nodes)
                return numa_node_id();

        rcu_read_lock();

        /* Uninitialized wi_state means we should assume all weights are 1 */
        state = rcu_dereference(wi_state);
        table = state ? state->iw_table : NULL;

        /* calculate the total weight */
        for_each_node_mask(nid, nodemask)
                weight_total += table ? table[nid] : 1;

        /* Calculate the node offset based on totals */
        target = ilx % weight_total;
        for (nid = first_node(nodemask); target;
             nid = next_node_in(nid, nodemask)) {
                /* detect system default usage */
                weight = table ? table[nid] : 1;
                if (target < weight)
                        break;
                target -= weight;
        }

        rcu_read_unlock();
        return nid;
}

/*
 * Do static interleaving for interleave index @ilx.  Returns the ilx'th
 * node in a snapshot of pol->nodes (starting from ilx=0), wrapping
 * around if ilx exceeds the number of nodes in the snapshot.  Returns
 * numa_node_id() if the snapshot is empty.
 */
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
        unsigned int steps, nnodes;
        nodemask_t nodemask;
        int nid;

        nnodes = read_once_policy_nodemask(pol, &nodemask);
        if (!nnodes)
                return numa_node_id();

        /* Advance (ilx mod nnodes) positions from the first node. */
        nid = first_node(nodemask);
        for (steps = ilx % nnodes; steps; steps--)
                nid = next_node(nid, nodemask);

        return nid;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation, together with preferred node id (or the input node id).
 *
 * The return value is NULL unless the policy is MPOL_PREFERRED_MANY, or
 * MPOL_BIND with a nodemask that applies to the requested zone and is
 * valid for the cpuset.  *@nid is left untouched unless the policy mode
 * overrides it.
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                                   pgoff_t ilx, int *nid)
{
        nodemask_t *nodemask = NULL;

        switch (pol->mode) {
        case MPOL_PREFERRED:
                /* Override input node id */
                *nid = first_node(pol->nodes);
                break;

        case MPOL_PREFERRED_MANY:
                nodemask = &pol->nodes;
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
                break;

        case MPOL_BIND:
                /* Restrict to nodemask (but not on lower zones) */
                if (apply_policy_zone(pol, gfp_zone(gfp))) {
                        if (cpuset_nodemask_valid_mems_allowed(&pol->nodes))
                                nodemask = &pol->nodes;
                }
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
                /*
                 * __GFP_THISNODE shouldn't even be used with the bind policy
                 * because we might easily break the expectation to stay on the
                 * requested node and not break the policy.
                 */
                WARN_ON_ONCE(gfp & __GFP_THISNODE);
                break;

        case MPOL_INTERLEAVE:
                /* Override input node id */
                if (ilx == NO_INTERLEAVE_INDEX)
                        *nid = interleave_nodes(pol);
                else
                        *nid = interleave_nid(pol, ilx);
                break;

        case MPOL_WEIGHTED_INTERLEAVE:
                /* Override input node id */
                if (ilx == NO_INTERLEAVE_INDEX)
                        *nid = weighted_interleave_nodes(pol);
                else
                        *nid = weighted_interleave_nid(pol, ilx);
                break;
        }

        return nodemask;
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
 * to the mempolicy's @nodemask for filtering the zonelist.
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
                struct mempolicy **mpol, nodemask_t **nodemask)
{
        int nid = numa_node_id();
        struct mempolicy *pol;
        pgoff_t ilx;

        pol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
        *mpol = pol;
        *nodemask = policy_nodemask(gfp_flags, pol, ilx, &nid);

        return nid;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If @mask is NULL or the current task's mempolicy is "default" [NULL],
 * return 'false'.  Otherwise fill the caller-supplied @mask and return
 * 'true' to indicate presence of a non-default mempolicy:
 *  - for 'preferred', 'prefer-many', 'bind', 'interleave' and
 *    'weighted interleave' policies, @mask receives the policy nodemask;
 *  - for 'local' policy, @mask is initialized to contain only
 *    numa_node_id().
 *
 * The mempolicy is not reference counted [mpol_get/put] here: the
 * current task is examining its own mempolicy, and does so under
 * task_lock().
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
        struct mempolicy *mempolicy;

        if (!mask || !current->mempolicy)
                return false;

        task_lock(current);
        mempolicy = current->mempolicy;
        switch (mempolicy->mode) {
        case MPOL_LOCAL:
                init_nodemask_of_node(mask, numa_node_id());
                break;

        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                *mask = mempolicy->nodes;
                break;

        default:
                BUG();
        }
        task_unlock(current);

        return true;
}
#endif

/*
 * mempolicy_in_oom_domain
 *
 * If tsk's mempolicy is "bind", check for intersection between mask and
 * the policy nodemask. Otherwise, return true for all other policies
 * including "interleave", as a tsk with "interleave" policy may have
 * memory allocated from all nodes in system.  A NULL @mask also yields
 * true.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_in_oom_domain(struct task_struct *tsk,
                                        const nodemask_t *mask)
{
        struct mempolicy *mempolicy;
        bool in_domain;

        if (!mask)
                return true;

        task_lock(tsk);
        mempolicy = tsk->mempolicy;
        in_domain = !mempolicy || mempolicy->mode != MPOL_BIND ||
                    nodes_intersects(mempolicy->nodes, *mask);
        task_unlock(tsk);

        return in_domain;
}

/*
 * Allocate for an MPOL_PREFERRED_MANY policy in two passes: first from
 * @nodemask only, then, if that fails, with no nodemask restriction.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
                                                int nid, nodemask_t *nodemask)
{
        /*
         * The first pass only tries the preferred nodes, skips direct
         * reclaim and is allowed to fail quietly.
         */
        gfp_t preferred_gfp = (gfp | __GFP_NOWARN) &
                              ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
        struct page *page;

        page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
        if (page)
                return page;

        /* The second pass tries all the nodes with the caller's flags. */
        return __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
}

/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @pol may override it).
 *
 * Return: The page on success or NULL if allocation fails.
 */
static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
                struct mempolicy *pol, pgoff_t ilx, int nid)
{
        nodemask_t *nodemask;
        struct page *page;

        /* May replace @nid with the node selected by the policy. */
        nodemask = policy_nodemask(gfp, pol, ilx, &nid);

        if (pol->mode == MPOL_PREFERRED_MANY)
                return alloc_pages_preferred_many(gfp, order, nid, nodemask);

        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
            /* filter "hugepage" allocation, unless from alloc_pages() */
            order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
                /*
                 * For hugepage allocation and non-interleave policy which
                 * allows the current node (or other explicitly preferred
                 * node) we only try to allocate from the current/preferred
                 * node and don't fall back to other nodes, as the cost of
                 * remote accesses would likely offset THP benefits.
                 *
                 * If the policy is interleave or does not allow the current
                 * node in its nodemask, we allocate the standard way.
                 */
                if (pol->mode != MPOL_INTERLEAVE &&
                    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
                    (!nodemask || node_isset(nid, *nodemask))) {
                        /*
                         * First, try to allocate THP only on local node, but
                         * don't reclaim unnecessarily, just compact.
                         */
                        page = __alloc_frozen_pages_noprof(
                                gfp | __GFP_THISNODE | __GFP_NORETRY, order,
                                nid, NULL);
                        if (page || !(gfp & __GFP_DIRECT_RECLAIM))
                                return page;
                        /*
                         * If hugepage allocations are configured to always
                         * synchronous compact or the vma has been madvised
                         * to prefer hugepage backing, retry allowing remote
                         * memory with both reclaim and compact as well.
                         */
                }
        }

        /* Regular attempt (also the retry path of the THP case above). */
        page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);

        /* Account an interleave hit when the page came from the chosen node. */
        if (unlikely(pol->mode == MPOL_INTERLEAVE ||
                     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
                /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
                if (static_branch_likely(&vm_numa_stat_key) &&
                    page_to_nid(page) == nid) {
                        preempt_disable();
                        __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
                        preempt_enable();
                }
        }

        return page;
}

/*
 * Allocate a folio of the given @order according to @pol.
 *
 * The pages are requested from alloc_pages_mpol() with __GFP_COMP added;
 * on success the page is given its reference count and converted with
 * page_rmappable_folio().  Returns NULL if the allocation fails.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *pol, pgoff_t ilx, int nid)
{
        struct page *page;

        page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid);
        if (page) {
                set_page_refcounted(page);
                return page_rmappable_folio(page);
        }

        return NULL;
}

/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA.
 * @addr: Virtual address of the allocation.  Must be inside @vma.
 *
 * Allocate a folio for a specific address in @vma, using the appropriate
 * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
 * VMA to prevent it from going away.  Should be used for all allocations
 * for folios that will be mapped into user space, excepting hugetlbfs, and
 * excepting where direct use of folio_alloc_mpol() is more appropriate.
 *
 * For VM_DROPPABLE mappings __GFP_NOWARN is added to @gfp.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr)
{
        gfp_t alloc_gfp = gfp;
        struct mempolicy *pol;
        struct folio *folio;
        pgoff_t ilx;

        if (vma->vm_flags & VM_DROPPABLE)
                alloc_gfp |= __GFP_NOWARN;

        pol = get_vma_policy(vma, addr, order, &ilx);
        folio = folio_alloc_mpol_noprof(alloc_gfp, order, pol, ilx,
                                        numa_node_id());
        /* Release the policy reference where one was taken. */
        mpol_cond_put(pol);

        return folio;
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);

/*
 * Allocate pages through alloc_pages_mpol() without an interleave index.
 *
 * The policy from get_task_policy(current) is used, except in interrupt
 * context or when __GFP_THISNODE is requested, where the system
 * default_policy is used instead.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol;

        /*
         * No reference counting needed for current->mempolicy
         * nor system default_policy
         */
        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
                                numa_node_id());
}

/**
 * alloc_pages - Allocate pages.
 * @gfp: GFP flags.
 * @order: Power of two of number of pages to allocate.
 *
 * Allocate 1 << @order contiguous pages.  The physical address of the
 * first page is naturally aligned (eg an order-3 allocation will be aligned
 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
 * process is honoured when in process context.
 *
 * Context: Can be called from any context, providing the appropriate GFP
 * flags are used.
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
        struct page *page;

        page = alloc_frozen_pages_noprof(gfp, order);
        if (!page)
                return NULL;

        /* The frozen allocation carries no reference yet; add it here. */
        set_page_refcounted(page);
        return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);

/*
 * Allocate a folio of the given @order: alloc_pages_noprof() is called
 * with __GFP_COMP added and its result is passed to
 * page_rmappable_folio().
 */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
        struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);

        return page_rmappable_folio(page);
}
EXPORT_SYMBOL(folio_alloc_noprof);

/*
 * Bulk-allocate up to @nr_pages pages for an MPOL_INTERLEAVE policy.
 *
 * @gfp:        GFP flags for the allocations
 * @pol:        the interleave policy whose nodes are used
 * @nr_pages:   number of pages requested
 * @page_array: array that receives the allocated pages
 *
 * The request is split evenly over the nodes set in pol->nodes; the
 * remainder is handed out one extra page per node to the first nodes
 * visited.  Nodes are chosen with interleave_nodes(), once per share.
 *
 * Returns the total number of pages allocated, which may be less than
 * @nr_pages.  Returns 0 if pol->nodes is empty.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        int nodes;
        unsigned long nr_pages_per_node;
        int delta;
        int i;
        unsigned long nr_allocated;
        unsigned long total_allocated = 0;

        nodes = nodes_weight(pol->nodes);
        /*
         * With an empty nodemask there is nothing to interleave over, and
         * the division below would be a division by zero.
         */
        if (!nodes)
                return 0;

        nr_pages_per_node = nr_pages / nodes;
        delta = nr_pages - nodes * nr_pages_per_node;

        for (i = 0; i < nodes; i++) {
                unsigned long share = nr_pages_per_node;

                /* Spread the remainder, one page at a time. */
                if (delta) {
                        share++;
                        delta--;
                }

                nr_allocated = alloc_pages_bulk_noprof(gfp,
                                interleave_nodes(pol), NULL,
                                share, page_array);

                page_array += nr_allocated;
                total_allocated += nr_allocated;
        }

        return total_allocated;
}

/*
 * Bulk-allocate up to @nr_pages pages for an MPOL_WEIGHTED_INTERLEAVE
 * policy.
 *
 * @gfp:        GFP flags for the page allocations
 * @pol:        the weighted interleave policy whose nodes are used
 * @nr_pages:   number of pages requested
 * @page_array: array that receives the allocated pages
 *
 * The function first uses up whatever weight the task still has on its
 * current interleave node (current->il_prev / current->il_weight), then
 * distributes the remaining pages over the policy nodes in proportion to
 * their weights.  On return the task's il_prev/il_weight describe where
 * the next weighted interleave allocation should resume.
 *
 * Returns the total number of pages allocated, which may be less than
 * @nr_pages (0 if @nr_pages is 0 or the nodemask snapshot is empty).
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        struct weighted_interleave_state *state;
        struct task_struct *me = current;
        unsigned int cpuset_mems_cookie;
        unsigned long total_allocated = 0;
        unsigned long nr_allocated = 0;
        unsigned long rounds;
        unsigned long node_pages, delta;
        u8 *weights, weight;
        unsigned int weight_total = 0;
        unsigned long rem_pages = nr_pages;
        nodemask_t nodes;
        int nnodes, node;
        int resume_node = MAX_NUMNODES - 1;
        u8 resume_weight = 0;
        int prev_node;
        int i;

        if (!nr_pages)
                return 0;

        /* read the nodes onto the stack, retry if done during rebind */
        do {
                cpuset_mems_cookie = read_mems_allowed_begin();
                nnodes = read_once_policy_nodemask(pol, &nodes);
        } while (read_mems_allowed_retry(cpuset_mems_cookie));

        /* if the nodemask has become invalid, we cannot do anything */
        if (!nnodes)
                return 0;

        /* Continue allocating from most recent node and adjust the nr_pages */
        node = me->il_prev;
        weight = me->il_weight;
        if (weight && node_isset(node, nodes)) {
                node_pages = min(rem_pages, weight);
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                /* if that's all the pages, no need to interleave */
                if (rem_pages <= weight) {
                        me->il_weight -= rem_pages;
                        return total_allocated;
                }
                /* Otherwise we adjust remaining pages, continue from there */
                rem_pages -= weight;
        }
        /* clear active weight in case of an allocation failure */
        me->il_weight = 0;
        prev_node = node;

        /* create a local copy of node weights to operate on outside rcu */
        /* NOTE(review): this uses GFP_KERNEL regardless of @gfp - confirm intended */
        weights = kzalloc(nr_node_ids, GFP_KERNEL);
        if (!weights)
                return total_allocated;

        rcu_read_lock();
        state = rcu_dereference(wi_state);
        if (state) {
                memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
                rcu_read_unlock();
        } else {
                rcu_read_unlock();
                /* No weight table installed: every node gets weight 1. */
                for (i = 0; i < nr_node_ids; i++)
                        weights[i] = 1;
        }

        /* calculate total, detect system default usage */
        for_each_node_mask(node, nodes)
                weight_total += weights[node];

        /*
         * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
         * Track which node weighted interleave should resume from.
         *
         * if (rounds > 0) and (delta == 0), resume_node will always be
         * the node following prev_node and its weight.
         */
        /* NOTE(review): assumes weight_total is non-zero here - confirm weights are never 0 */
        rounds = rem_pages / weight_total;
        delta = rem_pages % weight_total;
        resume_node = next_node_in(prev_node, nodes);
        resume_weight = weights[resume_node];
        for (i = 0; i < nnodes; i++) {
                node = next_node_in(prev_node, nodes);
                weight = weights[node];
                /* Full rounds give every node its whole weight each time. */
                node_pages = weight * rounds;
                /* If a delta exists, add this node's portion of the delta */
                if (delta > weight) {
                        node_pages += weight;
                        delta -= weight;
                } else if (delta) {
                        /* when delta is depleted, resume from that node */
                        node_pages += delta;
                        resume_node = node;
                        resume_weight = weight - delta;
                        delta = 0;
                }
                /* node_pages can be 0 if an allocation fails and rounds == 0 */
                if (!node_pages)
                        break;
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                if (total_allocated == nr_pages)
                        break;
                prev_node = node;
        }
        /* Publish the resume point for the task's next interleaved allocation. */
        me->il_prev = resume_node;
        me->il_weight = resume_weight;
        kfree(weights);
        return total_allocated;
}

/*
 * Bulk allocation for MPOL_PREFERRED_MANY.
 *
 * A first pass is restricted to the policy's nodemask and is made with
 * direct reclaim and __GFP_NOFAIL masked off (and warnings suppressed).
 * Whatever is still missing afterwards is requested from the local node
 * with the caller's original gfp mask and no nodemask restriction.
 *
 * Returns the number of pages stored in @page_array.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        unsigned long got;
        gfp_t first_pass_gfp;

        first_pass_gfp = gfp | __GFP_NOWARN;
        first_pass_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

        got = alloc_pages_bulk_noprof(first_pass_gfp, nid, &pol->nodes,
                                      nr_pages, page_array);
        if (got >= nr_pages)
                return got;

        /* Second pass: fill the remainder without the nodemask. */
        return got + alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
                                             nr_pages - got,
                                             page_array + got);
}

/*
 * Bulk page allocation that honours the current task's mempolicy.
 *
 * Some callers, vmalloc for instance, need bulk allocation and mempolicy
 * to be considered together.  Doing so speeds up allocation, in particular
 * for the interleaving policies.
 *
 * The default policy is used when called from interrupt context or when
 * __GFP_THISNODE is set.  Returns the number of pages placed in @page_array.
 */
unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
                unsigned long nr_pages, struct page **page_array)
{
        struct mempolicy *pol;
        nodemask_t *allowed;
        int nid;

        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        /* Modes with a dedicated bulk helper are dispatched here. */
        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                return alloc_pages_bulk_interleave(gfp, pol,
                                                   nr_pages, page_array);
        case MPOL_WEIGHTED_INTERLEAVE:
                return alloc_pages_bulk_weighted_interleave(gfp, pol,
                                                            nr_pages,
                                                            page_array);
        case MPOL_PREFERRED_MANY:
                return alloc_pages_bulk_preferred_many(gfp, numa_node_id(),
                                                       pol, nr_pages,
                                                       page_array);
        default:
                break;
        }

        /* Everything else: one bulk call with the policy's node/nodemask. */
        nid = numa_node_id();
        allowed = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
        return alloc_pages_bulk_noprof(gfp, nid, allowed,
                                       nr_pages, page_array);
}

/*
 * Give @dst its own duplicate of @src's vm_policy.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * mpol_dup(); @dst->vm_policy is left untouched on failure.
 */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        struct mempolicy *copy;

        copy = mpol_dup(src->vm_policy);
        if (!IS_ERR(copy)) {
                dst->vm_policy = copy;
                return 0;
        }

        return PTR_ERR(copy);
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */

/*
 * Slow path of a mempolicy duplicate.
 *
 * Allocates a new mempolicy, copies @old into it and returns it with a
 * reference count of one.  Returns ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
        struct mempolicy *new;

        new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);

        if (old != current->mempolicy) {
                *new = *old;
        } else {
                /* task's mempolicy is protected by alloc_lock */
                task_lock(current);
                *new = *old;
                task_unlock(current);
        }

        /* Keep the copy cpuset-relative while the cpuset is being rebound. */
        if (current_cpuset_is_being_rebound()) {
                nodemask_t allowed = cpuset_mems_allowed(current);

                mpol_rebind_policy(new, &allowed);
        }

        /* The struct copy also copied old's refcount; start afresh. */
        atomic_set(&new->refcnt, 1);
        return new;
}

/*
 * Slow path of a mempolicy comparison.
 *
 * Two policies are equal when mode, flags and home_node match, the user
 * nodemasks match (only compared when @a stores one), and, for every mode
 * except MPOL_LOCAL, the effective nodemasks match.  A NULL policy never
 * compares equal to anything.
 */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return false;

        if (a->mode != b->mode ||
            a->flags != b->flags ||
            a->home_node != b->home_node)
                return false;

        if (mpol_store_user_nodemask(a) &&
            !nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
                return false;

        switch (a->mode) {
        case MPOL_LOCAL:
                /* No nodemask to compare for the local policy. */
                return true;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_WEIGHTED_INTERLEAVE:
                return nodes_equal(a->nodes, b->nodes);
        default:
                BUG();
                return false;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * Find the first element of the shared policy tree intersecting
 * [start, end).  Returns NULL when nothing intersects.  Caller holds
 * sp->lock for reading or for writing.
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
                                        pgoff_t start, pgoff_t end)
{
        struct rb_node *hit = sp->root.rb_node;
        struct rb_node *before;

        /* Descend until some node overlaps the requested range. */
        while (hit) {
                struct sp_node *cur = rb_entry(hit, struct sp_node, nd);

                if (cur->end <= start)
                        hit = hit->rb_right;
                else if (cur->start >= end)
                        hit = hit->rb_left;
                else
                        break;
        }
        if (!hit)
                return NULL;

        /*
         * Step back through predecessors for as long as they still end
         * after @start, so that the earliest such node is returned.
         */
        for (before = rb_prev(hit); before; before = rb_prev(hit)) {
                if (rb_entry(before, struct sp_node, nd)->end <= start)
                        break;
                hit = before;
        }

        return rb_entry(hit, struct sp_node, nd);
}

/*
 * Link a new shared policy node into the tree.  Caller holds sp->lock for
 * writing.  A node that neither starts before nor ends after an existing
 * node on its descent path is a bug.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **link = &sp->root.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct sp_node *cur;

                parent = *link;
                cur = rb_entry(parent, struct sp_node, nd);

                if (new->start < cur->start) {
                        link = &parent->rb_left;
                        continue;
                }
                if (new->end > cur->end) {
                        link = &parent->rb_right;
                        continue;
                }
                BUG();
        }

        rb_link_node(&new->nd, parent, link);
        rb_insert_color(&new->nd, &sp->root);
}

/*
 * Find the shared policy covering page index @idx.
 *
 * Returns the policy with an extra reference taken via mpol_get(), or NULL
 * if the tree is empty or no node covers @idx.
 */
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                                pgoff_t idx)
{
        struct mempolicy *found = NULL;
        struct sp_node *node;

        /* Empty tree: skip taking the lock altogether. */
        if (!sp->root.rb_node)
                return NULL;

        read_lock(&sp->lock);
        node = sp_lookup(sp, idx, idx + 1);
        if (node) {
                found = node->policy;
                mpol_get(found);
        }
        read_unlock(&sp->lock);

        return found;
}

/* Drop the node's policy reference and return the node to sn_cache. */
static void sp_free(struct sp_node *n)
{
        struct mempolicy *pol = n->policy;

        mpol_put(pol);
        kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current folio node is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault
 * @addr: virtual address in @vma for shared policy lookup and interleave policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                   unsigned long addr)
{
        struct mempolicy *pol;
        pgoff_t ilx;
        struct zoneref *z;
        int curnid = folio_nid(folio);
        struct vm_area_struct *vma = vmf->vma;
        int thiscpu = raw_smp_processor_id();
        int thisnid = numa_node_id();
        int polnid = NUMA_NO_NODE;
        int ret = NUMA_NO_NODE;

        /*
         * Make sure ptl is held so that we don't preempt and we
         * have a stable smp processor id
         */
        lockdep_assert_held(vmf->ptl);
        pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
        /* Without MPOL_F_MOF the folio is reported as not misplaced. */
        if (!(pol->flags & MPOL_F_MOF))
                goto out;

        /* Work out polnid, the node this policy would pick for the folio. */
        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                polnid = interleave_nid(pol, ilx);
                break;

        case MPOL_WEIGHTED_INTERLEAVE:
                polnid = weighted_interleave_nid(pol, ilx);
                break;

        case MPOL_PREFERRED:
                /* Already on a node in the policy mask: nothing to do. */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                polnid = first_node(pol->nodes);
                break;

        case MPOL_LOCAL:
                polnid = numa_node_id();
                break;

        case MPOL_BIND:
        case MPOL_PREFERRED_MANY:
                /*
                 * Even though MPOL_PREFERRED_MANY can allocate pages outside
                 * policy nodemask we don't allow numa migration to nodes
                 * outside policy nodemask for now. This is done so that if we
                 * want demotion to slow memory to happen, before allocating
                 * from some DRAM node say 'x', we will end up using a
                 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
                 * we should not promote to node 'x' from slow memory node.
                 */
                if (pol->flags & MPOL_F_MORON) {
                        /*
                         * Optimize placement among multiple nodes
                         * via NUMA balancing
                         */
                        if (node_isset(thisnid, pol->nodes))
                                break;
                        goto out;
                }

                /*
                 * use current page if in policy nodemask,
                 * else select nearest allowed node, if any.
                 * If no allowed nodes, use current [!misplaced].
                 */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                z = first_zones_zonelist(
                                node_zonelist(thisnid, GFP_HIGHUSER),
                                gfp_zone(GFP_HIGHUSER),
                                &pol->nodes);
                polnid = zonelist_node_idx(z);
                break;

        default:
                BUG();
        }

        /* Migrate the folio towards the node whose CPU is referencing it */
        if (pol->flags & MPOL_F_MORON) {
                /* Overrides whatever node the switch above selected. */
                polnid = thisnid;

                if (!should_numa_migrate_memory(current, folio, curnid,
                                                thiscpu))
                        goto out;
        }

        /* Only report a target node when it differs from the current one. */
        if (curnid != polnid)
                ret = polnid;
out:
        /* Every path leaves through here to release the looked-up policy. */
        mpol_cond_put(pol);

        return ret;
}

/*
 * Detach task->mempolicy and drop the (possibly final) reference to it.
 *
 * The pointer is cleared under task_lock() first and the reference is only
 * dropped afterwards, so that any allocation done as part of its
 * kmem_cache_free(), such as by KASAN, doesn't reference a freed policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
        struct mempolicy *detached;

        task_lock(task);
        detached = task->mempolicy;
        task->mempolicy = NULL;
        task_unlock(task);

        mpol_put(detached);
}

/* Unlink @n from the shared policy tree, then free it via sp_free(). */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        struct rb_root *tree = &sp->root;

        rb_erase(&n->nd, tree);
        sp_free(n);
}

/*
 * Fill in the range and policy of a shared policy node.  The embedded
 * rb_node is not touched here.
 */
static void sp_node_init(struct sp_node *node, unsigned long start,
                        unsigned long end, struct mempolicy *pol)
{
        node->policy = pol;
        node->start  = start;
        node->end    = end;
}

/*
 * Allocate a shared policy node for [start, end) carrying a private
 * duplicate of @pol, flagged MPOL_F_SHARED.
 *
 * Returns NULL if either the node or the policy duplicate cannot be
 * allocated.
 */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *node;
        struct mempolicy *dup;

        node = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!node)
                return NULL;

        dup = mpol_dup(pol);
        if (IS_ERR(dup))
                goto free_node;

        dup->flags |= MPOL_F_SHARED;
        sp_node_init(node, start, end, dup);
        return node;

free_node:
        kmem_cache_free(sn_cache, node);
        return NULL;
}

/*
 * Replace a policy range.
 *
 * Removes or trims every existing node overlapping [start, end) and then
 * inserts @new (if non-NULL) under sp->lock.  When an existing node spans
 * the whole new range it has to be split in two, which needs an extra
 * sp_node and mempolicy; those are allocated with the lock dropped and the
 * walk is then restarted from scratch.
 *
 * Returns 0 on success, or -ENOMEM if the split allocations fail (in which
 * case @new has not been inserted).
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
                                 pgoff_t end, struct sp_node *new)
{
        struct sp_node *n;
        struct sp_node *n_new = NULL;
        struct mempolicy *mpol_new = NULL;
        int ret = 0;

restart:
        write_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: n may be deleted below. */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* Old node starts inside the new range. */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                /* Need preallocated pieces for the split. */
                                if (!n_new)
                                        goto alloc_new;

                                /* Tail piece [end, n->end) gets a policy copy. */
                                *mpol_new = *n->policy;
                                atomic_set(&mpol_new->refcnt, 1);
                                sp_node_init(n_new, end, n->end, mpol_new);
                                n->end = start;
                                sp_insert(sp, n_new);
                                /* Ownership moved to the tree. */
                                n_new = NULL;
                                mpol_new = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        write_unlock(&sp->lock);
        ret = 0;

err_out:
        /* Release any preallocated pieces that ended up unused. */
        if (mpol_new)
                mpol_put(mpol_new);
        if (n_new)
                kmem_cache_free(sn_cache, n_new);

        return ret;

alloc_new:
        /* GFP_KERNEL allocations are done with sp->lock released. */
        write_unlock(&sp->lock);
        ret = -ENOMEM;
        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n_new)
                goto err_out;
        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!mpol_new)
                goto err_out;
        /* Give it a valid refcount so err_out can mpol_put() it if unused. */
        atomic_set(&mpol_new->refcnt, 1);
        goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
        int ret;

        sp->root = RB_ROOT;                /* empty tree == default mempolicy */
        rwlock_init(&sp->lock);

        if (mpol) {
                struct sp_node *sn;
                struct mempolicy *npol;
                NODEMASK_SCRATCH(scratch);

                /* No scratch space: leave the tree empty, just drop @mpol. */
                if (!scratch)
                        goto put_mpol;

                /* contextualize the tmpfs mount point mempolicy to this file */
                npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(npol))
                        goto free_scratch; /* no valid nodemask intersection */

                /* mpol_set_nodemask() is called with current's task lock held. */
                task_lock(current);
                ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                if (ret)
                        goto put_npol;

                /* alloc node covering entire file; adds ref to file's npol */
                sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
                /* On allocation failure the tree simply stays empty. */
                if (sn)
                        sp_insert(sp, sn);
                /* Success falls through the labels below to release refs. */
put_npol:
                mpol_put(npol);        /* drop initial ref on file's npol */
free_scratch:
                NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
                mpol_put(mpol);        /* drop our incoming ref on sb mpol */
        }
}

/*
 * Install @pol for the page-offset range covered by @vma in the shared
 * policy tree @sp.  A NULL @pol only removes existing policies from that
 * range.
 *
 * Returns 0 on success or -ENOMEM.
 */
int mpol_set_shared_policy(struct shared_policy *sp,
                        struct vm_area_struct *vma, struct mempolicy *pol)
{
        struct sp_node *new = NULL;
        unsigned long start = vma->vm_pgoff;
        unsigned long end = start + vma_pages(vma);
        int err;

        if (pol) {
                new = sp_alloc(start, end, pol);
                if (!new)
                        return -ENOMEM;
        }

        err = shared_policy_replace(sp, start, end, new);
        /* On failure the node was not inserted and is still ours to free. */
        if (err && new)
                sp_free(new);

        return err;
}

/*
 * Free a backing policy store on inode delete: every node is erased from
 * the tree and freed while holding sp->lock for writing.
 */
void mpol_free_shared_policy(struct shared_policy *sp)
{
        struct rb_node *pos, *following;

        /* Nothing stored: avoid taking the lock. */
        if (!sp->root.rb_node)
                return;

        write_lock(&sp->lock);
        for (pos = rb_first(&sp->root); pos; pos = following) {
                struct sp_node *victim = rb_entry(pos, struct sp_node, nd);

                /* Look up the successor before the node is destroyed. */
                following = rb_next(pos);
                sp_delete(sp, victim);
        }
        write_unlock(&sp->lock);
}

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

/*
 * Apply the initial automatic NUMA balancing state.
 *
 * A numa_balancing= command line override always wins.  Without one, the
 * build-time default is applied, and announced, only when more than one
 * node is online.
 */
static void __init check_numabalancing_enable(void)
{
        bool numabalancing_default =
                IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
        if (numabalancing_override) {
                set_numabalancing_state(numabalancing_override == 1);
                return;
        }

        if (num_online_nodes() <= 1)
                return;

        pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
                numabalancing_default ? "Enabling" : "Disabling");
        set_numabalancing_state(numabalancing_default);
}

/*
 * Parse the numa_balancing= boot parameter.  "enable" and "disable" record
 * an override of 1 and -1 respectively and return 1; anything else
 * (including a missing value) logs a warning and returns 0.
 */
static int __init setup_numabalancing(char *str)
{
        if (str) {
                if (!strcmp(str, "enable")) {
                        numabalancing_override = 1;
                        return 1;
                }
                if (!strcmp(str, "disable")) {
                        numabalancing_override = -1;
                        return 1;
                }
        }

        pr_warn("Unable to parse numa_balancing=\n");
        return 0;
}
__setup("numa_balancing=", setup_numabalancing);
#else
/* Stub for builds without CONFIG_NUMA_BALANCING: nothing to set up. */
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Boot-time mempolicy setup: create the slab caches, initialise the
 * per-node preferred policies, switch the calling task to an interleave
 * policy for system init, and apply the NUMA balancing default.
 */
void __init numa_policy_init(void)
{
        nodemask_t interleave_nodes;
        unsigned long max_pages = 0;
        int biggest_nid = 0;
        int nid;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        for_each_node(nid) {
                struct mempolicy *pref = &preferred_node_policy[nid];

                *pref = (struct mempolicy) {
                        .refcnt = ATOMIC_INIT(1),
                        .mode = MPOL_PREFERRED,
                        .flags = MPOL_F_MOF | MPOL_F_MORON,
                        .nodes = nodemask_of_node(nid),
                };
        }

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(interleave_nodes);
        for_each_node_state(nid, N_MEMORY) {
                unsigned long npages = node_present_pages(nid);

                /* Remember the largest node as the fallback. */
                if (npages > max_pages) {
                        max_pages = npages;
                        biggest_nid = nid;
                }

                /* Too small to take part in interleaving. */
                if ((npages << PAGE_SHIFT) < (16 << 20))
                        continue;

                node_set(nid, interleave_nodes);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(interleave_nodes)))
                node_set(biggest_nid, interleave_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
                pr_err("%s: interleaving failed\n", __func__);

        check_numabalancing_enable();
}

/*
 * Reset policy of current process to default.
 *
 * Requests MPOL_DEFAULT with no flags and no nodemask; the result of
 * do_set_mempolicy() is not checked.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */
/*
 * Mode names indexed by MPOL_* mode value.  mpol_parse_str() matches its
 * input against these strings and mpol_to_str() prints them.
 */
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
        [MPOL_LOCAL]      = "local",
        [MPOL_PREFERRED_MANY]  = "prefer (many)",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *        <mode>[=<flags>][:<nodelist>]
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        struct mempolicy *new = NULL;
        unsigned short mode_flags;
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1, mode;

        /*
         * @str is split in place by overwriting the '=' and ':' separators
         * with NULs; both are put back at the "out" label.
         */
        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                /* Every listed node must be a node with memory. */
                if (!nodes_subset(nodes, node_states[N_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        mode = match_string(policy_modes, MPOL_MAX, str);
        if (mode < 0)
                goto out;

        /* Per-mode validation of the presence and shape of the nodelist. */
        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only, although later
                 * we use first_node(nodes) to grab a single node, so here
                 * nodelist (or nodes) cannot be empty.
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        /* Anything after the leading digits is rejected. */
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                        if (nodes_empty(nodes))
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist;  mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on a empty nodelist
                 */
                if (!nodelist)
                        err = 0;
                /* Success here stores new, which is still NULL, in *mpol. */
                goto out;
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        /*
         * Save nodes for mpol_to_str() to show the tmpfs mount options
         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
         */
        if (mode != MPOL_PREFERRED) {
                new->nodes = nodes;
        } else if (nodelist) {
                nodes_clear(new->nodes);
                node_set(first_node(nodes), new->nodes);
        } else {
                /* "prefer" given without a nodelist is stored as local. */
                new->mode = MPOL_LOCAL;
        }

        /*
         * Save nodes for contextualization: this will be used to "clone"
         * the mempolicy in a specific context [cpuset] at a later time.
         */
        new->w.user_nodemask = nodes;

        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        /* *mpol is written only on success. */
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
 * interleave", plus the longest flag string, "relative|balancing", and to
 * display at least a few node ids.
 *
 * A NULL @pol, the default policy and the per-node preferred policies are
 * all printed as "default".
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        char *end = buffer + maxlen;
        nodemask_t nodes = NODE_MASK_NONE;
        unsigned short mode = MPOL_DEFAULT;
        unsigned short flags = 0;

        if (pol &&
            pol != &default_policy &&
            !(pol >= &preferred_node_policy[0] &&
              pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
                mode = pol->mode;
                flags = pol->flags;
        }

        switch (mode) {
        case MPOL_DEFAULT:
        case MPOL_LOCAL:
                break;
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                nodes = pol->nodes;
                break;
        default:
                WARN_ON_ONCE(1);
                snprintf(p, maxlen, "unknown");
                return;
        }

        /*
         * Use scnprintf() throughout: it returns the number of characters
         * actually stored, so @p can never move past @end and the remaining
         * size "end - p" can never go negative when the output is truncated.
         * snprintf() returns the would-be length instead, which would push
         * @p beyond the buffer on truncation.
         */
        p += scnprintf(p, end - p, "%s", policy_modes[mode]);

        if (flags & MPOL_MODE_FLAGS) {
                p += scnprintf(p, end - p, "=");

                /*
                 * Static and relative are mutually exclusive.
                 */
                if (flags & MPOL_F_STATIC_NODES)
                        p += scnprintf(p, end - p, "static");
                else if (flags & MPOL_F_RELATIVE_NODES)
                        p += scnprintf(p, end - p, "relative");

                if (flags & MPOL_F_NUMA_BALANCING) {
                        /* More than one mode flag set: add a separator. */
                        if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
                                p += scnprintf(p, end - p, "|");
                        p += scnprintf(p, end - p, "balancing");
                }
        }

        if (!nodes_empty(nodes))
                p += scnprintf(p, end - p, ":%*pbl",
                               nodemask_pr_args(&nodes));
}

#ifdef CONFIG_SYSFS
/*
 * Per-node sysfs attribute for a weighted interleave weight.  The show and
 * store handlers recover this structure from the embedded kobj_attribute
 * with container_of() and use @nid to select the node's weight.
 */
struct iw_node_attr {
        struct kobj_attribute kobj_attr;        /* embedded sysfs attribute */
        int nid;                                /* node this attribute is for */
};

/*
 * The weighted interleave sysfs group: its kobject plus the per-node
 * attributes, indexed by node id.
 */
struct sysfs_wi_group {
        struct kobject wi_kobj;
        struct mutex kobj_lock;         /* held while reading/updating nattrs[] */
        struct iw_node_attr *nattrs[];  /* slot is NULL when the node has no attr */
};

/* The single group instance; freed by wi_kobj_release(). */
static struct sysfs_wi_group *wi_group;

/* sysfs show handler: print the interleave weight of the attribute's node. */
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
                         char *buf)
{
        struct iw_node_attr *na =
                container_of(attr, struct iw_node_attr, kobj_attr);
        u8 weight = get_il_weight(na->nid);

        return sysfs_emit(buf, "%d\n", weight);
}

/*
 * sysfs store handler: set the interleave weight of the attribute's node.
 *
 * The input must parse as a non-zero u8.  A new state table is built from
 * the current one (or from all-ones if none exists yet), the node's entry
 * is updated, auto mode is switched off, and the table is published under
 * wi_state_lock.  The old table is freed after an RCU grace period.
 *
 * Returns @count on success, -EINVAL on bad input, -ENOMEM on allocation
 * failure.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
                          const char *buf, size_t count)
{
        struct weighted_interleave_state *fresh, *stale;
        struct iw_node_attr *na =
                container_of(attr, struct iw_node_attr, kobj_attr);
        u8 weight = 0;
        int i;

        if (count == 0 || sysfs_streq(buf, ""))
                return -EINVAL;
        if (kstrtou8(buf, 0, &weight) || weight == 0)
                return -EINVAL;

        fresh = kzalloc(struct_size(fresh, iw_table, nr_node_ids),
                        GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        mutex_lock(&wi_state_lock);
        stale = rcu_dereference_protected(wi_state,
                                        lockdep_is_held(&wi_state_lock));
        if (!stale) {
                /* No table yet: every node starts with weight 1. */
                for (i = 0; i < nr_node_ids; i++)
                        fresh->iw_table[i] = 1;
        } else {
                memcpy(fresh->iw_table, stale->iw_table,
                       nr_node_ids * sizeof(u8));
        }
        fresh->iw_table[na->nid] = weight;
        fresh->mode_auto = false;

        rcu_assign_pointer(wi_state, fresh);
        mutex_unlock(&wi_state_lock);

        if (!stale)
                return count;

        /* Wait for readers of the old table before freeing it. */
        synchronize_rcu();
        kfree(stale);
        return count;
}

/*
 * sysfs show handler for the "auto" attribute.  Prints the mode_auto flag
 * of the current state, or "true" when no state has been installed yet.
 */
static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
                struct kobj_attribute *attr, char *buf)
{
        struct weighted_interleave_state *state;
        bool wi_auto;

        rcu_read_lock();
        state = rcu_dereference(wi_state);
        wi_auto = state ? state->mode_auto : true;
        rcu_read_unlock();

        return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
}

/*
 * sysfs store handler for the "auto" attribute.
 *
 * Writing false keeps the current weights but clears mode_auto; if auto is
 * already off nothing is changed.  Writing true recomputes the weights
 * from node_bw_table via reduce_interleave_weights() and sets mode_auto.
 * The new state is published under wi_state_lock and the one it replaces
 * is freed after an RCU grace period.
 *
 * Returns @count on success, -EINVAL if the input is not a boolean,
 * -ENOMEM on allocation failure, or -ENODEV if auto mode is requested but
 * node_bw_table is NULL.
 */
static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
        unsigned int *bw;
        bool input;
        int i;

        if (kstrtobool(buf, &input))
                return -EINVAL;

        new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
                               GFP_KERNEL);
        if (!new_wi_state)
                return -ENOMEM;
        for (i = 0; i < nr_node_ids; i++)
                new_wi_state->iw_table[i] = 1;

        mutex_lock(&wi_state_lock);
        /*
         * Always pick up the currently published state, whichever way the
         * switch is flipped: it is the object being replaced below and must
         * be freed after the grace period.  Fetching it only in the !input
         * branch leaked the old state every time auto mode was enabled.
         */
        old_wi_state = rcu_dereference_protected(wi_state,
                                        lockdep_is_held(&wi_state_lock));
        if (!input) {
                if (!old_wi_state)
                        goto update_wi_state;
                if (input == old_wi_state->mode_auto) {
                        /* Already in manual mode: discard the unused copy. */
                        mutex_unlock(&wi_state_lock);
                        kfree(new_wi_state);
                        return count;
                }

                memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
                                               nr_node_ids * sizeof(u8));
                goto update_wi_state;
        }

        bw = node_bw_table;
        if (!bw) {
                /* The old state stays published; only drop the new one. */
                mutex_unlock(&wi_state_lock);
                kfree(new_wi_state);
                return -ENODEV;
        }

        new_wi_state->mode_auto = true;
        reduce_interleave_weights(bw, new_wi_state->iw_table);

update_wi_state:
        rcu_assign_pointer(wi_state, new_wi_state);
        mutex_unlock(&wi_state_lock);
        if (old_wi_state) {
                /* Wait for readers of the old table before freeing it. */
                synchronize_rcu();
                kfree(old_wi_state);
        }
        return count;
}

/*
 * Remove the per-node sysfs attribute registered for @nid, if any.
 * Out-of-range node ids and empty slots are silently ignored.
 */
static void sysfs_wi_node_delete(int nid)
{
        struct iw_node_attr *victim;

        if (nid < 0 || nid >= nr_node_ids)
                return;

        /* Detach the slot under the lock; the sysfs teardown runs outside it. */
        mutex_lock(&wi_group->kobj_lock);
        victim = wi_group->nattrs[nid];
        if (victim)
                wi_group->nattrs[nid] = NULL;
        mutex_unlock(&wi_group->kobj_lock);

        if (!victim)
                return;

        sysfs_remove_file(&wi_group->wi_kobj, &victim->kobj_attr.attr);
        kfree(victim->kobj_attr.attr.name);
        kfree(victim);
}

/* Remove the sysfs attribute of every possible node id, in ascending order. */
static void sysfs_wi_node_delete_all(void)
{
        int nid = 0;

        while (nid < nr_node_ids) {
                sysfs_wi_node_delete(nid);
                nid++;
        }
}

static void wi_state_free(void)
{
        struct weighted_interleave_state *old_wi_state;

        mutex_lock(&wi_state_lock);
        old_wi_state = rcu_dereference_protected(wi_state,
                        lockdep_is_held(&wi_state_lock));
        rcu_assign_pointer(wi_state, NULL);
        mutex_unlock(&wi_state_lock);

        if (old_wi_state) {
                synchronize_rcu();
                kfree(old_wi_state);
        }
}

/*
 * sysfs attribute named "auto" (mode 0664), read through
 * weighted_interleave_auto_show() and written through
 * weighted_interleave_auto_store().
 */
static struct kobj_attribute wi_auto_attr =
        __ATTR(auto, 0664, weighted_interleave_auto_show,
                           weighted_interleave_auto_store);

/*
 * Tear down everything hanging off the weighted_interleave kobject: the
 * "auto" file first, then every per-node file, and finally the published
 * weight state.
 */
static void wi_cleanup(void)
{
        struct kobject *group_kobj = &wi_group->wi_kobj;

        sysfs_remove_file(group_kobj, &wi_auto_attr.attr);
        sysfs_wi_node_delete_all();
        wi_state_free();
}

/*
 * kobject release callback (installed through wi_ktype): frees the global
 * wi_group.  @wi_kobj itself is not used.
 */
static void wi_kobj_release(struct kobject *wi_kobj)
{
        kfree(wi_group);
}

/*
 * kobj_type for the weighted_interleave kobject: generic kobject sysfs ops,
 * with wi_kobj_release() as the release callback.
 */
static const struct kobj_type wi_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
        .release = wi_kobj_release,
};

static int sysfs_wi_node_add(int nid)
{
        int ret;
        char *name;
        struct iw_node_attr *new_attr;

        if (nid < 0 || nid >= nr_node_ids) {
                pr_err("invalid node id: %d\n", nid);
                return -EINVAL;
        }

        new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
        if (!new_attr)
                return -ENOMEM;

        name = kasprintf(GFP_KERNEL, "node%d", nid);
        if (!name) {
                kfree(new_attr);
                return -ENOMEM;
        }

        sysfs_attr_init(&new_attr->kobj_attr.attr);
        new_attr->kobj_attr.attr.name = name;
        new_attr->kobj_attr.attr.mode = 0644;
        new_attr->kobj_attr.show = node_show;
        new_attr->kobj_attr.store = node_store;
        new_attr->nid = nid;

        mutex_lock(&wi_group->kobj_lock);
        if (wi_group->nattrs[nid]) {
                mutex_unlock(&wi_group->kobj_lock);
                ret = -EEXIST;
                goto out;
        }

        ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
        if (ret) {
                mutex_unlock(&wi_group->kobj_lock);
                goto out;
        }
        wi_group->nattrs[nid] = new_attr;
        mutex_unlock(&wi_group->kobj_lock);
        return 0;

out:
        kfree(new_attr->kobj_attr.attr.name);
        kfree(new_attr);
        return ret;
}

/*
 * Node hotplug notifier: adds the node's sysfs file when it gains its first
 * memory and removes it when it loses its last memory.  Other actions are
 * ignored.  Always returns NOTIFY_OK; a failed add is only logged.
 */
static int wi_node_notifier(struct notifier_block *nb,
                               unsigned long action, void *data)
{
        struct node_notify *notify = data;
        int nid = notify->nid;
        int err;

        if (action == NODE_ADDED_FIRST_MEMORY) {
                err = sysfs_wi_node_add(nid);
                if (err)
                        pr_err("failed to add sysfs for node%d during hotplug: %d\n",
                               nid, err);
        } else if (action == NODE_REMOVED_LAST_MEMORY) {
                sysfs_wi_node_delete(nid);
        }

        return NOTIFY_OK;
}

/*
 * add_weighted_interleave_group() - create the "weighted_interleave" sysfs
 * directory under @mempolicy_kobj.
 *
 * Allocates the global wi_group (with one attribute slot per possible node
 * id), registers its kobject, creates the "auto" file and one file for every
 * online node that has memory, then registers the node hotplug notifier.
 *
 * Returns 0 on success or a negative errno.  On failure the kobject reference
 * is dropped; wi_ktype's release callback is what frees wi_group.
 */
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
        int nid, err;

        wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
                           GFP_KERNEL);
        if (!wi_group)
                return -ENOMEM;
        mutex_init(&wi_group->kobj_lock);

        err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
                                   "weighted_interleave");
        if (err)
                goto err_put_kobj;

        err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
        if (err)
                goto err_put_kobj;

        /* Only nodes that currently have memory get a weight file. */
        for_each_online_node(nid) {
                if (!node_state(nid, N_MEMORY))
                        continue;

                err = sysfs_wi_node_add(nid);
                if (err) {
                        pr_err("failed to add sysfs for node%d during init: %d\n",
                               nid, err);
                        goto err_cleanup_kobj;
                }
        }

        hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
        return 0;

err_cleanup_kobj:
        /* Remove the files created so far before unlinking the kobject. */
        wi_cleanup();
        kobject_del(&wi_group->wi_kobj);
err_put_kobj:
        kobject_put(&wi_group->wi_kobj);
        return err;
}

/*
 * Late initcall: create the "mempolicy" kobject under mm_kobj and populate it
 * with the weighted_interleave group.  Returns 0 on success, -ENOMEM if the
 * kobject cannot be created, or the error from
 * add_weighted_interleave_group(), in which case the kobject is removed
 * again.
 */
static int __init mempolicy_sysfs_init(void)
{
        static struct kobject *mempolicy_kobj;
        int err;

        mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
        if (!mempolicy_kobj)
                return -ENOMEM;

        err = add_weighted_interleave_group(mempolicy_kobj);
        if (!err)
                return 0;

        /* Group setup failed: undo the kobject registration. */
        kobject_del(mempolicy_kobj);
        kobject_put(mempolicy_kobj);
        return err;
}

late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * RCU-based infrastructure for lightweight reader-writer locking
 *
 * Copyright (c) 2015, Red Hat, Inc.
 *
 * Author: Oleg Nesterov <oleg@redhat.com>
 */

#ifndef _LINUX_RCU_SYNC_H_
#define _LINUX_RCU_SYNC_H_

#include <linux/wait.h>
#include <linux/rcupdate.h>

/*
 * Structure to mediate between updaters and fastpath-using readers.
 *
 * gp_state is the only field read in this header: rcu_sync_is_idle() treats
 * a zero value as the idle state (GP_IDLE).  The remaining fields are used by
 * the out-of-line rcu_sync_*() functions declared below.
 */
struct rcu_sync {
        int                        gp_state;
        int                        gp_count;
        wait_queue_head_t        gp_wait;

        struct rcu_head                cb_head;
};

/**
 * rcu_sync_is_idle() - May readers take their fastpaths right now?
 * @rsp: Pointer to the rcu_sync structure used for synchronization
 *
 * Must be called from within some flavor of RCU read-side critical section;
 * a lockdep splat is emitted otherwise.
 *
 * Returns true when @rsp->gp_state is zero (GP_IDLE), i.e. when readers are
 * permitted to use their fastpaths.
 */
static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
                         "suspicious rcu_sync_is_idle() usage");

        /* Zero is the GP_IDLE state. */
        return READ_ONCE(rsp->gp_state) == 0;
}

/*
 * Out-of-line rcu_sync operations; their implementations are not part of this
 * header.  NOTE(review): judging by the names, enter/exit bracket an updater's
 * section and dtor tears the structure down -- confirm against the
 * implementation.
 */
extern void rcu_sync_init(struct rcu_sync *);
extern void rcu_sync_enter(struct rcu_sync *);
extern void rcu_sync_exit(struct rcu_sync *);
extern void rcu_sync_dtor(struct rcu_sync *);

/*
 * Static initializer for a struct rcu_sync called @name: gp_state and
 * gp_count start at zero and the wait queue head is statically initialized.
 * cb_head is left to default (zero) initialization.
 */
#define __RCU_SYNC_INITIALIZER(name) {                                        \
                .gp_state = 0,                                                \
                .gp_count = 0,                                                \
                .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),        \
        }

/* Define a struct rcu_sync variable @name initialized as above. */
#define        DEFINE_RCU_SYNC(name)        \
        struct rcu_sync name = __RCU_SYNC_INITIALIZER(name)

#endif /* _LINUX_RCU_SYNC_H_ */
































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * include/linux/idr.h
 * 
 * 2002-10-18  written by Jim Houston jim.houston@ccur.com
 *        Copyright (C) 2002 by Concurrent Computer Corporation
 *
 * Small id to pointer translation service avoiding fixed sized
 * tables.
 */

#ifndef __IDR_H__
#define __IDR_H__

#include <linux/radix-tree.h>
#include <linux/gfp.h>
#include <linux/percpu.h>
#include <linux/cleanup.h>

/*
 * An IDR: a radix tree plus two allocator parameters.
 *  - idr_rt:   the underlying radix tree; the idr_lock*() helpers lock it.
 *  - idr_base: base value set by idr_init_base() / IDR_INIT_BASE().
 *  - idr_next: cursor of the cyclic allocator, see idr_get_cursor() and
 *              idr_set_cursor().
 */
struct idr {
        struct radix_tree_root        idr_rt;
        unsigned int                idr_base;
        unsigned int                idr_next;
};

/*
 * The IDR API does not expose the tagging functionality of the radix tree
 * to users.  Use tag 0 to track whether a node has free space below it.
 */
#define IDR_FREE        0

/* Set the IDR flag and the IDR_FREE tag */
#define IDR_RT_MARKER        (ROOT_IS_IDR | (__force gfp_t)                        \
                                        (1 << (ROOT_TAG_SHIFT + IDR_FREE)))

/*
 * Static initializer for an IDR called @name whose base value is @base.
 * The cyclic-allocation cursor starts at 0.
 */
#define IDR_INIT_BASE(name, base) {                                        \
        .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER),                        \
        .idr_base = (base),                                                \
        .idr_next = 0,                                                        \
}

/**
 * IDR_INIT() - Initialise an IDR.
 * @name: Name of IDR.
 *
 * A freshly-initialised IDR contains no IDs.
 */
#define IDR_INIT(name)        IDR_INIT_BASE(name, 0)

/**
 * DEFINE_IDR() - Define a statically-allocated IDR.
 * @name: Name of IDR.
 *
 * An IDR defined using this macro is ready for use with no additional
 * initialisation required.  It contains no IDs.
 */
#define DEFINE_IDR(name)        struct idr name = IDR_INIT(name)

/**
 * idr_get_cursor() - Read the current position of the cyclic allocator.
 * @idr: idr handle
 *
 * The returned value is the ID that idr_alloc_cyclic() will hand out next if
 * it is free; otherwise the search starts from this position.
 */
static inline unsigned int idr_get_cursor(const struct idr *idr)
{
        unsigned int cursor = READ_ONCE(idr->idr_next);

        return cursor;
}

/**
 * idr_set_cursor - Set the current position of the cyclic allocator
 * @idr: idr handle
 * @val: new position
 *
 * The next call to idr_alloc_cyclic() will return @val if it is free
 * (otherwise the search will start from this position).
 *
 * The cursor is stored with WRITE_ONCE(), matching the READ_ONCE() in
 * idr_get_cursor().
 */
static inline void idr_set_cursor(struct idr *idr, unsigned int val)
{
        WRITE_ONCE(idr->idr_next, val);
}

/**
 * DOC: idr sync
 * idr synchronization (stolen from radix-tree.h)
 *
 * idr_find() is able to be called locklessly, using RCU. The caller must
 * ensure calls to this function are made within rcu_read_lock() regions.
 * Other readers (lock-free or otherwise) and modifications may be running
 * concurrently.
 *
 * It is still required that the caller manage the synchronization and
 * lifetimes of the items. So if RCU lock-free lookups are used, typically
 * this would mean that the items have their own locks, or are amenable to
 * lock-free access; and that the items are freed by RCU (or only freed after
 * having been deleted from the idr tree *and* a synchronize_rcu() grace
 * period).
 */

/*
 * Locking helpers: each one forwards to the xa_lock*() / xa_unlock*() variant
 * of the same suffix, applied to the IDR's embedded radix tree root.
 */
#define idr_lock(idr)                xa_lock(&(idr)->idr_rt)
#define idr_unlock(idr)                xa_unlock(&(idr)->idr_rt)
#define idr_lock_bh(idr)        xa_lock_bh(&(idr)->idr_rt)
#define idr_unlock_bh(idr)        xa_unlock_bh(&(idr)->idr_rt)
#define idr_lock_irq(idr)        xa_lock_irq(&(idr)->idr_rt)
#define idr_unlock_irq(idr)        xa_unlock_irq(&(idr)->idr_rt)
#define idr_lock_irqsave(idr, flags) \
                                xa_lock_irqsave(&(idr)->idr_rt, flags)
#define idr_unlock_irqrestore(idr, flags) \
                                xa_unlock_irqrestore(&(idr)->idr_rt, flags)

/*
 * Out-of-line IDR API.  The implementations are not part of this header; see
 * their own kernel-doc for the exact contracts.  Each idr_preload() must be
 * paired with idr_preload_end().
 */
void idr_preload(gfp_t gfp_mask);

int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t);
int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id,
                                unsigned long max, gfp_t);
int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t);
void *idr_remove(struct idr *, unsigned long id);
void *idr_find(const struct idr *, unsigned long id);
int idr_for_each(const struct idr *,
                 int (*fn)(int id, void *p, void *data), void *data);
void *idr_get_next(struct idr *, int *nextid);
void *idr_get_next_ul(struct idr *, unsigned long *nextid);
void *idr_replace(struct idr *, void *, unsigned long id);
void idr_destroy(struct idr *);

/*
 * Scope-based cleanup support for idr_alloc().
 *
 * struct __class_idr pairs an IDR with an ID allocated from it.  The
 * idr_alloc class below allocates the ID when the variable is constructed and
 * calls idr_remove() at scope exit if the stored ID is non-negative.
 */
struct __class_idr {
        struct idr *idr;
        int id;
};

/* Sentinel value: no IDR and a negative ID, so the destructor does nothing. */
#define idr_null ((struct __class_idr){ NULL, -1 })
/*
 * Hand the ID over to the caller and leave idr_null behind.
 * NOTE(review): relies on __get_and_null() from <linux/cleanup.h> -- confirm
 * its exact semantics there.
 */
#define take_idr_id(id) __get_and_null(id, idr_null)

DEFINE_CLASS(idr_alloc, struct __class_idr,
             if (_T.id >= 0) idr_remove(_T.idr, _T.id),
             ((struct __class_idr){
                     .idr = idr,
                .id = idr_alloc(idr, ptr, start, end, gfp),
             }),
             struct idr *idr, void *ptr, int start, int end, gfp_t gfp);

/**
 * idr_init_base() - Initialise an IDR with a base value.
 * @idr: IDR handle.
 * @base: The base value for the IDR.
 *
 * Like idr_init(), but the resulting IDR allocates IDs starting at @base.
 * The cyclic-allocation cursor is reset to 0.
 */
static inline void idr_init_base(struct idr *idr, int base)
{
        struct radix_tree_root *root = &idr->idr_rt;

        INIT_RADIX_TREE(root, IDR_RT_MARKER);
        idr->idr_next = 0;
        idr->idr_base = base;
}

/**
 * idr_init() - Initialise an IDR.
 * @idr: IDR handle.
 *
 * Initialise a dynamically allocated IDR.  To initialise a
 * statically allocated IDR, use DEFINE_IDR().
 *
 * Equivalent to idr_init_base() with a base of 0.
 */
static inline void idr_init(struct idr *idr)
{
        idr_init_base(idr, 0);
}

/**
 * idr_is_empty() - Are there any IDs allocated?
 * @idr: IDR handle.
 *
 * Return: %true if no IDs are currently allocated from this IDR, i.e. the
 * underlying radix tree is empty and its root carries the IDR_FREE tag;
 * %false otherwise.
 */
static inline bool idr_is_empty(const struct idr *idr)
{
        return radix_tree_empty(&idr->idr_rt) &&
                radix_tree_tagged(&idr->idr_rt, IDR_FREE);
}

/**
 * idr_preload_end - end preload section started with idr_preload()
 *
 * Each idr_preload() should be matched with an invocation of this
 * function.  See idr_preload() for details.
 *
 * Releases the local lock on radix_tree_preloads.
 */
static inline void idr_preload_end(void)
{
        local_unlock(&radix_tree_preloads.lock);
}

/**
 * idr_for_each_entry() - Iterate over an IDR's elements of a given type.
 * @idr: IDR handle.
 * @entry: The type * to use as cursor
 * @id: Entry ID.
 *
 * @entry and @id do not need to be initialized before the loop, and
 * after normal termination @entry is left with the value NULL.  This
 * is convenient for a "not found" value.
 *
 * Iteration starts at ID 0.  @id is passed by address to idr_get_next(), so
 * it must be an lvalue of the type that function expects (int).  The step
 * is written as ``id += 1U`` so that the addition is done in unsigned
 * arithmetic.
 */
#define idr_for_each_entry(idr, entry, id)                        \
        for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U)

/**
 * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type.
 * @idr: IDR handle.
 * @entry: The type * to use as cursor.
 * @tmp: A temporary placeholder for ID.
 * @id: Entry ID.
 *
 * @entry and @id do not need to be initialized before the loop, and
 * after normal termination @entry is left with the value NULL.  This
 * is convenient for a "not found" value.
 *
 * @tmp remembers the ID of the previous iteration.  The loop only calls
 * idr_get_next_ul() while @tmp <= @id, so it terminates once ``++id`` wraps
 * around to a value smaller than the previous ID.
 */
#define idr_for_each_entry_ul(idr, entry, tmp, id)                        \
        for (tmp = 0, id = 0;                                                \
             ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
             tmp = id, ++id)

/**
 * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type
 * @idr: IDR handle.
 * @entry: The type * to use as a cursor.
 * @id: Entry ID.
 *
 * Continue to iterate over entries, continuing after the current position.
 *
 * @id must already hold the position to resume from; the first lookup is
 * done at @id itself.  The loop ends when idr_get_next() returns NULL, so
 * after normal termination @entry is NULL.
 */
#define idr_for_each_entry_continue(idr, entry, id)                        \
        for ((entry) = idr_get_next((idr), &(id));                        \
             entry;                                                        \
             ++id, (entry) = idr_get_next((idr), &(id)))

/**
 * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type
 * @idr: IDR handle.
 * @entry: The type * to use as a cursor.
 * @tmp: A temporary placeholder for ID.
 * @id: Entry ID.
 *
 * Continue to iterate over entries, continuing after the current position.
 * After normal termination @entry is left with the value NULL.  This
 * is convenient for a "not found" value.
 *
 * @id must already hold the position to resume from.  @tmp tracks the
 * previous ID so that the loop stops once ``++id`` wraps around.
 */
#define idr_for_each_entry_continue_ul(idr, entry, tmp, id)                \
        for (tmp = id;                                                        \
             ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
             tmp = id, ++id)

/*
 * IDA - ID Allocator, use when translation from id to pointer isn't necessary.
 */
#define IDA_CHUNK_SIZE                128        /* 128 bytes per chunk */
/* Number of longs, and of bits, in one chunk-sized bitmap. */
#define IDA_BITMAP_LONGS        (IDA_CHUNK_SIZE / sizeof(long))
#define IDA_BITMAP_BITS         (IDA_BITMAP_LONGS * sizeof(long) * 8)

/* One IDA_CHUNK_SIZE-byte bitmap. */
struct ida_bitmap {
        unsigned long                bitmap[IDA_BITMAP_LONGS];
};

/* An IDA is a thin wrapper around an XArray. */
struct ida {
        struct xarray xa;
};

/* XArray flags used by both the static and the dynamic IDA initializers. */
#define IDA_INIT_FLAGS        (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC)

/* Static initializer for an IDA called @name. */
#define IDA_INIT(name)        {                                                \
        .xa = XARRAY_INIT(name, IDA_INIT_FLAGS)                                \
}
/* Define a statically-allocated IDA called @name, ready for use. */
#define DEFINE_IDA(name)        struct ida name = IDA_INIT(name)

/*
 * Out-of-line IDA API; the implementations are not part of this header.  The
 * inline helpers below are wrappers around ida_alloc_range() and
 * ida_find_first_range().
 */
int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t);
void ida_free(struct ida *, unsigned int id);
void ida_destroy(struct ida *ida);
int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max);

/**
 * ida_alloc() - Allocate an unused ID.
 * @ida: IDA handle.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between 0 and %INT_MAX, inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc(struct ida *ida, gfp_t gfp)
{
        /* All bits set: no caller-imposed upper bound. */
        const unsigned int no_upper_bound = ~0U;

        return ida_alloc_range(ida, 0, no_upper_bound, gfp);
}

/**
 * ida_alloc_min() - Allocate an unused ID.
 * @ida: IDA handle.
 * @min: Lowest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between @min and %INT_MAX, inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp)
{
        /* All bits set: no caller-imposed upper bound. */
        const unsigned int no_upper_bound = ~0U;

        return ida_alloc_range(ida, min, no_upper_bound, gfp);
}

/**
 * ida_alloc_max() - Allocate an unused ID.
 * @ida: IDA handle.
 * @max: Highest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between 0 and @max, inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp)
{
        /* Thin wrapper: ida_alloc_range() with a lower bound of 0. */
        return ida_alloc_range(ida, 0, max, gfp);
}

/**
 * ida_init() - Initialise a dynamically allocated IDA.
 * @ida: IDA handle.
 *
 * Initialises the embedded XArray with IDA_INIT_FLAGS, the same flags the
 * static initializer IDA_INIT() uses.
 */
static inline void ida_init(struct ida *ida)
{
        xa_init_flags(&ida->xa, IDA_INIT_FLAGS);
}

/**
 * ida_is_empty() - Is the IDA's backing XArray empty?
 * @ida: IDA handle.
 *
 * Return: the result of xa_empty() on the embedded XArray.
 */
static inline bool ida_is_empty(const struct ida *ida)
{
        return xa_empty(&ida->xa);
}

/**
 * ida_exists() - Check whether an ID is currently allocated.
 * @ida: IDA handle.
 * @id: ID to look up.
 *
 * Return: %true if ida_find_first_range() restricted to [@id, @id] reports
 * exactly @id; %false otherwise, including when it returns a negative value.
 */
static inline bool ida_exists(struct ida *ida, unsigned int id)
{
        int found = ida_find_first_range(ida, id, id);

        /*
         * Check the sign before comparing: a negative return value converted
         * to unsigned would otherwise compare equal to a very large @id.
         */
        return found >= 0 && (unsigned int)found == id;
}

/**
 * ida_find_first() - Search the whole ID space of an IDA.
 * @ida: IDA handle.
 *
 * Return: whatever ida_find_first_range() returns for the range starting at
 * 0 with an all-ones upper bound.
 */
static inline int ida_find_first(struct ida *ida)
{
        /* All bits set: no caller-imposed upper bound. */
        const unsigned int no_upper_bound = ~0U;

        return ida_find_first_range(ida, 0, no_upper_bound);
}
#endif /* __IDR_H__ */





























































































































































































































































































































































































































































































    3 




    3 
    2 




    2 



    2 





    1 

    1 












    1 





    1 
    1 

    1 



    1 



    1 

    3 



    1 
    2 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "IPsec: " fmt

#include <crypto/hash.h>
#include <crypto/utils.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ah.h>
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
#include <linux/scatterlist.h>
#include <net/icmp.h>
#include <net/protocol.h>

/*
 * AH private view of skb->cb.  @tmp holds the scratch buffer obtained from
 * ah_alloc_tmp() so that the asynchronous completion callbacks can find it
 * and free it.
 */
struct ah_skb_cb {
        struct xfrm_skb_cb xfrm;
        void *tmp;
};

/* Reinterpret the start of an skb's control buffer as struct ah_skb_cb. */
#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))

/*
 * Allocate (GFP_ATOMIC) one scratch buffer laid out as:
 *   @size caller bytes | digest | aligned ahash_request + request context |
 *   aligned array of @nfrags scatterlist entries
 * Returns NULL on allocation failure; the caller frees it with kfree().
 */
static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
                          unsigned int size)
{
        unsigned int total = size;

        /* Caller area followed by room for the computed digest. */
        total += crypto_ahash_digestsize(ahash);
        total = ALIGN(total, crypto_tfm_ctx_alignment());

        /* Hash request and its per-request context. */
        total += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
        total = ALIGN(total, __alignof__(struct scatterlist));

        /* Scatterlist table. */
        total += sizeof(struct scatterlist) * nfrags;

        return kmalloc(total, GFP_ATOMIC);
}

/* Return the address @offset bytes into the scratch buffer @tmp. */
static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
{
        u8 *base = tmp;

        return base + offset;
}

/* Return the address @offset bytes past @tmp, where the ICV is kept. */
static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset)
{
        u8 *base = tmp;

        return base + offset;
}

/*
 * Locate the ahash_request that follows the ICV inside the scratch buffer
 * (aligned to the crypto context alignment) and bind it to @ahash.
 */
static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
                                               u8 *icv)
{
        u8 *after_icv = icv + crypto_ahash_digestsize(ahash);
        struct ahash_request *req;

        req = (void *)PTR_ALIGN(after_icv, crypto_tfm_ctx_alignment());
        ahash_request_set_tfm(req, ahash);

        return req;
}

/*
 * Locate the scatterlist table that follows @req and its request context,
 * aligned for struct scatterlist.
 */
static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
                                             struct ahash_request *req)
{
        unsigned long req_end = (unsigned long)(req + 1) +
                                crypto_ahash_reqsize(ahash);

        return (void *)ALIGN(req_end, __alignof__(struct scatterlist));
}

/*
 * Zero the mutable IP options that follow @iph so they do not contribute to
 * the ICV.  For a source-route option (LSRR/SSRR) the last address in the
 * option is first copied to @daddr.  The options have already been validated
 * by the caller's path, so only basic length checks are made here.
 *
 * Returns 0 on success or -EINVAL on a malformed option length.
 */
static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
{
        unsigned char *opt = (unsigned char *)(iph + 1);
        int remaining = iph->ihl * 4 - sizeof(struct iphdr);

        while (remaining > 0) {
                const unsigned char type = *opt;
                int optlen;

                /* Single-byte options carry no length field. */
                if (type == IPOPT_END)
                        return 0;
                if (type == IPOPT_NOOP) {
                        remaining--;
                        opt++;
                        continue;
                }

                optlen = opt[1];
                if (optlen < 2 || optlen > remaining)
                        return -EINVAL;

                switch (type) {
                /* These options are left untouched. */
                case IPOPT_SEC:
                case 0x85:        /* extended security option */
                case IPOPT_CIPSO:
                case IPOPT_RA:
                case 0x80|21:        /* RFC1770 */
                        break;
                case IPOPT_LSRR:
                case IPOPT_SSRR:
                        if (optlen < 6)
                                return -EINVAL;
                        /* Final destination is the last address listed. */
                        memcpy(daddr, opt + optlen - 4, 4);
                        fallthrough;
                default:
                        memset(opt, 0, optlen);
                }

                remaining -= optlen;
                opt += optlen;
        }

        return 0;
}

/*
 * Asynchronous completion callback for the hash started in ah_output().
 *
 * @data is the skb; @err is the hash result, which is not examined here but
 * handed on to xfrm_output_resume().  The ICV is copied into the AH header,
 * the IP header fields saved in the scratch buffer are restored, and the
 * scratch buffer is freed.
 */
static void ah_output_done(void *data, int err)
{
        u8 *icv;
        struct iphdr *iph;
        struct sk_buff *skb = data;
        struct xfrm_state *x = skb_dst(skb)->xfrm;
        struct ah_data *ahp = x->data;
        struct iphdr *top_iph = ip_hdr(skb);
        struct ip_auth_hdr *ah = ip_auth_hdr(skb);
        int ihl = ip_hdrlen(skb);

        /* The scratch buffer starts with the saved copy of the IP header. */
        iph = AH_SKB_CB(skb)->tmp;
        icv = ah_tmp_icv(iph, ihl);
        memcpy(ah->auth_data, icv, ahp->icv_trunc_len);

        /* Put back the fields that were zeroed for the ICV computation. */
        top_iph->tos = iph->tos;
        top_iph->ttl = iph->ttl;
        top_iph->frag_off = iph->frag_off;
        if (top_iph->ihl != 5) {
                /* Header carries options: restore daddr and the option bytes. */
                top_iph->daddr = iph->daddr;
                memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
        }

        kfree(AH_SKB_CB(skb)->tmp);
        xfrm_output_resume(skb->sk, skb, err);
}

/*
 * ah_output() - fill in the AH header of an outgoing packet and compute its
 * ICV.
 *
 * The mutable IP header fields (tos, ttl, frag_off, and with options also
 * daddr and the option bytes) are saved in a scratch buffer and zeroed in the
 * packet, the packet is hashed, and the saved fields are restored.
 *
 * Returns 0 on success, -EINPROGRESS when the hash completes asynchronously
 * (ah_output_done() then finishes the job and frees the scratch buffer),
 * NET_XMIT_DROP when the hash returns -ENOSPC, or a negative errno.
 */
static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
{
        int err;
        int nfrags;
        int ihl;
        u8 *icv;
        struct sk_buff *trailer;
        struct crypto_ahash *ahash;
        struct ahash_request *req;
        struct scatterlist *sg;
        struct iphdr *iph, *top_iph;
        struct ip_auth_hdr *ah;
        struct ah_data *ahp;
        int seqhi_len = 0;
        __be32 *seqhi;
        int sglists = 0;
        struct scatterlist *seqhisg;

        ahp = x->data;
        ahash = ahp->ahash;

        /* A non-negative return value is used as the scatterlist entry count. */
        if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
                goto out;
        nfrags = err;

        skb_push(skb, -skb_network_offset(skb));
        ah = ip_auth_hdr(skb);
        ihl = ip_hdrlen(skb);

        /* ESN: one extra sg entry and room for the high sequence number bits. */
        if (x->props.flags & XFRM_STATE_ESN) {
                sglists = 1;
                seqhi_len = sizeof(*seqhi);
        }
        err = -ENOMEM;
        /*
         * Scratch layout: saved IP header (ihl bytes) | seqhi | ICV |
         * hash request | scatterlist table.
         */
        iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
        if (!iph)
                goto out;
        seqhi = (__be32 *)((char *)iph + ihl);
        icv = ah_tmp_icv(seqhi, seqhi_len);
        req = ah_tmp_req(ahash, icv);
        sg = ah_req_sg(ahash, req);
        seqhisg = sg + nfrags;

        /* The ICV field is hashed as zeroes. */
        memset(ah->auth_data, 0, ahp->icv_trunc_len);

        top_iph = ip_hdr(skb);

        /* Save the mutable header fields before zeroing them below. */
        iph->tos = top_iph->tos;
        iph->ttl = top_iph->ttl;
        iph->frag_off = top_iph->frag_off;

        if (top_iph->ihl != 5) {
                iph->daddr = top_iph->daddr;
                memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
                err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
                if (err)
                        goto out_free;
        }

        /* Swap the stored protocol byte: AH takes its place, AH keeps the old one. */
        ah->nexthdr = *skb_mac_header(skb);
        *skb_mac_header(skb) = IPPROTO_AH;

        top_iph->tos = 0;
        top_iph->tot_len = htons(skb->len);
        top_iph->frag_off = 0;
        top_iph->ttl = 0;
        top_iph->check = 0;

        /* hdrlen is in 32-bit words minus 2, with 4- or 8-byte alignment. */
        if (x->props.flags & XFRM_STATE_ALIGN4)
                ah->hdrlen  = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
        else
                ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;

        ah->reserved = 0;
        ah->spi = x->id.spi;
        ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);

        sg_init_table(sg, nfrags + sglists);
        err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
        if (unlikely(err < 0))
                goto out_free;

        if (x->props.flags & XFRM_STATE_ESN) {
                /* Attach seqhi sg right after packet payload */
                *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
                sg_set_buf(seqhisg, seqhi, seqhi_len);
        }
        ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
        ahash_request_set_callback(req, 0, ah_output_done, skb);

        /* Make the scratch buffer reachable from the completion callback. */
        AH_SKB_CB(skb)->tmp = iph;

        err = crypto_ahash_digest(req);
        if (err) {
                /* Async completion: ah_output_done() owns and frees the buffer. */
                if (err == -EINPROGRESS)
                        goto out;

                if (err == -ENOSPC)
                        err = NET_XMIT_DROP;
                goto out_free;
        }

        memcpy(ah->auth_data, icv, ahp->icv_trunc_len);

        /* Restore the header fields saved above. */
        top_iph->tos = iph->tos;
        top_iph->ttl = iph->ttl;
        top_iph->frag_off = iph->frag_off;
        if (top_iph->ihl != 5) {
                top_iph->daddr = iph->daddr;
                memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
        }

out_free:
        kfree(iph);
out:
        return err;
}

static void ah_input_done(void *data, int err)
{
        u8 *auth_data;
        u8 *icv;
        struct iphdr *work_iph;
        struct sk_buff *skb = data;
        struct xfrm_state *x = xfrm_input_state(skb);
        struct ah_data *ahp = x->data;
        struct ip_auth_hdr *ah = ip_auth_hdr(skb);
        int ihl = ip_hdrlen(skb);
        int ah_hlen = (ah->hdrlen + 2) << 2;

        if (err)
                goto out;

        work_iph = AH_SKB_CB(skb)->tmp;
        auth_data = ah_tmp_auth(work_iph, ihl);
        icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);

        err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
        if (err)
                goto out;

        err = ah->nexthdr;

        skb->network_header += ah_hlen;
        memcpy(skb_network_header(skb), work_iph, ihl);
        __skb_pull(skb, ah_hlen + ihl);

        if (x->props.mode == XFRM_MODE_TUNNEL)
                skb_reset_transport_header(skb);
        else
                skb_set_transport_header(skb, -ihl);
out:
        kfree(AH_SKB_CB(skb)->tmp);
        xfrm_input_resume(skb, err);
}

/*
 * Verify the AH integrity check value (ICV) of an inbound IPv4 packet and
 * strip the AH header.
 *
 * @x:   xfrm state; x->data holds the struct ah_data for this SA
 * @skb: packet whose data pointer is at the AH header on entry
 *
 * Returns the AH next-header value on success, -EINPROGRESS when the hash
 * completes asynchronously (ah_input_done() then finishes the job and frees
 * the scratch buffer), or a negative errno on failure.
 *
 * NOTE(review): err starts out as -ENOMEM, so the early "goto out" paths
 * (short packet, unexpected AH length, skb_unclone() failure) all return
 * -ENOMEM regardless of the actual cause.
 */
static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
{
        int ah_hlen;
        int ihl;
        int nexthdr;
        int nfrags;
        u8 *auth_data;
        u8 *icv;
        struct sk_buff *trailer;
        struct crypto_ahash *ahash;
        struct ahash_request *req;
        struct scatterlist *sg;
        struct iphdr *iph, *work_iph;
        struct ip_auth_hdr *ah;
        struct ah_data *ahp;
        int err = -ENOMEM;
        int seqhi_len = 0;
        __be32 *seqhi;
        int sglists = 0;
        struct scatterlist *seqhisg;

        /* The fixed part of the AH header must be readable first. */
        if (!pskb_may_pull(skb, sizeof(*ah)))
                goto out;

        ah = (struct ip_auth_hdr *)skb->data;
        ahp = x->data;
        ahash = ahp->ahash;

        /* Remember the next protocol now; it is the success return value. */
        nexthdr = ah->nexthdr;
        ah_hlen = (ah->hdrlen + 2) << 2;

        /*
         * The advertised header length must match the aligned size of the
         * fixed header plus either the full or the truncated ICV length.
         */
        if (x->props.flags & XFRM_STATE_ALIGN4) {
                if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
                    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
                        goto out;
        } else {
                if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
                    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
                        goto out;
        }

        /* Now make sure the whole AH header, ICV included, is readable. */
        if (!pskb_may_pull(skb, ah_hlen))
                goto out;

        /* We are going to _remove_ AH header to keep sockets happy,
         * so... Later this can change. */
        if (skb_unclone(skb, GFP_ATOMIC))
                goto out;

        skb->ip_summed = CHECKSUM_NONE;


        /* A non-negative return is used below as the scatterlist entry count. */
        if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
                goto out;
        nfrags = err;

        /* Re-read the header pointers after the calls above. */
        ah = (struct ip_auth_hdr *)skb->data;
        iph = ip_hdr(skb);
        ihl = ip_hdrlen(skb);

        /* With ESN, one extra sg entry carries the high sequence word. */
        if (x->props.flags & XFRM_STATE_ESN) {
                sglists = 1;
                seqhi_len = sizeof(*seqhi);
        }

        work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
                                ahp->icv_trunc_len + seqhi_len);
        if (!work_iph) {
                err = -ENOMEM;
                goto out;
        }

        /*
         * Carve up the scratch buffer, in the order the pointers are derived:
         * saved IP header, seqhi, copy of the received ICV (auth_data),
         * computed ICV (icv), hash request, scatterlist.
         */
        seqhi = (__be32 *)((char *)work_iph + ihl);
        auth_data = ah_tmp_auth(seqhi, seqhi_len);
        icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
        req = ah_tmp_req(ahash, icv);
        sg = ah_req_sg(ahash, req);
        seqhisg = sg + nfrags;

        /*
         * Save the original IP header and received ICV, then zero the ICV
         * field in the packet before hashing.
         */
        memcpy(work_iph, iph, ihl);
        memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
        memset(ah->auth_data, 0, ahp->icv_trunc_len);

        /* Zero these IP header fields in place before they are hashed. */
        iph->ttl = 0;
        iph->tos = 0;
        iph->frag_off = 0;
        iph->check = 0;
        if (ihl > sizeof(*iph)) {
                __be32 dummy;
                err = ip_clear_mutable_options(iph, &dummy);
                if (err)
                        goto out_free;
        }

        /* Extend the data area by ihl bytes so the hash also covers the IP header. */
        skb_push(skb, ihl);

        sg_init_table(sg, nfrags + sglists);
        err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
        if (unlikely(err < 0))
                goto out_free;

        if (x->props.flags & XFRM_STATE_ESN) {
                /* Attach seqhi sg right after packet payload */
                *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
                sg_set_buf(seqhisg, seqhi, seqhi_len);
        }
        ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
        ahash_request_set_callback(req, 0, ah_input_done, skb);

        /* Stash the scratch buffer so ah_input_done() can find and free it. */
        AH_SKB_CB(skb)->tmp = work_iph;

        err = crypto_ahash_digest(req);
        if (err) {
                /* Asynchronous completion: the callback owns work_iph now. */
                if (err == -EINPROGRESS)
                        goto out;

                goto out_free;
        }

        err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
        if (err)
                goto out_free;

        /*
         * ICV verified: put the saved IP header where the AH header ended,
         * then drop both from the front of the data area.
         */
        skb->network_header += ah_hlen;
        memcpy(skb_network_header(skb), work_iph, ihl);
        __skb_pull(skb, ah_hlen + ihl);
        if (x->props.mode == XFRM_MODE_TUNNEL)
                skb_reset_transport_header(skb);
        else
                skb_set_transport_header(skb, -ihl);

        err = nexthdr;

out_free:
        kfree (work_iph);
out:
        return err;
}

/*
 * ICMP error handler for AH over IPv4.
 *
 * @skb:  packet whose data starts at the IP header carried in the ICMP
 *        error, with the AH header directly after it
 * @info: value passed on to ipv4_update_pmtu()
 *
 * Only "fragmentation needed" destination-unreachable errors and redirects
 * are acted upon, and only when a matching AH state exists.  Always
 * returns 0.
 */
static int ah4_err(struct sk_buff *skb, u32 info)
{
        struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
        int icmp_type = icmp_hdr(skb)->type;
        struct xfrm_state *state;

        /* Filter out every ICMP error we do not care about. */
        if (icmp_type == ICMP_DEST_UNREACH) {
                if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
                        return 0;
        } else if (icmp_type != ICMP_REDIRECT) {
                return 0;
        }

        /* Ignore errors for which no matching state is found. */
        state = xfrm_state_lookup(net, skb->mark,
                                  (const xfrm_address_t *)&iph->daddr,
                                  ah->spi, IPPROTO_AH, AF_INET);
        if (!state)
                return 0;

        if (icmp_type == ICMP_DEST_UNREACH)
                ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
        else
                ipv4_redirect(skb, net, 0, IPPROTO_AH);

        /* Drop the state pointer returned by the lookup. */
        xfrm_state_put(state);

        return 0;
}

static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        struct ah_data *ahp = NULL;
        struct xfrm_algo_desc *aalg_desc;
        struct crypto_ahash *ahash;

        if (!x->aalg) {
                NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
                goto error;
        }

        if (x->encap) {
                NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
                goto error;
        }

        ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
        if (!ahp)
                return -ENOMEM;

        ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
        if (IS_ERR(ahash)) {
                NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                goto error;
        }

        ahp->ahash = ahash;
        if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
                                (x->aalg->alg_key_len + 7) / 8)) {
                NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                goto error;
        }

        /*
         * Lookup the algorithm description maintained by xfrm_algo,
         * verify crypto transform properties, and store information
         * we need for AH processing.  This lookup cannot fail here
         * after a successful crypto_alloc_ahash().
         */
        aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
        BUG_ON(!aalg_desc);

        if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
            crypto_ahash_digestsize(ahash)) {
                NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                goto error;
        }

        ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
        ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;

        if (x->props.flags & XFRM_STATE_ALIGN4)
                x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
                                                  ahp->icv_trunc_len);
        else
                x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
                                                  ahp->icv_trunc_len);
        if (x->props.mode == XFRM_MODE_TUNNEL)
                x->props.header_len += sizeof(struct iphdr);
        x->data = ahp;

        return 0;

error:
        if (ahp) {
                crypto_free_ahash(ahp->ahash);
                kfree(ahp);
        }
        return -EINVAL;
}

/*
 * Release the AH context attached to @x, if any: free the hash transform
 * and then the context itself.  A state with x->data == NULL is left alone.
 */
static void ah_destroy(struct xfrm_state *x)
{
        struct ah_data *ctx = x->data;

        if (ctx) {
                crypto_free_ahash(ctx->ahash);
                kfree(ctx);
        }
}

/*
 * Receive callback installed as ah4_protocol.cb_handler.  Both arguments
 * are ignored; it always returns 0.
 */
static int ah4_rcv_cb(struct sk_buff *skb, int err)
{
        const int ret = 0;

        return ret;
}

/*
 * xfrm type descriptor for AH (IPPROTO_AH).  Passed to
 * xfrm_register_type() for AF_INET in ah4_init().
 */
static const struct xfrm_type ah_type = {
        .owner          = THIS_MODULE,
        .proto          = IPPROTO_AH,
        .flags          = XFRM_TYPE_REPLAY_PROT,
        .init_state     = ah_init_state,
        .destructor     = ah_destroy,
        .input          = ah_input,
        .output         = ah_output,
};

/*
 * IPv4 protocol hooks for AH.  Registered for IPPROTO_AH in ah4_init()
 * and removed again in ah4_fini().
 */
static struct xfrm4_protocol ah4_protocol = {
        .handler        = xfrm4_rcv,
        .input_handler  = xfrm_input,
        .cb_handler     = ah4_rcv_cb,
        .err_handler    = ah4_err,
        .priority       = 0,
};

/*
 * Module init: register the AH xfrm type for AF_INET, then the IPv4
 * protocol hooks.  If the second step fails the first is undone.
 *
 * Returns 0 on success or -EAGAIN if either registration fails.
 */
static int __init ah4_init(void)
{
        int rc;

        rc = xfrm_register_type(&ah_type, AF_INET);
        if (rc < 0) {
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }

        rc = xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH);
        if (rc >= 0)
                return 0;

        /* Protocol registration failed: roll back the type registration. */
        pr_info("%s: can't add protocol\n", __func__);
        xfrm_unregister_type(&ah_type, AF_INET);
        return -EAGAIN;
}

/*
 * Module exit: undo ah4_init() in reverse order.  A failure to remove the
 * protocol hooks is only logged; the xfrm type is unregistered regardless.
 */
static void __exit ah4_fini(void)
{
        int rc = xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH);

        if (rc < 0)
                pr_info("%s: can't remove protocol\n", __func__);

        xfrm_unregister_type(&ah_type, AF_INET);
}

/* Module entry and exit points: ah4_init() and ah4_fini(). */
module_init(ah4_init);
module_exit(ah4_fini);
/* Module metadata: description, licence, and the xfrm-type alias for AH over AF_INET. */
MODULE_DESCRIPTION("IPv4 AH transformation library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);

















































































  318 

  317 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include <linux/page_owner.h>

#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

#include "internal.h"
#include "pgalloc-track.h"

/*
 * ioremap_max_page_shift is the largest page-size shift that
 * vmap_page_range() hands to vmap_range_noflush().
 *
 * With CONFIG_HAVE_ARCH_HUGE_VMAP it starts at BITS_PER_LONG - 1 and the
 * "nohugeiomap" early parameter lowers it to PAGE_SHIFT. Without that
 * option it is a compile-time constant equal to PAGE_SHIFT.
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;

/* Early-param handler for "nohugeiomap"; the argument string is not used. */
static int __init set_nohugeiomap(char *str)
{
        ioremap_max_page_shift = PAGE_SHIFT;
        return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

/*
 * vmap_allow_huge defaults to true when CONFIG_HAVE_ARCH_HUGE_VMALLOC is
 * set and is cleared by the "nohugevmalloc" early parameter. Without that
 * option it is a compile-time constant false.
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

/* Early-param handler for "nohugevmalloc"; the argument string is not used. */
static int __init set_nohugevmalloc(char *str)
{
        vmap_allow_huge = false;
        return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
        unsigned long addr = (unsigned long)kasan_reset_tag(x);

        return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

/*
 * Pairs a lock-less list head with a work item; one instance is defined per
 * CPU below.
 * NOTE(review): presumably queues frees to be completed later from the work
 * item; the users are outside this part of the file -- confirm there.
 */
struct vfree_deferred {
        struct llist_head list;
        struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

/*** Page table manipulation functions ***/
/*
 * Populate the PTE level under @pmd for [addr, end) so that it maps the
 * physical range beginning at @phys_addr with protection @prot.
 *
 * @max_page_shift is only consulted when CONFIG_HUGETLB_PAGE is enabled,
 * where it is passed to the architecture's mapping-size helper.
 * @mask accumulates which page-table levels were modified.
 *
 * Returns 0 on success or -ENOMEM if the PTE table cannot be obtained.
 * Finding an entry that is already populated is fatal (BUG()).
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        pte_t *pte;
        u64 pfn;
        struct page *page;
        unsigned long size = PAGE_SIZE;

        pfn = phys_addr >> PAGE_SHIFT;
        pte = pte_alloc_kernel_track(pmd, addr, mask);
        if (!pte)
                return -ENOMEM;

        arch_enter_lazy_mmu_mode();

        do {
                /*
                 * Never overwrite a live entry. If the target frame has a
                 * struct page, dump it before crashing to aid debugging.
                 */
                if (unlikely(!pte_none(ptep_get(pte)))) {
                        if (pfn_valid(pfn)) {
                                page = pfn_to_page(pfn);
                                dump_page(page, "remapping already mapped page");
                        }
                        BUG();
                }

#ifdef CONFIG_HUGETLB_PAGE
                /* The architecture may choose a mapping larger than PAGE_SIZE. */
                size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
                if (size != PAGE_SIZE) {
                        pte_t entry = pfn_pte(pfn, prot);

                        entry = arch_make_huge_pte(entry, ilog2(size), 0);
                        set_huge_pte_at(&init_mm, addr, pte, entry, size);
                        pfn += PFN_DOWN(size);
                        /* Jumps to the loop condition, which steps pte/addr by size. */
                        continue;
                }
#endif
                set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
                pfn++;
        } while (pte += PFN_DOWN(size), addr += size, addr != end);

        arch_leave_lazy_mmu_mode();
        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
}

/*
 * Attempt to cover [addr, end) with a single huge entry at the PMD level.
 *
 * Returns 0 when a huge mapping is not permitted or not possible for this
 * range; otherwise returns the result of pmd_set_huge().
 */
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /*
         * Size limit, architecture support, exact span and alignment of both
         * the virtual and the physical address. Evaluated left to right,
         * stopping at the first failing check.
         */
        if (max_page_shift < PMD_SHIFT || !arch_vmap_pmd_supported(prot) ||
            (end - addr) != PMD_SIZE || !IS_ALIGNED(addr, PMD_SIZE) ||
            !IS_ALIGNED(phys_addr, PMD_SIZE))
                return 0;

        /* If the entry is already present, pmd_free_pte_page() must succeed. */
        if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
                return 0;

        return pmd_set_huge(pmd, phys_addr, prot);
}

/*
 * Map [addr, end) at the PMD level under @pud. Each PMD-sized step is
 * mapped with a huge entry when vmap_try_huge_pmd() accepts it, and through
 * vmap_pte_range() otherwise.
 *
 * Returns 0 on success, or -ENOMEM if the PMD table cannot be obtained or
 * the PTE level reports a failure.
 */
static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long stop;
        pmd_t *pmd = pmd_alloc_track(&init_mm, pud, addr, mask);

        if (!pmd)
                return -ENOMEM;

        do {
                stop = pmd_addr_end(addr, end);

                if (vmap_try_huge_pmd(pmd, addr, stop, phys_addr, prot,
                                        max_page_shift))
                        *mask |= PGTBL_PMD_MODIFIED;
                else if (vmap_pte_range(pmd, addr, stop, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;
                /* The physical address advances in step with the virtual one. */
        } while (pmd++, phys_addr += (stop - addr), addr = stop, addr != end);

        return 0;
}

/*
 * Attempt to cover [addr, end) with a single huge entry at the PUD level.
 *
 * Returns 0 when a huge mapping is not permitted or not possible for this
 * range; otherwise returns the result of pud_set_huge().
 */
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /*
         * Size limit, architecture support, exact span and alignment of both
         * the virtual and the physical address. Evaluated left to right,
         * stopping at the first failing check.
         */
        if (max_page_shift < PUD_SHIFT || !arch_vmap_pud_supported(prot) ||
            (end - addr) != PUD_SIZE || !IS_ALIGNED(addr, PUD_SIZE) ||
            !IS_ALIGNED(phys_addr, PUD_SIZE))
                return 0;

        /* If the entry is already present, pud_free_pmd_page() must succeed. */
        if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
                return 0;

        return pud_set_huge(pud, phys_addr, prot);
}

/*
 * Map [addr, end) at the PUD level under @p4d. Each PUD-sized step is
 * mapped with a huge entry when vmap_try_huge_pud() accepts it, and through
 * vmap_pmd_range() otherwise.
 *
 * Returns 0 on success, or -ENOMEM if the PUD table cannot be obtained or
 * the PMD level reports a failure.
 */
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long stop;
        pud_t *pud = pud_alloc_track(&init_mm, p4d, addr, mask);

        if (!pud)
                return -ENOMEM;

        do {
                stop = pud_addr_end(addr, end);

                if (vmap_try_huge_pud(pud, addr, stop, phys_addr, prot,
                                        max_page_shift))
                        *mask |= PGTBL_PUD_MODIFIED;
                else if (vmap_pmd_range(pud, addr, stop, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;
                /* The physical address advances in step with the virtual one. */
        } while (pud++, phys_addr += (stop - addr), addr = stop, addr != end);

        return 0;
}

/*
 * Attempt to cover [addr, end) with a single huge entry at the P4D level.
 *
 * Returns 0 when a huge mapping is not permitted or not possible for this
 * range; otherwise returns the result of p4d_set_huge().
 */
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /*
         * Size limit, architecture support, exact span and alignment of both
         * the virtual and the physical address. Evaluated left to right,
         * stopping at the first failing check.
         */
        if (max_page_shift < P4D_SHIFT || !arch_vmap_p4d_supported(prot) ||
            (end - addr) != P4D_SIZE || !IS_ALIGNED(addr, P4D_SIZE) ||
            !IS_ALIGNED(phys_addr, P4D_SIZE))
                return 0;

        /* If the entry is already present, p4d_free_pud_page() must succeed. */
        if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
                return 0;

        return p4d_set_huge(p4d, phys_addr, prot);
}

/*
 * Map [addr, end) at the P4D level under @pgd. Each P4D-sized step is
 * mapped with a huge entry when vmap_try_huge_p4d() accepts it, and through
 * vmap_pud_range() otherwise.
 *
 * Returns 0 on success, or -ENOMEM if the P4D table cannot be obtained or
 * the PUD level reports a failure.
 */
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long stop;
        p4d_t *p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);

        if (!p4d)
                return -ENOMEM;

        do {
                stop = p4d_addr_end(addr, end);

                if (vmap_try_huge_p4d(p4d, addr, stop, phys_addr, prot,
                                        max_page_shift))
                        *mask |= PGTBL_P4D_MODIFIED;
                else if (vmap_pud_range(p4d, addr, stop, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;
                /* The physical address advances in step with the virtual one. */
        } while (p4d++, phys_addr += (stop - addr), addr = stop, addr != end);

        return 0;
}

/*
 * Map the physical range starting at @phys_addr to the kernel virtual range
 * [addr, end) with protection @prot, passing @max_page_shift down to the
 * lower levels as the limit for huge mappings. No cache flush is issued
 * here.
 *
 * May sleep. @addr must be below @end (BUG_ON otherwise).
 *
 * Returns 0 on success or the first error reported by vmap_p4d_range(), at
 * which point the walk stops.
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        pgd_t *pgd;
        unsigned long start;
        unsigned long next;
        int err;
        pgtbl_mod_mask mask = 0;

        might_sleep();
        BUG_ON(addr >= end);

        /* addr is advanced by the loop; keep the original start for the sync below. */
        start = addr;
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
                                        max_page_shift, &mask);
                if (err)
                        break;
        } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

        /* Runs on the error path too, for whatever levels were modified. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * Map the physical range starting at @phys_addr to [addr, end) with the
 * no-execute variant of @prot, limited to ioremap_max_page_shift, and then
 * flush the cache for the range.
 *
 * The cache flush happens whether or not the mapping succeeded. The KMSAN
 * ioremap hook runs only after a successful mapping.
 *
 * Returns 0 on success, otherwise the error from the mapping step or from
 * the KMSAN hook.
 */
int vmap_page_range(unsigned long addr, unsigned long end,
                    phys_addr_t phys_addr, pgprot_t prot)
{
        int ret = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
                                     ioremap_max_page_shift);

        flush_cache_vmap(addr, end);
        if (ret)
                return ret;

        return kmsan_ioremap_page_range(addr, end, phys_addr, prot,
                                        ioremap_max_page_shift);
}

/*
 * Map the physical range starting at @phys_addr to [addr, end) after
 * checking that the range is exactly one vm area flagged VM_IOREMAP.
 *
 * Returns:
 *   -EINVAL if no vm area is found at @addr or it lacks VM_IOREMAP,
 *   -ERANGE if [addr, end) is not exactly the area's address range,
 *   otherwise the result of vmap_page_range().
 * Both rejection paths also emit a one-time warning.
 */
int ioremap_page_range(unsigned long addr, unsigned long end,
                phys_addr_t phys_addr, pgprot_t prot)
{
        struct vm_struct *area;

        area = find_vm_area((void *)addr);
        if (!area || !(area->flags & VM_IOREMAP)) {
                WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
                return -EINVAL;
        }
        /* Partial or oversized requests are rejected: the match must be exact. */
        if (addr != (unsigned long)area->addr ||
            (void *)end != area->addr + get_vm_area_size(area)) {
                WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
                          addr, end, (long)area->addr,
                          (long)area->addr + get_vm_area_size(area));
                return -ERANGE;
        }
        return vmap_page_range(addr, end, phys_addr, prot);
}

/*
 * Clear the PTEs under @pmd that cover [addr, end) and record the
 * modification in @mask.
 *
 * With CONFIG_HUGETLB_PAGE the architecture reports the size of the mapping
 * at each position, and entries larger than PAGE_SIZE are cleared as a
 * single huge PTE.
 */
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pte_t *pte;
        pte_t ptent;
        unsigned long size = PAGE_SIZE;

        pte = pte_offset_kernel(pmd, addr);
        arch_enter_lazy_mmu_mode();

        do {
#ifdef CONFIG_HUGETLB_PAGE
                size = arch_vmap_pte_range_unmap_size(addr, pte);
                if (size != PAGE_SIZE) {
                        /*
                         * Unexpected: addr is inside a huge mapping rather
                         * than at its start. Warn and rewind addr and pte to
                         * the start of that mapping.
                         */
                        if (WARN_ON(!IS_ALIGNED(addr, size))) {
                                addr = ALIGN_DOWN(addr, size);
                                pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
                        }
                        ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
                        /*
                         * Unexpected: the huge mapping extends past end. Warn
                         * and clamp the step so the loop stops exactly at end.
                         */
                        if (WARN_ON(end - addr < size))
                                size = end - addr;
                } else
#endif
                        /*
                         * Body of the "else" above when CONFIG_HUGETLB_PAGE
                         * is set; a plain statement otherwise.
                         */
                        ptent = ptep_get_and_clear(&init_mm, addr, pte);
                /* The cleared entry should have been either empty or present. */
                WARN_ON(!pte_none(ptent) && !pte_present(ptent));
        } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);

        arch_leave_lazy_mmu_mode();
        *mask |= PGTBL_PTE_MODIFIED;
}

/*
 * Unmap [addr, end) at the PMD level under @pud.
 *
 * Each entry is first offered to pmd_clear_huge(). If that clears it, the
 * step is done. Otherwise empty or bad entries are skipped and the rest are
 * handed to vunmap_pte_range(). @mask records the levels that were modified.
 */
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pmd_t *pmd;
        unsigned long next;
        int cleared;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);

                cleared = pmd_clear_huge(pmd);
                /* The bad-entry case is flagged here, before it is cleared below. */
                if (cleared || pmd_bad(*pmd))
                        *mask |= PGTBL_PMD_MODIFIED;

                if (cleared) {
                        /* A huge entry was cleared for a step smaller than PMD_SIZE. */
                        WARN_ON(next - addr < PMD_SIZE);
                        continue;
                }
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                vunmap_pte_range(pmd, addr, next, mask);

                /* Offer to reschedule after each PTE table has been processed. */
                cond_resched();
        } while (pmd++, addr = next, addr != end);
}

/*
 * Unmap [addr, end) at the PUD level under @p4d.
 *
 * Each entry is first offered to pud_clear_huge(). If that clears it, the
 * step is done. Otherwise empty or bad entries are skipped and the rest are
 * handed to vunmap_pmd_range(). @mask records the levels that were modified.
 */
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pud_t *pud;
        unsigned long next;
        int cleared;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);

                cleared = pud_clear_huge(pud);
                /* The bad-entry case is flagged here, before it is cleared below. */
                if (cleared || pud_bad(*pud))
                        *mask |= PGTBL_PUD_MODIFIED;

                if (cleared) {
                        /* A huge entry was cleared for a step smaller than PUD_SIZE. */
                        WARN_ON(next - addr < PUD_SIZE);
                        continue;
                }
                if (pud_none_or_clear_bad(pud))
                        continue;
                vunmap_pmd_range(pud, addr, next, mask);
        } while (pud++, addr = next, addr != end);
}

/*
 * Unmap [addr, end) at the P4D level under @pgd.
 *
 * p4d_clear_huge() is called on every entry and its result is ignored.
 * Empty or bad entries are then skipped and the rest are handed to
 * vunmap_pud_range(). @mask records the levels that were modified.
 */
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        p4d_t *p4d;
        unsigned long next;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);

                p4d_clear_huge(p4d);
                /* Flag a bad entry before p4d_none_or_clear_bad() clears it. */
                if (p4d_bad(*p4d))
                        *mask |= PGTBL_P4D_MODIFIED;

                if (p4d_none_or_clear_bad(p4d))
                        continue;
                vunmap_pud_range(p4d, addr, next, mask);
        } while (p4d++, addr = next, addr != end);
}

/*
 * __vunmap_range_noflush clears the kernel page-table entries covering
 * [start, end) like vunmap_range does, but does not flush caches or TLBs
 * and does not run the KMSAN hook (vunmap_range_noflush adds that).
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * @start must be below @end (BUG_ON otherwise).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
        unsigned long next;
        pgd_t *pgd;
        unsigned long addr = start;
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                /* Flag a bad entry before pgd_none_or_clear_bad() clears it. */
                if (pgd_bad(*pgd))
                        mask |= PGTBL_PGD_MODIFIED;
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                vunmap_p4d_range(pgd, addr, next, &mask);
        } while (pgd++, addr = next, addr != end);

        /* Sync only when a level the architecture asks to be synced was modified. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);
}

/*
 * Run the KMSAN unmap hook for [start, end), then clear the page-table
 * entries with __vunmap_range_noflush(). No cache or TLB flush is done here.
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
        kmsan_vunmap_range_noflush(start, end);
        __vunmap_range_noflush(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
        /* Order: cache flush, then page-table teardown, then TLB flush. */
        flush_cache_vunmap(addr, end);
        vunmap_range_noflush(addr, end);
        flush_tlb_kernel_range(addr, end);
}

/*
 * Install PAGE_SIZE PTEs under @pmd for [addr, end), taking the backing
 * pages from @pages starting at index *@nr and advancing *@nr for every
 * page mapped.
 *
 * Returns 0 on success. On failure it stops at the first problem, warns,
 * and returns:
 *   -ENOMEM if the PTE table cannot be obtained or a @pages slot is NULL,
 *   -EBUSY  if a PTE is already populated,
 *   -EINVAL if a page's pfn is not valid.
 * PGTBL_PTE_MODIFIED is set in @mask once the PTE table has been obtained,
 * on both the success and the failure path.
 */
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        int err = 0;
        pte_t *pte;

        /*
         * nr is a running index into the array which helps higher level
         * callers keep track of where we're up to.
         */

        pte = pte_alloc_kernel_track(pmd, addr, mask);
        if (!pte)
                return -ENOMEM;

        arch_enter_lazy_mmu_mode();

        do {
                struct page *page = pages[*nr];

                if (WARN_ON(!pte_none(ptep_get(pte)))) {
                        err = -EBUSY;
                        break;
                }
                if (WARN_ON(!page)) {
                        err = -ENOMEM;
                        break;
                }
                if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
                        err = -EINVAL;
                        break;
                }

                set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
                (*nr)++;
        } while (pte++, addr += PAGE_SIZE, addr != end);

        /* Reached on both paths so the lazy-MMU section is always closed. */
        arch_leave_lazy_mmu_mode();
        *mask |= PGTBL_PTE_MODIFIED;

        return err;
}

/*
 * Map @pages into [addr, end) at the PMD level under @pud, delegating each
 * PMD-sized step to vmap_pages_pte_range(). @nr is the running index into
 * @pages shared with the lower level.
 *
 * Returns 0 on success. Any failure, whether obtaining the PMD table or a
 * non-zero result from the PTE level, is reported as -ENOMEM.
 */
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        unsigned long stop;
        pmd_t *pmd = pmd_alloc_track(&init_mm, pud, addr, mask);

        if (!pmd)
                return -ENOMEM;

        for (;;) {
                stop = pmd_addr_end(addr, end);
                if (vmap_pages_pte_range(pmd, addr, stop, prot, pages, nr, mask))
                        return -ENOMEM;

                pmd++;
                addr = stop;
                if (addr == end)
                        break;
        }

        return 0;
}

/*
 * Map @pages into [addr, end) at the PUD level under @p4d, delegating each
 * PUD-sized step to vmap_pages_pmd_range(). @nr is the running index into
 * @pages shared with the lower levels.
 *
 * Returns 0 on success. Any failure, whether obtaining the PUD table or a
 * non-zero result from the PMD level, is reported as -ENOMEM.
 */
static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        unsigned long stop;
        pud_t *pud = pud_alloc_track(&init_mm, p4d, addr, mask);

        if (!pud)
                return -ENOMEM;

        for (;;) {
                stop = pud_addr_end(addr, end);
                if (vmap_pages_pmd_range(pud, addr, stop, prot, pages, nr, mask))
                        return -ENOMEM;

                pud++;
                addr = stop;
                if (addr == end)
                        break;
        }

        return 0;
}

/*
 * Map @pages into [addr, end) at the P4D level under @pgd, delegating each
 * P4D-sized step to vmap_pages_pud_range(). @nr is the running index into
 * @pages shared with the lower levels.
 *
 * Returns 0 on success. Any failure, whether obtaining the P4D table or a
 * non-zero result from the PUD level, is reported as -ENOMEM.
 */
static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        unsigned long stop;
        p4d_t *p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);

        if (!p4d)
                return -ENOMEM;

        for (;;) {
                stop = p4d_addr_end(addr, end);
                if (vmap_pages_pud_range(p4d, addr, stop, prot, pages, nr, mask))
                        return -ENOMEM;

                p4d++;
                addr = stop;
                if (addr == end)
                        break;
        }

        return 0;
}

/*
 * Map @pages, one PAGE_SIZE PTE per array entry, into the kernel virtual
 * range [addr, end) with protection @prot. No cache flush is issued here.
 *
 * @addr must be below @end (BUG_ON otherwise).
 *
 * Returns 0 on success or the first error reported by
 * vmap_pages_p4d_range(), at which point the walk stops.
 */
static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages)
{
        unsigned long start = addr;
        pgd_t *pgd;
        unsigned long next;
        int err = 0;
        /* Running index into pages[], advanced by the PTE-level helper. */
        int nr = 0;
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_bad(*pgd))
                        mask |= PGTBL_PGD_MODIFIED;
                err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        /* Runs on the error path too, for whatever levels were modified. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * __vmap_pages_range_noflush is the variant without the KMSAN hook. When
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC is disabled or @page_shift equals
 * PAGE_SHIFT, every page is mapped individually. Otherwise the range is
 * mapped in chunks of (1UL << page_shift) bytes, each taking its physical
 * address from the first page of the chunk in @pages.
 *
 * Returns 0 on success or the first error from the mapping helpers.
 *
 * This is an internal function only. Do not use outside mm/.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        /* nr is the number of PAGE_SIZE pages in [addr, end). */
        unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

        WARN_ON(page_shift < PAGE_SHIFT);

        if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
                        page_shift == PAGE_SHIFT)
                return vmap_small_pages_range_noflush(addr, end, prot, pages);

        /* i advances by the number of PAGE_SIZE pages that make up one chunk. */
        for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
                int err;

                err = vmap_range_noflush(addr, addr + (1UL << page_shift),
                                        page_to_phys(pages[i]), prot,
                                        page_shift);
                if (err)
                        return err;

                addr += 1UL << page_shift;
        }

        return 0;
}

/*
 * Run the KMSAN hook for the mapping first. Only if it reports success are
 * the pages actually mapped, via __vmap_pages_range_noflush(). No cache
 * flush is issued here.
 *
 * Returns 0 on success, otherwise the error from whichever step failed.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        int err;

        err = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
                                             page_shift);
        if (!err)
                err = __vmap_pages_range_noflush(addr, end, prot, pages,
                                                 page_shift);

        return err;
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * Maps the pages with vmap_pages_range_noflush() and then flushes the cache
 * for the range.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int vmap_pages_range(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        const int ret = vmap_pages_range_noflush(addr, end, prot, pages,
                                                 page_shift);

        /* The flush is issued whether or not the mapping succeeded. */
        flush_cache_vmap(addr, end);

        return ret;
}

/*
 * Validate a [start, end) request against a sparse vm area.
 *
 * Returns 0 when the request is acceptable, otherwise:
 *   -EINVAL  the area has VM_FLUSH_RESET_PERMS or VM_NO_GUARD set, or lacks
 *            VM_SPARSE (each case also warns once);
 *   -E2BIG   the range spans more pages than totalram_pages();
 *   -ERANGE  the range is not fully inside the area.
 */
static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
                                unsigned long end)
{
        unsigned long nr_pages;

        might_sleep();

        /* Flag sanity: every violation warns once and is rejected. */
        if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
                return -EINVAL;
        if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
                return -EINVAL;
        if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
                return -EINVAL;

        nr_pages = (end - start) >> PAGE_SHIFT;
        if (nr_pages > totalram_pages())
                return -E2BIG;

        /* The range must not start before, nor end after, the area. */
        if (start < (unsigned long)area->addr)
                return -ERANGE;
        if ((void *)end > area->addr + get_vm_area_size(area))
                return -ERANGE;

        return 0;
}

/**
 * vm_area_map_pages - map pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 * @pages: pages to map (always PAGE_SIZE pages)
 *
 * The request is validated with check_sparse_vm_area() first; the pages are
 * then mapped with PAGE_KERNEL protection at PAGE_SHIFT granularity.
 *
 * Return: 0 on success, -errno from the validation or the mapping otherwise.
 */
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
                      unsigned long end, struct page **pages)
{
        int ret = check_sparse_vm_area(area, start, end);

        if (ret)
                return ret;

        return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
}

/**
 * vm_area_unmap_pages - unmap pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 *
 * Nothing is unmapped when check_sparse_vm_area() rejects the request; the
 * failure is not reported to the caller.
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
                         unsigned long end)
{
        int ret = check_sparse_vm_area(area, start, end);

        if (ret == 0)
                vunmap_range(start, end);
}

/*
 * Tell whether @x lies in the module area or in vmalloc space.
 *
 * ARM, x86-64 and sparc64 put modules in a special place, and fall back on
 * vmalloc() if that fails. Others just put it in the vmalloc space. The
 * dedicated module window is only checked when both CONFIG_EXECMEM and
 * MODULES_VADDR are defined.
 *
 * Returns 1 for an address inside [MODULES_VADDR, MODULES_END), otherwise
 * the result of is_vmalloc_addr().
 */
int is_vmalloc_or_module_addr(const void *x)
{
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
        /* Compare the address with its KASAN tag reset. */
        unsigned long untagged = (unsigned long)kasan_reset_tag(x);

        if (untagged >= MODULES_VADDR && untagged < MODULES_END)
                return 1;
#endif
        return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 *
 * Returns NULL when a page-table level is empty or bad for this address,
 * when a leaf entry is found at the pgd level, or when the final PTE is not
 * present.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
        unsigned long addr = (unsigned long) vmalloc_addr;
        struct page *page = NULL;
        pgd_t *pgd = pgd_offset_k(addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;

        /*
         * XXX we might need to change this if we add VIRTUAL_BUG_ON for
         * architectures that do not vmalloc module space
         */
        VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

        /* Top level: an empty entry means nothing is mapped here. */
        if (pgd_none(*pgd))
                return NULL;
        if (WARN_ON_ONCE(pgd_leaf(*pgd)))
                return NULL; /* XXX: no allowance for huge pgd */
        if (WARN_ON_ONCE(pgd_bad(*pgd)))
                return NULL;

        /*
         * For every lower level: empty -> NULL, leaf (huge mapping) -> the
         * base page inside the huge mapping selected by the low bits of
         * addr, bad -> warn once and NULL.
         */
        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d))
                return NULL;
        if (p4d_leaf(*p4d))
                return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(p4d_bad(*p4d)))
                return NULL;

        pud = pud_offset(p4d, addr);
        if (pud_none(*pud))
                return NULL;
        if (pud_leaf(*pud))
                return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(pud_bad(*pud)))
                return NULL;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return NULL;
        if (pmd_leaf(*pmd))
                return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(pmd_bad(*pmd)))
                return NULL;

        /* Last level: page stays NULL unless the PTE is present. */
        ptep = pte_offset_kernel(pmd, addr);
        pte = ptep_get(ptep);
        if (pte_present(pte))
                page = pte_page(pte);

        return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * The page returned by vmalloc_to_page() is converted without a NULL check,
 * so the result is only meaningful for an address that is actually mapped.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
        struct page *page = vmalloc_to_page(vmalloc_addr);

        return page_to_pfn(page);
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

/*
 * Compile-time debug switches, both disabled by default:
 * - DEBUG_AUGMENT_PROPAGATE_CHECK builds augment_tree_propagate_check() and
 *   runs it at the end of every augment_tree_propagate_from();
 * - DEBUG_AUGMENT_LOWEST_MATCH_CHECK builds find_vmap_lowest_match_check().
 */
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


/*
 * NOTE(review): presumably serializes access to free_vmap_area_root and
 * free_vmap_area_list - confirm against the lock's users.
 */
static DEFINE_SPINLOCK(free_vmap_area_lock);
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in "no edge" splitting of
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node stores the size of the largest free block
 * found in its sub-tree (itself included). Therefore it is
 * possible to find the lowest matching free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

/*
 * This structure defines a single, solid model where a list and
 * rb-tree are part of one entity protected by the lock. Nodes are
 * sorted in ascending order, thus for O(1) access to left/right
 * neighbors a list is used as well as for sequential traversal.
 */
struct rb_list {
        /* rb-tree view of the nodes. */
        struct rb_root root;
        /* The same nodes as an ascending list. */
        struct list_head head;
        /* Protects both @root and @head. */
        spinlock_t lock;
};

/*
 * A fast size storage contains VAs up to 1M size. A pool is a list
 * of ready-to-go VAs of one particular size.
 * An index in the pool-array corresponds to number of pages + 1.
 *
 * NOTE(review): the exact size-to-index mapping is implemented by the pool
 * users, which are not part of this section - confirm the "+ 1" above.
 */
#define MAX_VA_SIZE_PAGES 256

struct vmap_pool {
        /* List of VAs held by this pool. */
        struct list_head head;
        /* NOTE(review): presumably the number of VAs on @head - confirm. */
        unsigned long len;
};

/*
 * An effective vmap-node logic. Users make use of nodes instead
 * of a global heap. It allows to balance an access and mitigate
 * contention.
 *
 * The statically defined instance "single" is the node used by the
 * initial one-node setup.
 */
static struct vmap_node {
        /* Simple size segregated storage. */
        struct vmap_pool pool[MAX_VA_SIZE_PAGES];
        /* NOTE(review): presumably protects pool[] - confirm against users. */
        spinlock_t pool_lock;
        bool skip_populate;

        /* Bookkeeping data of this node. */
        struct rb_list busy;
        struct rb_list lazy;

        /*
         * Ready-to-free areas.
         */
        struct list_head purge_list;
        struct work_struct purge_work;
        unsigned long nr_purged;
} single;

/*
 * Initial setup consists of one single node, i.e. balancing
 * is fully disabled. Later on, after vmap is initialized, these
 * parameters are updated based on the system capacity.
 *
 * vmap_zone_size and nr_vmap_nodes together define the address-to-node
 * mapping: node id = (addr / vmap_zone_size) % nr_vmap_nodes.
 */
static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;

/*
 * A simple iterator over all vmap-nodes: @vn walks vmap_nodes[0] through
 * vmap_nodes[nr_vmap_nodes - 1].
 */
#define for_each_vmap_node(vn)        \
        for ((vn) = &vmap_nodes[0];        \
                (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)

/*
 * Map an address to a node id: the address space is cut into zones of
 * vmap_zone_size bytes which are handed out to the nodes round-robin.
 */
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
        unsigned long zone = addr / vmap_zone_size;

        return zone % nr_vmap_nodes;
}

/* Return the vmap node responsible for @addr. */
static inline struct vmap_node *
addr_to_node(unsigned long addr)
{
        unsigned int id = addr_to_node_id(addr);

        return vmap_nodes + id;
}

/* Return the vmap node for @id; ids wrap around modulo nr_vmap_nodes. */
static inline struct vmap_node *
id_to_node(unsigned int id)
{
        unsigned int wrapped = id % nr_vmap_nodes;

        return vmap_nodes + wrapped;
}

/*
 * Return the index of @node within the vmap_nodes array. A pointer that
 * does not belong to the array triggers a one-time warning and yields 0.
 */
static inline unsigned int
node_to_id(struct vmap_node *node)
{
        /* Pointer arithmetic. */
        unsigned int idx = node - vmap_nodes;

        if (unlikely(idx >= nr_vmap_nodes)) {
                WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
                return 0;
        }

        return idx;
}

/*
 * The value 0 is reserved to mean "no node", which is why the
 * encoded value is the node-id incremented by 1 (and shifted
 * left by BITS_PER_BYTE). It is always greater than 0. A valid
 * node_id which can be encoded is [0:nr_vmap_nodes - 1]. If the
 * passed node_id is not valid, 0 is returned after a one-time
 * warning.
 */
static unsigned int
encode_vn_id(unsigned int node_id)
{
        if (node_id >= nr_vmap_nodes) {
                /* Warn and no node encoded. */
                WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
                return 0;
        }

        /* Can store U8_MAX [0:254] nodes. */
        return (node_id + 1) << BITS_PER_BYTE;
}

/*
 * Returns the decoded node-id, the valid range is within
 * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is
 * returned if the extracted data is wrong.
 *
 * A @val whose bits above BITS_PER_BYTE are all zero means "no node":
 * the subtraction below wraps to UINT_MAX, which is reported as
 * nr_vmap_nodes without a warning.
 */
static unsigned int
decode_vn_id(unsigned int val)
{
        unsigned int node_id = (val >> BITS_PER_BYTE) - 1;

        /* Can store U8_MAX [0:254] nodes. */
        if (node_id < nr_vmap_nodes)
                return node_id;

        /* If it was _not_ zero, warn. node_id is unsigned, hence %u. */
        WARN_ONCE(node_id != UINT_MAX,
                "Decode wrong node id (%u)\n", node_id);

        return nr_vmap_nodes;
}

/* A node id is valid when it indexes an existing vmap node. */
static bool
is_vn_id_valid(unsigned int node_id)
{
        return node_id < nr_vmap_nodes;
}

/* Size of @va, i.e. the distance between va_start and va_end. */
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
        return va->va_end - va->va_start;
}

/*
 * Return the subtree_max_size cached in the vmap_area that embeds @node,
 * or 0 when @node is NULL (no such child).
 */
static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
        struct vmap_area *va = rb_entry_safe(node, struct vmap_area, rb_node);

        if (!va)
                return 0;

        return va->subtree_max_size;
}

/*
 * Instantiate the augmented rb-tree callbacks for the free-space tree:
 * every node keeps, in subtree_max_size, the maximum va_size() found in its
 * sub-tree. The generated free_vmap_area_rb_augment_cb is passed to
 * rb_insert_augmented()/rb_erase_augmented(), and the generated
 * free_vmap_area_rb_augment_cb_propagate() is used to refresh the cached
 * values after a node changes size.
 */
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
        struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

/* Forward declarations; the definitions follow later in the file. */
static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
/* Work item whose handler is drain_vmap_area_work(). */
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);

/* Counter reported by vmalloc_nr_pages(). */
static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
/* NOTE(review): presumably counts lazily-freed pages - confirm against users. */
static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;

/* Return the current value of the nr_vmalloc_pages counter. */
unsigned long vmalloc_nr_pages(void)
{
        long nr = atomic_long_read(&nr_vmalloc_pages);

        return nr;
}

/*
 * Look up the VA whose [va_start, va_end) range contains @addr in the tree
 * rooted at @root. The KASAN tag of @addr is reset before the comparison.
 *
 * Returns the matching VA, or NULL if no VA contains the address.
 */
static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
        struct rb_node *cur;

        addr = (unsigned long)kasan_reset_tag((void *)addr);

        for (cur = root->rb_node; cur; ) {
                struct vmap_area *va = rb_entry(cur, struct vmap_area, rb_node);

                if (addr >= va->va_start && addr < va->va_end)
                        return va;

                /* Not inside this VA: descend towards the address. */
                cur = addr < va->va_start ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/*
 * Look up the first (lowest) VA which satisfies addr < va_end, NULL if none.
 * The KASAN tag of @addr is reset before the comparison.
 */
static struct vmap_area *
__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
        struct rb_node *cur = root->rb_node;
        struct vmap_area *best = NULL;

        addr = (unsigned long)kasan_reset_tag((void *)addr);

        while (cur) {
                struct vmap_area *candidate;

                candidate = rb_entry(cur, struct vmap_area, rb_node);
                if (candidate->va_end <= addr) {
                        /* Entirely below @addr: only the right side can match. */
                        cur = cur->rb_right;
                        continue;
                }

                best = candidate;

                /* @addr falls inside this VA, nothing lower can match. */
                if (candidate->va_start <= addr)
                        break;

                cur = cur->rb_left;
        }

        return best;
}

/*
 * Returns a node where a first VA, that satisfies addr < va_end, resides.
 * If success, a node is locked. A user is responsible to unlock it when a
 * VA is no longer needed to be accessed.
 *
 * Returns NULL if nothing found.
 */
static struct vmap_node *
find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
        unsigned long va_start_lowest;
        struct vmap_node *vn;

repeat:
        va_start_lowest = 0;

        for_each_vmap_node(vn) {
                spin_lock(&vn->busy.lock);
                *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);

                if (*va)
                        if (!va_start_lowest || (*va)->va_start < va_start_lowest)
                                va_start_lowest = (*va)->va_start;
                spin_unlock(&vn->busy.lock);
        }

        /*
         * Check if found VA exists, it might have gone away.  In this case we
         * repeat the search because a VA has been removed concurrently and we
         * need to proceed to the next one, which is a rare case.
         */
        if (va_start_lowest) {
                vn = addr_to_node(va_start_lowest);

                spin_lock(&vn->busy.lock);
                *va = __find_vmap_area(va_start_lowest, &vn->busy.root);

                if (*va)
                        return vn;

                spin_unlock(&vn->busy.lock);
                goto repeat;
        }

        return NULL;
}

/*
 * Find the place where @va has to be attached: the parent node is stored
 * in *@parent and the address of its left or right link is returned. The
 * search starts at @root when it is given, otherwise at @from.
 *
 * For an empty @root, *@parent is set to NULL and the root link is returned.
 *
 * NULL is returned when @va overlaps an existing node. In that case all
 * further steps regarding inserting of the conflicting range have to be
 * declined; it is considered a bug and reported with a WARN().
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
        struct rb_root *root, struct rb_node *from,
        struct rb_node **parent)
{
        struct vmap_area *cur;
        struct rb_node **link;

        if (!root) {
                link = &from;
        } else {
                link = &root->rb_node;
                if (unlikely(!*link)) {
                        *parent = NULL;
                        return link;
                }
        }

        /*
         * Go to the bottom of the tree. When the last point is hit we end
         * up with the parent rb_node and the correct direction, named link
         * here, where the new va->rb_node will be attached to.
         */
        for (;;) {
                cur = rb_entry(*link, struct vmap_area, rb_node);

                /*
                 * Sanity check during the traversal: partial (left/right)
                 * or full overlaps are reported and the lookup is aborted.
                 */
                if (va->va_end <= cur->va_start) {
                        link = &(*link)->rb_left;
                } else if (va->va_start >= cur->va_end) {
                        link = &(*link)->rb_right;
                } else {
                        WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
                                va->va_start, va->va_end, cur->va_start, cur->va_end);

                        return NULL;
                }

                if (!*link)
                        break;
        }

        *parent = &cur->rb_node;
        return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
        struct list_head *list;

        if (unlikely(!parent))
                /*
                 * The red-black tree where we try to find VA neighbors
                 * before merging or inserting is empty, i.e. it means
                 * there is no free vmap space. Normally it does not
                 * happen but we handle this case anyway.
                 */
                return NULL;

        list = &rb_entry(parent, struct vmap_area, rb_node)->list;
        return (&parent->rb_right == link ? list->next : list);
}

/*
 * Attach @va to the tree at (@parent, @link) and add it to the
 * address-sorted list. With a NULL @parent the VA is added right after
 * @head. @augment selects between the augmented and the plain rb-tree
 * insertion.
 */
static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head, bool augment)
{
        /*
         * VA is still not in the list, but its future previous
         * list_head node can be identified from the parent.
         */
        if (likely(parent)) {
                struct vmap_area *parent_va;

                parent_va = rb_entry(parent, struct vmap_area, rb_node);
                head = (link == &parent->rb_right) ?
                        &parent_va->list : parent_va->list.prev;
        }

        /* Insert to the rb-tree */
        rb_link_node(&va->rb_node, parent, link);
        if (!augment) {
                rb_insert_color(&va->rb_node, root);
        } else {
                /*
                 * Just perform a simple insertion to the tree. The
                 * va->subtree_max_size is not set to the VA's size
                 * before calling rb_insert_augmented(), because the
                 * tree is populated from the bottom to parent levels
                 * once the node _is_ in the tree.
                 *
                 * Therefore subtree_max_size is zeroed after insertion,
                 * to let augment_tree_propagate_from() put everything
                 * in the correct order later on.
                 */
                rb_insert_augmented(&va->rb_node,
                        root, &free_vmap_area_rb_augment_cb);
                va->subtree_max_size = 0;
        }

        /* Address-sort this list */
        list_add(&va->list, head);
}

/*
 * Link @va into a plain (non-augmented) rb-tree and its address-sorted
 * list; see __link_va() for the meaning of the arguments.
 */
static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head)
{
        /* augment == false: rebalanced with rb_insert_color(). */
        __link_va(va, root, parent, link, head, false);
}

/*
 * Link @va into an augmented rb-tree and its address-sorted list; see
 * __link_va() for the meaning of the arguments. The subtree_max_size of @va
 * is left at zero, so the caller has to propagate it afterwards.
 */
static __always_inline void
link_va_augment(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head)
{
        /* augment == true: inserted with rb_insert_augmented(). */
        __link_va(va, root, parent, link, head, true);
}

/*
 * Remove @va from the tree rooted at @root and from its list, leaving the
 * rb_node marked as empty. Warns and does nothing if @va is not linked.
 * @augment selects between the augmented and the plain rb-tree erase.
 */
static __always_inline void
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
        struct rb_node *node = &va->rb_node;

        if (WARN_ON(RB_EMPTY_NODE(node)))
                return;

        if (!augment)
                rb_erase(node, root);
        else
                rb_erase_augmented(node,
                        root, &free_vmap_area_rb_augment_cb);

        list_del_init(&va->list);
        RB_CLEAR_NODE(node);
}

/* Unlink @va from a plain (non-augmented) rb-tree and from its list. */
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
        /* augment == false: erased with rb_erase(). */
        __unlink_va(va, root, false);
}

/* Unlink @va from an augmented rb-tree and from its list. */
static __always_inline void
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
{
        /* augment == true: erased with rb_erase_augmented(). */
        __unlink_va(va, root, true);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Recompute the value subtree_max_size should hold for @va: the largest of
 * its own size and the cached maxima of both children.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
        unsigned long own = va_size(va);
        unsigned long left = get_subtree_max_size(va->rb_node.rb_left);
        unsigned long right = get_subtree_max_size(va->rb_node.rb_right);

        return max3(own, left, right);
}

/*
 * Debug-only consistency check: walk free_vmap_area_list and report every
 * VA whose cached subtree_max_size differs from the recomputed value.
 */
static void
augment_tree_propagate_check(void)
{
        struct vmap_area *va;

        list_for_each_entry(va, &free_vmap_area_list, list) {
                unsigned long expected = compute_subtree_max_size(va);

                if (expected == va->subtree_max_size)
                        continue;

                pr_emerg("tree is corrupted: %lu, %lu\n",
                        va_size(va), va->subtree_max_size);
        }
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from VA point. The propagation must be done
 * when VA size is modified by changing its va_start/va_end. Or
 * in case of newly inserting of VA to the tree.
 *
 * It means that augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1
 * only its subtree_max_size is updated, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
        /*
         * Populate the tree from bottom towards the root until
         * the calculated maximum available size of checked node
         * is equal to its current one.
         */
        free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

        /* Optional full-tree self-check, compiled out by default. */
#if DEBUG_AUGMENT_PROPAGATE_CHECK
        augment_tree_propagate_check();
#endif
}

/*
 * Insert @va into the plain rb-tree @root and the list @head. Nothing is
 * inserted when find_va_links() rejects the VA because of an overlap.
 */
static void
insert_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link;

        link = find_va_links(va, root, NULL, &parent);
        if (!link)
                return;

        link_va(va, root, parent, link, head);
}

/*
 * Insert @va into the augmented rb-tree @root and the list @head, then
 * propagate its size up the tree. When @from is given the search for the
 * insertion point starts at that node instead of at the root. Nothing is
 * inserted when find_va_links() rejects the VA because of an overlap.
 */
static void
insert_vmap_area_augment(struct vmap_area *va,
        struct rb_node *from, struct rb_root *root,
        struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link;

        link = from ? find_va_links(va, NULL, from, &parent) :
                      find_va_links(va, root, NULL, &parent);
        if (!link)
                return;

        link_va_augment(va, root, parent, link, head);
        augment_tree_propagate_from(va);
}

/*
 * Merge de-allocated chunk of VA memory with previous
 * and next free blocks. If coalesce is not done a new
 * free area is inserted. If VA has been merged, it is
 * freed.
 *
 * @va:      the area being released
 * @root:    tree the area is merged or inserted into
 * @head:    address-sorted list that pairs with @root
 * @augment: true when @root is an augmented tree
 *
 * Returns the VA that now covers the released range: either @va itself
 * (inserted) or the sibling it was merged into. Since a merged @va is
 * freed, callers must continue with the returned pointer only.
 *
 * Please note, it can return NULL in case of overlap
 * ranges, followed by WARN() report. Despite it is a
 * buggy behaviour, a system can be alive and keep
 * ongoing.
 */
static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head, bool augment)
{
        struct vmap_area *sibling;
        struct list_head *next;
        struct rb_node **link;
        struct rb_node *parent;
        bool merged = false;

        /*
         * Find a place in the tree where VA potentially will be
         * inserted, unless it is merged with its sibling/siblings.
         */
        link = find_va_links(va, root, NULL, &parent);
        if (!link)
                return NULL;

        /*
         * Get next node of VA to check if merging can be done.
         * NULL means the tree is empty, so there is nothing to merge with.
         */
        next = get_va_next_sibling(parent, link);
        if (unlikely(next == NULL))
                goto insert;

        /*
         * start            end
         * |                |
         * |<------VA------>|<-----Next----->|
         *                  |                |
         *                  start            end
         */
        if (next != head) {
                sibling = list_entry(next, struct vmap_area, list);
                if (sibling->va_start == va->va_end) {
                        /* Grow the next area downwards to absorb VA. */
                        sibling->va_start = va->va_start;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

        /*
         * start            end
         * |                |
         * |<-----Prev----->|<------VA------>|
         *                  |                |
         *                  start            end
         */
        if (next->prev != head) {
                sibling = list_entry(next->prev, struct vmap_area, list);
                if (sibling->va_end == va->va_start) {
                        /*
                         * If both neighbors are coalesced, it is important
                         * to unlink the "next" node first, followed by merging
                         * with "previous" one. Otherwise the tree might not be
                         * fully populated if a sibling's augmented value is
                         * "normalized" because of rotation operations.
                         */
                        if (merged)
                                __unlink_va(va, root, augment);

                        /* Grow the previous area upwards to absorb VA. */
                        sibling->va_end = va->va_end;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

insert:
        /* No neighbor absorbed VA: link it as a new free area. */
        if (!merged)
                __link_va(va, root, parent, link, head, augment);

        return va;
}

/*
 * Merge @va with its neighbors in the plain (non-augmented) tree @root, or
 * insert it when no merge is possible. Returns what
 * __merge_or_add_vmap_area() returns: the resulting VA, or NULL on overlap.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        return __merge_or_add_vmap_area(va, root, head, false);
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        va = __merge_or_add_vmap_area(va, root, head, true);
        if (va)
                augment_tree_propagate_from(va);

        return va;
}

/*
 * Check whether a block of @size bytes, aligned to @align and starting no
 * lower than @vstart, fits into the free area @va.
 */
static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        unsigned long nva_start_addr;
        unsigned long nva_end_addr;

        /* Start at whichever is higher, the VA start or @vstart, aligned up. */
        nva_start_addr = (va->va_start > vstart) ?
                ALIGN(va->va_start, align) : ALIGN(vstart, align);
        nva_end_addr = nva_start_addr + size;

        /* Can be overflowed due to big size or alignment. */
        if (nva_end_addr < nva_start_addr || nva_start_addr < vstart)
                return false;

        return nva_end_addr <= va->va_end;
}

/*
 * Find the first free block(lowest start address) in the tree,
 * that will accomplish the request corresponding to passing
 * parameters. Please note, with an alignment bigger than PAGE_SIZE,
 * a search length is adjusted to account for worst case alignment
 * overhead.
 *
 * @root:               augmented tree of free areas to search
 * @size:               requested size
 * @align:              requested alignment
 * @vstart:             lowest acceptable start address
 * @adjust_search_size: when true, sub-trees are pruned using
 *                      size + align - 1 instead of size
 *
 * Returns the matching free VA, or NULL if there is none.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
        unsigned long align, unsigned long vstart, bool adjust_search_size)
{
        struct vmap_area *va;
        struct rb_node *node;
        unsigned long length;

        /* Start from the root. */
        node = root->rb_node;

        /* Adjust the search size for alignment overhead. */
        length = adjust_search_size ? size + align - 1 : size;

        while (node) {
                va = rb_entry(node, struct vmap_area, rb_node);

                /*
                 * Prefer the left sub-tree (lower addresses) whenever it
                 * holds a big enough block and is not cut off by vstart.
                 */
                if (get_subtree_max_size(node->rb_left) >= length &&
                                vstart < va->va_start) {
                        node = node->rb_left;
                } else {
                        if (is_within_this_va(va, size, align, vstart))
                                return va;

                        /*
                         * Does not make sense to go deeper towards the right
                         * sub-tree if it does not have a free block that is
                         * equal or bigger to the requested search length.
                         */
                        if (get_subtree_max_size(node->rb_right) >= length) {
                                node = node->rb_right;
                                continue;
                        }

                        /*
                         * OK. We roll back and find the first right sub-tree,
                         * that will satisfy the search criteria. It can happen
                         * due to "vstart" restriction or an alignment overhead
                         * that is bigger than PAGE_SIZE.
                         */
                        while ((node = rb_parent(node))) {
                                va = rb_entry(node, struct vmap_area, rb_node);
                                if (is_within_this_va(va, size, align, vstart))
                                        return va;

                                if (get_subtree_max_size(node->rb_right) >= length &&
                                                vstart <= va->va_start) {
                                        /*
                                         * Shift the vstart forward. Please note, we update it with
                                         * parent's start address adding "1" because we do not want
                                         * to enter same sub-tree after it has already been checked
                                         * and no suitable free block found there.
                                         */
                                        vstart = va->va_start + 1;
                                        node = node->rb_right;
                                        break;
                                }
                        }
                }
        }

        return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

/*
 * Reference implementation for the debug check: scan the list @head in
 * order and return the first VA the request fits into, NULL if none.
 */
static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        struct vmap_area *va;

        list_for_each_entry(va, head, list) {
                if (is_within_this_va(va, size, align, vstart))
                        return va;
        }

        return NULL;
}

/*
 * Debug-only cross-check: for a random start address above VMALLOC_START,
 * the tree lookup and the linear scan must agree on the VA they return.
 * A mismatch is reported with pr_emerg().
 */
static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
                             unsigned long size, unsigned long align)
{
        struct vmap_area *from_tree, *from_list;
        unsigned long vstart;
        unsigned int rnd;

        get_random_bytes(&rnd, sizeof(rnd));
        vstart = VMALLOC_START + rnd;

        from_tree = find_vmap_lowest_match(root, size, align, vstart, false);
        from_list = find_vmap_lowest_linear_match(head, size, align, vstart);

        if (from_tree == from_list)
                return;

        pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
                from_tree, from_list, vstart);
}
#endif

/*
 * How a requested range [nva_start_addr, nva_start_addr + size) sits inside
 * a free VA, as determined by classify_va_fit_type().
 */
enum fit_type {
        NOTHING_FIT = 0,        /* request is not inside the VA */
        FL_FIT_TYPE = 1,        /* full fit: both edges match */
        LE_FIT_TYPE = 2,        /* left edge fit: only the start matches */
        RE_FIT_TYPE = 3,        /* right edge fit: only the end matches */
        NE_FIT_TYPE = 4                /* no edge fit: strictly inside */
};

/*
 * Classify how the range [nva_start_addr, nva_start_addr + size) sits
 * inside the free area @va.
 *
 * Returns NOTHING_FIT when the range is not contained in @va, otherwise
 * the fit type determined by which edges of @va the range touches.
 */
static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
        unsigned long nva_start_addr, unsigned long size)
{
        unsigned long nva_end_addr = nva_start_addr + size;
        bool touches_left, touches_right;

        /* Check if it is within VA. */
        if (nva_start_addr < va->va_start || nva_end_addr > va->va_end)
                return NOTHING_FIT;

        /* Now classify. */
        touches_left = (va->va_start == nva_start_addr);
        touches_right = (va->va_end == nva_end_addr);

        if (touches_left)
                return touches_right ? FL_FIT_TYPE : LE_FIT_TYPE;

        return touches_right ? RE_FIT_TYPE : NE_FIT_TYPE;
}

/*
 * Remove the range [nva_start_addr, nva_start_addr + size) from the free
 * area @va, which is tracked by @root/@head.
 *
 * Depending on the fit type, @va is released, shrunk on one side, or
 * split in two (which needs one extra vmap_area object).
 *
 * Returns 0 on success, -ENOMEM if the extra object for a split could not
 * be obtained, or -EINVAL if the range does not fit into @va.
 */
static __always_inline int
va_clip(struct rb_root *root, struct list_head *head,
                struct vmap_area *va, unsigned long nva_start_addr,
                unsigned long size)
{
        struct vmap_area *lva = NULL;

        switch (classify_va_fit_type(va, nva_start_addr, size)) {
        case FL_FIT_TYPE:
                /*
                 * The VA is consumed entirely, no split is needed.
                 *
                 * |               |
                 * V      NVA      V
                 * |---------------|
                 */
                unlink_va_augment(va, root);
                kmem_cache_free(vmap_area_cachep, va);
                return 0;

        case LE_FIT_TYPE:
                /*
                 * Cut the request off the left edge, R remains.
                 *
                 * |       |
                 * V  NVA  V   R
                 * |-------|-------|
                 */
                va->va_start += size;
                break;

        case RE_FIT_TYPE:
                /*
                 * Cut the request off the right edge, L remains.
                 *
                 *         |       |
                 *     L   V  NVA  V
                 * |-------|-------|
                 */
                va->va_end = nva_start_addr;
                break;

        case NE_FIT_TYPE:
                /*
                 * The request is in the middle, both L and R remain.
                 *
                 *     |       |
                 *   L V  NVA  V R
                 * |---|-------|---|
                 */
                lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
                if (unlikely(!lva)) {
                        /*
                         * The percpu allocator does no pre-allocation. It
                         * most likely never needs an NE_FIT_TYPE split,
                         * because its offsets and sizes are aligned to a
                         * fixed align request, so RE_FIT_TYPE and
                         * FL_FIT_TYPE are its main fitting cases.
                         *
                         * There are a few exceptions, for example the first
                         * allocation (early boot up) when there is "one"
                         * big free space that has to be split.
                         *
                         * A regular "vmap" allocation can also get here if
                         * "this" current CPU was not preloaded, see the
                         * comment in alloc_vmap_area() why. GFP_NOWAIT is
                         * then used to get the extra object for the split.
                         * That is rare and most of the time does not occur.
                         *
                         * If the allocation fails, an "overflow" path is
                         * triggered to purge lazily freed areas to free
                         * some memory, then the "retry" path repeats the
                         * attempt one more time. See alloc_vmap_area() for
                         * more details.
                         */
                        lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!lva)
                                return -ENOMEM;
                }

                /* The new object describes the left remainder (L). */
                lva->va_start = va->va_start;
                lva->va_end = nva_start_addr;

                /* The existing VA shrinks to the right remainder (R). */
                va->va_start = nva_start_addr + size;
                break;

        default:
                return -EINVAL;
        }

        /* Only reached when @va survived: LE, RE and NE fits. */
        augment_tree_propagate_from(va);

        if (lva)        /* NE_FIT_TYPE only */
                insert_vmap_area_augment(lva, &va->rb_node, root, head);

        return 0;
}

/*
 * Allocate @size bytes aligned to @align from the free area @va, not
 * below @vstart and not beyond @vend.
 *
 * Returns the start address of the new range on success. On failure an
 * error value is returned: -ERANGE if the aligned range would cross @vend,
 * or the error reported by va_clip().
 */
static unsigned long
va_alloc(struct vmap_area *va,
                struct rb_root *root, struct list_head *head,
                unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend)
{
        unsigned long lowest, nva_start_addr;
        int ret;

        /* Start from whichever is higher: the area start or @vstart. */
        lowest = (va->va_start > vstart) ? va->va_start : vstart;
        nva_start_addr = ALIGN(lowest, align);

        /* Honour the "vend" restriction. */
        if (nva_start_addr + size > vend)
                return -ERANGE;

        /* Take the range out of the free vmap_area. */
        ret = va_clip(root, head, va, nva_start_addr, size);
        if (WARN_ON_ONCE(ret))
                return ret;

        return nva_start_addr;
}

/*
 * Find the lowest suitable free area in @root/@head and allocate @size
 * bytes aligned to @align from it, within [vstart, vend].
 *
 * Returns a start address of the newly allocated area, if success.
 * Otherwise an error value is returned that indicates failure
 * (-ENOENT when no free area matches, or the error from va_alloc()).
 */
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
        unsigned long size, unsigned long align,
        unsigned long vstart, unsigned long vend)
{
        unsigned long nva_start_addr;
        bool adjust_search_size;
        struct vmap_area *va;

        /*
         * The search size is adjusted only when both hold:
         *   a) align > PAGE_SIZE. For smaller alignments it does not make
         *      any sense, since all blocks (their start addresses) are at
         *      least PAGE_SIZE aligned anyway;
         *   b) the requested size is not exactly the [vstart:vend]
         *      interval. With an adjusted search length such a short-range
         *      allocation would not succeed.
         */
        adjust_search_size = align > PAGE_SIZE && (vend - vstart) != size;

        va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
        if (unlikely(!va))
                return -ENOENT;

        nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
        if (!IS_ERR_VALUE(nva_start_addr))
                find_vmap_lowest_match_check(root, head, size, align);
#endif

        return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area: the area is removed
 * from the busy tree of the node its start address maps to and is then
 * returned to the global free tree/list.
 */
static void free_vmap_area(struct vmap_area *va)
{
        struct vmap_node *busy_vn = addr_to_node(va->va_start);

        /* Detach the area from the busy tree/list of its node. */
        spin_lock(&busy_vn->busy.lock);
        unlink_va(va, &busy_vn->busy.root);
        spin_unlock(&busy_vn->busy.lock);

        /* Insert it into, or merge it with, the free tree/list. */
        spin_lock(&free_vmap_area_lock);
        merge_or_add_vmap_area_augment(va,
                &free_vmap_area_root, &free_vmap_area_list);
        spin_unlock(&free_vmap_area_lock);
}

/*
 * Make sure this CPU has a spare vmap_area object in ne_fit_preload_node
 * and take @lock. The function returns with @lock held.
 *
 * The spare object is consumed by va_clip() when the fit type of a free
 * area is NE_FIT_TYPE. It is allocated here, before the spinlock is
 * taken, so more permissive allocation masks can be used. That makes the
 * allocation more stable under low memory and high memory pressure.
 */
static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
        struct vmap_area *spare = NULL, *expected;

        /* Allocate only if this CPU currently has no preloaded object. */
        if (!this_cpu_read(ne_fit_preload_node))
                spare = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

        spin_lock(lock);

        /* Nothing was allocated, so there is nothing to install. */
        if (!spare)
                return;

        /*
         * Install the object only if the slot is still empty, otherwise
         * give the unused object back to the cache.
         */
        expected = NULL;
        if (!__this_cpu_try_cmpxchg(ne_fit_preload_node, &expected, spare))
                kmem_cache_free(vmap_area_cachep, spare);
}

/*
 * Map an area @size to the pool of @vn that holds areas of that size.
 * Pools are indexed by (size - 1) / PAGE_SIZE. NULL is returned when the
 * index is not below MAX_VA_SIZE_PAGES.
 */
static struct vmap_pool *
size_to_va_pool(struct vmap_node *vn, unsigned long size)
{
        unsigned int pool_idx = (size - 1) / PAGE_SIZE;

        return (pool_idx < MAX_VA_SIZE_PAGES) ? &vn->pool[pool_idx] : NULL;
}

/*
 * Put @va into the size pool of node @n that matches its size.
 *
 * Returns true if the area was added, false if there is no pool for an
 * area of this size.
 */
static bool
node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
{
        struct vmap_pool *pool = size_to_va_pool(n, va_size(va));

        if (!pool)
                return false;

        /* The pool list and its length are updated under pool_lock. */
        spin_lock(&n->pool_lock);
        list_add(&va->list, &pool->head);
        WRITE_ONCE(pool->len, pool->len + 1);
        spin_unlock(&n->pool_lock);

        return true;
}

/*
 * Try to take a cached area of exactly @size bytes from the pools of @vn.
 *
 * Only the first entry of the matching pool is examined. If its start is
 * not aligned to @align it is moved to the tail of the pool and NULL is
 * returned. If it is aligned but fails the sanity checks against @size,
 * @vstart and @vend, a one-time warning is emitted, the entry stays in the
 * pool and NULL is returned.
 *
 * Returns the detached area on success, otherwise NULL.
 */
static struct vmap_area *
node_pool_del_va(struct vmap_node *vn, unsigned long size,
                unsigned long align, unsigned long vstart,
                unsigned long vend)
{
        struct vmap_pool *vp = size_to_va_pool(vn, size);
        struct vmap_area *va = NULL;
        int err = 0;

        /* Lockless peek; the pool is checked again under the lock. */
        if (!vp || list_empty(&vp->head))
                return NULL;

        spin_lock(&vn->pool_lock);
        if (list_empty(&vp->head))
                goto unlock;

        va = list_first_entry(&vp->head, struct vmap_area, list);

        if (!IS_ALIGNED(va->va_start, align)) {
                /* Unsuitable alignment: rotate the entry to the tail. */
                list_move_tail(&va->list, &vp->head);
                va = NULL;
                goto unlock;
        }

        /*
         * Sanity checks: warn once and refuse the entry if any of the
         * conditions below is violated.
         */
        err |= (va_size(va) != size);
        err |= (va->va_start < vstart);
        err |= (va->va_end > vend);

        if (WARN_ON_ONCE(err)) {
                va = NULL;
                goto unlock;
        }

        list_del_init(&va->list);
        WRITE_ONCE(vp->len, vp->len - 1);

unlock:
        spin_unlock(&vn->pool_lock);
        return va;
}

/*
 * Try to satisfy a request from the pool of the vmap node selected by the
 * current CPU id.
 *
 * @addr is set to the start address of the returned area, or to -EINVAL
 * if nothing was found. @vn_id is set to 0 when the pools are bypassed,
 * otherwise to the encoded id of the selected node (also on a miss).
 *
 * Returns a ready to use area, or NULL.
 */
static struct vmap_area *
node_alloc(unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend,
                unsigned long *addr, unsigned int *vn_id)
{
        struct vmap_area *va;
        unsigned int id;

        *vn_id = 0;
        *addr = -EINVAL;

        /*
         * Pools serve only the whole vmalloc range, and only when there
         * is more than one node. Everything else falls back to the
         * global heap.
         */
        if (vstart != VMALLOC_START || vend != VMALLOC_END ||
                        nr_vmap_nodes == 1)
                return NULL;

        id = raw_smp_processor_id() % nr_vmap_nodes;
        va = node_pool_del_va(id_to_node(id), size, align, vstart, vend);
        *vn_id = encode_vn_id(id);

        if (va)
                *addr = va->va_start;

        return va;
}

/*
 * Bind @vm to @va: the vm_struct takes its address and size from the
 * vmap_area, records @flags and @caller, and @va is pointed back at @vm.
 */
static inline void setup_vmalloc_vm(struct vm_struct *vm,
        struct vmap_area *va, unsigned long flags, const void *caller)
{
        vm->flags = flags;
        vm->addr = (void *)va->va_start;

        /* Both size fields start out as the full size of the area. */
        vm->requested_size = va_size(va);
        vm->size = vm->requested_size;

        vm->caller = caller;
        va->vm = vm;
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend. If vm is passed in, the two will also be bound.
 *
 * @size:     size of the region; must be non-zero and a multiple of the
 *            page size
 * @align:    required alignment; must be a power of two
 * @vstart:   lowest allowed start address
 * @vend:     highest allowed end address
 * @node:     node passed to kmem_cache_alloc_node() for the descriptor
 * @gfp_mask: allocation flags; only the GFP_RECLAIM_MASK bits are used
 * @va_flags: flags stored in va->flags, OR-ed with the encoded node id
 * @vm:       optional vm_struct to bind to the new area, may be NULL
 *
 * The function may sleep (see might_sleep() and cond_resched() below).
 *
 * Return: the new vmap_area on success, otherwise an ERR_PTR():
 *   -EINVAL  invalid @size or @align;
 *   -EBUSY   vmap is not initialized yet, or no address range was found
 *            even after purging and calling the vmap notifiers;
 *   -ENOMEM  the vmap_area descriptor could not be allocated;
 *   or the error returned by kasan_populate_vmalloc().
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
                                unsigned long align,
                                unsigned long vstart, unsigned long vend,
                                int node, gfp_t gfp_mask,
                                unsigned long va_flags, struct vm_struct *vm)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        unsigned long freed;
        unsigned long addr;
        unsigned int vn_id;
        int purged = 0;
        int ret;

        if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
                return ERR_PTR(-EINVAL);

        if (unlikely(!vmap_initialized))
                return ERR_PTR(-EBUSY);

        /* Only reclaim behaviour flags are relevant. */
        gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
        might_sleep();

        /*
         * If a VA is obtained from a global heap(if it fails here)
         * it is anyway marked with this "vn_id" so it is returned
         * to this pool's node later. Such way gives a possibility
         * to populate pools based on users demand.
         *
         * On success a ready to go VA is returned.
         */
        va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
        if (!va) {
                /*
                 * Pool miss: node_alloc() left an error value in "addr",
                 * so only a descriptor is allocated here and the address
                 * range is obtained from the global heap below.
                 */
                va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
                if (unlikely(!va))
                        return ERR_PTR(-ENOMEM);

                /*
                 * Only scan the relevant parts containing pointers to other objects
                 * to avoid false negatives.
                 */
                kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
        }

retry:
        /* Skipped when the pool already supplied a valid address. */
        if (IS_ERR_VALUE(addr)) {
                /* Returns with free_vmap_area_lock held. */
                preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
                addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
                        size, align, vstart, vend);
                spin_unlock(&free_vmap_area_lock);

                /*
                 * This is not a fast path.  Check if yielding is needed. This
                 * is the only reschedule point in the vmalloc() path.
                 */
                cond_resched();
        }

        trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));

        /*
         * If an allocation fails, the error value is
         * returned. Therefore trigger the overflow path.
         */
        if (IS_ERR_VALUE(addr))
                goto overflow;

        va->va_start = addr;
        va->va_end = addr + size;
        va->vm = NULL;
        va->flags = (va_flags | vn_id);

        if (vm) {
                vm->addr = (void *)va->va_start;
                vm->size = va_size(va);
                va->vm = vm;
        }

        /* The busy tree is chosen by address, not by "vn_id". */
        vn = addr_to_node(va->va_start);

        spin_lock(&vn->busy.lock);
        insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
        spin_unlock(&vn->busy.lock);

        BUG_ON(!IS_ALIGNED(va->va_start, align));
        BUG_ON(va->va_start < vstart);
        BUG_ON(va->va_end > vend);

        ret = kasan_populate_vmalloc(addr, size, gfp_mask);
        if (ret) {
                /* Undo the busy-tree insertion and return the range. */
                free_vmap_area(va);
                return ERR_PTR(ret);
        }

        return va;

overflow:
        /* First failure: purge lazily freed areas and try again. */
        if (!purged) {
                reclaim_and_purge_vmap_areas();
                purged = 1;
                goto retry;
        }

        /*
         * Still no space: ask the registered vmap purge notifiers to
         * release something. If any of them reports progress through
         * "freed", start over, including a new purge round.
         */
        freed = 0;
        blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);

        if (freed > 0) {
                purged = 0;
                goto retry;
        }

        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
                pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
                                size, vstart, vend);

        kmem_cache_free(vmap_area_cachep, va);
        return ERR_PTR(-EBUSY);
}

/*
 * Register @nb on vmap_notify_list. alloc_vmap_area() calls this chain,
 * passing a pointer to a "freed" counter, when it still cannot find
 * address space after purging lazily freed areas. It retries the
 * allocation if the counter ends up greater than zero.
 *
 * Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

/*
 * Remove @nb from vmap_notify_list.
 *
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
        /* Number of pages that make up 32MB. */
        const unsigned long pages_per_step = 32UL * 1024 * 1024 / PAGE_SIZE;
        /* Log scale factor derived from the number of online CPUs. */
        unsigned int cpu_log = fls(num_online_cpus());

        return cpu_log * pages_per_step;
}

/*
 * Serialize vmap purging.  There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 *
 * __purge_vmap_area_lazy() asserts via lockdep that its caller holds
 * this mutex.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/*
 * for per-CPU blocks; forward declaration, the function is called from
 * reclaim_and_purge_vmap_areas().
 */
static void purge_fragmented_blocks_allcpus(void);

static void
reclaim_list_global(struct list_head *head)
{
        struct vmap_area *va, *n;

        if (list_empty(head))
                return;

        spin_lock(&free_vmap_area_lock);
        list_for_each_entry_safe(va, n, head, list)
                merge_or_add_vmap_area_augment(va,
                        &free_vmap_area_root, &free_vmap_area_list);
        spin_unlock(&free_vmap_area_lock);
}

/*
 * Shrink the per-size pools of @vn and return the removed areas to the
 * global free space.
 *
 * @vn:         node whose pools are decayed
 * @full_decay: if true every pool is emptied completely, otherwise about
 *              a quarter of each pool's entries is removed
 *
 * Removed areas are first merged with each other in a local tree/list and
 * then given to reclaim_list_global() in one go.
 */
static void
decay_va_pool_node(struct vmap_node *vn, bool full_decay)
{
        LIST_HEAD(decay_list);
        struct rb_root decay_root = RB_ROOT;
        struct vmap_area *va, *nva;
        unsigned long n_decay, pool_len;
        int i;

        for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
                LIST_HEAD(tmp_list);

                if (list_empty(&vn->pool[i].head))
                        continue;

                /* Detach the pool, so no-one can access it. */
                spin_lock(&vn->pool_lock);
                list_replace_init(&vn->pool[i].head, &tmp_list);
                spin_unlock(&vn->pool_lock);

                /* Remember the length, the live counter is reset to 0. */
                pool_len = n_decay = vn->pool[i].len;
                WRITE_ONCE(vn->pool[i].len, 0);

                /* Decay a pool by ~25% out of left objects. */
                if (!full_decay)
                        n_decay >>= 2;
                /* pool_len is now the number of entries that stay. */
                pool_len -= n_decay;

                /* Move the first n_decay entries to the local decay set. */
                list_for_each_entry_safe(va, nva, &tmp_list, list) {
                        if (!n_decay--)
                                break;

                        list_del_init(&va->list);
                        merge_or_add_vmap_area(va, &decay_root, &decay_list);
                }

                /*
                 * Attach the pool back if it has been partly decayed.
                 * Please note, it is supposed that nobody(other contexts)
                 * can populate the pool therefore a simple list replace
                 * operation takes place here.
                 */
                if (!list_empty(&tmp_list)) {
                        spin_lock(&vn->pool_lock);
                        list_replace_init(&tmp_list, &vn->pool[i].head);
                        WRITE_ONCE(vn->pool[i].len, pool_len);
                        spin_unlock(&vn->pool_lock);
                }
        }

        /* Hand everything collected above back to the global heap. */
        reclaim_list_global(&decay_list);
}

static void
kasan_release_vmalloc_node(struct vmap_node *vn)
{
        struct vmap_area *va;
        unsigned long start, end;

        start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
        end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;

        list_for_each_entry(va, &vn->purge_list, list) {
                if (is_vmalloc_or_module_addr((void *) va->va_start))
                        kasan_release_vmalloc(va->va_start, va->va_end,
                                va->va_start, va->va_end,
                                KASAN_VMALLOC_PAGE_RANGE);
        }

        kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
}

/*
 * Work function that drains the purge list of one vmap node.
 *
 * Every area is taken off the list and is either cached in the node's
 * size pool or collected and returned to the global free space. The
 * number of processed areas is stored in vn->nr_purged, and their total
 * page count is subtracted from vmap_lazy_nr.
 */
static void purge_vmap_node(struct work_struct *work)
{
        struct vmap_node *vn = container_of(work,
                struct vmap_node, purge_work);
        unsigned long nr_purged_pages = 0;
        struct vmap_area *va, *n_va;
        LIST_HEAD(local_list);

        if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
                kasan_release_vmalloc_node(vn);

        vn->nr_purged = 0;

        list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
                unsigned int vn_id = decode_vn_id(va->flags);

                nr_purged_pages += va_size(va) >> PAGE_SHIFT;
                vn->nr_purged++;

                list_del_init(&va->list);

                /*
                 * Keep the area in this node's pool when it carries a
                 * valid node id, pool population is not suppressed and a
                 * pool for its size exists.
                 */
                if (is_vn_id_valid(vn_id) && !vn->skip_populate &&
                                node_pool_add_va(vn, va))
                        continue;

                /* Otherwise it goes back to the global heap. */
                list_add(&va->list, &local_list);
        }

        atomic_long_sub(nr_purged_pages, &vmap_lazy_nr);

        reclaim_list_global(&local_list);
}

/*
 * Purges all lazily-freed vmap areas.
 *
 * @start, @end:     initial bounds of the range to flush from the TLB;
 *                   they are widened to cover every area that is purged
 * @full_pool_decay: passed on to decay_va_pool_node(); when true the
 *                   nodes are also told (skip_populate) not to refill
 *                   their pools with the purged areas
 *
 * Must be called with vmap_purge_lock held.
 *
 * Returns true if at least one area was purged.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
                bool full_pool_decay)
{
        unsigned long nr_purged_areas = 0;
        unsigned int nr_purge_helpers;
        static cpumask_t purge_nodes;
        unsigned int nr_purge_nodes;
        struct vmap_node *vn;
        int i;

        lockdep_assert_held(&vmap_purge_lock);

        /*
         * Use cpumask to mark which node has to be processed.
         */
        purge_nodes = CPU_MASK_NONE;

        for_each_vmap_node(vn) {
                INIT_LIST_HEAD(&vn->purge_list);
                vn->skip_populate = full_pool_decay;
                decay_va_pool_node(vn, full_pool_decay);

                if (RB_EMPTY_ROOT(&vn->lazy.root))
                        continue;

                /* Move the whole lazy set of this node to its purge list. */
                spin_lock(&vn->lazy.lock);
                WRITE_ONCE(vn->lazy.root.rb_node, NULL);
                list_replace_init(&vn->lazy.head, &vn->purge_list);
                spin_unlock(&vn->lazy.lock);

                /* Widen the flush range by the first and last entries. */
                start = min(start, list_first_entry(&vn->purge_list,
                        struct vmap_area, list)->va_start);

                end = max(end, list_last_entry(&vn->purge_list,
                        struct vmap_area, list)->va_end);

                cpumask_set_cpu(node_to_id(vn), &purge_nodes);
        }

        nr_purge_nodes = cpumask_weight(&purge_nodes);
        if (nr_purge_nodes > 0) {
                flush_tlb_kernel_range(start, end);

                /* One extra worker is per a lazy_max_pages() full set minus one. */
                nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
                nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;

                /*
                 * The first nr_purge_helpers nodes are handed to work
                 * items, the remaining ones are purged right here.
                 */
                for_each_cpu(i, &purge_nodes) {
                        vn = &vmap_nodes[i];

                        if (nr_purge_helpers > 0) {
                                INIT_WORK(&vn->purge_work, purge_vmap_node);

                                if (cpumask_test_cpu(i, cpu_online_mask))
                                        schedule_work_on(i, &vn->purge_work);
                                else
                                        schedule_work(&vn->purge_work);

                                nr_purge_helpers--;
                        } else {
                                /* A NULL func marks "already done inline". */
                                vn->purge_work.func = NULL;
                                purge_vmap_node(&vn->purge_work);
                                nr_purged_areas += vn->nr_purged;
                        }
                }

                /* Wait for the scheduled work items and add their counts. */
                for_each_cpu(i, &purge_nodes) {
                        vn = &vmap_nodes[i];

                        if (vn->purge_work.func) {
                                flush_work(&vn->purge_work);
                                nr_purged_areas += vn->nr_purged;
                        }
                }
        }

        trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
        return nr_purged_areas > 0;
}

/*
 * Reclaim vmap areas: under vmap_purge_lock, purge the fragmented per-CPU
 * blocks first and then all lazily freed areas, with a full decay of the
 * node pools.
 */
static void reclaim_and_purge_vmap_areas(void)
{
        mutex_lock(&vmap_purge_lock);

        purge_fragmented_blocks_allcpus();
        __purge_vmap_area_lazy(ULONG_MAX, 0, true);

        mutex_unlock(&vmap_purge_lock);
}

/*
 * Work function: purge all lazily freed vmap areas under vmap_purge_lock.
 * The node pools are only partly decayed (full_pool_decay is false).
 * @work is not used.
 */
static void drain_vmap_area_work(struct work_struct *work)
{
        mutex_lock(&vmap_purge_lock);
        /* ULONG_MAX/0 is an empty range that the purge widens as needed. */
        __purge_vmap_area_lazy(ULONG_MAX, 0, false);
        mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area lazily. The caller ensures that the area has been
 * unmapped and unlinked, and that flush_cache_vunmap() has been called
 * for the correct range beforehand.
 *
 * The area is queued on the lazy tree of a vmap node. The drain work is
 * scheduled once the number of lazily freed pages exceeds
 * lazy_max_pages().
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
        unsigned long nr_lazy_max = lazy_max_pages();
        unsigned long va_start = va->va_start;
        unsigned int vn_id = decode_vn_id(va->flags);
        struct vmap_node *vn;
        unsigned long nr_lazy;

        /* An area that is still linked somewhere must not be freed. */
        if (WARN_ON_ONCE(!list_empty(&va->list)))
                return;

        nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
                                         &vmap_lazy_nr);

        /*
         * If the area was requested by a certain node, return it to that
         * node, i.e. to its pool for later reuse. Otherwise use the node
         * the start address maps to.
         */
        if (is_vn_id_valid(vn_id))
                vn = id_to_node(vn_id);
        else
                vn = addr_to_node(va->va_start);

        spin_lock(&vn->lazy.lock);
        insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
        spin_unlock(&vn->lazy.lock);

        trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);

        /* After this point, we may free va at any time */
        if (unlikely(nr_lazy > nr_lazy_max))
                schedule_work(&drain_vmap_work);
}

/*
 * Free and unmap a vmap area: flush the cache for its range, unmap it
 * without a TLB flush and queue the area for lazy freeing. The TLB is
 * flushed immediately only when debug_pagealloc is enabled.
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
        const unsigned long start = va->va_start;
        const unsigned long end = va->va_end;

        flush_cache_vunmap(start, end);
        vunmap_range_noflush(start, end);

        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(start, end);

        free_vmap_area_noflush(va);
}

/*
 * Look up the busy vmap area that contains @addr.
 *
 * Returns the area, or NULL if vmap is not initialized yet or no node
 * has a matching area.
 */
struct vmap_area *find_vmap_area(unsigned long addr)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int first_id, id;

        if (unlikely(!vmap_initialized))
                return NULL;

        /*
         * addr_to_node_id(addr) converts an address to the index of the
         * node where a VA is located. If a VA spans several zones and the
         * passed addr is not the same as va->va_start, which is not
         * common, extra nodes may have to be scanned. See an example:
         *
         *      <----va---->
         * -|-----|-----|-----|-----|-
         *     1     2     0     1
         *
         * The VA resides in node 1 whereas it spans 1, 2 and 0. If the
         * passed addr is within node 2 or 0, extra work has to be done.
         */
        first_id = addr_to_node_id(addr);
        id = first_id;

        do {
                vn = &vmap_nodes[id];

                spin_lock(&vn->busy.lock);
                va = __find_vmap_area(addr, &vn->busy.root);
                spin_unlock(&vn->busy.lock);

                if (va)
                        return va;

                /* Step to the previous node index, wrapping around. */
                id = (id + nr_vmap_nodes - 1) % nr_vmap_nodes;
        } while (id != first_id);

        return NULL;
}

/*
 * Look up the busy vmap area that contains @addr and unlink it from the
 * busy tree of its node while that node's lock is still held.
 *
 * Returns the unlinked area, or NULL if no node has a matching area.
 */
static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int first_id, id;

        /*
         * The nodes are walked the same way as in find_vmap_area(): start
         * at the node the address maps to and step backwards, wrapping
         * around, until every node has been visited once.
         */
        first_id = addr_to_node_id(addr);
        id = first_id;

        do {
                vn = &vmap_nodes[id];

                spin_lock(&vn->busy.lock);
                va = __find_vmap_area(addr, &vn->busy.root);
                if (va)
                        unlink_va(va, &vn->busy.root);
                spin_unlock(&vn->busy.lock);

                if (va)
                        return va;

                id = (id + nr_vmap_nodes - 1) % nr_vmap_nodes;
        } while (id != first_id);

        return NULL;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE                (VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE                (128UL*1024*1024)
#else
#define VMALLOC_SPACE                (128UL*1024*1024*1024)
#endif

/* The guessed vmalloc space expressed in pages. */
#define VMALLOC_PAGES                (VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC                BITS_PER_LONG        /* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX        1024        /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN        (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)                ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)                ((x) > (y) ? (x) : (y)) /* can't use max() */
/*
 * Number of pages (bitmap bits) in one vmap block: the per-CPU share of
 * VMALLOC_PAGES divided by 16, limited to the range
 * [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX].
 */
#define VMAP_BBMAP_BITS                \
                VMAP_MIN(VMAP_BBMAP_BITS_MAX,        \
                VMAP_MAX(VMAP_BBMAP_BITS_MIN,        \
                        VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

/* Size of one vmap block in bytes. */
#define VMAP_BLOCK_SIZE                (VMAP_BBMAP_BITS * PAGE_SIZE)

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD        (VMAP_BBMAP_BITS / 4)

#define VMAP_RAM                0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK                0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK                0x3 /* VMAP_RAM | VMAP_BLOCK */

/*
 * Per-CPU bookkeeping for vmap blocks (see the per-CPU variable
 * vmap_block_queue). The same per-CPU array is also indexed by
 * addr_to_vb_xa() as a hash table of vmap blocks.
 */
struct vmap_block_queue {
        /* NOTE(review): presumably protects the "free" list; confirm */
        spinlock_t lock;
        /* NOTE(review): presumably blocks with free space; confirm */
        struct list_head free;

        /*
         * An xarray requires an extra memory dynamically to
         * be allocated. If it is an issue, we can use rb-tree
         * instead.
         */
        struct xarray vmap_blocks;
};

/*
 * One vmap block: a vmap area that is handed out in page-sized pieces,
 * tracked by a bitmap of VMAP_BBMAP_BITS bits.
 *
 * NOTE(review): the field notes below are inferred from names and from
 * VMAP_PURGE_THRESHOLD's comment; the code using them is outside this
 * chunk, so confirm against the users.
 */
struct vmap_block {
        spinlock_t lock;
        /* the vmap area backing this block */
        struct vmap_area *va;
        /* presumably counts of free and dirty pages in the block */
        unsigned long free, dirty;
        /* one bit per page of the block */
        DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
        unsigned long dirty_min, dirty_max; /*< dirty range */
        /* presumably links the block into vmap_block_queue.free */
        struct list_head free_list;
        struct rcu_head rcu_head;
        struct list_head purge;
        unsigned int cpu;
};

/*
 * Queue of free and dirty vmap blocks, for allocation and flushing purposes.
 * addr_to_vb_xa() additionally indexes this per-CPU array by a hash of
 * the address to reach the vmap_blocks xarray.
 */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * In order to have fast access to any "vmap_block" associated with a
 * specific address, we use a hash.
 *
 * A per-cpu vmap_block_queue is used in two ways: it serializes
 * access to the free block chains among CPUs (alloc path), and it
 * also acts as a vmap_block hash (alloc/free paths). This means we
 * overload it, since we already have the per-cpu array, which is
 * used as a hash table. When used as a hash, the 'cpu' passed to
 * per_cpu() is not actually a CPU but rather a hash index.
 *
 * A hash function is addr_to_vb_xa() which hashes any address
 * to a specific index(in a hash) it belongs to. This then uses a
 * per_cpu() macro to access an array with generated index.
 *
 * An example:
 *
 *  CPU_1  CPU_2  CPU_0
 *    |      |      |
 *    V      V      V
 * 0     10     20     30     40     50     60
 * |------|------|------|------|------|------|...<vmap address space>
 *   CPU0   CPU1   CPU2   CPU0   CPU1   CPU2
 *
 * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
 *   it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
 *
 * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
 *   it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
 *
 * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
 *   it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
 *
 * This technique almost always avoids lock contention on insert/remove,
 * however xarray spinlocks protect against any contention that remains.
 */
static struct xarray *
addr_to_vb_xa(unsigned long addr)
{
        /* Block number of @addr, folded into the range of CPU ids. */
        int slot = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;

        /*
         * The slot may name a CPU that is not possible; move on to the
         * next possible one in that case. Since nr_cpu_ids - 1 is the
         * highest possible CPU, cpumask_next() is never invoked for the
         * last slot.
         */
        if (!cpu_possible(slot))
                slot = cpumask_next(slot, cpu_possible_mask);

        return &per_cpu(vmap_block_queue, slot).vmap_blocks;
}

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

static unsigned long addr_to_vb_idx(unsigned long addr)
{
        /* VMALLOC_START rounded down to a vmap block boundary. */
        unsigned long base = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);

        /* Number of whole blocks between that boundary and @addr. */
        return (addr - base) / VMAP_BLOCK_SIZE;
}

static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
        unsigned long vaddr = va_start + (pages_off << PAGE_SHIFT);

        /* The result must lie in the same vmap block as @va_start. */
        BUG_ON(addr_to_vb_idx(va_start) != addr_to_vb_idx(vaddr));

        return (void *)vaddr;
}

/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * The block is fully initialised, then inserted into the xarray selected by
 * addr_to_vb_xa() and finally appended to the free list of the queue that
 * belongs to the CPU recorded in the block.
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        struct vmap_area *va;
        struct xarray *xa;
        unsigned long vb_idx;
        int node, err;
        void *vaddr;

        node = numa_node_id();

        /* The descriptor only needs the reclaim-related bits of @gfp_mask. */
        vb = kmalloc_node(sizeof(struct vmap_block),
                        gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vb))
                return ERR_PTR(-ENOMEM);

        /* Reserve a block-sized, block-aligned piece of vmalloc space. */
        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        node, gfp_mask,
                                        VMAP_RAM|VMAP_BLOCK, NULL);
        if (IS_ERR(va)) {
                kfree(vb);
                return ERR_CAST(va);
        }

        /* The caller's pages sit at the very start of the new block. */
        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
        /* At least something should be left free */
        BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
        bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
        vb->free = VMAP_BBMAP_BITS - (1UL << order);
        vb->dirty = 0;
        /* min above max: the dirty range starts out empty. */
        vb->dirty_min = VMAP_BBMAP_BITS;
        vb->dirty_max = 0;
        bitmap_set(vb->used_map, 0, (1UL << order));
        INIT_LIST_HEAD(&vb->free_list);
        vb->cpu = raw_smp_processor_id();

        /* Make the block findable by address. */
        xa = addr_to_vb_xa(va->va_start);
        vb_idx = addr_to_vb_idx(va->va_start);
        err = xa_insert(xa, vb_idx, vb, gfp_mask);
        if (err) {
                kfree(vb);
                free_vmap_area(va);
                return ERR_PTR(err);
        }
        /*
         * list_add_tail_rcu() may run on a different core than
         * vb->cpu due to task migration. That is safe, because
         * list_add_tail_rcu() together with list_for_each_rcu on
         * the read side keeps the list consistent.
         */
        vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
        spin_lock(&vbq->lock);
        list_add_tail_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);

        return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
        struct vmap_node *vn;
        struct vmap_block *tmp;
        struct xarray *xa;

        xa = addr_to_vb_xa(vb->va->va_start);
        tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
        BUG_ON(tmp != vb);

        vn = addr_to_node(vb->va->va_start);
        spin_lock(&vn->busy.lock);
        unlink_va(vb->va, &vn->busy.root);
        spin_unlock(&vn->busy.lock);

        free_vmap_area_noflush(vb->va);
        kfree_rcu(vb, rcu_head);
}

/*
 * Move @vb to @purge_list if it qualifies for purging.
 *
 * A block qualifies when none of its pages is in use (free + dirty covers
 * the whole block) and it is not already completely dirty. Unless
 * @force_purge is set, the block must additionally have fewer than
 * VMAP_PURGE_THRESHOLD free pages.
 *
 * Both callers in this file invoke it with vb->lock held.
 *
 * Return: true if the block was queued on @purge_list, false otherwise.
 */
static bool purge_fragmented_block(struct vmap_block *vb,
                struct list_head *purge_list, bool force_purge)
{
        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);

        if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
            vb->dirty == VMAP_BBMAP_BITS)
                return false;

        /* Don't overeagerly purge usable blocks unless requested */
        if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
                return false;

        /* prevent further allocs after releasing lock */
        WRITE_ONCE(vb->free, 0);
        /* prevent purging it again */
        WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
        /* The whole block now counts as dirty. */
        vb->dirty_min = 0;
        vb->dirty_max = VMAP_BBMAP_BITS;
        /* Take the block off the free list of its queue. */
        spin_lock(&vbq->lock);
        list_del_rcu(&vb->free_list);
        spin_unlock(&vbq->lock);
        list_add_tail(&vb->purge, purge_list);
        return true;
}

static void free_purged_blocks(struct list_head *purge_list)
{
        struct vmap_block *vb, *n_vb;

        list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
                list_del(&vb->purge);
                free_vmap_block(vb);
        }
}

/*
 * Purge every purgeable block found on the free list of the queue that
 * belongs to @cpu. Blocks are collected on a local list while walking the
 * queue under RCU and are released once the walk is finished.
 */
static void purge_fragmented_blocks(int cpu)
{
        LIST_HEAD(purge);
        struct vmap_block *vb;
        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

        rcu_read_lock();
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long free = READ_ONCE(vb->free);
                unsigned long dirty = READ_ONCE(vb->dirty);

                /*
                 * Lockless pre-check so that vb->lock is only taken for
                 * candidates; purge_fragmented_block() repeats the test
                 * under the lock.
                 */
                if (free + dirty != VMAP_BBMAP_BITS ||
                    dirty == VMAP_BBMAP_BITS)
                        continue;

                spin_lock(&vb->lock);
                purge_fragmented_block(vb, &purge, true);
                spin_unlock(&vb->lock);
        }
        rcu_read_unlock();
        free_purged_blocks(&purge);
}

static void purge_fragmented_blocks_allcpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                purge_fragmented_blocks(cpu);
}

/*
 * vb_alloc - hand out @size bytes of address space from a vmap block
 * @size:     number of bytes, a non-zero multiple of PAGE_SIZE not larger
 *            than PAGE_SIZE * VMAP_MAX_ALLOC
 * @gfp_mask: allocation flags, used only when a new block has to be created
 *
 * The free list of the current CPU's queue is searched for a block with
 * enough free pages. If none is found a new block is allocated.
 *
 * Return: the start address of the range, or an ERR_PTR() value on failure.
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        void *vaddr = NULL;
        unsigned int order;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
        if (WARN_ON(size == 0)) {
                /*
                 * Allocating 0 bytes isn't what caller wants since
                 * get_order(0) returns funny result. Just warn and terminate
                 * early.
                 */
                return ERR_PTR(-EINVAL);
        }
        order = get_order(size);

        rcu_read_lock();
        vbq = raw_cpu_ptr(&vmap_block_queue);
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long pages_off;

                /* Cheap lockless check first; repeated below under the lock. */
                if (READ_ONCE(vb->free) < (1UL << order))
                        continue;

                spin_lock(&vb->lock);
                if (vb->free < (1UL << order)) {
                        spin_unlock(&vb->lock);
                        continue;
                }

                /* The new range starts right where the free pages begin. */
                pages_off = VMAP_BBMAP_BITS - vb->free;
                vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
                WRITE_ONCE(vb->free, vb->free - (1UL << order));
                bitmap_set(vb->used_map, pages_off, (1UL << order));
                /* A block without free pages no longer belongs on the free list. */
                if (vb->free == 0) {
                        spin_lock(&vbq->lock);
                        list_del_rcu(&vb->free_list);
                        spin_unlock(&vbq->lock);
                }

                spin_unlock(&vb->lock);
                break;
        }

        rcu_read_unlock();

        /* Allocate new block if nothing was found */
        if (!vaddr)
                vaddr = new_vmap_block(order, gfp_mask);

        return vaddr;
}

/*
 * vb_free - give a range handed out by vb_alloc() back to its vmap block
 * @addr: start address of the range
 * @size: size of the range in bytes; a multiple of PAGE_SIZE that does not
 *        exceed PAGE_SIZE * VMAP_MAX_ALLOC
 *
 * The range is unmapped without a TLB flush (unless debug_pagealloc is
 * enabled) and accounted as dirty in the block. Once every page of the
 * block is dirty the block itself is freed.
 *
 * If no vmap block is registered for @addr (for example because of a bogus
 * address or a double free) a warning is emitted once and nothing is touched.
 */
static void vb_free(unsigned long addr, unsigned long size)
{
        unsigned long offset;
        unsigned int order;
        struct vmap_block *vb;
        struct xarray *xa;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

        /*
         * Look the block up before doing anything else. xa_load() yields
         * NULL when nothing is stored at the index; bail out with a warning
         * instead of dereferencing it, like vm_unmap_ram() does when it
         * cannot find a vmap area.
         */
        xa = addr_to_vb_xa(addr);
        vb = xa_load(xa, addr_to_vb_idx(addr));
        if (WARN_ON_ONCE(!vb))
                return;

        flush_cache_vunmap(addr, addr + size);

        order = get_order(size);
        /* Page offset of @addr inside its block. */
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;

        spin_lock(&vb->lock);
        bitmap_clear(vb->used_map, offset, (1UL << order));
        spin_unlock(&vb->lock);

        vunmap_range_noflush(addr, addr + size);

        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(addr, addr + size);

        spin_lock(&vb->lock);

        /* Expand the not yet TLB flushed dirty range */
        vb->dirty_min = min(vb->dirty_min, offset);
        vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

        WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
        if (vb->dirty == VMAP_BBMAP_BITS) {
                /* Every page is dirty, so none can still be free. */
                BUG_ON(vb->free);
                spin_unlock(&vb->lock);
                free_vmap_block(vb);
        } else
                spin_unlock(&vb->lock);
}

/*
 * Walk the vmap blocks of every possible CPU, purge the fragmented ones and
 * collect the dirty ranges of the remaining ones, then purge lazily freed
 * vmap areas and flush the TLB if required.
 *
 * @start: lower bound of a range the caller wants flushed in any case
 * @end:   upper bound of that range
 * @flush: non-zero if [@start, @end) has to be flushed even when no dirty
 *         block range is found
 *
 * The range is widened to cover every dirty block range encountered. The
 * whole operation runs under vmap_purge_lock.
 */
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
        LIST_HEAD(purge_list);
        int cpu;

        if (unlikely(!vmap_initialized))
                return;

        mutex_lock(&vmap_purge_lock);

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vmap_block *vb;
                unsigned long idx;

                rcu_read_lock();
                xa_for_each(&vbq->vmap_blocks, idx, vb) {
                        spin_lock(&vb->lock);

                        /*
                         * Try to purge a fragmented block first. If it's
                         * not purgeable, check whether there is dirty
                         * space to be flushed.
                         */
                        if (!purge_fragmented_block(vb, &purge_list, false) &&
                            vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
                                unsigned long va_start = vb->va->va_start;
                                unsigned long s, e;

                                /* Dirty range of this block as addresses. */
                                s = va_start + (vb->dirty_min << PAGE_SHIFT);
                                e = va_start + (vb->dirty_max << PAGE_SHIFT);

                                start = min(s, start);
                                end   = max(e, end);

                                /* Prevent that this is flushed again */
                                vb->dirty_min = VMAP_BBMAP_BITS;
                                vb->dirty_max = 0;

                                flush = 1;
                        }
                        spin_unlock(&vb->lock);
                }
                rcu_read_unlock();
        }
        free_purged_blocks(&purge_list);

        /*
         * Flush here only when __purge_vmap_area_lazy() returns zero;
         * presumably a non-zero return means it has flushed already
         * (its body is not visible here, confirm).
         */
        if (!__purge_vmap_area_lazy(start, end, false) && flush)
                flush_tlb_kernel_range(start, end);
        mutex_unlock(&vmap_purge_lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address by the vmap layer and so there might be some CPUs with TLB entries
 * still referencing that page (additional to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
        /*
         * Start from an empty range (start above end) with no forced flush;
         * _vm_unmap_aliases() widens it to whatever is found to be dirty.
         */
        _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
        unsigned long addr = (unsigned long)kasan_reset_tag(mem);
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
        struct vmap_area *va;

        might_sleep();
        BUG_ON(!addr);
        BUG_ON(addr < VMALLOC_START);
        BUG_ON(addr > VMALLOC_END);
        BUG_ON(!PAGE_ALIGNED(addr));

        kasan_poison_vmalloc(mem, size);

        /* Mappings above VMAP_MAX_ALLOC pages own a vmap area of their own. */
        if (unlikely(count > VMAP_MAX_ALLOC)) {
                va = find_unlink_vmap_area(addr);
                if (WARN_ON_ONCE(!va))
                        return;

                debug_check_no_locks_freed((void *)va->va_start, va_size(va));
                free_unmap_vmap_area(va);
                return;
        }

        /* Everything else was carved out of a vmap block. */
        debug_check_no_locks_freed(mem, size);
        vb_free(addr, size);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 *
 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
 * faster than vmap so it's good.  But if you mix long-life and short-life
 * objects with vm_map_ram(), it could consume lots of address space through
 * fragmentation (especially on a 32bit machine).  You could see failures in
 * the end.  Please use this function for short-lived objects.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
        unsigned long addr;
        void *mem;

        if (unlikely(count > VMAP_MAX_ALLOC)) {
                /* Too large for a vmap block: reserve a dedicated area. */
                struct vmap_area *va;

                va = alloc_vmap_area(size, PAGE_SIZE,
                                VMALLOC_START, VMALLOC_END,
                                node, GFP_KERNEL, VMAP_RAM,
                                NULL);
                if (IS_ERR(va))
                        return NULL;

                addr = va->va_start;
                mem = (void *)addr;
        } else {
                /* Small request: take the address range from a vmap block. */
                mem = vb_alloc(size, GFP_KERNEL);
                if (IS_ERR(mem))
                        return NULL;
                addr = (unsigned long)mem;
        }

        /* Undo the address space reservation if the pages cannot be mapped. */
        if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
                                pages, PAGE_SHIFT) < 0) {
                vm_unmap_ram(mem, count);
                return NULL;
        }

        /*
         * The pages are mapped now, so mark them as accessible.
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        return kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
}
EXPORT_SYMBOL(vm_map_ram);

/*
 * Head of the singly linked list (via ->next) of vm areas registered by
 * vm_area_add_early() and vm_area_register_early(). Init-only data.
 */
static struct vm_struct *vmlist __initdata;

/*
 * Return the page order recorded in @vm. Without
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC the order is always 0.
 */
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        return vm->page_order;
#else
        return 0;
#endif
}

/* Non-static accessor for the page order of @vm, see vm_area_page_order(). */
unsigned int get_vm_area_page_order(struct vm_struct *vm)
{
        return vm_area_page_order(vm);
}

/*
 * Record @order as the page order of @vm. Without
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC nothing is stored and any order other
 * than 0 is a bug.
 */
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        vm->page_order = order;
#else
        BUG_ON(order != 0);
#endif
}

/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * This function is used to add fixed kernel vm area to vmlist before
 * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
 * should contain proper values and the other fields should be zero.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
        struct vm_struct **link = &vmlist;
        struct vm_struct *cur;

        BUG_ON(vmap_initialized);

        /*
         * Find the first entry that does not start below @vm. Entries
         * passed on the way must end at or before @vm, and the entry we
         * stop at must begin at or after the end of @vm.
         */
        while ((cur = *link) != NULL) {
                if (cur->addr >= vm->addr) {
                        BUG_ON(cur->addr < vm->addr + vm->size);
                        break;
                }
                BUG_ON(cur->addr + cur->size > vm->addr);
                link = &cur->next;
        }

        /* Splice @vm in front of the entry found (or at the tail). */
        vm->next = *link;
        *link = vm;
}

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * This function is used to register kernel vm area before
 * vmalloc_init() is called.  @vm->size and @vm->flags should contain
 * proper values on entry and other fields should be zero.  On return,
 * vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
        unsigned long addr = ALIGN(VMALLOC_START, align);
        struct vm_struct **link = &vmlist;
        struct vm_struct *cur;

        BUG_ON(vmap_initialized);

        /*
         * Walk the list; stop at the first entry that leaves at least
         * vm->size bytes between the candidate address and its start,
         * otherwise retry right behind that entry.
         */
        while ((cur = *link) != NULL) {
                if ((unsigned long)cur->addr - addr >= vm->size)
                        break;
                addr = ALIGN((unsigned long)cur->addr + cur->size, align);
                link = &cur->next;
        }

        /* The area must still fit below VMALLOC_END. */
        BUG_ON(addr > VMALLOC_END - vm->size);
        vm->addr = (void *)addr;
        vm->next = *link;
        *link = vm;
        kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}

/*
 * Clear VM_UNINITIALIZED in @vm->flags, after a write barrier that orders
 * the earlier stores to @vm before the flag update.
 */
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
        /*
         * Before removing VM_UNINITIALIZED,
         * we should make sure that vm has proper values.
         * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
         */
        smp_wmb();
        vm->flags &= ~VM_UNINITIALIZED;
}

/*
 * Allocate a vm_struct and reserve kernel virtual address space for it.
 *
 * @size:     requested size in bytes; rounded up to a multiple of 1 << @shift
 * @align:    requested alignment; overridden for VM_IOREMAP areas
 * @shift:    log2 of the granularity @size is rounded up to
 * @flags:    VM_* flags stored in the descriptor
 * @start:    lower bound of the address range to allocate from
 * @end:      upper bound of the address range to allocate from
 * @node:     node used for the descriptor and passed on to alloc_vmap_area()
 * @gfp_mask: allocation flags
 * @caller:   stored in the descriptor as ->caller
 *
 * Must not be called from interrupt context.
 *
 * Return: the new descriptor, or NULL if the rounded size is zero or an
 * allocation failed.
 */
struct vm_struct *__get_vm_area_node(unsigned long size,
                unsigned long align, unsigned long shift, unsigned long flags,
                unsigned long start, unsigned long end, int node,
                gfp_t gfp_mask, const void *caller)
{
        struct vmap_area *va;
        struct vm_struct *area;
        /* Remember the size before rounding and before adding the guard page. */
        unsigned long requested_size = size;

        BUG_ON(in_interrupt());
        size = ALIGN(size, 1ul << shift);
        if (unlikely(!size))
                return NULL;

        /*
         * I/O mappings are aligned to a power of two derived from the size,
         * limited to the range PAGE_SHIFT..IOREMAP_MAX_ORDER.
         */
        if (flags & VM_IOREMAP)
                align = 1ul << clamp_t(int, get_count_order_long(size),
                                       PAGE_SHIFT, IOREMAP_MAX_ORDER);

        area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!area))
                return NULL;

        /* Reserve one extra page as guard unless the caller opted out. */
        if (!(flags & VM_NO_GUARD))
                size += PAGE_SIZE;

        area->flags = flags;
        area->caller = caller;
        area->requested_size = requested_size;

        /*
         * NOTE(review): area->addr is read below but not assigned in this
         * function; presumably alloc_vmap_area() fills it in for the
         * descriptor passed as its last argument - confirm.
         */
        va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
        if (IS_ERR(va)) {
                kfree(area);
                return NULL;
        }

        /*
         * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
         * best-effort approach, as they can be mapped outside of vmalloc code.
         * For VM_ALLOC mappings, the pages are marked as accessible after
         * getting mapped in __vmalloc_node_range().
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        if (!(flags & VM_ALLOC))
                area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
                                                    KASAN_VMALLOC_PROT_NORMAL);

        return area;
}

/*
 * Reserve a vm area of @size bytes inside [@start, @end) on behalf of
 * @caller, with page granularity, no node preference and GFP_KERNEL.
 *
 * Return: the area descriptor on success or NULL on failure.
 */
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
                                       unsigned long start, unsigned long end,
                                       const void *caller)
{
        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size:         size of the area
 * @flags:         %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search for an area of @size in the kernel virtual mapping area
 * (VMALLOC_START to VMALLOC_END) and reserve it for our purposes.
 * The direct caller of this function is recorded as the owner.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
                                  VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL,
                                  __builtin_return_address(0));
}

/*
 * Same reservation as get_vm_area(), but the owner recorded in the
 * descriptor is the explicitly supplied @caller.
 *
 * Return: the area descriptor on success or NULL on failure.
 */
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                const void *caller)
{
        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
                                  VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * find_vm_area - find a continuous kernel virtual area
 * @addr:          base address
 *
 * Search for the kernel VM area starting at @addr, and return it.
 * It is up to the caller to do all required locking to keep the returned
 * pointer valid.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *find_vm_area(const void *addr)
{
        struct vmap_area *va = find_vmap_area((unsigned long)addr);

        /* Without a vmap area at @addr there is no descriptor to return. */
        return va ? va->vm : NULL;
}

/**
 * remove_vm_area - find and remove a continuous kernel virtual area
 * @addr:            base address
 *
 * Search for the kernel VM area starting at @addr, and remove it.
 * This function returns the found VM area, but using it is NOT safe
 * on SMP machines, except for its size or flags.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *remove_vm_area(const void *addr)
{
        struct vmap_area *va;
        struct vm_struct *vm;

        might_sleep();

        /* Only page aligned addresses can be the start of an area. */
        if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
                        addr))
                return NULL;

        /* The lookup also unlinks the area, so nobody else can find it. */
        va = find_unlink_vmap_area((unsigned long)addr);
        if (!va || !va->vm)
                return NULL;
        vm = va->vm;

        /* Debug and KASAN bookkeeping for the range that goes away. */
        debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
        debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
        kasan_free_module_shadow(vm);
        kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));

        /* The descriptor itself is not freed here; it goes to the caller. */
        free_unmap_vmap_area(va);
        return vm;
}

static inline void set_area_direct_map(const struct vm_struct *area,
                                       int (*set_direct_map)(struct page *page))
{
        int idx;

        /* HUGE_VMALLOC passes small pages to set_direct_map */
        for (idx = 0; idx < area->nr_pages; idx++) {
                struct page *page = area->pages[idx];

                /* Pages for which page_address() yields nothing are skipped. */
                if (!page_address(page))
                        continue;

                set_direct_map(page);
        }
}

/*
 * Flush the vm mapping and reset the direct map.
 *
 * Computes the span of direct-map addresses that back the pages of @area,
 * invalidates their direct map entries, flushes through
 * _vm_unmap_aliases() and finally restores the default direct map
 * permissions.
 */
static void vm_reset_perms(struct vm_struct *area)
{
        /* Empty range to begin with: start above end. */
        unsigned long start = ULONG_MAX, end = 0;
        unsigned int page_order = vm_area_page_order(area);
        int flush_dmap = 0;
        int i;

        /*
         * Find the start and end range of the direct mappings to make sure that
         * the vm_unmap_aliases() flush includes the direct map.
         */
        for (i = 0; i < area->nr_pages; i += 1U << page_order) {
                unsigned long addr = (unsigned long)page_address(area->pages[i]);

                /* Only pages that have a direct-map address contribute. */
                if (addr) {
                        unsigned long page_size;

                        page_size = PAGE_SIZE << page_order;
                        start = min(addr, start);
                        end = max(addr + page_size, end);
                        flush_dmap = 1;
                }
        }

        /*
         * Set direct map to something invalid so that it won't be cached if
         * there are any accesses after the TLB flush, then flush the TLB and
         * reset the direct map permissions to the default.
         */
        set_area_direct_map(area, set_direct_map_invalid_noflush);
        _vm_unmap_aliases(start, end, flush_dmap);
        set_area_direct_map(area, set_direct_map_default_noflush);
}

static void delayed_vfree_work(struct work_struct *w)
{
        struct llist_node *pending, *node, *next;
        struct vfree_deferred *deferred;

        deferred = container_of(w, struct vfree_deferred, wq);

        /* Detach the whole batch at once, then free each queued address. */
        pending = llist_del_all(&deferred->list);
        llist_for_each_safe(node, next, pending)
                vfree(node);
}

/**
 * vfree_atomic - release memory allocated by vmalloc()
 * @addr:          memory base address
 *
 * This one is just like vfree() but can be called in any atomic context
 * except NMIs.
 */
void vfree_atomic(const void *addr)
{
        struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);

        BUG_ON(in_nmi());
        kmemleak_free(addr);

        /*
         * Use raw_cpu_ptr() because this can be called from preemptible
         * context. Preemption is absolutely fine here, because the llist_add()
         * implementation is lockless, so it works even if we are adding to
         * another cpu's list. schedule_work() should be fine with this too.
         *
         * The memory being freed is itself used as the list node, and a
         * NULL @addr is ignored. The work is scheduled only when
         * llist_add() returns true.
         */
        if (addr && llist_add((struct llist_node *)addr, &p->list))
                schedule_work(&p->wq);
}

/**
 * vfree - Release memory allocated by vmalloc()
 * @addr:  Memory base address
 *
 * Free the virtually continuous memory area starting at @addr, as obtained
 * from one of the vmalloc() family of APIs.  This will usually also free the
 * physical memory underlying the virtual allocation, but that memory is
 * reference counted, so it will not be freed until the last user goes away.
 *
 * If @addr is NULL, no operation is performed.
 *
 * Context:
 * May sleep if called *not* from interrupt context.
 * Must not be called in NMI context (strictly speaking, it could be
 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 * conventions for vfree() arch-dependent would be a really bad idea).
 */
void vfree(const void *addr)
{
        struct vm_struct *vm;
        int i;

        /* In interrupt context the actual freeing is deferred. */
        if (unlikely(in_interrupt())) {
                vfree_atomic(addr);
                return;
        }

        BUG_ON(in_nmi());
        kmemleak_free(addr);
        might_sleep();

        if (!addr)
                return;

        /* Unlinks and unmaps the area; the descriptor is ours from here on. */
        vm = remove_vm_area(addr);
        if (unlikely(!vm)) {
                WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                                addr);
                return;
        }

        if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
                vm_reset_perms(vm);
        /* All pages of vm should be charged to same memcg, so use first one. */
        if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
                mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
        for (i = 0; i < vm->nr_pages; i++) {
                struct page *page = vm->pages[i];

                BUG_ON(!page);
                /*
                 * High-order allocs for huge vmallocs are split, so
                 * can be freed as an array of order-0 allocations
                 */
                __free_page(page);
                /* Give other tasks a chance while freeing large areas. */
                cond_resched();
        }
        /* VM_MAP_PUT_PAGES areas are not counted in nr_vmalloc_pages here. */
        if (!(vm->flags & VM_MAP_PUT_PAGES))
                atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
        /* Release the page pointer array and then the descriptor. */
        kvfree(vm->pages);
        kfree(vm);
}
EXPORT_SYMBOL(vfree);

/**
 * vunmap - release virtual mapping obtained by vmap()
 * @addr:   memory base address
 *
 * Free the virtually contiguous memory area starting at @addr,
 * which was created from the page array passed to vmap().
 *
 * Must not be called in interrupt context.
 */
void vunmap(const void *addr)
{
        struct vm_struct *vm;

        BUG_ON(in_interrupt());
        might_sleep();

        /* A NULL address is silently ignored. */
        if (!addr)
                return;
        vm = remove_vm_area(addr);
        if (unlikely(!vm)) {
                WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
                                addr);
                return;
        }
        /* Only the descriptor is released here; the pages are not touched. */
        kfree(vm);
}
EXPORT_SYMBOL(vunmap);

/**
 * vmap - map an array of pages into virtually contiguous space
 * @pages: array of page pointers
 * @count: number of pages to map
 * @flags: vm_area->flags
 * @prot: page protection for the mapping
 *
 * Maps @count pages from @pages into contiguous kernel virtual space.
 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
 * (which must be kmalloc or vmalloc memory) and one reference per pages in it
 * are transferred from the caller to vmap(), and will be freed / dropped when
 * vfree() is called on the return value.
 *
 * Return: the address of the area or %NULL on failure
 */
void *vmap(struct page **pages, unsigned int count,
           unsigned long flags, pgprot_t prot)
{
        struct vm_struct *area;
        unsigned long addr;
        unsigned long size;                /* In bytes */

        might_sleep();

        /* VM_FLUSH_RESET_PERMS is not accepted for vmap() areas. */
        if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
                return NULL;

        /*
         * Your top guard is someone else's bottom guard. Not having a top
         * guard compromises someone else's mappings too.
         */
        if (WARN_ON_ONCE(flags & VM_NO_GUARD))
                flags &= ~VM_NO_GUARD;

        /* Refuse requests for more pages than totalram_pages() reports. */
        if (count > totalram_pages())
                return NULL;

        size = (unsigned long)count << PAGE_SHIFT;
        area = get_vm_area_caller(size, flags, __builtin_return_address(0));
        if (!area)
                return NULL;

        addr = (unsigned long)area->addr;
        /* The mapping is established with pgprot_nx() applied to @prot. */
        if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
                                pages, PAGE_SHIFT) < 0) {
                vunmap(area->addr);
                return NULL;
        }

        /*
         * With VM_MAP_PUT_PAGES the page array is recorded in the area so
         * that vfree() can find and release it later.
         */
        if (flags & VM_MAP_PUT_PAGES) {
                area->pages = pages;
                area->nr_pages = count;
        }
        return area->addr;
}
EXPORT_SYMBOL(vmap);

#ifdef CONFIG_VMAP_PFN
/* State handed to vmap_pfn_apply() for each PTE visited by vmap_pfn(). */
struct vmap_pfn_data {
        /* Array of PFNs to map. */
        unsigned long        *pfns;
        /* Protection used for every PTE. */
        pgprot_t        prot;
        /* Index into @pfns of the next PFN to map. */
        unsigned int        idx;
};

/*
 * Callback passed to apply_to_page_range() by vmap_pfn(): install the next
 * PFN from the vmap_pfn_data behind @private at @pte.
 *
 * Return: 0 on success, -EINVAL (after a one-time warning) if pfn_valid()
 * is true for the PFN.
 */
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
        struct vmap_pfn_data *data = private;
        unsigned long pfn = data->pfns[data->idx];
        pte_t ptent;

        /* PFNs for which pfn_valid() is true are rejected. */
        if (WARN_ON_ONCE(pfn_valid(pfn)))
                return -EINVAL;

        ptent = pte_mkspecial(pfn_pte(pfn, data->prot));
        set_pte_at(&init_mm, addr, pte, ptent);

        /* Move on to the next PFN for the next invocation. */
        data->idx++;
        return 0;
}

/**
 * vmap_pfn - map an array of PFNs into virtually contiguous space
 * @pfns: array of PFNs
 * @count: number of pages to map
 * @prot: page protection for the mapping
 *
 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
 * the start address of the mapping.
 */
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
        struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
        unsigned long size = count * PAGE_SIZE;
        struct vm_struct *area;
        unsigned long start;

        /* Reserve address space, recording our caller as the owner. */
        area = get_vm_area_caller(size, VM_IOREMAP,
                        __builtin_return_address(0));
        if (!area)
                return NULL;

        /* Install one PTE per PFN; give the area back on any failure. */
        start = (unsigned long)area->addr;
        if (apply_to_page_range(&init_mm, start, size,
                        vmap_pfn_apply, &data)) {
                free_vm_area(area);
                return NULL;
        }

        flush_cache_vmap(start, start + size);

        return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */

/*
 * Fill @pages with up to @nr_pages page pointers, allocating at @order.
 *
 * Order-0 requests go through the bulk allocator first, in batches of at
 * most 100 pages. Any shortfall, and every higher-order request, is handled
 * by the one-allocation-at-a-time loop that follows. Higher-order pages are
 * split so that every PAGE_SIZE page gets its own slot in @pages.
 *
 * Returns the number of slots of @pages that were filled. This can be less
 * than @nr_pages when an allocation fails or a fatal signal is pending (the
 * signal check is skipped for __GFP_NOFAIL requests).
 */
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
                unsigned int order, unsigned int nr_pages, struct page **pages)
{
        unsigned int nr_allocated = 0;
        struct page *page;
        int i;

        /*
         * For order-0 pages we make use of bulk allocator, if
         * the page array is partly or not at all populated due
         * to fails, fallback to a single page allocator that is
         * more permissive.
         */
        if (!order) {
                while (nr_allocated < nr_pages) {
                        unsigned int nr, nr_pages_request;

                        /*
                         * A maximum allowed request is hard-coded and is 100
                         * pages per call. That is done in order to prevent a
                         * long preemption off scenario in the bulk-allocator
                         * so the range is [1:100].
                         */
                        nr_pages_request = min(100U, nr_pages - nr_allocated);

                        /*
                         * Memory allocation should consider mempolicy: we
                         * must not wrongly use the nearest node when
                         * nid == NUMA_NO_NODE, otherwise memory may be
                         * allocated on only one node although the mempolicy
                         * wants it interleaved.
                         */
                        if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
                                nr = alloc_pages_bulk_mempolicy_noprof(gfp,
                                                        nr_pages_request,
                                                        pages + nr_allocated);
                        else
                                nr = alloc_pages_bulk_node_noprof(gfp, nid,
                                                        nr_pages_request,
                                                        pages + nr_allocated);

                        nr_allocated += nr;

                        /*
                         * If zero or pages were obtained partly,
                         * fallback to a single page allocator.
                         */
                        if (nr != nr_pages_request)
                                break;
                }
        }

        /* High-order pages or fallback path if "bulk" fails. */
        while (nr_allocated < nr_pages) {
                /* Give up early on a fatal signal unless failure is forbidden. */
                if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current))
                        break;

                if (nid == NUMA_NO_NODE)
                        page = alloc_pages_noprof(gfp, order);
                else
                        page = alloc_pages_node_noprof(nid, gfp, order);

                if (unlikely(!page))
                        break;

                /*
                 * High-order allocations must be able to be treated as
                 * independent small pages by callers (as they can with
                 * small-page vmallocs). Some drivers do their own refcounting
                 * on vmalloc_to_page() pages, some use page->mapping,
                 * page->lru, etc.
                 */
                if (order)
                        split_page(page, order);

                /*
                 * Careful, we allocate and map page-order pages, but
                 * tracking is done per PAGE_SIZE page so as to keep the
                 * vm_struct APIs independent of the physical/mapped size.
                 */
                for (i = 0; i < (1U << order); i++)
                        pages[nr_allocated + i] = page + i;

                nr_allocated += 1U << order;
        }

        return nr_allocated;
}

/*
 * Back @area with physical pages and map them.
 *
 * Allocates the area->pages pointer array (kmalloc when it fits in one page,
 * a nested vmalloc otherwise), fills it through vm_area_alloc_pages() and
 * maps the pages at area->addr with @prot in units of @page_shift.
 *
 * Returns area->addr on success. On failure returns NULL and @area has
 * already been released: with free_vm_area() when the page array could not
 * be allocated, with vfree() for every later failure.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 pgprot_t prot, unsigned int page_shift,
                                 int node)
{
        const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
        bool nofail = gfp_mask & __GFP_NOFAIL;
        unsigned long addr = (unsigned long)area->addr;
        unsigned long size = get_vm_area_size(area);
        unsigned long array_size;
        unsigned int nr_small_pages = size >> PAGE_SHIFT;
        unsigned int page_order;
        unsigned int flags;
        int ret;

        array_size = (unsigned long)nr_small_pages * sizeof(struct page *);

        /* Allow highmem pages unless the caller asked for a DMA zone. */
        if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
                gfp_mask |= __GFP_HIGHMEM;

        /* Please note that the recursion is strictly bounded. */
        if (array_size > PAGE_SIZE) {
                area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
                                        area->caller);
        } else {
                area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
        }

        if (!area->pages) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to allocated page array size %lu",
                        nr_small_pages * PAGE_SIZE, array_size);
                free_vm_area(area);
                return NULL;
        }

        set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
        page_order = vm_area_page_order(area);

        /*
         * High-order nofail allocations are really expensive and
         * potentially dangerous (pre-mature OOM, disruptive reclaim
         * and compaction etc.), so __GFP_NOFAIL is dropped for them.
         *
         * Please note, the __vmalloc_node_range_noprof() falls-back
         * to order-0 pages if high-order attempt is unsuccessful.
         */
        area->nr_pages = vm_area_alloc_pages((page_order ?
                gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN,
                node, page_order, nr_small_pages, area->pages);

        atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
        /* All pages of vm should be charged to same memcg, so use first one. */
        if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
                mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
                                     area->nr_pages);

        /*
         * If not enough pages were obtained to accomplish an
         * allocation request, free them via vfree() if any.
         */
        if (area->nr_pages != nr_small_pages) {
                /*
                 * vm_area_alloc_pages() can fail due to insufficient memory but
                 * also:-
                 *
                 * - a pending fatal signal
                 * - insufficient huge page-order pages
                 *
                 * Since we always retry allocations at order-0 in the huge page
                 * case a warning for either is spurious.
                 */
                if (!fatal_signal_pending(current) && page_order == 0)
                        warn_alloc(gfp_mask, NULL,
                                "vmalloc error: size %lu, failed to allocate pages",
                                area->nr_pages * PAGE_SIZE);
                goto fail;
        }

        /*
         * page tables allocations ignore external gfp mask, enforce it
         * by the scope API
         *
         * 'flags' is written here and read by the matching restore below
         * under exactly the same gfp_mask condition.
         */
        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                flags = memalloc_nofs_save();
        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                flags = memalloc_noio_save();

        /* For __GFP_NOFAIL keep retrying the mapping, sleeping between tries. */
        do {
                ret = vmap_pages_range(addr, addr + size, prot, area->pages,
                        page_shift);
                if (nofail && (ret < 0))
                        schedule_timeout_uninterruptible(1);
        } while (nofail && (ret < 0));

        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                memalloc_nofs_restore(flags);
        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                memalloc_noio_restore(flags);

        if (ret < 0) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to map pages",
                        area->nr_pages * PAGE_SIZE);
                goto fail;
        }

        return area->addr;

fail:
        vfree(area->addr);
        return NULL;
}

/**
 * __vmalloc_node_range - allocate virtually contiguous memory
 * @size:                  allocation size
 * @align:                  desired alignment
 * @start:                  vm area range start
 * @end:                  vm area range end
 * @gfp_mask:                  flags for the page level allocator
 * @prot:                  protection mask for the allocated pages
 * @vm_flags:                  additional vm area flags (e.g. %VM_NO_GUARD)
 * @node:                  node to use for allocation or NUMA_NO_NODE
 * @caller:                  caller's return address
 *
 * Allocate enough pages to cover @size from the page level
 * allocator with @gfp_mask flags. Please note that the full set of gfp
 * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
 * supported.
 * Zone modifiers are not supported. From the reclaim modifiers
 * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
 * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
 * __GFP_RETRY_MAYFAIL are not supported).
 *
 * __GFP_NOWARN can be used to suppress failure messages.
 *
 * Map them into contiguous kernel virtual space, using a pagetable
 * protection of @prot.
 *
 * When huge mappings are allowed (%VM_ALLOW_HUGE_VMAP) and the huge attempt
 * fails, the allocation is retried once with base pages and the caller's
 * original @align.
 *
 * Return: the address of the area or %NULL on failure
 */
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
                        unsigned long start, unsigned long end, gfp_t gfp_mask,
                        pgprot_t prot, unsigned long vm_flags, int node,
                        const void *caller)
{
        struct vm_struct *area;
        void *ret;
        kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
        unsigned long original_align = align;
        unsigned int shift = PAGE_SHIFT;

        if (WARN_ON_ONCE(!size))
                return NULL;

        /* Refuse requests that could never be satisfied by all of RAM. */
        if ((size >> PAGE_SHIFT) > totalram_pages()) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, exceeds total pages",
                        size);
                return NULL;
        }

        if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
                /*
                 * Try huge pages. Only try for PAGE_KERNEL allocations,
                 * others like modules don't yet expect huge pages in
                 * their allocations due to apply_to_page_range not
                 * supporting them.
                 */

                if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
                        shift = PMD_SHIFT;
                else
                        shift = arch_vmap_pte_supported_shift(size);

                align = max(original_align, 1UL << shift);
        }

        /*
         * Re-entered both for __GFP_NOFAIL retries of the vm_struct
         * allocation and for the base-page fallback taken from 'fail'.
         */
again:
        area = __get_vm_area_node(size, align, shift, VM_ALLOC |
                                  VM_UNINITIALIZED | vm_flags, start, end, node,
                                  gfp_mask, caller);
        if (!area) {
                bool nofail = gfp_mask & __GFP_NOFAIL;
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, vm_struct allocation failed%s",
                        size, (nofail) ? ". Retrying." : "");
                if (nofail) {
                        schedule_timeout_uninterruptible(1);
                        goto again;
                }
                goto fail;
        }

        /*
         * Prepare arguments for __vmalloc_area_node() and
         * kasan_unpoison_vmalloc().
         */
        if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
                if (kasan_hw_tags_enabled()) {
                        /*
                         * Modify protection bits to allow tagging.
                         * This must be done before mapping.
                         */
                        prot = arch_vmap_pgprot_tagged(prot);

                        /*
                         * Skip page_alloc poisoning and zeroing for physical
                         * pages backing VM_ALLOC mapping. Memory is instead
                         * poisoned and zeroed by kasan_unpoison_vmalloc().
                         */
                        gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
                }

                /* Take note that the mapping is PAGE_KERNEL. */
                kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
        }

        /*
         * Allocate physical pages and map them into vmalloc space.
         * On failure __vmalloc_area_node() has already released @area.
         */
        ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
        if (!ret)
                goto fail;

        /*
         * Mark the pages as accessible, now that they are mapped.
         * The condition for setting KASAN_VMALLOC_INIT should complement the
         * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
         * to make sure that memory is initialized under the same conditions.
         * Tag-based KASAN modes only assign tags to normal non-executable
         * allocations, see __kasan_unpoison_vmalloc().
         */
        kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
        if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
            (gfp_mask & __GFP_SKIP_ZERO))
                kasan_flags |= KASAN_VMALLOC_INIT;
        /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
        area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);

        /*
         * In this function, newly allocated vm_struct has VM_UNINITIALIZED
         * flag. It means that vm_struct is not fully initialized.
         * Now, it is fully initialized, so remove this flag here.
         */
        clear_vm_uninitialized_flag(area);

        if (!(vm_flags & VM_DEFER_KMEMLEAK))
                kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask);

        return area->addr;

fail:
        /*
         * A failed huge-mapping attempt gets exactly one more try with base
         * pages: resetting shift to PAGE_SHIFT makes this branch unreachable
         * the second time around.
         */
        if (shift > PAGE_SHIFT) {
                shift = PAGE_SHIFT;
                align = original_align;
                goto again;
        }

        return NULL;
}

/**
 * __vmalloc_node - allocate virtually contiguous memory
 * @size:            allocation size
 * @align:            desired alignment
 * @gfp_mask:            flags for the page level allocator
 * @node:            node to use for allocation or NUMA_NO_NODE
 * @caller:            caller's return address
 *
 * Allocate enough pages to cover @size from the page level allocator with
 * @gfp_mask flags and map them into contiguous kernel virtual space. The
 * mapping is placed anywhere in [VMALLOC_START, VMALLOC_END) with
 * PAGE_KERNEL protection and no additional vm area flags.
 *
 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY and __GFP_RETRY_MAYFAIL
 * are not supported; see __vmalloc_node_range_noprof() for the full set of
 * gfp constraints.
 *
 * Any use of gfp flags outside of GFP_KERNEL should be consulted
 * with mm people.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
                            gfp_t gfp_mask, int node, const void *caller)
{
        const unsigned long no_vm_flags = 0;

        return __vmalloc_node_range_noprof(size, align,
                                           VMALLOC_START, VMALLOC_END,
                                           gfp_mask, PAGE_KERNEL, no_vm_flags,
                                           node, caller);
}
/*
 * This is only for performance analysis of vmalloc and stress purpose.
 * It is required by vmalloc test module, therefore do not use it other
 * than that.
 */
#ifdef CONFIG_TEST_VMALLOC_MODULE
EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
#endif

/*
 * Allocate @size bytes of virtually contiguous memory with caller-chosen
 * @gfp_mask, on any node, recording the immediate caller as the owner.
 */
void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__vmalloc_noprof);

/**
 * vmalloc - allocate virtually contiguous memory
 * @size:    allocation size
 *
 * Allocate enough pages to cover @size from the page level allocator,
 * using GFP_KERNEL on any node, and map them into contiguous kernel
 * virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vmalloc_noprof);

/**
 * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
 * @size:      allocation size
 * @gfp_mask:  flags for the page level allocator
 * @node:            node to use for allocation or NUMA_NO_NODE
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The request carries %VM_ALLOW_HUGE_VMAP, so if @size is greater than
 * or equal to PMD_SIZE huge pages may be used for the memory.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_range_noprof(size, 1,
                                           VMALLOC_START, VMALLOC_END,
                                           gfp_mask, PAGE_KERNEL,
                                           VM_ALLOW_HUGE_VMAP, node, caller);
}
EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);

/**
 * vzalloc - allocate virtually contiguous memory with zero fill
 * @size:    allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_noprof(unsigned long size)
{
        return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_noprof);

/**
 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
 * @size: allocation size
 *
 * The resulting memory area is zeroed so it can be mapped to userspace
 * without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_user_noprof(unsigned long size)
{
        return __vmalloc_node_range_noprof(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
                                    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
                                    VM_USERMAP, NUMA_NO_NODE,
                                    __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user_noprof);

/**
 * vmalloc_node - allocate memory on a specific node
 * @size:          allocation size
 * @node:          numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator, using GFP_KERNEL and @node, and map them into contiguous
 * kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_node_noprof(unsigned long size, int node)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, caller);
}
EXPORT_SYMBOL(vmalloc_node_noprof);

/**
 * vzalloc_node - allocate memory on a specific node with zero fill
 * @size:        allocation size
 * @node:        numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_node_noprof(unsigned long size, int node)
{
        return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node_noprof);

/**
 * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
 * remain unchanged
 * @p: object to reallocate memory for
 * @size: the size to reallocate
 * @align: requested alignment
 * @flags: the flags for the page level allocator
 * @nid: node number of the target node
 *
 * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size
 * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
 *
 * If the caller wants the new memory to be on specific node *only*,
 * __GFP_THISNODE flag should be set, otherwise the function will try to avoid
 * reallocation and possibly disregard the specified @nid.
 *
 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
 * initial memory allocation, every subsequent call to this API for the same
 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
 * __GFP_ZERO is not fully honored by this API.
 *
 * Requesting an alignment that is bigger than the alignment of the existing
 * allocation will fail.
 *
 * In any case, the contents of the object pointed to are preserved up to the
 * lesser of the new and old sizes.
 *
 * This function must not be called concurrently with itself or vfree() for the
 * same memory allocation.
 *
 * Return: pointer to the allocated memory; %NULL if @size is zero or in case of
 *         failure. On failure with a non-zero @size, @p is left untouched.
 */
void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
                                 gfp_t flags, int nid)
{
        struct vm_struct *vm = NULL;
        size_t alloced_size = 0;
        size_t old_size = 0;
        void *n;

        if (!size) {
                vfree(p);
                return NULL;
        }

        if (p) {
                vm = find_vm_area(p);
                if (unlikely(!vm)) {
                        WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
                        return NULL;
                }

                alloced_size = get_vm_area_size(vm);
                old_size = vm->requested_size;
                if (WARN(alloced_size < old_size,
                         "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
                        return NULL;
                if (WARN(!IS_ALIGNED((unsigned long)p, align),
                         "will not reallocate with a bigger alignment (0x%lx)\n", align))
                        return NULL;
                /*
                 * The caller insists on @nid and the current backing lives
                 * elsewhere: in-place reuse is not an option, whatever the
                 * relation between @size and the old size.
                 */
                if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
                             nid != page_to_nid(vmalloc_to_page(p)))
                        goto need_realloc;
        }

        /*
         * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
         * would be a good heuristic for when to shrink the vm_area?
         */
        if (size <= old_size) {
                /* Zero out "freed" memory, potentially for future realloc. */
                if (want_init_on_free() || want_init_on_alloc(flags))
                        memset((void *)p + size, 0, old_size - size);
                vm->requested_size = size;
                kasan_poison_vmalloc(p + size, old_size - size);
                return (void *)p;
        }

        /*
         * We already have the bytes available in the allocation; use them.
         */
        if (size <= alloced_size) {
                kasan_unpoison_vmalloc(p + old_size, size - old_size,
                                       KASAN_VMALLOC_PROT_NORMAL);
                /*
                 * No need to zero memory here, as unused memory will have
                 * already been zeroed at initial allocation time or during
                 * realloc shrink time.
                 */
                vm->requested_size = size;
                return (void *)p;
        }

need_realloc:
        /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
        n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0));

        if (!n)
                return NULL;

        if (p) {
                /*
                 * The __GFP_THISNODE jump above bypasses the shrink check, so
                 * @size may be smaller than the old requested size here.
                 * Copy only the lesser of the two, otherwise the copy would
                 * run past the end of the new allocation.
                 */
                memcpy(n, p, old_size < size ? old_size : size);
                vfree(p);
        }

        return n;
}

/*
 * GFP_VMALLOC32: gfp mask used by vmalloc_32() and vmalloc_32_user() below.
 * Picks the DMA32 zone when it is configured on 64-bit, otherwise the DMA
 * zone on 64-bit, otherwise GFP_DMA32.
 */
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
/*
 * 64b systems should always have either DMA or DMA32 zones. For others
 * GFP_DMA32 should do the right thing and use the normal zone.
 */
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif

/**
 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
 * @size:        allocation size
 *
 * Allocate enough 32bit PA addressable pages to cover @size from the
 * page level allocator (using GFP_VMALLOC32) and map them into contiguous
 * kernel virtual space.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
                                     caller);
}
EXPORT_SYMBOL(vmalloc_32_noprof);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 * @size:             allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_user_noprof(unsigned long size)
{
        return __vmalloc_node_range_noprof(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
                                    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
                                    VM_USERMAP, NUMA_NO_NODE,
                                    __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user_noprof);

/*
 * Write up to @count zero bytes into @iter, at most one page per step, by
 * copying from the zero page with copy_page_to_iter_nofault().
 *
 * Stops at the first short copy.
 *
 * Returns the number of zeroed bytes.
 */
static size_t zero_iter(struct iov_iter *iter, size_t count)
{
        size_t done = 0;

        while (done < count) {
                size_t chunk = min_t(size_t, count - done, PAGE_SIZE);
                size_t copied;

                copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, chunk, iter);
                done += copied;

                if (copied < chunk)
                        break;
        }

        return done;
}

/*
 * Small helper: copy @count bytes starting at vmalloc address @addr into
 * @iter, one page (or page fragment) at a time. Pages that are not present
 * are replaced by zeroes.
 *
 * Stops at the first short copy.
 *
 * Returns the number of copied bytes.
 */
static size_t aligned_vread_iter(struct iov_iter *iter,
                                 const char *addr, size_t count)
{
        size_t done = 0;

        while (done < count) {
                unsigned long offset = offset_in_page(addr);
                unsigned long length = PAGE_SIZE - offset;
                struct page *page;
                size_t copied;

                /* Never read past the end of the request. */
                if (length > count - done)
                        length = count - done;

                page = vmalloc_to_page(addr);

                /*
                 * Accessing this _mapped_ area safely would need a lock, but
                 * taking one here would add overhead to vmalloc()/vfree() for
                 * the sake of a rarely used _debug_ interface. Instead go
                 * through a local mapping via copy_page_to_iter_nofault() and
                 * accept a small overhead in this access function.
                 */
                if (!page)
                        copied = zero_iter(iter, length);
                else
                        copied = copy_page_to_iter_nofault(page, offset,
                                                           length, iter);

                addr += copied;
                done += copied;

                if (copied != length)
                        break;
        }

        return done;
}

/*
 * Read from a vm_map_ram region of memory.
 *
 * @flags are the VMAP_* flags of the owning vmap_area. Without VMAP_BLOCK
 * the range is read directly. With VMAP_BLOCK the used sub-ranges of the
 * vmap_block are read under vb->lock and everything in between is
 * zero-filled.
 *
 * Returns the number of copied bytes.
 */
static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
                                  size_t count, unsigned long flags)
{
        char *start;
        struct vmap_block *vb;
        struct xarray *xa;
        unsigned long offset;
        unsigned int rs, re;
        size_t remains, n;

        /*
         * If it's area created by vm_map_ram() interface directly, but
         * not further subdividing and delegating management to vmap_block,
         * handle it here.
         */
        if (!(flags & VMAP_BLOCK))
                return aligned_vread_iter(iter, addr, count);

        remains = count;

        /*
         * Area is split into regions and tracked with vmap_block, read out
         * each region and zero fill the hole between regions.
         */
        xa = addr_to_vb_xa((unsigned long) addr);
        vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
        if (!vb)
                goto finished_zero;

        spin_lock(&vb->lock);
        if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
                spin_unlock(&vb->lock);
                goto finished_zero;
        }

        /* [rs, re] walks each run of set bits (used pages) in the block. */
        for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
                size_t copied;

                if (remains == 0)
                        goto finished;

                start = vmap_block_vaddr(vb->va->va_start, rs);

                /* Zero-fill the gap in front of this used region. */
                if (addr < start) {
                        size_t to_zero = min_t(size_t, start - addr, remains);
                        size_t zeroed = zero_iter(iter, to_zero);

                        addr += zeroed;
                        remains -= zeroed;

                        if (remains == 0 || zeroed != to_zero)
                                goto finished;
                }

                /* It could start reading from the middle of a used region. */
                offset = offset_in_page(addr);
                n = ((re - rs + 1) << PAGE_SHIFT) - offset;
                if (n > remains)
                        n = remains;

                copied = aligned_vread_iter(iter, start + offset, n);

                addr += copied;
                remains -= copied;

                if (copied != n)
                        goto finished;
        }

        spin_unlock(&vb->lock);

        /* Reached with vb->lock NOT held. */
finished_zero:
        /* zero-fill the left dirty or free regions */
        return count - remains + zero_iter(iter, remains);
        /* Reached with vb->lock held. */
finished:
        /* We couldn't copy/zero everything */
        spin_unlock(&vb->lock);
        return count - remains;
}

/**
 * vread_iter() - read vmalloc area in a safe way to an iterator.
 * @iter:         the iterator to which data should be written.
 * @addr:         vm address.
 * @count:        number of bytes to be read.
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * copies data from that area to @iter. If the given memory range
 * of [addr...addr+count) includes some valid address, data is copied to
 * the corresponding position of @iter. If there are memory holes, they'll
 * be zero-filled. IOREMAP and SPARSE areas are treated as memory holes and
 * no copy is done for them.
 *
 * Ranges that do not intersect any live vmap area are zero-filled as
 * well.
 *
 * Note: In usual ops, vread() is never necessary because the caller
 * should know vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, as /proc/kcore.
 *
 * Return: number of bytes written to @iter (copied or zero-filled); this is
 * less than @count if a copy into @iter came up short
 */
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        struct vm_struct *vm;
        char *vaddr;
        size_t n, size, flags, remains;
        unsigned long next;

        addr = kasan_reset_tag(addr);

        /* Don't allow overflow */
        if ((unsigned long) addr + count < count)
                count = -(unsigned long) addr;

        remains = count;

        /*
         * A non-NULL @vn means vn->busy.lock is held (every exit path below
         * unlocks it when @vn is non-NULL) and @va is valid.
         */
        vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
        if (!vn)
                goto finished_zero;

        /* no intersects with alive vmap_area */
        if ((unsigned long)addr + remains <= va->va_start)
                goto finished_zero;

        do {
                size_t copied;

                if (remains == 0)
                        goto finished;

                vm = va->vm;
                flags = va->flags & VMAP_FLAGS_MASK;
                /*
                 * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
                 * be set together with VMAP_RAM.
                 */
                WARN_ON(flags == VMAP_BLOCK);

                /* Neither a vm_struct nor a vm_map_ram area: nothing to read. */
                if (!vm && !flags)
                        goto next_va;

                /* Skip areas that are still being set up. */
                if (vm && (vm->flags & VM_UNINITIALIZED))
                        goto next_va;

                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                smp_rmb();

                vaddr = (char *) va->va_start;
                size = vm ? get_vm_area_size(vm) : va_size(va);

                if (addr >= vaddr + size)
                        goto next_va;

                /* Zero-fill the hole in front of this area. */
                if (addr < vaddr) {
                        size_t to_zero = min_t(size_t, vaddr - addr, remains);
                        size_t zeroed = zero_iter(iter, to_zero);

                        addr += zeroed;
                        remains -= zeroed;

                        if (remains == 0 || zeroed != to_zero)
                                goto finished;
                }

                n = vaddr + size - addr;
                if (n > remains)
                        n = remains;

                if (flags & VMAP_RAM)
                        copied = vmap_ram_vread_iter(iter, addr, n, flags);
                else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
                        copied = aligned_vread_iter(iter, addr, n);
                else /* IOREMAP | SPARSE area is treated as memory hole */
                        copied = zero_iter(iter, n);

                addr += copied;
                remains -= copied;

                if (copied != n)
                        goto finished;

        next_va:
                /* Drop this node's lock before looking up the next area. */
                next = va->va_end;
                spin_unlock(&vn->busy.lock);
        } while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));

finished_zero:
        if (vn)
                spin_unlock(&vn->busy.lock);

        /* zero-fill memory holes */
        return count - remains + zero_iter(iter, remains);
finished:
        /* Nothing remains, or we couldn't copy/zero everything. */
        if (vn)
                spin_unlock(&vn->busy.lock);

        return count - remains;
}

/**
 * remap_vmalloc_range_partial - map vmalloc pages to userspace
 * @vma:                vma to cover
 * @uaddr:                target user address to start at
 * @kaddr:                virtual address of vmalloc kernel memory
 * @pgoff:                offset from @kaddr to start at
 * @size:                size of map area
 *
 * Returns:        0 for success, -Exxx on failure
 *
 * This function checks that @kaddr is a valid vmalloc'ed area flagged
 * %VM_USERMAP or %VM_DMA_COHERENT, that @uaddr and @kaddr are page aligned,
 * and that the area is big enough to cover @size (rounded up to whole
 * pages) starting @pgoff pages in. Will return -EINVAL if those criteria
 * aren't met. Pages are then inserted one at a time; a failing insertion
 * returns its error without undoing earlier ones.
 *
 * On success @vma is marked %VM_DONTEXPAND | %VM_DONTDUMP.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
                                void *kaddr, unsigned long pgoff,
                                unsigned long size)
{
        struct vm_struct *area;
        unsigned long byte_off;
        unsigned long map_end;
        int err;

        if (check_shl_overflow(pgoff, PAGE_SHIFT, &byte_off))
                return -EINVAL;

        size = PAGE_ALIGN(size);

        if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
                return -EINVAL;

        area = find_vm_area(kaddr);
        if (!area)
                return -EINVAL;

        /* Only areas explicitly meant for user mapping may be exposed. */
        if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
                return -EINVAL;

        /* The window [byte_off, byte_off + size) must lie inside the area. */
        if (check_add_overflow(size, byte_off, &map_end))
                return -EINVAL;
        if (map_end > get_vm_area_size(area))
                return -EINVAL;

        kaddr += byte_off;

        do {
                err = vm_insert_page(vma, uaddr, vmalloc_to_page(kaddr));
                if (err)
                        return err;

                uaddr += PAGE_SIZE;
                kaddr += PAGE_SIZE;
                size -= PAGE_SIZE;
        } while (size > 0);

        vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);

        return 0;
}

/**
 * remap_vmalloc_range - map vmalloc pages to userspace
 * @vma:                vma to cover (map full range of vma)
 * @addr:                vmalloc memory
 * @pgoff:                number of pages into addr before first page to map
 *
 * Returns:        0 for success, -Exxx on failure
 *
 * Maps the whole of @vma, i.e. the range from @vma->vm_start up to
 * @vma->vm_end, by handing it to remap_vmalloc_range_partial(). All
 * validation of @addr and of the area size is performed there.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                                unsigned long pgoff)
{
        const unsigned long user_start = vma->vm_start;
        const unsigned long map_len = vma->vm_end - user_start;

        return remap_vmalloc_range_partial(vma, user_start, addr, pgoff,
                                           map_len);
}
EXPORT_SYMBOL(remap_vmalloc_range);

/*
 * Remove @area from the vmalloc bookkeeping and free the descriptor.
 *
 * remove_vm_area() is looked up by @area->addr and must hand back @area
 * itself; any other result is treated as a fatal inconsistency.
 */
void free_vm_area(struct vm_struct *area)
{
        struct vm_struct *removed = remove_vm_area(area->addr);

        BUG_ON(removed != area);
        kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);

#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
        return rb_entry_safe(n, struct vmap_area, rb_node);
}

/**
 * pvm_find_va_enclose_addr - find the free vmap_area @addr belongs to
 * @addr: target address
 *
 * Walks the free_vmap_area_root tree.
 *
 * Returns: the vmap_area with va_start <= @addr <= va_end if there is
 *   one. Otherwise the last area visited whose va_start is <= @addr
 *   (i.e. an area lying entirely below @addr), or NULL when no area
 *   starts at or below @addr.
 */
static struct vmap_area *
pvm_find_va_enclose_addr(unsigned long addr)
{
        struct rb_node *node = free_vmap_area_root.rb_node;
        struct vmap_area *best = NULL;

        while (node) {
                struct vmap_area *cur;

                cur = rb_entry(node, struct vmap_area, rb_node);

                /* Area starts above @addr: only the left subtree can help. */
                if (cur->va_start > addr) {
                        node = node->rb_left;
                        continue;
                }

                /* Remember the candidate, stop as soon as it encloses @addr. */
                best = cur;
                if (cur->va_end >= addr)
                        break;

                node = node->rb_right;
        }

        return best;
}

/**
 * pvm_determine_end_from_reverse - find the highest aligned address
 * of free block below VMALLOC_END
 * @va:
 *   in - the VA we start the search(reverse order);
 *   out - the VA with the highest aligned end address.
 * @align: alignment for required highest address
 *
 * Starting at *@va, the free list is walked towards lower addresses.
 * For every area its end is aligned down to @align and capped at the
 * aligned VMALLOC_END; the first area that still starts below that
 * value wins. *@va is advanced as the walk proceeds.
 *
 * Returns: determined end address within vmap_area, or 0 when *@va is
 * NULL or no area qualifies.
 */
static unsigned long
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
        const unsigned long align_mask = ~(align - 1);
        const unsigned long top = VMALLOC_END & align_mask;

        if (unlikely(!*va))
                return 0;

        list_for_each_entry_from_reverse((*va),
                        &free_vmap_area_list, list) {
                unsigned long candidate;

                candidate = min((*va)->va_end & align_mask, top);
                if ((*va)->va_start < candidate)
                        return candidate;
        }

        return 0;
}

/**
 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 * @offsets: array containing offset of each area
 * @sizes: array containing size of each area
 * @nr_vms: the number of areas to allocate
 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 *
 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 *            vm_structs on success, %NULL on failure
 *
 * Percpu allocator wants to use congruent vm areas so that it can
 * maintain the offsets among percpu areas.  This function allocates
 * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
 * be scattered pretty far, distance between two areas easily going up
 * to gigabytes.  To avoid interacting with regular vmallocs, these
 * areas are allocated from top.
 *
 * Despite its complicated look, this allocator is rather simple. It
 * does everything top-down and scans free blocks from the end looking
 * for matching base. While scanning, if any of the areas do not fit the
 * base address is pulled down to fit the area. Scanning is repeated till
 * all the areas fit and then all necessary data structures are inserted
 * and the result is returned.
 *
 * Misaligned or mutually overlapping (@offsets, @sizes) pairs trigger
 * BUG_ON(). If no fitting base is found, lazily freed areas are purged
 * once and the scan is retried before giving up.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
                                     size_t align)
{
        const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
        const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
        struct vmap_area **vas, *va;
        struct vm_struct **vms;
        int area, area2, last_area, term_area;
        unsigned long base, start, size, end, last_end, orig_start, orig_end;
        bool purged = false;

        /* verify parameters and allocate data structures */
        BUG_ON(offset_in_page(align) || !is_power_of_2(align));
        for (last_area = 0, area = 0; area < nr_vms; area++) {
                start = offsets[area];
                end = start + sizes[area];

                /* is everything aligned properly? */
                BUG_ON(!IS_ALIGNED(offsets[area], align));
                BUG_ON(!IS_ALIGNED(sizes[area], align));

                /* detect the area with the highest address */
                if (start > offsets[last_area])
                        last_area = area;

                /* no two requested [offset, offset + size) ranges may overlap */
                for (area2 = area + 1; area2 < nr_vms; area2++) {
                        unsigned long start2 = offsets[area2];
                        unsigned long end2 = start2 + sizes[area2];

                        BUG_ON(start2 < end && start < end2);
                }
        }
        /* end offset of the highest area, i.e. the total span needed above base */
        last_end = offsets[last_area] + sizes[last_area];

        /* the aligned vmalloc window must be able to hold that span at all */
        if (vmalloc_end - vmalloc_start < last_end) {
                WARN_ON(true);
                return NULL;
        }

        vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
        vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
        if (!vas || !vms)
                goto err_free2;

        /* one vmap_area and one vm_struct per requested area */
        for (area = 0; area < nr_vms; area++) {
                vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
                vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
                if (!vas[area] || !vms[area])
                        goto err_free;
        }
retry:
        /* the whole scan and the clipping below run under free_vmap_area_lock */
        spin_lock(&free_vmap_area_lock);

        /* start scanning - we scan from the top, begin with the last area */
        area = term_area = last_area;
        start = offsets[area];
        end = start + sizes[area];

        /*
         * base is the candidate address such that area i will live at
         * base + offsets[i]. Start with the highest usable free end.
         */
        va = pvm_find_va_enclose_addr(vmalloc_end);
        base = pvm_determine_end_from_reverse(&va, align) - end;

        while (true) {
                /*
                 * base might have underflowed, add last_end before
                 * comparing.
                 */
                if (base + last_end < vmalloc_start + last_end)
                        goto overflow;

                /*
                 * Fitting base has not been found.
                 */
                if (va == NULL)
                        goto overflow;

                /*
                 * If required width exceeds current VA block, move
                 * base downwards and then recheck.
                 */
                if (base + end > va->va_end) {
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        term_area = area;
                        continue;
                }

                /*
                 * If this VA does not fit, move base downwards and recheck.
                 */
                if (base + start < va->va_start) {
                        va = node_to_va(rb_prev(&va->rb_node));
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        term_area = area;
                        continue;
                }

                /*
                 * This area fits, move on to the previous one.  If
                 * the previous one is the terminal one, we're done.
                 */
                area = (area + nr_vms - 1) % nr_vms;
                if (area == term_area)
                        break;

                start = offsets[area];
                end = start + sizes[area];
                va = pvm_find_va_enclose_addr(base + end);
        }

        /* we've found a fitting base, insert all va's */
        for (area = 0; area < nr_vms; area++) {
                int ret;

                start = base + offsets[area];
                size = sizes[area];

                va = pvm_find_va_enclose_addr(start);
                if (WARN_ON_ONCE(va == NULL))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                /* carve [start, start + size) out of the free area */
                ret = va_clip(&free_vmap_area_root,
                        &free_vmap_area_list, va, start, size);
                if (WARN_ON_ONCE(unlikely(ret)))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                /* Allocated area. */
                va = vas[area];
                va->va_start = start;
                va->va_end = start + size;
        }

        spin_unlock(&free_vmap_area_lock);

        /* populate the kasan shadow space */
        for (area = 0; area < nr_vms; area++) {
                if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
                        goto err_free_shadow;
        }

        /* insert all vm's */
        for (area = 0; area < nr_vms; area++) {
                struct vmap_node *vn = addr_to_node(vas[area]->va_start);

                spin_lock(&vn->busy.lock);
                insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
                setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
                                 pcpu_get_vm_areas);
                spin_unlock(&vn->busy.lock);
        }

        /*
         * Mark allocated areas as accessible. Do it now as a best-effort
         * approach, as they can be mapped outside of vmalloc code.
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        for (area = 0; area < nr_vms; area++)
                vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
                                vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);

        /* only the vm_struct array is handed to the caller */
        kfree(vas);
        return vms;

recovery:
        /*
         * Remove previously allocated areas. There is no
         * need in removing these areas from the busy tree,
         * because they are inserted only on the final step
         * and when pcpu_get_vm_areas() has succeeded.
         *
         * Entered with free_vmap_area_lock held; "area" is the index that
         * failed, so only the entries below it are handed back.
         */
        while (area--) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
                                &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end,
                                KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
                /* slot no longer owns a vmap_area; refilled before a retry */
                vas[area] = NULL;
        }

overflow:
        spin_unlock(&free_vmap_area_lock);
        /* purge lazily freed areas exactly once, then rescan from scratch */
        if (!purged) {
                reclaim_and_purge_vmap_areas();
                purged = true;

                /* Before "retry", check if we recover. */
                for (area = 0; area < nr_vms; area++) {
                        if (vas[area])
                                continue;

                        vas[area] = kmem_cache_zalloc(
                                vmap_area_cachep, GFP_KERNEL);
                        if (!vas[area])
                                goto err_free;
                }

                goto retry;
        }

err_free:
        /* free every per-area object that is still owned by the arrays */
        for (area = 0; area < nr_vms; area++) {
                if (vas[area])
                        kmem_cache_free(vmap_area_cachep, vas[area]);

                kfree(vms[area]);
        }
err_free2:
        kfree(vas);
        kfree(vms);
        return NULL;

err_free_shadow:
        /* reached without the lock held, so take it again for the free tree */
        spin_lock(&free_vmap_area_lock);
        /*
         * We release all the vmalloc shadows, even the ones for regions that
         * hadn't been successfully added. This relies on kasan_release_vmalloc
         * being able to tolerate this case.
         */
        for (area = 0; area < nr_vms; area++) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
                                &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end,
                                KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
                vas[area] = NULL;
                kfree(vms[area]);
        }
        spin_unlock(&free_vmap_area_lock);
        kfree(vas);
        kfree(vms);
        return NULL;
}

/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Every entry of @vms is released through free_vm_area(), after which
 * the pointer array itself is freed.
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
        int idx = 0;

        while (idx < nr_vms) {
                free_vm_area(vms[idx]);
                idx++;
        }

        kfree(vms);
}
#endif        /* CONFIG_SMP */

#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
        const void *caller;
        struct vm_struct *vm;
        struct vmap_area *va;
        struct vmap_node *vn;
        unsigned long addr;
        unsigned int nr_pages;

        addr = PAGE_ALIGN((unsigned long) object);
        vn = addr_to_node(addr);

        if (!spin_trylock(&vn->busy.lock))
                return false;

        va = __find_vmap_area(addr, &vn->busy.root);
        if (!va || !va->vm) {
                spin_unlock(&vn->busy.lock);
                return false;
        }

        vm = va->vm;
        addr = (unsigned long) vm->addr;
        caller = vm->caller;
        nr_pages = vm->nr_pages;
        spin_unlock(&vn->busy.lock);

        pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
                nr_pages, addr, caller);

        return true;
}
#endif

#ifdef CONFIG_PROC_FS

/*
 * Print number of pages allocated on each memory node.
 *
 * This function can only be called if CONFIG_NUMA is enabled
 * and VM_UNINITIALIZED bit in v->flags is disabled.
 */
static void show_numa_info(struct seq_file *m, struct vm_struct *v,
                                 unsigned int *counters)
{
        unsigned int nr;
        unsigned int step = 1U << vm_area_page_order(v);

        if (!counters)
                return;

        memset(counters, 0, nr_node_ids * sizeof(unsigned int));

        for (nr = 0; nr < v->nr_pages; nr += step)
                counters[page_to_nid(v->pages[nr])] += step;
        for_each_node_state(nr, N_HIGH_MEMORY)
                if (counters[nr])
                        seq_printf(m, " N%u=%u", nr, counters[nr]);
}

/*
 * Emit one "unpurged vm_area" line for every vmap_area sitting on the
 * lazy list of any vmap node. Each node's list is walked under that
 * node's lazy lock.
 */
static void show_purge_info(struct seq_file *m)
{
        struct vmap_area *lazy_va;
        struct vmap_node *vn;

        for_each_vmap_node(vn) {
                spin_lock(&vn->lazy.lock);
                list_for_each_entry(lazy_va, &vn->lazy.head, list) {
                        void *from = (void *)lazy_va->va_start;
                        void *to = (void *)lazy_va->va_end;

                        seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
                                from, to, va_size(lazy_va));
                }
                spin_unlock(&vn->lazy.lock);
        }
}

static int vmalloc_info_show(struct seq_file *m, void *p)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        struct vm_struct *v;
        unsigned int *counters;

        if (IS_ENABLED(CONFIG_NUMA))
                counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);

        for_each_vmap_node(vn) {
                spin_lock(&vn->busy.lock);
                list_for_each_entry(va, &vn->busy.head, list) {
                        if (!va->vm) {
                                if (va->flags & VMAP_RAM)
                                        seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
                                                (void *)va->va_start, (void *)va->va_end,
                                                va_size(va));

                                continue;
                        }

                        v = va->vm;
                        if (v->flags & VM_UNINITIALIZED)
                                continue;

                        /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                        smp_rmb();

                        seq_printf(m, "0x%pK-0x%pK %7ld",
                                v->addr, v->addr + v->size, v->size);

                        if (v->caller)
                                seq_printf(m, " %pS", v->caller);

                        if (v->nr_pages)
                                seq_printf(m, " pages=%d", v->nr_pages);

                        if (v->phys_addr)
                                seq_printf(m, " phys=%pa", &v->phys_addr);

                        if (v->flags & VM_IOREMAP)
                                seq_puts(m, " ioremap");

                        if (v->flags & VM_SPARSE)
                                seq_puts(m, " sparse");

                        if (v->flags & VM_ALLOC)
                                seq_puts(m, " vmalloc");

                        if (v->flags & VM_MAP)
                                seq_puts(m, " vmap");

                        if (v->flags & VM_USERMAP)
                                seq_puts(m, " user");

                        if (v->flags & VM_DMA_COHERENT)
                                seq_puts(m, " dma-coherent");

                        if (is_vmalloc_addr(v->pages))
                                seq_puts(m, " vpages");

                        if (IS_ENABLED(CONFIG_NUMA))
                                show_numa_info(m, v, counters);

                        seq_putc(m, '\n');
                }
                spin_unlock(&vn->busy.lock);
        }

        /*
         * As a final step, dump "unpurged" areas.
         */
        show_purge_info(m);
        if (IS_ENABLED(CONFIG_NUMA))
                kfree(counters);
        return 0;
}

/*
 * Create the "vmallocinfo" proc entry (mode 0400, NULL parent) backed by
 * vmalloc_info_show(). The result of proc_create_single() is not
 * checked; this initcall always returns 0.
 */
static int __init proc_vmalloc_init(void)
{
        proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
        return 0;
}
module_init(proc_vmalloc_init);

#endif

/*
 * Build the initial free-space tree from the gaps between the entries
 * of vmlist. The covered range runs from address 1 up to ULONG_MAX;
 * every non-empty gap becomes one vmap_area in free_vmap_area_root /
 * free_vmap_area_list. A failed allocation only warns (once per site)
 * and leaves that gap out.
 */
static void __init vmap_init_free_space(void)
{
        const unsigned long kva_end = ULONG_MAX;
        unsigned long cursor = 1;
        struct vm_struct *vm;
        struct vmap_area *gap;

        /*
         *     B     F     B     B     B     F
         * -|-----|.....|-----|-----|-----|.....|-
         *  |           The KVA space           |
         *  |<--------------------------------->|
         */
        for (vm = vmlist; vm; vm = vm->next) {
                const unsigned long busy_start = (unsigned long) vm->addr;

                /* Anything between the cursor and this busy area is free. */
                if (busy_start != cursor) {
                        gap = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!WARN_ON_ONCE(!gap)) {
                                gap->va_start = cursor;
                                gap->va_end = busy_start;

                                insert_vmap_area_augment(gap, NULL,
                                        &free_vmap_area_root,
                                                &free_vmap_area_list);
                        }
                }

                cursor = busy_start + vm->size;
        }

        /* Trailing free range after the last busy area. */
        if (kva_end != cursor) {
                gap = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (!WARN_ON_ONCE(!gap)) {
                        gap->va_start = cursor;
                        gap->va_end = kva_end;

                        insert_vmap_area_augment(gap, NULL,
                                &free_vmap_area_root,
                                        &free_vmap_area_list);
                }
        }
}

/*
 * Size (on 64-bit only) and initialise the vmap node array.
 *
 * On 64-bit builds an array with one node per possible CPU, capped at
 * 128, is allocated when more than one node is wanted. If that
 * allocation fails an error is logged and the previous node setup is
 * kept. Afterwards every node gets empty busy/lazy trees, empty pools
 * and freshly initialised locks.
 */
static void vmap_init_nodes(void)
{
        struct vmap_node *vn;
        int i;

#if BITS_PER_LONG == 64
        /*
         * A high threshold of max nodes is fixed and bound to 128,
         * thus a scale factor is 1 for systems where number of cores
         * are less or equal to specified threshold.
         *
         * As for NUMA-aware notes. For bigger systems, for example
         * NUMA with multi-sockets, where we can end-up with thousands
         * of cores in total, a "sub-numa-clustering" should be added.
         *
         * In this case a NUMA domain is considered as a single entity
         * with dedicated sub-nodes in it which describe one group or
         * set of cores. Therefore a per-domain purging is supposed to
         * be added as well as a per-domain balancing.
         */
        int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);

        if (n > 1) {
                vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
                if (!vn) {
                        pr_err("Failed to allocate an array. Disable a node layer\n");
                } else {
                        /* Node partition is 16 pages. */
                        vmap_zone_size = 16 * PAGE_SIZE;
                        nr_vmap_nodes = n;
                        vmap_nodes = vn;
                }
        }
#endif

        for_each_vmap_node(vn) {
                /* Areas currently in use. */
                vn->busy.root = RB_ROOT;
                INIT_LIST_HEAD(&vn->busy.head);
                spin_lock_init(&vn->busy.lock);

                /* Areas queued for lazy purging. */
                vn->lazy.root = RB_ROOT;
                INIT_LIST_HEAD(&vn->lazy.head);
                spin_lock_init(&vn->lazy.lock);

                /* One initially empty pool per supported size class. */
                i = 0;
                while (i < MAX_VA_SIZE_PAGES) {
                        INIT_LIST_HEAD(&vn->pool[i].head);
                        WRITE_ONCE(vn->pool[i].len, 0);
                        i++;
                }

                spin_lock_init(&vn->pool_lock);
        }
}

/*
 * Shrinker count callback: sum the lengths of all pools of all vmap
 * nodes. The lengths are sampled with READ_ONCE() without taking any
 * lock. SHRINK_EMPTY is reported when the sum is zero.
 */
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long total = 0;
        struct vmap_node *vn;
        int pool;

        for_each_vmap_node(vn) {
                for (pool = 0; pool < MAX_VA_SIZE_PAGES; pool++)
                        total += READ_ONCE(vn->pool[pool].len);
        }

        if (!total)
                return SHRINK_EMPTY;

        return total;
}

static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        struct vmap_node *vn;

        for_each_vmap_node(vn)
                decay_va_pool_node(vn, true);

        return SHRINK_STOP;
}

/*
 * One-time boot initialisation of the vmalloc subsystem:
 *  - create the vmap_area slab cache,
 *  - initialise the per-CPU vmap block queues and deferred-vfree state,
 *  - set up the vmap nodes,
 *  - import the areas already registered on vmlist as busy areas,
 *  - build the free space tree and mark vmap as initialised,
 *  - register the "vmap-node" shrinker (failure is only logged).
 */
void __init vmalloc_init(void)
{
        struct shrinker *node_shrinker;
        struct vm_struct *early_vm;
        struct vmap_area *va;
        struct vmap_node *vn;
        int cpu;

        /*
         * Create the cache for vmap_area objects.
         */
        vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vfree_deferred *deferred = &per_cpu(vfree_deferred, cpu);

                spin_lock_init(&vbq->lock);
                INIT_LIST_HEAD(&vbq->free);
                init_llist_head(&deferred->list);
                INIT_WORK(&deferred->wq, delayed_vfree_work);
                xa_init(&vbq->vmap_blocks);
        }

        /*
         * Setup nodes before importing vmlist.
         */
        vmap_init_nodes();

        /* Import existing vmlist entries. */
        for (early_vm = vmlist; early_vm; early_vm = early_vm->next) {
                va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (WARN_ON_ONCE(!va))
                        continue;

                va->va_start = (unsigned long)early_vm->addr;
                va->va_end = va->va_start + early_vm->size;
                va->vm = early_vm;

                vn = addr_to_node(va->va_start);
                insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
        }

        /*
         * Now we can initialize a free vmap space.
         */
        vmap_init_free_space();
        vmap_initialized = true;

        node_shrinker = shrinker_alloc(0, "vmap-node");
        if (node_shrinker) {
                node_shrinker->count_objects = vmap_node_shrink_count;
                node_shrinker->scan_objects = vmap_node_shrink_scan;
                shrinker_register(node_shrinker);
        } else {
                pr_err("Failed to allocate vmap-node shrinker!\n");
        }
}





























































































































































































































































































































   38 



































   39 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _KERNEL_PRINTK_RINGBUFFER_H
#define _KERNEL_PRINTK_RINGBUFFER_H

#include <linux/atomic.h>
#include <linux/bits.h>
#include <linux/dev_printk.h>
#include <linux/stddef.h>
#include <linux/types.h>

/*
 * Meta information about each stored message.
 *
 * All fields are set by the printk code except for @seq, which is
 * set by the ringbuffer code.
 *
 * @flags and @level are bit-fields of 5 and 3 bits respectively, i.e.
 * 8 bits in total.
 */
struct printk_info {
        u64        seq;                /* sequence number */
        u64        ts_nsec;        /* timestamp in nanoseconds */
        u16        text_len;        /* length of text message */
        u8        facility;        /* syslog facility */
        u8        flags:5;        /* internal record flags */
        u8        level:3;        /* syslog level */
        u32        caller_id;        /* thread id or processor id */

        /* device information; layout comes from <linux/dev_printk.h> */
        struct dev_printk_info        dev_info;
};

/*
 * A structure providing the buffers, used by writers and readers.
 *
 * Writers:
 * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
 * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
 * buffers reserved for that writer.
 *
 * Readers:
 * Using prb_rec_init_rd(), a reader sets all fields before calling
 * prb_read_valid(). Note that the reader provides the @info and @text_buf
 * buffers. On success, the struct pointed to by @info will be filled and
 * the char array pointed to by @text_buf will be filled with text data.
 */
struct printk_record {
        struct printk_info        *info;                /* meta data of the record */
        char                        *text_buf;        /* buffer holding the text */
        unsigned int                text_buf_size;        /* size associated with @text_buf */
};

/*
 * Specifies the logical position and span of a data block.
 *
 * Both fields are logical positions ("lpos"). Special, non-position
 * values may be stored in them to mark records without a data block
 * (see FAILED_LPOS and EMPTY_LINE_LPOS).
 */
struct prb_data_blk_lpos {
        unsigned long        begin;        /* presumably where the block starts - TODO confirm */
        unsigned long        next;        /* presumably where the following block starts - TODO confirm */
};

/*
 * A descriptor: the complete meta-data for a record.
 *
 * @state_var: A bitwise combination of descriptor ID and descriptor state.
 *             The DESC_SV(), DESC_ID() and DESC_STATE() macros build and
 *             take apart such a value.
 * @text_blk_lpos: Logical position and span of the record's text data
 *                 block, or one of the special dataless values.
 */
struct prb_desc {
        atomic_long_t                        state_var;
        struct prb_data_blk_lpos        text_blk_lpos;
};

/*
 * A ringbuffer of "ID + data" elements.
 *
 * NOTE(review): @size_bits looks like the log2 of the byte size of @data
 * (cf. _DATA_SIZE()) - confirm against the implementation.
 */
struct prb_data_ring {
        unsigned int        size_bits;        /* size exponent, see note above */
        char                *data;                /* backing storage of the ring */
        atomic_long_t        head_lpos;        /* head logical position */
        atomic_long_t        tail_lpos;        /* tail logical position */
};

/*
 * A ringbuffer of "struct prb_desc" elements.
 *
 * NOTE(review): @count_bits looks like the log2 of the number of
 * descriptors (cf. _DESCS_COUNT()), and @infos appears to be a parallel
 * array holding one printk_info per descriptor - confirm against the
 * implementation.
 */
struct prb_desc_ring {
        unsigned int                count_bits;        /* count exponent, see note above */
        struct prb_desc                *descs;                /* descriptor array */
        struct printk_info        *infos;                /* meta information array */
        atomic_long_t                head_id;        /* ID of the head descriptor */
        atomic_long_t                tail_id;        /* ID of the tail descriptor */
        atomic_long_t                last_finalized_seq;        /* a sequence number, per its name that of the last finalized record */
};

/*
 * The high level structure representing the printk ringbuffer.
 *
 * @desc_ring:      The ring of record descriptors.
 * @text_data_ring: The data ring for the text of the records.
 * @fail: Count of failed prb_reserve() calls where not even a data-less
 *        record was created.
 */
struct printk_ringbuffer {
        struct prb_desc_ring        desc_ring;
        struct prb_data_ring        text_data_ring;
        atomic_long_t                fail;
};

/*
 * Used by writers as a reserve/commit handle.
 *
 * @rb:         Ringbuffer where the entry is reserved.
 * @irqflags:   Saved irq flags to restore on entry commit.
 * @id:         ID of the reserved descriptor.
 * @text_space: Total occupied buffer space in the text data ring, including
 *              ID, alignment padding, and wrapping data blocks.
 *
 * This structure is an opaque handle for writers. Its contents are only
 * to be used by the ringbuffer implementation; writers must not read or
 * modify the fields themselves.
 */
struct prb_reserved_entry {
        struct printk_ringbuffer        *rb;
        unsigned long                        irqflags;
        unsigned long                        id;
        unsigned int                        text_space;
};

/* The possible responses of a descriptor state-query. */
/*
 * The values 0x0 - 0x3 are the two-bit state values that DESC_SV() packs
 * into the top bits of a state variable and DESC_STATE() extracts again.
 * desc_miss is a pseudo state for an ID mismatch and is not one of those
 * four two-bit values.
 */
enum desc_state {
        desc_miss        =  -1,        /* ID mismatch (pseudo state) */
        desc_reserved        = 0x0,        /* reserved, in use by writer */
        desc_committed        = 0x1,        /* committed by writer, could get reopened */
        desc_finalized        = 0x2,        /* committed, no further modification allowed */
        desc_reusable        = 0x3,        /* free, not yet used by any writer */
};

/*
 * Size helpers and state variable packing.
 *
 * _DATA_SIZE()/_DESCS_COUNT() turn a bit count into a size/count.
 *
 * A state variable is BITS_PER_LONG wide. Its top two bits hold the
 * descriptor state and the remaining low bits hold the descriptor ID:
 *
 *   DESC_SV(id, state)  packs an ID and a state into a state variable
 *   DESC_STATE(sv)      extracts the two-bit state
 *   DESC_ID(sv)         extracts the ID (state bits cleared)
 *
 * Every macro argument is parenthesized so that expression arguments
 * (for example "a | b" or "cond ? x : y") are evaluated as a unit.
 */
#define _DATA_SIZE(sz_bits)        (1UL << (sz_bits))
#define _DESCS_COUNT(ct_bits)        (1U << (ct_bits))
#define DESC_SV_BITS                BITS_PER_LONG
#define DESC_FLAGS_SHIFT        (DESC_SV_BITS - 2)
#define DESC_FLAGS_MASK                (3UL << DESC_FLAGS_SHIFT)
#define DESC_STATE(sv)                (3UL & ((sv) >> DESC_FLAGS_SHIFT))
#define DESC_SV(id, state)        (((unsigned long)(state) << DESC_FLAGS_SHIFT) | (id))
#define DESC_ID_MASK                (~DESC_FLAGS_MASK)
#define DESC_ID(sv)                ((sv) & DESC_ID_MASK)

/*
 * Special data block logical position values (for fields of
 * @prb_desc.text_blk_lpos).
 *
 * - Bit0 is used to identify if the record has no data block. (Implemented in
 *   the LPOS_DATALESS() macro.)
 *
 * - Bit1 specifies the reason for not having a data block.
 *
 * These special values could never be real lpos values because of the
 * meta data and alignment padding of data blocks. (See to_blk_size() for
 * details.)
 */
/* Bit0 set, bit1 clear. */
#define FAILED_LPOS                0x1
/* Bit0 and bit1 both set. */
#define EMPTY_LINE_LPOS                0x3

/* Initializer for a struct prb_data_blk_lpos with both fields FAILED_LPOS. */
#define FAILED_BLK_LPOS        \
{                                \
        .begin        = FAILED_LPOS,        \
        .next        = FAILED_LPOS,        \
}

/*
 * Descriptor Bootstrap
 *
 * The descriptor array is minimally initialized to allow immediate usage
 * by readers and writers. The requirements that the descriptor array
 * initialization must satisfy:
 *
 *   Req1
 *     The tail must point to an existing (committed or reusable) descriptor.
 *     This is required by the implementation of prb_first_seq().
 *
 *   Req2
 *     Readers must see that the ringbuffer is initially empty.
 *
 *   Req3
 *     The first record reserved by a writer is assigned sequence number 0.
 *
 * To satisfy Req1, the tail initially points to a descriptor that is
 * minimally initialized (having no data block, i.e. data-less with the
 * data block's lpos @begin and @next values set to FAILED_LPOS).
 *
 * To satisfy Req2, the initial tail descriptor is initialized to the
 * reusable state. Readers recognize reusable descriptors as existing
 * records, but skip over them.
 *
 * To satisfy Req3, the last descriptor in the array is used as the initial
 * head (and tail) descriptor. This allows the first record reserved by a
 * writer (head + 1) to be the first descriptor in the array. (Only the first
 * descriptor in the array could have a valid sequence number of 0.)
 *
 * The first time a descriptor is reserved, it is assigned a sequence number
 * with the value of the array index. A "first time reserved" descriptor can
 * be recognized because it has a sequence number of 0 but does not have an
 * index of 0. (Only the first descriptor in the array could have a valid
 * sequence number of 0.) After the first reservation, all future reservations
 * (recycling) simply involve incrementing the sequence number by the array
 * count.
 *
 *   Hack #1
 *     Only the first descriptor in the array is allowed to have the sequence
 *     number 0. In this case it is not possible to recognize if it is being
 *     reserved the first time (set to index value) or has been reserved
 *     previously (increment by the array count). This is handled by _always_
 *     incrementing the sequence number by the array count when reserving the
 *     first descriptor in the array. In order to satisfy Req3, the sequence
 *     number of the first descriptor in the array is initialized to minus
 *     the array count. Then, upon the first reservation, it is incremented
 *     to 0, thus satisfying Req3.
 *
 *   Hack #2
 *     prb_first_seq() can be called at any time by readers to retrieve the
 *     sequence number of the tail descriptor. However, due to Req2 and Req3,
 *     initially there are no records to report the sequence number of
 *     (sequence numbers are u64 and there is nothing less than 0). To handle
 *     this, the sequence number of the initial tail descriptor is initialized
 *     to 0. Technically this is incorrect, because there is no record with
 *     sequence number 0 (yet) and the tail descriptor is not the first
 *     descriptor in the array. But it allows prb_read_valid() to correctly
 *     report the existence of a record for _any_ given sequence number at all
 *     times. Bootstrapping is complete when the tail is pushed the first
 *     time, thus finally pointing to the first descriptor reserved by a
 *     writer, which has the assigned sequence number 0.
 */

/*
 * Initiating Logical Value Overflows
 *
 * Both logical position (lpos) and ID values can be mapped to array indexes
 * but may experience overflows during the lifetime of the system. To ensure
 * that printk_ringbuffer can handle the overflows for these types, initial
 * values are chosen that map to the correct initial array indexes, but will
 * result in overflows soon.
 *
 *   BLK0_LPOS
 *     The initial @head_lpos and @tail_lpos for data rings. It is at index
 *     0 and the lpos value is such that it will overflow on the first wrap.
 *
 *   DESC0_ID
 *     The initial @head_id and @tail_id for the desc ring. It is at the last
 *     index of the descriptor array (see Req3 above) and the ID value is such
 *     that it will overflow on the second wrap.
 */
/* Initial lpos: the negated data ring size. */
#define BLK0_LPOS(sz_bits)        (-(_DATA_SIZE(sz_bits)))
/* Initial ID: the negated (descriptor count + 1), with the state bits masked off. */
#define DESC0_ID(ct_bits)        DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
/* Initial state variable: DESC0_ID() combined with the reusable state. */
#define DESC0_SV(ct_bits)        DESC_SV(DESC0_ID(ct_bits), desc_reusable)

/*
 * Define a ringbuffer with an external text data buffer. The same as
 * DEFINE_PRINTKRB() but requires specifying an external buffer for the
 * text data.
 *
 * @name:        The name of the ringbuffer variable.
 * @descbits:    The number of descriptors as a power-of-2 value.
 * @avgtextbits: The average text data size per record as a power-of-2 value.
 * @text_buf:    The external text data buffer.
 *
 * Besides the ringbuffer itself, this defines the static arrays
 * _<name>_descs and _<name>_infos, each with 2 ^ descbits elements. Only
 * the first and the last elements get explicit initial values (see the
 * "Descriptor Bootstrap" comment).
 *
 * Every atomic_long_t member is initialized with ATOMIC_LONG_INIT() so that
 * the initializer macro matches the member type.
 *
 * Note: The specified external buffer must be of the size:
 *       2 ^ (descbits + avgtextbits)
 */
#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf)                        \
static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = {                        \
        /* the initial head and tail */                                                        \
        [_DESCS_COUNT(descbits) - 1] = {                                                \
                /* reusable */                                                                \
                .state_var        = ATOMIC_LONG_INIT(DESC0_SV(descbits)),                        \
                /* no associated data block */                                                \
                .text_blk_lpos        = FAILED_BLK_LPOS,                                        \
        },                                                                                \
};                                                                                        \
static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = {                        \
        /* this will be the first record reserved by a writer */                        \
        [0] = {                                                                                \
                /* will be incremented to 0 on the first reservation */                        \
                .seq = -(u64)_DESCS_COUNT(descbits),                                        \
        },                                                                                \
        /* the initial head and tail */                                                        \
        [_DESCS_COUNT(descbits) - 1] = {                                                \
                /* reports the first seq value during the bootstrap phase */                \
                .seq = 0,                                                                \
        },                                                                                \
};                                                                                        \
static struct printk_ringbuffer name = {                                                \
        .desc_ring = {                                                                        \
                .count_bits        = descbits,                                                \
                .descs                = &_##name##_descs[0],                                        \
                .infos                = &_##name##_infos[0],                                        \
                .head_id        = ATOMIC_LONG_INIT(DESC0_ID(descbits)),                        \
                .tail_id        = ATOMIC_LONG_INIT(DESC0_ID(descbits)),                        \
                .last_finalized_seq = ATOMIC_LONG_INIT(0),                                \
        },                                                                                \
        .text_data_ring = {                                                                \
                .size_bits        = (avgtextbits) + (descbits),                                \
                .data                = text_buf,                                                \
                .head_lpos        = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))),        \
                .tail_lpos        = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))),        \
        },                                                                                \
        .fail                        = ATOMIC_LONG_INIT(0),                                        \
}

/**
 * DEFINE_PRINTKRB() - Define a ringbuffer.
 *
 * @name:        The name of the ringbuffer variable.
 * @descbits:    The number of descriptors as a power-of-2 value.
 * @avgtextbits: The average text data size per record as a power-of-2 value.
 *
 * This is a macro for defining a ringbuffer and all internal structures
 * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
 * variant where the text data buffer can be specified externally.
 *
 * The text data buffer is a static char array named _<name>_text with
 * 2 ^ (avgtextbits + descbits) elements, aligned like an unsigned long.
 */
#define DEFINE_PRINTKRB(name, descbits, avgtextbits)                                \
static char _##name##_text[1U << ((avgtextbits) + (descbits))]                        \
                        __aligned(__alignof__(unsigned long));                        \
_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])

/* Writer Interface */

/**
 * prb_rec_init_wr() - Prepare a record for use by a writer.
 *
 * @r:             The record to prepare.
 * @text_buf_size: The text buffer size the writer needs.
 *
 * Only the requested size is stored in @r. The info and text buffer
 * pointers of @r are reset to NULL.
 */
static inline void prb_rec_init_wr(struct printk_record *r,
                                   unsigned int text_buf_size)
{
        /* the writer only states how much text space it wants */
        r->text_buf_size = text_buf_size;

        /* no buffers are attached to the record at this point */
        r->text_buf = NULL;
        r->info = NULL;
}

/*
 * Reserve/commit entry points for writers (declarations only; the
 * definitions are not in this header). @e is the opaque writer handle, see
 * struct prb_reserved_entry. @r is the record being written, see
 * prb_rec_init_wr().
 */
bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                 struct printk_record *r);
bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                         struct printk_record *r, u32 caller_id, unsigned int max_size);
void prb_commit(struct prb_reserved_entry *e);
void prb_final_commit(struct prb_reserved_entry *e);

/*
 * Set up @rb from caller-provided text, descriptor and info buffers
 * (declaration only).
 */
void prb_init(struct printk_ringbuffer *rb,
              char *text_buf, unsigned int text_buf_size,
              struct prb_desc *descs, unsigned int descs_count_bits,
              struct printk_info *infos);
/* NOTE(review): presumably reports @e->text_space -- confirm in the .c file. */
unsigned int prb_record_text_space(struct prb_reserved_entry *e);

/* Reader Interface */

/**
 * prb_rec_init_rd() - Prepare a record for use by a reader.
 *
 * @r:             The record to prepare.
 * @info:          Where the record meta-data is to be stored.
 * @text_buf:      Where the text data is to be stored.
 * @text_buf_size: The number of bytes available in @text_buf.
 *
 * Stores the three reader-provided values in @r. Every argument other than
 * @r is optional: record data is only read for arguments that are non-NULL
 * or non-zero.
 */
static inline void prb_rec_init_rd(struct printk_record *r,
                                   struct printk_info *info,
                                   char *text_buf, unsigned int text_buf_size)
{
        /* text destination and its capacity */
        r->text_buf_size = text_buf_size;
        r->text_buf = text_buf;

        /* meta-data destination */
        r->info = info;
}

/**
 * prb_for_each_record() - Iterate over the records of a ringbuffer.
 *
 * @from: The sequence number to begin with.
 * @rb:   The ringbuffer to iterate over.
 * @s:    A u64 to store the sequence number on each iteration.
 * @r:    A printk_record to store the record on each iteration.
 *
 * This is a macro for conveniently iterating over a ringbuffer.
 * Note that @s may not be the sequence number of the record on each
 * iteration. For the sequence number, @r->info->seq should be checked.
 *
 * The loop ends as soon as prb_read_valid() returns false. The step
 * expression reads @r->info->seq, so @r must have a valid info buffer.
 *
 * Context: Any context.
 */
#define prb_for_each_record(from, rb, s, r) \
for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1)

/**
 * prb_for_each_info() - Iterate over the meta data of a ringbuffer.
 *
 * @from: The sequence number to begin with.
 * @rb:   The ringbuffer to iterate over.
 * @s:    A u64 to store the sequence number on each iteration.
 * @i:    A printk_info to store the record meta data on each iteration.
 * @lc:   An unsigned int to store the text line count of each record.
 *
 * This is a macro for conveniently iterating over a ringbuffer.
 * Note that @s may not be the sequence number of the record on each
 * iteration. For the sequence number, @i->seq should be checked.
 *
 * The loop ends as soon as prb_read_valid_info() returns false.
 *
 * Context: Any context.
 */
#define prb_for_each_info(from, rb, s, i, lc) \
for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1)

/*
 * Reader entry points (declarations only). These two are the loop
 * conditions of prb_for_each_record() and prb_for_each_info().
 */
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
                    struct printk_record *r);
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
                         struct printk_info *info, unsigned int *line_count);

/* Sequence number queries (declarations only). */
u64 prb_first_seq(struct printk_ringbuffer *rb);
u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
u64 prb_next_seq(struct printk_ringbuffer *rb);
u64 prb_next_reserve_seq(struct printk_ringbuffer *rb);

#ifdef CONFIG_64BIT

/*
 * With CONFIG_64BIT both conversions are the identity: the value is passed
 * through unchanged and @rb is not evaluated. ULSEQ_MAX() is the constant -1.
 */
#define __u64seq_to_ulseq(u64seq) (u64seq)
#define __ulseq_to_u64seq(rb, ulseq) (ulseq)
#define ULSEQ_MAX(rb) (-1)

#else /* CONFIG_64BIT */

/*
 * Without CONFIG_64BIT only the lower 32 bits of a sequence number are kept.
 * The argument is parenthesized so that the cast applies to the whole
 * argument expression and the result always has type u32.
 *
 * ULSEQ_MAX() is the truncated value of the first sequence number of @rb
 * plus 2^31.
 */
#define __u64seq_to_ulseq(u64seq) ((u32)(u64seq))
#define ULSEQ_MAX(rb) __u64seq_to_ulseq(prb_first_seq(rb) + 0x80000000UL)

static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq)
{
        /*
         * @ulseq carries only the lower 32 bits of a ringbuffer sequence
         * number and has to be widened to 64 bits. The first sequence
         * number of the ringbuffer serves as the reference point: the
         * signed 32bit distance between its lower half and @ulseq is
         * folded back into the full 64bit value.
         *
         * A 32bit representation in the console is sufficient. If a
         * console ever gets more than 2^31 records behind the ringbuffer
         * then this is the least of the problems.
         *
         * Also the access to the ring buffer is always safe.
         */
        const u64 first = prb_first_seq(rb);
        const s32 behind = (s32)((u32)first - ulseq);

        return first - behind;
}

#endif /* CONFIG_64BIT */

#endif /* _KERNEL_PRINTK_RINGBUFFER_H */











































































































   13 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Because linux/module.h has tracepoints in the header, and ftrace.h
 * used to include this file, define_trace.h includes linux/module.h
 * But we do not want the module.h to override the TRACE_SYSTEM macro
 * variable that define_trace.h is processing, so we only set it
 * when module events are being processed, which would happen when
 * CREATE_TRACE_POINTS is defined.
 */
#ifdef CREATE_TRACE_POINTS
#undef TRACE_SYSTEM
#define TRACE_SYSTEM module
#endif

#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MODULE_H

#include <linux/tracepoint.h>

#ifdef CONFIG_MODULES

struct module;

/*
 * Decode a taint bitmask for trace output via __print_flags(), with an
 * empty delimiter string. Each listed taint bit maps to one letter:
 * P (proprietary module), O (out-of-tree module), F (forced module),
 * C (TAINT_CRAP) and E (unsigned module).
 */
#define show_module_flags(flags) __print_flags(flags, "",        \
        { (1UL << TAINT_PROPRIETARY_MODULE),        "P" },                \
        { (1UL << TAINT_OOT_MODULE),                "O" },                \
        { (1UL << TAINT_FORCED_MODULE),                "F" },                \
        { (1UL << TAINT_CRAP),                        "C" },                \
        { (1UL << TAINT_UNSIGNED_MODULE),        "E" })

/*
 * module_load: trace event taking a struct module pointer.
 *
 * Records mod->taints and the string mod->name. The output is the name
 * followed by the taint letters produced by show_module_flags().
 */
TRACE_EVENT(module_load,

        TP_PROTO(struct module *mod),

        TP_ARGS(mod),

        TP_STRUCT__entry(
                __field(        unsigned int,        taints                )
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __entry->taints = mod->taints;
                __assign_str(name);
        ),

        TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
);

/*
 * module_free: trace event taking a struct module pointer.
 *
 * Records only the string mod->name and prints it.
 */
TRACE_EVENT(module_free,

        TP_PROTO(struct module *mod),

        TP_ARGS(mod),

        TP_STRUCT__entry(
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __assign_str(name);
        ),

        TP_printk("%s", __get_str(name))
);

#ifdef CONFIG_MODULE_UNLOAD
/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */

/*
 * module_refcnt: event class shared by the module_get and module_put events.
 *
 * Records the @ip argument, the value of atomic_read(&mod->refcnt) at the
 * time of the event, and the string mod->name. @ip is printed with %ps as
 * the call site.
 */
DECLARE_EVENT_CLASS(module_refcnt,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip),

        TP_STRUCT__entry(
                __field(        unsigned long,        ip                )
                __field(        int,                refcnt                )
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __entry->ip        = ip;
                __entry->refcnt        = atomic_read(&mod->refcnt);
                __assign_str(name);
        ),

        TP_printk("%s call_site=%ps refcnt=%d",
                  __get_str(name), (void *)__entry->ip, __entry->refcnt)
);

/* module_get: an event of the module_refcnt class, with the same arguments. */
DEFINE_EVENT(module_refcnt, module_get,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip)
);

/* module_put: an event of the module_refcnt class, with the same arguments. */
DEFINE_EVENT(module_refcnt, module_put,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip)
);
#endif /* CONFIG_MODULE_UNLOAD */

/*
 * module_request: trace event taking a name string, a wait flag and a
 * call site address.
 *
 * Records @ip, @wait and a copy of @name. The wait flag is printed as an
 * integer and @ip is printed with %ps as the call site.
 */
TRACE_EVENT(module_request,

        TP_PROTO(char *name, bool wait, unsigned long ip),

        TP_ARGS(name, wait, ip),

        TP_STRUCT__entry(
                __field(        unsigned long,        ip                )
                __field(        bool,                wait                )
                __string(        name,                name                )
        ),

        TP_fast_assign(
                __entry->ip        = ip;
                __entry->wait        = wait;
                __assign_str(name);
        ),

        TP_printk("%s wait=%d call_site=%ps",
                  __get_str(name), (int)__entry->wait, (void *)__entry->ip)
);

#endif /* CONFIG_MODULES */

#endif /* _TRACE_MODULE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * An extensible bitmap is a bitmap that supports an
 * arbitrary number of bits.  Extensible bitmaps are
 * used to represent sets of values, such as types,
 * roles, categories, and classes.
 *
 * Each extensible bitmap is implemented as a linked
 * list of bitmap nodes, where each bitmap node has
 * an explicitly specified starting bit position within
 * the total bitmap.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

#ifndef _SS_EBITMAP_H_
#define _SS_EBITMAP_H_

#include <net/netlabel.h>

/* Byte budget that the bitmap node layout is sized against. */
#ifdef CONFIG_64BIT
#define EBITMAP_NODE_SIZE 64
#else
#define EBITMAP_NODE_SIZE 32
#endif

/*
 * Number of unsigned long map words per node: what is left of
 * EBITMAP_NODE_SIZE after one pointer and one u32, divided by the word size.
 */
#define EBITMAP_UNIT_NUMS                                     \
        ((EBITMAP_NODE_SIZE - sizeof(void *) - sizeof(u32)) / \
         sizeof(unsigned long))
/* Bits per map word. */
#define EBITMAP_UNIT_SIZE BITS_PER_LONG
/* Total number of bits held by one node. */
#define EBITMAP_SIZE          (EBITMAP_UNIT_NUMS * EBITMAP_UNIT_SIZE)
/* A single bit of map word type, to be shifted into position. */
#define EBITMAP_BIT          1UL
/*
 * Shift @x right by EBITMAP_UNIT_SIZE bits in two half-sized steps.
 * NOTE(review): presumably split to avoid a shift by the full type width.
 */
#define EBITMAP_SHIFT_UNIT_SIZE(x) \
        (((x) >> EBITMAP_UNIT_SIZE / 2) >> EBITMAP_UNIT_SIZE / 2)

/* One node of an extensible bitmap's linked list. */
struct ebitmap_node {
        struct ebitmap_node *next; /* next node in the list, NULL at the end */
        unsigned long maps[EBITMAP_UNIT_NUMS]; /* the bits held by this node */
        u32 startbit; /* position of this node's first bit in the total bitmap */
};

/* An extensible bitmap: the head of a list of ebitmap_node. */
struct ebitmap {
        struct ebitmap_node *node; /* first node in the bitmap */
        u32 highbit; /* highest position in the total bitmap */
};

/* Length of the bitmap: the highbit field of @e. */
#define ebitmap_length(e) ((e)->highbit)

/*
 * Find the first set bit of @e.
 *
 * Walks the node list from the head, leaving *@n at the node that holds the
 * bit. Returns the position of that bit in the total bitmap. If no node has
 * a set bit, *@n ends up NULL and ebitmap_length(e) is returned.
 */
static inline u32 ebitmap_start_positive(const struct ebitmap *e,
                                         struct ebitmap_node **n)
{
        u32 pos;

        *n = e->node;
        while (*n) {
                pos = find_first_bit((*n)->maps, EBITMAP_SIZE);
                if (pos < EBITMAP_SIZE)
                        return pos + (*n)->startbit;
                *n = (*n)->next;
        }

        return ebitmap_length(e);
}

/* Reset @e to the all-zero state: every byte of the structure is cleared. */
static inline void ebitmap_init(struct ebitmap *e)
{
        memset(e, 0, sizeof(struct ebitmap));
}

/*
 * Find the next set bit of @e after position @bit.
 *
 * *@n must be the node that holds @bit. The rest of that node is searched
 * first, then the following nodes. On success *@n is the node holding the
 * returned position. If no further bit is set, *@n ends up NULL and
 * ebitmap_length(e) is returned.
 */
static inline u32 ebitmap_next_positive(const struct ebitmap *e,
                                        struct ebitmap_node **n, u32 bit)
{
        u32 pos;

        /* continue within the current node, just past @bit */
        pos = find_next_bit((*n)->maps, EBITMAP_SIZE,
                            bit - (*n)->startbit + 1);
        if (pos < EBITMAP_SIZE)
                return pos + (*n)->startbit;

        /* nothing left in this node: scan the nodes after it */
        *n = (*n)->next;
        while (*n) {
                pos = find_first_bit((*n)->maps, EBITMAP_SIZE);
                if (pos < EBITMAP_SIZE)
                        return pos + (*n)->startbit;
                *n = (*n)->next;
        }

        return ebitmap_length(e);
}

/* Index of the map word within @node that holds bitmap position @bit. */
#define EBITMAP_NODE_INDEX(node, bit) \
        (((bit) - (node)->startbit) / EBITMAP_UNIT_SIZE)
/* Bit offset of bitmap position @bit within that map word. */
#define EBITMAP_NODE_OFFSET(node, bit) \
        (((bit) - (node)->startbit) % EBITMAP_UNIT_SIZE)

/*
 * Test bitmap position @bit within node @n.
 *
 * Returns 1 if the bit is set and 0 otherwise. It is a BUG if @bit maps
 * to a word index outside the maps array of @n.
 */
static inline int ebitmap_node_get_bit(const struct ebitmap_node *n, u32 bit)
{
        u32 word = EBITMAP_NODE_INDEX(n, bit);
        u32 shift = EBITMAP_NODE_OFFSET(n, bit);

        BUG_ON(word >= EBITMAP_UNIT_NUMS);

        return !!(n->maps[word] & (EBITMAP_BIT << shift));
}

/*
 * Set bitmap position @bit within node @n.
 *
 * It is a BUG if @bit maps to a word index outside the maps array of @n.
 */
static inline void ebitmap_node_set_bit(struct ebitmap_node *n, u32 bit)
{
        u32 word = EBITMAP_NODE_INDEX(n, bit);
        u32 shift = EBITMAP_NODE_OFFSET(n, bit);
        unsigned long mask = EBITMAP_BIT << shift;

        BUG_ON(word >= EBITMAP_UNIT_NUMS);

        n->maps[word] |= mask;
}

/*
 * Clear bitmap position @bit within node @n.
 *
 * It is a BUG if @bit maps to a word index outside the maps array of @n.
 */
static inline void ebitmap_node_clr_bit(struct ebitmap_node *n, u32 bit)
{
        u32 word = EBITMAP_NODE_INDEX(n, bit);
        u32 shift = EBITMAP_NODE_OFFSET(n, bit);
        unsigned long mask = EBITMAP_BIT << shift;

        BUG_ON(word >= EBITMAP_UNIT_NUMS);

        n->maps[word] &= ~mask;
}

/*
 * Iterate over every set bit of @e.
 *
 * @e:   pointer to the ebitmap to walk
 * @n:   a struct ebitmap_node pointer used as the cursor
 * @bit: receives the position of each set bit in turn
 *
 * Starts with ebitmap_start_positive() and steps with
 * ebitmap_next_positive(). The loop ends when @bit reaches
 * ebitmap_length(e), which is what both helpers return when no set bit is
 * left.
 */
#define ebitmap_for_each_positive_bit(e, n, bit)      \
        for ((bit) = ebitmap_start_positive(e, &(n)); \
             (bit) < ebitmap_length(e);               \
             (bit) = ebitmap_next_positive(e, &(n), bit))

/*
 * Out-of-line ebitmap operations (declarations only; the definitions are
 * not in this header).
 */
bool ebitmap_equal(const struct ebitmap *e1, const struct ebitmap *e2);
int ebitmap_cpy(struct ebitmap *dst, const struct ebitmap *src);
int ebitmap_and(struct ebitmap *dst, const struct ebitmap *e1,
                const struct ebitmap *e2);
int ebitmap_contains(const struct ebitmap *e1, const struct ebitmap *e2,
                     u32 last_e2bit);
int ebitmap_get_bit(const struct ebitmap *e, u32 bit);
int ebitmap_set_bit(struct ebitmap *e, u32 bit, int value);
void ebitmap_destroy(struct ebitmap *e);
/* Forward declaration: only pointers to struct policy_file are used here. */
struct policy_file;
int ebitmap_read(struct ebitmap *e, struct policy_file *fp);
int ebitmap_write(const struct ebitmap *e, struct policy_file *fp);
u32 ebitmap_hash(const struct ebitmap *e, u32 hash);

/*
 * Conversion between an ebitmap and a NetLabel category map.
 *
 * With CONFIG_NETLABEL the two functions are only declared here. Without
 * it, both are inline stubs that ignore their arguments and return -ENOMEM.
 */
#ifdef CONFIG_NETLABEL
int ebitmap_netlbl_export(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap **catmap);
int ebitmap_netlbl_import(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap *catmap);
#else
static inline int ebitmap_netlbl_export(struct ebitmap *ebmap,
                                        struct netlbl_lsm_catmap **catmap)
{
        return -ENOMEM;
}
static inline int ebitmap_netlbl_import(struct ebitmap *ebmap,
                                        struct netlbl_lsm_catmap *catmap)
{
        return -ENOMEM;
}
#endif

#endif /* _SS_EBITMAP_H_ */





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  315 


















































































































































  316 

  316 
  316 
   68 










  317 


  314 

  319 













    1 








    1 



















































  319 









































































































































































  316 














  316 





























































































  319 

  316 



























































  318 




   13 






























































  319 




























































































   15 




















































    6 










































































    1 

















   15 
























































































































































































































































































































































































































































































    6 























   79 















































































































































































  318 




















































  226 
  223 


































































  314 
  317 












  317 












  316 













  316 
































  307 























































































































































































































  314 


























































































































  314 






































































    1 







    1 















  266 














































    1 














  319 




























    6 















































































































































































































































































































































































































































































































































































































































































































































































































































































  320 

























































































































































































































































































































































































































































































































































































































  314 
















































































  314 



  310 





















































































  313 






















































  316 
























  317 
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct sk_buff' memory handlers.
 *
 *        Authors:
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Florian La Roche, <rzsfl@rz.uni-sb.de>
 */

#ifndef _LINUX_SKBUFF_H
#define _LINUX_SKBUFF_H

#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/time.h>
#include <linux/bug.h>
#include <linux/bvec.h>
#include <linux/cache.h>
#include <linux/rbtree.h>
#include <linux/socket.h>
#include <linux/refcount.h>

#include <linux/atomic.h>
#include <asm/types.h>
#include <linux/spinlock.h>
#include <net/checksum.h>
#include <linux/rcupdate.h>
#include <linux/dma-mapping.h>
#include <linux/netdev_features.h>
#include <net/flow_dissector.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
#include <linux/llist.h>
#include <linux/page_frag_cache.h>
#include <net/flow.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_common.h>
#endif
#include <net/net_debug.h>
#include <net/dropreason-core.h>
#include <net/netmem.h>

/**
 * DOC: skb checksums
 *
 * The interface for checksum offload between the stack and networking drivers
 * is as follows...
 *
 * IP checksum related features
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Drivers advertise checksum offload capabilities in the features of a device.
 * From the stack's point of view these are capabilities offered by the driver.
 * A driver typically only advertises features that it is capable of offloading
 * to its device.
 *
 * .. flat-table:: Checksum related device features
 *   :widths: 1 10
 *
 *   * - %NETIF_F_HW_CSUM
 *     - The driver (or its device) is able to compute one
 *         IP (one's complement) checksum for any combination
 *         of protocols or protocol layering. The checksum is
 *         computed and set in a packet per the CHECKSUM_PARTIAL
 *         interface (see below).
 *
 *   * - %NETIF_F_IP_CSUM
 *     - Driver (device) is only able to checksum plain
 *         TCP or UDP packets over IPv4. These are specifically
 *         unencapsulated packets of the form IPv4|TCP or
 *         IPv4|UDP where the Protocol field in the IPv4 header
 *         is TCP or UDP. The IPv4 header may contain IP options.
 *         This feature cannot be set in features for a device
 *         with NETIF_F_HW_CSUM also set. This feature is being
 *         DEPRECATED (see below).
 *
 *   * - %NETIF_F_IPV6_CSUM
 *     - Driver (device) is only able to checksum plain
 *         TCP or UDP packets over IPv6. These are specifically
 *         unencapsulated packets of the form IPv6|TCP or
 *         IPv6|UDP where the Next Header field in the IPv6
 *         header is either TCP or UDP. IPv6 extension headers
 *         are not supported with this feature. This feature
 *         cannot be set in features for a device with
 *         NETIF_F_HW_CSUM also set. This feature is being
 *         DEPRECATED (see below).
 *
 *   * - %NETIF_F_RXCSUM
 *     - Driver (device) performs receive checksum offload.
 *         This flag is only used to disable the RX checksum
 *         feature for a device. The stack will accept receive
 *         checksum indication in packets received on a device
 *         regardless of whether NETIF_F_RXCSUM is set.
 *
 * Checksumming of received packets by device
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Indication of checksum verification is set in &sk_buff.ip_summed.
 * Possible values are:
 *
 * - %CHECKSUM_NONE
 *
 *   Device did not checksum this packet e.g. due to lack of capabilities.
 *   The packet itself contains a full (though not verified) checksum, but
 *   skb->csum does not hold it. Thus, skb->csum is undefined in this case.
 *
 * - %CHECKSUM_UNNECESSARY
 *
 *   The hardware you're dealing with doesn't calculate the full checksum
 *   (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums
 *   for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY
 *   if their checksums are okay. &sk_buff.csum is still undefined in this case
 *   though. A driver or device must never modify the checksum field in the
 *   packet even if checksum is verified.
 *
 *   %CHECKSUM_UNNECESSARY is applicable to following protocols:
 *
 *     - TCP: IPv6 and IPv4.
 *     - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
 *       zero UDP checksum for either IPv4 or IPv6, the networking stack
 *       may perform further validation in this case.
 *     - GRE: only if the checksum is present in the header.
 *     - SCTP: indicates the CRC in SCTP header has been validated.
 *     - FCOE: indicates the CRC in FC frame has been validated.
 *
 *   &sk_buff.csum_level indicates the number of consecutive checksums found in
 *   the packet minus one that have been verified as %CHECKSUM_UNNECESSARY.
 *   For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
 *   and a device is able to verify the checksums for UDP (possibly zero),
 *   GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to
 *   two. If the device were only able to verify the UDP checksum and not
 *   GRE, either because it doesn't support GRE checksum or because GRE
 *   checksum is bad, skb->csum_level would be set to zero (TCP checksum is
 *   not considered in this case).
 *
 * - %CHECKSUM_COMPLETE
 *
 *   This is the most generic way. The device supplied checksum of the _whole_
 *   packet as seen by netif_rx() and fills in &sk_buff.csum. This means the
 *   hardware doesn't need to parse L3/L4 headers to implement this.
 *
 *   Notes:
 *
 *   - Even if device supports only some protocols, but is able to produce
 *     skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
 *   - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols.
 *
 * - %CHECKSUM_PARTIAL
 *
 *   A checksum is set up to be offloaded to a device as described in the
 *   output description for CHECKSUM_PARTIAL. This may occur on a packet
 *   received directly from another Linux OS, e.g., a virtualized Linux kernel
 *   on the same host, or it may be set in the input path in GRO or remote
 *   checksum offload. For the purposes of checksum verification, the checksum
 *   referred to by skb->csum_start + skb->csum_offset and any preceding
 *   checksums in the packet are considered verified. Any checksums in the
 *   packet that are after the checksum being offloaded are not considered to
 *   be verified.
 *
 * Checksumming on transmit for non-GSO
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * The stack requests checksum offload in the &sk_buff.ip_summed for a packet.
 * Values are:
 *
 * - %CHECKSUM_PARTIAL
 *
 *   The driver is required to checksum the packet as seen by hard_start_xmit()
 *   from &sk_buff.csum_start up to the end, and to record/write the checksum at
 *   offset &sk_buff.csum_start + &sk_buff.csum_offset.
 *   A driver may verify that the
 *   csum_start and csum_offset values are valid values given the length and
 *   offset of the packet, but it should not attempt to validate that the
 *   checksum refers to a legitimate transport layer checksum -- it is the
 *   purview of the stack to validate that csum_start and csum_offset are set
 *   correctly.
 *
 *   When the stack requests checksum offload for a packet, the driver MUST
 *   ensure that the checksum is set correctly. A driver can either offload the
 *   checksum calculation to the device, or call skb_checksum_help (in the case
 *   that the device does not support offload for a particular checksum).
 *
 *   %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of
 *   %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate
 *   checksum offload capability.
 *   skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based
 *   on network device checksumming capabilities: if a packet does not match
 *   them, skb_checksum_help() or skb_crc32c_csum_help() (depending on the
 *   value of &sk_buff.csum_not_inet, see :ref:`crc`) is called to resolve the
 *   checksum.
 *
 * - %CHECKSUM_NONE
 *
 *   The skb was already checksummed by the protocol, or a checksum is not
 *   required.
 *
 * - %CHECKSUM_UNNECESSARY
 *
 *   This has the same meaning as CHECKSUM_NONE for checksum offload on
 *   output.
 *
 * - %CHECKSUM_COMPLETE
 *
 *   Not used in checksum output. If a driver observes a packet with this value
 *   set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set.
 *
 * .. _crc:
 *
 * Non-IP checksum (CRC) offloads
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * .. flat-table::
 *   :widths: 1 10
 *
 *   * - %NETIF_F_SCTP_CRC
 *     - This feature indicates that a device is capable of
 *         offloading the SCTP CRC in a packet. To perform this offload the stack
 *         will set csum_start and csum_offset accordingly, set ip_summed to
 *         %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication
 *         in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c.
 *         A driver that supports both IP checksum offload and SCTP CRC32c offload
 *         must verify which offload is configured for a packet by testing the
 *         value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to
 *         resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
 *
 *   * - %NETIF_F_FCOE_CRC
 *     - This feature indicates that a device is capable of offloading the FCOE
 *         CRC in a packet. To perform this offload the stack will set ip_summed
 *         to %CHECKSUM_PARTIAL and set csum_start and csum_offset
 *         accordingly. Note that there is no indication in the skbuff that the
 *         %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
 *         both IP checksum offload and FCOE CRC offload must verify which offload
 *         is configured for a packet, presumably by inspecting packet headers.
 *
 * Checksumming on output with GSO
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * In the case of a GSO packet (skb_is_gso() is true), checksum offload
 * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
 * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as
 * part of the GSO operation is implied. If a checksum is being offloaded
 * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and
 * csum_offset are set to refer to the outermost checksum being offloaded
 * (two offloaded checksums are possible with UDP encapsulation).
 */

/*
 * Values of &sk_buff.ip_summed. Their meaning differs between the receive
 * and transmit paths; both are described in the "skb checksums" DOC section
 * above.
 *
 * Don't change this without changing skb_csum_unnecessary!
 */
#define CHECKSUM_NONE                0
#define CHECKSUM_UNNECESSARY        1
#define CHECKSUM_COMPLETE        2
#define CHECKSUM_PARTIAL        3

/* Maximum value in skb->csum_level */
#define SKB_MAX_CSUM_LEVEL        3

/* Align X to SMP_CACHE_BYTES using ALIGN() */
#define SKB_DATA_ALIGN(X)        ALIGN(X, SMP_CACHE_BYTES)
/* X minus the cache-aligned size of struct skb_shared_info */
#define SKB_WITH_OVERHEAD(X)        \
        ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* For X bytes available in skb->head, what is the minimal
 * allocation needed, knowing struct skb_shared_info needs
 * to be aligned.
 */
#define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \
        SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* Bytes left in a (PAGE_SIZE << ORDER) area after removing X bytes and the
 * aligned struct skb_shared_info overhead.
 */
#define SKB_MAX_ORDER(X, ORDER) \
        SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
/* SKB_MAX_ORDER() for a single page (ORDER 0) */
#define SKB_MAX_HEAD(X)                (SKB_MAX_ORDER((X), 0))
/* SKB_MAX_ORDER() for an order-2 area with no bytes removed up front */
#define SKB_MAX_ALLOC                (SKB_MAX_ORDER(0, 2))

/* return minimum truesize of one skb containing X bytes of data:
 * X plus the aligned sizes of struct sk_buff and struct skb_shared_info
 */
#define SKB_TRUESIZE(X) ((X) +                                                \
                         SKB_DATA_ALIGN(sizeof(struct sk_buff)) +        \
                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

struct net_device;
struct scatterlist;
struct pipe_inode_info;
struct iov_iter;
struct napi_struct;
struct bpf_prog;
union bpf_attr;
struct skb_ext;
struct ts_config;

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* Bridge netfilter state; only declared when CONFIG_BRIDGE_NETFILTER is
 * enabled.
 */
struct nf_bridge_info {
        /* NOTE(review): presumably the encapsulation the packet had on
         * arrival; kept in an 8-bit bitfield - confirm against users.
         */
        enum {
                BRNF_PROTO_UNCHANGED,
                BRNF_PROTO_8021Q,
                BRNF_PROTO_PPPOE
        } orig_proto:8;
        /* four single-bit state flags sharing one u8 */
        u8                        pkt_otherhost:1;
        u8                        in_prerouting:1;
        u8                        bridged_dnat:1;
        u8                        sabotage_in_done:1;
        __u16                        frag_max_size;
        int                        physinif;

        /* always valid & non-NULL from FORWARD on, for physdev match */
        struct net_device        *physoutdev;
        /* the members below overlay each other; which one is meaningful
         * depends on the processing stage described in their comments
         */
        union {
                /* prerouting: detect dnat in orig/reply direction */
                __be32          ipv4_daddr;
                struct in6_addr ipv6_daddr;

                /* after prerouting + nat detected: store original source
                 * mac since neigh resolution overwrites it, only used while
                 * skb is out in neigh layer.
                 */
                char neigh_header[8];
        };
};
#endif

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/* Chain in tc_skb_ext will be used to share the tc chain with
 * ovs recirc_id. It will be set to the current chain by tc
 * and read by ovs to recirc_id.
 *
 * Only declared when CONFIG_NET_TC_SKB_EXT is enabled.
 */
struct tc_skb_ext {
        /* act_miss_cookie and chain share storage; the act_miss bit below
         * tells whether act_miss_cookie is the one in use
         */
        union {
                u64 act_miss_cookie;
                __u32 chain;
        };
        __u16 mru;
        __u16 zone;
        /* single-bit flags packed into one u8 */
        u8 post_ct:1;
        u8 post_ct_snat:1;
        u8 post_ct_dnat:1;
        u8 act_miss:1; /* Set if act_miss_cookie is used */
        u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */
};
#endif

/* Head of a list of sk_buffs: next/prev link pointers, an element counter
 * and a spinlock. The link pointers are also exposed as the tagged group
 * struct sk_buff_list through the member named list.
 */
struct sk_buff_head {
        /* These two members must be first to match sk_buff. */
        struct_group_tagged(sk_buff_list, list,
                struct sk_buff        *next;
                struct sk_buff        *prev;
        );

        /* NOTE(review): presumably the number of skbs currently queued */
        __u32                qlen;
        spinlock_t        lock;
};

struct sk_buff;

/* Fall back to 17 when the configuration does not define
 * CONFIG_MAX_SKB_FRAGS.
 */
#ifndef CONFIG_MAX_SKB_FRAGS
# define CONFIG_MAX_SKB_FRAGS 17
#endif

/* Number of entries in the &skb_shared_info.frags array */
#define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS

/* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
 * segment using its current segmentation instead.
 */
#define GSO_BY_FRAGS        0xFFFF

/* One skb fragment: a netmem reference plus the length and offset of the
 * data within it. The length is read and written through the
 * skb_frag_size*() helpers.
 */
typedef struct skb_frag {
        netmem_ref netmem;
        unsigned int len;
        unsigned int offset;
} skb_frag_t;

/**
 * skb_frag_size() - Report the length recorded for a skb fragment
 * @frag: skb fragment to query
 *
 * Returns: the current value of @frag->len.
 */
static inline unsigned int skb_frag_size(const skb_frag_t *frag)
{
        const unsigned int nbytes = frag->len;

        return nbytes;
}

/**
 * skb_frag_size_set() - Overwrite the length recorded for a skb fragment
 * @frag: skb fragment to update
 * @size: new length to store in @frag->len
 */
static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
{
        unsigned int *len_field = &frag->len;

        *len_field = size;
}

/**
 * skb_frag_size_add() - Grow the length of a skb fragment by @delta
 * @frag: skb fragment to update
 * @delta: amount added to @frag->len
 */
static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
{
        frag->len = frag->len + delta;
}

/**
 * skb_frag_size_sub() - Shrink the length of a skb fragment by @delta
 * @frag: skb fragment to update
 * @delta: amount removed from @frag->len
 */
static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
{
        frag->len = frag->len - delta;
}

/**
 * skb_frag_must_loop - Test if @p must be handled one page at a time
 * @p: fragment's page
 *
 * Returns: true only on CONFIG_HIGHMEM builds, when either
 * CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP is enabled or @p is a high memory
 * page; false in every other case.
 */
static inline bool skb_frag_must_loop(struct page *p)
{
#ifdef CONFIG_HIGHMEM
        return IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p);
#else
        return false;
#endif
}

/**
 *        skb_frag_foreach_page - loop over pages in a fragment
 *
 *        @f:                skb frag to operate on
 *        @f_off:                offset from start of f->netmem
 *        @f_len:                length from f_off to loop over
 *        @p:                (temp var) current page
 *        @p_off:                (temp var) offset from start of current page,
 *                                   non-zero only on first page.
 *        @p_len:                (temp var) length in current page,
 *                                   < PAGE_SIZE only on first and last page.
 *        @copied:        (temp var) length so far, excluding current p_len.
 *
 *        A fragment can hold a compound page, in which case per-page
 *        operations, notably kmap_atomic, must be called for each
 *        regular page.
 *
 *        When skb_frag_must_loop() is false for the first page, @p_len is
 *        set to the whole of @f_len, so the body runs once for the full
 *        range instead of once per page.
 *
 *        @f_off and @f_len are expanded several times, so they must be
 *        free of side effects; they are parenthesized at every use so any
 *        expression may be passed. @p, @p_off, @p_len and @copied are
 *        assigned to and must therefore be plain lvalues.
 */
#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied)        \
        for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT),                \
             p_off = (f_off) & (PAGE_SIZE - 1),                                \
             p_len = skb_frag_must_loop(p) ?                                \
             min_t(u32, (f_len), PAGE_SIZE - p_off) : (f_len),                \
             copied = 0;                                                \
             copied < (f_len);                                                \
             copied += p_len, p++, p_off = 0,                                \
             p_len = min_t(u32, (f_len) - copied, PAGE_SIZE))

/**
 * struct skb_shared_hwtstamps - hardware time stamps
 * @hwtstamp:                hardware time stamp transformed into duration
 *                        since arbitrary point in time
 * @netdev_data:        address/cookie of network device driver used as
 *                        reference to actual hardware time stamp
 *
 * Software time stamps generated by ktime_get_real() are stored in
 * skb->tstamp.
 *
 * hwtstamps can only be compared against other hwtstamps from
 * the same device.
 *
 * This structure is attached to packets as part of the
 * &skb_shared_info. Use skb_hwtstamps() to get a pointer.
 */
struct skb_shared_hwtstamps {
        /* @hwtstamp and @netdev_data overlay the same storage, so only one
         * of them is meaningful at a time
         */
        union {
                ktime_t        hwtstamp;
                void *netdev_data;
        };
};

/* Definitions for tx_flags in struct skb_shared_info
 *
 * Each enumerator is a single bit of the 8-bit tx_flags field. Bit 4 is
 * not assigned by this enum.
 */
enum {
        /* generate hardware time stamp */
        SKBTX_HW_TSTAMP_NOBPF = 1 << 0,

        /* generate software time stamp when queueing packet to NIC */
        SKBTX_SW_TSTAMP = 1 << 1,

        /* device driver is going to provide hardware time stamp */
        SKBTX_IN_PROGRESS = 1 << 2,

        /* generate software time stamp on packet tx completion */
        SKBTX_COMPLETION_TSTAMP = 1 << 3,

        /* determine hardware time stamp based on time or cycles */
        SKBTX_HW_TSTAMP_NETDEV = 1 << 5,

        /* generate software time stamp when entering packet scheduling */
        SKBTX_SCHED_TSTAMP = 1 << 6,

        /* used for bpf extension when a bpf program is loaded */
        SKBTX_BPF = 1 << 7,
};

/* hardware time stamp request, with or without the bpf extension bit */
#define SKBTX_HW_TSTAMP                (SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF)

/* every software time stamp bit; note that SKBTX_BPF is part of both this
 * mask and SKBTX_HW_TSTAMP
 */
#define SKBTX_ANY_SW_TSTAMP        (SKBTX_SW_TSTAMP    | \
                                 SKBTX_SCHED_TSTAMP | \
                                 SKBTX_BPF          | \
                                 SKBTX_COMPLETION_TSTAMP)
/* union of the hardware and software time stamp masks above */
#define SKBTX_ANY_TSTAMP        (SKBTX_HW_TSTAMP | \
                                 SKBTX_ANY_SW_TSTAMP)

/* Definitions for flags in struct skb_shared_info
 *
 * Each enumerator is a single bit of the 8-bit flags field.
 */
enum {
        /* use zcopy routines */
        SKBFL_ZEROCOPY_ENABLE = BIT(0),

        /* This indicates at least one fragment might be overwritten
         * (as in vmsplice(), sendfile() ...)
         * If we need to compute a TX checksum, we'll need to copy
         * all frags to avoid possible bad checksum
         */
        SKBFL_SHARED_FRAG = BIT(1),

        /* segment contains only zerocopy data and should not be
         * charged to the kernel memory.
         */
        SKBFL_PURE_ZEROCOPY = BIT(2),

        /* NOTE(review): undocumented in the original; its meaning cannot
         * be established from this part of the file.
         */
        SKBFL_DONT_ORPHAN = BIT(3),

        /* page references are managed by the ubuf_info, so it's safe to
         * use frags only up until ubuf_info is released
         */
        SKBFL_MANAGED_FRAG_REFS = BIT(4),
};

/* zerocopy enabled together with the shared-fragment marker */
#define SKBFL_ZEROCOPY_FRAG        (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
/* all five flag bits defined by the enum above */
#define SKBFL_ALL_ZEROCOPY        (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
                                 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)

/* Callback table referenced by &ubuf_info.ops.
 *
 * complete: receives the skb, the ubuf_info and a flag telling whether the
 *           zerocopy transmit succeeded.
 * link_skb: receives an skb and a ubuf_info and returns an int status.
 */
struct ubuf_info_ops {
        void (*complete)(struct sk_buff *, struct ubuf_info *,
                         bool zerocopy_success);
        /* has to be compatible with skb_zcopy_set() */
        int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg);
};

/*
 * The ops->complete callback notifies userspace to release buffers when skb
 * DMA is done in lower device, the skb last reference should be 0 when
 * calling this.
 * The zerocopy_success argument is true if zero copy transmit occurred,
 * false on data copy or out of memory error caused by data copy attempt.
 * The ctx field is used to track device context.
 * The desc field is used to track userspace buffer index.
 * Note that ctx and desc are not members of this structure; in this file
 * they are declared in struct ubuf_info_msgzc, which embeds a ubuf_info.
 */
struct ubuf_info {
        const struct ubuf_info_ops *ops;
        refcount_t refcnt;
        u8 flags;
};

/* A ubuf_info extended with extra state. Given a pointer to the embedded
 * ubuf member, uarg_to_msgzc() recovers the enclosing structure.
 */
struct ubuf_info_msgzc {
        struct ubuf_info ubuf;

        /* two alternative layouts overlaying the same storage */
        union {
                struct {
                        unsigned long desc;
                        void *ctx;
                };
                struct {
                        u32 id;
                        u16 len;
                        u16 zerocopy:1;
                        u32 bytelen;
                };
        };

        /* argument type of mm_account_pinned_pages() and
         * mm_unaccount_pinned_pages()
         */
        struct mmpin {
                struct user_struct *user;
                unsigned int num_pg;
        } mmp;
};

/* View the destructor_arg of SKB's shared info as a struct ubuf_info pointer */
#define skb_uarg(SKB)        ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
/* Map a pointer to the ubuf member back to its enclosing ubuf_info_msgzc */
#define uarg_to_msgzc(ubuf_ptr)        container_of((ubuf_ptr), struct ubuf_info_msgzc, \
                                             ubuf)

/* Pinned-page accounting helpers operating on a struct mmpin; they are only
 * declared here, so their exact semantics are not visible in this chunk.
 */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);

/* Preserve some data across TX submission and completion.
 *
 * Note, this state is stored in the driver. Extending the layout
 * might need some special care.
 *
 * This structure shares storage with the hwtstamps member of
 * struct skb_shared_info (see the union there).
 */
struct xsk_tx_metadata_compl {
        /* NOTE(review): presumably where the TX timestamp is to be written
         * on completion - confirm against the users of this field.
         */
        __u64 *tx_timestamp;
};

/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 */
struct skb_shared_info {
        /* SKBFL_* bits */
        __u8                flags;
        __u8                meta_len;
        /* NOTE(review): presumably the number of frags[] entries in use */
        __u8                nr_frags;
        /* SKBTX_* bits */
        __u8                tx_flags;
        /* may be set to GSO_BY_FRAGS, see that macro */
        unsigned short        gso_size;
        /* Warning: this field is not always filled in (UFO)! */
        unsigned short        gso_segs;
        struct sk_buff        *frag_list;
        /* hwtstamps and xsk_meta overlay the same storage */
        union {
                struct skb_shared_hwtstamps hwtstamps;
                struct xsk_tx_metadata_compl xsk_meta;
        };
        /* SKB_GSO_* bits */
        unsigned int        gso_type;
        u32                tskey;

        /*
         * Warning : all fields before dataref are cleared in __alloc_skb()
         */
        atomic_t        dataref;

        /* the xdp_frags_* pair and destructor_arg overlay the same storage */
        union {
                struct {
                        u32                xdp_frags_size;
                        u32                xdp_frags_truesize;
                };

                /*
                 * Intermediate layers must ensure that destructor_arg
                 * remains valid until skb destructor.
                 */
                void                *destructor_arg;
        };

        /* must be last field, see pskb_expand_head() */
        skb_frag_t        frags[MAX_SKB_FRAGS];
};

/**
 * DOC: dataref and headerless skbs
 *
 * Transport layers send out clones of payload skbs they hold for
 * retransmissions. To allow lower layers of the stack to prepend their headers
 * we split &skb_shared_info.dataref into two halves.
 * The lower 16 bits count the overall number of references.
 * The higher 16 bits indicate how many of the references are payload-only.
 * skb_header_cloned() checks if skb is allowed to add / write the headers.
 *
 * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr
 * (via __skb_header_release()). Any clone created from marked skb will get
 * &sk_buff.hdr_len populated with the available headroom.
 * If it is the only clone in existence it's able to modify the headroom
 * at will. The sequence of calls inside the transport layer is::
 *
 *  <alloc skb>
 *  skb_reserve()
 *  __skb_header_release()
 *  skb_clone()
 *  // send the clone down the stack
 *
 * This is not a very generic construct and it depends on the transport layers
 * doing the right thing. In practice there's usually only one payload-only skb.
 * Having multiple payload-only skbs with different lengths of hdr_len is not
 * possible. The payload-only skbs should never leave their owner.
 */
/* Split of &skb_shared_info.dataref: the low SKB_DATAREF_SHIFT bits
 * (selected by SKB_DATAREF_MASK) count all references, the bits above
 * them count the payload-only references.
 */
#define SKB_DATAREF_SHIFT 16
#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)


/* skb fclone states. NOTE(review): presumably the values held in the 2-bit
 * &sk_buff.fclone field - confirm against its users.
 */
enum {
        SKB_FCLONE_UNAVAILABLE,        /* skb has no fclone (from head_cache) */
        SKB_FCLONE_ORIG,        /* orig skb (from fclone_cache) */
        SKB_FCLONE_CLONE,        /* companion fclone skb (from fclone_cache) */
};

/* SKB_GSO_* flags, one bit each, as carried in gso_type of
 * struct skb_shared_info. Bits 20 to 29 are not assigned by this enum.
 */
enum {
        SKB_GSO_TCPV4 = 1 << 0,

        /* This indicates the skb is from an untrusted source. */
        SKB_GSO_DODGY = 1 << 1,

        /* This indicates the tcp segment has CWR set. */
        SKB_GSO_TCP_ECN = 1 << 2,

        /* NOTE(review): occupies bit 3; its relation to SKB_GSO_TCP_FIXEDID
         * (bit 30) below is not explained in this chunk.
         */
        __SKB_GSO_TCP_FIXEDID = 1 << 3,

        SKB_GSO_TCPV6 = 1 << 4,

        SKB_GSO_FCOE = 1 << 5,

        SKB_GSO_GRE = 1 << 6,

        SKB_GSO_GRE_CSUM = 1 << 7,

        SKB_GSO_IPXIP4 = 1 << 8,

        SKB_GSO_IPXIP6 = 1 << 9,

        SKB_GSO_UDP_TUNNEL = 1 << 10,

        SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,

        SKB_GSO_PARTIAL = 1 << 12,

        SKB_GSO_TUNNEL_REMCSUM = 1 << 13,

        SKB_GSO_SCTP = 1 << 14,

        SKB_GSO_ESP = 1 << 15,

        SKB_GSO_UDP = 1 << 16,

        SKB_GSO_UDP_L4 = 1 << 17,

        SKB_GSO_FRAGLIST = 1 << 18,

        SKB_GSO_TCP_ACCECN = 1 << 19,

        /* These indirectly map onto the same netdev feature.
         * If NETIF_F_TSO_MANGLEID is set it may mangle both inner and outer IDs.
         */
        SKB_GSO_TCP_FIXEDID = 1 << 30,
        /* uses the top bit of a 32-bit value */
        SKB_GSO_TCP_FIXEDID_INNER = 1 << 31,
};

/* When long is wider than 32 bits, &sk_buff.tail and &sk_buff.end are kept
 * as unsigned int offsets; otherwise they are plain byte pointers.
 */
#if BITS_PER_LONG > 32
#define NET_SKBUFF_DATA_USES_OFFSET 1
#endif

#ifdef NET_SKBUFF_DATA_USES_OFFSET
typedef unsigned int sk_buff_data_t;
#else
typedef unsigned char *sk_buff_data_t;
#endif

/* Clock identifiers referred to by the 2-bit &sk_buff.tstamp_type field.
 * __SKB_CLOCK_MAX is an alias for the last enumerator, not a new value.
 */
enum skb_tstamp_type {
        SKB_CLOCK_REALTIME,
        SKB_CLOCK_MONOTONIC,
        SKB_CLOCK_TAI,
        __SKB_CLOCK_MAX = SKB_CLOCK_TAI,
};

/**
 * DOC: Basic sk_buff geometry
 *
 * struct sk_buff itself is a metadata structure and does not hold any packet
 * data. All the data is held in associated buffers.
 *
 * &sk_buff.head points to the main "head" buffer. The head buffer is divided
 * into two parts:
 *
 *  - data buffer, containing headers and sometimes payload;
 *    this is the part of the skb operated on by the common helpers
 *    such as skb_put() or skb_pull();
 *  - shared info (struct skb_shared_info) which holds an array of pointers
 *    to read-only data in the (page, offset, length) format.
 *
 * Optionally &skb_shared_info.frag_list may point to another skb.
 *
 * Basic diagram may look like this::
 *
 *                                  ---------------
 *                                 | sk_buff       |
 *                                  ---------------
 *     ,---------------------------  + head
 *    /          ,-----------------  + data
 *   /          /      ,-----------  + tail
 *  |          |      |            , + end
 *  |          |      |           |
 *  v          v      v           v
 *   -----------------------------------------------
 *  | headroom | data |  tailroom | skb_shared_info |
 *   -----------------------------------------------
 *                                 + [page frag]
 *                                 + [page frag]
 *                                 + [page frag]
 *                                 + [page frag]       ---------
 *                                 + frag_list    --> | sk_buff |
 *                                                     ---------
 *
 */

/**
 *        struct sk_buff - socket buffer
 *        @next: Next buffer in list
 *        @prev: Previous buffer in list
 *        @tstamp: Time we arrived/left
 *        @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point
 *                for retransmit timer
 *        @rbnode: RB tree node, alternative to next/prev for netem/tcp
 *        @list: queue head
 *        @ll_node: anchor in an llist (eg socket defer_list)
 *        @sk: Socket we are owned by
 *        @dev: Device we arrived on/are leaving by
 *        @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
 *        @cb: Control buffer. Free for use by every layer. Put private vars here
 *        @_skb_refdst: destination entry (with norefcount bit)
 *        @len: Length of actual data
 *        @data_len: Data length
 *        @mac_len: Length of link layer header
 *        @hdr_len: writable header length of cloned skb
 *        @csum: Checksum (must include start/offset pair)
 *        @csum_start: Offset from skb->head where checksumming should start
 *        @csum_offset: Offset from csum_start where checksum should be stored
 *        @priority: Packet queueing priority
 *        @ignore_df: allow local fragmentation
 *        @cloned: Head may be cloned (check refcnt to be sure)
 *        @ip_summed: Driver fed us an IP checksum
 *        @nohdr: Payload reference only, must not modify header
 *        @pkt_type: Packet class
 *        @fclone: skbuff clone status
 *        @ipvs_property: skbuff is owned by ipvs
 *        @inner_protocol_type: whether the inner protocol is
 *                ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO
 *        @remcsum_offload: remote checksum offload is enabled
 *        @offload_fwd_mark: Packet was L2-forwarded in hardware
 *        @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
 *        @tc_skip_classify: do not classify packet. set by IFB device
 *        @tc_at_ingress: used within tc_classify to distinguish in/egress
 *        @redirected: packet was redirected by packet classifier
 *        @from_ingress: packet was redirected from the ingress path
 *        @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h
 *        @peeked: this packet has been seen already, so stats have been
 *                done for it, don't do them again
 *        @nf_trace: netfilter packet trace flag
 *        @protocol: Packet protocol from driver
 *        @destructor: Destruct function
 *        @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
 *        @_sk_redir: socket redirection information for skmsg
 *        @_nfct: Associated connection, if any (with nfctinfo bits)
 *        @skb_iif: ifindex of device we arrived on
 *        @tc_index: Traffic control index
 *        @hash: the packet hash
 *        @queue_mapping: Queue mapping for multiqueue devices
 *        @head_frag: skb was allocated from page fragments,
 *                not allocated by kmalloc() or vmalloc().
 *        @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
 *        @pp_recycle: mark the packet for recycling instead of freeing (implies
 *                page_pool support on driver)
 *        @active_extensions: active extensions (skb_ext_id types)
 *        @ndisc_nodetype: router type (from link layer)
 *        @ooo_okay: allow the mapping of a socket to a queue to be changed
 *        @l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *                ports.
 *        @sw_hash: indicates hash was computed in software stack
 *        @wifi_acked_valid: wifi_acked was set
 *        @wifi_acked: whether frame was acked on wifi or not
 *        @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *        @encapsulation: indicates the inner headers in the skbuff are valid
 *        @encap_hdr_csum: software checksum is needed
 *        @csum_valid: checksum is already valid
 *        @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
 *        @csum_complete_sw: checksum was completed by software
 *        @csum_level: indicates the number of consecutive checksums found in
 *                the packet minus one that have been verified as
 *                CHECKSUM_UNNECESSARY (max 3)
 *        @unreadable: indicates that at least 1 of the fragments in this skb is
 *                unreadable.
 *        @dst_pending_confirm: need to confirm neighbour
 *        @decrypted: Decrypted SKB
 *        @slow_gro: state present at GRO time, slower prepare step required
 *        @tstamp_type: When set, identifies the clock base of the
 *                delivery_time held in skb->tstamp (see enum skb_tstamp_type).
 *        @napi_id: id of the NAPI struct this skb came from
 *        @sender_cpu: (aka @napi_id) source CPU in XPS
 *        @alloc_cpu: CPU which did the skb allocation.
 *        @secmark: security marking
 *        @mark: Generic packet mark
 *        @reserved_tailroom: (aka @mark) number of bytes of free space available
 *                at the tail of an sk_buff
 *        @vlan_all: vlan fields (proto & tci)
 *        @vlan_proto: vlan encapsulation protocol
 *        @vlan_tci: vlan tag control information
 *        @inner_protocol: Protocol (encapsulation)
 *        @inner_ipproto: (aka @inner_protocol) stores ipproto when
 *                skb->inner_protocol_type == ENCAP_TYPE_IPPROTO;
 *        @inner_transport_header: Inner transport layer header (encapsulation)
 *        @inner_network_header: Network layer header (encapsulation)
 *        @inner_mac_header: Link layer header (encapsulation)
 *        @transport_header: Transport layer header
 *        @network_header: Network layer header
 *        @mac_header: Link layer header
 *        @kcov_handle: KCOV remote handle for remote coverage collection
 *        @tail: Tail pointer
 *        @end: End pointer
 *        @head: Head of buffer
 *        @data: Data head pointer
 *        @truesize: Buffer size
 *        @users: User count - see {datagram,tcp}.c
 *        @extensions: allocated extensions, valid if active_extensions is nonzero
 */

/* Socket buffer metadata. Per-member documentation lives in the kernel-doc
 * comment directly above this definition; the comments inside only point
 * out layout constraints visible in the declaration itself.
 */
struct sk_buff {
        /* Linkage: four alternative views overlaying the same storage */
        union {
                struct {
                        /* These two members must be first to match sk_buff_head. */
                        struct sk_buff                *next;
                        struct sk_buff                *prev;

                        union {
                                struct net_device        *dev;
                                /* Some protocols might use this space to store information,
                                 * while device pointer would be NULL.
                                 * UDP receive path is one user.
                                 */
                                unsigned long                dev_scratch;
                        };
                };
                struct rb_node                rbnode; /* used in netem, ip4 defrag, and tcp stack */
                struct list_head        list;
                struct llist_node        ll_node;
        };

        struct sock                *sk;

        /* tstamp and skb_mstamp_ns overlay the same storage */
        union {
                ktime_t                tstamp;
                u64                skb_mstamp_ns; /* earliest departure time */
        };
        /*
         * This is the control buffer. It is free to use for every
         * layer. Please put your private variables there. If you
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
        char                        cb[48] __aligned(8);

        /* dst/destructor pair, the TCP sorted-queue anchor and (optionally)
         * the skmsg redirect word overlay the same storage
         */
        union {
                struct {
                        unsigned long        _skb_refdst;
                        void                (*destructor)(struct sk_buff *skb);
                };
                struct list_head        tcp_tsorted_anchor;
#ifdef CONFIG_NET_SOCK_MSG
                unsigned long                _sk_redir;
#endif
        };

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        unsigned long                 _nfct;
#endif
        unsigned int                len,
                                data_len;
        __u16                        mac_len,
                                hdr_len;

        /* Following fields are _not_ copied in __copy_skb_header()
         * Note that queue_mapping is here mostly to fill a hole.
         */
        __u16                        queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK        (1 << 7)
#else
#define CLONED_MASK        1
#endif
#define CLONED_OFFSET                offsetof(struct sk_buff, __cloned_offset)

        /* private: */
        /* zero-length marker giving CLONED_OFFSET the address of the flag
         * byte that follows
         */
        __u8                        __cloned_offset[0];
        /* public: */
        __u8                        cloned:1,
                                nohdr:1,
                                fclone:2,
                                peeked:1,
                                head_frag:1,
                                pfmemalloc:1,
                                pp_recycle:1; /* page_pool recycle indicator */
#ifdef CONFIG_SKB_EXTENSIONS
        __u8                        active_extensions;
#endif

        /* Fields enclosed in headers group are copied
         * using a single memcpy() in __copy_skb_header()
         */
        struct_group(headers,

        /* private: */
        /* zero-length marker used by PKT_TYPE_OFFSET */
        __u8                        __pkt_type_offset[0];
        /* public: */
        __u8                        pkt_type:3; /* see PKT_TYPE_MAX */
        __u8                        ignore_df:1;
        __u8                        dst_pending_confirm:1;
        __u8                        ip_summed:2;
        __u8                        ooo_okay:1;

        /* private: */
        /* zero-length marker used by SKB_BF_MONO_TC_OFFSET */
        __u8                        __mono_tc_offset[0];
        /* public: */
        __u8                        tstamp_type:2;        /* See skb_tstamp_type */
#ifdef CONFIG_NET_XGRESS
        __u8                        tc_at_ingress:1;        /* See TC_AT_INGRESS_MASK */
        __u8                        tc_skip_classify:1;
#endif
        __u8                        remcsum_offload:1;
        __u8                        csum_complete_sw:1;
        __u8                        csum_level:2;
        __u8                        inner_protocol_type:1;

        __u8                        l4_hash:1;
        __u8                        sw_hash:1;
#ifdef CONFIG_WIRELESS
        __u8                        wifi_acked_valid:1;
        __u8                        wifi_acked:1;
#endif
        __u8                        no_fcs:1;
        /* Indicates the inner headers are valid in the skbuff. */
        __u8                        encapsulation:1;
        __u8                        encap_hdr_csum:1;
        __u8                        csum_valid:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
        __u8                        ndisc_nodetype:2;
#endif

#if IS_ENABLED(CONFIG_IP_VS)
        __u8                        ipvs_property:1;
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        __u8                        nf_trace:1;
#endif
#ifdef CONFIG_NET_SWITCHDEV
        __u8                        offload_fwd_mark:1;
        __u8                        offload_l3_fwd_mark:1;
#endif
        __u8                        redirected:1;
#ifdef CONFIG_NET_REDIRECT
        __u8                        from_ingress:1;
#endif
#ifdef CONFIG_NETFILTER_SKIP_EGRESS
        __u8                        nf_skip_egress:1;
#endif
#ifdef CONFIG_SKB_DECRYPTED
        __u8                        decrypted:1;
#endif
        __u8                        slow_gro:1;
#if IS_ENABLED(CONFIG_IP_SCTP)
        __u8                        csum_not_inet:1;
#endif
        __u8                        unreadable:1;
#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
        __u16                        tc_index;        /* traffic control index */
#endif

        u16                        alloc_cpu;

        /* csum overlays the csum_start/csum_offset pair */
        union {
                __wsum                csum;
                struct {
                        __u16        csum_start;
                        __u16        csum_offset;
                };
        };
        __u32                        priority;
        int                        skb_iif;
        __u32                        hash;
        /* vlan_all overlays the vlan_proto/vlan_tci pair */
        union {
                u32                vlan_all;
                struct {
                        __be16        vlan_proto;
                        __u16        vlan_tci;
                };
        };
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
        union {
                unsigned int        napi_id;
                unsigned int        sender_cpu;
        };
#endif
#ifdef CONFIG_NETWORK_SECMARK
        __u32                secmark;
#endif

        union {
                __u32                mark;
                __u32                reserved_tailroom;
        };

        union {
                __be16                inner_protocol;
                __u8                inner_ipproto;
        };

        __u16                        inner_transport_header;
        __u16                        inner_network_header;
        __u16                        inner_mac_header;

        __be16                        protocol;
        __u16                        transport_header;
        __u16                        network_header;
        __u16                        mac_header;

#ifdef CONFIG_KCOV
        u64                        kcov_handle;
#endif

        ); /* end headers group */

        /* These elements must be at the end, see alloc_skb() for details.  */
        /* tail and end are offsets or pointers, see sk_buff_data_t */
        sk_buff_data_t                tail;
        sk_buff_data_t                end;
        unsigned char                *head,
                                *data;
        unsigned int                truesize;
        refcount_t                users;

#ifdef CONFIG_SKB_EXTENSIONS
        /* only usable after checking ->active_extensions != 0 */
        struct skb_ext                *extensions;
#endif
};

/* if you move pkt_type around you also must adapt those constants
 *
 * PKT_TYPE_MAX is the mask of the 3-bit pkt_type bitfield inside the byte
 * found at PKT_TYPE_OFFSET: the top three bits with big-endian bitfield
 * ordering, the bottom three otherwise.
 */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX        (7 << 5)
#else
#define PKT_TYPE_MAX        7
#endif
#define PKT_TYPE_OFFSET                offsetof(struct sk_buff, __pkt_type_offset)

/* if you move tc_at_ingress or tstamp_type
 * around, you also must adapt these constants.
 *
 * The masks select the 2-bit tstamp_type field and the tc_at_ingress bit
 * that follows it inside the byte found at SKB_BF_MONO_TC_OFFSET.
 * SKB_TSTAMP_TYPE_RSHIFT is only defined for big-endian bitfield ordering.
 */
#ifdef __BIG_ENDIAN_BITFIELD
#define SKB_TSTAMP_TYPE_MASK                (3 << 6)
#define SKB_TSTAMP_TYPE_RSHIFT                (6)
#define TC_AT_INGRESS_MASK                (1 << 5)
#else
#define SKB_TSTAMP_TYPE_MASK                (3)
#define TC_AT_INGRESS_MASK                (1 << 2)
#endif
#define SKB_BF_MONO_TC_OFFSET                offsetof(struct sk_buff, __mono_tc_offset)

#ifdef __KERNEL__
/*
 *        Handling routines are only of interest to the kernel
 */

/* Distinct single-bit flag values. alloc_skb_fclone() passes
 * SKB_ALLOC_FCLONE as the @flags argument of __alloc_skb().
 */
#define SKB_ALLOC_FCLONE        0x01
#define SKB_ALLOC_RX                0x02
#define SKB_ALLOC_NAPI                0x04

/**
 * skb_pfmemalloc - report whether the skb came from PFMEMALLOC reserves
 * @skb: buffer
 *
 * Returns: the value of the skb's pfmemalloc flag; the true case is
 * annotated as unlikely.
 */
static inline bool skb_pfmemalloc(const struct sk_buff *skb)
{
        const bool from_reserves = skb->pfmemalloc;

        return unlikely(from_reserves);
}

/*
 * skb might have a dst pointer attached, refcounted or not.
 * _skb_refdst low order bit is set if refcount was _not_ taken
 *
 * SKB_DST_NOREF is that low order bit; SKB_DST_PTRMASK is its complement
 * and is and-ed with _skb_refdst to obtain the pointer value (see
 * skb_dst()).
 */
#define SKB_DST_NOREF        1UL
#define SKB_DST_PTRMASK        ~(SKB_DST_NOREF)

/**
 * skb_dst - fetch the dst_entry attached to an skb
 * @skb: buffer
 *
 * The SKB_DST_NOREF bit is masked off before the value is returned, so
 * the result is the same whether or not a reference was taken.
 *
 * Returns: skb dst_entry, regardless of reference taken or not.
 */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
        /* A dst stored without a reference is expected to be read from
         * within an rcu_read_lock (or rcu_read_lock_bh) section only.
         */
        WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
                !(rcu_read_lock_held() || rcu_read_lock_bh_held()));

        return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}

/* Flag, through DEBUG_NET_WARN_ON_ONCE(), an skb whose _skb_refdst holds
 * a non-zero pointer part while the SKB_DST_NOREF bit is clear.
 */
static inline void skb_dst_check_unset(struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(!(skb->_skb_refdst & SKB_DST_NOREF) &&
                               (skb->_skb_refdst & SKB_DST_PTRMASK));
}

/**
 * skb_dstref_steal() - take the raw dst value out of an skb
 * @skb: buffer
 *
 * Reads _skb_refdst, zeroes it in the skb and hands the old value back.
 * No reference count is adjusted. This suits callers that need to
 * detach the dst_entry temporarily and put it back later with
 * skb_dstref_restore().
 *
 * The returned value is the raw field and may carry the SKB_DST_NOREF
 * bit, so it must not be used as a pointer directly.
 *
 * When in doubt, prefer skb_dst_drop() over skb_dstref_steal() to correctly
 * handle dst_entry reference counting.
 *
 * Returns: original skb dst_entry.
 */
static inline unsigned long skb_dstref_steal(struct sk_buff *skb)
{
        const unsigned long stolen = skb->_skb_refdst;

        skb->_skb_refdst = 0;

        return stolen;
}

/**
 * skb_dstref_restore() - restore skb dst_entry removed via skb_dstref_steal()
 * @skb: buffer
 * @refdst: dst entry from a call to skb_dstref_steal()
 *
 * Runs skb_dst_check_unset() on @skb first, then stores @refdst verbatim
 * (including any SKB_DST_NOREF bit) into the skb.
 */
static inline void skb_dstref_restore(struct sk_buff *skb, unsigned long refdst)
{
        skb_dst_check_unset(skb);
        skb->_skb_refdst = refdst;
}

/**
 * skb_dst_set - attach a refcounted dst to an skb
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst with the SKB_DST_NOREF bit clear: the caller is assumed to
 * already hold a reference, which skb_dst_drop() is expected to release.
 * A non-NULL @dst also sets the skb's slow_gro flag.
 */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
        skb_dst_check_unset(skb);

        if (dst)
                skb->slow_gro = 1;

        skb->_skb_refdst = (unsigned long)dst;
}

/**
 * skb_dst_set_noref - attach a dst to an skb without taking a reference
 * @skb: buffer
 * @dst: dst entry
 *
 * Sets skb dst, assuming a reference was not taken on dst.
 * If dst entry is cached, we do not take reference and dst_release
 * will be avoided by refdst_drop. If dst entry is not cached, we take
 * reference, so that last dst_release can destroy the dst immediately.
 *
 * Warns when called outside an rcu_read_lock / rcu_read_lock_bh section.
 * The stored value has SKB_DST_NOREF or-ed in, and a non-NULL @dst sets
 * the skb's slow_gro flag.
 */
static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
{
        skb_dst_check_unset(skb);
        WARN_ON(!(rcu_read_lock_held() || rcu_read_lock_bh_held()));

        if (dst)
                skb->slow_gro = 1;

        skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
}

/**
 * skb_dst_is_noref - Test if skb dst isn't refcounted
 * @skb: buffer
 */
static inline bool skb_dst_is_noref(const struct sk_buff *skb)
{
        return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
}

/* User space side applications such as nft, tc, etc may mangle
 * skb->pkt_type, but only a conservative subset of the possible
 * pkt_types is allowed: anything up to and including PACKET_OTHERHOST.
 */
static inline bool skb_pkt_type_ok(u32 ptype)
{
        return !(ptype > PACKET_OTHERHOST);
}

/**
 * skb_napi_id - Returns the skb's NAPI id
 * @skb: buffer
 *
 * Returns: @skb->napi_id when CONFIG_NET_RX_BUSY_POLL is defined,
 * 0 otherwise.
 */
static inline unsigned int skb_napi_id(const struct sk_buff *skb)
{
#ifndef CONFIG_NET_RX_BUSY_POLL
        return 0;
#else
        return skb->napi_id;
#endif
}

static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
{
#ifdef CONFIG_WIRELESS
        return skb->wifi_acked_valid;
#else
        return 0;
#endif
}

/**
 * skb_unref - drop one reference on an skb
 * @skb: buffer, may be NULL
 *
 * A NULL @skb is never freeable. When CONFIG_DEBUG_NET is not enabled
 * and the users count reads as 1, the atomic decrement is skipped and
 * only smp_rmb() is issued. In every other case the count is dropped
 * with refcount_dec_and_test().
 *
 * Returns: true if we can free the skb.
 */
static inline bool skb_unref(struct sk_buff *skb)
{
        if (unlikely(!skb))
                return false;

        if (!IS_ENABLED(CONFIG_DEBUG_NET) &&
            likely(refcount_read(&skb->users) == 1)) {
                smp_rmb();
                return true;
        }

        if (likely(!refcount_dec_and_test(&skb->users)))
                return false;

        return true;
}

/**
 * skb_data_unref - drop this skb's contribution to the shared dataref
 * @skb: buffer
 * @shinfo: shared info whose dataref counter is adjusted
 *
 * An skb that is not marked cloned returns true at once. Otherwise the
 * amount to subtract is 1, or (1 << SKB_DATAREF_SHIFT) + 1 when
 * @skb->nohdr is set. If dataref already equals that amount, only
 * smp_rmb() is issued; else the amount is subtracted atomically.
 *
 * Returns: true when dataref equalled the amount or the subtraction
 * brought it to zero, false otherwise.
 */
static inline bool skb_data_unref(const struct sk_buff *skb,
                                  struct skb_shared_info *shinfo)
{
        int bias;

        if (!skb->cloned)
                return true;

        if (skb->nohdr)
                bias = (1 << SKB_DATAREF_SHIFT) + 1;
        else
                bias = 1;

        if (atomic_read(&shinfo->dataref) == bias) {
                smp_rmb();
                return true;
        }

        return !atomic_sub_return(bias, &shinfo->dataref);
}

/* Prototype only. kfree_skb_reason() calls this with a NULL @sk. */
void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
                                      enum skb_drop_reason reason);

/**
 * kfree_skb_reason - sk_skb_reason_drop() without an associated socket
 * @skb: buffer to free
 * @reason: drop reason forwarded unchanged
 */
static inline void
kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        struct sock *no_sk = NULL;

        sk_skb_reason_drop(no_sk, skb, reason);
}

/**
 *        kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason
 *        @skb: buffer to free
 */
static inline void kfree_skb(struct sk_buff *skb)
{
        kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

/* Prototypes only. kfree_skb_list() forwards to kfree_skb_list_reason()
 * with SKB_DROP_REASON_NOT_SPECIFIED.
 */
void skb_release_head_state(struct sk_buff *skb);
void kfree_skb_list_reason(struct sk_buff *segs,
                           enum skb_drop_reason reason);
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
void skb_tx_error(struct sk_buff *skb);

static inline void kfree_skb_list(struct sk_buff *segs)
{
        kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED);
}

/* With CONFIG_TRACEPOINTS, consume_skb() is only declared here. Without
 * it, consume_skb() is an inline alias of kfree_skb().
 */
#ifdef CONFIG_TRACEPOINTS
void consume_skb(struct sk_buff *skb);
#else
static inline void consume_skb(struct sk_buff *skb)
{
        /* Plain call: a void function must not "return" an expression. */
        kfree_skb(skb);
}
#endif

/* Prototypes only; none of these bodies is in this part of the header. */
void __consume_stateless_skb(struct sk_buff *skb);
void  __kfree_skb(struct sk_buff *skb);

void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize);

/* __alloc_skb() is the backend of the alloc_skb() and alloc_skb_fclone()
 * wrappers, which both pass NUMA_NO_NODE as @node.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
                            int node);
struct sk_buff *__build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size);
void skb_attempt_defer_free(struct sk_buff *skb);

u32 napi_skb_cache_get_bulk(void **skbs, u32 n);
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
struct sk_buff *slab_build_skb(void *data);

/**
 * alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Convenience wrapper: calls __alloc_skb() with no flags set and
 * NUMA_NO_NODE as the node.
 */
static inline struct sk_buff *alloc_skb(unsigned int size,
                                        gfp_t priority)
{
        const int no_flags = 0;

        return __alloc_skb(size, priority, no_flags, NUMA_NO_NODE);
}

/* Prototypes only. alloc_skb_with_frags() takes an @errcode output
 * pointer in addition to returning the skb.
 */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int max_page_order,
                                     int *errcode,
                                     gfp_t gfp_mask);
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first);

/* Layout of fast clones : [skb1][skb2][fclone_ref]
 *
 * skb_fclone_busy() recovers this container from a pointer to skb1 via
 * container_of(), then inspects fclone_ref and skb2.sk.
 */
struct sk_buff_fclones {
        /* First skb; the container_of() anchor in skb_fclone_busy(). */
        struct sk_buff        skb1;

        /* Second skb; its ->sk is compared in skb_fclone_busy(). */
        struct sk_buff        skb2;

        /* Reference count read by skb_fclone_busy(). */
        refcount_t        fclone_ref;
};

/**
 *        skb_fclone_busy - check if fclone is busy
 *        @sk: socket
 *        @skb: buffer
 *
 * Returns: true if skb is a fast clone, and its clone is not freed.
 * Some drivers call skb_orphan() in their ndo_start_xmit(),
 * so we also check that didn't happen.
 */
static inline bool skb_fclone_busy(const struct sock *sk,
                                   const struct sk_buff *skb)
{
        const struct sk_buff_fclones *fclones;

        fclones = container_of(skb, struct sk_buff_fclones, skb1);

        return skb->fclone == SKB_FCLONE_ORIG &&
               refcount_read(&fclones->fclone_ref) > 1 &&
               READ_ONCE(fclones->skb2.sk) == sk;
}

/**
 * alloc_skb_fclone - allocate a network buffer from fclone cache
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Convenience wrapper: calls __alloc_skb() with SKB_ALLOC_FCLONE as the
 * flags and NUMA_NO_NODE as the node.
 */
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
                                               gfp_t priority)
{
        const int flags = SKB_ALLOC_FCLONE;

        return __alloc_skb(size, priority, flags, NUMA_NO_NODE);
}

/* Prototypes only. __pskb_copy_fclone() is the backend of the
 * __pskb_copy() wrapper, which passes fclone = false.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
void skb_headers_offset_update(struct sk_buff *skb, int off);
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone);
/**
 * __pskb_copy - __pskb_copy_fclone() with fclone disabled
 * @skb: buffer to copy
 * @headroom: forwarded unchanged
 * @gfp_mask: allocation mask, forwarded unchanged
 *
 * Returns: whatever __pskb_copy_fclone() returns.
 */
static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
                                          gfp_t gfp_mask)
{
        const bool want_fclone = false;

        return __pskb_copy_fclone(skb, headroom, gfp_mask, want_fclone);
}

/* Prototypes only. pskb_expand_head() is what skb_unclone() and
 * skb_header_unclone() call (with zero nhead/ntail) when the buffer is
 * cloned; __skb_pad() is the backend of skb_pad(). Both sgvec helpers
 * are marked __must_check.
 */
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
                                     unsigned int headroom);
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom);
struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
                                int newtailroom, gfp_t priority);
int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                                     int offset, int len);
int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
                              int offset, int len);
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);

/**
 *        skb_pad                        -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *
 *        Ensure that a buffer is followed by a padding area that is zero
 *        filled. Used by network drivers which may DMA or transfer data
 *        beyond the buffer end onto the wire.
 *
 *        May return error in out of memory cases. The skb is freed on error.
 */
static inline int skb_pad(struct sk_buff *skb, int pad)
{
        return __skb_pad(skb, pad, true);
}
/* dev_kfree_skb() is a plain alias of consume_skb(). */
#define dev_kfree_skb(a)        consume_skb(a)

/* Prototype only. */
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size, size_t max_frags);

/* State object passed by pointer to skb_prepare_seq_read(),
 * skb_seq_read(), skb_abort_seq_read() and skb_copy_seq_read().
 * NOTE(review): the fields are maintained by those out-of-line helpers;
 * their exact meaning is not visible here, so confirm it against the
 * implementation before relying on any of them.
 */
struct skb_seq_state {
        __u32                lower_offset;
        __u32                upper_offset;
        __u32                frag_idx;
        __u32                stepped_offset;
        struct sk_buff        *root_skb;
        struct sk_buff        *cur_skb;
        __u8                *frag_data;
        __u32                frag_off;
};

/* Prototypes only. The four *_seq_read() helpers all operate on a
 * caller-provided struct skb_seq_state.
 */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st);
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st);
void skb_abort_seq_read(struct skb_seq_state *st);
int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len);

unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config);

/*
 * Packet hash types specify the type of hash in skb_set_hash.
 *
 * Hash types refer to the protocol layer addresses which are used to
 * construct a packet's hash. The hashes are used to differentiate or identify
 * flows of the protocol layer for the hash type. Hash types are either
 * layer-2 (L2), layer-3 (L3), or layer-4 (L4).
 *
 * Properties of hashes:
 *
 * 1) Two packets in different flows have different hash values
 * 2) Two packets in the same flow should have the same hash value
 *
 * A hash at a higher layer is considered to be more specific. A driver should
 * set the most specific hash possible.
 *
 * A driver cannot indicate a more specific hash than the layer at which a hash
 * was computed. For instance an L3 hash cannot be set as an L4 hash.
 *
 * A driver may indicate a hash level which is less specific than the
 * actual layer the hash was computed on. For instance, a hash computed
 * at L4 may be considered an L3 hash. This should only be done if the
 * driver can't unambiguously determine that the HW computed the hash at
 * the higher layer. Note that the "should" in the second property above
 * permits this.
 */
/* skb_set_hash() only distinguishes PKT_HASH_TYPE_L4 from the rest:
 * it sets l4_hash when the type is L4 and clears it for every other
 * value.
 */
enum pkt_hash_types {
        PKT_HASH_TYPE_NONE,        /* Undefined type */
        PKT_HASH_TYPE_L2,        /* Input: src_MAC, dest_MAC */
        PKT_HASH_TYPE_L3,        /* Input: src_IP, dst_IP */
        PKT_HASH_TYPE_L4,        /* Input: src_IP, dst_IP, src_port, dst_port */
};

/* Forget any stored hash: zero the hash value and both the sw_hash and
 * l4_hash flags.
 */
static inline void skb_clear_hash(struct sk_buff *skb)
{
        skb->l4_hash = 0;
        skb->sw_hash = 0;
        skb->hash = 0;
}

/* Clear the stored hash unless it is flagged as an L4 hash. */
static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
{
        if (skb->l4_hash)
                return;

        skb_clear_hash(skb);
}

/**
 * __skb_set_hash - store a hash value together with its two flags
 * @skb: buffer
 * @hash: value stored in @skb->hash
 * @is_sw: value stored in @skb->sw_hash
 * @is_l4: value stored in @skb->l4_hash
 */
static inline void
__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4)
{
        skb->hash = hash;
        skb->sw_hash = is_sw;
        skb->l4_hash = is_l4;
}

/**
 * skb_set_hash - record a hash obtained from hardware
 * @skb: buffer
 * @hash: hash value
 * @type: hash type; only PKT_HASH_TYPE_L4 sets the l4_hash flag
 *
 * Used by drivers to set hash from HW, so sw_hash is always cleared.
 */
static inline void
skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
{
        const bool is_l4 = (type == PKT_HASH_TYPE_L4);

        __skb_set_hash(skb, hash, false, is_l4);
}

/**
 * __skb_set_sw_hash - record a hash and flag it as sw_hash
 * @skb: buffer
 * @hash: hash value
 * @is_l4: value stored in the l4_hash flag
 */
static inline void
__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
{
        const bool is_sw = true;

        __skb_set_hash(skb, hash, is_sw, is_l4);
}

u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb);

/* __skb_get_hash_symmetric_net() called with a NULL net. */
static inline u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
{
        const struct net *no_net = NULL;

        return __skb_get_hash_symmetric_net(no_net, skb);
}

/* Prototypes only. __skb_get_hash_net() is invoked by skb_get_hash_net()
 * and skb_get_hash() when neither l4_hash nor sw_hash is set.
 */
void __skb_get_hash_net(const struct net *net, struct sk_buff *skb);
u32 skb_get_poff(const struct sk_buff *skb);
u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen);
__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                          const void *data, int hlen_proto);

void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
                             const struct flow_dissector_key *key,
                             unsigned int key_count);

struct bpf_flow_dissector;
u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                     __be16 proto, int nhoff, int hlen, unsigned int flags);

/* Backend of the skb_flow_dissect*() inline wrappers. */
bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        __be16 proto, int nhoff, int hlen, unsigned int flags);

/**
 * skb_flow_dissect - __skb_flow_dissect() with skb-only defaults
 * @skb: buffer
 * @flow_dissector: dissector forwarded unchanged
 * @target_container: output container forwarded unchanged
 * @flags: forwarded unchanged
 *
 * The backend is called with a NULL net, NULL data and zero
 * proto/nhoff/hlen.
 *
 * Returns: whatever __skb_flow_dissect() returns.
 */
static inline bool skb_flow_dissect(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
                                    void *target_container, unsigned int flags)
{
        const struct net *no_net = NULL;
        const void *no_data = NULL;

        return __skb_flow_dissect(no_net, skb, flow_dissector,
                                  target_container, no_data, 0, 0, 0, flags);
}

/**
 * skb_flow_dissect_flow_keys - dissect an skb into a zeroed flow_keys
 * @skb: buffer
 * @flow: output; cleared with memset() before dissection
 * @flags: forwarded unchanged
 *
 * Uses flow_keys_dissector, with a NULL net, NULL data and zero
 * proto/nhoff/hlen.
 *
 * Returns: whatever __skb_flow_dissect() returns.
 */
static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
                                              struct flow_keys *flow,
                                              unsigned int flags)
{
        const struct net *no_net = NULL;
        const void *no_data = NULL;

        memset(flow, 0, sizeof(*flow));

        return __skb_flow_dissect(no_net, skb, &flow_keys_dissector,
                                  flow, no_data, 0, 0, 0, flags);
}

/**
 * skb_flow_dissect_flow_keys_basic - dissect into a zeroed flow_keys_basic
 * @net: forwarded unchanged
 * @skb: buffer
 * @flow: output; cleared with memset() before dissection
 * @data: forwarded unchanged
 * @proto: forwarded unchanged
 * @nhoff: forwarded unchanged
 * @hlen: forwarded unchanged
 * @flags: forwarded unchanged
 *
 * Uses flow_keys_basic_dissector.
 *
 * Returns: whatever __skb_flow_dissect() returns.
 */
static inline bool
skb_flow_dissect_flow_keys_basic(const struct net *net,
                                 const struct sk_buff *skb,
                                 struct flow_keys_basic *flow,
                                 const void *data, __be16 proto,
                                 int nhoff, int hlen, unsigned int flags)
{
        struct flow_dissector *dissector = &flow_keys_basic_dissector;

        memset(flow, 0, sizeof(*flow));

        return __skb_flow_dissect(net, skb, dissector, flow,
                                  data, proto, nhoff, hlen, flags);
}

/* Prototypes only for the remaining skb_flow_dissect_*() helpers. Each
 * takes the skb, a flow_dissector and an untyped target container.
 */
void skb_flow_dissect_meta(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

/* Gets a skb connection tracking info, ctinfo map should be a
 * map of mapsize to translate enum ip_conntrack_info states
 * to user states.
 */
void
skb_flow_dissect_ct(const struct sk_buff *skb,
                    struct flow_dissector *flow_dissector,
                    void *target_container,
                    u16 *ctinfo_map, size_t mapsize,
                    bool post_ct, u16 zone);
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
                             void *target_container);

void skb_flow_dissect_hash(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

static inline __u32 skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
        if (!skb->l4_hash && !skb->sw_hash)
                __skb_get_hash_net(net, skb);

        return skb->hash;
}

static inline __u32 skb_get_hash(struct sk_buff *skb)
{
        if (!skb->l4_hash && !skb->sw_hash)
                __skb_get_hash_net(NULL, skb);

        return skb->hash;
}

/**
 * skb_get_hash_flowi6 - return the skb hash, deriving it from a flowi6
 * @skb: buffer
 * @fl6: flow description handed to __get_hash_from_flowi6()
 *
 * When neither l4_hash nor sw_hash is set, the hash is computed from
 * @fl6 and stored as a software hash; l4_hash then follows
 * flow_keys_have_l4() on the resulting keys.
 *
 * Returns: @skb->hash.
 */
static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
{
        struct flow_keys keys;
        __u32 hash;

        if (skb->l4_hash || skb->sw_hash)
                return skb->hash;

        hash = __get_hash_from_flowi6(fl6, &keys);
        __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));

        return skb->hash;
}

/* Prototype only. Takes a const skb and a siphash key. */
__u32 skb_get_hash_perturb(const struct sk_buff *skb,
                           const siphash_key_t *perturb);

/* Return the stored hash as is, without computing it on demand. */
static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
{
        const __u32 stored = skb->hash;

        return stored;
}

/**
 * skb_copy_hash - copy the hash value and its flags between skbs
 * @to: destination buffer
 * @from: source buffer
 *
 * Copies hash, sw_hash and l4_hash.
 */
static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
{
        to->hash = from->hash;
        to->sw_hash = from->sw_hash;
        to->l4_hash = from->l4_hash;
}

/**
 * skb_cmp_decrypted - compare the decrypted flag of two skbs
 * @skb1: first buffer
 * @skb2: second buffer
 *
 * Returns: @skb2->decrypted minus @skb1->decrypted when
 * CONFIG_SKB_DECRYPTED is defined (zero means the flags match),
 * always 0 otherwise.
 */
static inline int skb_cmp_decrypted(const struct sk_buff *skb1,
                                    const struct sk_buff *skb2)
{
#ifndef CONFIG_SKB_DECRYPTED
        return 0;
#else
        return skb2->decrypted - skb1->decrypted;
#endif
}

/**
 * skb_is_decrypted - read the skb's decrypted flag
 * @skb: buffer
 *
 * Returns: @skb->decrypted when CONFIG_SKB_DECRYPTED is defined,
 * false otherwise.
 */
static inline bool skb_is_decrypted(const struct sk_buff *skb)
{
#ifndef CONFIG_SKB_DECRYPTED
        return false;
#else
        return skb->decrypted;
#endif
}

/**
 * skb_copy_decrypted - copy the decrypted flag between skbs
 * @to: destination buffer
 * @from: source buffer
 *
 * Does nothing unless CONFIG_SKB_DECRYPTED is defined.
 */
static inline void skb_copy_decrypted(struct sk_buff *to,
                                      const struct sk_buff *from)
{
#ifdef CONFIG_SKB_DECRYPTED
        to->decrypted = from->decrypted;
#endif
}

/*
 * Accessors for skb->end. With NET_SKBUFF_DATA_USES_OFFSET the field is
 * used as an offset from skb->head; otherwise it is used as a pointer.
 * Either way:
 *   skb_end_pointer()    yields the end as a pointer,
 *   skb_end_offset()     yields the end as an offset from skb->head,
 *   skb_set_end_offset() sets the end from such an offset.
 */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        return base + skb->end;
}

static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
        const unsigned int off = skb->end;

        return off;
}

static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
{
        /* The field already is an offset: store it directly. */
        skb->end = offset;
}
#else
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
        unsigned char *end = skb->end;

        return end;
}

static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
        /* Pointer difference, narrowed to unsigned int on return. */
        return skb->end - skb->head;
}

static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
{
        unsigned char *base = skb->head;

        skb->end = base + offset;
}
#endif

/* Declarations only. net_zcopy_put_abort() compares a uarg's ->ops with
 * the address of msg_zerocopy_ubuf_ops to decide whether to call
 * msg_zerocopy_put_abort().
 */
extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;

struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
                                       struct ubuf_info *uarg, bool devmem);

void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);

struct net_devmem_dmabuf_binding;

/* Backend of skb_zerocopy_iter_dgram(), which passes a NULL @binding. */
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                            struct sk_buff *skb, struct iov_iter *from,
                            size_t length,
                            struct net_devmem_dmabuf_binding *binding);

int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
                                struct iov_iter *from, size_t length);

/**
 * skb_zerocopy_iter_dgram - __zerocopy_sg_from_iter() for a datagram skb
 * @skb: buffer; its ->sk is used as the socket argument
 * @msg: message whose msg_iter is used as the iterator
 * @len: length forwarded to the backend
 *
 * No dmabuf binding is passed (NULL).
 *
 * Returns: whatever __zerocopy_sg_from_iter() returns.
 */
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
                                          struct msghdr *msg, int len)
{
        struct sock *sk = skb->sk;
        struct iov_iter *from = &msg->msg_iter;

        return __zerocopy_sg_from_iter(msg, sk, skb, from, len, NULL);
}

/* Prototype only. */
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg,
                             struct net_devmem_dmabuf_binding *binding);

/* Internal
 *
 * skb_shinfo() reinterprets the address returned by skb_end_pointer()
 * as a struct skb_shared_info pointer. The cast yields a non-const
 * pointer even when SKB is const-qualified.
 */
#define skb_shinfo(SKB)        ((struct skb_shared_info *)(skb_end_pointer(SKB)))

static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
{
        return &skb_shinfo(skb)->hwtstamps;
}

/**
 * skb_zcopy - return the skb's uarg if zerocopy is enabled on it
 * @skb: buffer, may be NULL
 *
 * Returns: skb_uarg(@skb) when @skb is non-NULL and its shared-info
 * flags contain SKBFL_ZEROCOPY_ENABLE, NULL otherwise.
 */
static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
{
        if (!skb)
                return NULL;

        if (!(skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE))
                return NULL;

        return skb_uarg(skb);
}

static inline bool skb_zcopy_pure(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
}

static inline bool skb_zcopy_managed(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
}

/* True when both skbs agree on skb_zcopy_pure(): both pure or both not. */
static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
                                       const struct sk_buff *skb2)
{
        const bool pure1 = skb_zcopy_pure(skb1);
        const bool pure2 = skb_zcopy_pure(skb2);

        return pure1 == pure2;
}

/* Take one reference on @uarg (increments uarg->refcnt). @uarg is
 * dereferenced unconditionally, so it must not be NULL.
 */
static inline void net_zcopy_get(struct ubuf_info *uarg)
{
        refcount_inc(&uarg->refcnt);
}

static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg)
{
        skb_shinfo(skb)->destructor_arg = uarg;
        skb_shinfo(skb)->flags |= uarg->flags;
}

/**
 * skb_zcopy_set - attach a uarg to an skb that has none yet
 * @skb: buffer, may be NULL
 * @uarg: zerocopy descriptor, may be NULL
 * @have_ref: optional; when it points at true, that reference is consumed
 *            (the flag is cleared) instead of taking a new one
 *
 * Nothing happens if @skb or @uarg is NULL, or if skb_zcopy() already
 * returns a uarg for @skb.
 */
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
                                 bool *have_ref)
{
        if (!skb || !uarg || skb_zcopy(skb))
                return;

        if (unlikely(have_ref && *have_ref))
                *have_ref = false;
        else
                net_zcopy_get(uarg);

        skb_zcopy_init(skb, uarg);
}

static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
{
        skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL);
        skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
}

static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb)
{
        return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL;
}

static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
{
        return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL);
}

/* Invoke @uarg's ->complete() callback with a NULL skb and success set
 * to true. A NULL @uarg is a no-op.
 */
static inline void net_zcopy_put(struct ubuf_info *uarg)
{
        if (!uarg)
                return;

        uarg->ops->complete(NULL, uarg, true);
}

/**
 * net_zcopy_put_abort - abort path counterpart of net_zcopy_put()
 * @uarg: zerocopy descriptor, may be NULL (then nothing happens)
 * @have_uref: whether the caller holds a reference to give back
 *
 * A uarg whose ops are msg_zerocopy_ubuf_ops is handed to
 * msg_zerocopy_put_abort(); any other uarg is passed to net_zcopy_put(),
 * but only when @have_uref is true.
 */
static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
        if (!uarg)
                return;

        if (uarg->ops == &msg_zerocopy_ubuf_ops) {
                msg_zerocopy_put_abort(uarg, have_uref);
                return;
        }

        if (have_uref)
                net_zcopy_put(uarg);
}

/* Release a reference on a zerocopy structure.
 *
 * Nothing happens when skb_zcopy() returns NULL for @skb. Otherwise the
 * uarg's ->complete() callback runs with @zerocopy_success, unless the
 * skb is in the "nouarg" form, and finally every SKBFL_ALL_ZEROCOPY bit
 * is cleared from the shared-info flags.
 */
static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
{
        struct ubuf_info *uarg = skb_zcopy(skb);

        if (!uarg)
                return;

        if (!skb_zcopy_is_nouarg(skb))
                uarg->ops->complete(skb, uarg, zerocopy_success);

        skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
}

void __skb_zcopy_downgrade_managed(struct sk_buff *skb);

/* Call __skb_zcopy_downgrade_managed() only for skbs on which
 * skb_zcopy_managed() is true; that case is annotated as unlikely.
 */
static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
        if (likely(!skb_zcopy_managed(skb)))
                return;

        __skb_zcopy_downgrade_managed(skb);
}

/* Return true if frags in this skb are readable by the host, i.e. the
 * skb's unreadable flag is clear.
 */
static inline bool skb_frags_readable(const struct sk_buff *skb)
{
        return skb->unreadable == 0;
}

/* Mark the skb as not linked on any list by clearing its ->next. */
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
        skb->next = NULL;
}

/* With CONFIG_DEBUG_NET, set ->next to SKB_LIST_POISON_NEXT; otherwise
 * this is a no-op.
 */
static inline void skb_poison_list(struct sk_buff *skb)
{
#ifdef CONFIG_DEBUG_NET
        skb->next = SKB_LIST_POISON_NEXT;
#endif
}

/* Iterate through singly-linked GSO fragments of an skb.
 *
 * @first: skb to start from; a NULL value gives zero iterations
 * @skb: cursor, assigned by the macro
 * @next_skb: scratch variable, assigned by the macro
 *
 * (skb)->next is loaded into @next_skb before each iteration's body and
 * the loop does not read (skb)->next again afterwards. @skb and
 * @next_skb are expanded several times, so pass plain variables without
 * side effects.
 */
#define skb_list_walk_safe(first, skb, next_skb)                               \
        for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb);  \
             (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL)

static inline void skb_list_del_init(struct sk_buff *skb)
{
        __list_del_entry(&skb->list);
        skb_mark_not_on_list(skb);
}

/**
 *        skb_queue_empty - check if a queue is empty
 *        @list: queue head
 *
 *        The queue is empty when the head's next pointer refers back to
 *        the head itself.
 *
 *        Returns true if the queue is empty, false otherwise.
 */
static inline int skb_queue_empty(const struct sk_buff_head *list)
{
        const struct sk_buff *head = (const struct sk_buff *) list;

        return list->next == head;
}

/**
 *        skb_queue_empty_lockless - check if a queue is empty
 *        @list: queue head
 *
 *        Same test as skb_queue_empty(), but the head's next pointer is
 *        read with READ_ONCE(). This variant can be used in lockless
 *        contexts.
 *
 *        Returns true if the queue is empty, false otherwise.
 */
static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list)
{
        const struct sk_buff *head = (const struct sk_buff *) list;

        return READ_ONCE(list->next) == head;
}


/**
 *        skb_queue_is_last - check if skb is the last entry in the queue
 *        @list: queue head
 *        @skb: buffer
 *
 *        @skb is last when its next pointer refers to the queue head.
 *
 *        Returns true if @skb is the last buffer on the list.
 */
static inline bool skb_queue_is_last(const struct sk_buff_head *list,
                                     const struct sk_buff *skb)
{
        const struct sk_buff *head = (const struct sk_buff *) list;

        return skb->next == head;
}

/**
 *        skb_queue_is_first - check if skb is the first entry in the queue
 *        @list: queue head
 *        @skb: buffer
 *
 *        @skb is first when its prev pointer refers to the queue head.
 *
 *        Returns true if @skb is the first buffer on the list.
 */
static inline bool skb_queue_is_first(const struct sk_buff_head *list,
                                      const struct sk_buff *skb)
{
        const struct sk_buff *head = (const struct sk_buff *) list;

        return skb->prev == head;
}

/**
 *        skb_queue_next - return the next packet in the queue
 *        @list: queue head
 *        @skb: current buffer
 *
 *        Return the next packet in @list after @skb.  It is only valid to
 *        call this if skb_queue_is_last() evaluates to false; calling it
 *        on the last buffer hits a BUG_ON().
 */
static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        struct sk_buff *successor;

        /* Harsh, but returning here would only hand the caller garbage
         * (the queue head) to dereference.
         */
        BUG_ON(skb_queue_is_last(list, skb));

        successor = skb->next;
        return successor;
}

/**
 *        skb_queue_prev - return the prev packet in the queue
 *        @list: queue head
 *        @skb: current buffer
 *
 *        Return the prev packet in @list before @skb.  It is only valid to
 *        call this if skb_queue_is_first() evaluates to false; calling it
 *        on the first buffer hits a BUG_ON().
 */
static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        struct sk_buff *predecessor;

        /* Harsh, but returning here would only hand the caller garbage
         * (the queue head) to dereference.
         */
        BUG_ON(skb_queue_is_first(list, skb));

        predecessor = skb->prev;
        return predecessor;
}

/**
 *        skb_get - reference buffer
 *        @skb: buffer to reference
 *
 *        Makes another reference to a socket buffer (increments
 *        @skb->users) and returns a pointer to the buffer. @skb is
 *        dereferenced unconditionally, so it must not be NULL.
 */
static inline struct sk_buff *skb_get(struct sk_buff *skb)
{
        refcount_inc(&skb->users);
        return skb;
}

/*
 * If users == 1, we are the only owner and can avoid redundant atomic changes.
 */

/**
 *        skb_cloned - is the buffer a clone
 *        @skb: buffer to check
 *
 *        Returns true if the buffer was generated with skb_clone() and is
 *        one of multiple shared copies of the buffer. Cloned buffers are
 *        shared data so must not be written to under normal circumstances.
 *
 *        Concretely: the cloned flag is set and the SKB_DATAREF_MASK part
 *        of the shared dataref counter is not 1.
 */
static inline int skb_cloned(const struct sk_buff *skb)
{
        int refs;

        if (!skb->cloned)
                return 0;

        refs = atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK;

        return refs != 1;
}

/**
 * skb_unclone - give a cloned skb its own head
 * @skb: buffer
 * @pri: allocation mask; may-sleep is asserted when it allows blocking
 *
 * Returns: 0 when the skb is not cloned, otherwise the result of
 * pskb_expand_head() called with no extra head or tail room.
 */
static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/* Variant of skb_unclone() that keeps skb->truesize and skb_end_offset()
 * unchanged whenever a new skb->head is needed.
 *
 * This matters because ksize(kmalloc(X)) == ksize(kmalloc(X)) is not
 * guaranteed when various debugging features are in place.
 *
 * Returns 0 when the skb is not cloned, otherwise the result of
 * __skb_unclone_keeptruesize().
 */
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri);
static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return 0;

        return __skb_unclone_keeptruesize(skb, pri);
}

/**
 *        skb_header_cloned - is the header a clone
 *        @skb: buffer to check
 *
 *        Returns true if modifying the header part of the buffer requires
 *        the data to be copied.
 *
 *        For a cloned skb the shared dataref counter is split into its
 *        SKB_DATAREF_MASK part and the part above SKB_DATAREF_SHIFT; the
 *        header counts as cloned unless the former minus the latter is 1.
 */
static inline int skb_header_cloned(const struct sk_buff *skb)
{
        int raw, low, high;

        if (!skb->cloned)
                return 0;

        raw = atomic_read(&skb_shinfo(skb)->dataref);
        low = raw & SKB_DATAREF_MASK;
        high = raw >> SKB_DATAREF_SHIFT;

        return (low - high) != 1;
}

/**
 * skb_header_unclone - give an skb with a cloned header its own head
 * @skb: buffer
 * @pri: allocation mask; may-sleep is asserted when it allows blocking
 *
 * Returns: 0 when skb_header_cloned() is false, otherwise the result of
 * pskb_expand_head() called with no extra head or tail room.
 */
static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_header_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/**
 * __skb_header_release() - allow clones to use the headroom
 * @skb: buffer to operate on
 *
 * Sets the nohdr flag and resets the shared dataref counter to
 * (1 << SKB_DATAREF_SHIFT) + 1.
 *
 * See "DOC: dataref and headerless skbs".
 */
static inline void __skb_header_release(struct sk_buff *skb)
{
        const int released_ref = (1 << SKB_DATAREF_SHIFT) + 1;

        skb->nohdr = 1;
        atomic_set(&skb_shinfo(skb)->dataref, released_ref);
}


/**
 *        skb_shared - is the buffer shared
 *        @skb: buffer to check
 *
 *        Returns true if more than one person has a reference to this
 *        buffer, i.e. the users count is anything other than 1.
 */
static inline int skb_shared(const struct sk_buff *skb)
{
        const unsigned int users = refcount_read(&skb->users);

        return users != 1;
}

/**
 *        skb_share_check - check if buffer is shared and if so clone it
 *        @skb: buffer to check
 *        @pri: priority for memory allocation
 *
 *        An unshared buffer is returned as is. A shared buffer is cloned
 *        with skb_clone(); the caller's reference on the old copy is
 *        dropped and the clone is returned. When called from interrupt
 *        context or with spinlocks held, @pri must be GFP_ATOMIC.
 *
 *        NULL is returned on a memory allocation failure; the reference
 *        on the original is dropped in that case too, via kfree_skb().
 */
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
{
        struct sk_buff *nskb;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_shared(skb))
                return skb;

        nskb = skb_clone(skb, pri);
        if (likely(nskb))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return nskb;
}

/*
 *        Copy shared buffers into a new sk_buff. We effectively do COW on
 *        packets to handle cases where we have a local reader and forward
 *        and a couple of other messy ones. The normal one is tcpdumping
 *        a packet that's being forwarded.
 */

/**
 *        skb_unshare - make a copy of a shared buffer
 *        @skb: buffer to check
 *        @pri: priority for memory allocation
 *
 *        If the socket buffer is a clone then this function creates a new
 *        copy of the data, drops a reference count on the old copy and returns
 *        the new copy with the reference count at 1. If the buffer is not a clone
 *        the original buffer is returned. When called with a spinlock held or
 *        from interrupt state @pri must be %GFP_ATOMIC
 *
 *        %NULL is returned on a memory allocation failure; the reference on
 *        @skb is still dropped (through kfree_skb()) in that case.
 */
static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
                                          gfp_t pri)
{
        struct sk_buff *copy;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return skb;

        copy = skb_copy(skb, pri);

        /* Either way we let go of our reference on the shared original. */
        if (likely(copy))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return copy;
}

/**
 *        skb_peek - peek at the head of an &sk_buff_head
 *        @list_: list to peek at
 *
 *        Peek an &sk_buff. Unlike most other operations you _MUST_
 *        be careful with this one. A peek leaves the buffer on the
 *        list and someone else may run off with it. You must hold
 *        the appropriate locks or have a private queue to do this.
 *
 *        Returns %NULL for an empty list or a pointer to the head element.
 *        The reference count is not incremented and the reference is therefore
 *        volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        /* On an empty list ->next points back at the list head itself. */
        return first == (struct sk_buff *)list_ ? NULL : first;
}

/**
 *        __skb_peek - peek at the head of a non-empty &sk_buff_head
 *        @list_: list to peek at
 *
 *        Like skb_peek(), but the caller knows that the list is not empty,
 *        so ->next is returned without comparing it against the list head.
 */
static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        return first;
}

/**
 *        skb_peek_next - peek skb following the given one from a queue
 *        @skb: skb to start from
 *        @list_: list to peek at
 *
 *        Returns %NULL when the end of the list is met or a pointer to the
 *        next element. The reference count is not incremented and the
 *        reference is therefore volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek_next(struct sk_buff *skb,
                const struct sk_buff_head *list_)
{
        struct sk_buff *following = skb->next;

        /* Reaching the list head again means @skb was the last element. */
        return following == (struct sk_buff *)list_ ? NULL : following;
}

/**
 *        skb_peek_tail - peek at the tail of an &sk_buff_head
 *        @list_: list to peek at
 *
 *        Peek an &sk_buff. Unlike most other operations you _MUST_
 *        be careful with this one. A peek leaves the buffer on the
 *        list and someone else may run off with it. You must hold
 *        the appropriate locks or have a private queue to do this.
 *
 *        Returns %NULL for an empty list or a pointer to the tail element.
 *        The reference count is not incremented and the reference is therefore
 *        volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
{
        /* ->prev is read exactly once; see the WRITE_ONCE() in __skb_insert(). */
        struct sk_buff *last = READ_ONCE(list_->prev);

        return last == (struct sk_buff *)list_ ? NULL : last;
}

/**
 *        skb_queue_len        - get queue length
 *        @list_: list to measure
 *
 *        Return the length of an &sk_buff queue, as recorded in ->qlen.
 *        The field is read with a plain load; see skb_queue_len_lockless()
 *        for the READ_ONCE() variant.
 */
static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
{
        const __u32 count = list_->qlen;

        return count;
}

/**
 *        skb_queue_len_lockless        - get queue length
 *        @list_: list to measure
 *
 *        Return the length of an &sk_buff queue.
 *        This variant can be used in lockless contexts: ->qlen is read
 *        with READ_ONCE().
 */
static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_)
{
        return READ_ONCE(list_->qlen);
}

/**
 *        __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head
 *        @list: queue to initialize
 *
 *        This initializes only the list and queue length aspects of
 *        an sk_buff_head object.  This allows to initialize the list
 *        aspects of an sk_buff_head without reinitializing things like
 *        the spinlock.  It can also be used for on-stack sk_buff_head
 *        objects where the spinlock is known to not be used.
 */
static inline void __skb_queue_head_init(struct sk_buff_head *list)
{
        struct sk_buff *self = (struct sk_buff *)list;

        /* An empty queue links back to its own head in both directions. */
        list->next = self;
        list->prev = self;
        list->qlen = 0;
}

/*
 * This function creates a split out lock class for each invocation;
 * this is needed for now since a whole lot of users of the skb-queue
 * infrastructure in drivers have different locking usage (in hardirq)
 * than the networking core (in softirq only). In the long run either the
 * network layer or drivers should need annotation to consolidate the
 * main types of usage into 3 classes.
 *
 * Initialises the queue's spinlock and then its list pointers and length.
 */
static inline void skb_queue_head_init(struct sk_buff_head *list)
{
        spin_lock_init(&list->lock);
        __skb_queue_head_init(list);
}

/*
 * Same as skb_queue_head_init(), but afterwards assigns the caller supplied
 * lockdep class @class to the queue's spinlock.
 */
static inline void skb_queue_head_init_class(struct sk_buff_head *list,
                struct lock_class_key *class)
{
        skb_queue_head_init(list);
        lockdep_set_class(&list->lock, class);
}

/*
 *        Insert an sk_buff on a list.
 *
 *        The "__skb_xxxx()" functions are the non-atomic ones that
 *        can only be called with interrupts disabled.
 *
 *        @newsk is linked between @prev and @next and the queue length of
 *        @list is incremented. Either neighbour may be the list head
 *        itself, which is why both are accessed through a cast to
 *        struct sk_buff_list.
 */
static inline void __skb_insert(struct sk_buff *newsk,
                                struct sk_buff *prev, struct sk_buff *next,
                                struct sk_buff_head *list)
{
        /* See skb_queue_empty_lockless() and skb_peek_tail()
         * for the opposite READ_ONCE()
         */
        WRITE_ONCE(newsk->next, next);
        WRITE_ONCE(newsk->prev, prev);
        WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk);
        WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk);
        WRITE_ONCE(list->qlen, list->qlen + 1);
}

/*
 * Link every buffer of @list between @prev and @next.
 *
 * Only the four boundary pointers are rewritten; @list itself and any queue
 * length are left untouched, so callers adjust ->qlen (and reinitialise
 * @list if wanted) themselves. @list's ->next and ->prev are used as the
 * first and last buffer without an emptiness check; the callers in this
 * file test skb_queue_empty() first.
 */
static inline void __skb_queue_splice(const struct sk_buff_head *list,
                                      struct sk_buff *prev,
                                      struct sk_buff *next)
{
        struct sk_buff *first = list->next;
        struct sk_buff *last = list->prev;

        /* Attach the front of the spliced run to @prev ... */
        WRITE_ONCE(first->prev, prev);
        WRITE_ONCE(prev->next, first);

        /* ... and its back to @next. */
        WRITE_ONCE(last->next, next);
        WRITE_ONCE(next->prev, last);
}

/**
 *        skb_queue_splice - join two skb lists, this is designed for stacks
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are linked in directly after the head of
 *        @head and @head's length is increased accordingly. @list itself
 *        is not modified. Nothing happens when @list is empty.
 */
static inline void skb_queue_splice(const struct sk_buff_head *list,
                                    struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *) head, head->next);
        head->qlen += list->qlen;
}

/**
 *        skb_queue_splice_init - join two skb lists and reinitialise the emptied list
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are linked in directly after the head of
 *        @head. The list at @list is reinitialised. Nothing happens when
 *        @list is empty.
 */
static inline void skb_queue_splice_init(struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *) head, head->next);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 *        skb_queue_splice_tail - join two skb lists, each list being a queue
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are linked in after the current last buffer
 *        of @head and @head's length is increased accordingly. @list itself
 *        is not modified. Nothing happens when @list is empty.
 */
static inline void skb_queue_splice_tail(const struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
        head->qlen += list->qlen;
}

/**
 *        skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        Each of the lists is a queue. The buffers of @list are linked in
 *        after the current last buffer of @head.
 *        The list at @list is reinitialised. Nothing happens when @list is
 *        empty.
 */
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
                                              struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 *        __skb_queue_after - queue a buffer after a given buffer
 *        @list: list to use
 *        @prev: place after this buffer
 *        @newsk: buffer to queue
 *
 *        Queue a buffer in the middle of a list. This function takes no locks
 *        and you must therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_after(struct sk_buff_head *list,
                                     struct sk_buff *prev,
                                     struct sk_buff *newsk)
{
        /* @prev may be the list head, hence the sk_buff_list view of it. */
        struct sk_buff *following = ((struct sk_buff_list *)prev)->next;

        __skb_insert(newsk, prev, following, list);
}

void skb_append(struct sk_buff *old, struct sk_buff *newsk,
                struct sk_buff_head *list);

/*
 * Queue @newsk on @list immediately in front of @next. Takes no locks; the
 * caller is responsible for serialisation.
 */
static inline void __skb_queue_before(struct sk_buff_head *list,
                                      struct sk_buff *next,
                                      struct sk_buff *newsk)
{
        /* @next may be the list head, hence the sk_buff_list view of it. */
        struct sk_buff *preceding = ((struct sk_buff_list *)next)->prev;

        __skb_insert(newsk, preceding, next, list);
}

/**
 *        __skb_queue_head - queue a buffer at the list head
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the start of a list. This function takes no locks
 *        and you must therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_head(struct sk_buff_head *list,
                                    struct sk_buff *newsk)
{
        /* Inserting right after the list head makes @newsk the first entry. */
        struct sk_buff *anchor = (struct sk_buff *)list;

        __skb_queue_after(list, anchor, newsk);
}
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);

/**
 *        __skb_queue_tail - queue a buffer at the list tail
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the end of a list. This function takes no locks
 *        and you must therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_tail(struct sk_buff_head *list,
                                   struct sk_buff *newsk)
{
        /* Inserting right before the list head makes @newsk the last entry. */
        struct sk_buff *anchor = (struct sk_buff *)list;

        __skb_queue_before(list, anchor, newsk);
}
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);

/*
 * Remove an sk_buff from a list. _Must_ be called atomically, and with
 * the list known.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list);
/*
 * Unlink @skb from @list without taking any lock: decrement the queue
 * length, clear @skb's own link pointers and make its former neighbours
 * point at each other.
 */
static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        struct sk_buff *next, *prev;

        WRITE_ONCE(list->qlen, list->qlen - 1);
        /* Save the neighbours before the links in @skb are wiped. */
        next           = skb->next;
        prev           = skb->prev;
        skb->next  = skb->prev = NULL;
        WRITE_ONCE(next->prev, prev);
        WRITE_ONCE(prev->next, next);
}

/**
 *        __skb_dequeue - remove from the head of the queue
 *        @list: list to dequeue from
 *
 *        Remove the head of the list. This function does not take any locks
 *        so must be used with appropriate locks held only. The head item is
 *        returned or %NULL if the list is empty.
 */
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *first = skb_peek(list);

        if (!first)
                return NULL;

        __skb_unlink(first, list);
        return first;
}
struct sk_buff *skb_dequeue(struct sk_buff_head *list);

/**
 *        __skb_dequeue_tail - remove from the tail of the queue
 *        @list: list to dequeue from
 *
 *        Remove the tail of the list. This function does not take any locks
 *        so must be used with appropriate locks held only. The tail item is
 *        returned or %NULL if the list is empty.
 */
static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *last = skb_peek_tail(list);

        if (!last)
                return NULL;

        __skb_unlink(last, list);
        return last;
}
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);


/* Returns true when @skb has a non-zero data_len. */
static inline bool skb_is_nonlinear(const struct sk_buff *skb)
{
        return skb->data_len != 0;
}

/* Returns the part of skb->len that is not accounted for by skb->data_len. */
static inline unsigned int skb_headlen(const struct sk_buff *skb)
{
        const unsigned int total = skb->len;
        const unsigned int nonlinear = skb->data_len;

        return total - nonlinear;
}

static inline unsigned int __skb_pagelen(const struct sk_buff *skb)
{
        unsigned int i, len = 0;

        for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
                len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
        return len;
}

/* Returns skb_headlen() plus the total size of the fragments of @skb. */
static inline unsigned int skb_pagelen(const struct sk_buff *skb)
{
        const unsigned int head_bytes = skb_headlen(skb);
        const unsigned int frag_bytes = __skb_pagelen(skb);

        return head_bytes + frag_bytes;
}

/*
 * Fill in fragment descriptor @frag: store the @netmem reference and the
 * offset @off, and record @size through skb_frag_size_set().
 */
static inline void skb_frag_fill_netmem_desc(skb_frag_t *frag,
                                             netmem_ref netmem, int off,
                                             int size)
{
        frag->netmem = netmem;
        frag->offset = off;
        skb_frag_size_set(frag, size);
}

/*
 * struct page flavour of skb_frag_fill_netmem_desc(): @page is converted
 * with page_to_netmem() and the remaining arguments are passed through.
 */
static inline void skb_frag_fill_page_desc(skb_frag_t *frag,
                                           struct page *page,
                                           int off, int size)
{
        netmem_ref netmem = page_to_netmem(page);

        skb_frag_fill_netmem_desc(frag, netmem, off, size);
}

/*
 * Fill slot @i of @shinfo's fragment array with @netmem, @off and @size.
 * nr_frags is not updated here.
 */
static inline void __skb_fill_netmem_desc_noacc(struct skb_shared_info *shinfo,
                                                int i, netmem_ref netmem,
                                                int off, int size)
{
        skb_frag_fill_netmem_desc(&shinfo->frags[i], netmem, off, size);
}

static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
                                              int i, struct page *page,
                                              int off, int size)
{
        __skb_fill_netmem_desc_noacc(shinfo, i, page_to_netmem(page), off,
                                     size);
}

/**
 * skb_len_add - adds a number to len fields of skb
 * @skb: buffer to add len to
 * @delta: number of bytes to add
 *
 * The same @delta is added to len, data_len and truesize.
 */
static inline void skb_len_add(struct sk_buff *skb, int delta)
{
        skb->len += delta;
        skb->data_len += delta;
        skb->truesize += delta;
}

/**
 * __skb_fill_netmem_desc - initialise a fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: fragment index to initialise
 * @netmem: the netmem to use for this fragment
 * @off: the offset to the data within @netmem
 * @size: the length of the data
 *
 * Initialises the @i'th fragment of @skb to point to @size bytes at
 * offset @off within @netmem.
 *
 * If @netmem is a net_iov the skb is marked unreadable and nothing else is
 * done. Otherwise the pfmemalloc state of the backing page is propagated
 * to the skb.
 *
 * Does not take any additional reference on the fragment.
 */
static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
                                          netmem_ref netmem, int off, int size)
{
        struct page *head_page;

        __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);

        if (netmem_is_net_iov(netmem)) {
                skb->unreadable = true;
                return;
        }

        /* Propagate page pfmemalloc to the skb if we can. The problem is
         * that not all callers have unique ownership of the page but rely
         * on page_is_pfmemalloc doing the right thing(tm).
         */
        head_page = compound_head(netmem_to_page(netmem));
        if (page_is_pfmemalloc(head_page))
                skb->pfmemalloc = true;
}

static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
                                        struct page *page, int off, int size)
{
        __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
}

/*
 * Initialise fragment @i of @skb via __skb_fill_netmem_desc() and then set
 * nr_frags to @i + 1, i.e. make @i the last fragment.
 */
static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i,
                                        netmem_ref netmem, int off, int size)
{
        __skb_fill_netmem_desc(skb, i, netmem, off, size);
        skb_shinfo(skb)->nr_frags = i + 1;
}

/**
 * skb_fill_page_desc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data with @page
 * @size: the length of the data
 *
 * As per __skb_fill_page_desc() -- initialises the @i'th fragment of
 * @skb to point to @size bytes at offset @off within @page. In
 * addition updates @skb such that @i is the last fragment.
 *
 * Does not take any additional reference on the fragment.
 */
static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
                                      struct page *page, int off, int size)
{
        skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
}

/**
 * skb_fill_page_desc_noacc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data within @page
 * @size: the length of the data
 *
 * Variant of skb_fill_page_desc() which does not deal with
 * pfmemalloc, if page is not owned by us. @i becomes the last fragment.
 */
static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i,
                                            struct page *page, int off,
                                            int size)
{
        struct skb_shared_info *info = skb_shinfo(skb);

        __skb_fill_page_desc_noacc(info, i, page, off, size);
        info->nr_frags = i + 1;
}

void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
                            int off, int size, unsigned int truesize);

static inline void skb_add_rx_frag(struct sk_buff *skb, int i,
                                   struct page *page, int off, int size,
                                   unsigned int truesize)
{
        skb_add_rx_frag_netmem(skb, i, page_to_netmem(page), off, size,
                               truesize);
}

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize);

/* BUG() if @skb is non-linear, i.e. if skb_is_nonlinear() is true for it. */
#define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))

#ifdef NET_SKBUFF_DATA_USES_OFFSET
/* Offset variant: ->tail is an offset from ->head, turn it into a pointer. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        unsigned char *tail_ptr = skb->head + skb->tail;

        return tail_ptr;
}

/* Offset variant: make ->tail refer to the same position as ->data. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        skb->tail = skb->data - skb->head;
}

/* Offset variant: place ->tail @offset bytes past ->data. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb_reset_tail_pointer(skb);
        skb->tail += offset;
}

#else /* NET_SKBUFF_DATA_USES_OFFSET */
/* Pointer variant: ->tail already is the tail pointer. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        return skb->tail;
}

/* Pointer variant: make ->tail point at the same position as ->data. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        skb->tail = skb->data;
}

/* Pointer variant: place ->tail @offset bytes past ->data. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb->tail = skb->data + offset;
}

#endif /* NET_SKBUFF_DATA_USES_OFFSET */

/*
 * Debug aid: with CONFIG_DEBUG_NET enabled, warn once if @skb has a zero
 * length and dump the skb through skb_dump() (wrapped in DO_ONCE_LITE()).
 * Compiles to nothing when CONFIG_DEBUG_NET is not set.
 */
static inline void skb_assert_len(struct sk_buff *skb)
{
#ifdef CONFIG_DEBUG_NET
        if (WARN_ONCE(!skb->len, "%s\n", __func__))
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
#endif /* CONFIG_DEBUG_NET */
}

/*
 * skb_might_realloc() is implemented out of line when
 * CONFIG_FAIL_SKB_REALLOC is enabled and is an empty stub otherwise.
 */
#if defined(CONFIG_FAIL_SKB_REALLOC)
void skb_might_realloc(struct sk_buff *skb);
#else
static inline void skb_might_realloc(struct sk_buff *skb) {}
#endif

/*
 *        Add data to an sk_buff
 */
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
void *skb_put(struct sk_buff *skb, unsigned int len);
/*
 * Extend the data area of a linear @skb by @len bytes at the tail and
 * return a pointer to the start of the added area (the previous tail).
 * BUGs on a non-linear skb; no check against the available tailroom is
 * made here.
 */
static inline void *__skb_put(struct sk_buff *skb, unsigned int len)
{
        void *old_tail = skb_tail_pointer(skb);

        SKB_LINEAR_ASSERT(skb);
        skb->tail += len;
        skb->len  += len;

        return old_tail;
}

/* Like __skb_put(), but the @len added bytes are cleared to zero. */
static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        void *area = __skb_put(skb, len);

        memset(area, 0, len);

        return area;
}

/* Like __skb_put(), but the added area is filled with @len bytes from @data. */
static inline void *__skb_put_data(struct sk_buff *skb, const void *data,
                                   unsigned int len)
{
        void *area = __skb_put(skb, len);

        memcpy(area, data, len);

        return area;
}

/* Append the single byte @val to @skb using __skb_put(). */
static inline void __skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *slot = __skb_put(skb, 1);

        *slot = val;
}

/* Like skb_put(), but the @len added bytes are cleared to zero. */
static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        void *area = skb_put(skb, len);

        memset(area, 0, len);

        return area;
}

/* Like skb_put(), but the added area is filled with @len bytes from @data. */
static inline void *skb_put_data(struct sk_buff *skb, const void *data,
                                 unsigned int len)
{
        void *area = skb_put(skb, len);

        memcpy(area, data, len);

        return area;
}

/* Append the single byte @val to @skb using skb_put(). */
static inline void skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *slot = skb_put(skb, 1);

        *slot = val;
}

void *skb_push(struct sk_buff *skb, unsigned int len);
/*
 * Grow the data area of @skb by @len bytes at the front: ->data is moved
 * back by @len and ->len is increased by @len. Returns the new ->data.
 * No check against the available headroom is made here; debug builds warn
 * once if @len exceeds INT_MAX.
 */
static inline void *__skb_push(struct sk_buff *skb, unsigned int len)
{
        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);

        skb->len += len;
        return skb->data -= len;
}

void *skb_pull(struct sk_buff *skb, unsigned int len);
/*
 * Remove @len bytes from the front of @skb: ->len is reduced by @len and
 * ->data is advanced by @len. Returns the new ->data.
 *
 * BUG()s if the pull would leave ->len smaller than ->data_len. With
 * CONFIG_DEBUG_NET the original length is restored and the skb is dumped
 * before the BUG().
 */
static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
{
        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);

        skb->len -= len;
        if (unlikely(skb->len < skb->data_len)) {
#if defined(CONFIG_DEBUG_NET)
                /* Undo the subtraction so the dump shows the skb as it was. */
                skb->len += len;
                pr_err("__skb_pull(len=%u)\n", len);
                skb_dump(KERN_ERR, skb, false);
#endif
                BUG();
        }
        return skb->data += len;
}

/*
 * Checked front pull: returns NULL without touching @skb when @len exceeds
 * skb->len, otherwise the result of __skb_pull().
 */
static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
{
        if (unlikely(len > skb->len))
                return NULL;

        return __skb_pull(skb, len);
}

void *skb_pull_data(struct sk_buff *skb, size_t len);

void *__pskb_pull_tail(struct sk_buff *skb, int delta);

/*
 * Make sure at least @len bytes are available in the head of @skb.
 *
 * Returns SKB_NOT_DROPPED_YET when the head already holds @len bytes or
 * __pskb_pull_tail() succeeded for the missing part,
 * SKB_DROP_REASON_PKT_TOO_SMALL when @len exceeds skb->len, and
 * SKB_DROP_REASON_NOMEM when __pskb_pull_tail() failed.
 */
static inline enum skb_drop_reason
pskb_may_pull_reason(struct sk_buff *skb, unsigned int len)
{
        unsigned int headlen;

        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
        skb_might_realloc(skb);

        headlen = skb_headlen(skb);
        if (likely(len <= headlen))
                return SKB_NOT_DROPPED_YET;

        if (unlikely(len > skb->len))
                return SKB_DROP_REASON_PKT_TOO_SMALL;

        /* Only the bytes missing from the head have to be pulled in. */
        if (unlikely(!__pskb_pull_tail(skb, len - headlen)))
                return SKB_DROP_REASON_NOMEM;

        return SKB_NOT_DROPPED_YET;
}

static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
        return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET;
}

/*
 * Pull @len bytes off the front of @skb after pskb_may_pull() has confirmed
 * they are available in the head. Returns the new ->data, or NULL (with
 * ->len and ->data untouched by this function) when pskb_may_pull() fails.
 */
static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
{
        if (!pskb_may_pull(skb, len))
                return NULL;

        skb->len -= len;
        skb->data += len;

        return skb->data;
}

void skb_condense(struct sk_buff *skb);

/**
 *        skb_headroom - bytes at buffer head
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the head of an &sk_buff,
 *        i.e. the distance between ->head and ->data.
 */
static inline unsigned int skb_headroom(const struct sk_buff *skb)
{
        const unsigned int room = skb->data - skb->head;

        return room;
}

/**
 *        skb_tailroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff.
 *        A non-linear buffer always reports 0.
 */
static inline int skb_tailroom(const struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb))
                return 0;

        return skb->end - skb->tail;
}

/**
 *        skb_availroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff
 *        allocated by sk_stream_alloc(). The reserved_tailroom of the buffer
 *        is not counted as free. A non-linear buffer always reports 0.
 */
static inline int skb_availroom(const struct sk_buff *skb)
{
        return skb_is_nonlinear(skb) ?
                0 : skb->end - skb->tail - skb->reserved_tailroom;
}

/**
 *        skb_reserve - adjust headroom
 *        @skb: buffer to alter
 *        @len: bytes to move
 *
 *        Increase the headroom of an empty &sk_buff by reducing the tail
 *        room. This is only allowed for an empty buffer. Both ->data and
 *        ->tail are advanced by @len; the emptiness of the buffer is not
 *        checked here.
 */
static inline void skb_reserve(struct sk_buff *skb, int len)
{
        skb->data += len;
        skb->tail += len;
}

/**
 *        skb_tailroom_reserve - adjust reserved_tailroom
 *        @skb: buffer to alter
 *        @mtu: maximum amount of headlen permitted
 *        @needed_tailroom: minimum amount of reserved_tailroom
 *
 *        Set reserved_tailroom so that headlen can be as large as possible but
 *        not larger than mtu and tailroom cannot be smaller than
 *        needed_tailroom.
 *        The required headroom should already have been reserved before using
 *        this function.
 *
 *        The buffer must be linear (SKB_LINEAR_ASSERT()).
 */
static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu,
                                        unsigned int needed_tailroom)
{
        int room;

        SKB_LINEAR_ASSERT(skb);

        room = skb_tailroom(skb);
        /* NOTE(review): the subtraction below is done in unsigned
         * arithmetic, so it presumably relies on needed_tailroom not
         * exceeding the tailroom -- confirm against callers.
         */
        if (mtu < room - needed_tailroom)
                /* use at most mtu */
                skb->reserved_tailroom = room - mtu;
        else
                /* use up to all available space */
                skb->reserved_tailroom = needed_tailroom;
}

#define ENCAP_TYPE_ETHER        0
#define ENCAP_TYPE_IPPROTO        1

/*
 * Record @protocol as the inner protocol of @skb and tag the inner
 * protocol type as ENCAP_TYPE_ETHER.
 */
static inline void skb_set_inner_protocol(struct sk_buff *skb,
                                          __be16 protocol)
{
        skb->inner_protocol = protocol;
        skb->inner_protocol_type = ENCAP_TYPE_ETHER;
}

/*
 * Record @ipproto as the inner IP protocol of @skb and tag the inner
 * protocol type as ENCAP_TYPE_IPPROTO.
 */
static inline void skb_set_inner_ipproto(struct sk_buff *skb,
                                         __u8 ipproto)
{
        skb->inner_ipproto = ipproto;
        skb->inner_protocol_type = ENCAP_TYPE_IPPROTO;
}

/*
 * Copy the current mac, network and transport header offsets of @skb into
 * the corresponding inner_* header fields.
 */
static inline void skb_reset_inner_headers(struct sk_buff *skb)
{
        skb->inner_mac_header = skb->mac_header;
        skb->inner_network_header = skb->network_header;
        skb->inner_transport_header = skb->transport_header;
}

static inline int skb_mac_header_was_set(const struct sk_buff *skb)
{
        return skb->mac_header != (typeof(skb->mac_header))~0U;
}

/*
 * Recompute ->mac_len as the distance between the mac header and the
 * network header. If the mac header was never set, ->mac_len becomes 0 and
 * debug builds warn once.
 */
static inline void skb_reset_mac_len(struct sk_buff *skb)
{
        if (skb_mac_header_was_set(skb)) {
                skb->mac_len = skb->network_header - skb->mac_header;
                return;
        }

        DEBUG_NET_WARN_ON_ONCE(1);
        skb->mac_len = 0;
}

/* Returns a pointer to the inner transport header (->head plus its offset). */
static inline unsigned char *skb_inner_transport_header(const struct sk_buff
                                                        *skb)
{
        unsigned char *hdr = skb->head + skb->inner_transport_header;

        return hdr;
}

/* Returns the position of the inner transport header relative to ->data. */
static inline int skb_inner_transport_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_inner_transport_header(skb);

        return hdr - skb->data;
}

static inline void skb_reset_inner_transport_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_transport_header))offset);
        skb->inner_transport_header = offset;
}

/* Place the inner transport header @offset bytes from the current ->data. */
static inline void skb_set_inner_transport_header(struct sk_buff *skb,
                                                   const int offset)
{
        skb_reset_inner_transport_header(skb);
        skb->inner_transport_header += offset;
}

/* Returns a pointer to the inner network header (->head plus its offset). */
static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
{
        unsigned char *hdr = skb->head + skb->inner_network_header;

        return hdr;
}

static inline void skb_reset_inner_network_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_network_header))offset);
        skb->inner_network_header = offset;
}

/* Place the inner network header @offset bytes from the current ->data. */
static inline void skb_set_inner_network_header(struct sk_buff *skb,
                                                const int offset)
{
        skb_reset_inner_network_header(skb);
        skb->inner_network_header += offset;
}

/* Returns true when the inner network header offset is greater than zero. */
static inline bool skb_inner_network_header_was_set(const struct sk_buff *skb)
{
        if (skb->inner_network_header > 0)
                return true;

        return false;
}

/* Returns a pointer to the inner mac header (->head plus its offset). */
static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
{
        unsigned char *hdr = skb->head + skb->inner_mac_header;

        return hdr;
}

static inline void skb_reset_inner_mac_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_mac_header))offset);
        skb->inner_mac_header = offset;
}

/* Place the inner mac header @offset bytes from the current ->data. */
static inline void skb_set_inner_mac_header(struct sk_buff *skb,
                                            const int offset)
{
        skb_reset_inner_mac_header(skb);
        skb->inner_mac_header += offset;
}
static inline bool skb_transport_header_was_set(const struct sk_buff *skb)
{
        return skb->transport_header != (typeof(skb->transport_header))~0U;
}

/*
 * Returns a pointer to the transport header (->head plus its offset).
 * Debug builds warn once if the transport header was never set.
 */
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb));
        return skb->head + skb->transport_header;
}

static inline void skb_reset_transport_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->transport_header))offset);
        skb->transport_header = offset;
}

/**
 * skb_reset_transport_header_careful - conditionally reset transport header
 * @skb: buffer to alter
 *
 * Hardened version of skb_reset_transport_header(): the header is left
 * untouched when the offset of ->data does not fit the header field or
 * would equal the all-ones "unset" value.
 *
 * Returns: true if the operation was a success.
 */
static inline bool __must_check
skb_reset_transport_header_careful(struct sk_buff *skb)
{
        const long pos = skb->data - skb->head;

        /* Would be truncated when stored. */
        if (unlikely(pos != (typeof(skb->transport_header))pos))
                return false;

        /* Would be indistinguishable from "not set". */
        if (unlikely(pos == (typeof(skb->transport_header))~0U))
                return false;

        skb->transport_header = pos;
        return true;
}

/* Place the transport header @offset bytes from the current ->data. */
static inline void skb_set_transport_header(struct sk_buff *skb,
                                            const int offset)
{
        skb_reset_transport_header(skb);
        skb->transport_header += offset;
}

/* Returns a pointer to the network header (->head plus its offset). */
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
        unsigned char *hdr = skb->head + skb->network_header;

        return hdr;
}

static inline void skb_reset_network_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->network_header))offset);
        skb->network_header = offset;
}

/* Place the network header @offset bytes from the current ->data. */
static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
{
        skb_reset_network_header(skb);
        skb->network_header += offset;
}

/*
 * Returns a pointer to the mac header (->head plus its offset). Debug
 * builds warn once if the mac header was never set.
 */
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
        return skb->head + skb->mac_header;
}

/* Returns the position of the mac header relative to ->data. */
static inline int skb_mac_offset(const struct sk_buff *skb)
{
        const unsigned char *mac = skb_mac_header(skb);

        return mac - skb->data;
}

/*
 * Return the gap between the MAC header offset and the network header
 * offset. Debug builds warn when skb_mac_header_was_set() reports false.
 */
static inline u32 skb_mac_header_len(const struct sk_buff *skb)
{
        u32 hdr_len;

        DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
        hdr_len = skb->network_header - skb->mac_header;
        return hdr_len;
}

/*
 * Store the all-ones pattern of the field's type in skb->mac_header.
 * NOTE(review): presumably this is the value skb_mac_header_was_set()
 * tests against - confirm with its definition.
 */
static inline void skb_unset_mac_header(struct sk_buff *skb)
{
        skb->mac_header = (typeof(skb->mac_header))~0U;
}

static inline void skb_reset_mac_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->mac_header))offset);
        skb->mac_header = offset;
}

/*
 * Place the MAC header @offset bytes away from skb->data: reset it to the
 * current data position, then add @offset to the stored value.
 */
static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
{
        skb_reset_mac_header(skb);
        skb->mac_header += offset;
}

/* Make the MAC header offset equal to the current network header offset. */
static inline void skb_pop_mac_header(struct sk_buff *skb)
{
        skb->mac_header = skb->network_header;
}

static inline void skb_probe_transport_header(struct sk_buff *skb)
{
        struct flow_keys_basic keys;

        if (skb_transport_header_was_set(skb))
                return;

        if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                             NULL, 0, 0, 0, 0))
                skb_set_transport_header(skb, keys.control.thoff);
}

/*
 * If a MAC header is set, relocate it so that it ends at skb->data:
 * the header offset becomes data - skb->mac_len and skb->mac_len bytes
 * are moved from the old location to the new one.
 */
static inline void skb_mac_header_rebuild(struct sk_buff *skb)
{
        const unsigned char *prev_mac;

        if (!skb_mac_header_was_set(skb))
                return;

        /* Remember the source before the header offset is rewritten. */
        prev_mac = skb_mac_header(skb);

        skb_set_mac_header(skb, -skb->mac_len);
        /* Regions may overlap, hence memmove(). */
        memmove(skb_mac_header(skb), prev_mac, skb->mac_len);
}

/* Move the full mac header up to the current network_header.
 * Afterwards skb->data points skb->mac_len bytes into the mac header.
 * @full_mac_len must be the complete mac header length.
 * Does nothing when no mac header is set.
 */
static inline void skb_mac_header_rebuild_full(struct sk_buff *skb, u32 full_mac_len)
{
        const unsigned char *prev_mac;

        if (!skb_mac_header_was_set(skb))
                return;

        /* Remember the source before the header offset is rewritten. */
        prev_mac = skb_mac_header(skb);

        skb_set_mac_header(skb, -full_mac_len);
        memmove(skb_mac_header(skb), prev_mac, full_mac_len);
        /* Expose the part of the header beyond skb->mac_len via skb->data. */
        __skb_push(skb, full_mac_len - skb->mac_len);
}

/* Return skb->csum_start relative to skb->data (csum_start minus headroom). */
static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
        unsigned int headroom = skb_headroom(skb);

        return skb->csum_start - headroom;
}

/* Return the address skb->csum_start refers to (skb->head plus csum_start). */
static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
{
        unsigned char *buf_start = skb->head;

        return buf_start + skb->csum_start;
}

/* Return the distance from skb->data to the transport header. */
static inline int skb_transport_offset(const struct sk_buff *skb)
{
        const unsigned char *th = skb_transport_header(skb);

        return th - skb->data;
}

/*
 * Return the gap between the network header offset and the transport header
 * offset. Debug builds warn when skb_transport_header_was_set() reports false.
 */
static inline u32 skb_network_header_len(const struct sk_buff *skb)
{
        u32 hdr_len;

        DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb));
        hdr_len = skb->transport_header - skb->network_header;
        return hdr_len;
}

/* Return inner_transport_header minus inner_network_header. */
static inline u32 skb_inner_network_header_len(const struct sk_buff *skb)
{
        u32 hdr_len = skb->inner_transport_header - skb->inner_network_header;

        return hdr_len;
}

/* Return the distance from skb->data to the network header. */
static inline int skb_network_offset(const struct sk_buff *skb)
{
        const unsigned char *nh = skb_network_header(skb);

        return nh - skb->data;
}

/* Return the distance from skb->data to the inner network header. */
static inline int skb_inner_network_offset(const struct sk_buff *skb)
{
        const unsigned char *inner_nh = skb_inner_network_header(skb);

        return inner_nh - skb->data;
}

/*
 * Call pskb_may_pull_reason() for @len bytes counted from the network
 * header, i.e. for (network offset + @len) bytes counted from skb->data,
 * and return its result.
 */
static inline enum skb_drop_reason
pskb_network_may_pull_reason(struct sk_buff *skb, unsigned int len)
{
        unsigned int needed = skb_network_offset(skb) + len;

        return pskb_may_pull_reason(skb, needed);
}

static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
{
        return pskb_network_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET;
}

/*
 * CPUs often take a performance hit when accessing unaligned memory
 * locations. The actual performance hit varies: it can be small if the
 * hardware handles it, or large if we have to take an exception and fix it
 * in software.
 *
 * Since an ethernet header is 14 bytes, network drivers often end up with
 * the IP header at an unaligned offset. The IP header can be aligned by
 * shifting the start of the packet by 2 bytes. Drivers should do this
 * with:
 *
 * skb_reserve(skb, NET_IP_ALIGN);
 *
 * The downside to this alignment of the IP header is that the DMA is now
 * unaligned. On some architectures the cost of an unaligned DMA is high
 * and this cost outweighs the gains made by aligning the IP header.
 *
 * Since this trade-off varies between architectures, we allow NET_IP_ALIGN
 * to be overridden: the default below applies only when it has not been
 * defined already.
 */
#ifndef NET_IP_ALIGN
#define NET_IP_ALIGN        2
#endif

/*
 * The networking layer reserves some headroom in skb data (via
 * dev_alloc_skb). This is used to avoid having to reallocate skb data when
 * the header has to grow. In the default case, if the header has to grow
 * 32 bytes or less we avoid the reallocation.
 *
 * Unfortunately this headroom changes the DMA alignment of the resulting
 * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive
 * on some architectures. An architecture can override this value,
 * perhaps setting it to a cacheline in size (since that will maintain
 * cacheline alignment of the DMA). It must be a power of 2.
 *
 * Various parts of the networking layer expect at least 32 bytes of
 * headroom; you should not reduce this.
 *
 * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS)
 * to reduce the average number of cache lines per packet.
 * get_rps_cpu(), for example, only accesses one 64-byte aligned block:
 * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
 */
#ifndef NET_SKB_PAD
#define NET_SKB_PAD        max(32, L1_CACHE_BYTES)
#endif

int ___pskb_trim(struct sk_buff *skb, unsigned int len);

/*
 * Set the length of a linear skb to @len and move the tail pointer to match.
 * A non-linear skb triggers WARN_ON() and is left unmodified.
 */
static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
{
        if (!WARN_ON(skb_is_nonlinear(skb))) {
                skb->len = len;
                skb_set_tail_pointer(skb, len);
        }
}

/*
 * Set the skb length to @len via __skb_set_length(). No comparison against
 * the current length is made here.
 */
static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
{
        __skb_set_length(skb, len);
}

void skb_trim(struct sk_buff *skb, unsigned int len);

/*
 * Trim @skb to @len. With no paged data (skb->data_len == 0) the linear
 * trim is used and 0 is returned; otherwise the work and the return value
 * are delegated to ___pskb_trim().
 */
static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
{
        if (!skb->data_len) {
                __skb_trim(skb, len);
                return 0;
        }

        return ___pskb_trim(skb, len);
}

/*
 * Trim @skb to @len if it is currently longer; otherwise do nothing and
 * return 0. When trimming happens the result of __pskb_trim() is returned.
 */
static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
{
        skb_might_realloc(skb);

        if (len >= skb->len)
                return 0;

        return __pskb_trim(skb, len);
}

/**
 * pskb_trim_unique - remove end from a paged unique (not cloned) buffer
 * @skb: buffer to alter
 * @len: new length
 *
 * Same as pskb_trim(), for callers that know @skb is not cloned, so the
 * trim is not expected to fail for lack of memory. Any error returned by
 * pskb_trim() is treated as a bug (BUG_ON).
 */
static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
{
        int rc;

        rc = pskb_trim(skb, len);
        BUG_ON(rc);
}

/*
 * Grow @skb to @len bytes. If the tailroom is smaller than the increase,
 * the head is expanded (GFP_ATOMIC) by the missing amount first; an error
 * from pskb_expand_head() is returned as-is. Returns 0 on success.
 */
static inline int __skb_grow(struct sk_buff *skb, unsigned int len)
{
        unsigned int extra = len - skb->len;

        if (skb_tailroom(skb) < extra) {
                unsigned int shortfall = extra - skb_tailroom(skb);
                int err;

                err = pskb_expand_head(skb, 0, shortfall, GFP_ATOMIC);
                if (err)
                        return err;
        }

        __skb_set_length(skb, len);
        return 0;
}

/**
 * skb_orphan - orphan a buffer
 * @skb: buffer to orphan
 *
 * If the buffer has a destructor, call it and then clear both the
 * destructor and the socket pointer, leaving @skb unowned. The buffer
 * continues to exist but is no longer charged to its former owner.
 * A buffer with a socket but no destructor is a bug (BUG_ON).
 */
static inline void skb_orphan(struct sk_buff *skb)
{
        if (!skb->destructor) {
                BUG_ON(skb->sk);
                return;
        }

        skb->destructor(skb);
        skb->destructor = NULL;
        skb->sk                = NULL;
}

/**
 * skb_orphan_frags - orphan the frags contained in a buffer
 * @skb: buffer to orphan frags from
 * @gfp_mask: allocation mask for replacement pages
 *
 * For each frag in the SKB which needs a destructor (i.e. has an
 * owner) create a copy of that frag and release the original
 * page by calling the destructor.
 *
 * Nothing is done (and 0 is returned) when skb_zcopy() reports nothing for
 * @skb or when SKBFL_DONT_ORPHAN is set in its shared info flags; otherwise
 * the result of skb_copy_ubufs() is returned.
 */
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
        if (unlikely(skb_zcopy(skb)) &&
            !(skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN))
                return skb_copy_ubufs(skb, gfp_mask);

        return 0;
}

/*
 * Frags must be orphaned, even if refcounted, if the skb might loop to the
 * rx path. Unlike skb_orphan_frags(), SKBFL_DONT_ORPHAN is not consulted.
 */
static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
{
        if (unlikely(skb_zcopy(skb)))
                return skb_copy_ubufs(skb, gfp_mask);

        return 0;
}

/**
 * __skb_queue_purge_reason - empty a list
 * @list: list to empty
 * @reason: drop reason
 *
 * Dequeue every buffer on an &sk_buff list and free it with
 * kfree_skb_reason(), passing @reason. This function does not take the
 * list lock; the caller must hold the relevant locks to use it.
 */
static inline void __skb_queue_purge_reason(struct sk_buff_head *list,
                                            enum skb_drop_reason reason)
{
        struct sk_buff *victim;

        for (victim = __skb_dequeue(list); victim != NULL;
             victim = __skb_dequeue(list))
                kfree_skb_reason(victim, reason);
}

/*
 * Empty @list via __skb_queue_purge_reason(), using
 * SKB_DROP_REASON_QUEUE_PURGE as the drop reason.
 */
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
        __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE);
}

void skb_queue_purge_reason(struct sk_buff_head *list,
                            enum skb_drop_reason reason);

/*
 * Empty @list via skb_queue_purge_reason(), using
 * SKB_DROP_REASON_QUEUE_PURGE as the drop reason.
 */
static inline void skb_queue_purge(struct sk_buff_head *list)
{
        skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE);
}

unsigned int skb_rbtree_purge(struct rb_root *root);
void skb_errqueue_purge(struct sk_buff_head *list);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 *
 * Calls __netdev_alloc_frag_align() with an all-ones alignment mask.
 */
static inline void *netdev_alloc_frag(unsigned int fragsz)
{
        const unsigned int all_ones_mask = ~0u;

        return __netdev_alloc_frag_align(fragsz, all_ones_mask);
}

/*
 * Allocate a page fragment of @fragsz bytes with alignment @align.
 * Warns once if @align is not a power of two; the mask handed to
 * __netdev_alloc_frag_align() is the negation of @align.
 */
static inline void *netdev_alloc_frag_align(unsigned int fragsz,
                                            unsigned int align)
{
        unsigned int align_mask = -align;

        WARN_ON_ONCE(!is_power_of_2(align));
        return __netdev_alloc_frag_align(fragsz, align_mask);
}

struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
                                   gfp_t gfp_mask);

/**
 *        netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @length: length to allocate
 *
 *        Allocate a new &sk_buff and assign it a usage count of one. The
 *        buffer has unspecified headroom built in. Users should allocate
 *        the headroom they think they need without accounting for the
 *        built in space. The built in space is used for optimisations.
 *
 *        %NULL is returned if there is no free memory. Although this function
 *        allocates memory it can be called from an interrupt: it is a thin
 *        wrapper around __netdev_alloc_skb() with %GFP_ATOMIC.
 */
static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
                                               unsigned int length)
{
        return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
}

/*
 * Legacy helper around __netdev_alloc_skb(): same allocation with a NULL
 * device and the caller-supplied @gfp_mask.
 */
static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
                                              gfp_t gfp_mask)
{
        return __netdev_alloc_skb(NULL, length, gfp_mask);
}

/* Legacy helper around netdev_alloc_skb(): same allocation with a NULL device. */
static inline struct sk_buff *dev_alloc_skb(unsigned int length)
{
        return netdev_alloc_skb(NULL, length);
}


/*
 * Allocate an skb with room for @length + NET_IP_ALIGN bytes and, when
 * NET_IP_ALIGN is non-zero, reserve NET_IP_ALIGN bytes at the front.
 * Returns NULL if the underlying allocation fails.
 */
static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length, gfp_t gfp)
{
        struct sk_buff *skb;

        skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
        if (!skb)
                return NULL;

        if (NET_IP_ALIGN)
                skb_reserve(skb, NET_IP_ALIGN);

        return skb;
}

/* GFP_ATOMIC convenience wrapper around __netdev_alloc_skb_ip_align(). */
static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length)
{
        return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
}

/* Release the fragment at @addr by handing it to page_frag_free(). */
static inline void skb_free_frag(void *addr)
{
        page_frag_free(addr);
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);

/*
 * Allocate a page fragment of @fragsz bytes through
 * __napi_alloc_frag_align(), passing an all-ones alignment mask.
 */
static inline void *napi_alloc_frag(unsigned int fragsz)
{
        const unsigned int all_ones_mask = ~0u;

        return __napi_alloc_frag_align(fragsz, all_ones_mask);
}

/*
 * Allocate a page fragment of @fragsz bytes with alignment @align.
 * Warns once if @align is not a power of two; the mask handed to
 * __napi_alloc_frag_align() is the negation of @align.
 */
static inline void *napi_alloc_frag_align(unsigned int fragsz,
                                          unsigned int align)
{
        unsigned int align_mask = -align;

        WARN_ON_ONCE(!is_power_of_2(align));
        return __napi_alloc_frag_align(fragsz, align_mask);
}

struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length);
void napi_consume_skb(struct sk_buff *skb, int budget);

void napi_skb_free_stolen_head(struct sk_buff *skb);
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason);

/**
 * __dev_alloc_pages - allocate page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 * @order: size of the allocation
 *
 * Allocate a new page of the given @order with __GFP_COMP and
 * __GFP_MEMALLOC added to @gfp_mask, on no particular NUMA node.
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask,
                                             unsigned int order)
{
        /* This piece of code contains several assumptions.
         * 1.  This is for device Rx, therefore a cold page is preferred.
         * 2.  The expectation is the user wants a compound page.
         * 3.  If requesting a order 0 page it will not be compound
         *     due to the check to see if order has a value in prep_new_page
         * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
         *     code in gfp_to_alloc_flags that should be enforcing this.
         */
        gfp_t rx_gfp = gfp_mask | __GFP_COMP | __GFP_MEMALLOC;

        return alloc_pages_node_noprof(NUMA_NO_NODE, rx_gfp, order);
}
#define __dev_alloc_pages(...)        alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__))

/*
 * This specialized allocator has to be a macro for its allocations to be
 * accounted separately (to have a separate alloc_tag).
 *
 * Expands to __dev_alloc_pages() with GFP_ATOMIC | __GFP_NOWARN and the
 * requested @_order.
 */
#define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order)

/**
 * __dev_alloc_page - allocate a page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 *
 * Allocate a new page: the order-0 case of __dev_alloc_pages().
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask)
{
        return __dev_alloc_pages_noprof(gfp_mask, 0);
}
#define __dev_alloc_page(...)        alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__))

/*
 * This specialized allocator has to be a macro for its allocations to be
 * accounted separately (to have a separate alloc_tag).
 *
 * Expands to dev_alloc_pages() with order 0.
 */
#define dev_alloc_page()        dev_alloc_pages(0)

/**
 * dev_page_is_reusable - check whether a page can be reused for network Rx
 * @page: the page to test
 *
 * A page shouldn't be considered for reusing/recycling if it was allocated
 * under memory pressure (pfmemalloc) or at a distant memory node (its node
 * id differs from numa_mem_id()).
 *
 * Returns: false if this page should be returned to page allocator, true
 * otherwise.
 */
static inline bool dev_page_is_reusable(const struct page *page)
{
        bool on_local_node = page_to_nid(page) == numa_mem_id();

        return likely(on_local_node && !page_is_pfmemalloc(page));
}

/**
 * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
 * @page: The page backing the skb
 * @skb: The skb that may need pfmemalloc set
 *
 * Sets skb->pfmemalloc when @page is a pfmemalloc page; the flag is never
 * cleared here.
 */
static inline void skb_propagate_pfmemalloc(const struct page *page,
                                            struct sk_buff *skb)
{
        if (!page_is_pfmemalloc(page))
                return;

        skb->pfmemalloc = true;
}

/**
 * skb_frag_off() - Returns the offset of a skb fragment
 * @frag: the paged fragment
 *
 * Return: the value of the fragment's offset field.
 */
static inline unsigned int skb_frag_off(const skb_frag_t *frag)
{
        return frag->offset;
}

/**
 * skb_frag_off_add() - Increments the offset of a skb fragment by @delta
 * @frag: skb fragment
 * @delta: value to add (signed, so the offset can also be moved backwards)
 */
static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
{
        frag->offset += delta;
}

/**
 * skb_frag_off_set() - Sets the offset of a skb fragment
 * @frag: skb fragment
 * @offset: new offset of the fragment; overwrites the previous value
 */
static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)
{
        frag->offset = offset;
}

/**
 * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment
 * @fragto: skb fragment where offset is set
 * @fragfrom: skb fragment offset is copied from
 *
 * Only the offset field is copied; no other field of @fragto is touched.
 */
static inline void skb_frag_off_copy(skb_frag_t *fragto,
                                     const skb_frag_t *fragfrom)
{
        fragto->offset = fragfrom->offset;
}

/*
 * Return: true if the skb_frag contains a net_iov, as reported by
 * netmem_is_net_iov() for the fragment's netmem reference.
 */
static inline bool skb_frag_is_net_iov(const skb_frag_t *frag)
{
        return netmem_is_net_iov(frag->netmem);
}

/**
 * skb_frag_net_iov - retrieve the net_iov referred to by fragment
 * @frag: the fragment
 *
 * Return: the &struct net_iov associated with @frag, or NULL if the
 * fragment does not hold a net_iov.
 */
static inline struct net_iov *skb_frag_net_iov(const skb_frag_t *frag)
{
        return skb_frag_is_net_iov(frag) ? netmem_to_net_iov(frag->netmem)
                                         : NULL;
}

/**
 * skb_frag_page - retrieve the page referred to by a paged fragment
 * @frag: the paged fragment
 *
 * Return: the &struct page associated with @frag, or NULL if the fragment
 * holds a net_iov instead of a page.
 */
static inline struct page *skb_frag_page(const skb_frag_t *frag)
{
        return skb_frag_is_net_iov(frag) ? NULL
                                         : netmem_to_page(frag->netmem);
}

/**
 * skb_frag_netmem - retrieve the netmem referred to by a fragment
 * @frag: the fragment
 *
 * Return: the &netmem_ref associated with @frag, returned as stored without
 * checking whether it refers to a page or a net_iov.
 */
static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag)
{
        return frag->netmem;
}

int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
                    unsigned int headroom);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
                         const struct bpf_prog *prog);

/**
 * skb_frag_address - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns: the address of the data within @frag, or NULL if the fragment
 * has no page (see skb_frag_page()). The page must already be mapped.
 */
static inline void *skb_frag_address(const skb_frag_t *frag)
{
        struct page *pg = skb_frag_page(frag);

        if (!pg)
                return NULL;

        return page_address(pg) + skb_frag_off(frag);
}

/**
 * skb_frag_address_safe - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns: the address of the data within @frag. Returns %NULL when the
 * fragment has no page or when page_address() yields no mapping for it.
 */
static inline void *skb_frag_address_safe(const skb_frag_t *frag)
{
        struct page *pg = skb_frag_page(frag);
        void *base;

        if (!pg)
                return NULL;

        base = page_address(pg);
        return likely(base) ? base + skb_frag_off(frag) : NULL;
}

/**
 * skb_frag_page_copy() - sets the page in a fragment from another fragment
 * @fragto: skb fragment where page is set
 * @fragfrom: skb fragment page is copied from
 *
 * Copies the netmem reference only; no reference count is taken here.
 */
static inline void skb_frag_page_copy(skb_frag_t *fragto,
                                      const skb_frag_t *fragfrom)
{
        fragto->netmem = fragfrom->netmem;
}

bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);

/**
 * __skb_frag_dma_map - maps a paged fragment via the DMA API
 * @dev: the device to map the fragment to
 * @frag: the paged fragment to map
 * @offset: the offset within the fragment (starting at the
 *          fragment's own offset)
 * @size: the number of bytes to map
 * @dir: the direction of the mapping (``DMA_*``)
 *
 * Maps the page associated with @frag to @dev using dma_map_page(). For a
 * net_iov fragment no mapping call is made: the address is computed from
 * the net_iov's dma_addr plus @offset plus the fragment offset.
 */
static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
                                            const skb_frag_t *frag,
                                            size_t offset, size_t size,
                                            enum dma_data_direction dir)
{
        if (!skb_frag_is_net_iov(frag))
                return dma_map_page(dev, skb_frag_page(frag),
                                    skb_frag_off(frag) + offset, size, dir);

        return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
               frag->offset;
}

/*
 * skb_frag_dma_map(dev, frag[, offset[, size[, dir]]])
 *
 * Front end for __skb_frag_dma_map() with optional trailing arguments.
 * The number of optional arguments is counted with COUNT_ARGS() and pasted
 * onto "_skb_frag_dma_map" to select one of the helpers below:
 *
 *   _skb_frag_dma_map0: offset 0, size = skb_frag_size(frag), DMA_TO_DEVICE
 *   _skb_frag_dma_map1: given offset, size = skb_frag_size(frag) - offset,
 *                       DMA_TO_DEVICE
 *   _skb_frag_dma_map2: given offset and size, DMA_TO_DEVICE
 *   _skb_frag_dma_map3: given offset, size and direction
 *
 * __skb_frag_dma_map1() stores @frag and @offset in uniquely named locals
 * (from __UNIQUE_ID()) so that each argument is evaluated only once.
 */
#define skb_frag_dma_map(dev, frag, ...)                                \
        CONCATENATE(_skb_frag_dma_map,                                        \
                    COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__)

#define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({                \
        const skb_frag_t *uf = (frag);                                        \
        size_t uo = (offset);                                                \
                                                                        \
        __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo,                \
                           DMA_TO_DEVICE);                                \
})
#define _skb_frag_dma_map1(dev, frag, offset)                                \
        __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_),        \
                            __UNIQUE_ID(offset_))
#define _skb_frag_dma_map0(dev, frag)                                        \
        _skb_frag_dma_map1(dev, frag, 0)
#define _skb_frag_dma_map2(dev, frag, offset, size)                        \
        __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE)
#define _skb_frag_dma_map3(dev, frag, offset, size, dir)                \
        __skb_frag_dma_map(dev, frag, offset, size, dir)

/*
 * Copy @skb via __pskb_copy(), asking for the same headroom as the
 * original buffer currently has.
 */
static inline struct sk_buff *pskb_copy(struct sk_buff *skb,
                                        gfp_t gfp_mask)
{
        return __pskb_copy(skb, skb_headroom(skb), gfp_mask);
}


/*
 * Copy @skb via __pskb_copy_fclone() with the original's headroom and the
 * final (fclone) argument set to true.
 */
static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb,
                                                  gfp_t gfp_mask)
{
        return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true);
}


/**
 * skb_clone_writable - is the header of a clone writable
 * @skb: buffer to check
 * @len: length up to which to write
 *
 * Returns true if modifying the header part of the cloned buffer
 * does not require the data to be copied, i.e. the header is not cloned
 * and headroom + @len stays within skb->hdr_len.
 */
static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len)
{
        if (skb_header_cloned(skb))
                return 0;

        return skb_headroom(skb) + len <= skb->hdr_len;
}

/*
 * Ensure the first @write_len bytes of @skb can be written. Nothing is
 * done when @skb is not cloned or skb_clone_writable() already allows the
 * write; otherwise the head is reallocated with pskb_expand_head()
 * (GFP_ATOMIC).
 *
 * Returns 0 on success or when nothing had to be done, 1 if the
 * reallocation failed.
 */
static inline int skb_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        if (!skb_cloned(skb))
                return 0;

        if (skb_clone_writable(skb, write_len))
                return 0;

        return pskb_expand_head(skb, 0, 0, GFP_ATOMIC) != 0;
}

/*
 * Common helper for skb_cow() and skb_cow_head(). The head is reallocated
 * (GFP_ATOMIC) when @headroom exceeds the current headroom or when @cloned
 * is non-zero; the extra headroom requested is the shortfall rounded up to
 * a multiple of NET_SKB_PAD. Returns 0 if nothing had to be done, otherwise
 * the result of pskb_expand_head().
 */
static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
                            int cloned)
{
        int shortfall = 0;

        if (skb_headroom(skb) < headroom)
                shortfall = headroom - skb_headroom(skb);

        if (!shortfall && !cloned)
                return 0;

        return pskb_expand_head(skb, ALIGN(shortfall, NET_SKB_PAD), 0,
                                GFP_ATOMIC);
}

/**
 *        skb_cow - copy header of skb when it is required
 *        @skb: buffer to cow
 *        @headroom: needed headroom
 *
 *        If the skb passed lacks sufficient headroom or its data part
 *        is shared, data is reallocated. If reallocation fails, an error
 *        is returned and original skb is not changed.
 *
 *        The result is skb with writable area skb->head...skb->tail
 *        and at least @headroom of space at head.
 *
 *        "Shared" is decided by skb_cloned(); the work is done by __skb_cow().
 */
static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
{
        return __skb_cow(skb, headroom, skb_cloned(skb));
}

/**
 *        skb_cow_head - skb_cow but only making the head writable
 *        @skb: buffer to cow
 *        @headroom: needed headroom
 *
 *        This function is identical to skb_cow except that we replace the
 *        skb_cloned check by skb_header_cloned.  It should be used when
 *        you only need to push on some header and do not need to modify
 *        the data.
 *
 *        Returns the result of __skb_cow().
 */
static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
{
        return __skb_cow(skb, headroom, skb_header_cloned(skb));
}

/**
 * skb_padto - pad an skbuff up to a minimal size
 * @skb: buffer to pad
 * @len: minimal length
 *
 * Pads up a buffer to ensure the trailing bytes exist and are
 * blanked. If the buffer already contains sufficient data it
 * is untouched. Otherwise skb_pad() is asked to add the missing
 * bytes. Returns zero on success. The skb is freed on error.
 */
static inline int skb_padto(struct sk_buff *skb, unsigned int len)
{
        unsigned int cur_len = skb->len;

        if (unlikely(cur_len < len))
                return skb_pad(skb, len - cur_len);

        return 0;
}

/**
 * __skb_put_padto - increase size and pad an skbuff up to a minimal size
 * @skb: buffer to pad
 * @len: minimal length
 * @free_on_error: free buffer on error
 *
 * Pads up a buffer to ensure the trailing bytes exist and are
 * blanked. If the buffer already contains sufficient data it
 * is untouched. Otherwise it is padded and its length extended by the
 * missing amount. Returns zero on success, -ENOMEM if padding failed.
 * The skb is freed on error if @free_on_error is true.
 */
static inline int __must_check __skb_put_padto(struct sk_buff *skb,
                                               unsigned int len,
                                               bool free_on_error)
{
        unsigned int cur_len = skb->len;
        unsigned int missing;

        if (likely(cur_len >= len))
                return 0;

        missing = len - cur_len;
        if (__skb_pad(skb, missing, free_on_error))
                return -ENOMEM;

        /* Padding bytes are in place; account for them in the length. */
        __skb_put(skb, missing);
        return 0;
}

/**
 *        skb_put_padto - increase size and pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked. If the buffer already contains sufficient data it
 *        is untouched. Otherwise it is extended. Returns zero on
 *        success. The skb is freed on error.
 *
 *        Equivalent to __skb_put_padto() with @free_on_error set to true.
 */
static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len)
{
        return __skb_put_padto(skb, len, true);
}

bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i)
        __must_check;

/*
 * Tell whether data at (@netmem, @off) directly continues frag @i - 1 of
 * @skb, i.e. it lives in the same netmem and starts exactly where that
 * frag ends. Always false for a zerocopy skb or when @i is 0.
 */
static inline bool skb_can_coalesce_netmem(struct sk_buff *skb, int i,
                                           netmem_ref netmem, int off)
{
        const skb_frag_t *prev;

        if (skb_zcopy(skb) || !i)
                return false;

        prev = &skb_shinfo(skb)->frags[i - 1];

        return netmem == skb_frag_netmem(prev) &&
               off == skb_frag_off(prev) + skb_frag_size(prev);
}

/*
 * Page-based variant of skb_can_coalesce_netmem(): converts @page with
 * page_to_netmem() and forwards the remaining arguments unchanged.
 */
static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
                                    const struct page *page, int off)
{
        return skb_can_coalesce_netmem(skb, i, page_to_netmem(page), off);
}

/*
 * Pull all skb->data_len bytes of non-linear data into the linear area.
 * Returns 0 on success, -ENOMEM if __pskb_pull_tail() returns NULL.
 */
static inline int __skb_linearize(struct sk_buff *skb)
{
        if (!__pskb_pull_tail(skb, skb->data_len))
                return -ENOMEM;

        return 0;
}

/**
 * skb_linearize - convert paged skb to linear one
 * @skb: buffer to linearize
 *
 * An already linear skb is left alone and zero is returned.
 * If there is no free memory -ENOMEM is returned, otherwise zero
 * is returned and the old skb data released.
 */
static inline int skb_linearize(struct sk_buff *skb)
{
        if (!skb_is_nonlinear(skb))
                return 0;

        return __skb_linearize(skb);
}

/**
 * skb_has_shared_frag - can any frag be overwritten
 * @skb: buffer to test
 *
 * Return: true if the skb has at least one frag that might be modified
 * by an external entity (as in vmsplice()/sendfile())
 */
static inline bool skb_has_shared_frag(const struct sk_buff *skb)
{
        return skb_is_nonlinear(skb) &&
               skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
}

/**
 * skb_linearize_cow - make sure skb is linear and writable
 * @skb: buffer to process
 *
 * A linear, non-cloned skb is left alone and zero is returned.
 * If there is no free memory -ENOMEM is returned, otherwise zero
 * is returned and the old skb data released.
 */
static inline int skb_linearize_cow(struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb) || skb_cloned(skb))
                return __skb_linearize(skb);

        return 0;
}

/*
 * Checksum fix-up after @len bytes starting at @start were pulled from
 * @skb; @off is the block offset passed to csum_block_sub().
 *
 * CHECKSUM_COMPLETE: the checksum of the pulled bytes is subtracted from
 * skb->csum. CHECKSUM_PARTIAL: if the checksum start now lies before
 * skb->data, ip_summed is downgraded to CHECKSUM_NONE.
 */
static __always_inline void
__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        switch (skb->ip_summed) {
        case CHECKSUM_COMPLETE:
                skb->csum = csum_block_sub(skb->csum,
                                           csum_partial(start, len, 0), off);
                break;
        case CHECKSUM_PARTIAL:
                if (skb_checksum_start_offset(skb) < 0)
                        skb->ip_summed = CHECKSUM_NONE;
                break;
        default:
                break;
        }
}

/**
 *        skb_postpull_rcsum - update checksum for received skb after pull
 *        @skb: buffer to update
 *        @start: start of data before pull
 *        @len: length of data pulled
 *
 *        After doing a pull on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum, or set ip_summed to
 *        CHECKSUM_NONE so that it can be recomputed from scratch.
 */
static inline void skb_postpull_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = wsum_negate(csum_partial(start, len,
                                                     wsum_negate(skb->csum)));
        else if (skb->ip_summed == CHECKSUM_PARTIAL &&
                 skb_checksum_start_offset(skb) < 0)
                skb->ip_summed = CHECKSUM_NONE;
}

/*
 * Checksum fix-up after @len bytes starting at @start were pushed onto
 * @skb; @off is the block offset passed to csum_block_add(). Only acts on
 * CHECKSUM_COMPLETE skbs, adding the checksum of the new bytes to skb->csum.
 */
static __always_inline void
__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->csum = csum_block_add(skb->csum,
                                   csum_partial(start, len, 0), off);
}

/**
 *        skb_postpush_rcsum - update checksum for received skb after push
 *        @skb: buffer to update
 *        @start: start of data after push
 *        @len: length of data pushed
 *
 *        After doing a push on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum.
 *
 *        Calls __skb_postpush_rcsum() with a block offset of 0.
 */
static inline void skb_postpush_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        __skb_postpush_rcsum(skb, start, len, 0);
}

void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);

/**
 * skb_push_rcsum - push skb and update receive checksum
 * @skb: buffer to update
 * @len: length of data pushed
 *
 * This function performs an skb_push on the packet and updates
 * the CHECKSUM_COMPLETE checksum.  It should be used on
 * receive path processing instead of skb_push unless you know
 * that the checksum difference is zero (e.g., a valid IP header)
 * or you are setting ip_summed to CHECKSUM_NONE.
 *
 * Returns: skb->data as it is after the push.
 */
static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)
{
        unsigned char *pushed;

        skb_push(skb, len);
        pushed = skb->data;
        skb_postpush_rcsum(skb, pushed, len);

        return pushed;
}

int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
/**
 * pskb_trim_rcsum - trim received skb and update checksum
 * @skb: buffer to trim
 * @len: new length
 *
 * This is exactly the same as pskb_trim except that it ensures the
 * checksum of received packets are still valid after the operation.
 * It can change skb pointers.
 *
 * Returns 0 when @skb is not longer than @len (nothing to trim), otherwise
 * the result of pskb_trim_rcsum_slow().
 */
static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        skb_might_realloc(skb);

        if (unlikely(len < skb->len))
                return pskb_trim_rcsum_slow(skb, len);

        return 0;
}

/* Trim @skb to @len with __skb_trim().  A CHECKSUM_COMPLETE skb is first
 * downgraded to CHECKSUM_NONE rather than having its checksum recomputed.
 * Always returns 0.
 */
static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        const bool had_complete = skb->ip_summed == CHECKSUM_COMPLETE;

        if (had_complete)
                skb->ip_summed = CHECKSUM_NONE;

        __skb_trim(skb, len);

        return 0;
}

/* Grow @skb to @len with __skb_grow() and return its result.  A
 * CHECKSUM_COMPLETE skb is first downgraded to CHECKSUM_NONE.
 */
static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
{
        const bool had_complete = skb->ip_summed == CHECKSUM_COMPLETE;

        if (had_complete)
                skb->ip_summed = CHECKSUM_NONE;

        return __skb_grow(skb, len);
}

/* rbtree helpers for skbs linked through their rbnode member.
 * rb_to_skb() converts an rb_node pointer to its containing sk_buff via
 * rb_entry_safe(); the other macros apply that conversion to the result of
 * rb_first()/rb_last()/rb_next()/rb_prev().
 */
#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root)  rb_to_skb(rb_last(root))
#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))

/* Forward walk over @queue starting at (queue)->next.  The loop ends when
 * @skb equals the queue head itself (cast to struct sk_buff *).
 */
#define skb_queue_walk(queue, skb) \
                for (skb = (queue)->next;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->next)

/* Like skb_queue_walk(), but the successor is saved in @tmp before the loop
 * body runs, so the iteration does not read skb->next after the body.
 */
#define skb_queue_walk_safe(queue, skb, tmp)                                        \
                for (skb = (queue)->next, tmp = skb->next;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/* Continue a forward walk from the current value of @skb (no initialisation). */
#define skb_queue_walk_from(queue, skb)                                                \
                for (; skb != (struct sk_buff *)(queue);                        \
                     skb = skb->next)

/* In-order walk over an skb rbtree, from skb_rb_first(root) until NULL. */
#define skb_rbtree_walk(skb, root)                                                \
                for (skb = skb_rb_first(root); skb != NULL;                        \
                     skb = skb_rb_next(skb))

/* Continue an rbtree walk from the current value of @skb. */
#define skb_rbtree_walk_from(skb)                                                \
                for (; skb != NULL;                                                \
                     skb = skb_rb_next(skb))

/* Continue an rbtree walk from @skb; @tmp is loaded with the successor (or
 * NULL) in the loop condition, i.e. before each execution of the body.
 */
#define skb_rbtree_walk_from_safe(skb, tmp)                                        \
                for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);        \
                     skb = tmp)

/* Continue a forward queue walk from @skb with the successor saved in @tmp. */
#define skb_queue_walk_from_safe(queue, skb, tmp)                                \
                for (tmp = skb->next;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/* Backward walk over @queue starting at (queue)->prev. */
#define skb_queue_reverse_walk(queue, skb) \
                for (skb = (queue)->prev;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->prev)

/* Backward walk with the predecessor saved in @tmp before the body runs. */
#define skb_queue_reverse_walk_safe(queue, skb, tmp)                                \
                for (skb = (queue)->prev, tmp = skb->prev;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->prev)

/* Continue a backward walk from @skb with the predecessor saved in @tmp. */
#define skb_queue_reverse_walk_from_safe(queue, skb, tmp)                        \
                for (tmp = skb->prev;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->prev)

/* True when the skb's shared info carries a non-NULL frag_list. */
static inline bool skb_has_frag_list(const struct sk_buff *skb)
{
        if (skb_shinfo(skb)->frag_list)
                return true;

        return false;
}

/* Reset the skb's frag_list pointer (in its shared info) to NULL.  The
 * previous value is simply overwritten; nothing is freed here.
 */
static inline void skb_frag_list_init(struct sk_buff *skb)
{
        skb_shinfo(skb)->frag_list = NULL;
}

/* Iterate @iter over the skbs chained on @skb's frag_list, following ->next
 * until a NULL pointer is reached.
 */
#define skb_walk_frags(skb, iter)        \
        for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)


/* Datagram receive, poll and copy-to-iterator entry points.  Only the
 * prototypes are visible in this header; see the definitions for semantics.
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
                                int *err, long *timeo_p,
                                const struct sk_buff *skb);
struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
                                          unsigned int flags,
                                          int *off, int *err,
                                          struct sk_buff **last);
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
                                        struct sk_buff_head *queue,
                                        unsigned int flags, int *off, int *err,
                                        struct sk_buff **last);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
                                    struct sk_buff_head *sk_queue,
                                    unsigned int flags, int *off, int *err);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err);
__poll_t datagram_poll_queue(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait,
                             struct sk_buff_head *rcv_queue);
__poll_t datagram_poll(struct file *file, struct socket *sock,
                           struct poll_table_struct *wait);
int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
                           struct iov_iter *to, int size);
/* Convenience wrapper: copy @size bytes of @from, starting at @offset, into
 * the iov_iter embedded in @msg.  Returns whatever skb_copy_datagram_iter()
 * returns.
 */
static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
                                        struct msghdr *msg, int size)
{
        struct iov_iter *to = &msg->msg_iter;

        return skb_copy_datagram_iter(from, offset, to, size);
}
/* Out-of-line skb helpers: datagram copy/checksum, bit copy/store, splice and
 * send, zerocopy, split/shift/segment, and VLAN/Ethernet/MPLS header
 * push/pop.  Only the prototypes are visible here.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
                                   struct msghdr *msg);
int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
                                      struct iov_iter *to, int len, u32 *crcp);
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
                                 struct iov_iter *from, int len);
int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset,
                                     struct iov_iter *from, int len);
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
                              int len);
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int len,
                    unsigned int flags);
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len);
int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
                                    int offset, int len, int flags);
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
                 int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features,
                                 unsigned int offset);
struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len);
int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev);
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
int skb_vlan_pop(struct sk_buff *skb);
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
int skb_eth_pop(struct sk_buff *skb);
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src);
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet);
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet);
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
int skb_mpls_dec_ttl(struct sk_buff *skb);
struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
                             gfp_t gfp);

/* Copy exactly @len bytes out of @msg's iterator into @data.
 * Returns 0 on success, or -EFAULT if copy_from_iter_full() reports that the
 * full copy could not be done.
 */
static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
{
        if (!copy_from_iter_full(data, len, &msg->msg_iter))
                return -EFAULT;

        return 0;
}

/* Copy @len bytes from @data into @msg's iterator.
 * Returns 0 when copy_to_iter() reports exactly @len bytes copied, otherwise
 * -EFAULT.
 */
static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
{
        if (copy_to_iter(data, len, &msg->msg_iter) != len)
                return -EFAULT;

        return 0;
}

/* Out-of-line checksum/CRC32C computation over @len bytes of @skb starting at
 * @offset; the last argument is the caller-supplied initial value.
 * NOTE(review): seed semantics inferred from the parameter names - confirm
 * against the definitions.
 */
__wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
                    __wsum csum);
u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc);

/* Return a pointer to @len bytes of packet data at @offset.
 *
 * If the range lies within the first @hlen bytes addressed by @data, a
 * pointer straight into @data is returned.  Otherwise the bytes are copied
 * out of @skb into @buffer with skb_copy_bits() and @buffer is returned.
 * Returns NULL when that copy is needed but @skb is NULL or the copy fails.
 */
static inline void * __must_check
__skb_header_pointer(const struct sk_buff *skb, int offset, int len,
                     const void *data, int hlen, void *buffer)
{
        if (unlikely(hlen - offset < len)) {
                /* Slow path: the range is not fully covered by @data. */
                if (!skb)
                        return NULL;

                if (unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
                        return NULL;

                return buffer;
        }

        return (void *)data + offset;
}

/* __skb_header_pointer() applied to @skb's own linear area: skb->data with
 * skb_headlen(skb) bytes.  @buffer receives a copy when the requested range
 * is not fully within that area.
 */
static inline void * __must_check
skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
{
        const void *linear = skb->data;
        int linear_len = skb_headlen(skb);

        return __skb_header_pointer(skb, offset, len, linear, linear_len,
                                    buffer);
}

/* Return skb->data + @offset when @len bytes from @offset fit within
 * skb_headlen(skb); otherwise return NULL.  Nothing is copied.
 */
static inline void * __must_check
skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len)
{
        if (unlikely(skb_headlen(skb) - offset < len))
                return NULL;

        return skb->data + offset;
}

/**
 *        skb_needs_linearize - check if we need to linearize a given skb
 *                              depending on the given device features.
 *        @skb: socket buffer to check
 *        @features: net device features
 *
 *        A linear skb never needs linearizing.  For a nonlinear skb, returns
 *        true if either:
 *        1. skb has frag_list and the device doesn't support FRAGLIST, or
 *        2. skb is fragmented and the device does not support SG.
 */
static inline bool skb_needs_linearize(struct sk_buff *skb,
                                       netdev_features_t features)
{
        if (!skb_is_nonlinear(skb))
                return false;

        if (skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST))
                return true;

        return skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG);
}

/* memcpy() @len bytes from the start of the skb's data (skb->data) to @to.
 * No bounds checking is done here.
 */
static inline void skb_copy_from_linear_data(const struct sk_buff *skb,
                                             void *to,
                                             const unsigned int len)
{
        const void *src = skb->data;

        memcpy(to, src, len);
}

/* memcpy() @len bytes from skb->data + @offset to @to.  No bounds checking
 * is done here.
 */
static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
                                                    const int offset, void *to,
                                                    const unsigned int len)
{
        const void *src = skb->data + offset;

        memcpy(to, src, len);
}

/* memcpy() @len bytes from @from to the start of the skb's data (skb->data).
 * No bounds checking is done here.
 */
static inline void skb_copy_to_linear_data(struct sk_buff *skb,
                                           const void *from,
                                           const unsigned int len)
{
        void *dst = skb->data;

        memcpy(dst, from, len);
}

/* memcpy() @len bytes from @from to skb->data + @offset.  No bounds checking
 * is done here.
 */
static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
                                                  const int offset,
                                                  const void *from,
                                                  const unsigned int len)
{
        void *dst = skb->data + offset;

        memcpy(dst, from, len);
}

/* Defined out of line; only the prototype is visible here. */
void skb_init(void);

/* Return the skb's raw tstamp value, regardless of skb->tstamp_type. */
static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
{
        return skb->tstamp;
}

/**
 *        skb_get_timestamp - get timestamp from a skb
 *        @skb: skb to get stamp from
 *        @stamp: pointer to struct __kernel_old_timeval to store stamp in
 *
 *        Converts skb->tstamp to a struct __kernel_old_timeval with
 *        ns_to_kernel_old_timeval() and stores the result in @stamp.
 */
static inline void skb_get_timestamp(const struct sk_buff *skb,
                                     struct __kernel_old_timeval *stamp)
{
        *stamp = ns_to_kernel_old_timeval(skb->tstamp);
}

static inline void skb_get_new_timestamp(const struct sk_buff *skb,
                                         struct __kernel_sock_timeval *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_usec = ts.tv_nsec / 1000;
}

static inline void skb_get_timestampns(const struct sk_buff *skb,
                                       struct __kernel_old_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

static inline void skb_get_new_timestampns(const struct sk_buff *skb,
                                           struct __kernel_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

static inline void __net_timestamp(struct sk_buff *skb)
{
        skb->tstamp = ktime_get_real();
        skb->tstamp_type = SKB_CLOCK_REALTIME;
}

/* Return the current ktime_get_real() time minus @t. */
static inline ktime_t net_timedelta(ktime_t t)
{
        const ktime_t now = ktime_get_real();

        return ktime_sub(now, t);
}

/* Set skb->tstamp to @kt.  A non-zero @kt is tagged with @tstamp_type; a zero
 * @kt resets the type to SKB_CLOCK_REALTIME.
 */
static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt,
                                         u8 tstamp_type)
{
        skb->tstamp = kt;
        skb->tstamp_type = kt ? tstamp_type : SKB_CLOCK_REALTIME;
}

/* Translate @clockid into the matching SKB_CLOCK_* type and hand @kt plus
 * that type to skb_set_delivery_time().  An unrecognised @clockid triggers a
 * one-time warning and is treated as "no delivery time": @kt is forced to 0
 * and the type is SKB_CLOCK_REALTIME.
 */
static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb,
                                                    ktime_t kt, clockid_t clockid)
{
        u8 type;

        switch (clockid) {
        case CLOCK_MONOTONIC:
                type = SKB_CLOCK_MONOTONIC;
                break;
        case CLOCK_TAI:
                type = SKB_CLOCK_TAI;
                break;
        case CLOCK_REALTIME:
                type = SKB_CLOCK_REALTIME;
                break;
        default:
                WARN_ON_ONCE(1);
                kt = 0;
                type = SKB_CLOCK_REALTIME;
                break;
        }

        skb_set_delivery_time(skb, kt, type);
}

DECLARE_STATIC_KEY_FALSE(netstamp_needed_key);

/* Used in the ingress path to clear the delivery_time.
 * Only acts when skb->tstamp_type is non-zero: the type is reset to
 * SKB_CLOCK_REALTIME and skb->tstamp becomes the current real time if
 * netstamp_needed_key is enabled, or 0 otherwise.
 */
static inline void skb_clear_delivery_time(struct sk_buff *skb)
{
        if (!skb->tstamp_type)
                return;

        skb->tstamp_type = SKB_CLOCK_REALTIME;

        if (static_branch_unlikely(&netstamp_needed_key))
                skb->tstamp = ktime_get_real();
        else
                skb->tstamp = 0;
}

/* Zero skb->tstamp, but only when skb->tstamp_type is 0; a timestamp with a
 * non-zero type is left untouched.
 */
static inline void skb_clear_tstamp(struct sk_buff *skb)
{
        if (!skb->tstamp_type)
                skb->tstamp = 0;
}

/* Return skb->tstamp when skb->tstamp_type is 0, and 0 for any other
 * timestamp type.
 */
static inline ktime_t skb_tstamp(const struct sk_buff *skb)
{
        return skb->tstamp_type ? 0 : skb->tstamp;
}

/* Return a timestamp for @skb:
 *  - the stored skb->tstamp, if it is non-zero and its type is not
 *    SKB_CLOCK_MONOTONIC;
 *  - otherwise the current real time, if netstamp_needed_key is enabled or
 *    @cond is true;
 *  - otherwise 0.
 */
static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond)
{
        const bool have_stamp = skb->tstamp_type != SKB_CLOCK_MONOTONIC &&
                                skb->tstamp;

        if (have_stamp)
                return skb->tstamp;

        if (static_branch_unlikely(&netstamp_needed_key) || cond)
                return ktime_get_real();

        return 0;
}

/* Return the metadata length recorded in the skb's shared info (meta_len). */
static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->meta_len;
}

/* Return the end of the metadata area, which is the skb's MAC header
 * pointer; __skb_metadata_differs() reads the metadata bytes directly below
 * this address.
 */
static inline void *skb_metadata_end(const struct sk_buff *skb)
{
        return skb_mac_header(skb);
}

/* Compare the @meta_len bytes that end at skb_metadata_end() in @skb_a and
 * @skb_b.
 *
 * Returns true if the two areas differ, false if they are identical.
 *
 * On 64-bit builds with CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, lengths that
 * are a multiple of 4 up to 32 are compared with unrolled word loads,
 * walking backwards from the end of the area; every other case falls back
 * to memcmp().
 */
static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
                                          const struct sk_buff *skb_b,
                                          u8 meta_len)
{
        const void *a = skb_metadata_end(skb_a);
        const void *b = skb_metadata_end(skb_b);
        u64 diffs = 0;

        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
            BITS_PER_LONG != 64)
                goto slow;

        /* Using more efficient variant than plain call to memcmp(). */
        switch (meta_len) {
/* Step pointer x back by one u<op> word and yield the new pointer. */
#define __it(x, op) ((x) -= sizeof(u##op))
/* XOR of the next lower u<op> word of both areas; non-zero on mismatch. */
#define __it_diff(a, b, op) \
        ((*(const u##op *)__it(a, op)) ^ (*(const u##op *)__it(b, op)))
        /* Lengths that are a multiple of 8: u64 words only. */
        case 32: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 24: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 16: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  8: diffs |= __it_diff(a, b, 64);
                break;
        /* Lengths of 8k + 4: u64 words, then one u32 for the lowest 4 bytes. */
        case 28: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 20: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 12: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  4: diffs |= __it_diff(a, b, 32);
                break;
        default:
slow:
                /* a and b still point at the end of the area here. */
                return memcmp(a - meta_len, b - meta_len, meta_len);
        }
/* Keep the helper macros local: this header is included widely. */
#undef __it_diff
#undef __it
        return diffs;
}

/* Report whether the metadata of @skb_a and @skb_b differ.
 * Two skbs without metadata are equal; different metadata lengths always
 * differ; equal non-zero lengths are compared by __skb_metadata_differs().
 */
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
                                        const struct sk_buff *skb_b)
{
        u8 len_a = skb_metadata_len(skb_a);
        u8 len_b = skb_metadata_len(skb_b);

        if (!(len_a | len_b))
                return false;

        if (len_a != len_b)
                return true;

        return __skb_metadata_differs(skb_a, skb_b, len_a);
}

/* Record @meta_len as the metadata length in the skb's shared info.  Only
 * the length field is written; no data is moved.
 */
static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
{
        skb_shinfo(skb)->meta_len = meta_len;
}

/* Mark the skb as carrying no metadata by setting its metadata length to 0. */
static inline void skb_metadata_clear(struct sk_buff *skb)
{
        skb_metadata_set(skb, 0);
}

struct sk_buff *skb_clone_sk(struct sk_buff *skb);

/* PHY timestamping hooks.  With CONFIG_NETWORK_PHY_TIMESTAMPING they are
 * real out-of-line functions; without it they compile to stubs:
 * skb_clone_tx_timestamp() does nothing and skb_defer_rx_timestamp() always
 * returns false.
 */
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING

void skb_clone_tx_timestamp(struct sk_buff *skb);
bool skb_defer_rx_timestamp(struct sk_buff *skb);

#else /* CONFIG_NETWORK_PHY_TIMESTAMPING */

static inline void skb_clone_tx_timestamp(struct sk_buff *skb)
{
}

static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
        return false;
}

#endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */

/**
 * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
 *
 * PHY drivers may accept clones of transmitted packets for
 * timestamping via their phy_driver.txtstamp method. These drivers
 * must call this function to return the skb back to the stack with a
 * timestamp.
 *
 * @skb: clone of the original outgoing packet
 * @hwtstamps: hardware time stamps
 *
 */
void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps);

/* Lower-level variant of skb_tstamp_tx() taking an explicit socket, an
 * optional ack skb and a timestamp type.
 * NOTE(review): semantics inferred from the parameter names only - confirm
 * against the definition.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype);

/**
 * skb_tstamp_tx - queue clone of skb with send time stamps
 * @orig_skb:        the original outgoing packet
 * @hwtstamps:        hardware time stamps, may be NULL if not available
 *
 * If the skb has a socket associated, then this function clones the
 * skb (thus sharing the actual data and optional structures), stores
 * the optional hardware time stamping information (if non NULL) or
 * generates a software time stamp (otherwise), then queues the clone
 * to the error queue of the socket.  Errors are silently ignored.
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps);

/**
 * skb_tx_timestamp() - Driver hook for transmit timestamping
 *
 * Ethernet MAC Drivers should call this function in their hard_xmit()
 * function immediately before giving the sk_buff to the MAC hardware.
 *
 * Specifically, one should make absolutely sure that this function is
 * called before TX completion of this packet can trigger.  Otherwise
 * the packet could potentially already be freed.
 *
 * The skb is first offered to skb_clone_tx_timestamp(); afterwards a
 * software timestamp is requested through skb_tstamp_tx() when either
 * SKBTX_SW_TSTAMP or SKBTX_BPF is set in the skb's tx_flags.
 *
 * @skb: A socket buffer.
 */
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
        skb_clone_tx_timestamp(skb);

        if (!(skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF)))
                return;

        skb_tstamp_tx(skb, NULL);
}

/**
 * skb_complete_wifi_ack - deliver skb with wifi status
 *
 * @skb: the original outgoing packet
 * @acked: ack status
 *
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked);

/* Out-of-line checksum completion helpers.  __skb_checksum_complete() is the
 * slow path used by skb_checksum_complete() and
 * __skb_checksum_validate_complete() in this header.
 */
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
__sum16 __skb_checksum_complete(struct sk_buff *skb);

/* Return 1 when no software checksum verification is required for @skb,
 * 0 otherwise.  That is the case when ip_summed is CHECKSUM_UNNECESSARY,
 * when csum_valid is already set, or when ip_summed is CHECKSUM_PARTIAL
 * with a non-negative checksum start offset.
 */
static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_UNNECESSARY)
                return 1;

        if (skb->csum_valid)
                return 1;

        return skb->ip_summed == CHECKSUM_PARTIAL &&
               skb_checksum_start_offset(skb) >= 0;
}

/**
 *        skb_checksum_complete - Calculate checksum of an entire packet
 *        @skb: packet to process
 *
 *        This function calculates the checksum over the entire packet plus
 *        the value of skb->csum.  The latter can be used to supply the
 *        checksum of a pseudo header as used by TCP/UDP.  It returns the
 *        checksum.
 *
 *        For protocols that contain complete checksums such as ICMP/TCP/UDP,
 *        this function can be used to verify that checksum on received
 *        packets.  In that case the function should return zero if the
 *        checksum is correct.  In particular, this function returns zero
 *        without computing anything whenever skb_csum_unnecessary() is true,
 *        e.g. if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that
 *        the hardware has already verified the correctness of the checksum.
 */
static inline __sum16 skb_checksum_complete(struct sk_buff *skb)
{
        if (skb_csum_unnecessary(skb))
                return 0;

        return __skb_checksum_complete(skb);
}

/* Consume one CHECKSUM_UNNECESSARY level: decrement csum_level, or drop to
 * CHECKSUM_NONE when the level is already 0.  Skbs in any other ip_summed
 * state are left alone.
 */
static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        if (skb->csum_level)
                skb->csum_level--;
        else
                skb->ip_summed = CHECKSUM_NONE;
}

/* Add one CHECKSUM_UNNECESSARY level: a CHECKSUM_NONE skb becomes
 * CHECKSUM_UNNECESSARY at level 0, an already-UNNECESSARY skb has csum_level
 * bumped unless it has reached SKB_MAX_CSUM_LEVEL.  Other ip_summed states
 * are left alone.
 */
static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_NONE) {
                skb->ip_summed = CHECKSUM_UNNECESSARY;
                skb->csum_level = 0;
                return;
        }

        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
                skb->csum_level++;
}

/* Turn a CHECKSUM_UNNECESSARY skb back into CHECKSUM_NONE and clear its
 * csum_level.  Other ip_summed states are left alone.
 */
static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        skb->ip_summed = CHECKSUM_NONE;
        skb->csum_level = 0;
}

/* Check if we need to perform checksum complete validation.
 *
 * Returns: true if checksum complete is needed, false otherwise
 * (either checksum is unnecessary or zero checksum is allowed).
 *
 * In the "not needed" case the skb is marked csum_valid and one
 * CHECKSUM_UNNECESSARY level is consumed.
 */
static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
                                                  bool zero_okay,
                                                  __sum16 check)
{
        if (!skb_csum_unnecessary(skb) && (!zero_okay || check))
                return true;

        skb->csum_valid = 1;
        __skb_decr_checksum_unnecessary(skb);

        return false;
}

/* For small packets (skb->len <= CHECKSUM_BREAK) the full checksum is
 * computed directly during checksum init, even when the caller did not ask
 * for complete validation; see __skb_checksum_validate_complete().
 */
#define CHECKSUM_BREAK 76

/* Unset checksum-complete
 *
 * To be used when the packet is being modified (uncompressed, for instance)
 * and the checksum-complete value is thereby invalidated: a
 * CHECKSUM_COMPLETE skb is downgraded to CHECKSUM_NONE, any other state is
 * kept as is.
 */
static inline void skb_checksum_complete_unset(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->ip_summed = CHECKSUM_NONE;
}

/* Validate (init) checksum based on checksum complete.
 *
 * Return values:
 *   0: checksum is validated or try to in skb_checksum_complete. In the latter
 *        case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo
 *        checksum is stored in skb->csum for use in __skb_checksum_complete
 *   non-zero: value of invalid checksum
 *
 * The full computation is done right here when @complete is set or when the
 * packet is no longer than CHECKSUM_BREAK bytes.
 */
static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
                                                       bool complete,
                                                       __wsum psum)
{
        __sum16 folded;

        /* Fast path: the device-provided sum plus pseudo header folds to 0. */
        if (skb->ip_summed == CHECKSUM_COMPLETE &&
            !csum_fold(csum_add(psum, skb->csum))) {
                skb->csum_valid = 1;
                return 0;
        }

        skb->csum = psum;

        /* Defer the computation for large packets unless asked otherwise. */
        if (!complete && skb->len > CHECKSUM_BREAK)
                return 0;

        folded = __skb_checksum_complete(skb);
        skb->csum_valid = !folded;

        return folded;
}

/* Pseudo-header callback for checksums that have no pseudo header: always
 * returns 0 and ignores both arguments.  Used by
 * skb_checksum_simple_validate().
 */
static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
{
        return 0;
}

/* Perform checksum validate (init). Note that this is a macro since we only
 * want to calculate the pseudo header which is an input function if necessary.
 * First we try to validate without any computation (checksum unnecessary) and
 * then calculate based on checksum complete calling the function to compute
 * pseudo header.
 *
 * @skb is evaluated more than once, so it must be free of side effects.
 * compute_pseudo(skb, proto) is only invoked when
 * __skb_checksum_validate_needed() returns true.
 *
 * Return values:
 *   0: checksum is validated or try to in skb_checksum_complete
 *   non-zero: value of invalid checksum
 */
#define __skb_checksum_validate(skb, proto, complete,                        \
                                zero_okay, check, compute_pseudo)        \
({                                                                        \
        __sum16 __ret = 0;                                                \
        (skb)->csum_valid = 0;                                                \
        if (__skb_checksum_validate_needed(skb, zero_okay, check))        \
                __ret = __skb_checksum_validate_complete(skb,                \
                                complete, compute_pseudo(skb, proto));        \
        __ret;                                                                \
})

/* Convenience wrappers around __skb_checksum_validate().
 *
 *   name                               complete  zero_okay
 *   skb_checksum_init                  false     false
 *   skb_checksum_init_zero_check       false     true
 *   skb_checksum_validate              true      false
 *   skb_checksum_validate_zero_check   true      true
 *   skb_checksum_simple_validate       true      false  (no pseudo header)
 */
#define skb_checksum_init(skb, proto, compute_pseudo)                        \
        __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo)

#define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo)        \
        __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo)

#define skb_checksum_validate(skb, proto, compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo)

#define skb_checksum_validate_zero_check(skb, proto, check,                \
                                         compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo)

#define skb_checksum_simple_validate(skb)                                \
        __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo)

/* True when @skb qualifies for conversion to CHECKSUM_COMPLETE: ip_summed is
 * CHECKSUM_NONE and the checksum has already been marked valid.
 */
static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_NONE)
                return false;

        return skb->csum_valid;
}

/* Switch @skb to CHECKSUM_COMPLETE, storing the bitwise complement of
 * @pseudo as skb->csum.
 */
static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
{
        const __wsum inverted = ~pseudo;

        skb->csum = inverted;
        skb->ip_summed = CHECKSUM_COMPLETE;
}

/* Convert @skb to CHECKSUM_COMPLETE if __skb_checksum_convert_check() allows
 * it.  compute_pseudo(skb, proto) is only evaluated in that case.
 */
#define skb_checksum_try_convert(skb, proto, compute_pseudo)        \
do {                                                                        \
        if (__skb_checksum_convert_check(skb))                                \
                __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \
} while (0)

/* Set up @skb as CHECKSUM_PARTIAL for remote checksum offload:
 * csum_start becomes the distance from skb->head to @ptr + @start, and
 * csum_offset becomes @offset - @start.
 */
static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
                                              u16 start, u16 offset)
{
        unsigned char *csum_begin = (unsigned char *)ptr + start;

        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = csum_begin - skb->head;
        skb->csum_offset = offset - start;
}

/* Update skbuf and packet to reflect the remote checksum offload operation.
 * When called, ptr indicates the starting point for skb->csum when
 * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete
 * here, skb_postpull_rcsum is done so skb->csum start is ptr.
 *
 * With @nopartial false the skb is only set up as CHECKSUM_PARTIAL via
 * skb_remcsum_adjust_partial() and nothing else is touched.
 */
static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
                                       int start, int offset, bool nopartial)
{
        __wsum delta;

        if (!nopartial) {
                skb_remcsum_adjust_partial(skb, ptr, start, offset);
                return;
        }

        /* No complete checksum yet: compute one, then drop the bytes in
         * front of ptr from it.
         */
        if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
                __skb_checksum_complete(skb);
                skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data);
        }

        delta = remcsum_adjust(ptr, skb->csum, start, offset);

        /* Adjust skb->csum since we changed the packet */
        skb->csum = csum_add(skb->csum, delta);
}

/* Return skb->_nfct masked with NFCT_PTRMASK as a conntrack pointer, or NULL
 * when CONFIG_NF_CONNTRACK is not enabled.
 */
static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        return (void *)(skb->_nfct & NFCT_PTRMASK);
#else
        return NULL;
#endif
}

/* Return the raw, unmasked skb->_nfct word, or 0 when CONFIG_NF_CONNTRACK is
 * not enabled.
 */
static inline unsigned long skb_get_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        return skb->_nfct;
#else
        return 0UL;
#endif
}

/* Store @nfct as the raw skb->_nfct word.  slow_gro is set when @nfct is
 * non-zero; it is only ever OR-ed here, never cleared.  No-op when
 * CONFIG_NF_CONNTRACK is not enabled.
 */
static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        skb->slow_gro |= !!nfct;
        skb->_nfct = nfct;
#endif
}

#ifdef CONFIG_SKB_EXTENSIONS
/* Identifiers of the available skb extensions.  Each entry exists only when
 * its config option is enabled, so the numeric values depend on the build
 * configuration.  SKB_EXT_NUM is the number of entries and sizes
 * skb_ext::offset[].
 */
enum skb_ext_id {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        SKB_EXT_BRIDGE_NF,
#endif
#ifdef CONFIG_XFRM
        SKB_EXT_SEC_PATH,
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        TC_SKB_EXT,
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        SKB_EXT_MPTCP,
#endif
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
        SKB_EXT_MCTP,
#endif
#if IS_ENABLED(CONFIG_INET_PSP)
        SKB_EXT_PSP,
#endif
        SKB_EXT_NUM, /* must be last */
};

/**
 *        struct skb_ext - sk_buff extensions
 *        @refcnt: 1 on allocation, deallocated on 0
 *        @offset: offset to add to @data to obtain extension address
 *        @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
 *        @data: start of extension data, variable sized
 *
 *        Note: offsets/lengths are stored in chunks of 8 bytes, this allows
 *        to use 'u8' types while allowing up to 2kb worth of extension data.
 *        skb_ext_find() converts a stored offset to bytes with "<< 3" and
 *        adds it to the address of the struct skb_ext itself.
 */
struct skb_ext {
        refcount_t refcnt;
        u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
        u8 chunks;                /* same */
        char data[] __aligned(8);
};

/* Out-of-line extension management; only the prototypes are visible here.
 * __skb_ext_del() and __skb_ext_put() are the slow paths behind the inline
 * skb_ext_del(), skb_ext_put() and skb_ext_reset() wrappers in this header.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags);
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext);
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_put(struct skb_ext *ext);

/* Pass the skb's extension area to __skb_ext_put() if the skb has any active
 * extension.  skb->active_extensions itself is not modified here.
 */
static inline void skb_ext_put(struct sk_buff *skb)
{
        if (!skb->active_extensions)
                return;

        __skb_ext_put(skb->extensions);
}

/* Make @dst share @src's extensions: copy the active_extensions mask and,
 * when it is non-zero, take an extra reference on @src's extension area and
 * point @dst at it.  Whatever @dst referenced before is not released here.
 */
static inline void __skb_ext_copy(struct sk_buff *dst,
                                  const struct sk_buff *src)
{
        struct skb_ext *shared;

        dst->active_extensions = src->active_extensions;
        if (!src->active_extensions)
                return;

        shared = src->extensions;
        refcount_inc(&shared->refcnt);
        dst->extensions = shared;
}

/* Replace @dst's extensions with a shared reference to @src's: first
 * skb_ext_put() is called on @dst, then __skb_ext_copy() copies the mask and
 * takes the new reference.
 */
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
{
        skb_ext_put(dst);
        __skb_ext_copy(dst, src);
}

/* True when extension @i has a non-zero offset recorded in @ext. */
static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
{
        return ext->offset[i] != 0;
}

/* True when bit @id is set in the skb's active_extensions mask. */
static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
{
        const int id_bit = 1 << id;

        return skb->active_extensions & id_bit;
}

/* Call __skb_ext_del() for @id, but only if @skb has that extension. */
static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        if (!skb_ext_exist(skb, id))
                return;

        __skb_ext_del(skb, id);
}

/*
 * Look up extension @id on @skb.
 *
 * Returns NULL when the extension is not active. Otherwise returns the
 * address of the extension data, computed from the start of the struct
 * skb_ext plus the stored offset (kept in 8-byte chunks, hence << 3).
 */
static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *area;

        if (!skb_ext_exist(skb, id))
                return NULL;

        area = skb->extensions;
        return (void *)area + (area->offset[id] << 3);
}

/*
 * Drop all extensions from @skb: pass the extension area to
 * __skb_ext_put() and clear the active_extensions mask. The common case
 * is expected to be "no extensions".
 */
static inline void skb_ext_reset(struct sk_buff *skb)
{
        if (!unlikely(skb->active_extensions))
                return;

        __skb_ext_put(skb->extensions);
        skb->active_extensions = 0;
}

/* True when @skb has at least one active extension (hinted as unlikely). */
static inline bool skb_has_extensions(struct sk_buff *skb)
{
        return unlikely(skb->active_extensions != 0);
}
#else
/* Extensions are compiled out: the helpers below do nothing. */
static inline void skb_ext_put(struct sk_buff *skb)
{ }

static inline void skb_ext_reset(struct sk_buff *skb)
{ }

static inline void skb_ext_del(struct sk_buff *skb, int unused)
{ }

static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s)
{ }

static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s)
{ }

/* Without extension support no skb can carry any. */
static inline bool skb_has_extensions(struct sk_buff *skb)
{
        return false;
}
#endif /* CONFIG_SKB_EXTENSIONS */

/*
 * Detach conntrack state from @skb: pass the current conntrack entry to
 * nf_conntrack_put() and clear skb->_nfct. The body is empty when
 * conntrack is neither built in nor built as a module.
 */
static inline void nf_reset_ct(struct sk_buff *skb)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(skb));
        skb->_nfct = 0;
#endif
}

/*
 * Clear skb->nf_trace. The field is only touched when the xt TRACE target
 * or nf_tables is enabled; otherwise this is a no-op.
 */
static inline void nf_reset_trace(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        skb->nf_trace = 0;
#endif
}

/* Clear skb->ipvs_property when CONFIG_IP_VS is enabled; otherwise a no-op. */
static inline void ipvs_reset(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_VS)
        skb->ipvs_property = 0;
#endif
}

/*
 * Copy netfilter state from @src to @dst: share the conntrack entry
 * (taking a reference via nf_conntrack_get()) and, when @copy is true,
 * also copy nf_trace.
 *
 * Note: conntrack info already attached to @dst is not put here.
 */
static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
                             bool copy)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_get(skb_nfct(src));
        dst->_nfct = src->_nfct;
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        if (copy) {
                dst->nf_trace = src->nf_trace;
        }
#endif
}

/*
 * Replace @dst's netfilter state with @src's: copy slow_gro, put the
 * conntrack entry @dst currently references, then let __nf_copy() share
 * @src's conntrack entry and copy nf_trace.
 */
static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
{
        dst->slow_gro = src->slow_gro;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(dst));
#endif
        __nf_copy(dst, src, true);
}

#ifdef CONFIG_NETWORK_SECMARK
/* Copy the secmark field from @from to @to. */
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{
        to->secmark = from->secmark;
}

/* Reset @skb's secmark to 0. */
static inline void skb_init_secmark(struct sk_buff *skb)
{
        skb->secmark = 0;
}
#else
/* Without CONFIG_NETWORK_SECMARK both helpers are no-ops. */
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{ }

static inline void skb_init_secmark(struct sk_buff *skb)
{ }
#endif

/*
 * Return non-zero when @skb carries the SKB_EXT_SEC_PATH extension.
 * Always returns 0 when CONFIG_XFRM is not set.
 */
static inline int secpath_exists(const struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        return skb_ext_exist(skb, SKB_EXT_SEC_PATH);
#else
        return 0;
#endif
}

/*
 * True when @skb has none of: a destructor, a secpath extension, a
 * conntrack entry, a dst reference, or a frag list.
 */
static inline bool skb_irq_freeable(const struct sk_buff *skb)
{
        return !(skb->destructor ||
                 secpath_exists(skb) ||
                 skb_nfct(skb) ||
                 skb->_skb_refdst ||
                 skb_has_frag_list(skb));
}

/* Store @queue_mapping in skb->queue_mapping unchanged. */
static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
        skb->queue_mapping = queue_mapping;
}

/* Return the raw value of skb->queue_mapping. */
static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
{
        return skb->queue_mapping;
}

/* Copy queue_mapping from @from to @to. */
static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
{
        to->queue_mapping = from->queue_mapping;
}

/*
 * Record the receive queue of @skb. The value is stored as @rx_queue + 1,
 * so a queue_mapping of 0 means "no rx queue recorded"
 * (see skb_rx_queue_recorded()).
 */
static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{
        skb->queue_mapping = rx_queue + 1;
}

/*
 * Return the receive queue stored by skb_record_rx_queue(), i.e.
 * queue_mapping - 1. No check is made here that a queue was recorded;
 * with queue_mapping == 0 the u16 result wraps around.
 */
static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
{
        return skb->queue_mapping - 1;
}

/* True when skb->queue_mapping is non-zero. */
static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
{
        return skb->queue_mapping != 0;
}

/* Store @val in skb->dst_pending_confirm. */
static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
{
        skb->dst_pending_confirm = val;
}

/* True when skb->dst_pending_confirm is non-zero. */
static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
{
        return skb->dst_pending_confirm != 0;
}

/*
 * Return @skb's SKB_EXT_SEC_PATH extension as located by skb_ext_find(),
 * which yields NULL when the extension is not active. Always returns NULL
 * when CONFIG_XFRM is not set.
 */
static inline struct sec_path *skb_sec_path(const struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        return skb_ext_find(skb, SKB_EXT_SEC_PATH);
#else
        return NULL;
#endif
}

/* True when the shared info of @skb has a non-zero gso_size. */
static inline bool skb_is_gso(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->gso_size != 0;
}

/*
 * True when gso_type has SKB_GSO_TCPV6 set.
 * Note: should only be called if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_v6(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) != 0;
}

/*
 * True when gso_type has SKB_GSO_SCTP set.
 * Note: should only be called if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) != 0;
}

/*
 * True when gso_type has SKB_GSO_TCPV4 or SKB_GSO_TCPV6 set.
 * Note: should only be called if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_tcp(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->gso_type &
                (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) != 0;
}

static inline void skb_gso_reset(struct sk_buff *skb)
{
        skb_shinfo(skb)->gso_size = 0;
        skb_shinfo(skb)->gso_segs = 0;
        skb_shinfo(skb)->gso_type = 0;
}

/*
 * Add @increment to shinfo->gso_size. A gso_size equal to GSO_BY_FRAGS is
 * left untouched and triggers a one-time warning instead.
 */
static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
                                         u16 increment)
{
        if (!WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                shinfo->gso_size += increment;
}

/*
 * Subtract @decrement from shinfo->gso_size. A gso_size equal to
 * GSO_BY_FRAGS is left untouched and triggers a one-time warning instead.
 */
static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
                                         u16 decrement)
{
        if (!WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                shinfo->gso_size -= decrement;
}

void __skb_warn_lro_forwarding(const struct sk_buff *skb);

/*
 * Detect an LRO-merged skb and warn about it.
 *
 * LRO sets gso_size but not gso_type, whereas gso_type is set when GSO is
 * really wanted. A non-linear skb with gso_size != 0 and gso_type == 0 is
 * therefore reported via __skb_warn_lro_forwarding().
 *
 * Returns true if the warning was issued, false otherwise.
 */
static inline bool skb_warn_if_lro(const struct sk_buff *skb)
{
        const struct skb_shared_info *si = skb_shinfo(skb);

        if (!skb_is_nonlinear(skb) || si->gso_size == 0 ||
            !unlikely(si->gso_type == 0))
                return false;

        __skb_warn_lro_forwarding(skb);
        return true;
}

/*
 * CHECKSUM_COMPLETE is not supported here, so downgrade it to
 * CHECKSUM_NONE; any other ip_summed value is left alone.
 */
static inline void skb_forward_csum(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->ip_summed = CHECKSUM_NONE;
}

/**
 * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE
 * @skb: skb to check
 *
 * Fresh skbs have their ip_summed set to CHECKSUM_NONE.
 * Instead of forcing ip_summed to CHECKSUM_NONE, this helper can be used
 * to document places where that assumption is made. The check itself is
 * done with DEBUG_NET_WARN_ON_ONCE(); @skb is never modified.
 */
static inline void skb_checksum_none_assert(const struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE);
}

/* Checksum helpers declared here; their definitions are not in this section. */
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);

int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
/* @skb_chkf is a caller-supplied callback that returns a __sum16 for an skb. */
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb));

/**
 * skb_head_is_locked - Determine if the skb->head is locked down
 * @skb: skb to check
 *
 * The head of an skb built around a head frag can be removed as long as
 * the skb is not cloned. Returns true if the head is locked down, which is
 * the case when it is not a head frag (e.g. allocated via kmalloc) or when
 * the skb is a clone with multiple references to the head.
 */
static inline bool skb_head_is_locked(const struct sk_buff *skb)
{
        return !(skb->head_frag && !skb_cloned(skb));
}

/* Local Checksum Offload.
 * Compute outer checksum based on the assumption that the
 * inner checksum will be offloaded later.
 * See Documentation/networking/checksum-offloads.rst for
 * explanation of how this works.
 * Fill in outer checksum adjustment (e.g. with sum of outer
 * pseudo-header) before calling.
 * Also ensure that inner checksum is in linear data area.
 */
static inline __wsum lco_csum(struct sk_buff *skb)
{
        unsigned char *csum_start = skb_checksum_start(skb);
        unsigned char *l4_hdr = skb_transport_header(skb);
        __wsum partial;

        /* Start with complement of inner checksum adjustment */
        partial = ~csum_unfold(*(__force __sum16 *)(csum_start +
                                                    skb->csum_offset));

        /* Add in checksum of our headers (incl. outer checksum
         * adjustment filled in by caller) and return result.
         */
        return csum_partial(l4_hdr, csum_start - l4_hdr, partial);
}

/* Return the value of skb->redirected. */
static inline bool skb_is_redirected(const struct sk_buff *skb)
{
        return skb->redirected;
}

/*
 * Mark @skb as redirected. With CONFIG_NET_REDIRECT, also record whether
 * it came from ingress and, if so, call skb_clear_tstamp() on it.
 */
static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
{
        skb->redirected = 1;
#ifdef CONFIG_NET_REDIRECT
        skb->from_ingress = from_ingress;
        if (from_ingress) {
                skb_clear_tstamp(skb);
        }
#endif
}

/* Clear skb->redirected; from_ingress is not touched. */
static inline void skb_reset_redirect(struct sk_buff *skb)
{
        skb->redirected = 0;
}

/*
 * Same as skb_set_redirected(), except that skb_clear_tstamp() is never
 * called: set skb->redirected and, with CONFIG_NET_REDIRECT, record
 * @from_ingress.
 */
static inline void skb_set_redirected_noclear(struct sk_buff *skb,
                                              bool from_ingress)
{
        skb->redirected = 1;
#ifdef CONFIG_NET_REDIRECT
        skb->from_ingress = from_ingress;
#endif
}

/*
 * Return skb->csum_not_inet when CONFIG_IP_SCTP is enabled; always false
 * otherwise.
 */
static inline bool skb_csum_is_sctp(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_SCTP)
        return skb->csum_not_inet;
#else
        return 0;
#endif
}

/*
 * Set skb->ip_summed to CHECKSUM_NONE and, when CONFIG_IP_SCTP is enabled,
 * also clear skb->csum_not_inet.
 */
static inline void skb_reset_csum_not_inet(struct sk_buff *skb)
{
        skb->ip_summed = CHECKSUM_NONE;
#if IS_ENABLED(CONFIG_IP_SCTP)
        skb->csum_not_inet = 0;
#endif
}

/* Store @kcov_handle in @skb when CONFIG_KCOV is set; otherwise a no-op. */
static inline void skb_set_kcov_handle(struct sk_buff *skb,
                                       const u64 kcov_handle)
{
#ifdef CONFIG_KCOV
        skb->kcov_handle = kcov_handle;
#endif
}

/* Return skb->kcov_handle when CONFIG_KCOV is set; otherwise 0. */
static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
{
#ifdef CONFIG_KCOV
        return skb->kcov_handle;
#else
        return 0;
#endif
}

/* Set skb->pp_recycle when CONFIG_PAGE_POOL is set; otherwise a no-op. */
static inline void skb_mark_for_recycle(struct sk_buff *skb)
{
#ifdef CONFIG_PAGE_POOL
        skb->pp_recycle = 1;
#endif
}

/*
 * Declared here, defined elsewhere.
 * NOTE(review): the meaning of the ssize_t result and of @maxsize cannot
 * be determined from this declaration; see the definition.
 */
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
                             ssize_t maxsize);

#endif        /* __KERNEL__ */
#endif        /* _LINUX_SKBUFF_H */



























  318 














































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_64_H
#define _ASM_X86_PAGE_64_H

#include <asm/page_64_types.h>

#ifndef __ASSEMBLER__
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

#include <linux/kmsan-checks.h>

/* duplicate of the declaration in bootmem.h */
extern unsigned long max_pfn;
extern unsigned long phys_base;

extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
extern unsigned long direct_map_physmem_end;

/*
 * Translate kernel virtual address @x to a physical address without any
 * sanity checking.
 *
 * Addresses at or above __START_KERNEL_map are rebased by phys_base; all
 * others are treated as PAGE_OFFSET-relative.
 */
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
        const unsigned long delta = x - __START_KERNEL_map;

        /*
         * The subtraction wraps exactly when x < __START_KERNEL_map, so
         * "x > delta" is true only for addresses inside the kernel mapping
         * (this lets the compiler use the carry flag).
         */
        return delta + ((x > delta) ? phys_base
                                    : (__START_KERNEL_map - PAGE_OFFSET));
}

/*
 * With CONFIG_DEBUG_VIRTUAL, __phys_addr() and __phys_addr_symbol() are
 * out-of-line functions defined elsewhere. Otherwise they are macros:
 * __phys_addr() expands to __phys_addr_nodebug() and __phys_addr_symbol()
 * computes x - __START_KERNEL_map + phys_base directly.
 */
#ifdef CONFIG_DEBUG_VIRTUAL
extern unsigned long __phys_addr(unsigned long);
extern unsigned long __phys_addr_symbol(unsigned long);
#else
#define __phys_addr(x)                __phys_addr_nodebug(x)
#define __phys_addr_symbol(x) \
        ((unsigned long)(x) - __START_KERNEL_map + phys_base)
#endif

/* Expands to its argument unchanged. */
#define __phys_reloc_hide(x)        (x)

/*
 * Out-of-line page clearing implementations; clear_page() below passes
 * all three to alternative_call_2().
 * NOTE(review): KCFI_REFERENCE() presumably keeps kCFI type information
 * for symbols that are only reached through alternatives - confirm
 * against its definition.
 */
void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);
KCFI_REFERENCE(clear_page_orig);
KCFI_REFERENCE(clear_page_rep);
KCFI_REFERENCE(clear_page_erms);

/*
 * Clear one page starting at @page.
 *
 * The work is delegated through alternative_call_2(): clear_page_orig is
 * the default, with clear_page_rep tied to X86_FEATURE_REP_GOOD and
 * clear_page_erms tied to X86_FEATURE_ERMS. @page is passed in and
 * returned through the "D" register constraint; "rax", "rcx", the flags
 * and memory are declared as clobbered.
 */
static inline void clear_page(void *page)
{
        /*
         * Clean up KMSAN metadata for the page being cleared. The assembly call
         * below clobbers @page, so we perform unpoisoning before it.
         */
        kmsan_unpoison_memory(page, PAGE_SIZE);
        alternative_call_2(clear_page_orig,
                           clear_page_rep, X86_FEATURE_REP_GOOD,
                           clear_page_erms, X86_FEATURE_ERMS,
                           "=D" (page),
                           "D" (page),
                           "cc", "memory", "rax", "rcx");
}

/* Declared here, defined elsewhere; copies from @from to @to. */
void copy_page(void *to, void *from);
KCFI_REFERENCE(copy_page);

/*
 * User space process size.  This is the first address outside the user range.
 * There are a few constraints that determine this:
 *
 * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
 * address, then that syscall will enter the kernel with a
 * non-canonical return address, and SYSRET will explode dangerously.
 * We avoid this particular problem by preventing anything
 * from being mapped at the maximum canonical address.
 *
 * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
 * CPUs malfunction if they execute code from the highest canonical page.
 * They'll speculate right off the end of the canonical space, and
 * bad things happen.  This is worked around in the same way as the
 * Intel problem.
 *
 * With page table isolation enabled, we map the LDT in ... [stay tuned]
 *
 * The value is loaded by a single movq whose immediate is chosen through
 * alternative_io() on X86_FEATURE_LA57: (1ul << 47) - PAGE_SIZE is the
 * default operand and (1ul << 56) - PAGE_SIZE the LA57 one.
 */
static __always_inline unsigned long task_size_max(void)
{
        unsigned long ret;

        alternative_io("movq %[small],%0","movq %[large],%0",
                        X86_FEATURE_LA57,
                        "=r" (ret),
                        [small] "i" ((1ul << 47)-PAGE_SIZE),
                        [large] "i" ((1ul << 56)-PAGE_SIZE));

        return ret;
}

#endif        /* !__ASSEMBLER__ */

#ifdef CONFIG_X86_VSYSCALL_EMULATION
# define __HAVE_ARCH_GATE_AREA 1
#endif

#endif /* _ASM_X86_PAGE_64_H */
























   68 
































  319 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM skb

#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SKB_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>

/*
 * First pass over the drop reason list: FN() emits one
 * TRACE_DEFINE_ENUM(SKB_DROP_REASON_<reason>) per entry.
 */
#undef FN
#define FN(reason)        TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
DEFINE_DROP_REASON(FN, FN)

/*
 * Second definition, used by the kfree_skb TP_printk() below: each entry
 * becomes a { value, "name" } pair. FNe() is the variant without the
 * trailing comma.
 */
#undef FN
#undef FNe
#define FN(reason)        { SKB_DROP_REASON_##reason, #reason },
#define FNe(reason)        { SKB_DROP_REASON_##reason, #reason }

/*
 * Tracepoint for freeing an sk_buff.
 *
 * Records the skb address, the @location and @rx_sk pointers passed by the
 * caller, skb->protocol converted with ntohs(), and the drop reason, which
 * is printed symbolically using the FN()/FNe() table defined above.
 */
TRACE_EVENT(kfree_skb,

        TP_PROTO(struct sk_buff *skb, void *location,
                 enum skb_drop_reason reason, struct sock *rx_sk),

        TP_ARGS(skb, location, reason, rx_sk),

        TP_STRUCT__entry(
                __field(void *,                skbaddr)
                __field(void *,                location)
                __field(void *,                rx_sk)
                __field(unsigned short,        protocol)
                __field(enum skb_drop_reason,        reason)
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
                __entry->rx_sk = rx_sk;
                __entry->protocol = ntohs(skb->protocol);
                __entry->reason = reason;
        ),

        TP_printk("skbaddr=%p rx_sk=%p protocol=%u location=%pS reason: %s",
                  __entry->skbaddr, __entry->rx_sk, __entry->protocol,
                  __entry->location,
                  __print_symbolic(__entry->reason,
                                   DEFINE_DROP_REASON(FN, FNe)))
);

#undef FN
#undef FNe

/*
 * Tracepoint consume_skb: records the skb address and the @location
 * pointer passed by the caller (printed with %pS).
 */
TRACE_EVENT(consume_skb,

        TP_PROTO(struct sk_buff *skb, void *location),

        TP_ARGS(skb, location),

        TP_STRUCT__entry(
                __field(        void *,        skbaddr)
                __field(        void *,        location)
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
        ),

        TP_printk("skbaddr=%p location=%pS", __entry->skbaddr, __entry->location)
);

/*
 * Tracepoint skb_copy_datagram_iovec: records the skb address and the
 * @len value passed by the caller.
 */
TRACE_EVENT(skb_copy_datagram_iovec,

        TP_PROTO(const struct sk_buff *skb, int len),

        TP_ARGS(skb, len),

        TP_STRUCT__entry(
                __field(        const void *,                skbaddr                )
                __field(        int,                        len                )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = len;
        ),

        TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len)
);

#endif /* _TRACE_SKB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   38 








   39 















   39 


   39 



   39 








































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2022 Linutronix GmbH, John Ogness
// Copyright (C) 2022 Intel, Thomas Gleixner

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/console.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/irqflags.h>
#include <linux/kthread.h>
#include <linux/minmax.h>
#include <linux/panic.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
#include "internal.h"
#include "printk_ringbuffer.h"
/*
 * Printk console printing implementation for consoles which do not depend
 * on the legacy style console_lock mechanism.
 *
 * The state of the console is maintained in the "nbcon_state" atomic
 * variable.
 *
 * The console is locked when:
 *
 *   - The 'prio' field contains the priority of the context that owns the
 *     console. Only higher priority contexts are allowed to take over the
 *     lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
 *
 *   - The 'cpu' field denotes on which CPU the console is locked. It is used
 *     to prevent busy waiting on the same CPU. Also it informs the lock owner
 *     that it has lost the lock in a more complex scenario when the lock was
 *     taken over by a higher priority context, released, and taken on another
 *     CPU with the same priority as the interrupted owner.
 *
 * The acquire mechanism uses a few more fields:
 *
 *   - The 'req_prio' field is used by the handover approach to make the
 *     current owner aware that there is a context with a higher priority
 *     waiting for the friendly handover.
 *
 *   - The 'unsafe' field allows to take over the console in a safe way in the
 *     middle of emitting a message. The field is set only when accessing some
 *     shared resources or when the console device is manipulated. It can be
 *     cleared, for example, after emitting one character when the console
 *     device is in a consistent state.
 *
 *   - The 'unsafe_takeover' field is set when a hostile takeover took the
 *     console in an unsafe state. The console will stay in the unsafe state
 *     until re-initialized.
 *
 * The acquire mechanism uses three approaches:
 *
 *   1) Direct acquire when the console is not owned or is owned by a lower
 *      priority context and is in a safe state.
 *
 *   2) Friendly handover mechanism uses a request/grant handshake. It is used
 *      when the current owner has lower priority and the console is in an
 *      unsafe state.
 *
 *      The requesting context:
 *
 *        a) Sets its priority into the 'req_prio' field.
 *
 *        b) Waits (with a timeout) for the owning context to unlock the
 *           console.
 *
 *        c) Takes the lock and clears the 'req_prio' field.
 *
 *      The owning context:
 *
 *        a) Observes the 'req_prio' field set on exit from the unsafe
 *           console state.
 *
 *        b) Gives up console ownership by clearing the 'prio' field.
 *
 *   3) Unsafe hostile takeover allows to take over the lock even when the
 *      console is in an unsafe state. It is used only in panic() by the final
 *      attempt to flush consoles in a try and hope mode.
 *
 *      Note that separate record buffers are used in panic(). As a result,
 *      the messages can be read and formatted without any risk even after
 *      using the hostile takeover in unsafe state.
 *
 * The release function simply clears the 'prio' field.
 *
 * All operations on @console::nbcon_state are atomic cmpxchg based to
 * handle concurrency.
 *
 * The acquire/release functions implement only minimal policies:
 *
 *   - Preference for higher priority contexts.
 *   - Protection of the panic CPU.
 *
 * All other policy decisions must be made at the call sites:
 *
 *   - What is marked as an unsafe section.
 *   - Whether to spin-wait if there is already an owner and the console is
 *     in an unsafe state.
 *   - Whether to attempt an unsafe hostile takeover.
 *
 * The design allows to implement the well known:
 *
 *     acquire()
 *     output_one_printk_record()
 *     release()
 *
 * The output of one printk record might be interrupted with a higher priority
 * context. The new owner is supposed to reprint the entire interrupted record
 * from scratch.
 */

/**
 * nbcon_state_set - Unconditionally store a new console state
 * @con:        Console whose state is overwritten
 * @new:        State value to store
 *
 * This is a plain atomic store without any compare-and-exchange. It is
 * meant only for consoles that are not yet, or no longer, visible in the
 * system. In all other situations use nbcon_state_try_cmpxchg().
 */
static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
{
        atomic_set(&ACCESS_PRIVATE(con, nbcon_state),
                   new->atom);
}

/**
 * nbcon_state_read - Take a snapshot of the console state
 * @con:        Console whose state is read
 * @state:        Where the snapshot is stored
 */
static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
{
        int snapshot;

        snapshot = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
        state->atom = snapshot;
}

/**
 * nbcon_state_try_cmpxchg() - atomic_try_cmpxchg() wrapper for the console state
 * @con:        Console whose state is updated
 * @cur:        State the caller expects to find in @con
 * @new:        State to install when the expectation holds
 *
 * Return: True when @new was installed. False when the console state
 *         differed from @cur; in that case @cur is refreshed with the
 *         state that was actually found.
 */
static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
                                           struct nbcon_state *new)
{
        bool swapped;

        swapped = atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state),
                                     &cur->atom, new->atom);
        return swapped;
}

/**
 * nbcon_seq_read - Fetch the sequence number a console will print next
 * @con:        Console to query
 *
 * The console stores the sequence as an unsigned long. It is expanded to
 * the full 64bit value with the help of the printk ringbuffer.
 *
 * Return:        Sequence number of the next record to print on @con.
 */
u64 nbcon_seq_read(struct console *con)
{
        unsigned long nbcon_seq;

        nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));

        return __ulseq_to_u64seq(prb, nbcon_seq);
}

/**
 * nbcon_seq_force - Unconditionally set the console sequence number
 * @con:        Console to work on
 * @seq:        Requested sequence number
 *
 * Intended only for init (before the console is registered) or for
 * extreme situations such as panic with CONSOLE_REPLAY_ALL.
 */
void nbcon_seq_force(struct console *con, u64 seq)
{
        u64 oldest_seq = prb_first_valid_seq(prb);
        u64 valid_seq = seq;

        /*
         * Never point the console at a record that no longer exists; fall
         * back to the oldest record still available instead. This matters
         * most on 32bit systems, where only the lower 32 bits of the
         * sequence number are stored and the upper 32 bits are derived
         * from the sequence numbers available in the ringbuffer.
         */
        if (valid_seq < oldest_seq)
                valid_seq = oldest_seq;

        atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq));
}

/**
 * nbcon_seq_try_update - Attempt to advance the console sequence number
 * @ctxt:        Pointer to an acquire context that contains
 *                all information about the acquire mode
 * @new_seq:        Sequence number to advance to
 *
 * On return, @ctxt->seq holds the current value of @con::nbcon_seq
 * (expanded to 64bit). That value may differ from @new_seq when
 * nbcon_seq_force() was used in the meantime or when this context no
 * longer owns the console. In the latter case, it will stop printing
 * anyway.
 */
static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
{
        struct console *con = ctxt->console;
        unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq);

        if (!atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
                                     __u64seq_to_ulseq(new_seq))) {
                /* Somebody else changed the sequence. Adopt their value. */
                ctxt->seq = nbcon_seq_read(con);
                return;
        }

        ctxt->seq = new_seq;
}

/**
 * nbcon_context_try_acquire_direct - Try to take the console right away
 * @ctxt:                The context of the caller
 * @cur:                The current console state
 * @is_reacquire:        This acquire is a reacquire
 *
 * The console is taken when nobody owns it, or when the owner has a lower
 * priority than the caller and the console is in a safe state.
 *
 * Return:        0 on success. Otherwise, an error code on failure. When the
 *                state could not be modified, @cur holds the latest state.
 *
 * Errors:
 *
 *        -EPERM:                A panic is in progress and this is neither the panic
 *                        CPU nor is this a reacquire. Or the current owner or
 *                        waiter has the same or higher priority. No acquire
 *                        method can be successful in these cases.
 *
 *        -EBUSY:                The current owner has a lower priority but the console
 *                        is in an unsafe state. The caller should try using
 *                        the handover acquire method.
 */
static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
                                            struct nbcon_state *cur, bool is_reacquire)
{
        unsigned int cpu = smp_processor_id();
        struct console *con = ctxt->console;
        struct nbcon_state new;

        for (;;) {
                /*
                 * A panic does not mean that the console has an owner.
                 * Still, the non-panic CPUs are stopped during panic(),
                 * so it is safer when they do not gain ownership.
                 *
                 * The exception is a reacquire on a console that has not
                 * seen an unsafe takeover: the direct acquire may be
                 * attempted so that a driver interrupted by the panic CPU
                 * while printing gets a chance to clean up.
                 */
                if (panic_on_other_cpu()) {
                        if (!is_reacquire || cur->unsafe_takeover)
                                return -EPERM;
                }

                /* Owner or waiter with the same or higher priority wins. */
                if (cur->prio >= ctxt->prio || cur->req_prio >= ctxt->prio)
                        return -EPERM;

                if (cur->unsafe)
                        return -EBUSY;

                /*
                 * Once an unsafe hostile takeover has happened, the
                 * console must never look safe for a direct acquire.
                 */
                WARN_ON_ONCE(cur->unsafe_takeover);

                new.atom        = cur->atom;
                new.cpu         = cpu;
                new.prio        = ctxt->prio;
                new.req_prio    = NBCON_PRIO_NONE;
                new.unsafe      = cur->unsafe_takeover;

                /* On failure @cur was refreshed; re-run all checks. */
                if (nbcon_state_try_cmpxchg(con, cur, &new))
                        break;
        }

        return 0;
}

static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
{
        /*
         * Comparing @req_prio is enough to identify the requesting
         * context, for these reasons:
         *
         * - A context can become a waiter only if its priority is higher
         *   than the owner's.
         * - A request can be taken over directly only by a context whose
         *   priority is higher than the waiter's.
         * - Only three priorities exist.
         * - PANIC priority can be requested by a single CPU only.
         * - During panic(), lower priorities are ignored until reboot.
         *
         * Consequently this sequence can *not* happen:
         *
         * 1. This context is currently a waiter.
         * 2. A context with a higher priority than this one directly
         *    takes ownership.
         * 3. That higher priority context releases the ownership.
         * 4. A lower priority context takes the ownership.
         * 5. A context with the same priority as this one creates
         *    a request and starts waiting.
         *
         * Event #1 implies this context is EMERGENCY.
         * Event #2 implies the new context is PANIC.
         * Event #3 occurs when panic() has flushed the console.
         * Event #4 occurs when a non-panic CPU reacquires.
         * Event #5 is not possible due to the panic_on_other_cpu() check
         *          in nbcon_context_try_acquire_handover().
         */

        return cur->req_prio == expected_prio;
}

/**
 * nbcon_context_try_acquire_requested - Try to take the console after a
 *                                         handover request was placed
 * @ctxt:        The context of the caller
 * @cur:        The current console state
 *
 * Helper for nbcon_context_try_acquire_handover(). It is used while the
 * console is in an unsafe state; the current owner is expected to release
 * the console when it leaves the unsafe region.
 *
 * Return:        0 on success and @cur is updated to the new console state.
 *                Otherwise an error code on failure.
 *
 * Errors:
 *
 *        -EPERM:                A panic is in progress and this is not the panic CPU
 *                        or this context is no longer the waiter.
 *
 *        -EBUSY:                The console is still locked. The caller should
 *                        continue waiting.
 *
 * Note: On error the caller is still responsible for removing the request,
 *       unless this context is no longer the waiter.
 */
static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
                                               struct nbcon_state *cur)
{
        unsigned int cpu = smp_processor_id();
        struct console *con = ctxt->console;
        struct nbcon_state new;

        /*
         * Give up during a panic on another CPU (the caller must still
         * remove the request!) and when somebody else became the waiter.
         * The waiter also changes after an unsafe hostile takeover.
         */
        if (panic_on_other_cpu() || !nbcon_waiter_matches(cur, ctxt->prio))
                return -EPERM;

        /* Somebody still owns the console. Keep waiting. */
        if (cur->prio != NBCON_PRIO_NONE)
                return -EBUSY;

        /* Ownership must never be released inside an unsafe region. */
        WARN_ON_ONCE(cur->unsafe);

        new.atom        = cur->atom;
        new.cpu         = cpu;
        new.prio        = ctxt->prio;
        new.req_prio    = NBCON_PRIO_NONE;
        new.unsafe      = cur->unsafe_takeover;

        /* Handover success. This context now owns the console. */
        if (nbcon_state_try_cmpxchg(con, cur, &new))
                return 0;

        /*
         * The only legitimate reason for a failure is that a higher
         * priority context has taken over.
         */
        WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
        return -EPERM;
}

/**
 * nbcon_context_try_acquire_handover - Try to acquire via handover
 * @ctxt:        The context of the caller
 * @cur:        The current console state
 *
 * The function must be called only when the context has higher priority
 * than the current owner and the console is in an unsafe state.
 * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
 *
 * The function sets "req_prio" field to make the current owner aware of
 * the request. Then it waits until the current owner releases the console,
 * or an even higher context takes over the request, or timeout expires.
 *
 * The current owner checks the "req_prio" field on exit from the unsafe
 * region and releases the console. It does not touch the "req_prio" field
 * so that the console stays reserved for the waiter.
 *
 * Return:        0 on success. Otherwise, an error code on failure. Also @cur
 *                is updated to the latest state when failed to modify it.
 *
 * Errors:
 *
 *        -EPERM:                A panic is in progress and this is not the panic CPU.
 *                        Or a higher priority context has taken over the
 *                        console or the handover request.
 *
 *        -EBUSY:                The current owner is on the same CPU so that the hand
 *                        shake could not work. Or an unsafe hostile takeover
 *                        has happened on this console. Or the caller is not
 *                        willing to wait (zero @ctxt->spinwait_max_us). Or the
 *                        console does not enter the safe state before timeout
 *                        passed. The caller might still use the unsafe hostile
 *                        takeover when allowed.
 *
 *        -EAGAIN:        @cur has changed when creating the handover request.
 *                        The caller should retry with direct acquire.
 */
static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
                                              struct nbcon_state *cur)
{
        unsigned int cpu = smp_processor_id();
        struct console *con = ctxt->console;
        struct nbcon_state new;
        int timeout;
        /*
         * Result of the most recent acquire attempt in the wait loop. It
         * stays -EBUSY when no attempt is made.
         */
        int request_err = -EBUSY;

        /*
         * Check that the handover is called when the direct acquire failed
         * with -EBUSY.
         */
        WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
        WARN_ON_ONCE(!cur->unsafe);

        /*
         * Panic does not imply that the console is owned. However, it
         * is critical that non-panic CPUs during panic are unable to
         * wait for a handover in order to satisfy the assumptions of
         * nbcon_waiter_matches(). In particular, the assumption that
         * lower priorities are ignored during panic.
         */
        if (panic_on_other_cpu())
                return -EPERM;

        /* Handover is not possible on the same CPU. */
        if (cur->cpu == cpu)
                return -EBUSY;

        /*
         * Console stays unsafe after an unsafe takeover until re-initialized.
         * Waiting is not going to help in this case.
         */
        if (cur->unsafe_takeover)
                return -EBUSY;

        /* Is the caller willing to wait? */
        if (ctxt->spinwait_max_us == 0)
                return -EBUSY;

        /*
         * Setup a request for the handover. The caller should try to acquire
         * the console directly when the current state has been modified.
         */
        new.atom = cur->atom;
        new.req_prio = ctxt->prio;
        if (!nbcon_state_try_cmpxchg(con, cur, &new))
                return -EAGAIN;

        /* The request is in place. Sync @cur with the state just written. */
        cur->atom = new.atom;

        /*
         * Wait until there is no owner and then acquire the console.
         * One acquire attempt is made per iteration, with a delay of
         * one microsecond between attempts.
         */
        for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
                /* On successful acquire, this request is cleared. */
                request_err = nbcon_context_try_acquire_requested(ctxt, cur);
                if (!request_err)
                        return 0;

                /*
                 * If the acquire should be aborted, it must be ensured
                 * that the request is removed before returning to caller.
                 */
                if (request_err == -EPERM)
                        break;

                udelay(1);

                /* Re-read the state because some time has passed. */
                nbcon_state_read(con, cur);
        }

        /* Timed out or aborted. Carefully remove handover request. */
        do {
                /*
                 * No need to remove request if there is a new waiter. This
                 * can only happen if a higher priority context has taken over
                 * the console or the handover request.
                 */
                if (!nbcon_waiter_matches(cur, ctxt->prio))
                        return -EPERM;

                /* Unset request for handover. */
                new.atom = cur->atom;
                new.req_prio = NBCON_PRIO_NONE;
                if (nbcon_state_try_cmpxchg(con, cur, &new)) {
                        /*
                         * Request successfully unset. Report failure of
                         * acquiring via handover.
                         */
                        cur->atom = new.atom;
                        return request_err;
                }

                /*
                 * Unable to remove request. Try to acquire in case
                 * the owner has released the lock.
                 *
                 * Any non-zero result (-EBUSY or -EPERM) loops back to
                 * another removal attempt using the refreshed @cur.
                 */
        } while (nbcon_context_try_acquire_requested(ctxt, cur));

        /* Lucky timing. The acquire succeeded while removing the request. */
        return 0;
}

/**
 * nbcon_context_try_acquire_hostile - Take the console by an unsafe
 *                                       hostile takeover
 * @ctxt:        The context of the caller
 * @cur:        The current console state
 *
 * The console is taken even when it is in the unsafe state.
 *
 * The takeover has to be permitted via the 'allow_unsafe_takeover' field,
 * which is meant to be set only by the final attempt to flush messages
 * in panic().
 *
 * Return:        0 on success. -EPERM when not allowed by the context.
 */
static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
                                             struct nbcon_state *cur)
{
        unsigned int cpu = smp_processor_id();
        struct console *con = ctxt->console;
        struct nbcon_state new;

        /*
         * The context must explicitly allow the takeover. On top of that,
         * only a PANIC priority context is expected to ever ask for it.
         */
        if (!ctxt->allow_unsafe_takeover ||
            WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
                return -EPERM;

        /*
         * Sanity check that try_acquire_direct() and try_acquire_handover()
         * reported -EBUSY in the right situation.
         */
        WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
        WARN_ON_ONCE(cur->unsafe != true);

        for (;;) {
                new.atom = cur->atom;
                new.prio                = ctxt->prio;
                new.cpu                        = cpu;
                new.unsafe                |= cur->unsafe_takeover;
                new.unsafe_takeover        |= cur->unsafe;

                /* On failure @cur was refreshed; rebuild the new state. */
                if (nbcon_state_try_cmpxchg(con, cur, &new))
                        break;
        }

        return 0;
}

/*
 * Printk buffers assigned to an acquiring context by
 * nbcon_context_try_acquire() when panic_on_this_cpu() is true, instead
 * of the console's own @pbufs.
 */
static struct printk_buffers panic_nbcon_pbufs;

/**
 * nbcon_context_try_acquire - Try to acquire nbcon console
 * @ctxt:                The context of the caller
 * @is_reacquire:        This acquire is a reacquire
 *
 * The acquire methods are tried in this order: direct acquire, handover,
 * unsafe hostile takeover. A later method is tried only when the previous
 * one failed with -EBUSY.
 *
 * Context:        Under @ctxt->con->device_lock() or local_irq_save().
 * Return:        True if the console was acquired. False otherwise.
 *
 * When the caller allowed an unsafe hostile takeover, it should check the
 * current console state after a successful acquire to learn whether the
 * console is in an unsafe state. Without that permission, a successful
 * acquire means the console is not in an unsafe state.
 */
static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
{
        struct console *con = ctxt->console;
        struct nbcon_state cur;
        int err;

        nbcon_state_read(con, &cur);

        /* Start over with the direct acquire whenever handover says -EAGAIN. */
        do {
                err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire);
                if (err != -EBUSY)
                        break;

                err = nbcon_context_try_acquire_handover(ctxt, &cur);
        } while (err == -EAGAIN);

        /* Last resort. Reached only when the handover reported -EBUSY. */
        if (err == -EBUSY)
                err = nbcon_context_try_acquire_hostile(ctxt, &cur);

        if (err)
                return false;

        /*
         * The console is owned now. Pick the buffers matching this
         * context: the panic CPU has its own set.
         */
        ctxt->pbufs = panic_on_this_cpu() ? &panic_nbcon_pbufs : con->pbufs;

        /* Start printing at the console's current record sequence. */
        ctxt->seq = nbcon_seq_read(ctxt->console);

        return true;
}

static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
                                int expected_prio)
{
        /*
         * The related nbcon_waiter_matches() has to handle the EMERGENCY
         * and PANIC priorities only. Here the NORMAL priority must be
         * handled as well, which brings additional checks and
         * constraints.
         *
         * With preemption and interrupts disabled, additionally checking
         * that the owning CPU is unchanged is sufficient.
         *
         * With preemption or interrupts enabled, an external
         * synchronization method *must* be used. In particular, the
         * driver-specific locking mechanism used in device_lock()
         * (including disabling migration) should be used. It rules out
         * scenarios such as:
         *
         * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and
         *    is scheduled out.
         * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY
         *    and releases it.
         * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X]
         *    and is scheduled out.
         * 4. [Task A] gets running on [CPU X] and sees that the console is
         *    still owned by a task on [CPU X] with NBCON_PRIO_NORMAL. Thus
         *    [Task A] thinks it is the owner when it is not.
         */

        return cur->prio == expected_prio && cur->cpu == expected_cpu;
}

/**
 * nbcon_context_release - Give up ownership of the console
 * @ctxt:        The nbcon context from nbcon_context_try_acquire()
 *
 * The console state is modified only while this context is still the
 * owner. The printk buffers are detached from @ctxt in any case.
 */
static void nbcon_context_release(struct nbcon_context *ctxt)
{
        unsigned int cpu = smp_processor_id();
        struct console *con = ctxt->console;
        struct nbcon_state cur;
        struct nbcon_state new;

        nbcon_state_read(con, &cur);

        /* Nothing to release once somebody else owns the console. */
        while (nbcon_owner_matches(&cur, cpu, ctxt->prio)) {
                new.atom = cur.atom;
                new.prio = NBCON_PRIO_NONE;

                /*
                 * Keep the state permanently unsafe when
                 * @unsafe_takeover is set.
                 */
                new.unsafe |= cur.unsafe_takeover;

                /* On failure @cur was refreshed; check the owner again. */
                if (nbcon_state_try_cmpxchg(con, &cur, &new))
                        break;
        }

        ctxt->pbufs = NULL;
}

/**
 * nbcon_context_can_proceed - Check whether ownership can proceed
 * @ctxt:        The nbcon context from nbcon_context_try_acquire()
 * @cur:        The current console state
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * It must be called when entering the unsafe state, to verify that the
 * lock is still owned, and when exiting the unsafe state, to eventually
 * free the lock for a higher priority context that asked for the friendly
 * handover.
 *
 * It may also be called inside an unsafe section when the console is
 * just temporarily in a safe state, instead of exiting and entering the
 * unsafe state.
 *
 * Finally, it may be called in the safe context before an expensive safe
 * operation. There is no point in doing the operation when a higher
 * priority context took the lock.
 *
 * A false return value means that the calling context no longer owns the
 * console and is no longer allowed to go forward. It must back out
 * immediately and carefully. The buffer content is no longer trusted
 * either since it no longer belongs to the calling context.
 */
static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
{
        unsigned int cpu = smp_processor_id();

        /* Ownership has already been lost. */
        if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
                return false;

        /*
         * The owner goes on when nobody is waiting. Inside an unsafe
         * region it goes on even with a waiter: the handover can be done
         * when the region is left. Otherwise the waiter would have to
         * perform an unsafe hostile takeover.
         */
        if (cur->req_prio == NBCON_PRIO_NONE || cur->unsafe)
                return true;

        /* A waiter always has a higher priority than the owner. */
        WARN_ON_ONCE(cur->req_prio <= cur->prio);

        /*
         * Release and hand over. A safe takeover point, at the price of
         * eventually a few duplicated characters or a full line, is way
         * better than a hostile takeover. Post processing can take care
         * of the garbage.
         */
        nbcon_context_release(ctxt);

        /*
         * Whether the waiter really took over is not known here. The
         * outermost callsite has to decide whether it needs the console
         * to proceed. If so, it must reacquire ownership (possibly
         * hostile) before carefully proceeding.
         *
         * This context no longer owns the console, so go back all the
         * way rather than implementing reacquire heuristics in tons of
         * places.
         */
        return false;
}

/**
 * nbcon_can_proceed - Check whether ownership can proceed
 * @wctxt:        The write context that was handed to the write function
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * nbcon_enter_unsafe() relies on this check to verify that the lock is
 * still owned. nbcon_exit_unsafe() relies on it to eventually free the
 * lock for a higher priority context that asked for the friendly handover.
 *
 * It may also be called inside an unsafe section when the console is
 * just temporarily in a safe state, instead of exiting and entering the
 * unsafe state.
 *
 * Finally, it may be called in the safe context before an expensive safe
 * operation. There is no point in doing the operation when a higher
 * priority context took the lock.
 *
 * A false return value means that the calling context no longer owns the
 * console and is no longer allowed to go forward. It must back out
 * immediately and carefully. The buffer content is no longer trusted
 * either since it no longer belongs to the calling context.
 */
bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        struct nbcon_state cur;

        /* Evaluate a fresh snapshot of the console state. */
        nbcon_state_read(ctxt->console, &cur);

        return nbcon_context_can_proceed(ctxt, &cur);
}
EXPORT_SYMBOL_GPL(nbcon_can_proceed);

/*
 * Convenience wrappers around __nbcon_context_update_unsafe() which set
 * (enter) or clear (exit) the unsafe bit on behalf of context @c.
 */
#define nbcon_context_enter_unsafe(c)        __nbcon_context_update_unsafe(c, true)
#define nbcon_context_exit_unsafe(c)        __nbcon_context_update_unsafe(c, false)

/**
 * __nbcon_context_update_unsafe - Change the unsafe bit in @con->nbcon_state
 * @ctxt:        The nbcon context from nbcon_context_try_acquire()
 * @unsafe:        The new value for the unsafe bit
 *
 * Return:        True if the unsafe state was updated and this context still
 *                owns the console. Otherwise false if ownership was handed
 *                over or taken.
 *
 * Console owners use this function to modify the unsafe status of the
 * console.
 *
 * A false return value means that the calling context no longer owns the
 * console and is no longer allowed to go forward. It must back out
 * immediately and carefully. The buffer content is no longer trusted
 * either since it no longer belongs to the calling context.
 *
 * Internal helper shared by the enter/exit variants to avoid duplicated
 * code.
 */
static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
{
        struct console *con = ctxt->console;
        struct nbcon_state cur;
        struct nbcon_state new;

        nbcon_state_read(con, &cur);

        for (;;) {
                /*
                 * After an unsafe hostile takeover the unsafe bit must
                 * stay set. Skip the update and only do the final check.
                 */
                if (!unsafe && cur.unsafe_takeover)
                        break;

                if (!nbcon_context_can_proceed(ctxt, &cur))
                        return false;

                new.atom = cur.atom;
                new.unsafe = unsafe;

                if (nbcon_state_try_cmpxchg(con, &cur, &new)) {
                        /* Continue with the state that was just written. */
                        cur.atom = new.atom;
                        break;
                }
        }

        /*
         * Check once more using the resulting state. With the unsafe bit
         * cleared, this is where the console is released for a waiter.
         */
        return nbcon_context_can_proceed(ctxt, &cur);
}

/*
 * Assign the output buffer and its length to a write context. The
 * unsafe_takeover flag of the write context is refreshed from the current
 * console state at the same time.
 */
static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
                                        char *buf, unsigned int len)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        struct nbcon_state cur;

        nbcon_state_read(ctxt->console, &cur);

        wctxt->outbuf = buf;
        wctxt->len = len;
        wctxt->unsafe_takeover = cur.unsafe_takeover;
}

/**
 * nbcon_enter_unsafe - Mark the start of an unsafe region in the driver
 * @wctxt:        The write context that was handed to the write function
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * A false return value means that the calling context no longer owns the
 * console and is no longer allowed to go forward. It must back out
 * immediately and carefully. The buffer content is no longer trusted
 * either since it no longer belongs to the calling context; the output
 * buffer of @wctxt is reset accordingly.
 */
bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);

        if (nbcon_context_enter_unsafe(ctxt))
                return true;

        /* Ownership is gone and so is the right to use the buffer. */
        nbcon_write_context_set_buf(wctxt, NULL, 0);
        return false;
}
EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);

/**
 * nbcon_exit_unsafe - Mark the end of an unsafe region in the driver
 * @wctxt:        The write context that was handed to the write function
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * A false return value means that the calling context no longer owns the
 * console and is no longer allowed to go forward. It must back out
 * immediately and carefully. The buffer content is no longer trusted
 * either since it no longer belongs to the calling context; the output
 * buffer of @wctxt is reset accordingly.
 */
bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);

        if (nbcon_context_exit_unsafe(ctxt))
                return true;

        /* Ownership is gone and so is the right to use the buffer. */
        nbcon_write_context_set_buf(wctxt, NULL, 0);
        return false;
}
EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);

/**
 * nbcon_reacquire_nobuf - Get the console back after ownership was lost
 *                                while printing
 * @wctxt:        The write context that was handed to the write callback
 *
 * Ownership can be lost at any time due to handover or takeover, so a
 * printing context _must_ be prepared to back out immediately and
 * carefully. There are, however, scenarios where the printing context
 * has to own the console again in order to finalize or revert hardware
 * changes.
 *
 * This function spins until ownership is reacquired, using the same
 * priority as the previous ownership.
 *
 * Note that the printing context has no output buffer after a successful
 * reacquire because that has been lost. This function cannot be used to
 * resume printing.
 */
void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);

        for (;;) {
                if (nbcon_context_try_acquire(ctxt, true))
                        break;

                cpu_relax();
        }

        /* The previous buffer content is lost for good. */
        nbcon_write_context_set_buf(wctxt, NULL, 0);
}
EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf);

/**
 * nbcon_emit_next_record - Emit a record in the acquired context
 * @wctxt:        The write context that will be handed to the write function
 * @use_atomic:        True if the write_atomic() callback is to be used
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context. If the caller
 * wants to do more it must reacquire the console first.
 *
 * When true is returned, @wctxt->ctxt.backlog indicates whether there are
 * still records pending in the ringbuffer.
 */
static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        struct console *con = ctxt->console;
        bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
        struct printk_message pmsg = {
                .pbufs = ctxt->pbufs,
        };
        unsigned long con_dropped;
        struct nbcon_state cur;
        unsigned long dropped;
        unsigned long ulseq;

        /*
         * This function should never be called for consoles that have not
         * implemented the necessary callback for writing: i.e. legacy
         * consoles and, when atomic, nbcon consoles with no write_atomic().
         * Handle it as if ownership was lost and try to continue.
         *
         * Note that for nbcon consoles the write_thread() callback is
         * mandatory and was already checked in nbcon_alloc().
         */
        if (WARN_ON_ONCE((use_atomic && !con->write_atomic) ||
                         !(console_srcu_read_flags(con) & CON_NBCON))) {
                nbcon_context_release(ctxt);
                return false;
        }

        /*
         * The printk buffers are filled within an unsafe section. This
         * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
         * clobbering each other.
         */

        if (!nbcon_context_enter_unsafe(ctxt))
                return false;

        ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
        if (!ctxt->backlog)
                return nbcon_context_exit_unsafe(ctxt);

        /*
         * @con->dropped is not protected in case of an unsafe hostile
         * takeover. In that situation the update can be racy so
         * annotate it accordingly.
         */
        con_dropped = data_race(READ_ONCE(con->dropped));

        dropped = con_dropped + pmsg.dropped;
        if (dropped && !is_extended)
                console_prepend_dropped(&pmsg, dropped);

        /*
         * If the previous owner was assigned the same record, this context
         * has taken over ownership and is replaying the record. Prepend a
         * message to let the user know the record is replayed.
         */
        ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq));
        if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) {
                console_prepend_replay(&pmsg);
        } else {
                /*
                 * Ensure this context is still the owner before trying to
                 * update @nbcon_prev_seq. Otherwise the value in @ulseq may
                 * not be from the previous owner and instead be some later
                 * value from the context that took over ownership.
                 */
                nbcon_state_read(con, &cur);
                if (!nbcon_context_can_proceed(ctxt, &cur))
                        return false;

                atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq,
                                        __u64seq_to_ulseq(pmsg.seq));
        }

        if (!nbcon_context_exit_unsafe(ctxt))
                return false;

        /* For skipped records just update seq/dropped in @con. */
        if (pmsg.outbuf_len == 0)
                goto update_con;

        /* Initialize the write context for driver callbacks. */
        nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);

        if (use_atomic)
                con->write_atomic(con, wctxt);
        else
                con->write_thread(con, wctxt);

        if (!wctxt->outbuf) {
                /*
                 * Ownership was lost and reacquired by the driver. Handle it
                 * as if ownership was lost.
                 */
                nbcon_context_release(ctxt);
                return false;
        }

        /*
         * Ownership may have been lost but _not_ reacquired by the driver.
         * This case is detected and handled when entering unsafe to update
         * dropped/seq values.
         */

        /*
         * Since any dropped message was successfully output, reset the
         * dropped count for the console.
         */
        dropped = 0;
update_con:
        /*
         * The dropped count and the sequence number are updated within an
         * unsafe section. This limits update races to the panic context and
         * allows the panic context to win.
         */

        if (!nbcon_context_enter_unsafe(ctxt))
                return false;

        if (dropped != con_dropped) {
                /* Counterpart to the READ_ONCE() above. */
                WRITE_ONCE(con->dropped, dropped);
        }

        nbcon_seq_try_update(ctxt, pmsg.seq + 1);

        return nbcon_context_exit_unsafe(ctxt);
}

/*
 * nbcon_emit_one - Emit a single record on an nbcon console through the
 *                        requested write callback
 * @wctxt:        An initialized write context struct to use for this context
 * @use_atomic:        True if the write_atomic() callback is to be used
 *
 * Return:        True if a record was printed and more records are pending,
 *                so the caller may want to keep flushing.
 *
 *                False if nothing is pending, if the console context could
 *                not be acquired, or if ownership was lost. The caller
 *                should give up: the job is done, cannot be done, or is
 *                now the responsibility of the owning context.
 *
 * Internal helper that takes care of the console locking around
 * nbcon_emit_next_record().
 */
static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        struct console *con = ctxt->console;
        const bool threaded = !use_atomic;
        unsigned long flags;
        bool more = false;

        if (threaded) {
                con->device_lock(con, &flags);

                /*
                 * Handover and takeover only work if this context stays
                 * on its CPU.
                 */
                cant_migrate();
        }

        /*
         * A false return from nbcon_emit_next_record() means the console
         * was handed over or taken over and this context is no longer
         * valid. The higher priority context then becomes responsible for
         * the pending records, so there is nothing to release here.
         */
        if (nbcon_context_try_acquire(ctxt, false) &&
            nbcon_emit_next_record(wctxt, use_atomic)) {
                nbcon_context_release(ctxt);
                more = ctxt->backlog;
        }

        if (threaded)
                con->device_unlock(con, flags);

        return more;
}

/**
 * nbcon_kthread_should_wakeup - Decide whether a printer thread has a reason
 *                                        to wake up
 * @con:        Console to operate on
 * @ctxt:        The nbcon context from nbcon_context_try_acquire()
 *
 * Return:        True if the thread has been asked to stop, or if the
 *                console is usable and a record is available at its current
 *                sequence number. False otherwise.
 *
 * As a side effect, @ctxt->seq is refreshed from the console whenever the
 * console is usable.
 *
 * After waking up, the thread must check for a pending stop request before
 * it attempts any printing.
 */
static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt)
{
        bool wake = false;
        int cookie;

        /* A stop request always wakes the thread. */
        if (kthread_should_stop())
                return true;

        cookie = console_srcu_read_lock();

        if (console_is_usable(con, console_srcu_read_flags(con), false)) {
                /* Refresh the sequence number cached in @ctxt. */
                ctxt->seq = nbcon_seq_read(con);

                wake = prb_read_valid(prb, ctxt->seq, NULL);
        }

        console_srcu_read_unlock(cookie);

        return wake;
}

/**
 * nbcon_kthread_func - Main loop of a console printer thread
 * @__console:        Console to operate on
 *
 * Sleeps until nbcon_kthread_should_wakeup() reports work, then emits
 * records one at a time with NBCON_PRIO_NORMAL until no backlog is left,
 * and goes back to sleep. The loop only ends on a kthread stop request.
 *
 * Return:        0
 */
static int nbcon_kthread_func(void *__console)
{
        struct console *con = __console;
        struct nbcon_write_context wctxt = {
                .ctxt.console        = con,
                .ctxt.prio        = NBCON_PRIO_NORMAL,
        };
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
        bool more;
        int cookie;

        for (;;) {
                /*
                 * Guarantee this task is visible on the rcuwait before
                 * checking the wake condition.
                 *
                 * The full memory barrier within set_current_state() of
                 * ___rcuwait_wait_event() pairs with the full memory
                 * barrier within rcuwait_has_sleeper().
                 *
                 * This pairs with rcuwait_has_sleeper:A and
                 * nbcon_kthread_wake:A.
                 */
                rcuwait_wait_event(&con->rcuwait,
                                   nbcon_kthread_should_wakeup(con, ctxt),
                                   TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */

                /* Drain the backlog, one record per iteration. */
                do {
                        if (kthread_should_stop())
                                return 0;

                        /*
                         * The srcu read lock spans the whole emit so that
                         * synchronize_srcu() can guarantee that the kthread
                         * stopped or suspended printing.
                         */
                        cookie = console_srcu_read_lock();

                        if (console_is_usable(con, console_srcu_read_flags(con), false))
                                more = nbcon_emit_one(&wctxt, false);
                        else
                                more = false;

                        console_srcu_read_unlock(cookie);

                        cond_resched();

                } while (more);
        }
}

/**
 * nbcon_irq_work - irq_work handler that wakes a console printer thread
 * @irq_work:        The irq work to operate on
 *
 * @irq_work is embedded in a struct console; that console's printer
 * thread is the one being woken.
 */
static void nbcon_irq_work(struct irq_work *irq_work)
{
        nbcon_kthread_wake(container_of(irq_work, struct console, irq_work));
}

/*
 * rcuwait_has_sleeper - Check, after a full memory barrier, whether a task
 * is waiting on @w.
 */
static inline bool rcuwait_has_sleeper(struct rcuwait *w)
{
        bool has_waiter;

        /*
         * Make sure that any new records are visible to tasks that are
         * preparing to wait before this context tests whether the rcuwait
         * is empty.
         *
         * The full memory barrier here pairs with the full memory barrier
         * within set_current_state() of ___rcuwait_wait_event(), which runs
         * after prepare_to_rcuwait() has added the waiter but before the
         * wait condition has been checked.
         *
         * This pairs with nbcon_kthread_func:A.
         */
        smp_mb(); /* LMM(rcuwait_has_sleeper:A) */
        has_waiter = rcuwait_active(w);

        return has_waiter;
}

/**
 * nbcon_kthreads_wake - Kick the printer threads of all nbcon consoles via
 *                                irq_work
 *
 * Does nothing while the printer threads are not running.
 */
void nbcon_kthreads_wake(void)
{
        struct console *con;
        int cookie;

        if (!printk_kthreads_running)
                return;

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                bool is_nbcon = console_srcu_read_flags(con) & CON_NBCON;

                /*
                 * irq_work is only queued for a thread that is actively
                 * waiting. A thread that is not waiting will notice on its
                 * own that there is work to do.
                 */
                if (is_nbcon && rcuwait_has_sleeper(&con->rcuwait))
                        irq_work_queue(&con->irq_work);
        }
        console_srcu_read_unlock(cookie);
}

/*
 * nbcon_kthread_stop - Stop the printer thread of a console, if it has one
 * @con:        Console to operate on
 *
 * Must be called with the console list lock held. @con->kthread is reset
 * to NULL once the thread has been stopped.
 */
void nbcon_kthread_stop(struct console *con)
{
        lockdep_assert_console_list_lock_held();

        if (con->kthread) {
                kthread_stop(con->kthread);
                con->kthread = NULL;
        }
}

/**
 * nbcon_kthread_create - Start the printer thread of a console
 * @con:        Console to operate on
 *
 * Return:        True if the kthread is running, either because it was just
 *                started or because it already existed. False otherwise, in
 *                which case @con must not be registered.
 *
 * This is called once nbcon consoles are expected to be flushed by their
 * kthread. From then on the legacy loop no longer flushes messages printed
 * with NBCON_PRIO_NORMAL, which is why a failure here must be fatal for
 * console registration.
 *
 * If @con was already registered and this function fails, @con must be
 * unregistered before the global state variable @printk_kthreads_running
 * can be set.
 */
bool nbcon_kthread_create(struct console *con)
{
        struct task_struct *kt;

        lockdep_assert_console_list_lock_held();

        if (!con->kthread) {
                kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index);
                if (WARN_ON(IS_ERR(kt))) {
                        con_printk(KERN_ERR, con, "failed to start printing thread\n");
                        return false;
                }

                con->kthread = kt;

                /*
                 * Printing threads need to be scheduled shortly after a
                 * printk call and with generous runtime budgets.
                 */
                sched_set_normal(kt, -20);
        }

        return true;
}

/*
 * Emergency nesting depth: one counter per CPU, plus a single init-data
 * counter used before the printk per-CPU data is ready.
 */
static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting);
static unsigned int early_nbcon_pcpu_emergency_nesting __initdata;

/**
 * nbcon_get_cpu_emergency_nesting - Look up the emergency nesting counter
 *                                        that applies to the current CPU
 *
 * Context:        For reading, any context. For writing, any context which could
 *                not be migrated to another CPU.
 * Return:        A pointer to the current CPU's emergency nesting counter, or
 *                to the init-data counter during early boot.
 *
 * Reading the per-CPU variable is safe in any context because preemption is
 * disabled whenever the current CPU is in the emergency state. See also
 * nbcon_cpu_emergency_enter().
 */
static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void)
{
        /*
         * __printk_percpu_data_ready is set in normal context and before
         * SMP initialization, so its value can never change while inside
         * an nbcon emergency section.
         */
        if (printk_percpu_data_ready())
                return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting);

        return &early_nbcon_pcpu_emergency_nesting;
}

/**
 * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon
 *                                printing on the current CPU
 *
 * Context:        Any context.
 * Return:        The nbcon_prio to use for acquiring an nbcon console in this
 *                context for printing.
 *
 * The function is safe for reading per-CPU data in any context because
 * preemption is disabled if the current CPU is in the emergency or panic
 * state.
 */
enum nbcon_prio nbcon_get_default_prio(void)
{
        unsigned int *cpu_emergency_nesting;

        if (panic_on_this_cpu())
                return NBCON_PRIO_PANIC;

        cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
        if (*cpu_emergency_nesting)
                return NBCON_PRIO_EMERGENCY;

        return NBCON_PRIO_NORMAL;
}

/**
 * nbcon_legacy_emit_next_record - Emit one record on an nbcon console from
 *                                        a legacy context
 * @con:        The console to print on
 * @handover:        Will be set to true if a printk waiter has taken over the
 *                console_lock, in which case the caller is no longer holding
 *                both the console_lock and the SRCU read lock. Otherwise it
 *                is set to false.
 * @cookie:        The cookie from the SRCU read lock.
 * @use_atomic: Set true when called in an atomic or unknown context.
 *                It selects the nbcon callback: write_atomic() or
 *                write_thread().
 *
 *                When false, the write_thread() callback is used and would be
 *                called in a preemptible context unless disabled by the
 *                device_lock. The legacy handover is not allowed in this mode.
 *
 * Context:        Any context except NMI.
 * Return:        True if a record was printed and more records are pending,
 *                so the caller may want to keep flushing.
 *
 *                False if nothing is pending, if the console context could
 *                not be acquired, or if ownership was lost. The caller
 *                should give up: the job is done, cannot be done, or is
 *                now the responsibility of the owning context.
 *
 * Meant to be called by console_flush_all() to print records on nbcon
 * consoles from legacy context (printing via console unlocking). It is
 * essentially the nbcon counterpart of console_emit_next_record().
 */
bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
                                   int cookie, bool use_atomic)
{
        struct nbcon_write_context wctxt = { };
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
        unsigned long flags;
        bool progress;

        ctxt->console        = con;
        ctxt->prio        = nbcon_get_default_prio();

        if (!use_atomic) {
                progress = nbcon_emit_one(&wctxt, false);

                /* Non-atomic does not perform legacy spinning handovers. */
                *handover = false;

                return progress;
        }

        /*
         * Atomic or unknown context: follow the same procedure as
         * console_emit_next_record(), which allows a handover.
         */
        printk_safe_enter_irqsave(flags);
        console_lock_spinning_enable();
        stop_critical_timings();

        progress = nbcon_emit_one(&wctxt, true);

        start_critical_timings();
        *handover = console_lock_spinning_disable_and_check(cookie);
        printk_safe_exit_irqrestore(flags);

        return progress;
}

/**
 * __nbcon_atomic_flush_pending_con - Flush one nbcon console through its
 *                                        write_atomic() callback
 * @con:                        The nbcon console to flush
 * @stop_seq:                        Flush up until this record
 * @allow_unsafe_takeover:        True, to allow unsafe hostile takeovers
 *
 * Return:        0 if @con was flushed up to @stop_seq. Otherwise, an error
 *                code on failure.
 *
 * Errors:
 *
 *        -EPERM:                Unable to acquire console ownership.
 *
 *        -EAGAIN:        Another context took over ownership while printing.
 *
 *        -ENOENT:        A record before @stop_seq is not available.
 *
 * Retrying after a failure only makes sense for -EAGAIN. With -EPERM this
 * context is not allowed to acquire the console. With -ENOENT it cannot be
 * expected that the unfinalized record will become available.
 */
static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
                                            bool allow_unsafe_takeover)
{
        struct nbcon_write_context wctxt = { };
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
        int ret = 0;

        ctxt->console                        = con;
        ctxt->spinwait_max_us                = 2000;
        ctxt->prio                        = nbcon_get_default_prio();
        ctxt->allow_unsafe_takeover        = allow_unsafe_takeover;

        if (!nbcon_context_try_acquire(ctxt, false))
                return -EPERM;

        while (nbcon_seq_read(con) < stop_seq) {
                /*
                 * A false return means the console was handed over or
                 * taken over. Either way this context is no longer valid,
                 * so it must not be released.
                 */
                if (!nbcon_emit_next_record(&wctxt, true))
                        return -EAGAIN;

                if (ctxt->backlog)
                        continue;

                /* Are there reserved but not yet finalized records? */
                if (nbcon_seq_read(con) < stop_seq)
                        ret = -ENOENT;
                break;
        }

        nbcon_context_release(ctxt);

        return ret;
}

/**
 * nbcon_atomic_flush_pending_con - Flush one nbcon console through its
 *                                        write_atomic() callback
 * @con:                        The nbcon console to flush
 * @stop_seq:                        Flush up until this record
 * @allow_unsafe_takeover:        True, to allow unsafe hostile takeovers
 *
 * Flushing stops before @stop_seq if another context has ownership; that
 * context is then responsible for the flushing. Conversely, if new records
 * were added while this context was flushing and no other context is
 * available to print them, this context flushes those records as well.
 */
static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
                                           bool allow_unsafe_takeover)
{
        struct console_flush_type ft;
        unsigned long flags;
        int err;

        for (;;) {
                /*
                 * Atomic flushing does not use console driver
                 * synchronization (i.e. it does not hold the port lock for
                 * uart consoles). IRQs must therefore be disabled to avoid
                 * being interrupted and then calling into a driver that
                 * will deadlock trying to acquire console ownership.
                 */
                local_irq_save(flags);

                err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);

                local_irq_restore(flags);

                /*
                 * If there was a new owner (-EPERM, -EAGAIN), that context
                 * is responsible for completing.
                 *
                 * Do not wait for records not yet finalized (-ENOENT) to
                 * avoid a possible deadlock. They will either get flushed
                 * by the writer or eventually skipped on panic CPU.
                 */
                if (err)
                        return;

                /*
                 * Flushing succeeded. If more records have become available
                 * in the meantime, this context must flush them too, unless
                 * the printer thread is available to do it.
                 */
                printk_get_console_flush_type(&ft);
                if (ft.nbcon_offload ||
                    !prb_read_valid(prb, nbcon_seq_read(con), NULL))
                        return;

                stop_seq = prb_next_reserve_seq(prb);
        }
}

/**
 * __nbcon_atomic_flush_pending - Flush every nbcon console through its
 *                                        write_atomic() callback
 * @stop_seq:                        Flush up until this record
 * @allow_unsafe_takeover:        True, to allow unsafe hostile takeovers
 *
 * Consoles that are not nbcon consoles, are not usable, or have already
 * reached @stop_seq are skipped.
 */
static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover)
{
        struct console *con;
        int cookie;

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                short flags = console_srcu_read_flags(con);

                if ((flags & CON_NBCON) &&
                    console_is_usable(con, flags, true) &&
                    nbcon_seq_read(con) < stop_seq)
                        nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
        }
        console_srcu_read_unlock(cookie);
}

/**
 * nbcon_atomic_flush_pending - Flush all nbcon consoles using their
 *                                write_atomic() callback
 *
 * Flush the backlog up through the currently newest record. Any new
 * records added while flushing will not be flushed if there is another
 * context available to handle the flushing. This is to avoid one CPU
 * printing unbounded because other CPUs continue to add records.
 */
void nbcon_atomic_flush_pending(void)
{
        __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false);
}

/**
 * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their
 *        write_atomic() callback and allowing unsafe hostile takeovers
 *
 * Flush the backlog up through the currently newest record. Unsafe hostile
 * takeovers will be performed, if necessary.
 */
void nbcon_atomic_flush_unsafe(void)
{
        __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true);
}

/**
 * nbcon_cpu_emergency_enter - Enter an emergency section where printk()
 *                                messages for that CPU are flushed directly
 *
 * Context:        Any context. Disables preemption.
 *
 * When within an emergency section, printk() calls will attempt to flush any
 * pending messages in the ringbuffer.
 */
void nbcon_cpu_emergency_enter(void)
{
        unsigned int *cpu_emergency_nesting;

        preempt_disable();

        cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
        (*cpu_emergency_nesting)++;
}

/**
 * nbcon_cpu_emergency_exit - Exit an emergency section
 *
 * Context:        Within an emergency section. Enables preemption.
 */
void nbcon_cpu_emergency_exit(void)
{
        unsigned int *cpu_emergency_nesting;

        cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();

        if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0))
                (*cpu_emergency_nesting)--;

        preempt_enable();
}

/**
 * nbcon_alloc - Allocate and init the nbcon console specific data
 * @con:        Console to initialize
 *
 * Return:        True if the console was fully allocated and initialized.
 *                Otherwise @con must not be registered.
 *
 * When allocation and init was successful, the console must be properly
 * freed using nbcon_free() once it is no longer needed.
 */
bool nbcon_alloc(struct console *con)
{
        struct nbcon_state state = { };

        /* Synchronize the kthread start. */
        lockdep_assert_console_list_lock_held();

        /* The write_thread() callback is mandatory. */
        if (WARN_ON(!con->write_thread))
                return false;

        rcuwait_init(&con->rcuwait);
        init_irq_work(&con->irq_work, nbcon_irq_work);
        atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL);
        nbcon_state_set(con, &state);

        /*
         * Initialize @nbcon_seq to the highest possible sequence number so
         * that practically speaking it will have nothing to print until a
         * desired initial sequence number has been set via nbcon_seq_force().
         */
        atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb));

        if (con->flags & CON_BOOT) {
                /*
                 * Boot console printing is synchronized with legacy console
                 * printing, so boot consoles can share the same global printk
                 * buffers.
                 */
                con->pbufs = &printk_shared_pbufs;
        } else {
                con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
                if (!con->pbufs) {
                        con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
                        return false;
                }

                if (printk_kthreads_ready && !have_boot_console) {
                        if (!nbcon_kthread_create(con)) {
                                kfree(con->pbufs);
                                con->pbufs = NULL;
                                return false;
                        }

                        /* Might be the first kthread. */
                        printk_kthreads_running = true;
                }
        }

        return true;
}

/**
 * nbcon_free - Free and cleanup the nbcon console specific data
 * @con:        Console to free/cleanup nbcon data
 *
 * Important: @have_nbcon_console must be updated before calling
 *        this function. In particular, it can be set only when there
 *        is still another nbcon console registered.
 */
void nbcon_free(struct console *con)
{
        struct nbcon_state state = { };

        /* Synchronize the kthread stop. */
        lockdep_assert_console_list_lock_held();

        if (printk_kthreads_running) {
                nbcon_kthread_stop(con);

                /* Might be the last nbcon console.
                 *
                 * Do not rely on printk_kthreads_check_locked(). It is not
                 * called in some code paths, see nbcon_free() callers.
                 */
                if (!have_nbcon_console)
                        printk_kthreads_running = false;
        }

        nbcon_state_set(con, &state);

        /* Boot consoles share global printk buffers. */
        if (!(con->flags & CON_BOOT))
                kfree(con->pbufs);

        con->pbufs = NULL;
}

/**
 * nbcon_device_try_acquire - Try to acquire an nbcon console and enter an
 *                                unsafe section
 * @con:        The nbcon console to acquire
 *
 * Context:        Under the locking mechanism implemented in
 *                @con->device_lock() including disabling migration.
 * Return:        True if the console was acquired. False otherwise.
 *
 * Console drivers usually rely on their own internal synchronization
 * mechanism to serialize console printing against non-printing activities
 * (such as setting baud rates). Drivers of nbcon consoles that support
 * atomic printing may additionally want to mark such non-printing
 * activities as unsafe sections in order to synchronize against their
 * write_atomic() callback.
 *
 * The console is acquired with priority NBCON_PRIO_NORMAL and marked unsafe
 * for handover/takeover. The console's dedicated device context is reset
 * and reused for this on every call.
 */
bool nbcon_device_try_acquire(struct console *con)
{
        struct nbcon_context *dev_ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);

        cant_migrate();

        /* Start from a clean context on every attempt. */
        memset(dev_ctxt, 0, sizeof(*dev_ctxt));
        dev_ctxt->console        = con;
        dev_ctxt->prio                = NBCON_PRIO_NORMAL;

        /* Entering the unsafe section is only attempted once acquired. */
        return nbcon_context_try_acquire(dev_ctxt, false) &&
               nbcon_context_enter_unsafe(dev_ctxt);
}
EXPORT_SYMBOL_GPL(nbcon_device_try_acquire);

/**
 * nbcon_device_release - Leave the unsafe section and release the nbcon
 *                                console
 * @con:        The nbcon console acquired in nbcon_device_try_acquire()
 *
 * After the release, records that were added while the console was held
 * are flushed from here if the printer thread is not available to do it.
 */
void nbcon_device_release(struct console *con)
{
        struct nbcon_context *dev_ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
        struct console_flush_type ft;
        int cookie;

        /* Nothing more to do if leaving the unsafe section fails. */
        if (!nbcon_context_exit_unsafe(dev_ctxt))
                return;

        nbcon_context_release(dev_ctxt);

        /*
         * This context must flush any new records added while the console
         * was locked if the printer thread is not available to do it. The
         * console_srcu_read_lock must be taken to ensure the console is
         * usable throughout flushing.
         */
        cookie = console_srcu_read_lock();
        printk_get_console_flush_type(&ft);

        if (!console_is_usable(con, console_srcu_read_flags(con), true) ||
            ft.nbcon_offload ||
            !prb_read_valid(prb, nbcon_seq_read(con), NULL))
                goto unlock;

        /*
         * Prefer nbcon atomic flushing. If it is not available, fall back
         * to the legacy loop.
         */
        if (ft.nbcon_atomic) {
                __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false);
        } else if (ft.legacy_direct) {
                if (console_trylock())
                        console_unlock();
        } else if (ft.legacy_offload) {
                printk_trigger_flush();
        }

unlock:
        console_srcu_read_unlock(cookie);
}
EXPORT_SYMBOL_GPL(nbcon_device_release);


































    4 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_VSYSCALL_H
#define _ASM_X86_VSYSCALL_H

#include <linux/seqlock.h>
#include <uapi/asm/vsyscall.h>
#include <asm/page_types.h>

#ifdef CONFIG_X86_VSYSCALL_EMULATION
/* Emulation built in: these are implemented outside this header. */
void map_vsyscall(void);
void set_vsyscall_pgtable_user_bits(pgd_t *root);

/*
 * Called on instruction fetch fault in vsyscall page.
 * Returns true if handled.
 */
bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs,
                      unsigned long address);
#else
/* Emulation compiled out: mapping does nothing. */
static inline void map_vsyscall(void)
{
}

/* Emulation compiled out: no fault is ever reported as handled. */
static inline bool emulate_vsyscall(unsigned long error_code,
                                    struct pt_regs *regs,
                                    unsigned long address)
{
        return false;
}
#endif

/*
 * The (legacy) vsyscall page is the lone page in the kernel portion
 * of the address space that has user-accessible permissions.
 *
 * Returns true when @vaddr, masked with PAGE_MASK, equals VSYSCALL_ADDR.
 */
static inline bool is_vsyscall_vaddr(unsigned long vaddr)
{
        const bool in_vsyscall_page = (vaddr & PAGE_MASK) == VSYSCALL_ADDR;

        /* Hint to the compiler that a match is the uncommon case. */
        return unlikely(in_vsyscall_page);
}

#endif /* _ASM_X86_VSYSCALL_H */










































































































































































































  318 










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Security server interface.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 *
 */

#ifndef _SELINUX_SECURITY_H_
#define _SELINUX_SECURITY_H_

#include <linux/compiler.h>
#include <linux/dcache.h>
#include <linux/magic.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <linux/printk.h>
#include "flask.h"
#include "policycap.h"

/* Reserved SID and security-class values. */
#define SECSID_NULL   0x00000000 /* unspecified SID */
#define SECSID_WILD   0xffffffff /* wildcard SID */
#define SECCLASS_NULL 0x0000 /* no class */

/*
 * Identify specific policy version changes.
 * Each constant names one policy database version number.
 */
#define POLICYDB_VERSION_BASE                     15
#define POLICYDB_VERSION_BOOL                     16
#define POLICYDB_VERSION_IPV6                     17
#define POLICYDB_VERSION_NLCLASS             18
#define POLICYDB_VERSION_VALIDATETRANS             19
#define POLICYDB_VERSION_MLS                     19 /* same number as VALIDATETRANS */
#define POLICYDB_VERSION_AVTAB                     20
#define POLICYDB_VERSION_RANGETRANS             21
#define POLICYDB_VERSION_POLCAP                     22
#define POLICYDB_VERSION_PERMISSIVE             23
#define POLICYDB_VERSION_BOUNDARY             24
#define POLICYDB_VERSION_FILENAME_TRANS             25
#define POLICYDB_VERSION_ROLETRANS             26
#define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS 27
#define POLICYDB_VERSION_DEFAULT_TYPE             28
#define POLICYDB_VERSION_CONSTRAINT_NAMES    29
#define POLICYDB_VERSION_XPERMS_IOCTL             30
#define POLICYDB_VERSION_INFINIBAND             31
#define POLICYDB_VERSION_GLBLUB                     32
#define POLICYDB_VERSION_COMP_FTRANS             33 /* compressed filename transitions */
#define POLICYDB_VERSION_COND_XPERMS             34 /* extended permissions in conditional policies */
#define POLICYDB_VERSION_NEVERAUDIT             35 /* neveraudit types */

/* Range of policy versions we understand (lowest and highest listed above) */
#define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE
#define POLICYDB_VERSION_MAX POLICYDB_VERSION_NEVERAUDIT

/* Mask for just the mount related flags */
#define SE_MNTMASK 0x0f
/* Super block security struct flags for mount options */
/* BE CAREFUL, these need to be the low order bits for selinux_get_mnt_opts */
#define CONTEXT_MNT        0x01
#define FSCONTEXT_MNT        0x02
#define ROOTCONTEXT_MNT 0x04
#define DEFCONTEXT_MNT        0x08
#define SBLABEL_MNT        0x10 /* note: this bit is outside SE_MNTMASK (0x0f) */
/* Non-mount related flags */
#define SE_SBINITIALIZED 0x0100
#define SE_SBPROC         0x0200
#define SE_SBGENFS         0x0400
#define SE_SBGENFS_XATTR 0x0800
#define SE_SBNATIVE         0x1000

/*
 * Option name strings.
 * NOTE(review): presumably the textual counterparts of the *_MNT flags
 * above (SECLABEL_STR pairing with SBLABEL_MNT) - confirm against users.
 */
#define CONTEXT_STR        "context"
#define FSCONTEXT_STR        "fscontext"
#define ROOTCONTEXT_STR "rootcontext"
#define DEFCONTEXT_STR        "defcontext"
#define SECLABEL_STR        "seclabel"

/* Forward declaration; this header only uses pointers to the type. */
struct netlbl_lsm_secattr;

/* Defined outside this header. */
extern int selinux_enabled_boot;

/*
 * type_datum properties
 * available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY
 */
#define TYPEDATUM_PROPERTY_PRIMARY   0x0001
#define TYPEDATUM_PROPERTY_ATTRIBUTE 0x0002

/* limitation of boundary depth  */
#define POLICYDB_BOUNDS_MAXDEPTH 4

/* Forward declaration; the definition is not part of this header. */
struct selinux_policy;

/*
 * Global SELinux state.  A single instance, selinux_state, is declared
 * below; the inline accessors in this header read and write its flags.
 */
struct selinux_state {
#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
        /* only present with CONFIG_SECURITY_SELINUX_DEVELOP */
        bool enforcing;
#endif
        /* read with smp_load_acquire(), set with smp_store_release() */
        bool initialized;
        /* one flag per policy capability, indexed by POLICYDB_CAP_* */
        bool policycap[__POLICYDB_CAP_MAX];

        /* NOTE(review): presumably status_lock guards status_page - confirm */
        struct page *status_page;
        struct mutex status_lock;

        /* RCU-annotated pointer to the policy */
        struct selinux_policy __rcu *policy;
        /* NOTE(review): presumably serializes policy updates - confirm */
        struct mutex policy_mutex;
} __randomize_layout;

void selinux_avc_init(void);

/* The single global instance; defined outside this header. */
extern struct selinux_state selinux_state;

/*
 * Return the current value of selinux_state.initialized, loaded with
 * acquire semantics.
 */
static inline bool selinux_initialized(void)
{
        bool is_ready;

        /* do a synchronized load to avoid race conditions */
        is_ready = smp_load_acquire(&selinux_state.initialized);

        return is_ready;
}

/* Set selinux_state.initialized to true using a release store. */
static inline void selinux_mark_initialized(void)
{
        bool *init_flag = &selinux_state.initialized;

        /* do a synchronized write to avoid race conditions */
        smp_store_release(init_flag, true);
}

#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
/* Return the enforcing flag currently stored in selinux_state. */
static inline bool enforcing_enabled(void)
{
        const bool mode = READ_ONCE(selinux_state.enforcing);

        return mode;
}

/* Store @value as the enforcing flag in selinux_state. */
static inline void enforcing_set(bool value)
{
        WRITE_ONCE(selinux_state.enforcing, value);
}
#else
/* No enforcing field exists in this configuration: always report true. */
static inline bool enforcing_enabled(void)
{
        return true;
}

/* No enforcing field exists in this configuration: @value is ignored. */
static inline void enforcing_set(bool value)
{
        (void)value;
}
#endif

/*
 * Report the checkreqprot setting.  Non-zero/true checkreqprot values are
 * no longer supported, so the result is unconditionally false.
 */
static inline bool checkreqprot_get(void)
{
        return false;
}

/* Return the POLICYDB_CAP_NETPEER entry of selinux_state.policycap. */
static inline bool selinux_policycap_netpeer(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_NETPEER]);
}

/* Return the POLICYDB_CAP_OPENPERM entry of selinux_state.policycap. */
static inline bool selinux_policycap_openperm(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_OPENPERM]);
}

/* Return the POLICYDB_CAP_EXTSOCKCLASS entry of selinux_state.policycap. */
static inline bool selinux_policycap_extsockclass(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_EXTSOCKCLASS]);
}

/* Return the POLICYDB_CAP_ALWAYSNETWORK entry of selinux_state.policycap. */
static inline bool selinux_policycap_alwaysnetwork(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_ALWAYSNETWORK]);
}

/* Return the POLICYDB_CAP_CGROUPSECLABEL entry of selinux_state.policycap. */
static inline bool selinux_policycap_cgroupseclabel(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_CGROUPSECLABEL]);
}

/*
 * Return the POLICYDB_CAP_NNP_NOSUID_TRANSITION entry of
 * selinux_state.policycap.
 */
static inline bool selinux_policycap_nnp_nosuid_transition(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_NNP_NOSUID_TRANSITION]);
}

/*
 * Return the POLICYDB_CAP_GENFS_SECLABEL_SYMLINKS entry of
 * selinux_state.policycap.
 */
static inline bool selinux_policycap_genfs_seclabel_symlinks(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_GENFS_SECLABEL_SYMLINKS]);
}

/*
 * Return the POLICYDB_CAP_IOCTL_SKIP_CLOEXEC entry of
 * selinux_state.policycap.
 */
static inline bool selinux_policycap_ioctl_skip_cloexec(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_IOCTL_SKIP_CLOEXEC]);
}

/*
 * Return the POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT entry of
 * selinux_state.policycap.
 */
static inline bool selinux_policycap_userspace_initial_context(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT]);
}

/* Return the POLICYDB_CAP_NETLINK_XPERM entry of selinux_state.policycap. */
static inline bool selinux_policycap_netlink_xperm(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_NETLINK_XPERM]);
}

/*
 * Return the POLICYDB_CAP_FUNCTIONFS_SECLABEL entry of
 * selinux_state.policycap.
 */
static inline bool selinux_policycap_functionfs_seclabel(void)
{
        const bool *caps = selinux_state.policycap;

        return READ_ONCE(caps[POLICYDB_CAP_FUNCTIONFS_SECLABEL]);
}

/* Forward declaration; the definition is not part of this header. */
struct selinux_policy_convert_data;

/*
 * Bundle passed by pointer to security_load_policy(),
 * selinux_policy_commit() and selinux_policy_cancel().
 */
struct selinux_load_state {
        struct selinux_policy *policy;
        struct selinux_policy_convert_data *convert_data;
};

/*
 * Policy loading and querying entry points; implemented outside this header.
 * NOTE(review): presumably a successful security_load_policy() is followed
 * by exactly one of commit/cancel on the same load_state - confirm.
 */
int security_mls_enabled(void);
int security_load_policy(void *data, size_t len,
                         struct selinux_load_state *load_state);
void selinux_policy_commit(struct selinux_load_state *load_state);
void selinux_policy_cancel(struct selinux_load_state *load_state);
int security_read_policy(void **data, size_t *len);
int security_read_state_kernel(void **data, size_t *len);
int security_policycap_supported(unsigned int req_cap);

#define SEL_VEC_MAX 32
/* Access vector decision; filled in through security_compute_av(). */
struct av_decision {
        u32 allowed;
        u32 auditallow;
        u32 auditdeny;
        u32 seqno;
        u32 flags; /* AVD_FLAGS_* values */
};

/*
 * Extended permission kinds; the values are distinct single bits.
 * NOTE(review): presumably OR-ed into extended_perms_decision.used - confirm.
 */
#define XPERMS_ALLOWED          1
#define XPERMS_AUDITALLOW 2
#define XPERMS_DONTAUDIT  4

/*
 * Extended permission bitmap helpers.  Each element of @perms holds 32
 * permission bits: permission @x lives in bit (x & 0x1f) of element
 * (x >> 5).
 *
 * security_xperm_set() sets the bit for @x; security_xperm_test() yields
 * 1 if it is set and 0 otherwise.
 *
 * The mask is built from an unsigned constant so that selecting bit 31
 * never shifts a 1 into the sign bit of a signed int.
 *
 * Both macros evaluate @x more than once; do not pass an expression with
 * side effects.
 */
#define security_xperm_set(perms, x)  ((perms)[(x) >> 5] |= 1U << ((x)&0x1f))
#define security_xperm_test(perms, x) (1 & ((perms)[(x) >> 5] >> ((x)&0x1f)))
/* 8 x 32 = 256 bits, one per extended permission value 0..255. */
struct extended_perms_data {
        u32 p[8];
};

/* Extended permission decision; see security_compute_xperms_decision(). */
struct extended_perms_decision {
        u8 used;
        u8 driver;
        u8 base_perm;
        struct extended_perms_data *allowed;
        struct extended_perms_data *auditallow;
        struct extended_perms_data *dontaudit;
};

/* Extended permission summary; see security_compute_av(). */
struct extended_perms {
        u16 len; /* length associated decision chain */
        u8 base_perms; /* which base permissions are covered */
        struct extended_perms_data drivers; /* flag drivers that are used */
};

/* definitions of av_decision.flags (distinct single-bit values) */
#define AVD_FLAGS_PERMISSIVE 0x0001
#define AVD_FLAGS_NEVERAUDIT  0x0002

/*
 * Access vector computation entry points; implemented outside this header.
 * Results are returned through the @avd / @xperms / @xpermd pointers.
 * NOTE(review): ssid/tsid/tclass are presumably the source SID, target SID
 * and target class - confirm against the implementation.
 */
void security_compute_av(u32 ssid, u32 tsid, u16 tclass,
                         struct av_decision *avd,
                         struct extended_perms *xperms);

void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass, u8 driver,
                                      u8 base_perm,
                                      struct extended_perms_decision *xpermd);

void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass,
                              struct av_decision *avd);

/*
 * SID computation: each function writes its result through @out_sid.
 * NOTE(review): the int return is presumably 0 or a negative errno - confirm.
 */
int security_transition_sid(u32 ssid, u32 tsid, u16 tclass,
                            const struct qstr *qstr, u32 *out_sid);

int security_transition_sid_user(u32 ssid, u32 tsid, u16 tclass,
                                 const char *objname, u32 *out_sid);

int security_member_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid);

int security_change_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid);

/*
 * SID -> context string conversions: results are returned through
 * @scontext and @scontext_len.
 * NOTE(review): ownership of the *scontext buffer is not visible here -
 * confirm who frees it.
 */
int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len);

int security_sid_to_context_force(u32 sid, char **scontext, u32 *scontext_len);

int security_sid_to_context_inval(u32 sid, char **scontext, u32 *scontext_len);

/* Context string -> SID conversions: the SID is returned through a pointer. */
int security_context_to_sid(const char *scontext, u32 scontext_len,
                            u32 *out_sid, gfp_t gfp);

int security_context_str_to_sid(const char *scontext, u32 *out_sid, gfp_t gfp);

int security_context_to_sid_default(const char *scontext, u32 scontext_len,
                                    u32 *out_sid, u32 def_sid, gfp_t gfp_flags);

int security_context_to_sid_force(const char *scontext, u32 scontext_len,
                                  u32 *sid);

/* Returns an array of SIDs through @sids and its element count through @nel. */
int security_get_user_sids(u32 fromsid, const char *username, u32 **sids, u32 *nel);

/* Object-to-SID lookups: each writes the SID through its last parameter. */
int security_port_sid(u8 protocol, u16 port, u32 *out_sid);

int security_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *out_sid);

int security_ib_endport_sid(const char *dev_name, u8 port_num, u32 *out_sid);

int security_netif_sid(const char *name, u32 *if_sid);

int security_node_sid(u16 domain, const void *addr, u32 addrlen, u32 *out_sid);

/* Transition validation; implemented outside this header. */
int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
                                 u16 tclass);

int security_validate_transition_user(u32 oldsid, u32 newsid, u32 tasksid,
                                      u16 tclass);

int security_bounded_transition(u32 old_sid, u32 new_sid);

int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid);

int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, u32 xfrm_sid,
                                 u32 *peer_sid);

/*
 * Policy introspection: name arrays and their counts are returned through
 * the pointer parameters.
 */
int security_get_classes(struct selinux_policy *policy, char ***classes,
                         u32 *nclasses);
int security_get_permissions(struct selinux_policy *policy, const char *class,
                             char ***perms, u32 *nperms);
int security_get_reject_unknown(void);
int security_get_allow_unknown(void);

/*
 * Filesystem labeling behaviors.  SECURITY_FS_USE_MAX must track the
 * highest value in this list (currently SECURITY_FS_USE_NATIVE).
 */
#define SECURITY_FS_USE_XATTR         1 /* use xattr */
#define SECURITY_FS_USE_TRANS         2 /* use transition SIDs, e.g. devpts/tmpfs */
#define SECURITY_FS_USE_TASK         3 /* use task SIDs, e.g. pipefs/sockfs */
#define SECURITY_FS_USE_GENFS         4 /* use the genfs support */
#define SECURITY_FS_USE_NONE         5 /* no labeling support */
#define SECURITY_FS_USE_MNTPOINT 6 /* use mountpoint labeling */
#define SECURITY_FS_USE_NATIVE         7 /* use native label support */
#define SECURITY_FS_USE_MAX         7 /* Highest SECURITY_FS_USE_XXX */

/* Implemented outside this header. */
int security_fs_use(struct super_block *sb);

/* genfs lookups: the SID is returned through @sid. */
int security_genfs_sid(const char *fstype, const char *path, u16 sclass,
                       u32 *sid);

/* Same parameters as security_genfs_sid() plus an explicit @policy. */
int selinux_policy_genfs_sid(struct selinux_policy *policy, const char *fstype,
                             const char *path, u16 sclass, u32 *sid);

#ifdef CONFIG_NETLABEL
/* NetLabel built in: conversions are implemented outside this header. */
int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
                                   u32 *sid);
int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr);
#else
/* NetLabel compiled out: always fails with -EIDRM; @sid is not written. */
static inline int security_netlbl_secattr_to_sid(
        struct netlbl_lsm_secattr *secattr, u32 *sid)
{
        return -EIDRM;
}

/* NetLabel compiled out: always fails with -ENOENT; @secattr is untouched. */
static inline int security_netlbl_sid_to_secattr(
        u32 sid, struct netlbl_lsm_secattr *secattr)
{
        return -ENOENT;
}
#endif /* CONFIG_NETLABEL */

/* Implemented outside this header. */
const char *security_get_initial_sid_context(u32 sid);

/*
 * status notifier using mmap interface
 */
extern struct page *selinux_kernel_status_page(void);

#define SELINUX_KERNEL_STATUS_VERSION 1
/*
 * Packed record of five u32 fields.
 * NOTE(review): presumably the layout stored in the status page returned
 * by selinux_kernel_status_page() - confirm.
 */
struct selinux_kernel_status {
        u32 version; /* version number of the structure */
        u32 sequence; /* sequence number of seqlock logic */
        u32 enforcing; /* current setting of enforcing mode */
        u32 policyload; /* times of policy reloaded */
        u32 deny_unknown; /* current setting of deny_unknown */
        /*
         * The version > 0 supports above members.
         */
} __packed;

/*
 * Status and notification hooks; implemented outside this header.
 * NOTE(review): the selinux_status_update_* functions presumably update the
 * selinux_kernel_status fields of the same name - confirm.
 */
extern void selinux_status_update_setenforce(bool enforcing);
extern void selinux_status_update_policyload(u32 seqno);
extern void selinux_complete_init(void);
extern struct path selinux_null;
extern void selnl_notify_setenforce(int val);
extern void selnl_notify_policyload(u32 seqno);
/* The looked-up permission is returned through @perm. */
extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm);

/* Cache initializers and statistics; implemented outside this header. */
extern void avtab_cache_init(void);
extern void ebitmap_cache_init(void);
extern void hashtab_cache_init(void);
extern int security_sidtab_hash_stats(char *page);

#endif /* _SELINUX_SECURITY_H_ */

























































































































































































































































































































































































    6 





























































































    6 


    6 





    6 
    6 
    6 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/sched/ext.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/memblock.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/kstack_erase.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
#include <linux/tick.h>
#include <linux/unwind_deferred.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

/* For dup_mmap(). */
#include "../mm/internal.h"

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

#include <kunit/visibility.h>

/*
 * Minimum number of threads to boot the kernel.
 * Used by set_max_threads() as the lower clamp for max_threads.
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads.
 * Used by set_max_threads() as the upper clamp for max_threads.
 * NOTE(review): defined as FUTEX_TID_MASK, presumably so that every thread
 * id fits in the futex TID field - confirm against the futex headers.
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;        /* Handle normal Linux uptimes. */
int nr_threads;                        /* The idle threads do not count.. */

static int max_threads;                /* tunable limit on nr_threads; set by set_max_threads() */

/* Designated initializer that stores the stringified enumerator at index x. */
#define NAMED_ARRAY_INDEX(x)        [x] = __stringify(x)

/*
 * Printable names of the mm rss counters, indexed by counter type.
 * check_mm() uses these in its diagnostics and asserts at build time that
 * the array has exactly NR_MM_COUNTERS entries.
 */
static const char * const resident_page_types[] = {
        NAMED_ARRAY_INDEX(MM_FILEPAGES),
        NAMED_ARRAY_INDEX(MM_ANONPAGES),
        NAMED_ARRAY_INDEX(MM_SWAPENTS),
        NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

/* Per-CPU process counter; nr_processes() sums it over all possible CPUs. */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
/* Report to lockdep users whether tasklist_lock is currently held. */
int lockdep_tasklist_lock_is_held(void)
{
        int held = lockdep_is_held(&tasklist_lock);

        return held;
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

int nr_processes(void)
{
        int cpu;
        int total = 0;

        for_each_possible_cpu(cpu)
                total += per_cpu(process_counts, cpu);

        return total;
}

/*
 * Default (weak) no-op hook called from free_task() just before the
 * task_struct is freed.  Being __weak, it can be replaced by another
 * definition at link time.
 */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

static struct kmem_cache *task_struct_cachep;

/*
 * Allocate a task_struct from task_struct_cachep with GFP_KERNEL on NUMA
 * node @node.  Returns whatever the slab allocator returns; callers treat
 * NULL as allocation failure.
 */
static inline struct task_struct *alloc_task_struct_node(int node)
{
        struct task_struct *tsk;

        tsk = kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);

        return tsk;
}

/* Return @tsk to the task_struct slab cache it was allocated from. */
static inline void free_task_struct(struct task_struct *tsk)
{
        kmem_cache_free(task_struct_cachep, tsk);
}

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
/*
 * Allocated stacks are cached and later reused by new threads, so memcg
 * accounting is performed by the code assigning/releasing stacks to tasks.
 * We need a zeroed memory without __GFP_ACCOUNT.
 */
#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)

/*
 * Bookkeeping used to free a vmapped stack after an RCU grace period.
 * thread_stack_delayed_free() overlays this structure on the memory of the
 * stack being freed (tsk->stack), so it needs no allocation of its own.
 */
struct vm_stack {
        struct rcu_head rcu;                /* passed to call_rcu() */
        struct vm_struct *stack_vm_area;    /* vm area backing the stack */
};

/*
 * Try to park @vm_area in one of this CPU's cached_stacks slots.
 *
 * Each slot is claimed with a compare-and-exchange against NULL, so only an
 * empty slot is ever overwritten.  Returns true if the area was stored,
 * false if every slot was already occupied.
 */
static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
{
        unsigned int slot = 0;

        while (slot < NR_CACHED_STACKS) {
                struct vm_struct *expected = NULL;

                if (this_cpu_try_cmpxchg(cached_stacks[slot], &expected, vm_area))
                        return true;

                slot++;
        }

        return false;
}

/*
 * RCU callback for a vmapped stack queued by thread_stack_delayed_free().
 * The stack is handed to the per-CPU cache when a slot is free, otherwise
 * its memory is released with vfree().
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
        struct vm_struct *area = vm_stack->stack_vm_area;

        if (!try_release_thread_stack_to_cache(area))
                vfree(area->addr);
}

/*
 * Queue @tsk's vmapped stack for freeing via call_rcu().  The stack memory
 * itself is reused to hold the struct vm_stack that carries the rcu_head
 * and the vm area pointer to the callback.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct vm_stack *deferred = tsk->stack;

        deferred->stack_vm_area = tsk->stack_vm_area;

        call_rcu(&deferred->rcu, thread_stack_free_rcu);
}

/*
 * Drop every stack cached for @cpu: vfree() each non-empty slot and reset
 * it to NULL.  Registered as a CPU hotplug callback in fork_init().
 * Always returns 0.
 */
static int free_vm_stack_cache(unsigned int cpu)
{
        struct vm_struct **slots = per_cpu_ptr(cached_stacks, cpu);
        int idx;

        for (idx = 0; idx < NR_CACHED_STACKS; idx++) {
                struct vm_struct *area = slots[idx];

                if (area) {
                        vfree(area->addr);
                        slots[idx] = NULL;
                }
        }

        return 0;
}

/*
 * Charge every page of the stack backed by @vm_area to memcg.
 *
 * The area must consist of exactly THREAD_SIZE / PAGE_SIZE pages (BUG
 * otherwise).  If charging a page fails, the pages charged so far are
 * uncharged again and the error is returned.  Returns 0 on success.
 */
static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
{
        int charged;
        int err;
        int i;

        BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);

        for (charged = 0; charged < THREAD_SIZE / PAGE_SIZE; charged++) {
                err = memcg_kmem_charge_page(vm_area->pages[charged],
                                             GFP_KERNEL, 0);
                if (!err)
                        continue;

                /* Roll back the pages that were charged before the failure. */
                for (i = 0; i < charged; i++)
                        memcg_kmem_uncharge_page(vm_area->pages[i], 0);

                return err;
        }

        return 0;
}

/*
 * Give @tsk a vmalloc-backed kernel stack.
 *
 * The per-CPU cached_stacks slots are tried first; a reused stack is
 * charged to memcg, has its KASAN state reset and is zeroed.  If no cached
 * stack is found, a fresh THREAD_SIZE area is allocated on @node with
 * __vmalloc_node() and charged.
 *
 * On success tsk->stack and tsk->stack_vm_area are set and 0 is returned.
 * Returns -ENOMEM if the allocation or the memcg charge fails; a stack that
 * could not be charged is freed.
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        struct vm_struct *vm_area;
        void *stack;
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                /* Atomically take the cached stack (if any) out of the slot. */
                vm_area = this_cpu_xchg(cached_stacks[i], NULL);
                if (!vm_area)
                        continue;

                /*
                 * A charge failure fails the whole allocation; the remaining
                 * slots and the fresh-allocation path are not tried.
                 */
                if (memcg_charge_kernel_stack(vm_area)) {
                        vfree(vm_area->addr);
                        return -ENOMEM;
                }

                /* Reset stack metadata. */
                kasan_unpoison_range(vm_area->addr, THREAD_SIZE);

                stack = kasan_reset_tag(vm_area->addr);

                /* Clear stale pointers from reused stack. */
                memset(stack, 0, THREAD_SIZE);

                tsk->stack_vm_area = vm_area;
                tsk->stack = stack;
                return 0;
        }

        /* No cached stack available: allocate a new, zeroed one. */
        stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
                                     GFP_VMAP_STACK,
                                     node, __builtin_return_address(0));
        if (!stack)
                return -ENOMEM;

        vm_area = find_vm_area(stack);
        if (memcg_charge_kernel_stack(vm_area)) {
                vfree(stack);
                return -ENOMEM;
        }
        /*
         * We can't call find_vm_area() in interrupt context, and
         * free_thread_stack() can be called in interrupt context,
         * so cache the vm_struct.
         */
        tsk->stack_vm_area = vm_area;
        stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return 0;
}

static void free_thread_stack(struct task_struct *tsk)
{
        if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
                thread_stack_delayed_free(tsk);

        tsk->stack = NULL;
        tsk->stack_vm_area = NULL;
}

#else /* !CONFIG_VMAP_STACK */

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
#if THREAD_SIZE >= PAGE_SIZE

/*
 * RCU callback for a page-allocated stack.  @rh sits at the start of the
 * stack memory, so its page is the first page of the THREAD_SIZE_ORDER
 * allocation that is freed here.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        struct page *first = virt_to_page(rh);

        __free_pages(first, THREAD_SIZE_ORDER);
}

/*
 * Queue @tsk's page-allocated stack for freeing via call_rcu(), using the
 * start of the stack memory itself as the rcu_head.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct rcu_head *head = tsk->stack;

        call_rcu(head, thread_stack_free_rcu);
}

/*
 * Allocate a THREAD_SIZE_ORDER page block on @node as @tsk's stack.
 * Returns 0 with tsk->stack set on success, -ENOMEM if no pages could be
 * allocated (tsk->stack is then left untouched).
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        struct page *page;

        page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
        if (!likely(page))
                return -ENOMEM;

        tsk->stack = kasan_reset_tag(page_address(page));

        return 0;
}

/*
 * Release @tsk's page-allocated stack: queue it for RCU-deferred freeing,
 * then clear tsk->stack.  The order matters because
 * thread_stack_delayed_free() reads tsk->stack.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        thread_stack_delayed_free(tsk);
        tsk->stack = NULL;
}

#else /* !(THREAD_SIZE >= PAGE_SIZE) */

static struct kmem_cache *thread_stack_cache;

/*
 * RCU callback for a slab-allocated stack.  @rh is the start of the stack
 * object, which is returned to thread_stack_cache.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        kmem_cache_free(thread_stack_cache, rh);
}

/*
 * Queue @tsk's slab-allocated stack for freeing via call_rcu(); the start
 * of the stack memory doubles as the rcu_head.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        call_rcu(tsk->stack, thread_stack_free_rcu);
}

/*
 * Allocate @tsk's stack from thread_stack_cache on @node.
 * tsk->stack is always assigned (NULL on failure).  Returns 0 on success,
 * -ENOMEM if the slab allocation failed.
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        unsigned long *mem;

        mem = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
        mem = kasan_reset_tag(mem);
        tsk->stack = mem;

        if (!mem)
                return -ENOMEM;

        return 0;
}

/*
 * Release @tsk's slab-allocated stack: queue it for RCU-deferred freeing,
 * then clear tsk->stack.  The order matters because
 * thread_stack_delayed_free() reads tsk->stack.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        thread_stack_delayed_free(tsk);
        tsk->stack = NULL;
}

/*
 * Create the "thread_stack" slab cache used when THREAD_SIZE is smaller
 * than PAGE_SIZE.  Objects are THREAD_SIZE bytes, THREAD_SIZE aligned, and
 * the whole object is whitelisted for usercopy.  Failure to create the
 * cache is fatal (BUG).
 */
void thread_stack_cache_init(void)
{
        thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
                                                        THREAD_SIZE,
                                                        THREAD_SIZE,
                                                        0,
                                                        0,
                                                        THREAD_SIZE,
                                                        NULL);

        BUG_ON(!thread_stack_cache);
}

#endif /* THREAD_SIZE >= PAGE_SIZE */
#endif /* CONFIG_VMAP_STACK */

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

/*
 * Adjust the NR_KERNEL_STACK_KB statistic for @tsk's stack.
 * @account is the multiplier applied to the stack size in KiB; callers in
 * this file pass 1 when a stack is set up and -1 when it goes away.
 *
 * With CONFIG_VMAP_STACK the stack pages are accounted one by one,
 * otherwise the whole stack is accounted in a single update.
 */
static void account_kernel_stack(struct task_struct *tsk, int account)
{
        struct vm_struct *vm_area;
        int i;

        if (!IS_ENABLED(CONFIG_VMAP_STACK)) {
                void *stack = task_stack_page(tsk);

                /* All stack pages are in the same node. */
                mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
                                      account * (THREAD_SIZE / 1024));
                return;
        }

        vm_area = task_stack_vm_area(tsk);
        for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
                mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
                                      account * (PAGE_SIZE / 1024));
        }
}

/*
 * Undo the stack accounting for @tsk: subtract the stack from
 * NR_KERNEL_STACK_KB and, with CONFIG_VMAP_STACK, uncharge each stack page
 * from memcg.
 */
void exit_task_stack_account(struct task_struct *tsk)
{
        struct vm_struct *vm_area;
        int i;

        account_kernel_stack(tsk, -1);

        if (!IS_ENABLED(CONFIG_VMAP_STACK))
                return;

        vm_area = task_stack_vm_area(tsk);
        for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
                memcg_kmem_uncharge_page(vm_area->pages[i], 0);
        }
}

/*
 * Free @tsk's stack, but only if the task is in TASK_DEAD state.  For any
 * other state a warning is emitted and the stack is kept.
 */
static void release_task_stack(struct task_struct *tsk)
{
        /* Better to leak the stack than to free prematurely */
        if (!WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
                free_thread_stack(tsk);
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Drop one reference on @tsk's stack; the stack is released when the last
 * reference goes away.
 */
void put_task_stack(struct task_struct *tsk)
{
        if (!refcount_dec_and_test(&tsk->stack_refcount))
                return;

        release_task_stack(tsk);
}
#endif

/*
 * Release the resources still attached to @tsk and finally the task_struct
 * itself.  Called from __put_task_struct() in this file.
 *
 * Warns (once) if a seccomp filter is still attached.  The stack is released
 * here only when thread_info lives on the stack
 * (!CONFIG_THREAD_INFO_IN_TASK); otherwise it is expected to be gone already
 * and a non-zero stack_refcount triggers a warning.
 */
void free_task(struct task_struct *tsk)
{
#ifdef CONFIG_SECCOMP
        WARN_ON_ONCE(tsk->seccomp.filter);
#endif
        release_user_cpus_ptr(tsk);
        scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
        /*
         * The task is finally done with both the stack and thread_info,
         * so free both.
         */
        release_task_stack(tsk);
#else
        /*
         * If the task had a separate stack allocation, it should be gone
         * by now.
         */
        WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        arch_release_task_struct(tsk);
        /* Only kernel threads carry a kthread struct to free. */
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
        bpf_task_storage_free(tsk);
        /* Must be last: @tsk is invalid after this. */
        free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

/*
 * Make @mm share @oldmm's exe_file.  The file obtained from
 * get_mm_exe_file() is stored in mm->exe_file (it may be NULL).  If a file
 * is present, write access to it is denied on behalf of the new mm; a
 * failure to do so is only reported once via pr_warn_once().
 */
void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct file *exe = get_mm_exe_file(oldmm);

        RCU_INIT_POINTER(mm->exe_file, exe);
        if (!exe)
                return;

        /*
         * We depend on the oldmm having properly denied write access to the
         * exe_file already.
         */
        if (exe_file_deny_write_access(exe))
                pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
}

#ifdef CONFIG_MMU
/*
 * Allocate the page global directory for @mm and store it in mm->pgd.
 * Returns 0 on success, -ENOMEM if pgd_alloc() returned NULL.
 */
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
        mm->pgd = pgd_alloc(mm);

        return unlikely(!mm->pgd) ? -ENOMEM : 0;
}

/* Free the page global directory that mm_alloc_pgd() stored in mm->pgd. */
static inline void mm_free_pgd(struct mm_struct *mm)
{
        pgd_free(mm, mm->pgd);
}
#else
#define mm_alloc_pgd(mm)        (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

#ifdef CONFIG_MM_ID
static DEFINE_IDA(mm_ida);

/*
 * Allocate an id in [MM_ID_MIN, MM_ID_MAX] from mm_ida and store it in
 * mm->mm_id.  Returns 0 on success or the negative error from
 * ida_alloc_range(), in which case mm->mm_id is not modified.
 */
static inline int mm_alloc_id(struct mm_struct *mm)
{
        int id = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);

        if (id >= 0) {
                mm->mm_id = id;
                return 0;
        }

        return id;
}

/*
 * Release @mm's id back to mm_ida and reset mm->mm_id to MM_ID_DUMMY.
 * Nothing is freed if the id already was MM_ID_DUMMY; an id outside
 * [MM_ID_MIN, MM_ID_MAX] triggers a one-time warning and is not freed.
 */
static inline void mm_free_id(struct mm_struct *mm)
{
        const mm_id_t id = mm->mm_id;

        mm->mm_id = MM_ID_DUMMY;

        if (id != MM_ID_DUMMY &&
            !WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
                ida_free(&mm_ida, id);
}
#else /* !CONFIG_MM_ID */
/* Without CONFIG_MM_ID no id is tracked: allocation trivially succeeds. */
static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
/* Without CONFIG_MM_ID there is nothing to free. */
static inline void mm_free_id(struct mm_struct *mm) {}
#endif /* CONFIG_MM_ID */

/*
 * Sanity-check an mm that is about to be freed.
 *
 * Every rss counter and the page-table byte count are expected to be zero;
 * each non-zero value is reported with pr_alert().  With transparent
 * hugepages and no split PMD ptlocks, a leftover pmd_huge_pte is a VM bug.
 */
static void check_mm(struct mm_struct *mm)
{
        int type;

        BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
                         "Please make sure 'struct resident_page_types[]' is updated as well");

        for (type = 0; type < NR_MM_COUNTERS; type++) {
                long residue = percpu_counter_sum(&mm->rss_stat[type]);

                if (!unlikely(residue))
                        continue;

                pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
                         mm, resident_page_types[type], residue,
                         current->comm,
                         task_pid_nr(current));
        }

        if (mm_pgtables_bytes(mm)) {
                pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
                         mm_pgtables_bytes(mm));
        }

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
        VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

/* Allocate an mm_struct from mm_cachep with GFP_KERNEL. */
#define allocate_mm()        (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
/* Return an mm_struct to mm_cachep. */
#define free_mm(mm)        (kmem_cache_free(mm_cachep, (mm)))

/*
 * Per-CPU callback: warn once if the task running on this CPU still uses
 * the mm passed in @arg as its active_mm.
 */
static void do_check_lazy_tlb(void *arg)
{
        WARN_ON_ONCE(current->active_mm == arg);
}

/*
 * Per-CPU callback: if the task running on this CPU uses the mm passed in
 * @arg as its active_mm, switch it over to init_mm.  A task in that
 * situation is expected to have no mm of its own (warned about otherwise).
 */
static void do_shoot_lazy_tlb(void *arg)
{
        struct mm_struct *mm = arg;

        if (current->active_mm != mm)
                return;

        WARN_ON_ONCE(current->mm);
        current->active_mm = &init_mm;
        switch_mm(mm, &init_mm, current);
}

/*
 * Make sure no CPU still uses @mm as a lazy tlb mm before it is freed.
 * Does nothing unless CONFIG_MMU_LAZY_TLB_SHOOTDOWN is enabled; otherwise
 * the CPUs in mm_cpumask(mm) are asked to switch away from @mm, and with
 * CONFIG_DEBUG_VM_SHOOT_LAZIES every CPU is checked afterwards.
 */
static void cleanup_lazy_tlbs(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
                /*
                 * In this case, lazy tlb mms are refcounted and would not reach
                 * __mmdrop until all CPUs have switched away and mmdrop()ed.
                 */
                return;
        }

        /*
         * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
         * requires lazy mm users to switch to another mm when the refcount
         * drops to zero, before the mm is freed. This requires IPIs here to
         * switch kernel threads to init_mm.
         *
         * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
         * switch with the final userspace teardown TLB flush which leaves the
         * mm lazy on this CPU but no others, reducing the need for additional
         * IPIs here. There are cases where a final IPI is still required here,
         * such as the final mmdrop being performed on a different CPU than the
         * one exiting, or kernel threads using the mm when userspace exits.
         *
         * IPI overheads have not been found to be expensive, but they could be
         * reduced in a number of possible ways, for example (roughly
         * increasing order of complexity):
         * - The last lazy reference created by exit_mm() could instead switch
         *   to init_mm, however it's probable this will run on the same CPU
         *   immediately afterwards, so this may not reduce IPIs much.
         * - A batch of mms requiring IPIs could be gathered and freed at once.
         * - CPUs store active_mm where it can be remotely checked without a
         *   lock, to filter out false-positives in the cpumask.
         * - After mm_users or mm_count reaches zero, switching away from the
         *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
         *   with some batching or delaying of the final IPIs.
         * - A delayed freeing and RCU-like quiescing sequence based on mm
         *   switching to avoid IPIs completely.
         */
        /* Last argument 1: wait until every targeted CPU ran the callback. */
        on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
        if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
                on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 *
 * init_mm must never get here (BUG), and @mm is expected to be neither the
 * caller's mm nor, once lazy tlb users have been cleaned up, its active_mm
 * (both only warned about).
 */
void __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);

        /* Ensure no CPUs are using this as their lazy tlb mm */
        cleanup_lazy_tlbs(mm);

        WARN_ON_ONCE(mm == current->active_mm);
        mm_free_pgd(mm);
        mm_free_id(mm);
        destroy_context(mm);
        mmu_notifier_subscriptions_destroy(mm);
        /* Report leftover rss counters / page-table bytes before freeing. */
        check_mm(mm);
        put_user_ns(mm->user_ns);
        mm_pasid_drop(mm);
        mm_destroy_cid(mm);
        percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);

        /* Must be last: @mm is invalid after this. */
        free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

/*
 * Work handler queued by mmdrop_async(): recover the mm from its embedded
 * async_put_work item and tear it down with __mmdrop().
 */
static void mmdrop_async_fn(struct work_struct *work)
{
        struct mm_struct *mm = container_of(work, struct mm_struct,
                                            async_put_work);

        __mmdrop(mm);
}

/*
 * Drop one mm_count reference on @mm.  If it was the last one, the actual
 * __mmdrop() is deferred to a work item instead of running in the caller's
 * context.
 */
static void mmdrop_async(struct mm_struct *mm)
{
        if (!unlikely(atomic_dec_and_test(&mm->mm_count)))
                return;

        INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
        schedule_work(&mm->async_put_work);
}

/*
 * Tear down @sig: release its taskstats and autogroup state, drop the
 * reference on the recorded oom_mm (if any) and free the structure back to
 * signal_cachep.
 */
static inline void free_signal_struct(struct signal_struct *sig)
{
        struct mm_struct *oom_mm;

        taskstats_tgid_free(sig);
        sched_autogroup_exit(sig);

        /*
         * __mmdrop is not safe to call from softirq context on x86 due to
         * pgd_dtor so postpone it to the async context
         */
        oom_mm = sig->oom_mm;
        if (oom_mm)
                mmdrop_async(oom_mm);

        kmem_cache_free(signal_cachep, sig);
}

/* Drop one sigcnt reference on @sig and free it when that was the last. */
static inline void put_signal_struct(struct signal_struct *sig)
{
        if (!refcount_dec_and_test(&sig->sigcnt))
                return;

        free_signal_struct(sig);
}

/*
 * Final teardown of @tsk.
 *
 * Warns if the task has no exit_state, still has a non-zero usage count, or
 * is the currently running task.  Then releases the per-subsystem state
 * attached to the task, drops its reference on the signal struct and hands
 * the task to free_task().
 */
void __put_task_struct(struct task_struct *tsk)
{
        WARN_ON(!tsk->exit_state);
        WARN_ON(refcount_read(&tsk->usage));
        WARN_ON(tsk == current);

        unwind_task_free(tsk);
        sched_ext_free(tsk);
        io_uring_free(tsk);
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
        sched_core_free(tsk);
        /* Must be last: frees the task_struct itself. */
        free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

/*
 * RCU callback wrapper around __put_task_struct(): @rhp is the rcu_head
 * embedded in the task_struct (its ->rcu member).
 */
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
{
        __put_task_struct(container_of(rhp, struct task_struct, rcu));
}
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);

/*
 * Default (weak) no-op; called by fork_init() right after the task_struct
 * cache has been created.
 */
void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads - compute the global thread limit
 * @max_threads_suggested: upper bound requested by the caller
 *
 * Derives a limit from the estimated number of free pages, caps it at
 * @max_threads_suggested and stores the result, clamped to
 * [MIN_THREADS, MAX_THREADS], in max_threads.
 */
static void __init set_max_threads(unsigned int max_threads_suggested)
{
        unsigned long nr_pages = memblock_estimated_nr_free_pages();
        u64 limit;

        /*
         * The number of threads shall be limited such that the thread
         * structures may only consume a small part of the available memory.
         * If nr_pages * PAGE_SIZE could overflow 64 bits, skip the
         * computation and start from MAX_THREADS.
         */
        if (fls64(nr_pages) + fls64(PAGE_SIZE) <= 64)
                limit = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
                                  (u64) THREAD_SIZE * 8UL);
        else
                limit = MAX_THREADS;

        if (limit > max_threads_suggested)
                limit = max_threads_suggested;

        max_threads = clamp_t(u64, limit, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif

/*
 * Compute the usercopy whitelist window of task_struct.
 * @offset: out, start of the window relative to the task_struct
 * @size:   out, length of the window
 *
 * The architecture reports the window relative to thread_struct; it is
 * rebased onto task_struct here.
 */
static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        /* Fetch thread_struct whitelist for the architecture. */
        arch_thread_struct_whitelist(offset, size);

        /*
         * Handle zero-sized whitelist or empty thread_struct, otherwise
         * adjust offset to position of thread_struct in task_struct.
         */
        *offset = unlikely(*size == 0) ?
                        0 : *offset + offsetof(struct task_struct, thread);
}

/*
 * Boot-time initialisation of the fork machinery: creates the task_struct
 * slab cache, computes max_threads, seeds the init task's and the initial
 * user namespace's limits from it, and initialises the stack-related
 * helpers used by fork.
 */
void __init fork_init(void)
{
        int i;
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN        0
#endif
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
        unsigned long useroffset, usersize;

        /* create a slab on which task_structs can be allocated */
        task_struct_whitelist(&useroffset, &usersize);
        task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align,
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);

        /* do the arch specific task caches init */
        arch_task_cache_init();

        set_max_threads(MAX_THREADS);

        /* The init task's NPROC and SIGPENDING limits are half of max_threads. */
        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];

        for (i = 0; i < UCOUNT_COUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;

        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);

#ifdef CONFIG_VMAP_STACK
        /* Register free_vm_stack_cache() as the CPU hotplug teardown callback. */
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
#endif

        scs_init();

        lockdep_init_task(&init_task);
        uprobes_init();
}

/*
 * Default (weak) implementation: copy the whole task_struct from @src to
 * @dst by structure assignment.  Always returns 0; dup_task_struct() treats
 * a non-zero return as failure.
 */
int __weak arch_dup_task_struct(struct task_struct *dst,
                                               struct task_struct *src)
{
        *dst = *src;
        return 0;
}

void set_task_stack_end_magic(struct task_struct *tsk)
{
        unsigned long *stackend;

        stackend = end_of_stack(tsk);
        *stackend = STACK_END_MAGIC;        /* for overflow detection */
}

/*
 * Create a new task_struct as a copy of @orig.
 *
 * @orig: task to duplicate
 * @node: NUMA node for the allocations, or NUMA_NO_NODE to let
 *        tsk_fork_get_node() pick one based on @orig
 *
 * Allocates the task_struct and a kernel stack, copies @orig into the new
 * structure and then resets the fields that must not be inherited.
 * Returns the new task, or NULL if any allocation or preparation step
 * fails (everything allocated so far is released again).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
        struct task_struct *tsk;
        int err;

        if (node == NUMA_NO_NODE)
                node = tsk_fork_get_node(orig);
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;

        err = arch_dup_task_struct(tsk, orig);
        if (err)
                goto free_tsk;

        err = alloc_thread_stack_node(tsk, node);
        if (err)
                goto free_tsk;

#ifdef CONFIG_THREAD_INFO_IN_TASK
        refcount_set(&tsk->stack_refcount, 1);
#endif
        /* From here on, error paths must undo the stack accounting. */
        account_kernel_stack(tsk, 1);

        err = scs_prepare(tsk, node);
        if (err)
                goto free_stack;

#ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
         * the sighand lock in case orig has changed between now and
         * then. Until then, filter must be NULL to avoid messing up
         * the usage counts on the error path calling free_task.
         */
        tsk->seccomp.filter = NULL;
#endif

        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        set_task_stack_end_magic(tsk);
        clear_syscall_work_syscall_user_dispatch(tsk);

#ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
#endif
        /*
         * The struct copy left cpus_ptr pointing into @orig; if that was
         * @orig's own cpus_mask, point at the new task's own mask instead.
         */
        if (orig->cpus_ptr == &orig->cpus_mask)
                tsk->cpus_ptr = &tsk->cpus_mask;
        /* NOTE(review): any result of dup_user_cpus_ptr() is not checked here. */
        dup_user_cpus_ptr(tsk, orig, node);

        /*
         * One for the user space visible state that goes away when reaped.
         * One for the scheduler.
         */
        refcount_set(&tsk->rcu_users, 2);
        /* One for the rcu users */
        refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
#endif
        /* Reset per-task state that was copied from @orig but is not shared. */
        tsk->splice_pipe = NULL;
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
        tsk->worker_private = NULL;

        kcov_task_init(tsk);
        kmsan_task_create(tsk);
        kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
        tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
        tsk->throttle_disk = NULL;
        tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_ARCH_HAS_CPU_PASID
        tsk->pasid_activated = 0;
#endif

#ifdef CONFIG_MEMCG
        tsk->active_memcg = NULL;
#endif

#ifdef CONFIG_X86_BUS_LOCK_DETECT
        tsk->reported_split_lock = 0;
#endif

#ifdef CONFIG_SCHED_MM_CID
        tsk->mm_cid = -1;
        tsk->last_mm_cid = -1;
        tsk->mm_cid_active = 0;
        tsk->migrate_from_cpu = -1;
#endif
        return tsk;

free_stack:
        /* Undo account_kernel_stack() (and the memcg charge) before freeing. */
        exit_task_stack_account(tsk);
        free_thread_stack(tsk);
free_tsk:
        free_task_struct(tsk);
        return NULL;
}

/* Spinlock defined here for users of the mm list; not taken in this part of the file. */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

/*
 * Dump-filter flag bits given to an mm created without a current->mm to
 * inherit from; can be overridden with the "coredump_filter=" boot
 * parameter.
 */
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/*
 * Handler for the "coredump_filter=" boot parameter.  Parses @s as a
 * number (base auto-detected), moves it into the dump-filter bit range and
 * stores it as the default dump filter.  Always returns 1.
 */
static int __init coredump_filter_setup(char *s)
{
        unsigned long filter = simple_strtoul(s, NULL, 0);

        default_dump_filter = (filter << MMF_DUMP_FILTER_SHIFT) &
                              MMF_DUMP_FILTER_MASK;

        return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

/*
 * Initialise the AIO state of a new mm: the ioctx lock and an empty ioctx
 * table.  Compiles to nothing without CONFIG_AIO.
 */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
        spin_lock_init(&mm->ioctx_lock);
        mm->ioctx_table = NULL;
#endif
}

/*
 * Clear mm->owner, but only if it currently points at @p.
 * Compiles to nothing without CONFIG_MEMCG.
 */
static __always_inline void mm_clear_owner(struct mm_struct *mm,
                                           struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        if (mm->owner != p)
                return;

        WRITE_ONCE(mm->owner, NULL);
#endif
}

/*
 * Record @p as the owner of @mm (@p may be NULL to clear the owner).
 * Does nothing when CONFIG_MEMCG is not set.
 */
#ifdef CONFIG_MEMCG
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
        mm->owner = p;
}
#else
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
}
#endif

/*
 * Reset the uprobes state of a new mm: forget any xol_area pointer and let
 * the architecture initialize its part. Does nothing without CONFIG_UPROBES.
 */
#ifdef CONFIG_UPROBES
static void mm_init_uprobes_state(struct mm_struct *mm)
{
        mm->uprobes_state.xol_area = NULL;
        arch_uprobe_init_state(mm);
}
#else
static void mm_init_uprobes_state(struct mm_struct *mm)
{
}
#endif

/*
 * Initialize the locking state embedded in a new mm: the mmap_lock rwsem,
 * whatever mm_lock_seqcount_init() sets up (presumably the mm lock sequence
 * counter - see its definition) and, with CONFIG_PER_VMA_LOCK, the rcuwait
 * in mm->vma_writer_wait.
 */
static void mmap_init_lock(struct mm_struct *mm)
{
        init_rwsem(&mm->mmap_lock);
        mm_lock_seqcount_init(mm);
#ifdef CONFIG_PER_VMA_LOCK
        rcuwait_init(&mm->vma_writer_wait);
#endif
}

/*
 * Initialize @mm for use by task @p within user namespace @user_ns.
 *
 * The caller supplies the storage: mm_alloc() passes zeroed memory, dup_mm()
 * passes a byte copy of the parent's mm. Every field that must start fresh
 * is therefore reset explicitly here. The flags word and def_flags are
 * derived from current->mm when the caller has an mm, otherwise from
 * default_dump_filter.
 *
 * Returns @mm on success. On failure the steps completed so far are undone
 * in reverse order, @mm itself is released with free_mm() and NULL is
 * returned, so the caller must not touch @mm again.
 */
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
{
        /* Steps that cannot fail: plain field and lock initialization. */
        mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
        mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
        /* The new mm starts with one user and one structure reference. */
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        seqcount_init(&mm->write_protect_seq);
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
        atomic64_set(&mm->pinned_vm, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
        spin_lock_init(&mm->arg_lock);
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
        mm_pasid_init(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
        mm->pmd_huge_pte = NULL;
#endif
        mm_init_uprobes_state(mm);
        hugetlb_count_init(mm);

        /*
         * Start from cleared flags, then either inherit from the caller's mm
         * (filtered by mmf_init_legacy_flags() / VM_INIT_DEF_MASK) or fall
         * back to the boot-time default dump filter.
         */
        mm_flags_clear_all(mm);
        if (current->mm) {
                unsigned long flags = __mm_flags_get_word(current->mm);

                __mm_flags_set_word(mm, mmf_init_legacy_flags(flags));
                mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
        } else {
                __mm_flags_set_word(mm, default_dump_filter);
                mm->def_flags = 0;
        }

        /* Steps that can fail; each has a matching unwind label below. */
        if (futex_mm_init(mm))
                goto fail_mm_init;

        if (mm_alloc_pgd(mm))
                goto fail_nopgd;

        if (mm_alloc_id(mm))
                goto fail_noid;

        if (init_new_context(p, mm))
                goto fail_nocontext;

        if (mm_alloc_cid(mm, p))
                goto fail_cid;

        if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
                                     NR_MM_COUNTERS))
                goto fail_pcpu;

        /* Only take the user namespace reference once nothing can fail. */
        mm->user_ns = get_user_ns(user_ns);
        lru_gen_init_mm(mm);
        return mm;

        /* Error unwinding: strictly the reverse of the setup order above. */
fail_pcpu:
        mm_destroy_cid(mm);
fail_cid:
        destroy_context(mm);
fail_nocontext:
        mm_free_id(mm);
fail_noid:
        mm_free_pgd(mm);
fail_nopgd:
        futex_hash_free(mm);
fail_mm_init:
        free_mm(mm);
        return NULL;
}

/*
 * Allocate a zero-filled mm_struct and initialize it for the current task
 * in the current user namespace.
 *
 * Returns the new mm, or NULL if either the allocation or mm_init() fails.
 */
struct mm_struct *mm_alloc(void)
{
        struct mm_struct *mm = allocate_mm();

        if (mm) {
                memset(mm, 0, sizeof(*mm));
                mm = mm_init(mm, current, current_user_ns());
        }
        return mm;
}
EXPORT_SYMBOL_IF_KUNIT(mm_alloc);

/*
 * Tear down an mm after its last user has gone away.
 *
 * Must only be called once mm_users has reached zero (checked with
 * VM_BUG_ON). Releases the per-mm subsystem state and the mappings, drops
 * the exe_file, unlinks the mm from mmlist, releases the binfmt module
 * reference and finally calls mmdrop().
 */
static inline void __mmput(struct mm_struct *mm)
{
        VM_BUG_ON(atomic_read(&mm->mm_users));

        uprobe_clear_state(mm);
        exit_aio(mm);
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
        mm_put_huge_zero_folio(mm);
        /* Passing NULL just releases the current exe_file, if any. */
        set_mm_exe_file(mm, NULL);
        /* The unlocked list_empty() check skips the lock for unlinked mms. */
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
                list_del(&mm->mmlist);
                spin_unlock(&mmlist_lock);
        }
        /* Pairs with the try_module_get() done in dup_mm(). */
        if (mm->binfmt)
                module_put(mm->binfmt->module);
        lru_gen_del_mm(mm);
        futex_hash_free(mm);
        mmdrop(mm);
}

/*
 * Drop one mm_users reference on @mm. Whoever drops the last one tears the
 * mm down synchronously via __mmput(), which is why this may sleep.
 */
void mmput(struct mm_struct *mm)
{
        might_sleep();

        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        __mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
/*
 * Work callback queued by mmput_async(): recover the mm that embeds @work
 * as its async_put_work member and tear it down.
 */
static void mmput_async_fn(struct work_struct *work)
{
        __mmput(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Like mmput(), but when the last mm_users reference is dropped the
 * teardown is handed to a work item (mmput_async_fn) instead of being run
 * in the caller's context.
 */
void mmput_async(struct mm_struct *mm)
{
        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        INIT_WORK(&mm->async_put_work, mmput_async_fn);
        schedule_work(&mm->async_put_work);
}
EXPORT_SYMBOL_GPL(mmput_async);
#endif

/**
 * set_mm_exe_file - change a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use, or %NULL to only drop the old one.
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive is left, in execve it happens before
 * the new mm is made visible to anyone.
 *
 * A reference is taken on @new_exe_file and write access to it is denied;
 * the previous file (if any) gets write access re-allowed and its reference
 * dropped.
 *
 * Can only fail if new_exe_file != NULL.
 *
 * Return: 0 on success, -EACCES if write access to @new_exe_file could not
 * be denied (in which case mm->exe_file is left untouched).
 */
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        struct file *old_exe_file;

        /*
         * It is safe to dereference the exe_file without RCU as
         * this function is only called if nobody else can access
         * this mm -- see comment above for justification.
         */
        old_exe_file = rcu_dereference_raw(mm->exe_file);

        if (new_exe_file) {
                /*
                 * We expect the caller (i.e., sys_execve) to have already
                 * denied write access, so this is unlikely to fail.
                 */
                if (unlikely(exe_file_deny_write_access(new_exe_file)))
                        return -EACCES;
                get_file(new_exe_file);
        }
        /* Publish the new file first, then release the old one. */
        rcu_assign_pointer(mm->exe_file, new_exe_file);
        if (old_exe_file) {
                exe_file_allow_write_access(old_exe_file);
                fput(old_exe_file);
        }
        return 0;
}

/**
 * replace_mm_exe_file - replace a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use.
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
 *
 * The replacement is refused while any VMA of @mm still maps a file whose
 * path equals that of the current executable file. That check runs under
 * mmap_read_lock(); the pointer swap itself runs under mmap_write_lock().
 *
 * Return: 0 on success, -EBUSY if the old executable is still mapped,
 * -EACCES if write access to @new_exe_file could not be denied.
 */
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        struct vm_area_struct *vma;
        struct file *old_exe_file;
        int ret = 0;

        /* Forbid mm->exe_file change if old file still mapped. */
        old_exe_file = get_mm_exe_file(mm);
        if (old_exe_file) {
                VMA_ITERATOR(vmi, mm, 0);
                mmap_read_lock(mm);
                for_each_vma(vmi, vma) {
                        if (!vma->vm_file)
                                continue;
                        if (path_equal(&vma->vm_file->f_path,
                                       &old_exe_file->f_path)) {
                                ret = -EBUSY;
                                break;
                        }
                }
                mmap_read_unlock(mm);
                /* Drop the temporary reference taken by get_mm_exe_file(). */
                fput(old_exe_file);
                if (ret)
                        return ret;
        }

        /* Any failure to deny write access is reported as -EACCES. */
        ret = exe_file_deny_write_access(new_exe_file);
        if (ret)
                return -EACCES;
        get_file(new_exe_file);

        /* set the new file */
        mmap_write_lock(mm);
        old_exe_file = rcu_dereference_raw(mm->exe_file);
        rcu_assign_pointer(mm->exe_file, new_exe_file);
        mmap_write_unlock(mm);

        /* Release what the mm held on the file that was just replaced. */
        if (old_exe_file) {
                exe_file_allow_write_access(old_exe_file);
                fput(old_exe_file);
        }
        return 0;
}

/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 * @mm: The mm of interest.
 *
 * The lookup is done inside an RCU read-side critical section.
 *
 * Return: the executable file with a reference held, or %NULL if mm has no
 * associated executable file. User must release file via fput().
 */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
        struct file *exe_file;

        rcu_read_lock();
        /*
         * NOTE(review): get_file_rcu() is handed the address of the pointer,
         * presumably so it can cope with mm->exe_file being replaced
         * concurrently - confirm against its definition.
         */
        exe_file = get_file_rcu(&mm->exe_file);
        rcu_read_unlock();
        return exe_file;
}

/**
 * get_task_exe_file - acquire a reference to the task's executable file
 * @task: The task.
 *
 * Tasks flagged PF_KTHREAD never yield a file, even if they currently have
 * an mm attached (see the comment above get_task_mm). For other tasks,
 * task->mm is sampled under task_lock() and queried for its exe_file.
 *
 * Return: the file with a reference held, or %NULL if @task is a kernel
 * thread, has no mm, or its mm has no executable file. User must release
 * file via fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
        struct mm_struct *task_mm;
        struct file *file;

        if (task->flags & PF_KTHREAD)
                return NULL;

        task_lock(task);
        task_mm = task->mm;
        file = task_mm ? get_mm_exe_file(task_mm) : NULL;
        task_unlock(task);

        return file;
}

/**
 * get_task_mm - acquire a reference to the task's mm
 * @task: The task.
 *
 * A task flagged PF_KTHREAD is treated as having no mm of its own, even if
 * it has transiently adopted a user mm. For any other task, task->mm is
 * read under task_lock() and, if set, its use count is bumped with mmget().
 *
 * Return: the mm with an extra user reference, or %NULL if the task is a
 * kernel thread or has no mm. User must release the mm via mmput() after
 * use. Typically used by /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
        struct mm_struct *found = NULL;

        if (!(task->flags & PF_KTHREAD)) {
                task_lock(task);
                found = task->mm;
                if (found)
                        mmget(found);
                task_unlock(task);
        }

        return found;
}
EXPORT_SYMBOL_GPL(get_task_mm);

/*
 * Decide whether the current task may access @mm, which belongs to @task.
 *
 * The checks run in this order and the first hit grants access:
 *   1. @mm is the caller's own mm;
 *   2. ptrace_may_access() allows @mode on @task;
 *   3. @mode includes PTRACE_MODE_READ and the caller is perfmon_capable().
 */
static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode)
{
        return mm == current->mm ||
               ptrace_may_access(task, mode) ||
               ((mode & PTRACE_MODE_READ) && perfmon_capable());
}

/*
 * Get a reference to @task's mm after checking that the caller is allowed
 * to access it with @mode. The lookup and the permission check both run
 * with the task's exec_update_lock held for reading.
 *
 * Returns the mm (release with mmput()), or an ERR_PTR():
 *   - the error from down_read_killable() if taking the lock failed,
 *   - -ESRCH if the task has no mm,
 *   - -EACCES if may_access_mm() refuses access.
 */
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
        struct mm_struct *mm;
        int err = down_read_killable(&task->signal->exec_update_lock);

        if (err)
                return ERR_PTR(err);

        mm = get_task_mm(task);
        if (!mm) {
                mm = ERR_PTR(-ESRCH);
                goto unlock;
        }

        if (!may_access_mm(mm, task, mode)) {
                mmput(mm);
                mm = ERR_PTR(-EACCES);
        }

unlock:
        up_read(&task->signal->exec_update_lock);
        return mm;
}

/*
 * Signal the completion stored in tsk->vfork_done, if there is one.
 *
 * The pointer is read and cleared under task_lock(), the same lock that
 * wait_for_vfork_done() holds when it clears the pointer after a killed
 * wait, so the completion is never touched once the waiter has given up.
 */
static void complete_vfork_done(struct task_struct *tsk)
{
        struct completion *vfork;

        task_lock(tsk);
        vfork = tsk->vfork_done;
        if (likely(vfork)) {
                /* Clear first so a later call finds nothing to complete. */
                tsk->vfork_done = NULL;
                complete(vfork);
        }
        task_unlock(tsk);
}

/*
 * Wait (killable, freezable) for @child to signal @vfork.
 *
 * If the wait is interrupted, child->vfork_done is cleared under
 * task_lock() so the child will not complete @vfork later. In every case
 * one reference on @child is dropped before returning.
 *
 * Returns the result of wait_for_completion_state(): zero when the
 * completion was signalled, non-zero when the wait was cut short.
 */
static int wait_for_vfork_done(struct task_struct *child,
                                struct completion *vfork)
{
        int err;

        cgroup_enter_frozen();
        err = wait_for_completion_state(vfork, TASK_KILLABLE | TASK_FREEZABLE);
        cgroup_leave_frozen(false);

        if (!err)
                goto out;

        task_lock(child);
        child->vfork_done = NULL;
        task_unlock(child);

out:
        put_task_struct(child);
        return err;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 *
 * In order, mm_release() frees the task's uprobe state, deactivates the mm
 * for the task, handles the clear_child_tid notification and finally
 * completes a pending vfork_done.
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        uprobe_free_utask(tsk);

        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);

        /*
         * Signal userspace if we're not exiting with a core dump
         * because we want to leave the value intact for debugging
         * purposes.
         */
        if (tsk->clear_child_tid) {
                /* Only write and wake if the mm still has other users. */
                if (atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
                         * not set up a proper pointer then tough luck.
                         */
                        put_user(0, tsk->clear_child_tid);
                        do_futex(tsk->clear_child_tid, FUTEX_WAKE,
                                        1, NULL, NULL, 0, 0);
                }
                /* The pointer is consumed either way. */
                tsk->clear_child_tid = NULL;
        }

        /*
         * All done, finally we can wake up parent and return this mm to him.
         * Also kthread_stop() uses this completion for synchronization.
         */
        if (tsk->vfork_done)
                complete_vfork_done(tsk);
}

/*
 * Exit-path wrapper around mm_release(): runs the futex exit handling for
 * @tsk first, then the common mm_release() steps.
 */
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exit_release(tsk);
        mm_release(tsk, mm);
}

/*
 * Exec-path wrapper around mm_release(): runs the futex exec handling for
 * @tsk first, then the common mm_release() steps.
 */
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exec_release(tsk);
        mm_release(tsk, mm);
}

/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure and duplicates the provided @oldmm structure
 * content into it. The new mm starts as a byte copy of @oldmm, is then
 * re-initialized by mm_init(), and finally receives a copy of the mappings
 * through dup_mmap(). A reference on the binfmt module is taken on success.
 *
 * Return: the duplicated mm or NULL on failure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk,
                                struct mm_struct *oldmm)
{
        struct mm_struct *mm;
        int err;

        mm = allocate_mm();
        if (!mm)
                goto fail_nomem;

        memcpy(mm, oldmm, sizeof(*mm));

        /* mm_init() frees mm itself when it fails, so just bail out. */
        if (!mm_init(mm, tsk, mm->user_ns))
                goto fail_nomem;

        uprobe_start_dup_mmap();
        err = dup_mmap(mm, oldmm);
        if (err)
                goto free_pt;
        uprobe_end_dup_mmap();

        mm->hiwater_rss = get_mm_rss(mm);
        mm->hiwater_vm = mm->total_vm;

        /* err is 0 here, so free_pt will not call uprobe_end_dup_mmap() again. */
        if (mm->binfmt && !try_module_get(mm->binfmt->module))
                goto free_pt;

        return mm;

free_pt:
        /* don't put binfmt in mmput, we haven't got module yet */
        mm->binfmt = NULL;
        mm_init_owner(mm, NULL);
        mmput(mm);
        /* Only the dup_mmap() failure path still has the uprobe section open. */
        if (err)
                uprobe_end_dup_mmap();

fail_nomem:
        return NULL;
}

/*
 * Give the new task @tsk its address space.
 *
 * The fault and context-switch counters are reset first. If the caller has
 * no mm, the child gets none either. With CLONE_VM the caller's mm is
 * shared (one more user reference); otherwise it is duplicated by dup_mm().
 *
 * Returns 0 on success or -ENOMEM if duplicating the mm failed.
 */
static int copy_mm(u64 clone_flags, struct task_struct *tsk)
{
        struct mm_struct *mm, *oldmm;

        tsk->maj_flt = 0;
        tsk->min_flt = 0;
        tsk->nivcsw = 0;
        tsk->nvcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
        tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
        tsk->last_switch_time = 0;
#endif

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * A caller without an mm is a kernel thread; its child
         * has no address space of its own to set up here.
         */
        oldmm = current->mm;
        if (!oldmm)
                return 0;

        if (!(clone_flags & CLONE_VM)) {
                mm = dup_mm(tsk, current->mm);
                if (!mm)
                        return -ENOMEM;
        } else {
                mmget(oldmm);
                mm = oldmm;
        }

        tsk->mm = mm;
        tsk->active_mm = mm;
        sched_mm_cid_fork(tsk);
        return 0;
}

/*
 * Set up the fs_struct of the new task @tsk.
 *
 * Without CLONE_FS the caller's fs_struct is copied. With CLONE_FS it is
 * shared: tsk->fs already points at it, so only the user count is bumped,
 * unless an exec is in flight on it.
 *
 * Returns 0 on success, -ENOMEM if the copy failed, -EAGAIN if sharing was
 * refused because fs->in_exec is set.
 */
static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
        struct fs_struct *fs = current->fs;
        int ret = 0;

        if (!(clone_flags & CLONE_FS)) {
                tsk->fs = copy_fs_struct(fs);
                return tsk->fs ? 0 : -ENOMEM;
        }

        /* "users" and "in_exec" locked for check_unsafe_exec() */
        read_seqlock_excl(&fs->seq);
        if (fs->in_exec)
                ret = -EAGAIN;
        else
                fs->users++;
        read_sequnlock_excl(&fs->seq);

        return ret;
}

/*
 * Set up the file descriptor table of the new task @tsk.
 *
 *  - caller has no table: nothing to do;
 *  - @no_files set: the child explicitly gets no table;
 *  - CLONE_FILES: share the caller's table (reference count bumped);
 *  - otherwise: the child gets its own copy made by dup_fd().
 *
 * Returns 0 on success or the error encoded by dup_fd().
 */
static int copy_files(u64 clone_flags, struct task_struct *tsk,
                      int no_files)
{
        struct files_struct *parent_files = current->files;
        struct files_struct *copy;

        /* A background process may not have any files ... */
        if (!parent_files)
                return 0;

        if (no_files) {
                tsk->files = NULL;
        } else if (clone_flags & CLONE_FILES) {
                atomic_inc(&parent_files->count);
        } else {
                copy = dup_fd(parent_files, NULL);
                if (IS_ERR(copy))
                        return PTR_ERR(copy);
                tsk->files = copy;
        }

        return 0;
}

/*
 * Set up the signal handler table of the new task @tsk.
 *
 * With CLONE_SIGHAND the caller's sighand_struct is shared and only its
 * reference count is raised. Otherwise a new one is allocated and filled
 * with a copy of the caller's actions, taken under the caller's siglock.
 * With CLONE_CLEAR_SIGHAND the copied handlers are then flushed.
 *
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
{
        struct sighand_struct *sig;

        if (clone_flags & CLONE_SIGHAND) {
                refcount_inc(&current->sighand->count);
                return 0;
        }
        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        /*
         * Assigned before the NULL check: on allocation failure
         * tsk->sighand ends up NULL rather than a stale pointer.
         */
        RCU_INIT_POINTER(tsk->sighand, sig);
        if (!sig)
                return -ENOMEM;

        refcount_set(&sig->count, 1);
        spin_lock_irq(&current->sighand->siglock);
        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
        spin_unlock_irq(&current->sighand->siglock);

        /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
        if (clone_flags & CLONE_CLEAR_SIGHAND)
                flush_signal_handlers(tsk, 0);

        return 0;
}

/*
 * Drop one reference on @sighand; the final reference runs the signalfd
 * cleanup and returns the structure to its slab cache.
 */
void __cleanup_sighand(struct sighand_struct *sighand)
{
        if (!refcount_dec_and_test(&sighand->count))
                return;

        signalfd_cleanup(sighand);
        /*
         * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
         * without an RCU grace period, see __lock_task_sighand().
         */
        kmem_cache_free(sighand_cachep, sighand);
}

/*
 * Initialize the thread group's POSIX CPU timer state, passing along the
 * current soft RLIMIT_CPU value read from @sig.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
        unsigned long cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);

        posix_cputimers_group_init(&sig->posix_cputimers, cpu_limit);
}

/*
 * Set up the signal_struct of the new task @tsk.
 *
 * With CLONE_THREAD nothing is done here and 0 is returned. Otherwise a
 * zeroed signal_struct is allocated and initialized as a thread group
 * containing only @tsk, seeded with the caller's resource limits and OOM
 * score adjustments.
 *
 * Returns 0 on success or -ENOMEM if the allocation failed; in that case
 * tsk->signal has already been set to NULL.
 */
static int copy_signal(u64 clone_flags, struct task_struct *tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_THREAD)
                return 0;

        sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
        /* Assigned before the NULL check, so failure leaves NULL behind. */
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;

        /* The group starts out with exactly one live thread: tsk. */
        sig->nr_threads = 1;
        sig->quick_threads = 1;
        atomic_set(&sig->live, 1);
        refcount_set(&sig->sigcnt, 1);

        /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
        sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
        tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

        init_waitqueue_head(&sig->wait_chldexit);
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_HLIST_HEAD(&sig->multiprocess);
        seqlock_init(&sig->stats_lock);
        prev_cputime_init(&sig->prev_cputime);

#ifdef CONFIG_POSIX_TIMERS
        INIT_HLIST_HEAD(&sig->posix_timers);
        INIT_HLIST_HEAD(&sig->ignored_posix_timers);
        hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
#endif

        /* Copy the caller's rlimits while holding its group leader's task lock. */
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);

        /* Must follow the rlimit copy: it reads sig->rlim[RLIMIT_CPU]. */
        posix_cpu_timers_init_group(sig);

        tty_audit_fork(sig);
        sched_autogroup_fork(sig);

#ifdef CONFIG_CGROUPS
        init_rwsem(&sig->cgroup_threadgroup_rwsem);
#endif

        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_init(&sig->cred_guard_mutex);
        init_rwsem(&sig->exec_update_lock);

        return 0;
}

/*
 * Make the new task @p inherit the caller's seccomp state: take a reference
 * on the caller's filter, copy the seccomp struct, and bring no_new_privs
 * and the SECCOMP syscall-work flag in line with it. Does nothing without
 * CONFIG_SECCOMP.
 */
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
        /*
         * Must be called with sighand->siglock held, which is common to
         * all threads in the group. Holding cred_guard_mutex is not
         * needed because this new task is not yet running and cannot
         * be racing exec.
         */
        assert_spin_locked(&current->sighand->siglock);

        /* Ref-count the new filter user, and assign it. */
        get_seccomp_filter(current);
        p->seccomp = current->seccomp;

        /*
         * Explicitly enable no_new_privs here in case it got set
         * between the task_struct being duplicated and holding the
         * sighand lock. The seccomp state and nnp must be in sync.
         */
        if (task_no_new_privs(current))
                task_set_no_new_privs(p);

        /*
         * If the parent gained a seccomp mode after the thread flags
         * were copied but before we took the sighand lock, we have
         * to manually enable the seccomp thread flag here.
         */
        if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
                set_task_syscall_work(p, SECCOMP);
#endif
}

/*
 * set_tid_address(2): remember @tidptr as the calling task's
 * clear_child_tid pointer (consumed later by mm_release()) and return the
 * value of task_pid_vnr() for the caller. The pointer is stored as given,
 * without being checked here.
 */
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
        current->clear_child_tid = tidptr;

        return task_pid_vnr(current);
}

/*
 * Initialize p->pi_lock and, with CONFIG_RT_MUTEXES, reset the pi_* fields
 * of the new task (empty pi_waiters tree, no pi_top_task, not blocked on
 * anything).
 */
static void rt_mutex_init_task(struct task_struct *p)
{
        raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
        p->pi_waiters = RB_ROOT_CACHED;
        p->pi_top_task = NULL;
        p->pi_blocked_on = NULL;
#endif
}

static inline void init_task_pid_links(struct task_struct *task)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_NODE(&task->pid_links[type]);
}

/*
 * Record @pid as @task's pid of the given @type. PIDTYPE_PID lives in the
 * task itself (thread_pid); every other type is stored in the shared
 * signal struct's pids[] array.
 */
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
        if (type != PIDTYPE_PID) {
                task->signal->pids[type] = pid;
                return;
        }

        task->thread_pid = pid;
}

/*
 * Reset the RCU-related per-task fields of the new task @p. Which fields
 * exist depends on the RCU flavors configured; each group below is only
 * compiled in for its flavor.
 */
static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
        /* The new task starts outside any RCU read-side critical section. */
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special.s = 0;
        p->rcu_blocked_node = NULL;
        INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
        p->rcu_tasks_holdout = false;
        INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
        p->rcu_tasks_idle_cpu = -1;
        INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
        p->trc_reader_nesting = 0;
        p->trc_reader_special.s = 0;
        INIT_LIST_HEAD(&p->trc_holdout_list);
        INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/**
 * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret_file: return the new pidfs file
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * Unless PIDFD_STALE is passed, the helper verifies that @pid is still in
 * use; without PIDFD_THREAD the task identified by @pid must be a
 * thread-group leader.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
 * order to install the pidfd into its file descriptor table or they must use
 * put_unused_fd() and fput() on the returned pidfd and pidfd file
 * respectively.
 *
 * This function is useful when a pidfd must already be reserved but there
 * might still be points of failure afterwards and the caller wants to ensure
 * that no pidfd is leaked into its file descriptor table.
 *
 * Return: On success, a reserved pidfd is returned from the function and a new
 *         pidfd file is returned in the last argument to the function. On
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged: -ESRCH if the task has already
 *         been reaped, -ENOENT if a thread-group leader pidfd was requested
 *         for a non-leader, or the error from reserving the fd or allocating
 *         the file.
 */
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
{
        struct file *pidfs_file;

        /*
         * PIDFD_STALE is only allowed to be passed if the caller knows
         * that @pid is already registered in pidfs and thus
         * PIDFD_INFO_EXIT information is guaranteed to be available.
         */
        if (!(flags & PIDFD_STALE)) {
                /*
                 * While holding the pidfd waitqueue lock removing the
                 * task linkage for the thread-group leader pid
                 * (PIDTYPE_TGID) isn't possible. Thus, if there's still
                 * task linkage for PIDTYPE_PID not having thread-group
                 * leader linkage for the pid means it wasn't a
                 * thread-group leader in the first place.
                 */
                guard(spinlock_irq)(&pid->wait_pidfd.lock);

                /* Task has already been reaped. */
                if (!pid_has_task(pid, PIDTYPE_PID))
                        return -ESRCH;
                /*
                 * If this struct pid isn't used as a thread-group
                 * leader but the caller requested to create a
                 * thread-group leader pidfd then report ENOENT.
                 */
                if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
                        return -ENOENT;
        }

        /*
         * NOTE(review): CLASS(get_unused_fd) presumably releases the reserved
         * fd automatically on the early returns below, and take_fd() hands it
         * over to the caller - confirm against the class definition.
         */
        CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
        if (pidfd < 0)
                return pidfd;

        pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
        if (IS_ERR(pidfs_file))
                return PTR_ERR(pidfs_file);

        /* Only written on success, so callers see it unchanged on error. */
        *ret_file = pidfs_file;
        return take_fd(pidfd);
}

/*
 * RCU callback used by delayed_free_task(): map @rhp back to the
 * task_struct that embeds it as its rcu member and free that task.
 */
static void __delayed_free_task(struct rcu_head *rhp)
{
        free_task(container_of(rhp, struct task_struct, rcu));
}

/*
 * Free @tsk. With CONFIG_MEMCG enabled the actual free is deferred through
 * call_rcu(); otherwise the task is freed immediately.
 */
static __always_inline void delayed_free_task(struct task_struct *tsk)
{
        if (!IS_ENABLED(CONFIG_MEMCG)) {
                free_task(tsk);
                return;
        }

        call_rcu(&tsk->rcu, __delayed_free_task);
}

/*
 * For a new process that shares its mm with the caller (CLONE_VM set,
 * CLONE_THREAD and CLONE_VFORK both clear): mark the mm as used by multiple
 * processes and re-copy the caller's OOM score adjustments, all under
 * oom_adj_mutex.
 *
 * Tasks without an mm, threads and vfork children are left alone.
 */
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
        /* Nothing to do for kernel threads, new threads or vfork children. */
        if (!tsk->mm ||
            (clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
                return;

        /* We need to synchronize with __set_oom_adj */
        mutex_lock(&oom_adj_mutex);

        mm_flags_set(MMF_MULTIPROCESS, tsk->mm);

        /* Update the values in case they were changed after copy_signal */
        tsk->signal->oom_score_adj = current->signal->oom_score_adj;
        tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_unlock(&oom_adj_mutex);
}

/*
 * rv_task_fork - zero the rv member of the new task @p.
 * Compiles to an empty statement when CONFIG_RV is not set.
 */
#ifdef CONFIG_RV
static void rv_task_fork(struct task_struct *p)
{
        memset(&p->rv, 0, sizeof(p->rv));
}
#else
#define rv_task_fork(p) do {} while (0)
#endif

/*
 * Returns true only when @clone_flags has both CLONE_THREAD and CLONE_VM
 * set, i.e. when a new thread sharing the caller's mm is being created.
 */
static bool need_futex_hash_allocate_default(u64 clone_flags)
{
        return (clone_flags & (CLONE_THREAD | CLONE_VM)) ==
               (CLONE_THREAD | CLONE_VM);
}

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
__latent_entropy struct task_struct *copy_process(
                                        struct pid *pid,
                                        int trace,
                                        int node,
                                        struct kernel_clone_args *args)
{
        int pidfd = -1, retval;
        struct task_struct *p;
        struct multiprocess_signals delayed;
        struct file *pidfile = NULL;
        const u64 clone_flags = args->flags;
        struct nsproxy *nsp = current->nsproxy;

        /*
         * Don't allow sharing the root directory with processes in a different
         * namespace
         */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);

        if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
                return ERR_PTR(-EINVAL);

        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
                return ERR_PTR(-EINVAL);

        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);

        /*
         * Siblings of global init remain as zombies on exit since they are
         * not reaped by their parent (swapper). To solve this and to avoid
         * multi-rooted process trees, prevent global and container-inits
         * from creating siblings.
         */
        if ((clone_flags & CLONE_PARENT) &&
                                current->signal->flags & SIGNAL_UNKILLABLE)
                return ERR_PTR(-EINVAL);

        /*
         * If the new process will be in a different pid or user namespace
         * do not allow it to share a thread group with the forking task.
         */
        if (clone_flags & CLONE_THREAD) {
                if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
                        return ERR_PTR(-EINVAL);
        }

        if (clone_flags & CLONE_PIDFD) {
                /*
                 * - CLONE_DETACHED is blocked so that we can potentially
                 *   reuse it later for CLONE_PIDFD.
                 */
                if (clone_flags & CLONE_DETACHED)
                        return ERR_PTR(-EINVAL);
        }

        /*
         * Force any signals received before this point to be delivered
         * before the fork happens.  Collect up signals sent to multiple
         * processes that happen during the fork and delay them so that
         * they appear to happen after the fork.
         */
        sigemptyset(&delayed.signal);
        INIT_HLIST_NODE(&delayed.node);

        spin_lock_irq(&current->sighand->siglock);
        if (!(clone_flags & CLONE_THREAD))
                hlist_add_head(&delayed.node, &current->signal->multiprocess);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        retval = -ERESTARTNOINTR;
        if (task_sigpending(current))
                goto fork_out;

        retval = -ENOMEM;
        p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
        p->flags &= ~PF_KTHREAD;
        if (args->kthread)
                p->flags |= PF_KTHREAD;
        if (args->user_worker) {
                /*
                 * Mark us a user worker, and block any signal that isn't
                 * fatal or STOP
                 */
                p->flags |= PF_USER_WORKER;
                siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
        }
        if (args->io_thread)
                p->flags |= PF_IO_WORKER;

        if (args->name)
                strscpy_pad(p->comm, args->name, sizeof(p->comm));

        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

        ftrace_graph_init_task(p);

        rt_mutex_init_task(p);

        lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
        retval = copy_creds(p, clone_flags);
        if (retval < 0)
                goto bad_fork_free;

        retval = -EAGAIN;
        if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_cleanup_count;
        }
        current->flags &= ~PF_NPROC_EXCEEDED;

        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        retval = -EAGAIN;
        if (data_race(nr_threads >= max_threads))
                goto bad_fork_cleanup_count;

        delayacct_tsk_init(p);        /* Must remain after dup_task_struct() */
        p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
        p->flags |= PF_FORKNOEXEC;
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        rcu_copy_process(p);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);

        init_sigpending(&p->pending);

        p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        p->utimescaled = p->stimescaled = 0;
#endif
        prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        seqcount_init(&p->vtime.seqcount);
        p->vtime.starttime = 0;
        p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
        p->io_uring = NULL;
#endif

        p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
        p->psi_flags = 0;
#endif

        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);

        posix_cputimers_init(&p->posix_cputimers);
        tick_dep_init_task(p);

        p->io_context = NULL;
        audit_set_context(p, NULL);
        cgroup_fork(p);
        if (args->kthread) {
                if (!set_kthread_struct(p))
                        goto bad_fork_cleanup_delayacct;
        }
#ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        if (IS_ERR(p->mempolicy)) {
                retval = PTR_ERR(p->mempolicy);
                p->mempolicy = NULL;
                goto bad_fork_cleanup_delayacct;
        }
#endif
#ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
        memset(&p->irqtrace, 0, sizeof(p->irqtrace));
        p->irqtrace.hardirq_disable_ip        = _THIS_IP_;
        p->irqtrace.softirq_enable_ip        = _THIS_IP_;
        p->softirqs_enabled                = 1;
        p->softirq_context                = 0;
#endif

        p->pagefault_disabled = 0;

        lockdep_init_task(p);

        p->blocked_on = NULL; /* not blocked yet */

#ifdef CONFIG_BCACHE
        p->sequential_io        = 0;
        p->sequential_io_avg        = 0;
#endif
#ifdef CONFIG_BPF_SYSCALL
        RCU_INIT_POINTER(p->bpf_storage, NULL);
        p->bpf_ctx = NULL;
#endif

        unwind_task_init(p);

        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_policy;

        retval = perf_event_init_task(p, clone_flags);
        if (retval)
                goto bad_fork_sched_cancel_fork;
        retval = audit_alloc(p);
        if (retval)
                goto bad_fork_cleanup_perf;
        /* copy all the process information */
        shm_init_task(p);
        retval = security_task_alloc(p, clone_flags);
        if (retval)
                goto bad_fork_cleanup_audit;
        retval = copy_semundo(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_security;
        retval = copy_files(clone_flags, p, args->no_files);
        if (retval)
                goto bad_fork_cleanup_semundo;
        retval = copy_fs(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_files;
        retval = copy_sighand(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_fs;
        retval = copy_signal(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_sighand;
        retval = copy_mm(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_signal;
        retval = copy_namespaces(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_mm;
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
        retval = copy_thread(p, args);
        if (retval)
                goto bad_fork_cleanup_io;

        stackleak_task_init(p);

        if (pid != &init_struct_pid) {
                pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
                                args->set_tid_size);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
                        goto bad_fork_cleanup_thread;
                }
        }

        /*
         * This has to happen after we've potentially unshared the file
         * descriptor table (so that the pidfd doesn't leak into the child
         * if the fd table isn't shared).
         */
        if (clone_flags & CLONE_PIDFD) {
                int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;

                /*
                 * Note that no task has been attached to @pid yet; indicate
                 * that via PIDFD_STALE.
                 */
                retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
                if (retval < 0)
                        goto bad_fork_free_pid;
                pidfd = retval;

                retval = put_user(pidfd, args->pidfd);
                if (retval)
                        goto bad_fork_put_pidfd;
        }

#ifdef CONFIG_BLOCK
        p->plug = NULL;
#endif
        futex_init_task(p);

        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
                sas_ss_reset(p);

        /*
         * Syscall tracing and stepping should be turned off in the
         * child regardless of CLONE_PTRACE.
         */
        user_disable_single_step(p);
        clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
        clear_task_syscall_work(p, SYSCALL_EMU);
#endif
        clear_tsk_latency_tracing(p);

        /* ok, now we should be set up.. */
        p->pid = pid_nr(pid);
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                p->tgid = current->tgid;
        } else {
                p->group_leader = p;
                p->tgid = p->pid;
        }

        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
        p->dirty_paused_when = 0;

        p->pdeath_signal = 0;
        p->task_works = NULL;
        clear_posix_cputimers_work(p);

#ifdef CONFIG_KRETPROBES
        p->kretprobe_instances.first = NULL;
#endif
#ifdef CONFIG_RETHOOK
        p->rethooks.first = NULL;
#endif

        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
         * between here and cgroup_post_fork() if an organisation operation is in
         * progress.
         */
        retval = cgroup_can_fork(p, args);
        if (retval)
                goto bad_fork_put_pidfd;

        /*
         * Now that the cgroups are pinned, re-clone the parent cgroup and put
         * the new task on the correct runqueue. All this *before* the task
         * becomes visible.
         *
         * This isn't part of ->can_fork() because while the re-cloning is
         * cgroup specific, it unconditionally needs to place the task on a
         * runqueue.
         */
        retval = sched_cgroup_fork(p, args);
        if (retval)
                goto bad_fork_cancel_cgroup;

        /*
         * Allocate a default futex hash for the user process once the first
         * thread spawns.
         */
        if (need_futex_hash_allocate_default(clone_flags)) {
                retval = futex_hash_allocate_default();
                if (retval)
                        goto bad_fork_cancel_cgroup;
                /*
                 * If we fail beyond this point we don't free the allocated
                 * futex hash map. We assume that another thread will be created
                 * and makes use of it. The hash map will be freed once the main
                 * thread terminates.
                 */
        }
        /*
         * From this point on we must avoid any synchronous user-space
         * communication until we take the tasklist-lock. In particular, we do
         * not want user-space to be able to predict the process start-time by
         * stalling fork(2) after we recorded the start_time but before it is
         * visible to the system.
         */

        p->start_time = ktime_get_ns();
        p->start_boottime = ktime_get_boottime_ns();

        /*
         * Make it visible to the rest of the system, but don't wake it up yet.
         * Need tasklist lock for parent etc handling!
         */
        write_lock_irq(&tasklist_lock);

        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
                p->parent_exec_id = current->parent_exec_id;
                if (clone_flags & CLONE_THREAD)
                        p->exit_signal = -1;
                else
                        p->exit_signal = current->group_leader->exit_signal;
        } else {
                p->real_parent = current;
                p->parent_exec_id = current->self_exec_id;
                p->exit_signal = args->exit_signal;
        }

        klp_copy_process(p);

        sched_core_fork(p);

        spin_lock(&current->sighand->siglock);

        rv_task_fork(p);

        rseq_fork(p, clone_flags);

        /* Don't start children in a dying pid namespace */
        if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
                retval = -ENOMEM;
                goto bad_fork_core_free;
        }

        /* Let kill terminate clone/fork in the middle */
        if (fatal_signal_pending(current)) {
                retval = -EINTR;
                goto bad_fork_core_free;
        }

        /* No more failure paths after this point. */

        /*
         * Copy seccomp details explicitly here, in case they were changed
         * before holding sighand lock.
         */
        copy_seccomp(p);

        init_task_pid_links(p);
        if (likely(p->pid)) {
                ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

                init_task_pid(p, PIDTYPE_PID, pid);
                if (thread_group_leader(p)) {
                        init_task_pid(p, PIDTYPE_TGID, pid);
                        init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        init_task_pid(p, PIDTYPE_SID, task_session(current));

                        if (is_child_reaper(pid)) {
                                ns_of_pid(pid)->child_reaper = p;
                                p->signal->flags |= SIGNAL_UNKILLABLE;
                        }
                        p->signal->shared_pending.signal = delayed.signal;
                        p->signal->tty = tty_kref_get(current->signal->tty);
                        /*
                         * Inherit has_child_subreaper flag under the same
                         * tasklist_lock with adding child to the process tree
                         * for propagate_has_child_subreaper optimization.
                         */
                        p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                                                         p->real_parent->signal->is_child_subreaper;
                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        attach_pid(p, PIDTYPE_TGID);
                        attach_pid(p, PIDTYPE_PGID);
                        attach_pid(p, PIDTYPE_SID);
                        __this_cpu_inc(process_counts);
                } else {
                        current->signal->nr_threads++;
                        current->signal->quick_threads++;
                        atomic_inc(&current->signal->live);
                        refcount_inc(&current->signal->sigcnt);
                        task_join_group_stop(p);
                        list_add_tail_rcu(&p->thread_node,
                                          &p->signal->thread_head);
                }
                attach_pid(p, PIDTYPE_PID);
                nr_threads++;
        }
        total_forks++;
        hlist_del_init(&delayed.node);
        spin_unlock(&current->sighand->siglock);
        syscall_tracepoint_update(p);
        write_unlock_irq(&tasklist_lock);

        if (pidfile)
                fd_install(pidfd, pidfile);

        proc_fork_connector(p);
        sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);

        trace_task_newtask(p, clone_flags);
        uprobe_copy_process(p, clone_flags);
        user_events_fork(p, clone_flags);

        copy_oom_score_adj(clone_flags, p);

        return p;

bad_fork_core_free:
        sched_core_free(p);
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
bad_fork_cancel_cgroup:
        cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
        if (clone_flags & CLONE_PIDFD) {
                fput(pidfile);
                put_unused_fd(pidfd);
        }
bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
bad_fork_cleanup_thread:
        exit_thread(p);
bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
bad_fork_cleanup_mm:
        if (p->mm) {
                mm_clear_owner(p->mm, p);
                mmput(p->mm);
        }
bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
                free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
        exit_sem(p);
bad_fork_cleanup_security:
        security_task_free(p);
bad_fork_cleanup_audit:
        audit_free(p);
bad_fork_cleanup_perf:
        perf_event_free_task(p);
bad_fork_sched_cancel_fork:
        sched_cancel_fork(p);
bad_fork_cleanup_policy:
        lockdep_free_task(p);
#ifdef CONFIG_NUMA
        mpol_put(p->mempolicy);
#endif
bad_fork_cleanup_delayacct:
        delayacct_tsk_free(p);
bad_fork_cleanup_count:
        dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        exit_creds(p);
bad_fork_free:
        WRITE_ONCE(p->__state, TASK_DEAD);
        exit_task_stack_account(p);
        put_task_stack(p);
        delayed_free_task(p);
fork_out:
        spin_lock_irq(&current->sighand->siglock);
        hlist_del_init(&delayed.node);
        spin_unlock_irq(&current->sighand->siglock);
        return ERR_PTR(retval);
}

static inline void init_idle_pids(struct task_struct *idle)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
                INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
                init_task_pid(idle, type, &init_struct_pid);
        }
}

/*
 * Placeholder thread function handed to copy_process() by fork_idle().
 * It is never actually invoked; it always reports 0.
 */
static int idle_dummy(void *dummy)
{
        (void)dummy;

        return 0;
}

/*
 * Build the idle task for @cpu.
 *
 * The task is created through copy_process() with init_struct_pid as its
 * pid, flagged as a kthread and as idle, on the memory node of @cpu. On
 * success its pid links are pointed at init_struct_pid and init_idle() is
 * run for it. On failure the error pointer from copy_process() is handed
 * back unchanged.
 */
struct task_struct * __init fork_idle(int cpu)
{
        struct kernel_clone_args idle_args = {
                .flags          = CLONE_VM,
                .fn             = &idle_dummy,
                .fn_arg         = NULL,
                .kthread        = 1,
                .idle           = 1,
        };
        struct task_struct *idle;

        idle = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &idle_args);
        if (IS_ERR(idle))
                return idle;

        init_idle_pids(idle);
        init_idle(idle, cpu);

        return idle;
}

/*
 * This is like kernel_clone(), but shaved down and tailored to just
 * creating io_uring workers. It returns a created task, or an error pointer.
 * The returned task is inactive, and the caller must fire it up through
 * wake_up_new_task(p). All signals are blocked in the created task.
 */
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
        unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
                              CLONE_IO|CLONE_VM|CLONE_UNTRACED;
        struct kernel_clone_args args = {
                .flags                = flags,
                .fn                = fn,
                .fn_arg                = arg,
                .io_thread        = 1,
                .user_worker        = 1,
        };

        return copy_process(NULL, 0, node, &args);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 *
 * Returns the new task's pid number as computed by pid_vnr() on success,
 * -EINVAL when CLONE_PIDFD and CLONE_PARENT_SETTID are both set and
 * args->pidfd equals args->parent_tid, or the error encoded in the pointer
 * returned by copy_process().
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
        u64 clone_flags = args->flags;
        struct completion vfork;
        struct pid *pid;
        struct task_struct *p;
        int trace = 0;
        pid_t nr;

        /*
         * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
         * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
         * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
         * field in struct clone_args and it still doesn't make sense to have
         * them both point at the same memory location. Performing this check
         * here has the advantage that we don't need to have a separate helper
         * to check for legacy clone().
         */
        if ((clone_flags & CLONE_PIDFD) &&
            (clone_flags & CLONE_PARENT_SETTID) &&
            (args->pidfd == args->parent_tid))
                return -EINVAL;

        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
        if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if (args->exit_signal != SIGCHLD)
                        trace = PTRACE_EVENT_CLONE;
                else
                        trace = PTRACE_EVENT_FORK;

                /* Drop the event again unless it is enabled for current. */
                if (likely(!ptrace_event_enabled(current, trace)))
                        trace = 0;
        }

        /* NULL pid: copy_process() allocates one itself. */
        p = copy_process(NULL, trace, NUMA_NO_NODE, args);
        /* Runs on both the success and the failure path. */
        add_latent_entropy();

        if (IS_ERR(p))
                return PTR_ERR(p);

        /*
         * Do this prior to waking up the new thread - the thread pointer
         * might become invalid after that point, if the thread exits quickly.
         */
        trace_sched_process_fork(current, p);

        /* Paired with the put_pid() at the end of this function. */
        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        /* The result of put_user() is deliberately not checked here. */
        if (clone_flags & CLONE_PARENT_SETTID)
                put_user(nr, args->parent_tid);

        if (clone_flags & CLONE_VFORK) {
                /* On-stack completion that wait_for_vfork_done() waits on below. */
                p->vfork_done = &vfork;
                init_completion(&vfork);
                /*
                 * NOTE(review): presumably this reference is dropped inside
                 * wait_for_vfork_done() - confirm against its definition.
                 */
                get_task_struct(p);
        }

        /* Only for a child that got its own mm (no CLONE_VM). */
        if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
                /* lock the task to synchronize with memcg migration */
                task_lock(p);
                lru_gen_add_mm(p->mm);
                task_unlock(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
                ptrace_event_pid(trace, pid);

        if (clone_flags & CLONE_VFORK) {
                /* VFORK_DONE is reported only when the wait returns 0. */
                if (!wait_for_vfork_done(p, &vfork))
                        ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
        return nr;
}

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
                    unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags                = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (flags & CSIGNAL),
                .fn                = fn,
                .fn_arg                = arg,
                .name                = name,
                .kthread        = 1,
        };

        return kernel_clone(&args);
}

/*
 * Create a user mode thread.
 */
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags                = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (flags & CSIGNAL),
                .fn                = fn,
                .fn_arg                = arg,
        };

        return kernel_clone(&args);
}

#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork(): a kernel_clone() with no clone flags and SIGCHLD as the exit
 * signal. Without CONFIG_MMU the call is rejected with -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
        /* can not support in nommu mode */
        return -EINVAL;
#else
        struct kernel_clone_args fork_args = {
                .exit_signal = SIGCHLD,
        };

        return kernel_clone(&fork_args);
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/*
 * vfork(): a kernel_clone() with CLONE_VFORK | CLONE_VM and SIGCHLD as the
 * exit signal.
 */
SYSCALL_DEFINE0(vfork)
{
        struct kernel_clone_args vfork_args = {
                .exit_signal    = SIGCHLD,
                .flags          = CLONE_VM | CLONE_VFORK,
        };

        return kernel_clone(&vfork_args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
/*
 * Legacy clone() entry point. The argument order depends on which of the
 * CONFIG_CLONE_BACKWARDS* options the architecture selects; the body is
 * shared. Only the low 32 bits of clone_flags are used: the CSIGNAL part
 * is the exit signal, the rest are the clone flags. parent_tidptr serves
 * both as the pidfd and as the parent_tid destination.
 */
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 unsigned long, tls,
                 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
                int, stack_size,
                int __user *, parent_tidptr,
                int __user *, child_tidptr,
                unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#endif
{
        const u32 flags32 = lower_32_bits(clone_flags);
        struct kernel_clone_args clone_args = {
                .flags          = flags32 & ~CSIGNAL,
                .exit_signal    = flags32 & CSIGNAL,
                .pidfd          = parent_tidptr,
                .parent_tid     = parent_tidptr,
                .child_tid      = child_tidptr,
                .stack          = newsp,
                .tls            = tls,
        };

        return kernel_clone(&clone_args);
}
#endif

/*
 * Copy a userspace struct clone_args of @usize bytes into @kargs.
 *
 * On entry kargs->set_tid must point at the caller's pid_t buffer; the
 * pointer is saved, *kargs is overwritten wholesale, the buffer is filled
 * from userspace when a set_tid array was supplied, and the pointer is put
 * back before returning.
 *
 * Returns 0 on success, -E2BIG if @usize exceeds PAGE_SIZE, -EINVAL for
 * a too-small @usize or inconsistent/invalid fields, -EFAULT if the set_tid
 * array cannot be copied, or the error from copy_struct_from_user().
 */
static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
{
        struct clone_args uarg;
        pid_t *tid_buf = kargs->set_tid;
        int ret;

        BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
                     CLONE_ARGS_SIZE_VER0);
        BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
                     CLONE_ARGS_SIZE_VER1);
        BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
                     CLONE_ARGS_SIZE_VER2);
        BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
                return -EINVAL;

        ret = copy_struct_from_user(&uarg, sizeof(uarg), uargs, usize);
        if (ret)
                return ret;

        if (unlikely(uarg.set_tid_size > MAX_PID_NS_LEVEL))
                return -EINVAL;

        /* A set_tid array and its size must come together or not at all. */
        if (unlikely(uarg.set_tid ? uarg.set_tid_size == 0
                                  : uarg.set_tid_size > 0))
                return -EINVAL;

        /* The upper 32 bits of exit_signal have to be clear ... */
        if (unlikely(uarg.exit_signal & ~((u64)CSIGNAL)))
                return -EINVAL;
        /* ... and what remains has to be a valid signal. */
        if (unlikely(!valid_signal(uarg.exit_signal)))
                return -EINVAL;

        if (uarg.flags & CLONE_INTO_CGROUP) {
                if (uarg.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)
                        return -EINVAL;
        }

        *kargs = (struct kernel_clone_args){
                .flags          = uarg.flags,
                .pidfd          = u64_to_user_ptr(uarg.pidfd),
                .child_tid      = u64_to_user_ptr(uarg.child_tid),
                .parent_tid     = u64_to_user_ptr(uarg.parent_tid),
                .exit_signal    = uarg.exit_signal,
                .stack          = uarg.stack,
                .stack_size     = uarg.stack_size,
                .tls            = uarg.tls,
                .set_tid_size   = uarg.set_tid_size,
                .cgroup         = uarg.cgroup,
        };

        if (uarg.set_tid) {
                const void __user *utids = u64_to_user_ptr(uarg.set_tid);

                if (copy_from_user(tid_buf, utids,
                                   kargs->set_tid_size * sizeof(pid_t)))
                        return -EFAULT;
        }

        /* The compound-literal assignment above cleared this pointer. */
        kargs->set_tid = tid_buf;

        return 0;
}

/**
 * clone3_stack_valid - sanity-check the clone3() stack arguments
 * @kargs: kernel clone args
 *
 * A stack pointer and a stack size have to be supplied together or not at
 * all, and a supplied range has to pass access_ok(). Unless
 * CONFIG_STACK_GROWSUP is defined, @kargs->stack is advanced by the stack
 * size once the range has been accepted, so userspace does not need to know
 * the stack direction.
 *
 * Return: true if the stack arguments are acceptable, false otherwise.
 */
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
        /* No stack given: then no size may be given either. */
        if (kargs->stack == 0)
                return !(kargs->stack_size > 0);

        /* A stack without a size is meaningless. */
        if (kargs->stack_size == 0)
                return false;

        if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
                return false;

#if !defined(CONFIG_STACK_GROWSUP)
        kargs->stack += kargs->stack_size;
#endif

        return true;
}

static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
        /* Verify that no unknown flags are passed along. */
        if (kargs->flags &
            ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
                return false;

        /*
         * - make the CLONE_DETACHED bit reusable for clone3
         * - make the CSIGNAL bits reusable for clone3
         */
        if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
                return false;

        if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
            (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
                return false;

        if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
            kargs->exit_signal)
                return false;

        if (!clone3_stack_valid(kargs))
                return false;

        return true;
}

/**
 * sys_clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extensible successor to clone()/clone2().
 * Its single struct argument is versioned by its size: the struct is copied
 * in and checked by copy_clone_args_from_user(), validated by
 * clone3_args_valid(), and then passed to kernel_clone().
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
        pid_t tid_storage[MAX_PID_NS_LEVEL];
        struct kernel_clone_args kargs;
        int ret;

#ifdef __ARCH_BROKEN_SYS_CLONE3
#warning clone3() entry point is missing, please fix
        return -ENOSYS;
#endif

        /*
         * Has to be set before the copy: the helper takes the destination
         * buffer for the set_tid array from kargs.set_tid.
         */
        kargs.set_tid = tid_storage;

        ret = copy_clone_args_from_user(&kargs, uargs, size);
        if (ret)
                return ret;

        if (clone3_args_valid(&kargs))
                return kernel_clone(&kargs);

        return -EINVAL;
}

/*
 * Walk the descendants of @top's thread group, calling @visitor on each
 * child with @data, all under read_lock(&tasklist_lock).
 *
 * The return value of @visitor steers the walk:
 *   < 0  stop the whole walk,
 *   > 0  descend into the children of the task just visited,
 *   0    skip that task's children and move on to its next sibling.
 *
 * The walk is iterative: "down" restarts the loops one level deeper, and
 * "up" jumps back into the loop body to resume after the task whose subtree
 * has just been finished.
 */
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
        struct task_struct *leader, *parent, *child;
        int res;

        read_lock(&tasklist_lock);
        /* Always start from the group leader, whichever thread was passed in. */
        leader = top = top->group_leader;
down:
        /* Children hang off individual threads, so scan every thread of leader. */
        for_each_thread(leader, parent) {
                list_for_each_entry(child, &parent->children, sibling) {
                        res = visitor(child, data);
                        if (res) {
                                if (res < 0)
                                        goto out;
                                /* Positive result: descend into child's own children. */
                                leader = child;
                                goto down;
                        }
up:
                        /*
                         * Re-entry point after finishing a subtree: child and
                         * parent have been restored below, so the loops carry
                         * on with the next sibling.
                         */
                        ;
                }
        }

        /* Finished one level; climb back up unless we are at the top again. */
        if (leader != top) {
                child = leader;
                parent = child->real_parent;
                leader = parent->group_leader;
                goto up;
        }
out:
        read_unlock(&tasklist_lock);
}

/*
 * Fallback for when ARCH_MIN_MMSTRUCT_ALIGN has not been defined earlier:
 * 0 is then passed as the alignment argument when mm_cache_init() creates
 * the "mm_struct" cache.
 */
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

/*
 * Constructor passed to kmem_cache_create() for "sighand_cache": sets up
 * the siglock spinlock and the signalfd wait queue head of the object.
 */
static void sighand_ctor(void *data)
{
        struct sighand_struct *sh = data;

        spin_lock_init(&sh->siglock);
        init_waitqueue_head(&sh->signalfd_wqh);
}

/*
 * Create the "mm_struct" slab cache and store it in mm_cachep. Only the
 * saved_auxv field of the object is set up as the usercopy region.
 */
void __init mm_cache_init(void)
{
        /*
         * The mm_cpumask sits at the tail of mm_struct and its size depends
         * on the maximum CPU number this system can have, hotplug included
         * (nr_cpu_ids). Room for it, and for mm_cid_size(), is added on top
         * of the struct itself.
         */
        const unsigned int obj_size =
                sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();

        mm_cachep = kmem_cache_create_usercopy("mm_struct",
                        obj_size,
                        ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        offsetof(struct mm_struct, saved_auxv),
                        sizeof_field(struct mm_struct, saved_auxv),
                        NULL);
}

/*
 * Create the slab caches for sighand_struct, signal_struct, files_struct
 * and fs_struct, then run mmap_init() and nsproxy_cache_init().
 */
void __init proc_caches_init(void)
{
        /* The only cache here with a constructor and SLAB_TYPESAFE_BY_RCU. */
        sighand_cachep = kmem_cache_create("sighand_cache",
                        sizeof(struct sighand_struct), 0,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                        SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT,
                        sighand_ctor);

        signal_cachep = kmem_cache_create("signal_cache",
                        sizeof(struct signal_struct), 0,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        NULL);

        files_cachep = kmem_cache_create("files_cache",
                        sizeof(struct files_struct), 0,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        NULL);

        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        NULL);

        mmap_init();
        nsproxy_cache_init();
}

/*
 * Validate the flags handed to the unshare system call.
 * Returns 0 when they are acceptable and -EINVAL otherwise.
 */
static int check_unshare_flags(unsigned long unshare_flags)
{
        /* Reject anything outside the set of flags unshare understands. */
        if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
                                CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
                                CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
                                CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
                                CLONE_NEWTIME))
                return -EINVAL;

        /*
         * The checks below cover things that are not implemented; we merely
         * pretend they work when there is nothing to unshare. Unsharing the
         * address space or the signal handlers implies unsharing the signal
         * queues (aka CLONE_THREAD) too, hence the cumulative masks.
         */
        if ((unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) &&
            !thread_group_empty(current))
                return -EINVAL;

        if ((unshare_flags & (CLONE_SIGHAND | CLONE_VM)) &&
            refcount_read(&current->sighand->count) > 1)
                return -EINVAL;

        if ((unshare_flags & CLONE_VM) && !current_is_single_threaded())
                return -EINVAL;

        return 0;
}

/*
 * Unshare the filesystem structure if it is being shared.
 *
 * When CLONE_FS is requested and current->fs has more than one user, a
 * copy is stored in *new_fsp.  Returns 0, or -ENOMEM if the copy failed.
 */
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
        struct fs_struct *fs = current->fs;
        struct fs_struct *copy;

        if (!(unshare_flags & CLONE_FS) || !fs)
                return 0;

        /* don't need lock here; in the worst case we'll do useless copy */
        if (fs->users == 1)
                return 0;

        copy = copy_fs_struct(fs);
        *new_fsp = copy;

        return copy ? 0 : -ENOMEM;
}

/*
 * Unshare file descriptor table if it is being shared.
 *
 * When CLONE_FILES is requested and current->files has a reference count
 * above one, a duplicate is stored in *new_fdp; otherwise *new_fdp is left
 * untouched.  Returns 0, or the error encoded by dup_fd().
 */
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
{
        struct files_struct *cur = current->files;
        struct files_struct *dup;

        if (!(unshare_flags & CLONE_FILES))
                return 0;
        if (!cur || atomic_read(&cur->count) <= 1)
                return 0;

        dup = dup_fd(cur, NULL);
        if (IS_ERR(dup))
                return PTR_ERR(dup);

        *new_fdp = dup;
        return 0;
}

/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 *
 * Returns 0 on success, or the negative error code of the first
 * preparation step that failed.  All replacement objects are prepared
 * before anything is installed on current.  The labels at the end are
 * reached on both the success and the error path and release whatever the
 * new_* locals still hold at that point.
 */
int ksys_unshare(unsigned long unshare_flags)
{
        struct fs_struct *fs, *new_fs = NULL;
        struct files_struct *new_fd = NULL;
        struct cred *new_cred = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
        int err;

        /*
         * If unsharing a user namespace must also unshare the thread group
         * and unshare the filesystem root and working directories.
         */
        if (unshare_flags & CLONE_NEWUSER)
                unshare_flags |= CLONE_THREAD | CLONE_FS;
        /*
         * If unsharing vm, must also unshare signal handlers.
         */
        if (unshare_flags & CLONE_VM)
                unshare_flags |= CLONE_SIGHAND;
        /*
         * If unsharing signal handlers, must also unshare the signal queues.
         */
        if (unshare_flags & CLONE_SIGHAND)
                unshare_flags |= CLONE_THREAD;
        /*
         * If unsharing namespace, must also unshare filesystem information.
         */
        if (unshare_flags & CLONE_NEWNS)
                unshare_flags |= CLONE_FS;

        /* Validate the flag set after the implied flags were added. */
        err = check_unshare_flags(unshare_flags);
        if (err)
                goto bad_unshare_out;
        /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
        err = unshare_fd(unshare_flags, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
        if (err)
                goto bad_unshare_cleanup_fd;
        err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                                         new_cred, new_fs);
        if (err)
                goto bad_unshare_cleanup_cred;

        if (new_cred) {
                err = set_cred_ucounts(new_cred);
                if (err)
                        goto bad_unshare_cleanup_cred;
        }

        /* Install phase: entered only if there is something to switch. */
        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
                         */
                        exit_sem(current);
                }
                if (unshare_flags & CLONE_NEWIPC) {
                        /* Orphan segments in old ns (see sem above). */
                        exit_shm(current);
                        shm_init_task(current);
                }

                if (new_nsproxy)
                        switch_task_namespaces(current, new_nsproxy);

                task_lock(current);

                if (new_fs) {
                        fs = current->fs;
                        read_seqlock_excl(&fs->seq);
                        current->fs = new_fs;
                        /*
                         * From here on new_fs names what the cleanup code
                         * must free: the old fs_struct if this was its
                         * last user, otherwise nothing.
                         */
                        if (--fs->users)
                                new_fs = NULL;
                        else
                                new_fs = fs;
                        read_sequnlock_excl(&fs->seq);
                }

                /* After the swap new_fd holds the old table, put below. */
                if (new_fd)
                        swap(current->files, new_fd);

                task_unlock(current);

                if (new_cred) {
                        /* Install the new user namespace */
                        commit_creds(new_cred);
                        /* Cleared so the cleanup below does not put it. */
                        new_cred = NULL;
                }
        }

        /* Called on every successful return, even if nothing was switched. */
        perf_event_namespaces(current);

bad_unshare_cleanup_cred:
        if (new_cred)
                put_cred(new_cred);
bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);

bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);

bad_unshare_out:
        return err;
}

/* unshare system call entry point: forwards the flags to ksys_unshare(). */
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
        return ksys_unshare(unshare_flags);
}

/*
 *        Helper to unshare the files of the current task.
 *        We don't want to expose copy_files internals to
 *        the exec layer of the kernel.
 */

int unshare_files(void)
{
        struct task_struct *task = current;
        struct files_struct *old, *copy = NULL;
        int error;

        error = unshare_fd(CLONE_FILES, &copy);
        if (error || !copy)
                return error;

        old = task->files;
        task_lock(task);
        task->files = copy;
        task_unlock(task);
        put_files_struct(old);
        return 0;
}

/*
 * proc handler for the "threads-max" sysctl.
 *
 * The request is served through a private copy of @table that points at a
 * local integer bounded to [1, MAX_THREADS]; max_threads itself is only
 * updated after a successful write.
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
static int sysctl_max_threads(const struct ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int lo = 1;
        int hi = MAX_THREADS;
        int val = max_threads;
        struct ctl_table tmp = *table;
        int ret;

        tmp.data = &val;
        tmp.extra1 = &lo;
        tmp.extra2 = &hi;

        ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
        if (ret)
                return ret;

        if (write)
                max_threads = val;

        return 0;
}

/*
 * sysctl entries registered by init_fork_sysctl() under "kernel".
 * .data is NULL because sysctl_max_threads() points its private copy of
 * the entry at a local variable instead.
 */
static const struct ctl_table fork_sysctl_table[] = {
        {
                .procname        = "threads-max",
                .data                = NULL,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = sysctl_max_threads,
        },
};

/*
 * Register fork_sysctl_table under the "kernel" sysctl directory.
 * Runs as a subsys_initcall and always returns 0.
 */
static int __init init_fork_sysctl(void)
{
        register_sysctl_init("kernel", fork_sysctl_table);
        return 0;
}

subsys_initcall(init_fork_sysctl);

































































































































































   46 
   45 






   15 















   46 









   46 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cryptographic API.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 */
#ifndef _CRYPTO_INTERNAL_H
#define _CRYPTO_INTERNAL_H

#include <crypto/algapi.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/jump_label.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/numa.h>
#include <linux/refcount.h>
#include <linux/rwsem.h>
#include <linux/scatterlist.h>
#include <linux/sched.h>
#include <linux/types.h>

struct crypto_instance;
struct crypto_template;

/*
 * Larval algorithm entry.
 *
 * NOTE(review): member roles are not visible in this header beyond what
 * crypto_is_test_larval() shows (a non-empty alg.cra_driver_name marks a
 * test larval); confirm the rest against the users of this structure.
 */
struct crypto_larval {
        /* Embedded algorithm object. */
        struct crypto_alg alg;
        /* presumably the real algorithm once available - TODO confirm */
        struct crypto_alg *adult;
        struct completion completion;
        u32 mask;
        bool test_started;
};

/*
 * Per-type ("frontend") operations and parameters; passed as the frontend
 * argument to crypto_create_tfm_node(), crypto_find_alg() and friends.
 *
 * NOTE(review): the exact contract of each callback and of the mask and
 * size fields is defined by the callers, not by this header - confirm
 * there before relying on it.
 */
struct crypto_type {
        /* Size callbacks. */
        unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask);
        unsigned int (*extsize)(struct crypto_alg *alg);
        /* Transform initialisation; returns an int status. */
        int (*init_tfm)(struct crypto_tfm *tfm);
        /* Reporting callbacks writing to a seq_file / sk_buff. */
        void (*show)(struct seq_file *m, struct crypto_alg *alg);
        int (*report)(struct sk_buff *skb, struct crypto_alg *alg);
        /* Teardown callbacks for instances and algorithms. */
        void (*free)(struct crypto_instance *inst);
        void (*destroy)(struct crypto_alg *alg);

        unsigned int type;
        unsigned int maskclear;
        unsigned int maskset;
        unsigned int tfmsize;
        unsigned int algsize;
};

/* Attribute identifiers; __CRYPTOA_MAX is a sentinel, not a valid id. */
enum {
        CRYPTOA_UNSPEC = 0,
        CRYPTOA_ALG = 1,
        CRYPTOA_TYPE = 2,
        __CRYPTOA_MAX,
};

/* Highest valid CRYPTOA_* identifier. */
#define CRYPTOA_MAX (__CRYPTOA_MAX - 1)

/* Maximum number of (rtattr) parameters for each template. */
#define CRYPTO_MAX_ATTRS 32

extern struct list_head crypto_alg_list;
extern struct rw_semaphore crypto_alg_sem;
extern struct blocking_notifier_head crypto_chain;

int alg_test(const char *driver, const char *alg, u32 type, u32 mask);

/*
 * Boot-time self-test completion tracking.
 *
 * Unless the algapi is built in and self-tests are enabled, the tests are
 * reported as always finished and the setter does nothing.  Otherwise the
 * state lives in a static key that starts out false.
 */
#if !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || !IS_ENABLED(CONFIG_CRYPTO_SELFTESTS)
static inline bool crypto_boot_test_finished(void)
{
        return true;
}
static inline void set_crypto_boot_test_finished(void)
{
}
#else
DECLARE_STATIC_KEY_FALSE(__crypto_boot_test_finished);
/* True once set_crypto_boot_test_finished() has enabled the key. */
static inline bool crypto_boot_test_finished(void)
{
        return static_branch_likely(&__crypto_boot_test_finished);
}
/* Enable the static key. */
static inline void set_crypto_boot_test_finished(void)
{
        static_branch_enable(&__crypto_boot_test_finished);
}
#endif /* !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) ||
        * !IS_ENABLED(CONFIG_CRYPTO_SELFTESTS)
        */

/*
 * procfs hooks: real implementations exist only with CONFIG_PROC_FS;
 * otherwise both calls compile to empty inline stubs.
 */
#ifdef CONFIG_PROC_FS
void __init crypto_init_proc(void);
void __exit crypto_exit_proc(void);
#else
static inline void crypto_init_proc(void)
{ }
static inline void crypto_exit_proc(void)
{ }
#endif

/* Context size for a cipher: the algorithm's cra_ctxsize, unmodified. */
static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg)
{
        return alg->cra_ctxsize;
}

/* Context size for a compressor: the algorithm's cra_ctxsize, unmodified. */
static inline unsigned int crypto_compress_ctxsize(struct crypto_alg *alg)
{
        return alg->cra_ctxsize;
}

struct crypto_alg *crypto_mod_get(struct crypto_alg *alg);
struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask);

struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask);
void crypto_schedule_test(struct crypto_larval *larval);
void crypto_alg_tested(const char *name, int err);

void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
                          struct crypto_alg *nalg);
void crypto_remove_final(struct list_head *list);
void crypto_shoot_alg(struct crypto_alg *alg);
struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type,
                                         u32 mask, gfp_t gfp);
struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
                                      u32 mask);
void *crypto_create_tfm_node(struct crypto_alg *alg,
                        const struct crypto_type *frontend, int node);
void *crypto_clone_tfm(const struct crypto_type *frontend,
                       struct crypto_tfm *otfm);

/*
 * Convenience wrapper around crypto_create_tfm_node() that passes
 * NUMA_NO_NODE as the node; the callee's result is returned as-is.
 */
static inline void *crypto_create_tfm(struct crypto_alg *alg,
                        const struct crypto_type *frontend)
{
        return crypto_create_tfm_node(alg, frontend, NUMA_NO_NODE);
}

struct crypto_alg *crypto_find_alg(const char *alg_name,
                                   const struct crypto_type *frontend,
                                   u32 type, u32 mask);

void *crypto_alloc_tfm_node(const char *alg_name,
                       const struct crypto_type *frontend, u32 type, u32 mask,
                       int node);

/*
 * Convenience wrapper around crypto_alloc_tfm_node() that passes
 * NUMA_NO_NODE as the node; the callee's result is returned as-is.
 */
static inline void *crypto_alloc_tfm(const char *alg_name,
                       const struct crypto_type *frontend, u32 type, u32 mask)
{
        return crypto_alloc_tfm_node(alg_name, frontend, type, mask, NUMA_NO_NODE);
}

int crypto_probing_notify(unsigned long val, void *v);

unsigned int crypto_alg_extsize(struct crypto_alg *alg);

int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
                        u32 type, u32 mask);

/*
 * Take an additional reference on @alg (cra_refcnt) and return @alg so the
 * call can be used inline.  Paired with crypto_alg_put().
 */
static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg)
{
        refcount_inc(&alg->cra_refcnt);
        return alg;
}

void crypto_destroy_alg(struct crypto_alg *alg);

/*
 * Drop one reference on @alg; the algorithm is handed to
 * crypto_destroy_alg() when the count reaches zero.
 */
static inline void crypto_alg_put(struct crypto_alg *alg)
{
        if (!refcount_dec_and_test(&alg->cra_refcnt))
                return;

        crypto_destroy_alg(alg);
}

/*
 * Try to take a reference on the module owning @tmpl.
 * Returns the result of try_module_get().
 */
static inline int crypto_tmpl_get(struct crypto_template *tmpl)
{
        return try_module_get(tmpl->module);
}

/* Release the module reference taken by crypto_tmpl_get(). */
static inline void crypto_tmpl_put(struct crypto_template *tmpl)
{
        module_put(tmpl->module);
}

/*
 * Non-zero if CRYPTO_ALG_LARVAL is set in @alg's flags.  The result is the
 * masked flag value, not a normalised 0/1.
 */
static inline int crypto_is_larval(struct crypto_alg *alg)
{
        return alg->cra_flags & CRYPTO_ALG_LARVAL;
}

/*
 * Non-zero if CRYPTO_ALG_DEAD is set in @alg's flags.  The result is the
 * masked flag value, not a normalised 0/1.
 */
static inline int crypto_is_dead(struct crypto_alg *alg)
{
        return alg->cra_flags & CRYPTO_ALG_DEAD;
}

/*
 * Non-zero if either CRYPTO_ALG_DEAD or CRYPTO_ALG_DYING is set in @alg's
 * flags.  The result is the masked flag value, not a normalised 0/1.
 */
static inline int crypto_is_moribund(struct crypto_alg *alg)
{
        return alg->cra_flags & (CRYPTO_ALG_DEAD | CRYPTO_ALG_DYING);
}

/*
 * Run the crypto_chain blocking notifier chain with event @val and
 * payload @v.  The chain's return value is discarded.
 */
static inline void crypto_notify(unsigned long val, void *v)
{
        blocking_notifier_call_chain(&crypto_chain, val, v);
}

/*
 * Offer the CPU to other tasks, but only when the request flags allow
 * sleeping (CRYPTO_TFM_REQ_MAY_SLEEP); otherwise do nothing.
 */
static inline void crypto_yield(u32 flags)
{
        if (!(flags & CRYPTO_TFM_REQ_MAY_SLEEP))
                return;

        cond_resched();
}

/*
 * Non-zero when the larval's embedded algorithm has a non-empty
 * cra_driver_name.  The value returned is the first character of that
 * name.
 */
static inline int crypto_is_test_larval(struct crypto_larval *larval)
{
        return larval->alg.cra_driver_name[0];
}

/*
 * Take a reference on @tfm unless its count is already zero.
 * Returns @tfm on success and ERR_PTR(-EOVERFLOW) otherwise.
 */
static inline struct crypto_tfm *crypto_tfm_get(struct crypto_tfm *tfm)
{
        if (!refcount_inc_not_zero(&tfm->refcnt))
                return ERR_PTR(-EOVERFLOW);

        return tfm;
}

#endif        /* _CRYPTO_INTERNAL_H */
















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BEN_VLAN_802_1Q_INC__
#define __BEN_VLAN_802_1Q_INC__

#include <linux/if_vlan.h>
#include <linux/u64_stats_sync.h>
#include <linux/list.h>

/* If this changes, the algorithm will have to be reworked because it
 * depends on completely exhausting the VLAN identifier space.  Thus
 * it gives constant-time look-up, but in many cases it wastes memory.
 *
 * The per-protocol device table is split into VLAN_GROUP_ARRAY_SPLIT_PARTS
 * arrays of VLAN_GROUP_ARRAY_PART_LEN entries each.
 */
#define VLAN_GROUP_ARRAY_SPLIT_PARTS  8
#define VLAN_GROUP_ARRAY_PART_LEN     (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS)

/*
 * Dense index for the supported VLAN protocols; used as the first
 * dimension of vlan_group.vlan_devices_arrays.
 */
enum vlan_protos {
        VLAN_PROTO_8021Q,       /* ETH_P_8021Q, must stay 0 */
        VLAN_PROTO_8021AD,      /* ETH_P_8021AD */
        VLAN_PROTO_NUM,         /* number of protocols, not a protocol */
};

/*
 * Lookup table from (protocol index, VLAN id) to the VLAN net_device.
 * Each protocol has VLAN_GROUP_ARRAY_SPLIT_PARTS separately allocated
 * arrays; a NULL part means no device is stored in that id range (see
 * __vlan_group_get_device()).
 */
struct vlan_group {
        unsigned int                nr_vlan_devs;
        struct hlist_node        hlist;        /* linked list */
        struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM]
                                               [VLAN_GROUP_ARRAY_SPLIT_PARTS];
};

/*
 * VLAN state hung off a real device; reached through real_dev->vlan_info
 * under RCU or RTNL (see vlan_find_dev()).
 */
struct vlan_info {
        struct net_device        *real_dev; /* The ethernet(like) device
                                            * the vlan is attached to.
                                            */
        /* Device lookup table for this real device. */
        struct vlan_group        grp;
        struct list_head        vid_list;
        unsigned int                nr_vids;
        bool                        auto_vid0;
        struct rcu_head                rcu;
};

/*
 * Map a network-order VLAN protocol to its enum vlan_protos index.
 * Returns -EINVAL (after emitting a warning) for any other protocol.
 */
static inline int vlan_proto_idx(__be16 proto)
{
        if (proto == htons(ETH_P_8021Q))
                return VLAN_PROTO_8021Q;
        if (proto == htons(ETH_P_8021AD))
                return VLAN_PROTO_8021AD;

        WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto));
        return -EINVAL;
}

/*
 * Look up the device stored for (@pidx, @vlan_id) in @vg.
 *
 * @pidx is used directly as an array index and is not range-checked here.
 * Returns NULL when the array part covering @vlan_id is NULL, otherwise
 * the slot content.
 */
static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg,
                                                         unsigned int pidx,
                                                         u16 vlan_id)
{
        struct net_device **array;

        array = vg->vlan_devices_arrays[pidx]
                                       [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];

        /* paired with smp_wmb() in vlan_group_prealloc_vid() */
        smp_rmb();

        return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL;
}

/*
 * Protocol-checked lookup: translate @vlan_proto to its index and fetch
 * the device for @vlan_id.  Returns NULL for an unsupported protocol.
 */
static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
                                                       __be16 vlan_proto,
                                                       u16 vlan_id)
{
        const int pidx = vlan_proto_idx(vlan_proto);

        return pidx < 0 ? NULL : __vlan_group_get_device(vg, pidx, vlan_id);
}

/*
 * Store @dev in the slot for (@vlan_proto, @vlan_id).  Does nothing when
 * @vg is NULL or the protocol is unsupported.  The array part covering
 * @vlan_id is dereferenced without a NULL check.
 */
static inline void vlan_group_set_device(struct vlan_group *vg,
                                         __be16 vlan_proto, u16 vlan_id,
                                         struct net_device *dev)
{
        const int pidx = vlan_proto_idx(vlan_proto);

        if (!vg || pidx < 0)
                return;

        vg->vlan_devices_arrays[pidx][vlan_id / VLAN_GROUP_ARRAY_PART_LEN]
                               [vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
}

/*
 * Find the VLAN device for (@vlan_proto, @vlan_id) on @real_dev, or NULL
 * if @real_dev has no VLAN info or no such device.
 *
 * Must be invoked with rcu_read_lock or with RTNL.
 */
static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
                                               __be16 vlan_proto, u16 vlan_id)
{
        struct vlan_info *info = rcu_dereference_rtnl(real_dev->vlan_info);

        if (!info)
                return NULL;

        return vlan_group_get_device(&info->grp, vlan_proto, vlan_id);
}

/*
 * Derive features from @real_dev's hw_enc_features.
 *
 * Only the checksum, software-GSO and GSO-encap bits are considered.  If
 * both an encap GSO bit and a checksum bit are present, the checksum bits
 * are replaced by NETIF_F_HW_CSUM; otherwise the result is 0.
 */
static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev)
{
        netdev_features_t feats;

        feats = real_dev->hw_enc_features &
                (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE |
                 NETIF_F_GSO_ENCAP_ALL);

        if (!(feats & NETIF_F_GSO_ENCAP_ALL) || !(feats & NETIF_F_CSUM_MASK))
                return 0;

        return (feats & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM;
}

/*
 * Iterate over every device stored in @grp.
 *
 * @grp: struct vlan_group pointer
 * @i:   integer lvalue used as the cursor; it walks all
 *       VLAN_PROTO_NUM * VLAN_N_VID (protocol, vid) combinations
 * @dev: struct net_device pointer lvalue, set for each non-NULL slot
 *
 * The macro expands to "for (...) if (...)", so the statement that follows
 * runs only for populated slots.  Every use of @i is parenthesized so that
 * an expression such as *p can be passed as the cursor.
 */
#define vlan_group_for_each_dev(grp, i, dev) \
        for ((i) = 0; (i) < VLAN_PROTO_NUM * VLAN_N_VID; (i)++) \
                if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
                                                            (i) % VLAN_N_VID)))

int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto);
void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto);

/* found in vlan_dev.c */
void vlan_dev_set_ingress_priority(const struct net_device *dev,
                                   u32 skb_prio, u16 vlan_prio);
int vlan_dev_set_egress_priority(const struct net_device *dev,
                                 u32 skb_prio, u16 vlan_prio);
void vlan_dev_free_egress_priority(const struct net_device *dev);
int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
void vlan_dev_get_realdev_name(const struct net_device *dev, char *result,
                               size_t size);

int vlan_check_real_dev(struct net_device *real_dev,
                        __be16 protocol, u16 vlan_id,
                        struct netlink_ext_ack *extack);
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
bool vlan_dev_inherit_address(struct net_device *dev,
                              struct net_device *real_dev);

/*
 * Translate the priority bits of @vlan_tci (the 3 bits above
 * VLAN_PRIO_SHIFT) through @dev's ingress priority map.
 */
static inline u32 vlan_get_ingress_priority(struct net_device *dev,
                                            u16 vlan_tci)
{
        const unsigned int prio = (vlan_tci >> VLAN_PRIO_SHIFT) & 0x7;

        return vlan_dev_priv(dev)->ingress_priority_map[prio];
}

/*
 * GVRP hooks.  Without CONFIG_VLAN_8021Q_GVRP they are inline stubs: the
 * int-returning ones report success (0) and the void ones do nothing.
 */
#ifdef CONFIG_VLAN_8021Q_GVRP
int vlan_gvrp_request_join(const struct net_device *dev);
void vlan_gvrp_request_leave(const struct net_device *dev);
int vlan_gvrp_init_applicant(struct net_device *dev);
void vlan_gvrp_uninit_applicant(struct net_device *dev);
int vlan_gvrp_init(void);
void vlan_gvrp_uninit(void);
#else
static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; }
static inline void vlan_gvrp_request_leave(const struct net_device *dev) {}
static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; }
static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {}
static inline int vlan_gvrp_init(void) { return 0; }
static inline void vlan_gvrp_uninit(void) {}
#endif

/*
 * MVRP hooks.  Without CONFIG_VLAN_8021Q_MVRP they are inline stubs: the
 * int-returning ones report success (0) and the void ones do nothing.
 */
#ifdef CONFIG_VLAN_8021Q_MVRP
int vlan_mvrp_request_join(const struct net_device *dev);
void vlan_mvrp_request_leave(const struct net_device *dev);
int vlan_mvrp_init_applicant(struct net_device *dev);
void vlan_mvrp_uninit_applicant(struct net_device *dev);
int vlan_mvrp_init(void);
void vlan_mvrp_uninit(void);
#else
static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; }
static inline void vlan_mvrp_request_leave(const struct net_device *dev) {}
static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; }
static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {}
static inline int vlan_mvrp_init(void) { return 0; }
static inline void vlan_mvrp_uninit(void) {}
#endif

extern const char vlan_fullname[];
extern const char vlan_version[];
int vlan_netlink_init(void);
void vlan_netlink_fini(void);

extern struct rtnl_link_ops vlan_link_ops;

extern unsigned int vlan_net_id;

struct proc_dir_entry;

/*
 * VLAN procfs handles and naming configuration.
 * NOTE(review): presumably one instance per network namespace, located
 * via vlan_net_id - confirm against the users.
 */
struct vlan_net {
        /* /proc/net/vlan */
        struct proc_dir_entry *proc_vlan_dir;
        /* /proc/net/vlan/config */
        struct proc_dir_entry *proc_vlan_conf;
        /* Determines interface naming scheme. */
        unsigned short name_type;
};

#endif /* !(__BEN_VLAN_802_1Q_INC__) */


























   85 


















































































   85 










  311 






   85 































  311 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_IRQFLAGS_H_
#define _X86_IRQFLAGS_H_

#include <asm/processor-flags.h>

#ifndef __ASSEMBLER__

#include <asm/nospec-branch.h>

/*
 * Interrupt control:
 */

/*
 * native_save_fl - return the current value of the flags register,
 * obtained by pushing it with pushf and popping it into the result.
 */
/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
extern __always_inline unsigned long native_save_fl(void)
{
        unsigned long flags;

        /*
         * "=rm" is safe here, because "pop" adjusts the stack before
         * it evaluates its effective address -- this is part of the
         * documented behavior of the "pop" instruction.
         */
        asm volatile("# __raw_save_flags\n\t"
                     "pushf ; pop %0"
                     : "=rm" (flags)
                     : /* no input */
                     : "memory");

        return flags;
}

/* Execute "cli"; the "memory" clobber also makes it a compiler barrier. */
static __always_inline void native_irq_disable(void)
{
        asm volatile("cli": : :"memory");
}

/* Execute "sti"; the "memory" clobber also makes it a compiler barrier. */
static __always_inline void native_irq_enable(void)
{
        asm volatile("sti": : :"memory");
}

/*
 * Call x86_idle_clear_cpu_buffers(), then execute "sti; hlt" as a single
 * asm statement.
 */
static __always_inline void native_safe_halt(void)
{
        x86_idle_clear_cpu_buffers();
        asm volatile("sti; hlt": : :"memory");
}

/*
 * Call x86_idle_clear_cpu_buffers(), then execute "hlt".  Unlike
 * native_safe_halt() this does not execute "sti" first.
 */
static __always_inline void native_halt(void)
{
        x86_idle_clear_cpu_buffers();
        asm volatile("hlt": : :"memory");
}

/* Returns 1 if X86_EFLAGS_IF is clear in @flags, 0 if it is set. */
static __always_inline int native_irqs_disabled_flags(unsigned long flags)
{
        return (flags & X86_EFLAGS_IF) == 0;
}

/*
 * Capture the flags register, then disable interrupts.  The captured
 * value is returned for a later native_local_irq_restore().
 */
static __always_inline unsigned long native_local_irq_save(void)
{
        const unsigned long saved = native_save_fl();

        native_irq_disable();
        return saved;
}

/*
 * Re-enable interrupts if @flags was captured with them enabled.  If
 * @flags says they were disabled, nothing is done.
 */
static __always_inline void native_local_irq_restore(unsigned long flags)
{
        if (native_irqs_disabled_flags(flags))
                return;

        native_irq_enable();
}

#endif

/*
 * Non-paravirt halt helpers.  They call native_safe_halt()/native_halt(),
 * which this header only defines for C code, so they sit behind the same
 * __ASSEMBLER__ guard as every other C-only section here.
 */
#ifndef CONFIG_PARAVIRT
#ifndef __ASSEMBLER__
/*
 * Used in the idle loop; sti takes one instruction cycle
 * to complete:
 */
static __always_inline void arch_safe_halt(void)
{
        native_safe_halt();
}

/*
 * Used when interrupts are already enabled or to
 * shutdown the processor:
 */
static __always_inline void halt(void)
{
        native_halt();
}
#endif /* __ASSEMBLER__ */
#endif /* CONFIG_PARAVIRT */

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLER__
#include <linux/types.h>

/* Non-PARAVIRT_XXL variant: returns native_save_fl() directly. */
static __always_inline unsigned long arch_local_save_flags(void)
{
        return native_save_fl();
}

/* Non-PARAVIRT_XXL variant: calls native_irq_disable() directly. */
static __always_inline void arch_local_irq_disable(void)
{
        native_irq_disable();
}

/* Non-PARAVIRT_XXL variant: calls native_irq_enable() directly. */
static __always_inline void arch_local_irq_enable(void)
{
        native_irq_enable();
}

/*
 * For spinlocks, etc: capture the current flags, disable interrupts and
 * return the captured value.
 */
static __always_inline unsigned long arch_local_irq_save(void)
{
        const unsigned long saved = arch_local_save_flags();

        arch_local_irq_disable();

        return saved;
}
#else

#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
/* Assembly-only helper: push the flags register and pop it into %rax. */
#define SAVE_FLAGS                pushfq; popq %rax
#endif

#endif

#endif /* __ASSEMBLER__ */
#endif /* CONFIG_PARAVIRT_XXL */

#ifndef __ASSEMBLER__
/* Returns 1 if X86_EFLAGS_IF is clear in @flags, 0 if it is set. */
static __always_inline int arch_irqs_disabled_flags(unsigned long flags)
{
        return (flags & X86_EFLAGS_IF) == 0;
}

/* Returns 1 if the current flags have X86_EFLAGS_IF clear, 0 otherwise. */
static __always_inline int arch_irqs_disabled(void)
{
        return arch_irqs_disabled_flags(arch_local_save_flags());
}

/*
 * Re-enable interrupts if @flags was captured with them enabled.  If
 * @flags says they were disabled, nothing is done.
 */
static __always_inline void arch_local_irq_restore(unsigned long flags)
{
        if (arch_irqs_disabled_flags(flags))
                return;

        arch_local_irq_enable();
}
#endif /* !__ASSEMBLER__ */

#endif




































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_

#include <linux/mmzone.h>
#include <linux/range.h>
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>

struct resource;
struct device;

/**
 * struct vmem_altmap - pre-allocated storage for vmemmap_populate
 * @base_pfn: base of the entire dev_pagemap mapping
 * @end_pfn: NOTE(review): not described in the original comment;
 *      presumably the last pfn of the mapping - confirm with the users
 * @reserve: pages mapped, but reserved for driver use (relative to
 *      @base_pfn)
 * @free: free pages set aside in the mapping for memmap storage
 * @align: pages reserved to meet allocation alignments
 * @alloc: track pages consumed, private to vmemmap_populate()
 * @inaccessible: NOTE(review): not described in the original comment;
 *      meaning must be confirmed with the users of this flag
 */
struct vmem_altmap {
        unsigned long base_pfn;
        const unsigned long end_pfn;
        const unsigned long reserve;
        unsigned long free;
        unsigned long align;
        unsigned long alloc;
        bool inaccessible;
};

/*
 * Specialize ZONE_DEVICE memory into multiple types each has a different
 * usage.
 *
 * MEMORY_DEVICE_PRIVATE:
 * Device memory that is not directly addressable by the CPU: CPU can neither
 * read nor write private memory. In this case, we do still have struct pages
 * backing the device memory. Doing so simplifies the implementation, but it is
 * important to remember that there are certain points at which the struct page
 * must be treated as an opaque object, rather than a "normal" struct page.
 *
 * A more complete discussion of unaddressable memory may be found in
 * include/linux/hmm.h and Documentation/mm/hmm.rst.
 *
 * MEMORY_DEVICE_COHERENT:
 * Device memory that is cache coherent from device and CPU point of view. This
 * is used on platforms that have an advanced system bus (like CAPI or CXL). A
 * driver can hotplug the device memory using ZONE_DEVICE and with that memory
 * type. Any page of a process can be migrated to such memory. However no one
 * should be allowed to pin such memory so that it can always be evicted.
 *
 * MEMORY_DEVICE_FS_DAX:
 * Host memory that has similar access semantics as System RAM i.e. DMA
 * coherent and supports page pinning. In support of coordinating page
 * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
 * wakeup event whenever a page is unpinned and becomes idle. This
 * wakeup is used to coordinate physical address space management (ex:
 * fs truncate/hole punch) vs pinned pages (ex: device dma).
 *
 * MEMORY_DEVICE_GENERIC:
 * Host memory that has similar access semantics as System RAM i.e. DMA
 * coherent and supports page pinning. This is for example used by DAX devices
 * that expose memory using a character device.
 *
 * MEMORY_DEVICE_PCI_P2PDMA:
 * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
 * transactions.
 */
enum memory_type {
        /* Value 0 is left unused so an uninitialized type field is caught. */
        MEMORY_DEVICE_PRIVATE           = 1,
        MEMORY_DEVICE_COHERENT          = 2,
        MEMORY_DEVICE_FS_DAX            = 3,
        MEMORY_DEVICE_GENERIC           = 4,
        MEMORY_DEVICE_PCI_P2PDMA        = 5,
};

/*
 * Callback table for a ZONE_DEVICE mapping; struct dev_pagemap points at one
 * of these through its ops member.
 */
struct dev_pagemap_ops {
        /*
         * Called once the page refcount reaches 0.  The reference count will be
         * reset to one by the core code after the method is called to prepare
         * for handing out the page again.
         */
        void (*page_free)(struct page *page);

        /*
         * Used for private (un-addressable) device memory only.  Must migrate
         * the page back to a CPU accessible page.
         */
        vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);

        /*
         * Handle a memory failure that happened on a range of pfns.  Notify
         * the processes that are using these pfns, and try to recover the data
         * on them if necessary.  The mf_flags value is passed through the
         * whole notify routine down to the recovery function.
         *
         * When this is not implemented, or it returns -EOPNOTSUPP, the caller
         * will fall back to a common handler called mf_generic_kill_procs().
         */
        int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
                              unsigned long nr_pages, int mf_flags);
};

/* Bit in dev_pagemap.flags: pgmap_altmap() returns the altmap only when set. */
#define PGMAP_ALTMAP_VALID        (1 << 0)

/**
 * struct dev_pagemap - metadata for ZONE_DEVICE mappings
 * @altmap: pre-allocated/reserved memory for vmemmap allocations
 * @ref: reference count that pins the devm_memremap_pages() mapping
 * @done: completion for @ref
 * @type: memory type: see MEMORY_* above in memremap.h
 * @flags: PGMAP_* flags to specify detailed behavior
 * @vmemmap_shift: structural definition of how the vmemmap page metadata
 *        is populated, specifically the metadata page order.
 *        A zero value (default) uses base pages as the vmemmap metadata
 *        representation. A bigger value will set up compound struct pages
 *        of the requested order value.
 * @ops: method table
 * @owner: an opaque pointer identifying the entity that manages this
 *        instance.  Used by various helpers to make sure that no
 *        foreign ZONE_DEVICE memory is accessed.
 * @nr_range: number of ranges to be mapped
 * @range: range to be mapped when nr_range == 1
 * @ranges: array of ranges to be mapped when nr_range > 1
 */
struct dev_pagemap {
        struct vmem_altmap altmap;
        struct percpu_ref ref;
        struct completion done;
        enum memory_type type;
        unsigned int flags;
        unsigned long vmemmap_shift;
        const struct dev_pagemap_ops *ops;
        void *owner;
        int nr_range;
        /* @range and @ranges overlay each other; @nr_range selects the view */
        union {
                struct range range;
                DECLARE_FLEX_ARRAY(struct range, ranges);
        };
};

static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap)
{
        return pgmap->ops && pgmap->ops->memory_failure;
}

/*
 * Return the altmap embedded in @pgmap, or NULL when PGMAP_ALTMAP_VALID is
 * not set in @pgmap->flags.
 */
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
{
        return (pgmap->flags & PGMAP_ALTMAP_VALID) ? &pgmap->altmap : NULL;
}

/*
 * Return 2^vmemmap_shift for @pgmap.
 *
 * The constant is written as 1UL so the shift is carried out in unsigned
 * long, the function's return type.  With a plain int 1 the expression has
 * type int, and a shift count of 31 or more is undefined behavior.
 */
static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
{
        return 1UL << pgmap->vmemmap_shift;
}

/*
 * True when CONFIG_DEVICE_PRIVATE is enabled, @folio is a ZONE_DEVICE folio
 * and its pagemap is of type MEMORY_DEVICE_PRIVATE.
 */
static inline bool folio_is_device_private(const struct folio *folio)
{
        if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE))
                return false;
        if (!folio_is_zone_device(folio))
                return false;

        return folio->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

/* Page flavour of folio_is_device_private(): checks the folio owning @page. */
static inline bool is_device_private_page(const struct page *page)
{
        if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE))
                return false;

        return folio_is_device_private(page_folio(page));
}

/*
 * True when CONFIG_PCI_P2PDMA is enabled, @folio is a ZONE_DEVICE folio and
 * its pagemap is of type MEMORY_DEVICE_PCI_P2PDMA.
 */
static inline bool folio_is_pci_p2pdma(const struct folio *folio)
{
        if (!IS_ENABLED(CONFIG_PCI_P2PDMA))
                return false;
        if (!folio_is_zone_device(folio))
                return false;

        return folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}

/* Page flavour of folio_is_pci_p2pdma(): checks the folio owning @page. */
static inline bool is_pci_p2pdma_page(const struct page *page)
{
        if (!IS_ENABLED(CONFIG_PCI_P2PDMA))
                return false;

        return folio_is_pci_p2pdma(page_folio(page));
}

/*
 * True when @folio is a ZONE_DEVICE folio whose pagemap is of type
 * MEMORY_DEVICE_COHERENT.
 */
static inline bool folio_is_device_coherent(const struct folio *folio)
{
        if (!folio_is_zone_device(folio))
                return false;

        return folio->pgmap->type == MEMORY_DEVICE_COHERENT;
}

/* Page flavour of folio_is_device_coherent(): checks the folio owning @page. */
static inline bool is_device_coherent_page(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_is_device_coherent(folio);
}

/*
 * True when @folio is a ZONE_DEVICE folio whose pagemap is of type
 * MEMORY_DEVICE_FS_DAX.
 */
static inline bool folio_is_fsdax(const struct folio *folio)
{
        if (!folio_is_zone_device(folio))
                return false;

        return folio->pgmap->type == MEMORY_DEVICE_FS_DAX;
}

/* Page flavour of folio_is_fsdax(): checks the folio owning @page. */
static inline bool is_fsdax_page(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_is_fsdax(folio);
}

#ifdef CONFIG_ZONE_DEVICE
/*
 * Out-of-line ZONE_DEVICE interfaces, declared only when CONFIG_ZONE_DEVICE
 * is enabled.  The #else branch below provides inline fallbacks for
 * devm_memremap_pages(), devm_memunmap_pages(), get_dev_pagemap(),
 * pgmap_pfn_valid() and memremap_compat_align().
 */
void zone_device_page_init(struct page *page);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
struct dev_pagemap *get_dev_pagemap(unsigned long pfn);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);

unsigned long memremap_compat_align(void);
#else
/*
 * Fallback for builds without CONFIG_ZONE_DEVICE: warns once and returns
 * ERR_PTR(-ENXIO) without touching @dev or @pgmap.
 */
static inline void *devm_memremap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
{
        /*
         * Fail attempts to call devm_memremap_pages() without
         * ZONE_DEVICE support enabled, this requires callers to fall
         * back to plain devm_memremap() based on config
         */
        WARN_ON_ONCE(1);
        return ERR_PTR(-ENXIO);
}

/* Fallback for builds without CONFIG_ZONE_DEVICE: does nothing. */
static inline void devm_memunmap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
{
}

/* Fallback for builds without CONFIG_ZONE_DEVICE: always returns NULL. */
static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn)
{
        return NULL;
}

/* Fallback for builds without CONFIG_ZONE_DEVICE: always returns false. */
static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
{
        return false;
}

/*
 * Fallback for builds without CONFIG_ZONE_DEVICE: when memremap_pages() is
 * disabled all archs can remap a single page, so the alignment is PAGE_SIZE.
 */
static inline unsigned long memremap_compat_align(void)
{
        return PAGE_SIZE;
}
#endif /* CONFIG_ZONE_DEVICE */

/*
 * Drop one reference on @pgmap->ref via percpu_ref_put().  A NULL @pgmap is
 * accepted and ignored.
 */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
{
        if (!pgmap)
                return;

        percpu_ref_put(&pgmap->ref);
}

#endif /* _LINUX_MEMREMAP_H_ */

















































  319 



   54 


  319 

  316 


  319 



  319 
  316 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// SPDX-License-Identifier: GPL-2.0
#include <linux/memblock.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
#include <linux/mm.h>

#include <asm/page.h>
#include <linux/vmalloc.h>

#include "physaddr.h"

#ifdef CONFIG_X86_64

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * Debug (CONFIG_DEBUG_VIRTUAL) virtual-to-physical translation for x86_64.
 *
 * y = x - __START_KERNEL_map is computed in unsigned arithmetic, so it wraps
 * when x lies below __START_KERNEL_map.  "x > y" therefore means "no wrap":
 *  - no wrap: the result is y + phys_base, and y must be below
 *    KERNEL_IMAGE_SIZE;
 *  - wrap:    the result is y + (__START_KERNEL_map - PAGE_OFFSET), i.e.
 *    x - PAGE_OFFSET, which must not wrap itself and must satisfy
 *    phys_addr_valid().
 * A violated condition is reported through VIRTUAL_BUG_ON().
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long y = x - __START_KERNEL_map;

        /* use the carry flag to determine if x was < __START_KERNEL_map */
        if (unlikely(x > y)) {
                x = y + phys_base;

                VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
        } else {
                x = y + (__START_KERNEL_map - PAGE_OFFSET);

                /* carry flag will be set if starting x was >= PAGE_OFFSET */
                VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
        }

        return x;
}
EXPORT_SYMBOL(__phys_addr);

/*
 * Debug translation of an address relative to __START_KERNEL_map: returns
 * (x - __START_KERNEL_map) + phys_base after checking the offset against
 * KERNEL_IMAGE_SIZE.
 */
unsigned long __phys_addr_symbol(unsigned long x)
{
        const unsigned long image_off = x - __START_KERNEL_map;

        /*
         * An x below __START_KERNEL_map wraps image_off around to a huge
         * value, so the upper-bound test alone rejects it as well.
         */
        VIRTUAL_BUG_ON(image_off >= KERNEL_IMAGE_SIZE);

        return phys_base + image_off;
}
EXPORT_SYMBOL(__phys_addr_symbol);
#endif

/*
 * x86_64 check of whether virtual address x translates to a valid pfn.
 *
 * The address is translated with the same unsigned wrap-around test as the
 * debug __phys_addr() in this file, but a failed range check returns false
 * instead of triggering a bug.  When the range checks pass, the result is
 * pfn_valid() of the translated address shifted right by PAGE_SHIFT.
 */
bool __virt_addr_valid(unsigned long x)
{
        unsigned long y = x - __START_KERNEL_map;

        /* use the carry flag to determine if x was < __START_KERNEL_map */
        if (unlikely(x > y)) {
                x = y + phys_base;

                if (y >= KERNEL_IMAGE_SIZE)
                        return false;
        } else {
                x = y + (__START_KERNEL_map - PAGE_OFFSET);

                /* carry flag will be set if starting x was >= PAGE_OFFSET */
                if ((x > y) || !phys_addr_valid(x))
                        return false;
        }

        return pfn_valid(x >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#else

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * Debug (CONFIG_DEBUG_VIRTUAL) translation for the non-x86_64 build: returns
 * x - PAGE_OFFSET after a series of sanity checks on x.
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long paddr;

        /* VMALLOC_* aren't constants  */
        VIRTUAL_BUG_ON(x < PAGE_OFFSET);
        VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));

        paddr = x - PAGE_OFFSET;

        /* max_low_pfn is set early, but not _that_ early */
        if (!max_low_pfn)
                return paddr;

        VIRTUAL_BUG_ON((paddr >> PAGE_SHIFT) > max_low_pfn);
        /* cross-check the result against slow_virt_to_phys() */
        BUG_ON(slow_virt_to_phys((void *)x) != paddr);

        return paddr;
}
EXPORT_SYMBOL(__phys_addr);
#endif

/*
 * Non-x86_64 check of whether virtual address x translates to a valid pfn.
 * Addresses below PAGE_OFFSET, vmalloc addresses (once __vmalloc_start_set
 * is true) and addresses at or above FIXADDR_START are rejected; anything
 * else is decided by pfn_valid() on (x - PAGE_OFFSET) >> PAGE_SHIFT.
 */
bool __virt_addr_valid(unsigned long x)
{
        if (x < PAGE_OFFSET ||
            (__vmalloc_start_set && is_vmalloc_addr((void *) x)) ||
            x >= FIXADDR_START)
                return false;

        return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#endif        /* CONFIG_X86_64 */
























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NETFILTER_NETDEV_H_
#define _NETFILTER_NETDEV_H_

#include <linux/netfilter.h>
#include <linux/netdevice.h>

#ifdef CONFIG_NETFILTER_INGRESS
/*
 * True when the device @skb arrived on has a non-NULL nf_hooks_ingress
 * pointer.  With CONFIG_JUMP_LABEL the NFPROTO_NETDEV/NF_NETDEV_INGRESS
 * static key is tested first, and false is returned if it is not enabled.
 */
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
#ifdef CONFIG_JUMP_LABEL
        if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
                return false;
#endif
        return rcu_access_pointer(skb->dev->nf_hooks_ingress) != NULL;
}

/* caller must hold rcu_read_lock */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
        struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress);
        struct nf_hook_state state;
        int ret;

        /* Must recheck the ingress hook head, in the event it became NULL
         * after the check in nf_hook_ingress_active evaluated to true.
         */
        if (unlikely(!e))
                return 0;

        nf_hook_state_init(&state, NF_NETDEV_INGRESS,
                           NFPROTO_NETDEV, skb->dev, NULL, NULL,
                           dev_net(skb->dev), NULL);
        ret = nf_hook_slow(skb, &state, e, 0);
        if (ret == 0)
                return -1;

        return ret;
}

#else /* CONFIG_NETFILTER_INGRESS */
/*
 * Stub for builds without CONFIG_NETFILTER_INGRESS: ingress hooks are never
 * active.  The signature mirrors the CONFIG_NETFILTER_INGRESS variant (bool
 * result, const skb) so callers see the same prototype in both
 * configurations.
 */
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
        return false;
}

/* Stub for builds without CONFIG_NETFILTER_INGRESS: always returns 0. */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
        return 0;
}
#endif /* CONFIG_NETFILTER_INGRESS */

#ifdef CONFIG_NETFILTER_EGRESS
/*
 * With CONFIG_JUMP_LABEL, report the state of the
 * NFPROTO_NETDEV/NF_NETDEV_EGRESS static key; without it, always report true.
 */
static inline bool nf_hook_egress_active(void)
{
#ifdef CONFIG_JUMP_LABEL
        return static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS]);
#else
        return true;
#endif
}

/**
 * nf_hook_egress - classify packets before transmission
 * @skb: packet to be classified
 * @rc: result code which shall be returned by __dev_queue_xmit() on failure
 * @dev: netdev whose egress hooks shall be applied to @skb
 *
 * Caller must hold rcu_read_lock.
 *
 * On ingress, packets are classified first by tc, then by netfilter.
 * On egress, the order is reversed for symmetry.  Conceptually, tc and
 * netfilter can be thought of as layers, with netfilter layered above tc:
 * When tc redirects a packet to another interface, netfilter is not applied
 * because the packet is on the tc layer.
 *
 * The nf_skip_egress flag controls whether netfilter is applied on egress.
 * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the
 * packet passes through tc and netfilter.  Because __dev_queue_xmit() may be
 * called recursively by tunnel drivers such as vxlan, the flag is reverted to
 * false after sch_handle_egress().  This ensures that netfilter is applied
 * both on the overlay and underlying network.
 *
 * Returns: @skb on success or %NULL if the packet was consumed or filtered.
 */
static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
                                             struct net_device *dev)
{
        struct nf_hook_entries *hooks;
        struct nf_hook_state state;
        int verdict;

#ifdef CONFIG_NETFILTER_SKIP_EGRESS
        /* packet is flagged to bypass the egress hooks */
        if (skb->nf_skip_egress)
                return skb;
#endif

        hooks = rcu_dereference_check(dev->nf_hooks_egress,
                                      rcu_read_lock_bh_held());
        if (!hooks)
                return skb;

        nf_hook_state_init(&state, NF_NETDEV_EGRESS,
                           NFPROTO_NETDEV, NULL, dev, NULL,
                           dev_net(dev), NULL);

        /* nf assumes rcu_read_lock, not just read_lock_bh */
        rcu_read_lock();
        verdict = nf_hook_slow(skb, &state, hooks, 0);
        rcu_read_unlock();

        /* a verdict of 1 hands the packet back to the caller */
        if (verdict == 1)
                return skb;

        /* negative verdict: report a drop; anything else: report success */
        *rc = verdict < 0 ? NET_XMIT_DROP : NET_XMIT_SUCCESS;
        return NULL;
}
#else /* CONFIG_NETFILTER_EGRESS */
/* Stub for builds without CONFIG_NETFILTER_EGRESS: always returns false. */
static inline bool nf_hook_egress_active(void)
{
        return false;
}

/*
 * Stub for builds without CONFIG_NETFILTER_EGRESS: returns @skb unchanged
 * and never writes to @rc.
 */
static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
                                             struct net_device *dev)
{
        return skb;
}
#endif /* CONFIG_NETFILTER_EGRESS */

/*
 * Store @skip in skb->nf_skip_egress.  The body is empty unless
 * CONFIG_NETFILTER_SKIP_EGRESS is defined.
 */
static inline void nf_skip_egress(struct sk_buff *skb, bool skip)
{
#ifdef CONFIG_NETFILTER_SKIP_EGRESS
        skb->nf_skip_egress = skip;
#endif
}

/*
 * Set the netfilter hook pointers of @dev to NULL.  Each pointer is only
 * touched when the matching CONFIG_NETFILTER_{INGRESS,EGRESS} option is
 * defined.
 */
static inline void nf_hook_netdev_init(struct net_device *dev)
{
#ifdef CONFIG_NETFILTER_EGRESS
        RCU_INIT_POINTER(dev->nf_hooks_egress, NULL);
#endif
#ifdef CONFIG_NETFILTER_INGRESS
        RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
#endif
}

#endif /* _NETFILTER_NETDEV_H_ */






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  313 














  317 
  317 

  314 




























  319 






  312 





  318 

  319 
  316 






  319 



  315 
  316 


  318 


























  317 




  318 












  316 






  318 





  319 



































































































































































































































































































































  315 















































































































  316 























































  318 














  316 

































































  319 

























  313 

  315 






  316 





















































































  313 









  316 

  319 

  319 



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <linux/filter.h>
#include <net/dsa.h>
#include <net/dst_metadata.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/gre.h>
#include <net/pptp.h>
#include <net/tipc.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/stddef.h>
#include <linux/if_ether.h>
#include <linux/if_hsr.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
#include <net/pkt_cls.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_labels.h>
#endif
#include <linux/bpf-netns.h>

/* Set the bit for @key_id in @flow_dissector->used_keys. */
static void dissector_set_key(struct flow_dissector *flow_dissector,
                              enum flow_dissector_key_id key_id)
{
        const unsigned long long key_bit = 1ULL << key_id;

        flow_dissector->used_keys |= key_bit;
}

void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
                             const struct flow_dissector_key *key,
                             unsigned int key_count)
{
        unsigned int i;

        memset(flow_dissector, 0, sizeof(*flow_dissector));

        for (i = 0; i < key_count; i++, key++) {
                /* User should make sure that every key target offset is within
                 * boundaries of unsigned short.
                 */
                BUG_ON(key->offset > USHRT_MAX);
                BUG_ON(dissector_uses_key(flow_dissector,
                                          key->key_id));

                dissector_set_key(flow_dissector, key->key_id);
                flow_dissector->offset[key->key_id] = key->offset;
        }

        /* Ensure that the dissector always includes control and basic key.
         * That way we are able to avoid handling lack of these in fast path.
         */
        BUG_ON(!dissector_uses_key(flow_dissector,
                                   FLOW_DISSECTOR_KEY_CONTROL));
        BUG_ON(!dissector_uses_key(flow_dissector,
                                   FLOW_DISSECTOR_KEY_BASIC));
}
EXPORT_SYMBOL(skb_flow_dissector_init);

#ifdef CONFIG_BPF_SYSCALL
/*
 * Check whether a flow dissector BPF program may be attached to @net.
 *
 * Returns -EEXIST when the attachment would conflict with an existing one
 * (see the comments below), 0 otherwise.  @prog is not examined.
 */
int flow_dissector_bpf_prog_attach_check(struct net *net,
                                         struct bpf_prog *prog)
{
        enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
        struct net *ns;

        if (net != &init_net) {
                /* Make sure root flow dissector is not attached
                 * when attaching to the non-root namespace.
                 */
                return rcu_access_pointer(init_net.bpf.run_array[type]) ?
                        -EEXIST : 0;
        }

        /* BPF flow dissector in the root namespace overrides
         * any per-net-namespace one. When attaching to root,
         * make sure we don't have any BPF program attached
         * to the non-root namespaces.
         */
        for_each_net(ns) {
                if (ns != &init_net &&
                    rcu_access_pointer(ns->bpf.run_array[type]))
                        return -EEXIST;
        }

        return 0;
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * skb_flow_get_ports - extract the upper layer ports and return them
 * @skb: sk_buff to extract the ports from
 * @thoff: transport header offset
 * @ip_proto: protocol for which to get port offset
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 *
 * The function will try to retrieve the ports at offset thoff + poff where poff
 * is the protocol port offset returned from proto_ports_offset
 */
__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                          const void *data, int hlen)
{
        const int port_off = proto_ports_offset(ip_proto);
        __be32 *found, scratch;

        /* no raw buffer supplied: fall back to the skb's own data */
        if (!data) {
                data = skb->data;
                hlen = skb_headlen(skb);
        }

        /* negative offset from proto_ports_offset(): nothing to extract */
        if (port_off < 0)
                return 0;

        found = __skb_header_pointer(skb, thoff + port_off, sizeof(scratch),
                                     data, hlen, &scratch);

        return found ? *found : 0;
}
EXPORT_SYMBOL(skb_flow_get_ports);

/*
 * True for the ICMP/ICMPv6 message types listed below (echo request/reply
 * and timestamp request/reply), false for any other @type.
 */
static bool icmp_has_id(u8 type)
{
        return type == ICMP_ECHO ||
               type == ICMP_ECHOREPLY ||
               type == ICMP_TIMESTAMP ||
               type == ICMP_TIMESTAMPREPLY ||
               type == ICMPV6_ECHO_REQUEST ||
               type == ICMPV6_ECHO_REPLY;
}

/**
 * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields
 * @skb: sk_buff to extract from
 * @key_icmp: struct flow_dissector_key_icmp to fill
 * @data: raw buffer pointer to the packet
 * @thoff: offset to extract at
 * @hlen: packet header length
 */
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
                           struct flow_dissector_key_icmp *key_icmp,
                           const void *data, int thoff, int hlen)
{
        struct icmphdr *ih, _ih;

        ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih);
        if (!ih)
                return;

        key_icmp->type = ih->type;
        key_icmp->code = ih->code;

        /* As we use 0 to signal that the Id field is not present,
         * avoid confusion with packets without such field
         */
        if (icmp_has_id(ih->type))
                key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
        else
                key_icmp->id = 0;
}
EXPORT_SYMBOL(skb_flow_get_icmp_tci);

/* Dissect an ICMP packet with skb_flow_get_icmp_tci(), but only when the
 * dissector has FLOW_DISSECTOR_KEY_ICMP set; otherwise do nothing.
 */
static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
                                    void *target_container, const void *data,
                                    int thoff, int hlen)
{
        struct flow_dissector_key_icmp *icmp_key;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) {
                icmp_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_ICMP,
                                                     target_container);

                skb_flow_get_icmp_tci(skb, icmp_key, data, thoff, hlen);
        }
}

static void __skb_flow_dissect_ah(const struct sk_buff *skb,
                                  struct flow_dissector *flow_dissector,
                                  void *target_container, const void *data,
                                  int nhoff, int hlen)
{
        struct flow_dissector_key_ipsec *key_ah;
        struct ip_auth_hdr _hdr, *hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
                return;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
        if (!hdr)
                return;

        key_ah = skb_flow_dissector_target(flow_dissector,
                                           FLOW_DISSECTOR_KEY_IPSEC,
                                           target_container);

        key_ah->spi = hdr->spi;
}

static void __skb_flow_dissect_esp(const struct sk_buff *skb,
                                   struct flow_dissector *flow_dissector,
                                   void *target_container, const void *data,
                                   int nhoff, int hlen)
{
        struct flow_dissector_key_ipsec *key_esp;
        struct ip_esp_hdr _hdr, *hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
                return;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
        if (!hdr)
                return;

        key_esp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_IPSEC,
                                            target_container);

        key_esp->spi = hdr->spi;
}

/* Copy the 32-bit session id found at @nhoff into the
 * FLOW_DISSECTOR_KEY_L2TPV3 target, when the dissector uses that key.
 * Nothing is written if those four bytes cannot be obtained.
 */
static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb,
                                      struct flow_dissector *flow_dissector,
                                      void *target_container, const void *data,
                                      int nhoff, int hlen)
{
        struct flow_dissector_key_l2tpv3 *l2tp_key;
        struct {
                __be32 session_id;
        } scratch, *l2tp;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3))
                return;

        l2tp = __skb_header_pointer(skb, nhoff, sizeof(scratch), data, hlen,
                                    &scratch);
        if (l2tp) {
                l2tp_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_L2TPV3,
                                                     target_container);
                l2tp_key->session_id = l2tp->session_id;
        }
}

/* Fill the FLOW_DISSECTOR_KEY_META target from skb metadata, when the
 * dissector uses that key: the ingress ifindex is taken from skb->skb_iif
 * and, with CONFIG_NET_TC_SKB_EXT and tc_skb_ext_tc_enabled(), l2_miss is
 * copied from the TC_SKB_EXT extension if the skb carries one.
 */
void skb_flow_dissect_meta(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container)
{
        struct flow_dissector_key_meta *meta_key;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
                return;

        meta_key = skb_flow_dissector_target(flow_dissector,
                                             FLOW_DISSECTOR_KEY_META,
                                             target_container);
        meta_key->ingress_ifindex = skb->skb_iif;
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        if (tc_skb_ext_tc_enabled()) {
                struct tc_skb_ext *tc_ext = skb_ext_find(skb, TC_SKB_EXT);

                /* l2_miss keeps its previous value without an extension. */
                if (tc_ext)
                        meta_key->l2_miss = tc_ext->l2_miss;
        }
#endif
}
EXPORT_SYMBOL(skb_flow_dissect_meta);

/* Write @type and @ctrl_flags into the FLOW_DISSECTOR_KEY_ENC_CONTROL
 * target as its addr_type and flags; a no-op when the dissector does not
 * use that key.
 */
static void
skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type,
                                 u32 ctrl_flags,
                                 struct flow_dissector *flow_dissector,
                                 void *target_container)
{
        struct flow_dissector_key_control *enc_ctrl;

        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
                enc_ctrl = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_ENC_CONTROL,
                                                     target_container);
                enc_ctrl->addr_type = type;
                enc_ctrl->flags = ctrl_flags;
        }
}

/* Fill the FLOW_DISSECTOR_KEY_CT target from the conntrack entry attached
 * to @skb.  The whole body is compiled only with CONFIG_NF_CONNTRACK.
 *
 * Without a conntrack entry nothing is written unless @post_ct is set, in
 * which case the state becomes TRACKED | INVALID and the zone is @zone.
 * With an entry, the state is looked up in @ctinfo_map (only when ctinfo is
 * below @mapsize), and zone, mark and labels are copied where the matching
 * config options and label extension are available.
 */
void
skb_flow_dissect_ct(const struct sk_buff *skb,
                    struct flow_dissector *flow_dissector,
                    void *target_container, u16 *ctinfo_map,
                    size_t mapsize, bool post_ct, u16 zone)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        struct flow_dissector_key_ct *ct_key;
        enum ip_conntrack_info ctinfo;
        struct nf_conn_labels *labels;
        struct nf_conn *ct;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
                return;

        ct = nf_ct_get(skb, &ctinfo);
        if (!(ct || post_ct))
                return;

        ct_key = skb_flow_dissector_target(flow_dissector,
                                           FLOW_DISSECTOR_KEY_CT,
                                           target_container);

        if (ct) {
                if (ctinfo < mapsize)
                        ct_key->ct_state = ctinfo_map[ctinfo];
#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
                ct_key->ct_zone = ct->zone.id;
#endif
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
                ct_key->ct_mark = READ_ONCE(ct->mark);
#endif

                labels = nf_ct_labels_find(ct);
                if (labels)
                        memcpy(ct_key->ct_labels, labels->bits,
                               sizeof(ct_key->ct_labels));
        } else {
                /* post_ct with no entry attached to the skb. */
                ct_key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
                                   TCA_FLOWER_KEY_CT_FLAGS_INVALID;
                ct_key->ct_zone = zone;
        }
#endif /* CONFIG_NF_CONNTRACK */
}
EXPORT_SYMBOL(skb_flow_dissect_ct);

/* Copy the tunnel metadata returned by skb_tunnel_info() for @skb into
 * whichever FLOW_DISSECTOR_KEY_ENC_* targets @flow_dissector uses: control
 * (address type and tunnel flags), IPv4/IPv6 addresses, key id, ports,
 * tos/ttl and tunnel options.  Does nothing when none of those keys is
 * used or when the skb has no tunnel info.
 */
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
                             void *target_container)
{
        struct ip_tunnel_info *tun_info;
        struct ip_tunnel_key *tun_key;
        u32 enc_flags = 0;

        /* Cheap test first: is any encapsulation key wanted at all? */
        if (!(dissector_uses_key(flow_dissector,
                                 FLOW_DISSECTOR_KEY_ENC_KEYID) ||
              dissector_uses_key(flow_dissector,
                                 FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) ||
              dissector_uses_key(flow_dissector,
                                 FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) ||
              dissector_uses_key(flow_dissector,
                                 FLOW_DISSECTOR_KEY_ENC_CONTROL) ||
              dissector_uses_key(flow_dissector,
                                 FLOW_DISSECTOR_KEY_ENC_PORTS) ||
              dissector_uses_key(flow_dissector,
                                 FLOW_DISSECTOR_KEY_ENC_IP) ||
              dissector_uses_key(flow_dissector,
                                 FLOW_DISSECTOR_KEY_ENC_OPTS)))
                return;

        tun_info = skb_tunnel_info(skb);
        if (!tun_info)
                return;

        tun_key = &tun_info->key;

        /* Translate the tunnel flag bits into flow dissector control flags. */
        if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags))
                enc_flags |= FLOW_DIS_F_TUNNEL_CSUM;
        if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags))
                enc_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT;
        if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags))
                enc_flags |= FLOW_DIS_F_TUNNEL_OAM;
        if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, tun_key->tun_flags))
                enc_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT;

        switch (ip_tunnel_info_af(tun_info)) {
        case AF_INET:
                skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                 enc_flags, flow_dissector,
                                                 target_container);
                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
                        struct flow_dissector_key_ipv4_addrs *v4;

                        v4 = skb_flow_dissector_target(flow_dissector,
                                                       FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
                                                       target_container);
                        v4->src = tun_key->u.ipv4.src;
                        v4->dst = tun_key->u.ipv4.dst;
                }
                break;
        case AF_INET6:
                skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                 enc_flags, flow_dissector,
                                                 target_container);
                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
                        struct flow_dissector_key_ipv6_addrs *v6;

                        v6 = skb_flow_dissector_target(flow_dissector,
                                                       FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
                                                       target_container);
                        v6->src = tun_key->u.ipv6.src;
                        v6->dst = tun_key->u.ipv6.dst;
                }
                break;
        default:
                /* Other address family: flags only, address type 0. */
                skb_flow_dissect_set_enc_control(0, enc_flags, flow_dissector,
                                                 target_container);
                break;
        }

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
                struct flow_dissector_key_keyid *enc_keyid;

                enc_keyid = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_ENC_KEYID,
                                                      target_container);
                enc_keyid->keyid = tunnel_id_to_key32(tun_key->tun_id);
        }

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
                struct flow_dissector_key_ports *enc_ports;

                enc_ports = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_ENC_PORTS,
                                                      target_container);
                enc_ports->src = tun_key->tp_src;
                enc_ports->dst = tun_key->tp_dst;
        }

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
                struct flow_dissector_key_ip *enc_ip;

                enc_ip = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_ENC_IP,
                                                   target_container);
                enc_ip->tos = tun_key->tos;
                enc_ip->ttl = tun_key->ttl;
        }

        /* Options are handled last and only when the tunnel carries any. */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS) &&
            tun_info->options_len) {
                struct flow_dissector_key_enc_opts *opts_key;
                IP_TUNNEL_DECLARE_FLAGS(opt_flags) = { };
                u32 opt_bit;

                opts_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_ENC_OPTS,
                                                     target_container);

                opts_key->len = tun_info->options_len;
                ip_tunnel_info_opts_get(opts_key->data, tun_info);

                /* Keep only the "options present" bits set on this tunnel. */
                ip_tunnel_set_options_present(opt_flags);
                ip_tunnel_flags_and(opt_flags, tun_key->tun_flags, opt_flags);

                opt_bit = find_next_bit(opt_flags, __IP_TUNNEL_FLAG_NUM,
                                        IP_TUNNEL_GENEVE_OPT_BIT);
                if (opt_bit < __IP_TUNNEL_FLAG_NUM)
                        opts_key->dst_opt_type = opt_bit;
                else
                        opts_key->dst_opt_type = 0;
        }
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);

/* Store skb_get_hash_raw(@skb) in the FLOW_DISSECTOR_KEY_HASH target,
 * when the dissector uses that key.
 */
void skb_flow_dissect_hash(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container)
{
        struct flow_dissector_key_hash *hash_key;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) {
                hash_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_HASH,
                                                     target_container);
                hash_key->hash = skb_get_hash_raw(skb);
        }
}
EXPORT_SYMBOL(skb_flow_dissect_hash);

/* Dissect one MPLS label stack entry located at @nhoff.
 *
 * @lse_index selects the slot of the FLOW_DISSECTOR_KEY_MPLS target that
 * receives the entry's label, TC, bottom-of-stack bit and TTL.
 * @entropy_label is in/out: when it is true on entry and the dissector uses
 * FLOW_DISSECTOR_KEY_MPLS_ENTROPY, this entry's label is stored as the
 * entropy key id; on exit it tells whether this entry's label is
 * MPLS_LABEL_ENTROPY.
 *
 * Returns FLOW_DISSECT_RET_OUT_GOOD when neither MPLS key is used, when
 * @lse_index has reached FLOW_DIS_MPLS_MAX, or when the bottom-of-stack bit
 * is set; FLOW_DISSECT_RET_OUT_BAD when the entry cannot be read; and
 * FLOW_DISSECT_RET_PROTO_AGAIN otherwise.
 */
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data, int nhoff,
                        int hlen, int lse_index, bool *entropy_label)
{
        const bool want_entropy =
                dissector_uses_key(flow_dissector,
                                   FLOW_DISSECTOR_KEY_MPLS_ENTROPY);
        const bool want_lse =
                dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS);
        struct mpls_label scratch, *wire;
        u32 word, label, bos;

        if (!want_entropy && !want_lse)
                return FLOW_DISSECT_RET_OUT_GOOD;

        if (lse_index >= FLOW_DIS_MPLS_MAX)
                return FLOW_DISSECT_RET_OUT_GOOD;

        wire = __skb_header_pointer(skb, nhoff, sizeof(scratch), data,
                                    hlen, &scratch);
        if (!wire)
                return FLOW_DISSECT_RET_OUT_BAD;

        word = ntohl(wire->entry);
        label = (word & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
        bos = (word & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;

        if (want_lse) {
                struct flow_dissector_key_mpls *mpls_key;
                struct flow_dissector_mpls_lse *slot;

                mpls_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_MPLS,
                                                     target_container);
                slot = &mpls_key->ls[lse_index];

                slot->mpls_ttl = (word & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
                slot->mpls_bos = bos;
                slot->mpls_tc = (word & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
                slot->mpls_label = label;
                dissector_set_mpls_lse(mpls_key, lse_index);
        }

        /* The previous entry was the entropy label indicator, so this
         * entry's label is the entropy value itself.
         */
        if (*entropy_label && want_entropy) {
                struct flow_dissector_key_keyid *entropy_key;

                entropy_key = skb_flow_dissector_target(flow_dissector,
                                                        FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
                                                        target_container);
                entropy_key->keyid = cpu_to_be32(label);
        }

        *entropy_label = label == MPLS_LABEL_ENTROPY;

        if (bos)
                return FLOW_DISSECT_RET_OUT_GOOD;

        return FLOW_DISSECT_RET_PROTO_AGAIN;
}

static enum flow_dissect_ret
__skb_flow_dissect_arp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int nhoff, int hlen)
{
        struct flow_dissector_key_arp *key_arp;
        struct {
                unsigned char ar_sha[ETH_ALEN];
                unsigned char ar_sip[4];
                unsigned char ar_tha[ETH_ALEN];
                unsigned char ar_tip[4];
        } *arp_eth, _arp_eth;
        const struct arphdr *arp;
        struct arphdr _arp;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP))
                return FLOW_DISSECT_RET_OUT_GOOD;

        arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
                                   hlen, &_arp);
        if (!arp)
                return FLOW_DISSECT_RET_OUT_BAD;

        if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
            arp->ar_pro != htons(ETH_P_IP) ||
            arp->ar_hln != ETH_ALEN ||
            arp->ar_pln != 4 ||
            (arp->ar_op != htons(ARPOP_REPLY) &&
             arp->ar_op != htons(ARPOP_REQUEST)))
                return FLOW_DISSECT_RET_OUT_BAD;

        arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
                                       sizeof(_arp_eth), data,
                                       hlen, &_arp_eth);
        if (!arp_eth)
                return FLOW_DISSECT_RET_OUT_BAD;

        key_arp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_ARP,
                                            target_container);

        memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip));
        memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip));

        /* Only store the lower byte of the opcode;
         * this covers ARPOP_REPLY and ARPOP_REQUEST.
         */
        key_arp->op = ntohs(arp->ar_op) & 0xff;

        ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
        ether_addr_copy(key_arp->tha, arp_eth->ar_tha);

        return FLOW_DISSECT_RET_OUT_GOOD;
}

static enum flow_dissect_ret
__skb_flow_dissect_cfm(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int nhoff, int hlen)
{
        struct flow_dissector_key_cfm *key, *hdr, _hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM))
                return FLOW_DISSECT_RET_OUT_GOOD;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr);
        if (!hdr)
                return FLOW_DISSECT_RET_OUT_BAD;

        key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM,
                                        target_container);

        key->mdl_ver = hdr->mdl_ver;
        key->opcode = hdr->opcode;

        return FLOW_DISSECT_RET_OUT_GOOD;
}

/* Dissect a GRE header located at *@p_nhoff and step over it.
 *
 * @p_proto, @p_nhoff and @p_hlen are in/out parameters:
 *  - *@p_proto is set to the GRE protocol field as soon as the version
 *    check passes, and may then be replaced by the inner Ethernet protocol
 *    (ETH_P_TEB payload) or by ETH_P_IP / ETH_P_IPV6 (version 1, PPP);
 *  - *@p_nhoff is advanced past the GRE header, its optional fields and any
 *    inner Ethernet/PPP header, but only on the paths that reach the end of
 *    the function;
 *  - *@p_hlen may be shortened after an inner Ethernet header when
 *    NET_IP_ALIGN is non-zero.
 *
 * The GRE key, when present, is stored in the FLOW_DISSECTOR_KEY_GRE_KEYID
 * target if the dissector uses that key.
 *
 * Returns FLOW_DISSECT_RET_OUT_BAD when a needed header cannot be read,
 * FLOW_DISSECT_RET_OUT_GOOD when dissection stops here (routing present,
 * unsupported version, version 1 that is not PPTP-with-key, or
 * FLOW_DISSECTOR_F_STOP_AT_ENCAP in @flags), and
 * FLOW_DISSECT_RET_PROTO_AGAIN when the caller should continue with
 * *@p_proto at *@p_nhoff.
 */
static enum flow_dissect_ret
__skb_flow_dissect_gre(const struct sk_buff *skb,
                       struct flow_dissector_key_control *key_control,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       __be16 *p_proto, int *p_nhoff, int *p_hlen,
                       unsigned int flags)
{
        struct flow_dissector_key_keyid *key_keyid;
        struct gre_base_hdr *hdr, _hdr;
        /* Running byte offset from *p_nhoff to the field being examined. */
        int offset = 0;
        u16 gre_ver;

        hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr),
                                   data, *p_hlen, &_hdr);
        if (!hdr)
                return FLOW_DISSECT_RET_OUT_BAD;

        /* Only look inside GRE without routing */
        if (hdr->flags & GRE_ROUTING)
                return FLOW_DISSECT_RET_OUT_GOOD;

        /* Only look inside GRE for version 0 and 1 */
        gre_ver = ntohs(hdr->flags & GRE_VERSION);
        if (gre_ver > 1)
                return FLOW_DISSECT_RET_OUT_GOOD;

        /* Note: *p_proto is updated before the version 1 check below, so it
         * is already changed on that early-return path.
         */
        *p_proto = hdr->protocol;
        if (gre_ver) {
                /* Version1 must be PPTP, and check the flags */
                if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
                        return FLOW_DISSECT_RET_OUT_GOOD;
        }

        offset += sizeof(struct gre_base_hdr);

        /* Optional checksum word (csum + reserved1). */
        if (hdr->flags & GRE_CSUM)
                offset += sizeof_field(struct gre_full_hdr, csum) +
                          sizeof_field(struct gre_full_hdr, reserved1);

        /* Optional key word. */
        if (hdr->flags & GRE_KEY) {
                const __be32 *keyid;
                __be32 _keyid;

                keyid = __skb_header_pointer(skb, *p_nhoff + offset,
                                             sizeof(_keyid),
                                             data, *p_hlen, &_keyid);
                if (!keyid)
                        return FLOW_DISSECT_RET_OUT_BAD;

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_GRE_KEYID)) {
                        key_keyid = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_GRE_KEYID,
                                                              target_container);
                        /* Version 0 stores the whole key; version 1 stores
                         * it masked with GRE_PPTP_KEY_MASK.
                         */
                        if (gre_ver == 0)
                                key_keyid->keyid = *keyid;
                        else
                                key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
                }
                offset += sizeof_field(struct gre_full_hdr, key);
        }

        /* Optional sequence number word. */
        if (hdr->flags & GRE_SEQ)
                offset += sizeof_field(struct pptp_gre_header, seq);

        if (gre_ver == 0) {
                /* Transparent Ethernet bridging: an Ethernet header follows
                 * and supplies the next protocol.
                 */
                if (*p_proto == htons(ETH_P_TEB)) {
                        const struct ethhdr *eth;
                        struct ethhdr _eth;

                        eth = __skb_header_pointer(skb, *p_nhoff + offset,
                                                   sizeof(_eth),
                                                   data, *p_hlen, &_eth);
                        if (!eth)
                                return FLOW_DISSECT_RET_OUT_BAD;
                        *p_proto = eth->h_proto;
                        offset += sizeof(*eth);

                        /* Cap headers that we access via pointers at the
                         * end of the Ethernet header as our maximum alignment
                         * at that point is only 2 bytes.
                         */
                        if (NET_IP_ALIGN)
                                *p_hlen = *p_nhoff + offset;
                }
        } else { /* version 1, must be PPTP */
                u8 _ppp_hdr[PPP_HDRLEN];
                u8 *ppp_hdr;

                /* Optional acknowledgment number word. */
                if (hdr->flags & GRE_ACK)
                        offset += sizeof_field(struct pptp_gre_header, ack);

                ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
                                               sizeof(_ppp_hdr),
                                               data, *p_hlen, _ppp_hdr);
                if (!ppp_hdr)
                        return FLOW_DISSECT_RET_OUT_BAD;

                /* Map the PPP protocol to an Ethernet protocol; for any
                 * other value *p_proto keeps the GRE protocol field.
                 */
                switch (PPP_PROTOCOL(ppp_hdr)) {
                case PPP_IP:
                        *p_proto = htons(ETH_P_IP);
                        break;
                case PPP_IPV6:
                        *p_proto = htons(ETH_P_IPV6);
                        break;
                default:
                        /* Could probably catch some more like MPLS */
                        break;
                }

                offset += PPP_HDRLEN;
        }

        /* Commit the accumulated offset and mark the flow as encapsulated. */
        *p_nhoff += offset;
        key_control->flags |= FLOW_DIS_ENCAPSULATION;
        if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
                return FLOW_DISSECT_RET_OUT_GOOD;

        return FLOW_DISSECT_RET_PROTO_AGAIN;
}

/**
 * __skb_flow_dissect_batadv() - dissect batman-adv header
 * @skb: sk_buff with the batman-adv header
 * @key_control: flow dissectors control key
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @p_proto: pointer used to update the protocol to process next
 * @p_nhoff: pointer used to update inner network header offset
 * @hlen: packet header length
 * @flags: any combination of FLOW_DISSECTOR_F_*
 *
 * Tries to dissect an ETH_P_BATMAN packet. Only &struct batadv_unicast
 * packets are actually processed because they contain an inner ethernet
 * header and are usually followed by the actual network header. This allows
 * the flow dissector to continue processing the packet.
 *
 * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found,
 *  FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation,
 *  otherwise FLOW_DISSECT_RET_OUT_BAD
 */
static enum flow_dissect_ret
__skb_flow_dissect_batadv(const struct sk_buff *skb,
                          struct flow_dissector_key_control *key_control,
                          const void *data, __be16 *p_proto, int *p_nhoff,
                          int hlen, unsigned int flags)
{
        struct {
                struct batadv_unicast_packet batadv_unicast;
                struct ethhdr eth;
        } scratch, *encap;

        encap = __skb_header_pointer(skb, *p_nhoff, sizeof(scratch), data,
                                     hlen, &scratch);
        if (!encap)
                return FLOW_DISSECT_RET_OUT_BAD;

        /* Accept only unicast packets of the expected compat version. */
        if (encap->batadv_unicast.version != BATADV_COMPAT_VERSION ||
            encap->batadv_unicast.packet_type != BATADV_UNICAST)
                return FLOW_DISSECT_RET_OUT_BAD;

        /* Continue with the protocol named by the inner ethernet header. */
        *p_proto = encap->eth.h_proto;
        *p_nhoff += sizeof(scratch);

        key_control->flags |= FLOW_DIS_ENCAPSULATION;

        return (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) ?
                FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN;
}

static void
__skb_flow_dissect_tcp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int thoff, int hlen)
{
        struct flow_dissector_key_tcp *key_tcp;
        struct tcphdr *th, _th;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP))
                return;

        th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th);
        if (!th)
                return;

        if (unlikely(__tcp_hdrlen(th) < sizeof(_th)))
                return;

        key_tcp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_TCP,
                                            target_container);
        key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF));
}

/* Fetch the port word with skb_flow_get_ports() and store it in the
 * FLOW_DISSECTOR_KEY_PORTS and/or FLOW_DISSECTOR_KEY_PORTS_RANGE targets,
 * whichever the dissector uses.  The packet is not touched when neither
 * key is wanted.
 */
static void
__skb_flow_dissect_ports(const struct sk_buff *skb,
                         struct flow_dissector *flow_dissector,
                         void *target_container, const void *data,
                         int nhoff, u8 ip_proto, int hlen)
{
        struct flow_dissector_key_ports_range *range_key;
        struct flow_dissector_key_ports *ports_key;
        __be32 port_word;

        ports_key = dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_PORTS) ?
                    skb_flow_dissector_target(flow_dissector,
                                              FLOW_DISSECTOR_KEY_PORTS,
                                              target_container) : NULL;

        range_key = dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_PORTS_RANGE) ?
                    skb_flow_dissector_target(flow_dissector,
                                              FLOW_DISSECTOR_KEY_PORTS_RANGE,
                                              target_container) : NULL;

        if (ports_key || range_key) {
                port_word = skb_flow_get_ports(skb, nhoff, ip_proto, data,
                                               hlen);

                if (ports_key)
                        ports_key->ports = port_word;

                if (range_key)
                        range_key->tp.ports = port_word;
        }
}

/* Copy tos and ttl from the IPv4 header @iph into the FLOW_DISSECTOR_KEY_IP
 * target, when the dissector uses that key.
 */
static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        const struct iphdr *iph)
{
        struct flow_dissector_key_ip *ip_key;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) {
                ip_key = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_IP,
                                                   target_container);
                ip_key->tos = iph->tos;
                ip_key->ttl = iph->ttl;
        }
}

/* Fill the FLOW_DISSECTOR_KEY_IP target from the IPv6 header @iph, when the
 * dissector uses that key: tos comes from ipv6_get_dsfield() and ttl from
 * the hop limit.
 */
static void
__skb_flow_dissect_ipv6(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        const struct ipv6hdr *iph)
{
        struct flow_dissector_key_ip *ip_key;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) {
                ip_key = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_IP,
                                                   target_container);
                ip_key->tos = ipv6_get_dsfield(iph);
                ip_key->ttl = iph->hop_limit;
        }
}

/* Upper bound on how many protocol headers __skb_flow_dissect may parse
 * for a single packet.
 */
#define MAX_FLOW_DISSECT_HDRS        15

/* Count one more parsed header in *@num_hdrs and report whether the total
 * is still within MAX_FLOW_DISSECT_HDRS.  The counter is incremented even
 * when the answer is false.
 */
static bool skb_flow_dissect_allowed(int *num_hdrs)
{
        *num_hdrs += 1;

        return *num_hdrs <= MAX_FLOW_DISSECT_HDRS;
}

static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
                                     struct flow_dissector *flow_dissector,
                                     void *target_container)
{
        struct flow_dissector_key_ports_range *key_ports_range = NULL;
        struct flow_dissector_key_ports *key_ports = NULL;
        struct flow_dissector_key_control *key_control;
        struct flow_dissector_key_basic *key_basic;
        struct flow_dissector_key_addrs *key_addrs;
        struct flow_dissector_key_tags *key_tags;

        key_control = skb_flow_dissector_target(flow_dissector,
                                                FLOW_DISSECTOR_KEY_CONTROL,
                                                target_container);
        key_control->thoff = flow_keys->thoff;
        if (flow_keys->is_frag)
                key_control->flags |= FLOW_DIS_IS_FRAGMENT;
        if (flow_keys->is_first_frag)
                key_control->flags |= FLOW_DIS_FIRST_FRAG;
        if (flow_keys->is_encap)
                key_control->flags |= FLOW_DIS_ENCAPSULATION;

        key_basic = skb_flow_dissector_target(flow_dissector,
                                              FLOW_DISSECTOR_KEY_BASIC,
                                              target_container);
        key_basic->n_proto = flow_keys->n_proto;
        key_basic->ip_proto = flow_keys->ip_proto;

        if (flow_keys->addr_proto == ETH_P_IP &&
            dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
                key_addrs = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                      target_container);
                key_addrs->v4addrs.src = flow_keys->ipv4_src;
                key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
                key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
        } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
                   dissector_uses_key(flow_dissector,
                                      FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
                key_addrs = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                      target_container);
                memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src,
                       sizeof(key_addrs->v6addrs.src));
                memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst,
                       sizeof(key_addrs->v6addrs.dst));
                key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        }

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
                key_ports = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_PORTS,
                                                      target_container);
                key_ports->src = flow_keys->sport;
                key_ports->dst = flow_keys->dport;
        }
        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_PORTS_RANGE)) {
                key_ports_range = skb_flow_dissector_target(flow_dissector,
                                                            FLOW_DISSECTOR_KEY_PORTS_RANGE,
                                                            target_container);
                key_ports_range->tp.src = flow_keys->sport;
                key_ports_range->tp.dst = flow_keys->dport;
        }

        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
                key_tags = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_FLOW_LABEL,
                                                     target_container);
                key_tags->flow_label = ntohl(flow_keys->flow_label);
        }
}

/* Run the BPF flow dissector program @prog on @ctx.
 *
 * ctx->flow_keys is zeroed and seeded with @proto, @nhoff (also used as the
 * initial thoff) and @flags before the program runs. Afterwards the offsets
 * the program left behind are clamped, as u16, so that
 * @nhoff <= flow_keys->nhoff <= flow_keys->thoff <= @hlen.
 *
 * Returns the value produced by the program.
 */
u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                     __be16 proto, int nhoff, int hlen, unsigned int flags)
{
        struct bpf_flow_keys *keys = ctx->flow_keys;
        u32 verdict;

        /* @flags is handed to the program unchanged, so the BPF flag values
         * must equal the in-kernel flow dissector ones.
         */
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
                     (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
                     (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
                     (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);

        /* Program input */
        memset(keys, 0, sizeof(*keys));
        keys->n_proto = proto;
        keys->nhoff = nhoff;
        keys->thoff = keys->nhoff;
        keys->flags = flags;

        verdict = bpf_prog_run_pin_on_cpu(prog, ctx);

        /* Program output: keep the offsets within the caller's bounds */
        keys->nhoff = clamp_t(u16, keys->nhoff, nhoff, hlen);
        keys->thoff = clamp_t(u16, keys->thoff, keys->nhoff, hlen);

        return verdict;
}

/* A PPPoE session header is accepted only with ver == 1, type == 1 and
 * code == 0.
 */
static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr)
{
        if (hdr->ver != 1)
                return false;
        if (hdr->type != 1)
                return false;

        return hdr->code == 0;
}

/**
 * __skb_flow_dissect - extract the flow_keys struct and return it
 * @net: associated network namespace, derived from @skb if NULL
 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
 * @flow_dissector: list of keys to dissect
 * @target_container: target structure to put dissected values into
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 * @flags: flags that control the dissection process, e.g.
 *         FLOW_DISSECTOR_F_STOP_AT_ENCAP.
 *
 * The function will try to retrieve individual keys into the target specified
 * by flow_dissector from either the skbuff or a raw buffer specified by the
 * remaining parameters.
 *
 * Dissection runs in two stages that may loop back into each other: a switch
 * on the layer 2/3 protocol (@proto) followed by a switch on the IP protocol.
 * Every loop iteration is counted and at most MAX_FLOW_DISSECT_HDRS
 * iterations are taken.
 *
 * Caller must take care of zeroing target container memory.
 *
 * Return: true if dissection ended without a bad result (this includes
 * stopping because the header limit was reached), false otherwise. If a BPF
 * flow dissector program is attached and returns anything other than
 * BPF_FLOW_DISSECTOR_CONTINUE, the result is whether it returned BPF_OK.
 */
bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        __be16 proto, int nhoff, int hlen, unsigned int flags)
{
        struct flow_dissector_key_control *key_control;
        struct flow_dissector_key_basic *key_basic;
        struct flow_dissector_key_addrs *key_addrs;
        struct flow_dissector_key_tags *key_tags;
        struct flow_dissector_key_vlan *key_vlan;
        enum flow_dissect_ret fdret;
        /* FLOW_DISSECTOR_KEY_MAX here means "no VLAN tag seen yet" */
        enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
        bool mpls_el = false;
        int mpls_lse = 0;
        int num_hdrs = 0;
        u8 ip_proto = 0;
        bool ret;

        /* No raw buffer given: take buffer, protocol and offsets from skb */
        if (!data) {
                data = skb->data;
                proto = skb_vlan_tag_present(skb) ?
                         skb->vlan_proto : skb->protocol;
                nhoff = skb_network_offset(skb);
                hlen = skb_headlen(skb);
#if IS_ENABLED(CONFIG_NET_DSA)
                if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
                             proto == htons(ETH_P_XDSA))) {
                        struct metadata_dst *md_dst = skb_metadata_dst(skb);
                        const struct dsa_device_ops *ops;
                        int offset = 0;

                        ops = skb->dev->dsa_ptr->tag_ops;
                        /* Only DSA header taggers break flow dissection */
                        if (ops->needed_headroom &&
                            (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) {
                                if (ops->flow_dissect)
                                        ops->flow_dissect(skb, &proto, &offset);
                                else
                                        dsa_tag_generic_flow_dissect(skb,
                                                                     &proto,
                                                                     &offset);
                                /* Skip over the tag reported by the tagger */
                                hlen -= offset;
                                nhoff += offset;
                        }
                }
#endif
        }

        /* It is ensured by skb_flow_dissector_init() that control key will
         * be always present.
         */
        key_control = skb_flow_dissector_target(flow_dissector,
                                                FLOW_DISSECTOR_KEY_CONTROL,
                                                target_container);

        /* It is ensured by skb_flow_dissector_init() that basic key will
         * be always present.
         */
        key_basic = skb_flow_dissector_target(flow_dissector,
                                              FLOW_DISSECTOR_KEY_BASIC,
                                              target_container);

        rcu_read_lock();

        /* Derive the namespace from the skb's device, else from its socket */
        if (skb) {
                if (!net) {
                        if (skb->dev)
                                net = dev_net_rcu(skb->dev);
                        else if (skb->sk)
                                net = sock_net(skb->sk);
                }
        }

        DEBUG_NET_WARN_ON_ONCE(!net);
        if (net) {
                enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
                struct bpf_prog_array *run_array;

                /* A program attached to init_net takes precedence over one
                 * attached to @net.
                 */
                run_array = rcu_dereference(init_net.bpf.run_array[type]);
                if (!run_array)
                        run_array = rcu_dereference(net->bpf.run_array[type]);

                if (run_array) {
                        struct bpf_flow_keys flow_keys;
                        struct bpf_flow_dissector ctx = {
                                .flow_keys = &flow_keys,
                                .data = data,
                                .data_end = data + hlen,
                        };
                        __be16 n_proto = proto;
                        struct bpf_prog *prog;
                        u32 result;

                        if (skb) {
                                ctx.skb = skb;
                                /* we can't use 'proto' in the skb case
                                 * because it might be set to skb->vlan_proto
                                 * which has been pulled from the data
                                 */
                                n_proto = skb->protocol;
                        }

                        prog = READ_ONCE(run_array->items[0].prog);
                        result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
                                                  hlen, flags);
                        /* Unless the program asks to continue, its output is
                         * final and the built-in dissector below is skipped.
                         */
                        if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
                                __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
                                                         target_container);
                                rcu_read_unlock();
                                return result == BPF_OK;
                        }
                }
        }

        rcu_read_unlock();

        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
                /* NOTE(review): skb is passed to eth_hdr() without a NULL
                 * check although @skb may be NULL when @data is given;
                 * presumably callers using ETH_ADDRS always pass an skb -
                 * confirm.
                 */
                struct ethhdr *eth = eth_hdr(skb);
                struct flow_dissector_key_eth_addrs *key_eth_addrs;

                key_eth_addrs = skb_flow_dissector_target(flow_dissector,
                                                          FLOW_DISSECTOR_KEY_ETH_ADDRS,
                                                          target_container);
                memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs));
        }

        /* Start the VLAN counter from zero; it is incremented per tag below */
        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) {
                struct flow_dissector_key_num_of_vlans *key_num_of_vlans;

                key_num_of_vlans = skb_flow_dissector_target(flow_dissector,
                                                             FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
                                                             target_container);
                key_num_of_vlans->num_of_vlans = 0;
        }

        /* Stage 1: dispatch on the layer 2/3 protocol in 'proto' */
proto_again:
        fdret = FLOW_DISSECT_RET_CONTINUE;

        switch (proto) {
        case htons(ETH_P_IP): {
                const struct iphdr *iph;
                struct iphdr _iph;

                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
                /* ihl counts 32-bit words; less than 5 is rejected */
                if (!iph || iph->ihl < 5) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                nhoff += iph->ihl * 4;

                ip_proto = iph->protocol;

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                              target_container);

                        memcpy(&key_addrs->v4addrs.src, &iph->saddr,
                               sizeof(key_addrs->v4addrs.src));
                        memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
                               sizeof(key_addrs->v4addrs.dst));
                        key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                }

                __skb_flow_dissect_ipv4(skb, flow_dissector,
                                        target_container, data, iph);

                if (ip_is_fragment(iph)) {
                        key_control->flags |= FLOW_DIS_IS_FRAGMENT;

                        /* Non-first fragment: stop here. First fragment:
                         * go on only if the caller asked for it.
                         */
                        if (iph->frag_off & htons(IP_OFFSET)) {
                                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                break;
                        } else {
                                key_control->flags |= FLOW_DIS_FIRST_FRAG;
                                if (!(flags &
                                      FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
                                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                        break;
                                }
                        }
                }

                break;
        }
        case htons(ETH_P_IPV6): {
                const struct ipv6hdr *iph;
                struct ipv6hdr _iph;

                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
                if (!iph) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                ip_proto = iph->nexthdr;
                nhoff += sizeof(struct ipv6hdr);

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                              target_container);

                        memcpy(&key_addrs->v6addrs.src, &iph->saddr,
                               sizeof(key_addrs->v6addrs.src));
                        memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
                               sizeof(key_addrs->v6addrs.dst));
                        key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                }

                /* A non-zero flow label is recorded when the key is used and
                 * ends dissection when STOP_AT_FLOW_LABEL is set.
                 */
                if ((dissector_uses_key(flow_dissector,
                                        FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
                     (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
                    ip6_flowlabel(iph)) {
                        __be32 flow_label = ip6_flowlabel(iph);

                        if (dissector_uses_key(flow_dissector,
                                               FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
                                key_tags = skb_flow_dissector_target(flow_dissector,
                                                                     FLOW_DISSECTOR_KEY_FLOW_LABEL,
                                                                     target_container);
                                key_tags->flow_label = ntohl(flow_label);
                        }
                        if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) {
                                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                break;
                        }
                }

                __skb_flow_dissect_ipv6(skb, flow_dissector,
                                        target_container, data, iph);

                break;
        }
        case htons(ETH_P_8021AD):
        case htons(ETH_P_8021Q): {
                const struct vlan_hdr *vlan = NULL;
                struct vlan_hdr _vlan;
                __be16 saved_vlan_tpid = proto;

                /* The first tag may live in skb metadata rather than in the
                 * packet data; 'vlan' then stays NULL and nhoff is not moved.
                 */
                if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
                    skb && skb_vlan_tag_present(skb)) {
                        proto = skb->protocol;
                } else {
                        vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
                                                    data, hlen, &_vlan);
                        if (!vlan) {
                                fdret = FLOW_DISSECT_RET_OUT_BAD;
                                break;
                        }

                        proto = vlan->h_vlan_encapsulated_proto;
                        nhoff += sizeof(*vlan);
                }

                /* Tags seen after an encapsulation was flagged are not counted */
                if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) &&
                    !(key_control->flags & FLOW_DIS_ENCAPSULATION)) {
                        struct flow_dissector_key_num_of_vlans *key_nvs;

                        key_nvs = skb_flow_dissector_target(flow_dissector,
                                                            FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
                                                            target_container);
                        key_nvs->num_of_vlans++;
                }

                /* First tag fills the VLAN key, second the CVLAN key; any
                 * further tag is skipped without being recorded.
                 */
                if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
                        dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
                } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
                        dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
                } else {
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                        break;
                }

                if (dissector_uses_key(flow_dissector, dissector_vlan)) {
                        key_vlan = skb_flow_dissector_target(flow_dissector,
                                                             dissector_vlan,
                                                             target_container);

                        if (!vlan) {
                                key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
                                key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb);
                        } else {
                                key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
                                        VLAN_VID_MASK;
                                key_vlan->vlan_priority =
                                        (ntohs(vlan->h_vlan_TCI) &
                                         VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
                        }
                        key_vlan->vlan_tpid = saved_vlan_tpid;
                        key_vlan->vlan_eth_type = proto;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;
        }
        case htons(ETH_P_PPP_SES): {
                struct {
                        struct pppoe_hdr hdr;
                        __be16 proto;
                } *hdr, _hdr;
                u16 ppp_proto;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                /* least significant bit of the most significant octet
                 * indicates if protocol field was compressed
                 */
                ppp_proto = ntohs(hdr->proto);
                if (ppp_proto & 0x0100) {
                        /* compressed: the protocol field is one byte shorter */
                        ppp_proto = ppp_proto >> 8;
                        nhoff += PPPOE_SES_HLEN - 1;
                } else {
                        nhoff += PPPOE_SES_HLEN;
                }

                /* Map the PPP protocol to an ethertype and loop, or stop */
                if (ppp_proto == PPP_IP) {
                        proto = htons(ETH_P_IP);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_IPV6) {
                        proto = htons(ETH_P_IPV6);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_MPLS_UC) {
                        proto = htons(ETH_P_MPLS_UC);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_MPLS_MC) {
                        proto = htons(ETH_P_MPLS_MC);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto_is_valid(ppp_proto)) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                } else {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_PPPOE)) {
                        struct flow_dissector_key_pppoe *key_pppoe;

                        key_pppoe = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_PPPOE,
                                                              target_container);
                        key_pppoe->session_id = hdr->hdr.sid;
                        key_pppoe->ppp_proto = htons(ppp_proto);
                        key_pppoe->type = htons(ETH_P_PPP_SES);
                }
                break;
        }
        case htons(ETH_P_TIPC): {
                struct tipc_basic_hdr *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr),
                                           data, hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_TIPC)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_TIPC,
                                                              target_container);
                        key_addrs->tipckey.key = tipc_hdr_rps_key(hdr);
                        key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC;
                }
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }

        case htons(ETH_P_MPLS_UC):
        case htons(ETH_P_MPLS_MC):
                /* One label stack entry per pass; mpls_lse is its index */
                fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
                                                target_container, data,
                                                nhoff, hlen, mpls_lse,
                                                &mpls_el);
                nhoff += sizeof(struct mpls_label);
                mpls_lse++;
                break;
        case htons(ETH_P_FCOE):
                if ((hlen - nhoff) < FCOE_HEADER_LEN) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                nhoff += FCOE_HEADER_LEN;
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;

        case htons(ETH_P_ARP):
        case htons(ETH_P_RARP):
                fdret = __skb_flow_dissect_arp(skb, flow_dissector,
                                               target_container, data,
                                               nhoff, hlen);
                break;

        case htons(ETH_P_BATMAN):
                fdret = __skb_flow_dissect_batadv(skb, key_control, data,
                                                  &proto, &nhoff, hlen, flags);
                break;

        case htons(ETH_P_1588): {
                struct ptp_header *hdr, _hdr;

                /* Only checks that a full PTP header is present */
                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
                                           hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                nhoff += sizeof(struct ptp_header);
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }

        case htons(ETH_P_PRP):
        case htons(ETH_P_HSR): {
                struct hsr_tag *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen,
                                           &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                /* Continue with the protocol carried inside the tag */
                proto = hdr->encap_proto;
                nhoff += HSR_HLEN;
                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;
        }

        case htons(ETH_P_CFM):
                fdret = __skb_flow_dissect_cfm(skb, flow_dissector,
                                               target_container, data,
                                               nhoff, hlen);
                break;

        default:
                fdret = FLOW_DISSECT_RET_OUT_BAD;
                break;
        }

        /* Process result of proto processing */
        switch (fdret) {
        case FLOW_DISSECT_RET_OUT_GOOD:
                goto out_good;
        case FLOW_DISSECT_RET_PROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto proto_again;
                /* Header limit reached: report what was gathered so far */
                goto out_good;
        case FLOW_DISSECT_RET_CONTINUE:
        case FLOW_DISSECT_RET_IPPROTO_AGAIN:
                break;
        case FLOW_DISSECT_RET_OUT_BAD:
        default:
                goto out_bad;
        }

        /* Stage 2: dispatch on the IP protocol / next header in 'ip_proto' */
ip_proto_again:
        fdret = FLOW_DISSECT_RET_CONTINUE;

        switch (ip_proto) {
        case IPPROTO_GRE:
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector,
                                               target_container, data,
                                               &proto, &nhoff, &hlen, flags);
                break;

        case NEXTHDR_HOP:
        case NEXTHDR_ROUTING:
        case NEXTHDR_DEST: {
                u8 _opthdr[2], *opthdr;

                /* These values are only treated as extension headers for IPv6 */
                if (proto != htons(ETH_P_IPV6))
                        break;

                opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
                                              data, hlen, &_opthdr);
                if (!opthdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                /* Byte 0 is the next header; skip (opthdr[1] + 1) * 8 bytes */
                ip_proto = opthdr[0];
                nhoff += (opthdr[1] + 1) << 3;

                fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
                break;
        }
        case NEXTHDR_FRAGMENT: {
                struct frag_hdr _fh, *fh;

                if (proto != htons(ETH_P_IPV6))
                        break;

                fh = __skb_header_pointer(skb, nhoff, sizeof(_fh),
                                          data, hlen, &_fh);

                if (!fh) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                key_control->flags |= FLOW_DIS_IS_FRAGMENT;

                nhoff += sizeof(_fh);
                ip_proto = fh->nexthdr;

                /* Only a first fragment is parsed further, and only when
                 * the caller set FLOW_DISSECTOR_F_PARSE_1ST_FRAG.
                 */
                if (!(fh->frag_off & htons(IP6_OFFSET))) {
                        key_control->flags |= FLOW_DIS_FIRST_FRAG;
                        if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
                                fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
                                break;
                        }
                }

                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }
        case IPPROTO_IPIP:
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                proto = htons(ETH_P_IP);

                key_control->flags |= FLOW_DIS_ENCAPSULATION;
                if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;

        case IPPROTO_IPV6:
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                proto = htons(ETH_P_IPV6);

                key_control->flags |= FLOW_DIS_ENCAPSULATION;
                if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;


        case IPPROTO_MPLS:
                proto = htons(ETH_P_MPLS_UC);
                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;

        case IPPROTO_TCP:
                __skb_flow_dissect_tcp(skb, flow_dissector, target_container,
                                       data, nhoff, hlen);
                break;

        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                __skb_flow_dissect_icmp(skb, flow_dissector, target_container,
                                        data, nhoff, hlen);
                break;
        case IPPROTO_L2TP:
                __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container,
                                          data, nhoff, hlen);
                break;
        case IPPROTO_ESP:
                __skb_flow_dissect_esp(skb, flow_dissector, target_container,
                                       data, nhoff, hlen);
                break;
        case IPPROTO_AH:
                __skb_flow_dissect_ah(skb, flow_dissector, target_container,
                                      data, nhoff, hlen);
                break;
        default:
                break;
        }

        /* Ports are extracted on every pass unless the packet is a fragment */
        if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT))
                __skb_flow_dissect_ports(skb, flow_dissector, target_container,
                                         data, nhoff, ip_proto, hlen);

        /* Process result of IP proto processing */
        switch (fdret) {
        case FLOW_DISSECT_RET_PROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto proto_again;
                break;
        case FLOW_DISSECT_RET_IPPROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto ip_proto_again;
                break;
        case FLOW_DISSECT_RET_OUT_GOOD:
        case FLOW_DISSECT_RET_CONTINUE:
                break;
        case FLOW_DISSECT_RET_OUT_BAD:
        default:
                goto out_bad;
        }

out_good:
        ret = true;

out:
        /* Both exits record the final offset and protocols; thoff is capped
         * at skb->len, or at hlen when there is no skb.
         */
        key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
        key_basic->n_proto = proto;
        key_basic->ip_proto = ip_proto;

        return ret;

out_bad:
        ret = false;
        goto out;
}
EXPORT_SYMBOL(__skb_flow_dissect);

/* Siphash key shared by the default (non-perturbed) flow hash helpers below. */
static siphash_aligned_key_t hashrnd;

/*
 * Hand hashrnd to net_get_random_once(). Every helper in this file that
 * hashes with &hashrnd calls this first.
 * NOTE(review): the "fill with random bytes on first call only" behaviour is
 * implied by the helper's name; its definition is not visible here.
 */
static __always_inline void __flow_hash_secret_init(void)
{
        net_get_random_once(&hashrnd, sizeof(hashrnd));
}

/*
 * Return the address of the first member of @flow that is fed to siphash.
 * The build-time check enforces that this member's offset is a multiple of
 * SIPHASH_ALIGNMENT.
 */
static const void *flow_keys_hash_start(const struct flow_keys *flow)
{
        const void *first_hashed;

        BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);

        first_hashed = &flow->FLOW_KEYS_HASH_START_FIELD;
        return first_hashed;
}

/*
 * Number of bytes of @flow to hash, starting at FLOW_KEYS_HASH_OFFSET.
 *
 * The result is sizeof(*flow) minus the non-hashed prefix and minus the
 * whole address union, plus only the bytes of the union member selected by
 * control.addr_type. For any other addr_type no address bytes are counted.
 */
static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
{
        size_t addr_bytes = 0;

        BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));

        switch (flow->control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                addr_bytes = sizeof(flow->addrs.v4addrs);
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                addr_bytes = sizeof(flow->addrs.v6addrs);
                break;
        case FLOW_DISSECTOR_KEY_TIPC:
                addr_bytes = sizeof(flow->addrs.tipckey);
                break;
        default:
                break;
        }

        return sizeof(*flow) -
               (FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs) - addr_bytes);
}

/*
 * Reduce the flow's source address to 32 bits: the IPv4 source address
 * as-is, ipv6_addr_hash() of the IPv6 source address, or the TIPC key.
 * Returns 0 for any other address type.
 */
__be32 flow_get_u32_src(const struct flow_keys *flow)
{
        __be32 src = 0;

        switch (flow->control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                src = flow->addrs.v4addrs.src;
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                src = (__force __be32)ipv6_addr_hash(
                        &flow->addrs.v6addrs.src);
                break;
        case FLOW_DISSECTOR_KEY_TIPC:
                src = flow->addrs.tipckey.key;
                break;
        default:
                break;
        }

        return src;
}
EXPORT_SYMBOL(flow_get_u32_src);

/*
 * Reduce the flow's destination address to 32 bits: the IPv4 destination
 * address as-is, or ipv6_addr_hash() of the IPv6 destination address.
 * Returns 0 for any other address type.
 */
__be32 flow_get_u32_dst(const struct flow_keys *flow)
{
        __be32 dst = 0;

        switch (flow->control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                dst = flow->addrs.v4addrs.dst;
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                dst = (__force __be32)ipv6_addr_hash(
                        &flow->addrs.v6addrs.dst);
                break;
        default:
                break;
        }

        return dst;
}
EXPORT_SYMBOL(flow_get_u32_dst);

/* Put an IPv4/IPv6 flow into canonical order so that both directions of a
 * connection produce the same hash: afterwards src <= dst holds for the
 * address pair and, independently, for the port pair (compared as raw
 * stored values). Flows of any other address type are left untouched.
 */
static inline void __flow_hash_consistentify(struct flow_keys *keys)
{
        int word;

        switch (keys->control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                if ((__force u32)keys->addrs.v4addrs.src >
                    (__force u32)keys->addrs.v4addrs.dst)
                        swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                if (memcmp(&keys->addrs.v6addrs.dst,
                           &keys->addrs.v6addrs.src,
                           sizeof(keys->addrs.v6addrs.dst)) >= 0)
                        break;

                /* dst sorts before src: exchange them 32 bits at a time */
                for (word = 0; word < 4; word++)
                        swap(keys->addrs.v6addrs.src.s6_addr32[word],
                             keys->addrs.v6addrs.dst.s6_addr32[word]);
                break;
        default:
                /* not an IP flow: neither addresses nor ports are reordered */
                return;
        }

        if ((__force u16)keys->ports.src > (__force u16)keys->ports.dst)
                swap(keys->ports.src, keys->ports.dst);
}

/*
 * Hash @keys with siphash under @keyval.
 *
 * @keys is modified: it is first put into direction-independent order by
 * __flow_hash_consistentify(). The siphash result is truncated to 32 bits,
 * and a truncated value of 0 is reported as 1, so this never returns 0.
 */
static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
                                        const siphash_key_t *keyval)
{
        u32 hash;

        __flow_hash_consistentify(keys);

        hash = siphash(flow_keys_hash_start(keys),
                       flow_keys_hash_length(keys), keyval);

        return hash ? hash : 1;
}

u32 flow_hash_from_keys(struct flow_keys *keys)
{
        __flow_hash_secret_init();
        return __flow_hash_from_keys(keys, &hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);

/*
 * Same as flow_hash_from_keys(), but hashes with the caller-supplied
 * siphash key @keyval instead of the file-wide hashrnd.
 */
u32 flow_hash_from_keys_seed(struct flow_keys *keys,
                             const siphash_key_t *keyval)
{
        u32 hash = __flow_hash_from_keys(keys, keyval);

        return hash;
}
EXPORT_SYMBOL(flow_hash_from_keys_seed);

/*
 * Dissect @skb into @keys (with FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) and
 * return the flow hash of the result under @keyval. The dissector's return
 * value is not checked; whatever ended up in @keys is hashed.
 */
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
                                  struct flow_keys *keys,
                                  const siphash_key_t *keyval)
{
        u32 hash;

        skb_flow_dissect_flow_keys(skb, keys,
                                   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
        hash = __flow_hash_from_keys(keys, keyval);

        return hash;
}

/*
 * Layout that make_flow_keys_digest() overlays on a struct flow_keys_digest.
 * Each field is copied from the like-named part of struct flow_keys there.
 */
struct _flow_keys_digest_data {
        __be16        n_proto;        /* from flow->basic.n_proto */
        u8        ip_proto;        /* from flow->basic.ip_proto */
        u8        padding;        /* never assigned; stays zero from the memset */
        __be32        ports;                /* from flow->ports.ports */
        __be32        src;                /* from flow->addrs.v4addrs.src */
        __be32        dst;                /* from flow->addrs.v4addrs.dst */
};

/*
 * Build a compact digest of @flow in @digest.
 *
 * @digest is zeroed in full, then its leading bytes are filled using the
 * struct _flow_keys_digest_data layout. The address fields are always read
 * from the v4addrs member of the address union, whatever addr_type is.
 */
void make_flow_keys_digest(struct flow_keys_digest *digest,
                           const struct flow_keys *flow)
{
        struct _flow_keys_digest_data *packed;

        BUILD_BUG_ON(sizeof(*packed) > sizeof(*digest));

        memset(digest, 0, sizeof(*digest));
        packed = (struct _flow_keys_digest_data *)digest;

        packed->src = flow->addrs.v4addrs.src;
        packed->dst = flow->addrs.v4addrs.dst;
        packed->ports = flow->ports.ports;
        packed->ip_proto = flow->basic.ip_proto;
        packed->n_proto = flow->basic.n_proto;
}
EXPORT_SYMBOL(make_flow_keys_digest);

static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;

u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb)
{
        struct flow_keys keys;

        __flow_hash_secret_init();

        memset(&keys, 0, sizeof(keys));
        __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric,
                           &keys, NULL, 0, 0, 0, 0);

        return __flow_hash_from_keys(&keys, &hashrnd);
}
EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net);

/**
 * __skb_get_hash_net: calculate a flow hash
 * @net: associated network namespace, derived from @skb if NULL
 * @skb: sk_buff to calculate flow hash from
 *
 * This function calculates a flow hash based on src/dst addresses
 * and src/dst port numbers.  Sets hash in skb to non-zero hash value
 * on success, zero indicates no valid hash.  Also, sets l4_hash in skb
 * if hash is a canonical 4-tuple hash over transport ports.
 */
void __skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
        struct flow_keys keys;
        u32 hash;

        memset(&keys, 0, sizeof(keys));

        __skb_flow_dissect(net, skb, &flow_keys_dissector,
                           &keys, NULL, 0, 0, 0,
                           FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

        __flow_hash_secret_init();

        hash = __flow_hash_from_keys(&keys, &hashrnd);

        __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
EXPORT_SYMBOL(__skb_get_hash_net);

__u32 skb_get_hash_perturb(const struct sk_buff *skb,
                           const siphash_key_t *perturb)
{
        struct flow_keys keys;

        return ___skb_get_hash(skb, &keys, perturb);
}
EXPORT_SYMBOL(skb_get_hash_perturb);

/*
 * Compute the payload offset: the transport header offset from @keys plus
 * the length of the L4 header for the protocols known here.
 *
 * For non-first fragments, unknown protocols, or a TCP header whose data
 * offset byte cannot be read, the transport header offset itself is returned.
 */
u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen)
{
        const u32 thoff = keys->control.thoff;
        u32 l4_hdr_len = 0;
        const u8 *doff;
        u8 _doff;

        /* skip L4 headers for fragments after the first */
        if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
            !(keys->control.flags & FLOW_DIS_FIRST_FRAG))
                return thoff;

        switch (keys->basic.ip_proto) {
        case IPPROTO_TCP:
                /* fetch only the byte at offset 12 (holds doff in its
                 * upper nibble) to avoid an unaligned access
                 */
                doff = __skb_header_pointer(skb, thoff + 12, sizeof(_doff),
                                            data, hlen, &_doff);
                if (!doff)
                        return thoff;

                /* upper nibble counts 32-bit words: nibble * 4 bytes,
                 * but never less than the fixed TCP header
                 */
                l4_hdr_len = (*doff & 0xF0) >> 2;
                if (l4_hdr_len < sizeof(struct tcphdr))
                        l4_hdr_len = sizeof(struct tcphdr);
                break;
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
                l4_hdr_len = sizeof(struct udphdr);
                break;
        /* For the rest, header extensions are deliberately not
         * accounted for; only the fixed header size is added.
         */
        case IPPROTO_ICMP:
                l4_hdr_len = sizeof(struct icmphdr);
                break;
        case IPPROTO_ICMPV6:
                l4_hdr_len = sizeof(struct icmp6hdr);
                break;
        case IPPROTO_IGMP:
                l4_hdr_len = sizeof(struct igmphdr);
                break;
        case IPPROTO_DCCP:
                l4_hdr_len = sizeof(struct dccp_hdr);
                break;
        case IPPROTO_SCTP:
                l4_hdr_len = sizeof(struct sctphdr);
                break;
        default:
                break;
        }

        return thoff + l4_hdr_len;
}

/**
 * skb_get_poff - get the offset to the payload
 * @skb: sk_buff to get the payload offset from
 *
 * Returns the offset to the payload as far as the headers could be
 * dissected, or 0 when basic flow dissection of @skb fails.  The main user
 * is currently BPF, which uses it to truncate packets so that only headers,
 * not payload, need to be pushed to user space.
 */
u32 skb_get_poff(const struct sk_buff *skb)
{
        struct flow_keys_basic keys;
        bool dissected;

        dissected = skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                                     NULL, 0, 0, 0, 0);

        return dissected ?
                __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)) : 0;
}

/*
 * Fill @keys from the IPv6 flow description @fl6 and return its flow hash.
 *
 * @keys is fully overwritten: zeroed, then populated with the addresses,
 * ports, GRE key, flow label and protocol of @fl6. Because the hash helper
 * canonicalises its input, @keys may come back with src/dst swapped.
 */
__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
{
        memset(keys, 0, sizeof(*keys));

        keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        keys->basic.ip_proto = fl6->flowi6_proto;

        keys->addrs.v6addrs.src = fl6->saddr;
        keys->addrs.v6addrs.dst = fl6->daddr;

        keys->ports.src = fl6->fl6_sport;
        keys->ports.dst = fl6->fl6_dport;

        keys->keyid.keyid = fl6->fl6_gre_key;
        keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);

        return flow_hash_from_keys(keys);
}
EXPORT_SYMBOL(__get_hash_from_flowi6);

/*
 * Key table for the default dissector: maps each dissector key id to the
 * offset of the struct flow_keys member that receives it. Registered for
 * flow_keys_dissector in init_default_flow_dissectors().
 */
static const struct flow_dissector_key flow_keys_dissector_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v4addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v6addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_TIPC,
                .offset = offsetof(struct flow_keys, addrs.tipckey),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct flow_keys, ports),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_VLAN,
                .offset = offsetof(struct flow_keys, vlan),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
                .offset = offsetof(struct flow_keys, tags),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
                .offset = offsetof(struct flow_keys, keyid),
        },
};

/*
 * Key table for the symmetric dissector: only control, basic, IPv4/IPv6
 * addresses and ports are collected (no TIPC, VLAN, flow label or GRE key).
 * Registered for flow_keys_dissector_symmetric in
 * init_default_flow_dissectors().
 */
static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v4addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v6addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct flow_keys, ports),
        },
};

/*
 * Key table for the basic dissector: control and basic keys only.
 * Registered for flow_keys_basic_dissector in init_default_flow_dissectors().
 * NOTE(review): offsets are taken against struct flow_keys; presumably
 * struct flow_keys_basic shares the same leading layout — confirm.
 */
static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
};

/* Default dissector; set up from flow_keys_dissector_keys at core_initcall time. */
struct flow_dissector flow_keys_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_dissector);

/* Control+basic-only dissector; set up from flow_keys_basic_dissector_keys. */
struct flow_dissector flow_keys_basic_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_basic_dissector);

/*
 * Initialise the three built-in dissectors (default, symmetric, basic), each
 * from its key table defined in this file. Registered as a core_initcall.
 * Always returns 0.
 */
static int __init init_default_flow_dissectors(void)
{
        skb_flow_dissector_init(&flow_keys_dissector,
                                flow_keys_dissector_keys,
                                ARRAY_SIZE(flow_keys_dissector_keys));
        skb_flow_dissector_init(&flow_keys_dissector_symmetric,
                                flow_keys_dissector_symmetric_keys,
                                ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
        skb_flow_dissector_init(&flow_keys_basic_dissector,
                                flow_keys_basic_dissector_keys,
                                ARRAY_SIZE(flow_keys_basic_dissector_keys));
        return 0;
}
core_initcall(init_default_flow_dissectors);






















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NSPROXY_H
#define _LINUX_NSPROXY_H

#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/sched.h>

struct mnt_namespace;
struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
struct cgroup_namespace;
struct fs_struct;

/*
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
 * The pid namespace is an exception -- it's accessed using
 * task_active_pid_ns.  The pid namespace here is the
 * namespace that children will use.
 *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
 *
 * The nsproxy is shared by tasks which share all namespaces.
 * As soon as a single namespace is cloned or unshared, the
 * nsproxy is copied.
 */
struct nsproxy {
        refcount_t count;        /* taken by get_nsproxy(), dropped by put_nsproxy() */
        struct uts_namespace *uts_ns;
        struct ipc_namespace *ipc_ns;
        struct mnt_namespace *mnt_ns;
        struct pid_namespace *pid_ns_for_children;        /* see note above */
        struct net              *net_ns;
        struct time_namespace *time_ns;
        /* NOTE(review): presumably the time namespace children will use,
         * by analogy with pid_ns_for_children — confirm.
         */
        struct time_namespace *time_ns_for_children;
        struct cgroup_namespace *cgroup_ns;
};
extern struct nsproxy init_nsproxy;

/*
 * A structure to encompass all bits needed to install
 * a partial or complete new set of namespaces.
 *
 * If a new user namespace is requested, cred will
 * point to a modifiable set of credentials. If a pointer
 * to a modifiable set is needed, nsset_cred() must be
 * used and its result tested for NULL.
 */
struct nsset {
        unsigned flags;                /* tested for CLONE_NEWUSER by nsset_cred() */
        struct nsproxy *nsproxy;
        struct fs_struct *fs;
        const struct cred *cred;        /* only writable when CLONE_NEWUSER is set */
};

/*
 * Return @set's credentials as a modifiable pointer, or NULL when
 * CLONE_NEWUSER is not set in @set->flags (in which case the credentials
 * must not be written through this nsset).
 */
static inline struct cred *nsset_cred(struct nsset *set)
{
        return (set->flags & CLONE_NEWUSER) ? (struct cred *)set->cred : NULL;
}

/*
 * the namespaces access rules are:
 *
 *  1. only current task is allowed to change tsk->nsproxy pointer or
 *     any pointer on the nsproxy itself.  Current must hold the task_lock
 *     when changing tsk->nsproxy.
 *
 *  2. when accessing (i.e. reading) current task's namespaces - no
 *     precautions should be taken - just dereference the pointers
 *
 *  3. the access to other task namespaces is performed like this
 *     task_lock(task);
 *     nsproxy = task->nsproxy;
 *     if (nsproxy != NULL) {
 *             / *
 *               * work with the namespaces here
 *               * e.g. get the reference on one of them
 *               * /
 *     } / *
 *         * NULL task->nsproxy means that this task is
 *         * almost dead (zombie)
 *         * /
 *     task_unlock(task);
 *
 */

int copy_namespaces(u64 flags, struct task_struct *tsk);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
int exec_task_namespaces(void);
void free_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
        struct cred *, struct fs_struct *);
int __init nsproxy_cache_init(void);

/*
 * Drop one reference on @ns; the nsproxy is handed to free_nsproxy() when
 * the count reaches zero. @ns must not be NULL.
 */
static inline void put_nsproxy(struct nsproxy *ns)
{
        if (!refcount_dec_and_test(&ns->count))
                return;

        free_nsproxy(ns);
}

/* Take one additional reference on @ns. @ns must not be NULL. */
static inline void get_nsproxy(struct nsproxy *ns)
{
        refcount_inc(&ns->count);
}

DEFINE_FREE(put_nsproxy, struct nsproxy *, if (_T) put_nsproxy(_T))

#endif




























  315 






















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UNALIGNED_H
#define __LINUX_UNALIGNED_H

/*
 * This is the most generic implementation of unaligned accesses
 * and should work almost anywhere.
 */
#include <linux/unaligned/packed_struct.h>
#include <asm/byteorder.h>
#include <vdso/unaligned.h>

#define get_unaligned(ptr)        __get_unaligned_t(typeof(*(ptr)), (ptr))
#define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr))

static inline u16 get_unaligned_le16(const void *p)
{
        return le16_to_cpu(__get_unaligned_t(__le16, p));
}

static inline u32 get_unaligned_le32(const void *p)
{
        return le32_to_cpu(__get_unaligned_t(__le32, p));
}

static inline u64 get_unaligned_le64(const void *p)
{
        return le64_to_cpu(__get_unaligned_t(__le64, p));
}

/* Store @val as a little-endian 16-bit value at a possibly unaligned @p. */
static inline void put_unaligned_le16(u16 val, void *p)
{
        const __le16 wire = cpu_to_le16(val);

        __put_unaligned_t(__le16, wire, p);
}

/* Store @val as a little-endian 32-bit value at a possibly unaligned @p. */
static inline void put_unaligned_le32(u32 val, void *p)
{
        const __le32 wire = cpu_to_le32(val);

        __put_unaligned_t(__le32, wire, p);
}

/* Store @val as a little-endian 64-bit value at a possibly unaligned @p. */
static inline void put_unaligned_le64(u64 val, void *p)
{
        const __le64 wire = cpu_to_le64(val);

        __put_unaligned_t(__le64, wire, p);
}

static inline u16 get_unaligned_be16(const void *p)
{
        return be16_to_cpu(__get_unaligned_t(__be16, p));
}

static inline u32 get_unaligned_be32(const void *p)
{
        return be32_to_cpu(__get_unaligned_t(__be32, p));
}

static inline u64 get_unaligned_be64(const void *p)
{
        return be64_to_cpu(__get_unaligned_t(__be64, p));
}

/* Store @val as a big-endian 16-bit value at a possibly unaligned @p. */
static inline void put_unaligned_be16(u16 val, void *p)
{
        const __be16 wire = cpu_to_be16(val);

        __put_unaligned_t(__be16, wire, p);
}

/* Store @val as a big-endian 32-bit value at a possibly unaligned @p. */
static inline void put_unaligned_be32(u32 val, void *p)
{
        const __be32 wire = cpu_to_be32(val);

        __put_unaligned_t(__be32, wire, p);
}

/* Store @val as a big-endian 64-bit value at a possibly unaligned @p. */
static inline void put_unaligned_be64(u64 val, void *p)
{
        const __be64 wire = cpu_to_be64(val);

        __put_unaligned_t(__be64, wire, p);
}

/* Assemble a 24-bit value from three bytes at @p, most significant byte first. */
static inline u32 __get_unaligned_be24(const u8 *p)
{
        u32 v = p[0];

        v = (v << 8) | p[1];
        v = (v << 8) | p[2];

        return v;
}

static inline u32 get_unaligned_be24(const void *p)
{
        return __get_unaligned_be24(p);
}

/* Assemble a 24-bit value from three bytes at @p, least significant byte first. */
static inline u32 __get_unaligned_le24(const u8 *p)
{
        u32 v = p[2];

        v = (v << 8) | p[1];
        v = (v << 8) | p[0];

        return v;
}

static inline u32 get_unaligned_le24(const void *p)
{
        return __get_unaligned_le24(p);
}

/*
 * Write the low 24 bits of @val to the three bytes at @p, most significant
 * byte first. Bits 24..31 of @val are ignored.
 */
static inline void __put_unaligned_be24(const u32 val, u8 *p)
{
        p[0] = (val >> 16) & 0xff;
        p[1] = (val >> 8) & 0xff;
        p[2] = val & 0xff;
}

static inline void put_unaligned_be24(const u32 val, void *p)
{
        __put_unaligned_be24(val, p);
}

/*
 * Write the low 24 bits of @val to the three bytes at @p, least significant
 * byte first. Bits 24..31 of @val are ignored.
 */
static inline void __put_unaligned_le24(const u32 val, u8 *p)
{
        p[0] = val & 0xff;
        p[1] = (val >> 8) & 0xff;
        p[2] = (val >> 16) & 0xff;
}

static inline void put_unaligned_le24(const u32 val, void *p)
{
        __put_unaligned_le24(val, p);
}

/*
 * Write the low 48 bits of @val to the six bytes at @p, most significant
 * byte first. Bits 48..63 of @val are ignored.
 */
static inline void __put_unaligned_be48(const u64 val, u8 *p)
{
        int shift;

        for (shift = 40; shift >= 0; shift -= 8)
                *p++ = (val >> shift) & 0xff;
}

static inline void put_unaligned_be48(const u64 val, void *p)
{
        __put_unaligned_be48(val, p);
}

/* Assemble a 48-bit value from six bytes at @p, most significant byte first. */
static inline u64 __get_unaligned_be48(const u8 *p)
{
        u64 v = 0;
        int i;

        for (i = 0; i < 6; i++)
                v = (v << 8) | p[i];

        return v;
}

static inline u64 get_unaligned_be48(const void *p)
{
        return __get_unaligned_be48(p);
}

#endif /* __LINUX_UNALIGNED_H */




































  318 





  313 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_COMMON_H
#define _NF_CONNTRACK_COMMON_H

#include <linux/refcount.h>
#include <uapi/linux/netfilter/nf_conntrack_common.h>

/*
 * Set of conntrack statistics counters, one unsigned int per event kind.
 * The code that increments them is not part of this header; each field's
 * meaning is given by its name only.
 */
struct ip_conntrack_stat {
        unsigned int found;
        unsigned int invalid;
        unsigned int insert;
        unsigned int insert_failed;
        unsigned int clash_resolve;
        unsigned int drop;
        unsigned int early_drop;
        unsigned int error;
        unsigned int expect_new;
        unsigned int expect_create;
        unsigned int expect_delete;
        unsigned int search_restart;
        unsigned int chaintoolong;
};

#define NFCT_INFOMASK        7UL
#define NFCT_PTRMASK        ~(NFCT_INFOMASK)

/*
 * Reference-counted handle: nf_conntrack_get() increments 'use',
 * nf_conntrack_put() decrements it and calls nf_conntrack_destroy() at zero.
 */
struct nf_conntrack {
        refcount_t use;
};

void nf_conntrack_destroy(struct nf_conntrack *nfct);

/*
 * Like nf_ct_put, but without module dependency on nf_conntrack.
 * Drops one reference on @nfct and destroys it when that was the last one.
 * A NULL @nfct is ignored.
 */
static inline void nf_conntrack_put(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        if (refcount_dec_and_test(&nfct->use))
                nf_conntrack_destroy(nfct);
}
/* Take one reference on @nfct. A NULL @nfct is ignored. */
static inline void nf_conntrack_get(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        refcount_inc(&nfct->use);
}

#endif /* _NF_CONNTRACK_COMMON_H */




































































































































































    9 




    9 





















    9 
    9 
    9 




    9 










    9 




    9 
    9 












































    9 







    9 



    9 
    9 



    9 

    9 















    9 







    9 



























    9 










    9 
    9 













    9 


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions
 *
 * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
 * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
 * Copyright 2025 Google LLC
 */

#include <crypto/hmac.h>
#include <crypto/sha2.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/overflow.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>

/* Initial chaining value installed by sha384_init(): the eight SHA384_H* words. */
static const struct sha512_block_state sha384_iv = {
        .h = {
                SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
                SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7,
        },
};

/* Initial chaining value installed by sha512_init(): the eight SHA512_H* words. */
static const struct sha512_block_state sha512_iv = {
        .h = {
                SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
                SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7,
        },
};

/*
 * Per-round additive constants: sha512_block_generic() adds sha512_K[i] in
 * round i, for each of its 80 rounds.
 */
static const u64 sha512_K[80] = {
        0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
        0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
        0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
        0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
        0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
        0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
        0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
        0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
        0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
        0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
        0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
        0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
        0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
        0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
        0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
        0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
        0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
        0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
        0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
        0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
        0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
        0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
        0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
        0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
        0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
        0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
        0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};

/*
 * Bitwise helpers for the compression function, all on 64-bit words:
 *   Ch  - per bit, selects y where x is 1 and z where x is 0
 *   Maj - per bit, the majority value of x, y and z
 *   e0, e1 - XOR of three right-rotations of x (used on the working variables)
 *   s0, s1 - XOR of two right-rotations and one right-shift of x
 *            (used to extend the message schedule)
 */
#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
#define e0(x) (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39))
#define e1(x) (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41))
#define s0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7))
#define s1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6))

/*
 * Portable C compression function: mix one message block into @state.
 *
 * @state: chaining value, updated in place
 * @data:  the block; 16 big-endian 64-bit words (128 bytes) are read from
 *         it, with no alignment requirement
 *
 * Runs 80 rounds, eight per loop iteration. W[] holds the message schedule
 * as a 16-word circular buffer that is regenerated in place every 16 rounds.
 */
static void sha512_block_generic(struct sha512_block_state *state,
                                 const u8 *data)
{
        u64 a = state->h[0];
        u64 b = state->h[1];
        u64 c = state->h[2];
        u64 d = state->h[3];
        u64 e = state->h[4];
        u64 f = state->h[5];
        u64 g = state->h[6];
        u64 h = state->h[7];
        u64 t1, t2;
        u64 W[16];

        /* Schedule words for rounds 0..15 come straight from the block. */
        for (int j = 0; j < 16; j++)
                W[j] = get_unaligned_be64(data + j * sizeof(u64));

        for (int i = 0; i < 80; i += 8) {
                /*
                 * At rounds 16, 32, 48 and 64, replace all 16 schedule words
                 * with the next 16. Indices are taken mod 16, so (j - 2),
                 * (j - 7) and (j - 15) wrap around the circular buffer.
                 */
                if ((i & 15) == 0 && i != 0) {
                        for (int j = 0; j < 16; j++) {
                                W[j & 15] += s1(W[(j - 2) & 15]) +
                                             W[(j - 7) & 15] +
                                             s0(W[(j - 15) & 15]);
                        }
                }
                /*
                 * Eight rounds, unrolled. Instead of shifting a..h down one
                 * slot after each round, every round below uses the variables
                 * in a rotated role; after eight rounds the roles line up
                 * with the names again.
                 */
                t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i]   + W[(i & 15)];
                t2 = e0(a) + Maj(a, b, c);    d += t1;    h = t1 + t2;
                t1 = g + e1(d) + Ch(d, e, f) + sha512_K[i+1] + W[(i & 15) + 1];
                t2 = e0(h) + Maj(h, a, b);    c += t1;    g = t1 + t2;
                t1 = f + e1(c) + Ch(c, d, e) + sha512_K[i+2] + W[(i & 15) + 2];
                t2 = e0(g) + Maj(g, h, a);    b += t1;    f = t1 + t2;
                t1 = e + e1(b) + Ch(b, c, d) + sha512_K[i+3] + W[(i & 15) + 3];
                t2 = e0(f) + Maj(f, g, h);    a += t1;    e = t1 + t2;
                t1 = d + e1(a) + Ch(a, b, c) + sha512_K[i+4] + W[(i & 15) + 4];
                t2 = e0(e) + Maj(e, f, g);    h += t1;    d = t1 + t2;
                t1 = c + e1(h) + Ch(h, a, b) + sha512_K[i+5] + W[(i & 15) + 5];
                t2 = e0(d) + Maj(d, e, f);    g += t1;    c = t1 + t2;
                t1 = b + e1(g) + Ch(g, h, a) + sha512_K[i+6] + W[(i & 15) + 6];
                t2 = e0(c) + Maj(c, d, e);    f += t1;    b = t1 + t2;
                t1 = a + e1(f) + Ch(f, g, h) + sha512_K[i+7] + W[(i & 15) + 7];
                t2 = e0(b) + Maj(b, c, d);    e += t1;    a = t1 + t2;
        }

        /* Feed-forward: add the working variables into the chaining value. */
        state->h[0] += a;
        state->h[1] += b;
        state->h[2] += c;
        state->h[3] += d;
        state->h[4] += e;
        state->h[5] += f;
        state->h[6] += g;
        state->h[7] += h;
}

/*
 * Compress @nblocks consecutive SHA512_BLOCK_SIZE-byte blocks starting at
 * @data into @state.
 *
 * The first block is processed before the count is examined, so callers
 * must pass @nblocks >= 1.
 */
static void __maybe_unused
sha512_blocks_generic(struct sha512_block_state *state,
                      const u8 *data, size_t nblocks)
{
        const u8 *block = data;
        size_t remaining = nblocks;

        for (;;) {
                sha512_block_generic(state, block);
                block += SHA512_BLOCK_SIZE;
                if (--remaining == 0)
                        break;
        }
}

#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH
#include "sha512.h" /* $(SRCARCH)/sha512.h */
#else
#define sha512_blocks sha512_blocks_generic
#endif

/*
 * Common context setup: install the chaining value @iv and start the byte
 * counter at @initial_bytecount (low word), with the high word cleared.
 */
static void __sha512_init(struct __sha512_ctx *ctx,
                          const struct sha512_block_state *iv,
                          u64 initial_bytecount)
{
        ctx->bytecount_hi = 0;
        ctx->bytecount_lo = initial_bytecount;
        ctx->state = *iv;
}

/* Start a new SHA-384 computation in @ctx: sha384_iv, byte count zero. */
void sha384_init(struct sha384_ctx *ctx)
{
        __sha512_init(&ctx->ctx, &sha384_iv, 0);
}
EXPORT_SYMBOL_GPL(sha384_init);

/* Start a new SHA-512 computation in @ctx: sha512_iv, byte count zero. */
void sha512_init(struct sha512_ctx *ctx)
{
        __sha512_init(&ctx->ctx, &sha512_iv, 0);
}
EXPORT_SYMBOL_GPL(sha512_init);

/*
 * Absorb @len bytes from @data into the hash context @ctx.
 *
 * Input is hashed in SHA512_BLOCK_SIZE units. Bytes that do not complete a
 * block are kept in ctx->buf and picked up by a later call (or by the
 * finalization step).
 */
void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len)
{
        /* Bytes already sitting in ctx->buf from earlier calls. */
        size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;

        /* Advance the byte counter; carry into the high word on wraparound. */
        if (check_add_overflow(ctx->bytecount_lo, len, &ctx->bytecount_lo))
                ctx->bytecount_hi++;

        /* Hash only if buffered + new bytes add up to at least one block. */
        if (partial + len >= SHA512_BLOCK_SIZE) {
                size_t nblocks;

                if (partial) {
                        /* Bytes needed to fill the buffered block. */
                        size_t l = SHA512_BLOCK_SIZE - partial;

                        memcpy(&ctx->buf[partial], data, l);
                        data += l;
                        len -= l;

                        sha512_blocks(&ctx->state, ctx->buf, 1);
                }

                /* Whole blocks left in the caller's data vs. trailing bytes. */
                nblocks = len / SHA512_BLOCK_SIZE;
                len %= SHA512_BLOCK_SIZE;

                if (nblocks) {
                        /* Hash straight from @data, without copying. */
                        sha512_blocks(&ctx->state, data, nblocks);
                        data += nblocks * SHA512_BLOCK_SIZE;
                }
                /* The buffer was consumed; leftovers go at its start. */
                partial = 0;
        }
        /* Keep the trailing bytes behind whatever is already buffered. */
        if (len)
                memcpy(&ctx->buf[partial], data, len);
}
EXPORT_SYMBOL_GPL(__sha512_update);

/*
 * Finish the hash in @ctx: append the padding and the message bit length,
 * process the last block(s), and write the first @digest_size bytes of the
 * resulting state to @out as big-endian 64-bit words.
 *
 * The output loop stores whole 8-byte words, so @digest_size is expected to
 * be a multiple of 8. The context is not wiped here.
 */
static void __sha512_final(struct __sha512_ctx *ctx,
                           u8 *out, size_t digest_size)
{
        /* Two-word bit count = two-word byte count * 8. */
        u64 bitcount_hi = (ctx->bytecount_hi << 3) | (ctx->bytecount_lo >> 61);
        u64 bitcount_lo = ctx->bytecount_lo << 3;
        /* Bytes currently buffered in ctx->buf. */
        size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;

        /* Padding starts with a single 0x80 byte. */
        ctx->buf[partial++] = 0x80;
        if (partial > SHA512_BLOCK_SIZE - 16) {
                /*
                 * No room left for the 16-byte length field: zero-fill and
                 * hash this block, then continue in an empty one.
                 */
                memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - partial);
                sha512_blocks(&ctx->state, ctx->buf, 1);
                partial = 0;
        }
        /* Zero-pad up to the length field, then store it big-endian. */
        memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - 16 - partial);
        *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 16] = cpu_to_be64(bitcount_hi);
        *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 8] = cpu_to_be64(bitcount_lo);
        sha512_blocks(&ctx->state, ctx->buf, 1);

        /* Emit the leading state words; @out may be unaligned. */
        for (size_t i = 0; i < digest_size; i += 8)
                put_unaligned_be64(ctx->state.h[i / 8], out + i);
}

/*
 * Complete a SHA-384 computation: store the SHA384_DIGEST_SIZE-byte digest
 * in @out, then wipe the entire context.
 */
void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE])
{
        struct __sha512_ctx *inner = &ctx->ctx;

        __sha512_final(inner, out, SHA384_DIGEST_SIZE);

        /* Clear all of *ctx, not just the inner state. */
        memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(sha384_final);

/*
 * Complete a SHA-512 computation: store the SHA512_DIGEST_SIZE-byte digest
 * in @out, then wipe the entire context.
 */
void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE])
{
        struct __sha512_ctx *inner = &ctx->ctx;

        __sha512_final(inner, out, SHA512_DIGEST_SIZE);

        /* Clear all of *ctx, not just the inner state. */
        memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(sha512_final);

/*
 * One-shot SHA-384: hash the @len bytes at @data and write the digest to
 * @out, using a context that lives on the stack for the duration of the call.
 */
void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE])
{
        struct sha384_ctx one_shot;

        sha384_init(&one_shot);
        sha384_update(&one_shot, data, len);
        sha384_final(&one_shot, out);
}
EXPORT_SYMBOL_GPL(sha384);

/*
 * One-shot SHA-512: hash the @len bytes at @data and write the digest to
 * @out, using a context that lives on the stack for the duration of the call.
 */
void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE])
{
        struct sha512_ctx one_shot;

        sha512_init(&one_shot);
        sha512_update(&one_shot, data, len);
        sha512_final(&one_shot, out);
}
EXPORT_SYMBOL_GPL(sha512);

/*
 * Derive the two precomputed HMAC hash states from a raw key.
 *
 * @istate:      receives @iv advanced over the key block XORed with the ipad
 * @ostate:      receives @iv advanced over the key block XORed with the opad
 * @raw_key:     raw key bytes
 * @raw_key_len: length of @raw_key in bytes
 * @iv:          initial hash state; comparing it against &sha384_iv also
 *               selects which hash shortens an over-long key
 */
static void __hmac_sha512_preparekey(struct sha512_block_state *istate,
                                     struct sha512_block_state *ostate,
                                     const u8 *raw_key, size_t raw_key_len,
                                     const struct sha512_block_state *iv)
{
        /*
         * One block worth of key material, zero-initialized so that short
         * keys end up zero-padded. The union allows the padding XOR below to
         * work one unsigned long at a time.
         */
        union {
                u8 b[SHA512_BLOCK_SIZE];
                unsigned long w[SHA512_BLOCK_SIZE / sizeof(unsigned long)];
        } derived_key = { 0 };

        if (unlikely(raw_key_len > SHA512_BLOCK_SIZE)) {
                /* Keys longer than a block are replaced by their digest. */
                if (iv == &sha384_iv)
                        sha384(raw_key, raw_key_len, derived_key.b);
                else
                        sha512(raw_key, raw_key_len, derived_key.b);
        } else {
                memcpy(derived_key.b, raw_key, raw_key_len);
        }

        /* Inner state: hash (key ^ ipad) as a single block starting at @iv. */
        for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
                derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
        *istate = *iv;
        sha512_blocks(istate, derived_key.b, 1);

        /*
         * XORing with (opad ^ ipad) cancels the ipad applied above and
         * applies the opad in the same pass, leaving key ^ opad.
         */
        for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
                derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
                                                HMAC_IPAD_VALUE);
        *ostate = *iv;
        sha512_blocks(ostate, derived_key.b, 1);

        /* Do not leave key-derived bytes behind on the stack. */
        memzero_explicit(&derived_key, sizeof(derived_key));
}

/*
 * Fill @key with the HMAC-SHA384 inner and outer states derived from the
 * @raw_key_len bytes at @raw_key.
 */
void hmac_sha384_preparekey(struct hmac_sha384_key *key,
                            const u8 *raw_key, size_t raw_key_len)
{
        struct sha512_block_state *inner = &key->key.istate;
        struct sha512_block_state *outer = &key->key.ostate;

        __hmac_sha512_preparekey(inner, outer, raw_key, raw_key_len,
                                 &sha384_iv);
}
EXPORT_SYMBOL_GPL(hmac_sha384_preparekey);

/*
 * Fill @key with the HMAC-SHA512 inner and outer states derived from the
 * @raw_key_len bytes at @raw_key.
 */
void hmac_sha512_preparekey(struct hmac_sha512_key *key,
                            const u8 *raw_key, size_t raw_key_len)
{
        struct sha512_block_state *inner = &key->key.istate;
        struct sha512_block_state *outer = &key->key.ostate;

        __hmac_sha512_preparekey(inner, outer, raw_key, raw_key_len,
                                 &sha512_iv);
}
EXPORT_SYMBOL_GPL(hmac_sha512_preparekey);

/*
 * Start an HMAC computation in @ctx from a prepared @key.
 *
 * The outer state is copied for use at finalization. The inner hash resumes
 * from key->istate with its byte counter set to SHA512_BLOCK_SIZE, because
 * that state already covers one hashed block.
 */
void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx,
                        const struct __hmac_sha512_key *key)
{
        ctx->ostate = key->ostate;
        __sha512_init(&ctx->sha_ctx, &key->istate, SHA512_BLOCK_SIZE);
}
EXPORT_SYMBOL_GPL(__hmac_sha512_init);

void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx,
                                  const u8 *raw_key, size_t raw_key_len)
{
        __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
                                 raw_key, raw_key_len, &sha384_iv);
        ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
        ctx->ctx.sha_ctx.bytecount_hi = 0;
}
EXPORT_SYMBOL_GPL(hmac_sha384_init_usingrawkey);

void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx,
                                  const u8 *raw_key, size_t raw_key_len)
{
        __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
                                 raw_key, raw_key_len, &sha512_iv);
        ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
        ctx->ctx.sha_ctx.bytecount_hi = 0;
}
EXPORT_SYMBOL_GPL(hmac_sha512_init_usingrawkey);

/*
 * Finish an HMAC computation: finalize the inner hash, run the outer hash
 * over the inner digest, write @digest_size bytes of the result to @out,
 * and wipe @ctx.
 *
 * The inner digest plus its padding is assembled in ctx->sha_ctx.buf, so the
 * outer hash is a single block operation on ctx->ostate.
 */
static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx,
                                u8 *out, size_t digest_size)
{
        /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
        __sha512_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size);
        /* Zero everything after the inner digest, then set the 0x80 marker. */
        memset(&ctx->sha_ctx.buf[digest_size], 0,
               SHA512_BLOCK_SIZE - digest_size);
        ctx->sha_ctx.buf[digest_size] = 0x80;
        /*
         * Outer message length in bits: one block already folded into ostate
         * plus the inner digest. It fits in the last 4 bytes; the rest of
         * the length field was zeroed by the memset above.
         */
        *(__be32 *)&ctx->sha_ctx.buf[SHA512_BLOCK_SIZE - 4] =
                cpu_to_be32(8 * (SHA512_BLOCK_SIZE + digest_size));

        /* Compute the outer hash, which gives the HMAC value. */
        sha512_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
        for (size_t i = 0; i < digest_size; i += 8)
                put_unaligned_be64(ctx->ostate.h[i / 8], out + i);

        /* Clear both hash states and the buffered data. */
        memzero_explicit(ctx, sizeof(*ctx));
}

/*
 * Complete an HMAC-SHA384 computation and write the SHA384_DIGEST_SIZE-byte
 * MAC to @out. The shared finalizer also wipes the inner context.
 */
void hmac_sha384_final(struct hmac_sha384_ctx *ctx,
                       u8 out[SHA384_DIGEST_SIZE])
{
        struct __hmac_sha512_ctx *hctx = &ctx->ctx;

        __hmac_sha512_final(hctx, out, SHA384_DIGEST_SIZE);
}
EXPORT_SYMBOL_GPL(hmac_sha384_final);

/*
 * Complete an HMAC-SHA512 computation and write the SHA512_DIGEST_SIZE-byte
 * MAC to @out. The shared finalizer also wipes the inner context.
 */
void hmac_sha512_final(struct hmac_sha512_ctx *ctx,
                       u8 out[SHA512_DIGEST_SIZE])
{
        struct __hmac_sha512_ctx *hctx = &ctx->ctx;

        __hmac_sha512_final(hctx, out, SHA512_DIGEST_SIZE);
}
EXPORT_SYMBOL_GPL(hmac_sha512_final);

/*
 * One-shot HMAC-SHA384 with a prepared @key: authenticate the @data_len
 * bytes at @data and write the MAC to @out.
 */
void hmac_sha384(const struct hmac_sha384_key *key,
                 const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE])
{
        struct hmac_sha384_ctx mac_ctx;

        hmac_sha384_init(&mac_ctx, key);
        hmac_sha384_update(&mac_ctx, data, data_len);
        hmac_sha384_final(&mac_ctx, out);
}
EXPORT_SYMBOL_GPL(hmac_sha384);

/*
 * One-shot HMAC-SHA512 with a prepared @key: authenticate the @data_len
 * bytes at @data and write the MAC to @out.
 */
void hmac_sha512(const struct hmac_sha512_key *key,
                 const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE])
{
        struct hmac_sha512_ctx mac_ctx;

        hmac_sha512_init(&mac_ctx, key);
        hmac_sha512_update(&mac_ctx, data, data_len);
        hmac_sha512_final(&mac_ctx, out);
}
EXPORT_SYMBOL_GPL(hmac_sha512);

/*
 * One-shot HMAC-SHA384 from a raw key: derive the key states, authenticate
 * the @data_len bytes at @data, and write the MAC to @out.
 */
void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
                             const u8 *data, size_t data_len,
                             u8 out[SHA384_DIGEST_SIZE])
{
        struct hmac_sha384_ctx mac_ctx;

        hmac_sha384_init_usingrawkey(&mac_ctx, raw_key, raw_key_len);
        hmac_sha384_update(&mac_ctx, data, data_len);
        hmac_sha384_final(&mac_ctx, out);
}
EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey);

/*
 * One-shot HMAC-SHA512 from a raw key: derive the key states, authenticate
 * the @data_len bytes at @data, and write the MAC to @out.
 */
void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
                             const u8 *data, size_t data_len,
                             u8 out[SHA512_DIGEST_SIZE])
{
        struct hmac_sha512_ctx mac_ctx;

        hmac_sha512_init_usingrawkey(&mac_ctx, raw_key, raw_key_len);
        hmac_sha512_update(&mac_ctx, data, data_len);
        hmac_sha512_final(&mac_ctx, out);
}
EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey);

/*
 * Init/exit hooks, built only when sha512_mod_init_arch is defined as a
 * macro (presumably by the arch-specific sha512.h - confirm).
 */
#ifdef sha512_mod_init_arch
/* Run the arch-specific setup once, registered as a subsys_initcall. */
static int __init sha512_mod_init(void)
{
        sha512_mod_init_arch();
        return 0;
}
subsys_initcall(sha512_mod_init);

/*
 * Nothing to undo on exit.
 * NOTE(review): the empty handler presumably exists so that a modular build
 * can be unloaded - confirm.
 */
static void __exit sha512_mod_exit(void)
{
}
module_exit(sha512_mod_exit);
#endif

MODULE_DESCRIPTION("SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions");
MODULE_LICENSE("GPL");







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_H
#define _LINUX_PSI_H

#include <linux/jump_label.h>
#include <linux/psi_types.h>
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/cgroup-defs.h>
#include <linux/cgroup.h>

struct seq_file;
struct css_set;

#ifdef CONFIG_PSI

extern struct static_key_false psi_disabled;
extern struct psi_group psi_system;

void psi_init(void);

void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);

int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
                                       enum psi_res res, struct file *file,
                                       struct kernfs_open_file *of);
void psi_trigger_destroy(struct psi_trigger *t);

__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
                        poll_table *wait);

#ifdef CONFIG_CGROUPS
/*
 * Return the psi_group to use for @cgrp: the global psi_system when the
 * cgroup's inode number is 1 (presumably the root cgroup), otherwise the
 * cgroup's own psi pointer.
 */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
        if (cgroup_ino(cgrp) == 1)
                return &psi_system;

        return cgrp->psi;
}

int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
void psi_cgroup_restart(struct psi_group *group);
#endif

#else /* CONFIG_PSI */

/* CONFIG_PSI disabled: initialization is a no-op. */
static inline void psi_init(void) {}

/* CONFIG_PSI disabled: memstall sections do nothing and leave @flags untouched. */
static inline void psi_memstall_enter(unsigned long *flags) {}
static inline void psi_memstall_leave(unsigned long *flags) {}

#ifdef CONFIG_CGROUPS
/* CONFIG_PSI disabled: nothing to allocate for @cgrp; always returns 0. */
static inline int psi_cgroup_alloc(struct cgroup *cgrp)
{
        return 0;
}
/* CONFIG_PSI disabled: nothing to free for @cgrp. */
static inline void psi_cgroup_free(struct cgroup *cgrp)
{
}
/*
 * CONFIG_PSI disabled: moving task @p only publishes the new css_set @to in
 * p->cgroups via rcu_assign_pointer(); nothing else is done here.
 */
static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
        rcu_assign_pointer(p->cgroups, to);
}
/* CONFIG_PSI disabled: restarting a group is a no-op. */
static inline void psi_cgroup_restart(struct psi_group *group) {}
#endif

#endif /* CONFIG_PSI */

#endif /* _LINUX_PSI_H */



































































































































































































































































































































































































   42 











   42 




   42 





































































































































































































































































































































































































































































   41 








   42 

   42 

   42 







































































































































  109 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel timekeeping code and accessor functions. Based on code from
 *  timer.c, moved in commit 8524070b7982.
 */
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kobject.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
#include <linux/random.h>

#include <vdso/auxclock.h>

#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"

#define TK_CLEAR_NTP                (1 << 0)
#define TK_CLOCK_WAS_SET        (1 << 1)

#define TK_UPDATE_ALL                (TK_CLEAR_NTP | TK_CLOCK_WAS_SET)

/* Reason for a timekeeper update: a regular tick or a direct frequency change. */
enum timekeeping_adv_mode {
        /* Update timekeeper when a tick has passed */
        TK_ADV_TICK,

        /* Update timekeeper on a direct frequency change */
        TK_ADV_FREQ
};

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 *
 * One instance exists per timekeeper ID (see timekeeper_data[] below):
 *  - seq:               sequence counter (raw-spinlock-associated type)
 *  - timekeeper:        the timekeeper state
 *  - shadow_timekeeper: second timekeeper instance
 *                       NOTE(review): presumably a staging copy used while
 *                       updating - confirm against the update code
 *  - lock:              raw spinlock; for the core instance it is the lock
 *                       taken by timekeeper_lock_irqsave()
 */
struct tk_data {
        seqcount_raw_spinlock_t        seq;
        struct timekeeper        timekeeper;
        struct timekeeper        shadow_timekeeper;
        raw_spinlock_t                lock;
} ____cacheline_aligned;

static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];

/* The core timekeeper */
#define tk_core                (timekeeper_data[TIMEKEEPER_CORE])

#ifdef CONFIG_POSIX_AUX_CLOCKS
/*
 * Read the auxiliary clock that belongs to timekeeper ID @tkid into @ts.
 * The ID is turned into a clock ID by offsetting it from
 * TIMEKEEPER_AUX_FIRST onto CLOCK_AUX; the result of ktime_get_aux_ts64()
 * is returned unchanged.
 */
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
        return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
}

/*
 * Report whether @tk is an auxiliary timekeeper, i.e. whether its ID lies in
 * the inclusive range TIMEKEEPER_AUX_FIRST..TIMEKEEPER_AUX_LAST.
 */
static inline bool tk_is_aux(const struct timekeeper *tk)
{
        if (tk->id < TIMEKEEPER_AUX_FIRST)
                return false;

        return tk->id <= TIMEKEEPER_AUX_LAST;
}
#else
/* Without CONFIG_POSIX_AUX_CLOCKS: no aux clock to read; @ts is not written. */
static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
{
        return false;
}

/* Without CONFIG_POSIX_AUX_CLOCKS: no timekeeper is an auxiliary one. */
static inline bool tk_is_aux(const struct timekeeper *tk)
{
        return false;
}
#endif

/*
 * Record a new auxiliary offset @offs in @tk, keeping both representations
 * in step: the ktime_t value and its timespec64 conversion.
 */
static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
{
        tk->monotonic_to_aux = ktime_to_timespec64(offs);
        tk->offs_aux = offs;
}

/* Flag indicating whether timekeeping is currently suspended */
int __read_mostly timekeeping_suspended;

/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:        Sequence counter for protecting updates. The lowest bit
 *                is the index for the tk_read_base array
 * @base:        tk_read_base array. Access is indexed by the lowest bit of
 *                @seq.
 *
 * See update_fast_timekeeper() below.
 */
struct tk_fast {
        seqcount_latch_t        seq;
        struct tk_read_base        base[2];
};

/* Suspend-time cycles value for halted fast timekeeper. */
static u64 cycles_at_suspend;

/*
 * Read callback of the placeholder clocksource: while timekeeping is
 * suspended it returns the saved cycles_at_suspend value, otherwise the
 * current local_clock() reading. @cs is unused.
 */
static u64 dummy_clock_read(struct clocksource *cs)
{
        return timekeeping_suspended ? cycles_at_suspend : local_clock();
}

/*
 * Placeholder clocksource for the fast timekeepers. Only the read callback
 * is populated. It is referenced by FAST_TK_INIT and installed by
 * halt_fast_timekeeper().
 */
static struct clocksource dummy_clock = {
        .read = dummy_clock_read,
};

/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * already returns nanoseconds, so no conversion is required, hence mult=1
 * and shift=0. Once the first proper clocksource is installed, the fast
 * timekeepers are updated with the correct values.
 */
#define FAST_TK_INIT                                                \
        {                                                        \
                .clock                = &dummy_clock,                        \
                .mask                = CLOCKSOURCE_MASK(64),                \
                .mult                = 1,                                \
                .shift                = 0,                                \
        }

/*
 * Fast timekeeper behind ktime_get_mono_fast_ns() and
 * ktime_get_real_fast_ns(). Both latch copies start out with the boot time
 * FAST_TK_INIT setup.
 */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

/*
 * Fast timekeeper behind ktime_get_raw_fast_ns(). Both latch copies start
 * out with the boot time FAST_TK_INIT setup.
 */
static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

/*
 * Auxiliary clock hooks. With CONFIG_POSIX_AUX_CLOCKS enabled these are
 * forward declarations only. Otherwise they are empty inline stubs, so
 * the call sites need no #ifdef of their own.
 */
#ifdef CONFIG_POSIX_AUX_CLOCKS
static __init void tk_aux_setup(void);
static void tk_aux_update_clocksource(void);
static void tk_aux_advance(void);
#else
static inline void tk_aux_setup(void) { }
static inline void tk_aux_update_clocksource(void) { }
static inline void tk_aux_advance(void) { }
#endif

/*
 * Take the lock of the core timekeeper via raw_spin_lock_irqsave().
 *
 * Returns the saved interrupt flags, which have to be handed back to
 * timekeeper_unlock_irqrestore().
 */
unsigned long timekeeper_lock_irqsave(void)
{
        unsigned long irqflags;

        raw_spin_lock_irqsave(&tk_core.lock, irqflags);

        return irqflags;
}

/*
 * Release the lock of the core timekeeper.
 * @flags is the value returned by the matching timekeeper_lock_irqsave().
 */
void timekeeper_unlock_irqrestore(unsigned long flags)
{
        raw_spin_unlock_irqrestore(&tk_core.lock, flags);
}

/*
 * Multigrain timestamps require tracking the latest fine-grained timestamp
 * that has been issued, and never returning a coarse-grained timestamp that is
 * earlier than that value.
 *
 * mg_floor represents the latest fine-grained time that has been handed out as
 * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
 * converted to a realtime clock value on an as-needed basis.
 *
 * Maintaining mg_floor ensures the multigrain interfaces never issue a
 * timestamp earlier than one that has been previously issued.
 *
 * The exception to this rule is when there is a backward realtime clock jump. If
 * such an event occurs, a timestamp can appear to be earlier than a previous one.
 *
 * The value is stored in an atomic64_t which is cacheline aligned on SMP.
 */
static __cacheline_aligned_in_smp atomic64_t mg_floor;

static inline void tk_normalize_xtime(struct timekeeper *tk)
{
        while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
                tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
                tk->xtime_sec++;
        }
        while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
                tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
                tk->raw_sec++;
        }
}

static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
{
        struct timespec64 ts;

        ts.tv_sec = tk->xtime_sec;
        ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        return ts;
}

static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
{
        struct timespec64 ts;

        ts.tv_sec = tk->xtime_sec;
        ts.tv_nsec = tk->coarse_nsec;
        return ts;
}

/*
 * Update the nanoseconds part for the coarse time keepers. They can't rely
 * on xtime_nsec because xtime_nsec could be adjusted by a small negative
 * amount when the multiplication factor of the clock is adjusted, which
 * could cause the coarse clocks to go slightly backwards. See
 * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
 * clockids which is only updated when the clock has been set or we have
 * accumulated time.
 */
static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
{
        tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
}

static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec = ts->tv_sec;
        tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
        tk_update_coarse_nsecs(tk);
}

static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec += ts->tv_sec;
        tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
        tk_normalize_xtime(tk);
        tk_update_coarse_nsecs(tk);
}

static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
        struct timespec64 tmp;

        /*
         * Verify consistency of: offset_real = -wall_to_monotonic
         * before modifying anything
         */
        set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
                                        -tk->wall_to_monotonic.tv_nsec);
        WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
        tk->wall_to_monotonic = wtm;
        set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
        /* Paired with READ_ONCE() in ktime_mono_to_any() */
        WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp));
        WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)));
}

/*
 * Advance the boot time offset of @tk by @delta and keep the timespec64
 * representation of it in sync.
 */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
        ktime_t boot_offs = ktime_add(tk->offs_boot, delta);

        /* Paired with READ_ONCE() in ktime_mono_to_any() */
        WRITE_ONCE(tk->offs_boot, boot_offs);

        /*
         * The timespec representation is cached so that the VDSO update
         * does not need a 64bit division every time.
         */
        tk->monotonic_to_boot = ktime_to_timespec64(boot_offs);
}

/*
 * tk_clock_read - atomic clocksource read() helper
 *
 * This helper is necessary to use in the read paths because, while the
 * seqcount ensures we don't return a bad value while structures are updated,
 * it doesn't protect from potential crashes. There is the possibility that
 * the tkr's clocksource may change between the read reference, and the
 * clock reference passed to the read function.  This can cause crashes if
 * the wrong clocksource is passed to the wrong read function.
 * This isn't necessary to use when holding the tk_core.lock or doing
 * a read of the fast-timekeeper tkrs (which is protected by its own locking
 * and update logic).
 */
static inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
        /*
         * Load the pointer exactly once, so the read function and the
         * clocksource handed to it always belong together.
         */
        struct clocksource *cs = READ_ONCE(tkr->clock);

        return cs->read(cs);
}

/**
 * tk_setup_internals - Set up internals to use clocksource clock.
 *
 * @tk:                The target timekeeper to setup.
 * @clock:                Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
        u64 interval;
        u64 tmp, ntpinterval;
        struct clocksource *old_clock;

        /* Bump the change counter which ktime_get_snapshot() hands out */
        ++tk->cs_was_changed_seq;
        old_clock = tk->tkr_mono.clock;
        tk->tkr_mono.clock = clock;
        tk->tkr_mono.mask = clock->mask;
        /* tkr_mono.clock already points to @clock, so this reads the new clock */
        tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

        tk->tkr_raw.clock = clock;
        tk->tkr_raw.mask = clock->mask;
        /* Mono and raw start from the very same cycle value */
        tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;

        /* Do the ns -> cycle conversion first, using original mult */
        tmp = NTP_INTERVAL_LENGTH;
        tmp <<= clock->shift;
        ntpinterval = tmp;
        /* Adding mult/2 makes the division below round to nearest */
        tmp += clock->mult/2;
        do_div(tmp, clock->mult);
        /* Never end up with a cycle interval of zero */
        if (tmp == 0)
                tmp = 1;

        interval = (u64) tmp;
        tk->cycle_interval = interval;

        /* Go back from cycles -> shifted ns */
        tk->xtime_interval = interval * clock->mult;
        /* Difference between the requested and the representable interval */
        tk->xtime_remainder = ntpinterval - tk->xtime_interval;
        tk->raw_interval = interval * clock->mult;

        /* If changing clocks, convert xtime_nsec shift units */
        if (old_clock) {
                int shift_change = clock->shift - old_clock->shift;
                if (shift_change < 0) {
                        tk->tkr_mono.xtime_nsec >>= -shift_change;
                        tk->tkr_raw.xtime_nsec >>= -shift_change;
                } else {
                        tk->tkr_mono.xtime_nsec <<= shift_change;
                        tk->tkr_raw.xtime_nsec <<= shift_change;
                }
        }

        tk->tkr_mono.shift = clock->shift;
        tk->tkr_raw.shift = clock->shift;

        /* Restart the NTP error accounting for the new clocksource */
        tk->ntp_error = 0;
        tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
        tk->ntp_tick = ntpinterval << tk->ntp_error_shift;

        /*
         * The timekeeper keeps its own mult values for the currently
         * active clocksource. These values will be adjusted via NTP
         * to counteract clock drifting.
         */
        tk->tkr_mono.mult = clock->mult;
        tk->tkr_raw.mult = clock->mult;
        tk->ntp_err_mult = 0;
        tk->skip_second_overflow = 0;
}

/* Timekeeper helper functions. */
/*
 * Out of line conversion of @delta cycles to nanoseconds, used by
 * timekeeping_cycles_to_ns() when @delta is above the max_cycles of the
 * clocksource. The computation is delegated to mul_u64_u32_add_u64_shr()
 * with tkr->mult, tkr->xtime_nsec and tkr->shift.
 */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
        return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}

/*
 * Convert the clocksource readout @cycles into nanoseconds on top of the
 * already accumulated tkr->xtime_nsec of @tkr.
 */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
        const u64 mask = tkr->mask;
        /* Cycles which elapsed since the last update_wall_time() */
        const u64 delta = (cycles - tkr->cycle_last) & mask;

        /*
         * A delta above max_cycles means either negative motion or a
         * value which would overflow the multiplication with tkr->mult.
         */
        if (unlikely(delta > tkr->clock->max_cycles)) {
                /*
                 * Only the overflow case gets converted. If the MSB of
                 * the mask is set in the delta, the clocksource was
                 * inconsistent between CPUs. Report no progress then, so
                 * time does not go backwards.
                 */
                if (!(delta & ~(mask >> 1)))
                        return delta_to_ns_safe(tkr, delta);

                return tkr->xtime_nsec >> tkr->shift;
        }

        return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}

/* Read the clocksource of @tkr and convert the readout to nanoseconds */
static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
        u64 cycles = tk_clock_read(tkr);

        return timekeeping_cycles_to_ns(tkr, cycles);
}

/**
 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
 * @tkr: Timekeeping readout base from which we take the update
 * @tkf: Pointer to NMI safe timekeeper
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
 *
 * Employ the latch technique; see @write_seqcount_latch.
 *
 * So if a NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result in a
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
 */
static void update_fast_timekeeper(const struct tk_read_base *tkr,
                                   struct tk_fast *tkf)
{
        struct tk_read_base *base = tkf->base;

        /* Force readers off to base[1] */
        write_seqcount_latch_begin(&tkf->seq);

        /* Update base[0] */
        memcpy(base, tkr, sizeof(*base));

        /* Force readers back to base[0] */
        write_seqcount_latch(&tkf->seq);

        /* Update base[1] from the freshly written base[0] */
        memcpy(base + 1, base, sizeof(*base));

        /* Counterpart of the write_seqcount_latch_begin() above */
        write_seqcount_latch_end(&tkf->seq);
}

/*
 * Lockless readout of the fast timekeeper @tkf: the base of the latch
 * copy selected by the lowest bit of the sequence count plus the
 * nanoseconds since the last update. Retried when the sequence count
 * changed during the readout.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
        unsigned int seq;
        u64 ns;

        do {
                struct tk_read_base *tkr;

                seq = read_seqcount_latch(&tkf->seq);
                tkr = &tkf->base[seq & 0x01];
                ns = ktime_to_ns(tkr->base);
                ns += timekeeping_get_ns(tkr);
        } while (read_seqcount_latch_retry(&tkf->seq, seq));

        return ns;
}

/**
 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
 *
 * This timestamp is not guaranteed to be monotonic across an update.
 * The timestamp is calculated by:
 *
 *        now = base_mono + clock_delta * slope
 *
 * So if the update lowers the slope, readers who are forced to the
 * not yet updated second array are still using the old steeper slope.
 *
 * tmono
 * ^
 * |    o  n
 * |   o n
 * |  u
 * | o
 * |o
 * |12345678---> reader order
 *
 * o = old slope
 * u = update
 * n = new slope
 *
 * So reader 6 will observe time going backwards versus reader 5.
 *
 * While other CPUs are likely to be able to observe that, the only way
 * for a CPU local observation is when an NMI hits in the middle of
 * the update. Timestamps taken from that NMI context might be ahead
 * of the following timestamps. Callers need to be aware of that and
 * deal with it.
 */
u64 notrace ktime_get_mono_fast_ns(void)
{
        struct tk_fast *tkf = &tk_fast_mono;

        return __ktime_get_fast_ns(tkf);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);

/**
 * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
 *
 * Contrary to ktime_get_mono_fast_ns() this is always correct because the
 * conversion factor is not affected by NTP/PTP correction.
 */
u64 notrace ktime_get_raw_fast_ns(void)
{
        struct tk_fast *tkf = &tk_fast_raw;

        return __ktime_get_fast_ns(tkf);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);

/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It's possible that a timestamp is taken after the boot offset is updated
 * but before the timekeeper is updated. If this happens, the new boot offset
 * is added to the old timekeeping making the clock appear to update slightly
 * earlier:
 *    CPU 0                                        CPU 1
 *    timekeeping_inject_sleeptime64()
 *    __timekeeping_inject_sleeptime(tk, delta);
 *                                                 timestamp();
 *    timekeeping_update_staged(tkd, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 * partially updated.  Since the tk->offs_boot update is a rare event, this
 * should be a rare occurrence which postprocessing should be able to handle.
 *
 * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
 * apply as well.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 mono_ns = ktime_get_mono_fast_ns();

        /* The boot offset is deliberately read without serialization */
        return mono_ns + ktime_to_ns(data_race(tk->offs_boot));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);

/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The same limitations as described for ktime_get_boot_fast_ns() apply. The
 * mono time and the TAI offset are not read atomically which may yield wrong
 * readouts. However, an update of the TAI offset is a rare event e.g., caused
 * by settime or adjtimex with an offset. The user of this function has to deal
 * with the possibility of wrong timestamps in post processing.
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 mono_ns = ktime_get_mono_fast_ns();

        /* The TAI offset is deliberately read without serialization */
        return mono_ns + ktime_to_ns(data_race(tk->offs_tai));
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);

/**
 * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime.
 *
 * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
 */
u64 ktime_get_real_fast_ns(void)
{
        struct tk_fast *tkf = &tk_fast_mono;
        u64 real_base, ns;
        unsigned int seq;

        do {
                struct tk_read_base *tkr;

                seq = raw_read_seqcount_latch(&tkf->seq);
                /* The lowest bit of the sequence count selects the copy */
                tkr = &tkf->base[seq & 0x01];
                real_base = ktime_to_ns(tkr->base_real);
                ns = timekeeping_get_ns(tkr);
        } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

        return real_base + ns;
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);

/**
 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
 * @tk: Timekeeper to snapshot.
 *
 * It generally is unsafe to access the clocksource after timekeeping has been
 * suspended, so take a snapshot of the readout base of @tk and use it as the
 * fast timekeeper's readout base while suspended.  It will return the same
 * number of cycles every time until timekeeping is resumed at which time the
 * proper readout base for the fast timekeeper will be restored automatically.
 */
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
        /* Static scratch copy, reused for the mono and the raw readout base */
        static struct tk_read_base tkr_dummy;
        const struct tk_read_base *tkr = &tk->tkr_mono;

        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        /* This is the value which dummy_clock_read() returns while suspended */
        cycles_at_suspend = tk_clock_read(tkr);
        /* Point the copy at the dummy clock instead of the real clocksource */
        tkr_dummy.clock = &dummy_clock;
        tkr_dummy.base_real = tkr->base + tk->offs_real;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

        /* Same for the raw readout base. No base_real update is done here. */
        tkr = &tk->tkr_raw;
        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        tkr_dummy.clock = &dummy_clock;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}

/* Notifier chain of pvclock timedata update listeners */
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);

/*
 * Invoke the pvclock_gtod notifier chain. @was_set is passed as the
 * notifier action value and @tk as the notifier data.
 */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
        raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}

/**
 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
 * @nb: Pointer to the notifier block to register
 */
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
        int err;

        guard(raw_spinlock_irqsave)(&tk_core.lock);

        err = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
        /* Run the chain once with the current state of the core timekeeper */
        update_pvclock_gtod(&tk_core.timekeeper, true);

        return err;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);

/**
 * pvclock_gtod_unregister_notifier - unregister a pvclock
 * timedata update listener
 * @nb: Pointer to the notifier block to unregister
 */
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
        int err;

        guard(raw_spinlock_irqsave)(&tk_core.lock);
        err = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);

        return err;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);

/*
 * tk_update_leap_state - helper to update the next_leap_ktime
 */
static inline void tk_update_leap_state(struct timekeeper *tk)
{
        tk->next_leap_ktime = ntp_get_next_leap(tk->id);
        if (tk->next_leap_ktime != KTIME_MAX)
                /* Convert to monotonic time */
                tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}

/*
 * Leap state update for both shadow and the real timekeeper
 * Separate to spare a full memcpy() of the timekeeper.
 */
static void tk_update_leap_state_all(struct tk_data *tkd)
{
        write_seqcount_begin(&tkd->seq);
        tk_update_leap_state(&tkd->shadow_timekeeper);
        tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime;
        write_seqcount_end(&tkd->seq);
}

/*
 * Update the ktime_t based scalar nsec members of the timekeeper
 */
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
        u64 seconds;
        u32 nsec;

        /*
         * The xtime based monotonic readout is:
         *        nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
         * The ktime based monotonic readout is:
         *        nsec = base_mono + now();
         * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
         */
        seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
        nsec = (u32) tk->wall_to_monotonic.tv_nsec;
        tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);

        /*
         * The sum of the nanoseconds portions of xtime and
         * wall_to_monotonic can be greater/equal one second. Take
         * this into account before updating tk->ktime_sec.
         */
        nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        if (nsec >= NSEC_PER_SEC)
                seconds++;
        tk->ktime_sec = seconds;

        /* Update the monotonic raw base */
        tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}

/*
 * Restore the shadow timekeeper from the real timekeeper.
 */
static void timekeeping_restore_shadow(struct tk_data *tkd)
{
        lockdep_assert_held(&tkd->lock);
        memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper));
}

/*
 * Finalize the state staged in the shadow timekeeper of @tkd, propagate it
 * to the dependent infrastructure and copy it over the real timekeeper.
 * @action is a mask of TK_* flags. TK_CLEAR_NTP and TK_CLOCK_WAS_SET are
 * evaluated here.
 */
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
        struct timekeeper *tk = &tkd->shadow_timekeeper;

        lockdep_assert_held(&tkd->lock);

        /*
         * Block out readers before running the updates below because that
         * updates VDSO and other time related infrastructure. Not blocking
         * the readers might let a reader see time going backwards when
         * reading from the VDSO after the VDSO update and then reading in
         * the kernel from the timekeeper before that got updated.
         */
        write_seqcount_begin(&tkd->seq);

        if (action & TK_CLEAR_NTP) {
                tk->ntp_error = 0;
                ntp_clear(tk->id);
        }

        /* Recompute the derived members of the shadow timekeeper */
        tk_update_leap_state(tk);
        tk_update_ktime_data(tk);
        tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;

        if (tk->id == TIMEKEEPER_CORE) {
                /* Only the core timekeeper feeds vsyscall, pvclock and the fast timekeepers */
                update_vsyscall(tk);
                update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);

                update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
                update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
        } else if (tk_is_aux(tk)) {
                vdso_time_update_aux(tk);
        }

        /* Sequence number which ktime_get_snapshot() hands out */
        if (action & TK_CLOCK_WAS_SET)
                tk->clock_was_set_seq++;

        /*
         * Update the real timekeeper.
         *
         * We could avoid this memcpy() by switching pointers, but that has
         * the downside that the reader side no longer benefits from
         * the cacheline optimized data layout of the timekeeper and requires
         * another indirection.
         */
        memcpy(&tkd->timekeeper, tk, sizeof(*tk));
        write_seqcount_end(&tkd->seq);
}

/**
 * timekeeping_forward_now - update clock to the current time
 * @tk:                Pointer to the timekeeper to update
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
        u64 cycle_now, delta;

        cycle_now = tk_clock_read(&tk->tkr_mono);
        delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
                                  tk->tkr_mono.clock->max_raw_delta);
        tk->tkr_mono.cycle_last = cycle_now;
        tk->tkr_raw.cycle_last  = cycle_now;

        while (delta > 0) {
                u64 max = tk->tkr_mono.clock->max_cycles;
                u64 incr = delta < max ? delta : max;

                tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult;
                tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult;
                tk_normalize_xtime(tk);
                delta -= incr;
        }
        tk_update_coarse_nsecs(tk);
}

/**
 * ktime_get_real_ts64 - Returns the time of day in a timespec64.
 * @ts:                pointer to the timespec to be set
 *
 * Returns the time of day in a timespec64 (WARN if suspended).
 */
void ktime_get_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        u64 secs, ns;

        WARN_ON(timekeeping_suspended);

        /* Take a consistent snapshot of the seconds and the nanoseconds */
        do {
                seq = read_seqcount_begin(&tk_core.seq);

                secs = tk->xtime_sec;
                ns = timekeeping_get_ns(&tk->tkr_mono);
        } while (read_seqcount_retry(&tk_core.seq, seq));

        ts->tv_sec = secs;
        ts->tv_nsec = 0;
        /* The nanoseconds can exceed a second; let the helper normalize */
        timespec64_add_ns(ts, ns);
}
EXPORT_SYMBOL(ktime_get_real_ts64);

ktime_t ktime_get(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base;
        u64 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                base = tk->tkr_mono.base;
                nsecs = timekeeping_get_ns(&tk->tkr_mono);

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get);

u32 ktime_get_resolution_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        u32 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        return nsecs;
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);

/*
 * Maps enum tk_offsets to the corresponding offset member of the core
 * timekeeper. The users index this table with the caller supplied value
 * without a range check.
 */
static ktime_t *offsets[TK_OFFS_MAX] = {
        [TK_OFFS_REAL]        = &tk_core.timekeeper.offs_real,
        [TK_OFFS_BOOT]        = &tk_core.timekeeper.offs_boot,
        [TK_OFFS_TAI]        = &tk_core.timekeeper.offs_tai,
};

ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base, *offset = offsets[offs];
        u64 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                base = ktime_add(tk->tkr_mono.base, *offset);
                nsecs = timekeeping_get_ns(&tk->tkr_mono);

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(base, nsecs);

}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);

/**
 * ktime_get_coarse_with_offset - Coarse variant of ktime_get_with_offset()
 * @offs:        which offset to use
 *
 * Uses the coarse nanoseconds of the core timekeeper instead of reading
 * the clocksource (WARN if suspended).
 */
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t *offset = offsets[offs];
        ktime_t shifted_base;
        unsigned int seq;
        u64 coarse_ns;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                shifted_base = ktime_add(tk->tkr_mono.base, *offset);
                coarse_ns = tk->coarse_nsec;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(shifted_base, coarse_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);

/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:        time to convert.
 * @offs:        which offset to use
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
        ktime_t *offset = offsets[offs];
        unsigned int seq;
        ktime_t tconv;

        if (!IS_ENABLED(CONFIG_64BIT)) {
                /* Use the sequence count to get a consistent 64bit offset */
                do {
                        seq = read_seqcount_begin(&tk_core.seq);
                        tconv = ktime_add(tmono, *offset);
                } while (read_seqcount_retry(&tk_core.seq, seq));

                return tconv;
        }

        /*
         * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and
         * tk_update_sleep_time().
         */
        return ktime_add(tmono, READ_ONCE(*offset));
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);

/**
 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
 */
ktime_t ktime_get_raw(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base;
        u64 nsecs;

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                base = tk->tkr_raw.base;
                nsecs = timekeeping_get_ns(&tk->tkr_raw);

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_raw);

/**
 * ktime_get_ts64 - get the monotonic clock in timespec64 format
 * @ts:                pointer to timespec variable
 *
 * The function calculates the monotonic clock from the realtime
 * clock and the wall_to_monotonic offset and stores the result
 * in normalized timespec64 format in the variable pointed to by @ts.
 */
void ktime_get_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 tomono;
        unsigned int seq;
        u64 nsec;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->xtime_sec;
                nsec = timekeeping_get_ns(&tk->tkr_mono);
                tomono = tk->wall_to_monotonic;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        ts->tv_sec += tomono.tv_sec;
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);

/**
 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
 *
 * Returns the seconds portion of CLOCK_MONOTONIC with a single non
 * serialized read. tk->ktime_sec is of type 'unsigned long' so this
 * works on both 32 and 64 bit systems. On 32 bit systems the readout
 * covers ~136 years of uptime which should be enough to prevent
 * premature wrap arounds.
 */
time64_t ktime_get_seconds(void)
{
        WARN_ON(timekeeping_suspended);

        /* Single non serialized read of the core timekeeper */
        return tk_core.timekeeper.ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);

/**
 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
 *
 * Returns the wall clock seconds since 1970.
 *
 * For 64bit systems the fast access to tk->xtime_sec is preserved. On
 * 32bit systems the access must be protected with the sequence
 * counter to provide "atomic" access to the 64bit tk->xtime_sec
 * value.
 */
time64_t ktime_get_real_seconds(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        time64_t seconds;
        unsigned int seq;

        if (!IS_ENABLED(CONFIG_64BIT)) {
                /* 32bit: the 64bit value needs the sequence count */
                do {
                        seq = read_seqcount_begin(&tk_core.seq);
                        seconds = tk->xtime_sec;
                } while (read_seqcount_retry(&tk_core.seq, seq));

                return seconds;
        }

        /* 64bit: plain read without the sequence count */
        return tk->xtime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);

/**
 * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds
 *
 * The same as ktime_get_real_seconds() but without the sequence counter
 * protection. This function is used in restricted contexts like the x86 MCE
 * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half
 * completed modification and only to be used for such critical contexts.
 *
 * Returns: Racy snapshot of the CLOCK_REALTIME seconds value
 */
noinstr time64_t __ktime_get_real_seconds(void)
{
        /* Intentionally neither locked nor sequence count protected */
        return tk_core.timekeeper.xtime_sec;
}

/**
 * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
 * @systime_snapshot:        pointer to struct receiving the system time snapshot
 */
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t mono_base, raw_base, offs_real, offs_boot;
        u64 ns_mono, ns_raw, cycles;
        unsigned int seq;

        WARN_ON_ONCE(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                /* One counter readout serves all three clocks */
                cycles = tk_clock_read(&tk->tkr_mono);
                systime_snapshot->cs_id = tk->tkr_mono.clock->id;
                systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
                systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
                mono_base = tk->tkr_mono.base;
                offs_real = tk->offs_real;
                offs_boot = tk->offs_boot;
                raw_base = tk->tkr_raw.base;
                ns_mono = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
                ns_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
        } while (read_seqcount_retry(&tk_core.seq, seq));

        systime_snapshot->cycles = cycles;
        /* Realtime and boottime share the monotonic nanoseconds */
        systime_snapshot->real = ktime_add_ns(ktime_add(mono_base, offs_real), ns_mono);
        systime_snapshot->boot = ktime_add_ns(ktime_add(mono_base, offs_boot), ns_mono);
        systime_snapshot->raw = ktime_add_ns(raw_base, ns_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);

/*
 * Scale *base by mult/div checking for overflow.
 *
 * The quotient and the remainder of *base / div are multiplied with @mult
 * separately. Returns 0 on success. Returns -EOVERFLOW and leaves *base
 * untouched when one of these multiplications might not fit into 64 bit.
 */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
        /* Number of bits which are left in a u64 besides those of @mult */
        const int spare_bits = (int)sizeof(u64)*8 - fls64(mult);
        u64 quot, rem;

        quot = div64_u64_rem(*base, div, &rem);

        if (spare_bits < fls64(quot) || spare_bits < fls64(rem))
                return -EOVERFLOW;

        *base = quot * mult + div64_u64(rem * mult, div);
        return 0;
}

/**
 * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
 * @history:                        Snapshot representing start of history
 * @partial_history_cycles:        Cycle offset into history (fractional part)
 * @total_history_cycles:        Total history length in cycles
 * @discontinuity:                True indicates clock was set on history period
 * @ts:                                Cross timestamp that should be adjusted using
 *        partial/total ratio
 *
 * Helper for get_device_system_crosststamp(). On entry @ts holds the
 * crosstimestamp of the start of the current interval; it is corrected to
 * the system counter value (timestamp point) supplied by the driver. The
 * total_history_* quantities span from the provided reference point to the
 * start of the current interval, and partial_history_cycles is the cycle
 * count between the driver timestamp point and the start of the current
 * interval.
 *
 * Return: 0 on success (including the case where nothing has to be
 * adjusted), otherwise the error returned by scale64_check_overflow().
 */
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
                                         u64 partial_history_cycles,
                                         u64 total_history_cycles,
                                         bool discontinuity,
                                         struct system_device_crosststamp *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 corr_raw, corr_real;
        bool interp_forward;
        int ret;

        /* Nothing to interpolate */
        if (!total_history_cycles || !partial_history_cycles)
                return 0;

        /*
         * Interpolate over the shorter distance: forward from the beginning
         * of the history when the point is in its first half, backward from
         * the end otherwise.
         */
        interp_forward = partial_history_cycles > total_history_cycles / 2;
        if (interp_forward)
                partial_history_cycles = total_history_cycles - partial_history_cycles;

        /*
         * Monotonic raw correction: the raw time delta over the history
         * scaled by partial_history_cycles / total_history_cycles
         */
        corr_raw = (u64)ktime_to_ns(ktime_sub(ts->sys_monoraw, history->raw));
        ret = scale64_check_overflow(partial_history_cycles,
                                     total_history_cycles, &corr_raw);
        if (ret)
                return ret;

        /*
         * Realtime correction: without a discontinuity it is computed the
         * same way as the raw one. With a discontinuity the raw correction
         * is scaled by mult(real)/mult(raw) instead.
         */
        if (!discontinuity) {
                corr_real = (u64)ktime_to_ns(ktime_sub(ts->sys_realtime, history->real));
                ret = scale64_check_overflow(partial_history_cycles,
                                             total_history_cycles, &corr_real);
                if (ret)
                        return ret;
        } else {
                corr_real = mul_u64_u32_div(corr_raw, tk->tkr_mono.mult,
                                            tk->tkr_raw.mult);
        }

        /* Apply the corrections to the raw and realtime values */
        if (!interp_forward) {
                ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
                ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
        } else {
                ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
                ts->sys_realtime = ktime_add_ns(history->real, corr_real);
        }

        return 0;
}

/*
 * timestamp_in_interval - true if ts is chronologically in [start, end]
 *
 * For start <= end, ts must lie at or after start and at or before end.
 * For start > end the interval is treated as wrapped around, so being on
 * either side of the wrap (at/after start or at/before end) is sufficient.
 */
static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
        bool at_or_after_start = ts >= start;
        bool at_or_before_end = ts <= end;

        if (start > end)
                return at_or_after_start || at_or_before_end;

        return at_or_after_start && at_or_before_end;
}

/*
 * Scale *val by numerator/denominator in place.
 *
 * The quotient and the remainder of *val / denominator are scaled
 * separately and summed up. Returns false, leaving *val untouched, when
 * either factor is zero; true otherwise.
 */
static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
{
        u64 quot, rem;

        if (numerator == 0 || denominator == 0)
                return false;

        quot = div64_u64_rem(*val, denominator, &rem);
        *val = quot * numerator + div_u64(rem * numerator, denominator);
        return true;
}

/*
 * Convert the counter value in @scv into the timekeeper clocksource domain.
 *
 * Returns true when @scv already carries the id of the timekeeper
 * clocksource (value left as is) or when it carries the id of that
 * clocksource's base clock and the conversion succeeded. Returns false
 * otherwise.
 */
static bool convert_base_to_cs(struct system_counterval_t *scv)
{
        struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
        struct clocksource_base *base;
        u32 num, den;

        /* The timestamp was taken from the time keeper clock source */
        if (scv->cs_id == cs->id)
                return true;

        /*
         * Check whether cs_id matches the base clock. READ_ONCE() keeps the
         * compiler from re-evaluating @base as the clocksource might change
         * concurrently.
         */
        base = READ_ONCE(cs->base);
        if (!base || scv->cs_id != base->id)
                return false;

        /* Pick the scaling factors depending on the unit of scv->cycles */
        if (scv->use_nsecs) {
                num = cs->freq_khz;
                den = USEC_PER_SEC;
        } else {
                num = base->numerator;
                den = base->denominator;
        }

        if (!convert_clock(&scv->cycles, num, den))
                return false;

        scv->cycles += base->offset;
        return true;
}

/*
 * Convert *cycles from the timekeeper clocksource domain to its base clock.
 *
 * Returns false when the timekeeper clocksource has no base clock, when the
 * base clock's id differs from @base_id, or when convert_clock() fails.
 * Note that base->offset has already been subtracted from *cycles when
 * convert_clock() is invoked.
 */
static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id)
{
        struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
        struct clocksource_base *base;

        /*
         * Check whether base_id matches the base clock. READ_ONCE() keeps the
         * compiler from re-evaluating @base as the clocksource might change
         * concurrently.
         */
        base = READ_ONCE(cs->base);
        if (!base || base_id != base->id)
                return false;

        *cycles -= base->offset;

        /* Inverse ratio of convert_base_to_cs(): denominator / numerator */
        return convert_clock(cycles, base->denominator, base->numerator);
}

/*
 * Convert a nanosecond delta into clocksource cycles using the shift, mult
 * and xtime_nsec values of tk_core.timekeeper.tkr_mono.
 *
 * Returns false, leaving *delta untouched, when the number of bits needed
 * for the shifted delta, rounded up to bytes, reaches sizeof(*delta).
 */
static bool convert_ns_to_cs(u64 *delta)
{
        struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
        u64 shifted;

        if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta))
                return false;

        shifted = *delta << tkr->shift;
        *delta = div_u64(shifted - tkr->xtime_nsec, tkr->mult);
        return true;
}

/**
 * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp
 * @treal:        CLOCK_REALTIME timestamp to convert
 * @base_id:        base clocksource id
 * @cycles:        pointer to store the converted base clock timestamp
 *
 * Converts a supplied, future realtime clock value to the corresponding base clock value.
 *
 * The conversion fails when @treal lies before the timekeeper's current
 * tkr_mono.base_real, when the nanosecond delta cannot be converted to
 * clocksource cycles, or when the timekeeper clocksource has no base clock
 * matching @base_id. Note that *@cycles may have been written even when
 * false is returned.
 *
 * Return:  true if the conversion is successful, false otherwise.
 */
bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        u64 delta;

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                /* Only timestamps at or after base_real can be converted */
                if ((u64)treal < tk->tkr_mono.base_real)
                        return false;
                /* Nanoseconds since base_real -> clocksource cycles */
                delta = (u64)treal - tk->tkr_mono.base_real;
                if (!convert_ns_to_cs(&delta))
                        return false;
                /* Absolute clocksource value, then into the base clock domain */
                *cycles = tk->tkr_mono.cycle_last + delta;
                if (!convert_cs_to_base(cycles, base_id))
                        return false;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        return true;
}
EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);

/**
 * get_device_system_crosststamp - Synchronously capture system/device timestamp
 * @get_time_fn:        Callback to get simultaneous device time and
 *        system counter from the device driver
 * @ctx:                Context passed to get_time_fn()
 * @history_begin:        Historical reference point used to interpolate system
 *        time when counter provided by the driver is before the current interval
 * @xtstamp:                Receives simultaneously captured system and device time
 *
 * Reads a timestamp from a device and correlates it to system time
 *
 * Note that @get_time_fn is invoked inside the sequence count read section
 * and is therefore called again whenever that section has to be retried.
 *
 * Return: 0 on success. A non-zero value returned by @get_time_fn is passed
 * through unchanged. -ENODEV if the captured counter value carries
 * CSID_GENERIC or cannot be converted to the timekeeper clocksource.
 * -EINVAL if interpolation is required but @history_begin is NULL, the
 * counter value lies outside of the history, or the clocksource changed
 * since @history_begin was taken. Otherwise the error returned by
 * adjust_historical_crosststamp().
 */
int get_device_system_crosststamp(int (*get_time_fn)
                                  (ktime_t *device_time,
                                   struct system_counterval_t *sys_counterval,
                                   void *ctx),
                                  void *ctx,
                                  struct system_time_snapshot *history_begin,
                                  struct system_device_crosststamp *xtstamp)
{
        struct system_counterval_t system_counterval = {};
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 cycles, now, interval_start;
        unsigned int clock_was_set_seq = 0;
        ktime_t base_real, base_raw;
        u64 nsec_real, nsec_raw;
        u8 cs_was_changed_seq;
        unsigned int seq;
        bool do_interp;
        int ret;

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                /*
                 * Try to synchronously capture device time and a system
                 * counter value calling back into the device driver
                 */
                ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
                if (ret)
                        return ret;

                /*
                 * Verify that the clocksource ID associated with the captured
                 * system counter value is the same as for the currently
                 * installed timekeeper clocksource, or convert the value when
                 * it was taken from that clocksource's base clock
                 */
                if (system_counterval.cs_id == CSID_GENERIC ||
                    !convert_base_to_cs(&system_counterval))
                        return -ENODEV;
                cycles = system_counterval.cycles;

                /*
                 * Check whether the system counter value provided by the
                 * device driver is on the current timekeeping interval.
                 */
                now = tk_clock_read(&tk->tkr_mono);
                interval_start = tk->tkr_mono.cycle_last;
                if (!timestamp_in_interval(interval_start, now, cycles)) {
                        /*
                         * Outside of the interval: compute the timestamps for
                         * the interval start and interpolate afterwards. The
                         * sequence counters are only captured on this path.
                         */
                        clock_was_set_seq = tk->clock_was_set_seq;
                        cs_was_changed_seq = tk->cs_was_changed_seq;
                        cycles = interval_start;
                        do_interp = true;
                } else {
                        do_interp = false;
                }

                base_real = ktime_add(tk->tkr_mono.base,
                                      tk_core.timekeeper.offs_real);
                base_raw = tk->tkr_raw.base;

                nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
                nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
        } while (read_seqcount_retry(&tk_core.seq, seq));

        xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
        xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);

        /*
         * Interpolate if necessary, adjusting back from the start of the
         * current interval
         */
        if (do_interp) {
                u64 partial_history_cycles, total_history_cycles;
                bool discontinuity;

                /*
                 * Check that the counter value is not before the provided
                 * history reference and that the history doesn't cross a
                 * clocksource change
                 */
                if (!history_begin ||
                    !timestamp_in_interval(history_begin->cycles,
                                           cycles, system_counterval.cycles) ||
                    history_begin->cs_was_changed_seq != cs_was_changed_seq)
                        return -EINVAL;
                /* Here @cycles is the start of the current interval */
                partial_history_cycles = cycles - system_counterval.cycles;
                total_history_cycles = cycles - history_begin->cycles;
                /* The clock was set at some point after @history_begin was taken */
                discontinuity =
                        history_begin->clock_was_set_seq != clock_was_set_seq;

                ret = adjust_historical_crosststamp(history_begin,
                                                    partial_history_cycles,
                                                    total_history_cycles,
                                                    discontinuity, xtstamp);
                if (ret)
                        return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);

/**
 * timekeeping_clocksource_has_base - Check whether the current clocksource
 *                                      is based on a given base clock
 * @id:                base clocksource ID
 *
 * Note:        The return value is a snapshot which can become invalid right
 *                after the function returns.
 *
 * Return:        true if the timekeeper clocksource has a base clock with @id,
 *                false otherwise
 */
bool timekeeping_clocksource_has_base(enum clocksource_ids id)
{
        struct clocksource_base *base;

        /*
         * The result is only a snapshot, so the sequence count is not used.
         * READ_ONCE() just keeps the compiler from re-evaluating @base as
         * the clocksource might change concurrently.
         */
        base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base);

        return base && base->id == id;
}
EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base);

/**
 * do_settimeofday64 - Sets the time of day.
 * @ts:     pointer to the timespec64 variable containing the new time
 *
 * Sets the time of day to the new time and update NTP and notify hrtimers
 *
 * Return: 0 on success, -EINVAL when @ts is rejected by
 * timespec64_valid_settod() or when wall_to_monotonic compares greater than
 * the requested change of the wall time.
 */
int do_settimeofday64(const struct timespec64 *ts)
{
        struct timespec64 ts_delta, xt;

        if (!timespec64_valid_settod(ts))
                return -EINVAL;

        /* All modifications are done on the shadow timekeeper under tk_core.lock */
        scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
                struct timekeeper *tks = &tk_core.shadow_timekeeper;

                timekeeping_forward_now(tks);

                /* Difference between the requested and the current wall time */
                xt = tk_xtime(tks);
                ts_delta = timespec64_sub(*ts, xt);

                if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) {
                        /* Request rejected: restore the shadow timekeeper before bailing out */
                        timekeeping_restore_shadow(&tk_core);
                        return -EINVAL;
                }

                tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta));
                tk_set_xtime(tks, ts);
                timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
        }

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL);

        /* Done after the lock has been dropped */
        audit_tk_injoffset(ts_delta);
        add_device_randomness(ts, sizeof(*ts));
        return 0;
}
EXPORT_SYMBOL(do_settimeofday64);

static inline bool timekeeper_is_core_tk(struct timekeeper *tk)
{
        return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE;
}

/**
 * __timekeeping_inject_offset - Adds or subtracts from the current time.
 * @tkd:        Pointer to the timekeeper to modify
 * @ts:                Pointer to the timespec variable containing the offset
 *
 * Adds or subtracts an offset value from the current time.
 *
 * Operates on the shadow timekeeper of @tkd. The visible caller,
 * timekeeping_inject_offset(), invokes it with the lock of @tkd held.
 *
 * Return: 0 on success, -EINVAL when @ts->tv_nsec is out of range or when
 * the resulting time would be invalid. On failure after the shadow
 * timekeeper has been forwarded, timekeeping_restore_shadow() is called
 * before returning.
 */
static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts)
{
        struct timekeeper *tks = &tkd->shadow_timekeeper;
        struct timespec64 tmp;

        /* Only tv_nsec is range checked here; tv_sec may be negative */
        if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;

        timekeeping_forward_now(tks);

        if (timekeeper_is_core_tk(tks)) {
                /* Make sure the proposed value is valid */
                tmp = timespec64_add(tk_xtime(tks), *ts);
                if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
                    !timespec64_valid_settod(&tmp)) {
                        timekeeping_restore_shadow(tkd);
                        return -EINVAL;
                }

                /* Shift the wall time and compensate in wall_to_monotonic */
                tk_xtime_add(tks, ts);
                tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
        } else {
                /* Non-core timekeeper: only its offs_aux offset is adjusted */
                struct tk_read_base *tkr_mono = &tks->tkr_mono;
                ktime_t now, offs;

                /* Get the current time */
                now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono));
                /* Add the relative offset change */
                offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts));

                /* Prevent that the resulting time becomes negative */
                if (ktime_add(now, offs) < 0) {
                        timekeeping_restore_shadow(tkd);
                        return -EINVAL;
                }
                tk_update_aux_offs(tks, offs);
        }

        timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
        return 0;
}

/*
 * Inject the offset @ts into the core timekeeper under tk_core.lock.
 *
 * Returns the result of __timekeeping_inject_offset(). hrtimers are only
 * notified about the time change when the injection succeeded.
 */
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
        int ret;

        scoped_guard (raw_spinlock_irqsave, &tk_core.lock)
                ret = __timekeeping_inject_offset(&tk_core, ts);

        if (ret)
                return ret;

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL);
        return 0;
}

/*
 * Indicates if there is an offset between the system clock and the hardware
 * clock/persistent clock/rtc.
 *
 * Set to 1 by timekeeping_warp_clock() when sys_tz.tz_minuteswest is
 * non-zero.
 */
int persistent_clock_is_local;

/*
 * Adjust the time obtained from the CMOS to be UTC time instead of
 * local time.
 *
 * This is ugly, but preferable to the alternatives.  Otherwise we
 * would either need to write a program to do it in /etc/rc (and risk
 * confusion if the program gets run more than once; it would also be
 * hard to make the program warp the clock precisely n hours)  or
 * compile in the timezone information into the kernel.  Bad, bad....
 *
 *                                                - TYT, 1992-01-01
 *
 * The best thing to do is to keep the CMOS clock in universal time (UTC)
 * as real UNIX machines always do it. This avoids all headaches about
 * daylight saving times and warping kernel clocks.
 */
void timekeeping_warp_clock(void)
{
        struct timespec64 adjust;

        /* No timezone offset configured: nothing to warp */
        if (sys_tz.tz_minuteswest == 0)
                return;

        persistent_clock_is_local = 1;

        /* tz_minuteswest is in minutes, the injected offset in seconds */
        adjust.tv_sec = sys_tz.tz_minuteswest * 60;
        adjust.tv_nsec = 0;
        timekeeping_inject_offset(&adjust);
}

/*
 * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
 *
 * Stores @tai_offset (seconds) in @tk and recomputes tk->offs_tai as
 * tk->offs_real plus that offset.
 */
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
        ktime_t tai_delta = ktime_set(tai_offset, 0);

        tk->tai_offset = tai_offset;
        tk->offs_tai = ktime_add(tk->offs_real, tai_delta);
}

/*
 * change_clocksource - Swaps clocksources if a new one is available
 * @data:        The new clocksource (struct clocksource *)
 *
 * Accumulates current time interval and initializes new clocksource
 *
 * Invoked via stop_machine() from timekeeping_notify(). Always returns 0,
 * also when the switch was aborted; timekeeping_notify() detects a failed
 * switch by checking which clocksource is installed afterwards.
 */
static int change_clocksource(void *data)
{
        struct clocksource *new = data, *old = NULL;

        /*
         * If the clocksource is in a module, get a module reference.
         * Succeeds for built-in code (owner == NULL) as well. Abort if the
         * reference can't be acquired.
         */
        if (!try_module_get(new->owner))
                return 0;

        /* Abort if the device can't be enabled */
        if (new->enable && new->enable(new) != 0) {
                module_put(new->owner);
                return 0;
        }

        scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
                struct timekeeper *tks = &tk_core.shadow_timekeeper;

                /* Forward the time with the old clocksource before replacing it */
                timekeeping_forward_now(tks);
                old = tks->tkr_mono.clock;
                tk_setup_internals(tks, new);
                timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
        }

        tk_aux_update_clocksource();

        /* The previous clocksource is released after the lock has been dropped */
        if (old) {
                if (old->disable)
                        old->disable(old);
                module_put(old->owner);
        }

        return 0;
}

/**
 * timekeeping_notify - Install a new clock source
 * @clock:                pointer to the clock source
 *
 * This function is called from clocksource.c after a new, better clock
 * source has been registered. The caller holds the clocksource_mutex.
 *
 * Return: 0 if @clock is the timekeeper clocksource on return (including
 * the case where it already was on entry), -1 if the switch did not happen.
 */
int timekeeping_notify(struct clocksource *clock)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /* Already installed, nothing to do */
        if (tk->tkr_mono.clock == clock)
                return 0;

        stop_machine(change_clocksource, clock, NULL);
        tick_clock_notify();

        /* change_clocksource() may have aborted; check what is installed now */
        if (tk->tkr_mono.clock != clock)
                return -1;

        return 0;
}

/**
 * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
 * @ts:                pointer to the timespec64 to be set
 *
 * Returns the raw monotonic time (completely un-modified by ntp)
 */
void ktime_get_raw_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        u64 nsecs;

        /* Read seconds and nanoseconds consistently, retry on concurrent update */
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->raw_sec;
                nsecs = timekeeping_get_ns(&tk->tkr_raw);

        } while (read_seqcount_retry(&tk_core.seq, seq));

        /* Fold the nanoseconds in on top of the seconds value stored above */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(ktime_get_raw_ts64);

/**
 * ktime_get_clock_ts64 - Returns time of a clock in a timespec
 * @id:                POSIX clock ID of the clock to read
 * @ts:                Pointer to the timespec64 to be set
 *
 * The timestamp is invalidated (@ts->tv_sec is set to -1, @ts->tv_nsec to 0)
 * before the clock is read and stays that way when the clock @id is not
 * available. An @id which is not handled at all additionally triggers a
 * one-time warning.
 */
void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
{
        /* Start out with the invalid marker; a successful read overwrites it */
        ts->tv_sec = -1;
        ts->tv_nsec = 0;

        switch (id) {
        case CLOCK_REALTIME:
                ktime_get_real_ts64(ts);
                break;
        case CLOCK_MONOTONIC:
                ktime_get_ts64(ts);
                break;
        case CLOCK_MONOTONIC_RAW:
                ktime_get_raw_ts64(ts);
                break;
        case CLOCK_AUX ... CLOCK_AUX_LAST:
                /* Without CONFIG_POSIX_AUX_CLOCKS the timestamp stays invalid */
                if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
                        ktime_get_aux_ts64(id, ts);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }
}
EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);

/**
 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
 *
 * Return: Non-zero if the timekeeper clocksource has
 * CLOCK_SOURCE_VALID_FOR_HRES set in its flags, 0 otherwise.
 */
int timekeeping_valid_for_hres(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        int ret;

        /* Sample the flag under the sequence count, retry on concurrent update */
        do {
                seq = read_seqcount_begin(&tk_core.seq);

                ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ret;
}

/**
 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
 *
 * Return: The max_idle_ns value of the timekeeper clocksource.
 */
u64 timekeeping_max_deferment(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        u64 ret;

        /* Sample the value under the sequence count, retry on concurrent update */
        do {
                seq = read_seqcount_begin(&tk_core.seq);

                ret = tk->tkr_mono.clock->max_idle_ns;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ret;
}

/**
 * read_persistent_clock64 -  Return time from the persistent clock.
 * @ts: Pointer to the storage for the readout value
 *
 * Weak default for architectures which do not provide their own
 * implementation of reading the battery backed persistent clock.
 * This default stores tv_sec=0 and tv_nsec=0 in @ts.
 *
 *  XXX - Do be sure to remove it once all arches implement it.
 */
void __weak read_persistent_clock64(struct timespec64 *ts)
{
        *ts = (struct timespec64){ 0 };
}

/**
 * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
 *                                        from the boot.
 * @wall_time:          current time as returned by persistent clock
 * @boot_offset:  offset that is defined as wall_time - boot_time
 *
 * Weak default for architectures which do not provide their own
 * implementation.
 *
 * The default reads the persistent clock into @wall_time and derives
 * @boot_offset from the current value of local_clock(). This way
 * architectures that support sched_clock() but don't support a dedicated
 * boot time clock will provide the best estimate of the boot time.
 */
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
                                     struct timespec64 *boot_offset)
{
        u64 boot_ns;

        read_persistent_clock64(wall_time);

        boot_ns = local_clock();
        *boot_offset = ns_to_timespec64(boot_ns);
}

/*
 * Basic setup of a timekeeper data set: initialize the lock and the
 * sequence count associated with it, then store @tk_id and @valid in both
 * the live and the shadow timekeeper.
 */
static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
{
        raw_spin_lock_init(&tkd->lock);
        seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);

        tkd->shadow_timekeeper.id = tk_id;
        tkd->timekeeper.id = tk_id;

        tkd->shadow_timekeeper.clock_valid = valid;
        tkd->timekeeper.clock_valid = valid;
}

/*
 * Flag reflecting whether sleeptime still has to be injected after a
 * suspend.
 *
 * The flag starts off false and is only set to true when a suspend reaches
 * timekeeping_suspend(). timekeeping_resume() sets it back to false when it
 * was able to inject the sleep time itself. Otherwise the flag stays true
 * and is used by the RTC resume code to decide whether sleeptime must be
 * injected; if so, timekeeping_inject_sleeptime64() sets the flag to false.
 *
 * If a suspend fails before reaching timekeeping_suspend() then the flag
 * stays false and prevents erroneous sleeptime injection.
 */
static bool suspend_timing_needed;

/*
 * Flag for if there is a persistent clock on this platform. Set by
 * timekeeping_init() and timekeeping_suspend() when the persistent clock
 * returned a usable value.
 */
static bool persistent_clock_exists;

/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 *
 * Sets up the core timekeeper data, reads the persistent clock to seed the
 * wall time, installs the default clocksource and publishes the initial
 * state from the shadow timekeeper.
 */
void __init timekeeping_init(void)
{
        struct timespec64 wall_time, boot_offset, wall_to_mono;
        struct timekeeper *tks = &tk_core.shadow_timekeeper;
        struct clocksource *clock;

        tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
        tk_aux_setup();

        /*
         * A valid, positive wall time means a persistent clock is present.
         * Zero is the "not supported" readout of the weak default; any other
         * invalid value is reported and replaced by zero.
         */
        read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
        if (timespec64_valid_settod(&wall_time) &&
            timespec64_to_ns(&wall_time) > 0) {
                persistent_clock_exists = true;
        } else if (timespec64_to_ns(&wall_time) != 0) {
                /* Terminate the message with a newline like other kernel messages */
                pr_warn("Persistent clock returned invalid value\n");
                wall_time = (struct timespec64){0};
        }

        /* A boot offset larger than the wall time is not usable */
        if (timespec64_compare(&wall_time, &boot_offset) < 0)
                boot_offset = (struct timespec64){0};

        /*
         * We want set wall_to_mono, so the following is true:
         * wall time + wall_to_mono = boot time
         */
        wall_to_mono = timespec64_sub(boot_offset, wall_time);

        /* Held until the end of the function */
        guard(raw_spinlock_irqsave)(&tk_core.lock);

        ntp_init();

        clock = clocksource_default_clock();
        if (clock->enable)
                clock->enable(clock);
        tk_setup_internals(tks, clock);

        tk_set_xtime(tks, &wall_time);
        tks->raw_sec = 0;

        tk_set_wall_to_mono(tks, wall_to_mono);

        timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
}

/*
 * Persistent clock readout taken when suspend began. Stored by
 * timekeeping_suspend() and compared against the new readout in
 * timekeeping_resume().
 */
static struct timespec64 timekeeping_suspend_time;

/**
 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
 * @tk:                Pointer to the timekeeper to be updated
 * @delta:        Pointer to the delta value in timespec64 format
 *
 * Takes a timespec offset measuring a suspend interval and properly
 * adds the sleep offset to the timekeeping variables.
 */
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
                                           const struct timespec64 *delta)
{
        if (!timespec64_valid_strict(delta)) {
                printk_deferred(KERN_WARNING
                                "__timekeeping_inject_sleeptime: Invalid "
                                "sleep delta value!\n");
                return;
        }
        tk_xtime_add(tk, delta);
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
        tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
        tk_debug_account_sleep_time(delta);
}

#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
/*
 * We have three kinds of time sources to use for sleep time
 * injection, the preference order is:
 * 1) non-stop clocksource
 * 2) persistent clock (ie: RTC accessible when irqs are off)
 * 3) RTC
 *
 * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
 * If system has neither 1) nor 2), 3) will be used finally.
 *
 * If timekeeping has injected sleeptime via either 1) or 2),
 * 3) becomes needless, so in this case we don't need to call
 * rtc_resume(), and this is what timekeeping_rtc_skipresume()
 * means.
 *
 * Returns true when suspend_timing_needed is not set, i.e. when no
 * sleeptime injection is pending.
 */
bool timekeeping_rtc_skipresume(void)
{
        return !suspend_timing_needed;
}

/*
 * Whether 1) can be used is only known when doing
 * timekeeping_resume(), which is invoked after rtc_suspend(),
 * so we can't skip rtc_suspend() just because the system has 1).
 *
 * But if the system has 2), 2) will definitely be used, so in this
 * case we don't need to call rtc_suspend(), and this is what
 * timekeeping_rtc_skipsuspend() means.
 *
 * Returns the current value of persistent_clock_exists.
 */
bool timekeeping_rtc_skipsuspend(void)
{
        return persistent_clock_exists;
}

/**
 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
 * @delta: pointer to a timespec64 delta value
 *
 * This hook is for architectures that cannot support read_persistent_clock64
 * because their RTC/persistent clock is only accessible when irqs are enabled,
 * and also don't have an effective nonstop clocksource.
 *
 * This function should only be called by rtc_resume(), and allows
 * a suspend offset to be injected into the timekeeping values.
 */
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
        scoped_guard(raw_spinlock_irqsave, &tk_core.lock) {
                struct timekeeper *tks = &tk_core.shadow_timekeeper;

                /* The pending sleeptime injection is taken care of here */
                suspend_timing_needed = false;
                timekeeping_forward_now(tks);
                __timekeeping_inject_sleeptime(tks, delta);
                timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
        }

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
}
#endif

/**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
 *
 * Reads the persistent clock, resumes clockevents and clocksources, injects
 * the time spent in suspend when it can be determined here, re-bases the
 * cycle_last values and clears timekeeping_suspended.
 */
void timekeeping_resume(void)
{
        struct timekeeper *tks = &tk_core.shadow_timekeeper;
        struct clocksource *clock = tks->tkr_mono.clock;
        struct timespec64 ts_new, ts_delta;
        bool inject_sleeptime = false;
        u64 cycle_now, nsec;
        unsigned long flags;

        /* Read before the lock is taken; compared to the suspend readout below */
        read_persistent_clock64(&ts_new);

        clockevents_resume();
        clocksource_resume();

        raw_spin_lock_irqsave(&tk_core.lock, flags);

        /*
         * After system resumes, we need to calculate the suspended time and
         * compensate it for the OS time. There are 3 sources that could be
         * used: Nonstop clocksource during suspend, persistent clock and rtc
         * device.
         *
         * One specific platform may have 1 or 2 or all of them, and the
         * preference will be:
         *        suspend-nonstop clocksource -> persistent clock -> rtc
         * The less preferred source will only be tried if there is no better
         * usable source. The rtc part is handled separately in rtc core code.
         */
        cycle_now = tk_clock_read(&tks->tkr_mono);
        nsec = clocksource_stop_suspend_timing(clock, cycle_now);
        if (nsec > 0) {
                ts_delta = ns_to_timespec64(nsec);
                inject_sleeptime = true;
        } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
                /* Fall back to the persistent clock if it advanced since suspend */
                ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
                inject_sleeptime = true;
        }

        /*
         * If neither source yielded a delta, suspend_timing_needed is left
         * untouched so the RTC resume code can inject the sleep time.
         */
        if (inject_sleeptime) {
                suspend_timing_needed = false;
                __timekeeping_inject_sleeptime(tks, &ts_delta);
        }

        /* Re-base the last cycle value */
        tks->tkr_mono.cycle_last = cycle_now;
        tks->tkr_raw.cycle_last  = cycle_now;

        tks->ntp_error = 0;
        timekeeping_suspended = 0;
        timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
        raw_spin_unlock_irqrestore(&tk_core.lock, flags);

        touch_softlockup_watchdog();

        /* Resume the clockevent device(s) and hrtimers */
        tick_resume();
        /* Notify timerfd as resume is equivalent to clock_was_set() */
        timerfd_resume();
}

int timekeeping_suspend(void)
{
        struct timekeeper *tks = &tk_core.shadow_timekeeper;
        struct timespec64 delta, delta_delta;
        static struct timespec64 old_delta;
        struct clocksource *curr_clock;
        unsigned long flags;
        u64 cycle_now;

        read_persistent_clock64(&timekeeping_suspend_time);

        /*
         * On some systems the persistent_clock can not be detected at
         * timekeeping_init by its return value, so if we see a valid
         * value returned, update the persistent_clock_exists flag.
         */
        if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
                persistent_clock_exists = true;

        suspend_timing_needed = true;

        raw_spin_lock_irqsave(&tk_core.lock, flags);
        timekeeping_forward_now(tks);
        timekeeping_suspended = 1;

        /*
         * Since we've called forward_now, cycle_last stores the value
         * just read from the current clocksource. Save this to potentially
         * use in suspend timing.
         */
        curr_clock = tks->tkr_mono.clock;
        cycle_now = tks->tkr_mono.cycle_last;
        clocksource_start_suspend_timing(curr_clock, cycle_now);

        if (persistent_clock_exists) {
                /*
                 * To avoid drift caused by repeated suspend/resumes,
                 * which each can add ~1 second drift error,
                 * try to compensate so the difference in system time
                 * and persistent_clock time stays close to constant.
                 */
                delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time);
                delta_delta = timespec64_sub(delta, old_delta);
                if (abs(delta_delta.tv_sec) >= 2) {
                        /*
                         * if delta_delta is too large, assume time correction
                         * has occurred and set old_delta to the current delta.
                         */
                        old_delta = delta;
                } else {
                        /* Otherwise try to adjust old_system to compensate */
                        timekeeping_suspend_time =
                                timespec64_add(timekeeping_suspend_time, delta_delta);
                }
        }

        timekeeping_update_from_shadow(&tk_core, 0);
        halt_fast_timekeeper(tks);
        raw_spin_unlock_irqrestore(&tk_core.lock, flags);

        tick_suspend();
        clocksource_suspend();
        clockevents_suspend();

        return 0;
}

/*
 * syscore resume/suspend callbacks for timekeeping. Registered via
 * register_syscore_ops() in timekeeping_init_ops().
 */
static struct syscore_ops timekeeping_syscore_ops = {
        .resume                = timekeeping_resume,
        .suspend        = timekeeping_suspend,
};

/*
 * Register the timekeeping syscore suspend/resume callbacks.
 * Hooked up as a device_initcall below; always returns 0.
 */
static int __init timekeeping_init_ops(void)
{
        register_syscore_ops(&timekeeping_syscore_ops);
        return 0;
}
device_initcall(timekeeping_init_ops);

/*
 * Change the timekeeper's multiplier by @mult_adj while keeping the
 * current time continuous.
 *
 * @tk:                timekeeper to modify
 * @offset:        non-accumulated cycles since the last accumulation
 * @mult_adj:        signed change to apply to tkr_mono.mult
 */
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
                                                         s64 offset,
                                                         s32 mult_adj)
{
        s64 interval = tk->cycle_interval;

        /* Scale interval and offset by the adjustment; +1/-1 avoid a multiply */
        switch (mult_adj) {
        case 0:
                /* Nothing to change */
                return;
        case 1:
                break;
        case -1:
                interval = -interval;
                offset = -offset;
                break;
        default:
                interval *= mult_adj;
                offset *= mult_adj;
                break;
        }

        /*
         * Why the three updates at the end of this function are correct.
         *
         * Assume mult_adj == 1 for simplicity. For any other value the
         * interval and offset have been scaled above, so the reasoning
         * is unchanged.
         *
         * Effect on xtime_interval:
         *        xtime_interval = cycle_interval * mult
         * With mult incremented by one this becomes:
         *        xtime_interval = cycle_interval * (mult + 1)
         *                       = (cycle_interval * mult) + cycle_interval
         * i.e.:
         *        xtime_interval += cycle_interval
         *
         * Effect on xtime_nsec:
         * offset holds the non-accumulated cycles, so the current time
         * (in shifted nanoseconds) is:
         *        now = (offset * adj) + xtime_nsec
         * Changing the frequency must not make time jump backwards and
         * should not make it jump forwards either. For the same offset
         * the time before and after the adjustment has to be equal:
         *        now = (offset * adj_1) + xtime_nsec_1
         *        now = (offset * adj_2) + xtime_nsec_2
         * Hence:
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * adj_2) + xtime_nsec_2
         * With adj_2 = adj_1 + 1:
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * (adj_1+1)) + xtime_nsec_2
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * adj_1) + offset + xtime_nsec_2
         * Dropping the common term on both sides:
         *        xtime_nsec_1 = offset + xtime_nsec_2
         *        xtime_nsec_2 = xtime_nsec_1 - offset
         * i.e.:
         *        xtime_nsec -= offset
         */
        if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
                /* The NTP adjustment would overflow the clocksource mult */
                WARN_ON_ONCE(1);
                return;
        }

        tk->tkr_mono.mult += mult_adj;
        tk->xtime_interval += interval;
        tk->tkr_mono.xtime_nsec -= offset;
}

/*
 * Steer the timekeeper's multiplier towards the frequency requested by NTP
 * and, while doing so, work off the accumulated NTP error.
 *
 * @tk:                timekeeper to adjust
 * @offset:        non-accumulated cycles, handed to
 *                timekeeping_apply_adjustment()
 */
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
        struct clocksource *clock = tk->tkr_mono.clock;
        u64 ntp_tl = ntp_tick_length(tk->id);
        u32 mult;

        /*
         * Derive the base multiplier from the NTP tick length. The
         * division is expensive, so it is only done when the tick length
         * changed; otherwise the previous error correction is simply
         * taken back out of the current multiplier.
         */
        if (unlikely(tk->ntp_tick != ntp_tl)) {
                tk->ntp_tick = ntp_tl;
                mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
                                 tk->xtime_remainder, tk->cycle_interval);
        } else {
                mult = tk->tkr_mono.mult - tk->ntp_err_mult;
        }

        /*
         * A clock that is behind NTP time gets its multiplier bumped by
         * one to catch up. A clock that is ahead slows down if the tick
         * division left a remainder; otherwise it stays ahead until the
         * tick length changes to a non-divisible value.
         */
        tk->ntp_err_mult = (tk->ntp_error > 0);
        mult += tk->ntp_err_mult;

        timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);

        /* Complain once if the multiplier left the clocksource's maxadj range */
        if (unlikely(clock->maxadj &&
                (abs(tk->tkr_mono.mult - clock->mult) > clock->maxadj))) {
                printk_once(KERN_WARNING
                        "Adjusting %s more than 11%% (%ld vs %ld)\n",
                        clock->name, (long)tk->tkr_mono.mult,
                        (long)clock->mult + clock->maxadj);
        }

        /*
         * xtime_nsec may have been very small on entry. If the
         * clocksource was sped up slightly above, the corrective factor
         * applied to xtime_nsec can make it underflow.
         *
         * The second has already been accumulated and the NTP code has
         * been told via second_overflow(), so borrow a second back and
         * make the next update skip the NTP notification.
         */
        if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
                tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
                                                        tk->tkr_mono.shift;
                tk->xtime_sec--;
                tk->skip_second_overflow = 1;
        }
}

/*
 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
 *
 * Helper function that accumulates the nsecs greater than a second
 * from the xtime_nsec field to the xtime_secs field.
 * It also calls into the NTP code to handle leapsecond processing.
 */
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
        u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
        unsigned int clock_set = 0;

        while (tk->tkr_mono.xtime_nsec >= nsecps) {
                int leap;

                tk->tkr_mono.xtime_nsec -= nsecps;
                tk->xtime_sec++;

                /*
                 * Skip NTP update if this second was accumulated before,
                 * i.e. xtime_nsec underflowed in timekeeping_adjust()
                 */
                if (unlikely(tk->skip_second_overflow)) {
                        tk->skip_second_overflow = 0;
                        continue;
                }

                /* Figure out if its a leap sec and apply if needed */
                leap = second_overflow(tk->id, tk->xtime_sec);
                if (unlikely(leap)) {
                        struct timespec64 ts;

                        tk->xtime_sec += leap;

                        ts.tv_sec = leap;
                        ts.tv_nsec = 0;
                        tk_set_wall_to_mono(tk,
                                timespec64_sub(tk->wall_to_monotonic, ts));

                        __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);

                        clock_set = TK_CLOCK_WAS_SET;
                }
        }
        return clock_set;
}

/*
 * logarithmic_accumulation - accumulate one shifted interval of cycles
 *
 * Consumes cycle_interval << @shift cycles from @offset, if that many are
 * available, and adds the matching shifted nanoseconds to the monotonic
 * and raw clocks. Calling it with decreasing shifts gives an O(log)
 * accumulation loop.
 *
 * @clock_set is OR-ed with the result of accumulate_nsecs_to_secs().
 *
 * Returns the cycles which have not been consumed.
 */
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
                                    u32 shift, unsigned int *clock_set)
{
        u64 chunk = tk->cycle_interval << shift;
        u64 raw_nsecps;

        /* Less than one shifted interval pending: leave everything alone */
        if (offset < chunk)
                return offset;

        /* Consume the chunk and move both cycle_last values along with it */
        offset -= chunk;
        tk->tkr_mono.cycle_last += chunk;
        tk->tkr_raw.cycle_last  += chunk;

        /* Monotonic clock: add the shifted nanoseconds, then carry seconds */
        tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
        *clock_set |= accumulate_nsecs_to_secs(tk);

        /* Raw clock: same accumulation, with a plain seconds carry */
        tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
        raw_nsecps = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
        while (tk->tkr_raw.xtime_nsec >= raw_nsecps) {
                tk->tkr_raw.xtime_nsec -= raw_nsecps;
                tk->raw_sec++;
        }

        /* Track the error between the NTP tick and the clock interval */
        tk->ntp_error += tk->ntp_tick << shift;
        tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
                                                (tk->ntp_error_shift + shift);

        return offset;
}

/*
 * __timekeeping_advance - Updates the timekeeper to the current time and
 * current NTP tick length
 * @tkd:        timekeeper data whose shadow timekeeper is advanced
 * @mode:        TK_ADV_TICK returns early when less than one cycle_interval
 *                is pending; other modes always run the update
 *
 * The callers visible in this file hold @tkd->lock around the call.
 *
 * Returns true if the update set TK_CLOCK_WAS_SET (leap second applied),
 * false otherwise, including when timekeeping is suspended or there was
 * nothing to do.
 */
static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
{
        struct timekeeper *tk = &tkd->shadow_timekeeper;
        struct timekeeper *real_tk = &tkd->timekeeper;
        unsigned int clock_set = 0;
        int shift = 0, maxshift;
        u64 offset, orig_offset;

        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
                return false;

        /* Cycles elapsed since the last accumulation of the shadow timekeeper */
        offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
                                   tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
                                   tk->tkr_mono.clock->max_raw_delta);
        /* Remembered to detect below whether any cycles were accumulated */
        orig_offset = offset;
        /* Check if there's really nothing to do */
        if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
                return false;

        /*
         * With NO_HZ we may have to accumulate many cycle_intervals
         * (think "ticks") worth of time at once. To do this efficiently,
         * we calculate the largest doubling multiple of cycle_intervals
         * that is smaller than the offset.  We then accumulate that
         * chunk in one go, and then try to consume the next smaller
         * doubled multiple.
         */
        shift = ilog2(offset) - ilog2(tk->cycle_interval);
        shift = max(0, shift);
        /* Bound shift to one less than what overflows tick_length */
        maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1;
        shift = min(shift, maxshift);
        while (offset >= tk->cycle_interval) {
                offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
                /* Step down once the remainder no longer fills this shift */
                if (offset < tk->cycle_interval<<shift)
                        shift--;
        }

        /* Adjust the multiplier to correct NTP error */
        timekeeping_adjust(tk, offset);

        /*
         * Finally, make sure that after the rounding
         * xtime_nsec isn't larger than NSEC_PER_SEC
         */
        clock_set |= accumulate_nsecs_to_secs(tk);

        /*
         * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls
         * making small negative adjustments to the base xtime_nsec
         * value, only update the coarse clocks if we accumulated time
         */
        if (orig_offset != offset)
                tk_update_coarse_nsecs(tk);

        /* Publish the updated shadow timekeeper */
        timekeeping_update_from_shadow(tkd, clock_set);

        return !!clock_set;
}

/*
 * Advance the core timekeeper (tk_core) with tk_core.lock held and
 * interrupts disabled. The guard drops the lock when the function returns.
 * Returns the result of __timekeeping_advance().
 */
static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
        guard(raw_spinlock_irqsave)(&tk_core.lock);
        return __timekeeping_advance(&tk_core, mode);
}

/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * It also updates the enabled auxiliary clock timekeepers
 */
void update_wall_time(void)
{
        if (timekeeping_advance(TK_ADV_TICK))
                clock_was_set_delayed();
        tk_aux_advance();
}

/**
 * getboottime64 - Return the real time of system boot.
 * @ts:                pointer to the timespec64 to be set
 *
 * Stores the wall-time of boot in @ts.
 *
 * The value is computed as offs_real minus offs_boot, i.e. it is based on
 * the wall_to_monotonic offset and the total suspend time. Calls to
 * settimeofday will affect the value returned (which basically means that
 * however wrong your real time clock is at boot time, you get the right
 * time here).
 */
void getboottime64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        *ts = ktime_to_timespec64(ktime_sub(tk->offs_real, tk->offs_boot));
}
EXPORT_SYMBOL_GPL(getboottime64);

/*
 * Fill @ts with the coarse wall time of the core timekeeper. The read is
 * repeated until it was not disturbed by a concurrent timekeeper update.
 */
void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;

        for (;;) {
                seq = read_seqcount_begin(&tk_core.seq);

                *ts = tk_xtime_coarse(tk);

                if (!read_seqcount_retry(&tk_core.seq, seq))
                        break;
        }
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);

/**
 * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
 * @ts:                timespec64 to be filled
 *
 * Fetch the global mg_floor value, convert it to realtime and compare it
 * to the current coarse-grained time. Fill @ts with whichever is
 * latest. Note that this is a filesystem-specific interface and should be
 * avoided outside of that context.
 */
void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 floor = atomic64_read(&mg_floor);
        ktime_t f_real, offset, coarse;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                *ts = tk_xtime_coarse(tk);
                offset = tk_core.timekeeper.offs_real;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        coarse = timespec64_to_ktime(*ts);
        f_real = ktime_add(floor, offset);
        if (ktime_after(f_real, coarse))
                *ts = ktime_to_timespec64(f_real);
}

/**
 * ktime_get_real_ts64_mg - attempt to update floor value and return result
 * @ts:                pointer to the timespec to be set
 *
 * Take a fine-grained monotonic time value and try to swap it into
 * mg_floor. On success the new floor value is used. On failure another
 * task raced in and updated the floor in the meantime. Since any update to
 * the floor must be later than the previous floor, either outcome is
 * acceptable.
 *
 * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
 * and determining that the resulting coarse-grained timestamp did not effect
 * a change in ctime. Any more recent floor value would effect a change to
 * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
 *
 * @ts will be filled with the latest floor value, regardless of the outcome of
 * the cmpxchg. Note that this is a filesystem specific interface and should be
 * avoided outside of that context.
 */
void ktime_get_real_ts64_mg(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t old = atomic64_read(&mg_floor);
        ktime_t offs_real, now_mono;
        unsigned int seq;
        u64 delta_ns;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                ts->tv_sec = tk->xtime_sec;
                now_mono = tk->tkr_mono.base;
                delta_ns = timekeeping_get_ns(&tk->tkr_mono);
                offs_real = tk->offs_real;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        now_mono = ktime_add_ns(now_mono, delta_ns);

        /*
         * Try to install the new time value as floor. Any update must be
         * later than the existing floor and would effect a change to ctime
         * from the perspective of the current task, so the resulting floor
         * is accepted whether the swap succeeds or not.
         */
        if (!atomic64_try_cmpxchg(&mg_floor, &old, now_mono)) {
                /*
                 * Another task changed mg_floor after "old" was fetched and
                 * the failed cmpxchg stored the latest floor in "old". That
                 * value is newer than the previous floor, which is enough
                 * to effect a change to ctime. Use it.
                 */
                *ts = ktime_to_timespec64(ktime_add(old, offs_real));
                return;
        }

        /* The swap succeeded: report the fine-grained realtime value */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, delta_ns);
        timekeeping_inc_mg_floor_swaps();
}

void ktime_get_coarse_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 now, mono;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                now = tk_xtime_coarse(tk);
                mono = tk->wall_to_monotonic;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
                                  now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);

/*
 * do_timer - advance jiffies_64 by @ticks and run calc_global_load()
 *
 * Must hold jiffies_lock
 */
void do_timer(unsigned long ticks)
{
        jiffies_64 += ticks;
        calc_global_load();
}

/**
 * ktime_get_update_offsets_now - hrtimer helper
 * @cwsseq:        pointer to check and store the clock was set sequence number
 * @offs_real:        pointer to storage for monotonic -> realtime offset
 * @offs_boot:        pointer to storage for monotonic -> boottime offset
 * @offs_tai:        pointer to storage for monotonic -> clock tai offset
 *
 * Returns current monotonic time and updates the offsets if the
 * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
 * different.
 *
 * Called from hrtimer_interrupt() or retrigger_next_event()
 */
ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
                                     ktime_t *offs_boot, ktime_t *offs_tai)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base;
        u64 nsecs;

        /* Repeat until the snapshot was not disturbed by a timekeeper update */
        do {
                seq = read_seqcount_begin(&tk_core.seq);

                /* Current monotonic time: base plus nanoseconds since the base */
                base = tk->tkr_mono.base;
                nsecs = timekeeping_get_ns(&tk->tkr_mono);
                base = ktime_add_ns(base, nsecs);

                /* Refresh the caller's offsets only when its sequence is stale */
                if (*cwsseq != tk->clock_was_set_seq) {
                        *cwsseq = tk->clock_was_set_seq;
                        *offs_real = tk->offs_real;
                        *offs_boot = tk->offs_boot;
                        *offs_tai = tk->offs_tai;
                }

                /* Handle leapsecond insertion adjustments */
                if (unlikely(base >= tk->next_leap_ktime))
                        *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return base;
}

/*
 * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
 * @txc:        the timex request to check
 * @aux_clock:        true when the request targets an auxiliary clock
 *
 * Returns 0 if the request is acceptable, -EPERM if the caller lacks
 * CAP_SYS_TIME for the requested operation and -EINVAL for malformed or
 * unsupported requests.
 */
static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock)
{
        if (!(txc->modes & ADJ_ADJTIME)) {
                /* Modifying anything at all requires CAP_SYS_TIME */
                if (txc->modes && !capable(CAP_SYS_TIME))
                        return -EPERM;
                /*
                 * A quartz which is off by more than 10% means that
                 * something is VERY wrong!
                 */
                if ((txc->modes & ADJ_TICK) &&
                    (txc->tick <  900000/USER_HZ ||
                     txc->tick > 1100000/USER_HZ))
                        return -EINVAL;
        } else {
                /* singleshot must not be used with any other mode bits */
                if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
                        return -EINVAL;
                /* Only the read-only variant is allowed without CAP_SYS_TIME */
                if (!(txc->modes & ADJ_OFFSET_READONLY) &&
                    !capable(CAP_SYS_TIME))
                        return -EPERM;
        }

        if (txc->modes & ADJ_SETOFFSET) {
                long subsec_limit;

                /* Injecting time requires CAP_SYS_TIME */
                if (!capable(CAP_SYS_TIME))
                        return -EPERM;

                /*
                 * Check the timespec/timeval used to inject the offset.
                 * The offset may be positive or negative, so tv_sec is not
                 * checked. The value is the sum of both fields, but the
                 * tv_usec/tv_nsec field must always be non-negative and
                 * must hold less than one second worth of nanoseconds
                 * (ADJ_NANO) or microseconds.
                 */
                subsec_limit = (txc->modes & ADJ_NANO) ? NSEC_PER_SEC : USEC_PER_SEC;
                if (txc->time.tv_usec < 0 || txc->time.tv_usec >= subsec_limit)
                        return -EINVAL;
        }

        /*
         * Reject frequencies whose multiplication with PPM_SCALE would
         * overflow. That can only happen on 64-bit systems.
         */
        if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
                if (txc->freq < LLONG_MIN / PPM_SCALE ||
                    txc->freq > LLONG_MAX / PPM_SCALE)
                        return -EINVAL;
        }

        if (!aux_clock)
                return 0;

        /*
         * Auxiliary clocks are similar to TAI: they have no leap seconds
         * and there is no PPS support for them either.
         */
        if (txc->status & (STA_INS | STA_DEL | STA_PPSFREQ | STA_PPSTIME))
                return -EINVAL;

        /* Setting the TAI offset is not available for them */
        if (txc->modes & ADJ_TAI)
                return -EINVAL;

        return 0;
}

/**
 * random_get_entropy_fallback - Returns the raw clock source value,
 * used by random.c for platforms with no valid random_get_entropy().
 *
 * Returns 0 while timekeeping is suspended or when no clocksource is
 * installed in the core timekeeper yet.
 */
unsigned long random_get_entropy_fallback(void)
{
        struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
        struct clocksource *clock = READ_ONCE(tkr->clock);

        return unlikely(timekeeping_suspended || !clock) ? 0 : clock->read(clock);
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);

/*
 * Results handed back from __do_adjtimex() so that the caller can do the
 * audit logging and clock-was-set notification afterwards.
 */
struct adjtimex_result {
        /* NTP audit data filled by ntp_adjtimex(), logged via audit_ntp_log() */
        struct audit_ntp_data        ad;
        /* Offset injected for ADJ_SETOFFSET, converted to nanoseconds */
        struct timespec64        delta;
        /* True when the operation requires a clock-was-set notification */
        bool                        clock_set;
};

/*
 * __do_adjtimex - apply an adjtimex request to one timekeeper
 * @tkd:        timekeeper data to operate on (core or auxiliary)
 * @txc:        the request; also handed to ntp_adjtimex()
 * @result:        filled with audit data, the injected offset and whether
 *                the clock was set
 *
 * Returns the value of ntp_adjtimex() on success, the error from
 * timekeeping_validate_timex() or __timekeeping_inject_offset(), or
 * -ENODEV if the timekeeper's clock is not valid.
 */
static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
                         struct adjtimex_result *result)
{
        struct timekeeper *tks = &tkd->shadow_timekeeper;
        bool aux_clock = !timekeeper_is_core_tk(tks);
        struct timespec64 ts;
        s32 orig_tai, tai;
        int ret;

        /* Validate the data before disabling interrupts */
        ret = timekeeping_validate_timex(txc, aux_clock);
        if (ret)
                return ret;
        add_device_randomness(txc, sizeof(*txc));

        /* Current time of the targeted clock; passed to ntp_adjtimex() below */
        if (!aux_clock)
                ktime_get_real_ts64(&ts);
        else
                tk_get_aux_ts64(tkd->timekeeper.id, &ts);

        add_device_randomness(&ts, sizeof(ts));

        /* Held until the function returns, including the early returns below */
        guard(raw_spinlock_irqsave)(&tkd->lock);

        if (!tks->clock_valid)
                return -ENODEV;

        if (txc->modes & ADJ_SETOFFSET) {
                result->delta.tv_sec  = txc->time.tv_sec;
                result->delta.tv_nsec = txc->time.tv_usec;
                /* Without ADJ_NANO the sub-second field is in microseconds */
                if (!(txc->modes & ADJ_NANO))
                        result->delta.tv_nsec *= 1000;
                ret = __timekeeping_inject_offset(tkd, &result->delta);
                if (ret)
                        return ret;
                result->clock_set = true;
        }

        orig_tai = tai = tks->tai_offset;
        ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad);

        if (tai != orig_tai) {
                /* ntp_adjtimex() changed the TAI offset: apply and publish it */
                __timekeeping_set_tai_offset(tks, tai);
                timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
                result->clock_set = true;
        } else {
                /*
                 * NOTE(review): this always passes &tk_core, also when @tkd
                 * is an auxiliary timekeeper - confirm that this is intended.
                 */
                tk_update_leap_state_all(&tk_core);
        }

        /* Update the multiplier immediately if frequency was set directly */
        if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
                result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ);

        return ret;
}

/**
 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
 * @txc:        Pointer to kernel_timex structure containing NTP parameters
 */
int do_adjtimex(struct __kernel_timex *txc)
{
        struct adjtimex_result result = { };
        int ret;

        ret = __do_adjtimex(&tk_core, txc, &result);
        if (ret < 0)
                return ret;

        if (txc->modes & ADJ_SETOFFSET)
                audit_tk_injoffset(result.delta);

        audit_ntp_log(&result.ad);

        if (result.clock_set)
                clock_was_set(CLOCK_SET_WALL);

        ntp_notify_cmos_timer(result.delta.tv_sec != 0);

        return ret;
}

/*
 * Invoked from NTP with the time keeper lock held, so lockless access is
 * fine.
 */
long ktime_get_ntp_seconds(unsigned int id)
{
        return timekeeper_data[id].timekeeper.xtime_sec;
}

#ifdef CONFIG_NTP_PPS
/**
 * hardpps() - Accessor function to NTP __hardpps function
 * @phase_ts:        Pointer to timespec64 structure representing phase timestamp
 * @raw_ts:        Pointer to timespec64 structure representing raw timestamp
 */
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
        /* Hold tk_core.lock with interrupts disabled around __hardpps() */
        guard(raw_spinlock_irqsave)(&tk_core.lock);
        __hardpps(phase_ts, raw_ts);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */

#ifdef CONFIG_POSIX_AUX_CLOCKS
#include "posix-timers.h"

/*
 * Bitmap for the activated auxiliary timekeepers to allow lockless quick
 * checks in the hot paths without touching extra cache lines. If set, then
 * the state of the corresponding timekeeper has to be re-checked under
 * tk_data::lock.
 */
static unsigned long aux_timekeepers;

/* Map an auxiliary clock id (CLOCK_AUX based) to its timekeeper_data[] index */
static inline unsigned int clockid_to_tkid(unsigned int id)
{
        unsigned int aux_index = id - CLOCK_AUX;

        return TIMEKEEPER_AUX_FIRST + aux_index;
}

/* Return the tk_data of auxiliary clock @id, or NULL if @id is not valid */
static inline struct tk_data *aux_get_tk_data(clockid_t id)
{
        return clockid_aux_valid(id) ? &timekeeper_data[clockid_to_tkid(id)] : NULL;
}

/*
 * Invoked from timekeeping after a clocksource change.
 *
 * Moves every active auxiliary timekeeper with a valid clock over to the
 * clocksource which the core timekeeper uses now.
 */
static void tk_aux_update_clocksource(void)
{
        unsigned long active = READ_ONCE(aux_timekeepers);
        unsigned int bit;

        for_each_set_bit(bit, &active, BITS_PER_LONG) {
                struct tk_data *tkd = &timekeeper_data[TIMEKEEPER_AUX_FIRST + bit];
                struct timekeeper *tks = &tkd->shadow_timekeeper;

                /* The lock is dropped at the end of each loop iteration */
                guard(raw_spinlock_irqsave)(&tkd->lock);
                if (tks->clock_valid) {
                        /* Bring the time up to date before switching over */
                        timekeeping_forward_now(tks);
                        tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
                        timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
                }
        }
}

/* Advance all active auxiliary timekeepers which have a valid clock */
static void tk_aux_advance(void)
{
        unsigned long active = READ_ONCE(aux_timekeepers);
        unsigned int bit;

        /*
         * The bitmap snapshot is a lockless quick check which avoids
         * touching extra cache lines; validity is re-checked under the lock.
         */
        for_each_set_bit(bit, &active, BITS_PER_LONG) {
                struct tk_data *tkd = &timekeeper_data[TIMEKEEPER_AUX_FIRST + bit];
                struct timekeeper *tks = &tkd->shadow_timekeeper;

                guard(raw_spinlock)(&tkd->lock);
                if (tks->clock_valid)
                        __timekeeping_advance(tkd, TK_ADV_TICK);
        }
}

/**
 * ktime_get_aux - Get time for a AUX clock
 * @id:        ID of the clock to read (CLOCK_AUX...)
 * @kt:        Pointer to ktime_t to store the time stamp
 *
 * @kt is only written when the time stamp is valid.
 *
 * Returns: True if the timestamp is valid, false otherwise
 */
bool ktime_get_aux(clockid_t id, ktime_t *kt)
{
        struct tk_data *tkd = aux_get_tk_data(id);
        struct timekeeper *tk;
        unsigned int seq;
        ktime_t start;
        u64 delta_ns;

        WARN_ON(timekeeping_suspended);

        /* Not a valid auxiliary clock id */
        if (!tkd)
                return false;

        tk = &tkd->timekeeper;
        for (;;) {
                seq = read_seqcount_begin(&tkd->seq);

                /* The clock is not valid: there is no time to report */
                if (!tk->clock_valid)
                        return false;

                start = ktime_add(tk->tkr_mono.base, tk->offs_aux);
                delta_ns = timekeeping_get_ns(&tk->tkr_mono);

                if (!read_seqcount_retry(&tkd->seq, seq))
                        break;
        }

        *kt = ktime_add_ns(start, delta_ns);
        return true;
}
EXPORT_SYMBOL_GPL(ktime_get_aux);

/**
 * ktime_get_aux_ts64 - Get time for a AUX clock
 * @id:        ID of the clock to read (CLOCK_AUX...)
 * @ts:        Pointer to timespec64 to store the time stamp
 *
 * @ts is only written when the time stamp is valid.
 *
 * Returns: True if the timestamp is valid, false otherwise
 */
bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
{
        ktime_t now;
        bool valid = ktime_get_aux(id, &now);

        if (valid)
                *ts = ktime_to_timespec64(now);

        return valid;
}
EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);

/*
 * Report the resolution of an auxiliary clock in @tp.
 *
 * Returns 0 on success, or -ENODEV when @id is not a valid auxiliary
 * clock id, in which case @tp is not written.
 */
static int aux_get_res(clockid_t id, struct timespec64 *tp)
{
        int ret = -ENODEV;

        if (clockid_aux_valid(id)) {
                /* Split the nanosecond resolution into seconds and remainder */
                tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
                tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
                ret = 0;
        }

        return ret;
}

/*
 * k_clock read callback: map the boolean result of ktime_get_aux_ts64()
 * to 0 on success and -ENODEV on failure.
 */
static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
{
        if (!ktime_get_aux_ts64(id, tp))
                return -ENODEV;

        return 0;
}

/*
 * Set the auxiliary clock @id to the time given in @tnew.
 *
 * The new time is applied by recomputing the clock's AUX offset on the
 * shadow timekeeper while holding the clock's lock with interrupts
 * disabled, and then publishing the shadow timekeeper.
 *
 * Returns 0 on success, -EINVAL if @tnew fails timespec64_valid_settod(),
 * or -ENODEV if no timekeeper data exists for @id or the clock is not
 * marked valid.
 */
static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
{
        struct tk_data *aux_tkd = aux_get_tk_data(id);
        struct timekeeper *aux_tks;
        ktime_t tnow, nsecs;

        /* The time value is validated before the clock id */
        if (!timespec64_valid_settod(tnew))
                return -EINVAL;
        if (!aux_tkd)
                return -ENODEV;

        aux_tks = &aux_tkd->shadow_timekeeper;

        /* Held until return; validity is checked under the lock */
        guard(raw_spinlock_irq)(&aux_tkd->lock);
        if (!aux_tks->clock_valid)
                return -ENODEV;

        /* Forward the timekeeper base time */
        timekeeping_forward_now(aux_tks);
        /*
         * Get the updated base time. tkr_mono.base has not been
         * updated yet, so do that first. That makes the update
         * in timekeeping_update_from_shadow() redundant, but
         * that's harmless. After that @tnow can be calculated
         * by using tkr_mono::cycle_last, which has been set
         * by timekeeping_forward_now().
         */
        tk_update_ktime_data(aux_tks);
        nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
        tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);

        /*
         * Calculate the new AUX offset as delta to @tnow ("monotonic").
         * That avoids all the tk::xtime back and forth conversions as
         * xtime ("realtime") is not applicable for auxiliary clocks and
         * kept in sync with "monotonic".
         */
        tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow));

        /* Publish the modified shadow timekeeper */
        timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
        return 0;
}

static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
{
        struct tk_data *aux_tkd = aux_get_tk_data(id);
        struct adjtimex_result result = { };

        if (!aux_tkd)
                return -ENODEV;

        /*
         * @result is ignored for now as there are neither hrtimers nor a
         * RTC related to auxiliary clocks for now.
         */
        return __do_adjtimex(aux_tkd, txc, &result);
}

/*
 * k_clock operations for the auxiliary clocks: resolution query, time
 * read, time set and adjtimex-style adjustment, each wired to the
 * aux_*() helper of the same purpose in this file.
 */
const struct k_clock clock_aux = {
        .clock_getres                = aux_get_res,
        .clock_get_timespec        = aux_get_timespec,
        .clock_set                = aux_clock_set,
        .clock_adj                = aux_clock_adj,
};

/*
 * Bring the auxiliary clock @id online.
 *
 * The clock's shadow timekeeper is wiped, set up from the clocksource
 * currently used by the core timekeeper's raw read base, marked valid
 * and then published. This runs with tk_core.lock held (interrupts
 * disabled) and the auxiliary clock's own lock nested inside it.
 *
 * NOTE: @id is not validated here; the result of aux_get_tk_data() is
 * dereferenced unconditionally, so callers must pass a valid id.
 */
static void aux_clock_enable(clockid_t id)
{
        struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
        struct tk_data *aux_tkd = aux_get_tk_data(id);
        struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;

        /* Prevent the core timekeeper from changing. */
        guard(raw_spinlock_irq)(&tk_core.lock);

        /*
         * Setup the auxiliary clock assuming that the raw core timekeeper
         * clock frequency conversion is close enough. Userspace has to
         * adjust for the deviation via clock_adjtime(2).
         */
        guard(raw_spinlock_nested)(&aux_tkd->lock);

        /* Remove leftovers of a previous registration */
        memset(aux_tks, 0, sizeof(*aux_tks));
        /* Restore the timekeeper id */
        aux_tks->id = aux_tkd->timekeeper.id;
        /* Setup the timekeeper based on the current system clocksource */
        tk_setup_internals(aux_tks, tkr_raw->clock);

        /* Mark it valid and set it live */
        aux_tks->clock_valid = true;
        timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
}

/*
 * Take the auxiliary clock @id offline: clear the valid flag in its
 * shadow timekeeper and publish that state, all under the clock's lock
 * with interrupts disabled.
 */
static void aux_clock_disable(clockid_t id)
{
        struct tk_data *tkd = aux_get_tk_data(id);
        struct timekeeper *shadow = &tkd->shadow_timekeeper;

        guard(raw_spinlock_irq)(&tkd->lock);

        shadow->clock_valid = false;
        timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
}

/* Serializes the enable/disable requests handled by the sysfs store below */
static DEFINE_MUTEX(aux_clock_mutex);

/*
 * sysfs store handler: enable or disable the auxiliary clock which the
 * kobject represents.
 *
 * Requires CAP_SYS_TIME (-EPERM otherwise). @buf must parse as a boolean
 * (-EINVAL otherwise). Requesting the state the clock is already in is a
 * successful no-op. Returns @count on success.
 */
static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
                                      const char *buf, size_t count)
{
        /* Lazy atoi(): the kobject name is "0..7", keep the low three bits */
        int idx = kobj->name[0] & 0x7;
        bool want_enabled;

        if (!capable(CAP_SYS_TIME))
                return -EPERM;

        if (kstrtobool(buf, &want_enabled) < 0)
                return -EINVAL;

        guard(mutex)(&aux_clock_mutex);

        /* Nothing to do when the clock is already in the requested state */
        if (want_enabled == test_bit(idx, &aux_timekeepers))
                return count;

        if (!want_enabled) {
                aux_clock_disable(CLOCK_AUX + idx);
                clear_bit(idx, &aux_timekeepers);
                return count;
        }

        aux_clock_enable(CLOCK_AUX + idx);
        set_bit(idx, &aux_timekeepers);
        return count;
}

/*
 * sysfs show handler: emit "1" or "0" depending on whether the bit of the
 * auxiliary clock which the kobject represents is set in a snapshot of
 * the enable bitmap.
 */
static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
        /* Lazy atoi(): the kobject name is "0..7", keep the low three bits */
        int idx = kobj->name[0] & 0x7;
        unsigned long snapshot = READ_ONCE(aux_timekeepers);

        return sysfs_emit(buf, "%d\n", test_bit(idx, &snapshot));
}

/*
 * Read/write "aux_clock_enable" attribute; __ATTR_RW() binds it to
 * aux_clock_enable_show() and aux_clock_enable_store().
 */
static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);

/* NULL terminated attribute list for the group below */
static struct attribute *aux_clock_enable_attrs[] = {
        &aux_clock_enable_attr.attr,
        NULL
};

/* Attribute group which tk_aux_sysfs_init() attaches to each per-clock kobject */
static const struct attribute_group aux_clock_enable_attr_group = {
        .attrs = aux_clock_enable_attrs,
};

/*
 * Create the sysfs hierarchy for the auxiliary clocks: a "time" kobject
 * below kernel_kobj, "aux_clocks" below that, and one single-digit named
 * kobject per auxiliary clock carrying the enable attribute group.
 *
 * Returns 0 on success. On failure the references on "aux_clocks" and
 * "time" are dropped and -ENOMEM or the sysfs_create_group() error is
 * returned.
 */
static int __init tk_aux_sysfs_init(void)
{
        struct kobject *time_kobj, *aux_kobj;
        int err = -ENOMEM;

        time_kobj = kobject_create_and_add("time", kernel_kobj);
        if (!time_kobj)
                return err;

        aux_kobj = kobject_create_and_add("aux_clocks", time_kobj);
        if (!aux_kobj)
                goto err_clean;

        for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
                /* One character name "0", "1", ... plus the terminator */
                char name[2] = { [0] = '0' + i, };
                struct kobject *clk_kobj;

                clk_kobj = kobject_create_and_add(name, aux_kobj);
                if (!clk_kobj) {
                        err = -ENOMEM;
                        goto err_clean;
                }

                err = sysfs_create_group(clk_kobj, &aux_clock_enable_attr_group);
                if (err)
                        goto err_clean;
        }
        return 0;

err_clean:
        /* aux_kobj is NULL here when creating "aux_clocks" itself failed */
        kobject_put(aux_kobj);
        kobject_put(time_kobj);
        return err;
}
late_initcall(tk_aux_sysfs_init);

/*
 * Run the basic setup for every auxiliary timekeeper slot, i.e. the
 * indices TIMEKEEPER_AUX_FIRST..TIMEKEEPER_AUX_LAST of timekeeper_data[].
 */
static __init void tk_aux_setup(void)
{
        for (int idx = TIMEKEEPER_AUX_FIRST; idx <= TIMEKEEPER_AUX_LAST; idx++) {
                struct tk_data *tkd = &timekeeper_data[idx];

                tkd_basic_setup(tkd, idx, false);
        }
}
#endif /* CONFIG_POSIX_AUX_CLOCKS */





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_GETORDER_H
#define __ASM_GENERIC_GETORDER_H

#ifndef __ASSEMBLY__

#include <linux/compiler.h>
#include <linux/log2.h>

/**
 * get_order - Determine the allocation order of a memory size
 * @size: The size for which to get the order
 *
 * Determine the allocation order of a particular sized block of memory.  The
 * order is on a logarithmic scale, where:
 *
 *        0 -> 2^0 * PAGE_SIZE and below
 *        1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
 *        2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
 *        3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
 *        4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
 *        ...
 *
 * The returned order identifies the smallest allocation granule which can
 * hold an object of the specified size.
 *
 * The result is undefined if the size is 0.
 */
static __always_inline __attribute_const__ int get_order(unsigned long size)
{
        unsigned long pages;

        /* Compile-time constant sizes take the ilog2() based path */
        if (__builtin_constant_p(size)) {
                if (size == 0)
                        return BITS_PER_LONG - PAGE_SHIFT;

                if (size < (1UL << PAGE_SHIFT))
                        return 0;

                return ilog2(size - 1) - PAGE_SHIFT + 1;
        }

        /* Number of whole pages below the last byte; its top bit gives the order */
        pages = (size - 1) >> PAGE_SHIFT;
#if BITS_PER_LONG == 32
        return fls(pages);
#else
        return fls64(pages);
#endif
}

#endif        /* __ASSEMBLY__ */

#endif        /* __ASM_GENERIC_GETORDER_H */









































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 


    4 

    4 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code        <alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/memfd.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

/*
 * Fallback for architectures which do not provide arch_mmap_check():
 * the check evaluates to 0 for every request.
 */
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)        (0)
#endif

/*
 * mmap randomization bit counts: minimum, maximum and current value, all
 * seeded from the corresponding Kconfig options.
 */
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
/* Same as above for the compat variants; here both bounds are const */
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

/*
 * Boolean core parameter "ignore_rlimit_data", registered with mode 0644.
 * NOTE(review): its consumer is outside this part of the file; presumably
 * it relaxes RLIMIT_DATA enforcement - confirm at the use site.
 */
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);

/*
 * Recompute vma->vm_page_prot from vma->vm_flags.
 *
 * When the VMA wants write notification, the protection is derived a
 * second time with VM_SHARED masked out of the flags.
 */
void vma_set_page_prot(struct vm_area_struct *vma)
{
        vm_flags_t flags = vma->vm_flags;
        pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, flags);

        if (vma_wants_writenotify(vma, prot)) {
                flags &= ~VM_SHARED;
                prot = vm_pgprot_modify(prot, flags);
        }

        /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
        WRITE_ONCE(vma->vm_page_prot, prot);
}

/*
 * check_brk_limits() - Run the platform specific range check and verify the
 * mlock limits for a brk expansion.
 * @addr: The address to check
 * @len: The size of increase.
 *
 * Return: 0 on success, the error encoded in the get_unmapped_area() result
 * if the range check fails, or -EAGAIN if the mlock limit would be exceeded.
 */
static int check_brk_limits(unsigned long addr, unsigned long len)
{
        unsigned long area = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);

        if (IS_ERR_VALUE(area))
                return area;

        if (!mlock_future_ok(current->mm, current->mm->def_flags, len))
                return -EAGAIN;

        return 0;
}

/*
 * brk() system call: move the program break of the current mm to @brk.
 *
 * Returns the new break on success. On any failure after the mmap lock
 * was taken, the break is left (or reset to) its original value and that
 * original value is returned. Returns -EINTR if taking the mmap write
 * lock was interrupted by a fatal signal.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
        unsigned long newbrk, oldbrk, origbrk;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *brkvma, *next = NULL;
        unsigned long min_brk;
        bool populate = false;
        LIST_HEAD(uf);
        struct vma_iterator vmi;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Remembered so that every failure path can restore and return it */
        origbrk = mm->brk;

        min_brk = mm->start_brk;
#ifdef CONFIG_COMPAT_BRK
        /*
         * CONFIG_COMPAT_BRK can still be overridden by setting
         * randomize_va_space to 2, which will still cause mm->start_brk
         * to be arbitrarily shifted
         */
        if (!current->brk_randomized)
                min_brk = mm->end_data;
#endif
        if (brk < min_brk)
                goto out;

        /*
         * Check against rlimit here. If this check is done later after the test
         * of oldbrk with newbrk then it can escape the test and let the data
         * segment grow beyond its set limit in the case where the limit is
         * not page aligned -Ram Gupta
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                              mm->end_data, mm->start_data))
                goto out;

        /* Same page before and after: only the recorded break changes */
        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        if (oldbrk == newbrk) {
                mm->brk = brk;
                goto success;
        }

        /* Always allow shrinking brk. */
        if (brk <= mm->brk) {
                /* Search one past newbrk */
                vma_iter_init(&vmi, mm, newbrk);
                brkvma = vma_find(&vmi, oldbrk);
                if (!brkvma || brkvma->vm_start >= oldbrk)
                        goto out; /* mapping intersects with an existing non-brk vma. */
                /*
                 * mm->brk must be protected by write mmap_lock.
                 * do_vmi_align_munmap() will drop the lock on success, so
                 * update it before calling do_vmi_align_munmap().
                 */
                mm->brk = brk;
                if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf,
                                        /* unlock = */ true))
                        goto out;

                goto success_unlocked;
        }

        if (check_brk_limits(oldbrk, newbrk - oldbrk))
                goto out;

        /*
         * Only check if the next VMA is within the stack_guard_gap of the
         * expansion area
         */
        vma_iter_init(&vmi, mm, oldbrk);
        next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
                goto out;

        brkvma = vma_prev_limit(&vmi, mm->start_brk);
        /* Ok, looks good - let it rip. */
        if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
                goto out;

        mm->brk = brk;
        /* Populate the new range after unlocking if the mm defaults to VM_LOCKED */
        if (mm->def_flags & VM_LOCKED)
                populate = true;

success:
        mmap_write_unlock(mm);
success_unlocked:
        /* Reached directly from the shrink path, where the lock is already dropped */
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(oldbrk, newbrk - oldbrk);
        return brk;

out:
        /* Undo a possibly already updated mm->brk before unlocking */
        mm->brk = origbrk;
        mmap_write_unlock(mm);
        return origbrk;
}

/*
 * Page-align a mapping hint downwards. A non-zero hint which ends up below
 * mmap_min_addr is raised to the lowest page aligned address which is not
 * below mmap_min_addr. A zero hint stays zero.
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
        hint &= PAGE_MASK;

        /* A zero hint means "no preference" and is passed through */
        if (hint != 0 && hint < mmap_min_addr)
                return PAGE_ALIGN(mmap_min_addr);

        return hint;
}

/*
 * Check whether locking another @bytes of memory in @mm stays within
 * RLIMIT_MEMLOCK.
 *
 * Always true when @vm_flags lacks VM_LOCKED or the caller has
 * CAP_IPC_LOCK. Otherwise the pages already locked in @mm plus the pages
 * covered by @bytes are compared against the limit, both in page units.
 */
bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
                        unsigned long bytes)
{
        unsigned long wanted, allowed;

        /* Mappings which are not locked do not count against the limit */
        if (!(vm_flags & VM_LOCKED))
                return true;

        /* The capability is only queried for locked mappings */
        if (capable(CAP_IPC_LOCK))
                return true;

        wanted = (bytes >> PAGE_SHIFT) + mm->locked_vm;
        allowed = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        return wanted <= allowed;
}

/*
 * Upper bound used by file_mmap_ok() for mappings of @file.
 *
 * Regular files, block devices and sockets are bounded by
 * MAX_LFS_FILESIZE. Files whose operations set FOP_UNSIGNED_OFFSET
 * return 0. Everything else is bounded by ULONG_MAX.
 */
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
        if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode) ||
            S_ISSOCK(inode->i_mode))
                return MAX_LFS_FILESIZE;

        /* Special "we do even unsigned file positions" case */
        if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)
                return 0;

        /* Yes, random drivers might want more. But I'm tired of buggy drivers */
        return ULONG_MAX;
}

/*
 * Check that a mapping of @len bytes at page offset @pgoff fits below the
 * limit which file_mmap_size_max() reports for @file.
 */
static inline bool file_mmap_ok(struct file *file, struct inode *inode,
                                unsigned long pgoff, unsigned long len)
{
        u64 limit = file_mmap_size_max(file, inode);

        /* A zero limit skips the length check */
        if (limit != 0 && len > limit)
                return false;

        /* The start offset must leave room for @len bytes below the limit */
        limit -= len;
        return pgoff <= (limit >> PAGE_SHIFT);
}

/**
 * do_mmap() - Perform a userland memory mapping into the current process
 * address space of length @len with protection bits @prot, mmap flags @flags
 * (from which VMA flags will be inferred), and any additional VMA flags to
 * apply @vm_flags. If this is a file-backed mapping then the file is specified
 * in @file and page offset into the file via @pgoff.
 *
 * This function does not perform security checks on the file and assumes, if
 * @uf is non-NULL, the caller has provided a list head to track unmap events
 * for userfaultfd @uf.
 *
 * It also simply indicates whether memory population is required by setting
 * @populate, which must be non-NULL, expecting the caller to actually perform
 * this task itself if appropriate.
 *
 * This function will invoke architecture-specific (and if provided and
 * relevant, file system-specific) logic to determine the most appropriate
 * unmapped area in which to place the mapping if not MAP_FIXED.
 *
 * Callers which require userland mmap() behaviour should invoke vm_mmap(),
 * which is also exported for module use.
 *
 * Those which require this behaviour less security checks, userfaultfd and
 * populate behaviour, and who handle the mmap write lock themselves, should
 * call this function.
 *
 * Note that the returned address may reside within a merged VMA if an
 * appropriate merge were to take place, so it doesn't necessarily specify the
 * start of a VMA, rather only the start of a valid mapped range of length
 * @len bytes, rounded down to the nearest page size.
 *
 * The caller must write-lock current->mm->mmap_lock.
 *
 * @file: An optional struct file pointer describing the file which is to be
 * mapped, if a file-backed mapping.
 * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the
 * address at which to perform this mapping. See mmap (2) for details. Must be
 * page-aligned.
 * @len: The length of the mapping. Will be page-aligned and must be at least 1
 * page in size.
 * @prot: Protection bits describing access required to the mapping. See mmap
 * (2) for details.
 * @flags: Flags specifying how the mapping should be performed, see mmap (2)
 * for details.
 * @vm_flags: VMA flags which should be set by default, or 0 otherwise.
 * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise.
 * @populate: A pointer to a value which will be set to 0 if no population of
 * the range is required, or the number of bytes to populate if it is. Must be
 * non-NULL. See mmap (2) for details as to under what circumstances population
 * of the range occurs.
 * @uf: An optional pointer to a list head to track userfaultfd unmap events
 * should unmapping events arise. If provided, it is up to the caller to manage
 * this.
 *
 * Returns: Either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, vm_flags_t vm_flags,
                        unsigned long pgoff, unsigned long *populate,
                        struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        int pkey = 0;

        /* Default to "nothing to populate"; only overwritten on success below. */
        *populate = 0;

        mmap_assert_write_locked(mm);

        /* A zero-length request is invalid. */
        if (!len)
                return -EINVAL;

        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
         * (the exception is when the underlying filesystem is noexec
         *  mounted, in which case we don't add PROT_EXEC.)
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                if (!(file && path_noexec(&file->f_path)))
                        prot |= PROT_EXEC;

        /* force arch specific MAP_FIXED handling in get_unmapped_area */
        if (flags & MAP_FIXED_NOREPLACE)
                flags |= MAP_FIXED;

        /* Without MAP_FIXED the address is only a hint and may be adjusted. */
        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);

        /*
         * Careful about overflows: a non-zero length that page-aligns to
         * zero is rejected.
         */
        len = PAGE_ALIGN(len);
        if (!len)
                return -ENOMEM;

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EOVERFLOW;

        /* Too many mappings? */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /*
         * addr is returned from get_unmapped_area,
         * There are two cases:
         * 1> MAP_FIXED == false
         *        unallocated memory, no need to check sealing.
         * 2> MAP_FIXED == true
         *        sealing is checked inside mmap_region when
         *        do_vmi_munmap is called.
         */

        /*
         * An execute-only request (prot is exactly PROT_EXEC) asks for an
         * execute-only pkey; a negative result falls back to pkey 0.
         */
        if (prot == PROT_EXEC) {
                pkey = execute_only_pkey(mm);
                if (pkey < 0)
                        pkey = 0;
        }

        /* Do simple checking here so the lower-level routines won't have
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
        addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags);
        if (IS_ERR_VALUE(addr))
                return addr;

        /* MAP_FIXED_NOREPLACE fails rather than overlap an existing VMA. */
        if (flags & MAP_FIXED_NOREPLACE) {
                if (find_vma_intersection(mm, addr, addr + len))
                        return -EEXIST;
        }

        /* MAP_LOCKED requires that the caller is allowed to mlock. */
        if (flags & MAP_LOCKED)
                if (!can_do_mlock())
                        return -EPERM;

        if (!mlock_future_ok(mm, vm_flags, len))
                return -EAGAIN;

        if (file) {
                /* File-backed mapping: validate against the file and inode. */
                struct inode *inode = file_inode(file);
                unsigned long flags_mask;
                int err;

                if (!file_mmap_ok(file, inode, pgoff, len))
                        return -EOVERFLOW;

                /* MAP_SYNC is only accepted if the file advertises support. */
                flags_mask = LEGACY_MAP_MASK;
                if (file->f_op->fop_flags & FOP_MMAP_SYNC)
                        flags_mask |= MAP_SYNC;

                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        /*
                         * Force use of MAP_SHARED_VALIDATE with non-legacy
                         * flags. E.g. MAP_SYNC is dangerous to use with
                         * MAP_SHARED as you don't know which consistency model
                         * you will get. We silently ignore unsupported flags
                         * with MAP_SHARED to preserve backward compatibility.
                         */
                        flags &= LEGACY_MAP_MASK;
                        fallthrough;
                case MAP_SHARED_VALIDATE:
                        if (flags & ~flags_mask)
                                return -EOPNOTSUPP;
                        /* Writable shared mappings need a writable, non-swap file. */
                        if (prot & PROT_WRITE) {
                                if (!(file->f_mode & FMODE_WRITE))
                                        return -EACCES;
                                if (IS_SWAPFILE(file->f_mapping->host))
                                        return -ETXTBSY;
                        }

                        /*
                         * Make sure we don't allow writing to an append-only
                         * file..
                         */
                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        /*
                         * A file not opened for writing keeps VM_MAYSHARE but
                         * loses VM_SHARED and VM_MAYWRITE.
                         */
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
                        fallthrough;
                case MAP_PRIVATE:
                        /* Checks common to shared and private file mappings. */
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
                        if (path_noexec(&file->f_path)) {
                                if (vm_flags & VM_EXEC)
                                        return -EPERM;
                                vm_flags &= ~VM_MAYEXEC;
                        }

                        if (!can_mmap_file(file))
                                return -ENODEV;
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        break;

                default:
                        return -EINVAL;
                }

                /*
                 * Check to see if we are violating any seals and update VMA
                 * flags if necessary to avoid future seal violations.
                 */
                err = memfd_check_seals_mmap(file, &vm_flags);
                if (err)
                        return (unsigned long)err;
        } else {
                /* Anonymous mapping: no file to validate against. */
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        /*
                         * Ignore pgoff.
                         */
                        pgoff = 0;
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        break;
                case MAP_DROPPABLE:
                        /* VM_DROPPABLE equal to VM_NONE means it is unsupported here. */
                        if (VM_DROPPABLE == VM_NONE)
                                return -ENOTSUPP;
                        /*
                         * A locked or stack area makes no sense to be droppable.
                         *
                         * Also, since droppable pages can just go away at any time
                         * it makes no sense to copy them on fork or dump them.
                         *
                         * And don't attempt to combine with hugetlb for now.
                         */
                        if (flags & (MAP_LOCKED | MAP_HUGETLB))
                                return -EINVAL;
                        if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP))
                                return -EINVAL;

                        vm_flags |= VM_DROPPABLE;

                        /*
                         * If the pages can be dropped, then it doesn't make
                         * sense to reserve them.
                         */
                        vm_flags |= VM_NORESERVE;

                        /*
                         * Likewise, they're volatile enough that they
                         * shouldn't survive forks or coredumps.
                         */
                        vm_flags |= VM_WIPEONFORK | VM_DONTDUMP;
                        fallthrough;
                case MAP_PRIVATE:
                        /*
                         * Set pgoff according to addr for anon_vma.
                         */
                        pgoff = addr >> PAGE_SHIFT;
                        break;
                default:
                        return -EINVAL;
                }
        }

        /*
         * Set 'VM_NORESERVE' if we should not account for the
         * memory use of this mapping.
         */
        if (flags & MAP_NORESERVE) {
                /* We honor MAP_NORESERVE if allowed to overcommit */
                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        vm_flags |= VM_NORESERVE;

                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
                if (file && is_file_hugepages(file))
                        vm_flags |= VM_NORESERVE;
        }

        /*
         * Perform the mapping. On success, report the length to populate if
         * the VMA is locked or MAP_POPULATE was given without MAP_NONBLOCK.
         */
        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
        return addr;
}

/*
 * ksys_mmap_pgoff() - Resolve the backing file for an mmap request and map it.
 * @addr:  Requested address (or hint), passed through to vm_mmap_pgoff().
 * @len:   Requested length; rounded up to the huge page size for hugetlb.
 * @prot:  Protection bits, passed through.
 * @flags: MAP_* flags; MAP_ANONYMOUS and MAP_HUGETLB select the file source.
 * @fd:    File descriptor, only consulted when MAP_ANONYMOUS is clear.
 * @pgoff: Page offset, passed through.
 *
 * For fd-backed requests the file is looked up with fget(); MAP_HUGETLB on a
 * file that is not a hugetlb file is rejected with -EINVAL. For anonymous
 * MAP_HUGETLB requests a hugetlb file is created to back the mapping. Any
 * file reference obtained here is dropped before returning.
 *
 * Returns: the result of vm_mmap_pgoff(), or a negative errno cast to
 * unsigned long.
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff)
{
        struct file *file = NULL;
        unsigned long retval;

        if (flags & MAP_ANONYMOUS) {
                if (flags & MAP_HUGETLB) {
                        /* Huge page size selector encoded in the flags. */
                        const unsigned long size_log =
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
                        struct hstate *hs = hstate_sizelog(size_log);

                        if (!hs)
                                return -EINVAL;

                        len = ALIGN(len, huge_page_size(hs));
                        /*
                         * VM_NORESERVE is used because the reservations will
                         * be taken when vm_ops->mmap() is called
                         */
                        file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                                  VM_NORESERVE,
                                                  HUGETLB_ANONHUGE_INODE,
                                                  size_log);
                        if (IS_ERR(file))
                                return PTR_ERR(file);
                }
        } else {
                audit_mmap_fd(fd, flags);
                file = fget(fd);
                if (!file)
                        return -EBADF;

                if (is_file_hugepages(file)) {
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
                } else if (unlikely(flags & MAP_HUGETLB)) {
                        /* MAP_HUGETLB requires a hugetlb-backed file. */
                        fput(file);
                        return -EINVAL;
                }
        }

        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        if (file)
                fput(file);
        return retval;
}

/*
 * mmap_pgoff system call entry point: forwards all six arguments unchanged
 * to ksys_mmap_pgoff() and returns its result.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
{
        return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
/*
 * Argument block for the old_mmap system call, copied in from userspace as a
 * single structure instead of being passed as individual arguments.
 */
struct mmap_arg_struct {
        unsigned long addr;     /* requested address or hint */
        unsigned long len;      /* requested length */
        unsigned long prot;     /* protection bits */
        unsigned long flags;    /* MAP_* flags */
        unsigned long fd;       /* file descriptor */
        unsigned long offset;   /* byte offset; old_mmap requires page alignment */
};

/*
 * old_mmap system call: the arguments arrive packed in a userspace
 * struct mmap_arg_struct. The byte offset must be page-aligned and is
 * converted to a page offset before handing over to ksys_mmap_pgoff().
 *
 * Returns -EFAULT if the structure cannot be copied in, -EINVAL for an
 * unaligned offset, otherwise the result of ksys_mmap_pgoff().
 */
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
        struct mmap_arg_struct args;
        unsigned long pgoff;

        if (copy_from_user(&args, arg, sizeof(args)) != 0)
                return -EFAULT;

        if (offset_in_page(args.offset) != 0)
                return -EINVAL;

        pgoff = args.offset >> PAGE_SHIFT;

        return ksys_mmap_pgoff(args.addr, args.len, args.prot, args.flags,
                               args.fd, pgoff);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

/*
 * Determine whether the allocation must ensure that no existing mapping lies
 * within its guard gap; the result is used as start_gap. Shadow stack
 * mappings get a one page gap, everything else gets none.
 */
static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
{
        return (vm_flags & VM_SHADOW_STACK) ? PAGE_SIZE : 0;
}

/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size.
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
 */
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long addr;

        if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
                addr = unmapped_area_topdown(info);
        else
                addr = unmapped_area(info);

        trace_vm_unmapped_area(addr, info);
        return addr;
}

/*
 * Get an address range which is currently unmapped (bottom-up search).
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert: a return value with the low bits set is an
 * error value, i.e.
 *
 *        if (ret & ~PAGE_MASK)
 *                error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 *
 * MAP_FIXED requests get @addr back untouched. A non-zero @addr is treated as
 * a hint and is honoured after page alignment if the range fits between the
 * neighbouring VMAs (including their gaps); otherwise the area is searched
 * for between mm->mmap_base and the architecture's mmap end.
 */
unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags, vm_flags_t vm_flags)
{
        const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
        struct mm_struct *mm = current->mm;
        struct vm_unmapped_area_info info = {};
        struct vm_area_struct *next, *prev;

        /* The request cannot fit in the usable address range at all. */
        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                next = find_vma_prev(mm, addr, &prev);

                if (addr >= mmap_min_addr && addr <= mmap_end - len) {
                        const bool below_next =
                                !next || addr + len <= vm_start_gap(next);

                        if (below_next &&
                            (!prev || addr >= vm_end_gap(prev)))
                                return addr;
                }
        }

        /* The hint was absent or unusable: search for a free area. */
        info.start_gap = stack_guard_placement(vm_flags);
        info.high_limit = mmap_end;
        info.low_limit = mm->mmap_base;
        info.length = len;
        if (filp && is_file_hugepages(filp))
                info.align_mask = huge_page_mask_align(filp);

        return vm_unmapped_area(&info);
}

#ifndef HAVE_ARCH_UNMAPPED_AREA
/*
 * Default arch_get_unmapped_area(), compiled only when
 * HAVE_ARCH_UNMAPPED_AREA is not defined: forwards every argument unchanged
 * to generic_get_unmapped_area().
 */
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                       unsigned long len, unsigned long pgoff,
                       unsigned long flags, vm_flags_t vm_flags)
{
        return generic_get_unmapped_area(filp, addr, len, pgoff, flags,
                                         vm_flags);
}
#endif

/*
 * Top-down flavour of the mmap allocator: new areas are allocated downwards
 * from below the stack's low limit (the base). If the top-down search fails,
 * a bottom-up search is attempted before giving up.
 */
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags, vm_flags_t vm_flags)
{
        const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
        struct mm_struct *mm = current->mm;
        struct vm_unmapped_area_info info = {};
        struct vm_area_struct *next, *prev;

        /* The requested length is too big for the entire address space. */
        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        /* A specific address was requested: use it if the range is free. */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                next = find_vma_prev(mm, addr, &prev);

                if (addr >= mmap_min_addr && addr <= mmap_end - len) {
                        const bool below_next =
                                !next || addr + len <= vm_start_gap(next);

                        if (below_next &&
                            (!prev || addr >= vm_end_gap(prev)))
                                return addr;
                }
        }

        info.start_gap = stack_guard_placement(vm_flags);
        info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
        info.low_limit = PAGE_SIZE;
        info.length = len;
        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        if (filp && is_file_hugepages(filp))
                info.align_mask = huge_page_mask_align(filp);

        addr = vm_unmapped_area(&info);
        if (!offset_in_page(addr))
                return addr;

        /*
         * A failed mmap() very likely causes application failure, so retry
         * with a bottom-up search. This scenario can happen with large stack
         * limits and large mmap() allocations.
         */
        VM_BUG_ON(addr != -ENOMEM);
        info.flags = 0;
        info.low_limit = TASK_UNMAPPED_BASE;
        info.high_limit = mmap_end;

        return vm_unmapped_area(&info);
}

#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
/*
 * Default arch_get_unmapped_area_topdown(), compiled only when
 * HAVE_ARCH_UNMAPPED_AREA_TOPDOWN is not defined: forwards every argument
 * unchanged to generic_get_unmapped_area_topdown().
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                               unsigned long len, unsigned long pgoff,
                               unsigned long flags, vm_flags_t vm_flags)
{
        return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags,
                                                 vm_flags);
}
#endif

/*
 * Pick the unmapped-area search for @mm: the top-down variant when the mm has
 * MMF_TOPDOWN set, the bottom-up variant otherwise. All other arguments are
 * passed through unchanged.
 */
unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
                                           unsigned long addr, unsigned long len,
                                           unsigned long pgoff, unsigned long flags,
                                           vm_flags_t vm_flags)
{
        if (!mm_flags_test(MMF_TOPDOWN, mm))
                return arch_get_unmapped_area(filp, addr, len, pgoff, flags,
                                              vm_flags);

        return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags,
                                              vm_flags);
}

/*
 * __get_unmapped_area() - Select and validate the address for a new mapping.
 *
 * The lookup is delegated, in order of preference, to the file's own
 * get_unmapped_area operation, to shmem's for anonymous shared mappings, to
 * the THP-aware helper for hint-less PMD-sized anonymous requests, or to the
 * mm's default search. The chosen address must leave room for @len below
 * TASK_SIZE, be page-aligned and pass security_mmap_addr().
 *
 * Returns: the address on success, otherwise an error value.
 */
unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
        unsigned long (*get_area)(struct file *, unsigned long,
                                  unsigned long, unsigned long, unsigned long)
                                  = NULL;
        unsigned long error;

        error = arch_mmap_check(addr, len, flags);
        if (error)
                return error;

        /* Careful about overflows.. */
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (!file) {
                /* Always treat pgoff as zero for anonymous memory. */
                pgoff = 0;

                /*
                 * mmap_region() will call shmem_zero_setup() to create a file,
                 * so use shmem's get_unmapped_area in case it can be huge.
                 */
                if (flags & MAP_SHARED)
                        get_area = shmem_get_unmapped_area;
        } else if (file->f_op->get_unmapped_area) {
                get_area = file->f_op->get_unmapped_area;
        }

        if (get_area)
                addr = get_area(file, addr, len, pgoff, flags);
        else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
                 && !addr /* no hint */
                 && IS_ALIGNED(len, PMD_SIZE))
                /* Ensures that larger anonymous mappings are THP aligned. */
                addr = thp_get_unmapped_area_vmflags(file, addr, len,
                                                     pgoff, flags, vm_flags);
        else
                addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
                                                    pgoff, flags, vm_flags);

        if (IS_ERR_VALUE(addr))
                return addr;
        if (addr > TASK_SIZE - len)
                return -ENOMEM;
        if (offset_in_page(addr))
                return -EINVAL;

        error = security_mmap_addr(addr);
        if (error)
                return error;

        return addr;
}

/*
 * Convenience form of mm_get_unmapped_area_vmflags() for callers that have no
 * VMA flags to supply: passes 0 as vm_flags and forwards everything else.
 */
unsigned long
mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
                     unsigned long addr, unsigned long len,
                     unsigned long pgoff, unsigned long flags)
{
        return mm_get_unmapped_area_vmflags(mm, file, addr, len,
                                            pgoff, flags, 0);
}
EXPORT_SYMBOL(mm_get_unmapped_area);

/**
 * find_vma_intersection() - Look up the first VMA which intersects the interval
 * @mm: The process address space.
 * @start_addr: The inclusive start user address.
 * @end_addr: The exclusive end user address.
 *
 * The caller is asserted to hold the mmap lock. Assumes
 * start_addr < end_addr.
 *
 * Returns: The first VMA within the provided range, %NULL otherwise.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
                                             unsigned long start_addr,
                                             unsigned long end_addr)
{
        /* The tree search takes an inclusive upper bound. */
        const unsigned long last = end_addr - 1;
        unsigned long cursor = start_addr;

        mmap_assert_locked(mm);

        return mt_find(&mm->mm_mt, &cursor, last);
}
EXPORT_SYMBOL(find_vma_intersection);

/**
 * find_vma() - Find the VMA for a given address, or the next VMA.
 * @mm: The mm_struct to check
 * @addr: The address
 *
 * The caller is asserted to hold the mmap lock. The search runs from @addr up
 * to %ULONG_MAX.
 *
 * Returns: The VMA associated with addr, or the next VMA.
 * May return %NULL in the case of no VMA at addr or above.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
        unsigned long cursor = addr;

        mmap_assert_locked(mm);

        return mt_find(&mm->mm_mt, &cursor, ULONG_MAX);
}
EXPORT_SYMBOL(find_vma);

/**
 * find_vma_prev() - Find the VMA for a given address, or the next vma and
 * set @pprev to the previous VMA, if any.
 * @mm: The mm_struct to check
 * @addr: The address
 * @pprev: The pointer to set to the previous VMA
 *
 * Note that RCU lock is missing here since the external mmap_lock() is used
 * instead.
 *
 * Returns: The VMA associated with @addr, or the next vma.
 * May return %NULL in the case of no vma at addr or above.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
                        struct vm_area_struct **pprev)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, addr);

        /* Load whatever VMA, if any, sits at @addr itself. */
        vma = vma_iter_load(&vmi);
        /* Step the same iterator back; @pprev is always written. */
        *pprev = vma_prev(&vmi);
        /* Nothing at @addr: move forward to the next VMA instead. */
        if (!vma)
                vma = vma_next(&vmi);
        return vma;
}

/* Gap enforced between the expanding stack and other mappings, in bytes. */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;

/*
 * Handle the "stack_guard_gap=" boot parameter. The argument is a decimal
 * number of pages; it only takes effect when the whole string parses as a
 * number, otherwise the current value is kept. Always returns 1.
 */
static int __init cmdline_parse_stack_guard_gap(char *p)
{
        char *rest;
        const unsigned long pages = simple_strtoul(p, &rest, 10);

        if (*rest == '\0')
                stack_guard_gap = pages << PAGE_SHIFT;

        return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);

#ifdef CONFIG_STACK_GROWSUP
/*
 * CONFIG_STACK_GROWSUP variant: expanding the stack means growing @vma
 * upwards to cover @address. Returns the result of expand_upwards().
 */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
        return expand_upwards(vma, address);
}

/*
 * CONFIG_STACK_GROWSUP variant: return the VMA containing the page of @addr,
 * or else try to grow the preceding VMA upwards to reach it. When the
 * expanded VMA is VM_LOCKED, the range from @addr to its end is populated.
 *
 * Returns the VMA on success, NULL if there is no preceding VMA or the
 * expansion fails.
 */
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *hit, *below;

        addr &= PAGE_MASK;

        hit = find_vma_prev(mm, addr, &below);
        if (hit && hit->vm_start <= addr)
                return hit;

        if (!below || expand_stack_locked(below, addr))
                return NULL;

        if (below->vm_flags & VM_LOCKED)
                populate_vma_page_range(below, addr, below->vm_end, NULL);

        return below;
}
#else
/*
 * Variant used when CONFIG_STACK_GROWSUP is not set: expanding the stack
 * means growing @vma downwards to cover @address. Returns the result of
 * expand_downwards().
 */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
        return expand_downwards(vma, address);
}

/*
 * Variant used when CONFIG_STACK_GROWSUP is not set: return the VMA
 * containing the page of @addr, or else try to grow the next VMA downwards to
 * reach it. When the expanded VMA is VM_LOCKED, the newly covered range (from
 * @addr up to the VMA's previous start) is populated.
 *
 * Returns the VMA on success, NULL if there is no VMA at or above @addr or
 * the expansion fails.
 */
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *stack;
        unsigned long old_start;

        addr &= PAGE_MASK;

        stack = find_vma(mm, addr);
        if (!stack)
                return NULL;
        if (addr >= stack->vm_start)
                return stack;

        /* Remember where the VMA began before it is grown. */
        old_start = stack->vm_start;
        if (expand_stack_locked(stack, addr) != 0)
                return NULL;

        if (stack->vm_flags & VM_LOCKED)
                populate_vma_page_range(stack, addr, old_start, NULL);

        return stack;
}
#endif

/*
 * Direction-specific expansion helpers used by expand_stack(). Only the
 * direction matching the stack growth configuration performs an expansion;
 * the other one evaluates to -EFAULT without using its arguments.
 */
#if defined(CONFIG_STACK_GROWSUP)

#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
#define vma_expand_down(vma, addr) (-EFAULT)

#else

#define vma_expand_up(vma,addr) (-EFAULT)
#define vma_expand_down(vma, addr) expand_downwards(vma, addr)

#endif

/*
 * expand_stack(): legacy interface for page faulting. Don't use unless
 * you have to.
 *
 * Entered with the mm locked for reading. The read lock is dropped, the
 * write lock is taken, the VMA is looked up again and expanded if needed,
 * and the lock is then downgraded back to a read lock.
 *
 * Returns the VMA covering @addr with the lock held for reading, or NULL
 * with the lock dropped if no VMA is found, it cannot be expanded, or
 * taking the write lock was interrupted.
 */
struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma, *prev;

        mmap_read_unlock(mm);
        if (mmap_write_lock_killable(mm))
                return NULL;

        vma = find_vma_prev(mm, addr, &prev);

        /* Expansion is only needed if @addr is not already inside a VMA. */
        if (!(vma && vma->vm_start <= addr)) {
                if (prev && !vma_expand_up(prev, addr)) {
                        /* The previous VMA grew upwards to cover @addr. */
                        vma = prev;
                } else if (!vma || vma_expand_down(vma, addr)) {
                        /* Neither neighbour could be expanded. */
                        mmap_write_unlock(mm);
                        return NULL;
                }
        }

        mmap_write_downgrade(mm);
        return vma;
}

/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length to be munmapped.
 * @uf: The userfaultfd list_head
 *
 * Sets up a VMA iterator positioned at @start and hands the request to
 * do_vmi_munmap(), passing false as its final argument.
 *
 * Return: 0 on success, error otherwise.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
              struct list_head *uf)
{
        VMA_ITERATOR(vmi, mm, start);

        /* NOTE(review): the trailing false presumably means "keep the mmap lock held" - confirm against do_vmi_munmap(). */
        return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}

/*
 * Exported unmap entry point: forwards to __vm_munmap() with false as the
 * last argument (the munmap system call passes true instead).
 * NOTE(review): the meaning of that flag is defined by __vm_munmap(), which
 * is not visible here.
 */
int vm_munmap(unsigned long start, size_t len)
{
        return __vm_munmap(start, len, false);
}
EXPORT_SYMBOL(vm_munmap);

/*
 * munmap system call: strips any address tag from @addr with
 * untagged_addr() and then unmaps through __vm_munmap(), passing true as
 * the last argument.
 */
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
        addr = untagged_addr(addr);
        return __vm_munmap(addr, len, true);
}


/*
 * Emulation of deprecated remap_file_pages() syscall.
 *
 * The request is emulated by re-mapping the file backing the existing shared
 * VMA at @start with do_mmap(), using MAP_SHARED | MAP_FIXED | MAP_POPULATE
 * and the new @pgoff. @prot must be zero; the effective protection is taken
 * from the existing VMA. Only MAP_NONBLOCK is honoured from @flags.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{

        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long populate = 0;
        unsigned long ret = -EINVAL;
        struct file *file;
        vm_flags_t vm_flags;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
                     current->comm, current->pid);

        /* A non-zero prot argument is rejected with -EINVAL. */
        if (prot)
                return ret;
        /* Mask both start and size down to page granularity. */
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        /* Reject empty ranges and ranges that wrap around. */
        if (start + size <= start)
                return ret;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return ret;

        if (mmap_read_lock_killable(mm))
                return -EINTR;

        /*
         * Look up VMA under read lock first so we can perform the security
         * without holding locks (which can be problematic). We reacquire a
         * write lock later and check nothing changed underneath us.
         */
        vma = vma_lookup(mm, start);

        /* Only an existing shared mapping can be remapped. */
        if (!vma || !(vma->vm_flags & VM_SHARED)) {
                mmap_read_unlock(mm);
                return -EINVAL;
        }

        /* Derive the protection bits from the existing VMA's access flags. */
        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
        prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
        prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

        /* Keep only MAP_NONBLOCK from the caller, then force the rest. */
        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;

        /* Save vm_flags used to calculate prot and flags, and recheck later. */
        vm_flags = vma->vm_flags;
        /* Take a file reference so it stays valid once the lock is dropped. */
        file = get_file(vma->vm_file);

        mmap_read_unlock(mm);

        /* Call outside mmap_lock to be consistent with other callers. */
        ret = security_mmap_file(file, prot, flags);
        if (ret) {
                fput(file);
                return ret;
        }

        /* Default result for the "something changed" exits below. */
        ret = -EINVAL;

        /* OK security check passed, take write lock + let it rip. */
        if (mmap_write_lock_killable(mm)) {
                fput(file);
                return -EINTR;
        }

        vma = vma_lookup(mm, start);

        if (!vma)
                goto out;

        /* Make sure things didn't change under us. */
        if (vma->vm_flags != vm_flags)
                goto out;
        if (vma->vm_file != file)
                goto out;

        /*
         * The range extends past the first VMA: every following VMA up to
         * start + size must be contiguous with its predecessor and share the
         * first VMA's file and flags.
         */
        if (start + size > vma->vm_end) {
                VMA_ITERATOR(vmi, mm, vma->vm_end);
                struct vm_area_struct *next, *prev = vma;

                for_each_vma_range(vmi, next, start + size) {
                        /* hole between vmas ? */
                        if (next->vm_start != prev->vm_end)
                                goto out;

                        if (next->vm_file != vma->vm_file)
                                goto out;

                        if (next->vm_flags != vma->vm_flags)
                                goto out;

                        /* This VMA covers the end of the range: done. */
                        if (start + size <= next->vm_end)
                                break;

                        prev = next;
                }

                /* Ran out of VMAs before the whole range was covered. */
                if (!next)
                        goto out;
        }

        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, 0, pgoff, &populate, NULL);
out:
        mmap_write_unlock(mm);
        fput(file);
        /* populate is only non-zero if do_mmap() asked for population. */
        if (populate)
                mm_populate(ret, populate);
        /* Callers get 0 on success rather than the mapped address. */
        if (!IS_ERR_VALUE(ret))
                ret = 0;
        return ret;
}

/*
 * vm_brk_flags() - Set up a brk-style mapping of @request bytes at @addr.
 * @addr:     Start address of the range.
 * @request:  Requested length in bytes; rounded up to a page multiple.
 * @vm_flags: Extra VMA flags; anything other than VM_EXEC is refused.
 *
 * Takes the mmap write lock, checks the brk limits, unmaps whatever currently
 * occupies the range and then creates the new mapping via do_brk_flags().
 * If the mm's default flags include VM_LOCKED the range is populated after
 * the lock has been released.
 *
 * Returns 0 on success (including a zero-length request), -ENOMEM if the
 * length overflows when page-aligned, -EINVAL for unsupported flags, -EINTR
 * if taking the lock was interrupted, or the error from a failing step.
 */
int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *prev = NULL;
        unsigned long len = PAGE_ALIGN(request);
        bool want_populate;
        int err;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, addr);

        if (len < request)
                return -ENOMEM;
        if (len == 0)
                return 0;

        /* Until we need other flags, refuse anything except VM_EXEC. */
        if (vm_flags & ~VM_EXEC)
                return -EINVAL;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        err = check_brk_limits(addr, len);
        if (!err)
                err = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
        if (err) {
                /* Limit check or unmap failed: just drop the lock. */
                mmap_write_unlock(mm);
                return err;
        }

        prev = vma_prev(&vmi);
        err = do_brk_flags(&vmi, prev, addr, len, vm_flags);
        want_populate = (mm->def_flags & VM_LOCKED) != 0;
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);

        if (want_populate && !err)
                mm_populate(addr, len);

        return err;
}
EXPORT_SYMBOL(vm_brk_flags);

/*
 * Release all mmaps.
 *
 * Tears down the whole address space of @mm: unmaps every VMA, frees the
 * page tables, then closes and frees each VMA and finally destroys the maple
 * tree. Unmapping runs under the mmap read lock; everything from
 * mt_clear_in_rcu() onwards runs under the mmap write lock.
 *
 * The walk stops at the first XA_ZERO_ENTRY, the marker dup_mmap() stores
 * where a failed duplication gave up. If the very first entry is missing or
 * is that marker, only the tree itself is destroyed.
 */
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        /* Pages of VM_ACCOUNT vmas freed below, un-accounted at the end. */
        unsigned long nr_accounted = 0;
        VMA_ITERATOR(vmi, mm, 0);
        /* Number of vmas removed, cross-checked against mm->map_count. */
        int count = 0;

        /* mm's last user has gone, and it's about to be pulled down */
        mmu_notifier_release(mm);

        mmap_read_lock(mm);
        arch_exit_mmap(mm);

        vma = vma_next(&vmi);
        if (!vma || unlikely(xa_is_zero(vma))) {
                /* Can happen if dup_mmap() received an OOM */
                mmap_read_unlock(mm);
                /* The destroy label expects the write lock to be held. */
                mmap_write_lock(mm);
                goto destroy;
        }

        flush_cache_mm(mm);
        tlb_gather_mmu_fullmm(&tlb, mm);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
        mmap_read_unlock(mm);

        /*
         * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
         * because the memory has been already freed.
         */
        mm_flags_set(MMF_OOM_SKIP, mm);
        mmap_write_lock(mm);
        mt_clear_in_rcu(&mm->mm_mt);
        /* Reposition the iterator just past the first vma before each walk. */
        vma_iter_set(&vmi, vma->vm_end);
        free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
                      USER_PGTABLES_CEILING, true);
        tlb_finish_mmu(&tlb);

        /*
         * Walk the list again, actually closing and freeing it, with preemption
         * enabled, without holding any MM locks besides the unreachable
         * mmap_write_lock.
         */
        vma_iter_set(&vmi, vma->vm_end);
        do {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                vma_mark_detached(vma);
                remove_vma(vma);
                count++;
                cond_resched();
                vma = vma_next(&vmi);
        } while (vma && likely(!xa_is_zero(vma)));

        /* Every vma counted in map_count must have been removed above. */
        BUG_ON(count != mm->map_count);

        trace_exit_mmap(mm);
destroy:
        /* Reached with the mmap write lock held on both paths. */
        __mt_destroy(&mm->mm_mt);
        mmap_write_unlock(mm);
        /* nr_accounted is still 0 when the early-exit path was taken. */
        vm_unacct_memory(nr_accounted);
}

/*
 * Check whether @mm may grow by @npages pages of a mapping with @flags.
 *
 * The growth is refused when it would push total_vm past RLIMIT_AS. For
 * data mappings it is additionally checked against RLIMIT_DATA: exceeding
 * that limit logs a one-time warning and is refused unless
 * ignore_rlimit_data is set. A soft RLIMIT_DATA of 0 is tolerated as long as
 * the hard limit is respected (workaround for Valgrind).
 *
 * Returns true if the expansion is allowed, false otherwise.
 */
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                return false;

        /* Only data mappings are subject to RLIMIT_DATA. */
        if (!is_data_mapping(flags))
                return true;

        if (mm->data_vm + npages <= rlimit(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        /* Workaround for Valgrind */
        if (rlimit(RLIMIT_DATA) == 0 &&
            mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
                     current->comm, current->pid,
                     (mm->data_vm + npages) << PAGE_SHIFT,
                     rlimit(RLIMIT_DATA),
                     ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");

        /* Over the limit: allowed only when the limit is being ignored. */
        if (ignore_rlimit_data)
                return true;

        return false;
}

/*
 * Adjust the VM size counters of @mm by @npages (which may be negative).
 *
 * total_vm is always updated. In addition at most one of exec_vm, stack_vm
 * or data_vm is updated, chosen by classifying @flags in that order.
 */
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
        WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);

        if (is_exec_mapping(flags)) {
                mm->exec_vm += npages;
                return;
        }

        if (is_stack_mapping(flags)) {
                mm->stack_vm += npages;
                return;
        }

        if (is_data_mapping(flags))
                mm->data_vm += npages;
}

static vm_fault_t special_mapping_fault(struct vm_fault *vmf);

/*
 * ->close hook of special mappings, called for unmap() and on the old vma
 * for mremap().
 *
 * Forwards to the ->close callback of the vm_special_mapping stored in
 * vma->vm_private_data, if it has one.
 *
 * Having a close hook prevents vma merging regardless of flags.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
        const struct vm_special_mapping *mapping = vma->vm_private_data;

        if (!mapping->close)
                return;

        mapping->close(mapping, vma);
}

static const char *special_mapping_name(struct vm_area_struct *vma)
{
        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

/*
 * ->mremap hook of special mappings.
 *
 * Warns once and fails with -EFAULT when @new_vma does not belong to the
 * current task's mm. Otherwise returns the result of the mapping's own
 * ->mremap callback, or 0 when it has none.
 */
static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
        struct vm_special_mapping *mapping = new_vma->vm_private_data;

        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
                return -EFAULT;

        return mapping->mremap ? mapping->mremap(mapping, new_vma) : 0;
}

/*
 * ->may_split hook of special mappings: splitting is always refused.
 *
 * The kernel has expectations about the number of pages in a special
 * mapping. Together with VM_DONTEXPAND this keeps the size of the vma
 * unchanged over the mapping's lifetime.
 *
 * Both arguments are ignored. Always returns -EINVAL.
 */
static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
{
        return -EINVAL;
}

/*
 * vm_operations used for VMAs created by _install_special_mapping().
 * The hooks consult the vm_special_mapping stored in vma->vm_private_data.
 */
static const struct vm_operations_struct special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
        .mremap = special_mapping_mremap,
        .name = special_mapping_name,
        /* vDSO code relies on VVAR not being accessible remotely */
        .access = NULL,
        /* Splitting is always refused, see special_mapping_split(). */
        .may_split = special_mapping_split,
};

/*
 * ->fault hook of special mappings.
 *
 * A mapping that provides its own ->fault callback handles the fault itself.
 * Otherwise vmf->pgoff is used as an index into the mapping's NULL-terminated
 * page array. If a page is found, a reference is taken on it and it is
 * returned through vmf->page. An offset at or past the terminator yields
 * VM_FAULT_SIGBUS.
 */
static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct vm_special_mapping *mapping = vma->vm_private_data;
        struct page **slot;
        pgoff_t remaining;

        if (mapping->fault)
                return mapping->fault(mapping, vma, vmf);

        /* Step one slot per page of offset, never past the terminator. */
        slot = mapping->pages;
        remaining = vmf->pgoff;
        while (remaining && *slot) {
                remaining--;
                slot++;
        }

        if (!*slot)
                return VM_FAULT_SIGBUS;

        get_page(*slot);
        vmf->page = *slot;
        return 0;
}

/*
 * Allocate a vma covering [addr, addr + len) in @mm, attach @ops and @priv
 * to it and insert it into the address space.
 *
 * The vma flags are @vm_flags combined with mm->def_flags, VM_DONTEXPAND and
 * VM_SOFTDIRTY, with the VM_LOCKED_MASK bits removed.
 *
 * Returns the new vma on success. On failure the vma is freed and an
 * ERR_PTR() is returned: -ENOMEM if the allocation failed, otherwise the
 * error from insert_vm_struct().
 */
static struct vm_area_struct *__install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        vm_flags_t vm_flags, void *priv,
        const struct vm_operations_struct *ops)
{
        struct vm_area_struct *vma = vm_area_alloc(mm);
        vm_flags_t flags;
        int err;

        if (unlikely(!vma))
                return ERR_PTR(-ENOMEM);

        flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
        flags &= ~VM_LOCKED_MASK;

        vma_set_range(vma, addr, addr + len, 0);
        vm_flags_init(vma, flags);
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        vma->vm_ops = ops;
        vma->vm_private_data = priv;

        err = insert_vm_struct(mm, vma);
        if (err) {
                vm_area_free(vma);
                return ERR_PTR(err);
        }

        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
        perf_event_mmap(vma);

        return vma;
}

bool vma_is_special_mapping(const struct vm_area_struct *vma,
        const struct vm_special_mapping *sm)
{
        return vma->vm_private_data == sm &&
                vma->vm_ops == &special_mapping_vmops;
}

/*
 * Insert a new special-mapping vma covering [addr, addr + len) into @mm with
 * the given flags. Must be called with mm->mmap_lock held for writing.
 *
 * The pages backing the mapping come from the struct page * array in @spec.
 * That array may be shorter than len >> PAGE_SHIFT provided it is
 * null-terminated; accesses past the last supplied page always produce
 * SIGBUS. Both the array and the pages it points to are assumed to stay
 * alive for as long as the mapping might exist.
 *
 * Returns the new vma or an ERR_PTR() on failure.
 */
struct vm_area_struct *_install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        vm_flags_t vm_flags, const struct vm_special_mapping *spec)
{
        /* The vma stores @spec as its untyped private data. */
        void *priv = (void *)spec;

        return __install_special_mapping(mm, addr, len, vm_flags, priv,
                                        &special_mapping_vmops);
}

#ifdef CONFIG_SYSCTL
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
                defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif

static const struct ctl_table mmap_table[] = {
                {
                                .procname       = "max_map_count",
                                .data           = &sysctl_max_map_count,
                                .maxlen         = sizeof(sysctl_max_map_count),
                                .mode           = 0644,
                                .proc_handler   = proc_dointvec_minmax,
                                .extra1         = SYSCTL_ZERO,
                },
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
                defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
                {
                                .procname       = "legacy_va_layout",
                                .data           = &sysctl_legacy_va_layout,
                                .maxlen         = sizeof(sysctl_legacy_va_layout),
                                .mode           = 0644,
                                .proc_handler   = proc_dointvec_minmax,
                                .extra1         = SYSCTL_ZERO,
                },
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
                {
                                .procname       = "mmap_rnd_bits",
                                .data           = &mmap_rnd_bits,
                                .maxlen         = sizeof(mmap_rnd_bits),
                                .mode           = 0600,
                                .proc_handler   = proc_dointvec_minmax,
                                .extra1         = (void *)&mmap_rnd_bits_min,
                                .extra2         = (void *)&mmap_rnd_bits_max,
                },
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
                {
                                .procname       = "mmap_rnd_compat_bits",
                                .data           = &mmap_rnd_compat_bits,
                                .maxlen         = sizeof(mmap_rnd_compat_bits),
                                .mode           = 0600,
                                .proc_handler   = proc_dointvec_minmax,
                                .extra1         = (void *)&mmap_rnd_compat_bits_min,
                                .extra2         = (void *)&mmap_rnd_compat_bits_max,
                },
#endif
};
#endif /* CONFIG_SYSCTL */

/*
 * Boot-time initialisation of the mmap code: set up the vm_committed_as
 * percpu counter, register the "vm" sysctl table when CONFIG_SYSCTL is
 * enabled, and initialise VMA state.
 */
void __init mmap_init(void)
{
        int err = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);

        VM_BUG_ON(err);
#ifdef CONFIG_SYSCTL
        register_sysctl_init("vm", mmap_table);
#endif
        vma_state_init();
}

/*
 * Set sysctl_user_reserve_kbytes to its default.
 *
 * The user reserve is meant to stop a user from starting a single memory
 * hogging process that they then cannot recover from (i.e. kill the hog) in
 * OVERCOMMIT_NEVER mode.
 *
 * The default is min(1/32 of free memory (about 3%), 128MB). 128MB is
 * enough to recover with sshd/login, bash, and top/kill.
 *
 * Always returns 0.
 */
static int init_user_reserve(void)
{
        unsigned long free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

        sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);

        return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Set sysctl_admin_reserve_kbytes to its default.
 *
 * The admin reserve exists so that the sys admin can still log in and kill
 * a memory hogging process.
 *
 * The default is min(1/32 of free memory (about 3%), 8MB). Systems with
 * more than 256MB free therefore reserve 8MB, enough to recover with sshd,
 * bash, and top in OVERCOMMIT_GUESS; smaller systems reserve only about 3%
 * of free pages.
 *
 * Always returns 0.
 */
static int init_admin_reserve(void)
{
        unsigned long free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

        sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);

        return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise user and admin reserves if memory is added or removed.
 *
 * The default user reserve max is 128MB, and the default max for the admin
 * reserve is 8MB. These are usually, but not always, enough to enable
 * recovery from a memory hogging process using login/sshd, a shell, and
 * tools like top. The admin may have raised or disabled a reserve, for
 * instance because of swap or different recovery tools.
 *
 * MEM_ONLINE: a reserve that is non-zero and below its default max is
 * recomputed. A reserve that was eliminated or raised to the default max or
 * above is trusted and left alone.
 *
 * MEM_OFFLINE: a reserve larger than the currently free memory is reset and
 * the new value is logged. Otherwise the admin's setting is kept.
 *
 * Every other action is ignored. Always returns NOTIFY_OK.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
                             unsigned long action, void *data)
{
        unsigned long reserve, free_kbytes;

        if (action == MEM_ONLINE) {
                /* Default max is 128MB. Leave alone if modified by operator. */
                reserve = sysctl_user_reserve_kbytes;
                if (reserve > 0 && reserve < SZ_128K)
                        init_user_reserve();

                /* Default max is 8MB.  Leave alone if modified by operator. */
                reserve = sysctl_admin_reserve_kbytes;
                if (reserve > 0 && reserve < SZ_8K)
                        init_admin_reserve();
        } else if (action == MEM_OFFLINE) {
                free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

                if (sysctl_user_reserve_kbytes > free_kbytes) {
                        init_user_reserve();
                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
                                sysctl_user_reserve_kbytes);
                }

                if (sysctl_admin_reserve_kbytes > free_kbytes) {
                        init_admin_reserve();
                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
                                sysctl_admin_reserve_kbytes);
                }
        }

        return NOTIFY_OK;
}

/*
 * Register reserve_mem_notifier() as a memory hotplug notifier.
 *
 * A registration failure is only logged. Always returns 0.
 */
static int __meminit init_reserve_notifier(void)
{
        int err = hotplug_memory_notifier(reserve_mem_notifier,
                                          DEFAULT_CALLBACK_PRI);

        if (err)
                pr_err("Failed registering memory add/remove notifier for admin reserve\n");

        return 0;
}
subsys_initcall(init_reserve_notifier);

/*
 * Take mm->mmap_lock for reading, first growing a downward-growing stack if
 * that is needed to cover @addr.
 *
 * Expansion is attempted only when @write is set and @addr lies below the
 * start of @new_vma. In that case the vma must be VM_GROWSDOWN; the mmap
 * write lock is taken for the expansion and downgraded to a read lock
 * afterwards.
 *
 * This function is intended only for obtaining an argument page from an ELF
 * image, and is almost certainly NOT what you want to use for any other
 * purpose.
 *
 * IMPORTANT - fields of @new_vma are read before any mmap lock is held, so
 * the vma must not be linked in any user-visible tree, i.e. it must be a new
 * vma being mapped.
 *
 * @addr is assumed to be either inside the vma or below it; nothing beyond
 * that is validated.
 *
 * Returns true with the read lock held, false with no lock held if the vma
 * cannot grow down or the expansion failed.
 */
bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
                                 struct vm_area_struct *new_vma,
                                 unsigned long addr, bool write)
{
        if (write && addr < new_vma->vm_start) {
                if (!(new_vma->vm_flags & VM_GROWSDOWN))
                        return false;

                mmap_write_lock(mm);
                if (expand_downwards(new_vma, addr)) {
                        mmap_write_unlock(mm);
                        return false;
                }

                /* Callers expect to end up holding only the read lock. */
                mmap_write_downgrade(mm);
                return true;
        }

        /* No expansion required: a plain read lock is enough. */
        mmap_read_lock(mm);
        return true;
}

/*
 * Duplicate the address space of @oldmm into the new @mm.
 *
 * Both mmap locks are held for writing while the copy is made. The maple
 * tree of @oldmm is cloned with __mt_dup(), then every entry is either
 * replaced by a copy of the parent's vma or, for VM_DONTCOPY vmas, cleared.
 *
 * Returns 0 on success. Returns -EINTR if the lock on @oldmm could not be
 * taken or a fatal signal is pending, -ENOMEM for the failures that go
 * through the fail_nomem* labels, or otherwise the error returned by the
 * failing helper.
 *
 * If duplication fails after the tree was cloned, the first vma that was not
 * duplicated is overwritten with XA_ZERO_ENTRY so that exit_mmap() knows
 * where to stop, and @mm is flagged MMF_UNSTABLE.
 */
__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp;
        int retval;
        /* Pages charged for the vma currently being copied. */
        unsigned long charge = 0;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, 0);

        if (mmap_write_lock_killable(oldmm))
                return -EINTR;
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

        /* No ordering required: file already has been exposed. */
        dup_mm_exe_file(mm, oldmm);

        /* Start from the parent's counters; VM_DONTCOPY vmas are subtracted below. */
        mm->total_vm = oldmm->total_vm;
        mm->data_vm = oldmm->data_vm;
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;

        /* Use __mt_dup() to efficiently build an identical maple tree. */
        retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
        if (unlikely(retval))
                goto out;

        mt_clear_in_rcu(vmi.mas.tree);
        /* The cloned tree still holds the parent's vmas; walk and replace them. */
        for_each_vma(vmi, mpnt) {
                struct file *file;

                vma_start_write(mpnt);
                if (mpnt->vm_flags & VM_DONTCOPY) {
                        /* Not inherited: drop the entry from the child's tree. */
                        retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
                                                    mpnt->vm_end, GFP_KERNEL);
                        if (retval)
                                goto loop_out;

                        vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                /*
                 * Don't duplicate many vmas if we've been oom-killed (for
                 * example)
                 */
                if (fatal_signal_pending(current)) {
                        retval = -EINTR;
                        goto loop_out;
                }
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);

                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        charge = len;
                }

                tmp = vm_area_dup(mpnt);
                if (!tmp)
                        goto fail_nomem;
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
                tmp->vm_mm = mm;
                retval = dup_userfaultfd(tmp, &uf);
                if (retval)
                        goto fail_nomem_anon_vma_fork;
                if (tmp->vm_flags & VM_WIPEONFORK) {
                        /*
                         * VM_WIPEONFORK gets a clean slate in the child.
                         * Don't prepare anon_vma until fault since we don't
                         * copy page for current vma.
                         */
                        tmp->anon_vma = NULL;
                } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                vm_flags_clear(tmp, VM_LOCKED_MASK);
                /*
                 * Copy/update hugetlb private vma information.
                 */
                if (is_vm_hugetlb_page(tmp))
                        hugetlb_dup_vma_private(tmp);

                /*
                 * Link the vma into the MT. After using __mt_dup(), memory
                 * allocation is not necessary here, so it cannot fail.
                 */
                vma_iter_bulk_store(&vmi, tmp);

                mm->map_count++;

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                file = tmp->vm_file;
                if (file) {
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        i_mmap_lock_write(mapping);
                        if (vma_is_shared_maywrite(tmp))
                                mapping_allow_writable(mapping);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_interval_tree_insert_after(tmp, mpnt,
                                        &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        i_mmap_unlock_write(mapping);
                }

                /* retval is still 0 here if the copy is skipped. */
                if (!(tmp->vm_flags & VM_WIPEONFORK))
                        retval = copy_page_range(tmp, mpnt);

                if (retval) {
                        /*
                         * tmp is already in the tree, so the failure marker
                         * set at loop_out belongs on the next vma.
                         */
                        mpnt = vma_next(&vmi);
                        goto loop_out;
                }
        }
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
loop_out:
        vma_iter_free(&vmi);
        if (!retval) {
                mt_set_in_rcu(vmi.mas.tree);
                ksm_fork(mm, oldmm);
                khugepaged_fork(mm, oldmm);
        } else {

                /*
                 * The entire maple tree has already been duplicated. If the
                 * mmap duplication fails, mark the failure point with
                 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
                 * stop releasing VMAs that have not been duplicated after this
                 * point.
                 */
                if (mpnt) {
                        mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
                        mas_store(&vmi.mas, XA_ZERO_ENTRY);
                        /* Avoid OOM iterating a broken tree */
                        mm_flags_set(MMF_OOM_SKIP, mm);
                }
                /*
                 * The mm_struct is going to exit, but the locks will be dropped
                 * first.  Marking the mm_struct as unstable is advisable as it
                 * is not fully initialised.
                 */
                mm_flags_set(MMF_UNSTABLE, mm);
        }
out:
        mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
        mmap_write_unlock(oldmm);
        if (!retval)
                dup_userfaultfd_complete(&uf);
        else
                dup_userfaultfd_fail(&uf);
        return retval;

        /*
         * Error unwinding for the vma being copied. Each label undoes one
         * more step and falls through; retval is then set to -ENOMEM
         * whatever the failing helper returned.
         */
fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
fail_nomem_policy:
        vm_area_free(tmp);
fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto loop_out;
}





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 











    4 







    4 







    4 







    4 









    4 






    4 









    4 


    4 



























































































    4 


















    4 






    4 









    4 














    4 





    4 














    4 

    4 

    4 







    4 














    4 










    4 


























    4 


    4 
    4 

    4 






    4 
































    4 







    4 




    4 

    4 






























    4 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>                /* test_thread_flag(), ...        */
#include <linux/sched/task_stack.h>        /* task_stack_*(), ...                */
#include <linux/kdebug.h>                /* oops_begin/end, ...                */
#include <linux/memblock.h>                /* max_low_pfn                        */
#include <linux/kfence.h>                /* kfence_handle_page_fault        */
#include <linux/kprobes.h>                /* NOKPROBE_SYMBOL, ...                */
#include <linux/mmiotrace.h>                /* kmmio_handler, ...                */
#include <linux/perf_event.h>                /* perf_sw_event                */
#include <linux/hugetlb.h>                /* hstate_index_to_shift        */
#include <linux/context_tracking.h>        /* exception_enter(), ...        */
#include <linux/uaccess.h>                /* faulthandler_disabled()        */
#include <linux/efi.h>                        /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <linux/mm.h>                        /* find_and_lock_vma() */
#include <linux/vmalloc.h>

#include <asm/cpufeature.h>                /* boot_cpu_has, ...                */
#include <asm/traps.h>                        /* dotraplinkage, ...                */
#include <asm/fixmap.h>                        /* VSYSCALL_ADDR                */
#include <asm/vsyscall.h>                /* emulate_vsyscall                */
#include <asm/vm86.h>                        /* struct vm86                        */
#include <asm/mmu_context.h>                /* vma_pkey()                        */
#include <asm/efi.h>                        /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h>                        /* store_idt(), ...                */
#include <asm/cpu_entry_area.h>                /* exception stack                */
#include <asm/pgtable_areas.h>                /* VMALLOC_START, ...                */
#include <asm/kvm_para.h>                /* kvm_handle_async_pf                */
#include <asm/vdso.h>                        /* fixup_vdso_exception()        */
#include <asm/irq_stack.h>
#include <asm/fred.h>
#include <asm/sev.h>                        /* snp_dump_hva_rmpentry()        */

#define CREATE_TRACE_POINTS
#include <trace/events/exceptions.h>

/*
 * Give mmiotrace the chance to claim a fault.
 *
 * Returns -1 when mmiotrace is active and kmmio_handler() returned 1 for
 * this fault; returns 0 when mmiotrace is disabled or the handler did
 * not take the fault.
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
        int claimed = 0;

        /* The handler is only consulted while mmiotrace is active. */
        if (unlikely(is_kmmio_active()))
                claimed = (kmmio_handler(regs, addr) == 1);

        return claimed ? -1 : 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.  This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
/*
 * Classify one instruction byte while scanning for a PREFETCH.
 *
 * @regs:     register state of the faulting context
 * @instr:    address of the byte that follows @opcode
 * @opcode:   the byte being classified
 * @prefetch: set when @opcode is 0x0F and the byte at @instr is 0x0D/0x18
 *
 * Returns non-zero when @opcode is an instruction prefix, i.e. the caller
 * should keep scanning, and 0 when scanning must stop.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
                      unsigned char opcode, int *prefetch)
{
        const unsigned char hi_nibble = opcode & 0xf0;
        const unsigned char lo_nibble = opcode & 0x0f;

        /*
         * 0x26, 0x2E, 0x36 and 0x3E are valid x86 prefixes.  In X86_64
         * long mode the CPU signals invalid opcode if some of these
         * prefixes are present, so X86_64 never gets here anyway.
         */
        if (hi_nibble == 0x20 || hi_nibble == 0x30)
                return (lo_nibble & 7) == 0x6;

#ifdef CONFIG_X86_64
        /* 0x40..0x4F are REX prefixes, but only in 64-bit mode. */
        if (hi_nibble == 0x40)
                return !user_mode(regs) || user_64bit_mode(regs);
#endif

        /* 0x64 through 0x67 are valid prefixes in all modes. */
        if (hi_nibble == 0x60)
                return (lo_nibble & 0xC) == 0x4;

        /* 0xF0, 0xF2 and 0xF3 are valid prefixes in all modes. */
        if (hi_nibble == 0xF0)
                return lo_nibble == 0 || (lo_nibble >> 1) == 1;

        if (hi_nibble == 0x00) {
                /*
                 * A prefetch is encoded as 0x0F 0x0D or 0x0F 0x18: fetch the
                 * second byte (this overwrites @opcode) and compare.
                 */
                if (get_kernel_nofault(opcode, instr))
                        return 0;

                *prefetch = (lo_nibble == 0xF) &&
                            (opcode == 0x0D || opcode == 0x18);
                return 0;
        }

        /* Anything else is neither a prefix nor a prefetch. */
        return 0;
}

static bool is_amd_k8_pre_npt(void)
{
        struct cpuinfo_x86 *c = &boot_cpu_data;

        return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
                        c->x86_vendor == X86_VENDOR_AMD &&
                        c->x86 == 0xf && c->x86_model < 0x40);
}

/*
 * Decide whether the faulting instruction is a PREFETCH hit by AMD
 * erratum #91.
 *
 * Returns non-zero if a prefetch opcode was found at the faulting IP,
 * 0 otherwise (including on CPUs that are not affected).
 */
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
        unsigned char *cursor;
        unsigned char *limit;
        int found = 0;

        /* Erratum #91 only affects AMD K8, pre-NPT CPUs. */
        if (!is_amd_k8_pre_npt())
                return 0;

        /*
         * An instruction fetch fault (e.g. exec on an NX page) is never
         * ignored.
         */
        if (error_code & X86_PF_INSTR)
                return 0;

        /* Scan at most 15 bytes starting at the faulting instruction. */
        cursor = (void *)convert_ip_to_linear(current, regs);
        limit = cursor + 15;

        /*
         * Historically this code has always given up if IP points to a
         * not-present page (e.g. due to a race), and nobody has ever
         * complained, so the reads below run with page faults disabled.
         */
        pagefault_disable();

        while (cursor < limit) {
                unsigned char byte;
                long fetch_failed;

                if (user_mode(regs))
                        fetch_failed = get_user(byte,
                                                (unsigned char __user *) cursor);
                else
                        fetch_failed = get_kernel_nofault(byte, cursor);

                if (fetch_failed)
                        break;

                cursor++;

                /* Stop as soon as the byte is not an instruction prefix. */
                if (!check_prefetch_opcode(regs, cursor, byte, &found))
                        break;
        }

        pagefault_enable();
        return found;
}

/*
 * pgd_list links pages through their 'lru' member; it is walked by
 * arch_sync_kernel_mappings() with pgd_lock held.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
/*
 * Bring the PMD entry covering @address in the page table rooted at @pgd
 * in line with the reference kernel page table (init_mm.pgd).
 *
 * Returns the reference PMD entry, or NULL when the reference table has
 * no present entry for @address at the PGD, P4D, PUD or PMD level.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned idx = pgd_index(address);
        pgd_t *ref_pgd = init_mm.pgd + idx;
        p4d_t *p4d, *ref_p4d;
        pud_t *pud, *ref_pud;
        pmd_t *pmd, *ref_pmd;

        pgd += idx;

        if (!pgd_present(*ref_pgd))
                return NULL;

        /*
         * Copying the upper levels would gain nothing: a
         * set_pgd(pgd, *ref_pgd) is useless on PAE and redundant with the
         * set_pmd() below on non-PAE, and the same holds for
         * set_p4d()/set_pud().
         */
        p4d = p4d_offset(pgd, address);
        ref_p4d = p4d_offset(ref_pgd, address);
        if (!p4d_present(*ref_p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        ref_pud = pud_offset(ref_p4d, address);
        if (!pud_present(*ref_pud))
                return NULL;

        pmd = pmd_offset(pud, address);
        ref_pmd = pmd_offset(ref_pud, address);

        /* Copy the reference entry whenever the present state differs. */
        if (pmd_present(*pmd) != pmd_present(*ref_pmd))
                set_pmd(pmd, *ref_pmd);

        if (!pmd_present(*ref_pmd))
                return NULL;

        /* Both entries are present here and must map the same frame. */
        BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*ref_pmd));

        return ref_pmd;
}

/*
 *   Handle a fault on the vmalloc or module mapping area
 *
 *   This is needed because there is a race condition between the time
 *   when the vmalloc mapping code updates the PMD to the point in time
 *   where it synchronizes this update with the other page-tables in the
 *   system.
 *
 *   In this race window another thread/CPU can map an area on the same
 *   PMD, finds it already present and does not synchronize it with the
 *   rest of the system yet. As a result v[mz]alloc might return areas
 *   which are not mapped in every page-table in the system, causing an
 *   unhandled page-fault when they are accessed.
 */
static noinline int vmalloc_fault(unsigned long address)
{
        pmd_t *ref_pmd;
        pte_t *ref_pte;

        /* Only addresses inside [VMALLOC_START, VMALLOC_END) are handled. */
        if (address < VMALLOC_START || address >= VMALLOC_END)
                return -1;

        /*
         * Sync the top-level page table that is live in CR3 against the
         * reference page table.  "current" must not be used for this: we
         * might be inside an interrupt in the middle of a task switch.
         */
        ref_pmd = vmalloc_sync_one(__va(read_cr3_pa()), address);
        if (!ref_pmd)
                return -1;

        /* A leaf PMD maps the address directly; no PTE to look at. */
        if (pmd_leaf(*ref_pmd))
                return 0;

        ref_pte = pte_offset_kernel(ref_pmd, address);

        return pte_present(*ref_pte) ? 0 : -1;
}
NOKPROBE_SYMBOL(vmalloc_fault);

/*
 * Run vmalloc_sync_one() on every page table on pgd_list, one PMD-sized
 * step at a time, starting at @start rounded down to a PMD boundary.
 *
 * The walk continues while the address is at least TASK_SIZE_MAX and
 * below VMALLOC_END; @end is not consulted.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
        unsigned long va = start & PMD_MASK;

        while (va >= TASK_SIZE_MAX && va < VMALLOC_END) {
                struct page *pgd_page;

                spin_lock(&pgd_lock);
                list_for_each_entry(pgd_page, &pgd_list, lru) {
                        /* The per-mm page table lock is only needed for Xen. */
                        spinlock_t *ptl =
                                &pgd_page_get_mm(pgd_page)->page_table_lock;

                        spin_lock(ptl);
                        vmalloc_sync_one(page_address(pgd_page), va);
                        spin_unlock(ptl);
                }
                spin_unlock(&pgd_lock);

                va += PMD_SIZE;
        }
}

/* True when @pfn lies below max_low_pfn. */
static bool low_pfn(unsigned long pfn)
{
        if (pfn >= max_low_pfn)
                return false;

        return true;
}

/*
 * Print the page-table entries that translate @address, walking the page
 * table whose root is currently loaded in CR3 (32-bit variant).
 *
 * The output is one log line: an optional "*pdpt" value (PAE only), the
 * "*pde" value and, when it can be read safely, the "*pte" value.
 */
static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = &base[pgd_index(address)];
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        /*
         * With PAE the top-level entry is printed first and the PDE is
         * appended to that line (pr_cont); without PAE the PDE starts the
         * line itself (pr_info).
         */
#ifdef CONFIG_X86_PAE
        pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
        /* Stop if the next level is not in low memory or not present. */
        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
                goto out;
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
        p4d = p4d_offset(pgd, address);
        pud = pud_offset(p4d, address);
        pmd = pmd_offset(pud, address);
        /* Field width is two hex digits per byte of the entry. */
        pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already:
         */
        if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
        /* Terminate the line on every exit path. */
        pr_cont("\n");
}

#else /* CONFIG_X86_64: */

#ifdef CONFIG_CPU_SUP_AMD
/*
 * KERN_ERR message emitted through printk_once() by is_errata93() when
 * the K8 erratum #93 workaround is applied.
 */
static const char errata93_warning[] =
KERN_ERR 
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Probe whether an unsigned long can be read from @p without faulting.
 * Returns the result of get_kernel_nofault(): 0 when the read worked,
 * non-zero otherwise.
 */
static int bad_address(void *p)
{
        unsigned long scratch;
        int unreadable;

        unreadable = get_kernel_nofault(scratch, (unsigned long *)p);

        return unreadable;
}

static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = base + pgd_index(address);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (bad_address(pgd))
                goto bad;

        pr_info("PGD %lx ", pgd_val(*pgd));

        if (!pgd_present(*pgd))
                goto out;

        p4d = p4d_offset(pgd, address);
        if (bad_address(p4d))
                goto bad;

        pr_cont("P4D %lx ", p4d_val(*p4d));
        if (!p4d_present(*p4d) || p4d_leaf(*p4d))
                goto out;

        pud = pud_offset(p4d, address);
        if (bad_address(pud))
                goto bad;

        pr_cont("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_leaf(*pud))
                goto out;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd))
                goto bad;

        pr_cont("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_leaf(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte))
                goto bad;

        pr_cont("PTE %lx", pte_val(*pte));
out:
        pr_cont("\n");
        return;
bad:
        pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
        unsigned long fixed_ip;

        /* Only AMD family 0xf CPUs are considered. */
        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
            boot_cpu_data.x86 != 0xf)
                return 0;

        /*
         * The signature is a kernel-mode fault on the instruction pointer
         * itself, with the upper 32 bits of the address cleared.
         */
        if (user_mode(regs) || address != regs->ip || (address >> 32) != 0)
                return 0;

        /* Put the upper 32 bits back and see if that lands in kernel text. */
        fixed_ip = address | (0xffffffffUL << 32);
        if ((fixed_ip >= (u64)_stext && fixed_ip <= (u64)_etext) ||
            (fixed_ip >= MODULES_VADDR && fixed_ip <= MODULES_END)) {
                printk_once(errata93_warning);
                regs->ip = fixed_ip;
                return 1;
        }
#endif
        return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        /*
         * The code segment counts as compatibility mode when it is
         * __USER32_CS or when bit 2 of the selector is set (LDT).
         */
        int compat_cs = regs->cs == __USER32_CS || (regs->cs & (1<<2));

        /* A fault address with bits above 4GB set is the erratum. */
        if (compat_cs && (address >> 32))
                return 1;
#endif
        return 0;
}

/*
 * Pentium F0 0F C7 C8 bug workaround.
 *
 * When the CPU has X86_BUG_F00F, the fault is not a user access
 * (X86_PF_USER clear) and @address is an F00F IDT address, the fault is
 * passed to handle_invalid_op() and 1 is returned.  Returns 0 in every
 * other case, and always when CONFIG_X86_F00F_BUG is not set.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
        if (!boot_cpu_has_bug(X86_BUG_F00F))
                return 0;

        if (error_code & X86_PF_USER)
                return 0;

        if (!idt_is_f00f_address(address))
                return 0;

        handle_invalid_op(regs);
        return 1;
#else
        return 0;
#endif
}

/*
 * Print base and limit of the LDT/TSS descriptor that selector @index
 * refers to in the table described by @gdt, labelled with @name.
 *
 * A zero selector, an offset that does not fit the table and an
 * unreadable descriptor are each reported instead of the base/limit.
 */
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
        struct ldttss_desc desc;
        unsigned long base;
        u32 gdt_off;

        if (index == 0) {
                pr_alert("%s: NULL\n", name);
                return;
        }

        /* The selector's upper 13 bits index the descriptor table. */
        gdt_off = (index >> 3) * sizeof(struct desc_struct);

        if (gdt_off + sizeof(struct ldttss_desc) >= gdt->size) {
                pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
                return;
        }

        /* Read the descriptor with a non-faulting copy. */
        if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + gdt_off),
                                     sizeof(struct ldttss_desc))) {
                pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
                         name, index);
                return;
        }

        /* Reassemble the base address from its split descriptor fields. */
        base = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
        base |= ((u64)desc.base3 << 32);
#endif

        pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
                 name, index, base, (desc.limit0 | (desc.limit1 << 16)));
}

/*
 * Print the diagnostic header of a page-fault oops.
 *
 * @regs:       register state at the time of the fault
 * @error_code: page-fault error code (X86_PF_* bits)
 * @address:    faulting address
 *
 * Nothing is printed when oops_may_print() says so.  Otherwise the output
 * is, in order: NX/SMEP hints for instruction fetches, the "BUG:" line,
 * two "#PF:" lines decoding the access and the error code, descriptor
 * table details for implicit supervisor accesses from user mode, the page
 * table dump and, for RMP faults, the RMP entry dump.
 */
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        if (!oops_may_print())
                return;

        if (error_code & X86_PF_INSTR) {
                unsigned int level;
                bool nx, rw;
                pgd_t *pgd;
                pte_t *pte;

                /* Look the address up in the page table loaded in CR3. */
                pgd = __va(read_cr3_pa());
                pgd += pgd_index(address);

                pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);

                /* Present but not executable: report an NX violation. */
                if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx))
                        pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
                                from_kuid(&init_user_ns, current_uid()));
                /* Executable user mapping with SMEP enabled in CR4. */
                if (pte && pte_present(*pte) && pte_exec(*pte) && !nx &&
                                (pgd_flags(*pgd) & _PAGE_USER) &&
                                (__read_cr4() & X86_CR4_SMEP))
                        pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
                                from_kuid(&init_user_ns, current_uid()));
        }

        /* Kernel-mode faults within the first page are called NULL derefs. */
        if (address < PAGE_SIZE && !user_mode(regs))
                pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
                        (void *)address);
        else
                pr_alert("BUG: unable to handle page fault for address: %px\n",
                        (void *)address);

        /* Access privilege, access type, and the CPU mode in @regs. */
        pr_alert("#PF: %s %s in %s mode\n",
                 (error_code & X86_PF_USER)  ? "user" : "supervisor",
                 (error_code & X86_PF_INSTR) ? "instruction fetch" :
                 (error_code & X86_PF_WRITE) ? "write access" :
                                               "read access",
                             user_mode(regs) ? "user" : "kernel");
        /* The first matching cause in this chain is the one reported. */
        pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
                 !(error_code & X86_PF_PROT) ? "not-present page" :
                 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
                 (error_code & X86_PF_PK)    ? "protection keys violation" :
                 (error_code & X86_PF_RMP)   ? "RMP violation" :
                                               "permissions violation");

        if (!(error_code & X86_PF_USER) && user_mode(regs)) {
                struct desc_ptr idt, gdt;
                u16 ldtr, tr;

                /*
                 * This can happen for quite a few reasons.  The more obvious
                 * ones are faults accessing the GDT, or LDT.  Perhaps
                 * surprisingly, if the CPU tries to deliver a benign or
                 * contributory exception from user code and gets a page fault
                 * during delivery, the page fault can be delivered as though
                 * it originated directly from user code.  This could happen
                 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
                 * kernel or IST stack.
                 */
                store_idt(&idt);

                /* Usable even on Xen PV -- it's just slow. */
                native_store_gdt(&gdt);

                pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
                         idt.address, idt.size, gdt.address, gdt.size);

                store_ldt(ldtr);
                show_ldttss(&gdt, "LDTR", ldtr);

                store_tr(tr);
                show_ldttss(&gdt, "TR", tr);
        }

        dump_pagetable(address);

        if (error_code & X86_PF_RMP)
                snp_dump_hva_rmpentry(address);
}

/*
 * Oops path for a corrupted page table: report the task and address,
 * dump the page table entries and die with "Bad pagetable".
 *
 * The task is sent SIGKILL through oops_end() unless __die() returns
 * non-zero, in which case signal 0 is passed instead.
 */
static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
            unsigned long address)
{
        unsigned long oops_flags = oops_begin();
        struct task_struct *task = current;
        int sig;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               task->comm, address);
        dump_pagetable(address);

        sig = __die("Bad pagetable", regs, error_code) ? 0 : SIGKILL;

        oops_end(oops_flags, regs, sig);
}

/*
 * Set X86_PF_PROT in *@error_code when @address is at or above
 * TASK_SIZE_MAX; leave the error code untouched otherwise.
 */
static void sanitize_error_code(unsigned long address,
                                unsigned long *error_code)
{
        /* Addresses below TASK_SIZE_MAX need no masking. */
        if (address < TASK_SIZE_MAX)
                return;

        /*
         * User-mode accesses to kernel addresses are always presented as
         * protection faults, so that nothing about the kernel page table
         * layout leaks out.
         *
         * NB: a failed vsyscall with vsyscall=none therefore carries the
         * PROT bit as well.  That leaks no information and does not
         * appear to cause any problems.
         */
        *error_code |= X86_PF_PROT;
}

/*
 * Record the fault in the current task's thread state: trap number
 * X86_TRAP_PF, the faulting @address in cr2, and @error_code with
 * X86_PF_USER forced on.
 */
static void set_signal_archinfo(unsigned long address,
                                unsigned long error_code)
{
        struct task_struct *self = current;

        self->thread.cr2 = address;
        self->thread.error_code = error_code | X86_PF_USER;
        self->thread.trap_nr = X86_TRAP_PF;
}

/*
 * Final stop for a page fault nobody could handle: oops.
 *
 * @regs:       register state at the time of the fault
 * @error_code: page-fault error code (X86_PF_* bits)
 * @address:    faulting address
 *
 * For kernel-mode faults three special cases are tried first: a stack
 * guard hit (CONFIG_VMAP_STACK), the EFI crash path and KFENCE.  If KFENCE
 * handles the fault the function returns without oopsing.  Faults with
 * user-mode @regs skip all three and oops directly.
 */
static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
                unsigned long address)
{
#ifdef CONFIG_VMAP_STACK
        struct stack_info info;
#endif
        unsigned long flags;
        int sig;

        if (user_mode(regs)) {
                /*
                 * Implicit kernel access from user mode?  Skip the stack
                 * overflow and EFI special cases.
                 */
                goto oops;
        }

#ifdef CONFIG_VMAP_STACK
        /*
         * Stack overflow?  During boot, we can fault near the initial
         * stack in the direct map, but that's not an overflow -- check
         * that we're in vmalloc space to avoid this.
         */
        if (is_vmalloc_addr((void *)address) &&
            get_stack_guard_info((void *)address, &info)) {
                /*
                 * We're likely to be running with very little stack space
                 * left.  It's plausible that we'd hit this condition but
                 * double-fault even before we get this far, in which case
                 * we're fine: the double-fault handler will deal with it.
                 *
                 * We don't want to make it all the way into the oops code
                 * and then double-fault, though, because we're likely to
                 * break the console driver and lose most of the stack dump.
                 */
                /*
                 * Call handle_stack_overflow(regs, address, &info) on the
                 * stack just below the top of this CPU's DF IST stack.
                 */
                call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
                              handle_stack_overflow,
                              ASM_CALL_ARG3,
                              , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));

                /* Execution is not expected to continue past this point. */
                BUG();
        }
#endif

        /*
         * Buggy firmware could access regions which might page fault.  If
         * this happens, EFI has a special OOPS path that will try to
         * avoid hanging the system.
         */
        if (IS_ENABLED(CONFIG_EFI))
                efi_crash_gracefully_on_page_fault(address);

        /* Only not-present faults should be handled by KFENCE. */
        if (!(error_code & X86_PF_PROT) &&
            kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
                return;

oops:
        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice:
         */
        flags = oops_begin();

        show_fault_oops(regs, error_code, address);

        if (task_stack_end_corrupted(current))
                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

        /* SIGKILL the task unless __die() returns non-zero. */
        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;

        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_DEFAULT "CR2: %016lx\n", address);

        oops_end(flags, regs, sig);
}

/*
 * Handle a fault taken in kernel mode: try an exception fixup, then the
 * AMD erratum #91 prefetch check, and oops if neither applies.
 *
 * Warns once if called with user-mode @regs.  @signal, @si_code and
 * @pkey are accepted but not used by this function.
 */
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
                         unsigned long address, int signal, int si_code,
                         u32 pkey)
{
        WARN_ON_ONCE(user_mode(regs));

        /*
         * Two ways out before oopsing, tried in this order:
         *  - an exception fixup exists for the faulting instruction;
         *  - the fault is the spurious fault on a PREFETCH instruction
         *    that AMD erratum #91 produces.
         */
        if (!fixup_exception(regs, X86_TRAP_PF, error_code, address) &&
            !is_prefetch(regs, error_code, address))
                page_fault_oops(regs, error_code, address);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
                unsigned long address, struct task_struct *tsk)
{
        const char *loglvl;
        int cpu;

        /* Tasks with a PID above 1 log at KERN_INFO, others at KERN_EMERG. */
        if (task_pid_nr(tsk) > 1)
                loglvl = KERN_INFO;
        else
                loglvl = KERN_EMERG;

        /* Racy snapshot of the CPU number, but better than nothing. */
        cpu = raw_smp_processor_id();

        /* Stay silent for handled SIGSEGV and when rate limited. */
        if (!unhandled_signal(tsk, SIGSEGV) || !printk_ratelimit())
                return;

        printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
                loglvl, tsk->comm, task_pid_nr(tsk), address,
                (void *)regs->ip, (void *)regs->sp, error_code);

        print_vma_addr(KERN_CONT " in ", regs->ip);

        /*
         * Name the CPU the fatal segfault most likely happened on; this
         * can help identify faulty hardware.
         */
        printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu,
               topology_core_id(cpu), topology_physical_package_id(cpu));

        printk(KERN_CONT "\n");

        show_opcodes(regs, loglvl);
}

/*
 * Deal with an access to a bad address once no mm lock is held.
 *
 * Kernel-mode faults go to kernelmode_fixup_or_oops().  User-mode faults
 * whose error code lacks X86_PF_USER oops.  Every other fault ends in a
 * SIGSEGV for the current task (via force_sig_pkuerr() when @si_code is
 * SEGV_PKUERR) unless one of the erratum or vDSO fixups claims it first.
 */
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address, u32 pkey, int si_code)
{
        if (!user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         SIGSEGV, si_code, pkey);
                return;
        }

        /* Implicit user access to kernel memory -- just oops. */
        if (!(error_code & X86_PF_USER)) {
                page_fault_oops(regs, error_code, address);
                return;
        }

        /*
         * From here on this is a plain user-mode access that ends in a
         * SIGSEGV.  Interrupts may still be off at this point:
         */
        local_irq_enable();

        /*
         * Taking another page fault is fine here because this fault came
         * from user space, so the erratum checks may read user memory.
         */
        if (is_prefetch(regs, error_code, address) ||
            is_errata100(regs, address))
                return;

        sanitize_error_code(address, &error_code);

        if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
                return;

        if (likely(show_unhandled_signals))
                show_signal_msg(regs, error_code, address, current);

        set_signal_archinfo(address, error_code);

        if (si_code != SEGV_PKUERR)
                force_sig_fault(SIGSEGV, si_code, (void __user *)address);
        else
                force_sig_pkuerr((void __user *)address, pkey);

        local_irq_disable();
}

/*
 * Bad-address handling for the common case: calls
 * __bad_area_nosemaphore() with pkey 0 and si_code SEGV_MAPERR.
 */
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address,
                               /* pkey */ 0,
                               /* si_code */ SEGV_MAPERR);
}

/*
 * Release whichever lock the fault path is holding, then handle the bad
 * access through __bad_area_nosemaphore().
 *
 * A non-NULL @mm means the mmap read lock is dropped; with a NULL @mm
 * the read side of @vma is ended instead.
 */
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, struct mm_struct *mm,
           struct vm_area_struct *vma, u32 pkey, int si_code)
{
        /*
         * Something touched memory that is not in our memory map.  Drop
         * the lock first; the callee sorts out kernel versus user.
         */
        if (!mm)
                vma_end_read(vma);
        else
                mmap_read_unlock(mm);

        __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}

/*
 * Tell whether an access error is attributable to protection keys.
 *
 * Always false without X86_FEATURE_OSPKE.  Otherwise true when the error
 * code has X86_PF_PK set or when arch_vma_access_permitted() rejects the
 * access described by the error code for @vma.
 */
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
                struct vm_area_struct *vma)
{
        /* This code always runs on the current mm, never a foreign one. */
        bool foreign = false;

        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return false;

        if (error_code & X86_PF_PK)
                return true;

        /* Consult the permission keys on the VMA itself. */
        return !arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
                                          (error_code & X86_PF_INSTR), foreign);
}

/*
 * Handle an access that hit a VMA but was not permitted.
 *
 * Protection-key failures are reported as SEGV_PKUERR together with the
 * VMA's pkey; everything else is SEGV_ACCERR with pkey 0.
 */
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address, struct mm_struct *mm,
                      struct vm_area_struct *vma)
{
        u32 pkey = 0;
        int si_code = SEGV_ACCERR;

        /*
         * The OSPKE check inside the helper is not strictly needed at
         * runtime, but it lets the compiler drop this branch when pkeys
         * are compiled out.
         */
        if (bad_area_access_from_pkeys(error_code, vma)) {
                /*
                 * A protection key fault means the PKRU value denied access
                 * to some PTE.  Userspace can recover PKRU from the XSAVE
                 * state; the pkey taken from the vma here tells it which
                 * key was set on the PTE.
                 *
                 * Getting here means the hardware signaled X86_PF_PK and a
                 * VMA existed when the fault handler ran.  It does *not*
                 * mean this VMA is the one that was faulted on:
                 *
                 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
                 * 2. T1   : set PKRU to deny access to pkey=4, touches page
                 * 3. T1   : faults...
                 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
                 * 5. T1   : enters fault handler, takes mmap_lock, etc...
                 * 6. T1   : reaches here, sees vma_pkey(vma)=5, although the
                 *             fault was on a pte with pkey=4.
                 */
                pkey = vma_pkey(vma);
                si_code = SEGV_PKUERR;
        }

        __bad_area(regs, error_code, address, mm, vma, pkey, si_code);
}

/*
 * Deliver SIGBUS for a fault at @address.
 *
 * Kernel-mode faults go to kernelmode_fixup_or_oops().  For user mode the
 * prefetch erratum and vDSO fixups are tried first; with
 * CONFIG_MEMORY_FAILURE a hardware-poison @fault is reported as
 * BUS_MCEERR_AR, anything else as BUS_ADRERR.
 */
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
          vm_fault_t fault)
{
        /* In kernel mode: fix the exception up or die. */
        if (!user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
                return;
        }

        /* The fault came from user space, so faulting again is fine. */
        if (is_prefetch(regs, error_code, address))
                return;

        sanitize_error_code(address, &error_code);

        if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
                return;

        set_signal_archinfo(address, error_code);

#ifdef CONFIG_MEMORY_FAILURE
        if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
                struct task_struct *victim = current;
                unsigned addr_lsb = 0;

                pr_err(
        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                        victim->comm, victim->pid, address);

                /*
                 * The reported granularity is the huge page shift for
                 * HWPOISON_LARGE; plain HWPOISON is checked last and sets
                 * PAGE_SHIFT.
                 */
                if (fault & VM_FAULT_HWPOISON_LARGE)
                        addr_lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
                if (fault & VM_FAULT_HWPOISON)
                        addr_lsb = PAGE_SHIFT;

                force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, addr_lsb);
                return;
        }
#endif
        force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}

/*
 * Check whether the entry at @pte already grants the access described by
 * @error_code.  Returns 1 if it does, 0 if a write hit a non-writable
 * entry or an instruction fetch hit a non-executable one.
 */
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
        /* A write needs a writable entry. */
        if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
                return 0;

        /* An instruction fetch needs an executable entry. */
        return !((error_code & X86_PF_INSTR) && !pte_exec(*pte));
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permission than the page table entry.  Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
 */
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int ret;

        /*
         * Only writes to RO or instruction fetches from NX may cause
         * spurious faults.
         *
         * These could be from user or supervisor accesses but the TLB
         * is only lazily flushed after a kernel mapping protection
         * change, so user accesses are not expected to cause spurious
         * faults.
         *
         * Note the comparison is for exact equality: an error code with
         * any additional bit set is never treated as spurious.
         */
        if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
            error_code != (X86_PF_INSTR | X86_PF_PROT))
                return 0;

        /*
         * Walk the init_mm page tables top-down.  A non-present entry at
         * any level means the fault cannot be spurious.
         */
        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return 0;

        /*
         * A leaf entry above the PTE level ends the walk; its permission
         * bits are checked by passing it to the helper as a pte_t.
         */
        if (p4d_leaf(*p4d))
                return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_leaf(*pud))
                return spurious_kernel_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_leaf(*pmd))
                return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        /* The PTE itself must permit the access ... */
        ret = spurious_kernel_fault_check(error_code, pte);
        if (!ret)
                return 0;

        /*
         * Make sure we have permissions in PMD.
         * If not, then there's a bug in the page tables:
         */
        ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

        /* ... so the result here is that of the PMD check. */
        return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);

/*
 * Global flag with external linkage, set to 1 by default.  It is not read
 * anywhere in this part of the file.
 * NOTE(review): presumably gates the logging of unhandled signals - confirm
 * against its readers.
 */
int show_unhandled_signals = 1;

/*
 * Decide whether a fault described by @error_code on an address covered by
 * @vma has to be treated as an access violation.
 *
 * Returns 1 when the access must be rejected, 0 when the fault may go on
 * to be serviced.
 */
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
        /* This is only called for the current mm, so: */
        bool foreign = false;

        /*
         * Two hardware verdicts are final, whatever the VMA says:
         *
         *  - X86_PF_PK: a read or write was blocked by protection keys.
         *    This is always an unconditional error and can never result
         *    in a follow-up action to resolve the fault, like a COW.
         *
         *  - X86_PF_SGX: SGX hardware blocked the access.  This usually
         *    happens when the enclave memory contents have been
         *    destroyed, like after a suspend/resume cycle.  The kernel
         *    can't fix the cause, so report an access error even if no
         *    actual access violation occurred; this lets userspace
         *    rebuild the enclave in response to the signal.
         */
        if (error_code & X86_PF_PK)
                return 1;
        if (unlikely(error_code & X86_PF_SGX))
                return 1;

        /*
         * Consult the VMA up front so that we do not service a fault
         * only to run into X86_PF_PK as soon as the page is filled in.
         */
        if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
                                       (error_code & X86_PF_INSTR), foreign))
                return 1;

        /*
         * Shadow stack accesses (PF_SHSTK=1) are only permitted to
         * shadow stack VMAs, which additionally have to be writable.
         * The two flag tests are kept separate on purpose.
         */
        if (error_code & X86_PF_SHSTK)
                return !(vma->vm_flags & VM_SHADOW_STACK) ||
                       !(vma->vm_flags & VM_WRITE);

        /*
         * Ordinary write, present or not: refused on shadow stack VMAs
         * and on VMAs that are not writable.
         */
        if (error_code & X86_PF_WRITE)
                return (vma->vm_flags & VM_SHADOW_STACK) ||
                       !(vma->vm_flags & VM_WRITE);

        /* read, present: */
        if (unlikely(error_code & X86_PF_PROT))
                return 1;

        /* read, not present: allowed only if the VMA is accessible */
        return !vma_is_accessible(vma);
}

/*
 * Tell whether @address belongs to the kernel part of the address space
 * for the purposes of page fault handling.
 */
bool fault_in_kernel_space(unsigned long address)
{
        /*
         * On 64-bit systems, the vsyscall page is at an address above
         * TASK_SIZE_MAX, but is not considered part of the kernel
         * address space.
         */
        bool in_vsyscall = IS_ENABLED(CONFIG_X86_64) &&
                           is_vsyscall_vaddr(address);

        return !in_vsyscall && address >= TASK_SIZE_MAX;
}

/*
 * Called for all faults where 'address' is part of the kernel address
 * space.  Might get called for faults that originate from *code* that
 * ran in userspace or the kernel.
 *
 * @regs:          register state at the time of the fault
 * @hw_error_code: page fault error code (X86_PF_* bits)
 * @address:       faulting address
 *
 * The fault is offered, in this order, to: vmalloc_fault() (32-bit only),
 * is_f00f_bug(), spurious_kernel_fault() and kprobe_page_fault().  The
 * first one that claims it ends the handling; otherwise the fault is
 * passed to bad_area_nosemaphore().
 */
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
                   unsigned long address)
{
        /*
         * Protection keys exceptions only happen on user pages.  We
         * have no user pages in the kernel portion of the address
         * space, so do not expect them here.
         */
        WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
        /*
         * We can fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * Before doing this on-demand faulting, ensure that the
         * fault is not any of the following:
         * 1. A fault on a PTE with a reserved bit set.
         * 2. A fault caused by a user-mode access.  (Do not demand-
         *    fault kernel memory due to user-mode accesses).
         * 3. A fault caused by a page-level protection violation.
         *    (A demand fault would be on a non-present page which
         *     would have X86_PF_PROT==0).
         *
         * This is only needed to close a race condition on x86-32 in
         * the vmalloc mapping/unmapping code. See the comment above
         * vmalloc_fault() for details. On x86-64 the race does not
         * exist as the vmalloc mappings don't need to be synchronized
         * there.
         */
        if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
                /* A non-negative return value means the fault was handled. */
                if (vmalloc_fault(address) >= 0)
                        return;
        }
#endif

        if (is_f00f_bug(regs, hw_error_code, address))
                return;

        /* Was the fault spurious, caused by lazy TLB invalidation? */
        if (spurious_kernel_fault(hw_error_code, address))
                return;

        /* kprobes don't want to hook the spurious faults: */
        if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;

        /*
         * Note, despite being a "bad area", there are quite a few
         * acceptable reasons to get here, such as erratum fixups
         * and handling kernel code that can fault, like get_user().
         *
         * Don't take the mm semaphore here. If we fixup a prefetch
         * fault we could otherwise deadlock:
         */
        bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);

/*
 * Handle faults in the user portion of the address space.  Nothing in here
 * should check X86_PF_USER without a specific justification: for almost
 * all purposes, we should treat a normal kernel access to user memory
 * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
 * The one exception is AC flag handling, which is, per the x86
 * architecture, special for WRUSS.
 */
/*
 * @regs:       register state at the time of the fault
 * @error_code: page fault error code (X86_PF_* bits)
 * @address:    faulting address
 *
 * Outline: reject faults that must never be serviced, translate the error
 * code into FAULT_FLAG_* bits, try handle_mm_fault() under a VMA lock
 * obtained with lock_vma_under_rcu() (user-mode faults only), fall back to
 * lock_mm_and_find_vma() with retry, and finally turn VM_FAULT_ERROR
 * results into OOM handling, signals or a kernel fixup.
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
                        unsigned long error_code,
                        unsigned long address)
{
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct mm_struct *mm;
        vm_fault_t fault;
        unsigned int flags = FAULT_FLAG_DEFAULT;

        tsk = current;
        mm = tsk->mm;

        /* X86_PF_INSTR set while X86_PF_USER is clear: */
        if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
                /*
                 * Whoops, this is kernel mode code trying to execute from
                 * user memory.  Unless this is AMD erratum #93, which
                 * corrupts RIP such that it looks like a user address,
                 * this is unrecoverable.  Don't even try to look up the
                 * VMA or look for extable entries.
                 */
                if (is_errata93(regs, address))
                        return;

                page_fault_oops(regs, error_code, address);
                return;
        }

        /* kprobes don't want to hook the spurious faults: */
        if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;

        /*
         * Reserved bits are never expected to be set on
         * entries in the user portion of the page tables.
         */
        if (unlikely(error_code & X86_PF_RSVD))
                pgtable_bad(regs, error_code, address);

        /*
         * If SMAP is on, check for invalid kernel (supervisor) access to user
         * pages in the user address space.  The odd case here is WRUSS,
         * which, according to the preliminary documentation, does not respect
         * SMAP and will have the USER bit set so, in all cases, SMAP
         * enforcement appears to be consistent with the USER bit.
         */
        if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
                     !(error_code & X86_PF_USER) &&
                     !(regs->flags & X86_EFLAGS_AC))) {
                /*
                 * No extable entry here.  This was a kernel access to an
                 * invalid pointer.  get_kernel_nofault() will not get here.
                 */
                page_fault_oops(regs, error_code, address);
                return;
        }

        /*
         * If we're in an interrupt, have no user context or are running
         * in a region with pagefaults disabled then we must not take the fault
         */
        if (unlikely(faulthandler_disabled() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /* Legacy check - remove this after verifying that it doesn't trigger */
        if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /* Interrupts stay enabled from here on; this function never disables them. */
        local_irq_enable();

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

        /*
         * Read-only permissions can not be expressed in shadow stack PTEs.
         * Treat all shadow stack accesses as WRITE faults. This ensures
         * that the MM will prepare everything (e.g., break COW) such that
         * maybe_mkwrite() can create a proper shadow stack PTE.
         */
        if (error_code & X86_PF_SHSTK)
                flags |= FAULT_FLAG_WRITE;
        if (error_code & X86_PF_WRITE)
                flags |= FAULT_FLAG_WRITE;
        if (error_code & X86_PF_INSTR)
                flags |= FAULT_FLAG_INSTRUCTION;

        /*
         * We set FAULT_FLAG_USER based on the register state, not
         * based on X86_PF_USER. User space accesses that cause
         * system page faults are still user accesses.
         */
        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;

#ifdef CONFIG_X86_64
        /*
         * Faults in the vsyscall page might need emulation.  The
         * vsyscall page is at a high address (>PAGE_OFFSET), but is
         * considered to be part of the user address space.
         *
         * The vsyscall page does not have a "real" VMA, so do this
         * emulation before we go searching for VMAs.
         *
         * PKRU never rejects instruction fetches, so we don't need
         * to consider the PF_PK bit.
         */
        if (is_vsyscall_vaddr(address)) {
                if (emulate_vsyscall(error_code, regs, address))
                        return;
        }
#endif

        /*
         * Only faults taken in user mode try lock_vma_under_rcu() first.
         * Everything else, and any failed lookup, goes straight to the
         * lock_mm_and_find_vma() path below.
         */
        if (!(flags & FAULT_FLAG_USER))
                goto lock_mmap;

        vma = lock_vma_under_rcu(mm, address);
        if (!vma)
                goto lock_mmap;

        if (unlikely(access_error(error_code, vma))) {
                /* mm is passed as NULL on this path, unlike the one below. */
                bad_area_access_error(regs, error_code, address, NULL, vma);
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                return;
        }
        fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
        /*
         * vma_end_read() is skipped when the result carries VM_FAULT_RETRY
         * or VM_FAULT_COMPLETED.
         * NOTE(review): presumably the VMA lock has already been dropped in
         * those cases - confirm against handle_mm_fault().
         */
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                vma_end_read(vma);

        if (!(fault & VM_FAULT_RETRY)) {
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                goto done;
        }
        count_vm_vma_lock_event(VMA_LOCK_RETRY);
        if (fault & VM_FAULT_MAJOR)
                flags |= FAULT_FLAG_TRIED;

        /* Quick path to respond to signals */
        if (fault_signal_pending(fault, regs)) {
                if (!user_mode(regs))
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGBUS, BUS_ADRERR,
                                                 ARCH_DEFAULT_PKEY);
                return;
        }
lock_mmap:

retry:
        vma = lock_mm_and_find_vma(mm, address, regs);
        if (unlikely(!vma)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
        if (unlikely(access_error(error_code, vma))) {
                bad_area_access_error(regs, error_code, address, mm, vma);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
         * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
         *
         * Note that handle_userfault() may also release and reacquire mmap_lock
         * (and not return with VM_FAULT_RETRY), when returning to userland to
         * repeat the page fault later with a VM_FAULT_NOPAGE retval
         * (potentially after handling any pending signal during the return to
         * userland). The return to userland is identified whenever
         * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
         */
        fault = handle_mm_fault(vma, address, flags, regs);

        if (fault_signal_pending(fault, regs)) {
                /*
                 * Quick path to respond to signals.  The core mm code
                 * has unlocked the mm for us if we get here.
                 */
                if (!user_mode(regs))
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGBUS, BUS_ADRERR,
                                                 ARCH_DEFAULT_PKEY);
                return;
        }

        /* The fault is fully completed (including releasing mmap lock) */
        if (fault & VM_FAULT_COMPLETED)
                return;

        /*
         * If we need to retry the mmap_lock has already been released,
         * and if there is a fatal signal pending there is no guarantee
         * that we made any progress. Handle this case first.
         */
        if (unlikely(fault & VM_FAULT_RETRY)) {
                flags |= FAULT_FLAG_TRIED;
                goto retry;
        }

        mmap_read_unlock(mm);
done:
        /*
         * Reached from both locking paths with 'fault' holding the
         * handle_mm_fault() result.  Anything without an error bit is done.
         */
        if (likely(!(fault & VM_FAULT_ERROR)))
                return;

        if (fatal_signal_pending(current) && !user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         0, 0, ARCH_DEFAULT_PKEY);
                return;
        }

        if (fault & VM_FAULT_OOM) {
                /* Kernel mode? Handle exceptions or die: */
                if (!user_mode(regs)) {
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGSEGV, SEGV_MAPERR,
                                                 ARCH_DEFAULT_PKEY);
                        return;
                }

                /*
                 * We ran out of memory, call the OOM killer, and return the
                 * userspace (which will retry the fault, or kill us if we got
                 * oom-killed):
                 */
                pagefault_out_of_memory();
        } else {
                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
                             VM_FAULT_HWPOISON_LARGE))
                        do_sigbus(regs, error_code, address, fault);
                else if (fault & VM_FAULT_SIGSEGV)
                        bad_area_nosemaphore(regs, error_code, address);
                else
                        /* An error bit is set that none of the cases above covers. */
                        BUG();
        }
}
NOKPROBE_SYMBOL(do_user_addr_fault);

/*
 * Emit the page fault tracepoint that matches the mode the fault was taken
 * in: the user variant when @regs describes user mode, the kernel variant
 * otherwise.
 */
static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
                         unsigned long address)
{
        if (!user_mode(regs)) {
                trace_page_fault_kernel(address, regs, error_code);
                return;
        }

        trace_page_fault_user(address, regs, error_code);
}

/*
 * Trace the fault, let kmmio claim it, then dispatch on whether @address
 * lies in the kernel or the user part of the address space.
 */
static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
                              unsigned long address)
{
        trace_page_fault_entries(regs, error_code, address);

        /* Nothing more to do if kmmio took care of the fault. */
        if (unlikely(kmmio_fault(regs, address)))
                return;

        /* Fault on the kernel-controlled part of the address space? */
        if (unlikely(fault_in_kernel_space(address))) {
                do_kern_addr_fault(regs, error_code, address);
                return;
        }

        do_user_addr_fault(regs, error_code, address);

        /*
         * User address page fault handling might have reenabled
         * interrupts. Fixing up all potential exit points of
         * do_user_addr_fault() and its leaf functions is just not
         * doable w/o creating an unholy mess or turning the code
         * upside down, so interrupts are switched off again here,
         * unconditionally.
         */
        local_irq_disable();
}

/*
 * Page fault exception entry point.
 *
 * Obtains the faulting address (from fred_event_data() when
 * X86_FEATURE_FRED is enabled, from CR2 otherwise), gives
 * kvm_handle_async_pf() the first chance to consume the event, and
 * otherwise runs handle_page_fault() between irqentry_enter() and
 * irqentry_exit().
 */
DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
        irqentry_state_t state;
        unsigned long address;

        address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();

        /*
         * KVM uses #PF vector to deliver 'page not present' events to guests
         * (asynchronous page fault mechanism). The event happens when a
         * userspace task is trying to access some valid (from guest's point of
         * view) memory which is not currently mapped by the host (e.g. the
         * memory is swapped out). Note, the corresponding "page ready" event
         * which is injected when the memory becomes available, is delivered via
         * an interrupt mechanism and not a #PF exception
         * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
         *
         * We are relying on the interrupted context being sane (valid RSP,
         * relevant locks not held, etc.), which is fine as long as the
         * interrupted context had IF=1.  We are also relying on the KVM
         * async pf type field and CR2 being read consistently instead of
         * getting values from real and async page faults mixed up.
         *
         * Fingers crossed.
         *
         * The async #PF handling code takes care of idtentry handling
         * itself.
         *
         * Only the low 32 bits of the address are handed over.
         */
        if (kvm_handle_async_pf(regs, (u32)address))
                return;

        /*
         * Entry handling for valid #PF from kernel mode is slightly
         * different: RCU is already watching and ct_irq_enter() must not
         * be invoked because a kernel fault on a user space address might
         * sleep.
         *
         * In case the fault hit a RCU idle region the conditional entry
         * code reenabled RCU to avoid subsequent wreckage which helps
         * debuggability.
         */
        state = irqentry_enter(regs);

        /* The actual fault handling runs as an instrumentable section. */
        instrumentation_begin();
        handle_page_fault(regs, error_code, address);
        instrumentation_end();

        irqentry_exit(regs, state);
}

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_USER_H
#define _LINUX_SCHED_USER_H

#include <linux/uidgid.h>
#include <linux/atomic.h>
#include <linux/percpu_counter.h>
#include <linux/refcount.h>
#include <linux/ratelimit.h>

/*
 * Some day this will be a full-fledged user tracking system..
 *
 * struct user_struct holds reference-counted per-user counters and limits.
 * Several members exist only when the corresponding kernel option is
 * enabled, so the layout depends on the configuration.
 */
struct user_struct {
        refcount_t __count;        /* reference count */
#ifdef CONFIG_EPOLL
        struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
#endif
        unsigned long unix_inflight;        /* How many files in flight in unix sockets */
        atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */

        /* Hash table maintenance information */
        struct hlist_node uidhash_node;
        kuid_t uid;                /* NOTE(review): presumably the uid this entry accounts for - confirm */

        /* Only present when at least one of the options below is enabled. */
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
        defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
        defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD)
        atomic_long_t locked_vm;   /* NOTE(review): presumably locked memory charged to this user - confirm unit with callers */
#endif
#ifdef CONFIG_WATCH_QUEUE
        atomic_t nr_watches;        /* The number of watches this user currently has */
#endif

        /* Miscellaneous per-user rate limit */
        struct ratelimit_state ratelimit;
};

extern int uids_sysfs_init(void);

extern struct user_struct *find_user(kuid_t);

extern struct user_struct root_user;
#define INIT_USER (&root_user)


/* per-UID process charging. */
extern struct user_struct * alloc_uid(kuid_t);
static inline struct user_struct *get_uid(struct user_struct *u)
{
        refcount_inc(&u->__count);
        return u;
}
extern void free_uid(struct user_struct *);

#endif /* _LINUX_SCHED_USER_H */





























    1 

















































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
/* inflate.c -- zlib decompression
 * Copyright (C) 1995-2005 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Based on zlib 1.2.3 but modified for the Linux Kernel by
 * Richard Purdie <richard@openedhand.com>
 *
 * Changes mainly for static instead of dynamic memory allocation
 *
 */

#include <linux/zutil.h>
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"
#include "infutil.h"

/* architecture-specific bits */
#ifdef CONFIG_ZLIB_DFLTCC
#  include "../zlib_dfltcc/dfltcc_inflate.h"
#else
#define INFLATE_RESET_HOOK(strm) do {} while (0)
#define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0)
#define INFLATE_NEED_UPDATEWINDOW(strm) 1
#define INFLATE_NEED_CHECKSUM(strm) 1
#endif

/* Report the size in bytes of struct inflate_workspace. */
int zlib_inflate_workspacesize(void)
{
    const int ws_bytes = sizeof(struct inflate_workspace);

    return ws_bytes;
}

/*
 * Put an already initialised inflate stream back into its starting state:
 * totals cleared, decoder waiting for a header, bit accumulator and
 * sliding window emptied.  The window size is recomputed from the stored
 * wbits.
 *
 * Returns Z_STREAM_ERROR if strm or its state is NULL, Z_OK otherwise.
 */
int zlib_inflateReset(z_streamp strm)
{
    struct inflate_state *st;

    if (strm == NULL)
        return Z_STREAM_ERROR;
    st = (struct inflate_state *)strm->state;
    if (st == NULL)
        return Z_STREAM_ERROR;

    /* stream-level bookkeeping */
    st->total = 0;
    strm->total_out = 0;
    strm->total_in = 0;
    strm->msg = NULL;
    strm->adler = 1;        /* to support ill-conceived Java test suite */

    /* decoder state */
    st->mode = HEAD;
    st->last = 0;
    st->havedict = 0;
    st->dmax = 32768U;

    /* bit accumulator */
    st->hold = 0;
    st->bits = 0;

    /* all code table pointers start at the beginning of codes[] */
    st->next = st->codes;
    st->distcode = st->codes;
    st->lencode = st->codes;

    /* Initialise Window */
    st->wsize = 1U << st->wbits;
    st->write = 0;
    st->whave = 0;

    INFLATE_RESET_HOOK(strm);
    return Z_OK;
}

/*
 * Attach the inflate state embedded in the stream's workspace to strm and
 * initialise it.
 *
 * A negative windowBits sets wrap to 0 and its magnitude is used as the
 * window size; otherwise wrap is (windowBits >> 4) + 1.  After that the
 * window size must be in the range 8..15.
 *
 * Returns Z_STREAM_ERROR for a NULL strm or an out-of-range windowBits,
 * otherwise whatever zlib_inflateReset() returns.
 */
int zlib_inflateInit2(z_streamp strm, int windowBits)
{
    struct inflate_state *st;

    if (strm == NULL)
        return Z_STREAM_ERROR;

    strm->msg = NULL;                 /* in case we return an error */

    st = &WS(strm)->inflate_state;
    strm->state = (struct internal_state *)st;

    if (windowBits >= 0) {
        st->wrap = (windowBits >> 4) + 1;
    } else {
        st->wrap = 0;
        windowBits = -windowBits;
    }

    if (!(windowBits >= 8 && windowBits <= 15))
        return Z_STREAM_ERROR;

    st->wbits = (unsigned)windowBits;
#ifdef CONFIG_ZLIB_DFLTCC
    /*
     * DFLTCC requires the window to be page aligned.
     * Thus, we overallocate and take the aligned portion of the buffer.
     */
    st->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE);
#else
    st->window = &WS(strm)->working_window[0];
#endif

    return zlib_inflateReset(strm);
}

/*
   Return state with length and distance decoding tables and index sizes set to
   fixed code decoding.  This returns fixed tables from inffixed.h.
 */
static void zlib_fixedtables(struct inflate_state *state)
{
    /* inffixed.h supplies the lenfix and distfix tables used below */
#   include "inffixed.h"

    /* length decoding: fixed table, index size 9 */
    state->lenbits = 9;
    state->lencode = lenfix;

    /* distance decoding: fixed table, index size 5 */
    state->distbits = 5;
    state->distcode = distfix;
}


/*
   Update the window with the last wsize (normally 32K) bytes written before
   returning. This is only called when a window is already in use, or when
   output has been written during this inflate call, but the end of the deflate
   stream has not been reached yet. It is also called to window dictionary data
   when a dictionary is loaded.

   Providing output buffers larger than 32K to inflate() should provide a speed
   advantage, since only the last 32K of output is copied to the sliding window
   upon return from inflate(), and since all distances after the first 32K of
   output will fall in the output data, making match copies simpler and faster.
   The advantage may be dependent on the size of the processor's data caches.
 */
static void zlib_updatewindow(z_streamp strm, unsigned out)
{
    struct inflate_state *st = (struct inflate_state *)strm->state;
    /* number of bytes written since avail_out was equal to 'out' */
    unsigned produced = out - strm->avail_out;
    unsigned chunk;

    /* At least a whole window was produced: keep only the last wsize bytes. */
    if (produced >= st->wsize) {
        memcpy(st->window, strm->next_out - st->wsize, st->wsize);
        st->write = 0;
        st->whave = st->wsize;
        return;
    }

    /* First piece: from the write position towards the end of the window. */
    chunk = st->wsize - st->write;
    if (chunk > produced)
        chunk = produced;
    memcpy(st->window + st->write, strm->next_out - produced, chunk);
    produced -= chunk;

    /* Anything left over wraps around to the start of the window. */
    if (produced != 0) {
        memcpy(st->window, strm->next_out - produced, produced);
        st->write = produced;
        st->whave = st->wsize;
        return;
    }

    /* No wrap: advance the write position and grow whave up to wsize. */
    st->write += chunk;
    if (st->write == st->wsize)
        st->write = 0;
    if (st->whave < st->wsize)
        st->whave += chunk;
}


/*
 * At the end of a Deflate-compressed PPP packet, we expect to have seen
 * a `stored' block type value but not the (zero) length bytes.
 */
/*
   Returns Z_OK if inflate is currently at the end of a block generated by
   Z_SYNC_FLUSH or Z_FULL_FLUSH, and Z_DATA_ERROR otherwise (Z_STREAM_ERROR
   if the stream or its state is NULL). This function is used by one PPP
   implementation to provide an additional safety check. PPP uses
   Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored
   block. When decompressing, PPP checks that at the end of the input packet,
   inflate is waiting for these length bytes.
 */
static int zlib_inflateSyncPacket(z_streamp strm)
{
    struct inflate_state *st;

    if (strm == NULL)
        return Z_STREAM_ERROR;
    st = (struct inflate_state *)strm->state;
    if (st == NULL)
        return Z_STREAM_ERROR;

    /* Anything other than "in STORED mode with no bits held" is an error. */
    if (st->mode != STORED || st->bits != 0)
        return Z_DATA_ERROR;

    /* Move the decoder on to TYPE mode. */
    st->mode = TYPE;
    return Z_OK;
}

/* Macros for inflate(): */

/* Check value update.  This always expands to zlib_adler32(); there is no
   crc32() alternative behind this macro. */
#define UPDATE(check, buf, len) zlib_adler32(check, buf, len)

/* Load registers with state in inflate() for speed */
#define LOAD() \
    do { \
        put = strm->next_out; \
        left = strm->avail_out; \
        next = strm->next_in; \
        have = strm->avail_in; \
        hold = state->hold; \
        bits = state->bits; \
    } while (0)

/* Restore state from registers in inflate() */
#define RESTORE() \
    do { \
        strm->next_out = put; \
        strm->avail_out = left; \
        strm->next_in = next; \
        strm->avail_in = have; \
        state->hold = hold; \
        state->bits = bits; \
    } while (0)

/* Clear the input bit accumulator */
#define INITBITS() \
    do { \
        hold = 0; \
        bits = 0; \
    } while (0)

/* Get a byte of input into the bit accumulator, or return from inflate()
   if there is no input available. */
#define PULLBYTE() \
    do { \
        if (have == 0) goto inf_leave; \
        have--; \
        hold += (unsigned long)(*next++) << bits; \
        bits += 8; \
    } while (0)

/* Assure that there are at least n bits in the bit accumulator.  If there is
   not enough available input to do that, then return from inflate(). */
#define NEEDBITS(n) \
    do { \
        while (bits < (unsigned)(n)) \
            PULLBYTE(); \
    } while (0)

/* Return the low n bits of the bit accumulator (n < 16) */
#define BITS(n) \
    ((unsigned)hold & ((1U << (n)) - 1))

/* Remove n bits from the bit accumulator */
#define DROPBITS(n) \
    do { \
        hold >>= (n); \
        bits -= (unsigned)(n); \
    } while (0)

/* Remove zero to seven bits as needed to go to a byte boundary */
#define BYTEBITS() \
    do { \
        hold >>= bits & 7; \
        bits -= bits & 7; \
    } while (0)

/*
   inflate() uses a state machine to process as much input data and generate as
   much output data as possible before returning.  The state machine is
   structured roughly as follows:

    for (;;) switch (state) {
    ...
    case STATEn:
        if (not enough input data or output space to make progress)
            return;
        ... make progress ...
        state = STATEm;
        break;
    ...
    }

   so when inflate() is called again, the same case is attempted again, and
   if the appropriate resources are provided, the machine proceeds to the
   next state.  The NEEDBITS() macro is usually the way the state evaluates
   whether it can proceed or should return.  NEEDBITS() does the return if
   the requested bits are not available.  The typical use of the BITS macros
   is:

        NEEDBITS(n);
        ... do something with BITS(n) ...
        DROPBITS(n);

   where NEEDBITS(n) either returns from inflate() if there isn't enough
   input left to load n bits into the accumulator, or it continues.  BITS(n)
   gives the low n bits in the accumulator.  When done, DROPBITS(n) drops
   the low n bits off the accumulator.  INITBITS() clears the accumulator
   and sets the number of available bits to zero.  BYTEBITS() discards just
   enough bits to put the accumulator on a byte boundary.  After BYTEBITS()
   and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.

   NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
   if there is no input available.  The decoding of variable length codes uses
   PULLBYTE() directly in order to pull just enough bytes to decode the next
   code, and no more.

   Some states loop until they get enough input, making sure that enough
   state information is maintained to continue the loop where it left off
   if NEEDBITS() returns in the loop.  For example, want, need, and keep
   would all have to actually be part of the saved state in case NEEDBITS()
   returns:

    case STATEw:
        while (want < need) {
            NEEDBITS(n);
            keep[want++] = BITS(n);
            DROPBITS(n);
        }
        state = STATEx;
    case STATEx:

   As shown above, if the next state is also the next case, then the break
   is omitted.

   A state may also return if there is not enough output space available to
   complete that state.  Those states are copying stored data, writing a
   literal byte, and copying a matching string.

   When returning, a "goto inf_leave" is used to update the total counters,
   update the check value, and determine whether any progress has been made
   during that inflate() call in order to return the proper return code.
   Progress is defined as a change in either strm->avail_in or strm->avail_out.
   When there is a window, goto inf_leave will update the window with the last
   output written.  If a goto inf_leave occurs in the middle of decompression
   and there is no window currently, goto inf_leave will create one and copy
   output to the window for the next call of inflate().

   In this implementation, the flush parameter of inflate() only affects the
   return code (per zlib.h).  inflate() always writes as much as possible to
   strm->next_out, given the space available and the provided input--the effect
   documented in zlib.h of Z_SYNC_FLUSH.  Furthermore, inflate() always defers
   the allocation of and copying into a sliding window until necessary, which
   provides the effect documented in zlib.h for Z_FINISH when the entire input
   stream is available.  So the only thing the flush parameter actually does is:
   when flush is set to Z_FINISH, inflate() cannot return Z_OK.  Instead it
   will return Z_BUF_ERROR if it has not reached the end of the stream.
 */

/*
 * Decompress as much of strm->next_in into strm->next_out as the available
 * input and output space allow, resuming from the mode saved in strm->state.
 *
 * @strm:  stream; next_in/avail_in, next_out/avail_out, total_in/total_out,
 *         adler, msg and data_type are updated
 * @flush: Z_BLOCK stops at a block boundary (case TYPE), Z_PACKET_FLUSH
 *         hands the final result to zlib_inflateSyncPacket(), and Z_FINISH
 *         turns a would-be Z_OK into Z_BUF_ERROR
 *
 * Returns:
 *   Z_STREAM_ERROR  strm or strm->state is NULL, next_in is NULL while
 *                   avail_in is non-zero, or the saved mode is SYNC/unknown
 *   Z_NEED_DICT     mode DICT reached while state->havedict is 0
 *   Z_MEM_ERROR     saved mode is MEM
 *   Z_DATA_ERROR    mode BAD was reached (strm->msg names the reason)
 *   Z_STREAM_END    mode DONE was reached
 *   Z_BUF_ERROR     no input consumed and no output produced, or
 *                   flush == Z_FINISH, while the result would be Z_OK
 *   Z_OK            otherwise; for Z_PACKET_FLUSH with output space left
 *                   and all input consumed, the value of
 *                   zlib_inflateSyncPacket() is returned instead
 */
int zlib_inflate(z_streamp strm, int flush)
{
    struct inflate_state *state;
    const unsigned char *next;  /* next input */
    unsigned char *put;         /* next output */
    unsigned have, left;        /* available input and output */
    unsigned long hold;         /* bit buffer */
    unsigned bits;              /* bits in bit buffer */
    unsigned in, out;           /* save starting available input and output */
    unsigned copy;              /* number of stored or match bytes to copy */
    unsigned char *from;        /* where to copy match bytes from */
    code this;                  /* current decoding table entry */
    code last;                  /* parent table entry */
    unsigned len;               /* length to copy for repeats, bits to drop */
    int ret;                    /* return code */
    static const unsigned short order[19] = /* permutation of code lengths */
        {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};

    /* Do not check for strm->next_out == NULL here as ppc zImage
       inflates to strm->next_out = 0 */

    if (strm == NULL || strm->state == NULL ||
        (strm->next_in == NULL && strm->avail_in != 0))
        return Z_STREAM_ERROR;

    state = (struct inflate_state *)strm->state;

    /* Entering in TYPE: jump to TYPEDO so the Z_BLOCK test in case TYPE
       does not make this call return immediately */
    if (state->mode == TYPE) state->mode = TYPEDO;      /* skip check */
    LOAD();
    /* Remember the starting counts; progress is measured against them */
    in = have;
    out = left;
    ret = Z_OK;
    for (;;)
        switch (state->mode) {
        case HEAD:
            /* wrap == 0: no header is parsed, go straight to the blocks */
            if (state->wrap == 0) {
                state->mode = TYPEDO;
                break;
            }
            NEEDBITS(16);
            /* The two header bytes, read as a big-endian 16-bit value,
               must be a multiple of 31 */
            if (
                ((BITS(8) << 8) + (hold >> 8)) % 31) {
                strm->msg = (char *)"incorrect header check";
                state->mode = BAD;
                break;
            }
            if (BITS(4) != Z_DEFLATED) {
                strm->msg = (char *)"unknown compression method";
                state->mode = BAD;
                break;
            }
            DROPBITS(4);
            /* Next 4 bits encode log2(window size) - 8 */
            len = BITS(4) + 8;
            if (len > state->wbits) {
                strm->msg = (char *)"invalid window size";
                state->mode = BAD;
                break;
            }
            state->dmax = 1U << len;
            strm->adler = state->check = zlib_adler32(0L, NULL, 0);
            /* This header bit selects the dictionary-id path (DICTID) */
            state->mode = hold & 0x200 ? DICTID : TYPE;
            INITBITS();
            break;
        case DICTID:
            /* 32-bit value following the header, byte-swapped by REVERSE() */
            NEEDBITS(32);
            strm->adler = state->check = REVERSE(hold);
            INITBITS();
            state->mode = DICT;
            fallthrough;
        case DICT:
            /* Registers are written back before this early return so the
               caller sees consistent stream counters */
            if (state->havedict == 0) {
                RESTORE();
                return Z_NEED_DICT;
            }
            strm->adler = state->check = zlib_adler32(0L, NULL, 0);
            state->mode = TYPE;
            fallthrough;
        case TYPE:
            if (flush == Z_BLOCK) goto inf_leave;
            fallthrough;
        case TYPEDO:
            INFLATE_TYPEDO_HOOK(strm, flush);
            /* Previous block was flagged last: align and verify the check */
            if (state->last) {
                BYTEBITS();
                state->mode = CHECK;
                break;
            }
            /* 3-bit block header: 1 "last block" bit, then 2 type bits */
            NEEDBITS(3);
            state->last = BITS(1);
            DROPBITS(1);
            switch (BITS(2)) {
            case 0:                             /* stored block */
                state->mode = STORED;
                break;
            case 1:                             /* fixed block */
                zlib_fixedtables(state);
                state->mode = LEN;              /* decode codes */
                break;
            case 2:                             /* dynamic block */
                state->mode = TABLE;
                break;
            case 3:
                strm->msg = (char *)"invalid block type";
                state->mode = BAD;
            }
            DROPBITS(2);
            break;
        case STORED:
            BYTEBITS();                         /* go to byte boundary */
            NEEDBITS(32);
            /* Low 16 bits are the length; high 16 bits must be its
               one's complement */
            if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
                strm->msg = (char *)"invalid stored block lengths";
                state->mode = BAD;
                break;
            }
            state->length = (unsigned)hold & 0xffff;
            INITBITS();
            state->mode = COPY;
            fallthrough;
        case COPY:
            /* Copy as many stored bytes as input and output both allow;
               state->length tracks what is still owed */
            copy = state->length;
            if (copy) {
                if (copy > have) copy = have;
                if (copy > left) copy = left;
                if (copy == 0) goto inf_leave;
                memcpy(put, next, copy);
                have -= copy;
                next += copy;
                left -= copy;
                put += copy;
                state->length -= copy;
                break;
            }
            state->mode = TYPE;
            break;
        case TABLE:
            /* Dynamic block header: 5 + 5 + 4 bits of code counts */
            NEEDBITS(14);
            state->nlen = BITS(5) + 257;
            DROPBITS(5);
            state->ndist = BITS(5) + 1;
            DROPBITS(5);
            state->ncode = BITS(4) + 4;
            DROPBITS(4);
#ifndef PKZIP_BUG_WORKAROUND
            if (state->nlen > 286 || state->ndist > 30) {
                strm->msg = (char *)"too many length or distance symbols";
                state->mode = BAD;
                break;
            }
#endif
            state->have = 0;
            state->mode = LENLENS;
            fallthrough;
        case LENLENS:
            /* Read ncode 3-bit code-length code lengths, stored in the
               permuted order[]; state->have survives a NEEDBITS() exit */
            while (state->have < state->ncode) {
                NEEDBITS(3);
                state->lens[order[state->have++]] = (unsigned short)BITS(3);
                DROPBITS(3);
            }
            /* Lengths not transmitted are zero */
            while (state->have < 19)
                state->lens[order[state->have++]] = 0;
            state->next = state->codes;
            state->lencode = (code const *)(state->next);
            state->lenbits = 7;
            ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next),
                                &(state->lenbits), state->work);
            if (ret) {
                strm->msg = (char *)"invalid code lengths set";
                state->mode = BAD;
                break;
            }
            state->have = 0;
            state->mode = CODELENS;
            fallthrough;
        case CODELENS:
            /* Decode nlen + ndist code lengths using the table built above */
            while (state->have < state->nlen + state->ndist) {
                /* Pull bytes until the looked-up code fits in the bits held */
                for (;;) {
                    this = state->lencode[BITS(state->lenbits)];
                    if ((unsigned)(this.bits) <= bits) break;
                    PULLBYTE();
                }
                if (this.val < 16) {
                    /* Symbols 0..15 are literal code lengths */
                    NEEDBITS(this.bits);
                    DROPBITS(this.bits);
                    state->lens[state->have++] = this.val;
                }
                else {
                    if (this.val == 16) {
                        /* Repeat the previous length 3..6 times */
                        NEEDBITS(this.bits + 2);
                        DROPBITS(this.bits);
                        if (state->have == 0) {
                            strm->msg = (char *)"invalid bit length repeat";
                            state->mode = BAD;
                            break;
                        }
                        len = state->lens[state->have - 1];
                        copy = 3 + BITS(2);
                        DROPBITS(2);
                    }
                    else if (this.val == 17) {
                        /* Repeat a zero length 3..10 times */
                        NEEDBITS(this.bits + 3);
                        DROPBITS(this.bits);
                        len = 0;
                        copy = 3 + BITS(3);
                        DROPBITS(3);
                    }
                    else {
                        /* Repeat a zero length 11..138 times */
                        NEEDBITS(this.bits + 7);
                        DROPBITS(this.bits);
                        len = 0;
                        copy = 11 + BITS(7);
                        DROPBITS(7);
                    }
                    /* A repeat may not run past the declared symbol count */
                    if (state->have + copy > state->nlen + state->ndist) {
                        strm->msg = (char *)"invalid bit length repeat";
                        state->mode = BAD;
                        break;
                    }
                    while (copy--)
                        state->lens[state->have++] = (unsigned short)len;
                }
            }

            /* handle error breaks in while */
            if (state->mode == BAD) break;

            /* build code tables */
            state->next = state->codes;
            state->lencode = (code const *)(state->next);
            state->lenbits = 9;
            ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next),
                                &(state->lenbits), state->work);
            if (ret) {
                strm->msg = (char *)"invalid literal/lengths set";
                state->mode = BAD;
                break;
            }
            /* The distance table is built right after the literal/length
               table, from the lens[] entries that follow the first nlen */
            state->distcode = (code const *)(state->next);
            state->distbits = 6;
            ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist,
                            &(state->next), &(state->distbits), state->work);
            if (ret) {
                strm->msg = (char *)"invalid distances set";
                state->mode = BAD;
                break;
            }
            state->mode = LEN;
            fallthrough;
        case LEN:
            /* With at least 6 input bytes and 258 output bytes available,
               hand off to inflate_fast(); registers are synced around it */
            if (have >= 6 && left >= 258) {
                RESTORE();
                inflate_fast(strm, out);
                LOAD();
                break;
            }
            for (;;) {
                this = state->lencode[BITS(state->lenbits)];
                if ((unsigned)(this.bits) <= bits) break;
                PULLBYTE();
            }
            /* Non-zero op with a clear high nibble: entry links to a
               second-level table indexed by the following bits */
            if (this.op && (this.op & 0xf0) == 0) {
                last = this;
                for (;;) {
                    this = state->lencode[last.val +
                            (BITS(last.bits + last.op) >> last.bits)];
                    if ((unsigned)(last.bits + this.bits) <= bits) break;
                    PULLBYTE();
                }
                DROPBITS(last.bits);
            }
            DROPBITS(this.bits);
            state->length = (unsigned)this.val;
            /* op == 0: literal byte; op & 32: end of block; op & 64: invalid;
               otherwise a length code with (op & 15) extra bits */
            if ((int)(this.op) == 0) {
                state->mode = LIT;
                break;
            }
            if (this.op & 32) {
                state->mode = TYPE;
                break;
            }
            if (this.op & 64) {
                strm->msg = (char *)"invalid literal/length code";
                state->mode = BAD;
                break;
            }
            state->extra = (unsigned)(this.op) & 15;
            state->mode = LENEXT;
            fallthrough;
        case LENEXT:
            if (state->extra) {
                NEEDBITS(state->extra);
                state->length += BITS(state->extra);
                DROPBITS(state->extra);
            }
            state->mode = DIST;
            fallthrough;
        case DIST:
            /* Same two-level lookup as in LEN, against the distance table */
            for (;;) {
                this = state->distcode[BITS(state->distbits)];
                if ((unsigned)(this.bits) <= bits) break;
                PULLBYTE();
            }
            if ((this.op & 0xf0) == 0) {
                last = this;
                for (;;) {
                    this = state->distcode[last.val +
                            (BITS(last.bits + last.op) >> last.bits)];
                    if ((unsigned)(last.bits + this.bits) <= bits) break;
                    PULLBYTE();
                }
                DROPBITS(last.bits);
            }
            DROPBITS(this.bits);
            if (this.op & 64) {
                strm->msg = (char *)"invalid distance code";
                state->mode = BAD;
                break;
            }
            state->offset = (unsigned)this.val;
            state->extra = (unsigned)(this.op) & 15;
            state->mode = DISTEXT;
            fallthrough;
        case DISTEXT:
            if (state->extra) {
                NEEDBITS(state->extra);
                state->offset += BITS(state->extra);
                DROPBITS(state->extra);
            }
#ifdef INFLATE_STRICT
            if (state->offset > state->dmax) {
                strm->msg = (char *)"invalid distance too far back";
                state->mode = BAD;
                break;
            }
#endif
            /* The distance may not reach further back than the window
               contents plus the output written during this call */
            if (state->offset > state->whave + out - left) {
                strm->msg = (char *)"invalid distance too far back";
                state->mode = BAD;
                break;
            }
            state->mode = MATCH;
            fallthrough;
        case MATCH:
            if (left == 0) goto inf_leave;
            /* copy = bytes written to the output buffer in this call */
            copy = out - left;
            if (state->offset > copy) {         /* copy from window */
                copy = state->offset - copy;
                /* Source lies before the write position, possibly wrapped
                   around to the end of the circular window */
                if (copy > state->write) {
                    copy -= state->write;
                    from = state->window + (state->wsize - copy);
                }
                else
                    from = state->window + (state->write - copy);
                if (copy > state->length) copy = state->length;
            }
            else {                              /* copy from output */
                from = put - state->offset;
                copy = state->length;
            }
            if (copy > left) copy = left;
            left -= copy;
            state->length -= copy;
            /* Byte-by-byte so that an overlapping source and destination
               replicate the pattern */
            do {
                *put++ = *from++;
            } while (--copy);
            /* Stay in MATCH until the whole match length has been emitted */
            if (state->length == 0) state->mode = LEN;
            break;
        case LIT:
            if (left == 0) goto inf_leave;
            *put++ = (unsigned char)(state->length);
            left--;
            state->mode = LEN;
            break;
        case CHECK:
            if (state->wrap) {
                NEEDBITS(32);
                /* Fold the output produced so far into the totals and the
                   check value before comparing with the stored check */
                out -= left;
                strm->total_out += out;
                state->total += out;
                if (INFLATE_NEED_CHECKSUM(strm) && out)
                    strm->adler = state->check =
                        UPDATE(state->check, put - out, out);
                /* Reset the baseline so inf_leave does not count it twice */
                out = left;
                if ((
                     REVERSE(hold)) != state->check) {
                    strm->msg = (char *)"incorrect data check";
                    state->mode = BAD;
                    break;
                }
                INITBITS();
            }
            state->mode = DONE;
            fallthrough;
        case DONE:
            ret = Z_STREAM_END;
            goto inf_leave;
        case BAD:
            ret = Z_DATA_ERROR;
            goto inf_leave;
        case MEM:
            return Z_MEM_ERROR;
        case SYNC:
        default:
            return Z_STREAM_ERROR;
        }

    /*
       Return from inflate(), updating the total counts and the check value.
       If there was no progress during the inflate() call, return a buffer
       error.  Call zlib_updatewindow() to create and/or update the window state.
     */
  inf_leave:
    RESTORE();
    if (INFLATE_NEED_UPDATEWINDOW(strm) &&
            (state->wsize || (state->mode < CHECK && out != strm->avail_out)))
        zlib_updatewindow(strm, out);

    /* From here on, in and out hold the bytes consumed and produced */
    in -= strm->avail_in;
    out -= strm->avail_out;
    strm->total_in += in;
    strm->total_out += out;
    state->total += out;
    if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out)
        strm->adler = state->check =
            UPDATE(state->check, strm->next_out - out, out);

    /* data_type: pending bit count, +64 if in the last block, +128 if
       stopped at a block boundary (mode TYPE) */
    strm->data_type = state->bits + (state->last ? 64 : 0) +
                      (state->mode == TYPE ? 128 : 0);

    if (flush == Z_PACKET_FLUSH && ret == Z_OK &&
            strm->avail_out != 0 && strm->avail_in == 0)
                return zlib_inflateSyncPacket(strm);

    if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
        ret = Z_BUF_ERROR;

    return ret;
}

/*
 * Finish use of an inflate stream.  This implementation releases nothing;
 * it only validates the handle.
 *
 * Returns Z_OK when strm and strm->state are both non-NULL, otherwise
 * Z_STREAM_ERROR.
 */
int zlib_inflateEnd(z_streamp strm)
{
    return (strm != NULL && strm->state != NULL) ? Z_OK : Z_STREAM_ERROR;
}

/*
 * This subroutine adds the data at next_in/avail_in to the output history
 * without performing any output.  The output buffer must be "caught up";
 * i.e. no pending output but this should always be the case. The state must
 * be waiting on the start of a block (i.e. mode == TYPE or HEAD).  On exit,
 * the output will also be caught up, and the checksum will have been updated
 * if need be.
 */
int zlib_inflateIncomp(z_stream *z)
{
    struct inflate_state *state;
    Byte *saved_no;
    uInt saved_ao;

    /*
     * Validate the stream the same way zlib_inflate() does, instead of
     * dereferencing a NULL stream/state or copying from a NULL input
     * pointer.
     */
    if (z == NULL || z->state == NULL ||
        (z->next_in == NULL && z->avail_in != 0))
        return Z_STREAM_ERROR;

    state = (struct inflate_state *)z->state;

    /* Only legal at the start of a block (or before the header) */
    if (state->mode != TYPE && state->mode != HEAD)
        return Z_DATA_ERROR;

    saved_no = z->next_out;
    saved_ao = z->avail_out;

    /*
     * Setup some variables to allow misuse of updateWindow: pretend the
     * input bytes were just written as output, so that next_out points
     * just past them and avail_out (0) minus the "previous" avail_out
     * passed below yields avail_in bytes produced.
     */
    z->avail_out = 0;
    z->next_out = (unsigned char*)z->next_in + z->avail_in;

    zlib_updatewindow(z, z->avail_in);

    /* Restore saved variables */
    z->avail_out = saved_ao;
    z->next_out = saved_no;

    /* Fold the uncompressed bytes into the running check value */
    z->adler = state->check =
        UPDATE(state->check, z->next_in, z->avail_in);

    /* Account for the bytes as both consumed input and produced output */
    z->total_out += z->avail_in;
    z->total_in += z->avail_in;
    z->next_in += z->avail_in;
    state->total += z->avail_in;
    z->avail_in = 0;

    return Z_OK;
}

































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_NEIGHBOUR_H
#define _NET_NEIGHBOUR_H

#include <linux/neighbour.h>

/*
 *        Generic neighbour manipulation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *        Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
 *
 *         Changes:
 *
 *        Harald Welte:                <laforge@gnumonks.org>
 *                - Add neighbour cache statistics like rtstat
 */

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <linux/bitmap.h>

#include <linux/err.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/rtnetlink.h>
#include <net/neighbour_tables.h>

/*
 * NUD stands for "neighbor unreachability detection"
 */

/*
 * Convenience masks built from the individual NUD_* state bits.
 * NOTE(review): the names suggest "states with a running timer", "states
 * with a usable address" and "states considered connected" - confirm against
 * the neighbour state machine before relying on that reading.
 */
#define NUD_IN_TIMER        (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE)
#define NUD_VALID        (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY)
#define NUD_CONNECTED        (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE)

struct neighbour;

enum {
        NEIGH_VAR_MCAST_PROBES,
        NEIGH_VAR_UCAST_PROBES,
        NEIGH_VAR_APP_PROBES,
        NEIGH_VAR_MCAST_REPROBES,
        NEIGH_VAR_RETRANS_TIME,
        NEIGH_VAR_BASE_REACHABLE_TIME,
        NEIGH_VAR_DELAY_PROBE_TIME,
        NEIGH_VAR_INTERVAL_PROBE_TIME_MS,
        NEIGH_VAR_GC_STALETIME,
        NEIGH_VAR_QUEUE_LEN_BYTES,
        NEIGH_VAR_PROXY_QLEN,
        NEIGH_VAR_ANYCAST_DELAY,
        NEIGH_VAR_PROXY_DELAY,
        NEIGH_VAR_LOCKTIME,
#define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1)
        /* Following are used as a second way to access one of the above */
        NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */
        NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */
        NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */
        /* Following are used by "default" only */
        NEIGH_VAR_GC_INTERVAL,
        NEIGH_VAR_GC_THRESH1,
        NEIGH_VAR_GC_THRESH2,
        NEIGH_VAR_GC_THRESH3,
        NEIGH_VAR_MAX
};

/*
 * A set of neighbour tunables.
 * NOTE(review): presumably one instance per (table, device) plus the table
 * default embedded in struct neigh_table - confirm against the users.
 */
struct neigh_parms {
        possible_net_t net;
        struct net_device *dev;
        netdevice_tracker dev_tracker;
        struct list_head list;
        /* Optional per-neighbour setup hook */
        int        (*neigh_setup)(struct neighbour *);
        /* Table these parameters belong to */
        struct neigh_table *tbl;

        void        *sysctl_table;

        int dead;
        refcount_t refcnt;
        struct rcu_head rcu_head;

        int        reachable_time;
        u32        qlen;
        /* Tunable values, indexed by NEIGH_VAR_* (below NEIGH_VAR_DATA_MAX) */
        int        data[NEIGH_VAR_DATA_MAX];
        /* One bit per data[] slot; set by neigh_var_set() when it writes it */
        DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX);
};

/*
 * Store @val in tunable slot @index of @p and flag that slot in
 * p->data_state.  The flag is raised before the value is written.
 */
static inline void neigh_var_set(struct neigh_parms *p, int index, int val)
{
        int *slot = &p->data[index];

        set_bit(index, p->data_state);
        *slot = val;
}

/*
 * NEIGH_VAR(p, attr) names the data[] slot for NEIGH_VAR_<attr>; it is an
 * lvalue and can be read or assigned directly.
 */
#define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr])

/* In ndo_neigh_setup, NEIGH_VAR_INIT should be used.
 * In other cases, NEIGH_VAR_SET should be used.
 *
 * NEIGH_VAR_INIT only stores the value; NEIGH_VAR_SET goes through
 * neigh_var_set() and therefore also sets the slot's bit in data_state.
 */
#define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val)
#define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val)

/* Set the data_state bit of every data[] slot of @p. */
static inline void neigh_parms_data_state_setall(struct neigh_parms *p)
{
        unsigned long *state_bits = p->data_state;

        bitmap_fill(state_bits, NEIGH_VAR_DATA_MAX);
}

/* Clear the data_state bit of every data[] slot of @p. */
static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p)
{
        unsigned long *state_bits = p->data_state;

        bitmap_zero(state_bits, NEIGH_VAR_DATA_MAX);
}

/*
 * Neighbour cache event counters.  They are bumped through
 * NEIGH_CACHE_STAT_INC(), which uses this_cpu_inc() on tbl->stats.
 */
struct neigh_statistics {
        unsigned long allocs;                /* number of allocated neighs */
        unsigned long destroys;                /* number of destroyed neighs */
        unsigned long hash_grows;        /* number of hash resizes */

        unsigned long res_failed;        /* number of failed resolutions */

        unsigned long lookups;                /* number of lookups */
        unsigned long hits;                /* number of hits (among lookups) */

        unsigned long rcv_probes_mcast;        /* number of received mcast ipv6 */
        unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */

        unsigned long periodic_gc_runs;        /* number of periodic GC runs */
        unsigned long forced_gc_runs;        /* number of forced GC runs */

        unsigned long unres_discards;        /* number of unresolved drops */
        unsigned long table_fulls;      /* times even gc couldn't help */
};

/* Increment counter @field of @tbl's statistics on the current CPU. */
#define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field)

/*
 * One neighbour cache entry.  The lookup key is stored inline in the
 * trailing primary_key[] flexible array, so entries are variable-sized.
 * Field order is not fixed (__randomize_layout).
 */
struct neighbour {
        struct hlist_node        hash;
        struct hlist_node        dev_list;
        /* Owning table and the tunables that apply to this entry */
        struct neigh_table        *tbl;
        struct neigh_parms        *parms;
        unsigned long                confirmed;
        unsigned long                updated;
        rwlock_t                lock;
        refcount_t                refcnt;
        unsigned int                arp_queue_len_bytes;
        struct sk_buff_head        arp_queue;
        struct timer_list        timer;
        unsigned long                used;
        atomic_t                probes;
        /* NOTE(review): presumably a NUD_* state bit - confirm */
        u8                        nud_state;
        u8                        type;
        u8                        dead;
        u8                        protocol;
        u32                        flags;
        /* NOTE(review): presumably guards ha[] - confirm against users */
        seqlock_t                ha_lock;
        /* MAX_ADDR_LEN rounded up to a multiple of sizeof(unsigned long) */
        unsigned char                ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8);
        struct hh_cache                hh;
        /* Current transmit function for packets sent to this neighbour */
        int                        (*output)(struct neighbour *, struct sk_buff *);
        const struct neigh_ops        *ops;
        struct list_head        gc_list;
        struct list_head        managed_list;
        struct rcu_head                rcu;
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;
        /* Variable-length key; must remain the last member */
        u8                        primary_key[];
} __randomize_layout;

/*
 * Per-protocol operations attached to a neighbour through neighbour::ops.
 * NOTE(review): the exact calling context of each hook is not visible here;
 * the member names are the only guide - confirm against the callers.
 */
struct neigh_ops {
        int                        family;
        void                        (*solicit)(struct neighbour *, struct sk_buff *);
        void                        (*error_report)(struct neighbour *, struct sk_buff *);
        int                        (*output)(struct neighbour *, struct sk_buff *);
        int                        (*connected_output)(struct neighbour *, struct sk_buff *);
};

/*
 * Entry kept in neigh_table::phash_buckets (presumably a proxy neighbour
 * entry - confirm against net/core/neighbour.c).  Entries are chained
 * through the RCU-annotated @next pointer and carry their key in the
 * trailing flexible array @key.
 */
struct pneigh_entry {
        struct pneigh_entry        __rcu *next;
        /* network namespace, read through pneigh_net() */
        possible_net_t                net;
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;
        /* the two members share storage, so only one is in use at a time */
        union {
                struct list_head        free_node;
                struct rcu_head                rcu;
        };
        u32                        flags;
        u8                        protocol;
        bool                        permanent;
        u32                        key[];
};

/*
 *        neighbour table manipulation
 */

/* Number of random words in neigh_hash_table::hash_rnd. */
#define NEIGH_NUM_HASH_RND        4

/*
 * Hash table of neighbour entries, published through the RCU pointer
 * neigh_table::nht.
 */
struct neigh_hash_table {
        /* bucket array, indexed as in ___neigh_lookup_noref() */
        struct hlist_head        *hash_heads;
        /* bucket index is the 32-bit hash shifted right by (32 - hash_shift) */
        unsigned int                hash_shift;
        /* random words handed to the table's hash callback */
        __u32                        hash_rnd[NEIGH_NUM_HASH_RND];
        struct rcu_head                rcu;
};


/*
 * Per-protocol neighbour table: callbacks, tunables, the RCU-published hash
 * table (@nht) and per-CPU statistics (@stats).
 */
struct neigh_table {
        /* returned by neigh_parms_family() */
        int                        family;
        /* byte offset from an entry to its private area, see neighbour_priv() */
        unsigned int                entry_size;
        unsigned int                key_len;
        __be16                        protocol;
        /* hash callback used by __neigh_lookup_noref() */
        __u32                        (*hash)(const void *pkey,
                                        const struct net_device *dev,
                                        __u32 *hash_rnd);
        /* key comparison callback used by __neigh_lookup_noref() */
        bool                        (*key_eq)(const struct neighbour *, const void *pkey);
        int                        (*constructor)(struct neighbour *);
        int                        (*pconstructor)(struct pneigh_entry *);
        void                        (*pdestructor)(struct pneigh_entry *);
        void                        (*proxy_redo)(struct sk_buff *skb);
        int                        (*is_multicast)(const void *pkey);
        bool                        (*allow_add)(const struct net_device *dev,
                                             struct netlink_ext_ack *extack);
        char                        *id;
        struct neigh_parms        parms;
        struct list_head        parms_list;
        int                        gc_interval;
        int                        gc_thresh1;
        int                        gc_thresh2;
        int                        gc_thresh3;
        unsigned long                last_flush;
        struct delayed_work        gc_work;
        struct delayed_work        managed_work;
        struct timer_list         proxy_timer;
        struct sk_buff_head        proxy_queue;
        atomic_t                entries;
        atomic_t                gc_entries;
        struct list_head        gc_list;
        struct list_head        managed_list;
        rwlock_t                lock;
        unsigned long                last_rand;
        /* per-CPU counters, bumped through NEIGH_CACHE_STAT_INC() */
        struct neigh_statistics        __percpu *stats;
        /* current hash table; readers use rcu_dereference() */
        struct neigh_hash_table __rcu *nht;
        struct mutex                phash_lock;
        struct pneigh_entry        __rcu **phash_buckets;
};

static inline int neigh_parms_family(struct neigh_parms *p)
{
        return p->tbl->family;
}

/* Alignment applied to entry sizes by NEIGH_ENTRY_SIZE(). */
#define NEIGH_PRIV_ALIGN        sizeof(long long)
/* Round @size up to a multiple of NEIGH_PRIV_ALIGN. */
#define NEIGH_ENTRY_SIZE(size)        ALIGN((size), NEIGH_PRIV_ALIGN)

/**
 * neighbour_priv - locate the private area that follows a neighbour entry
 * @n: neighbour entry; @n->tbl is dereferenced unconditionally
 *
 * Return: the address @n->tbl->entry_size bytes past the start of @n.
 */
static inline void *neighbour_priv(const struct neighbour *n)
{
        const unsigned int offset = n->tbl->entry_size;
        char *base = (char *)n;

        return base + offset;
}

/* flags for neigh_update() */
#define NEIGH_UPDATE_F_OVERRIDE                        BIT(0)
#define NEIGH_UPDATE_F_WEAK_OVERRIDE                BIT(1)
#define NEIGH_UPDATE_F_OVERRIDE_ISROUTER        BIT(2)
#define NEIGH_UPDATE_F_USE                        BIT(3)
#define NEIGH_UPDATE_F_MANAGED                        BIT(4)
#define NEIGH_UPDATE_F_EXT_LEARNED                BIT(5)
#define NEIGH_UPDATE_F_ISROUTER                        BIT(6)
#define NEIGH_UPDATE_F_ADMIN                        BIT(7)
#define NEIGH_UPDATE_F_EXT_VALIDATED                BIT(8)

/*
 * In-kernel representation for NDA_FLAGS_EXT flags:
 *
 * The low byte (NTF_OLD_MASK) is kept for the non-extended flags; each
 * extended flag NTF_EXT_<x> is stored shifted left by NTF_EXT_SHIFT bits.
 */
#define NTF_OLD_MASK                0xff
#define NTF_EXT_SHIFT                8
#define NTF_EXT_MASK                (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED)

#define NTF_MANAGED                (NTF_EXT_MANAGED << NTF_EXT_SHIFT)
#define NTF_EXT_VALIDATED        (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT)

extern const struct nla_policy nda_policy[];

/*
 * Iterate over the neighbours of one hash bucket.  All three helpers walk
 * the hlist at @head through the "hash" member of struct neighbour; they
 * only differ in the underlying hlist iterator (plain, RCU, or safe against
 * removal using @tmp).
 */
#define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash)
#define neigh_for_each_in_bucket_rcu(pos, head) \
        hlist_for_each_entry_rcu(pos, head, hash)
#define neigh_for_each_in_bucket_safe(pos, tmp, head) \
        hlist_for_each_entry_safe(pos, tmp, head, hash)

/**
 * neigh_key_eq32 - compare a 32-bit key against a neighbour's primary key
 * @n: neighbour whose @primary_key is read as one u32
 * @pkey: key to compare, read as one u32
 *
 * Return: true when both 32-bit words are equal.
 */
static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey)
{
        const u32 *stored = (const u32 *)n->primary_key;
        const u32 *wanted = (const u32 *)pkey;

        return *stored == *wanted;
}

/**
 * neigh_key_eq128 - compare a 128-bit key against a neighbour's primary key
 * @n: neighbour whose @primary_key is read as four u32 words
 * @pkey: key to compare, read as four u32 words
 *
 * All four words are always examined; the differences are OR-ed together
 * instead of bailing out on the first mismatch.
 *
 * Return: true when all four 32-bit words are equal.
 */
static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey)
{
        const u32 *stored = (const u32 *)n->primary_key;
        const u32 *wanted = pkey;
        u32 diff = 0;
        unsigned int i;

        for (i = 0; i < 4; i++)
                diff |= stored[i] ^ wanted[i];

        return diff == 0;
}

/**
 * ___neigh_lookup_noref - find a neighbour without touching its refcount
 * @tbl: table whose current hash table (tbl->nht) is searched
 * @key_eq: key comparison callback
 * @hash: hash callback; receives @pkey, @dev and the table's hash_rnd words
 * @pkey: key to look up
 * @dev: device the entry must belong to
 *
 * tbl->nht is fetched with rcu_dereference() and the bucket is walked with
 * the RCU list iterator.  The bucket index is the 32-bit hash shifted right
 * by (32 - hash_shift).
 *
 * Return: the first entry in the bucket whose device is @dev and whose key
 * matches according to @key_eq, or NULL if there is none.
 */
static inline struct neighbour *___neigh_lookup_noref(
        struct neigh_table *tbl,
        bool (*key_eq)(const struct neighbour *n, const void *pkey),
        __u32 (*hash)(const void *pkey,
                      const struct net_device *dev,
                      __u32 *hash_rnd),
        const void *pkey,
        struct net_device *dev)
{
        struct neigh_hash_table *nht = rcu_dereference(tbl->nht);
        struct neighbour *n;
        u32 bucket;

        bucket = hash(pkey, dev, nht->hash_rnd);
        bucket >>= 32 - nht->hash_shift;

        neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[bucket]) {
                if (n->dev != dev)
                        continue;
                if (key_eq(n, pkey))
                        return n;
        }

        return NULL;
}

/**
 * __neigh_lookup_noref - ___neigh_lookup_noref() with the table's own callbacks
 * @tbl: table to search; supplies tbl->key_eq and tbl->hash
 * @pkey: key to look up
 * @dev: device the entry must belong to
 *
 * Return: the matching entry or NULL, as returned by ___neigh_lookup_noref().
 */
static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl,
                                                     const void *pkey,
                                                     struct net_device *dev)
{
        struct neighbour *found;

        found = ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev);
        return found;
}

/**
 * neigh_confirm - stamp a neighbour's @confirmed field with the current jiffies
 * @n: neighbour to stamp; may be NULL, in which case nothing happens
 *
 * The store is skipped when @confirmed already holds the current value, so
 * the entry is not written to needlessly.
 */
static inline void neigh_confirm(struct neighbour *n)
{
        unsigned long now;

        if (!n)
                return;

        now = jiffies;

        /* avoid dirtying neighbour */
        if (READ_ONCE(n->confirmed) == now)
                return;

        WRITE_ONCE(n->confirmed, now);
}

void neigh_table_init(int index, struct neigh_table *tbl);
int neigh_table_clear(int index, struct neigh_table *tbl);
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
                               struct net_device *dev);
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
                                 struct net_device *dev, bool want_ref);
/**
 * neigh_create - __neigh_create() with want_ref set to true
 * @tbl: table to create the entry in
 * @pkey: key of the new entry
 * @dev: device of the new entry
 *
 * Return: whatever __neigh_create() returns.
 */
static inline struct neighbour *neigh_create(struct neigh_table *tbl,
                                             const void *pkey,
                                             struct net_device *dev)
{
        struct neighbour *created;

        created = __neigh_create(tbl, pkey, dev, /* want_ref */ true);
        return created;
}
void neigh_destroy(struct neighbour *neigh);
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
                       const bool immediate_ok);
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags,
                 u32 nlmsg_pid);
void __neigh_set_probe_once(struct neighbour *neigh);
bool neigh_remove_one(struct neighbour *ndel);
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev);
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb);
int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb);
int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb);
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
                                                u8 *lladdr, void *saddr,
                                                struct net_device *dev);

struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
                                      struct neigh_table *tbl);
void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms);

/**
 * neigh_parms_net - network namespace recorded in a parameter block
 * @parms: neighbour parameters
 *
 * Return: the result of read_pnet() on @parms->net.
 */
static inline
struct net *neigh_parms_net(const struct neigh_parms *parms)
{
        struct net *net = read_pnet(&parms->net);

        return net;
}

unsigned long neigh_rand_reach_time(unsigned long base);

void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
                    struct sk_buff *skb);
struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
                                   const void *key, struct net_device *dev);
int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key,
                  struct net_device *dev, u32 flags, u8 protocol,
                  bool permanent);
int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key,
                  struct net_device *dev);

/**
 * pneigh_net - network namespace recorded in a pneigh entry
 * @pneigh: entry to query
 *
 * Return: the result of read_pnet() on @pneigh->net.
 */
static inline struct net *pneigh_net(const struct pneigh_entry *pneigh)
{
        struct net *net = read_pnet(&pneigh->net);

        return net;
}

void neigh_app_ns(struct neighbour *n);
void neigh_for_each(struct neigh_table *tbl,
                    void (*cb)(struct neighbour *, void *), void *cookie);
void __neigh_for_each_release(struct neigh_table *tbl,
                              int (*cb)(struct neighbour *));
int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *);

/*
 * Iterator state for the neigh_seq_start()/neigh_seq_next()/neigh_seq_stop()
 * seq_file helpers declared below.
 */
struct neigh_seq_state {
        struct seq_net_private p;
        struct neigh_table *tbl;
        struct neigh_hash_table *nht;
        /* optional per-neighbour sub-iterator callback */
        void *(*neigh_sub_iter)(struct neigh_seq_state *state,
                                struct neighbour *n, loff_t *pos);
        unsigned int bucket;
        /* bitmask of the NEIGH_SEQ_* values defined right below */
        unsigned int flags;
#define NEIGH_SEQ_NEIGH_ONLY        0x00000001
#define NEIGH_SEQ_IS_PNEIGH        0x00000002
#define NEIGH_SEQ_SKIP_NOARP        0x00000004
};
void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *,
                      unsigned int);
void *neigh_seq_next(struct seq_file *, void *, loff_t *);
void neigh_seq_stop(struct seq_file *, void *);

int neigh_proc_dointvec(const struct ctl_table *ctl, int write,
                        void *buffer, size_t *lenp, loff_t *ppos);
int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write,
                                void *buffer,
                                size_t *lenp, loff_t *ppos);
int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write,
                                   void *buffer, size_t *lenp, loff_t *ppos);

int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
                          proc_handler *proc_handler);
void neigh_sysctl_unregister(struct neigh_parms *p);

/**
 * __neigh_parms_put - drop one reference on a parameter block
 * @parms: parameter block whose @refcnt is decremented
 *
 * Only decrements the counter with refcount_dec(); nothing is released here.
 */
static inline void __neigh_parms_put(struct neigh_parms *parms)
{
        refcount_dec(&parms->refcnt);
}

/**
 * neigh_parms_clone - take an additional reference on a parameter block
 * @parms: parameter block; dereferenced unconditionally
 *
 * Return: @parms, after its @refcnt has been incremented.
 */
static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms)
{
        struct neigh_parms *held = parms;

        refcount_inc(&held->refcnt);

        return held;
}

/*
 *        Neighbour references
 */

/**
 * neigh_release - drop a reference on a neighbour
 * @neigh: neighbour to release; dereferenced unconditionally
 *
 * Calls neigh_destroy() when the reference count drops to zero.
 */
static inline void neigh_release(struct neighbour *neigh)
{
        if (!refcount_dec_and_test(&neigh->refcnt))
                return;

        neigh_destroy(neigh);
}

/**
 * neigh_clone - take a reference on a neighbour, tolerating NULL
 * @neigh: neighbour to reference, or NULL
 *
 * Return: @neigh unchanged; its @refcnt is incremented when it is non-NULL.
 */
static inline struct neighbour * neigh_clone(struct neighbour *neigh)
{
        if (!neigh)
                return neigh;

        refcount_inc(&neigh->refcnt);
        return neigh;
}

/* Take a reference on neighbour @n; unlike neigh_clone(), @n is not NULL-checked. */
#define neigh_hold(n)        refcount_inc(&(n)->refcnt)

/**
 * neigh_event_send_probe - mark a neighbour as used and kick resolution if needed
 * @neigh: neighbour being used
 * @skb: packet forwarded to __neigh_event_send() when that call is made
 * @immediate_ok: forwarded unchanged to __neigh_event_send()
 *
 * @neigh->used is refreshed to the current jiffies (the store is skipped if
 * it already matches).  __neigh_event_send() is only called when the state
 * has none of NUD_CONNECTED, NUD_DELAY or NUD_PROBE set.
 *
 * Return: 0 when no call was needed, else the value of __neigh_event_send().
 */
static __always_inline int neigh_event_send_probe(struct neighbour *neigh,
                                                  struct sk_buff *skb,
                                                  const bool immediate_ok)
{
        const unsigned long now = jiffies;
        unsigned int state;

        if (READ_ONCE(neigh->used) != now)
                WRITE_ONCE(neigh->used, now);

        state = READ_ONCE(neigh->nud_state);
        if (state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
                return 0;

        return __neigh_event_send(neigh, skb, immediate_ok);
}

/**
 * neigh_event_send - neigh_event_send_probe() with immediate_ok set to true
 * @neigh: neighbour being used
 * @skb: packet passed through
 *
 * Return: the value of neigh_event_send_probe().
 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
        int rc;

        rc = neigh_event_send_probe(neigh, skb, /* immediate_ok */ true);
        return rc;
}

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/**
 * neigh_hh_bridge - copy part of a cached header in front of skb->data
 * @hh: cached header; read under the hh->hh_lock seqlock
 * @skb: packet receiving the bytes just below skb->data
 *
 * Copies ETH_ALEN + HH_DATA_ALIGN(ETH_HLEN) - ETH_HLEN bytes of hh->hh_data
 * to skb->data - HH_DATA_ALIGN(ETH_HLEN) and repeats the copy whenever the
 * seqlock reports a concurrent writer.  The skb length is not modified.
 *
 * Return: always 0.
 */
static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
{
        const unsigned int hh_alen = HH_DATA_ALIGN(ETH_HLEN);
        const unsigned int copy_len = ETH_ALEN + hh_alen - ETH_HLEN;
        unsigned int seq;

        for (;;) {
                seq = read_seqbegin(&hh->hh_lock);
                memcpy(skb->data - hh_alen, hh->hh_data, copy_len);
                if (!read_seqretry(&hh->hh_lock, seq))
                        break;
        }

        return 0;
}
#endif

/*
 * neigh_hh_output - prepend the cached hardware header to @skb and transmit it.
 *
 * The header is copied inside a read section of the hh->hh_lock seqlock and
 * the whole section is repeated if a writer interfered.  The copy itself is
 * only attempted when the skb has at least the aligned header length of
 * headroom.  The same headroom condition is checked again after the loop:
 * if it does not hold, a one-time warning is issued, the skb is freed and
 * NET_XMIT_DROP is returned.  Otherwise hh_len bytes are pushed onto the skb
 * and the result of dev_queue_xmit() is returned.
 */
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
        unsigned int hh_alen = 0;
        unsigned int seq;
        unsigned int hh_len;

        do {
                seq = read_seqbegin(&hh->hh_lock);
                hh_len = READ_ONCE(hh->hh_len);
                if (likely(hh_len <= HH_DATA_MOD)) {
                        /* short header: always copy a fixed HH_DATA_MOD bytes */
                        hh_alen = HH_DATA_MOD;

                        /* skb_push() would proceed silently if we have room for
                         * the unaligned size but not for the aligned size:
                         * check headroom explicitly.
                         */
                        if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
                                /* this is inlined by gcc */
                                memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
                                       HH_DATA_MOD);
                        }
                } else {
                        /* longer header: copy the length rounded by HH_DATA_ALIGN() */
                        hh_alen = HH_DATA_ALIGN(hh_len);

                        if (likely(skb_headroom(skb) >= hh_alen)) {
                                memcpy(skb->data - hh_alen, hh->hh_data,
                                       hh_alen);
                        }
                }
        } while (read_seqretry(&hh->hh_lock, seq));

        /* the copy above was skipped in this case, so the skb cannot be sent */
        if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) {
                kfree_skb(skb);
                return NET_XMIT_DROP;
        }

        /* only hh_len bytes become part of the packet, although hh_alen were copied */
        __skb_push(skb, hh_len);
        return dev_queue_xmit(skb);
}

/**
 * neigh_output - transmit @skb through a neighbour
 * @n: neighbour to send through
 * @skb: packet to send
 * @skip_cache: when true, the cached-header path is never taken
 *
 * The cached-header path (neigh_hh_output()) is used only if @skip_cache is
 * false, the state has NUD_CONNECTED set and hh_len is non-zero.  Otherwise
 * the neighbour's @output callback is invoked.
 *
 * Return: the value of whichever of the two paths was taken.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
                               bool skip_cache)
{
        const struct hh_cache *hh = &n->hh;

        /* n->nud_state and hh->hh_len could be changed under us.
         * neigh_hh_output() is taking care of the race later.
         */
        if (!skip_cache) {
                if (READ_ONCE(n->nud_state) & NUD_CONNECTED) {
                        if (READ_ONCE(hh->hh_len))
                                return neigh_hh_output(hh, skb);
                }
        }

        return READ_ONCE(n->output)(n, skb);
}

/**
 * __neigh_lookup - look up a neighbour, optionally creating it
 * @tbl: table to search
 * @pkey: key to look up
 * @dev: device of the entry
 * @creat: when non-zero, a missing entry is created with neigh_create()
 *
 * Return: the entry found or created; NULL when nothing was found and
 * @creat is zero, or when neigh_create() returned an error pointer.
 */
static inline struct neighbour *
__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)
{
        struct neighbour *n;

        n = neigh_lookup(tbl, pkey, dev);
        if (n)
                return n;
        if (!creat)
                return n;

        n = neigh_create(tbl, pkey, dev);
        if (IS_ERR(n))
                return NULL;

        return n;
}

/**
 * __neigh_lookup_errno - look up a neighbour, creating it when missing
 * @tbl: table to search
 * @pkey: key to look up
 * @dev: device of the entry
 *
 * Return: the entry from neigh_lookup() when one exists; otherwise the
 * unmodified return value of neigh_create().
 */
static inline struct neighbour *
__neigh_lookup_errno(struct neigh_table *tbl, const void *pkey,
  struct net_device *dev)
{
        struct neighbour *n;

        n = neigh_lookup(tbl, pkey, dev);
        if (!n)
                n = neigh_create(tbl, pkey, dev);

        return n;
}

/* Per-skb data overlaid on skb->cb; access it through NEIGH_CB(). */
struct neighbour_cb {
        unsigned long sched_next;
        unsigned int flags;
};

/* NOTE(review): presumably a bit for neighbour_cb::flags - confirm at the users. */
#define LOCALLY_ENQUEUED 0x1

/* Reinterpret the control buffer of @skb as a struct neighbour_cb. */
#define NEIGH_CB(skb)        ((struct neighbour_cb *)(skb)->cb)

/**
 * neigh_ha_snapshot - copy a neighbour's hardware address consistently
 * @dst: destination buffer; receives @dev->addr_len bytes
 * @n: neighbour whose @ha is copied
 * @dev: device supplying the address length
 *
 * The copy runs inside a read section of the n->ha_lock seqlock and is
 * repeated until no concurrent writer was detected.
 */
static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
                                     const struct net_device *dev)
{
        unsigned int seq;

        for (;;) {
                seq = read_seqbegin(&n->ha_lock);
                memcpy(dst, n->ha, dev->addr_len);
                if (!read_seqretry(&n->ha_lock, seq))
                        break;
        }
}

/**
 * neigh_update_is_router - sync NTF_ROUTER in a neighbour with update flags
 * @neigh: neighbour whose @flags may be modified
 * @flags: NEIGH_UPDATE_F_* flags; only NEIGH_UPDATE_F_ISROUTER is examined
 * @notify: set to 1 when NTF_ROUTER was changed, otherwise left untouched
 *
 * NTF_ROUTER is set in @neigh->flags when NEIGH_UPDATE_F_ISROUTER is present
 * in @flags and cleared when it is absent.
 */
static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags,
                                          int *notify)
{
        u8 ndm_flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0;

        /* nothing to do when the router bit already has the wanted value */
        if (!((neigh->flags ^ ndm_flags) & NTF_ROUTER))
                return;

        if (ndm_flags & NTF_ROUTER)
                neigh->flags |= NTF_ROUTER;
        else
                neigh->flags &= ~NTF_ROUTER;

        *notify = 1;
}
#endif





































































































  150 













   54 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUFEATURE_H
#define _ASM_X86_CPUFEATURE_H

#include <asm/processor.h>

#if defined(__KERNEL__) && !defined(__ASSEMBLER__)

#include <asm/asm.h>
#include <linux/bitops.h>
#include <asm/alternative.h>
#include <asm/cpufeaturemasks.h>

/*
 * Indices of the capability words.  CPUID_1_EDX is explicitly 0 and every
 * following enumerator takes the next value, so NR_CPUID_WORDS ends up as
 * the number of words listed.  The numeric values are implied purely by
 * position: inserting or reordering entries renumbers everything after.
 */
enum cpuid_leafs
{
        CPUID_1_EDX                = 0,
        CPUID_8000_0001_EDX,
        CPUID_8086_0001_EDX,
        CPUID_LNX_1,
        CPUID_1_ECX,
        CPUID_C000_0001_EDX,
        CPUID_8000_0001_ECX,
        CPUID_LNX_2,
        CPUID_LNX_3,
        CPUID_7_0_EBX,
        CPUID_D_1_EAX,
        CPUID_LNX_4,
        CPUID_7_1_EAX,
        CPUID_8000_0008_EBX,
        CPUID_6_EAX,
        CPUID_8000_000A_EDX,
        CPUID_7_ECX,
        CPUID_8000_0007_EBX,
        CPUID_7_EDX,
        CPUID_8000_001F_EAX,
        CPUID_8000_0021_EAX,
        CPUID_LNX_5,
        NR_CPUID_WORDS,
};

extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];

/*
 * In order to save room, we index into this array by doing
 * X86_BUG_<name> - NCAPINTS*32.
 */
extern const char * const x86_bug_flags[NBUGINTS*32];
#define x86_bug_flag(flag) x86_bug_flags[flag]

/* Test @bit in the x86_capability bitmap of @c; always a runtime test. */
#define test_cpu_cap(c, bit)                                                \
         arch_test_bit(bit, (unsigned long *)((c)->x86_capability))

/*
 * Like test_cpu_cap(), but evaluates to 1 without a runtime test when @bit
 * is a compile-time constant for which REQUIRED_MASK_BIT_SET() holds.
 */
#define cpu_has(c, bit)                                                        \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         test_cpu_cap(c, bit))

/*
 * Same shortcut as cpu_has(); the runtime test goes through
 * x86_this_cpu_test_bit() on cpu_info.x86_capability instead.
 */
#define this_cpu_has(bit)                                                \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         x86_this_cpu_test_bit(bit, cpu_info.x86_capability))

/*
 * This is the default CPU features testing macro to use in code.
 *
 * It is for detection of features which need kernel infrastructure to be
 * used.  It may *not* directly test the CPU itself.  Use the cpu_has() family
 * if you want true runtime testing of CPU features, like in hypervisor code
 * where you are supporting a possible guest feature where host support for it
 * is not relevant.
 */
#define cpu_feature_enabled(bit)        \
        (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))

/* cpu_has() applied to boot_cpu_data. */
#define boot_cpu_has(bit)        cpu_has(&boot_cpu_data, bit)

/* Set @bit in the x86_capability bitmap of @c using set_bit(). */
#define set_cpu_cap(c, bit)        set_bit(bit, (unsigned long *)((c)->x86_capability))

extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
void check_cpufeature_deps(struct cpuinfo_x86 *c);

/*
 * Force @bit on: set it in boot_cpu_data and record it in cpu_caps_set.
 * If the bit was not already set, WARN_ON() fires when alternatives_patched
 * is already true at that point.
 */
#define setup_force_cpu_cap(bit) do {                        \
                                                        \
        if (!boot_cpu_has(bit))                                \
                WARN_ON(alternatives_patched);                \
                                                        \
        set_cpu_cap(&boot_cpu_data, bit);                \
        set_bit(bit, (unsigned long *)cpu_caps_set);        \
} while (0)

/* Bug bits are forced through the very same path as feature bits. */
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)

/*
 * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
 * that this is only used on a fallback path and will sometimes cause
 * it to manifest the address of boot_cpu_data in a register, fouling
 * the mainline (post-initialization) code.
 */
static __always_inline bool _static_cpu_has(u16 bit)
{
        /*
         * The asm has no fall-through result: it leaves through one of the
         * two C labels t_yes / t_no.  The out-of-line code at local label 6
         * (placed in .altinstr_aux) tests bit (bit & 7) of byte (bit >> 3)
         * of boot_cpu_data.x86_capability and jumps accordingly.
         * NOTE(review): which of "jmp 6f", "" or "jmp t_no" is live is
         * decided by ALTERNATIVE_TERNARY(), defined outside this header.
         */
        asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
                ".pushsection .altinstr_aux,\"ax\"\n"
                "6:\n"
                " testb %[bitnum], %a[cap_byte]\n"
                " jnz %l[t_yes]\n"
                " jmp %l[t_no]\n"
                ".popsection\n"
                 : : [feature]  "i" (bit),
                     [bitnum]   "i" (1 << (bit & 7)),
                     [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3])
                 : : t_yes, t_no);
t_yes:
        return true;
t_no:
        return false;
}

/*
 * Use boot_cpu_has(bit) directly when the compiler can fold it to a
 * constant; otherwise fall back to the asm-goto based _static_cpu_has().
 */
#define static_cpu_has(bit)                                        \
(                                                                \
        __builtin_constant_p(boot_cpu_has(bit)) ?                \
                boot_cpu_has(bit) :                                \
                _static_cpu_has(bit)                                \
)

/* Bug bits are handled by the same helpers as feature bits. */
#define cpu_has_bug(c, bit)                cpu_has(c, (bit))
#define set_cpu_bug(c, bit)                set_cpu_cap(c, (bit))
#define clear_cpu_bug(c, bit)                clear_cpu_cap(c, (bit))

/* Variants operating on boot_cpu_data (static_cpu_has_bug via static_cpu_has()). */
#define static_cpu_has_bug(bit)                static_cpu_has((bit))
#define boot_cpu_has_bug(bit)                cpu_has_bug(&boot_cpu_data, (bit))
#define boot_cpu_set_bug(bit)                set_cpu_cap(&boot_cpu_data, (bit))

#define MAX_CPU_FEATURES                (NCAPINTS * 32)
#define cpu_have_feature                boot_cpu_has

#define CPU_FEATURE_TYPEFMT                "x86,ven%04Xfam%04Xmod%04X"
#define CPU_FEATURE_TYPEVAL                boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
                                        boot_cpu_data.x86_model

#endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */
#endif /* _ASM_X86_CPUFEATURE_H */































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_COUNTER_H
#define _LINUX_PERCPU_COUNTER_H
/*
 * A simple "approximate counter" for use in ext2 and ext3 superblocks.
 *
 * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
 */

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* percpu_counter batch for local add or sub */
#define PERCPU_COUNTER_LOCAL_BATCH        INT_MAX

#ifdef CONFIG_SMP

/* SMP flavour of the counter: a shared s64 plus per-CPU s32 parts. */
struct percpu_counter {
        raw_spinlock_t lock;
        /* shared value; this is all percpu_counter_read() looks at */
        s64 count;
#ifdef CONFIG_HOTPLUG_CPU
        struct list_head list;        /* All percpu_counters are on a list */
#endif
        /* per-CPU parts; NULL means "not initialized", see percpu_counter_initialized() */
        s32 __percpu *counters;
};

extern int percpu_counter_batch;

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                               gfp_t gfp, u32 nr_counters,
                               struct lock_class_key *key);

/*
 * Initialize @nr_counters counters starting at @fbc.  Each expansion of the
 * macro defines its own static lock_class_key and passes it on to
 * __percpu_counter_init_many(), whose return value is the macro's value.
 */
#define percpu_counter_init_many(fbc, value, gfp, nr_counters)                \
        ({                                                                \
                static struct lock_class_key __key;                        \
                                                                        \
                __percpu_counter_init_many(fbc, value, gfp, nr_counters,\
                                           &__key);                        \
        })


/* Single-counter form of percpu_counter_init_many(). */
#define percpu_counter_init(fbc, value, gfp)                                \
        percpu_counter_init_many(fbc, value, gfp, 1)

void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
/* Single-counter form of percpu_counter_destroy_many(). */
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
        percpu_counter_destroy_many(fbc, 1);
}

void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
                              s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
                                  s64 amount, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);

/**
 * percpu_counter_compare - compare a counter with @rhs using the default batch
 * @fbc: counter to compare
 * @rhs: value to compare against
 *
 * Return: the value of __percpu_counter_compare() called with
 * percpu_counter_batch as batch size.
 */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        const int batch = percpu_counter_batch;

        return __percpu_counter_compare(fbc, rhs, batch);
}

/* Add @amount to @fbc using the default batch size percpu_counter_batch. */
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}

/**
 * percpu_counter_limited_add - limited add using the default batch size
 * @fbc: counter to modify
 * @limit: limit forwarded to __percpu_counter_limited_add()
 * @amount: amount forwarded to __percpu_counter_limited_add()
 *
 * Return: the value of __percpu_counter_limited_add() called with
 * percpu_counter_batch as batch size.
 */
static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
        const int batch = percpu_counter_batch;

        return __percpu_counter_limited_add(fbc, limit, amount, batch);
}

/*
 * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
 * are accumulated in local per cpu counter and not in fbc->count until
 * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter
 * write efficient.
 * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be
 * used to add up the counts from each CPU to account for all the local
 * counts. So percpu_counter_add_local() and percpu_counter_sub_local()
 * should be used when a counter is updated frequently and read rarely.
 */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
        /* same as percpu_counter_add(), but with the batch set to PERCPU_COUNTER_LOCAL_BATCH */
        percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH);
}

/**
 * percpu_counter_sum_positive - __percpu_counter_sum() clamped at zero
 * @fbc: counter to sum
 *
 * Return: the sum, or 0 when the sum is negative.
 */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        const s64 sum = __percpu_counter_sum(fbc);

        if (sum < 0)
                return 0;

        return sum;
}

/**
 * percpu_counter_sum - plain wrapper around __percpu_counter_sum()
 * @fbc: counter to sum
 *
 * Return: the unmodified value of __percpu_counter_sum().
 */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        const s64 sum = __percpu_counter_sum(fbc);

        return sum;
}

/**
 * percpu_counter_read - read the shared part of a counter
 * @fbc: counter to read
 *
 * Only @fbc->count is looked at; the per-CPU parts are not summed.
 *
 * Return: the current value of @fbc->count.
 */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        const s64 value = fbc->count;

        return value;
}

/*
 * percpu_counter_read() may return a small negative number even for a
 * counter that should never be negative.  This variant reads the shared
 * count once and reports such a negative value as 0.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /* Prevent reloads of fbc->count */
        const s64 snapshot = READ_ONCE(fbc->count);

        return snapshot < 0 ? 0 : snapshot;
}

/**
 * percpu_counter_initialized - check whether the per-CPU parts exist
 * @fbc: counter to check
 *
 * Return: true when @fbc->counters is not NULL.
 */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        return !!fbc->counters;
}

#else /* !CONFIG_SMP */

/* !CONFIG_SMP flavour: the counter is just a single s64. */
struct percpu_counter {
        s64 count;
};

/**
 * percpu_counter_init_many - set an array of counters to an initial value
 * @fbc: first counter of the array
 * @amount: value stored in every counter
 * @gfp: unused in this !CONFIG_SMP variant
 * @nr_counters: number of counters in the array
 *
 * Return: always 0.
 */
static inline int percpu_counter_init_many(struct percpu_counter *fbc,
                                           s64 amount, gfp_t gfp,
                                           u32 nr_counters)
{
        struct percpu_counter *const end = fbc + nr_counters;
        struct percpu_counter *cur;

        for (cur = fbc; cur != end; cur++)
                cur->count = amount;

        return 0;
}

/**
 * percpu_counter_init - single-counter form of percpu_counter_init_many()
 * @fbc: counter to initialize
 * @amount: initial value
 * @gfp: passed through
 *
 * Return: the value of percpu_counter_init_many().
 */
static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
                                      gfp_t gfp)
{
        int rc;

        rc = percpu_counter_init_many(fbc, amount, gfp, 1);
        return rc;
}

/* Nothing to tear down in the !CONFIG_SMP variant: intentionally empty. */
static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
                                               u32 nr_counters)
{
}

/* Nothing to tear down in the !CONFIG_SMP variant: intentionally empty. */
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
}

/* Overwrite the counter value with @amount (plain store, no IRQ masking). */
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        fbc->count = amount;
}

/**
 * percpu_counter_compare - three-way compare of a counter with @rhs
 * @fbc: counter to compare
 * @rhs: value to compare against
 *
 * Return: 1 when the count is greater than @rhs, -1 when it is smaller and
 * 0 when both are equal.
 */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        const s64 cur = fbc->count;

        return (cur > rhs) - (cur < rhs);
}

/**
 * __percpu_counter_compare - percpu_counter_compare() with an ignored batch
 * @fbc: counter to compare
 * @rhs: value to compare against
 * @batch: unused in this !CONFIG_SMP variant
 *
 * Return: the value of percpu_counter_compare().
 */
static inline int
__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        int order;

        order = percpu_counter_compare(fbc, rhs);
        return order;
}

/**
 * percpu_counter_add - add @amount to the counter with local IRQs disabled
 * @fbc: counter to modify
 * @amount: value to add (may be negative)
 */
static inline void
percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        fbc->count = fbc->count + amount;
        local_irq_restore(irq_flags);
}

/**
 * percpu_counter_limited_add - add @amount unless that would cross @limit
 * @fbc: counter to modify
 * @limit: bound for the new value
 * @amount: value to add; 0 is accepted immediately without touching @fbc
 *
 * With local IRQs disabled, the new value is stored only if it stays at or
 * below @limit for a positive @amount, or at or above @limit for a negative
 * @amount.
 *
 * Return: true when the counter was updated or @amount is 0, else false.
 */
static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
        unsigned long irq_flags;
        s64 updated;
        bool ok;

        if (!amount)
                return true;

        local_irq_save(irq_flags);
        updated = fbc->count + amount;
        /* amount is non-zero here, so exactly one direction applies */
        ok = amount > 0 ? updated <= limit : updated >= limit;
        if (ok)
                fbc->count = updated;
        local_irq_restore(irq_flags);

        return ok;
}

/* non-SMP percpu_counter_add_local is the same as percpu_counter_add */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
        /* no per-CPU parts in this variant: forward to the plain add */
        percpu_counter_add(fbc, amount);
}

/* @batch is ignored in the !CONFIG_SMP variant; this is percpu_counter_add(). */
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        percpu_counter_add(fbc, amount);
}

/**
 * percpu_counter_read - read the counter value
 * @fbc: counter to read
 *
 * Return: the current value of @fbc->count.
 */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        const s64 value = fbc->count;

        return value;
}

/*
 * percpu_counter is intended to track positive numbers. In the UP case the
 * number should never be negative.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /* returned as is: no clamping is applied in this variant */
        const s64 value = fbc->count;

        return value;
}

/**
 * percpu_counter_sum_positive - same as percpu_counter_read_positive() here
 * @fbc: counter to read
 *
 * Return: the value of percpu_counter_read_positive().
 */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        const s64 value = percpu_counter_read_positive(fbc);

        return value;
}

/**
 * percpu_counter_sum - same as percpu_counter_read() in this variant
 * @fbc: counter to read
 *
 * Return: the value of percpu_counter_read().
 */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        const s64 value = percpu_counter_read(fbc);

        return value;
}

/* The !CONFIG_SMP counter has no separate allocation: always reports true. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        return true;
}

/* Intentionally empty in the !CONFIG_SMP variant. */
static inline void percpu_counter_sync(struct percpu_counter *fbc)
{
}
#endif        /* CONFIG_SMP */

/* Add 1 to @fbc through percpu_counter_add(). */
static inline void percpu_counter_inc(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, 1);
}

/* Add -1 to @fbc through percpu_counter_add(). */
static inline void percpu_counter_dec(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, -1);
}

/* Subtract @amount from @fbc by adding its negation via percpu_counter_add(). */
static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add(fbc, -amount);
}

/* Subtract @amount from @fbc by adding its negation via percpu_counter_add_local(). */
static inline void
percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add_local(fbc, -amount);
}

#endif /* _LINUX_PERCPU_COUNTER_H */























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NOTE:
 *
 * This header has combined a lot of mutually unrelated stuff.
 * The process of splitting its content is in progress while keeping
 * backward compatibility. That's why it's highly recommended NOT to
 * include this header inside another header file, especially under
 * generic or architectural include/ directory.
 */
#ifndef _LINUX_KERNEL_H
#define _LINUX_KERNEL_H

#include <linux/stdarg.h>
#include <linux/align.h>
#include <linux/array_size.h>
#include <linux/limits.h>
#include <linux/linkage.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/container_of.h>
#include <linux/bitops.h>
#include <linux/hex.h>
#include <linux/kstrtox.h>
#include <linux/log2.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/typecheck.h>
#include <linux/panic.h>
#include <linux/printk.h>
#include <linux/build_bug.h>
#include <linux/sprintf.h>
#include <linux/static_call_types.h>
#include <linux/instruction_pointer.h>
#include <linux/util_macros.h>
#include <linux/wordpart.h>

#include <asm/byteorder.h>

#include <uapi/linux/kernel.h>

#define STACK_MAGIC        0xdeadbeef

struct completion;
struct user;

/*
 * might_resched() - rescheduling hook used by might_sleep().
 *
 * The implementation is picked at build time:
 *  - CONFIG_PREEMPT_VOLUNTARY_BUILD: expands to a direct __cond_resched()
 *    call.
 *  - CONFIG_PREEMPT_DYNAMIC with CONFIG_HAVE_PREEMPT_DYNAMIC_CALL: an inline
 *    wrapper that goes through the might_resched static call, which is
 *    declared against __cond_resched().
 *  - CONFIG_PREEMPT_DYNAMIC with CONFIG_HAVE_PREEMPT_DYNAMIC_KEY: expands to
 *    a dynamic_might_resched() call.
 *  - Otherwise: expands to an empty statement.
 *
 * Only the two macro variants that expand to a function call have an int
 * value; the static-call wrapper returns void and the fallback is a do/while
 * statement, so callers cannot portably use a result.
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD

extern int __cond_resched(void);
# define might_resched() __cond_resched()

#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)

extern int __cond_resched(void);

DECLARE_STATIC_CALL(might_resched, __cond_resched);

/* The value produced by the static call is discarded here. */
static __always_inline void might_resched(void)
{
        static_call_mod(might_resched)();
}

#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)

extern int dynamic_might_resched(void);
# define might_resched() dynamic_might_resched()

#else

/* No voluntary or dynamic preemption configured: nothing to do. */
# define might_resched() do { } while (0)

#endif /* CONFIG_PREEMPT_* */

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/*
 * Out-of-line checkers behind the annotations in this branch. The macros
 * below hand __FILE__ and __LINE__ to __might_sleep(), __cant_sleep() and
 * __cant_migrate(), so @file/@line name the annotated call site.
 */
extern void __might_resched(const char *file, int line, unsigned int offsets);
extern void __might_sleep(const char *file, int line);
extern void __cant_sleep(const char *file, int line, int preempt_offset);
extern void __cant_migrate(const char *file, int line);

/**
 * might_sleep - annotation for functions that can sleep
 *
 * This macro will print a stack trace if it is executed in an atomic
 * context (spinlock, irq-handler, ...). Additional sections where blocking is
 * not allowed can be annotated with non_block_start() and non_block_end()
 * pairs.
 *
 * This is a useful debugging help to be able to catch problems early and not
 * be bitten later when the calling function happens to sleep when it is not
 * supposed to.
 *
 * After the check, might_resched() is invoked.
 */
# define might_sleep() \
        do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
/**
 * cant_sleep - annotation for functions that cannot sleep
 *
 * This macro will print a stack trace if it is executed with preemption enabled
 */
# define cant_sleep() \
        do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
/* Reset current->task_state_change to 0. */
# define sched_annotate_sleep()        (current->task_state_change = 0)

/**
 * cant_migrate - annotation for functions that cannot migrate
 *
 * Will print a stack trace if executed in code which is migratable.
 * The check is only made when CONFIG_SMP is enabled.
 */
# define cant_migrate()                                                        \
        do {                                                                \
                if (IS_ENABLED(CONFIG_SMP))                                \
                        __cant_migrate(__FILE__, __LINE__);                \
        } while (0)

/**
 * non_block_start - annotate the start of section where sleeping is prohibited
 *
 * This is on behalf of the oom reaper, specifically when it is calling the mmu
 * notifiers. The problem is that if the notifier were to block on, for example,
 * mutex_lock() and if the process which holds that mutex were to perform a
 * sleeping memory allocation, the oom reaper is now blocked on completion of
 * that memory allocation. Other blocking calls like wait_event() pose similar
 * issues.
 *
 * Increments current->non_block_count.
 */
# define non_block_start() (current->non_block_count++)
/**
 * non_block_end - annotate the end of section where sleeping is prohibited
 *
 * Closes a section opened by non_block_start(). Decrements
 * current->non_block_count and triggers WARN_ON() if it was already zero.
 */
# define non_block_end() WARN_ON(current->non_block_count-- == 0)
#else
/*
 * Without CONFIG_DEBUG_ATOMIC_SLEEP the checks compile away: might_sleep()
 * reduces to might_resched() and the remaining annotations are empty
 * statements.
 */
  static inline void __might_resched(const char *file, int line,
                                     unsigned int offsets) { }
static inline void __might_sleep(const char *file, int line) { }
# define might_sleep() do { might_resched(); } while (0)
# define cant_sleep() do { } while (0)
# define cant_migrate()                do { } while (0)
# define sched_annotate_sleep() do { } while (0)
# define non_block_start() do { } while (0)
# define non_block_end() do { } while (0)
#endif

/*
 * might_sleep_if - conditional form of might_sleep()
 * @cond: expression evaluated exactly once; might_sleep() runs only when it
 *        is true.
 *
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define might_sleep_if(cond)                                                \
do {                                                                        \
        if (cond)                                                        \
                might_sleep();                                                \
} while (0)

/*
 * might_fault() - annotation hook.
 *
 * It only does work on CONFIG_MMU kernels built with CONFIG_PROVE_LOCKING or
 * CONFIG_DEBUG_ATOMIC_SLEEP, where it forwards the call site
 * (__FILE__/__LINE__) to __might_fault(). In every other configuration it is
 * an empty inline function.
 */
#if defined(CONFIG_MMU) && \
        (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
#define might_fault() __might_fault(__FILE__, __LINE__)
void __might_fault(const char *file, int line);
#else
static inline void might_fault(void) { }
#endif

void do_exit(long error_code) __noreturn;

extern int core_kernel_text(unsigned long addr);
extern int __kernel_text_address(unsigned long addr);
extern int kernel_text_address(unsigned long addr);
extern int func_ptr_is_kernel_text(void *ptr);

extern void bust_spinlocks(int yes);

extern int root_mountflags;

extern bool early_boot_irqs_disabled;

/**
 * enum system_states - Values used for system_state.
 *
 * @SYSTEM_BOOTING:        %0, no init needed
 * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU
 * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running
 * @SYSTEM_RUNNING:        system is up and running
 * @SYSTEM_HALT:        system entered clean system halt state
 * @SYSTEM_POWER_OFF:        system entered shutdown/clean power off state
 * @SYSTEM_RESTART:        system entered emergency power off or normal restart
 * @SYSTEM_SUSPEND:        system entered suspend or hibernate state
 *
 * Note:
 * Ordering of the states must not be changed
 * as code checks for <, <=, >, >= STATE.
 */
enum system_states {
        /*
         * Values are spelled out because code compares states with
         * <, <=, >, >=; the numbering must stay exactly as it is.
         */
        SYSTEM_BOOTING          = 0,
        SYSTEM_SCHEDULING       = 1,
        SYSTEM_FREEING_INITMEM  = 2,
        SYSTEM_RUNNING          = 3,
        SYSTEM_HALT             = 4,
        SYSTEM_POWER_OFF        = 5,
        SYSTEM_RESTART          = 6,
        SYSTEM_SUSPEND          = 7,
};
extern enum system_states system_state;

/*
 * General tracing related utility functions - trace_printk(),
 * tracing_on/tracing_off and tracing_start()/tracing_stop
 *
 * Use tracing_on/tracing_off when you want to quickly turn on or off
 * tracing. It simply enables or disables the recording of the trace events.
 * This also corresponds to the user space /sys/kernel/tracing/tracing_on
 * file, which gives a means for the kernel and userspace to interact.
 * Place a tracing_off() in the kernel where you want tracing to end.
 * From user space, examine the trace, and then echo 1 > tracing_on
 * to continue tracing.
 *
 * tracing_stop/tracing_start has slightly more overhead. It is used
 * by things like suspend to ram where disabling the recording of the
 * trace is not enough, but tracing must actually stop because things
 * like calling smp_processor_id() may crash the system.
 *
 * Most likely, you want to use tracing_on/tracing_off.
 */

/* Mode selector taken by ftrace_dump(); numbering starts at 0. */
enum ftrace_dump_mode {
        DUMP_NONE       = 0,
        DUMP_ALL        = 1,
        DUMP_ORIG       = 2,
        DUMP_PARAM      = 3,
};

#ifdef CONFIG_TRACING
void tracing_on(void);
void tracing_off(void);
int tracing_is_on(void);
void tracing_snapshot(void);
void tracing_snapshot_alloc(void);

extern void tracing_start(void);
extern void tracing_stop(void);

/*
 * Empty variadic helper tagged __printf(1, 2); presumably the attribute makes
 * the compiler check @fmt against the variadic arguments. It does nothing at
 * run time.
 */
static inline __printf(1, 2)
void ____trace_printk_check_format(const char *fmt, ...)
{
}
/*
 * Reference ____trace_printk_check_format() with @fmt and @args under
 * "if (0)": the call is still compiled, but never executed.
 */
#define __trace_printk_check_format(fmt, args...)                        \
do {                                                                        \
        if (0)                                                                \
                ____trace_printk_check_format(fmt, ##args);                \
} while (0)

/**
 * trace_printk - printf formatting in the ftrace buffer
 * @fmt: the printf format for printing
 *
 * Note: __trace_printk is an internal function for trace_printk() and
 *       the @ip is passed in via the trace_printk() macro.
 *
 * This function allows a kernel developer to debug fast path sections
 * that printk is not appropriate for. By scattering in various
 * printk like tracing in the code, a developer can quickly see
 * where problems are occurring.
 *
 * This is intended as a debugging tool for the developer only.
 * Please refrain from leaving trace_printks scattered around in
 * your code. (Extra memory is used for special buffers that are
 * allocated when trace_printk() is used.)
 *
 * A little optimization trick is done here. If there's only one
 * argument, there's no need to scan the string for printf formats.
 * The trace_puts() will suffice. But how can we take advantage of
 * using trace_puts() when trace_printk() has only one argument?
 * By stringifying the args and checking the size we can tell
 * whether or not there are args. __stringify((__VA_ARGS__)) will
 * turn into "()\0" with a size of 3 when there are no args, anything
 * else will be bigger. All we need to do is define a string to this,
 * and then take its size and compare to 3. If it's bigger, use
 * do_trace_printk() otherwise, optimize it to trace_puts(). Then just
 * let gcc optimize the rest.
 */

#define trace_printk(fmt, ...)                                \
do {                                                        \
        char _______STR[] = __stringify((__VA_ARGS__));        \
        if (sizeof(_______STR) > 3)                        \
                do_trace_printk(fmt, ##__VA_ARGS__);        \
        else                                                \
                trace_puts(fmt);                        \
} while (0)

/*
 * do_trace_printk - back end used by trace_printk() when arguments follow
 * the format.
 *
 * When @fmt is a compile-time constant, its pointer is stored in a static
 * variable placed in the "__trace_printk_fmt" section and the message is
 * emitted through __trace_bprintk(). Otherwise the static holds NULL and
 * __trace_printk() is called with @fmt directly. Either way the arguments
 * first go through __trace_printk_check_format(), and _THIS_IP_ is passed as
 * the @ip argument.
 */
#define do_trace_printk(fmt, args...)                                        \
do {                                                                        \
        static const char *trace_printk_fmt __used                        \
                __section("__trace_printk_fmt") =                        \
                __builtin_constant_p(fmt) ? fmt : NULL;                        \
                                                                        \
        __trace_printk_check_format(fmt, ##args);                        \
                                                                        \
        if (__builtin_constant_p(fmt))                                        \
                __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args);        \
        else                                                                \
                __trace_printk(_THIS_IP_, fmt, ##args);                        \
} while (0)

extern __printf(2, 3)
int __trace_bprintk(unsigned long ip, const char *fmt, ...);

extern __printf(2, 3)
int __trace_printk(unsigned long ip, const char *fmt, ...);

/**
 * trace_puts - write a string into the ftrace buffer
 * @str: the string to record
 *
 * Note: __trace_bputs is an internal function for trace_puts and
 *       the @ip is passed in via the trace_puts macro.
 *
 * This is similar to trace_printk() but is made for those really fast
 * paths that a developer wants the least amount of "Heisenbug" effects,
 * where the processing of the print format is still too much.
 *
 * This function allows a kernel developer to debug fast path sections
 * that printk is not appropriate for. By scattering in various
 * printk like tracing in the code, a developer can quickly see
 * where problems are occurring.
 *
 * This is intended as a debugging tool for the developer only.
 * Please refrain from leaving trace_puts scattered around in
 * your code. (Extra memory is used for special buffers that are
 * allocated when trace_puts() is used.)
 *
 * Returns: 0 if nothing was written, positive # if string was.
 *  (1 when __trace_bputs is used, strlen(str) when __trace_puts is used)
 */

#define trace_puts(str) ({                                                \
        static const char *trace_printk_fmt __used                        \
                __section("__trace_printk_fmt") =                        \
                __builtin_constant_p(str) ? str : NULL;                        \
                                                                        \
        if (__builtin_constant_p(str))                                        \
                __trace_bputs(_THIS_IP_, trace_printk_fmt);                \
        else                                                                \
                __trace_puts(_THIS_IP_, str, strlen(str));                \
})
extern int __trace_bputs(unsigned long ip, const char *str);
extern int __trace_puts(unsigned long ip, const char *str, int size);

extern void trace_dump_stack(int skip);

/*
 * The double __builtin_constant_p is because gcc will give us an error
 * if we try to allocate the static variable to fmt if it is not a
 * constant. Even with the outer if statement.
 */
#define ftrace_vprintk(fmt, vargs)                                        \
do {                                                                        \
        if (__builtin_constant_p(fmt)) {                                \
                static const char *trace_printk_fmt __used                \
                  __section("__trace_printk_fmt") =                        \
                        __builtin_constant_p(fmt) ? fmt : NULL;                \
                                                                        \
                __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs);        \
        } else                                                                \
                __ftrace_vprintk(_THIS_IP_, fmt, vargs);                \
} while (0)

extern __printf(2, 0) int
__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);

extern __printf(2, 0) int
__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);

extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
#else
/*
 * !CONFIG_TRACING: the tracing API collapses to empty inline stubs so that
 * callers still compile. The value-returning stubs (tracing_is_on(),
 * trace_printk(), ftrace_vprintk()) always return 0.
 */
static inline void tracing_start(void) { }
static inline void tracing_stop(void) { }
static inline void trace_dump_stack(int skip) { }

static inline void tracing_on(void) { }
static inline void tracing_off(void) { }
static inline int tracing_is_on(void) { return 0; }
static inline void tracing_snapshot(void) { }
static inline void tracing_snapshot_alloc(void) { }

/* Stub keeps the __printf(1, 2) annotation; it ignores its arguments. */
static inline __printf(1, 2)
int trace_printk(const char *fmt, ...)
{
        return 0;
}
/* va_list flavour of the stub above; @fmt and @ap are ignored. */
static __printf(1, 0) inline int
ftrace_vprintk(const char *fmt, va_list ap)
{
        return 0;
}
static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
#endif /* CONFIG_TRACING */

/* Rebuild everything on CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_DYNAMIC_FTRACE
# define REBUILD_DUE_TO_DYNAMIC_FTRACE
#endif

/*
 * Permissions on a sysfs file: you didn't miss the 0 prefix did you?
 *
 * VERIFY_OCTAL_PERMISSIONS(perms) wraps @perms in a chain of
 * BUILD_BUG_ON_ZERO() checks added to the value itself. The conditions that
 * are flagged are:
 *  - @perms is negative or larger than 0777;
 *  - the read bit (4) is set for group but not for user, or for other but
 *    not for group;
 *  - the write bit (2) is set for group but not for user;
 *  - the write bit (2) is set for other at all.
 *
 * NOTE(review): BUILD_BUG_ON_ZERO() is defined elsewhere; presumably each
 * term breaks the build when its condition is true and is 0 otherwise, so the
 * whole expression evaluates to @perms - confirm against <linux/build_bug.h>.
 * @perms is expanded many times, so it should be a side-effect-free constant.
 */
#define VERIFY_OCTAL_PERMISSIONS(perms)                                                \
        (BUILD_BUG_ON_ZERO((perms) < 0) +                                        \
         BUILD_BUG_ON_ZERO((perms) > 0777) +                                        \
         /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */                \
         BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) +        \
         BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) +                \
         /* USER_WRITABLE >= GROUP_WRITABLE */                                        \
         BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) +        \
         /* OTHER_WRITABLE?  Generally considered a bad idea. */                \
         BUILD_BUG_ON_ZERO((perms) & 2) +                                        \
         (perms))
#endif






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_TYPES_H
#define _LINUX_MM_TYPES_H

#include <linux/mm_types_task.h>

#include <linux/auxvec.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
#include <linux/rcupdate.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
#include <linux/types.h>
#include <linux/bitmap.h>

#include <asm/mmu.h>

#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))


struct address_space;
struct futex_private_hash;
struct mem_cgroup;

/*
 * memdesc_flags_t - a single unsigned long wrapped in a struct.
 *
 * Used as the type of the flags word at the start of struct page; the raw
 * value lives in ->f.
 */
typedef struct {
        unsigned long f;
} memdesc_flags_t;

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 *
 * If you allocate the page using alloc_pages(), you can use some of the
 * space in struct page for your own purposes.  The five words in the main
 * union are available, except for bit 0 of the first word which must be
 * kept clear.  Many users use this word to store a pointer to an object
 * which is guaranteed to be aligned.  If you use the same storage as
 * page->mapping, you must restore it to NULL before freeing the page.
 *
 * The mapcount field must not be used for own purposes.
 *
 * If you want to use the refcount field, it must be used in such a way
 * that other CPUs temporarily incrementing and then decrementing the
 * refcount does not cause problems.  On receiving the page from
 * alloc_pages(), the refcount will be positive.
 *
 * If you allocate pages of order > 0, you can use some of the fields
 * in each subpage, but you may need to restore some of their values
 * afterwards.
 *
 * SLUB uses cmpxchg_double() to atomically update its freelist and counters.
 * That requires that freelist & counters in struct slab be adjacent and
 * double-word aligned. Because struct slab currently just reinterprets the
 * bits of struct page, we align all struct pages to double-word boundaries,
 * and ensure that 'freelist' is aligned within struct slab.
 */
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment        __aligned(2 * sizeof(unsigned long))
#else
#define _struct_page_alignment        __aligned(sizeof(unsigned long))
#endif

/*
 * Layout summary: a flags word, a five-word union whose interpretation
 * depends on what the page is currently used for, a 4-byte
 * page_type/_mapcount union, the reference count, and then a number of
 * configuration-dependent trailing fields.
 */
struct page {
        memdesc_flags_t flags;                /* Atomic flags, some possibly
                                         * updated asynchronously */
        /*
         * Five words (20/40 bytes) are available in this union.
         * WARNING: bit 0 of the first word is used for PageTail(). That
         * means the other users of this union MUST NOT use the bit to
         * avoid collision and false-positive PageTail().
         */
        union {
                struct {        /* Page cache and anonymous pages */
                        /**
                         * @lru: Pageout list, eg. active_list protected by
                         * lruvec->lru_lock.  Sometimes used as a generic list
                         * by the page owner.
                         */
                        union {
                                struct list_head lru;

                                /* Or, free page */
                                struct list_head buddy_list;
                                struct list_head pcp_list;
                                struct llist_node pcp_llist;
                        };
                        struct address_space *mapping;
                        union {
                                pgoff_t __folio_index;                /* Our offset within mapping. */
                                unsigned long share;        /* share count for fsdax */
                        };
                        /**
                         * @private: Mapping-private opaque data.
                         * Usually used for buffer_heads if PagePrivate.
                         * Used for swp_entry_t if swapcache flag set.
                         * Indicates order in the buddy system if PageBuddy
                         * or on pcp_llist.
                         */
                        unsigned long private;
                };
                struct {        /* page_pool used by netstack */
                        /**
                         * @pp_magic: magic value to avoid recycling non
                         * page_pool allocated pages.
                         */
                        unsigned long pp_magic;
                        struct page_pool *pp;
                        /*
                         * NOTE(review): by position this pad appears to sit
                         * over ->mapping of the first struct - confirm.
                         */
                        unsigned long _pp_mapping_pad;
                        unsigned long dma_addr;
                        atomic_long_t pp_ref_count;
                };
                struct {        /* Tail pages of compound page */
                        unsigned long compound_head;        /* Bit zero is set */
                };
                struct {        /* ZONE_DEVICE pages */
                        /*
                         * The first word is used for compound_head or folio
                         * pgmap
                         */
                        void *_unused_pgmap_compound_head;
                        void *zone_device_data;
                        /*
                         * ZONE_DEVICE private pages are counted as being
                         * mapped so the next 3 words hold the mapping, index,
                         * and private fields from the source anonymous or
                         * page cache page while the page is migrated to device
                         * private memory.
                         * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
                         * use the mapping, index, and private fields when
                         * pmem backed DAX files are mapped.
                         */
                };

                /** @rcu_head: You can use this to free a page by RCU. */
                struct rcu_head rcu_head;
        };

        union {                /* This union is 4 bytes in size. */
                /*
                 * For head pages of typed folios, the value stored here
                 * allows for determining what this page is used for. The
                 * tail pages of typed folios will not store a type
                 * (page_type == _mapcount == -1).
                 *
                 * See page-flags.h for a list of page types which are currently
                 * stored here.
                 *
                 * Owners of typed folios may reuse the lower 16 bit of the
                 * head page page_type field after setting the page type,
                 * but must reset these 16 bit to -1 before clearing the
                 * page type.
                 */
                unsigned int page_type;

                /*
                 * For pages that are part of non-typed folios for which mappings
                 * are tracked via the RMAP, encodes the number of times this page
                 * is directly referenced by a page table.
                 *
                 * Note that the mapcount is always initialized to -1, so that
                 * transitions both from it and to it can be tracked, using
                 * atomic_inc_and_test() and atomic_add_negative(-1).
                 */
                atomic_t _mapcount;
        };

        /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
        atomic_t _refcount;

        /*
         * One unsigned long is present when either CONFIG_MEMCG or
         * CONFIG_SLAB_OBJ_EXT is set: memcg_data with CONFIG_MEMCG, otherwise
         * a placeholder named as unused.
         */
#ifdef CONFIG_MEMCG
        unsigned long memcg_data;
#elif defined(CONFIG_SLAB_OBJ_EXT)
        unsigned long _unused_slab_obj_exts;
#endif

        /*
         * On machines where all RAM is mapped into kernel address space,
         * we can simply calculate the virtual address. On machines with
         * highmem some memory is mapped into kernel virtual memory
         * dynamically, so we need a place to store that address.
         * Note that this field could be 16 bits on x86 ... ;)
         *
         * Architectures with slow multiplication can define
         * WANT_PAGE_VIRTUAL in asm/page.h
         */
#if defined(WANT_PAGE_VIRTUAL)
        void *virtual;                        /* Kernel virtual address (NULL if
                                           not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

        /* Only present when LAST_CPUPID_NOT_IN_PAGE_FLAGS is defined. */
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
        int _last_cpupid;
#endif

#ifdef CONFIG_KMSAN
        /*
         * KMSAN metadata for this page:
         *  - shadow page: every bit indicates whether the corresponding
         *    bit of the original page is initialized (0) or not (1);
         *  - origin page: every 4 bytes contain an id of the stack trace
         *    where the uninitialized value was created.
         */
        struct page *kmsan_shadow;
        struct page *kmsan_origin;
#endif
} _struct_page_alignment;

/*
 * struct encoded_page - a nonexistent type marking this pointer
 *
 * An 'encoded_page' pointer is a pointer to a regular 'struct page', but
 * with the low bits of the pointer indicating extra context-dependent
 * information. Only used in mmu_gather handling, and this acts as a type
 * system check on that use.
 *
 * We only really have two guaranteed bits in general, although you could
 * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 * for more.
 *
 * Use the supplied helper functions to encode/decode the pointer and bits.
 */
struct encoded_page;

/*
 * Mask (not a count) covering the low pointer bits that carry flags. The
 * encode/decode helpers use it to split an encoded pointer into its flag
 * part and its 'struct page' address part.
 */
#define ENCODED_PAGE_BITS                        3ul

/* Perform rmap removal after we have flushed the TLB. */
#define ENCODED_PAGE_BIT_DELAY_RMAP                1ul

/*
 * The next item in an encoded_page array is the "nr_pages" argument, specifying
 * the number of consecutive pages starting from this page, that all belong to
 * the same folio. For example, "nr_pages" corresponds to the number of folio
 * references that must be dropped. If this bit is not set, "nr_pages" is
 * implicitly 1.
 */
#define ENCODED_PAGE_BIT_NR_PAGES_NEXT                2ul

/*
 * Build an encoded_page pointer by OR-ing @flags into the value of @page.
 * BUILD_BUG_ON() rejects, at build time, any @flags value larger than
 * ENCODED_PAGE_BITS.
 */
static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags)
{
        unsigned long encoded = (unsigned long)page;

        BUILD_BUG_ON(flags > ENCODED_PAGE_BITS);
        encoded |= flags;
        return (struct encoded_page *)encoded;
}

/* Return the flag bits kept in the low bits of an encoded_page pointer. */
static inline unsigned long encoded_page_flags(struct encoded_page *page)
{
        const unsigned long raw = (unsigned long)page;

        return raw & ENCODED_PAGE_BITS;
}

/* Return the 'struct page' pointer with the flag bits masked off. */
static inline struct page *encoded_page_ptr(struct encoded_page *page)
{
        const unsigned long raw = (unsigned long)page;

        return (struct page *)(raw & ~ENCODED_PAGE_BITS);
}

/*
 * Store a page count in an encoded_page slot. The count is shifted left by
 * two so the low two bits stay clear; a warning is raised (once) if @nr
 * does not survive the shift round trip, i.e. its top two bits are in use.
 */
static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr)
{
        const unsigned long shifted = nr << 2;

        VM_WARN_ON_ONCE(shifted >> 2 != nr);
        return (struct encoded_page *)shifted;
}

/* Recover the page count stored by encode_nr_pages() (undo the shift by two). */
static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page)
{
        const unsigned long raw = (unsigned long)page;

        return raw >> 2;
}

/*
 * A swap entry has to fit into an "unsigned long", as the entry is hidden
 * in the "index" field of the swapper address space.
 *
 * The value is wrapped in a struct so that it is a distinct type rather
 * than a bare integer.
 */
typedef struct {
        unsigned long val;
} swp_entry_t;

#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT)
/*
 * We have some extra room after the refcount in tail pages. When this
 * macro is defined, struct folio keeps _nr_pages right after _refcount_1.
 */
#define NR_PAGES_IN_LARGE_FOLIO
#endif

/*
 * mm_id_t is the element type of folio->_mm_id[] and mm_id_mapcount_t the
 * element type of folio->_mm_id_mapcount[].
 *
 * On 32bit, we can cut the required metadata in half, because:
 * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have,
 *     so we can limit MM IDs to 15 bit (32767).
 * (b) We don't expect folios where even a single complete PTE mapping by
 *     one MM would exceed 15 bits (order-15).
 */
#ifdef CONFIG_64BIT
typedef int mm_id_mapcount_t;
#define MM_ID_MAPCOUNT_MAX                INT_MAX
typedef unsigned int mm_id_t;
#else /* !CONFIG_64BIT */
typedef short mm_id_mapcount_t;
#define MM_ID_MAPCOUNT_MAX                SHRT_MAX
typedef unsigned short mm_id_t;
#endif /* CONFIG_64BIT */

/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */
#define MM_ID_DUMMY                        0
#define MM_ID_MIN                        (MM_ID_DUMMY + 1)

/*
 * We leave the highest bit of each MM id unused, so we can store a flag
 * in the highest bit of each folio->_mm_id[].
 *
 * MM_ID_BITS is therefore one less than the width of mm_id_t, and
 * MM_ID_MASK / MM_ID_MAX cover exactly those usable ID bits.
 */
#define MM_ID_BITS                        ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1)
#define MM_ID_MASK                        ((1U << MM_ID_BITS) - 1)
#define MM_ID_MAX                        MM_ID_MASK

/*
 * In order to use bit_spin_lock(), which requires an unsigned long, we
 * operate on folio->_mm_ids when working on flags.
 *
 * The bit numbers below index into that unsigned long: MM_ID_BITS is the
 * top bit of its lower mm_id_t-sized half, and 2 * MM_ID_BITS + 1 is the
 * top bit of its upper half.
 */
#define FOLIO_MM_IDS_LOCK_BITNUM        MM_ID_BITS
#define FOLIO_MM_IDS_LOCK_BIT                BIT(FOLIO_MM_IDS_LOCK_BITNUM)
#define FOLIO_MM_IDS_SHARED_BITNUM        (2 * MM_ID_BITS + 1)
#define FOLIO_MM_IDS_SHARED_BIT                BIT(FOLIO_MM_IDS_SHARED_BITNUM)

/**
 * struct folio - Represents a contiguous set of bytes.
 * @flags: Identical to the page flags.
 * @lru: Least Recently Used list; tracks how recently this folio was used.
 * @mlock_count: Number of times this folio has been pinned by mlock().
 * @mapping: The file this folio belongs to, or refers to the anon_vma for
 *    anonymous memory.
 * @index: Offset within the file, in units of pages.  For anonymous memory,
 *    this is the index from the beginning of the mmap.
 * @share: number of DAX mappings that reference this folio. See
 *    dax_associate_entry.
 * @private: Filesystem per-folio data (see folio_attach_private()).
 * @swap: Used for swp_entry_t if folio_test_swapcache().
 * @_mapcount: Do not access this member directly.  Use folio_mapcount() to
 *    find out how many times this folio is mapped by userspace.
 * @_refcount: Do not access this member directly.  Use folio_ref_count()
 *    to find how many references there are to this folio.
 * @memcg_data: Memory Control Group data.
 * @pgmap: Metadata for ZONE_DEVICE mappings
 * @virtual: Virtual address in the kernel direct map.
 * @_last_cpupid: IDs of last CPU and last process that accessed the folio.
 * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
 * @_large_mapcount: Do not use directly, call folio_mapcount().
 * @_nr_pages_mapped: Do not use outside of rmap and debug code.
 * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
 * @_nr_pages: Do not use directly, call folio_nr_pages().
 * @_mm_id: Do not use outside of rmap code.
 * @_mm_ids: Do not use outside of rmap code.
 * @_mm_id_mapcount: Do not use outside of rmap code.
 * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
 * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h.
 * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
 * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head().
 * @_deferred_list: Folios to be split under memory pressure.
 * @_unused_slab_obj_exts: Placeholder to match obj_exts in struct slab.
 *
 * A folio is a physically, virtually and logically contiguous set
 * of bytes.  It is a power-of-two in size, and it is aligned to that
 * same power-of-two.  It is at least as large as %PAGE_SIZE.  If it is
 * in the page cache, it is at a file offset which is a multiple of that
 * power-of-two.  It may be mapped into userspace at an address which is
 * at an arbitrary page offset, but its kernel virtual address is aligned
 * to its size.
 *
 * The structure is laid out as four consecutive overlays, each a union
 * with one struct page; the static assertions that follow the definition
 * check that the overlaid members line up with their struct page
 * counterparts.
 */
struct folio {
        /* private: don't document the anon union */
        union {
                struct {
        /* public: */
                        memdesc_flags_t flags;
                        union {
                                struct list_head lru;
        /* private: avoid cluttering the output */
                                /* For the Unevictable "LRU list" slot */
                                struct {
                                        /* Avoid compound_head */
                                        void *__filler;
        /* public: */
                                        unsigned int mlock_count;
        /* private: */
                                };
        /* public: */
                                struct dev_pagemap *pgmap;
                        };
                        struct address_space *mapping;
                        union {
                                pgoff_t index;
                                unsigned long share;
                        };
                        union {
                                void *private;
                                swp_entry_t swap;
                        };
                        atomic_t _mapcount;
                        atomic_t _refcount;
#ifdef CONFIG_MEMCG
                        unsigned long memcg_data;
#elif defined(CONFIG_SLAB_OBJ_EXT)
                        unsigned long _unused_slab_obj_exts;
#endif
#if defined(WANT_PAGE_VIRTUAL)
                        void *virtual;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
                        int _last_cpupid;
#endif
        /* private: the union with struct page is transitional */
                };
                struct page page;
        };
        /* Overlay of the second struct page (offset sizeof(struct page)). */
        union {
                struct {
                        unsigned long _flags_1;
                        unsigned long _head_1;
                        union {
                                struct {
        /* public: */
                                        atomic_t _large_mapcount;
                                        atomic_t _nr_pages_mapped;
#ifdef CONFIG_64BIT
                                        atomic_t _entire_mapcount;
                                        atomic_t _pincount;
#endif /* CONFIG_64BIT */
                                        mm_id_mapcount_t _mm_id_mapcount[2];
                                        union {
                                                mm_id_t _mm_id[2];
                                                unsigned long _mm_ids;
                                        };
        /* private: the union with struct page is transitional */
                                };
                                unsigned long _usable_1[4];
                        };
                        atomic_t _mapcount_1;
                        atomic_t _refcount_1;
        /* public: */
#ifdef NR_PAGES_IN_LARGE_FOLIO
                        unsigned int _nr_pages;
#endif /* NR_PAGES_IN_LARGE_FOLIO */
        /* private: the union with struct page is transitional */
                };
                struct page __page_1;
        };
        /* Overlay of the third struct page (offset 2 * sizeof(struct page)). */
        union {
                struct {
                        unsigned long _flags_2;
                        unsigned long _head_2;
        /* public: */
                        struct list_head _deferred_list;
#ifndef CONFIG_64BIT
                        /* On 32bit these two live here instead of in the previous overlay. */
                        atomic_t _entire_mapcount;
                        atomic_t _pincount;
#endif /* !CONFIG_64BIT */
        /* private: the union with struct page is transitional */
                };
                struct page __page_2;
        };
        /* Overlay of the fourth struct page (offset 3 * sizeof(struct page)). */
        union {
                struct {
                        unsigned long _flags_3;
                        unsigned long _head_3;
        /* public: */
                        void *_hugetlb_subpool;
                        void *_hugetlb_cgroup;
                        void *_hugetlb_cgroup_rsvd;
                        void *_hugetlb_hwpoison;
        /* private: the union with struct page is transitional */
                };
                struct page __page_3;
        };
};

/*
 * Compile-time layout checks: every struct folio member that aliases a
 * struct page member must sit at the same offset. FOLIO_MATCH() is
 * redefined for each of the four overlaid struct pages so that the
 * expected offset is shifted by 0, 1, 2 or 3 times sizeof(struct page).
 */

/* First struct page: members must share offsets with struct page itself. */
#define FOLIO_MATCH(pg, fl)                                                \
        static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl))
FOLIO_MATCH(flags, flags);
FOLIO_MATCH(lru, lru);
FOLIO_MATCH(mapping, mapping);
FOLIO_MATCH(compound_head, lru);
FOLIO_MATCH(__folio_index, index);
FOLIO_MATCH(private, private);
FOLIO_MATCH(_mapcount, _mapcount);
FOLIO_MATCH(_refcount, _refcount);
#ifdef CONFIG_MEMCG
FOLIO_MATCH(memcg_data, memcg_data);
#endif
#if defined(WANT_PAGE_VIRTUAL)
FOLIO_MATCH(virtual, virtual);
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
FOLIO_MATCH(_last_cpupid, _last_cpupid);
#endif
#undef FOLIO_MATCH
/* Second struct page. */
#define FOLIO_MATCH(pg, fl)                                                \
        static_assert(offsetof(struct folio, fl) ==                        \
                        offsetof(struct page, pg) + sizeof(struct page))
FOLIO_MATCH(flags, _flags_1);
FOLIO_MATCH(compound_head, _head_1);
FOLIO_MATCH(_mapcount, _mapcount_1);
FOLIO_MATCH(_refcount, _refcount_1);
#undef FOLIO_MATCH
/* Third struct page. */
#define FOLIO_MATCH(pg, fl)                                                \
        static_assert(offsetof(struct folio, fl) ==                        \
                        offsetof(struct page, pg) + 2 * sizeof(struct page))
FOLIO_MATCH(flags, _flags_2);
FOLIO_MATCH(compound_head, _head_2);
#undef FOLIO_MATCH
/* Fourth struct page. */
#define FOLIO_MATCH(pg, fl)                                                \
        static_assert(offsetof(struct folio, fl) ==                        \
                        offsetof(struct page, pg) + 3 * sizeof(struct page))
FOLIO_MATCH(flags, _flags_3);
FOLIO_MATCH(compound_head, _head_3);
#undef FOLIO_MATCH

/**
 * struct ptdesc -    Memory descriptor for page tables.
 * @pt_flags:         enum pt_flags plus zone/node/section.
 * @pt_rcu_head:      For freeing page table pages.
 * @pt_list:          List of used page tables. Used for s390 gmap shadow pages
 *                    (which are not linked into the user page tables) and x86
 *                    pgds.
 * @_pt_pad_1:        Padding that aliases with page's compound head.
 * @pmd_huge_pte:     Protected by ptdesc->ptl, used for THPs.
 * @__page_mapping:   Aliases with page->mapping. Unused for page tables.
 * @pt_index:         Used for s390 gmap.
 * @pt_mm:            Used for x86 pgds.
 * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
 * @pt_share_count:   Used for HugeTLB PMD page table share count. Only
 *                    present with CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING.
 * @_pt_pad_2:        Padding to ensure proper alignment.
 * @ptl:              Lock for the page table. A pointer to the lock when
 *                    ALLOC_SPLIT_PTLOCKS is set, the lock itself otherwise.
 * @__page_type:      Same as page->page_type. Unused for page tables.
 * @__page_refcount:  Same as page refcount.
 * @pt_memcg_data:    Memcg data. Tracked for page tables here.
 *
 * This struct overlays struct page for now. Do not modify without a good
 * understanding of the issues.
 */
struct ptdesc {
        memdesc_flags_t pt_flags;

        union {
                struct rcu_head pt_rcu_head;
                struct list_head pt_list;
                struct {
                        unsigned long _pt_pad_1;
                        pgtable_t pmd_huge_pte;
                };
        };
        unsigned long __page_mapping;

        union {
                pgoff_t pt_index;
                struct mm_struct *pt_mm;
                atomic_t pt_frag_refcount;
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
                atomic_t pt_share_count;
#endif
        };

        union {
                unsigned long _pt_pad_2;
#if ALLOC_SPLIT_PTLOCKS
                spinlock_t *ptl;
#else
                spinlock_t ptl;
#endif
        };
        unsigned int __page_type;
        atomic_t __page_refcount;
#ifdef CONFIG_MEMCG
        unsigned long pt_memcg_data;
#endif
};

/*
 * Compile-time layout checks: each struct ptdesc member must sit at the
 * same offset as the struct page member it aliases, and struct ptdesc must
 * not be larger than struct page.
 */
#define TABLE_MATCH(pg, pt)                                                \
        static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
TABLE_MATCH(flags, pt_flags);
TABLE_MATCH(compound_head, pt_list);
TABLE_MATCH(compound_head, _pt_pad_1);
TABLE_MATCH(mapping, __page_mapping);
TABLE_MATCH(__folio_index, pt_index);
TABLE_MATCH(rcu_head, pt_rcu_head);
TABLE_MATCH(page_type, __page_type);
TABLE_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
TABLE_MATCH(memcg_data, pt_memcg_data);
#endif
#undef TABLE_MATCH
static_assert(sizeof(struct ptdesc) <= sizeof(struct page));

/*
 * Casts between the descriptor views of the same memory. _Generic selects
 * on the argument's pointer type, so a const pointer yields a const result
 * and an argument of any other type fails to compile (there is no default
 * association).
 */

/* struct ptdesc pointer -> struct page pointer. */
#define ptdesc_page(pt)                        (_Generic((pt),                        \
        const struct ptdesc *:                (const struct page *)(pt),        \
        struct ptdesc *:                (struct page *)(pt)))

/* struct ptdesc pointer -> struct folio pointer. */
#define ptdesc_folio(pt)                (_Generic((pt),                        \
        const struct ptdesc *:                (const struct folio *)(pt),        \
        struct ptdesc *:                (struct folio *)(pt)))

/* struct page pointer -> struct ptdesc pointer. */
#define page_ptdesc(p)                        (_Generic((p),                        \
        const struct page *:                (const struct ptdesc *)(p),        \
        struct page *:                        (struct ptdesc *)(p)))

/*
 * Helpers for ptdesc->pt_share_count, the HugeTLB PMD page table share
 * count. The field only exists with CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING;
 * without it only the (empty) init helper is provided.
 */
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
/* Reset the share count of @ptdesc to zero. */
static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
{
        atomic_set(&ptdesc->pt_share_count, 0);
}

/* Atomically increment the share count of @ptdesc. */
static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
{
        atomic_inc(&ptdesc->pt_share_count);
}

/* Atomically decrement the share count of @ptdesc. */
static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
{
        atomic_dec(&ptdesc->pt_share_count);
}

/* Return the current share count of @ptdesc. */
static inline int ptdesc_pmd_pts_count(const struct ptdesc *ptdesc)
{
        return atomic_read(&ptdesc->pt_share_count);
}

/*
 * Return true if the share count of @ptdesc is non-zero.
 *
 * Takes a const pointer, like ptdesc_pmd_pts_count(), because the
 * descriptor is only read.
 */
static inline bool ptdesc_pmd_is_shared(const struct ptdesc *ptdesc)
{
        return !!ptdesc_pmd_pts_count(ptdesc);
}
#else
/* No share count to initialise without PMD page table sharing. */
static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
{
}
#endif

/*
 * Used for sizing the vmemmap region on some architectures.
 * Evaluates to order_base_2() of the size of struct page.
 */
#define STRUCT_PAGE_MAX_SHIFT        (order_base_2(sizeof(struct page)))

/*
 * Accessor for page->private.
 *
 * page_private can be used on tail pages.  However, PagePrivate is only
 * checked by the VM on the head page.  So page_private on the tail pages
 * should be used for data that's ancillary to the head page (e.g. attaching
 * buffer heads to tail pages after attaching buffer heads to the head page).
 */
#define page_private(page)                ((page)->private)

/*
 * Store @private in page->private. Counterpart of page_private(); this is
 * a plain assignment with no additional checks.
 */
static inline void set_page_private(struct page *page, unsigned long private)
{
        page->private = private;
}

/* Return the pointer currently stored in folio->private. */
static inline void *folio_get_private(const struct folio *folio)
{
        void *priv = folio->private;

        return priv;
}

/* Type of the flag word carried by VMAs and NOMMU regions (vm_flags). */
typedef unsigned long vm_flags_t;

/*
 * freeptr_t represents a SLUB freelist pointer, which might be encoded
 * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
 * It is wrapped in a struct so it cannot be mistaken for a plain pointer
 * or integer.
 */
typedef struct { unsigned long v; } freeptr_t;

/*
 * A region containing a mapping of a non-memory backed file under NOMMU
 * conditions.  These are held in a global tree and are pinned by the VMAs that
 * map parts of them.
 *
 * The region is initialised up to vm_end and allocated up to vm_top.
 */
struct vm_region {
        struct rb_node        vm_rb;                /* link in global region tree */
        vm_flags_t        vm_flags;        /* VMA vm_flags */
        unsigned long        vm_start;        /* start address of region */
        unsigned long        vm_end;                /* region initialised to here */
        unsigned long        vm_top;                /* region allocated to here */
        unsigned long        vm_pgoff;        /* the offset in vm_file corresponding to vm_start */
        struct file        *vm_file;        /* the backing file or NULL */

        int                vm_usage;        /* region usage count (access under nommu_region_sem) */
        bool                vm_icache_flushed : 1; /* true if the icache has been flushed for
                                                * this region */
};

/*
 * Per-VMA userfaultfd context handle, embedded in struct vm_area_struct.
 * With CONFIG_USERFAULTFD it wraps a pointer to the userfaultfd context;
 * without it the struct is empty. NULL_VM_UFFD_CTX is the "no context"
 * value in either configuration.
 */
#ifdef CONFIG_USERFAULTFD
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
struct vm_userfaultfd_ctx {
        struct userfaultfd_ctx *ctx;
};
#else /* CONFIG_USERFAULTFD */
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */

/*
 * Reference-counted (kref) name that can be attached to a VMA through
 * vm_area_struct.anon_name.
 */
struct anon_vma_name {
        struct kref kref;
        /* The name needs to be at the end because it is dynamically sized. */
        char name[];
};

#ifdef CONFIG_ANON_VMA_NAME
/*
 * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
 * either keep holding the lock while using the returned pointer or it should
 * raise anon_vma_name refcount before releasing the lock.
 */
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
struct anon_vma_name *anon_vma_name_alloc(const char *name);
/* Takes the kref embedded in struct anon_vma_name. */
void anon_vma_name_free(struct kref *kref);
#else /* CONFIG_ANON_VMA_NAME */
/* Without CONFIG_ANON_VMA_NAME no VMA has a name: always returns NULL. */
static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

/* Without CONFIG_ANON_VMA_NAME nothing is allocated: always returns NULL. */
static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
        return NULL;
}
#endif

/*
 * VMA_LOCK_OFFSET is the vm_refcnt bit referred to by the vm_lock_seq
 * locking rules in struct vm_area_struct. VMA_REF_LIMIT is the largest
 * value that stays below that bit.
 */
#define VMA_LOCK_OFFSET        0x40000000
#define VMA_REF_LIMIT        (VMA_LOCK_OFFSET - 1)

/*
 * Per-VMA NUMA balancing state, referenced from
 * vm_area_struct.numab_state.
 */
struct vma_numab_state {
        /*
         * Initialised as time in 'jiffies' after which VMA
         * should be scanned.  Delays first scan of new VMA by at
         * least sysctl_numa_balancing_scan_delay:
         */
        unsigned long next_scan;

        /*
         * Time in jiffies when pids_active[] is reset to
         * detect phase change behaviour:
         */
        unsigned long pids_active_reset;

        /*
         * Approximate tracking of PIDs that trapped a NUMA hinting
         * fault. May produce false positives due to hash collisions.
         *
         *   [0] Previous PID tracking
         *   [1] Current PID tracking
         *
         * Window moves after pids_active_reset has expired approximately
         * every VMA_PID_RESET_PERIOD jiffies:
         */
        unsigned long pids_active[2];

        /* MM scan sequence ID when scan first started after VMA creation */
        int start_scan_seq;

        /*
         * MM scan sequence ID when the VMA was last completely scanned.
         * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
         */
        int prev_scan_seq;
};

#ifdef __HAVE_PFNMAP_TRACKING
/*
 * Reference-counted (kref) record of a pfn and a size in bytes, referenced
 * from vm_area_struct.pfnmap_track_ctx.
 * NOTE(review): presumably @pfn is the first pfn of the tracked range -
 * confirm against the users of this struct.
 */
struct pfnmap_track_ctx {
        struct kref kref;
        unsigned long pfn;
        unsigned long size;        /* in bytes */
};
#endif

/*
 * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
 * manipulate mutable fields which will cause those fields to be updated in the
 * resultant VMA.
 *
 * Helper functions are not required for manipulating any field.
 */
struct vm_area_desc {
        /*
         * Immutable state.
         * NOTE(review): start and end are grouped here but, unlike mm and
         * file, are not const-qualified - confirm this is intended.
         */
        const struct mm_struct *const mm;
        struct file *const file; /* May vary from vm_file in stacked callers. */
        unsigned long start;
        unsigned long end;

        /* Mutable fields. Populated with initial state. */
        pgoff_t pgoff;
        struct file *vm_file;
        vm_flags_t vm_flags;
        pgprot_t page_prot;

        /* Write-only fields. */
        const struct vm_operations_struct *vm_ops;
        void *private_data;
};

/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 *
 * Only explicitly marked struct members may be accessed by RCU readers before
 * getting a stable reference.
 *
 * WARNING: when adding new members, please update vm_area_init_from() to copy
 * them during vm_area_struct content duplication.
 */
struct vm_area_struct {
        /* The first cache line has the info for VMA tree walking. */

        union {
                struct {
                        /* VMA covers [vm_start; vm_end) addresses within mm */
                        unsigned long vm_start;
                        unsigned long vm_end;
                };
                freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
        };

        /*
         * The address space we belong to.
         * Unstable RCU readers are allowed to read this.
         */
        struct mm_struct *vm_mm;
        pgprot_t vm_page_prot;          /* Access permissions of this VMA. */

        /*
         * Flags, see mm.h.
         * To modify use vm_flags_{init|reset|set|clear|mod} functions.
         * The const vm_flags view is for reading; __vm_flags is the
         * writable alias of the same storage.
         */
        union {
                const vm_flags_t vm_flags;
                vm_flags_t __private __vm_flags;
        };

#ifdef CONFIG_PER_VMA_LOCK
        /*
         * Can only be written (using WRITE_ONCE()) while holding both:
         *  - mmap_lock (in write mode)
         *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
         * Can be read reliably while holding one of:
         *  - mmap_lock (in read or write mode)
         *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
         * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
         * while holding nothing (except RCU to keep the VMA struct allocated).
         *
         * This sequence counter is explicitly allowed to overflow; sequence
         * counter reuse can only lead to occasional unnecessary use of the
         * slowpath.
         */
        unsigned int vm_lock_seq;
#endif
        /*
         * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
         * list, after a COW of one of the file pages.  A MAP_SHARED vma
         * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
         * or brk vma (with NULL file) can only be in an anon_vma list.
         */
        struct list_head anon_vma_chain; /* Serialized by mmap_lock &
                                          * page_table_lock */
        struct anon_vma *anon_vma;        /* Serialized by page_table_lock */

        /* Function pointers to deal with this struct. */
        const struct vm_operations_struct *vm_ops;

        /* Information about our backing store: */
        unsigned long vm_pgoff;                /* Offset (within vm_file) in PAGE_SIZE
                                           units */
        struct file * vm_file;                /* File we map to (can be NULL). */
        void * vm_private_data;                /* was vm_pte (shared mem) */

#ifdef CONFIG_SWAP
        atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
        struct vm_region *vm_region;        /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
        struct mempolicy *vm_policy;        /* NUMA policy for the VMA */
#endif
#ifdef CONFIG_NUMA_BALANCING
        struct vma_numab_state *numab_state;        /* NUMA Balancing state */
#endif
#ifdef CONFIG_PER_VMA_LOCK
        /* Unstable RCU readers are allowed to read this. */
        refcount_t vm_refcnt ____cacheline_aligned_in_smp;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map vmlock_dep_map;
#endif
#endif
        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree.
         */
        struct {
                struct rb_node rb;
                unsigned long rb_subtree_last;
        } shared;
#ifdef CONFIG_ANON_VMA_NAME
        /*
         * For private and shared anonymous mappings, a pointer to a null
         * terminated string containing the name given to the vma, or NULL if
         * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
         */
        struct anon_vma_name *anon_name;
#endif
        struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
#ifdef __HAVE_PFNMAP_TRACKING
        struct pfnmap_track_ctx *pfnmap_track_ctx;
#endif
} __randomize_layout;

/*
 * Accessor for the NUMA policy of a VMA. Evaluates to NULL without
 * CONFIG_NUMA, where the vm_policy member does not exist.
 */
#ifdef CONFIG_NUMA
#define vma_policy(vma) ((vma)->vm_policy)
#else
#define vma_policy(vma) NULL
#endif

#ifdef CONFIG_SCHED_MM_CID
/*
 * Per-CPU concurrency ID (cid) state for an mm; mm_struct.pcpu_cid points
 * to the per-CPU instances.
 * NOTE(review): the exact meaning of @time and @recent_cid is not visible
 * here - see the mm_cid code in the scheduler before relying on them.
 */
struct mm_cid {
        u64 time;
        int cid;
        int recent_cid;
};
#endif

/*
 * Opaque type representing current mm_struct flag state. Must be accessed via
 * mm_flags_xxx() helper functions.
 *
 * The state is a bitmap of NUM_MM_FLAG_BITS bits; the type is marked
 * __private to discourage direct access to the bitmap.
 */
#define NUM_MM_FLAG_BITS (64)
typedef struct {
        DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
} __private mm_flags_t;

struct kioctx_table;
struct iommu_mm_data;
struct mm_struct {
        struct {
                /*
                 * Fields which are often written to are placed in a separate
                 * cache line.
                 */
                struct {
                        /**
                         * @mm_count: The number of references to &struct
                         * mm_struct (@mm_users count as 1).
                         *
                         * Use mmgrab()/mmdrop() to modify. When this drops to
                         * 0, the &struct mm_struct is freed.
                         */
                        atomic_t mm_count;
                } ____cacheline_aligned_in_smp;

                struct maple_tree mm_mt;

                unsigned long mmap_base;        /* base of mmap area */
                unsigned long mmap_legacy_base;        /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
                /* Base addresses for compatible mmap() */
                unsigned long mmap_compat_base;
                unsigned long mmap_compat_legacy_base;
#endif
                unsigned long task_size;        /* size of task vm space */
                pgd_t * pgd;

#ifdef CONFIG_MEMBARRIER
                /**
                 * @membarrier_state: Flags controlling membarrier behavior.
                 *
                 * This field is close to @pgd to hopefully fit in the same
                 * cache-line, which needs to be touched by switch_mm().
                 */
                atomic_t membarrier_state;
#endif

                /**
                 * @mm_users: The number of users including userspace.
                 *
                 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
                 * drops to 0 (i.e. when the task exits and there are no other
                 * temporary reference holders), we also release a reference on
                 * @mm_count (which may then free the &struct mm_struct if
                 * @mm_count also drops to 0).
                 */
                atomic_t mm_users;

#ifdef CONFIG_SCHED_MM_CID
                /**
                 * @pcpu_cid: Per-cpu current cid.
                 *
                 * Keep track of the currently allocated mm_cid for each cpu.
                 * The per-cpu mm_cid values are serialized by their respective
                 * runqueue locks.
                 */
                struct mm_cid __percpu *pcpu_cid;
                /*
                 * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
                 *
                 * When the next mm_cid scan is due (in jiffies).
                 */
                unsigned long mm_cid_next_scan;
                /**
                 * @nr_cpus_allowed: Number of CPUs allowed for mm.
                 *
                 * Number of CPUs allowed in the union of all mm's
                 * threads allowed CPUs.
                 */
                unsigned int nr_cpus_allowed;
                /**
                 * @max_nr_cid: Maximum number of allowed concurrency
                 *              IDs allocated.
                 *
                 * Track the highest number of allowed concurrency IDs
                 * allocated for the mm.
                 */
                atomic_t max_nr_cid;
                /**
                 * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
                 *
                 * Provide mutual exclusion for mm cpus_allowed and
                 * mm nr_cpus_allowed updates.
                 */
                raw_spinlock_t cpus_allowed_lock;
#endif
#ifdef CONFIG_MMU
                atomic_long_t pgtables_bytes;        /* size of all page tables */
#endif
                int map_count;                        /* number of VMAs */

                spinlock_t page_table_lock; /* Protects page tables and some
                                             * counters
                                             */
                /*
                 * Typically the current mmap_lock's offset is 56 bytes from
                 * the last cacheline boundary, which is very optimal, as
                 * its two hot fields 'count' and 'owner' sit in 2 different
                 * cachelines, and when mmap_lock is highly contended, both
                 * of the 2 fields will be accessed frequently, current layout
                 * will help to reduce cache bouncing.
                 *
                 * So please be careful with adding new fields before
                 * mmap_lock, which can easily push the 2 fields into one
                 * cacheline.
                 */
                struct rw_semaphore mmap_lock;

                struct list_head mmlist; /* List of maybe swapped mm's.        These
                                          * are globally strung together off
                                          * init_mm.mmlist, and are protected
                                          * by mmlist_lock
                                          */
#ifdef CONFIG_PER_VMA_LOCK
                struct rcuwait vma_writer_wait;
                /*
                 * This field has lock-like semantics, meaning it is sometimes
                 * accessed with ACQUIRE/RELEASE semantics.
                 * Roughly speaking, incrementing the sequence number is
                 * equivalent to releasing locks on VMAs; reading the sequence
                 * number can be part of taking a read lock on a VMA.
                 * Incremented every time mmap_lock is write-locked/unlocked.
                 * Initialized to 0, therefore odd values indicate mmap_lock
                 * is write-locked and even values that it's released.
                 *
                 * Can be modified under write mmap_lock using RELEASE
                 * semantics.
                 * Can be read with no other protection when holding write
                 * mmap_lock.
                 * Can be read with ACQUIRE semantics if not holding write
                 * mmap_lock.
                 */
                seqcount_t mm_lock_seq;
#endif
#ifdef CONFIG_FUTEX_PRIVATE_HASH
                struct mutex                        futex_hash_lock;
                struct futex_private_hash        __rcu *futex_phash;
                struct futex_private_hash        *futex_phash_new;
                /* futex-ref */
                unsigned long                        futex_batches;
                struct rcu_head                        futex_rcu;
                atomic_long_t                        futex_atomic;
                unsigned int                        __percpu *futex_ref;
#endif

                unsigned long hiwater_rss; /* High-watermark of RSS usage */
                unsigned long hiwater_vm;  /* High-water virtual memory usage */

                unsigned long total_vm;           /* Total pages mapped */
                unsigned long locked_vm;   /* Pages that have PG_mlocked set */
                atomic64_t    pinned_vm;   /* Refcount permanently increased */
                unsigned long data_vm;           /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
                unsigned long exec_vm;           /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
                unsigned long stack_vm;           /* VM_STACK */
                vm_flags_t def_flags;

                /**
                 * @write_protect_seq: Locked when any thread is write
                 * protecting pages mapped by this mm to enforce a later COW,
                 * for instance during page table copying for fork().
                 */
                seqcount_t write_protect_seq;

                spinlock_t arg_lock; /* protect the below fields */

                unsigned long start_code, end_code, start_data, end_data;
                unsigned long start_brk, brk, start_stack;
                unsigned long arg_start, arg_end, env_start, env_end;

                unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS
                /* the ABI-related flags from the ELF header. Used for core dump */
                unsigned long saved_e_flags;
#endif

                struct percpu_counter rss_stat[NR_MM_COUNTERS];

                struct linux_binfmt *binfmt;

                /* Architecture-specific MM context */
                mm_context_t context;

                mm_flags_t flags; /* Must use mm_flags_* hlpers to access */

#ifdef CONFIG_AIO
                spinlock_t                        ioctx_lock;
                struct kioctx_table __rcu        *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
                /*
                 * "owner" points to a task that is regarded as the canonical
                 * user/owner of this mm. All of the following must be true in
                 * order for it to be changed:
                 *
                 * current == mm->owner
                 * current->mm != mm
                 * new_owner->mm == mm
                 * new_owner->alloc_lock is held
                 */
                struct task_struct __rcu *owner;
#endif
                struct user_namespace *user_ns;

                /* store ref to file /proc/<pid>/exe symlink points to */
                struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
                struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
                pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
                /*
                 * numa_next_scan is the next time that PTEs will be remapped
                 * PROT_NONE to trigger NUMA hinting faults; such faults gather
                 * statistics and migrate pages to new nodes if necessary.
                 */
                unsigned long numa_next_scan;

                /* Restart point for scanning and remapping PTEs. */
                unsigned long numa_scan_offset;

                /* numa_scan_seq prevents two threads remapping PTEs. */
                int numa_scan_seq;
#endif
                /*
                 * An operation with batched TLB flushing is going on. Anything
                 * that can move process memory needs to flush the TLB when
                 * moving a PROT_NONE mapped page.
                 */
                atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
                /* See flush_tlb_batched_pending() */
                atomic_t tlb_flush_batched;
#endif
                struct uprobes_state uprobes_state;
#ifdef CONFIG_PREEMPT_RT
                struct rcu_head delayed_drop;
#endif
#ifdef CONFIG_HUGETLB_PAGE
                atomic_long_t hugetlb_usage;
#endif
                struct work_struct async_put_work;

#ifdef CONFIG_IOMMU_MM_DATA
                struct iommu_mm_data *iommu_mm;
#endif
#ifdef CONFIG_KSM
                /*
                 * Represent how many pages of this process are involved in KSM
                 * merging (not including ksm_zero_pages).
                 */
                unsigned long ksm_merging_pages;
                /*
                 * Represent how many pages are checked for ksm merging
                 * including merged and not merged.
                 */
                unsigned long ksm_rmap_items;
                /*
                 * Represent how many empty pages are merged with kernel zero
                 * pages when enabling KSM use_zero_pages.
                 */
                atomic_long_t ksm_zero_pages;
#endif /* CONFIG_KSM */
#ifdef CONFIG_LRU_GEN_WALKS_MMU
                struct {
                        /* this mm_struct is on lru_gen_mm_list */
                        struct list_head list;
                        /*
                         * Set when switching to this mm_struct, as a hint of
                         * whether it has been used since the last time per-node
                         * page table walkers cleared the corresponding bits.
                         */
                        unsigned long bitmap;
#ifdef CONFIG_MEMCG
                        /* points to the memcg of "owner" above */
                        struct mem_cgroup *memcg;
#endif
                } lru_gen;
#endif /* CONFIG_LRU_GEN_WALKS_MMU */
#ifdef CONFIG_MM_ID
                mm_id_t mm_id;
#endif /* CONFIG_MM_ID */
        } __randomize_layout;

        /*
         * The mm_cpumask needs to be at the end of mm_struct, because it
         * is dynamically sized based on nr_cpu_ids.
         */
        unsigned long cpu_bitmap[];
};

/*
 * Overwrite the first system word (BITS_PER_LONG bits) of mm flags with
 * @value. The store is not atomic.
 */
static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value)
{
        unsigned long *flag_words = ACCESS_PRIVATE(&mm->flags, __mm_flags);

        /* Exactly one word is copied; any further words are left untouched. */
        bitmap_copy(flag_words, &value, BITS_PER_LONG);
}

/* Expose the bitmap behind mm flags through a pointer-to-const. */
static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm)
{
        const unsigned long *flag_words;

        flag_words = (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags);
        return flag_words;
}

/* Fetch the first system word of mm flags; the read is not atomic. */
static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm)
{
        return bitmap_read(__mm_flags_get_bitmap(mm), 0, BITS_PER_LONG);
}

/*
 * Touch the first system word of mm flags ONLY: @mask is applied to that
 * word and every flag in @bits is then set, all through set_mask_bits().
 */
static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm,
                unsigned long mask, unsigned long bits)
{
        unsigned long *first_word = ACCESS_PRIVATE(&mm->flags, __mm_flags);

        set_mask_bits(first_word, mask, bits);
}

/*
 * Combination of the MT_FLAGS_ALLOC_RANGE, MT_FLAGS_LOCK_EXTERN and
 * MT_FLAGS_USE_RCU flags.
 * NOTE(review): presumably the flag set mm->mm_mt is initialised with --
 * confirm at the initialisation site.
 */
#define MM_MT_FLAGS        (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
                         MT_FLAGS_USE_RCU)
/* Declared here only; the definition lives elsewhere. */
extern struct mm_struct init_mm;

/*
 * Clear the cpumask stored in the trailing cpu_bitmap[] of @mm. The address
 * is computed by hand because the dynamic array size confuses some compilers.
 */
static inline void mm_init_cpumask(struct mm_struct *mm)
{
        unsigned long mask_addr;

        mask_addr = (unsigned long)mm + offsetof(struct mm_struct, cpu_bitmap);
        cpumask_clear((struct cpumask *)mask_addr);
}

/* Accessor for the cpumask backed by struct mm_struct's cpu_bitmap[]. */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
        struct cpumask *mask = (struct cpumask *)&mm->cpu_bitmap;

        return mask;
}

#ifdef CONFIG_LRU_GEN

/* A list of mm_structs together with the spinlock that guards it. */
struct lru_gen_mm_list {
        /* mm_struct list for page table walkers */
        struct list_head fifo;
        /* protects the list above */
        spinlock_t lock;
};

#endif /* CONFIG_LRU_GEN */

#ifdef CONFIG_LRU_GEN_WALKS_MMU

/*
 * Out-of-line hooks, only declared here. Builds without
 * CONFIG_LRU_GEN_WALKS_MMU get empty inline stubs with the same names.
 */
void lru_gen_add_mm(struct mm_struct *mm);
void lru_gen_del_mm(struct mm_struct *mm);
void lru_gen_migrate_mm(struct mm_struct *mm);

/* Put the lru_gen state embedded in @mm into its initial, empty state. */
static inline void lru_gen_init_mm(struct mm_struct *mm)
{
#ifdef CONFIG_MEMCG
        mm->lru_gen.memcg = NULL;
#endif
        mm->lru_gen.bitmap = 0;
        INIT_LIST_HEAD(&mm->lru_gen.list);
}

/* Mark @mm as used by setting every bit of its lru_gen bitmap. */
static inline void lru_gen_use_mm(struct mm_struct *mm)
{
        /*
         * A set bitmap tells page reclaim that this mm_struct was used after
         * the bitmap was last cleared, so walking its page tables to clear
         * the accessed bit might be worthwhile.
         */
        WRITE_ONCE(mm->lru_gen.bitmap, ~0UL);
}

#else /* !CONFIG_LRU_GEN_WALKS_MMU */

/* Empty stand-ins used when CONFIG_LRU_GEN_WALKS_MMU is not enabled. */
static inline void lru_gen_add_mm(struct mm_struct *mm) { }
static inline void lru_gen_del_mm(struct mm_struct *mm) { }
static inline void lru_gen_migrate_mm(struct mm_struct *mm) { }
static inline void lru_gen_init_mm(struct mm_struct *mm) { }
static inline void lru_gen_use_mm(struct mm_struct *mm) { }

#endif /* CONFIG_LRU_GEN_WALKS_MMU */

/* Thin wrapper around the struct ma_state used to walk an mm's mm_mt tree. */
struct vma_iterator {
        struct ma_state mas;
};

/*
 * Declare a struct vma_iterator called @name whose embedded ma_state points
 * at @__mm's mm_mt tree, starts at index @__addr, has no node selected and
 * is in the ma_start state.
 */
#define VMA_ITERATOR(name, __mm, __addr)                                \
        struct vma_iterator name = {                                        \
                .mas = {                                                \
                        .tree = &(__mm)->mm_mt,                                \
                        .index = __addr,                                \
                        .node = NULL,                                        \
                        .status = ma_start,                                \
                },                                                        \
        }

/* Point @vmi's ma_state at @mm's mm_mt tree and @addr through mas_init(). */
static inline void vma_iter_init(struct vma_iterator *vmi,
                struct mm_struct *mm, unsigned long addr)
{
        struct ma_state *state = &vmi->mas;

        mas_init(state, &mm->mm_mt, addr);
}

#ifdef CONFIG_SCHED_MM_CID

/*
 * Encoding of a concurrency ID value: MM_CID_LAZY_PUT is a flag in bit 31,
 * and MM_CID_UNSET (all bits set) therefore carries that flag as well.
 */
enum mm_cid_state {
        MM_CID_UNSET = -1U,                /* Unset state has lazy_put flag set. */
        MM_CID_LAZY_PUT = (1U << 31),
};

/* True when @cid is the MM_CID_UNSET marker. */
static inline bool mm_cid_is_unset(int cid)
{
        return cid == MM_CID_UNSET;
}

/* True when @cid carries the lazy-put flag and is not the unset marker. */
static inline bool mm_cid_is_lazy_put(int cid)
{
        if (mm_cid_is_unset(cid))
                return false;

        return (cid & MM_CID_LAZY_PUT) != 0;
}

/* True when the lazy-put flag is clear, which also excludes MM_CID_UNSET. */
static inline bool mm_cid_is_valid(int cid)
{
        return (cid & MM_CID_LAZY_PUT) == 0;
}

/* Return @cid with the lazy-put flag added. */
static inline int mm_cid_set_lazy_put(int cid)
{
        return cid | MM_CID_LAZY_PUT;
}

/* Return @cid with the lazy-put flag removed. */
static inline int mm_cid_clear_lazy_put(int cid)
{
        return cid & ~MM_CID_LAZY_PUT;
}

/*
 * mm_cpus_allowed: Union of all mm's threads allowed CPUs.
 *
 * The mask is located one cpumask_size() past the start of cpu_bitmap[].
 */
static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
{
        unsigned long addr = (unsigned long)mm;

        /* Go to cpu_bitmap[], then step over the cpumask stored there. */
        addr += offsetof(struct mm_struct, cpu_bitmap) + cpumask_size();
        return (struct cpumask *)addr;
}

/*
 * Accessor for struct mm_struct's cidmask, which sits one cpumask_size()
 * behind the mask returned by mm_cpus_allowed().
 */
static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
{
        unsigned long allowed_addr = (unsigned long)mm_cpus_allowed(mm);

        return (struct cpumask *)(allowed_addr + cpumask_size());
}

/*
 * Reset the concurrency-ID state of @mm: the per-cpu entry of every possible
 * CPU becomes unset, and the allowed-CPU bookkeeping is seeded from task @p.
 */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct mm_cid *entry = per_cpu_ptr(mm->pcpu_cid, cpu);

                entry->cid = MM_CID_UNSET;
                entry->recent_cid = MM_CID_UNSET;
                entry->time = 0;
        }

        /* Start from the task's own allowance; no cid has been handed out. */
        mm->nr_cpus_allowed = p->nr_cpus_allowed;
        atomic_set(&mm->max_nr_cid, 0);
        raw_spin_lock_init(&mm->cpus_allowed_lock);
        cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
        cpumask_clear(mm_cidmask(mm));
}

/*
 * Allocate the per-cpu cid storage of @mm and initialise it from @p.
 * Returns 0 on success, or -ENOMEM when the per-cpu allocation fails.
 */
static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
{
        mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
        if (mm->pcpu_cid) {
                mm_init_cid(mm, p);
                return 0;
        }

        return -ENOMEM;
}
#define mm_alloc_cid(...)        alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))

static inline void mm_destroy_cid(struct mm_struct *mm)
{
        free_percpu(mm->pcpu_cid);
        mm->pcpu_cid = NULL;
}

/* Combined size of the two cpumasks used by mm_cpus_allowed() and mm_cidmask(). */
static inline unsigned int mm_cid_size(void)
{
        const unsigned int one_mask = cpumask_size();

        return 2 * one_mask;
}

/*
 * Fold @cpumask into the CPUs allowed for @mm and refresh
 * mm->nr_cpus_allowed with the weight of the resulting mask.
 *
 * @mm:      mm to update; a NULL @mm makes this a no-op.
 * @cpumask: CPUs to add to the mm's allowed set.
 *
 * Both updates happen with mm->cpus_allowed_lock held.
 */
static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
{
        struct cpumask *mm_allowed;

        if (!mm)
                return;

        /* Derive the mask address only once @mm is known to be non-NULL. */
        mm_allowed = mm_cpus_allowed(mm);

        /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
        raw_spin_lock(&mm->cpus_allowed_lock);
        cpumask_or(mm_allowed, mm_allowed, cpumask);
        WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
        raw_spin_unlock(&mm->cpus_allowed_lock);
}
#else /* CONFIG_SCHED_MM_CID */
/* CONFIG_SCHED_MM_CID is off: the mm_cid hooks do nothing and report zero. */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
}

static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p)
{
        return 0;
}

static inline void mm_destroy_cid(struct mm_struct *mm)
{
}

static inline unsigned int mm_cid_size(void) { return 0; }

static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
{
}
#endif /* CONFIG_SCHED_MM_CID */

/* Forward declaration: only pointers to struct mmu_gather are used below. */
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_finish_mmu(struct mmu_gather *tlb);

/* Forward declaration: used by pointer in the .fault callback prototype below. */
struct vm_fault;

/**
 * typedef vm_fault_t - Return type for page fault handlers.
 *
 * Page fault handlers return a bitmask of %VM_FAULT values; the individual
 * bits are listed in enum vm_fault_reason.
 */
typedef __bitwise unsigned int vm_fault_t;

/**
 * enum vm_fault_reason - Page fault handlers return a bitmask of
 * these values to tell the core VM what happened when handling the
 * fault. Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 *
 * @VM_FAULT_OOM:                Out Of Memory
 * @VM_FAULT_SIGBUS:                Bad access
 * @VM_FAULT_MAJOR:                Page read from storage
 * @VM_FAULT_HWPOISON:                Hit poisoned small page
 * @VM_FAULT_HWPOISON_LARGE:        Hit poisoned large page. Index encoded
 *                                in upper bits
 * @VM_FAULT_SIGSEGV:                segmentation fault
 * @VM_FAULT_NOPAGE:                ->fault installed the pte itself and did
 *                                not return a page
 * @VM_FAULT_LOCKED:                ->fault locked the returned page
 * @VM_FAULT_RETRY:                ->fault blocked, must retry
 * @VM_FAULT_FALLBACK:                huge page fault failed, fall back to small
 * @VM_FAULT_DONE_COW:                ->fault has fully handled COW
 * @VM_FAULT_NEEDDSYNC:                ->fault did not modify page tables and needs
 *                                fsync() to complete (for synchronous page faults
 *                                in DAX)
 * @VM_FAULT_COMPLETED:                ->fault completed, meanwhile mmap lock released
 * @VM_FAULT_HINDEX_MASK:        mask of bits 16-19, which hold the hstate index
 *                                of a hwpoisoned large page (see
 *                                VM_FAULT_SET_HINDEX() / VM_FAULT_GET_HINDEX())
 */
enum vm_fault_reason {
        VM_FAULT_OOM            = (__force vm_fault_t)0x000001,
        VM_FAULT_SIGBUS         = (__force vm_fault_t)0x000002,
        VM_FAULT_MAJOR          = (__force vm_fault_t)0x000004,
        VM_FAULT_HWPOISON       = (__force vm_fault_t)0x000010,
        VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
        VM_FAULT_SIGSEGV        = (__force vm_fault_t)0x000040,
        VM_FAULT_NOPAGE         = (__force vm_fault_t)0x000100,
        VM_FAULT_LOCKED         = (__force vm_fault_t)0x000200,
        VM_FAULT_RETRY          = (__force vm_fault_t)0x000400,
        VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
        VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
        VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
        VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
        VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
};

/*
 * Encode hstate index for a hwpoisoned large page.
 *
 * VM_FAULT_SET_HINDEX() shifts @x up to bit 16; VM_FAULT_GET_HINDEX()
 * shifts a fault result back down and keeps the low four bits, i.e. the
 * bits covered by VM_FAULT_HINDEX_MASK.
 */
#define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16))
#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf)

/*
 * Mask of the results grouped as errors: OOM, SIGBUS, SIGSEGV, both
 * HWPOISON variants and FALLBACK.
 */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS |        \
                        VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON |        \
                        VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)

/*
 * List of { value, "name" } initialiser pairs, one per single-bit
 * vm_fault_reason value; VM_FAULT_HINDEX_MASK has no entry.
 * NOTE(review): presumably consumed by tracing code to print fault results
 * symbolically -- confirm at the users.
 */
#define VM_FAULT_RESULT_TRACE \
        { VM_FAULT_OOM,                 "OOM" },        \
        { VM_FAULT_SIGBUS,              "SIGBUS" },        \
        { VM_FAULT_MAJOR,               "MAJOR" },        \
        { VM_FAULT_HWPOISON,            "HWPOISON" },        \
        { VM_FAULT_HWPOISON_LARGE,      "HWPOISON_LARGE" },        \
        { VM_FAULT_SIGSEGV,             "SIGSEGV" },        \
        { VM_FAULT_NOPAGE,              "NOPAGE" },        \
        { VM_FAULT_LOCKED,              "LOCKED" },        \
        { VM_FAULT_RETRY,               "RETRY" },        \
        { VM_FAULT_FALLBACK,            "FALLBACK" },        \
        { VM_FAULT_DONE_COW,            "DONE_COW" },        \
        { VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" },        \
        { VM_FAULT_COMPLETED,           "COMPLETED" }

/*
 * Description of a special mapping: its name, either the pages backing it
 * or a fault handler, plus mremap and close callbacks.
 */
struct vm_special_mapping {
        const char *name;        /* The name, e.g. "[vdso]". */

        /*
         * If .fault is not provided, this points to a
         * NULL-terminated array of pages that back the special mapping.
         *
         * This must not be NULL unless .fault is provided.
         */
        struct page **pages;

        /*
         * If non-NULL, then this is called to resolve page faults
         * on the special mapping.  If used, .pages is not checked.
         */
        vm_fault_t (*fault)(const struct vm_special_mapping *sm,
                                struct vm_area_struct *vma,
                                struct vm_fault *vmf);

        /*
         * NOTE(review): undocumented here; judging by name and signature this
         * is presumably called when the mapping is moved to @new_vma --
         * confirm against the callers.
         */
        int (*mremap)(const struct vm_special_mapping *sm,
                     struct vm_area_struct *new_vma);

        /*
         * NOTE(review): undocumented here; presumably called when @vma goes
         * away -- confirm against the callers.
         */
        void (*close)(const struct vm_special_mapping *sm,
                      struct vm_area_struct *vma);
};

/*
 * Reasons for a TLB flush. NR_TLB_FLUSH_REASONS, being the last enumerator
 * of an implicitly numbered enum, evaluates to the number of reasons.
 */
enum tlb_flush_reason {
        TLB_FLUSH_ON_TASK_SWITCH,
        TLB_REMOTE_SHOOTDOWN,
        TLB_LOCAL_SHOOTDOWN,
        TLB_LOCAL_MM_SHOOTDOWN,
        TLB_REMOTE_SEND_IPI,
        TLB_REMOTE_WRONG_CPU,
        NR_TLB_FLUSH_REASONS,
};

/**
 * enum fault_flag - Fault flag definitions.
 * @FAULT_FLAG_WRITE: Fault was a write fault.
 * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
 * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
 * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
 * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
 * @FAULT_FLAG_TRIED: The fault has been tried once.
 * @FAULT_FLAG_USER: The fault originated in userspace.
 * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
 * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
 * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
 * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a
 *                      COW mapping, making sure that an exclusive anon page is
 *                      mapped after the fault.
 * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
 *                        We should only access orig_pte if this flag set.
 * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
 *
 * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
 * whether we would allow page faults to retry by specifying these two
 * fault flags correctly.  Currently there can be three legal combinations:
 *
 * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
 *                              this is the first try
 *
 * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
 *                              we've already tried at least once
 *
 * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
 *
 * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
 * be used.  Note that page faults can be allowed to retry for multiple times,
 * in which case we'll have an initial fault with flags (a) then later on
 * continuous faults with flags (b).  We should always try to detect pending
 * signals before a retry to make sure the continuous page faults can still be
 * interrupted if necessary.
 *
 * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal.
 * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when
 * applied to mappings that are not COW mappings.
 */
enum fault_flag {
        FAULT_FLAG_WRITE          = 1 << 0,
        FAULT_FLAG_MKWRITE        = 1 << 1,
        FAULT_FLAG_ALLOW_RETRY    = 1 << 2,
        FAULT_FLAG_RETRY_NOWAIT   = 1 << 3,
        FAULT_FLAG_KILLABLE       = 1 << 4,
        FAULT_FLAG_TRIED          = 1 << 5,
        FAULT_FLAG_USER           = 1 << 6,
        FAULT_FLAG_REMOTE         = 1 << 7,
        FAULT_FLAG_INSTRUCTION    = 1 << 8,
        FAULT_FLAG_INTERRUPTIBLE  = 1 << 9,
        FAULT_FLAG_UNSHARE        = 1 << 10,
        FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
        FAULT_FLAG_VMA_LOCK       = 1 << 12,
};

/* Bitwise flag type; its flag values are not defined in this part of the file. */
typedef unsigned int __bitwise zap_flags_t;

/* Flags for clear_young_dirty_ptes(). */
typedef int __bitwise cydp_t;

/* Clear the access bit */
#define CYDP_CLEAR_YOUNG                ((__force cydp_t)BIT(0))

/* Clear the dirty bit */
#define CYDP_CLEAR_DIRTY                ((__force cydp_t)BIT(1))

/*
 * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
 * other. Here is what they mean, and how to use them:
 *
 *
 * FIXME: For pages which are part of a filesystem, mappings are subject to the
 * lifetime enforced by the filesystem and we need guarantees that longterm
 * users like RDMA and V4L2 only establish mappings which coordinate usage with
 * the filesystem.  Ideas for this coordination include revoking the longterm
 * pin, delaying writeback, bounce buffer page writeback, etc.  As FS DAX was
 * added after the problem with filesystems was found FS DAX VMAs are
 * specifically failed.  Filesystem pages are still subject to bugs and use of
 * FOLL_LONGTERM should be avoided on those pages.
 *
 * In the CMA case: long term pins in a CMA region would unnecessarily fragment
 * that region.  And so, CMA attempts to migrate the page before pinning, when
 * FOLL_LONGTERM is specified.
 *
 * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
 * but an additional pin counting system) will be invoked. This is intended for
 * anything that gets a page reference and then touches page data (for example,
 * Direct IO). This lets the filesystem know that some non-file-system entity is
 * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
 * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
 * a call to unpin_user_page().
 *
 * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
 * and separate refcounting mechanisms, however, and that means that each has
 * its own acquire and release mechanisms:
 *
 *     FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
 *
 *     FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
 *
 * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
 * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
 * calls applied to them, and that's perfectly OK. This is a constraint on the
 * callers, not on the pages.)
 *
 * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
 * directly by the caller. That's in order to help avoid mismatches when
 * releasing pages: get_user_pages*() pages must be released via put_page(),
 * while pin_user_pages*() pages must be released via unpin_user_page().
 *
 * Please see Documentation/core-api/pin_user_pages.rst for more information.
 */

enum {
        /* require the pte to be writable */
        FOLL_WRITE            = 1 << 0,
        /* do get_page on the page */
        FOLL_GET              = 1 << 1,
        /* report an error on a hole that would otherwise be zero */
        FOLL_DUMP             = 1 << 2,
        /* get_user_pages read/write without permission */
        FOLL_FORCE            = 1 << 3,
        /*
         * when a disk transfer is needed, start the IO and return without
         * waiting for it
         */
        FOLL_NOWAIT           = 1 << 4,
        /* never fault pages in */
        FOLL_NOFAULT          = 1 << 5,
        /* check whether the page is hwpoisoned */
        FOLL_HWPOISON         = 1 << 6,
        /* skip file mappings */
        FOLL_ANON             = 1 << 7,
        /*
         * FOLL_LONGTERM: the page will be held for an indefinite time
         * period, _often_ under userspace control. Contrast this with
         * iov_iter_get_pages(), whose usages are transient.
         */
        FOLL_LONGTERM         = 1 << 8,
        /* split a huge pmd before returning */
        FOLL_SPLIT_PMD        = 1 << 9,
        /* permit returning PCI P2PDMA pages */
        FOLL_PCI_P2PDMA       = 1 << 10,
        /* permit interruption by generic signals */
        FOLL_INTERRUPTIBLE    = 1 << 11,
        /*
         * Always honor (trigger) NUMA hinting faults.
         *
         * With FOLL_WRITE this happens implicitly, because a
         * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE
         * apply). get_user_pages_fast_only() always implicitly honors NUMA
         * hinting faults.
         */
        FOLL_HONOR_NUMA_FAULT = 1 << 12,

        /* See also internal only FOLL flags in mm/internal.h */
};

/* mm flags */

/*
 * The first two bits represent core dump modes for set-user-ID,
 * the modes are SUID_DUMP_* defined in linux/sched/coredump.h
 */
#define MMF_DUMPABLE_BITS 2
#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1)
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE        2
#define MMF_DUMP_ANON_SHARED        3
#define MMF_DUMP_MAPPED_PRIVATE        4
#define MMF_DUMP_MAPPED_SHARED        5
#define MMF_DUMP_ELF_HEADERS        6
#define MMF_DUMP_HUGETLB_PRIVATE 7
#define MMF_DUMP_HUGETLB_SHARED  8
#define MMF_DUMP_DAX_PRIVATE        9
#define MMF_DUMP_DAX_SHARED        10

/*
 * The filter occupies MMF_DUMP_FILTER_BITS (9) bits starting right after the
 * dumpable bits, i.e. bits 2 through 10, matching the MMF_DUMP_* bit numbers
 * above.
 */
#define MMF_DUMP_FILTER_SHIFT        MMF_DUMPABLE_BITS
#define MMF_DUMP_FILTER_BITS        9
#define MMF_DUMP_FILTER_MASK \
        ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
/*
 * Default filter: anonymous private and shared, hugetlb private, plus the
 * ELF headers bit when MMF_DUMP_MASK_DEFAULT_ELF is non-zero.
 */
#define MMF_DUMP_FILTER_DEFAULT \
        (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \
         BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)

/*
 * Defined after its use in MMF_DUMP_FILTER_DEFAULT, which is fine because
 * macros are only expanded where they are used.
 */
#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
# define MMF_DUMP_MASK_DEFAULT_ELF        BIT(MMF_DUMP_ELF_HEADERS)
#else
# define MMF_DUMP_MASK_DEFAULT_ELF        0
#endif
                                        /* leave room for more dump flags */
#define MMF_VM_MERGEABLE        16        /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE                17        /* set when mm is available for khugepaged */

#define MMF_HUGE_ZERO_FOLIO        18      /* mm has ever used the global huge zero folio */

#define MMF_HAS_UPROBES                19        /* has uprobes */
#define MMF_RECALC_UPROBES        20        /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_SKIP                21        /* mm is of no interest for the OOM killer */
#define MMF_UNSTABLE                22        /* mm is unstable for copy_from_user */
#define MMF_DISABLE_THP_EXCEPT_ADVISED        23        /* no THP except when advised (e.g., VM_HUGEPAGE) */
#define MMF_DISABLE_THP_COMPLETELY        24        /* no THP for all VMAs */
#define MMF_DISABLE_THP_MASK        (BIT(MMF_DISABLE_THP_COMPLETELY) | \
                                 BIT(MMF_DISABLE_THP_EXCEPT_ADVISED))
#define MMF_OOM_REAP_QUEUED        25        /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS        26        /* mm is shared between processes */
/*
 * MMF_HAS_PINNED: Whether this mm has pinned any pages.  This can be either
 * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
 * a counter on its own. We're aggressive on this bit for now: even if the
 * pinned pages were unpinned later on, we'll still keep this bit set for the
 * lifecycle of this mm, just for simplicity.
 */
#define MMF_HAS_PINNED                27        /* FOLL_PIN has run, never cleared */

/* NOTE(review): MDWE presumably means memory-deny-write-execute -- confirm. */
#define MMF_HAS_MDWE                28
#define MMF_HAS_MDWE_MASK        BIT(MMF_HAS_MDWE)

/*
 * When this bit is set, mmf_init_legacy_flags() clears both it and
 * MMF_HAS_MDWE from the flags it returns.
 */
#define MMF_HAS_MDWE_NO_INHERIT        29

/* NOTE(review): presumably a KSM mode related to MMF_VM_MERGEABLE -- confirm. */
#define MMF_VM_MERGE_ANY        30
#define MMF_VM_MERGE_ANY_MASK        BIT(MMF_VM_MERGE_ANY)

#define MMF_TOPDOWN                31        /* mm searches top down by default */
#define MMF_TOPDOWN_MASK        BIT(MMF_TOPDOWN)

/* The flags mmf_init_legacy_flags() keeps; it clears every other bit. */
#define MMF_INIT_LEGACY_MASK        (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
                                 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
                                 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)

/* Legacy flags must fit within 32 bits. */
static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX);

/*
 * Derive the initial legacy flag word from an existing one: only the bits
 * covered by MMF_INIT_LEGACY_MASK are propagated (e.g. on fork). The caller
 * is free to adjust the returned flags further.
 */
static inline unsigned long mmf_init_legacy_flags(unsigned long flags)
{
        const unsigned long mdwe = 1UL << MMF_HAS_MDWE;
        const unsigned long mdwe_no_inherit = 1UL << MMF_HAS_MDWE_NO_INHERIT;

        /* A no-inherit marker drops MDWE together with the marker itself. */
        if ((flags & mdwe_no_inherit) != 0)
                flags &= ~(mdwe | mdwe_no_inherit);

        return flags & MMF_INIT_LEGACY_MASK;
}

#endif /* _LINUX_MM_TYPES_H */

























































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Runtime locking correctness validator
 *
 *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * see Documentation/locking/lockdep-design.rst for more details.
 */
#ifndef __LINUX_LOCKDEP_H
#define __LINUX_LOCKDEP_H

#include <linux/lockdep_types.h>
#include <linux/smp.h>
#include <asm/percpu.h>

struct task_struct;

#ifdef CONFIG_LOCKDEP

#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/debug_locks.h>
#include <linux/stacktrace.h>

/*
 * Copy the lockdep map @from into @to, then reset every class cache slot of
 * the copy to NULL.
 */
static inline void lockdep_copy_map(struct lockdep_map *to,
                                    struct lockdep_map *from)
{
        int slot = 0;

        *to = *from;

        /*
         * The class cache may be updated concurrently, so the structure copy
         * above could have picked up a torn pointer (e.g. a 64bit arch copying
         * with 32bit instructions). Drop the cached entries in the copy and
         * accept the performance cost.
         *
         * XXX this interacts badly with lockdep_set_class_and_subclass(),
         *     which relies on abusing the cache.
         */
        while (slot < NR_LOCKDEP_CACHING_CLASSES)
                to->class_cache[slot++] = NULL;
}

/*
 * Every lock has a list of other locks that were taken after it.
 * We only grow the list, never remove from it:
 */
/*
 * One entry in a lock's dependency list.
 *
 * NOTE(review): the exact roles of @class and @links_to are not visible in
 * this header; confirm against the lockdep core before relying on them.
 */
struct lock_list {
        /* presumably the linkage into the owning lock's dependency list — confirm */
        struct list_head                entry;
        struct lock_class                *class;
        struct lock_class                *links_to;
        const struct lock_trace                *trace;
        u16                                distance;
        /* bitmap of different dependencies from head to this */
        u8                                dep;
        /* used by BFS to record whether "prev -> this" only has -(*R)-> */
        u8                                only_xr;

        /*
         * The parent field is used to implement breadth-first search, and the
         * bit 0 is reused to indicate if the lock has been accessed in BFS.
         */
        struct lock_list                *parent;
};

/**
 * struct lock_chain - lock dependency chain record
 *
 * @irq_context: the same as irq_context in held_lock below
 * @depth:       the number of held locks in this chain
 * @base:        the index in chain_hlocks for this chain
 * @entry:       the collided lock chains in lock_chain hash list
 * @chain_key:   the hash key of this lock_chain
 */
struct lock_chain {
        /* see BUILD_BUG_ON()s in add_chain_cache() */
        /*
         * Three bitfields sharing one unsigned int: 2 + 6 + 24 = 32 bits, so
         * depth can hold at most 63 and base at most 2^24 - 1.
         */
        unsigned int                        irq_context :  2,
                                        depth       :  6,
                                        base            : 24;
        /* 4 byte hole */
        struct hlist_node                entry;
        u64                                chain_key;
};

/*
 * Initialization, self-test and debugging-output methods:
 */
extern void lockdep_init(void);
extern void lockdep_reset(void);
extern void lockdep_reset_lock(struct lockdep_map *lock);
extern void lockdep_free_key_range(void *start, unsigned long size);
extern asmlinkage void lockdep_sys_exit(void);
extern void lockdep_set_selftest_task(struct task_struct *task);

extern void lockdep_init_task(struct task_struct *task);

/*
 * Split the recursion counter in two to readily detect 'off' vs recursion.
 */
/* Number of low bits reserved for the recursion count. */
#define LOCKDEP_RECURSION_BITS        16
/* First bit above the recursion count (1U << 16); the unit used by lockdep_off(). */
#define LOCKDEP_OFF                (1U << LOCKDEP_RECURSION_BITS)
/* Mask selecting the low LOCKDEP_RECURSION_BITS bits. */
#define LOCKDEP_RECURSION_MASK        (LOCKDEP_OFF - 1)

/*
 * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due
 * to header dependencies.
 */

/*
 * Add LOCKDEP_OFF to current->lockdep_recursion. lockdep_on() subtracts the
 * same amount, so paired calls nest.
 */
#define lockdep_off()                                        \
do {                                                        \
        current->lockdep_recursion += LOCKDEP_OFF;        \
} while (0)

/* Subtract LOCKDEP_OFF from current->lockdep_recursion, undoing one lockdep_off(). */
#define lockdep_on()                                        \
do {                                                        \
        current->lockdep_recursion -= LOCKDEP_OFF;        \
} while (0)

extern void lockdep_register_key(struct lock_class_key *key);
extern void lockdep_unregister_key(struct lock_class_key *key);

/*
 * These methods are used by specific locking variants (spinlocks,
 * rwlocks, mutexes and rwsems) to pass init/acquire/release events
 * to lockdep:
 */

extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
        struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type);

/*
 * Initialise @lock with explicit inner and outer wait types. The lock type is
 * fixed to LD_LOCK_NORMAL; everything else is forwarded unchanged to
 * lockdep_init_map_type().
 */
static inline void
lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
                       struct lock_class_key *key, int subclass, u8 inner, u8 outer)
{
        const u8 lock_type = LD_LOCK_NORMAL;

        lockdep_init_map_type(lock, name, key, subclass, inner, outer, lock_type);
}

/*
 * Initialise @lock with an explicit inner wait type; the outer wait type is
 * passed to lockdep_init_map_waits() as LD_WAIT_INV.
 */
static inline void
lockdep_init_map_wait(struct lockdep_map *lock, const char *name,
                      struct lock_class_key *key, int subclass, u8 inner)
{
        const u8 outer = LD_WAIT_INV;

        lockdep_init_map_waits(lock, name, key, subclass, inner, outer);
}

static inline void lockdep_init_map(struct lockdep_map *lock, const char *name,
                             struct lock_class_key *key, int subclass)
{
        lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV);
}

/*
 * Reinitialize a lock key - for cases where there is special locking or
 * special initialization of locks so that the validator gets the scope
 * of dependencies wrong: they are either too broad (they need a class-split)
 * or they are too narrow (they suffer from a false class-split):
 */
/*
 * Re-initialise (lock)->dep_map with @key and subclass 0. The class name is
 * the stringified @key expression; the existing wait types and lock type are
 * carried over. Note that @lock is evaluated more than once.
 */
#define lockdep_set_class(lock, key)                                \
        lockdep_init_map_type(&(lock)->dep_map, #key, key, 0,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/* As lockdep_set_class(), but with an explicit @name instead of the stringified key. */
#define lockdep_set_class_and_name(lock, key, name)                \
        lockdep_init_map_type(&(lock)->dep_map, name, key, 0,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/* As lockdep_set_class(), but with subclass @sub instead of 0. */
#define lockdep_set_class_and_subclass(lock, key, sub)                \
        lockdep_init_map_type(&(lock)->dep_map, #key, key, sub,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/*
 * Change only the subclass: the current name, key, wait types and lock type
 * of (lock)->dep_map are passed back in unchanged.
 */
#define lockdep_set_subclass(lock, sub)                                        \
        lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\
                              (lock)->dep_map.wait_type_inner,                \
                              (lock)->dep_map.wait_type_outer,                \
                              (lock)->dep_map.lock_type)

/**
 * lockdep_set_novalidate_class: disable checking of lock ordering on a given
 * lock
 * @lock: Lock to mark
 *
 * Lockdep will still record that this lock has been taken, and print held
 * instances when dumping locks
 */
#define lockdep_set_novalidate_class(lock) \
        lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock)

/**
 * lockdep_set_notrack_class: disable lockdep tracking of a given lock entirely
 * @lock: Lock to mark
 *
 * Bigger hammer than lockdep_set_novalidate_class: so far just for bcachefs,
 * which takes more locks than lockdep is able to track (48).
 */
#define lockdep_set_notrack_class(lock) \
        lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock)

/*
 * Compare locking classes
 */
#define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key)

/* Return 1 when @lock's key pointer is exactly @key, 0 otherwise. */
static inline int lockdep_match_key(struct lockdep_map *lock,
                                    struct lock_class_key *key)
{
        if (lock->key != key)
                return 0;

        return 1;
}

/*
 * Acquire a lock.
 *
 * Values for "read":
 *
 *   0: exclusive (write) acquire
 *   1: read-acquire (no recursion allowed)
 *   2: read-acquire with same-instance recursion allowed
 *
 * Values for check:
 *
 *   0: simple checks (freeing, held-at-exit-time, etc.)
 *   1: full validation
 */
extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                         int trylock, int read, int check,
                         struct lockdep_map *nest_lock, unsigned long ip);

extern void lock_release(struct lockdep_map *lock, unsigned long ip);

extern void lock_sync(struct lockdep_map *lock, unsigned int subclass,
                      int read, int check, struct lockdep_map *nest_lock,
                      unsigned long ip);

/* lock_is_held_type() returns */
#define LOCK_STATE_UNKNOWN        -1
#define LOCK_STATE_NOT_HELD        0
#define LOCK_STATE_HELD                1

/*
 * Same "read" as for lock_acquire(), except -1 means any.
 */
extern int lock_is_held_type(const struct lockdep_map *lock, int read);

/*
 * Query whether @lock is held, regardless of read mode. Returns whatever
 * lock_is_held_type() reports (one of the LOCK_STATE_* values).
 */
static inline int lock_is_held(const struct lockdep_map *lock)
{
        /* -1 selects "any" read mode in lock_is_held_type(). */
        const int any_read_mode = -1;

        return lock_is_held_type(lock, any_read_mode);
}

#define lockdep_is_held(lock)                lock_is_held(&(lock)->dep_map)
#define lockdep_is_held_type(lock, r)        lock_is_held_type(&(lock)->dep_map, (r))

extern void lock_set_class(struct lockdep_map *lock, const char *name,
                           struct lock_class_key *key, unsigned int subclass,
                           unsigned long ip);

#define lock_set_novalidate_class(l, n, i) \
        lock_set_class(l, n, &__lockdep_no_validate__, 0, i)

/*
 * Change the subclass of @lock via lock_set_class(), passing its current
 * name and key back in unchanged.
 */
static inline void lock_set_subclass(struct lockdep_map *lock,
                unsigned int subclass, unsigned long ip)
{
        const char *cur_name = lock->name;
        struct lock_class_key *cur_key = lock->key;

        lock_set_class(lock, cur_name, cur_key, subclass, ip);
}

extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);

#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }

extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);

#define lockdep_depth(tsk)        (debug_locks ? (tsk)->lockdep_depth : 0)

/* WARN_ON() when debug_locks is set and @cond is false. */
#define lockdep_assert(cond)                \
        do { WARN_ON(debug_locks && !(cond)); } while (0)

/* As lockdep_assert(), but using WARN_ON_ONCE(). */
#define lockdep_assert_once(cond)        \
        do { WARN_ON_ONCE(debug_locks && !(cond)); } while (0)

/*
 * Warn only if the lock is reported as LOCK_STATE_NOT_HELD; a
 * LOCK_STATE_UNKNOWN result does not trigger the warning.
 */
#define lockdep_assert_held(l)                \
        lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD)

/*
 * Warn only if the lock is reported as LOCK_STATE_HELD; a
 * LOCK_STATE_UNKNOWN result does not trigger the warning.
 */
#define lockdep_assert_not_held(l)        \
        lockdep_assert(lockdep_is_held(l) != LOCK_STATE_HELD)

/*
 * Queries with read == 0 (exclusive/write). Any non-zero result, including
 * LOCK_STATE_UNKNOWN, satisfies the assertion.
 */
#define lockdep_assert_held_write(l)        \
        lockdep_assert(lockdep_is_held_type(l, 0))

/*
 * Queries with read == 1 (read-acquire). Any non-zero result, including
 * LOCK_STATE_UNKNOWN, satisfies the assertion.
 */
#define lockdep_assert_held_read(l)        \
        lockdep_assert(lockdep_is_held_type(l, 1))

/* As lockdep_assert_held(), but warns at most once. */
#define lockdep_assert_held_once(l)                \
        lockdep_assert_once(lockdep_is_held(l) != LOCK_STATE_NOT_HELD)

/* Warn (once) if current->lockdep_depth is non-zero. */
#define lockdep_assert_none_held_once()                \
        lockdep_assert_once(!current->lockdep_depth)

#define lockdep_recursing(tsk)        ((tsk)->lockdep_recursion)

#define lockdep_pin_lock(l)        lock_pin_lock(&(l)->dep_map)
#define lockdep_repin_lock(l,c)        lock_repin_lock(&(l)->dep_map, (c))
#define lockdep_unpin_lock(l,c)        lock_unpin_lock(&(l)->dep_map, (c))

/*
 * Must use lock_map_acquire_try() with override maps to avoid
 * lockdep thinking they participate in the block chain.
 */
#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)        \
        struct lockdep_map _name = {                        \
                .name = #_name "-wait-type-override",        \
                .wait_type_inner = _wait_type,                \
                .lock_type = LD_LOCK_WAIT_OVERRIDE, }

#else /* !CONFIG_LOCKDEP */

/*
 * No-op stand-ins for the !CONFIG_LOCKDEP build: each has an empty body so
 * callers compile unchanged.
 */
static inline void lockdep_init_task(struct task_struct *task)
{
}

/* No-op: there is no recursion counter to adjust in this configuration. */
static inline void lockdep_off(void)
{
}

/* No-op counterpart of lockdep_off(). */
static inline void lockdep_on(void)
{
}

/* No-op: @task is ignored. */
static inline void lockdep_set_selftest_task(struct task_struct *task)
{
}

# define lock_acquire(l, s, t, r, c, n, i)        do { } while (0)
# define lock_release(l, i)                        do { } while (0)
# define lock_downgrade(l, i)                        do { } while (0)
# define lock_set_class(l, n, key, s, i)        do { (void)(key); } while (0)
# define lock_set_novalidate_class(l, n, i)        do { } while (0)
# define lock_set_subclass(l, s, i)                do { } while (0)
# define lockdep_init()                                do { } while (0)
# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_wait(lock, name, key, sub, inner) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map(lock, name, key, sub) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_set_class(lock, key)                do { (void)(key); } while (0)
# define lockdep_set_class_and_name(lock, key, name) \
                do { (void)(key); (void)(name); } while (0)
#define lockdep_set_class_and_subclass(lock, key, sub) \
                do { (void)(key); } while (0)
#define lockdep_set_subclass(lock, sub)                do { } while (0)

#define lockdep_set_novalidate_class(lock) do { } while (0)
#define lockdep_set_notrack_class(lock) do { } while (0)

/*
 * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP
 * case since the result is not well defined and the caller should rather
 * #ifdef the call himself.
 */

# define lockdep_reset()                do { debug_locks = 1; } while (0)
# define lockdep_free_key_range(start, size)        do { } while (0)
# define lockdep_sys_exit()                         do { } while (0)

/* !CONFIG_LOCKDEP stub: does nothing with @key. */
static inline void lockdep_register_key(struct lock_class_key *key)
{
}

/* !CONFIG_LOCKDEP stub: does nothing with @key. */
static inline void lockdep_unregister_key(struct lock_class_key *key)
{
}

#define lockdep_depth(tsk)        (0)

/*
 * Dummy forward declarations, allow users to write less ifdef-y code
 * and depend on dead code elimination.
 */
extern int lock_is_held(const void *);
extern int lockdep_is_held(const void *);
#define lockdep_is_held_type(l, r)                (1)

#define lockdep_assert(c)                        do { } while (0)
#define lockdep_assert_once(c)                        do { } while (0)

#define lockdep_assert_held(l)                        do { (void)(l); } while (0)
#define lockdep_assert_not_held(l)                do { (void)(l); } while (0)
#define lockdep_assert_held_write(l)                do { (void)(l); } while (0)
#define lockdep_assert_held_read(l)                do { (void)(l); } while (0)
#define lockdep_assert_held_once(l)                do { (void)(l); } while (0)
#define lockdep_assert_none_held_once()        do { } while (0)

#define lockdep_recursing(tsk)                        (0)

#define NIL_COOKIE (struct pin_cookie){ }

#define lockdep_pin_lock(l)                        ({ struct pin_cookie cookie = { }; cookie; })
#define lockdep_repin_lock(l, c)                do { (void)(l); (void)(c); } while (0)
#define lockdep_unpin_lock(l, c)                do { (void)(l); (void)(c); } while (0)

#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)        \
        struct lockdep_map __maybe_unused _name = {}

#endif /* !LOCKDEP */

#ifdef CONFIG_PROVE_LOCKING
void lockdep_set_lock_cmp_fn(struct lockdep_map *, lock_cmp_fn, lock_print_fn);

#define lock_set_cmp_fn(lock, ...)        lockdep_set_lock_cmp_fn(&(lock)->dep_map, __VA_ARGS__)
#else
#define lock_set_cmp_fn(lock, ...)        do { } while (0)
#endif

enum xhlock_context_t {
        XHLOCK_HARD,
        XHLOCK_SOFT,
        XHLOCK_CTX_NR,
};

/*
 * To initialize a lockdep_map statically use this macro.
 * Note that _name must not be NULL.
 */
#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
        { .name = (_name), .key = (void *)(_key), }

static inline void lockdep_invariant_state(bool force) {}
static inline void lockdep_free_task(struct task_struct *task) {}

#ifdef CONFIG_LOCK_STAT

extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);

/*
 * Acquire @_lock while feeding lock statistics: first attempt @try; if it
 * returns zero, report contention and call @lock. lock_acquired() is then
 * reported unconditionally. @try and @lock are caller-supplied callables
 * invoked with @_lock as their only argument.
 */
#define LOCK_CONTENDED(_lock, try, lock)                        \
do {                                                                \
        if (!try(_lock)) {                                        \
                lock_contended(&(_lock)->dep_map, _RET_IP_);        \
                lock(_lock);                                        \
        }                                                        \
        lock_acquired(&(_lock)->dep_map, _RET_IP_);                        \
} while (0)

/*
 * Variant for a @lock that returns an int status. The expression evaluates
 * to 0 when @try succeeds, otherwise to @lock's return value;
 * lock_acquired() is reported only when that value is 0.
 */
#define LOCK_CONTENDED_RETURN(_lock, try, lock)                        \
({                                                                \
        int ____err = 0;                                        \
        if (!try(_lock)) {                                        \
                lock_contended(&(_lock)->dep_map, _RET_IP_);        \
                ____err = lock(_lock);                                \
        }                                                        \
        if (!____err)                                                \
                lock_acquired(&(_lock)->dep_map, _RET_IP_);        \
        ____err;                                                \
})

#else /* CONFIG_LOCK_STAT */

#define lock_contended(lockdep_map, ip) do {} while (0)
#define lock_acquired(lockdep_map, ip) do {} while (0)

#define LOCK_CONTENDED(_lock, try, lock) \
        lock(_lock)

#define LOCK_CONTENDED_RETURN(_lock, try, lock) \
        lock(_lock)

#endif /* CONFIG_LOCK_STAT */

#ifdef CONFIG_PROVE_LOCKING
extern void print_irqtrace_events(struct task_struct *curr);
#else
static inline void print_irqtrace_events(struct task_struct *curr)
{
}
#endif

/* Variable used to make lockdep treat read_lock() as recursive in selftests */
#ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS
extern unsigned int force_read_lock_recursive;
#else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */
#define force_read_lock_recursive 0
#endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */

#ifdef CONFIG_LOCKDEP
extern bool read_lock_is_recursive(void);
#else /* CONFIG_LOCKDEP */
/* If !LOCKDEP, the value is meaningless */
#define read_lock_is_recursive() 0
#endif

/*
 * For trivial one-depth nesting of a lock-class, the following
 * global define can be used. (Subsystems with multiple levels
 * of nesting should define their own lock-nesting subclasses.)
 */
#define SINGLE_DEPTH_NESTING                        1

/*
 * Map the dependency ops to NOP or to real lockdep ops, depending
 * on the per lock-class debug mode:
 */

#define lock_acquire_exclusive(l, s, t, n, i)                lock_acquire(l, s, t, 0, 1, n, i)
#define lock_acquire_shared(l, s, t, n, i)                lock_acquire(l, s, t, 1, 1, n, i)
#define lock_acquire_shared_recursive(l, s, t, n, i)        lock_acquire(l, s, t, 2, 1, n, i)

#define spin_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define spin_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define spin_release(l, i)                        lock_release(l, i)

#define rwlock_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
/*
 * Read-side rwlock acquire annotation: uses the recursive shared acquire
 * (read == 2) when read_lock_is_recursive() is true, otherwise the plain
 * shared acquire (read == 1).
 */
#define rwlock_acquire_read(l, s, t, i)                                        \
do {                                                                        \
        if (read_lock_is_recursive())                                        \
                lock_acquire_shared_recursive(l, s, t, NULL, i);        \
        else                                                                \
                lock_acquire_shared(l, s, t, NULL, i);                        \
} while (0)

#define rwlock_release(l, i)                        lock_release(l, i)

#define seqcount_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define seqcount_acquire_read(l, s, t, i)        lock_acquire_shared_recursive(l, s, t, NULL, i)
#define seqcount_release(l, i)                        lock_release(l, i)

#define mutex_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define mutex_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define mutex_release(l, i)                        lock_release(l, i)

#define rwsem_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define rwsem_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define rwsem_acquire_read(l, s, t, i)                lock_acquire_shared(l, s, t, NULL, i)
#define rwsem_release(l, i)                        lock_release(l, i)

#define lock_map_acquire(l)                        lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_try(l)                        lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_acquire_read(l)                lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_tryread(l)                lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_release(l)                        lock_release(l, _THIS_IP_)
#define lock_map_sync(l)                        lock_sync(l, 0, 0, 1, NULL, _THIS_IP_)

#ifdef CONFIG_PROVE_LOCKING
/*
 * Annotate that @lock may be taken here: performs an exclusive (read == 0)
 * lock_acquire() with full validation, immediately followed by
 * lock_release(). typecheck() verifies that @lock has a dep_map of type
 * struct lockdep_map.
 */
# define might_lock(lock)                                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_);        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)
/* As might_lock(), but annotates a read acquire (read == 1). */
# define might_lock_read(lock)                                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_);        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)
/*
 * As above, with an explicit @subclass.
 * NOTE(review): this passes read == 1, matching might_lock_read() rather
 * than might_lock() — confirm that a read acquire is intended here.
 */
# define might_lock_nested(lock, subclass)                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL,                \
                     _THIS_IP_);                                        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)

DECLARE_PER_CPU(int, hardirqs_enabled);
DECLARE_PER_CPU(int, hardirq_context);
DECLARE_PER_CPU(unsigned int, lockdep_recursion);

/* True when debug_locks is set and this CPU's lockdep_recursion is zero. */
#define __lockdep_enabled        (debug_locks && !this_cpu_read(lockdep_recursion))

/* Warn once if this CPU's hardirqs_enabled is zero. */
#define lockdep_assert_irqs_enabled()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \
} while (0)

/* Warn once if this CPU's hardirqs_enabled is non-zero. */
#define lockdep_assert_irqs_disabled()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \
} while (0)

/* Warn once if this CPU's hardirq_context is zero. */
#define lockdep_assert_in_irq()                                                \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \
} while (0)

/* Warn once if hardirq_context is non-zero or hardirqs_enabled is zero. */
#define lockdep_assert_no_hardirq()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && (this_cpu_read(hardirq_context) || \
                                           !this_cpu_read(hardirqs_enabled))); \
} while (0)

/*
 * Only active with CONFIG_PREEMPT_COUNT. Warn once if preempt_count() is
 * non-zero or hardirqs are not enabled on this CPU.
 */
#define lockdep_assert_preemption_enabled()                                \
do {                                                                        \
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)        &&                \
                     __lockdep_enabled                        &&                \
                     (preempt_count() != 0                ||                \
                      !this_cpu_read(hardirqs_enabled)));                \
} while (0)

/*
 * Only active with CONFIG_PREEMPT_COUNT. Warn once if preempt_count() is
 * zero while hardirqs are enabled on this CPU, i.e. neither condition is
 * preventing preemption.
 */
#define lockdep_assert_preemption_disabled()                                \
do {                                                                        \
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)        &&                \
                     __lockdep_enabled                        &&                \
                     (preempt_count() == 0                &&                \
                      this_cpu_read(hardirqs_enabled)));                \
} while (0)

/*
 * Acceptable for protecting per-CPU resources accessed from BH.
 * Much like in_softirq() - semantics are ambiguous, use carefully.
 */
#define lockdep_assert_in_softirq()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled                        &&                \
                     (!in_softirq() || in_irq() || in_nmi()));                \
} while (0)

extern void lockdep_assert_in_softirq_func(void);

#else
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
# define might_lock_nested(lock, subclass) do { } while (0)

# define lockdep_assert_irqs_enabled() do { } while (0)
# define lockdep_assert_irqs_disabled() do { } while (0)
# define lockdep_assert_in_irq() do { } while (0)
# define lockdep_assert_no_hardirq() do { } while (0)

# define lockdep_assert_preemption_enabled() do { } while (0)
# define lockdep_assert_preemption_disabled() do { } while (0)
# define lockdep_assert_in_softirq() do { } while (0)
# define lockdep_assert_in_softirq_func() do { } while (0)
#endif

#ifdef CONFIG_PROVE_RAW_LOCK_NESTING

# define lockdep_assert_RT_in_threaded_ctx() do {                        \
                WARN_ONCE(debug_locks && !current->lockdep_recursion &&        \
                          lockdep_hardirq_context() &&                        \
                          !(current->hardirq_threaded || current->irq_config),        \
                          "Not in threaded context on PREEMPT_RT as expected\n");        \
} while (0)

#else

# define lockdep_assert_RT_in_threaded_ctx() do { } while (0)

#endif

#ifdef CONFIG_LOCKDEP
void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
#else
static inline void
lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
}
#endif

#endif /* __LINUX_LOCKDEP_H */





























































































































































































































































































































































































































































































































































































































































































  303 




























































  303 























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * workqueue.h --- work queue handling for Linux.
 */

#ifndef _LINUX_WORKQUEUE_H
#define _LINUX_WORKQUEUE_H

#include <linux/alloc_tag.h>
#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cpumask_types.h>
#include <linux/rcupdate.h>
#include <linux/workqueue_types.h>

/*
 * The first word is the work queue pointer and the flags rolled into
 * one.
 *
 * work_data_bits() yields the address of @work->data cast to
 * unsigned long *, which is the form bit helpers such as test_bit()
 * in work_pending() take.
 */
#define work_data_bits(work) ((unsigned long *)(&(work)->data))

/*
 * Bit positions and field widths used to pack state into work->data.
 *
 * Two layouts share the low WORK_STRUCT_FLAG_BITS flag bits: one used
 * when WORK_STRUCT_PWQ is set (pwq pointer + flush color) and one used
 * when it is clear (pool ID + disable depth + off-queue flags).  See the
 * diagrams inside the enum.
 */
enum work_bits {
        WORK_STRUCT_PENDING_BIT        = 0,        /* work item is pending execution */
        WORK_STRUCT_INACTIVE_BIT,        /* work item is inactive */
        WORK_STRUCT_PWQ_BIT,                /* data points to pwq */
        WORK_STRUCT_LINKED_BIT,                /* next work is linked to this one */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC_BIT,                /* static initializer (debugobjects) */
#endif
        WORK_STRUCT_FLAG_BITS,                /* number of STRUCT flag bits: 4, or 5 w/ DEBUG_OBJECTS_WORK */

        /* color for workqueue flushing */
        WORK_STRUCT_COLOR_SHIFT        = WORK_STRUCT_FLAG_BITS,
        WORK_STRUCT_COLOR_BITS        = 4,

        /*
         * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/
         * debugobjects turned off. This makes pwqs aligned to 256 bytes (512
         * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors.
         *
         * MSB
         * [ pwq pointer ] [ flush color ] [ STRUCT flags ]
         *                     4 bits        4 or 5 bits
         */
        WORK_STRUCT_PWQ_SHIFT        = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS,

        /*
         * data contains off-queue information when !WORK_STRUCT_PWQ.
         *
         * MSB
         * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ]
         *                  16 bits          1 bit        4 or 5 bits
         */
        WORK_OFFQ_FLAG_SHIFT        = WORK_STRUCT_FLAG_BITS,
        WORK_OFFQ_BH_BIT        = WORK_OFFQ_FLAG_SHIFT,
        WORK_OFFQ_FLAG_END,
        WORK_OFFQ_FLAG_BITS        = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT,

        WORK_OFFQ_DISABLE_SHIFT        = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS,
        WORK_OFFQ_DISABLE_BITS        = 16,

        /*
         * When a work item is off queue, the high bits encode off-queue flags
         * and the last pool it was on. Cap pool ID to 31 bits and use the
         * highest number to indicate that no pool is associated.
         */
        WORK_OFFQ_POOL_SHIFT        = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS,
        WORK_OFFQ_LEFT                = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT,        /* bits left above the disable depth */
        WORK_OFFQ_POOL_BITS        = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31,
};

/*
 * Mask form (1 << bit) of the WORK_STRUCT_*_BIT positions in enum work_bits.
 *
 * WORK_STRUCT_STATIC is defined as 0 when CONFIG_DEBUG_OBJECTS_WORK is off,
 * so OR-ing it into a data word is then a no-op.
 */
enum work_flags {
        WORK_STRUCT_PENDING        = 1 << WORK_STRUCT_PENDING_BIT,
        WORK_STRUCT_INACTIVE        = 1 << WORK_STRUCT_INACTIVE_BIT,
        WORK_STRUCT_PWQ                = 1 << WORK_STRUCT_PWQ_BIT,
        WORK_STRUCT_LINKED        = 1 << WORK_STRUCT_LINKED_BIT,
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC        = 1 << WORK_STRUCT_STATIC_BIT,
#else
        WORK_STRUCT_STATIC        = 0,
#endif
};

/* Miscellaneous workqueue constants that do not belong to a bit layout. */
enum wq_misc_consts {
        /* number of flush colors: 1 << WORK_STRUCT_COLOR_BITS, i.e. 16 */
        WORK_NR_COLORS                = (1 << WORK_STRUCT_COLOR_BITS),

        /* not bound to any CPU, prefer the local CPU */
        WORK_CPU_UNBOUND        = NR_CPUS,

        /* bit mask for work_busy() return values */
        WORK_BUSY_PENDING        = 1 << 0,
        WORK_BUSY_RUNNING        = 1 << 1,

        /* maximum string length for set_worker_desc() */
        WORKER_DESC_LEN                = 32,
};

/*
 * Convenience constants - of type 'unsigned long', not 'enum'!
 *
 * WORK_OFFQ_BH:           mask for WORK_OFFQ_BH_BIT.
 * WORK_OFFQ_FLAG_MASK:    mask covering all off-queue flag bits.
 * WORK_OFFQ_DISABLE_MASK: mask covering the disable-depth field.
 * WORK_OFFQ_POOL_NONE:    all-ones pool ID, the highest number, used to
 *                         indicate that no pool is associated.
 * WORK_STRUCT_NO_POOL:    WORK_OFFQ_POOL_NONE shifted into the pool ID field.
 * WORK_STRUCT_PWQ_MASK:   clears the low WORK_STRUCT_PWQ_SHIFT bits, leaving
 *                         the pwq pointer part of the data word.
 */
#define WORK_OFFQ_BH                (1ul << WORK_OFFQ_BH_BIT)
#define WORK_OFFQ_FLAG_MASK        (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT)
#define WORK_OFFQ_DISABLE_MASK        (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT)
#define WORK_OFFQ_POOL_NONE        ((1ul << WORK_OFFQ_POOL_BITS) - 1)
#define WORK_STRUCT_NO_POOL        (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT)
#define WORK_STRUCT_PWQ_MASK        (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1))

/*
 * Initial values for work->data: "no pool", plus WORK_STRUCT_STATIC for
 * statically initialised work items.
 */
#define WORK_DATA_INIT()        ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT()        \
        ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))

/*
 * A work item paired with a timer.  The embedded ->work is what gets
 * queued; to_delayed_work() maps it back to this structure.
 */
struct delayed_work {
        struct work_struct work;
        struct timer_list timer;

        /* target workqueue and CPU ->timer uses to queue ->work */
        struct workqueue_struct *wq;
        int cpu;
};

/*
 * A work item paired with an rcu_head.  The embedded ->work is what gets
 * queued; to_rcu_work() maps it back to this structure.
 */
struct rcu_work {
        struct work_struct work;
        struct rcu_head rcu;

        /* target workqueue ->rcu uses to queue ->work */
        struct workqueue_struct *wq;
};

/* Affinity scopes selectable through workqueue_attrs.affn_scope. */
enum wq_affn_scope {
        WQ_AFFN_DFL,                        /* use system default */
        WQ_AFFN_CPU,                        /* one pod per CPU */
        WQ_AFFN_SMT,                        /* one pod per SMT */
        WQ_AFFN_CACHE,                        /* one pod per LLC */
        WQ_AFFN_NUMA,                        /* one pod per NUMA node */
        WQ_AFFN_SYSTEM,                        /* one pod across the whole system */

        WQ_AFFN_NR_TYPES,                /* number of scopes above; keep last */
};

/**
 * struct workqueue_attrs - A struct for workqueue attributes.
 *
 * This can be used to change attributes of an unbound workqueue.
 */
struct workqueue_attrs {
        /**
         * @nice: nice level
         */
        int nice;

        /**
         * @cpumask: allowed CPUs
         *
         * Work items in this workqueue are affine to these CPUs and not allowed
         * to execute on other CPUs. A pool serving a workqueue must have the
         * same @cpumask.
         */
        cpumask_var_t cpumask;

        /**
         * @__pod_cpumask: internal attribute used to create per-pod pools
         *
         * Internal use only.
         *
         * Per-pod unbound worker pools are used to improve locality. Always a
         * subset of ->cpumask. A workqueue can be associated with multiple
         * worker pools with disjoint @__pod_cpumask's. Whether the enforcement
         * of a pool's @__pod_cpumask is strict depends on @affn_strict.
         */
        cpumask_var_t __pod_cpumask;

        /**
         * @affn_strict: affinity scope is strict
         *
         * If clear, workqueue will make a best-effort attempt at starting the
         * worker inside @__pod_cpumask but the scheduler is free to migrate it
         * outside.
         *
         * If set, workers are only allowed to run inside @__pod_cpumask.
         */
        bool affn_strict;

        /*
         * The fields below aren't properties of a worker_pool. They only modify
         * how :c:func:`apply_workqueue_attrs` selects pools and thus don't
         * participate in pool hash calculations or equality comparisons.
         *
         * If @affn_strict is set, @cpumask isn't a property of a worker_pool
         * either.
         */

        /**
         * @affn_scope: unbound CPU affinity scope
         *
         * CPU pods are used to improve execution locality of unbound work
         * items. There are multiple pod types, one for each wq_affn_scope, and
         * every CPU in the system belongs to one pod in every pod type. CPUs
         * that belong to the same pod share the worker pool. For example,
         * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker
         * pool for each NUMA node.
         */
        enum wq_affn_scope affn_scope;

        /**
         * @ordered: work items must be executed one by one in queueing order
         */
        bool ordered;
};

/*
 * to_delayed_work - map an embedded work_struct back to its delayed_work.
 * @work: pointer to the ->work member of a struct delayed_work
 *
 * Returns the enclosing struct delayed_work, computed with container_of().
 */
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
{
        struct delayed_work *dwork;

        dwork = container_of(work, struct delayed_work, work);
        return dwork;
}

/*
 * to_rcu_work - map an embedded work_struct back to its rcu_work.
 * @work: pointer to the ->work member of a struct rcu_work
 *
 * Returns the enclosing struct rcu_work, computed with container_of().
 */
static inline struct rcu_work *to_rcu_work(struct work_struct *work)
{
        struct rcu_work *rwork;

        rwork = container_of(work, struct rcu_work, work);
        return rwork;
}

/*
 * Wrapper around a single work_struct; this is the type taken by
 * execute_in_process_context().
 */
struct execute_work {
        struct work_struct work;
};

#ifdef CONFIG_LOCKDEP
/*
 * NB: because we have to copy the lockdep_map, setting _key
 * here is required, otherwise it could get initialised to the
 * copy of the lockdep_map!
 *
 * Expands to a designated initializer for .lockdep_map, including the
 * trailing comma, so it can be dropped into a struct initializer list.
 */
#define __WORK_INIT_LOCKDEP_MAP(n, k) \
        .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k),
#else
/* Without lockdep there is no lockdep_map initializer: expands to nothing. */
#define __WORK_INIT_LOCKDEP_MAP(n, k)
#endif

/*
 * Static initializer for the work_struct object @n with callback @f.
 *
 * The data word starts as WORK_DATA_STATIC_INIT(), ->entry points at itself
 * in both directions (an empty list head), and the lockdep map, when
 * present, is named after @n and keyed on the address of @n.
 */
#define __WORK_INITIALIZER(n, f) {                                        \
        .data = WORK_DATA_STATIC_INIT(),                                \
        .entry        = { &(n).entry, &(n).entry },                                \
        .func = (f),                                                        \
        __WORK_INIT_LOCKDEP_MAP(#n, &(n))                                \
        }

/*
 * Static initializer for the delayed_work object @n with callback @f.
 *
 * The embedded work is set up with __WORK_INITIALIZER(); the timer runs
 * delayed_work_timer_fn with TIMER_IRQSAFE always OR-ed into @tflags.
 */
#define __DELAYED_WORK_INITIALIZER(n, f, tflags) {                        \
        .work = __WORK_INITIALIZER((n).work, (f)),                        \
        .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\
                                     (tflags) | TIMER_IRQSAFE),                \
        }

/* Define a statically initialised work_struct named @n that runs @f. */
#define DECLARE_WORK(n, f)                                                \
        struct work_struct n = __WORK_INITIALIZER(n, f)

/* Define a statically initialised delayed_work named @n (no extra timer flags). */
#define DECLARE_DELAYED_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)

/* As DECLARE_DELAYED_WORK(), but the timer is created with TIMER_DEFERRABLE. */
#define DECLARE_DEFERRABLE_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE)

#ifdef CONFIG_DEBUG_OBJECTS_WORK
/* debugobjects hooks, implemented out of line. */
extern void __init_work(struct work_struct *work, int onstack);
extern void destroy_work_on_stack(struct work_struct *work);
extern void destroy_delayed_work_on_stack(struct delayed_work *work);

/* Non-zero when WORK_STRUCT_STATIC is set in @work's data word. */
static inline unsigned int work_static(struct work_struct *work)
{
        unsigned long data = *work_data_bits(work);

        return data & WORK_STRUCT_STATIC;
}
#else
/* Without CONFIG_DEBUG_OBJECTS_WORK the hooks are empty stubs. */
static inline void __init_work(struct work_struct *work, int onstack)
{
}

static inline void destroy_work_on_stack(struct work_struct *work)
{
}

static inline void destroy_delayed_work_on_stack(struct delayed_work *work)
{
}

/* No static tracking without debugobjects: always reports 0. */
static inline unsigned int work_static(struct work_struct *work)
{
        return 0;
}
#endif

/*
 * initialize all of a work item in one go
 *
 * NOTE! No point in using "atomic_long_set()": using a direct
 * assignment of the work data initializer allows the compiler
 * to generate better code.
 *
 * @_work:    pointer to the work_struct to initialise
 * @_func:    callback stored in ->func
 * @_onstack: forwarded to __init_work()
 * @_key:     lock class key handed to lockdep_init_map(); unused when
 *            CONFIG_LOCKDEP is off
 *
 * With lockdep the map is named "(work_completion)" followed by the
 * stringified @_work expression.
 */
#ifdef CONFIG_LOCKDEP
#define __INIT_WORK_KEY(_work, _func, _onstack, _key)                        \
        do {                                                                \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#else
#define __INIT_WORK_KEY(_work, _func, _onstack, _key)                        \
        do {                                                                \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#endif

/*
 * Like __INIT_WORK_KEY() but supplies its own key: every expansion site
 * declares a static struct lock_class_key and passes its address on.
 * The key is marked __maybe_unused because the !CONFIG_LOCKDEP variant of
 * __INIT_WORK_KEY() does not reference it.
 */
#define __INIT_WORK(_work, _func, _onstack)                                \
        do {                                                                \
                static __maybe_unused struct lock_class_key __key;        \
                                                                        \
                __INIT_WORK_KEY(_work, _func, _onstack, &__key);        \
        } while (0)

/* Initialise @_work to run @_func; passes 0 as the onstack argument. */
#define INIT_WORK(_work, _func)                                                \
        __INIT_WORK((_work), (_func), 0)

/* As INIT_WORK(), but passes 1 as the onstack argument. */
#define INIT_WORK_ONSTACK(_work, _func)                                        \
        __INIT_WORK((_work), (_func), 1)

/* As INIT_WORK_ONSTACK(), but with a caller-supplied lock class key @_key. */
#define INIT_WORK_ONSTACK_KEY(_work, _func, _key)                        \
        __INIT_WORK_KEY((_work), (_func), 1, _key)

/*
 * Initialise the delayed_work @_work: the embedded work item via
 * INIT_WORK() and the timer via __timer_init() with delayed_work_timer_fn.
 * TIMER_IRQSAFE is always OR-ed into @_tflags.
 */
#define __INIT_DELAYED_WORK(_work, _func, _tflags)                        \
        do {                                                                \
                INIT_WORK(&(_work)->work, (_func));                        \
                __timer_init(&(_work)->timer,                                \
                             delayed_work_timer_fn,                        \
                             (_tflags) | TIMER_IRQSAFE);                \
        } while (0)

/*
 * On-stack variant: uses INIT_WORK_ONSTACK() and __timer_init_on_stack()
 * instead, otherwise identical to __INIT_DELAYED_WORK().
 */
#define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags)                \
        do {                                                                \
                INIT_WORK_ONSTACK(&(_work)->work, (_func));                \
                __timer_init_on_stack(&(_work)->timer,                        \
                                      delayed_work_timer_fn,                \
                                      (_tflags) | TIMER_IRQSAFE);        \
        } while (0)

/* Delayed work with no extra timer flags. */
#define INIT_DELAYED_WORK(_work, _func)                                        \
        __INIT_DELAYED_WORK(_work, _func, 0)

#define INIT_DELAYED_WORK_ONSTACK(_work, _func)                                \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0)

/* Delayed work whose timer is initialised with TIMER_DEFERRABLE. */
#define INIT_DEFERRABLE_WORK(_work, _func)                                \
        __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE)

#define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func)                        \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE)

/*
 * RCU work: only the embedded work item is initialised here; the macros
 * do not touch the ->rcu or ->wq members.
 */
#define INIT_RCU_WORK(_work, _func)                                        \
        INIT_WORK(&(_work)->work, (_func))

#define INIT_RCU_WORK_ONSTACK(_work, _func)                                \
        INIT_WORK_ONSTACK(&(_work)->work, (_func))

/**
 * work_pending - Find out whether a work item is currently pending
 * @work: The work item in question
 *
 * Evaluates to the result of test_bit() for %WORK_STRUCT_PENDING_BIT on the
 * work item's data word.
 */
#define work_pending(work) \
        test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))

/**
 * delayed_work_pending - Find out whether a delayable work item is currently
 * pending
 * @w: The work item in question
 *
 * Applies work_pending() to the work item embedded in @w.
 */
#define delayed_work_pending(w) \
        work_pending(&(w)->work)

/*
 * Workqueue flags and constants.  For details, please refer to
 * Documentation/core-api/workqueue.rst.
 */
enum wq_flags {
        WQ_BH                        = 1 << 0, /* execute in bottom half (softirq) context */
        WQ_UNBOUND                = 1 << 1, /* not bound to any cpu */
        WQ_FREEZABLE                = 1 << 2, /* freeze during suspend */
        WQ_MEM_RECLAIM                = 1 << 3, /* may be used for memory reclaim */
        WQ_HIGHPRI                = 1 << 4, /* high priority */
        WQ_CPU_INTENSIVE        = 1 << 5, /* cpu intensive workqueue */
        WQ_SYSFS                = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */

        /*
         * Per-cpu workqueues are generally preferred because they tend to
         * show better performance thanks to cache locality.  Per-cpu
         * workqueues exclude the scheduler from choosing the CPU to
         * execute the worker threads, which has an unfortunate side effect
         * of increasing power consumption.
         *
         * The scheduler considers a CPU idle if it doesn't have any task
         * to execute and tries to keep idle cores idle to conserve power;
         * however, for example, a per-cpu work item scheduled from an
         * interrupt handler on an idle CPU will force the scheduler to
         * execute the work item on that CPU breaking the idleness, which in
         * turn may lead to more scheduling choices which are sub-optimal
         * in terms of power consumption.
         *
         * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
         * but become unbound if workqueue.power_efficient kernel param is
         * specified.  Per-cpu workqueues which are found to contribute
         * significantly to power consumption are marked with this flag,
         * and enabling the power_efficient mode leads to noticeable power
         * saving at the cost of a small performance disadvantage.
         *
         * http://thread.gmane.org/gmane.linux.kernel/1480396
         */
        WQ_POWER_EFFICIENT        = 1 << 7,
        WQ_PERCPU                = 1 << 8, /* bound to a specific cpu */

        /* bits 15 and up are internal flags, marked by the __WQ_ prefix */
        __WQ_DESTROYING                = 1 << 15, /* internal: workqueue is destroying */
        __WQ_DRAINING                = 1 << 16, /* internal: workqueue is draining */
        __WQ_ORDERED                = 1 << 17, /* internal: workqueue is ordered */
        __WQ_LEGACY                = 1 << 18, /* internal: create*_workqueue() */

        /* BH wq only allows the following flags */
        __WQ_BH_ALLOWS                = WQ_BH | WQ_HIGHPRI | WQ_PERCPU,
};

/* Limits and defaults for the max_active / min_active settings. */
enum wq_consts {
        WQ_MAX_ACTIVE                = 2048,          /* I like 2048, better ideas? */
        WQ_UNBOUND_MAX_ACTIVE        = WQ_MAX_ACTIVE, /* same cap as WQ_MAX_ACTIVE */
        /*
         * Half of WQ_MAX_ACTIVE.  NOTE(review): presumably the value used
         * when a caller passes 0 for max_active - confirm in the
         * implementation.
         */
        WQ_DFL_ACTIVE                = WQ_MAX_ACTIVE / 2,

        /*
         * Per-node default cap on min_active. Unless explicitly set, min_active
         * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see
         * workqueue_struct->min_active definition.
         */
        WQ_DFL_MIN_ACTIVE        = 8,
};

/*
 * System-wide workqueues which are always present.
 *
 * system_percpu_wq is the one used by schedule[_delayed]_work[_on]().
 * Multi-CPU multi-threaded.  There are users which expect relatively
 * short queue flush time.  Don't queue works which can run for too
 * long.
 *
 * system_highpri_wq is similar to system_percpu_wq but for work items which
 * require WQ_HIGHPRI.
 *
 * system_long_wq is similar to system_percpu_wq but may host long running
 * works.  Queue flushing might take relatively long.
 *
 * system_dfl_wq is an unbound workqueue.  Workers are not bound to
 * any specific CPU, not concurrency managed, and all queued works are
 * executed immediately as long as max_active limit is not reached and
 * resources are available.
 *
 * system_freezable_wq is equivalent to system_percpu_wq except that it's
 * freezable.
 *
 * *_power_efficient_wq are inclined towards saving power and converted
 * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
 * they are same as their non-power-efficient counterparts - e.g.
 * system_power_efficient_wq is identical to system_percpu_wq if
 * 'wq_power_efficient' is disabled.  See WQ_POWER_EFFICIENT for more info.
 *
 * system_bh[_highpri]_wq are convenience interfaces to softirq. BH work items
 * are executed in the queueing CPU's BH context in the queueing order.
 *
 * NOTE(review): system_unbound_wq is declared below but not described
 * above; presumably it is the older name for system_dfl_wq - confirm.
 */
extern struct workqueue_struct *system_wq; /* use system_percpu_wq, this will be removed */
extern struct workqueue_struct *system_percpu_wq;
extern struct workqueue_struct *system_highpri_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_dfl_wq;
extern struct workqueue_struct *system_freezable_wq;
extern struct workqueue_struct *system_power_efficient_wq;
extern struct workqueue_struct *system_freezable_power_efficient_wq;
extern struct workqueue_struct *system_bh_wq;
extern struct workqueue_struct *system_bh_highpri_wq;

/*
 * NOTE(review): presumably the softirq-side entry points for the BH
 * workqueues above - confirm against the implementation.
 */
void workqueue_softirq_action(bool highpri);
void workqueue_softirq_dead(unsigned int cpu);

/**
 * alloc_workqueue - allocate a workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags
 * @max_active: max in-flight work items, 0 for default
 * @...: args for @fmt
 *
 * For a per-cpu workqueue, @max_active limits the number of in-flight work
 * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be
 * executing at most one work item for the workqueue.
 *
 * For unbound workqueues, @max_active limits the number of in-flight work items
 * for the whole system. e.g. @max_active of 16 indicates that there can be
 * at most 16 work items executing for the workqueue in the whole system.
 *
 * As sharing the same active counter for an unbound workqueue across multiple
 * NUMA nodes can be expensive, @max_active is distributed to each NUMA node
 * according to the proportion of the number of online CPUs and enforced
 * independently.
 *
 * Depending on online CPU distribution, a node may end up with per-node
 * max_active which is significantly lower than @max_active, which can lead to
 * deadlocks if the per-node concurrency limit is lower than the maximum number
 * of interdependent work items for the workqueue.
 *
 * To guarantee forward progress regardless of online CPU distribution, the
 * concurrency limit on every node is guaranteed to be equal to or greater than
 * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means
 * that the sum of per-node max_active's may be larger than @max_active.
 *
 * For detailed information on %WQ_\* flags, please refer to
 * Documentation/core-api/workqueue.rst.
 *
 * alloc_workqueue() is a macro that wraps alloc_workqueue_noprof() in
 * alloc_hooks(); the arguments are passed through unchanged.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
__printf(1, 4) struct workqueue_struct *
alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...);
#define alloc_workqueue(...)        alloc_hooks(alloc_workqueue_noprof(__VA_ARGS__))

#ifdef CONFIG_LOCKDEP
/**
 * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags
 * @max_active: max in-flight work items, 0 for default
 * @lockdep_map: user-defined lockdep_map
 * @...: args for @fmt
 *
 * Same as alloc_workqueue but with a user-defined lockdep_map. Useful for
 * workqueues created with the same purpose and to avoid leaking a lockdep_map
 * on each workqueue creation.
 *
 * Only available when CONFIG_LOCKDEP is enabled.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
__printf(1, 5) struct workqueue_struct *
alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
                            struct lockdep_map *lockdep_map, ...);

/**
 * alloc_ordered_workqueue_lockdep_map - allocate an ordered workqueue with
 * user-defined lockdep_map
 *
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
 * @lockdep_map: user-defined lockdep_map
 * @args: args for @fmt
 *
 * Same as alloc_ordered_workqueue but with a user-defined lockdep_map.
 * Useful for workqueues created with the same purpose and to avoid leaking a
 * lockdep_map on each workqueue creation.
 *
 * %WQ_UNBOUND and %__WQ_ORDERED are OR-ed into @flags and @max_active is
 * fixed at 1.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
#define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...)        \
        alloc_hooks(alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags),\
                                                1, lockdep_map, ##args))
#endif

/**
 * alloc_ordered_workqueue - allocate an ordered workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
 * @args: args for @fmt
 *
 * Allocate an ordered workqueue.  An ordered workqueue executes at
 * most one work item at any given time in the queued order.  They are
 * implemented as unbound workqueues with @max_active of one: the macro
 * calls alloc_workqueue() with %WQ_UNBOUND and %__WQ_ORDERED OR-ed into
 * @flags.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
#define alloc_ordered_workqueue(fmt, flags, args...)                        \
        alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)

/*
 * Legacy creation helpers.  All of them tag the workqueue with __WQ_LEGACY
 * and WQ_MEM_RECLAIM, pass @name through a "%s" format and use a
 * max_active of 1.
 *
 * create_workqueue():              per-cpu (WQ_PERCPU)
 * create_freezable_workqueue():    unbound and freezable
 * create_singlethread_workqueue(): ordered, via alloc_ordered_workqueue()
 */
#define create_workqueue(name)                                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name))
#define create_freezable_workqueue(name)                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |        \
                        WQ_MEM_RECLAIM, 1, (name))
#define create_singlethread_workqueue(name)                                \
        alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)

/*
 * from_work - recover the structure that embeds a work item.
 * @var: pointer expression whose pointee type is the containing structure;
 *       it only appears inside typeof(), so it supplies a type, not a value
 * @callback_work: pointer to the embedded member, as handed to the callback
 * @work_fieldname: name of that member inside the containing structure
 *
 * Expands to container_of(@callback_work, typeof(*(@var)), @work_fieldname).
 * @var is parenthesised so that an expression argument (e.g. "base + 1")
 * is dereferenced as a whole rather than mis-parsed as "*base + 1".
 */
#define from_work(var, callback_work, work_fieldname)        \
        container_of(callback_work, typeof(*(var)), work_fieldname)

/*
 * Workqueue teardown and attribute management.  Only prototypes are
 * visible in this header.
 *
 * alloc_workqueue_attrs() is a macro wrapping alloc_workqueue_attrs_noprof()
 * in alloc_hooks().
 *
 * NOTE(review): presumably attrs obtained from alloc_workqueue_attrs() are
 * released with free_workqueue_attrs() - confirm in the implementation.
 */
extern void destroy_workqueue(struct workqueue_struct *wq);

struct workqueue_attrs *alloc_workqueue_attrs_noprof(void);
#define alloc_workqueue_attrs(...)        alloc_hooks(alloc_workqueue_attrs_noprof(__VA_ARGS__))

void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs);
extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);

/*
 * Queueing primitives, implemented out of line.
 *
 * The inline helpers in this header (queue_work(), queue_delayed_work(),
 * mod_delayed_work(), schedule_work_on(), schedule_work()) forward to the
 * *_on() variants, passing WORK_CPU_UNBOUND when the caller did not name
 * a CPU.
 */
extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
                        struct work_struct *work);
extern bool queue_work_node(int node, struct workqueue_struct *wq,
                            struct work_struct *work);
extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *work, unsigned long delay);
extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *dwork, unsigned long delay);
extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork);

/*
 * Flush, cancel, disable and enable entry points, implemented out of line.
 *
 * NOTE(review): the meaning of the bool return values is not visible in
 * this header - check the kernel-doc on the definitions before relying
 * on them.
 */

/* Whole-workqueue operations. */
extern void __flush_workqueue(struct workqueue_struct *wq);
extern void drain_workqueue(struct workqueue_struct *wq);

extern int schedule_on_each_cpu(work_func_t func);

int execute_in_process_context(work_func_t fn, struct execute_work *);

/* Plain work items. */
extern bool flush_work(struct work_struct *work);
extern bool cancel_work(struct work_struct *work);
extern bool cancel_work_sync(struct work_struct *work);

/* Delayed work items. */
extern bool flush_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work_sync(struct delayed_work *dwork);

/* Disable/enable for plain work items. */
extern bool disable_work(struct work_struct *work);
extern bool disable_work_sync(struct work_struct *work);
extern bool enable_work(struct work_struct *work);

/* Disable/enable for delayed work items. */
extern bool disable_delayed_work(struct delayed_work *dwork);
extern bool disable_delayed_work_sync(struct delayed_work *dwork);
extern bool enable_delayed_work(struct delayed_work *dwork);

/* RCU work items. */
extern bool flush_rcu_work(struct rcu_work *rwork);

/*
 * Tuning, introspection and debugging helpers, implemented out of line.
 *
 * work_busy() return values are a bit mask of WORK_BUSY_PENDING and
 * WORK_BUSY_RUNNING.  set_worker_desc() takes a printf-style format;
 * WORKER_DESC_LEN is the maximum string length for it.
 */
extern void workqueue_set_max_active(struct workqueue_struct *wq,
                                     int max_active);
extern void workqueue_set_min_active(struct workqueue_struct *wq,
                                     int min_active);
extern struct work_struct *current_work(void);
extern bool current_is_workqueue_rescuer(void);
extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
extern unsigned int work_busy(struct work_struct *work);
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
extern void show_all_workqueues(void);
extern void show_freezable_workqueues(void);
extern void show_one_workqueue(struct workqueue_struct *wq);
extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);

/**
 * queue_work - queue a work item on a workqueue
 * @wq: workqueue to queue on
 * @work: work item to queue
 *
 * Forwards to queue_work_on() with %WORK_CPU_UNBOUND as the CPU.
 *
 * The work is queued to the CPU on which it was submitted, but if that CPU
 * dies it can be processed by another CPU.
 *
 * Memory-ordering properties:  when this returns %true, every store that
 * precedes the queue_work() call in program order is visible from the CPU
 * which will execute @work by the time such work executes, e.g.,
 *
 * { x is initially 0 }
 *
 *   CPU0                                CPU1
 *
 *   WRITE_ONCE(x, 1);                        [ @work is being executed ]
 *   r0 = queue_work(wq, work);                  r1 = READ_ONCE(x);
 *
 * Forbids: r0 == true && r1 == 0
 *
 * Returns %false if @work was already on a queue, %true otherwise.
 */
static inline bool queue_work(struct workqueue_struct *wq,
                              struct work_struct *work)
{
        const int cpu = WORK_CPU_UNBOUND;

        return queue_work_on(cpu, wq, work);
}

/**
 * queue_delayed_work - queue a work item on a workqueue after a delay
 * @wq: workqueue to queue on
 * @dwork: delayable work item to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Same as queue_delayed_work_on() with %WORK_CPU_UNBOUND as the CPU, i.e.
 * it tries to use the local CPU.  The result of queue_delayed_work_on()
 * is returned unchanged.
 */
static inline bool queue_delayed_work(struct workqueue_struct *wq,
                                      struct delayed_work *dwork,
                                      unsigned long delay)
{
        bool queued;

        queued = queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
        return queued;
}

/**
 * mod_delayed_work - change the delay of a delayed work item, or queue it
 * @wq: workqueue that should run the item
 * @dwork: delayed work item to modify or queue
 * @delay: how many jiffies to wait before the item is queued
 *
 * Convenience form of mod_delayed_work_on() for the local CPU: the call is
 * forwarded with %WORK_CPU_UNBOUND as the CPU argument.
 *
 * Return: the value returned by mod_delayed_work_on().
 */
static inline bool mod_delayed_work(struct workqueue_struct *wq,
                                    struct delayed_work *dwork,
                                    unsigned long delay)
{
        bool ret = mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);

        return ret;
}

/**
 * schedule_work_on - queue a work item on a given CPU
 * @cpu: CPU the work item should be put on
 * @work: work item to run
 *
 * Hands @work to queue_work_on() for @cpu, using system_percpu_wq as the
 * workqueue.
 *
 * Return: the value returned by queue_work_on().
 */
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
        bool queued = queue_work_on(cpu, system_percpu_wq, work);

        return queued;
}

/**
 * schedule_work - queue a work item on the kernel-global workqueue
 * @work: work item to run
 *
 * Queues @work on the kernel-global workqueue (system_percpu_wq) unless it is
 * already queued there, in which case it keeps its current position on that
 * workqueue.
 *
 * The memory-ordering guarantees are those of queue_work(); see the
 * kernel-doc comment of queue_work() for details.
 *
 * Return: %false if @work was already on the kernel-global workqueue, %true
 * otherwise.
 */
static inline bool schedule_work(struct work_struct *work)
{
        bool queued = queue_work(system_percpu_wq, work);

        return queued;
}

/**
 * enable_and_queue_work - enable a work item and queue it on a workqueue
 * @wq: workqueue to queue @work on
 * @work: work item to enable and queue
 *
 * Convenience helper combining enable_work() and queue_work(): enable_work()
 * is called on @work and, if that made the disable depth reach 0, @work is
 * queued on @wq.
 *
 * Queueing is unconditional once the disable depth reaches zero. Callers that
 * only want to queue when certain events happened while @work was disabled
 * must track that state themselves and queue explicitly after enable_work().
 *
 * Return: %true if the disable depth reached 0 and @work was queued, %false
 * otherwise.
 */
static inline bool enable_and_queue_work(struct workqueue_struct *wq,
                                         struct work_struct *work)
{
        /* Disable depth did not reach 0: nothing is queued. */
        if (!enable_work(work))
                return false;

        /* The return value of queue_work() is not propagated. */
        queue_work(wq, work);
        return true;
}

/*
 * Detect attempts to flush system-wide workqueues at compile time when possible.
 * Warn about attempts to flush system-wide workqueues at runtime.
 *
 * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp
 * for reasons and steps for converting system-wide workqueues into local workqueues.
 */
extern void __warn_flushing_systemwide_wq(void)
        __compiletime_warning("Please avoid flushing system-wide workqueues.");

/*
 * Deprecated: please stop using this macro, it will be removed in the near
 * future.
 *
 * Expands to an unconditional call to __warn_flushing_systemwide_wq(), which
 * is declared with __compiletime_warning(), followed by
 * __flush_workqueue(system_percpu_wq).
 */
#define flush_scheduled_work()                                                \
({                                                                        \
        __warn_flushing_systemwide_wq();                                \
        __flush_workqueue(system_percpu_wq);                                        \
})

/*
 * flush_workqueue - flush @wq via __flush_workqueue()
 *
 * @wq is evaluated exactly once (it is copied into the local _wq). Before the
 * flush, _wq is compared against each of the system-wide workqueues listed
 * below. Each comparison is guarded by __builtin_constant_p(), so
 * __warn_flushing_systemwide_wq() (declared with __compiletime_warning()) is
 * only referenced when the compiler can resolve the comparison at build time
 * and it is true. __flush_workqueue(_wq) is called in every case.
 */
#define flush_workqueue(wq)                                                \
({                                                                        \
        struct workqueue_struct *_wq = (wq);                                \
                                                                        \
        if ((__builtin_constant_p(_wq == system_percpu_wq) &&                        \
             _wq == system_percpu_wq) ||                                        \
            (__builtin_constant_p(_wq == system_highpri_wq) &&                \
             _wq == system_highpri_wq) ||                                \
            (__builtin_constant_p(_wq == system_long_wq) &&                \
             _wq == system_long_wq) ||                                        \
            (__builtin_constant_p(_wq == system_dfl_wq) &&                \
             _wq == system_dfl_wq) ||                                \
            (__builtin_constant_p(_wq == system_freezable_wq) &&        \
             _wq == system_freezable_wq) ||                                \
            (__builtin_constant_p(_wq == system_power_efficient_wq) &&        \
             _wq == system_power_efficient_wq) ||                        \
            (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \
             _wq == system_freezable_power_efficient_wq))                \
                __warn_flushing_systemwide_wq();                        \
        __flush_workqueue(_wq);                                                \
})

/**
 * schedule_delayed_work_on - queue delayed work on a CPU of the global workqueue
 * @cpu: CPU to use
 * @dwork: work item to run
 * @delay: how many jiffies to wait
 *
 * Once @delay has elapsed, @dwork is put on the kernel-global workqueue
 * (system_percpu_wq) on the given @cpu. Implemented by forwarding to
 * queue_delayed_work_on().
 *
 * Return: the value returned by queue_delayed_work_on().
 */
static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
                                            unsigned long delay)
{
        bool queued = queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay);

        return queued;
}

/**
 * schedule_delayed_work - queue delayed work on the kernel-global workqueue
 * @dwork: work item to run
 * @delay: how many jiffies to wait, or 0 to execute immediately
 *
 * Once @delay has elapsed, @dwork is put on the kernel-global workqueue
 * (system_percpu_wq). Implemented by forwarding to queue_delayed_work().
 *
 * Return: the value returned by queue_delayed_work().
 */
static inline bool schedule_delayed_work(struct delayed_work *dwork,
                                         unsigned long delay)
{
        bool queued = queue_delayed_work(system_percpu_wq, dwork, delay);

        return queued;
}

#ifndef CONFIG_SMP
/*
 * !CONFIG_SMP version of work_on_cpu(): @fn is invoked directly with @arg and
 * its return value is handed back to the caller. @cpu is not used.
 */
static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
        long ret = fn(arg);

        return ret;
}
/*
 * !CONFIG_SMP version of work_on_cpu_safe(): @fn is invoked directly with
 * @arg and its return value is handed back to the caller. @cpu is not used.
 *
 * NOTE(review): the CONFIG_SMP branch that follows only provides
 * work_on_cpu(); no SMP counterpart of work_on_cpu_safe() is visible in this
 * section - confirm whether this stub still has users.
 */
static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
        long ret = fn(arg);

        return ret;
}
#else
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key);
/*
 * A new key is defined for each caller to make sure the work
 * associated with the function doesn't share its locking class.
 *
 * The key is a function-local static inside the statement expression, so
 * every expansion of the macro gets its own struct lock_class_key. The macro
 * evaluates to the value returned by work_on_cpu_key().
 */
#define work_on_cpu(_cpu, _fn, _arg)                        \
({                                                        \
        static struct lock_class_key __key;                \
                                                        \
        work_on_cpu_key(_cpu, _fn, _arg, &__key);        \
})

#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER
extern void freeze_workqueues_begin(void);
extern bool freeze_workqueues_busy(void);
extern void thaw_workqueues(void);
#endif /* CONFIG_FREEZER */

#ifdef CONFIG_SYSFS
int workqueue_sysfs_register(struct workqueue_struct *wq);
#else        /* CONFIG_SYSFS */
/* !CONFIG_SYSFS stub: does nothing with @wq and always returns 0. */
static inline int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        return 0;
}
#endif        /* CONFIG_SYSFS */

#ifdef CONFIG_WQ_WATCHDOG
void wq_watchdog_touch(int cpu);
#else        /* CONFIG_WQ_WATCHDOG */
/* !CONFIG_WQ_WATCHDOG stub: no-op, @cpu is ignored. */
static inline void wq_watchdog_touch(int cpu)
{
}
#endif        /* CONFIG_WQ_WATCHDOG */

#ifdef CONFIG_SMP
int workqueue_prepare_cpu(unsigned int cpu);
int workqueue_online_cpu(unsigned int cpu);
int workqueue_offline_cpu(unsigned int cpu);
#endif

void __init workqueue_init_early(void);
void __init workqueue_init(void);
void __init workqueue_init_topology(void);

#endif






































































































































































































































































































































































































































































































































   39 

   39 





   39 























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
/*
 *  linux/include/linux/console.h
 *
 *  Copyright (C) 1993        Hamish Macdonald
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of this archive
 * for more details.
 *
 * Changed:
 * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX
 */

#ifndef _LINUX_CONSOLE_H_
#define _LINUX_CONSOLE_H_ 1

#include <linux/atomic.h>
#include <linux/bits.h>
#include <linux/irq_work.h>
#include <linux/rculist.h>
#include <linux/rcuwait.h>
#include <linux/types.h>
#include <linux/vesa.h>

struct vc_data;
struct console_font_op;
struct console_font;
struct module;
struct tty_struct;
struct notifier_block;

/**
 * enum con_scroll - scroll direction handed to the &consw.con_scroll hook
 * @SM_UP:        scroll up
 * @SM_DOWN:        scroll down
 */
enum con_scroll {
        SM_UP        = 0,
        SM_DOWN        = 1,
};

enum vc_intensity;

/**
 * struct consw - callbacks for consoles
 *
 * @owner:      the module to get references of when this console is used
 * @con_startup: set up the console and return its name (like VGA, EGA, ...)
 * @con_init:   initialize the console on @vc. @init is true for the very first
 *                call on this @vc.
 * @con_deinit: deinitialize the console from @vc.
 * @con_clear:  erase @count characters at [@x, @y] on @vc. @count >= 1.
 * @con_putc:   emit one character with attributes @ca to [@x, @y] on @vc.
 *                (optional -- @con_putcs would be called instead)
 * @con_putcs:  emit @count characters with attributes @s to [@x, @y] on @vc.
 * @con_cursor: enable/disable cursor depending on @enable
 * @con_scroll: move lines from @top to @bottom in direction @dir by @lines.
 *                Return true if no generic handling should be done.
 *                Invoked by csi_M and printing to the console.
 * @con_switch: notifier about the console switch; it is supposed to return
 *                true if a redraw is needed.
 * @con_blank:  blank/unblank the console. The target mode is passed in @blank.
 *                @mode_switch is set if changing from/to text/graphics. The hook
 *                is supposed to return true if a redraw is needed.
 * @con_font_set: set console @vc font to @font with height @vpitch. @flags can
 *                be %KD_FONT_FLAG_DONT_RECALC. (optional)
 * @con_font_get: fetch the current font on @vc of height @vpitch into @font.
 *                (optional)
 * @con_font_default: set default font on @vc. @name can be %NULL or font name
 *                to search for. @font can be filled back. (optional)
 * @con_resize: resize the @vc console to @width x @height. @from_user is true
 *                when this change comes from the user space.
 * @con_set_palette: sets the palette of the console @vc to @table (optional)
 * @con_scrolldelta: the contents of the console should be scrolled by @lines.
 *                     Invoked by user. (optional)
 * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not
 *                provided or returns false, the origin is set to
 *                @vc->vc_screenbuf. (optional)
 * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g.
 *                upon entering graphics. (optional)
 * @con_build_attr: build attributes based on @color, @intensity and other
 *                parameters. The result is used for both normal and erase
 *                characters. (optional)
 * @con_invert_region: invert a region of length @count on @vc starting at @p.
 *                (optional)
 * @con_debug_enter: prepare the console for the debugger. This includes, but
 *                is not limited to, unblanking the console, loading an
 *                appropriate palette, and allowing debugger generated output.
 *                (optional)
 * @con_debug_leave: restore the console to its pre-debug state as closely as
 *                possible. (optional)
 *
 * Note on argument order: the hooks that take a position (@con_clear,
 * @con_putc and @con_putcs) receive the y argument before the x argument,
 * even though the descriptions above write positions as [@x, @y].
 */
struct consw {
        struct module *owner;
        const char *(*con_startup)(void);
        void        (*con_init)(struct vc_data *vc, bool init);
        void        (*con_deinit)(struct vc_data *vc);
        void        (*con_clear)(struct vc_data *vc, unsigned int y,
                             unsigned int x, unsigned int count);
        void        (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y,
                            unsigned int x);
        void        (*con_putcs)(struct vc_data *vc, const u16 *s,
                             unsigned int count, unsigned int ypos,
                             unsigned int xpos);
        void        (*con_cursor)(struct vc_data *vc, bool enable);
        bool        (*con_scroll)(struct vc_data *vc, unsigned int top,
                        unsigned int bottom, enum con_scroll dir,
                        unsigned int lines);
        bool        (*con_switch)(struct vc_data *vc);
        bool        (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank,
                             bool mode_switch);
        int        (*con_font_set)(struct vc_data *vc,
                                const struct console_font *font,
                                unsigned int vpitch, unsigned int flags);
        int        (*con_font_get)(struct vc_data *vc, struct console_font *font,
                        unsigned int vpitch);
        int        (*con_font_default)(struct vc_data *vc,
                        struct console_font *font, const char *name);
        int     (*con_resize)(struct vc_data *vc, unsigned int width,
                              unsigned int height, bool from_user);
        void        (*con_set_palette)(struct vc_data *vc,
                        const unsigned char *table);
        void        (*con_scrolldelta)(struct vc_data *vc, int lines);
        bool        (*con_set_origin)(struct vc_data *vc);
        void        (*con_save_screen)(struct vc_data *vc);
        u8        (*con_build_attr)(struct vc_data *vc, u8 color,
                        enum vc_intensity intensity,
                        bool blink, bool underline, bool reverse, bool italic);
        void        (*con_invert_region)(struct vc_data *vc, u16 *p, int count);
        void        (*con_debug_enter)(struct vc_data *vc);
        void        (*con_debug_leave)(struct vc_data *vc);
};

extern const struct consw *conswitchp;

extern const struct consw dummy_con;        /* dummy console buffer */
extern const struct consw vga_con;        /* VGA text console */
extern const struct consw newport_con;        /* SGI Newport console  */

struct screen_info;
#ifdef CONFIG_VGA_CONSOLE
void vgacon_register_screen(struct screen_info *si);
#else
/* !CONFIG_VGA_CONSOLE stub: no-op, @si is ignored. */
static inline void vgacon_register_screen(struct screen_info *si)
{
}
#endif

int con_is_bound(const struct consw *csw);
int do_unregister_con_driver(const struct consw *csw);
int do_take_over_console(const struct consw *sw, int first, int last, int deflt);
void give_up_console(const struct consw *sw);
#ifdef CONFIG_VT
void con_debug_enter(struct vc_data *vc);
void con_debug_leave(void);
#else
/* !CONFIG_VT stub: no-op, @vc is ignored. */
static inline void con_debug_enter(struct vc_data *vc)
{
}

/* !CONFIG_VT stub: no-op. */
static inline void con_debug_leave(void)
{
}
#endif

/*
 * The interface for a console, or any other device that wants to capture
 * console messages (printer driver?)
 */

/**
 * enum cons_flags - General console flags
 * @CON_PRINTBUFFER:        Used by newly registered consoles to avoid duplicate
 *                        output of messages that were already shown by boot
 *                        consoles or read by userspace via syslog() syscall.
 * @CON_CONSDEV:        Indicates that the console driver is backing
 *                        /dev/console.
 * @CON_ENABLED:        Indicates if a console is allowed to print records. If
 *                        false, the console also will not advance to later
 *                        records.
 * @CON_BOOT:                Marks the console driver as early console driver which
 *                        is used during boot before the real driver becomes
 *                        available. It will be automatically unregistered
 *                        when the real console driver is registered unless
 *                        "keep_bootcon" parameter is used.
 * @CON_ANYTIME:        A misnamed historical flag which tells the core code
 *                        that the legacy @console::write callback can be invoked
 *                        on a CPU which is marked OFFLINE. That is misleading as
 *                        it suggests that there is no contextual limit for
 *                        invoking the callback. The original motivation was
 *                        readiness of the per-CPU areas.
 * @CON_BRL:                Indicates a braille device which is exempt from
 *                        receiving the printk spam for obvious reasons.
 * @CON_EXTENDED:        The console supports the extended output format of
 *                        /dev/kmesg which requires a larger output buffer.
 * @CON_SUSPENDED:        Indicates if a console is suspended. If true, the
 *                        printing callbacks must not be called.
 * @CON_NBCON:                Console can operate outside of the legacy style console_lock
 *                        constraints.
 *
 * Each flag is defined with BIT() on its own bit number (0 through 8). The
 * flags are kept in @console::flags, which is declared as a short.
 */
enum cons_flags {
        CON_PRINTBUFFER                = BIT(0),
        CON_CONSDEV                = BIT(1),
        CON_ENABLED                = BIT(2),
        CON_BOOT                = BIT(3),
        CON_ANYTIME                = BIT(4),
        CON_BRL                        = BIT(5),
        CON_EXTENDED                = BIT(6),
        CON_SUSPENDED                = BIT(7),
        CON_NBCON                = BIT(8),
};

/**
 * struct nbcon_state - console state for nbcon consoles
 * @atom:        Compound of the state fields for atomic operations
 *
 * @req_prio:                The priority of a handover request
 * @prio:                The priority of the current owner
 * @unsafe:                Console is busy in a non takeover region
 * @unsafe_takeover:        A hostile takeover in an unsafe state happened in the
 *                        past. The console cannot be safe until re-initialized.
 * @cpu:                The CPU on which the owner runs
 *
 * To be used for reading and preparing of the value stored in the nbcon
 * state variable @console::nbcon_state.
 *
 * The @prio and @req_prio fields are particularly important to allow
 * spin-waiting to timeout and give up without the risk of a waiter being
 * assigned the lock after giving up.
 *
 * The bitfields occupy 2 + 2 + 1 + 1 + 24 = 30 bits and share their storage
 * with @atom through the anonymous union, so the whole state can be handled
 * as a single unsigned int.
 */
struct nbcon_state {
        union {
                unsigned int        atom;
                struct {
                        unsigned int prio                :  2;
                        unsigned int req_prio                :  2;
                        unsigned int unsafe                :  1;
                        unsigned int unsafe_takeover        :  1;
                        unsigned int cpu                : 24;
                };
        };
};

/*
 * The nbcon_state struct is used to easily create and interpret values that
 * are stored in the @console::nbcon_state variable. Ensure this struct stays
 * within the size boundaries of the atomic variable's underlying type in
 * order to avoid any accidental truncation.
 */
static_assert(sizeof(struct nbcon_state) <= sizeof(int));

/**
 * enum nbcon_prio - console owner priority for nbcon consoles
 * @NBCON_PRIO_NONE:                Unused
 * @NBCON_PRIO_NORMAL:                Normal (non-emergency) usage
 * @NBCON_PRIO_EMERGENCY:        Emergency output (WARN/OOPS...)
 * @NBCON_PRIO_PANIC:                Panic output
 * @NBCON_PRIO_MAX:                The number of priority levels
 *
 * A higher priority context can takeover the console when it is
 * in the safe state. The final attempt to flush consoles in panic()
 * can be allowed to do so even in an unsafe state (Hope and pray).
 *
 * Only %NBCON_PRIO_NONE has an explicit value (0); the remaining enumerators
 * count up from it, which is what makes %NBCON_PRIO_MAX equal to the number
 * of levels.
 *
 * NOTE(review): these values are presumably what is stored in the 2-bit @prio
 * and @req_prio fields of struct nbcon_state, in which case no level above
 * %NBCON_PRIO_PANIC (3) can be added without widening those fields - confirm.
 */
enum nbcon_prio {
        NBCON_PRIO_NONE = 0,
        NBCON_PRIO_NORMAL,
        NBCON_PRIO_EMERGENCY,
        NBCON_PRIO_PANIC,
        NBCON_PRIO_MAX,
};

struct console;
struct printk_buffers;

/**
 * struct nbcon_context - Context for console acquire/release
 * @console:                        The associated console
 * @spinwait_max_us:                Limit for spin-wait acquire
 * @prio:                        Priority of the context
 * @allow_unsafe_takeover:        Allow performing takeover even if unsafe. Can
 *                                be used only with NBCON_PRIO_PANIC @prio. It
 *                                might cause a system freeze when the console
 *                                is used later.
 * @backlog:                        Ringbuffer has pending records
 * @pbufs:                        Pointer to the text buffer for this context
 * @seq:                        The sequence number to print for this context
 *
 * The members are grouped by who fills them in (the caller, the emit step or
 * the acquire step), as marked by the comments inside the structure.
 * @allow_unsafe_takeover and @backlog are single-bit bitfields.
 */
struct nbcon_context {
        /* members set by caller */
        struct console                *console;
        unsigned int                spinwait_max_us;
        enum nbcon_prio                prio;
        unsigned int                allow_unsafe_takeover        : 1;

        /* members set by emit */
        unsigned int                backlog                        : 1;

        /* members set by acquire */
        struct printk_buffers        *pbufs;
        u64                        seq;
};

/**
 * struct nbcon_write_context - Context handed to the nbcon write callbacks
 * @ctxt:                The core console context
 * @outbuf:                Pointer to the text buffer for output
 * @len:                Length to write
 * @unsafe_takeover:        If a hostile takeover in an unsafe state has occurred
 *
 * This is the type of the @wctxt argument of the @console::write_atomic and
 * @console::write_thread callbacks.
 *
 * NOTE(review): @ctxt is annotated __private, so it is presumably meant to be
 * touched only by the nbcon core and not by console drivers - confirm.
 */
struct nbcon_write_context {
        struct nbcon_context        __private ctxt;
        char                        *outbuf;
        unsigned int                len;
        bool                        unsafe_takeover;
};

/**
 * struct console - The console descriptor structure
 * @name:                The name of the console driver
 * @write:                Legacy write callback to output messages (Optional)
 * @read:                Read callback for console input (Optional)
 * @device:                The underlying TTY device driver (Optional)
 * @unblank:                Callback to unblank the console (Optional)
 * @setup:                Callback for initializing the console (Optional)
 * @exit:                Callback for teardown of the console (Optional)
 * @match:                Callback for matching a console (Optional)
 * @flags:                Console flags. See enum cons_flags
 * @index:                Console index, e.g. port number
 * @cflag:                TTY control mode flags
 * @ispeed:                TTY input speed
 * @ospeed:                TTY output speed
 * @seq:                Sequence number of the next ringbuffer record to print
 * @dropped:                Number of unreported dropped ringbuffer records
 * @data:                Driver private data
 * @node:                hlist node for the console list
 *
 * @nbcon_state:        State for nbcon consoles
 * @nbcon_seq:                Sequence number of the next record for nbcon to print
 * @nbcon_device_ctxt:        Context available for non-printing operations
 * @nbcon_prev_seq:        Seq num the previous nbcon owner was assigned to print
 * @pbufs:                Pointer to nbcon private buffer
 * @kthread:                Printer kthread for this console
 * @rcuwait:                RCU-safe wait object for @kthread waking
 * @irq_work:                Defer @kthread waking to IRQ work context
 *
 * The nbcon callbacks (@write_atomic, @write_thread, @device_lock and
 * @device_unlock) are documented in-line next to their declarations.
 */
struct console {
        char                        name[16];
        void                        (*write)(struct console *co, const char *s, unsigned int count);
        int                        (*read)(struct console *co, char *s, unsigned int count);
        struct tty_driver        *(*device)(struct console *co, int *index);
        void                        (*unblank)(void);
        int                        (*setup)(struct console *co, char *options);
        int                        (*exit)(struct console *co);
        int                        (*match)(struct console *co, char *name, int idx, char *options);
        /* Bitmask of enum cons_flags values. */
        short                        flags;
        short                        index;
        int                        cflag;
        uint                        ispeed;
        uint                        ospeed;
        u64                        seq;
        unsigned long                dropped;
        void                        *data;
        struct hlist_node        node;

        /* nbcon console specific members */

        /**
         * @write_atomic:
         *
         * NBCON callback to write out text in any context. (Optional)
         *
         * This callback is called with the console already acquired. However,
         * a higher priority context is allowed to take it over by default.
         *
         * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe()
         * around any code where the takeover is not safe, for example, when
         * manipulating the serial port registers.
         *
         * nbcon_enter_unsafe() will fail if the context has lost the console
         * ownership in the meantime. In this case, the callback is no longer
         * allowed to go forward. It must back out immediately and carefully.
         * The buffer content is also no longer trusted since it no longer
         * belongs to the context.
         *
         * The callback should allow the takeover whenever it is safe. It
         * increases the chance to see messages when the system is in trouble.
         * If the driver must reacquire ownership in order to finalize or
         * revert hardware changes, nbcon_reacquire_nobuf() can be used.
         * However, on reacquire the buffer content is no longer available. A
         * reacquire cannot be used to resume printing.
         *
         * The callback can be called from any context (including NMI).
         * Therefore it must avoid usage of any locking and instead rely
         * on the console ownership for synchronization.
         */
        void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt);

        /**
         * @write_thread:
         *
         * NBCON callback to write out text in task context.
         *
         * This callback must be called only in task context with both
         * device_lock() and the nbcon console acquired with
         * NBCON_PRIO_NORMAL.
         *
         * The same rules for console ownership verification and unsafe
         * sections handling apply as with write_atomic().
         *
         * The console ownership handling is necessary for synchronization
         * against write_atomic() which is synchronized only via the context.
         *
         * The device_lock() provides the primary serialization for operations
         * on the device. It might be as relaxed (mutex)[*] or as tight
         * (disabled preemption and interrupts) as needed. It allows
         * the kthread to operate in the least restrictive mode[**].
         *
         * [*] Standalone nbcon_context_try_acquire() is not safe with
         *     the preemption enabled, see nbcon_owner_matches(). But it
         *     can be safe when always called in the preemptive context
         *     under the device_lock().
         *
         * [**] The device_lock() makes sure that nbcon_context_try_acquire()
         *      would never need to spin which is important especially with
         *      PREEMPT_RT.
         */
        void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt);

        /**
         * @device_lock:
         *
         * NBCON callback to begin synchronization with driver code.
         *
         * Console drivers typically must deal with access to the hardware
         * via user input/output (such as an interactive login shell) and
         * output of kernel messages via printk() calls. This callback is
         * called by the printk-subsystem whenever it needs to synchronize
         * with hardware access by the driver. It should be implemented to
         * use whatever synchronization mechanism the driver is using for
         * itself (for example, the port lock for uart serial consoles).
         *
         * The callback is always called from task context. It may use any
         * synchronization method required by the driver.
         *
         * IMPORTANT: The callback MUST disable migration. The console driver
         *        may be using a synchronization mechanism that already takes
         *        care of this (such as spinlocks). Otherwise this function must
         *        explicitly call migrate_disable().
         *
         * The flags argument is provided as a convenience to the driver. It
         * will be passed again to device_unlock(). It can be ignored if the
         * driver does not need it.
         */
        void (*device_lock)(struct console *con, unsigned long *flags);

        /**
         * @device_unlock:
         *
         * NBCON callback to finish synchronization with driver code.
         *
         * It is the counterpart to device_lock().
         *
         * This callback is always called from task context. It must
         * appropriately re-enable migration (depending on how device_lock()
         * disabled migration).
         *
         * The flags argument is the value of the same variable that was
         * passed to device_lock().
         */
        void (*device_unlock)(struct console *con, unsigned long flags);

        atomic_t                __private nbcon_state;
        atomic_long_t                __private nbcon_seq;
        struct nbcon_context        __private nbcon_device_ctxt;
        atomic_long_t           __private nbcon_prev_seq;

        struct printk_buffers        *pbufs;
        struct task_struct        *kthread;
        struct rcuwait                rcuwait;
        struct irq_work                irq_work;
};

/*
 * lockdep_assert_console_list_lock_held() is provided out of line when
 * CONFIG_LOCKDEP is enabled. Without CONFIG_LOCKDEP it is an empty inline
 * stub, so callers may invoke it unconditionally.
 */
#ifdef CONFIG_LOCKDEP
extern void lockdep_assert_console_list_lock_held(void);
#else
static inline void lockdep_assert_console_list_lock_held(void)
{
}
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern bool console_srcu_read_lock_is_held(void);
#else
static inline bool console_srcu_read_lock_is_held(void)
{
        /*
         * No checking is compiled in for this configuration, so always
         * report the lock as held.
         */
        return true;
}
#endif

extern int console_srcu_read_lock(void);
extern void console_srcu_read_unlock(int cookie);

extern void console_list_lock(void) __acquires(console_mutex);
extern void console_list_unlock(void) __releases(console_mutex);

extern struct hlist_head console_list;

/**
 * console_srcu_read_flags - Locklessly read flags of a possibly registered
 *                                console
 * @con:        struct console pointer of console to read flags from
 *
 * Locklessly reading @con->flags provides a consistent read value because
 * there is at most one CPU modifying @con->flags and that CPU is using only
 * read-modify-write operations to do so.
 *
 * Requires console_srcu_read_lock to be held, which implies that @con might
 * be a registered console. The purpose of holding console_srcu_read_lock is
 * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED)
 * and that no exit/cleanup routines will run if the console is currently
 * undergoing unregistration.
 *
 * If the caller is holding the console_list_lock or it is _certain_ that
 * @con is not and will not become registered, the caller may read
 * @con->flags directly instead.
 *
 * Context: Any context.
 * Return: The current value of the @con->flags field.
 */
static inline short console_srcu_read_flags(const struct console *con)
{
        short flags;

        /* Complain once if the caller is not inside the SRCU read section. */
        WARN_ON_ONCE(!console_srcu_read_lock_is_held());

        /*
         * Pairs with the WRITE_ONCE() in console_srcu_write_flags(), which
         * is how the flags of registered consoles get modified.
         */
        flags = data_race(READ_ONCE(con->flags));

        return flags;
}

/**
 * console_srcu_write_flags - Write flags for a registered console
 * @con:        struct console pointer of console to write flags to
 * @flags:        new flags value to write
 *
 * Only use this function to write flags for registered consoles. It
 * requires holding the console_list_lock.
 *
 * Context: Any context.
 */
static inline void console_srcu_write_flags(struct console *con, short flags)
{
        /* Caller contract: the console_list_lock must be held. */
        lockdep_assert_console_list_lock_held();

        /*
         * This matches the READ_ONCE() in console_srcu_read_flags(), which
         * may read @con->flags locklessly while this store happens.
         */
        WRITE_ONCE(con->flags, flags);
}

/*
 * console_is_registered_locked - console_is_registered() for callers that
 * already hold the console_list_lock.
 *
 * Return: true if @con's list node is hashed, otherwise false.
 */
static inline bool console_is_registered_locked(const struct console *con)
{
        const struct hlist_node *list_node = &con->node;

        lockdep_assert_console_list_lock_held();

        return !hlist_unhashed(list_node);
}

/*
 * console_is_registered - Check if the console is registered
 * @con:        struct console pointer of console to check
 *
 * Context: Process context. May sleep while acquiring console list lock.
 * Return: true if the console is in the console list, otherwise false.
 *
 * If false is returned for a console that was previously registered, it
 * can be assumed that the console's unregistration is fully completed,
 * including the exit() callback after console list removal.
 */
static inline bool console_is_registered(const struct console *con)
{
        bool registered;

        /* Sample the registration state under the console list lock. */
        console_list_lock();
        registered = console_is_registered_locked(con);
        console_list_unlock();

        return registered;
}

/**
 * for_each_console_srcu() - Iterator over registered consoles
 * @con:        struct console pointer used as loop cursor
 *
 * Although SRCU guarantees the console list will be consistent, the
 * struct console fields may be updated by other CPUs while iterating.
 *
 * Requires console_srcu_read_lock to be held. Can be invoked from
 * any context.
 *
 * Walks the global console_list through each console's @node member;
 * console_srcu_read_lock_is_held() is handed to
 * hlist_for_each_entry_srcu() as its final argument.
 */
#define for_each_console_srcu(con)                                        \
        hlist_for_each_entry_srcu(con, &console_list, node,                \
                                  console_srcu_read_lock_is_held())

/**
 * for_each_console() - Iterator over registered consoles
 * @con:        struct console pointer used as loop cursor
 *
 * The console list and the &console.flags are immutable while iterating.
 *
 * Requires console_list_lock to be held.
 *
 * Note: the macro expands to TWO statements (the lockdep assertion followed
 * by the loop header). Do not use it as the unbraced body of an if/else or
 * of another loop; only the assertion would be governed by that statement.
 */
#define for_each_console(con)                                                \
        lockdep_assert_console_list_lock_held();                        \
        hlist_for_each_entry(con, &console_list, node)

#ifdef CONFIG_PRINTK
extern void nbcon_cpu_emergency_enter(void);
extern void nbcon_cpu_emergency_exit(void);
extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt);
#else
/* !CONFIG_PRINTK: emergency sections are no-ops. */
static inline void nbcon_cpu_emergency_enter(void)
{
}

static inline void nbcon_cpu_emergency_exit(void)
{
}

/* !CONFIG_PRINTK: the context queries always answer false. */
static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
{
        return false;
}

static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
{
        return false;
}

static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
{
        return false;
}

/* !CONFIG_PRINTK: nothing to reacquire. */
static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
{
}
#endif

extern int console_set_on_cmdline;
extern struct console *early_console;

/*
 * Mode argument of console_flush_on_panic().
 * NOTE(review): the per-value meanings below are inferred from the names
 * only -- confirm against the implementation.
 */
enum con_flush_mode {
        CONSOLE_FLUSH_PENDING,        /* presumably: flush only what is pending */
        CONSOLE_REPLAY_ALL,        /* presumably: replay all records */
};

extern int add_preferred_console(const char *name, const short idx, char *options);
extern void console_force_preferred_locked(struct console *con);
extern void register_console(struct console *);
extern int unregister_console(struct console *);
extern void console_lock(void);
extern int console_trylock(void);
extern void console_unlock(void);
extern void console_conditional_schedule(void);
extern void console_unblank(void);
extern void console_flush_on_panic(enum con_flush_mode mode);
extern struct tty_driver *console_device(int *);
extern void console_suspend(struct console *);
extern void console_resume(struct console *);
extern int is_console_locked(void);
extern int braille_register_console(struct console *, int index,
                char *console_options, char *braille_options);
extern int braille_unregister_console(struct console *);
/*
 * console_sysfs_notify() is only implemented when CONFIG_TTY is enabled;
 * otherwise it is an empty inline stub so callers need no #ifdef.
 */
#ifdef CONFIG_TTY
extern void console_sysfs_notify(void);
#else
static inline void console_sysfs_notify(void)
{ }
#endif
extern bool console_suspend_enabled;

/* Suspend and resume console messages over PM events */
extern void console_suspend_all(void);
extern void console_resume_all(void);

int mda_console_init(void);

void vcs_make_sysfs(int index);
void vcs_remove_sysfs(int index);

/*
 * Some debug stub to catch some of the obvious races in the VT code.
 *
 * Warns when all of the following hold: ignore_console_lock_warning reads
 * as zero, is_console_locked() reports the console as not locked, and no
 * oops is in progress.
 */
#define WARN_CONSOLE_UNLOCKED()                                                \
        WARN_ON(!atomic_read(&ignore_console_lock_warning) &&                \
                !is_console_locked() && !oops_in_progress)
/*
 * Increment ignore_console_lock_warning if you need to quiet
 * WARN_CONSOLE_UNLOCKED() for debugging purposes.
 */
extern atomic_t ignore_console_lock_warning;

/*
 * Lock guard named "console_lock": console_lock() is the acquire expression
 * and console_unlock() the release expression.
 * NOTE(review): presumably intended for scope-based locking as provided by
 * the DEFINE_LOCK_GUARD_0() helper -- confirm against its definition.
 */
DEFINE_LOCK_GUARD_0(console_lock, console_lock(), console_unlock());

extern void console_init(void);

/* For deferred console takeover */
void dummycon_register_output_notifier(struct notifier_block *nb);
void dummycon_unregister_output_notifier(struct notifier_block *nb);

#endif /* _LINUX_CONSOLE_H */


















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Statically sized hash table implementation
 * (C) 2012  Sasha Levin <levinsasha928@gmail.com>
 */

#ifndef _LINUX_HASHTABLE_H
#define _LINUX_HASHTABLE_H

#include <linux/list.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/rculist.h>

/*
 * DEFINE_HASHTABLE - define a hashtable array and statically initialize it
 * @name: identifier of the array
 * @bits: log2 of the number of buckets (the array has 1 << @bits entries)
 *
 * Every bucket is initialized with HLIST_HEAD_INIT.
 */
#define DEFINE_HASHTABLE(name, bits)                                                \
        struct hlist_head name[1 << (bits)] =                                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/* Same as DEFINE_HASHTABLE(), with the array additionally marked __read_mostly. */
#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits)                                \
        struct hlist_head name[1 << (bits)] __read_mostly =                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/*
 * DECLARE_HASHTABLE - declare a hashtable array of 1 << @bits buckets
 * without an initializer (e.g. as a struct member); see hash_init().
 */
#define DECLARE_HASHTABLE(name, bits)                                           \
        struct hlist_head name[1 << (bits)]

/*
 * HASH_SIZE() is the number of buckets, HASH_BITS() its log2. Both rely on
 * ARRAY_SIZE(), so @name must be the array itself, not a pointer to it.
 */
#define HASH_SIZE(name) (ARRAY_SIZE(name))
#define HASH_BITS(name) ilog2(HASH_SIZE(name))

/*
 * Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels.
 * The choice is made on sizeof(val): values of at most 4 bytes go through
 * hash_32(), anything larger through hash_long().
 */
#define hash_min(val, bits)                                                        \
        (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits))

/* Initialize the first @sz buckets of @ht as empty hlist heads. */
static inline void __hash_init(struct hlist_head *ht, unsigned int sz)
{
        unsigned int idx = 0;

        while (idx < sz) {
                INIT_HLIST_HEAD(&ht[idx]);
                idx++;
        }
}

/**
 * hash_init - initialize a hash table
 * @hashtable: hashtable to be initialized
 *
 * Initializes every bucket of @hashtable via __hash_init(), taking the
 * number of buckets from HASH_SIZE().
 *
 * This has to be a macro since HASH_SIZE() is built on ARRAY_SIZE() and
 * therefore only works on the array itself, not on a pointer to it.
 */
#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable))

/**
 * hash_add - add an object to a hashtable
 * @hashtable: hashtable to add to (the array itself; HASH_BITS() is applied)
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * The bucket is selected with hash_min(@key, HASH_BITS(@hashtable)) and
 * @node is added at the head of that bucket.
 */
#define hash_add(hashtable, node, key)                                                \
        hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_add_rcu - add an object to a rcu enabled hashtable
 * @hashtable: hashtable to add to (the array itself; HASH_BITS() is applied)
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * Same bucket selection as hash_add(), using hlist_add_head_rcu().
 */
#define hash_add_rcu(hashtable, node, key)                                        \
        hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_hashed - check whether an object is in any hashtable
 * @node: the &struct hlist_node of the object to be checked
 */
static inline bool hash_hashed(struct hlist_node *node)
{
        return !hlist_unhashed(node);
}

/* Return true when none of the first @sz buckets of @ht holds an entry. */
static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz)
{
        unsigned int idx = 0;

        /* Stop at the first non-empty bucket. */
        while (idx < sz && hlist_empty(&ht[idx]))
                idx++;

        return idx == sz;
}

/**
 * hash_empty - check whether a hashtable is empty
 * @hashtable: hashtable to check
 *
 * Evaluates to true when every bucket of @hashtable is empty.
 *
 * This has to be a macro since HASH_SIZE() is built on ARRAY_SIZE() and
 * therefore only works on the array itself, not on a pointer to it.
 */
#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable))

/**
 * hash_del - remove an object from a hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Thin wrapper around hlist_del_init(), i.e. the "_init" flavour of hlist
 * removal is used.
 */
static inline void hash_del(struct hlist_node *node)
{
        hlist_del_init(node);
}

/**
 * hash_del_rcu - remove an object from a rcu enabled hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Thin wrapper around hlist_del_init_rcu().
 */
static inline void hash_del_rcu(struct hlist_node *node)
{
        hlist_del_init_rcu(node);
}

/**
 * hash_for_each - iterate over a hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * The outer loop visits the buckets in index order and only continues while
 * @obj is NULL. A break out of the loop body leaves @obj non-NULL and thus
 * ends the whole iteration; note that @bkt is still incremented once before
 * the outer loop condition is re-evaluated.
 */
#define hash_for_each(name, bkt, obj, member)                                \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry(obj, &name[bkt], member)

/**
 * hash_for_each_rcu - iterate over a rcu enabled hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Same structure as hash_for_each(), with hlist_for_each_entry_rcu() as the
 * per-bucket iterator.
 */
#define hash_for_each_rcu(name, bkt, obj, member)                        \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry_rcu(obj, &name[bkt], member)

/**
 * hash_for_each_safe - iterate over a hashtable safe against removal of
 * hash entry
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @tmp: a &struct hlist_node used for temporary storage
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Same structure as hash_for_each(), with hlist_for_each_entry_safe() as
 * the per-bucket iterator.
 */
#define hash_for_each_safe(name, bkt, tmp, obj, member)                        \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry_safe(obj, tmp, &name[bkt], member)

/**
 * hash_for_each_possible - iterate over all possible objects hashing to the
 * same bucket
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the single bucket selected by hash_min(@key, HASH_BITS(@name)) is
 * walked. Every entry of that bucket is visited, so the loop body still has
 * to compare keys.
 */
#define hash_for_each_possible(name, obj, member, key)                        \
        hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_rcu - iterate over all possible objects hashing to the
 * same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 * @cond: optional extra argument(s), forwarded unchanged to
 *        hlist_for_each_entry_rcu()
 */
#define hash_for_each_possible_rcu(name, obj, member, key, cond...)        \
        hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\
                member, ## cond)

/**
 * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing
 * to the same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * This is the same as hash_for_each_possible_rcu() except that it does
 * not do any RCU debugging or tracing.
 */
#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \
        hlist_for_each_entry_rcu_notrace(obj, \
                &name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_safe - iterate over all possible objects hashing to the
 * same bucket safe against removals
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @tmp: a &struct hlist_node used for temporary storage
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 */
#define hash_for_each_possible_safe(name, obj, tmp, member, key)        \
        hlist_for_each_entry_safe(obj, tmp,\
                &name[hash_min(key, HASH_BITS(name))], member)


#endif








































































































































































































































































































































   19 




   19 
    2 





   17 



   17 




   19 



   19 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IP Payload Compression Protocol (IPComp) - RFC3173.
 *
 * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2003-2025 Herbert Xu <herbert@gondor.apana.org.au>
 *
 * Todo:
 *   - Tunable compression parameters.
 *   - Compression stats.
 *   - Adaptive compression.
 */

#include <crypto/acompress.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/skbuff_ref.h>
#include <linux/slab.h>
#include <net/ipcomp.h>
#include <net/xfrm.h>

/* Output size limit requested for decompression (see ipcomp_decompress()). */
#define IPCOMP_SCRATCH_SIZE 65400

/* Per-packet state kept in skb->cb while a request is in flight. */
struct ipcomp_skb_cb {
        struct xfrm_skb_cb xfrm;
        /* Request set up by ipcomp_setup_req(); NULL until it is allocated. */
        struct acomp_req *req;
};

/* Per-state data hung off xfrm_state->data by ipcomp_init_state(). */
struct ipcomp_data {
        /* Packets shorter than this are sent uncompressed (ipcomp_output()). */
        u16 threshold;
        /* Compression transform used for both directions. */
        struct crypto_acomp *tfm;
};

/* Extra area allocated together with each acomp request. */
struct ipcomp_req_extra {
        /* State the request belongs to; read by the output completion. */
        struct xfrm_state *x;
        /*
         * Scatterlist storage: the destination entries come first, the
         * source entries follow them (see ipcomp_setup_req()).
         */
        struct scatterlist sg[];
};

/* View skb->cb as the IPComp control block. */
static inline struct ipcomp_skb_cb *ipcomp_cb(struct sk_buff *skb)
{
        /* The control block has to fit into the space skb->cb offers. */
        BUILD_BUG_ON(sizeof(struct ipcomp_skb_cb) > sizeof(skb->cb));

        return (struct ipcomp_skb_cb *)(void *)skb->cb;
}

/*
 * ipcomp_post_acomp - finish a (de)compression request and rebuild the skb
 * @skb:  packet whose control block holds the request
 * @err:  status of the crypto operation, or of request setup
 * @hlen: bytes put back into the linear area ahead of the new payload;
 *        callers in this file pass 0 on input and the IPComp header size
 *        on output
 *
 * On success the skb is trimmed to zero length, @hlen bytes are re-added and
 * the destination pages that received output are attached as page
 * fragments; destination pages that were not needed are freed. The request
 * stored in the control block is passed to acomp_request_free() on every
 * path (it may be NULL when setup failed before the allocation).
 *
 * NOTE(review): when @err is non-zero only the request is released here; the
 * destination pages referenced by extra->sg are not touched on that path --
 * confirm they cannot be outstanding, or are released elsewhere.
 *
 * Return: @err, unchanged.
 */
static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen)
{
        struct acomp_req *req = ipcomp_cb(skb)->req;
        struct ipcomp_req_extra *extra;
        struct scatterlist *dsg;
        int len, dlen;

        if (unlikely(err))
                goto out_free_req;

        extra = acomp_request_extra(req);
        /* The destination scatterlist sits at the start of the extra area. */
        dsg = extra->sg;
        /* Number of bytes the operation produced. */
        dlen = req->dlen;

        /* Drop the old payload, then restore @hlen bytes of linear data. */
        pskb_trim_unique(skb, 0);
        __skb_put(skb, hlen);

        /* Only update truesize on input. */
        if (!hlen)
                skb->truesize += dlen;
        skb->data_len = dlen;
        skb->len += dlen;

        /* Hand over one destination page per fragment, PAGE_SIZE at most each. */
        do {
                skb_frag_t *frag;
                struct page *page;

                frag = skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags;
                page = sg_page(dsg);
                dsg = sg_next(dsg);

                len = PAGE_SIZE;
                if (dlen < len)
                        len = dlen;

                skb_frag_fill_page_desc(frag, page, 0, len);

                skb_shinfo(skb)->nr_frags++;
        } while ((dlen -= len));

        /* Destination pages beyond the produced output are not needed. */
        for (; dsg; dsg = sg_next(dsg))
                __free_page(sg_page(dsg));

out_free_req:
        acomp_request_free(req);
        return err;
}

/*
 * ipcomp_input_done2 - complete decompression of an inbound packet
 * @skb: packet being processed
 * @err: status of the decompression (or of request setup)
 *
 * The IPComp header pointer and the compressed payload length are sampled
 * before ipcomp_post_acomp() rebuilds the skb.
 *
 * Return: the error from ipcomp_post_acomp() if any; -EINVAL when the
 * decompressed length is smaller than the compressed payload plus the
 * IPComp header; otherwise the next-header value from the IPComp header.
 */
static int ipcomp_input_done2(struct sk_buff *skb, int err)
{
        struct ip_comp_hdr *ipch = ip_comp_hdr(skb);
        const int plen = skb->len;

        skb->transport_header = skb->network_header + sizeof(*ipch);

        err = ipcomp_post_acomp(skb, err, 0);
        if (err)
                return err;

        /*
         * Compare against the size of the header structure. Note that
         * ip_comp_hdr is the accessor function, so sizeof(ip_comp_hdr)
         * would not yield the header size.
         */
        if (skb->len < plen + sizeof(*ipch))
                return -EINVAL;

        return ipch->nexthdr;
}

/* Async completion for decompression: finish up and resume xfrm input. */
static void ipcomp_input_done(void *data, int err)
{
        struct sk_buff *skb = data;
        int nexthdr = ipcomp_input_done2(skb, err);

        xfrm_input_resume(skb, nexthdr);
}

/*
 * ipcomp_setup_req - prepare @skb and build an acomp request for it
 * @x:       xfrm state; x->data is the struct ipcomp_data with the transform
 * @skb:     packet to (de)compress
 * @minhead: linear head length below which the fast paths are not taken
 * @dlen:    requested limit for the output length
 *
 * The source scatterlist maps the first skb->len bytes of @skb (length
 * sampled on entry). The destination scatterlist consists of freshly
 * allocated pages, at most min(MAX_SKB_FRAGS, 16) of them; if fewer pages
 * than needed for @dlen could be allocated, the output limit is reduced
 * to what was obtained.
 *
 * The request pointer is stored in ipcomp_cb(skb)->req as soon as it is
 * allocated and is NOT freed here on later failures; the callers in this
 * file run ipcomp_post_acomp() on every path, which releases it.
 *
 * Return: the request, or an ERR_PTR() on failure.
 */
static struct acomp_req *ipcomp_setup_req(struct xfrm_state *x,
                                          struct sk_buff *skb, int minhead,
                                          int dlen)
{
        const int dnfrags = min(MAX_SKB_FRAGS, 16);
        struct ipcomp_data *ipcd = x->data;
        struct ipcomp_req_extra *extra;
        struct scatterlist *sg, *dsg;
        const int plen = skb->len;
        struct crypto_acomp *tfm;
        struct acomp_req *req;
        int nfrags;
        int total;
        int err;
        int i;

        ipcomp_cb(skb)->req = NULL;

        /* Determine nfrags, the number of source scatterlist entries. */
        do {
                struct sk_buff *trailer;

                /* Packets larger than a page are linearized: one entry. */
                if (skb->len > PAGE_SIZE) {
                        if (skb_linearize_cow(skb))
                                return ERR_PTR(-ENOMEM);
                        nfrags = 1;
                        break;
                }

                /* Unshared skb with enough linear head: use it as is. */
                if (!skb_cloned(skb) && skb_headlen(skb) >= minhead) {
                        if (!skb_is_nonlinear(skb)) {
                                nfrags = 1;
                                break;
                        } else if (!skb_has_frag_list(skb)) {
                                /* One entry for the head plus one per page frag. */
                                nfrags = skb_shinfo(skb)->nr_frags;
                                nfrags++;
                                break;
                        }
                }

                /* Everything else goes through skb_cow_data(). */
                nfrags = skb_cow_data(skb, skb_headlen(skb) < minhead ?
                                           minhead - skb_headlen(skb) : 0,
                                      &trailer);
                if (nfrags < 0)
                        return ERR_PTR(nfrags);
        } while (0);

        tfm = ipcd->tfm;
        /* The extra area holds both scatterlists: dnfrags dst + nfrags src. */
        req = acomp_request_alloc_extra(
                tfm, sizeof(*extra) + sizeof(*sg) * (nfrags + dnfrags),
                GFP_ATOMIC);
        ipcomp_cb(skb)->req = req;
        if (!req)
                return ERR_PTR(-ENOMEM);

        extra = acomp_request_extra(req);
        extra->x = x;

        /* Destination entries first, source entries right behind them. */
        dsg = extra->sg;
        sg = dsg + dnfrags;
        sg_init_table(sg, nfrags);
        err = skb_to_sgvec(skb, sg, 0, plen);
        if (unlikely(err < 0))
                return ERR_PTR(err);

        sg_init_table(dsg, dnfrags);
        total = 0;
        /* Allocate destination pages until @dlen is covered or we run out. */
        for (i = 0; i < dnfrags && total < dlen; i++) {
                struct page *page;

                page = alloc_page(GFP_ATOMIC);
                if (!page)
                        break;
                sg_set_page(dsg + i, page, PAGE_SIZE, 0);
                total += PAGE_SIZE;
        }
        /* Not even one destination page: give up. */
        if (!i)
                return ERR_PTR(-ENOMEM);
        /* Terminate the destination list after the last allocated page. */
        sg_mark_end(dsg + i - 1);
        dlen = min(dlen, total);

        acomp_request_set_params(req, sg, dsg, plen, dlen);

        return req;
}

/*
 * Decompress the payload of @skb. Returns -EINPROGRESS when the operation
 * completes asynchronously (ipcomp_input_done() then finishes the job),
 * otherwise the result of ipcomp_input_done2().
 */
static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
{
        struct acomp_req *req;
        int err;

        req = ipcomp_setup_req(x, skb, 0, IPCOMP_SCRATCH_SIZE);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
        } else {
                acomp_request_set_callback(req, 0, ipcomp_input_done, skb);
                err = crypto_acomp_decompress(req);
                if (err == -EINPROGRESS)
                        return err;
        }

        /* Synchronous completion and setup failure share the same tail. */
        return ipcomp_input_done2(skb, err);
}

int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
{
        struct ip_comp_hdr *ipch __maybe_unused;

        if (!pskb_may_pull(skb, sizeof(*ipch)))
                return -EINVAL;

        skb->ip_summed = CHECKSUM_NONE;

        /* Remove ipcomp header and decompress original payload */
        __skb_pull(skb, sizeof(*ipch));

        return ipcomp_decompress(x, skb);
}
EXPORT_SYMBOL_GPL(ipcomp_input);

/*
 * Push skb->data by the negated network offset and report success.
 * NOTE(review): presumably the network offset is negative at this point, so
 * this moves skb->data back to the network header -- confirm against the
 * xfrm output path.
 */
static int ipcomp_output_push(struct sk_buff *skb)
{
        skb_push(skb, -skb_network_offset(skb));
        return 0;
}

/*
 * Complete compression of an outbound packet. If compression (or request
 * setup) failed, the IPComp header is not installed and the packet goes out
 * as it was; the error is not propagated.
 */
static int ipcomp_output_done2(struct xfrm_state *x, struct sk_buff *skb,
                               int err)
{
        struct ip_comp_hdr *ipch;

        if (!ipcomp_post_acomp(skb, err, sizeof(*ipch))) {
                /* Install ipcomp header, convert into ipcomp datagram. */
                ipch = ip_comp_hdr(skb);
                ipch->nexthdr = *skb_mac_header(skb);
                ipch->flags = 0;
                ipch->cpi = htons((u16)ntohl(x->id.spi));
                *skb_mac_header(skb) = IPPROTO_COMP;
        }

        return ipcomp_output_push(skb);
}

/* Async completion for compression: finish up and resume xfrm output. */
static void ipcomp_output_done(void *data, int err)
{
        struct sk_buff *skb = data;
        struct ipcomp_req_extra *extra;
        struct xfrm_state *x;

        extra = acomp_request_extra(ipcomp_cb(skb)->req);
        /* Fetch the state before ipcomp_output_done2() releases the request. */
        x = extra->x;

        xfrm_output_resume(skb_to_full_sk(skb), skb,
                           ipcomp_output_done2(x, skb, err));
}

static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
{
        struct ip_comp_hdr *ipch __maybe_unused;
        struct acomp_req *req;
        int err;

        req = ipcomp_setup_req(x, skb, sizeof(*ipch),
                               skb->len - sizeof(*ipch));
        err = PTR_ERR(req);
        if (IS_ERR(req))
                goto out;

        acomp_request_set_callback(req, 0, ipcomp_output_done, skb);
        err = crypto_acomp_compress(req);
        if (err == -EINPROGRESS)
                return err;

out:
        return ipcomp_output_done2(x, skb, err);
}

int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
{
        struct ipcomp_data *ipcd = x->data;

        if (skb->len < ipcd->threshold) {
                /* Don't bother compressing */
                return ipcomp_output_push(skb);
        }

        return ipcomp_compress(x, skb);
}
EXPORT_SYMBOL_GPL(ipcomp_output);

/* Release the acomp transform held by @ipcd; @ipcd itself is not freed. */
static void ipcomp_free_data(struct ipcomp_data *ipcd)
{
        crypto_free_acomp(ipcd->tfm);
}

/* Tear down the IPComp data attached to @x, if there is any. */
void ipcomp_destroy(struct xfrm_state *x)
{
        struct ipcomp_data *ipcd = x->data;

        if (ipcd) {
                ipcomp_free_data(ipcd);
                kfree(ipcd);
        }
}
EXPORT_SYMBOL_GPL(ipcomp_destroy);

/*
 * ipcomp_init_state - set up the IPComp data of an xfrm state
 * @x:      state to initialize; on success x->data points to the new data
 * @extack: netlink extended ack used to report configuration errors
 *
 * Return: 0 on success, -EINVAL if no compression algorithm is configured
 * or encapsulation is requested, -ENOMEM if the data or the transform
 * could not be allocated.
 */
int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        struct xfrm_algo_desc *calg_desc;
        struct ipcomp_data *ipcd;

        if (!x->calg) {
                NL_SET_ERR_MSG(extack, "Missing required compression algorithm");
                return -EINVAL;
        }

        if (x->encap) {
                NL_SET_ERR_MSG(extack, "IPComp is not compatible with encapsulation");
                return -EINVAL;
        }

        ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL);
        if (!ipcd)
                return -ENOMEM;

        ipcd->tfm = crypto_alloc_acomp(x->calg->alg_name, 0, 0);
        if (IS_ERR(ipcd->tfm)) {
                /* Transform allocation failures are reported as -ENOMEM. */
                ipcomp_free_data(ipcd);
                kfree(ipcd);
                return -ENOMEM;
        }

        calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
        BUG_ON(!calg_desc);
        ipcd->threshold = calg_desc->uinfo.comp.threshold;
        x->data = ipcd;

        return 0;
}
EXPORT_SYMBOL_GPL(ipcomp_init_state);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");

























































  315 












  318 













   67 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/errno.h>

/*
 * Kernel pointers have redundant information, so we can use a
 * scheme where we can return either an error code or a normal
 * pointer with the same return value.
 *
 * This should be a per-architecture thing, to allow different
 * error and pointer decisions.
 */
#define MAX_ERRNO        4095

#ifndef __ASSEMBLY__

/**
 * IS_ERR_VALUE - Detect an error pointer.
 * @x: The pointer to check.
 *
 * Like IS_ERR(), but does not generate a compiler warning if result is unused.
 *
 * A value counts as an error when, taken as an unsigned long, it is greater
 * than or equal to (unsigned long)-MAX_ERRNO, i.e. it lies in the topmost
 * MAX_ERRNO values of the address range. The test is wrapped in unlikely().
 */
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)

/**
 * ERR_PTR - Create an error pointer.
 * @error: A negative error code.
 *
 * Encodes @error into a pointer value. Users should consider the result
 * opaque and not assume anything about how the error is encoded.
 *
 * Return: A pointer with @error encoded within its value.
 */
static inline void * __must_check ERR_PTR(long error)
{
        /* Plain conversion; @error is not range-checked here. */
        return (void *) error;
}

/* Return the pointer in the percpu address space. */
#define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error))

/*
 * Cast an error pointer to __iomem. The expansion is fully parenthesized so
 * that it behaves as a single operand in any surrounding expression.
 */
#define IOMEM_ERR_PTR(error) ((__force void __iomem *)ERR_PTR(error))

/**
 * PTR_ERR - Extract the error code from an error pointer.
 * @ptr: An error pointer.
 *
 * The pointer value is converted as is; no check is made that @ptr actually
 * encodes an error (use IS_ERR() for that).
 *
 * Return: The error code within @ptr.
 */
static inline long __must_check PTR_ERR(__force const void *ptr)
{
        return (long) ptr;
}

/* Read an error pointer from the percpu address space. */
#define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr)))

/**
 * IS_ERR - Detect an error pointer.
 * @ptr: The pointer to check.
 *
 * Note that a NULL pointer is not an error pointer; see IS_ERR_OR_NULL().
 *
 * Return: true if @ptr is an error pointer, false otherwise.
 */
static inline bool __must_check IS_ERR(__force const void *ptr)
{
        return IS_ERR_VALUE((unsigned long)ptr);
}

/* Check a pointer from the percpu address space for an encoded error. */
#define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr)))

/**
 * IS_ERR_OR_NULL - Detect an error pointer or a null pointer.
 * @ptr: The pointer to check.
 *
 * Like IS_ERR(), but also returns true for a null pointer.
 */
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
        /* NULL is handled first; it is not covered by IS_ERR_VALUE(). */
        if (unlikely(!ptr))
                return true;

        return IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
 * @ptr: The pointer to cast.
 *
 * Explicitly cast an error-valued pointer to another pointer type in such a
 * way as to make it clear that's what's going on.
 *
 * Return: @ptr as a non-const void pointer; the value itself is unchanged.
 */
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
        /* cast away the const */
        return (void *) ptr;
}

/**
 * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one.
 * @ptr: A potential error pointer.
 *
 * Convenience function that can be used inside a function that returns
 * an error code to propagate errors received as error pointers.
 * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces:
 *
 * .. code-block:: c
 *
 *        if (IS_ERR(ptr))
 *                return PTR_ERR(ptr);
 *        else
 *                return 0;
 *
 * Return: The error code within @ptr if it is an error pointer; 0 otherwise.
 */
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
        /* Error pointers yield their code, everything else yields 0. */
        return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
}

#endif

#endif /* _LINUX_ERR_H */































































































   12 

   12 

























   12 


















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
/*
 *  include/linux/ktime.h
 *
 *  ktime_t - nanosecond-resolution time format.
 *
 *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 *  data type definitions, declarations, prototypes and macros.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *
 *          Roman Zippel provided the ideas and primary code snippets of
 *          the ktime_t union and further simplifications of the original
 *          code.
 *
 *  For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_KTIME_H
#define _LINUX_KTIME_H

#include <asm/bug.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/types.h>

/**
 * ktime_set - Build a ktime_t value from a seconds/nanoseconds pair
 * @secs:        seconds part
 * @nsecs:        nanoseconds part
 *
 * Values of @secs at or above KTIME_SEC_MAX saturate to KTIME_MAX.
 *
 * Return: The ktime_t representation of the value.
 */
static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs)
{
        return unlikely(secs >= KTIME_SEC_MAX) ?
                KTIME_MAX : secs * NSEC_PER_SEC + (s64)nsecs;
}

/* Difference of two ktime_t values: res = a - b */
#define ktime_sub(a, b)        ((a) - (b))

/* Sum of two ktime_t values: res = a + b */
#define ktime_add(a, b)        ((a) + (b))

/*
 * Like ktime_add(), but the addition is carried out in u64 so that overflow
 * is not undefined behaviour; the caller has to check the result for
 * overflow itself.
 */
#define ktime_add_unsafe(a, b)        ((u64) (a) + (b))

/* Add a scalar nanosecond count to a ktime_t value: res = kt + ns */
#define ktime_add_ns(kt, ns)                ((kt) + (ns))

/* Subtract a scalar nanosecond count from a ktime_t value: res = kt - ns */
#define ktime_sub_ns(kt, ns)                ((kt) - (ns))

/*
 * Convert a timespec64 to ktime_t format by passing its tv_sec and tv_nsec
 * fields to ktime_set().
 */
static inline ktime_t timespec64_to_ktime(struct timespec64 ts)
{
        return ktime_set(ts.tv_sec, ts.tv_nsec);
}

/* Map the ktime_t to timespec64 conversion onto ns_to_timespec64() */
#define ktime_to_timespec64(kt)                ns_to_timespec64((kt))

/* Convert ktime_t to nanoseconds: the value is returned unchanged as an s64 */
static inline s64 ktime_to_ns(const ktime_t kt)
{
        return kt;
}

/**
 * ktime_compare - Three-way comparison of two ktime_t values
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return:
 *   -1 if cmp1  < cmp2
 *    0 if cmp1 == cmp2
 *    1 if cmp1  > cmp2
 */
static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
{
        /* Each comparison evaluates to 0 or 1, so the difference is -1, 0 or 1. */
        return (cmp1 > cmp2) - (cmp1 < cmp2);
}

/**
 * ktime_after - Check whether one ktime_t value is strictly greater than another.
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return: true if cmp1 happened after cmp2, false if it is equal or earlier.
 */
static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2)
{
        return cmp1 > cmp2;
}

/**
 * ktime_before - Check whether one ktime_t value is strictly smaller than another.
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return: true if cmp1 happened before cmp2, false if it is equal or later.
 */
static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2)
{
        return cmp1 < cmp2;
}

/*
 * ktime_divns - divide a ktime_t value by a nanosecond divisor.
 *
 * Two implementations are selected on BITS_PER_LONG.  Neither accepts a
 * negative @div: the BITS_PER_LONG < 64 variant calls BUG_ON(), the other
 * variant calls WARN_ON() and then performs the division anyway.
 */
#if BITS_PER_LONG < 64
extern s64 __ktime_divns(const ktime_t kt, s64 div);
static inline s64 ktime_divns(const ktime_t kt, s64 div)
{
        /*
         * Negative divisors could cause an inf loop,
         * so bug out here.
         */
        BUG_ON(div < 0);
        /*
         * Fast path: @div is a compile-time constant with no bits set above
         * bit 31.  Divide the magnitude with do_div(), whose result is then
         * read back from tmp, and re-apply the sign of @kt afterwards.
         */
        if (__builtin_constant_p(div) && !(div >> 32)) {
                s64 ns = kt;
                u64 tmp = ns < 0 ? -ns : ns;

                do_div(tmp, div);
                return ns < 0 ? -tmp : tmp;
        } else {
                /* Everything else goes to the out-of-line helper. */
                return __ktime_divns(kt, div);
        }
}
#else /* BITS_PER_LONG < 64 */
static inline s64 ktime_divns(const ktime_t kt, s64 div)
{
        /*
         * 32-bit implementation cannot handle negative divisors,
         * so catch them on 64bit as well.
         */
        WARN_ON(div < 0);
        return kt / div;
}
#endif

/* Convert ktime_t to microseconds: @kt divided by NSEC_PER_USEC via ktime_divns() */
static inline s64 ktime_to_us(const ktime_t kt)
{
        return ktime_divns(kt, NSEC_PER_USEC);
}

/* Convert ktime_t to milliseconds: @kt divided by NSEC_PER_MSEC via ktime_divns() */
static inline s64 ktime_to_ms(const ktime_t kt)
{
        return ktime_divns(kt, NSEC_PER_MSEC);
}

/* Return (@later - @earlier) converted to microseconds with ktime_to_us() */
static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier)
{
       return ktime_to_us(ktime_sub(later, earlier));
}

/* Return (@later - @earlier) converted to milliseconds with ktime_to_ms() */
static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier)
{
        return ktime_to_ms(ktime_sub(later, earlier));
}

/*
 * Add @usec microseconds to @kt.  The microsecond count is scaled by
 * NSEC_PER_USEC before the addition; no overflow check is performed here.
 */
static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec)
{
        return ktime_add_ns(kt, usec * NSEC_PER_USEC);
}

/*
 * Add @msec milliseconds to @kt.  The millisecond count is scaled by
 * NSEC_PER_MSEC before the addition; no overflow check is performed here.
 */
static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec)
{
        return ktime_add_ns(kt, msec * NSEC_PER_MSEC);
}

/*
 * Subtract @usec microseconds from @kt.  The microsecond count is scaled by
 * NSEC_PER_USEC before the subtraction; no overflow check is performed here.
 */
static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec)
{
        return ktime_sub_ns(kt, usec * NSEC_PER_USEC);
}

/*
 * Subtract @msec milliseconds from @kt.  The millisecond count is scaled by
 * NSEC_PER_MSEC before the subtraction; no overflow check is performed here.
 */
static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec)
{
        return ktime_sub_ns(kt, msec * NSEC_PER_MSEC);
}

extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs);

/**
 * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64
 *                            format only if the variable contains data
 * @kt:                the ktime_t variable to convert
 * @ts:                the timespec64 variable to store the result in
 *
 * @ts is left untouched when @kt is 0.
 *
 * Return: %true if there was a successful conversion, %false if kt was 0.
 */
static inline __must_check bool ktime_to_timespec64_cond(const ktime_t kt,
                                                       struct timespec64 *ts)
{
        /* A zero value means "no data": report that without writing @ts. */
        if (!kt)
                return false;

        *ts = ktime_to_timespec64(kt);
        return true;
}

#include <vdso/ktime.h>

/* Convert a nanosecond count to ktime_t: @ns is returned as-is, converted to ktime_t */
static inline ktime_t ns_to_ktime(u64 ns)
{
        return ns;
}

/* Convert a microsecond count to ktime_t by scaling with NSEC_PER_USEC (no overflow check) */
static inline ktime_t us_to_ktime(u64 us)
{
        return us * NSEC_PER_USEC;
}

/* Convert a millisecond count to ktime_t by scaling with NSEC_PER_MSEC (no overflow check) */
static inline ktime_t ms_to_ktime(u64 ms)
{
        return ms * NSEC_PER_MSEC;
}

# include <linux/timekeeping.h>

#endif



























































































































































  202 
  202 

















































































































  319 











  318 












  318 

  316 



  316 


  318 







  319 








  319 


  316 
  319 
  319 





  317 


  317 

  315 
  319 

  319 
  313 




  318 
  313 


























  313 






  316 

  315 



  315 
  315 


  319 





































































   50 


















  319 





  317 







  315 



  319 

  319 






  305 






  317 



  316 

  319 
  319 


  318 




























  303 
  302 




  301 
  303 





  304 

  305 





  301 

  197 

  300 










  299 

  298 


















































































































































































































  303 










  300 
   42 
  303 
  305 


   42 

  301 


   42 
  301 




  302 





  300 














  311 
    1 
  316 
  314 
  310 













































  312 






    3 





































































































































































































































  315 





  317 







  316 




























  315 



    3 
  319 


    3 



    3 


  312 


    3 













  253 







    3 
  249 



























  251 

  251 



    3 


  267 













  252 
  253 









  317 




  319 
   15 
   15 











   15 
  319 









  316 




  319 





  319 





  313 




  319 

  318 

  319 
  319 

  319 

  253 
  319 

  249 



  254 





  254 


  254 




























    1 







    1 
    1 





    1 
























    3 


    3 



    1 











    1 

   53 


    1 
    1 



   54 




    1 


    1 





    1 


    1 


    1 


    1 







    1 










    2 







   52 











    1 












   54 
   54 



   50 

   54 






    1 

   54 

    1 
    3 






































































































































































































































































  318 












  320 




  319 
  317 


  315 

    6 

    6 

    6 



    3 




  311 




  317 









  317 



  318 








  320 





  315 

    3 



    3 


  318 


  313 
  319 

















































































































































































































































  310 





















  245 



















  245 


  243 










































   11 



   11 


   11 

   11 








   11 




   11 




   14 










    5 
   14 





















   14 




   14 

    2 














   14 









   14 











   12 

   14 


   14 
   11 
    5 








   11 













   11 




   11 


   11 










   14 





   13 





   14 


   14 









    1 







   14 












    2 







   14 




   14 

   14 








   14 

    1 


    1 
   14 
   14 









  234 







  227 
  234 



  235 



  235 
  234 


  233 

  234 






  227 


  226 



  231 


  233 



  232 


  235 


  222 






  315 




  314 




  234 

  313 



  314 



  316 



  302 
  319 


  302 



  312 






    4 



































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NETLINK      Kernel-user communication protocol.
 *
 *                 Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                                 Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *                                 Patrick McHardy <kaber@trash.net>
 *
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 *                                  use nlk_sk, as sk->protinfo is on a diet 8)
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 *                                  - inc module use count of module that owns
 *                                    the kernel socket in case userspace opens
 *                                    socket of same protocol
 *                                  - remove all module support, since netlink is
 *                                    mandatory if CONFIG_NET=y these days
 */

#include <linux/module.h>

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/filter.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>
#include <linux/rhashtable.h>
#include <asm/cacheflush.h>
#include <linux/hash.h>
#include <linux/net_namespace.h>
#include <linux/nospec.h>
#include <linux/btf_ids.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>
#define CREATE_TRACE_POINTS
#include <trace/events/netlink.h>

#include "af_netlink.h"
#include "genetlink.h"

/*
 * Variable-length listener record: an rcu_head followed by a flexible
 * array of unsigned long words.
 * NOTE(review): masks[] presumably is a bitmap of multicast groups that
 * have listeners, and rcu is used for deferred freeing - confirm against
 * the users of this struct.
 */
struct listeners {
        struct rcu_head                rcu;
        unsigned long                masks[];
};

/*
 * state bits
 * NOTE(review): the value 0x0 is presumably a bit number, not a mask -
 * confirm at the use sites.
 */
#define NETLINK_S_CONGESTED                0x0

/* Return the result of testing the KERNEL_SOCKET bit on @sk via nlk_test_bit(). */
static inline int netlink_is_kernel(struct sock *sk)
{
        return nlk_test_bit(KERNEL_SOCKET, sk);
}

/* Global netlink table pointer; exported to GPL modules. */
struct netlink_table *nl_table __read_mostly;
EXPORT_SYMBOL_GPL(nl_table);

/* Wait queue associated with nl_table (NOTE(review): presumably for waiters on table users - confirm). */
static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

/* One lock class key per slot, MAX_LINKS slots in total. */
static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];

/*
 * Names for the cb_mutex slots, one string per index.  Slots without a
 * symbolic name carry their numeric index ("-1", "-17", "-23" ... "-31");
 * the last of the 33 entries (32 slots plus one) is labelled "MAX_LINKS".
 * NOTE(review): presumably paired with nlk_cb_mutex_keys[] as lock class
 * names, indexed by netlink protocol number - confirm at the use site.
 */
static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
        "nlk_cb_mutex-ROUTE",
        "nlk_cb_mutex-1",
        "nlk_cb_mutex-USERSOCK",
        "nlk_cb_mutex-FIREWALL",
        "nlk_cb_mutex-SOCK_DIAG",
        "nlk_cb_mutex-NFLOG",
        "nlk_cb_mutex-XFRM",
        "nlk_cb_mutex-SELINUX",
        "nlk_cb_mutex-ISCSI",
        "nlk_cb_mutex-AUDIT",
        "nlk_cb_mutex-FIB_LOOKUP",
        "nlk_cb_mutex-CONNECTOR",
        "nlk_cb_mutex-NETFILTER",
        "nlk_cb_mutex-IP6_FW",
        "nlk_cb_mutex-DNRTMSG",
        "nlk_cb_mutex-KOBJECT_UEVENT",
        "nlk_cb_mutex-GENERIC",
        "nlk_cb_mutex-17",
        "nlk_cb_mutex-SCSITRANSPORT",
        "nlk_cb_mutex-ECRYPTFS",
        "nlk_cb_mutex-RDMA",
        "nlk_cb_mutex-CRYPTO",
        "nlk_cb_mutex-SMC",
        "nlk_cb_mutex-23",
        "nlk_cb_mutex-24",
        "nlk_cb_mutex-25",
        "nlk_cb_mutex-26",
        "nlk_cb_mutex-27",
        "nlk_cb_mutex-28",
        "nlk_cb_mutex-29",
        "nlk_cb_mutex-30",
        "nlk_cb_mutex-31",
        "nlk_cb_mutex-MAX_LINKS"
};

static int netlink_dump(struct sock *sk, bool lock_taken);

/* nl_table locking explained:
 * Lookup and traversal are protected with an RCU read-side lock. Insertion
 * and removal are protected with per bucket lock while using RCU list
 * modification primitives and may run in parallel to RCU protected lookups.
 * Destruction of the Netlink socket may only occur *after* nl_table_lock has
 * been acquired, either during or after the socket has been removed from
 * the list and after an RCU grace period.
 */
/* Reader/writer lock for nl_table; exported to GPL modules. */
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
/* Atomic counter starting at 0 (NOTE(review): presumably the number of current nl_table users - confirm). */
static atomic_t nl_table_users = ATOMIC_INIT(0);

/*
 * Dereference an RCU-protected pointer @X under the claim that
 * nl_table_lock is held (checked through lockdep_is_held()).
 *
 * The expansion deliberately carries no trailing semicolon so the macro is
 * a plain expression: it can be used as an initializer, an operand or a
 * statement, with the caller supplying the terminating ';'.
 */
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock))

/* Head of the file-local blocking notifier chain for netlink. */
static BLOCKING_NOTIFIER_HEAD(netlink_chain);


static const struct rhashtable_params netlink_rhashtable_params;

/*
 * Exported out-of-line wrapper that fires the netlink_extack tracepoint
 * with @msg.
 */
void do_trace_netlink_extack(const char *msg)
{
        trace_netlink_extack(msg);
}
EXPORT_SYMBOL(do_trace_netlink_extack);

/*
 * Translate a 1-based group number into its single-bit mask.
 *
 * Groups 1..32 map to bits 0..31.  Group 0 and any group above 32 have no
 * bit and yield 0.
 */
static inline u32 netlink_group_mask(u32 group)
{
        if (group == 0 || group > 32)
                return 0;

        /*
         * Shift an unsigned one: for group == 32 a signed "1 << 31" would
         * shift into the sign bit of int, which is undefined behaviour.
         */
        return 1U << (group - 1);
}

/*
 * Allocate a fresh skb with alloc_skb() and copy @skb into it: skb->len
 * bytes of data starting at skb->data, plus the portid, dst_group and creds
 * fields of the netlink control block.
 *
 * Returns the new skb, or NULL if the allocation fails.
 */
static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
                                           gfp_t gfp_mask)
{
        unsigned int payload_len = skb->len;
        struct sk_buff *copy = alloc_skb(payload_len, gfp_mask);

        if (!copy)
                return NULL;

        /* Carry over the netlink control-block fields. */
        NETLINK_CB(copy).portid = NETLINK_CB(skb).portid;
        NETLINK_CB(copy).dst_group = NETLINK_CB(skb).dst_group;
        NETLINK_CB(copy).creds = NETLINK_CB(skb).creds;

        skb_put_data(copy, skb->data, payload_len);

        return copy;
}

/* Id used with net_generic() to look up the per-namespace netlink_tap_net. */
static unsigned int netlink_tap_net_id;

/* Per-network-namespace tap state. */
struct netlink_tap_net {
        /* List of registered taps; entries are added/removed with the RCU list primitives. */
        struct list_head netlink_tap_all;
        /* Held around additions to and removals from netlink_tap_all. */
        struct mutex netlink_tap_lock;
};

/*
 * Register the tap @nt in the tap list of the network namespace that owns
 * nt->dev and take a reference on nt->module.
 *
 * Returns 0 on success, or -EINVAL if nt->dev is not of type
 * ARPHRD_NETLINK.
 */
int netlink_add_tap(struct netlink_tap *nt)
{
        struct netlink_tap_net *tap_net = net_generic(dev_net(nt->dev),
                                                      netlink_tap_net_id);

        /* Only netlink-type devices may act as a tap. */
        if (unlikely(nt->dev->type != ARPHRD_NETLINK))
                return -EINVAL;

        mutex_lock(&tap_net->netlink_tap_lock);
        list_add_rcu(&nt->list, &tap_net->netlink_tap_all);
        mutex_unlock(&tap_net->netlink_tap_lock);

        __module_get(nt->module);

        return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

/*
 * Unlink the tap @nt from the tap list of the namespace that owns nt->dev.
 *
 * The list is searched under netlink_tap_lock.  When @nt is found it is
 * removed with list_del_rcu() and the reference on nt->module is dropped
 * after the lock has been released; otherwise a warning is logged.
 *
 * Returns 0 if the tap was found and removed, -ENODEV otherwise.
 */
static int __netlink_remove_tap(struct netlink_tap *nt)
{
        struct netlink_tap_net *tap_net = net_generic(dev_net(nt->dev),
                                                      netlink_tap_net_id);
        struct netlink_tap *cur;
        int err = -ENODEV;

        mutex_lock(&tap_net->netlink_tap_lock);

        list_for_each_entry(cur, &tap_net->netlink_tap_all, list) {
                if (cur != nt)
                        continue;

                list_del_rcu(&nt->list);
                err = 0;
                break;
        }

        if (err)
                pr_warn("__netlink_remove_tap: %p not found\n", nt);

        mutex_unlock(&tap_net->netlink_tap_lock);

        if (!err)
                module_put(nt->module);

        return err;
}

/**
 * netlink_remove_tap - unregister a netlink tap
 * @nt: tap to remove
 *
 * Removes @nt via __netlink_remove_tap() and then calls synchronize_net(),
 * whether or not the tap was found.
 *
 * Return: the result of __netlink_remove_tap().
 */
int netlink_remove_tap(struct netlink_tap *nt)
{
        int err = __netlink_remove_tap(nt);

        synchronize_net();
        return err;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);

/* Pernet init hook: set up the tap list and its mutex for @net. */
static __net_init int netlink_tap_init_net(struct net *net)
{
        struct netlink_tap_net *tap_net;

        tap_net = net_generic(net, netlink_tap_net_id);
        mutex_init(&tap_net->netlink_tap_lock);
        INIT_LIST_HEAD(&tap_net->netlink_tap_all);

        return 0;
}

/*
 * Pernet operations for the tap state: the init hook, the net_generic() id
 * and the size of the per-namespace data (struct netlink_tap_net).
 */
static struct pernet_operations netlink_tap_net_ops = {
        .init = netlink_tap_init_net,
        .id   = &netlink_tap_net_id,
        .size = sizeof(struct netlink_tap_net),
};

/*
 * Decide whether @skb may be mirrored to tap devices.
 *
 * Only messages whose owning socket uses one of the explicitly listed
 * protocols pass; every other protocol is rejected.
 */
static bool netlink_filter_tap(const struct sk_buff *skb)
{
        bool allowed;

        /* Conservative approach: whitelist the protocols that may pass. */
        switch (skb->sk->sk_protocol) {
        case NETLINK_ROUTE:
        case NETLINK_USERSOCK:
        case NETLINK_SOCK_DIAG:
        case NETLINK_NFLOG:
        case NETLINK_XFRM:
        case NETLINK_FIB_LOOKUP:
        case NETLINK_NETFILTER:
        case NETLINK_GENERIC:
                allowed = true;
                break;
        default:
                allowed = false;
                break;
        }

        return allowed;
}

static int __netlink_deliver_tap_skb(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct sk_buff *nskb;
        struct sock *sk = skb->sk;
        int ret = -ENOMEM;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                return 0;

        dev_hold(dev);

        if (is_vmalloc_addr(skb->head))
                nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
        else
                nskb = skb_clone(skb, GFP_ATOMIC);
        if (nskb) {
                nskb->dev = dev;
                nskb->protocol = htons((u16) sk->sk_protocol);
                nskb->pkt_type = netlink_is_kernel(sk) ?
                                 PACKET_KERNEL : PACKET_USER;
                skb_reset_network_header(nskb);
                ret = dev_queue_xmit(nskb);
                if (unlikely(ret > 0))
                        ret = net_xmit_errno(ret);
        }

        dev_put(dev);
        return ret;
}

/*
 * Mirror @skb to every tap registered in @nn, provided the message passes
 * netlink_filter_tap().  The walk stops at the first tap whose delivery
 * returns a non-zero result.  The list is traversed with the RCU iterator.
 */
static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
{
        struct netlink_tap *tap;

        if (!netlink_filter_tap(skb))
                return;

        list_for_each_entry_rcu(tap, &nn->netlink_tap_all, list) {
                if (unlikely(__netlink_deliver_tap_skb(skb, tap->dev)))
                        break;
        }
}

/*
 * Mirror @skb to the taps of @net, if any are registered.  The tap list is
 * inspected and walked inside an RCU read-side critical section.
 */
static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
{
        struct netlink_tap_net *tap_net;

        tap_net = net_generic(net, netlink_tap_net_id);

        rcu_read_lock();
        if (unlikely(!list_empty(&tap_net->netlink_tap_all)))
                __netlink_deliver_tap(skb, tap_net);
        rcu_read_unlock();
}

/*
 * Mirror @skb to the taps of @dst's namespace unless both endpoints are
 * kernel sockets.
 */
static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
                                       struct sk_buff *skb)
{
        if (netlink_is_kernel(dst) && netlink_is_kernel(src))
                return;

        netlink_deliver_tap(sock_net(dst), skb);
}

/*
 * Account for a message that could not be queued on @sk.
 *
 * Unless RECV_NO_ENOBUFS is set on the socket, the first overrun (the one
 * that sets NETLINK_S_CONGESTED) records ENOBUFS in sk_err and reports it.
 * The drop counter is incremented in every case.
 */
static void netlink_overrun(struct sock *sk)
{
        if (!nlk_test_bit(RECV_NO_ENOBUFS, sk) &&
            !test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) {
                WRITE_ONCE(sk->sk_err, ENOBUFS);
                sk_error_report(sk);
        }

        sk_drops_inc(sk);
}

/*
 * Clear the congestion flag once the receive queue of @sk is empty and wake
 * the waiters on nlk->wait if the socket is not (or no longer) congested.
 */
static void netlink_rcv_wake(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (skb_queue_empty_lockless(&sk->sk_receive_queue))
                clear_bit(NETLINK_S_CONGESTED, &nlk->state);

        if (test_bit(NETLINK_S_CONGESTED, &nlk->state))
                return;

        wake_up_interruptible(&nlk->wait);
}

static void netlink_skb_destructor(struct sk_buff *skb)
{
        if (is_vmalloc_addr(skb->head)) {
                if (!skb->cloned ||
                    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
                        vfree_atomic(skb->head);

                skb->head = NULL;
        }
        if (skb->sk != NULL)
                sock_rfree(skb);
}

/*
 * Make @sk the owner of @skb: install the netlink destructor and charge
 * skb->truesize to the socket.  Warns if the skb already has an owner.
 * sk_rmem_alloc is not touched here.
 */
static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        WARN_ON(skb->sk != NULL);

        skb->destructor = netlink_skb_destructor;
        skb->sk = sk;

        sk_mem_charge(sk, skb->truesize);
}

/*
 * sk_destruct hook for netlink sockets.
 *
 * Purges the receive queue.  A socket that is not flagged SOCK_DEAD is only
 * reported; for a dead socket, leftover receive/write memory accounting or a
 * still-present groups array triggers a warning.
 */
static void netlink_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_receive_queue);

        if (sock_flag(sk, SOCK_DEAD)) {
                WARN_ON(atomic_read(&sk->sk_rmem_alloc));
                WARN_ON(refcount_read(&sk->sk_wmem_alloc));
                WARN_ON(nlk_sk(sk)->groups);
                return;
        }

        printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines.
 */

/*
 * Take nl_table_lock for writing and wait until nl_table_users has dropped
 * to zero.  May sleep.  Returns with the write lock held (write_lock_irq());
 * release it with netlink_table_ungrab().
 */
void netlink_table_grab(void)
        __acquires(nl_table_lock)
{
        might_sleep();

        write_lock_irq(&nl_table_lock);

        if (atomic_read(&nl_table_users)) {
                DECLARE_WAITQUEUE(wait, current);

                /* Wait on nl_table_wait as an exclusive waiter. */
                add_wait_queue_exclusive(&nl_table_wait, &wait);
                for (;;) {
                        /* Set the task state before re-checking the counter. */
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&nl_table_users) == 0)
                                break;
                        /* Drop the lock while asleep, retake it afterwards. */
                        write_unlock_irq(&nl_table_lock);
                        schedule();
                        write_lock_irq(&nl_table_lock);
                }

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&nl_table_wait, &wait);
        }
}

/*
 * Counterpart of netlink_table_grab(): release the write lock on
 * nl_table_lock, then wake waiters on nl_table_wait.
 */
void netlink_table_ungrab(void)
        __releases(nl_table_lock)
{
        write_unlock_irq(&nl_table_lock);
        wake_up(&nl_table_wait);
}

/*
 * Register the caller as a table user: increment nl_table_users while
 * holding nl_table_lock for reading.  The read lock is released again before
 * returning; only the counter stays elevated until netlink_unlock_table().
 */
static inline void
netlink_lock_table(void)
{
        unsigned long flags;

        /* read_lock() synchronizes us to netlink_table_grab */

        read_lock_irqsave(&nl_table_lock, flags);
        atomic_inc(&nl_table_users);
        read_unlock_irqrestore(&nl_table_lock, flags);
}

/*
 * Drop a table-user count taken with netlink_lock_table().  When the count
 * reaches zero, waiters on nl_table_wait are woken.
 */
static inline void
netlink_unlock_table(void)
{
        if (!atomic_dec_and_test(&nl_table_users))
                return;

        wake_up(&nl_table_wait);
}

/* Lookup key for the socket hash table: network namespace plus portid. */
struct netlink_compare_arg
{
        possible_net_t pnet;
        u32 portid;
};

/* Doing sizeof directly may yield 4 extra bytes on 64-bit. */
/* Key length: everything up to and including portid, without tail padding. */
#define netlink_compare_arg_len \
        (offsetof(struct netlink_compare_arg, portid) + sizeof(u32))

/*
 * Hash table compare callback.
 *
 * @arg->key is a struct netlink_compare_arg, @ptr the candidate netlink
 * socket.  Returns 0 when both portid and network namespace match, non-zero
 * otherwise.
 */
static inline int netlink_compare(struct rhashtable_compare_arg *arg,
                                  const void *ptr)
{
        const struct netlink_compare_arg *key = arg->key;
        const struct netlink_sock *candidate = ptr;

        if (candidate->portid != key->portid)
                return 1;

        return !net_eq(sock_net(&candidate->sk), read_pnet(&key->pnet));
}

/*
 * Fill in a lookup key for @net and @portid.  The whole structure is zeroed
 * first, so every byte not covered by the two fields is 0.
 * NOTE(review): presumably required because the key is hashed as raw bytes -
 * confirm against netlink_rhashtable_params.
 */
static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
                                     struct net *net, u32 portid)
{
        memset(arg, 0, sizeof(*arg));
        write_pnet(&arg->pnet, net);
        arg->portid = portid;
}

/*
 * Look up the socket bound to (@net, @portid) in @table.  Returns the result
 * of rhashtable_lookup_fast(); no reference is taken here.
 */
static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
                                     struct net *net)
{
        struct netlink_compare_arg key;

        netlink_compare_arg_init(&key, net, portid);

        return rhashtable_lookup_fast(&table->hash, &key,
                                      netlink_rhashtable_params);
}

static int __netlink_insert(struct netlink_table *table, struct sock *sk)
{
        struct netlink_compare_arg arg;

        netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
        return rhashtable_lookup_insert_key(&table->hash, &arg,
                                            &nlk_sk(sk)->node,
                                            netlink_rhashtable_params);
}

/*
 * Find the socket bound to @portid for @protocol in @net.
 *
 * The lookup runs under rcu_read_lock(); a reference is taken on the socket
 * before the lock is dropped.  Returns the socket (caller must sock_put()
 * it) or NULL.
 */
static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
        struct sock *found;

        rcu_read_lock();
        found = __netlink_lookup(&nl_table[protocol], portid, net);
        if (found)
                sock_hold(found);
        rcu_read_unlock();

        return found;
}

/* Forward declaration so __netlink_create() can reference the ops table. */
static const struct proto_ops netlink_ops;

/*
 * Recompute the listeners bitmap of the protocol @sk belongs to.
 *
 * For every word of the protocol's group bitmap, the group words of all
 * sockets on the table's mc_list are OR-ed together and stored in
 * listeners->masks[].  Does nothing if the table has no listeners structure.
 */
static void
netlink_update_listeners(struct sock *sk)
{
        struct netlink_table *tbl = &nl_table[sk->sk_protocol];
        struct listeners *listeners = nl_deref_protected(tbl->listeners);
        struct sock *member;
        unsigned int word;

        if (!listeners)
                return;

        for (word = 0; word < NLGRPLONGS(tbl->groups); word++) {
                unsigned long mask = 0;

                sk_for_each_bound(member, &tbl->mc_list) {
                        struct netlink_sock *nlk = nlk_sk(member);

                        /* Skip sockets whose bitmap is shorter than this word. */
                        if (word < NLGRPLONGS(nlk->ngroups))
                                mask |= nlk->groups[word];
                }
                listeners->masks[word] = mask;
        }
        /* this function is only called with the netlink table "grabbed", which
         * makes sure updates are visible before bind or setsockopt return. */
}

/*
 * Bind @sk to @portid and add it to its protocol's hash table.
 *
 * Runs under lock_sock().  If the socket is already bound nothing is
 * inserted: the result is 0 when the existing portid equals @portid and
 * -EBUSY otherwise.  On a successful insert the reference taken with
 * sock_hold() is kept, and nlk->bound is published after an smp_wmb().
 *
 * Return: 0 on success, -EADDRINUSE if the hash table reports -EEXIST,
 * -EOVERFLOW if it reports -EBUSY, or any other error from
 * __netlink_insert().
 */
static int netlink_insert(struct sock *sk, u32 portid)
{
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        int err;

        lock_sock(sk);

        /* Result for the already-bound case; overwritten below otherwise. */
        err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY;
        if (nlk_sk(sk)->bound)
                goto err;

        /* portid can be read locklessly from netlink_getname(). */
        WRITE_ONCE(nlk_sk(sk)->portid, portid);

        /* Reference for the hash table entry; dropped again if insert fails. */
        sock_hold(sk);

        err = __netlink_insert(table, sk);
        if (err) {
                /* In case the hashtable backend returns with -EBUSY
                 * from here, it must not escape to the caller.
                 */
                if (unlikely(err == -EBUSY))
                        err = -EOVERFLOW;
                if (err == -EEXIST)
                        err = -EADDRINUSE;
                sock_put(sk);
                goto err;
        }

        /* We need to ensure that the socket is hashed and visible. */
        smp_wmb();
        /* Paired with lockless reads from netlink_bind(),
         * netlink_connect() and netlink_sendmsg().
         */
        WRITE_ONCE(nlk_sk(sk)->bound, portid);

err:
        release_sock(sk);
        return err;
}

static void netlink_remove(struct sock *sk)
{
        struct netlink_table *table;

        table = &nl_table[sk->sk_protocol];
        if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
                                    netlink_rhashtable_params)) {
                WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
                __sock_put(sk);
        }

        netlink_table_grab();
        if (nlk_sk(sk)->subscriptions) {
                __sk_del_bind_node(sk);
                netlink_update_listeners(sk);
        }
        if (sk->sk_protocol == NETLINK_GENERIC)
                atomic_inc(&genl_sk_destructing_cnt);
        netlink_table_ungrab();
}

/*
 * struct proto for netlink sockets; obj_size makes sk_alloc() callers get a
 * struct netlink_sock sized object.
 */
static struct proto netlink_proto = {
        .name          = "NETLINK",
        .owner          = THIS_MODULE,
        .obj_size = sizeof(struct netlink_sock),
};

static int __netlink_create(struct net *net, struct socket *sock,
                            int protocol, int kern)
{
        struct sock *sk;
        struct netlink_sock *nlk;

        sock->ops = &netlink_ops;

        sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
        if (!sk)
                return -ENOMEM;

        sock_init_data(sock, sk);

        nlk = nlk_sk(sk);
        mutex_init(&nlk->nl_cb_mutex);
        lockdep_set_class_and_name(&nlk->nl_cb_mutex,
                                           nlk_cb_mutex_keys + protocol,
                                           nlk_cb_mutex_key_strings[protocol]);
        init_waitqueue_head(&nlk->wait);

        sk->sk_destruct = netlink_sock_destruct;
        sk->sk_protocol = protocol;
        return 0;
}

static int netlink_create(struct net *net, struct socket *sock, int protocol,
                          int kern)
{
        struct module *module = NULL;
        struct netlink_sock *nlk;
        int (*bind)(struct net *net, int group);
        void (*unbind)(struct net *net, int group);
        void (*release)(struct sock *sock, unsigned long *groups);
        int err = 0;

        sock->state = SS_UNCONNECTED;

        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
                return -ESOCKTNOSUPPORT;

        if (protocol < 0 || protocol >= MAX_LINKS)
                return -EPROTONOSUPPORT;
        protocol = array_index_nospec(protocol, MAX_LINKS);

        netlink_lock_table();
#ifdef CONFIG_MODULES
        if (!nl_table[protocol].registered) {
                netlink_unlock_table();
                request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
                netlink_lock_table();
        }
#endif
        if (nl_table[protocol].registered &&
            try_module_get(nl_table[protocol].module))
                module = nl_table[protocol].module;
        else
                err = -EPROTONOSUPPORT;
        bind = nl_table[protocol].bind;
        unbind = nl_table[protocol].unbind;
        release = nl_table[protocol].release;
        netlink_unlock_table();

        if (err < 0)
                goto out;

        err = __netlink_create(net, sock, protocol, kern);
        if (err < 0)
                goto out_module;

        sock_prot_inuse_add(net, &netlink_proto, 1);

        nlk = nlk_sk(sock->sk);
        nlk->module = module;
        nlk->netlink_bind = bind;
        nlk->netlink_unbind = unbind;
        nlk->netlink_release = release;
out:
        return err;

out_module:
        module_put(module);
        goto out;
}

/*
 * RCU callback queued by netlink_release(): frees the groups bitmap, drops
 * one socket reference and frees the socket if that was the last one.
 */
static void deferred_put_nlk_sk(struct rcu_head *head)
{
        struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
        struct sock *sk = &nlk->sk;

        kfree(nlk->groups);
        nlk->groups = NULL;

        if (refcount_dec_and_test(&sk->sk_refcnt))
                sk_free(sk);
}

/*
 * Release a netlink socket.
 *
 * Unhashes and orphans the socket, runs the protocol's release and unbind
 * callbacks, sends a NETLINK_URELEASE notification for bound sockets,
 * terminates a running dump, drops the module references and, for kernel
 * sockets, decrements the protocol's registration count (clearing the table
 * entry when it reaches zero).  A socket reference is dropped later from an
 * RCU callback, deferred_put_nlk_sk().
 *
 * Returns 0 in all cases, including when @sock has no sk attached.
 */
static int netlink_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk;

        if (!sk)
                return 0;

        netlink_remove(sk);
        sock_orphan(sk);
        nlk = nlk_sk(sk);

        /*
         * OK. Socket is unlinked, any packets that arrive now
         * will be purged.
         */
        if (nlk->netlink_release)
                nlk->netlink_release(sk, nlk->groups);

        /* must not acquire netlink_table_lock in any way again before unbind
         * and notifying genetlink is done as otherwise it might deadlock
         */
        if (nlk->netlink_unbind) {
                int i;

                /* Bit i of the bitmap corresponds to group number i + 1. */
                for (i = 0; i < nlk->ngroups; i++)
                        if (test_bit(i, nlk->groups))
                                nlk->netlink_unbind(sock_net(sk), i + 1);
        }
        /* Balances the increment done in netlink_remove(). */
        if (sk->sk_protocol == NETLINK_GENERIC &&
            atomic_dec_return(&genl_sk_destructing_cnt) == 0)
                wake_up(&genl_sk_destructing_waitq);

        sock->sk = NULL;
        wake_up_interruptible_all(&nlk->wait);

        skb_queue_purge(&sk->sk_write_queue);

        /* Notify the netlink_chain listeners that this portid went away. */
        if (nlk->portid && nlk->bound) {
                struct netlink_notify n = {
                                                .net = sock_net(sk),
                                                .protocol = sk->sk_protocol,
                                                .portid = nlk->portid,
                                          };
                blocking_notifier_call_chain(&netlink_chain,
                                NETLINK_URELEASE, &n);
        }

        /* Terminate any outstanding dump */
        if (nlk->cb_running) {
                if (nlk->cb.done)
                        nlk->cb.done(&nlk->cb);
                module_put(nlk->cb.module);
                kfree_skb(nlk->cb.skb);
                WRITE_ONCE(nlk->cb_running, false);
        }

        module_put(nlk->module);

        if (netlink_is_kernel(sk)) {
                netlink_table_grab();
                BUG_ON(nl_table[sk->sk_protocol].registered == 0);
                /* Last kernel socket for this protocol: reset the table entry. */
                if (--nl_table[sk->sk_protocol].registered == 0) {
                        struct listeners *old;

                        old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
                        RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
                        kfree_rcu(old, rcu);
                        nl_table[sk->sk_protocol].module = NULL;
                        nl_table[sk->sk_protocol].bind = NULL;
                        nl_table[sk->sk_protocol].unbind = NULL;
                        nl_table[sk->sk_protocol].flags = 0;
                        nl_table[sk->sk_protocol].registered = 0;
                }
                netlink_table_ungrab();
        }

        sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);

        /* Free the groups bitmap and drop a reference after a grace period. */
        call_rcu(&nlk->rcu, deferred_put_nlk_sk);
        return 0;
}

/*
 * Pick a free portid for @sock and bind the socket to it.
 *
 * The first candidate is the caller's thread-group id (task_tgid_vnr()).
 * On a collision, candidates come from the negative range: a random starting
 * point in [S32_MIN, -4097], then counting downwards, with the rover reset
 * to -4097 whenever it is found to be >= -4096.
 *
 * Return: 0 on success (including the case where netlink_insert() reports
 * -EBUSY because another thread bound the socket first), otherwise the error
 * from netlink_insert().
 */
static int netlink_autobind(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        s32 portid = task_tgid_vnr(current);
        int err;
        /* -4096 doubles as the "no random start chosen yet" marker. */
        s32 rover = -4096;
        bool ok;

retry:
        cond_resched();
        rcu_read_lock();
        ok = !__netlink_lookup(table, portid, net);
        rcu_read_unlock();
        if (!ok) {
                /* Bind collision, search negative portid values. */
                if (rover == -4096)
                        /* rover will be in range [S32_MIN, -4097] */
                        rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN);
                else if (rover >= -4096)
                        rover = -4097;
                portid = rover--;
                goto retry;
        }

        /* The portid may have been taken since the lookup; try again then. */
        err = netlink_insert(sk, portid);
        if (err == -EADDRINUSE)
                goto retry;

        /* If 2 threads race to autobind, that is fine.  */
        if (err == -EBUSY)
                err = 0;

        return err;
}

/**
 * __netlink_ns_capable - General netlink message capability test
 * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has the capability @cap in the user namespace @user_ns.
 */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
                        struct user_namespace *user_ns, int cap)
{
        return ((nsp->flags & NETLINK_SKB_DST) ||
                file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(__netlink_ns_capable);

/**
 * netlink_ns_capable - General netlink message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Same test as __netlink_ns_capable(), applied to the netlink control block
 * of @skb: both the opener of the originating socket and the sender of the
 * message must have the capability @cap in the user namespace @user_ns.
 */
bool netlink_ns_capable(const struct sk_buff *skb,
                        struct user_namespace *user_ns, int cap)
{
        const struct netlink_skb_parms *nsp = &NETLINK_CB(skb);

        return __netlink_ns_capable(nsp, user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);

/**
 * netlink_capable - Netlink global message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Runs the netlink_ns_capable() test against the initial user namespace
 * (&init_user_ns): both the opener of the originating socket and the sender
 * of the message must have the capability @cap there.
 */
bool netlink_capable(const struct sk_buff *skb, int cap)
{
        struct user_namespace *root_ns = &init_user_ns;

        return netlink_ns_capable(skb, root_ns, cap);
}
EXPORT_SYMBOL(netlink_capable);

/**
 * netlink_net_capable - Netlink network namespace message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Runs the netlink_ns_capable() test against the user namespace that owns
 * the network namespace of @skb->sk: both the opener of the originating
 * socket and the sender of the message must have the capability @cap there.
 */
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
        struct user_namespace *net_user_ns = sock_net(skb->sk)->user_ns;

        return netlink_ns_capable(skb, net_user_ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);

/*
 * Non-zero if the operation guarded by @flag is permitted on @sock: either
 * the protocol's table entry has @flag set, or the caller has CAP_NET_ADMIN
 * in the user namespace owning the socket's network namespace.
 */
static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
{
        if (nl_table[sock->sk->sk_protocol].flags & flag)
                return 1;

        return ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
}

static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (nlk->subscriptions && !subscriptions)
                __sk_del_bind_node(sk);
        else if (!nlk->subscriptions && subscriptions)
                sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
        nlk->subscriptions = subscriptions;
}

/*
 * Grow the group bitmap of @sk to the number of groups its protocol
 * currently has.  Runs with the netlink table grabbed; newly added bitmap
 * bytes are zeroed.  A bitmap that is already large enough is left alone.
 *
 * Returns 0 on success, -ENOENT if the protocol is not registered, -ENOMEM
 * if the bitmap could not be reallocated.
 */
static int netlink_realloc_groups(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned long *grown;
        unsigned int wanted;
        int err = 0;

        netlink_table_grab();

        wanted = nl_table[sk->sk_protocol].groups;
        if (!nl_table[sk->sk_protocol].registered) {
                err = -ENOENT;
        } else if (nlk->ngroups < wanted) {
                grown = krealloc(nlk->groups, NLGRPSZ(wanted), GFP_ATOMIC);
                if (grown) {
                        /* Clear only the part beyond the old bitmap size. */
                        memset((char *)grown + NLGRPSZ(nlk->ngroups), 0,
                               NLGRPSZ(wanted) - NLGRPSZ(nlk->ngroups));
                        nlk->groups = grown;
                        nlk->ngroups = wanted;
                } else {
                        err = -ENOMEM;
                }
        }

        netlink_table_ungrab();
        return err;
}

/*
 * Roll back per-group bind callbacks: for every bit below @group that is set
 * in @groups, call the protocol's unbind callback with the matching group
 * number (bit index + 1).  No-op if the protocol has no unbind callback.
 */
static void netlink_undo_bind(int group, long unsigned int groups,
                              struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int bit;

        if (!nlk->netlink_unbind)
                return;

        for (bit = 0; bit < group; bit++) {
                if (!test_bit(bit, &groups))
                        continue;
                nlk->netlink_unbind(sock_net(sk), bit + 1);
        }
}

/*
 * bind() handler for netlink sockets.
 *
 * Validates the address, checks permission for multicast groups and grows
 * the socket's group bitmap if groups were requested, runs the protocol's
 * per-group bind callback, binds the socket to nl_pid (or autobinds when
 * nl_pid is 0) if it is not bound yet, and finally updates the socket's
 * subscriptions and the protocol's listeners bitmap.
 *
 * Return: 0 on success; -EINVAL for a short or non-AF_NETLINK address or
 * when nl_pid differs from the portid of an already bound socket; -EPERM if
 * listening to groups is not allowed; otherwise the error from
 * netlink_realloc_groups(), the bind callback, netlink_insert() or
 * netlink_autobind().
 */
static int netlink_bind(struct socket *sock, struct sockaddr *addr,
                        int addr_len)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct netlink_sock *nlk = nlk_sk(sk);
        struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
        int err = 0;
        unsigned long groups;
        bool bound;

        if (addr_len < sizeof(struct sockaddr_nl))
                return -EINVAL;

        if (nladdr->nl_family != AF_NETLINK)
                return -EINVAL;
        groups = nladdr->nl_groups;

        /* Only superuser is allowed to listen multicasts */
        if (groups) {
                if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
                        return -EPERM;
                err = netlink_realloc_groups(sk);
                if (err)
                        return err;
        }

        /* Drop requested bits beyond the number of groups the socket has. */
        if (nlk->ngroups < BITS_PER_LONG)
                groups &= (1UL << nlk->ngroups) - 1;

        /* Paired with WRITE_ONCE() in netlink_insert() */
        bound = READ_ONCE(nlk->bound);
        if (bound) {
                /* Ensure nlk->portid is up-to-date. */
                smp_rmb();

                if (nladdr->nl_pid != nlk->portid)
                        return -EINVAL;
        }

        if (nlk->netlink_bind && groups) {
                int group;

                /* nl_groups is a u32, so cap the maximum groups we can bind */
                for (group = 0; group < BITS_PER_TYPE(u32); group++) {
                        if (!test_bit(group, &groups))
                                continue;
                        err = nlk->netlink_bind(net, group + 1);
                        if (!err)
                                continue;
                        /* Undo the groups bound so far (those below @group). */
                        netlink_undo_bind(group, groups, sk);
                        return err;
                }
        }

        /* No need for barriers here as we return to user-space without
         * using any of the bound attributes.
         */
        netlink_lock_table();
        if (!bound) {
                err = nladdr->nl_pid ?
                        netlink_insert(sk, nladdr->nl_pid) :
                        netlink_autobind(sock);
                if (err) {
                        netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk);
                        goto unlock;
                }
        }

        /* Nothing requested and nothing subscribed in the low 32 groups. */
        if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
                goto unlock;
        netlink_unlock_table();

        /* Swap the user count for the grabbed table to modify the bitmaps. */
        netlink_table_grab();
        netlink_update_subscriptions(sk, nlk->subscriptions +
                                         hweight32(groups) -
                                         hweight32(nlk->groups[0]));
        /* Replace only the low 32 bits of the first bitmap word. */
        nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
        netlink_update_listeners(sk);
        netlink_table_ungrab();

        return 0;

unlock:
        netlink_unlock_table();
        return err;
}

/*
 * connect() handler for netlink sockets.
 *
 * AF_UNSPEC disconnects the socket (state, destination portid and group are
 * reset).  For AF_NETLINK the caller needs send permission when a
 * destination portid or group is given; an unbound socket is autobound, then
 * the destination portid and the first set group bit (ffs()) are recorded.
 *
 * Return: 0 on success, -EINVAL for a short address or unsupported family,
 * -EPERM if sending is not allowed, or the error from netlink_autobind().
 */
static int netlink_connect(struct socket *sock, struct sockaddr *addr,
                           int alen, int flags)
{
        struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        int err;

        if (alen < sizeof(addr->sa_family))
                return -EINVAL;

        if (addr->sa_family == AF_UNSPEC) {
                /* paired with READ_ONCE() in netlink_getsockbyportid() */
                WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED);
                /* dst_portid and dst_group can be read locklessly */
                WRITE_ONCE(nlk->dst_portid, 0);
                WRITE_ONCE(nlk->dst_group, 0);
                return 0;
        }

        if (addr->sa_family != AF_NETLINK)
                return -EINVAL;

        if (alen < sizeof(struct sockaddr_nl))
                return -EINVAL;

        if ((nladdr->nl_groups || nladdr->nl_pid) &&
            !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
                return -EPERM;

        /* No need for barriers here as we return to user-space without
         * using any of the bound attributes.
         * Paired with WRITE_ONCE() in netlink_insert().
         */
        if (!READ_ONCE(nlk->bound)) {
                err = netlink_autobind(sock);
                if (err)
                        return err;
        }

        /* paired with READ_ONCE() in netlink_getsockbyportid() */
        WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED);
        /* dst_portid and dst_group can be read locklessly */
        WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid);
        WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups));

        return 0;
}

/*
 * getsockname()/getpeername() handler.
 *
 * Fills @addr with either the peer (destination portid and group mask) or
 * the local address (own portid and the first word of the group bitmap, read
 * under the table user count).  Returns sizeof(struct sockaddr_nl).
 */
static int netlink_getname(struct socket *sock, struct sockaddr *addr,
                           int peer)
{
        struct netlink_sock *nlk = nlk_sk(sock->sk);
        DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);

        nladdr->nl_family = AF_NETLINK;
        nladdr->nl_pad = 0;

        if (!peer) {
                /* Paired with WRITE_ONCE() in netlink_insert() */
                nladdr->nl_pid = READ_ONCE(nlk->portid);
                netlink_lock_table();
                if (nlk->groups)
                        nladdr->nl_groups = nlk->groups[0];
                else
                        nladdr->nl_groups = 0;
                netlink_unlock_table();
                return sizeof(*nladdr);
        }

        /* Paired with WRITE_ONCE() in netlink_connect() */
        nladdr->nl_pid = READ_ONCE(nlk->dst_portid);
        nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group));
        return sizeof(*nladdr);
}

/*
 * ioctl handler: netlink implements no ioctls of its own and always returns
 * -ENOIOCTLCMD, whatever @cmd and @arg are.
 */
static int netlink_ioctl(struct socket *sock, unsigned int cmd,
                         unsigned long arg)
{
        /* try to hand this ioctl down to the NIC drivers.
         */
        return -ENOIOCTLCMD;
}

/*
 * Find the destination socket for a unicast from @ssk to @portid.
 *
 * The lookup uses @ssk's network namespace and protocol.  A destination that
 * is connected to a portid other than @ssk's is refused.
 *
 * Returns the socket with a reference held, or ERR_PTR(-ECONNREFUSED).
 */
static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
        struct sock *peer;

        peer = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
        if (!peer)
                return ERR_PTR(-ECONNREFUSED);

        /* dst_portid and sk_state can be changed in netlink_connect() */
        if (READ_ONCE(peer->sk_state) != NETLINK_CONNECTED)
                return peer;
        if (READ_ONCE(nlk_sk(peer)->dst_portid) == nlk_sk(ssk)->portid)
                return peer;

        sock_put(peer);
        return ERR_PTR(-ECONNREFUSED);
}

/*
 * Resolve a file descriptor to the netlink socket behind it.
 *
 * Returns the socket with a reference held, or an ERR_PTR: -EBADF if @fd
 * does not refer to an open file, -ENOTSOCK if the file is not a socket,
 * -EINVAL if the socket is not an AF_NETLINK socket.
 */
struct sock *netlink_getsockbyfd(int fd)
{
        CLASS(fd, f)(fd);
        struct inode *inode;
        struct sock *sk;

        if (fd_empty(f))
                return ERR_PTR(-EBADF);

        inode = file_inode(fd_file(f));
        if (!S_ISSOCK(inode->i_mode))
                return ERR_PTR(-ENOTSOCK);

        sk = SOCKET_I(inode)->sk;
        if (sk->sk_family == AF_NETLINK) {
                sock_hold(sk);
                return sk;
        }

        return ERR_PTR(-EINVAL);
}

/*
 * Allocate an skb able to hold @size bytes.
 *
 * Broadcast messages, and messages whose aligned head fits in one page, use
 * a regular alloc_skb().  Anything larger gets its head from kvmalloc(); if
 * that memory turns out to be vmalloc'ed, the netlink destructor is
 * installed on the skb.
 *
 * Returns the skb or NULL on allocation failure.
 */
struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast)
{
        size_t head_size = SKB_HEAD_ALIGN(size);
        struct sk_buff *skb;
        void *head;

        if (head_size <= PAGE_SIZE || broadcast)
                return alloc_skb(size, GFP_KERNEL);

        head = kvmalloc(head_size, GFP_KERNEL);
        if (!head)
                return NULL;

        skb = __build_skb(head, head_size);
        if (!skb) {
                kvfree(head);
                return NULL;
        }

        if (is_vmalloc_addr(head))
                skb->destructor = netlink_skb_destructor;

        return skb;
}

/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not sent to the destination; only the
 * error checks are performed and memory in the queue is reserved.
 * If the receive buffer is full or the socket is congested and *timeo is
 * non-zero, the caller sleeps (interruptibly) on the socket's wait queue and
 * *timeo is updated with the remaining time.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
                      long *timeo, struct sock *ssk)
{
        DECLARE_WAITQUEUE(wait, current);
        struct netlink_sock *nlk;
        unsigned int rmem;

        nlk = nlk_sk(sk);
        /* Reserve the memory up front; undone below if the skb does not fit. */
        rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);

        /* rmem == truesize means the queue held nothing before this skb. */
        if ((rmem == skb->truesize || rmem <= READ_ONCE(sk->sk_rcvbuf)) &&
            !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
                netlink_skb_set_owner_r(skb, sk);
                return 0;
        }

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);

        /* Non-blocking caller: give up right away. */
        if (!*timeo) {
                if (!ssk || netlink_is_kernel(ssk))
                        netlink_overrun(sk);
                sock_put(sk);
                kfree_skb(skb);
                return -EAGAIN;
        }

        __set_current_state(TASK_INTERRUPTIBLE);
        add_wait_queue(&nlk->wait, &wait);
        rmem = atomic_read(&sk->sk_rmem_alloc);

        /* Re-check after queueing on nlk->wait; sleep only if still blocked. */
        if (((rmem && rmem + skb->truesize > READ_ONCE(sk->sk_rcvbuf)) ||
             test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
            !sock_flag(sk, SOCK_DEAD))
                *timeo = schedule_timeout(*timeo);

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&nlk->wait, &wait);
        sock_put(sk);

        if (signal_pending(current)) {
                kfree_skb(skb);
                return sock_intr_errno(*timeo);
        }

        return 1;
}

/*
 * Hand @skb to the taps, append it to @sk's receive queue and invoke the
 * socket's sk_data_ready callback. Returns the length the skb had on entry.
 */
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
        int bytes;

        /* Sampled first: the return value must not touch skb after queueing. */
        bytes = skb->len;

        netlink_deliver_tap(sock_net(sk), skb);
        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk);

        return bytes;
}

/*
 * Queue @skb on @sk via __netlink_sendskb() and then drop one reference on
 * @sk. Returns the queued skb's length.
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
        int queued;

        queued = __netlink_sendskb(sk, skb);
        sock_put(sk);

        return queued;
}

/*
 * Abandon a delivery: free @skb and drop one reference on @sk.
 */
void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
        kfree_skb(skb);
        sock_put(sk);
}

/*
 * Try to shrink the unused tail room of @skb before it is delivered.
 *
 * The skb is returned untouched when its head is vmalloc'ed, when the unused
 * tail is less than half of truesize, or when it is shared and cannot be
 * cloned. A shared skb is replaced by a clone (the original reference is
 * consumed). The result of the shrink attempt itself is not checked.
 */
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
        struct sk_buff *copy;
        gfp_t shrink_gfp;
        int delta;

        skb_assert_len(skb);
        WARN_ON(skb->sk != NULL);

        delta = skb->end - skb->tail;
        if (is_vmalloc_addr(skb->head))
                return skb;
        if (delta * 2 < skb->truesize)
                return skb;

        if (skb_shared(skb)) {
                copy = skb_clone(skb, allocation);
                if (!copy)
                        return skb;
                consume_skb(skb);
                skb = copy;
        }

        /* Opportunistic only: no direct reclaim, no warning, no retry. */
        shrink_gfp = (allocation & ~__GFP_DIRECT_RECLAIM) |
                     __GFP_NOWARN | __GFP_NORETRY;
        pskb_expand_head(skb, 0, -delta, shrink_gfp);

        return skb;
}

/*
 * Deliver @skb to the kernel-side socket @sk on behalf of sender @ssk.
 *
 * When @sk has a netlink_rcv handler the skb is charged to @sk, handed to
 * the taps and the handler, then consumed; the skb length is returned.
 * Without a handler the skb is freed and -ECONNREFUSED is returned.
 * One reference on @sk is dropped in both cases.
 */
static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
                                  struct sock *ssk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int ret = -ECONNREFUSED;

        if (nlk->netlink_rcv == NULL) {
                kfree_skb(skb);
                goto put;
        }

        ret = skb->len;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        netlink_skb_set_owner_r(skb, sk);
        NETLINK_CB(skb).sk = ssk;
        netlink_deliver_tap_kernel(sk, ssk, skb);
        nlk->netlink_rcv(skb);
        consume_skb(skb);

put:
        sock_put(sk);
        return ret;
}

/*
 * Send @skb from @ssk to the single socket bound to @portid.
 *
 * The skb is always consumed. Kernel destinations are handled by
 * netlink_unicast_kernel(). For other destinations the skb is run through
 * the socket filter, attached and queued; the destination lookup is repeated
 * whenever netlink_attachskb() returns 1. An skb rejected by the socket
 * filter is freed and its length is returned, just as for a queued skb.
 *
 * Returns the skb length, or a negative error.
 */
int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
                    u32 portid, int nonblock)
{
        struct sock *sk;
        long timeo;
        int err;

        skb = netlink_trim(skb, gfp_any());
        timeo = sock_sndtimeo(ssk, nonblock);

        do {
                sk = netlink_getsockbyportid(ssk, portid);
                if (IS_ERR(sk)) {
                        kfree_skb(skb);
                        return PTR_ERR(sk);
                }

                if (netlink_is_kernel(sk))
                        return netlink_unicast_kernel(sk, skb, ssk);

                if (sk_filter(sk, skb)) {
                        err = skb->len;
                        kfree_skb(skb);
                        sock_put(sk);
                        return err;
                }

                err = netlink_attachskb(sk, skb, &timeo, ssk);
        } while (err == 1);

        if (err)
                return err;

        return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);

/*
 * Test whether multicast @group of the protocol of kernel socket @sk has a
 * bit set in the protocol's listeners mask. Returns 0 when there is no
 * listeners table or @group is out of range. @sk must be a kernel socket.
 */
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
        struct listeners *masks;
        int found = 0;

        BUG_ON(!netlink_is_kernel(sk));

        rcu_read_lock();

        masks = rcu_dereference(nl_table[sk->sk_protocol].listeners);
        if (!masks)
                goto unlock;

        /* group 0 wraps to a huge index and fails this range check. */
        if (group - 1 < nl_table[sk->sk_protocol].groups)
                found = test_bit(group - 1, masks->masks);

unlock:
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);

/*
 * Report whether the STRICT_CHK flag is set on the socket recorded in the
 * netlink control block of @skb.
 */
bool netlink_strict_get_check(struct sk_buff *skb)
{
        return nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk);
}
EXPORT_SYMBOL_GPL(netlink_strict_get_check);

/*
 * Try to queue a broadcast @skb on @sk without blocking.
 *
 * Returns -1 if the skb does not fit in the receive buffer or the socket is
 * congested (the skb is left with the caller). Otherwise the skb is queued
 * and the result is 1 when more than half of the receive buffer is now in
 * use, 0 if not.
 */
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int rmem, rcvbuf;

        /* Charge first, then decide; undo the charge on rejection. */
        rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
        rcvbuf = READ_ONCE(sk->sk_rcvbuf);

        if ((rmem != skb->truesize && rmem > rcvbuf) ||
            test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
                atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
                return -1;
        }

        netlink_skb_set_owner_r(skb, sk);
        __netlink_sendskb(sk, skb);

        return rmem > (rcvbuf >> 1);
}

/*
 * State shared between netlink_broadcast_filtered() and do_one_broadcast()
 * for the duration of one broadcast.
 */
struct netlink_broadcast_data {
        /* Socket that never receives the broadcast (the sender). */
        struct sock *exclude_sk;
        /* Network namespace of the sender. */
        struct net *net;
        /* Sockets bound to this portid are skipped. */
        u32 portid;
        /* Destination multicast group (1-based). */
        u32 group;
        /* Set once preparing a copy failed; later listeners only get an overrun. */
        int failure;
        /* Set when a listener with BROADCAST_SEND_ERROR could not be served. */
        int delivery_failure;
        /* OR of the non-negative netlink_broadcast_deliver() results. */
        int congested;
        /* Set once at least one socket accepted the message. */
        int delivered;
        /* GFP flags used for cloning. */
        gfp_t allocation;
        /*
         * skb: the message being broadcast.
         * skb2: copy/reference prepared for the next receiver; NULL when
         * none is pending.
         */
        struct sk_buff *skb, *skb2;
        /* Optional per-destination filter; non-zero return skips the socket. */
        int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
        /* Opaque argument passed to tx_filter. */
        void *tx_data;
};

/*
 * Try to deliver the broadcast described by @p to one socket @sk taken from
 * the multicast list. Sockets that are excluded, not subscribed to the
 * group, or in a different namespace without the needed permissions are
 * skipped. Results are recorded in @p (failure, delivery_failure, congested,
 * delivered); p->skb2 carries a prepared copy from one call to the next.
 */
static void do_one_broadcast(struct sock *sk,
                                    struct netlink_broadcast_data *p)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int val;

        if (p->exclude_sk == sk)
                return;

        /* Skip the excluded portid and sockets not subscribed to the group. */
        if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
            !test_bit(p->group - 1, nlk->groups))
                return;

        /*
         * Cross-namespace delivery needs LISTEN_ALL_NSID on the receiver, a
         * peer id for the sender's namespace, and CAP_NET_BROADCAST in the
         * sender namespace's user namespace.
         */
        if (!net_eq(sock_net(sk), p->net)) {
                if (!nlk_test_bit(LISTEN_ALL_NSID, sk))
                        return;

                if (!peernet_has_id(sock_net(sk), p->net))
                        return;

                if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
                                     CAP_NET_BROADCAST))
                        return;
        }

        /* An earlier clone failed: just flag an overrun on this listener. */
        if (p->failure) {
                netlink_overrun(sk);
                return;
        }

        sock_hold(sk);
        /* Prepare a deliverable skb unless one is left over from a prior call. */
        if (p->skb2 == NULL) {
                if (skb_shared(p->skb)) {
                        p->skb2 = skb_clone(p->skb, p->allocation);
                } else {
                        p->skb2 = skb_get(p->skb);
                        /*
                         * skb ownership may have been set when
                         * delivered to a previous socket.
                         */
                        skb_orphan(p->skb2);
                }
        }
        if (p->skb2 == NULL) {
                netlink_overrun(sk);
                /* Clone failed. Notify ALL listeners. */
                p->failure = 1;
                if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
                        p->delivery_failure = 1;
                goto out;
        }

        /* Caller-supplied filter rejected this destination: drop the copy. */
        if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
                kfree_skb(p->skb2);
                p->skb2 = NULL;
                goto out;
        }

        /* Socket filter rejected the message: drop the copy. */
        if (sk_filter(sk, p->skb2)) {
                kfree_skb(p->skb2);
                p->skb2 = NULL;
                goto out;
        }
        /* Record the sender namespace id as seen from the receiver. */
        NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
        if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
                NETLINK_CB(p->skb2).nsid_is_set = true;
        val = netlink_broadcast_deliver(sk, p->skb2);
        if (val < 0) {
                /* Not queued: skb2 stays in @p for the next candidate. */
                netlink_overrun(sk);
                if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
                        p->delivery_failure = 1;
        } else {
                /* Queued: the receiver owns skb2 now. */
                p->congested |= val;
                p->delivered = 1;
                p->skb2 = NULL;
        }
out:
        sock_put(sk);
}

/*
 * Broadcast @skb from @ssk to every socket on the protocol's multicast list
 * that is subscribed to @group, except @ssk itself and sockets bound to
 * @portid. @filter, if given, is called per destination with @filter_data
 * and a non-zero result skips that destination.
 *
 * One reference on @skb is consumed.
 *
 * Returns 0 if at least one socket received the message, -ENOBUFS if a
 * listener that requested broadcast errors could not be served, and -ESRCH
 * if nobody received it.
 */
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
                               u32 portid,
                               u32 group, gfp_t allocation,
                               netlink_filter_fn filter,
                               void *filter_data)
{
        struct netlink_broadcast_data info = {
                .exclude_sk = ssk,
                .net = sock_net(ssk),
                .portid = portid,
                .group = group,
                .failure = 0,
                .delivery_failure = 0,
                .congested = 0,
                .delivered = 0,
                .allocation = allocation,
                .skb2 = NULL,
                .tx_filter = filter,
                .tx_data = filter_data,
        };
        struct sock *sk;

        skb = netlink_trim(skb, allocation);
        info.skb = skb;

        /* While we sleep in clone, do not allow to change socket list */
        netlink_lock_table();

        sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
                do_one_broadcast(sk, &info);

        consume_skb(skb);

        netlink_unlock_table();

        if (info.delivery_failure) {
                kfree_skb(info.skb2);
                return -ENOBUFS;
        }
        consume_skb(info.skb2);

        if (!info.delivered)
                return -ESRCH;

        /* Give receivers a chance to drain when allowed to block. */
        if (info.congested && gfpflags_allow_blocking(allocation))
                yield();

        return 0;
}
EXPORT_SYMBOL(netlink_broadcast_filtered);

/*
 * Broadcast @skb to @group without a per-destination filter; see
 * netlink_broadcast_filtered() for the return values.
 */
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
                      u32 group, gfp_t allocation)
{
        return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
                                          NULL, NULL);
}
EXPORT_SYMBOL(netlink_broadcast);

/* Arguments passed from netlink_set_err() to do_one_set_err(). */
struct netlink_set_err_data {
        /* Socket that is never given the error; its netns selects the targets. */
        struct sock *exclude_sk;
        /* Sockets bound to this portid are skipped. */
        u32 portid;
        /* Multicast group (1-based) whose members get the error. */
        u32 group;
        /* Positive errno value stored into sk_err. */
        int code;
};

/*
 * Set the error from @p on @sk if it is a matching listener.
 *
 * Nothing is done for the excluded socket, sockets in another namespace,
 * sockets bound to the skipped portid, or sockets not subscribed to the
 * group. Returns 1 only when the error is ENOBUFS and the socket has
 * RECV_NO_ENOBUFS set (the error is then not reported); otherwise 0.
 */
static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (sk == p->exclude_sk)
                return 0;

        if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
                return 0;

        if (nlk->portid == p->portid)
                return 0;

        if (p->group - 1 >= nlk->ngroups ||
            !test_bit(p->group - 1, nlk->groups))
                return 0;

        if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk))
                return 1;

        WRITE_ONCE(sk->sk_err, p->code);
        sk_error_report(sk);

        return 0;
}

/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * Walks the multicast list of @ssk's protocol under nl_table_lock and sets
 * the error on every matching listener.
 *
 * Return: the number of broadcast listeners that were skipped because @code
 * is -ENOBUFS and they have set the NETLINK_NO_ENOBUFS socket option.
 */
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
        struct netlink_set_err_data info = {
                .exclude_sk = ssk,
                .portid = portid,
                .group = group,
                /* sk->sk_err wants a positive error value */
                .code = -code,
        };
        unsigned long flags;
        struct sock *sk;
        int skipped = 0;

        read_lock_irqsave(&nl_table_lock, flags);
        sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
                skipped += do_one_set_err(sk, &info);
        read_unlock_irqrestore(&nl_table_lock, flags);

        return skipped;
}
EXPORT_SYMBOL(netlink_set_err);

/*
 * Set (@is_new != 0) or clear (@is_new == 0) the membership bit of @group
 * for @nlk, then refresh the subscription count and the listeners mask.
 *
 * must be called with netlink table grabbed
 */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
                                     unsigned int group,
                                     int is_new)
{
        int was_member, is_member, subscriptions;

        is_member = !!is_new;
        was_member = test_bit(group - 1, nlk->groups);

        /* Count changes by -1, 0 or +1 depending on the bit transition. */
        subscriptions = nlk->subscriptions - was_member + is_member;

        __assign_bit(group - 1, nlk->groups, is_member);
        netlink_update_subscriptions(&nlk->sk, subscriptions);
        netlink_update_listeners(&nlk->sk);
}

/*
 * setsockopt() handler for SOL_NETLINK.
 *
 * Most options are boolean flags stored in nlk->flags; for those the switch
 * only selects the flag bit (nr) and the common tail assigns it from val.
 * Membership options and NETLINK_NO_ENOBUFS are handled inside the switch.
 *
 * Returns 0 on success, -ENOPROTOOPT for another level or unknown option,
 * -EFAULT if the value cannot be copied, or an option-specific error.
 */
static int netlink_setsockopt(struct socket *sock, int level, int optname,
                              sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int val = 0;
        int nr = -1;

        if (level != SOL_NETLINK)
                return -ENOPROTOOPT;

        /* A value shorter than an int is not copied; val then stays 0. */
        if (optlen >= sizeof(int) &&
            copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        switch (optname) {
        case NETLINK_PKTINFO:
                nr = NETLINK_F_RECV_PKTINFO;
                break;
        case NETLINK_ADD_MEMBERSHIP:
        case NETLINK_DROP_MEMBERSHIP: {
                int err;

                if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
                        return -EPERM;
                err = netlink_realloc_groups(sk);
                if (err)
                        return err;
                /* val is the 1-based group number. */
                if (!val || val - 1 >= nlk->ngroups)
                        return -EINVAL;
                /* The bind hook, if any, can veto joining the group. */
                if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
                        err = nlk->netlink_bind(sock_net(sk), val);
                        if (err)
                                return err;
                }
                netlink_table_grab();
                netlink_update_socket_mc(nlk, val,
                                         optname == NETLINK_ADD_MEMBERSHIP);
                netlink_table_ungrab();
                /* The unbind hook runs after the membership bit is cleared. */
                if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
                        nlk->netlink_unbind(sock_net(sk), val);

                break;
        }
        case NETLINK_BROADCAST_ERROR:
                nr = NETLINK_F_BROADCAST_SEND_ERROR;
                break;
        case NETLINK_NO_ENOBUFS:
                assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val);
                /* Enabling the option also clears congestion and wakes waiters. */
                if (val) {
                        clear_bit(NETLINK_S_CONGESTED, &nlk->state);
                        wake_up_interruptible(&nlk->wait);
                }
                break;
        case NETLINK_LISTEN_ALL_NSID:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
                        return -EPERM;
                nr = NETLINK_F_LISTEN_ALL_NSID;
                break;
        case NETLINK_CAP_ACK:
                nr = NETLINK_F_CAP_ACK;
                break;
        case NETLINK_EXT_ACK:
                nr = NETLINK_F_EXT_ACK;
                break;
        case NETLINK_GET_STRICT_CHK:
                nr = NETLINK_F_STRICT_CHK;
                break;
        default:
                return -ENOPROTOOPT;
        }
        /* Plain flag options: store val as a boolean in the selected bit. */
        if (nr >= 0)
                assign_bit(nr, &nlk->flags, val);
        return 0;
}

/*
 * getsockopt() handler for SOL_NETLINK.
 *
 * Flag options return the current bit from nlk->flags as an int.
 * NETLINK_LIST_MEMBERSHIPS copies the group bitmap out in u32 units and
 * stores the size needed for the whole bitmap in *optlen.
 *
 * Returns 0 on success, -ENOPROTOOPT for another level or unknown option,
 * -EINVAL for a negative or too short length, or -EFAULT on a failed user
 * access.
 */
static int netlink_getsockopt(struct socket *sock, int level, int optname,
                              char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int flag;
        int len, val;

        if (level != SOL_NETLINK)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case NETLINK_PKTINFO:
                flag = NETLINK_F_RECV_PKTINFO;
                break;
        case NETLINK_BROADCAST_ERROR:
                flag = NETLINK_F_BROADCAST_SEND_ERROR;
                break;
        case NETLINK_NO_ENOBUFS:
                flag = NETLINK_F_RECV_NO_ENOBUFS;
                break;
        case NETLINK_LIST_MEMBERSHIPS: {
                int pos, idx, shift, err = 0;

                netlink_lock_table();
                /* pos is a byte offset into the bitmap, advanced one u32 at a time. */
                for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
                        /* Stop once the user buffer cannot hold another u32. */
                        if (len - pos < sizeof(u32))
                                break;

                        /* Locate the u32 inside the unsigned long word array. */
                        idx = pos / sizeof(unsigned long);
                        shift = (pos % sizeof(unsigned long)) * 8;
                        if (put_user((u32)(nlk->groups[idx] >> shift),
                                     (u32 __user *)(optval + pos))) {
                                err = -EFAULT;
                                break;
                        }
                }
                /* Always report the full bitmap size, even if less was copied. */
                if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen))
                        err = -EFAULT;
                netlink_unlock_table();
                return err;
        }
        case NETLINK_LISTEN_ALL_NSID:
                flag = NETLINK_F_LISTEN_ALL_NSID;
                break;
        case NETLINK_CAP_ACK:
                flag = NETLINK_F_CAP_ACK;
                break;
        case NETLINK_EXT_ACK:
                flag = NETLINK_F_EXT_ACK;
                break;
        case NETLINK_GET_STRICT_CHK:
                flag = NETLINK_F_STRICT_CHK;
                break;
        default:
                return -ENOPROTOOPT;
        }

        /* Common tail for flag options: the buffer must hold an int. */
        if (len < sizeof(int))
                return -EINVAL;

        len = sizeof(int);
        val = test_bit(flag, &nlk->flags);

        if (put_user(len, optlen) ||
            copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
        struct nl_pktinfo info;

        info.group = NETLINK_CB(skb).dst_group;
        put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

/*
 * Attach a NETLINK_LISTEN_ALL_NSID control message with the namespace id
 * stored in @skb. Nothing is attached when no nsid was recorded for it.
 */
static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
                                         struct sk_buff *skb)
{
        if (NETLINK_CB(skb).nsid_is_set)
                put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID,
                         sizeof(int), &NETLINK_CB(skb).nsid);
}

/*
 * sendmsg() handler for netlink sockets.
 *
 * Determines the destination (from msg_name or the connected peer), binds
 * the socket automatically if needed, copies the payload into a new skb and
 * sends it: first as a broadcast when a destination group is set, then as a
 * unicast to dst_portid.
 *
 * Returns the result of netlink_unicast() or a negative error.
 */
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
        u32 dst_portid;
        u32 dst_group;
        struct sk_buff *skb;
        int err;
        struct scm_cookie scm;
        u32 netlink_skb_flags = 0;

        if (msg->msg_flags & MSG_OOB)
                return -EOPNOTSUPP;

        if (len == 0) {
                pr_warn_once("Zero length message leads to an empty skb\n");
                return -ENODATA;
        }

        err = scm_send(sock, msg, &scm, true);
        if (err < 0)
                return err;

        /* From here on every exit goes through 'out' to destroy scm. */
        if (msg->msg_namelen) {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_nl))
                        goto out;
                if (addr->nl_family != AF_NETLINK)
                        goto out;
                dst_portid = addr->nl_pid;
                /* Only the lowest set bit of nl_groups selects the group. */
                dst_group = ffs(addr->nl_groups);
                err =  -EPERM;
                if ((dst_group || dst_portid) &&
                    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
                        goto out;
                netlink_skb_flags |= NETLINK_SKB_DST;
        } else {
                /* Paired with WRITE_ONCE() in netlink_connect() */
                dst_portid = READ_ONCE(nlk->dst_portid);
                dst_group = READ_ONCE(nlk->dst_group);
        }

        /* Paired with WRITE_ONCE() in netlink_insert() */
        if (!READ_ONCE(nlk->bound)) {
                err = netlink_autobind(sock);
                if (err)
                        goto out;
        } else {
                /* Ensure nlk is hashed and visible. */
                smp_rmb();
        }

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;
        err = -ENOBUFS;
        skb = netlink_alloc_large_skb(len, dst_group);
        if (skb == NULL)
                goto out;

        NETLINK_CB(skb).portid        = nlk->portid;
        NETLINK_CB(skb).dst_group = dst_group;
        NETLINK_CB(skb).creds        = scm.creds;
        NETLINK_CB(skb).flags        = netlink_skb_flags;

        err = -EFAULT;
        if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
                kfree_skb(skb);
                goto out;
        }

        err = security_netlink_send(sk, skb);
        if (err) {
                kfree_skb(skb);
                goto out;
        }

        /*
         * The broadcast consumes one reference, so take an extra one to
         * keep the skb alive for the unicast below. The broadcast result
         * is not propagated to the caller.
         */
        if (dst_group) {
                refcount_inc(&skb->users);
                netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
        }
        err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT);

out:
        scm_destroy(&scm);
        return err;
}

/*
 * recvmsg() handler for netlink sockets.
 *
 * Dequeues one datagram, copies up to @len bytes to @msg (setting MSG_TRUNC
 * in msg_flags when the datagram is longer), fills in the source address and
 * any enabled control messages, and resumes a running dump when the receive
 * queue has drained to at most half of sk_rcvbuf.
 *
 * Returns the number of bytes copied (or the full datagram length when
 * MSG_TRUNC was passed in @flags), or a negative error.
 */
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                           int flags)
{
        struct scm_cookie scm;
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        size_t copied, max_recvmsg_len;
        struct sk_buff *skb, *data_skb;
        int err, ret;

        if (flags & MSG_OOB)
                return -EOPNOTSUPP;

        copied = 0;

        /* On failure skb_recv_datagram() stores the error in err. */
        skb = skb_recv_datagram(sk, flags, &err);
        if (skb == NULL)
                goto out;

        data_skb = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
        if (unlikely(skb_shinfo(skb)->frag_list)) {
                /*
                 * If this skb has a frag_list, then here that means that we
                 * will have to use the frag_list skb's data for compat tasks
                 * and the regular skb's data for normal (non-compat) tasks.
                 *
                 * If we need to send the compat skb, assign it to the
                 * 'data_skb' variable so that it will be used below for data
                 * copying. We keep 'skb' for everything else, including
                 * freeing both later.
                 */
                if (flags & MSG_CMSG_COMPAT)
                        data_skb = skb_shinfo(skb)->frag_list;
        }
#endif

        /* Record the max length of recvmsg() calls for future allocations */
        max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len);
        max_recvmsg_len = min_t(size_t, max_recvmsg_len,
                                SKB_WITH_OVERHEAD(32768));
        WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len);

        /* Copy at most len bytes and flag truncation to the caller. */
        copied = data_skb->len;
        if (len < copied) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }

        err = skb_copy_datagram_msg(data_skb, 0, msg, copied);

        if (msg->msg_name) {
                DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
                addr->nl_family = AF_NETLINK;
                addr->nl_pad    = 0;
                addr->nl_pid        = NETLINK_CB(skb).portid;
                addr->nl_groups        = netlink_group_mask(NETLINK_CB(skb).dst_group);
                msg->msg_namelen = sizeof(*addr);
        }

        if (nlk_test_bit(RECV_PKTINFO, sk))
                netlink_cmsg_recv_pktinfo(msg, skb);
        if (nlk_test_bit(LISTEN_ALL_NSID, sk))
                netlink_cmsg_listen_all_nsid(sk, msg, skb);

        /* Save the sender credentials before the skb is released. */
        memset(&scm, 0, sizeof(scm));
        scm.creds = *NETLINK_CREDS(skb);
        /* With MSG_TRUNC requested, report the real datagram length. */
        if (flags & MSG_TRUNC)
                copied = data_skb->len;

        skb_free_datagram(sk, skb);

        /* Continue an in-progress dump once enough receive space is free. */
        if (READ_ONCE(nlk->cb_running) &&
            atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
                ret = netlink_dump(sk, false);
                if (ret) {
                        WRITE_ONCE(sk->sk_err, -ret);
                        sk_error_report(sk);
                }
        }

        scm_recv(sock, msg, &scm, flags);
out:
        netlink_rcv_wake(sk);
        /* A non-zero err wins over the byte count. */
        return err ? : copied;
}

/*
 * sk_data_ready callback installed on sockets created by
 * __netlink_kernel_create(); reaching it is treated as a bug.
 */
static void netlink_data_ready(struct sock *sk)
{
        BUG();
}

/*
 *        We export these functions to other modules. They provide a
 *        complete set of kernel non-blocking support for message
 *        queueing.
 */

/*
 * Create a kernel-side netlink socket for protocol @unit in @net.
 *
 * @cfg is optional; when present it supplies the group count, the input
 * handler and the bind/unbind/release hooks and flags. The first socket
 * created for a protocol registers these in nl_table; later ones only bump
 * the registration count.
 *
 * Returns the new socket, or NULL on any failure.
 */
struct sock *
__netlink_kernel_create(struct net *net, int unit, struct module *module,
                        struct netlink_kernel_cfg *cfg)
{
        struct socket *sock;
        struct sock *sk;
        struct netlink_sock *nlk;
        struct listeners *listeners = NULL;
        unsigned int groups;

        BUG_ON(!nl_table);

        if (unit < 0 || unit >= MAX_LINKS)
                return NULL;

        if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
                return NULL;

        if (__netlink_create(net, sock, unit, 1) < 0)
                goto out_sock_release_nosk;

        sk = sock->sk;

        /* At least 32 groups are always provided. */
        if (!cfg || cfg->groups < 32)
                groups = 32;
        else
                groups = cfg->groups;

        listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
        if (!listeners)
                goto out_sock_release;

        sk->sk_data_ready = netlink_data_ready;
        if (cfg && cfg->input)
                nlk_sk(sk)->netlink_rcv = cfg->input;

        /* Kernel sockets are inserted with portid 0. */
        if (netlink_insert(sk, 0))
                goto out_sock_release;

        nlk = nlk_sk(sk);
        set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags);

        netlink_table_grab();
        if (!nl_table[unit].registered) {
                /* First registration: publish the table entry. */
                nl_table[unit].groups = groups;
                rcu_assign_pointer(nl_table[unit].listeners, listeners);
                nl_table[unit].module = module;
                if (cfg) {
                        nl_table[unit].bind = cfg->bind;
                        nl_table[unit].unbind = cfg->unbind;
                        nl_table[unit].release = cfg->release;
                        nl_table[unit].flags = cfg->flags;
                }
                nl_table[unit].registered = 1;
        } else {
                /* Already registered: the new listeners buffer is not needed. */
                kfree(listeners);
                nl_table[unit].registered++;
        }
        netlink_table_ungrab();
        return sk;

out_sock_release:
        kfree(listeners);
        netlink_kernel_release(sk);
        return NULL;

out_sock_release_nosk:
        sock_release(sock);
        return NULL;
}
EXPORT_SYMBOL(__netlink_kernel_create);

/*
 * Release a kernel netlink socket. A NULL @sk, or one without an attached
 * struct socket, is silently ignored.
 */
void
netlink_kernel_release(struct sock *sk)
{
        if (sk != NULL && sk->sk_socket != NULL)
                sock_release(sk->sk_socket);
}
EXPORT_SYMBOL(netlink_kernel_release);

/*
 * Set the number of multicast groups of @sk's protocol to @groups (at least
 * 32). When the listeners bitmap is too small for the new count, a larger
 * one is allocated, the old masks are copied over and the old bitmap is
 * freed after an RCU grace period.
 *
 * Returns 0 on success or -ENOMEM if the larger bitmap cannot be allocated.
 */
int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
        struct netlink_table *tbl = &nl_table[sk->sk_protocol];
        struct listeners *grown, *prev;

        if (groups < 32)
                groups = 32;

        /* Current bitmap is already big enough: only record the new count. */
        if (NLGRPSZ(groups) <= NLGRPSZ(tbl->groups)) {
                tbl->groups = groups;
                return 0;
        }

        grown = kzalloc(sizeof(*grown) + NLGRPSZ(groups), GFP_ATOMIC);
        if (!grown)
                return -ENOMEM;

        prev = nl_deref_protected(tbl->listeners);
        memcpy(grown->masks, prev->masks, NLGRPSZ(tbl->groups));
        rcu_assign_pointer(tbl->listeners, grown);
        kfree_rcu(prev, rcu);

        tbl->groups = groups;
        return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 *
 * This changes the number of multicast groups that are available
 * on a certain netlink family. Note that it is not possible to
 * change the number of groups to below 32. Also note that it does
 * not implicitly call netlink_clear_multicast_users() when the
 * number of groups is reduced.
 *
 * The netlink table is grabbed for the duration of the change.
 *
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 *
 * Return: 0 on success, or the error from __netlink_change_ngroups()
 * (-ENOMEM when a larger listeners bitmap cannot be allocated).
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
        int err;

        netlink_table_grab();
        err = __netlink_change_ngroups(sk, groups);
        netlink_table_ungrab();

        return err;
}

void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
        struct sock *sk;
        struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
        struct hlist_node *tmp;

        sk_for_each_bound_safe(sk, tmp, &tbl->mc_list)
                netlink_update_socket_mc(nlk_sk(sk), group, 0);
}

/*
 * Append a netlink message header for a payload of @len bytes to @skb and
 * fill in its fields. Space for the aligned message is reserved with
 * skb_put(); the alignment padding after the payload is zeroed. The caller
 * must have ensured that @skb has enough tail room.
 *
 * Returns a pointer to the new header.
 */
struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
        int size = nlmsg_msg_size(len);
        struct nlmsghdr *nlh = skb_put(skb, NLMSG_ALIGN(size));

        nlh->nlmsg_len = size;
        nlh->nlmsg_type = type;
        nlh->nlmsg_flags = flags;
        nlh->nlmsg_seq = seq;
        nlh->nlmsg_pid = portid;

        /* Skipped only when size is a compile-time constant needing no padding. */
        if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
                memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);

        return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);

/*
 * Compute the space needed for the extended ACK attributes of @extack.
 *
 * Returns 0 when there is no @extack or the socket has not enabled
 * NETLINK_F_EXT_ACK. The message and cookie are always counted; the
 * remaining attributes only when @err is non-zero.
 */
static size_t
netlink_ack_tlv_len(struct netlink_sock *nlk, int err,
                    const struct netlink_ext_ack *extack)
{
        size_t total = 0;

        if (!extack)
                return 0;
        if (!test_bit(NETLINK_F_EXT_ACK, &nlk->flags))
                return 0;

        if (extack->_msg)
                total += nla_total_size(strlen(extack->_msg) + 1);
        if (extack->cookie_len)
                total += nla_total_size(extack->cookie_len);

        /* Following attributes are only reported as error (not warning) */
        if (err) {
                if (extack->bad_attr)
                        total += nla_total_size(sizeof(u32));
                if (extack->policy)
                        total += netlink_policy_dump_attr_size_estimate(extack->policy);
                if (extack->miss_type)
                        total += nla_total_size(sizeof(u32));
                if (extack->miss_nest)
                        total += nla_total_size(sizeof(u32));
        }

        return total;
}

/*
 * Check that @addr points inside the payload of @nlh: not before
 * nlmsg_data(nlh) and at an offset from @nlh below nlh->nlmsg_len.
 * Emits a warning and returns false when it does not.
 */
static bool nlmsg_check_in_payload(const struct nlmsghdr *nlh, const void *addr)
{
        return !WARN_ON(addr < nlmsg_data(nlh) ||
                        addr - (const void *) nlh >= nlh->nlmsg_len);
}

/*
 * Append the extended ACK attributes described by @extack to @skb.
 *
 * The message and cookie are always written; the remaining attributes only
 * when @err is non-zero. Offsets for bad_attr and miss_nest are reported
 * relative to the start of the request header @nlh, and only when the
 * pointer lies within that request's payload. A failing nla_put*() call
 * triggers a warning but is otherwise ignored. @extack must not be NULL.
 */
static void
netlink_ack_tlv_fill(struct sk_buff *skb, const struct nlmsghdr *nlh, int err,
                     const struct netlink_ext_ack *extack)
{
        if (extack->_msg)
                WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg));
        if (extack->cookie_len)
                WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
                                extack->cookie_len, extack->cookie));

        /* The attributes below are only sent along with an error. */
        if (!err)
                return;

        if (extack->bad_attr && nlmsg_check_in_payload(nlh, extack->bad_attr))
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
                                    (u8 *)extack->bad_attr - (const u8 *)nlh));
        if (extack->policy)
                netlink_policy_dump_write_attr(skb, extack->policy,
                                               NLMSGERR_ATTR_POLICY);
        if (extack->miss_type)
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE,
                                    extack->miss_type));
        if (extack->miss_nest && nlmsg_check_in_payload(nlh, extack->miss_nest))
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST,
                                    (u8 *)extack->miss_nest - (const u8 *)nlh));
}

/*
 * It looks a bit ugly.
 * It would be better to create kernel thread.
 */

/*
 * Append the NLMSG_DONE message that terminates a dump to @skb. Its payload
 * is nlk->dump_done_errno. When extended ACK attributes apply, NLM_F_ACK_TLVS
 * is set on the message and the attributes are added if the tail room
 * allows it.
 *
 * Returns 0, or -ENOBUFS (with a warning) if the header does not fit.
 */
static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb,
                             struct netlink_callback *cb,
                             struct netlink_ext_ack *extack)
{
        struct nlmsghdr *nlh;
        size_t tlv_len;

        nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno),
                               NLM_F_MULTI | cb->answer_flags);
        if (WARN_ON(!nlh))
                return -ENOBUFS;

        nl_dump_check_consistent(cb, nlh);
        memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno));

        tlv_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack);
        if (!tlv_len)
                return 0;

        /* The flag is set even if the attributes end up not fitting. */
        nlh->nlmsg_flags |= NLM_F_ACK_TLVS;
        if (skb_tailroom(skb) < tlv_len)
                return 0;

        netlink_ack_tlv_fill(skb, cb->nlh, nlk->dump_done_errno, extack);
        nlmsg_end(skb, nlh);

        return 0;
}

/*
 * Produce and queue the next skb of the dump that is in progress on @sk.
 *
 * A reply skb is allocated and charged to sk->sk_rmem_alloc, cb->dump() is
 * called to fill it while nlk->dump_done_errno is still positive, and the
 * skb is handed to __netlink_sendskb() unless sk_filter() rejects it. Once
 * the callback has finished and the skb still has room, an NLMSG_DONE
 * message is appended, cb->done() is invoked and the dump state (cb_running,
 * the module reference and the request skb held in cb->skb) is released.
 *
 * @lock_taken: true when the caller already holds nlk->nl_cb_mutex. The
 *              mutex is released on every return path in either case.
 *
 * Returns 0 on success, -EINVAL if no dump is running on the socket, or
 * -ENOBUFS if no skb can be allocated, the receive memory accounting has
 * reached sk_rcvbuf, or the NLMSG_DONE message cannot be added.
 */
static int netlink_dump(struct sock *sk, bool lock_taken)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        struct netlink_ext_ack extack = {};
        struct netlink_callback *cb;
        struct sk_buff *skb = NULL;
        unsigned int rmem, rcvbuf;
        size_t max_recvmsg_len;
        struct module *module;
        int err = -ENOBUFS;
        int alloc_min_size;
        int alloc_size;

        if (!lock_taken)
                mutex_lock(&nlk->nl_cb_mutex);
        if (!nlk->cb_running) {
                err = -EINVAL;
                goto errout_skb;
        }

        /* NLMSG_GOODSIZE is small to avoid high order allocations being
         * required, but it makes sense to _attempt_ a 32KiB allocation
         * to reduce number of system calls on dump operations, if user
         * ever provided a big enough buffer.
         */
        cb = &nlk->cb;
        alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

        max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len);
        if (alloc_min_size < max_recvmsg_len) {
                /* Opportunistic large allocation: no direct reclaim, no
                 * warning and no retry, so a failure is cheap and silent.
                 */
                alloc_size = max_recvmsg_len;
                skb = alloc_skb(alloc_size,
                                (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
                                __GFP_NOWARN | __GFP_NORETRY);
        }
        if (!skb) {
                /* Fall back to the minimum size with a regular allocation. */
                alloc_size = alloc_min_size;
                skb = alloc_skb(alloc_size, GFP_KERNEL);
        }
        if (!skb)
                goto errout_skb;

        /* Charge the skb up front. It is accepted unconditionally when it
         * is the only thing charged (rmem == truesize); otherwise the
         * charge is undone and the dump backs off once rcvbuf is reached.
         */
        rcvbuf = READ_ONCE(sk->sk_rcvbuf);
        rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
        if (rmem != skb->truesize && rmem >= rcvbuf) {
                atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
                goto errout_skb;
        }

        /* Trim skb to allocated size. User is expected to provide buffer as
         * large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at
         * netlink_recvmsg())). dump will pack as many smaller messages as
         * could fit within the allocated skb. skb is typically allocated
         * with larger space than required (could be as much as near 2x the
         * requested size with align to next power of 2 approach). Allowing
         * dump to use the excess space makes it difficult for a user to have a
         * reasonable static buffer based on the expected largest dump of a
         * single netdev. The outcome is MSG_TRUNC error.
         */
        skb_reserve(skb, skb_tailroom(skb) - alloc_size);

        /* Make sure malicious BPF programs can not read uninitialized memory
         * from skb->head -> skb->data
         */
        skb_reset_network_header(skb);
        skb_reset_mac_header(skb);

        netlink_skb_set_owner_r(skb, sk);

        /* A positive dump_done_errno means the callback still has data. */
        if (nlk->dump_done_errno > 0) {
                cb->extack = &extack;

                nlk->dump_done_errno = cb->dump(skb, cb);

                /* EMSGSIZE plus something already in the skb means
                 * that there's more to dump but current skb has filled up.
                 * If the callback really wants to return EMSGSIZE to user space
                 * it needs to do so again, on the next cb->dump() call,
                 * without putting data in the skb.
                 */
                if (nlk->dump_done_errno == -EMSGSIZE && skb->len)
                        nlk->dump_done_errno = skb->len;

                cb->extack = NULL;
        }

        /* Either more data is pending, or the dump is finished but this skb
         * has no room left for NLMSG_DONE: send what we have and leave the
         * dump running so that a later call can continue.
         */
        if (nlk->dump_done_errno > 0 ||
            skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
                mutex_unlock(&nlk->nl_cb_mutex);

                if (sk_filter(sk, skb))
                        kfree_skb(skb);
                else
                        __netlink_sendskb(sk, skb);
                return 0;
        }

        if (netlink_dump_done(nlk, skb, cb, &extack))
                goto errout_skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
        /* frag_list skb's data is used for compat tasks
         * and the regular skb's data for normal (non-compat) tasks.
         * See netlink_recvmsg().
         */
        if (unlikely(skb_shinfo(skb)->frag_list)) {
                if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack))
                        goto errout_skb;
        }
#endif

        if (sk_filter(sk, skb))
                kfree_skb(skb);
        else
                __netlink_sendskb(sk, skb);

        if (cb->done)
                cb->done(cb);

        /* Tear down the dump state. The module and request skb are copied
         * out while the mutex is held and released after it is dropped.
         */
        WRITE_ONCE(nlk->cb_running, false);
        module = cb->module;
        skb = cb->skb;
        mutex_unlock(&nlk->nl_cb_mutex);
        module_put(module);
        consume_skb(skb);
        return 0;

errout_skb:
        mutex_unlock(&nlk->nl_cb_mutex);
        kfree_skb(skb);
        return err;
}

/*
 * Set up dump state on the socket that sent @skb and emit the first chunk.
 *
 * Takes an extra reference on @skb, looks up the requesting socket by the
 * portid stored in NETLINK_CB(skb), takes a reference on control->module,
 * fills nlk->cb from @control, calls the optional control->start() hook and
 * finally runs netlink_dump() with nl_cb_mutex held.
 *
 * Returns -EINTR when the dump was started (telling the caller not to send
 * an ACK), -ECONNREFUSED if no socket matches the portid, -EBUSY if a dump
 * is already running on that socket, -EPROTONOSUPPORT if the module
 * reference cannot be taken, the error returned by control->start(), or
 * the error returned by netlink_dump().
 */
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                         const struct nlmsghdr *nlh,
                         struct netlink_dump_control *control)
{
        struct netlink_callback *cb;
        struct netlink_sock *nlk;
        struct sock *sk;
        int ret;

        /* This reference ends up in cb->skb, or is dropped at error_free. */
        refcount_inc(&skb->users);

        sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
        if (sk == NULL) {
                ret = -ECONNREFUSED;
                goto error_free;
        }

        nlk = nlk_sk(sk);
        mutex_lock(&nlk->nl_cb_mutex);
        /* A dump is in progress... */
        if (nlk->cb_running) {
                ret = -EBUSY;
                goto error_unlock;
        }
        /* add reference of module which cb->dump belongs to */
        if (!try_module_get(control->module)) {
                ret = -EPROTONOSUPPORT;
                goto error_unlock;
        }

        cb = &nlk->cb;
        memset(cb, 0, sizeof(*cb));
        cb->dump = control->dump;
        cb->done = control->done;
        cb->nlh = nlh;
        cb->data = control->data;
        cb->module = control->module;
        cb->min_dump_alloc = control->min_dump_alloc;
        cb->flags = control->flags;
        cb->skb = skb;

        cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk);

        if (control->start) {
                /* extack is only exposed to the callback during start(). */
                cb->extack = control->extack;
                ret = control->start(cb);
                cb->extack = NULL;
                if (ret)
                        goto error_put;
        }

        WRITE_ONCE(nlk->cb_running, true);
        /* Any positive value makes netlink_dump() call cb->dump(). */
        nlk->dump_done_errno = INT_MAX;

        /* netlink_dump() releases nl_cb_mutex on all of its return paths. */
        ret = netlink_dump(sk, true);

        sock_put(sk);

        if (ret)
                return ret;

        /* We successfully started a dump, by returning -EINTR we
         * signal not to send ACK even if it was requested.
         */
        return -EINTR;

error_put:
        module_put(control->module);
error_unlock:
        sock_put(sk);
        mutex_unlock(&nlk->nl_cb_mutex);
error_free:
        kfree_skb(skb);
        return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);

/*
 * Send an NLMSG_ERROR reply for request @nlh back to the sender of @in_skb.
 *
 * @err is stored in the nlmsgerr payload together with a copy of the
 * request header. The request payload is appended as well when @err is
 * non-zero and the sending socket has not set NETLINK_F_CAP_ACK; otherwise
 * the reply is flagged NLM_F_CAPPED. Extended-ack TLVs derived from @extack
 * are added when netlink_ack_tlv_len() reports a non-zero length, in which
 * case NLM_F_ACK_TLVS is set.
 *
 * Nothing is returned: if the reply cannot be built, ENOBUFS is recorded in
 * the sending socket's sk_err and reported through sk_error_report().
 */
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
                 const struct netlink_ext_ack *extack)
{
        struct sk_buff *skb;
        struct nlmsghdr *rep;
        struct nlmsgerr *errmsg;
        size_t payload = sizeof(*errmsg);
        struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);
        unsigned int flags = 0;
        size_t tlvlen;

        /* Error messages get the original request appended, unless the user
         * requests to cap the error message, and get extra error data if
         * requested.
         */
        if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags))
                payload += nlmsg_len(nlh);
        else
                flags |= NLM_F_CAPPED;

        tlvlen = netlink_ack_tlv_len(nlk, err, extack);
        if (tlvlen)
                flags |= NLM_F_ACK_TLVS;

        skb = nlmsg_new(payload + tlvlen, GFP_KERNEL);
        if (!skb)
                goto err_skb;

        rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                        NLMSG_ERROR, sizeof(*errmsg), flags);
        if (!rep)
                goto err_bad_put;
        errmsg = nlmsg_data(rep);
        errmsg->error = err;
        errmsg->msg = *nlh;

        if (!(flags & NLM_F_CAPPED)) {
                /* Extend the reply, then copy the request payload so that
                 * it follows the echoed request header.
                 */
                if (!nlmsg_append(skb, nlmsg_len(nlh)))
                        goto err_bad_put;

                memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh),
                       nlmsg_len(nlh));
        }

        if (tlvlen)
                netlink_ack_tlv_fill(skb, nlh, err, extack);

        nlmsg_end(skb, rep);

        /* The result of the unicast is intentionally not checked here. */
        nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid);

        return;

err_bad_put:
        nlmsg_free(skb);
err_skb:
        WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS);
        sk_error_report(NETLINK_CB(in_skb).sk);
}
EXPORT_SYMBOL(netlink_ack);

/*
 * Walk every netlink message contained in @skb and dispatch it.
 *
 * Messages that carry NLM_F_REQUEST and have a type of at least
 * NLMSG_MIN_TYPE are passed to @cb. Afterwards an ACK/error reply is sent
 * via netlink_ack() when the message asked for one (NLM_F_ACK) or @cb
 * returned an error, except that -EINTR from @cb suppresses the reply.
 * Each processed message is pulled off the front of @skb.
 *
 * Processing stops silently at the first message whose nlmsg_len is smaller
 * than the header or larger than the remaining data. Always returns 0.
 */
int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
                                                   struct nlmsghdr *,
                                                   struct netlink_ext_ack *))
{
        struct netlink_ext_ack extack;
        struct nlmsghdr *nlh;
        int err;

        while (skb->len >= nlmsg_total_size(0)) {
                int msglen;

                /* Fresh extack and error state for every message. */
                memset(&extack, 0, sizeof(extack));
                nlh = nlmsg_hdr(skb);
                err = 0;

                if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
                        return 0;

                /* Only requests are handled by the kernel */
                if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
                        goto ack;

                /* Skip control messages */
                if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
                        goto ack;

                err = cb(skb, nlh, &extack);
                /* -EINTR: no reply at all (see __netlink_dump_start()). */
                if (err == -EINTR)
                        goto skip;

ack:
                if (nlh->nlmsg_flags & NLM_F_ACK || err)
                        netlink_ack(skb, nlh, err, &extack);

skip:
                /* Advance by the aligned length, clamped to what is left. */
                msglen = NLMSG_ALIGN(nlh->nlmsg_len);
                if (msglen > skb->len)
                        msglen = skb->len;
                skb_pull(skb, msglen);
        }

        return 0;
}
EXPORT_SYMBOL(netlink_rcv_skb);

/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
 * @portid: destination netlink portid for reports or 0
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 *
 * Multicasts @skb to @group when a group is given and, when @report is set,
 * additionally unicasts it to @portid, which is then excluded from the
 * multicast.
 *
 * Return: the multicast error if there was one (-ESRCH does not count as an
 * error), otherwise the result of the unicast, or 0 when no unicast is sent.
 */
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags)
{
        int mcast_err = 0;
        int ucast_err;

        if (group) {
                int exclude_portid = report ? portid : 0;

                /* The skb is used a second time by the unicast below. */
                if (report)
                        refcount_inc(&skb->users);

                /* errors reported via destination sk->sk_err, but propagate
                 * delivery errors if NETLINK_BROADCAST_ERROR flag is set
                 */
                mcast_err = nlmsg_multicast(sk, skb, exclude_portid, group,
                                            flags);
                if (mcast_err == -ESRCH)
                        mcast_err = 0;
        }

        if (!report)
                return mcast_err;

        ucast_err = nlmsg_unicast(sk, skb, portid);

        /* A multicast failure takes precedence over the unicast result. */
        return mcast_err ? mcast_err : ucast_err;
}
EXPORT_SYMBOL(nlmsg_notify);

#ifdef CONFIG_PROC_FS
/* Private iterator state of the "netlink" seq_file. */
struct nl_seq_iter {
        /* Per-net seq_file state. NOTE(review): presumably must stay first. */
        struct seq_net_private p;
        /* Walker over the hash table of the protocol currently visited. */
        struct rhashtable_iter hti;
        /* Index into nl_table[] of the protocol currently visited. */
        int link;
};

/* Begin walking the socket hash table of protocol nl_table[iter->link]. */
static void netlink_walk_start(struct nl_seq_iter *iter)
{
        struct rhashtable_iter *walker = &iter->hti;

        rhashtable_walk_enter(&nl_table[iter->link].hash, walker);
        rhashtable_walk_start(walker);
}

/* Finish the walk begun by netlink_walk_start(): stop it, then exit it. */
static void netlink_walk_stop(struct nl_seq_iter *iter)
{
        struct rhashtable_iter *walker = &iter->hti;

        rhashtable_walk_stop(walker);
        rhashtable_walk_exit(walker);
}

/*
 * Return the next socket that belongs to the seq_file's network namespace.
 *
 * The walk moves on to the next protocol table whenever the current one is
 * exhausted. Returns NULL once all MAX_LINKS tables have been visited (the
 * walk is already stopped in that case), or an ERR_PTR other than -EAGAIN
 * as handed back by rhashtable_walk_next().
 */
static void *__netlink_seq_next(struct seq_file *seq)
{
        struct nl_seq_iter *iter = seq->private;
        struct netlink_sock *nlk;

        for (;;) {
                nlk = rhashtable_walk_next(&iter->hti);

                if (IS_ERR(nlk)) {
                        /* -EAGAIN just means "try again"; anything else
                         * is passed on to the caller.
                         */
                        if (PTR_ERR(nlk) != -EAGAIN)
                                return nlk;
                        continue;
                }

                if (!nlk) {
                        /* This table is done: switch to the next one. */
                        netlink_walk_stop(iter);
                        if (++iter->link >= MAX_LINKS)
                                return NULL;

                        netlink_walk_start(iter);
                        continue;
                }

                /* Skip sockets that live in a different namespace. */
                if (sock_net(&nlk->sk) == seq_file_net(seq))
                        return nlk;
        }
}

/*
 * seq_file ->start: restart the walk at protocol 0 and skip forward to
 * position *@posp. Position 0 yields SEQ_START_TOKEN (the header line);
 * skipping ends early on NULL or an error pointer, which is then returned.
 */
static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
        __acquires(RCU)
{
        struct nl_seq_iter *iter = seq->private;
        void *cur = SEQ_START_TOKEN;
        loff_t remaining;

        iter->link = 0;

        netlink_walk_start(iter);

        remaining = *posp;
        while (remaining && cur && !IS_ERR(cur)) {
                cur = __netlink_seq_next(seq);
                remaining--;
        }

        return cur;
}

/* seq_file ->next: bump the position and fetch the following socket. */
static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        *pos += 1;

        return __netlink_seq_next(seq);
}

/*
 * seq_file ->stop for the native path. When the walk ran off the last
 * table, __netlink_seq_next() has already stopped it, so there is nothing
 * left to do in that case.
 */
static void netlink_native_seq_stop(struct seq_file *seq, void *v)
{
        struct nl_seq_iter *iter = seq->private;

        if (iter->link < MAX_LINKS)
                netlink_walk_stop(iter);
}


/*
 * seq_file ->show for the native path: print the column header for
 * SEQ_START_TOKEN, otherwise one line describing socket @v. Always
 * returns 0.
 */
static int netlink_native_seq_show(struct seq_file *seq, void *v)
{
        struct netlink_sock *nlk;
        struct sock *s;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "sk               Eth Pid        Groups   "
                         "Rmem     Wmem     Dump  Locks    Drops    Inode\n");
                return 0;
        }

        s = v;
        nlk = nlk_sk(s);

        /* Only the first word of the group bitmap is shown, if any. */
        seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n",
                   s,
                   s->sk_protocol,
                   nlk->portid,
                   nlk->groups ? (u32)nlk->groups[0] : 0,
                   sk_rmem_alloc_get(s),
                   sk_wmem_alloc_get(s),
                   READ_ONCE(nlk->cb_running),
                   refcount_read(&s->sk_refcnt),
                   sk_drops_read(s),
                   sock_i_ino(s));

        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* Context handed to a BPF program attached to the "netlink" iterator. */
struct bpf_iter__netlink {
        /* Iterator metadata, filled in by netlink_prog_seq_show(). */
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        /* Socket being visited, as produced by nlk_sk() on the seq item. */
        __bpf_md_ptr(struct netlink_sock *, sk);
};

DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)

/*
 * Run BPF iterator program @prog on seq item @v and return its result.
 * @v is treated as a struct sock pointer and converted with nlk_sk().
 */
static int netlink_prog_seq_show(struct bpf_prog *prog,
                                  struct bpf_iter_meta *meta,
                                  void *v)
{
        struct sock *sock = v;
        struct bpf_iter__netlink ctx;

        /* skip SEQ_START_TOKEN */
        meta->seq_num -= 1;

        ctx.meta = meta;
        ctx.sk = nlk_sk(sock);

        return bpf_iter_run_prog(prog, &ctx);
}

/*
 * seq_file ->show: use the native /proc output when no BPF program is
 * returned by bpf_iter_get_info(); otherwise run the program on every item
 * except the header token, for which 0 is returned.
 */
static int netlink_seq_show(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);
        if (!prog)
                return netlink_native_seq_show(seq, v);

        /* The header line has no meaning for a BPF program. */
        if (v == SEQ_START_TOKEN)
                return 0;

        return netlink_prog_seq_show(prog, &meta, v);
}

static void netlink_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        if (!v) {
                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)netlink_prog_seq_show(prog, &meta, v);
        }

        netlink_native_seq_stop(seq, v);
}
#else
/* Without CONFIG_BPF_SYSCALL, ->show is just the native /proc output. */
static int netlink_seq_show(struct seq_file *seq, void *v)
{
        return netlink_native_seq_show(seq, v);
}

/* Without CONFIG_BPF_SYSCALL, ->stop is just the native stop handling. */
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
        netlink_native_seq_stop(seq, v);
}
#endif

/* seq_file callbacks used for the "netlink" proc entry and the BPF iterator. */
static const struct seq_operations netlink_seq_ops = {
        .start  = netlink_seq_start,
        .next   = netlink_seq_next,
        .stop   = netlink_seq_stop,
        .show   = netlink_seq_show,
};
#endif

/*
 * Add @nb to the blocking notifier chain netlink_chain. Returns whatever
 * blocking_notifier_chain_register() returns.
 */
int netlink_register_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);

/*
 * Remove @nb from the blocking notifier chain netlink_chain. Returns
 * whatever blocking_notifier_chain_unregister() returns.
 */
int netlink_unregister_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);

/*
 * Socket-layer operations for PF_NETLINK sockets. Operations netlink does
 * not implement (socketpair, accept, listen, shutdown, mmap) point at the
 * generic sock_no_*() handlers; poll uses datagram_poll().
 */
static const struct proto_ops netlink_ops = {
        .family =        PF_NETLINK,
        .owner =        THIS_MODULE,
        .release =        netlink_release,
        .bind =                netlink_bind,
        .connect =        netlink_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        netlink_getname,
        .poll =                datagram_poll,
        .ioctl =        netlink_ioctl,
        .listen =        sock_no_listen,
        .shutdown =        sock_no_shutdown,
        .setsockopt =        netlink_setsockopt,
        .getsockopt =        netlink_getsockopt,
        .sendmsg =        netlink_sendmsg,
        .recvmsg =        netlink_recvmsg,
        .mmap =                sock_no_mmap,
};

/* Protocol family descriptor registered via sock_register() at init time. */
static const struct net_proto_family netlink_family_ops = {
        .family = PF_NETLINK,
        .create = netlink_create,
        .owner        = THIS_MODULE,        /* for consistency 8) */
};

/*
 * Per-namespace init: with CONFIG_PROC_FS, create the "netlink" entry under
 * net->proc_net backed by netlink_seq_ops. Returns 0, or -ENOMEM if the
 * entry cannot be created.
 */
static int __net_init netlink_net_init(struct net *net)
{
        int ret = 0;

#ifdef CONFIG_PROC_FS
        if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops,
                             sizeof(struct nl_seq_iter)))
                ret = -ENOMEM;
#endif

        return ret;
}

/* Per-namespace exit: remove the "netlink" proc entry, if procfs is built. */
static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("netlink", net->proc_net);
#endif
}

/*
 * Pre-register the NETLINK_USERSOCK protocol in nl_table with 32 multicast
 * groups and the NL_CFG_F_NONROOT_SEND flag. Panics if the listeners
 * bitmap cannot be allocated.
 */
static void __init netlink_add_usersock_entry(void)
{
        struct listeners *listeners;
        int groups = 32;

        listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
        if (!listeners)
                panic("netlink_add_usersock_entry: Cannot allocate listeners\n");

        /* All table fields are updated between grab and ungrab. */
        netlink_table_grab();

        nl_table[NETLINK_USERSOCK].groups = groups;
        rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
        nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
        nl_table[NETLINK_USERSOCK].registered = 1;
        nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;

        netlink_table_ungrab();
}

/* Per-network-namespace hooks, registered from netlink_proto_init(). */
static struct pernet_operations __net_initdata netlink_net_ops = {
        .init = netlink_net_init,
        .exit = netlink_net_exit,
};

/*
 * Object hash function for the socket rhashtable: builds a compare
 * argument from the socket's namespace and portid and hashes it with
 * jhash2() using @seed. @len is not used.
 */
static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
{
        const struct netlink_sock *nlk = data;
        struct netlink_compare_arg arg;
        u32 words = netlink_compare_arg_len / sizeof(u32);

        netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid);

        return jhash2((u32 *)&arg, words, seed);
}

/*
 * Parameters shared by every per-protocol socket hash table in nl_table:
 * entries are netlink_sock objects linked through their 'node' member,
 * hashed by netlink_hash() and compared by netlink_compare().
 */
static const struct rhashtable_params netlink_rhashtable_params = {
        .head_offset = offsetof(struct netlink_sock, node),
        .key_len = netlink_compare_arg_len,
        .obj_hashfn = netlink_hash,
        .obj_cmpfn = netlink_compare,
        .automatic_shrinking = true,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
BTF_ID_LIST_SINGLE(btf_netlink_sock_id, struct, netlink_sock)

/*
 * seq_file description for the BPF "netlink" iterator: it reuses the proc
 * seq_ops and the same private state size (struct nl_seq_iter).
 */
static const struct bpf_iter_seq_info netlink_seq_info = {
        .seq_ops                = &netlink_seq_ops,
        .init_seq_private        = bpf_iter_init_seq_net,
        .fini_seq_private        = bpf_iter_fini_seq_net,
        .seq_priv_size                = sizeof(struct nl_seq_iter),
};

/*
 * Registration record for the BPF iterator target "netlink". The single
 * context argument described here is the 'sk' member of
 * struct bpf_iter__netlink; its btf_id is filled in by bpf_iter_register().
 */
static struct bpf_iter_reg netlink_reg_info = {
        .target                        = "netlink",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                { offsetof(struct bpf_iter__netlink, sk),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info                = &netlink_seq_info,
};

/*
 * Fill in the BTF id of struct netlink_sock for the iterator's context
 * argument and register the "netlink" BPF iterator target. Returns the
 * result of bpf_iter_reg_target().
 */
static int __init bpf_iter_register(void)
{
        netlink_reg_info.ctx_arg_info[0].btf_id = btf_netlink_sock_id[0];

        return bpf_iter_reg_target(&netlink_reg_info);
}
#endif

/*
 * Boot-time initialisation of the netlink core.
 *
 * Registers the netlink proto (and, when configured, the BPF iterator),
 * allocates nl_table with MAX_LINKS entries and one rhashtable per entry,
 * sets up the NETLINK_USERSOCK entry, registers the socket family and the
 * per-namespace operations, and finally calls rtnetlink_init().
 *
 * Returns 0 on success or the error from proto_register() or
 * bpf_iter_register(). Failure to allocate nl_table or to initialise one
 * of its hash tables panics.
 */
static int __init netlink_proto_init(void)
{
        int i;
        int err = proto_register(&netlink_proto, 0);

        if (err != 0)
                goto out;

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        /* NOTE(review): a failure here returns without undoing
         * proto_register() above - confirm that this is intended.
         */
        err = bpf_iter_register();
        if (err)
                goto out;
#endif

        /* netlink_skb_parms is stored in skb->cb and must fit there. */
        BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));

        nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
        if (!nl_table)
                goto panic;

        for (i = 0; i < MAX_LINKS; i++) {
                if (rhashtable_init(&nl_table[i].hash,
                                    &netlink_rhashtable_params) < 0)
                        goto panic;
        }

        netlink_add_usersock_entry();

        sock_register(&netlink_family_ops);
        register_pernet_subsys(&netlink_net_ops);
        register_pernet_subsys(&netlink_tap_net_ops);
        /* The netlink device handler may be needed early. */
        rtnetlink_init();
out:
        return err;
panic:
        /* Also reached when rhashtable_init() fails, despite the message. */
        panic("netlink_init: Cannot allocate nl_table\n");
}

core_initcall(netlink_proto_init);


























































































  317 































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_U64_STATS_SYNC_H
#define _LINUX_U64_STATS_SYNC_H

/*
 * Protect against 64-bit values tearing on 32-bit architectures. This is
 * typically used for statistics read/update in different subsystems.
 *
 * Key points :
 *
 * -  Use a seqcount on 32-bit
 * -  The whole thing is a no-op on 64-bit architectures.
 *
 * Usage constraints:
 *
 * 1) Write side must ensure mutual exclusion, or one seqcount update could
 *    be lost, thus blocking readers forever.
 *
 * 2) Write side must disable preemption, or a seqcount reader can preempt the
 *    writer and also spin forever.
 *
 * 3) Write side must use the _irqsave() variant if other writers, or a reader,
 *    can be invoked from an IRQ context. On 64bit systems this variant does not
 *    disable interrupts.
 *
 * 4) If reader fetches several counters, there is no guarantee the whole values
 *    are consistent w.r.t. each other (remember point #2: seqcounts are not
 *    used for 64bit architectures).
 *
 * 5) Readers are allowed to sleep or be preempted/interrupted: they perform
 *    pure reads.
 *
 * Usage :
 *
 * A stats producer (writer) should use the following template, provided it
 * already has exclusive access to the counters (a lock is already taken, or
 * per-CPU data is used [in a non-preemptible context]):
 *
 *   spin_lock_bh(...) or other synchronization to get exclusive access
 *   ...
 *   u64_stats_update_begin(&stats->syncp);
 *   u64_stats_add(&stats->bytes64, len); // non atomic operation
 *   u64_stats_inc(&stats->packets64);    // non atomic operation
 *   u64_stats_update_end(&stats->syncp);
 *
 * A consumer (reader), in turn, should use the following template to get a
 * consistent snapshot of each variable (with no guarantee across several):
 *
 * u64 tbytes, tpackets;
 * unsigned int start;
 *
 * do {
 *         start = u64_stats_fetch_begin(&stats->syncp);
 *         tbytes = u64_stats_read(&stats->bytes64); // non atomic operation
 *         tpackets = u64_stats_read(&stats->packets64); // non atomic operation
 * } while (u64_stats_fetch_retry(&stats->syncp, start));
 *
 *
 * Example of use in drivers/net/loopback.c, using per_cpu containers,
 * in BH disabled context.
 */
#include <linux/seqlock.h>

/*
 * Synchronisation handle that accompanies a set of u64_stats_t counters.
 * It only carries a seqcount when BITS_PER_LONG == 32; otherwise the
 * structure has no members.
 */
struct u64_stats_sync {
#if BITS_PER_LONG == 32
        seqcount_t        seq;
#endif
};

#if BITS_PER_LONG == 64
#include <asm/local64.h>

/* Counter type when BITS_PER_LONG == 64: a wrapper around local64_t. */
typedef struct {
        local64_t        v;
} u64_stats_t ;

/* Return the current value of counter @p (via local64_read()). */
static inline u64 u64_stats_read(const u64_stats_t *p)
{
        return local64_read(&p->v);
}

/* Overwrite counter @p with @val (via local64_set()). */
static inline void u64_stats_set(u64_stats_t *p, u64 val)
{
        local64_set(&p->v, val);
}

/* Add @val to counter @p (via local64_add()). */
static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
{
        local64_add(val, &p->v);
}

/* Increment counter @p by one (via local64_inc()). */
static inline void u64_stats_inc(u64_stats_t *p)
{
        local64_inc(&p->v);
}

/*
 * BITS_PER_LONG == 64: there is no seqcount, so initialisation, the write
 * side begin/end hooks and the irq save/restore helpers all do nothing
 * (__u64_stats_irqsave() returns 0 as a dummy flags value).
 */
static inline void u64_stats_init(struct u64_stats_sync *syncp) { }
static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { }
static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { }
static inline unsigned long __u64_stats_irqsave(void) { return 0; }
static inline void __u64_stats_irqrestore(unsigned long flags) { }
/* BITS_PER_LONG == 64: no sequence to sample, always returns 0. */
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
        return 0;
}
/* BITS_PER_LONG == 64: a read never has to be retried, always false. */
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
                                           unsigned int start)
{
        return false;
}

#else /* 64 bit */

/*
 * Counter type when BITS_PER_LONG != 64: a plain u64. Its accessors are
 * ordinary, non-atomic operations; tearing is handled by the seqcount in
 * struct u64_stats_sync.
 */
typedef struct {
        u64                v;
} u64_stats_t;

/* Return the value of counter @p with a plain load. */
static inline u64 u64_stats_read(const u64_stats_t *p)
{
        return p->v;
}

/* Overwrite counter @p with @val using a plain store. */
static inline void u64_stats_set(u64_stats_t *p, u64 val)
{
        p->v = val;
}

/* Add @val to counter @p (plain read-modify-write). */
static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
{
        p->v += val;
}

/* Increment counter @p by one (plain read-modify-write). */
static inline void u64_stats_inc(u64_stats_t *p)
{
        p->v++;
}

/*
 * Initialise the seqcount embedded in @syncp. @syncp is evaluated exactly
 * once, into the local __s.
 */
#define u64_stats_init(syncp)                                \
        do {                                                \
                struct u64_stats_sync *__s = (syncp);        \
                seqcount_init(&__s->seq);                \
        } while (0)

/*
 * Enter the write side: preempt_disable_nested() is called first, then the
 * seqcount write section is opened.
 */
static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp)
{
        preempt_disable_nested();
        write_seqcount_begin(&syncp->seq);
}

/*
 * Leave the write side, undoing __u64_stats_update_begin() in reverse
 * order: close the seqcount write section, then preempt_enable_nested().
 */
static inline void __u64_stats_update_end(struct u64_stats_sync *syncp)
{
        write_seqcount_end(&syncp->seq);
        preempt_enable_nested();
}

/*
 * Disable local interrupts with local_irq_save() and return the saved
 * flags for a later __u64_stats_irqrestore().
 */
static inline unsigned long __u64_stats_irqsave(void)
{
        unsigned long flags;

        local_irq_save(flags);
        return flags;
}

/* Restore the interrupt state saved by __u64_stats_irqsave(). */
static inline void __u64_stats_irqrestore(unsigned long flags)
{
        local_irq_restore(flags);
}

/*
 * Start a read section: returns the sequence value from
 * read_seqcount_begin(), to be passed to __u64_stats_fetch_retry().
 */
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
        return read_seqcount_begin(&syncp->seq);
}

/*
 * End a read section: returns the result of read_seqcount_retry() for the
 * @start value obtained from __u64_stats_fetch_begin(); true means the
 * reader has to loop again (see the usage template at the top of the file).
 */
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
                                           unsigned int start)
{
        return read_seqcount_retry(&syncp->seq, start);
}
#endif /* !64 bit */

/*
 * Begin a counter update on @syncp. The caller must already have exclusive
 * access to the counters (see the usage constraints at the top of the file).
 */
static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
{
        __u64_stats_update_begin(syncp);
}

/* End a counter update started with u64_stats_update_begin(). */
static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
{
        __u64_stats_update_end(syncp);
}

/*
 * Begin a counter update for writers that may race with IRQ context: the
 * irq-save helper runs first, then the update section is opened. Returns
 * the flags to pass to u64_stats_update_end_irqrestore(). On 64-bit both
 * helpers are no-ops, so interrupts are not actually disabled there.
 */
static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
{
        unsigned long flags = __u64_stats_irqsave();

        __u64_stats_update_begin(syncp);
        return flags;
}

/*
 * End an update started with u64_stats_update_begin_irqsave(): close the
 * update section, then restore the interrupt state from @flags.
 */
static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
                                                   unsigned long flags)
{
        __u64_stats_update_end(syncp);
        __u64_stats_irqrestore(flags);
}

/*
 * Reader side: start a snapshot and return the token to hand to
 * u64_stats_fetch_retry().
 */
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
        return __u64_stats_fetch_begin(syncp);
}

/*
 * Reader side: returns true when the values read since
 * u64_stats_fetch_begin() returned @start must be read again.
 */
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
                                         unsigned int start)
{
        return __u64_stats_fetch_retry(syncp, start);
}

#endif /* _LINUX_U64_STATS_SYNC_H */













































































































































































































































































































































































































































































































































































































































































































































































  318 







































































































































































































  316 




































































































































































































































































































































































































































































































































































































































































































































































   39 























































































































































































































































































































































































































































   12 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   39 





















































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-fallback.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_FALLBACK_H
#define _LINUX_ATOMIC_FALLBACK_H

#include <linux/compiler.h>

/*
 * raw_xchg(): choose the implementation, in order of preference:
 *   1. arch_xchg(), when the architecture defines it;
 *   2. otherwise __atomic_op_fence() applied to the arch_xchg base name,
 *      when arch_xchg_relaxed() exists (the helper is defined elsewhere;
 *      presumably it builds on the _relaxed op - confirm);
 *   3. otherwise every use becomes a call to raw_xchg_not_implemented(),
 *      which is only declared here (presumably never defined, so a stray
 *      use fails to link - confirm).
 */
#ifdef arch_xchg
#define raw_xchg arch_xchg
#elif defined(arch_xchg_relaxed)
#define raw_xchg(...) __atomic_op_fence(arch_xchg, __VA_ARGS__)
#else /* no arch_xchg() and no arch_xchg_relaxed() */
void raw_xchg_not_implemented(void);
#define raw_xchg(...) raw_xchg_not_implemented()
#endif /* raw_xchg */

/*
 * raw_xchg_acquire(): choose the implementation, in order of preference:
 *   1. arch_xchg_acquire(), when the architecture defines it;
 *   2. otherwise __atomic_op_acquire() applied to the arch_xchg base name,
 *      when arch_xchg_relaxed() exists (the helper is defined elsewhere;
 *      presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_xchg(), when that exists;
 *   4. otherwise every use becomes a call to
 *      raw_xchg_acquire_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_xchg_acquire
#define raw_xchg_acquire arch_xchg_acquire
#elif defined(arch_xchg_relaxed)
#define raw_xchg_acquire(...) __atomic_op_acquire(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_acquire arch_xchg
#else /* no arch_xchg*() variant usable here */
void raw_xchg_acquire_not_implemented(void);
#define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented()
#endif /* raw_xchg_acquire */

/*
 * raw_xchg_release(): choose the implementation, in order of preference:
 *   1. arch_xchg_release(), when the architecture defines it;
 *   2. otherwise __atomic_op_release() applied to the arch_xchg base name,
 *      when arch_xchg_relaxed() exists (the helper is defined elsewhere;
 *      presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_xchg(), when that exists;
 *   4. otherwise every use becomes a call to
 *      raw_xchg_release_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_xchg_release
#define raw_xchg_release arch_xchg_release
#elif defined(arch_xchg_relaxed)
#define raw_xchg_release(...) __atomic_op_release(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_release arch_xchg
#else /* no arch_xchg*() variant usable here */
void raw_xchg_release_not_implemented(void);
#define raw_xchg_release(...) raw_xchg_release_not_implemented()
#endif /* raw_xchg_release */

/*
 * raw_xchg_relaxed(): choose the implementation, in order of preference:
 *   1. arch_xchg_relaxed(), when the architecture defines it;
 *   2. otherwise plain arch_xchg(), when that exists;
 *   3. otherwise every use becomes a call to
 *      raw_xchg_relaxed_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_xchg_relaxed
#define raw_xchg_relaxed arch_xchg_relaxed
#elif defined(arch_xchg)
#define raw_xchg_relaxed arch_xchg
#else /* no arch_xchg_relaxed() and no arch_xchg() */
void raw_xchg_relaxed_not_implemented(void);
#define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented()
#endif /* raw_xchg_relaxed */

/*
 * raw_cmpxchg(): choose the implementation, in order of preference:
 *   1. arch_cmpxchg(), when the architecture defines it;
 *   2. otherwise __atomic_op_fence() applied to the arch_cmpxchg base name,
 *      when arch_cmpxchg_relaxed() exists (the helper is defined elsewhere;
 *      presumably it builds on the _relaxed op - confirm);
 *   3. otherwise every use becomes a call to raw_cmpxchg_not_implemented(),
 *      which is only declared here (presumably never defined, so a stray
 *      use fails to link - confirm).
 */
#ifdef arch_cmpxchg
#define raw_cmpxchg arch_cmpxchg
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg(...) __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
#else /* no arch_cmpxchg() and no arch_cmpxchg_relaxed() */
void raw_cmpxchg_not_implemented(void);
#define raw_cmpxchg(...) raw_cmpxchg_not_implemented()
#endif /* raw_cmpxchg */

/*
 * raw_cmpxchg_acquire(): choose the implementation, in order of preference:
 *   1. arch_cmpxchg_acquire(), when the architecture defines it;
 *   2. otherwise __atomic_op_acquire() applied to the arch_cmpxchg base
 *      name, when arch_cmpxchg_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_cmpxchg(), when that exists;
 *   4. otherwise every use becomes a call to
 *      raw_cmpxchg_acquire_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_cmpxchg_acquire
#define raw_cmpxchg_acquire arch_cmpxchg_acquire
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_acquire(...) __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_acquire arch_cmpxchg
#else /* no arch_cmpxchg*() variant usable here */
void raw_cmpxchg_acquire_not_implemented(void);
#define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented()
#endif /* raw_cmpxchg_acquire */

/*
 * raw_cmpxchg_release(): choose the implementation, in order of preference:
 *   1. arch_cmpxchg_release(), when the architecture defines it;
 *   2. otherwise __atomic_op_release() applied to the arch_cmpxchg base
 *      name, when arch_cmpxchg_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_cmpxchg(), when that exists;
 *   4. otherwise every use becomes a call to
 *      raw_cmpxchg_release_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_cmpxchg_release
#define raw_cmpxchg_release arch_cmpxchg_release
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_release(...) __atomic_op_release(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_release arch_cmpxchg
#else /* no arch_cmpxchg*() variant usable here */
void raw_cmpxchg_release_not_implemented(void);
#define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented()
#endif /* raw_cmpxchg_release */

/*
 * raw_cmpxchg_relaxed(): choose the implementation, in order of preference:
 *   1. arch_cmpxchg_relaxed(), when the architecture defines it;
 *   2. otherwise plain arch_cmpxchg(), when that exists;
 *   3. otherwise every use becomes a call to
 *      raw_cmpxchg_relaxed_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_cmpxchg_relaxed
#define raw_cmpxchg_relaxed arch_cmpxchg_relaxed
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_relaxed arch_cmpxchg
#else /* no arch_cmpxchg_relaxed() and no arch_cmpxchg() */
void raw_cmpxchg_relaxed_not_implemented(void);
#define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented()
#endif /* raw_cmpxchg_relaxed */

/*
 * raw_cmpxchg64(): choose the implementation, in order of preference:
 *   1. arch_cmpxchg64(), when the architecture defines it;
 *   2. otherwise __atomic_op_fence() applied to the arch_cmpxchg64 base
 *      name, when arch_cmpxchg64_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise every use becomes a call to
 *      raw_cmpxchg64_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_cmpxchg64
#define raw_cmpxchg64 arch_cmpxchg64
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64(...) __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__)
#else /* no arch_cmpxchg64() and no arch_cmpxchg64_relaxed() */
void raw_cmpxchg64_not_implemented(void);
#define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented()
#endif /* raw_cmpxchg64 */

/*
 * raw_cmpxchg64_acquire(): choose the implementation, in order of
 * preference:
 *   1. arch_cmpxchg64_acquire(), when the architecture defines it;
 *   2. otherwise __atomic_op_acquire() applied to the arch_cmpxchg64 base
 *      name, when arch_cmpxchg64_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_cmpxchg64(), when that exists;
 *   4. otherwise every use becomes a call to
 *      raw_cmpxchg64_acquire_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_cmpxchg64_acquire
#define raw_cmpxchg64_acquire arch_cmpxchg64_acquire
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_acquire(...) __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__)
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_acquire arch_cmpxchg64
#else /* no arch_cmpxchg64*() variant usable here */
void raw_cmpxchg64_acquire_not_implemented(void);
#define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented()
#endif /* raw_cmpxchg64_acquire */

/*
 * raw_cmpxchg64_release(): choose the implementation, in order of
 * preference:
 *   1. arch_cmpxchg64_release(), when the architecture defines it;
 *   2. otherwise __atomic_op_release() applied to the arch_cmpxchg64 base
 *      name, when arch_cmpxchg64_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_cmpxchg64(), when that exists;
 *   4. otherwise every use becomes a call to
 *      raw_cmpxchg64_release_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_cmpxchg64_release
#define raw_cmpxchg64_release arch_cmpxchg64_release
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_release(...) __atomic_op_release(arch_cmpxchg64, __VA_ARGS__)
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_release arch_cmpxchg64
#else /* no arch_cmpxchg64*() variant usable here */
void raw_cmpxchg64_release_not_implemented(void);
#define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented()
#endif /* raw_cmpxchg64_release */

/*
 * raw_cmpxchg64_relaxed(): choose the implementation, in order of
 * preference:
 *   1. arch_cmpxchg64_relaxed(), when the architecture defines it;
 *   2. otherwise plain arch_cmpxchg64(), when that exists;
 *   3. otherwise every use becomes a call to
 *      raw_cmpxchg64_relaxed_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_cmpxchg64_relaxed
#define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_relaxed arch_cmpxchg64
#else /* no arch_cmpxchg64_relaxed() and no arch_cmpxchg64() */
void raw_cmpxchg64_relaxed_not_implemented(void);
#define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented()
#endif /* raw_cmpxchg64_relaxed */

/*
 * raw_cmpxchg128(): choose the implementation, in order of preference:
 *   1. arch_cmpxchg128(), when the architecture defines it;
 *   2. otherwise __atomic_op_fence() applied to the arch_cmpxchg128 base
 *      name, when arch_cmpxchg128_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise every use becomes a call to
 *      raw_cmpxchg128_not_implemented(), which is only declared here
 *      (presumably never defined, so a stray use fails to link - confirm).
 */
#ifdef arch_cmpxchg128
#define raw_cmpxchg128 arch_cmpxchg128
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128(...) __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__)
#else /* no arch_cmpxchg128() and no arch_cmpxchg128_relaxed() */
void raw_cmpxchg128_not_implemented(void);
#define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented()
#endif /* raw_cmpxchg128 */

/*
 * raw_cmpxchg128_acquire(): choose the implementation, in order of
 * preference:
 *   1. arch_cmpxchg128_acquire(), when the architecture defines it;
 *   2. otherwise __atomic_op_acquire() applied to the arch_cmpxchg128 base
 *      name, when arch_cmpxchg128_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_cmpxchg128(), when that exists;
 *   4. otherwise every use becomes a call to
 *      raw_cmpxchg128_acquire_not_implemented(), which is only declared
 *      here (presumably never defined, so a stray use fails to link -
 *      confirm).
 */
#ifdef arch_cmpxchg128_acquire
#define raw_cmpxchg128_acquire arch_cmpxchg128_acquire
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_acquire(...) __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__)
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_acquire arch_cmpxchg128
#else /* no arch_cmpxchg128*() variant usable here */
void raw_cmpxchg128_acquire_not_implemented(void);
#define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented()
#endif /* raw_cmpxchg128_acquire */

/*
 * raw_cmpxchg128_release(): choose the implementation, in order of
 * preference:
 *   1. arch_cmpxchg128_release(), when the architecture defines it;
 *   2. otherwise __atomic_op_release() applied to the arch_cmpxchg128 base
 *      name, when arch_cmpxchg128_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_cmpxchg128(), when that exists;
 *   4. otherwise every use becomes a call to
 *      raw_cmpxchg128_release_not_implemented(), which is only declared
 *      here (presumably never defined, so a stray use fails to link -
 *      confirm).
 */
#ifdef arch_cmpxchg128_release
#define raw_cmpxchg128_release arch_cmpxchg128_release
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_release(...) __atomic_op_release(arch_cmpxchg128, __VA_ARGS__)
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_release arch_cmpxchg128
#else /* no arch_cmpxchg128*() variant usable here */
void raw_cmpxchg128_release_not_implemented(void);
#define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented()
#endif /* raw_cmpxchg128_release */

/*
 * raw_cmpxchg128_relaxed(): choose the implementation, in order of
 * preference:
 *   1. arch_cmpxchg128_relaxed(), when the architecture defines it;
 *   2. otherwise plain arch_cmpxchg128(), when that exists;
 *   3. otherwise every use becomes a call to
 *      raw_cmpxchg128_relaxed_not_implemented(), which is only declared
 *      here (presumably never defined, so a stray use fails to link -
 *      confirm).
 */
#ifdef arch_cmpxchg128_relaxed
#define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_relaxed arch_cmpxchg128
#else /* no arch_cmpxchg128_relaxed() and no arch_cmpxchg128() */
void raw_cmpxchg128_relaxed_not_implemented(void);
#define raw_cmpxchg128_relaxed(...) raw_cmpxchg128_relaxed_not_implemented()
#endif /* raw_cmpxchg128_relaxed */

/*
 * raw_try_cmpxchg(): choose the implementation, in order of preference:
 *   1. arch_try_cmpxchg(), when the architecture defines it;
 *   2. otherwise __atomic_op_fence() applied to the arch_try_cmpxchg base
 *      name, when arch_try_cmpxchg_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise a generic version built on raw_cmpxchg(): the value read
 *      from *_oldp is passed as the expected value; when raw_cmpxchg()
 *      returns something different, that returned value is written back to
 *      *_oldp.  The macro's value is the comparison "returned == expected",
 *      wrapped in likely().
 *
 * The ___op/___o/___r locals are in scope while _ptr and _new are
 * evaluated, so those argument expressions must not refer to identically
 * named variables.
 */
#ifdef arch_try_cmpxchg
#define raw_try_cmpxchg arch_try_cmpxchg
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg(...) __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__)
#else /* generic fallback on top of raw_cmpxchg() */
#define raw_try_cmpxchg(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        ___r = raw_cmpxchg((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg */

/*
 * raw_try_cmpxchg_acquire(): choose the implementation, in order of
 * preference:
 *   1. arch_try_cmpxchg_acquire(), when the architecture defines it;
 *   2. otherwise __atomic_op_acquire() applied to the arch_try_cmpxchg base
 *      name, when arch_try_cmpxchg_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_try_cmpxchg(), when that exists;
 *   4. otherwise a generic version built on raw_cmpxchg_acquire(): the
 *      value read from *_oldp is passed as the expected value; when
 *      raw_cmpxchg_acquire() returns something different, that returned
 *      value is written back to *_oldp.  The macro's value is the
 *      comparison "returned == expected", wrapped in likely().
 *
 * The ___op/___o/___r locals are in scope while _ptr and _new are
 * evaluated, so those argument expressions must not refer to identically
 * named variables.
 */
#ifdef arch_try_cmpxchg_acquire
#define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_acquire(...) __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__)
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_acquire arch_try_cmpxchg
#else /* generic fallback on top of raw_cmpxchg_acquire() */
#define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg_acquire */

/*
 * raw_try_cmpxchg_release(): choose the implementation, in order of
 * preference:
 *   1. arch_try_cmpxchg_release(), when the architecture defines it;
 *   2. otherwise __atomic_op_release() applied to the arch_try_cmpxchg base
 *      name, when arch_try_cmpxchg_relaxed() exists (the helper is defined
 *      elsewhere; presumably it builds on the _relaxed op - confirm);
 *   3. otherwise plain arch_try_cmpxchg(), when that exists;
 *   4. otherwise a generic version built on raw_cmpxchg_release(): the
 *      value read from *_oldp is passed as the expected value; when
 *      raw_cmpxchg_release() returns something different, that returned
 *      value is written back to *_oldp.  The macro's value is the
 *      comparison "returned == expected", wrapped in likely().
 *
 * The ___op/___o/___r locals are in scope while _ptr and _new are
 * evaluated, so those argument expressions must not refer to identically
 * named variables.
 */
#ifdef arch_try_cmpxchg_release
#define raw_try_cmpxchg_release arch_try_cmpxchg_release
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_release(...) __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__)
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_release arch_try_cmpxchg
#else /* generic fallback on top of raw_cmpxchg_release() */
#define raw_try_cmpxchg_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg_release */

/*
 * raw_try_cmpxchg_relaxed(): choose the implementation, in order of
 * preference:
 *   1. arch_try_cmpxchg_relaxed(), when the architecture defines it;
 *   2. otherwise plain arch_try_cmpxchg(), when that exists;
 *   3. otherwise a generic version built on raw_cmpxchg_relaxed(): the
 *      value read from *_oldp is passed as the expected value; when
 *      raw_cmpxchg_relaxed() returns something different, that returned
 *      value is written back to *_oldp.  The macro's value is the
 *      comparison "returned == expected", wrapped in likely().
 *
 * The ___op/___o/___r locals are in scope while _ptr and _new are
 * evaluated, so those argument expressions must not refer to identically
 * named variables.
 */
#ifdef arch_try_cmpxchg_relaxed
#define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_relaxed arch_try_cmpxchg
#else /* generic fallback on top of raw_cmpxchg_relaxed() */
#define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg_relaxed */

/*
 * raw_try_cmpxchg64(): choose the implementation, in order of preference:
 *   1. arch_try_cmpxchg64(), when the architecture defines it;
 *   2. otherwise __atomic_op_fence() applied to the arch_try_cmpxchg64 base
 *      name, when arch_try_cmpxchg64_relaxed() exists (the helper is
 *      defined elsewhere; presumably it builds on the _relaxed op -
 *      confirm);
 *   3. otherwise a generic version built on raw_cmpxchg64(): the value read
 *      from *_oldp is passed as the expected value; when raw_cmpxchg64()
 *      returns something different, that returned value is written back to
 *      *_oldp.  The macro's value is the comparison "returned == expected",
 *      wrapped in likely().
 *
 * The ___op/___o/___r locals are in scope while _ptr and _new are
 * evaluated, so those argument expressions must not refer to identically
 * named variables.
 */
#ifdef arch_try_cmpxchg64
#define raw_try_cmpxchg64 arch_try_cmpxchg64
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64(...) __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__)
#else /* generic fallback on top of raw_cmpxchg64() */
#define raw_try_cmpxchg64(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg64 */

/*
 * raw_try_cmpxchg64_acquire(): choose an implementation, in this order:
 *  1. arch_try_cmpxchg64_acquire, when the architecture defines it;
 *  2. arch_try_cmpxchg64 handed to __atomic_op_acquire() (defined
 *     elsewhere), when the architecture defines arch_try_cmpxchg64_relaxed;
 *  3. arch_try_cmpxchg64 unchanged, when only that is defined;
 *  4. otherwise a generic version built on raw_cmpxchg64_acquire().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg64_acquire() and compares the value that call yields against
 * the snapshot. On a mismatch the yielded value is stored back through
 * _oldp. The statement expression evaluates to the (___r == ___o)
 * comparison, wrapped in likely().
 */
#if defined(arch_try_cmpxchg64_acquire)
#define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_acquire(...) \
        __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__)
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_acquire arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64_release(): choose an implementation, in this order:
 *  1. arch_try_cmpxchg64_release, when the architecture defines it;
 *  2. arch_try_cmpxchg64 handed to __atomic_op_release() (defined
 *     elsewhere), when the architecture defines arch_try_cmpxchg64_relaxed;
 *  3. arch_try_cmpxchg64 unchanged, when only that is defined;
 *  4. otherwise a generic version built on raw_cmpxchg64_release().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg64_release() and compares the value that call yields against
 * the snapshot. On a mismatch the yielded value is stored back through
 * _oldp. The statement expression evaluates to the (___r == ___o)
 * comparison, wrapped in likely().
 */
#if defined(arch_try_cmpxchg64_release)
#define raw_try_cmpxchg64_release arch_try_cmpxchg64_release
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_release(...) \
        __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__)
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_release arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64_relaxed(): choose an implementation, in this order:
 *  1. arch_try_cmpxchg64_relaxed, when the architecture defines it;
 *  2. arch_try_cmpxchg64, when only that is defined;
 *  3. otherwise a generic version built on raw_cmpxchg64_relaxed().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg64_relaxed() and compares the value that call yields against
 * the snapshot. On a mismatch the yielded value is stored back through
 * _oldp. The statement expression evaluates to the (___r == ___o)
 * comparison, wrapped in likely().
 */
#if defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128(): choose an implementation, in this order:
 *  1. arch_try_cmpxchg128, when the architecture defines it;
 *  2. arch_try_cmpxchg128 handed to __atomic_op_fence() (defined elsewhere),
 *     when the architecture defines arch_try_cmpxchg128_relaxed;
 *  3. otherwise a generic version built on raw_cmpxchg128().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg128() and compares the value that call yields against the
 * snapshot. On a mismatch the yielded value is stored back through _oldp.
 * The statement expression evaluates to the (___r == ___o) comparison,
 * wrapped in likely().
 */
#if defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128 arch_try_cmpxchg128
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128(...) \
        __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__)
#else
#define raw_try_cmpxchg128(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_acquire(): choose an implementation, in this order:
 *  1. arch_try_cmpxchg128_acquire, when the architecture defines it;
 *  2. arch_try_cmpxchg128 handed to __atomic_op_acquire() (defined
 *     elsewhere), when the architecture defines arch_try_cmpxchg128_relaxed;
 *  3. arch_try_cmpxchg128 unchanged, when only that is defined;
 *  4. otherwise a generic version built on raw_cmpxchg128_acquire().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg128_acquire() and compares the value that call yields against
 * the snapshot. On a mismatch the yielded value is stored back through
 * _oldp. The statement expression evaluates to the (___r == ___o)
 * comparison, wrapped in likely().
 */
#if defined(arch_try_cmpxchg128_acquire)
#define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_acquire(...) \
        __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__)
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_acquire arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_release(): choose an implementation, in this order:
 *  1. arch_try_cmpxchg128_release, when the architecture defines it;
 *  2. arch_try_cmpxchg128 handed to __atomic_op_release() (defined
 *     elsewhere), when the architecture defines arch_try_cmpxchg128_relaxed;
 *  3. arch_try_cmpxchg128 unchanged, when only that is defined;
 *  4. otherwise a generic version built on raw_cmpxchg128_release().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg128_release() and compares the value that call yields against
 * the snapshot. On a mismatch the yielded value is stored back through
 * _oldp. The statement expression evaluates to the (___r == ___o)
 * comparison, wrapped in likely().
 */
#if defined(arch_try_cmpxchg128_release)
#define raw_try_cmpxchg128_release arch_try_cmpxchg128_release
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_release(...) \
        __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__)
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_release arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_relaxed(): choose an implementation, in this order:
 *  1. arch_try_cmpxchg128_relaxed, when the architecture defines it;
 *  2. arch_try_cmpxchg128, when only that is defined;
 *  3. otherwise a generic version built on raw_cmpxchg128_relaxed().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg128_relaxed() and compares the value that call yields against
 * the snapshot. On a mismatch the yielded value is stored back through
 * _oldp. The statement expression evaluates to the (___r == ___o)
 * comparison, wrapped in likely().
 */
#if defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/* raw_cmpxchg_local is an unconditional alias for arch_cmpxchg_local. */
#define raw_cmpxchg_local arch_cmpxchg_local

/*
 * raw_try_cmpxchg_local(): use arch_try_cmpxchg_local when the architecture
 * defines it; otherwise fall back to a generic version built on
 * raw_cmpxchg_local().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg_local() and compares the value that call yields against the
 * snapshot. On a mismatch the yielded value is stored back through _oldp.
 * The statement expression evaluates to the (___r == ___o) comparison,
 * wrapped in likely().
 */
#ifdef arch_try_cmpxchg_local
#define raw_try_cmpxchg_local arch_try_cmpxchg_local
#else
#define raw_try_cmpxchg_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/* raw_cmpxchg64_local is an unconditional alias for arch_cmpxchg64_local. */
#define raw_cmpxchg64_local arch_cmpxchg64_local

/*
 * raw_try_cmpxchg64_local(): use arch_try_cmpxchg64_local when the
 * architecture defines it; otherwise fall back to a generic version built on
 * raw_cmpxchg64_local().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg64_local() and compares the value that call yields against the
 * snapshot. On a mismatch the yielded value is stored back through _oldp.
 * The statement expression evaluates to the (___r == ___o) comparison,
 * wrapped in likely().
 */
#ifdef arch_try_cmpxchg64_local
#define raw_try_cmpxchg64_local arch_try_cmpxchg64_local
#else
#define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/* raw_cmpxchg128_local is an unconditional alias for arch_cmpxchg128_local. */
#define raw_cmpxchg128_local arch_cmpxchg128_local

/*
 * raw_try_cmpxchg128_local(): use arch_try_cmpxchg128_local when the
 * architecture defines it; otherwise fall back to a generic version built on
 * raw_cmpxchg128_local().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_cmpxchg128_local() and compares the value that call yields against the
 * snapshot. On a mismatch the yielded value is stored back through _oldp.
 * The statement expression evaluates to the (___r == ___o) comparison,
 * wrapped in likely().
 */
#ifdef arch_try_cmpxchg128_local
#define raw_try_cmpxchg128_local arch_try_cmpxchg128_local
#else
#define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/* raw_sync_cmpxchg is an unconditional alias for arch_sync_cmpxchg. */
#define raw_sync_cmpxchg arch_sync_cmpxchg

/*
 * raw_sync_try_cmpxchg(): use arch_sync_try_cmpxchg when the architecture
 * defines it; otherwise fall back to a generic version built on
 * raw_sync_cmpxchg().
 *
 * The generic version snapshots *_oldp, passes the snapshot to
 * raw_sync_cmpxchg() and compares the value that call yields against the
 * snapshot. On a mismatch the yielded value is stored back through _oldp.
 * The statement expression evaluates to the (___r == ___o) comparison,
 * wrapped in likely().
 */
#ifdef arch_sync_try_cmpxchg
#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg
#else
#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/**
 * raw_atomic_read() - load an atomic_t with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically reads the current value of @v with relaxed ordering by passing
 * @v straight through to arch_atomic_read().
 *
 * Usable from noinstr code; ordinary code should call atomic_read() instead.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int raw_atomic_read(const atomic_t *v)
{
        return arch_atomic_read(v);
}

/**
 * raw_atomic_read_acquire() - load an atomic_t with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically reads the current value of @v with acquire ordering.
 *
 * arch_atomic_read_acquire() is used when the architecture provides it.
 * Otherwise the counter is read with smp_load_acquire() when
 * __native_word(atomic_t) holds, and with raw_atomic_read() followed by
 * __atomic_acquire_fence() when it does not.
 *
 * Usable from noinstr code; ordinary code should call atomic_read_acquire()
 * instead.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int raw_atomic_read_acquire(const atomic_t *v)
{
#ifdef arch_atomic_read_acquire
        return arch_atomic_read_acquire(v);
#else
        int val;

        if (!__native_word(atomic_t)) {
                /* Relaxed load first, then the fence that orders it. */
                val = raw_atomic_read(v);
                __atomic_acquire_fence();
                return val;
        }

        val = smp_load_acquire(&(v)->counter);
        return val;
#endif
}

/**
 * raw_atomic_set() - store to an atomic_t with relaxed ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Atomically writes @i into @v with relaxed ordering by passing both
 * arguments straight through to arch_atomic_set().
 *
 * Usable from noinstr code; ordinary code should call atomic_set() instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_set(atomic_t *v, int i)
{
        arch_atomic_set(v, i);
}

/**
 * raw_atomic_set_release() - store to an atomic_t with release ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Atomically writes @i into @v with release ordering.
 *
 * arch_atomic_set_release() is used when the architecture provides it.
 * Otherwise the counter is written with smp_store_release() when
 * __native_word(atomic_t) holds, and with __atomic_release_fence() followed
 * by raw_atomic_set() when it does not.
 *
 * Usable from noinstr code; ordinary code should call atomic_set_release()
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_set_release(atomic_t *v, int i)
{
#ifdef arch_atomic_set_release
        arch_atomic_set_release(v, i);
#else
        if (!__native_word(atomic_t)) {
                /* Fence first, then the relaxed store it orders. */
                __atomic_release_fence();
                raw_atomic_set(v, i);
                return;
        }

        smp_store_release(&(v)->counter, i);
#endif
}

/**
 * raw_atomic_add() - add to an atomic_t with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with relaxed ordering by passing both
 * arguments straight through to arch_atomic_add().
 *
 * Usable from noinstr code; ordinary code should call atomic_add() instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_add(int i, atomic_t *v)
{
        arch_atomic_add(i, v);
}

/**
 * raw_atomic_add_return() - add to an atomic_t with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with full ordering.
 *
 * arch_atomic_add_return() is used when the architecture provides it;
 * otherwise arch_atomic_add_return_relaxed() is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence(). The build fails
 * if neither exists.
 *
 * Usable from noinstr code; ordinary code should call atomic_add_return()
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return
        return arch_atomic_add_return(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int updated;

        __atomic_pre_full_fence();
        updated = arch_atomic_add_return_relaxed(i, v);
        __atomic_post_full_fence();

        return updated;
#else
#error "Unable to define raw_atomic_add_return"
#endif
}

/**
 * raw_atomic_add_return_acquire() - add to an atomic_t with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with acquire ordering.
 *
 * Preference order: arch_atomic_add_return_acquire(); else
 * arch_atomic_add_return_relaxed() followed by __atomic_acquire_fence();
 * else arch_atomic_add_return(). The build fails if none exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_add_return_acquire() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_acquire
        return arch_atomic_add_return_acquire(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int updated;

        updated = arch_atomic_add_return_relaxed(i, v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_add_return)
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_acquire"
#endif
}

/**
 * raw_atomic_add_return_release() - add to an atomic_t with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with release ordering.
 *
 * Preference order: arch_atomic_add_return_release(); else
 * __atomic_release_fence() followed by arch_atomic_add_return_relaxed();
 * else arch_atomic_add_return(). The build fails if none exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_add_return_release() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_release(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_release
        return arch_atomic_add_return_release(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int updated;

        __atomic_release_fence();
        updated = arch_atomic_add_return_relaxed(i, v);

        return updated;
#elif defined(arch_atomic_add_return)
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_release"
#endif
}

/**
 * raw_atomic_add_return_relaxed() - add to an atomic_t with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with relaxed ordering.
 *
 * arch_atomic_add_return_relaxed() is used when the architecture provides
 * it, otherwise arch_atomic_add_return(). The build fails if neither exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_add_return_relaxed() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_relaxed
        return arch_atomic_add_return_relaxed(i, v);
#elif defined(arch_atomic_add_return)
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_relaxed"
#endif
}

/**
 * raw_atomic_fetch_add() - fetch-and-add on an atomic_t with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with full ordering.
 *
 * arch_atomic_fetch_add() is used when the architecture provides it;
 * otherwise arch_atomic_fetch_add_relaxed() is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence(). The build fails
 * if neither exists.
 *
 * Usable from noinstr code; ordinary code should call atomic_fetch_add()
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add
        return arch_atomic_fetch_add(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic_fetch_add"
#endif
}

/**
 * raw_atomic_fetch_add_acquire() - fetch-and-add with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with acquire ordering.
 *
 * Preference order: arch_atomic_fetch_add_acquire(); else
 * arch_atomic_fetch_add_relaxed() followed by __atomic_acquire_fence();
 * else arch_atomic_fetch_add(). The build fails if none exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_add_acquire() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add_acquire
        return arch_atomic_fetch_add_acquire(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        int prev;

        prev = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_add)
        return arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_acquire"
#endif
}

/**
 * raw_atomic_fetch_add_release() - fetch-and-add with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with release ordering.
 *
 * Preference order: arch_atomic_fetch_add_release(); else
 * __atomic_release_fence() followed by arch_atomic_fetch_add_relaxed();
 * else arch_atomic_fetch_add(). The build fails if none exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_add_release() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add_release
        return arch_atomic_fetch_add_release(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_add_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_add)
        return arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_release"
#endif
}

/**
 * raw_atomic_fetch_add_relaxed() - fetch-and-add with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + @i) with relaxed ordering.
 *
 * arch_atomic_fetch_add_relaxed() is used when the architecture provides it,
 * otherwise arch_atomic_fetch_add(). The build fails if neither exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_add_relaxed() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add_relaxed
        return arch_atomic_fetch_add_relaxed(i, v);
#elif defined(arch_atomic_fetch_add)
        return arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_relaxed"
#endif
}

/**
 * raw_atomic_sub() - subtract from an atomic_t with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with relaxed ordering by passing both
 * arguments straight through to arch_atomic_sub().
 *
 * Usable from noinstr code; ordinary code should call atomic_sub() instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_sub(int i, atomic_t *v)
{
        arch_atomic_sub(i, v);
}

/**
 * raw_atomic_sub_return() - subtract from an atomic_t with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with full ordering.
 *
 * arch_atomic_sub_return() is used when the architecture provides it;
 * otherwise arch_atomic_sub_return_relaxed() is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence(). The build fails
 * if neither exists.
 *
 * Usable from noinstr code; ordinary code should call atomic_sub_return()
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return
        return arch_atomic_sub_return(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        int updated;

        __atomic_pre_full_fence();
        updated = arch_atomic_sub_return_relaxed(i, v);
        __atomic_post_full_fence();

        return updated;
#else
#error "Unable to define raw_atomic_sub_return"
#endif
}

/**
 * raw_atomic_sub_return_acquire() - subtract with acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with acquire ordering.
 *
 * Preference order: arch_atomic_sub_return_acquire(); else
 * arch_atomic_sub_return_relaxed() followed by __atomic_acquire_fence();
 * else arch_atomic_sub_return(). The build fails if none exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_sub_return_acquire() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return_acquire
        return arch_atomic_sub_return_acquire(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        int updated;

        updated = arch_atomic_sub_return_relaxed(i, v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_sub_return)
        return arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_acquire"
#endif
}

/**
 * raw_atomic_sub_return_release() - subtract with release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with release ordering.
 *
 * Preference order: arch_atomic_sub_return_release(); else
 * __atomic_release_fence() followed by arch_atomic_sub_return_relaxed();
 * else arch_atomic_sub_return(). The build fails if none exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_sub_return_release() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_release(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return_release
        return arch_atomic_sub_return_release(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        int updated;

        __atomic_release_fence();
        updated = arch_atomic_sub_return_relaxed(i, v);

        return updated;
#elif defined(arch_atomic_sub_return)
        return arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_release"
#endif
}

/**
 * raw_atomic_sub_return_relaxed() - subtract with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with relaxed ordering.
 *
 * arch_atomic_sub_return_relaxed() is used when the architecture provides
 * it, otherwise arch_atomic_sub_return(). The build fails if neither exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_sub_return_relaxed() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return_relaxed
        return arch_atomic_sub_return_relaxed(i, v);
#elif defined(arch_atomic_sub_return)
        return arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_relaxed"
#endif
}

/**
 * raw_atomic_fetch_sub() - fetch-and-subtract with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with full ordering.
 *
 * arch_atomic_fetch_sub() is used when the architecture provides it;
 * otherwise arch_atomic_fetch_sub_relaxed() is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence(). The build fails
 * if neither exists.
 *
 * Usable from noinstr code; ordinary code should call atomic_fetch_sub()
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub
        return arch_atomic_fetch_sub(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic_fetch_sub"
#endif
}

/**
 * raw_atomic_fetch_sub_acquire() - fetch-and-subtract with acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with acquire ordering.
 *
 * Preference order: arch_atomic_fetch_sub_acquire(); else
 * arch_atomic_fetch_sub_relaxed() followed by __atomic_acquire_fence();
 * else arch_atomic_fetch_sub(). The build fails if none exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_sub_acquire() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_acquire
        return arch_atomic_fetch_sub_acquire(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        int prev;

        prev = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_sub)
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_acquire"
#endif
}

/**
 * raw_atomic_fetch_sub_release() - fetch-and-subtract with release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with release ordering.
 *
 * Preference order: arch_atomic_fetch_sub_release(); else
 * __atomic_release_fence() followed by arch_atomic_fetch_sub_relaxed();
 * else arch_atomic_fetch_sub(). The build fails if none exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_sub_release() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_release
        return arch_atomic_fetch_sub_release(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_sub_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_sub)
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_release"
#endif
}

/**
 * raw_atomic_fetch_sub_relaxed() - fetch-and-subtract with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - @i) with relaxed ordering.
 *
 * arch_atomic_fetch_sub_relaxed() is used when the architecture provides it,
 * otherwise arch_atomic_fetch_sub(). The build fails if neither exists.
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_sub_relaxed() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_relaxed
        return arch_atomic_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic_fetch_sub)
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_relaxed"
#endif
}

/**
 * raw_atomic_inc() - increment an atomic_t with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with relaxed ordering.
 *
 * arch_atomic_inc() is used when the architecture provides it; otherwise the
 * increment is expressed as raw_atomic_add(1, @v).
 *
 * Usable from noinstr code; ordinary code should call atomic_inc() instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_inc(atomic_t *v)
{
#ifdef arch_atomic_inc
        arch_atomic_inc(v);
#else
        raw_atomic_add(1, v);
#endif
}

/**
 * raw_atomic_inc_return() - increment an atomic_t with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with full ordering.
 *
 * Preference order: arch_atomic_inc_return(); else
 * arch_atomic_inc_return_relaxed() bracketed by __atomic_pre_full_fence()
 * and __atomic_post_full_fence(); else raw_atomic_add_return(1, @v).
 *
 * Usable from noinstr code; ordinary code should call atomic_inc_return()
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_inc_return(atomic_t *v)
{
#ifdef arch_atomic_inc_return
        return arch_atomic_inc_return(v);
#elif defined(arch_atomic_inc_return_relaxed)
        int updated;

        __atomic_pre_full_fence();
        updated = arch_atomic_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        return raw_atomic_add_return(1, v);
#endif
}

/**
 * raw_atomic_inc_return_acquire() - increment with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with acquire ordering.
 *
 * Preference order: arch_atomic_inc_return_acquire(); else
 * arch_atomic_inc_return_relaxed() followed by __atomic_acquire_fence();
 * else arch_atomic_inc_return(); else raw_atomic_add_return_acquire(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_inc_return_acquire() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_inc_return_acquire(atomic_t *v)
{
#ifdef arch_atomic_inc_return_acquire
        return arch_atomic_inc_return_acquire(v);
#elif defined(arch_atomic_inc_return_relaxed)
        int updated;

        updated = arch_atomic_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_inc_return)
        return arch_atomic_inc_return(v);
#else
        return raw_atomic_add_return_acquire(1, v);
#endif
}

/**
 * raw_atomic_inc_return_release() - increment with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with release ordering.
 *
 * Preference order: arch_atomic_inc_return_release(); else
 * __atomic_release_fence() followed by arch_atomic_inc_return_relaxed();
 * else arch_atomic_inc_return(); else raw_atomic_add_return_release(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_inc_return_release() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_inc_return_release(atomic_t *v)
{
#ifdef arch_atomic_inc_return_release
        return arch_atomic_inc_return_release(v);
#elif defined(arch_atomic_inc_return_relaxed)
        int updated;

        __atomic_release_fence();
        updated = arch_atomic_inc_return_relaxed(v);

        return updated;
#elif defined(arch_atomic_inc_return)
        return arch_atomic_inc_return(v);
#else
        return raw_atomic_add_return_release(1, v);
#endif
}

/**
 * raw_atomic_inc_return_relaxed() - increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with relaxed ordering.
 *
 * Preference order: arch_atomic_inc_return_relaxed(); else
 * arch_atomic_inc_return(); else raw_atomic_add_return_relaxed(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_inc_return_relaxed() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_inc_return_relaxed(atomic_t *v)
{
#ifdef arch_atomic_inc_return_relaxed
        return arch_atomic_inc_return_relaxed(v);
#elif defined(arch_atomic_inc_return)
        return arch_atomic_inc_return(v);
#else
        return raw_atomic_add_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc() - fetch-and-increment with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with full ordering.
 *
 * Preference order: arch_atomic_fetch_inc(); else
 * arch_atomic_fetch_inc_relaxed() bracketed by __atomic_pre_full_fence()
 * and __atomic_post_full_fence(); else raw_atomic_fetch_add(1, @v).
 *
 * Usable from noinstr code; ordinary code should call atomic_fetch_inc()
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_inc(atomic_t *v)
{
#ifdef arch_atomic_fetch_inc
        return arch_atomic_fetch_inc(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_inc_relaxed(v);
        __atomic_post_full_fence();

        return prev;
#else
        return raw_atomic_fetch_add(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc_acquire() - fetch-and-increment with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with acquire ordering.
 *
 * Preference order: arch_atomic_fetch_inc_acquire(); else
 * arch_atomic_fetch_inc_relaxed() followed by __atomic_acquire_fence();
 * else arch_atomic_fetch_inc(); else raw_atomic_fetch_add_acquire(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_inc_acquire() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_inc_acquire(atomic_t *v)
{
#ifdef arch_atomic_fetch_inc_acquire
        return arch_atomic_fetch_inc_acquire(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        int prev;

        prev = arch_atomic_fetch_inc_relaxed(v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_inc)
        return arch_atomic_fetch_inc(v);
#else
        return raw_atomic_fetch_add_acquire(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc_release() - fetch-and-increment with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with release ordering.
 *
 * Preference order: arch_atomic_fetch_inc_release(); else
 * __atomic_release_fence() followed by arch_atomic_fetch_inc_relaxed();
 * else arch_atomic_fetch_inc(); else raw_atomic_fetch_add_release(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_inc_release() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_inc_release(atomic_t *v)
{
#ifdef arch_atomic_fetch_inc_release
        return arch_atomic_fetch_inc_release(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_inc_relaxed(v);

        return prev;
#elif defined(arch_atomic_fetch_inc)
        return arch_atomic_fetch_inc(v);
#else
        return raw_atomic_fetch_add_release(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc_relaxed() - fetch-and-increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v + 1) with relaxed ordering.
 *
 * Preference order: arch_atomic_fetch_inc_relaxed(); else
 * arch_atomic_fetch_inc(); else raw_atomic_fetch_add_relaxed(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_inc_relaxed() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_inc_relaxed(atomic_t *v)
{
#ifdef arch_atomic_fetch_inc_relaxed
        return arch_atomic_fetch_inc_relaxed(v);
#elif defined(arch_atomic_fetch_inc)
        return arch_atomic_fetch_inc(v);
#else
        return raw_atomic_fetch_add_relaxed(1, v);
#endif
}

/**
 * raw_atomic_dec() - decrement an atomic_t with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - 1) with relaxed ordering.
 *
 * arch_atomic_dec() is used when the architecture provides it; otherwise the
 * decrement is expressed as raw_atomic_sub(1, @v).
 *
 * Usable from noinstr code; ordinary code should call atomic_dec() instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_dec(atomic_t *v)
{
#ifdef arch_atomic_dec
        arch_atomic_dec(v);
#else
        raw_atomic_sub(1, v);
#endif
}

/**
 * raw_atomic_dec_return() - decrement an atomic_t with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - 1) with full ordering.
 *
 * Preference order: arch_atomic_dec_return(); else
 * arch_atomic_dec_return_relaxed() bracketed by __atomic_pre_full_fence()
 * and __atomic_post_full_fence(); else raw_atomic_sub_return(1, @v).
 *
 * Usable from noinstr code; ordinary code should call atomic_dec_return()
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_dec_return(atomic_t *v)
{
#ifdef arch_atomic_dec_return
        return arch_atomic_dec_return(v);
#elif defined(arch_atomic_dec_return_relaxed)
        int updated;

        __atomic_pre_full_fence();
        updated = arch_atomic_dec_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        return raw_atomic_sub_return(1, v);
#endif
}

/**
 * raw_atomic_dec_return_acquire() - decrement with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - 1) with acquire ordering.
 *
 * Preference order: arch_atomic_dec_return_acquire(); else
 * arch_atomic_dec_return_relaxed() followed by __atomic_acquire_fence();
 * else arch_atomic_dec_return(); else raw_atomic_sub_return_acquire(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_dec_return_acquire() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_dec_return_acquire(atomic_t *v)
{
#ifdef arch_atomic_dec_return_acquire
        return arch_atomic_dec_return_acquire(v);
#elif defined(arch_atomic_dec_return_relaxed)
        int updated;

        updated = arch_atomic_dec_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_dec_return)
        return arch_atomic_dec_return(v);
#else
        return raw_atomic_sub_return_acquire(1, v);
#endif
}

/**
 * raw_atomic_dec_return_release() - decrement with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - 1) with release ordering.
 *
 * Preference order: arch_atomic_dec_return_release(); else
 * __atomic_release_fence() followed by arch_atomic_dec_return_relaxed();
 * else arch_atomic_dec_return(); else raw_atomic_sub_return_release(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_dec_return_release() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_dec_return_release(atomic_t *v)
{
#ifdef arch_atomic_dec_return_release
        return arch_atomic_dec_return_release(v);
#elif defined(arch_atomic_dec_return_relaxed)
        int updated;

        __atomic_release_fence();
        updated = arch_atomic_dec_return_relaxed(v);

        return updated;
#elif defined(arch_atomic_dec_return)
        return arch_atomic_dec_return(v);
#else
        return raw_atomic_sub_return_release(1, v);
#endif
}

/**
 * raw_atomic_dec_return_relaxed() - decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - 1) with relaxed ordering.
 *
 * Preference order: arch_atomic_dec_return_relaxed(); else
 * arch_atomic_dec_return(); else raw_atomic_sub_return_relaxed(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_dec_return_relaxed() instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_dec_return_relaxed(atomic_t *v)
{
#ifdef arch_atomic_dec_return_relaxed
        return arch_atomic_dec_return_relaxed(v);
#elif defined(arch_atomic_dec_return)
        return arch_atomic_dec_return(v);
#else
        return raw_atomic_sub_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic_fetch_dec() - fetch-and-decrement with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - 1) with full ordering.
 *
 * Preference order: arch_atomic_fetch_dec(); else
 * arch_atomic_fetch_dec_relaxed() bracketed by __atomic_pre_full_fence()
 * and __atomic_post_full_fence(); else raw_atomic_fetch_sub(1, @v).
 *
 * Usable from noinstr code; ordinary code should call atomic_fetch_dec()
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_dec(atomic_t *v)
{
#ifdef arch_atomic_fetch_dec
        return arch_atomic_fetch_dec(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_dec_relaxed(v);
        __atomic_post_full_fence();

        return prev;
#else
        return raw_atomic_fetch_sub(1, v);
#endif
}

/**
 * raw_atomic_fetch_dec_acquire() - fetch-and-decrement with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - 1) with acquire ordering.
 *
 * Preference order: arch_atomic_fetch_dec_acquire(); else
 * arch_atomic_fetch_dec_relaxed() followed by __atomic_acquire_fence();
 * else arch_atomic_fetch_dec(); else raw_atomic_fetch_sub_acquire(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_dec_acquire() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_dec_acquire(atomic_t *v)
{
#ifdef arch_atomic_fetch_dec_acquire
        return arch_atomic_fetch_dec_acquire(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        int prev;

        prev = arch_atomic_fetch_dec_relaxed(v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_dec)
        return arch_atomic_fetch_dec(v);
#else
        return raw_atomic_fetch_sub_acquire(1, v);
#endif
}

/**
 * raw_atomic_fetch_dec_release() - fetch-and-decrement with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically performs @v = (@v - 1) with release ordering.
 *
 * Preference order: arch_atomic_fetch_dec_release(); else
 * __atomic_release_fence() followed by arch_atomic_fetch_dec_relaxed();
 * else arch_atomic_fetch_dec(); else raw_atomic_fetch_sub_release(1, @v).
 *
 * Usable from noinstr code; ordinary code should call
 * atomic_fetch_dec_release() instead.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_dec_release(atomic_t *v)
{
#ifdef arch_atomic_fetch_dec_release
        return arch_atomic_fetch_dec_release(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_dec_relaxed(v);

        return prev;
#elif defined(arch_atomic_fetch_dec)
        return arch_atomic_fetch_dec(v);
#else
        return raw_atomic_fetch_sub_release(1, v);
#endif
}

/**
 * raw_atomic_fetch_dec_relaxed() - atomic fetch-and-decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1), providing relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_dec_relaxed() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline int
raw_atomic_fetch_dec_relaxed(atomic_t *v)
{
#ifdef arch_atomic_fetch_dec_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_fetch_dec_relaxed(v);
#elif defined(arch_atomic_fetch_dec)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_dec(v);
#else
        /* No arch decrement available: subtract one instead. */
        return raw_atomic_fetch_sub_relaxed(1, v);
#endif
}

/**
 * raw_atomic_and() - atomic bitwise AND with relaxed ordering
 * @i: int value used as the AND mask
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering. The previous
 * value is not returned; use raw_atomic_fetch_and() when it is needed.
 *
 * Safe to use in noinstr code; prefer atomic_and() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_and(int i, atomic_t *v)
{
        /* No generic fallback here: arch_atomic_and() is called unconditionally. */
        arch_atomic_and(i, v);
}

/**
 * raw_atomic_fetch_and() - atomic fetch-and-AND with full ordering
 * @i: int value used as the AND mask
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i), providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_and() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_and(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_and
        /* The architecture provides the fully ordered operation itself. */
        return arch_atomic_fetch_and(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* Bracket the relaxed operation with the full-fence helpers. */
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic_fetch_and"
#endif
}

/**
 * raw_atomic_fetch_and_acquire() - atomic fetch-and-AND with acquire ordering
 * @i: int value used as the AND mask
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i), providing acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_and_acquire() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_and_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_and_acquire
        /* Native acquire variant is available. */
        return arch_atomic_fetch_and_acquire(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* Relaxed operation followed by an acquire fence. */
        int prev;

        prev = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_and)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_acquire"
#endif
}

/**
 * raw_atomic_fetch_and_release() - atomic fetch-and-AND with release ordering
 * @i: int value used as the AND mask
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i), providing release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_and_release() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_and_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_and_release
        /* Native release variant is available. */
        return arch_atomic_fetch_and_release(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* Release fence first, then the relaxed operation. */
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_and_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_and)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_release"
#endif
}

/**
 * raw_atomic_fetch_and_relaxed() - atomic fetch-and-AND with relaxed ordering
 * @i: int value used as the AND mask
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i), providing relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_and_relaxed() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_and_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_and_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_fetch_and_relaxed(i, v);
#elif defined(arch_atomic_fetch_and)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_relaxed"
#endif
}

/**
 * raw_atomic_andnot() - atomic bit clear (AND NOT) with relaxed ordering
 * @i: int value whose set bits are cleared in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i), providing relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_andnot() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_andnot(int i, atomic_t *v)
{
#ifdef arch_atomic_andnot
        /* Native AND NOT is available. */
        arch_atomic_andnot(i, v);
#else
        /* Emulate AND NOT as an AND with the complemented mask. */
        const int mask = ~i;

        raw_atomic_and(mask, v);
#endif
}

/**
 * raw_atomic_fetch_andnot() - atomic fetch-and-AND-NOT with full ordering
 * @i: int value whose set bits are cleared in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i), providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_andnot() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_andnot
        /* The architecture provides the fully ordered operation itself. */
        return arch_atomic_fetch_andnot(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* Bracket the relaxed operation with the full-fence helpers. */
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
        /* Emulate AND NOT as an AND with the complemented mask. */
        return raw_atomic_fetch_and(~i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot_acquire() - atomic fetch-and-AND-NOT with acquire ordering
 * @i: int value whose set bits are cleared in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i), providing acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_andnot_acquire() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_andnot_acquire
        /* Native acquire variant is available. */
        return arch_atomic_fetch_andnot_acquire(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* Relaxed operation followed by an acquire fence. */
        int prev;

        prev = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_andnot)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_andnot(i, v);
#else
        /* Emulate AND NOT as an AND with the complemented mask. */
        return raw_atomic_fetch_and_acquire(~i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot_release() - atomic fetch-and-AND-NOT with release ordering
 * @i: int value whose set bits are cleared in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i), providing release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_andnot_release() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_andnot_release
        /* Native release variant is available. */
        return arch_atomic_fetch_andnot_release(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* Release fence first, then the relaxed operation. */
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_andnot_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_andnot)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_andnot(i, v);
#else
        /* Emulate AND NOT as an AND with the complemented mask. */
        return raw_atomic_fetch_and_release(~i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot_relaxed() - atomic fetch-and-AND-NOT with relaxed ordering
 * @i: int value whose set bits are cleared in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i), providing relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_andnot_relaxed() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_andnot_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic_fetch_andnot)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_andnot(i, v);
#else
        /* Emulate AND NOT as an AND with the complemented mask. */
        return raw_atomic_fetch_and_relaxed(~i, v);
#endif
}

/**
 * raw_atomic_or() - atomic bitwise OR with relaxed ordering
 * @i: int value whose set bits are set in @v
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering. The previous
 * value is not returned; use raw_atomic_fetch_or() when it is needed.
 *
 * Safe to use in noinstr code; prefer atomic_or() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_or(int i, atomic_t *v)
{
        /* No generic fallback here: arch_atomic_or() is called unconditionally. */
        arch_atomic_or(i, v);
}

/**
 * raw_atomic_fetch_or() - atomic fetch-and-OR with full ordering
 * @i: int value whose set bits are set in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i), providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_or() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_or(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or
        /* The architecture provides the fully ordered operation itself. */
        return arch_atomic_fetch_or(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* Bracket the relaxed operation with the full-fence helpers. */
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic_fetch_or"
#endif
}

/**
 * raw_atomic_fetch_or_acquire() - atomic fetch-and-OR with acquire ordering
 * @i: int value whose set bits are set in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i), providing acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_or_acquire() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_or_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or_acquire
        /* Native acquire variant is available. */
        return arch_atomic_fetch_or_acquire(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* Relaxed operation followed by an acquire fence. */
        int prev;

        prev = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_or)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_acquire"
#endif
}

/**
 * raw_atomic_fetch_or_release() - atomic fetch-and-OR with release ordering
 * @i: int value whose set bits are set in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i), providing release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_or_release() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_or_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or_release
        /* Native release variant is available. */
        return arch_atomic_fetch_or_release(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* Release fence first, then the relaxed operation. */
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_or_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_or)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_release"
#endif
}

/**
 * raw_atomic_fetch_or_relaxed() - atomic fetch-and-OR with relaxed ordering
 * @i: int value whose set bits are set in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i), providing relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_or_relaxed() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_or_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_fetch_or_relaxed(i, v);
#elif defined(arch_atomic_fetch_or)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_relaxed"
#endif
}

/**
 * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering
 * @i: int value whose set bits are toggled in @v
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering. The previous
 * value is not returned; use raw_atomic_fetch_xor() when it is needed.
 *
 * Safe to use in noinstr code; prefer atomic_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_xor(int i, atomic_t *v)
{
        /* No generic fallback here: arch_atomic_xor() is called unconditionally. */
        arch_atomic_xor(i, v);
}

/**
 * raw_atomic_fetch_xor() - atomic fetch-and-XOR with full ordering
 * @i: int value whose set bits are toggled in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i), providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_xor() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_xor(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor
        /* The architecture provides the fully ordered operation itself. */
        return arch_atomic_fetch_xor(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        /* Bracket the relaxed operation with the full-fence helpers. */
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic_fetch_xor"
#endif
}

/**
 * raw_atomic_fetch_xor_acquire() - atomic fetch-and-XOR with acquire ordering
 * @i: int value whose set bits are toggled in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i), providing acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_xor_acquire() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_xor_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_acquire
        /* Native acquire variant is available. */
        return arch_atomic_fetch_xor_acquire(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        /* Relaxed operation followed by an acquire fence. */
        int prev;

        prev = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_xor)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_acquire"
#endif
}

/**
 * raw_atomic_fetch_xor_release() - atomic fetch-and-XOR with release ordering
 * @i: int value whose set bits are toggled in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i), providing release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_xor_release() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_xor_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_release
        /* Native release variant is available. */
        return arch_atomic_fetch_xor_release(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        /* Release fence first, then the relaxed operation. */
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_fetch_xor_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_xor)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_release"
#endif
}

/**
 * raw_atomic_fetch_xor_relaxed() - atomic fetch-and-XOR with relaxed ordering
 * @i: int value whose set bits are toggled in @v
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v ^ @i), providing relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_xor_relaxed() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic_fetch_xor)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_relaxed"
#endif
}

/**
 * raw_atomic_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_t
 * @new: int value to store in @v
 *
 * Atomically stores @new into @v, providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_xchg() elsewhere.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline int
raw_atomic_xchg(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg
        /* The architecture provides the fully ordered operation itself. */
        return arch_atomic_xchg(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        /* Bracket the relaxed operation with the full-fence helpers. */
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_xchg_relaxed(v, new);
        __atomic_post_full_fence();

        return prev;
#else
        /* No arch atomic_t exchange: operate on the counter field directly. */
        return raw_xchg(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_t
 * @new: int value to store in @v
 *
 * Atomically stores @new into @v, providing acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_xchg_acquire() elsewhere.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline int
raw_atomic_xchg_acquire(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_acquire
        /* Native acquire variant is available. */
        return arch_atomic_xchg_acquire(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        /* Relaxed operation followed by an acquire fence. */
        int prev;

        prev = arch_atomic_xchg_relaxed(v, new);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_xchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_xchg(v, new);
#else
        /* No arch atomic_t exchange: operate on the counter field directly. */
        return raw_xchg_acquire(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_t
 * @new: int value to store in @v
 *
 * Atomically stores @new into @v, providing release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_xchg_release() elsewhere.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline int
raw_atomic_xchg_release(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_release
        /* Native release variant is available. */
        return arch_atomic_xchg_release(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        /* Release fence first, then the relaxed operation. */
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_xchg_relaxed(v, new);

        return prev;
#elif defined(arch_atomic_xchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_xchg(v, new);
#else
        /* No arch atomic_t exchange: operate on the counter field directly. */
        return raw_xchg_release(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @new: int value to store in @v
 *
 * Atomically stores @new into @v, providing relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_xchg_relaxed() elsewhere.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline int
raw_atomic_xchg_relaxed(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_xchg_relaxed(v, new);
#elif defined(arch_atomic_xchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_xchg(v, new);
#else
        /* No arch atomic_t exchange: operate on the counter field directly. */
        return raw_xchg_relaxed(&v->counter, new);
#endif
}

/**
 * raw_atomic_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: int value @v is expected to hold
 * @new: int value to store in @v on a match
 *
 * If (@v == @old), atomically stores @new into @v with full ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg
        /* The architecture provides the fully ordered operation itself. */
        return arch_atomic_cmpxchg(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        /* Bracket the relaxed operation with the full-fence helpers. */
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return prev;
#else
        /* No arch atomic_t cmpxchg: operate on the counter field directly. */
        return raw_cmpxchg(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: int value @v is expected to hold
 * @new: int value to store in @v on a match
 *
 * If (@v == @old), atomically stores @new into @v with acquire ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_acquire
        /* Native acquire variant is available. */
        return arch_atomic_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        /* Relaxed operation followed by an acquire fence. */
        int prev;

        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_cmpxchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_cmpxchg(v, old, new);
#else
        /* No arch atomic_t cmpxchg: operate on the counter field directly. */
        return raw_cmpxchg_acquire(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: int value @v is expected to hold
 * @new: int value to store in @v on a match
 *
 * If (@v == @old), atomically stores @new into @v with release ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_release
        /* Native release variant is available. */
        return arch_atomic_cmpxchg_release(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        /* Release fence first, then the relaxed operation. */
        int prev;

        __atomic_release_fence();
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);

        return prev;
#elif defined(arch_atomic_cmpxchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_cmpxchg(v, old, new);
#else
        /* No arch atomic_t cmpxchg: operate on the counter field directly. */
        return raw_cmpxchg_release(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: int value @v is expected to hold
 * @new: int value to store in @v on a match
 *
 * If (@v == @old), atomically stores @new into @v with relaxed ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_cmpxchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_cmpxchg(v, old, new);
#else
        /* No arch atomic_t cmpxchg: operate on the counter field directly. */
        return raw_cmpxchg_relaxed(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: pointer to the int value @v is expected to hold
 * @new: int value to store in @v on a match
 *
 * If (@v == @old), atomically stores @new into @v with full ordering.
 * Otherwise, @v is left untouched, @old is overwritten with the current
 * value of @v, and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg
        /* The architecture provides the fully ordered operation itself. */
        return arch_atomic_try_cmpxchg(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        /* Bracket the relaxed operation with the full-fence helpers. */
        bool swapped;

        __atomic_pre_full_fence();
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return swapped;
#else
        /* Build try_cmpxchg on top of the value-returning cmpxchg. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg(v, expected, new);

        /* On a mismatch, report the value actually found back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
#endif
}

/**
 * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to the int value @v is expected to hold
 * @new: int value to store in @v on a match
 *
 * If (@v == @old), atomically stores @new into @v with acquire ordering.
 * Otherwise, @v is left untouched, @old is overwritten with the current
 * value of @v, and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_acquire
        /* Native acquire variant is available. */
        return arch_atomic_try_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        /* Relaxed operation followed by an acquire fence. */
        bool swapped;

        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return swapped;
#elif defined(arch_atomic_try_cmpxchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        /* Build try_cmpxchg on top of the value-returning cmpxchg. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_acquire(v, expected, new);

        /* On a mismatch, report the value actually found back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
#endif
}

/**
 * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: pointer to the int value @v is expected to hold
 * @new: int value to store in @v on a match
 *
 * If (@v == @old), atomically stores @new into @v with release ordering.
 * Otherwise, @v is left untouched, @old is overwritten with the current
 * value of @v, and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_release
        /* Native release variant is available. */
        return arch_atomic_try_cmpxchg_release(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        /* Release fence first, then the relaxed operation. */
        bool swapped;

        __atomic_release_fence();
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);

        return swapped;
#elif defined(arch_atomic_try_cmpxchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        /* Build try_cmpxchg on top of the value-returning cmpxchg. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_release(v, expected, new);

        /* On a mismatch, report the value actually found back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
#endif
}

/**
 * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to the int value @v is expected to hold
 * @new: int value to store in @v on a match
 *
 * If (@v == @old), atomically stores @new into @v with relaxed ordering.
 * Otherwise, @v is left untouched, @old is overwritten with the current
 * value of @v, and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_try_cmpxchg)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        /* Build try_cmpxchg on top of the value-returning cmpxchg. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_relaxed(v, expected, new);

        /* On a mismatch, report the value actually found back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
#endif
}

/**
 * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i), providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_sub_and_test(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_and_test
        /* Native subtract-and-test is available. */
        return arch_atomic_sub_and_test(i, v);
#else
        /* Subtract, then check whether the new value is zero. */
        return !raw_atomic_sub_return(i, v);
#endif
}

/**
 * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1), providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_dec_and_test(atomic_t *v)
{
#ifdef arch_atomic_dec_and_test
        /* Native decrement-and-test is available. */
        return arch_atomic_dec_and_test(v);
#else
        /* Decrement, then check whether the new value is zero. */
        return !raw_atomic_dec_return(v);
#endif
}

/**
 * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1), providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_and_test(atomic_t *v)
{
#ifdef arch_atomic_inc_and_test
        /* Native increment-and-test is available. */
        return arch_atomic_inc_and_test(v);
#else
        /* Increment, then check whether the new value is zero. */
        return !raw_atomic_inc_return(v);
#endif
}

/**
 * raw_atomic_add_negative() - atomic add and test if negative with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), providing full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative
        /* The architecture provides the fully ordered operation itself. */
        return arch_atomic_add_negative(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        /* Bracket the relaxed operation with the full-fence helpers. */
        bool negative;

        __atomic_pre_full_fence();
        negative = arch_atomic_add_negative_relaxed(i, v);
        __atomic_post_full_fence();

        return negative;
#else
        /* Add, then inspect the sign of the new value. */
        return raw_atomic_add_return(i, v) < 0;
#endif
}

/**
 * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), providing acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_acquire
        /* Native acquire variant is available. */
        return arch_atomic_add_negative_acquire(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        /* Relaxed operation followed by an acquire fence. */
        bool negative;

        negative = arch_atomic_add_negative_relaxed(i, v);
        __atomic_acquire_fence();

        return negative;
#elif defined(arch_atomic_add_negative)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_add_negative(i, v);
#else
        /* Add, then inspect the sign of the new value. */
        return raw_atomic_add_return_acquire(i, v) < 0;
#endif
}

/**
 * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), providing release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_release(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_release
        /* Native release variant is available. */
        return arch_atomic_add_negative_release(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        /* Release fence first, then the relaxed operation. */
        bool negative;

        __atomic_release_fence();
        negative = arch_atomic_add_negative_relaxed(i, v);

        return negative;
#elif defined(arch_atomic_add_negative)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_add_negative(i, v);
#else
        /* Add, then inspect the sign of the new value. */
        return raw_atomic_add_return_release(i, v) < 0;
#endif
}

/**
 * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), providing relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_relaxed
        /* Native relaxed variant is available. */
        return arch_atomic_add_negative_relaxed(i, v);
#elif defined(arch_atomic_add_negative)
        /* Only the unsuffixed arch operation exists; use it. */
        return arch_atomic_add_negative(i, v);
#else
        /* Add, then inspect the sign of the new value. */
        return raw_atomic_add_return_relaxed(i, v) < 0;
#endif
}

/**
 * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value that blocks the addition when @v equals it
 *
 * If (@v != @u), atomically replaces @v with (@v + @a) with full ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
#ifdef arch_atomic_fetch_add_unless
        /* Native implementation is available. */
        return arch_atomic_fetch_add_unless(v, a, u);
#else
        int cur = raw_atomic_read(v);

        for (;;) {
                /* Forbidden value observed: leave @v alone. */
                if (unlikely(cur == u))
                        break;
                /* A failed attempt refreshes cur with the latest value. */
                if (raw_atomic_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
#endif
}

/**
 * raw_atomic_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value that blocks the addition when @v equals it
 *
 * If (@v != @u), atomically replaces @v with (@v + @a) with full ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_unless(atomic_t *v, int a, int u)
{
#ifdef arch_atomic_add_unless
        /* Native implementation is available. */
        return arch_atomic_add_unless(v, a, u);
#else
        /* The add happened exactly when the value fetched was not @u. */
        int prev = raw_atomic_fetch_add_unless(v, a, u);

        return prev != u;
#endif
}

/**
 * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v != 0), atomically replaces @v with (@v + 1) with full ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_not_zero(atomic_t *v)
{
#ifdef arch_atomic_inc_not_zero
        /* Native implementation is available. */
        return arch_atomic_inc_not_zero(v);
#else
        /* Special case of add-unless: add 1 unless the value is 0. */
        return raw_atomic_add_unless(v, 1, 0);
#endif
}

/**
 * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v >= 0), atomically replaces @v with (@v + 1) with full ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_unless_negative(atomic_t *v)
{
#ifdef arch_atomic_inc_unless_negative
        /* Native implementation is available. */
        return arch_atomic_inc_unless_negative(v);
#else
        int cur = raw_atomic_read(v);

        for (;;) {
                /* Negative value observed: give up without modifying @v. */
                if (unlikely(cur < 0))
                        return false;
                /* A failed attempt refreshes cur with the latest value. */
                if (raw_atomic_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Uses arch_atomic_dec_unless_positive() when it is defined; otherwise a
 * raw_atomic_try_cmpxchg() retry loop is used.
 *
 * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_dec_unless_positive(atomic_t *v)
{
#if defined(arch_atomic_dec_unless_positive)
        return arch_atomic_dec_unless_positive(v);
#else
        int cur = raw_atomic_read(v);

        for (;;) {
                /* A positive value must never be decremented. */
                if (unlikely(cur > 0))
                        return false;

                /* On failure @cur holds the fresh value; check it again. */
                if (raw_atomic_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Uses arch_atomic_dec_if_positive() when it is defined; otherwise a
 * raw_atomic_try_cmpxchg() retry loop is used.
 *
 * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere.
 *
 * Return: The observed value of @v minus one, whether or not @v was updated.
 * A negative result means @v was not modified.
 */
static __always_inline int
raw_atomic_dec_if_positive(atomic_t *v)
{
#if defined(arch_atomic_dec_if_positive)
        return arch_atomic_dec_if_positive(v);
#else
        int cur = raw_atomic_read(v);
        int next;

        for (;;) {
                next = cur - 1;

                /* Decrementing would go below zero: give up, no store. */
                if (unlikely(next < 0))
                        return next;

                /* On failure @cur holds the fresh value; recompute. */
                if (raw_atomic_try_cmpxchg(v, &cur, next))
                        return next;
        }
#endif
}

#ifdef CONFIG_GENERIC_ATOMIC64
#include <asm-generic/atomic64.h>
#endif

/**
 * raw_atomic64_read() - atomic load with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * This is a direct forward to arch_atomic64_read(); there is no fallback.
 *
 * Safe to use in noinstr code; prefer atomic64_read() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
raw_atomic64_read(const atomic64_t *v)
{
        return arch_atomic64_read(v);
}

/**
 * raw_atomic64_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Uses arch_atomic64_read_acquire() when it is defined. Otherwise, if
 * atomic64_t is a native word, smp_load_acquire() is applied to the counter;
 * if not, a relaxed raw_atomic64_read() is followed by
 * __atomic_acquire_fence().
 *
 * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
raw_atomic64_read_acquire(const atomic64_t *v)
{
#if defined(arch_atomic64_read_acquire)
        return arch_atomic64_read_acquire(v);
#else
        s64 val;

        if (__native_word(atomic64_t))
                return smp_load_acquire(&v->counter);

        /* Not a native word: relaxed load first, then the acquire fence. */
        val = raw_atomic64_read(v);
        __atomic_acquire_fence();

        return val;
#endif
}

/**
 * raw_atomic64_set() - atomic set with relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * This is a direct forward to arch_atomic64_set(); there is no fallback.
 *
 * Safe to use in noinstr code; prefer atomic64_set() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_set(atomic64_t *v, s64 i)
{
        arch_atomic64_set(v, i);
}

/**
 * raw_atomic64_set_release() - atomic set with release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Uses arch_atomic64_set_release() when it is defined. Otherwise, if
 * atomic64_t is a native word, smp_store_release() is applied to the counter;
 * if not, __atomic_release_fence() is followed by a relaxed
 * raw_atomic64_set().
 *
 * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_set_release(atomic64_t *v, s64 i)
{
#if defined(arch_atomic64_set_release)
        arch_atomic64_set_release(v, i);
#else
        if (__native_word(atomic64_t)) {
                smp_store_release(&v->counter, i);
                return;
        }

        /* Not a native word: release fence first, then the relaxed store. */
        __atomic_release_fence();
        raw_atomic64_set(v, i);
#endif
}

/**
 * raw_atomic64_add() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * This is a direct forward to arch_atomic64_add(); there is no fallback.
 *
 * Safe to use in noinstr code; prefer atomic64_add() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_add(s64 i, atomic64_t *v)
{
        arch_atomic64_add(i, v);
}

/**
 * raw_atomic64_add_return() - atomic add returning the new value, full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + @i) with full ordering.
 *
 * Uses arch_atomic64_add_return() when it is defined; otherwise the relaxed
 * op is issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither op is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_return)
        return arch_atomic64_add_return(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 updated;

        __atomic_pre_full_fence();
        updated = arch_atomic64_add_return_relaxed(i, v);
        __atomic_post_full_fence();

        return updated;
#else
#error "Unable to define raw_atomic64_add_return"
#endif
}

/**
 * raw_atomic64_add_return_acquire() - atomic add returning the new value, acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + @i) with acquire ordering.
 *
 * Uses arch_atomic64_add_return_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_add_return(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_return_acquire)
        return arch_atomic64_add_return_acquire(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 updated;

        /* The fence must come after the relaxed op. */
        updated = arch_atomic64_add_return_relaxed(i, v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic64_add_return)
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_acquire"
#endif
}

/**
 * raw_atomic64_add_return_release() - atomic add returning the new value, release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + @i) with release ordering.
 *
 * Uses arch_atomic64_add_return_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_add_return(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_return_release)
        return arch_atomic64_add_return_release(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 updated;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        updated = arch_atomic64_add_return_relaxed(i, v);

        return updated;
#elif defined(arch_atomic64_add_return)
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_release"
#endif
}

/**
 * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Uses arch_atomic64_add_return_relaxed() when it is defined and
 * arch_atomic64_add_return() otherwise; no fence is added in either case.
 * The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_return_relaxed)
        return arch_atomic64_add_return_relaxed(i, v);
#elif defined(arch_atomic64_add_return)
        /* No relaxed variant defined: use the unsuffixed op as-is. */
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_relaxed"
#endif
}

/**
 * raw_atomic64_fetch_add() - atomic add returning the old value, full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + @i) with full ordering.
 *
 * Uses arch_atomic64_fetch_add() when it is defined; otherwise the relaxed
 * op is issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither op is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_add)
        return arch_atomic64_fetch_add(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 old;

        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
#error "Unable to define raw_atomic64_fetch_add"
#endif
}

/**
 * raw_atomic64_fetch_add_acquire() - atomic add returning the old value, acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + @i) with acquire ordering.
 *
 * Uses arch_atomic64_fetch_add_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_fetch_add(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_add_acquire)
        return arch_atomic64_fetch_add_acquire(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 old;

        /* The fence must come after the relaxed op. */
        old = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_add)
        return arch_atomic64_fetch_add(i, v);
#else
#error "Unable to define raw_atomic64_fetch_add_acquire"
#endif
}

/**
 * raw_atomic64_fetch_add_release() - atomic add returning the old value, release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + @i) with release ordering.
 *
 * Uses arch_atomic64_fetch_add_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_fetch_add(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_add_release)
        return arch_atomic64_fetch_add_release(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 old;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        old = arch_atomic64_fetch_add_relaxed(i, v);

        return old;
#elif defined(arch_atomic64_fetch_add)
        return arch_atomic64_fetch_add(i, v);
#else
#error "Unable to define raw_atomic64_fetch_add_release"
#endif
}

/**
 * raw_atomic64_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Uses arch_atomic64_fetch_add_relaxed() when it is defined and
 * arch_atomic64_fetch_add() otherwise; no fence is added in either case.
 * The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_add_relaxed)
        return arch_atomic64_fetch_add_relaxed(i, v);
#elif defined(arch_atomic64_fetch_add)
        /* No relaxed variant defined: use the unsuffixed op as-is. */
        return arch_atomic64_fetch_add(i, v);
#else
#error "Unable to define raw_atomic64_fetch_add_relaxed"
#endif
}

/**
 * raw_atomic64_sub() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * This is a direct forward to arch_atomic64_sub(); there is no fallback.
 *
 * Safe to use in noinstr code; prefer atomic64_sub() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_sub(s64 i, atomic64_t *v)
{
        arch_atomic64_sub(i, v);
}

/**
 * raw_atomic64_sub_return() - atomic subtract returning the new value, full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - @i) with full ordering.
 *
 * Uses arch_atomic64_sub_return() when it is defined; otherwise the relaxed
 * op is issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither op is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_sub_return)
        return arch_atomic64_sub_return(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        s64 updated;

        __atomic_pre_full_fence();
        updated = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_post_full_fence();

        return updated;
#else
#error "Unable to define raw_atomic64_sub_return"
#endif
}

/**
 * raw_atomic64_sub_return_acquire() - atomic subtract returning the new value, acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - @i) with acquire ordering.
 *
 * Uses arch_atomic64_sub_return_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_sub_return(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_sub_return_acquire)
        return arch_atomic64_sub_return_acquire(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        s64 updated;

        /* The fence must come after the relaxed op. */
        updated = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic64_sub_return)
        return arch_atomic64_sub_return(i, v);
#else
#error "Unable to define raw_atomic64_sub_return_acquire"
#endif
}

/**
 * raw_atomic64_sub_return_release() - atomic subtract returning the new value, release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - @i) with release ordering.
 *
 * Uses arch_atomic64_sub_return_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_sub_return(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_sub_return_release)
        return arch_atomic64_sub_return_release(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        s64 updated;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        updated = arch_atomic64_sub_return_relaxed(i, v);

        return updated;
#elif defined(arch_atomic64_sub_return)
        return arch_atomic64_sub_return(i, v);
#else
#error "Unable to define raw_atomic64_sub_return_release"
#endif
}

/**
 * raw_atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Uses arch_atomic64_sub_return_relaxed() when it is defined and
 * arch_atomic64_sub_return() otherwise; no fence is added in either case.
 * The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_sub_return_relaxed)
        return arch_atomic64_sub_return_relaxed(i, v);
#elif defined(arch_atomic64_sub_return)
        /* No relaxed variant defined: use the unsuffixed op as-is. */
        return arch_atomic64_sub_return(i, v);
#else
#error "Unable to define raw_atomic64_sub_return_relaxed"
#endif
}

/**
 * raw_atomic64_fetch_sub() - atomic subtract returning the old value, full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - @i) with full ordering.
 *
 * Uses arch_atomic64_fetch_sub() when it is defined; otherwise the relaxed
 * op is issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither op is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_sub)
        return arch_atomic64_fetch_sub(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        s64 old;

        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
#error "Unable to define raw_atomic64_fetch_sub"
#endif
}

/**
 * raw_atomic64_fetch_sub_acquire() - atomic subtract returning the old value, acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - @i) with acquire ordering.
 *
 * Uses arch_atomic64_fetch_sub_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_fetch_sub(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_sub_acquire)
        return arch_atomic64_fetch_sub_acquire(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        s64 old;

        /* The fence must come after the relaxed op. */
        old = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_sub)
        return arch_atomic64_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic64_fetch_sub_acquire"
#endif
}

/**
 * raw_atomic64_fetch_sub_release() - atomic subtract returning the old value, release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - @i) with release ordering.
 *
 * Uses arch_atomic64_fetch_sub_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_fetch_sub(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_sub_release)
        return arch_atomic64_fetch_sub_release(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        s64 old;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        old = arch_atomic64_fetch_sub_relaxed(i, v);

        return old;
#elif defined(arch_atomic64_fetch_sub)
        return arch_atomic64_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic64_fetch_sub_release"
#endif
}

/**
 * raw_atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Uses arch_atomic64_fetch_sub_relaxed() when it is defined and
 * arch_atomic64_fetch_sub() otherwise; no fence is added in either case.
 * The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_sub_relaxed)
        return arch_atomic64_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic64_fetch_sub)
        /* No relaxed variant defined: use the unsuffixed op as-is. */
        return arch_atomic64_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic64_fetch_sub_relaxed"
#endif
}

/**
 * raw_atomic64_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Uses arch_atomic64_inc() when it is defined; otherwise this is
 * raw_atomic64_add() with an addend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_inc() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_inc(atomic64_t *v)
{
#if defined(arch_atomic64_inc)
        arch_atomic64_inc(v);
#else
        /* No dedicated increment op defined: add 1 instead. */
        raw_atomic64_add(1, v);
#endif
}

/**
 * raw_atomic64_inc_return() - atomic increment returning the new value, full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with full ordering.
 *
 * Uses arch_atomic64_inc_return() when it is defined; otherwise the relaxed
 * op issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(); otherwise raw_atomic64_add_return() with an
 * addend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return(atomic64_t *v)
{
#if defined(arch_atomic64_inc_return)
        return arch_atomic64_inc_return(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        s64 updated;

        __atomic_pre_full_fence();
        updated = arch_atomic64_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        return raw_atomic64_add_return(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_acquire() - atomic increment returning the new value, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with acquire ordering.
 *
 * Uses arch_atomic64_inc_return_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_inc_return(); otherwise raw_atomic64_add_return_acquire()
 * with an addend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_acquire(atomic64_t *v)
{
#if defined(arch_atomic64_inc_return_acquire)
        return arch_atomic64_inc_return_acquire(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        s64 updated;

        /* The fence must come after the relaxed op. */
        updated = arch_atomic64_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic64_inc_return)
        return arch_atomic64_inc_return(v);
#else
        return raw_atomic64_add_return_acquire(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_release() - atomic increment returning the new value, release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with release ordering.
 *
 * Uses arch_atomic64_inc_return_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_inc_return(); otherwise raw_atomic64_add_return_release()
 * with an addend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_release(atomic64_t *v)
{
#if defined(arch_atomic64_inc_return_release)
        return arch_atomic64_inc_return_release(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        s64 updated;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        updated = arch_atomic64_inc_return_relaxed(v);

        return updated;
#elif defined(arch_atomic64_inc_return)
        return arch_atomic64_inc_return(v);
#else
        return raw_atomic64_add_return_release(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Uses arch_atomic64_inc_return_relaxed() when it is defined; otherwise
 * arch_atomic64_inc_return(); otherwise raw_atomic64_add_return_relaxed()
 * with an addend of 1. No fence is added in any case.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_relaxed(atomic64_t *v)
{
#if defined(arch_atomic64_inc_return_relaxed)
        return arch_atomic64_inc_return_relaxed(v);
#elif defined(arch_atomic64_inc_return)
        /* No relaxed variant defined: use the unsuffixed op as-is. */
        return arch_atomic64_inc_return(v);
#else
        /* No dedicated increment op defined: add 1 instead. */
        return raw_atomic64_add_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc() - atomic increment returning the old value, full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with full ordering.
 *
 * Uses arch_atomic64_fetch_inc() when it is defined; otherwise the relaxed
 * op issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(); otherwise raw_atomic64_fetch_add() with an
 * addend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_inc() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_inc(atomic64_t *v)
{
#if defined(arch_atomic64_fetch_inc)
        return arch_atomic64_fetch_inc(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        s64 old;

        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_post_full_fence();

        return old;
#else
        return raw_atomic64_fetch_add(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_acquire() - atomic increment returning the old value, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with acquire ordering.
 *
 * Uses arch_atomic64_fetch_inc_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_fetch_inc(); otherwise raw_atomic64_fetch_add_acquire()
 * with an addend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_inc_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_inc_acquire(atomic64_t *v)
{
#if defined(arch_atomic64_fetch_inc_acquire)
        return arch_atomic64_fetch_inc_acquire(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        s64 old;

        /* The fence must come after the relaxed op. */
        old = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_inc)
        return arch_atomic64_fetch_inc(v);
#else
        return raw_atomic64_fetch_add_acquire(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_release() - atomic increment returning the old value, release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with release ordering.
 *
 * Uses arch_atomic64_fetch_inc_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_fetch_inc(); otherwise raw_atomic64_fetch_add_release()
 * with an addend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_inc_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_inc_release(atomic64_t *v)
{
#if defined(arch_atomic64_fetch_inc_release)
        return arch_atomic64_fetch_inc_release(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        s64 old;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        old = arch_atomic64_fetch_inc_relaxed(v);

        return old;
#elif defined(arch_atomic64_fetch_inc)
        return arch_atomic64_fetch_inc(v);
#else
        return raw_atomic64_fetch_add_release(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Uses arch_atomic64_fetch_inc_relaxed() when it is defined; otherwise
 * arch_atomic64_fetch_inc(); otherwise raw_atomic64_fetch_add_relaxed()
 * with an addend of 1. No fence is added in any case.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_inc_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_inc_relaxed(atomic64_t *v)
{
#if defined(arch_atomic64_fetch_inc_relaxed)
        return arch_atomic64_fetch_inc_relaxed(v);
#elif defined(arch_atomic64_fetch_inc)
        /* No relaxed variant defined: use the unsuffixed op as-is. */
        return arch_atomic64_fetch_inc(v);
#else
        /* No dedicated increment op defined: add 1 instead. */
        return raw_atomic64_fetch_add_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Uses arch_atomic64_dec() when it is defined; otherwise this is
 * raw_atomic64_sub() with a subtrahend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_dec() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_dec(atomic64_t *v)
{
#if defined(arch_atomic64_dec)
        arch_atomic64_dec(v);
#else
        /* No dedicated decrement op defined: subtract 1 instead. */
        raw_atomic64_sub(1, v);
#endif
}

/**
 * raw_atomic64_dec_return() - atomic decrement returning the new value, full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with full ordering.
 *
 * Uses arch_atomic64_dec_return() when it is defined; otherwise the relaxed
 * op issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(); otherwise raw_atomic64_sub_return() with a
 * subtrahend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_dec_return(atomic64_t *v)
{
#if defined(arch_atomic64_dec_return)
        return arch_atomic64_dec_return(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        s64 updated;

        __atomic_pre_full_fence();
        updated = arch_atomic64_dec_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        return raw_atomic64_sub_return(1, v);
#endif
}

/**
 * raw_atomic64_dec_return_acquire() - atomic decrement returning the new value, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with acquire ordering.
 *
 * Uses arch_atomic64_dec_return_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_dec_return(); otherwise raw_atomic64_sub_return_acquire()
 * with a subtrahend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_dec_return_acquire(atomic64_t *v)
{
#if defined(arch_atomic64_dec_return_acquire)
        return arch_atomic64_dec_return_acquire(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        s64 updated;

        /* The fence must come after the relaxed op. */
        updated = arch_atomic64_dec_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic64_dec_return)
        return arch_atomic64_dec_return(v);
#else
        return raw_atomic64_sub_return_acquire(1, v);
#endif
}

/**
 * raw_atomic64_dec_return_release() - atomic decrement returning the new value, release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with release ordering.
 *
 * Uses arch_atomic64_dec_return_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_dec_return(); otherwise raw_atomic64_sub_return_release()
 * with a subtrahend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_dec_return_release(atomic64_t *v)
{
#if defined(arch_atomic64_dec_return_release)
        return arch_atomic64_dec_return_release(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        s64 updated;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        updated = arch_atomic64_dec_return_relaxed(v);

        return updated;
#elif defined(arch_atomic64_dec_return)
        return arch_atomic64_dec_return(v);
#else
        return raw_atomic64_sub_return_release(1, v);
#endif
}

/**
 * raw_atomic64_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Uses arch_atomic64_dec_return_relaxed() when it is defined; otherwise
 * arch_atomic64_dec_return(); otherwise raw_atomic64_sub_return_relaxed()
 * with a subtrahend of 1. No fence is added in any case.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_dec_return_relaxed(atomic64_t *v)
{
#if defined(arch_atomic64_dec_return_relaxed)
        return arch_atomic64_dec_return_relaxed(v);
#elif defined(arch_atomic64_dec_return)
        /* No relaxed variant defined: use the unsuffixed op as-is. */
        return arch_atomic64_dec_return(v);
#else
        /* No dedicated decrement op defined: subtract 1 instead. */
        return raw_atomic64_sub_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_fetch_dec() - atomic decrement returning the old value, full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with full ordering.
 *
 * Uses arch_atomic64_fetch_dec() when it is defined; otherwise the relaxed
 * op issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(); otherwise raw_atomic64_fetch_sub() with a
 * subtrahend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_dec() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_dec(atomic64_t *v)
{
#if defined(arch_atomic64_fetch_dec)
        return arch_atomic64_fetch_dec(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        s64 old;

        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_post_full_fence();

        return old;
#else
        return raw_atomic64_fetch_sub(1, v);
#endif
}

/**
 * raw_atomic64_fetch_dec_acquire() - atomic decrement returning the old value, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with acquire ordering.
 *
 * Uses arch_atomic64_fetch_dec_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_fetch_dec(); otherwise raw_atomic64_fetch_sub_acquire()
 * with a subtrahend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_dec_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_dec_acquire(atomic64_t *v)
{
#if defined(arch_atomic64_fetch_dec_acquire)
        return arch_atomic64_fetch_dec_acquire(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        s64 old;

        /* The fence must come after the relaxed op. */
        old = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_dec)
        return arch_atomic64_fetch_dec(v);
#else
        return raw_atomic64_fetch_sub_acquire(1, v);
#endif
}

/**
 * raw_atomic64_fetch_dec_release() - atomic decrement returning the old value, release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with release ordering.
 *
 * Uses arch_atomic64_fetch_dec_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_fetch_dec(); otherwise raw_atomic64_fetch_sub_release()
 * with a subtrahend of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_dec_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_dec_release(atomic64_t *v)
{
#if defined(arch_atomic64_fetch_dec_release)
        return arch_atomic64_fetch_dec_release(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        s64 old;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        old = arch_atomic64_fetch_dec_relaxed(v);

        return old;
#elif defined(arch_atomic64_fetch_dec)
        return arch_atomic64_fetch_dec(v);
#else
        return raw_atomic64_fetch_sub_release(1, v);
#endif
}

/**
 * raw_atomic64_fetch_dec_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Uses arch_atomic64_fetch_dec_relaxed() when it is defined; otherwise
 * arch_atomic64_fetch_dec(); otherwise raw_atomic64_fetch_sub_relaxed()
 * with a subtrahend of 1. No fence is added in any case.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_dec_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_dec_relaxed(atomic64_t *v)
{
#if defined(arch_atomic64_fetch_dec_relaxed)
        return arch_atomic64_fetch_dec_relaxed(v);
#elif defined(arch_atomic64_fetch_dec)
        /* No relaxed variant defined: use the unsuffixed op as-is. */
        return arch_atomic64_fetch_dec(v);
#else
        /* No dedicated decrement op defined: subtract 1 instead. */
        return raw_atomic64_fetch_sub_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_and() - atomic bitwise AND with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * This is a direct forward to arch_atomic64_and(); there is no fallback.
 *
 * Safe to use in noinstr code; prefer atomic64_and() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_and(s64 i, atomic64_t *v)
{
        arch_atomic64_and(i, v);
}

/**
 * raw_atomic64_fetch_and() - atomic bitwise AND returning the old value, full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & @i) with full ordering.
 *
 * Uses arch_atomic64_fetch_and() when it is defined; otherwise the relaxed
 * op is issued between __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither op is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_and() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_and(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_and)
        return arch_atomic64_fetch_and(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        s64 old;

        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
#error "Unable to define raw_atomic64_fetch_and"
#endif
}

/**
 * raw_atomic64_fetch_and_acquire() - atomic bitwise AND returning the old value, acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & @i) with acquire ordering.
 *
 * Uses arch_atomic64_fetch_and_acquire() when it is defined; otherwise the
 * relaxed op followed by __atomic_acquire_fence(); otherwise
 * arch_atomic64_fetch_and(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_and_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_and_acquire)
        return arch_atomic64_fetch_and_acquire(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        s64 old;

        /* The fence must come after the relaxed op. */
        old = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_and)
        return arch_atomic64_fetch_and(i, v);
#else
#error "Unable to define raw_atomic64_fetch_and_acquire"
#endif
}

/**
 * raw_atomic64_fetch_and_release() - atomic bitwise AND returning the old value, release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & @i) with release ordering.
 *
 * Uses arch_atomic64_fetch_and_release() when it is defined; otherwise
 * __atomic_release_fence() followed by the relaxed op; otherwise
 * arch_atomic64_fetch_and(). The build fails if none of these is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_and_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_and_release)
        return arch_atomic64_fetch_and_release(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        s64 old;

        /* The fence must come before the relaxed op. */
        __atomic_release_fence();
        old = arch_atomic64_fetch_and_relaxed(i, v);

        return old;
#elif defined(arch_atomic64_fetch_and)
        return arch_atomic64_fetch_and(i, v);
#else
#error "Unable to define raw_atomic64_fetch_and_release"
#endif
}

/**
 * raw_atomic64_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering
 * @i: s64 mask to AND into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i); only relaxed ordering is requested.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_and_relaxed() elsewhere.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_and_relaxed
        s64 old = arch_atomic64_fetch_and_relaxed(i, v);
#elif defined(arch_atomic64_fetch_and)
        /* No relaxed variant is defined: the unsuffixed op stands in. */
        s64 old = arch_atomic64_fetch_and(i, v);
#else
#error "Unable to define raw_atomic64_fetch_and_relaxed"
#endif

        return old;
}

/**
 * raw_atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: s64 mask of the bits to clear in @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i); only relaxed ordering is requested.
 *
 * Safe to use in noinstr code; prefer atomic64_andnot() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_andnot(s64 i, atomic64_t *v)
{
#ifndef arch_atomic64_andnot
        /* No dedicated AND NOT op: AND with the complement of @i instead. */
        raw_atomic64_and(~i, v);
#else
        arch_atomic64_andnot(i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot() - atomic bitwise AND NOT with full ordering
 * @i: s64 mask of the bits to clear in @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_andnot() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_andnot)
        /* A fully ordered variant is defined: use it directly. */
        return arch_atomic64_fetch_andnot(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* Build full ordering by bracketing the relaxed op with full fences. */
        s64 ret;
        __atomic_pre_full_fence();
        ret = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();
        return ret;
#else
        /* No AND NOT op at all: AND with the complement of @i instead. */
        return raw_atomic64_fetch_and(~i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering
 * @i: s64 mask of the bits to clear in @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_andnot_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_andnot_acquire)
        /* A dedicated acquire variant is defined: use it directly. */
        return arch_atomic64_fetch_andnot_acquire(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* Build acquire from the relaxed op followed by an acquire fence. */
        s64 ret = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_fetch_andnot)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_fetch_andnot(i, v);
#else
        /* No AND NOT op at all: AND with the complement of @i instead. */
        return raw_atomic64_fetch_and_acquire(~i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot_release() - atomic bitwise AND NOT with release ordering
 * @i: s64 mask of the bits to clear in @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_andnot_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_andnot_release)
        /* A dedicated release variant is defined: use it directly. */
        return arch_atomic64_fetch_andnot_release(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* Build release from a release fence followed by the relaxed op. */
        __atomic_release_fence();
        return arch_atomic64_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic64_fetch_andnot)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_fetch_andnot(i, v);
#else
        /* No AND NOT op at all: AND with the complement of @i instead. */
        return raw_atomic64_fetch_and_release(~i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering
 * @i: s64 mask of the bits to clear in @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i); only relaxed ordering is requested.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_andnot_relaxed() elsewhere.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_andnot_relaxed
        s64 old = arch_atomic64_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic64_fetch_andnot)
        /* No relaxed variant is defined: the unsuffixed op stands in. */
        s64 old = arch_atomic64_fetch_andnot(i, v);
#else
        /* No AND NOT op at all: AND with the complement of @i instead. */
        s64 old = raw_atomic64_fetch_and_relaxed(~i, v);
#endif

        return old;
}

/**
 * raw_atomic64_or() - atomic bitwise OR with relaxed ordering
 * @i: s64 value to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_or() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_or(s64 i, atomic64_t *v)
{
        /* No fallback here: arch_atomic64_or() is called unconditionally. */
        arch_atomic64_or(i, v);
}

/**
 * raw_atomic64_fetch_or() - atomic bitwise OR with full ordering
 * @i: s64 value to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_or() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_or(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_or)
        /* A fully ordered variant is defined: use it directly. */
        return arch_atomic64_fetch_or(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* Build full ordering by bracketing the relaxed op with full fences. */
        s64 ret;
        __atomic_pre_full_fence();
        ret = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();
        return ret;
#else
        /* Neither building block exists: refuse to build. */
#error "Unable to define raw_atomic64_fetch_or"
#endif
}

/**
 * raw_atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: s64 value to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_or_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_or_acquire)
        /* A dedicated acquire variant is defined: use it directly. */
        return arch_atomic64_fetch_or_acquire(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* Build acquire from the relaxed op followed by an acquire fence. */
        s64 ret = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_fetch_or)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_fetch_or(i, v);
#else
        /* None of the three building blocks exists: refuse to build. */
#error "Unable to define raw_atomic64_fetch_or_acquire"
#endif
}

/**
 * raw_atomic64_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: s64 value to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_or_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_or_release)
        /* A dedicated release variant is defined: use it directly. */
        return arch_atomic64_fetch_or_release(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* Build release from a release fence followed by the relaxed op. */
        __atomic_release_fence();
        return arch_atomic64_fetch_or_relaxed(i, v);
#elif defined(arch_atomic64_fetch_or)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_fetch_or(i, v);
#else
        /* None of the three building blocks exists: refuse to build. */
#error "Unable to define raw_atomic64_fetch_or_release"
#endif
}

/**
 * raw_atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: s64 value to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v | @i); only relaxed ordering is requested.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_or_relaxed() elsewhere.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_or_relaxed
        s64 old = arch_atomic64_fetch_or_relaxed(i, v);
#elif defined(arch_atomic64_fetch_or)
        /* No relaxed variant is defined: the unsuffixed op stands in. */
        s64 old = arch_atomic64_fetch_or(i, v);
#else
#error "Unable to define raw_atomic64_fetch_or_relaxed"
#endif

        return old;
}

/**
 * raw_atomic64_xor() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_xor(s64 i, atomic64_t *v)
{
        /* No fallback here: arch_atomic64_xor() is called unconditionally. */
        arch_atomic64_xor(i, v);
}

/**
 * raw_atomic64_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: s64 value to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_xor)
        /* A fully ordered variant is defined: use it directly. */
        return arch_atomic64_fetch_xor(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* Build full ordering by bracketing the relaxed op with full fences. */
        s64 ret;
        __atomic_pre_full_fence();
        ret = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();
        return ret;
#else
        /* Neither building block exists: refuse to build. */
#error "Unable to define raw_atomic64_fetch_xor"
#endif
}

/**
 * raw_atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: s64 value to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_xor_acquire)
        /* A dedicated acquire variant is defined: use it directly. */
        return arch_atomic64_fetch_xor_acquire(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* Build acquire from the relaxed op followed by an acquire fence. */
        s64 ret = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_fetch_xor)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_fetch_xor(i, v);
#else
        /* None of the three building blocks exists: refuse to build. */
#error "Unable to define raw_atomic64_fetch_xor_acquire"
#endif
}

/**
 * raw_atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: s64 value to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_xor_release)
        /* A dedicated release variant is defined: use it directly. */
        return arch_atomic64_fetch_xor_release(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* Build release from a release fence followed by the relaxed op. */
        __atomic_release_fence();
        return arch_atomic64_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic64_fetch_xor)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_fetch_xor(i, v);
#else
        /* None of the three building blocks exists: refuse to build. */
#error "Unable to define raw_atomic64_fetch_xor_release"
#endif
}

/**
 * raw_atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v ^ @i); only relaxed ordering is requested.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor_relaxed() elsewhere.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_xor_relaxed
        s64 old = arch_atomic64_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic64_fetch_xor)
        /* No relaxed variant is defined: the unsuffixed op stands in. */
        s64 old = arch_atomic64_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic64_fetch_xor_relaxed"
#endif

        return old;
}

/**
 * raw_atomic64_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_xchg(atomic64_t *v, s64 new)
{
#if defined(arch_atomic64_xchg)
        /* A fully ordered variant is defined: use it directly. */
        return arch_atomic64_xchg(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* Build full ordering by bracketing the relaxed op with full fences. */
        s64 ret;
        __atomic_pre_full_fence();
        ret = arch_atomic64_xchg_relaxed(v, new);
        __atomic_post_full_fence();
        return ret;
#else
        /* No atomic64-specific xchg: apply raw_xchg() to the counter field. */
        return raw_xchg(&v->counter, new);
#endif
}

/**
 * raw_atomic64_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_xchg_acquire(atomic64_t *v, s64 new)
{
#if defined(arch_atomic64_xchg_acquire)
        /* A dedicated acquire variant is defined: use it directly. */
        return arch_atomic64_xchg_acquire(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* Build acquire from the relaxed op followed by an acquire fence. */
        s64 ret = arch_atomic64_xchg_relaxed(v, new);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_xchg)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_xchg(v, new);
#else
        /* No atomic64-specific xchg: apply raw_xchg_acquire() to the counter. */
        return raw_xchg_acquire(&v->counter, new);
#endif
}

/**
 * raw_atomic64_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_xchg_release(atomic64_t *v, s64 new)
{
#if defined(arch_atomic64_xchg_release)
        /* A dedicated release variant is defined: use it directly. */
        return arch_atomic64_xchg_release(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* Build release from a release fence followed by the relaxed op. */
        __atomic_release_fence();
        return arch_atomic64_xchg_relaxed(v, new);
#elif defined(arch_atomic64_xchg)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_xchg(v, new);
#else
        /* No atomic64-specific xchg: apply raw_xchg_release() to the counter. */
        return raw_xchg_release(&v->counter, new);
#endif
}

/**
 * raw_atomic64_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to store in @v
 *
 * Atomically replaces the contents of @v with @new; only relaxed ordering
 * is requested.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_relaxed() elsewhere.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new)
{
#ifdef arch_atomic64_xchg_relaxed
        s64 prev = arch_atomic64_xchg_relaxed(v, new);
#elif defined(arch_atomic64_xchg)
        /* No relaxed variant is defined: the unsuffixed op stands in. */
        s64 prev = arch_atomic64_xchg(v, new);
#else
        /* No atomic64-specific xchg: apply raw_xchg_relaxed() to the counter. */
        s64 prev = raw_xchg_relaxed(&v->counter, new);
#endif

        return prev;
}

/**
 * raw_atomic64_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
#if defined(arch_atomic64_cmpxchg)
        /* A fully ordered variant is defined: use it directly. */
        return arch_atomic64_cmpxchg(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* Build full ordering by bracketing the relaxed op with full fences. */
        s64 ret;
        __atomic_pre_full_fence();
        ret = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
        return ret;
#else
        /* No atomic64-specific cmpxchg: apply raw_cmpxchg() to the counter. */
        return raw_cmpxchg(&v->counter, old, new);
#endif
}

/**
 * raw_atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
#if defined(arch_atomic64_cmpxchg_acquire)
        /* A dedicated acquire variant is defined: use it directly. */
        return arch_atomic64_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* Build acquire from the relaxed op followed by an acquire fence. */
        s64 ret = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_cmpxchg)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_cmpxchg(v, old, new);
#else
        /* No atomic64-specific cmpxchg: use raw_cmpxchg_acquire() on the counter. */
        return raw_cmpxchg_acquire(&v->counter, old, new);
#endif
}

/**
 * raw_atomic64_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
#if defined(arch_atomic64_cmpxchg_release)
        /* A dedicated release variant is defined: use it directly. */
        return arch_atomic64_cmpxchg_release(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* Build release from a release fence followed by the relaxed op. */
        __atomic_release_fence();
        return arch_atomic64_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_cmpxchg)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_cmpxchg(v, old, new);
#else
        /* No atomic64-specific cmpxchg: use raw_cmpxchg_release() on the counter. */
        return raw_cmpxchg_release(&v->counter, old, new);
#endif
}

/**
 * raw_atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When @v equals @old, @v is atomically set to @new with relaxed ordering.
 * When it does not, @v is left untouched and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
{
#ifdef arch_atomic64_cmpxchg_relaxed
        s64 prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_cmpxchg)
        /* No relaxed variant is defined: the unsuffixed op stands in. */
        s64 prev = arch_atomic64_cmpxchg(v, old, new);
#else
        /* No atomic64-specific cmpxchg: use raw_cmpxchg_relaxed() on the counter. */
        s64 prev = raw_cmpxchg_relaxed(&v->counter, old, new);
#endif

        return prev;
}

/**
 * raw_atomic64_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When @v equals *@old, @v is atomically set to @new with full ordering.
 * When it does not, @v is left untouched, the value found in @v is written
 * back through @old, and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
#ifdef arch_atomic64_try_cmpxchg
        return arch_atomic64_try_cmpxchg(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        bool swapped;

        /* Relaxed op bracketed by the pre/post full fences. */
        __atomic_pre_full_fence();
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
        return swapped;
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg(v, expected, new);

        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When @v equals *@old, @v is atomically set to @new with acquire ordering.
 * When it does not, @v is left untouched, the value found in @v is written
 * back through @old, and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
#ifdef arch_atomic64_try_cmpxchg_acquire
        return arch_atomic64_try_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        bool swapped;

        /* Relaxed op first, acquire fence after it. */
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
        return swapped;
#elif defined(arch_atomic64_try_cmpxchg)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_acquire(v, expected, new);

        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When @v equals *@old, @v is atomically set to @new with release ordering.
 * When it does not, @v is left untouched, the value found in @v is written
 * back through @old, and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
#ifdef arch_atomic64_try_cmpxchg_release
        return arch_atomic64_try_cmpxchg_release(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        return arch_atomic64_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_release(v, expected, new);

        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When @v equals *@old, @v is atomically set to @new with relaxed ordering.
 * When it does not, @v is left untouched, the value found in @v is written
 * back through @old, and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
#ifdef arch_atomic64_try_cmpxchg_relaxed
        return arch_atomic64_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg)
        /* No relaxed variant is defined: the unsuffixed op stands in. */
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_relaxed(v, expected, new);

        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i) with full ordering and reports
 * whether the result is zero.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
#ifndef arch_atomic64_sub_and_test
        /* Derive the zero test from raw_atomic64_sub_return(). */
        return !raw_atomic64_sub_return(i, v);
#else
        return arch_atomic64_sub_and_test(i, v);
#endif
}

/**
 * raw_atomic64_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1) with full ordering and reports
 * whether the result is zero.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_dec_and_test(atomic64_t *v)
{
#ifndef arch_atomic64_dec_and_test
        /* Derive the zero test from raw_atomic64_dec_return(). */
        return !raw_atomic64_dec_return(v);
#else
        return arch_atomic64_dec_and_test(v);
#endif
}

/**
 * raw_atomic64_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1) with full ordering and reports
 * whether the result is zero.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_and_test(atomic64_t *v)
{
#ifndef arch_atomic64_inc_and_test
        /* Derive the zero test from raw_atomic64_inc_return(). */
        return !raw_atomic64_inc_return(v);
#else
        return arch_atomic64_inc_and_test(v);
#endif
}

/**
 * raw_atomic64_add_negative() - atomic add and test if negative with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative)
        /* A fully ordered variant is defined: use it directly. */
        return arch_atomic64_add_negative(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        /* Build full ordering by bracketing the relaxed op with full fences. */
        bool ret;
        __atomic_pre_full_fence();
        ret = arch_atomic64_add_negative_relaxed(i, v);
        __atomic_post_full_fence();
        return ret;
#else
        /* No add_negative op: test the sign of raw_atomic64_add_return(). */
        return raw_atomic64_add_return(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative_acquire)
        /* A dedicated acquire variant is defined: use it directly. */
        return arch_atomic64_add_negative_acquire(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        /* Build acquire from the relaxed op followed by an acquire fence. */
        bool ret = arch_atomic64_add_negative_relaxed(i, v);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_add_negative)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_add_negative(i, v);
#else
        /* No add_negative op: test the sign of the acquire add-and-return. */
        return raw_atomic64_add_return_acquire(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_release() - atomic add and test if negative with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative_release)
        /* A dedicated release variant is defined: use it directly. */
        return arch_atomic64_add_negative_release(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        /* Build release from a release fence followed by the relaxed op. */
        __atomic_release_fence();
        return arch_atomic64_add_negative_relaxed(i, v);
#elif defined(arch_atomic64_add_negative)
        /* Only the unsuffixed (fully ordered) op is defined: use it as-is. */
        return arch_atomic64_add_negative(i, v);
#else
        /* No add_negative op: test the sign of the release add-and-return. */
        return raw_atomic64_add_return_release(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i) with relaxed ordering and reports
 * whether the result is negative.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_negative_relaxed
        bool negative = arch_atomic64_add_negative_relaxed(i, v);
#elif defined(arch_atomic64_add_negative)
        /* No relaxed variant is defined: the unsuffixed op stands in. */
        bool negative = arch_atomic64_add_negative(i, v);
#else
        /* No add_negative op: test the sign of the relaxed add-and-return. */
        bool negative = raw_atomic64_add_return_relaxed(i, v) < 0;
#endif

        return negative;
}

/**
 * raw_atomic64_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value that blocks the addition when @v equals it
 *
 * When @v differs from @u, @v is atomically set to (@v + @a) with full
 * ordering. When @v equals @u, @v is left untouched and relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
#ifdef arch_atomic64_fetch_add_unless
        return arch_atomic64_fetch_add_unless(v, a, u);
#else
        s64 cur = raw_atomic64_read(v);

        /*
         * Retry until the compare-and-swap lands or the forbidden value is
         * seen; a failed try_cmpxchg refreshes cur with the current @v.
         */
        while (likely(cur != u)) {
                if (raw_atomic64_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
#endif
}

/**
 * raw_atomic64_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value that blocks the addition when @v equals it
 *
 * When @v differs from @u, @v is atomically set to (@v + @a) with full
 * ordering. When @v equals @u, @v is left untouched and relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
#ifndef arch_atomic64_add_unless
        /* @v was updated exactly when its prior value differed from @u. */
        const s64 prev = raw_atomic64_fetch_add_unless(v, a, u);

        return prev != u;
#else
        return arch_atomic64_add_unless(v, a, u);
#endif
}

/**
 * raw_atomic64_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic64_t
 *
 * When @v is non-zero, @v is atomically set to (@v + 1) with full ordering.
 * When @v is zero, it is left untouched and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_not_zero(atomic64_t *v)
{
#ifndef arch_atomic64_inc_not_zero
        /* "Increment unless zero" is add_unless with a == 1 and u == 0. */
        return raw_atomic64_add_unless(v, 1, 0);
#else
        return arch_atomic64_inc_not_zero(v);
#endif
}

/**
 * raw_atomic64_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic64_t
 *
 * When @v is zero or positive, @v is atomically set to (@v + 1) with full
 * ordering. When @v is negative, it is left untouched and relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_unless_negative(atomic64_t *v)
{
#ifdef arch_atomic64_inc_unless_negative
        return arch_atomic64_inc_unless_negative(v);
#else
        s64 cur = raw_atomic64_read(v);

        /* A failed try_cmpxchg refreshes cur, so each pass re-checks the sign. */
        for (;;) {
                if (unlikely(cur < 0))
                        return false;
                if (raw_atomic64_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic64_t
 *
 * When @v is zero or negative, @v is atomically set to (@v - 1) with full
 * ordering. When @v is positive, it is left untouched and relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_dec_unless_positive(atomic64_t *v)
{
#ifdef arch_atomic64_dec_unless_positive
        return arch_atomic64_dec_unless_positive(v);
#else
        s64 cur = raw_atomic64_read(v);

        /* A failed try_cmpxchg refreshes cur, so each pass re-checks the sign. */
        for (;;) {
                if (unlikely(cur > 0))
                        return false;
                if (raw_atomic64_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic64_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic64_t
 *
 * When @v is strictly positive, @v is atomically set to (@v - 1) with full
 * ordering. Otherwise @v is left untouched and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere.
 *
 * Return: The last observed value of @v minus one, whether or not @v was
 * updated.
 */
static __always_inline s64
raw_atomic64_dec_if_positive(atomic64_t *v)
{
#ifdef arch_atomic64_dec_if_positive
        return arch_atomic64_dec_if_positive(v);
#else
        s64 cur = raw_atomic64_read(v);
        s64 next;

        /* A failed try_cmpxchg refreshes cur, so next is recomputed each pass. */
        for (;;) {
                next = cur - 1;
                if (unlikely(next < 0))
                        return next;
                if (raw_atomic64_try_cmpxchg(v, &cur, next))
                        return next;
        }
#endif
}

#endif /* _LINUX_ATOMIC_FALLBACK_H */
// b565db590afeeff0d7c9485ccbca5bb6e155749f























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_LOCAL_H
#define _ASM_X86_LOCAL_H

#include <linux/percpu.h>

#include <linux/atomic.h>
#include <asm/asm.h>

/* Counter type used by the local_*() helpers in this header; wraps an atomic_long_t. */
typedef struct {
        atomic_long_t a;
} local_t;

/* Static initializer for a local_t holding the value @i. */
#define LOCAL_INIT(i)        { ATOMIC_LONG_INIT(i) }

/* Read / assign the wrapped counter through the atomic_long accessors. */
#define local_read(l)        atomic_long_read(&(l)->a)
#define local_set(l, i)        atomic_long_set(&(l)->a, (i))

/*
 * Increment the counter in @l by one with a single _ASM_INC on its memory
 * operand ("+m": the operand is both read and written).
 */
static inline void local_inc(local_t *l)
{
        asm volatile(_ASM_INC "%0"
                     : "+m" (l->a.counter));
}

/*
 * Decrement the counter in @l by one with a single _ASM_DEC on its memory
 * operand ("+m": the operand is both read and written).
 */
static inline void local_dec(local_t *l)
{
        asm volatile(_ASM_DEC "%0"
                     : "+m" (l->a.counter));
}

/*
 * Add @i to the counter in @l with a single _ASM_ADD.  The "ir" constraint
 * lets the compiler pass @i as either an immediate or a register.
 */
static inline void local_add(long i, local_t *l)
{
        asm volatile(_ASM_ADD "%1,%0"
                     : "+m" (l->a.counter)
                     : "ir" (i));
}

/*
 * Subtract @i from the counter in @l with a single _ASM_SUB.  The "ir"
 * constraint lets the compiler pass @i as either an immediate or a register.
 */
static inline void local_sub(long i, local_t *l)
{
        asm volatile(_ASM_SUB "%1,%0"
                     : "+m" (l->a.counter)
                     : "ir" (i));
}

/**
 * local_sub_and_test - subtract value from variable and test result
 * @i: integer value to subtract
 * @l: pointer to type local_t
 *
 * Atomically subtracts @i from @l.
 *
 * Return: true if the result is zero, false in all other cases.
 */
static inline bool local_sub_and_test(long i, local_t *l)
{
        /* "e" is the condition code evaluated after the _ASM_SUB. */
        return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i);
}

/**
 * local_dec_and_test - decrement and test
 * @l: pointer to type local_t
 *
 * Atomically decrements @l by 1.
 *
 * Return: true if the result is 0, false in all other cases.
 */
static inline bool local_dec_and_test(local_t *l)
{
        /* "e" is the condition code evaluated after the _ASM_DEC. */
        return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e);
}

/**
 * local_inc_and_test - increment and test
 * @l: pointer to type local_t
 *
 * Atomically increments @l by 1.
 *
 * Return: true if the result is zero, false in all other cases.
 */
static inline bool local_inc_and_test(local_t *l)
{
        /* "e" is the condition code evaluated after the _ASM_INC. */
        return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e);
}

/**
 * local_add_negative - add and test if negative
 * @i: integer value to add
 * @l: pointer to type local_t
 *
 * Atomically adds @i to @l.
 *
 * Return: true if the result is negative, false when the result is
 * greater than or equal to zero.
 */
static inline bool local_add_negative(long i, local_t *l)
{
        /* "s" is the condition code evaluated after the _ASM_ADD. */
        return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i);
}

/**
 * local_add_return - add and return
 * @i: integer value to add
 * @l: pointer to type local_t
 *
 * Atomically adds @i to @l and returns @i + @l
 */
static inline long local_add_return(long i, local_t *l)
{
        /* Keep the addend: the asm below overwrites @i ("+r"). */
        long __i = i;
        asm volatile(_ASM_XADD "%0, %1;"
                     : "+r" (i), "+m" (l->a.counter)
                     : : "memory");
        /* XADD leaves the previous counter value in @i; add the addend back. */
        return i + __i;
}

/* Subtract @i from @l by adding its negation; returns what local_add_return() returns. */
static inline long local_sub_return(long i, local_t *l)
{
        return local_add_return(-i, l);
}

/* Add / subtract one and return the result of the underlying *_return helper. */
#define local_inc_return(l)  (local_add_return(1, l))
#define local_dec_return(l)  (local_sub_return(1, l))

/* Forward to cmpxchg_local() on the counter of @l and return its result. */
static inline long local_cmpxchg(local_t *l, long old, long new)
{
        return cmpxchg_local(&l->a.counter, old, new);
}

/*
 * Forward to try_cmpxchg_local() on the counter of @l.  @old is cast to a
 * pointer to the counter's own type so both pointer arguments agree.
 */
static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
{
        return try_cmpxchg_local(&l->a.counter,
                                 (typeof(l->a.counter) *) old, new);
}

/*
 * local_xchg is built from CMPXCHG without the LOCK prefix rather than
 * from XCHG: XCHG carries an implied LOCK prefix, which makes it expensive
 * and prevents the processor from prefetching cachelines.
 *
 * Stores @n into @l and returns the value that was replaced.
 */
static __always_inline long
local_xchg(local_t *l, long n)
{
        long old = local_read(l);

        /* Spin until the compare-exchange against @old goes through. */
        while (!local_try_cmpxchg(l, &old, n))
                ;

        return old;
}

/**
 * local_add_unless - add unless the number is already a given value
 * @l: pointer of type local_t
 * @a: the amount to add to l...
 * @u: ...unless l is equal to u.
 *
 * Atomically adds @a to @l, if @l was not already @u.
 * Returns true if the addition was done.
 */
static __always_inline bool
local_add_unless(local_t *l, long a, long u)
{
        long cur = local_read(l);

        for (;;) {
                /* The counter holds the excluded value: do nothing. */
                if (unlikely(cur == u))
                        return false;
                /* Retry from @cur whenever the exchange does not succeed. */
                if (local_try_cmpxchg(l, &cur, cur + a))
                        return true;
        }
}

/* Increment @l unless it is zero; evaluates to the result of local_add_unless(). */
#define local_inc_not_zero(l) local_add_unless((l), 1, 0)

/*
 * On x86_32, these are no better than the atomic variants.
 * On x86-64 these are better than the atomic variants on SMP kernels
 * because they don't use a lock prefix.
 *
 * Each __local_*() name is a plain alias for the matching local_*() helper.
 */
#define __local_inc(l)                local_inc(l)
#define __local_dec(l)                local_dec(l)
#define __local_add(i, l)        local_add((i), (l))
#define __local_sub(i, l)        local_sub((i), (l))

#endif /* _ASM_X86_LOCAL_H */












  315 








































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <net/net_namespace.h>
#include <linux/if_arp.h>
#include <net/rtnetlink.h>

/*
 * Transmit hook: account the frame length in the device lstats, free the
 * skb and report success.
 */
static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev)
{
        /* Read the length before the skb is freed. */
        const unsigned int len = skb->len;

        dev_lstats_add(dev, len);
        dev_kfree_skb(skb);

        return NETDEV_TX_OK;
}

/* Per-device private data: the netlink tap registered while the device is open. */
struct nlmon {
        struct netlink_tap nt;
};

static int nlmon_open(struct net_device *dev)
{
        struct nlmon *nlmon = netdev_priv(dev);

        nlmon->nt.dev = dev;
        nlmon->nt.module = THIS_MODULE;
        return netlink_add_tap(&nlmon->nt);
}

/* ndo_stop: unregister the tap; returns the result of netlink_remove_tap(). */
static int nlmon_close(struct net_device *dev)
{
        struct nlmon *priv = netdev_priv(dev);
        struct netlink_tap *tap = &priv->nt;

        return netlink_remove_tap(tap);
}

/* ndo_get_stats64: report the device lstats as the RX packet and byte counters. */
static void
nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes);
}

/* ethtool get_link callback: unconditionally reports 1. */
static u32 always_on(struct net_device *dev)
{
        return 1;
}

/* ethtool operations: only get_link is provided. */
static const struct ethtool_ops nlmon_ethtool_ops = {
        .get_link = always_on,
};

/* net_device callbacks wiring up the open/stop/xmit/stats handlers above. */
static const struct net_device_ops nlmon_ops = {
        .ndo_open = nlmon_open,
        .ndo_stop = nlmon_close,
        .ndo_start_xmit = nlmon_xmit,
        .ndo_get_stats64 = nlmon_get_stats64,
};

/* rtnl_link_ops .setup callback: initialise a freshly allocated nlmon device. */
static void nlmon_setup(struct net_device *dev)
{
        /* Callback tables and teardown policy. */
        dev->netdev_ops = &nlmon_ops;
        dev->ethtool_ops = &nlmon_ethtool_ops;
        dev->needs_free_netdev = true;

        /* Link type, flags and offload features. */
        dev->type = ARPHRD_NETLINK;
        dev->flags = IFF_NOARP;
        dev->priv_flags |= IFF_NO_QUEUE;
        dev->lltx = true;
        dev->features = NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | NETIF_F_SG;
        dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS;

        /*
         * The MTU is only a soft limit that can be changed: it is not a
         * real MTU, just the size expected in most cases.
         */
        dev->min_mtu = sizeof(struct nlmsghdr);
        dev->mtu = NLMSG_GOODSIZE;
}

/*
 * rtnl_link_ops .validate callback: reject requests carrying an
 * IFLA_ADDRESS attribute with -EINVAL, accept everything else.
 */
static int nlmon_validate(struct nlattr *tb[], struct nlattr *data[],
                          struct netlink_ext_ack *extack)
{
        return tb[IFLA_ADDRESS] ? -EINVAL : 0;
}

/* rtnetlink link type "nlmon": private data size plus setup/validate hooks. */
static struct rtnl_link_ops nlmon_link_ops __read_mostly = {
        .kind                        = "nlmon",
        .priv_size                = sizeof(struct nlmon),
        .setup                        = nlmon_setup,
        .validate                = nlmon_validate,
};

/* Module init: register the link type; returns rtnl_link_register()'s result. */
static __init int nlmon_register(void)
{
        return rtnl_link_register(&nlmon_link_ops);
}

/* Module exit: unregister the link type registered by nlmon_register(). */
static __exit void nlmon_unregister(void)
{
        rtnl_link_unregister(&nlmon_link_ops);
}

/* Module entry/exit points. */
module_init(nlmon_register);
module_exit(nlmon_unregister);

/* Module metadata; the rtnl link alias names the "nlmon" kind. */
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>");
MODULE_DESCRIPTION("Netlink monitoring device");
MODULE_ALIAS_RTNL_LINK("nlmon");











































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pm_qos.h>

/*
 * One-time initialisation of the PM fields shared by all configurations:
 * the power lock and the QoS pointer.  The early_init flag makes repeated
 * calls a no-op.
 */
static inline void device_pm_init_common(struct device *dev)
{
        /* Already initialised by an earlier call. */
        if (dev->power.early_init)
                return;

        spin_lock_init(&dev->power.lock);
        dev->power.qos = NULL;
        dev->power.early_init = true;
}

#ifdef CONFIG_PM

/*
 * CONFIG_PM variant: set the runtime PM disable depth to 1, then run the
 * common early initialisation.
 */
static inline void pm_runtime_early_init(struct device *dev)
{
        dev->power.disable_depth = 1;
        device_pm_init_common(dev);
}

/* Runtime PM entry points; declared here, implemented outside this header. */
extern void pm_runtime_init(struct device *dev);
extern void pm_runtime_reinit(struct device *dev);
extern void pm_runtime_remove(struct device *dev);
extern u64 pm_runtime_active_time(struct device *dev);

/*
 * Bit flags for struct wake_irq.status (presumably; confirm against users).
 * WAKE_IRQ_DEDICATED_MASK groups bits 0-2; WAKE_IRQ_DEDICATED_ENABLED (bit 3)
 * is deliberately not part of the mask.
 */
#define WAKE_IRQ_DEDICATED_ALLOCATED        BIT(0)
#define WAKE_IRQ_DEDICATED_MANAGED        BIT(1)
#define WAKE_IRQ_DEDICATED_REVERSE        BIT(2)
#define WAKE_IRQ_DEDICATED_MASK                (WAKE_IRQ_DEDICATED_ALLOCATED | \
                                         WAKE_IRQ_DEDICATED_MANAGED | \
                                         WAKE_IRQ_DEDICATED_REVERSE)
#define WAKE_IRQ_DEDICATED_ENABLED        BIT(3)

/* Description of a wake-up interrupt attached to a device. */
struct wake_irq {
        struct device *dev;        /* device this wake IRQ belongs to */
        unsigned int status;        /* presumably WAKE_IRQ_DEDICATED_* bits; confirm */
        int irq;                /* interrupt number */
        const char *name;        /* name string for the interrupt */
};

/* Wake IRQ helpers; declared here, implemented outside this header. */
extern void dev_pm_arm_wake_irq(struct wake_irq *wirq);
extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq);
extern void dev_pm_enable_wake_irq_check(struct device *dev,
                                         bool can_change_status);
extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable);
extern void dev_pm_enable_wake_irq_complete(struct device *dev);

#ifdef CONFIG_PM_SLEEP

/* Wakeup/wake-IRQ glue; implemented outside this header. */
extern void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq);
extern void device_wakeup_detach_irq(struct device *dev);
extern void device_wakeup_arm_wake_irqs(void);
extern void device_wakeup_disarm_wake_irqs(void);

#else

/* Without CONFIG_PM_SLEEP attaching and detaching a wake IRQ are no-ops. */
static inline void device_wakeup_attach_irq(struct device *dev,
                                            struct wake_irq *wakeirq) {}

static inline void device_wakeup_detach_irq(struct device *dev)
{
}

#endif /* CONFIG_PM_SLEEP */

/*
 * sysfs.c
 *
 * Add/remove helpers for the PM-related sysfs attributes of a device.
 */

extern int dpm_sysfs_add(struct device *dev);
extern void dpm_sysfs_remove(struct device *dev);
extern void rpm_sysfs_remove(struct device *dev);
extern int wakeup_sysfs_add(struct device *dev);
extern void wakeup_sysfs_remove(struct device *dev);
extern int pm_qos_sysfs_add_resume_latency(struct device *dev);
extern void pm_qos_sysfs_remove_resume_latency(struct device *dev);
extern int pm_qos_sysfs_add_flags(struct device *dev);
extern void pm_qos_sysfs_remove_flags(struct device *dev);
extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev);
extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev);
extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);

#else /* CONFIG_PM */

/* !CONFIG_PM variant: only the common early initialisation is performed. */
static inline void pm_runtime_early_init(struct device *dev)
{
        device_pm_init_common(dev);
}

/* !CONFIG_PM stubs: the runtime PM hooks do nothing. */
static inline void pm_runtime_init(struct device *dev) {}
static inline void pm_runtime_reinit(struct device *dev) {}
static inline void pm_runtime_remove(struct device *dev) {}

/* !CONFIG_PM stubs: the sysfs helpers do nothing and report success (0). */
static inline int dpm_sysfs_add(struct device *dev) { return 0; }
static inline void dpm_sysfs_remove(struct device *dev) {}
static inline int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid,
                                         kgid_t kgid) { return 0; }

#endif

#ifdef CONFIG_PM_SLEEP

/* Defined in kernel/power/main.c */
extern int pm_async_enabled;

/* Defined in drivers/base/power/main.c */
extern struct list_head dpm_list;        /* The active device list */

/* Map a list node embedded as power.entry back to its containing struct device. */
static inline struct device *to_device(struct list_head *entry)
{
        return container_of(entry, struct device, power.entry);
}

/* Sleep-PM device bookkeeping; declared here, implemented outside this header. */
extern void device_pm_sleep_init(struct device *dev);
extern void device_pm_add(struct device *);
extern void device_pm_remove(struct device *);
extern void device_pm_move_before(struct device *, struct device *);
extern void device_pm_move_after(struct device *, struct device *);
extern void device_pm_move_last(struct device *);
extern void device_pm_check_callbacks(struct device *dev);

/* CONFIG_PM_SLEEP variant: report the device's power.in_dpm_list flag. */
static inline bool device_pm_initialized(struct device *dev)
{
        return dev->power.in_dpm_list;
}

/* Defined in drivers/base/power/wakeup_stats.c */
extern int wakeup_source_sysfs_add(struct device *parent,
                                   struct wakeup_source *ws);
extern void wakeup_source_sysfs_remove(struct wakeup_source *ws);

extern int pm_wakeup_source_sysfs_add(struct device *parent);

#else /* !CONFIG_PM_SLEEP */

/* !CONFIG_PM_SLEEP stubs: sleep-PM bookkeeping is compiled out. */
static inline void device_pm_sleep_init(struct device *dev) {}

static inline void device_pm_add(struct device *dev) {}

/* Removal still has to undo runtime PM even though sleep PM is compiled out. */
static inline void device_pm_remove(struct device *dev)
{
        pm_runtime_remove(dev);
}

static inline void device_pm_move_before(struct device *deva,
                                         struct device *devb) {}
static inline void device_pm_move_after(struct device *deva,
                                        struct device *devb) {}
static inline void device_pm_move_last(struct device *dev) {}

static inline void device_pm_check_callbacks(struct device *dev) {}

/* !CONFIG_PM_SLEEP variant: use the device's registration state instead. */
static inline bool device_pm_initialized(struct device *dev)
{
        return device_is_registered(dev);
}

/* !CONFIG_PM_SLEEP stub: nothing to add, always returns 0. */
static inline int pm_wakeup_source_sysfs_add(struct device *parent)
{
        return 0;
}

#endif /* !CONFIG_PM_SLEEP */

/*
 * Full PM initialisation of a device: the common fields, then the sleep
 * state, then runtime PM.
 */
static inline void device_pm_init(struct device *dev)
{
        device_pm_init_common(dev);
        device_pm_sleep_init(dev);
        pm_runtime_init(dev);
}
























































































































































































  311 






















    6 




    6 







































































    4 






































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__

#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/minmax.h>
#include <linux/nospec.h>
#include <linux/sched.h>
#include <linux/ucopysize.h>

#include <asm/uaccess.h>

/*
 * Architectures that support memory tagging (assigning tags to memory regions,
 * embedding these tags into addresses that point to these memory regions, and
 * checking that the memory and the pointer tags match on memory accesses)
 * redefine this macro to strip tags from pointers.
 *
 * Passing down the mm_struct allows untagging rules to be defined on a
 * per-process basis.
 *
 * It is defined as a no-op for architectures that don't support memory
 * tagging.
 */
#ifndef untagged_addr
#define untagged_addr(addr) (addr)
#endif

#ifndef untagged_addr_remote
/*
 * Default variant taking an mm: asserts that the mmap lock of @mm is held
 * (mmap_assert_locked()) and evaluates to untagged_addr(@addr).
 */
#define untagged_addr_remote(mm, addr)        ({                \
        mmap_assert_locked(mm);                                \
        untagged_addr(addr);                                \
})
#endif

/*
 * can_do_masked_user_access() is 1 when masked_user_access_begin is already
 * defined.  Otherwise it is 0 and fallbacks are provided so callers still
 * compile: masked_user_access_begin() evaluates to NULL and
 * mask_user_address() returns its argument unchanged.
 */
#ifdef masked_user_access_begin
 #define can_do_masked_user_access() 1
#else
 #define can_do_masked_user_access() 0
 #define masked_user_access_begin(src) NULL
 #define mask_user_address(src) (src)
#endif

/*
 * Architectures should provide two primitives (raw_copy_{to,from}_user())
 * and get rid of their private instances of copy_{to,from}_user() and
 * __copy_{to,from}_user{,_inatomic}().
 *
 * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
 * return the amount left to copy.  They should assume that access_ok() has
 * already been checked (and succeeded); they should *not* zero-pad anything.
 * No KASAN or object size checks either - those belong here.
 *
 * Both of these functions should attempt to copy size bytes starting at from
 * into the area starting at to.  They must not fetch or store anything
 * outside of those areas.  Return value must be between 0 (everything
 * copied successfully) and size (nothing copied).
 *
 * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
 * at to must become equal to the bytes fetched from the corresponding area
 * starting at from.  All data past to + size - N must be left unmodified.
 *
 * If copying succeeds, the return value must be 0.  If some data cannot be
 * fetched, it is permitted to copy less than had been fetched; the only
 * hard requirement is that not storing anything at all (i.e. returning size)
 * should happen only when nothing could be copied.  In other words, you don't
 * have to squeeze as much as possible - it is allowed, but not necessary.
 *
 * For raw_copy_from_user(), 'to' always points to kernel memory and no faults
 * on store should happen.  Interpretation of 'from' is affected by set_fs().
 * For raw_copy_to_user() it's the other way round.
 *
 * Both can be inlined - it's up to architectures whether it wants to bother
 * with that.  They should not be used directly; they are used to implement
 * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic())
 * that are used instead.  Out of those, __... ones are inlined.  Plain
 * copy_{to,from}_user() might or might not be inlined.  If you want them
 * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER.
 *
 * NOTE: only copy_from_user() zero-pads the destination in case of short copy.
 * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
 * at all; their callers absolutely must check the return value.
 *
 * Biarch ones should also provide raw_copy_in_user() - similar to the above,
 * but both source and destination are __user pointers (affected by set_fs()
 * as usual) and both source and destination can trigger faults.
 */

/*
 * Copy @n bytes from user space into @to.  No access_ok() check and no
 * might_fault() annotation are done here.  Returns what
 * raw_copy_from_user() returns, i.e. the number of bytes left uncopied.
 */
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
        unsigned long uncopied;

        instrument_copy_from_user_before(to, from, n);
        check_object_size(to, n, false);
        uncopied = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, uncopied);

        return uncopied;
}

/*
 * Copy @n bytes from user space into @to without an access_ok() check.
 * Annotated with might_fault() and subject to usercopy fault injection,
 * in which case @n is returned and nothing is copied.  Otherwise returns
 * what raw_copy_from_user() returns, i.e. the number of bytes left uncopied.
 */
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long uncopied;

        might_fault();
        instrument_copy_from_user_before(to, from, n);

        /* Injected failure: report the whole request as uncopied. */
        if (should_fail_usercopy())
                return n;

        check_object_size(to, n, false);
        uncopied = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, uncopied);

        return uncopied;
}

/**
 * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
 * @to:   Destination address, in user space.
 * @from: Source address, in kernel space.
 * @n:    Number of bytes to copy.
 *
 * Context: User context only.
 *
 * Copy data from kernel space to user space.  The caller must have checked
 * the destination block with access_ok() before calling this function, and
 * should also pin the user space address so that the copy cannot take a
 * page fault and sleep.
 *
 * Returns @n when usercopy fault injection triggers, otherwise the value
 * returned by raw_copy_to_user().
 */
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
        unsigned long uncopied;

        /* Injected failure: report the whole request as uncopied. */
        if (should_fail_usercopy())
                return n;

        instrument_copy_to_user(to, from, n);
        check_object_size(from, n, true);
        uncopied = raw_copy_to_user(to, from, n);

        return uncopied;
}

/*
 * Copy @n bytes from kernel space to user space without an access_ok()
 * check.  Annotated with might_fault().  Returns @n when usercopy fault
 * injection triggers, otherwise the value returned by raw_copy_to_user().
 */
static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
        unsigned long uncopied;

        might_fault();

        /* Injected failure: report the whole request as uncopied. */
        if (should_fail_usercopy())
                return n;

        instrument_copy_to_user(to, from, n);
        check_object_size(from, n, true);
        uncopied = raw_copy_to_user(to, from, n);

        return uncopied;
}

/*
 * Architectures that #define INLINE_COPY_TO_USER use this function
 * directly in the normal copy_to/from_user(), the other ones go
 * through an extern _copy_to/from_user(), which expands the same code
 * here.
 *
 * Rust code always uses the extern definition.
 *
 * Returns the number of bytes that were not copied; that many trailing
 * bytes of @to are zero-filled.
 */
static inline __must_check unsigned long
_inline_copy_from_user(void *to, const void __user *from, unsigned long n)
{
        /* Starts at n so that an early "goto fail" zeroes the whole buffer. */
        unsigned long res = n;
        might_fault();
        if (should_fail_usercopy())
                goto fail;
        if (can_do_masked_user_access())
                from = mask_user_address(from);
        else {
                if (!access_ok(from, n))
                        goto fail;
                /*
                 * Ensure that bad access_ok() speculation will not
                 * lead to nasty side effects *after* the copy is
                 * finished:
                 */
                barrier_nospec();
        }
        instrument_copy_from_user_before(to, from, n);
        res = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, res);
        if (likely(!res))
                return 0;
fail:
        /* Zero the res bytes at the end of @to that were not filled in. */
        memset(to + (n - res), 0, res);
        return res;
}
/* Out-of-line counterpart used when INLINE_COPY_FROM_USER is not defined. */
extern __must_check unsigned long
_copy_from_user(void *, const void __user *, unsigned long);

/*
 * Checked copy of @n bytes from kernel space to user space.  Returns @n
 * when fault injection triggers or access_ok() rejects the destination
 * range; otherwise returns the value of raw_copy_to_user().
 */
static inline __must_check unsigned long
_inline_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();

        /* Injected failure: nothing is copied. */
        if (should_fail_usercopy())
                return n;

        /* Destination range rejected: nothing is copied. */
        if (!access_ok(to, n))
                return n;

        instrument_copy_to_user(to, from, n);
        return raw_copy_to_user(to, from, n);
}
/* Out-of-line counterpart used when INLINE_COPY_TO_USER is not defined. */
extern __must_check unsigned long
_copy_to_user(void __user *, const void *, unsigned long);

/*
 * Copy @n bytes from user space into @to.  Returns @n without copying when
 * check_copy_size() rejects the request; otherwise returns the result of
 * the inline or out-of-line helper, selected by INLINE_COPY_FROM_USER.
 */
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
        if (!check_copy_size(to, n, false))
                return n;
#ifdef INLINE_COPY_FROM_USER
        return _inline_copy_from_user(to, from, n);
#else
        return _copy_from_user(to, from, n);
#endif
}

/*
 * Copy @n bytes from @from to user space.  Returns @n without copying when
 * check_copy_size() rejects the request; otherwise returns the result of
 * the inline or out-of-line helper, selected by INLINE_COPY_TO_USER.
 */
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
        if (!check_copy_size(from, n, true))
                return n;

#ifdef INLINE_COPY_TO_USER
        return _inline_copy_to_user(to, from, n);
#else
        return _copy_to_user(to, from, n);
#endif
}

#ifndef copy_mc_to_kernel
/*
 * Without arch opt-in this generic copy_mc_to_kernel() will not handle
 * #MC (or arch equivalent) during source read.
 *
 * It is a plain memcpy() that always returns 0.
 */
static inline unsigned long __must_check
copy_mc_to_kernel(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
        return 0;
}
#endif

/* Increment the current task's pagefault_disabled counter. */
static __always_inline void pagefault_disabled_inc(void)
{
        current->pagefault_disabled++;
}

/* Decrement the current task's pagefault_disabled counter. */
static __always_inline void pagefault_disabled_dec(void)
{
        current->pagefault_disabled--;
}

/*
 * These routines enable/disable the pagefault handler. If disabled, it will
 * not take any locks and go straight to the fixup table.
 *
 * User access methods will not sleep when called from a pagefault_disabled()
 * environment.
 *
 * Both routines only adjust a counter on the current task, so each
 * pagefault_disable() needs a matching pagefault_enable().
 */
static inline void pagefault_disable(void)
{
        pagefault_disabled_inc();
        /*
         * make sure to have issued the store before a pagefault
         * can hit.
         */
        barrier();
}

/* Undo one pagefault_disable(): barrier first, then drop the counter. */
static inline void pagefault_enable(void)
{
        /*
         * make sure to issue those last loads/stores before enabling
         * the pagefault handler again.
         */
        barrier();
        pagefault_disabled_dec();
}

/*
 * Is the pagefault handler disabled? If so, user access methods will not sleep.
 *
 * True whenever the current task's pagefault_disabled counter is non-zero.
 */
static inline bool pagefault_disabled(void)
{
        return current->pagefault_disabled != 0;
}

/*
 * The pagefault handler is in general disabled by pagefault_disable() or
 * when in irq context (via in_atomic()).
 *
 * This function should only be used by the fault handlers. Other users should
 * stick to pagefault_disabled().
 *
 * Please NEVER use preempt_disable() to disable the fault handler. With
 * !CONFIG_PREEMPT_COUNT, preempt_disable() is effectively a NOP, so the
 * handler would not be disabled, and in_atomic() reports different values
 * depending on CONFIG_PREEMPT_COUNT.
 */
#define faulthandler_disabled() (pagefault_disabled() || in_atomic())

/* Guard named "pagefault" pairing pagefault_disable() with pagefault_enable(). */
DEFINE_LOCK_GUARD_0(pagefault, pagefault_disable(), pagefault_enable())

#ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS

/**
 * probe_subpage_writeable: probe the user range for write faults at sub-page
 *                            granularity (e.g. arm64 MTE)
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Returns 0 on success, the number of bytes not probed on fault.
 *
 * It is expected that the caller checked for the write permission of each
 * page in the range either by put_user() or GUP. The architecture port can
 * implement a more efficient get_user() probing if the same sub-page faults
 * are triggered by either a read or a write.
 *
 * This generic version is used when CONFIG_ARCH_HAS_SUBPAGE_FAULTS is not
 * set; it probes nothing and always returns 0.
 */
static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size)
{
        return 0;
}

#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */

#ifndef ARCH_HAS_NOCACHE_UACCESS

/*
 * Fallback when the architecture does not define ARCH_HAS_NOCACHE_UACCESS:
 * simply forwards to __copy_from_user_inatomic().
 */
static inline __must_check unsigned long
__copy_from_user_inatomic_nocache(void *to, const void __user *from,
                                  unsigned long n)
{
        return __copy_from_user_inatomic(to, from, n);
}

#endif                /* ARCH_HAS_NOCACHE_UACCESS */

/*
 * Defined outside this header.  As consumed by copy_struct_from_user(): a
 * negative return is passed on as an error, 0 is treated as "non-zero bytes
 * present" and a positive value as "all zero" (confirm against the
 * definition).
 */
extern __must_check int check_zeroed_user(const void __user *from, size_t size);

/**
 * copy_struct_from_user: copy a struct from userspace
 * @dst:   Destination address, in kernel space. This buffer must be @ksize
 *         bytes long.
 * @ksize: Size of @dst struct.
 * @src:   Source address, in userspace.
 * @usize: (Alleged) size of @src struct.
 *
 * Copies a struct from userspace to kernel space, in a way that guarantees
 * backwards-compatibility for struct syscall arguments (as long as future
 * struct extensions are made such that all new fields are *appended* to the
 * old struct, and zeroed-out new fields have the same meaning as the old
 * struct).
 *
 * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
 * The recommended usage is something like the following:
 *
 *   SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
 *   {
 *      int err;
 *      struct foo karg = {};
 *
 *      if (usize > PAGE_SIZE)
 *        return -E2BIG;
 *      if (usize < FOO_SIZE_VER0)
 *        return -EINVAL;
 *
 *      err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
 *      if (err)
 *        return err;
 *
 *      // ...
 *   }
 *
 * There are three cases to consider:
 *  * If @usize == @ksize, then it's copied verbatim.
 *  * If @usize < @ksize, then the userspace has passed an old struct to a
 *    newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
 *    are to be zero-filled.
 *  * If @usize > @ksize, then the userspace has passed a new struct to an
 *    older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
 *    are checked to ensure they are zeroed, otherwise -E2BIG is returned.
 *
 * Returns (in all cases, some data may have been copied):
 *  * -E2BIG:  (@usize > @ksize) and there are non-zero trailing bytes in @src.
 *  * -EFAULT: access to userspace failed.
 */
static __always_inline __must_check int
copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
                      size_t usize)
{
        /* Bytes known to both sides, and the excess on the longer side. */
        size_t common = min(ksize, usize);
        size_t excess = max(ksize, usize) - common;

        /* Refuse a @ksize that exceeds the compiler-known size of @dst. */
        if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1)))
                return -E2BIG;

        if (usize > ksize) {
                /* Newer userspace: the tail unknown to the kernel must be zero. */
                int zeroed = check_zeroed_user(src + common, excess);

                if (zeroed < 0)
                        return zeroed;
                if (zeroed == 0)
                        return -E2BIG;
        } else if (usize < ksize) {
                /* Older userspace: zero-fill the fields it did not supply. */
                memset(dst + common, 0, excess);
        }

        /* Finally copy the part both sides agree on. */
        return copy_from_user(dst, src, common) ? -EFAULT : 0;
}

/**
 * copy_struct_to_user: copy a struct to userspace
 * @dst:   Destination address, in userspace. This buffer must be @ksize
 *         bytes long.
 * @usize: (Alleged) size of @dst struct.
 * @src:   Source address, in kernel space.
 * @ksize: Size of @src struct.
 * @ignored_trailing: Set to %true if there was a non-zero byte in @src that
 * userspace cannot see because they are using a smaller struct.
 *
 * Copies a struct from kernel space to userspace, in a way that guarantees
 * backwards-compatibility for struct syscall arguments (as long as future
 * struct extensions are made such that all new fields are *appended* to the
 * old struct, and zeroed-out new fields have the same meaning as the old
 * struct).
 *
 * Some syscalls may wish to make sure that userspace knows about everything in
 * the struct, and if there is a non-zero value that userspace doesn't know
 * about, they want to return an error (such as -EMSGSIZE) or have some other
 * fallback (such as adding a "you're missing some information" flag). If
 * @ignored_trailing is non-%NULL, it will be set to %true if there was a
 * non-zero byte that could not be copied to userspace (ie. was past @usize).
 *
 * While unconditionally returning an error in this case is the simplest
 * solution, for maximum backward compatibility you should try to only return
 * -EMSGSIZE if the user explicitly requested the data that couldn't be copied.
 * Note that structure sizes can change due to header changes and simple
 * recompilations without code changes(!), so if you care about
 * @ignored_trailing you probably want to make sure that any new field data is
 * associated with a flag. Otherwise you might assume that a program knows
 * about data it does not.
 *
 * @ksize is just sizeof(*src), and @usize should've been passed by userspace.
 * The recommended usage is something like the following:
 *
 *   SYSCALL_DEFINE2(foobar, struct foo __user *, uarg, size_t, usize)
 *   {
 *      int err;
 *      bool ignored_trailing;
 *      struct foo karg = {};
 *
 *      if (usize > PAGE_SIZE)
 *                return -E2BIG;
 *      if (usize < FOO_SIZE_VER0)
 *                return -EINVAL;
 *
 *      // ... modify karg somehow ...
 *
 *      err = copy_struct_to_user(uarg, usize, &karg, sizeof(karg),
 *                                  &ignored_trailing);
 *      if (err)
 *                return err;
 *      if (ignored_trailing)
 *                return -EMSGSIZE;
 *
 *      // ...
 *   }
 *
 * There are three cases to consider:
 *  * If @usize == @ksize, then it's copied verbatim.
 *  * If @usize < @ksize, then the kernel is trying to pass userspace a newer
 *    struct than it supports. Thus we only copy the interoperable portions
 *    (@usize) and ignore the rest (but @ignored_trailing is set to %true if
 *    any of the trailing (@ksize - @usize) bytes are non-zero).
 *  * If @usize > @ksize, then the kernel is trying to pass userspace an older
 *    struct than userspace supports. In order to make sure the
 *    unknown-to-the-kernel fields don't contain garbage values, we zero the
 *    trailing (@usize - @ksize) bytes.
 *
 * Returns (in all cases, some data may have been copied):
 *  * -EFAULT: access to userspace failed.
 */
static __always_inline __must_check int
copy_struct_to_user(void __user *dst, size_t usize, const void *src,
                    size_t ksize, bool *ignored_trailing)
{
        size_t size = min(ksize, usize);
        size_t rest = max(ksize, usize) - size;

        /* Double check if ksize is larger than a known object size. */
        if (WARN_ON_ONCE(ksize > __builtin_object_size(src, 1)))
                return -E2BIG;

        /* Deal with trailing bytes. */
        if (usize > ksize) {
                /* Userspace struct is larger: zero the part the kernel lacks. */
                if (clear_user(dst + size, rest))
                        return -EFAULT;
        }
        /*
         * Kernel data is left uncopied only when the userspace struct is the
         * smaller one (usize < ksize). In that case @rest is (ksize - usize)
         * and [src + size, src + ksize) lies inside the kernel object, so it
         * is safe to scan. For usize >= ksize nothing is ignored and @src
         * must not be read past @ksize.
         */
        if (ignored_trailing)
                *ignored_trailing = usize < ksize &&
                        memchr_inv(src + size, 0, rest) != NULL;
        /* Copy the interoperable parts of the struct. */
        if (copy_to_user(dst, src, size))
                return -EFAULT;
        return 0;
}

/*
 * "nofault" accessors. Only the prototypes are visible in this header. The
 * get_kernel_nofault() wrapper further down returns the result of
 * copy_from_kernel_nofault() and documents it as 0 on success or -EFAULT.
 * NOTE(review): the return conventions of the other helpers are not visible
 * from here; check their definitions before relying on them.
 */
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);

long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size);

long copy_from_user_nofault(void *dst, const void __user *src, size_t size);
long notrace copy_to_user_nofault(void __user *dst, const void *src,
                size_t size);

long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr,
                long count);

long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                long count);
long strnlen_user_nofault(const void __user *unsafe_addr, long count);

/*
 * Generic fallbacks, used only when __get_kernel_nofault is not already
 * defined.
 *
 * __get_kernel_nofault(dst, src, type, label) reads one object of @type from
 * @src through __get_user() and stores it to @dst; __put_kernel_nofault()
 * reads one object of @type from @src and writes it to @dst through
 * __put_user(). Both jump to @label when the accessor reports a failure.
 *
 * Every use of @dst and @src is parenthesized so that expression arguments
 * such as "buf + 1" are cast as a whole, and the locals carry a prefix so
 * they cannot capture identifiers used in the caller's arguments.
 */
#ifndef __get_kernel_nofault
#define __get_kernel_nofault(dst, src, type, label)                        \
do {                                                                        \
        type __user *__gkn_ptr = (type __force __user *)(src);                \
        type __gkn_val;                                                        \
        if (__get_user(__gkn_val, __gkn_ptr))                                \
                goto label;                                                \
        *(type *)(dst) = __gkn_val;                                        \
} while (0)

#define __put_kernel_nofault(dst, src, type, label)                        \
do {                                                                        \
        type __user *__pkn_ptr = (type __force __user *)(dst);                \
        type __pkn_val = *(type *)(src);                                \
        if (__put_user(__pkn_val, __pkn_ptr))                                \
                goto label;                                                \
} while (0)
#endif

/**
 * get_kernel_nofault(): safely attempt to read from a location
 * @val: read into this variable
 * @ptr: address to read from
 *
 * @ptr is first assigned to a local pointer of type const typeof(@val) *, so
 * the compiler checks it against the type of @val. sizeof(@val) bytes are
 * then read into @val through copy_from_kernel_nofault(), whose return value
 * is the value of the whole expression.
 *
 * Returns 0 on success, or -EFAULT.
 */
#define get_kernel_nofault(val, ptr) ({                                \
        const typeof(val) *__gk_ptr = (ptr);                        \
        copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
})

/*
 * Generic fallbacks, used only when user_access_begin is not already
 * defined: user_access_begin() reduces to access_ok(), user_access_end() is
 * a no-op, and each unsafe_*() helper runs the corresponding __-prefixed
 * accessor and jumps to the caller-supplied label on a non-zero result.
 * The save/restore pair has no state to keep here and is a no-op as well.
 */
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
#define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e)
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
#endif
/* Default the write-only and read-only variants to the generic pair. */
#ifndef user_write_access_begin
#define user_write_access_begin user_access_begin
#define user_write_access_end user_access_end
#endif
#ifndef user_read_access_begin
#define user_read_access_begin user_access_begin
#define user_read_access_end user_access_end
#endif

/*
 * Declared only for CONFIG_HARDENED_USERCOPY builds. The function is marked
 * __noreturn, so callers must not expect control to come back from it.
 */
#ifdef CONFIG_HARDENED_USERCOPY
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len);
#endif

#endif                /* __LINUX_UACCESS_H__ */












































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * pm_runtime.h - Device run-time power management helper functions.
 *
 * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>
 */

#ifndef _LINUX_PM_RUNTIME_H
#define _LINUX_PM_RUNTIME_H

#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pm.h>

#include <linux/jiffies.h>

/* Runtime PM flag argument bits */
#define RPM_ASYNC                0x01        /* Request is asynchronous */
#define RPM_NOWAIT                0x02        /* Don't wait for concurrent
                                            state change */
#define RPM_GET_PUT                0x04        /* Increment/decrement the
                                            usage_count */
#define RPM_AUTO                0x08        /* Use autosuspend_delay */
#define RPM_TRANSPARENT        0x10        /* Succeed if runtime PM is disabled */

/*
 * Use this for defining a set of PM operations to be used in all situations
 * (system suspend, hibernation or runtime PM).
 *
 * Note that the behaviour differs from the deprecated UNIVERSAL_DEV_PM_OPS()
 * macro, which uses the provided callbacks for both runtime PM and system
 * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend()
 * and pm_runtime_force_resume() for its system sleep callbacks.
 *
 * If the underlying dev_pm_ops struct symbol has to be exported, use
 * EXPORT_RUNTIME_DEV_PM_OPS() or EXPORT_GPL_RUNTIME_DEV_PM_OPS() instead.
 */
#define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
        _DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \
                           pm_runtime_force_resume, suspend_fn, \
                           resume_fn, idle_fn)

/*
 * Exported variants. In each of them the initializer that follows the
 * EXPORT_*_DEV_PM_OPS() invocation contains only RUNTIME_PM_OPS(), i.e. the
 * pm_runtime_force_suspend()/pm_runtime_force_resume() arguments passed by
 * DEFINE_RUNTIME_DEV_PM_OPS() do not appear here. The _NS_ variants take an
 * additional @ns argument that is forwarded to the export macro.
 */
#define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
        EXPORT_DEV_PM_OPS(name) = { \
                RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
        }
#define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \
        EXPORT_GPL_DEV_PM_OPS(name) = { \
                RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
        }
#define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \
        EXPORT_NS_DEV_PM_OPS(name, ns) = { \
                RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
        }
#define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \
        EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \
                RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
        }

#ifdef CONFIG_PM
extern struct workqueue_struct *pm_wq;

/**
 * queue_pm_work - Queue a work item on pm_wq.
 * @work: Work item to queue.
 *
 * Return the value returned by queue_work() for pm_wq and @work.
 */
static inline bool queue_pm_work(struct work_struct *work)
{
        bool queued = queue_work(pm_wq, work);

        return queued;
}

/*
 * Out-of-line runtime PM interface, declared for CONFIG_PM builds. The
 * !CONFIG_PM branch of this header provides static inline stubs with the
 * same names instead.
 */
extern int pm_generic_runtime_suspend(struct device *dev);
extern int pm_generic_runtime_resume(struct device *dev);
extern int pm_runtime_force_suspend(struct device *dev);

/*
 * The __pm_runtime_*() functions take a mask of the RPM_* flag bits defined
 * at the top of this header; the inline wrappers further down call them with
 * fixed flag combinations.
 */
extern int __pm_runtime_idle(struct device *dev, int rpmflags);
extern int __pm_runtime_suspend(struct device *dev, int rpmflags);
extern int __pm_runtime_resume(struct device *dev, int rpmflags);
extern int pm_runtime_get_if_active(struct device *dev);
extern int pm_runtime_get_if_in_use(struct device *dev);
extern int pm_schedule_suspend(struct device *dev, unsigned int delay);
extern int __pm_runtime_set_status(struct device *dev, unsigned int status);
extern int pm_runtime_barrier(struct device *dev);
extern bool pm_runtime_block_if_disabled(struct device *dev);
extern void pm_runtime_unblock(struct device *dev);
extern void pm_runtime_enable(struct device *dev);
extern void __pm_runtime_disable(struct device *dev, bool check_resume);
extern void pm_runtime_allow(struct device *dev);
extern void pm_runtime_forbid(struct device *dev);
extern void pm_runtime_no_callbacks(struct device *dev);
extern void pm_runtime_irq_safe(struct device *dev);
extern void __pm_runtime_use_autosuspend(struct device *dev, bool use);
extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
extern u64 pm_runtime_autosuspend_expiration(struct device *dev);
extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable);
extern void pm_runtime_get_suppliers(struct device *dev);
extern void pm_runtime_put_suppliers(struct device *dev);
extern void pm_runtime_new_link(struct device *dev);
extern void pm_runtime_drop_link(struct device_link *link);
extern void pm_runtime_release_supplier(struct device_link *link);

/* NOTE(review): presumably device-managed (devm) variants - confirm. */
int devm_pm_runtime_set_active_enabled(struct device *dev);
extern int devm_pm_runtime_enable(struct device *dev);
int devm_pm_runtime_get_noresume(struct device *dev);

/**
 * pm_suspend_ignore_children - Set runtime PM behavior regarding children.
 * @dev: Target device.
 * @enable: Whether or not to ignore possible dependencies on children.
 *
 * The dependencies of @dev on its children will not be taken into account by
 * the runtime PM framework going forward if @enable is %true, or they will
 * be taken into account otherwise.
 */
static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
{
        /* Plain flag store; nothing else is updated from this helper. */
        dev->power.ignore_children = enable;
}

/**
 * pm_runtime_get_noresume - Bump up runtime PM usage counter of a device.
 * @dev: Target device.
 */
static inline void pm_runtime_get_noresume(struct device *dev)
{
        /* Counter-only operation: no resume is requested from here. */
        atomic_inc(&dev->power.usage_count);
}

/**
 * pm_runtime_put_noidle - Drop runtime PM usage counter of a device.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev unless it is 0 already.
 */
static inline void pm_runtime_put_noidle(struct device *dev)
{
        /*
         * Add -1 unless the counter already reads 0, so this helper never
         * takes it below zero. No idle check or suspend is requested.
         */
        atomic_add_unless(&dev->power.usage_count, -1, 0);
}

/**
 * pm_runtime_suspended - Check whether or not a device is runtime-suspended.
 * @dev: Target device.
 *
 * Return %true if runtime PM is enabled for @dev and its runtime PM status is
 * %RPM_SUSPENDED, or %false otherwise.
 *
 * Note that the return value of this function can only be trusted if it is
 * called under the runtime PM lock of @dev or under conditions in which
 * runtime PM cannot be either disabled or enabled for @dev and its runtime PM
 * status cannot change.
 */
static inline bool pm_runtime_suspended(struct device *dev)
{
        /* A non-zero disable depth means runtime PM is not enabled. */
        if (dev->power.disable_depth)
                return false;

        return dev->power.runtime_status == RPM_SUSPENDED;
}

/**
 * pm_runtime_active - Check whether or not a device is runtime-active.
 * @dev: Target device.
 *
 * Return %true if runtime PM is disabled for @dev or its runtime PM status is
 * %RPM_ACTIVE, or %false otherwise.
 *
 * Note that the return value of this function can only be trusted if it is
 * called under the runtime PM lock of @dev or under conditions in which
 * runtime PM cannot be either disabled or enabled for @dev and its runtime PM
 * status cannot change.
 */
static inline bool pm_runtime_active(struct device *dev)
{
        /* With runtime PM disabled the device is reported as active. */
        if (dev->power.disable_depth)
                return true;

        return dev->power.runtime_status == RPM_ACTIVE;
}

/**
 * pm_runtime_status_suspended - Check if runtime PM status is "suspended".
 * @dev: Target device.
 *
 * Return %true if the runtime PM status of @dev is %RPM_SUSPENDED, or %false
 * otherwise, regardless of whether or not runtime PM has been enabled for @dev.
 *
 * Note that the return value of this function can only be trusted if it is
 * called under the runtime PM lock of @dev or under conditions in which the
 * runtime PM status of @dev cannot change.
 */
static inline bool pm_runtime_status_suspended(struct device *dev)
{
        /* Unlike pm_runtime_suspended(), disable_depth is not consulted. */
        return dev->power.runtime_status == RPM_SUSPENDED;
}

/**
 * pm_runtime_enabled - Check if runtime PM is enabled.
 * @dev: Target device.
 *
 * Return %true if runtime PM is enabled for @dev or %false otherwise.
 *
 * Note that the return value of this function can only be trusted if it is
 * called under the runtime PM lock of @dev or under conditions in which
 * runtime PM cannot be either disabled or enabled for @dev.
 */
static inline bool pm_runtime_enabled(struct device *dev)
{
        /* Enabled means that no disable request is outstanding. */
        return dev->power.disable_depth == 0;
}

/**
 * pm_runtime_blocked - Check if runtime PM enabling is blocked.
 * @dev: Target device.
 *
 * Do not call this function outside system suspend/resume code paths.
 */
static inline bool pm_runtime_blocked(struct device *dev)
{
        /* The blocked marker is kept in last_status, not in runtime_status. */
        return dev->power.last_status == RPM_BLOCKED;
}

/**
 * pm_runtime_has_no_callbacks - Check if a device is marked as having no
 * runtime PM callbacks.
 * @dev: Target device.
 *
 * Return %true if @dev is a special device without runtime PM callbacks or
 * %false otherwise.
 */
static inline bool pm_runtime_has_no_callbacks(struct device *dev)
{
        /* Reports the no_callbacks flag stored in the device's PM data. */
        return dev->power.no_callbacks;
}

/**
 * pm_runtime_mark_last_busy - Update the last access time of a device.
 * @dev: Target device.
 *
 * Update the last access time of @dev used by the runtime PM autosuspend
 * mechanism to the current time as returned by ktime_get_mono_fast_ns().
 */
static inline void pm_runtime_mark_last_busy(struct device *dev)
{
        /* The timestamp is stored with WRITE_ONCE(); no lock is taken here. */
        WRITE_ONCE(dev->power.last_busy, ktime_get_mono_fast_ns());
}

/**
 * pm_runtime_is_irq_safe - Check if runtime PM can work in interrupt context.
 * @dev: Target device.
 *
 * Return %true if @dev has been marked as an "IRQ-safe" device (with respect
 * to runtime PM), in which case its runtime PM callbacks can be expected to
 * work correctly when invoked from interrupt handlers.
 */
static inline bool pm_runtime_is_irq_safe(struct device *dev)
{
        /* Reports the irq_safe flag stored in the device's PM data. */
        return dev->power.irq_safe;
}

extern u64 pm_runtime_suspended_time(struct device *dev);

#else /* !CONFIG_PM */

/* !CONFIG_PM: there is no PM workqueue, so nothing is ever queued. */
static inline bool queue_pm_work(struct work_struct *work)
{
        return false;
}

/* !CONFIG_PM: the generic and forced suspend/resume helpers report success. */
static inline int pm_generic_runtime_suspend(struct device *dev)
{
        return 0;
}

static inline int pm_generic_runtime_resume(struct device *dev)
{
        return 0;
}

static inline int pm_runtime_force_suspend(struct device *dev)
{
        return 0;
}

/* !CONFIG_PM: idle and suspend requests are not implemented. */
static inline int __pm_runtime_idle(struct device *dev,
                                    int rpmflags) { return -ENOSYS; }
static inline int __pm_runtime_suspend(struct device *dev,
                                       int rpmflags) { return -ENOSYS; }
static inline int pm_schedule_suspend(struct device *dev,
                                      unsigned int delay) { return -ENOSYS; }

/* !CONFIG_PM: a resume request always returns 1. */
static inline int __pm_runtime_resume(struct device *dev,
                                      int rpmflags) { return 1; }

/* !CONFIG_PM: the conditional "get" helpers always return -EINVAL. */
static inline int pm_runtime_get_if_in_use(struct device *dev) { return -EINVAL; }
static inline int pm_runtime_get_if_active(struct device *dev) { return -EINVAL; }
/* !CONFIG_PM: status changes and barriers trivially succeed. */
static inline int __pm_runtime_set_status(struct device *dev,
                                          unsigned int status)
{
        return 0;
}

static inline int pm_runtime_barrier(struct device *dev)
{
        return 0;
}

/* !CONFIG_PM: runtime PM is always reported as blocked. */
static inline bool pm_runtime_block_if_disabled(struct device *dev)
{
        return true;
}

static inline bool pm_runtime_blocked(struct device *dev)
{
        return true;
}

/* !CONFIG_PM: the remaining state-changing calls are no-ops. */
static inline void pm_runtime_unblock(struct device *dev) { }
static inline void pm_runtime_enable(struct device *dev) { }
static inline void __pm_runtime_disable(struct device *dev, bool c) { }
static inline void pm_runtime_allow(struct device *dev) { }
static inline void pm_runtime_forbid(struct device *dev) { }

/* !CONFIG_PM: the devm_pm_runtime_*() helpers report success and do nothing. */
static inline int devm_pm_runtime_set_active_enabled(struct device *dev)
{
        return 0;
}

static inline int devm_pm_runtime_enable(struct device *dev)
{
        return 0;
}

static inline int devm_pm_runtime_get_noresume(struct device *dev)
{
        return 0;
}

/* !CONFIG_PM: usage counting and child tracking are no-ops. */
static inline void pm_suspend_ignore_children(struct device *dev,
                                              bool enable) { }
static inline void pm_runtime_get_noresume(struct device *dev) { }
static inline void pm_runtime_put_noidle(struct device *dev) { }

/*
 * !CONFIG_PM: a device is reported as active and never suspended, and
 * runtime PM itself is reported as not enabled.
 */
static inline bool pm_runtime_suspended(struct device *dev)
{
        return false;
}

static inline bool pm_runtime_active(struct device *dev)
{
        return true;
}

static inline bool pm_runtime_status_suspended(struct device *dev)
{
        return false;
}

static inline bool pm_runtime_enabled(struct device *dev)
{
        return false;
}

/* !CONFIG_PM: the flag setters are no-ops and the flag queries read false. */
static inline void pm_runtime_no_callbacks(struct device *dev) { }
static inline void pm_runtime_irq_safe(struct device *dev) { }

static inline bool pm_runtime_is_irq_safe(struct device *dev)
{
        return false;
}

static inline bool pm_runtime_has_no_callbacks(struct device *dev)
{
        return false;
}

/* !CONFIG_PM: autosuspend bookkeeping is compiled out. */
static inline void pm_runtime_mark_last_busy(struct device *dev) { }
static inline void __pm_runtime_use_autosuspend(struct device *dev,
                                                bool use) { }
static inline void pm_runtime_set_autosuspend_delay(struct device *dev,
                                                    int delay) { }
/* !CONFIG_PM: the autosuspend expiration is always reported as 0. */
static inline u64 pm_runtime_autosuspend_expiration(struct device *dev)
{
        return 0;
}
/* !CONFIG_PM: memalloc_noio, supplier and device-link handling are no-ops. */
static inline void pm_runtime_set_memalloc_noio(struct device *dev,
                                                bool enable) { }
static inline void pm_runtime_get_suppliers(struct device *dev) { }
static inline void pm_runtime_put_suppliers(struct device *dev) { }
static inline void pm_runtime_new_link(struct device *dev) { }
static inline void pm_runtime_drop_link(struct device_link *link) { }
static inline void pm_runtime_release_supplier(struct device_link *link) { }

#endif /* !CONFIG_PM */

#ifdef CONFIG_PM_SLEEP

bool pm_runtime_need_not_resume(struct device *dev);
int pm_runtime_force_resume(struct device *dev);

#else /* !CONFIG_PM_SLEEP */

/* !CONFIG_PM_SLEEP: no device ever needs a forced resume. */
static inline bool pm_runtime_need_not_resume(struct device *dev)
{
        return true;
}

/* !CONFIG_PM_SLEEP: forced resume is unavailable and reports -ENXIO. */
static inline int pm_runtime_force_resume(struct device *dev)
{
        return -ENXIO;
}

#endif /* CONFIG_PM_SLEEP */

/**
 * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it.
 * @dev: Target device.
 *
 * Invoke the "idle check" callback of @dev and, depending on its return value,
 * set up autosuspend of @dev or suspend it (depending on whether or not
 * autosuspend has been enabled for it).
 *
 * Return:
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change
 *            ongoing or device not in %RPM_ACTIVE state.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -EINPROGRESS: Suspend already in progress.
 * * -ENOSYS: CONFIG_PM not enabled.
 * Other values and conditions for the above values are possible as returned by
 * Runtime PM idle and suspend callbacks.
 */
static inline int pm_runtime_idle(struct device *dev)
{
        /* No RPM_* flags: neither RPM_ASYNC nor RPM_GET_PUT is requested. */
        const int rpmflags = 0;

        return __pm_runtime_idle(dev, rpmflags);
}

/**
 * pm_runtime_suspend - Suspend a device synchronously.
 * @dev: Target device.
 *
 * Return:
 * * 1: Success; device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change
 *            ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -ENOSYS: CONFIG_PM not enabled.
 * Other values and conditions for the above values are possible as returned by
 * Runtime PM suspend callbacks.
 */
static inline int pm_runtime_suspend(struct device *dev)
{
        /* No RPM_* flags: neither RPM_ASYNC nor RPM_AUTO is requested. */
        const int rpmflags = 0;

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_autosuspend - Update the last access time and set up autosuspend
 * of a device.
 * @dev: Target device.
 *
 * First update the last access time, then set up autosuspend of @dev or suspend
 * it (depending on whether or not autosuspend is enabled for it) without
 * engaging its "idle check" callback.
 *
 * Return:
 * * 1: Success; device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change
 *            ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -ENOSYS: CONFIG_PM not enabled.
 * Other values and conditions for the above values are possible as returned by
 * Runtime PM suspend callbacks.
 */
static inline int pm_runtime_autosuspend(struct device *dev)
{
        int ret;

        /* Refresh the last-busy timestamp before asking for autosuspend. */
        pm_runtime_mark_last_busy(dev);
        ret = __pm_runtime_suspend(dev, RPM_AUTO);

        return ret;
}

/**
 * pm_runtime_resume - Resume a device synchronously.
 * @dev: Target device.
 *
 * Return the value returned by __pm_runtime_resume() called for @dev without
 * any RPM_* flags. The !CONFIG_PM stub of that function in this header
 * always returns 1.
 */
static inline int pm_runtime_resume(struct device *dev)
{
        const int rpmflags = 0;

        return __pm_runtime_resume(dev, rpmflags);
}

/**
 * pm_request_idle - Queue up "idle check" execution for a device.
 * @dev: Target device.
 *
 * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev
 * asynchronously.
 *
 * Return:
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change
 *            ongoing or device not in %RPM_ACTIVE state.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -EINPROGRESS: Suspend already in progress.
 * * -ENOSYS: CONFIG_PM not enabled.
 */
static inline int pm_request_idle(struct device *dev)
{
        /* RPM_ASYNC alone: asynchronous request, usage counter untouched. */
        const int rpmflags = RPM_ASYNC;

        return __pm_runtime_idle(dev, rpmflags);
}

/**
 * pm_request_resume - Queue up runtime-resume of a device.
 * @dev: Target device.
 *
 * Return the value returned by __pm_runtime_resume() called for @dev with
 * %RPM_ASYNC as the only flag.
 */
static inline int pm_request_resume(struct device *dev)
{
        const int rpmflags = RPM_ASYNC;

        return __pm_runtime_resume(dev, rpmflags);
}

/**
 * pm_request_autosuspend - Update the last access time and queue up autosuspend
 * of a device.
 * @dev: Target device.
 *
 * Update the last access time of a device and queue up a work item to run an
 * equivalent of pm_runtime_autosuspend() for @dev asynchronously.
 *
 * Return:
 * * 1: Success; device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change
 *            ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -EINPROGRESS: Suspend already in progress.
 * * -ENOSYS: CONFIG_PM not enabled.
 */
static inline int pm_request_autosuspend(struct device *dev)
{
        /* Asynchronous request that uses the autosuspend delay. */
        const int rpmflags = RPM_ASYNC | RPM_AUTO;

        /* The last-busy timestamp is refreshed before the request is made. */
        pm_runtime_mark_last_busy(dev);

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_get - Bump up usage counter and queue up resume of a device.
 * @dev: Target device.
 *
 * Bump up the runtime PM usage counter of @dev and queue up a work item to
 * carry out runtime-resume of it.
 */
static inline int pm_runtime_get(struct device *dev)
{
        /* Take a usage reference and request the resume asynchronously. */
        const int rpmflags = RPM_GET_PUT | RPM_ASYNC;

        return __pm_runtime_resume(dev, rpmflags);
}

/**
 * pm_runtime_get_sync - Bump up usage counter of a device and resume it.
 * @dev: Target device.
 *
 * Bump up the runtime PM usage counter of @dev and carry out runtime-resume of
 * it synchronously.
 *
 * The possible return values of this function are the same as for
 * pm_runtime_resume() and the runtime PM usage counter of @dev remains
 * incremented in all cases, even if it returns an error code.
 * Consider using pm_runtime_resume_and_get() instead of it, especially
 * if its return value is checked by the caller, as this is likely to result
 * in cleaner code.
 */
static inline int pm_runtime_get_sync(struct device *dev)
{
        /* Take a usage reference; without RPM_ASYNC the resume is synchronous. */
        const int rpmflags = RPM_GET_PUT;

        return __pm_runtime_resume(dev, rpmflags);
}

/**
 * pm_runtime_get_active - Bump up usage counter and resume a device with flags.
 * @dev: Target device.
 * @rpmflags: Additional RPM_* flags, ORed with %RPM_GET_PUT.
 *
 * Call __pm_runtime_resume() for @dev with %RPM_GET_PUT set in addition to
 * @rpmflags. If that returns a negative error code, drop the usage counter
 * again with pm_runtime_put_noidle() and return the error. Otherwise return
 * 0, including when __pm_runtime_resume() returned a positive value.
 */
static inline int pm_runtime_get_active(struct device *dev, int rpmflags)
{
        int ret = __pm_runtime_resume(dev, RPM_GET_PUT | rpmflags);

        if (ret >= 0)
                return 0;

        /* Resume failed: give back the usage reference taken above. */
        pm_runtime_put_noidle(dev);
        return ret;
}

/**
 * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it.
 * @dev: Target device.
 *
 * Resume @dev synchronously and if that is successful, increment its runtime
 * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been
 * incremented or a negative error code otherwise.
 */
static inline int pm_runtime_resume_and_get(struct device *dev)
{
        /* No extra request flags are passed to pm_runtime_get_active(). */
        const int extra_rpmflags = 0;

        return pm_runtime_get_active(dev, extra_rpmflags);
}

/**
 * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, queue up a work item for @dev like in pm_request_idle().
 *
 * Return:
 * * 1: Success. Usage counter dropped to zero, but device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status
 *            change ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -EINPROGRESS: Suspend already in progress.
 * * -ENOSYS: CONFIG_PM not enabled.
 */
static inline int pm_runtime_put(struct device *dev)
{
        /* Drop the usage-counter reference; the idle check is asynchronous. */
        const int rpmflags = RPM_ASYNC | RPM_GET_PUT;

        return __pm_runtime_idle(dev, rpmflags);
}

/**
 * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, queue up a work item for @dev like in pm_request_autosuspend().
 *
 * Return:
 * * 1: Success. Usage counter dropped to zero, but device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status
 *            change ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -EINPROGRESS: Suspend already in progress.
 * * -ENOSYS: CONFIG_PM not enabled.
 */
static inline int __pm_runtime_put_autosuspend(struct device *dev)
{
        /* Unlike pm_runtime_put_autosuspend(), the last access time is untouched. */
        const int rpmflags = RPM_AUTO | RPM_ASYNC | RPM_GET_PUT;

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_put_autosuspend - Update the last access time of a device, drop
 * its usage counter and queue autosuspend if the usage counter becomes 0.
 * @dev: Target device.
 *
 * Update the last access time of @dev, decrement runtime PM usage counter of
 * @dev and if it turns out to be equal to 0, queue up a work item for @dev like
 * in pm_request_autosuspend().
 *
 * Return:
 * * 1: Success. Usage counter dropped to zero, but device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status
 *            change ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -EINPROGRESS: Suspend already in progress.
 * * -ENOSYS: CONFIG_PM not enabled.
 */
static inline int pm_runtime_put_autosuspend(struct device *dev)
{
        int ret;

        /* Update the last access time before dropping the reference. */
        pm_runtime_mark_last_busy(dev);
        ret = __pm_runtime_put_autosuspend(dev);

        return ret;
}

/*
 * Guard pairing pm_runtime_get_noresume() with pm_runtime_put_noidle() for
 * the guarded device (_T).
 */
DEFINE_GUARD(pm_runtime_noresume, struct device *,
             pm_runtime_get_noresume(_T), pm_runtime_put_noidle(_T));

/*
 * Guards that call pm_runtime_get_sync() on the device and differ only in the
 * counterpart call: pm_runtime_put() for "pm_runtime_active" and
 * pm_runtime_put_autosuspend() for "pm_runtime_active_auto".  The return
 * value of pm_runtime_get_sync() is not examined by these unconditional
 * variants.
 */
DEFINE_GUARD(pm_runtime_active, struct device *,
             pm_runtime_get_sync(_T), pm_runtime_put(_T));
DEFINE_GUARD(pm_runtime_active_auto, struct device *,
             pm_runtime_get_sync(_T), pm_runtime_put_autosuspend(_T));
/*
 * Use the following guards with ACQUIRE()/ACQUIRE_ERR().
 *
 * The difference between the "_try" and "_try_enabled" variants is that the
 * former do not produce an error when runtime PM is disabled for the given
 * device.
 *
 * All four conditional variants treat the acquisition as successful only when
 * the acquire call returns 0 (_RET == 0).  "_try" passes RPM_TRANSPARENT to
 * pm_runtime_get_active(); "_try_enabled" uses pm_runtime_resume_and_get().
 */
DEFINE_GUARD_COND(pm_runtime_active, _try,
                  pm_runtime_get_active(_T, RPM_TRANSPARENT), _RET == 0)
DEFINE_GUARD_COND(pm_runtime_active, _try_enabled,
                  pm_runtime_resume_and_get(_T), _RET == 0)
DEFINE_GUARD_COND(pm_runtime_active_auto, _try,
                  pm_runtime_get_active(_T, RPM_TRANSPARENT), _RET == 0)
DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled,
                  pm_runtime_resume_and_get(_T), _RET == 0)

/**
 * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, invoke the "idle check" callback of @dev and, depending on its
 * return value, set up autosuspend of @dev or suspend it (depending on whether
 * or not autosuspend has been enabled for it).
 *
 * The runtime PM usage counter of @dev remains decremented in all cases, even
 * if it returns an error code.
 *
 * Return:
 * * 1: Success. Usage counter dropped to zero, but device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status
 *            change ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -ENOSYS: CONFIG_PM not enabled.
 * Other values and conditions for the above values are possible as returned by
 * Runtime PM suspend callbacks.
 */
static inline int pm_runtime_put_sync(struct device *dev)
{
        /* No RPM_ASYNC here: the idle check is not deferred to a work item. */
        int ret = __pm_runtime_idle(dev, RPM_GET_PUT);

        return ret;
}

/**
 * pm_runtime_put_sync_suspend - Drop device usage counter and suspend if 0.
 * @dev: Target device.
 *
 * Decrement the runtime PM usage counter of @dev and if it turns out to be
 * equal to 0, carry out runtime-suspend of @dev synchronously.
 *
 * The runtime PM usage counter of @dev remains decremented in all cases, even
 * if it returns an error code.
 *
 * Return:
 * * 1: Success. Usage counter dropped to zero, but device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status
 *            change ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -ENOSYS: CONFIG_PM not enabled.
 * Other values and conditions for the above values are possible as returned by
 * Runtime PM suspend callbacks.
 */
static inline int pm_runtime_put_sync_suspend(struct device *dev)
{
        /* Neither RPM_ASYNC nor RPM_AUTO: a direct suspend request. */
        int ret = __pm_runtime_suspend(dev, RPM_GET_PUT);

        return ret;
}

/**
 * pm_runtime_put_sync_autosuspend - Update the last access time of a device,
 * drop device usage counter and autosuspend if 0.
 * @dev: Target device.
 *
 * Update the last access time of @dev, decrement the runtime PM usage counter
 * of @dev and if it turns out to be equal to 0, set up autosuspend of @dev or
 * suspend it synchronously (depending on whether or not autosuspend has been
 * enabled for it).
 *
 * The runtime PM usage counter of @dev remains decremented in all cases, even
 * if it returns an error code.
 *
 * Return:
 * * 1: Success. Usage counter dropped to zero, but device was already suspended.
 * * 0: Success.
 * * -EINVAL: Runtime PM error.
 * * -EACCES: Runtime PM disabled.
 * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status
 *            change ongoing.
 * * -EBUSY: Runtime PM child_count non-zero.
 * * -EPERM: Device PM QoS resume latency 0.
 * * -EINPROGRESS: Suspend already in progress.
 * * -ENOSYS: CONFIG_PM not enabled.
 * Other values and conditions for the above values are possible as returned by
 * Runtime PM suspend callbacks.
 */
static inline int pm_runtime_put_sync_autosuspend(struct device *dev)
{
        const int rpmflags = RPM_AUTO | RPM_GET_PUT;

        /* Update the last access time before dropping the reference. */
        pm_runtime_mark_last_busy(dev);

        return __pm_runtime_suspend(dev, rpmflags);
}

/**
 * pm_runtime_set_active - Set runtime PM status to "active".
 * @dev: Target device.
 *
 * Set the runtime PM status of @dev to %RPM_ACTIVE and ensure that dependencies
 * of it will be taken into account.
 *
 * It is not valid to call this function for devices with runtime PM enabled.
 */
static inline int pm_runtime_set_active(struct device *dev)
{
        return __pm_runtime_set_status(dev, RPM_ACTIVE);
}

/**
 * pm_runtime_set_suspended - Set runtime PM status to "suspended".
 * @dev: Target device.
 *
 * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that
 * dependencies of it will be taken into account.
 *
 * It is not valid to call this function for devices with runtime PM enabled.
 */
static inline int pm_runtime_set_suspended(struct device *dev)
{
        return __pm_runtime_set_status(dev, RPM_SUSPENDED);
}

/**
 * pm_runtime_disable - Disable runtime PM for a device.
 * @dev: Target device.
 *
 * Prevent the runtime PM framework from working with @dev by incrementing its
 * "disable" counter.
 *
 * If the counter is zero when this function runs and there is a pending runtime
 * resume request for @dev, it will be resumed.  If the counter is still zero at
 * that point, all of the pending runtime PM requests for @dev will be canceled
 * and all runtime PM operations in progress involving it will be waited for to
 * complete.
 *
 * For each invocation of this function for @dev, there must be a matching
 * pm_runtime_enable() call, so that runtime PM is eventually enabled for it
 * again.
 */
static inline void pm_runtime_disable(struct device *dev)
{
        /* This wrapper always passes "true" as the second argument. */
        const bool second_arg = true;

        __pm_runtime_disable(dev, second_arg);
}

/**
 * pm_runtime_use_autosuspend - Allow autosuspend to be used for a device.
 * @dev: Target device.
 *
 * Allow the runtime PM autosuspend mechanism to be used for @dev whenever
 * requested (or "autosuspend" will be handled as direct runtime-suspend for
 * it).
 *
 * NOTE: It's important to undo this with pm_runtime_dont_use_autosuspend()
 * at driver exit time unless your driver initially enabled pm_runtime
 * with devm_pm_runtime_enable() (which handles it for you).
 */
static inline void pm_runtime_use_autosuspend(struct device *dev)
{
        /* "true" turns autosuspend on; see pm_runtime_dont_use_autosuspend(). */
        const bool use = true;

        __pm_runtime_use_autosuspend(dev, use);
}

/**
 * pm_runtime_dont_use_autosuspend - Prevent autosuspend from being used.
 * @dev: Target device.
 *
 * Prevent the runtime PM autosuspend mechanism from being used for @dev which
 * means that "autosuspend" will be handled as direct runtime-suspend for it
 * going forward.
 */
static inline void pm_runtime_dont_use_autosuspend(struct device *dev)
{
        /* "false" turns autosuspend off; see pm_runtime_use_autosuspend(). */
        const bool use = false;

        __pm_runtime_use_autosuspend(dev, use);
}

#endif












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
/* +++ deflate.c */
/* deflate.c -- compress data using the deflation algorithm
 * Copyright (C) 1995-1996 Jean-loup Gailly.
 * For conditions of distribution and use, see copyright notice in zlib.h 
 */

/*
 *  ALGORITHM
 *
 *      The "deflation" process depends on being able to identify portions
 *      of the input text which are identical to earlier input (within a
 *      sliding window trailing behind the input currently being processed).
 *
 *      The most straightforward technique turns out to be the fastest for
 *      most input files: try all possible matches and select the longest.
 *      The key feature of this algorithm is that insertions into the string
 *      dictionary are very simple and thus fast, and deletions are avoided
 *      completely. Insertions are performed at each input character, whereas
 *      string matches are performed only when the previous match ends. So it
 *      is preferable to spend more time in matches to allow very fast string
 *      insertions and avoid deletions. The matching algorithm for small
 *      strings is inspired from that of Rabin & Karp. A brute force approach
 *      is used to find longer strings when a small match has been found.
 *      A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
 *      (by Leonid Broukhis).
 *         A previous version of this file used a more sophisticated algorithm
 *      (by Fiala and Greene) which is guaranteed to run in linear amortized
 *      time, but has a larger average cost, uses more memory and is patented.
 *      However the F&G algorithm may be faster for some highly redundant
 *      files if the parameter max_chain_length (described below) is too large.
 *
 *  ACKNOWLEDGEMENTS
 *
 *      The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
 *      I found it in 'freeze' written by Leonid Broukhis.
 *      Thanks to many people for bug reports and testing.
 *
 *  REFERENCES
 *
 *      Deutsch, L.P.,"DEFLATE Compressed Data Format Specification".
 *      Available in ftp://ds.internic.net/rfc/rfc1951.txt
 *
 *      A description of the Rabin and Karp algorithm is given in the book
 *         "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
 *
 *      Fiala,E.R., and Greene,D.H.
 *         Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-505
 *
 */

#include <linux/module.h>
#include <linux/zutil.h>
#include "defutil.h"

/* architecture-specific bits */
#ifdef CONFIG_ZLIB_DFLTCC
#  include "../zlib_dfltcc/dfltcc_deflate.h"
#else
/*
 * Generic fallbacks used when CONFIG_ZLIB_DFLTCC is not set:
 *  - DEFLATE_RESET_HOOK: invoked at the end of zlib_deflateReset(); no-op.
 *  - DEFLATE_HOOK: evaluated by zlib_deflate() before a compression function
 *    is chosen; 0 means "not handled", so the function from
 *    configuration_table[] is called.
 *  - DEFLATE_NEED_CHECKSUM: consulted by read_buf(); 1 means the checksum
 *    update is never skipped on behalf of the hook.
 *  - DEFLATE_DFLTCC_ENABLED: constant 0.
 */
#define DEFLATE_RESET_HOOK(strm) do {} while (0)
#define DEFLATE_HOOK(strm, flush, bstate) 0
#define DEFLATE_NEED_CHECKSUM(strm) 1
#define DEFLATE_DFLTCC_ENABLED() 0
#endif

/* ===========================================================================
 *  Function prototypes.
 */

typedef block_state (*compress_func) (deflate_state *s, int flush);
/* Compression function. Returns the block state after the call.
 * This is the type of the func member of the configuration table entries;
 * zlib_deflate() calls the entry selected by s->level.
 */

/* Forward declarations of the file-local helpers. */
static void fill_window    (deflate_state *s);
static block_state deflate_stored (deflate_state *s, int flush);
static block_state deflate_fast   (deflate_state *s, int flush);
static block_state deflate_slow   (deflate_state *s, int flush);
static void lm_init        (deflate_state *s);
static void putShortMSB    (deflate_state *s, uInt b);
static int read_buf        (z_streamp strm, Byte *buf, unsigned size);
static uInt longest_match  (deflate_state *s, IPos cur_match);

#ifdef DEBUG_ZLIB
/* Only declared for DEBUG_ZLIB builds. */
static  void check_match (deflate_state *s, IPos start, IPos match,
                         int length);
#endif

/* ===========================================================================
 * Local data
 */

#define NIL 0
/* Tail of hash chains. CLEAR_HASH() also leaves every head[] slot equal to
 * this value.
 */

#ifndef TOO_FAR
#  define TOO_FAR 4096
#endif
/* Matches of length 3 are discarded if their distance exceeds TOO_FAR.
 * The #ifndef lets an earlier definition take precedence over this default.
 */

#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
/* Minimum amount of lookahead, except at the end of the input file.
 * See deflate.c for comments about the MIN_MATCH+1.
 * NOTE(review): this comment lives in deflate.c itself, so the cross-reference
 * above is circular -- confirm where the MIN_MATCH+1 rationale is documented.
 */

/* Workspace to be allocated for deflate processing.
 * zlib_deflateInit2() casts strm->workspace to this type and points the four
 * *_memory members at consecutive areas that start right after the struct
 * (window, prev, head, overlay, in that order).
 */
typedef struct deflate_workspace {
    /* State memory for the deflator */
    deflate_state deflate_memory;
#ifdef CONFIG_ZLIB_DFLTCC
    /* State memory for s390 hardware deflate */
    struct dfltcc_deflate_state dfltcc_memory;
#endif
    Byte *window_memory;   /* becomes s->window */
    Pos *prev_memory;      /* becomes s->prev */
    Pos *head_memory;      /* becomes s->head */
    char *overlay_memory;  /* becomes s->pending_buf; d_buf/l_buf point into it */
} deflate_workspace;

#ifdef CONFIG_ZLIB_DFLTCC
/* dfltcc_state must be doubleword aligned for DFLTCC call */
static_assert(offsetof(struct deflate_workspace, dfltcc_memory) % 8 == 0);
#endif

/* Values for max_lazy_match, good_match and max_chain_length, depending on
 * the desired pack level (0..9). The values given below have been tuned to
 * exclude worst case performance for pathological files. Better values may be
 * found for specific files.
 */
/* One row of tuning parameters per compression level; lm_init() copies the
 * row selected by s->level into the deflate state.
 */
typedef struct config_s {
   ush good_length; /* reduce lazy search above this match length */
   ush max_lazy;    /* do not perform lazy search above this match length */
   ush nice_length; /* quit search above this match length */
   ush max_chain;   /* copied to s->max_chain_length by lm_init() */
   compress_func func; /* compression function zlib_deflate() calls for this level */
} config;

/* Indexed by compression level 0..9 (zlib_deflateInit2() rejects other
 * values after mapping Z_DEFAULT_COMPRESSION to 6).
 */
static const config configuration_table[10] = {
/*      good lazy nice chain */
/* 0 */ {0,    0,  0,    0, deflate_stored},  /* store only */
/* 1 */ {4,    4,  8,    4, deflate_fast}, /* maximum speed, no lazy matches */
/* 2 */ {4,    5, 16,    8, deflate_fast},
/* 3 */ {4,    6, 32,   32, deflate_fast},

/* 4 */ {4,    4, 16,   16, deflate_slow},  /* lazy matches */
/* 5 */ {8,   16, 32,   32, deflate_slow},
/* 6 */ {8,   16, 128, 128, deflate_slow},
/* 7 */ {8,   32, 128, 256, deflate_slow},
/* 8 */ {32, 128, 258, 1024, deflate_slow},
/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* maximum compression */

/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
 * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
 * meaning.
 */

/* ===========================================================================
 * Update a hash value with the given input byte
 * IN  assertion: all calls to UPDATE_HASH are made with consecutive
 *    input characters, so that a running hash key can be computed from the
 *    previous key instead of complete recalculation each time.
 * The new value is assigned to h, so h must be a modifiable lvalue (it is
 * evaluated twice). s is parenthesized so that any expression yielding a
 * pointer with hash_shift and hash_mask members can be passed.
 */
#define UPDATE_HASH(s,h,c) \
    (h = (((h)<<(s)->hash_shift) ^ (c)) & (s)->hash_mask)


/* ===========================================================================
 * Insert string str in the dictionary and set match_head to the previous head
 * of the hash chain (the most recent string with same hash key). The old head
 * is also stored in prev[str & w_mask], and head[ins_h] becomes str. The value
 * of the macro expression itself is (Pos)(str); the macro does not compute the
 * length of the hash chain.
 * IN  assertion: all calls to INSERT_STRING are made with consecutive
 *    input characters and the first MIN_MATCH bytes of str are valid
 *    (except for the last MIN_MATCH-1 bytes of the input file).
 */
#define INSERT_STRING(s, str, match_head) \
   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
    s->prev[(str) & s->w_mask] = match_head = s->head[s->ins_h], \
    s->head[s->ins_h] = (Pos)(str))

/* ===========================================================================
 * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
 * prev[] will be initialized on the fly.
 *
 * The last head[] entry is set to NIL explicitly and the remaining
 * hash_size-1 entries are zeroed with memset().  The body is wrapped in
 * do { } while (0) so that "CLEAR_HASH(s);" is a single statement and stays
 * correct inside an unbraced if/else.
 */
#define CLEAR_HASH(s) \
    do { \
        (s)->head[(s)->hash_size-1] = NIL; \
        memset((char *)(s)->head, 0, \
               (unsigned)((s)->hash_size-1)*sizeof(*(s)->head)); \
    } while (0)

/* ========================================================================= */
/*
 * Initialize a deflate stream on top of the caller-supplied workspace.
 *
 * strm->workspace must point to a buffer that starts with a
 * deflate_workspace struct and is followed by the window, prev, head and
 * overlay areas, in that order (NOTE(review): presumably sized by the
 * matching workspace-size helper -- confirm against the callers).
 *
 * strm:       stream to initialize; strm->workspace must be set
 * level:      0..9, or Z_DEFAULT_COMPRESSION (mapped to 6)
 * method:     must be Z_DEFLATED
 * windowBits: 9..15; a negative value suppresses the zlib header and its
 *             absolute value is used for the window size
 * memLevel:   1..MAX_MEM_LEVEL
 * strategy:   0..Z_HUFFMAN_ONLY
 *
 * Returns Z_STREAM_ERROR if strm or strm->workspace is NULL or if a
 * parameter is out of range; otherwise the result of zlib_deflateReset().
 */
int zlib_deflateInit2(
        z_streamp strm,
        int  level,
        int  method,
        int  windowBits,
        int  memLevel,
        int  strategy
)
{
    deflate_state *s;
    int noheader = 0;
    deflate_workspace *mem;
    char *next;

    ush *overlay;
    /* We overlay pending_buf and d_buf+l_buf. This works since the average
     * output size for (length,distance) codes is <= 24 bits.
     */

    if (strm == NULL) return Z_STREAM_ERROR;

    strm->msg = NULL;

    if (level == Z_DEFAULT_COMPRESSION) level = 6;

    mem = (deflate_workspace *) strm->workspace;
    /* All state lives in the workspace; without one nothing below can be
     * set up, so fail like for any other invalid argument instead of
     * dereferencing a NULL pointer.
     */
    if (mem == NULL) return Z_STREAM_ERROR;

    if (windowBits < 0) { /* undocumented feature: suppress zlib header */
        noheader = 1;
        windowBits = -windowBits;
    }
    if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED ||
        windowBits < 9 || windowBits > 15 || level < 0 || level > 9 ||
        strategy < 0 || strategy > Z_HUFFMAN_ONLY) {
        return Z_STREAM_ERROR;
    }

    /*
     * Direct the workspace's pointers to the chunks that were allocated
     * along with the deflate_workspace struct.
     */
    next = (char *) mem;
    next += sizeof(*mem);
#ifdef CONFIG_ZLIB_DFLTCC
    /*
     *  DFLTCC requires the window to be page aligned.
     *  Thus, we overallocate and take the aligned portion of the buffer.
     */
    mem->window_memory = (Byte *) PTR_ALIGN(next, PAGE_SIZE);
#else
    mem->window_memory = (Byte *) next;
#endif
    next += zlib_deflate_window_memsize(windowBits);
    mem->prev_memory = (Pos *) next;
    next += zlib_deflate_prev_memsize(windowBits);
    mem->head_memory = (Pos *) next;
    next += zlib_deflate_head_memsize(memLevel);
    mem->overlay_memory = next;

    s = (deflate_state *) &(mem->deflate_memory);
    strm->state = (struct internal_state *)s;
    s->strm = strm;

    /* Window geometry: w_size is 2^w_bits, w_mask selects an offset in it. */
    s->noheader = noheader;
    s->w_bits = windowBits;
    s->w_size = 1 << s->w_bits;
    s->w_mask = s->w_size - 1;

    /* Hash table geometry is derived from memLevel only. */
    s->hash_bits = memLevel + 7;
    s->hash_size = 1 << s->hash_bits;
    s->hash_mask = s->hash_size - 1;
    s->hash_shift =  ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);

    s->window = (Byte *) mem->window_memory;
    s->prev   = (Pos *)  mem->prev_memory;
    s->head   = (Pos *)  mem->head_memory;

    s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */

    /* pending_buf, d_buf and l_buf all point into the one overlay area. */
    overlay = (ush *) mem->overlay_memory;
    s->pending_buf = (uch *) overlay;
    s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L);

    s->d_buf = overlay + s->lit_bufsize/sizeof(ush);
    s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize;

    s->level = level;
    s->strategy = strategy;
    s->method = (Byte)method;

    return zlib_deflateReset(strm);
}

/* ========================================================================= */
/*
 * Put an initialized stream back into its start-of-stream condition:
 * counters and pending output are cleared, the status is INIT_STATE (or
 * BUSY_STATE when noheader is set), strm->adler is 1, and zlib_tr_init(),
 * lm_init() and DEFLATE_RESET_HOOK() are run, in that order.
 *
 * Returns Z_STREAM_ERROR if strm or strm->state is NULL, Z_OK otherwise.
 */
int zlib_deflateReset(z_streamp strm)
{
    deflate_state *state;

    if (!strm || !strm->state)
        return Z_STREAM_ERROR;

    state = (deflate_state *)strm->state;

    /* Stream-level bookkeeping. */
    strm->total_out = 0;
    strm->total_in = 0;
    strm->msg = NULL;
    strm->data_type = Z_UNKNOWN;

    /* Nothing is pending; output restarts at the beginning of pending_buf. */
    state->pending = 0;
    state->pending_out = state->pending_buf;

    /* A negative noheader was set to -1 by deflate(..., Z_FINISH). */
    if (state->noheader < 0)
        state->noheader = 0;

    if (state->noheader)
        state->status = BUSY_STATE;
    else
        state->status = INIT_STATE;

    strm->adler = 1;
    state->last_flush = Z_NO_FLUSH;

    zlib_tr_init(state);
    lm_init(state);

    DEFLATE_RESET_HOOK(strm);

    return Z_OK;
}

/* =========================================================================
 * Append the low 16 bits of b to the pending buffer, most significant byte
 * first.
 * IN assertion: the stream state is correct and there is enough room in
 * pending_buf.
 */
static void putShortMSB(deflate_state *s, uInt b)
{
    const Byte high = (Byte)(b >> 8);
    const Byte low = (Byte)(b & 0xff);

    put_byte(s, high);
    put_byte(s, low);
}

/* ========================================================================= */
/*
 * Compress as much input as possible and/or flush pending output.
 *
 * strm:  stream with a valid state (see zlib_deflateInit2())
 * flush: flush mode, 0..Z_FINISH
 *
 * Returns:
 *   Z_STREAM_ERROR - strm or strm->state is NULL, flush is out of range,
 *                    next_in is NULL while avail_in != 0, or the stream is in
 *                    FINISH_STATE and flush != Z_FINISH
 *   Z_BUF_ERROR    - avail_out is 0 on entry; or there is no pending output,
 *                    no input, and flush is neither Z_FINISH nor greater than
 *                    the previous flush value; or input is supplied after the
 *                    stream reached FINISH_STATE
 *   Z_STREAM_END   - flush == Z_FINISH and no pending output is left
 *   Z_OK           - otherwise
 */
int zlib_deflate(
        z_streamp strm,
        int flush
)
{
    int old_flush; /* value of flush param for previous deflate call */
    deflate_state *s;

    if (strm == NULL || strm->state == NULL ||
        flush > Z_FINISH || flush < 0) {
        return Z_STREAM_ERROR;
    }
    s = (deflate_state *) strm->state;

    if ((strm->next_in == NULL && strm->avail_in != 0) ||
        (s->status == FINISH_STATE && flush != Z_FINISH)) {
        return Z_STREAM_ERROR;
    }
    if (strm->avail_out == 0) return Z_BUF_ERROR;

    s->strm = strm; /* just in case */
    old_flush = s->last_flush;
    s->last_flush = flush;

    /* Write the zlib header */
    if (s->status == INIT_STATE) {

        uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
        /* NOTE(review): assuming s->level is a signed int, level 0 gives a
         * negative value here which becomes a huge uInt and is clamped to 3
         * below -- confirm that this is the intended header hint.
         */
        uInt level_flags = (s->level-1) >> 1;

        if (level_flags > 3) level_flags = 3;
        header |= (level_flags << 6);
        if (s->strstart != 0) header |= PRESET_DICT;
        /* Round up so that the 16-bit header is a multiple of 31. */
        header += 31 - (header % 31);

        s->status = BUSY_STATE;
        putShortMSB(s, header);

        /* Save the adler32 of the preset dictionary: */
        if (s->strstart != 0) {
            putShortMSB(s, (uInt)(strm->adler >> 16));
            putShortMSB(s, (uInt)(strm->adler & 0xffff));
        }
        strm->adler = 1L;
    }

    /* Flush as much pending output as possible */
    if (s->pending != 0) {
        flush_pending(strm);
        if (strm->avail_out == 0) {
            /* Since avail_out is 0, deflate will be called again with
             * more output space, but possibly with both pending and
             * avail_in equal to zero. There won't be anything to do,
             * but this is not an error situation so make sure we
             * return OK instead of BUF_ERROR at next call of deflate:
             */
            s->last_flush = -1;
            return Z_OK;
        }

    /* Make sure there is something to do and avoid duplicate consecutive
     * flushes. For repeated and useless calls with Z_FINISH, we keep
     * returning Z_STREAM_END instead of Z_BUF_ERROR.
     */
    } else if (strm->avail_in == 0 && flush <= old_flush &&
               flush != Z_FINISH) {
        return Z_BUF_ERROR;
    }

    /* User must not provide more input after the first FINISH: */
    if (s->status == FINISH_STATE && strm->avail_in != 0) {
        return Z_BUF_ERROR;
    }

    /* Start a new block or continue the current one.
     */
    if (strm->avail_in != 0 || s->lookahead != 0 ||
        (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
        block_state bstate;

        /* DEFLATE_HOOK() may handle the call itself and store the result in
         * bstate; otherwise the function configured for s->level is used.
         */
        bstate = DEFLATE_HOOK(strm, flush, &bstate) ? bstate :
                 (*(configuration_table[s->level].func))(s, flush);

        if (bstate == finish_started || bstate == finish_done) {
            s->status = FINISH_STATE;
        }
        if (bstate == need_more || bstate == finish_started) {
            if (strm->avail_out == 0) {
                s->last_flush = -1; /* avoid BUF_ERROR next call, see above */
            }
            return Z_OK;
            /* If flush != Z_NO_FLUSH && avail_out == 0, the next call
             * of deflate should use the same flush parameter to make sure
             * that the flush is complete. So we don't have to output an
             * empty block here, this will be done at next call. This also
             * ensures that for a very small output buffer, we emit at most
             * one empty block.
             */
        }
        if (bstate == block_done) {
            if (flush == Z_PARTIAL_FLUSH) {
                zlib_tr_align(s);
            } else if (flush == Z_PACKET_FLUSH) {
                /* Output just the 3-bit `stored' block type value,
                   but not a zero length. */
                zlib_tr_stored_type_only(s);
            } else { /* FULL_FLUSH or SYNC_FLUSH */
                zlib_tr_stored_block(s, (char*)0, 0L, 0);
                /* For a full flush, this empty block will be recognized
                 * as a special marker by inflate_sync().
                 */
                if (flush == Z_FULL_FLUSH) {
                    CLEAR_HASH(s);             /* forget history */
                }
            }
            flush_pending(strm);
            if (strm->avail_out == 0) {
              s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */
              return Z_OK;
            }
        }
    }
    Assert(strm->avail_out > 0, "bug2");

    /* Everything below only applies to finishing the stream. */
    if (flush != Z_FINISH) return Z_OK;

    if (!s->noheader) {
        /* Write zlib trailer (adler32) */
        putShortMSB(s, (uInt)(strm->adler >> 16));
        putShortMSB(s, (uInt)(strm->adler & 0xffff));
    }
    flush_pending(strm);
    /* If avail_out is zero, the application will call deflate again
     * to flush the rest.
     */
    if (!s->noheader) {
        s->noheader = -1; /* write the trailer only once! */
    }
    if (s->pending == 0) {
        Assert(s->bi_valid == 0, "bi_buf not flushed");
        return Z_STREAM_END;
    }
    return Z_OK;
}

/* =========================================================================
 * Detach the deflate state from a stream.
 * Returns Z_STREAM_ERROR when strm or strm->state is NULL, or when the
 * state's status is none of INIT_STATE, BUSY_STATE or FINISH_STATE.
 * Otherwise strm->state is cleared and the result is Z_DATA_ERROR if the
 * status was BUSY_STATE, Z_OK if not.
 */
int zlib_deflateEnd(
        z_streamp strm
)
{
    deflate_state *state;
    int final_status;

    if (strm == NULL)
        return Z_STREAM_ERROR;
    if (strm->state == NULL)
        return Z_STREAM_ERROR;

    state = (deflate_state *) strm->state;
    final_status = state->status;

    if (!(final_status == INIT_STATE ||
          final_status == BUSY_STATE ||
          final_status == FINISH_STATE))
        return Z_STREAM_ERROR;

    /* Only the pointer is dropped here; this routine frees nothing. */
    strm->state = NULL;

    if (final_status == BUSY_STATE)
        return Z_DATA_ERROR;
    return Z_OK;
}

/* ===========================================================================
 * Copy up to `size' bytes from the current input stream into buf, update
 * the adler32 (only when DEFLATE_NEED_CHECKSUM() says so and the state's
 * noheader flag is clear) and advance next_in / avail_in / total_in.
 * Returns the number of bytes copied; 0 when nothing can be copied.
 * All deflate() input goes through this function so some applications may
 * wish to modify it to avoid allocating a large strm->next_in buffer and
 * copying from it.
 * (See also flush_pending()).
 */
static int read_buf(
        z_streamp strm,
        Byte *buf,
        unsigned size
)
{
    unsigned count = strm->avail_in;
    int skip_checksum;

    if (count > size)
        count = size;
    if (count == 0)
        return 0;

    strm->avail_in -= count;

    /* The checksum is taken over the source bytes, before they are copied. */
    skip_checksum = !DEFLATE_NEED_CHECKSUM(strm);
    if (!skip_checksum && !((deflate_state *)(strm->state))->noheader)
        strm->adler = zlib_adler32(strm->adler, strm->next_in, count);

    memcpy(buf, strm->next_in, count);
    strm->total_in += count;
    strm->next_in  += count;

    return (int)count;
}

/* ===========================================================================
 * Initialize the "longest match" routines for a new zlib stream: window
 * size, hash table, the tuning parameters taken from configuration_table
 * for the current level, and the string/match cursors.
 */
static void lm_init(
        deflate_state *s
)
{
    /* The window spans twice w_size. */
    s->window_size = (ulg)2L*s->w_size;

    CLEAR_HASH(s);

    /* Tuning parameters come from the table row selected by s->level: */
    s->good_match       = configuration_table[s->level].good_length;
    s->max_lazy_match   = configuration_table[s->level].max_lazy;
    s->nice_match       = configuration_table[s->level].nice_length;
    s->max_chain_length = configuration_table[s->level].max_chain;

    /* Nothing has been read or matched yet: */
    s->ins_h = 0;
    s->match_available = 0;
    s->prev_length = MIN_MATCH-1;
    s->match_length = MIN_MATCH-1;
    s->lookahead = 0;
    s->block_start = 0L;
    s->strstart = 0;
}

/* ===========================================================================
 * Set match_start to the longest match starting at the given string and
 * return its length. Matches shorter or equal to prev_length are discarded,
 * in which case the result is equal to prev_length and match_start is
 * garbage.
 *
 * s:         deflate state. Reads (among others) window, prev, w_mask,
 *            strstart, lookahead, prev_length, good_match, nice_match and
 *            max_chain_length; writes match_start whenever a longer match
 *            is found.
 * cur_match: window index of the first candidate on the hash chain.
 * Returns the best length found, clamped to s->lookahead.
 *
 * IN assertions: cur_match is the head of the hash chain for the current
 *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
 * OUT assertion: the match length is not greater than s->lookahead.
 */
/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
 * match.S. The code will be functionally equivalent.
 */
static uInt longest_match(
        deflate_state *s,
        IPos cur_match                        /* current match */
)
{
    unsigned chain_length = s->max_chain_length;/* max hash chain length */
    register Byte *scan = s->window + s->strstart; /* current string */
    register Byte *match;                       /* matched string */
    register int len;                           /* length of current match */
    int best_len = s->prev_length;              /* best match length so far */
    int nice_match = s->nice_match;             /* stop if match long enough */
    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
        s->strstart - (IPos)MAX_DIST(s) : NIL;
    /* Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    Pos *prev = s->prev;
    uInt wmask = s->w_mask;

    /* scan_end (and scan_end1) cache the bytes of the current string at the
     * position a candidate must equal to possibly beat best_len; they are
     * refreshed each time best_len grows.
     */
#ifdef UNALIGNED_OK
    /* Compare two bytes at a time. Note: this is not always beneficial.
     * Try with and without -DUNALIGNED_OK to check.
     */
    register Byte *strend = s->window + s->strstart + MAX_MATCH - 1;
    register ush scan_start = *(ush*)scan;
    register ush scan_end   = *(ush*)(scan+best_len-1);
#else
    register Byte *strend = s->window + s->strstart + MAX_MATCH;
    register Byte scan_end1  = scan[best_len-1];
    register Byte scan_end   = scan[best_len];
#endif

    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
     * It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");

    /* Do not waste too much time if we already have a good match: */
    if (s->prev_length >= s->good_match) {
        chain_length >>= 2;
    }
    /* Do not look for matches beyond the end of the input. This is necessary
     * to make deflate deterministic.
     */
    if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;

    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");

    /* Walk the hash chain, one candidate position per iteration. */
    do {
        Assert(cur_match < s->strstart, "no future");
        match = s->window + cur_match;

        /* Skip to next match if the match length cannot increase
         * or if the match length is less than 2:
         */
#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
        /* This code assumes sizeof(unsigned short) == 2. Do not use
         * UNALIGNED_OK if your compiler uses a different size.
         */
        if (*(ush*)(match+best_len-1) != scan_end ||
            *(ush*)match != scan_start) continue;

        /* It is not necessary to compare scan[2] and match[2] since they are
         * always equal when the other bytes match, given that the hash keys
         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
         * strstart+3, +5, ... up to strstart+257. We check for insufficient
         * lookahead only every 4th comparison; the 128th check will be made
         * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
         * necessary to put more guard bytes at the end of the window, or
         * to check more often for insufficient lookahead.
         */
        Assert(scan[2] == match[2], "scan[2]?");
        scan++, match++;
        do {
        } while (*(ush*)(scan+=2) == *(ush*)(match+=2) &&
                 *(ush*)(scan+=2) == *(ush*)(match+=2) &&
                 *(ush*)(scan+=2) == *(ush*)(match+=2) &&
                 *(ush*)(scan+=2) == *(ush*)(match+=2) &&
                 scan < strend);
        /* The funny "do {}" generates better code on most compilers */

        /* Here, scan <= window+strstart+257 */
        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
        if (*scan == *match) scan++;

        len = (MAX_MATCH - 1) - (int)(strend-scan);
        /* Rewind scan to window+strstart for the next candidate. */
        scan = strend - (MAX_MATCH-1);

#else /* UNALIGNED_OK */

        if (match[best_len]   != scan_end  ||
            match[best_len-1] != scan_end1 ||
            *match            != *scan     ||
            *++match          != scan[1])      continue;

        /* The check at best_len-1 can be removed because it will be made
         * again later. (This heuristic is not always a win.)
         * It is not necessary to compare scan[2] and match[2] since they
         * are always equal when the other bytes match, given that
         * the hash keys are equal and that HASH_BITS >= 8.
         */
        scan += 2, match++;
        Assert(*scan == *match, "match[2]?");

        /* We check for insufficient lookahead only every 8th comparison;
         * the 256th check will be made at strstart+258.
         */
        do {
        } while (*++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 scan < strend);

        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");

        len = MAX_MATCH - (int)(strend - scan);
        /* Rewind scan to window+strstart for the next candidate. */
        scan = strend - MAX_MATCH;

#endif /* UNALIGNED_OK */

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match) break;
#ifdef UNALIGNED_OK
            scan_end = *(ush*)(scan+best_len-1);
#else
            scan_end1  = scan[best_len-1];
            scan_end   = scan[best_len];
#endif
        }
    /* Advance along prev[] until the candidate is no longer above limit or
     * the chain budget is exhausted.
     */
    } while ((cur_match = prev[cur_match & wmask]) > limit
             && --chain_length != 0);

    /* Clamp the result to the available lookahead (OUT assertion). */
    if ((uInt)best_len <= s->lookahead) return best_len;
    return s->lookahead;
}

#ifdef DEBUG_ZLIB
/* ===========================================================================
 * Debug-only check that the `length' bytes at window index `match' equal
 * the bytes at window index `start'. On a mismatch the byte pairs are
 * dumped to stderr and z_error() is called. When z_verbose > 1 the match
 * is also traced to stderr. Compiles to nothing without DEBUG_ZLIB.
 */
static void check_match(
        deflate_state *s,
        IPos start,
        IPos match,
        int length
)
{
    if (memcmp((char *)s->window + match,
               (char *)s->window + start, length) != 0) {
        fprintf(stderr, " start %u, match %u, length %d\n",
                start, match, length);
        /* Dump every (match, start) byte pair of the bogus match. */
        for (;;) {
            fprintf(stderr, "%c%c", s->window[match], s->window[start]);
            match++;
            start++;
            if (--length == 0)
                break;
        }
        z_error("invalid match");
    }

    if (z_verbose > 1) {
        fprintf(stderr,"\\[%d,%d]", start-match, length);
        for (;;) {
            putc(s->window[start], stderr);
            start++;
            if (--length == 0)
                break;
        }
    }
}
#else
#  define check_match(s, start, match, length)
#endif

/* ===========================================================================
 * Fill the window when the lookahead becomes insufficient.
 * Updates strstart and lookahead.
 *
 * s: deflate state. Input is pulled from s->strm through read_buf(); when
 *    the window is slid, match_start, strstart, block_start and every entry
 *    of head[] and prev[] are rebased by w_size.
 * Returns nothing; returns early as soon as s->strm->avail_in is 0.
 *
 * IN assertion: lookahead < MIN_LOOKAHEAD
 * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
 *    At least one byte has been read, or avail_in == 0; reads are
 *    performed for at least two bytes (required for the zip translate_eol
 *    option -- not supported here).
 */
static void fill_window(
        deflate_state *s
)
{
    register unsigned n, m;
    register Pos *p;
    unsigned more;    /* Amount of free space at the end of the window. */
    uInt wsize = s->w_size;

    do {
        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);

        /* Deal with !@#$% 64K limit: */
        if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
            more = wsize;

        } else if (more == (unsigned)(-1)) {
            /* Very unlikely, but possible on 16 bit machine if strstart == 0
             * and lookahead == 1 (input done one byte at time)
             */
            more--;

        /* If the window is almost full and there is insufficient lookahead,
         * move the upper half to the lower one to make room in the upper half.
         */
        } else if (s->strstart >= wsize+MAX_DIST(s)) {

            memcpy((char *)s->window, (char *)s->window+wsize,
                   (unsigned)wsize);
            /* Every window index kept in the state moves down by wsize. */
            s->match_start -= wsize;
            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
            s->block_start -= (long) wsize;

            /* Slide the hash table (could be avoided with 32 bit values
               at the expense of memory usage). We slide even when level == 0
               to keep the hash table consistent if we switch back to level > 0
               later. (Using level 0 permanently is not an optimal usage of
               zlib, so we don't care about this pathological case.)
             */
            n = s->hash_size;
            p = &s->head[n];
            /* Walk head[] from the top down; entries below wsize become NIL. */
            do {
                m = *--p;
                *p = (Pos)(m >= wsize ? m-wsize : NIL);
            } while (--n);

            n = wsize;
            p = &s->prev[n];
            /* Same rebasing for the wsize entries of prev[]. */
            do {
                m = *--p;
                *p = (Pos)(m >= wsize ? m-wsize : NIL);
                /* If n is not on any hash chain, prev[n] is garbage but
                 * its value will never be used.
                 */
            } while (--n);
            /* Sliding freed wsize bytes at the end of the window. */
            more += wsize;
        }
        if (s->strm->avail_in == 0) return;

        /* If there was no sliding:
         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
         *    more == window_size - lookahead - strstart
         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
         * => more >= window_size - 2*WSIZE + 2
         * In the BIG_MEM or MMAP case (not yet supported),
         *   window_size == input_size + MIN_LOOKAHEAD  &&
         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
         * Otherwise, window_size == 2*WSIZE so more >= 2.
         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
         */
        Assert(more >= 2, "more < 2");

        /* Append new input right after the bytes already in the window. */
        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
        s->lookahead += n;

        /* Initialize the hash value now that we have some input: */
        if (s->lookahead >= MIN_MATCH) {
            s->ins_h = s->window[s->strstart];
            UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
#if MIN_MATCH != 3
            Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
        }
        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
         * but this is not important since only literal bytes will be emitted.
         */

    /* Keep refilling while lookahead is short and input remains. */
    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
}

/* ===========================================================================
 * Flush the current block, with given end-of-file flag.
 * IN assertion: strstart is set to the end of the current match.
 *
 * The block handed to zlib_tr_flush_block() runs from block_start to
 * strstart (NULL is passed as its address when block_start is negative);
 * block_start is then moved to strstart and pending output is flushed.
 *
 * Both macros are wrapped in do { } while (0) so that they behave as a
 * single statement (safe in an unbraced if/else) and must be followed by
 * a semicolon.
 */
#define FLUSH_BLOCK_ONLY(s, eof) do { \
   zlib_tr_flush_block((s), ((s)->block_start >= 0L ? \
                   (char *)&(s)->window[(unsigned)(s)->block_start] : \
                   NULL), \
                (ulg)((long)(s)->strstart - (s)->block_start), \
                (eof)); \
   (s)->block_start = (s)->strstart; \
   flush_pending((s)->strm); \
   Tracev((stderr,"[FLUSH]")); \
} while (0)

/* Same but force premature exit if necessary: when no output space is left
 * after the flush, this RETURNS from the calling function with
 * finish_started (eof set) or need_more (eof clear). Note that `eof' is
 * evaluated more than once, so it must be free of side effects.
 */
#define FLUSH_BLOCK(s, eof) do { \
   FLUSH_BLOCK_ONLY(s, eof); \
   if ((s)->strm->avail_out == 0) return (eof) ? finish_started : need_more; \
} while (0)

/* ===========================================================================
 * Copy without compression as much as possible from the input stream, return
 * the current block state.
 * This function does not insert new strings in the dictionary since
 * uncompressible data is probably not useful. This function is used
 * only for the level=0 compression option.
 *
 * Returns need_more when the input ran dry under Z_NO_FLUSH or a flush left
 * no output space, finish_started when the final Z_FINISH flush left no
 * output space, and finish_done (Z_FINISH) or block_done otherwise.
 *
 * NOTE: this function should be optimized to avoid extra copying from
 * window to pending_buf.
 */
static block_state deflate_stored(
        deflate_state *s,
        int flush
)
{
    /* Stored blocks are limited to 0xffff bytes, pending_buf is limited
     * to pending_buf_size, and each stored block has a 5 byte header:
     */
    ulg block_cap = s->pending_buf_size - 5;
    ulg flush_at;       /* window index at which the block must go out */
    int finishing;

    if (block_cap > 0xffff) {
        block_cap = 0xffff;
    }

    /* Copy as much as possible from input to output: */
    for (;;) {
        /* Fill the window as much as possible: */
        if (s->lookahead <= 1) {

            Assert(s->strstart < s->w_size+MAX_DIST(s) ||
                   s->block_start >= (long)s->w_size, "slide too late");

            fill_window(s);
            if (s->lookahead == 0) {
                if (flush == Z_NO_FLUSH) {
                    return need_more;
                }
                break; /* flush the current block */
            }
        }
        Assert(s->block_start >= 0L, "block gone");

        /* Everything in the lookahead becomes part of the current block. */
        s->strstart += s->lookahead;
        s->lookahead = 0;

        /* Emit a stored block if pending_buf will be full: */
        flush_at = s->block_start + block_cap;
        if (s->strstart == 0 || (ulg)s->strstart >= flush_at) {
            /* strstart == 0 is possible when wraparound on 16-bit machine */
            s->lookahead = (uInt)(s->strstart - flush_at);
            s->strstart = (uInt)flush_at;
            FLUSH_BLOCK(s, 0);
        }
        /* Flush if we may have to slide, otherwise block_start may become
         * negative and the data will be gone:
         */
        if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) {
            FLUSH_BLOCK(s, 0);
        }
    }

    finishing = (flush == Z_FINISH);
    FLUSH_BLOCK(s, finishing);
    return finishing ? finish_done : block_done;
}

/* ===========================================================================
 * Compress as much as possible from the input stream, return the current
 * block state.
 * This function does not perform lazy evaluation of matches and inserts
 * new strings in the dictionary only for unmatched strings or for short
 * matches. It is used only for the fast compression options.
 *
 * s:     deflate state.
 * flush: flush mode; only Z_NO_FLUSH and Z_FINISH are tested here.
 * Returns need_more when lookahead is still short under Z_NO_FLUSH or a
 * FLUSH_BLOCK found no output space left, finish_started when the final
 * Z_FINISH flush found no output space left, and finish_done (Z_FINISH) or
 * block_done otherwise.
 */
static block_state deflate_fast(
        deflate_state *s,
        int flush
)
{
    IPos hash_head = NIL; /* head of the hash chain */
    int bflush;           /* set if current block must be flushed */

    for (;;) {
        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need MAX_MATCH bytes
         * for the next match, plus MIN_MATCH bytes to insert the
         * string following the next match.
         */
        if (s->lookahead < MIN_LOOKAHEAD) {
            fill_window(s);
            if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
                return need_more;
            }
            if (s->lookahead == 0) break; /* flush the current block */
        }

        /* Insert the string window[strstart .. strstart+2] in the
         * dictionary, and set hash_head to the head of the hash chain:
         */
        if (s->lookahead >= MIN_MATCH) {
            INSERT_STRING(s, s->strstart, hash_head);
        }

        /* Find the longest match, discarding those <= prev_length.
         * At this point we have always match_length < MIN_MATCH
         */
        if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) {
            /* To simplify the code, we prevent matches with the string
             * of window index 0 (in particular we have to avoid a match
             * of the string with itself at the start of the input file).
             */
            if (s->strategy != Z_HUFFMAN_ONLY) {
                s->match_length = longest_match (s, hash_head);
            }
            /* longest_match() sets match_start */
        }
        if (s->match_length >= MIN_MATCH) {
            check_match(s, s->strstart, s->match_start, s->match_length);

            /* Record a (distance, length - MIN_MATCH) pair. */
            bflush = zlib_tr_tally(s, s->strstart - s->match_start,
                               s->match_length - MIN_MATCH);

            s->lookahead -= s->match_length;

            /* Insert new strings in the hash table only if the match length
             * is not too large. This saves time but degrades compression.
             */
            if (s->match_length <= s->max_insert_length &&
                s->lookahead >= MIN_MATCH) {
                s->match_length--; /* string at strstart already in hash table */
                do {
                    s->strstart++;
                    INSERT_STRING(s, s->strstart, hash_head);
                    /* strstart never exceeds WSIZE-MAX_MATCH, so there are
                     * always MIN_MATCH bytes ahead.
                     */
                } while (--s->match_length != 0);
                s->strstart++; 
            } else {
                /* Skip the whole match without hashing its interior, then
                 * restart the rolling hash at the new position.
                 */
                s->strstart += s->match_length;
                s->match_length = 0;
                s->ins_h = s->window[s->strstart];
                UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
#if MIN_MATCH != 3
                Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
                /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not
                 * matter since it will be recomputed at next deflate call.
                 */
            }
        } else {
            /* No match, output a literal byte */
            Tracevv((stderr,"%c", s->window[s->strstart]));
            bflush = zlib_tr_tally (s, 0, s->window[s->strstart]);
            s->lookahead--;
            s->strstart++; 
        }
        /* FLUSH_BLOCK may return from this function (see its definition). */
        if (bflush) FLUSH_BLOCK(s, 0);
    }
    FLUSH_BLOCK(s, flush == Z_FINISH);
    return flush == Z_FINISH ? finish_done : block_done;
}

/* ===========================================================================
 * Same as above, but achieves better compression. We use a lazy
 * evaluation for matches: a match is finally adopted only if there is
 * no better match at the next window position.
 *
 * s:     deflate state.
 * flush: flush mode; only Z_NO_FLUSH and Z_FINISH are tested here.
 * Returns need_more when lookahead is still short under Z_NO_FLUSH or no
 * output space is left after emitting data, finish_started when the final
 * Z_FINISH flush found no output space left, and finish_done (Z_FINISH) or
 * block_done otherwise.
 */
static block_state deflate_slow(
        deflate_state *s,
        int flush
)
{
    IPos hash_head = NIL;    /* head of hash chain */
    int bflush;              /* set if current block must be flushed */

    /* Process the input block. */
    for (;;) {
        /* Make sure that we always have enough lookahead, except
         * at the end of the input file. We need MAX_MATCH bytes
         * for the next match, plus MIN_MATCH bytes to insert the
         * string following the next match.
         */
        if (s->lookahead < MIN_LOOKAHEAD) {
            fill_window(s);
            if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
                return need_more;
            }
            if (s->lookahead == 0) break; /* flush the current block */
        }

        /* Insert the string window[strstart .. strstart+2] in the
         * dictionary, and set hash_head to the head of the hash chain:
         */
        if (s->lookahead >= MIN_MATCH) {
            INSERT_STRING(s, s->strstart, hash_head);
        }

        /* Find the longest match, discarding those <= prev_length.
         * The match found at the previous position is saved in
         * prev_length/prev_match before the current one is computed.
         */
        s->prev_length = s->match_length, s->prev_match = s->match_start;
        s->match_length = MIN_MATCH-1;

        if (hash_head != NIL && s->prev_length < s->max_lazy_match &&
            s->strstart - hash_head <= MAX_DIST(s)) {
            /* To simplify the code, we prevent matches with the string
             * of window index 0 (in particular we have to avoid a match
             * of the string with itself at the start of the input file).
             */
            if (s->strategy != Z_HUFFMAN_ONLY) {
                s->match_length = longest_match (s, hash_head);
            }
            /* longest_match() sets match_start */

            /* Drop short matches under Z_FILTERED, and minimum-length
             * matches whose distance exceeds TOO_FAR.
             */
            if (s->match_length <= 5 && (s->strategy == Z_FILTERED ||
                 (s->match_length == MIN_MATCH &&
                  s->strstart - s->match_start > TOO_FAR))) {

                /* If prev_match is also MIN_MATCH, match_start is garbage
                 * but we will ignore the current match anyway.
                 */
                s->match_length = MIN_MATCH-1;
            }
        }
        /* If there was a match at the previous step and the current
         * match is not better, output the previous match:
         */
        if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) {
            uInt max_insert = s->strstart + s->lookahead - MIN_MATCH;
            /* Do not insert strings in hash table beyond this. */

            check_match(s, s->strstart-1, s->prev_match, s->prev_length);

            /* The previous match started at strstart-1. */
            bflush = zlib_tr_tally(s, s->strstart -1 - s->prev_match,
                                   s->prev_length - MIN_MATCH);

            /* Insert in hash table all strings up to the end of the match.
             * strstart-1 and strstart are already inserted. If there is not
             * enough lookahead, the last two strings are not inserted in
             * the hash table.
             */
            s->lookahead -= s->prev_length-1;
            s->prev_length -= 2;
            do {
                if (++s->strstart <= max_insert) {
                    INSERT_STRING(s, s->strstart, hash_head);
                }
            } while (--s->prev_length != 0);
            s->match_available = 0;
            s->match_length = MIN_MATCH-1;
            s->strstart++;

            /* FLUSH_BLOCK may return from this function. */
            if (bflush) FLUSH_BLOCK(s, 0);

        } else if (s->match_available) {
            /* If there was no match at the previous position, output a
             * single literal. If there was a match but the current match
             * is longer, truncate the previous match to a single literal.
             */
            Tracevv((stderr,"%c", s->window[s->strstart-1]));
            if (zlib_tr_tally (s, 0, s->window[s->strstart-1])) {
                FLUSH_BLOCK_ONLY(s, 0);
            }
            /* The cursor is advanced before the output-space check, so the
             * position is consistent if we return here.
             */
            s->strstart++;
            s->lookahead--;
            if (s->strm->avail_out == 0) return need_more;
        } else {
            /* There is no previous match to compare with, wait for
             * the next step to decide.
             */
            s->match_available = 1;
            s->strstart++;
            s->lookahead--;
        }
    }
    Assert (flush != Z_NO_FLUSH, "no flush?");
    /* Emit the literal that was still waiting for a decision. */
    if (s->match_available) {
        Tracevv((stderr,"%c", s->window[s->strstart-1]));
        zlib_tr_tally (s, 0, s->window[s->strstart-1]);
        s->match_available = 0;
    }
    FLUSH_BLOCK(s, flush == Z_FINISH);
    return flush == Z_FINISH ? finish_done : block_done;
}

/*
 * Size in bytes of the workspace deflate needs for the given window size
 * and memory level: the fixed deflate_workspace plus the window, prev, head
 * and overlay areas. A negative windowBits is replaced by its absolute
 * value. Arguments outside 9..15 (windowBits) or 1..MAX_MEM_LEVEL
 * (memLevel) trigger BUG_ON().
 */
int zlib_deflate_workspacesize(int windowBits, int memLevel)
{
    size_t total;

    /* undocumented feature: suppress zlib header */
    windowBits = (windowBits < 0) ? -windowBits : windowBits;

    /* Since the return value is typically passed to vmalloc() unchecked... */
    BUG_ON(!(memLevel >= 1 && memLevel <= MAX_MEM_LEVEL &&
             windowBits >= 9 && windowBits <= 15));

    total  = sizeof(deflate_workspace);
    total += zlib_deflate_window_memsize(windowBits);
    total += zlib_deflate_prev_memsize(windowBits);
    total += zlib_deflate_head_memsize(memLevel);
    total += zlib_deflate_overlay_memsize(memLevel);

    return total;
}

/* Return the value of DEFLATE_DFLTCC_ENABLED() as an int. */
int zlib_deflate_dfltcc_enabled(void)
{
        const int enabled = DEFLATE_DFLTCC_ENABLED();

        return enabled;
}

































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM task

#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TASK_H
#include <linux/tracepoint.h>

/*
 * task_newtask - records pid, comm, clone_flags and oom_score_adj
 * @task:        task whose pid, comm and signal->oom_score_adj are recorded
 * @clone_flags: flags value stored verbatim and printed in hex
 *
 * NOTE(review): the name suggests this fires when a new task is created;
 * the call site is not in this file -- confirm against the caller.
 */
TRACE_EVENT(task_newtask,

        TP_PROTO(struct task_struct *task, u64 clone_flags),

        TP_ARGS(task, clone_flags),

        TP_STRUCT__entry(
                __field(        pid_t,        pid)
                __array(        char,        comm, TASK_COMM_LEN)
                __field(        u64,    clone_flags)
                __field(        short,        oom_score_adj)
        ),

        TP_fast_assign(
                __entry->pid = task->pid;
                /* comm is copied as a fixed TASK_COMM_LEN-byte array */
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->clone_flags = clone_flags;
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),

        TP_printk("pid=%d comm=%s clone_flags=%llx oom_score_adj=%hd",
                __entry->pid, __entry->comm,
                __entry->clone_flags, __entry->oom_score_adj)
);

/*
 * task_rename - records a task's current comm and the requested new comm
 * @task: task whose current comm and signal->oom_score_adj are recorded
 * @comm: new name; copied with strscpy(), so it is truncated to fit
 *        TASK_COMM_LEN and always NUL-terminated
 *
 * The record is accessed through __entry throughout, matching the other
 * events in this file.
 */
TRACE_EVENT(task_rename,

        TP_PROTO(struct task_struct *task, const char *comm),

        TP_ARGS(task, comm),

        TP_STRUCT__entry(
                __array(        char, oldcomm,  TASK_COMM_LEN)
                __array(        char, newcomm,  TASK_COMM_LEN)
                __field(        short,        oom_score_adj)
        ),

        TP_fast_assign(
                /* task->comm is copied as a fixed-size array */
                memcpy(__entry->oldcomm, task->comm, TASK_COMM_LEN);
                strscpy(__entry->newcomm, comm, TASK_COMM_LEN);
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),

        TP_printk("oldcomm=%s newcomm=%s oom_score_adj=%hd",
                  __entry->oldcomm, __entry->newcomm, __entry->oom_score_adj)
);

/**
 * task_prctl_unknown - called on unknown prctl() option
 * @option:        option passed
 * @arg2:        arg2 passed
 * @arg3:        arg3 passed
 * @arg4:        arg4 passed
 * @arg5:        arg5 passed
 *
 * Called on an unknown prctl() option. All five values are stored verbatim
 * in the trace record.
 *
 * NOTE(review): arg2..arg5 are unsigned long but are printed with %ld, so
 * values above LONG_MAX show up as negative numbers -- confirm whether that
 * is intended before changing the format, since trace output may be parsed
 * by tools.
 */
TRACE_EVENT(task_prctl_unknown,

        TP_PROTO(int option, unsigned long arg2, unsigned long arg3,
                 unsigned long arg4, unsigned long arg5),

        TP_ARGS(option, arg2, arg3, arg4, arg5),

        TP_STRUCT__entry(
                __field(        int,                option)
                __field(        unsigned long,        arg2)
                __field(        unsigned long,        arg3)
                __field(        unsigned long,        arg4)
                __field(        unsigned long,        arg5)
        ),

        TP_fast_assign(
                __entry->option = option;
                __entry->arg2 = arg2;
                __entry->arg3 = arg3;
                __entry->arg4 = arg4;
                __entry->arg5 = arg5;
        ),

        TP_printk("option=%d arg2=%ld arg3=%ld arg4=%ld arg5=%ld",
                  __entry->option, __entry->arg2, __entry->arg3, __entry->arg4, __entry->arg5)
);

#endif

/* This part must be outside protection */
#include <trace/define_trace.h>

























































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>

struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
 * that moves the range.
 *
 * @MMU_NOTIFY_CLEAR: a page table entry is cleared (there are many reasons for
 * this, such as madvise() or replacing a page with another one).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: the update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough; there is no need to inspect changes to the CPU page
 * table (mprotect() syscall).
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: the update is due to a change in the read/write
 * flag for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still the same page and the
 * same access flags). The user should soft dirty the page in the end callback
 * to make sure that anyone relying on soft dirtiness catches pages that might
 * be written through non-CPU mappings.
 *
 * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
 * that the mm refcount is zero and the range is no longer accessible.
 *
 * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
 * a device driver to possibly ignore the invalidation if the
 * owner field matches the driver's device private pgmap owner.
 *
 * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive.
 * The owner is initialized to the value provided by the caller of
 * make_device_exclusive(), such that this caller can filter out these
 * events.
 */
enum mmu_notifier_event {
        MMU_NOTIFY_UNMAP = 0,
        MMU_NOTIFY_CLEAR,
        MMU_NOTIFY_PROTECTION_VMA,
        MMU_NOTIFY_PROTECTION_PAGE,
        MMU_NOTIFY_SOFT_DIRTY,
        MMU_NOTIFY_RELEASE,
        MMU_NOTIFY_MIGRATE,
        MMU_NOTIFY_EXCLUSIVE,
};

/*
 * Bit in mmu_notifier_range.flags. mmu_notifier_invalidate_range_start() sets
 * it, mmu_notifier_invalidate_range_start_nonblock() clears it, and
 * mmu_notifier_range_blockable() tests it.
 */
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

/*
 * Callback table supplied by a user of the mmu notifier interface. Each
 * callback receives the struct mmu_notifier subscription it was registered
 * through (alloc_notifier() excepted, which creates one).
 */
struct mmu_notifier_ops {
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
         * freed. This can run concurrently with other mmu notifier
         * methods (the ones invoked outside the mm context) and it
         * should tear down all secondary mmu mappings and freeze the
         * secondary mmu. If this method isn't implemented you have to
         * be sure that nothing could possibly write to the pages
         * through the secondary mmu by the time the last thread with
         * tsk->mm == mm exits.
         *
         * As a side note: the pages freed after ->release returns could
         * be immediately reallocated by the gart at an alias physical
         * address with a different cache model, so if ->release isn't
         * implemented because all _software_ driven memory accesses
         * through the secondary mmu are terminated by the time the
         * last thread of this mm quits, you also have to be sure that
         * speculative _hardware_ operations can't allocate dirty
         * cachelines in the cpu that could not be snooped and made
         * coherent with the other read and write operations happening
         * through the gart alias address, so leading to memory
         * corruption.
         */
        void (*release)(struct mmu_notifier *subscription,
                        struct mm_struct *mm);

        /*
         * clear_flush_young is called after the VM has
         * test-and-cleared the young/accessed bitflag in the
         * pte. This way the VM will provide proper aging to the
         * accesses to the page through the secondary MMUs and not
         * only to the ones through the Linux pte.
         * Start-end is necessary in case the secondary MMU is mapping the page
         * at a smaller granularity than the primary MMU.
         */
        int (*clear_flush_young)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);

        /*
         * clear_young is a lightweight version of clear_flush_young. Like the
         * latter, it is supposed to test-and-clear the young/accessed bitflag
         * in the secondary pte, but it may omit flushing the secondary tlb.
         */
        int (*clear_young)(struct mmu_notifier *subscription,
                           struct mm_struct *mm,
                           unsigned long start,
                           unsigned long end);

        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
         * frequently used without actually clearing the flag or tearing
         * down the secondary mapping on the page.
         */
        int (*test_young)(struct mmu_notifier *subscription,
                          struct mm_struct *mm,
                          unsigned long address);

        /*
         * invalidate_range_start() and invalidate_range_end() must be
         * paired and are called only when the mmap_lock and/or the
         * locks protecting the reverse maps are held. If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().
         *
         * NOTE(review): this structure has no invalidate_range() callback;
         * the sentence above presumably refers to
         * arch_invalidate_secondary_tlbs() - confirm.
         *
         * Invalidation of multiple concurrent ranges may be
         * optionally permitted by the driver. Either way the
         * establishment of sptes is forbidden in the range passed to
         * invalidate_range_start/end for the whole duration of the
         * invalidate_range_start/end critical section.
         *
         * invalidate_range_start() is called when all pages in the
         * range are still mapped and have at least a refcount of one.
         *
         * invalidate_range_end() is called when all pages in the
         * range have been unmapped and the pages have been freed by
         * the VM.
         *
         * The VM will remove the page table entries and potentially
         * the page between invalidate_range_start() and
         * invalidate_range_end(). If the page must not be freed
         * because of pending I/O or other circumstances then the
         * invalidate_range_start() callback (or the initial mapping
         * by the driver) must make sure that the refcount is kept
         * elevated.
         *
         * If the driver increases the refcount when the pages are
         * initially mapped into an address space then either
         * invalidate_range_start() or invalidate_range_end() may
         * decrease the refcount. If the refcount is decreased on
         * invalidate_range_start() then the VM can free pages as page
         * table entries are removed.  If the refcount is only
         * dropped on invalidate_range_end() then the driver itself
         * will drop the last refcount but it must take care to flush
         * any secondary tlb before doing the final free on the
         * page. Pages will no longer be referenced by the linux
         * address space but may still be referenced by sptes until
         * the last refcount is dropped.
         *
         * If mmu_notifier_range_blockable(range) is false then the callback
         * cannot sleep and has to return -EAGAIN if sleeping would be
         * required. 0 should be returned otherwise. Please note that
         * notifiers that can fail invalidate_range_start are not allowed to
         * implement invalidate_range_end, as there is no mechanism for
         * informing the notifier that its start failed.
         */
        int (*invalidate_range_start)(struct mmu_notifier *subscription,
                                      const struct mmu_notifier_range *range);
        void (*invalidate_range_end)(struct mmu_notifier *subscription,
                                     const struct mmu_notifier_range *range);

        /*
         * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB
         * which shares page-tables with the CPU. The
         * invalidate_range_start()/end() callbacks should not be implemented as
         * arch_invalidate_secondary_tlbs() already catches the points in time
         * when an external TLB needs to be flushed.
         *
         * This requires arch_invalidate_secondary_tlbs() to be called while
         * holding the ptl spin-lock and therefore this callback is not allowed
         * to sleep.
         *
         * This is called by architecture code whenever invalidating a TLB
         * entry. It is assumed that any secondary TLB has the same rules for
         * when invalidations are required. If this is not the case architecture
         * code will need to call this explicitly when required for secondary
         * TLB invalidation.
         */
        void (*arch_invalidate_secondary_tlbs)(
                                        struct mmu_notifier *subscription,
                                        struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long end);

        /*
         * These callbacks are used with the get/put interface to manage the
         * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
         * notifier for use with the mm.
         *
         * free_notifier() is only called after the mmu_notifier has been
         * fully put, calls to any ops callback are prevented and no ops
         * callbacks are currently running. It is called from a SRCU callback
         * and cannot sleep.
         */
        struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
        void (*free_notifier)(struct mmu_notifier *subscription);
};

/*
 * One subscription to the notifier chain of an mm.
 *
 * The notifier chains are protected by mmap_lock and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_lock locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_lock is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
        /* presumably the link into the mm's notifier chain - TODO confirm */
        struct hlist_node hlist;
        /* callbacks invoked for this subscription */
        const struct mmu_notifier_ops *ops;
        /* presumably the mm this subscription is attached to - TODO confirm */
        struct mm_struct *mm;
        /* NOTE(review): looks like deferred-free state for free_notifier() - confirm */
        struct rcu_head rcu;
        /* NOTE(review): looks like the get/put reference count - confirm */
        unsigned int users;
};

/**
 * struct mmu_interval_notifier_ops - callbacks of an interval notifier
 * @invalidate: Upon return the caller must stop using any SPTEs within this
 *              range. This function can sleep. Return false only if sleeping
 *              was required but mmu_notifier_range_blockable(range) is false.
 *              The cur_seq argument is the value to hand to
 *              mmu_interval_set_seq().
 */
struct mmu_interval_notifier_ops {
        bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
                           const struct mmu_notifier_range *range,
                           unsigned long cur_seq);
};

/*
 * Subscription object handed to the mmu_interval_notifier_insert*() and
 * mmu_interval_*() helpers declared in this header.
 */
struct mmu_interval_notifier {
        /* presumably describes the VA range being watched - TODO confirm */
        struct interval_tree_node interval_tree;
        /* callbacks for this subscription */
        const struct mmu_interval_notifier_ops *ops;
        /* presumably the mm passed at insert time - TODO confirm */
        struct mm_struct *mm;
        struct hlist_node deferred_item;
        /*
         * Stored by mmu_interval_set_seq() and compared against the caller's
         * sequence in mmu_interval_read_retry()/mmu_interval_check_retry().
         */
        unsigned long invalidate_seq;
};

#ifdef CONFIG_MMU_NOTIFIER

#ifdef CONFIG_LOCKDEP
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif

/*
 * Describes one invalidation passed to the invalidate_range_start()/end()
 * callbacks. Filled in by mmu_notifier_range_init() or
 * mmu_notifier_range_init_owner().
 */
struct mmu_notifier_range {
        struct mm_struct *mm;
        unsigned long start;
        unsigned long end;
        /* MMU_NOTIFIER_RANGE_* bits */
        unsigned flags;
        /* why the invalidation happens, see enum mmu_notifier_event */
        enum mmu_notifier_event event;
        /* only assigned by mmu_notifier_range_init_owner() */
        void *owner;
};

/*
 * Tell whether @mm has a notifier_subscriptions object attached.
 *
 * Returns non-zero when mm->notifier_subscriptions is set and zero when it
 * is NULL; the set case is hinted to the compiler as the unlikely one.
 */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return unlikely(mm->notifier_subscriptions != NULL);
}

struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
                                             struct mm_struct *mm);
/*
 * Wrapper around mmu_notifier_get_locked() for callers that do not already
 * hold the mmap lock: takes mmap_write_lock(mm) around the call and hands
 * back whatever mmu_notifier_get_locked() returned.
 */
static inline struct mmu_notifier *
mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
{
        struct mmu_notifier *subscription;

        mmap_write_lock(mm);
        subscription = mmu_notifier_get_locked(ops, mm);
        mmap_write_unlock(mm);

        return subscription;
}
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);

extern int mmu_notifier_register(struct mmu_notifier *subscription,
                                 struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
                                   struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
                                    struct mm_struct *mm);

unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long length,
                                 const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        unsigned long start, unsigned long length,
        const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);

/**
 * mmu_interval_set_seq - Save the invalidation sequence
 * @interval_sub: The subscription passed to invalidate
 * @cur_seq: The cur_seq passed to the invalidate() callback
 *
 * This must be called unconditionally from the invalidate callback of a
 * struct mmu_interval_notifier_ops under the same lock that is used to call
 * mmu_interval_read_retry(). It updates the sequence number for later use by
 * mmu_interval_read_retry(). The provided cur_seq will always be odd.
 *
 * If the caller does not call mmu_interval_read_begin() or
 * mmu_interval_read_retry() then this call is not required.
 */
static inline void
mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub,
                     unsigned long cur_seq)
{
        /* Pairs with the READ_ONCE in mmu_interval_check_retry() */
        WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
}

/**
 * mmu_interval_read_retry - End a read side critical section against a VA range
 * @interval_sub: The subscription
 * @seq: The return of the paired mmu_interval_read_begin()
 *
 * This MUST be called under a user provided lock that is also held
 * unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
 *
 * Each call should be paired with a single mmu_interval_read_begin() and
 * should be used to conclude the read side.
 *
 * Return: true if an invalidation collided with this critical section and
 * the caller should retry, i.e. the stored sequence no longer equals @seq.
 */
static inline bool
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
                        unsigned long seq)
{
        return seq != interval_sub->invalidate_seq;
}

/**
 * mmu_interval_check_retry - Test if a collision has occurred
 * @interval_sub: The subscription
 * @seq: The return of the matching mmu_interval_read_begin()
 *
 * This can be used in the critical section between mmu_interval_read_begin()
 * and mmu_interval_read_retry(). It can be called many times and does not
 * have to hold the user provided lock, which makes it suitable for loops and
 * other expensive operations that want to expedite a retry.
 *
 * Return: true if an invalidation has collided with this critical region, in
 * which case a future mmu_interval_read_retry() will return true as well.
 * False is not reliable and only suggests a collision may not have occurred.
 */
static inline bool
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
                         unsigned long seq)
{
        /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
        unsigned long latest_seq = READ_ONCE(interval_sub->invalidate_seq);

        return latest_seq != seq;
}

extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
                                      unsigned long start,
                                      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);

/*
 * Report whether MMU_NOTIFIER_RANGE_BLOCKABLE is set in @range->flags.
 */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE) != 0;
}

/*
 * Forward to __mmu_notifier_release() when @mm has notifier subscriptions;
 * otherwise do nothing.
 */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_release(mm);
}

/*
 * Forward to __mmu_notifier_clear_flush_young() for [@start, @end) when @mm
 * has notifier subscriptions and return its result; return 0 when there are
 * none.
 */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_flush_young(mm, start, end);
}

/*
 * Forward to __mmu_notifier_clear_young() for [@start, @end) when @mm has
 * notifier subscriptions and return its result; return 0 when there are none.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_young(mm, start, end);
}

/*
 * Forward to __mmu_notifier_test_young() for @address when @mm has notifier
 * subscriptions and return its result; return 0 when there are none.
 */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_test_young(mm, address);
}

/*
 * Blocking variant of the range-start notification. Marks @range as blockable
 * and calls __mmu_notifier_invalidate_range_start() when range->mm has
 * notifier subscriptions. The callee's return value is not propagated.
 */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        might_sleep();

        /* lockdep annotation taken even when there are no subscriptions */
        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(range->mm)) {
                range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
                __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

/*
 * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it
 * can return an error if a notifier can't proceed without blocking, in which
 * case you're not allowed to modify PTEs in the specified range.
 *
 * This is mainly intended for OOM handling.
 *
 * Returns 0 when range->mm has no notifier subscriptions, otherwise the
 * result of __mmu_notifier_invalidate_range_start().
 */
static inline int __must_check
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        int ret = 0;

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(range->mm)) {
                /* tell the callbacks that they must not sleep */
                range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
                ret = __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
        return ret;
}

/*
 * End-of-invalidation notification. Annotates a possible sleep when @range is
 * blockable, then calls __mmu_notifier_invalidate_range_end() if range->mm
 * has notifier subscriptions.
 */
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
        if (mmu_notifier_range_blockable(range))
                might_sleep();

        if (!mm_has_notifiers(range->mm))
                return;

        __mmu_notifier_invalidate_range_end(range);
}

/*
 * Forward to __mmu_notifier_arch_invalidate_secondary_tlbs() for
 * [@start, @end) when @mm has notifier subscriptions; otherwise do nothing.
 */
static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}

/*
 * Start @mm without notifier subscriptions, so that mm_has_notifiers()
 * reports false for it.
 */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
        mm->notifier_subscriptions = NULL;
}

/*
 * Forward to __mmu_notifier_subscriptions_destroy() when @mm has notifier
 * subscriptions; otherwise do nothing.
 */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_subscriptions_destroy(mm);
}


/*
 * mmu_notifier_range_init - fill in a notifier range descriptor
 * @range: descriptor to initialize
 * @event: reason for the invalidation
 * @flags: initial MMU_NOTIFIER_RANGE_* bits
 * @mm: address space the range belongs to
 * @start: first address of the range
 * @end: end address of the range
 *
 * Every field of @range is written. The owner is set to NULL here; use
 * mmu_notifier_range_init_owner() to record an owner.
 */
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
                                           enum mmu_notifier_event event,
                                           unsigned int flags,
                                           struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        range->event = event;
        range->mm = mm;
        range->start = start;
        range->end = end;
        range->flags = flags;
        /*
         * Do not leave the owner indeterminate: a callback that compares
         * range->owner would otherwise read whatever was in the caller's
         * (typically on-stack) descriptor.
         */
        range->owner = NULL;
}

/*
 * Same as mmu_notifier_range_init(), and additionally records @owner in
 * range->owner.
 */
static inline void mmu_notifier_range_init_owner(
                        struct mmu_notifier_range *range,
                        enum mmu_notifier_event event, unsigned int flags,
                        struct mm_struct *mm, unsigned long start,
                        unsigned long end, void *owner)
{
        mmu_notifier_range_init(range, event, flags, mm, start, end);
        range->owner = owner;
}

/*
 * Run ptep_clear_flush_young() on the pte and OR in the result of
 * mmu_notifier_clear_flush_young() for the PAGE_SIZE range starting at
 * __address. __vma and __address are evaluated once. Evaluates to the
 * combined value.
 */
#define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_clear_flush_young(___vma, ___address, __ptep);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * Run pmdp_clear_flush_young() on the pmd and OR in the result of
 * mmu_notifier_clear_flush_young() for the PMD_SIZE range starting at
 * __address. __vma and __address are evaluated once. Evaluates to the
 * combined value.
 */
#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PMD_SIZE);        \
        __young;                                                        \
})

/*
 * Run ptep_test_and_clear_young() on the pte and OR in the result of
 * mmu_notifier_clear_young() for the PAGE_SIZE range starting at __address.
 * __vma and __address are evaluated once. Evaluates to the combined value.
 */
#define ptep_clear_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * Run pmdp_test_and_clear_young() on the pmd and OR in the result of
 * mmu_notifier_clear_young() for the PMD_SIZE range starting at __address.
 * __vma and __address are evaluated once. Evaluates to the combined value.
 */
#define pmdp_clear_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PMD_SIZE);        \
        __young;                                                        \
})

#else /* CONFIG_MMU_NOTIFIER */

/*
 * Reduced range descriptor used when CONFIG_MMU_NOTIFIER is disabled: only
 * the address bounds are kept.
 */
struct mmu_notifier_range {
        unsigned long start;
        unsigned long end;
};

/*
 * Backend of the mmu_notifier_range_init*() macros when CONFIG_MMU_NOTIFIER
 * is disabled: stores the [@start, @end) bounds in @range.
 */
static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
                                            unsigned long start,
                                            unsigned long end)
{
        range->end = end;
        range->start = start;
}

/*
 * !CONFIG_MMU_NOTIFIER: the event, flags and mm arguments are dropped and
 * never evaluated; only start and end are stored.
 */
#define mmu_notifier_range_init(range,event,flags,mm,start,end)  \
        _mmu_notifier_range_init(range, start, end)
/*
 * !CONFIG_MMU_NOTIFIER: the event, flags, mm and owner arguments are dropped
 * and never evaluated; only start and end are stored.
 */
#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \
                                        end, owner) \
        _mmu_notifier_range_init(range, start, end)

/* !CONFIG_MMU_NOTIFIER: every range is reported as blockable. */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return true;
}

/* !CONFIG_MMU_NOTIFIER: no mm ever has notifiers. */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER: no-op. */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
}

/* !CONFIG_MMU_NOTIFIER: nothing to clear or flush, always returns 0. */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER: nothing to clear, always returns 0. */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER: nothing to test, always returns 0. */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER: no-op. */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}

/* !CONFIG_MMU_NOTIFIER: there is nothing that could block, always returns 0. */
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER: no-op. */
static inline
void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

/* !CONFIG_MMU_NOTIFIER: no-op. */
static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
{
}

/* !CONFIG_MMU_NOTIFIER: no-op. */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
}

/* !CONFIG_MMU_NOTIFIER: no-op. */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}

/* !CONFIG_MMU_NOTIFIER: constant false; the argument is not evaluated. */
#define mmu_notifier_range_update_to_read_only(r) false

/*
 * !CONFIG_MMU_NOTIFIER: the *_notify variants are plain aliases of the
 * page-table helpers, with no notifier call added.
 */
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young

/* !CONFIG_MMU_NOTIFIER: no-op. */
static inline void mmu_notifier_synchronize(void)
{
}

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */












































































































































































































































    6 

    6 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
#include <linux/unwind_deferred.h>
#include <linux/uaccess.h>
#include <linux/pidfs.h>

#include <uapi/linux/wait.h>

#include <asm/unistd.h>
#include <asm/mmu_context.h>

#include "exit.h"

/*
 * Tunable exported as the "oops_limit" entry of the "kernel" sysctl
 * directory when CONFIG_SYSCTL is set (see kern_exit_table).  The code that
 * compares against it is not in this part of the file.
 *
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
 * overflowing 32-bit refcounts or the ldsem writer count.
 */
static unsigned int oops_limit = 10000;

#ifdef CONFIG_SYSCTL
/*
 * Sysctl entries owned by this file; registered under "kernel" by
 * kernel_exit_sysctls_init().
 */
static const struct ctl_table kern_exit_table[] = {
        {
                /* Mode 0644 unsigned int backed directly by oops_limit. */
                .procname       = "oops_limit",
                .data           = &oops_limit,
                .maxlen         = sizeof(oops_limit),
                .mode           = 0644,
                .proc_handler   = proc_douintvec,
        },
};

/*
 * Late initcall: register kern_exit_table under the "kernel" sysctl
 * directory.  Always returns 0.
 */
static __init int kernel_exit_sysctls_init(void)
{
        register_sysctl_init("kernel", kern_exit_table);
        return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif

/*
 * Counter reported through sysfs by oops_count_show(); starts at zero.
 * The code that increments it is not in this part of the file.
 */
static atomic_t oops_count = ATOMIC_INIT(0);

#ifdef CONFIG_SYSFS
static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
                               char *page)
{
        return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
}

/* Read-only attribute "oops_count"; __ATTR_RO() wires it to oops_count_show(). */
static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);

/*
 * Late initcall: attach the oops_count attribute to kernel_kobj.  A NULL
 * group name is passed to sysfs_add_file_to_group().
 *
 * NOTE(review): the return value of sysfs_add_file_to_group() is ignored and
 * 0 is returned unconditionally, so a failure here is silent.
 */
static __init int kernel_exit_sysfs_init(void)
{
        sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
        return 0;
}
late_initcall(kernel_exit_sysfs_init);
#endif

/*
 * For things release_task() would like to do *after* tasklist_lock is released.
 *
 * @pids: one slot per pid type; passed to detach_pid() while the lock is
 *        held and handed to free_pids() by release_task() afterwards.
 */
struct release_task_post {
        struct pid *pids[PIDTYPE_MAX];
};

/*
 * Remove @p from the pid links and task lists.
 *
 * @post:       its ->pids array is passed to every detach_pid() call;
 *              release_task() later hands it to free_pids()
 * @p:          task being unhashed
 * @group_dead: when true, the TGID/PGID/SID links and the process-wide list
 *              links are dropped in addition to the per-thread ones
 *
 * Called from __exit_signal(), which expects tasklist_lock write-locked.
 */
static void __unhash_process(struct release_task_post *post, struct task_struct *p,
                             bool group_dead)
{
        /* Fetch the pid before detach_pid(); it is needed for the wakeup below. */
        struct pid *pid = task_pid(p);

        nr_threads--;

        detach_pid(post->pids, p, PIDTYPE_PID);
        /* Wake everyone sleeping on this pid's wait_pidfd queue. */
        wake_up_all(&pid->wait_pidfd);

        if (group_dead) {
                detach_pid(post->pids, p, PIDTYPE_TGID);
                detach_pid(post->pids, p, PIDTYPE_PGID);
                detach_pid(post->pids, p, PIDTYPE_SID);

                /* Unlink from the ->tasks list and from the parent's children. */
                list_del_rcu(&p->tasks);
                list_del_init(&p->sibling);
                __this_cpu_dec(process_counts);
        }
        /* Every thread, leader or not, leaves the ->thread_node list. */
        list_del_rcu(&p->thread_node);
}

/*
 * Detach @tsk from its sighand and fold its resource counters into the
 * totals kept in its signal_struct, then unhash it.
 *
 * @post: passed through to __unhash_process()
 * @tsk:  task being released
 *
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        /* Releasing the group leader is treated as the death of the group. */
        bool group_dead = thread_group_leader(tsk);
        struct sighand_struct *sighand;
        /* Only assigned, and only consumed, when group_dead is true. */
        struct tty_struct *tty;
        u64 utime, stime;

        sighand = rcu_dereference_check(tsk->sighand,
                                        lockdep_tasklist_lock_is_held());
        spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
        posix_cpu_timers_exit(tsk);
        if (group_dead)
                posix_cpu_timers_exit_group(tsk);
#endif

        if (group_dead) {
                /* Take the tty pointer over; it is put after siglock is dropped. */
                tty = sig->tty;
                sig->tty = NULL;
        } else {
                /*
                 * If there is any task waiting for the group exit
                 * then notify it:
                 */
                if (sig->notify_count > 0 && !--sig->notify_count)
                        wake_up_process(sig->group_exec_task);

                /* Do not leave curr_target pointing at the departing thread. */
                if (tsk == sig->curr_target)
                        sig->curr_target = next_thread(tsk);
        }

        /*
         * Accumulate here the counters for all threads as they die. We could
         * skip the group leader because it is the last user of signal_struct,
         * but we want to avoid the race with thread_group_cputime() which can
         * see the empty ->thread_head list.
         */
        task_cputime(tsk, &utime, &stime);
        write_seqlock(&sig->stats_lock);
        sig->utime += utime;
        sig->stime += stime;
        sig->gtime += task_gtime(tsk);
        sig->min_flt += tsk->min_flt;
        sig->maj_flt += tsk->maj_flt;
        sig->nvcsw += tsk->nvcsw;
        sig->nivcsw += tsk->nivcsw;
        sig->inblock += task_io_get_inblock(tsk);
        sig->oublock += task_io_get_oublock(tsk);
        task_io_accounting_add(&sig->ioac, &tsk->ioac);
        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
        sig->nr_threads--;
        /* The unhash happens inside the stats_lock write section. */
        __unhash_process(post, tsk, group_dead);
        write_sequnlock(&sig->stats_lock);

        /* Clear the task's sighand pointer before dropping siglock. */
        tsk->sighand = NULL;
        spin_unlock(&sighand->siglock);

        __cleanup_sighand(sighand);
        if (group_dead)
                tty_kref_put(tty);
}

/*
 * RCU callback queued by put_task_struct_rcu_user().  Recovers the task that
 * embeds @rhp as its ->rcu member, runs the kprobe, rethook and perf per-task
 * hooks, emits the sched_process_free tracepoint and finally calls
 * put_task_struct() on it.
 */
static void delayed_put_task_struct(struct rcu_head *rhp)
{
        struct task_struct *task;

        task = container_of(rhp, struct task_struct, rcu);

        kprobe_flush_task(task);
        rethook_flush_task(task);
        perf_event_delayed_put(task);
        trace_sched_process_free(task);
        put_task_struct(task);
}

/*
 * Drop one reference on @task->rcu_users.  When that was the last one,
 * queue delayed_put_task_struct() through call_rcu() on @task->rcu.
 */
void put_task_struct_rcu_user(struct task_struct *task)
{
        if (!refcount_dec_and_test(&task->rcu_users))
                return;

        call_rcu(&task->rcu, delayed_put_task_struct);
}

/*
 * Default, empty implementation called from release_task().  It is declared
 * __weak, so another definition of release_thread() can replace it at link
 * time.
 */
void __weak release_thread(struct task_struct *dead_task)
{
}

/*
 * Release the dead task @p: drop its NPROC accounting, detach it from the
 * signal/pid/task-list machinery under tasklist_lock, then do the remaining
 * cleanup with the lock released.
 *
 * If @p was the last non-leader thread of a group whose leader is already a
 * zombie, the leader's parent is notified; when do_notify_parent() reports
 * that the leader should be reaped here, the whole sequence is repeated for
 * the leader via the "repeat" label.
 */
void release_task(struct task_struct *p)
{
        struct release_task_post post;
        struct task_struct *leader;
        struct pid *thread_pid;
        int zap_leader;
repeat:
        /* Start every pass with an empty set of pids to free. */
        memset(&post, 0, sizeof(post));

        /* don't need to get the RCU readlock here - the process is dead and
         * can't be modifying its own credentials. But shut RCU-lockdep up */
        rcu_read_lock();
        dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        rcu_read_unlock();

        pidfs_exit(p);
        cgroup_release(p);

        /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
        thread_pid = task_pid(p);

        write_lock_irq(&tasklist_lock);
        ptrace_release_task(p);
        __exit_signal(&post, p);

        /*
         * If we are the last non-leader member of the thread
         * group, and the leader is zombie, then notify the
         * group leader's parent process. (if it wants notification.)
         */
        zap_leader = 0;
        leader = p->group_leader;
        if (leader != p && thread_group_empty(leader)
                        && leader->exit_state == EXIT_ZOMBIE) {
                /* for pidfs_exit() and do_notify_parent() */
                if (leader->signal->flags & SIGNAL_GROUP_EXIT)
                        leader->exit_code = leader->signal->group_exit_code;
                /*
                 * If we were the last child thread and the leader has
                 * exited already, and the leader's parent ignores SIGCHLD,
                 * then we are the one who should release the leader.
                 */
                zap_leader = do_notify_parent(leader, leader->exit_signal);
                if (zap_leader)
                        leader->exit_state = EXIT_DEAD;
        }

        write_unlock_irq(&tasklist_lock);
        /* @thread_pid can't go away until free_pids() below */
        proc_flush_pid(thread_pid);
        /* Feed the task's accumulated runtime to add_device_randomness(). */
        add_device_randomness(&p->se.sum_exec_runtime,
                              sizeof(p->se.sum_exec_runtime));
        free_pids(post.pids);
        release_thread(p);
        /*
         * This task was already removed from the process/thread/pid lists
         * and lock_task_sighand(p) can't succeed. Nobody else can touch
         * ->pending or, if group dead, signal->shared_pending. We can call
         * flush_sigqueue() lockless.
         */
        flush_sigqueue(&p->pending);
        if (thread_group_leader(p))
                flush_sigqueue(&p->signal->shared_pending);

        put_task_struct_rcu_user(p);

        /* Run the whole sequence again for the leader if it must be reaped here. */
        p = leader;
        if (unlikely(zap_leader))
                goto repeat;
}

/*
 * Wake the task currently recorded in @w->task, if any.
 *
 * Returns the value of wake_up_process() for that task, or 0 when @w->task
 * is NULL.  The task pointer is read and used inside an RCU read-side
 * critical section.
 */
int rcuwait_wake_up(struct rcuwait *w)
{
        int ret = 0;
        struct task_struct *task;

        rcu_read_lock();

        /*
         * Order condition vs @task, such that everything prior to the load
         * of @task is visible. This is the condition as to why the user called
         * rcuwait_wake() in the first place. Pairs with set_current_state()
         * barrier (A) in rcuwait_wait_event().
         *
         *    WAIT                      WAKE
         *    [S] tsk = current         [S] cond = true
         *        MB (A)                    MB (B)
         *    [L] cond                  [L] tsk
         */
        smp_mb(); /* (B) */

        task = rcu_dereference(w->task);
        if (task)
                ret = wake_up_process(task);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * @pgrp:         process group to examine
 * @ignored_task: member to leave out of the scan, or NULL
 *
 * Returns 0 as soon as a considered member has a real parent that is in a
 * different process group but in the same session, 1 otherwise.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
                                        struct task_struct *ignored_task)
{
        struct task_struct *member;

        do_each_pid_task(pgrp, PIDTYPE_PGID, member) {
                struct task_struct *parent;

                if (member == ignored_task)
                        continue;
                /* Skip members that have exited and have no other threads. */
                if (member->exit_state && thread_group_empty(member))
                        continue;

                parent = member->real_parent;
                if (is_global_init(parent))
                        continue;

                /* A parent inside the group is not an outside connection. */
                if (task_pgrp(parent) == pgrp)
                        continue;
                if (task_session(parent) == task_session(member))
                        return 0;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, member);

        return 1;
}

/*
 * Report whether the process group of the current task is orphaned, as
 * decided by will_become_orphaned_pgrp() with no task ignored.  The check
 * runs with tasklist_lock read-locked.
 */
int is_current_pgrp_orphaned(void)
{
        int orphaned;

        read_lock(&tasklist_lock);
        orphaned = will_become_orphaned_pgrp(task_pgrp(current), NULL);
        read_unlock(&tasklist_lock);

        return orphaned;
}

/*
 * Return true if at least one task in process group @pgrp has
 * SIGNAL_STOP_STOPPED set in its signal flags, false otherwise.
 */
static bool has_stopped_jobs(struct pid *pgrp)
{
        struct task_struct *member;

        do_each_pid_task(pgrp, PIDTYPE_PGID, member) {
                if (!(member->signal->flags & SIGNAL_STOP_STOPPED))
                        continue;
                return true;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, member);

        return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 *
 * @tsk:    task whose process group is examined
 * @parent: NULL on the exit path, in which case @tsk->real_parent is used
 *          and @tsk itself is ignored during the scan; non-NULL on the
 *          reparent path, in which case no task is ignored
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
        struct pid *pgrp = task_pgrp(tsk);
        struct task_struct *ignored_task;

        if (parent) {
                /* reparent: our child is in a different pgrp than
                 * we are, and it was the only connection outside.
                 */
                ignored_task = NULL;
        } else {
                /* exit: our father is in a different pgrp than
                 * we are and we were the only connection outside.
                 */
                parent = tsk->real_parent;
                ignored_task = tsk;
        }

        /* Only a parent in another group of the same session matters. */
        if (task_pgrp(parent) == pgrp)
                return;
        if (task_session(parent) != task_session(tsk))
                return;
        if (!will_become_orphaned_pgrp(pgrp, ignored_task))
                return;
        if (!has_stopped_jobs(pgrp))
                return;

        __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
        __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}

/*
 * Park the exiting task @tsk while a coredump described by @core_state is
 * in progress.
 *
 * The task links an on-stack core_thread into @core_state->dumper's list
 * (only if it has PF_SIGNALED set), accounts for itself in
 * @core_state->nr_threads, and then sleeps until self.task reads as NULL.
 */
static void coredump_task_exit(struct task_struct *tsk,
                               struct core_state *core_state)
{
        struct core_thread self;

        self.task = tsk;
        if (self.task->flags & PF_SIGNALED)
                self.next = xchg(&core_state->dumper.next, &self);
        else
                /* Not linked in: the wait loop below exits on its first check. */
                self.task = NULL;
        /*
         * Implies mb(), the result of xchg() must be visible
         * to core_state->dumper.
         */
        if (atomic_dec_and_test(&core_state->nr_threads))
                complete(&core_state->startup);

        /* Set the sleeping state before testing the condition, then schedule. */
        for (;;) {
                set_current_state(TASK_IDLE|TASK_FREEZABLE);
                if (!self.task) /* see coredump_finish() */
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);
}

#ifdef CONFIG_MEMCG
/*
 * Try to make @tsk the owner of @mm.
 *
 * Under task_lock(@tsk), re-check that @tsk still uses @mm; if so, release
 * the tasklist_lock read lock, publish @tsk as mm->owner and call
 * lru_gen_migrate_mm().
 *
 * Returns true on success, in which case tasklist_lock has been dropped;
 * returns false, with tasklist_lock still held, otherwise.
 */
static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm)
{
        bool ret = false;

        task_lock(tsk);
        if (likely(tsk->mm == mm)) {
                /* tsk can't pass exit_mm/exec_mmap and exit */
                read_unlock(&tasklist_lock);
                WRITE_ONCE(mm->owner, tsk);
                lru_gen_migrate_mm(mm);
                ret = true;
        }
        task_unlock(tsk);
        return ret;
}

/*
 * Walk the threads of @g looking for one that uses @mm and can be made its
 * owner via __try_to_set_owner().
 *
 * The walk stops early at the first thread that has a different, non-NULL
 * mm; threads whose mm is NULL are skipped.  Returns true if an owner was
 * installed, false otherwise.
 */
static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm)
{
        struct task_struct *thread;

        for_each_thread(g, thread) {
                struct mm_struct *thread_mm = READ_ONCE(thread->mm);

                if (thread_mm == mm) {
                        if (__try_to_set_owner(thread, mm))
                                return true;
                        continue;
                }

                /* A different live mm: give up on this thread group. */
                if (thread_mm)
                        break;
        }

        return false;
}

/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 *
 * Candidates are searched in this order: children of current, children of
 * current's real parent, then every process.  If none is found, mm->owner
 * is set to NULL.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
        struct task_struct *g, *p = current;

        /*
         * If the exiting or execing task is not the owner, it's
         * someone else's problem.
         */
        if (mm->owner != p)
                return;
        /*
         * The current owner is exiting/execing and there are no other
         * candidates.  Do not leave the mm pointing to a possibly
         * freed task structure.
         */
        if (atomic_read(&mm->mm_users) <= 1) {
                WRITE_ONCE(mm->owner, NULL);
                return;
        }

        read_lock(&tasklist_lock);
        /*
         * Search in the children
         */
        list_for_each_entry(g, &p->children, sibling) {
                /* On success the callee has already dropped tasklist_lock. */
                if (try_to_set_owner(g, mm))
                        goto ret;
        }
        /*
         * Search in the siblings
         */
        list_for_each_entry(g, &p->real_parent->children, sibling) {
                if (try_to_set_owner(g, mm))
                        goto ret;
        }
        /*
         * Search through everything else, we should not get here often.
         */
        for_each_process(g) {
                /* Stop scanning once no other user of the mm remains. */
                if (atomic_read(&mm->mm_users) <= 1)
                        break;
                if (g->flags & PF_KTHREAD)
                        continue;
                if (try_to_set_owner(g, mm))
                        goto ret;
        }
        read_unlock(&tasklist_lock);
        /*
         * We found no owner yet mm_users > 1: this implies that we are
         * most likely racing with swapoff (try_to_unuse()) or /proc or
         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
         */
        WRITE_ONCE(mm->owner, NULL);
 ret:
        return;

}
#endif /* CONFIG_MEMCG */

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 *
 * Detaches current from its mm: current->mm is cleared, the mm is handed to
 * enter_lazy_tlb(), a new owner is looked up via mm_update_next_owner() and
 * the mm reference is dropped with mmput().  exit_mm_release() is called
 * even when current has no mm.
 */
static void exit_mm(void)
{
        struct mm_struct *mm = current->mm;

        exit_mm_release(current, mm);
        if (!mm)
                return;
        mmap_read_lock(mm);
        /* Grab a lazy-TLB reference before current->mm is cleared below. */
        mmgrab_lazy_tlb(mm);
        BUG_ON(mm != current->active_mm);
        /* more a memory barrier than a real lock */
        task_lock(current);
        /*
         * When a thread stops operating on an address space, the loop
         * in membarrier_private_expedited() may not observe that
         * tsk->mm, and the loop in membarrier_global_expedited() may
         * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
         * rq->membarrier_state, so those would not issue an IPI.
         * Membarrier requires a memory barrier after accessing
         * user-space memory, before clearing tsk->mm or the
         * rq->membarrier_state.
         */
        smp_mb__after_spinlock();
        /* The mm switch-over below runs with local interrupts disabled. */
        local_irq_disable();
        current->mm = NULL;
        membarrier_update_current_mm(NULL);
        enter_lazy_tlb(mm, current);
        local_irq_enable();
        task_unlock(current);
        mmap_read_unlock(mm);
        mm_update_next_owner(mm);
        mmput(mm);
        if (test_thread_flag(TIF_MEMDIE))
                exit_oom_victim();
}

/*
 * Return the first thread in @p's thread group that does not have
 * PF_EXITING set, or NULL when every thread is exiting.
 */
static struct task_struct *find_alive_thread(struct task_struct *p)
{
        struct task_struct *thread;

        for_each_thread(p, thread) {
                if (thread->flags & PF_EXITING)
                        continue;
                return thread;
        }

        return NULL;
}

/*
 * Return the child reaper of @father's active pid namespace.
 *
 * If @father is not itself that reaper, the namespace's current reaper is
 * returned unchanged.  If it is, another live thread of @father's group is
 * installed as the new reaper and returned.  If no live thread exists,
 * tasklist_lock is dropped, every task queued on @dead is released,
 * zap_pid_ns_processes() is called, the lock is retaken and @father is
 * returned.
 *
 * Called with tasklist_lock write-locked; may drop and reacquire it.
 */
static struct task_struct *find_child_reaper(struct task_struct *father,
                                                struct list_head *dead)
        __releases(&tasklist_lock)
        __acquires(&tasklist_lock)
{
        struct pid_namespace *pid_ns = task_active_pid_ns(father);
        struct task_struct *reaper = pid_ns->child_reaper;
        struct task_struct *p, *n;

        if (likely(reaper != father))
                return reaper;

        reaper = find_alive_thread(father);
        if (reaper) {
                pid_ns->child_reaper = reaper;
                return reaper;
        }

        write_unlock_irq(&tasklist_lock);

        /* release_task() takes tasklist_lock itself, so drain @dead unlocked. */
        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }

        zap_pid_ns_processes(pid_ns);
        write_lock_irq(&tasklist_lock);

        return father;
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give it to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 *
 * @father:       the exiting task
 * @child_reaper: fallback returned for case 3
 *
 * Returns the task that should become the new parent.
 */
static struct task_struct *find_new_reaper(struct task_struct *father,
                                           struct task_struct *child_reaper)
{
        struct task_struct *thread, *reaper;

        thread = find_alive_thread(father);
        if (thread)
                return thread;

        if (father->signal->has_child_subreaper) {
                unsigned int ns_level = task_pid(father)->level;
                /*
                 * Find the first ->is_child_subreaper ancestor in our pid_ns.
                 * We can't check reaper != child_reaper to ensure we do not
                 * cross the namespaces, the exiting parent could be injected
                 * by setns() + fork().
                 * We check pid->level, this is slightly more efficient than
                 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
                 */
                for (reaper = father->real_parent;
                     task_pid(reaper)->level == ns_level;
                     reaper = reaper->real_parent) {
                        /* Stop the ancestor walk once init_task is reached. */
                        if (reaper == &init_task)
                                break;
                        if (!reaper->signal->is_child_subreaper)
                                continue;
                        /* A subreaper with no live thread is skipped. */
                        thread = find_alive_thread(reaper);
                        if (thread)
                                return thread;
                }
        }

        return child_reaper;
}

/*
* Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
                                struct list_head *dead)
{
        if (unlikely(p->exit_state == EXIT_DEAD))
                return;

        /* We don't want people slaying init. */
        p->exit_signal = SIGCHLD;

        /* If it has exited notify the new parent about this child's death. */
        if (!p->ptrace &&
            p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
                if (do_notify_parent(p, p->exit_signal)) {
                        p->exit_state = EXIT_DEAD;
                        list_add(&p->ptrace_entry, dead);
                }
        }

        kill_orphaned_pgrp(p, father);
}

/*
 * Hand all children of the exiting @father over to a new parent.
 *
 * The new parent is chosen by find_child_reaper() and find_new_reaper(), so
 * it may be another thread of @father's group, a child subreaper, or the
 * namespace's child reaper.  Children that must be released by the caller
 * are queued on @dead.
 */
static void forget_original_parent(struct task_struct *father,
                                        struct list_head *dead)
{
        struct task_struct *p, *t, *reaper;

        if (unlikely(!list_empty(&father->ptraced)))
                exit_ptrace(father, dead);

        /* Can drop and reacquire tasklist_lock */
        reaper = find_child_reaper(father, dead);
        if (list_empty(&father->children))
                return;

        reaper = find_new_reaper(father, reaper);
        list_for_each_entry(p, &father->children, sibling) {
                for_each_thread(p, t) {
                        RCU_INIT_POINTER(t->real_parent, reaper);
                        /* ->parent must equal @father exactly when t is not ptraced. */
                        BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
                        if (likely(!t->ptrace))
                                t->parent = t->real_parent;
                        /* Deliver the parent-death signal the thread has set, if any. */
                        if (t->pdeath_signal)
                                group_send_sig_info(t->pdeath_signal,
                                                    SEND_SIG_NOINFO, t,
                                                    PIDTYPE_TGID);
                }
                /*
                 * If this is a threaded reparent there is no need to
                 * notify anyone anything has happened.
                 */
                if (!same_thread_group(reaper, father))
                        reparent_leader(father, p, dead);
        }
        /* Move the whole children list onto the new parent in one step. */
        list_splice_tail_init(&father->children, &reaper->children);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 *
 * @tsk:        the exiting task
 * @group_dead: non-zero when the whole thread group is exiting; only then
 *              is the orphaned-process-group check run for the group leader
 *
 * Reparents @tsk's children, marks @tsk EXIT_ZOMBIE, notifies the parent
 * where appropriate, and releases every task that ended up on the local
 * dead list (possibly including @tsk itself) after tasklist_lock is dropped.
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
        bool autoreap;
        struct task_struct *p, *n;
        LIST_HEAD(dead);

        write_lock_irq(&tasklist_lock);
        forget_original_parent(tsk, &dead);

        if (group_dead)
                kill_orphaned_pgrp(tsk->group_leader, NULL);

        tsk->exit_state = EXIT_ZOMBIE;

        if (unlikely(tsk->ptrace)) {
                /*
                 * Traced: use exit_signal only for a lone leader that was
                 * not reparented by ptrace, SIGCHLD otherwise.
                 */
                int sig = thread_group_leader(tsk) &&
                                thread_group_empty(tsk) &&
                                !ptrace_reparented(tsk) ?
                        tsk->exit_signal : SIGCHLD;
                autoreap = do_notify_parent(tsk, sig);
        } else if (thread_group_leader(tsk)) {
                /* Untraced leader: notify only once no other thread is left. */
                autoreap = thread_group_empty(tsk) &&
                        do_notify_parent(tsk, tsk->exit_signal);
        } else {
                autoreap = true;
                /* untraced sub-thread */
                do_notify_pidfd(tsk);
        }

        if (autoreap) {
                tsk->exit_state = EXIT_DEAD;
                list_add(&tsk->ptrace_entry, &dead);
        }

        /* mt-exec, de_thread() is waiting for group leader */
        if (unlikely(tsk->signal->notify_count < 0))
                wake_up_process(tsk->signal->group_exec_task);
        write_unlock_irq(&tasklist_lock);

        /* release_task() takes tasklist_lock itself, so drain the list unlocked. */
        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }
}

#ifdef CONFIG_DEBUG_STACK_USAGE
#ifdef CONFIG_STACK_GROWSUP
/*
 * CONFIG_STACK_GROWSUP variant: starting one word below end_of_stack(@p),
 * step downwards over zero words and return the distance in bytes between
 * end_of_stack(@p) and the first non-zero word found.
 */
unsigned long stack_not_used(struct task_struct *p)
{
        unsigned long *cursor = end_of_stack(p);

        /* Skip over canary */
        cursor--;
        while (!*cursor)
                cursor--;

        return (unsigned long)end_of_stack(p) - (unsigned long)cursor;
}
#else /* !CONFIG_STACK_GROWSUP */
/*
 * Downward-growing stack variant: starting one word above end_of_stack(@p),
 * step upwards over zero words and return the distance in bytes between
 * the first non-zero word found and end_of_stack(@p).
 */
unsigned long stack_not_used(struct task_struct *p)
{
        unsigned long *cursor = end_of_stack(p);

        /* Skip over canary */
        cursor++;
        while (!*cursor)
                cursor++;

        return (unsigned long)cursor - (unsigned long)end_of_stack(p);
}
#endif /* CONFIG_STACK_GROWSUP */

/*
 * Count the maximum pages reached in kernel stacks
 *
 * Bumps exactly one KSTACK_* vm event chosen by the size bucket that
 * @used_stack (in bytes) falls into.  Buckets larger than THREAD_SIZE are
 * compiled out, and the whole body is empty without
 * CONFIG_VM_EVENT_COUNTERS.
 */
static inline void kstack_histogram(unsigned long used_stack)
{
#ifdef CONFIG_VM_EVENT_COUNTERS
        if (used_stack <= 1024)
                count_vm_event(KSTACK_1K);
#if THREAD_SIZE > 1024
        else if (used_stack <= 2048)
                count_vm_event(KSTACK_2K);
#endif
#if THREAD_SIZE > 2048
        else if (used_stack <= 4096)
                count_vm_event(KSTACK_4K);
#endif
#if THREAD_SIZE > 4096
        else if (used_stack <= 8192)
                count_vm_event(KSTACK_8K);
#endif
#if THREAD_SIZE > 8192
        else if (used_stack <= 16384)
                count_vm_event(KSTACK_16K);
#endif
#if THREAD_SIZE > 16384
        else if (used_stack <= 32768)
                count_vm_event(KSTACK_32K);
#endif
#if THREAD_SIZE > 32768
        else if (used_stack <= 65536)
                count_vm_event(KSTACK_64K);
#endif
#if THREAD_SIZE > 65536
        /* Catch-all for anything above 64K. */
        else
                count_vm_event(KSTACK_REST);
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS */
}

/*
 * Account the current task's stack usage in the histogram and log a
 * message whenever a task leaves less unused stack than any task seen so
 * far.  The record (lowest_to_date) is only updated under low_water_lock.
 */
static void check_stack_usage(void)
{
        static DEFINE_SPINLOCK(low_water_lock);
        static int lowest_to_date = THREAD_SIZE;
        unsigned long unused = stack_not_used(current);

        kstack_histogram(THREAD_SIZE - unused);

        /* Unlocked fast path: this task did not set a new record. */
        if (unused >= lowest_to_date)
                return;

        spin_lock(&low_water_lock);
        /* Re-check under the lock; another task may have gone lower. */
        if (unused < lowest_to_date) {
                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
                        current->comm, task_pid_nr(current), unused);
                lowest_to_date = unused;
        }
        spin_unlock(&low_water_lock);
}
#else /* !CONFIG_DEBUG_STACK_USAGE */
/* Without CONFIG_DEBUG_STACK_USAGE there is nothing to check. */
static inline void check_stack_usage(void)
{
}
#endif /* CONFIG_DEBUG_STACK_USAGE */

/*
 * Called early in do_exit(): drop this task from ->quick_threads, turn the
 * exit of the last such thread into a group exit with @code if none is in
 * progress yet, and then wait in coredump_task_exit() if a core dump was
 * pending when the siglock was held.
 */
static void synchronize_group_exit(struct task_struct *tsk, long code)
{
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *sh = tsk->sighand;
        struct core_state *pending_dump;

        spin_lock_irq(&sh->siglock);

        sig->quick_threads--;
        if (!sig->quick_threads && !(sig->flags & SIGNAL_GROUP_EXIT)) {
                sig->flags = SIGNAL_GROUP_EXIT;
                sig->group_exit_code = code;
                sig->group_stop_count = 0;
        }

        /*
         * PF_POSTCOREDUMP must be set and ->core_state sampled under the
         * same siglock hold: the thread that starts a core dump bumps
         * ->nr_threads once for every group member that does not have
         * PF_POSTCOREDUMP set yet.
         */
        tsk->flags |= PF_POSTCOREDUMP;
        pending_dump = sig->core_state;

        spin_unlock_irq(&sh->siglock);

        if (unlikely(pending_dump))
                coredump_task_exit(tsk, pending_dump);
}

/*
 * do_exit - terminate the calling task (current) with exit code @code.
 *
 * Never returns: the function ends in do_task_dead().  The body is a long,
 * strictly ordered sequence of per-subsystem exit hooks.  group_dead is
 * true when this call performed the final decrement of ->signal->live and
 * gates the steps that run only once per thread group.
 */
void __noreturn do_exit(long code)
{
        struct task_struct *tsk = current;
        int group_dead;

        /* Neither disabled interrupts nor a set ->plug is expected here. */
        WARN_ON(irqs_disabled());
        WARN_ON(tsk->plug);

        kcov_task_exit(tsk);
        kmsan_task_exit(tsk);

        synchronize_group_exit(tsk, code);
        ptrace_event(PTRACE_EVENT_EXIT, code);
        user_events_exit(tsk);

        io_uring_files_cancel();
        exit_signals(tsk);  /* sets PF_EXITING */

        seccomp_filter_release(tsk);

        acct_update_integrals(tsk);
        /* True only for the task that drops ->signal->live to zero. */
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                /*
                 * If the last thread of global init has exited, panic
                 * immediately to get a useable coredump.
                 */
                if (unlikely(is_global_init(tsk)))
                        panic("Attempted to kill init! exitcode=0x%08x\n",
                                tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
                hrtimer_cancel(&tsk->signal->real_timer);
                exit_itimers(tsk);
#endif
                /* Record the RSS high-water mark while ->mm is still attached. */
                if (tsk->mm)
                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
        }
        acct_collect(code, group_dead);
        if (group_dead)
                tty_audit_exit();
        audit_free(tsk);

        /* Publish the exit code before the taskstats/tracing hooks run. */
        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);
        unwind_deferred_task_exit(tsk);
        trace_sched_process_exit(tsk, group_dead);

        /*
         * Since sampling can touch ->mm, make sure to stop everything before we
         * tear it down.
         *
         * Also flushes inherited counters to the parent - before the parent
         * gets woken up by child-exit notifications.
         */
        perf_event_exit_task(tsk);

        exit_mm();

        if (group_dead)
                acct_process();

        exit_sem(tsk);
        exit_shm(tsk);
        exit_files(tsk);
        exit_fs(tsk);
        if (group_dead)
                disassociate_ctty(1);
        exit_task_namespaces(tsk);
        exit_task_work(tsk);
        exit_thread(tsk);

        sched_autogroup_exit_task(tsk);
        cgroup_exit(tsk);

        /*
         * FIXME: do that only when needed, using sched_exit tracepoint
         */
        flush_ptrace_hw_breakpoint(tsk);

        exit_tasks_rcu_start();
        exit_notify(tsk, group_dead);
        proc_exit_connector(tsk);
        mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
        if (unlikely(current->pi_state_cache))
                kfree(current->pi_state_cache);
#endif
        /*
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held();

        /* Release remaining per-task resources, each only if present. */
        if (tsk->io_context)
                exit_io_context(tsk);

        if (tsk->splice_pipe)
                free_pipe_info(tsk->splice_pipe);

        if (tsk->task_frag.page)
                put_page(tsk->task_frag.page);

        exit_task_stack_account(tsk);

        check_stack_usage();
        /*
         * Preemption is disabled here and not re-enabled anywhere below in
         * this function; the per-CPU update that follows runs with it off.
         */
        preempt_disable();
        if (tsk->nr_dirtied)
                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
        exit_rcu();
        exit_tasks_rcu_finish();

        lockdep_free_task(tsk);
        do_task_dead();
}

/*
 * make_task_dead - kill the current task after a fatal kernel error.
 * @signr: value handed to do_exit() as the exit code.
 *
 * Panics when called from interrupt context, for the idle task (pid 0), or
 * once the oops counter reaches a non-zero oops_limit.  Otherwise repairs
 * the interrupt and preemption state and exits the task.  Never returns.
 */
void __noreturn make_task_dead(int signr)
{
        /*
         * Take the task off the cpu after something catastrophic has
         * happened.
         *
         * We can get here from a kernel oops, sometimes with preemption off.
         * Start by checking for critical errors.
         * Then fix up important state like USER_DS and preemption.
         * Then do everything else.
         */
        struct task_struct *tsk = current;
        unsigned int limit;

        if (unlikely(in_interrupt()))
                panic("Aiee, killing interrupt handler!");
        if (unlikely(!tsk->pid))
                panic("Attempted to kill the idle task!");

        if (unlikely(irqs_disabled())) {
                pr_info("note: %s[%d] exited with irqs disabled\n",
                        current->comm, task_pid_nr(current));
                local_irq_enable();
        }
        if (unlikely(in_atomic())) {
                pr_info("note: %s[%d] exited with preempt_count %d\n",
                        current->comm, task_pid_nr(current),
                        preempt_count());
                preempt_count_set(PREEMPT_ENABLED);
        }

        /*
         * Every time the system oopses, if the oops happens while a reference
         * to an object was held, the reference leaks.
         * If the oops doesn't also leak memory, repeated oopsing can cause
         * reference counters to wrap around (if they're not using refcount_t).
         * This means that repeated oopsing can make unexploitable-looking bugs
         * exploitable through repeated oopsing.
         * To make sure this can't happen, place an upper bound on how often the
         * kernel may oops without panic().
         */
        limit = READ_ONCE(oops_limit);
        /* A limit of zero disables the check; limit is unsigned, hence %u. */
        if (atomic_inc_return(&oops_count) >= limit && limit)
                panic("Oopsed too often (kernel.oops_limit is %u)", limit);

        /*
         * We're taking recursive faults here in make_task_dead. Safest is to just
         * leave this task alone and wait for reboot.
         */
        if (unlikely(tsk->flags & PF_EXITING)) {
                pr_alert("Fixing recursive fault but reboot is needed!\n");
                futex_exit_recursive(tsk);
                tsk->exit_state = EXIT_DEAD;
                refcount_inc(&tsk->rcu_users);
                do_task_dead();
        }

        do_exit(signr);
}

/*
 * exit(2): terminate the calling task.  Only the low 8 bits of @error_code
 * are kept; they are placed in bits 8-15 of the value passed to do_exit().
 */
SYSCALL_DEFINE1(exit, int, error_code)
{
        const int status = (error_code & 0xff) << 8;

        do_exit(status);
}

/*
 * Terminate the whole thread group.  Used for fatal signals and by
 * sys_exit_group (below).
 *
 * If a group exit is already in progress its recorded code wins; if the
 * group is in the middle of an exec (->group_exec_task set) the caller
 * exits with 0.  Otherwise @exit_code is recorded, SIGNAL_GROUP_EXIT is
 * set and the other threads are zapped, all under the siglock.
 */
void __noreturn
do_group_exit(int exit_code)
{
        struct signal_struct *sig = current->signal;
        struct sighand_struct *sighand;

        /* Lock-free fast paths; do_exit() does not return. */
        if (sig->flags & SIGNAL_GROUP_EXIT)
                do_exit(sig->group_exit_code);
        if (sig->group_exec_task)
                do_exit(0);

        sighand = current->sighand;
        spin_lock_irq(&sighand->siglock);
        if (sig->flags & SIGNAL_GROUP_EXIT) {
                /* Another thread got here before we took the lock.  */
                exit_code = sig->group_exit_code;
        } else if (sig->group_exec_task) {
                exit_code = 0;
        } else {
                sig->group_exit_code = exit_code;
                sig->flags = SIGNAL_GROUP_EXIT;
                zap_other_threads(current);
        }
        spin_unlock_irq(&sighand->siglock);

        do_exit(exit_code);
        /* NOTREACHED */
}

/*
 * exit_group(2): kill every thread in the calling thread group.  Any
 * external process blocked in wait4() gets the correct exit code, even
 * when the caller is not the thread group leader.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
        /* Keep only the low byte and place it in bits 8-15. */
        const int status = (error_code & 0xff) << 8;

        do_group_exit(status);
        /* NOTREACHED */
        return 0;
}

/*
 * Return 1 if @p matches the pid selection in @wo: either no specific pid
 * type was requested (PIDTYPE_MAX) or @p's pid of the requested type is
 * ->wo_pid.  Return 0 otherwise.
 */
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
        if (wo->wo_type == PIDTYPE_MAX)
                return 1;

        return task_pid_type(p, wo->wo_type) == wo->wo_pid;
}

/*
 * Decide whether @p is a child this wait is interested in.
 * Returns 1 if so, 0 if @p must be ignored.
 */
static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
        bool is_clone_child, wants_clone;

        if (!eligible_pid(wo, p))
                return 0;

        /*
         * __WALL, or being traced by us, selects every child regardless
         * of how it was created.
         */
        if (ptrace || (wo->wo_flags & __WALL))
                return 1;

        /*
         * A "clone" child is one that reports to its parent using a
         * signal other than SIGCHLD, or a non-leader thread which we can
         * only see if it is traced by us.  __WCLONE selects exactly the
         * clone children; without it exactly the non-clone children are
         * selected.
         */
        is_clone_child = p->exit_signal != SIGCHLD;
        wants_clone = !!(wo->wo_flags & __WCLONE);

        return is_clone_child == wants_clone;
}

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 *
 * The nonzero return value is @p's pid as seen from the caller's pid
 * namespace (task_pid_vnr()), sampled on entry.  When ->wo_info is set it
 * is filled with the decoded cause/status plus that pid and @p's uid.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
        int state, status;
        pid_t pid = task_pid_vnr(p);
        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
        struct waitid_info *infop;

        if (!likely(wo->wo_flags & WEXITED))
                return 0;

        /*
         * WNOWAIT: report the status but leave the zombie in place.  A
         * reference on @p is held across getrusage() because tasklist_lock
         * is dropped first.  ->wo_stat is not written on this path.
         */
        if (unlikely(wo->wo_flags & WNOWAIT)) {
                status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                        ? p->signal->group_exit_code : p->exit_code;
                get_task_struct(p);
                read_unlock(&tasklist_lock);
                sched_annotate_sleep();
                if (wo->wo_rusage)
                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
                put_task_struct(p);
                goto out_info;
        }
        /*
         * Move the task's state to DEAD/TRACE, only one thread can do this.
         */
        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
                EXIT_TRACE : EXIT_DEAD;
        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
                return 0;
        /*
         * We own this thread, nobody else can reap it.
         */
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();

        /*
         * Check thread_group_leader() to exclude the traced sub-threads.
         */
        if (state == EXIT_DEAD && thread_group_leader(p)) {
                struct signal_struct *sig = p->signal;
                struct signal_struct *psig = current->signal;
                unsigned long maxrss;
                u64 tgutime, tgstime;

                /*
                 * The resource counters for the group leader are in its
                 * own task_struct.  Those for dead threads in the group
                 * are in its signal_struct, as are those for the child
                 * processes it has previously reaped.  All these
                 * accumulate in the parent's signal_struct c* fields.
                 *
                 * We don't bother to take a lock here to protect these
                 * p->signal fields because the whole thread group is dead
                 * and nobody can change them.
                 *
                 * psig->stats_lock also protects us from our sub-threads
                 * which can reap other children at the same time.
                 *
                 * We use thread_group_cputime_adjusted() to get times for
                 * the thread group, which consolidates times for all threads
                 * in the group including the group leader.
                 */
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                write_seqlock_irq(&psig->stats_lock);
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
                psig->cmin_flt +=
                        p->min_flt + sig->min_flt + sig->cmin_flt;
                psig->cmaj_flt +=
                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
                psig->cnvcsw +=
                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
                psig->cnivcsw +=
                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
                psig->cinblock +=
                        task_io_get_inblock(p) +
                        sig->inblock + sig->cinblock;
                psig->coublock +=
                        task_io_get_oublock(p) +
                        sig->oublock + sig->coublock;
                /* cmaxrss is a maximum, not a sum: only ever raise it. */
                maxrss = max(sig->maxrss, sig->cmaxrss);
                if (psig->cmaxrss < maxrss)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
                write_sequnlock_irq(&psig->stats_lock);
        }

        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        /* A group exit reports the group's code, otherwise the task's own. */
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        wo->wo_stat = status;

        if (state == EXIT_TRACE) {
                write_lock_irq(&tasklist_lock);
                /* We dropped tasklist, ptracer could die and untrace */
                ptrace_unlink(p);

                /* If parent wants a zombie, don't release it now */
                state = EXIT_ZOMBIE;
                if (do_notify_parent(p, p->exit_signal))
                        state = EXIT_DEAD;
                p->exit_state = state;
                write_unlock_irq(&tasklist_lock);
        }
        if (state == EXIT_DEAD)
                release_task(p);

out_info:
        infop = wo->wo_info;
        if (infop) {
                /*
                 * Low 7 bits clear: normal exit, the code is in the bits
                 * above bit 7.  Otherwise the low 7 bits are reported as
                 * the status and bit 0x80 selects CLD_DUMPED over
                 * CLD_KILLED.
                 */
                if ((status & 0x7f) == 0) {
                        infop->cause = CLD_EXITED;
                        infop->status = status >> 8;
                } else {
                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
                        infop->status = status & 0x7f;
                }
                infop->pid = pid;
                infop->uid = uid;
        }

        return pid;
}

/*
 * Return a pointer to the code that describes @p's current stop, or NULL
 * if @p is not stopped in the requested sense.  For a ptrace wait this is
 * ->exit_code of a traced task that is not JOBCTL_LISTENING; otherwise it
 * is the group's ->group_exit_code while SIGNAL_STOP_STOPPED is set.
 */
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
        if (!ptrace)
                return (p->signal->flags & SIGNAL_STOP_STOPPED) ?
                        &p->signal->group_exit_code : NULL;

        if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
                return &p->exit_code;

        return NULL;
}

/**
 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
 * @wo: wait options
 * @ptrace: is the wait for ptrace
 * @p: task to wait for
 *
 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
 * Unless WNOWAIT is set, the stop code is consumed (reset to zero) and
 * reported through @wo->wo_stat.
 *
 * CONTEXT:
 * read_lock(&tasklist_lock), which is released if return value is
 * non-zero.  Also, grabs and releases @p->sighand->siglock.
 *
 * RETURNS:
 * 0 if wait condition didn't exist and search for other wait conditions
 * should continue.  Non-zero return, -errno on failure and @p's pid on
 * success, implies that tasklist_lock is released and wait condition
 * search should terminate.
 */
static int wait_task_stopped(struct wait_opts *wo,
                                int ptrace, struct task_struct *p)
{
        struct waitid_info *info;
        int stop_code = 0, *code_ptr, cause;
        uid_t uid = 0; /* unneeded, required by compiler */
        pid_t pid;

        /*
         * Traditionally we see ptrace'd stopped tasks regardless of options.
         */
        if (!ptrace && !(wo->wo_flags & WUNTRACED))
                return 0;

        /* Unlocked peek first; the check is repeated under siglock. */
        if (!task_stopped_code(p, ptrace))
                return 0;

        spin_lock_irq(&p->sighand->siglock);
        code_ptr = task_stopped_code(p, ptrace);
        if (likely(code_ptr)) {
                stop_code = *code_ptr;
                if (stop_code) {
                        /* Consume the event unless the caller only peeks. */
                        if (!unlikely(wo->wo_flags & WNOWAIT))
                                *code_ptr = 0;

                        uid = from_kuid_munged(current_user_ns(), task_uid(p));
                }
        }
        spin_unlock_irq(&p->sighand->siglock);

        if (!stop_code)
                return 0;

        /*
         * Now we are pretty sure this task is interesting.
         * Pin it so it can't be reaped out from under us once
         * tasklist_lock is dropped; getrusage() below may take page
         * faults for user memory, so the lock must not be held across it.
         */
        get_task_struct(p);
        pid = task_pid_vnr(p);
        cause = ptrace ? CLD_TRAPPED : CLD_STOPPED;
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        if (likely(!(wo->wo_flags & WNOWAIT)))
                wo->wo_stat = (stop_code << 8) | 0x7f;

        info = wo->wo_info;
        if (info) {
                info->cause = cause;
                info->status = stop_code;
                info->pid = pid;
                info->uid = uid;
        }

        return pid;
}

/*
 * Handle do_wait work for one task in a live, non-stopped state.
 * read_lock(&tasklist_lock) is held on entry.  A zero return means the
 * lock is still held and this task is uninteresting.  A nonzero return
 * (the task's pid) means the lock has been released and the system call
 * should return.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
        struct waitid_info *info;
        uid_t uid;
        pid_t pid;

        /* Nothing to do unless asked for, and unless the flag looks set. */
        if (!unlikely(wo->wo_flags & WCONTINUED) ||
            !(p->signal->flags & SIGNAL_STOP_CONTINUED))
                return 0;

        spin_lock_irq(&p->sighand->siglock);
        /* Re-check with the lock held.  */
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
                spin_unlock_irq(&p->sighand->siglock);
                return 0;
        }
        /* Consume the event unless the caller only peeks. */
        if (!unlikely(wo->wo_flags & WNOWAIT))
                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
        uid = from_kuid_munged(current_user_ns(), task_uid(p));
        spin_unlock_irq(&p->sighand->siglock);

        pid = task_pid_vnr(p);
        /* Keep @p alive across getrusage() once tasklist_lock is gone. */
        get_task_struct(p);
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        info = wo->wo_info;
        if (info) {
                info->cause = CLD_CONTINUED;
                info->pid = pid;
                info->uid = uid;
                info->status = SIGCONT;
        } else {
                wo->wo_stat = 0xffff;
        }

        return pid;
}

/*
 * Consider @p for a wait by @parent.
 *
 * @ptrace is nonzero when @p is being considered as a tracee (as
 * ptrace_do_wait() does) and zero when it is considered as a child (as
 * do_wait_thread() does).
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
                                struct task_struct *p)
{
        /*
         * We can race with wait_task_zombie() from another thread.
         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
         * can't confuse the checks below.
         */
        int exit_state = READ_ONCE(p->exit_state);
        int ret;

        /* Already being released: nothing left to report. */
        if (unlikely(exit_state == EXIT_DEAD))
                return 0;

        ret = eligible_child(wo, ptrace, p);
        if (!ret)
                return ret;

        if (unlikely(exit_state == EXIT_TRACE)) {
                /*
                 * ptrace == 0 means we are the natural parent. In this case
                 * we should clear notask_error, debugger will notify us.
                 */
                if (likely(!ptrace))
                        wo->notask_error = 0;
                return 0;
        }

        if (likely(!ptrace) && unlikely(p->ptrace)) {
                /*
                 * If it is traced by its real parent's group, just pretend
                 * the caller is ptrace_do_wait() and reap this child if it
                 * is zombie.
                 *
                 * This also hides group stop state from real parent; otherwise
                 * a single stop can be reported twice as group and ptrace stop.
                 * If a ptracer wants to distinguish these two events for its
                 * own children it should create a separate process which takes
                 * the role of real parent.
                 */
                if (!ptrace_reparented(p))
                        ptrace = 1;
        }

        /* slay zombie? */
        if (exit_state == EXIT_ZOMBIE) {
                /* we don't reap group leaders with subthreads */
                if (!delay_group_leader(p)) {
                        /*
                         * A zombie ptracee is only visible to its ptracer.
                         * Notification and reaping will be cascaded to the
                         * real parent when the ptracer detaches.
                         */
                        if (unlikely(ptrace) || likely(!p->ptrace))
                                return wait_task_zombie(wo, p);
                }

                /*
                 * Allow access to stopped/continued state via zombie by
                 * falling through.  Clearing of notask_error is complex.
                 *
                 * When !@ptrace:
                 *
                 * If WEXITED is set, notask_error should naturally be
                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
                 * so, if there are live subthreads, there are events to
                 * wait for.  If all subthreads are dead, it's still safe
                 * to clear - this function will be called again in a
                 * finite amount of time once all the subthreads are
                 * released and will then return without clearing.
                 *
                 * When @ptrace:
                 *
                 * Stopped state is per-task and thus can't change once the
                 * target task dies.  Only continued and exited can happen.
                 * Clear notask_error if WCONTINUED | WEXITED.
                 */
                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
                        wo->notask_error = 0;
        } else {
                /*
                 * @p is alive and it's gonna stop, continue or exit, so
                 * there always is something to wait for.
                 */
                wo->notask_error = 0;
        }

        /*
         * Wait for stopped.  Depending on @ptrace, different stopped state
         * is used and the two don't interact with each other.
         */
        ret = wait_task_stopped(wo, ptrace, p);
        if (ret)
                return ret;

        /*
         * Wait for continued.  There's only one continued state and the
         * ptracer can consume it which can confuse the real parent.  Don't
         * use WCONTINUED from ptracer.  You don't need or want it.
         */
        return wait_task_continued(wo, p);
}

/*
 * Do the work of do_wait() for one thread in the group, @tsk: consider
 * every task on @tsk's ->children list as a (non-ptrace) wait candidate.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * A nonzero return is final and means tasklist_lock has been unlocked.
 * A zero return means the search should continue; ->notask_error is then
 * 0 if there were any eligible children, or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *child;
        int ret = 0;

        list_for_each_entry(child, &tsk->children, sibling) {
                ret = wait_consider_task(wo, 0, child);
                if (ret)
                        break;
        }

        return ret;
}

/*
 * Counterpart of do_wait_thread() for tracees: consider every task on
 * @tsk's ->ptraced list with the ptrace flag set.  Returns the first
 * nonzero result from wait_consider_task(), or 0 if there was none.
 */
static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *tracee;
        int ret = 0;

        list_for_each_entry(tracee, &tsk->ptraced, ptrace_entry) {
                ret = wait_consider_task(wo, 1, tracee);
                if (ret)
                        break;
        }

        return ret;
}

/*
 * Decide whether an event on @p should wake the waiter described by @wo.
 * @p must match the waiter's pid selection and, with __WNOTHREAD, its
 * ->parent must be the very task stored in ->child_wait.private.
 */
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
{
        if (!eligible_pid(wo, p))
                return false;

        return !((wo->wo_flags & __WNOTHREAD) &&
                 wo->child_wait.private != p->parent);
}

/*
 * Wait-queue callback installed by do_wait(): @key is the task the
 * wake-up is about.  Wake-ups the waiter has no interest in are filtered
 * out by returning 0; the rest go to default_wake_function().
 */
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                int sync, void *key)
{
        struct task_struct *child = key;
        struct wait_opts *wo;

        wo = container_of(wait, struct wait_opts, child_wait);
        if (!pid_child_should_wake(wo, child))
                return 0;

        return default_wake_function(wait, mode, sync, key);
}

/*
 * Wake the TASK_INTERRUPTIBLE waiters queued on @parent's
 * ->signal->wait_chldexit, passing @p as the wake-up key.  Waiters that
 * registered child_wait_callback() use that key to filter the wake-up.
 */
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
        __wake_up_sync_key(&parent->signal->wait_chldexit,
                           TASK_INTERRUPTIBLE, p);
}

/*
 * Check whether @target counts as a child (or, with @ptrace, a tracee) of
 * the caller: its ->real_parent (or ->parent for ptrace) is either current
 * itself or, unless __WNOTHREAD is set, a task in current's thread group.
 */
static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
                                 struct task_struct *target)
{
        struct task_struct *parent;

        if (ptrace)
                parent = target->parent;
        else
                parent = target->real_parent;

        if (current == parent)
                return true;

        /* __WNOTHREAD restricts the wait to the calling thread's own children. */
        if (wo->wo_flags & __WNOTHREAD)
                return false;

        return same_thread_group(current, parent);
}

/*
 * Fast path for PIDTYPE_PID waits: look the target up directly instead of
 * walking the child and tracee lists.  The task is first considered as a
 * child (PIDTYPE_TGID lookup), then as a tracee (PIDTYPE_PID lookup).
 * Returns the first nonzero wait_consider_task() result, or 0.
 */
static int do_wait_pid(struct wait_opts *wo)
{
        struct task_struct *child, *tracee;
        int retval;

        child = pid_task(wo->wo_pid, PIDTYPE_TGID);
        if (child && is_effectively_child(wo, false, child)) {
                retval = wait_consider_task(wo, false, child);
                if (retval)
                        return retval;
        }

        tracee = pid_task(wo->wo_pid, PIDTYPE_PID);
        if (!tracee || !tracee->ptrace ||
            !is_effectively_child(wo, true, tracee))
                return 0;

        return wait_consider_task(wo, true, tracee);
}

/*
 * One pass over the wait candidates described by @wo.
 *
 * Returns the nonzero result of the first candidate that produced one (in
 * which case the callee has already dropped tasklist_lock), -ERESTARTSYS
 * if some candidate might match later and WNOHANG is not set, and
 * otherwise ->notask_error (0 or -ECHILD).
 */
long __do_wait(struct wait_opts *wo)
{
        struct task_struct *tsk;
        long retval;

        /*
         * Start from "no such child".  ->notask_error is cleared to zero
         * as soon as we see any child that might later match our
         * criteria, even if we are not able to reap it yet.  If nothing
         * can match at all, skip the scan.
         */
        wo->notask_error = -ECHILD;
        if (wo->wo_type < PIDTYPE_MAX) {
                if (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))
                        goto notask;
        }

        read_lock(&tasklist_lock);

        if (wo->wo_type == PIDTYPE_PID) {
                retval = do_wait_pid(wo);
                if (retval)
                        return retval;
        } else {
                tsk = current;
                do {
                        /* Children first, then tracees, for each thread. */
                        retval = do_wait_thread(wo, tsk);
                        if (!retval)
                                retval = ptrace_do_wait(wo, tsk);
                        if (retval)
                                return retval;

                        if (wo->wo_flags & __WNOTHREAD)
                                break;
                } while_each_thread(current, tsk);
        }
        read_unlock(&tasklist_lock);

notask:
        retval = wo->notask_error;
        if (retval || (wo->wo_flags & WNOHANG))
                return retval;

        return -ERESTARTSYS;
}

/*
 * Blocking front end of __do_wait(): queue @wo on the caller's
 * ->signal->wait_chldexit and repeat __do_wait() until it returns
 * something other than -ERESTARTSYS or a signal is pending.
 */
static long do_wait(struct wait_opts *wo)
{
        int retval;

        trace_sched_process_wait(wo->wo_pid);

        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
        wo->child_wait.private = current;
        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

        for (;;) {
                /* Set the state before scanning, then sleep if nothing matched. */
                set_current_state(TASK_INTERRUPTIBLE);
                retval = __do_wait(wo);
                if (retval != -ERESTARTSYS || signal_pending(current))
                        break;
                schedule();
        }

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

        return retval;
}

/*
 * Validate waitid() arguments and fill in @wo.
 *
 * Returns 0 on success; -EINVAL for unknown option bits, for options that
 * select no event, for an unknown @which or an out-of-range @upid; or the
 * error from pidfd_get_pid() for P_PIDFD.  On success ->wo_pid may hold a
 * struct pid obtained here; kernel_waitid() releases it with put_pid().
 */
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
                          struct waitid_info *infop, int options,
                          struct rusage *ru)
{
        struct pid *pid = NULL;
        unsigned int pidfd_flags = 0;
        enum pid_type type;

        /* Reject unknown option bits and requests that select no event. */
        if ((options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
                         __WNOTHREAD|__WCLONE|__WALL)) ||
            !(options & (WEXITED|WSTOPPED|WCONTINUED)))
                return -EINVAL;

        switch (which) {
        case P_ALL:
                type = PIDTYPE_MAX;
                break;
        case P_PID:
                if (upid <= 0)
                        return -EINVAL;

                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
                break;
        case P_PGID:
                if (upid < 0)
                        return -EINVAL;

                type = PIDTYPE_PGID;
                /* A @upid of zero selects the caller's own process group. */
                pid = upid ? find_get_pid(upid)
                           : get_task_pid(current, PIDTYPE_PGID);
                break;
        case P_PIDFD:
                if (upid < 0)
                        return -EINVAL;

                type = PIDTYPE_PID;
                pid = pidfd_get_pid(upid, &pidfd_flags);
                if (IS_ERR(pid))
                        return PTR_ERR(pid);
                break;
        default:
                return -EINVAL;
        }

        wo->wo_type = type;
        wo->wo_pid = pid;
        wo->wo_info = infop;
        wo->wo_rusage = ru;
        /* O_NONBLOCK in the flags reported for the pidfd implies WNOHANG. */
        wo->wo_flags = (pidfd_flags & O_NONBLOCK) ? (options | WNOHANG)
                                                  : options;

        return 0;
}

/*
 * Common backend of the native and compat waitid() syscalls: prepare the
 * wait_opts, run do_wait() and drop the pid reference afterwards.
 *
 * Returns the do_wait() result, an error from kernel_waitid_prepare(), or
 * -EAGAIN as described below.
 */
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
                          int options, struct rusage *ru)
{
        struct wait_opts wo;
        long ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);

        if (ret)
                return ret;

        ret = do_wait(&wo);

        /*
         * WNOHANG set in wo_flags but not in the caller's options was added
         * by kernel_waitid_prepare() for an O_NONBLOCK pidfd. A zero result
         * is reported as -EAGAIN in that case.
         */
        if (ret == 0 && (wo.wo_flags & WNOHANG) && !(options & WNOHANG))
                ret = -EAGAIN;

        put_pid(wo.wo_pid);
        return ret;
}

/*
 * waitid() system call entry point.
 *
 * Runs kernel_waitid() and copies the result out to user space: the
 * optional rusage to @ru and the child information to @infop.
 *
 * A positive result from kernel_waitid() is reported to the caller as 0
 * with si_signo set to SIGCHLD. Returns -EFAULT if a user copy fails.
 */
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
                infop, int, options, struct rusage __user *, ru)
{
        struct rusage r;
        /* Zero-initialised, so unset fields are copied out as 0. */
        struct waitid_info info = {.status = 0};
        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
        int signo = 0;

        if (err > 0) {
                signo = SIGCHLD;
                err = 0;
                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
                        return -EFAULT;
        }
        if (!infop)
                return err;

        /*
         * Note that @infop is filled in for every value of err, including
         * errors; in those cases signo is 0.
         */
        if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;

        /* Each unsafe_put_user() jumps to Efault on failure. */
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(0, &infop->si_errno, Efault);
        unsafe_put_user(info.cause, &infop->si_code, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
        user_write_access_end();
        return err;
Efault:
        /* The user access window must be closed on the error path too. */
        user_write_access_end();
        return -EFAULT;
}

/*
 * In-kernel backend of wait4() and waitpid().
 *
 * @upid is mapped to a wait target as follows:
 *   > 0    PIDTYPE_PID, the pid @upid
 *   == 0   PIDTYPE_PGID, the caller's own process group
 *   == -1  PIDTYPE_MAX, no specific pid
 *   < -1   PIDTYPE_PGID, the process group -@upid
 *
 * WEXITED is always added to @options. On a positive do_wait() result the
 * status is stored to @stat_addr when that pointer is non-NULL.
 *
 * Returns the do_wait() result, -EINVAL for unknown option bits, -ESRCH
 * for @upid == INT_MIN, or -EFAULT if storing the status fails.
 */
long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
                  struct rusage *ru)
{
        struct wait_opts wo;
        struct pid *target = NULL;
        enum pid_type kind;
        long ret;

        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
                        __WNOTHREAD|__WCLONE|__WALL))
                return -EINVAL;

        /* -INT_MIN is not defined, so it cannot be negated below. */
        if (upid == INT_MIN)
                return -ESRCH;

        if (upid > 0) {
                kind = PIDTYPE_PID;
                target = find_get_pid(upid);
        } else if (upid == 0) {
                kind = PIDTYPE_PGID;
                target = get_task_pid(current, PIDTYPE_PGID);
        } else if (upid == -1) {
                kind = PIDTYPE_MAX;
        } else /* upid < -1 */ {
                kind = PIDTYPE_PGID;
                target = find_get_pid(-upid);
        }

        wo.wo_flags        = options | WEXITED;
        wo.wo_type        = kind;
        wo.wo_pid        = target;
        wo.wo_rusage        = ru;
        wo.wo_info        = NULL;
        wo.wo_stat        = 0;

        ret = do_wait(&wo);
        put_pid(target);

        if (ret > 0 && stat_addr) {
                if (put_user(wo.wo_stat, stat_addr))
                        ret = -EFAULT;
        }

        return ret;
}

/*
 * Wait (WEXITED only) for the task identified by @pid.
 *
 * @stat is written only when do_wait() returns a positive value and the
 * collected status is non-zero; otherwise *@stat is left untouched.
 *
 * Returns the do_wait() result.
 */
int kernel_wait(pid_t pid, int *stat)
{
        struct wait_opts wo = {
                .wo_flags        = WEXITED,
                .wo_type        = PIDTYPE_PID,
                .wo_pid                = find_get_pid(pid),
        };
        int ret = do_wait(&wo);

        if (ret > 0 && wo.wo_stat != 0)
                *stat = wo.wo_stat;

        /* Drop the reference taken by find_get_pid() above. */
        put_pid(wo.wo_pid);
        return ret;
}

/*
 * wait4() system call entry point: run kernel_wait4() and, on a positive
 * result, copy the resource usage to @ru when the caller supplied one.
 */
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
                int, options, struct rusage __user *, ru)
{
        struct rusage r;
        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);

        /* Nothing to copy out on error, on a zero result, or without @ru. */
        if (err <= 0 || !ru)
                return err;

        return copy_to_user(ru, &r, sizeof(struct rusage)) ? -EFAULT : err;
}

#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() remains for compatibility. waitpid() should be
 * implemented by calling sys_wait4() from libc.a.
 *
 * It forwards to kernel_wait4() without an rusage buffer, so it behaves
 * like wait4() with a NULL @ru.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
        return kernel_wait4(pid, stat_addr, options, NULL);
}

#endif

#ifdef CONFIG_COMPAT
/*
 * Compat wait4(): same as the native syscall, except that the resource
 * usage is written out in compat layout via put_compat_rusage().
 */
COMPAT_SYSCALL_DEFINE4(wait4,
        compat_pid_t, pid,
        compat_uint_t __user *, stat_addr,
        int, options,
        struct compat_rusage __user *, ru)
{
        struct rusage r;
        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);

        /* Nothing to copy out on error, on a zero result, or without @ru. */
        if (err <= 0 || !ru)
                return err;

        return put_compat_rusage(&r, ru) ? -EFAULT : err;
}

/*
 * Compat waitid() system call entry point.
 *
 * Runs kernel_waitid() and copies the result out to user space: the
 * optional rusage to @uru and the child information to @infop.
 *
 * A positive result from kernel_waitid() is reported to the caller as 0
 * with si_signo set to SIGCHLD. Returns -EFAULT if a user copy fails.
 */
COMPAT_SYSCALL_DEFINE5(waitid,
                int, which, compat_pid_t, pid,
                struct compat_siginfo __user *, infop, int, options,
                struct compat_rusage __user *, uru)
{
        struct rusage ru;
        /* Zero-initialised, so unset fields are copied out as 0. */
        struct waitid_info info = {.status = 0};
        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
        int signo = 0;
        if (err > 0) {
                signo = SIGCHLD;
                err = 0;
                if (uru) {
                        /* kernel_waitid() overwrites everything in ru */
                        /*
                         * With COMPAT_USE_64BIT_TIME the native struct is
                         * copied out unchanged; otherwise it is converted
                         * by put_compat_rusage().
                         */
                        if (COMPAT_USE_64BIT_TIME)
                                err = copy_to_user(uru, &ru, sizeof(ru));
                        else
                                err = put_compat_rusage(&ru, uru);
                        if (err)
                                return -EFAULT;
                }
        }

        if (!infop)
                return err;

        /*
         * Note that @infop is filled in for every value of err, including
         * errors; in those cases signo is 0.
         */
        if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;

        /* Each unsafe_put_user() jumps to Efault on failure. */
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(0, &infop->si_errno, Efault);
        unsafe_put_user(info.cause, &infop->si_code, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
        user_write_access_end();
        return err;
Efault:
        /* The user access window must be closed on the error path too. */
        user_write_access_end();
        return -EFAULT;
}
#endif

/*
 * Default abort() implementation: trigger BUG() and, should that return,
 * panic(). It is declared __weak, so another definition of abort() can
 * replace it at link time.
 *
 * This needs to be __function_aligned as GCC implicitly makes any
 * implementation of abort() cold and drops alignment specified by
 * -falign-functions=N.
 *
 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
 */
__weak __function_aligned void abort(void)
{
        BUG();

        /* if that doesn't kill us, halt */
        panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);






























































































































































   42 



   42 

   41 
   42 
































   42 


















   42 






   42 


   42 























   42 

   42 



























   40 

   42 

    2 






























































   42 
   41 
































































































































   42 






































































































































































































































































































































   12 









   12 












   12 


   12 














   12 






































































































































































   42 


































































   42 
   42 


   41 




















    2 





    2 













    2 

    2 
























































































































   42 
   42 



   42 



   42 












   42 







   42 



























   42 













   41 

   42 


   42 



















































































































































































































































   95 
















   42 





   41 






   41 











   94 
























   94 
   95 
   92 







   93 
   95 
















   94 



























































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  High-resolution kernel timers
 *
 *  In contrast to the low-resolution timeout API, aka timer wheel,
 *  hrtimers provide finer resolution and accuracy depending on system
 *  configuration and capabilities.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *        Based on the original timer wheel code
 *
 *        Help, testing, suggestions, bugfixes, improvements were
 *        provided by:
 *
 *        George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
 *        et al.
 */

#include <linux/cpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <trace/events/timer.h>

#include "tick-internal.h"

/*
 * Masks for selecting the soft and hard context timers from
 * cpu_base->active
 */
/* Number of hard bases; the soft bases start at this index. */
#define MASK_SHIFT                (HRTIMER_BASE_MONOTONIC_SOFT)
/* The low MASK_SHIFT bits: one bit per hard base. */
#define HRTIMER_ACTIVE_HARD        ((1U << MASK_SHIFT) - 1)
/* The same bit pattern shifted up by MASK_SHIFT: one bit per soft base. */
#define HRTIMER_ACTIVE_SOFT        (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
/* Union of the hard and soft masks. */
#define HRTIMER_ACTIVE_ALL        (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)

static void retrigger_next_event(void *arg);
static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);

/*
 * The timer bases:
 *
 * There are more clockids than hrtimer bases. Thus, we index
 * into the timer bases by the hrtimer_base_type enum. When trying
 * to reach a base using a clockid, hrtimer_clockid_to_base()
 * is used to convert from clockid to the proper hrtimer_base_type.
 */
/*
 * Per-CPU hrtimer base. The clock_base array holds four hard bases
 * followed by four soft bases; each soft base uses the same clockid as
 * the hard base it mirrors.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
        .clock_base =
        {
                /* Hard bases */
                {
                        .index = HRTIMER_BASE_MONOTONIC,
                        .clockid = CLOCK_MONOTONIC,
                },
                {
                        .index = HRTIMER_BASE_REALTIME,
                        .clockid = CLOCK_REALTIME,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME,
                        .clockid = CLOCK_BOOTTIME,
                },
                {
                        .index = HRTIMER_BASE_TAI,
                        .clockid = CLOCK_TAI,
                },
                /* Soft bases, same clockids as above */
                {
                        .index = HRTIMER_BASE_MONOTONIC_SOFT,
                        .clockid = CLOCK_MONOTONIC,
                },
                {
                        .index = HRTIMER_BASE_REALTIME_SOFT,
                        .clockid = CLOCK_REALTIME,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME_SOFT,
                        .clockid = CLOCK_BOOTTIME,
                },
                {
                        .index = HRTIMER_BASE_TAI_SOFT,
                        .clockid = CLOCK_TAI,
                },
        },
        /* Call-single data whose function is retrigger_next_event(NULL). */
        .csd = CSD_INIT(retrigger_next_event, NULL)
};

/*
 * Report whether @base is online. Without CONFIG_HOTPLUG_CPU the answer
 * is always true and base->online is not consulted.
 */
static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
{
        return !IS_ENABLED(CONFIG_HOTPLUG_CPU) || likely(base->online);
}

/*
 * Functions and macros which are different for UP/SMP systems are kept in a
 * single place
 */
#ifdef CONFIG_SMP

/*
 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
 * such that hrtimer_callback_running() can unconditionally dereference
 * timer->base->cpu_base
 */
/*
 * Only clock_base[0] is set up explicitly: its cpu_base points back at
 * migration_cpu_base itself and its seqcount is tied to
 * migration_cpu_base.lock.
 */
static struct hrtimer_cpu_base migration_cpu_base = {
        .clock_base = { {
                .cpu_base = &migration_cpu_base,
                .seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
                                                     &migration_cpu_base.lock),
        }, },
};

/* The clock base that timer->base points at while a timer is migrated. */
#define migration_base        migration_cpu_base.clock_base[0]

/*
 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
 * means that all timers which are tied to this base via timer->base are
 * locked, and the base itself is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found on the lists/queues.
 *
 * When the timer's base is locked, and the timer removed from list, it is
 * possible to set timer->base = &migration_base and drop the lock: the timer
 * remains locked.
 */
/*
 * Lock the cpu_base of the clock base @timer currently belongs to.
 *
 * @timer: timer whose base is to be locked
 * @flags: storage for the saved interrupt state
 *
 * Returns the clock base with its cpu_base->lock held and interrupts
 * disabled. Spins for as long as the timer is in migration or changes
 * base between the read and the lock acquisition.
 */
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
                                             unsigned long *flags)
        __acquires(&timer->base->lock)
{
        struct hrtimer_clock_base *base;

        for (;;) {
                base = READ_ONCE(timer->base);
                /* &migration_base means a switch is in progress: retry. */
                if (likely(base != &migration_base)) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        /* Recheck under the lock that the base is unchanged. */
                        if (likely(base == timer->base))
                                return base;
                        /* The timer has migrated to another CPU: */
                        raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
                }
                cpu_relax();
        }
}

/*
 * Check if the elected target is suitable considering its next
 * event and the hotplug state of the current CPU.
 *
 * If the elected target is remote and its next event is after the timer
 * to queue, then a remote reprogram is necessary. However there is no
 * guarantee the IPI handling the operation would arrive in time to meet
 * the high resolution deadline. In this case the local CPU becomes a
 * preferred target, unless it is offline.
 *
 * High and low resolution modes are handled the same way for simplicity.
 *
 * Called with cpu_base->lock of target cpu held.
 */
static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
                                    struct hrtimer_cpu_base *new_cpu_base,
                                    struct hrtimer_cpu_base *this_cpu_base)
{
        ktime_t deadline;

        /*
         * Two cases need no look at the expiry time:
         *
         *  - The target is the local CPU. Its clockevent can be
         *    reprogrammed, and get_target_base() guarantees it is online.
         *
         *  - The local CPU is offline. It can't serve as the fallback
         *    target, so the elected base is kept; an IPI will be issued
         *    to reprogram it as a last resort.
         */
        if (new_cpu_base == this_cpu_base ||
            !hrtimer_base_is_online(this_cpu_base))
                return true;

        /*
         * Remote target: suitable only when the timer does not expire
         * before the target's currently programmed next event.
         */
        deadline = ktime_sub(hrtimer_get_expires(timer), new_base->offset);

        return deadline >= new_base->cpu_base->expires_next;
}

/*
 * Elect the cpu base a timer should be queued on.
 *
 * An online @base is kept, unless timer migration is enabled and the
 * timer is not @pinned; then the base of get_nohz_timer_target() is
 * chosen. An offline @base is replaced by the base of a CPU picked from
 * the online CPUs in the HK_TYPE_TIMER housekeeping mask.
 */
static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
        int fallback_cpu;

        if (hrtimer_base_is_online(base)) {
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
                if (static_branch_likely(&timers_migration_enabled) && !pinned)
                        return &per_cpu(hrtimer_bases, get_nohz_timer_target());
#endif
                return base;
        }

        fallback_cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));

        return &per_cpu(hrtimer_bases, fallback_cpu);
}

/*
 * We switch the timer base to a power-optimized selected CPU target,
 * if:
 *        - NO_HZ_COMMON is enabled
 *        - timer migration is enabled
 *        - the timer callback is not running
 *        - the timer is not the first expiring timer on the new target
 *
 * If one of the above requirements is not fulfilled we move the timer
 * to the current CPU or leave it on the previously assigned CPU if
 * the timer callback is currently running.
 */
/*
 * @timer:  timer that is about to be enqueued
 * @base:   clock base the timer currently belongs to; its cpu_base->lock
 *          is released here when the timer is moved, so the caller must
 *          hold it
 * @pinned: non-zero to suppress the migration target selection in
 *          get_target_base()
 *
 * Returns the clock base the timer now belongs to. The cpu_base->lock of
 * the returned base is held on return.
 */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                    int pinned)
{
        struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
        struct hrtimer_clock_base *new_base;
        int basenum = base->index;

        this_cpu_base = this_cpu_ptr(&hrtimer_bases);
        new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
        /* Same clock index, but on the elected CPU. */
        new_base = &new_cpu_base->clock_base[basenum];

        if (base != new_base) {
                /*
                 * We are trying to move timer to new_base.
                 * However we can't change timer's base while it is running,
                 * so we keep it on the same CPU. No hassle vs. reprogramming
                 * the event source in the high resolution case. The softirq
                 * code will take care of this when the timer function has
                 * completed. There is no conflict as we hold the lock until
                 * the timer is enqueued.
                 */
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;

                /* See the comment in lock_hrtimer_base() */
                WRITE_ONCE(timer->base, &migration_base);
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);

                if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
                                             this_cpu_base)) {
                        /*
                         * Undo the switch: retake the old lock, restore
                         * timer->base and retry with the local CPU base
                         * as target.
                         */
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
                        new_cpu_base = this_cpu_base;
                        WRITE_ONCE(timer->base, base);
                        goto again;
                }
                WRITE_ONCE(timer->base, new_base);
        } else {
                /*
                 * The timer already sits on the elected base, so no lock
                 * hand-over is needed. If that base is unsuitable, retry
                 * with the local CPU base as target.
                 */
                if (!hrtimer_suitable_target(timer, new_base,  new_cpu_base, this_cpu_base)) {
                        new_cpu_base = this_cpu_base;
                        goto again;
                }
        }
        return new_base;
}

#else /* CONFIG_SMP */

/*
 * Lock the cpu_base of @timer's clock base, saving the interrupt state
 * in @flags, and return that clock base. This variant reads timer->base
 * once and takes the lock without any retry.
 */
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        __acquires(&timer->base->cpu_base->lock)
{
        struct hrtimer_clock_base *clock_base = timer->base;

        raw_spin_lock_irqsave(&clock_base->cpu_base->lock, *flags);
        return clock_base;
}

/* In this configuration the base is never switched: evaluates to (b). */
# define switch_hrtimer_base(t, b, p)        (b)

#endif        /* !CONFIG_SMP */

/*
 * Functions for the union type storage format of ktime_t which are
 * too large for inlining:
 */
#if BITS_PER_LONG < 64
/*
 * Divide a ktime value by a nanosecond value
 */
s64 __ktime_divns(const ktime_t kt, s64 div)
{
        const s64 ns = ktime_to_ns(kt);
        u64 magnitude = ns < 0 ? -ns : ns;
        int shift;

        /*
         * do_div() is called with a 32-bit divisor. Halve the divisor
         * until it fits, then scale the dividend down by the same number
         * of bits.
         */
        for (shift = 0; div >> 32; div >>= 1)
                shift++;

        magnitude >>= shift;
        do_div(magnitude, (u32) div);

        /* Put the sign of the dividend back on the quotient. */
        return ns < 0 ? -magnitude : magnitude;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG < 64 */

/*
 * Add two ktime values and do a safety check for overflow:
 */
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
        const ktime_t sum = ktime_add_unsafe(lhs, rhs);

        /* A sum that is non-negative and below neither operand is kept. */
        if (sum >= 0 && sum >= lhs && sum >= rhs)
                return sum;

        /*
         * Otherwise clamp to KTIME_SEC_MAX, the maximum timeout which we
         * can return to user space in a timespec.
         */
        return ktime_set(KTIME_SEC_MAX, 0);
}

EXPORT_SYMBOL_GPL(ktime_add_safe);

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

static const struct debug_obj_descr hrtimer_debug_descr;

/*
 * debug_hint callback of hrtimer_debug_descr: returns the callback
 * function stored in the hrtimer at @addr.
 */
static void *hrtimer_debug_hint(void *addr)
{
        return ACCESS_PRIVATE((struct hrtimer *)addr, function);
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        /* Only an object that is still active can be fixed up here. */
        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Cancel the timer first, then redo the init tracking. */
        hrtimer_cancel(timer);
        debug_object_init(timer, &hrtimer_debug_descr);
        return true;
}

/*
 * fixup_activate callback, invoked when:
 * - an active object is activated
 * - an unknown non-static object is activated
 *
 * Nothing is repaired here: activating an already active object only
 * triggers a warning. The return value is always false.
 */
static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
        WARN_ON(state == ODEBUG_STATE_ACTIVE);

        return false;
}

/*
 * fixup_free callback, invoked when an active object is freed.
 *
 * An active timer is cancelled and its debug object released; in that
 * case true is returned. Any other state is left alone and false is
 * returned.
 */
static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        hrtimer_cancel(timer);
        debug_object_free(timer, &hrtimer_debug_descr);

        return true;
}

/*
 * Debug object descriptor for hrtimers: wires the hint and fixup
 * callbacks defined above to the "hrtimer" object type.
 */
static const struct debug_obj_descr hrtimer_debug_descr = {
        .name                = "hrtimer",
        .debug_hint        = hrtimer_debug_hint,
        .fixup_init        = hrtimer_fixup_init,
        .fixup_activate        = hrtimer_fixup_activate,
        .fixup_free        = hrtimer_fixup_free,
};

/* Register @timer with the debug objects code via debug_object_init(). */
static inline void debug_hrtimer_init(struct hrtimer *timer)
{
        debug_object_init(timer, &hrtimer_debug_descr);
}

/* Same as debug_hrtimer_init(), but uses debug_object_init_on_stack(). */
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
{
        debug_object_init_on_stack(timer, &hrtimer_debug_descr);
}

/* Report activation of @timer to the debug objects code; @mode is not used here. */
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode)
{
        debug_object_activate(timer, &hrtimer_debug_descr);
}

/* Report deactivation of @timer to the debug objects code. */
static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
{
        debug_object_deactivate(timer, &hrtimer_debug_descr);
}

/*
 * destroy_hrtimer_on_stack - release the debug object of @timer
 * @timer:        the timer whose debug object is freed
 *
 * Only calls debug_object_free(); the timer itself is not touched here.
 */
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
        debug_object_free(timer, &hrtimer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);

#else

/* Empty stand-ins used when CONFIG_DEBUG_OBJECTS_TIMERS is not set. */
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif

/* Debug-object initialization of @timer followed by the trace_hrtimer_setup() call. */
static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
{
        debug_hrtimer_init(timer);
        trace_hrtimer_setup(timer, clockid, mode);
}

/* Like debug_setup(), but uses the on-stack debug-object initializer. */
static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid,
                                        enum hrtimer_mode mode)
{
        debug_hrtimer_init_on_stack(timer);
        trace_hrtimer_setup(timer, clockid, mode);
}

/* Debug-object activation of @timer followed by the trace_hrtimer_start() call. */
static inline void debug_activate(struct hrtimer *timer,
                                  enum hrtimer_mode mode)
{
        debug_hrtimer_activate(timer, mode);
        trace_hrtimer_start(timer, mode);
}

/* Debug-object deactivation of @timer followed by the trace_hrtimer_cancel() call. */
static inline void debug_deactivate(struct hrtimer *timer)
{
        debug_hrtimer_deactivate(timer);
        trace_hrtimer_cancel(timer);
}

/*
 * Pop the lowest set bit from the bitmask at @active and return the
 * clock base of @cpu_base with that index. Returns NULL once the mask
 * is empty.
 */
static struct hrtimer_clock_base *
__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
        unsigned int pending = *active;
        unsigned int idx;

        if (pending == 0)
                return NULL;

        idx = __ffs(pending);
        /* Consume the bit so the next call moves on to the next base */
        *active = pending & ~(1U << idx);

        return &cpu_base->clock_base[idx];
}

/*
 * Iterate @base over the clock bases of @cpu_base whose bits are set in
 * @active. @active must be an lvalue: __next_base() clears one bit per
 * iteration, so the mask is zero after a complete walk.
 */
#define for_each_active_base(base, cpu_base, active)        \
        while ((base = __next_base((cpu_base), &(active))))

/*
 * Scan the clock bases selected by @active and return the earliest
 * expiry (with the base offset subtracted) which is earlier than
 * @expires_next, or @expires_next itself if none is.
 *
 * @cpu_base:        per CPU base whose clock bases are scanned
 * @exclude:        timer to skip, or NULL. If the first timer of a base is
 *                @exclude, the following timer of that queue is used.
 * @active:        bitmask of the clock bases to look at
 * @expires_next: start value for the minimum search
 *
 * When @exclude is NULL, cpu_base->softirq_next_timer or
 * cpu_base->next_timer is set to each timer which lowers the minimum,
 * depending on timer->is_soft. The returned value is never negative.
 */
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
                                         const struct hrtimer *exclude,
                                         unsigned int active,
                                         ktime_t expires_next)
{
        struct hrtimer_clock_base *base;
        ktime_t expires;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;
                struct hrtimer *timer;

                next = timerqueue_getnext(&base->active);
                timer = container_of(next, struct hrtimer, node);
                if (timer == exclude) {
                        /* Get to the next timer in the queue. */
                        next = timerqueue_iterate_next(next);
                        if (!next)
                                continue;

                        timer = container_of(next, struct hrtimer, node);
                }
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                if (expires < expires_next) {
                        expires_next = expires;

                        /* Skip cpu_base update if a timer is being excluded. */
                        if (exclude)
                                continue;

                        if (timer->is_soft)
                                cpu_base->softirq_next_timer = timer;
                        else
                                cpu_base->next_timer = timer;
                }
        }
        /*
         * clock_was_set() might have changed base->offset of any of
         * the clock bases so the result might be negative. Fix it up
         * to prevent a false positive in clockevents_program_event().
         */
        if (expires_next < 0)
                expires_next = 0;
        return expires_next;
}

/*
 * Recomputes cpu_base::*next_timer and returns the earliest expires_next
 * but does not set cpu_base::*expires_next, that is done by
 * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
 * cpu_base::*expires_next right away, reprogramming logic would no longer
 * work.
 *
 * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
 * those timers will get run whenever the softirq gets handled, at the end of
 * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
 *
 * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
 * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
 * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
 *
 * @active_mask must be one of:
 *  - HRTIMER_ACTIVE_ALL,
 *  - HRTIMER_ACTIVE_SOFT, or
 *  - HRTIMER_ACTIVE_HARD.
 */
static ktime_t
__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
        unsigned int active;
        struct hrtimer *next_timer = NULL;
        ktime_t expires_next = KTIME_MAX;

        /* Soft bases are only scanned while the softirq is not activated. */
        if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
                active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                /* Reset first, the scan only sets it when it finds a timer. */
                cpu_base->softirq_next_timer = NULL;
                expires_next = __hrtimer_next_event_base(cpu_base, NULL,
                                                         active, KTIME_MAX);

                next_timer = cpu_base->softirq_next_timer;
        }

        if (active_mask & HRTIMER_ACTIVE_HARD) {
                active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
                /*
                 * Seed next_timer with the soft candidate (or NULL). The
                 * scan below replaces it only if a hard timer expires
                 * earlier than the soft expiry found above.
                 */
                cpu_base->next_timer = next_timer;
                expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
                                                         expires_next);
        }

        return expires_next;
}

/*
 * Recompute the next expiry over the soft and the hard clock bases of
 * @cpu_base and return the earlier of the two.
 *
 * Side effects: cpu_base->softirq_expires_next is refreshed unless the
 * softirq is already activated, and cpu_base->next_timer ends up pointing
 * to the soft timer when that one expires first. cpu_base->expires_next
 * is not written here.
 */
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
        ktime_t expires_next, soft = KTIME_MAX;

        /*
         * If the soft interrupt has already been activated, ignore the
         * soft bases. They will be handled in the already raised soft
         * interrupt.
         */
        if (!cpu_base->softirq_activated) {
                soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
                /*
                 * Update the soft expiry time. clock_settime() might have
                 * affected it.
                 */
                cpu_base->softirq_expires_next = soft;
        }

        expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
        /*
         * If a softirq timer is expiring first, update cpu_base->next_timer
         * and program the hardware with the soft expiry time.
         */
        if (expires_next > soft) {
                cpu_base->next_timer = cpu_base->softirq_next_timer;
                expires_next = soft;
        }

        return expires_next;
}

/*
 * Refresh the REALTIME, BOOTTIME and TAI offsets of @base through
 * ktime_get_update_offsets_now() and mirror them into the matching
 * *_SOFT clock bases. Returns the time reported by that call.
 */
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
        struct hrtimer_clock_base *bases = base->clock_base;
        ktime_t now;

        now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
                                           &bases[HRTIMER_BASE_REALTIME].offset,
                                           &bases[HRTIMER_BASE_BOOTTIME].offset,
                                           &bases[HRTIMER_BASE_TAI].offset);

        /* The soft bases share the offsets of their hard counterparts */
        bases[HRTIMER_BASE_REALTIME_SOFT].offset = bases[HRTIMER_BASE_REALTIME].offset;
        bases[HRTIMER_BASE_BOOTTIME_SOFT].offset = bases[HRTIMER_BASE_BOOTTIME].offset;
        bases[HRTIMER_BASE_TAI_SOFT].offset = bases[HRTIMER_BASE_TAI].offset;

        return now;
}

/*
 * Is the high resolution mode active on @cpu_base?
 *
 * Always 0 when CONFIG_HIGH_RES_TIMERS is not enabled.
 */
static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
        if (!IS_ENABLED(CONFIG_HIGH_RES_TIMERS))
                return 0;

        return cpu_base->hres_active;
}

/*
 * Record @expires_next in @cpu_base and, if appropriate, program the
 * tick device for it. @next_timer is not used by this function.
 */
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
                                struct hrtimer *next_timer,
                                ktime_t expires_next)
{
        cpu_base->expires_next = expires_next;

        /*
         * The hardware is only touched when high resolution mode is
         * active and no hang was detected in the last timer interrupt.
         *
         * Without hres there is nothing to reprogram yet.
         *
         * After a detected hang the hang delay stays armed in the
         * hardware so the system can make progress. This also avoids
         * the following scenario:
         * T1 expires 50ms from now
         * T2 expires 5s from now
         *
         * T1 gets removed, this code runs and would reprogram the
         * hardware to 5s from now. Any later hrtimer_start would not
         * reprogram the hardware because hang_detected is set, so all
         * timers would effectively be blocked until the T2 event fires.
         */
        if (hrtimer_hres_active(cpu_base) && !cpu_base->hang_detected)
                tick_program_event(expires_next, 1);
}

/*
 * Re-evaluate the next event over both queues and reprogram the event
 * source for it.
 *
 * With @skip_equal set, nothing is reprogrammed when the recomputed
 * expiry equals the cached cpu_base->expires_next.
 *
 * Called with interrupts disabled and base->lock held
 */
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
        ktime_t next = hrtimer_update_next_event(cpu_base);

        if (!skip_equal || next != cpu_base->expires_next)
                __hrtimer_reprogram(cpu_base, cpu_base->next_timer, next);
}

/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer enabled ?
 */
/* Defaults to true; may be overridden by the "highres=" boot parameter handled below. */
static bool hrtimer_hres_enabled __read_mostly  = true;
/*
 * Starts out as LOW_RES_NSEC and is set to HIGH_RES_NSEC by
 * hrtimer_switch_to_hres(). Presumably in nanoseconds, going by the
 * names of these constants.
 */
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);

/*
 * "highres=" boot parameter: enable / disable high resolution mode.
 *
 * The argument is parsed by kstrtobool() into hrtimer_hres_enabled.
 * Returns 1 when parsing succeeded and 0 otherwise.
 */
static int __init setup_hrtimer_hres(char *str)
{
        int err = kstrtobool(str, &hrtimer_hres_enabled);

        return !err;
}

__setup("highres=", setup_hrtimer_hres);

/*
 * hrtimer_is_hres_enabled - query, if the highres mode is enabled
 *
 * Returns the current value of hrtimer_hres_enabled.
 */
static inline int hrtimer_is_hres_enabled(void)
{
        return hrtimer_hres_enabled;
}

/*
 * Switch to high resolution mode
 *
 * Operates on the hrtimer base of the current CPU. If tick_init_highres()
 * reports a failure, a warning is printed and nothing else is changed.
 */
static void hrtimer_switch_to_hres(void)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

        if (tick_init_highres()) {
                pr_warn("Could not switch to high resolution mode on CPU %u\n",
                        base->cpu);
                return;
        }
        /* From here on hrtimer_hres_active() reports this base as active. */
        base->hres_active = 1;
        hrtimer_resolution = HIGH_RES_NSEC;

        tick_setup_sched_timer(true);
        /* "Retrigger" the interrupt to get things going */
        retrigger_next_event(NULL);
}

#else

/* Variants for !CONFIG_HIGH_RES_TIMERS: highres is never enabled and switching is a no-op. */
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }

#endif /* CONFIG_HIGH_RES_TIMERS */
/*
 * Retrigger next event is called after clock was set with interrupts
 * disabled through an SMP function call or directly from low level
 * resume code.
 *
 * This is only invoked when:
 *        - CONFIG_HIGH_RES_TIMERS is enabled.
 *        - CONFIG_NOHZ_COMMON is enabled
 *
 * For the other cases this function is empty and because the call sites
 * are optimized out it vanishes as well, i.e. no need for lots of
 * #ifdeffery.
 */
/* @arg is unused; all callers visible in this file pass NULL. */
static void retrigger_next_event(void *arg)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

        /*
         * When high resolution mode or nohz is active, then the offsets of
         * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
         * next tick will take care of that.
         *
         * If high resolution mode is active then the next expiring timer
         * must be reevaluated and the clock event device reprogrammed if
         * necessary.
         *
         * In the NOHZ case the update of the offset and the reevaluation
         * of the next expiring timer is enough. The return from the SMP
         * function call will take care of the reprogramming in case the
         * CPU was in a NOHZ idle sleep.
         *
         * In periodic low resolution mode, the next softirq expiration
         * must also be updated.
         */
        raw_spin_lock(&base->lock);
        /* Offsets first, so the evaluation below uses the new values. */
        hrtimer_update_base(base);
        if (hrtimer_hres_active(base))
                hrtimer_force_reprogram(base, 0);
        else
                hrtimer_update_next_event(base);
        raw_spin_unlock(&base->lock);
}

/*
 * When a timer is enqueued and expires earlier than the already enqueued
 * timers, we have to check, whether it expires earlier than the timer for
 * which the clock event device was armed.
 *
 * @timer:        the timer which was enqueued
 * @reprogram:        only evaluated for soft timers: when false, the soft
 *                expiry bookkeeping is updated but the clock event device
 *                is left alone
 *
 * Called with interrupts disabled and base->cpu_base.lock held
 */
static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        struct hrtimer_clock_base *base = timer->base;
        ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);

        WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

        /*
         * CLOCK_REALTIME timer might be requested with an absolute
         * expiry time which is less than base->offset. Set it to 0.
         */
        if (expires < 0)
                expires = 0;

        if (timer->is_soft) {
                /*
                 * soft hrtimer could be started on a remote CPU. In this
                 * case softirq_expires_next needs to be updated on the
                 * remote CPU. The soft hrtimer will not expire before the
                 * first hard hrtimer on the remote CPU -
                 * hrtimer_check_target() prevents this case.
                 */
                struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;

                if (timer_cpu_base->softirq_activated)
                        return;

                if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
                        return;

                timer_cpu_base->softirq_next_timer = timer;
                timer_cpu_base->softirq_expires_next = expires;

                /*
                 * Go on to the hardware part only if the soft timer is
                 * also earlier than the armed event and the caller asked
                 * for reprogramming.
                 */
                if (!ktime_before(expires, timer_cpu_base->expires_next) ||
                    !reprogram)
                        return;
        }

        /*
         * If the timer is not on the current cpu, we cannot reprogram
         * the other cpus clock event device.
         */
        if (base->cpu_base != cpu_base)
                return;

        if (expires >= cpu_base->expires_next)
                return;

        /*
         * If the hrtimer interrupt is running, then it will reevaluate the
         * clock bases and reprogram the clock event device.
         */
        if (cpu_base->in_hrtirq)
                return;

        cpu_base->next_timer = timer;

        __hrtimer_reprogram(cpu_base, timer, expires);
}

/*
 * Update the clock offsets of @cpu_base and decide whether the owning
 * CPU has to be interrupted to reevaluate its timers.
 *
 * @cpu_base:        the per CPU base to check; clock_was_set() calls this
 *                with cpu_base->lock held
 * @active:        bitmask of the clock bases to check
 *
 * Returns true if the first timer of one of the checked bases now
 * expires before cpu_base->expires_next or, for softirq bases while the
 * softirq is not activated, before cpu_base->softirq_expires_next.
 */
static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
                             unsigned int active)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;
        ktime_t expires;

        /*
         * Update the base offsets unconditionally so that the following
         * check whether the SMP function call is required works.
         *
         * The update is safe even when the remote CPU is in the hrtimer
         * interrupt or the hrtimer soft interrupt and expiring affected
         * bases. Either it will see the update before handling a base or
         * it will see it when it finishes the processing and reevaluates
         * the next expiring timer.
         */
        seq = cpu_base->clock_was_set_seq;
        hrtimer_update_base(cpu_base);

        /*
         * If the sequence did not change over the update then the
         * remote CPU already handled it.
         */
        if (seq == cpu_base->clock_was_set_seq)
                return false;

        /*
         * If the remote CPU is currently handling an hrtimer interrupt, it
         * will reevaluate the first expiring timer of all clock bases
         * before reprogramming. Nothing to do here.
         */
        if (cpu_base->in_hrtirq)
                return false;

        /*
         * Walk the affected clock bases and check whether the first expiring
         * timer in a clock base is moving ahead of the first expiring timer of
         * @cpu_base. If so, the IPI must be invoked because per CPU clock
         * event devices cannot be remotely reprogrammed.
         */
        active &= cpu_base->active_bases;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;

                next = timerqueue_getnext(&base->active);
                expires = ktime_sub(next->expires, base->offset);
                if (expires < cpu_base->expires_next)
                        return true;

                /* Extra check for softirq clock bases */
                if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT)
                        continue;
                if (cpu_base->softirq_activated)
                        continue;
                if (expires < cpu_base->softirq_expires_next)
                        return true;
        }
        return false;
}

/*
 * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
 * CLOCK_BOOTTIME (for late sleep time injection).
 *
 * This requires to update the offsets for these clocks
 * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
 * also requires to eventually reprogram the per CPU clock event devices
 * when the change moves an affected timer ahead of the first expiring
 * timer on that CPU. Obviously remote per CPU clock event devices cannot
 * be reprogrammed. The other reason why an IPI has to be sent is when the
 * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
 * in the tick, which obviously might be stopped, so this has to bring out
 * the remote CPU which might sleep in idle to get this sorted.
 */
void clock_was_set(unsigned int bases)
{
        struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
        cpumask_var_t mask;
        int cpu;

        /*
         * With neither high resolution mode active on this CPU's base nor
         * NOHZ active, only the timerfd notification below is done.
         */
        if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active)
                goto out_timerfd;

        /*
         * Without a cpumask the per CPU filtering below cannot be done.
         * Fall back to running retrigger_next_event() on every CPU.
         */
        if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
                on_each_cpu(retrigger_next_event, NULL, 1);
                goto out_timerfd;
        }

        /* Avoid interrupting CPUs if possible */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                unsigned long flags;

                cpu_base = &per_cpu(hrtimer_bases, cpu);
                raw_spin_lock_irqsave(&cpu_base->lock, flags);

                /* Collect only the CPUs which need to reevaluate their timers. */
                if (update_needs_ipi(cpu_base, bases))
                        cpumask_set_cpu(cpu, mask);

                raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        }

        /* The cross call is issued with preemption disabled. */
        preempt_disable();
        smp_call_function_many(mask, retrigger_next_event, NULL, 1);
        preempt_enable();
        cpus_read_unlock();
        free_cpumask_var(mask);

out_timerfd:
        timerfd_clock_was_set();
}

/* Work callback: runs clock_was_set() for CLOCK_SET_WALL; @work is unused. */
static void clock_was_set_work(struct work_struct *work)
{
        clock_was_set(CLOCK_SET_WALL);
}

/* Work item scheduled by clock_was_set_delayed(). */
static DECLARE_WORK(hrtimer_work, clock_was_set_work);

/*
 * Called from timekeeping code to reprogram the hrtimer interrupt device
 * on all cpus and to notify timerfd.
 *
 * The work is not done in the caller's context: hrtimer_work is handed
 * to schedule_work() and clock_was_set() runs from the work callback.
 */
void clock_was_set_delayed(void)
{
        schedule_work(&hrtimer_work);
}

/*
 * Called during resume either directly via timekeeping_resume()
 * or in the case of s2idle from tick_unfreeze() to ensure that the
 * hrtimers are up to date.
 *
 * Must be called with interrupts disabled, which is asserted via lockdep.
 */
void hrtimers_resume_local(void)
{
        lockdep_assert_irqs_disabled();
        /* Retrigger on the local CPU */
        retrigger_next_event(NULL);
}

/*
 * Counterpart to lock_hrtimer_base above:
 *
 * Drops the cpu_base lock of the base @timer currently points to and
 * restores the interrupt state saved in @flags.
 */
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        __releases(&timer->base->cpu_base->lock)
{
        raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}

/**
 * hrtimer_forward() - forward the timer expiry
 * @timer:        hrtimer to forward
 * @now:        forward past this time
 * @interval:        the interval to forward
 *
 * Moves the expiry of @timer forward in steps of @interval until it lies
 * in the future relative to @now. An @interval below hrtimer_resolution
 * is raised to hrtimer_resolution.
 *
 * .. note::
 *  Only the expiry value is updated, the timer is not requeued.
 *
 * See also the hrtimer_forward_now() variant.
 *
 * Context: Can be safely called from the callback function of @timer. If called
 *          from other contexts @timer must neither be enqueued nor running the
 *          callback and the caller needs to take care of serialization.
 *
 * Return: The number of overruns. 0 if the timer has not expired yet
 *         relative to @now or if it is still enqueued (which also warns).
 */
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
        ktime_t delta = ktime_sub(now, hrtimer_get_expires(timer));
        u64 orun;

        /* Expiry is still ahead of @now: nothing to forward */
        if (delta < 0)
                return 0;

        if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
                return 0;

        if (interval < hrtimer_resolution)
                interval = hrtimer_resolution;

        if (unlikely(delta >= interval)) {
                s64 incr = ktime_to_ns(interval);

                /* Skip all whole intervals which were missed in one go */
                orun = ktime_divns(delta, incr);
                hrtimer_add_expires_ns(timer, incr * orun);
                if (hrtimer_get_expires_tv64(timer) > now)
                        return orun;

                /*
                 * Still not past @now: one more interval is added
                 * and accounted as an additional overrun.
                 */
                hrtimer_add_expires(timer, interval);
                return orun + 1;
        }

        /* Less than one interval late: a single step is sufficient */
        hrtimer_add_expires(timer, interval);

        return 1;
}
EXPORT_SYMBOL_GPL(hrtimer_forward);

/*
 * enqueue_hrtimer - internal function to (re)start a timer
 *
 * @timer:        the timer to enqueue
 * @base:        the clock base into whose queue the timer is inserted
 * @mode:        timer mode, only handed to debug_activate()
 *
 * The timer is inserted in expiry order. Insertion into the
 * red black tree is O(log(n)). Must hold the base lock.
 *
 * Returns true when the new timer is the leftmost timer in the tree.
 */
static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
                            enum hrtimer_mode mode)
{
        debug_activate(timer, mode);
        /* Warn once if the cpu base is not marked online; the timer is queued anyway. */
        WARN_ON_ONCE(!base->cpu_base->online);

        /* Mark this clock base as having queued timers. */
        base->cpu_base->active_bases |= 1 << base->index;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);

        return timerqueue_add(&base->active, &timer->node);
}

/*
 * __remove_hrtimer - internal function to remove a timer
 *
 * @timer:        the timer to remove
 * @base:        the clock base the timer is queued on
 * @newstate:        state stored in timer->state, whether it was queued or not
 * @reprogram:        nonzero to reprogram when @timer is cpu_base->next_timer
 *
 * Caller must hold the base lock.
 *
 * High resolution timer mode reprograms the clock event device when the
 * timer is the one which expires next. The caller can disable this by setting
 * reprogram to zero. This is useful, when the context does a reprogramming
 * anyway (e.g. timer interrupt)
 */
static void __remove_hrtimer(struct hrtimer *timer,
                             struct hrtimer_clock_base *base,
                             u8 newstate, int reprogram)
{
        struct hrtimer_cpu_base *cpu_base = base->cpu_base;
        u8 state = timer->state;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, newstate);
        if (!(state & HRTIMER_STATE_ENQUEUED))
                return;

        /* A false return from timerqueue_del() clears the base's active bit. */
        if (!timerqueue_del(&base->active, &timer->node))
                cpu_base->active_bases &= ~(1 << base->index);

        /*
         * Note: If reprogram is false we do not update
         * cpu_base->next_timer. This happens when we remove the first
         * timer on a remote cpu. No harm as we never dereference
         * cpu_base->next_timer. So the worst thing that can happen is
         * a superfluous call to hrtimer_force_reprogram() on the
         * remote cpu later on if the same timer gets enqueued again.
         */
        if (reprogram && timer == cpu_base->next_timer)
                hrtimer_force_reprogram(cpu_base, 1);
}

/*
 * remove_hrtimer - dequeue @timer if it is enqueued
 *
 * @timer:        the timer to remove
 * @base:        the clock base of @timer
 * @restart:        the timer is about to be queued again, keep its state
 * @keep_local:        with @restart, skip reprogramming on removal
 *
 * Called with the base lock held.
 *
 * Returns 1 if the timer was enqueued and has been removed, 0 otherwise.
 */
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
               bool restart, bool keep_local)
{
        u8 state = timer->state;
        bool reprogram;

        if (!(state & HRTIMER_STATE_ENQUEUED))
                return 0;

        /*
         * Reprogramming is only considered when the timer sits on the
         * current CPU. Removing a timer on another CPU skips it: the
         * interrupt event on that CPU fires and the reprogramming
         * happens in the interrupt handler. That is a rare case and
         * cheaper than a smp call.
         */
        debug_deactivate(timer);
        reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);

        /*
         * A timer which is about to be restarted keeps its state and,
         * if requested by @keep_local, is not programmed twice (once
         * on removal and once a moment later when it is requeued).
         * Otherwise the timer becomes inactive.
         */
        if (restart)
                reprogram &= !keep_local;
        else
                state = HRTIMER_STATE_INACTIVE;

        __remove_hrtimer(timer, base, state, reprogram);

        return 1;
}

/*
 * Adjust the expiry @tim of @timer for CONFIG_TIME_LOW_RES systems.
 *
 * Without CONFIG_TIME_LOW_RES @tim is returned unchanged and @timer is
 * not touched.
 */
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
                                            const enum hrtimer_mode mode)
{
#ifdef CONFIG_TIME_LOW_RES
        /*
         * CONFIG_TIME_LOW_RES means the system cannot return granular
         * time values. Relative timers get hrtimer_resolution (i.e. one
         * jiffy) added on top to prevent short timeouts.
         */
        timer->is_rel = mode & HRTIMER_MODE_REL;
        if (!timer->is_rel)
                return tim;

        return ktime_add_safe(tim, hrtimer_resolution);
#else
        return tim;
#endif
}

/*
 * Re-evaluate the next soft expiry of @cpu_base and feed the resulting
 * soft timer to hrtimer_reprogram().
 */
static void
hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
        /*
         * Look up the next SOFT expiration. As a side effect
         * __hrtimer_get_next_event() recomputes cpu_base->*next_timer.
         */
        ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);

        /*
         * cpu_base->*expires_next is only set by hrtimer_reprogram().
         * It has to be invoked even if the next soft hrtimer expires at
         * the same time as the next hard hrtimer, because
         * cpu_base->softirq_expires_next needs to be updated. Only a
         * KTIME_MAX result leaves nothing to do.
         */
        if (expires != KTIME_MAX)
                hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}

/*
 * Dequeue @timer if necessary, set its new expiry and enqueue it again.
 *
 * @timer:        the timer to (re)start
 * @tim:        expiry time; for HRTIMER_MODE_REL the current time of the
 *                base clock is added
 * @delta_ns:        "slack" range handed to hrtimer_set_expires_range_ns()
 * @mode:        timer mode
 * @base:        the clock base @timer is currently on
 *
 * Returns nonzero only when the timer was not forced to stay local, the
 * current CPU base is online and the timer became the first one in its
 * queue. In all other cases any required reprogramming or remote kick is
 * done here and 0 is returned.
 */
static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                    u64 delta_ns, const enum hrtimer_mode mode,
                                    struct hrtimer_clock_base *base)
{
        struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
        struct hrtimer_clock_base *new_base;
        bool force_local, first;

        /*
         * If the timer is on the local cpu base and is the first expiring
         * timer then this might end up reprogramming the hardware twice
         * (on removal and on enqueue). To avoid that, prevent the
         * reprogram on removal, keep the timer local to the current CPU
         * and enforce reprogramming after it is queued no matter whether
         * it is the new first expiring timer again or not.
         */
        force_local = base->cpu_base == this_cpu_base;
        force_local &= base->cpu_base->next_timer == timer;

        /*
         * Don't force local queuing if this enqueue happens on an unplugged
         * CPU after hrtimer_cpu_dying() has been invoked.
         */
        force_local &= this_cpu_base->online;

        /*
         * Remove an active timer from the queue. In case it is not queued
         * on the current CPU, make sure that remove_hrtimer() updates the
         * remote data correctly.
         *
         * If it's on the current CPU and the first expiring timer, then
         * skip reprogramming, keep the timer local and enforce
         * reprogramming later if it was the first expiring timer.  This
         * avoids programming the underlying clock event twice (once at
         * removal and once after enqueue).
         */
        remove_hrtimer(timer, base, true, force_local);

        /* Relative expiry: convert to absolute using the base clock. */
        if (mode & HRTIMER_MODE_REL)
                tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));

        tim = hrtimer_update_lowres(timer, tim, mode);

        hrtimer_set_expires_range_ns(timer, tim, delta_ns);

        /* Switch the timer base, if necessary: */
        if (!force_local) {
                new_base = switch_hrtimer_base(timer, base,
                                               mode & HRTIMER_MODE_PINNED);
        } else {
                new_base = base;
        }

        first = enqueue_hrtimer(timer, new_base, mode);
        if (!force_local) {
                /*
                 * If the current CPU base is online, then the timer is
                 * never queued on a remote CPU if it would be the first
                 * expiring timer there.
                 */
                if (hrtimer_base_is_online(this_cpu_base))
                        return first;

                /*
                 * Timer was enqueued remote because the current base is
                 * already offline. If the timer is the first to expire,
                 * kick the remote CPU to reprogram the clock event.
                 */
                if (first) {
                        struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;

                        smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
                }
                return 0;
        }

        /*
         * Timer was forced to stay on the current CPU to avoid
         * reprogramming on removal and enqueue. Force reprogram the
         * hardware by evaluating the new first expiring timer.
         */
        hrtimer_force_reprogram(new_base->cpu_base, 1);
        return 0;
}

/**
 * hrtimer_start_range_ns - (re)start an hrtimer
 * @timer:        the timer to be added
 * @tim:        expiry time
 * @delta_ns:        "slack" range for the timer
 * @mode:        timer mode: absolute (HRTIMER_MODE_ABS) or
 *                relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *                softirq based mode is considered for debug purpose only!
 *
 * The timer base is locked for the duration of the (re)start. If the
 * internal start function asks for it, the clock event device is
 * reprogrammed before the lock is dropped.
 */
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                            u64 delta_ns, const enum hrtimer_mode mode)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;
        int need_reprogram;

        /*
         * Sanity check of @mode against the timer's setup. On
         * CONFIG_PREEMPT_RT the hard expiry mode is checked because
         * unmarked timers are moved to softirq expiry there. Otherwise
         * the HRTIMER_MODE_SOFT bit has to match hrtimer.is_soft.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
        else
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);

        base = lock_hrtimer_base(timer, &flags);

        need_reprogram = __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base);
        if (need_reprogram)
                hrtimer_reprogram(timer, true);

        unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);

/**
 * hrtimer_try_to_cancel - try to deactivate a timer
 * @timer:        hrtimer to stop
 *
 * Does not wait for a running callback; see hrtimer_cancel() for that.
 *
 * Returns:
 *
 *  *  0 when the timer was not active
 *  *  1 when the timer was active
 *  * -1 when the timer is currently executing the callback function and
 *    cannot be stopped
 */
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
        struct hrtimer_clock_base *locked_base;
        unsigned long irqflags;
        int result;

        /*
         * Lockless fast path: a timer which is neither enqueued nor
         * running its callback needs no work. The base lock does not
         * serialize against a concurrent enqueue, so taking it here
         * would not buy anything.
         */
        if (!hrtimer_active(timer))
                return 0;

        locked_base = lock_hrtimer_base(timer, &irqflags);
        if (hrtimer_callback_running(timer))
                result = -1;
        else
                result = remove_hrtimer(timer, locked_base, false, false);
        unlock_hrtimer_base(timer, &irqflags);

        return result;
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT: initialize the softirq expiry lock of @base. */
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
        spin_lock_init(&base->softirq_expiry_lock);
}

/*
 * PREEMPT_RT: acquire the softirq expiry lock of @base. hrtimer_run_softirq()
 * takes it before cpu_base->lock and keeps it across the soft timer run.
 */
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
        __acquires(&base->softirq_expiry_lock)
{
        spin_lock(&base->softirq_expiry_lock);
}

/* PREEMPT_RT: release the softirq expiry lock of @base. */
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
        __releases(&base->softirq_expiry_lock)
{
        spin_unlock(&base->softirq_expiry_lock);
}

/*
 * The counterpart to hrtimer_cancel_wait_running().
 *
 * If there is a waiter for cpu_base->softirq_expiry_lock, then it was
 * waiting for the timer callback to finish. Drop the expiry lock and
 * reacquire it. That allows the waiter to acquire the lock and make
 * progress.
 *
 * cpu_base->lock is released (restoring @flags) before the expiry lock is
 * cycled and is taken again with raw_spin_lock_irq() afterwards, so the
 * function returns with both locks held again.
 */
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
                                      unsigned long flags)
{
        /* timer_waiters is elevated by hrtimer_cancel_wait_running() while it waits */
        if (atomic_read(&cpu_base->timer_waiters)) {
                raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
                spin_unlock(&cpu_base->softirq_expiry_lock);
                spin_lock(&cpu_base->softirq_expiry_lock);
                raw_spin_lock_irq(&cpu_base->lock);
        }
}

/*
 * Tell whether @base is the migration_base placeholder. Without CONFIG_SMP
 * the answer is always false.
 */
#ifdef CONFIG_SMP
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return base == &migration_base;
}
#else
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return false;
}
#endif

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running. hrtimer_cancel() invokes it every time hrtimer_try_to_cancel()
 * reports a running callback.
 *
 * This prevents priority inversion: if the soft irq thread is preempted
 * in the middle of a timer callback, then calling hrtimer_cancel() can
 * lead to two issues:
 *
 *  - If the caller is on a remote CPU then it has to spin wait for the timer
 *    handler to complete. This can result in unbound priority inversion.
 *
 *  - If the caller originates from the task which preempted the timer
 *    handler on the same CPU, then spin waiting for the timer handler to
 *    complete is never going to end.
 */
void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
        /* Lockless read. Prevent the compiler from reloading it below */
        struct hrtimer_clock_base *base = READ_ONCE(timer->base);

        /*
         * Just relax if the timer expires in hard interrupt context or if
         * it is currently on the migration base. Only soft expiry timers
         * are waited for via the expiry lock below.
         */
        if (!timer->is_soft || is_migration_base(base)) {
                cpu_relax();
                return;
        }

        /*
         * Mark the base as contended and grab the expiry lock, which is
         * held by the softirq across the timer callback. Drop the lock
         * immediately so the softirq can expire the next timer. In theory
         * the timer could already be running again, but that's more than
         * unlikely and just causes another wait loop.
         *
         * The elevated timer_waiters count is what makes
         * hrtimer_sync_wait_running() cycle the expiry lock.
         */
        atomic_inc(&base->cpu_base->timer_waiters);
        spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
        atomic_dec(&base->cpu_base->timer_waiters);
        spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
/*
 * !CONFIG_PREEMPT_RT: the expiry lock helpers and
 * hrtimer_sync_wait_running() are empty stubs.
 */
static inline void
hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
                                             unsigned long flags) { }
#endif

/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:        the timer to be cancelled
 *
 * Repeats hrtimer_try_to_cancel() until it no longer reports a running
 * callback, calling hrtimer_cancel_wait_running() between attempts.
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
        for (;;) {
                int outcome = hrtimer_try_to_cancel(timer);

                /* Anything but "callback is running" is final */
                if (outcome >= 0)
                        return outcome;

                hrtimer_cancel_wait_running(timer);
        }
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);

/**
 * __hrtimer_get_remaining - get remaining time for the timer
 * @timer:        the timer to read
 * @adjust:        adjust relative timers when CONFIG_TIME_LOW_RES=y
 *
 * The remaining time is read with the timer's base locked.
 */
ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
        /* @adjust only has an effect on CONFIG_TIME_LOW_RES=y kernels */
        const bool use_adjusted = IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust;
        unsigned long irqflags;
        ktime_t remaining;

        lock_hrtimer_base(timer, &irqflags);
        remaining = use_adjusted ? hrtimer_expires_remaining_adjusted(timer)
                                 : hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &irqflags);

        return remaining;
}
EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);

#ifdef CONFIG_NO_HZ_COMMON
/**
 * hrtimer_get_next_event - get the time until next expiry event
 *
 * The bases of the current CPU are only evaluated while high resolution
 * mode is not active on it; otherwise KTIME_MAX is returned right away.
 *
 * Returns the next expiry time or KTIME_MAX if no timer is pending.
 */
u64 hrtimer_get_next_event(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long irqflags;
        u64 next;

        raw_spin_lock_irqsave(&cpu_base->lock, irqflags);

        if (hrtimer_hres_active(cpu_base))
                next = KTIME_MAX;
        else
                next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);

        raw_spin_unlock_irqrestore(&cpu_base->lock, irqflags);

        return next;
}

/**
 * hrtimer_next_event_without - time until next expiry event w/o one timer
 * @exclude:        timer to exclude
 *
 * The bases of the current CPU are only evaluated while high resolution
 * mode is active on it. Soft bases are skipped while the softirq is marked
 * as activated.
 *
 * Returns the next expiry time over all timers except for the @exclude one or
 * KTIME_MAX if none of them is pending.
 */
u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long irqflags;
        u64 next = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, irqflags);

        if (!hrtimer_hres_active(cpu_base))
                goto unlock;

        /* Soft bases first, their result seeds the scan of the hard bases */
        if (!cpu_base->softirq_activated)
                next = __hrtimer_next_event_base(cpu_base, exclude,
                                                 cpu_base->active_bases & HRTIMER_ACTIVE_SOFT,
                                                 KTIME_MAX);

        next = __hrtimer_next_event_base(cpu_base, exclude,
                                         cpu_base->active_bases & HRTIMER_ACTIVE_HARD,
                                         next);
unlock:
        raw_spin_unlock_irqrestore(&cpu_base->lock, irqflags);

        return next;
}
#endif

/*
 * Translate a clock id into the matching HRTIMER_BASE_* index. Unknown
 * clock ids trigger a warning and are treated as CLOCK_MONOTONIC.
 */
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
        if (clock_id == CLOCK_MONOTONIC)
                return HRTIMER_BASE_MONOTONIC;
        if (clock_id == CLOCK_REALTIME)
                return HRTIMER_BASE_REALTIME;
        if (clock_id == CLOCK_BOOTTIME)
                return HRTIMER_BASE_BOOTTIME;
        if (clock_id == CLOCK_TAI)
                return HRTIMER_BASE_TAI;

        WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
        return HRTIMER_BASE_MONOTONIC;
}

/*
 * Read the current time of the clock identified by @clock_id. Unknown
 * clock ids trigger a warning and are served from CLOCK_MONOTONIC.
 */
static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
{
        ktime_t now;

        switch (clock_id) {
        case CLOCK_MONOTONIC:
                now = ktime_get();
                break;
        case CLOCK_REALTIME:
                now = ktime_get_real();
                break;
        case CLOCK_BOOTTIME:
                now = ktime_get_boottime();
                break;
        case CLOCK_TAI:
                now = ktime_get_clocktai();
                break;
        default:
                WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
                now = ktime_get();
                break;
        }

        return now;
}

/*
 * Return the current time of the clock @timer is based on, i.e. the clock
 * identified by timer->base->clockid.
 */
ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
{
        return __hrtimer_cb_get_time(timer->base->clockid);
}
EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);

/*
 * Common initialization behind hrtimer_setup(), hrtimer_setup_on_stack()
 * and __hrtimer_setup_sleeper(): zero the timer, select the clock base and
 * install the callback. A NULL @function triggers a one-time warning and is
 * replaced by hrtimer_dummy_timeout.
 */
static void __hrtimer_setup(struct hrtimer *timer,
                            enum hrtimer_restart (*function)(struct hrtimer *),
                            clockid_t clock_id, enum hrtimer_mode mode)
{
        const bool hard_requested = !!(mode & HRTIMER_MODE_HARD);
        bool soft_expiry = !!(mode & HRTIMER_MODE_SOFT);
        struct hrtimer_cpu_base *local_base;
        int idx;

        /*
         * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
         * marked for hard interrupt expiry mode are moved into soft
         * interrupt context for latency reasons and because the callbacks
         * can invoke functions which might sleep on RT, e.g. spin_lock().
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && !hard_requested)
                soft_expiry = true;

        memset(timer, 0, sizeof(*timer));

        local_base = raw_cpu_ptr(&hrtimer_bases);

        /*
         * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
         * clock modifications, so they need to become CLOCK_MONOTONIC to
         * ensure POSIX compliance.
         */
        if (clock_id == CLOCK_REALTIME && (mode & HRTIMER_MODE_REL))
                clock_id = CLOCK_MONOTONIC;

        /* Soft expiry timers are offset by half of HRTIMER_MAX_CLOCK_BASES */
        idx = 0;
        if (soft_expiry)
                idx = HRTIMER_MAX_CLOCK_BASES / 2;
        idx += hrtimer_clockid_to_base(clock_id);

        timer->is_soft = soft_expiry;
        timer->is_hard = hard_requested;
        timer->base = &local_base->clock_base[idx];
        timerqueue_init(&timer->node);

        if (WARN_ON_ONCE(!function))
                function = hrtimer_dummy_timeout;
        ACCESS_PRIVATE(timer, function) = function;
}

/**
 * hrtimer_setup - initialize a timer to the given clock
 * @timer:        the timer to be initialized
 * @function:        the callback function
 * @clock_id:        the clock to be used
 * @mode:       The modes which are relevant for initialization:
 *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
 *              HRTIMER_MODE_REL_SOFT
 *
 *              The PINNED variants of the above can be handed in,
 *              but the PINNED bit is ignored as pinning happens
 *              when the hrtimer is started
 *
 * Invokes the debug_setup() hook and then performs the common
 * initialization in __hrtimer_setup(). The timer is not started.
 */
void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
                   clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_setup(timer, clock_id, mode);
        __hrtimer_setup(timer, function, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup);

/**
 * hrtimer_setup_on_stack - initialize a timer on stack memory
 * @timer:        The timer to be initialized
 * @function:        the callback function
 * @clock_id:        The clock to be used
 * @mode:       The timer mode
 *
 * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack
 * memory. The only difference in this function is that the
 * debug_setup_on_stack() hook is invoked instead of debug_setup().
 */
void hrtimer_setup_on_stack(struct hrtimer *timer,
                            enum hrtimer_restart (*function)(struct hrtimer *),
                            clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_setup_on_stack(timer, clock_id, mode);
        __hrtimer_setup(timer, function, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack);

/*
 * A timer is active, when it is enqueued into the rbtree or the
 * callback function is running or it's in the state of being migrated
 * to another cpu.
 *
 * It is important for this function to not return a false negative.
 *
 * The check is lockless: timer->state and base->running are sampled inside
 * a read section of base->seq and the sampling is repeated when either the
 * sequence count or timer->base changed in the meantime.
 */
bool hrtimer_active(const struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;

        do {
                base = READ_ONCE(timer->base);
                seq = raw_read_seqcount_begin(&base->seq);

                /* A positive result is returned without validating the snapshot */
                if (timer->state != HRTIMER_STATE_INACTIVE ||
                    base->running == timer)
                        return true;

                /* Only a negative result depends on the retry check below */
        } while (read_seqcount_retry(&base->seq, seq) ||
                 base != READ_ONCE(timer->base));

        return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);

/*
 * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
 * distinct sections:
 *
 *  - queued:        the timer is queued
 *  - callback:        the timer is being run
 *  - post:        the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->state and base->running
 * from the same section, if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 *
 * __run_hrtimer() is entered with cpu_base->lock held. The lock is dropped
 * (restoring @flags) around the callback and taken again with
 * raw_spin_lock_irq() before returning.
 */

static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
                          struct hrtimer_clock_base *base,
                          struct hrtimer *timer, ktime_t *now,
                          unsigned long flags) __must_hold(&cpu_base->lock)
{
        enum hrtimer_restart (*fn)(struct hrtimer *);
        bool expires_in_hardirq;
        int restart;

        lockdep_assert_held(&cpu_base->lock);

        debug_deactivate(timer);
        base->running = timer;

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        /* Fetch the callback while cpu_base->lock is still held */
        fn = ACCESS_PRIVATE(timer, function);

        /*
         * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
         * timer is restarted with a period then it becomes an absolute
         * timer. If its not restarted it does not matter.
         */
        if (IS_ENABLED(CONFIG_TIME_LOW_RES))
                timer->is_rel = false;

        /*
         * The timer is marked as running in the CPU base, so it is
         * protected against migration to a different CPU even if the lock
         * is dropped.
         */
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        trace_hrtimer_expire_entry(timer, now);
        expires_in_hardirq = lockdep_hrtimer_enter(timer);

        /* The callback runs without cpu_base->lock held */
        restart = fn(timer);

        lockdep_hrtimer_exit(expires_in_hardirq);
        trace_hrtimer_expire_exit(timer);
        raw_spin_lock_irq(&cpu_base->lock);

        /*
         * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogram the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
         *
         * Note: Because we dropped the cpu_base->lock above,
         * hrtimer_start_range_ns() can have popped in and enqueued the timer
         * for us already.
         */
        if (restart != HRTIMER_NORESTART &&
            !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        WARN_ON_ONCE(base->running != timer);
        base->running = NULL;
}

/*
 * Expire the due timers of all active clock bases of @cpu_base which are
 * selected by @active_mask. @now is the time before the per base offset is
 * applied. @flags is handed through to __run_hrtimer(), which drops and
 * retakes cpu_base->lock around each callback.
 */
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
                                 unsigned long flags, unsigned int active_mask)
{
        const bool soft_pass = (active_mask == HRTIMER_ACTIVE_SOFT);
        unsigned int pending = cpu_base->active_bases & active_mask;
        struct hrtimer_clock_base *base;

        for_each_active_base(base, cpu_base, pending) {
                ktime_t basenow = ktime_add(now, base->offset);
                struct timerqueue_node *head;

                for (head = timerqueue_getnext(&base->active); head;
                     head = timerqueue_getnext(&base->active)) {
                        struct hrtimer *timer = container_of(head, struct hrtimer, node);

                        /*
                         * The immediate goal for using the softexpires is
                         * minimizing wakeups, not running timers at the
                         * earliest interrupt after their soft expiration.
                         * This allows us to avoid using a Priority Search
                         * Tree, which can answer a stabbing query for
                         * overlapping intervals and instead use the simple
                         * BST we already have.
                         * We don't add extra wakeups by delaying timers that
                         * are right-of a not yet expired timer, because that
                         * timer will have to trigger a wakeup anyway.
                         */
                        if (hrtimer_get_softexpires_tv64(timer) > basenow)
                                break;

                        __run_hrtimer(cpu_base, base, timer, &basenow, flags);

                        /* Soft pass only: let a waiting canceller make progress */
                        if (soft_pass)
                                hrtimer_sync_wait_running(cpu_base, flags);
                }
        }
}

/*
 * Expire the soft hrtimers of the current CPU.
 *
 * The expiry lock is taken before cpu_base->lock and released after it, so
 * it is held across the whole run (it is a no-op stub without PREEMPT_RT).
 * Once the soft bases have been processed softirq_activated is cleared and
 * the soft timer state is re-evaluated via hrtimer_update_softirq_timer().
 */
static __latent_entropy void hrtimer_run_softirq(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        ktime_t now;

        hrtimer_cpu_base_lock_expiry(cpu_base);
        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        now = hrtimer_update_base(cpu_base);
        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);

        cpu_base->softirq_activated = 0;
        hrtimer_update_softirq_timer(cpu_base, true);

        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        hrtimer_cpu_base_unlock_expiry(cpu_base);
}

#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer interrupt
 * Called with interrupts disabled
 *
 * Runs the expired hard timers of the current CPU, raises the hrtimer
 * softirq when soft timers are due and programs the clock event device for
 * the next expiry. If programming keeps failing because the next event is
 * already in the past, the expiry run is repeated up to three times in
 * total before a hang is recorded and the next event is pushed out.
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        ktime_t expires_next, now, entry_time, delta;
        unsigned long flags;
        int retries = 0;

        BUG_ON(!cpu_base->hres_active);
        cpu_base->nr_events++;
        dev->next_event = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        /* entry_time is kept to measure the total time spent in here */
        entry_time = now = hrtimer_update_base(cpu_base);
retry:
        cpu_base->in_hrtirq = 1;
        /*
         * We set expires_next to KTIME_MAX here with cpu_base->lock
         * held to prevent that a timer is enqueued in our queue via
         * the migration code. This does not affect enqueueing of
         * timers which run their callback and need to be requeued on
         * this CPU.
         */
        cpu_base->expires_next = KTIME_MAX;

        /* Soft timers due: hand them over to the hrtimer softirq */
        if (!ktime_before(now, cpu_base->softirq_expires_next)) {
                cpu_base->softirq_expires_next = KTIME_MAX;
                cpu_base->softirq_activated = 1;
                raise_timer_softirq(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

        /* Reevaluate the clock bases for the [soft] next expiry */
        expires_next = hrtimer_update_next_event(cpu_base);
        /*
         * Store the new expiry value so the migration code can verify
         * against it.
         */
        cpu_base->expires_next = expires_next;
        cpu_base->in_hrtirq = 0;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        /* Reprogramming necessary ? */
        if (!tick_program_event(expires_next, 0)) {
                cpu_base->hang_detected = 0;
                return;
        }

        /*
         * The next timer was already expired due to:
         * - tracing
         * - long lasting callbacks
         * - being scheduled away when running in a VM
         *
         * We need to prevent that we loop forever in the hrtimer
         * interrupt routine. We give it 3 attempts to avoid
         * overreacting on some spurious event.
         *
         * Acquire base lock for updating the offsets and retrieving
         * the current time.
         */
        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        now = hrtimer_update_base(cpu_base);
        cpu_base->nr_retries++;
        if (++retries < 3)
                goto retry;
        /*
         * Give the system a chance to do something else than looping
         * here. We stored the entry time, so we know exactly how long
         * we spent here. We schedule the next event this amount of
         * time away.
         */
        cpu_base->nr_hangs++;
        cpu_base->hang_detected = 1;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        delta = ktime_sub(now, entry_time);
        /* delta is truncated to unsigned int for the max_hang_time bookkeeping */
        if ((unsigned int)delta > cpu_base->max_hang_time)
                cpu_base->max_hang_time = (unsigned int) delta;
        /*
         * Limit it to a sensible value as we enforce a longer
         * delay. Give the CPU at least 100ms to catch up.
         */
        if (delta > 100 * NSEC_PER_MSEC)
                expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
        else
                expires_next = ktime_add(now, delta);
        /* Second argument is 1 here, unlike the regular programming attempt above */
        tick_program_event(expires_next, 1);
        pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
#endif /* CONFIG_HIGH_RES_TIMERS */

/*
 * Called from run_local_timers in hardirq context every jiffy
 *
 * Does nothing when high resolution mode is active on this CPU. Otherwise
 * it either switches to high resolution mode or expires the due hard
 * timers and raises the hrtimer softirq for due soft timers.
 */
void hrtimer_run_queues(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        ktime_t now;

        if (hrtimer_hres_active(cpu_base))
                return;

        /*
         * This _is_ ugly: We have to check periodically, whether we
         * can switch to highres and / or nohz mode. The clocksource
         * switch happens with xtime_lock held. Notification from
         * there only sets the check bit in the tick_oneshot code,
         * otherwise we might deadlock vs. xtime_lock.
         */
        if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
                hrtimer_switch_to_hres();
                return;
        }

        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        now = hrtimer_update_base(cpu_base);

        /* Soft timers due: hand them over to the hrtimer softirq */
        if (!ktime_before(now, cpu_base->softirq_expires_next)) {
                cpu_base->softirq_expires_next = KTIME_MAX;
                cpu_base->softirq_activated = 1;
                raise_timer_softirq(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}

/*
 * Sleep related functions:
 */
/*
 * Timer callback of hrtimer_sleeper timers: clear the sleeper's task
 * pointer, wake the task which was recorded there (if any) and do not
 * restart the timer.
 */
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
        struct hrtimer_sleeper *sleeper;
        struct task_struct *sleeping_task;

        sleeper = container_of(timer, struct hrtimer_sleeper, timer);
        sleeping_task = sleeper->task;

        /* A NULL task pointer is how do_nanosleep() detects the expiry */
        sleeper->task = NULL;
        if (sleeping_task)
                wake_up_process(sleeping_task);

        return HRTIMER_NORESTART;
}

/**
 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
 * @sl:                sleeper to be started
 * @mode:        timer mode abs/rel
 *
 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
 */
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
                                   enum hrtimer_mode mode)
{
        /*
         * Make the enqueue delivery mode check work on RT. If the sleeper
         * was initialized for hard interrupt delivery, force the mode bit.
         * This is a special case for hrtimer_sleepers because
         * __hrtimer_setup_sleeper() determines the delivery mode on RT so the
         * fiddling with this decision is avoided at the call sites.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
                mode |= HRTIMER_MODE_HARD;

        hrtimer_start_expires(&sl->timer, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);

/*
 * Initialize @sl: set up its timer with hrtimer_wakeup() as callback and
 * record the current task as the one to be woken.
 */
static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
                                    clockid_t clock_id, enum hrtimer_mode mode)
{
        /*
         * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
         * marked for hard interrupt expiry mode are moved into soft
         * interrupt context either for latency reasons or because the
         * hrtimer callback takes regular spinlocks or invokes other
         * functions which are not suitable for hard interrupt context on
         * PREEMPT_RT.
         *
         * The hrtimer_sleeper callback is RT compatible in hard interrupt
         * context, but there is a latency concern: Untrusted userspace can
         * spawn many threads which arm timers for the same expiry time on
         * the same CPU. That causes a latency spike due to the wakeup of
         * a gazillion threads.
         *
         * OTOH, privileged real-time user space applications rely on the
         * low latency of hard interrupt wakeups. If the current task is in
         * a real-time scheduling class, mark the mode for hard interrupt
         * expiry. An explicit HRTIMER_MODE_SOFT request is left alone.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
            rt_or_dl_task_policy(current) &&
            !(mode & HRTIMER_MODE_SOFT))
                mode |= HRTIMER_MODE_HARD;

        __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode);
        sl->task = current;
}

/**
 * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
 * @sl:                sleeper to be initialized
 * @clock_id:        the clock to be used
 * @mode:        timer mode abs/rel
 *
 * Invokes the debug_setup_on_stack() hook for the embedded timer and then
 * initializes the sleeper via __hrtimer_setup_sleeper(), which records the
 * current task as the one to wake.
 */
void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                    clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_setup_on_stack(&sl->timer, clock_id, mode);
        __hrtimer_setup_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);

/*
 * Write the remaining time @ts to the location recorded in @restart,
 * using the layout selected by restart->nanosleep.type.
 *
 * Returns -EFAULT when the copy fails and -ERESTART_RESTARTBLOCK otherwise.
 * Any type other than TT_NATIVE (and TT_COMPAT with
 * CONFIG_COMPAT_32BIT_TIME) is a BUG().
 */
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
        int copy_failed = 0;

        switch (restart->nanosleep.type) {
#ifdef CONFIG_COMPAT_32BIT_TIME
        case TT_COMPAT:
                copy_failed = put_old_timespec32(ts, restart->nanosleep.compat_rmtp);
                break;
#endif
        case TT_NATIVE:
                copy_failed = put_timespec64(ts, restart->nanosleep.rmtp);
                break;
        default:
                BUG();
        }

        if (copy_failed)
                return -EFAULT;

        return -ERESTART_RESTARTBLOCK;
}

/*
 * Sleep until the sleeper's timer expires or a signal is pending.
 *
 * Returns 0 when the timer expired (hrtimer_wakeup() cleared t->task) or
 * when no time is left. Otherwise the sleep was interrupted: the result is
 * -ERESTART_RESTARTBLOCK, or whatever nanosleep_copyout() returns when the
 * restart block asks for the remaining time to be reported.
 */
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
        struct restart_block *restart;

        do {
                /* The task state is set before the timer is armed */
                set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                hrtimer_sleeper_start_expires(t, mode);

                /* t->task is already NULL if the timer fired in the meantime */
                if (likely(t->task))
                        schedule();

                hrtimer_cancel(&t->timer);
                /*
                 * Further loop iterations restart the timer in absolute mode.
                 * NOTE(review): presumably because the expiry stored in the
                 * timer is absolute after the first start - confirm.
                 */
                mode = HRTIMER_MODE_ABS;

        } while (t->task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);

        /* Timer expired */
        if (!t->task)
                return 0;

        restart = &current->restart_block;
        if (restart->nanosleep.type != TT_NONE) {
                ktime_t rem = hrtimer_expires_remaining(&t->timer);
                struct timespec64 rmt;

                /* Nothing left to sleep: treat it as a completed sleep */
                if (rem <= 0)
                        return 0;
                rmt = ktime_to_timespec64(rem);

                return nanosleep_copyout(restart, &rmt);
        }
        return -ERESTART_RESTARTBLOCK;
}

/*
 * Restart function installed by hrtimer_nanosleep(): resume the sleep with
 * an on-stack sleeper in absolute mode, using the clock id and expiry time
 * which were saved in @restart.
 */
static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
        struct hrtimer_sleeper t;
        int ret;

        hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
        hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
        ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
        /* Pairs with the on-stack setup above */
        destroy_hrtimer_on_stack(&t.timer);
        return ret;
}

/*
 * Sleep for/until @rqtp on @clockid using an on-stack hrtimer sleeper with
 * the current task's timer slack.
 *
 * The result of do_nanosleep() is returned unchanged unless it is
 * -ERESTART_RESTARTBLOCK. In that case absolute sleeps return
 * -ERESTARTNOHAND, while for all other modes the timer's clock id and
 * expiry are saved in the restart block and hrtimer_nanosleep_restart() is
 * installed as the restart function.
 */
long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
                       const clockid_t clockid)
{
        struct hrtimer_sleeper t;
        int ret;

        hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
        hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);

        ret = do_nanosleep(&t, mode);
        if (ret == -ERESTART_RESTARTBLOCK) {
                if (mode == HRTIMER_MODE_ABS) {
                        /* Absolute timers do not update the rmtp value and restart: */
                        ret = -ERESTARTNOHAND;
                } else {
                        struct restart_block *restart = &current->restart_block;

                        restart->nanosleep.clockid = t.timer.base->clockid;
                        restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
                        set_restart_fn(restart, hrtimer_nanosleep_restart);
                }
        }

        destroy_hrtimer_on_stack(&t.timer);
        return ret;
}

#ifdef CONFIG_64BIT

/*
 * nanosleep() system call (CONFIG_64BIT only): relative sleep on
 * CLOCK_MONOTONIC.
 *
 * Fails with -EFAULT when @rqtp cannot be read and with -EINVAL when the
 * requested time is not a valid timespec64. The restart block is primed
 * before sleeping: the remaining time is only reported when @rmtp is
 * non-NULL (TT_NATIVE), otherwise the type is TT_NONE.
 */
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
                struct __kernel_timespec __user *, rmtp)
{
        struct timespec64 tu;

        if (get_timespec64(&tu, rqtp))
                return -EFAULT;

        if (!timespec64_valid(&tu))
                return -EINVAL;

        /* Default restart handler; hrtimer_nanosleep() may install its own */
        current->restart_block.fn = do_no_restart_syscall;
        current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
        current->restart_block.nanosleep.rmtp = rmtp;
        return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}

#endif

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * nanosleep_time32() system call (CONFIG_COMPAT_32BIT_TIME only): same as
 * nanosleep(), but the user space time values use struct old_timespec32.
 *
 * Fails with -EFAULT when @rqtp cannot be read and with -EINVAL when the
 * requested time is not a valid timespec64. The remaining time is only
 * reported when @rmtp is non-NULL (TT_COMPAT), otherwise the type is
 * TT_NONE.
 */
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
                       struct old_timespec32 __user *, rmtp)
{
        struct timespec64 tu;

        if (get_old_timespec32(&tu, rqtp))
                return -EFAULT;

        if (!timespec64_valid(&tu))
                return -EINVAL;

        /* Default restart handler; hrtimer_nanosleep() may install its own */
        current->restart_block.fn = do_no_restart_syscall;
        current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
        current->restart_block.nanosleep.compat_rmtp = rmtp;
        return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}
#endif

/*
 * Functions related to boot-time initialization:
 */
/*
 * Initialise the per-CPU hrtimer base of @cpu: link every clock base to its
 * CPU base, set up its seqcount and timer queue, record the CPU number and
 * initialise the expiry lock. Always returns 0.
 */
int hrtimers_prepare_cpu(unsigned int cpu)
{
        struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
        int idx;

        for (idx = 0; idx < HRTIMER_MAX_CLOCK_BASES; idx++) {
                struct hrtimer_clock_base *cb = base->clock_base + idx;

                cb->cpu_base = base;
                /* The seqcount is associated with the CPU base lock. */
                seqcount_raw_spinlock_init(&cb->seq, &base->lock);
                timerqueue_init_head(&cb->active);
        }

        base->cpu = cpu;
        hrtimer_cpu_base_init_expiry_lock(base);

        return 0;
}

/*
 * Reset the hrtimer base of the CPU this runs on and mark it online.
 * @cpu is not referenced; the base comes from this_cpu_ptr().
 * Always returns 0.
 */
int hrtimers_cpu_starting(unsigned int cpu)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

        /* Drop whatever a previous CPU-down cycle may have left behind. */
        base->next_timer = NULL;
        base->softirq_next_timer = NULL;
        base->expires_next = KTIME_MAX;
        base->softirq_expires_next = KTIME_MAX;
        base->active_bases = 0;
        base->hres_active = 0;
        base->hang_detected = 0;

        base->online = 1;

        return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Drain @old_base: take each queued timer off it and enqueue it on
 * @new_base, until the old queue is empty.
 */
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
                                struct hrtimer_clock_base *new_base)
{
        for (;;) {
                struct timerqueue_node *head;
                struct hrtimer *t;

                head = timerqueue_getnext(&old_base->active);
                if (!head)
                        break;

                t = container_of(head, struct hrtimer, node);
                BUG_ON(hrtimer_callback_running(t));
                debug_deactivate(t);

                /*
                 * Keep the state at ENQUEUED rather than INACTIVE while the
                 * timer is in transit; otherwise another CPU could see it
                 * as !active and it would vanish under us.
                 */
                __remove_hrtimer(t, old_base, HRTIMER_STATE_ENQUEUED, 0);
                t->base = new_base;

                /*
                 * Queue on the new base. The event device is not
                 * reprogrammed here even if this timer expires before the
                 * earliest one on this CPU: hrtimer_interrupt is run after
                 * everything has been migrated, which sorts out already
                 * expired timers and reprograms the event device.
                 */
                enqueue_hrtimer(t, new_base, HRTIMER_MODE_ABS);
        }
}

/*
 * Move all hrtimers queued on the CPU executing this function to another
 * CPU.
 *
 * @dying_cpu: not referenced in the body; the source base is obtained with
 *             this_cpu_ptr().
 *
 * The target CPU is any CPU that is set in both cpu_active_mask and the
 * HK_TYPE_TIMER housekeeping mask. Both base locks are held while each
 * clock base is migrated; the target CPU is then asked to retrigger its
 * next event and the local base is marked offline.
 *
 * Always returns 0.
 */
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
        int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
        struct hrtimer_cpu_base *old_base, *new_base;

        old_base = this_cpu_ptr(&hrtimer_bases);
        new_base = &per_cpu(hrtimer_bases, ncpu);

        /*
         * The caller is globally serialized and nobody else
         * takes two locks at once, deadlock is not possible.
         */
        raw_spin_lock(&old_base->lock);
        raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);

        /* Migrate the clock bases pairwise: index i to index i. */
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                migrate_hrtimer_list(&old_base->clock_base[i],
                                     &new_base->clock_base[i]);
        }

        /* Tell the other CPU to retrigger the next event */
        smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

        raw_spin_unlock(&new_base->lock);
        /* Cleared while old_base->lock is still held. */
        old_base->online = 0;
        raw_spin_unlock(&old_base->lock);

        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Boot-time setup: run the prepare and starting steps for the CPU executing
 * this function, then register hrtimer_run_softirq as the handler for
 * HRTIMER_SOFTIRQ.
 */
void __init hrtimers_init(void)
{
        unsigned int this_cpu = smp_processor_id();

        hrtimers_prepare_cpu(this_cpu);
        hrtimers_cpu_starting(this_cpu);

        open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}









































































































  320 







































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H

/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/percpu.h>

#ifdef MODULE
  #define runtime_const_ptr(sym) (sym)
#else
  #include <asm/runtime-const.h>
#endif
extern unsigned long USER_PTR_MAX;

#ifdef CONFIG_ADDRESS_MASKING
/*
 * Mask out tag bits from the address.
 *
 * The asm body is an alternative: by default it is empty and @addr is
 * returned unchanged; with X86_FEATURE_LAM it becomes an "and" of @addr
 * with the per-CPU variable tlbstate_untag_mask.
 *
 * Returns the (possibly masked) address.
 */
static inline unsigned long __untagged_addr(unsigned long addr)
{
        asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]",
                                X86_FEATURE_LAM)
             : [addr] "+r" (addr)
             : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));

        return addr;
}

/*
 * Type-preserving wrapper around __untagged_addr(): @addr is force-cast to
 * unsigned long, untagged, and the result is force-cast back to the type of
 * @addr. @addr is evaluated once; it also appears inside __typeof__.
 */
#define untagged_addr(addr)        ({                                        \
        unsigned long __addr = (__force unsigned long)(addr);                \
        (__force __typeof__(addr))__untagged_addr(__addr);                \
})

/*
 * Untag @addr using the untag mask stored in @mm's context rather than the
 * per-CPU one. Asserts that the mmap lock of @mm is held.
 */
static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
                                                   unsigned long addr)
{
        unsigned long untagged;

        mmap_assert_locked(mm);

        untagged = addr & mm->context.untag_mask;
        return untagged;
}

/*
 * Type-preserving wrapper around __untagged_addr_remote(): @addr is
 * force-cast to unsigned long, untagged against @mm, and the result is
 * force-cast back to the type of @addr.
 */
#define untagged_addr_remote(mm, addr)        ({                                \
        unsigned long __addr = (__force unsigned long)(addr);                \
        (__force __typeof__(addr))__untagged_addr_remote(mm, __addr);        \
})

#endif

/*
 * True when @x, taken as an unsigned long, does not exceed USER_PTR_MAX.
 * The comparison is wrapped in likely().
 */
#define valid_user_address(x) \
        likely((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX))

/*
 * Masking the user address is an alternative to a conditional
 * user_access_begin that can avoid the fencing. This only works
 * for dense accesses starting at the address.
 *
 * Returns @ptr unchanged when it is not above USER_PTR_MAX (unsigned
 * compare), otherwise USER_PTR_MAX itself.
 */
static inline void __user *mask_user_address(const void __user *ptr)
{
        void __user *ret;
        /*
         * %0 starts out as ptr (tied via the "0" input constraint) and %1 is
         * USER_PTR_MAX: cmp sets the flags for ptr - max, and cmova replaces
         * %0 with the maximum when ptr is above it.
         */
        asm("cmp %1,%0\n\t"
            "cmova %1,%0"
                :"=r" (ret)
                :"r" (runtime_const_ptr(USER_PTR_MAX)),
                 "0" (ptr));
        return ret;
}
/*
 * Clamp the user pointer @x with mask_user_address(), call
 * __uaccess_begin(), and evaluate to the clamped pointer (same type as @x).
 */
#define masked_user_access_begin(x) ({                                \
        __auto_type __masked_ptr = (x);                                \
        __masked_ptr = mask_user_address(__masked_ptr);                \
        __uaccess_begin(); __masked_ptr; })

/*
 * User pointers can have tag bits on x86-64.  This scheme tolerates
 * arbitrary values in those bits rather than masking them off.
 *
 * Enforce two rules:
 * 1. 'ptr' must be in the user part of the address space
 * 2. 'ptr+size' must not overflow into kernel addresses
 *
 * Note that we always have at least one guard page between the
 * max user address and the non-canonical gap, allowing us to
 * ignore small sizes entirely.
 *
 * In fact, we could probably remove the size check entirely, since
 * any kernel accesses will be in increasing address order starting
 * at 'ptr'.
 *
 * That's a separate optimization, for now just handle the small
 * constant case.
 */
/*
 * Range check for a user access of @size bytes at @ptr.
 *
 * When the compiler can prove @size <= PAGE_SIZE only the start address is
 * checked. Otherwise the end address must be a valid user address and the
 * addition must not have wrapped around.
 */
static inline bool __access_ok(const void __user *ptr, unsigned long size)
{
        unsigned long start, end;

        /* Small compile-time-constant sizes: checking the start suffices. */
        if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE)
                return valid_user_address(ptr);

        start = (__force unsigned long)ptr;
        end = size + start;

        return valid_user_address(end) && end >= start;
}
#define __access_ok __access_ok

/*
 * Copy To/From Userspace
 */

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
rep_movs_alternative(void *to, const void *from, unsigned len);

/*
 * Copy @len bytes from @from to @to with user access enabled
 * (stac()/clac() bracket the copy). No access_ok-style check is done here.
 *
 * @len lives in the count register ("+c") and is returned as the copy left
 * it. An exception table entry redirects a fault at label 1 to label 2, so
 * the function still returns after a faulting copy.
 * NOTE(review): the return value is presumably the number of bytes left
 * uncopied (0 on success) - confirm against rep_movs_alternative.
 */
static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned long len)
{
        stac();
        /*
         * If CPU has FSRM feature, use 'rep movs'.
         * Otherwise, use rep_movs_alternative.
         */
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep movsb",
                            "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
                "2:\n"
                _ASM_EXTABLE_UA(1b, 2b)
                :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
                : : "memory", "rax");
        clac();
        return len;
}

/*
 * Copy @size bytes from the user pointer @src to @dst through
 * copy_user_generic() and return its result. The __user annotation is
 * stripped with a __force cast.
 */
static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
        void *from = (__force void *)src;

        return copy_user_generic(dst, from, size);
}

/*
 * Copy @size bytes from @src to the user pointer @dst through
 * copy_user_generic() and return its result. The __user annotation is
 * stripped with a __force cast.
 */
static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
        void *to = (__force void *)dst;

        return copy_user_generic(to, src, size);
}

extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);

/*
 * Copy @size bytes from user space with __copy_user_nocache(), after a
 * KASAN write check on @dst. User access is enabled only around the copy.
 * Returns the helper's result converted to int.
 */
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
                                  unsigned size)
{
        long rc;

        kasan_check_write(dst, size);

        stac();
        rc = __copy_user_nocache(dst, src, size);
        clac();

        return rc;
}

/*
 * Copy @size bytes from user space with __copy_user_flushcache(), after a
 * KASAN write check on @dst. Returns the helper's result converted to int.
 */
static inline int
__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
        long rc;

        kasan_check_write(dst, size);
        rc = __copy_user_flushcache(dst, src, size);

        return rc;
}

/*
 * Zero Userspace.
 */

__must_check unsigned long
rep_stos_alternative(void __user *addr, unsigned long len);

/*
 * Zero @size bytes of user memory at @addr without any range check.
 *
 * Calls might_fault() and brackets the store loop with stac()/clac().
 * The default instruction is "rep stosb" with rax = 0; the alternative
 * calls rep_stos_alternative when X86_FEATURE_FSRS is not set. An exception
 * table entry redirects a fault at label 1 to label 2.
 *
 * Returns @size as the store loop left it in the count register.
 * NOTE(review): presumably the number of bytes that could not be cleared
 * (0 on success) - confirm against rep_stos_alternative.
 */
static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{
        might_fault();
        stac();

        /*
         * No memory constraint because it doesn't change any memory gcc
         * knows about.
         */
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep stosb",
                            "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
                "2:\n"
               _ASM_EXTABLE_UA(1b, 2b)
               : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
               : "a" (0));

        clac();

        return size;
}

/*
 * Range-checked variant of __clear_user(): when __access_ok() rejects the
 * range nothing is written and @n is returned; otherwise the result of
 * __clear_user() is returned.
 */
static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
{
        if (!__access_ok(to, n))
                return n;

        return __clear_user(to, n);
}
#endif /* _ASM_X86_UACCESS_64_H */





















































































































































































































































    3 












   19 








   19 






   19 


















   16 






    3 
















   13 
   16 































































































































































































































































































































































































   19 








   19 






















   19 










   19 
















   19 
   19 


   18 































































    8 





    3 


























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Asynchronous Cryptographic Hash operations.
 *
 * This is the implementation of the ahash (asynchronous hash) API.  It differs
 * from shash (synchronous hash) in that ahash supports asynchronous operations,
 * and it hashes data from scatterlists instead of virtually addressed buffers.
 *
 * The ahash API provides access to both ahash and shash algorithms.  The shash
 * API only provides access to shash algorithms.
 *
 * Copyright (c) 2008 Loc Ho <lho@amcc.com>
 */

#include <crypto/scatterwalk.h>
#include <linux/cryptouser.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/string_choices.h>
#include <net/netlink.h>

#include "hash.h"

#define CRYPTO_ALG_TYPE_AHASH_MASK        0x0000000e

static int ahash_def_finup(struct ahash_request *req);

/* True when the algorithm behind @tfm has CRYPTO_AHASH_ALG_BLOCK_ONLY set. */
static inline bool crypto_ahash_block_only(struct crypto_ahash *tfm)
{
        return (crypto_ahash_alg(tfm)->halg.base.cra_flags &
                CRYPTO_AHASH_ALG_BLOCK_ONLY) != 0;
}

/* True when the algorithm behind @tfm has CRYPTO_AHASH_ALG_FINAL_NONZERO set. */
static inline bool crypto_ahash_final_nonzero(struct crypto_ahash *tfm)
{
        return (crypto_ahash_alg(tfm)->halg.base.cra_flags &
                CRYPTO_AHASH_ALG_FINAL_NONZERO) != 0;
}

/* True when the algorithm behind @tfm has CRYPTO_ALG_NEED_FALLBACK set. */
static inline bool crypto_ahash_need_fallback(struct crypto_ahash *tfm)
{
        return (crypto_ahash_alg(tfm)->halg.base.cra_flags &
                CRYPTO_ALG_NEED_FALLBACK) != 0;
}

static inline void ahash_op_done(void *data, int err,
                                 int (*finish)(struct ahash_request *, int))
{
        struct ahash_request *areq = data;
        crypto_completion_t compl;

        compl = areq->saved_complete;
        data = areq->saved_data;
        if (err == -EINPROGRESS)
                goto out;

        areq->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;

        err = finish(areq, err);
        if (err == -EINPROGRESS || err == -EBUSY)
                return;

out:
        compl(data, err);
}

/*
 * Map the current page of the walk and point walk->data at the current
 * offset within it. The chunk is limited to what remains of the entry and
 * to the end of the page; its size is subtracted from walk->entrylen and
 * returned.
 */
static int hash_walk_next(struct crypto_hash_walk *walk)
{
        unsigned int off = walk->offset;
        unsigned int room = ((unsigned int)(PAGE_SIZE)) - off;
        unsigned int chunk = min(walk->entrylen, room);

        walk->data = kmap_local_page(walk->pg);
        walk->data += off;

        walk->entrylen -= chunk;

        return chunk;
}

/*
 * Start walking the scatterlist entry in walk->sg: derive the first page
 * and in-page offset from the entry's offset, cap the entry length at the
 * bytes still outstanding, account for it in walk->total, and map the
 * first chunk via hash_walk_next().
 */
static int hash_walk_new_entry(struct crypto_hash_walk *walk)
{
        struct scatterlist *entry = walk->sg;
        unsigned int off = entry->offset;
        unsigned int len = entry->length;

        /* The entry's offset may reach beyond its first page. */
        walk->pg = sg_page(entry) + (off >> PAGE_SHIFT);
        walk->offset = offset_in_page(off);

        if (len > walk->total)
                len = walk->total;
        walk->entrylen = len;
        walk->total -= len;

        return hash_walk_next(walk);
}

/*
 * Begin walking the input of @req.
 *
 * Returns 0 for an empty request. For a virtually addressed request the
 * whole buffer is handed out in one step (walk->data = req->svirt, return
 * value req->nbytes). Otherwise the walk starts at the first scatterlist
 * entry and the size of the first mapped chunk is returned.
 */
int crypto_hash_walk_first(struct ahash_request *req,
                           struct crypto_hash_walk *walk)
{
        unsigned int total = req->nbytes;

        walk->entrylen = 0;
        walk->total = total;
        if (total == 0)
                return 0;

        walk->flags = req->base.flags;

        if (!ahash_request_isvirt(req)) {
                walk->sg = req->src;
                return hash_walk_new_entry(walk);
        }

        /* Virtual address: nothing left to walk after this single step. */
        walk->data = req->svirt;
        walk->total = 0;

        return total;
}
EXPORT_SYMBOL_GPL(crypto_hash_walk_first);

/*
 * Finish the current chunk and advance the walk.
 *
 * For a walk flagged CRYPTO_AHASH_REQ_VIRT, @err is returned unchanged.
 * Otherwise the current mapping is released and a non-zero @err is
 * returned as is. With @err == 0 the result is the size of the next
 * chunk, or 0 once all data has been consumed.
 */
int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err)
{
        if (walk->flags & CRYPTO_AHASH_REQ_VIRT)
                return err;

        /* Rewind to the address originally returned by the mapping. */
        walk->data -= walk->offset;
        kunmap_local(walk->data);
        crypto_yield(walk->flags);

        if (err)
                return err;

        if (!walk->entrylen) {
                /* Current entry exhausted: stop or move to the next one. */
                if (!walk->total)
                        return 0;

                walk->sg = sg_next(walk->sg);
                return hash_walk_new_entry(walk);
        }

        /* More of the same entry remains; continue on its next page. */
        walk->offset = 0;
        walk->pg++;

        return hash_walk_next(walk);
}
EXPORT_SYMBOL_GPL(crypto_hash_walk_done);

/*
 * Return the shash tfm backing an ahash tfm that is implemented by an shash
 * algorithm. The pointer to it is stored at the start of the ahash tfm
 * context.
 */
static inline struct crypto_shash *ahash_to_shash(struct crypto_ahash *tfm)
{
        struct crypto_shash **slot = crypto_ahash_ctx(tfm);

        return *slot;
}

/*
 * Use the request context of @req as an shash descriptor, bind it to the
 * shash tfm underlying @tfm, and return it.
 */
static inline struct shash_desc *prepare_shash_desc(struct ahash_request *req,
                                                    struct crypto_ahash *tfm)
{
        struct shash_desc *d = ahash_request_ctx(req);

        d->tfm = ahash_to_shash(tfm);

        return d;
}

/*
 * Feed all input of @req into @desc chunk by chunk.
 *
 * Returns the last value produced by the hash walk once it is no longer
 * positive.
 */
int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc)
{
        struct crypto_hash_walk walk;
        int n = crypto_hash_walk_first(req, &walk);

        while (n > 0) {
                /* The update status is handed to the walker as its @err. */
                n = crypto_shash_update(desc, walk.data, n);
                n = crypto_hash_walk_done(&walk, n);
        }

        return n;
}
EXPORT_SYMBOL_GPL(shash_ahash_update);

/*
 * Hash all input of @req into @desc and write the digest to req->result.
 *
 * With no input at all this is a plain crypto_shash_final(). Otherwise
 * every chunk is passed to crypto_shash_update(), except the one the
 * walker reports as last, which goes to crypto_shash_finup().
 */
int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc)
{
        struct crypto_hash_walk walk;
        int n = crypto_hash_walk_first(req, &walk);

        if (n == 0)
                return crypto_shash_final(desc, req->result);

        do {
                if (crypto_hash_walk_last(&walk))
                        n = crypto_shash_finup(desc, walk.data, n,
                                               req->result);
                else
                        n = crypto_shash_update(desc, walk.data, n);

                n = crypto_hash_walk_done(&walk, n);
        } while (n > 0);

        return n;
}
EXPORT_SYMBOL_GPL(shash_ahash_finup);

/*
 * Compute the digest of @req's input with @desc in one go, writing it to
 * req->result.
 *
 * Fast paths call crypto_shash_digest() on a single contiguous buffer;
 * when the data does not fit the first scatterlist entry (or, with
 * CONFIG_HIGHMEM, crosses a page boundary) it falls back to
 * crypto_shash_init() followed by shash_ahash_finup().
 *
 * Returns the status of the shash call(s) used.
 */
int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc)
{
        unsigned int nbytes = req->nbytes;
        struct scatterlist *sg;
        unsigned int offset;
        struct page *page;
        const u8 *data;
        int err;

        /* Empty or virtually addressed input: digest req->svirt directly. */
        data = req->svirt;
        if (!nbytes || ahash_request_isvirt(req))
                return crypto_shash_digest(desc, data, nbytes, req->result);

        /* Input spans more than the first sg entry: walk it. */
        sg = req->src;
        if (nbytes > sg->length)
                return crypto_shash_init(desc) ?:
                       shash_ahash_finup(req, desc);

        /* Without highmem the lowmem address can be used as is. */
        page = sg_page(sg);
        offset = sg->offset;
        data = lowmem_page_address(page) + offset;
        if (!IS_ENABLED(CONFIG_HIGHMEM))
                return crypto_shash_digest(desc, data, nbytes, req->result);

        /* Locate the page that actually contains the start of the data. */
        page += offset >> PAGE_SHIFT;
        offset = offset_in_page(offset);

        /* Data crosses a page boundary: walk it. */
        if (nbytes > (unsigned int)PAGE_SIZE - offset)
                return crypto_shash_init(desc) ?:
                       shash_ahash_finup(req, desc);

        /* Single page: map it locally for the duration of the digest. */
        data = kmap_local_page(page);
        err = crypto_shash_digest(desc, data + offset, nbytes,
                                  req->result);
        kunmap_local(data);
        return err;
}
EXPORT_SYMBOL_GPL(shash_ahash_digest);

/* tfm exit hook: free the shash tfm whose pointer is kept in the tfm context. */
static void crypto_exit_ahash_using_shash(struct crypto_tfm *tfm)
{
        crypto_free_shash(*(struct crypto_shash **)crypto_tfm_ctx(tfm));
}

/*
 * Set up an ahash tfm that is backed by an shash algorithm.
 *
 * Takes a reference on the algorithm and creates an shash tfm for it; the
 * pointer is stored in the tfm context and an exit hook is installed to
 * free it. The shash's CRYPTO_TFM_NEED_KEY flag is copied to the ahash.
 *
 * Returns 0 on success, -EAGAIN if the algorithm reference cannot be taken,
 * or the error from crypto_create_tfm() (the reference is dropped again).
 */
static int crypto_init_ahash_using_shash(struct crypto_tfm *tfm)
{
        struct crypto_shash **slot = crypto_tfm_ctx(tfm);
        struct crypto_ahash *ahash = __crypto_ahash_cast(tfm);
        struct crypto_alg *alg = tfm->__crt_alg;
        struct crypto_shash *shash;

        if (!crypto_mod_get(alg))
                return -EAGAIN;

        shash = crypto_create_tfm(alg, &crypto_shash_type);
        if (IS_ERR(shash)) {
                crypto_mod_put(alg);
                return PTR_ERR(shash);
        }

        *slot = shash;
        ahash->using_shash = true;
        tfm->exit = crypto_exit_ahash_using_shash;

        crypto_ahash_set_flags(ahash, crypto_shash_get_flags(shash) &
                                      CRYPTO_TFM_NEED_KEY);

        return 0;
}

/*
 * setkey implementation that rejects every key with -ENOSYS; all arguments
 * are ignored. ahash_set_needkey() compares alg->setkey against this
 * function.
 */
static int ahash_nosetkey(struct crypto_ahash *tfm, const u8 *key,
                          unsigned int keylen)
{
        return -ENOSYS;
}

/*
 * Flag @tfm as needing a key, unless @alg uses ahash_nosetkey as its setkey
 * or declares the key optional (CRYPTO_ALG_OPTIONAL_KEY).
 */
static void ahash_set_needkey(struct crypto_ahash *tfm, struct ahash_alg *alg)
{
        if (alg->setkey == ahash_nosetkey)
                return;
        if (alg->halg.base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY)
                return;

        crypto_ahash_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
}

/*
 * Set the key of an ahash tfm.
 *
 * shash-backed tfms forward to crypto_shash_setkey(). Native ahash tfms
 * call the algorithm's setkey and, when the algorithm needs a fallback,
 * also key the fallback tfm. On failure the NEED_KEY state is re-derived
 * and the error returned; on success CRYPTO_TFM_NEED_KEY is cleared and 0
 * is returned.
 */
int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
                        unsigned int keylen)
{
        int err;

        if (likely(tfm->using_shash)) {
                struct crypto_shash *shash = ahash_to_shash(tfm);

                err = crypto_shash_setkey(shash, key, keylen);
                if (unlikely(err)) {
                        /* Mirror the shash's NEED_KEY flag onto the ahash. */
                        crypto_ahash_set_flags(tfm,
                                               crypto_shash_get_flags(shash) &
                                               CRYPTO_TFM_NEED_KEY);
                        return err;
                }
        } else {
                struct ahash_alg *alg = crypto_ahash_alg(tfm);

                err = alg->setkey(tfm, key, keylen);
                /* Key the fallback only after the primary accepted the key. */
                if (!err && crypto_ahash_need_fallback(tfm))
                        err = crypto_ahash_setkey(crypto_ahash_fb(tfm),
                                                  key, keylen);
                if (unlikely(err)) {
                        ahash_set_needkey(tfm, alg);
                        return err;
                }
        }

        crypto_ahash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_ahash_setkey);

/*
 * Run the algorithm operation *@op on @req, diverting to the fallback tfm
 * when the request is virtually addressed but crypto_ahash_req_virt()
 * reports false for the tfm.
 *
 * @op must point at a member of crypto_ahash_alg(tfm) (digest, finup or
 * update); the operation is identified by comparing that address.
 *
 * Returns the operation's status, or -ENOSYS when the fallback route is
 * needed but unavailable (state larger than HASH_MAX_STATESIZE, no fallback
 * flagged, or crypto_hash_no_export_core() is true).
 */
static int ahash_do_req_chain(struct ahash_request *req,
                              int (*const *op)(struct ahash_request *req))
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
        int err;

        /* Common case: call the algorithm directly. */
        if (crypto_ahash_req_virt(tfm) || !ahash_request_isvirt(req))
                return (*op)(req);

        if (crypto_ahash_statesize(tfm) > HASH_MAX_STATESIZE)
                return -ENOSYS;

        if (!crypto_ahash_need_fallback(tfm))
                return -ENOSYS;

        if (crypto_hash_no_export_core(tfm))
                return -ENOSYS;

        {
                u8 state[HASH_MAX_STATESIZE];

                /* digest carries no prior state: just run it on the fallback. */
                if (op == &crypto_ahash_alg(tfm)->digest) {
                        ahash_request_set_tfm(req, crypto_ahash_fb(tfm));
                        err = crypto_ahash_digest(req);
                        goto out_no_state;
                }

                /* Move the running state from the original tfm to the fallback. */
                err = crypto_ahash_export(req, state);
                ahash_request_set_tfm(req, crypto_ahash_fb(tfm));
                err = err ?: crypto_ahash_import(req, state);

                /* finup: no state is copied back afterwards. */
                if (op == &crypto_ahash_alg(tfm)->finup) {
                        err = err ?: crypto_ahash_finup(req);
                        goto out_no_state;
                }

                /* update: run on the fallback, then export the new state. */
                err = err ?:
                      crypto_ahash_update(req) ?:
                      crypto_ahash_export(req, state);

                /* Switch back and import the state into the original tfm. */
                ahash_request_set_tfm(req, tfm);
                return err ?: crypto_ahash_import(req, state);

out_no_state:
                ahash_request_set_tfm(req, tfm);
                return err;
        }
}

/*
 * Start a new hash computation on @req.
 *
 * shash-backed tfms go straight to crypto_shash_init(). Otherwise returns
 * -ENOKEY if the tfm still needs a key and -EAGAIN for an on-stack request
 * on an async tfm. For block-only algorithms the last byte of the request
 * context is zeroed before the algorithm's init is called.
 */
int crypto_ahash_init(struct ahash_request *req)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);

        if (likely(tfm->using_shash))
                return crypto_shash_init(prepare_shash_desc(req, tfm));

        if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        if (ahash_req_on_stack(req) && ahash_is_async(tfm))
                return -EAGAIN;

        if (crypto_ahash_block_only(tfm)) {
                u8 *ctx = ahash_request_ctx(req);

                /* The final context byte holds the buffered-byte count. */
                ctx[crypto_ahash_reqsize(tfm) - 1] = 0;
        }

        return crypto_ahash_alg(tfm)->init(req);
}
EXPORT_SYMBOL_GPL(crypto_ahash_init);

/*
 * Stash the caller's completion callback and data in @req and install
 * @cplt in their place, with the request itself as callback data.
 */
static void ahash_save_req(struct ahash_request *req, crypto_completion_t cplt)
{
        req->saved_complete = req->base.complete;
        req->base.complete = cplt;

        req->saved_data = req->base.data;
        req->base.data = req;
}

/* Put back the completion callback and data stashed by ahash_save_req(). */
static void ahash_restore_req(struct ahash_request *req)
{
        req->base.data = req->saved_data;
        req->base.complete = req->saved_complete;
}

/*
 * Post-process a block-only update: revert the src/nbytes adjustments made
 * by crypto_ahash_update(), save the unprocessed tail of the input in the
 * block buffer, and restore the caller's completion.
 *
 * The request context ends with a one-byte count of buffered bytes,
 * immediately preceded by a buffer of one block size.
 *
 * @err: status of the update. A negative value leaves the buffer empty; a
 *       non-negative value is treated as the number of trailing input bytes
 *       still to be kept (plus one when the algorithm is FINAL_NONZERO).
 *
 * Returns @err unchanged.
 */
static int ahash_update_finish(struct ahash_request *req, int err)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
        bool nonzero = crypto_ahash_final_nonzero(tfm);
        int bs = crypto_ahash_blocksize(tfm);
        u8 *blenp = ahash_request_ctx(req);
        int blen;
        u8 *buf;

        blenp += crypto_ahash_reqsize(tfm) - 1;
        blen = *blenp;
        buf = blenp - bs;

        /*
         * Buffered bytes were prepended via sg_head[0]; step past it, and
         * through the chain link if sg_head[1] is one.
         */
        if (blen) {
                req->src = req->sg_head + 1;
                if (sg_is_chain(req->src))
                        req->src = sg_chain_ptr(req->src);
        }

        /* Undo "+= blen" and "-= nonzero" from crypto_ahash_update(). */
        req->nbytes += nonzero - blen;

        /* Keep the last blen bytes of the caller's data for the next call. */
        blen = err < 0 ? 0 : err + nonzero;
        if (ahash_request_isvirt(req))
                memcpy(buf, req->svirt + req->nbytes - blen, blen);
        else
                memcpy_from_sglist(buf, req->src, req->nbytes - blen, blen);
        *blenp = blen;

        ahash_restore_req(req);

        return err;
}

/*
 * Completion callback installed by crypto_ahash_update(); hands off to
 * ahash_op_done() with ahash_update_finish() as the post-processing step.
 */
static void ahash_update_done(void *data, int err)
{
        ahash_op_done(data, err, ahash_update_finish);
}

/*
 * Add the data described by @req to a running hash.
 *
 * shash-backed tfms use shash_ahash_update(). An on-stack request on an
 * async tfm is refused with -EAGAIN. Algorithms that are not block-only
 * are called through ahash_do_req_chain() without further processing.
 *
 * For block-only algorithms partial blocks are buffered at the end of the
 * request context (one block of data followed by a one-byte count):
 *  - if buffered plus new data stays below one block (plus one byte for
 *    FINAL_NONZERO algorithms), the data is only appended to the buffer and
 *    0 is returned;
 *  - otherwise any buffered bytes are prepended to the source through
 *    req->sg_head, the update is issued, and ahash_update_finish() runs
 *    either here or from the completion callback.
 *
 * Returns 0 or a negative error; -EINPROGRESS/-EBUSY are passed through
 * from the algorithm.
 */
int crypto_ahash_update(struct ahash_request *req)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
        bool nonzero = crypto_ahash_final_nonzero(tfm);
        int bs = crypto_ahash_blocksize(tfm);
        u8 *blenp = ahash_request_ctx(req);
        int blen, err;
        u8 *buf;

        if (likely(tfm->using_shash))
                return shash_ahash_update(req, ahash_request_ctx(req));
        if (ahash_req_on_stack(req) && ahash_is_async(tfm))
                return -EAGAIN;
        if (!crypto_ahash_block_only(tfm))
                return ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->update);

        /* Locate the buffered-byte count and the block buffer before it. */
        blenp += crypto_ahash_reqsize(tfm) - 1;
        blen = *blenp;
        buf = blenp - bs;

        /* Not enough for the algorithm yet: just buffer the new data. */
        if (blen + req->nbytes < bs + nonzero) {
                if (ahash_request_isvirt(req))
                        memcpy(buf + blen, req->svirt, req->nbytes);
                else
                        memcpy_from_sglist(buf + blen, req->src, 0,
                                           req->nbytes);

                *blenp += req->nbytes;
                return 0;
        }

        /* Prepend the buffered bytes as sg_head[0], chained to the source. */
        if (blen) {
                memset(req->sg_head, 0, sizeof(req->sg_head[0]));
                sg_set_buf(req->sg_head, buf, blen);
                if (req->src != req->sg_head + 1)
                        sg_chain(req->sg_head, 2, req->src);
                req->src = req->sg_head;
                req->nbytes += blen;
        }
        /* FINAL_NONZERO algorithms are given one byte less. */
        req->nbytes -= nonzero;

        ahash_save_req(req, ahash_update_done);

        err = ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->update);
        /* Async: ahash_update_done() will finish the request later. */
        if (err == -EINPROGRESS || err == -EBUSY)
                return err;

        return ahash_update_finish(req, err);
}
EXPORT_SYMBOL_GPL(crypto_ahash_update);

/*
 * Post-process a block-only finup: if buffered bytes had been prepended to
 * the request, point req->src back past them (or to NULL when the prepended
 * entry was the only one) and subtract them from req->nbytes, then restore
 * the caller's completion. Returns @err unchanged.
 */
static int ahash_finup_finish(struct ahash_request *req, int err)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
        u8 *ctx = ahash_request_ctx(req);
        /* The buffered-byte count is the last byte of the request context. */
        int buffered = ctx[crypto_ahash_reqsize(tfm) - 1];

        if (buffered) {
                if (!sg_is_last(req->src)) {
                        req->src = req->sg_head + 1;
                        if (sg_is_chain(req->src))
                                req->src = sg_chain_ptr(req->src);
                } else {
                        req->src = NULL;
                }
                req->nbytes -= buffered;
        }

        ahash_restore_req(req);

        return err;
}

/*
 * Completion callback that crypto_ahash_finup() registers through
 * ahash_save_req().  It forwards to ahash_op_done() with
 * ahash_finup_finish() as the finishing step.
 */
static void ahash_finup_done(void *data, int err)
{
        ahash_op_done(data, err, ahash_finup_finish);
}

/*
 * Run the finup operation for @req.
 *
 * Dispatch, in the order coded below:
 *  - a tfm backed by shash is handled by shash_ahash_finup();
 *  - an on-stack request on an async tfm is refused with -EAGAIN;
 *  - an algorithm without a ->finup hook goes through ahash_def_finup();
 *  - an algorithm that is not block-only has ->finup invoked directly;
 *  - otherwise (block-only) any buffered bytes kept at the end of the
 *    request context are placed in front of the source data first.
 *
 * Returns the result of the selected path.  -EINPROGRESS and -EBUSY from
 * the block-only path are returned as-is; ahash_finup_finish() is then
 * left to the completion callback.
 */
int crypto_ahash_finup(struct ahash_request *req)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
        int bs = crypto_ahash_blocksize(tfm);
        u8 *blenp = ahash_request_ctx(req);
        int blen, err;
        u8 *buf;

        if (likely(tfm->using_shash))
                return shash_ahash_finup(req, ahash_request_ctx(req));
        if (ahash_req_on_stack(req) && ahash_is_async(tfm))
                return -EAGAIN;
        if (!crypto_ahash_alg(tfm)->finup)
                return ahash_def_finup(req);
        if (!crypto_ahash_block_only(tfm))
                return ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->finup);

        /*
         * The context ends with a bs-byte buffer followed by a single byte
         * holding how many bytes of that buffer are in use.
         */
        blenp += crypto_ahash_reqsize(tfm) - 1;
        blen = *blenp;
        buf = blenp - bs;

        if (blen) {
                /* Use sg_head[0] for the buffered bytes ... */
                memset(req->sg_head, 0, sizeof(req->sg_head[0]));
                sg_set_buf(req->sg_head, buf, blen);
                /* ... and terminate it or link it to the caller's source. */
                if (!req->src)
                        sg_mark_end(req->sg_head);
                else if (req->src != req->sg_head + 1)
                        sg_chain(req->sg_head, 2, req->src);
                req->src = req->sg_head;
                req->nbytes += blen;
        }

        ahash_save_req(req, ahash_finup_done);

        err = ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->finup);
        if (err == -EINPROGRESS || err == -EBUSY)
                return err;

        /* Finished synchronously: undo the source adjustments right away. */
        return ahash_finup_finish(req, err);
}
EXPORT_SYMBOL_GPL(crypto_ahash_finup);

/*
 * Run the digest operation for @req.
 *
 * shash-backed tfms are delegated to shash_ahash_digest().  Otherwise an
 * on-stack request on an async tfm yields -EAGAIN, a tfm still flagged
 * CRYPTO_TFM_NEED_KEY yields -ENOKEY, and everything else is passed to the
 * algorithm's ->digest through ahash_do_req_chain().
 */
int crypto_ahash_digest(struct ahash_request *req)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
        int err;

        if (likely(tfm->using_shash))
                return shash_ahash_digest(req, prepare_shash_desc(req, tfm));

        if (ahash_req_on_stack(req) && ahash_is_async(tfm))
                err = -EAGAIN;
        else if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                err = -ENOKEY;
        else
                err = ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->digest);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_ahash_digest);

/*
 * Completion callback for the ->final step of the default finup.
 * An -EINPROGRESS notification is ignored; any other status restores the
 * request and completes it with that status.
 */
static void ahash_def_finup_done2(void *data, int err)
{
        struct ahash_request *areq = data;

        if (err != -EINPROGRESS) {
                ahash_restore_req(areq);
                ahash_request_complete(areq, err);
        }
}

/*
 * Second half of the default finup, run once the update step has ended
 * with status @err.
 *
 * On success the completion handler is switched to
 * ahash_def_finup_done2() and ->final is invoked.  If ->final reports
 * -EINPROGRESS or -EBUSY that status is returned immediately and the
 * request is left saved; in every other case the request is restored
 * before the status is returned.
 */
static int ahash_def_finup_finish1(struct ahash_request *req, int err)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);

        if (!err) {
                req->base.complete = ahash_def_finup_done2;

                err = crypto_ahash_alg(tfm)->final(req);
                if (err == -EINPROGRESS || err == -EBUSY)
                        return err;
        }

        ahash_restore_req(req);
        return err;
}

/*
 * Completion callback that ahash_def_finup() registers for its update
 * step.  It forwards to ahash_op_done() with ahash_def_finup_finish1() as
 * the finishing step.
 */
static void ahash_def_finup_done1(void *data, int err)
{
        ahash_op_done(data, err, ahash_def_finup_finish1);
}

/*
 * Default finup, built from an update followed by ->final.
 *
 * The request is saved with ahash_def_finup_done1() as its completion
 * handler, then crypto_ahash_update() is run.  -EINPROGRESS and -EBUSY
 * are returned unchanged; any other status is handed to
 * ahash_def_finup_finish1().
 */
static int ahash_def_finup(struct ahash_request *req)
{
        int err;

        ahash_save_req(req, ahash_def_finup_done1);

        err = crypto_ahash_update(req);
        if (err != -EINPROGRESS && err != -EBUSY)
                err = ahash_def_finup_finish1(req, err);

        return err;
}

/*
 * Export the core state of @req into @out.
 *
 * shash-backed tfms use crypto_shash_export_core() on the request
 * context; all others call the algorithm's ->export_core hook.
 */
int crypto_ahash_export_core(struct ahash_request *req, void *out)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);

        if (!likely(tfm->using_shash))
                return crypto_ahash_alg(tfm)->export_core(req, out);

        return crypto_shash_export_core(ahash_request_ctx(req), out);
}
EXPORT_SYMBOL_GPL(crypto_ahash_export_core);

int crypto_ahash_export(struct ahash_request *req, void *out)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);

        if (likely(tfm->using_shash))
                return crypto_shash_export(ahash_request_ctx(req), out);
        if (crypto_ahash_block_only(tfm)) {
                unsigned int plen = crypto_ahash_blocksize(tfm) + 1;
                unsigned int reqsize = crypto_ahash_reqsize(tfm);
                unsigned int ss = crypto_ahash_statesize(tfm);
                u8 *buf = ahash_request_ctx(req);

                memcpy(out + ss - plen, buf + reqsize - plen, plen);
        }
        return crypto_ahash_alg(tfm)->export(req, out);
}
EXPORT_SYMBOL_GPL(crypto_ahash_export);

/*
 * Import core state from @in into @req.
 *
 * shash-backed tfms use crypto_shash_import_core() on a freshly prepared
 * descriptor.  Otherwise -ENOKEY is returned while the tfm is flagged
 * CRYPTO_TFM_NEED_KEY, and the algorithm's ->import_core hook is called
 * when it is not.
 */
int crypto_ahash_import_core(struct ahash_request *req, const void *in)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
        int err;

        if (likely(tfm->using_shash))
                return crypto_shash_import_core(prepare_shash_desc(req, tfm),
                                                in);

        if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                err = -ENOKEY;
        else
                err = crypto_ahash_alg(tfm)->import_core(req, in);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_ahash_import_core);

/*
 * Import state from @in into @req.
 *
 * shash-backed tfms use crypto_shash_import().  Otherwise -ENOKEY is
 * returned while the tfm is flagged CRYPTO_TFM_NEED_KEY.  For block-only
 * algorithms the last byte of the request context is cleared before the
 * algorithm's ->import hook runs.
 */
int crypto_ahash_import(struct ahash_request *req, const void *in)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);

        if (likely(tfm->using_shash))
                return crypto_shash_import(prepare_shash_desc(req, tfm), in);

        if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        if (crypto_ahash_block_only(tfm)) {
                u8 *blenp = ahash_request_ctx(req);

                blenp += crypto_ahash_reqsize(tfm) - 1;
                *blenp = 0;
        }

        return crypto_ahash_alg(tfm)->import(req, in);
}
EXPORT_SYMBOL_GPL(crypto_ahash_import);

static void crypto_ahash_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_ahash *hash = __crypto_ahash_cast(tfm);
        struct ahash_alg *alg = crypto_ahash_alg(hash);

        if (alg->exit_tfm)
                alg->exit_tfm(hash);
        else if (tfm->__crt_alg->cra_exit)
                tfm->__crt_alg->cra_exit(tfm);

        if (crypto_ahash_need_fallback(hash))
                crypto_free_ahash(crypto_ahash_fb(hash));
}

/*
 * Frontend init hook for ahash transforms (see crypto_ahash_type).
 *
 * Copies statesize and reqsize from the algorithm, hands shash-typed
 * algorithms to crypto_init_ahash_using_shash(), allocates a fallback tfm
 * when one is needed, and then runs the algorithm's own init hook.
 *
 * Returns 0 on success or a negative errno.  On failure after the
 * fallback was allocated, the fallback is freed again.
 */
static int crypto_ahash_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_ahash *hash = __crypto_ahash_cast(tfm);
        struct ahash_alg *alg = crypto_ahash_alg(hash);
        struct crypto_ahash *fb = NULL;
        int err;

        crypto_ahash_set_statesize(hash, alg->halg.statesize);
        crypto_ahash_set_reqsize(hash, crypto_tfm_alg_reqsize(tfm));

        if (tfm->__crt_alg->cra_type == &crypto_shash_type)
                return crypto_init_ahash_using_shash(tfm);

        if (crypto_ahash_need_fallback(hash)) {
                /* Look up an implementation of the same algorithm name. */
                fb = crypto_alloc_ahash(crypto_ahash_alg_name(hash),
                                        CRYPTO_ALG_REQ_VIRT,
                                        CRYPTO_ALG_ASYNC |
                                        CRYPTO_ALG_REQ_VIRT |
                                        CRYPTO_AHASH_ALG_NO_EXPORT_CORE);
                if (IS_ERR(fb))
                        return PTR_ERR(fb);

                tfm->fb = crypto_ahash_tfm(fb);
        }

        ahash_set_needkey(hash, alg);

        tfm->exit = crypto_ahash_exit_tfm;

        /* Prefer the typed ->init_tfm hook over the generic ->cra_init. */
        if (alg->init_tfm)
                err = alg->init_tfm(hash);
        else if (tfm->__crt_alg->cra_init)
                err = tfm->__crt_alg->cra_init(tfm);
        else
                return 0;

        if (err)
                goto out_free_sync_hash;

        /* A sync tfm must not end up with a reqsize above the sync limit. */
        if (!ahash_is_async(hash) && crypto_ahash_reqsize(hash) >
                                     MAX_SYNC_HASH_REQSIZE)
                goto out_exit_tfm;

        /* Raise the request size to at least HASH_MAX_DESCSIZE. */
        BUILD_BUG_ON(HASH_MAX_DESCSIZE > MAX_SYNC_HASH_REQSIZE);
        if (crypto_ahash_reqsize(hash) < HASH_MAX_DESCSIZE)
                crypto_ahash_set_reqsize(hash, HASH_MAX_DESCSIZE);

        return 0;

out_exit_tfm:
        /* The init hook succeeded, so undo it with the matching exit hook. */
        if (alg->exit_tfm)
                alg->exit_tfm(hash);
        else if (tfm->__crt_alg->cra_exit)
                tfm->__crt_alg->cra_exit(tfm);
        err = -EINVAL;
out_free_sync_hash:
        crypto_free_ahash(fb);
        return err;
}

/*
 * Frontend extsize hook: a shash-typed algorithm needs room for one
 * struct crypto_shash pointer; every other algorithm uses
 * crypto_alg_extsize().
 */
static unsigned int crypto_ahash_extsize(struct crypto_alg *alg)
{
        if (alg->cra_type != &crypto_shash_type)
                return crypto_alg_extsize(alg);

        return sizeof(struct crypto_shash *);
}

/*
 * Frontend free hook: convert the generic instance to its ahash_instance
 * and call that instance's ->free callback.  ahash_register_instance()
 * refuses instances whose ->free is unset.
 */
static void crypto_ahash_free_instance(struct crypto_instance *inst)
{
        struct ahash_instance *ahash = ahash_instance(inst);

        ahash->free(ahash);
}

/*
 * Frontend report hook: fill a struct crypto_report_hash for @alg and
 * append it to @skb as a CRYPTOCFGA_REPORT_HASH attribute.
 *
 * Returns the result of nla_put().
 */
static int __maybe_unused crypto_ahash_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct crypto_report_hash rhash;

        /*
         * Clear the whole structure first: all sizeof(rhash) bytes are
         * copied into the message below, not just the fields set here.
         */
        memset(&rhash, 0, sizeof(rhash));

        strscpy(rhash.type, "ahash", sizeof(rhash.type));

        rhash.blocksize = alg->cra_blocksize;
        rhash.digestsize = __crypto_hash_alg_common(alg)->digestsize;

        return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
}

static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
/*
 * Frontend show hook: print the type, async flag, block size and digest
 * size of @alg to @m, one "name : value" line each.
 */
static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
{
        const char *async = str_yes_no(alg->cra_flags & CRYPTO_ALG_ASYNC);
        unsigned int digestsize = __crypto_hash_alg_common(alg)->digestsize;

        seq_printf(m, "type         : ahash\n");
        seq_printf(m, "async        : %s\n", async);
        seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
        seq_printf(m, "digestsize   : %u\n", digestsize);
}

/*
 * Frontend descriptor for the ahash type.  It is installed as the spawn
 * frontend by crypto_grab_ahash() and passed to crypto_alloc_tfm(),
 * crypto_type_has_alg() and crypto_clone_tfm() by the helpers in this
 * file.
 */
static const struct crypto_type crypto_ahash_type = {
        .extsize = crypto_ahash_extsize,
        .init_tfm = crypto_ahash_init_tfm,
        .free = crypto_ahash_free_instance,
        /* The show and report hooks are only wired up when configured in. */
#ifdef CONFIG_PROC_FS
        .show = crypto_ahash_show,
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_ahash_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_AHASH_MASK,
        .type = CRYPTO_ALG_TYPE_AHASH,
        /* Offsets of the embedded generic objects inside the ahash ones. */
        .tfmsize = offsetof(struct crypto_ahash, base),
        .algsize = offsetof(struct ahash_alg, halg.base),
};

/*
 * Set the ahash frontend on @spawn and grab the algorithm @name for
 * instance @inst through crypto_grab_spawn(), whose result is returned.
 */
int crypto_grab_ahash(struct crypto_ahash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask)
{
        spawn->base.frontend = &crypto_ahash_type;
        return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_ahash);

/*
 * Allocate an ahash transform for @alg_name by calling crypto_alloc_tfm()
 * with the ahash frontend.  Callers in this file check the result with
 * IS_ERR().
 */
struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type,
                                        u32 mask)
{
        return crypto_alloc_tfm(alg_name, &crypto_ahash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_alloc_ahash);

/*
 * Ask crypto_type_has_alg() whether an algorithm named @alg_name matching
 * @type and @mask is available for the ahash frontend, and return its
 * answer.
 */
int crypto_has_ahash(const char *alg_name, u32 type, u32 mask)
{
        return crypto_type_has_alg(alg_name, &crypto_ahash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_ahash);

/*
 * Report whether the hash algorithm @halg has a real setkey operation.
 *
 * shash-typed algorithms are delegated to crypto_shash_alg_has_setkey();
 * an ahash algorithm has one when its ->setkey is not the
 * ahash_nosetkey placeholder.
 */
bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg)
{
        struct crypto_alg *base = &halg->base;

        if (base->cra_type != &crypto_shash_type)
                return __crypto_ahash_alg(base)->setkey != ahash_nosetkey;

        return crypto_shash_alg_has_setkey(__crypto_shash_alg(base));
}
EXPORT_SYMBOL_GPL(crypto_hash_alg_has_setkey);

/*
 * Produce a clone of @hash.
 *
 * If the algorithm has no setkey operation, the same tfm is returned
 * after crypto_tfm_get() succeeds on it.  Otherwise a new tfm is created
 * with crypto_clone_tfm(), reqsize and statesize are copied over, and
 * then:
 *  - for a shash-backed tfm the underlying shash is cloned;
 *  - otherwise the fallback (if needed) is cloned and the algorithm's
 *    ->clone_tfm hook is called; -ENOSYS is returned when there is none.
 *
 * Returns the clone or an ERR_PTR().
 */
struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *hash)
{
        struct hash_alg_common *halg = crypto_hash_alg_common(hash);
        struct crypto_tfm *tfm = crypto_ahash_tfm(hash);
        struct crypto_ahash *fb = NULL;
        struct crypto_ahash *nhash;
        struct ahash_alg *alg;
        int err;

        /* No setkey: hand out the original tfm again instead of copying. */
        if (!crypto_hash_alg_has_setkey(halg)) {
                tfm = crypto_tfm_get(tfm);
                if (IS_ERR(tfm))
                        return ERR_CAST(tfm);

                return hash;
        }

        nhash = crypto_clone_tfm(&crypto_ahash_type, tfm);

        if (IS_ERR(nhash))
                return nhash;

        nhash->reqsize = hash->reqsize;
        nhash->statesize = hash->statesize;

        if (likely(hash->using_shash)) {
                struct crypto_shash **nctx = crypto_ahash_ctx(nhash);
                struct crypto_shash *shash;

                shash = crypto_clone_shash(ahash_to_shash(hash));
                if (IS_ERR(shash)) {
                        err = PTR_ERR(shash);
                        goto out_free_nhash;
                }
                crypto_ahash_tfm(nhash)->exit = crypto_exit_ahash_using_shash;
                nhash->using_shash = true;
                *nctx = shash;
                return nhash;
        }

        if (crypto_ahash_need_fallback(hash)) {
                fb = crypto_clone_ahash(crypto_ahash_fb(hash));
                /* err is only meaningful when fb is an error pointer. */
                err = PTR_ERR(fb);
                if (IS_ERR(fb))
                        goto out_free_nhash;

                crypto_ahash_tfm(nhash)->fb = crypto_ahash_tfm(fb);
        }

        err = -ENOSYS;
        alg = crypto_ahash_alg(hash);
        if (!alg->clone_tfm)
                goto out_free_fb;

        err = alg->clone_tfm(nhash, hash);
        if (err)
                goto out_free_fb;

        /* Install the exit handler only once the clone fully succeeded. */
        crypto_ahash_tfm(nhash)->exit = crypto_ahash_exit_tfm;

        return nhash;

out_free_fb:
        crypto_free_ahash(fb);
out_free_nhash:
        crypto_free_ahash(nhash);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(crypto_clone_ahash);

/*
 * Placeholder ->export_core that always fails with -ENOSYS.
 * ahash_prepare_alg() installs it when an algorithm does not provide both
 * core hooks.
 */
static int ahash_default_export_core(struct ahash_request *req, void *out)
{
        return -ENOSYS;
}

/*
 * Placeholder ->import_core that always fails with -ENOSYS.
 * ahash_prepare_alg() installs it when an algorithm does not provide both
 * core hooks.
 */
static int ahash_default_import_core(struct ahash_request *req, const void *in)
{
        return -ENOSYS;
}

/*
 * Validate @alg and fill in defaults before it is registered.
 *
 * Returns 0 on success, -EINVAL when the description is inconsistent, or
 * the error reported by hash_prepare_alg().
 */
static int ahash_prepare_alg(struct ahash_alg *alg)
{
        struct crypto_alg *base = &alg->halg.base;
        int err;

        if (alg->halg.statesize == 0)
                return -EINVAL;

        /* A non-zero reqsize must be at least as large as the state. */
        if (base->cra_reqsize && base->cra_reqsize < alg->halg.statesize)
                return -EINVAL;

        if (!(base->cra_flags & CRYPTO_ALG_ASYNC) &&
            base->cra_reqsize > MAX_SYNC_HASH_REQSIZE)
                return -EINVAL;

        /* NEED_FALLBACK and NO_FALLBACK contradict each other. */
        if (base->cra_flags & CRYPTO_ALG_NEED_FALLBACK &&
            base->cra_flags & CRYPTO_ALG_NO_FALLBACK)
                return -EINVAL;

        err = hash_prepare_alg(&alg->halg);
        if (err)
                return err;

        base->cra_type = &crypto_ahash_type;
        base->cra_flags |= CRYPTO_ALG_TYPE_AHASH;

        /*
         * The masked XOR is non-zero when ASYNC is set or REQ_VIRT is
         * clear; such algorithms get a fallback unless they opted out.
         */
        if ((base->cra_flags ^ CRYPTO_ALG_REQ_VIRT) &
            (CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_VIRT) &&
            !(base->cra_flags & CRYPTO_ALG_NO_FALLBACK))
                base->cra_flags |= CRYPTO_ALG_NEED_FALLBACK;

        if (!alg->setkey)
                alg->setkey = ahash_nosetkey;

        if (base->cra_flags & CRYPTO_AHASH_ALG_BLOCK_ONLY) {
                /* The buffered length is kept in a single byte. */
                BUILD_BUG_ON(MAX_ALGAPI_BLOCKSIZE >= 256);
                if (!alg->finup)
                        return -EINVAL;

                /* Reserve one block plus one length byte in both sizes. */
                base->cra_reqsize += base->cra_blocksize + 1;
                alg->halg.statesize += base->cra_blocksize + 1;
                alg->export_core = alg->export;
                alg->import_core = alg->import;
        } else if (!alg->export_core || !alg->import_core) {
                /* Incomplete core hooks: replace both and flag the alg. */
                alg->export_core = ahash_default_export_core;
                alg->import_core = ahash_default_import_core;
                base->cra_flags |= CRYPTO_AHASH_ALG_NO_EXPORT_CORE;
        }

        return 0;
}

/*
 * Register a single ahash algorithm.
 *
 * @alg is first checked and completed by ahash_prepare_alg(); if that
 * fails its error is returned, otherwise the result of
 * crypto_register_alg() on the embedded base algorithm is returned.
 */
int crypto_register_ahash(struct ahash_alg *alg)
{
        int err = ahash_prepare_alg(alg);

        if (!err)
                err = crypto_register_alg(&alg->halg.base);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_register_ahash);

/*
 * Unregister a single ahash algorithm by passing its embedded base
 * algorithm to crypto_unregister_alg().
 */
void crypto_unregister_ahash(struct ahash_alg *alg)
{
        crypto_unregister_alg(&alg->halg.base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_ahash);

/*
 * Register the first @count entries of @algs, in order.
 *
 * If one registration fails, the entries registered before it are
 * unregistered again in reverse order and the error is returned.
 * Returns 0 when every entry was registered.
 */
int crypto_register_ahashes(struct ahash_alg *algs, int count)
{
        int i, ret;

        for (i = 0; i < count; i++) {
                ret = crypto_register_ahash(&algs[i]);
                if (!ret)
                        continue;

                /* Roll back everything registered so far. */
                while (--i >= 0)
                        crypto_unregister_ahash(&algs[i]);

                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_ahashes);

/*
 * Unregister the first @count entries of @algs, last entry first.
 */
void crypto_unregister_ahashes(struct ahash_alg *algs, int count)
{
        int i = count;

        while (i-- > 0)
                crypto_unregister_ahash(&algs[i]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_ahashes);

/*
 * Register the ahash instance @inst for template @tmpl.
 *
 * An instance without a ->free callback triggers a warning and is
 * rejected with -EINVAL.  Otherwise the embedded algorithm is prepared
 * with ahash_prepare_alg() and, on success, registered through
 * crypto_register_instance().
 */
int ahash_register_instance(struct crypto_template *tmpl,
                            struct ahash_instance *inst)
{
        int err;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        err = ahash_prepare_alg(&inst->alg);
        if (!err)
                err = crypto_register_instance(tmpl,
                                               ahash_crypto_instance(inst));

        return err;
}
EXPORT_SYMBOL_GPL(ahash_register_instance);

/*
 * Release @req.  A NULL request is ignored.  A request that
 * ahash_req_on_stack() reports as on-stack is only zeroed with
 * ahash_request_zero(); any other request is released with kfree().
 */
void ahash_request_free(struct ahash_request *req)
{
        if (unlikely(!req))
                return;

        if (ahash_req_on_stack(req))
                ahash_request_zero(req);
        else
                kfree(req);
}
EXPORT_SYMBOL_GPL(ahash_request_free);

/*
 * One-shot digest of the @len bytes at @data, written to @out.
 *
 * The request is declared with HASH_REQUEST_ON_STACK() against
 * crypto_ahash_fb(tfm), given no completion callback, pointed at the
 * virtual addresses @data and @out, run through crypto_ahash_digest()
 * and zeroed afterwards.
 *
 * Returns the status of crypto_ahash_digest().
 */
int crypto_hash_digest(struct crypto_ahash *tfm, const u8 *data,
                       unsigned int len, u8 *out)
{
        HASH_REQUEST_ON_STACK(req, crypto_ahash_fb(tfm));
        int err;

        ahash_request_set_callback(req, 0, NULL, NULL);
        ahash_request_set_virt(req, data, out, len);
        err = crypto_ahash_digest(req);

        /* Zero the request on every path before returning. */
        ahash_request_zero(req);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_hash_digest);

/*
 * Free helper for an ahash instance whose context is a single spawn:
 * drop the spawn found at ahash_instance_ctx(inst), then kfree() the
 * instance itself.
 */
void ahash_free_singlespawn_instance(struct ahash_instance *inst)
{
        crypto_drop_spawn(ahash_instance_ctx(inst));
        kfree(inst);
}
EXPORT_SYMBOL_GPL(ahash_free_singlespawn_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Asynchronous cryptographic hash type");


































































































































































































































   17 




































































   15 






















    2 






























































    9 




































































































































































































































































































































































































































































































































































































   11 












   76 













   12 



   10 
   16 











    3 
    6 





























   12 
   25 
   86 


















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <linux/container_of.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>

#include <asm/barrier.h>

/*
 * Circular doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/**
 * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself
 * @name: name of the list_head
 *
 * Expands to a brace initializer in which both link pointers hold the
 * address of @name, i.e. the static counterpart of INIT_LIST_HEAD().
 */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/**
 * LIST_HEAD - definition of a &struct list_head with initialization values
 * @name: name of the list_head
 *
 * Defines a &struct list_head variable called @name and initializes it
 * with LIST_HEAD_INIT(), so it starts out pointing to itself.
 */
#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)

/**
 * INIT_LIST_HEAD - Initialize a list_head structure
 * @list: list_head structure to be initialized.
 *
 * Initializes the list_head to point to itself.  If it is a list header,
 * the result is an empty list.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
        /* Both links are stored with WRITE_ONCE() and point back at @list. */
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

#ifdef CONFIG_LIST_HARDENED

#ifdef CONFIG_DEBUG_LIST
# define __list_valid_slowpath
#else
# define __list_valid_slowpath __cold __preserve_most
#endif

/*
 * Performs the full set of list corruption checks before __list_add().
 * On list corruption reports a warning, and returns false.
 */
bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new,
                                                      struct list_head *prev,
                                                      struct list_head *next);

/*
 * Performs list corruption checks before __list_add(). Returns false if a
 * corruption is detected, true otherwise.
 *
 * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
 * inline to catch non-faulting corruptions, and only if a corruption is
 * detected calls the reporting function __list_add_valid_or_report().
 */
static __always_inline bool __list_add_valid(struct list_head *new,
                                             struct list_head *prev,
                                             struct list_head *next)
{
        /* Stays true with CONFIG_DEBUG_LIST so the report decides alone. */
        bool ret = true;

        if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
                /*
                 * With the hardening version, elide checking if next and prev
                 * are NULL, since the immediate dereference of them below would
                 * result in a fault if NULL.
                 *
                 * With the reduced set of checks, we can afford to inline the
                 * checks, which also gives the compiler a chance to elide some
                 * of them completely if they can be proven at compile-time. If
                 * one of the pre-conditions does not hold, the slow-path will
                 * show a report which pre-condition failed.
                 */
                if (likely(next->prev == prev && prev->next == next && new != prev && new != next))
                        return true;
                /* Inline check failed: the result is false whatever the report says. */
                ret = false;
        }

        ret &= __list_add_valid_or_report(new, prev, next);
        return ret;
}

/*
 * Performs the full set of list corruption checks before __list_del_entry().
 * On list corruption reports a warning, and returns false.
 */
bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry);

/*
 * Performs list corruption checks before __list_del_entry(). Returns false if a
 * corruption is detected, true otherwise.
 *
 * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
 * inline to catch non-faulting corruptions, and only if a corruption is
 * detected calls the reporting function __list_del_entry_valid_or_report().
 */
static __always_inline bool __list_del_entry_valid(struct list_head *entry)
{
        /* Stays true with CONFIG_DEBUG_LIST so the report decides alone. */
        bool ret = true;

        if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
                struct list_head *prev = entry->prev;
                struct list_head *next = entry->next;

                /*
                 * With the hardening version, elide checking if next and prev
                 * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate
                 * dereference of them below would result in a fault.
                 */
                if (likely(prev->next == entry && next->prev == entry))
                        return true;
                /* Inline check failed: the result is false whatever the report says. */
                ret = false;
        }

        ret &= __list_del_entry_valid_or_report(entry);
        return ret;
}
#else
/* Without CONFIG_LIST_HARDENED no check is done: every add is accepted. */
static inline bool __list_add_valid(struct list_head *new,
                                struct list_head *prev,
                                struct list_head *next)
{
        return true;
}
/* Without CONFIG_LIST_HARDENED no check is done: every delete is accepted. */
static inline bool __list_del_entry_valid(struct list_head *entry)
{
        return true;
}
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
{
        /* Leave the list untouched when the validity check fails. */
        if (!__list_add_valid(new, prev, next))
                return;

        next->prev = new;
        new->next = next;
        new->prev = prev;
        /* prev->next is written last, once @new's own links are in place. */
        WRITE_ONCE(prev->next, new);
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
        struct list_head *first = head->next;

        /* Link @new between @head and the entry that currently follows it. */
        __list_add(new, head, first);
}


/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
        struct list_head *last = head->prev;

        /* Link @new between the entry that currently precedes @head and @head. */
        __list_add(new, last, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
        /* Make @prev and @next neighbours; the forward link uses WRITE_ONCE(). */
        next->prev = prev;
        WRITE_ONCE(prev->next, next);
}

/*
 * Delete a list entry and clear the 'prev' pointer.
 *
 * This is a special-purpose list clearing method used in the networking code
 * for lists allocated as per-cpu, where we don't want to incur the extra
 * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this
 * needs to check the node 'prev' pointer instead of calling list_empty().
 */
static inline void __list_del_clearprev(struct list_head *entry)
{
        struct list_head *before = entry->prev;
        struct list_head *after = entry->next;

        /* Unlink without validity checks, then mark the node via a NULL prev. */
        __list_del(before, after);
        entry->prev = NULL;
}

/*
 * Unlink @entry from its neighbours, but only when
 * __list_del_entry_valid() accepts it.  @entry's own pointers are left
 * as they were.
 */
static inline void __list_del_entry(struct list_head *entry)
{
        if (__list_del_entry_valid(entry))
                __list_del(entry->prev, entry->next);
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state.
 */
static inline void list_del(struct list_head *entry)
{
        __list_del_entry(entry);
        /* Overwrite both links with the poison values after unlinking. */
        entry->next = LIST_POISON1;
        entry->prev = LIST_POISON2;
}

/**
 * list_replace - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 */
static inline void list_replace(struct list_head *old,
                                struct list_head *new)
{
        /*
         * The order of these four statements matters.  When @old points to
         * itself (an empty head), the second statement redirects old->prev
         * to @new before the third one reads it, so @new ends up pointing
         * to itself as well.
         */
        new->next = old->next;
        new->next->prev = new;
        new->prev = old->prev;
        new->prev->next = new;
}

/**
 * list_replace_init - replace old entry by new one and initialize the old one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * Same as list_replace(), followed by INIT_LIST_HEAD() on @old.
 */
static inline void list_replace_init(struct list_head *old,
                                     struct list_head *new)
{
        list_replace(old, new);
        /* list_replace() leaves @old untouched apart from the swap; reset it. */
        INIT_LIST_HEAD(old);
}

/**
 * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
 * @entry1: the location to place entry2
 * @entry2: the location to place entry1
 *
 * The node that preceded @entry2 is remembered as the re-insertion point for
 * @entry1. If that node was @entry1 itself (the two were adjacent), @entry1
 * is re-added right after @entry2 instead.
 */
static inline void list_swap(struct list_head *entry1,
                             struct list_head *entry2)
{
        struct list_head *anchor = entry2->prev;

        list_del(entry2);
        list_replace(entry1, entry2);
        list_add(entry1, anchor == entry1 ? entry2 : anchor);
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * Unlike list_del(), no poison values are stored: @entry is passed to
 * INIT_LIST_HEAD() after being unlinked.
 */
static inline void list_del_init(struct list_head *entry)
{
        __list_del_entry(entry);
        INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 *
 * @list is unlinked with __list_del_entry() and then handed to
 * list_add(@list, @head).
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
        __list_del_entry(list);
        list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 *
 * @list is unlinked with __list_del_entry() and then handed to
 * list_add_tail(@list, @head).
 */
static inline void list_move_tail(struct list_head *list,
                                  struct list_head *head)
{
        __list_del_entry(list);
        list_add_tail(list, head);
}

/**
 * list_bulk_move_tail - move a subsection of a list to its tail
 * @head: the head that will follow our entry
 * @first: first entry to move
 * @last: last entry to move, can be the same as first
 *
 * Move all entries between @first and including @last before @head.
 * All three entries must belong to the same linked list.
 *
 * The statements below re-read pointers written by earlier statements, so
 * their order matters (e.g. when @last is already directly before @head).
 */
static inline void list_bulk_move_tail(struct list_head *head,
                                       struct list_head *first,
                                       struct list_head *last)
{
        /* Close the gap left behind by the [first, last] run. */
        first->prev->next = last->next;
        last->next->prev = first->prev;

        /* Attach the front of the run to the current tail of the list. */
        head->prev->next = first;
        first->prev = head->prev;

        /* Attach the back of the run to @head; @last becomes the tail. */
        last->next = head;
        head->prev = last;
}

/**
 * list_is_first -- tests whether @list is the first entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero when the node before @list is @head, zero otherwise.
 */
static inline int list_is_first(const struct list_head *list, const struct list_head *head)
{
        const struct list_head *before = list->prev;

        return before == head;
}

/**
 * list_is_last - tests whether @list is the last entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero when the node after @list is @head, zero otherwise.
 */
static inline int list_is_last(const struct list_head *list, const struct list_head *head)
{
        const struct list_head *after = list->next;

        return after == head;
}

/**
 * list_is_head - tests whether @list is the list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Pure pointer comparison; neither argument is dereferenced.
 *
 * Return: non-zero when both pointers are equal, zero otherwise.
 */
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
{
        return head == list;
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 *
 * The head's next pointer is sampled once with READ_ONCE(); the list counts
 * as empty when it points back at @head.
 */
static inline int list_empty(const struct list_head *head)
{
        const struct list_head *first = READ_ONCE(head->next);

        return first == head;
}

/**
 * list_del_init_careful - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * This is the same as list_del_init(), except designed to be used
 * together with list_empty_careful() in a way to guarantee ordering
 * of other memory operations.
 *
 * Any memory operations done before a list_del_init_careful() are
 * guaranteed to be visible after a list_empty_careful() test.
 */
static inline void list_del_init_careful(struct list_head *entry)
{
        __list_del_entry(entry);
        /* prev is made self-referential first ... */
        WRITE_ONCE(entry->prev, entry);
        /*
         * ... and next last, with release semantics; list_empty_careful()
         * reads head->next with smp_load_acquire().
         */
        smp_store_release(&entry->next, entry);
}

/**
 * list_empty_careful - tests whether a list is empty and not being modified
 * @head: the list to test
 *
 * Description:
 * tests whether a list is empty _and_ checks that no other CPU might be
 * in the process of modifying either member (next or prev)
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 */
static inline int list_empty_careful(const struct list_head *head)
{
        struct list_head *first = smp_load_acquire(&head->next);

        /* prev is only inspected once next is known to point at the head. */
        if (!list_is_head(first, head))
                return 0;
        return first == READ_ONCE(head->prev);
}

/**
 * list_rotate_left - rotate the list to the left
 * @head: the head of the list
 *
 * Moves the current first entry to the tail. An empty list is left alone.
 */
static inline void list_rotate_left(struct list_head *head)
{
        if (list_empty(head))
                return;

        list_move_tail(head->next, head);
}

/**
 * list_rotate_to_front() - Rotate list to specific item.
 * @list: The desired new front of the list.
 * @head: The head of the list.
 *
 * Rotates list so that @list becomes the new front of the list.
 * Note the argument order: it is @head that gets moved, to sit directly
 * before @list.
 */
static inline void list_rotate_to_front(struct list_head *list,
                                        struct list_head *head)
{
        /*
         * Deletes the list head from the list denoted by @head and
         * places it as the tail of @list, this effectively rotates the
         * list so that @list is at the front.
         */
        list_move_tail(head, list);
}

/**
 * list_is_singular - tests whether a list has just one entry.
 * @head: the list to test.
 *
 * Return: non-zero when the list is non-empty and its first and last
 * entries are the same node, zero otherwise.
 */
static inline int list_is_singular(const struct list_head *head)
{
        if (list_empty(head))
                return 0;

        return head->next == head->prev;
}

/*
 * Move the entries from head->next up to and including @entry onto @list,
 * leaving whatever followed @entry on @head. No checks are done here;
 * list_cut_position() is the checked entry point.
 */
static inline void __list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        /* Save what follows @entry before entry->next is overwritten below. */
        struct list_head *new_first = entry->next;
        /* @list takes over the front part: head->next ... @entry. */
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry;
        entry->next = list;
        /* @head keeps the remainder, starting at the saved node. */
        head->next = new_first;
        new_first->prev = head;
}

/**
 * list_cut_position - cut a list into two
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *        and if so we won't cut the list
 *
 * This helper moves the initial part of @head, up to and
 * including @entry, from @head to @list. You should
 * pass on @entry an element you know is on @head. @list
 * should be an empty list or a list you do not care about
 * losing its data.
 *
 * Nothing happens (and @list is not touched) when @head is empty, or when
 * @head has a single entry and @entry is neither that entry nor @head.
 */
static inline void list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        if (list_empty(head))
                return;

        /* Cutting at the head itself moves nothing: @list becomes empty. */
        if (list_is_head(entry, head)) {
                INIT_LIST_HEAD(list);
                return;
        }

        /* Single-entry list, and @entry is not that entry: refuse. */
        if (list_is_singular(head) && entry != head->next)
                return;

        __list_cut_position(list, head, entry);
}

/**
 * list_cut_before - cut a list into two, before given entry
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *
 * This helper moves the initial part of @head, up to but
 * excluding @entry, from @head to @list.  You should pass
 * in @entry an element you know is on @head.  @list should
 * be an empty list or a list you do not care about losing
 * its data.
 * If @entry == @head, all entries on @head are moved to
 * @list.
 */
static inline void list_cut_before(struct list_head *list,
                                   struct list_head *head,
                                   struct list_head *entry)
{
        /* @entry is already first (or @head is empty): nothing to move. */
        if (head->next == entry) {
                INIT_LIST_HEAD(list);
                return;
        }
        /* @list takes over head->next ... entry->prev. */
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry->prev;
        list->prev->next = list;
        /* @entry becomes the first node after @head. */
        head->next = entry;
        entry->prev = head;
}

/*
 * Insert all entries of @list between @prev and @next.
 *
 * @list itself is only read, never written, so it still points at the moved
 * entries afterwards. No emptiness check is done here; the list_splice*()
 * wrappers do that before calling.
 */
static inline void __list_splice(const struct list_head *list,
                                 struct list_head *prev,
                                 struct list_head *next)
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        /* Join @prev to the first moved entry. */
        first->prev = prev;
        prev->next = first;

        /* Join the last moved entry to @next. */
        last->next = next;
        next->prev = last;
}

/**
 * list_splice - join two lists, this is designed for stacks
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right after @head. @list itself is
 * not reinitialised; an empty @list is a no-op.
 */
static inline void list_splice(const struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
}

/**
 * list_splice_tail - join two lists, each list being a queue
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right before @head. @list itself is
 * not reinitialised; an empty @list is a no-op.
 */
static inline void list_splice_tail(struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right after @head and @list is then
 * reinitialised. An empty @list is left untouched.
 */
static inline void list_splice_init(struct list_head *list,
                                    struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
        INIT_LIST_HEAD(list);
}

/**
 * list_splice_tail_init - join two lists and reinitialise the emptied list
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Each of the lists is a queue.
 * The entries of @list are inserted right before @head and @list is then
 * reinitialised. An empty @list is left untouched.
 */
static inline void list_splice_tail_init(struct list_head *list,
                                         struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
        INIT_LIST_HEAD(list);
}

/**
 * list_entry - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Plain alias for container_of(); no check is made that @ptr really is
 * embedded in a @type.
 */
#define list_entry(ptr, type, member) \
        container_of(ptr, type, member)

/**
 * list_first_entry - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note, that list is expected to be not empty.
 * On an empty list (ptr)->next is the head itself, so the result would be
 * computed from the head and is not a real element; see
 * list_first_entry_or_null() for a checked variant.
 */
#define list_first_entry(ptr, type, member) \
        list_entry((ptr)->next, type, member)

/**
 * list_last_entry - get the last element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note, that list is expected to be not empty.
 * On an empty list (ptr)->prev is the head itself, so the result would be
 * computed from the head and is not a real element; see
 * list_last_entry_or_null() for a checked variant.
 */
#define list_last_entry(ptr, type, member) \
        list_entry((ptr)->prev, type, member)

/**
 * list_first_entry_or_null - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated exactly once and the head's next pointer is sampled
 * once with READ_ONCE(). The expansion declares locals named head__ and
 * pos__, so @ptr must not refer to identifiers with those names.
 */
#define list_first_entry_or_null(ptr, type, member) ({ \
        struct list_head *head__ = (ptr); \
        struct list_head *pos__ = READ_ONCE(head__->next); \
        pos__ != head__ ? list_entry(pos__, type, member) : NULL; \
})

/**
 * list_last_entry_or_null - get the last element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated exactly once and the head's prev pointer is sampled
 * once with READ_ONCE(). The expansion declares locals named head__ and
 * pos__, so @ptr must not refer to identifiers with those names.
 */
#define list_last_entry_or_null(ptr, type, member) ({ \
        struct list_head *head__ = (ptr); \
        struct list_head *pos__ = READ_ONCE(head__->prev); \
        pos__ != head__ ? list_entry(pos__, type, member) : NULL; \
})

/**
 * list_next_entry - get the next element in list
 * @pos:        the type * to cursor
 * @member:        the name of the list_head within the struct.
 *
 * No end-of-list check is made: the result is whatever container
 * (pos)->member.next maps to. Use list_next_entry_circular() when
 * wrap-around to the first element is wanted.
 */
#define list_next_entry(pos, member) \
        list_entry((pos)->member.next, typeof(*(pos)), member)

/**
 * list_next_entry_circular - get the next element in list
 * @pos:        the type * to cursor.
 * @head:        the list head to take the element from.
 * @member:        the name of the list_head within the struct.
 *
 * Wraparound if pos is the last element (return the first element).
 * Note, that list is expected to be not empty.
 *
 * @pos and @head each appear more than once in the expansion, so avoid
 * arguments with side effects.
 */
#define list_next_entry_circular(pos, head, member) \
        (list_is_last(&(pos)->member, head) ? \
        list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member))

/**
 * list_prev_entry - get the prev element in list
 * @pos:        the type * to cursor
 * @member:        the name of the list_head within the struct.
 *
 * No start-of-list check is made: the result is whatever container
 * (pos)->member.prev maps to. Use list_prev_entry_circular() when
 * wrap-around to the last element is wanted.
 */
#define list_prev_entry(pos, member) \
        list_entry((pos)->member.prev, typeof(*(pos)), member)

/**
 * list_prev_entry_circular - get the prev element in list
 * @pos:        the type * to cursor.
 * @head:        the list head to take the element from.
 * @member:        the name of the list_head within the struct.
 *
 * Wraparound if pos is the first element (return the last element).
 * Note, that list is expected to be not empty.
 *
 * @pos and @head each appear more than once in the expansion, so avoid
 * arguments with side effects.
 */
#define list_prev_entry_circular(pos, head, member) \
        (list_is_first(&(pos)->member, head) ? \
        list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member))

/**
 * list_for_each        -        iterate over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * @pos may be any modifiable lvalue expression (e.g. ``*cursor_ptr``); it is
 * fully parenthesised in the expansion. The next node is read from @pos
 * after the body has run, so the body must not unlink @pos; use
 * list_for_each_safe() for that. When the loop ends without a break, @pos
 * equals @head.
 */
#define list_for_each(pos, head) \
        for ((pos) = (head)->next; !list_is_head((pos), (head)); (pos) = (pos)->next)

/**
 * list_for_each_continue - continue iteration over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Continue to iterate over a list, continuing after the current position.
 * The node @pos points at on entry is skipped; iteration starts with its
 * successor. @pos may be any modifiable lvalue expression; it is fully
 * parenthesised in the expansion.
 */
#define list_for_each_continue(pos, head) \
        for ((pos) = (pos)->next; !list_is_head((pos), (head)); (pos) = (pos)->next)

/**
 * list_for_each_prev        -        iterate over a list backwards
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Walks from the last node towards the first via ->prev. @pos may be any
 * modifiable lvalue expression; it is fully parenthesised in the expansion.
 * The body must not unlink @pos; use list_for_each_prev_safe() for that.
 */
#define list_for_each_prev(pos, head) \
        for ((pos) = (head)->prev; !list_is_head((pos), (head)); (pos) = (pos)->prev)

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * @n always holds the successor of @pos, fetched before the body runs, so
 * the body may unlink @pos. It offers no protection if the body removes @n.
 * @pos and @n may be any modifiable lvalue expressions; both are fully
 * parenthesised in the expansion.
 */
#define list_for_each_safe(pos, n, head) \
        for ((pos) = (head)->next, (n) = (pos)->next; \
             !list_is_head((pos), (head)); \
             (pos) = (n), (n) = (pos)->next)

/**
 * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * @n always holds the predecessor of @pos, fetched before the body runs, so
 * the body may unlink @pos. It offers no protection if the body removes @n.
 * @pos and @n may be any modifiable lvalue expressions; both are fully
 * parenthesised in the expansion.
 */
#define list_for_each_prev_safe(pos, n, head) \
        for ((pos) = (head)->prev, (n) = (pos)->prev; \
             !list_is_head((pos), (head)); \
             (pos) = (n), (n) = (pos)->prev)

/**
 * list_count_nodes - count nodes in the list
 * @head:        the head for your list.
 */
static inline size_t list_count_nodes(struct list_head *head)
{
        struct list_head *pos;
        size_t count = 0;

        list_for_each(pos, head)
                count++;

        return count;
}

/**
 * list_entry_is_head - test if the entry points to the head of the list
 * @pos:        the type * to cursor
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * True when the list_head embedded in @pos is @head itself, i.e. @pos is
 * the pseudo-entry computed from the head rather than a real element.
 * @pos is parenthesised so that arbitrary pointer expressions (such as
 * ``*pp`` or a cast) can be passed.
 */
#define list_entry_is_head(pos, head, member)                                \
        list_is_head(&(pos)->member, (head))

/**
 * list_for_each_entry        -        iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * The next entry is fetched from @pos after the body has run, so the body
 * must not unlink @pos; use list_for_each_entry_safe() for that.
 * When the loop ends without a break, @pos is the pseudo-entry computed
 * from @head (list_entry_is_head() is true), not a real element.
 */
#define list_for_each_entry(pos, head, member)                                \
        for (pos = list_first_entry(head, typeof(*pos), member);        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Starts at the last entry and follows ->prev. The previous entry is
 * fetched from @pos after the body has run, so the body must not unlink
 * @pos; use list_for_each_entry_safe_reverse() for that.
 * When the loop ends without a break, @pos is the pseudo-entry computed
 * from @head, not a real element.
 */
#define list_for_each_entry_reverse(pos, head, member)                        \
        for (pos = list_last_entry(head, typeof(*pos), member);                \
             !list_entry_is_head(pos, head, member);                         \
             pos = list_prev_entry(pos, member))

/**
 * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
 * @pos:        the type * to use as a start point
 * @head:        the head of the list
 * @member:        the name of the list_head within the struct.
 *
 * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
 *
 * Yields @pos itself when it is non-NULL; otherwise yields the pseudo-entry
 * computed from @head, so that a following list_for_each_entry_continue()
 * starts at the first real element. @pos is parenthesised inside typeof()
 * so that non-trivial expressions (e.g. a conditional) can be passed.
 */
#define list_prepare_entry(pos, head, member) \
        ((pos) ? : list_entry(head, typeof(*(pos)), member))

/**
 * list_for_each_entry_continue - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.
 * The entry @pos points at on entry is skipped; the first body run sees
 * its successor. @pos must therefore already be valid (see
 * list_prepare_entry()).
 */
#define list_for_each_entry_continue(pos, head, member)                 \
        for (pos = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_continue_reverse - iterate backwards from the given point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Start to iterate over list of given type backwards, continuing after
 * the current position.
 * The entry @pos points at on entry is skipped; the first body run sees
 * its predecessor.
 */
#define list_for_each_entry_continue_reverse(pos, head, member)                \
        for (pos = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_from - iterate over list of given type from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing from current position.
 * Unlike list_for_each_entry_continue(), the entry @pos points at on entry
 * is itself visited first (the loop has no init expression).
 */
#define list_for_each_entry_from(pos, head, member)                         \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_from_reverse - iterate backwards over list of given type
 *                                    from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, continuing from current position.
 * The entry @pos points at on entry is itself visited first (the loop has
 * no init expression).
 */
#define list_for_each_entry_from_reverse(pos, head, member)                \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * @n always holds the entry after @pos, fetched before the body runs, so
 * the body may unlink (and free) @pos. It offers no protection if the body
 * removes @n.
 */
#define list_for_each_entry_safe(pos, n, head, member)                        \
        for (pos = list_first_entry(head, typeof(*pos), member),        \
                n = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_continue - continue list iteration safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing after current point,
 * safe against removal of list entry.
 * The entry @pos points at on entry is skipped. @n is (re)computed by the
 * macro; its value on entry is ignored.
 */
#define list_for_each_entry_safe_continue(pos, n, head, member)                 \
        for (pos = list_next_entry(pos, member),                                 \
                n = list_next_entry(pos, member);                                \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_from - iterate over list from current point safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type from current point, safe against
 * removal of list entry.
 * The entry @pos points at on entry is itself visited first. @n is
 * computed by the macro; its value on entry is ignored.
 */
#define list_for_each_entry_safe_from(pos, n, head, member)                         \
        for (n = list_next_entry(pos, member);                                        \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, safe against removal
 * of list entry.
 * @n always holds the entry before @pos, fetched before the body runs.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)                \
        for (pos = list_last_entry(head, typeof(*pos), member),                \
                n = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_prev_entry(n, member))

/**
 * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
 * @pos:        the loop cursor used in the list_for_each_entry_safe loop
 * @n:                temporary storage used in list_for_each_entry_safe
 * @member:        the name of the list_head within the struct.
 *
 * list_safe_reset_next is not safe to use in general if the list may be
 * modified concurrently (eg. the lock is dropped in the loop body). An
 * exception to this is if the cursor element (pos) is pinned in the list,
 * and list_safe_reset_next is called after re-taking the lock and before
 * completing the current iteration of the loop body.
 *
 * Re-reads the entry after @pos into @n. The expansion is a single
 * parenthesised assignment expression, so it composes safely with
 * surrounding operators.
 */
#define list_safe_reset_next(pos, n, member)                                \
        ((n) = list_next_entry(pos, member))

/*
 * Double linked lists with a single pointer list head.
 * Mostly useful for hash tables where the two pointer list head is
 * too wasteful.
 * You lose the ability to access the tail in O(1).
 */

/* Static initialiser for an empty hlist head. */
#define HLIST_HEAD_INIT { .first = NULL }
/* Define an hlist head variable @name, initialised as empty. */
#define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT
/* Run-time (re)initialisation of the hlist head @ptr points to. */
#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
/**
 * INIT_HLIST_NODE - initialise an hlist node
 * @h: node to initialise
 *
 * Sets both ->next and ->pprev to NULL; a NULL ->pprev is what
 * hlist_unhashed() reports as "unhashed".
 */
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
        h->next = NULL;
        h->pprev = NULL;
}

/**
 * hlist_unhashed - Has node been removed from list and reinitialized?
 * @h: Node to be checked
 *
 * Note that not all removal functions will leave a node in unhashed
 * state.  For example, hlist_nulls_del_init_rcu() does leave the
 * node in unhashed state, but hlist_nulls_del() does not.
 *
 * Return: non-zero when @h->pprev is NULL, zero otherwise.
 */
static inline int hlist_unhashed(const struct hlist_node *h)
{
        return h->pprev == NULL;
}

/**
 * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use
 * @h: Node to be checked
 *
 * This variant of hlist_unhashed() must be used in lockless contexts
 * to avoid potential load-tearing.  The READ_ONCE() is paired with the
 * various WRITE_ONCE() in hlist helpers that are defined below.
 *
 * Return: non-zero when @h->pprev is NULL, zero otherwise.
 */
static inline int hlist_unhashed_lockless(const struct hlist_node *h)
{
        return READ_ONCE(h->pprev) == NULL;
}

/**
 * hlist_empty - Is the specified hlist_head structure an empty hlist?
 * @h: Structure to check.
 *
 * The ->first pointer is sampled once with READ_ONCE().
 *
 * Return: non-zero when @h->first is NULL, zero otherwise.
 */
static inline int hlist_empty(const struct hlist_head *h)
{
        return READ_ONCE(h->first) == NULL;
}

/*
 * Unlink @n from its hlist without touching @n's own fields.
 *
 * @n must currently be linked (or fake-linked via hlist_add_fake()):
 * n->pprev is dereferenced unconditionally.
 */
static inline void __hlist_del(struct hlist_node *n)
{
        struct hlist_node *next = n->next;
        struct hlist_node **pprev = n->pprev;

        /* Make whatever pointed at @n (head->first or a ->next) skip it. */
        WRITE_ONCE(*pprev, next);
        /* @n may have been the last node; only fix a successor if present. */
        if (next)
                WRITE_ONCE(next->pprev, pprev);
}

/**
 * hlist_del - Delete the specified hlist_node from its list
 * @n: Node to delete.
 *
 * Note that this function leaves the node in hashed state.  Use
 * hlist_del_init() or similar instead to unhash @n.
 *
 * After unlinking, @n's pointers are overwritten with
 * LIST_POISON1/LIST_POISON2, so ->pprev is non-NULL and hlist_unhashed()
 * returns false for it.
 */
static inline void hlist_del(struct hlist_node *n)
{
        __hlist_del(n);
        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/**
 * hlist_del_init - Delete the specified hlist_node from its list and initialize
 * @n: Node to delete.
 *
 * Note that this function leaves the node in unhashed state.
 * Calling it on a node that is already unhashed is a no-op.
 */
static inline void hlist_del_init(struct hlist_node *n)
{
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        INIT_HLIST_NODE(n);
}

/**
 * hlist_add_head - add a new entry at the beginning of the hlist
 * @n: new entry to be added
 * @h: hlist head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
        struct hlist_node *first = h->first;
        /* @n takes the old first node (possibly NULL) as its successor. */
        WRITE_ONCE(n->next, first);
        /* The old first node, if any, is now reached through n->next. */
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
        WRITE_ONCE(h->first, n);
        /* @n is reached through the head's ->first slot. */
        WRITE_ONCE(n->pprev, &h->first);
}

/**
 * hlist_add_before - add a new entry before the one specified
 * @n: new entry to be added
 * @next: hlist node to add it before, which must be non-NULL
 *
 * @next must be linked: its ->pprev is copied to @n and later
 * dereferenced.
 */
static inline void hlist_add_before(struct hlist_node *n,
                                    struct hlist_node *next)
{
        /* @n inherits the slot that used to point at @next. */
        WRITE_ONCE(n->pprev, next->pprev);
        WRITE_ONCE(n->next, next);
        /* @next is now reached through n->next. */
        WRITE_ONCE(next->pprev, &n->next);
        /* Finally make the inherited slot point at @n. */
        WRITE_ONCE(*(n->pprev), n);
}

/**
 * hlist_add_behind - add a new entry after the one specified
 * @n: new entry to be added
 * @prev: hlist node to add it after, which must be non-NULL
 */
static inline void hlist_add_behind(struct hlist_node *n,
                                    struct hlist_node *prev)
{
        /* @n takes over @prev's old successor (possibly NULL). */
        WRITE_ONCE(n->next, prev->next);
        WRITE_ONCE(prev->next, n);
        /* @n is reached through prev->next. */
        WRITE_ONCE(n->pprev, &prev->next);

        /* Fix up the old successor's back pointer, if there is one. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/**
 * hlist_add_fake - create a fake hlist consisting of a single headless node
 * @n: Node to make a fake list out of
 *
 * This makes @n appear to be its own predecessor on a headless hlist.
 * The point of this is to allow things like hlist_del() to work correctly
 * in cases where there is no list.
 *
 * Only ->pprev is written; ->next is left as the caller set it.
 */
static inline void hlist_add_fake(struct hlist_node *n)
{
        n->pprev = &n->next;
}

/**
 * hlist_fake: Is this node a fake hlist?
 * @h: Node to check for being a self-referential fake hlist.
 *
 * Return: true when @h->pprev points at @h's own ->next field, which is
 * the state hlist_add_fake() sets up.
 */
static inline bool hlist_fake(struct hlist_node *h)
{
        struct hlist_node **self_link = &h->next;

        return h->pprev == self_link;
}

/**
 * hlist_is_singular_node - is node the only element of the specified hlist?
 * @n: Node to check for singularity.
 * @h: Header for potentially singular list.
 *
 * Check whether the node is the only node of the head without
 * accessing head, thus avoiding unnecessary cache misses.
 * Only the address of @h->first is taken; @h is never dereferenced.
 */
static inline bool
hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
{
        const bool has_no_successor = !n->next;

        return has_no_successor && n->pprev == &h->first;
}

/**
 * hlist_move_list - Move an hlist
 * @old: hlist_head for old list.
 * @new: hlist_head for new list.
 *
 * Move a list from one list head to another. Fixup the pprev
 * reference of the first entry if it exists.
 */
static inline void hlist_move_list(struct hlist_head *old,
                                   struct hlist_head *new)
{
        new->first = old->first;
        if (new->first)
                new->first->pprev = &new->first;
        old->first = NULL;
}

/**
 * hlist_splice_init() - move all entries from one list to another
 * @from: hlist_head from which entries will be moved
 * @last: last entry on the @from list
 * @to:   hlist_head to which entries will be moved
 *
 * @to can be empty, @from must contain at least @last.
 *
 * The entries of @from are placed in front of any existing entries of @to,
 * and @from is left empty. @from->first is dereferenced unconditionally, so
 * @from must not be empty.
 */
static inline void hlist_splice_init(struct hlist_head *from,
                                     struct hlist_node *last,
                                     struct hlist_head *to)
{
        /* Chain @to's existing entries behind @last ... */
        if (to->first)
                to->first->pprev = &last->next;
        last->next = to->first;
        /* ... then let @to start at @from's first entry. */
        to->first = from->first;
        from->first->pprev = &to->first;
        from->first = NULL;
}

/* Get the struct containing an hlist_node; plain alias for container_of(). */
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)

/*
 * Iterate over the raw hlist_node pointers of @head, using @pos as cursor.
 * @pos may be any modifiable lvalue expression; it is fully parenthesised.
 * The body must not unlink @pos (see hlist_for_each_safe()); @pos is NULL
 * when the loop ends without a break.
 */
#define hlist_for_each(pos, head) \
        for ((pos) = (head)->first; (pos); (pos) = (pos)->next)

/*
 * Like hlist_for_each(), but @n caches the successor of @pos before the
 * body runs, so the body may unlink @pos. @pos and @n may be any modifiable
 * lvalue expressions; both are fully parenthesised.
 */
#define hlist_for_each_safe(pos, n, head) \
        for ((pos) = (head)->first; (pos) && ({ (n) = (pos)->next; 1; }); \
             (pos) = (n))

/*
 * NULL-tolerant hlist_entry(): yields NULL when @ptr is NULL, otherwise the
 * containing @type. @ptr is evaluated once, into a local named ____ptr.
 */
#define hlist_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
        })

/**
 * hlist_for_each_entry        - iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * The successor is read from @pos after the body has run, so the body must
 * not unlink @pos; use hlist_for_each_entry_safe() for that. @pos is NULL
 * when the loop ends without a break.
 */
#define hlist_for_each_entry(pos, head, member)                                \
        for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The entry @pos points at on entry is skipped; it must be non-NULL since
 * (pos)->member.next is read before the first test.
 */
#define hlist_for_each_entry_continue(pos, member)                        \
        for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The entry @pos points at on entry is itself visited first; a NULL @pos
 * simply runs the body zero times.
 */
#define hlist_for_each_entry_from(pos, member)                                \
        for (; pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                a &struct hlist_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * @n receives @pos->@member.next in the loop condition, before the body
 * runs; the cursor then advances from @n rather than from @pos, which is
 * what allows the body to remove @pos.  @pos is NULL once the loop has run
 * to completion.
 */
#define hlist_for_each_entry_safe(pos, n, head, member)                 \
        for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\
             pos && ({ n = pos->member.next; 1; });                        \
             pos = hlist_entry_safe(n, typeof(*pos), member))

/**
 * hlist_count_nodes - count nodes in the hlist
 * @head:        the head for your hlist.
 */
static inline size_t hlist_count_nodes(struct hlist_head *head)
{
        struct hlist_node *pos;
        size_t count = 0;

        hlist_for_each(pos, head)
                count++;

        return count;
}

#endif


























































































































































































































































































































































































































































































































































































   12 
   13 















































































   18 
   55 
































































   24 


























   18 
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_H
#define _LINUX_RCULIST_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list.h>
#include <linux/rcupdate.h>

/*
 * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
 * @list: list to be initialized
 *
 * You should instead use INIT_LIST_HEAD() for normal initialization and
 * cleanup tasks, when readers have no access to the list being initialized.
 * However, if the list being initialized is visible to readers, you
 * need to keep the compiler from being too mischievous.
 *
 * Both ->next and ->prev are pointed back at @list itself (the empty-list
 * state), each with a WRITE_ONCE() store.
 */
static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
{
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

/*
 * Yield the ->next pointer of a list_head as an __rcu-annotated lvalue,
 * so that it can be handed to the RCU accessors (for example
 * rcu_assign_pointer(list_next_rcu(prev), new)) instead of being
 * accessed directly.
 */
#define list_next_rcu(list)        (*((struct list_head __rcu **)(&(list)->next)))
/*
 * Return the ->prev pointer of a list_head in an rcu safe way. Don't
 * access it directly.  The expansion is an lvalue of type
 * struct list_head __rcu *.
 *
 * Any list traversed with list_bidir_prev_rcu() must never use
 * list_del_rcu().  Doing so will poison the ->prev pointer that
 * list_bidir_prev_rcu() relies on, which will result in segfaults.
 * To prevent these segfaults, use list_bidir_del_rcu() instead
 * of list_del_rcu().
 */
#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))

/**
 * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Every ->next load goes through rcu_dereference().  The walk stops when
 * list_is_head() reports that @pos has come back around to @head, so
 * @pos refers to @head, not to an entry, after a complete walk.
 */
#define list_for_each_rcu(pos, head)                  \
        for (pos = rcu_dereference((head)->next); \
             !list_is_head(pos, (head)); \
             pos = rcu_dereference(pos->next))

/**
 * list_tail_rcu - returns the prev pointer of the head of the list
 * @head: the head of the list
 *
 * The expansion is @head->prev viewed as an lvalue of type
 * struct list_head __rcu *.
 *
 * Note: This should only be used with the list header, and even then
 * only if list_del() and similar primitives are not also used on the
 * list header.
 */
#define list_tail_rcu(head)        (*((struct list_head __rcu **)(&(head)->prev)))

/*
 * Check during list traversal that we are within an RCU reader
 */

/*
 * Expands to nothing, but takes exactly one parameter.  The traversal
 * macros append a trailing ", 0" to their optional lockdep argument, so
 * "extra" below is empty when no condition was given and "0" when one
 * was given.  Passing more than one optional expression makes "extra"
 * contain a comma, and the preprocessor then rejects this invocation for
 * having too many arguments.
 */
#define check_arg_count_one(dummy)

#ifdef CONFIG_PROVE_RCU_LIST
/*
 * Warn unless either the caller-supplied condition holds or
 * rcu_read_lock_any_held() reports an RCU read-side section.
 */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({                                                                \
        check_arg_count_one(extra);                                        \
        RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(),                \
                         "RCU-list traversed in non-reader section!");        \
        })

/* SRCU variant: the caller-supplied condition is the only thing checked. */
#define __list_check_srcu(cond)                                         \
        ({                                                                 \
        RCU_LOCKDEP_WARN(!(cond),                                         \
                "RCU-list traversed without holding the required lock!");\
        })
#else
/* No runtime check; only the argument-count check is kept. */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({ check_arg_count_one(extra); })

/* No check at all; @cond is not evaluated. */
#define __list_check_srcu(cond) ({ })
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * The statement order below is part of the contract: @new is completely
 * initialized before it becomes reachable through @prev->next.
 */
static inline void __list_add_rcu(struct list_head *new,
                struct list_head *prev, struct list_head *next)
{
        /* Leave the list untouched if the validity check rejects the insertion. */
        if (!__list_add_valid(new, prev, next))
                return;

        new->next = next;
        new->prev = prev;
        /* Publication point: forward traversals can reach @new from here on. */
        rcu_assign_pointer(list_next_rcu(prev), new);
        /* Back-link is fixed up last, with a plain store. */
        next->prev = new;
}

/**
 * list_add_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_rcu(struct list_head *new, struct list_head *head)
{
        struct list_head *old_first = head->next;

        /* Link @new between the head and the entry that used to follow it. */
        __list_add_rcu(new, head, old_first);
}

/**
 * list_add_tail_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_tail_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_tail_rcu(struct list_head *new,
                                        struct list_head *head)
{
        struct list_head *old_last = head->prev;

        /* Link @new between the current last entry and the head. */
        __list_add_rcu(new, old_last, head);
}

/**
 * list_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * Note: list_empty() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_del_rcu()
 * or list_add_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_del_rcu(struct list_head *entry)
{
        __list_del_entry(entry);
        /*
         * Only ->prev is poisoned here; ->next is deliberately not written
         * so that readers still positioned on @entry can keep walking forward.
         */
        entry->prev = LIST_POISON2;
}

/**
 * list_bidir_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * In contrast to list_del_rcu() doesn't poison the prev pointer thus
 * allowing backwards traversal via list_bidir_prev_rcu().
 *
 * Note: list_empty() on entry does not return true after this because
 * the entry is in a special undefined state that permits RCU-based
 * lockfree reverse traversal. In particular this means that we can not
 * poison the forward and backwards pointers that may still be used for
 * walking the list.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another list-mutation
 * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
 * this same list. However, it is perfectly legal to run concurrently
 * with the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
 * the same list.
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_bidir_del_rcu(struct list_head *entry)
{
        /*
         * Unlink only.  Unlike list_del_rcu(), this function does not
         * write a poison value into @entry->prev afterwards.
         */
        __list_del_entry(entry);
}

/**
 * hlist_del_init_rcu - unlink a node from its hash list and mark it unhashed
 * @n: the node to remove from the hash list.
 *
 * Once this returns, hlist_unhashed() reports true for @n.  That makes
 * this primitive useful for RCU-based lockless readers combined with an
 * update side that must be able to tell whether a node is still hashed.
 *
 * The forward pointer of @n cannot be poisoned, because readers may
 * still be walking through it; only ->pprev is set to NULL, which is
 * what makes hlist_unhashed() return true.  A node that is already
 * unhashed is left untouched.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_add_head_rcu() or
 * hlist_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_for_each_entry_rcu().
 */
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
        /* Already off any list: nothing to unlink, nothing to reset. */
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * list_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically from
 * the perspective of concurrent readers.  It is the caller's responsibility
 * to synchronize with concurrent updaters, if any.
 *
 * Note: @old should not be empty.
 */
static inline void list_replace_rcu(struct list_head *old,
                                struct list_head *new)
{
        /* Give @new the same neighbours as @old before it becomes reachable. */
        new->next = old->next;
        new->prev = old->prev;
        /* Publication point: the predecessor's ->next now leads to @new. */
        rcu_assign_pointer(list_next_rcu(new->prev), new);
        new->next->prev = new;
        /* @old->next is left as it was; only ->prev is poisoned. */
        old->prev = LIST_POISON2;
}

/**
 * __list_splice_init_rcu - join an RCU-protected list into an existing list.
 * @list:        the RCU-protected list to splice
 * @prev:        points to the last element of the existing list
 * @next:        points to the first element of the existing list
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The list pointed to by @prev and @next can be RCU-read traversed
 * concurrently with this function.
 *
 * Note that this function blocks.
 *
 * Important note: the caller must take whatever action is necessary to prevent
 * any other updates to the existing list.  In principle, it is possible to
 * modify the list as soon as sync() begins execution. If this sort of thing
 * becomes necessary, an alternative version based on call_rcu() could be
 * created.  But only if -really- needed -- there is no shortage of RCU API
 * members.
 */
static inline void __list_splice_init_rcu(struct list_head *list,
                                          struct list_head *prev,
                                          struct list_head *next,
                                          void (*sync)(void))
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        /*
         * The body of the source list is now tracked by "first" and
         * "last", so the source head can be reinitialized.  RCU readers
         * have access to this list, so we must use INIT_LIST_HEAD_RCU()
         * instead of INIT_LIST_HEAD().
         */

        INIT_LIST_HEAD_RCU(list);

        /*
         * At this point, the list body still points to the source list.
         * Wait for any readers to finish using the list before splicing
         * the list body into the new list.  Any new readers will see
         * an empty list.
         */

        sync();
        ASSERT_EXCLUSIVE_ACCESS(*first);
        ASSERT_EXCLUSIVE_ACCESS(*last);

        /*
         * Readers are finished with the source list, so perform splice.
         * The order is important if the new list is global and accessible
         * to concurrent RCU readers.  Note that RCU readers are not
         * permitted to traverse the prev pointers without excluding
         * this function.
         *
         * The tail of the spliced body is pointed at @next first; only
         * then is the body published through @prev->next.  The ->prev
         * fix-ups follow as plain stores.
         */

        last->next = next;
        rcu_assign_pointer(list_next_rcu(prev), first);
        first->prev = prev;
        next->prev = last;
}

/**
 * list_splice_init_rcu - splice an RCU-protected list into an existing list,
 *                        designed for stacks.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 */
static inline void list_splice_init_rcu(struct list_head *list,
                                        struct list_head *head,
                                        void (*sync)(void))
{
        /* An empty source list means nothing to move and no @sync call. */
        if (list_empty(list))
                return;

        /* Stack order: the spliced entries go right after @head. */
        __list_splice_init_rcu(list, head, head->next, sync);
}

/**
 * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
 *                             list, designed for queues.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 */
static inline void list_splice_tail_init_rcu(struct list_head *list,
                                             struct list_head *head,
                                             void (*sync)(void))
{
        /* An empty source list means nothing to move and no @sync call. */
        if (list_empty(list))
                return;

        /* Queue order: the spliced entries go right before @head. */
        __list_splice_init_rcu(list, head->prev, head, sync);
}

/**
 * list_entry_rcu - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * @ptr is loaded with READ_ONCE() and the result is converted with
 * container_of(); no check against the list head is made here.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_entry_rcu(ptr, type, member) \
        container_of(READ_ONCE(ptr), type, member)

/*
 * Where are list_empty_rcu() and list_first_entry_rcu()?
 *
 * They do not exist because they would lead to subtle race conditions:
 *
 * if (!list_empty_rcu(mylist)) {
 *        struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
 *        do_something(bar);
 * }
 *
 * The list might be non-empty when list_empty_rcu() checks it, but it
 * might have become empty by the time that list_first_entry_rcu() rereads
 * the ->next pointer, which would result in a SEGV.
 *
 * When not using RCU, it is OK for list_first_entry() to re-read that
 * pointer because both functions should be protected by some lock that
 * blocks writers.
 *
 * When using RCU, list_empty() uses READ_ONCE() to fetch the
 * RCU-protected ->next pointer and then compares it to the address of the
 * list head.  However, it neither dereferences this pointer nor provides
 * this pointer to its caller.  Thus, READ_ONCE() suffices (that is,
 * rcu_dereference() is not needed), which means that list_empty() can be
 * used anywhere you would want to use list_empty_rcu().  Just don't
 * expect anything useful to happen if you do a subsequent lockless
 * call to list_first_entry_rcu()!!!
 *
 * See list_first_or_null_rcu for an alternative.
 */

/**
 * list_first_or_null_rcu - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated once, and @ptr->next is loaded once with READ_ONCE();
 * the emptiness test and the returned entry both use that single
 * snapshot, so the list cannot appear non-empty for the test and empty
 * for the lookup.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_first_or_null_rcu(ptr, type, member) \
({ \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})

/**
 * list_next_or_null_rcu - get the next element from a list
 * @head:        the head for the list.
 * @ptr:        the list head to take the next element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the ptr is at the end of the list, NULL is returned.
 *
 * @head and @ptr are each evaluated once, and @ptr->next is loaded once
 * with READ_ONCE(); "end of the list" means that this loaded pointer is
 * @head.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
        struct list_head *__head = (head); \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__next != __head) ? list_entry_rcu(__next, type, \
                                                  member) : NULL; \
})

/**
 * list_for_each_entry_rcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *                At most one expression may be given.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * The loop ends when @pos->@member is @head itself, so after a complete
 * walk @pos does not refer to a real list entry.  @head is evaluated on
 * every iteration.
 */
#define list_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_srcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *                Unlike in list_for_each_entry_rcu(), it is mandatory.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * The loop ends when @pos->@member is @head itself, so after a complete
 * walk @pos does not refer to a real list entry.
 */
#define list_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_entry_lockless - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * @ptr is loaded with READ_ONCE(), and the loaded value is cast back to
 * typeof(@ptr) before being handed to container_of().
 */
#define list_entry_lockless(ptr, type, member) \
        container_of((typeof(ptr))READ_ONCE(ptr), type, member)

/**
 * list_for_each_entry_lockless - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * No lockdep check is issued by this macro.  The loop ends when
 * @pos->@member is @head itself, so after a complete walk @pos does not
 * refer to a real list entry.
 */
#define list_for_each_entry_lockless(pos, head, member) \
        for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
             &pos->member != (head); \
             pos = list_entry_lockless(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_continue_rcu - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position which must have been in the list when the RCU read
 * lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_from_rcu() except
 * this starts after the given position and that one starts at the given
 * position.
 *
 * The entry @pos refers to on entry is not visited: the loop first
 * advances through @pos->@member.next.
 */
#define list_for_each_entry_continue_rcu(pos, head, member)                 \
        for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
             &pos->member != (head);        \
             pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_from_rcu - iterate over a list from current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over the tail of a list starting from a given position,
 * which must have been in the list when the RCU read lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_continue_rcu() except
 * this starts from the given position and that one starts from the position
 * after the given position.
 *
 * There is no initialization step: the entry @pos refers to on entry is
 * the first one visited, unless its @member already is @head.
 */
#define list_for_each_entry_from_rcu(pos, head, member)                        \
        for (; &(pos)->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member))

/**
 * hlist_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on the entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu().
 */
static inline void hlist_del_rcu(struct hlist_node *n)
{
        __hlist_del(n);
        /* Only the back-link is poisoned; n->next is not written here. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically from
 * the perspective of concurrent readers.  It is the caller's responsibility
 * to synchronize with concurrent updaters, if any.
 */
static inline void hlist_replace_rcu(struct hlist_node *old,
                                        struct hlist_node *new)
{
        struct hlist_node *next = old->next;

        /* Give @new the links of @old before it becomes reachable. */
        new->next = next;
        WRITE_ONCE(new->pprev, old->pprev);
        /*
         * Publication point: new->pprev is the slot that pointed at @old
         * and is now redirected to @new.
         */
        rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
        /* @old may have been the last node; then there is no successor to fix up. */
        if (next)
                WRITE_ONCE(new->next->pprev, &new->next);
        /* @old->next is left as it was; only ->pprev is poisoned. */
        WRITE_ONCE(old->pprev, LIST_POISON2);
}

/**
 * hlists_swap_heads_rcu - swap the lists the hlist heads point to
 * @left:  The hlist head on the left
 * @right: The hlist head on the right
 *
 * The lists start out as [@left  ][node1 ... ] and
 *                        [@right ][node2 ... ]
 * The lists end up as    [@left  ][node2 ... ]
 *                        [@right ][node1 ... ]
 *
 * Either list may be empty: an empty side has no first node, so there is
 * no ->pprev back-link to redirect for it and the other head simply ends
 * up empty.
 *
 * The two heads are updated by two separate stores, not as one atomic
 * pair.
 */
static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right)
{
        struct hlist_node *node1 = left->first;
        struct hlist_node *node2 = right->first;

        rcu_assign_pointer(left->first, node2);
        rcu_assign_pointer(right->first, node1);
        /* Redirect the back-links of the (possibly absent) first nodes. */
        if (node2)
                WRITE_ONCE(node2->pprev, &left->first);
        if (node1)
                WRITE_ONCE(node1->pprev, &right->first);
}

/*
 * Accessors yielding __rcu-annotated lvalues for the pointers of an
 * RCU protected hlist:
 *
 *   hlist_first_rcu(head) - the head's ->first pointer
 *   hlist_next_rcu(node)  - the node's ->next pointer
 *   hlist_pprev_rcu(node) - the pointer that node->pprev points to, i.e.
 *                           the slot through which @node is reached (the
 *                           head's ->first or the previous node's ->next)
 */
#define hlist_first_rcu(head)        (*((struct hlist_node __rcu **)(&(head)->first)))
#define hlist_next_rcu(node)        (*((struct hlist_node __rcu **)(&(node)->next)))
#define hlist_pprev_rcu(node)        (*((struct hlist_node __rcu **)((node)->pprev)))

/**
 * hlist_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_head_rcu(struct hlist_node *n,
                                        struct hlist_head *h)
{
        struct hlist_node *first = h->first;

        /* Fully link @n before it becomes reachable through h->first. */
        n->next = first;
        WRITE_ONCE(n->pprev, &h->first);
        /* Publication point. */
        rcu_assign_pointer(hlist_first_rcu(h), n);
        /* The previous first node, if there was one, is now preceded by @n. */
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_add_tail_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Appends @n after the current last element of @h (or makes it the
 * first element when @h is empty), while permitting racing traversals.
 * The last element is found by walking the whole list.
 *
 * The caller must serialize against other list-mutation primitives
 * on this same list, such as hlist_add_head_rcu() or hlist_del_rcu(),
 * for example by holding an appropriate lock.  Running concurrently
 * with the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), is legal; those traversals must be
 * guarded by rcu_read_lock().
 */
static inline void hlist_add_tail_rcu(struct hlist_node *n,
                                      struct hlist_head *h)
{
        /* Write-side code: plain loads are enough to walk the list. */
        struct hlist_node *tail = h->first;

        if (!tail) {
                hlist_add_head_rcu(n, h);
                return;
        }

        while (tail->next)
                tail = tail->next;

        /* Set up @n completely, then publish it behind the old tail. */
        n->next = tail->next;
        WRITE_ONCE(n->pprev, &tail->next);
        rcu_assign_pointer(hlist_next_rcu(tail), n);
}

/**
 * hlist_add_before_rcu
 * @n: the new element to add to the hash list.
 * @next: the existing element to add the new element before.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * before the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_before_rcu(struct hlist_node *n,
                                        struct hlist_node *next)
{
        /* Set up both links of @n before any reader can reach it. */
        WRITE_ONCE(n->pprev, next->pprev);
        n->next = next;
        /* Publish: store @n into the slot that @next->pprev referred to. */
        rcu_assign_pointer(hlist_pprev_rcu(n), n);
        /* @next is now linked from @n->next. */
        WRITE_ONCE(next->pprev, &n->next);
}

/**
 * hlist_add_behind_rcu
 * @n: the new element to add to the hash list.
 * @prev: the existing element to add the new element after.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * after the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_behind_rcu(struct hlist_node *n,
                                        struct hlist_node *prev)
{
        /* Set up both links of @n before any reader can reach it. */
        n->next = prev->next;
        WRITE_ONCE(n->pprev, &prev->next);
        /* Publish @n as the successor of @prev. */
        rcu_assign_pointer(hlist_next_rcu(prev), n);
        /* The old successor of @prev (if any) is now linked from @n->next. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/*
 * __hlist_for_each_rcu - iterate over the raw nodes of an hlist
 * @pos:        loop cursor; it is assigned the node pointers themselves,
 *                not the containing entries.
 * @head:        the head for your list.
 *
 * Both the first and every next pointer are loaded with rcu_dereference().
 * @pos is NULL when the loop runs to completion.
 */
#define __hlist_for_each_rcu(pos, head)                                \
        for (pos = rcu_dereference(hlist_first_rcu(head));        \
             pos;                                                \
             pos = rcu_dereference(hlist_next_rcu(pos)))

/**
 * hlist_for_each_entry_rcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * @cond is handed to __list_check_rcu() once, before the first element
 * is fetched.  @pos is NULL when the loop runs to completion.
 */
#define hlist_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_srcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike hlist_for_each_entry_rcu(), @cond is mandatory here; it is handed
 * to __list_check_srcu() once, before the first element is fetched.
 * @pos is NULL when the loop runs to completion.
 */
#define hlist_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * This is the same as hlist_for_each_entry_rcu() except that it does
 * not do any RCU debugging or tracing: there is no lockdep condition
 * argument, and pointers are loaded with rcu_dereference_raw_check().
 * @pos is NULL when the loop runs to completion.
 */
#define hlist_for_each_entry_rcu_notrace(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * NOTE(review): pointers are loaded with rcu_dereference_bh(), so the
 * matching read-side primitive is presumably rcu_read_lock_bh() rather
 * than rcu_read_lock() - confirm against the RCU documentation.
 * @pos is NULL when the loop runs to completion.
 */
#define hlist_for_each_entry_rcu_bh(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * @pos must point at a valid entry on entry: the first element visited
 * is the one following @pos, which is dereferenced to find it.
 * @pos is NULL when the loop runs to completion.
 */
#define hlist_for_each_entry_continue_rcu(pos, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * Same iteration as hlist_for_each_entry_continue_rcu(), but pointers are
 * loaded with rcu_dereference_bh().  @pos must point at a valid entry on
 * entry; the first element visited is the one following it.
 * @pos is NULL when the loop runs to completion.
 */
#define hlist_for_each_entry_continue_rcu_bh(pos, member)                \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The first element visited is @pos itself; the loop body is not entered
 * at all when @pos is NULL.  @pos is NULL when the loop runs to completion.
 */
#define hlist_for_each_entry_from_rcu(pos, member)                        \
        for (; pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

#endif        /* __KERNEL__ */
#endif



































































































































































































































































































































  319 































































































































































































































































































  316 


























  318 
  315 























  316 







  316 


  315 

  318 














































































  314 


  319 
















  317 


  316 

























  319 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * VLAN                An implementation of 802.1Q VLAN tagging.
 *
 * Authors:        Ben Greear <greearb@candelatech.com>
 */
#ifndef _LINUX_IF_VLAN_H_
#define _LINUX_IF_VLAN_H_

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/bug.h>
#include <uapi/linux/if_vlan.h>

#define VLAN_HLEN        4                /* The additional bytes required by VLAN
                                         * (in addition to the Ethernet header)
                                         */
#define VLAN_ETH_HLEN        18                /* Total octets in a VLAN-tagged header. */
#define VLAN_ETH_ZLEN        64                /* Min. octets in frame sans FCS */

/*
 * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan
 */
#define VLAN_ETH_DATA_LEN        1500        /* Max. octets in payload         */
#define VLAN_ETH_FRAME_LEN        1518        /* Max. octets in frame sans FCS */

#define VLAN_MAX_DEPTH        8                /* Max. number of nested VLAN tags parsed */

/**
 *        struct vlan_hdr - vlan header
 *        @h_vlan_TCI: priority and VLAN ID
 *        @h_vlan_encapsulated_proto: packet type ID or len
 *
 *        Both fields are stored big-endian (__be16).
 */
struct vlan_hdr {
        __be16        h_vlan_TCI;
        __be16        h_vlan_encapsulated_proto;
};

/**
 *        struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr)
 *        @addrs: struct_group() wrapping @h_dest and @h_source so that both
 *                addresses can be referred to as a single member
 *        @h_dest: destination ethernet address
 *        @h_source: source ethernet address
 *        @h_vlan_proto: ethernet protocol of the VLAN tag itself
 *        @h_vlan_TCI: priority and VLAN ID
 *        @h_vlan_encapsulated_proto: packet type ID or len
 */
struct vlan_ethhdr {
        struct_group(addrs,
                unsigned char        h_dest[ETH_ALEN];
                unsigned char        h_source[ETH_ALEN];
        );
        __be16                h_vlan_proto;
        __be16                h_vlan_TCI;
        __be16                h_vlan_encapsulated_proto;
};

#include <linux/skbuff.h>

/* View the MAC header of @skb, as reported by skb_mac_header(), as a VLAN ethernet header. */
static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
{
        void *mac = skb_mac_header(skb);

        return mac;
}

/* View @skb->data as a VLAN ethernet header.
 *
 * Prefer this version in TX path, instead of
 * skb_reset_mac_header() + vlan_eth_hdr()
 */
static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb)
{
        void *start = skb->data;

        return start;
}

/* Layout of the 16-bit TCI value: priority, CFI/DEI bit, VLAN ID. */
#define VLAN_PRIO_MASK                0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT                13 /* Bit position of the priority field */
#define VLAN_CFI_MASK                0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */
#define VLAN_VID_MASK                0x0fff /* VLAN Identifier */
#define VLAN_N_VID                4096 /* Number of values covered by VLAN_VID_MASK */

/* found in socket.c */
extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *));

/* Accessors for the VLAN tag fields kept in the skb (vlan_all / vlan_tci). */
/* Nonzero (1) if any bit of (__skb)->vlan_all is set, else 0. */
#define skb_vlan_tag_present(__skb)        (!!(__skb)->vlan_all)
/* The full TCI value stored in (__skb)->vlan_tci. */
#define skb_vlan_tag_get(__skb)                ((__skb)->vlan_tci)
/* VLAN ID bits of the stored TCI. */
#define skb_vlan_tag_get_id(__skb)        ((__skb)->vlan_tci & VLAN_VID_MASK)
/* 1 if the CFI/DEI bit of the stored TCI is set, else 0. */
#define skb_vlan_tag_get_cfi(__skb)        (!!((__skb)->vlan_tci & VLAN_CFI_MASK))
/* Priority bits of the stored TCI, shifted down to the low bits. */
#define skb_vlan_tag_get_prio(__skb)        (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT)

/* Raise NETDEV_CVLAN_FILTER_PUSH_INFO for @dev and return the notifier
 * result converted by notifier_to_errno().  ASSERT_RTNL() is checked first.
 */
static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev)
{
        int notify_ret;

        ASSERT_RTNL();
        notify_ret = call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev);
        return notifier_to_errno(notify_ret);
}

/* Raise NETDEV_CVLAN_FILTER_DROP_INFO for @dev; the notifier result is
 * ignored.  ASSERT_RTNL() is checked first.
 */
static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev);
}

/* Raise NETDEV_SVLAN_FILTER_PUSH_INFO for @dev and return the notifier
 * result converted by notifier_to_errno().  ASSERT_RTNL() is checked first.
 */
static inline int vlan_get_rx_stag_filter_info(struct net_device *dev)
{
        int notify_ret;

        ASSERT_RTNL();
        notify_ret = call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev);
        return notifier_to_errno(notify_ret);
}

/* Raise NETDEV_SVLAN_FILTER_DROP_INFO for @dev; the notifier result is
 * ignored.  ASSERT_RTNL() is checked first.
 */
static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev);
}

/**
 *        struct vlan_pcpu_stats - VLAN percpu rx/tx stats
 *        @rx_packets: number of received packets
 *        @rx_bytes: number of received bytes
 *        @rx_multicast: number of received multicast packets
 *        @tx_packets: number of transmitted packets
 *        @tx_bytes: number of transmitted bytes
 *        @syncp: synchronization point for 64bit counters
 *        @rx_errors: number of rx errors
 *        @tx_dropped: number of tx drops
 *
 *        The packet/byte counters are u64_stats_t; @rx_errors and @tx_dropped
 *        are plain u32 values.
 */
struct vlan_pcpu_stats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                rx_multicast;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        struct u64_stats_sync        syncp;
        u32                        rx_errors;
        u32                        tx_dropped;
};

#if IS_ENABLED(CONFIG_VLAN_8021Q)

extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev,
                                               __be16 vlan_proto, u16 vlan_id);
extern int vlan_for_each(struct net_device *dev,
                         int (*action)(struct net_device *dev, int vid,
                                       void *arg), void *arg);
extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
extern u16 vlan_dev_vlan_id(const struct net_device *dev);
extern __be16 vlan_dev_vlan_proto(const struct net_device *dev);

/**
 *        struct vlan_priority_tci_mapping - vlan egress priority mappings
 *        @priority: skb priority
 *        @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000
 *        @next: pointer to next struct; a NULL value ends the chain
 */
struct vlan_priority_tci_mapping {
        u32                                        priority;
        u16                                        vlan_qos;
        struct vlan_priority_tci_mapping        *next;
};

struct proc_dir_entry;
struct netpoll;

/**
 *        struct vlan_dev_priv - VLAN private device data
 *        @nr_ingress_mappings: number of ingress priority mappings
 *        @ingress_priority_map: ingress priority mappings (8 entries)
 *        @nr_egress_mappings: number of egress priority mappings
 *        @egress_priority_map: hash of egress priority mappings; 16 buckets,
 *                each the head of a chain linked through
 *                vlan_priority_tci_mapping.next
 *        @vlan_proto: VLAN encapsulation protocol
 *        @vlan_id: VLAN identifier
 *        @flags: device flags
 *        @real_dev: underlying netdevice
 *        @dev_tracker: refcount tracker for @real_dev reference
 *        @real_dev_addr: address of underlying netdevice
 *        @dent: proc dir entry
 *        @vlan_pcpu_stats: ptr to percpu rx stats
 *        @netpoll: netpoll instance "propagated" down to @real_dev
 *                (only present with CONFIG_NET_POLL_CONTROLLER)
 */
struct vlan_dev_priv {
        unsigned int                                nr_ingress_mappings;
        u32                                        ingress_priority_map[8];
        unsigned int                                nr_egress_mappings;
        struct vlan_priority_tci_mapping        *egress_priority_map[16];

        __be16                                        vlan_proto;
        u16                                        vlan_id;
        u16                                        flags;

        struct net_device                        *real_dev;
        netdevice_tracker                        dev_tracker;

        unsigned char                                real_dev_addr[ETH_ALEN];

        struct proc_dir_entry                        *dent;
        struct vlan_pcpu_stats __percpu                *vlan_pcpu_stats;
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll                                *netpoll;
#endif
};

/* True when @dev has IFF_802_1Q_VLAN set in its priv_flags. */
static inline bool is_vlan_dev(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_802_1Q_VLAN) != 0;
}

/* Return the netdev_priv() area of @dev typed as struct vlan_dev_priv. */
static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
{
        struct vlan_dev_priv *priv = netdev_priv(dev);

        return priv;
}

/* Look up the egress QoS bits for skb priority @skprio on VLAN device @dev.
 *
 * The bucket is selected by the low four bits of @skprio and its chain is
 * searched for an exact priority match.  Returns the matching entry's
 * vlan_qos, or 0 when no mapping exists.
 */
static inline u16
vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio)
{
        struct vlan_priority_tci_mapping *entry;

        smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */

        for (entry = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)];
             entry; entry = entry->next) {
                /* vlan_qos should already be shifted to mask correctly
                 * with the VLAN's TCI
                 */
                if (entry->priority == skprio)
                        return entry->vlan_qos;
        }
        return 0;
}

extern bool vlan_do_receive(struct sk_buff **skb);

extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid);
extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid);

extern int vlan_vids_add_by_dev(struct net_device *dev,
                                const struct net_device *by_dev);
extern void vlan_vids_del_by_dev(struct net_device *dev,
                                 const struct net_device *by_dev);

extern bool vlan_uses_dev(const struct net_device *dev);

#else
/* Stub for !CONFIG_VLAN_8021Q: no device is ever a VLAN device. */
static inline bool is_vlan_dev(const struct net_device *dev)
{
        return false;
}

/* Stub for !CONFIG_VLAN_8021Q: the lookup never finds a device. */
static inline struct net_device *
__vlan_find_dev_deep_rcu(struct net_device *real_dev,
                     __be16 vlan_proto, u16 vlan_id)
{
        return NULL;
}

/* Stub for !CONFIG_VLAN_8021Q: @action is never called; returns 0. */
static inline int
vlan_for_each(struct net_device *dev,
              int (*action)(struct net_device *dev, int vid, void *arg),
              void *arg)
{
        return 0;
}

/* Stub for !CONFIG_VLAN_8021Q: warns once if ever reached and returns NULL. */
static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
{
        WARN_ON_ONCE(1);
        return NULL;
}

/* Stub for !CONFIG_VLAN_8021Q: warns once if ever reached and returns 0. */
static inline u16 vlan_dev_vlan_id(const struct net_device *dev)
{
        WARN_ON_ONCE(1);
        return 0;
}

/* Stub for !CONFIG_VLAN_8021Q: warns once if ever reached and returns 0. */
static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev)
{
        WARN_ON_ONCE(1);
        return 0;
}

/* Stub for !CONFIG_VLAN_8021Q: there are no egress mappings; returns 0. */
static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev,
                                               u32 skprio)
{
        return 0;
}

/* Stub for !CONFIG_VLAN_8021Q: leaves *@skb untouched and returns false. */
static inline bool vlan_do_receive(struct sk_buff **skb)
{
        return false;
}

/* Stub for !CONFIG_VLAN_8021Q: does nothing and reports success (0). */
static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
{
        return 0;
}

/* Stub for !CONFIG_VLAN_8021Q: does nothing. */
static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
{
}

/* Stub for !CONFIG_VLAN_8021Q: does nothing and reports success (0). */
static inline int vlan_vids_add_by_dev(struct net_device *dev,
                                       const struct net_device *by_dev)
{
        return 0;
}

/* Stub for !CONFIG_VLAN_8021Q: does nothing. */
static inline void vlan_vids_del_by_dev(struct net_device *dev,
                                        const struct net_device *by_dev)
{
}

/* Stub for !CONFIG_VLAN_8021Q: always returns false. */
static inline bool vlan_uses_dev(const struct net_device *dev)
{
        return false;
}
#endif

/**
 * eth_type_vlan - test whether an ether type denotes a VLAN tag
 * @ethertype: ether type to test, as a big-endian 16-bit value
 *
 * Returns: true for ETH_P_8021Q and ETH_P_8021AD, false for anything else.
 */
static inline bool eth_type_vlan(__be16 ethertype)
{
        return ethertype == htons(ETH_P_8021Q) ||
               ethertype == htons(ETH_P_8021AD);
}

/* Report whether @features contains the TX VLAN feature bit matching
 * @proto: NETIF_F_HW_VLAN_CTAG_TX for ETH_P_8021Q, NETIF_F_HW_VLAN_STAG_TX
 * for ETH_P_8021AD.  Any other @proto yields false.
 */
static inline bool vlan_hw_offload_capable(netdev_features_t features,
                                           __be16 proto)
{
        netdev_features_t tx_flag;

        if (proto == htons(ETH_P_8021Q))
                tx_flag = NETIF_F_HW_VLAN_CTAG_TX;
        else if (proto == htons(ETH_P_8021AD))
                tx_flag = NETIF_F_HW_VLAN_STAG_TX;
        else
                return false;

        return (features & tx_flag) != 0;
}

/**
 * __vlan_insert_inner_tag - inner VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 * @mac_len: MAC header length including outer vlan headers
 *
 * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
 * Does not change skb->protocol so this function can be used during receive.
 * On failure @skb is not freed here.
 *
 * Returns: 0 on success, -ENOMEM if skb_cow_head fails.
 */
static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
                                          __be16 vlan_proto, u16 vlan_tci,
                                          unsigned int mac_len)
{
        struct vlan_ethhdr *veth;

        /* Make sure there is writable headroom for the extra tag. */
        if (skb_cow_head(skb, VLAN_HLEN) < 0)
                return -ENOMEM;

        skb_push(skb, VLAN_HLEN);

        /* Move the mac header sans proto to the beginning of the new header. */
        if (likely(mac_len > ETH_TLEN))
                memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
        /* Keep the recorded mac header pointing at the moved header. */
        if (skb_mac_header_was_set(skb))
                skb->mac_header -= VLAN_HLEN;

        /* Overlay a vlan_ethhdr so that its tail fields line up with the
         * tag position; for mac_len < ETH_HLEN this pointer lies before
         * skb->data and only the fields written below are touched.
         */
        veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);

        /* first, the ethernet type */
        if (likely(mac_len >= ETH_TLEN)) {
                /* h_vlan_encapsulated_proto should already be populated, and
                 * skb->data has space for h_vlan_proto
                 */
                veth->h_vlan_proto = vlan_proto;
        } else {
                /* h_vlan_encapsulated_proto should not be populated, and
                 * skb->data has no space for h_vlan_proto
                 */
                veth->h_vlan_encapsulated_proto = skb->protocol;
        }

        /* now, the TCI */
        veth->h_vlan_TCI = htons(vlan_tci);

        return 0;
}

/**
 * __vlan_insert_tag - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Calls __vlan_insert_inner_tag() with a MAC header length of ETH_HLEN,
 * so the tag becomes part of the payload.
 * Does not change skb->protocol so this function can be used during receive.
 *
 * Returns: the result of __vlan_insert_inner_tag().
 */
static inline int __vlan_insert_tag(struct sk_buff *skb,
                                    __be16 vlan_proto, u16 vlan_tci)
{
        const unsigned int plain_mac_len = ETH_HLEN;

        return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci,
                                       plain_mac_len);
}

/**
 * vlan_insert_inner_tag - inner VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 * @mac_len: MAC header length including outer vlan headers
 *
 * Inserts the VLAN tag into @skb as part of the payload at offset mac_len.
 * This might change skb->head.
 *
 * Following the skb_unshare() example, the caller does not have to free
 * the original skb on error: it is released here with dev_kfree_skb_any().
 *
 * Does not change skb->protocol so this function can be used during receive.
 *
 * Return: modified @skb on success, NULL on error (@skb is freed).
 */
static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
                                                    __be16 vlan_proto,
                                                    u16 vlan_tci,
                                                    unsigned int mac_len)
{
        if (!__vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len))
                return skb;

        dev_kfree_skb_any(skb);
        return NULL;
}

/**
 * vlan_insert_tag - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Calls vlan_insert_inner_tag() with a MAC header length of ETH_HLEN,
 * so the tag becomes part of the payload.  This might change skb->head.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 *
 * Does not change skb->protocol so this function can be used during receive.
 *
 * Return: modified @skb on success, NULL on error (@skb is freed).
 */
static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
                                              __be16 vlan_proto, u16 vlan_tci)
{
        const unsigned int plain_mac_len = ETH_HLEN;

        return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, plain_mac_len);
}

/**
 * vlan_insert_tag_set_proto - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Inserts the VLAN tag via vlan_insert_tag() and, on success, also sets
 * skb->protocol to @vlan_proto.  This might change skb->head.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 *
 * Return: modified @skb on success, NULL on error (@skb is freed).
 */
static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
                                                        __be16 vlan_proto,
                                                        u16 vlan_tci)
{
        struct sk_buff *tagged = vlan_insert_tag(skb, vlan_proto, vlan_tci);

        if (!tagged)
                return NULL;

        tagged->protocol = vlan_proto;
        return tagged;
}

/**
 * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info
 * @skb: skbuff to clear
 *
 * Clears the VLAN information from @skb by zeroing @skb->vlan_all, after
 * which skb_vlan_tag_present() evaluates to false for @skb.
 */
static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb)
{
        skb->vlan_all = 0;
}

/**
 * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb
 * @dst: skbuff to copy to
 * @src: skbuff to copy from
 *
 * Copies VLAN information from @src to @dst (for branchless code): the
 * whole vlan_all value is assigned, whether or not @src carries a tag.
 */
static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src)
{
        dst->vlan_all = src->vlan_all;
}

/*
 * __vlan_hwaccel_push_inside - pushes vlan tag to the payload
 * @skb: skbuff to tag
 *
 * Pushes the VLAN tag from @skb->vlan_tci inside to the payload.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 */
static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
{
        skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
                                        skb_vlan_tag_get(skb));
        if (likely(skb))
                __vlan_hwaccel_clear_tag(skb);
        return skb;
}

/**
 * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Puts the VLAN TCI in @skb->vlan_tci and the protocol in @skb->vlan_proto
 * and lets the device do the rest. The packet data is not modified.
 */
static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
                                          __be16 vlan_proto, u16 vlan_tci)
{
        skb->vlan_proto = vlan_proto;
        skb->vlan_tci = vlan_tci;
}

/**
 * __vlan_get_tag - get the VLAN ID that is part of the payload
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Reads the header at @skb->data.  *@vlan_tci is written (in host byte
 * order) only when the header carries a VLAN ether type.
 *
 * Returns: 0 on success, -ENODATA if the skb is not of VLAN type
 */
static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
        struct vlan_ethhdr *hdr = skb_vlan_eth_hdr(skb);

        if (eth_type_vlan(hdr->h_vlan_proto)) {
                *vlan_tci = ntohs(hdr->h_vlan_TCI);
                return 0;
        }

        return -ENODATA;
}

/**
 * __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->vlan_tci
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * *@vlan_tci is always written: the stored TCI when a tag is present,
 * 0 otherwise.
 *
 * Returns: 0 on success, -ENODATA if @skb carries no VLAN tag information
 */
static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
                                         u16 *vlan_tci)
{
        if (!skb_vlan_tag_present(skb)) {
                *vlan_tci = 0;
                return -ENODATA;
        }

        *vlan_tci = skb_vlan_tag_get(skb);
        return 0;
}

/**
 * vlan_get_tag - get the VLAN ID from the skb
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Uses __vlan_hwaccel_get_tag() when the skb's device has
 * NETIF_F_HW_VLAN_CTAG_TX set in its features, and __vlan_get_tag()
 * otherwise.
 *
 * Returns: error if the skb is not VLAN tagged
 */
static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
        if (!(skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX))
                return __vlan_get_tag(skb, vlan_tci);

        return __vlan_hwaccel_get_tag(skb, vlan_tci);
}

/**
 * __vlan_get_protocol_offset() - get protocol EtherType.
 * @skb: skbuff to query
 * @type: first vlan protocol
 * @mac_offset: MAC offset
 * @depth: buffer to store length of eth and vlan tags in bytes
 *
 * When @type is not a VLAN EtherType it is returned unchanged and the
 * reported depth is @skb->mac_len. Otherwise VLAN headers are read one at a
 * time with skb_header_pointer() until the encapsulated protocol is no
 * longer a VLAN EtherType.
 *
 * Returns: the EtherType of the packet, regardless of whether it is
 * vlan encapsulated (normal or hardware accelerated) or not. 0 is returned,
 * with @depth left unwritten, when a header cannot be read, when the
 * VLAN_MAX_DEPTH parse budget runs out, or when a non-zero mac_len is
 * smaller than VLAN_HLEN.
 */
static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb,
                                                __be16 type,
                                                int mac_offset,
                                                int *depth)
{
        unsigned int parse_depth = VLAN_MAX_DEPTH;
        unsigned int vlan_depth = skb->mac_len;

        if (eth_type_vlan(type)) {
                /* An 802.1Q/AD header should already be present at
                 * mac_len - VLAN_HLEN when mac_len is set, or at ETH_HLEN
                 * when it is not.
                 */
                if (!vlan_depth) {
                        vlan_depth = ETH_HLEN;
                } else {
                        if (WARN_ON(vlan_depth < VLAN_HLEN))
                                return 0;
                        vlan_depth -= VLAN_HLEN;
                }

                for (;;) {
                        struct vlan_hdr copy, *vh;

                        vh = skb_header_pointer(skb, mac_offset + vlan_depth,
                                                sizeof(copy), &copy);
                        if (unlikely(!vh || !--parse_depth))
                                return 0;

                        type = vh->h_vlan_encapsulated_proto;
                        vlan_depth += VLAN_HLEN;
                        if (!eth_type_vlan(type))
                                break;
                }
        }

        if (depth)
                *depth = vlan_depth;

        return type;
}

/* Same as __vlan_get_protocol_offset() with a MAC offset of 0. */
static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type,
                                         int *depth)
{
        return __vlan_get_protocol_offset(skb, type, 0, depth);
}

/**
 * vlan_get_protocol - get protocol EtherType.
 * @skb: skbuff to query
 *
 * Starts from @skb->protocol and does not ask for the header depth.
 *
 * Returns: the EtherType of the packet, regardless of whether it is
 * vlan encapsulated (normal or hardware accelerated) or not.
 */
static inline __be16 vlan_get_protocol(const struct sk_buff *skb)
{
        __be16 first = skb->protocol;

        return __vlan_get_protocol(skb, first, NULL);
}

/*
 * Variant of __vlan_get_protocol() that also pulls the mac header into
 * skb->head via pskb_may_pull().
 *
 * Returns 0 when no protocol is found or when the pull fails; @depth is
 * only written (if non-NULL) on success.
 */
static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb,
                                                 __be16 type, int *depth)
{
        int hdr_len;
        __be16 proto = __vlan_get_protocol(skb, type, &hdr_len);

        if (!proto)
                return 0;

        if (!pskb_may_pull(skb, hdr_len))
                return 0;

        if (depth)
                *depth = hdr_len;

        return proto;
}

/* Getter for the SKB protocol field that treats VLAN tags the same way
 * whether VLAN acceleration is enabled or not. With @skip_vlan the result
 * comes from vlan_get_protocol(); without it the outermost protocol is
 * reported.
 */
static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan)
{
        if (skip_vlan)
                return vlan_get_protocol(skb);

        /* VLAN acceleration strips the VLAN header from the skb and
         * moves it to skb->vlan_proto
         */
        if (skb_vlan_tag_present(skb))
                return skb->vlan_proto;

        return skb->protocol;
}

/*
 * Set skb->protocol from the protocol encapsulated in the VLAN header @vhdr.
 *
 * An encapsulated value accepted by eth_proto_is_802_3() is used as-is.
 * Otherwise the 16 bits that follow the VLAN header decide between
 * ETH_P_802_3 and ETH_P_802_2.
 */
static inline void vlan_set_encap_proto(struct sk_buff *skb,
                                        struct vlan_hdr *vhdr)
{
        __be16 encap = vhdr->h_vlan_encapsulated_proto;
        const unsigned short *next;

        /*
         * Was a VLAN packet: the encapsulated protocol is what the layer
         * three protocols care about.
         */
        if (eth_proto_is_802_3(encap)) {
                skb->protocol = encap;
                return;
        }

        /*
         * Magic hack to spot IPX packets. Older Novell breaks the protocol
         * design and runs IPX over 802.3 without an 802.2 LLC layer. FFFF
         * isn't a used 802.2 SSAP/DSAP, so it marks such packets. This won't
         * work for fault tolerant netware but does for the rest. Anything
         * else is real 802.2 LLC.
         */
        next = (const unsigned short *)(vhdr + 1);
        if (*next == 0xFFFF)
                skb->protocol = htons(ETH_P_802_3);
        else
                skb->protocol = htons(ETH_P_802_2);
}

/**
 * vlan_remove_tag - remove outer VLAN tag from payload
 * @skb: skbuff to remove tag from
 * @vlan_tci: buffer to store value
 *
 * Expects the skb to contain a VLAN tag in the payload, and to have skb->data
 * pointing at the MAC header.
 *
 * The TCI is stored in @vlan_tci in host byte order, the first 2 * ETH_ALEN
 * bytes at skb->data are moved VLAN_HLEN bytes forward, skb->protocol is
 * updated through vlan_set_encap_proto(), and VLAN_HLEN bytes are pulled
 * from the front of the skb.
 *
 * Returns: a new pointer to skb->data, or NULL on failure to pull.
 */
static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
{
        /* The VLAN header is taken to start ETH_HLEN bytes into skb->data. */
        struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);

        /*
         * Fetch the TCI before the memmove() below.
         * NOTE(review): the move writes up to skb->data + VLAN_HLEN +
         * 2 * ETH_ALEN, which presumably overlaps the start of *vhdr -
         * confirm against the struct vlan_hdr layout before reordering.
         */
        *vlan_tci = ntohs(vhdr->h_vlan_TCI);

        /* Source and destination may overlap, hence memmove(). */
        memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
        vlan_set_encap_proto(skb, vhdr);
        return __skb_pull(skb, VLAN_HLEN);
}

/**
 * skb_vlan_tagged - check if skb is vlan tagged.
 * @skb: skbuff to query
 *
 * An skb counts as tagged when skb_vlan_tag_present() reports a tag or when
 * @skb->protocol is a VLAN EtherType.
 *
 * Returns: true if the skb is tagged, regardless of whether it is hardware
 * accelerated or not.
 */
static inline bool skb_vlan_tagged(const struct sk_buff *skb)
{
        if (skb_vlan_tag_present(skb))
                return true;

        if (likely(!eth_type_vlan(skb->protocol)))
                return false;

        return true;
}

/**
 * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers.
 * @skb: skbuff to query
 *
 * Returns: true if the skb is tagged with multiple vlan headers, regardless
 * of whether it is hardware accelerated or not.
 */
static inline bool skb_vlan_tagged_multi(struct sk_buff *skb)
{
        __be16 protocol = skb->protocol;

        if (!skb_vlan_tag_present(skb)) {
                struct vlan_ethhdr *veh;

                if (likely(!eth_type_vlan(protocol)))
                        return false;

                if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                        return false;

                veh = skb_vlan_eth_hdr(skb);
                protocol = veh->h_vlan_encapsulated_proto;
        }

        if (!eth_type_vlan(protocol))
                return false;

        return true;
}

/**
 * vlan_features_check - drop unsafe features for skb with multiple tags.
 * @skb: skbuff to query
 * @features: features to be checked
 *
 * Returns: @features unchanged when skb_vlan_tagged_multi() does not report
 * multiple tags; otherwise @features without the unsafe ones.
 */
static inline netdev_features_t vlan_features_check(struct sk_buff *skb,
                                                    netdev_features_t features)
{
        if (!skb_vlan_tagged_multi(skb))
                return features;

        /* Multi-tagged packets are masked directly instead of going
         * through netdev_intersect_features(), to make sure that only
         * devices supporting NETIF_F_HW_CSUM will have checksum
         * offloading support.
         */
        return features & (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
                           NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX |
                           NETIF_F_HW_VLAN_STAG_TX);
}

/**
 * compare_vlan_header - Compare two vlan headers
 * @h1: Pointer to vlan header
 * @h2: Pointer to vlan header
 *
 * Compare two vlan headers.
 *
 * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits.
 *
 * Return: 0 if equal, arbitrary non-zero value if not equal.
 */
static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1,
                                                const struct vlan_hdr *h2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        return *(u32 *)h1 ^ *(u32 *)h2;
#else
        return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) |
               ((__force u32)h1->h_vlan_encapsulated_proto ^
                (__force u32)h2->h_vlan_encapsulated_proto);
#endif
}
#endif /* !(_LINUX_IF_VLAN_H_) */

















































































   42 






   42 





































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_RANDOM_H
#define _LINUX_RANDOM_H

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/list.h>

#include <uapi/linux/random.h>

struct notifier_block;

void add_device_randomness(const void *buf, size_t len);
void __init add_bootloader_randomness(const void *buf, size_t len);
void add_input_randomness(unsigned int type, unsigned int code,
                          unsigned int value) __latent_entropy;
void add_interrupt_randomness(int irq) __latent_entropy;
void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after);

/*
 * Pass the latent_entropy variable to add_device_randomness() when
 * LATENT_ENTROPY_PLUGIN is defined and __CHECKER__ is not; in every other
 * build call add_device_randomness() with a NULL buffer of length 0.
 */
static inline void add_latent_entropy(void)
{
#if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
        add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy));
#else
        add_device_randomness(NULL, 0);
#endif
}

/*
 * VM fork randomness/notifier API. Without CONFIG_VMGENID the register and
 * unregister helpers are inline stubs that do nothing and return 0, while
 * add_vmfork_randomness() is not declared at all.
 */
#if IS_ENABLED(CONFIG_VMGENID)
void add_vmfork_randomness(const void *unique_vm_id, size_t len);
int register_random_vmfork_notifier(struct notifier_block *nb);
int unregister_random_vmfork_notifier(struct notifier_block *nb);
#else
static inline int register_random_vmfork_notifier(struct notifier_block *nb) { return 0; }
static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { return 0; }
#endif

void get_random_bytes(void *buf, size_t len);
u8 get_random_u8(void);
u16 get_random_u16(void);
u32 get_random_u32(void);
u64 get_random_u64(void);
/*
 * Random unsigned long: taken from get_random_u64() when BITS_PER_LONG is 64
 * and from get_random_u32() otherwise.
 */
static inline unsigned long get_random_long(void)
{
#if BITS_PER_LONG == 64
        return get_random_u64();
#else
        return get_random_u32();
#endif
}

u32 __get_random_u32_below(u32 ceil);

/*
 * Returns a random integer in the interval [0, ceil), with uniform
 * distribution, suitable for all uses. Fastest when ceil is a constant, but
 * still fast for variable ceil as well.
 *
 * A compile-time constant ceil of 0 fails the build; a non-constant ceil is
 * handed to __get_random_u32_below() unchecked.
 */
static inline u32 get_random_u32_below(u32 ceil)
{
        /* Non-constant ceil: defer to the non-inline implementation. */
        if (!__builtin_constant_p(ceil))
                return __get_random_u32_below(ceil);

        /*
         * For the fast path, below, all operations on ceil are precomputed by
         * the compiler, so this incurs no overhead for checking pow2, doing
         * divisions, or branching based on integer size. The resultant
         * algorithm does traditional reciprocal multiplication (typically
         * optimized by the compiler into shifts and adds), rejecting samples
         * whose lower half would indicate a range indivisible by ceil.
         */
        BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0");
        /* [0, 1) only contains 0, so no random data is drawn. */
        if (ceil <= 1)
                return 0;
        /* Draw a fresh sample on every pass until one is accepted. */
        for (;;) {
                /* Use the narrowest random type whose range covers ceil. */
                if (ceil <= 1U << 8) {
                        u32 mult = ceil * get_random_u8();
                        if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil))
                                return mult >> 8;
                } else if (ceil <= 1U << 16) {
                        u32 mult = ceil * get_random_u16();
                        if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil))
                                return mult >> 16;
                } else {
                        u64 mult = (u64)ceil * get_random_u32();
                        /* In u32 arithmetic -ceil % ceil equals 2^32 % ceil. */
                        if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil))
                                return mult >> 32;
                }
        }
}

/*
 * Returns a random integer in the interval (floor, U32_MAX], with uniform
 * distribution, suitable for all uses. Fastest when floor is a constant, but
 * still fast for variable floor as well.
 *
 * A compile-time constant floor of U32_MAX fails the build.
 * NOTE(review): a non-constant floor equal to U32_MAX is not checked here and
 * passes 0 to get_random_u32_below(); what happens then depends on
 * __get_random_u32_below(), which is not visible in this header - confirm.
 */
static inline u32 get_random_u32_above(u32 floor)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && floor == U32_MAX,
                         "get_random_u32_above() must take floor < U32_MAX");
        /* U32_MAX - floor is the number of values in (floor, U32_MAX]. */
        return floor + 1 + get_random_u32_below(U32_MAX - floor);
}

/*
 * Returns a random integer in the interval [floor, ceil], with uniform
 * distribution, suitable for all uses. Fastest when floor and ceil are
 * constant, but still fast for variable floor and ceil as well.
 *
 * When both bounds are compile-time constants the build fails for
 * floor > ceil and also for the full range (ceil - floor == U32_MAX), where
 * ceil - floor + 1 would wrap to 0. Non-constant bounds are not checked.
 */
static inline u32 get_random_u32_inclusive(u32 floor, u32 ceil)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && __builtin_constant_p(ceil) &&
                         (floor > ceil || ceil - floor == U32_MAX),
                         "get_random_u32_inclusive() must take floor <= ceil");
        /* ceil - floor + 1 is the number of values in [floor, ceil]. */
        return floor + get_random_u32_below(ceil - floor + 1);
}

void __init random_init_early(const char *command_line);
void __init random_init(void);
bool rng_is_initialized(void);
int wait_for_random_bytes(void);
int execute_with_initialized_rng(struct notifier_block *nb);

/* Waits through wait_for_random_bytes(), then fills @buf with @nbytes bytes
 * from get_random_bytes(). The buffer is filled whatever the wait returned.
 * Returns the result of the call to wait_for_random_bytes(). */
static inline int get_random_bytes_wait(void *buf, size_t nbytes)
{
        int wait_ret;

        wait_ret = wait_for_random_bytes();
        get_random_bytes(buf, nbytes);

        return wait_ret;
}

#define declare_get_random_var_wait(name, ret_type) \
        static inline int get_random_ ## name ## _wait(ret_type *out) { \
                int ret = wait_for_random_bytes(); \
                if (unlikely(ret)) \
                        return ret; \
                *out = get_random_ ## name(); \
                return 0; \
        }
declare_get_random_var_wait(u8, u8)
declare_get_random_var_wait(u16, u16)
declare_get_random_var_wait(u32, u32)
declare_get_random_var_wait(u64, u32)
declare_get_random_var_wait(long, unsigned long)
#undef declare_get_random_var

#ifdef CONFIG_SMP
int random_prepare_cpu(unsigned int cpu);
int random_online_cpu(unsigned int cpu);
#endif

#ifndef MODULE
extern const struct file_operations random_fops, urandom_fops;
#endif

#endif /* _LINUX_RANDOM_H */






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    6 






   45 








































































































































   45 





















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
 *
 * (C) SGI 2006, Christoph Lameter
 *         Cleaned up and restructured to ease the addition of alternative
 *         implementations of SLAB allocators.
 * (C) Linux Foundation 2008-2013
 *      Unified interface for all slab allocators
 */

#ifndef _LINUX_SLAB_H
#define        _LINUX_SLAB_H

#include <linux/cache.h>
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>
#include <linux/cleanup.h>
#include <linux/hash.h>

/*
 * Bit numbers for the slab cache flags; the SLAB_* flag macros in this
 * header turn them into masks with __SLAB_FLAG_BIT().
 *
 * Entries inside #ifdef/#ifndef exist only in matching configurations, so
 * the numeric value of every later entry depends on the kernel config.
 * _SLAB_FLAGS_LAST_BIT is the final enumerator and therefore equals the
 * number of flag bits defined for the current configuration.
 */
enum _slab_flag_bits {
        _SLAB_CONSISTENCY_CHECKS,
        _SLAB_RED_ZONE,
        _SLAB_POISON,
        _SLAB_KMALLOC,
        _SLAB_HWCACHE_ALIGN,
        _SLAB_CACHE_DMA,
        _SLAB_CACHE_DMA32,
        _SLAB_STORE_USER,
        _SLAB_PANIC,
        _SLAB_TYPESAFE_BY_RCU,
        _SLAB_TRACE,
#ifdef CONFIG_DEBUG_OBJECTS
        _SLAB_DEBUG_OBJECTS,
#endif
        _SLAB_NOLEAKTRACE,
        _SLAB_NO_MERGE,
#ifdef CONFIG_FAILSLAB
        _SLAB_FAILSLAB,
#endif
#ifdef CONFIG_MEMCG
        _SLAB_ACCOUNT,
#endif
#ifdef CONFIG_KASAN_GENERIC
        _SLAB_KASAN,
#endif
        _SLAB_NO_USER_FLAGS,
#ifdef CONFIG_KFENCE
        _SLAB_SKIP_KFENCE,
#endif
#ifndef CONFIG_SLUB_TINY
        _SLAB_RECLAIM_ACCOUNT,
#endif
        _SLAB_OBJECT_POISON,
        _SLAB_CMPXCHG_DOUBLE,
#ifdef CONFIG_SLAB_OBJ_EXT
        _SLAB_NO_OBJ_EXT,
#endif
        _SLAB_FLAGS_LAST_BIT
};

/* Turn an enum _slab_flag_bits value into its single-bit slab_flags_t mask. */
#define __SLAB_FLAG_BIT(nr)        ((slab_flags_t __force)(1U << (nr)))
/* All-zero slab_flags_t, used for flags whose config option is disabled. */
#define __SLAB_FLAG_UNUSED        ((slab_flags_t __force)(0U))

/*
 * Flags to pass to kmem_cache_create().
 * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op
 */
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS        __SLAB_FLAG_BIT(_SLAB_CONSISTENCY_CHECKS)
/* DEBUG: Red zone objs in a cache */
#define SLAB_RED_ZONE                __SLAB_FLAG_BIT(_SLAB_RED_ZONE)
/* DEBUG: Poison objects */
#define SLAB_POISON                __SLAB_FLAG_BIT(_SLAB_POISON)
/* Indicate a kmalloc slab */
#define SLAB_KMALLOC                __SLAB_FLAG_BIT(_SLAB_KMALLOC)
/**
 * define SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
 *
 * Sufficiently large objects are aligned on cache line boundary. For object
 * size smaller than a half of cache line size, the alignment is on the half of
 * cache line size. In general, if object size is smaller than 1/2^n of cache
 * line size, the alignment is adjusted to 1/2^n.
 *
 * If explicit alignment is also requested by the respective
 * &struct kmem_cache_args field, the greater of both alignments is applied.
 */
#define SLAB_HWCACHE_ALIGN        __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN)
/* Use GFP_DMA memory */
#define SLAB_CACHE_DMA                __SLAB_FLAG_BIT(_SLAB_CACHE_DMA)
/* Use GFP_DMA32 memory */
#define SLAB_CACHE_DMA32        __SLAB_FLAG_BIT(_SLAB_CACHE_DMA32)
/* DEBUG: Store the last owner for bug hunting */
#define SLAB_STORE_USER                __SLAB_FLAG_BIT(_SLAB_STORE_USER)
/* Panic if kmem_cache_create() fails */
#define SLAB_PANIC                __SLAB_FLAG_BIT(_SLAB_PANIC)
/**
 * define SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
 *
 * This delays freeing the SLAB page by a grace period, it does _NOT_
 * delay object freeing. This means that if you do kmem_cache_free()
 * that memory location is free to be reused at any time. Thus it may
 * be possible to see another object there in the same RCU grace period.
 *
 * This feature only ensures the memory location backing the object
 * stays valid, the trick to using this is relying on an independent
 * object validation pass. Something like:
 *
 * ::
 *
 *  begin:
 *   rcu_read_lock();
 *   obj = lockless_lookup(key);
 *   if (obj) {
 *     if (!try_get_ref(obj)) { // might fail for free objects
 *       rcu_read_unlock();
 *       goto begin;
 *     }
 *
 *     if (obj->key != key) { // not the object we expected
 *       put_ref(obj);
 *       rcu_read_unlock();
 *       goto begin;
 *     }
 *   }
 *  rcu_read_unlock();
 *
 * This is useful if we need to approach a kernel structure obliquely,
 * from its address obtained without the usual locking. We can lock
 * the structure to stabilize it and check it's still at the given address,
 * only if we can be sure that the memory has not been meanwhile reused
 * for some other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * Note that object identity check has to be done *after* acquiring a
 * reference, therefore user has to ensure proper ordering for loads.
 * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU,
 * the newly allocated object has to be fully initialized *before* its
 * refcount gets initialized and proper ordering for stores is required.
 * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are
 * designed with the proper fences required for reference counting objects
 * allocated with SLAB_TYPESAFE_BY_RCU.
 *
 * Note that it is not possible to acquire a lock within a structure
 * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference
 * as described above.  The reason is that SLAB_TYPESAFE_BY_RCU pages
 * are not zeroed before being given to the slab, which means that any
 * locks must be initialized after each and every kmem_cache_alloc().
 * Alternatively, make the ctor passed to kmem_cache_create() initialize
 * the locks at page-allocation time, as is done in __i915_request_ctor(),
 * sighand_ctor(), and anon_vma_ctor().  Such a ctor permits readers
 * to safely acquire those ctor-initialized locks under rcu_read_lock()
 * protection.
 *
 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
 */
#define SLAB_TYPESAFE_BY_RCU        __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU)
/* Trace allocations and frees */
#define SLAB_TRACE                __SLAB_FLAG_BIT(_SLAB_TRACE)

/* Flag to prevent checks on free */
#ifdef CONFIG_DEBUG_OBJECTS
# define SLAB_DEBUG_OBJECTS        __SLAB_FLAG_BIT(_SLAB_DEBUG_OBJECTS)
#else
# define SLAB_DEBUG_OBJECTS        __SLAB_FLAG_UNUSED
#endif

/* Avoid kmemleak tracing */
#define SLAB_NOLEAKTRACE        __SLAB_FLAG_BIT(_SLAB_NOLEAKTRACE)

/*
 * Prevent merging with compatible kmem caches. This flag should be used
 * cautiously. Valid use cases:
 *
 * - caches created for self-tests (e.g. kunit)
 * - general caches created and used by a subsystem, only when a
 *   (subsystem-specific) debug option is enabled
 * - performance critical caches, should be very rare and consulted with slab
 *   maintainers, and not used together with CONFIG_SLUB_TINY
 */
#define SLAB_NO_MERGE                __SLAB_FLAG_BIT(_SLAB_NO_MERGE)

/* Fault injection mark */
#ifdef CONFIG_FAILSLAB
# define SLAB_FAILSLAB                __SLAB_FLAG_BIT(_SLAB_FAILSLAB)
#else
# define SLAB_FAILSLAB                __SLAB_FLAG_UNUSED
#endif
/**
 * define SLAB_ACCOUNT - Account allocations to memcg.
 *
 * All object allocations from this cache will be memcg accounted, regardless of
 * __GFP_ACCOUNT being or not being passed to individual allocations.
 */
#ifdef CONFIG_MEMCG
# define SLAB_ACCOUNT                __SLAB_FLAG_BIT(_SLAB_ACCOUNT)
#else
# define SLAB_ACCOUNT                __SLAB_FLAG_UNUSED
#endif

/*
 * SLAB_KASAN only has a flag bit with CONFIG_KASAN_GENERIC; otherwise it
 * expands to the all-zero __SLAB_FLAG_UNUSED.
 */
#ifdef CONFIG_KASAN_GENERIC
#define SLAB_KASAN                __SLAB_FLAG_BIT(_SLAB_KASAN)
#else
#define SLAB_KASAN                __SLAB_FLAG_UNUSED
#endif

/*
 * Ignore user specified debugging flags.
 * Intended for caches created for self-tests so they have only flags
 * specified in the code and other flags are ignored.
 */
#define SLAB_NO_USER_FLAGS        __SLAB_FLAG_BIT(_SLAB_NO_USER_FLAGS)

/*
 * SLAB_SKIP_KFENCE only has a flag bit with CONFIG_KFENCE; otherwise it
 * expands to the all-zero __SLAB_FLAG_UNUSED.
 */
#ifdef CONFIG_KFENCE
#define SLAB_SKIP_KFENCE        __SLAB_FLAG_BIT(_SLAB_SKIP_KFENCE)
#else
#define SLAB_SKIP_KFENCE        __SLAB_FLAG_UNUSED
#endif

/* The following flags affect the page allocator grouping pages by mobility */
/**
 * define SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
 *
 * Use this flag for caches that have an associated shrinker. As a result, slab
 * pages are allocated with __GFP_RECLAIMABLE, which affects grouping pages by
 * mobility, and are accounted in SReclaimable counter in /proc/meminfo
 */
#ifndef CONFIG_SLUB_TINY
#define SLAB_RECLAIM_ACCOUNT        __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT)
#else
#define SLAB_RECLAIM_ACCOUNT        __SLAB_FLAG_UNUSED
#endif
#define SLAB_TEMPORARY                SLAB_RECLAIM_ACCOUNT        /* Objects are short-lived */

/* Slab created using create_boot_cache */
#ifdef CONFIG_SLAB_OBJ_EXT
#define SLAB_NO_OBJ_EXT                __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
#else
#define SLAB_NO_OBJ_EXT                __SLAB_FLAG_UNUSED
#endif

/*
 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
 *
 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
 *
 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
 * Both make kfree a no-op.
 */
#define ZERO_SIZE_PTR ((void *)16)

/*
 * True for NULL and ZERO_SIZE_PTR - and for any other pointer whose integer
 * value does not exceed that of ZERO_SIZE_PTR.
 */
#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
                                (unsigned long)ZERO_SIZE_PTR)

#include <linux/kasan.h>

struct list_lru;
struct mem_cgroup;
/*
 * struct kmem_cache related prototypes
 */
bool slab_is_available(void);

/**
 * struct kmem_cache_args - Less common arguments for kmem_cache_create()
 *
 * Any uninitialized fields of the structure are interpreted as unused. The
 * exception is @freeptr_offset where %0 is a valid value, so
 * @use_freeptr_offset must be also set to %true in order to interpret the field
 * as used. For @useroffset %0 is also valid, but only with non-%0
 * @usersize.
 *
 * When %NULL args is passed to kmem_cache_create(), it is equivalent to all
 * fields unused.
 */
struct kmem_cache_args {
        /**
         * @align: The required alignment for the objects.
         *
         * %0 means no specific alignment is requested.
         */
        unsigned int align;
        /**
         * @useroffset: Usercopy region offset.
         *
         * %0 is a valid offset, when @usersize is non-%0.
         */
        unsigned int useroffset;
        /**
         * @usersize: Usercopy region size.
         *
         * %0 means no usercopy region is specified.
         */
        unsigned int usersize;
        /**
         * @freeptr_offset: Custom offset for the free pointer
         * in &SLAB_TYPESAFE_BY_RCU caches
         *
         * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
         * outside of the object. This might cause the object to grow in size.
         * Cache creators that have a reason to avoid this can specify a custom
         * free pointer offset in their struct where the free pointer will be
         * placed.
         *
         * Note that placing the free pointer inside the object requires the
         * caller to ensure that no fields are invalidated that are required to
         * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
         * details).
         *
         * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
         * is specified, @use_freeptr_offset must be set to %true.
         *
         * Note that @ctor currently isn't supported with custom free pointers
         * as a @ctor requires an external free pointer.
         */
        unsigned int freeptr_offset;
        /**
         * @use_freeptr_offset: Whether a @freeptr_offset is used.
         *
         * Needed because %0 is a valid @freeptr_offset and therefore cannot
         * double as "unset".
         */
        bool use_freeptr_offset;
        /**
         * @ctor: A constructor for the objects.
         *
         * The constructor is invoked for each object in a newly allocated slab
         * page. It is the cache user's responsibility to free objects in the
         * same state as after calling the constructor, or deal appropriately
         * with any differences between a freshly constructed and a reallocated
         * object.
         *
         * %NULL means no constructor.
         */
        void (*ctor)(void *);
        /**
         * @sheaf_capacity: Enable sheaves of given capacity for the cache.
         *
         * With a non-zero value, allocations from the cache go through caching
         * arrays called sheaves. Each cpu has a main sheaf that's always
         * present, and a spare sheaf that may not be present. When both become
         * empty, there's an attempt to replace an empty sheaf with a full sheaf
         * from the per-node barn.
         *
         * When no full sheaf is available, and gfp flags allow blocking, a
         * sheaf is allocated and filled from slab(s) using bulk allocation.
         * Otherwise the allocation falls back to the normal operation
         * allocating a single object from a slab.
         *
         * Analogously, when freeing while both percpu sheaves are full, the
         * barn may replace a full sheaf with an empty one, unless it's over
         * capacity. In that case a sheaf is bulk freed to slab pages.
         *
         * The sheaves do not enforce NUMA placement of objects, so allocations
         * via kmem_cache_alloc_node() with a node specified other than
         * NUMA_NO_NODE will bypass them.
         *
         * Bulk allocation and free operations also try to use the cpu sheaves
         * and barn, but fall back to using slab pages directly.
         *
         * When slub_debug is enabled for the cache, the sheaf_capacity argument
         * is ignored.
         *
         * %0 means no sheaves will be created.
         */
        unsigned int sheaf_capacity;
};

struct kmem_cache *__kmem_cache_create_args(const char *name,
                                            unsigned int object_size,
                                            struct kmem_cache_args *args,
                                            slab_flags_t flags);

/*
 * Legacy five-argument entry point behind kmem_cache_create(): packs @align
 * and @ctor into a struct kmem_cache_args, leaves every other field zeroed
 * (i.e. unused), and forwards to __kmem_cache_create_args().
 */
static inline struct kmem_cache *
__kmem_cache_create(const char *name, unsigned int size, unsigned int align,
                    slab_flags_t flags, void (*ctor)(void *))
{
        struct kmem_cache_args legacy_args = {};

        legacy_args.align = align;
        legacy_args.ctor = ctor;

        return __kmem_cache_create_args(name, size, &legacy_args, flags);
}

/**
 * kmem_cache_create_usercopy - Create a kmem cache with a region suitable
 * for copying to userspace.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @useroffset: Usercopy region offset
 * @usersize: Usercopy region size
 * @ctor: A constructor for the objects, or %NULL.
 *
 * This is a legacy wrapper, new code should use either KMEM_CACHE_USERCOPY()
 * if whitelisting a single field is sufficient, or kmem_cache_create() with
 * the necessary parameters passed via the args parameter (see
 * &struct kmem_cache_args)
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
static inline struct kmem_cache *
kmem_cache_create_usercopy(const char *name, unsigned int size,
                           unsigned int align, slab_flags_t flags,
                           unsigned int useroffset, unsigned int usersize,
                           void (*ctor)(void *))
{
        /* Start from "all fields unused" and fill in only what was passed. */
        struct kmem_cache_args usercopy_args = {};

        usercopy_args.align = align;
        usercopy_args.useroffset = useroffset;
        usercopy_args.usersize = usersize;
        usercopy_args.ctor = ctor;

        return __kmem_cache_create_args(name, size, &usercopy_args, flags);
}

/*
 * Variant selected by kmem_cache_create() when @args has type ``void *``,
 * which is what a literal NULL looks like. All optional arguments take their
 * defaults (a zeroed struct kmem_cache_args).
 *
 * A non-NULL @args of that type cannot be interpreted, so it is rejected with
 * a one-time warning and ERR_PTR(-EINVAL).
 */
static inline struct kmem_cache *
__kmem_cache_default_args(const char *name, unsigned int size,
                          struct kmem_cache_args *args,
                          slab_flags_t flags)
{
        struct kmem_cache_args all_unused = {};
        struct kmem_cache *cache;

        /* Make sure we don't get passed garbage. */
        if (WARN_ON_ONCE(args))
                cache = ERR_PTR(-EINVAL);
        else
                cache = __kmem_cache_create_args(name, size, &all_unused, flags);

        return cache;
}

/**
 * kmem_cache_create - Create a kmem cache.
 * @__name: A string which is used in /proc/slabinfo to identify this cache.
 * @__object_size: The size of objects to be created in this cache.
 * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL
 *            means defaults will be used for all the arguments.
 *
 * This is currently implemented as a macro using ``_Generic()`` to call
 * either the new variant of the function, or a legacy one. The selection is
 * made on the static type of @__args; the remaining arguments are forwarded
 * unchanged.
 *
 * The new variant has 4 parameters:
 * ``kmem_cache_create(name, object_size, args, flags)``
 *
 * See __kmem_cache_create_args() which implements this.
 *
 * The legacy variant has 5 parameters:
 * ``kmem_cache_create(name, object_size, align, flags, ctor)``
 *
 * The align and ctor parameters map to the respective fields of
 * &struct kmem_cache_args
 *
 * Context: Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure. If @__args is a
 * non-%NULL ``void *``, the default-arguments helper warns once and returns
 * ERR_PTR(-EINVAL) instead.
 */
#define kmem_cache_create(__name, __object_size, __args, ...)           \
        _Generic((__args),                                              \
                struct kmem_cache_args *: __kmem_cache_create_args,        \
                void *: __kmem_cache_default_args,                        \
                default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__)

/*
 * Cache teardown and shrinking; only prototypes live in this header.
 * NOTE(review): exact semantics of the int returned by kmem_cache_shrink()
 * are not visible here - confirm against the implementation.
 */
void kmem_cache_destroy(struct kmem_cache *s);
int kmem_cache_shrink(struct kmem_cache *s);

/*
 * Please use this macro to create slab caches. Simply specify the
 * name of the structure and maybe some flags that are listed above.
 *
 * The cache is named after the structure (the stringified struct tag), its
 * object size is sizeof(struct __struct), and all optional arguments other
 * than the alignment are left unused.
 *
 * The alignment of the struct determines object alignment. If you
 * e.g. add ____cacheline_aligned_in_smp to the struct declaration
 * then the objects will be properly aligned in SMP configurations.
 */
#define KMEM_CACHE(__struct, __flags)                                   \
        __kmem_cache_create_args(#__struct, sizeof(struct __struct),    \
                        &(struct kmem_cache_args) {                        \
                                .align        = __alignof__(struct __struct), \
                        }, (__flags))

/*
 * To whitelist a single field for copying to/from userspace, use this
 * macro instead of KMEM_CACHE() above.
 *
 * The usercopy region is exactly @__field: its offset within the struct and
 * its size are taken via offsetof() and sizeof_field().
 */
#define KMEM_CACHE_USERCOPY(__struct, __flags, __field)                                                \
        __kmem_cache_create_args(#__struct, sizeof(struct __struct),                                \
                        &(struct kmem_cache_args) {                                                \
                                .align                = __alignof__(struct __struct),                        \
                                .useroffset        = offsetof(struct __struct, __field),                \
                                .usersize        = sizeof_field(struct __struct, __field),        \
                        }, (__flags))

/*
 * Common kmalloc functions provided by all allocators
 *
 * krealloc_node_align_noprof() is the single out-of-line entry point; the
 * macros below only fill in defaults:
 *  - krealloc_noprof():     align 1, NUMA_NO_NODE, no alloc_hooks() wrapping
 *  - krealloc_node_align(): alloc_hooks() wrapper, all arguments explicit
 *  - krealloc_node():       align 1, explicit node
 *  - krealloc():            align 1, NUMA_NO_NODE
 */
void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size,
                                               unsigned long align,
                                               gfp_t flags, int nid) __realloc_size(2);
#define krealloc_noprof(_o, _s, _f)        krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE)
#define krealloc_node_align(...)        alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__))
#define krealloc_node(_o, _s, _f, _n)        krealloc_node_align(_o, _s, 1, _f, _n)
#define krealloc(...)                        krealloc_node(__VA_ARGS__, NUMA_NO_NODE)

/* Freeing primitives; only prototypes live in this header. */
void kfree(const void *objp);
void kfree_nolock(const void *objp);
void kfree_sensitive(const void *objp);
size_t __ksize(const void *objp);

/*
 * Scope-based cleanup helpers. The kfree variant skips both NULL and
 * error-pointer values (IS_ERR_OR_NULL); the kfree_sensitive variant only
 * skips NULL.
 */
DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T))

/**
 * ksize - Report actual allocation size of associated object
 *
 * @objp: Pointer returned from a prior kmalloc()-family allocation.
 *
 * This should not be used for writing beyond the originally requested
 * allocation size. Either use krealloc() or round up the allocation size
 * with kmalloc_size_roundup() prior to allocation. If this is used to
 * access beyond the originally requested allocation size, UBSAN_BOUNDS
 * and/or FORTIFY_SOURCE may trip, since they only know about the
 * originally allocated size via the __alloc_size attribute.
 *
 * Return: the actual allocation size of the object @objp points to.
 */
size_t ksize(const void *objp);

#ifdef CONFIG_PRINTK
bool kmem_dump_obj(void *object);
#else
/* Stub used when CONFIG_PRINTK is off: always returns false. */
static inline bool kmem_dump_obj(void *object)
{
        return false;
}
#endif

/*
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * Setting ARCH_DMA_MINALIGN in arch headers allows that.
 */
/*
 * If the arch declares a DMA minimum alignment above 8 and did not pick its
 * own kmalloc alignment, the DMA alignment becomes the kmalloc alignment.
 */
#ifdef ARCH_HAS_DMA_MINALIGN
#if ARCH_DMA_MINALIGN > 8 && !defined(ARCH_KMALLOC_MINALIGN)
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#endif
#endif

/*
 * Default: alignment of unsigned long long. When an alignment above 8 was
 * chosen instead, it also becomes the minimum kmalloc size, and
 * KMALLOC_SHIFT_LOW is derived from it.
 */
#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#elif ARCH_KMALLOC_MINALIGN > 8
#define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE)
#endif

/*
 * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
 * Intended for arches that get misalignment faults even for 64 bit integer
 * aligned buffers.
 */
#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * An arch that needs to pick the minimum slab alignment at runtime can
 * provide its own arch_slab_minalign(). Whatever it returns must be a power
 * of two and must not be smaller than ARCH_SLAB_MINALIGN.
 *
 * The fallback below simply reports the compile-time constant.
 */
#ifndef arch_slab_minalign
static inline unsigned int arch_slab_minalign(void)
{
        const unsigned int min_align = ARCH_SLAB_MINALIGN;

        return min_align;
}
#endif

/*
 * kmem_cache_alloc and friends return pointers aligned to ARCH_SLAB_MINALIGN.
 * kmalloc and friends return pointers aligned to both ARCH_KMALLOC_MINALIGN
 * and ARCH_SLAB_MINALIGN, but here we only assume the former alignment.
 */
/* Return-value alignment annotations attached to the allocator prototypes. */
#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
#define __assume_page_alignment __assume_aligned(PAGE_SIZE)

/*
 * Kmalloc array related definitions
 */

/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
/* log2 of the largest size served from a kmalloc slab cache (PAGE_SIZE * 2). */
#define KMALLOC_SHIFT_HIGH        (PAGE_SHIFT + 1)
/* log2 of the largest size kmalloc accepts at all. */
#define KMALLOC_SHIFT_MAX        (MAX_PAGE_ORDER + PAGE_SHIFT)
/* log2 of the smallest kmalloc size, unless already derived from the arch alignment. */
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        3
#endif

/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER        (KMALLOC_SHIFT_MAX - PAGE_SHIFT)

/*
 * Kmalloc subsystem.
 *
 * KMALLOC_MIN_SIZE defaults to 2^KMALLOC_SHIFT_LOW unless it was already
 * defined from the arch kmalloc alignment.
 */
#ifndef KMALLOC_MIN_SIZE
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif

/*
 * This restriction comes from the byte sized index implementation.
 * Page size is normally 2^12 bytes and, in this case, if we want to use a
 * byte sized index which can represent 2^8 entries, the size of the object
 * should be equal to or greater than 2^12 / 2^8 = 2^4 = 16.
 * If the minimum size of kmalloc is less than 16, we use it as the minimum
 * object size and give up on using a byte sized index.
 */
#define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? \
                               (KMALLOC_MIN_SIZE) : 16)

/* Number of extra kmalloc cache copies; 0 when the feature is disabled. */
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
#define RANDOM_KMALLOC_CACHES_NR        15 // # of cache copies
#else
#define RANDOM_KMALLOC_CACHES_NR        0
#endif

/*
 * Whenever changing this, take care of that kmalloc_type() and
 * create_kmalloc_caches() still work as intended.
 *
 * KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP
 * is for accounted but unreclaimable and non-dma objects. All the other
 * kmem caches can have both accounted and unaccounted objects.
 */
enum kmalloc_cache_type {
        KMALLOC_NORMAL = 0,
        /* Types whose config option is off collapse onto KMALLOC_NORMAL. */
#ifndef CONFIG_ZONE_DMA
        KMALLOC_DMA = KMALLOC_NORMAL,
#endif
#ifndef CONFIG_MEMCG
        KMALLOC_CGROUP = KMALLOC_NORMAL,
#endif
        /*
         * The random copies start at KMALLOC_NORMAL itself and span
         * RANDOM_KMALLOC_CACHES_NR further values; with the feature off
         * (NR == 0) both START and END equal KMALLOC_NORMAL.
         */
        KMALLOC_RANDOM_START = KMALLOC_NORMAL,
        KMALLOC_RANDOM_END = KMALLOC_RANDOM_START + RANDOM_KMALLOC_CACHES_NR,
#ifdef CONFIG_SLUB_TINY
        KMALLOC_RECLAIM = KMALLOC_NORMAL,
#else
        /* Implicit value: KMALLOC_RANDOM_END + 1. */
        KMALLOC_RECLAIM,
#endif
        /* Enabled types get their own values after the ones above. */
#ifdef CONFIG_ZONE_DMA
        KMALLOC_DMA,
#endif
#ifdef CONFIG_MEMCG
        KMALLOC_CGROUP,
#endif
        /* Number of distinct values; used to size kmalloc_caches[]. */
        NR_KMALLOC_TYPES
};

/*
 * One set of kmalloc size buckets: an array of cache pointers with
 * KMALLOC_SHIFT_HIGH + 1 slots (valid indices 0 .. KMALLOC_SHIFT_HIGH).
 */
typedef struct kmem_cache * kmem_buckets[KMALLOC_SHIFT_HIGH + 1];

/* One bucket set per kmalloc cache type, indexed by enum kmalloc_cache_type. */
extern kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES];

/*
 * Define gfp bits that should not be set for KMALLOC_NORMAL.
 *
 * __GFP_DMA and __GFP_ACCOUNT only take part when CONFIG_ZONE_DMA and
 * CONFIG_MEMCG respectively are enabled; otherwise they contribute 0.
 */
#define KMALLOC_NOT_NORMAL_BITS                                        \
        (__GFP_RECLAIMABLE |                                        \
        (IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |        \
        (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0))

/* Seed mixed with the caller address in kmalloc_type(); defined outside this header. */
extern unsigned long random_kmalloc_seed;

/*
 * Map gfp @flags (and, with random kmalloc caches, the @caller address) to
 * the kmalloc cache type that should serve the allocation.
 */
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigned long caller)
{
        /*
         * KMALLOC_NORMAL is by far the most frequent outcome, so all the
         * flags that could rule it out are tested with one branch.
         */
        if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) {
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
                /* RANDOM_KMALLOC_CACHES_NR (=15) copies + the KMALLOC_NORMAL */
                return KMALLOC_RANDOM_START + hash_64(caller ^ random_kmalloc_seed,
                                                      ilog2(RANDOM_KMALLOC_CACHES_NR + 1));
#else
                return KMALLOC_NORMAL;
#endif
        }

        /*
         * From here on at least one special flag is set. They are honoured
         * in this order of decreasing priority:
         *  1) __GFP_DMA
         *  2) __GFP_RECLAIMABLE
         *  3) __GFP_ACCOUNT
         */
        if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
                return KMALLOC_DMA;

        /* Only an accounted, non-reclaimable request ends up in KMALLOC_CGROUP. */
        if (IS_ENABLED(CONFIG_MEMCG) && !(flags & __GFP_RECLAIMABLE))
                return KMALLOC_CGROUP;

        return KMALLOC_RECLAIM;
}

/*
 * Figure out which kmalloc slab an allocation of a certain size
 * belongs to.
 * 0 = zero alloc
 * 1 =  65 .. 96 bytes
 * 2 = 129 .. 192 bytes
 * n = 2^(n-1)+1 .. 2^n
 *
 * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized;
 * typical usage is via kmalloc_index() and therefore evaluated at compile-time.
 * Callers where !size_is_constant should only be test modules, where runtime
 * overheads of __kmalloc_index() can be tolerated.  Also see kmalloc_slab().
 */
/*
 * @size: requested allocation size in bytes.
 * @size_is_constant: true when the caller guarantees @size is a compile-time
 *                    constant; selects a build-time error instead of BUG()
 *                    for sizes beyond the table.
 *
 * Returns the kmalloc bucket index for @size, 0 for a zero size.
 */
static __always_inline unsigned int __kmalloc_index(size_t size,
                                                    bool size_is_constant)
{
        if (!size)
                return 0;

        /* Everything up to the minimum size shares the smallest bucket. */
        if (size <= KMALLOC_MIN_SIZE)
                return KMALLOC_SHIFT_LOW;

        /*
         * Indices 1 and 2 are the two non-power-of-two buckets (96 and 192
         * bytes). They are only considered when KMALLOC_MIN_SIZE is small
         * enough (<= 32 and <= 64 respectively).
         */
        if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
                return 1;
        if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
                return 2;
        /* Power-of-two buckets: index n covers sizes up to 2^n. */
        if (size <=          8) return 3;
        if (size <=         16) return 4;
        if (size <=         32) return 5;
        if (size <=         64) return 6;
        if (size <=        128) return 7;
        if (size <=        256) return 8;
        if (size <=        512) return 9;
        if (size <=       1024) return 10;
        if (size <=   2 * 1024) return 11;
        if (size <=   4 * 1024) return 12;
        if (size <=   8 * 1024) return 13;
        if (size <=  16 * 1024) return 14;
        if (size <=  32 * 1024) return 15;
        if (size <=  64 * 1024) return 16;
        if (size <= 128 * 1024) return 17;
        if (size <= 256 * 1024) return 18;
        if (size <= 512 * 1024) return 19;
        if (size <= 1024 * 1024) return 20;
        if (size <=  2 * 1024 * 1024) return 21;

        /*
         * Size is beyond the table: fail the build for constant sizes
         * (unless CONFIG_PROFILE_ALL_BRANCHES is enabled), otherwise BUG()
         * at runtime.
         */
        if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
                BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
        else
                BUG();

        /* Will never be reached. Needed because the compiler may complain */
        return -1;
}
/*
 * The table above ends at index 21, which matches KMALLOC_SHIFT_HIGH
 * (PAGE_SHIFT + 1) only for PAGE_SHIFT up to 20.
 */
static_assert(PAGE_SHIFT <= 20);
/* Compile-time flavour: asserts that the size is a constant. */
#define kmalloc_index(s) __kmalloc_index(s, true)

#include <linux/alloc_tag.h>

/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.
 * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags.
 *
 * Return: pointer to the new object or %NULL in case of error
 */
void *kmem_cache_alloc_noprof(struct kmem_cache *cachep,
                              gfp_t flags) __assume_slab_alignment __malloc;
#define kmem_cache_alloc(...)                        alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__))

/*
 * Variant of kmem_cache_alloc() that additionally takes a struct list_lru.
 * NOTE(review): how @lru is used is not visible in this header - confirm
 * against the implementation.
 */
void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
                            gfp_t gfpflags) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_lru(...)        alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))

/**
 * kmem_cache_charge - memcg charge an already allocated slab memory
 * @objp: address of the slab object to memcg charge
 * @gfpflags: describe the allocation context
 *
 * kmem_cache_charge allows charging a slab object to the current memcg,
 * primarily in cases where charging at allocation time might not be possible
 * because the target memcg is not known (i.e. softirq context)
 *
 * The objp should be pointer returned by the slab allocator functions like
 * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge
 * behavior can be controlled through gfpflags parameter, which affects how the
 * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes
 * that overcharging is requested instead of failure, but is not applied for the
 * internal metadata allocation.
 *
 * There are several cases where it will return true even if the charging was
 * not done:
 * More specifically:
 *
 * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems.
 * 2. Already charged slab objects.
 * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc()
 *    without __GFP_ACCOUNT
 * 4. Allocating internal metadata has failed
 *
 * Return: true if charge was successful otherwise false.
 */
bool kmem_cache_charge(void *objp, gfp_t gfpflags);
/* Counterpart of kmem_cache_alloc(): takes the cache @s and the object @objp. */
void kmem_cache_free(struct kmem_cache *s, void *objp);

/*
 * Creates a kmem_buckets set. The parameters mirror the name, flags,
 * usercopy region and constructor arguments used for single caches.
 * NOTE(review): failure return value is not visible here - confirm.
 */
kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
                                  unsigned int useroffset, unsigned int usersize,
                                  void (*ctor)(void *));

/*
 * Bulk allocation and freeing operations. These are accelerated in an
 * allocator specific way to avoid taking locks repeatedly or building
 * metadata structures unnecessarily.
 *
 * Note that interrupts must be enabled when calling these functions.
 */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);

int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
#define kmem_cache_alloc_bulk(...)        alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__))

/*
 * Bulk-free @size pointers from the array @p without naming a cache: this is
 * kmem_cache_free_bulk() called with a NULL cache argument.
 */
static __always_inline void kfree_bulk(size_t size, void **p)
{
        struct kmem_cache *no_cache = NULL;

        kmem_cache_free_bulk(no_cache, size, p);
}

/* Node-aware variant of kmem_cache_alloc(); @node selects the NUMA node. */
void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
                                   int node) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_node(...)        alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))

/*
 * Sheaf API. Only the prototypes are visible here.
 * NOTE(review): judging by the names, a sheaf is prefilled, optionally
 * refilled, allocated from, and finally returned to the cache - confirm the
 * exact contract (ownership, error returns, meaning of @size) against the
 * implementation.
 */
struct slab_sheaf *
kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size);

int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
                struct slab_sheaf **sheafp, unsigned int size);

void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
                                       struct slab_sheaf *sheaf);

void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp,
                        struct slab_sheaf *sheaf) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_from_sheaf(...)        \
                        alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__))

unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf);

/*
 * These macros allow declaring a kmem_buckets * parameter alongside size, which
 * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call
 * sites don't have to pass NULL.
 */
#ifdef CONFIG_SLAB_BUCKETS
/* Buckets enabled: declare/pass both the size and the kmem_buckets pointer. */
#define DECL_BUCKET_PARAMS(_size, _b)        size_t (_size), kmem_buckets *(_b)
#define PASS_BUCKET_PARAMS(_size, _b)        (_size), (_b)
#define PASS_BUCKET_PARAM(_b)                (_b)
#else
/* Buckets disabled: the bucket parameter vanishes; a lone bucket argument becomes NULL. */
#define DECL_BUCKET_PARAMS(_size, _b)        size_t (_size)
#define PASS_BUCKET_PARAMS(_size, _b)        (_size)
#define PASS_BUCKET_PARAM(_b)                NULL
#endif

/*
 * The following functions are not to be used directly and are intended only
 * for internal use from kmalloc() and kmalloc_node()
 * with the exception of kunit tests
 */

/*
 * In each prototype below the __alloc_size() index names the position of the
 * size parameter, and the __assume_*_alignment annotation states the
 * alignment assumed for the returned pointer.
 */

/* Generic path used when the size is not a compile-time constant (or is 0). */
void *__kmalloc_noprof(size_t size, gfp_t flags)
                                __assume_kmalloc_alignment __alloc_size(1);

/* The size stays the first parameter whether or not the bucket parameter exists. */
void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
                                __assume_kmalloc_alignment __alloc_size(1);

/* Allocate from an already selected kmalloc cache @s. */
void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t flags, size_t size)
                                __assume_kmalloc_alignment __alloc_size(3);

void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
                                  int node, size_t size)
                                __assume_kmalloc_alignment __alloc_size(4);

/* Used by the inline wrappers for constant sizes above KMALLOC_MAX_CACHE_SIZE. */
void *__kmalloc_large_noprof(size_t size, gfp_t flags)
                                __assume_page_alignment __alloc_size(1);

void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
                                __assume_page_alignment __alloc_size(1);

/**
 * kmalloc - allocate kernel memory
 * @size: how many bytes of memory are required.
 * @flags: describe the allocation context
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
 * bytes. For @size of power of two bytes, the alignment is also guaranteed
 * to be at least to the size. For other sizes, the alignment is guaranteed to
 * be at least the largest power-of-two divisor of @size.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp_types.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *        Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *        Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *        Allocation will not sleep.  May use emergency pools.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_ZERO
 *        Zero the allocated memory before returning. Also see kzalloc().
 *
 * %__GFP_HIGH
 *        This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *        Indicate that this allocation is in no way allowed to fail
 *        (think twice before using).
 *
 * %__GFP_NORETRY
 *        If memory is not immediately available,
 *        then give up at once.
 *
 * %__GFP_NOWARN
 *        If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *        Try really hard to succeed the allocation but fail
 *        eventually.
 */
static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags)
{
        /*
         * For a non-zero compile-time constant size the target can be picked
         * at build time: either the large-allocation path or one specific
         * kmalloc cache.
         */
        if (__builtin_constant_p(size) && size) {
                struct kmem_cache *cache;

                if (size > KMALLOC_MAX_CACHE_SIZE)
                        return __kmalloc_large_noprof(size, flags);

                cache = kmalloc_caches[kmalloc_type(flags, _RET_IP_)][kmalloc_index(size)];
                return __kmalloc_cache_noprof(cache, flags, size);
        }

        /* Everything else goes through the generic out-of-line path. */
        return __kmalloc_noprof(size, flags);
}
#define kmalloc(...)                                alloc_hooks(kmalloc_noprof(__VA_ARGS__))

/* NOTE(review): "nolock" semantics are not visible in this header - confirm against the implementation. */
void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node);
#define kmalloc_nolock(...)                        alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__))

/* Allocate @_size bytes from the bucket set @_b, with no node preference. */
#define kmem_buckets_alloc(_b, _size, _flags)        \
        alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))

/* Same, but passes _RET_IP_ as the caller address. */
#define kmem_buckets_alloc_track_caller(_b, _size, _flags)        \
        alloc_hooks(__kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE, _RET_IP_))

/*
 * Node-aware counterpart of kmalloc_noprof(): same size-based dispatch, with
 * @node handed through to whichever backend is selected.
 */
static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
        /* Non-zero constant sizes are resolved to a backend at build time. */
        if (__builtin_constant_p(size) && size) {
                struct kmem_cache *cache;

                if (size > KMALLOC_MAX_CACHE_SIZE)
                        return __kmalloc_large_node_noprof(size, flags, node);

                cache = kmalloc_caches[kmalloc_type(flags, _RET_IP_)][kmalloc_index(size)];
                return __kmalloc_cache_node_noprof(cache, flags, node, size);
        }

        /* Generic path; no explicit bucket set is supplied. */
        return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node);
}
#define kmalloc_node(...)                        alloc_hooks(kmalloc_node_noprof(__VA_ARGS__))

/**
 * kmalloc_array - allocate memory for an array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: %NULL if @n * @size overflows, otherwise whatever kmalloc() returns
 * for the total byte count.
 */
static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        /* Only allocate when the element count times size fits in size_t. */
        if (likely(!check_mul_overflow(n, size, &total)))
                return kmalloc_noprof(total, flags);

        return NULL;
}
#define kmalloc_array(...)                        alloc_hooks(kmalloc_array_noprof(__VA_ARGS__))

/**
 * krealloc_array - reallocate memory for an array.
 * @p: pointer to the memory chunk to reallocate
 * @new_n: new number of elements to alloc
 * @new_size: new size of a single member of the array
 * @flags: the type of memory to allocate (see kmalloc)
 *
 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
 * initial memory allocation, every subsequent call to this API for the same
 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
 * __GFP_ZERO is not fully honored by this API.
 *
 * See krealloc_noprof() for further details.
 *
 * In any case, the contents of the object pointed to are preserved up to the
 * lesser of the new and old sizes.
 */
static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p,
                                                                       size_t new_n,
                                                                       size_t new_size,
                                                                       gfp_t flags)
{
        size_t total;

        /* Only reallocate when the new element count times size fits in size_t. */
        if (likely(!check_mul_overflow(new_n, new_size, &total)))
                return krealloc_noprof(p, total, flags);

        return NULL;
}
#define krealloc_array(...)                        alloc_hooks(krealloc_array_noprof(__VA_ARGS__))

/**
 * kcalloc - allocate memory for an array. The memory is set to zero.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Implemented as kmalloc_array() with %__GFP_ZERO added to @flags.
 */
#define kcalloc(n, size, flags)                kmalloc_array(n, size, (flags) | __GFP_ZERO)

/* Out-of-line allocator taking an explicit @caller address. */
void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node,
                                         unsigned long caller) __alloc_size(1);
/* Same, without an explicit bucket set. */
#define kmalloc_node_track_caller_noprof(size, flags, node, caller) \
        __kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node, caller)
/* alloc_hooks() wrapper that supplies _RET_IP_ as the caller. */
#define kmalloc_node_track_caller(...)                \
        alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_))

/*
 * kmalloc_track_caller is a special version of kmalloc that records the
 * calling function of the routine calling it for slab leak tracking instead
 * of just the calling function (confusing, eh?).
 * It's useful when the call to kmalloc comes from a widely-used standard
 * allocator where we care about the real place the memory allocation
 * request comes from.
 */
#define kmalloc_track_caller(...)                kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE)

/* Variant without alloc_hooks() wrapping; NUMA_NO_NODE and _RET_IP_ are filled in. */
#define kmalloc_track_caller_noprof(...)        \
                kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_)

static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags,
                                                          int node)
{
        size_t bytes;

        if (unlikely(check_mul_overflow(n, size, &bytes)))
                return NULL;
        if (__builtin_constant_p(n) && __builtin_constant_p(size))
                return kmalloc_node_noprof(bytes, flags, node);
        return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(bytes, NULL), flags, node);
}
#define kmalloc_array_node(...)                        alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__))

/* Zeroing array allocation on a node: kmalloc_array_node() with __GFP_ZERO added to @_flags. */
#define kcalloc_node(_n, _size, _flags, _node)        \
        kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node)

/*
 * Shortcuts
 *
 * kmem_cache_zalloc() is kmem_cache_alloc() with __GFP_ZERO OR-ed into
 * the caller's flags.
 */
#define kmem_cache_zalloc(_k, _flags)                kmem_cache_alloc(_k, (_flags)|__GFP_ZERO)

/**
 * kzalloc - allocate zero-filled memory.
 * @size: number of bytes requested.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Equivalent to kmalloc() with __GFP_ZERO added to @flags.
 */
static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
{
        const gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmalloc_noprof(size, zeroing_flags);
}
#define kzalloc(...)                                alloc_hooks(kzalloc_noprof(__VA_ARGS__))
#define kzalloc_node(_size, _flags, _node)        kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)

/*
 * Core kvmalloc entry point. The leading parameter(s) are generated by
 * DECL_BUCKET_PARAMS(); @align, @flags and @node follow.
 */
void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
                             gfp_t flags, int node) __alloc_size(1);
/* Unhooked wrapper passing a NULL bucket. */
#define kvmalloc_node_align_noprof(_size, _align, _flags, _node)        \
        __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node)
/* Hooked wrapper around kvmalloc_node_align_noprof(). */
#define kvmalloc_node_align(...)                \
        alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__))
/* The helpers below pass an alignment argument of 1. */
#define kvmalloc_node(_s, _f, _n)                kvmalloc_node_align(_s, 1, _f, _n)
#define kvmalloc(...)                                kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE)
/* Zeroing variants: OR __GFP_ZERO into the caller's flags. */
#define kvzalloc(_size, _flags)                        kvmalloc(_size, (_flags)|__GFP_ZERO)

#define kvzalloc_node(_size, _flags, _node)        kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node)

/* Allocation with an explicit bucket argument @_b, alignment 1, NUMA_NO_NODE. */
#define kmem_buckets_valloc(_b, _size, _flags)        \
        alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE))

/*
 * Array flavour of kvmalloc on a given node: allocates n * size bytes with
 * an alignment argument of 1, or returns NULL if the product overflows
 * size_t.
 */
static inline __alloc_size(1, 2) void *
kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
{
        size_t total;

        if (likely(!check_mul_overflow(n, size, &total)))
                return kvmalloc_node_align_noprof(total, 1, flags, node);

        return NULL;
}

/* Unhooked array helpers built on kvmalloc_array_node_noprof(). */
#define kvmalloc_array_noprof(...)                kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE)
#define kvcalloc_node_noprof(_n,_s,_f,_node)        kvmalloc_array_node_noprof(_n,_s,(_f)|__GFP_ZERO,_node)
#define kvcalloc_noprof(...)                        kvcalloc_node_noprof(__VA_ARGS__, NUMA_NO_NODE)

/* Hooked counterparts of the helpers above. */
#define kvmalloc_array(...)                        alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__))
#define kvcalloc_node(...)                        alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
#define kvcalloc(...)                                alloc_hooks(kvcalloc_noprof(__VA_ARGS__))

/*
 * Reallocation counterpart of the kvmalloc family; the new size is the
 * second argument (see __realloc_size(2)).
 */
void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
                                  gfp_t flags, int nid) __realloc_size(2);
/* Hooked wrapper. */
#define kvrealloc_node_align(...)                \
        alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__))
/* Convenience forms: alignment argument 1, and NUMA_NO_NODE for kvrealloc(). */
#define kvrealloc_node(_p, _s, _f, _n)                kvrealloc_node_align(_p, _s, 1, _f, _n)
#define kvrealloc(...)                                kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE)

/* Release counterpart for the kvmalloc family (defined elsewhere). */
extern void kvfree(const void *addr);
/* Scope-based cleanup: calls kvfree() unless the pointer is NULL or an error pointer. */
DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T))

/* Variant of kvfree() that also takes the buffer length @len (defined elsewhere). */
extern void kvfree_sensitive(const void *addr, size_t len);

/* Size query for a kmem_cache (defined elsewhere). */
unsigned int kmem_cache_size(struct kmem_cache *s);

/*
 * Without CONFIG_KVFREE_RCU_BATCHED, kvfree_rcu_barrier() is simply
 * rcu_barrier() and kfree_rcu_scheduler_running() is a no-op. With it,
 * both are out-of-line functions defined elsewhere.
 */
#ifndef CONFIG_KVFREE_RCU_BATCHED
static inline void kvfree_rcu_barrier(void)
{
        rcu_barrier();
}

static inline void kfree_rcu_scheduler_running(void) { }
#else
void kvfree_rcu_barrier(void);

void kfree_rcu_scheduler_running(void);
#endif

/**
 * kmalloc_size_roundup - Report allocation bucket size for the given size
 *
 * @size: Number of bytes to round up from.
 *
 * This returns the number of bytes that would be available in a kmalloc()
 * allocation of @size bytes. For example, a 126 byte request would be
 * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly
 * for the general-purpose kmalloc()-based allocations, and is not for the
 * pre-sized kmem_cache_alloc()-based allocations.)
 *
 * Use this to kmalloc() the full bucket size ahead of time instead of using
 * ksize() to query the size after an allocation.
 *
 * Return: the usable size, in bytes, of a kmalloc() allocation of @size bytes.
 */
size_t kmalloc_size_roundup(size_t size);

/* Init-section (__init) setup routines; defined elsewhere. */
void __init kmem_cache_init_late(void);
void __init kvfree_rcu_init(void);

#endif        /* _LINUX_SLAB_H */







































  317 
  277 




  316 






    6 



    6 
  275 














  251 








































  317 













  317 
  315 









  314 





  317 

  317 











  319 























  318 






  319 


  319 

  318 

  303 



  318 















  315 

  316 



  319 


  313 
























  319 


  319 
  316 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
 * which are designed to protect kernel memory from needless exposure
 * and overwrite under many unintended conditions. This code is based
 * on PAX_USERCOPY, which is:
 *
 * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
 * Security Inc.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/ucopysize.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>
#include "slab.h"

/*
 * Checks if a given pointer and length is contained by the current
 * stack frame (if possible).
 *
 * @obj: start address of the object being copied.
 * @len: length of the object in bytes.
 *
 * The stack is taken to be the THREAD_SIZE bytes starting at
 * task_stack_page(current).
 *
 * Returns:
 *        NOT_STACK: not at all on the stack
 *        GOOD_FRAME: fully within a valid stack frame
 *        GOOD_STACK: within the current stack (when can't frame-check exactly)
 *        BAD_STACK: error condition (invalid stack position or bad stack frame)
 */
static noinline int check_stack_object(const void *obj, unsigned long len)
{
        const void * const stack = task_stack_page(current);
        const void * const stackend = stack + THREAD_SIZE;
        int ret;

        /* Object is not on the stack at all. */
        if (obj + len <= stack || stackend <= obj)
                return NOT_STACK;

        /*
         * Reject: object partially overlaps the stack (passing the
         * check above means at least one end is within the stack,
         * so if this check fails, the other end is outside the stack).
         */
        if (obj < stack || stackend < obj + len)
                return BAD_STACK;

        /*
         * Check if object is safely within a valid frame. Any non-zero
         * verdict from the arch helper is returned as-is; zero falls
         * through to the generic stack-depth check below.
         */
        ret = arch_within_stack_frames(stack, stackend, obj, len);
        if (ret)
                return ret;

        /* Finally, check stack depth if possible. */
#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
        if (IS_ENABLED(CONFIG_STACK_GROWSUP)) {
                /* Upward-growing stack: the object must end at or below the stack pointer. */
                if ((void *)current_stack_pointer < obj + len)
                        return BAD_STACK;
        } else {
                /* Downward-growing stack: the object must start at or above the stack pointer. */
                if (obj < (void *)current_stack_pointer)
                        return BAD_STACK;
        }
#endif

        return GOOD_STACK;
}

/*
 * Reaching this function means CONFIG_HARDENED_USERCOPY detected an
 * unexpected state during a copy_from_user() or copy_to_user() call.
 * __check_object_size() performs several checks on the buffer. Normal
 * stack buffer usage should never trip them, while kernel text addressing
 * always will. For cache objects, only the whitelisted byte range of the
 * cache (its usersize and useroffset fields) may be accessed. To adjust a
 * cache whitelist, create the cache with the usercopy-aware
 * kmem_cache_create_usercopy() function (and carefully audit the
 * whitelist range).
 *
 * @name:    kind of memory involved; NULL is reported as "unknown?!".
 * @detail:  optional extra identifier, printed in single quotes when set.
 * @to_user: true for an exposure (copy to user), false for an overwrite.
 * @offset:  offset reported in the message.
 * @len:     size reported in the message.
 *
 * Logs at emergency level and never returns.
 */
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len)
{
        const char *kind = to_user ? "exposure" : "overwrite";
        const char *direction = to_user ? "from" : "to";
        const char *what = name ? name : "unknown?!";
        const char *quote_open = detail ? " '" : "";
        const char *quoted = detail ? detail : "";
        const char *quote_close = detail ? "'" : "";

        pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
                 kind, direction, what,
                 quote_open, quoted, quote_close,
                 offset, len);

        /*
         * do_group_exit() would arguably have greater effect, but BUG()
         * hooks all the lock-breaking and per-arch Oops code, so it is
         * used here instead.
         */
        BUG();
}

/*
 * Returns true if any portion of [ptr,ptr+n) overlaps with [low,high).
 *
 * Two half-open ranges intersect exactly when each one starts before the
 * other one ends.
 */
static bool overlaps(const unsigned long ptr, unsigned long n,
                     unsigned long low, unsigned long high)
{
        const unsigned long range_end = ptr + n;

        return ptr < high && range_end > low;
}

/*
 * Abort the copy if [ptr, ptr + n) touches kernel text, either through the
 * [_stext, _etext) mapping or through its lm_alias() mapping when that one
 * differs.
 */
static inline void check_kernel_text_object(const unsigned long ptr,
                                            unsigned long n, bool to_user)
{
        unsigned long text_start = (unsigned long)_stext;
        unsigned long text_end = (unsigned long)_etext;
        unsigned long alias_start, alias_end;

        if (overlaps(ptr, n, text_start, text_end))
                usercopy_abort("kernel text", NULL, to_user, ptr - text_start, n);

        /*
         * Some architectures map the kernel text a second time, so more
         * than one virtual kernel address points at the kernel image.
         * This is usually the case when there is a separate linear
         * physical memory mapping, in that __pa() is not just the reverse
         * of __va(). Detect that situation via lm_alias().
         */
        alias_start = (unsigned long)lm_alias(text_start);

        /* Only a distinct secondary mapping needs a second check. */
        if (alias_start != text_start) {
                alias_end = (unsigned long)lm_alias(text_end);
                if (overlaps(ptr, n, alias_start, alias_end))
                        usercopy_abort("linear kernel text", NULL, to_user,
                                       ptr - alias_start, n);
        }
}

/*
 * Abort the copy for addresses that can never be valid: a range whose last
 * byte wraps around the address space, or a NULL / zero-size-allocation
 * pointer.
 */
static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
                                       bool to_user)
{
        const unsigned long last_byte = ptr + (n - 1);

        /* The last byte landing below the first means the range wrapped. */
        if (last_byte < ptr)
                usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);

        /* NULL or the ZERO-allocation pointer is never a valid buffer. */
        if (ZERO_OR_NULL_PTR(ptr))
                usercopy_abort("null address", NULL, to_user, ptr, n);
}

/*
 * Validate a copy of @n bytes at @ptr against the object that contains it.
 * The address is classified in order as kmap, vmalloc, or a linearly mapped
 * address backed by a folio; the first matching class decides. A copy that
 * extends past the containing page/area/folio is aborted through
 * usercopy_abort(). Addresses matching none of the classes are accepted
 * without further checks here.
 */
static inline void check_heap_object(const void *ptr, unsigned long n,
                                     bool to_user)
{
        unsigned long addr = (unsigned long)ptr;
        unsigned long offset;
        struct folio *folio;

        /* kmap address: the copy must stay within the page containing @ptr. */
        if (is_kmap_addr(ptr)) {
                offset = offset_in_page(ptr);
                if (n > PAGE_SIZE - offset)
                        usercopy_abort("kmap", NULL, to_user, offset, n);
                return;
        }

        /*
         * vmalloc address: the copy must end within the vmap area found
         * for @addr. The lookup is skipped when page faults are disabled
         * (NOTE(review): presumably because find_vmap_area() is not safe
         * in that context - confirm).
         */
        if (is_vmalloc_addr(ptr) && !pagefault_disabled()) {
                struct vmap_area *area = find_vmap_area(addr);

                if (!area)
                        usercopy_abort("vmalloc", "no area", to_user, 0, n);

                if (n > area->va_end - addr) {
                        offset = addr - area->va_start;
                        usercopy_abort("vmalloc", NULL, to_user, offset, n);
                }
                return;
        }

        /* Nothing more can be checked for addresses virt_addr_valid() rejects. */
        if (!virt_addr_valid(ptr))
                return;

        folio = virt_to_folio(ptr);

        if (folio_test_slab(folio)) {
                /* Check slab allocator for flags and size. */
                __check_heap_object(ptr, n, folio_slab(folio), to_user);
        } else if (folio_test_large(folio)) {
                /* Large folio: the copy must not run past the end of the folio. */
                offset = ptr - folio_address(folio);
                if (n > folio_size(folio) - offset)
                        usercopy_abort("page alloc", NULL, to_user, offset, n);
        }
}

/*
 * Static key whose default follows CONFIG_HARDENED_USERCOPY_DEFAULT_ON.
 * It is switched on or off once by set_hardened_usercopy() in this file.
 */
DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
                           validate_usercopy_range);
EXPORT_SYMBOL(validate_usercopy_range);

/*
 * Validates that the given object is:
 * - not bogus address
 * - fully contained by stack (or stack frame, when available)
 * - fully within SLAB object (or object whitelist area, when available)
 * - not in kernel text
 *
 * @ptr:     kernel address of the object being copied.
 * @n:       number of bytes being copied; zero skips every check.
 * @to_user: true when copying to user space, false when copying from it.
 *
 * Returns normally when the object passes; a failing check ends in
 * usercopy_abort(), which does not return.
 */
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
        /* Skip all tests if size is zero. */
        if (!n)
                return;

        /* Check for invalid addresses. */
        check_bogus_address((const unsigned long)ptr, n, to_user);

        /* Check for bad stack object. */
        switch (check_stack_object(ptr, n)) {
        case NOT_STACK:
                /* Object is not touching the current process stack. */
                break;
        case GOOD_FRAME:
        case GOOD_STACK:
                /*
                 * Object is either in the correct frame (when it
                 * is possible to check) or just generally on the
                 * process stack (when frame checking not available).
                 */
                return;
        default:
                /*
                 * Any other verdict is fatal. The reported offset is the
                 * distance between @ptr and the current stack pointer when
                 * the architecture exposes it, otherwise 0.
                 */
                usercopy_abort("process stack", NULL, to_user,
#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
                        IS_ENABLED(CONFIG_STACK_GROWSUP) ?
                                ptr - (void *)current_stack_pointer :
                                (void *)current_stack_pointer - ptr,
#else
                        0,
#endif
                        n);
        }

        /* Check for bad heap object. */
        check_heap_object(ptr, n, to_user);

        /* Check for object in kernel to avoid text exposure. */
        check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);

/*
 * Boot-time switch for the usercopy checks. It starts at the Kconfig
 * default, can be overridden by the "hardened_usercopy=" boot parameter,
 * and is init-only data (__initdata).
 */
static bool enable_checks __initdata =
                IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON);

/*
 * Handler for the "hardened_usercopy=" boot parameter. Parses @str as a
 * boolean into enable_checks and warns when the value cannot be parsed.
 * Always returns 1.
 */
static int __init parse_hardened_usercopy(char *str)
{
        int parse_err = kstrtobool(str, &enable_checks);

        if (parse_err != 0) {
                pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
                        str);
        }

        return 1;
}

__setup("hardened_usercopy=", parse_hardened_usercopy);

/*
 * Late initcall that applies the final value of enable_checks to the
 * validate_usercopy_range static key. Always returns 1.
 */
static int __init set_hardened_usercopy(void)
{
        if (!enable_checks) {
                static_branch_disable(&validate_usercopy_range);
        } else {
                static_branch_enable(&validate_usercopy_range);
        }

        return 1;
}

late_initcall(set_hardened_usercopy);


















































    2 


    2 
    2 



    2 






    2 














    2 

    2 

















































































































































































































    4 








    4 







    4 














    4 






















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * authencesn.c - AEAD wrapper for IPsec with extended sequence numbers,
 *                 derived from authenc.c
 *
 * Copyright (C) 2010 secunet Security Networks AG
 * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com>
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/aead.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <crypto/authenc.h>
#include <crypto/scatterwalk.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/*
 * Per-instance context of the authencesn template: the spawns for the
 * underlying hash and skcipher algorithms.
 */
struct authenc_esn_instance_ctx {
        struct crypto_ahash_spawn auth;         /* hash algorithm spawn */
        struct crypto_skcipher_spawn enc;       /* cipher algorithm spawn */
};

/* Per-transform context. */
struct crypto_authenc_esn_ctx {
        /*
         * Byte offset of the sub-request inside the request context tail;
         * set to twice the hash digest size when the transform is set up.
         */
        unsigned int reqoff;
        struct crypto_ahash *auth;      /* hash transform used for the ICV */
        struct crypto_skcipher *enc;    /* cipher transform used for the payload */
};

/*
 * Per-request context.
 *
 * @src and @dst are two-entry scatterlists used with scatterwalk_ffwd() to
 * build fast-forwarded views of the request's scatterlists. @tail holds the
 * computed digest at offset 0, the received ICV one digest size later (on
 * decryption), and the hash or cipher sub-request at ctx->reqoff.
 */
struct authenc_esn_request_ctx {
        struct scatterlist src[2];
        struct scatterlist dst[2];
        char tail[];
};

/*
 * Complete the AEAD request with @err, unless @err is -EINPROGRESS, in
 * which case nothing is signalled here.
 */
static void authenc_esn_request_complete(struct aead_request *req, int err)
{
        if (err == -EINPROGRESS)
                return;

        aead_request_complete(req, err);
}

/*
 * Validate a requested ICV length.
 *
 * A length of zero is accepted, as is any length of four bytes or more;
 * lengths of one to three bytes are rejected.
 *
 * Returns 0 when @authsize is acceptable, -EINVAL otherwise.
 */
static int crypto_authenc_esn_setauthsize(struct crypto_aead *authenc_esn,
                                          unsigned int authsize)
{
        switch (authsize) {
        case 1:
        case 2:
        case 3:
                return -EINVAL;
        default:
                return 0;
        }
}

/*
 * Split the combined authenc key blob and program both child transforms.
 *
 * The request flags of the AEAD transform are propagated to each child
 * before its key is set. The hash key is set first; the cipher key is set
 * only if that succeeded. The extracted key material is wiped from the
 * stack on every exit path.
 *
 * Returns 0 on success, -EINVAL when the blob cannot be parsed, or the
 * error reported by the failing child setkey.
 */
static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, const u8 *key,
                                     unsigned int keylen)
{
        struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn);
        struct crypto_ahash *hash_tfm = ctx->auth;
        struct crypto_skcipher *cipher_tfm = ctx->enc;
        struct crypto_authenc_keys keys;
        int err;

        if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) {
                err = -EINVAL;
                goto wipe;
        }

        crypto_ahash_clear_flags(hash_tfm, CRYPTO_TFM_REQ_MASK);
        crypto_ahash_set_flags(hash_tfm, crypto_aead_get_flags(authenc_esn) &
                                         CRYPTO_TFM_REQ_MASK);
        err = crypto_ahash_setkey(hash_tfm, keys.authkey, keys.authkeylen);
        if (!err) {
                crypto_skcipher_clear_flags(cipher_tfm, CRYPTO_TFM_REQ_MASK);
                crypto_skcipher_set_flags(cipher_tfm,
                                          crypto_aead_get_flags(authenc_esn) &
                                          CRYPTO_TFM_REQ_MASK);
                err = crypto_skcipher_setkey(cipher_tfm, keys.enckey,
                                             keys.enckeylen);
        }

wipe:
        /* Never leave key pointers/lengths behind on the stack. */
        memzero_explicit(&keys, sizeof(keys));
        return err;
}

/*
 * Second half of ICV generation, run once the digest is available at the
 * start of the request context tail: restore the header layout in req->dst
 * and write the ICV after the ciphertext.
 *
 * @flags is not used by this function. Always returns 0.
 *
 * NOTE(review): the last argument of scatterwalk_map_and_copy() is taken
 * to mean 0 = copy from the scatterlist into the buffer, 1 = copy from the
 * buffer into the scatterlist - confirm against the scatterwalk API.
 */
static int crypto_authenc_esn_genicv_tail(struct aead_request *req,
                                          unsigned int flags)
{
        struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req);
        struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req);
        u8 *hash = areq_ctx->tail;
        unsigned int authsize = crypto_aead_authsize(authenc_esn);
        unsigned int assoclen = req->assoclen;
        unsigned int cryptlen = req->cryptlen;
        struct scatterlist *dst = req->dst;
        u32 tmp[2];

        /* Move high-order bits of sequence number back. */
        /* tmp[0] takes the 4 bytes at offset 4 ... */
        scatterwalk_map_and_copy(tmp, dst, 4, 4, 0);
        /* ... tmp[1] takes the 4 bytes parked right after the ciphertext ... */
        scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0);
        /* ... and both words go back to the first 8 bytes of dst. */
        scatterwalk_map_and_copy(tmp, dst, 0, 8, 1);

        /* Append authsize bytes of the digest as the ICV. */
        scatterwalk_map_and_copy(hash, dst, assoclen + cryptlen, authsize, 1);
        return 0;
}

/*
 * Completion callback for the ICV hash started during encryption. On
 * success the ICV tail step runs; the AEAD request is then completed with
 * the resulting status.
 */
static void authenc_esn_geniv_ahash_done(void *data, int err)
{
        struct aead_request *req = data;

        if (!err)
                err = crypto_authenc_esn_genicv_tail(req, 0);

        aead_request_complete(req, err);
}

/*
 * Compute the ICV over the encrypted request in req->dst.
 *
 * Nothing is done when the authentication tag size is zero. Otherwise the
 * first 8 bytes of dst are rearranged so that the word at offset 0 is
 * copied to offset 4 and the word originally at offset 4 is parked right
 * after the ciphertext. The digest is then computed over
 * assoclen + cryptlen bytes starting at offset 4, and
 * crypto_authenc_esn_genicv_tail() undoes the rearrangement and stores the
 * ICV.
 *
 * @flags: request flags handed to the hash sub-request callback setup.
 *
 * Returns 0 on success or the status of crypto_ahash_digest() when it is
 * non-zero; in that case the tail step is not run from here.
 */
static int crypto_authenc_esn_genicv(struct aead_request *req,
                                     unsigned int flags)
{
        struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req);
        struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req);
        struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn);
        struct crypto_ahash *auth = ctx->auth;
        u8 *hash = areq_ctx->tail;
        /* The hash sub-request lives in the tail, after the two digest slots. */
        struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff);
        unsigned int authsize = crypto_aead_authsize(authenc_esn);
        unsigned int assoclen = req->assoclen;
        unsigned int cryptlen = req->cryptlen;
        struct scatterlist *dst = req->dst;
        u32 tmp[2];

        if (!authsize)
                return 0;

        /* Move high-order bits of sequence number to the end. */
        scatterwalk_map_and_copy(tmp, dst, 0, 8, 0);
        scatterwalk_map_and_copy(tmp, dst, 4, 4, 1);
        scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1);

        /* Hash input starts 4 bytes into dst. */
        sg_init_table(areq_ctx->dst, 2);
        dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4);

        ahash_request_set_tfm(ahreq, auth);
        ahash_request_set_crypt(ahreq, dst, hash, assoclen + cryptlen);
        ahash_request_set_callback(ahreq, flags,
                                   authenc_esn_geniv_ahash_done, req);

        /* A zero digest status runs the tail step right away. */
        return crypto_ahash_digest(ahreq) ?:
               crypto_authenc_esn_genicv_tail(req, aead_request_flags(req));
}


/*
 * Completion callback for the cipher step of encryption. A successful
 * cipher result is followed by ICV generation; the outcome is then passed
 * to authenc_esn_request_complete().
 */
static void crypto_authenc_esn_encrypt_done(void *data, int err)
{
        struct aead_request *areq = data;
        int status = err ? err : crypto_authenc_esn_genicv(areq, 0);

        authenc_esn_request_complete(areq, status);
}

/*
 * AEAD encrypt entry point.
 *
 * Encrypts req->cryptlen bytes that follow the associated data and then
 * generates the ICV via crypto_authenc_esn_genicv(). When src and dst
 * differ, the associated data is copied to dst first so that ICV
 * generation can work on dst alone.
 *
 * Returns 0 on success, -EINVAL when the associated data is shorter than
 * 8 bytes, or the status of the cipher / ICV step otherwise.
 */
static int crypto_authenc_esn_encrypt(struct aead_request *req)
{
        struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req);
        struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req);
        struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn);
        /* The cipher sub-request lives in the tail, after the digest slots. */
        struct skcipher_request *skreq = (void *)(areq_ctx->tail +
                                                  ctx->reqoff);
        struct crypto_skcipher *enc = ctx->enc;
        unsigned int assoclen = req->assoclen;
        unsigned int cryptlen = req->cryptlen;
        struct scatterlist *src, *dst;
        int err;

        /*
         * ICV generation moves the two 32-bit words at the start of the
         * associated data and hashes from offset 4. With less than 8 bytes
         * of associated data those accesses would reach into the payload
         * (or past the buffer), so reject such requests up front.
         */
        if (assoclen < 8)
                return -EINVAL;

        sg_init_table(areq_ctx->src, 2);
        src = scatterwalk_ffwd(areq_ctx->src, req->src, assoclen);
        dst = src;

        if (req->src != req->dst) {
                memcpy_sglist(req->dst, req->src, assoclen);
                sg_init_table(areq_ctx->dst, 2);
                dst = scatterwalk_ffwd(areq_ctx->dst, req->dst, assoclen);
        }

        skcipher_request_set_tfm(skreq, enc);
        skcipher_request_set_callback(skreq, aead_request_flags(req),
                                      crypto_authenc_esn_encrypt_done, req);
        skcipher_request_set_crypt(skreq, src, dst, cryptlen, req->iv);

        err = crypto_skcipher_encrypt(skreq);
        if (err)
                return err;

        return crypto_authenc_esn_genicv(req, aead_request_flags(req));
}

/*
 * Second half of decryption, run once the digest over the received data is
 * available at the start of the request context tail.
 *
 * With a non-zero tag size, the header rearrangement in req->dst is undone
 * and the computed digest (ohash) is compared against the received ICV
 * (ihash, stored one digest size further into the tail) with
 * crypto_memneq(). The payload is decrypted in place in dst only if they
 * match.
 *
 * @flags: request flags for the cipher sub-request, which completes
 *         straight into the original request's completion callback.
 *
 * Returns -EBADMSG on ICV mismatch, otherwise the status of
 * crypto_skcipher_decrypt().
 */
static int crypto_authenc_esn_decrypt_tail(struct aead_request *req,
                                           unsigned int flags)
{
        struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req);
        unsigned int authsize = crypto_aead_authsize(authenc_esn);
        struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req);
        struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn);
        struct skcipher_request *skreq = (void *)(areq_ctx->tail +
                                                  ctx->reqoff);
        struct crypto_ahash *auth = ctx->auth;
        u8 *ohash = areq_ctx->tail;
        /* req->cryptlen includes the ICV; only the payload is decrypted. */
        unsigned int cryptlen = req->cryptlen - authsize;
        unsigned int assoclen = req->assoclen;
        struct scatterlist *dst = req->dst;
        u8 *ihash = ohash + crypto_ahash_digestsize(auth);
        u32 tmp[2];

        /* No tag: nothing was rearranged and nothing needs verifying. */
        if (!authsize)
                goto decrypt;

        /* Move high-order bits of sequence number back. */
        scatterwalk_map_and_copy(tmp, dst, 4, 4, 0);
        scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0);
        scatterwalk_map_and_copy(tmp, dst, 0, 8, 1);

        if (crypto_memneq(ihash, ohash, authsize))
                return -EBADMSG;

decrypt:

        /* Skip the associated data; decrypt the payload in place. */
        sg_init_table(areq_ctx->dst, 2);
        dst = scatterwalk_ffwd(areq_ctx->dst, dst, assoclen);

        skcipher_request_set_tfm(skreq, ctx->enc);
        skcipher_request_set_callback(skreq, flags,
                                      req->base.complete, req->base.data);
        skcipher_request_set_crypt(skreq, dst, dst, cryptlen, req->iv);

        return crypto_skcipher_decrypt(skreq);
}

/*
 * Completion callback for the verification hash started during decryption.
 * On success the decrypt tail step runs; the resulting status is passed to
 * authenc_esn_request_complete().
 */
static void authenc_esn_verify_ahash_done(void *data, int err)
{
        struct aead_request *req = data;

        if (!err)
                err = crypto_authenc_esn_decrypt_tail(req, 0);

        authenc_esn_request_complete(req, err);
}

/*
 * AEAD decrypt entry point.
 *
 * Copies associated data and ciphertext to dst when src and dst differ,
 * and saves the received ICV into the request context. With a non-zero
 * tag size it rearranges the start of the associated data, computes the
 * digest over dst from offset 4, and hands over to
 * crypto_authenc_esn_decrypt_tail() for verification and decryption. With
 * a zero tag size it goes straight to the tail step.
 *
 * Returns -EINVAL when the associated data is shorter than 8 bytes, the
 * non-zero status of the hash step, or the status of the tail step.
 */
static int crypto_authenc_esn_decrypt(struct aead_request *req)
{
        struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req);
        struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req);
        struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn);
        /* The hash sub-request lives in the tail, after the two digest slots. */
        struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff);
        unsigned int authsize = crypto_aead_authsize(authenc_esn);
        struct crypto_ahash *auth = ctx->auth;
        u8 *ohash = areq_ctx->tail;
        unsigned int assoclen = req->assoclen;
        unsigned int cryptlen = req->cryptlen;
        u8 *ihash = ohash + crypto_ahash_digestsize(auth);
        struct scatterlist *dst = req->dst;
        u32 tmp[2];
        int err;

        /*
         * Verification moves the two 32-bit words at the start of the
         * associated data and hashes from offset 4. With less than 8 bytes
         * of associated data those accesses would reach into the payload
         * (or past the buffer), so reject such requests up front.
         */
        if (assoclen < 8)
                return -EINVAL;

        /* req->cryptlen includes the ICV; strip it to get the payload length. */
        cryptlen -= authsize;

        if (req->src != dst)
                memcpy_sglist(dst, req->src, assoclen + cryptlen);

        /* Save the received ICV before dst is modified. */
        scatterwalk_map_and_copy(ihash, req->src, assoclen + cryptlen,
                                 authsize, 0);

        if (!authsize)
                goto tail;

        /* Move high-order bits of sequence number to the end. */
        scatterwalk_map_and_copy(tmp, dst, 0, 8, 0);
        scatterwalk_map_and_copy(tmp, dst, 4, 4, 1);
        scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1);

        /* Hash input starts 4 bytes into dst. */
        sg_init_table(areq_ctx->dst, 2);
        dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4);

        ahash_request_set_tfm(ahreq, auth);
        ahash_request_set_crypt(ahreq, dst, ohash, assoclen + cryptlen);
        ahash_request_set_callback(ahreq, aead_request_flags(req),
                                   authenc_esn_verify_ahash_done, req);

        err = crypto_ahash_digest(ahreq);
        if (err)
                return err;

tail:
        return crypto_authenc_esn_decrypt_tail(req, aead_request_flags(req));
}

/*
 * Transform constructor: instantiate the hash and cipher children from the
 * instance spawns and size the per-request context.
 *
 * The request context reserves two digest-sized slots in its tail
 * (ctx->reqoff), followed by room for whichever sub-request is larger:
 * the hash request or the cipher request.
 *
 * Returns 0 on success or the error from spawning a child transform. The
 * hash transform is released again if the cipher cannot be created.
 */
static int crypto_authenc_esn_init_tfm(struct crypto_aead *tfm)
{
        struct aead_instance *inst = aead_alg_instance(tfm);
        struct authenc_esn_instance_ctx *ictx = aead_instance_ctx(inst);
        struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm);
        struct crypto_ahash *hash_tfm;
        struct crypto_skcipher *cipher_tfm;

        hash_tfm = crypto_spawn_ahash(&ictx->auth);
        if (IS_ERR(hash_tfm))
                return PTR_ERR(hash_tfm);

        cipher_tfm = crypto_spawn_skcipher(&ictx->enc);
        if (IS_ERR(cipher_tfm)) {
                int err = PTR_ERR(cipher_tfm);

                crypto_free_ahash(hash_tfm);
                return err;
        }

        ctx->auth = hash_tfm;
        ctx->enc = cipher_tfm;
        ctx->reqoff = 2 * crypto_ahash_digestsize(hash_tfm);

        crypto_aead_set_reqsize(
                tfm,
                sizeof(struct authenc_esn_request_ctx) +
                ctx->reqoff +
                max_t(unsigned int,
                      crypto_ahash_reqsize(hash_tfm) +
                      sizeof(struct ahash_request),
                      sizeof(struct skcipher_request) +
                      crypto_skcipher_reqsize(cipher_tfm)));

        return 0;
}

/* Transform destructor: release the hash and cipher child transforms. */
static void crypto_authenc_esn_exit_tfm(struct crypto_aead *tfm)
{
        struct crypto_authenc_esn_ctx *tctx = crypto_aead_ctx(tfm);
        struct crypto_ahash *hash_tfm = tctx->auth;
        struct crypto_skcipher *cipher_tfm = tctx->enc;

        crypto_free_ahash(hash_tfm);
        crypto_free_skcipher(cipher_tfm);
}

/* Instance destructor: drop both algorithm spawns, then free the instance. */
static void crypto_authenc_esn_free(struct aead_instance *inst)
{
        struct authenc_esn_instance_ctx *ictx = aead_instance_ctx(inst);

        crypto_drop_skcipher(&ictx->enc);
        crypto_drop_ahash(&ictx->auth);

        kfree(inst);
}

/*
 * Template ->create handler: build and register an "authencesn(auth,enc)"
 * AEAD instance.
 *
 * @tmpl: the authencesn template.
 * @tb:   template attributes; tb[1] names the hash algorithm and tb[2] the
 *        cipher algorithm.
 *
 * Returns 0 on success, -ENOMEM if the instance cannot be allocated,
 * -ENAMETOOLONG if a composed algorithm name does not fit in
 * CRYPTO_MAX_ALG_NAME, or the error from attribute checking, spawn
 * grabbing or registration. On any failure after allocation the instance
 * is released through crypto_authenc_esn_free().
 */
static int crypto_authenc_esn_create(struct crypto_template *tmpl,
                                     struct rtattr **tb)
{
        u32 mask;
        struct aead_instance *inst;
        struct authenc_esn_instance_ctx *ctx;
        struct skcipher_alg_common *enc;
        struct hash_alg_common *auth;
        struct crypto_alg *auth_base;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
        if (err)
                return err;

        /* The instance context is allocated directly behind the instance. */
        inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;
        ctx = aead_instance_ctx(inst);

        err = crypto_grab_ahash(&ctx->auth, aead_crypto_instance(inst),
                                crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;
        auth = crypto_spawn_ahash_alg(&ctx->auth);
        auth_base = &auth->base;

        err = crypto_grab_skcipher(&ctx->enc, aead_crypto_instance(inst),
                                   crypto_attr_alg_name(tb[2]), 0, mask);
        if (err)
                goto err_free_inst;
        enc = crypto_spawn_skcipher_alg_common(&ctx->enc);

        /* Both name compositions below fail with -ENAMETOOLONG on truncation. */
        err = -ENAMETOOLONG;
        if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
                     "authencesn(%s,%s)", auth_base->cra_name,
                     enc->base.cra_name) >= CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                     "authencesn(%s,%s)", auth_base->cra_driver_name,
                     enc->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        /* Priority is dominated by the cipher's; the hash's is added on top. */
        inst->alg.base.cra_priority = enc->base.cra_priority * 10 +
                                      auth_base->cra_priority;
        /* Block size and alignment mask are taken from the cipher. */
        inst->alg.base.cra_blocksize = enc->base.cra_blocksize;
        inst->alg.base.cra_alignmask = enc->base.cra_alignmask;
        inst->alg.base.cra_ctxsize = sizeof(struct crypto_authenc_esn_ctx);

        inst->alg.ivsize = enc->ivsize;
        inst->alg.chunksize = enc->chunksize;
        /* The ICV can be at most the full digest. */
        inst->alg.maxauthsize = auth->digestsize;

        inst->alg.init = crypto_authenc_esn_init_tfm;
        inst->alg.exit = crypto_authenc_esn_exit_tfm;

        inst->alg.setkey = crypto_authenc_esn_setkey;
        inst->alg.setauthsize = crypto_authenc_esn_setauthsize;
        inst->alg.encrypt = crypto_authenc_esn_encrypt;
        inst->alg.decrypt = crypto_authenc_esn_decrypt;

        inst->free = crypto_authenc_esn_free;

        err = aead_register_instance(tmpl, inst);
        if (err) {
                /* Shared error exit: earlier failures jump into this block. */
err_free_inst:
                crypto_authenc_esn_free(inst);
        }
        return err;
}

/*
 * Template descriptor for "authencesn".  Instances are built by
 * crypto_authenc_esn_create() and the template is owned by this module.
 */
static struct crypto_template crypto_authenc_esn_tmpl = {
        .module = THIS_MODULE,
        .name = "authencesn",
        .create = crypto_authenc_esn_create,
};

/*
 * Module load hook: register the "authencesn" template and pass the result
 * of crypto_register_template() back to the caller.
 */
static int __init crypto_authenc_esn_module_init(void)
{
        int err;

        err = crypto_register_template(&crypto_authenc_esn_tmpl);

        return err;
}

/* Module unload hook: unregister the "authencesn" template again. */
static void __exit crypto_authenc_esn_module_exit(void)
{
        struct crypto_template *tmpl = &crypto_authenc_esn_tmpl;

        crypto_unregister_template(tmpl);
}

/* Wire the init/exit routines above into module load and unload. */
module_init(crypto_authenc_esn_module_init);
module_exit(crypto_authenc_esn_module_exit);

/* Module metadata; the crypto alias uses the same name as the template. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
MODULE_DESCRIPTION("AEAD wrapper for IPsec with extended sequence numbers");
MODULE_ALIAS_CRYPTO("authencesn");























































































































































































































































































































































































































   14 






  318 








  316 





























































































































    3 




   13 
    3 











    3 


    3 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Operations on the network namespace
 */
#ifndef __NET_NET_NAMESPACE_H
#define __NET_NET_NAMESPACE_H

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/sysctl.h>
#include <linux/uidgid.h>

#include <net/flow.h>
#include <net/netns/core.h>
#include <net/netns/mib.h>
#include <net/netns/unix.h>
#include <net/netns/packet.h>
#include <net/netns/ipv4.h>
#include <net/netns/ipv6.h>
#include <net/netns/nexthop.h>
#include <net/netns/ieee802154_6lowpan.h>
#include <net/netns/sctp.h>
#include <net/netns/netfilter.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netns/conntrack.h>
#endif
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
#include <net/netns/flow_table.h>
#endif
#include <net/netns/nftables.h>
#include <net/netns/xfrm.h>
#include <net/netns/mpls.h>
#include <net/netns/can.h>
#include <net/netns/xdp.h>
#include <net/netns/smc.h>
#include <net/netns/bpf.h>
#include <net/netns/mctp.h>
#include <net/net_trackers.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
#include <linux/skbuff.h>
#include <linux/notifier.h>
#include <linux/xarray.h>

struct user_namespace;
struct proc_dir_entry;
struct net_device;
struct sock;
struct ctl_table_header;
struct net_generic;
struct uevent_sock;
struct netns_ipvs;
struct bpf_prog;


/*
 * Number of hash bits and the resulting entry count (1 << 8 = 256).
 * NOTE(review): presumably sizes the per-net dev_name_head/dev_index_head
 * tables -- confirm against the users in net/core.
 */
#define NETDEV_HASHBITS    8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

/*
 * struct net - state of one network namespace.
 *
 * Embeds a struct ns_common (see to_net_ns()) together with the per-subsystem
 * netns_* blocks; many members only exist when the owning subsystem is
 * configured in.  The structure is marked __randomize_layout, so do not rely
 * on the declaration order of members.
 */
struct net {
        /* First cache line can be often dirtied.
         * Do not place here read-mostly fields.
         */
        refcount_t                passive;        /* To decide when the network
                                                 * namespace should be freed.
                                                 */
        spinlock_t                rules_mod_lock;

        unsigned int                dev_base_seq;        /* protected by rtnl_mutex */
        u32                        ifindex;

        spinlock_t                nsid_lock;
        atomic_t                fnhe_genid;

        struct list_head        list;                /* list of network namespaces */
        struct list_head        exit_list;        /* Links the net while calling
                                                 * pernet exit methods on a dead
                                                 * net (pernet_ops_rwsem read
                                                 * locked), or while unregistering
                                                 * pernet ops (pernet_ops_rwsem
                                                 * write locked).
                                                 */
        struct llist_node        defer_free_list;
        struct llist_node        cleanup_list;        /* namespaces on death row */

        struct list_head ptype_all;
        struct list_head ptype_specific;

#ifdef CONFIG_KEYS
        struct key_tag                *key_domain;        /* Key domain of operation tag */
#endif
        struct user_namespace   *user_ns;        /* Owning user namespace */
        struct ucounts                *ucounts;
        struct idr                netns_ids;

        struct ns_common        ns;
        struct ref_tracker_dir  refcnt_tracker;
        struct ref_tracker_dir  notrefcnt_tracker; /* tracker for objects not
                                                    * refcounted against netns
                                                    */
        struct list_head         dev_base_head;
        struct proc_dir_entry         *proc_net;
        struct proc_dir_entry         *proc_net_stat;

#ifdef CONFIG_SYSCTL
        struct ctl_table_set        sysctls;
#endif

        struct sock                 *rtnl;                        /* rtnetlink socket */
        struct sock                *genl_sock;

        struct uevent_sock        *uevent_sock;                /* uevent socket */

        struct hlist_head         *dev_name_head;
        struct hlist_head        *dev_index_head;
        struct xarray                dev_by_index;
        struct raw_notifier_head        netdev_chain;

        /* Note that @hash_mix can be read millions of times per second,
         * it is critical that it is on a read_mostly cache line.
         */
        u32                        hash_mix;

        struct net_device       *loopback_dev;          /* The loopback */

        /* core fib_rules */
        struct list_head        rules_ops;

        struct netns_core        core;
        struct netns_mib        mib;
        struct netns_packet        packet;
#if IS_ENABLED(CONFIG_UNIX)
        struct netns_unix        unx;
#endif
        struct netns_nexthop        nexthop;
        struct netns_ipv4        ipv4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netns_ipv6        ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct netns_ieee802154_lowpan        ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
        struct netns_sctp        sctp;
#endif
#ifdef CONFIG_NETFILTER
        struct netns_nf                nf;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        struct netns_ct                ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
        struct netns_nftables        nft;
#endif
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
        struct netns_ft ft;
#endif
#endif
#ifdef CONFIG_WEXT_CORE
        struct sk_buff_head        wext_nlevents;
#endif
        struct net_generic __rcu        *gen;

        /* Used to store attached BPF programs */
        struct netns_bpf        bpf;

        /* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
        struct netns_xfrm        xfrm;
#endif

        u64                        net_cookie; /* written once */

#if IS_ENABLED(CONFIG_IP_VS)
        struct netns_ipvs        *ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
        struct netns_mpls        mpls;
#endif
#if IS_ENABLED(CONFIG_CAN)
        struct netns_can        can;
#endif
#ifdef CONFIG_XDP_SOCKETS
        struct netns_xdp        xdp;
#endif
#if IS_ENABLED(CONFIG_MCTP)
        struct netns_mctp        mctp;
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        struct sock                *crypto_nlsk;
#endif
        struct sock                *diag_nlsk;
#if IS_ENABLED(CONFIG_SMC)
        struct netns_smc        smc;
#endif
#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
        /* Move to a better place when the config guard is removed. */
        struct mutex                rtnl_mutex;
#endif
} __randomize_layout;

#include <linux/seq_file_net.h>

/* Init's network namespace */
extern struct net init_net;

#ifdef CONFIG_NET_NS
struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns,
                        struct net *old_net);

void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);

void net_ns_barrier(void);

struct ns_common *get_net_ns(struct ns_common *ns);
struct net *get_net_ns_by_fd(int fd);
extern struct task_struct *cleanup_net_task;

#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
/*
 * CONFIG_NET_NS=n variant: a request for a new network namespace
 * (CLONE_NEWNET set in @flags) is refused with ERR_PTR(-EINVAL); any other
 * request simply hands @old_net back.  @user_ns is not used.
 */
static inline struct net *copy_net_ns(u64 flags,
        struct user_namespace *user_ns, struct net *old_net)
{
        return (flags & CLONE_NEWNET) ? ERR_PTR(-EINVAL) : old_net;
}

/*
 * CONFIG_NET_NS=n variant: ownership is always reported as global root.
 * @net is not consulted; *@uid and *@gid are overwritten unconditionally.
 */
static inline void net_ns_get_ownership(const struct net *net,
                                        kuid_t *uid, kgid_t *gid)
{
        *gid = GLOBAL_ROOT_GID;
        *uid = GLOBAL_ROOT_UID;
}

/* CONFIG_NET_NS=n variant: nothing to wait for, so this is a no-op. */
static inline void net_ns_barrier(void) {}

/* CONFIG_NET_NS=n variant: always fails with ERR_PTR(-EINVAL); @ns is ignored. */
static inline struct ns_common *get_net_ns(struct ns_common *ns)
{
        return ERR_PTR(-EINVAL);
}

/* CONFIG_NET_NS=n variant: always fails with ERR_PTR(-EINVAL); @fd is ignored. */
static inline struct net *get_net_ns_by_fd(int fd)
{
        return ERR_PTR(-EINVAL);
}
#endif /* CONFIG_NET_NS */


extern struct list_head net_namespace_list;

struct net *get_net_ns_by_pid(pid_t pid);

/*
 * IPX sysctl hooks: real functions when CONFIG_SYSCTL is set, otherwise
 * macros that expand to nothing so call sites need no #ifdef.
 */
#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
void ipx_unregister_sysctl(void);
#else
#define ipx_register_sysctl()
#define ipx_unregister_sysctl()
#endif

#ifdef CONFIG_NET_NS
void __put_net(struct net *net);

/* Map an embedded ns_common back to the struct net that contains it. */
static inline struct net *to_net_ns(struct ns_common *ns)
{
        struct net *net = container_of(ns, struct net, ns);

        return net;
}

/*
 * Take a reference on @net via ns_ref_inc() and return @net so the call can
 * be used inline in an expression.
 *
 * Try using get_net_track() instead.
 */
static inline struct net *get_net(struct net *net)
{
        ns_ref_inc(net);
        return net;
}

/*
 * Take a reference on a net that is known to exist but for which no previous
 * reference is guaranteed.  Returns @net when ns_ref_get() succeeds; if the
 * reference count is already zero the attempt fails and NULL is returned.
 */
static inline struct net *maybe_get_net(struct net *net)
{
        return ns_ref_get(net) ? net : NULL;
}

/*
 * Drop a reference on @net.  When ns_ref_put() reports true the net is
 * handed to __put_net().
 *
 * Try using put_net_track() instead.
 */
static inline void put_net(struct net *net)
{
        if (!ns_ref_put(net))
                return;

        __put_net(net);
}

/* Returns 1 when both pointers designate the same network namespace, else 0. */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        if (net1 != net2)
                return 0;

        return 1;
}

/* Returns 1 while ns_ref_read() reports a non-zero reference count, else 0. */
static inline int check_net(const struct net *net)
{
        if (ns_ref_read(net) == 0)
                return 0;

        return 1;
}

void net_drop_ns(void *);
void net_passive_dec(struct net *net);

#else

/* CONFIG_NET_NS=n variant: no reference counting, @net is returned as is. */
static inline struct net *get_net(struct net *net)
{
        return net;
}

/* CONFIG_NET_NS=n variant: no reference counting, so this is a no-op. */
static inline void put_net(struct net *net)
{
}

/* CONFIG_NET_NS=n variant: never fails, @net is returned as is. */
static inline struct net *maybe_get_net(struct net *net)
{
        return net;
}

/*
 * CONFIG_NET_NS=n variant: unconditionally reports the two nets as equal;
 * the arguments are not inspected.
 */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        return 1;
}

/* CONFIG_NET_NS=n variant: unconditionally returns 1; @net is not inspected. */
static inline int check_net(const struct net *net)
{
        return 1;
}

#define net_drop_ns NULL

static inline void net_passive_dec(struct net *net)
{
        refcount_dec(&net->passive);
}
#endif

static inline void net_passive_inc(struct net *net)
{
        refcount_inc(&net->passive);
}

/* Returns true if the netns initialization is completed successfully */
static inline bool net_initialized(const struct net *net)
{
        return READ_ONCE(net->list.next);
}

/*
 * Record @tracker in one of @net's reference tracker directories:
 * refcnt_tracker when @refcounted is true, notrefcnt_tracker otherwise.
 * Compiles to nothing unless CONFIG_NET_NS_REFCNT_TRACKER is set.
 */
static inline void __netns_tracker_alloc(struct net *net,
                                         netns_tracker *tracker,
                                         bool refcounted,
                                         gfp_t gfp)
{
#ifdef CONFIG_NET_NS_REFCNT_TRACKER
        struct ref_tracker_dir *dir;

        if (refcounted)
                dir = &net->refcnt_tracker;
        else
                dir = &net->notrefcnt_tracker;

        ref_tracker_alloc(dir, tracker, gfp);
#endif
}

/*
 * Shorthand for __netns_tracker_alloc() with refcounted == true, i.e. the
 * tracker is recorded in net->refcnt_tracker.
 */
static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker,
                                       gfp_t gfp)
{
        __netns_tracker_alloc(net, tracker, true, gfp);
}

/*
 * Release @tracker from one of @net's reference tracker directories:
 * refcnt_tracker when @refcounted is true, notrefcnt_tracker otherwise.
 * Compiles to nothing unless CONFIG_NET_NS_REFCNT_TRACKER is set.
 */
static inline void __netns_tracker_free(struct net *net,
                                        netns_tracker *tracker,
                                        bool refcounted)
{
#ifdef CONFIG_NET_NS_REFCNT_TRACKER
        struct ref_tracker_dir *dir;

        if (refcounted)
                dir = &net->refcnt_tracker;
        else
                dir = &net->notrefcnt_tracker;

        ref_tracker_free(dir, tracker);
#endif
}

/*
 * Tracked variant of get_net(): takes the reference first, then records
 * @tracker against the net using @gfp.  Returns @net.
 */
static inline struct net *get_net_track(struct net *net,
                                        netns_tracker *tracker, gfp_t gfp)
{
        struct net *held = get_net(net);

        netns_tracker_alloc(held, tracker, gfp);

        return held;
}

/*
 * Tracked variant of put_net(): releases @tracker from net->refcnt_tracker
 * and then drops the reference.
 */
static inline void put_net_track(struct net *net, netns_tracker *tracker)
{
        /*
         * Free the tracker while @net is still referenced; the tracker
         * directory lives inside @net.
         * NOTE(review): presumably put_net() may release @net -- confirm.
         */
        __netns_tracker_free(net, tracker, true);
        put_net(net);
}

/*
 * Holder for a network namespace pointer.  The pointer member only exists
 * with CONFIG_NET_NS; otherwise the structure is empty.  Access it through
 * write_pnet(), read_pnet() and read_pnet_rcu().
 */
typedef struct {
#ifdef CONFIG_NET_NS
        struct net __rcu *net;
#endif
} possible_net_t;

/*
 * Store @net in @pnet using rcu_assign_pointer().  Without CONFIG_NET_NS
 * there is nothing to store and this is a no-op.
 */
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
        rcu_assign_pointer(pnet->net, net);
#endif
}

/*
 * Fetch the net stored in @pnet.  With CONFIG_NET_NS the pointer is read via
 * rcu_dereference_protected() with a constant-true condition; without it the
 * answer is always &init_net.
 */
static inline struct net *read_pnet(const possible_net_t *pnet)
{
        struct net *net;

#ifdef CONFIG_NET_NS
        net = rcu_dereference_protected(pnet->net, true);
#else
        net = &init_net;
#endif
        return net;
}

/*
 * Fetch the net stored in @pnet using rcu_dereference().  Without
 * CONFIG_NET_NS the answer is always &init_net.
 */
static inline struct net *read_pnet_rcu(const possible_net_t *pnet)
{
        struct net *net;

#ifdef CONFIG_NET_NS
        net = rcu_dereference(pnet->net);
#else
        net = &init_net;
#endif
        return net;
}

/*
 * Iterators over net_namespace_list, linked through struct net::list.
 * Protected by net_rwsem.
 *
 * for_each_net()                  - forward walk over every net
 * for_each_net_continue_reverse() - walk backwards, continuing from VAR
 * for_each_net_rcu()              - forward walk using the RCU list iterator
 */
#define for_each_net(VAR)                                \
        list_for_each_entry(VAR, &net_namespace_list, list)
#define for_each_net_continue_reverse(VAR)                \
        list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list)
#define for_each_net_rcu(VAR)                                \
        list_for_each_entry_rcu(VAR, &net_namespace_list, list)

/*
 * Section annotations for per-net init/exit code and data.  With
 * CONFIG_NET_NS they expand to nothing; without it they map to __init,
 * __ref, __initdata and __initconst respectively.
 */
#ifdef CONFIG_NET_NS
#define __net_init
#define __net_exit
#define __net_initdata
#define __net_initconst
#else
#define __net_init        __init
#define __net_exit        __ref
#define __net_initdata        __initdata
#define __net_initconst        __initconst
#endif

int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
int peernet2id(const struct net *net, struct net *peer);
bool peernet_has_id(const struct net *net, struct net *peer);
struct net *get_net_ns_by_id(const struct net *net, int id);

/*
 * struct pernet_operations - per network namespace lifecycle callbacks,
 * registered with register_pernet_subsys() or register_pernet_device().
 */
struct pernet_operations {
        struct list_head list;
        /*
         * The methods below are called without any exclusive locks.
         * More than one net may be constructed and destructed
         * in parallel on several cpus. Every pernet_operations
         * has to keep all other pernet_operations in mind and
         * introduce locking if they share common resources.
         *
         * The only time they are called with an exclusive lock is
         * from register_pernet_subsys(), unregister_pernet_subsys(),
         * register_pernet_device() and unregister_pernet_device().
         *
         * Exit methods using blocking RCU primitives, such as
         * synchronize_rcu(), should be implemented via exit_batch.
         * Then, destruction of a group of nets requires a single
         * synchronize_rcu() related to these pernet_operations,
         * instead of a separate synchronize_rcu() for every net.
         * Please avoid synchronize_rcu() altogether where possible.
         *
         * Note that a combination of pre_exit() and exit() can
         * be used, since a synchronize_rcu() is guaranteed between
         * the calls.
         */
        int (*init)(struct net *net);
        void (*pre_exit)(struct net *net);
        void (*exit)(struct net *net);
        void (*exit_batch)(struct list_head *net_exit_list);
        /* Following method is called with RTNL held. */
        void (*exit_rtnl)(struct net *net,
                          struct list_head *dev_kill_list);
        /*
         * NOTE(review): presumably the slot id and size of the per-net
         * private data for these ops -- confirm against the registration
         * code.
         */
        unsigned int * const id;
        const size_t size;
};

/*
 * Use these carefully.  If you implement a network device and it
 * needs per network namespace operations use device pernet operations,
 * otherwise use pernet subsys operations.
 *
 * Network interfaces need to be removed from a dying netns _before_
 * subsys notifiers can be called, as most of the network code cleanup
 * (which is done from subsys notifiers) runs with the assumption that
 * dev_remove_pack has been called so no new packets will arrive during
 * and after the cleanup functions have been called.  dev_remove_pack
 * is not per namespace so instead the guarantee of no more packets
 * arriving in a network namespace is provided by ensuring that all
 * network devices and all sockets have left the network namespace
 * before the cleanup methods are called.
 *
 * For the longest time the ipv4 icmp code was registered as a pernet
 * device which caused kernel oops, and panics during network
 * namespace cleanup.   So please don't get this wrong.
 */
int register_pernet_subsys(struct pernet_operations *);
void unregister_pernet_subsys(struct pernet_operations *);
int register_pernet_device(struct pernet_operations *);
void unregister_pernet_device(struct pernet_operations *);

struct ctl_table;

/*
 * Convenience wrapper around register_net_sysctl_sz() that takes the element
 * count from ARRAY_SIZE(table); @table must therefore be a real array, not a
 * pointer.
 */
#define register_net_sysctl(net, path, table)        \
        register_net_sysctl_sz(net, path, table, ARRAY_SIZE(table))
#ifdef CONFIG_SYSCTL
int net_sysctl_init(void);
struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path,
                                             struct ctl_table *table, size_t table_size);
void unregister_net_sysctl_table(struct ctl_table_header *header);
#else
/*
 * Without CONFIG_SYSCTL: init reports success (0), registration yields NULL
 * and unregistration does nothing.
 */
static inline int net_sysctl_init(void) { return 0; }
static inline struct ctl_table_header *register_net_sysctl_sz(struct net *net,
        const char *path, struct ctl_table *table, size_t table_size)
{
        return NULL;
}
static inline void unregister_net_sysctl_table(struct ctl_table_header *header)
{
}
#endif

/* Read the current value of net->ipv4.rt_genid. */
static inline int rt_genid_ipv4(const struct net *net)
{
        int genid = atomic_read(&net->ipv4.rt_genid);

        return genid;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Read the current value of net->ipv6.fib6_sernum. */
static inline int rt_genid_ipv6(const struct net *net)
{
        int sernum = atomic_read(&net->ipv6.fib6_sernum);

        return sernum;
}
#endif

static inline void rt_genid_bump_ipv4(struct net *net)
{
        atomic_inc(&net->ipv4.rt_genid);
}

/*
 * Optional hook, defined elsewhere; rt_genid_bump_ipv6() calls it only when
 * it is non-NULL.
 */
extern void (*__fib6_flush_trees)(struct net *net);

/* Invoke the __fib6_flush_trees hook for @net if one is installed. */
static inline void rt_genid_bump_ipv6(struct net *net)
{
        if (!__fib6_flush_trees)
                return;

        __fib6_flush_trees(net);
}

#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
/* Accessor for the IEEE 802.15.4 6LoWPAN state embedded in @net. */
static inline struct netns_ieee802154_lowpan *
net_ieee802154_lowpan(struct net *net)
{
        struct netns_ieee802154_lowpan *lowpan = &net->ieee802154_lowpan;

        return lowpan;
}
#endif

/*
 * For callers who don't really care about whether it's IPv4 or IPv6:
 * bumps the IPv4 generation id first, then runs the IPv6 bump.
 */
static inline void rt_genid_bump_all(struct net *net)
{
        rt_genid_bump_ipv4(net);
        rt_genid_bump_ipv6(net);
}

/* Read the current value of net->fnhe_genid. */
static inline int fnhe_genid(const struct net *net)
{
        int genid = atomic_read(&net->fnhe_genid);

        return genid;
}

static inline void fnhe_genid_bump(struct net *net)
{
        atomic_inc(&net->fnhe_genid);
}

/*
 * Network namespace setup entry point; a real function with CONFIG_NET,
 * otherwise an empty inline stub.
 */
#ifdef CONFIG_NET
void net_ns_init(void);
#else
static inline void net_ns_init(void) {}
#endif

#endif /* __NET_NET_NAMESPACE_H */


















































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Internal procfs definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/mm.h>

struct ctl_table_header;
struct mempolicy;

/*
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * parent/subdir are used for the directory structure (every /proc file has a
 * parent, but "subdir" is empty for all non-directory entries).
 * subdir_node is used to build the rb tree "subdir" of the parent.
 *
 * The structure ends in a flexible array (inline_name) and is marked
 * __randomize_layout.
 */
struct proc_dir_entry {
        /*
         * number of callers into module in progress;
         * negative -> it's going away RSN
         */
        atomic_t in_use;
        /* reference count, see pde_get()/pde_put() */
        refcount_t refcnt;
        struct list_head pde_openers;        /* who did ->open, but not ->release */
        /* protects ->pde_openers and all struct pde_opener instances */
        spinlock_t pde_unload_lock;
        struct completion *pde_unload_completion;
        const struct inode_operations *proc_iops;
        /* operations table: proc_ops or, alternatively, directory file ops */
        union {
                const struct proc_ops *proc_ops;
                const struct file_operations *proc_dir_ops;
        };
        /* seq_file backing: full seq_operations or a single show callback */
        union {
                const struct seq_operations *seq_ops;
                int (*single_show)(struct seq_file *, void *);
        };
        proc_write_t write;
        void *data;
        unsigned int state_size;
        unsigned int low_ino;
        nlink_t nlink;
        kuid_t uid;
        kgid_t gid;
        loff_t size;
        struct proc_dir_entry *parent;
        struct rb_root subdir;
        struct rb_node subdir_node;
        /*
         * NOTE(review): presumably points at inline_name for names that fit
         * in SIZEOF_PDE_INLINE_NAME -- confirm in generic.c.
         */
        char *name;
        umode_t mode;
        /* PROC_ENTRY_* bits, tested by the pde_is_*()/pde_has_*() helpers */
        u8 flags;
        u8 namelen;
        char inline_name[];
} __randomize_layout;

/*
 * SIZEOF_PDE: the smallest of 128, 192, 256 and 512 that is strictly greater
 * than sizeof(struct proc_dir_entry); evaluates to 0 when the structure is
 * 512 bytes or larger.
 *
 * SIZEOF_PDE_INLINE_NAME: the bytes left over between the structure size and
 * SIZEOF_PDE.
 */
#define SIZEOF_PDE        (                                \
        sizeof(struct proc_dir_entry) < 128 ? 128 :        \
        sizeof(struct proc_dir_entry) < 192 ? 192 :        \
        sizeof(struct proc_dir_entry) < 256 ? 256 :        \
        sizeof(struct proc_dir_entry) < 512 ? 512 :        \
        0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))

/* True when PROC_ENTRY_PERMANENT is set in pde->flags. */
static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
{
        return (pde->flags & PROC_ENTRY_PERMANENT) != 0;
}

/* Set PROC_ENTRY_PERMANENT in pde->flags, leaving the other bits alone. */
static inline void pde_make_permanent(struct proc_dir_entry *pde)
{
        pde->flags = pde->flags | PROC_ENTRY_PERMANENT;
}

/* True when PROC_ENTRY_proc_read_iter is set in pde->flags. */
static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde)
{
        return (pde->flags & PROC_ENTRY_proc_read_iter) != 0;
}

/*
 * True when PROC_ENTRY_proc_compat_ioctl is set in pde->flags.  Always false
 * when CONFIG_COMPAT is not enabled.
 */
static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
{
        bool has_compat_ioctl = false;

#ifdef CONFIG_COMPAT
        has_compat_ioctl = (pde->flags & PROC_ENTRY_proc_compat_ioctl) != 0;
#endif
        return has_compat_ioctl;
}

/* True when PROC_ENTRY_proc_lseek is set in pde->flags. */
static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde)
{
        return (pde->flags & PROC_ENTRY_proc_lseek) != 0;
}

extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);

/*
 * Per-inode operation payload stored in struct proc_inode::op.  Only one
 * member is meaningful for a given inode.
 * NOTE(review): which member applies is decided by the code that creates
 * the inode -- not visible from this header.
 */
union proc_op {
        int (*proc_get_link)(struct dentry *, struct path *);
        int (*proc_show)(struct seq_file *m,
                struct pid_namespace *ns, struct pid *pid,
                struct task_struct *task);
        int lsmid;
};

/*
 * procfs-private inode.  The generic VFS inode is embedded as vfs_inode and
 * PROC_I() maps a struct inode back to its containing proc_inode.  The
 * structure is marked __randomize_layout.
 */
struct proc_inode {
        struct pid *pid;                        /* returned by proc_pid() */
        unsigned int fd;
        union proc_op op;
        struct proc_dir_entry *pde;                /* returned by PDE() */
        struct ctl_table_header *sysctl;
        const struct ctl_table *sysctl_entry;
        struct hlist_node sibling_inodes;
        const struct proc_ns_operations *ns_ops;
        struct inode vfs_inode;                        /* embedded VFS inode */
} __randomize_layout;

/*
 * General functions
 */
/* Map a VFS inode to the proc_inode that embeds it as vfs_inode. */
static inline struct proc_inode *PROC_I(const struct inode *inode)
{
        struct proc_inode *pi = container_of(inode, struct proc_inode, vfs_inode);

        return pi;
}

static inline struct proc_dir_entry *PDE(const struct inode *inode)
{
        return PROC_I(inode)->pde;
}

static inline struct pid *proc_pid(const struct inode *inode)
{
        return PROC_I(inode)->pid;
}

/*
 * Look up the task behind @inode: takes the inode's struct pid and resolves
 * it with get_pid_task() using PIDTYPE_PID.
 */
static inline struct task_struct *get_proc_task(const struct inode *inode)
{
        struct pid *pid = proc_pid(inode);

        return get_pid_task(pid, PIDTYPE_PID);
}

void task_dump_owner(struct task_struct *task, umode_t mode,
                     kuid_t *ruid, kgid_t *rgid);

unsigned name_to_int(const struct qstr *qstr);
/*
 * Offset of the first process in the /proc root directory.
 */
#define FIRST_PROCESS_ENTRY 256

/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13

#ifdef CONFIG_PAGE_MAPCOUNT
/**
 * folio_precise_page_mapcount() - Number of mappings of this folio page.
 * @folio: The folio.
 * @page: The page.
 *
 * Counts the present user page table entries that reference this page as
 * tracked via the RMAP: entries that reference the page directly (PTE) plus,
 * for large folios, entries that map a larger area covering this page
 * (e.g., PMD).
 *
 * Use this function only for the calculation of existing statistics
 * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount).
 *
 * Do not add new users.
 *
 * Returns: The number of mappings of this folio page. 0 for
 * folios that are not mapped to user space or are not tracked via the RMAP
 * (e.g., shared zeropage).
 */
static inline int folio_precise_page_mapcount(struct folio *folio,
                struct page *page)
{
        int direct = atomic_read(&page->_mapcount) + 1;
        int entire = 0;

        /* A value that encodes a page type is not a mapcount at all. */
        if (page_mapcount_is_type(direct))
                direct = 0;

        /* Large folios additionally carry entire-folio mappings. */
        if (folio_test_large(folio))
                entire = folio_entire_mapcount(folio);

        return direct + entire;
}
#else /* !CONFIG_PAGE_MAPCOUNT */
/*
 * Without CONFIG_PAGE_MAPCOUNT there is no per-page mapcount to read: any
 * call that survives to code generation trips BUILD_BUG().
 */
static inline int folio_precise_page_mapcount(struct folio *folio,
                struct page *page)
{
        BUILD_BUG();
}
#endif /* CONFIG_PAGE_MAPCOUNT */

/**
 * folio_average_page_mapcount() - Average number of mappings per page in this
 *                                   folio
 * @folio: The folio.
 *
 * Averages, over the pages of the folio, the user page table entries that
 * reference each page as tracked via the RMAP: entries referencing a page
 * directly (PTE) or mapping a larger area that covers it (e.g., PMD).
 *
 * The average is rounded to the nearest integer; however, to avoid
 * duplicated code in current callers, the result is at least 1 if any page
 * of the folio is mapped.
 *
 * Returns: The average number of mappings per page in this folio.
 */
static inline int folio_average_page_mapcount(struct folio *folio)
{
        int total, entire, per_page;

        /* Small folio: the single page's mapcount is the answer. */
        if (!folio_test_large(folio))
                return atomic_read(&folio->_mapcount) + 1;

        total = folio_large_mapcount(folio);
        if (unlikely(total <= 0))
                return 0;

        entire = folio_entire_mapcount(folio);
        if (total <= entire)
                return entire;

        /*
         * Spread the mappings beyond the entire-folio ones over all pages,
         * rounding to the closest integer ...
         */
        per_page = ((unsigned int)(total - entire) + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio);

        /* ... but report at least 1. */
        return max_t(int, per_page + entire, 1);
}
/*
 * array.c
 */
extern const struct file_operations proc_tid_children_operations;

extern void proc_task_name(struct seq_file *m, struct task_struct *p,
                           bool escape);
extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
                         struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
                          struct pid *, struct task_struct *);
extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
                           struct pid *, struct task_struct *);
extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
                          struct pid *, struct task_struct *);

/*
 * base.c
 */
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(struct mnt_idmap *, const struct path *,
                       struct kstat *, u32, unsigned int);
extern int proc_setattr(struct mnt_idmap *, struct dentry *,
                        struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);

/* Lookups */
/*
 * Callback type used by the lookup helpers: it receives a dentry, a task
 * and an opaque const pointer, and returns a dentry pointer.
 * proc_fill_cache() below takes one of these together with the task and
 * opaque pointer arguments.
 * NOTE(review): what the opaque pointer carries and how errors are reported
 * is decided by each implementation -- confirm against the users in base.c.
 */
typedef struct dentry *instantiate_t(struct dentry *,
                                     struct task_struct *, const void *);
bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int,
                           instantiate_t, struct task_struct *, const void *);

/*
 * generic.c
 */
struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
                struct proc_dir_entry **parent, void *data);
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
                struct proc_dir_entry *dp);
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
extern int proc_readdir(struct file *, struct dir_context *);
int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);

/*
 * Take an additional reference on @pde by incrementing pde->refcnt.
 * pde_put() is declared right after this helper; presumably it is the
 * matching release -- confirm in generic.c.
 */
static inline void pde_get(struct proc_dir_entry *pde)
{
        refcount_inc(&pde->refcnt);
}
extern void pde_put(struct proc_dir_entry *);

/*
 * Report whether @pde is an "empty" directory entry: its mode says it is a
 * directory and it has no inode operations (proc_iops) attached.
 */
static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
        /* Anything that is not a directory can never count as empty. */
        if (!S_ISDIR(pde->mode))
                return false;

        return !pde->proc_iops;
}
extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *);

/*
 * inode.c
 */
/*
 * Per-opener record for a proc entry.
 * NOTE(review): the field descriptions below are inferred from names and
 * types only; confirm them against the users in inode.c.
 */
struct pde_opener {
        struct list_head lh;    /* list linkage; presumably on the entry's opener list -- confirm */
        struct file *file;      /* the struct file this record belongs to */
        bool closing;           /* presumably set while a release is in progress -- TODO confirm */
        struct completion *c;   /* presumably signalled to wake a waiter -- TODO confirm */
} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;

void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);

/*
 * proc_namespaces.c
 */
extern const struct inode_operations proc_ns_dir_inode_operations;
extern const struct file_operations proc_ns_dir_operations;

/*
 * proc_net.c
 */
extern const struct file_operations proc_net_operations;
extern const struct inode_operations proc_net_inode_operations;

#ifdef CONFIG_NET
extern int proc_net_init(void);
#else
/* CONFIG_NET is disabled: there is nothing to set up, so report success. */
static inline int proc_net_init(void)
{
        return 0;
}
#endif

/*
 * proc_self.c
 */
extern int proc_setup_self(struct super_block *);

/*
 * proc_thread_self.c
 */
extern int proc_setup_thread_self(struct super_block *);
extern void proc_thread_self_init(void);

/*
 * proc_sysctl.c
 */
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
extern void proc_sys_evict_inode(struct inode *inode,
                                 struct ctl_table_header *head);
#else
/*
 * Stubs used when CONFIG_PROC_SYSCTL is disabled.
 *
 * proc_sys_init() returns int, like the real declaration in the
 * CONFIG_PROC_SYSCTL branch, so the function has the same signature in
 * every configuration. With nothing to initialise it always returns 0.
 */
static inline int proc_sys_init(void)
{
        return 0;
}

/* No sysctl support: there is no per-inode sysctl state to drop. */
static inline void proc_sys_evict_inode(struct inode *inode,
                                        struct ctl_table_header *head)
{
}
#endif

/*
 * proc_tty.c
 */
#ifdef CONFIG_TTY
extern void proc_tty_init(void);
#else
/* CONFIG_TTY is disabled: nothing to initialise. */
static inline void proc_tty_init(void)
{
}
#endif

/*
 * root.c
 */
extern struct proc_dir_entry proc_root;

extern void proc_self_init(void);

/*
 * task_[no]mmu.c
 */
struct mem_size_stats;

/*
 * Locking context embedded in struct proc_maps_private (as lock_ctx).
 * NOTE(review): field semantics are inferred from names; confirm against
 * the task_[no]mmu.c users.
 */
struct proc_maps_locking_ctx {
        struct mm_struct *mm;   /* the mm this context refers to */
#ifdef CONFIG_PER_VMA_LOCK
        /* Only present when per-VMA locking is configured. */
        bool mmap_locked;       /* presumably true while the mmap lock is held -- TODO confirm */
        struct vm_area_struct *locked_vma;      /* presumably the VMA currently locked, if any -- TODO confirm */
#endif
};

/*
 * Private state for the maps-style proc files.
 * NOTE(review): presumably one instance per open file; field meanings are
 * inferred from names and types -- confirm against task_[no]mmu.c.
 */
struct proc_maps_private {
        struct inode *inode;            /* inode associated with this state */
        struct task_struct *task;       /* task associated with this state */
        struct vma_iterator iter;       /* VMA iterator state */
        loff_t last_pos;                /* presumably the last position handled -- TODO confirm */
        struct proc_maps_locking_ctx lock_ctx;  /* mm pointer and lock bookkeeping */
#ifdef CONFIG_NUMA
        struct mempolicy *task_mempolicy;       /* only with CONFIG_NUMA; presumably the task's policy -- confirm */
#endif
} __randomize_layout;

struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);

extern const struct file_operations proc_pid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;

extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
                                unsigned long *, unsigned long *,
                                unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);

extern const struct dentry_operations proc_net_dentry_ops;
/*
 * Set PROC_ENTRY_FORCE_LOOKUP in @pde's flags; all other flag bits are
 * left as they were.
 */
static inline void pde_force_lookup(struct proc_dir_entry *pde)
{
        /*
         * Entries under /proc/net/ may change underneath us when a task
         * calls setns(CLONE_NEWNET).
         */
        pde->flags = pde->flags | PROC_ENTRY_FORCE_LOOKUP;
}

/*
 * Add a new procfs dentry that can't serve as a mountpoint. That should
 * encompass anything that is ephemeral and can just disappear while the
 * process is still around.
 */
static inline struct dentry *proc_splice_unmountable(struct inode *inode,
                struct dentry *dentry, const struct dentry_operations *d_ops)
{
        /*
         * dont_mount() is applied before the dentry is spliced in;
         * presumably so the dentry is never reachable without the
         * restriction -- keep this ordering.
         */
        dont_mount(dentry);
        /* Hand back whatever d_splice_alias_ops() returns, unchanged. */
        return d_splice_alias_ops(inode, dentry, d_ops);
}













































































































































































































































































































































































  316 

































































  314 








  318 























    3 







    3 


    3 
































































































   67 



  313 
  291 













   67 




















  317 


  313 
  291 
































  314 
  313 

  315 
  314 


  311 



  313 
  317 









  313 
















  317 




































































































































































































































































































































































  312 



  316 



  312 






  312 

  314 


  314 



  319 
  314 

  316 




  315 


  313 
  318 


  313 


  314 










  316 
  312 








  316 


  318 




















  317 



  316 
  319 




  319 


  319 



  315 

  317 

  316 










   18 


  315 







   69 







   18 
    1 

















   19 
   19 
   18 
   19 














   51 







   50 


   51 





   51 




   50 
   51 

   51 

   51 





   51 











































































































































  315 
  319 



  315 
   18 




























































































  316 



  317 
  318 
  313 




  311 































  316 






  314 









  319 































































































































































































































































































































































































































































































  317 





  319 








  315 


  316 



  314 


  318 
  315 





   15 


   15 



   14 


   15 























































































































































   13 





   15 

   15 

   15 




   15 


   15 















   15 














   15 























   15 

   15 




















































































































































































































































































  318 

  313 

  318 




































  317 





































    1 

    1 
    1 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    6 








    6 



















































































  265 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *        Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                        Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *        Fixes:
 *                Alan Cox        :        Fixed the worst of the load
 *                                        balancer bugs.
 *                Dave Platt        :        Interrupt stacking fix.
 *        Richard Kooijman        :        Timestamp fixes.
 *                Alan Cox        :        Changed buffer format.
 *                Alan Cox        :        destructor hook for AF_UNIX etc.
 *                Linus Torvalds        :        Better skb_clone.
 *                Alan Cox        :        Added skb_copy.
 *                Alan Cox        :        Added all the changed routines Linus
 *                                        only put in the headers
 *                Ray VanTassle        :        Fixed --skb->lock in free
 *                Alan Cox        :        skb_copy copy arp field
 *                Andi Kleen        :        slabified it.
 *                Robert Olsson        :        Removed skb_head_pool
 *
 *        NOTE:
 *                The __skb_ routines should be called with interrupts
 *        disabled, or you better be *real* sure that the operation is atomic
 *        with respect to whatever list is being frobbed (e.g. via lock_sock()
 *        or via disabling bottom half handlers, etc).
 */

/*
 *        The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
#include <linux/crc32.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/psp/types.h>
#include <net/dropreason.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "devmem.h"
#include "netmem_priv.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

/* GRO_MAX_HEAD plus the NET_SKB_PAD and NET_IP_ALIGN padding. */
#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
/* The larger of MAX_TCP_HEADER and GRO_MAX_HEAD_PAD, rounded by
 * SKB_HEAD_ALIGN().
 */
#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
                                               GRO_MAX_HEAD_PAD))

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 *
 * When SKB_SMALL_HEAD_SIZE happens to be a power of two, one extra
 * L1_CACHE_BYTES is added to move it off that value.
 */
#define SKB_SMALL_HEAD_CACHE_SIZE                                        \
        (is_power_of_2(SKB_SMALL_HEAD_SIZE) ?                        \
                (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :        \
                SKB_SMALL_HEAD_SIZE)

/* SKB_WITH_OVERHEAD() applied to the small-head cache object size. */
#define SKB_SMALL_HEAD_HEADROOM                                                \
        SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as
 * the netmem is a page.
 *
 * Each pair below checks that one bio_vec member has the same offset and the
 * same size as its skb_frag_t counterpart.
 */
/* bv_page <-> netmem */
static_assert(offsetof(struct bio_vec, bv_page) ==
              offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
              sizeof_field(skb_frag_t, netmem));

/* bv_len <-> len */
static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
              sizeof_field(skb_frag_t, len));

/* bv_offset <-> offset */
static_assert(offsetof(struct bio_vec, bv_offset) ==
              offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
              sizeof_field(skb_frag_t, offset));

/* FN(reason) expands to a designated initializer that stores the stringified
 * reason name at index SKB_DROP_REASON_<reason>.
 */
#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
/* Name table for the core drop reasons, indexed by reason value.
 * NOTE(review): DEFINE_DROP_REASON() is defined outside this view; it
 * presumably applies FN to every core reason - confirm in net/dropreason-core.h.
 */
static const char * const drop_reasons[] = {
        [SKB_CONSUMED] = "CONSUMED",
        DEFINE_DROP_REASON(FN, FN)
};

/* Descriptor wrapping the core name table together with its entry count. */
static const struct drop_reason_list drop_reasons_core = {
        .reasons = drop_reasons,
        .n_reasons = ARRAY_SIZE(drop_reasons),
};

/* Per-subsystem drop reason lists, indexed by subsystem. Only the core slot
 * is filled in statically; the other slots are written by
 * drop_reasons_register_subsys() and cleared by
 * drop_reasons_unregister_subsys().
 */
const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
        [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - install the drop reason list of a subsystem
 * @subsys: slot to fill in drop_reasons_by_subsys[]; the core subsystem and
 *          any value outside the table are rejected with a warning
 * @list: drop reason list to publish for @subsys; it must be statically
 *        initialized because only the pointer is stored
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
                                  const struct drop_reason_list *list)
{
        if (!WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
                  subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
                  "invalid subsystem %d\n", subsys)) {
                /* The list is static memory, so RCU_INIT_POINTER() is OK. */
                RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
        }
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - remove the drop reason list of a subsystem
 * @subsys: slot to clear in drop_reasons_by_subsys[]; the core subsystem and
 *          any value outside the table are rejected with a warning
 *
 * Note: the slot is set to NULL and synchronize_rcu() is then called, so no
 * users of the old list remain when this function returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
        if (!WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
                  subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
                  "invalid subsystem %d\n", subsys)) {
                RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
                synchronize_rcu();
        }
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);

/**
 *        skb_panic - log the state of a buffer and BUG()
 *        @skb:        buffer whose pointers and lengths are reported
 *        @sz:        size that was requested by the failing operation
 *        @addr:        code address reported in the "text:" field
 *        @msg:        message tag, skb_over_panic or skb_under_panic
 *
 *        Out-of-line helper for skb_put() and skb_push(), reached through
 *        the skb_over_panic() and skb_under_panic() wrappers.  It stays out
 *        of line to prevent kernel bloat.  The caller passes the address
 *        explicitly because __builtin_return_address is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
                      const char msg[])
{
        /* A buffer without a device is reported with a placeholder name. */
        const char *devname = skb->dev ? skb->dev->name : "<NULL>";
        unsigned long tail = (unsigned long)skb->tail;
        unsigned long end = (unsigned long)skb->end;

        pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
                 msg, addr, skb->len, sz, skb->head, skb->data, tail, end,
                 devname);
        BUG();
}

/* Forward to skb_panic(), using this wrapper's own name (__func__) as the
 * message tag so the log line identifies which wrapper was hit.
 */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

/* Forward to skb_panic(), using this wrapper's own name (__func__) as the
 * message tag so the log line identifies which wrapper was hit.
 */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

/* Number of slots in napi_alloc_cache.skb_cache[]. */
#define NAPI_SKB_CACHE_SIZE        64
/* Number of objects requested per kmem_cache_alloc_bulk() refill. */
#define NAPI_SKB_CACHE_BULK        16
/* Half of the cache size.
 * NOTE(review): not referenced in the visible part of the file; presumably a
 * flush threshold used further down - confirm.
 */
#define NAPI_SKB_CACHE_HALF        (NAPI_SKB_CACHE_SIZE / 2)

/* Per-CPU allocation state used by the NAPI helpers in this file. */
struct napi_alloc_cache {
        /* Taken with local_lock_nested_bh() around accesses to this struct. */
        local_lock_t bh_lock;
        /* Page fragment cache served by __napi_alloc_frag_align(). */
        struct page_frag_cache page;
        /* Number of valid entries currently held in skb_cache[]. */
        unsigned int skb_count;
        /* Cached sk_buff heads; entries are taken from the top end. */
        void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

/* Per-CPU fragment cache used by __netdev_alloc_frag_align() when running in
 * hardirq context or with interrupts disabled.
 */
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Allocate a fragment from this CPU's NAPI page fragment cache.
 *
 * @fragsz is rounded with SKB_DATA_ALIGN() before the request is made and
 * @align_mask is handed to __page_frag_alloc_align() unchanged.  The
 * allocation uses GFP_ATOMIC | __GFP_NOWARN and runs under the per-CPU
 * bh_lock.  Returns whatever __page_frag_alloc_align() returns.
 */
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        struct napi_alloc_cache *cache = this_cpu_ptr(&napi_alloc_cache);
        unsigned int aligned_sz = SKB_DATA_ALIGN(fragsz);
        void *frag;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);
        frag = __page_frag_alloc_align(&cache->page, aligned_sz,
                                       GFP_ATOMIC | __GFP_NOWARN, align_mask);
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        return frag;
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

/* Allocate a fragment, choosing the per-CPU cache by execution context.
 *
 * Outside hardirq context and with interrupts enabled, bottom halves are
 * disabled around a call to __napi_alloc_frag_align().  Otherwise the
 * fragment comes from the separate netdev_alloc_cache, with @fragsz rounded
 * by SKB_DATA_ALIGN() and GFP_ATOMIC | __GFP_NOWARN as allocation flags.
 */
void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        struct page_frag_cache *cache;
        void *frag;

        if (!in_hardirq() && !irqs_disabled()) {
                local_bh_disable();
                frag = __napi_alloc_frag_align(fragsz, align_mask);
                local_bh_enable();
                return frag;
        }

        cache = this_cpu_ptr(&netdev_alloc_cache);
        fragsz = SKB_DATA_ALIGN(fragsz);

        return __page_frag_alloc_align(cache, fragsz,
                                       GFP_ATOMIC | __GFP_NOWARN, align_mask);
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

/* Take one sk_buff head from this CPU's NAPI cache.
 *
 * An empty cache is first refilled with up to NAPI_SKB_CACHE_BULK objects
 * from net_hotdata.skbuff_cache, using GFP_ATOMIC | __GFP_NOWARN.  Returns
 * NULL when the cache is empty and the refill yields nothing.  The returned
 * head is KASAN-unpoisoned after the lock is released; it is not zeroed here.
 */
static struct sk_buff *napi_skb_cache_get(void)
{
        struct napi_alloc_cache *cache = this_cpu_ptr(&napi_alloc_cache);
        struct sk_buff *head = NULL;
        unsigned int avail;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);

        if (unlikely(!cache->skb_count))
                cache->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
                                                         GFP_ATOMIC | __GFP_NOWARN,
                                                         NAPI_SKB_CACHE_BULK,
                                                         cache->skb_cache);

        /* Remember whether anything was available before popping the top. */
        avail = cache->skb_count;
        if (likely(avail))
                head = cache->skb_cache[--cache->skb_count];

        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        if (unlikely(!avail))
                return NULL;

        kasan_mempool_unpoison_object(head, kmem_cache_size(net_hotdata.skbuff_cache));

        return head;
}

/**
 * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
 * @skbs: pointer to an at least @n-sized array to fill with skb pointers
 * @n: number of entries to provide
 *
 * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
 * the pointers into the provided array @skbs. If there are fewer entries
 * available, tries to replenish the cache and bulk-allocates the diff from
 * the MM layer if needed.
 * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
 * ready for {,__}build_skb_around() and don't have any data buffers attached.
 * Must be called *only* from the BH context.
 *
 * Return: number of successfully allocated skbs (@n if no actual allocation
 *           needed or kmem_cache_alloc_bulk() didn't fail).
 */
u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
        /* total keeps the count to return; n is reduced below to the number
         * of heads that are taken from the cache itself.
         */
        u32 bulk, total = n;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);

        /* Fast path: the cache alone can satisfy the request. */
        if (nc->skb_count >= n)
                goto get;

        /* Not enough cached skbs. Try refilling the cache first, by at most
         * NAPI_SKB_CACHE_BULK objects and never past the end of skb_cache[].
         */
        bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
        nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
                                               GFP_ATOMIC | __GFP_NOWARN, bulk,
                                               &nc->skb_cache[nc->skb_count]);
        if (likely(nc->skb_count >= n))
                goto get;

        /* Still not enough. Bulk-allocate the missing part directly, zeroed.
         * These objects land in @skbs after the first nc->skb_count slots,
         * which the loop below fills from the cache. Subtracting the number
         * obtained leaves n as the amount still owed by the cache.
         */
        n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
                                   GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
                                   n - nc->skb_count, &skbs[nc->skb_count]);
        if (likely(nc->skb_count >= n))
                goto get;

        /* kmem_cache didn't allocate the number we need, limit the output:
         * drop the shortfall from the returned count and drain the cache.
         */
        total -= n - nc->skb_count;
        n = nc->skb_count;

get:
        /* Hand out the topmost n cached heads, unpoisoning and zeroing each
         * one up to (but not including) the tail member.
         */
        for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
                u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache);

                skbs[i] = nc->skb_cache[base + i];

                kasan_mempool_unpoison_object(skbs[i], cache_size);
                memset(skbs[i], 0, offsetof(struct sk_buff, tail));
        }

        nc->skb_count -= n;
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        return total;
}
EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);

/* Attach the buffer @data of @size bytes to @skb and initialize the fields
 * that a cleared sk_buff still needs, including the skb_shared_info located
 * via skb_shinfo(). The caller must have memset-cleared @skb beforehand.
 */
static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
                                         unsigned int size)
{
        struct skb_shared_info *shinfo;

        /* Exclude the aligned skb_shared_info area from the usable size. */
        size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        /* Assumes caller memset cleared SKB */
        skb->truesize = SKB_TRUESIZE(size);
        refcount_set(&skb->users, 1);
        /* head and data both start at the beginning of the buffer. */
        skb->head = data;
        skb->data = data;
        skb_reset_tail_pointer(skb);
        skb_set_end_offset(skb, size);
        /* All-ones header offsets; presumably the "not set" marker - confirm. */
        skb->mac_header = (typeof(skb->mac_header))~0U;
        skb->transport_header = (typeof(skb->transport_header))~0U;
        skb->alloc_cpu = raw_smp_processor_id();
        /* make sure we initialize shinfo sequentially */
        shinfo = skb_shinfo(skb);
        /* Zero only the members that precede dataref, then set dataref to 1. */
        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
        atomic_set(&shinfo->dataref, 1);

        skb_set_kcov_handle(skb, kcov_common_handle());
}

/* Determine the real allocation size of the slab buffer @data.
 *
 * Stores ksize(@data) in *@size and returns the pointer handed back by
 * krealloc(), which the caller must use in place of @data. A warning is
 * raised once if that pointer differs from @data.
 */
static inline void *__slab_build_skb(void *data, unsigned int *size)
{
        void *resized;

        /* Must find the allocation size (and grow it to match). */
        *size = ksize(data);
        /* krealloc() will immediately return "data" when
         * "ksize(data)" is requested: it is the existing upper
         * bounds. As a result, GFP_ATOMIC will be ignored. Note
         * that this "new" pointer needs to be passed back to the
         * caller for use so the __alloc_size hinting will be
         * tracked correctly.
         */
        resized = krealloc(data, *size, GFP_ATOMIC);
        WARN_ON_ONCE(resized != data);
        return resized;
}

/* Variant of build_skb() for buffers that come from the slab allocator.
 *
 * A fresh sk_buff head is allocated with GFP_ATOMIC | __GFP_NOWARN, cleared
 * up to its tail member and wrapped around @data, whose size is looked up by
 * __slab_build_skb().  Returns NULL if the head cannot be allocated.
 *
 * Use sparingly: slab buffers cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
        struct sk_buff *skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
                                               GFP_ATOMIC | __GFP_NOWARN);
        unsigned int size;
        void *head;

        if (unlikely(!skb))
                return NULL;

        memset(skb, 0, offsetof(struct sk_buff, tail));

        /* Continue with the pointer returned by the helper, not @data. */
        head = __slab_build_skb(data, &size);
        __finalize_skb_around(skb, head, size);

        return skb;
}
EXPORT_SYMBOL(slab_build_skb);

/* Wrap the already memset-cleared @skb around @data (@frag_size bytes). */
static void __build_skb_around(struct sk_buff *skb, void *data,
                               unsigned int frag_size)
{
        unsigned int size = frag_size;
        bool deprecated;

        /* Passing frag_size == 0 is deprecated: callers with a slab buffer
         * are expected to use slab_build_skb(). Warn once, then recover the
         * size from the slab object.
         */
        deprecated = WARN_ONCE(size == 0, "Use slab_build_skb() instead");
        if (deprecated)
                data = __slab_build_skb(data, &size);

        __finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a fresh &sk_buff head and attach it to @data. The caller supplies
 * the memory that holds both the packet head and the skb_shared_info.
 * @data must come from the page allocator or vmalloc(). (Passing a
 * @frag_size of 0 to signal a kmalloc() buffer is deprecated; use
 * slab_build_skb() for that.)
 *
 * Returns the new skb, or %NULL on failure, in which case @data is not freed.
 *
 * Notes :
 *  Before IO, the driver allocates only the data buffer the NIC writes the
 *  incoming frame into. It should leave room at the head (NET_SKB_PAD) and
 *  MUST leave room at the tail (SKB_DATA_ALIGN(skb_shared_info)).
 *  After IO, the driver calls build_skb() to allocate the sk_buff and
 *  populate it before handing the packet to the stack.
 *  RX rings therefore only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
                                               GFP_ATOMIC | __GFP_NOWARN);

        if (unlikely(!skb))
                goto out;

        /* Only the part of the head in front of 'tail' is cleared here */
        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, frag_size);
out:
        return skb;
}

/* build_skb() wraps __build_skb() and, for a non-zero @frag_size,
 * additionally sets skb->head_frag and propagates pfmemalloc from the
 * head page of @data.
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb;

        skb = __build_skb(data, frag_size);
        if (!likely(skb && frag_size))
                return skb;

        skb->head_frag = 1;
        skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
        return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Returns @skb, or %NULL when @skb itself is %NULL.
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size)
{
        if (unlikely(!skb))
                return NULL;

        __build_skb_around(skb, data, frag_size);
        if (!frag_size)
                return skb;

        /* Non-zero frag_size: mark the head as a page fragment */
        skb->head_frag = 1;
        skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
        return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Same job as __build_skb(), except that the skbuff_head is obtained
 * through napi_skb_cache_get() rather than allocated in place.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = napi_skb_cache_get();

        if (unlikely(!skb))
                goto out;

        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, frag_size);
out:
        return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * __napi_build_skb() plus the skb->head_frag and skb->pfmemalloc handling
 * needed when the data is a page or page fragment (non-zero @frag_size).
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb;

        skb = __napi_build_skb(data, frag_size);
        if (!likely(skb) || !frag_size)
                return skb;

        skb->head_frag = 1;
        skb_propagate_pfmemalloc(virt_to_head_page(data), skb);

        return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve wraps the skb head allocation and reports back whether
 * the emergency pfmemalloc reserves had to be used. If they were, and the
 * socket later turns out to be SOCK_MEMALLOC, the PFMEMALLOC reserves may
 * be used. Otherwise the packet data may be discarded until enough memory
 * is free.
 *
 * @size is updated to the size actually reserved for the object.
 * @pfmemalloc may be NULL when the caller does not care.
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
                             bool *pfmemalloc)
{
        /* First attempt never dips into the reserves and never warns */
        const gfp_t first_try = flags | __GFP_NOMEMALLOC | __GFP_NOWARN;
        size_t obj_size = SKB_HEAD_ALIGN(*size);
        bool from_reserves = false;
        void *obj;

        if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
            !(flags & KMALLOC_NOT_NORMAL_BITS)) {
                /* Small heads are served by their own cache */
                *size = SKB_SMALL_HEAD_CACHE_SIZE;
                obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
                                            first_try, node);
                if (!obj && gfp_pfmemalloc_allowed(flags)) {
                        /* Try again but now we are using pfmemalloc reserves */
                        from_reserves = true;
                        obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
                                                    flags, node);
                }
        } else {
                obj_size = kmalloc_size_roundup(obj_size);
                /* The cast might drop high-order bits of obj_size. That is
                 * harmless because kmalloc(obj_size >= 2^32) fails anyway.
                 */
                *size = (unsigned int)obj_size;

                /* Regular allocation first; if that fails and we are not
                 * entitled to the reserves, give up.
                 */
                obj = kmalloc_node_track_caller(obj_size, first_try, node);
                if (!obj && gfp_pfmemalloc_allowed(flags)) {
                        /* Try again but now we are using pfmemalloc reserves */
                        from_reserves = true;
                        obj = kmalloc_node_track_caller(obj_size, flags, node);
                }
        }

        if (pfmemalloc)
                *pfmemalloc = from_reserves;

        return obj;
}

/*         Allocate a new skbuff. We do this ourselves so we can fill in a few
 *        'private' fields and also do memory statistics to find all the
 *        [BEEP] leaks.
 *
 */

/**
 *        __alloc_skb        -        allocate a network buffer
 *        @size: size to allocate
 *        @gfp_mask: allocation mask
 *        @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *                instead of head cache and allocate a cloned (child) skb.
 *                If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *                allocations in case the data is required for writeback.
 *                If SKB_ALLOC_NAPI is set without SKB_ALLOC_FCLONE and
 *                @node is NUMA_NO_NODE or the local memory node, the head
 *                is obtained through napi_skb_cache_get().
 *        @node: numa node to allocate memory on
 *
 *        Allocate a new &sk_buff. The returned buffer has no headroom and a
 *        tail room of at least size bytes. The object has a reference count
 *        of one. The return is the buffer. On a failure the return is %NULL.
 *
 *        Buffers may only be allocated from interrupts using a @gfp_mask of
 *        %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                            int flags, int node)
{
        struct kmem_cache *cache;
        struct sk_buff *skb;
        bool pfmemalloc;
        u8 *data;

        /* The head comes from the fclone cache only for SKB_ALLOC_FCLONE */
        cache = (flags & SKB_ALLOC_FCLONE)
                ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;

        if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
                gfp_mask |= __GFP_MEMALLOC;

        /* Get the HEAD */
        if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
            likely(node == NUMA_NO_NODE || node == numa_mem_id()))
                skb = napi_skb_cache_get();
        else
                /* GFP_DMA is masked off for the head; the data allocation
                 * below still receives the unmodified gfp_mask.
                 */
                skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
        if (unlikely(!skb))
                return NULL;
        prefetchw(skb);

        /* We do our best to align skb_shared_info on a separate cache
         * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
         * aligned memory blocks, unless SLUB/SLAB debug is enabled.
         * Both skb->head and skb_shared_info are cache line aligned.
         */
        data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
        if (unlikely(!data))
                goto nodata;
        /* kmalloc_size_roundup() might give us more room than requested.
         * Put skb_shared_info exactly at the end of allocated zone,
         * to allow max possible filling before reallocation.
         */
        prefetchw(data + SKB_WITH_OVERHEAD(size));

        /*
         * Only clear those fields we need to clear, not those that we will
         * actually initialise below. Hence, don't put any more fields after
         * the tail pointer in struct sk_buff!
         */
        memset(skb, 0, offsetof(struct sk_buff, tail));
        /* size was updated by kmalloc_reserve() to the reserved size */
        __build_skb_around(skb, data, size);
        skb->pfmemalloc = pfmemalloc;

        if (flags & SKB_ALLOC_FCLONE) {
                struct sk_buff_fclones *fclones;

                /* skb is the skb1 member of its enclosing sk_buff_fclones */
                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                skb->fclone = SKB_FCLONE_ORIG;
                refcount_set(&fclones->fclone_ref, 1);
        }

        return skb;

nodata:
        /* No data buffer could be obtained: return the head to @cache */
        kmem_cache_free(cache, skb);
        return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

/**
 *        __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @len: length to allocate
 *        @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *        Allocate a new &sk_buff with a usage count of one. NET_SKB_PAD
 *        bytes of headroom are added on top of @len and reserved, so
 *        callers should size @len for what they need without accounting
 *        for that built in space, which is used for optimisations.
 *
 *        %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
                                   gfp_t gfp_mask)
{
        struct page_frag_cache *frag_cache;
        bool from_reserves;
        struct sk_buff *skb;
        void *buf;

        len += NET_SKB_PAD;

        /* Requests that are too small or too big for a page fragment, and
         * masks carrying __GFP_DIRECT_RECLAIM or GFP_DMA bits, get their
         * skb->head from kmalloc() via __alloc_skb().
         */
        if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (!skb)
                        return NULL;
                goto reserve;
        }

        len = SKB_HEAD_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        if (in_hardirq() || irqs_disabled()) {
                /* Hard-irq / irqs-off context uses the netdev fragment cache */
                frag_cache = this_cpu_ptr(&netdev_alloc_cache);
                buf = page_frag_alloc(frag_cache, len, gfp_mask);
                from_reserves = page_frag_cache_is_pfmemalloc(frag_cache);
        } else {
                /* Otherwise use the NAPI page cache with BHs disabled and
                 * its lock held.
                 */
                local_bh_disable();
                local_lock_nested_bh(&napi_alloc_cache.bh_lock);

                frag_cache = this_cpu_ptr(&napi_alloc_cache.page);
                buf = page_frag_alloc(frag_cache, len, gfp_mask);
                from_reserves = page_frag_cache_is_pfmemalloc(frag_cache);

                local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
                local_bh_enable();
        }

        if (unlikely(!buf))
                return NULL;

        skb = __build_skb(buf, len);
        if (unlikely(!skb)) {
                /* No head available: give the fragment back */
                skb_free_frag(buf);
                return NULL;
        }

        if (from_reserves)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

reserve:
        skb_reserve(skb, NET_SKB_PAD);
        skb->dev = dev;
        return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 *        napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *        @napi: napi instance this buffer was allocated for
 *        @len: length to allocate
 *
 *        Allocate a new sk_buff for use in NAPI receive. The head is taken,
 *        when possible, from a special reserved region used only for NAPI
 *        Rx allocation, which saves several CPU cycles because IRQs do not
 *        have to be disabled and re-enabled.
 *
 *        %NULL is returned if there is no free memory.
 */
struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
        gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
        struct napi_alloc_cache *cache;
        bool from_reserves;
        struct sk_buff *skb;
        void *buf;

        DEBUG_NET_WARN_ON_ONCE(!in_softirq());
        len += NET_SKB_PAD + NET_IP_ALIGN;

        /* Requests that are too small or too big for a page fragment get
         * their skb->head from kmalloc() via __alloc_skb().
         */
        if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
                                  NUMA_NO_NODE);
                if (!skb)
                        return NULL;
                goto reserve;
        }

        len = SKB_HEAD_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        /* Carve the data out of the per-CPU NAPI page cache under its lock */
        local_lock_nested_bh(&napi_alloc_cache.bh_lock);
        cache = this_cpu_ptr(&napi_alloc_cache);

        buf = page_frag_alloc(&cache->page, len, gfp_mask);
        from_reserves = page_frag_cache_is_pfmemalloc(&cache->page);
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        if (unlikely(!buf))
                return NULL;

        skb = __napi_build_skb(buf, len);
        if (unlikely(!skb)) {
                /* No head available: give the fragment back */
                skb_free_frag(buf);
                return NULL;
        }

        if (from_reserves)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

reserve:
        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
        skb->dev = napi->dev;
        return skb;
}
EXPORT_SYMBOL(napi_alloc_skb);

/* Install @netmem as fragment @i of @skb (@size bytes at offset @off) and
 * account for it: len and data_len grow by @size, truesize by @truesize.
 */
void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
                            int off, int size, unsigned int truesize)
{
        DEBUG_NET_WARN_ON_ONCE(size > truesize);

        skb_fill_netmem_desc(skb, i, netmem, off, size);

        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

/* Grow the existing fragment @i of @skb by @size bytes and account for it:
 * len and data_len grow by @size, truesize by @truesize.
 */
void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize)
{
        DEBUG_NET_WARN_ON_ONCE(size > truesize);

        skb_frag_size_add(&skb_shinfo(skb)->frags[i], size);

        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

/* Free the skb list that *@listp points to and leave *@listp NULL. */
static void skb_drop_list(struct sk_buff **listp)
{
        struct sk_buff *chain = *listp;

        kfree_skb_list(chain);
        *listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
        skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
        struct sk_buff *list;

        skb_walk_frags(skb, list)
                skb_get(list);
}

/* Replace *@pskb with a copy whose head and fragments are all allocated
 * from @pool, leaving @headroom bytes in front of the data.
 *
 * On success 0 is returned, the original skb has been consumed and *@pskb
 * points to the copy. On failure *@pskb is left untouched and a negative
 * errno is returned:
 *   -EOPNOTSUPP  the skb has a frag list, or CONFIG_PAGE_POOL is disabled
 *   -ENOMEM      the skb is too long for one head plus MAX_SKB_FRAGS pages,
 *                or an allocation failed
 *   other        whatever skb_copy_bits() reported
 */
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
                    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
        u32 size, truesize, len, max_head_size, off;
        struct sk_buff *skb = *pskb, *nskb;
        int err, i, head_off;
        void *data;

        /* XDP does not support fraglist so we need to linearize
         * the skb.
         */
        if (skb_has_frag_list(skb))
                return -EOPNOTSUPP;

        /* Largest linear part that fits a page next to headroom + shinfo */
        max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
        if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
                return -ENOMEM;

        size = min_t(u32, skb->len, max_head_size);
        truesize = SKB_HEAD_ALIGN(size) + headroom;
        data = page_pool_dev_alloc_va(pool, &truesize);
        if (!data)
                return -ENOMEM;

        nskb = napi_build_skb(data, truesize);
        if (!nskb) {
                /* No skb owns the buffer yet, hand it back to the pool */
                page_pool_free_va(pool, data, true);
                return -ENOMEM;
        }

        skb_reserve(nskb, headroom);
        skb_copy_header(nskb, skb);
        skb_mark_for_recycle(nskb);

        /* Fill the new linear area with the first @size bytes */
        err = skb_copy_bits(skb, 0, nskb->data, size);
        if (err) {
                consume_skb(nskb);
                return err;
        }
        skb_put(nskb, size);

        /* Shift the header offsets by the difference in headroom */
        head_off = skb_headroom(nskb) - skb_headroom(skb);
        skb_headers_offset_update(nskb, head_off);

        /* Copy whatever is left into page pool fragments, at most
         * PAGE_SIZE bytes per fragment.
         */
        off = size;
        len = skb->len - off;
        for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
                struct page *page;
                u32 page_off;

                size = min_t(u32, len, PAGE_SIZE);
                truesize = size;

                page = page_pool_dev_alloc(pool, &page_off, &truesize);
                if (!page) {
                        consume_skb(nskb);
                        return -ENOMEM;
                }

                /* The page is attached before the copy, so the error path
                 * below only has to drop nskb.
                 */
                skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
                err = skb_copy_bits(skb, off, page_address(page) + page_off,
                                    size);
                if (err) {
                        consume_skb(nskb);
                        return err;
                }

                len -= size;
                off += size;
        }

        /* Everything is copied: release the original, publish the copy */
        consume_skb(skb);
        *pskb = nskb;

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

/* Run skb_pp_cow_data() with XDP_PACKET_HEADROOM of headroom on behalf of
 * @prog. Returns -EINVAL when prog->aux->xdp_has_frags is not set, otherwise
 * the result of skb_pp_cow_data().
 */
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
                         const struct bpf_prog *prog)
{
        int ret = -EINVAL;

        if (prog->aux->xdp_has_frags)
                ret = skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);

        return ret;
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
/* Return the compound head of @netmem to its page pool.
 *
 * Returns false, without touching anything, when that head is not a page
 * pool netmem; true once it has been passed to page_pool_put_full_netmem().
 */
bool napi_pp_put_page(netmem_ref netmem)
{
        netmem_ref head = netmem_compound_head(netmem);

        if (unlikely(!netmem_is_pp(head)))
                return false;

        page_pool_put_full_netmem(netmem_get_pp(head), head, false);
        return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

/* Try to recycle the head buffer @data of @skb through the page pool.
 * Returns false when CONFIG_PAGE_POOL is off or skb->pp_recycle is clear,
 * otherwise the result of napi_pp_put_page() on the page backing @data.
 */
static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
        if (IS_ENABLED(CONFIG_PAGE_POOL) && skb->pp_recycle)
                return napi_pp_put_page(page_to_netmem(virt_to_page(data)));

        return false;
}

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb:        page pool aware skb
 *
 * Take one extra reference on every fragment of @skb. This is meant for
 * page pool aware skbs only, i.e. skb->pp_recycle must be set; for any
 * other skb -EINVAL is returned and nothing is touched. Page pool
 * fragments get their pp_ref_count bumped. Because a page pool aware skb
 * may also carry normal page fragments, those fall back to a plain page
 * reference.
 *
 * Return: 0 on success, -EINVAL if @skb is not page pool aware.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
        struct skb_shared_info *shinfo;
        netmem_ref head_netmem;
        int i;

        if (!skb->pp_recycle)
                return -EINVAL;

        shinfo = skb_shinfo(skb);
        for (i = 0; i < shinfo->nr_frags; i++) {
                head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
                if (!likely(netmem_is_pp(head_netmem))) {
                        /* Ordinary page mixed into a pp aware skb */
                        page_ref_inc(netmem_to_page(head_netmem));
                        continue;
                }
                page_pool_ref_netmem(head_netmem);
        }

        return 0;
}

/* Free an skb head that was not a page fragment. An @end_offset equal to
 * SKB_SMALL_HEAD_HEADROOM selects the small head cache; anything else is
 * released with kfree().
 */
static void skb_kfree_head(void *head, unsigned int end_offset)
{
        if (end_offset != SKB_SMALL_HEAD_HEADROOM) {
                kfree(head);
                return;
        }

        kmem_cache_free(net_hotdata.skb_small_head_cache, head);
}

/* Release skb->head. A page fragment head is first offered to the page
 * pool and freed as a fragment if that is declined; any other head goes
 * through skb_kfree_head().
 */
static void skb_free_head(struct sk_buff *skb)
{
        unsigned char *head = skb->head;

        if (!skb->head_frag) {
                skb_kfree_head(head, skb_end_offset(skb));
                return;
        }

        if (!skb_pp_recycle(skb, head))
                skb_free_frag(head);
}

/* Drop this skb's hold on its data area. When skb_data_unref() reports
 * that the data can go, the page fragments are unreferenced (unless they
 * are managed by the zerocopy owner), the frag_list is freed with @reason
 * and the head buffer is released. In every case skb->pp_recycle is cleared
 * on the way out.
 */
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int i;

        /* Nothing to free if skb_data_unref() says so (presumably other
         * references to the data remain - see its definition)
         */
        if (!skb_data_unref(skb, shinfo))
                goto exit;

        if (skb_zcopy(skb)) {
                /* Sample the flag before skb_zcopy_clear() runs */
                bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

                skb_zcopy_clear(skb, true);
                if (skip_unref)
                        goto free_head;
        }

        for (i = 0; i < shinfo->nr_frags; i++)
                __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
        if (shinfo->frag_list)
                kfree_skb_list_reason(shinfo->frag_list, reason);

        skb_free_head(skb);
exit:
        /* When we clone an SKB we copy the recycling bit. The pp_recycle
         * bit is only set on the head though, so in order to avoid races
         * while trying to recycle fragments on __skb_frag_unref() we need
         * to make one SKB responsible for triggering the recycle path.
         * So disable the recycling bit if an SKB is cloned and we have
         * additional references to the fragmented part of the SKB.
         * Eventually the last SKB will have the recycling bit set and its
         * dataref set to 0, which will trigger the recycling
         */
        skb->pp_recycle = 0;
}

/*
 *        Give the memory of an sk_buff head back to its cache without
 *        cleaning any state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
        const unsigned int kind = skb->fclone;
        struct sk_buff_fclones *fclones;

        if (kind == SKB_FCLONE_UNAVAILABLE) {
                kmem_cache_free(net_hotdata.skbuff_cache, skb);
                return;
        }

        if (kind == SKB_FCLONE_ORIG) {
                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                /* We usually free the clone (TX completion) before original skb
                 * This test would have no chance to be true for the clone,
                 * while here, branch prediction will be good.
                 */
                if (refcount_read(&fclones->fclone_ref) == 1)
                        goto release;
        } else {
                /* SKB_FCLONE_CLONE */
                fclones = container_of(skb, struct sk_buff_fclones, skb2);
        }

        /* The pair is freed only once its last reference is dropped */
        if (!refcount_dec_and_test(&fclones->fclone_ref))
                return;
release:
        kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}

/* Release the state attached to the sk_buff head: the dst, the destructor
 * callback (if any), the conntrack reference when CONFIG_NF_CONNTRACK is
 * enabled, and the skb extensions.
 */
void skb_release_head_state(struct sk_buff *skb)
{
        void (*destructor)(struct sk_buff *skb);

        skb_dst_drop(skb);

        destructor = skb->destructor;
        if (destructor) {
                DEBUG_NET_WARN_ON_ONCE(in_hardirq());
                destructor(skb);
        }
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        nf_conntrack_put(skb_nfct(skb));
#endif
        skb_ext_put(skb);
}

/* Release everything except the sk_buff shell itself: first the head
 * state, then, if a head buffer is attached, the data.
 */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
        skb_release_head_state(skb);

        if (!likely(skb->head))
                return;

        skb_release_data(skb, reason);
}

/**
 *        __kfree_skb - private function
 *        @skb: buffer
 *
 *        Free an sk_buff. Release anything attached to the buffer, using
 *        SKB_DROP_REASON_NOT_SPECIFIED as the drop reason, then free the
 *        sk_buff memory itself. The reference count is not consulted here.
 *        This is an internal helper function. Users should always call
 *        kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
        skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
        kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/* Drop one reference on @skb and fire the matching tracepoint.
 *
 * Returns false when skb_unref() says the skb must not be freed yet. Returns
 * true when the caller has to free it; in that case trace_consume_skb() is
 * emitted for SKB_CONSUMED and trace_kfree_skb() for any other @reason.
 */
static __always_inline
bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
                          enum skb_drop_reason reason)
{
        if (unlikely(!skb_unref(skb)))
                return false;

        DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
                               u32_get_bits(reason,
                                            SKB_DROP_REASON_SUBSYS_MASK) >=
                                SKB_DROP_REASON_SUBSYS_NUM);

        if (reason != SKB_CONSUMED)
                trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
        else
                trace_consume_skb(skb, __builtin_return_address(0));

        return true;
}

/**
 *        sk_skb_reason_drop - free an sk_buff with special reason
 *        @sk: the socket to receive @skb, or NULL if not applicable
 *        @skb: buffer to free
 *        @reason: reason why this skb is dropped
 *
 *        Release one reference to the buffer; once the usage count reaches
 *        zero the buffer is freed. The receiving socket and the drop reason
 *        are handed to the 'kfree_skb' tracepoint along the way.
 */
void __fix_address
sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
        if (!__sk_skb_reason_drop(sk, skb, reason))
                return;

        __kfree_skb(skb);
}
EXPORT_SYMBOL(sk_skb_reason_drop);

/* Number of sk_buff heads collected before they are freed in one bulk call */
#define KFREE_SKB_BULK_SIZE        16

/* Batch of sk_buff heads waiting to be passed to kmem_cache_free_bulk() */
struct skb_free_array {
        /* Number of valid entries at the front of skb_array[] */
        unsigned int skb_count;
        /* The collected heads, in the form kmem_cache_free_bulk() takes */
        void *skb_array[KFREE_SKB_BULK_SIZE];
};

/* Release everything attached to @skb and queue its head in @sa for a
 * bulk free. A full batch (KFREE_SKB_BULK_SIZE entries) is flushed
 * straight away. skbs that belong to an fclone pair are not batched and
 * are freed directly with __kfree_skb().
 */
static void kfree_skb_add_bulk(struct sk_buff *skb,
                               struct skb_free_array *sa,
                               enum skb_drop_reason reason)
{
        /* if SKB is a clone, don't handle this case */
        if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
                __kfree_skb(skb);
                return;
        }

        skb_release_all(skb, reason);

        sa->skb_array[sa->skb_count] = skb;
        sa->skb_count++;

        if (!unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE))
                return;

        kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
                             sa->skb_array);
        sa->skb_count = 0;
}

/* Drop one reference on every skb of the ->next linked list starting at
 * @segs, reporting @reason. Heads whose last reference went away are
 * gathered and freed in bulk.
 */
void __fix_address
kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
{
        struct skb_free_array sa;
        struct sk_buff *next;

        sa.skb_count = 0;

        for (; segs; segs = next) {
                /* Fetch the link first: segs may be gone after this round */
                next = segs->next;

                if (!__sk_skb_reason_drop(NULL, segs, reason))
                        continue;

                skb_poison_list(segs);
                kfree_skb_add_bulk(segs, &sa, reason);
        }

        /* Flush whatever did not fill a complete batch */
        if (sa.skb_count)
                kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
}
EXPORT_SYMBOL(kfree_skb_list_reason);

/* Dump skb information and contents.
 *
 * Must only be called from net_ratelimit()-ed paths.
 *
 * @level:    printk level prefix put in front of every line that is emitted
 * @skb:      buffer to describe
 * @full_pkt: dump the whole packet (including headroom, tailroom and the
 *            frag list) if true, only up to MAX_HEADER + 128 bytes otherwise
 *
 * Fragments that are net_iov backed cannot be read and are only reported;
 * they still count against the number of bytes left to dump.
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
        struct skb_shared_info *sh = skb_shinfo(skb);
        struct net_device *dev = skb->dev;
        struct sock *sk = skb->sk;
        struct sk_buff *list_skb;
        bool has_mac, has_trans;
        int headroom, tailroom;
        int i, len, seg_len;

        /* len is the remaining byte budget for the hex dumps below */
        if (full_pkt)
                len = skb->len;
        else
                len = min_t(int, skb->len, MAX_HEADER + 128);

        headroom = skb_headroom(skb);
        tailroom = skb_tailroom(skb);

        has_mac = skb_mac_header_was_set(skb);
        has_trans = skb_transport_header_was_set(skb);

        printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
               "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
               "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
               "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
               "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
               "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
               "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
               level, skb->len, headroom, skb_headlen(skb), tailroom,
               has_mac ? skb->mac_header : -1,
               has_mac ? skb_mac_header_len(skb) : -1,
               skb->mac_len,
               skb->network_header,
               has_trans ? skb_network_header_len(skb) : -1,
               has_trans ? skb->transport_header : -1,
               sh->tx_flags, sh->nr_frags,
               sh->gso_size, sh->gso_type, sh->gso_segs,
               skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
               skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
               skb->hash, skb->sw_hash, skb->l4_hash,
               ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
               skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
               skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
               skb->inner_network_header, skb->inner_transport_header);

        if (dev)
                printk("%sdev name=%s feat=%pNF\n",
                       level, dev->name, &dev->features);
        if (sk)
                printk("%ssk family=%hu type=%u proto=%u\n",
                       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

        if (full_pkt && headroom)
                print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
                               16, 1, skb->head, headroom, false);

        seg_len = min_t(int, skb_headlen(skb), len);
        if (seg_len)
                print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
                               16, 1, skb->data, seg_len, false);
        len -= seg_len;

        if (full_pkt && tailroom)
                print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
                               16, 1, skb_tail_pointer(skb), tailroom, false);

        for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                u32 p_off, p_len, copied;
                struct page *p;
                u8 *vaddr;

                if (skb_frag_is_net_iov(frag)) {
                        printk("%sskb frag %d: not readable\n", level, i);
                        /* Charge at most what is left of the budget. When
                         * only the headers are dumped the fragment can be
                         * larger than len, and a negative len would be
                         * handed to print_hex_dump() as a huge size for a
                         * later readable fragment.
                         */
                        len -= min_t(int, skb_frag_size(frag), len);
                        if (!len)
                                break;
                        continue;
                }

                skb_frag_foreach_page(frag, skb_frag_off(frag),
                                      skb_frag_size(frag), p, p_off, p_len,
                                      copied) {
                        seg_len = min_t(int, p_len, len);
                        vaddr = kmap_atomic(p);
                        print_hex_dump(level, "skb frag:     ",
                                       DUMP_PREFIX_OFFSET,
                                       16, 1, vaddr + p_off, seg_len, false);
                        kunmap_atomic(vaddr);
                        len -= seg_len;
                        if (!len)
                                break;
                }
        }

        if (full_pkt && skb_has_frag_list(skb)) {
                /* Same level prefix as every other line of the dump */
                printk("%sskb fraglist:\n", level);
                skb_walk_frags(skb, list_skb)
                        skb_dump(level, list_skb, true);
        }
}
EXPORT_SYMBOL(skb_dump);

/**
 *        skb_tx_error - report an sk_buff xmit error
 *        @skb: buffer that triggered an error, may be %NULL
 *
 *        Report xmit error if a device callback is tracking this skb.
 *        skb must be freed afterwards. A %NULL @skb is silently ignored.
 */
void skb_tx_error(struct sk_buff *skb)
{
        if (!skb)
                return;

        skb_zcopy_downgrade_managed(skb);
        skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 *        consume_skb - free an skbuff
 *        @skb: buffer to free
 *
 *        Drop a reference to the buffer. Once skb_unref() reports that the
 *        buffer may be released, emit the consume_skb tracepoint and free it.
 *        Functions identically to kfree_skb, except that kfree_skb assumes
 *        the frame is being dropped after a failure and records that fact.
 */
void consume_skb(struct sk_buff *skb)
{
        if (skb_unref(skb)) {
                trace_consume_skb(skb, __builtin_return_address(0));
                __kfree_skb(skb);
        }
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 *        __consume_stateless_skb - free an skbuff, assuming it is stateless
 *        @skb: buffer to free
 *
 *        Like consume_skb(), but this variant assumes that this is the last
 *        skb reference and that all the head states have already been dropped.
 *        No reference count is checked here: the consume_skb tracepoint is
 *        emitted, then the data and the skb memory itself are released.
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
        trace_consume_skb(skb, __builtin_return_address(0));
        skb_release_data(skb, SKB_CONSUMED);
        kfree_skbmem(skb);
}

/*
 * Store @skb in this CPU's napi_alloc_cache. Once the cache holds
 * NAPI_SKB_CACHE_SIZE entries, the upper half is given back to the
 * skbuff slab cache with a single bulk free.
 */
static void napi_skb_cache_put(struct sk_buff *skb)
{
        struct napi_alloc_cache *cache = this_cpu_ptr(&napi_alloc_cache);
        u32 slot;

        /* Do not cache the skb when kasan_mempool_poison_object() refuses it */
        if (!kasan_mempool_poison_object(skb))
                return;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);

        cache->skb_cache[cache->skb_count] = skb;
        cache->skb_count++;

        if (likely(cache->skb_count != NAPI_SKB_CACHE_SIZE))
                goto unlock;

        /* Unpoison the entries that are about to go back to the slab */
        for (slot = NAPI_SKB_CACHE_HALF; slot < NAPI_SKB_CACHE_SIZE; slot++)
                kasan_mempool_unpoison_object(cache->skb_cache[slot],
                                        kmem_cache_size(net_hotdata.skbuff_cache));

        kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF,
                             cache->skb_cache + NAPI_SKB_CACHE_HALF);
        cache->skb_count = NAPI_SKB_CACHE_HALF;

unlock:
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}

/* Release everything attached to @skb, passing @reason along to
 * skb_release_all(), then hand the skb to napi_skb_cache_put().
 */
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
        skb_release_all(skb, reason);
        napi_skb_cache_put(skb);
}

/*
 * Hand an skb to the NAPI skb cache. If the skb is flagged slow_gro, its
 * conntrack, dst, extension and socket state is dropped first and the
 * flag is cleared.
 */
void napi_skb_free_stolen_head(struct sk_buff *skb)
{
        if (likely(!skb->slow_gro))
                goto recycle;

        nf_reset_ct(skb);
        skb_dst_drop(skb);
        skb_ext_put(skb);
        skb_orphan(skb);
        skb->slow_gro = 0;

recycle:
        napi_skb_cache_put(skb);
}

/*
 * napi_consume_skb - consume an skb, caching it for reuse when possible
 * @skb: buffer to consume
 * @budget: a value of zero means the caller is not running in NAPI context
 *
 * With a non-zero budget one reference is dropped; when skb_unref() allows
 * the free, skbs that are not fclone-allocated are released and stored in
 * the per-CPU NAPI cache, all others go through __kfree_skb().
 */
void napi_consume_skb(struct sk_buff *skb, int budget)
{
        /* Zero budget indicate non-NAPI context called us, like netpoll */
        if (unlikely(!budget)) {
                dev_consume_skb_any(skb);
                return;
        }

        DEBUG_NET_WARN_ON_ONCE(!in_softirq());

        if (!skb_unref(skb))
                return;

        /* From here on the skb is going to be freed */
        trace_consume_skb(skb, __builtin_return_address(0));

        if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
                skb_release_all(skb, SKB_CONSUMED);
                napi_skb_cache_put(skb);
        } else {
                /* fclone skbs are not handled by the cache */
                __kfree_skb(skb);
        }
}
EXPORT_SYMBOL(napi_consume_skb);

/* Make sure a field is contained by headers group: build-time check that
 * @field has the same offset in struct sk_buff whether it is named
 * directly or through the headers member.
 */
#define CHECK_SKB_FIELD(field) \
        BUILD_BUG_ON(offsetof(struct sk_buff, field) !=                \
                     offsetof(struct sk_buff, headers.field));        \

/*
 * Copy the packet metadata of @old into @new: timestamp, device, control
 * block, dst, extensions, netfilter state, queue mapping and the whole
 * headers group (with a single memcpy). The socket pointer is not copied.
 *
 * The CHECK_SKB_FIELD() lines generate no runtime code; they are build-time
 * checks that each listed field is covered by the headers memcpy.
 */
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
        new->tstamp                = old->tstamp;
        /* We do not copy old->sk */
        new->dev                = old->dev;
        memcpy(new->cb, old->cb, sizeof(old->cb));
        skb_dst_copy(new, old);
        __skb_ext_copy(new, old);
        __nf_copy(new, old, false);

        /* Note : this field could be in the headers group.
         * It is not yet because we do not want to have a 16 bit hole
         */
        new->queue_mapping = old->queue_mapping;

        /* Bulk copy of every field that lives in the headers group */
        memcpy(&new->headers, &old->headers, sizeof(new->headers));
        CHECK_SKB_FIELD(protocol);
        CHECK_SKB_FIELD(csum);
        CHECK_SKB_FIELD(hash);
        CHECK_SKB_FIELD(priority);
        CHECK_SKB_FIELD(skb_iif);
        CHECK_SKB_FIELD(vlan_proto);
        CHECK_SKB_FIELD(vlan_tci);
        CHECK_SKB_FIELD(transport_header);
        CHECK_SKB_FIELD(network_header);
        CHECK_SKB_FIELD(mac_header);
        CHECK_SKB_FIELD(inner_protocol);
        CHECK_SKB_FIELD(inner_transport_header);
        CHECK_SKB_FIELD(inner_network_header);
        CHECK_SKB_FIELD(inner_mac_header);
        CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
        CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
        CHECK_SKB_FIELD(napi_id);
#endif
        CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
        CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
        CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * Turn @n into a clone of @skb: @n gets the header metadata of @skb and
 * points at the same head/data buffer, the shared-info data reference
 * count is bumped, and both skbs are marked as cloned. Returns @n.
 *
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
/* C(x): copy member x from the original skb into the clone */
#define C(x) n->x = skb->x

        /* The clone starts unlinked and is not owned by any socket */
        n->next = n->prev = NULL;
        n->sk = NULL;
        __copy_skb_header(n, skb);

        C(len);
        C(data_len);
        C(mac_len);
        n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
        n->cloned = 1;
        n->nohdr = 0;
        n->peeked = 0;
        C(pfmemalloc);
        C(pp_recycle);
        n->destructor = NULL;
        /* Share the data buffer of the original */
        C(tail);
        C(end);
        C(head);
        C(head_frag);
        C(data);
        C(truesize);
        refcount_set(&n->users, 1);

        /* One more user of the shared data area */
        atomic_inc(&(skb_shinfo(skb)->dataref));
        skb->cloned = 1;

        return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 *
 * Return: a new skb, allocated with GFP_ATOMIC and no linear data area
 * requested, whose frag_list points at @first and whose length, truesize
 * and header fields are taken from @first; %NULL if the allocation fails.
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
        struct sk_buff *wrapper = alloc_skb(0, GFP_ATOMIC);

        if (!wrapper)
                return NULL;

        /* All payload hangs off the frag list, so len equals data_len */
        wrapper->len = wrapper->data_len = first->len;
        wrapper->truesize = first->truesize;

        skb_shinfo(wrapper)->frag_list = first;

        __copy_skb_header(wrapper, first);
        wrapper->destructor = NULL;

        return wrapper;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 *        skb_morph        -        morph one skb into another
 *        @dst: the skb to receive the contents
 *        @src: the skb to supply the contents
 *
 *        This is identical to skb_clone except that the target skb is
 *        supplied by the user. Whatever @dst currently holds is released
 *        before it is turned into a clone of @src.
 *
 *        The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
        skb_release_all(dst, SKB_CONSUMED);
        return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

/*
 * Charge the pages needed for @size bytes to the locked_vm counter of the
 * user recorded in @mmp, or of the current user when @mmp has none yet.
 *
 * Nothing is charged when the caller has CAP_IPC_LOCK, when @size is zero
 * or when RLIMIT_MEMLOCK is unlimited.
 *
 * Returns 0 on success, -ENOBUFS if the charge would exceed RLIMIT_MEMLOCK.
 */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
        unsigned long limit, limit_pages, want, seen, updated;
        struct user_struct *owner;

        if (capable(CAP_IPC_LOCK) || !size)
                return 0;

        limit = rlimit(RLIMIT_MEMLOCK);
        if (limit == RLIM_INFINITY)
                return 0;

        want = (size >> PAGE_SHIFT) + 2;        /* worst case */
        limit_pages = limit >> PAGE_SHIFT;

        owner = mmp->user;
        if (!owner)
                owner = current_user();

        seen = atomic_long_read(&owner->locked_vm);
        for (;;) {
                updated = seen + want;
                if (updated > limit_pages)
                        return -ENOBUFS;
                /* Retry with the refreshed counter value on failure */
                if (atomic_long_try_cmpxchg(&owner->locked_vm, &seen, updated))
                        break;
        }

        if (mmp->user) {
                mmp->num_pg += want;
                return 0;
        }

        /* First charge through this mmpin: remember who was charged */
        mmp->user = get_uid(owner);
        mmp->num_pg = want;
        return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
        if (mmp->user) {
                atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
                free_uid(mmp->user);
        }
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

/*
 * Allocate a fresh MSG_ZEROCOPY uarg for @sk covering @size bytes.
 *
 * The uarg lives in the control block of an skb obtained from
 * sock_omalloc(). Unless @devmem is set, the pages are charged through
 * mm_account_pinned_pages(). On success the socket's zerocopy key is
 * advanced and a socket reference is taken.
 *
 * Returns the embedded ubuf_info, or NULL on allocation or accounting
 * failure.
 */
static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
                                            bool devmem)
{
        struct ubuf_info_msgzc *zc;
        struct sk_buff *carrier;

        WARN_ON_ONCE(!in_task());

        carrier = sock_omalloc(sk, 0, GFP_KERNEL);
        if (!carrier)
                return NULL;

        /* The zerocopy state must fit in the skb control block */
        BUILD_BUG_ON(sizeof(*zc) > sizeof(carrier->cb));
        zc = (void *)carrier->cb;
        zc->mmp.user = NULL;

        if (likely(!devmem)) {
                if (mm_account_pinned_pages(&zc->mmp, size)) {
                        kfree_skb(carrier);
                        return NULL;
                }
        }

        zc->ubuf.ops = &msg_zerocopy_ubuf_ops;
        /* The id is the value of sk_zckey before the increment */
        zc->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
        zc->len = 1;
        zc->bytelen = size;
        zc->zerocopy = 1;
        zc->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
        refcount_set(&zc->ubuf.refcnt, 1);
        sock_hold(sk);

        return &zc->ubuf;
}

/* Map a msgzc uarg back to the skb whose control block (cb) contains it */
static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
        return container_of((void *)uarg, struct sk_buff, cb);
}

/*
 * Get a uarg for another zerocopy send of @size bytes on @sk.
 *
 * When @uarg is a MSG_ZEROCOPY uarg whose id range ends exactly at the
 * socket's current zerocopy key, it is extended by one notification and
 * returned (stream sockets also get an extra reference). Otherwise a new
 * uarg is allocated, except in the cases below that return NULL:
 * a foreign uarg, an unlocked socket, a failed page accounting, or a full
 * uarg on a non-stream socket.
 */
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
                                       struct ubuf_info *uarg, bool devmem)
{
        const u32 max_bytes = 1 << 19;                /* limit to a few TSO */
        struct ubuf_info_msgzc *zc;
        u32 total, expect;

        if (!uarg)
                goto new_alloc;

        /* there might be non MSG_ZEROCOPY users */
        if (uarg->ops != &msg_zerocopy_ubuf_ops)
                return NULL;

        /* realloc only when socket is locked (TCP, UDP cork),
         * so uarg->len and sk_zckey access is serialized
         */
        if (!sock_owned_by_user(sk)) {
                WARN_ON_ONCE(1);
                return NULL;
        }

        zc = uarg_to_msgzc(uarg);
        total = zc->bytelen + size;
        if (zc->len == USHRT_MAX - 1 || total > max_bytes) {
                if (sk->sk_type != SOCK_STREAM)
                        return NULL;
                /* TCP can create new skb to attach new uarg */
                goto new_alloc;
        }

        expect = (u32)atomic_read(&sk->sk_zckey);
        if ((u32)(zc->id + zc->len) != expect)
                goto new_alloc;

        if (likely(!devmem) && mm_account_pinned_pages(&zc->mmp, size))
                return NULL;

        zc->len++;
        zc->bytelen = total;
        atomic_set(&sk->sk_zckey, expect + 1);

        /* no extra ref when appending to datagram (MSG_MORE) */
        if (sk->sk_type == SOCK_STREAM)
                net_zcopy_get(uarg);

        return uarg;

new_alloc:
        return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);

/*
 * Try to merge a notification of @len ids starting at @lo into the range
 * already stored in the error-queue skb @skb (ee_info..ee_data).
 *
 * The merge only happens when @lo directly follows the stored upper bound
 * and the combined length stays below 2^32. Returns true if the stored
 * range was extended, false if nothing was changed.
 */
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        const u32 first = serr->ee.ee_info;
        const u32 last = serr->ee.ee_data;
        const u64 merged = last - first + 1ULL + len;

        if (merged >= (1ULL << 32) || lo != last + 1)
                return false;

        serr->ee.ee_data += len;
        return true;
}

/*
 * Final completion of a MSG_ZEROCOPY uarg.
 *
 * Undoes the pinned-page accounting and, unless the uarg is empty or the
 * socket is flagged SOCK_DEAD, reports the completed id range on the
 * socket error queue. The skb carrying the uarg is either queued as the
 * notification or consumed, and the socket reference is dropped.
 */
static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
        struct sk_buff *tail, *skb = skb_from_uarg(uarg);
        struct sock_exterr_skb *serr;
        struct sock *sk = skb->sk;
        struct sk_buff_head *q;
        unsigned long flags;
        bool is_zerocopy;
        u32 lo, hi;
        u16 len;

        mm_unaccount_pinned_pages(&uarg->mmp);

        /* if !len, there was only 1 call, and it was aborted
         * so do not queue a completion notification
         */
        if (!uarg->len || sock_flag(sk, SOCK_DEAD))
                goto release;

        /* Read everything needed from the uarg before serr is zeroed below */
        len = uarg->len;
        lo = uarg->id;
        hi = uarg->id + len - 1;
        is_zerocopy = uarg->zerocopy;

        /* Build the notification: ee_info/ee_data hold the range [lo, hi] */
        serr = SKB_EXT_ERR(skb);
        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = 0;
        serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
        serr->ee.ee_data = hi;
        serr->ee.ee_info = lo;
        if (!is_zerocopy)
                serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

        /* Extend the zerocopy notification at the queue tail if possible,
         * otherwise queue this skb; once queued it must not be consumed
         * below, hence skb is set to NULL.
         */
        q = &sk->sk_error_queue;
        spin_lock_irqsave(&q->lock, flags);
        tail = skb_peek_tail(q);
        if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
            !skb_zerocopy_notify_extend(tail, lo, len)) {
                __skb_queue_tail(q, skb);
                skb = NULL;
        }
        spin_unlock_irqrestore(&q->lock, flags);

        sk_error_report(sk);

release:
        consume_skb(skb);
        sock_put(sk);
}

/*
 * ubuf_info completion hook for MSG_ZEROCOPY.
 *
 * Folds @success into the uarg's zerocopy flag, so a single unsuccessful
 * completion clears it, then drops one reference. The final reference
 * triggers __msg_zerocopy_callback(). @skb is not used.
 */
static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
                                  bool success)
{
        struct ubuf_info_msgzc *zc = uarg_to_msgzc(uarg);

        zc->zerocopy &= success;

        if (!refcount_dec_and_test(&uarg->refcnt))
                return;

        __msg_zerocopy_callback(zc);
}

/*
 * Roll back one zerocopy notification on @uarg: the socket's zerocopy key
 * and the uarg's notification count are both decremented. When @have_uref
 * is set, the caller's reference is released as well through
 * msg_zerocopy_complete().
 */
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
        struct ubuf_info_msgzc *zc = uarg_to_msgzc(uarg);
        struct sock *sk = skb_from_uarg(zc)->sk;

        atomic_dec(&sk->sk_zckey);
        zc->len--;

        if (!have_uref)
                return;

        msg_zerocopy_complete(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);

/* ubuf_info operations for MSG_ZEROCOPY uargs. Only .complete is provided;
 * .link_skb is left unset.
 */
const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
        .complete = msg_zerocopy_complete,
};
EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);

/*
 * Append up to @len bytes from @msg to @skb as zerocopy frags and attach
 * @uarg to the skb.
 *
 * Returns the number of bytes added to @skb, or a negative error. On
 * -EFAULT, or on -EMSGSIZE with nothing appended, the iterator and the
 * skb are restored to their state on entry.
 */
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg,
                             struct net_devmem_dmabuf_binding *binding)
{
        const int len_before = skb->len;
        int ret;

        if (!uarg->ops->link_skb) {
                struct ubuf_info *attached = skb_zcopy(skb);

                /* An skb can only point to one uarg. This edge case happens
                 * when TCP appends to an skb, but zerocopy_realloc triggered
                 * a new alloc.
                 */
                if (attached && attached != uarg)
                        return -EEXIST;
        } else {
                ret = uarg->ops->link_skb(skb, uarg);
                if (ret)
                        return ret;
        }

        ret = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
                                      binding);
        if (ret == -EFAULT ||
            (ret == -EMSGSIZE && skb->len == len_before)) {
                struct sock *owner = skb->sk;

                /* Streams do not free skb on error. Reset to prev state. */
                iov_iter_revert(&msg->msg_iter, skb->len - len_before);
                /* Trim with skb->sk temporarily pointing at @sk */
                skb->sk = sk;
                ___pskb_trim(skb, len_before);
                skb->sk = owner;
                return ret;
        }

        skb_zcopy_set(skb, uarg, NULL);
        return skb->len - len_before;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

/*
 * Clear SKBFL_MANAGED_FRAG_REFS on @skb and take a reference on each of
 * its frags.
 */
void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
        int idx = 0;

        skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;

        while (idx < skb_shinfo(skb)->nr_frags) {
                skb_frag_ref(skb, idx);
                idx++;
        }
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);

/*
 * Make @nskb share the zerocopy uarg of @orig, if @orig has one.
 *
 * If @nskb already carries a different uarg, its user frags are first
 * copied with skb_copy_ubufs(). Returns 0 on success, -ENOMEM when called
 * with a zero @gfp_mask on an @nskb that already has a uarg, and -EIO when
 * that copy fails.
 */
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
                              gfp_t gfp_mask)
{
        if (!skb_zcopy(orig))
                return 0;

        if (skb_zcopy(nskb)) {
                /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
                if (!gfp_mask) {
                        WARN_ON_ONCE(1);
                        return -ENOMEM;
                }

                /* Already sharing the same uarg: nothing to do */
                if (skb_uarg(nskb) == skb_uarg(orig))
                        return 0;

                if (skb_copy_ubufs(nskb, GFP_ATOMIC))
                        return -EIO;
        }

        skb_zcopy_set(nskb, skb_uarg(orig), NULL);
        return 0;
}

/**
 *        skb_copy_ubufs        -        copy userspace skb frags buffers to kernel
 *        @skb: the skb to modify
 *        @gfp_mask: allocation priority
 *
 *        This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
 *        It will copy all frags into kernel and drop the reference
 *        to userspace pages.
 *
 *        If this function is called from an interrupt gfp_mask() must be
 *        %GFP_ATOMIC.
 *
 *        Returns 0 on success or a negative error code on failure:
 *        -EINVAL if the skb is shared or cannot be uncloned, -EFAULT if
 *        its frags are not readable, -ENOMEM if kernel memory to copy to
 *        cannot be allocated.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
        int num_frags = skb_shinfo(skb)->nr_frags;
        struct page *page, *head = NULL;
        int i, order, psize, new_frags;
        u32 d_off;

        if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
                return -EINVAL;

        if (!skb_frags_readable(skb))
                return -EFAULT;

        /* No frags to copy: only the zerocopy state has to be cleared */
        if (!num_frags)
                goto release;

        /* We might have to allocate high order pages, so compute what minimum
         * page order is needed.
         */
        order = 0;
        while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
                order++;
        psize = (PAGE_SIZE << order);

        /* Allocate the destination pages and chain them through
         * page_private(); on failure the whole chain is released.
         */
        new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order);
        for (i = 0; i < new_frags; i++) {
                page = alloc_pages(gfp_mask | __GFP_COMP, order);
                if (!page) {
                        while (head) {
                                struct page *next = (struct page *)page_private(head);
                                put_page(head);
                                head = next;
                        }
                        return -ENOMEM;
                }
                set_page_private(page, (unsigned long)head);
                head = page;
        }

        /* Copy the frag contents into the chain; d_off is the write offset
         * within the current destination page.
         */
        page = head;
        d_off = 0;
        for (i = 0; i < num_frags; i++) {
                skb_frag_t *f = &skb_shinfo(skb)->frags[i];
                u32 p_off, p_len, copied;
                struct page *p;
                u8 *vaddr;

                skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
                                      p, p_off, p_len, copied) {
                        u32 copy, done = 0;
                        vaddr = kmap_atomic(p);

                        while (done < p_len) {
                                /* Current destination page is full: move on */
                                if (d_off == psize) {
                                        d_off = 0;
                                        page = (struct page *)page_private(page);
                                }
                                copy = min_t(u32, psize - d_off, p_len - done);
                                memcpy(page_address(page) + d_off,
                                       vaddr + p_off + done, copy);
                                done += copy;
                                d_off += copy;
                        }
                        kunmap_atomic(vaddr);
                }
        }

        /* skb frags release userspace buffers */
        for (i = 0; i < num_frags; i++)
                skb_frag_unref(skb, i);

        /* skb frags point to kernel buffers */
        for (i = 0; i < new_frags - 1; i++) {
                __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
                head = (struct page *)page_private(head);
        }
        /* The last page is only filled up to d_off */
        __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
                               d_off);
        skb_shinfo(skb)->nr_frags = new_frags;

release:
        skb_zcopy_clear(skb, false);
        return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 *        skb_clone        -        duplicate an sk_buff
 *        @skb: buffer to clone
 *        @gfp_mask: allocation priority
 *
 *        Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *        copies share the same packet data but not structure. The new
 *        buffer has a reference count of 1. Returns the new buffer, or
 *        %NULL if orphaning the frags or allocating the clone fails.
 *
 *        If this function is called from an interrupt gfp_mask() must be
 *        %GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
        struct sk_buff_fclones *pair = container_of(skb,
                                                    struct sk_buff_fclones,
                                                    skb1);
        struct sk_buff *clone;

        if (skb_orphan_frags(skb, gfp_mask))
                return NULL;

        if (skb->fclone != SKB_FCLONE_ORIG ||
            refcount_read(&pair->fclone_ref) != 1) {
                /* No companion clone available: allocate from the slab */
                if (skb_pfmemalloc(skb))
                        gfp_mask |= __GFP_MEMALLOC;

                clone = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
                if (!clone)
                        return NULL;

                clone->fclone = SKB_FCLONE_UNAVAILABLE;
        } else {
                /* Use the companion skb of the fclone pair */
                clone = &pair->skb2;
                refcount_set(&pair->fclone_ref, 2);
                clone->fclone = SKB_FCLONE_CLONE;
        }

        return __skb_clone(clone, skb);
}
EXPORT_SYMBOL(skb_clone);

/*
 * Shift the header offsets stored in @skb by @off.
 *
 * csum_start is only adjusted for CHECKSUM_PARTIAL, and mac_header only
 * if it has been set; the other header offsets are adjusted
 * unconditionally.
 */
void skb_headers_offset_update(struct sk_buff *skb, int off)
{
        /* Only adjust this if it actually is csum_start rather than csum */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                skb->csum_start += off;
        /* {transport,network,mac}_header and tail are relative to skb->head */
        skb->transport_header += off;
        skb->network_header   += off;
        if (skb_mac_header_was_set(skb))
                skb->mac_header += off;
        skb->inner_transport_header += off;
        skb->inner_network_header += off;
        skb->inner_mac_header += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);

void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
        __copy_skb_header(new, old);

        skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
        skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
        skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
EXPORT_SYMBOL(skb_copy_header);

/* Allocation flags for a copy of @skb: SKB_ALLOC_RX if it is pfmemalloc, else 0 */
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
        return skb_pfmemalloc(skb) ? SKB_ALLOC_RX : 0;
}

/**
 *        skb_copy        -        create private copy of an sk_buff
 *        @skb: buffer to copy
 *        @gfp_mask: allocation priority
 *
 *        Make a copy of both an &sk_buff and its data. This is used when the
 *        caller wishes to modify the data and needs a private copy of the
 *        data to alter. Returns %NULL on failure or the pointer to the buffer
 *        on success. The returned buffer has a reference count of 1.
 *
 *        %NULL is also returned when the frags of @skb are not readable or
 *        when @skb has the SKB_GSO_FRAGLIST gso type (with a one-time
 *        warning).
 *
 *        As by-product this function converts non-linear &sk_buff to linear
 *        one, so that &sk_buff becomes completely private and caller is allowed
 *        to modify all the data of returned buffer. This means that this
 *        function is not recommended for use in circumstances when only
 *        header is going to be modified. Use pskb_copy() instead.
 */

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
        struct sk_buff *copy;
        unsigned int alloc_len;
        int headroom;

        if (!skb_frags_readable(skb))
                return NULL;

        if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
                return NULL;

        headroom = skb_headroom(skb);
        alloc_len = skb_end_offset(skb) + skb->data_len;

        copy = __alloc_skb(alloc_len, gfp_mask, skb_alloc_rx_flag(skb),
                           NUMA_NO_NODE);
        if (!copy)
                return NULL;

        /* Same headroom as the original, then room for the whole packet */
        skb_reserve(copy, headroom);
        skb_put(copy, skb->len);

        /* Copy headroom and packet data in one go, starting at the head */
        BUG_ON(skb_copy_bits(skb, -headroom, copy->head, headroom + skb->len));

        skb_copy_header(copy, skb);
        return copy;
}
EXPORT_SYMBOL(skb_copy);

/**
 *        __pskb_copy_fclone        -  create copy of an sk_buff with private head.
 *        @skb: buffer to copy
 *        @headroom: headroom of new skb
 *        @gfp_mask: allocation priority
 *        @fclone: if true allocate the copy of the skb from the fclone
 *        cache instead of the head cache; it is recommended to set this
 *        to true for the cases where the copy will likely be cloned
 *
 *        Make a copy of both an &sk_buff and part of its data, located
 *        in header. Fragmented data remain shared. This is used when
 *        the caller wishes to modify only header of &sk_buff and needs
 *        private copy of the header to alter. Returns %NULL on failure
 *        or the pointer to the buffer on success.
 *        The returned buffer has a reference count of 1.
 */

struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone)
{
        unsigned int size = skb_headlen(skb) + headroom;
        int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
        struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);

        if (!n)
                goto out;

        /* Set the data pointer */
        skb_reserve(n, headroom);
        /* Set the tail pointer and length */
        skb_put(n, skb_headlen(skb));
        /* Copy the bytes */
        skb_copy_from_linear_data(skb, n->data, n->len);

        /* Account for the paged data that will be shared with the original */
        n->truesize += skb->data_len;
        n->data_len  = skb->data_len;
        n->len             = skb->len;

        if (skb_shinfo(skb)->nr_frags) {
                int i;

                if (skb_orphan_frags(skb, gfp_mask) ||
                    skb_zerocopy_clone(n, skb, gfp_mask)) {
                        kfree_skb(n);
                        n = NULL;
                        goto out;
                }
                /* Share each frag with the copy, taking a reference per frag */
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                        skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
                        skb_frag_ref(skb, i);
                }
                skb_shinfo(n)->nr_frags = i;
        }

        /* The frag list is shared as well */
        if (skb_has_frag_list(skb)) {
                skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
                skb_clone_fraglist(n);
        }

        skb_copy_header(n, skb);
out:
        return n;
}
EXPORT_SYMBOL(__pskb_copy_fclone);

/**
 *        pskb_expand_head - reallocate header of &sk_buff
 *        @skb: buffer to reallocate
 *        @nhead: room to add at head
 *        @ntail: room to add at tail
 *        @gfp_mask: allocation priority
 *
 *        Expands (or creates identical copy, if @nhead and @ntail are zero)
 *        header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
 *        reference count of 1. Returns zero in the case of success or error,
 *        if expansion failed. In the last case, &sk_buff is not changed.
 *        The only error code returned here is -ENOMEM.
 *
 *        All the pointers pointing into skb header may change and must be
 *        reloaded after call to this function.
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
                     gfp_t gfp_mask)
{
        unsigned int osize = skb_end_offset(skb);
        unsigned int size = osize + nhead + ntail;
        long off;
        u8 *data;
        int i;

        BUG_ON(nhead < 0);

        BUG_ON(skb_shared(skb));

        skb_zcopy_downgrade_managed(skb);

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        /* kmalloc_reserve() takes size by pointer and may update it */
        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                goto nodata;
        size = SKB_WITH_OVERHEAD(size);

        /* Copy only real data... and, alas, header. This should be
         * optimized for the cases when header is void.
         */
        memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

        /* Copy the shared info, up to and including the frags in use, to
         * the end of the new buffer.
         */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb),
               offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

        /*
         * if shinfo is shared we must drop the old head gracefully, but if it
         * is not we can just drop the old head and let the existing refcount
         * be since all we did is relocate the values
         */
        if (skb_cloned(skb)) {
                if (skb_orphan_frags(skb, gfp_mask))
                        goto nofrags;
                /* The new shinfo copy holds its own uarg, frag and frag-list
                 * references, taken before the old data is released.
                 */
                if (skb_zcopy(skb))
                        refcount_inc(&skb_uarg(skb)->refcnt);
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);

                if (skb_has_frag_list(skb))
                        skb_clone_fraglist(skb);

                skb_release_data(skb, SKB_CONSUMED);
        } else {
                skb_free_head(skb);
        }
        /* Distance between the old and the new location of the data */
        off = (data + nhead) - skb->head;

        skb->head     = data;
        skb->head_frag = 0;
        skb->data    += off;

        skb_set_end_offset(skb, size);
        /* When tail is stored as an offset it only shifts by nhead */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        off           = nhead;
#endif
        skb->tail              += off;
        skb_headers_offset_update(skb, nhead);
        skb->cloned   = 0;
        skb->hdr_len  = 0;
        skb->nohdr    = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);

        skb_metadata_clear(skb);

        /* It is not generally safe to change skb->truesize.
         * For the moment, we really care of rx path, or
         * when skb is orphaned (not attached to a socket).
         */
        if (!skb->sk || skb->destructor == sock_edemux)
                skb->truesize += size - osize;

        return 0;

nofrags:
        /* Free the newly allocated buffer */
        skb_kfree_head(data, size);
nodata:
        return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make private copy of skb with writable head and some headroom.
 *
 * If @skb already has at least @headroom bytes of headroom, a pskb_copy()
 * is returned. Otherwise @skb is cloned and the head of the clone is
 * expanded by the aligned shortfall. All allocations use GFP_ATOMIC.
 * Returns the new skb, or NULL on failure.
 */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
        int delta = headroom - skb_headroom(skb);
        struct sk_buff *copy;

        if (delta <= 0)
                return pskb_copy(skb, GFP_ATOMIC);

        copy = skb_clone(skb, GFP_ATOMIC);
        if (!copy)
                return NULL;

        if (pskb_expand_head(copy, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
                kfree_skb(copy);
                return NULL;
        }

        return copy;
}
EXPORT_SYMBOL(skb_realloc_headroom);

/* Note: We plan to rework this in linux-6.4 */
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
        unsigned int old_end = skb_end_offset(skb);
        unsigned int old_truesize = skb->truesize;
        struct skb_shared_info *shinfo;
        unsigned int new_end;
        int err;

        err = pskb_expand_head(skb, 0, 0, pri);
        if (err)
                return err;

        /* Put back the truesize recorded before the head was reallocated. */
        skb->truesize = old_truesize;

        new_end = skb_end_offset(skb);
        if (likely(new_end == old_end))
                return 0;

        /* skb->end must not be rewritten when either the old or the new
         * value is SKB_SMALL_HEAD_HEADROOM, as that might break
         * skb_kfree_head(). This path is not expected to be taken, so
         * leave a one-time trace and keep the new end offset.
         */
        if (old_end == SKB_SMALL_HEAD_HEADROOM ||
            new_end == SKB_SMALL_HEAD_HEADROOM) {
                pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
                            old_end, new_end);
                WARN_ON_ONCE(1);
                return 0;
        }

        /* Restoring the old skb->end moves where skb_shinfo() lives, so
         * relocate the used part of the shared info (header plus the
         * populated frags) before switching the end offset back.
         */
        shinfo = skb_shinfo(skb);
        memmove(skb->head + old_end, shinfo,
                offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));

        skb_set_end_offset(skb, old_end);

        return 0;
}

/**
 *        skb_expand_head - reallocate header of &sk_buff
 *        @skb: buffer to reallocate
 *        @headroom: needed headroom
 *
 *        Unlike skb_realloc_headroom, this one does not allocate a new skb
 *        if possible; copies skb->sk to new skb as needed
 *        and frees original skb in case of failures.
 *
 *        It expects increased headroom and generates a warning otherwise.
 */

struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
        struct sock *sk = skb->sk;
        int osize = skb_end_offset(skb);
        int delta = headroom - skb_headroom(skb);

        /* Callers are expected to ask for more headroom than is present. */
        if (WARN_ONCE(delta <= 0,
                      "%s is expecting an increase in the headroom", __func__))
                return skb;

        delta = SKB_DATA_ALIGN(delta);

        /* pskb_expand_head() might crash if the skb is shared, so work on
         * a clone in that case (and also when the skb is not wmem-owned).
         */
        if (skb_shared(skb) || !is_skb_wmem(skb)) {
                struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

                if (unlikely(!clone)) {
                        kfree_skb(skb);
                        return NULL;
                }

                /* Hand socket ownership to the clone before the original
                 * reference is dropped.
                 */
                if (sk)
                        skb_set_owner_w(clone, sk);
                consume_skb(skb);
                skb = clone;
        }

        if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) {
                kfree_skb(skb);
                return NULL;
        }

        /* Charge the growth of the head to the owning socket. */
        if (sk && is_skb_wmem(skb)) {
                delta = skb_end_offset(skb) - osize;
                refcount_add(delta, &sk->sk_wmem_alloc);
                skb->truesize += delta;
        }

        return skb;
}
EXPORT_SYMBOL(skb_expand_head);

/**
 *        skb_copy_expand        -        copy and expand sk_buff
 *        @skb: buffer to copy
 *        @newheadroom: new free bytes at head
 *        @newtailroom: new free bytes at tail
 *        @gfp_mask: allocation priority
 *
 *        Make a copy of both an &sk_buff and its data and while doing so
 *        allocate additional space.
 *
 *        This is used when the caller wishes to modify the data and needs a
 *        private copy of the data to alter as well as more space for new fields.
 *        Returns %NULL on failure or the pointer to the buffer
 *        on success. The returned buffer has a reference count of 1.
 *
 *        You must pass %GFP_ATOMIC as the allocation priority if this function
 *        is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
                                int newheadroom, int newtailroom,
                                gfp_t gfp_mask)
{
        int oldheadroom, head_copy_len, head_copy_off;
        struct sk_buff *copy;

        if (!skb_frags_readable(skb))
                return NULL;

        if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
                return NULL;

        oldheadroom = skb_headroom(skb);

        /* Allocate a buffer for the requested headroom, all of the
         * source data and the requested tailroom.
         */
        copy = __alloc_skb(newheadroom + skb->len + newtailroom,
                           gfp_mask, skb_alloc_rx_flag(skb),
                           NUMA_NO_NODE);
        if (!copy)
                return NULL;

        skb_reserve(copy, newheadroom);

        /* Set the tail pointer and length. */
        skb_put(copy, skb->len);

        /* Copy as much of the old headroom as fits in the new one; when
         * the new headroom is larger, the old headroom contents are placed
         * right in front of the data.
         */
        if (newheadroom <= oldheadroom) {
                head_copy_len = newheadroom;
                head_copy_off = 0;
        } else {
                head_copy_len = oldheadroom;
                head_copy_off = newheadroom - oldheadroom;
        }

        /* Copy the retained headroom and the data in one go. */
        BUG_ON(skb_copy_bits(skb, -head_copy_len, copy->head + head_copy_off,
                             skb->len + head_copy_len));

        skb_copy_header(copy, skb);

        skb_headers_offset_update(copy, newheadroom - oldheadroom);

        return copy;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 *        __skb_pad                -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *        @free_on_error: free buffer on error
 *
 *        Ensure that a buffer is followed by a padding area that is zero
 *        filled. Used by network drivers which may DMA or transfer data
 *        beyond the buffer end onto the wire.
 *
 *        May return error in out of memory cases. The skb is freed on error
 *        if @free_on_error is true.
 */

int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
        int extra_tail;
        int rc = 0;

        /* Fast path: an uncloned skb with enough tailroom is padded in
         * place. A non-linear skb always reports zero tailroom.
         */
        if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
                memset(skb->data + skb->len, 0, pad);
                return 0;
        }

        /* Reallocate the head when it is cloned or when the tail is too
         * short to hold the paged data plus the padding.
         */
        extra_tail = skb->data_len + pad - (skb->end - skb->tail);
        if (likely(skb_cloned(skb) || extra_tail > 0))
                rc = pskb_expand_head(skb, 0, extra_tail, GFP_ATOMIC);

        /* FIXME: The use of this function with non-linear skb's really
         * needs to be audited.
         */
        if (likely(!rc))
                rc = skb_linearize(skb);

        if (unlikely(rc)) {
                if (free_on_error)
                        kfree_skb(skb);
                return rc;
        }

        memset(skb->data + skb->len, 0, pad);
        return 0;
}
EXPORT_SYMBOL(__skb_pad);

/**
 *        pskb_put - add data to the tail of a potentially fragmented buffer
 *        @skb: start of the buffer to use
 *        @tail: tail fragment of the buffer to use
 *        @len: amount of data to add
 *
 *        This function extends the used data area of the potentially
 *        fragmented buffer. @tail must be the last fragment of @skb -- or
 *        @skb itself. If this would exceed the total buffer size the kernel
 *        will panic. A pointer to the first byte of the extra data is
 *        returned.
 */

void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
        /* When the bytes land in a separate tail buffer, the parent's
         * total and non-linear lengths must account for them as well.
         */
        if (skb != tail) {
                skb->len += len;
                skb->data_len += len;
        }

        return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);

/**
 *        skb_put - add data to a buffer
 *        @skb: buffer to use
 *        @len: amount of data to add
 *
 *        This function extends the used data area of the buffer. If this would
 *        exceed the total buffer size the kernel will panic. A pointer to the
 *        first byte of the extra data is returned.
 */
void *skb_put(struct sk_buff *skb, unsigned int len)
{
        /* The caller gets the position of the tail before it is advanced. */
        void *old_tail = skb_tail_pointer(skb);

        SKB_LINEAR_ASSERT(skb);

        skb->len  += len;
        skb->tail += len;

        /* Running past the end of the buffer is handed to skb_over_panic(). */
        if (unlikely(skb->tail > skb->end))
                skb_over_panic(skb, len, __builtin_return_address(0));

        return old_tail;
}
EXPORT_SYMBOL(skb_put);

/**
 *        skb_push - add data to the start of a buffer
 *        @skb: buffer to use
 *        @len: amount of data to add
 *
 *        This function extends the used data area of the buffer at the buffer
 *        start. If this would exceed the total buffer headroom the kernel will
 *        panic. A pointer to the first byte of the extra data is returned.
 */
void *skb_push(struct sk_buff *skb, unsigned int len)
{
        /* Grow the data area downwards into the headroom. */
        skb->len  += len;
        skb->data -= len;

        /* Moving below the start of the buffer is handed to skb_under_panic(). */
        if (unlikely(skb->head > skb->data))
                skb_under_panic(skb, len, __builtin_return_address(0));

        return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 *        skb_pull - remove data from the start of a buffer
 *        @skb: buffer to use
 *        @len: amount of data to remove
 *
 *        This function removes data from the start of a buffer, returning
 *        the memory to the headroom. A pointer to the next data in the buffer
 *        is returned. Once the data has been pulled future pushes will overwrite
 *        the old data.
 */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
        /* Exported wrapper: all the work is done by skb_pull_inline(). */
        return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);

/**
 *        skb_pull_data - remove data from the start of a buffer returning its
 *        original position.
 *        @skb: buffer to use
 *        @len: amount of data to remove
 *
 *        This function removes data from the start of a buffer, returning
 *        the memory to the headroom. A pointer to the original data in the buffer
 *        is returned after checking if there is enough data to pull. Once the
 *        data has been pulled future pushes will overwrite the old data.
 */
void *skb_pull_data(struct sk_buff *skb, size_t len)
{
        /* Remember where the data started; this is what the caller gets. */
        void *start = skb->data;

        /* Refuse to pull more than the buffer holds. */
        if (len > skb->len)
                return NULL;

        skb_pull(skb, len);
        return start;
}
EXPORT_SYMBOL(skb_pull_data);

/**
 *        skb_trim - remove end from a buffer
 *        @skb: buffer to alter
 *        @len: new length
 *
 *        Cut the length of a buffer down by removing data from the tail. If
 *        the buffer is already under the length specified it is not modified.
 *        The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
        /* Trimming never grows the buffer: nothing to do when it is
         * already at or below the requested length.
         */
        if (len >= skb->len)
                return;

        __skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
 */

int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
        struct sk_buff **fragp;
        struct sk_buff *frag;
        /* Running byte offset of the area being examined; starts just
         * past the linear part.
         */
        int offset = skb_headlen(skb);
        int nfrags = skb_shinfo(skb)->nr_frags;
        int i;
        int err;

        /* A cloned skb is first passed through pskb_expand_head() (no
         * extra room requested) before its frags are edited; bail out if
         * that fails.
         */
        if (skb_cloned(skb) &&
            unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
                return err;

        i = 0;
        /* The linear part alone already covers @len: jump straight into
         * the release code with i == 0 so that every page frag is dropped.
         */
        if (offset >= len)
                goto drop_pages;

        /* Stage 1: look for the page frag that contains the new end. */
        for (; i < nfrags; i++) {
                int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* Frag lies entirely before the cut; keep it and move on. */
                if (end < len) {
                        offset = end;
                        continue;
                }

                /* Shrink the frag holding the cut; the post-increment makes
                 * i the count of frags that are kept.
                 */
                skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
                skb_shinfo(skb)->nr_frags = i;

                /* Release every frag beyond the new end ... */
                for (; i < nfrags; i++)
                        skb_frag_unref(skb, i);

                /* ... and the whole frag_list, which lies after the frags. */
                if (skb_has_frag_list(skb))
                        skb_drop_fraglist(skb);
                goto done;
        }

        /* Stage 2: the cut lies beyond all page frags, walk the frag_list. */
        for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
             fragp = &frag->next) {
                int end = offset + frag->len;

                /* A shared list member is replaced by a private clone
                 * before it (or its ->next link) may be modified.
                 */
                if (skb_shared(frag)) {
                        struct sk_buff *nfrag;

                        nfrag = skb_clone(frag, GFP_ATOMIC);
                        if (unlikely(!nfrag))
                                return -ENOMEM;

                        nfrag->next = frag->next;
                        consume_skb(frag);
                        frag = nfrag;
                        *fragp = frag;
                }

                /* Member lies entirely before the cut; keep it whole. */
                if (end < len) {
                        offset = end;
                        continue;
                }

                /* Member straddles the cut: trim it to the remaining bytes. */
                if (end > len &&
                    unlikely((err = pskb_trim(frag, len - offset))))
                        return err;

                /* Everything after this member is past the new end. */
                if (frag->next)
                        skb_drop_list(&frag->next);
                break;
        }

done:
        if (len > skb_headlen(skb)) {
                /* New end is in the paged area: only the non-linear
                 * length shrinks, by the number of bytes removed.
                 */
                skb->data_len -= skb->len - len;
                skb->len       = len;
        } else {
                /* New end is inside the linear part: no paged data is
                 * left and the tail pointer moves to the new end.
                 */
                skb->len       = len;
                skb->data_len  = 0;
                skb_set_tail_pointer(skb, len);
        }

        /* skb_condense() is only called for skbs without a socket or with
         * the sock_edemux destructor.
         * NOTE(review): presumably because it may alter skb->truesize,
         * which is unsafe for other socket-owned skbs - confirm.
         */
        if (!skb->sk || skb->destructor == sock_edemux)
                skb_condense(skb);
        return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/* Note : use pskb_trim_rcsum() instead of calling this directly
 */
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
        switch (skb->ip_summed) {
        case CHECKSUM_COMPLETE: {
                /* Subtract the checksum of the bytes about to be cut off
                 * from the checksum stored in the skb.
                 */
                int delta = skb->len - len;

                skb->csum = csum_block_sub(skb->csum,
                                           skb_checksum(skb, len, delta, 0),
                                           len);
                break;
        }
        case CHECKSUM_PARTIAL: {
                /* The checksum field must still lie within the linear
                 * data that remains after trimming.
                 */
                unsigned int headlen = skb_headlen(skb);
                int hdlen = (len <= headlen) ? len : headlen;
                int offset = skb_checksum_start_offset(skb) + skb->csum_offset;

                if (offset + sizeof(__sum16) > hdlen)
                        return -EINVAL;
                break;
        }
        default:
                break;
        }

        return __pskb_trim(skb, len);
}
EXPORT_SYMBOL(pskb_trim_rcsum_slow);

/**
 *        __pskb_pull_tail - advance tail of skb header
 *        @skb: buffer to reallocate
 *        @delta: number of bytes to advance tail
 *
 *        The function makes sense only on a fragmented &sk_buff,
 *        it expands header moving its tail forward and copying necessary
 *        data from fragmented part.
 *
 *        &sk_buff MUST have reference count of 1.
 *
 *        Returns %NULL (and &sk_buff does not change) if pull failed
 *        or value of new tail of skb in the case of success.
 *
 *        All the pointers pointing into skb header may change and must be
 *        reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
void *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
        /* If skb has not enough free space at tail, get new one
         * plus 128 bytes for future expansions. If we have enough
         * room at tail, reallocate without expansion only if skb is cloned.
         */
        /* eat starts out as the number of bytes by which tail + delta
         * would overshoot the end of the head (<= 0 when it fits).
         */
        int i, k, eat = (skb->tail + delta) - skb->end;

        if (!skb_frags_readable(skb))
                return NULL;

        if (eat > 0 || skb_cloned(skb)) {
                if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
                                     GFP_ATOMIC))
                        return NULL;
        }

        /* Copy the first @delta bytes of the non-linear area to the
         * current tail. skb->tail itself is only advanced at "end:" below.
         */
        BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
                             skb_tail_pointer(skb), delta));

        /* Optimization: no fragments, no reasons to preestimate
         * size of pulled pages. Superb.
         */
        if (!skb_has_frag_list(skb))
                goto pull_pages;

        /* Estimate size of pulled pages. */
        /* eat is reused: it now counts pulled bytes not yet accounted
         * for by the page frags.
         */
        eat = delta;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* The page frags alone cover the pull; frag_list is untouched. */
                if (size >= eat)
                        goto pull_pages;
                eat -= size;
        }

        /* If we need update frag list, we are in troubles.
         * Certainly, it is possible to add an offset to skb data,
         * but taking into account that pulling is expected to
         * be very rare operation, it is worth to fight against
         * further bloating skb head and crucify ourselves here instead.
         * Pure masochism, indeed. 8)8)
         */
        if (eat) {
                struct sk_buff *list = skb_shinfo(skb)->frag_list;
                /* Private clone of a shared, partially eaten member (if any). */
                struct sk_buff *clone = NULL;
                /* First original list member that stays on the list. */
                struct sk_buff *insp = NULL;

                do {
                        if (list->len <= eat) {
                                /* Eaten as whole. */
                                eat -= list->len;
                                list = list->next;
                                insp = list;
                        } else {
                                /* Eaten partially. */
                                if (skb_is_gso(skb) && !list->head_frag &&
                                    skb_headlen(list))
                                        skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;

                                if (skb_shared(list)) {
                                        /* Sucks! We need to fork list. :-( */
                                        clone = skb_clone(list, GFP_ATOMIC);
                                        if (!clone)
                                                return NULL;
                                        /* The shared original is released
                                         * below; its clone takes its place.
                                         */
                                        insp = list->next;
                                        list = clone;
                                } else {
                                        /* This may be pulled without
                                         * problems. */
                                        insp = list;
                                }
                                if (!pskb_pull(list, eat)) {
                                        /* clone is NULL unless we forked above. */
                                        kfree_skb(clone);
                                        return NULL;
                                }
                                break;
                        }
                } while (eat);

                /* Free pulled out fragments. */
                while ((list = skb_shinfo(skb)->frag_list) != insp) {
                        skb_shinfo(skb)->frag_list = list->next;
                        consume_skb(list);
                }
                /* And insert new clone at head. */
                if (clone) {
                        /* list == insp here, i.e. the remaining original list. */
                        clone->next = list;
                        skb_shinfo(skb)->frag_list = clone;
                }
        }
        /* Success! Now we may commit changes to skb data. */

pull_pages:
        /* Drop the page frags that were pulled completely and compact the
         * survivors to the front of the array; k counts the frags kept.
         */
        eat = delta;
        k = 0;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (size <= eat) {
                        /* Whole frag consumed: release it. */
                        skb_frag_unref(skb, i);
                        eat -= size;
                } else {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[k];

                        *frag = skb_shinfo(skb)->frags[i];
                        if (eat) {
                                /* Partly consumed frag: skip the pulled bytes. */
                                skb_frag_off_add(frag, eat);
                                skb_frag_size_sub(frag, eat);
                                /* The pull ended inside frag 0, which was
                                 * adjusted in place (k == 0): the frag count
                                 * is unchanged, so skip the rest of the loop
                                 * and the nr_frags update.
                                 */
                                if (!i)
                                        goto end;
                                eat = 0;
                        }
                        k++;
                }
        }
        skb_shinfo(skb)->nr_frags = k;

end:
        /* The pulled bytes now belong to the linear part. */
        skb->tail     += delta;
        skb->data_len -= delta;

        /* No non-linear data left. */
        if (!skb->data_len)
                skb_zcopy_clear(skb, false);

        return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);

/**
 *        skb_copy_bits - copy bits from skb to kernel buffer
 *        @skb: source skb
 *        @offset: offset in source
 *        @to: destination buffer
 *        @len: number of bytes to copy
 *
 *        Copy the specified number of bytes from the source skb to the
 *        destination buffer.
 *
 *        CAUTION ! :
 *                If its prototype is ever changed,
 *                check arch/{*}/net/{*}.S files,
 *                since it is called from BPF assembly code.
 */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
        /* start/end delimit, in skb byte offsets, the area currently being
         * examined; the first area is the linear part.
         */
        int start = skb_headlen(skb);
        struct sk_buff *frag_iter;
        int i, copy;

        /* The requested range must not run past skb->len. A negative
         * @offset passes this check; skb_copy_expand() uses that to reach
         * back into the headroom.
         */
        if (offset > (int)skb->len - len)
                goto fault;

        /* Copy header. */
        if ((copy = start - offset) > 0) {
                if (copy > len)
                        copy = len;
                skb_copy_from_linear_data_offset(skb, offset, to, copy);
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                to     += copy;
        }

        /* Anything further lives in frags, which must be readable. */
        if (!skb_frags_readable(skb))
                goto fault;

        /* Page frags: copy the part of each frag that overlaps the range. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                skb_frag_t *f = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(f);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* Each page visited by the iterator is mapped just
                         * for the duration of its memcpy().
                         */
                        skb_frag_foreach_page(f,
                                              skb_frag_off(f) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(to + copied, vaddr + p_off, p_len);
                                kunmap_atomic(vaddr);
                        }

                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* frag_list: recurse into each member skb, using an offset
         * relative to the start of that member.
         */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_copy_bits(frag_iter, offset - start, to, copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* Reached with nothing left to copy only when @len was zero
         * (or became zero without an early return above).
         */
        if (!len)
                return 0;

fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_bits);

/*
 * Callback from splice_to_pipe(), if we need to release some pages
 * at the end of the spd in case we error'ed out in filling the pipe.
 */
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
        /* Drop the page reference held for slot @i (spd_fill_page() takes
         * one with get_page() for every page it stores).
         */
        put_page(spd->pages[i]);
}

/*
 * Copy up to *len bytes found at @page + *offset into the socket's page
 * fragment. On success *len holds the number of bytes copied, *offset the
 * position of the copy inside the returned page, and the fragment's fill
 * offset has been advanced. Returns NULL if the fragment cannot be refilled.
 */
static struct page *linear_to_page(struct page *page, unsigned int *len,
                                   unsigned int *offset,
                                   struct sock *sk)
{
        struct page_frag *pfrag = sk_page_frag(sk);
        const void *src;
        void *dst;

        if (!sk_page_frag_refill(sk, pfrag))
                return NULL;

        /* Never copy more than the room left in the fragment. */
        *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);

        dst = page_address(pfrag->page) + pfrag->offset;
        src = page_address(page) + *offset;
        memcpy(dst, src, *len);

        /* Report where the copy landed, then consume that space. */
        *offset = pfrag->offset;
        pfrag->offset += *len;

        return pfrag->page;
}

static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
                             struct page *page,
                             unsigned int offset)
{
        return        spd->nr_pages &&
                spd->pages[spd->nr_pages - 1] == page &&
                (spd->partial[spd->nr_pages - 1].offset +
                 spd->partial[spd->nr_pages - 1].len == offset);
}

/*
 * Fill page/offset/length into spd, if it can hold more pages.
 */
static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
                          unsigned int *len, unsigned int offset, bool linear,
                          struct sock *sk)
{
        struct partial_page *slot;

        /* Every slot is taken: report "stop" to the caller. */
        if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
                return true;

        /* Linear data is first copied through linear_to_page(), which may
         * shorten *len and rewrites offset; failure also means "stop".
         */
        if (linear) {
                page = linear_to_page(page, len, &offset, sk);
                if (!page)
                        return true;
        }

        /* Contiguous with the previous entry: just lengthen that one. */
        if (spd_can_coalesce(spd, page, offset)) {
                spd->partial[spd->nr_pages - 1].len += *len;
                return false;
        }

        /* Otherwise take a page reference and occupy a new slot. */
        get_page(page);
        slot = &spd->partial[spd->nr_pages];
        slot->len = *len;
        slot->offset = offset;
        spd->pages[spd->nr_pages++] = page;

        return false;
}

/*
 * Feed one segment (@plen bytes at @page + @poff) into @spd, honouring the
 * bytes still to be skipped (*off) and the bytes still wanted (*len).
 * Returns true when the caller should stop (nothing more wanted, or
 * spd_fill_page() refused), false when the next segment may be tried.
 */
static bool __splice_segment(struct page *page, unsigned int poff,
                             unsigned int plen, unsigned int *off,
                             unsigned int *len,
                             struct splice_pipe_desc *spd, bool linear,
                             struct sock *sk)
{
        if (*len == 0)
                return true;

        /* The whole segment lies inside the part to skip. */
        if (*off >= plen) {
                *off -= plen;
                return false;
        }

        /* Step over the part of this segment that is to be skipped. */
        poff += *off;
        plen -= *off;
        *off = 0;

        for (;;) {
                /* spd_fill_page() may shrink chunk for linear data. */
                unsigned int chunk = min(*len, plen);

                if (spd_fill_page(spd, page, &chunk, poff, linear, sk))
                        return true;

                poff += chunk;
                plen -= chunk;
                *len -= chunk;

                if (*len == 0)
                        return true;
                if (plen == 0)
                        return false;
        }
}

/*
 * Map linear and fragment data from the skb to spd. It reports true if the
 * pipe is full or if we already spliced the requested length.
 */
static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
                              unsigned int *offset, unsigned int *len,
                              struct splice_pipe_desc *spd, struct sock *sk)
{
        struct sk_buff *frag_skb;
        int idx;

        /* Linear part first. The "linear" flag handed down is
         * skb_head_is_locked(): when it is set, spd_fill_page() copies the
         * bytes via linear_to_page() instead of referencing the head page.
         */
        if (__splice_segment(virt_to_page(skb->data),
                             (unsigned long) skb->data & (PAGE_SIZE - 1),
                             skb_headlen(skb),
                             offset, len, spd,
                             skb_head_is_locked(skb),
                             sk))
                return true;

        /* Page frags can only be mapped when they are readable. */
        if (!skb_frags_readable(skb))
                return false;

        for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
                const skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
                struct page *frag_page = skb_frag_page(frag);

                if (WARN_ON_ONCE(!frag_page))
                        return false;

                if (__splice_segment(frag_page,
                                     skb_frag_off(frag), skb_frag_size(frag),
                                     offset, len, spd, false, sk))
                        return true;
        }

        /* Finally the frag_list: members that lie entirely inside the
         * part still to be skipped only reduce *offset, the rest are
         * handled recursively.
         */
        skb_walk_frags(skb, frag_skb) {
                if (*offset < frag_skb->len) {
                        /* __skb_splice_bits() only fails if the output has
                         * no room left, so no point in going over the
                         * frag_list for the error case.
                         */
                        if (__skb_splice_bits(frag_skb, pipe, offset, len,
                                              spd, sk))
                                return true;
                } else {
                        *offset -= frag_skb->len;
                }
        }

        return false;
}

/*
 * Map data from the skb to a pipe. Should handle both the linear part,
 * the fragments, and the frag list.
 */
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int tlen,
                    unsigned int flags)
{
        struct page *pages[MAX_SKB_FRAGS];
        struct partial_page partial[MAX_SKB_FRAGS];
        struct splice_pipe_desc spd = {
                .pages = pages,
                .partial = partial,
                .nr_pages_max = MAX_SKB_FRAGS,
                .ops = &nosteal_pipe_buf_ops,
                .spd_release = sock_spd_release,
        };

        /* Collect page references into spd. The boolean result is not
         * needed here: spd.nr_pages says whether anything was gathered.
         * Note that @flags is not consulted by this function.
         */
        __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);

        if (!spd.nr_pages)
                return 0;

        return splice_to_pipe(pipe, &spd);
}
EXPORT_SYMBOL_GPL(skb_splice_bits);

/*
 * sendmsg_func flavour that goes through the socket's ->sendmsg_locked
 * operation. Returns -EINVAL when @sk has no struct socket attached.
 */
static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
{
        struct socket *sock = sk->sk_socket;
        size_t size;

        if (!sock)
                return -EINVAL;

        size = msg_data_left(msg);

        /* Use the protocol's own locked sendmsg when it provides one,
         * otherwise hand the message to sock_no_sendmsg_locked().
         */
        if (sock->ops->sendmsg_locked)
                return sock->ops->sendmsg_locked(sk, msg, size);

        return sock_no_sendmsg_locked(sk, msg, size);
}

/*
 * sendmsg_func flavour that goes through sock_sendmsg(). Returns -EINVAL
 * when @sk has no struct socket attached.
 */
static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
{
        struct socket *sock = sk->sk_socket;

        return sock ? sock_sendmsg(sock, msg) : -EINVAL;
}

/* Transmit callback type; sendmsg_locked() and sendmsg_unlocked() match it. */
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
/*
 * Send @len bytes of @skb, starting at byte @offset, on @sk through
 * @sendmsg. The linear part, the page frags and then the frag_list are
 * sent in that order. @flags is OR-ed into the msg_flags of every message.
 *
 * Returns the number of bytes sent; if the callback fails (returns <= 0)
 * before anything was sent, that return value is passed back instead.
 */
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
                           int len, sendmsg_func sendmsg, int flags)
{
        /* On TCP sockets, MSG_MORE is added to a message whenever more
         * bytes remain to be sent after it.
         */
        int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0;
        unsigned int orig_len = len;
        /* Remembered to tell the top-level skb from frag_list members. */
        struct sk_buff *head = skb;
        unsigned short fragidx;
        int slen, ret;

        /* Re-entered once per frag_list member, with skb pointing at that
         * member and offset/len carrying on from the previous buffer.
         */
do_frag_list:

        /* Deal with head data */
        while (offset < skb_headlen(skb) && len) {
                struct kvec kv;
                struct msghdr msg;

                slen = min_t(int, len, skb_headlen(skb) - offset);
                kv.iov_base = skb->data + offset;
                kv.iov_len = slen;
                memset(&msg, 0, sizeof(msg));
                msg.msg_flags = MSG_DONTWAIT | flags;
                if (slen < len)
                        msg.msg_flags |= more_hint;

                iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
                ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
                                      sendmsg_unlocked, sk, &msg);
                if (ret <= 0)
                        goto error;

                /* A short send is simply retried by the loop condition. */
                offset += ret;
                len -= ret;
        }

        /* All the data was skb head? */
        if (!len)
                goto out;

        /* Make offset relative to start of frags */
        offset -= skb_headlen(skb);

        /* Find where we are in frag list */
        for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
                skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];

                if (offset < skb_frag_size(frag))
                        break;

                offset -= skb_frag_size(frag);
        }

        /* Send the page frags; offset is only non-zero for the first
         * frag sent.
         */
        for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
                skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];

                slen = min_t(size_t, len, skb_frag_size(frag) - offset);

                /* Keep sending until this frag's share is fully sent. */
                while (slen) {
                        struct bio_vec bvec;
                        struct msghdr msg = {
                                .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
                                             flags,
                        };

                        if (slen < len)
                                msg.msg_flags |= more_hint;
                        bvec_set_page(&bvec, skb_frag_page(frag), slen,
                                      skb_frag_off(frag) + offset);
                        iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
                                      slen);

                        ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
                                              sendmsg_unlocked, sk, &msg);
                        if (ret <= 0)
                                goto error;

                        len -= ret;
                        offset += ret;
                        slen -= ret;
                }

                offset = 0;
        }

        if (len) {
                /* Process any frag lists */

                if (skb == head) {
                        /* Top-level skb done: descend into its frag_list. */
                        if (skb_has_frag_list(skb)) {
                                skb = skb_shinfo(skb)->frag_list;
                                goto do_frag_list;
                        }
                } else if (skb->next) {
                        /* Inside the frag_list: move to the next member. */
                        skb = skb->next;
                        goto do_frag_list;
                }
        }

out:
        return orig_len - len;

error:
        /* Report the failure only if nothing went out; otherwise report
         * the partial byte count.
         */
        return orig_len == len ? ret : orig_len - len;
}

/*
 * Send skb data on a socket through the locked sendmsg path, passing no
 * additional message flags.  The socket must already be locked by the
 * caller.  The result of __skb_send_sock() is returned unchanged.
 */
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len)
{
        const int extra_flags = 0;

        return __skb_send_sock(sk, skb, offset, len, sendmsg_locked,
                               extra_flags);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);

/*
 * Same as skb_send_sock_locked(), except that the caller-supplied @flags are
 * handed on to __skb_send_sock().
 */
int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
                                    int offset, int len, int flags)
{
        int ret;

        ret = __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);

        return ret;
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);

/*
 * Send skb data on a socket through the unlocked sendmsg path, passing no
 * additional message flags.  The socket must not be locked by the caller.
 */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
        const int extra_flags = 0;

        return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
                               extra_flags);
}

/**
 *        skb_store_bits - store bits from kernel buffer to skb
 *        @skb: destination buffer
 *        @offset: offset in destination
 *        @from: source buffer
 *        @len: number of bytes to copy
 *
 *        Copy the specified number of bytes from the source buffer to the
 *        destination skb.  This function handles all the messy bits of
 *        traversing fragment lists and such.
 *
 *        The linear area is written first, then the page frags, then each skb
 *        on the frag list (by calling this function recursively).
 *
 *        Return: 0 once all @len bytes have been stored; -EFAULT if the
 *        requested range does not fit within @skb->len, if data beyond the
 *        linear area must be written but skb_frags_readable() is false, if a
 *        recursive store into a frag-list skb fails, or if bytes are still
 *        left over after every part of the skb has been visited.
 */

int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
        /* skb offset at which the part currently being examined begins */
        int start = skb_headlen(skb);
        struct sk_buff *frag_iter;
        int i, copy;

        /* The range [offset, offset + len) must lie inside the skb. */
        if (offset > (int)skb->len - len)
                goto fault;

        /* The range begins in the linear area: fill that part first. */
        if ((copy = start - offset) > 0) {
                if (copy > len)
                        copy = len;
                skb_copy_to_linear_data_offset(skb, offset, from, copy);
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                from += copy;
        }

        /* What is left lives outside the linear area. */
        if (!skb_frags_readable(skb))
                goto fault;

        /* Walk the page frags; [start, end) is the span of frag i. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /*
                         * Map and write one page at a time; 'copied' is the
                         * number of bytes handled by earlier iterations.
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(vaddr + p_off, from + copied, p_len);
                                kunmap_atomic(vaddr);
                        }

                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        from += copy;
                }
                start = end;
        }

        /* Continue into the frag list, recursing into each member skb. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        /* Offset is made relative to the child skb. */
                        if (skb_store_bits(frag_iter, offset - start,
                                           from, copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        from += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_store_bits);

/*
 * Checksum skb data.
 *
 * Folds @len bytes of @skb starting at @offset into the running checksum
 * @csum and returns the result.  The linear area, the page frags and the
 * frag list are visited in that order.
 *
 * Returns 0 (after a one-time warning) if data beyond the linear area is
 * needed but skb_frags_readable() is false.  Triggers BUG_ON() if bytes are
 * still outstanding after the whole skb has been walked.
 */
__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
{
        /* skb offset at which the part currently being examined begins */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        /* bytes summed so far; passed to csum_block_add() as block offset */
        int pos = 0;

        /* Checksum header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                csum = csum_partial(skb->data + offset, copy, csum);
                if ((len -= copy) == 0)
                        return csum;
                offset += copy;
                pos        = copy;
        }

        if (WARN_ON_ONCE(!skb_frags_readable(skb)))
                return 0;

        /* Walk the page frags; [start, end) is the span of frag i. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        __wsum csum2;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /*
                         * Each page chunk is summed on its own from a zero
                         * seed and then merged at position 'pos'.
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                csum2 = csum_partial(vaddr + p_off, p_len, 0);
                                kunmap_atomic(vaddr);
                                csum = csum_block_add(csum, csum2, pos);
                                pos += p_len;
                        }

                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                }
                start = end;
        }

        /* Continue into the frag list, recursing into each member skb. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        __wsum csum2;
                        if (copy > len)
                                copy = len;
                        /* Child is summed from a zero seed, then merged. */
                        csum2 = skb_checksum(frag_iter, offset - start, copy,
                                             0);
                        csum = csum_block_add(csum, csum2, pos);
                        if ((len -= copy) == 0)
                                return csum;
                        offset += copy;
                        pos    += copy;
                }
                start = end;
        }
        /* The caller asked for more bytes than the skb holds. */
        BUG_ON(len);

        return csum;
}
EXPORT_SYMBOL(skb_checksum);

/*
 * Copy and checksum in one pass.
 *
 * Copies @len bytes of @skb starting at @offset into @to and returns the
 * checksum of the copied bytes, computed from a zero seed.  The linear
 * area, the page frags and the frag list are visited in that order.
 *
 * Returns 0 if data beyond the linear area is needed but
 * skb_frags_readable() is false.  Triggers BUG_ON() if bytes are still
 * outstanding after the whole skb has been walked.
 */

__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
                                    u8 *to, int len)
{
        /* skb offset at which the part currently being examined begins */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        /* bytes handled so far; passed to csum_block_add() as block offset */
        int pos = 0;
        __wsum csum = 0;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                csum = csum_partial_copy_nocheck(skb->data + offset, to,
                                                 copy);
                if ((len -= copy) == 0)
                        return csum;
                offset += copy;
                to     += copy;
                pos        = copy;
        }

        /* What is left lives outside the linear area. */
        if (!skb_frags_readable(skb))
                return 0;

        /* Walk the page frags; [start, end) is the span of frag i. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
                if ((copy = end - offset) > 0) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        u32 p_off, p_len, copied;
                        struct page *p;
                        __wsum csum2;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /*
                         * Copy one mapped page chunk at a time; each chunk's
                         * checksum is merged at position 'pos'.
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                csum2 = csum_partial_copy_nocheck(vaddr + p_off,
                                                                  to + copied,
                                                                  p_len);
                                kunmap_atomic(vaddr);
                                csum = csum_block_add(csum, csum2, pos);
                                pos += p_len;
                        }

                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* Continue into the frag list, recursing into each member skb. */
        skb_walk_frags(skb, frag_iter) {
                __wsum csum2;
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        /* Offset is made relative to the child skb. */
                        csum2 = skb_copy_and_csum_bits(frag_iter,
                                                       offset - start,
                                                       to, copy);
                        csum = csum_block_add(csum, csum2, pos);
                        if ((len -= copy) == 0)
                                return csum;
                        offset += copy;
                        to     += copy;
                        pos    += copy;
                }
                start = end;
        }
        /* The caller asked for more bytes than the skb holds. */
        BUG_ON(len);
        return csum;
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);

#ifdef CONFIG_NET_CRC32C
/*
 * Feed @len bytes of @skb, starting at @offset, through crc32c() and return
 * the updated value of @crc.  The linear area, the page frags and the frag
 * list are visited in that order, each continuing from the CRC so far.
 *
 * Returns 0 (after a one-time warning) if data beyond the linear area is
 * needed but skb_frags_readable() is false.  Triggers BUG_ON() if bytes are
 * still outstanding after the whole skb has been walked.
 */
u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
{
        /* skb offset at which the part currently being examined begins */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;

        /* The range begins in the linear area: process that part first. */
        if (copy > 0) {
                copy = min(copy, len);
                crc = crc32c(crc, skb->data + offset, copy);
                len -= copy;
                if (len == 0)
                        return crc;
                offset += copy;
        }

        if (WARN_ON_ONCE(!skb_frags_readable(skb)))
                return 0;

        /* Walk the page frags; [start, end) is the span of frag i. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                copy = end - offset;
                if (copy > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        copy = min(copy, len);
                        /* Map and process one page chunk at a time. */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                crc = crc32c(crc, vaddr + p_off, p_len);
                                kunmap_atomic(vaddr);
                        }
                        len -= copy;
                        if (len == 0)
                                return crc;
                        offset += copy;
                }
                start = end;
        }

        /* Continue into the frag list, recursing into each member skb. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                copy = end - offset;
                if (copy > 0) {
                        copy = min(copy, len);
                        /* The running CRC is threaded through the child. */
                        crc = skb_crc32c(frag_iter, offset - start, copy, crc);
                        len -= copy;
                        if (len == 0)
                                return crc;
                        offset += copy;
                }
                start = end;
        }
        /* The caller asked for more bytes than the skb holds. */
        BUG_ON(len);

        return crc;
}
EXPORT_SYMBOL(skb_crc32c);
#endif /* CONFIG_NET_CRC32C */

/*
 * Checksum the first @len bytes of @skb on top of skb->csum and fold the
 * result.  A folded value of zero is recorded in skb->csum_valid, unless
 * the skb is shared.  The folded checksum is returned.
 */
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
        const __sum16 folded = csum_fold(skb_checksum(skb, 0, len, skb->csum));

        /* See comments in __skb_checksum_complete(). */
        if (likely(!folded) &&
            unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(skb->dev, skb);

        if (!skb_shared(skb))
                skb->csum_valid = !folded;

        return folded;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

/* On entry skb->csum is expected to hold the pseudo header's checksum,
 * which has been derived from the hardware checksum, for example, by
 * __skb_checksum_validate_complete(). For the CHECKSUM_COMPLETE case the
 * original skb->csum must already have failed validation.
 *
 * Returns zero if the recomputed checksum is valid and non-zero if it is
 * still invalid. Unless the skb is shared, the freshly computed full packet
 * checksum is written back to skb->csum.
 */
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
        const __wsum full = skb_checksum(skb, 0, skb->len, 0);
        const __sum16 folded = csum_fold(csum_add(skb->csum, full));

        /* The sense of this test is deliberately inverted: the caller
         * already established that the hardware checksum did not validate.
         * If software now finds the packet valid, the original skb->csum
         * and skb_checksum() disagree, meaning either the hardware checksum
         * was wrong or skb->csum got corrupted while skb->data was being
         * moved around.
         */
        if (likely(!folded) &&
            unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(skb->dev, skb);

        /* A shared skb must not be modified. */
        if (skb_shared(skb))
                return folded;

        /* Save full packet checksum */
        skb->csum = full;
        skb->ip_summed = CHECKSUM_COMPLETE;
        skb->csum_complete_sw = 1;
        skb->csum_valid = !folded;

        return folded;
}
EXPORT_SYMBOL(__skb_checksum_complete);

 /**
 *        skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
 *        @from: source buffer
 *
 *        Calculates the amount of linear headroom needed in the 'to' skb passed
 *        into skb_zerocopy().
 */
unsigned int
skb_zerocopy_headlen(const struct sk_buff *from)
{
        unsigned int hlen = 0;

        if (!from->head_frag ||
            skb_headlen(from) < L1_CACHE_BYTES ||
            skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
                hlen = skb_headlen(from);
                if (!hlen)
                        hlen = from->len;
        }

        if (skb_has_frag_list(from))
                hlen = from->len;

        return hlen;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);

/**
 *        skb_zerocopy - Zero copy skb to skb
 *        @to: destination buffer
 *        @from: source buffer
 *        @len: number of bytes to copy from source buffer
 *        @hlen: size of linear headroom in destination buffer
 *
 *        Copies up to `len` bytes from `from` to `to` by creating references
 *        to the frags in the source buffer.
 *
 *        The `hlen` as calculated by skb_zerocopy_headlen() specifies the
 *        headroom in the `to` buffer.
 *
 *        If `len` fits into the tailroom of `to`, the data is simply copied
 *        and no frag references are created.
 *
 *        Note that `to` may already have been modified (data put into it and
 *        its length increased) when -ENOMEM is returned.
 *
 *        Return value:
 *        0: everything is OK
 *        -ENOMEM: couldn't orphan frags of @from due to lack of memory
 *        -EFAULT: skb_copy_bits() found some problem with skb geometry
 */
int
skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
{
        /* i indexes the frags of @from, j the next free frag slot of @to */
        int i, j = 0;
        int plen = 0; /* length of skb->head fragment */
        int ret;
        struct page *page;
        unsigned int offset;

        /* Without linear headroom the head must be usable as a page frag. */
        BUG_ON(!from->head_frag && !hlen);

        /* don't bother with small payloads */
        if (len <= skb_tailroom(to))
                return skb_copy_bits(from, 0, skb_put(to, len), len);

        if (hlen) {
                /* Copy the first hlen bytes into the linear area of @to. */
                ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
                if (unlikely(ret))
                        return ret;
                len -= hlen;
        } else {
                /* Reference the head of @from as frag 0 of @to. */
                plen = min_t(int, skb_headlen(from), len);
                if (plen) {
                        page = virt_to_head_page(from->head);
                        offset = from->data - (unsigned char *)page_address(page);
                        __skb_fill_netmem_desc(to, 0, page_to_netmem(page),
                                               offset, plen);
                        get_page(page);
                        j = 1;
                        len -= plen;
                }
        }

        /* Account for everything that is going to be attached as frags. */
        skb_len_add(to, len + plen);

        if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
                skb_tx_error(from);
                return -ENOMEM;
        }
        skb_zerocopy_clone(to, from, GFP_ATOMIC);

        /* Share the source frags, trimming the last one to fit len. */
        for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
                int size;

                if (!len)
                        break;
                skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
                size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
                                        len);
                skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
                len -= size;
                skb_frag_ref(to, j);
                j++;
        }
        skb_shinfo(to)->nr_frags = j;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_zerocopy);

/*
 * Copy the whole of @skb into the buffer @to.  Bytes in front of the
 * checksum start are copied verbatim, the remainder is copied and
 * checksummed.  For CHECKSUM_PARTIAL skbs the folded checksum is then
 * stored in the copy at checksum start + skb->csum_offset.
 */
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
        __wsum csum = 0;
        long csstart;
        long csstuff;

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                csstart = skb_checksum_start_offset(skb);
        else
                csstart = skb_headlen(skb);

        /* The checksum start has to lie within the linear area. */
        BUG_ON(csstart > skb_headlen(skb));

        skb_copy_from_linear_data(skb, to, csstart);

        if (csstart != skb->len)
                csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
                                              skb->len - csstart);

        if (skb->ip_summed != CHECKSUM_PARTIAL)
                return;

        csstuff = csstart + skb->csum_offset;
        *((__sum16 *)(to + csstuff)) = csum_fold(csum);
}
EXPORT_SYMBOL(skb_copy_and_csum_dev);

/**
 *        skb_dequeue - remove from the head of the queue
 *        @list: list to dequeue from
 *
 *        Detach the first buffer of @list while holding the list lock, so
 *        the function may be used safely with other locking list functions.
 *
 *        Return: the head item, or %NULL if the list is empty.
 */

struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *skb;
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        skb = __skb_dequeue(list);
        spin_unlock_irqrestore(&list->lock, irqflags);

        return skb;
}
EXPORT_SYMBOL(skb_dequeue);

/**
 *        skb_dequeue_tail - remove from the tail of the queue
 *        @list: list to dequeue from
 *
 *        Detach the last buffer of @list while holding the list lock, so
 *        the function may be used safely with other locking list functions.
 *
 *        Return: the tail item, or %NULL if the list is empty.
 */
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *skb;
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        skb = __skb_dequeue_tail(list);
        spin_unlock_irqrestore(&list->lock, irqflags);

        return skb;
}
EXPORT_SYMBOL(skb_dequeue_tail);

/**
 *        skb_queue_purge_reason - empty a list
 *        @list: list to empty
 *        @reason: drop reason
 *
 *        Delete all buffers on an &sk_buff list. Each buffer is removed from
 *        the list and one reference dropped. This function takes the list
 *        lock and is atomic with respect to other list locking functions.
 */
void skb_queue_purge_reason(struct sk_buff_head *list,
                            enum skb_drop_reason reason)
{
        struct sk_buff_head tmp;
        unsigned long flags;

        if (skb_queue_empty_lockless(list))
                return;

        __skb_queue_head_init(&tmp);

        spin_lock_irqsave(&list->lock, flags);
        skb_queue_splice_init(list, &tmp);
        spin_unlock_irqrestore(&list->lock, flags);

        __skb_queue_purge_reason(&tmp, reason);
}
EXPORT_SYMBOL(skb_queue_purge_reason);

/**
 *        skb_rbtree_purge - empty a skb rbtree
 *        @root: root of the rbtree to empty
 *
 *        Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
 *        the tree and one reference dropped. This function does not take
 *        any lock. Synchronization should be handled by the caller (e.g., TCP
 *        out-of-order queue is protected by the socket lock).
 *
 *        Return: the sum of truesizes of all purged skbs.
 */
unsigned int skb_rbtree_purge(struct rb_root *root)
{
        struct rb_node *node, *next;
        unsigned int freed = 0;

        for (node = rb_first(root); node; node = next) {
                struct sk_buff *skb = rb_entry(node, struct sk_buff, rbnode);

                /* Fetch the successor before the node leaves the tree. */
                next = rb_next(node);

                rb_erase(&skb->rbnode, root);
                freed += skb->truesize;
                kfree_skb(skb);
        }

        return freed;
}

void skb_errqueue_purge(struct sk_buff_head *list)
{
        struct sk_buff *skb, *next;
        struct sk_buff_head kill;
        unsigned long flags;

        __skb_queue_head_init(&kill);

        spin_lock_irqsave(&list->lock, flags);
        skb_queue_walk_safe(list, skb, next) {
                if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
                    SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
                        continue;
                __skb_unlink(skb, list);
                __skb_queue_tail(&kill, skb);
        }
        spin_unlock_irqrestore(&list->lock, flags);
        __skb_queue_purge(&kill);
}
EXPORT_SYMBOL(skb_errqueue_purge);

/**
 *        skb_queue_head - queue a buffer at the list head
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Insert @newsk at the front of @list. The list lock is taken for
 *        the insertion, so this function can be used safely together with
 *        other locking &sk_buff functions.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);

        __skb_queue_head(list, newsk);

        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_queue_head);

/**
 *        skb_queue_tail - queue a buffer at the list tail
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Insert @newsk at the end of @list. The list lock is taken for
 *        the insertion, so this function can be used safely together with
 *        other locking &sk_buff functions.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);

        __skb_queue_tail(list, newsk);

        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_queue_tail);

/**
 *        skb_unlink        -        remove a buffer from a list
 *        @skb: buffer to remove
 *        @list: list to use
 *
 *        Take @skb off @list. The list lock is held for the removal, which
 *        makes this function atomic with respect to other list locked calls.
 *
 *        You must know what list the SKB is on.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);

        __skb_unlink(skb, list);

        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_unlink);

/**
 *        skb_append        -        append a buffer
 *        @old: buffer to insert after
 *        @newsk: buffer to insert
 *        @list: list to use
 *
 *        Insert @newsk directly behind @old on @list. The list lock is held
 *        for the insertion, which makes this function atomic with respect to
 *        other list locked calls.
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);

        __skb_queue_after(list, old, newsk);

        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_append);

/*
 * Split @skb at @len when the split point lies inside the linear area
 * (@len < @pos, with @pos being the linear length of @skb).  The linear
 * bytes behind @len are copied into @skb1 and every page frag is handed
 * over to @skb1, leaving @skb with exactly @len linear bytes.
 */
static inline void skb_split_inside_header(struct sk_buff *skb,
                                           struct sk_buff *skb1,
                                           const u32 len, const int pos)
{
        const unsigned int tail_len = pos - len;
        int idx;

        skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, tail_len),
                                         tail_len);

        /* And move data appendix as is. */
        for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++)
                skb_shinfo(skb1)->frags[idx] = skb_shinfo(skb)->frags[idx];
        skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;

        skb1->unreadable = skb->unreadable;

        /* @skb no longer owns any frag. */
        skb_shinfo(skb)->nr_frags = 0;

        /* All paged bytes now belong to @skb1. */
        skb1->data_len = skb->data_len;
        skb1->len += skb1->data_len;

        /* What stays behind is purely linear and @len bytes long. */
        skb->data_len = 0;
        skb->len = len;
        skb_set_tail_pointer(skb, len);
}

/*
 * Split @skb at @len when the split point is at or behind the end of the
 * linear area (@pos is the linear length of @skb on entry).  Nothing is
 * copied: frags entirely below @len stay in @skb, frags entirely at or above
 * it are moved to @skb1, and a frag straddling @len ends up in both skbs
 * with an extra reference and adjusted offset/size.
 */
static inline void skb_split_no_header(struct sk_buff *skb,
                                       struct sk_buff* skb1,
                                       const u32 len, int pos)
{
        /* i walks the original frags, k counts the frags given to skb1 */
        int i, k = 0;
        const int nfrags = skb_shinfo(skb)->nr_frags;

        /* skb's frag count is rebuilt from zero in the loop below. */
        skb_shinfo(skb)->nr_frags = 0;
        skb1->len                  = skb1->data_len = skb->len - len;
        skb->len                  = len;
        skb->data_len                  = len - pos;

        /* From here on pos is the skb offset at which frag i begins. */
        for (i = 0; i < nfrags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (pos + size > len) {
                        skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

                        if (pos < len) {
                                /* Split frag.
                                 * We have two variants in this case:
                                 * 1. Move all the frag to the second
                                 *    part, if it is possible. F.e.
                                 *    this approach is mandatory for TUX,
                                 *    where splitting is expensive.
                                 * 2. Split is accurately. We make this.
                                 *
                                 * Only the first frag handed to skb1 can
                                 * straddle len, so k is 0 here and frags[0]
                                 * of skb1 is the entry just copied.
                                 */
                                skb_frag_ref(skb, i);
                                skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
                                skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
                                skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
                                skb_shinfo(skb)->nr_frags++;
                        }
                        k++;
                } else
                        /* Frag lies entirely below len: it stays in skb. */
                        skb_shinfo(skb)->nr_frags++;
                pos += size;
        }
        skb_shinfo(skb1)->nr_frags = k;

        skb1->unreadable = skb->unreadable;
}

/**
 * skb_split - Split fragmented skb to two parts at length len.
 * @skb: the buffer to split
 * @skb1: the buffer to receive the second part
 * @len: new length for skb
 *
 * After the call @skb holds the first @len bytes and @skb1 the remainder.
 * The SKBFL_SHARED_FRAG and SKBFL_PURE_ZEROCOPY flags set on @skb are
 * propagated to @skb1.
 */
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
        const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
        int headlen = skb_headlen(skb);

        skb_zcopy_downgrade_managed(skb);

        skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
        skb_zerocopy_clone(skb1, skb, 0);

        if (len >= headlen) {
                /* Second chunk has no header, nothing to copy. */
                skb_split_no_header(skb, skb1, len, headlen);
                return;
        }

        /* Split line is inside header. */
        skb_split_inside_header(skb, skb1, len, headlen);
}
EXPORT_SYMBOL(skb_split);

/* A cloned skb must never be the source or the target of a shift, so
 * unclone it first (truesize is kept).
 *
 * Caller cannot keep skb_shinfo related pointers past calling here!
 */
static int skb_prepare_for_shift(struct sk_buff *skb)
{
        int err = skb_unclone_keeptruesize(skb, GFP_ATOMIC);

        return err;
}

/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns number bytes shifted.
 * It's up to caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well.
 *
 * Return: @shiftlen if the bytes were shifted, 0 if nothing was shifted
 * (@skb has linear data, either skb is zerocopy, uncloning failed or @tgt
 * has no room for the frags).
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
        /*
         * from/to: next frag index in skb / tgt; merge: index of the tgt
         * frag the first skb frag is coalesced into, or -1; todo: bytes
         * still to be shifted.
         */
        int from, to, merge, todo;
        skb_frag_t *fragfrom, *fragto;

        BUG_ON(shiftlen > skb->len);

        /* Only purely paged source skbs are handled. */
        if (skb_headlen(skb))
                return 0;
        if (skb_zcopy(tgt) || skb_zcopy(skb))
                return 0;

        DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
        DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));

        todo = shiftlen;
        from = 0;
        to = skb_shinfo(tgt)->nr_frags;
        fragfrom = &skb_shinfo(skb)->frags[from];

        /* Actual merge is delayed until the point when we know we can
         * commit all, so that we don't have to undo partial changes
         */
        if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
                              skb_frag_off(fragfrom))) {
                merge = -1;
        } else {
                merge = to - 1;

                todo -= skb_frag_size(fragfrom);
                if (todo < 0) {
                        /*
                         * The whole shift fits inside the first frag: only
                         * move the boundary between the two adjacent frags.
                         */
                        if (skb_prepare_for_shift(skb) ||
                            skb_prepare_for_shift(tgt))
                                return 0;

                        /* All previous frag pointers might be stale! */
                        fragfrom = &skb_shinfo(skb)->frags[from];
                        fragto = &skb_shinfo(tgt)->frags[merge];

                        skb_frag_size_add(fragto, shiftlen);
                        skb_frag_size_sub(fragfrom, shiftlen);
                        skb_frag_off_add(fragfrom, shiftlen);

                        goto onlymerged;
                }

                /* First frag will be merged as a whole; continue after it. */
                from++;
        }

        /* Skip full, not-fitting skb to avoid expensive operations */
        if ((shiftlen == skb->len) &&
            (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
                return 0;

        if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
                return 0;

        /* Fill tgt frag slots; tgt's nr_frags is only updated afterwards. */
        while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
                if (to == MAX_SKB_FRAGS)
                        return 0;

                fragfrom = &skb_shinfo(skb)->frags[from];
                fragto = &skb_shinfo(tgt)->frags[to];

                if (todo >= skb_frag_size(fragfrom)) {
                        /* Whole frag moves over. */
                        *fragto = *fragfrom;
                        todo -= skb_frag_size(fragfrom);
                        from++;
                        to++;

                } else {
                        /*
                         * Partial frag: both skbs end up referencing the
                         * page, so take an extra reference and split the
                         * byte range between them.
                         */
                        __skb_frag_ref(fragfrom);
                        skb_frag_page_copy(fragto, fragfrom);
                        skb_frag_off_copy(fragto, fragfrom);
                        skb_frag_size_set(fragto, todo);

                        skb_frag_off_add(fragfrom, todo);
                        skb_frag_size_sub(fragfrom, todo);
                        todo = 0;

                        to++;
                        break;
                }
        }

        /* Ready to "commit" this state change to tgt */
        skb_shinfo(tgt)->nr_frags = to;

        if (merge >= 0) {
                /* Perform the delayed merge of skb's first frag into tgt. */
                fragfrom = &skb_shinfo(skb)->frags[0];
                fragto = &skb_shinfo(tgt)->frags[merge];

                skb_frag_size_add(fragto, skb_frag_size(fragfrom));
                __skb_frag_unref(fragfrom, skb->pp_recycle);
        }

        /* Reposition in the original skb */
        to = 0;
        while (from < skb_shinfo(skb)->nr_frags)
                skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
        skb_shinfo(skb)->nr_frags = to;

        BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
        /* Most likely the tgt won't ever need its checksum anymore, skb on
         * the other hand might need it if it needs to be resent
         */
        tgt->ip_summed = CHECKSUM_PARTIAL;
        skb->ip_summed = CHECKSUM_PARTIAL;

        /* Move the byte accounting from skb to tgt. */
        skb_len_add(skb, -shiftlen);
        skb_len_add(tgt, shiftlen);

        return shiftlen;
}

/**
 * skb_prepare_seq_read - Prepare a sequential read of skb data
 * @skb: the buffer to read
 * @from: lower offset of data to be read
 * @to: upper offset of data to be read
 * @st: state variable
 *
 * Resets @st so that a read starts at @skb with no fragment selected.
 * Must be called before invoking skb_seq_read() for the first time.
 */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st)
{
        /* Window of data to be read. */
        st->lower_offset = from;
        st->upper_offset = to;

        /* Reading starts at the root skb itself. */
        st->cur_skb = skb;
        st->root_skb = skb;

        /* Nothing stepped over and no fragment in use yet. */
        st->stepped_offset = 0;
        st->frag_idx = 0;
        st->frag_data = NULL;
        st->frag_off = 0;
}
EXPORT_SYMBOL(skb_prepare_seq_read);

/**
 * skb_seq_read - Sequentially read skb data
 * @consumed: number of bytes consumed by the caller so far
 * @data: destination pointer for data to be returned
 * @st: state variable
 *
 * Reads a block of skb data at @consumed relative to the
 * lower offset specified to skb_prepare_seq_read(). Assigns
 * the head of the data block to @data and returns the length
 * of the block or 0 if the end of the skb data or the upper
 * offset has been reached.
 *
 * The caller is not required to consume all of the data
 * returned, i.e. @consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *       this limitation is the cost for zerocopy sequential
 *       reads of potentially non linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *       at the moment, state->root_skb could be replaced with
 *       a stack for this purpose.
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st)
{
        /* abs_offset: position of the next wanted byte (@consumed past the lower offset). */
        unsigned int block_limit, abs_offset = consumed + st->lower_offset;
        skb_frag_t *frag;

        /* At or beyond the upper offset: release any mapped fragment and end the read. */
        if (unlikely(abs_offset >= st->upper_offset)) {
                if (st->frag_data) {
                        kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }
                return 0;
        }

next_skb:
        /* End of cur_skb's linear area, expressed on the same scale as abs_offset. */
        block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

        /* Wanted byte is in the linear area and no fragment is mapped:
         * hand out the remainder of the linear data.
         */
        if (abs_offset < block_limit && !st->frag_data) {
                *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
                return block_limit - abs_offset;
        }

        /* NOTE(review): presumably fragments that must not be read by the
         * CPU; the read simply ends here - confirm against
         * skb_frags_readable().
         */
        if (!skb_frags_readable(st->cur_skb))
                return 0;

        /* First fragment pending and nothing mapped: account for the linear
         * area of cur_skb in stepped_offset before walking the fragments.
         */
        if (st->frag_idx == 0 && !st->frag_data)
                st->stepped_offset += skb_headlen(st->cur_skb);

        while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
                unsigned int pg_idx, pg_off, pg_sz;

                frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];

                /* By default the whole fragment is handed out as one chunk. */
                pg_idx = 0;
                pg_off = skb_frag_off(frag);
                pg_sz = skb_frag_size(frag);

                /* When the fragment has to be walked page by page, narrow the
                 * chunk to the page that contains frag_off: pg_idx selects
                 * the page, pg_off is the offset inside it and pg_sz is
                 * capped by both the fragment remainder and the page end.
                 */
                if (skb_frag_must_loop(skb_frag_page(frag))) {
                        pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
                        pg_off = offset_in_page(pg_off + st->frag_off);
                        pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
                                                    PAGE_SIZE - pg_off);
                }

                block_limit = pg_sz + st->stepped_offset;
                if (abs_offset < block_limit) {
                        /* Map the page only once; the mapping is kept in @st
                         * across calls until the chunk has been stepped over.
                         */
                        if (!st->frag_data)
                                st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);

                        *data = (u8 *)st->frag_data + pg_off +
                                (abs_offset - st->stepped_offset);

                        return block_limit - abs_offset;
                }

                /* Wanted byte is past this chunk: drop its mapping, if any. */
                if (st->frag_data) {
                        kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }

                /* Step over the chunk; advance to the next fragment once the
                 * current one has been consumed completely.
                 */
                st->stepped_offset += pg_sz;
                st->frag_off += pg_sz;
                if (st->frag_off == skb_frag_size(frag)) {
                        st->frag_off = 0;
                        st->frag_idx++;
                }
        }

        if (st->frag_data) {
                kunmap_atomic(st->frag_data);
                st->frag_data = NULL;
        }

        /* Fragments exhausted: descend into the root's frag_list first, then
         * follow ->next from skb to skb, restarting at the linear area.
         */
        if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
                st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
                st->frag_idx = 0;
                goto next_skb;
        } else if (st->cur_skb->next) {
                st->cur_skb = st->cur_skb->next;
                st->frag_idx = 0;
                goto next_skb;
        }

        /* No more buffers to visit. */
        return 0;
}
EXPORT_SYMBOL(skb_seq_read);

/**
 * skb_abort_seq_read - Abort a sequential read of skb data
 * @st: state variable
 *
 * Must be called if skb_seq_read() was not called until it
 * returned 0. Releases the fragment mapping still held by @st, if
 * any, and clears it, so that aborting twice or calling
 * skb_seq_read() again on the same state cannot unmap it a second
 * time.
 */
void skb_abort_seq_read(struct skb_seq_state *st)
{
        if (!st->frag_data)
                return;

        kunmap_atomic(st->frag_data);
        /* Same as every unmap site in skb_seq_read(): forget the stale mapping. */
        st->frag_data = NULL;
}
EXPORT_SYMBOL(skb_abort_seq_read);

/**
 * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
 * @st: source skb_seq_state
 * @offset: offset in source
 * @to: destination buffer
 * @len: number of bytes to copy
 *
 * Gathers @len bytes, starting @offset bytes into the source @st, and stores
 * them back to back in @to, pulling as many blocks from skb_seq_read() as it
 * takes. @offset must not go backwards between successive calls on the same
 * @st; if an earlier offset has to be read again, reset @st first.
 *
 * Return: 0 on success or -EINVAL if the copy ended early
 */
int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
{
        const u8 *chunk;
        u32 avail;

        while ((avail = skb_seq_read(offset, &chunk, st)) != 0) {
                /* This block covers everything that is still missing. */
                if (avail >= len) {
                        memcpy(to, chunk, len);
                        return 0;
                }

                /* Take the whole block and ask for the one that follows. */
                memcpy(to, chunk, avail);
                len -= avail;
                offset += avail;
                to += avail;
        }

        /* The source ran dry before @len bytes were copied. */
        return -EINVAL;
}
EXPORT_SYMBOL(skb_copy_seq_read);

/* View the control buffer (cb) of a textsearch state as a struct skb_seq_state. */
#define TS_SKB_CB(state)        ((struct skb_seq_state *) &((state)->cb))

/* Textsearch get_next_block callback: fetch the next block via skb_seq_read(). */
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
                                          struct ts_config *conf,
                                          struct ts_state *state)
{
        struct skb_seq_state *seq = TS_SKB_CB(state);

        return skb_seq_read(offset, text, seq);
}

/* Textsearch finish callback: abort the sequential read kept in @state. */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
        struct skb_seq_state *seq = TS_SKB_CB(state);

        skb_abort_seq_read(seq);
}

/**
 * skb_find_text - Find a text pattern in skb data
 * @skb: the buffer to look in
 * @from: search offset
 * @to: search limit
 * @config: textsearch configuration
 *
 * Searches the skb data between @from and @to for the pattern described
 * by @config. The sequential-read callbacks are installed into @config
 * before the search is run. Use textsearch_next() to retrieve subsequent
 * occurrences of the pattern. Returns the offset to the first occurrence
 * or UINT_MAX if no match was found.
 */
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config)
{
        unsigned int patlen = config->ops->get_pattern_len(config);
        struct ts_state state;
        struct skb_seq_state *seq = TS_SKB_CB(&state);
        unsigned int pos;

        /* The sequential-read state lives inside the textsearch control buffer. */
        BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));

        config->get_next_block = skb_ts_get_next_block;
        config->finish = skb_ts_finish;

        skb_prepare_seq_read(skb, from, to, seq);

        pos = textsearch_find(config, &state);

        /* A match only counts when the whole pattern fits inside the range. */
        if (pos + patlen > to - from)
                return UINT_MAX;

        return pos;
}
EXPORT_SYMBOL(skb_find_text);

/*
 * Append @size bytes at @offset of @page to the page fragments of @skb.
 *
 * If skb_can_coalesce() allows it, the last fragment is simply grown.
 * Otherwise a new fragment slot is filled (taking a reference on @page),
 * provided fewer than @max_frags fragments are in use.
 *
 * Returns 0 on success or -EMSGSIZE when no fragment slot is left.
 */
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size, size_t max_frags)
{
        int nr = skb_shinfo(skb)->nr_frags;

        if (skb_can_coalesce(skb, nr, page, offset)) {
                skb_frag_size_add(&skb_shinfo(skb)->frags[nr - 1], size);
                return 0;
        }

        if (nr >= max_frags)
                return -EMSGSIZE;

        skb_zcopy_downgrade_managed(skb);
        get_page(page);
        skb_fill_page_desc_noacc(skb, nr, page, offset, size);

        return 0;
}
EXPORT_SYMBOL_GPL(skb_append_pagefrags);

/**
 *        skb_pull_rcsum - pull skb and update receive checksum
 *        @skb: buffer to update
 *        @len: length of data pulled
 *
 *        Pulls @len bytes off the front of the packet and adjusts the
 *        CHECKSUM_COMPLETE checksum for the bytes that were removed.
 *        Use it on the receive path instead of skb_pull unless you know
 *        that the checksum difference is zero (e.g., a valid IP header)
 *        or you are setting ip_summed to CHECKSUM_NONE.
 *
 *        Returns the new start of the packet data. Pulling more than
 *        skb->len is a bug and trips BUG_ON().
 */
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
        /* Remember where the pulled bytes start; the checksum update needs them. */
        unsigned char *pulled = skb->data;

        BUG_ON(len > skb->len);

        __skb_pull(skb, len);
        skb_postpull_rcsum(skb, pulled, len);

        return skb->data;
}
EXPORT_SYMBOL_GPL(skb_pull_rcsum);

/*
 * Describe the linear data of @frag_skb as a page fragment: the head page
 * backing skb->head, the offset of skb->data inside that page and the
 * length of the linear area.
 */
static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
{
        struct page *head_page = virt_to_head_page(frag_skb->head);
        unsigned char *page_start = (unsigned char *)page_address(head_page);
        skb_frag_t desc;

        skb_frag_fill_page_desc(&desc, head_page,
                                frag_skb->data - page_start,
                                skb_headlen(frag_skb));

        return desc;
}

/*
 * Turn the frag_list of @skb into a ->next chained list of skbs headed by
 * @skb itself.
 *
 * Every skb found on the frag_list is detached, linked behind @skb, given a
 * copy of @skb's header state and of the first @offset bytes of @skb's data
 * (plus the tunnel header in front of them), and linearized when
 * skb_needs_linearize() asks for it. The length and truesize of the moved
 * skbs are subtracted from @skb.
 *
 * Returns @skb, with an extra reference taken and ->prev pointing at the
 * last list member, or ERR_PTR(-ENOMEM) on any failure.
 */
struct sk_buff *skb_segment_list(struct sk_buff *skb,
                                 netdev_features_t features,
                                 unsigned int offset)
{
        struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
        unsigned int tnl_hlen = skb_tnl_header_len(skb);
        unsigned int delta_truesize = 0;
        unsigned int delta_len = 0;
        struct sk_buff *tail = NULL;
        struct sk_buff *nskb, *tmp;
        int len_diff, err;

        /* Move skb->data so that it sits @offset bytes before the network header. */
        skb_push(skb, -skb_network_offset(skb) + offset);

        /* Ensure the head is writeable before touching the shared info */
        err = skb_unclone(skb, GFP_ATOMIC);
        if (err)
                goto err_linearize;

        /* From here on the list members are owned via list_skb / skb->next. */
        skb_shinfo(skb)->frag_list = NULL;

        while (list_skb) {
                nskb = list_skb;
                list_skb = list_skb->next;

                err = 0;
                delta_truesize += nskb->truesize;
                /* A shared member is replaced by a private, unshared clone. */
                if (skb_shared(nskb)) {
                        tmp = skb_clone(nskb, GFP_ATOMIC);
                        if (tmp) {
                                consume_skb(nskb);
                                nskb = tmp;
                                err = skb_unclone(nskb, GFP_ATOMIC);
                        } else {
                                err = -ENOMEM;
                        }
                }

                /* Link the member in before checking err so that the error
                 * path below can free it together with the rest.
                 */
                if (!tail)
                        skb->next = nskb;
                else
                        tail->next = nskb;

                if (unlikely(err)) {
                        /* Re-attach the unprocessed remainder so that
                         * kfree_skb_list() releases it as well.
                         */
                        nskb->next = list_skb;
                        goto err_linearize;
                }

                tail = nskb;

                delta_len += nskb->len;

                skb_push(nskb, -skb_network_offset(nskb) + offset);

                skb_release_head_state(nskb);
                /* Difference in network header length between member and head,
                 * sampled before the header copy and applied to the
                 * transport header offset afterwards.
                 */
                len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
                __copy_skb_header(nskb, skb);

                skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
                nskb->transport_header += len_diff;
                /* Copy the head's leading bytes (tunnel header + @offset) in front of nskb's data. */
                skb_copy_from_linear_data_offset(skb, -tnl_hlen,
                                                 nskb->data - tnl_hlen,
                                                 offset + tnl_hlen);

                if (skb_needs_linearize(nskb, features) &&
                    __skb_linearize(nskb))
                        goto err_linearize;
        }

        /* The head no longer accounts for the data that moved to the list. */
        skb->truesize = skb->truesize - delta_truesize;
        skb->data_len = skb->data_len - delta_len;
        skb->len = skb->len - delta_len;

        skb_gso_reset(skb);

        /* Remember the last list member in ->prev. */
        skb->prev = tail;

        if (skb_needs_linearize(skb, features) &&
            __skb_linearize(skb))
                goto err_linearize;

        skb_get(skb);

        return skb;

err_linearize:
        /* Shared by every failure above; always reported as -ENOMEM. */
        kfree_skb_list(skb->next);
        skb->next = NULL;
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(skb_segment_list);

/**
 *        skb_segment - Perform protocol segmentation on skb.
 *        @head_skb: buffer to segment
 *        @features: features for the output path (see dev->features)
 *
 *        This function performs segmentation on the given skb.  It returns
 *        a pointer to the first in a list of new skbs for the segments.
 *        In case of error it returns ERR_PTR(err).
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
                            netdev_features_t features)
{
        struct sk_buff *segs = NULL;
        struct sk_buff *tail = NULL;
        struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
        unsigned int mss = skb_shinfo(head_skb)->gso_size;
        /* Number of header bytes between the MAC header and skb->data. */
        unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
        unsigned int offset = doffset;
        unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
        unsigned int partial_segs = 0;
        unsigned int headroom;
        unsigned int len = head_skb->len;
        struct sk_buff *frag_skb;
        skb_frag_t *frag;
        __be16 proto;
        bool csum, sg;
        int err = -ENOMEM;
        int i = 0;
        int nfrags, pos;

        if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
            mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
                struct sk_buff *check_skb;

                for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
                        if (skb_headlen(check_skb) && !check_skb->head_frag) {
                                /* gso_size is untrusted, and we have a frag_list with
                                 * a linear non head_frag item.
                                 *
                                 * If head_skb's headlen does not fit requested gso_size,
                                 * it means that the frag_list members do NOT terminate
                                 * on exact gso_size boundaries. Hence we cannot perform
                                 * skb_frag_t page sharing. Therefore we must fallback to
                                 * copying the frag_list skbs; we do so by disabling SG.
                                 */
                                features &= ~NETIF_F_SG;
                                break;
                        }
                }
        }

        /* Expose the headers again: from here on offsets into head_skb
         * include the doffset header bytes.
         */
        __skb_push(head_skb, doffset);
        proto = skb_network_protocol(head_skb, NULL);
        if (unlikely(!proto))
                return ERR_PTR(-EINVAL);

        sg = !!(features & NETIF_F_SG);
        csum = !!can_checksum_protocol(features, proto);

        if (sg && csum && (mss != GSO_BY_FRAGS))  {
                if (!(features & NETIF_F_GSO_PARTIAL)) {
                        struct sk_buff *iter;
                        unsigned int frag_len;

                        if (!list_skb ||
                            !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
                                goto normal;

                        /* If we get here then all the required
                         * GSO features except frag_list are supported.
                         * Try to split the SKB to multiple GSO SKBs
                         * with no frag_list.
                         * Currently we can do that only when the buffers don't
                         * have a linear part and all the buffers except
                         * the last are of the same length.
                         */
                        frag_len = list_skb->len;
                        skb_walk_frags(head_skb, iter) {
                                if (frag_len != iter->len && iter->next)
                                        goto normal;
                                if (skb_headlen(iter) && !iter->head_frag)
                                        goto normal;

                                len -= iter->len;
                        }

                        if (len != frag_len)
                                goto normal;
                }

                /* GSO partial only requires that we trim off any excess that
                 * doesn't fit into an MSS sized block, so take care of that
                 * now.
                 * Cap len to not accidentally hit GSO_BY_FRAGS.
                 */
                partial_segs = min(len, GSO_BY_FRAGS - 1) / mss;
                if (partial_segs > 1)
                        mss *= partial_segs;
                else
                        partial_segs = 0;
        }

normal:
        headroom = skb_headroom(head_skb);
        /* pos tracks how far the (frag_skb, frag, i) cursor has advanced,
         * on the same scale as offset.
         */
        pos = skb_headlen(head_skb);

        if (skb_orphan_frags(head_skb, GFP_ATOMIC))
                return ERR_PTR(-ENOMEM);

        nfrags = skb_shinfo(head_skb)->nr_frags;
        frag = skb_shinfo(head_skb)->frags;
        frag_skb = head_skb;

        /* One iteration per output segment. */
        do {
                struct sk_buff *nskb;
                skb_frag_t *nskb_frag;
                int hsize;
                int size;

                /* Payload length of this segment: the next frag_list member
                 * in GSO_BY_FRAGS mode, otherwise at most mss bytes of what
                 * is left.
                 */
                if (unlikely(mss == GSO_BY_FRAGS)) {
                        len = list_skb->len;
                } else {
                        len = head_skb->len - offset;
                        if (len > mss)
                                len = mss;
                }

                /* Bytes of head_skb's linear area that are still unconsumed. */
                hsize = skb_headlen(head_skb) - offset;

                if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
                    (skb_headlen(list_skb) == len || sg)) {
                        /* Linear data and fragments of the current source are
                         * used up and the next frag_list member carries linear
                         * data: build the segment from a clone of that member.
                         */
                        BUG_ON(skb_headlen(list_skb) > len);

                        nskb = skb_clone(list_skb, GFP_ATOMIC);
                        if (unlikely(!nskb))
                                goto err;

                        i = 0;
                        nfrags = skb_shinfo(list_skb)->nr_frags;
                        frag = skb_shinfo(list_skb)->frags;
                        frag_skb = list_skb;
                        pos += skb_headlen(list_skb);

                        /* Skip the fragments that fit completely into this segment. */
                        while (pos < offset + len) {
                                BUG_ON(i >= nfrags);

                                size = skb_frag_size(frag);
                                if (pos + size > offset + len)
                                        break;

                                i++;
                                pos += size;
                                frag++;
                        }

                        list_skb = list_skb->next;

                        if (unlikely(pskb_trim(nskb, len))) {
                                kfree_skb(nskb);
                                goto err;
                        }

                        /* hsize temporarily holds the old end offset so that a
                         * head reallocation by skb_cow_head() can be added to
                         * truesize below.
                         */
                        hsize = skb_end_offset(nskb);
                        if (skb_cow_head(nskb, doffset + headroom)) {
                                kfree_skb(nskb);
                                goto err;
                        }

                        nskb->truesize += skb_end_offset(nskb) - hsize;
                        skb_release_head_state(nskb);
                        __skb_push(nskb, doffset);
                } else {
                        /* Allocate a fresh skb: headers plus hsize bytes of
                         * linear payload (all of it when SG is unavailable).
                         */
                        if (hsize < 0)
                                hsize = 0;
                        if (hsize > len || !sg)
                                hsize = len;

                        nskb = __alloc_skb(hsize + doffset + headroom,
                                           GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
                                           NUMA_NO_NODE);

                        if (unlikely(!nskb))
                                goto err;

                        skb_reserve(nskb, headroom);
                        __skb_put(nskb, doffset);
                }

                /* Append the new segment to the result list. */
                if (segs)
                        tail->next = nskb;
                else
                        segs = nskb;
                tail = nskb;

                __copy_skb_header(nskb, head_skb);

                skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
                skb_reset_mac_len(nskb);

                /* Copy the tunnel header and the doffset header bytes from head_skb. */
                skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
                                                 nskb->data - tnl_hlen,
                                                 doffset + tnl_hlen);

                /* Segment already holds headers plus its full payload: only
                 * the checksum step is left.
                 */
                if (nskb->len == len + doffset)
                        goto perform_csum_check;

                if (!sg) {
                        /* No scatter-gather: copy the payload into the linear
                         * area, checksumming on the fly when the checksum
                         * cannot be offloaded.
                         */
                        if (!csum) {
                                if (!nskb->remcsum_offload)
                                        nskb->ip_summed = CHECKSUM_NONE;
                                SKB_GSO_CB(nskb)->csum =
                                        skb_copy_and_csum_bits(head_skb, offset,
                                                               skb_put(nskb,
                                                                       len),
                                                               len);
                                SKB_GSO_CB(nskb)->csum_start =
                                        skb_headroom(nskb) + doffset;
                        } else {
                                if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
                                        goto err;
                        }
                        continue;
                }

                nskb_frag = skb_shinfo(nskb)->frags;

                /* Linear part of the payload is copied; the rest is attached
                 * as referenced page fragments below.
                 */
                skb_copy_from_linear_data_offset(head_skb, offset,
                                                 skb_put(nskb, hsize), hsize);

                skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
                                           SKBFL_SHARED_FRAG;

                if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
                        goto err;

                while (pos < offset + len) {
                        if (i >= nfrags) {
                                /* Current source is out of fragments: move the
                                 * cursor to the next frag_list member.
                                 */
                                if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
                                    skb_zerocopy_clone(nskb, list_skb,
                                                       GFP_ATOMIC))
                                        goto err;

                                i = 0;
                                nfrags = skb_shinfo(list_skb)->nr_frags;
                                frag = skb_shinfo(list_skb)->frags;
                                frag_skb = list_skb;
                                if (!skb_headlen(list_skb)) {
                                        BUG_ON(!nfrags);
                                } else {
                                        BUG_ON(!list_skb->head_frag);

                                        /* to make room for head_frag. */
                                        i--;
                                        frag--;
                                }

                                list_skb = list_skb->next;
                        }

                        if (unlikely(skb_shinfo(nskb)->nr_frags >=
                                     MAX_SKB_FRAGS)) {
                                net_warn_ratelimited(
                                        "skb_segment: too many frags: %u %u\n",
                                        pos, mss);
                                err = -EINVAL;
                                goto err;
                        }

                        /* i < 0 marks the head_frag slot reserved above: use
                         * the member's linear head as a page fragment.
                         */
                        *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
                        __skb_frag_ref(nskb_frag);
                        size = skb_frag_size(nskb_frag);

                        /* Fragment starts before this segment: trim its front. */
                        if (pos < offset) {
                                skb_frag_off_add(nskb_frag, offset - pos);
                                skb_frag_size_sub(nskb_frag, offset - pos);
                        }

                        skb_shinfo(nskb)->nr_frags++;

                        if (pos + size <= offset + len) {
                                i++;
                                frag++;
                                pos += size;
                        } else {
                                /* Fragment reaches past this segment: trim its
                                 * tail and leave the cursor on it for the next
                                 * segment.
                                 */
                                skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
                                goto skip_fraglist;
                        }

                        nskb_frag++;
                }

skip_fraglist:
                /* Account for the payload that was attached as fragments. */
                nskb->data_len = len - hsize;
                nskb->len += nskb->data_len;
                nskb->truesize += nskb->data_len;

perform_csum_check:
                if (!csum) {
                        /* Checksum in software and record it in the GSO control block. */
                        if (skb_has_shared_frag(nskb) &&
                            __skb_linearize(nskb))
                                goto err;

                        if (!nskb->remcsum_offload)
                                nskb->ip_summed = CHECKSUM_NONE;
                        SKB_GSO_CB(nskb)->csum =
                                skb_checksum(nskb, doffset,
                                             nskb->len - doffset, 0);
                        SKB_GSO_CB(nskb)->csum_start =
                                skb_headroom(nskb) + doffset;
                }
        } while ((offset += len) < head_skb->len);

        /* Some callers want to get the end of the list.
         * Put it in segs->prev to avoid walking the list.
         * (see validate_xmit_skb_list() for example)
         */
        segs->prev = tail;

        if (partial_segs) {
                struct sk_buff *iter;
                int type = skb_shinfo(head_skb)->gso_type;
                unsigned short gso_size = skb_shinfo(head_skb)->gso_size;

                /* Update type to add partial and then remove dodgy if set */
                type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
                type &= ~SKB_GSO_DODGY;

                /* Update GSO info and prepare to start updating headers on
                 * our way back down the stack of protocols.
                 */
                for (iter = segs; iter; iter = iter->next) {
                        skb_shinfo(iter)->gso_size = gso_size;
                        skb_shinfo(iter)->gso_segs = partial_segs;
                        skb_shinfo(iter)->gso_type = type;
                        SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
                }

                /* The last segment may be shorter: clear gso_size when it
                 * fits into a single gso_size block, otherwise recompute its
                 * segment count.
                 */
                if (tail->len - doffset <= gso_size)
                        skb_shinfo(tail)->gso_size = 0;
                else if (tail != segs)
                        skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
        }

        /* Following permits correct backpressure, for protocols
         * using skb_set_owner_w().
         * Idea is to transfer ownership from head_skb to last segment.
         */
        if (head_skb->destructor == sock_wfree) {
                swap(tail->truesize, head_skb->truesize);
                swap(tail->destructor, head_skb->destructor);
                swap(tail->sk, head_skb->sk);
        }
        return segs;

err:
        /* Drop every segment built so far; err is -ENOMEM unless set otherwise above. */
        kfree_skb_list(segs);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);

#ifdef CONFIG_SKB_EXTENSIONS
#define SKB_EXT_ALIGN_VALUE        8
#define SKB_EXT_CHUNKSIZEOF(x)        (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)

/* Size, in chunks, of the structure backing each skb extension type.
 * An entry is only present when the corresponding config option is enabled.
 */
static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
#endif
#ifdef CONFIG_XFRM
        [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
        [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
#endif
#if IS_ENABLED(CONFIG_INET_PSP)
        [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
#endif
};

/* Total size, in chunks, of the struct skb_ext header plus every entry of
 * skb_ext_type_len[].
 *
 * NOTE(review): the result is used inside BUILD_BUG_ON() in
 * skb_extensions_init(), so this presumably has to fold to a compile-time
 * constant - keep the loop simple.
 */
static __always_inline unsigned int skb_ext_total_length(void)
{
        unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
        int i;

        for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++)
                l += skb_ext_type_len[i];

        return l;
}

/* Create the slab cache that backs skb extensions. Each object is sized to
 * hold the skb_ext header together with every enabled extension type.
 */
static void skb_extensions_init(void)
{
        unsigned int obj_size;

        BUILD_BUG_ON(SKB_EXT_NUM >= 8);
#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
        /* The total, counted in chunks, has to stay within 255. */
        BUILD_BUG_ON(skb_ext_total_length() > 255);
#endif

        /* Convert the chunk count back into bytes. */
        obj_size = SKB_EXT_ALIGN_VALUE * skb_ext_total_length();

        skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", obj_size, 0,
                                             SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                             NULL);
}
#else
/* Without CONFIG_SKB_EXTENSIONS there is no extension cache to set up. */
static void skb_extensions_init(void) {}
#endif

/* The SKB kmem_cache slab is critical for network performance.  Never
 * merge/alias the slab with similar sized objects.  This avoids fragmentation
 * that hurts performance of kmem_cache_{alloc,free}_bulk APIs.
 *
 * FLAG_SKB_NO_MERGE is the slab flag that requests this; it expands to 0
 * under CONFIG_SLUB_TINY.
 */
#ifndef CONFIG_SLUB_TINY
#define FLAG_SKB_NO_MERGE        SLAB_NO_MERGE
#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
#define FLAG_SKB_NO_MERGE        0
#endif

/* Create the slab caches used for sk_buff heads, sk_buff_fclones objects and
 * small skb heads, then set up the skb extension cache.
 */
void __init skb_init(void)
{
        struct kmem_cache *cache;

        /* sk_buff heads: only the cb[] area is whitelisted for usercopy. */
        cache = kmem_cache_create_usercopy("skbuff_head_cache",
                                           sizeof(struct sk_buff),
                                           0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                           FLAG_SKB_NO_MERGE,
                                           offsetof(struct sk_buff, cb),
                                           sizeof_field(struct sk_buff, cb),
                                           NULL);
        net_hotdata.skbuff_cache = cache;

        /* Objects of type struct sk_buff_fclones. */
        cache = kmem_cache_create("skbuff_fclone_cache",
                                  sizeof(struct sk_buff_fclones),
                                  0,
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                  NULL);
        net_hotdata.skbuff_fclone_cache = cache;

        /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
         * struct skb_shared_info is located at the end of skb->head,
         * and should not be copied to/from user.
         */
        cache = kmem_cache_create_usercopy("skbuff_small_head",
                                           SKB_SMALL_HEAD_CACHE_SIZE,
                                           0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                           0,
                                           SKB_SMALL_HEAD_HEADROOM,
                                           NULL);
        net_hotdata.skb_small_head_cache = cache;

        skb_extensions_init();
}

/* Recursive worker behind skb_to_sgvec() and skb_to_sgvec_nomark().
 *
 * Maps @len bytes of @skb, starting @offset bytes into its data, onto
 * consecutive entries of @sg: first the linear area, then each page
 * fragment, then (recursively) every skb on the frag list.
 *
 * Throughout, [start, end) is the byte range of the piece currently being
 * looked at, and copy is how many bytes of that piece lie at or after
 * @offset (clamped to the remaining @len).
 *
 * @recursion_level is the current frag-list nesting depth; at 24 or more the
 * function gives up with -EMSGSIZE.
 *
 * Returns the number of @sg entries filled in, or -EMSGSIZE when the
 * previously written entry is already marked as the last one of the list
 * or when the recursion limit is reached.  Hits BUG_ON() if the skb is
 * exhausted while @len bytes are still unmapped.
 */
static int
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
               unsigned int recursion_level)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        int elt = 0;

        if (unlikely(recursion_level >= 24))
                return -EMSGSIZE;

        /* Part of the requested range lies in the linear area. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                sg_set_buf(sg, skb->data + offset, copy);
                elt++;
                if ((len -= copy) == 0)
                        return elt;
                offset += copy;
        }

        /* Page fragments: one sg entry per fragment that overlaps the range. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
                if ((copy = end - offset) > 0) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        /* The previous entry is the list terminator: no room. */
                        if (unlikely(elt && sg_is_last(&sg[elt - 1])))
                                return -EMSGSIZE;

                        if (copy > len)
                                copy = len;
                        /* offset - start is the position inside this fragment. */
                        sg_set_page(&sg[elt], skb_frag_page(frag), copy,
                                    skb_frag_off(frag) + offset - start);
                        elt++;
                        if (!(len -= copy))
                                return elt;
                        offset += copy;
                }
                start = end;
        }

        /* Frag list: recurse into each chained skb that overlaps the range. */
        skb_walk_frags(skb, frag_iter) {
                int end, ret;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (unlikely(elt && sg_is_last(&sg[elt - 1])))
                                return -EMSGSIZE;

                        if (copy > len)
                                copy = len;
                        /* The child fills entries starting at sg[elt]. */
                        ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
                                              copy, recursion_level + 1);
                        if (unlikely(ret < 0))
                                return ret;
                        elt += ret;
                        if ((len -= copy) == 0)
                                return elt;
                        offset += copy;
                }
                start = end;
        }
        /* Every piece was visited but bytes remain: the request was out of range. */
        BUG_ON(len);
        return elt;
}

/**
 *        skb_to_sgvec - Fill a scatter-gather list from a socket buffer
 *        @skb: Socket buffer containing the buffers to be mapped
 *        @sg: The scatter-gather list to map into
 *        @offset: The offset into the buffer's contents to start mapping
 *        @len: Length of buffer space to be mapped
 *
 *        Map a region of the buffer space attached to @skb onto the entries
 *        of @sg and mark the last entry used as the end of the list.
 *
 *        Return: the number of scatterlist items used, or -EMSGSIZE if the
 *        contents could not fit.
 */
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
        int used = __skb_to_sgvec(skb, sg, offset, len, 0);

        /* Terminate the list only when at least one entry was filled in;
         * errors and an empty mapping are passed through untouched.
         */
        if (used > 0)
                sg_mark_end(&sg[used - 1]);

        return used;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec);

/* Unlike skb_to_sgvec(), skb_to_sgvec_nomark() only maps the skb onto the
 * given sglist; it does not mark the sg entry holding the last skb data as
 * the end of the list.  The caller can therefore manipulate the sg list at
 * will and append new data after the first call without having to call
 * sg_unmark_end() to extend the list.
 *
 * Scenario for using skb_to_sgvec_nomark():
 * 1. sg_init_table
 * 2. skb_to_sgvec_nomark(payload1)
 * 3. skb_to_sgvec_nomark(payload2)
 *
 * This is equivalent to:
 * 1. sg_init_table
 * 2. skb_to_sgvec(payload1)
 * 3. sg_unmark_end
 * 4. skb_to_sgvec(payload2)
 *
 * When several payloads are mapped conditionally, skb_to_sgvec_nomark()
 * is preferable.
 */
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                        int offset, int len)
{
        int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);

        return nsg;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);



/**
 *        skb_cow_data - Check that a socket buffer's data buffers are writable
 *        @skb: The socket buffer to check.
 *        @tailbits: Amount of trailing space to be added
 *        @trailer: Returned pointer to the skb where the @tailbits space begins
 *
 *        Make sure that the data buffers attached to a socket buffer are
 *        writable. If they are not, private copies are made of the data buffers
 *        and the socket buffer is set to use these instead.
 *
 *        If @tailbits is given, make sure that there is space to write @tailbits
 *        bytes of data beyond current end of socket buffer.  @trailer will be
 *        set to point to the skb in which this space begins.
 *
 *        The number of scatterlist elements required to completely map the
 *        COW'd and extended socket buffer will be returned: 1 when @skb has
 *        no frag list, otherwise 1 plus the number of skbs on the frag list.
 *        -ENOMEM is returned when a reallocation or copy fails.
 */
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
{
        int copyflag;
        int elt;
        struct sk_buff *skb1, **skb_p;

        /* If skb is cloned or its head is paged, reallocate
         * head pulling out all the pages (pages are considered not writable
         * at the moment even if they are anonymous).
         */
        if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
            !__pskb_pull_tail(skb, __skb_pagelen(skb)))
                return -ENOMEM;

        /* Easy case. Most of packets will go this way. */
        if (!skb_has_frag_list(skb)) {
                /* Not enough room for the trailer.  This should not happen
                 * when the stack is tuned to generate good frames.  On a miss
                 * we reallocate and reserve 128 bytes more than needed.
                 */

                if (skb_tailroom(skb) < tailbits &&
                    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
                        return -ENOMEM;

                /* The head skb itself carries the trailer space. */
                *trailer = skb;
                return 1;
        }

        /* Hard case: walk the frag list and make every fragment writable. */

        elt = 1;
        skb_p = &skb_shinfo(skb)->frag_list;
        copyflag = 0;

        while ((skb1 = *skb_p) != NULL) {
                int ntail = 0;

                /* The fragment is partially pulled by someone,
                 * this can happen on input. Copy it and everything
                 * after it (copyflag stays set for the rest of the walk).
                 */

                if (skb_shared(skb1))
                        copyflag = 1;

                /* If the skb is the last, worry about trailer. */

                if (skb1->next == NULL && tailbits) {
                        if (skb_shinfo(skb1)->nr_frags ||
                            skb_has_frag_list(skb1) ||
                            skb_tailroom(skb1) < tailbits)
                                ntail = tailbits + 128;
                }

                if (copyflag ||
                    skb_cloned(skb1) ||
                    ntail ||
                    skb_shinfo(skb1)->nr_frags ||
                    skb_has_frag_list(skb1)) {
                        struct sk_buff *skb2;

                        /* This fragment cannot be used in place: replace it
                         * with a private copy, expanded by ntail bytes of
                         * tailroom when it has to hold the trailer.
                         */
                        if (ntail == 0)
                                skb2 = skb_copy(skb1, GFP_ATOMIC);
                        else
                                skb2 = skb_copy_expand(skb1,
                                                       skb_headroom(skb1),
                                                       ntail,
                                                       GFP_ATOMIC);
                        if (unlikely(skb2 == NULL))
                                return -ENOMEM;

                        /* Keep the copy owned by the same socket, if any. */
                        if (skb1->sk)
                                skb_set_owner_w(skb2, skb1->sk);

                        /* Link the new skb in place of the old one, then
                         * drop the old one.
                         */

                        skb2->next = skb1->next;
                        *skb_p = skb2;
                        kfree_skb(skb1);
                        skb1 = skb2;
                }
                elt++;
                /* After the loop this points at the last skb on the list. */
                *trailer = skb1;
                skb_p = &skb1->next;
        }

        return elt;
}
EXPORT_SYMBOL_GPL(skb_cow_data);

/* skb destructor used for error-queue skbs: subtract the skb's truesize
 * from the sk_rmem_alloc counter of the socket it is attached to.
 */
static void sock_rmem_free(struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
}

/* Tag @skb as sitting on a socket error queue. */
static void skb_set_err_queue(struct sk_buff *skb)
{
        /* The marker value must be distinguishable from a zeroed pkt_type. */
        BUILD_BUG_ON(PACKET_OUTGOING == 0);

        /* skbs received on local sockets never carry PACKET_OUTGOING as
         * their pkt_type, so that value can safely be (mis)used as the
         * "on the error queue" marker.
         */
        skb->pkt_type = PACKET_OUTGOING;
}

/*
 * Append @skb to the error queue of @sk.
 *
 * Note: error packets are not memory charged (no sk_forward_alloc changes);
 * only sk_rmem_alloc is adjusted by the skb's truesize.
 *
 * Returns 0 once the skb is queued, or -ENOMEM when adding its truesize
 * would reach the socket's receive buffer limit; in that case @skb has not
 * been modified and still belongs to the caller.
 */
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
        unsigned int rcvbuf = (unsigned int)READ_ONCE(sk->sk_rcvbuf);

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= rcvbuf)
                return -ENOMEM;

        /* Detach from the previous owner, then account the skb to @sk with
         * sock_rmem_free() as the destructor that undoes the accounting.
         */
        skb_orphan(skb);
        skb->destructor = sock_rmem_free;
        skb->sk = sk;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        skb_set_err_queue(skb);

        /* before exiting rcu section, make sure dst is refcounted */
        skb_dst_force(skb);

        skb_queue_tail(&sk->sk_error_queue, skb);

        /* Report the error unless the socket is flagged SOCK_DEAD. */
        if (!sock_flag(sk, SOCK_DEAD))
                sk_error_report(sk);

        return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);

static bool is_icmp_err_skb(const struct sk_buff *skb)
{
        return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
                       SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
}

struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
        struct sk_buff_head *q = &sk->sk_error_queue;
        struct sk_buff *skb, *skb_next = NULL;
        bool icmp_next = false;
        unsigned long flags;

        if (skb_queue_empty_lockless(q))
                return NULL;

        spin_lock_irqsave(&q->lock, flags);
        skb = __skb_dequeue(q);
        if (skb && (skb_next = skb_peek(q))) {
                icmp_next = is_icmp_err_skb(skb_next);
                if (icmp_next)
                        sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
        }
        spin_unlock_irqrestore(&q->lock, flags);

        if (is_icmp_err_skb(skb) && !icmp_next)
                sk->sk_err = 0;

        if (skb_next)
                sk_error_report(sk);

        return skb;
}
EXPORT_SYMBOL(sock_dequeue_err_skb);

/**
 * skb_clone_sk - create clone of skb, and take reference to socket
 * @skb: the skb to clone
 *
 * Clones @skb and makes the clone hold a reference on sk_refcnt of the
 * socket @skb is attached to.  Buffers created via this function are meant
 * to be returned using sock_queue_err_skb, or freed via kfree_skb.
 *
 * When passing buffers allocated with this function to sock_queue_err_skb
 * it is necessary to wrap the call with sock_hold/sock_put in order to
 * prevent the socket from being released prior to being enqueued on
 * the sk_error_queue.
 *
 * Return: the clone, or NULL when @skb has no socket, the socket's
 * refcount is already zero, or the clone could not be allocated.
 */
struct sk_buff *skb_clone_sk(struct sk_buff *skb)
{
        struct sock *owner = skb->sk;
        struct sk_buff *copy;

        if (!owner)
                return NULL;
        if (!refcount_inc_not_zero(&owner->sk_refcnt))
                return NULL;

        copy = skb_clone(skb, GFP_ATOMIC);
        if (copy) {
                /* The clone now carries the reference taken above. */
                copy->sk = owner;
                copy->destructor = sock_efree;
                return copy;
        }

        /* Cloning failed: give the socket reference back. */
        sock_put(owner);
        return NULL;
}
EXPORT_SYMBOL(skb_clone_sk);

static void __skb_complete_tx_timestamp(struct sk_buff *skb,
                                        struct sock *sk,
                                        int tstype,
                                        bool opt_stats)
{
        struct sock_exterr_skb *serr;
        int err;

        BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));

        serr = SKB_EXT_ERR(skb);
        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
        serr->ee.ee_info = tstype;
        serr->opt_stats = opt_stats;
        serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
        if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
                serr->ee.ee_data = skb_shinfo(skb)->tskey;
                if (sk_is_tcp(sk))
                        serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
        }

        err = sock_queue_err_skb(sk, skb);

        if (err)
                kfree_skb(skb);
}

/* Decide whether a tx timestamp may be delivered to @sk.
 *
 * Always allowed when only the timestamp is reported (@tsonly) or when the
 * sysctl_tstamp_allow_data setting of the socket's netns is non-zero.
 * Otherwise the file backing the socket must have CAP_NET_RAW in
 * init_user_ns.
 */
static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
        bool allowed = false;

        if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
                return true;

        /* sk_socket and its file are inspected under sk_callback_lock. */
        read_lock_bh(&sk->sk_callback_lock);
        if (sk->sk_socket && sk->sk_socket->file)
                allowed = file_ns_capable(sk->sk_socket->file, &init_user_ns,
                                          CAP_NET_RAW);
        read_unlock_bh(&sk->sk_callback_lock);

        return allowed;
}

/* Deliver the hardware timestamp @hwtstamps for @skb to the socket the skb
 * is attached to, as an SCM_TSTAMP_SND report on its error queue.
 *
 * @skb is consumed in every case: it is either handed to
 * __skb_complete_tx_timestamp() or freed here.
 */
void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps)
{
        struct sock *sk = skb->sk;

        /* Take a reference to prevent skb_orphan() from freeing the socket,
         * but only if the socket refcount is not zero.
         */
        if (skb_may_tx_timestamp(sk, false) &&
            likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
                *skb_hwtstamps(skb) = *hwtstamps;
                __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
                sock_put(sk);
                return;
        }

        /* Not permitted, or the socket is already going away. */
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);

/* Tell whether a timestamp of type @tstype was requested for @skb, judging
 * by the flag that corresponds to that type.  Unknown types yield false.
 */
static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
                                                 struct skb_shared_hwtstamps *hwtstamps,
                                                 int tstype)
{
        if (tstype == SCM_TSTAMP_SCHED)
                return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;

        if (tstype == SCM_TSTAMP_SND) {
                /* Hardware and software send timestamps use different flags. */
                if (hwtstamps)
                        return skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF;
                return skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP;
        }

        if (tstype == SCM_TSTAMP_ACK)
                return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;

        if (tstype == SCM_TSTAMP_COMPLETION)
                return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;

        return false;
}

/* Translate @tstype into the matching BPF sock_ops timestamping callback
 * and invoke it through bpf_skops_tx_timestamping().  For a hardware send
 * timestamp, @hwtstamps is first copied into @skb.  Types without a
 * matching callback are ignored.
 */
static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
                                                  struct skb_shared_hwtstamps *hwtstamps,
                                                  struct sock *sk,
                                                  int tstype)
{
        int op;

        if (tstype == SCM_TSTAMP_SCHED) {
                op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
        } else if (tstype == SCM_TSTAMP_SND) {
                if (hwtstamps) {
                        *skb_hwtstamps(skb) = *hwtstamps;
                        op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
                } else {
                        op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
                }
        } else if (tstype == SCM_TSTAMP_ACK) {
                op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
        } else {
                return;
        }

        bpf_skops_tx_timestamping(sk, skb, op);
}

/* Generate a tx timestamp report of type @tstype for @orig_skb and queue it
 * on the error queue of @sk.
 *
 * @orig_skb:  the transmitted skb the timestamp refers to
 * @ack_skb:   only passed on to tcp_get_timestamping_opt_stats()
 * @hwtstamps: hardware timestamp, or NULL to use a software timestamp
 * @sk:        socket to report to; nothing is done when NULL
 * @tstype:    SCM_TSTAMP_* type of the timestamp
 *
 * When SKBTX_BPF is set on @orig_skb the BPF timestamping callback is run
 * first.  The socket report is then produced only if the flag matching
 * @tstype is set and skb_may_tx_timestamp() allows it.  The report is a
 * fresh data-less skb (SOF_TIMESTAMPING_OPT_TSONLY) or a clone of
 * @orig_skb.  Allocation failures are silently ignored.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb,
                     const struct sk_buff *ack_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype)
{
        struct sk_buff *skb;
        bool tsonly, opt_stats = false;
        u32 tsflags;

        if (!sk)
                return;

        if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
                skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
                                                      sk, tstype);

        if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
                return;

        /* Skip the software timestamp while SKBTX_IN_PROGRESS is set on the
         * skb, unless the socket set SOF_TIMESTAMPING_OPT_TX_SWHW.
         */
        tsflags = READ_ONCE(sk->sk_tsflags);
        if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
            skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
                return;

        tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
        if (!skb_may_tx_timestamp(sk, tsonly))
                return;

        if (tsonly) {
                /* Timestamp-only report: the payload is not looped back. */
#ifdef CONFIG_INET
                if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
                    sk_is_tcp(sk)) {
                        skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
                                                             ack_skb);
                        opt_stats = true;
                } else
#endif
                        /* Also the only branch when CONFIG_INET is off. */
                        skb = alloc_skb(0, GFP_ATOMIC);
        } else {
                skb = skb_clone(orig_skb, GFP_ATOMIC);

                /* NOTE(review): skb may be NULL here if skb_clone() failed;
                 * this relies on skb_orphan_frags_rx() and kfree_skb()
                 * tolerating a NULL skb - confirm.
                 */
                if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
                        kfree_skb(skb);
                        return;
                }
        }
        if (!skb)
                return;

        /* A fresh skb carries nothing from the original: copy over the
         * timestamp flags and key.
         */
        if (tsonly) {
                skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
                                             SKBTX_ANY_TSTAMP;
                skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
        }

        if (hwtstamps)
                *skb_hwtstamps(skb) = *hwtstamps;
        else
                __net_timestamp(skb);

        /* Queues skb on the error queue, or frees it on failure. */
        __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);

/* Report a send (SCM_TSTAMP_SND) timestamp for @orig_skb to the socket it
 * is attached to.
 *
 * @orig_skb:  the transmitted skb
 * @hwtstamps: hardware timestamp, or NULL to have a software timestamp
 *             generated
 *
 * Thin wrapper around __skb_tstamp_tx() with no ack skb.
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps)
{
        /* A void function must not "return" an expression (ISO C), so the
         * callee is simply invoked.
         */
        __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
                        SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);

#ifdef CONFIG_WIRELESS
/* Record the wifi ack status @acked in @skb and queue the skb on the error
 * queue of its socket as a SO_EE_ORIGIN_TXSTATUS report.  The skb is freed
 * when it cannot be queued.
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        struct sock *sk = skb->sk;
        bool queued = false;

        skb->wifi_acked_valid = 1;
        skb->wifi_acked = acked;

        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;

        /* Take a reference to prevent skb_orphan() from freeing the socket,
         * but only if the socket refcount is not zero.
         */
        if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
                queued = !sock_queue_err_skb(sk, skb);
                sock_put(sk);
        }

        if (!queued)
                kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
#endif /* CONFIG_WIRELESS */

/**
 * skb_partial_csum_set - set up and verify partial csum values for packet
 * @skb: the skb to set
 * @start: the number of bytes after skb->data to start checksumming.
 * @off: the offset from start to place the checksum.
 *
 * For untrusted partially-checksummed packets, we need to make sure the values
 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
 *
 * The values are accepted only when the checksum start (headroom + @start)
 * is below U16_MAX and the checksum field ends within the linear part of
 * the skb.  On success skb->ip_summed, csum_start, csum_offset and
 * transport_header are set.
 *
 * Return: true on success; on false a rate-limited warning has been logged
 * and you should drop the packet.
 */
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
        u32 csum_start = skb_headroom(skb) + (u32)start;
        u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);

        if (likely(csum_start < U16_MAX && csum_end <= skb_headlen(skb))) {
                skb->transport_header = csum_start;
                skb->csum_start = csum_start;
                skb->csum_offset = off;
                skb->ip_summed = CHECKSUM_PARTIAL;
                return true;
        }

        net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
                             start, off, skb_headroom(skb), skb_headlen(skb));
        return false;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);

/* Make sure at least @len bytes of @skb are in its linear area.
 *
 * When a pull is needed, pull up to @max bytes (clamped to the skb length)
 * so that later callers will not have to pull again.
 *
 * Returns 0 on success, -ENOMEM when the pull fails, or -EPROTO when the
 * linear area is still shorter than @len afterwards.
 */
static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
                               unsigned int max)
{
        unsigned int target;

        if (skb_headlen(skb) >= len)
                return 0;

        target = max > skb->len ? skb->len : max;

        if (!__pskb_pull_tail(skb, target - skb_headlen(skb)))
                return -ENOMEM;

        return skb_headlen(skb) < len ? -EPROTO : 0;
}

/* Largest TCP header: 15 32-bit words. */
#define MAX_TCP_HDR_LEN (15 * 4)

/* Set up partial checksumming for the TCP or UDP header that starts @off
 * bytes into @skb.
 *
 * Pulls the transport header into the linear area, then records the
 * checksum start/offset through skb_partial_csum_set().
 *
 * Returns a pointer to the transport header's checksum field, or an
 * ERR_PTR(): the error from skb_maybe_pull_tail(), or -EPROTO when the
 * checksum values are rejected or @proto is neither TCP nor UDP.
 */
static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
                                      typeof(IPPROTO_IP) proto,
                                      unsigned int off)
{
        int err;

        if (proto == IPPROTO_TCP) {
                err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
                                          off + MAX_TCP_HDR_LEN);
                if (err)
                        return ERR_PTR(err);

                if (!skb_partial_csum_set(skb, off,
                                          offsetof(struct tcphdr, check)))
                        return ERR_PTR(-EPROTO);

                return &tcp_hdr(skb)->check;
        }

        if (proto == IPPROTO_UDP) {
                err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
                                          off + sizeof(struct udphdr));
                if (err)
                        return ERR_PTR(err);

                if (!skb_partial_csum_set(skb, off,
                                          offsetof(struct udphdr, check)))
                        return ERR_PTR(-EPROTO);

                return &udp_hdr(skb)->check;
        }

        return ERR_PTR(-EPROTO);
}

/* This value should be large enough to cover a tagged ethernet header plus
 * maximally sized IP and TCP or UDP headers.
 */
#define MAX_IP_HDR_LEN 128

/* IPv4 flavour of skb_checksum_setup().
 *
 * Pulls the IP header into the linear area, rejects fragments with -EPROTO,
 * and sets up partial checksumming for the transport header that follows
 * the IP header.  With @recalculate the checksum field is seeded with the
 * complemented pseudo-header checksum.
 *
 * Returns 0 on success or a negative errno.
 */
static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
{
        unsigned int hdr_len;
        __sum16 *check;
        int rc;

        rc = skb_maybe_pull_tail(skb, sizeof(struct iphdr), MAX_IP_HDR_LEN);
        if (rc < 0)
                return rc;

        if (ip_is_fragment(ip_hdr(skb)))
                return -EPROTO;

        hdr_len = ip_hdrlen(skb);

        check = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, hdr_len);
        if (IS_ERR(check))
                return PTR_ERR(check);

        if (recalculate)
                *check = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
                                            ip_hdr(skb)->daddr,
                                            skb->len - hdr_len,
                                            ip_hdr(skb)->protocol, 0);

        return 0;
}

/* This value should be large enough to cover a tagged ethernet header plus
 * an IPv6 header, all options, and a maximal TCP or UDP header.
 */
#define MAX_IPV6_HDR_LEN 256

/* Pointer of type (type *) to the bytes located @off bytes past the
 * network header of @skb.
 */
#define OPT_HDR(type, skb, off) \
        (type *)(skb_network_header(skb) + (off))

/* IPv6 flavour of skb_checksum_setup().
 *
 * Walks the extension header chain (destination options, hop-by-hop,
 * routing, authentication and fragment headers) to find the upper-layer
 * header, pulling header bytes into the linear area as needed, and then
 * sets up partial checksumming for it via skb_checksum_setup_ip().  With
 * @recalculate the checksum field is seeded with the complemented
 * pseudo-header checksum.
 *
 * Returns 0 on success, the error from a failed pull, or -EPROTO when the
 * walk runs past the payload without reaching an upper-layer header, the
 * packet is a fragment, or the upper-layer protocol is not supported.
 */
static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
{
        int err;
        u8 nexthdr;
        unsigned int off;
        unsigned int len;
        bool fragment;
        bool done;
        __sum16 *csum;

        fragment = false;
        done = false;

        /* off tracks the position of the header currently being parsed. */
        off = sizeof(struct ipv6hdr);

        err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
        if (err < 0)
                goto out;

        nexthdr = ipv6_hdr(skb)->nexthdr;

        /* Total length claimed by the IPv6 header: fixed header + payload. */
        len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
        while (off <= len && !done) {
                switch (nexthdr) {
                case IPPROTO_DSTOPTS:
                case IPPROTO_HOPOPTS:
                case IPPROTO_ROUTING: {
                        struct ipv6_opt_hdr *hp;

                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct ipv6_opt_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
                        nexthdr = hp->nexthdr;
                        off += ipv6_optlen(hp);
                        break;
                }
                case IPPROTO_AH: {
                        struct ip_auth_hdr *hp;

                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct ip_auth_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct ip_auth_hdr, skb, off);
                        nexthdr = hp->nexthdr;
                        off += ipv6_authlen(hp);
                        break;
                }
                case IPPROTO_FRAGMENT: {
                        struct frag_hdr *hp;

                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct frag_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct frag_hdr, skb, off);

                        /* A non-zero offset or the "more fragments" bit
                         * means this is a fragment.
                         */
                        if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
                                fragment = true;

                        nexthdr = hp->nexthdr;
                        off += sizeof(struct frag_hdr);
                        break;
                }
                default:
                        /* Anything else ends the extension header walk. */
                        done = true;
                        break;
                }
        }

        err = -EPROTO;

        /* Walk ran past the payload, or the packet is a fragment. */
        if (!done || fragment)
                goto out;

        csum = skb_checksum_setup_ip(skb, nexthdr, off);
        if (IS_ERR(csum))
                return PTR_ERR(csum);

        if (recalculate)
                *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                         &ipv6_hdr(skb)->daddr,
                                         skb->len - off, nexthdr, 0);
        err = 0;

out:
        return err;
}

/**
 * skb_checksum_setup - set up partial checksum offset
 * @skb: the skb to set up
 * @recalculate: if true the pseudo-header checksum will be recalculated
 *
 * Dispatches on skb->protocol to the IPv4 or IPv6 specific helper.
 *
 * Return: the helper's result, or -EPROTO when skb->protocol is neither
 * ETH_P_IP nor ETH_P_IPV6.
 */
int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
{
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                return skb_checksum_setup_ipv4(skb, recalculate);
        case htons(ETH_P_IPV6):
                return skb_checksum_setup_ipv6(skb, recalculate);
        default:
                return -EPROTO;
        }
}
EXPORT_SYMBOL(skb_checksum_setup);

/**
 * skb_checksum_maybe_trim - maybe trims the given skb
 * @skb: the skb to check
 * @transport_len: the data length beyond the network header
 *
 * Checks whether the given skb has data beyond the given transport length.
 * If so, returns a cloned skb trimmed to this transport length.
 * Otherwise returns the provided skb. Returns NULL in error cases
 * (e.g. transport_len exceeds skb length or out-of-memory).
 *
 * Caller needs to set the skb transport header and free any returned skb if it
 * differs from the provided skb.
 */
static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
                                               unsigned int transport_len)
{
        unsigned int wanted = skb_transport_offset(skb) + transport_len;
        struct sk_buff *trimmed;

        /* The skb is shorter than the claimed transport length. */
        if (skb->len < wanted)
                return NULL;

        /* Exact fit: nothing to trim, hand back the original. */
        if (skb->len == wanted)
                return skb;

        trimmed = skb_clone(skb, GFP_ATOMIC);
        if (!trimmed)
                return NULL;

        if (pskb_trim_rcsum(trimmed, wanted)) {
                kfree_skb(trimmed);
                return NULL;
        }

        return trimmed;
}

/**
 * skb_checksum_trimmed - validate checksum of an skb
 * @skb: the skb to check
 * @transport_len: the data length beyond the network header
 * @skb_chkf: checksum function to use
 *
 * Applies the given checksum function skb_chkf to the provided skb.
 * Returns a checked and maybe trimmed skb. Returns NULL on error.
 *
 * If the skb has data beyond the given transport length, then a
 * trimmed & cloned skb is checked and returned.
 *
 * Caller needs to set the skb transport header and free any returned skb if it
 * differs from the provided skb.
 */
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb))
{
        unsigned int offset = skb_transport_offset(skb);
        struct sk_buff *skb_chk;
        __sum16 ret;

        skb_chk = skb_checksum_maybe_trim(skb, transport_len);
        if (!skb_chk)
                return NULL;

        if (pskb_may_pull(skb_chk, offset)) {
                /* Run the check with skb->data at the transport header,
                 * then restore the original position.
                 */
                skb_pull_rcsum(skb_chk, offset);
                ret = skb_chkf(skb_chk);
                skb_push_rcsum(skb_chk, offset);

                if (!ret)
                        return skb_chk;
        }

        /* Failure: release the clone, but never the caller's skb. */
        if (skb_chk != skb)
                kfree_skb(skb_chk);

        return NULL;
}
EXPORT_SYMBOL(skb_checksum_trimmed);

/* Emit a rate-limited warning, prefixed with the name of @skb's device,
 * that received packets cannot be forwarded while LRO is enabled.
 */
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
        const char *dev_name = skb->dev->name;

        net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
                             dev_name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);

/* Free @skb.  When @head_stolen is true only the head state is released
 * and the struct sk_buff itself is returned to the skbuff cache; otherwise
 * the skb is freed through __kfree_skb().
 */
void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
        if (!head_stolen) {
                __kfree_skb(skb);
                return;
        }

        skb_release_head_state(skb);
        kmem_cache_free(net_hotdata.skbuff_cache, skb);
}
EXPORT_SYMBOL(kfree_skb_partial);

/**
 * skb_try_coalesce - try to merge skb to prior one
 * @to: prior buffer
 * @from: buffer to add
 * @fragstolen: pointer to boolean; set to true when the linear head of @from
 *              was attached to @to as a page fragment, false otherwise
 * @delta_truesize: how much more was allocated than was requested; written
 *                  only on success, with the amount added to @to->truesize
 *                  (0 when the payload was copied into the tailroom of @to)
 *
 * Return: true if the payload of @from is now part of @to, false if the two
 * buffers were not merged.
 */
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize)
{
        struct skb_shared_info *to_shinfo, *from_shinfo;
        int i, delta, len = from->len;

        *fragstolen = false;

        /* A cloned @to is never merged into. */
        if (skb_cloned(to))
                return false;

        /* In general, avoid mixing page_pool and non-page_pool allocated
         * pages within the same SKB. In theory we could take full
         * references if @from is cloned and !@to->pp_recycle but its
         * tricky (due to potential race with the clone disappearing) and
         * rare, so not worth dealing with.
         */
        if (to->pp_recycle != from->pp_recycle)
                return false;

        /* Both buffers must agree on whether their frags are readable. */
        if (skb_frags_readable(from) != skb_frags_readable(to))
                return false;

        /* Everything fits in the tailroom of @to: copy the bytes over.
         * No truesize is transferred in this case.
         */
        if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
                if (len)
                        BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
                *delta_truesize = 0;
                return true;
        }

        /* Buffers with a frag_list or zerocopy state are not merged. */
        to_shinfo = skb_shinfo(to);
        from_shinfo = skb_shinfo(from);
        if (to_shinfo->frag_list || from_shinfo->frag_list)
                return false;
        if (skb_zcopy(to) || skb_zcopy(from))
                return false;

        if (skb_headlen(from) != 0) {
                struct page *page;
                unsigned int offset;

                /* The head of @from becomes one extra frag of @to, so one
                 * slot beyond the frags of @from must remain free.
                 */
                if (to_shinfo->nr_frags +
                    from_shinfo->nr_frags >= MAX_SKB_FRAGS)
                        return false;

                if (skb_head_is_locked(from))
                        return false;

                delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));

                /* Describe the linear data of @from as (page, offset, len). */
                page = virt_to_head_page(from->head);
                offset = from->data - (unsigned char *)page_address(page);

                skb_fill_page_desc(to, to_shinfo->nr_frags,
                                   page, offset, skb_headlen(from));
                *fragstolen = true;
        } else {
                /* @from has no linear data: only its frags are moved. */
                if (to_shinfo->nr_frags +
                    from_shinfo->nr_frags > MAX_SKB_FRAGS)
                        return false;

                delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
        }

        WARN_ON_ONCE(delta < len);

        /* Append the frag descriptors of @from after those of @to. */
        memcpy(to_shinfo->frags + to_shinfo->nr_frags,
               from_shinfo->frags,
               from_shinfo->nr_frags * sizeof(skb_frag_t));
        to_shinfo->nr_frags += from_shinfo->nr_frags;

        /* An uncloned @from simply gives up its frags to @to. */
        if (!skb_cloned(from))
                from_shinfo->nr_frags = 0;

        /* if the skb is not cloned this does nothing
         * since we set nr_frags to 0.
         */
        if (skb_pp_frag_ref(from)) {
                for (i = 0; i < from_shinfo->nr_frags; i++)
                        __skb_frag_ref(&from_shinfo->frags[i]);
        }

        /* Account the merged payload; all of it is non-linear data in @to. */
        to->truesize += delta;
        to->len += len;
        to->data_len += len;

        *delta_truesize = delta;
        return true;
}
EXPORT_SYMBOL(skb_try_coalesce);

/**
 * skb_scrub_packet - scrub an skb
 *
 * @skb: buffer to clean
 * @xnet: packet is crossing netns
 *
 * Meant to be called after a packet has been encapsulated into, or
 * decapsulated from, a tunnel: state that must not survive such an operation
 * is cleared.
 * With @xnet == true the skb is additionally cleaned for injection into
 * another namespace, i.e. the information that could impact namespace
 * isolation (mark and timestamp) is cleared as well.
 */
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
        /* Cleared unconditionally, whether or not the netns changes. */
        skb->pkt_type = PACKET_HOST;
        skb->skb_iif = 0;
        skb->ignore_df = 0;
        skb_dst_drop(skb);
        skb_ext_reset(skb);
        nf_reset_ct(skb);
        nf_reset_trace(skb);

#ifdef CONFIG_NET_SWITCHDEV
        skb->offload_fwd_mark = 0;
        skb->offload_l3_fwd_mark = 0;
#endif
        ipvs_reset(skb);

        /* Extra scrubbing, only when crossing a netns boundary. */
        if (xnet) {
                skb->mark = 0;
                skb_clear_tstamp(skb);
        }
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);

/* Shift the MAC header (minus its last VLAN_HLEN + ETH_TLEN bytes) and any
 * metadata in front of it forward by VLAN_HLEN bytes, then advance
 * skb->mac_header accordingly.
 *
 * Frees @skb and returns NULL if the head cannot be made private.
 */
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
        unsigned char *mac;
        int mac_len, meta_len;

        if (skb_cow(skb, skb_headroom(skb)) < 0) {
                kfree_skb(skb);
                return NULL;
        }

        /* Looked up after skb_cow(), which may have changed the head. */
        mac = skb_mac_header(skb);
        mac_len = skb->data - mac;
        if (likely(mac_len > VLAN_HLEN + ETH_TLEN))
                memmove(mac + VLAN_HLEN, mac, mac_len - VLAN_HLEN - ETH_TLEN);

        meta_len = skb_metadata_len(skb);
        if (meta_len) {
                void *meta = skb_metadata_end(skb) - meta_len;

                memmove(meta + VLAN_HLEN, meta, meta_len);
        }

        skb->mac_header += VLAN_HLEN;
        return skb;
}

/* Move the VLAN tag found at skb->data into the hwaccel tag fields and strip
 * it from the packet data.
 *
 * Returns the (possibly different) skb on success. On failure the skb is
 * freed and NULL is returned. An skb that already carries a hwaccel tag is
 * returned untouched.
 */
struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
{
        struct vlan_hdr *vhdr;
        u16 tci;

        /* vlan_tci is already set-up so leave this for another time */
        if (unlikely(skb_vlan_tag_present(skb)))
                return skb;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(!skb))
                return NULL;

        /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
        if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) {
                kfree_skb(skb);
                return NULL;
        }

        vhdr = (struct vlan_hdr *)skb->data;
        tci = ntohs(vhdr->h_vlan_TCI);
        __vlan_hwaccel_put_tag(skb, skb->protocol, tci);

        skb_pull_rcsum(skb, VLAN_HLEN);
        vlan_set_encap_proto(skb, vhdr);

        /* Frees the skb itself on failure, nothing left to release here. */
        skb = skb_reorder_vlan_header(skb);
        if (unlikely(!skb))
                return NULL;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        return skb;
}
EXPORT_SYMBOL(skb_vlan_untag);

/* Make sure the first @write_len bytes of @skb can be pulled and written.
 *
 * Returns 0 on success, -ENOMEM if the bytes cannot be pulled, or the result
 * of pskb_expand_head() when a cloned head has to be replaced.
 */
int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
{
        if (!pskb_may_pull(skb, write_len))
                return -ENOMEM;

        /* Only a clone whose first @write_len bytes are not writable needs
         * a new head.
         */
        if (skb_cloned(skb) && !skb_clone_writable(skb, write_len))
                return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);

        return 0;
}
EXPORT_SYMBOL(skb_ensure_writable);

/* Ensure @skb is not cloned and has at least the headroom and tailroom that
 * @dev asks for, reallocating the head if necessary.
 *
 * Returns 0 if nothing had to be done, otherwise the result of
 * pskb_expand_head().
 */
int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
{
        int head_deficit = dev->needed_headroom;
        int tail_deficit = dev->needed_tailroom;

        /* For tail taggers, we need to pad short frames ourselves, to ensure
         * that the tail tag does not fail at its role of being at the end of
         * the packet, once the conduit interface pads the frame. Account for
         * that pad length here, and pad later.
         */
        if (unlikely(tail_deficit && skb->len < ETH_ZLEN))
                tail_deficit += ETH_ZLEN - skb->len;

        /* Turn the requirements into "bytes still missing", clamped at 0.
         * skb_headroom() returns unsigned int, hence max_t(int, ...).
         */
        head_deficit = max_t(int, head_deficit - skb_headroom(skb), 0);
        tail_deficit = max_t(int, tail_deficit - skb_tailroom(skb), 0);

        if (unlikely(head_deficit || tail_deficit || skb_cloned(skb)))
                return pskb_expand_head(skb, head_deficit, tail_deficit,
                                        GFP_ATOMIC);

        /* No reallocation needed, yay! */
        return 0;
}
EXPORT_SYMBOL(skb_ensure_writable_head_tail);

/* Remove the VLAN header from the packet data and update the csum
 * accordingly; the removed tag is handed to vlan_remove_tag() via @vlan_tci.
 * Expects a non skb_vlan_tag_present skb with a vlan tag payload, and
 * skb->data at the mac header.
 *
 * Returns 0 on success, -EINVAL if skb->data is not at the mac header, or the
 * error from skb_ensure_writable().
 */
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
        int offset = skb->data - skb_mac_header(skb);
        int err;

        if (WARN_ONCE(offset,
                      "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
                      offset))
                return -EINVAL;

        err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
        if (unlikely(err))
                return err;

        /* The VLAN_HLEN bytes following the two MAC addresses go away. */
        skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);

        vlan_remove_tag(skb, vlan_tci);

        skb->mac_header += VLAN_HLEN;

        if (skb_network_offset(skb) < ETH_HLEN)
                skb_set_network_header(skb, ETH_HLEN);

        skb_reset_mac_len(skb);

        return 0;
}
EXPORT_SYMBOL(__skb_vlan_pop);

/* Pop a vlan tag either from hwaccel or from payload.
 * Expects skb->data at mac header.
 *
 * If the packet is still VLAN-typed afterwards, that next tag is moved from
 * the payload into the hwaccel tag.
 *
 * Returns 0 on success (including when there was nothing to pop), or the
 * error from __skb_vlan_pop().
 */
int skb_vlan_pop(struct sk_buff *skb)
{
        __be16 next_proto;
        u16 tci;
        int err;

        if (unlikely(!skb_vlan_tag_present(skb))) {
                /* No hwaccel tag: a tag to drop, if any, is in the payload. */
                if (unlikely(!eth_type_vlan(skb->protocol)))
                        return 0;

                err = __skb_vlan_pop(skb, &tci);
                if (err)
                        return err;
        } else {
                __vlan_hwaccel_clear_tag(skb);
        }

        /* move next vlan tag to hw accel tag */
        next_proto = skb->protocol;
        if (likely(!eth_type_vlan(next_proto)))
                return 0;

        err = __skb_vlan_pop(skb, &tci);
        if (unlikely(err))
                return err;

        __vlan_hwaccel_put_tag(skb, next_proto, tci);
        return 0;
}
EXPORT_SYMBOL(skb_vlan_pop);

/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
 * Expects skb->data at mac header.
 *
 * An existing hwaccel tag is first written into the packet data; the new tag
 * (@vlan_proto, @vlan_tci) then always ends up in the hwaccel fields.
 *
 * Returns 0 on success, -EINVAL if skb->data is not at the mac header, or the
 * error from __vlan_insert_tag().
 */
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
        int offset, err;

        if (!skb_vlan_tag_present(skb))
                goto set_hwaccel;

        offset = skb->data - skb_mac_header(skb);
        if (WARN_ONCE(offset,
                      "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
                      offset))
                return -EINVAL;

        err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb));
        if (err)
                return err;

        skb->protocol = skb->vlan_proto;
        skb->network_header -= VLAN_HLEN;

        /* Account for the VLAN_HLEN bytes now following the MAC addresses. */
        skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);

set_hwaccel:
        __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
        return 0;
}
EXPORT_SYMBOL(skb_vlan_push);

/**
 * skb_eth_pop() - Drop the Ethernet header at the head of a packet
 *
 * @skb: Socket buffer to modify
 *
 * Pull ETH_HLEN bytes off the front of @skb and reset the mac header and
 * mac_len to match.
 *
 * Expects that skb->data points to the mac header and that no VLAN tags are
 * present.
 *
 * Return: 0 on success, -EPROTO if the header cannot be pulled, the skb is
 * VLAN tagged, or the network header starts less than ETH_HLEN bytes in.
 */
int skb_eth_pop(struct sk_buff *skb)
{
        if (!pskb_may_pull(skb, ETH_HLEN))
                return -EPROTO;
        if (skb_vlan_tagged(skb))
                return -EPROTO;
        if (skb_network_offset(skb) < ETH_HLEN)
                return -EPROTO;

        skb_pull_rcsum(skb, ETH_HLEN);
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return 0;
}
EXPORT_SYMBOL(skb_eth_pop);

/**
 * skb_eth_push() - Add a new Ethernet header at the head of a packet
 *
 * @skb: Socket buffer to modify
 * @dst: Destination MAC address of the new header
 * @src: Source MAC address of the new header
 *
 * Prepend @skb with a new Ethernet header whose ethertype is taken from
 * skb->protocol.
 *
 * Expects that skb->data points to the mac header, which must be empty.
 *
 * Return: 0 on success, -EPROTO if the network header is not at skb->data or
 * a hwaccel VLAN tag is present, or the error from skb_cow_head().
 */
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src)
{
        struct ethhdr *hdr;
        int err;

        if (skb_network_offset(skb))
                return -EPROTO;
        if (skb_vlan_tag_present(skb))
                return -EPROTO;

        err = skb_cow_head(skb, sizeof(*hdr));
        if (err < 0)
                return err;

        /* Open up room for the header and make it the mac header. */
        skb_push(skb, sizeof(*hdr));
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        hdr = eth_hdr(skb);
        ether_addr_copy(hdr->h_dest, dst);
        ether_addr_copy(hdr->h_source, src);
        hdr->h_proto = skb->protocol;

        skb_postpush_rcsum(skb, hdr, sizeof(*hdr));

        return 0;
}
EXPORT_SYMBOL(skb_eth_push);

/* Set the ethertype of @hdr to @ethertype. For CHECKSUM_COMPLETE skbs,
 * skb->csum is adjusted by folding in the complement of the old value
 * followed by the new value.
 */
static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
                             __be16 ethertype)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                __be16 delta[2];

                delta[0] = ~hdr->h_proto;
                delta[1] = ethertype;
                skb->csum = csum_partial((char *)delta, sizeof(delta),
                                         skb->csum);
        }

        hdr->h_proto = ethertype;
}

/**
 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
 *                   the packet
 *
 * @skb: buffer
 * @mpls_lse: MPLS label stack entry to push
 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
 * @mac_len: length of the MAC header
 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
 *            ethernet
 *
 * Expects skb->data at mac header.
 *
 * Return: 0 on success, -EINVAL if @mpls_proto is not an MPLS ethertype or
 * the skb is marked as encapsulated, or the error from skb_cow_head().
 */
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet)
{
        struct mpls_shim_hdr *shim;
        unsigned char *old_mac;
        int err;

        if (unlikely(!eth_p_mpls(mpls_proto)))
                return -EINVAL;

        /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
        if (skb->encapsulation)
                return -EINVAL;

        err = skb_cow_head(skb, MPLS_HLEN);
        if (unlikely(err))
                return err;

        /* Record the current network header and protocol as the inner ones,
         * unless an inner protocol has already been set.
         */
        if (!skb->inner_protocol) {
                skb_set_inner_network_header(skb, skb_network_offset(skb));
                skb_set_inner_protocol(skb, skb->protocol);
        }

        /* Slide the MAC header MPLS_HLEN bytes towards the head, leaving a
         * gap for the label stack entry right behind it.
         */
        skb_push(skb, MPLS_HLEN);
        old_mac = skb_mac_header(skb);
        memmove(old_mac - MPLS_HLEN, old_mac, mac_len);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, mac_len);
        skb_reset_mac_len(skb);

        shim = mpls_hdr(skb);
        shim->label_stack_entry = mpls_lse;
        skb_postpush_rcsum(skb, shim, MPLS_HLEN);

        if (ethernet && mac_len >= ETH_HLEN)
                skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
        skb->protocol = mpls_proto;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_push);

/**
 * skb_mpls_pop() - pop the outermost MPLS header
 *
 * @skb: buffer
 * @next_proto: ethertype of header after popped MPLS header
 * @mac_len: length of the MAC header
 * @ethernet: flag to indicate if the packet is ethernet
 *
 * Expects skb->data at mac header.
 *
 * Return: 0 on success, including when skb->protocol is not an MPLS ethertype
 * and nothing is done; otherwise the error from skb_ensure_writable().
 */
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet)
{
        unsigned char *old_mac;
        int err;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return 0;

        err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
        if (unlikely(err))
                return err;

        /* Drop the label stack entry from the csum, then slide the MAC
         * header MPLS_HLEN bytes towards the payload, over it.
         */
        skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
        old_mac = skb_mac_header(skb);
        memmove(old_mac + MPLS_HLEN, old_mac, mac_len);

        __skb_pull(skb, MPLS_HLEN);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, mac_len);

        if (ethernet && mac_len >= ETH_HLEN) {
                /* use mpls_hdr() to get ethertype to account for VLANs. */
                void *l3 = mpls_hdr(skb);

                skb_mod_eth_type(skb, (struct ethhdr *)(l3 - ETH_HLEN),
                                 next_proto);
        }
        skb->protocol = next_proto;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_pop);

/**
 * skb_mpls_update_lse() - modify outermost MPLS header and update csum
 *
 * @skb: buffer
 * @mpls_lse: new MPLS label stack entry to update to
 *
 * Expects skb->data at mac header.
 *
 * Return: 0 on success, -EINVAL if skb->protocol is not an MPLS ethertype, or
 * the error from skb_ensure_writable().
 */
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
{
        struct mpls_shim_hdr *shim;
        int err;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return -EINVAL;

        err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
        if (unlikely(err))
                return err;

        /* Looked up only now: skb_ensure_writable() may call
         * pskb_expand_head().
         */
        shim = mpls_hdr(skb);

        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                __be32 delta[2];

                /* Fold "minus old entry, plus new entry" into the csum. */
                delta[0] = ~shim->label_stack_entry;
                delta[1] = mpls_lse;
                skb->csum = csum_partial((char *)delta, sizeof(delta),
                                         skb->csum);
        }

        shim->label_stack_entry = mpls_lse;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_update_lse);

/**
 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
 *
 * @skb: buffer
 *
 * Expects skb->data at mac header.
 *
 * Return: 0 on success, -EINVAL if skb->protocol is not an MPLS ethertype or
 * the decremented TTL is zero, -ENOMEM if the MPLS header cannot be pulled,
 * or the error from skb_mpls_update_lse().
 */
int skb_mpls_dec_ttl(struct sk_buff *skb)
{
        u32 lse;
        u8 ttl;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return -EINVAL;

        if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
                return -ENOMEM;

        lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);

        /* ttl is a u8: only a decrement that lands exactly on 0 is rejected
         * here.
         */
        ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
        ttl--;
        if (ttl == 0)
                return -EINVAL;

        /* Splice the new TTL back into the label stack entry. */
        lse = (lse & ~MPLS_LS_TTL_MASK) | (ttl << MPLS_LS_TTL_SHIFT);

        return skb_mpls_update_lse(skb, cpu_to_be32(lse));
}
EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);

/**
 * alloc_skb_with_frags - allocate skb with page frags
 *
 * @header_len: size of linear part
 * @data_len: needed length in frags
 * @order: max page order desired.
 * @errcode: pointer to error code if any
 * @gfp_mask: allocation mask
 *
 * This can be used to allocate a paged skb, given a maximal order for frags.
 *
 * Return: the new skb, or NULL on failure. On failure *@errcode is -EMSGSIZE
 * when @data_len exceeds MAX_SKB_FRAGS frags of the requested order, and
 * -ENOBUFS otherwise. *@errcode is written on every call, so it is only
 * meaningful when NULL is returned.
 */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int order,
                                     int *errcode,
                                     gfp_t gfp_mask)
{
        unsigned long chunk, frag_bytes;
        struct sk_buff *skb;
        struct page *page;
        int nr_frags = 0;

        if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) {
                *errcode = -EMSGSIZE;
                return NULL;
        }

        *errcode = -ENOBUFS;
        skb = alloc_skb(header_len, gfp_mask);
        if (!skb)
                return NULL;

        while (data_len) {
                if (nr_frags == MAX_SKB_FRAGS)
                        goto failure;

                /* Lower the order while a block of that order is larger
                 * than the page-aligned remaining data.
                 */
                while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
                        order--;

                if (!order) {
                        page = alloc_page(gfp_mask);
                        if (!page)
                                goto failure;
                } else {
                        page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
                                           __GFP_COMP |
                                           __GFP_NOWARN,
                                           order);
                        if (!page) {
                                /* Retry this chunk with a smaller order. */
                                order--;
                                continue;
                        }
                }

                frag_bytes = PAGE_SIZE << order;
                chunk = min_t(unsigned long, data_len, frag_bytes);
                skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
                nr_frags++;
                skb->truesize += frag_bytes;
                data_len -= chunk;
        }
        return skb;

failure:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(alloc_skb_with_frags);

/* Carve out the first @off bytes from @skb when @off < @headlen.
 *
 * A new head is allocated; the remaining (@headlen - @off) linear bytes and
 * the shared info, including the frag array, are copied into it, and @skb is
 * switched over to the new head with skb->data at its start.
 *
 * Returns 0 on success or -ENOMEM on failure.
 */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
                                    const int headlen, gfp_t gfp_mask)
{
        int i;
        unsigned int size = skb_end_offset(skb);
        int new_hlen = headlen - off;
        u8 *data;

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                return -ENOMEM;
        /* The shared info is placed at data + size. */
        size = SKB_WITH_OVERHEAD(size);

        /* Copy real data, and all frags */
        skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
        /* NOTE(review): skb->len is already reduced when the
         * skb_orphan_frags() failure path below returns -ENOMEM; confirm
         * that callers discard the skb on error.
         */
        skb->len -= off;

        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb),
               offsetof(struct skb_shared_info,
                        frags[skb_shinfo(skb)->nr_frags]));
        if (skb_cloned(skb)) {
                /* drop the old head gracefully */
                if (skb_orphan_frags(skb, gfp_mask)) {
                        skb_kfree_head(data, size);
                        return -ENOMEM;
                }
                /* The copied shared info points at the same frags and frag
                 * list as the old one: take references on them before the
                 * old data is released.
                 */
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);
                if (skb_has_frag_list(skb))
                        skb_clone_fraglist(skb);
                skb_release_data(skb, SKB_CONSUMED);
        } else {
                /* we can reuse existing refcount - all we did was
                 * relocate values
                 */
                skb_free_head(skb);
        }

        /* Switch the skb over to the new head. */
        skb->head = data;
        skb->data = data;
        skb->head_frag = 0;
        skb_set_end_offset(skb, size);
        skb_set_tail_pointer(skb, skb_headlen(skb));
        skb_headers_offset_update(skb, 0);
        skb->cloned = 0;
        skb->hdr_len = 0;
        skb->nohdr = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);

        return 0;
}

/* Forward declaration: pskb_carve_frag_list() below calls pskb_carve(). */
static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);

/* Carve out the first @eat bytes from the frag_list of @shinfo. May recurse
 * into pskb_carve().
 *
 * Returns 0 on success, -EFAULT if the list holds fewer than @eat bytes, or
 * -ENOMEM if cloning or carving a list member fails.
 */
static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat,
                                gfp_t gfp_mask)
{
        struct sk_buff *list = shinfo->frag_list;
        struct sk_buff *clone = NULL;
        /* Marks where the freeing loop below stops. */
        struct sk_buff *insp = NULL;

        do {
                if (!list) {
                        pr_err("Not enough bytes to eat. Want %d\n", eat);
                        return -EFAULT;
                }
                if (list->len <= eat) {
                        /* Eaten as whole. */
                        eat -= list->len;
                        list = list->next;
                        insp = list;
                } else {
                        /* Eaten partially. */
                        if (skb_shared(list)) {
                                /* Carve a private clone instead; it is
                                 * linked in front of list->next below.
                                 */
                                clone = skb_clone(list, gfp_mask);
                                if (!clone)
                                        return -ENOMEM;
                                insp = list->next;
                                list = clone;
                        } else {
                                /* This may be pulled without problems. */
                                insp = list;
                        }
                        if (pskb_carve(list, eat, gfp_mask) < 0) {
                                /* clone is NULL when list was not shared. */
                                kfree_skb(clone);
                                return -ENOMEM;
                        }
                        break;
                }
        } while (eat);

        /* Free pulled out fragments. */
        while ((list = shinfo->frag_list) != insp) {
                shinfo->frag_list = list->next;
                consume_skb(list);
        }
        /* And insert new clone at head. */
        if (clone) {
                clone->next = list;
                shinfo->frag_list = clone;
        }
        return 0;
}

/* Carve off the first @off bytes from @skb when the split line (@off) lies in
 * the non-linear part of the skb. @pos is the offset at which the page frags
 * start (the caller passes the head length).
 *
 * A new head is allocated; it holds no linear data afterwards, so the
 * resulting skb has data_len == len.
 *
 * Returns 0 on success or -ENOMEM on failure (any error from
 * pskb_carve_frag_list() is also reported as -ENOMEM).
 */
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
                                       int pos, gfp_t gfp_mask)
{
        int i, k = 0;
        unsigned int size = skb_end_offset(skb);
        u8 *data;
        const int nfrags = skb_shinfo(skb)->nr_frags;
        struct skb_shared_info *shinfo;

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                return -ENOMEM;
        size = SKB_WITH_OVERHEAD(size);

        /* Copy the shared info up to, but not including, the frag array;
         * the frags that are kept are filled in one by one below.
         */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
        if (skb_orphan_frags(skb, gfp_mask)) {
                skb_kfree_head(data, size);
                return -ENOMEM;
        }
        shinfo = (struct skb_shared_info *)(data + size);
        for (i = 0; i < nfrags; i++) {
                int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* Keep every frag that extends past the split line. */
                if (pos + fsize > off) {
                        shinfo->frags[k] = skb_shinfo(skb)->frags[i];

                        if (pos < off) {
                                /* Split frag.
                                 * We have two variants in this case:
                                 * 1. Move all the frag to the second
                                 *    part, if it is possible. F.e.
                                 *    this approach is mandatory for TUX,
                                 *    where splitting is expensive.
                                 * 2. Split it accurately. This is what
                                 *    we do.
                                 *
                                 * pos < off can only hold for the first
                                 * kept frag (k == 0), hence frags[0].
                                 */
                                skb_frag_off_add(&shinfo->frags[0], off - pos);
                                skb_frag_size_sub(&shinfo->frags[0], off - pos);
                        }
                        skb_frag_ref(skb, i);
                        k++;
                }
                pos += fsize;
        }
        shinfo->nr_frags = k;
        if (skb_has_frag_list(skb))
                skb_clone_fraglist(skb);

        /* split line is in frag list */
        if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) {
                /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
                if (skb_has_frag_list(skb))
                        kfree_skb_list(skb_shinfo(skb)->frag_list);
                skb_kfree_head(data, size);
                return -ENOMEM;
        }
        skb_release_data(skb, SKB_CONSUMED);

        /* Switch the skb over to the new, empty head. */
        skb->head = data;
        skb->head_frag = 0;
        skb->data = data;
        skb_set_end_offset(skb, size);
        skb_reset_tail_pointer(skb);
        skb_headers_offset_update(skb, 0);
        skb->cloned   = 0;
        skb->hdr_len  = 0;
        skb->nohdr    = 0;
        skb->len -= off;
        skb->data_len = skb->len;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
        return 0;
}

/* Remove @len bytes from the beginning of @skb. The work is delegated
 * depending on whether the cut falls inside the linear head or beyond it.
 */
static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
{
        int headlen = skb_headlen(skb);

        if (len < headlen)
                return pskb_carve_inside_header(skb, len, headlen, gfp);

        return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
}

/* Extract @to_copy bytes starting at @off from @skb, and return them in a
 * new skb. @skb is cloned, then the clone is carved and trimmed.
 *
 * Returns NULL if cloning, carving or trimming fails.
 */
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
                             int to_copy, gfp_t gfp)
{
        struct sk_buff *clone;

        clone = skb_clone(skb, gfp);
        if (!clone)
                return NULL;

        /* Drop the first @off bytes, then cut the rest down to @to_copy. */
        if (pskb_carve(clone, off, gfp) >= 0 && !pskb_trim(clone, to_copy))
                return clone;

        kfree_skb(clone);
        return NULL;
}
EXPORT_SYMBOL(pskb_extract);

/**
 * skb_condense - try to get rid of fragments/frag_list if possible
 * @skb: buffer
 *
 * Can be used to save memory before skb is added to a busy queue.
 * If the packet has bytes in frags and there is enough tail room in
 * skb->head, all of them are pulled into the head so that the frags can be
 * freed right away, and truesize is adjusted.
 * Notes:
 *        We do not reallocate skb->head thus can not fail.
 *        Caller must re-evaluate skb->truesize if needed.
 */
void skb_condense(struct sk_buff *skb)
{
        if (skb->data_len) {
                /* Leave the skb alone if the paged data does not fit. */
                if (skb->data_len > skb->end - skb->tail)
                        return;

                if (skb_cloned(skb) || !skb_frags_readable(skb))
                        return;

                /* Nice, we can free page frag(s) right now */
                __pskb_pull_tail(skb, skb->data_len);
        }

        /* At this point, skb->truesize might be over estimated,
         * because skb had a fragment, and fragments do not tell
         * their truesize.
         * When we pulled its content into skb->head, fragment
         * was freed, but __pskb_pull_tail() could not possibly
         * adjust skb->truesize, not knowing the frag truesize.
         */
        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
EXPORT_SYMBOL(skb_condense);

#ifdef CONFIG_SKB_EXTENSIONS
/* Return the address of extension @id inside @ext. */
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
{
        /* offset[] is stored in units of SKB_EXT_ALIGN_VALUE bytes. */
        unsigned int byte_off = ext->offset[id] * SKB_EXT_ALIGN_VALUE;

        return (void *)ext + byte_off;
}

/**
 * __skb_ext_alloc - allocate a new skb extensions storage
 *
 * @flags: See kmalloc().
 *
 * The storage comes back with all extension offsets zeroed and a reference
 * count of one. It can later be attached to a skb via __skb_ext_set().
 * Note: caller must handle the skb_ext as an opaque data.
 *
 * Return: the newly allocated storage, or NULL if the allocation failed.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
        struct skb_ext *ext;

        ext = kmem_cache_alloc(skbuff_ext_cache, flags);
        if (!ext)
                return NULL;

        memset(ext->offset, 0, sizeof(ext->offset));
        refcount_set(&ext->refcnt, 1);

        return ext;
}

/* Return an extension storage that has a reference count of one.
 *
 * If @old is referenced only once it is returned as is. Otherwise a copy is
 * allocated with GFP_ATOMIC, references are taken on the objects held by the
 * extensions flagged in @old_active (xfrm states, MCTP flow key), the
 * reference on @old is dropped and the copy is returned.
 *
 * Returns NULL if the copy cannot be allocated; @old is left untouched then.
 */
static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
                                         unsigned int old_active)
{
        struct skb_ext *new;

        if (refcount_read(&old->refcnt) == 1)
                return old;

        new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
        if (!new)
                return NULL;

        /* Only the used part of the storage is copied; chunks counts units
         * of SKB_EXT_ALIGN_VALUE bytes.
         */
        memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
        refcount_set(&new->refcnt, 1);

#ifdef CONFIG_XFRM
        if (old_active & (1 << SKB_EXT_SEC_PATH)) {
                struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
                unsigned int i;

                /* The copy points at the same xfrm states: hold each one. */
                for (i = 0; i < sp->len; i++)
                        xfrm_state_hold(sp->xvec[i]);
        }
#endif
#ifdef CONFIG_MCTP_FLOWS
        if (old_active & (1 << SKB_EXT_MCTP)) {
                struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);

                /* Likewise for the MCTP flow key, when there is one. */
                if (flow->key)
                        refcount_inc(&flow->key->refs);
        }
#endif
        /* Drop the reference on the old storage only after the extra
         * references above have been taken.
         */
        __skb_ext_put(old);
        return new;
}

/**
 * __skb_ext_set - attach the specified extension storage to this skb
 * @skb: buffer
 * @id: extension id
 * @ext: extension storage previously allocated via __skb_ext_alloc()
 *
 * Existing extensions, if any, are cleared: afterwards @id is the only
 * active extension of @skb.
 *
 * Return: the pointer to the extension.
 */
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext)
{
        /* The extension is placed right behind the skb_ext header. */
        const unsigned int start = SKB_EXT_CHUNKSIZEOF(*ext);

        skb_ext_put(skb);

        ext->offset[id] = start;
        ext->chunks = start + skb_ext_type_len[id];

        skb->active_extensions = 1 << id;
        skb->extensions = ext;

        return skb_ext_get_ptr(ext, id);
}
EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL");

/**
 * skb_ext_add - allocate space for given extension, COW if needed
 * @skb: buffer
 * @id: extension to allocate space for
 *
 * Allocates enough space for the given extension.
 * If the extension is already present, a pointer to that extension
 * is returned.
 *
 * If the skb was cloned, COW applies and the returned memory can be
 * modified without changing the extension space of clones buffers.
 *
 * Return: pointer to the extension or NULL on allocation failure.
 */
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext;
        unsigned int start;

        if (!skb->active_extensions) {
                /* First extension: it goes right behind the header. */
                ext = __skb_ext_alloc(GFP_ATOMIC);
                if (!ext)
                        return NULL;

                start = SKB_EXT_CHUNKSIZEOF(*ext);
        } else {
                ext = skb_ext_maybe_cow(skb->extensions,
                                        skb->active_extensions);
                if (!ext)
                        return NULL;

                /* Space for @id already exists in the storage: reuse it. */
                if (__skb_ext_exist(ext, id))
                        goto set_active;

                /* Otherwise append behind what is already in use. */
                start = ext->chunks;
        }

        ext->chunks = start + skb_ext_type_len[id];
        ext->offset[id] = start;
set_active:
        skb->slow_gro = 1;
        skb->extensions = ext;
        skb->active_extensions |= 1 << id;
        return skb_ext_get_ptr(ext, id);
}
EXPORT_SYMBOL(skb_ext_add);

#ifdef CONFIG_XFRM
/* Put the xfrm state held by each of the first @sp->len entries of xvec. */
static void skb_ext_put_sp(struct sec_path *sp)
{
        unsigned int idx = 0;

        while (idx < sp->len) {
                xfrm_state_put(sp->xvec[idx]);
                idx++;
        }
}
#endif

#ifdef CONFIG_MCTP_FLOWS
/* Drop the reference on the key attached to @flow, if there is one. */
static void skb_ext_put_mctp(struct mctp_flow *flow)
{
        if (!flow->key)
                return;

        mctp_key_unref(flow->key);
}
#endif

/*
 * Deactivate extension @id on @skb.  When this leaves no extension active,
 * the storage is detached from the skb and its reference is dropped.
 */
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext = skb->extensions;

        skb->active_extensions &= ~(1 << id);

        if (!skb->active_extensions) {
                skb->extensions = NULL;
                __skb_ext_put(ext);
                return;
        }

#ifdef CONFIG_XFRM
        /*
         * Other extensions stay active, so the storage is kept.  If this
         * skb holds the only reference to it, release the secpath's xfrm
         * states right away and mark the secpath empty.
         */
        if (id == SKB_EXT_SEC_PATH && refcount_read(&ext->refcnt) == 1) {
                struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);

                skb_ext_put_sp(sp);
                sp->len = 0;
        }
#endif
}
EXPORT_SYMBOL(__skb_ext_del);

/*
 * __skb_ext_put - drop one reference on extension storage @ext
 *
 * When the reference being dropped is the last one, the resources held by
 * the stored extensions are released (the xfrm states of a secpath under
 * CONFIG_XFRM, the MCTP flow key under CONFIG_MCTP_FLOWS) and @ext is
 * returned to skbuff_ext_cache.  Otherwise only the refcount is decremented.
 */
void __skb_ext_put(struct skb_ext *ext)
{
        /* If this is last clone, nothing can increment
         * it after check passes.  Avoids one atomic op.
         */
        if (refcount_read(&ext->refcnt) == 1)
                goto free_now;

        /* Shared storage: only the caller that drops it to zero frees it. */
        if (!refcount_dec_and_test(&ext->refcnt))
                return;
free_now:
#ifdef CONFIG_XFRM
        /* Release the xfrm states referenced by the secpath, if present. */
        if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
                skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif
#ifdef CONFIG_MCTP_FLOWS
        /* Release the key referenced by the MCTP flow, if present. */
        if (__skb_ext_exist(ext, SKB_EXT_MCTP))
                skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
#endif

        kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */

/*
 * Free @skb through __napi_kfree_skb() with bottom halves disabled.  Only
 * skbs whose fclone state is SKB_FCLONE_UNAVAILABLE take that path; any
 * other skb is handed to __kfree_skb() instead.
 */
static void kfree_skb_napi_cache(struct sk_buff *skb)
{
        if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
                local_bh_disable();
                __napi_kfree_skb(skb, SKB_CONSUMED);
                local_bh_enable();
                return;
        }

        /* if SKB is a clone, don't handle this case */
        __kfree_skb(skb);
}

/**
 * skb_attempt_defer_free - queue skb for remote freeing
 * @skb: buffer
 *
 * Put @skb in a per-cpu list, using the cpu which
 * allocated the skb/pages to reduce false sharing
 * and memory zone spinlock contention.
 *
 * The skb is freed immediately via kfree_skb_napi_cache() instead when
 * the allocating cpu is the current one, is out of range or offline, or
 * when the deferral counter has reached sysctl_skb_defer_max.
 */
void skb_attempt_defer_free(struct sk_buff *skb)
{
        struct skb_defer_node *sdn;
        unsigned long defer_count;
        int cpu = skb->alloc_cpu;
        unsigned int defer_max;
        bool kick;

        if (cpu == raw_smp_processor_id() ||
            WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
            !cpu_online(cpu)) {
                /* Also reached by the goto below when the counter is full. */
nodefer:        kfree_skb_napi_cache(skb);
                return;
        }

        DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
        DEBUG_NET_WARN_ON_ONCE(skb->destructor);
        DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb));

        /* Defer node of the allocating cpu, indexed by the local NUMA node. */
        sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id();

        defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
        defer_count = atomic_long_inc_return(&sdn->defer_count);

        /* The increment above is not undone on this fallback path. */
        if (defer_count >= defer_max)
                goto nodefer;

        llist_add(&skb->ll_node, &sdn->defer_list);

        /* Send an IPI every time queue reaches half capacity. */
        kick = (defer_count - 1) == (defer_max >> 1);

        /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
         * if we are unlucky enough (this seems very unlikely).
         */
        if (unlikely(kick))
                kick_defer_list_purge(cpu);
}

/*
 * Checksum @len bytes starting at @offset within @page and fold the result
 * into skb->csum, positioned at the current skb->len.
 */
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
                                 size_t offset, size_t len)
{
        const char *vaddr = kmap_local_page(page);
        __wsum page_csum = csum_partial(vaddr + offset, len, 0);

        kunmap_local(vaddr);
        skb->csum = csum_block_add(skb->csum, page_csum, skb->len);
}

/**
 * skb_splice_from_iter - Splice (or copy) pages to skbuff
 * @skb: The buffer to add pages to
 * @iter: Iterator representing the pages to be added
 * @maxsize: Maximum amount of data (in bytes) to be added
 *
 * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
 * extracts pages from an iterator and adds them to the socket buffer if
 * possible, copying them to fragments if not possible (such as if they're slab
 * pages).
 *
 * Returns the amount of data spliced/copied or -EMSGSIZE if there's
 * insufficient space in the buffer to transfer anything.  If nothing was
 * spliced, other negative errors can be returned as well: the error from
 * iov_iter_extract_pages() or skb_append_pagefrags(), or -EIO.
 */
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
                             ssize_t maxsize)
{
        size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
        /* Pages are extracted from the iterator in batches of up to 8. */
        struct page *pages[8], **ppages = pages;
        ssize_t spliced = 0, ret = 0;
        unsigned int i;

        while (iter->count > 0) {
                ssize_t space, nr, len;
                size_t off;

                ret = -EMSGSIZE;
                space = frag_limit - skb_shinfo(skb)->nr_frags;
                if (space < 0)
                        break;

                /* We might be able to coalesce without increasing nr_frags */
                nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));

                len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
                if (len <= 0) {
                        /* Extracting nothing is reported as -EIO. */
                        ret = len ?: -EIO;
                        break;
                }

                /* Walk the batch; only the first page starts at @off. */
                i = 0;
                do {
                        struct page *page = pages[i++];
                        size_t part = min_t(size_t, PAGE_SIZE - off, len);

                        ret = -EIO;
                        if (WARN_ON_ONCE(!sendpage_ok(page)))
                                goto out;

                        ret = skb_append_pagefrags(skb, page, off, part,
                                                   frag_limit);
                        if (ret < 0) {
                                /* Give back the part of this batch not yet added. */
                                iov_iter_revert(iter, len);
                                goto out;
                        }

                        if (skb->ip_summed == CHECKSUM_NONE)
                                skb_splice_csum_page(skb, page, off, part);

                        off = 0;
                        spliced += part;
                        maxsize -= part;
                        len -= part;
                } while (len > 0);

                if (maxsize <= 0)
                        break;
        }

out:
        /* Account what was added; report an error only if nothing was. */
        skb_len_add(skb, spliced);
        return spliced ?: ret;
}
EXPORT_SYMBOL(skb_splice_from_iter);

/*
 * Kernel-memory step routine handed to iterate_and_advance2(): copies @len
 * bytes from @iter_from to @to + @progress and folds their checksum into
 * the __wsum that @priv2 points to.  Returns 0 unconditionally.
 */
static __always_inline
size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
                             size_t len, void *to, void *priv2)
{
        __wsum *total = priv2;
        __wsum chunk;

        chunk = csum_partial_copy_nocheck(iter_from, to + progress, len);
        *total = csum_block_add(*total, chunk, progress);

        return 0;
}

/*
 * User-memory step routine handed to iterate_and_advance2(): copies @len
 * bytes from @iter_from to @to + @progress via csum_and_copy_from_user()
 * and folds the returned checksum into the __wsum that @priv2 points to.
 * Returns 0 when that checksum is non-zero and @len when it is zero.
 */
static __always_inline
size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
                                size_t len, void *to, void *priv2)
{
        __wsum *total = priv2;
        __wsum chunk = csum_and_copy_from_user(iter_from, to + progress, len);

        *total = csum_block_add(*total, chunk, progress);
        if (chunk)
                return 0;

        return len;
}

/*
 * Copy exactly @bytes bytes from iterator @i to @addr while accumulating
 * their checksum in *@csum.  Returns true only if everything was copied;
 * on a short copy the iterator is reverted by the copied amount and false
 * is returned.  An iterator that is not a data source is rejected with a
 * one-time warning.
 */
bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
                                  __wsum *csum, struct iov_iter *i)
{
        size_t done;

        if (WARN_ON_ONCE(!i->data_source))
                return false;

        done = iterate_and_advance2(i, bytes, addr, csum,
                                    copy_from_user_iter_csum,
                                    memcpy_from_iter_csum);
        if (unlikely(done != bytes)) {
                iov_iter_revert(i, done);
                return false;
        }

        return true;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);

/*
 * Take a reference on @netmem: a page reference for page-backed netmem, a
 * devmem reference for a devmem net_iov.  Any other net_iov is left alone.
 */
void get_netmem(netmem_ref netmem)
{
        if (!netmem_is_net_iov(netmem)) {
                get_page(netmem_to_page(netmem));
                return;
        }

        if (net_is_devmem_iov(netmem_to_net_iov(netmem)))
                net_devmem_get_net_iov(netmem_to_net_iov(netmem));
}
EXPORT_SYMBOL(get_netmem);

/*
 * Release a reference on @netmem: a page reference for page-backed netmem,
 * a devmem reference for a devmem net_iov.  Any other net_iov is left alone.
 */
void put_netmem(netmem_ref netmem)
{
        if (!netmem_is_net_iov(netmem)) {
                put_page(netmem_to_page(netmem));
                return;
        }

        if (net_is_devmem_iov(netmem_to_net_iov(netmem)))
                net_devmem_put_net_iov(netmem_to_net_iov(netmem));
}
EXPORT_SYMBOL(put_netmem);






























































































































































































































































































































































































   11 

















































































































































































    7 




























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Symmetric key ciphers.
 * 
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_SKCIPHER_H
#define _CRYPTO_SKCIPHER_H

#include <linux/atomic.h>
#include <linux/container_of.h>
#include <linux/crypto.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

/*
 * Flag bits for lskcipher operations.
 * NOTE(review): presumably passed in the u32 flags argument of the
 * lskcipher_alg encrypt/decrypt callbacks -- confirm against callers.
 */
/* Set this bit if the lskcipher operation is a continuation. */
#define CRYPTO_LSKCIPHER_FLAG_CONT        0x00000001
/* Set this bit if the lskcipher operation is final. */
#define CRYPTO_LSKCIPHER_FLAG_FINAL        0x00000002
/* The bit CRYPTO_TFM_REQ_MAY_SLEEP can also be set if needed. */

/*
 * Flag bits for skcipher requests.  Bit 0x00000002 has the opposite sense
 * of the lskcipher one: here it marks an operation that is NOT final.
 */
/* Set this bit if the skcipher operation is a continuation. */
#define CRYPTO_SKCIPHER_REQ_CONT        0x00000001
/* Set this bit if the skcipher operation is not final. */
#define CRYPTO_SKCIPHER_REQ_NOTFINAL        0x00000002

struct scatterlist;

/**
 *        struct skcipher_request - Symmetric key cipher request
 *        @cryptlen: Number of bytes to encrypt or decrypt
 *        @iv: Initialisation Vector
 *        @src: Source SG list
 *        @dst: Destination SG list
 *        @base: Underlying async request
 *        @__ctx: Start of private context data
 */
struct skcipher_request {
        unsigned int cryptlen;

        u8 *iv;

        struct scatterlist *src;
        struct scatterlist *dst;

        struct crypto_async_request base;

        /*
         * Flexible array member: the private context follows the request
         * in the same allocation.  SYNC_SKCIPHER_REQUEST_ON_STACK() reserves
         * MAX_SYNC_SKCIPHER_REQSIZE bytes for it in on-stack requests.
         */
        void *__ctx[] CRYPTO_MINALIGN_ATTR;
};

/*
 * Symmetric key cipher handle.  It embeds the generic struct crypto_tfm as
 * @base; __crypto_skcipher_cast() recovers the handle from a pointer to it.
 */
struct crypto_skcipher {
        /* NOTE(review): presumably the size of the request context -- confirm. */
        unsigned int reqsize;

        struct crypto_tfm base;
};

/*
 * Distinct wrapper type around struct crypto_skcipher.  It is the handle
 * type SYNC_SKCIPHER_REQUEST_ON_STACK() type-checks its tfm argument against.
 */
struct crypto_sync_skcipher {
        struct crypto_skcipher base;
};

/*
 * Linear symmetric key cipher handle; it consists only of the embedded
 * generic struct crypto_tfm.
 */
struct crypto_lskcipher {
        struct crypto_tfm base;
};

/*
 * struct skcipher_alg_common - common properties of skcipher_alg
 * @min_keysize: Minimum key size supported by the transformation. This is the
 *                 smallest key length supported by this transformation algorithm.
 *                 This must be set to one of the pre-defined values as this is
 *                 not hardware specific. Possible values for this field can be
 *                 found via git grep "_MIN_KEY_SIZE" include/crypto/
 * @max_keysize: Maximum key size supported by the transformation. This is the
 *                 largest key length supported by this transformation algorithm.
 *                 This must be set to one of the pre-defined values as this is
 *                 not hardware specific. Possible values for this field can be
 *                 found via git grep "_MAX_KEY_SIZE" include/crypto/
 * @ivsize: IV size applicable for transformation. The consumer must provide an
 *            IV of exactly that size to perform the encrypt or decrypt operation.
 * @chunksize: Equal to the block size except for stream ciphers such as
 *               CTR where it is set to the underlying block size.
 * @statesize: Size of the internal state for the algorithm.
 * @base: Definition of a generic crypto algorithm.
 *
 * The member list is kept in the SKCIPHER_ALG_COMMON macro so that the same
 * layout can be instantiated twice: as the named struct skcipher_alg_common
 * below, and as an anonymous struct overlaying it inside struct skcipher_alg.
 */
#define SKCIPHER_ALG_COMMON {                \
        unsigned int min_keysize;        \
        unsigned int max_keysize;        \
        unsigned int ivsize;                \
        unsigned int chunksize;                \
        unsigned int statesize;                \
                                        \
        struct crypto_alg base;                \
}
struct skcipher_alg_common SKCIPHER_ALG_COMMON;

/**
 * struct skcipher_alg - symmetric key cipher definition
 * @setkey: Set key for the transformation. This function is used to either
 *            program a supplied key into the hardware or store the key in the
 *            transformation context for programming it later. Note that this
 *            function does modify the transformation context. This function can
 *            be called multiple times during the existence of the transformation
 *            object, so one must make sure the key is properly reprogrammed into
 *            the hardware. This function is also responsible for checking the key
 *            length for validity. In case a software fallback was put in place in
 *            the @cra_init call, this function might need to use the fallback if
 *            the algorithm doesn't support all of the key sizes.
 * @encrypt: Encrypt a scatterlist of blocks. This function is used to encrypt
 *             the supplied scatterlist containing the blocks of data. The crypto
 *             API consumer is responsible for aligning the entries of the
 *             scatterlist properly and making sure the chunks are correctly
 *             sized. In case a software fallback was put in place in the
 *             @cra_init call, this function might need to use the fallback if
 *             the algorithm doesn't support all of the key sizes. In case the
 *             key was stored in transformation context, the key might need to be
 *             re-programmed into the hardware in this function. This function
 *             shall not modify the transformation context, as this function may
 *             be called in parallel with the same transformation object.
 * @decrypt: Decrypt a scatterlist of blocks. This is a reverse counterpart to
 *             @encrypt and the conditions are exactly the same.
 * @export: Export partial state of the transformation. This function dumps the
 *            entire state of the ongoing transformation into a provided block of
 *            data so it can be @import 'ed back later on. This is useful in case
 *            you want to save partial result of the transformation after
 *            processing certain amount of data and reload this partial result
 *            multiple times later on for multiple re-use. No data processing
 *            happens at this point.
 * @import: Import partial state of the transformation. This function loads the
 *            entire state of the ongoing transformation from a provided block of
 *            data so the transformation can continue from this point onward. No
 *            data processing happens at this point.
 * @init: Initialize the cryptographic transformation object. This function
 *          is used to initialize the cryptographic transformation object.
 *          This function is called only once at the instantiation time, right
 *          after the transformation context was allocated. In case the
 *          cryptographic hardware has some special requirements which need to
 *          be handled by software, this function shall check for the precise
 *          requirement of the transformation and put any software fallbacks
 *          in place.
 * @exit: Deinitialize the cryptographic transformation object. This is a
 *          counterpart to @init, used to remove various changes set in
 *          @init.
 * @walksize: Equal to the chunk size except in cases where the algorithm is
 *               considerably more efficient if it can operate on multiple chunks
 *               in parallel. Should be a multiple of chunksize.
 * @co: see struct skcipher_alg_common
 *
 * All fields except @ivsize are mandatory and must be filled.
 */
struct skcipher_alg {
        int (*setkey)(struct crypto_skcipher *tfm, const u8 *key,
                      unsigned int keylen);
        int (*encrypt)(struct skcipher_request *req);
        int (*decrypt)(struct skcipher_request *req);
        int (*export)(struct skcipher_request *req, void *out);
        int (*import)(struct skcipher_request *req, const void *in);
        int (*init)(struct crypto_skcipher *tfm);
        void (*exit)(struct crypto_skcipher *tfm);

        unsigned int walksize;

        /*
         * The common properties can be accessed member by member through
         * the anonymous struct (e.g. ->ivsize, ->base) or as a whole
         * through @co; both views overlay the same storage.
         */
        union {
                struct SKCIPHER_ALG_COMMON;
                struct skcipher_alg_common co;
        };
};

/**
 * struct lskcipher_alg - linear symmetric key cipher definition
 * @setkey: Set key for the transformation. This function is used to either
 *            program a supplied key into the hardware or store the key in the
 *            transformation context for programming it later. Note that this
 *            function does modify the transformation context. This function can
 *            be called multiple times during the existence of the transformation
 *            object, so one must make sure the key is properly reprogrammed into
 *            the hardware. This function is also responsible for checking the key
 *            length for validity. In case a software fallback was put in place in
 *            the @cra_init call, this function might need to use the fallback if
 *            the algorithm doesn't support all of the key sizes.
 * @encrypt: Encrypt a number of bytes. This function is used to encrypt
 *             the supplied data.  This function shall not modify
 *             the transformation context, as this function may be called
 *             in parallel with the same transformation object.  Data
 *             may be left over if length is not a multiple of blocks
 *             and there is more to come (final == false).  The number of
 *             left-over bytes should be returned in case of success.
 *             The siv field shall be as long as ivsize + statesize with
 *             the IV placed at the front.  The state will be used by the
 *             algorithm internally.
 * @decrypt: Decrypt a number of bytes. This is a reverse counterpart to
 *             @encrypt and the conditions are exactly the same.
 * @init: Initialize the cryptographic transformation object. This function
 *          is used to initialize the cryptographic transformation object.
 *          This function is called only once at the instantiation time, right
 *          after the transformation context was allocated.
 * @exit: Deinitialize the cryptographic transformation object. This is a
 *          counterpart to @init, used to remove various changes set in
 *          @init.
 * @co: see struct skcipher_alg_common
 */
struct lskcipher_alg {
        int (*setkey)(struct crypto_lskcipher *tfm, const u8 *key,
                      unsigned int keylen);
        /*
         * encrypt/decrypt work on plain byte buffers: @len bytes from @src
         * to @dst.  @siv holds the IV followed by the algorithm state.
         * NOTE(review): @flags presumably carries CRYPTO_LSKCIPHER_FLAG_*
         * bits -- confirm against callers.
         */
        int (*encrypt)(struct crypto_lskcipher *tfm, const u8 *src,
                       u8 *dst, unsigned len, u8 *siv, u32 flags);
        int (*decrypt)(struct crypto_lskcipher *tfm, const u8 *src,
                       u8 *dst, unsigned len, u8 *siv, u32 flags);
        int (*init)(struct crypto_lskcipher *tfm);
        void (*exit)(struct crypto_lskcipher *tfm);

        /* Common properties; the generic crypto_alg is reached as co.base. */
        struct skcipher_alg_common co;
};

/* Bytes of request context reserved by SYNC_SKCIPHER_REQUEST_ON_STACK(). */
#define MAX_SYNC_SKCIPHER_REQSIZE      384
/*
 * Declare an on-stack skcipher request called @name for the sync tfm @_tfm.
 *
 * The macro defines a char buffer __<name>_desc large enough for a struct
 * skcipher_request plus MAX_SYNC_SKCIPHER_REQSIZE bytes of context, and a
 * struct skcipher_request pointer @name to that buffer whose base.tfm is
 * already set to the tfm of @_tfm.
 *
 * This performs a type-check against the "_tfm" argument to make sure
 * all users have the correct skcipher tfm for doing on-stack requests.
 */
#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, _tfm) \
        char __##name##_desc[sizeof(struct skcipher_request) + \
                             MAX_SYNC_SKCIPHER_REQSIZE \
                            ] CRYPTO_MINALIGN_ATTR; \
        struct skcipher_request *name = \
                (((struct skcipher_request *)__##name##_desc)->base.tfm = \
                        crypto_sync_skcipher_tfm((_tfm)), \
                 (void *)__##name##_desc)

/**
 * DOC: Symmetric Key Cipher API
 *
 * Symmetric key cipher API is used with the ciphers of type
 * CRYPTO_ALG_TYPE_SKCIPHER (listed as type "skcipher" in /proc/crypto).
 *
 * Asynchronous cipher operations imply that the function invocation for a
 * cipher request returns immediately before the completion of the operation.
 * The cipher request is scheduled as a separate kernel thread and therefore
 * load-balanced on the different CPUs via the process scheduler. To allow
 * the kernel crypto API to inform the caller about the completion of a cipher
 * request, the caller must provide a callback function. That function is
 * invoked with the cipher handle when the request completes.
 *
 * To support the asynchronous operation, additional information than just the
 * cipher handle must be supplied to the kernel crypto API. That additional
 * information is given by filling in the skcipher_request data structure.
 *
 * For the symmetric key cipher API, the state is maintained with the tfm
 * cipher handle. A single tfm can be used across multiple calls and in
 * parallel. For asynchronous block cipher calls, context data supplied and
 * only used by the caller can be referenced in the request data structure in
 * addition to the IV used for the cipher request. The maintenance of such
 * state information would be important for a crypto driver implementer to
 * have, because when calling the callback function upon completion of the
 * cipher operation, that callback function may need some information about
 * which operation just finished if it invoked multiple in parallel. This
 * state information is unused by the kernel crypto API.
 */

/* Map a generic tfm back to the crypto_skcipher handle that embeds it. */
static inline struct crypto_skcipher *
__crypto_skcipher_cast(struct crypto_tfm *tfm)
{
        struct crypto_skcipher *skcipher;

        skcipher = container_of(tfm, struct crypto_skcipher, base);
        return skcipher;
}

/**
 * crypto_alloc_skcipher() - allocate symmetric key cipher handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              skcipher cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for an skcipher. The returned struct
 * crypto_skcipher is the cipher handle that is required for any subsequent
 * API invocation for that skcipher.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name,
                                              u32 type, u32 mask);

/*
 * Takes the same arguments as crypto_alloc_skcipher() but hands back a
 * struct crypto_sync_skcipher, the handle type that
 * SYNC_SKCIPHER_REQUEST_ON_STACK() accepts.
 * NOTE(review): error reporting presumably follows crypto_alloc_skcipher()
 * (IS_ERR()/PTR_ERR()) -- confirm against the implementation.
 */
struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(const char *alg_name,
                                              u32 type, u32 mask);


/**
 * crypto_alloc_lskcipher() - allocate linear symmetric key cipher handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              lskcipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for an lskcipher. The returned struct
 * crypto_lskcipher is the cipher handle that is required for any subsequent
 * API invocation for that lskcipher.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_lskcipher *crypto_alloc_lskcipher(const char *alg_name,
                                                u32 type, u32 mask);

/* Return the generic tfm embedded in an skcipher handle. */
static inline struct crypto_tfm *
crypto_skcipher_tfm(struct crypto_skcipher *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/* Return the generic tfm embedded in an lskcipher handle. */
static inline struct crypto_tfm *
crypto_lskcipher_tfm(struct crypto_lskcipher *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/* Return the generic tfm of the skcipher wrapped by a sync handle. */
static inline struct crypto_tfm *
crypto_sync_skcipher_tfm(struct crypto_sync_skcipher *tfm)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        return crypto_skcipher_tfm(skcipher);
}

/**
 * crypto_free_skcipher() - zeroize and free cipher handle
 * @tfm: cipher handle to be freed
 *
 * Passing a NULL or error pointer as @tfm is allowed; the function does
 * nothing in that case.
 */
static inline void crypto_free_skcipher(struct crypto_skcipher *tfm)
{
        struct crypto_tfm *base = crypto_skcipher_tfm(tfm);

        crypto_destroy_tfm(tfm, base);
}

/* Free a sync skcipher handle by freeing the skcipher it wraps. */
static inline void
crypto_free_sync_skcipher(struct crypto_sync_skcipher *tfm)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        crypto_free_skcipher(skcipher);
}

/**
 * crypto_free_lskcipher() - zeroize and free cipher handle
 * @tfm: cipher handle to be freed
 *
 * Passing a NULL or error pointer as @tfm is allowed; the function does
 * nothing in that case.
 */
static inline void crypto_free_lskcipher(struct crypto_lskcipher *tfm)
{
        struct crypto_tfm *base = crypto_lskcipher_tfm(tfm);

        crypto_destroy_tfm(tfm, base);
}

/**
 * crypto_has_skcipher() - Search for the availability of an skcipher.
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              skcipher
 * @type: specifies the type of the skcipher
 * @mask: specifies the mask for the skcipher
 *
 * Return: true when the skcipher is known to the kernel crypto API; false
 *           otherwise
 */
int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask);

/* Return the driver name of the algorithm behind an skcipher handle. */
static inline const char *
crypto_skcipher_driver_name(struct crypto_skcipher *tfm)
{
        struct crypto_tfm *base = crypto_skcipher_tfm(tfm);

        return crypto_tfm_alg_driver_name(base);
}

/* Return the driver name of the algorithm behind an lskcipher handle. */
static inline const char *
crypto_lskcipher_driver_name(struct crypto_lskcipher *tfm)
{
        struct crypto_tfm *base = crypto_lskcipher_tfm(tfm);

        return crypto_tfm_alg_driver_name(base);
}

/*
 * Return the skcipher_alg_common that contains the crypto_alg the handle's
 * tfm points to.
 */
static inline struct skcipher_alg_common *
crypto_skcipher_alg_common(struct crypto_skcipher *tfm)
{
        struct crypto_alg *alg = crypto_skcipher_tfm(tfm)->__crt_alg;

        return container_of(alg, struct skcipher_alg_common, base);
}

/*
 * Return the skcipher_alg that contains the crypto_alg the handle's tfm
 * points to.
 */
static inline struct skcipher_alg *
crypto_skcipher_alg(struct crypto_skcipher *tfm)
{
        struct crypto_alg *alg = crypto_skcipher_tfm(tfm)->__crt_alg;

        return container_of(alg, struct skcipher_alg, base);
}

/*
 * Return the lskcipher_alg that contains (as co.base) the crypto_alg the
 * handle's tfm points to.
 */
static inline struct lskcipher_alg *
crypto_lskcipher_alg(struct crypto_lskcipher *tfm)
{
        struct crypto_alg *alg = crypto_lskcipher_tfm(tfm)->__crt_alg;

        return container_of(alg, struct lskcipher_alg, co.base);
}

/**
 * crypto_skcipher_ivsize() - obtain IV size
 * @tfm: cipher handle
 *
 * The size of the IV for the skcipher referenced by the cipher handle is
 * returned. This IV size may be zero if the cipher does not need an IV.
 *
 * Return: IV size in bytes
 */
static inline unsigned int crypto_skcipher_ivsize(struct crypto_skcipher *tfm)
{
        return crypto_skcipher_alg_common(tfm)->ivsize;
}

/* IV size, in bytes, of the skcipher wrapped by a sync handle. */
static inline unsigned int
crypto_sync_skcipher_ivsize(struct crypto_sync_skcipher *tfm)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        return crypto_skcipher_ivsize(skcipher);
}

/**
 * crypto_lskcipher_ivsize() - obtain IV size
 * @tfm: cipher handle
 *
 * The size of the IV for the lskcipher referenced by the cipher handle is
 * returned. This IV size may be zero if the cipher does not need an IV.
 *
 * Return: IV size in bytes
 */
static inline unsigned int crypto_lskcipher_ivsize(
        struct crypto_lskcipher *tfm)
{
        return crypto_lskcipher_alg(tfm)->co.ivsize;
}

/**
 * crypto_skcipher_blocksize() - obtain block size of cipher
 * @tfm: cipher handle
 *
 * Reports the block size of the skcipher that the cipher handle refers to.
 * Callers may use it to size the memory that receives the output of an
 * encryption or decryption operation.
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_skcipher_blocksize(
        struct crypto_skcipher *tfm)
{
        struct crypto_tfm *base = crypto_skcipher_tfm(tfm);

        return crypto_tfm_alg_blocksize(base);
}

/**
 * crypto_lskcipher_blocksize() - obtain block size of cipher
 * @tfm: cipher handle
 *
 * Reports the block size of the lskcipher that the cipher handle refers to.
 * Callers may use it to size the memory that receives the output of an
 * encryption or decryption operation.
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_lskcipher_blocksize(
        struct crypto_lskcipher *tfm)
{
        struct crypto_tfm *base = crypto_lskcipher_tfm(tfm);

        return crypto_tfm_alg_blocksize(base);
}

/**
 * crypto_skcipher_chunksize() - obtain chunk size
 * @tfm: cipher handle
 *
 * The block size is set to one for ciphers such as CTR.  However,
 * you still need to provide incremental updates in multiples of
 * the underlying block size as the IV does not have sub-block
 * granularity.  This is known in this API as the chunk size.
 *
 * Return: chunk size in bytes
 */
static inline unsigned int crypto_skcipher_chunksize(
        struct crypto_skcipher *tfm)
{
        return crypto_skcipher_alg_common(tfm)->chunksize;
}

/**
 * crypto_lskcipher_chunksize() - obtain chunk size
 * @tfm: cipher handle
 *
 * The block size is set to one for ciphers such as CTR.  However,
 * you still need to provide incremental updates in multiples of
 * the underlying block size as the IV does not have sub-block
 * granularity.  This is known in this API as the chunk size.
 *
 * Return: chunk size in bytes
 */
static inline unsigned int crypto_lskcipher_chunksize(
        struct crypto_lskcipher *tfm)
{
        return crypto_lskcipher_alg(tfm)->co.chunksize;
}

/**
 * crypto_skcipher_statesize() - obtain state size
 * @tfm: cipher handle
 *
 * Some algorithms cannot be chained with the IV alone.  They carry
 * internal state which must be replicated if data is to be processed
 * incrementally.  The size of that state can be obtained with this
 * function.
 *
 * Return: state size in bytes
 */
static inline unsigned int crypto_skcipher_statesize(
        struct crypto_skcipher *tfm)
{
        return crypto_skcipher_alg_common(tfm)->statesize;
}

/**
 * crypto_lskcipher_statesize() - obtain state size
 * @tfm: cipher handle
 *
 * Some algorithms cannot be chained with the IV alone.  They carry
 * internal state which must be replicated if data is to be processed
 * incrementally.  The size of that state can be obtained with this
 * function.
 *
 * Return: state size in bytes
 */
static inline unsigned int crypto_lskcipher_statesize(
        struct crypto_lskcipher *tfm)
{
        return crypto_lskcipher_alg(tfm)->co.statesize;
}

/**
 * crypto_sync_skcipher_blocksize() - obtain block size of a sync skcipher
 * @tfm: sync cipher handle
 *
 * Return: crypto_skcipher_blocksize() of the crypto_skcipher embedded as
 *         the base member of @tfm
 */
static inline unsigned int crypto_sync_skcipher_blocksize(
        struct crypto_sync_skcipher *tfm)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        return crypto_skcipher_blocksize(skcipher);
}

/**
 * crypto_skcipher_alignmask() - obtain the alignment mask of an skcipher
 * @tfm: cipher handle
 *
 * Return: the value of crypto_tfm_alg_alignmask() for the crypto_tfm
 *         obtained through crypto_skcipher_tfm()
 */
static inline unsigned int crypto_skcipher_alignmask(
        struct crypto_skcipher *tfm)
{
        struct crypto_tfm *base = crypto_skcipher_tfm(tfm);

        return crypto_tfm_alg_alignmask(base);
}

/**
 * crypto_lskcipher_alignmask() - obtain the alignment mask of an lskcipher
 * @tfm: cipher handle
 *
 * Return: the value of crypto_tfm_alg_alignmask() for the crypto_tfm
 *         obtained through crypto_lskcipher_tfm()
 */
static inline unsigned int crypto_lskcipher_alignmask(
        struct crypto_lskcipher *tfm)
{
        struct crypto_tfm *base = crypto_lskcipher_tfm(tfm);

        return crypto_tfm_alg_alignmask(base);
}

/**
 * crypto_skcipher_get_flags() - read the flags of an skcipher handle
 * @tfm: cipher handle
 *
 * Return: the value of crypto_tfm_get_flags() for the crypto_tfm obtained
 *         through crypto_skcipher_tfm()
 */
static inline u32 crypto_skcipher_get_flags(struct crypto_skcipher *tfm)
{
        struct crypto_tfm *base = crypto_skcipher_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/**
 * crypto_skcipher_set_flags() - set flags on an skcipher handle
 * @tfm: cipher handle
 * @flags: flags handed to crypto_tfm_set_flags()
 *
 * Forwards @flags to crypto_tfm_set_flags() for the crypto_tfm obtained
 * through crypto_skcipher_tfm().
 */
static inline void crypto_skcipher_set_flags(struct crypto_skcipher *tfm,
                                             u32 flags)
{
        struct crypto_tfm *base = crypto_skcipher_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/**
 * crypto_skcipher_clear_flags() - clear flags on an skcipher handle
 * @tfm: cipher handle
 * @flags: flags handed to crypto_tfm_clear_flags()
 *
 * Forwards @flags to crypto_tfm_clear_flags() for the crypto_tfm obtained
 * through crypto_skcipher_tfm().
 */
static inline void crypto_skcipher_clear_flags(struct crypto_skcipher *tfm,
                                               u32 flags)
{
        struct crypto_tfm *base = crypto_skcipher_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_sync_skcipher_get_flags() - read the flags of a sync skcipher
 * @tfm: sync cipher handle
 *
 * Return: crypto_skcipher_get_flags() of the crypto_skcipher embedded as
 *         the base member of @tfm
 */
static inline u32 crypto_sync_skcipher_get_flags(
        struct crypto_sync_skcipher *tfm)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        return crypto_skcipher_get_flags(skcipher);
}

/**
 * crypto_sync_skcipher_set_flags() - set flags on a sync skcipher
 * @tfm: sync cipher handle
 * @flags: flags handed to crypto_skcipher_set_flags()
 *
 * Operates on the crypto_skcipher embedded as the base member of @tfm.
 */
static inline void crypto_sync_skcipher_set_flags(
        struct crypto_sync_skcipher *tfm, u32 flags)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        crypto_skcipher_set_flags(skcipher, flags);
}

/**
 * crypto_sync_skcipher_clear_flags() - clear flags on a sync skcipher
 * @tfm: sync cipher handle
 * @flags: flags handed to crypto_skcipher_clear_flags()
 *
 * Operates on the crypto_skcipher embedded as the base member of @tfm.
 */
static inline void crypto_sync_skcipher_clear_flags(
        struct crypto_sync_skcipher *tfm, u32 flags)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        crypto_skcipher_clear_flags(skcipher, flags);
}

/**
 * crypto_lskcipher_get_flags() - read the flags of an lskcipher handle
 * @tfm: cipher handle
 *
 * Return: the value of crypto_tfm_get_flags() for the crypto_tfm obtained
 *         through crypto_lskcipher_tfm()
 */
static inline u32 crypto_lskcipher_get_flags(struct crypto_lskcipher *tfm)
{
        struct crypto_tfm *base = crypto_lskcipher_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/**
 * crypto_lskcipher_set_flags() - set flags on an lskcipher handle
 * @tfm: cipher handle
 * @flags: flags handed to crypto_tfm_set_flags()
 *
 * Forwards @flags to crypto_tfm_set_flags() for the crypto_tfm obtained
 * through crypto_lskcipher_tfm().
 */
static inline void crypto_lskcipher_set_flags(struct crypto_lskcipher *tfm,
                                              u32 flags)
{
        struct crypto_tfm *base = crypto_lskcipher_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/**
 * crypto_lskcipher_clear_flags() - clear flags on an lskcipher handle
 * @tfm: cipher handle
 * @flags: flags handed to crypto_tfm_clear_flags()
 *
 * Forwards @flags to crypto_tfm_clear_flags() for the crypto_tfm obtained
 * through crypto_lskcipher_tfm().
 */
static inline void crypto_lskcipher_clear_flags(struct crypto_lskcipher *tfm,
                                                u32 flags)
{
        struct crypto_tfm *base = crypto_lskcipher_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_skcipher_setkey() - set key for cipher
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller-provided key is set for the skcipher referenced by the cipher
 * handle.
 *
 * Note that the key length determines the cipher type. Many block ciphers
 * implement different cipher variants depending on the key size, such as
 * AES-128 vs. AES-192 vs. AES-256. When a 16-byte key is provided for an AES
 * cipher handle, AES-128 is performed.
 *
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_skcipher_setkey(struct crypto_skcipher *tfm,
                           const u8 *key, unsigned int keylen);

/**
 * crypto_sync_skcipher_setkey() - set key for a sync skcipher
 * @tfm: sync cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * Return: the result of crypto_skcipher_setkey() on the crypto_skcipher
 *         embedded as the base member of @tfm
 */
static inline int crypto_sync_skcipher_setkey(struct crypto_sync_skcipher *tfm,
                                              const u8 *key,
                                              unsigned int keylen)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        return crypto_skcipher_setkey(skcipher, key, keylen);
}

/**
 * crypto_lskcipher_setkey() - set key for cipher
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller-provided key is set for the lskcipher referenced by the cipher
 * handle.
 *
 * Note that the key length determines the cipher type. Many block ciphers
 * implement different cipher variants depending on the key size, such as
 * AES-128 vs. AES-192 vs. AES-256. When a 16-byte key is provided for an AES
 * cipher handle, AES-128 is performed.
 *
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_lskcipher_setkey(struct crypto_lskcipher *tfm,
                            const u8 *key, unsigned int keylen);

static inline unsigned int crypto_skcipher_min_keysize(
        struct crypto_skcipher *tfm)
{
        return crypto_skcipher_alg_common(tfm)->min_keysize;
}

static inline unsigned int crypto_skcipher_max_keysize(
        struct crypto_skcipher *tfm)
{
        return crypto_skcipher_alg_common(tfm)->max_keysize;
}

static inline unsigned int crypto_lskcipher_min_keysize(
        struct crypto_lskcipher *tfm)
{
        return crypto_lskcipher_alg(tfm)->co.min_keysize;
}

static inline unsigned int crypto_lskcipher_max_keysize(
        struct crypto_lskcipher *tfm)
{
        return crypto_lskcipher_alg(tfm)->co.max_keysize;
}

/**
 * crypto_skcipher_reqtfm() - obtain cipher handle from request
 * @req: skcipher_request out of which the cipher handle is to be obtained
 *
 * Given an skcipher_request data structure, hand back the crypto_skcipher
 * handle recorded in it.
 *
 * Return: crypto_skcipher handle
 */
static inline struct crypto_skcipher *crypto_skcipher_reqtfm(
        struct skcipher_request *req)
{
        struct crypto_tfm *base = req->base.tfm;

        return __crypto_skcipher_cast(base);
}

/**
 * crypto_sync_skcipher_reqtfm() - obtain sync cipher handle from request
 * @req: skcipher_request out of which the cipher handle is to be obtained
 *
 * Return: the crypto_sync_skcipher whose base member is the handle returned
 *         by crypto_skcipher_reqtfm()
 */
static inline struct crypto_sync_skcipher *crypto_sync_skcipher_reqtfm(
        struct skcipher_request *req)
{
        return container_of(crypto_skcipher_reqtfm(req),
                            struct crypto_sync_skcipher, base);
}

/**
 * crypto_skcipher_encrypt() - encrypt plaintext
 * @req: reference to the skcipher_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * Encrypt plaintext data using the skcipher_request handle. How that data
 * structure is filled with data is described with the skcipher_request_*
 * functions.
 *
 * Return: 0 if the cipher operation was successful; < 0 if an error occurred
 */
int crypto_skcipher_encrypt(struct skcipher_request *req);

/**
 * crypto_skcipher_decrypt() - decrypt ciphertext
 * @req: reference to the skcipher_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * Decrypt ciphertext data using the skcipher_request handle. How that data
 * structure is filled with data is described with the skcipher_request_*
 * functions.
 *
 * Return: 0 if the cipher operation was successful; < 0 if an error occurred
 */
int crypto_skcipher_decrypt(struct skcipher_request *req);

/**
 * crypto_skcipher_export() - export partial state
 * @req: reference to the skcipher_request handle that holds all information
 *         needed to perform the operation
 * @out: output buffer of sufficient size that can hold the state
 *
 * Export partial state of the transformation. This function dumps the
 * entire state of the ongoing transformation into a provided block of
 * data so it can be imported back later on with crypto_skcipher_import().
 * This is useful in case you want to save a partial result of the
 * transformation after processing a certain amount of data and reload
 * this partial result multiple times later on for multiple re-use. No
 * data processing happens at this point.
 *
 * Return: 0 if the cipher operation was successful; < 0 if an error occurred
 */
int crypto_skcipher_export(struct skcipher_request *req, void *out);

/**
 * crypto_skcipher_import() - import partial state
 * @req: reference to the skcipher_request handle that holds all information
 *         needed to perform the operation
 * @in: buffer holding the state
 *
 * Import partial state of the transformation. This function loads the
 * entire state of the ongoing transformation from a provided block of
 * data, such as one filled in by crypto_skcipher_export(), so the
 * transformation can continue from this point onward. No data processing
 * happens at this point.
 *
 * Return: 0 if the cipher operation was successful; < 0 if an error occurred
 */
int crypto_skcipher_import(struct skcipher_request *req, const void *in);

/**
 * crypto_lskcipher_encrypt() - encrypt plaintext
 * @tfm: lskcipher handle
 * @src: source buffer
 * @dst: destination buffer
 * @len: number of bytes to process
 * @siv: IV + state for the cipher operation.  The length of the IV must
 *         comply with the IV size defined by crypto_lskcipher_ivsize.  The
 *         IV is then followed with a buffer with the length as specified by
 *         crypto_lskcipher_statesize.
 *
 * Encrypt plaintext data using the lskcipher handle.
 *
 * Return: >=0 if the cipher operation was successful, if positive
 *           then this many bytes have been left unprocessed;
 *           < 0 if an error occurred
 */
int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src,
                             u8 *dst, unsigned len, u8 *siv);

/**
 * crypto_lskcipher_decrypt() - decrypt ciphertext
 * @tfm: lskcipher handle
 * @src: source buffer
 * @dst: destination buffer
 * @len: number of bytes to process
 * @siv: IV + state for the cipher operation.  The length of the IV must
 *         comply with the IV size defined by crypto_lskcipher_ivsize().  The
 *         IV is followed by a buffer whose length is given by
 *         crypto_lskcipher_statesize().
 *
 * Decrypt ciphertext data using the lskcipher handle.
 *
 * Return: >=0 if the cipher operation was successful, if positive
 *           then this many bytes have been left unprocessed;
 *           < 0 if an error occurred
 */
int crypto_lskcipher_decrypt(struct crypto_lskcipher *tfm, const u8 *src,
                             u8 *dst, unsigned len, u8 *siv);

/**
 * DOC: Symmetric Key Cipher Request Handle
 *
 * The skcipher_request data structure contains all pointers to data
 * required for the symmetric key cipher operation. This includes the cipher
 * handle (which can be used by multiple skcipher_request instances), pointer
 * to plaintext and ciphertext, asynchronous callback function, etc. It acts
 * as a handle to the skcipher_request_* API calls in a similar way as
 * skcipher handle to the crypto_skcipher_* API calls.
 */

/**
 * crypto_skcipher_reqsize() - obtain size of the request data structure
 * @tfm: cipher handle
 *
 * Reads the reqsize field stored in the cipher handle.
 *
 * Return: number of bytes
 */
static inline unsigned int crypto_skcipher_reqsize(struct crypto_skcipher *tfm)
{
        unsigned int reqsize = tfm->reqsize;

        return reqsize;
}

/**
 * skcipher_request_set_tfm() - update cipher handle reference in request
 * @req: request handle to be modified
 * @tfm: cipher handle that shall be added to the request handle
 *
 * Lets the caller swap the skcipher handle currently recorded in the
 * request data structure for a different one.
 */
static inline void skcipher_request_set_tfm(struct skcipher_request *req,
                                            struct crypto_skcipher *tfm)
{
        struct crypto_tfm *base = crypto_skcipher_tfm(tfm);

        req->base.tfm = base;
}

/**
 * skcipher_request_set_sync_tfm() - set a sync cipher handle in a request
 * @req: request handle to be modified
 * @tfm: sync cipher handle that shall be added to the request handle
 *
 * Calls skcipher_request_set_tfm() with the crypto_skcipher embedded as the
 * base member of @tfm.
 */
static inline void skcipher_request_set_sync_tfm(struct skcipher_request *req,
                                                 struct crypto_sync_skcipher *tfm)
{
        struct crypto_skcipher *skcipher = &tfm->base;

        skcipher_request_set_tfm(req, skcipher);
}

/**
 * skcipher_request_cast() - map an async request to its skcipher request
 * @req: crypto_async_request embedded in an skcipher_request
 *
 * Return: the struct skcipher_request whose base member is @req
 */
static inline struct skcipher_request *skcipher_request_cast(
        struct crypto_async_request *req)
{
        struct skcipher_request *skreq;

        skreq = container_of(req, struct skcipher_request, base);
        return skreq;
}

/**
 * skcipher_request_alloc() - allocate request data structure
 * @tfm: cipher handle to be registered with the request
 * @gfp: memory allocation flag that is handed to kmalloc by the API call.
 *
 * Allocates the request data structure used with the skcipher encrypt and
 * decrypt API calls.  The allocation covers struct skcipher_request plus
 * crypto_skcipher_reqsize() bytes, and on success the provided skcipher
 * handle is registered in the new request.
 *
 * Return: allocated request handle in case of success, or NULL if out of memory
 */
static inline struct skcipher_request *skcipher_request_alloc_noprof(
        struct crypto_skcipher *tfm, gfp_t gfp)
{
        size_t len = sizeof(struct skcipher_request);
        struct skcipher_request *req;

        len += crypto_skcipher_reqsize(tfm);
        req = kmalloc_noprof(len, gfp);

        /* Only a successful allocation gets the handle registered. */
        if (likely(req))
                skcipher_request_set_tfm(req, tfm);

        return req;
}
#define skcipher_request_alloc(...)        alloc_hooks(skcipher_request_alloc_noprof(__VA_ARGS__))

/**
 * skcipher_request_free() - zeroize and free request data structure
 * @req: request data structure cipher handle to be freed
 *
 * The request is released through kfree_sensitive().
 */
static inline void skcipher_request_free(struct skcipher_request *req)
{
        kfree_sensitive(req);
}

static inline void skcipher_request_zero(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);

        memzero_explicit(req, sizeof(*req) + crypto_skcipher_reqsize(tfm));
}

/**
 * skcipher_request_set_callback() - set asynchronous callback function
 * @req: request handle
 * @flags: specify zero or an ORing of the flags
 *           CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
 *           increase the wait queue beyond the initial maximum size;
 *           CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
 * @compl: callback function pointer to be registered with the request handle
 * @data: pointer to memory that the kernel crypto API itself does not use
 *          but passes on to the callback function. The callback runs
 *          asynchronously to the functionality that issued the request, so
 *          it may need to reach data structures of that functionality; this
 *          pointer is the way to reference them. The callback function can
 *          access the memory via the "data" field in the
 *          crypto_async_request data structure provided to it.
 *
 * Registers the callback function that is triggered once the cipher
 * operation completes.
 *
 * The callback function is registered with the skcipher_request handle and
 * must comply with the following template::
 *
 *        void callback_function(struct crypto_async_request *req, int error)
 */
static inline void skcipher_request_set_callback(struct skcipher_request *req,
                                                 u32 flags,
                                                 crypto_completion_t compl,
                                                 void *data)
{
        struct crypto_async_request *areq = &req->base;

        areq->flags = flags;
        areq->data = data;
        areq->complete = compl;
}

/**
 * skcipher_request_set_crypt() - set data buffers
 * @req: request handle
 * @src: source scatter / gather list
 * @dst: destination scatter / gather list
 * @cryptlen: number of bytes to process from @src
 * @iv: IV for the cipher operation which must comply with the IV size defined
 *      by crypto_skcipher_ivsize
 *
 * Records the source and destination scatter / gather lists, the length
 * and the IV in the request.
 *
 * When encrypting, the source holds the plaintext and the destination
 * receives the ciphertext. When decrypting, the roles are reversed: the
 * source holds the ciphertext and the destination receives the plaintext.
 */
static inline void skcipher_request_set_crypt(
        struct skcipher_request *req,
        struct scatterlist *src, struct scatterlist *dst,
        unsigned int cryptlen, void *iv)
{
        req->iv = iv;
        req->cryptlen = cryptlen;
        req->dst = dst;
        req->src = src;
}

#endif        /* _CRYPTO_SKCIPHER_H */

















































































































































































































































































































   19 































































































































   13 

















































































































































































































































































































































































    1 









   20 













    1 









    3 





   15 



   22 



















    1 










































































































































    1 





















    1 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Hash: Hash algorithms under the crypto API
 * 
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_HASH_H
#define _CRYPTO_HASH_H

#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Set this bit for virtual address instead of SG list. */
#define CRYPTO_AHASH_REQ_VIRT        0x00000001

/*
 * Mask of the CRYPTO_AHASH_REQ_* bits; currently only CRYPTO_AHASH_REQ_VIRT.
 * NOTE(review): presumably the request flag bits reserved for ahash's own
 * use - confirm against the users of this mask.
 */
#define CRYPTO_AHASH_REQ_PRIVATE \
        CRYPTO_AHASH_REQ_VIRT

struct crypto_ahash;

/**
 * DOC: Message Digest Algorithm Definitions
 *
 * These data structures define modular message digest algorithm
 * implementations, managed via crypto_register_ahash(),
 * crypto_register_shash(), crypto_unregister_ahash() and
 * crypto_unregister_shash().
 */

/*
 * struct hash_alg_common - define properties of message digest
 * @digestsize: Size of the result of the transformation. A buffer of this size
 *                must be available to the @final and @finup calls, so they can
 *                store the resulting hash into it. For various predefined sizes,
 *                search include/crypto/ using
 *                git grep _DIGEST_SIZE include/crypto.
 * @statesize: Size of the block for partial state of the transformation. A
 *               buffer of this size must be passed to the @export function as it
 *               will save the partial state of the transformation into it. On the
 *               other side, the @import function will load the state from a
 *               buffer of this size as well.
 * @base: Embedded struct crypto_alg. This common data structure contains
 *          information common to all ciphers; the hash_alg_common data
 *          structure adds the hash-specific information to it.
 *
 * The member list is provided as the HASH_ALG_COMMON macro and expanded
 * right below as the body of struct hash_alg_common.
 * NOTE(review): presumably the macro exists so that other structures can
 * embed the same member list - confirm.
 */
#define HASH_ALG_COMMON {                \
        unsigned int digestsize;        \
        unsigned int statesize;                \
                                        \
        struct crypto_alg base;                \
}
struct hash_alg_common HASH_ALG_COMMON;

/**
 * struct ahash_request - request data structure for asynchronous hashing
 * @base: embedded struct crypto_async_request
 * @nbytes: NOTE(review): presumably the number of input bytes - confirm
 * @src: input given as a scatterlist; shares storage with @svirt
 * @svirt: input given as a virtual address; shares storage with @src.
 *         NOTE(review): presumably selected by CRYPTO_AHASH_REQ_VIRT - confirm
 * @result: NOTE(review): presumably the buffer receiving the digest - confirm
 * @sg_head: two scatterlist entries; their use is not visible in this header
 *           section
 * @saved_complete: a completion callback of type crypto_completion_t.
 *                  NOTE(review): presumably a stashed copy of the caller's
 *                  callback - confirm
 * @saved_data: NOTE(review): presumably the data pointer stashed together
 *              with @saved_complete - confirm
 * @__ctx: flexible array member aligned with CRYPTO_MINALIGN_ATTR; it must
 *         stay the last member
 */
struct ahash_request {
        struct crypto_async_request base;

        unsigned int nbytes;
        union {
                struct scatterlist *src;
                const u8 *svirt;
        };
        u8 *result;

        struct scatterlist sg_head[2];
        crypto_completion_t saved_complete;
        void *saved_data;

        void *__ctx[] CRYPTO_MINALIGN_ATTR;
};

/**
 * struct ahash_alg - asynchronous message digest definition
 * @init: **[mandatory]** Initialize the transformation context. Intended only to initialize the
 *          state of the HASH transformation at the beginning. This shall fill in
 *          the internal structures used during the entire duration of the whole
 *          transformation. No data processing happens at this point. Driver code
 *          implementation must not use req->result.
 * @update: **[mandatory]** Push a chunk of data into the driver for transformation. This
 *           function actually pushes blocks of data from upper layers into the
 *           driver, which then passes those to the hardware as seen fit. This
 *           function must not finalize the HASH transformation by calculating the
 *           final message digest as this only adds more data into the
 *           transformation. This function shall not modify the transformation
 *           context, as this function may be called in parallel with the same
 *           transformation object. Data processing can happen synchronously
 *           [SHASH] or asynchronously [AHASH] at this point. Driver must not use
 *           req->result.
 *           For block-only algorithms, @update must return the number
 *           of bytes to store in the API partial block buffer.
 * @final: **[mandatory]** Retrieve result from the driver. This function finalizes the
 *           transformation and retrieves the resulting hash from the driver and
 *           pushes it back to upper layers. No data processing happens at this
 *           point unless hardware requires it to finish the transformation
 *           (then the data buffered by the device driver is processed).
 * @finup: **[optional]** Combination of @update and @final. This function is effectively a
 *           combination of @update and @final calls issued in sequence. As some
 *           hardware cannot do @update and @final separately, this callback was
 *           added to allow such hardware to be used at least by IPsec. Data
 *           processing can happen synchronously [SHASH] or asynchronously [AHASH]
 *           at this point.
 * @digest: Combination of @init and @update and @final. This function
 *            effectively behaves as the entire chain of operations, @init,
 *            @update and @final issued in sequence. Just like @finup, this was
 *            added for hardware which cannot do even the @finup, but can only do
 *            the whole transformation in one run. Data processing can happen
 *            synchronously [SHASH] or asynchronously [AHASH] at this point.
 * @setkey: Set optional key used by the hashing algorithm. Intended to push
 *            optional key used by the hashing algorithm from upper layers into
 *            the driver. This function can store the key in the transformation
 *            context or can outright program it into the hardware. In the former
 *            case, one must be careful to program the key into the hardware at
 *            appropriate time and one must be careful that .setkey() can be
 *            called multiple times during the existence of the transformation
 *            object. Not all hashing algorithms do implement this function as it
 *            is only needed for keyed message digests. SHAx/MDx/CRCx do NOT
 *            implement this function. HMAC(MDx)/HMAC(SHAx)/CMAC(AES) do implement
 *            this function. This function must be called before any other of the
 *            @init, @update, @final, @finup, @digest is called. No data
 *            processing happens at this point.
 * @export: Export partial state of the transformation. This function dumps the
 *            entire state of the ongoing transformation into a provided block of
 *            data so it can be imported back later on. This is useful in case
 *            you want to save partial result of the transformation after
 *            processing certain amount of data and reload this partial result
 *            multiple times later on for multiple re-use. No data processing
 *            happens at this point. Driver must not use req->result.
 * @import: Import partial state of the transformation. This function loads the
 *            entire state of the ongoing transformation from a provided block of
 *            data so the transformation can continue from this point onward. No
 *            data processing happens at this point. Driver must not use
 *            req->result.
 * @export_core: Export partial state without partial block.  Only defined
 *                 for algorithms that are not block-only.
 * @import_core: Import partial state without partial block.  Only defined
 *                 for algorithms that are not block-only.
 * @init_tfm: Initialize the cryptographic transformation object.
 *              This function is called only once at the instantiation
 *              time, right after the transformation context was
 *              allocated. In case the cryptographic hardware has
 *              some special requirements which need to be handled
 *              by software, this function shall check for the precise
 *              requirement of the transformation and put any software
 *              fallbacks in place.
 * @exit_tfm: Deinitialize the cryptographic transformation object.
 *              This is a counterpart to @init_tfm, used to remove
 *              various changes set in @init_tfm.
 * @clone_tfm: Copy transform into new object, may allocate memory.
 * @halg: see struct hash_alg_common
 */
struct ahash_alg {
        /* Operations that act on a single request. */
        int (*init)(struct ahash_request *req);
        int (*update)(struct ahash_request *req);
        int (*final)(struct ahash_request *req);
        int (*finup)(struct ahash_request *req);
        int (*digest)(struct ahash_request *req);
        /* State transfer: @out is written by export, @in is read by import. */
        int (*export)(struct ahash_request *req, void *out);
        int (*import)(struct ahash_request *req, const void *in);
        int (*export_core)(struct ahash_request *req, void *out);
        int (*import_core)(struct ahash_request *req, const void *in);
        /* Operations that act on the transformation object itself. */
        int (*setkey)(struct crypto_ahash *tfm, const u8 *key,
                      unsigned int keylen);
        int (*init_tfm)(struct crypto_ahash *tfm);
        void (*exit_tfm)(struct crypto_ahash *tfm);
        int (*clone_tfm)(struct crypto_ahash *dst, struct crypto_ahash *src);

        /* Digest/state sizes and the generic algorithm description. */
        struct hash_alg_common halg;
};

/*
 * Operational state handle for the synchronous hash API: a transform
 * pointer followed by a variable-length context area.
 */
struct shash_desc {
        /* Transform this descriptor is used with (read by shash_desc_zero()). */
        struct crypto_shash *tfm;
        /*
         * Flexible array holding the algorithm's context, aligned to
         * ARCH_SLAB_MINALIGN.  shash_desc_ctx() returns it; its length is
         * crypto_shash_descsize() bytes.
         */
        void *__ctx[] __aligned(ARCH_SLAB_MINALIGN);
};

/*
 * NOTE(review): presumably the upper bound, in bytes, on the digest size of
 * any hash algorithm -- confirm against the registration checks.
 */
#define HASH_MAX_DIGESTSIZE         64

/*
 * The size of a core hash state and a partial block.  The final byte
 * is the length of the partial block.
 */
#define HASH_STATE_AND_BLOCK(state, block) ((state) + (block) + 1)


/* Worst case is sha3-224. */
#define HASH_MAX_STATESIZE         HASH_STATE_AND_BLOCK(200, 144)

/* This needs to match arch/s390/crypto/sha.h. */
#define S390_SHA_CTX_SIZE        216

/*
 * Worst case is hmac(sha3-224-s390).  Its context is a nested 'shash_desc'
 * containing a 'struct s390_sha_ctx'.
 */
#define SHA3_224_S390_DESCSIZE        HASH_STATE_AND_BLOCK(S390_SHA_CTX_SIZE, 144)
/*
 * HASH_MAX_DESCSIZE is a shash_desc header plus the worst-case context above;
 * MAX_SYNC_HASH_REQSIZE is an ahash_request header plus HASH_MAX_DESCSIZE.
 */
#define HASH_MAX_DESCSIZE        (sizeof(struct shash_desc) + \
                                 SHA3_224_S390_DESCSIZE)
#define MAX_SYNC_HASH_REQSIZE        (sizeof(struct ahash_request) + \
                                 HASH_MAX_DESCSIZE)

/*
 * Declare, in the current scope, a char buffer named __<shash>_desc of
 * sizeof(struct shash_desc) + HASH_MAX_DESCSIZE bytes, aligned like
 * struct shash_desc, together with a pointer @shash to that buffer typed as
 * struct shash_desc *.  The buffer is not initialized by the macro and the
 * @ctx argument is not referenced by the expansion.  Because it expands to
 * two declarations it can only be used where declarations are permitted.
 */
#define SHASH_DESC_ON_STACK(shash, ctx)                                             \
        char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \
                __aligned(__alignof__(struct shash_desc));                     \
        struct shash_desc *shash = (struct shash_desc *)__##shash##_desc

/*
 * Declare, in the current scope, a char buffer named __<name>_req of
 * sizeof(struct ahash_request) + MAX_SYNC_HASH_REQSIZE bytes carrying
 * CRYPTO_MINALIGN_ATTR, and a struct ahash_request pointer @name that is
 * initialized by passing the buffer and @_tfm to
 * ahash_request_on_stack_init().  Expands to two declarations, so it can
 * only be used where declarations are permitted.
 */
#define HASH_REQUEST_ON_STACK(name, _tfm) \
        char __##name##_req[sizeof(struct ahash_request) + \
                            MAX_SYNC_HASH_REQSIZE] CRYPTO_MINALIGN_ATTR; \
        struct ahash_request *name = \
                ahash_request_on_stack_init(__##name##_req, (_tfm))

/*
 * Clone the request @name that was declared with HASH_REQUEST_ON_STACK().
 * The size handed to ahash_request_clone() is that of the backing buffer
 * __<name>_req, so @name must be the identifier used in the matching
 * HASH_REQUEST_ON_STACK() in the same scope.  @gfp is passed through.
 *
 * The expansion calls ahash_request_clone(), the clone helper this header
 * defines with the (req, total, gfp) signature.
 */
#define HASH_REQUEST_CLONE(name, gfp) \
        ahash_request_clone(name, sizeof(__##name##_req), gfp)

/*
 * Size of a core hash state plus a partial block plus one trailing byte.
 * Each argument is parenthesized so that expressions containing operators
 * of lower precedence than '+' (e.g. ?: or |) are summed as a whole.
 */
#define CRYPTO_HASH_STATESIZE(coresize, blocksize) \
        ((coresize) + (blocksize) + 1)

/**
 * struct shash_alg - synchronous message digest definition
 * @init: see struct ahash_alg
 * @update: see struct ahash_alg
 * @final: see struct ahash_alg
 * @finup: see struct ahash_alg
 * @digest: see struct ahash_alg
 * @export: see struct ahash_alg
 * @import: see struct ahash_alg
 * @export_core: see struct ahash_alg
 * @import_core: see struct ahash_alg
 * @setkey: see struct ahash_alg
 * @init_tfm: Initialize the cryptographic transformation object.
 *              This function is called only once at the instantiation
 *              time, right after the transformation context was
 *              allocated. In case the cryptographic hardware has
 *              some special requirements which need to be handled
 *              by software, this function shall check for the precise
 *              requirement of the transformation and put any software
 *              fallbacks in place.
 * @exit_tfm: Deinitialize the cryptographic transformation object.
 *              This is a counterpart to @init_tfm, used to remove
 *              various changes set in @init_tfm.
 * @clone_tfm: Copy transform into new object, may allocate memory.
 * @descsize: Size of the operational state for the message digest. This state
 *               size is the memory size that needs to be allocated for
 *              shash_desc.__ctx
 * @halg: see struct hash_alg_common
 * @HASH_ALG_COMMON: see struct hash_alg_common
 */
struct shash_alg {
        /* Per-descriptor operations; data is passed as a plain buffer + length. */
        int (*init)(struct shash_desc *desc);
        int (*update)(struct shash_desc *desc, const u8 *data,
                      unsigned int len);
        int (*final)(struct shash_desc *desc, u8 *out);
        int (*finup)(struct shash_desc *desc, const u8 *data,
                     unsigned int len, u8 *out);
        int (*digest)(struct shash_desc *desc, const u8 *data,
                      unsigned int len, u8 *out);
        /* State serialization: export writes to @out, import reads from @in. */
        int (*export)(struct shash_desc *desc, void *out);
        int (*import)(struct shash_desc *desc, const void *in);
        int (*export_core)(struct shash_desc *desc, void *out);
        int (*import_core)(struct shash_desc *desc, const void *in);
        /* Per-transform operations; these act on the crypto_shash handle. */
        int (*setkey)(struct crypto_shash *tfm, const u8 *key,
                      unsigned int keylen);
        int (*init_tfm)(struct crypto_shash *tfm);
        void (*exit_tfm)(struct crypto_shash *tfm);
        int (*clone_tfm)(struct crypto_shash *dst, struct crypto_shash *src);

        /* Bytes needed for shash_desc.__ctx; returned by crypto_shash_descsize(). */
        unsigned int descsize;

        /*
         * HASH_ALG_COMMON is a macro defined outside this excerpt.  The
         * accessors in this header reach ->digestsize, ->statesize and
         * ->base directly on a shash_alg, so the anonymous struct exposes
         * at least those members.  NOTE(review): presumably it mirrors
         * struct hash_alg_common member for member so that .halg aliases
         * it -- confirm against the macro definition.
         */
        union {
                struct HASH_ALG_COMMON;
                struct hash_alg_common halg;
        };
};
#undef HASH_ALG_COMMON

/* Transform handle used by the asynchronous hash API. */
struct crypto_ahash {
        bool using_shash; /* Underlying algorithm is shash, not ahash */
        /* Value returned by crypto_ahash_statesize(). */
        unsigned int statesize;
        /*
         * Value returned by crypto_ahash_reqsize(); added to
         * sizeof(struct ahash_request) when a request is allocated or zeroed.
         */
        unsigned int reqsize;
        /* Embedded generic transform; __crypto_ahash_cast() maps it back. */
        struct crypto_tfm base;
};

/* Transform handle used by the synchronous hash API. */
struct crypto_shash {
        /* Embedded generic transform; crypto_shash_tfm() returns its address. */
        struct crypto_tfm base;
};

/**
 * DOC: Asynchronous Message Digest API
 *
 * The asynchronous message digest API is used with the ciphers of type
 * CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto)
 *
 * The asynchronous cipher operation discussion provided for the
 * CRYPTO_ALG_TYPE_SKCIPHER API applies here as well.
 */

/* Pass @req's embedded base request to crypto_req_on_stack() and return its answer. */
static inline bool ahash_req_on_stack(struct ahash_request *req)
{
        struct crypto_async_request *base = &req->base;

        return crypto_req_on_stack(base);
}

/* Map an embedded crypto_tfm back to the crypto_ahash whose 'base' member it is. */
static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm)
{
        struct crypto_ahash *ahash;

        ahash = container_of(tfm, struct crypto_ahash, base);
        return ahash;
}

/**
 * crypto_alloc_ahash() - allocate ahash cipher handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              ahash cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for an ahash. The returned struct
 * crypto_ahash is the cipher handle that is required for any subsequent
 * API invocation for that ahash.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type,
                                        u32 mask);

struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *tfm);

/* Return the address of the generic transform embedded in @tfm. */
static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/**
 * crypto_free_ahash() - zeroize and free the ahash handle
 * @tfm: cipher handle to be freed
 *
 * If @tfm is a NULL or error pointer, this function does nothing.
 */
static inline void crypto_free_ahash(struct crypto_ahash *tfm)
{
        /* Only the address of the embedded member is formed here. */
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        crypto_destroy_tfm(tfm, base);
}

/**
 * crypto_has_ahash() - Search for the availability of an ahash.
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              ahash
 * @type: specifies the type of the ahash
 * @mask: specifies the mask for the ahash
 *
 * Return: true when the ahash is known to the kernel crypto API; false
 *           otherwise
 */
int crypto_has_ahash(const char *alg_name, u32 type, u32 mask);

/* Return the algorithm name reported by crypto_tfm_alg_name() for @tfm. */
static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_alg_name(base);
}

/* Return the driver name reported by crypto_tfm_alg_driver_name() for @tfm. */
static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_alg_driver_name(base);
}

/**
 * crypto_ahash_blocksize() - obtain block size for cipher
 * @tfm: cipher handle
 *
 * The block size for the message digest cipher referenced with the cipher
 * handle is returned.
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_ahash_blocksize(struct crypto_ahash *tfm)
{
        /* The block size is a property of the generic transform. */
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_alg_blocksize(base);
}

/* Map a crypto_alg back to the hash_alg_common whose 'base' member it is. */
static inline struct hash_alg_common *__crypto_hash_alg_common(
        struct crypto_alg *alg)
{
        struct hash_alg_common *halg;

        halg = container_of(alg, struct hash_alg_common, base);
        return halg;
}

static inline struct hash_alg_common *crypto_hash_alg_common(
        struct crypto_ahash *tfm)
{
        return __crypto_hash_alg_common(crypto_ahash_tfm(tfm)->__crt_alg);
}

/**
 * crypto_ahash_digestsize() - obtain message digest size
 * @tfm: cipher handle
 *
 * The size for the message digest created by the message digest cipher
 * referenced with the cipher handle is returned.
 *
 *
 * Return: message digest size of cipher
 */
static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm)
{
        return crypto_hash_alg_common(tfm)->digestsize;
}

/**
 * crypto_ahash_statesize() - obtain size of the ahash state
 * @tfm: cipher handle
 *
 * Return the size of the ahash state. With the crypto_ahash_export()
 * function, the caller can export the state into a buffer whose size is
 * defined with this function.
 *
 * Return: size of the ahash state
 */
static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm)
{
        /* Read straight from the handle; no algorithm lookup is involved. */
        unsigned int size = tfm->statesize;

        return size;
}

/* Return the flags of the generic transform embedded in @tfm. */
static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/* Apply crypto_tfm_set_flags() with @flags to the generic transform embedded in @tfm. */
static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/* Apply crypto_tfm_clear_flags() with @flags to the generic transform embedded in @tfm. */
static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_ahash_reqtfm() - obtain cipher handle from request
 * @req: asynchronous request handle that contains the reference to the ahash
 *         cipher handle
 *
 * Return the ahash cipher handle that is registered with the asynchronous
 * request handle ahash_request.
 *
 * Return: ahash cipher handle
 */
static inline struct crypto_ahash *crypto_ahash_reqtfm(
        struct ahash_request *req)
{
        /* The request stores the generic transform; map it to the ahash handle. */
        struct crypto_tfm *base_tfm = req->base.tfm;

        return __crypto_ahash_cast(base_tfm);
}

/**
 * crypto_ahash_reqsize() - obtain size of the request data structure
 * @tfm: cipher handle
 *
 * Return: size of the request data
 */
static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm)
{
        /* Read straight from the handle. */
        unsigned int size = tfm->reqsize;

        return size;
}

/* Return the context area (__ctx) of @req as an untyped pointer. */
static inline void *ahash_request_ctx(struct ahash_request *req)
{
        void *ctx = req->__ctx;

        return ctx;
}

/**
 * crypto_ahash_setkey - set key for cipher handle
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller provided key is set for the ahash cipher. The cipher
 * handle must point to a keyed hash in order for this function to succeed.
 *
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
                        unsigned int keylen);

/**
 * crypto_ahash_finup() - update and finalize message digest
 * @req: reference to the ahash_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * This function is a "short-hand" for the function calls of
 * crypto_ahash_update and crypto_ahash_final. The parameters have the same
 * meaning as discussed for those separate functions.
 *
 * Return: see crypto_ahash_final()
 */
int crypto_ahash_finup(struct ahash_request *req);

/**
 * crypto_ahash_final() - calculate message digest
 * @req: reference to the ahash_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * Finalize the message digest operation and create the message digest
 * based on all data added to the cipher handle. The message digest is placed
 * into the output buffer registered with the ahash_request handle.
 *
 * Return:
 * 0                if the message digest was successfully calculated;
 * -EINPROGRESS        if data is fed into hardware (DMA) or queued for later;
 * -EBUSY        if queue is full and request should be resubmitted later;
 * other < 0        if an error occurred
 */
static inline int crypto_ahash_final(struct ahash_request *req)
{
        int err;

        /* Finalizing is a finup with no further input bytes. */
        req->nbytes = 0;
        err = crypto_ahash_finup(req);

        return err;
}

/**
 * crypto_ahash_digest() - calculate message digest for a buffer
 * @req: reference to the ahash_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * This function is a "short-hand" for the function calls of crypto_ahash_init,
 * crypto_ahash_update and crypto_ahash_final. The parameters have the same
 * meaning as discussed for those separate three functions.
 *
 * Return: see crypto_ahash_final()
 */
int crypto_ahash_digest(struct ahash_request *req);

/**
 * crypto_ahash_export() - extract current message digest state
 * @req: reference to the ahash_request handle whose state is exported
 * @out: output buffer of sufficient size that can hold the hash state
 *
 * This function exports the hash state of the ahash_request handle into the
 * caller-allocated output buffer out which must have sufficient size (e.g. by
 * calling crypto_ahash_statesize()).
 *
 * Return: 0 if the export was successful; < 0 if an error occurred
 */
int crypto_ahash_export(struct ahash_request *req, void *out);

/**
 * crypto_ahash_import() - import message digest state
 * @req: reference to ahash_request handle the state is imported into
 * @in: buffer holding the state
 *
 * This function imports the hash state into the ahash_request handle from the
 * input buffer. That buffer should have been generated with the
 * crypto_ahash_export function.
 *
 * Return: 0 if the import was successful; < 0 if an error occurred
 */
int crypto_ahash_import(struct ahash_request *req, const void *in);

/**
 * crypto_ahash_init() - (re)initialize message digest handle
 * @req: ahash_request handle that already is initialized with all necessary
 *         data using the ahash_request_* API functions
 *
 * The call (re-)initializes the message digest referenced by the ahash_request
 * handle. Any potentially existing state created by previous operations is
 * discarded.
 *
 * Return: see crypto_ahash_final()
 */
int crypto_ahash_init(struct ahash_request *req);

/**
 * crypto_ahash_update() - add data to message digest for processing
 * @req: ahash_request handle that was previously initialized with the
 *         crypto_ahash_init call.
 *
 * Updates the message digest state of the &ahash_request handle. The input data
 * is pointed to by the scatter/gather list registered in the &ahash_request
 * handle
 *
 * Return: see crypto_ahash_final()
 */
int crypto_ahash_update(struct ahash_request *req);

/**
 * DOC: Asynchronous Hash Request Handle
 *
 * The &ahash_request data structure contains all pointers to data
 * required for the asynchronous cipher operation. This includes the cipher
 * handle (which can be used by multiple &ahash_request instances), pointer
 * to plaintext and the message digest output buffer, asynchronous callback
 * function, etc. It acts as a handle to the ahash_request_* API calls in a
 * similar way as ahash handle to the crypto_ahash_* API calls.
 */

/**
 * ahash_request_set_tfm() - update cipher handle reference in request
 * @req: request handle to be modified
 * @tfm: cipher handle that shall be added to the request handle
 *
 * Allow the caller to replace the existing ahash handle in the request
 * data structure with a different one.
 */
static inline void ahash_request_set_tfm(struct ahash_request *req,
                                         struct crypto_ahash *tfm)
{
        /* The generic request layer works on the embedded base objects. */
        struct crypto_tfm *base_tfm = crypto_ahash_tfm(tfm);

        crypto_request_set_tfm(&req->base, base_tfm);
}

/**
 * ahash_request_alloc() - allocate request data structure
 * @tfm: cipher handle to be registered with the request
 * @gfp: memory allocation flag that is handed to kmalloc by the API call.
 *
 * Allocate the request data structure that must be used with the ahash
 * message digest API calls. During
 * the allocation, the provided ahash handle
 * is registered in the request data structure.
 *
 * Return: allocated request handle in case of success, or NULL if out of memory
 */
static inline struct ahash_request *ahash_request_alloc_noprof(
        struct crypto_ahash *tfm, gfp_t gfp)
{
        /* Request header followed by the transform's per-request context. */
        size_t len = sizeof(struct ahash_request) + crypto_ahash_reqsize(tfm);
        struct ahash_request *req = kmalloc_noprof(len, gfp);

        if (unlikely(!req))
                return NULL;

        /* Register the handle with the freshly allocated request. */
        ahash_request_set_tfm(req, tfm);
        return req;
}
#define ahash_request_alloc(...)        alloc_hooks(ahash_request_alloc_noprof(__VA_ARGS__))

/**
 * ahash_request_free() - zeroize and free the request data structure
 * @req: request data structure to be zeroized and freed
 */
void ahash_request_free(struct ahash_request *req);

static inline void ahash_request_zero(struct ahash_request *req)
{
        memzero_explicit(req, sizeof(*req) +
                              crypto_ahash_reqsize(crypto_ahash_reqtfm(req)));
}

/* Map a crypto_async_request back to the ahash_request whose 'base' member it is. */
static inline struct ahash_request *ahash_request_cast(
        struct crypto_async_request *req)
{
        struct ahash_request *areq;

        areq = container_of(req, struct ahash_request, base);
        return areq;
}

/**
 * ahash_request_set_callback() - set asynchronous callback function
 * @req: request handle
 * @flags: specify zero or an ORing of the flags
 *           CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
 *           increase the wait queue beyond the initial maximum size;
 *           CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
 * @compl: callback function pointer to be registered with the request handle
 * @data: The data pointer refers to memory that is not used by the kernel
 *          crypto API, but provided to the callback function for it to use. Here,
 *          the caller can provide a reference to memory the callback function can
 *          operate on. As the callback function is invoked asynchronously to the
 *          related functionality, it may need to access data structures of the
 *          related functionality which can be referenced using this pointer. The
 *          callback function can access the memory via the "data" field in the
 *          &crypto_async_request data structure provided to the callback function.
 *
 * This function allows setting the callback function that is triggered once
 * the cipher operation completes.
 *
 * The callback function is registered with the &ahash_request handle and
 * must comply with the following template::
 *
 *        void callback_function(struct crypto_async_request *req, int error)
 */
static inline void ahash_request_set_callback(struct ahash_request *req,
                                              u32 flags,
                                              crypto_completion_t compl,
                                              void *data)
{
        /* Bits in CRYPTO_AHASH_REQ_PRIVATE are kept from the request as-is... */
        u32 kept = req->base.flags & CRYPTO_AHASH_REQ_PRIVATE;
        /* ...and the caller cannot set or clear them through @flags. */
        u32 requested = flags & ~CRYPTO_AHASH_REQ_PRIVATE;

        crypto_request_set_callback(&req->base, requested | kept, compl, data);
}

/**
 * ahash_request_set_crypt() - set data buffers
 * @req: ahash_request handle to be updated
 * @src: source scatter/gather list
 * @result: buffer that is filled with the message digest -- the caller must
 *            ensure that the buffer has sufficient space by, for example, calling
 *            crypto_ahash_digestsize()
 * @nbytes: number of bytes to process from the source scatter/gather list
 *
 * By using this call, the caller references the source scatter/gather list.
 * The source scatter/gather list points to the data the message digest is to
 * be calculated for.
 */
static inline void ahash_request_set_crypt(struct ahash_request *req,
                                           struct scatterlist *src, u8 *result,
                                           unsigned int nbytes)
{
        /* Mark the source as a scatter/gather list, not a virtual address. */
        req->base.flags &= ~CRYPTO_AHASH_REQ_VIRT;

        req->src = src;
        req->result = result;
        req->nbytes = nbytes;
}

/**
 * ahash_request_set_virt() - set virtual address data buffers
 * @req: ahash_request handle to be updated
 * @src: source virtual address
 * @result: buffer that is filled with the message digest -- the caller must
 *            ensure that the buffer has sufficient space by, for example, calling
 *            crypto_ahash_digestsize()
 * @nbytes: number of bytes to process from the source virtual address
 *
 * By using this call, the caller references the source virtual address.
 * The source virtual address points to the data the message digest is to
 * be calculated for.
 */
static inline void ahash_request_set_virt(struct ahash_request *req,
                                          const u8 *src, u8 *result,
                                          unsigned int nbytes)
{
        /* Mark the source as a virtual address, not a scatter/gather list. */
        req->base.flags |= CRYPTO_AHASH_REQ_VIRT;

        req->svirt = src;
        req->result = result;
        req->nbytes = nbytes;
}

/**
 * DOC: Synchronous Message Digest API
 *
 * The synchronous message digest API is used with the ciphers of type
 * CRYPTO_ALG_TYPE_SHASH (listed as type "shash" in /proc/crypto)
 *
 * The message digest API is able to maintain state information for the
 * caller.
 *
 * The synchronous message digest API can store user-related context in its
 * shash_desc request data structure.
 */

/**
 * crypto_alloc_shash() - allocate message digest handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              message digest cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for a message digest. The returned &struct
 * crypto_shash is the cipher handle that is required for any subsequent
 * API invocation for that message digest.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
                                        u32 mask);

struct crypto_shash *crypto_clone_shash(struct crypto_shash *tfm);

int crypto_has_shash(const char *alg_name, u32 type, u32 mask);

/* Return the address of the generic transform embedded in @tfm. */
static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/**
 * crypto_free_shash() - zeroize and free the message digest handle
 * @tfm: cipher handle to be freed
 *
 * If @tfm is a NULL or error pointer, this function does nothing.
 */
static inline void crypto_free_shash(struct crypto_shash *tfm)
{
        /* Only the address of the embedded member is formed here. */
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        crypto_destroy_tfm(tfm, base);
}

/* Return the algorithm name reported by crypto_tfm_alg_name() for @tfm. */
static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_alg_name(base);
}

/* Return the driver name reported by crypto_tfm_alg_driver_name() for @tfm. */
static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_alg_driver_name(base);
}

/**
 * crypto_shash_blocksize() - obtain block size for cipher
 * @tfm: cipher handle
 *
 * The block size for the message digest cipher referenced with the cipher
 * handle is returned.
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm)
{
        /* The block size is a property of the generic transform. */
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_alg_blocksize(base);
}

/* Map a crypto_alg back to the shash_alg whose 'base' member it is. */
static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg)
{
        struct shash_alg *salg;

        salg = container_of(alg, struct shash_alg, base);
        return salg;
}

static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm)
{
        return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg);
}

/**
 * crypto_shash_digestsize() - obtain message digest size
 * @tfm: cipher handle
 *
 * The size for the message digest created by the message digest cipher
 * referenced with the cipher handle is returned.
 *
 * Return: digest size of cipher
 */
static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm)
{
        return crypto_shash_alg(tfm)->digestsize;
}

static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm)
{
        return crypto_shash_alg(tfm)->statesize;
}

/* Return the flags of the generic transform embedded in @tfm. */
static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/* Apply crypto_tfm_set_flags() with @flags to the generic transform embedded in @tfm. */
static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/* Apply crypto_tfm_clear_flags() with @flags to the generic transform embedded in @tfm. */
static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_shash_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_shash_descsize() - obtain the operational state size
 * @tfm: cipher handle
 *
 * The size of the operational state the cipher needs during operation is
 * returned for the hash referenced with the cipher handle. This size is
 * required to calculate the memory requirements to allow the caller allocating
 * sufficient memory for operational state.
 *
 * The operational state is defined with struct shash_desc where the size of
 * that data structure is to be calculated as
 * sizeof(struct shash_desc) + crypto_shash_descsize(alg)
 *
 * Return: size of the operational state
 */
static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm)
{
        return crypto_shash_alg(tfm)->descsize;
}

/* Return the context area (__ctx) of @desc as an untyped pointer. */
static inline void *shash_desc_ctx(struct shash_desc *desc)
{
        void *ctx = desc->__ctx;

        return ctx;
}

/**
 * crypto_shash_setkey() - set key for message digest
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller provided key is set for the keyed message digest cipher. The
 * cipher handle must point to a keyed message digest cipher in order for this
 * function to succeed.
 *
 * Context: Softirq or process context.
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
                        unsigned int keylen);

/**
 * crypto_shash_digest() - calculate message digest for buffer
 * @desc: see crypto_shash_final()
 * @data: see crypto_shash_update()
 * @len: see crypto_shash_update()
 * @out: see crypto_shash_final()
 *
 * This function is a "short-hand" for the function calls of crypto_shash_init,
 * crypto_shash_update and crypto_shash_final. The parameters have the same
 * meaning as discussed for those separate three functions.
 *
 * Context: Softirq or process context.
 * Return: 0 if the message digest creation was successful; < 0 if an error
 *           occurred
 */
int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
                        unsigned int len, u8 *out);

/**
 * crypto_shash_tfm_digest() - calculate message digest for buffer
 * @tfm: hash transformation object
 * @data: see crypto_shash_update()
 * @len: see crypto_shash_update()
 * @out: see crypto_shash_final()
 *
 * This is a simplified version of crypto_shash_digest() for users who don't
 * want to allocate their own hash descriptor (shash_desc).  Instead,
 * crypto_shash_tfm_digest() takes a hash transformation object (crypto_shash)
 * directly, and it allocates a hash descriptor on the stack internally.
 * Note that this stack allocation may be fairly large.
 *
 * Context: Softirq or process context.
 * Return: 0 on success; < 0 if an error occurred.
 */
int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data,
                            unsigned int len, u8 *out);

int crypto_hash_digest(struct crypto_ahash *tfm, const u8 *data,
                       unsigned int len, u8 *out);

/**
 * crypto_shash_export() - extract operational state for message digest
 * @desc: reference to the operational state handle whose state is exported
 * @out: output buffer of sufficient size that can hold the hash state
 *
 * This function exports the hash state of the operational state handle into the
 * caller-allocated output buffer out which must have sufficient size (e.g. by
 * calling crypto_shash_statesize()).
 *
 * Context: Softirq or process context.
 * Return: 0 if the export was successful; < 0 if an error occurred
 */
int crypto_shash_export(struct shash_desc *desc, void *out);

/**
 * crypto_shash_import() - import operational state
 * @desc: reference to the operational state handle the state is imported into
 * @in: buffer holding the state
 *
 * This function imports the hash state into the operational state handle from
 * the input buffer. That buffer should have been generated with the
 * crypto_shash_export function.
 *
 * Context: Softirq or process context.
 * Return: 0 if the import was successful; < 0 if an error occurred
 */
int crypto_shash_import(struct shash_desc *desc, const void *in);

/**
 * crypto_shash_init() - (re)initialize message digest
 * @desc: operational state handle that is already filled
 *
 * The call (re-)initializes the message digest referenced by the
 * operational state handle. Any potentially existing state created by
 * previous operations is discarded.
 *
 * Context: Softirq or process context.
 * Return: 0 if the message digest initialization was successful; < 0 if an
 *           error occurred
 */
int crypto_shash_init(struct shash_desc *desc);

/**
 * crypto_shash_finup() - calculate message digest of buffer
 * @desc: see crypto_shash_final()
 * @data: see crypto_shash_update()
 * @len: see crypto_shash_update()
 * @out: see crypto_shash_final()
 *
 * This function is a "short-hand" for the function calls of
 * crypto_shash_update and crypto_shash_final. The parameters have the same
 * meaning as discussed for those separate functions.
 *
 * Context: Softirq or process context.
 * Return: 0 if the message digest creation was successful; < 0 if an error
 *           occurred
 */
int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
                       unsigned int len, u8 *out);

/**
 * crypto_shash_update() - add data to message digest for processing
 * @desc: operational state handle that is already initialized
 * @data: input data to be added to the message digest
 * @len: length of the input data
 *
 * Updates the message digest state of the operational state handle.
 *
 * Context: Softirq or process context.
 * Return: 0 if the message digest update was successful; < 0 if an error
 *           occurred
 */
static inline int crypto_shash_update(struct shash_desc *desc, const u8 *data,
                                      unsigned int len)
{
        int err;

        /* An update is a finup with no output buffer. */
        err = crypto_shash_finup(desc, data, len, NULL);

        return err;
}

/**
 * crypto_shash_final() - calculate message digest
 * @desc: operational state handle that is already filled with data
 * @out: output buffer filled with the message digest
 *
 * Finalize the message digest operation and create the message digest
 * based on all data added to the cipher handle. The message digest is placed
 * into the output buffer. The caller must ensure that the output buffer is
 * large enough by using crypto_shash_digestsize.
 *
 * Context: Softirq or process context.
 * Return: 0 if the message digest creation was successful; < 0 if an error
 *           occurred
 */
static inline int crypto_shash_final(struct shash_desc *desc, u8 *out)
{
        int err;

        /* Finalizing is a finup with no further input. */
        err = crypto_shash_finup(desc, NULL, 0, out);

        return err;
}

static inline void shash_desc_zero(struct shash_desc *desc)
{
        memzero_explicit(desc,
                         sizeof(*desc) + crypto_shash_descsize(desc->tfm));
}

/* Return what crypto_tfm_is_async() reports for the transform embedded in @tfm. */
static inline bool ahash_is_async(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_is_async(base);
}

static inline struct ahash_request *ahash_request_on_stack_init(
        char *buf, struct crypto_ahash *tfm)
{
        struct ahash_request *req = (void *)buf;

        crypto_stack_request_init(&req->base, crypto_ahash_tfm(tfm));
        return req;
}

/*
 * Clone @req by handing its base request, @total and @gfp to
 * crypto_request_clone(), then map the result back to an ahash_request.
 */
static inline struct ahash_request *ahash_request_clone(
        struct ahash_request *req, size_t total, gfp_t gfp)
{
        struct crypto_async_request *dup;

        dup = crypto_request_clone(&req->base, total, gfp);
        return container_of(dup, struct ahash_request, base);
}

#endif        /* _CRYPTO_HASH_H */















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2023 Isovalent */
#ifndef __NET_TCX_H
#define __NET_TCX_H

#include <linux/bpf.h>
#include <linux/bpf_mprog.h>

#include <net/sch_generic.h>

struct mini_Qdisc;

/*
 * Container around a bpf_mprog bundle. tcx_entry() maps a bpf_mprog_entry
 * back to this structure through the entry's ->parent bundle pointer.
 */
struct tcx_entry {
        /* RCU-annotated mini qdisc pointer. */
        struct mini_Qdisc __rcu *miniq;
        /* Embedded bundle; tcx_entry_create_noprof() returns &bundle.a. */
        struct bpf_mprog_bundle bundle;
        /* Counter adjusted by tcx_miniq_inc()/tcx_miniq_dec(). */
        u32 miniq_active;
        /* RCU head handed to kfree_rcu() in tcx_entry_free(). */
        struct rcu_head rcu;
};

/* bpf_link wrapper; tcx_link() converts the embedded link back to it. */
struct tcx_link {
        struct bpf_link link;
        /* NOTE(review): presumably the device the link is attached to - confirm. */
        struct net_device *dev;
};

/*
 * Store @ingress in skb->tc_at_ingress. Compiles to nothing when
 * CONFIG_NET_XGRESS is not set.
 */
static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress)
{
#ifdef CONFIG_NET_XGRESS
        skb->tc_at_ingress = ingress;
#endif
}

#ifdef CONFIG_NET_XGRESS
/* Map a bpf_mprog_entry to the tcx_entry embedding its parent bundle. */
static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry)
{
        return container_of(entry->parent, struct tcx_entry, bundle);
}

/* Convert an embedded bpf_link pointer back to its enclosing tcx_link. */
static inline struct tcx_link *tcx_link(const struct bpf_link *link)
{
        return container_of(link, struct tcx_link, link);
}

void tcx_inc(void);
void tcx_dec(void);

/* Wait for an RCU grace period after a bpf_mprog_entry a/b swap. */
static inline void tcx_entry_sync(void)
{
        /* bpf_mprog_entry got a/b swapped, therefore ensure that
         * there are no inflight users on the old one anymore.
         */
        synchronize_rcu();
}

/*
 * Publish @entry as the device's ingress or egress tcx entry using
 * rcu_assign_pointer(). Asserts that RTNL is held.
 */
static inline void
tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry,
                 bool ingress)
{
        ASSERT_RTNL();

        if (!ingress) {
                rcu_assign_pointer(dev->tcx_egress, entry);
                return;
        }

        rcu_assign_pointer(dev->tcx_ingress, entry);
}

/*
 * Read the device's ingress or egress tcx entry with
 * rcu_dereference_rtnl(). Asserts that RTNL is held.
 */
static inline struct bpf_mprog_entry *
tcx_entry_fetch(struct net_device *dev, bool ingress)
{
        struct bpf_mprog_entry *entry;

        ASSERT_RTNL();

        if (ingress)
                entry = rcu_dereference_rtnl(dev->tcx_ingress);
        else
                entry = rcu_dereference_rtnl(dev->tcx_egress);

        return entry;
}

/*
 * Allocate a zeroed tcx_entry with GFP_KERNEL, initialise its bundle and
 * return the bundle's "a" entry, or NULL if the allocation fails.
 */
static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void)
{
        struct tcx_entry *tcx;

        tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL);
        if (!tcx)
                return NULL;

        bpf_mprog_bundle_init(&tcx->bundle);
        return &tcx->bundle.a;
}
#define tcx_entry_create(...)        alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__))

/* Free the tcx_entry that owns @entry via kfree_rcu(). */
static inline void tcx_entry_free(struct bpf_mprog_entry *entry)
{
        struct tcx_entry *tcx = tcx_entry(entry);

        kfree_rcu(tcx, rcu);
}

/*
 * Return the device's existing tcx entry for the given direction, or a
 * newly created one if none is installed. *@created is true only when a
 * new entry was allocated here. Returns NULL if creation fails.
 */
static inline struct bpf_mprog_entry *
tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created)
{
        struct bpf_mprog_entry *entry;

        entry = tcx_entry_fetch(dev, ingress);
        *created = false;
        if (entry)
                return entry;

        entry = tcx_entry_create();
        if (entry)
                *created = true;

        return entry;
}

/* Call tcx_inc(), then bump the ingress or egress queue count. */
static inline void tcx_skeys_inc(bool ingress)
{
        tcx_inc();

        if (!ingress) {
                net_inc_egress_queue();
                return;
        }

        net_inc_ingress_queue();
}

/* Drop the ingress or egress queue count, then call tcx_dec(). */
static inline void tcx_skeys_dec(bool ingress)
{
        if (!ingress)
                net_dec_egress_queue();
        else
                net_dec_ingress_queue();

        tcx_dec();
}

static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry)
{
        ASSERT_RTNL();
        tcx_entry(entry)->miniq_active++;
}

static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry)
{
        ASSERT_RTNL();
        tcx_entry(entry)->miniq_active--;
}

static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)
{
        ASSERT_RTNL();
        return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active;
}

/*
 * Normalise a program return @code. TCX_PASS, TCX_DROP and TCX_REDIRECT
 * are returned unchanged; anything else becomes TCX_NEXT. For TCX_PASS the
 * skb's tc_index is additionally loaded from its qdisc cb tc_classid.
 */
static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb,
                                                   int code)
{
        if (code == TCX_PASS) {
                skb->tc_index = qdisc_skb_cb(skb)->tc_classid;
                return code;
        }

        if (code == TCX_DROP || code == TCX_REDIRECT)
                return code;

        /* TCX_NEXT and every unknown value. */
        return TCX_NEXT;
}
#endif /* CONFIG_NET_XGRESS */

#if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL)
int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
void tcx_uninstall(struct net_device *dev, bool ingress);

int tcx_prog_query(const union bpf_attr *attr,
                   union bpf_attr __user *uattr);

/*
 * Call tcx_uninstall() for the device's ingress side, then its egress
 * side. Asserts that RTNL is held.
 */
static inline void dev_tcx_uninstall(struct net_device *dev)
{
        ASSERT_RTNL();
        tcx_uninstall(dev, true);
        tcx_uninstall(dev, false);
}
#else
/*
 * Stub used unless both CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are set:
 * always returns -EINVAL.
 */
static inline int tcx_prog_attach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/*
 * Stub used unless both CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are set:
 * always returns -EINVAL.
 */
static inline int tcx_link_attach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/*
 * Stub used unless both CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are set:
 * always returns -EINVAL.
 */
static inline int tcx_prog_detach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/*
 * Stub used unless both CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are set:
 * always returns -EINVAL.
 */
static inline int tcx_prog_query(const union bpf_attr *attr,
                                 union bpf_attr __user *uattr)
{
        return -EINVAL;
}

/*
 * Stub used unless both CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are set:
 * does nothing.
 */
static inline void dev_tcx_uninstall(struct net_device *dev)
{
}
#endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */
#endif /* __NET_TCX_H */
















    8 









    8 



    7 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/lib/kasprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/stdarg.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>

/*
 * Simplified asprintf.
 *
 * The format is evaluated twice: first against a copy of @ap with a NULL
 * buffer to learn the output length, then into a buffer of that length plus
 * one byte obtained from kmalloc_track_caller(). Returns the buffer, or
 * NULL if the allocation fails. A WARN fires when the two passes report
 * different lengths.
 */
char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{
        unsigned int needed, written;
        va_list probe;
        char *buf;

        va_copy(probe, ap);
        needed = vsnprintf(NULL, 0, fmt, probe);
        va_end(probe);

        buf = kmalloc_track_caller(needed + 1, gfp);
        if (buf) {
                written = vsnprintf(buf, needed + 1, fmt, ap);
                WARN(needed != written, "different return values (%u and %u) from vsnprintf(\"%s\", ...)",
                     needed, written, fmt);
        }

        return buf;
}
EXPORT_SYMBOL(kvasprintf);

/*
 * When @fmt has no '%' at all, or is exactly "%s", the result is produced
 * by kstrdup_const() on @fmt or on the single string argument, so no
 * allocation or copy is needed if that string lives in rodata. Every other
 * format goes through kvasprintf(). Whichever path is taken, release the
 * result with kfree_const().
 */
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap)
{
        const char *src;

        if (strchr(fmt, '%') == NULL)
                src = fmt;
        else if (strcmp(fmt, "%s") == 0)
                src = va_arg(ap, const char*);
        else
                return kvasprintf(gfp, fmt, ap);

        return kstrdup_const(src, gfp);
}
EXPORT_SYMBOL(kvasprintf_const);

/*
 * Variadic front end to kvasprintf(): returns the allocated formatted
 * string, or NULL when kvasprintf() does.
 */
char *kasprintf(gfp_t gfp, const char *fmt, ...)
{
        va_list args;
        char *result;

        va_start(args, fmt);
        result = kvasprintf(gfp, fmt, args);
        va_end(args);

        return result;
}
EXPORT_SYMBOL(kasprintf);
































































































































































































































































    4 



































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Crypto API support for SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256
 *
 * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
 * Copyright 2025 Google LLC
 */
#include <crypto/internal/hash.h>
#include <crypto/sha2.h>
#include <linux/kernel.h>
#include <linux/module.h>

/*
 * Export and import functions.  crypto_shash wants a particular format that
 * matches that used by some legacy drivers.  It currently is the same as the
 * library SHA context, except the value in bytecount must be block-aligned and
 * the remainder must be stored in an extra u8 appended to the struct.
 */

/*
 * Size of the exported shash state: struct __sha256_ctx plus one trailing
 * byte. The asserts below pin the struct layout the export/import helpers
 * depend on (state at offset 0, bytecount at 32, buf at 40).
 */
#define SHA256_SHASH_STATE_SIZE 105
static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
static_assert(sizeof(struct __sha256_ctx) + 1 == SHA256_SHASH_STATE_SIZE);

static int __crypto_sha256_export(const struct __sha256_ctx *ctx0, void *out)
{
        struct __sha256_ctx ctx = *ctx0;
        unsigned int partial;
        u8 *p = out;

        partial = ctx.bytecount % SHA256_BLOCK_SIZE;
        ctx.bytecount -= partial;
        memcpy(p, &ctx, sizeof(ctx));
        p += sizeof(ctx);
        *p = partial;
        return 0;
}

/*
 * Inverse of __crypto_sha256_export(): copy the context from @in and add
 * the trailing remainder byte back onto bytecount. Always returns 0.
 */
static int __crypto_sha256_import(struct __sha256_ctx *ctx, const void *in)
{
        const u8 *src = in;

        memcpy(ctx, src, sizeof(*ctx));
        ctx->bytecount += src[sizeof(*ctx)];

        return 0;
}

/*
 * Copy only the part of the context that precedes ->buf (everything up to
 * offsetof(struct __sha256_ctx, buf)) to @out. Always returns 0.
 */
static int __crypto_sha256_export_core(const struct __sha256_ctx *ctx,
                                       void *out)
{
        const size_t core_len = offsetof(struct __sha256_ctx, buf);

        memcpy(out, ctx, core_len);
        return 0;
}

/*
 * Restore only the part of the context that precedes ->buf from @in;
 * ->buf itself is not written. Always returns 0.
 */
static int __crypto_sha256_import_core(struct __sha256_ctx *ctx, const void *in)
{
        const size_t core_len = offsetof(struct __sha256_ctx, buf);

        memcpy(ctx, in, core_len);
        return 0;
}

/* SHA-224 */

/* SHA-224 digest of the zero-length message. */
const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = {
        0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, 0x47,
        0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, 0x15, 0xa2,
        0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a, 0xc5, 0xb3, 0xe4,
        0x2f
};
EXPORT_SYMBOL_GPL(sha224_zero_message_hash);

/* View the shash descriptor's context area as a struct sha224_ctx. */
#define SHA224_CTX(desc) ((struct sha224_ctx *)shash_desc_ctx(desc))

/* shash ->init: reset the descriptor's SHA-224 context. Always returns 0. */
static int crypto_sha224_init(struct shash_desc *desc)
{
        struct sha224_ctx *ctx = SHA224_CTX(desc);

        sha224_init(ctx);
        return 0;
}

/* shash ->update: feed @len bytes at @data to sha224_update(). Always returns 0. */
static int crypto_sha224_update(struct shash_desc *desc,
                                const u8 *data, unsigned int len)
{
        struct sha224_ctx *ctx = SHA224_CTX(desc);

        sha224_update(ctx, data, len);
        return 0;
}

/* shash ->final: emit the SHA-224 digest into @out. Always returns 0. */
static int crypto_sha224_final(struct shash_desc *desc, u8 *out)
{
        struct sha224_ctx *ctx = SHA224_CTX(desc);

        sha224_final(ctx, out);
        return 0;
}

/*
 * shash ->digest: one-shot SHA-224 of @len bytes at @data into @out.
 * @desc is not used. Always returns 0.
 */
static int crypto_sha224_digest(struct shash_desc *desc,
                                const u8 *data, unsigned int len, u8 *out)
{
        sha224(data, len, out);
        return 0;
}

static int crypto_sha224_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha256_export(&SHA224_CTX(desc)->ctx, out);
}

static int crypto_sha224_import(struct shash_desc *desc, const void *in)
{
        return __crypto_sha256_import(&SHA224_CTX(desc)->ctx, in);
}

static int crypto_sha224_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha256_export_core(&SHA224_CTX(desc)->ctx, out);
}

static int crypto_sha224_import_core(struct shash_desc *desc, const void *in)
{
        return __crypto_sha256_import_core(&SHA224_CTX(desc)->ctx, in);
}

/* SHA-256 */

/* SHA-256 digest of the zero-length message. */
const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = {
        0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14,
        0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24,
        0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c,
        0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55
};
EXPORT_SYMBOL_GPL(sha256_zero_message_hash);

/* View the shash descriptor's context area as a struct sha256_ctx. */
#define SHA256_CTX(desc) ((struct sha256_ctx *)shash_desc_ctx(desc))

/* shash ->init: reset the descriptor's SHA-256 context. Always returns 0. */
static int crypto_sha256_init(struct shash_desc *desc)
{
        struct sha256_ctx *ctx = SHA256_CTX(desc);

        sha256_init(ctx);
        return 0;
}

/* shash ->update: feed @len bytes at @data to sha256_update(). Always returns 0. */
static int crypto_sha256_update(struct shash_desc *desc,
                                const u8 *data, unsigned int len)
{
        struct sha256_ctx *ctx = SHA256_CTX(desc);

        sha256_update(ctx, data, len);
        return 0;
}

/* shash ->final: emit the SHA-256 digest into @out. Always returns 0. */
static int crypto_sha256_final(struct shash_desc *desc, u8 *out)
{
        struct sha256_ctx *ctx = SHA256_CTX(desc);

        sha256_final(ctx, out);
        return 0;
}

/*
 * shash ->digest: one-shot SHA-256 of @len bytes at @data into @out.
 * @desc is not used. Always returns 0.
 */
static int crypto_sha256_digest(struct shash_desc *desc,
                                const u8 *data, unsigned int len, u8 *out)
{
        sha256(data, len, out);
        return 0;
}

static int crypto_sha256_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha256_export(&SHA256_CTX(desc)->ctx, out);
}

static int crypto_sha256_import(struct shash_desc *desc, const void *in)
{
        return __crypto_sha256_import(&SHA256_CTX(desc)->ctx, in);
}

static int crypto_sha256_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha256_export_core(&SHA256_CTX(desc)->ctx, out);
}

static int crypto_sha256_import_core(struct shash_desc *desc, const void *in)
{
        return __crypto_sha256_import_core(&SHA256_CTX(desc)->ctx, in);
}

/* HMAC-SHA224 */

/* Typed views of the tfm context (prepared key) and the descriptor context. */
#define HMAC_SHA224_KEY(tfm) ((struct hmac_sha224_key *)crypto_shash_ctx(tfm))
#define HMAC_SHA224_CTX(desc) ((struct hmac_sha224_ctx *)shash_desc_ctx(desc))

/*
 * shash ->setkey: prepare the tfm's HMAC-SHA224 key from @keylen bytes at
 * @raw_key. Always returns 0.
 */
static int crypto_hmac_sha224_setkey(struct crypto_shash *tfm,
                                     const u8 *raw_key, unsigned int keylen)
{
        struct hmac_sha224_key *key = HMAC_SHA224_KEY(tfm);

        hmac_sha224_preparekey(key, raw_key, keylen);
        return 0;
}

/* shash ->init: start an HMAC-SHA224 computation using the tfm's key. Always returns 0. */
static int crypto_hmac_sha224_init(struct shash_desc *desc)
{
        struct hmac_sha224_ctx *hctx = HMAC_SHA224_CTX(desc);
        struct hmac_sha224_key *key = HMAC_SHA224_KEY(desc->tfm);

        hmac_sha224_init(hctx, key);
        return 0;
}

/* shash ->update: feed @len bytes at @data to hmac_sha224_update(). Always returns 0. */
static int crypto_hmac_sha224_update(struct shash_desc *desc,
                                     const u8 *data, unsigned int len)
{
        struct hmac_sha224_ctx *hctx = HMAC_SHA224_CTX(desc);

        hmac_sha224_update(hctx, data, len);
        return 0;
}

/* shash ->final: emit the HMAC-SHA224 value into @out. Always returns 0. */
static int crypto_hmac_sha224_final(struct shash_desc *desc, u8 *out)
{
        struct hmac_sha224_ctx *hctx = HMAC_SHA224_CTX(desc);

        hmac_sha224_final(hctx, out);
        return 0;
}

/*
 * shash ->digest: one-shot HMAC-SHA224 of @len bytes at @data with the
 * tfm's key, written to @out. Always returns 0.
 */
static int crypto_hmac_sha224_digest(struct shash_desc *desc,
                                     const u8 *data, unsigned int len,
                                     u8 *out)
{
        struct hmac_sha224_key *key = HMAC_SHA224_KEY(desc->tfm);

        hmac_sha224(key, data, len, out);
        return 0;
}

static int crypto_hmac_sha224_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha256_export(&HMAC_SHA224_CTX(desc)->ctx.sha_ctx, out);
}

static int crypto_hmac_sha224_import(struct shash_desc *desc, const void *in)
{
        struct hmac_sha224_ctx *ctx = HMAC_SHA224_CTX(desc);

        ctx->ctx.ostate = HMAC_SHA224_KEY(desc->tfm)->key.ostate;
        return __crypto_sha256_import(&ctx->ctx.sha_ctx, in);
}

static int crypto_hmac_sha224_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha256_export_core(&HMAC_SHA224_CTX(desc)->ctx.sha_ctx,
                                           out);
}

static int crypto_hmac_sha224_import_core(struct shash_desc *desc,
                                          const void *in)
{
        struct hmac_sha224_ctx *ctx = HMAC_SHA224_CTX(desc);

        ctx->ctx.ostate = HMAC_SHA224_KEY(desc->tfm)->key.ostate;
        return __crypto_sha256_import_core(&ctx->ctx.sha_ctx, in);
}

/* HMAC-SHA256 */

/* Typed views of the tfm context (prepared key) and the descriptor context. */
#define HMAC_SHA256_KEY(tfm) ((struct hmac_sha256_key *)crypto_shash_ctx(tfm))
#define HMAC_SHA256_CTX(desc) ((struct hmac_sha256_ctx *)shash_desc_ctx(desc))

/*
 * shash ->setkey: prepare the tfm's HMAC-SHA256 key from @keylen bytes at
 * @raw_key. Always returns 0.
 */
static int crypto_hmac_sha256_setkey(struct crypto_shash *tfm,
                                     const u8 *raw_key, unsigned int keylen)
{
        struct hmac_sha256_key *key = HMAC_SHA256_KEY(tfm);

        hmac_sha256_preparekey(key, raw_key, keylen);
        return 0;
}

/* shash ->init: start an HMAC-SHA256 computation using the tfm's key. Always returns 0. */
static int crypto_hmac_sha256_init(struct shash_desc *desc)
{
        struct hmac_sha256_ctx *hctx = HMAC_SHA256_CTX(desc);
        struct hmac_sha256_key *key = HMAC_SHA256_KEY(desc->tfm);

        hmac_sha256_init(hctx, key);
        return 0;
}

/* shash ->update: feed @len bytes at @data to hmac_sha256_update(). Always returns 0. */
static int crypto_hmac_sha256_update(struct shash_desc *desc,
                                     const u8 *data, unsigned int len)
{
        struct hmac_sha256_ctx *hctx = HMAC_SHA256_CTX(desc);

        hmac_sha256_update(hctx, data, len);
        return 0;
}

/* shash ->final: emit the HMAC-SHA256 value into @out. Always returns 0. */
static int crypto_hmac_sha256_final(struct shash_desc *desc, u8 *out)
{
        struct hmac_sha256_ctx *hctx = HMAC_SHA256_CTX(desc);

        hmac_sha256_final(hctx, out);
        return 0;
}

/*
 * shash ->digest: one-shot HMAC-SHA256 of @len bytes at @data with the
 * tfm's key, written to @out. Always returns 0.
 */
static int crypto_hmac_sha256_digest(struct shash_desc *desc,
                                     const u8 *data, unsigned int len,
                                     u8 *out)
{
        struct hmac_sha256_key *key = HMAC_SHA256_KEY(desc->tfm);

        hmac_sha256(key, data, len, out);
        return 0;
}

static int crypto_hmac_sha256_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha256_export(&HMAC_SHA256_CTX(desc)->ctx.sha_ctx, out);
}

static int crypto_hmac_sha256_import(struct shash_desc *desc, const void *in)
{
        struct hmac_sha256_ctx *ctx = HMAC_SHA256_CTX(desc);

        ctx->ctx.ostate = HMAC_SHA256_KEY(desc->tfm)->key.ostate;
        return __crypto_sha256_import(&ctx->ctx.sha_ctx, in);
}

static int crypto_hmac_sha256_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha256_export_core(&HMAC_SHA256_CTX(desc)->ctx.sha_ctx,
                                           out);
}

static int crypto_hmac_sha256_import_core(struct shash_desc *desc,
                                          const void *in)
{
        struct hmac_sha256_ctx *ctx = HMAC_SHA256_CTX(desc);

        ctx->ctx.ostate = HMAC_SHA256_KEY(desc->tfm)->key.ostate;
        return __crypto_sha256_import_core(&ctx->ctx.sha_ctx, in);
}

/*
 * Algorithm definitions
 *
 * Four shash algorithms registered together by crypto_sha256_mod_init():
 * sha224, sha256, hmac(sha224) and hmac(sha256). All use priority 300 and
 * the same exported state size; the two HMAC entries additionally carry a
 * per-tfm key context (cra_ctxsize) and a ->setkey handler.
 */
static struct shash_alg algs[] = {
        {
                /* Unkeyed SHA-224 */
                .base.cra_name                = "sha224",
                .base.cra_driver_name        = "sha224-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA224_BLOCK_SIZE,
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA224_DIGEST_SIZE,
                .init                        = crypto_sha224_init,
                .update                        = crypto_sha224_update,
                .final                        = crypto_sha224_final,
                .digest                        = crypto_sha224_digest,
                .export                        = crypto_sha224_export,
                .import                        = crypto_sha224_import,
                .export_core                = crypto_sha224_export_core,
                .import_core                = crypto_sha224_import_core,
                .descsize                = sizeof(struct sha224_ctx),
                .statesize                = SHA256_SHASH_STATE_SIZE,
        },
        {
                /* Unkeyed SHA-256 */
                .base.cra_name                = "sha256",
                .base.cra_driver_name        = "sha256-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA256_BLOCK_SIZE,
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA256_DIGEST_SIZE,
                .init                        = crypto_sha256_init,
                .update                        = crypto_sha256_update,
                .final                        = crypto_sha256_final,
                .digest                        = crypto_sha256_digest,
                .export                        = crypto_sha256_export,
                .import                        = crypto_sha256_import,
                .export_core                = crypto_sha256_export_core,
                .import_core                = crypto_sha256_import_core,
                .descsize                = sizeof(struct sha256_ctx),
                .statesize                = SHA256_SHASH_STATE_SIZE,
        },
        {
                /* HMAC-SHA224; tfm context holds struct hmac_sha224_key */
                .base.cra_name                = "hmac(sha224)",
                .base.cra_driver_name        = "hmac-sha224-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA224_BLOCK_SIZE,
                .base.cra_ctxsize        = sizeof(struct hmac_sha224_key),
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA224_DIGEST_SIZE,
                .setkey                        = crypto_hmac_sha224_setkey,
                .init                        = crypto_hmac_sha224_init,
                .update                        = crypto_hmac_sha224_update,
                .final                        = crypto_hmac_sha224_final,
                .digest                        = crypto_hmac_sha224_digest,
                .export                        = crypto_hmac_sha224_export,
                .import                        = crypto_hmac_sha224_import,
                .export_core                = crypto_hmac_sha224_export_core,
                .import_core                = crypto_hmac_sha224_import_core,
                .descsize                = sizeof(struct hmac_sha224_ctx),
                .statesize                = SHA256_SHASH_STATE_SIZE,
        },
        {
                /* HMAC-SHA256; tfm context holds struct hmac_sha256_key */
                .base.cra_name                = "hmac(sha256)",
                .base.cra_driver_name        = "hmac-sha256-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA256_BLOCK_SIZE,
                .base.cra_ctxsize        = sizeof(struct hmac_sha256_key),
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA256_DIGEST_SIZE,
                .setkey                        = crypto_hmac_sha256_setkey,
                .init                        = crypto_hmac_sha256_init,
                .update                        = crypto_hmac_sha256_update,
                .final                        = crypto_hmac_sha256_final,
                .digest                        = crypto_hmac_sha256_digest,
                .export                        = crypto_hmac_sha256_export,
                .import                        = crypto_hmac_sha256_import,
                .export_core                = crypto_hmac_sha256_export_core,
                .import_core                = crypto_hmac_sha256_import_core,
                .descsize                = sizeof(struct hmac_sha256_ctx),
                .statesize                = SHA256_SHASH_STATE_SIZE,
        },
};

/* Module init: register every entry of algs[]; returns the registration result. */
static int __init crypto_sha256_mod_init(void)
{
        int err;

        err = crypto_register_shashes(algs, ARRAY_SIZE(algs));

        return err;
}
module_init(crypto_sha256_mod_init);

/* Module exit: unregister every entry of algs[]. */
static void __exit crypto_sha256_mod_exit(void)
{
        crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
module_exit(crypto_sha256_mod_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Crypto API support for SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256");

MODULE_ALIAS_CRYPTO("sha224");
MODULE_ALIAS_CRYPTO("sha224-lib");
MODULE_ALIAS_CRYPTO("sha256");
MODULE_ALIAS_CRYPTO("sha256-lib");
MODULE_ALIAS_CRYPTO("hmac(sha224)");
MODULE_ALIAS_CRYPTO("hmac-sha224-lib");
MODULE_ALIAS_CRYPTO("hmac(sha256)");
MODULE_ALIAS_CRYPTO("hmac-sha256-lib");





























































   39 





    2 
   39 













   39 

   39 




   39 













   39 



   39 




   39 




   39 

    2 





   39 


   39 







































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
 *
 * Provides a framework for enqueueing and running callbacks from hardirq
 * context. The enqueueing is NMI-safe.
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <asm/processor.h>
#include <linux/kasan.h>

#include <trace/events/ipi.h>

/*
 * Per-CPU state:
 *  raised_list - work queued by __irq_work_queue_local() that is neither
 *                IRQ_WORK_LAZY nor (on PREEMPT_RT) lacking IRQ_WORK_HARD_IRQ
 *  lazy_list   - all other locally queued work
 *  irq_workd   - task woken by wake_irq_workd() when lazy_list is non-empty
 */
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
static DEFINE_PER_CPU(struct task_struct *, irq_workd);

/*
 * Wake this CPU's irq_workd task, provided the local lazy list has
 * entries and a task pointer is set.
 */
static void wake_irq_workd(void)
{
        struct task_struct *worker = __this_cpu_read(irq_workd);

        if (llist_empty(this_cpu_ptr(&lazy_list)))
                return;

        if (worker)
                wake_up_process(worker);
}

#ifdef CONFIG_SMP
/* irq_work callback of irq_work_wakeup: wake the local irq_workd; @entry is unused. */
static void irq_work_wake(struct irq_work *entry)
{
        wake_irq_workd();
}

/*
 * Per-CPU hard irq_work running irq_work_wake(); irq_work_queue_on() queues
 * it on a remote CPU after adding an item to that CPU's lazy list.
 */
static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
        IRQ_WORK_INIT_HARD(irq_work_wake);
#endif

/* Non-zero when the current CPU's lazy list has entries; @cpu is unused. */
static int irq_workd_should_run(unsigned int cpu)
{
        struct llist_head *lazy = this_cpu_ptr(&lazy_list);

        return !llist_empty(lazy);
}

/*
 * Claim the entry so that no one else will poke at it.
 *
 * Sets the claim and CSD type bits atomically. Returns true when
 * IRQ_WORK_PENDING was not set beforehand, false when it already was.
 */
static bool irq_work_claim(struct irq_work *work)
{
        int prev;

        prev = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);

        /*
         * If the work is already pending, no need to raise the IPI.
         * The pairing smp_mb() in irq_work_single() makes sure
         * everything we did before is visible.
         */
        return !(prev & IRQ_WORK_PENDING);
}

/* Weak default implementation that does nothing. */
void __weak arch_irq_work_raise(void)
{
        /*
         * Lame architectures will get the timer tick callback
         */
}

/*
 * Emit the ipi_send_cpu tracepoint when it is enabled and the architecture
 * has an irq_work interrupt, then call arch_irq_work_raise().
 */
static __always_inline void irq_work_raise(struct irq_work *work)
{
        if (trace_ipi_send_cpu_enabled()) {
                if (arch_irq_work_has_interrupt())
                        trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
        }

        arch_irq_work_raise();
}

/*
 * Enqueue on current CPU, work must already be claimed and preempt disabled.
 *
 * IRQ_WORK_LAZY items, and on PREEMPT_RT items without IRQ_WORK_HARD_IRQ,
 * go on the lazy list; everything else goes on the raised list.
 */
static void __irq_work_queue_local(struct irq_work *work)
{
        int work_flags = atomic_read(&work->node.a_flags);
        bool lazy_work = !!(work_flags & IRQ_WORK_LAZY);
        bool rt_lazy_work = !lazy_work && IS_ENABLED(CONFIG_PREEMPT_RT) &&
                            !(work_flags & IRQ_WORK_HARD_IRQ);
        struct llist_head *list;

        list = (lazy_work || rt_lazy_work) ? this_cpu_ptr(&lazy_list)
                                           : this_cpu_ptr(&raised_list);

        /*
         * Nothing more to do when llist_add() returns false.
         * If the work is "lazy", handle it from next tick if any.
         */
        if (llist_add(&work->node.llist, list) &&
            (!lazy_work || tick_nohz_tick_stopped()))
                irq_work_raise(work);
}

/*
 * Enqueue the irq work @work on the current CPU.
 *
 * Returns false when the work was already pending, true once it has been
 * queued.
 */
bool irq_work_queue(struct irq_work *work)
{
        /* Only queue if not already pending */
        if (irq_work_claim(work)) {
                /* Queue the entry and raise the IPI if needed. */
                preempt_disable();
                __irq_work_queue_local(work);
                preempt_enable();

                return true;
        }

        return false;
}
EXPORT_SYMBOL_GPL(irq_work_queue);

/*
 * Enqueue the irq_work @work on @cpu unless it's already pending
 * somewhere.
 *
 * Can be re-enqueued while the callback is still in progress.
 *
 * Without CONFIG_SMP this is irq_work_queue(). With CONFIG_SMP it returns
 * false when @work could not be claimed and true on every other path,
 * including the early "goto out" exits below.
 */
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
#ifndef CONFIG_SMP
        return irq_work_queue(work);

#else /* CONFIG_SMP: */
        /* All work should have been flushed before going offline */
        WARN_ON_ONCE(cpu_is_offline(cpu));

        /* Only queue if not already pending */
        if (!irq_work_claim(work))
                return false;

        kasan_record_aux_stack(work);

        preempt_disable();
        if (cpu != smp_processor_id()) {
                /* Arch remote IPI send/receive backend aren't NMI safe */
                WARN_ON_ONCE(in_nmi());

                /*
                 * On PREEMPT_RT the items which are not marked as
                 * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
                 * item is used on the remote CPU to wake the thread.
                 */
                if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
                    !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {

                        /* Done if llist_add() reports false for the remote lazy list. */
                        if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
                                goto out;

                        /* From here on @work is the remote CPU's wakeup item. */
                        work = &per_cpu(irq_work_wakeup, cpu);
                        if (!irq_work_claim(work))
                                goto out;
                }

                __smp_call_single_queue(cpu, &work->node.llist);
        } else {
                /* Target is the current CPU: use the local queueing path. */
                __irq_work_queue_local(work);
        }
out:
        preempt_enable();

        return true;
#endif /* CONFIG_SMP */
}

/*
 * Report whether this CPU still has irq work to handle. Returns false when
 * the lazy list is empty and either the raised list is empty too or the
 * architecture has an irq_work interrupt; returns true otherwise.
 */
bool irq_work_needs_cpu(void)
{
        struct llist_head *raised = this_cpu_ptr(&raised_list);
        struct llist_head *lazy = this_cpu_ptr(&lazy_list);

        if ((llist_empty(raised) || arch_irq_work_has_interrupt()) &&
            llist_empty(lazy))
                return false;

        /* All work should have been flushed before going offline */
        WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));

        return true;
}

/*
 * Run one irq_work item.
 * @arg: the struct irq_work to run, passed as a void pointer.
 *
 * Clears IRQ_WORK_PENDING, calls work->func(work), then attempts to clear
 * IRQ_WORK_BUSY. Under the same condition for which irq_work_sync() sleeps
 * on work->irqwait, the waiter is woken afterwards.
 */
void irq_work_single(void *arg)
{
        struct irq_work *work = arg;
        int flags;

        /*
         * Clear the PENDING bit, after this point the @work can be re-used.
         * The PENDING bit acts as a lock, and we own it, so we can clear it
         * without atomic ops.
         */
        flags = atomic_read(&work->node.a_flags);
        flags &= ~IRQ_WORK_PENDING;
        atomic_set(&work->node.a_flags, flags);

        /*
         * See irq_work_claim().
         */
        smp_mb();

        lockdep_irq_work_enter(flags);
        work->func(work);
        lockdep_irq_work_exit(flags);

        /*
         * Clear the BUSY bit, if set, and return to the free state if no-one
         * else claimed it meanwhile.
         *
         * The cmpxchg only takes effect while a_flags still equals the value
         * stored above; its result is deliberately ignored.
         */
        (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);

        /* Same condition under which irq_work_sync() waits on ->irqwait. */
        if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
            !arch_irq_work_has_interrupt())
                rcuwait_wake_up(&work->irqwait);
}

/*
 * Detach every entry currently on @list and run each one through
 * irq_work_single(). Nothing is run when @list is empty.
 */
static void irq_work_run_list(struct llist_head *list)
{
        struct irq_work *work, *tmp;
        struct llist_node *llnode;

        /*
         * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
         * in a per-CPU thread in preemptible context. Only the items which are
         * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
         */
        BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));

        if (llist_empty(list))
                return;

        /* Take the whole list in one go, then walk the detached chain. */
        llnode = llist_del_all(list);
        /*
         * NOTE(review): the _safe iterator is presumably needed because an
         * item may be re-queued once irq_work_single() has cleared PENDING.
         */
        llist_for_each_entry_safe(work, tmp, llnode, node.llist)
                irq_work_single(work);
}

/*
 * hotplug calls this through:
 *  hotplug_cfd() -> flush_smp_call_function_queue()
 */
void irq_work_run(void)
{
        /* The raised list is always processed here. */
        irq_work_run_list(this_cpu_ptr(&raised_list));

        /* On PREEMPT_RT the lazy list is left to the irq_work thread. */
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                wake_irq_workd();
                return;
        }

        irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);

/*
 * Tick-time processing of this CPU's irq_work lists.
 *
 * The raised list is run here only when it is non-empty and
 * arch_irq_work_has_interrupt() is false. The lazy list is run directly,
 * except on PREEMPT_RT where the irq_work thread is woken instead.
 */
void irq_work_tick(void)
{
        struct llist_head *raised_head = this_cpu_ptr(&raised_list);

        if (!(llist_empty(raised_head) || arch_irq_work_has_interrupt()))
                irq_work_run_list(raised_head);

        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                wake_irq_workd();
                return;
        }

        irq_work_run_list(this_cpu_ptr(&lazy_list));
}

/*
 * Synchronize against the irq_work @work; ensures the work item is not
 * currently in use.
 */
void irq_work_sync(struct irq_work *work)
{
        /* Callers must be able to sleep and must have interrupts enabled. */
        lockdep_assert_irqs_enabled();
        might_sleep();

        /*
         * Sleeping wait on work->irqwait. The condition matches the one
         * under which irq_work_single() calls rcuwait_wake_up().
         */
        if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
            !arch_irq_work_has_interrupt()) {
                rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
                                   TASK_UNINTERRUPTIBLE);
                return;
        }

        /* Otherwise spin until the work item is no longer busy. */
        while (irq_work_is_busy(work))
                cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);

/* Thread function of the irq_work threads: run this CPU's lazy list. */
static void run_irq_workd(unsigned int cpu)
{
        struct llist_head *lazy_head = this_cpu_ptr(&lazy_list);

        irq_work_run_list(lazy_head);
}

/*
 * .setup hook of irqwork_threads: applies sched_set_fifo_low() to the
 * calling thread. @cpu is not used.
 */
static void irq_workd_setup(unsigned int cpu)
{
        sched_set_fifo_low(current);
}

/*
 * Description of the per-CPU "irq_work/%u" threads. Registered by
 * irq_work_init_threads(), and only when CONFIG_PREEMPT_RT is enabled.
 */
static struct smp_hotplug_thread irqwork_threads = {
        .store                  = &irq_workd,
        .setup                        = irq_workd_setup,
        .thread_should_run      = irq_workd_should_run,
        .thread_fn              = run_irq_workd,
        .thread_comm            = "irq_work/%u",
};

/*
 * Early initcall: on PREEMPT_RT register the per-CPU irq_work threads.
 * Always returns 0; a registration failure trips BUG_ON().
 */
static __init int irq_work_init_threads(void)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                return 0;

        BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));

        return 0;
}
early_initcall(irq_work_init_threads);











































































  164 






























































































































   70 





























   70 








































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BITOPS_H
#define _ASM_X86_BITOPS_H

/*
 * Copyright 1992, Linus Torvalds.
 *
 * Note: inlines with more than a single statement should be marked
 * __always_inline to avoid problems with older gcc's inlining heuristics.
 */

#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif

#include <linux/compiler.h>
#include <asm/alternative.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

/*
 * log2(BITS_PER_LONG): shifting a bit number right by this amount gives the
 * index of the long word that holds the bit.
 */
#if BITS_PER_LONG == 32
# define _BITOPS_LONG_SHIFT 5
#elif BITS_PER_LONG == 64
# define _BITOPS_LONG_SHIFT 6
#else
# error "Unexpected BITS_PER_LONG"
#endif

/* Value with only bit @n set, built from U64_C(1). */
#define BIT_64(n)                        (U64_C(1) << (n))

/*
 * These have to be done with inline assembly: that way the bit-setting
 * is guaranteed to be atomic. All bit operations return 0 if the bit
 * was cleared before the operation and != 0 if it was not.
 *
 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
 */

/*
 * Inline-asm memory operand helpers:
 *  RLONG_ADDR(x) - "m" operand viewing @x as a volatile long
 *  WBYTE_ADDR(x) - "+m" (read-write) operand viewing @x as a volatile char
 *  ADDR          - RLONG_ADDR() of the local name "addr"; #undef'd again
 *                  after the last user in this header
 */
#define RLONG_ADDR(x)                         "m" (*(volatile long *) (x))
#define WBYTE_ADDR(x)                        "+m" (*(volatile char *) (x))

#define ADDR                                RLONG_ADDR(addr)

/*
 * We do the locked ops that don't return the old value as
 * a mask operation on a byte.
 *
 * CONST_MASK_ADDR(nr, addr) is the byte holding bit @nr, i.e. the byte
 * (nr >> 3) bytes past @addr; CONST_MASK(nr) is that bit's mask within the
 * byte.
 */
#define CONST_MASK_ADDR(nr, addr)        WBYTE_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr)                        (1 << ((nr) & 7))

/*
 * arch_set_bit - set bit @nr in the bitmap at @addr with a LOCK_PREFIX-ed
 * instruction.
 * @nr: bit number
 * @addr: start of the bitmap
 *
 * A compile-time constant @nr becomes a byte-wide "orb" of CONST_MASK(nr) on
 * the byte selected by CONST_MASK_ADDR(); any other @nr uses "bts".
 */
static __always_inline void
arch_set_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm_inline volatile(LOCK_PREFIX "orb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (CONST_MASK(nr))
                        : "memory");
        } else {
                asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/*
 * arch___set_bit - set bit @nr at @addr with a plain "bts", i.e. without
 * LOCK_PREFIX.
 */
static __always_inline void
arch___set_bit(unsigned long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/*
 * arch_clear_bit - clear bit @nr in the bitmap at @addr with a
 * LOCK_PREFIX-ed instruction.
 * @nr: bit number
 * @addr: start of the bitmap
 *
 * A compile-time constant @nr becomes a byte-wide "andb" with
 * ~CONST_MASK(nr); any other @nr uses "btr".
 *
 * NOTE(review): unlike arch_set_bit(), the constant-@nr asm lists no
 * "memory" clobber - confirm this is intentional.
 */
static __always_inline void
arch_clear_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm_inline volatile(LOCK_PREFIX "andb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (~CONST_MASK(nr)));
        } else {
                asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/*
 * arch_clear_bit_unlock - arch_clear_bit() preceded by a compiler barrier().
 */
static __always_inline void
arch_clear_bit_unlock(long nr, volatile unsigned long *addr)
{
        barrier();
        arch_clear_bit(nr, addr);
}

/*
 * arch___clear_bit - clear bit @nr at @addr with a plain "btr", i.e. without
 * LOCK_PREFIX.
 */
static __always_inline void
arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/*
 * arch_xor_unlock_is_negative_byte - LOCK_PREFIX-ed byte-wide xor.
 * @mask: value to xor in; only its low byte is used ((char)mask)
 * @addr: location whose first byte is modified
 *
 * Returns the sign flag left by the "xorb", i.e. whether the resulting byte
 * has its top bit set.
 */
static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
                volatile unsigned long *addr)
{
        bool negative;
        asm_inline volatile(LOCK_PREFIX "xorb %2,%1"
                : "=@ccs" (negative), WBYTE_ADDR(addr)
                : "iq" ((char)mask) : "memory");
        return negative;
}
/* Self-define so that preprocessor checks can see this arch provides it. */
#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte

/*
 * arch___clear_bit_unlock - plain forwarder to arch___clear_bit(); no
 * additional barrier is added here.
 */
static __always_inline void
arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
{
        arch___clear_bit(nr, addr);
}

/*
 * arch___change_bit - toggle bit @nr at @addr with a plain "btc", i.e.
 * without LOCK_PREFIX.
 */
static __always_inline void
arch___change_bit(unsigned long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/*
 * arch_change_bit - toggle bit @nr in the bitmap at @addr with a
 * LOCK_PREFIX-ed instruction.
 * @nr: bit number
 * @addr: start of the bitmap
 *
 * A compile-time constant @nr becomes a byte-wide "xorb" of CONST_MASK(nr);
 * any other @nr uses "btc".
 *
 * NOTE(review): as in arch_clear_bit(), the constant-@nr asm lists no
 * "memory" clobber - confirm this is intentional.
 */
static __always_inline void
arch_change_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm_inline volatile(LOCK_PREFIX "xorb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (CONST_MASK(nr)));
        } else {
                asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/*
 * arch_test_and_set_bit - LOCK_PREFIX-ed "bts" on bit @nr of @addr.
 *
 * The result is taken from condition "c" via GEN_BINARY_RMWcc(); per the
 * note at the top of this header it is non-zero iff the bit was already set.
 */
static __always_inline bool
arch_test_and_set_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr);
}

/*
 * arch_test_and_set_bit_lock - plain forwarder to arch_test_and_set_bit().
 */
static __always_inline bool
arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
        return arch_test_and_set_bit(nr, addr);
}

/*
 * arch___test_and_set_bit - "bts" without LOCK_PREFIX.
 *
 * Returns the carry flag left by the instruction ("=@ccc"), i.e. the
 * previous value of bit @nr.
 */
static __always_inline bool
arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
{
        bool oldbit;

        asm(__ASM_SIZE(bts) " %2,%1"
            : "=@ccc" (oldbit)
            : ADDR, "Ir" (nr) : "memory");
        return oldbit;
}

/*
 * arch_test_and_clear_bit - LOCK_PREFIX-ed "btr" on bit @nr of @addr.
 *
 * The result is taken from condition "c" via GEN_BINARY_RMWcc(); per the
 * note at the top of this header it is non-zero iff the bit was set before.
 */
static __always_inline bool
arch_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr);
}

/*
 * Note: the operation is performed atomically with respect to
 * the local CPU, but not other CPUs. Portable code should not
 * rely on this behaviour.
 * KVM relies on this behaviour on x86 for modifying memory that is also
 * accessed from a hypervisor on the same CPU if running in a VM: don't change
 * this without also updating arch/x86/kernel/kvm.c
 */
static __always_inline bool
arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
{
        bool oldbit;

        /* Single "btr" without LOCK_PREFIX; carry flag holds the old bit. */
        asm volatile(__ASM_SIZE(btr) " %2,%1"
                     : "=@ccc" (oldbit)
                     : ADDR, "Ir" (nr) : "memory");
        return oldbit;
}

/*
 * arch___test_and_change_bit - "btc" without LOCK_PREFIX.
 *
 * Toggles bit @nr at @addr and returns the carry flag left by the
 * instruction ("=@ccc"), i.e. the previous value of the bit.
 */
static __always_inline bool
arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
{
        bool oldbit;

        asm volatile(__ASM_SIZE(btc) " %2,%1"
                     : "=@ccc" (oldbit)
                     : ADDR, "Ir" (nr) : "memory");

        return oldbit;
}

/*
 * arch_test_and_change_bit - LOCK_PREFIX-ed "btc" on bit @nr of @addr.
 *
 * The result is taken from condition "c" via GEN_BINARY_RMWcc(); per the
 * note at the top of this header it is non-zero iff the bit was set before.
 */
static __always_inline bool
arch_test_and_change_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
}

/*
 * constant_test_bit - test bit @nr of the bitmap at @addr in plain C.
 *
 * Reads the long word with index (nr >> _BITOPS_LONG_SHIFT) and checks the
 * bit at position (nr & (BITS_PER_LONG-1)) inside it.
 */
static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
{
        const unsigned long bit_mask = 1UL << (nr & (BITS_PER_LONG-1));
        const unsigned long word_val = addr[nr >> _BITOPS_LONG_SHIFT];

        return (bit_mask & word_val) != 0;
}

/*
 * constant_test_bit_acquire - test bit @nr via a byte-wide "testb".
 *
 * Tests byte (nr >> 3) of @addr against the immediate mask 1 << (nr & 7) and
 * returns the "nz" condition. The mask is an "i" (immediate) operand, so @nr
 * has to be a compile-time constant. The asm carries a "memory" clobber.
 */
static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
{
        bool oldbit;

        asm volatile("testb %2,%1"
                     : "=@ccnz" (oldbit)
                     : "m" (((unsigned char *)addr)[nr >> 3]),
                       "i" (1 << (nr & 7))
                     :"memory");

        return oldbit;
}

/*
 * variable_test_bit - test bit @nr of the bitmap at @addr with "bt".
 *
 * Returns the carry flag left by the instruction ("=@ccc").
 */
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
        bool oldbit;

        asm volatile(__ASM_SIZE(bt) " %2,%1"
                     : "=@ccc" (oldbit)
                     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");

        return oldbit;
}

/*
 * arch_test_bit - test bit @nr of the bitmap at @addr.
 *
 * Compile-time constant bit numbers go through constant_test_bit(),
 * everything else through variable_test_bit().
 */
static __always_inline bool
arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr))
                return constant_test_bit(nr, addr);

        return variable_test_bit(nr, addr);
}

/*
 * arch_test_bit_acquire - test bit @nr of the bitmap at @addr.
 *
 * Compile-time constant bit numbers go through constant_test_bit_acquire(),
 * everything else through variable_test_bit().
 */
static __always_inline bool
arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
{
        return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
                                          variable_test_bit(nr, addr);
}

/*
 * variable__ffs - runtime backend of __ffs(): a single "tzcnt" whose result
 * overwrites @word and is returned.
 */
static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word)
{
        asm("tzcnt %1,%0"
                : "=r" (word)
                : ASM_INPUT_RM (word));
        return word;
}

/**
 * __ffs - find first set bit in word
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 *
 * Compile-time constants are folded with __builtin_ctzl(); all other values
 * go through variable__ffs().
 */
#define __ffs(word)                                \
        (__builtin_constant_p(word) ?                \
         (unsigned long)__builtin_ctzl(word) :        \
         variable__ffs(word))

/*
 * variable_ffz - runtime backend of ffz(): the first zero bit of @word is the
 * first set bit of its complement.
 */
static __always_inline __attribute_const__ unsigned long variable_ffz(unsigned long word)
{
        const unsigned long inverted = ~word;

        return variable__ffs(inverted);
}

/**
 * ffz - find first zero bit in word
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 *
 * Compile-time constants are folded with __builtin_ctzl(); all other values
 * go through variable_ffz(). The argument is parenthesized before being
 * complemented so that an expression argument such as (a | b) is inverted as
 * a whole, matching what variable_ffz() computes.
 */
#define ffz(word)                                \
        (__builtin_constant_p(word) ?                \
         (unsigned long)__builtin_ctzl(~(word)) :        \
         variable_ffz(word))

/*
 * __fls: find last set bit in word
 * @word: The word to search
 *
 * Undefined if no set bit exists, so code should check against 0 first.
 *
 * Compile-time constants are folded with __builtin_clzl(); all other values
 * use a single "bsr" whose result overwrites @word.
 */
static __always_inline __attribute_const__ unsigned long __fls(unsigned long word)
{
        if (__builtin_constant_p(word))
                return BITS_PER_LONG - 1 - __builtin_clzl(word);

        asm("bsr %1,%0"
            : "=r" (word)
            : ASM_INPUT_RM (word));
        return word;
}

#undef ADDR

#ifdef __KERNEL__
/*
 * variable_ffs - runtime backend of ffs().
 *
 * Each variant leaves r == -1 when @x is 0, so the final "r + 1" yields 0
 * for 0 and a 1-based bit position otherwise.
 */
static __always_inline __attribute_const__ int variable_ffs(int x)
{
        int r;

#ifdef CONFIG_X86_64
        /*
         * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before, except that the
         * top 32 bits will be cleared.
         *
         * We cannot do this on 32 bits because at the very least some
         * 486 CPUs did not behave this way.
         */
        /* The "0" (-1) input preloads the destination register with -1. */
        asm("bsfl %1,%0"
            : "=r" (r)
            : ASM_INPUT_RM (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
        /* cmovz substitutes -1 when bsfl reported a zero source. */
        asm("bsfl %1,%0\n\t"
            "cmovzl %2,%0"
            : "=&r" (r) : "rm" (x), "r" (-1));
#else
        /* No cmov available: branch around an explicit load of -1. */
        asm("bsfl %1,%0\n\t"
            "jnz 1f\n\t"
            "movl $-1,%0\n"
            "1:" : "=r" (r) : "rm" (x));
#endif
        return r + 1;
}

/**
 * ffs - find first set bit in word
 * @x: the word to search
 *
 * This is defined the same way as the libc and compiler builtin ffs
 * routines, therefore differs in spirit from the other bitops.
 *
 * ffs(value) returns 0 if value is 0 or the position of the first
 * set bit if value is nonzero. The first (least significant) bit
 * is at position 1.
 *
 * Compile-time constants are folded with __builtin_ffs(); all other values
 * go through variable_ffs().
 */
#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))

/**
 * fls - find last set bit in word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffs, but returns the position of the most significant set bit.
 *
 * fls(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 32.
 */
static __always_inline __attribute_const__ int fls(unsigned int x)
{
        int r;

        /* Compile-time constants are folded without any asm. */
        if (__builtin_constant_p(x))
                return x ? 32 - __builtin_clz(x) : 0;

#ifdef CONFIG_X86_64
        /*
         * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before, except that the
         * top 32 bits will be cleared.
         *
         * We cannot do this on 32 bits because at the very least some
         * 486 CPUs did not behave this way.
         */
        /* The "0" (-1) input preloads the destination register with -1. */
        asm("bsrl %1,%0"
            : "=r" (r)
            : ASM_INPUT_RM (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
        /* cmovz substitutes -1 when bsrl reported a zero source. */
        asm("bsrl %1,%0\n\t"
            "cmovzl %2,%0"
            : "=&r" (r) : "rm" (x), "rm" (-1));
#else
        /* No cmov available: branch around an explicit load of -1. */
        asm("bsrl %1,%0\n\t"
            "jnz 1f\n\t"
            "movl $-1,%0\n"
            "1:" : "=r" (r) : "rm" (x));
#endif
        /* r is -1 for x == 0, so this yields 0 in that case. */
        return r + 1;
}

/**
 * fls64 - find last set bit in a 64-bit word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffsll, but returns the position of the most significant set bit.
 *
 * fls64(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 64.
 */
#ifdef CONFIG_X86_64
static __always_inline __attribute_const__ int fls64(__u64 x)
{
        /* Starts at -1 so that an untouched destination gives 0 below. */
        int bitpos = -1;

        /* Compile-time constants are folded without any asm. */
        if (__builtin_constant_p(x))
                return x ? 64 - __builtin_clzll(x) : 0;
        /*
         * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before.
         */
        /* "+r": bitpos is both the preloaded input and the result. */
        asm("bsrq %1,%q0"
            : "+r" (bitpos)
            : ASM_INPUT_RM (x));
        return bitpos + 1;
}
#else
#include <asm-generic/bitops/fls64.h>
#endif

#include <asm-generic/bitops/sched.h>

#include <asm/arch_hweight.h>

#include <asm-generic/bitops/const_hweight.h>

#include <asm-generic/bitops/instrumented-atomic.h>
#include <asm-generic/bitops/instrumented-non-atomic.h>
#include <asm-generic/bitops/instrumented-lock.h>

#include <asm-generic/bitops/le.h>

#include <asm-generic/bitops/ext2-atomic-setbit.h>

#endif /* __KERNEL__ */
#endif /* _ASM_X86_BITOPS_H */



























































































































































   39 












































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 *  Driver for 8250/16550-type serial ports
 *
 *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
 *
 *  Copyright (C) 2001 Russell King.
 */

#include <linux/bits.h>
#include <linux/serial_8250.h>
#include <linux/serial_core.h>
#include <linux/dmaengine.h>

#include "../serial_mctrl_gpio.h"

/*
 * DMA hooks and state for one 8250 port; reached as p->dma by the
 * serial8250_*dma* helpers in this header.
 */
struct uart_8250_dma {
        /* NOTE(review): presumably per-driver overrides of the tx/rx DMA paths - confirm */
        int (*tx_dma)(struct uart_8250_port *p);
        int (*rx_dma)(struct uart_8250_port *p);
        /* Optional; called by serial8250_do_prepare_{tx,rx}_dma() when set */
        void (*prepare_tx_dma)(struct uart_8250_port *p);
        void (*prepare_rx_dma)(struct uart_8250_port *p);

        /* Filter function */
        dma_filter_fn                fn;
        /* Parameter to the filter function */
        void                        *rx_param;
        void                        *tx_param;

        /* Slave configuration for the RX and TX channel respectively */
        struct dma_slave_config        rxconf;
        struct dma_slave_config        txconf;

        /* The RX and TX DMA channels */
        struct dma_chan                *rxchan;
        struct dma_chan                *txchan;

        /* Device address base for DMA operations */
        phys_addr_t                rx_dma_addr;
        phys_addr_t                tx_dma_addr;

        /* DMA address of the buffer in memory */
        dma_addr_t                rx_addr;
        dma_addr_t                tx_addr;

        /* Cookies of the RX and TX DMA transactions */
        dma_cookie_t                rx_cookie;
        dma_cookie_t                tx_cookie;

        /* NOTE(review): presumably the CPU-side RX buffer matching rx_addr - confirm */
        void                        *rx_buf;

        size_t                        rx_size;
        size_t                        tx_size;

        /* Non-zero tx_running is what serial8250_tx_dma_running() reports */
        unsigned char                tx_running;
        unsigned char                tx_err;
        unsigned char                rx_running;
};

/*
 * Static description of one serial port.
 * NOTE(review): presumably the entry format of the legacy built-in port
 * table - confirm against the users of this struct.
 */
struct old_serial_port {
        unsigned int uart;
        unsigned int baud_base;
        unsigned int port;
        unsigned int irq;
        upf_t        flags;
        unsigned char io_type;
        unsigned char __iomem *iomem_base;
        unsigned short iomem_reg_shift;
};

/*
 * Per-UART-type parameters.
 * NOTE(review): @flags presumably holds UART_CAP_* bits and @fcr the FIFO
 * control register value - confirm against the table that instantiates this.
 */
struct serial8250_config {
        const char        *name;
        unsigned short        fifo_size;
        unsigned short        tx_loadsz;
        unsigned char        fcr;
        /* One entry per RX trigger level state */
        unsigned char        rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE];
        unsigned int        flags;
};

/* UART capability bits; they occupy bit 8 and upwards. */
#define UART_CAP_FIFO        BIT(8)        /* UART has FIFO */
#define UART_CAP_EFR        BIT(9)        /* UART has EFR */
#define UART_CAP_SLEEP        BIT(10)        /* UART has IER sleep */
#define UART_CAP_AFE        BIT(11)        /* MCR-based hw flow control */
#define UART_CAP_UUE        BIT(12)        /* UART needs IER bit 6 set (Xscale) */
#define UART_CAP_RTOIE        BIT(13)        /* UART needs IER bit 4 set (Xscale, Tegra) */
#define UART_CAP_HFIFO        BIT(14)        /* UART has a "hidden" FIFO */
#define UART_CAP_RPM        BIT(15)        /* Runtime PM is active while idle */
#define UART_CAP_IRDA        BIT(16)        /* UART supports IrDA line discipline */
#define UART_CAP_MINI        BIT(17)        /* Mini UART on BCM283X family lacks:
                                         * STOP PARITY EPAR SPAR WLEN5 WLEN6
                                         */
#define UART_CAP_NOTEMT        BIT(18)        /* UART without interrupt on TEMT available */

/* UART quirk bits; note that BIT(4) is not assigned here. */
#define UART_BUG_QUOT        BIT(0)        /* UART has buggy quot LSB */
#define UART_BUG_TXEN        BIT(1)        /* UART has buggy TX IIR status */
#define UART_BUG_NOMSR        BIT(2)        /* UART has buggy MSR status bits (Au1x00) */
#define UART_BUG_THRE        BIT(3)        /* UART has buggy THRE reassertion */
#define UART_BUG_TXRACE        BIT(5)        /* UART Tx fails to set remote DR */

/* Module parameters */
#define UART_NR        CONFIG_SERIAL_8250_NR_UARTS

extern unsigned int nr_uarts;

/* 1 when CONFIG_SERIAL_8250_SHARE_IRQ is set, 0 otherwise. */
#ifdef CONFIG_SERIAL_8250_SHARE_IRQ
#define SERIAL8250_SHARE_IRQS 1
#else
#define SERIAL8250_SHARE_IRQS 0
#endif

extern unsigned int share_irqs;
extern unsigned int skip_txen_test;

/*
 * Initializer for an I/O-port mapped (UPIO_PORT) port at @_base using
 * @_irq, with a fixed uartclk of 1843200. UPF_BOOT_AUTOCONF is always set
 * and @_flags is or'ed in on top. SERIAL8250_PORT() is the same with no
 * extra flags.
 */
#define SERIAL8250_PORT_FLAGS(_base, _irq, _flags)                \
        {                                                        \
                .iobase                = _base,                        \
                .irq                = _irq,                                \
                .uartclk        = 1843200,                        \
                .iotype                = UPIO_PORT,                        \
                .flags                = UPF_BOOT_AUTOCONF | (_flags),        \
        }

#define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0)

extern struct uart_driver serial8250_reg;
void serial8250_register_ports(struct uart_driver *drv, struct device *dev);

/* Legacy ISA bus related APIs */
typedef void (*serial8250_isa_config_fn)(int, struct uart_port *, u32 *);
extern serial8250_isa_config_fn serial8250_isa_config;

void serial8250_isa_init_ports(void);

extern struct platform_device *serial8250_isa_devs;

extern const struct uart_ops *univ8250_port_base_ops;
extern struct uart_ops univ8250_port_ops;

/* Read the register at @offset through the port's serial_in() callback. */
static inline int serial_in(struct uart_8250_port *up, int offset)
{
        return up->port.serial_in(&up->port, offset);
}

/* Write @value to the register at @offset through the port's serial_out() callback. */
static inline void serial_out(struct uart_8250_port *up, int offset, int value)
{
        up->port.serial_out(&up->port, offset, value);
}

/**
 *        serial_lsr_in - Read LSR and carry sticky flags over to the next read
 *        @up:        uart 8250 port
 *
 *        The flags saved by the previous call (up->lsr_saved_flags) are
 *        or'ed into the freshly read LSR value. Of the combined value, the
 *        bits selected by up->lsr_save_mask are stored back into
 *        up->lsr_saved_flags for the next call.
 *
 *        Returns LSR value or'ed with the preserved flags (if any).
 */
static inline u16 serial_lsr_in(struct uart_8250_port *up)
{
        /* Pick up the carried-over flags before touching the hardware. */
        const u16 carried = up->lsr_saved_flags;
        const u16 status = carried | serial_in(up, UART_LSR);

        up->lsr_saved_flags = status & up->lsr_save_mask;

        return status;
}

/*
 * For the 16C950
 *
 * Indexed register write: @offset is written to UART_SCR first, then @value
 * to UART_ICR.
 */
static void serial_icr_write(struct uart_8250_port *up, int offset, int value)
{
        serial_out(up, UART_SCR, offset);
        serial_out(up, UART_ICR, value);
}

/*
 * Indexed register read, counterpart of serial_icr_write().
 *
 * UART_ACR_ICRRD is set in ACR (on top of the cached up->acr) for the
 * duration of the read and ACR is restored to up->acr afterwards.
 */
static unsigned int __maybe_unused serial_icr_read(struct uart_8250_port *up,
                                                   int offset)
{
        unsigned int value;

        serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD);
        serial_out(up, UART_SCR, offset);
        value = serial_in(up, UART_ICR);
        serial_icr_write(up, UART_ACR, up->acr);

        return value;
}

void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p);

void serial8250_rpm_get(struct uart_8250_port *p);
void serial8250_rpm_put(struct uart_8250_port *p);
DEFINE_GUARD(serial8250_rpm, struct uart_8250_port *,
             serial8250_rpm_get(_T), serial8250_rpm_put(_T));

/* Return the value produced by the port's dl_read() callback. */
static inline u32 serial_dl_read(struct uart_8250_port *up)
{
        return up->dl_read(up);
}

/* Pass @value to the port's dl_write() callback. */
static inline void serial_dl_write(struct uart_8250_port *up, u32 value)
{
        up->dl_write(up, value);
}

/*
 * Set UART_IER_THRI in the cached IER and write it out to the UART.
 *
 * Returns true if the bit was newly set, false if it was set already (in
 * which case the register is not written).
 */
static inline bool serial8250_set_THRI(struct uart_8250_port *up)
{
        bool newly_set;

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&up->port.lock);

        newly_set = !(up->ier & UART_IER_THRI);
        if (newly_set) {
                up->ier |= UART_IER_THRI;
                serial_out(up, UART_IER, up->ier);
        }

        return newly_set;
}

/*
 * Clear UART_IER_THRI in the cached IER and write it out to the UART.
 *
 * Returns true if the bit was set before, false if it was clear already (in
 * which case the register is not written).
 */
static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
{
        bool was_set;

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&up->port.lock);

        was_set = (up->ier & UART_IER_THRI) != 0;
        if (was_set) {
                up->ier &= ~UART_IER_THRI;
                serial_out(up, UART_IER, up->ier);
        }

        return was_set;
}

struct uart_8250_port *serial8250_setup_port(int index);
struct uart_8250_port *serial8250_get_port(int line);

int serial8250_em485_config(struct uart_port *port, struct ktermios *termios,
                            struct serial_rs485 *rs485);
void serial8250_em485_start_tx(struct uart_8250_port *p, bool toggle_ier);
void serial8250_em485_stop_tx(struct uart_8250_port *p, bool toggle_ier);
void serial8250_em485_destroy(struct uart_8250_port *p);
extern struct serial_rs485 serial8250_em485_supported;

/* MCR <-> TIOCM conversion */
/*
 * Translate the RTS, DTR, OUT1, OUT2 and LOOP bits of a TIOCM_* mask into
 * the corresponding UART_MCR_* bits; all other input bits are ignored.
 */
static inline int serial8250_TIOCM_to_MCR(int tiocm)
{
        int mcr_bits;

        mcr_bits  = (tiocm & TIOCM_RTS)  ? UART_MCR_RTS  : 0;
        mcr_bits |= (tiocm & TIOCM_DTR)  ? UART_MCR_DTR  : 0;
        mcr_bits |= (tiocm & TIOCM_OUT1) ? UART_MCR_OUT1 : 0;
        mcr_bits |= (tiocm & TIOCM_OUT2) ? UART_MCR_OUT2 : 0;
        mcr_bits |= (tiocm & TIOCM_LOOP) ? UART_MCR_LOOP : 0;

        return mcr_bits;
}

/*
 * Translate the RTS, DTR, OUT1, OUT2 and LOOP bits of an MCR value into the
 * corresponding TIOCM_* bits; all other input bits are ignored.
 */
static inline int serial8250_MCR_to_TIOCM(int mcr)
{
        int tiocm_bits;

        tiocm_bits  = (mcr & UART_MCR_RTS)  ? TIOCM_RTS  : 0;
        tiocm_bits |= (mcr & UART_MCR_DTR)  ? TIOCM_DTR  : 0;
        tiocm_bits |= (mcr & UART_MCR_OUT1) ? TIOCM_OUT1 : 0;
        tiocm_bits |= (mcr & UART_MCR_OUT2) ? TIOCM_OUT2 : 0;
        tiocm_bits |= (mcr & UART_MCR_LOOP) ? TIOCM_LOOP : 0;

        return tiocm_bits;
}

/* MSR <-> TIOCM conversion */
/*
 * Translate the DCD, RI, DSR and CTS bits of an MSR value into TIOCM_CAR,
 * TIOCM_RNG, TIOCM_DSR and TIOCM_CTS; all other input bits are ignored.
 */
static inline int serial8250_MSR_to_TIOCM(int msr)
{
        return ((msr & UART_MSR_DCD) ? TIOCM_CAR : 0) |
               ((msr & UART_MSR_RI)  ? TIOCM_RNG : 0) |
               ((msr & UART_MSR_DSR) ? TIOCM_DSR : 0) |
               ((msr & UART_MSR_CTS) ? TIOCM_CTS : 0);
}

/*
 * Write @value to UART_MCR and, when the port has mctrl GPIOs, mirror it to
 * them as a TIOCM_* mask.
 */
static inline void serial8250_out_MCR(struct uart_8250_port *up, int value)
{
        serial_out(up, UART_MCR, value);

        if (!up->gpios)
                return;

        mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value));
}

/*
 * Read UART_MCR and, when the port has mctrl GPIOs, or in the MCR bits that
 * correspond to the GPIO output state.
 */
static inline int serial8250_in_MCR(struct uart_8250_port *up)
{
        int mcr_val = serial_in(up, UART_MCR);
        unsigned int gpio_tiocm = 0;

        if (!up->gpios)
                return mcr_val;

        gpio_tiocm = mctrl_gpio_get_outputs(up->gpios, &gpio_tiocm);

        return mcr_val | serial8250_TIOCM_to_MCR(gpio_tiocm);
}

/* PNP init/exit hooks; without CONFIG_SERIAL_8250_PNP init is a stub returning 0 and exit is a no-op. */
#ifdef CONFIG_SERIAL_8250_PNP
int serial8250_pnp_init(void);
void serial8250_pnp_exit(void);
#else
static inline int serial8250_pnp_init(void) { return 0; }
static inline void serial8250_pnp_exit(void) { }
#endif

/* RSA hooks; without CONFIG_SERIAL_8250_RSA every one of them is an empty stub. */
#ifdef CONFIG_SERIAL_8250_RSA
void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops);
void rsa_enable(struct uart_8250_port *up);
void rsa_disable(struct uart_8250_port *up);
void rsa_autoconfig(struct uart_8250_port *up);
void rsa_reset(struct uart_8250_port *up);
#else
static inline void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops) { }
static inline void rsa_enable(struct uart_8250_port *up) {}
static inline void rsa_disable(struct uart_8250_port *up) {}
static inline void rsa_autoconfig(struct uart_8250_port *up) {}
static inline void rsa_reset(struct uart_8250_port *up) {}
#endif

/* Fintek probe; without CONFIG_SERIAL_8250_FINTEK it is a stub returning 0. */
#ifdef CONFIG_SERIAL_8250_FINTEK
int fintek_8250_probe(struct uart_8250_port *uart);
#else
static inline int fintek_8250_probe(struct uart_8250_port *uart) { return 0; }
#endif

#ifdef CONFIG_ARCH_OMAP1
#include <linux/soc/ti/omap1-soc.h>
/*
 * Return 1 when the port's mapbase equals one of the three OMAP1 UART base
 * addresses, 0 otherwise.
 */
static inline int is_omap1_8250(struct uart_8250_port *pt)
{
        switch (pt->port.mapbase) {
        case OMAP1_UART1_BASE:
        case OMAP1_UART2_BASE:
        case OMAP1_UART3_BASE:
                return 1;
        default:
                return 0;
        }
}

/*
 * Same test as is_omap1_8250(), but only when cpu_is_omap1510() is true;
 * on any other CPU the result is 0.
 */
static inline int is_omap1510_8250(struct uart_8250_port *pt)
{
        return cpu_is_omap1510() ? is_omap1_8250(pt) : 0;
}
#else
/* Build without CONFIG_ARCH_OMAP1: no port is ever an OMAP1 UART. */
static inline int is_omap1_8250(struct uart_8250_port *pt)
{
        return 0;
}
/* Build without CONFIG_ARCH_OMAP1: no port is ever an OMAP1510 UART. */
static inline int is_omap1510_8250(struct uart_8250_port *pt)
{
        return 0;
}
#endif

#ifdef CONFIG_SERIAL_8250_DMA
/* DMA helpers; only declared here, and only when CONFIG_SERIAL_8250_DMA is set. */
extern int serial8250_tx_dma(struct uart_8250_port *);
extern void serial8250_tx_dma_flush(struct uart_8250_port *);
extern int serial8250_rx_dma(struct uart_8250_port *);
extern void serial8250_rx_dma_flush(struct uart_8250_port *);
extern int serial8250_request_dma(struct uart_8250_port *);
extern void serial8250_release_dma(struct uart_8250_port *);

/*
 * Invoke the optional prepare_tx_dma callback of the port's DMA state.
 * p->dma is dereferenced unconditionally, so it must not be NULL here.
 */
static inline void serial8250_do_prepare_tx_dma(struct uart_8250_port *p)
{
        struct uart_8250_dma *d = p->dma;

        if (!d->prepare_tx_dma)
                return;

        d->prepare_tx_dma(p);
}

/*
 * Invoke the optional prepare_rx_dma callback of the port's DMA state.
 * p->dma is dereferenced unconditionally, so it must not be NULL here.
 */
static inline void serial8250_do_prepare_rx_dma(struct uart_8250_port *p)
{
        struct uart_8250_dma *d = p->dma;

        if (!d->prepare_rx_dma)
                return;

        d->prepare_rx_dma(p);
}

static inline bool serial8250_tx_dma_running(struct uart_8250_port *p)
{
        struct uart_8250_dma *dma = p->dma;

        return dma && dma->tx_running;
}
#else
/*
 * Build without CONFIG_SERIAL_8250_DMA: the tx/rx/request helpers return -1,
 * the flush/release helpers do nothing, and TX DMA is never running.
 */
static inline int serial8250_tx_dma(struct uart_8250_port *p)
{
        return -1;
}
static inline void serial8250_tx_dma_flush(struct uart_8250_port *p) { }
static inline int serial8250_rx_dma(struct uart_8250_port *p)
{
        return -1;
}
static inline void serial8250_rx_dma_flush(struct uart_8250_port *p) { }
static inline int serial8250_request_dma(struct uart_8250_port *p)
{
        return -1;
}
static inline void serial8250_release_dma(struct uart_8250_port *p) { }

static inline bool serial8250_tx_dma_running(struct uart_8250_port *p)
{
        return false;
}
#endif

/*
 * Switch the UART prescaler to its high speed setting via register 0x04
 * (EXCR2).
 *
 * Returns 0 when the PRESL field already reads 0x10 (nothing is written),
 * or 1 after the register has been rewritten with the new setting.
 */
static inline int ns16550a_goto_highspeed(struct uart_8250_port *up)
{
        unsigned char excr2 = serial_in(up, 0x04); /* EXCR2 */

#define PRESL(x) ((x) & 0x30)
        /* already in high speed mode */
        if (PRESL(excr2) == 0x10)
                return 0;

        excr2 &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */
        excr2 |= 0x10;  /* 1.625 divisor for baud_base --> 921600 */
        serial_out(up, 0x04, excr2);

        return 1;
}

/*
 * Index of @port, computed as its minor number minus 64.
 * NOTE(review): 64 is presumably the first minor of this driver - confirm.
 */
static inline int serial_index(struct uart_port *port)
{
        int index = port->minor - 64;

        return index;
}




























































































































    3 



























    8 
    8 













    8 


    8 





    8 
    1 



    7 
    8 


































































































   11 

    3 

   11 











   11 




   11 
   11 


   11 


   11 
   11 


   11 











































































































    1 































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Symmetric key cipher operations.
 *
 * Generic encrypt/decrypt wrapper for ciphers, handles operations across
 * multiple page boundaries by using temporary blocks.  In user context,
 * the kernel is given a chance to schedule us once per page.
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/aead.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/bug.h>
#include <linux/cryptouser.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/string_choices.h>
#include <net/netlink.h>
#include "skcipher.h"

/* Used as the .maskset value of crypto_skcipher_type. */
#define CRYPTO_ALG_TYPE_SKCIPHER_MASK        0x0000000e

/*
 * Forward declaration: functions in this file compare an algorithm's
 * cra_type against this object ahead of its definition.
 */
static const struct crypto_type crypto_skcipher_type;

/* Map a generic crypto_alg to the skcipher_alg that embeds it as ->base. */
static inline struct skcipher_alg *__crypto_skcipher_alg(
        struct crypto_alg *alg)
{
        struct skcipher_alg *skalg;

        skalg = container_of(alg, struct skcipher_alg, base);

        return skalg;
}

/*
 * Set up @walk for stepping through the src/dst scatterlists of @req.
 *
 * The walk is forced atomic when the request lacks CRYPTO_TFM_REQ_MAY_SLEEP,
 * whatever @atomic says.  A request with cryptlen == 0 returns 0 once the
 * length and IV fields are set, before the scatterlists or the geometry
 * fields are touched.
 *
 * Return: 0 for an empty request, otherwise the result of
 * skcipher_walk_first().
 */
int skcipher_walk_virt(struct skcipher_walk *__restrict walk,
                       struct skcipher_request *__restrict req, bool atomic)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg;

        might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP);

        alg = crypto_skcipher_alg(tfm);

        walk->nbytes = 0;
        walk->total = req->cryptlen;
        walk->oiv = req->iv;
        walk->iv = req->iv;

        atomic = atomic || !(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP);

        if (unlikely(walk->total == 0))
                return 0;

        scatterwalk_start(&walk->in, req->src);
        scatterwalk_start(&walk->out, req->dst);

        walk->alignmask = crypto_skcipher_alignmask(tfm);
        walk->ivsize = crypto_skcipher_ivsize(tfm);
        walk->blocksize = crypto_skcipher_blocksize(tfm);

        /*
         * Algorithms registered with crypto_skcipher_type step by their
         * walksize, all others by the common chunksize.
         */
        walk->stride = (alg->co.base.cra_type == &crypto_skcipher_type) ?
                       alg->walksize : alg->co.chunksize;

        return skcipher_walk_first(walk, atomic);
}
EXPORT_SYMBOL_GPL(skcipher_walk_virt);

/*
 * Shared tail of the AEAD walk setup.  The caller must have stored the
 * number of bytes to process in walk->total.
 *
 * Both scatterlists are entered at offset req->assoclen.  The walk is forced
 * atomic when the request lacks CRYPTO_TFM_REQ_MAY_SLEEP.
 *
 * Return: 0 when walk->total is zero, otherwise the result of
 * skcipher_walk_first().
 */
static int skcipher_walk_aead_common(struct skcipher_walk *__restrict walk,
                                     struct aead_request *__restrict req,
                                     bool atomic)
{
        struct crypto_aead *tfm = crypto_aead_reqtfm(req);

        walk->nbytes = 0;
        walk->oiv = req->iv;
        walk->iv = req->iv;

        atomic = atomic || !(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP);

        if (unlikely(walk->total == 0))
                return 0;

        scatterwalk_start_at_pos(&walk->in, req->src, req->assoclen);
        scatterwalk_start_at_pos(&walk->out, req->dst, req->assoclen);

        walk->alignmask = crypto_aead_alignmask(tfm);
        walk->ivsize = crypto_aead_ivsize(tfm);
        walk->stride = crypto_aead_chunksize(tfm);
        walk->blocksize = crypto_aead_blocksize(tfm);

        return skcipher_walk_first(walk, atomic);
}

/*
 * Start a walk over an AEAD encryption request: the whole req->cryptlen is
 * walked.  The remaining setup and the return value come from
 * skcipher_walk_aead_common().
 */
int skcipher_walk_aead_encrypt(struct skcipher_walk *__restrict walk,
                               struct aead_request *__restrict req,
                               bool atomic)
{
        walk->total = req->cryptlen;

        return skcipher_walk_aead_common(walk, req, atomic);
}
EXPORT_SYMBOL_GPL(skcipher_walk_aead_encrypt);

/*
 * Start a walk over an AEAD decryption request: req->cryptlen minus the
 * transform's authsize is walked.  The remaining setup and the return value
 * come from skcipher_walk_aead_common().
 */
int skcipher_walk_aead_decrypt(struct skcipher_walk *__restrict walk,
                               struct aead_request *__restrict req,
                               bool atomic)
{
        unsigned int authsize = crypto_aead_authsize(crypto_aead_reqtfm(req));

        walk->total = req->cryptlen - authsize;

        return skcipher_walk_aead_common(walk, req, atomic);
}
EXPORT_SYMBOL_GPL(skcipher_walk_aead_decrypt);

/*
 * Flag @tfm with CRYPTO_TFM_NEED_KEY, unless its maximum key size is zero,
 * in which case the flags are left alone.
 */
static void skcipher_set_needkey(struct crypto_skcipher *tfm)
{
        if (crypto_skcipher_max_keysize(tfm) == 0)
                return;

        crypto_skcipher_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
}

/*
 * Set a key whose address does not satisfy the algorithm's alignmask.
 *
 * The key is copied into a temporary buffer (keylen + alignmask bytes,
 * GFP_ATOMIC) at an aligned offset, handed to the algorithm's ->setkey(),
 * and the buffer is then released with kfree_sensitive().
 *
 * Return: -ENOMEM if the buffer cannot be allocated, otherwise the
 * ->setkey() result.
 */
static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm,
                                     const u8 *key, unsigned int keylen)
{
        unsigned long alignmask = crypto_skcipher_alignmask(tfm);
        struct skcipher_alg *alg = crypto_skcipher_alg(tfm);
        u8 *bounce, *aligned_key;
        int err;

        bounce = kmalloc(keylen + alignmask, GFP_ATOMIC);
        if (!bounce)
                return -ENOMEM;

        aligned_key = PTR_ALIGN(bounce, alignmask + 1);
        memcpy(aligned_key, key, keylen);

        err = alg->setkey(tfm, aligned_key, keylen);

        kfree_sensitive(bounce);

        return err;
}

/*
 * Set the key of @tfm.
 *
 * When the algorithm was not registered with crypto_skcipher_type, the tfm
 * context holds a crypto_lskcipher pointer: the request flags are copied
 * over to it and the key is set there.  Otherwise @keylen is checked against
 * the algorithm's min/max key size (-EINVAL when out of range, leaving the
 * flags untouched) and ->setkey() is called, going through
 * skcipher_setkey_unaligned() when @key violates the alignmask.
 *
 * On failure of the actual key setup skcipher_set_needkey() is called; on
 * success CRYPTO_TFM_NEED_KEY is cleared.
 *
 * Return: 0 on success or a negative error code.
 */
int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
                           unsigned int keylen)
{
        struct skcipher_alg *cipher = crypto_skcipher_alg(tfm);
        unsigned long alignmask = crypto_skcipher_alignmask(tfm);
        int err;

        if (cipher->co.base.cra_type != &crypto_skcipher_type) {
                struct crypto_lskcipher **ctx = crypto_skcipher_ctx(tfm);
                struct crypto_lskcipher *lsk = *ctx;

                crypto_lskcipher_clear_flags(lsk, CRYPTO_TFM_REQ_MASK);
                crypto_lskcipher_set_flags(lsk,
                                           crypto_skcipher_get_flags(tfm) &
                                           CRYPTO_TFM_REQ_MASK);
                err = crypto_lskcipher_setkey(lsk, key, keylen);
        } else if (keylen < cipher->min_keysize ||
                   keylen > cipher->max_keysize) {
                return -EINVAL;
        } else if ((unsigned long)key & alignmask) {
                err = skcipher_setkey_unaligned(tfm, key, keylen);
        } else {
                err = cipher->setkey(tfm, key, keylen);
        }

        if (unlikely(err)) {
                skcipher_set_needkey(tfm);
                return err;
        }

        crypto_skcipher_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_skcipher_setkey);

/*
 * Encrypt @req.
 *
 * Return: -ENOKEY while the transform still has CRYPTO_TFM_NEED_KEY set;
 * otherwise the result of the algorithm's ->encrypt(), or of
 * crypto_lskcipher_encrypt_sg() when the algorithm was not registered with
 * crypto_skcipher_type.
 */
int crypto_skcipher_encrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg = crypto_skcipher_alg(tfm);

        if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        if (alg->co.base.cra_type == &crypto_skcipher_type)
                return alg->encrypt(req);

        return crypto_lskcipher_encrypt_sg(req);
}
EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt);

/*
 * Decrypt @req.
 *
 * Return: -ENOKEY while the transform still has CRYPTO_TFM_NEED_KEY set;
 * otherwise the result of the algorithm's ->decrypt(), or of
 * crypto_lskcipher_decrypt_sg() when the algorithm was not registered with
 * crypto_skcipher_type.
 */
int crypto_skcipher_decrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg = crypto_skcipher_alg(tfm);

        if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        if (alg->co.base.cra_type == &crypto_skcipher_type)
                return alg->decrypt(req);

        return crypto_lskcipher_decrypt_sg(req);
}
EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt);

/*
 * Copy the state kept in the request context to @out.  The state is the
 * statesize bytes that start ivsize bytes past the alignmask-aligned
 * beginning of the request context.  Always returns 0.
 */
static int crypto_lskcipher_export(struct skcipher_request *req, void *out)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        u8 *base = skcipher_request_ctx(req);
        u8 *state;

        base = PTR_ALIGN(base, crypto_skcipher_alignmask(tfm) + 1);
        state = base + crypto_skcipher_ivsize(tfm);

        memcpy(out, state, crypto_skcipher_statesize(tfm));

        return 0;
}

/*
 * Load state from @in into the request context.  The state occupies the
 * statesize bytes that start ivsize bytes past the alignmask-aligned
 * beginning of the request context.  Always returns 0.
 */
static int crypto_lskcipher_import(struct skcipher_request *req, const void *in)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        u8 *base = skcipher_request_ctx(req);
        u8 *state;

        base = PTR_ALIGN(base, crypto_skcipher_alignmask(tfm) + 1);
        state = base + crypto_skcipher_ivsize(tfm);

        memcpy(state, in, crypto_skcipher_statesize(tfm));

        return 0;
}

/*
 * ->export() installed by skcipher_prepare_alg() for algorithms whose
 * statesize is zero: nothing is written to @out and 0 is returned.
 */
static int skcipher_noexport(struct skcipher_request *req, void *out)
{
        return 0;
}

/*
 * ->import() installed by skcipher_prepare_alg() for algorithms whose
 * statesize is zero: @in is ignored and 0 is returned.
 */
static int skcipher_noimport(struct skcipher_request *req, const void *in)
{
        return 0;
}

int crypto_skcipher_export(struct skcipher_request *req, void *out)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg = crypto_skcipher_alg(tfm);

        if (alg->co.base.cra_type != &crypto_skcipher_type)
                return crypto_lskcipher_export(req, out);
        return alg->export(req, out);
}
EXPORT_SYMBOL_GPL(crypto_skcipher_export);

int crypto_skcipher_import(struct skcipher_request *req, const void *in)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg = crypto_skcipher_alg(tfm);

        if (alg->co.base.cra_type != &crypto_skcipher_type)
                return crypto_lskcipher_import(req, in);
        return alg->import(req, in);
}
EXPORT_SYMBOL_GPL(crypto_skcipher_import);

static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm);
        struct skcipher_alg *alg = crypto_skcipher_alg(skcipher);

        alg->exit(skcipher);
}

/*
 * tfm init hook for crypto_skcipher_type.
 *
 * The transform is first marked as needing a key (see
 * skcipher_set_needkey()).  For an algorithm registered with
 * crypto_skcipher_type, the request size comes from the algorithm, the exit
 * hook is installed when ->exit() exists, and ->init() is called when
 * present.  For any other algorithm the request size is computed from the
 * alignmask, ivsize and statesize, and setup is finished by
 * crypto_init_lskcipher_ops_sg().
 *
 * Return: 0 or the error returned by ->init() /
 * crypto_init_lskcipher_ops_sg().
 */
static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm);
        struct skcipher_alg *alg = crypto_skcipher_alg(skcipher);
        unsigned int reqsize;

        skcipher_set_needkey(skcipher);

        if (tfm->__crt_alg->cra_type == &crypto_skcipher_type) {
                crypto_skcipher_set_reqsize(skcipher,
                                            crypto_tfm_alg_reqsize(tfm));

                if (alg->exit)
                        skcipher->base.exit = crypto_skcipher_exit_tfm;

                return alg->init ? alg->init(skcipher) : 0;
        }

        /* Alignment slack beyond the context alignment, then IV and state. */
        reqsize = (crypto_skcipher_alignmask(skcipher) &
                   ~(crypto_tfm_ctx_alignment() - 1)) +
                  crypto_skcipher_ivsize(skcipher) +
                  crypto_skcipher_statesize(skcipher);
        crypto_skcipher_set_reqsize(skcipher, reqsize);

        return crypto_init_lskcipher_ops_sg(tfm);
}

/*
 * Extra size needed per transform: crypto_alg_extsize() for algorithms
 * registered with crypto_skcipher_type, room for one crypto_lskcipher
 * pointer for all others.
 */
static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg)
{
        if (alg->cra_type == &crypto_skcipher_type)
                return crypto_alg_extsize(alg);

        return sizeof(struct crypto_lskcipher *);
}

/* Free hook: hand the enclosing skcipher_instance to its own ->free(). */
static void crypto_skcipher_free_instance(struct crypto_instance *inst)
{
        struct skcipher_instance *sinst;

        sinst = container_of(inst, struct skcipher_instance, s.base);
        sinst->free(sinst);
}

static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
{
        struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg);

        seq_printf(m, "type         : skcipher\n");
        seq_printf(m, "async        : %s\n",
                   str_yes_no(alg->cra_flags & CRYPTO_ALG_ASYNC));
        seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
        seq_printf(m, "min keysize  : %u\n", skcipher->min_keysize);
        seq_printf(m, "max keysize  : %u\n", skcipher->max_keysize);
        seq_printf(m, "ivsize       : %u\n", skcipher->ivsize);
        seq_printf(m, "chunksize    : %u\n", skcipher->chunksize);
        seq_printf(m, "walksize     : %u\n", skcipher->walksize);
        seq_printf(m, "statesize    : %u\n", skcipher->statesize);
}

/*
 * Fill a crypto_report_blkcipher for @alg and append it to @skb as a
 * CRYPTOCFGA_REPORT_BLKCIPHER attribute.
 *
 * Return: the result of nla_put().
 */
static int __maybe_unused crypto_skcipher_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct skcipher_alg *salg = __crypto_skcipher_alg(alg);
        struct crypto_report_blkcipher report;

        /*
         * Zero the whole structure first: all sizeof(report) bytes are
         * handed to nla_put() below.
         */
        memset(&report, 0, sizeof(report));

        report.blocksize = alg->cra_blocksize;
        report.min_keysize = salg->min_keysize;
        report.max_keysize = salg->max_keysize;
        report.ivsize = salg->ivsize;

        strscpy(report.type, "skcipher", sizeof(report.type));
        strscpy(report.geniv, "<none>", sizeof(report.geniv));

        return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
                       sizeof(report), &report);
}

/*
 * Type descriptor for skciphers.  Its address doubles as the marker that the
 * functions in this file compare cra_type against.  The ->show and ->report
 * hooks are only wired up when procfs / CRYPTO_USER support is built.
 */
static const struct crypto_type crypto_skcipher_type = {
        .extsize = crypto_skcipher_extsize,
        .init_tfm = crypto_skcipher_init_tfm,
        .free = crypto_skcipher_free_instance,
#ifdef CONFIG_PROC_FS
        .show = crypto_skcipher_show,
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_skcipher_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_SKCIPHER_MASK,
        .type = CRYPTO_ALG_TYPE_SKCIPHER,
        /* Offsets of the embedded base objects inside the wrapper structs. */
        .tfmsize = offsetof(struct crypto_skcipher, base),
        .algsize = offsetof(struct skcipher_alg, base),
};

/*
 * Grab the algorithm @name for @spawn on behalf of @inst, with the spawn's
 * frontend set to crypto_skcipher_type.
 *
 * Return: the result of crypto_grab_spawn().
 */
int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn,
                         struct crypto_instance *inst,
                         const char *name, u32 type, u32 mask)
{
        struct crypto_spawn *base = &spawn->base;

        base->frontend = &crypto_skcipher_type;

        return crypto_grab_spawn(base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_skcipher);

/*
 * Allocate an skcipher transform for @alg_name via crypto_alloc_tfm() with
 * crypto_skcipher_type; the result of that call is returned unchanged.
 */
struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name,
                                              u32 type, u32 mask)
{
        struct crypto_skcipher *tfm;

        tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask);

        return tfm;
}
EXPORT_SYMBOL_GPL(crypto_alloc_skcipher);

/*
 * Allocate a synchronous skcipher transform.
 *
 * CRYPTO_ALG_ASYNC and CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE are added to @mask
 * and cleared from @type before the lookup.  A transform whose request size
 * exceeds MAX_SYNC_SKCIPHER_REQSIZE triggers a WARN, is freed again and
 * reported as ERR_PTR(-EINVAL).
 *
 * Return: the transform, or the ERR_PTR() from the allocation / size check.
 */
struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(
                                const char *alg_name, u32 type, u32 mask)
{
        const u32 excluded = CRYPTO_ALG_ASYNC |
                             CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE;
        struct crypto_skcipher *tfm;

        /* Only sync algorithms allowed. */
        mask |= excluded;
        type &= ~excluded;

        tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask);
        if (IS_ERR(tfm))
                return (struct crypto_sync_skcipher *)tfm;

        /*
         * Make sure we do not allocate something that might get used with
         * an on-stack request: check the request size.
         */
        if (WARN_ON(crypto_skcipher_reqsize(tfm) >
                    MAX_SYNC_SKCIPHER_REQSIZE)) {
                crypto_free_skcipher(tfm);
                return ERR_PTR(-EINVAL);
        }

        return (struct crypto_sync_skcipher *)tfm;
}
EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher);

/*
 * Query for an skcipher algorithm matching @alg_name, @type and @mask.
 * Returns whatever crypto_type_has_alg() reports for crypto_skcipher_type.
 */
int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask)
{
        int found;

        found = crypto_type_has_alg(alg_name, &crypto_skcipher_type,
                                    type, mask);

        return found;
}
EXPORT_SYMBOL_GPL(crypto_has_skcipher);

/*
 * Validate and normalise the common part of an skcipher-style algorithm.
 *
 * Rejects (-EINVAL) an ivsize or chunksize above PAGE_SIZE / 8, a statesize
 * above PAGE_SIZE / 2, or ivsize + statesize above PAGE_SIZE / 2.  A zero
 * chunksize defaults to the block size, and the type bits are cleared from
 * cra_flags.
 *
 * Return: 0 on success, -EINVAL on a size violation.
 */
int skcipher_prepare_alg_common(struct skcipher_alg_common *alg)
{
        if (alg->ivsize > PAGE_SIZE / 8)
                return -EINVAL;
        if (alg->chunksize > PAGE_SIZE / 8)
                return -EINVAL;
        if (alg->statesize > PAGE_SIZE / 2)
                return -EINVAL;
        if ((alg->ivsize + alg->statesize) > PAGE_SIZE / 2)
                return -EINVAL;

        if (alg->chunksize == 0)
                alg->chunksize = alg->base.cra_blocksize;

        alg->base.cra_flags &= ~CRYPTO_ALG_TYPE_MASK;

        return 0;
}

/*
 * Validate and finish an skcipher_alg before registration.
 *
 * After the common checks, a walksize above PAGE_SIZE / 8 is rejected and a
 * zero walksize defaults to the chunksize.  Algorithms without state get the
 * no-op import/export handlers; algorithms with state must supply both
 * handlers themselves.  Finally the algorithm is bound to
 * crypto_skcipher_type.
 *
 * Return: 0 on success or a negative error code.
 */
static int skcipher_prepare_alg(struct skcipher_alg *alg)
{
        int err = skcipher_prepare_alg_common(&alg->co);

        if (err)
                return err;

        if (alg->walksize > PAGE_SIZE / 8)
                return -EINVAL;

        if (alg->walksize == 0)
                alg->walksize = alg->chunksize;

        if (alg->statesize) {
                if (!alg->import || !alg->export)
                        return -EINVAL;
        } else {
                alg->import = skcipher_noimport;
                alg->export = skcipher_noexport;
        }

        alg->base.cra_type = &crypto_skcipher_type;
        alg->base.cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER;

        return 0;
}

/*
 * Prepare @alg with skcipher_prepare_alg() and register it.
 *
 * Return: the preparation error if any, otherwise the result of
 * crypto_register_alg().
 */
int crypto_register_skcipher(struct skcipher_alg *alg)
{
        int err = skcipher_prepare_alg(alg);

        if (!err)
                err = crypto_register_alg(&alg->base);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_register_skcipher);

/* Unregister @alg by passing its embedded crypto_alg to crypto_unregister_alg(). */
void crypto_unregister_skcipher(struct skcipher_alg *alg)
{
        crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_skcipher);

/*
 * Register @count algorithms from the array @algs, in order.
 *
 * If one registration fails, the algorithms registered before it are
 * unregistered again in reverse order.
 *
 * Return: 0 when all registrations succeeded (including count <= 0),
 * otherwise the error of the failing registration.
 */
int crypto_register_skciphers(struct skcipher_alg *algs, int count)
{
        int ret = 0;
        int i;

        for (i = 0; i < count; i++) {
                ret = crypto_register_skcipher(&algs[i]);
                if (ret)
                        break;
        }

        if (!ret)
                return 0;

        /* Roll back entries i-1 .. 0. */
        while (--i >= 0)
                crypto_unregister_skcipher(&algs[i]);

        return ret;
}
EXPORT_SYMBOL_GPL(crypto_register_skciphers);

/* Unregister @count algorithms from @algs, last entry first. */
void crypto_unregister_skciphers(struct skcipher_alg *algs, int count)
{
        int i = count;

        while (--i >= 0)
                crypto_unregister_skcipher(&algs[i]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_skciphers);

/*
 * Prepare the algorithm of @inst and register the instance with @tmpl.
 *
 * An instance without a ->free() handler triggers a WARN and is rejected.
 *
 * Return: -EINVAL for a missing ->free(), the preparation error if any,
 * otherwise the result of crypto_register_instance().
 */
int skcipher_register_instance(struct crypto_template *tmpl,
                           struct skcipher_instance *inst)
{
        int err;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        err = skcipher_prepare_alg(&inst->alg);
        if (!err)
                err = crypto_register_instance(tmpl,
                                               skcipher_crypto_instance(inst));

        return err;
}
EXPORT_SYMBOL_GPL(skcipher_register_instance);

static int skcipher_setkey_simple(struct crypto_skcipher *tfm, const u8 *key,
                                  unsigned int keylen)
{
        struct crypto_cipher *cipher = skcipher_cipher_simple(tfm);

        crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK);
        crypto_cipher_set_flags(cipher, crypto_skcipher_get_flags(tfm) &
                                CRYPTO_TFM_REQ_MASK);
        return crypto_cipher_setkey(cipher, key, keylen);
}

/*
 * Default ->init() for simple modes: spawn the underlying cipher from the
 * instance's spawn and store it in the tfm context.
 *
 * Return: 0 on success, or the error encoded in the spawned pointer.
 */
static int skcipher_init_tfm_simple(struct crypto_skcipher *tfm)
{
        struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm);
        struct crypto_cipher_spawn *spawn;
        struct crypto_cipher *cipher;

        spawn = skcipher_instance_ctx(skcipher_alg_instance(tfm));

        cipher = crypto_spawn_cipher(spawn);
        if (IS_ERR(cipher))
                return PTR_ERR(cipher);

        ctx->cipher = cipher;

        return 0;
}

/* Default ->exit() for simple modes: free the cipher stored in the context. */
static void skcipher_exit_tfm_simple(struct crypto_skcipher *tfm)
{
        struct skcipher_ctx_simple *simple = crypto_skcipher_ctx(tfm);
        struct crypto_cipher *cipher = simple->cipher;

        crypto_free_cipher(cipher);
}

/*
 * ->free() for instances made by skcipher_alloc_instance_simple(): drop the
 * cipher spawn held in the instance context, then free the instance.
 */
static void skcipher_free_instance_simple(struct skcipher_instance *inst)
{
        struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst);

        crypto_drop_cipher(spawn);
        kfree(inst);
}

/**
 * skcipher_alloc_instance_simple - allocate instance of simple block cipher mode
 * @tmpl: the template being instantiated
 * @tb: the template parameters
 *
 * Allocate an skcipher_instance for a simple block cipher mode of operation,
 * e.g. cbc or ecb.  The instance context will have just a single crypto_spawn,
 * that for the underlying cipher.  The {min,max}_keysize, ivsize, blocksize,
 * alignmask, and priority are set from the underlying cipher but can be
 * overridden if needed.  The tfm context defaults to skcipher_ctx_simple, and
 * default ->setkey(), ->init(), and ->exit() methods are installed.
 *
 * Return: a pointer to the new instance, or an ERR_PTR().  The caller still
 *           needs to register the instance.
 */
struct skcipher_instance *skcipher_alloc_instance_simple(
        struct crypto_template *tmpl, struct rtattr **tb)
{
        u32 mask;
        struct skcipher_instance *inst;
        struct crypto_cipher_spawn *spawn;
        struct crypto_alg *cipher_alg;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask);
        if (err)
                return ERR_PTR(err);

        /* One allocation holds the instance and the cipher spawn (its context). */
        inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
        if (!inst)
                return ERR_PTR(-ENOMEM);
        spawn = skcipher_instance_ctx(inst);

        /* tb[1] names the underlying cipher. */
        err = crypto_grab_cipher(spawn, skcipher_crypto_instance(inst),
                                 crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;
        cipher_alg = crypto_spawn_cipher_alg(spawn);

        err = crypto_inst_setname(skcipher_crypto_instance(inst), tmpl->name,
                                  cipher_alg);
        if (err)
                goto err_free_inst;

        inst->free = skcipher_free_instance_simple;

        /* Default algorithm properties, can be overridden */
        inst->alg.base.cra_blocksize = cipher_alg->cra_blocksize;
        inst->alg.base.cra_alignmask = cipher_alg->cra_alignmask;
        inst->alg.base.cra_priority = cipher_alg->cra_priority;
        inst->alg.min_keysize = cipher_alg->cra_cipher.cia_min_keysize;
        inst->alg.max_keysize = cipher_alg->cra_cipher.cia_max_keysize;
        inst->alg.ivsize = cipher_alg->cra_blocksize;

        /* Use skcipher_ctx_simple by default, can be overridden */
        inst->alg.base.cra_ctxsize = sizeof(struct skcipher_ctx_simple);
        inst->alg.setkey = skcipher_setkey_simple;
        inst->alg.init = skcipher_init_tfm_simple;
        inst->alg.exit = skcipher_exit_tfm_simple;

        return inst;

err_free_inst:
        /* Drops the spawn and frees the instance. */
        skcipher_free_instance_simple(inst);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skcipher_alloc_instance_simple);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Symmetric key cipher type");
MODULE_IMPORT_NS("CRYPTO_INTERNAL");





























































































































































































































































































  167 




  165 










































































































































































































































































































































































































































































































































































































































































































































































































































































   42 
























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/timer.h>
#include <linux/acpi_pmtmr.h>
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
#include <linux/static_call.h>

#include <asm/cpuid/api.h>
#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
#include <asm/hypervisor.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
#include <asm/cpu_device_id.h>
#include <asm/i8259.h>
#include <asm/msr.h>
#include <asm/topology.h>
#include <asm/uv/uv.h>
#include <asm/sev.h>

/* CPU frequency in kHz; rewritten by recalibrate_cpu_khz() on !CONFIG_SMP builds */
unsigned int __read_mostly cpu_khz;
EXPORT_SYMBOL(cpu_khz);

/* TSC frequency in kHz; feeds the cyc2ns scale set up for the boot CPU */
unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);

#define KHZ        1000

/*
 * TSC can be unstable due to cpufreq or due to unsynced TSCs
 */
static int __read_mostly tsc_unstable;
/* Value parsed from the "tsc_early_khz" early boot parameter */
static unsigned int __initdata tsc_early_khz;

/* When enabled, native_sched_clock() converts rdtsc() instead of using jiffies */
static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);

/* Set to 1 by the "tsc=reliable" boot option */
int tsc_clocksource_reliable;

/* Set to 1 by the "tsc=recalibrate" boot option */
static int __read_mostly tsc_force_recalibrate;

static struct clocksource_base art_base_clk = {
        .id    = CSID_X86_ART,
};
static bool have_art;

/*
 * Per-CPU cycles -> nanoseconds conversion state: two copies of the
 * parameters plus the latch sequence count that selects between them
 * (see __cyc2ns_read() and __set_cyc2ns_scale()).
 */
struct cyc2ns {
        struct cyc2ns_data data[2];        /*  0 + 2*16 = 32 */
        seqcount_latch_t   seq;                /* 32 + 4    = 36 */

}; /* fits one cacheline */

static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);

/*
 * Early-param handler for "tsc_early_khz": parse @buf as an unsigned
 * integer (base auto-detected) into tsc_early_khz and hand back the
 * kstrtouint() status unchanged.
 */
static int __init tsc_early_khz_setup(char *buf)
{
        int status = kstrtouint(buf, 0, &tsc_early_khz);

        return status;
}
early_param("tsc_early_khz", tsc_early_khz_setup);

/*
 * Snapshot this CPU's cyc2ns conversion parameters into @data.
 *
 * The low bit of the latch sequence count selects which of the two
 * per-CPU copies is read. If the sequence count changed while the fields
 * were being copied, the whole read is retried.
 */
__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
{
        int seq, idx;

        do {
                seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
                /* Even sequence -> data[0], odd sequence -> data[1] */
                idx = seq & 1;

                data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
                data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
                data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

        } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
}

/*
 * Disable preemption and snapshot this CPU's cyc2ns parameters into @data.
 * Preemption is still disabled on return; pair with cyc2ns_read_end().
 */
__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
{
        preempt_disable_notrace();
        __cyc2ns_read(data);
}

/*
 * Re-enable preemption; counterpart of cyc2ns_read_begin().
 */
__always_inline void cyc2ns_read_end(void)
{
        preempt_enable_notrace();
}

/*
 * Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 *  basic equation:
 *              ns = cycles / (freq / ns_per_sec)
 *              ns = cycles * (ns_per_sec / freq)
 *              ns = cycles * (10^9 / (cpu_khz * 10^3))
 *              ns = cycles * (10^6 / cpu_khz)
 *
 *      Then we use scaling math (suggested by george@mvista.com) to get:
 *              ns = cycles * (10^6 * SC / cpu_khz) / SC
 *              ns = cycles * cyc2ns_scale / SC
 *
 *      And since SC is a constant power of two, we can convert the div
 *  into a shift. The larger SC is, the more accurate the conversion, but
 *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
 *  (64-bit result) can be used.
 *
 *  We can use khz divisor instead of mhz to keep a better precision.
 *  (mathieu.desnoyers@polymtl.ca)
 *
 *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */

static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
        struct cyc2ns_data data;
        unsigned long long ns;

        __cyc2ns_read(&data);

        ns = data.cyc2ns_offset;
        ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);

        return ns;
}

/*
 * Preemption-safe wrapper around __cycles_2_ns(): the conversion runs
 * with preemption disabled (notrace variants).
 */
static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
        unsigned long long nsec;

        preempt_disable_notrace();
        nsec = __cycles_2_ns(cyc);
        preempt_enable_notrace();

        return nsec;
}

/*
 * Install a new cycles -> ns scale for @cpu.
 *
 * @khz:     frequency the new multiplier/shift pair is computed for
 * @cpu:     CPU whose per-CPU cyc2ns state is updated
 * @tsc_now: cycle count at which the old and new conversions must agree
 *
 * The offset is chosen so that converting @tsc_now with the new parameters
 * yields the same nanosecond value as the current parameters do.
 */
static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
        unsigned long long ns_now;
        struct cyc2ns_data data;
        struct cyc2ns *c2n;

        /* ns value of @tsc_now under the parameters currently in effect */
        ns_now = cycles_2_ns(tsc_now);

        /*
         * Compute a new multiplier as per the above comment and ensure our
         * time function is continuous; see the comment near struct
         * cyc2ns_data.
         */
        clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
                               NSEC_PER_MSEC, 0);

        /*
         * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
         * not expected to be greater than 31 due to the original published
         * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
         * value) - refer perf_event_mmap_page documentation in perf_event.h.
         */
        if (data.cyc2ns_shift == 32) {
                data.cyc2ns_shift = 31;
                data.cyc2ns_mul >>= 1;
        }

        data.cyc2ns_offset = ns_now -
                mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);

        c2n = per_cpu_ptr(&cyc2ns, cpu);

        /*
         * Latch update: both copies receive the same new parameters, with a
         * sequence step between the two stores.
         */
        write_seqcount_latch_begin(&c2n->seq);
        c2n->data[0] = data;
        write_seqcount_latch(&c2n->seq);
        c2n->data[1] = data;
        write_seqcount_latch_end(&c2n->seq);
}

/*
 * Update the cyc2ns scale of @cpu for frequency @khz with interrupts
 * disabled, bracketed by the sched_clock idle sleep/wakeup events.
 * A @khz of zero leaves the current scale in place.
 */
static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        sched_clock_idle_sleep_event();

        if (khz != 0)
                __set_cyc2ns_scale(khz, cpu, tsc_now);

        sched_clock_idle_wakeup_event();
        local_irq_restore(irqflags);
}

/*
 * Set up cyc2ns on the CPU this runs on (the boot CPU): initialise its
 * latch sequence count, then install a scale derived from tsc_khz at the
 * current TSC value.
 */
static void __init cyc2ns_init_boot_cpu(void)
{
        struct cyc2ns *boot_c2n = this_cpu_ptr(&cyc2ns);
        int boot_cpu = smp_processor_id();

        seqcount_latch_init(&boot_c2n->seq);
        __set_cyc2ns_scale(tsc_khz, boot_cpu, rdtsc());
}

/*
 * Secondary CPUs do not run through tsc_init(), so set up
 * all the scale factors for all CPUs, assuming the same
 * speed as the bootup CPU.
 *
 * Both data copies of the calling CPU are copied to every other possible
 * CPU. Each target CPU's own latch sequence count is initialised before
 * its data is written: the per-CPU pointer must be looked up first, so
 * that the init hits the CPU being set up and not the calling CPU or the
 * CPU handled in the previous iteration.
 */
static void __init cyc2ns_init_secondary_cpus(void)
{
        unsigned int cpu, this_cpu = smp_processor_id();
        struct cyc2ns_data *data = this_cpu_ptr(&cyc2ns)->data;
        struct cyc2ns *c2n;

        for_each_possible_cpu(cpu) {
                /* The calling CPU already holds the reference values */
                if (cpu == this_cpu)
                        continue;

                c2n = per_cpu_ptr(&cyc2ns, cpu);
                seqcount_latch_init(&c2n->seq);
                c2n->data[0] = data[0];
                c2n->data[1] = data[1];
        }
}

/*
 * Scheduler clock - returns current time in nanosec units.
 */
noinstr u64 native_sched_clock(void)
{
        /* TSC path: convert the current cycle count straight to ns */
        if (static_branch_likely(&__use_tsc))
                return __cycles_2_ns(rdtsc());

        /*
         * No TSC available, so fall back to jiffies.
         *
         * Note that a TSC merely marked unstable is still used above:
         * unlike Time Of Day, the scheduler clock tolerates small errors,
         * and it matters a lot that it is as fast as the platform allows.
         *
         * jiffies_64 is read without locking; a rare wrong value is not
         * a big deal here.
         */
        return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}

/*
 * Produce a sched_clock value (ns) from a TSC value the caller
 * already has in hand.
 */
u64 native_sched_clock_from_tsc(u64 tsc)
{
        u64 nsec = cycles_2_ns(tsc);

        return nsec;
}

/*
 * We need to define a real function for sched_clock, to override the
 * weak default version.
 */
#ifdef CONFIG_PARAVIRT
/* Paravirt builds obtain the clock through paravirt_sched_clock() */
noinstr u64 sched_clock_noinstr(void)
{
        return paravirt_sched_clock();
}

/* True while the pv_sched_clock static call still targets native_sched_clock() */
bool using_native_sched_clock(void)
{
        return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
/* Without paravirt, sched_clock_noinstr() is an alias of native_sched_clock() */
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));

bool using_native_sched_clock(void) { return true; }
#endif

/*
 * sched_clock() proper: read sched_clock_noinstr() with preemption
 * disabled (notrace variants) and return that value.
 */
notrace u64 sched_clock(void)
{
        u64 nsec;

        preempt_disable_notrace();
        nsec = sched_clock_noinstr();
        preempt_enable_notrace();

        return nsec;
}

/*
 * Report the current value of the file-local tsc_unstable flag
 * (non-zero once the TSC has been flagged unstable).
 */
int check_tsc_unstable(void)
{
        return tsc_unstable;
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);

#ifdef CONFIG_X86_TSC
/*
 * "notsc" boot parameter with CONFIG_X86_TSC: the TSC is not disabled,
 * it is only marked unstable.
 */
int __init notsc_setup(char *str)
{
        mark_tsc_unstable("boot parameter notsc");
        return 1;
}
#else
/*
 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
 * in cpu/common.c
 */
int __init notsc_setup(char *str)
{
        setup_clear_cpu_cap(X86_FEATURE_TSC);
        return 1;
}
#endif

__setup("notsc", notsc_setup);

/* Flags controlled by the "tsc=" boot parameter, see tsc_setup() */
static int no_sched_irq_time;
static int no_tsc_watchdog;
static int tsc_as_watchdog;

/*
 * Handle one "tsc=<option>" boot argument.
 *
 * Recognised options: "reliable", "noirqtime" (matched as a 9-character
 * prefix), "unstable", "nowatchdog", "recalibrate" and "watchdog". They
 * are mutually exclusive, so at most one branch below fires. Between
 * "watchdog" and "nowatchdog", "nowatchdog" always wins, and an alert is
 * printed when the two conflict. Unknown options are silently accepted.
 *
 * Always returns 1.
 */
static int __init tsc_setup(char *str)
{
        if (!strcmp(str, "reliable")) {
                tsc_clocksource_reliable = 1;
        } else if (!strncmp(str, "noirqtime", 9)) {
                no_sched_irq_time = 1;
        } else if (!strcmp(str, "unstable")) {
                mark_tsc_unstable("boot parameter");
        } else if (!strcmp(str, "nowatchdog")) {
                if (tsc_as_watchdog)
                        pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
                                 __func__);
                no_tsc_watchdog = 1;
                tsc_as_watchdog = 0;
        } else if (!strcmp(str, "recalibrate")) {
                tsc_force_recalibrate = 1;
        } else if (!strcmp(str, "watchdog")) {
                if (no_tsc_watchdog)
                        pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
                                 __func__);
                else
                        tsc_as_watchdog = 1;
        }

        return 1;
}

__setup("tsc=", tsc_setup);

#define MAX_RETRIES                5
#define TSC_DEFAULT_THRESHOLD        0x20000

/*
 * Read the TSC together with a reference counter, rejecting disturbed
 * samples.
 *
 * @p:    receives the reference value: the HPET counter masked to 32 bits
 *        when @hpet is non-zero, the early ACPI PM timer read otherwise
 * @hpet: selects the reference source
 *
 * The reference read is bracketed by two TSC reads. A sample is accepted
 * when the two TSC reads are less than the threshold apart (tsc_khz >> 5
 * once tsc_khz is known, TSC_DEFAULT_THRESHOLD before that).
 *
 * Returns the second TSC read of the accepted sample, or ULLONG_MAX when
 * MAX_RETRIES attempts were all disturbed.
 */
static u64 tsc_read_refs(u64 *p, int hpet)
{
        u64 before, after, limit;
        int attempt;

        if (tsc_khz)
                limit = tsc_khz >> 5;
        else
                limit = TSC_DEFAULT_THRESHOLD;

        for (attempt = 0; attempt < MAX_RETRIES; attempt++) {
                before = get_cycles();
                if (hpet)
                        *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
                else
                        *p = acpi_pm_read_early();
                after = get_cycles();

                if (after - before < limit)
                        return after;
        }

        return ULLONG_MAX;
}

/*
 * Calculate the TSC frequency from HPET reference
 *
 * @deltatsc: TSC delta, pre-scaled by the caller
 * @hpet1:    HPET counter value at the start of the interval
 * @hpet2:    HPET counter value at the end of the interval
 *
 * The HPET delta is multiplied by the HPET_PERIOD register value and
 * divided by 1000000; @deltatsc divided by that is the result.
 */
static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
{
        u64 elapsed;

        /* The end value is below the start value: undo one 32-bit wrap */
        if (hpet2 < hpet1)
                hpet2 += 0x100000000ULL;

        elapsed = (hpet2 - hpet1) * hpet_readl(HPET_PERIOD);
        do_div(elapsed, 1000000);

        return (unsigned long) div64_u64(deltatsc, elapsed);
}

/*
 * Calculate the TSC frequency from PMTimer reference
 *
 * @deltatsc: TSC delta, pre-scaled by the caller
 * @pm1:      PM timer value at the start of the interval
 * @pm2:      PM timer value at the end of the interval
 *
 * Returns ULONG_MAX when both timer reads are zero; otherwise @deltatsc
 * divided by the elapsed PM timer time in nanoseconds.
 */
static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
{
        u64 elapsed_ns;

        if (pm1 == 0 && pm2 == 0)
                return ULONG_MAX;

        /* The end value is below the start value: undo one timer overrun */
        if (pm2 < pm1)
                pm2 += (u64)ACPI_PM_OVRRUN;

        elapsed_ns = (pm2 - pm1) * 1000000000LL;
        do_div(elapsed_ns, PMTMR_TICKS_PER_SEC);
        do_div(deltatsc, elapsed_ns);

        return (unsigned long) deltatsc;
}

/* Default PIT calibration window: 10ms, at least 1000 PIT polls expected */
#define CAL_MS                10
#define CAL_LATCH        (PIT_TICK_RATE / (1000 / CAL_MS))
#define CAL_PIT_LOOPS        1000

/* Longer PIT calibration window: 50ms, at least 5000 PIT polls expected */
#define CAL2_MS                50
#define CAL2_LATCH        (PIT_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS        5000


/*
 * Try to calibrate the TSC against the Programmable
 * Interrupt Timer and return the frequency of the TSC
 * in kHz.
 *
 * @latch:   countdown value loaded into PIT channel 2
 * @ms:      duration in milliseconds that @latch corresponds to
 * @loopmin: minimum number of PIT polls required for a valid result
 *
 * Return ULONG_MAX on failure to calibrate.
 */
static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
{
        u64 tsc, t1, t2, delta;
        unsigned long tscmin, tscmax;
        int pitcnt;

        if (!has_legacy_pic()) {
                /*
                 * Relies on tsc_early_delay_calibrate() to have given us semi
                 * usable udelay(), wait for the same 50ms we would have with
                 * the PIT loop below.
                 */
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                return ULONG_MAX;
        }

        /* Set the Gate high, disable speaker */
        outb((inb(0x61) & ~0x02) | 0x01, 0x61);

        /*
         * Setup CTC channel 2* for mode 0, (interrupt on terminal
         * count mode), binary count. Load the latch register with
         * @latch (LSB then MSB) to begin countdown.
         */
        outb(0xb0, 0x43);
        outb(latch & 0xff, 0x42);
        outb(latch >> 8, 0x42);

        tsc = t1 = t2 = get_cycles();

        pitcnt = 0;
        tscmax = 0;
        tscmin = ULONG_MAX;
        /* Poll until bit 5 of port 0x61 gets set, timing every poll */
        while ((inb(0x61) & 0x20) == 0) {
                t2 = get_cycles();
                delta = t2 - tsc;
                tsc = t2;
                /* Track the shortest and longest TSC delta between polls */
                if ((unsigned long) delta < tscmin)
                        tscmin = (unsigned int) delta;
                if ((unsigned long) delta > tscmax)
                        tscmax = (unsigned int) delta;
                pitcnt++;
        }

        /*
         * Sanity checks:
         *
         * If we were not able to read the PIT more than loopmin
         * times, then we have been hit by a massive SMI
         *
         * If the maximum is 10 times larger than the minimum,
         * then we got hit by an SMI as well.
         */
        if (pitcnt < loopmin || tscmax > 10 * tscmin)
                return ULONG_MAX;

        /* TSC cycles elapsed over the window, divided by @ms, gives kHz */
        delta = t2 - t1;
        do_div(delta, ms);
        return delta;
}

/*
 * This reads the current MSB of the PIT counter, and
 * checks if we are running on sufficiently fast and
 * non-virtualized hardware.
 *
 * Our expectations are:
 *
 *  - the PIT is running at roughly 1.19MHz
 *
 *  - each IO is going to take about 1us on real hardware,
 *    but we allow it to be much faster (by a factor of 10) or
 *    _slightly_ slower (i.e. we allow up to a 2us read+counter
 *    update) - anything else implies an unacceptably slow CPU
 *    or PIT for the fast calibration to work.
 *
 *  - with 256 PIT ticks to read the value, we have 214us to
 *    see the same MSB (and overhead like doing a single TSC
 *    read per MSB value etc).
 *
 *  - We're doing 2 reads per loop (LSB, MSB), and we expect
 *    them each to take about a microsecond on real hardware.
 *    So we expect a count value of around 100. But we'll be
 *    generous, and accept anything over 50.
 *
 *  - if the PIT is stuck, and we see *many* more reads, we
 *    return early (and the next caller of pit_expect_msb()
 *    then consider it a failure when they don't see the
 *    next expected value).
 *
 * These expectations mean that we know that we have seen the
 * transition from one expected value to another with a fairly
 * high accuracy, and we didn't miss any events. We can thus
 * use the TSC value at the transitions to calculate a pretty
 * good value for the TSC frequency.
 */
/*
 * Read the 16-bit PIT counter from port 0x42 (LSB first, then MSB) and
 * return non-zero when the MSB equals @val.
 */
static inline int pit_verify_msb(unsigned char val)
{
        /* Ignore LSB */
        inb(0x42);
        return inb(0x42) == val;
}

/*
 * Poll the PIT for as long as its MSB still reads @val (at most 50000
 * polls), taking a TSC reading after every successful poll.
 *
 * @val:    MSB value expected from the PIT
 * @tscp:   receives the TSC reading taken after the last successful poll
 *          (0 if there was none)
 * @deltap: receives the distance from the second-to-last of those TSC
 *          readings to a fresh TSC read taken after the loop; the caller
 *          uses it as an error term
 *
 * Returns non-zero when more than 5 polls saw @val.
 */
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
        int count;
        u64 tsc = 0, prev_tsc = 0;

        for (count = 0; count < 50000; count++) {
                if (!pit_verify_msb(val))
                        break;
                prev_tsc = tsc;
                tsc = get_cycles();
        }
        *deltap = get_cycles() - prev_tsc;
        *tscp = tsc;

        /*
         * We require _some_ success, but the quality control
         * will be based on the error terms on the TSC values.
         */
        return count > 5;
}

/*
 * How many MSB values do we want to see? We aim for
 * a maximum error rate of 500ppm (in practice the
 * real error is much smaller), but refuse to spend
 * more than 50ms on it.
 */
#define MAX_QUICK_PIT_MS 50
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)

/*
 * Fast TSC calibration against PIT channel 2.
 *
 * Watches the MSB of the PIT counter step down from 0xff and uses the
 * TSC readings taken at those steps. Succeeds once the combined error
 * terms of the first and the latest step drop below delta >> 11.
 *
 * Returns the TSC frequency in kHz, or 0 when there is no legacy PIC or
 * the calibration failed.
 */
static unsigned long quick_pit_calibrate(void)
{
        int i;
        u64 tsc, delta;
        unsigned long d1, d2;

        if (!has_legacy_pic())
                return 0;

        /* Set the Gate high, disable speaker */
        outb((inb(0x61) & ~0x02) | 0x01, 0x61);

        /*
         * Counter 2, mode 0 (one-shot), binary count
         *
         * NOTE! Mode 2 decrements by two (and then the
         * output is flipped each time, giving the same
         * final output frequency as a decrement-by-one),
         * so mode 0 is much better when looking at the
         * individual counts.
         */
        outb(0xb0, 0x43);

        /* Start at 0xffff */
        outb(0xff, 0x42);
        outb(0xff, 0x42);

        /*
         * The PIT starts counting at the next edge, so we
         * need to delay for a microsecond. The easiest way
         * to do that is to just read back the 16-bit counter
         * once from the PIT.
         */
        pit_verify_msb(0);

        if (pit_expect_msb(0xff, &tsc, &d1)) {
                for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
                        if (!pit_expect_msb(0xff-i, &delta, &d2))
                                break;

                        /* TSC cycles elapsed since the MSB was last seen at 0xff */
                        delta -= tsc;

                        /*
                         * Extrapolate the error and fail fast if the error will
                         * never be below 500 ppm.
                         */
                        if (i == 1 &&
                            d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
                                return 0;

                        /*
                         * Iterate until the error is less than 500 ppm
                         */
                        if (d1+d2 >= delta >> 11)
                                continue;

                        /*
                         * Check the PIT one more time to verify that
                         * all TSC reads were stable wrt the PIT.
                         *
                         * This also guarantees serialization of the
                         * last cycle read ('d2') in pit_expect_msb.
                         */
                        if (!pit_verify_msb(0xfe - i))
                                break;
                        goto success;
                }
        }
        pr_info("Fast TSC calibration failed\n");
        return 0;

success:
        /*
         * Ok, if we get here, then we've seen the
         * MSB of the PIT decrement 'i' times, and the
         * error has shrunk to less than 500 ppm.
         *
         * As a result, we can depend on there not being
         * any odd delays anywhere, and the TSC reads are
         * reliable (within the error).
         *
         * kHz = ticks / time-in-seconds / 1000;
         * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
         * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
         */
        delta *= PIT_TICK_RATE;
        do_div(delta, i*256*1000);
        pr_info("Fast TSC calibration using PIT\n");
        return delta;
}

/**
 * native_calibrate_tsc - determine TSC frequency
 *
 * Determine the TSC frequency via CPUID on Intel CPUs: the TSC/crystal
 * ratio from CPUID_LEAF_TSC combined with the crystal clock, which is
 * either reported, hardcoded (Denverton) or derived from CPUID_LEAF_FREQ.
 *
 * Side effects: may force X86_FEATURE_TSC_KNOWN_FREQ and
 * X86_FEATURE_TSC_RELIABLE, and with CONFIG_X86_LOCAL_APIC sets
 * lapic_timer_period.
 *
 * Return: TSC frequency in kHz, or 0 if it cannot be determined.
 */
unsigned long native_calibrate_tsc(void)
{
        unsigned int eax_denominator = 0, ebx_numerator = 0;
        unsigned int ecx_hz = 0, edx = 0;
        unsigned int crystal_khz;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
            boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
                return 0;

        /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
        cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);

        if (!ebx_numerator || !eax_denominator)
                return 0;

        crystal_khz = ecx_hz / 1000;

        /*
         * Denverton SoCs report no crystal clock and lack CPUID_LEAF_FREQ
         * for the derivation further down, so their 25MHz crystal is
         * hardcoded.
         */
        if (!crystal_khz && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
                crystal_khz = 25000;

        /*
         * A crystal clock known at this point makes the TSC frequency a
         * "hardware reported" one, the most accurate we have so far, so
         * it counts as a known frequency. This must stay ahead of the
         * derivation below, which does not earn that flag.
         */
        if (crystal_khz)
                setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);

        /*
         * Some Intel SoCs like Skylake and Kabylake don't report the
         * crystal clock; derive it with high accuracy from the crystal
         * ratio and the CPU base frequency.
         */
        if (!crystal_khz && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
                unsigned int base_mhz, unused_ebx, unused_ecx, unused_edx;

                cpuid(CPUID_LEAF_FREQ, &base_mhz, &unused_ebx, &unused_ecx,
                      &unused_edx);
                crystal_khz = base_mhz * 1000 *
                        eax_denominator / ebx_numerator;
        }

        if (!crystal_khz)
                return 0;

        /*
         * On Atom SoCs the TSC is the only reliable clocksource, so mark
         * it reliable and keep the watchdog off it.
         */
        if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
                setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);

#ifdef CONFIG_X86_LOCAL_APIC
        /*
         * The local APIC appears to be fed by the core crystal clock
         * (which sounds entirely sensible). Setting the global
         * lapic_timer_period here avoids calibrating the APIC timer
         * later.
         */
        lapic_timer_period = crystal_khz * 1000 / HZ;
#endif

        return crystal_khz * ebx_numerator / eax_denominator;
}

/*
 * Read the CPU base frequency from CPUID_LEAF_FREQ on Intel CPUs.
 *
 * Returns the reported base MHz value times 1000 (i.e. kHz), or 0 when
 * the CPU is not Intel or does not implement that leaf.
 */
static unsigned long cpu_khz_from_cpuid(void)
{
        unsigned int eax_base_mhz = 0, ebx_max_mhz = 0;
        unsigned int ecx_bus_mhz = 0, edx = 0;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
            boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
                return 0;

        cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);

        return eax_base_mhz * 1000;
}

/*
 * calibrate cpu using pit, hpet, and ptimer methods. They are available
 * later in boot after acpi is initialized.
 *
 * Returns the calibrated frequency in kHz, or 0 when neither the PIT nor
 * a reference counter (HPET/PMTIMER) produced a usable value.
 */
static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
{
        u64 tsc1, tsc2, delta, ref1, ref2;
        unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
        unsigned long flags, latch, ms;
        int hpet = is_hpet_enabled(), i, loopmin;

        /*
         * Run up to 3 calibration loops to get the lowest frequency value
         * (the best estimate). We use two different calibration modes
         * here:
         *
         * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
         * load a timeout of CAL_MS (10ms), or CAL2_MS (50ms) once the
         * longer timeframe was selected. We read the time right after we
         * started the timer and wait until the PIT count down reaches
         * zero. In each wait loop iteration we read the TSC and check
         * the delta to the previous read. We keep track of the min
         * and max values of that delta. The delta is mostly defined
         * by the IO time of the PIT access, so we can detect when
         * any disturbance happened between the two reads. If the
         * maximum time is significantly larger than the minimum time,
         * then we discard the result and have another try.
         *
         * 2) Reference counter. If available we use the HPET or the
         * PMTIMER as a reference to check the sanity of that value.
         * We use separate TSC readouts and check inside of the
         * reference read for any possible disturbance. We discard
         * disturbed values here as well. We do that around the PIT
         * calibration delay loop as we have to wait for a certain
         * amount of time anyway.
         */

        /* Preset PIT loop values */
        latch = CAL_LATCH;
        ms = CAL_MS;
        loopmin = CAL_PIT_LOOPS;

        for (i = 0; i < 3; i++) {
                unsigned long tsc_pit_khz;

                /*
                 * Read the start value and the reference count of
                 * hpet/pmtimer when available. Then do the PIT
                 * calibration, which waits for the programmed latch
                 * time, and read the end value.
                 */
                local_irq_save(flags);
                tsc1 = tsc_read_refs(&ref1, hpet);
                tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
                tsc2 = tsc_read_refs(&ref2, hpet);
                local_irq_restore(flags);

                /* Pick the lowest PIT TSC calibration so far */
                tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);

                /* hpet or pmtimer available ? */
                if (ref1 == ref2)
                        continue;

                /* Check, whether the sampling was disturbed */
                if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
                        continue;

                /* Pre-scale the TSC delta for the reference helpers below */
                tsc2 = (tsc2 - tsc1) * 1000000LL;
                if (hpet)
                        tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
                else
                        tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);

                tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);

                /* Check the reference deviation */
                delta = ((u64) tsc_pit_min) * 100;
                do_div(delta, tsc_ref_min);

                /*
                 * If both calibration results are inside a 10% window
                 * then we can be sure, that the calibration
                 * succeeded. We break out of the loop right away. We
                 * use the reference value, as it is more precise.
                 */
                if (delta >= 90 && delta <= 110) {
                        pr_info("PIT calibration matches %s. %d loops\n",
                                hpet ? "HPET" : "PMTIMER", i + 1);
                        return tsc_ref_min;
                }

                /*
                 * Check whether PIT failed more than once. This
                 * happens in virtualized environments. We need to
                 * give the virtual PC a slightly longer timeframe for
                 * the HPET/PMTIMER to make the result precise.
                 */
                if (i == 1 && tsc_pit_min == ULONG_MAX) {
                        latch = CAL2_LATCH;
                        ms = CAL2_MS;
                        loopmin = CAL2_PIT_LOOPS;
                }
        }

        /*
         * Now check the results.
         */
        if (tsc_pit_min == ULONG_MAX) {
                /* PIT gave no useful value */
                pr_warn("Unable to calibrate against PIT\n");

                /* We don't have an alternative source, disable TSC */
                if (!hpet && !ref1 && !ref2) {
                        pr_notice("No reference (HPET/PMTIMER) available\n");
                        return 0;
                }

                /* The alternative source failed as well, disable TSC */
                if (tsc_ref_min == ULONG_MAX) {
                        pr_warn("HPET/PMTIMER calibration failed\n");
                        return 0;
                }

                /* Use the alternative source */
                pr_info("using %s reference calibration\n",
                        hpet ? "HPET" : "PMTIMER");

                return tsc_ref_min;
        }

        /* We don't have an alternative source, use the PIT calibration value */
        if (!hpet && !ref1 && !ref2) {
                pr_info("Using PIT calibration value\n");
                return tsc_pit_min;
        }

        /* The alternative source failed, use the PIT calibration value */
        if (tsc_ref_min == ULONG_MAX) {
                pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
                return tsc_pit_min;
        }

        /*
         * The calibration values differ too much. In doubt, we use
         * the PIT value as we know that there are PMTIMERs around
         * running at double speed. At least we let the user know:
         */
        pr_warn("PIT calibration deviates from %s: %lu %lu\n",
                hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
        pr_info("Using PIT calibration value\n");
        return tsc_pit_min;
}

/**
 * native_calibrate_cpu_early - can calibrate the cpu early in boot
 *
 * Tries, in order, CPUID, MSR and the quick PIT calibration (the latter
 * with interrupts disabled) and stops at the first non-zero result.
 *
 * Return: the frequency found, or 0 if every method failed.
 */
unsigned long native_calibrate_cpu_early(void)
{
        unsigned long flags, khz;

        khz = cpu_khz_from_cpuid();
        if (khz)
                return khz;

        khz = cpu_khz_from_msr();
        if (khz)
                return khz;

        local_irq_save(flags);
        khz = quick_pit_calibrate();
        local_irq_restore(flags);

        return khz;
}


/**
 * native_calibrate_cpu - calibrate the CPU, with a slow-path fallback
 *
 * Uses native_calibrate_cpu_early() first; only when that yields 0 is
 * pit_hpet_ptimer_calibrate_cpu() consulted.
 *
 * Return: the first non-zero result, or 0 if both returned 0.
 */
static unsigned long native_calibrate_cpu(void)
{
        unsigned long khz = native_calibrate_cpu_early();

        return khz ? khz : pit_hpet_ptimer_calibrate_cpu();
}

void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
        unsigned long cpu_khz_old = cpu_khz;

        if (!boot_cpu_has(X86_FEATURE_TSC))
                return;

        cpu_khz = x86_platform.calibrate_cpu();
        tsc_khz = x86_platform.calibrate_tsc();
        if (tsc_khz == 0)
                tsc_khz = cpu_khz;
        else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
                cpu_khz = tsc_khz;
        cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
                                                    cpu_khz_old, cpu_khz);
#endif
}
EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);


/* sched_clock() value recorded by tsc_save_sched_clock_state(). */
static unsigned long long cyc2ns_suspend;

/*
 * Record the current sched_clock() value in cyc2ns_suspend. Nothing is
 * recorded when the TSC is not in use and sched_clock is not stable.
 */
void tsc_save_sched_clock_state(void)
{
        if (static_branch_likely(&__use_tsc) || sched_clock_stable())
                cyc2ns_suspend = sched_clock();
}

/*
 * Even on processors with invariant TSC, the TSC gets reset in some of the
 * ACPI system sleep states. And on some systems the BIOS seems to reinit
 * the TSC to an arbitrary value (still sync'd across cpu's) during resume
 * from such sleep states. To cope with this, recompute the cyc2ns_offset
 * for each cpu so that sched_clock() continues from the point where it was
 * left off during suspend.
 *
 * Does nothing when the TSC is not in use and sched_clock is not stable
 * (the same condition under which tsc_save_sched_clock_state() skips the
 * save).
 */
void tsc_restore_sched_clock_state(void)
{
        unsigned long long offset;
        unsigned long flags;
        int cpu;

        if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
                return;

        local_irq_save(flags);

        /*
         * We're coming out of suspend, there's no concurrency yet; don't
         * bother being nice about the RCU stuff, just write to both
         * data fields.
         */

        /*
         * Zero this CPU's offsets before sampling sched_clock() below, so
         * the sample is presumably taken without any stale offset applied
         * (sched_clock() itself is not visible here - confirm).
         */
        this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
        this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);

        /* Difference between the saved value and the value read now. */
        offset = cyc2ns_suspend - sched_clock();

        /* Apply the same offset to both data slots of every possible CPU. */
        for_each_possible_cpu(cpu) {
                per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
                per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
        }

        local_irq_restore(flags);
}

#ifdef CONFIG_CPU_FREQ
/*
 * Frequency scaling support. Adjust the TSC based timer when the CPU frequency
 * changes.
 *
 * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
 * as unstable and give up in those cases.
 *
 * Should fix up last_tsc too. Currently gettimeofday in the
 * first tick after the change will be slightly wrong.
 */

/*
 * Reference values captured by time_cpufreq_notifier() on its first
 * invocation (while ref_freq is still 0); later scaling is done relative
 * to them.
 */
static unsigned int  ref_freq;
static unsigned long loops_per_jiffy_ref;
static unsigned long tsc_khz_ref;

static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                                void *data)
{
        struct cpufreq_freqs *freq = data;

        if (num_online_cpus() > 1) {
                mark_tsc_unstable("cpufreq changes on SMP");
                return 0;
        }

        if (!ref_freq) {
                ref_freq = freq->old;
                loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
                tsc_khz_ref = tsc_khz;
        }

        if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
            (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
                boot_cpu_data.loops_per_jiffy =
                        cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);

                tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
                if (!(freq->flags & CPUFREQ_CONST_LOOPS))
                        mark_tsc_unstable("cpufreq changes");

                set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
        }

        return 0;
}

/* Notifier block that routes cpufreq transitions to time_cpufreq_notifier(). */
static struct notifier_block time_cpufreq_notifier_block = {
        .notifier_call = time_cpufreq_notifier,
};

/*
 * Register the cpufreq transition notifier, but only when the boot CPU has
 * a TSC that is not flagged as constant. Always returns 0.
 */
static int __init cpufreq_register_tsc_scaling(void)
{
        if (boot_cpu_has(X86_FEATURE_TSC) &&
            !boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                cpufreq_register_notifier(&time_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);

        return 0;
}

core_initcall(cpufreq_register_tsc_scaling);

#endif /* CONFIG_CPU_FREQ */

/* Smallest denominator reported by CPUID for which ART is enabled. */
#define ART_MIN_DENOMINATOR (1)

/*
 * If ART is present, detect the numerator:denominator to convert to TSC.
 *
 * On success art_base_clk is filled in (ratio, frequency and the current
 * MSR_IA32_TSC_ADJUST value as offset) and X86_FEATURE_ART is forced on.
 */
static void __init detect_art(void)
{
        unsigned int edx_unused;

        if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
                return;

        /*
         * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required,
         * and the TSC counter resets must not occur asynchronously.
         */
        if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
                return;
        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                return;
        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return;
        if (tsc_async_resets)
                return;

        cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator,
              &art_base_clk.numerator, &art_base_clk.freq_khz, &edx_unused);

        /* The third CPUID output is scaled down by KHZ before it is kept. */
        art_base_clk.freq_khz /= KHZ;

        if (art_base_clk.denominator < ART_MIN_DENOMINATOR)
                return;

        rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset);

        /* Make this sticky over multiple CPU init calls */
        setup_force_cpu_cap(X86_FEATURE_ART);
}


/* clocksource code */

/*
 * .resume callback of the TSC clocksources: re-verify TSC_ADJUST via
 * tsc_verify_tsc_adjust(true). @cs is unused.
 */
static void tsc_resume(struct clocksource *cs)
{
        tsc_verify_tsc_adjust(true);
}

/*
 * We used to compare the TSC to the cycle_last value in the clocksource
 * structure to avoid a nasty time-warp. This can be observed in a
 * very small window right after one CPU updated cycle_last under
 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
 * is smaller than the cycle_last reference value due to a TSC which
 * is slightly behind. This delta is nowhere else observable, but in
 * that case it results in a forward time jump in the range of hours
 * due to the unsigned delta calculation of the time keeping core
 * code, which is necessary to support wrapping clocksources like pm
 * timer.
 *
 * This sanity check is now done in the core timekeeping code, by
 * checking the result of read_tsc() - cycle_last for being negative.
 * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
 *
 * The callback itself simply returns rdtsc_ordered(); @cs is unused.
 */
static u64 read_tsc(struct clocksource *cs)
{
        return (u64)rdtsc_ordered();
}

/*
 * .mark_unstable callback of the TSC clocksources. On the first call it
 * sets tsc_unstable, clears the sched_clock stable state (native
 * sched_clock only), disables sched_clock irqtime and logs the event.
 * Subsequent calls are no-ops. @cs is unused.
 */
static void tsc_cs_mark_unstable(struct clocksource *cs)
{
        if (!tsc_unstable) {
                tsc_unstable = 1;
                if (using_native_sched_clock())
                        clear_sched_clock_stable();
                disable_sched_clock_irqtime();
                pr_info("Marking TSC unstable due to clocksource watchdog\n");
        }
}

/*
 * .tick_stable callback of the TSC clocksources: forward to
 * sched_clock_tick_stable() as long as the TSC has not been marked
 * unstable and the native sched_clock is in use. @cs is unused.
 */
static void tsc_cs_tick_stable(struct clocksource *cs)
{
        if (!tsc_unstable && using_native_sched_clock())
                sched_clock_tick_stable();
}

/*
 * .enable callback of the TSC clocksources: record that the TSC vDSO clock
 * mode is in use. @cs is unused. Always returns 0.
 */
static int tsc_cs_enable(struct clocksource *cs)
{
        vclocks_set_used(VDSO_CLOCKMODE_TSC);
        return 0;
}

/*
 * Early TSC clocksource ("tsc-early"), registered from tsc_init() and
 * unregistered again once the final "tsc" clocksource takes over or the
 * TSC turns out to be unstable. Its rating (299) is one below that of
 * clocksource_tsc (300).
 *
 * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
 */
static struct clocksource clocksource_tsc_early = {
        .name                        = "tsc-early",
        .rating                        = 299,
        .uncertainty_margin        = 32 * NSEC_PER_MSEC,
        .read                        = read_tsc,
        .mask                        = CLOCKSOURCE_MASK(64),
        .flags                        = CLOCK_SOURCE_IS_CONTINUOUS |
                                  CLOCK_SOURCE_MUST_VERIFY,
        .id                        = CSID_X86_TSC_EARLY,
        .vdso_clock_mode        = VDSO_CLOCKMODE_TSC,
        .enable                        = tsc_cs_enable,
        .resume                        = tsc_resume,
        .mark_unstable                = tsc_cs_mark_unstable,
        .tick_stable                = tsc_cs_tick_stable,
        .list                        = LIST_HEAD_INIT(clocksource_tsc_early.list),
};

/*
 * Final TSC clocksource ("tsc").
 *
 * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
 * this one will immediately take over. We will only register if TSC has
 * been found good.
 *
 * As for tsc-early, .mask must stay CLOCKSOURCE_MASK(64); see the comment
 * above read_tsc().
 */
static struct clocksource clocksource_tsc = {
        .name                        = "tsc",
        .rating                        = 300,
        .read                        = read_tsc,
        .mask                        = CLOCKSOURCE_MASK(64),
        .flags                        = CLOCK_SOURCE_IS_CONTINUOUS |
                                  CLOCK_SOURCE_VALID_FOR_HRES |
                                  CLOCK_SOURCE_MUST_VERIFY |
                                  CLOCK_SOURCE_VERIFY_PERCPU,
        .id                        = CSID_X86_TSC,
        .vdso_clock_mode        = VDSO_CLOCKMODE_TSC,
        .enable                        = tsc_cs_enable,
        .resume                        = tsc_resume,
        .mark_unstable                = tsc_cs_mark_unstable,
        .tick_stable                = tsc_cs_tick_stable,
        .list                        = LIST_HEAD_INIT(clocksource_tsc.list),
};

/*
 * Mark the TSC as unstable.
 *
 * @reason: human readable cause, printed in the log message.
 *
 * Only the first call has any effect: it sets tsc_unstable, clears the
 * sched_clock stable state (native sched_clock only), disables sched_clock
 * irqtime, logs @reason and marks both TSC clocksources unstable.
 */
void mark_tsc_unstable(char *reason)
{
        if (!tsc_unstable) {
                tsc_unstable = 1;
                if (using_native_sched_clock())
                        clear_sched_clock_stable();
                disable_sched_clock_irqtime();
                pr_info("Marking TSC unstable due to %s\n", reason);

                clocksource_mark_unstable(&clocksource_tsc_early);
                clocksource_mark_unstable(&clocksource_tsc);
        }
}

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

/*
 * Clear CLOCK_SOURCE_MUST_VERIFY on both TSC clocksources so that they are
 * no longer flagged as requiring verification.
 */
static void __init tsc_disable_clocksource_watchdog(void)
{
        clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
        clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}

bool tsc_clocksource_watchdog_disabled(void)
{
        return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
               tsc_as_watchdog && !no_tsc_watchdog;
}

/*
 * Decide whether the TSC can be treated as a reliable clocksource and
 * whether the clocksource watchdog can be switched off for it.
 *
 * Sets tsc_clocksource_reliable for Geode LX parts whose RTSC_SUSP bit is
 * set (only compiled in for the Geode/generic configs) and for CPUs
 * advertising X86_FEATURE_TSC_RELIABLE.
 */
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
        if (is_geode_lx()) {
                /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
                unsigned long res_low, res_high;

                /*
                 * NOTE(review): the return value of rdmsr_safe() is not
                 * checked before res_low is tested - confirm this is
                 * intentional.
                 */
                rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
                /* Geode_LX - the OLPC CPU has a very reliable TSC */
                if (res_low & RTSC_SUSP)
                        tsc_clocksource_reliable = 1;
        }
#endif
        if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
                tsc_clocksource_reliable = 1;

        /*
         * Disable the clocksource watchdog when the system has:
         *  - TSC running at constant frequency
         *  - TSC which does not stop in C-States
         *  - the TSC_ADJUST register which allows to detect even minimal
         *    modifications
         *  - not more than four packages
         */
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
            boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
            boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
            topology_max_packages() <= 4)
                tsc_disable_clocksource_watchdog();
}

/*
 * Make an educated guess if the TSC is trustworthy and synchronized
 * over all CPUs.
 *
 * Returns 1 when the TSC should be considered unsynchronized, 0 otherwise.
 */
int unsynchronized_tsc(void)
{
        /* No TSC at all, or already known to be bad. */
        if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
                return 1;

#ifdef CONFIG_SMP
        if (apic_is_clustered_box())
                return 1;
#endif

        /* A constant-rate or explicitly reliable TSC is trusted. */
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) || tsc_clocksource_reliable)
                return 0;

        /*
         * Intel systems are normally all synchronized.
         * Exceptions must mark TSC as unstable.
         * For other vendors assume multi socket systems are not
         * synchronized.
         */
        return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
               topology_max_packages() > 1;
}

static void tsc_refine_calibration_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
/**
 * tsc_refine_calibration_work - Further refine tsc freq calibration
 * @work: ignored.
 *
 * This function uses delayed work over a period of a
 * second to further refine the TSC freq value. Since this is
 * timer based, instead of loop based, we don't block the boot
 * process while this longer calibration is done.
 *
 * If there are any calibration anomalies (too many SMIs, etc),
 * or the refined calibration is off by more than 1% of the fast early
 * calibration, we throw out the new calibration and use the
 * early calibration.
 *
 * The function runs at least twice: the first run takes the start sample
 * and reschedules itself HZ jiffies later, the following run takes the
 * stop sample and evaluates the result. The start sample and the chosen
 * reference are kept in function-static variables between the runs.
 */
static void tsc_refine_calibration_work(struct work_struct *work)
{
        /* tsc_start == ULLONG_MAX means "no start sample taken yet". */
        static u64 tsc_start = ULLONG_MAX, ref_start;
        static int hpet;
        u64 tsc_stop, ref_stop, delta;
        unsigned long freq;
        int cpu;

        /* Don't bother refining TSC on unstable systems */
        if (tsc_unstable)
                goto unreg;

        /*
         * Since the work is started early in boot, we may be
         * delayed the first time we expire. So set the workqueue
         * again once we know timers are working.
         */
        if (tsc_start == ULLONG_MAX) {
                /*
                 * Also entered via "goto restart" below when the stop
                 * sample was disturbed: take a fresh start sample and try
                 * again in HZ jiffies.
                 */
restart:
                /*
                 * Only set hpet once, to avoid mixing hardware
                 * if the hpet becomes enabled later.
                 */
                hpet = is_hpet_enabled();
                tsc_start = tsc_read_refs(&ref_start, hpet);
                schedule_delayed_work(&tsc_irqwork, HZ);
                return;
        }

        tsc_stop = tsc_read_refs(&ref_stop, hpet);

        /* hpet or pmtimer available ? */
        if (ref_start == ref_stop)
                goto out;

        /* Check, whether the sampling was disturbed */
        if (tsc_stop == ULLONG_MAX)
                goto restart;

        /* Scale the TSC delta before handing it to the calc_*_ref() helper. */
        delta = tsc_stop - tsc_start;
        delta *= 1000000LL;
        if (hpet)
                freq = calc_hpet_ref(delta, ref_start, ref_stop);
        else
                freq = calc_pmtimer_ref(delta, ref_start, ref_stop);

        /* Will hit this only if tsc_force_recalibrate has been set */
        if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {

                /*
                 * Warn if the deviation exceeds 500 ppm
                 * (tsc_khz >> 11 is tsc_khz / 2048, i.e. roughly 488 ppm).
                 */
                if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
                        pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
                        pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
                                (unsigned long)tsc_khz / 1000,
                                (unsigned long)tsc_khz % 1000);
                }

                pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
                        hpet ? "HPET" : "PM_TIMER",
                        (unsigned long)freq / 1000,
                        (unsigned long)freq % 1000);

                /*
                 * Report only: tsc_khz is left untouched and no clocksource
                 * is (un)registered on this path.
                 */
                return;
        }

        /* Make sure we're within 1% */
        if (abs(tsc_khz - freq) > tsc_khz/100)
                goto out;

        tsc_khz = freq;
        pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
                (unsigned long)tsc_khz / 1000,
                (unsigned long)tsc_khz % 1000);

        /* Inform the TSC deadline clockevent devices about the recalibration */
        lapic_update_tsc_freq();

        /* Update the sched_clock() rate to match the clocksource one */
        for_each_possible_cpu(cpu)
                set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);

out:
        /* Register the final clocksource unless the TSC went bad meanwhile. */
        if (tsc_unstable)
                goto unreg;

        if (boot_cpu_has(X86_FEATURE_ART)) {
                have_art = true;
                clocksource_tsc.base = &art_base_clk;
        }
        clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
        /* In every non-returning case the early clocksource is retired. */
        clocksource_unregister(&clocksource_tsc_early);
}


/*
 * Initcall that replaces the "tsc-early" clocksource with the final "tsc"
 * one: immediately when the TSC frequency is known, otherwise (or when
 * tsc_force_recalibrate is set) through the delayed refinement work.
 *
 * Always returns 0.
 */
static int __init init_tsc_clocksource(void)
{
        bool known_freq;

        if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
                return 0;

        if (tsc_unstable) {
                clocksource_unregister(&clocksource_tsc_early);
                return 0;
        }

        if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
                clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

        /*
         * When TSC frequency is known (retrieved via MSR or CPUID), we skip
         * the refined calibration and directly register it as a clocksource.
         */
        known_freq = boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ);
        if (known_freq) {
                if (boot_cpu_has(X86_FEATURE_ART)) {
                        have_art = true;
                        clocksource_tsc.base = &art_base_clk;
                }
                clocksource_register_khz(&clocksource_tsc, tsc_khz);
                clocksource_unregister(&clocksource_tsc_early);
        }

        /* Unknown frequency, or a recalibration was explicitly requested. */
        if (!known_freq || tsc_force_recalibrate)
                schedule_delayed_work(&tsc_irqwork, 0);

        return 0;
}
/*
 * We use device_initcall here, to ensure we run after the hpet
 * is fully initialized, which may occur at fs_initcall time.
 */
device_initcall(init_tsc_clocksource);

/*
 * Establish cpu_khz and tsc_khz.
 *
 * @early: true when called from tsc_early_init(): use the platform
 *         calibrate_cpu() hook, and either tsc_early_khz (when non-zero)
 *         or the platform calibrate_tsc() hook. false when called from
 *         tsc_init(): calibrate cpu_khz via pit_hpet_ptimer_calibrate_cpu().
 *
 * Both values are expected to be 0 on entry (a warning is raised
 * otherwise).
 *
 * Returns true when a non-zero tsc_khz could be established, false
 * otherwise.
 */
static bool __init determine_cpu_tsc_frequencies(bool early)
{
        /* Make sure that cpu and tsc are not already calibrated */
        WARN_ON(cpu_khz || tsc_khz);

        if (early) {
                cpu_khz = x86_platform.calibrate_cpu();
                if (tsc_early_khz) {
                        tsc_khz = tsc_early_khz;
                } else {
                        tsc_khz = x86_platform.calibrate_tsc();
                        clocksource_tsc.freq_khz = tsc_khz;
                }
        } else {
                /* We should not be here with non-native cpu calibration */
                WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
                cpu_khz = pit_hpet_ptimer_calibrate_cpu();
        }

        /*
         * Trust non-zero tsc_khz as authoritative,
         * and use it to sanity check cpu_khz,
         * which will be off if system timer is off.
         */
        if (tsc_khz == 0)
                tsc_khz = cpu_khz;
        else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
                cpu_khz = tsc_khz;

        if (tsc_khz == 0)
                return false;

        pr_info("Detected %lu.%03lu MHz processor\n",
                (unsigned long)cpu_khz / KHZ,
                (unsigned long)cpu_khz % KHZ);

        if (cpu_khz != tsc_khz) {
                /* Terminate the line like every other message in this file. */
                pr_info("Detected %lu.%03lu MHz TSC\n",
                        (unsigned long)tsc_khz / KHZ,
                        (unsigned long)tsc_khz % KHZ);
        }
        return true;
}

/*
 * Derive a loops-per-jiffy value from the TSC frequency:
 * tsc_khz * KHZ / HZ, computed in 64 bit.
 */
static unsigned long __init get_loops_per_jiffy(void)
{
        u64 lpj = (u64)tsc_khz * KHZ;

        /* do_div() is used as an in-place 64-bit division of lpj by HZ. */
        do_div(lpj, HZ);
        return lpj;
}

/*
 * Switch delay loops and sched_clock over to the TSC: set loops_per_jiffy
 * from the TSC frequency, select the TSC based delay, sanitize TSC_ADJUST,
 * initialize cyc2ns for the boot CPU and finally enable the __use_tsc
 * static branch.
 */
static void __init tsc_enable_sched_clock(void)
{
        loops_per_jiffy = get_loops_per_jiffy();
        use_tsc_delay();

        /* Sanitize TSC ADJUST before cyc2ns gets initialized */
        tsc_store_and_check_tsc_adjust(true);
        cyc2ns_init_boot_cpu();
        static_branch_enable(&__use_tsc);
}

/*
 * Early TSC setup: determine the CPU/TSC frequencies with the early
 * calibration methods and, if that worked, enable the TSC based
 * sched_clock. Skipped entirely without a TSC or on early UV systems.
 */
void __init tsc_early_init(void)
{
        /* No TSC, or UV: don't change UV TSC multi-chassis synchronization */
        if (!boot_cpu_has(X86_FEATURE_TSC) || is_early_uv_system())
                return;

        snp_secure_tsc_init();

        if (determine_cpu_tsc_frequencies(true))
                tsc_enable_sched_clock();
}

/*
 * Main TSC initialization.
 *
 * Retries the frequency determination if the early attempt left tsc_khz at
 * 0, sets up cyc2ns for the secondary CPUs, decides on reliability and the
 * clocksource watchdog, and finally registers the "tsc-early" clocksource
 * and probes for ART. Without a usable TSC (feature missing or frequency
 * unknown) the TSC deadline timer capability is cleared as well.
 */
void __init tsc_init(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_TSC)) {
                setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
                return;
        }

        /*
         * native_calibrate_cpu_early can only calibrate using methods that are
         * available early in boot.
         */
        if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
                x86_platform.calibrate_cpu = native_calibrate_cpu;

        if (!tsc_khz) {
                /* We failed to determine frequencies earlier, try again */
                if (!determine_cpu_tsc_frequencies(false)) {
                        mark_tsc_unstable("could not calculate TSC khz");
                        setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
                        return;
                }
                tsc_enable_sched_clock();
        }

        cyc2ns_init_secondary_cpus();

        if (!no_sched_irq_time)
                enable_sched_clock_irqtime();

        lpj_fine = get_loops_per_jiffy();

        check_system_tsc_reliable();

        /* An unsynchronized TSC is never registered as a clocksource. */
        if (unsynchronized_tsc()) {
                mark_tsc_unstable("TSCs unsynchronized");
                return;
        }

        if (tsc_clocksource_reliable || no_tsc_watchdog)
                tsc_disable_clocksource_watchdog();

        clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
        detect_art();
}

#ifdef CONFIG_SMP
/*
 * Check whether existing calibration data can be reused.
 *
 * Returns the loops_per_jiffy value to reuse for the current CPU, or 0 when
 * nothing suitable is available.
 */
unsigned long calibrate_delay_is_known(void)
{
        int sibling, cpu = smp_processor_id();
        int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
        const struct cpumask *mask = topology_core_cpumask(cpu);

        /* Without a constant frequency TSC nothing can be reused. */
        if (!constant_tsc)
                return 0;

        /*
         * If TSC has constant frequency and TSC is synchronized across
         * sockets then reuse CPU0 calibration.
         */
        if (!tsc_unstable)
                return cpu_data(0).loops_per_jiffy;

        /*
         * If TSC has constant frequency and TSC is not synchronized across
         * sockets and this is not the first CPU in the socket, then reuse
         * the calibration value of an already online CPU on that socket.
         *
         * This assumes that CONSTANT_TSC is consistent for all CPUs in a
         * socket.
         */
        if (!mask)
                return 0;

        sibling = cpumask_any_but(mask, cpu);
        if (sibling >= nr_cpu_ids)
                return 0;

        return cpu_data(sibling).loops_per_jiffy;
}
#endif















































    7 



    7 








































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API.
 *
 * Null algorithms, aka Much Ado About Nothing.
 *
 * These are needed for IPsec, and may be useful in general for
 * testing & debugging.
 *
 * The null cipher is compliant with RFC2410.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 */

#include <crypto/null.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>

/* shash .init callback of the null digest: nothing to set up, returns 0. */
static int null_init(struct shash_desc *desc)
{
        return 0;
}

/* shash .update callback of the null digest: ignores the data, returns 0. */
static int null_update(struct shash_desc *desc, const u8 *data,
                       unsigned int len)
{
        return 0;
}

/* shash .final callback of the null digest: leaves @out untouched, returns 0. */
static int null_final(struct shash_desc *desc, u8 *out)
{
        return 0;
}

/*
 * One-shot callback of the null digest (used for both .digest and .finup):
 * ignores the data, leaves @out untouched and returns 0.
 */
static int null_digest(struct shash_desc *desc, const u8 *data,
                       unsigned int len, u8 *out)
{
        return 0;
}

/* shash .setkey callback of the null digest: accepts any key, returns 0. */
static int null_hash_setkey(struct crypto_shash *tfm, const u8 *key,
                            unsigned int keylen)
{
        return 0;
}

/* skcipher .setkey callback of the null cipher: ignores the key, returns 0. */
static int null_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
                                unsigned int keylen)
{
        return 0;
}

/* Single-block cipher .cia_setkey callback: ignores the key, returns 0. */
static int null_setkey(struct crypto_tfm *tfm, const u8 *key,
                       unsigned int keylen)
{
        return 0;
}

/*
 * Single-block "encrypt"/"decrypt" of the null cipher: copies
 * NULL_BLOCK_SIZE bytes from @src to @dst unchanged. @tfm is unused.
 */
static void null_crypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
        memcpy(dst, src, NULL_BLOCK_SIZE);
}

/*
 * skcipher encrypt/decrypt of the null cipher: copy req->cryptlen bytes
 * from the source scatterlist to the destination one, unless both are the
 * same list, in which case there is nothing to do. Always returns 0.
 */
static int null_skcipher_crypt(struct skcipher_request *req)
{
        if (req->src == req->dst)
                return 0;

        memcpy_sglist(req->dst, req->src, req->cryptlen);
        return 0;
}

/*
 * "digest_null" shash algorithm: every callback is one of the no-op
 * null_* helpers above; .finup and .digest share null_digest().
 */
static struct shash_alg digest_null = {
        .digestsize                =        NULL_DIGEST_SIZE,
        .setkey                   =        null_hash_setkey,
        .init                   =        null_init,
        .update                 =        null_update,
        .finup                         =        null_digest,
        .digest                 =        null_digest,
        .final                  =        null_final,
        .base                        =        {
                .cra_name                =        "digest_null",
                .cra_driver_name        =        "digest_null-generic",
                .cra_blocksize                =        NULL_BLOCK_SIZE,
                .cra_module                =        THIS_MODULE,
        }
};

/*
 * "ecb(cipher_null)" skcipher algorithm: encrypt and decrypt are the same
 * pass-through copy (null_skcipher_crypt()), the key is ignored.
 */
static struct skcipher_alg skcipher_null = {
        .base.cra_name                =        "ecb(cipher_null)",
        .base.cra_driver_name        =        "ecb-cipher_null",
        .base.cra_priority        =        100,
        .base.cra_blocksize        =        NULL_BLOCK_SIZE,
        .base.cra_ctxsize        =        0,
        .base.cra_module        =        THIS_MODULE,
        .min_keysize                =        NULL_KEY_SIZE,
        .max_keysize                =        NULL_KEY_SIZE,
        .ivsize                        =        NULL_IV_SIZE,
        .setkey                        =        null_skcipher_setkey,
        .encrypt                =        null_skcipher_crypt,
        .decrypt                =        null_skcipher_crypt,
};

/*
 * "cipher_null" single-block cipher: encrypt and decrypt are both
 * null_crypt(), i.e. a plain block copy; the key is ignored.
 */
static struct crypto_alg cipher_null = {
        .cra_name                =        "cipher_null",
        .cra_driver_name        =        "cipher_null-generic",
        .cra_flags                =        CRYPTO_ALG_TYPE_CIPHER,
        .cra_blocksize                =        NULL_BLOCK_SIZE,
        .cra_ctxsize                =        0,
        .cra_module                =        THIS_MODULE,
        .cra_u                        =        { .cipher = {
        .cia_min_keysize        =        NULL_KEY_SIZE,
        .cia_max_keysize        =        NULL_KEY_SIZE,
        .cia_setkey                =         null_setkey,
        .cia_encrypt                =        null_crypt,
        .cia_decrypt                =        null_crypt } }
};

MODULE_ALIAS_CRYPTO("digest_null");
MODULE_ALIAS_CRYPTO("cipher_null");

/*
 * Module init: register the null cipher, the null digest and the null
 * skcipher, in that order. If a registration fails, the ones that already
 * succeeded are unregistered again and the error is returned.
 *
 * Returns 0 on success or the negative error of the failing registration.
 */
static int __init crypto_null_mod_init(void)
{
        int err;

        err = crypto_register_alg(&cipher_null);
        if (err < 0)
                return err;

        err = crypto_register_shash(&digest_null);
        if (err < 0)
                goto err_unregister_cipher;

        err = crypto_register_skcipher(&skcipher_null);
        if (err < 0)
                goto err_unregister_shash;

        return 0;

err_unregister_shash:
        crypto_unregister_shash(&digest_null);
err_unregister_cipher:
        crypto_unregister_alg(&cipher_null);
        return err;
}

/* Module exit: unregister all three null algorithms. */
static void __exit crypto_null_mod_fini(void)
{
        crypto_unregister_alg(&cipher_null);
        crypto_unregister_shash(&digest_null);
        crypto_unregister_skcipher(&skcipher_null);
}

module_init(crypto_null_mod_init);
module_exit(crypto_null_mod_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Null Cryptographic Algorithms");













































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2020 Christoph Hellwig.
 *
 * Support for "universal" pointers that can point to either kernel or userspace
 * memory.
 */
#ifndef _LINUX_SOCKPTR_H
#define _LINUX_SOCKPTR_H

#include <linux/slab.h>
#include <linux/uaccess.h>

/*
 * A pointer that refers either to kernel memory (@kernel) or to userspace
 * memory (@user); @is_kernel tells which member of the union is valid.
 */
typedef struct {
        union {
                void                *kernel;
                void __user        *user;
        };
        bool                is_kernel : 1;
} sockptr_t;

/* Returns true when @sockptr holds a kernel pointer. */
static inline bool sockptr_is_kernel(sockptr_t sockptr)
{
        return sockptr.is_kernel;
}

/* Wrap the kernel pointer @p in a sockptr_t (is_kernel set). */
static inline sockptr_t KERNEL_SOCKPTR(void *p)
{
        return (sockptr_t) { .kernel = p, .is_kernel = true };
}

/*
 * Wrap the userspace pointer @p in a sockptr_t. is_kernel is not named in
 * the initializer and is therefore zero, i.e. false.
 */
static inline sockptr_t USER_SOCKPTR(void __user *p)
{
        return (sockptr_t) { .user = p };
}

/* Returns true when the pointer held by @sockptr (of either kind) is NULL. */
static inline bool sockptr_is_null(sockptr_t sockptr)
{
        return sockptr_is_kernel(sockptr) ? !sockptr.kernel : !sockptr.user;
}

/*
 * Copy @size bytes, starting @offset bytes into @src, to the kernel buffer
 * @dst.
 *
 * Returns 0 for a kernel source; for a user source the result of
 * copy_from_user() is passed through.
 */
static inline int copy_from_sockptr_offset(void *dst, sockptr_t src,
                size_t offset, size_t size)
{
        if (sockptr_is_kernel(src)) {
                memcpy(dst, src.kernel + offset, size);
                return 0;
        }

        return copy_from_user(dst, src.user + offset, size);
}

/*
 * Deprecated.
 * This is unsafe, unless the caller checked the user provided optlen.
 * Prefer copy_safe_from_sockptr() instead.
 *
 * Copies @size bytes from the start of @src to @dst by calling
 * copy_from_sockptr_offset() with an offset of 0.
 *
 * Returns 0 for success, or number of bytes not copied on error.
 */
static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
{
        return copy_from_sockptr_offset(dst, src, 0, size);
}

/**
 * copy_safe_from_sockptr - copy a struct from a sockptr
 * @dst:    Destination address, in kernel space. This buffer must be @ksize
 *          bytes long.
 * @ksize:  Size of the @dst struct.
 * @optval: Source address (in user or kernel space).
 * @optlen: Size of the @optval data.
 *
 * Exactly @ksize bytes are copied; @optlen is only used to verify that the
 * source is large enough.
 *
 * Returns:
 *  * -EINVAL: @optlen < @ksize
 *  * -EFAULT: access to userspace failed.
 *  * 0 : @ksize bytes were copied
 */
static inline int copy_safe_from_sockptr(void *dst, size_t ksize,
                                         sockptr_t optval, unsigned int optlen)
{
        if (optlen < ksize)
                return -EINVAL;

        return copy_from_sockptr(dst, optval, ksize) ? -EFAULT : 0;
}

/**
 * copy_struct_from_sockptr - copy an extensible struct from a sockptr
 * @dst:   Destination address, in kernel space. This buffer must be @ksize
 *         bytes long.
 * @ksize: Size of the @dst struct.
 * @src:   Source address (in user or kernel space).
 * @usize: Size of the struct provided by the caller of the interface.
 *
 * Mirrors copy_struct_from_user() for both kinds of source pointer:
 *  * @usize == @ksize: plain copy.
 *  * @usize <  @ksize: the provided bytes are copied and the remainder of
 *    @dst is zero-filled.
 *  * @usize >  @ksize: the first @ksize bytes are copied, provided that all
 *    trailing bytes of the source are zero.
 *
 * Returns:
 *  * 0 on success.
 *  * -E2BIG: @usize > @ksize and the trailing bytes are not all zero.
 *  * For a user source, any other error of copy_struct_from_user().
 */
static inline int copy_struct_from_sockptr(void *dst, size_t ksize,
                sockptr_t src, size_t usize)
{
        size_t size = min(ksize, usize);
        size_t rest = max(ksize, usize) - size;

        /*
         * Hand the real @usize to copy_struct_from_user(); passing the
         * clamped size would hide an oversized struct from it and skip the
         * trailing-bytes check.
         */
        if (!sockptr_is_kernel(src))
                return copy_struct_from_user(dst, ksize, src.user, usize);

        if (usize < ksize) {
                memset(dst + size, 0, rest);
        } else if (usize > ksize) {
                /* Only the bytes beyond the part we copy have to be zero. */
                const char *p = (const char *)src.kernel + size;

                while (rest--) {
                        if (*p++)
                                return -E2BIG;
                }
        }
        memcpy(dst, src.kernel, size);
        return 0;
}

/*
 * Copy @size bytes from @src to @dst, starting @offset bytes into the
 * destination. A kernel sockptr is written with memcpy(); anything else
 * goes through copy_to_user().
 *
 * Returns 0 for a kernel destination, otherwise the copy_to_user() result.
 */
static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset,
                const void *src, size_t size)
{
        if (sockptr_is_kernel(dst)) {
                memcpy(dst.kernel + offset, src, size);
                return 0;
        }

        return copy_to_user(dst.user + offset, src, size);
}

/* Copy @size bytes from @src to the very start of @dst (offset 0). */
static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size)
{
        const size_t start = 0;

        return copy_to_sockptr_offset(dst, start, src, size);
}

/*
 * Allocate @len bytes and fill them from @src.
 *
 * Returns the new buffer, ERR_PTR(-ENOMEM) when the allocation fails, or
 * ERR_PTR(-EFAULT) when the copy fails (the buffer is freed in that case).
 */
static inline void *memdup_sockptr_noprof(sockptr_t src, size_t len)
{
        void *buf;

        buf = kmalloc_track_caller_noprof(len, GFP_USER | __GFP_NOWARN);
        if (buf == NULL)
                return ERR_PTR(-ENOMEM);

        if (copy_from_sockptr(buf, src, len) != 0) {
                kfree(buf);
                return ERR_PTR(-EFAULT);
        }

        return buf;
}
#define memdup_sockptr(...)        alloc_hooks(memdup_sockptr_noprof(__VA_ARGS__))

/*
 * Allocate @len + 1 bytes, fill the first @len from @src and terminate the
 * result with a NUL byte.
 *
 * Returns the new buffer, ERR_PTR(-ENOMEM) when the allocation fails, or
 * ERR_PTR(-EFAULT) when the copy fails (the buffer is freed in that case).
 */
static inline void *memdup_sockptr_nul_noprof(sockptr_t src, size_t len)
{
        char *str;

        str = kmalloc_track_caller_noprof(len + 1, GFP_KERNEL);
        if (str == NULL)
                return ERR_PTR(-ENOMEM);

        if (copy_from_sockptr(str, src, len) != 0) {
                kfree(str);
                return ERR_PTR(-EFAULT);
        }

        str[len] = '\0';
        return str;
}
#define memdup_sockptr_nul(...)        alloc_hooks(memdup_sockptr_nul_noprof(__VA_ARGS__))

/*
 * Copy a string of at most @count bytes from @src into @dst.
 *
 * User pointers are delegated to strncpy_from_user(). For a kernel pointer
 * the number of bytes copied is the string length (bounded by @count - 1)
 * plus one, capped at @count, and that byte count is the return value.
 *
 * NOTE(review): @count - 1 wraps for @count == 0 since size_t is unsigned,
 * and the kernel-path return value counts the extra byte; confirm callers
 * expect both.
 */
static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count)
{
        size_t len;

        if (!sockptr_is_kernel(src))
                return strncpy_from_user(dst, src.user, count);

        len = min(strnlen(src.kernel, count - 1) + 1, count);
        memcpy(dst, src.kernel, len);

        return len;
}

/*
 * Check whether the @size bytes located @offset bytes into @src are all
 * zero.
 *
 * For a kernel pointer returns 1 when every byte is zero and 0 otherwise;
 * for a user pointer returns the check_zeroed_user() result.
 */
static inline int check_zeroed_sockptr(sockptr_t src, size_t offset,
                                       size_t size)
{
        if (sockptr_is_kernel(src))
                return !memchr_inv(src.kernel + offset, 0, size);

        return check_zeroed_user(src.user + offset, size);
}

#endif /* _LINUX_SOCKPTR_H */






































































































  166 























  319 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds
 */

#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H

#include <linux/types.h>
#include <linux/limits.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
/* The current task pointer is reinterpreted directly as its thread_info. */
#define current_thread_info() ((struct thread_info *)current)
#endif

#include <linux/bitops.h>

/*
 * For per-arch arch_within_stack_frames() implementations, defined in
 * asm/thread_info.h.
 *
 * Result codes: BAD_STACK is -1, NOT_STACK is 0, and GOOD_FRAME and
 * GOOD_STACK take the next values (1 and 2).
 */
enum {
        BAD_STACK = -1,
        NOT_STACK = 0,
        GOOD_FRAME,
        GOOD_STACK,
};

#ifdef CONFIG_GENERIC_ENTRY
/*
 * Bit numbers used with the syscall_work word of struct thread_info (see
 * the set/test/clear_syscall_work() helpers later in this header).
 */
enum syscall_work_bit {
        SYSCALL_WORK_BIT_SECCOMP,
        SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
        SYSCALL_WORK_BIT_SYSCALL_TRACE,
        SYSCALL_WORK_BIT_SYSCALL_EMU,
        SYSCALL_WORK_BIT_SYSCALL_AUDIT,
        SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
        SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
};

/* Mask form of each bit number above, built with BIT(). */
#define SYSCALL_WORK_SECCOMP                BIT(SYSCALL_WORK_BIT_SECCOMP)
#define SYSCALL_WORK_SYSCALL_TRACEPOINT        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
#define SYSCALL_WORK_SYSCALL_TRACE        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
#define SYSCALL_WORK_SYSCALL_EMU        BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
#define SYSCALL_WORK_SYSCALL_AUDIT        BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
#define SYSCALL_WORK_SYSCALL_EXIT_TRAP        BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
#endif

#include <asm/thread_info.h>

/*
 * When <asm/thread_info.h> provides no TIF_NEED_RESCHED_LAZY, alias it (and
 * its mask) to the regular TIF_NEED_RESCHED. Having
 * CONFIG_ARCH_HAS_PREEMPT_LAZY set without that definition is a
 * configuration error and fails the build.
 */
#ifndef TIF_NEED_RESCHED_LAZY
#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
#error Inconsistent PREEMPT_LAZY
#endif
#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
#endif

#ifdef __KERNEL__

/*
 * Hook invoked by set_restart_fn(); expands to nothing unless it was
 * already defined before this point.
 */
#ifndef arch_set_restart_data
#define arch_set_restart_data(restart) do { } while (0)
#endif

/*
 * Install @fn as the callback of @restart, run the
 * arch_set_restart_data() hook on the block, and hand back
 * -ERESTART_RESTARTBLOCK.
 */
static inline long set_restart_fn(struct restart_block *restart,
                                        long (*fn)(struct restart_block *))
{
        const long rc = -ERESTART_RESTARTBLOCK;

        restart->fn = fn;
        arch_set_restart_data(restart);

        return rc;
}

/* THREAD_ALIGN defaults to THREAD_SIZE unless it was defined earlier. */
#ifndef THREAD_ALIGN
#define THREAD_ALIGN        THREAD_SIZE
#endif

/* GFP flag combination: accounted kernel allocation, zero-filled. */
#define THREADINFO_GFP                (GFP_KERNEL_ACCOUNT | __GFP_ZERO)

/*
 * flag set/clear/test wrappers
 * - pass TIF_xxxx constants to these functions
 */

/* Set bit @flag in the flags word of @ti using set_bit(). */
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        set_bit(flag, word);
}

/* Clear bit @flag in the flags word of @ti using clear_bit(). */
static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        clear_bit(flag, word);
}

/* Make bit @flag of @ti match @value: set it when true, clear it when false. */
static inline void update_ti_thread_flag(struct thread_info *ti, int flag,
                                         bool value)
{
        if (!value) {
                clear_ti_thread_flag(ti, flag);
                return;
        }

        set_ti_thread_flag(ti, flag);
}

/* Set bit @flag of @ti and return the test_and_set_bit() result. */
static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_and_set_bit(flag, word);
}

/* Clear bit @flag of @ti and return the test_and_clear_bit() result. */
static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_and_clear_bit(flag, word);
}

/* Report, via test_bit(), whether bit @flag is set in the flags of @ti. */
static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_bit(flag, word);
}

/*
 * Read the whole flags word of @ti with a single READ_ONCE().
 *
 * This may be used in noinstr code, and needs to be __always_inline to prevent
 * inadvertent instrumentation.
 */
static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti)
{
        unsigned long snapshot = READ_ONCE(ti->flags);

        return snapshot;
}

/*
 * Shorthands that apply the *_ti_thread_flag() helpers to the thread_info
 * of the current task (current_thread_info()).
 */
#define set_thread_flag(flag) \
        set_ti_thread_flag(current_thread_info(), flag)
#define clear_thread_flag(flag) \
        clear_ti_thread_flag(current_thread_info(), flag)
#define update_thread_flag(flag, value) \
        update_ti_thread_flag(current_thread_info(), flag, value)
#define test_and_set_thread_flag(flag) \
        test_and_set_ti_thread_flag(current_thread_info(), flag)
#define test_and_clear_thread_flag(flag) \
        test_and_clear_ti_thread_flag(current_thread_info(), flag)
#define test_thread_flag(flag) \
        test_ti_thread_flag(current_thread_info(), flag)
#define read_thread_flags() \
        read_ti_thread_flags(current_thread_info())

/* Same as read_thread_flags(), but for the thread_info of task @t. */
#define read_task_thread_flags(t) \
        read_ti_thread_flags(task_thread_info(t))

/*
 * Syscall work accessors. @fl is the bare flag name: with
 * CONFIG_GENERIC_ENTRY it is pasted onto SYSCALL_WORK_BIT_ and the bit is
 * kept in the syscall_work word of thread_info; otherwise it is pasted onto
 * TIF_ and the bit is an ordinary thread flag.
 *
 * The plain variants act on the current task, the *_task_* variants on
 * task @t.
 */
#ifdef CONFIG_GENERIC_ENTRY
#define set_syscall_work(fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define test_syscall_work(fl) \
        test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define clear_syscall_work(fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)

#define set_task_syscall_work(t, fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define test_task_syscall_work(t, fl) \
        test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define clear_task_syscall_work(t, fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)

#else /* CONFIG_GENERIC_ENTRY */

#define set_syscall_work(fl)                                                \
        set_ti_thread_flag(current_thread_info(), TIF_##fl)
#define test_syscall_work(fl) \
        test_ti_thread_flag(current_thread_info(), TIF_##fl)
#define clear_syscall_work(fl) \
        clear_ti_thread_flag(current_thread_info(), TIF_##fl)

#define set_task_syscall_work(t, fl) \
        set_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define test_task_syscall_work(t, fl) \
        test_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define clear_task_syscall_work(t, fl) \
        clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */

/*
 * Test bit @bit in the flags word of the current task's thread_info.
 *
 * When the instrumented non-atomic bitops header has been included the
 * arch_test_bit() form is used, otherwise plain test_bit().
 */
static __always_inline bool tif_test_bit(int bit)
{
        unsigned long *word = (unsigned long *)(&current_thread_info()->flags);

#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
        return arch_test_bit(bit, word);
#else
        return test_bit(bit, word);
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
}

/* True when TIF_NEED_RESCHED is set for the current task. */
static __always_inline bool tif_need_resched(void)
{
        const bool pending = tif_test_bit(TIF_NEED_RESCHED);

        return pending;
}

#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
/*
 * Fallback used when the architecture provides no implementation: every
 * argument is ignored and 0 is returned unconditionally.
 */
static inline int
arch_within_stack_frames(const void * const stack, const void * const stackend,
                         const void *obj, unsigned long len)
{
        return 0;
}
#endif

#ifndef arch_setup_new_exec
/* Default no-op, used when nothing defined arch_setup_new_exec earlier. */
static inline void arch_setup_new_exec(void)
{
}
#endif

/*
 * Declarations only; the definitions are not part of this header.
 */
void arch_task_cache_init(void); /* for CONFIG_SH */
void arch_release_task_struct(struct task_struct *tsk);
int arch_dup_task_struct(struct task_struct *dst,
                                struct task_struct *src);

#endif        /* __KERNEL__ */

#endif /* _LINUX_THREAD_INFO_H */
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Global definitions for the Ethernet IEEE 802.3 interface.
 *
 * Version:        @(#)if_ether.h        1.0.1a        02/08/94
 *
 * Author:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Donald Becker, <becker@super.org>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk>
 */
#ifndef _LINUX_IF_ETHER_H
#define _LINUX_IF_ETHER_H

#include <linux/skbuff.h>
#include <uapi/linux/if_ether.h>

/*
 * Character count of a MAC address printed as XX:XX:XX:XX:XX:XX: two hex
 * digits per address byte plus one separator between consecutive bytes,
 * i.e. 3 * ETH_ALEN - 1.
 */
#define MAC_ADDR_STR_LEN (3 * ETH_ALEN - 1)

/* View the location returned by skb_mac_header() as an Ethernet header. */
static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
{
        void *mac = skb_mac_header(skb);

        return (struct ethhdr *)mac;
}

/* View skb->data as an Ethernet header.
 * Prefer this version in TX path, instead of
 * skb_reset_mac_header() + eth_hdr()
 */
static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb)
{
        void *start = skb->data;

        return (struct ethhdr *)start;
}

/* View the location returned by skb_inner_mac_header() as an Ethernet header. */
static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
{
        void *inner_mac = skb_inner_mac_header(skb);

        return (struct ethhdr *)inner_mac;
}

/* Declarations only; the definitions are not part of this header. */
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);

extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);

#endif        /* _LINUX_IF_ETHER_H */











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/* SPDX-License-Identifier: GPL-2.0-or-later */

#ifndef _NET_NETDEV_LOCK_H
#define _NET_NETDEV_LOCK_H

#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Try to take @dev's instance lock; the result is that of mutex_trylock(). */
static inline bool netdev_trylock(struct net_device *dev)
{
        bool acquired = mutex_trylock(&dev->lock);

        return acquired;
}

/* Lockdep assertion that @dev's instance lock (dev->lock) is held. */
static inline void netdev_assert_locked(const struct net_device *dev)
{
        lockdep_assert_held(&dev->lock);
}

/*
 * Assert that the instance lock is held, but only while @dev is in the
 * NETREG_REGISTERED or NETREG_UNREGISTERING state; in any other state no
 * check is made.
 */
static inline void
netdev_assert_locked_or_invisible(const struct net_device *dev)
{
        if (dev->reg_state != NETREG_REGISTERED &&
            dev->reg_state != NETREG_UNREGISTERING)
                return;

        netdev_assert_locked(dev);
}

static inline bool netdev_need_ops_lock(const struct net_device *dev)
{
        bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops;

#if IS_ENABLED(CONFIG_NET_SHAPER)
        ret |= !!dev->netdev_ops->net_shaper_ops;
#endif

        return ret;
}

/* Take the instance lock only for devices that need the ops lock. */
static inline void netdev_lock_ops(struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev))
                return;

        netdev_lock(dev);
}

/* Release the instance lock only for devices that need the ops lock. */
static inline void netdev_unlock_ops(struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev))
                return;

        netdev_unlock(dev);
}

/*
 * Ensure the instance lock is held: devices that need the ops lock are
 * expected to hold it already (asserted), all others take it here.
 */
static inline void netdev_lock_ops_to_full(struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev)) {
                netdev_lock(dev);
                return;
        }

        netdev_assert_locked(dev);
}

/*
 * Counterpart of netdev_lock_ops_to_full(): devices that need the ops lock
 * keep holding the instance lock (asserted), all others release it here.
 */
static inline void netdev_unlock_full_to_ops(struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev)) {
                netdev_unlock(dev);
                return;
        }

        netdev_assert_locked(dev);
}

/*
 * Assert the lock that protects @dev's ops: the instance lock for devices
 * that need the ops lock, rtnl (ASSERT_RTNL()) for everything else.
 */
static inline void netdev_ops_assert_locked(const struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev)) {
                ASSERT_RTNL();
                return;
        }

        lockdep_assert_held(&dev->lock);
}

/*
 * Run netdev_ops_assert_locked() only while @dev is in the
 * NETREG_REGISTERED or NETREG_UNREGISTERING state; in any other state no
 * check is made.
 */
static inline void
netdev_ops_assert_locked_or_invisible(const struct net_device *dev)
{
        switch (dev->reg_state) {
        case NETREG_REGISTERED:
        case NETREG_UNREGISTERING:
                netdev_ops_assert_locked(dev);
                break;
        default:
                break;
        }
}

/*
 * Take whichever lock protects @dev's ops: the instance lock for devices
 * that need the ops lock, rtnl_lock() for everything else.
 */
static inline void netdev_lock_ops_compat(struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev)) {
                rtnl_lock();
                return;
        }

        netdev_lock(dev);
}

/*
 * Release whichever lock netdev_lock_ops_compat() took: the instance lock
 * for devices that need the ops lock, rtnl_unlock() for everything else.
 */
static inline void netdev_unlock_ops_compat(struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev)) {
                rtnl_unlock();
                return;
        }

        netdev_unlock(dev);
}

/*
 * Lockdep comparison callback for the netdev instance lock class.
 *
 * Returns 0 when @a and @b are the same map, -1 when rtnl is held and 1
 * when it is not.
 */
static inline int netdev_lock_cmp_fn(const struct lockdep_map *a,
                                     const struct lockdep_map *b)
{
        int order;

        if (a == b)
                return 0;

        /* Allow locking multiple devices only under rtnl_lock,
         * the exact order doesn't matter.
         * Note that upper devices don't lock their ops, so nesting
         * mostly happens in batched device removal for now.
         */
        if (lockdep_rtnl_is_held())
                order = -1;
        else
                order = 1;

        return order;
}

/*
 * Assign lockdep classes to the locks of @dev. Each expansion site gets its
 * own set of static lock_class_key objects, which are applied to:
 *  - dev->qdisc_tx_busylock (pointer to the key),
 *  - dev->addr_list_lock,
 *  - dev->lock (the instance lock, which also gets netdev_lock_cmp_fn as
 *    its comparison function),
 *  - the _xmit_lock of each of the dev->num_tx_queues TX queues.
 *
 * @dev is evaluated several times, so it must be free of side effects; every
 * use is parenthesized so that any pointer expression is accepted.
 */
#define netdev_lockdep_set_classes(dev)                                \
{                                                                \
        static struct lock_class_key qdisc_tx_busylock_key;        \
        static struct lock_class_key qdisc_xmit_lock_key;        \
        static struct lock_class_key dev_addr_list_lock_key;        \
        static struct lock_class_key dev_instance_lock_key;        \
        unsigned int i;                                                \
                                                                \
        (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key;        \
        lockdep_set_class(&(dev)->addr_list_lock,                \
                          &dev_addr_list_lock_key);                \
        lockdep_set_class(&(dev)->lock,                                \
                          &dev_instance_lock_key);                \
        lock_set_cmp_fn(&(dev)->lock, netdev_lock_cmp_fn, NULL);        \
        for (i = 0; i < (dev)->num_tx_queues; i++)                \
                lockdep_set_class(&(dev)->_tx[i]._xmit_lock,        \
                                  &qdisc_xmit_lock_key);        \
}

/*
 * rcu_dereference_protected() on @p, with "the instance lock of @dev is
 * held" as the lockdep condition.
 */
#define netdev_lock_dereference(p, dev)                                \
        rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock))

/*
 * Declaration only; the definition is not part of this header.
 * NOTE(review): the signature looks like a notifier callback - confirm.
 */
int netdev_debug_event(struct notifier_block *nb, unsigned long event,
                       void *ptr);

#endif


























































































































   25 


   25 



   25 




















   25 













   25 



   25 









































   24 















   25 
    1 

    1 



    1 










































































   25 































   25 





   25 




















































  246 


  236 


  243 







   73 


    4 






  243 
   41 





   37 







  245 





























   74 
















   25 








   37 





    1 

















































  230 









   73 




   73 




  244 








   25 
   25 










  238 







    2 













  312 


  248 


  309 
  310 

  313 
  247 

  243 
    5 







  246 
  247 


    4 
    3 


  245 



  310 


   47 
   46 



  312 
  246 











































































  312 














































































































































































   69 






























































   22 


   22 
    1 































































   69 




















    2 





































   70 



   69 






















    3 
    3 



    3 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
// SPDX-License-Identifier: GPL-2.0
/*
 * NETLINK      Netlink attributes
 *
 *                 Authors:        Thomas Graf <tgraf@suug.ch>
 *                                 Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/nospec.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/netlink.h>

/* For these data types, attribute length should be exactly the given
 * size. However, to maintain compatibility with broken commands, if the
 * attribute length does not match the expected size a warning is emitted
 * to the user that the command is sending invalid data and needs to be fixed.
 *
 * Indexed by NLA_* type; types without an entry are implicitly 0.
 */
static const u8 nla_attr_len[NLA_TYPE_MAX+1] = {
        [NLA_U8]        = sizeof(u8),
        [NLA_U16]        = sizeof(u16),
        [NLA_U32]        = sizeof(u32),
        [NLA_U64]        = sizeof(u64),
        [NLA_S8]        = sizeof(s8),
        [NLA_S16]        = sizeof(s16),
        [NLA_S32]        = sizeof(s32),
        [NLA_S64]        = sizeof(s64),
        [NLA_BE16]        = sizeof(__be16),
        [NLA_BE32]        = sizeof(__be32),
};

/*
 * Per-type minimum length table, indexed by NLA_* type; types without an
 * entry are implicitly 0. In addition to the fixed-size integers it covers
 * NLA_MSECS (u64) and NLA_NESTED (one attribute header).
 * NOTE(review): presumably the minimum payload length accepted during
 * validation - the consumer is outside this chunk, confirm there.
 */
static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
        [NLA_U8]        = sizeof(u8),
        [NLA_U16]        = sizeof(u16),
        [NLA_U32]        = sizeof(u32),
        [NLA_U64]        = sizeof(u64),
        [NLA_MSECS]        = sizeof(u64),
        [NLA_NESTED]        = NLA_HDRLEN,
        [NLA_S8]        = sizeof(s8),
        [NLA_S16]        = sizeof(s16),
        [NLA_S32]        = sizeof(s32),
        [NLA_S64]        = sizeof(s64),
        [NLA_BE16]        = sizeof(__be16),
        [NLA_BE32]        = sizeof(__be32),
};

/*
 * Nested policies might refer back to the original
 * policy in some cases, and userspace could try to
 * abuse that and recurse by nesting in the right
 * ways. Limit recursion to avoid this problem.
 *
 * The limit is expressed as a nesting depth (see the depth argument of
 * __nla_validate_parse()).
 */
#define MAX_POLICY_RECURSION_DEPTH        10

/*
 * Forward declaration: nla_validate_array() below calls this function on
 * each array element with depth + 1.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack,
                                struct nlattr **tb, unsigned int depth);

/*
 * Validate the nla_bitfield32 payload of @nla against @valid_flags_mask.
 *
 * Returns -EINVAL when the mask is empty, when the selector or the value
 * has a bit outside the mask, or when the value has a bit that is not
 * selected; returns 0 otherwise.
 */
static int validate_nla_bitfield32(const struct nlattr *nla,
                                   const u32 valid_flags_mask)
{
        const struct nla_bitfield32 *bits = nla_data(nla);
        u32 stray;

        if (valid_flags_mask == 0)
                return -EINVAL;

        /* selector or value bits that fall outside the permitted mask */
        stray = (bits->selector | bits->value) & ~valid_flags_mask;

        /* value bits that were not selected */
        stray |= bits->value & ~bits->selector;

        return stray ? -EINVAL : 0;
}

/*
 * Validate every element of an attribute array.
 *
 * Walks the attributes in [@head, @head + @len). Elements with an empty
 * payload are skipped; an element whose payload is shorter than one
 * attribute header is rejected with -ERANGE; the payload of every other
 * element is validated against @policy by __nla_validate_parse() one
 * nesting level deeper, and the first negative result is returned.
 *
 * Returns 0 when all elements pass.
 */
static int nla_validate_array(const struct nlattr *head, int len, int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack,
                              unsigned int validate, unsigned int depth)
{
        const struct nlattr *elem;
        int remaining;

        nla_for_each_attr(elem, head, len, remaining) {
                int payload = nla_len(elem);
                int err;

                if (payload == 0)
                        continue;

                if (payload < NLA_HDRLEN) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, elem, policy,
                                                "Array element too short");
                        return -ERANGE;
                }

                err = __nla_validate_parse(nla_data(elem), payload, maxtype,
                                           policy, validate, extack, NULL,
                                           depth + 1);
                if (err < 0)
                        return err;
        }

        return 0;
}

void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range)
{
        WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR &&
                     (pt->min < 0 || pt->max < 0));

        range->min = 0;

        switch (pt->type) {
        case NLA_U8:
                range->max = U8_MAX;
                break;
        case NLA_U16:
        case NLA_BE16:
        case NLA_BINARY:
                range->max = U16_MAX;
                break;
        case NLA_U32:
        case NLA_BE32:
                range->max = U32_MAX;
                break;
        case NLA_U64:
        case NLA_UINT:
        case NLA_MSECS:
                range->max = U64_MAX;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        switch (pt->validation_type) {
        case NLA_VALIDATE_RANGE:
        case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
                range->min = pt->min;
                range->max = pt->max;
                break;
        case NLA_VALIDATE_RANGE_PTR:
                *range = *pt->range;
                break;
        case NLA_VALIDATE_MIN:
                range->min = pt->min;
                break;
        case NLA_VALIDATE_MAX:
                range->max = pt->max;
                break;
        default:
                break;
        }
}

/*
 * Check the unsigned value carried by @nla against the range of @pt.
 *
 * The value is the attribute's integer payload (converted to host order
 * for the big-endian types) or, for NLA_BINARY, the payload length.
 *
 * Returns:
 *  * 0 when the value is within range, or when an over-long NLA_BINARY
 *    attribute is tolerated (RANGE_WARN_TOO_LONG without strict attrs)
 *  * -EINVAL for an unsupported attribute type, or for an over-long
 *    NLA_BINARY attribute under NL_VALIDATE_STRICT_ATTRS
 *  * -ERANGE when the value is outside the range
 */
static int nla_validate_range_unsigned(const struct nla_policy *pt,
                                       const struct nlattr *nla,
                                       struct netlink_ext_ack *extack,
                                       unsigned int validate)
{
        struct netlink_range_validation range;
        u64 value;

        switch (pt->type) {
        case NLA_U8:
                value = nla_get_u8(nla);
                break;
        case NLA_U16:
                value = nla_get_u16(nla);
                break;
        case NLA_U32:
                value = nla_get_u32(nla);
                break;
        case NLA_U64:
        case NLA_MSECS:
                value = nla_get_u64(nla);
                break;
        case NLA_UINT:
                value = nla_get_uint(nla);
                break;
        case NLA_BINARY:
                value = nla_len(nla);
                break;
        case NLA_BE16:
                value = ntohs(nla_get_be16(nla));
                break;
        case NLA_BE32:
                value = ntohl(nla_get_be32(nla));
                break;
        default:
                return -EINVAL;
        }

        nla_get_range_unsigned(pt, &range);

        if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG &&
            pt->type == NLA_BINARY && value > range.max) {
                pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
                                    current->comm, pt->type);

                /* this assumes min <= max (don't validate against min) */
                if (!(validate & NL_VALIDATE_STRICT_ATTRS))
                        return 0;

                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "invalid attribute length");
                return -EINVAL;
        }

        if (value >= range.min && value <= range.max)
                return 0;

        if (pt->type == NLA_BINARY)
                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "binary attribute size out of range");
        else
                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "integer out of range");

        return -ERANGE;
}

void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range)
{
        switch (pt->type) {
        case NLA_S8:
                range->min = S8_MIN;
                range->max = S8_MAX;
                break;
        case NLA_S16:
                range->min = S16_MIN;
                range->max = S16_MAX;
                break;
        case NLA_S32:
                range->min = S32_MIN;
                range->max = S32_MAX;
                break;
        case NLA_S64:
        case NLA_SINT:
                range->min = S64_MIN;
                range->max = S64_MAX;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        switch (pt->validation_type) {
        case NLA_VALIDATE_RANGE:
                range->min = pt->min;
                range->max = pt->max;
                break;
        case NLA_VALIDATE_RANGE_PTR:
                *range = *pt->range_signed;
                break;
        case NLA_VALIDATE_MIN:
                range->min = pt->min;
                break;
        case NLA_VALIDATE_MAX:
                range->max = pt->max;
                break;
        default:
                break;
        }
}

static int nla_validate_int_range_signed(const struct nla_policy *pt,
                                         const struct nlattr *nla,
                                         struct netlink_ext_ack *extack)
{
        struct netlink_range_validation_signed range;
        s64 value;

        switch (pt->type) {
        case NLA_S8:
                value = nla_get_s8(nla);
                break;
        case NLA_S16:
                value = nla_get_s16(nla);
                break;
        case NLA_S32:
                value = nla_get_s32(nla);
                break;
        case NLA_S64:
                value = nla_get_s64(nla);
                break;
        case NLA_SINT:
                value = nla_get_sint(nla);
                break;
        default:
                return -EINVAL;
        }

        nla_get_range_signed(pt, &range);

        if (value < range.min || value > range.max) {
                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "integer out of range");
                return -ERANGE;
        }

        return 0;
}

/*
 * Dispatch range validation to the signed or unsigned helper depending on
 * the policy's attribute type. Any other type triggers a warning and is
 * rejected with -EINVAL.
 */
static int nla_validate_int_range(const struct nla_policy *pt,
                                  const struct nlattr *nla,
                                  struct netlink_ext_ack *extack,
                                  unsigned int validate)
{
        switch (pt->type) {
        /* signed integer attributes */
        case NLA_S8:
        case NLA_S16:
        case NLA_S32:
        case NLA_S64:
        case NLA_SINT:
                return nla_validate_int_range_signed(pt, nla, extack);

        /* everything range-checked as an unsigned quantity */
        case NLA_U8:
        case NLA_U16:
        case NLA_U32:
        case NLA_U64:
        case NLA_UINT:
        case NLA_MSECS:
        case NLA_BINARY:
        case NLA_BE16:
        case NLA_BE32:
                return nla_validate_range_unsigned(pt, nla, extack, validate);

        default:
                break;
        }

        WARN_ON(1);
        return -EINVAL;
}

/*
 * Verify that an unsigned integer attribute has no bits set outside the
 * policy's mask.
 *
 * Returns 0 if every set bit is covered by pt->mask, otherwise -EINVAL with
 * an extack message. Policy types not listed below are rejected with
 * -EINVAL as well (without a message).
 */
static int nla_validate_mask(const struct nla_policy *pt,
                             const struct nlattr *nla,
                             struct netlink_ext_ack *extack)
{
        u64 bits;

        switch (pt->type) {
        case NLA_U8:
                bits = nla_get_u8(nla);
                break;
        case NLA_U16:
                bits = nla_get_u16(nla);
                break;
        case NLA_U32:
                bits = nla_get_u32(nla);
                break;
        case NLA_U64:
                bits = nla_get_u64(nla);
                break;
        case NLA_UINT:
                bits = nla_get_uint(nla);
                break;
        case NLA_BE16:
                /* big-endian payloads are converted before masking */
                bits = ntohs(nla_get_be16(nla));
                break;
        case NLA_BE32:
                bits = ntohl(nla_get_be32(nla));
                break;
        default:
                return -EINVAL;
        }

        if (!(bits & ~(u64)pt->mask))
                return 0;

        NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set");
        return -EINVAL;
}

/*
 * validate_nla - check a single attribute against its policy entry
 * @nla: attribute to validate
 * @maxtype: highest attribute type covered by @policy
 * @policy: policy array, indexed by attribute type
 * @validate: validation strictness flags (NL_VALIDATE_*)
 * @extack: extended ACK used for error reporting
 * @depth: current nesting depth, passed on when validating nested attributes
 *
 * Attributes whose type is outside 1..@maxtype are accepted without any
 * check. Otherwise the attribute is first checked for length/format
 * according to its policy type and then run through the policy's
 * additional validation (range, mask or callback).
 *
 * Returns 0 if the attribute is acceptable or a negative error code.
 * Failures that jump to out_err without setting err return -ERANGE.
 */
static int validate_nla(const struct nlattr *nla, int maxtype,
                        const struct nla_policy *policy, unsigned int validate,
                        struct netlink_ext_ack *extack, unsigned int depth)
{
        u16 strict_start_type = policy[0].strict_start_type;
        const struct nla_policy *pt;
        int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
        /* default result for every "goto out_err" that does not set err */
        int err = -ERANGE;

        /* types at or above strict_start_type always get strict validation */
        if (strict_start_type && type >= strict_start_type)
                validate |= NL_VALIDATE_STRICT;

        /* no policy entry exists for this type; nothing to check here */
        if (type <= 0 || type > maxtype)
                return 0;

        /* sanitize the index with array_index_nospec() before using it */
        type = array_index_nospec(type, maxtype + 1);
        pt = &policy[type];

        BUG_ON(pt->type > NLA_TYPE_MAX);

        /*
         * Types with a non-zero entry in nla_attr_len[] are expected to have
         * exactly that length. A mismatch is always logged, but only rejected
         * when NL_VALIDATE_STRICT_ATTRS is requested.
         */
        if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
                pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
                                    current->comm, type);
                if (validate & NL_VALIDATE_STRICT_ATTRS) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }
        }

        /*
         * With NL_VALIDATE_NESTED the NLA_F_NESTED flag must be present on
         * nested/nested-array attributes and absent on all other types
         * except NLA_UNSPEC.
         */
        if (validate & NL_VALIDATE_NESTED) {
                if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) &&
                    !(nla->nla_type & NLA_F_NESTED)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "NLA_F_NESTED is missing");
                        return -EINVAL;
                }
                if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY &&
                    pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "NLA_F_NESTED not expected");
                        return -EINVAL;
                }
        }

        /* per-type length and format checks */
        switch (pt->type) {
        case NLA_REJECT:
                /*
                 * Prefer the policy-supplied message when it can be
                 * reported; otherwise use the generic one with -EINVAL.
                 */
                if (extack && pt->reject_message) {
                        NL_SET_BAD_ATTR(extack, nla);
                        extack->_msg = pt->reject_message;
                        return -EINVAL;
                }
                err = -EINVAL;
                goto out_err;

        case NLA_FLAG:
                /* a flag attribute must not carry any payload */
                if (attrlen > 0)
                        goto out_err;
                break;

        case NLA_SINT:
        case NLA_UINT:
                /* payload must be exactly 32 or 64 bits wide */
                if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }
                break;

        case NLA_BITFIELD32:
                if (attrlen != sizeof(struct nla_bitfield32))
                        goto out_err;

                err = validate_nla_bitfield32(nla, pt->bitfield32_valid);
                if (err)
                        goto out_err;
                break;

        case NLA_NUL_STRING:
                /*
                 * A NUL byte must occur within the payload; when pt->len is
                 * set, it must occur within the first pt->len + 1 bytes.
                 */
                if (pt->len)
                        minlen = min_t(int, attrlen, pt->len + 1);
                else
                        minlen = attrlen;

                if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) {
                        err = -EINVAL;
                        goto out_err;
                }
                /* the NLA_STRING length checks apply as well */
                fallthrough;

        case NLA_STRING:
                if (attrlen < 1)
                        goto out_err;

                if (pt->len) {
                        char *buf = nla_data(nla);

                        /* one trailing NUL does not count against pt->len */
                        if (buf[attrlen - 1] == '\0')
                                attrlen--;

                        if (attrlen > pt->len)
                                goto out_err;
                }
                break;

        case NLA_BINARY:
                /* pt->len, when set, is an upper bound on the payload */
                if (pt->len && attrlen > pt->len)
                        goto out_err;
                break;

        case NLA_NESTED:
                /* a nested attributes is allowed to be empty; if its not,
                 * it must have a size of at least NLA_HDRLEN.
                 */
                if (attrlen == 0)
                        break;
                if (attrlen < NLA_HDRLEN)
                        goto out_err;
                if (pt->nested_policy) {
                        /* for nested attributes pt->len is passed as maxtype */
                        err = __nla_validate_parse(nla_data(nla), nla_len(nla),
                                                   pt->len, pt->nested_policy,
                                                   validate, extack, NULL,
                                                   depth + 1);
                        if (err < 0) {
                                /*
                                 * return directly to preserve the inner
                                 * error message/attribute pointer
                                 */
                                return err;
                        }
                }
                break;
        case NLA_NESTED_ARRAY:
                /* a nested array attribute is allowed to be empty; if its not,
                 * it must have a size of at least NLA_HDRLEN.
                 */
                if (attrlen == 0)
                        break;
                if (attrlen < NLA_HDRLEN)
                        goto out_err;
                if (pt->nested_policy) {
                        /* NOTE: shadows the function-level err declared above */
                        int err;

                        err = nla_validate_array(nla_data(nla), nla_len(nla),
                                                 pt->len, pt->nested_policy,
                                                 extack, validate, depth);
                        if (err < 0) {
                                /*
                                 * return directly to preserve the inner
                                 * error message/attribute pointer
                                 */
                                return err;
                        }
                }
                break;

        case NLA_UNSPEC:
                /* untyped attributes are refused under NL_VALIDATE_UNSPEC */
                if (validate & NL_VALIDATE_UNSPEC) {
                        NL_SET_ERR_MSG_ATTR(extack, nla,
                                            "Unsupported attribute");
                        return -EINVAL;
                }
                /* otherwise pt->len acts as a minimum payload length */
                if (attrlen < pt->len)
                        goto out_err;
                break;

        default:
                /* minimum length: pt->len if set, else the per-type default */
                if (pt->len)
                        minlen = pt->len;
                else
                        minlen = nla_attr_minlen[pt->type];

                if (attrlen < minlen)
                        goto out_err;
        }

        /* further validation */
        switch (pt->validation_type) {
        case NLA_VALIDATE_NONE:
                /* nothing to do */
                break;
        case NLA_VALIDATE_RANGE_PTR:
        case NLA_VALIDATE_RANGE:
        case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
        case NLA_VALIDATE_MIN:
        case NLA_VALIDATE_MAX:
                err = nla_validate_int_range(pt, nla, extack, validate);
                if (err)
                        return err;
                break;
        case NLA_VALIDATE_MASK:
                err = nla_validate_mask(pt, nla, extack);
                if (err)
                        return err;
                break;
        case NLA_VALIDATE_FUNCTION:
                /* a missing callback means there is nothing more to check */
                if (pt->validate) {
                        err = pt->validate(nla, extack);
                        if (err)
                                return err;
                }
                break;
        }

        return 0;
out_err:
        /* generic failure: report the attribute and return the pending err */
        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                "Attribute failed policy validation");
        return err;
}

/*
 * Walk an attribute stream, optionally validating each attribute against
 * @policy and optionally recording it in @tb (indexed by attribute type).
 *
 * Attributes of type 0 or above @maxtype are skipped, or rejected when
 * NL_VALIDATE_MAXTYPE is set. Leftover bytes after the last attribute are
 * always logged and only fatal with NL_VALIDATE_TRAILING.
 *
 * Returns 0 on success or a negative error code.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack,
                                struct nlattr **tb, unsigned int depth)
{
        const struct nlattr *attr;
        int remaining;

        /* refuse to descend further than the policy recursion limit */
        if (depth >= MAX_POLICY_RECURSION_DEPTH) {
                NL_SET_ERR_MSG(extack,
                               "allowed policy recursion depth exceeded");
                return -EINVAL;
        }

        /* start from an empty table so absent attributes read as NULL */
        if (tb)
                memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));

        nla_for_each_attr(attr, head, len, remaining) {
                u16 type = nla_type(attr);
                int err;

                if (type == 0 || type > maxtype) {
                        if (!(validate & NL_VALIDATE_MAXTYPE))
                                continue;

                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Unknown attribute type");
                        return -EINVAL;
                }

                type = array_index_nospec(type, maxtype + 1);

                if (policy) {
                        err = validate_nla(attr, maxtype, policy,
                                           validate, extack, depth);
                        if (err < 0)
                                return err;
                }

                /* a later attribute of the same type replaces an earlier one */
                if (tb)
                        tb[type] = (struct nlattr *)attr;
        }

        if (unlikely(remaining > 0)) {
                pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
                                    remaining, current->comm);
                NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes");
                if (validate & NL_VALIDATE_TRAILING)
                        return -EINVAL;
        }

        return 0;
}

/**
 * __nla_validate - Validate a stream of attributes
 * @head: first attribute of the stream
 * @len: number of bytes in the stream
 * @maxtype: highest attribute type the policy covers
 * @policy: policy to validate against
 * @validate: strictness flags, see &enum netlink_validation
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy without storing
 * the attributes anywhere. How strict the checks are depends on
 * @validate. See the documentation of struct nla_policy for details.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack)
{
        /* validation only: no attribute table, top-level nesting depth */
        struct nlattr **no_table = NULL;

        return __nla_validate_parse(head, len, maxtype, policy, validate,
                                    extack, no_table, 0);
}
EXPORT_SYMBOL(__nla_validate);

/**
 * nla_policy_len - Determine the max. length of a policy
 * @p: policy to use
 * @n: number of policies
 *
 * Determines the max. length of the policy.  It is currently used
 * to allocated Netlink buffers roughly the size of the actual
 * message.
 *
 * Returns 0 on success or a negative error code.
 */
int
nla_policy_len(const struct nla_policy *p, int n)
{
        int i, len = 0;

        for (i = 0; i < n; i++, p++) {
                if (p->len)
                        len += nla_total_size(p->len);
                else if (nla_attr_len[p->type])
                        len += nla_total_size(nla_attr_len[p->type]);
                else if (nla_attr_minlen[p->type])
                        len += nla_total_size(nla_attr_minlen[p->type]);
        }

        return len;
}
EXPORT_SYMBOL(nla_policy_len);

/**
 * __nla_parse - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: highest attribute type to be stored
 * @head: first attribute of the stream
 * @len: number of bytes in the stream
 * @policy: policy to validate against
 * @validate: strictness flags
 * @extack: extended ACK pointer
 *
 * Walks the attribute stream and records a pointer to each attribute in
 * @tb at the index given by the attribute's type. The amount of
 * validation performed is selected by @validate.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_parse(struct nlattr **tb, int maxtype,
                const struct nlattr *head, int len,
                const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack)
{
        int err;

        /* top-level parse: nesting depth starts at zero */
        err = __nla_validate_parse(head, len, maxtype, policy, validate,
                                   extack, tb, 0);
        return err;
}
EXPORT_SYMBOL(__nla_parse);

/**
 * nla_find - Find a specific attribute in a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @attrtype: type of attribute to look for
 *
 * Returns the first attribute in the stream matching the specified type.
 */
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype)
{
        const struct nlattr *nla;
        int rem;

        nla_for_each_attr(nla, head, len, rem)
                if (nla_type(nla) == attrtype)
                        return (struct nlattr *)nla;

        return NULL;
}
EXPORT_SYMBOL(nla_find);

/**
 * nla_strscpy - Copy string attribute payload into a sized buffer
 * @dst: Where to copy the string to.
 * @nla: Attribute to copy the string from.
 * @dstsize: Size of destination buffer.
 *
 * Copies at most dstsize - 1 bytes into the destination buffer and fills
 * the rest of the buffer with zeroes, so unlike strscpy() the destination
 * is always padded out. A single trailing %NUL in the payload is not
 * counted as part of the string.
 *
 * Return:
 * * srclen - the payload length (not including the trailing %NUL) when
 *            the whole string fit.
 * * -E2BIG - if @dstsize is 0 or greater than U16_MAX, or the string was
 *            truncated because its length is not smaller than @dstsize.
 */
ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize)
{
        const char *payload = nla_data(nla);
        size_t paylen = nla_len(nla);
        size_t ncopy;
        ssize_t rc;

        if (!dstsize || WARN_ON_ONCE(dstsize > U16_MAX))
                return -E2BIG;

        /* ignore one terminating NUL carried in the attribute */
        if (paylen && payload[paylen - 1] == '\0')
                paylen--;

        /* assume the string fits, then correct for truncation */
        ncopy = paylen;
        rc = paylen;
        if (paylen >= dstsize) {
                ncopy = dstsize - 1;
                rc = -E2BIG;
        }

        memcpy(dst, payload, ncopy);
        /* Zero pad end of dst. */
        memset(dst + ncopy, 0, dstsize - ncopy);

        return rc;
}
EXPORT_SYMBOL(nla_strscpy);

/**
 * nla_strdup - Copy string attribute payload into a newly allocated buffer
 * @nla: attribute to copy the string from
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * A single trailing NUL in the payload is dropped before copying; the
 * copy is always NUL-terminated.
 *
 * Returns a pointer to the allocated buffer or NULL on error.
 */
char *nla_strdup(const struct nlattr *nla, gfp_t flags)
{
        char *payload = nla_data(nla);
        size_t paylen = nla_len(nla);
        char *copy;

        if (paylen && payload[paylen - 1] == '\0')
                paylen--;

        /* one extra byte for the terminator written below */
        copy = kmalloc(paylen + 1, flags);
        if (copy == NULL)
                return NULL;

        memcpy(copy, payload, paylen);
        copy[paylen] = '\0';

        return copy;
}
EXPORT_SYMBOL(nla_strdup);

/**
 * nla_memcpy - Copy a netlink attribute into another memory area
 * @dest: where to copy to
 * @src: netlink attribute to copy from
 * @count: size of the destination area
 *
 * Note: The number of bytes copied is limited by the length of the
 *       attribute's payload. If the payload is shorter than @count, the
 *       remainder of the destination area is zeroed.
 *
 * Returns the number of bytes copied.
 */
int nla_memcpy(void *dest, const struct nlattr *src, int count)
{
        int paylen = nla_len(src);
        int ncopy = count < paylen ? count : paylen;

        memcpy(dest, nla_data(src), ncopy);

        /* zero whatever part of the destination the payload did not fill */
        if (ncopy < count)
                memset(dest + ncopy, 0, count - ncopy);

        return ncopy;
}
EXPORT_SYMBOL(nla_memcpy);

/**
 * nla_memcmp - Compare an attribute with sized memory area
 * @nla: netlink attribute
 * @data: memory area
 * @size: size of memory area
 *
 * Returns the difference between the payload length and @size if that is
 * non-zero, otherwise the result of memcmp() over @size bytes.
 */
int nla_memcmp(const struct nlattr *nla, const void *data,
                             size_t size)
{
        int len_diff = nla_len(nla) - size;

        /* differing lengths decide the comparison on their own */
        if (len_diff != 0)
                return len_diff;

        return memcmp(nla_data(nla), data, size);
}
EXPORT_SYMBOL(nla_memcmp);

/**
 * nla_strcmp - Compare a string attribute against a string
 * @nla: netlink string attribute
 * @str: another string
 *
 * All trailing NUL bytes of the attribute payload are ignored. Returns the
 * length difference if the lengths differ, otherwise the memcmp() result.
 */
int nla_strcmp(const struct nlattr *nla, const char *str)
{
        char *payload = nla_data(nla);
        int paylen = nla_len(nla);
        int slen = strlen(str);

        /* strip every NUL byte at the end of the payload */
        for (; paylen > 0; paylen--) {
                if (payload[paylen - 1] != '\0')
                        break;
        }

        if (paylen != slen)
                return paylen - slen;

        return memcmp(nla_data(nla), str, slen);
}
EXPORT_SYMBOL(nla_strcmp);

#ifdef CONFIG_NET
/**
 * __nla_reserve - reserve room for attribute on the skb
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 *
 * Appends an attribute header to the socket buffer and makes room for a
 * payload of @attrlen bytes without writing the payload itself. The
 * padding bytes following the payload are zeroed.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
        struct nlattr *attr = skb_put(skb, nla_total_size(attrlen));
        unsigned char *pad;

        attr->nla_len = nla_attr_size(attrlen);
        attr->nla_type = attrtype;

        /* padding starts right after header + payload */
        pad = (unsigned char *) attr + attr->nla_len;
        memset(pad, 0, nla_padlen(attrlen));

        return attr;
}
EXPORT_SYMBOL(__nla_reserve);

/**
 * __nla_reserve_64bit - reserve room for attribute on the skb and align it
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @padattr: attribute type for the padding
 *
 * Same as __nla_reserve(), but nla_align_64bit() is invoked first so that
 * the attribute will have a 64-bit aligned nla_data() area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr)
{
        struct nlattr *attr;

        /* alignment has to happen before the attribute itself is added */
        nla_align_64bit(skb, padattr);
        attr = __nla_reserve(skb, attrtype, attrlen);

        return attr;
}
EXPORT_SYMBOL(__nla_reserve_64bit);

/**
 * __nla_reserve_nohdr - reserve room for attribute without header
 * @skb: socket buffer to reserve room on
 * @attrlen: length of attribute payload
 *
 * Makes room for a payload of @attrlen bytes, rounded up with NLA_ALIGN(),
 * without adding an attribute header. The reserved area is obtained via
 * skb_put_zero().
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the payload.
 */
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
        void *area = skb_put_zero(skb, NLA_ALIGN(attrlen));

        return area;
}
EXPORT_SYMBOL(__nla_reserve_nohdr);

/**
 * nla_reserve - reserve room for attribute on the skb
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 *
 * Checked variant of __nla_reserve(): the header is added and payload
 * room reserved only if the skb has enough tailroom.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute header and payload.
 */
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
        int needed = nla_total_size(attrlen);

        if (unlikely(skb_tailroom(skb) < needed)) {
                /* not enough space left in the buffer */
                return NULL;
        }

        return __nla_reserve(skb, attrtype, attrlen);
}
EXPORT_SYMBOL(nla_reserve);

/**
 * nla_reserve_64bit - reserve room for attribute on the skb and align it
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @padattr: attribute type for the padding
 *
 * Checked variant of __nla_reserve_64bit(). The space requirement takes
 * into account whether nla_need_padding_for_64bit() reports that padding
 * is needed.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute header and payload.
 */
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                                 int padattr)
{
        /* required space depends on whether a padding attribute is needed */
        size_t needed = nla_need_padding_for_64bit(skb) ?
                        nla_total_size_64bit(attrlen) :
                        nla_total_size(attrlen);

        if (unlikely(skb_tailroom(skb) < needed))
                return NULL;

        return __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
}
EXPORT_SYMBOL(nla_reserve_64bit);

/**
 * nla_reserve_nohdr - reserve room for attribute without header
 * @skb: socket buffer to reserve room on
 * @attrlen: length of attribute payload
 *
 * Checked variant of __nla_reserve_nohdr(): payload room is reserved only
 * if the skb has enough tailroom for the aligned payload.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute payload.
 */
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
        void *area;

        if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) {
                /* the aligned payload does not fit */
                return NULL;
        }

        area = __nla_reserve_nohdr(skb, attrlen);
        return area;
}
EXPORT_SYMBOL(nla_reserve_nohdr);

/**
 * __nla_put - Add a netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Reserves the attribute with __nla_reserve() and copies @attrlen bytes
 * from @data into its payload area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
                             const void *data)
{
        struct nlattr *attr = __nla_reserve(skb, attrtype, attrlen);
        void *payload = nla_data(attr);

        memcpy(payload, data, attrlen);
}
EXPORT_SYMBOL(__nla_put);

/**
 * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 * @padattr: attribute type for the padding
 *
 * Reserves the attribute with __nla_reserve_64bit() and copies @attrlen
 * bytes from @data into its payload area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr)
{
        struct nlattr *attr = __nla_reserve_64bit(skb, attrtype, attrlen,
                                                  padattr);
        void *payload = nla_data(attr);

        memcpy(payload, data, attrlen);
}
EXPORT_SYMBOL(__nla_put_64bit);

/**
 * __nla_put_nohdr - Add a netlink attribute without header
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Reserves payload room with __nla_reserve_nohdr() and copies @attrlen
 * bytes from @data into it.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute payload.
 */
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
        void *area = __nla_reserve_nohdr(skb, attrlen);

        memcpy(area, data, attrlen);
}
EXPORT_SYMBOL(__nla_put_nohdr);

/**
 * nla_put - Add a netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Checked variant of __nla_put().
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute header and payload.
 */
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
{
        int needed = nla_total_size(attrlen);

        if (unlikely(skb_tailroom(skb) < needed)) {
                /* header plus padded payload does not fit */
                return -EMSGSIZE;
        }

        __nla_put(skb, attrtype, attrlen, data);

        return 0;
}
EXPORT_SYMBOL(nla_put);

/**
 * nla_put_64bit - Add a netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 * @padattr: attribute type for the padding
 *
 * Checked variant of __nla_put_64bit(). The space requirement takes into
 * account whether nla_need_padding_for_64bit() reports that padding is
 * needed.
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute header and payload.
 */
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr)
{
        /* required space depends on whether a padding attribute is needed */
        size_t needed = nla_need_padding_for_64bit(skb) ?
                        nla_total_size_64bit(attrlen) :
                        nla_total_size(attrlen);

        if (unlikely(skb_tailroom(skb) < needed))
                return -EMSGSIZE;

        __nla_put_64bit(skb, attrtype, attrlen, data, padattr);

        return 0;
}
EXPORT_SYMBOL(nla_put_64bit);

/**
 * nla_put_nohdr - Add a netlink attribute without header
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Checked variant of __nla_put_nohdr().
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute payload.
 */
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
        if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) {
                /* the aligned payload does not fit */
                return -EMSGSIZE;
        }

        __nla_put_nohdr(skb, attrlen, data);

        return 0;
}
EXPORT_SYMBOL(nla_put_nohdr);

/**
 * nla_append - Add a netlink attribute without header or padding
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Appends exactly @attrlen bytes from @data to the skb. The tailroom
 * check is made against the NLA_ALIGN()ed length even though no padding
 * is written.
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute payload.
 */
int nla_append(struct sk_buff *skb, int attrlen, const void *data)
{
        if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) {
                /* the aligned payload does not fit */
                return -EMSGSIZE;
        }

        skb_put_data(skb, data, attrlen);

        return 0;
}
EXPORT_SYMBOL(nla_append);
#endif



























   23 















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RATELIMIT_H
#define _LINUX_RATELIMIT_H

#include <linux/ratelimit_types.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/*
 * Reset @rs to all-zero, then set up its lock and store the requested
 * interval and burst values.
 */
static inline void ratelimit_state_init(struct ratelimit_state *rs,
                                        int interval, int burst)
{
        /* wipe every field first; the assignments below rely on that */
        memset(rs, 0, sizeof(*rs));

        rs->burst = burst;
        rs->interval = interval;
        raw_spin_lock_init(&rs->lock);
}

/*
 * Initialize @rs with DEFAULT_RATELIMIT_INTERVAL and
 * DEFAULT_RATELIMIT_BURST.
 */
static inline void ratelimit_default_init(struct ratelimit_state *rs)
{
        /*
         * Plain call: a void function must not use "return <expr>;", even
         * when the expression itself has type void.
         */
        ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL,
                             DEFAULT_RATELIMIT_BURST);
}

static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs)
{
        atomic_inc(&rs->missed);
}

/* Return the current value of the miss counter of @rs without changing it. */
static inline int ratelimit_state_get_miss(struct ratelimit_state *rs)
{
        int missed = atomic_read(&rs->missed);

        return missed;
}

/*
 * Exchange the miss counter of @rs with zero and return the value that
 * atomic_xchg_relaxed() hands back.
 */
static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs)
{
        int previous = atomic_xchg_relaxed(&rs->missed, 0);

        return previous;
}

/*
 * Install a new interval on @rs and restart its accounting: the
 * RATELIMIT_INITIALIZED flag is cleared, the remaining budget is set back
 * to the burst value and the miss counter is zeroed. All of this happens
 * with rs->lock held and interrupts disabled.
 */
static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init)
{
        unsigned long irqflags;

        raw_spin_lock_irqsave(&rs->lock, irqflags);

        rs->interval = interval_init;
        rs->flags &= ~RATELIMIT_INITIALIZED;

        /* full budget again, and forget earlier misses */
        atomic_set(&rs->rs_n_left, rs->burst);
        ratelimit_state_reset_miss(rs);

        raw_spin_unlock_irqrestore(&rs->lock, irqflags);
}

/*
 * For states flagged RATELIMIT_MSG_ON_RELEASE, reset the miss counter and,
 * if it was non-zero, log how many lines were suppressed. States without
 * that flag are left untouched.
 */
static inline void ratelimit_state_exit(struct ratelimit_state *rs)
{
        int suppressed;

        if (rs->flags & RATELIMIT_MSG_ON_RELEASE) {
                suppressed = ratelimit_state_reset_miss(rs);
                if (suppressed)
                        pr_warn("%s: %d output lines suppressed due to ratelimiting\n", current->comm, suppressed);
        }
}

/* Replace the flags word of @rs with @flags (previous flags are not kept). */
static inline void
ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags)
{
        struct ratelimit_state *state = rs;

        state->flags = flags;
}

extern struct ratelimit_state printk_ratelimit_state;

#ifdef CONFIG_PRINTK

#define WARN_ON_RATELIMIT(condition, state)        ({                \
        bool __rtn_cond = !!(condition);                        \
        WARN_ON(__rtn_cond && __ratelimit(state));                \
        __rtn_cond;                                                \
})

#define WARN_RATELIMIT(condition, format, ...)                        \
({                                                                \
        static DEFINE_RATELIMIT_STATE(_rs,                        \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);        \
        int rtn = !!(condition);                                \
                                                                \
        if (unlikely(rtn && __ratelimit(&_rs)))                        \
                WARN(rtn, format, ##__VA_ARGS__);                \
                                                                \
        rtn;                                                        \
})

#else

#define WARN_ON_RATELIMIT(condition, state)                        \
        WARN_ON(condition)

/*
 * !CONFIG_PRINTK variant: forwards straight to WARN() without any
 * ratelimiting and evaluates to WARN()'s result, stored in an int.
 */
#define WARN_RATELIMIT(condition, format, ...)                        \
({                                                                \
        int rtn = WARN(condition, format, ##__VA_ARGS__);        \
        rtn;                                                        \
})

#endif

#endif /* _LINUX_RATELIMIT_H */























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




















































































































































































































































































































































































  265 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H

/*
 * Define 'struct task_struct' and provide the main scheduler
 * APIs (schedule(), wakeup variants, etc.)
 */

#include <uapi/linux/sched.h>

#include <asm/current.h>
#include <asm/processor.h>
#include <linux/thread_info.h>
#include <linux/preempt.h>
#include <linux/cpumask_types.h>

#include <linux/cache.h>
#include <linux/irqflags_types.h>
#include <linux/smp_types.h>
#include <linux/pid_types.h>
#include <linux/sem_types.h>
#include <linux/shm.h>
#include <linux/kmsan_types.h>
#include <linux/mutex_types.h>
#include <linux/plist_types.h>
#include <linux/hrtimer_types.h>
#include <linux/timer_types.h>
#include <linux/seccomp_types.h>
#include <linux/nodemask_types.h>
#include <linux/refcount_types.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/spinlock.h>
#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/netdevice_xmit.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
#include <uapi/linux/rseq.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
#ifndef COMPILE_OFFSETS
#include <generated/rq-offsets.h>
#endif

/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_run_ctx;
struct bpf_net_context;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
struct perf_ctx_data;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_dl_entity;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
struct task_struct;
struct user_event_mm;

#include <linux/sched/ext.h>

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->__state
 * is about runnability, while task->exit_state is
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */

/* Used in tsk->__state: */
#define TASK_RUNNING                        0x00000000
#define TASK_INTERRUPTIBLE                0x00000001
#define TASK_UNINTERRUPTIBLE                0x00000002
#define __TASK_STOPPED                        0x00000004
#define __TASK_TRACED                        0x00000008
/* Used in tsk->exit_state: */
#define EXIT_DEAD                        0x00000010
#define EXIT_ZOMBIE                        0x00000020
#define EXIT_TRACE                        (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->__state again: */
#define TASK_PARKED                        0x00000040
#define TASK_DEAD                        0x00000080
#define TASK_WAKEKILL                        0x00000100
#define TASK_WAKING                        0x00000200
#define TASK_NOLOAD                        0x00000400
#define TASK_NEW                        0x00000800
#define TASK_RTLOCK_WAIT                0x00001000
#define TASK_FREEZABLE                        0x00002000
#define __TASK_FREEZABLE_UNSAFE               (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP))
#define TASK_FROZEN                        0x00008000
#define TASK_STATE_MAX                        0x00010000

#define TASK_ANY                        (TASK_STATE_MAX-1)

/*
 * DO NOT ADD ANY NEW USERS !
 */
#define TASK_FREEZABLE_UNSAFE                (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE                        (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED                        (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED                        __TASK_TRACED

#define TASK_IDLE                        (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL                        (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* get_task_state(): */
#define TASK_REPORT                        (TASK_RUNNING | TASK_INTERRUPTIBLE | \
                                         TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
                                         __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
                                         TASK_PARKED)

/*
 * Task state predicates. Each one reads a single field of @task once via
 * READ_ONCE() and evaluates to an int (0 or 1):
 *
 *   task_is_running()           - @task->__state equals TASK_RUNNING
 *   task_is_traced()            - JOBCTL_TRACED is set in @task->jobctl
 *   task_is_stopped()           - JOBCTL_STOPPED is set in @task->jobctl
 *   task_is_stopped_or_traced() - either of the two jobctl bits is set
 *
 * @task is parenthesised in every expansion so that any pointer-valued
 * expression (e.g. "&t" or "p + 1") can be passed, not just a plain name.
 */
#define task_is_running(task)                (READ_ONCE((task)->__state) == TASK_RUNNING)

#define task_is_traced(task)                ((READ_ONCE((task)->jobctl) & JOBCTL_TRACED) != 0)
#define task_is_stopped(task)                ((READ_ONCE((task)->jobctl) & JOBCTL_STOPPED) != 0)
#define task_is_stopped_or_traced(task)        ((READ_ONCE((task)->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
 * Special states are those that do not use the normal wait-loop pattern. See
 * the comment with set_special_state().
 *
 * The macro evaluates to the subset of @state's bits that belong to the
 * special set (__TASK_STOPPED, __TASK_TRACED, TASK_PARKED, TASK_DEAD,
 * TASK_FROZEN): non-zero if any is set, but not normalised to 0/1.
 */
#define is_special_task_state(state)                                        \
        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED |        \
                    TASK_DEAD | TASK_FROZEN))

/*
 * Debug hooks used by the state-setting macros in this header.
 *
 * With CONFIG_DEBUG_ATOMIC_SLEEP they record the instruction pointer of the
 * most recent state change in current->task_state_change and warn (once) when
 * a setter is used with the wrong class of state. Without it they all expand
 * to empty statements.
 */
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/* Warn once if a special state is set through the normal setters. */
# define debug_normal_state_change(state_value)                                \
        do {                                                                \
                WARN_ON_ONCE(is_special_task_state(state_value));        \
                current->task_state_change = _THIS_IP_;                        \
        } while (0)

/* Warn once if a non-special state is set through set_special_state(). */
# define debug_special_state_change(state_value)                        \
        do {                                                                \
                WARN_ON_ONCE(!is_special_task_state(state_value));        \
                current->task_state_change = _THIS_IP_;                        \
        } while (0)

/* Stash the previous change site in saved_state_change, then record this one. */
# define debug_rtlock_wait_set_state()                                        \
        do {                                                                 \
                current->saved_state_change = current->task_state_change;\
                current->task_state_change = _THIS_IP_;                         \
        } while (0)

/* Put back the change site stashed by debug_rtlock_wait_set_state(). */
# define debug_rtlock_wait_restore_state()                                \
        do {                                                                 \
                current->task_state_change = current->saved_state_change;\
        } while (0)

#else
/* Debugging disabled: the hooks compile away and do not evaluate their argument. */
# define debug_normal_state_change(cond)        do { } while (0)
# define debug_special_state_change(cond)        do { } while (0)
# define debug_rtlock_wait_set_state()                do { } while (0)
# define debug_rtlock_wait_restore_state()        do { } while (0)
#endif

/*
 * Tracing hook for state changes: __trace_set_current_state() is called with
 * @state_value only when tracepoint_enabled(sched_set_state_tp) is true, so
 * @state_value is not evaluated otherwise.
 */
#define trace_set_current_state(state_value)                     \
        do {                                                     \
                if (tracepoint_enabled(sched_set_state_tp))      \
                        __trace_set_current_state(state_value); \
        } while (0)

/*
 * set_current_state() includes a barrier so that the write of current->__state
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
 *   for (;;) {
 *        set_current_state(TASK_UNINTERRUPTIBLE);
 *        if (CONDITION)
 *           break;
 *
 *        schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 * If the caller does not need such serialisation (because, for instance, the
 * CONDITION test and condition change and wakeup are under the same lock) then
 * use __set_current_state().
 *
 * The above is typically ordered against the wakeup, which does:
 *
 *   CONDITION = 1;
 *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
 *
 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
 * accessing p->__state.
 *
 * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is,
 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
 *
 * However, with slightly different timing the wakeup TASK_RUNNING store can
 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
 * a problem either because that will result in one extra go around the loop
 * and our CONDITION test will save the day.
 *
 * Also see the comments of try_to_wake_up().
 */
/*
 * Variant without a barrier: runs the debug and trace hooks, then stores
 * @state_value into current->__state with a plain WRITE_ONCE().
 * Note that @state_value appears three times in the expansion.
 */
#define __set_current_state(state_value)                                \
        do {                                                                \
                debug_normal_state_change((state_value));                \
                trace_set_current_state(state_value);                        \
                WRITE_ONCE(current->__state, (state_value));                \
        } while (0)

/*
 * Variant with a barrier: runs the debug and trace hooks, then stores
 * @state_value into current->__state with smp_store_mb(), which provides the
 * ordering against the caller's subsequent CONDITION test.
 * Note that @state_value appears three times in the expansion.
 */
#define set_current_state(state_value)                                        \
        do {                                                                \
                debug_normal_state_change((state_value));                \
                trace_set_current_state(state_value);                        \
                smp_store_mb(current->__state, (state_value));                \
        } while (0)

/*
 * set_special_state() should be used for those states when the blocking task
 * can not use the regular condition based wait-loop. In that case we must
 * serialize against wakeups such that any possible in-flight TASK_RUNNING
 * stores will not collide with our state change.
 */
/*
 * The store to current->__state is done with current->pi_lock held and the
 * interrupt state saved/restored around it. "flags" is declared inside the
 * macro's own do/while scope, so it may shadow a variable of the same name at
 * the call site (hence the "may shadow" remark below).
 */
#define set_special_state(state_value)                                        \
        do {                                                                \
                unsigned long flags; /* may shadow */                        \
                                                                        \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                debug_special_state_change((state_value));                \
                trace_set_current_state(state_value);                        \
                WRITE_ONCE(current->__state, (state_value));                \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);        \
        } while (0)

/*
 * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
 *
 * RT's spin/rwlock substitutions are state preserving. The state of the
 * task when blocking on the lock is saved in task_struct::saved_state and
 * restored after the lock has been acquired.  These operations are
 * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
 * lock related wakeups while the task is blocked on the lock are
 * redirected to operate on task_struct::saved_state to ensure that these
 * are not dropped. On restore task_struct::saved_state is set to
 * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
 *
 * The lock operation looks like this:
 *
 *        current_save_and_set_rtlock_wait_state();
 *        for (;;) {
 *                if (try_lock())
 *                        break;
 *                raw_spin_unlock_irq(&lock->wait_lock);
 *                schedule_rtlock();
 *                raw_spin_lock_irq(&lock->wait_lock);
 *                set_current_state(TASK_RTLOCK_WAIT);
 *        }
 *        current_restore_rtlock_saved_state();
 */
/*
 * Save current->__state in current->saved_state and switch the task to
 * TASK_RTLOCK_WAIT, all with current->pi_lock held. Interrupts must already
 * be disabled; this is asserted through lockdep_assert_irqs_disabled().
 *
 * There is deliberately no semicolon after "while (0)": the caller supplies
 * it, so the macro behaves as a single statement and can be used in an
 * unbraced if/else.
 */
#define current_save_and_set_rtlock_wait_state()                        \
        do {                                                                \
                lockdep_assert_irqs_disabled();                                \
                raw_spin_lock(&current->pi_lock);                        \
                current->saved_state = current->__state;                \
                debug_rtlock_wait_set_state();                                \
                trace_set_current_state(TASK_RTLOCK_WAIT);                \
                WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);                \
                raw_spin_unlock(&current->pi_lock);                        \
        } while (0)

/*
 * Undo current_save_and_set_rtlock_wait_state(): with current->pi_lock held,
 * copy current->saved_state back into current->__state and then reset
 * saved_state to TASK_RUNNING. Interrupts must already be disabled; this is
 * asserted through lockdep_assert_irqs_disabled().
 *
 * There is deliberately no semicolon after "while (0)": the caller supplies
 * it, so the macro behaves as a single statement and can be used in an
 * unbraced if/else.
 */
#define current_restore_rtlock_saved_state()                                \
        do {                                                                \
                lockdep_assert_irqs_disabled();                                \
                raw_spin_lock(&current->pi_lock);                        \
                debug_rtlock_wait_restore_state();                        \
                trace_set_current_state(current->saved_state);                \
                WRITE_ONCE(current->__state, current->saved_state);        \
                current->saved_state = TASK_RUNNING;                        \
                raw_spin_unlock(&current->pi_lock);                        \
        } while (0)

/* Read current->__state once, via READ_ONCE(). */
#define get_current_state()        READ_ONCE(current->__state)

/*
 * Define the task command name length as an enum constant so that it is
 * visible to BPF programs.
 */
enum {
        TASK_COMM_LEN = 16,
};

extern void sched_tick(void);

#define        MAX_SCHEDULE_TIMEOUT                LONG_MAX

extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
#ifdef CONFIG_PREEMPT_RT
 extern void schedule_rtlock(void);
#endif

extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);

/* wrapper functions to trace from this header file */
DECLARE_TRACEPOINT(sched_set_state_tp);
extern void __trace_set_current_state(int state_value);
DECLARE_TRACEPOINT(sched_set_need_resched_tp);
extern void __trace_set_need_resched(struct task_struct *curr, int tif);

/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
 *
 * All three members are compiled out when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 * is set, which leaves the structure empty.
 */
struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        u64                                utime;
        u64                                stime;
        raw_spinlock_t                        lock;
#endif
};

/*
 * Values stored in struct vtime::state. VTIME_INACTIVE is explicitly 0, so a
 * zero-initialised struct vtime starts out in that state.
 */
enum vtime_state {
        /* Task is sleeping or running in a CPU with VTIME inactive: */
        VTIME_INACTIVE = 0,
        /* Task is idle */
        VTIME_IDLE,
        /* Task runs in kernelspace in a CPU with VTIME active: */
        VTIME_SYS,
        /* Task runs in userspace in a CPU with VTIME active: */
        VTIME_USER,
        /* Task runs as a guest in a CPU with VTIME active: */
        VTIME_GUEST,
};

/*
 * Virtual cputime ("vtime") bookkeeping record.
 *
 * NOTE(review): the per-field notes below are inferred from the field names
 * and from enum vtime_state; confirm them against the vtime accounting code
 * before relying on them.
 */
struct vtime {
        seqcount_t                seqcount;        /* presumably guards reads of the fields below - confirm */
        unsigned long long        starttime;
        enum vtime_state        state;                /* one of the VTIME_* values */
        unsigned int                cpu;
        u64                        utime;                /* presumably user time - confirm */
        u64                        stime;                /* presumably system time - confirm */
        u64                        gtime;                /* presumably guest time - confirm */
};

/*
 * Utilization clamp constraints.
 * @UCLAMP_MIN:        Minimum utilization
 * @UCLAMP_MAX:        Maximum utilization
 * @UCLAMP_CNT:        Utilization clamp constraints count
 *
 * UCLAMP_CNT has to stay the last enumerator: its value (2) is the number of
 * clamp ids listed before it.
 */
enum uclamp_id {
        UCLAMP_MIN = 0,
        UCLAMP_MAX,
        UCLAMP_CNT
};

extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
extern void sched_domains_mutex_lock(void);
extern void sched_domains_mutex_unlock(void);

/*
 * Holds a single integer scheduling priority.
 * NOTE(review): presumably the parameter block exchanged with the
 * sched_setparam()/sched_getparam() family - confirm against the callers.
 */
struct sched_param {
        int sched_priority;
};

/*
 * Run-queue delay accounting: cumulative counters plus the timestamps they
 * are derived from. Every member is compiled out without CONFIG_SCHED_INFO,
 * which leaves the structure empty.
 */
struct sched_info {
#ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */

        /* # of times we have run on this CPU: */
        unsigned long                        pcount;

        /* Time spent waiting on a runqueue: */
        unsigned long long                run_delay;

        /* Max time spent waiting on a runqueue: */
        unsigned long long                max_run_delay;

        /* Min time spent waiting on a runqueue: */
        unsigned long long                min_run_delay;

        /* Timestamps: */

        /* When did we last run on a CPU? */
        unsigned long long                last_arrival;

        /* When were we last queued to run? */
        unsigned long long                last_queued;

#endif /* CONFIG_SCHED_INFO */
};

/*
 * Integer metrics need fixed point arithmetic, e.g., sched/fair
 * has a few: load, load_avg, util_avg, freq, and capacity.
 *
 * We define a basic fixed point arithmetic range, and then formalize
 * all these metrics based on that basic range.
 */
# define SCHED_FIXEDPOINT_SHIFT                10
# define SCHED_FIXEDPOINT_SCALE                (1L << SCHED_FIXEDPOINT_SHIFT)

/* Increase resolution of cpu_capacity calculations */
# define SCHED_CAPACITY_SHIFT                SCHED_FIXEDPOINT_SHIFT
# define SCHED_CAPACITY_SCALE                (1L << SCHED_CAPACITY_SHIFT)

/*
 * A scheduling weight paired with a 32-bit companion value.
 * NOTE(review): inv_weight looks like a cached inverse of weight used to
 * avoid divisions - TODO confirm how and where it is derived.
 */
struct load_weight {
        unsigned long                        weight;
        u32                                inv_weight;
};

/*
 * The load/runnable/util_avg accumulates an infinite geometric series
 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
 *   load_avg = runnable% * scale_load_down(load)
 *
 * [runnable_avg definition]
 *
 *   runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
 *
 * where runnable% is the time ratio that a sched_entity is runnable and
 * running% the time ratio that a sched_entity is running.
 *
 * For cfs_rq, they are the aggregated values of all runnable and blocked
 * sched_entities.
 *
 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
 * for computing those signals (see update_rq_clock_pelt())
 *
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 * to as large a range as necessary. This is for example reflected by
 * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
 * with the highest load (=88761), always runnable on a single cfs_rq,
 * and should not overflow as the number already hits PID_MAX_LIMIT.
 *
 * For all other cases (including 32-bit kernels), struct load_weight's
 * weight will overflow first before we do, because:
 *
 *    Max(load_avg) <= Max(load.weight)
 *
 * Then it is the load_weight's responsibility to consider overflow
 * issues.
 */
struct sched_avg {
        /* NOTE(review): presumably the time of the last update of these signals -- confirm in kernel/sched/pelt.c */
        u64                                last_update_time;
        /*
         * Accumulated sums (presumably the geometric series from which the
         * *_avg values below are derived -- confirm). load_sum and
         * runnable_sum are 64-bit, util_sum is 32-bit.
         */
        u64                                load_sum;
        u64                                runnable_sum;
        u32                                util_sum;
        /* NOTE(review): looks like the not-yet-complete part of the current period -- confirm */
        u32                                period_contrib;
        /* The averaged signals: load_avg, runnable_avg and util_avg. */
        unsigned long                        load_avg;
        unsigned long                        runnable_avg;
        unsigned long                        util_avg;
        /* Utilization estimate; its most significant bit can carry UTIL_AVG_UNCHANGED. */
        unsigned int                        util_est;
        /* The whole structure is aligned to a cache line. */
} ____cacheline_aligned;

/*
 * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 * updates. When a task is dequeued, its util_est should not be updated if its
 * util_avg has not been updated in the meantime.
 * This information is stored in the most significant bit of util_est at
 * dequeue time. Since the maximum value of util_est for a task is 1024 (the
 * PELT util_avg of a task), the MSB is never needed for the value itself and
 * is safe to use as a flag.
 *
 * NOTE(review): UTIL_EST_WEIGHT_SHIFT is not described here; presumably it is
 * the shift that sets the weight of new samples in the util_est moving
 * average -- confirm against its users in kernel/sched/fair.c.
 */
#define UTIL_EST_WEIGHT_SHIFT                2
#define UTIL_AVG_UNCHANGED                0x80000000

/*
 * Scheduler statistics (task_struct embeds one as 'stats').
 *
 * Every member is compiled in only when CONFIG_SCHEDSTATS is enabled;
 * without it the structure has no members. The structure is aligned to a
 * cache line.
 *
 * NOTE(review): the per-group descriptions below are inferred from the field
 * names only -- confirm against the code that updates them.
 */
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
        /* Waiting (wait_*) and I/O wait (iowait_*) accounting. */
        u64                                wait_start;
        u64                                wait_max;
        u64                                wait_count;
        u64                                wait_sum;
        u64                                iowait_count;
        u64                                iowait_sum;

        /* Sleep accounting; note that the sum is signed. */
        u64                                sleep_start;
        u64                                sleep_max;
        s64                                sum_sleep_runtime;

        /* Block accounting; note that the sum is signed. */
        u64                                block_start;
        u64                                block_max;
        s64                                sum_block_runtime;

        /* Maxima for execution time and slice; exec_max is signed. */
        s64                                exec_max;
        u64                                slice_max;

        /* Migration counters. */
        u64                                nr_migrations_cold;
        u64                                nr_failed_migrations_affine;
        u64                                nr_failed_migrations_running;
        u64                                nr_failed_migrations_hot;
        u64                                nr_forced_migrations;

        /* Wakeup counters. */
        u64                                nr_wakeups;
        u64                                nr_wakeups_sync;
        u64                                nr_wakeups_migrate;
        u64                                nr_wakeups_local;
        u64                                nr_wakeups_remote;
        u64                                nr_wakeups_affine;
        u64                                nr_wakeups_affine_attempts;
        u64                                nr_wakeups_passive;
        u64                                nr_wakeups_idle;

        /* Present only when CONFIG_SCHED_CORE is enabled as well. */
#ifdef CONFIG_SCHED_CORE
        u64                                core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

/*
 * Scheduling entity; task_struct embeds one as 'se'.
 *
 * NOTE(review): the cfs_rq pointers and the CONFIG_FAIR_GROUP_SCHED members
 * suggest this is the fair-class entity, which with group scheduling can
 * also stand for a group that owns its own runqueue (my_q) -- confirm.
 */
struct sched_entity {
        /* For load-balancing: */
        struct load_weight                load;
        struct rb_node                        run_node;
        u64                                deadline;
        u64                                min_vruntime;
        u64                                min_slice;

        struct list_head                group_node;
        /* Four byte-sized flags declared back to back. */
        unsigned char                        on_rq;
        unsigned char                        sched_delayed;
        unsigned char                        rel_deadline;
        unsigned char                        custom_slice;
                                        /* hole */

        /* Runtime accounting, all 64-bit. */
        u64                                exec_start;
        u64                                sum_exec_runtime;
        u64                                prev_sum_exec_runtime;
        u64                                vruntime;
        union {
                /*
                 * When !@on_rq this field is vlag.
                 * When cfs_rq->curr == se (which implies @on_rq)
                 * this field is vprot. See protect_slice().
                 */
                s64                     vlag;
                u64                     vprot;
        };
        u64                                slice;

        u64                                nr_migrations;

        /* The members below exist only with CONFIG_FAIR_GROUP_SCHED. */
#ifdef CONFIG_FAIR_GROUP_SCHED
        int                                depth;
        struct sched_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct cfs_rq                        *cfs_rq;
        /* rq "owned" by this entity/group: */
        struct cfs_rq                        *my_q;
        /* cached value of my_q->h_nr_running */
        unsigned long                        runnable_weight;
#endif

        /*
         * Per entity load average tracking.
         *
         * Put into separate cache line so it does not
         * collide with read-mostly values above.
         * (struct sched_avg is declared ____cacheline_aligned.)
         */
        struct sched_avg                avg;
};

/*
 * RT-class scheduling entity; task_struct embeds one as 'rt'.
 *
 * The parent/rt_rq/my_q members exist only with CONFIG_RT_GROUP_SCHED.
 * The structure is marked __randomize_layout, so code must not depend on
 * the declared member order.
 *
 * NOTE(review): run_list presumably links the entity into a run list, with
 * on_rq/on_list recording its queued state -- confirm against
 * kernel/sched/rt.c.
 */
struct sched_rt_entity {
        struct list_head                run_list;
        unsigned long                        timeout;
        unsigned long                        watchdog_stamp;
        unsigned int                        time_slice;
        unsigned short                        on_rq;
        unsigned short                        on_list;

        struct sched_rt_entity                *back;
#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct rt_rq                        *rt_rq;
        /* rq "owned" by this entity/group: */
        struct rt_rq                        *my_q;
#endif
} __randomize_layout;

/*
 * Callback types for deadline server entities. Both receive a pointer to
 * a struct sched_dl_entity.
 *
 * dl_server_pick_f returns a task_struct pointer and is the type of
 * sched_dl_entity::server_pick_task.
 *
 * NOTE(review): dl_server_has_tasks_f is not used by any member of
 * struct sched_dl_entity; check whether it still has users elsewhere
 * before relying on it.
 */
typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);

/*
 * Deadline scheduling entity. task_struct embeds one as 'dl' and also holds
 * a 'dl_server' pointer to an entity of this type.
 *
 * An entity either belongs to a -deadline task or, when @dl_server is set,
 * is a server entity tied to a runqueue (@rq).
 */
struct sched_dl_entity {
        struct rb_node                        rb_node;

        /*
         * Original scheduling parameters. Copied here from sched_attr
         * during sched_setattr(), they will remain the same until
         * the next sched_setattr().
         */
        u64                                dl_runtime;        /* Maximum runtime for each instance        */
        u64                                dl_deadline;        /* Relative deadline of each instance        */
        u64                                dl_period;        /* Separation of two instances (period) */
        u64                                dl_bw;                /* dl_runtime / dl_period                */
        u64                                dl_density;        /* dl_runtime / dl_deadline                */

        /*
         * Actual scheduling parameters. Initialized with the values above,
         * they are continuously updated during task execution. Note that
         * the remaining runtime could be < 0 in case we are in overrun.
         */
        s64                                runtime;        /* Remaining runtime for this instance        */
        u64                                deadline;        /* Absolute deadline for this instance        */
        unsigned int                        flags;                /* Specifying the scheduler behaviour        */

        /*
         * Some bool flags:
         *
         * @dl_throttled tells if we exhausted the runtime. If so, the
         * task has to wait for a replenishment to be performed at the
         * next firing of dl_timer.
         *
         * @dl_yielded tells if task gave up the CPU before consuming
         * all its available runtime during the last job.
         *
         * @dl_non_contending tells if the task is inactive while still
         * contributing to the active utilization. In other words, it
         * indicates if the inactive timer has been armed and its handler
         * has not been executed yet. This flag is useful to avoid race
         * conditions between the inactive timer handler and the wakeup
         * code.
         *
         * @dl_overrun tells if the task asked to be informed about runtime
         * overruns.
         *
         * @dl_server tells if this is a server entity.
         *
         * @dl_defer tells if this is a deferred or regular server. For
         * now only defer server exists.
         *
         * @dl_defer_armed tells if the deferrable server is waiting
         * for the replenishment timer to activate it.
         *
         * @dl_server_active tells if the dlserver is active (started).
         * dlserver is started on first cfs enqueue on an idle runqueue
         * and is stopped when a dequeue results in 0 cfs tasks on the
         * runqueue. In other words, dlserver is active only when cpu's
         * runqueue has at least one cfs task.
         *
         * @dl_defer_running tells if the deferrable server is actually
         * running, skipping the defer phase.
         */
        unsigned int                        dl_throttled      : 1;
        unsigned int                        dl_yielded        : 1;
        unsigned int                        dl_non_contending : 1;
        unsigned int                        dl_overrun          : 1;
        unsigned int                        dl_server         : 1;
        unsigned int                        dl_server_active  : 1;
        unsigned int                        dl_defer          : 1;
        unsigned int                        dl_defer_armed          : 1;
        unsigned int                        dl_defer_running  : 1;

        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         * own bandwidth to be enforced, thus we need one timer per task.
         */
        struct hrtimer                        dl_timer;

        /*
         * Inactive timer, responsible for decreasing the active utilization
         * at the "0-lag time". When a -deadline task blocks, it contributes
         * to GRUB's active utilization until the "0-lag time", hence a
         * timer is needed to decrease the active utilization at the correct
         * time.
         */
        struct hrtimer                        inactive_timer;

        /*
         * Bits for DL-server functionality. Also see the comment near
         * dl_server_update().
         *
         * @rq the runqueue this server is for
         *
         * @server_pick_task callback of type dl_server_pick_f; it is
         * passed a sched_dl_entity and returns a task_struct pointer
         * (presumably the task the server should run next -- confirm
         * against the callers).
         */
        struct rq                        *rq;
        dl_server_pick_f                server_pick_task;

#ifdef CONFIG_RT_MUTEXES
        /*
         * Priority Inheritance. When a DEADLINE scheduling entity is boosted
         * pi_se points to the donor, otherwise points to the dl_se it belongs
         * to (the original one/itself).
         */
        struct sched_dl_entity *pi_se;
#endif
};

#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
 * Utilization clamp for a scheduling entity
 * @value:                clamp value "assigned" to a se
 * @bucket_id:                bucket index corresponding to the "assigned" value
 * @active:                the se is currently refcounted in a rq's bucket
 * @user_defined:        the requested clamp value comes from user-space
 *
 * The bucket_id is the index of the clamp bucket matching the clamp value
 * which is pre-computed and stored to avoid expensive integer divisions from
 * the fast path.
 *
 * The active bit is set whenever a task has got an "effective" value assigned,
 * which can be different from the clamp value "requested" from user-space.
 * This makes it possible to know that a task is refcounted in the rq's bucket
 * corresponding to the "effective" bucket_id.
 *
 * The user_defined bit is set whenever a task has got a task-specific clamp
 * value requested from userspace, i.e. the system defaults apply to this task
 * just as a restriction. This makes it possible to relax default clamps when
 * a less restrictive task-specific value has been requested, and thereby to
 * implement a "nice" semantic. For example, a task running with a 20%
 * default boost can still drop its own boosting to 0%.
 *
 * All four members are bitfields of unsigned int. The widths of @value and
 * @bucket_id come from bits_per(), which is defined elsewhere (presumably
 * the number of bits needed to hold SCHED_CAPACITY_SCALE and UCLAMP_BUCKETS
 * respectively -- confirm).
 */
struct uclamp_se {
        unsigned int value                : bits_per(SCHED_CAPACITY_SCALE);
        unsigned int bucket_id                : bits_per(UCLAMP_BUCKETS);
        unsigned int active                : 1;
        unsigned int user_defined        : 1;
};
#endif /* CONFIG_UCLAMP_TASK */

/*
 * RCU "special" state: four byte-sized flags in 'b' overlaid with the
 * 32-bit word 's', so that all four can be read or written in a single
 * access through 's'.
 *
 * task_struct uses this type for rcu_read_unlock_special and
 * trc_reader_special.
 */
union rcu_special {
        struct {
                u8                        blocked;
                u8                        need_qs;
                u8                        exp_hint; /* Hint for performance. */
                u8                        need_mb; /* Readers need smp_mb(). */
        } b; /* Bits. */
        u32 s; /* Set of bits. */
};

/*
 * Kinds of perf event task context.
 *
 * perf_invalid_context is a negative sentinel. The valid kinds count up
 * from zero, and perf_nr_task_contexts is the number of valid kinds.
 */
enum perf_event_task_context {
        perf_invalid_context    = -1,
        perf_hw_context         = 0,
        perf_sw_context         = 1,
        /* Number of valid contexts; must remain the last enumerator. */
        perf_nr_task_contexts,
};

/*
 * Number of contexts where an event can trigger:
 *      task, softirq, hardirq, nmi.
 *
 * Sizes the perf_recursion[] array in task_struct.
 */
#define PERF_NR_CONTEXTS        4

/*
 * Link for a singly linked list of wake_q nodes; task_struct embeds one
 * as 'wake_q'.
 */
struct wake_q_node {
        struct wake_q_node *next;
};

/*
 * Control data for local kmaps. The members exist only with
 * CONFIG_KMAP_LOCAL; otherwise the structure has no members.
 *
 * NOTE(review): idx presumably indexes into pteval[], which holds up to
 * KM_MAX_IDX PTE values -- confirm against the kmap_local code.
 */
struct kmap_ctrl {
#ifdef CONFIG_KMAP_LOCAL
        int                                idx;
        pte_t                                pteval[KM_MAX_IDX];
#endif
};

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /*
         * For reasons of header soup (see current_thread_info()), this
         * must be the first element of task_struct.
         */
        struct thread_info                thread_info;
#endif
        unsigned int                        __state;

        /* saved state for "spinlock sleepers" */
        unsigned int                        saved_state;

        /*
         * This begins the randomizable portion of task_struct. Only
         * scheduling-critical items should be added above here.
         */
        randomized_struct_fields_start

        void                                *stack;
        refcount_t                        usage;
        /* Per task flags (PF_*), defined further below: */
        unsigned int                        flags;
        unsigned int                        ptrace;

#ifdef CONFIG_MEM_ALLOC_PROFILING
        struct alloc_tag                *alloc_tag;
#endif

        int                                on_cpu;
        struct __call_single_node        wake_entry;
        unsigned int                        wakee_flips;
        unsigned long                        wakee_flip_decay_ts;
        struct task_struct                *last_wakee;

        /*
         * recent_used_cpu is initially set as the last CPU used by a task
         * that wakes affine another task. Waker/wakee relationships can
         * push tasks around a CPU where each wakeup moves to the next one.
         * Tracking a recently used CPU allows a quick search for a recently
         * used CPU that may be idle.
         */
        int                                recent_used_cpu;
        int                                wake_cpu;
        int                                on_rq;

        int                                prio;
        int                                static_prio;
        int                                normal_prio;
        unsigned int                        rt_priority;

        struct sched_entity                se;
        struct sched_rt_entity                rt;
        struct sched_dl_entity                dl;
        struct sched_dl_entity                *dl_server;
#ifdef CONFIG_SCHED_CLASS_EXT
        struct sched_ext_entity                scx;
#endif
        const struct sched_class        *sched_class;

#ifdef CONFIG_SCHED_CORE
        struct rb_node                        core_node;
        unsigned long                        core_cookie;
        unsigned int                        core_occupation;
#endif

#ifdef CONFIG_CGROUP_SCHED
        struct task_group                *sched_task_group;
#ifdef CONFIG_CFS_BANDWIDTH
        struct callback_head                sched_throttle_work;
        struct list_head                throttle_node;
        bool                                throttled;
#endif
#endif


#ifdef CONFIG_UCLAMP_TASK
        /*
         * Clamp values requested for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp_req[UCLAMP_CNT];
        /*
         * Effective clamp values used for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp[UCLAMP_CNT];
#endif

        struct sched_statistics         stats;

#ifdef CONFIG_PREEMPT_NOTIFIERS
        /* List of struct preempt_notifier: */
        struct hlist_head                preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
        unsigned int                        btrace_seq;
#endif

        unsigned int                        policy;
        unsigned long                        max_allowed_capacity;
        int                                nr_cpus_allowed;
        const cpumask_t                        *cpus_ptr;
        cpumask_t                        *user_cpus_ptr;
        cpumask_t                        cpus_mask;
        void                                *migration_pending;
        unsigned short                        migration_disabled;
        unsigned short                        migration_flags;

#ifdef CONFIG_PREEMPT_RCU
        int                                rcu_read_lock_nesting;
        union rcu_special                rcu_read_unlock_special;
        struct list_head                rcu_node_entry;
        struct rcu_node                        *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
        unsigned long                        rcu_tasks_nvcsw;
        u8                                rcu_tasks_holdout;
        u8                                rcu_tasks_idx;
        int                                rcu_tasks_idle_cpu;
        struct list_head                rcu_tasks_holdout_list;
        int                                rcu_tasks_exit_cpu;
        struct list_head                rcu_tasks_exit_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_TRACE_RCU
        int                                trc_reader_nesting;
        int                                trc_ipi_to_cpu;
        union rcu_special                trc_reader_special;
        struct list_head                trc_holdout_list;
        struct list_head                trc_blkd_node;
        int                                trc_blkd_cpu;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */

        struct sched_info                sched_info;

        struct list_head                tasks;
        struct plist_node                pushable_tasks;
        struct rb_node                        pushable_dl_tasks;

        struct mm_struct                *mm;
        struct mm_struct                *active_mm;
        struct address_space                *faults_disabled_mapping;

        int                                exit_state;
        int                                exit_code;
        int                                exit_signal;
        /* The signal sent when the parent dies: */
        int                                pdeath_signal;
        /* JOBCTL_*, siglock protected: */
        unsigned long                        jobctl;

        /* Used for emulating ABI behavior of previous Linux versions: */
        unsigned int                        personality;

        /* Scheduler bits, serialized by scheduler locks: */
        unsigned                        sched_reset_on_fork:1;
        unsigned                        sched_contributes_to_load:1;
        unsigned                        sched_migrated:1;
        unsigned                        sched_task_hot:1;

        /* Force alignment to the next boundary: */
        unsigned                        :0;

        /* Unserialized, strictly 'current' */

        /*
         * This field must not be in the scheduler word above due to wakelist
         * queueing no longer being serialized by p->on_cpu. However:
         *
         * p->XXX = X;                        ttwu()
         * schedule()                          if (p->on_rq && ..) // false
         *   smp_mb__after_spinlock();          if (smp_load_acquire(&p->on_cpu) && //true
         *   deactivate_task()                      ttwu_queue_wakelist())
         *     p->on_rq = 0;                        p->sched_remote_wakeup = Y;
         *
         * guarantees all stores of 'current' are visible before
         * ->sched_remote_wakeup gets used, so it can be in this word.
         */
        unsigned                        sched_remote_wakeup:1;
#ifdef CONFIG_RT_MUTEXES
        unsigned                        sched_rt_mutex:1;
#endif

        /* Bit to tell TOMOYO we're in execve(): */
        unsigned                        in_execve:1;
        unsigned                        in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
        unsigned                        restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG_V1
        unsigned                        in_user_fault:1;
#endif
#ifdef CONFIG_LRU_GEN
        /* whether the LRU algorithm may apply to this access */
        unsigned                        in_lru_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
        unsigned                        brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
        /* disallow userland-initiated cgroup migration */
        unsigned                        no_cgroup_migration:1;
        /* task is frozen/stopped (used by the cgroup freezer) */
        unsigned                        frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
        unsigned                        use_memdelay:1;
#endif
#ifdef CONFIG_PSI
        /* Stalled due to lack of memory */
        unsigned                        in_memstall:1;
#endif
#ifdef CONFIG_PAGE_OWNER
        /* Used by page_owner=on to detect recursion in page tracking. */
        unsigned                        in_page_owner:1;
#endif
#ifdef CONFIG_EVENTFD
        /* Recursion prevention for eventfd_signal() */
        unsigned                        in_eventfd:1;
#endif
#ifdef CONFIG_ARCH_HAS_CPU_PASID
        unsigned                        pasid_activated:1;
#endif
#ifdef CONFIG_X86_BUS_LOCK_DETECT
        unsigned                        reported_split_lock:1;
#endif
#ifdef CONFIG_TASK_DELAY_ACCT
        /* delay due to memory thrashing */
        unsigned                        in_thrashing:1;
#endif
        unsigned                        in_nf_duplicate:1;
#ifdef CONFIG_PREEMPT_RT
        struct netdev_xmit                net_xmit;
#endif
        unsigned long                        atomic_flags; /* Flags requiring atomic access. */

        struct restart_block                restart_block;

        pid_t                                pid;
        pid_t                                tgid;

#ifdef CONFIG_STACKPROTECTOR
        /* Canary value for the -fstack-protector GCC feature: */
        unsigned long                        stack_canary;
#endif
        /*
         * Pointers to the (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with
         * p->real_parent->pid)
         */

        /* Real parent process: */
        struct task_struct __rcu        *real_parent;

        /* Recipient of SIGCHLD, wait4() reports: */
        struct task_struct __rcu        *parent;

        /*
         * Children/sibling form the list of natural children:
         */
        struct list_head                children;
        struct list_head                sibling;
        struct task_struct                *group_leader;

        /*
         * 'ptraced' is the list of tasks this task is using ptrace() on.
         *
         * This includes both natural children and PTRACE_ATTACH targets.
         * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
         */
        struct list_head                ptraced;
        struct list_head                ptrace_entry;

        /* PID/PID hash table linkage. */
        struct pid                        *thread_pid;
        struct hlist_node                pid_links[PIDTYPE_MAX];
        struct list_head                thread_node;

        struct completion                *vfork_done;

        /* CLONE_CHILD_SETTID: */
        int __user                        *set_child_tid;

        /* CLONE_CHILD_CLEARTID: */
        int __user                        *clear_child_tid;

        /* PF_KTHREAD | PF_IO_WORKER */
        void                                *worker_private;

        u64                                utime;
        u64                                stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        u64                                utimescaled;
        u64                                stimescaled;
#endif
        u64                                gtime;
        struct prev_cputime                prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        struct vtime                        vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
        atomic_t                        tick_dep_mask;
#endif
        /* Context switch counts: */
        unsigned long                        nvcsw;
        unsigned long                        nivcsw;

        /* Monotonic time in nsecs: */
        u64                                start_time;

        /* Boot based time in nsecs: */
        u64                                start_boottime;

        /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
        unsigned long                        min_flt;
        unsigned long                        maj_flt;

        /* Empty if CONFIG_POSIX_CPUTIMERS=n */
        struct posix_cputimers                posix_cputimers;

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
        struct posix_cputimers_work        posix_cputimers_work;
#endif

        /* Process credentials: */

        /* Tracer's credentials at attach: */
        const struct cred __rcu                *ptracer_cred;

        /* Objective and real subjective task credentials (COW): */
        const struct cred __rcu                *real_cred;

        /* Effective (overridable) subjective task credentials (COW): */
        const struct cred __rcu                *cred;

#ifdef CONFIG_KEYS
        /* Cached requested key. */
        struct key                        *cached_requested_key;
#endif

        /*
         * executable name, excluding path.
         *
         * - normally initialized begin_new_exec()
         * - set it with set_task_comm()
         *   - strscpy_pad() to ensure it is always NUL-terminated and
         *     zero-padded
         *   - task_lock() to ensure the operation is atomic and the name is
         *     fully updated.
         */
        char                                comm[TASK_COMM_LEN];

        struct nameidata                *nameidata;

#ifdef CONFIG_SYSVIPC
        struct sysv_sem                        sysvsem;
        struct sysv_shm                        sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
        unsigned long                        last_switch_count;
        unsigned long                        last_switch_time;
#endif
        /* Filesystem information: */
        struct fs_struct                *fs;

        /* Open file information: */
        struct files_struct                *files;

#ifdef CONFIG_IO_URING
        struct io_uring_task                *io_uring;
#endif

        /* Namespaces: */
        struct nsproxy                        *nsproxy;

        /* Signal handlers: */
        struct signal_struct                *signal;
        struct sighand_struct __rcu                *sighand;
        sigset_t                        blocked;
        sigset_t                        real_blocked;
        /* Restored if set_restore_sigmask() was used: */
        sigset_t                        saved_sigmask;
        struct sigpending                pending;
        unsigned long                        sas_ss_sp;
        size_t                                sas_ss_size;
        unsigned int                        sas_ss_flags;

        struct callback_head                *task_works;

#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
        struct audit_context                *audit_context;
#endif
        kuid_t                                loginuid;
        unsigned int                        sessionid;
#endif
        struct seccomp                        seccomp;
        struct syscall_user_dispatch        syscall_dispatch;

        /* Thread group tracking: */
        u64                                parent_exec_id;
        u64                                self_exec_id;

        /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
        spinlock_t                        alloc_lock;

        /* Protection of the PI data structures: */
        raw_spinlock_t                        pi_lock;

        struct wake_q_node                wake_q;

#ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task: */
        struct rb_root_cached                pi_waiters;
        /* Updated under owner's pi_lock and rq lock */
        struct task_struct                *pi_top_task;
        /* Deadlock detection and priority inheritance handling: */
        struct rt_mutex_waiter                *pi_blocked_on;
#endif

        struct mutex                        *blocked_on;        /* lock we're blocked on */

#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
        /*
         * Encoded lock address causing task block (lower 2 bits = type from
         * <linux/hung_task.h>). Accessed via hung_task_*() helpers.
         */
        unsigned long                        blocker;
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        int                                non_block_count;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                irqtrace;
        unsigned int                        hardirq_threaded;
        u64                                hardirq_chain_key;
        int                                softirqs_enabled;
        int                                softirq_context;
        int                                irq_config;
#endif
#ifdef CONFIG_PREEMPT_RT
        int                                softirq_disable_cnt;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH                        48UL
        u64                                curr_chain_key;
        int                                lockdep_depth;
        unsigned int                        lockdep_recursion;
        struct held_lock                held_locks[MAX_LOCK_DEPTH];
#endif

#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
        unsigned int                        in_ubsan;
#endif

        /* Journalling filesystem info: */
        void                                *journal_info;

        /* Stacked block device info: */
        struct bio_list                        *bio_list;

        /* Stack plugging: */
        struct blk_plug                        *plug;

        /* VM state: */
        struct reclaim_state                *reclaim_state;

        struct io_context                *io_context;

#ifdef CONFIG_COMPACTION
        struct capture_control                *capture_control;
#endif
        /* Ptrace state: */
        unsigned long                        ptrace_message;
        kernel_siginfo_t                *last_siginfo;

        struct task_io_accounting        ioac;
#ifdef CONFIG_PSI
        /* Pressure stall state */
        unsigned int                        psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
        /* Accumulated RSS usage: */
        u64                                acct_rss_mem1;
        /* Accumulated virtual memory usage: */
        u64                                acct_vm_mem1;
        /* stime + utime since last update: */
        u64                                acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
        /* Protected by ->alloc_lock: */
        nodemask_t                        mems_allowed;
        /* Sequence number to catch updates: */
        seqcount_spinlock_t                mems_allowed_seq;
        int                                cpuset_mem_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
        /* Control Group info protected by css_set_lock: */
        struct css_set __rcu                *cgroups;
        /* cg_list protected by css_set_lock and tsk->alloc_lock: */
        struct list_head                cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
        u32                                closid;
        u32                                rmid;
#endif
#ifdef CONFIG_FUTEX
        struct robust_list_head __user        *robust_list;
#ifdef CONFIG_COMPAT
        struct compat_robust_list_head __user *compat_robust_list;
#endif
        struct list_head                pi_state_list;
        struct futex_pi_state                *pi_state_cache;
        struct mutex                        futex_exit_mutex;
        unsigned int                        futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
        u8                                perf_recursion[PERF_NR_CONTEXTS];
        struct perf_event_context        *perf_event_ctxp;
        struct mutex                        perf_event_mutex;
        struct list_head                perf_event_list;
        struct perf_ctx_data __rcu        *perf_ctx_data;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
        unsigned long                        preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
        /* Protected by alloc_lock: */
        struct mempolicy                *mempolicy;
        short                                il_prev;
        u8                                il_weight;
        short                                pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
        int                                numa_scan_seq;
        unsigned int                        numa_scan_period;
        unsigned int                        numa_scan_period_max;
        int                                numa_preferred_nid;
        unsigned long                        numa_migrate_retry;
        /* Migration stamp: */
        u64                                node_stamp;
        u64                                last_task_numa_placement;
        u64                                last_sum_exec_runtime;
        struct callback_head                numa_work;

        /*
         * This pointer is only modified for current in syscall and
         * pagefault context (and for tasks being destroyed), so it can be read
         * from any of the following contexts:
         *  - RCU read-side critical section
         *  - current->numa_group from everywhere
         *  - task's runqueue locked, task not running
         */
        struct numa_group __rcu                *numa_group;

        /*
         * numa_faults is an array split into four regions:
         * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
         * in this precise order.
         *
         * faults_memory: Exponential decaying average of faults on a per-node
         * basis. Scheduling placement decisions are made based on these
         * counts. The values remain static for the duration of a PTE scan.
         * faults_cpu: Track the nodes the process was running on when a NUMA
         * hinting fault was incurred.
         * faults_memory_buffer and faults_cpu_buffer: Record faults per node
         * during the current scan window. When the scan completes, the counts
         * in faults_memory and faults_cpu decay and these values are copied.
         */
        unsigned long                        *numa_faults;
        unsigned long                        total_numa_faults;

        /*
         * numa_faults_locality tracks if faults recorded during the last
         * scan window were remote/local or failed to migrate. The task scan
         * period is adapted based on the locality of the faults with different
         * weights depending on whether they were shared or private faults
         */
        unsigned long                        numa_faults_locality[3];

        unsigned long                        numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
        struct rseq __user *rseq;
        u32 rseq_len;
        u32 rseq_sig;
        /*
         * RmW on rseq_event_mask must be performed atomically
         * with respect to preemption.
         */
        unsigned long rseq_event_mask;
# ifdef CONFIG_DEBUG_RSEQ
        /*
         * This is a place holder to save a copy of the rseq fields for
         * validation of read-only fields. The struct rseq has a
         * variable-length array at the end, so it cannot be used
         * directly. Reserve a size large enough for the known fields.
         */
        char                                rseq_fields[sizeof(struct rseq)];
# endif
#endif

#ifdef CONFIG_SCHED_MM_CID
        int                                mm_cid;                /* Current cid in mm */
        int                                last_mm_cid;        /* Most recent cid in mm */
        int                                migrate_from_cpu;
        int                                mm_cid_active;        /* Whether cid bitmap is active */
        struct callback_head                cid_work;
#endif

        struct tlbflush_unmap_batch        tlb_ubc;

        /* Cache last used pipe for splice(): */
        struct pipe_inode_info                *splice_pipe;

        struct page_frag                task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
        struct task_delay_info                *delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
        int                                make_it_fail;
        unsigned int                        fail_nth;
#endif
        /*
         * When (nr_dirtied >= nr_dirtied_pause), it's time to call
         * balance_dirty_pages() for a dirty throttling pause:
         */
        int                                nr_dirtied;
        int                                nr_dirtied_pause;
        /* Start of a write-and-pause period: */
        unsigned long                        dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
        int                                latency_record_count;
        struct latency_record                latency_record[LT_SAVECOUNT];
#endif
        /*
         * Time slack values; these are used to round up poll() and
         * select() etc timeout values. These are in nanoseconds.
         */
        u64                                timer_slack_ns;
        u64                                default_timer_slack_ns;

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
        unsigned int                        kasan_depth;
#endif

#ifdef CONFIG_KCSAN
        struct kcsan_ctx                kcsan_ctx;
#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                kcsan_save_irqtrace;
#endif
#ifdef CONFIG_KCSAN_WEAK_MEMORY
        int                                kcsan_stack_depth;
#endif
#endif

#ifdef CONFIG_KMSAN
        struct kmsan_ctx                kmsan_ctx;
#endif

#if IS_ENABLED(CONFIG_KUNIT)
        struct kunit                        *kunit_test;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        /* Index of current stored address in ret_stack: */
        int                                curr_ret_stack;
        int                                curr_ret_depth;

        /* Stack of return addresses for return function tracing: */
        unsigned long                        *ret_stack;

        /* Timestamp for last schedule: */
        unsigned long long                ftrace_timestamp;
        unsigned long long                ftrace_sleeptime;

        /*
         * Number of functions that haven't been traced
         * because of depth overrun:
         */
        atomic_t                        trace_overrun;

        /* Pause tracing: */
        atomic_t                        tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
        /* Bitmask and counter of trace recursion: */
        unsigned long                        trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
        /* See kernel/kcov.c for more details. */

        /* Coverage collection mode enabled for this task (0 if disabled): */
        unsigned int                        kcov_mode;

        /* Size of the kcov_area: */
        unsigned int                        kcov_size;

        /* Buffer for coverage collection: */
        void                                *kcov_area;

        /* KCOV descriptor wired with this task or NULL: */
        struct kcov                        *kcov;

        /* KCOV common handle for remote coverage collection: */
        u64                                kcov_handle;

        /* KCOV sequence number: */
        int                                kcov_sequence;

        /* Collect coverage from softirq context: */
        unsigned int                        kcov_softirq;
#endif

#ifdef CONFIG_MEMCG_V1
        struct mem_cgroup                *memcg_in_oom;
#endif

#ifdef CONFIG_MEMCG
        /* Number of pages to reclaim on returning to userland: */
        unsigned int                        memcg_nr_pages_over_high;

        /* Used by memcontrol for targeted memcg charge: */
        struct mem_cgroup                *active_memcg;

        /* Cache for current->cgroups->memcg->objcg lookups: */
        struct obj_cgroup                *objcg;
#endif

#ifdef CONFIG_BLK_CGROUP
        struct gendisk                        *throttle_disk;
#endif

#ifdef CONFIG_UPROBES
        struct uprobe_task                *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
        unsigned int                        sequential_io;
        unsigned int                        sequential_io_avg;
#endif
        struct kmap_ctrl                kmap_ctrl;
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long                        task_state_change;
# ifdef CONFIG_PREEMPT_RT
        unsigned long                        saved_state_change;
# endif
#endif
        struct rcu_head                        rcu;
        refcount_t                        rcu_users;
        int                                pagefault_disabled;
#ifdef CONFIG_MMU
        struct task_struct                *oom_reaper_list;
        struct timer_list                oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
        struct vm_struct                *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /* A live task holds one reference: */
        refcount_t                        stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
        int patch_state;
#endif
#ifdef CONFIG_SECURITY
        /* Used by LSM modules for access restriction: */
        void                                *security;
#endif
#ifdef CONFIG_BPF_SYSCALL
        /* Used by BPF task local storage */
        struct bpf_local_storage __rcu        *bpf_storage;
        /* Used for BPF run context */
        struct bpf_run_ctx                *bpf_ctx;
#endif
        /* Used by BPF for per-TASK xdp storage */
        struct bpf_net_context                *bpf_net_context;

#ifdef CONFIG_KSTACK_ERASE
        unsigned long                        lowest_stack;
#endif
#ifdef CONFIG_KSTACK_ERASE_METRICS
        unsigned long                        prev_lowest_stack;
#endif

#ifdef CONFIG_X86_MCE
        void __user                        *mce_vaddr;
        __u64                                mce_kflags;
        u64                                mce_addr;
        __u64                                mce_ripv : 1,
                                        mce_whole_page : 1,
                                        __mce_reserved : 62;
        struct callback_head                mce_kill_me;
        int                                mce_count;
#endif

#ifdef CONFIG_KRETPROBES
        struct llist_head               kretprobe_instances;
#endif
#ifdef CONFIG_RETHOOK
        struct llist_head               rethooks;
#endif

#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
        /*
         * If L1D flush is supported on mm context switch
         * then we use this callback head to queue kill work
         * to kill tasks that are not running on SMT disabled
         * cores
         */
        struct callback_head                l1d_flush_kill;
#endif

#ifdef CONFIG_RV
        /*
         * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS.
         * If memory becomes a concern, we can think about a dynamic method.
         */
        union rv_task_monitor                rv[CONFIG_RV_PER_TASK_MONITORS];
#endif

#ifdef CONFIG_USER_EVENTS
        struct user_event_mm                *user_event_mm;
#endif

#ifdef CONFIG_UNWIND_USER
        struct unwind_task_info                unwind_info;
#endif

        /* CPU-specific state of this task: */
        struct thread_struct                thread;

        /*
         * New fields for task_struct should be added above here, so that
         * they are included in the randomized portion of task_struct.
         */
        randomized_struct_fields_end
} __attribute__ ((aligned (64)));

#ifdef CONFIG_SCHED_PROXY_EXEC
/* Static key gating proxy execution; declared as a default-true key. */
DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec);
/*
 * sched_proxy_exec - report whether the __sched_proxy_exec static key is on.
 *
 * Uses static_branch_likely(), i.e. the enabled case is the expected one.
 */
static inline bool sched_proxy_exec(void)
{
        return static_branch_likely(&__sched_proxy_exec);
}
#else
/* Without CONFIG_SCHED_PROXY_EXEC this is a compile-time constant false. */
static inline bool sched_proxy_exec(void)
{
        return false;
}
#endif

/*
 * TASK_REPORT_IDLE is the value __task_state_index() substitutes for tasks
 * whose state contains all TASK_IDLE bits. TASK_REPORT_MAX is twice that
 * value; __task_state_index() asserts at build time that it is a power of 2.
 */
#define TASK_REPORT_IDLE        (TASK_REPORT + 1)
#define TASK_REPORT_MAX                (TASK_REPORT_IDLE << 1)

/*
 * Map a task state / exit state pair to a small index, computed as fls() of
 * the state value chosen for reporting.
 *
 * Selection, highest precedence first:
 *  1. TASK_RTLOCK_WAIT or TASK_FROZEN set in @tsk_state: TASK_UNINTERRUPTIBLE.
 *  2. all TASK_IDLE bits set in @tsk_state: TASK_REPORT_IDLE.
 *  3. otherwise (@tsk_state | @tsk_exit_state) masked with TASK_REPORT.
 */
static inline unsigned int __task_state_index(unsigned int tsk_state,
                                              unsigned int tsk_exit_state)
{
        BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

        /*
         * We're lying here, but rather than expose a completely new task state
         * to userspace, we can make this appear as if the task has gone through
         * a regular rt_mutex_lock() call.
         * Report frozen tasks as uninterruptible.
         */
        if (tsk_state & (TASK_RTLOCK_WAIT | TASK_FROZEN))
                return fls(TASK_UNINTERRUPTIBLE);

        /* Idle is only reported when every TASK_IDLE bit is present. */
        if ((tsk_state & TASK_IDLE) == TASK_IDLE)
                return fls(TASK_REPORT_IDLE);

        return fls((tsk_state | tsk_exit_state) & TASK_REPORT);
}

/*
 * Compute the state index of @tsk from a READ_ONCE() snapshot of its
 * __state and its exit_state.
 */
static inline unsigned int task_state_index(struct task_struct *tsk)
{
        unsigned int tsk_state = READ_ONCE(tsk->__state);

        return __task_state_index(tsk_state, tsk->exit_state);
}

/*
 * Translate a state index (see task_state_index()) into its one-letter code.
 * The BUILD_BUG_ON() ties the length of the letter table to TASK_REPORT_MAX
 * so that the two cannot drift apart unnoticed.
 */
static inline char task_index_to_char(unsigned int state)
{
        static const char state_letters[] = "RSDTtXZPI";

        BUILD_BUG_ON(1 << (sizeof(state_letters) - 1) != TASK_REPORT_MAX * 2);

        return state_letters[state];
}

/* Return the one-letter state code for @tsk. */
static inline char task_state_to_char(struct task_struct *tsk)
{
        unsigned int idx = task_state_index(tsk);

        return task_index_to_char(idx);
}

extern struct pid *cad_pid;

/*
 * Per process flags: bit values tested against and stored in the task's
 * ->flags word (e.g. "(p)->flags & PF_USED_MATH").
 * The PF__HOLE__* names occupy bit positions that carry no described flag
 * in this list (presumably unassigned; verify before reusing one).
 */
#define PF_VCPU                        0x00000001        /* I'm a virtual CPU */
#define PF_IDLE                        0x00000002        /* I am an IDLE thread */
#define PF_EXITING                0x00000004        /* Getting shut down */
#define PF_POSTCOREDUMP                0x00000008        /* Coredumps should ignore this task */
#define PF_IO_WORKER                0x00000010        /* Task is an IO worker */
#define PF_WQ_WORKER                0x00000020        /* I'm a workqueue worker */
#define PF_FORKNOEXEC                0x00000040        /* Forked but didn't exec */
#define PF_MCE_PROCESS                0x00000080      /* Process policy on mce errors */
#define PF_SUPERPRIV                0x00000100        /* Used super-user privileges */
#define PF_DUMPCORE                0x00000200        /* Dumped core */
#define PF_SIGNALED                0x00000400        /* Killed by a signal */
#define PF_MEMALLOC                0x00000800        /* Allocating memory to free memory. See memalloc_noreclaim_save() */
#define PF_NPROC_EXCEEDED        0x00001000        /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH                0x00002000        /* If unset the fpu must be initialized before use */
#define PF_USER_WORKER                0x00004000        /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE                0x00008000        /* This thread should not be frozen */
#define PF_KCOMPACTD                0x00010000        /* I am kcompactd */
#define PF_KSWAPD                0x00020000        /* I am kswapd */
#define PF_MEMALLOC_NOFS        0x00040000        /* All allocations inherit GFP_NOFS. See memalloc_nofs_save() */
#define PF_MEMALLOC_NOIO        0x00080000        /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
#define PF_LOCAL_THROTTLE        0x00100000        /* Throttle writes only against the bdi I write to,
                                                 * I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD                0x00200000        /* I am a kernel thread */
#define PF_RANDOMIZE                0x00400000        /* Randomize virtual address space */
#define PF__HOLE__00800000        0x00800000
#define PF__HOLE__01000000        0x01000000
#define PF__HOLE__02000000        0x02000000
#define PF_NO_SETAFFINITY        0x04000000        /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY                0x08000000      /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN                0x10000000        /* Allocations constrained to zones which allow long term pinning.
                                                 * See memalloc_pin_save() */
#define PF_BLOCK_TS                0x20000000        /* plug has ts that needs updating */
#define PF__HOLE__40000000        0x40000000
#define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */

/*
 * Only the _current_ task can read/write to tsk->flags, but other
 * tasks can access tsk->flags in readonly mode for example
 * with tsk_used_math (like during threaded core dumping).
 * There is however an exception to this rule during ptrace
 * or during fork: the ptracer task is allowed to write to the
 * child->flags of its traced child (same goes for fork, the parent
 * can write to the child->flags), because we're guaranteed the
 * child is not running and in turn not changing child->flags
 * at the same time the parent does it.
 */
/* Clear PF_USED_MATH in @child's flags. */
#define clear_stopped_child_used_math(child)        do { (child)->flags &= ~PF_USED_MATH; } while (0)
/* Set PF_USED_MATH in @child's flags. */
#define set_stopped_child_used_math(child)        do { (child)->flags |= PF_USED_MATH; } while (0)
/* The two helpers above, applied to current. */
#define clear_used_math()                        clear_stopped_child_used_math(current)
#define set_used_math()                                set_stopped_child_used_math(current)

/*
 * Make PF_USED_MATH in @child's flags follow @condition: the bit is cleared
 * first and then set again only if @condition is non-zero.
 * Note that @child is evaluated twice.
 */
#define conditional_stopped_child_used_math(condition, child) \
        do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)

/* conditional_stopped_child_used_math() applied to current. */
#define conditional_used_math(condition)        conditional_stopped_child_used_math(condition, current)

/*
 * Copy current's PF_USED_MATH bit into @child's flags.
 * Note that @child is evaluated twice.
 */
#define copy_to_stopped_child_used_math(child) \
        do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)

/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p)                        ((p)->flags & PF_USED_MATH)
#define used_math()                                tsk_used_math(current)

/*
 * Return true when current has PF_NO_SETAFFINITY set and is allowed to run
 * on exactly one CPU.
 */
static __always_inline bool is_percpu_thread(void)
{
        if (!(current->flags & PF_NO_SETAFFINITY))
                return false;

        return current->nr_cpus_allowed == 1;
}

/*
 * Per-process atomic flags.
 *
 * These are bit numbers (not masks): the TASK_PFA_*() generated helpers pass
 * them to test_bit()/set_bit()/clear_bit() on the task's ->atomic_flags.
 */
#define PFA_NO_NEW_PRIVS                0        /* May not gain new privileges. */
#define PFA_SPREAD_PAGE                        1        /* Spread page cache over cpuset */
#define PFA_SPREAD_SLAB                        2        /* Spread some slab caches over cpuset */
#define PFA_SPEC_SSB_DISABLE                3        /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE        4        /* Speculative Store Bypass force disabled */
#define PFA_SPEC_IB_DISABLE                5        /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE        6        /* Indirect branch speculation permanently restricted */
#define PFA_SPEC_SSB_NOEXEC                7        /* Speculative Store Bypass clear on execve() */

/*
 * Generate "bool task_<func>(struct task_struct *p)", which tests bit
 * PFA_<name> in p->atomic_flags.
 */
#define TASK_PFA_TEST(name, func)                                        \
        static inline bool task_##func(struct task_struct *p)                \
        { return test_bit(PFA_##name, &p->atomic_flags); }

/*
 * Generate "void task_set_<func>(struct task_struct *p)", which sets bit
 * PFA_<name> in p->atomic_flags.
 */
#define TASK_PFA_SET(name, func)                                        \
        static inline void task_set_##func(struct task_struct *p)        \
        { set_bit(PFA_##name, &p->atomic_flags); }

/*
 * Generate "void task_clear_<func>(struct task_struct *p)", which clears bit
 * PFA_<name> in p->atomic_flags.
 */
#define TASK_PFA_CLEAR(name, func)                                        \
        static inline void task_clear_##func(struct task_struct *p)        \
        { clear_bit(PFA_##name, &p->atomic_flags); }

/*
 * Accessor instantiations for the PFA_* bits.
 *
 * NO_NEW_PRIVS, SPEC_SSB_FORCE_DISABLE and SPEC_IB_FORCE_DISABLE get only
 * test/set helpers: no task_clear_*() function is generated for them here.
 */
TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)

TASK_PFA_TEST(SPREAD_PAGE, spread_page)
TASK_PFA_SET(SPREAD_PAGE, spread_page)
TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)

TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)

TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)

TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)

/* Set-only: no clear helper. */
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)

TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)

/* Set-only: no clear helper. */
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)

/*
 * Restore the bits selected by @flags in current->flags to the values they
 * have in @orig_flags. Bits outside @flags are left untouched.
 */
static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
        unsigned long restored = orig_flags & flags;

        current->flags &= ~flags;
        current->flags |= restored;
}

extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);

/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);

/**
 * set_cpus_allowed_ptr - set CPU affinity mask of a task
 * @p: the task
 * @new_mask: CPU affinity mask
 *
 * Return: zero if successful, or a negative error code
 */
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
extern void release_user_cpus_ptr(struct task_struct *p);
extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);

extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);

/**
 * task_nice - nice value of a task
 * @p: the task to query.
 *
 * Derived from @p->static_prio via PRIO_TO_NICE().
 *
 * Return: The nice value [ -20 ... 0 ... 19 ].
 */
static inline int task_nice(const struct task_struct *p)
{
        int prio = p->static_prio;

        return PRIO_TO_NICE(prio);
}

extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);

/**
 * is_idle_task - is the specified task an idle task?
 * @p: the task in question.
 *
 * The test is whether PF_IDLE is set in @p->flags.
 *
 * Return: true if @p is an idle task, false otherwise.
 */
static __always_inline bool is_idle_task(const struct task_struct *p)
{
        return (p->flags & PF_IDLE) != 0;
}

extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);

void yield(void);

/*
 * Overlay of a task_struct, (optionally) a thread_info, and a stack area of
 * THREAD_SIZE bytes. All members start at the same address.
 */
union thread_union {
        struct task_struct task;
#ifndef CONFIG_THREAD_INFO_IN_TASK
        /* Only a separate member when thread_info is not embedded in the task. */
        struct thread_info thread_info;
#endif
        /* THREAD_SIZE bytes, expressed as an array of longs. */
        unsigned long stack[THREAD_SIZE/sizeof(long)];
};

#ifndef CONFIG_THREAD_INFO_IN_TASK
extern struct thread_info init_thread_info;
#endif

extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];

/*
 * task_thread_info - get the struct thread_info pointer for @task.
 *
 * With CONFIG_THREAD_INFO_IN_TASK the thread_info is a member of the task
 * itself; otherwise it is found at the address held in (task)->stack.
 */
#ifdef CONFIG_THREAD_INFO_IN_TASK
# define task_thread_info(task)        (&(task)->thread_info)
#else
# define task_thread_info(task)        ((struct thread_info *)(task)->stack)
#endif

/*
 * find a task by one of its numerical ids
 *
 * find_task_by_pid_ns():
 *      finds a task by its pid in the specified namespace
 * find_task_by_vpid():
 *      finds a task by its virtual pid
 *
 * see also find_vpid() etc in include/linux/pid.h
 */

extern struct task_struct *find_task_by_vpid(pid_t nr);
extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);

/*
 * find a task by its virtual pid and get the task struct
 */
extern struct task_struct *find_get_task_by_vpid(pid_t nr);

extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);

extern void kick_process(struct task_struct *tsk);

extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
/*
 * set_task_comm - set @tsk's comm from @from, with the exec argument of
 * __set_task_comm() fixed to false.
 *
 * The BUILD_BUG_ON() rejects any @from whose sizeof() is not exactly
 * TASK_COMM_LEN.
 */
#define set_task_comm(tsk, from) ({                        \
        BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN);        \
        __set_task_comm(tsk, from, false);                \
})

/*
 * get_task_comm - copy (tsk)->comm into @buf with strscpy_pad(); the
 * expression evaluates to @buf.
 *
 * - Why not use task_lock()?
 *   User space can randomly change their names anyway, so locking for readers
 *   doesn't make sense. For writers, locking is probably necessary, as a race
 *   condition could lead to long-term mixed results.
 *   The strscpy_pad() in __set_task_comm() can ensure that the task comm is
 *   always NUL-terminated and zero-padded. Therefore the race condition between
 *   reader and writer is not an issue.
 *
 * - BUILD_BUG_ON() can help prevent the buf from being truncated.
 *   Since the callers don't perform any return value checks, this safeguard is
 *   necessary. It requires sizeof(buf) to be at least TASK_COMM_LEN.
 */
#define get_task_comm(buf, tsk) ({                        \
        BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN);        \
        strscpy_pad(buf, (tsk)->comm);                        \
        buf;                                                \
})

/*
 * scheduler_ipi - work done on receipt of the reschedule IPI.
 *
 * The only action taken here is preempt_fold_need_resched().
 */
static __always_inline void scheduler_ipi(void)
{
        /*
         * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
         * TIF_NEED_RESCHED remotely (for the first time) will also send
         * this IPI.
         */
        preempt_fold_need_resched();
}

extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);

/*
 * Set thread flags in other task's structures.
 * See asm/thread_info.h for TIF_xxxx flags available:
 */
/* Set thread flag @flag in @tsk's thread_info. */
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        set_ti_thread_flag(ti, flag);
}

/* Clear thread flag @flag in @tsk's thread_info. */
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        clear_ti_thread_flag(ti, flag);
}

/* Forward @flag and @value to update_ti_thread_flag() for @tsk's thread_info. */
static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
                                          bool value)
{
        struct thread_info *ti = task_thread_info(tsk);

        update_ti_thread_flag(ti, flag, value);
}

/*
 * Test-and-set thread flag @flag in @tsk's thread_info; returns the result
 * of test_and_set_ti_thread_flag().
 */
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_and_set_ti_thread_flag(ti, flag);
}

/*
 * Test-and-clear thread flag @flag in @tsk's thread_info; returns the result
 * of test_and_clear_ti_thread_flag().
 */
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_and_clear_ti_thread_flag(ti, flag);
}

/* Test thread flag @flag in @tsk's thread_info. */
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_ti_thread_flag(ti, flag);
}

/*
 * Set TIF_NEED_RESCHED on @tsk.
 *
 * When the sched_set_need_resched_tp tracepoint is enabled, the trace hook
 * is called first, but only if the flag was not already set.
 */
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
        if (tracepoint_enabled(sched_set_need_resched_tp)) {
                if (!test_tsk_thread_flag(tsk, TIF_NEED_RESCHED))
                        __trace_set_need_resched(tsk, TIF_NEED_RESCHED);
        }

        set_tsk_thread_flag(tsk, TIF_NEED_RESCHED);
}

static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
        atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
                           (atomic_long_t *)&task_thread_info(tsk)->flags);
}

/*
 * Test TIF_NEED_RESCHED on @tsk; the result is annotated as unlikely to be
 * set.
 */
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
        int pending = test_tsk_thread_flag(tsk, TIF_NEED_RESCHED);

        return unlikely(pending);
}

/*
 * cond_resched() and cond_resched_lock(): latency reduction via
 * explicit rescheduling in places that are safe. The return
 * value indicates whether a reschedule was done in fact.
 * cond_resched_lock() will drop the spinlock before scheduling.
 *
 * _cond_resched() is selected at build time:
 *  - PREEMPT_DYNAMIC with HAVE_PREEMPT_DYNAMIC_CALL: invoked through the
 *    cond_resched static call (declared against __cond_resched).
 *  - PREEMPT_DYNAMIC with HAVE_PREEMPT_DYNAMIC_KEY: dynamic_cond_resched().
 *  - !PREEMPTION: a direct call to __cond_resched().
 *  - PREEMPTION without PREEMPT_DYNAMIC: a stub that always returns 0.
 */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
extern int __cond_resched(void);

#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)

DECLARE_STATIC_CALL(cond_resched, __cond_resched);

/* Static-call variant. */
static __always_inline int _cond_resched(void)
{
        return static_call_mod(cond_resched)();
}

#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)

extern int dynamic_cond_resched(void);

/* Static-key variant. */
static __always_inline int _cond_resched(void)
{
        return dynamic_cond_resched();
}

#else /* !CONFIG_PREEMPTION */

/* Non-preemptible kernel: always go through __cond_resched(). */
static inline int _cond_resched(void)
{
        return __cond_resched();
}

#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */

#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */

/* Stub: reports that no reschedule was done. */
static inline int _cond_resched(void)
{
        return 0;
}

#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */

/*
 * cond_resched - call __might_resched() with the caller's file/line and a
 * zero offsets argument, then evaluate to the return value of
 * _cond_resched().
 */
#define cond_resched() ({                        \
        __might_resched(__FILE__, __LINE__, 0);        \
        _cond_resched();                        \
})

extern int __cond_resched_lock(spinlock_t *lock);
extern int __cond_resched_rwlock_read(rwlock_t *lock);
extern int __cond_resched_rwlock_write(rwlock_t *lock);

/*
 * Layout of the offsets argument the cond_resched*() macros pass to
 * __might_resched(): MIGHT_RESCHED_PREEMPT_MASK covers the bits below
 * MIGHT_RESCHED_RCU_SHIFT, and the RT variant of
 * PREEMPT_LOCK_RESCHED_OFFSETS adds 1 << MIGHT_RESCHED_RCU_SHIFT on top.
 * NOTE(review): the decoding itself lives in __might_resched(); confirm the
 * exact field meaning there.
 */
#define MIGHT_RESCHED_RCU_SHIFT                8
#define MIGHT_RESCHED_PREEMPT_MASK        ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)

#ifndef CONFIG_PREEMPT_RT
/*
 * Non RT kernels have an elevated preempt count due to the held lock,
 * but are not allowed to be inside a RCU read side critical section
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS        PREEMPT_LOCK_OFFSET
#else
/*
 * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in
 * cond_resched*lock() has to take that into account because it checks for
 * preempt_count() and rcu_preempt_depth().
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS        \
        (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT))
#endif

/*
 * Lock-aware variants of cond_resched().  Each one runs the
 * __might_resched() check for the call site with
 * PREEMPT_LOCK_RESCHED_OFFSETS and then evaluates to the result of the
 * matching __cond_resched_*() helper called on @lock.
 */
#define cond_resched_lock(lock) ({                                                \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_lock(lock);                                                \
})

#define cond_resched_rwlock_read(lock) ({                                        \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_rwlock_read(lock);                                        \
})

#define cond_resched_rwlock_write(lock) ({                                        \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_rwlock_write(lock);                                        \
})

#ifndef CONFIG_PREEMPT_RT
/*
 * Return the mutex stored in @p->blocked_on, or NULL if none is set.
 * When a mutex is recorded, lockdep asserts (once) that its wait_lock is
 * held by the caller.
 */
static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
{
        struct mutex *lock = p->blocked_on;

        if (!lock)
                return NULL;

        lockdep_assert_held_once(&lock->wait_lock);
        return lock;
}

/*
 * Record in @p->blocked_on that task @p is blocked on mutex @m.
 *
 * Warns (once each) if @m is NULL, if @p is not the current task, or if
 * p->blocked_on already names a different mutex.  Lockdep asserts that
 * m->wait_lock is held; set_task_blocked_on() is the variant that takes
 * the lock itself.
 */
static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
{
        struct mutex *blocked_on = READ_ONCE(p->blocked_on);

        WARN_ON_ONCE(!m);
        /* The task should only be setting itself as blocked */
        WARN_ON_ONCE(p != current);
        /* Currently we serialize blocked_on under the mutex::wait_lock */
        lockdep_assert_held_once(&m->wait_lock);
        /*
         * Check to ensure we don't overwrite an existing mutex value
         * with a different mutex. Note, setting it to the same
         * lock repeatedly is ok.
         */
        WARN_ON_ONCE(blocked_on && blocked_on != m);
        WRITE_ONCE(p->blocked_on, m);
}

/*
 * Locked wrapper: hold m->wait_lock through the raw_spinlock_irqsave
 * scope guard for the duration of __set_task_blocked_on(@p, @m).
 */
static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
{
        guard(raw_spinlock_irqsave)(&m->wait_lock);
        __set_task_blocked_on(p, m);
}

/*
 * Reset @p->blocked_on to NULL.
 *
 * When @m is non-NULL, lockdep asserts that m->wait_lock is held and a
 * warning fires (once) if the recorded mutex is a different one.  With a
 * NULL @m those checks are skipped; the field is cleared either way.
 */
static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
{
        if (m) {
                struct mutex *blocked_on = READ_ONCE(p->blocked_on);

                /* Currently we serialize blocked_on under the mutex::wait_lock */
                lockdep_assert_held_once(&m->wait_lock);
                /*
                 * There may be cases where we re-clear already cleared
                 * blocked_on relationships, but make sure we are not
                 * clearing the relationship with a different lock.
                 */
                WARN_ON_ONCE(blocked_on && blocked_on != m);
        }
        WRITE_ONCE(p->blocked_on, NULL);
}

/*
 * Locked wrapper: hold m->wait_lock through the raw_spinlock_irqsave
 * scope guard for the duration of __clear_task_blocked_on(@p, @m).
 * @m is used to find the lock, so unlike the unlocked helper it must not
 * be NULL here.
 */
static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
{
        guard(raw_spinlock_irqsave)(&m->wait_lock);
        __clear_task_blocked_on(p, m);
}
#else
/* CONFIG_PREEMPT_RT build: empty stub, takes a struct rt_mutex instead. */
static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
{
}

/* CONFIG_PREEMPT_RT build: empty stub, takes a struct rt_mutex instead. */
static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
{
}
#endif /* !CONFIG_PREEMPT_RT */

/*
 * Report the result of tif_need_resched(); the unlikely() hint marks a
 * true result as the uncommon case.
 */
static __always_inline bool need_resched(void)
{
        return unlikely(tif_need_resched());
}

/*
 * Wrappers for p->thread_info->cpu access. No-op on UP.
 */
#ifdef CONFIG_SMP

/* CPU number stored in @p's thread_info, read with READ_ONCE(). */
static inline unsigned int task_cpu(const struct task_struct *p)
{
        return READ_ONCE(task_thread_info(p)->cpu);
}

extern void set_task_cpu(struct task_struct *p, unsigned int cpu);

#else

/* Without CONFIG_SMP every task reports CPU 0. */
static inline unsigned int task_cpu(const struct task_struct *p)
{
        return 0;
}

/* Without CONFIG_SMP this is an empty stub. */
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}

#endif /* CONFIG_SMP */

/*
 * A task counts as runnable when it is on a runqueue (p->on_rq) and its
 * scheduling entity is not flagged sched_delayed.
 */
static inline bool task_is_runnable(struct task_struct *p)
{
        if (!p->on_rq)
                return false;

        return !p->se.sched_delayed;
}

extern bool sched_task_on_rq(struct task_struct *p);
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);

/*
 * In order to reduce various lock holder preemption latencies provide an
 * interface to see if a vCPU is currently running or not.
 *
 * This allows us to terminate optimistic spin loops and block, analogous to
 * the native optimistic spin heuristic of testing if the lock owner task is
 * running or not.
 */
#ifndef vcpu_is_preempted
/*
 * Fallback used when nothing else has defined vcpu_is_preempted:
 * no CPU is ever reported as preempted.
 */
static inline bool vcpu_is_preempted(int cpu)
{
        return false;
}
#endif

extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);

#ifndef TASK_SIZE_OF
#define TASK_SIZE_OF(tsk)        TASK_SIZE
#endif

/*
 * Lock-holder-preemption check for spinners: true only while @owner is
 * on a CPU and that CPU is not reported as a preempted vCPU.
 */
static inline bool owner_on_cpu(struct task_struct *owner)
{
        /* Owner is not on a CPU: skip spinning. */
        if (!READ_ONCE(owner->on_cpu))
                return false;

        /* Owner is on a CPU: spin only if that CPU is not preempted. */
        return !vcpu_is_preempted(task_cpu(owner));
}

/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu);

#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
                                unsigned long uaddr);
extern int sched_core_idle_cpu(int cpu);
#else
/*
 * Without CONFIG_SCHED_CORE the free/fork hooks are empty and
 * sched_core_idle_cpu() is plain idle_cpu().  No stub is provided for
 * sched_core_share_pid() in this branch.
 */
static inline void sched_core_free(struct task_struct *tsk) { }
static inline void sched_core_fork(struct task_struct *p) { }
static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
#endif

extern void sched_set_stop_task(int cpu, struct task_struct *stop);

#ifdef CONFIG_MEM_ALLOC_PROFILING
/*
 * Install @tag as current->alloc_tag and return the tag that was
 * installed before, so the caller can hand it to alloc_tag_restore().
 */
static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
{
        struct alloc_tag *prev = current->alloc_tag;

        current->alloc_tag = tag;
        return prev;
}

/*
 * Put @old back as current->alloc_tag.  With
 * CONFIG_MEM_ALLOC_PROFILING_DEBUG, first warn if current->alloc_tag is
 * no longer @tag.
 */
static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
{
#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
        WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
#endif
        current->alloc_tag = old;
}
#else
#define alloc_tag_save(_tag)                        NULL
#define alloc_tag_restore(_tag, _old)                do {} while (0)
#endif

#ifndef MODULE
#ifndef COMPILE_OFFSETS

extern void ___migrate_enable(void);

struct rq;
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

/*
 * The "struct rq" is not available here, so we can't access the
 * "runqueues" with this_cpu_ptr(), as the compilation will fail in
 * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
 *   typeof((ptr) + 0)
 *
 * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here.
 */
#ifdef CONFIG_SMP
#define this_rq_raw() arch_raw_cpu_ptr(&runqueues)
#else
#define this_rq_raw() PERCPU_PTR(&runqueues)
#endif
/*
 * Unsigned int lvalue located RQ_nr_pinned bytes past this_rq_raw()
 * (presumably rq->nr_pinned, going by the offset's name - confirm against
 * the asm-offsets definition).
 */
#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned))

/*
 * Drop one level of migration-disable for the current task.
 *
 * Nested levels only decrement current->migration_disabled.  The outermost
 * level runs under guard(preempt): it calls ___migrate_enable() when
 * cpus_ptr does not point at the task's own cpus_mask, then clears
 * migration_disabled and decrements this_rq_pinned().
 */
static inline void __migrate_enable(void)
{
        struct task_struct *p = current;

#ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Check both overflow from migrate_disable() and superfluous
         * migrate_enable().
         */
        if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
                return;
#endif

        if (p->migration_disabled > 1) {
                p->migration_disabled--;
                return;
        }

        /*
         * Ensure stop_task runs either before or after this, and that
         * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
         */
        guard(preempt)();
        if (unlikely(p->cpus_ptr != &p->cpus_mask))
                ___migrate_enable();
        /*
         * Mustn't clear migration_disabled() until cpus_ptr points back at the
         * regular cpus_mask, otherwise things that race (eg.
         * select_fallback_rq) get confused.
         */
        barrier();
        p->migration_disabled = 0;
        this_rq_pinned()--;
}

/*
 * Take one level of migration-disable for the current task.
 *
 * If the task is already migration-disabled, only the nesting counter is
 * incremented.  Otherwise, under guard(preempt), this_rq_pinned() is
 * incremented and migration_disabled is set to 1.
 */
static inline void __migrate_disable(void)
{
        struct task_struct *p = current;

        if (p->migration_disabled) {
#ifdef CONFIG_DEBUG_PREEMPT
                /*
                 * Warn about overflow half-way through the range.
                 */
                WARN_ON_ONCE((s16)p->migration_disabled < 0);
#endif
                p->migration_disabled++;
                return;
        }

        guard(preempt)();
        this_rq_pinned()++;
        p->migration_disabled = 1;
}
#else /* !COMPILE_OFFSETS */
/* Empty variants used when COMPILE_OFFSETS is defined. */
static inline void __migrate_disable(void) { }
static inline void __migrate_enable(void) { }
#endif /* !COMPILE_OFFSETS */

/*
 * So that it is possible to not export the runqueues variable, define and
 * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use
 * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will
 * be defined in kernel/sched/core.c.
 */
#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE
/* Inline entry point: forwards to __migrate_disable(). */
static __always_inline void migrate_disable(void)
{
        __migrate_disable();
}

/* Inline entry point: forwards to __migrate_enable(). */
static __always_inline void migrate_enable(void)
{
        __migrate_enable();
}
#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
/* Only declared here; the including file provides the definitions. */
extern void migrate_disable(void);
extern void migrate_enable(void);
#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */

#else /* MODULE */
extern void migrate_disable(void);
extern void migrate_enable(void);
#endif /* MODULE */

/* Guard class "migrate", built from the migrate_disable()/migrate_enable() pair. */
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())

#endif





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timestamp

#if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMESTAMP_H

#include <linux/tracepoint.h>
#include <linux/fs.h>

#define CTIME_QUERIED_FLAGS \
        { I_CTIME_QUERIED, "Q" }

/*
 * Event class for tracepoints that report an inode together with a ctime
 * value.  Each record stores the superblock's device number, the inode
 * number and generation, and the seconds and nanoseconds of @ctime.
 *
 * Output: "ino=<major>:<minor>:<ino>:<gen> ctime=<sec>.<nsec>".
 * The inode number is unsigned, so it is cast to unsigned long and printed
 * with %lu; printing it with %ld showed large inode numbers as negative.
 * NOTE: <nsec> is printed as a plain unsigned value without zero padding,
 * so it is not a decimal fraction of <sec>.
 */
DECLARE_EVENT_CLASS(ctime,
        TP_PROTO(struct inode *inode,
                 struct timespec64 *ctime),

        TP_ARGS(inode, ctime),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(time64_t,        ctime_s)
                __field(u32,                ctime_ns)
                __field(u32,                gen)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->gen                = inode->i_generation;
                __entry->ctime_s        = ctime->tv_sec;
                __entry->ctime_ns        = ctime->tv_nsec;
        ),

        TP_printk("ino=%d:%d:%lu:%u ctime=%lld.%u",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long)__entry->ino, __entry->gen,
                __entry->ctime_s, __entry->ctime_ns
        )
);

/* Events sharing the "ctime" class layout and output format. */
DEFINE_EVENT(ctime, inode_set_ctime_to_ts,
                TP_PROTO(struct inode *inode,
                         struct timespec64 *ctime),
                TP_ARGS(inode, ctime));

DEFINE_EVENT(ctime, ctime_xchg_skip,
                TP_PROTO(struct inode *inode,
                         struct timespec64 *ctime),
                TP_ARGS(inode, ctime));

/*
 * ctime_ns_xchg: records an inode (device, number, generation) together
 * with three u32 values named old, new and cur.
 *
 * Output: "ino=<major>:<minor>:<ino>:<gen> old=<v>:<Q> new=<v> cur=<v>:<Q>".
 * For old and cur the I_CTIME_QUERIED bit is masked out of the number and
 * shown separately through CTIME_QUERIED_FLAGS; new is printed unchanged.
 * The inode number is unsigned, so it is cast to unsigned long and printed
 * with %lu; printing it with %ld showed large inode numbers as negative.
 */
TRACE_EVENT(ctime_ns_xchg,
        TP_PROTO(struct inode *inode,
                 u32 old,
                 u32 new,
                 u32 cur),

        TP_ARGS(inode, old, new, cur),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(u32,                gen)
                __field(u32,                old)
                __field(u32,                new)
                __field(u32,                cur)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->gen                = inode->i_generation;
                __entry->old                = old;
                __entry->new                = new;
                __entry->cur                = cur;
        ),

        TP_printk("ino=%d:%d:%lu:%u old=%u:%s new=%u cur=%u:%s",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long)__entry->ino, __entry->gen,
                __entry->old & ~I_CTIME_QUERIED,
                __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS),
                __entry->new,
                __entry->cur & ~I_CTIME_QUERIED,
                __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS)
        )
);

/*
 * fill_mg_cmtime: records an inode (device, number, generation) together
 * with the seconds and nanoseconds of both @ctime and @mtime.
 *
 * Output: "ino=<major>:<minor>:<ino>:<gen> ctime=<sec>.<nsec> mtime=<sec>.<nsec>".
 * The inode number is unsigned, so it is cast to unsigned long and printed
 * with %lu; printing it with %ld showed large inode numbers as negative.
 * NOTE: <nsec> is printed as a plain unsigned value without zero padding,
 * so it is not a decimal fraction of <sec>.
 */
TRACE_EVENT(fill_mg_cmtime,
        TP_PROTO(struct inode *inode,
                 struct timespec64 *ctime,
                 struct timespec64 *mtime),

        TP_ARGS(inode, ctime, mtime),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(time64_t,        ctime_s)
                __field(time64_t,        mtime_s)
                __field(u32,                ctime_ns)
                __field(u32,                mtime_ns)
                __field(u32,                gen)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->gen                = inode->i_generation;
                __entry->ctime_s        = ctime->tv_sec;
                __entry->mtime_s        = mtime->tv_sec;
                __entry->ctime_ns        = ctime->tv_nsec;
                __entry->mtime_ns        = mtime->tv_nsec;
        ),

        TP_printk("ino=%d:%d:%lu:%u ctime=%lld.%u mtime=%lld.%u",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long)__entry->ino, __entry->gen,
                __entry->ctime_s, __entry->ctime_ns,
                __entry->mtime_s, __entry->mtime_ns
        )
);
#endif /* _TRACE_TIMESTAMP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

























































































  316 
  319 





























































  318 


























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_H
#define _LINUX_PID_H

#include <linux/pid_types.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/wait.h>

/*
 * What is struct pid?
 *
 * A struct pid is the kernel's internal notion of a process identifier.
 * It refers to individual tasks, process groups, and sessions.  While
 * there are processes attached to it the struct pid lives in a hash
 * table, so it and then the processes that it refers to can be found
 * quickly from the numeric pid value.  The attached processes may be
 * quickly accessed by following pointers from struct pid.
 *
 * Storing pid_t values in the kernel and referring to them later has a
 * problem.  The process originally with that pid may have exited and the
 * pid allocator wrapped, and another process could have come along
 * and been assigned that pid.
 *
 * Referring to user space processes by holding a reference to struct
 * task_struct has a problem.  When the user space process exits
 * the now useless task_struct is still kept.  A task_struct plus a
 * stack consumes around 10K of low kernel memory.  More precisely
 * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
 * a struct pid is about 64 bytes.
 *
 * Holding a reference to struct pid solves both of these problems.
 * It is small so holding a reference does not consume a lot of
 * resources, and since a new struct pid is allocated when the numeric pid
 * value is reused (when pids wrap around) we don't mistakenly refer to new
 * processes.
 */


/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in a particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */

#define RESERVED_PIDS 300

struct pidfs_attr;

/* One (number, namespace) pair: the id @nr that a struct pid has in @ns. */
struct upid {
        int nr;                                /* numeric id within @ns */
        struct pid_namespace *ns;        /* namespace @nr belongs to */
};

/*
 * struct pid - the kernel's internal notion of a process identifier.
 *
 * @numbers is a flexible array of struct upid entries: numbers[0] is what
 * pid_nr() reports, and numbers[level] is the entry that ns_of_pid() and
 * is_child_reaper() consult.
 */
struct pid {
        refcount_t count;        /* reference count, incremented by get_pid() */
        unsigned int level;        /* index into numbers[] of the pid's own entry */
        spinlock_t lock;
        /* presumably pidfs bookkeeping, going by the member names - confirm */
        struct {
                u64 ino;
                struct rb_node pidfs_node;
                struct dentry *stashed;
                struct pidfs_attr *attr;
        };
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
        struct hlist_head inodes;
        /* wait queue for pidfd notifications */
        wait_queue_head_t wait_pidfd;
        struct rcu_head rcu;
        struct upid numbers[];
};

extern seqcount_spinlock_t pidmap_lock_seq;
extern struct pid init_struct_pid;

struct file;

struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file);
void do_notify_pidfd(struct task_struct *task);

/*
 * Take an additional reference on @pid and hand the same pointer back.
 * A NULL @pid is passed through untouched.
 */
static inline struct pid *get_pid(struct pid *pid)
{
        if (!pid)
                return NULL;

        refcount_inc(&pid->count);
        return pid;
}

extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
/* True when the @type task list of @pid is not empty. */
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
        struct hlist_head *head = &pid->tasks[type];

        return !hlist_empty(head);
}
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);

extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);

/*
 * these helpers must be called with the tasklist_lock write-held.
 */
extern void attach_pid(struct task_struct *task, enum pid_type);
void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type);
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type,
                struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
                         enum pid_type);

/*
 * look up a PID in the hash table. Must be called with the tasklist_lock
 * or rcu_read_lock() held.
 *
 * find_pid_ns() finds the pid in the namespace specified
 * find_vpid() finds the pid by its virtual id, i.e. in the current namespace
 *
 * see also find_task_by_vpid() set in include/linux/sched.h
 */
extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
extern struct pid *find_vpid(int nr);

/*
 * Look up a PID in the hash table, and return with its count elevated.
 */
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                             size_t set_tid_size);
extern void free_pid(struct pid *pid);
void free_pids(struct pid **pids);
extern void disable_pid_allocation(struct pid_namespace *ns);

/*
 * ns_of_pid() returns the pid namespace in which the specified pid was
 * allocated, i.e. the namespace of the upid entry at the pid's own level.
 *
 * NOTE:
 *         Callers normally pass the 'struct pid' attached to a process
 *         (task) - see attach_pid(), detach_pid() - so @pid is expected to
 *         be non-NULL.  For a NULL @pid the result is NULL and the caller
 *         has to handle that.
 */
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
        if (!pid)
                return NULL;

        return pid->numbers[pid->level].ns;
}

/*
 * is_child_reaper returns true if the pid is the init process
 * of the current namespace. As this one could be checked before
 * pid_ns->child_reaper is assigned in copy_process, we check
 * with the pid number.
 */
static inline bool is_child_reaper(struct pid *pid)
{
        return pid->numbers[pid->level].nr == 1;
}

/*
 * the helpers to get the pid's id seen from different namespaces
 *
 * pid_nr()    : global id, i.e. the id seen from the init namespace;
 * pid_vnr()   : virtual id, i.e. the id seen from the pid namespace of
 *               current.
 * pid_nr_ns() : id seen from the ns specified.
 *
 * see also task_xid_nr() etc in include/linux/sched.h
 */

/* Global id of @pid, taken from numbers[0]; 0 when @pid is NULL. */
static inline pid_t pid_nr(struct pid *pid)
{
        return pid ? pid->numbers[0].nr : 0;
}

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);

/*
 * do_each_pid_task() ... while_each_pid_task(): paired macros that open and
 * close a loop over every task linked on @pid's @type list, walked with
 * hlist_for_each_entry_rcu().  A NULL @pid yields no iterations.
 *
 * For PIDTYPE_PID the closing macro breaks out after the first task (see
 * the de_thread() note below).  @type is parenthesized in that comparison
 * so that an expression argument (e.g. a conditional) is compared as a
 * whole.
 */
#define do_each_pid_task(pid, type, task)                                \
        do {                                                                \
                if ((pid) != NULL)                                        \
                        hlist_for_each_entry_rcu((task),                \
                                &(pid)->tasks[type], pid_links[type]) {

                        /*
                         * Both old and new leaders may be attached to
                         * the same pid in the middle of de_thread().
                         */
#define while_each_pid_task(pid, type, task)                                \
                                if ((type) == PIDTYPE_PID)                \
                                        break;                                \
                        }                                                \
        } while (0)

/*
 * do_each_pid_thread() ... while_each_pid_thread(): for every task yielded
 * by do_each_pid_task(), walk that task's threads with for_each_thread().
 * The outer task is kept in tg___ while @task serves as the thread cursor,
 * and @task is set back to it before the outer loop continues.
 */
#define do_each_pid_thread(pid, type, task)                                \
        do_each_pid_task(pid, type, task) {                                \
                struct task_struct *tg___ = task;                        \
                for_each_thread(tg___, task) {

#define while_each_pid_thread(pid, type, task)                                \
                }                                                        \
                task = tg___;                                                \
        } while_each_pid_task(pid, type, task)

/* Accessor for @task->thread_pid; no reference is taken here. */
static inline struct pid *task_pid(struct task_struct *task)
{
        return task->thread_pid;
}

/*
 * the helpers to get the task's different pids as they are seen
 * from various namespaces
 *
 * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
 * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
 *                     current.
 * task_xid_nr_ns()  : id seen from the ns specified;
 *
 * see also pid_nr() etc in include/linux/pid.h
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);

/* Returns the pid value stored directly in the task (@tsk->pid). */
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
        return tsk->pid;
}

/* PIDTYPE_PID id of @tsk as seen from @ns. */
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
}

/* Virtual variant: the same lookup with NULL passed as the namespace. */
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
        return task_pid_nr_ns(tsk, NULL);
}


/* Returns the tgid value stored directly in the task (@tsk->tgid). */
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
        return tsk->tgid;
}

/**
 * pid_alive - check that a task structure is not stale
 * @p: Task structure to be checked.
 *
 * A process counts as not yet dead (at most in zombie state) while its
 * thread_pid pointer is set.  Once this check fails, pointers within the
 * task structure can be stale and must not be dereferenced.
 *
 * Return: 1 if the process is alive. 0 otherwise.
 */
static inline int pid_alive(const struct task_struct *p)
{
        return p->thread_pid ? 1 : 0;
}

/* PIDTYPE_PGID id of @tsk as seen from @ns. */
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
}

/* Virtual variant: the same lookup with NULL passed as the namespace. */
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
        return task_pgrp_nr_ns(tsk, NULL);
}


/* PIDTYPE_SID id of @tsk as seen from @ns. */
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
}

/* Virtual variant: the same lookup with NULL passed as the namespace. */
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
        return task_session_nr_ns(tsk, NULL);
}

/* PIDTYPE_TGID id of @tsk as seen from @ns. */
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
}

/* Virtual variant: the same lookup with NULL passed as the namespace. */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
        return task_tgid_nr_ns(tsk, NULL);
}

/*
 * Thread-group id of @tsk's real_parent as seen from @ns, or 0 when
 * pid_alive(@tsk) fails.  The parent pointer is read with
 * rcu_dereference() inside an RCU read-side section.
 */
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t ppid;

        rcu_read_lock();
        ppid = pid_alive(tsk) ?
                task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns) : 0;
        rcu_read_unlock();

        return ppid;
}

/* Same as task_ppid_nr_ns(), evaluated against init_pid_ns. */
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
        return task_ppid_nr_ns(tsk, &init_pid_ns);
}

/*
 * Obsolete, do not use: process-group id of @tsk evaluated against
 * init_pid_ns via task_pgrp_nr_ns().
 */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
        return task_pgrp_nr_ns(tsk, &init_pid_ns);
}

/**
 * is_global_init - check if a task structure is init. Since init
 * is free to have sub-threads we need to check tgid.
 * @tsk: Task structure to be checked.
 *
 * Check if a task structure is the first user space task the kernel created.
 *
 * Return: 1 if the task structure is init. 0 otherwise.
 */
static inline int is_global_init(struct task_struct *tsk)
{
        return task_tgid_nr(tsk) == 1;
}

#endif /* _LINUX_PID_H */






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H

/*
 * Copyright 1995 Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
#include <linux/hardirq.h> /* for in_interrupt() */
#include <linux/hugetlb_inline.h>

struct folio_batch;

/*
 * Declared here, defined outside this header.  Takes a page-index range
 * (@start, @end) within @mapping and returns an unsigned long.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);

/*
 * Call invalidate_mapping_pages() over the whole index range (0 to -1) of
 * @inode's mapping.  Only regular files, directories and symlinks are
 * handled; every other inode type is left alone.
 */
static inline void invalidate_remote_inode(struct inode *inode)
{
        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
            !S_ISLNK(inode->i_mode))
                return;

        invalidate_mapping_pages(inode->i_mapping, 0, -1);
}
/*
 * Invalidation entry points.  Only the prototypes live in this header; the
 * page-index variants take pgoff_t bounds, the kiocb/filemap variants take
 * byte positions or counts.
 */
int invalidate_inode_pages2(struct address_space *mapping);
int invalidate_inode_pages2_range(struct address_space *mapping,
                pgoff_t start, pgoff_t end);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
int filemap_invalidate_pages(struct address_space *mapping,
                             loff_t pos, loff_t end, bool nowait);

/*
 * Write-out and wait entry points (prototypes only).  The *_range variants
 * take loff_t byte bounds.
 */
int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);
int filemap_flush(struct address_space *);
int filemap_fdatawait_keep_errors(struct address_space *mapping);
int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte);
int filemap_invalidate_inode(struct inode *inode, bool flush,
                             loff_t start, loff_t end);

/*
 * Whole-file form of filemap_fdatawait_range(): covers bytes 0 through
 * LLONG_MAX of @mapping and hands back that function's result unchanged.
 */
static inline int filemap_fdatawait(struct address_space *mapping)
{
        int ret;

        ret = filemap_fdatawait_range(mapping, 0, LLONG_MAX);
        return ret;
}

/*
 * Byte-range queries, write-out helpers and error bookkeeping (prototypes
 * only; the definitions are outside this header).
 */
bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
int filemap_write_and_wait_range(struct address_space *mapping,
                loff_t lstart, loff_t lend);
int __filemap_fdatawrite_range(struct address_space *mapping,
                loff_t start, loff_t end, int sync_mode);
int filemap_fdatawrite_range(struct address_space *mapping,
                loff_t start, loff_t end);
int filemap_check_errors(struct address_space *mapping);
/* Slow path used by filemap_set_wb_err() and mapping_set_error() below. */
void __filemap_set_wb_err(struct address_space *mapping, int err);
int filemap_fdatawrite_wbc(struct address_space *mapping,
                           struct writeback_control *wbc);
int kiocb_write_and_wait(struct kiocb *iocb, size_t count);

/*
 * Whole-file form of filemap_write_and_wait_range(): covers bytes 0
 * through LLONG_MAX of @mapping and returns that function's result.
 */
static inline int filemap_write_and_wait(struct address_space *mapping)
{
        int ret;

        ret = filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
        return ret;
}

/**
 * filemap_set_wb_err - set a writeback error on an address_space
 * @mapping: mapping in which to set writeback error
 * @err: error to be set in mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * filemap_set_wb_err to record the error in the mapping so that it will be
 * automatically reported whenever fsync is called on the file.
 */
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
{
        /* The expected case is err == 0: nothing to record, stay inline */
        if (likely(!err))
                return;

        __filemap_set_wb_err(mapping, err);
}

/**
 * filemap_check_wb_err - has an error occurred since the mark was sampled?
 * @mapping: mapping to check for writeback errors
 * @since: previously-sampled errseq_t
 *
 * Grab the errseq_t value from the mapping, and see if it has changed "since"
 * the given value was sampled.
 *
 * If it has then report the latest error set, otherwise return 0.
 */
static inline int filemap_check_wb_err(struct address_space *mapping,
                                        errseq_t since)
{
        /* Compare the mapping's current wb_err against the caller's sample */
        int err = errseq_check(&mapping->wb_err, since);

        return err;
}

/**
 * filemap_sample_wb_err - sample the current errseq_t to test for later errors
 * @mapping: mapping to be sampled
 *
 * Writeback errors are always reported relative to a particular sample point
 * in the past. This function provides those sample points.
 */
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
{
        /* The sample is taken from the per-mapping wb_err cursor */
        errseq_t sample = errseq_sample(&mapping->wb_err);

        return sample;
}

/**
 * file_sample_sb_err - sample the current errseq_t to test for later errors
 * @file: file pointer to be sampled
 *
 * Grab the most current superblock-level errseq_t value for the given
 * struct file.
 */
static inline errseq_t file_sample_sb_err(struct file *file)
{
        return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}

/*
 * Flush file data before changing attributes.  Caller must hold any locks
 * required to prevent further writes to this file until we're done setting
 * flags.
 */
static inline int inode_drain_writes(struct inode *inode)
{
        int ret;

        /* First wait via inode_dio_wait(), then write out and wait on the mapping */
        inode_dio_wait(inode);
        ret = filemap_write_and_wait(inode->i_mapping);

        return ret;
}

/* Report whether the i_pages xarray of @mapping holds no entries at all. */
static inline bool mapping_empty(const struct address_space *mapping)
{
        bool empty = xa_empty(&mapping->i_pages);

        return empty;
}

/*
 * mapping_shrinkable - test if page cache state allows inode reclaim
 * @mapping: the page cache mapping
 *
 * This checks the mapping's cache state for the purpose of inode
 * reclaim and LRU management.
 *
 * The caller is expected to hold the i_lock, but is not required to
 * hold the i_pages lock, which usually protects cache state. That's
 * because the i_lock and the list_lru lock that protect the inode and
 * its LRU state don't nest inside the irq-safe i_pages lock.
 *
 * Cache deletions are performed under the i_lock, which ensures that
 * when an inode goes empty, it will reliably get queued on the LRU.
 *
 * Cache additions do not acquire the i_lock and may race with this
 * check, in which case we'll report the inode as shrinkable when it
 * has cache pages. This is okay: the shrinker also checks the
 * refcount and the referenced bit, which will be elevated or set in
 * the process of adding new cache pages to an inode.
 */
static inline bool mapping_shrinkable(const struct address_space *mapping)
{
        void *entry;

        /*
         * With highmem, inodes can put pressure on lowmem long before
         * the page cache puts pressure on highmem, so the cache state is
         * not consulted at all: the inode is always shrinkable.
         */
        if (IS_ENABLED(CONFIG_HIGHMEM))
                return true;

        entry = rcu_access_pointer(mapping->i_pages.xa_head);

        /* Nothing cached at all: shrink away. */
        if (!entry)
                return true;

        /*
         * A lone entry at offset 0 lives directly in the head pointer
         * rather than in an xarray node.  If that entry is a value
         * (non-resident) entry, the shadow shrinker, which works on its
         * list of xarray nodes, never sees it; the inode shrinker has
         * to pick it up under memory pressure.
         */
        return !xa_is_node(entry) && xa_is_value(entry);
}

/*
 * Bits in mapping->flags.
 */
enum mapping_flags {
        AS_EIO                = 0,        /* IO error on async write */
        AS_ENOSPC        = 1,        /* ENOSPC on async write */
        AS_MM_ALL_LOCKS        = 2,        /* under mm_take_all_locks() */
        AS_UNEVICTABLE        = 3,        /* e.g., ramdisk, SHM_LOCK */
        AS_EXITING        = 4,         /* final truncate in progress */
        /* writeback related tags are not used */
        AS_NO_WRITEBACK_TAGS = 5,
        AS_RELEASE_ALWAYS = 6,        /* Call ->release_folio(), even if no private data */
        AS_STABLE_WRITES = 7,        /* must wait for writeback before modifying
                                   folio contents */
        AS_INACCESSIBLE = 8,        /* Do not attempt direct R/W access to the mapping */
        AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
        AS_KERNEL_FILE = 10,        /* mapping for a fake kernel file that shouldn't
                                   account usage to user cgroups */
        /* Bits 16-25 are used for FOLIO_ORDER */
        /* Width in bits of each order field; unlike the others, not a bit number */
        AS_FOLIO_ORDER_BITS = 5,
        /* First bit of the minimum-order field (bits 16-20) */
        AS_FOLIO_ORDER_MIN = 16,
        /* First bit of the maximum-order field (16 + 5 = 21, bits 21-25) */
        AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
};

/*
 * Masks for the two folio-order fields stored in mapping->flags.  All of
 * them are built from a 1u constant and therefore have type unsigned int:
 * BITS_MASK is the unshifted 5-bit field mask (0x1f), MIN_MASK and
 * MAX_MASK are that mask moved to each field's position, and
 * AS_FOLIO_ORDER_MASK covers both fields.
 */
#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1)
#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN)
#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX)
#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK)

/**
 * mapping_set_error - record a writeback error in the address_space
 * @mapping: the mapping in which an error should be set
 * @error: the error to set in the mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * mapping_set_error to record the error in the mapping so that it can be
 * reported when the application calls fsync(2).
 */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
        if (likely(!error))
                return;

        /* Record in wb_err for checkers using errseq_t based tracking */
        __filemap_set_wb_err(mapping, error);

        /* Record it in superblock */
        if (mapping->host)
                errseq_set(&mapping->host->i_sb->s_wb_err, error);

        /* Record it in flags for now, for legacy callers */
        if (error == -ENOSPC)
                set_bit(AS_ENOSPC, &mapping->flags);
        else
                set_bit(AS_EIO, &mapping->flags);
}

/* Set AS_UNEVICTABLE in @mapping's flags using set_bit(). */
static inline void mapping_set_unevictable(struct address_space *mapping)
{
        set_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Clear AS_UNEVICTABLE in @mapping's flags using clear_bit(). */
static inline void mapping_clear_unevictable(struct address_space *mapping)
{
        clear_bit(AS_UNEVICTABLE, &mapping->flags);
}

static inline bool mapping_unevictable(const struct address_space *mapping)
{
        return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Set AS_EXITING ("final truncate in progress") in @mapping's flags. */
static inline void mapping_set_exiting(struct address_space *mapping)
{
        set_bit(AS_EXITING, &mapping->flags);
}

/* Return non-zero when AS_EXITING is set in @mapping's flags. */
static inline int mapping_exiting(const struct address_space *mapping)
{
        int exiting = test_bit(AS_EXITING, &mapping->flags);

        return exiting;
}

/* Set AS_NO_WRITEBACK_TAGS: writeback related tags are not used. */
static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
{
        set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}

/*
 * Inverse of the AS_NO_WRITEBACK_TAGS flag: returns 1 while the flag is
 * clear and 0 once it has been set.
 */
static inline int mapping_use_writeback_tags(const struct address_space *mapping)
{
        if (test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags))
                return 0;

        return 1;
}

/* Report whether AS_RELEASE_ALWAYS is set in @mapping's flags. */
static inline bool mapping_release_always(const struct address_space *mapping)
{
        bool always = test_bit(AS_RELEASE_ALWAYS, &mapping->flags);

        return always;
}

/* Set AS_RELEASE_ALWAYS in @mapping's flags using set_bit(). */
static inline void mapping_set_release_always(struct address_space *mapping)
{
        set_bit(AS_RELEASE_ALWAYS, &mapping->flags);
}

/* Clear AS_RELEASE_ALWAYS in @mapping's flags using clear_bit(). */
static inline void mapping_clear_release_always(struct address_space *mapping)
{
        clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
}

/* Report whether AS_STABLE_WRITES is set in @mapping's flags. */
static inline bool mapping_stable_writes(const struct address_space *mapping)
{
        bool stable = test_bit(AS_STABLE_WRITES, &mapping->flags);

        return stable;
}

/* Set AS_STABLE_WRITES in @mapping's flags using set_bit(). */
static inline void mapping_set_stable_writes(struct address_space *mapping)
{
        set_bit(AS_STABLE_WRITES, &mapping->flags);
}

/* Clear AS_STABLE_WRITES in @mapping's flags using clear_bit(). */
static inline void mapping_clear_stable_writes(struct address_space *mapping)
{
        clear_bit(AS_STABLE_WRITES, &mapping->flags);
}

/*
 * Mark @mapping inaccessible.  Two bits are set, in this order:
 * AS_UNEVICTABLE first, then AS_INACCESSIBLE.
 */
static inline void mapping_set_inaccessible(struct address_space *mapping)
{
        /*
         * It's expected inaccessible mappings are also unevictable. Compaction
         * migrate scanner (isolate_migratepages_block()) relies on this to
         * reduce page locking.
         */
        set_bit(AS_UNEVICTABLE, &mapping->flags);
        set_bit(AS_INACCESSIBLE, &mapping->flags);
}

/* Report whether AS_INACCESSIBLE is set in @mapping's flags. */
static inline bool mapping_inaccessible(const struct address_space *mapping)
{
        bool inaccessible = test_bit(AS_INACCESSIBLE, &mapping->flags);

        return inaccessible;
}

/* Set AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM in @mapping's flags. */
static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_space *mapping)
{
        set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
}

/* Report whether AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM is set. */
static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping)
{
        bool may_deadlock;

        may_deadlock = test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM,
                                &mapping->flags);
        return may_deadlock;
}

/* Read back the gfp mask stored in @mapping. */
static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
{
        gfp_t mask = mapping->gfp_mask;

        return mask;
}

/*
 * Intersect @gfp_mask with the mask stored in @mapping: only the bits
 * present in both are returned.
 */
static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping,
                gfp_t gfp_mask)
{
        gfp_t allowed = mapping_gfp_mask(mapping);

        return allowed & gfp_mask;
}

/*
 * This is non-atomic.  Only to be used before the mapping is activated.
 * Probably needs a barrier...
 */
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
{
        /* Plain store: no locking or barrier is used here */
        m->gfp_mask = mask;
}

/*
 * There are some parts of the kernel which assume that PMD entries
 * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
 * limit the maximum allocation order to PMD size.  I'm not aware of any
 * assumptions about maximum order if THP are disabled, but 8 seems like
 * a good order (that's 1MB if you're using 4kB pages)
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define PREFERRED_MAX_PAGECACHE_ORDER        HPAGE_PMD_ORDER
#else
#define PREFERRED_MAX_PAGECACHE_ORDER        8
#endif

/*
 * xas_split_alloc() does not support arbitrary orders. This implies no
 * 512MB THP on ARM64 with 64KB base page size.
 */
#define MAX_XAS_ORDER                (XA_CHUNK_SHIFT * 2 - 1)
/* Effective cap: the smaller of the xarray limit and the preferred limit */
#define MAX_PAGECACHE_ORDER        min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)

/*
 * mapping_max_folio_size_supported() - Check the max folio size supported
 *
 * The filesystem should call this function at mount time if there is a
 * requirement on the folio mapping size in the page cache.
 */
static inline size_t mapping_max_folio_size_supported(void)
{
        /* Without THP the answer is a single base page */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return PAGE_SIZE;

        /* Otherwise: bytes in a folio of order MAX_PAGECACHE_ORDER */
        return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
}

/*
 * mapping_set_folio_order_range() - Set the orders supported by a file.
 * @mapping: The address space of the file.
 * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive).
 * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive).
 *
 * The filesystem should call this function in its inode constructor to
 * indicate which base size (min) and maximum size (max) of folio the VFS
 * can use to cache the contents of the file.  This should only be used
 * if the filesystem needs special handling of folio sizes (ie there is
 * something the core cannot know).
 * Do not tune it based on, eg, i_size.
 *
 * Context: This should not be called while the inode is active as it
 * is non-atomic.
 */
static inline void mapping_set_folio_order_range(struct address_space *mapping,
                                                 unsigned int min,
                                                 unsigned int max)
{
        /* Without THP nothing is recorded and the flags stay untouched */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return;

        /* Clamp both bounds to what the page cache can handle */
        if (min > MAX_PAGECACHE_ORDER)
                min = MAX_PAGECACHE_ORDER;

        if (max > MAX_PAGECACHE_ORDER)
                max = MAX_PAGECACHE_ORDER;

        /* An inverted range collapses to the minimum */
        if (max < min)
                max = min;

        /*
         * AS_FOLIO_ORDER_MASK is a 32-bit unsigned int.  Widen it before
         * inverting: "~" applied to the narrow type would be zero-extended
         * when combined with the unsigned long flags word and would wipe
         * every flag bit from 32 upwards on 64-bit builds.
         */
        mapping->flags = (mapping->flags & ~(unsigned long)AS_FOLIO_ORDER_MASK) |
                ((unsigned long)min << AS_FOLIO_ORDER_MIN) |
                ((unsigned long)max << AS_FOLIO_ORDER_MAX);
}

/*
 * Set only the lower bound: the order range becomes @min up to
 * MAX_PAGECACHE_ORDER, with clamping done by
 * mapping_set_folio_order_range().
 */
static inline void mapping_set_folio_min_order(struct address_space *mapping,
                                               unsigned int min)
{
        unsigned int ceiling = MAX_PAGECACHE_ORDER;

        mapping_set_folio_order_range(mapping, min, ceiling);
}

/**
 * mapping_set_large_folios() - Indicate the file supports large folios.
 * @mapping: The address space of the file.
 *
 * The filesystem should call this function in its inode constructor to
 * indicate that the VFS can use large folios to cache the contents of
 * the file.
 *
 * Context: This should not be called while the inode is active as it
 * is non-atomic.
 */
static inline void mapping_set_large_folios(struct address_space *mapping)
{
        unsigned int ceiling = MAX_PAGECACHE_ORDER;

        /* Full range: order 0 up to the page cache maximum */
        mapping_set_folio_order_range(mapping, 0, ceiling);
}

/*
 * Extract the maximum folio order stored in @mapping's flags.  Always 0
 * when CONFIG_TRANSPARENT_HUGEPAGE is disabled.
 */
static inline unsigned int
mapping_max_folio_order(const struct address_space *mapping)
{
        unsigned long field;

        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return 0;

        field = mapping->flags & AS_FOLIO_ORDER_MAX_MASK;
        return field >> AS_FOLIO_ORDER_MAX;
}

/*
 * Extract the minimum folio order stored in @mapping's flags.  Always 0
 * when CONFIG_TRANSPARENT_HUGEPAGE is disabled.
 */
static inline unsigned int
mapping_min_folio_order(const struct address_space *mapping)
{
        unsigned long field;

        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return 0;

        field = mapping->flags & AS_FOLIO_ORDER_MIN_MASK;
        return field >> AS_FOLIO_ORDER_MIN;
}

/* Number of base pages in a folio of the mapping's minimum order. */
static inline unsigned long
mapping_min_folio_nrpages(const struct address_space *mapping)
{
        unsigned int order = mapping_min_folio_order(mapping);

        return 1UL << order;
}

/* Size in bytes of a folio of the mapping's minimum order. */
static inline unsigned long
mapping_min_folio_nrbytes(const struct address_space *mapping)
{
        unsigned long nr_pages = mapping_min_folio_nrpages(mapping);

        return nr_pages << PAGE_SHIFT;
}

/**
 * mapping_align_index() - Align index for this mapping.
 * @mapping: The address_space.
 * @index: The page index.
 *
 * The index of a folio must be naturally aligned.  If you are adding a
 * new folio to the page cache and need to know what index to give it,
 * call this function.
 */
static inline pgoff_t mapping_align_index(const struct address_space *mapping,
                                          pgoff_t index)
{
        /* Alignment unit: the page count of a minimum-order folio */
        unsigned long nr_pages = mapping_min_folio_nrpages(mapping);

        return round_down(index, nr_pages);
}

/*
 * Large folio support currently depends on THP.  These dependencies are
 * being worked on but are not yet fixed.
 */
static inline bool mapping_large_folio_support(const struct address_space *mapping)
{
        unsigned int max_order;

        /* AS_FOLIO_ORDER is only reasonable for pagecache folios */
        VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON,
                        "Anonymous mapping always supports large folio");

        /* Any non-zero maximum order means large folios are allowed */
        max_order = mapping_max_folio_order(mapping);
        return max_order != 0;
}

/*
 * Largest folio size, in bytes, allowed for this pagecache mapping:
 * PAGE_SIZE scaled up by the mapping's maximum folio order.
 */
static inline size_t mapping_max_folio_size(const struct address_space *mapping)
{
        unsigned int order = mapping_max_folio_order(mapping);

        return PAGE_SIZE << order;
}

/*
 * Read the mapping's nr_thps counter.  The counter is only consulted
 * when CONFIG_READ_ONLY_THP_FOR_FS is set; otherwise the result is 0.
 */
static inline int filemap_nr_thps(const struct address_space *mapping)
{
#ifndef CONFIG_READ_ONLY_THP_FOR_FS
        return 0;
#else
        return atomic_read(&mapping->nr_thps);
#endif
}

/*
 * Bump nr_thps for a mapping that does not support large folios.  Without
 * CONFIG_READ_ONLY_THP_FOR_FS no counting is done and a call on such a
 * mapping only triggers a one-time warning.
 */
static inline void filemap_nr_thps_inc(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        /* Mappings with large folio support are not counted */
        if (mapping_large_folio_support(mapping))
                return;

        atomic_inc(&mapping->nr_thps);
#else
        WARN_ON_ONCE(!mapping_large_folio_support(mapping));
#endif
}

/*
 * Drop nr_thps for a mapping that does not support large folios.  Without
 * CONFIG_READ_ONLY_THP_FOR_FS no counting is done and a call on such a
 * mapping only triggers a one-time warning.
 */
static inline void filemap_nr_thps_dec(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        /* Mappings with large folio support are not counted */
        if (mapping_large_folio_support(mapping))
                return;

        atomic_dec(&mapping->nr_thps);
#else
        WARN_ON_ONCE(!mapping_large_folio_support(mapping));
#endif
}

/* Prototype only; used by folio_flush_mapping() below. */
struct address_space *folio_mapping(const struct folio *folio);

/**
 * folio_flush_mapping - Find the file mapping this folio belongs to.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Anonymous folios return NULL, even if they're in
 * the swap cache.  Other kinds of folio also return NULL.
 *
 * This is ONLY used by architecture cache flushing code.  If you aren't
 * writing cache flushing code, you want either folio_mapping() or
 * folio_file_mapping().
 */
static inline struct address_space *folio_flush_mapping(struct folio *folio)
{
        /* Swap cache folios are reported as having no mapping */
        return unlikely(folio_test_swapcache(folio)) ?
                NULL : folio_mapping(folio);
}

/**
 * folio_inode - Get the host inode for this folio.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the inode that this folio
 * belongs to.
 *
 * Do not call this for folios which aren't in the page cache.
 */
static inline struct inode *folio_inode(struct folio *folio)
{
        return folio->mapping->host;
}

/**
 * folio_attach_private - Attach private data to a folio.
 * @folio: Folio to attach data to.
 * @data: Data to attach to folio.
 *
 * Attaching private data to a folio increments the page's reference count.
 * The data must be detached before the folio will be freed.
 */
static inline void folio_attach_private(struct folio *folio, void *data)
{
        /* Take the reference first; folio_detach_private() drops it again */
        folio_get(folio);
        /* Store the pointer, then mark the folio as carrying private data */
        folio->private = data;
        folio_set_private(folio);
}

/**
 * folio_change_private - Change private data on a folio.
 * @folio: Folio to change the data on.
 * @data: Data to set on the folio.
 *
 * Change the private data attached to a folio and return the old
 * data.  The page must previously have had data attached and the data
 * must be detached before the folio will be freed.
 *
 * Return: Data that was previously attached to the folio.
 */
static inline void *folio_change_private(struct folio *folio, void *data)
{
        void *previous;

        /* Swap the pointer only: refcount and private flag are left alone */
        previous = folio_get_private(folio);
        folio->private = data;

        return previous;
}

/**
 * folio_detach_private - Detach private data from a folio.
 * @folio: Folio to detach data from.
 *
 * Removes the data that was previously attached to the folio and decrements
 * the refcount on the page.
 *
 * Return: Data that was attached to the folio.
 */
static inline void *folio_detach_private(struct folio *folio)
{
        void *attached = folio_get_private(folio);

        /* No private flag set: nothing is detached and NULL is returned */
        if (!folio_test_private(folio))
                return NULL;

        /* Clear the flag and the pointer, then drop one folio reference */
        folio_clear_private(folio);
        folio->private = NULL;
        folio_put(folio);

        return attached;
}

/* Page-based wrapper: resolves @page to its folio and attaches @data there. */
static inline void attach_page_private(struct page *page, void *data)
{
        struct folio *folio = page_folio(page);

        folio_attach_private(folio, data);
}

/*
 * Page-based wrapper: resolves @page to its folio and returns whatever
 * folio_detach_private() returns for it.
 */
static inline void *detach_page_private(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_detach_private(folio);
}

/*
 * With CONFIG_NUMA the allocator is an out-of-line function declared
 * here; without it, the call goes straight to folio_alloc_noprof().
 */
#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
#else
static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
        return folio_alloc_noprof(gfp, order);
}
#endif

/* Wraps filemap_alloc_folio_noprof() in alloc_hooks(), forwarding all arguments. */
#define filemap_alloc_folio(...)                                \
        alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))

/*
 * Allocate an order-0 folio with filemap_alloc_folio() and return the
 * address of its embedded page member.  The allocation result is not
 * checked here.
 */
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
        struct folio *folio = filemap_alloc_folio(gfp, 0);

        return &folio->page;
}

/* The mapping's gfp mask with __GFP_NORETRY and __GFP_NOWARN added. */
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
        gfp_t mask = mapping_gfp_mask(x);

        mask |= __GFP_NORETRY | __GFP_NOWARN;
        return mask;
}

/* Function type taking a file and a folio and returning an int status. */
typedef int filler_t(struct file *, struct folio *);

/*
 * Prototypes only.  Both take a starting @index and a @max_scan bound and
 * return a page index.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);

/**
 * typedef fgf_t - Flags for getting folios from the page cache.
 *
 * Most users of the page cache will not need to use these flags;
 * there are convenience functions such as filemap_get_folio() and
 * filemap_lock_folio().  For users which need more control over exactly
 * what is done with the folios, these flags to __filemap_get_folio()
 * are available.
 *
 * * %FGP_ACCESSED - The folio will be marked accessed.
 * * %FGP_LOCK - The folio is returned locked.
 * * %FGP_CREAT - If no folio is present then a new folio is allocated,
 *   added to the page cache and the VM's LRU list.  The folio is
 *   returned locked.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   folio is already in cache.  If the folio was allocated, unlock it
 *   before returning so the caller can do the same dance.
 * * %FGP_WRITE - The folio will be written to by the caller.
 * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
 * * %FGP_NOWAIT - Don't block on the folio lock.
 * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
 * * %FGP_DONTCACHE - Uncached buffered IO
 * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
 *   implementation.
 */
/* Stored as unsigned int; the FGP_* constants are cast to it with __force. */
typedef unsigned int __bitwise fgf_t;

#define FGP_ACCESSED                ((__force fgf_t)0x00000001)
#define FGP_LOCK                ((__force fgf_t)0x00000002)
#define FGP_CREAT                ((__force fgf_t)0x00000004)
#define FGP_WRITE                ((__force fgf_t)0x00000008)
#define FGP_NOFS                ((__force fgf_t)0x00000010)
#define FGP_NOWAIT                ((__force fgf_t)0x00000020)
#define FGP_FOR_MMAP                ((__force fgf_t)0x00000040)
#define FGP_STABLE                ((__force fgf_t)0x00000080)
#define FGP_DONTCACHE                ((__force fgf_t)0x00000100)
/*
 * The suggested folio order lives in bits 26-31 (see fgf_set_order()).
 * The argument is parenthesized so that a compound expression such as
 * FGF_GET_ORDER(a | b) is cast as a whole rather than only its first
 * operand.
 */
#define FGF_GET_ORDER(fgf)        (((__force unsigned)(fgf)) >> 26)        /* top 6 bits */

#define FGP_WRITEBEGIN                (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)

/*
 * Convert a byte size into a folio order: ilog2(@size) minus PAGE_SHIFT,
 * or 0 when ilog2(@size) does not exceed PAGE_SHIFT.
 */
static inline unsigned int filemap_get_order(size_t size)
{
        unsigned int log = ilog2(size);

        return log > PAGE_SHIFT ? log - PAGE_SHIFT : 0;
}

/**
 * fgf_set_order - Encode a length in the fgf_t flags.
 * @size: The suggested size of the folio to create.
 *
 * The caller of __filemap_get_folio() can use this to suggest a preferred
 * size for the folio that is created.  If there is already a folio at
 * the index, it will be returned, no matter what its size.  If a folio
 * is freshly created, it may be of a different size than requested
 * due to alignment constraints, memory pressure, or the presence of
 * other folios at nearby indices.
 */
static inline fgf_t fgf_set_order(size_t size)
{
        unsigned int order = filemap_get_order(size);

        /* The order goes into the bits that FGF_GET_ORDER() reads back */
        if (order)
                return (__force fgf_t)(order << 26);

        return 0;
}

/*
 * Lookup primitives (prototypes only).  The inline helpers below are thin
 * wrappers around __filemap_get_folio() and pagecache_get_page() with
 * fixed flag and gfp arguments.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp);

/**
 * write_begin_get_folio - Get folio for write_begin with flags.
 * @iocb: The kiocb passed from write_begin (may be NULL).
 * @mapping: The address space to search.
 * @index: The page cache index.
 * @len: Length of data being written.
 *
 * This is a helper for filesystem write_begin() implementations.
 * It wraps __filemap_get_folio(), setting appropriate flags in
 * the write begin context.
 *
 * Return: A folio or an ERR_PTR.
 */
static inline struct folio *write_begin_get_folio(const struct kiocb *iocb,
                  struct address_space *mapping, pgoff_t index, size_t len)
{
        /* Base write_begin flags plus the order hint derived from @len */
        fgf_t flags = FGP_WRITEBEGIN | fgf_set_order(len);

        /* @iocb may be NULL; uncached mode is only requested through it */
        if (iocb && (iocb->ki_flags & IOCB_DONTCACHE))
                flags |= FGP_DONTCACHE;

        return __filemap_get_folio(mapping, index, flags,
                                   mapping_gfp_mask(mapping));
}

/**
 * filemap_get_folio - Find and get a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If a folio is
 * present, it is returned with an increased refcount.
 *
 * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
 * this index.  Will not return a shadow, swap or DAX entry.
 */
static inline struct folio *filemap_get_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        struct folio *folio;

        /* Pure lookup: no FGP flags and a zero gfp mask */
        folio = __filemap_get_folio(mapping, index, 0, 0);
        return folio;
}

/**
 * filemap_lock_folio - Find and lock a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If a folio is
 * present, it is returned locked with an increased refcount.
 *
 * Context: May sleep.
 * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
 * this index.  Will not return a shadow, swap or DAX entry.
 */
static inline struct folio *filemap_lock_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        struct folio *folio;

        /* Lookup with FGP_LOCK only, and a zero gfp mask */
        folio = __filemap_get_folio(mapping, index, FGP_LOCK, 0);
        return folio;
}

/**
 * filemap_grab_folio - grab a folio from the page cache
 * @mapping: The address space to search
 * @index: The page index
 *
 * Looks up the page cache entry at @mapping & @index. If no folio is found,
 * a new folio is created. The folio is locked, marked as accessed, and
 * returned.
 *
 * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found
 * and failed to create a folio.
 */
static inline struct folio *filemap_grab_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        struct folio *folio;

        /*
         * Create-on-miss lookup; the gfp mask handed down is the one
         * configured on the mapping itself.
         */
        folio = __filemap_get_folio(mapping, index,
                                    FGP_CREAT | FGP_ACCESSED | FGP_LOCK,
                                    mapping_gfp_mask(mapping));

        return folio;
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * Otherwise, %NULL is returned.
 */
static inline struct page *find_get_page(struct address_space *mapping,
                                        pgoff_t offset)
{
        struct page *page;

        /* No FGP_* flags and a zero gfp mask: lookup only. */
        page = pagecache_get_page(mapping, offset, 0, 0);

        return page;
}

/*
 * Variant of find_get_page() that forwards the caller's @fgp_flags to
 * pagecache_get_page().  The gfp mask passed down is always 0.
 */
static inline struct page *find_get_page_flags(struct address_space *mapping,
                                        pgoff_t offset, fgf_t fgp_flags)
{
        struct page *page;

        page = pagecache_get_page(mapping, offset, fgp_flags, 0);

        return page;
}

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @index: the page index
 *
 * Looks up the page cache entry at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * Context: May sleep.
 * Return: A struct page or %NULL if there is no page in the cache for this
 * index.
 */
static inline struct page *find_lock_page(struct address_space *mapping,
                                        pgoff_t index)
{
        struct page *page;

        /* FGP_LOCK only; a zero gfp mask is passed. */
        page = pagecache_get_page(mapping, index, FGP_LOCK, 0);

        return page;
}

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * If the page is not present, a new page is allocated using @gfp_mask
 * and added to the page cache and the VM's LRU list.  The page is
 * returned locked and with an increased refcount.
 *
 * On memory exhaustion, %NULL is returned.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an
 * atomic allocation!
 */
static inline struct page *find_or_create_page(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask)
{
        struct page *page;

        /* Lock + mark accessed + create on miss, allocating with @gfp_mask. */
        page = pagecache_get_page(mapping, index,
                                  FGP_CREAT | FGP_ACCESSED | FGP_LOCK,
                                  gfp_mask);

        return page;
}

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Returns locked page at given index in given cache, creating it if
 * needed, but do not wait if the page is locked or to reclaim memory.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                                pgoff_t index)
{
        struct page *page;

        /*
         * FGP_NOWAIT and FGP_NOFS are requested on top of lock + create;
         * the allocation mask is the mapping's own.
         */
        page = pagecache_get_page(mapping, index,
                                  FGP_NOWAIT | FGP_NOFS | FGP_CREAT | FGP_LOCK,
                                  mapping_gfp_mask(mapping));

        return page;
}

/**
 * folio_next_index - Get the index of the next folio.
 * @folio: The current folio.
 *
 * Return: The index of the folio which follows this folio in the file.
 */
static inline pgoff_t folio_next_index(const struct folio *folio)
{
        pgoff_t next = folio->index;

        /* Step over every page this folio covers. */
        next += folio_nr_pages(folio);

        return next;
}

/**
 * folio_file_page - The page for a particular index.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Sometimes after looking up a folio in the page cache, we need to
 * obtain the specific page for an index (eg a page fault).
 *
 * Return: The page containing the file data for this index.
 */
static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
{
        /*
         * Mask @index with (number of pages - 1) to get the page's position
         * inside the folio; this relies on the page count being a power of
         * two.
         */
        pgoff_t within = index & (folio_nr_pages(folio) - 1);

        return folio_page(folio, within);
}

/**
 * folio_contains - Does this folio contain this index?
 * @folio: The folio.
 * @index: The page index within the file.
 *
 * Context: The caller should have the folio locked and ensure
 * e.g., shmem did not move this folio to the swap cache.
 * Return: true or false.
 */
static inline bool folio_contains(const struct folio *folio, pgoff_t index)
{
        pgoff_t delta;

        VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);

        /*
         * Unsigned subtraction: an @index below folio->index wraps to a huge
         * value and therefore fails the range test as well.
         */
        delta = index - folio->index;

        return delta < folio_nr_pages(folio);
}

unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_contig(struct address_space *mapping,
                pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);

struct folio *read_cache_folio(struct address_space *, pgoff_t index,
                filler_t *filler, struct file *file);
struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index,
                gfp_t flags);
struct page *read_cache_page(struct address_space *, pgoff_t index,
                filler_t *filler, struct file *file);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);

/*
 * Convenience wrapper around read_cache_page() that passes a NULL filler.
 * NOTE(review): presumably a NULL filler selects the mapping's default read
 * method; confirm in read_cache_page().
 */
static inline struct page *read_mapping_page(struct address_space *mapping,
                                pgoff_t index, struct file *file)
{
        struct page *page;

        page = read_cache_page(mapping, index, NULL, file);

        return page;
}

/*
 * Folio counterpart of read_mapping_page(): calls read_cache_folio() with a
 * NULL filler.
 * NOTE(review): presumably a NULL filler selects the mapping's default read
 * method; confirm in read_cache_folio().
 */
static inline struct folio *read_mapping_folio(struct address_space *mapping,
                                pgoff_t index, struct file *file)
{
        struct folio *folio;

        folio = read_cache_folio(mapping, index, NULL, file);

        return folio;
}

/**
 * page_pgoff - Calculate the logical page offset of this page.
 * @folio: The folio containing this page.
 * @page: The page which we need the offset of.
 *
 * For file pages, this is the offset from the beginning of the file
 * in units of PAGE_SIZE.  For anonymous pages, this is the offset from
 * the beginning of the anon_vma in units of PAGE_SIZE.  This will
 * return nonsense for KSM pages.
 *
 * Context: Caller must have a reference on the folio or otherwise
 * prevent it from being split or freed.
 *
 * Return: The offset in units of PAGE_SIZE.
 */
static inline pgoff_t page_pgoff(const struct folio *folio,
                const struct page *page)
{
        /* Start from the folio's own index, then add the page's slot in it. */
        pgoff_t pgoff = folio->index;

        pgoff += folio_page_idx(folio, page);

        return pgoff;
}

/**
 * folio_pos - Returns the byte position of this folio in its file.
 * @folio: The folio.
 */
static inline loff_t folio_pos(const struct folio *folio)
{
        return ((loff_t)folio->index) * PAGE_SIZE;
}

/*
 * Return byte-offset into filesystem object for page.
 */
static inline loff_t page_offset(struct page *page)
{
        struct folio *folio = page_folio(page);
        /* Byte position of the folio itself ... */
        loff_t base = folio_pos(folio);

        /* ... plus the byte distance of @page from the folio's first page. */
        return base + folio_page_idx(folio, page) * PAGE_SIZE;
}

/*
 * Get the offset in PAGE_SIZE (even for hugetlb folios).
 */
static inline pgoff_t folio_pgoff(const struct folio *folio)
{
        /* folio->index is returned unchanged; no scaling is applied here. */
        return folio->index;
}

/*
 * Translate @address inside @vma into a page offset: the page distance of
 * @address from vma->vm_start, plus the VMA's own vm_pgoff.
 */
static inline pgoff_t linear_page_index(const struct vm_area_struct *vma,
                                        const unsigned long address)
{
        return ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

/*
 * Key that wake_page_match() compares each wait_page_queue entry against.
 */
struct wait_page_key {
        /* folio the key refers to; compared by pointer in wake_page_match() */
        struct folio *folio;
        /* bit number compared against wait_page_queue::bit_nr */
        int bit_nr;
        /* set to 1 by wake_page_match() once a waiter on the same folio is seen */
        int page_match;
};

/*
 * One waiter record; wake_page_match() checks its folio and bit_nr against
 * a wait_page_key.
 */
struct wait_page_queue {
        struct folio *folio;
        int bit_nr;
        /* embedded wait queue entry for this waiter */
        wait_queue_entry_t wait;
};

/*
 * Decide whether the waiter @wait_page matches @key.
 *
 * As a side effect, key->page_match is set to 1 as soon as the folio
 * matches, even if the bit number then turns out to differ.
 *
 * Return: true only when both the folio and the bit number match.
 */
static inline bool wake_page_match(struct wait_page_queue *wait_page,
                                  struct wait_page_key *key)
{
        if (wait_page->folio != key->folio)
                return false;

        /* Record the folio hit before looking at the bit number. */
        key->page_match = 1;

        return wait_page->bit_nr == key->bit_nr;
}

void __folio_lock(struct folio *folio);
int __folio_lock_killable(struct folio *folio);
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf);
void unlock_page(struct page *page);
void folio_unlock(struct folio *folio);

/**
 * folio_trylock() - Attempt to lock a folio.
 * @folio: The folio to attempt to lock.
 *
 * Sometimes it is undesirable to wait for a folio to be unlocked (eg
 * when the locks are being taken in the wrong order, or if making
 * progress through a batch of folios is more important than processing
 * them in order).  Usually folio_lock() is the correct function to call.
 *
 * Context: Any context.
 * Return: Whether the lock was successfully acquired.
 */
static inline bool folio_trylock(struct folio *folio)
{
        bool acquired;

        /*
         * test_and_set_bit_lock() hands back the previous bit value, so a
         * zero result means PG_locked was clear and is now ours.
         */
        acquired = !test_and_set_bit_lock(PG_locked, folio_flags(folio, 0));

        return likely(acquired);
}

/*
 * Return true if the page was successfully locked
 */
static inline bool trylock_page(struct page *page)
{
        /* The lock lives on the folio containing @page. */
        struct folio *folio = page_folio(page);

        return folio_trylock(folio);
}

/**
 * folio_lock() - Lock this folio.
 * @folio: The folio to lock.
 *
 * The folio lock protects against many things, probably more than it
 * should.  It is primarily held while a folio is being brought uptodate,
 * either from its backing file or from swap.  It is also held while a
 * folio is being truncated from its address_space, so holding the lock
 * is sufficient to keep folio->mapping stable.
 *
 * The folio lock is also held while write() is modifying the page to
 * provide POSIX atomicity guarantees (as long as the write does not
 * cross a page boundary).  Other modifications to the data in the folio
 * do not hold the folio lock and can race with writes, eg DMA and stores
 * to mapped pages.
 *
 * Context: May sleep.  If you need to acquire the locks of two or
 * more folios, they must be in order of ascending index, if they are
 * in the same address_space.  If they are in different address_spaces,
 * acquire the lock of the folio which belongs to the address_space which
 * has the lowest address in memory first.
 */
static inline void folio_lock(struct folio *folio)
{
        might_sleep();

        /* Fast path: uncontended lock taken without calling out of line. */
        if (folio_trylock(folio))
                return;

        __folio_lock(folio);
}

/**
 * lock_page() - Lock the folio containing this page.
 * @page: The page to lock.
 *
 * See folio_lock() for a description of what the lock protects.
 * This is a legacy function and new code should probably use folio_lock()
 * instead.
 *
 * Context: May sleep.  Pages in the same folio share a lock, so do not
 * attempt to lock two pages which share a folio.
 */
static inline void lock_page(struct page *page)
{
        struct folio *folio;

        might_sleep();

        /* Resolve the containing folio, then try the inline fast path. */
        folio = page_folio(page);
        if (folio_trylock(folio))
                return;

        __folio_lock(folio);
}

/**
 * folio_lock_killable() - Lock this folio, interruptible by a fatal signal.
 * @folio: The folio to lock.
 *
 * Attempts to lock the folio, like folio_lock(), except that the sleep
 * to acquire the lock is interruptible by a fatal signal.
 *
 * Context: May sleep; see folio_lock().
 * Return: 0 if the lock was acquired; -EINTR if a fatal signal was received.
 */
static inline int folio_lock_killable(struct folio *folio)
{
        might_sleep();

        /* Got the lock without contention: nothing more to do. */
        if (folio_trylock(folio))
                return 0;

        return __folio_lock_killable(folio);
}

/*
 * folio_lock_or_retry - Lock the folio, unless this would block and the
 * caller indicated that it can handle a retry.
 *
 * Return value and mmap_lock implications depend on flags; see
 * __folio_lock_or_retry().
 */
static inline vm_fault_t folio_lock_or_retry(struct folio *folio,
                                             struct vm_fault *vmf)
{
        might_sleep();

        /* Uncontended case: lock taken inline, report 0. */
        if (folio_trylock(folio))
                return 0;

        /* Contended: the out-of-line helper decides between waiting and retry. */
        return __folio_lock_or_retry(folio, vmf);
}

/*
 * This is exported only for folio_wait_locked/folio_wait_writeback, etc.,
 * and should not be used directly.
 */
void folio_wait_bit(struct folio *folio, int bit_nr);
int folio_wait_bit_killable(struct folio *folio, int bit_nr);

/*
 * Wait for a folio to be unlocked.
 *
 * This must be called with the caller "holding" the folio,
 * i.e. with an increased folio reference count, so that the folio won't
 * go away during the wait.
 */
static inline void folio_wait_locked(struct folio *folio)
{
        /* Nothing to wait for when the folio is not locked right now. */
        if (!folio_test_locked(folio))
                return;

        folio_wait_bit(folio, PG_locked);
}

/*
 * Like folio_wait_locked(), but waits through folio_wait_bit_killable() and
 * returns its result.  Returns 0 straight away when the folio is not locked.
 */
static inline int folio_wait_locked_killable(struct folio *folio)
{
        if (folio_test_locked(folio))
                return folio_wait_bit_killable(folio, PG_locked);

        return 0;
}

void folio_end_read(struct folio *folio, bool success);
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
void end_page_writeback(struct page *page);
void folio_end_writeback(struct folio *folio);
void folio_end_writeback_no_dropbehind(struct folio *folio);
void folio_end_dropbehind(struct folio *folio);
void folio_wait_stable(struct folio *folio);
void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
void __folio_cancel_dirty(struct folio *folio);
/*
 * Cancel the dirty state of @folio.  The out-of-line __folio_cancel_dirty()
 * is only called when the folio currently tests dirty.
 */
static inline void folio_cancel_dirty(struct folio *folio)
{
        /* Skip the atomic ops, locking, etc. for a folio that is already clean. */
        if (!folio_test_dirty(folio))
                return;

        __folio_cancel_dirty(folio);
}
bool folio_clear_dirty_for_io(struct folio *folio);
bool clear_page_dirty_for_io(struct page *page);
void folio_invalidate(struct folio *folio, size_t offset, size_t length);
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);

#ifdef CONFIG_MIGRATION
int filemap_migrate_folio(struct address_space *mapping, struct folio *dst,
                struct folio *src, enum migrate_mode mode);
#else
/*
 * Without CONFIG_MIGRATION the name expands to NULL.
 * NOTE(review): presumably so it can still be assigned to an operations
 * pointer unconditionally; confirm against the users.
 */
#define filemap_migrate_folio NULL
#endif
void folio_end_private_2(struct folio *folio);
void folio_wait_private_2(struct folio *folio);
int folio_wait_private_2_killable(struct folio *folio);

/*
 * Fault in userspace address range.
 */
size_t fault_in_writeable(char __user *uaddr, size_t size);
size_t fault_in_subpage_writeable(char __user *uaddr, size_t size);
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
size_t fault_in_readable(const char __user *uaddr, size_t size);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                pgoff_t index, gfp_t gfp);
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
                pgoff_t index, gfp_t gfp);
void filemap_remove_folio(struct folio *folio);
void __filemap_remove_folio(struct folio *folio, void *shadow);
void replace_page_cache_folio(struct folio *old, struct folio *new);
void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct folio_batch *fbatch);
bool filemap_release_folio(struct folio *folio, gfp_t gfp);
loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
                int whence);

/* Must be non-static for BPF error injection */
int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
                pgoff_t index, gfp_t gfp, void **shadowp);

bool filemap_range_has_writeback(struct address_space *mapping,
                                 loff_t start_byte, loff_t end_byte);

/**
 * filemap_range_needs_writeback - check if range potentially needs writeback
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback. Used by O_DIRECT
 * read/write with IOCB_NOWAIT, to see if the caller needs to do
 * filemap_write_and_wait_range() before proceeding.
 *
 * Return: %true if the caller should do filemap_write_and_wait_range() before
 * doing O_DIRECT to a page in this range, %false otherwise.
 */
static inline bool filemap_range_needs_writeback(struct address_space *mapping,
                                                 loff_t start_byte,
                                                 loff_t end_byte)
{
        /* An empty page cache cannot hold anything to write back. */
        if (!mapping->nrpages)
                return false;

        /*
         * Only do the range walk when the mapping carries at least one
         * dirty or writeback tag.
         */
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
            mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                return filemap_range_has_writeback(mapping, start_byte,
                                                   end_byte);

        return false;
}

/**
 * struct readahead_control - Describes a readahead request.
 *
 * A readahead request is for consecutive pages.  Filesystems which
 * implement the ->readahead method should call readahead_folio() or
 * __readahead_batch() in a loop and attempt to start reads into each
 * folio in the request.
 *
 * Most of the fields in this struct are private and should be accessed
 * by the functions below.
 *
 * @file: The file, used primarily by network filesystems for authentication.
 *          May be NULL if invoked internally by the filesystem.
 * @mapping: Readahead this filesystem object.
 * @ra: File readahead state.  May be NULL.
 */
struct readahead_control {
        struct file *file;
        struct address_space *mapping;
        struct file_ra_state *ra;
/* private: use the readahead_* accessors instead */
        /* current page index; returned by readahead_index() */
        pgoff_t _index;
        /* pages left in the request; returned by readahead_count() */
        unsigned int _nr_pages;
        /*
         * pages covered by the batch most recently handed out; consumed
         * (subtracted from _nr_pages, added to _index) on the next call to
         * __readahead_folio() / __readahead_batch()
         */
        unsigned int _batch_count;
        bool dropbehind;
        bool _workingset;
        unsigned long _pflags;
};

/*
 * Declare a struct readahead_control named @ractl with .file = @f,
 * .ra = @r, .mapping = @m and ._index = @i.  Members not named here get the
 * usual zero initialisation of a designated initializer.
 */
#define DEFINE_READAHEAD(ractl, f, r, m, i)                                \
        struct readahead_control ractl = {                                \
                .file = f,                                                \
                .mapping = m,                                                \
                .ra = r,                                                \
                ._index = i,                                                \
        }

/* SZ_128K expressed as a number of PAGE_SIZE pages. */
#define VM_READAHEAD_PAGES        (SZ_128K / PAGE_SIZE)

void page_cache_ra_unbounded(struct readahead_control *,
                unsigned long nr_to_read, unsigned long lookahead_count);
void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
void page_cache_async_ra(struct readahead_control *, struct folio *,
                unsigned long req_count);
void readahead_expand(struct readahead_control *ractl,
                      loff_t new_start, size_t new_len);

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * page_cache_sync_readahead() should be called when a cache miss happened:
 * it will submit the read.  The readahead logic may decide to piggyback more
 * pages onto the read request if access patterns suggest it will improve
 * performance.
 */
static inline
void page_cache_sync_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file, pgoff_t index,
                unsigned long req_count)
{
        /* Local readahead_control describing a request that starts at @index. */
        DEFINE_READAHEAD(ractl, file, ra, mapping, index);
        page_cache_sync_ra(&ractl, req_count);
}

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @folio: The folio which triggered the readahead call.
 * @req_count: Total number of pages being read by the caller.
 *
 * page_cache_async_readahead() should be called when a page is used which
 * is marked as PageReadahead; this is a marker to suggest that the application
 * has used up enough of the readahead window that we should start pulling in
 * more pages.
 */
static inline
void page_cache_async_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file,
                struct folio *folio, unsigned long req_count)
{
        /* The request starts at the index of the folio that triggered it. */
        DEFINE_READAHEAD(ractl, file, ra, mapping, folio->index);
        page_cache_async_ra(&ractl, folio, req_count);
}

/*
 * Advance @ractl past the batch handed out by the previous call and look up
 * the folio at the new position.
 *
 * Returns NULL (after resetting _batch_count to 0) once no pages remain in
 * the request; otherwise returns the entry loaded from the mapping's i_pages
 * xarray at _index.  No folio reference is dropped here; readahead_folio()
 * does the folio_put() on top of this helper.
 */
static inline struct folio *__readahead_folio(struct readahead_control *ractl)
{
        struct folio *folio;

        /* A batch can never be larger than what is left of the request. */
        BUG_ON(ractl->_batch_count > ractl->_nr_pages);
        /* Consume the pages handed out by the previous call. */
        ractl->_nr_pages -= ractl->_batch_count;
        ractl->_index += ractl->_batch_count;

        if (!ractl->_nr_pages) {
                ractl->_batch_count = 0;
                return NULL;
        }

        folio = xa_load(&ractl->mapping->i_pages, ractl->_index);
        /* The folio is expected to be locked at this point. */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        /* Remember how many pages this folio covers for the next call. */
        ractl->_batch_count = folio_nr_pages(folio);

        return folio;
}

/**
 * readahead_folio - Get the next folio to read.
 * @ractl: The current readahead request.
 *
 * Context: The folio is locked.  The caller should unlock the folio once
 * all I/O to that folio has completed.
 * Return: A pointer to the next folio, or %NULL if we are done.
 */
static inline struct folio *readahead_folio(struct readahead_control *ractl)
{
        struct folio *folio = __readahead_folio(ractl);

        if (folio)
                folio_put(folio);
        return folio;
}

/*
 * Collect the next batch of pages of a readahead request into @array.
 *
 * The previous batch is consumed first (_index advances and _nr_pages
 * shrinks by _batch_count).  The mapping's i_pages xarray is then walked from
 * _index up to the last index of the request, storing the first page of each
 * folio found until @array_sz entries have been filled.  _batch_count is
 * rebuilt as the total number of pages covered by the folios stored.
 *
 * Return: the number of entries written to @array.
 */
static inline unsigned int __readahead_batch(struct readahead_control *rac,
                struct page **array, unsigned int array_sz)
{
        unsigned int i = 0;
        XA_STATE(xas, &rac->mapping->i_pages, 0);
        struct folio *folio;

        /* A batch can never be larger than what is left of the request. */
        BUG_ON(rac->_batch_count > rac->_nr_pages);
        rac->_nr_pages -= rac->_batch_count;
        rac->_index += rac->_batch_count;
        rac->_batch_count = 0;

        /* Position the xarray cursor at the new start index. */
        xas_set(&xas, rac->_index);
        /* The walk runs inside an RCU read-side critical section. */
        rcu_read_lock();
        xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) {
                /* Skip entries for which xas_retry() asks for a retry. */
                if (xas_retry(&xas, folio))
                        continue;
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
                /* Only the folio's first page goes into the array. */
                array[i++] = folio_page(folio, 0);
                rac->_batch_count += folio_nr_pages(folio);
                /* Stop as soon as the caller's array is full. */
                if (i == array_sz)
                        break;
        }
        rcu_read_unlock();

        return i;
}

/**
 * readahead_pos - The byte offset into the file of this readahead request.
 * @rac: The readahead request.
 */
static inline loff_t readahead_pos(const struct readahead_control *rac)
{
        return (loff_t)rac->_index * PAGE_SIZE;
}

/**
 * readahead_length - The number of bytes in this readahead request.
 * @rac: The readahead request.
 */
static inline size_t readahead_length(const struct readahead_control *rac)
{
        /* Remaining page count, scaled to bytes. */
        size_t nr_pages = rac->_nr_pages;

        return nr_pages * PAGE_SIZE;
}

/**
 * readahead_index - The index of the first page in this readahead request.
 * @rac: The readahead request.
 */
static inline pgoff_t readahead_index(const struct readahead_control *rac)
{
        /* Read-only accessor for the private _index member. */
        return rac->_index;
}

/**
 * readahead_count - The number of pages in this readahead request.
 * @rac: The readahead request.
 */
static inline unsigned int readahead_count(const struct readahead_control *rac)
{
        /* Read-only accessor for the private _nr_pages member. */
        return rac->_nr_pages;
}

/**
 * readahead_batch_length - The number of bytes in the current batch.
 * @rac: The readahead request.
 */
static inline size_t readahead_batch_length(const struct readahead_control *rac)
{
        /* Pages in the current batch, scaled to bytes. */
        size_t nr_pages = rac->_batch_count;

        return nr_pages * PAGE_SIZE;
}

/*
 * Number of PAGE_SIZE pages needed to hold inode->i_size bytes, rounding a
 * partial last page up.
 */
static inline unsigned long dir_pages(const struct inode *inode)
{
        /* Bias by PAGE_SIZE - 1 so the shift below rounds up. */
        unsigned long biased = inode->i_size + PAGE_SIZE - 1;

        return biased >> PAGE_SHIFT;
}

/**
 * folio_mkwrite_check_truncate - check if folio was truncated
 * @folio: the folio to check
 * @inode: the inode to check the folio against
 *
 * Return: the number of bytes in the folio up to EOF,
 * or -EFAULT if the folio was truncated.
 */
static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio,
                                                   const struct inode *inode)
{
        loff_t isize = i_size_read(inode);
        /* index of the page that contains EOF */
        pgoff_t eof_index = isize >> PAGE_SHIFT;
        /* byte offset of EOF relative to the start of this folio */
        size_t eof_offset = offset_in_folio(folio, isize);

        /* a folio without a mapping is reported as truncated */
        if (!folio->mapping)
                return -EFAULT;

        /* the folio's last page lies before the EOF page: all of it is valid */
        if (folio_next_index(folio) - 1 < eof_index)
                return folio_size(folio);

        /* the folio starts beyond the EOF page */
        if (folio->index > eof_index)
                return -EFAULT;

        /* EOF sits at offset 0 within the folio: no valid bytes in it */
        if (!eof_offset)
                return -EFAULT;

        /* EOF falls inside the folio: only the bytes before it are valid */
        return eof_offset;
}

/**
 * i_blocks_per_folio - How many blocks fit in this folio.
 * @inode: The inode which contains the blocks.
 * @folio: The folio.
 *
 * If the block size is larger than the size of this folio, return zero.
 *
 * Context: The caller should hold a refcount on the folio to prevent it
 * from being split.
 * Return: The number of filesystem blocks covered by this folio.
 */
static inline
unsigned int i_blocks_per_folio(const struct inode *inode,
                                const struct folio *folio)
{
        /* Folio size in bytes divided by the block size (2^i_blkbits). */
        unsigned int nr_blocks = folio_size(folio) >> inode->i_blkbits;

        return nr_blocks;
}
#endif /* _LINUX_PAGEMAP_H */



























   10 




   10 






   10 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// SPDX-License-Identifier: GPL-2.0-only
/* xfrm4_tunnel.c: Generic IP tunnel transformer.
 *
 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
 */

#define pr_fmt(fmt) "IPsec: " fmt

#include <linux/skbuff.h>
#include <linux/module.h>
#include <net/xfrm.h>
#include <net/protocol.h>

/*
 * xfrm_type output hook for IPIP: grows the skb data area by the negated
 * network header offset and reports success.
 * NOTE(review): presumably the network header sits before skb->data here, so
 * the offset is negative and the push moves data back onto the header.
 * Confirm against the xfrm output path.
 */
static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
{
        int nh_offset = skb_network_offset(skb);

        skb_push(skb, -nh_offset);

        return 0;
}

/*
 * xfrm_type input hook for IPIP: returns the protocol field of the skb's
 * IPv4 header.
 */
static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
{
        int protocol = ip_hdr(skb)->protocol;

        return protocol;
}

/*
 * Validate and initialise an IPIP xfrm state.
 *
 * Only tunnel mode without encapsulation is accepted; anything else sets an
 * extack message and fails with -EINVAL.  On success the header length is
 * set to the size of an IPv4 header.
 */
static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        if (x->props.mode != XFRM_MODE_TUNNEL) {
                NL_SET_ERR_MSG(extack, "IPv4 tunnel can only be used with tunnel mode");
                goto invalid;
        }

        if (x->encap) {
                NL_SET_ERR_MSG(extack, "IPv4 tunnel is not compatible with encapsulation");
                goto invalid;
        }

        x->props.header_len = sizeof(struct iphdr);
        return 0;

invalid:
        return -EINVAL;
}

/*
 * Destructor hook for ipip_type.  Intentionally empty: ipip_init_state()
 * only sets a header length and takes no resources that need releasing.
 */
static void ipip_destroy(struct xfrm_state *x)
{
}

/* xfrm type descriptor for IPPROTO_IPIP, registered for AF_INET in ipip_init(). */
static const struct xfrm_type ipip_type = {
        .owner          = THIS_MODULE,
        .proto          = IPPROTO_IPIP,
        .init_state     = ipip_init_state,
        .destructor     = ipip_destroy,
        .input          = ipip_xfrm_rcv,
        .output         = ipip_output,
};

/*
 * Tunnel receive handler: hands the skb to xfrm4_rcv_spi() for IPPROTO_IPIP,
 * passing the IPv4 source address in the SPI argument.
 */
static int xfrm_tunnel_rcv(struct sk_buff *skb)
{
        int ret;

        ret = xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr);

        return ret;
}

/*
 * Tunnel error handler: does nothing with @skb or @info and always returns
 * -ENOENT.
 */
static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
{
        return -ENOENT;
}

/* Tunnel handler registered for AF_INET in ipip_init(). */
static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
        .handler        = xfrm_tunnel_rcv,
        .err_handler    = xfrm_tunnel_err,
        .priority       = 4,
};

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Tunnel handler registered for AF_INET6 in ipip_init(); same callbacks as
 * xfrm_tunnel_handler, with a different priority value.
 */
static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
        .handler        = xfrm_tunnel_rcv,
        .err_handler    = xfrm_tunnel_err,
        .priority       = 3,
};
#endif

/*
 * Module init: register the IPIP xfrm type, then the AF_INET tunnel handler
 * and, when IPv6 is enabled, the AF_INET6 one.  Whatever was registered
 * before a failing step is unregistered again, and every failure is reported
 * as -EAGAIN.
 */
static int __init ipip_init(void)
{
        if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }

        if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
                pr_info("%s: can't add xfrm handler for AF_INET\n", __func__);
                goto err_unregister_type;
        }
#if IS_ENABLED(CONFIG_IPV6)
        if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
                pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__);
                /* undo the AF_INET handler before dropping the type */
                xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
                goto err_unregister_type;
        }
#endif
        return 0;

err_unregister_type:
        xfrm_unregister_type(&ipip_type, AF_INET);
        return -EAGAIN;
}

/*
 * Module exit: undo ipip_init() in reverse order.  A failing handler
 * deregistration is only logged; teardown carries on regardless.
 */
static void __exit ipip_fini(void)
{
        int err;

#if IS_ENABLED(CONFIG_IPV6)
        err = xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6);
        if (err)
                pr_info("%s: can't remove xfrm handler for AF_INET6\n",
                        __func__);
#endif
        err = xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
        if (err)
                pr_info("%s: can't remove xfrm handler for AF_INET\n",
                        __func__);

        xfrm_unregister_type(&ipip_type, AF_INET);
}

/* Module entry/exit points and metadata. */
module_init(ipip_init);
module_exit(ipip_fini);
MODULE_DESCRIPTION("IPv4 XFRM tunnel driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_NAMESPACE_H
#define _LINUX_CGROUP_NAMESPACE_H

#include <linux/ns_common.h>

/*
 * A cgroup namespace.  References are taken with get_cgroup_ns() and
 * dropped with put_cgroup_ns(), which calls free_cgroup_ns() when
 * ns_ref_put() reports that the namespace should be freed.
 */
struct cgroup_namespace {
        /* embedded common namespace part; to_cg_ns() maps it back to this struct */
        struct ns_common        ns;
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        /* NOTE(review): presumably the css_set acting as this namespace's root; confirm */
        struct css_set          *root_cset;
};

extern struct cgroup_namespace init_cgroup_ns;

#ifdef CONFIG_CGROUPS

/* Map an embedded ns_common back to the cgroup_namespace that contains it. */
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
        struct cgroup_namespace *cg_ns;

        cg_ns = container_of(ns, struct cgroup_namespace, ns);

        return cg_ns;
}

void free_cgroup_ns(struct cgroup_namespace *ns);

struct cgroup_namespace *copy_cgroup_ns(u64 flags,
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns);

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns);

/* Take a reference on @ns via ns_ref_inc(). */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
        ns_ref_inc(ns);
}

/*
 * Drop a reference on @ns; free_cgroup_ns() is called when ns_ref_put()
 * returns true.
 */
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns_ref_put(ns))
                return;

        free_cgroup_ns(ns);
}

#else /* !CONFIG_CGROUPS */

/*
 * Stubs used when CONFIG_CGROUPS is not set: freeing and reference counting
 * do nothing, and copying hands back the old namespace unchanged.
 */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
static inline struct cgroup_namespace *
copy_cgroup_ns(u64 flags, struct user_namespace *user_ns,
               struct cgroup_namespace *old_ns)
{
        /* @flags and @user_ns are ignored in this configuration. */
        return old_ns;
}

static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }

#endif /* !CONFIG_CGROUPS */

#endif /* _LINUX_CGROUP_NAMESPACE_H */




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions of this file
 * Copyright(c) 2016-2017 Intel Deutschland GmbH
 * Copyright (C) 2018, 2021-2025 Intel Corporation
 */
#ifndef __CFG80211_RDEV_OPS
#define __CFG80211_RDEV_OPS

#include <linux/rtnetlink.h>
#include <net/cfg80211.h>
#include "core.h"
#include "trace.h"

/*
 * rdev_suspend - call the driver's suspend op with tracing.
 *
 * Calls trace_rdev_suspend(), then rdev->ops->suspend() (not NULL-checked
 * here), traces the integer result and returns it unchanged.
 */
static inline int rdev_suspend(struct cfg80211_registered_device *rdev,
                               struct cfg80211_wowlan *wowlan)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_suspend(wiphy, wowlan);
        err = rdev->ops->suspend(wiphy, wowlan);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_resume - call the driver's resume op with tracing.
 *
 * Calls trace_rdev_resume(), then rdev->ops->resume() (not NULL-checked
 * here), traces the integer result and returns it unchanged.
 */
static inline int rdev_resume(struct cfg80211_registered_device *rdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_resume(wiphy);
        err = rdev->ops->resume(wiphy);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_set_wakeup - call the driver's set_wakeup op with tracing.
 *
 * Passes @enabled to rdev->ops->set_wakeup() (not NULL-checked here),
 * bracketed by trace_rdev_set_wakeup() and trace_rdev_return_void().
 */
static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev,
                                   bool enabled)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_set_wakeup(wiphy, enabled);
        rdev->ops->set_wakeup(wiphy, enabled);
        trace_rdev_return_void(wiphy);
}

/*
 * rdev_add_virtual_intf - call the driver's add_virtual_intf op with tracing.
 *
 * The entry trace records @name and @type; the op additionally receives
 * @name_assign_type and @params. Whatever wireless_dev pointer the op
 * returns is traced with trace_rdev_return_wdev() and handed back unchanged.
 * The op pointer is not NULL-checked here.
 */
static inline struct wireless_dev
*rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name,
                       unsigned char name_assign_type,
                       enum nl80211_iftype type,
                       struct vif_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        struct wireless_dev *wdev;

        trace_rdev_add_virtual_intf(wiphy, name, type);
        wdev = rdev->ops->add_virtual_intf(wiphy, name, name_assign_type,
                                           type, params);
        trace_rdev_return_wdev(wiphy, wdev);

        return wdev;
}

/*
 * rdev_del_virtual_intf - call the driver's del_virtual_intf op with tracing.
 *
 * Calls rdev->ops->del_virtual_intf() for @wdev (op not NULL-checked here),
 * traces the integer result and returns it unchanged.
 */
static inline int
rdev_del_virtual_intf(struct cfg80211_registered_device *rdev,
                      struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_virtual_intf(wiphy, wdev);
        err = rdev->ops->del_virtual_intf(wiphy, wdev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_change_virtual_intf - call the driver's change_virtual_intf op
 * with tracing.
 *
 * The entry trace records @dev and @type; the op also receives @params.
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_change_virtual_intf(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, enum nl80211_iftype type,
                         struct vif_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_virtual_intf(wiphy, dev, type);
        err = rdev->ops->change_virtual_intf(wiphy, dev, type, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_add_key - call the driver's add_key op with tracing.
 *
 * @params is dereferenced (params->mode) for the entry trace before the op
 * runs, so it must be a valid pointer. The op pointer is not NULL-checked
 * here. Returns the op's result.
 */
static inline int rdev_add_key(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, int link_id,
                               u8 key_index, bool pairwise, const u8 *mac_addr,
                               struct key_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_key(wiphy, netdev, link_id, key_index, pairwise,
                           mac_addr, params->mode);
        err = rdev->ops->add_key(wiphy, netdev, link_id, key_index,
                                 pairwise, mac_addr, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_get_key - call the driver's get_key op with tracing.
 *
 * @cookie and @callback are forwarded to the op untouched; they are not
 * part of the entry trace. The op pointer is not NULL-checked here.
 * Returns the op's result.
 */
static inline int
rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev,
             int link_id, u8 key_index, bool pairwise, const u8 *mac_addr,
             void *cookie,
             void (*callback)(void *cookie, struct key_params*))
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_key(wiphy, netdev, link_id, key_index, pairwise,
                           mac_addr);
        err = rdev->ops->get_key(wiphy, netdev, link_id, key_index,
                                 pairwise, mac_addr, cookie, callback);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_del_key - call the driver's del_key op with tracing.
 *
 * All arguments are both traced and forwarded to rdev->ops->del_key()
 * (not NULL-checked here). Returns the op's result.
 */
static inline int rdev_del_key(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, int link_id,
                               u8 key_index, bool pairwise, const u8 *mac_addr)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_key(wiphy, netdev, link_id, key_index, pairwise,
                           mac_addr);
        err = rdev->ops->del_key(wiphy, netdev, link_id, key_index,
                                 pairwise, mac_addr);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_set_default_key - call the driver's set_default_key op with tracing.
 *
 * All arguments are both traced and forwarded to the op, which is not
 * NULL-checked here. Returns the op's result.
 */
static inline int
rdev_set_default_key(struct cfg80211_registered_device *rdev,
                     struct net_device *netdev, int link_id, u8 key_index,
                     bool unicast, bool multicast)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_key(wiphy, netdev, link_id, key_index,
                                   unicast, multicast);
        err = rdev->ops->set_default_key(wiphy, netdev, link_id,
                                         key_index, unicast, multicast);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_set_default_mgmt_key - call the driver's set_default_mgmt_key op
 * with tracing.
 *
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev,
                          struct net_device *netdev, int link_id, u8 key_index)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_mgmt_key(wiphy, netdev, link_id, key_index);
        err = rdev->ops->set_default_mgmt_key(wiphy, netdev, link_id,
                                              key_index);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_set_default_beacon_key - call the driver's set_default_beacon_key
 * op with tracing.
 *
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev,
                            struct net_device *netdev, int link_id,
                            u8 key_index)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_beacon_key(wiphy, netdev, link_id, key_index);
        err = rdev->ops->set_default_beacon_key(wiphy, netdev, link_id,
                                                key_index);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_start_ap - call the driver's start_ap op with tracing.
 *
 * @settings is passed to both the entry trace and the op. The op pointer
 * is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_start_ap(struct cfg80211_registered_device *rdev,
                                struct net_device *dev,
                                struct cfg80211_ap_settings *settings)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_ap(wiphy, dev, settings);
        err = rdev->ops->start_ap(wiphy, dev, settings);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_change_beacon - call the driver's change_beacon op with tracing.
 *
 * @info is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev,
                                     struct net_device *dev,
                                     struct cfg80211_ap_update *info)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_beacon(wiphy, dev, info);
        err = rdev->ops->change_beacon(wiphy, dev, info);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_stop_ap - call the driver's stop_ap op with tracing.
 *
 * @link_id is passed to both the entry trace and the op. The op pointer
 * is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, unsigned int link_id)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_stop_ap(wiphy, dev, link_id);
        err = rdev->ops->stop_ap(wiphy, dev, link_id);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_add_station - call the driver's add_station op with tracing.
 *
 * @mac and @params are passed to both the entry trace and the op. The op
 * pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_add_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev, u8 *mac,
                                   struct station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_station(wiphy, dev, mac, params);
        err = rdev->ops->add_station(wiphy, dev, mac, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_del_station - call the driver's del_station op with tracing.
 *
 * @params is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_del_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev,
                                   struct station_del_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_station(wiphy, dev, params);
        err = rdev->ops->del_station(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_change_station - call the driver's change_station op with tracing.
 *
 * @mac and @params are passed to both the entry trace and the op. The op
 * pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_change_station(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev, u8 *mac,
                                      struct station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_station(wiphy, dev, mac, params);
        err = rdev->ops->change_station(wiphy, dev, mac, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_get_station - call the driver's get_station op with tracing.
 *
 * @sinfo is handed to the op and then, together with the result, to
 * trace_rdev_return_int_station_info() regardless of the result value.
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_get_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev, const u8 *mac,
                                   struct station_info *sinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_station(wiphy, dev, mac);
        err = rdev->ops->get_station(wiphy, dev, mac, sinfo);
        trace_rdev_return_int_station_info(wiphy, err, sinfo);

        return err;
}

/*
 * rdev_dump_station - call the driver's dump_station op with tracing.
 *
 * @idx and @mac are traced on entry; @sinfo is handed to the op and then,
 * together with the result, to trace_rdev_return_int_station_info().
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_dump_station(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, int idx, u8 *mac,
                                    struct station_info *sinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_station(wiphy, dev, idx, mac);
        err = rdev->ops->dump_station(wiphy, dev, idx, mac, sinfo);
        trace_rdev_return_int_station_info(wiphy, err, sinfo);

        return err;
}

/*
 * rdev_add_mpath - call the driver's add_mpath op with tracing.
 *
 * @dst and @next_hop are passed to both the entry trace and the op. The
 * op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst, u8 *next_hop)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->add_mpath(wiphy, dev, dst, next_hop);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_del_mpath - call the driver's del_mpath op with tracing.
 *
 * @dst is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_del_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_mpath(wiphy, dev, dst);
        err = rdev->ops->del_mpath(wiphy, dev, dst);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_change_mpath - call the driver's change_mpath op with tracing.
 *
 * @dst and @next_hop are passed to both the entry trace and the op. The
 * op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, u8 *dst,
                                    u8 *next_hop)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->change_mpath(wiphy, dev, dst, next_hop);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_get_mpath - call the driver's get_mpath op with tracing.
 *
 * @pinfo is handed to the op and then, together with the result, to
 * trace_rdev_return_int_mpath_info() regardless of the result value.
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst, u8 *next_hop,
                                 struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->get_mpath(wiphy, dev, dst, next_hop, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * rdev_get_mpp - call the driver's get_mpp op with tracing.
 *
 * @pinfo is handed to the op and then, together with the result, to
 * trace_rdev_return_int_mpath_info(). The op pointer is not NULL-checked
 * here. Returns the op's result.
 */
static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, u8 *dst, u8 *mpp,
                               struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mpp(wiphy, dev, dst, mpp);
        err = rdev->ops->get_mpp(wiphy, dev, dst, mpp, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * rdev_dump_mpath - call the driver's dump_mpath op with tracing.
 *
 * @idx, @dst and @next_hop are traced on entry; @pinfo is handed to the op
 * and then, with the result, to trace_rdev_return_int_mpath_info().
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev, int idx, u8 *dst,
                                  u8 *next_hop, struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_mpath(wiphy, dev, idx, dst, next_hop);
        err = rdev->ops->dump_mpath(wiphy, dev, idx, dst, next_hop, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * rdev_dump_mpp - call the driver's dump_mpp op with tracing.
 *
 * @idx, @dst and @mpp are traced on entry; @pinfo is handed to the op and
 * then, with the result, to trace_rdev_return_int_mpath_info().
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev,
                                struct net_device *dev, int idx, u8 *dst,
                                u8 *mpp, struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_mpp(wiphy, dev, idx, dst, mpp);
        err = rdev->ops->dump_mpp(wiphy, dev, idx, dst, mpp, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * rdev_get_mesh_config - call the driver's get_mesh_config op with tracing.
 *
 * @conf is handed to the op and then, together with the result, to
 * trace_rdev_return_int_mesh_config() regardless of the result value.
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_get_mesh_config(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, struct mesh_config *conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mesh_config(wiphy, dev);
        err = rdev->ops->get_mesh_config(wiphy, dev, conf);
        trace_rdev_return_int_mesh_config(wiphy, err, conf);

        return err;
}

/*
 * rdev_update_mesh_config - call the driver's update_mesh_config op
 * with tracing.
 *
 * @mask and @nconf are passed to both the entry trace and the op. The op
 * pointer is not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_update_mesh_config(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u32 mask,
                        const struct mesh_config *nconf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_mesh_config(wiphy, dev, mask, nconf);
        err = rdev->ops->update_mesh_config(wiphy, dev, mask, nconf);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_join_mesh - call the driver's join_mesh op with tracing.
 *
 * @conf and @setup are passed to both the entry trace and the op. The op
 * pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev,
                                 const struct mesh_config *conf,
                                 const struct mesh_setup *setup)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_mesh(wiphy, dev, conf, setup);
        err = rdev->ops->join_mesh(wiphy, dev, conf, setup);
        trace_rdev_return_int(wiphy, err);

        return err;
}


/*
 * rdev_leave_mesh - call the driver's leave_mesh op with tracing.
 *
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_mesh(wiphy, dev);
        err = rdev->ops->leave_mesh(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_join_ocb - call the driver's join_ocb op with tracing.
 *
 * @setup is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev,
                                struct net_device *dev,
                                struct ocb_setup *setup)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_ocb(wiphy, dev, setup);
        err = rdev->ops->join_ocb(wiphy, dev, setup);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_leave_ocb - call the driver's leave_ocb op with tracing.
 *
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_leave_ocb(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_ocb(wiphy, dev);
        err = rdev->ops->leave_ocb(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_change_bss - call the driver's change_bss op with tracing.
 *
 * @params is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_change_bss(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev,
                                  struct bss_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_bss(wiphy, dev, params);
        err = rdev->ops->change_bss(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_inform_bss - call the driver's optional inform_bss op with tracing.
 *
 * Unlike most wrappers in this file the op is optional: it is invoked only
 * when rdev->ops->inform_bss is set. The entry and return traces are
 * emitted either way.
 */
static inline void rdev_inform_bss(struct cfg80211_registered_device *rdev,
                                   struct cfg80211_bss *bss,
                                   const struct cfg80211_bss_ies *ies,
                                   void *drv_data)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_inform_bss(wiphy, bss);

        if (rdev->ops->inform_bss)
                rdev->ops->inform_bss(wiphy, bss, ies, drv_data);

        trace_rdev_return_void(wiphy);
}

/*
 * rdev_set_txq_params - call the driver's set_txq_params op with tracing.
 *
 * @params is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_set_txq_params(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct ieee80211_txq_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_txq_params(wiphy, dev, params);
        err = rdev->ops->set_txq_params(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_libertas_set_mesh_channel - call the driver's
 * libertas_set_mesh_channel op with tracing.
 *
 * @chan is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev,
                               struct net_device *dev,
                               struct ieee80211_channel *chan)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_libertas_set_mesh_channel(wiphy, dev, chan);
        err = rdev->ops->libertas_set_mesh_channel(wiphy, dev, chan);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_set_monitor_channel - call the driver's set_monitor_channel op
 * with tracing.
 *
 * @chandef is passed to both the entry trace and the op. The op pointer
 * is not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_set_monitor_channel(struct cfg80211_registered_device *rdev,
                         struct net_device *dev,
                         struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_monitor_channel(wiphy, dev, chandef);
        err = rdev->ops->set_monitor_channel(wiphy, dev, chandef);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_scan - call the driver's scan op with tracing.
 *
 * A request that carries an ssids pointer but a zero n_ssids count is
 * rejected up front: it triggers a one-time warning and -EINVAL is
 * returned without tracing or calling the driver. Otherwise the internal
 * request is traced and its embedded 'req' member is handed to
 * rdev->ops->scan() (not NULL-checked here), whose result is returned.
 */
static inline int rdev_scan(struct cfg80211_registered_device *rdev,
                            struct cfg80211_scan_request_int *request)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        if (WARN_ON_ONCE(!request->req.n_ssids && request->req.ssids))
                return -EINVAL;

        trace_rdev_scan(wiphy, request);
        err = rdev->ops->scan(wiphy, &request->req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_abort_scan - call the driver's abort_scan op with tracing.
 *
 * Calls rdev->ops->abort_scan() for @wdev (op not NULL-checked here),
 * bracketed by trace_rdev_abort_scan() and trace_rdev_return_void().
 */
static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev,
                                   struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_abort_scan(wiphy, wdev);
        rdev->ops->abort_scan(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * rdev_auth - call the driver's auth op with tracing.
 *
 * @req is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_auth(struct cfg80211_registered_device *rdev,
                            struct net_device *dev,
                            struct cfg80211_auth_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_auth(wiphy, dev, req);
        err = rdev->ops->auth(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_assoc - call the driver's assoc op with tracing.
 *
 * @req is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_assoc(struct cfg80211_registered_device *rdev,
                             struct net_device *dev,
                             struct cfg80211_assoc_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_assoc(wiphy, dev, req);
        err = rdev->ops->assoc(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_deauth - call the driver's deauth op with tracing.
 *
 * @req is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_deauth(struct cfg80211_registered_device *rdev,
                              struct net_device *dev,
                              struct cfg80211_deauth_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_deauth(wiphy, dev, req);
        err = rdev->ops->deauth(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_disassoc - call the driver's disassoc op with tracing.
 *
 * @req is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_disassoc(struct cfg80211_registered_device *rdev,
                                struct net_device *dev,
                                struct cfg80211_disassoc_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_disassoc(wiphy, dev, req);
        err = rdev->ops->disassoc(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_connect - call the driver's connect op with tracing.
 *
 * @sme is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_connect(struct cfg80211_registered_device *rdev,
                               struct net_device *dev,
                               struct cfg80211_connect_params *sme)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_connect(wiphy, dev, sme);
        err = rdev->ops->connect(wiphy, dev, sme);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_update_connect_params - call the driver's update_connect_params op
 * with tracing.
 *
 * @sme and @changed are passed to both the entry trace and the op. The op
 * pointer is not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_update_connect_params(struct cfg80211_registered_device *rdev,
                           struct net_device *dev,
                           struct cfg80211_connect_params *sme, u32 changed)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_connect_params(wiphy, dev, sme, changed);
        err = rdev->ops->update_connect_params(wiphy, dev, sme, changed);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_disconnect - call the driver's disconnect op with tracing.
 *
 * @reason_code is passed to both the entry trace and the op. The op
 * pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_disconnect(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev, u16 reason_code)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_disconnect(wiphy, dev, reason_code);
        err = rdev->ops->disconnect(wiphy, dev, reason_code);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_join_ibss - call the driver's join_ibss op with tracing.
 *
 * @params is passed to both the entry trace and the op. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int rdev_join_ibss(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev,
                                 struct cfg80211_ibss_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_ibss(wiphy, dev, params);
        err = rdev->ops->join_ibss(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_leave_ibss - call the driver's leave_ibss op with tracing.
 *
 * The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_ibss(wiphy, dev);
        err = rdev->ops->leave_ibss(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_set_wiphy_params - call the driver's optional set_wiphy_params op
 * with tracing.
 *
 * The op is optional here: when rdev->ops->set_wiphy_params is not set the
 * driver is not called and -EOPNOTSUPP is returned. The entry and return
 * traces are emitted in both cases.
 */
static inline int
rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, int radio_idx,
                      u32 changed)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err = -EOPNOTSUPP;

        trace_rdev_set_wiphy_params(wiphy, radio_idx, changed);

        if (rdev->ops->set_wiphy_params)
                err = rdev->ops->set_wiphy_params(wiphy, radio_idx, changed);

        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_set_tx_power - call the driver's set_tx_power op with tracing.
 *
 * @radio_idx, @type and @mbm are passed to both the entry trace and the
 * op. The op pointer is not NULL-checked here. Returns the op's result.
 */
static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev, int radio_idx,
                                    enum nl80211_tx_power_setting type,
                                    int mbm)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_tx_power(wiphy, wdev, radio_idx, type, mbm);
        err = rdev->ops->set_tx_power(wiphy, wdev, radio_idx, type, mbm);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_get_tx_power - call the driver's get_tx_power op with tracing.
 *
 * @dbm is an output pointer handed to the op. The op pointer is not
 * NULL-checked here. Returns the op's result.
 */
static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev, int radio_idx,
                                    unsigned int link_id, int *dbm)
{
        int ret;
        trace_rdev_get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id);
        ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id,
                                      dbm);
        /*
         * *dbm is read for the trace regardless of ret.
         * NOTE(review): if the op can fail without writing *dbm, this reads
         * whatever the caller left there - confirm callers initialize it.
         */
        trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm);
        return ret;
}

/*
 * rdev_set_multicast_to_unicast - call the driver's
 * set_multicast_to_unicast op with tracing.
 *
 * @enabled is passed to both the entry trace and the op. The op pointer
 * is not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
                              struct net_device *dev,
                              const bool enabled)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_multicast_to_unicast(wiphy, dev, enabled);
        err = rdev->ops->set_multicast_to_unicast(wiphy, dev, enabled);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * rdev_get_txq_stats - call the driver's get_txq_stats op with tracing.
 *
 * @txqstats is handed to the op only; it is not traced. The op pointer is
 * not NULL-checked here. Returns the op's result.
 */
static inline int
rdev_get_txq_stats(struct cfg80211_registered_device *rdev,
                   struct wireless_dev *wdev,
                   struct cfg80211_txq_stats *txqstats)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_txq_stats(wiphy, wdev);
        err = rdev->ops->get_txq_stats(wiphy, wdev, txqstats);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Traced wrapper for rdev->ops->rfkill_poll; the callback is invoked
 * unconditionally and has no return value.
 */
static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_rfkill_poll(wiphy);
        rdev->ops->rfkill_poll(wiphy);
        trace_rdev_return_void(wiphy);
}


#ifdef CONFIG_NL80211_TESTMODE
/*
 * Traced wrapper for rdev->ops->testmode_cmd.  @data and @len are forwarded
 * to the callback only; they are not traced.
 */
static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev,
                                    void *data, int len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_testmode_cmd(wiphy, wdev);
        rc = rdev->ops->testmode_cmd(wiphy, wdev, data, len);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->testmode_dump.  Only the wiphy is recorded
 * in the entry tracepoint; the remaining arguments go straight to the
 * callback.
 */
static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev,
                                     struct sk_buff *skb,
                                     struct netlink_callback *cb, void *data,
                                     int len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_testmode_dump(wiphy);
        rc = rdev->ops->testmode_dump(wiphy, skb, cb, data, len);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}
#endif

/*
 * Traced wrapper for rdev->ops->set_bitrate_mask: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev,
                      struct net_device *dev, unsigned int link_id,
                      const u8 *peer,
                      const struct cfg80211_bitrate_mask *mask)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_bitrate_mask(wiphy, dev, link_id, peer, mask);
        rc = rdev->ops->set_bitrate_mask(wiphy, dev, link_id, peer, mask);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->dump_survey.  On a non-negative result the
 * return tracepoint also records @info; on a negative result only the code
 * is traced.
 */
static inline int rdev_dump_survey(struct cfg80211_registered_device *rdev,
                                   struct net_device *netdev, int idx,
                                   struct survey_info *info)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_dump_survey(wiphy, netdev, idx);
        rc = rdev->ops->dump_survey(wiphy, netdev, idx, info);
        if (rc >= 0)
                trace_rdev_return_int_survey_info(wiphy, rc, info);
        else
                trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_pmksa: entry tracepoint, unconditional
 * callback invocation, return tracepoint.
 */
static inline int rdev_set_pmksa(struct cfg80211_registered_device *rdev,
                                 struct net_device *netdev,
                                 struct cfg80211_pmksa *pmksa)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_pmksa(wiphy, netdev, pmksa);
        rc = rdev->ops->set_pmksa(wiphy, netdev, pmksa);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->del_pmksa: entry tracepoint, unconditional
 * callback invocation, return tracepoint.
 */
static inline int rdev_del_pmksa(struct cfg80211_registered_device *rdev,
                                 struct net_device *netdev,
                                 struct cfg80211_pmksa *pmksa)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_del_pmksa(wiphy, netdev, pmksa);
        rc = rdev->ops->del_pmksa(wiphy, netdev, pmksa);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->flush_pmksa: entry tracepoint, unconditional
 * callback invocation, return tracepoint.
 */
static inline int rdev_flush_pmksa(struct cfg80211_registered_device *rdev,
                                   struct net_device *netdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_flush_pmksa(wiphy, netdev);
        rc = rdev->ops->flush_pmksa(wiphy, netdev);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->remain_on_channel.  *cookie is dereferenced
 * for the return tracepoint regardless of the callback's result, so @cookie
 * must be a valid pointer.
 */
static inline int
rdev_remain_on_channel(struct cfg80211_registered_device *rdev,
                       struct wireless_dev *wdev,
                       struct ieee80211_channel *chan,
                       unsigned int duration, u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_remain_on_channel(wiphy, wdev, chan, duration);
        rc = rdev->ops->remain_on_channel(wiphy, wdev, chan, duration,
                                          cookie);
        trace_rdev_return_int_cookie(wiphy, rc, *cookie);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->cancel_remain_on_channel: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_cancel_remain_on_channel(wiphy, wdev, cookie);
        rc = rdev->ops->cancel_remain_on_channel(wiphy, wdev, cookie);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->mgmt_tx.  *cookie is dereferenced for the
 * return tracepoint regardless of the callback's result, so @cookie must be
 * a valid pointer.
 */
static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev,
                               struct wireless_dev *wdev,
                               struct cfg80211_mgmt_tx_params *params,
                               u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_mgmt_tx(wiphy, wdev, params);
        rc = rdev->ops->mgmt_tx(wiphy, wdev, params, cookie);
        trace_rdev_return_int_cookie(wiphy, rc, *cookie);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->tx_control_port.  @cookie may be NULL: in
 * that case only the return code is traced, otherwise the cookie value is
 * traced alongside it.
 */
static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev,
                                       struct net_device *dev,
                                       const void *buf, size_t len,
                                       const u8 *dest, __be16 proto,
                                       const bool noencrypt, int link,
                                       u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_tx_control_port(wiphy, dev, buf, len, dest, proto,
                                   noencrypt, link);
        rc = rdev->ops->tx_control_port(wiphy, dev, buf, len, dest, proto,
                                        noencrypt, link, cookie);
        if (!cookie)
                trace_rdev_return_int(wiphy, rc);
        else
                trace_rdev_return_int_cookie(wiphy, rc, *cookie);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->mgmt_tx_cancel_wait: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev,
                         struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_mgmt_tx_cancel_wait(wiphy, wdev, cookie);
        rc = rdev->ops->mgmt_tx_cancel_wait(wiphy, wdev, cookie);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_power_mgmt: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_set_power_mgmt(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev, bool enabled,
                                      int timeout)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_power_mgmt(wiphy, dev, enabled, timeout);
        rc = rdev->ops->set_power_mgmt(wiphy, dev, enabled, timeout);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_cqm_rssi_config: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, s32 rssi_thold, u32 rssi_hyst)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_cqm_rssi_config(wiphy, dev, rssi_thold, rssi_hyst);
        rc = rdev->ops->set_cqm_rssi_config(wiphy, dev, rssi_thold,
                                            rssi_hyst);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_cqm_rssi_range_config: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, s32 low, s32 high)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_cqm_rssi_range_config(wiphy, dev, low, high);
        rc = rdev->ops->set_cqm_rssi_range_config(wiphy, dev, low, high);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_cqm_txe_config: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u32 rate, u32 pkts, u32 intvl)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_cqm_txe_config(wiphy, dev, rate, pkts, intvl);
        rc = rdev->ops->set_cqm_txe_config(wiphy, dev, rate, pkts, intvl);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->update_mgmt_frame_registrations
 * callback.  Annotated with might_sleep(); both tracepoints fire even when
 * the callback is not set.
 */
static inline void
rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev,
                                     struct wireless_dev *wdev,
                                     struct mgmt_frame_regs *upd)
{
        struct wiphy *wiphy = &rdev->wiphy;

        might_sleep();

        trace_rdev_update_mgmt_frame_registrations(wiphy, wdev, upd);
        if (rdev->ops->update_mgmt_frame_registrations)
                rdev->ops->update_mgmt_frame_registrations(wiphy, wdev, upd);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for rdev->ops->set_antenna.
 *
 * @rdev:      registered device whose callback is invoked
 * @radio_idx: radio index, recorded in the entry tracepoint and forwarded to
 *             the callback
 * @tx_ant:    TX antenna value passed through to the callback
 * @rx_ant:    RX antenna value passed through to the callback
 *
 * The callback is invoked unconditionally.  @radio_idx is forwarded rather
 * than a hard-coded -1, so the callback sees the same value the tracepoint
 * records, matching rdev_get_antenna().  Callers that pass -1 are unaffected.
 *
 * Returns the callback's result, which is also recorded in the return
 * tracepoint.
 */
static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev,
                                   int radio_idx, u32 tx_ant, u32 rx_ant)
{
        int ret;

        trace_rdev_set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant);
        ret = rdev->ops->set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant);
        trace_rdev_return_int(&rdev->wiphy, ret);
        return ret;
}

/*
 * Traced wrapper for rdev->ops->get_antenna.  When the callback returns 0
 * the return tracepoint also records *tx_ant and *rx_ant; on any non-zero
 * result only the code is traced and the outputs are not read.
 */
static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev,
                                   int radio_idx, u32 *tx_ant, u32 *rx_ant)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_get_antenna(wiphy, radio_idx);
        rc = rdev->ops->get_antenna(wiphy, radio_idx, tx_ant, rx_ant);
        if (!rc)
                trace_rdev_return_int_tx_rx(wiphy, rc, *tx_ant, *rx_ant);
        else
                trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->sched_scan_start.  The entry tracepoint
 * records request->reqid, so @request must be non-NULL.
 */
static inline int
rdev_sched_scan_start(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct cfg80211_sched_scan_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_sched_scan_start(wiphy, dev, request->reqid);
        rc = rdev->ops->sched_scan_start(wiphy, dev, request);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->sched_scan_stop: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev,
                                       struct net_device *dev, u64 reqid)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_sched_scan_stop(wiphy, dev, reqid);
        rc = rdev->ops->sched_scan_stop(wiphy, dev, reqid);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_rekey_data.  @data is passed to the
 * callback but is not included in the entry tracepoint.
 */
static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct cfg80211_gtk_rekey_data *data)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_rekey_data(wiphy, dev);
        rc = rdev->ops->set_rekey_data(wiphy, dev, data);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->tdls_mgmt.  All arguments are recorded in
 * the entry tracepoint and forwarded unchanged to the callback.
 */
static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *peer,
                                 int link_id, u8 action_code,
                                 u8 dialog_token, u16 status_code,
                                 u32 peer_capability, bool initiator,
                                 const u8 *buf, size_t len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_tdls_mgmt(wiphy, dev, peer, link_id, action_code,
                             dialog_token, status_code, peer_capability,
                             initiator, buf, len);
        rc = rdev->ops->tdls_mgmt(wiphy, dev, peer, link_id, action_code,
                                  dialog_token, status_code, peer_capability,
                                  initiator, buf, len);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->tdls_oper: entry tracepoint, unconditional
 * callback invocation, return tracepoint.
 */
static inline int rdev_tdls_oper(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *peer,
                                 enum nl80211_tdls_operation oper)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_tdls_oper(wiphy, dev, peer, oper);
        rc = rdev->ops->tdls_oper(wiphy, dev, peer, oper);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->probe_client.  *cookie is dereferenced for
 * the return tracepoint regardless of the callback's result, so @cookie must
 * be a valid pointer.
 */
static inline int rdev_probe_client(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, const u8 *peer,
                                    u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_probe_client(wiphy, dev, peer);
        rc = rdev->ops->probe_client(wiphy, dev, peer, cookie);
        trace_rdev_return_int_cookie(wiphy, rc, *cookie);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_noack_map: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev,
                                     struct net_device *dev, u16 noack_map)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_noack_map(wiphy, dev, noack_map);
        rc = rdev->ops->set_noack_map(wiphy, dev, noack_map);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->get_channel.  The return tracepoint is given
 * both the result code and @chandef, whatever the result.
 */
static inline int
rdev_get_channel(struct cfg80211_registered_device *rdev,
                 struct wireless_dev *wdev,
                 unsigned int link_id,
                 struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_get_channel(wiphy, wdev, link_id);
        rc = rdev->ops->get_channel(wiphy, wdev, link_id, chandef);
        trace_rdev_return_chandef(wiphy, rc, chandef);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->start_p2p_device: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_start_p2p_device(struct cfg80211_registered_device *rdev,
                                        struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_start_p2p_device(wiphy, wdev);
        rc = rdev->ops->start_p2p_device(wiphy, wdev);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->stop_p2p_device; the callback is invoked
 * unconditionally and has no return value.
 */
static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev,
                                        struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_stop_p2p_device(wiphy, wdev);
        rdev->ops->stop_p2p_device(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for rdev->ops->start_nan: entry tracepoint, unconditional
 * callback invocation, return tracepoint.
 */
static inline int rdev_start_nan(struct cfg80211_registered_device *rdev,
                                 struct wireless_dev *wdev,
                                 struct cfg80211_nan_conf *conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_start_nan(wiphy, wdev, conf);
        rc = rdev->ops->start_nan(wiphy, wdev, conf);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->stop_nan; the callback is invoked
 * unconditionally and has no return value.
 */
static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev,
                                 struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_stop_nan(wiphy, wdev);
        rdev->ops->stop_nan(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for rdev->ops->add_nan_func: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_add_nan_func(struct cfg80211_registered_device *rdev,
                  struct wireless_dev *wdev,
                  struct cfg80211_nan_func *nan_func)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_add_nan_func(wiphy, wdev, nan_func);
        rc = rdev->ops->add_nan_func(wiphy, wdev, nan_func);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->del_nan_func; the callback is invoked
 * unconditionally and has no return value.
 */
static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_del_nan_func(wiphy, wdev, cookie);
        rdev->ops->del_nan_func(wiphy, wdev, cookie);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for the optional rdev->ops->nan_change_conf callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_nan_change_conf(struct cfg80211_registered_device *rdev,
                     struct wireless_dev *wdev,
                     struct cfg80211_nan_conf *conf, u32 changes)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc = -EOPNOTSUPP;

        trace_rdev_nan_change_conf(wiphy, wdev, conf, changes);
        if (rdev->ops->nan_change_conf)
                rc = rdev->ops->nan_change_conf(wiphy, wdev, conf, changes);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_mac_acl: entry tracepoint, unconditional
 * callback invocation, return tracepoint.
 */
static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev,
                                   struct cfg80211_acl_data *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_mac_acl(wiphy, dev, params);
        rc = rdev->ops->set_mac_acl(wiphy, dev, params);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->update_ft_ies: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev,
                                     struct net_device *dev,
                                     struct cfg80211_update_ft_ies_params *ftie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_update_ft_ies(wiphy, dev, ftie);
        rc = rdev->ops->update_ft_ies(wiphy, dev, ftie);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->crit_proto_start: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev,
                                        struct wireless_dev *wdev,
                                        enum nl80211_crit_proto_id protocol,
                                        u16 duration)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_crit_proto_start(wiphy, wdev, protocol, duration);
        rc = rdev->ops->crit_proto_start(wiphy, wdev, protocol, duration);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->crit_proto_stop; the callback is invoked
 * unconditionally and has no return value.
 */
static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev,
                                       struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_crit_proto_stop(wiphy, wdev);
        rdev->ops->crit_proto_stop(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for rdev->ops->channel_switch: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct cfg80211_csa_settings *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_channel_switch(wiphy, dev, params);
        rc = rdev->ops->channel_switch(wiphy, dev, params);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->set_qos_map callback.  When the
 * callback is not set, -EOPNOTSUPP is returned and no tracepoint is emitted.
 */
static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev,
                                   struct cfg80211_qos_map *qos_map)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        if (!rdev->ops->set_qos_map)
                return -EOPNOTSUPP;

        trace_rdev_set_qos_map(wiphy, dev, qos_map);
        rc = rdev->ops->set_qos_map(wiphy, dev, qos_map);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_ap_chanwidth: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      unsigned int link_id,
                      struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_ap_chanwidth(wiphy, dev, link_id, chandef);
        rc = rdev->ops->set_ap_chanwidth(wiphy, dev, link_id, chandef);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->add_tx_ts callback.  Returns
 * -EOPNOTSUPP when the callback is not set; both tracepoints fire either
 * way.
 */
static inline int
rdev_add_tx_ts(struct cfg80211_registered_device *rdev,
               struct net_device *dev, u8 tsid, const u8 *peer,
               u8 user_prio, u16 admitted_time)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_add_tx_ts(wiphy, dev, tsid, peer, user_prio,
                             admitted_time);
        if (rdev->ops->add_tx_ts)
                rc = rdev->ops->add_tx_ts(wiphy, dev, tsid, peer, user_prio,
                                          admitted_time);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->del_tx_ts callback.  Returns
 * -EOPNOTSUPP when the callback is not set; both tracepoints fire either
 * way.
 */
static inline int
rdev_del_tx_ts(struct cfg80211_registered_device *rdev,
               struct net_device *dev, u8 tsid, const u8 *peer)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_del_tx_ts(wiphy, dev, tsid, peer);
        if (rdev->ops->del_tx_ts)
                rc = rdev->ops->del_tx_ts(wiphy, dev, tsid, peer);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->tdls_channel_switch: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int
rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, const u8 *addr,
                         u8 oper_class, struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_tdls_channel_switch(wiphy, dev, addr, oper_class, chandef);
        rc = rdev->ops->tdls_channel_switch(wiphy, dev, addr, oper_class,
                                            chandef);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->tdls_cancel_channel_switch; the callback is
 * invoked unconditionally and has no return value.
 */
static inline void
rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev,
                                struct net_device *dev, const u8 *addr)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_tdls_cancel_channel_switch(wiphy, dev, addr);
        rdev->ops->tdls_cancel_channel_switch(wiphy, dev, addr);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for the optional rdev->ops->start_radar_detection callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_start_radar_detection(struct cfg80211_registered_device *rdev,
                           struct net_device *dev,
                           struct cfg80211_chan_def *chandef,
                           u32 cac_time_ms, int link_id)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_start_radar_detection(wiphy, dev, chandef, cac_time_ms,
                                         link_id);
        if (rdev->ops->start_radar_detection)
                rc = rdev->ops->start_radar_detection(wiphy, dev, chandef,
                                                      cac_time_ms, link_id);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->end_cac callback.  Both
 * tracepoints fire even when the callback is not set.
 */
static inline void
rdev_end_cac(struct cfg80211_registered_device *rdev,
             struct net_device *dev, unsigned int link_id)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_end_cac(wiphy, dev, link_id);
        if (rdev->ops->end_cac)
                rdev->ops->end_cac(wiphy, dev, link_id);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for the optional rdev->ops->set_mcast_rate callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_set_mcast_rate(struct cfg80211_registered_device *rdev,
                    struct net_device *dev,
                    int mcast_rate[NUM_NL80211_BANDS])
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_mcast_rate(wiphy, dev, mcast_rate);
        if (rdev->ops->set_mcast_rate)
                rc = rdev->ops->set_mcast_rate(wiphy, dev, mcast_rate);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->set_coalesce callback.  Returns
 * -EOPNOTSUPP when the callback is not set; both tracepoints fire either
 * way.
 */
static inline int
rdev_set_coalesce(struct cfg80211_registered_device *rdev,
                  struct cfg80211_coalesce *coalesce)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_coalesce(wiphy, coalesce);
        if (rdev->ops->set_coalesce)
                rc = rdev->ops->set_coalesce(wiphy, coalesce);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->set_pmk callback.  Returns
 * -EOPNOTSUPP when the callback is not set; both tracepoints fire either
 * way.
 */
static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev,
                               struct net_device *dev,
                               struct cfg80211_pmk_conf *pmk_conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_pmk(wiphy, dev, pmk_conf);
        if (rdev->ops->set_pmk)
                rc = rdev->ops->set_pmk(wiphy, dev, pmk_conf);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->del_pmk callback.  Returns
 * -EOPNOTSUPP when the callback is not set; both tracepoints fire either
 * way.
 */
static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, const u8 *aa)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_del_pmk(wiphy, dev, aa);
        if (rdev->ops->del_pmk)
                rc = rdev->ops->del_pmk(wiphy, dev, aa);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->external_auth callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_external_auth(struct cfg80211_registered_device *rdev,
                   struct net_device *dev,
                   struct cfg80211_external_auth_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_external_auth(wiphy, dev, params);
        if (rdev->ops->external_auth)
                rc = rdev->ops->external_auth(wiphy, dev, params);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->get_ftm_responder_stats
 * callback.  Returns -EOPNOTSUPP when the callback is not set; both
 * tracepoints fire either way.
 */
static inline int
rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev,
                             struct net_device *dev,
                             struct cfg80211_ftm_responder_stats *ftm_stats)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_get_ftm_responder_stats(wiphy, dev, ftm_stats);
        if (rdev->ops->get_ftm_responder_stats)
                rc = rdev->ops->get_ftm_responder_stats(wiphy, dev,
                                                        ftm_stats);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->start_pmsr callback.  The entry
 * tracepoint records request->cookie, so @request must be non-NULL.  Returns
 * -EOPNOTSUPP when the callback is not set; both tracepoints fire either
 * way.
 */
static inline int
rdev_start_pmsr(struct cfg80211_registered_device *rdev,
                struct wireless_dev *wdev,
                struct cfg80211_pmsr_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_start_pmsr(wiphy, wdev, request->cookie);
        if (rdev->ops->start_pmsr)
                rc = rdev->ops->start_pmsr(wiphy, wdev, request);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->abort_pmsr callback.  The entry
 * tracepoint records request->cookie, so @request must be non-NULL.  Both
 * tracepoints fire even when the callback is not set.
 */
static inline void
rdev_abort_pmsr(struct cfg80211_registered_device *rdev,
                struct wireless_dev *wdev,
                struct cfg80211_pmsr_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_abort_pmsr(wiphy, wdev, request->cookie);
        if (rdev->ops->abort_pmsr)
                rdev->ops->abort_pmsr(wiphy, wdev, request);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for the optional rdev->ops->update_owe_info callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev,
                                       struct net_device *dev,
                                       struct cfg80211_update_owe_info *oweinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_update_owe_info(wiphy, dev, oweinfo);
        if (rdev->ops->update_owe_info)
                rc = rdev->ops->update_owe_info(wiphy, dev, oweinfo);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->probe_mesh_link.  @dest is recorded in the
 * entry tracepoint only; the callback receives just @buf and @len.
 */
static inline int
rdev_probe_mesh_link(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, const u8 *dest,
                     const void *buf, size_t len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_probe_mesh_link(wiphy, dev, dest, buf, len);
        rc = rdev->ops->probe_mesh_link(wiphy, dev, buf, len);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_tid_config: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct cfg80211_tid_config *tid_conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_tid_config(wiphy, dev, tid_conf);
        rc = rdev->ops->set_tid_config(wiphy, dev, tid_conf);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->reset_tid_config: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev,
                                        struct net_device *dev, const u8 *peer,
                                        u8 tids)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_reset_tid_config(wiphy, dev, peer, tids);
        rc = rdev->ops->reset_tid_config(wiphy, dev, peer, tids);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->set_sar_specs: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev,
                                     struct cfg80211_sar_specs *sar)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_sar_specs(wiphy, sar);
        rc = rdev->ops->set_sar_specs(wiphy, sar);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for rdev->ops->color_change: entry tracepoint,
 * unconditional callback invocation, return tracepoint.
 */
static inline int rdev_color_change(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev,
                                    struct cfg80211_color_change_settings *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_color_change(wiphy, dev, params);
        rc = rdev->ops->color_change(wiphy, dev, params);
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->set_fils_aad callback.  Returns
 * -EOPNOTSUPP when the callback is not set; both tracepoints fire either
 * way.
 */
static inline int
rdev_set_fils_aad(struct cfg80211_registered_device *rdev,
                  struct net_device *dev, struct cfg80211_fils_aad *fils_aad)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_set_fils_aad(wiphy, dev, fils_aad);
        if (rdev->ops->set_fils_aad)
                rc = rdev->ops->set_fils_aad(wiphy, dev, fils_aad);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->set_radar_background callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_set_radar_background(struct cfg80211_registered_device *rdev,
                          struct cfg80211_chan_def *chandef)
{
        int rc;

        trace_rdev_set_radar_background(&rdev->wiphy, chandef);
        if (rdev->ops->set_radar_background)
                rc = rdev->ops->set_radar_background(&rdev->wiphy, chandef);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(&rdev->wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->add_intf_link callback.  A
 * missing callback is treated as success (0 is returned); both tracepoints
 * fire either way.
 */
static inline int
rdev_add_intf_link(struct cfg80211_registered_device *rdev,
                   struct wireless_dev *wdev,
                   unsigned int link_id)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_add_intf_link(wiphy, wdev, link_id);
        if (rdev->ops->add_intf_link)
                rc = rdev->ops->add_intf_link(wiphy, wdev, link_id);
        else
                rc = 0;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->del_intf_link callback.  Both
 * tracepoints fire even when the callback is not set.
 */
static inline void
rdev_del_intf_link(struct cfg80211_registered_device *rdev,
                   struct wireless_dev *wdev,
                   unsigned int link_id)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_del_intf_link(wiphy, wdev, link_id);
        if (rdev->ops->del_intf_link)
                rdev->ops->del_intf_link(wiphy, wdev, link_id);
        trace_rdev_return_void(wiphy);
}

/*
 * Traced wrapper for the optional rdev->ops->add_link_station callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_add_link_station(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct link_station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_add_link_station(wiphy, dev, params);
        if (rdev->ops->add_link_station)
                rc = rdev->ops->add_link_station(wiphy, dev, params);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->mod_link_station callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_mod_link_station(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct link_station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_mod_link_station(wiphy, dev, params);
        if (rdev->ops->mod_link_station)
                rc = rdev->ops->mod_link_station(wiphy, dev, params);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->del_link_station callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_del_link_station(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct link_station_del_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_del_link_station(wiphy, dev, params);
        if (rdev->ops->del_link_station)
                rc = rdev->ops->del_link_station(wiphy, dev, params);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Traced wrapper for the optional rdev->ops->set_hw_timestamp callback.
 * Returns -EOPNOTSUPP when the callback is not set; both tracepoints fire
 * either way.
 */
static inline int
rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct cfg80211_set_hw_timestamp *hwts)
{
        int rc;

        trace_rdev_set_hw_timestamp(&rdev->wiphy, dev, hwts);
        if (rdev->ops->set_hw_timestamp)
                rc = rdev->ops->set_hw_timestamp(&rdev->wiphy, dev, hwts);
        else
                rc = -EOPNOTSUPP;
        trace_rdev_return_int(&rdev->wiphy, rc);

        return rc;
}

/*
 * Forward @params to the driver's set_ttlm operation.
 *
 * Both the call and its result are traced. Returns -EOPNOTSUPP when the
 * driver does not provide the operation, otherwise the driver's return
 * value.
 */
static inline int
rdev_set_ttlm(struct cfg80211_registered_device *rdev,
              struct net_device *dev,
              struct cfg80211_ttlm_params *params)
{
        int ret;

        trace_rdev_set_ttlm(&rdev->wiphy, dev, params);
        if (!rdev->ops->set_ttlm)
                ret = -EOPNOTSUPP;
        else
                ret = rdev->ops->set_ttlm(&rdev->wiphy, dev, params);
        trace_rdev_return_int(&rdev->wiphy, ret);

        return ret;
}

/*
 * Ask the driver for the radio mask of @dev.
 *
 * Returns the value of the driver's get_radio_mask operation, or 0 when
 * the driver does not implement it. This wrapper emits no trace events.
 */
static inline u32
rdev_get_radio_mask(struct cfg80211_registered_device *rdev,
                    struct net_device *dev)
{
        if (rdev->ops->get_radio_mask)
                return rdev->ops->get_radio_mask(&rdev->wiphy, dev);

        return 0;
}

/*
 * Forward @req to the driver's assoc_ml_reconf operation.
 *
 * Both the call and its result are traced. Returns -EOPNOTSUPP when the
 * driver does not provide the operation, otherwise the driver's return
 * value.
 */
static inline int
rdev_assoc_ml_reconf(struct cfg80211_registered_device *rdev,
                     struct net_device *dev,
                     struct cfg80211_ml_reconf_req *req)
{
        int ret;

        trace_rdev_assoc_ml_reconf(&rdev->wiphy, dev, req);
        if (!rdev->ops->assoc_ml_reconf)
                ret = -EOPNOTSUPP;
        else
                ret = rdev->ops->assoc_ml_reconf(&rdev->wiphy, dev, req);
        trace_rdev_return_int(&rdev->wiphy, ret);

        return ret;
}

/*
 * Forward @val to the driver's set_epcs operation.
 *
 * Both the call and its result are traced. Returns -EOPNOTSUPP when the
 * driver does not provide the operation, otherwise the driver's return
 * value.
 */
static inline int
rdev_set_epcs(struct cfg80211_registered_device *rdev,
              struct net_device *dev, bool val)
{
        int ret;

        trace_rdev_set_epcs(&rdev->wiphy, dev, val);
        if (!rdev->ops->set_epcs)
                ret = -EOPNOTSUPP;
        else
                ret = rdev->ops->set_epcs(&rdev->wiphy, dev, val);
        trace_rdev_return_int(&rdev->wiphy, ret);

        return ret;
}

#endif /* __CFG80211_RDEV_OPS */





















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_WORD_AT_A_TIME_H
#define _ASM_WORD_AT_A_TIME_H

#include <linux/bitops.h>
#include <linux/wordpart.h>

/*
 * Constant words consumed by has_zero(). WORD_AT_A_TIME_CONSTANTS
 * initialises them with REPEAT_BYTE(0x01) and REPEAT_BYTE(0x80).
 */
struct word_at_a_time {
        const unsigned long one_bits;
        const unsigned long high_bits;
};

/* Initializer for struct word_at_a_time: { one_bits, high_bits }. */
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }

/*
 * Return nonzero if @a has a zero byte.
 *
 * The mask ((a - one_bits) & ~a) & high_bits is both returned and stored
 * through @bits so later helpers can reuse it.
 */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
        const unsigned long lowered = a - c->one_bits;
        const unsigned long zero_mask = lowered & ~a & c->high_bits;

        *bits = zero_mask;
        return zero_mask;
}

/*
 * Prepare the has_zero() result for mask creation.
 *
 * Here the value needs no adjustment: @bits is returned unchanged and
 * neither @a nor @c is used.
 */
static inline unsigned long prep_zero_mask(unsigned long a,
                                           unsigned long bits,
                                           const struct word_at_a_time *c)
{
        return bits;
}

#ifdef CONFIG_64BIT

/*
 * Keep the initial has_zero() value for both bitmask and size calc:
 * on 64-bit this is the identity, and zero_bytemask()/find_zero() below
 * do the actual conversion.
 */
#define create_zero_mask(bits) (bits)

/*
 * Convert a has_zero()-style mask into a byte mask.
 *
 * (bits - 1) & ~bits keeps exactly the bits below the lowest set bit of
 * @bits; shifting that right by 7 gives the returned mask.
 */
static inline unsigned long zero_bytemask(unsigned long bits)
{
        const unsigned long below_lowest = (bits - 1) & ~bits;

        return below_lowest >> 7;
}

/* Byte index of the zero byte: the bit index from __ffs() divided by 8. */
#define find_zero(bits) (__ffs(bits) >> 3)

#else

/*
 * Create the final mask for both bytemask and size.
 *
 * (bits - 1) & ~bits keeps exactly the bits below the lowest set bit of
 * @bits; the result is that value shifted right by 7.
 */
static inline unsigned long create_zero_mask(unsigned long bits)
{
        const unsigned long below_lowest = (bits - 1) & ~bits;

        return below_lowest >> 7;
}

/*
 * The mask we created is directly usable as a bytemask, so on 32-bit
 * this is the identity on the create_zero_mask() result.
 */
#define zero_bytemask(mask) (mask)

/*
 * Carl Chatfield / Jan Achrenius G+ version for 32-bit.
 *
 * Turns a create_zero_mask() byte mask into a count:
 * 000000 -> 0, 0000ff -> 1, 00ffff -> 2, ffffff -> 3.
 */
static inline unsigned long find_zero(unsigned long mask)
{
        /* Add-and-shift maps (000000 0000ff 00ffff ffffff) to (1 1 2 3). */
        const long count = (mask + 0x0ff0001) >> 23;

        /* ANDing with the mask turns the 1 of the all-zero case into 0. */
        return mask & count;
}

#endif

/*
 * Load an unaligned word from kernel space.
 *
 * In the (very unlikely) case of the word being a page-crosser
 * and the next page not being mapped, take the exception and
 * return zeroes in the non-existing part.
 *
 * @addr: address of the word to load; it is read as an unsigned long.
 *
 * Returns the loaded word.
 */
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
        unsigned long ret;

        /*
         * Label 1 is the load itself. The exception table entry ties a
         * fault at 1 to label 2 with type EX_TYPE_ZEROPAD, so execution
         * continues right after the load.
         * NOTE(review): the zero-filling described above is presumably done
         * by the EX_TYPE_ZEROPAD fixup handler, which is not in this file.
         */
        asm volatile(
                "1:        mov %[mem], %[ret]\n"
                "2:\n"
                _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD)
                : [ret] "=r" (ret)
                : [mem] "m" (*(unsigned long *)addr));

        return ret;
}

#endif /* _ASM_WORD_AT_A_TIME_H */
















































































  315 




  319 
  315 






  317 













  316 















































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
/* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * SipHash: a fast short-input PRF
 * https://131002.net/siphash/
 *
 * This implementation is specifically for SipHash2-4 for a secure PRF
 * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
 * hashtables.
 */

#include <linux/siphash.h>
#include <linux/unaligned.h>

#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
#include <linux/dcache.h>
#include <asm/word-at-a-time.h>
#endif

/* One SIPHASH_PERMUTATION step on the local state words v0..v3. */
#define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3)

/*
 * Declare and initialise the hash state inside a function body.
 *
 * Declares v0..v3 from SIPHASH_CONST_0..3 and XORs the two key words into
 * them (key[0] into v0 and v2, key[1] into v1 and v3). Also declares the
 * final word b with the low 8 bits of @len in its top byte (len << 56).
 * Expects a pointer named key to be in scope.
 */
#define PREAMBLE(len) \
        u64 v0 = SIPHASH_CONST_0; \
        u64 v1 = SIPHASH_CONST_1; \
        u64 v2 = SIPHASH_CONST_2; \
        u64 v3 = SIPHASH_CONST_3; \
        u64 b = ((u64)(len)) << 56; \
        v3 ^= key->key[1]; \
        v2 ^= key->key[0]; \
        v1 ^= key->key[1]; \
        v0 ^= key->key[0];

/*
 * Finish the hash and return from the enclosing function.
 *
 * Absorbs the final word b (XOR into v3, two rounds, XOR into v0), XORs
 * 0xff into v2, runs four more rounds and returns the XOR of all four
 * state words. Expects the variables declared by PREAMBLE to be in scope.
 */
#define POSTAMBLE \
        v3 ^= b; \
        SIPROUND; \
        SIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        SIPROUND; \
        SIPROUND; \
        SIPROUND; \
        SIPROUND; \
        return (v0 ^ v1) ^ (v2 ^ v3);

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * __siphash_aligned - siphash of @len bytes at @data using @key
 *
 * Whole 64-bit words are read with le64_to_cpup() and absorbed with two
 * rounds each. The remaining len % 8 bytes are folded into the final word
 * b next to the length byte set up by PREAMBLE, and POSTAMBLE finishes.
 * NOTE(review): presumably @data must be suitably aligned for the
 * le64_to_cpup() loads - confirm against the callers.
 */
u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
{
        const u8 tail = len & (sizeof(u64) - 1);
        const u8 *end = data + len - tail;
        u64 word;
        PREAMBLE(len)
        while (data != end) {
                word = le64_to_cpup(data);
                v3 ^= word;
                SIPROUND;
                SIPROUND;
                v0 ^= word;
                data += sizeof(u64);
        }
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        /* Load one more word and keep only its first @tail bytes. */
        if (tail != 0)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(tail)));
#else
        /* Assemble the trailing bytes in little-endian order. */
        switch (tail) {
        case 7: b |= ((u64)end[6]) << 48; fallthrough;
        case 6: b |= ((u64)end[5]) << 40; fallthrough;
        case 5: b |= ((u64)end[4]) << 32; fallthrough;
        case 4: b |= le32_to_cpup(data); break;
        case 3: b |= ((u64)end[2]) << 16; fallthrough;
        case 2: b |= le16_to_cpup(data); break;
        case 1: b |= end[0];
        }
#endif
        POSTAMBLE
}
EXPORT_SYMBOL(__siphash_aligned);
#endif

/*
 * __siphash_unaligned - siphash of @len bytes at @data using @key
 *
 * Same structure as the aligned variant, but every multi-byte load goes
 * through the get_unaligned_le*() helpers. Whole 64-bit words are absorbed
 * with two rounds each; the remaining len % 8 bytes are folded into the
 * final word b next to the length byte, and POSTAMBLE finishes.
 */
u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
{
        const u8 tail = len & (sizeof(u64) - 1);
        const u8 *end = data + len - tail;
        u64 word;
        PREAMBLE(len)
        while (data != end) {
                word = get_unaligned_le64(data);
                v3 ^= word;
                SIPROUND;
                SIPROUND;
                v0 ^= word;
                data += sizeof(u64);
        }
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        /* Load one more word and keep only its first @tail bytes. */
        if (tail != 0)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(tail)));
#else
        /* Assemble the trailing bytes in little-endian order. */
        switch (tail) {
        case 7: b |= ((u64)end[6]) << 48; fallthrough;
        case 6: b |= ((u64)end[5]) << 40; fallthrough;
        case 5: b |= ((u64)end[4]) << 32; fallthrough;
        case 4: b |= get_unaligned_le32(end); break;
        case 3: b |= ((u64)end[2]) << 16; fallthrough;
        case 2: b |= get_unaligned_le16(end); break;
        case 1: b |= end[0];
        }
#endif
        POSTAMBLE
}
EXPORT_SYMBOL(__siphash_unaligned);

/**
 * siphash_1u64 - compute 64-bit siphash PRF value of a u64
 * @first: first u64
 * @key: the siphash key
 *
 * Return: the 64-bit value produced by POSTAMBLE.
 */
u64 siphash_1u64(const u64 first, const siphash_key_t *key)
{
        /* Input length is 8 bytes: one u64. */
        PREAMBLE(8)
        /* Absorb @first: XOR into v3, two rounds, XOR into v0. */
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        /* No trailing bytes: b carries only the length byte. */
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_1u64);

/**
 * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
 * @first: first u64
 * @second: second u64
 * @key: the siphash key
 *
 * Return: the 64-bit value produced by POSTAMBLE.
 */
u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
{
        /* Input length is 16 bytes: two u64 words. */
        PREAMBLE(16)
        /* Absorb each word in order: XOR into v3, two rounds, XOR into v0. */
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        /* No trailing bytes: b carries only the length byte. */
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_2u64);

/**
 * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
 * @first: first u64
 * @second: second u64
 * @third: third u64
 * @key: the siphash key
 *
 * Return: the 64-bit value produced by POSTAMBLE.
 */
u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
                 const siphash_key_t *key)
{
        /* Input length is 24 bytes: three u64 words. */
        PREAMBLE(24)
        /* Absorb each word in order: XOR into v3, two rounds, XOR into v0. */
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        v3 ^= third;
        SIPROUND;
        SIPROUND;
        v0 ^= third;
        /* No trailing bytes: b carries only the length byte. */
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_3u64);

/**
 * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
 * @first: first u64
 * @second: second u64
 * @third: third u64
 * @forth: fourth u64
 * @key: the siphash key
 *
 * Return: the 64-bit value produced by POSTAMBLE.
 */
u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
                 const u64 forth, const siphash_key_t *key)
{
        /* Input length is 32 bytes: four u64 words. */
        PREAMBLE(32)
        /* Absorb each word in order: XOR into v3, two rounds, XOR into v0. */
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        v3 ^= third;
        SIPROUND;
        SIPROUND;
        v0 ^= third;
        v3 ^= forth;
        SIPROUND;
        SIPROUND;
        v0 ^= forth;
        /* No trailing bytes: b carries only the length byte. */
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_4u64);

/**
 * siphash_1u32 - compute 64-bit siphash PRF value of a u32
 * @first: first u32
 * @key: the siphash key
 *
 * Return: the 64-bit value produced by POSTAMBLE.
 */
u64 siphash_1u32(const u32 first, const siphash_key_t *key)
{
        /* Input length is 4 bytes, which does not fill a 64-bit word. */
        PREAMBLE(4)
        /* @first goes into the low bits of the final word, below the length byte. */
        b |= first;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_1u32);

/**
 * siphash_3u32 - compute 64-bit siphash PRF value of 3 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @key: the siphash key
 *
 * Return: the 64-bit value produced by POSTAMBLE.
 */
u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
                 const siphash_key_t *key)
{
        /* Pack the first two values into one word: @first low, @second high. */
        u64 combined = (u64)second << 32 | first;
        /* Input length is 12 bytes. */
        PREAMBLE(12)
        v3 ^= combined;
        SIPROUND;
        SIPROUND;
        v0 ^= combined;
        /* @third is the 4-byte tail: it goes into the final word b. */
        b |= third;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_3u32);

#if BITS_PER_LONG == 64
/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
 * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
 */

/* On 64-bit the hsiphash helpers reuse the siphash round and preamble. */
#define HSIPROUND SIPROUND
#define HPREAMBLE(len) PREAMBLE(len)
/*
 * Finish the hash and return from the enclosing function: absorb the
 * final word b with one round, XOR 0xff into v2, run three more rounds
 * and return the XOR of all four state words.
 */
#define HPOSTAMBLE \
        v3 ^= b; \
        HSIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        HSIPROUND; \
        HSIPROUND; \
        HSIPROUND; \
        return (v0 ^ v1) ^ (v2 ^ v3);

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * __hsiphash_aligned - hsiphash of @len bytes at @data using @key (64-bit)
 *
 * Whole 64-bit words are read with le64_to_cpup() and absorbed with one
 * round each. The remaining len % 8 bytes are folded into the final word
 * b next to the length byte, and HPOSTAMBLE finishes; its 64-bit result
 * is narrowed by the u32 return type.
 * NOTE(review): presumably @data must be suitably aligned for the
 * le64_to_cpup() loads - confirm against the callers.
 */
u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
{
        const u8 tail = len & (sizeof(u64) - 1);
        const u8 *end = data + len - tail;
        u64 word;
        HPREAMBLE(len)
        while (data != end) {
                word = le64_to_cpup(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u64);
        }
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        /* Load one more word and keep only its first @tail bytes. */
        if (tail != 0)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(tail)));
#else
        /* Assemble the trailing bytes in little-endian order. */
        switch (tail) {
        case 7: b |= ((u64)end[6]) << 48; fallthrough;
        case 6: b |= ((u64)end[5]) << 40; fallthrough;
        case 5: b |= ((u64)end[4]) << 32; fallthrough;
        case 4: b |= le32_to_cpup(data); break;
        case 3: b |= ((u64)end[2]) << 16; fallthrough;
        case 2: b |= le16_to_cpup(data); break;
        case 1: b |= end[0];
        }
#endif
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_aligned);
#endif

/*
 * __hsiphash_unaligned - hsiphash of @len bytes at @data using @key (64-bit)
 *
 * Same structure as the aligned variant, but every multi-byte load goes
 * through the get_unaligned_le*() helpers. Whole 64-bit words are absorbed
 * with one round each; the remaining len % 8 bytes are folded into the
 * final word b, and HPOSTAMBLE finishes. Its 64-bit result is narrowed by
 * the u32 return type.
 */
u32 __hsiphash_unaligned(const void *data, size_t len,
                         const hsiphash_key_t *key)
{
        const u8 tail = len & (sizeof(u64) - 1);
        const u8 *end = data + len - tail;
        u64 word;
        HPREAMBLE(len)
        while (data != end) {
                word = get_unaligned_le64(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u64);
        }
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        /* Load one more word and keep only its first @tail bytes. */
        if (tail != 0)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(tail)));
#else
        /* Assemble the trailing bytes in little-endian order. */
        switch (tail) {
        case 7: b |= ((u64)end[6]) << 48; fallthrough;
        case 6: b |= ((u64)end[5]) << 40; fallthrough;
        case 5: b |= ((u64)end[4]) << 32; fallthrough;
        case 4: b |= get_unaligned_le32(end); break;
        case 3: b |= ((u64)end[2]) << 16; fallthrough;
        case 2: b |= get_unaligned_le16(end); break;
        case 1: b |= end[0];
        }
#endif
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_unaligned);

/**
 * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
 * @first: first u32
 * @key: the hsiphash key
 *
 * Return: the HPOSTAMBLE result, narrowed to u32 by the return type.
 */
u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
{
        /* Input length is 4 bytes, which does not fill a 64-bit word. */
        HPREAMBLE(4)
        /* @first goes into the low bits of the final word, below the length byte. */
        b |= first;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_1u32);

/**
 * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
 * @first: first u32
 * @second: second u32
 * @key: the hsiphash key
 *
 * Return: the HPOSTAMBLE result, narrowed to u32 by the return type.
 */
u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
{
        /* Pack both values into one 64-bit word: @first low, @second high. */
        u64 combined = (u64)second << 32 | first;
        /* Input length is 8 bytes. */
        HPREAMBLE(8)
        /* Absorb the word: XOR into v3, one round, XOR into v0. */
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_2u32);

/**
 * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @key: the hsiphash key
 *
 * Return: the HPOSTAMBLE result, narrowed to u32 by the return type.
 */
u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
                  const hsiphash_key_t *key)
{
        /* Pack the first two values into one word: @first low, @second high. */
        u64 combined = (u64)second << 32 | first;
        /* Input length is 12 bytes. */
        HPREAMBLE(12)
        /* Absorb the word: XOR into v3, one round, XOR into v0. */
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        /* @third is the 4-byte tail: it goes into the final word b. */
        b |= third;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_3u32);

/**
 * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @forth: fourth u32
 * @key: the hsiphash key
 *
 * Return: the HPOSTAMBLE result, narrowed to u32 by the return type.
 */
u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
                  const u32 forth, const hsiphash_key_t *key)
{
        /* First 64-bit word: @first in the low half, @second in the high half. */
        u64 combined = (u64)second << 32 | first;
        /* Input length is 16 bytes. */
        HPREAMBLE(16)
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        /* Second 64-bit word: @third in the low half, @forth in the high half. */
        combined = (u64)forth << 32 | third;
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_4u32);
#else
/* One HSIPHASH_PERMUTATION step on the local 32-bit state words v0..v3. */
#define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3)

/*
 * Declare and initialise the 32-bit hash state inside a function body.
 *
 * Declares v0..v3 from HSIPHASH_CONST_0..3 and XORs the two key words into
 * them (key[0] into v0 and v2, key[1] into v1 and v3). Also declares the
 * final word b with the low 8 bits of @len in its top byte (len << 24).
 * Expects a pointer named key to be in scope.
 */
#define HPREAMBLE(len) \
        u32 v0 = HSIPHASH_CONST_0; \
        u32 v1 = HSIPHASH_CONST_1; \
        u32 v2 = HSIPHASH_CONST_2; \
        u32 v3 = HSIPHASH_CONST_3; \
        u32 b = ((u32)(len)) << 24; \
        v3 ^= key->key[1]; \
        v2 ^= key->key[0]; \
        v1 ^= key->key[1]; \
        v0 ^= key->key[0];

/*
 * Finish the hash and return from the enclosing function: absorb the
 * final word b with one round, XOR 0xff into v2, run three more rounds
 * and return v1 ^ v3.
 */
#define HPOSTAMBLE \
        v3 ^= b; \
        HSIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        HSIPROUND; \
        HSIPROUND; \
        HSIPROUND; \
        return v1 ^ v3;

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * __hsiphash_aligned - hsiphash of @len bytes at @data using @key (32-bit)
 *
 * Whole 32-bit words are read with le32_to_cpup() and absorbed with one
 * round each. The remaining len % 4 bytes are folded into the final word
 * b next to the length byte, and HPOSTAMBLE finishes.
 * NOTE(review): presumably @data must be suitably aligned for the
 * le32_to_cpup() loads - confirm against the callers.
 */
u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
{
        const u8 tail = len & (sizeof(u32) - 1);
        const u8 *end = data + len - tail;
        u32 word;
        HPREAMBLE(len)
        while (data != end) {
                word = le32_to_cpup(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u32);
        }
        /* Assemble the trailing bytes in little-endian order. */
        switch (tail) {
        case 3: b |= ((u32)end[2]) << 16; fallthrough;
        case 2: b |= le16_to_cpup(data); break;
        case 1: b |= end[0];
        }
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_aligned);
#endif

/*
 * __hsiphash_unaligned - hsiphash of @len bytes at @data using @key (32-bit)
 *
 * Same structure as the aligned variant, but every multi-byte load goes
 * through the get_unaligned_le*() helpers. Whole 32-bit words are absorbed
 * with one round each; the remaining len % 4 bytes are folded into the
 * final word b, and HPOSTAMBLE finishes.
 */
u32 __hsiphash_unaligned(const void *data, size_t len,
                         const hsiphash_key_t *key)
{
        const u8 tail = len & (sizeof(u32) - 1);
        const u8 *end = data + len - tail;
        u32 word;
        HPREAMBLE(len)
        while (data != end) {
                word = get_unaligned_le32(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u32);
        }
        /* Assemble the trailing bytes in little-endian order. */
        switch (tail) {
        case 3: b |= ((u32)end[2]) << 16; fallthrough;
        case 2: b |= get_unaligned_le16(end); break;
        case 1: b |= end[0];
        }
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_unaligned);

/**
 * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
 * @first: first u32
 * @key: the hsiphash key
 *
 * Return: the 32-bit value produced by HPOSTAMBLE.
 */
u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
{
        /* Input length is 4 bytes: one u32 word. */
        HPREAMBLE(4)
        /* Absorb @first: XOR into v3, one round, XOR into v0. */
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        /* No trailing bytes: b carries only the length byte. */
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_1u32);

/**
 * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
 * @first: first u32
 * @second: second u32
 * @key: the hsiphash key
 *
 * Return: the 32-bit value produced by HPOSTAMBLE.
 */
u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
{
        /* Input length is 8 bytes: two u32 words. */
        HPREAMBLE(8)
        /* Absorb each word in order: XOR into v3, one round, XOR into v0. */
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        /* No trailing bytes: b carries only the length byte. */
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_2u32);

/**
 * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @key: the hsiphash key
 *
 * Return: the 32-bit value produced by HPOSTAMBLE.
 */
u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
                  const hsiphash_key_t *key)
{
        /* Input length is 12 bytes: three u32 words. */
        HPREAMBLE(12)
        /* Absorb each word in order: XOR into v3, one round, XOR into v0. */
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        v3 ^= third;
        HSIPROUND;
        v0 ^= third;
        /* No trailing bytes: b carries only the length byte. */
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_3u32);

/**
 * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @forth: fourth u32
 * @key: the hsiphash key
 *
 * Return: the 32-bit value produced by HPOSTAMBLE.
 */
u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
                  const u32 forth, const hsiphash_key_t *key)
{
        /* Input length is 16 bytes: four u32 words. */
        HPREAMBLE(16)
        /* Absorb each word in order: XOR into v3, one round, XOR into v0. */
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        v3 ^= third;
        HSIPROUND;
        v0 ^= third;
        v3 ^= forth;
        HSIPROUND;
        v0 ^= forth;
        /* No trailing bytes: b carries only the length byte. */
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_4u32);
#endif































































































































































  318 

















  315 






























  317 
  264 


  318 



















  315 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Netlink message type permission tables, for user generated messages.
 *
 * Author: James Morris <jmorris@redhat.com>
 *
 * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if.h>
#include <linux/inet_diag.h>
#include <linux/xfrm.h>
#include <linux/audit.h>
#include <linux/sock_diag.h>

#include "flask.h"
#include "av_permissions.h"
#include "security.h"

/* One table entry: a netlink message type and the permission it maps to. */
struct nlmsg_perm {
        u16 nlmsg_type;        /* message type compared against the lookup key */
        u32 perm;        /* permission value handed back on a match */
};

/*
 * Message types for SECCLASS_NETLINK_ROUTE_SOCKET and the
 * NETLINK_ROUTE_SOCKET permission used for each. Searched by nlmsg_perm()
 * from selinux_nlmsg_lookup(), which also carries a BUILD_BUG_ON() on
 * RTM_MAX: add new RTM_* types here before updating that check.
 */
static const struct nlmsg_perm nlmsg_route_perms[] = {
        { RTM_NEWLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETLINK, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_SETLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWADDR, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELADDR, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETADDR, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWROUTE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELROUTE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETROUTE, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWRULE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELRULE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETRULE, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWQDISC, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELQDISC, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETQDISC, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWACTION, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELACTION, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETACTION, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWPREFIX, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETMULTICAST, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_GETANYCAST, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_GETNEIGHTBL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_SETNEIGHTBL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_GETDCB, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_SETDCB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETMDB, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWNSID, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_GETNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_GETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_SETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWCACHEREPORT, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_NEWVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETVLAN, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ },
        { RTM_NEWTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_DELTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
        { RTM_GETTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
};

/*
 * Message types for SECCLASS_NETLINK_TCPDIAG_SOCKET and the
 * NETLINK_TCPDIAG_SOCKET permission used for each. Searched by
 * nlmsg_perm() from selinux_nlmsg_lookup().
 */
static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = {
        { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
        { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
        { SOCK_DESTROY, NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE },
};

/*
 * Message types for SECCLASS_NETLINK_XFRM_SOCKET and the
 * NETLINK_XFRM_SOCKET permission used for each. selinux_nlmsg_lookup()
 * carries a BUILD_BUG_ON() on XFRM_MSG_MAX: add new XFRM_MSG_* types here
 * before updating that check.
 */
static const struct nlmsg_perm nlmsg_xfrm_perms[] = {
        { XFRM_MSG_NEWSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_DELSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETSA, NETLINK_XFRM_SOCKET__NLMSG_READ },
        { XFRM_MSG_NEWPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_DELPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETPOLICY, NETLINK_XFRM_SOCKET__NLMSG_READ },
        { XFRM_MSG_ALLOCSPI, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_ACQUIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_EXPIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_UPDPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_UPDSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_POLEXPIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_FLUSHSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_FLUSHPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_NEWAE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETAE, NETLINK_XFRM_SOCKET__NLMSG_READ },
        { XFRM_MSG_REPORT, NETLINK_XFRM_SOCKET__NLMSG_READ },
        { XFRM_MSG_MIGRATE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_NEWSADINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
        { XFRM_MSG_GETSADINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
        { XFRM_MSG_NEWSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
        { XFRM_MSG_MAPPING, NETLINK_XFRM_SOCKET__NLMSG_READ },
        { XFRM_MSG_SETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
        { XFRM_MSG_GETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_READ },
};

/*
 * Audit message types and the NETLINK_AUDIT_SOCKET permission used for
 * each. Besides read and write, this table also uses the readpriv, relay
 * and tty_audit permissions.
 * NOTE(review): presumably consumed by selinux_nlmsg_lookup() through
 * nlmsg_perm() like the tables above - confirm in the rest of the file.
 */
static const struct nlmsg_perm nlmsg_audit_perms[] = {
        { AUDIT_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ },
        { AUDIT_SET, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
        { AUDIT_LIST, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
        { AUDIT_ADD, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
        { AUDIT_DEL, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
        { AUDIT_LIST_RULES, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
        { AUDIT_ADD_RULE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
        { AUDIT_DEL_RULE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
        { AUDIT_USER, NETLINK_AUDIT_SOCKET__NLMSG_RELAY },
        { AUDIT_SIGNAL_INFO, NETLINK_AUDIT_SOCKET__NLMSG_READ },
        { AUDIT_TRIM, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
        { AUDIT_MAKE_EQUIV, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
        { AUDIT_TTY_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ },
        { AUDIT_TTY_SET, NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT },
        { AUDIT_GET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_READ },
        { AUDIT_SET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
};

/*
 * Look up @nlmsg_type in @tab and store the matching permission in @perm.
 *
 * @tabsize is the size of @tab in bytes, not its number of entries. The
 * first matching entry wins. Returns 0 on a match, or -EINVAL (leaving
 * @perm untouched) when the type is not in the table.
 */
static int nlmsg_perm(u16 nlmsg_type, u32 *perm, const struct nlmsg_perm *tab,
                      size_t tabsize)
{
        const struct nlmsg_perm *entry = tab;
        const struct nlmsg_perm *const end = tab + tabsize / sizeof(*tab);

        for (; entry < end; entry++) {
                if (entry->nlmsg_type != nlmsg_type)
                        continue;

                *perm = entry->perm;
                return 0;
        }

        return -EINVAL;
}

/*
 * selinux_nlmsg_lookup - map a netlink message type to a socket permission
 * @sclass:     security class of the netlink socket
 * @nlmsg_type: nlmsg_type field of the message being sent
 * @perm:       on success, receives the permission to check
 *
 * For the route, tcpdiag, xfrm and audit netlink classes: when
 * selinux_policycap_netlink_xperm() is true the class-wide NLMSG permission
 * is returned; otherwise the message type is resolved through the per-class
 * table.
 *
 * Returns 0 on success, -EINVAL when the class is handled but the message
 * type is not in its table, and -ENOENT for any other class.
 */
int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
{
        /* While it is possible to add a similar permission to other netlink
         * classes, note that the extended permission value is matched against
         * the nlmsg_type field. Notably, SECCLASS_NETLINK_GENERIC_SOCKET uses
         * dynamic values for this field, which means that it cannot be added
         * as-is.
         */

        switch (sclass) {
        case SECCLASS_NETLINK_ROUTE_SOCKET:
                /* RTM_MAX always points to RTM_SETxxxx, ie RTM_NEWxxx + 3.
                 * If the BUILD_BUG_ON() below fails you must update the
                 * structures at the top of this file with the new mappings
                 * before updating the BUILD_BUG_ON() macro!
                 */
                BUILD_BUG_ON(RTM_MAX != (RTM_NEWTUNNEL + 3));

                if (selinux_policycap_netlink_xperm()) {
                        *perm = NETLINK_ROUTE_SOCKET__NLMSG;
                        return 0;
                }
                return nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
                                  sizeof(nlmsg_route_perms));
        case SECCLASS_NETLINK_TCPDIAG_SOCKET:
                if (selinux_policycap_netlink_xperm()) {
                        *perm = NETLINK_TCPDIAG_SOCKET__NLMSG;
                        return 0;
                }
                return nlmsg_perm(nlmsg_type, perm, nlmsg_tcpdiag_perms,
                                  sizeof(nlmsg_tcpdiag_perms));
        case SECCLASS_NETLINK_XFRM_SOCKET:
                /* If the BUILD_BUG_ON() below fails you must update the
                 * structures at the top of this file with the new mappings
                 * before updating the BUILD_BUG_ON() macro!
                 */
                BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_GETDEFAULT);

                if (selinux_policycap_netlink_xperm()) {
                        *perm = NETLINK_XFRM_SOCKET__NLMSG;
                        return 0;
                }
                return nlmsg_perm(nlmsg_type, perm, nlmsg_xfrm_perms,
                                  sizeof(nlmsg_xfrm_perms));
        case SECCLASS_NETLINK_AUDIT_SOCKET:
                if (selinux_policycap_netlink_xperm()) {
                        *perm = NETLINK_AUDIT_SOCKET__NLMSG;
                        return 0;
                }
                /* Both user-message ranges map to the relay permission
                 * without consulting the table.
                 */
                if ((nlmsg_type >= AUDIT_FIRST_USER_MSG &&
                     nlmsg_type <= AUDIT_LAST_USER_MSG) ||
                    (nlmsg_type >= AUDIT_FIRST_USER_MSG2 &&
                     nlmsg_type <= AUDIT_LAST_USER_MSG2)) {
                        *perm = NETLINK_AUDIT_SOCKET__NLMSG_RELAY;
                        return 0;
                }
                return nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms,
                                  sizeof(nlmsg_audit_perms));
        default:
                break;
        }

        /* No messaging from userspace, or class unknown/unhandled */
        return -ENOENT;
}


























































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM writeback

#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WRITEBACK_H

#include <linux/tracepoint.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

/*
 * Render an inode state bitmask as a "|"-separated list of I_* flag names
 * via __print_flags().  The writeback_dirty_inode_template class below uses
 * it to print both its state and its flags field.
 */
#define show_inode_state(state)                                        \
        __print_flags(state, "|",                                \
                {I_DIRTY_SYNC,                "I_DIRTY_SYNC"},        \
                {I_DIRTY_DATASYNC,        "I_DIRTY_DATASYNC"},        \
                {I_DIRTY_PAGES,                "I_DIRTY_PAGES"},        \
                {I_NEW,                        "I_NEW"},                \
                {I_WILL_FREE,                "I_WILL_FREE"},                \
                {I_FREEING,                "I_FREEING"},                \
                {I_CLEAR,                "I_CLEAR"},                \
                {I_SYNC,                "I_SYNC"},                \
                {I_DIRTY_TIME,                "I_DIRTY_TIME"},        \
                {I_REFERENCED,                "I_REFERENCED"},        \
                {I_LINKABLE,                "I_LINKABLE"},                \
                {I_WB_SWITCH,                "I_WB_SWITCH"},                \
                {I_OVL_INUSE,                "I_OVL_INUSE"},                \
                {I_CREATING,                "I_CREATING"},                \
                {I_DONTCACHE,                "I_DONTCACHE"},                \
                {I_SYNC_QUEUED,                "I_SYNC_QUEUED"},        \
                {I_PINNING_NETFS_WB,        "I_PINNING_NETFS_WB"},        \
                {I_LRU_ISOLATING,        "I_LRU_ISOLATING"}        \
        )

/*
 * Enums need to be exported to user space.  In this first pass EM() and
 * EMe() turn every WB_WORK_REASON entry into a TRACE_DEFINE_ENUM(); the
 * bare WB_WORK_REASON invocation below emits those definitions.
 */
#undef EM
#undef EMe
#define EM(a,b)         TRACE_DEFINE_ENUM(a);
#define EMe(a,b)        TRACE_DEFINE_ENUM(a);

/* Writeback reasons and the strings printed for them; EMe marks the last entry. */
#define WB_WORK_REASON                                                        \
        EM( WB_REASON_BACKGROUND,                "background")                \
        EM( WB_REASON_VMSCAN,                        "vmscan")                \
        EM( WB_REASON_SYNC,                        "sync")                        \
        EM( WB_REASON_PERIODIC,                        "periodic")                \
        EM( WB_REASON_LAPTOP_TIMER,                "laptop_timer")                \
        EM( WB_REASON_FS_FREE_SPACE,                "fs_free_space")        \
        EM( WB_REASON_FORKER_THREAD,                "forker_thread")        \
        EMe(WB_REASON_FOREIGN_FLUSH,                "foreign_flush")

WB_WORK_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.  From here on WB_WORK_REASON expands
 * to a { value, "name" } list, which is what the __print_symbolic() calls
 * further down are given.
 */
#undef EM
#undef EMe
#define EM(a,b)                { a, b },
#define EMe(a,b)        { a, b }

struct wb_writeback_work;

/*
 * Event class for per-folio writeback events.
 *
 * Records the bdi device name (bdi_dev_name() is handed NULL when @mapping
 * is NULL), the host inode number (0 when @mapping or its host is NULL) and
 * the folio index.
 */
DECLARE_EVENT_CLASS(writeback_folio_template,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(pgoff_t, index)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
                                         NULL), 32);
                __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0;
                __entry->index = folio->index;
        ),

        TP_printk("bdi %s: ino=%lu index=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->index
        )
);

/* Events built on writeback_folio_template; both take (folio, mapping). */
DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping)
);

DEFINE_EVENT(writeback_folio_template, folio_wait_writeback,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping)
);

/*
 * Event class for inode-dirtying events.
 *
 * Captures the bdi device name, the inode number, the inode's i_state at
 * the time of the event and the caller-supplied @flags.  Both state and
 * flags are printed symbolically through show_inode_state().
 */
DECLARE_EVENT_CLASS(writeback_dirty_inode_template,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, flags)
        ),

        TP_fast_assign(
                struct backing_dev_info *bdi = inode_to_bdi(inode);

                /* may be called for files on pseudo FSes w/ unregistered bdi */
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->flags                = flags;
        ),

        TP_printk("bdi %s: ino=%lu state=%s flags=%s",
                __entry->name,
                (unsigned long)__entry->ino,
                show_inode_state(__entry->state),
                show_inode_state(__entry->flags)
        )
);

/* Events built on writeback_dirty_inode_template; all take (inode, flags). */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

#ifdef CREATE_TRACE_POINTS
#ifdef CONFIG_CGROUP_WRITEBACK

/* Inode number of the cgroup reached through @wb's memcg css. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return cgroup_ino(wb->memcg_css->cgroup);
}

/*
 * Same as above for a writeback_control: resolves through wbc->wb when it
 * is set, and reports 1 when it is not.
 */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        if (!wbc->wb)
                return 1;

        return __trace_wb_assign_cgroup(wbc->wb);
}
#else        /* CONFIG_CGROUP_WRITEBACK */

/* Without cgroup writeback every wb reports cgroup inode number 1. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return 1;
}

/* Without cgroup writeback every wbc reports cgroup inode number 1. */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        return 1;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */
#endif        /* CREATE_TRACE_POINTS */

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * Records the bdi device name and inode number of @inode, the cgroup inode
 * number derived from @wbc, and the caller-supplied @history value (printed
 * in hex).
 */
TRACE_EVENT(inode_foreign_history,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                 unsigned int history),

        TP_ARGS(inode, wbc, history),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        history)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
                __entry->history        = history;
        ),

        TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->cgroup_ino,
                __entry->history
        )
);

/*
 * Records the cgroup inode numbers of @old_wb and @new_wb together with the
 * caller-supplied @count.  The bdi device name is taken from @old_wb.
 */
TRACE_EVENT(inode_switch_wbs_queue,

        TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
                 unsigned int count),

        TP_ARGS(old_wb, new_wb, count),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                old_cgroup_ino)
                __field(ino_t,                new_cgroup_ino)
                __field(unsigned int,        count)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
                __entry->old_cgroup_ino        = __trace_wb_assign_cgroup(old_wb);
                __entry->new_cgroup_ino        = __trace_wb_assign_cgroup(new_wb);
                __entry->count                = count;
        ),

        TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
                __entry->name,
                (unsigned long)__entry->old_cgroup_ino,
                (unsigned long)__entry->new_cgroup_ino,
                __entry->count
        )
);

/*
 * Records the inode number of @inode and the cgroup inode numbers of
 * @old_wb and @new_wb.  The bdi device name is taken from @old_wb.
 */
TRACE_EVENT(inode_switch_wbs,

        TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
                 struct bdi_writeback *new_wb),

        TP_ARGS(inode, old_wb, new_wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                old_cgroup_ino)
                __field(ino_t,                new_cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->old_cgroup_ino        = __trace_wb_assign_cgroup(old_wb);
                __entry->new_cgroup_ino        = __trace_wb_assign_cgroup(new_wb);
        ),

        TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->old_cgroup_ino,
                (unsigned long)__entry->new_cgroup_ino
        )
);

/*
 * Records, for @folio and @wb: the bdi device name and id, the inode number
 * of the folio mapping's host (0 when there is no mapping or host),
 * wb->memcg_css->id, the cgroup inode number of @wb and the cgroup inode
 * number of the folio's memcg.
 *
 * NOTE(review): folio_memcg(folio) is dereferenced without a NULL check;
 * presumably callers only pass folios that have a memcg - confirm.
 */
TRACE_EVENT(track_foreign_dirty,

        TP_PROTO(struct folio *folio, struct bdi_writeback *wb),

        TP_ARGS(folio, wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(u64,                bdi_id)
                __field(ino_t,                ino)
                __field(unsigned int,        memcg_id)
                __field(ino_t,                cgroup_ino)
                __field(ino_t,                page_cgroup_ino)
        ),

        TP_fast_assign(
                struct address_space *mapping = folio_mapping(folio);
                struct inode *inode = mapping ? mapping->host : NULL;

                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->bdi_id                = wb->bdi->id;
                __entry->ino                = inode ? inode->i_ino : 0;
                __entry->memcg_id        = wb->memcg_css->id;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
        ),

        TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
                __entry->name,
                __entry->bdi_id,
                (unsigned long)__entry->ino,
                __entry->memcg_id,
                (unsigned long)__entry->cgroup_ino,
                (unsigned long)__entry->page_cgroup_ino
        )
);

/*
 * Records the bdi device name and cgroup inode number of @wb together with
 * the caller-supplied foreign bdi id and foreign memcg id.
 */
TRACE_EVENT(flush_foreign,

        TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
                 unsigned int frn_memcg_id),

        TP_ARGS(wb, frn_bdi_id, frn_memcg_id),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        frn_bdi_id)
                __field(unsigned int,        frn_memcg_id)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->frn_bdi_id        = frn_bdi_id;
                __entry->frn_memcg_id        = frn_memcg_id;
        ),

        TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u",
                __entry->name,
                (unsigned long)__entry->cgroup_ino,
                __entry->frn_bdi_id,
                __entry->frn_memcg_id
        )
);
#endif

/*
 * Event class for inode write events.
 *
 * Records the bdi device name and inode number of @inode, plus the
 * sync_mode and the cgroup inode number taken from @wbc.
 */
DECLARE_EVENT_CLASS(writeback_write_inode_template,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(int, sync_mode)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->sync_mode,
                (unsigned long)__entry->cgroup_ino
        )
);

/* Events built on writeback_write_inode_template; both take (inode, wbc). */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/*
 * Event class describing one wb_writeback_work item handled by @wb.
 *
 * Copies nr_pages, sync_mode, for_kupdate, range_cyclic, for_background and
 * reason out of @work; sb_dev is the superblock's s_dev, or 0 when work->sb
 * is NULL.  The reason is printed symbolically through WB_WORK_REASON and
 * sb_dev as major:minor.
 */
DECLARE_EVENT_CLASS(writeback_work_class,
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
        TP_ARGS(wb, work),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_pages)
                __field(dev_t, sb_dev)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, range_cyclic)
                __field(int, for_background)
                __field(int, reason)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->nr_pages = work->nr_pages;
                __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                __entry->sync_mode = work->sync_mode;
                __entry->for_kupdate = work->for_kupdate;
                __entry->range_cyclic = work->range_cyclic;
                __entry->for_background        = work->for_background;
                __entry->reason = work->reason;
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
                  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu",
                  __entry->name,
                  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                  __entry->nr_pages,
                  __entry->sync_mode,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  __entry->for_background,
                  __print_symbolic(__entry->reason, WB_WORK_REASON),
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * Shorthand for declaring an event of writeback_work_class: every instance
 * shares the (wb, work) prototype and differs only in its name.
 */
#define DEFINE_WRITEBACK_WORK_EVENT(name) \
DEFINE_EVENT(writeback_work_class, name, \
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
        TP_ARGS(wb, work))
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);

/* Records a single value, @pages_written, and prints it as a bare number. */
TRACE_EVENT(writeback_pages_written,
        TP_PROTO(long pages_written),
        TP_ARGS(pages_written),
        TP_STRUCT__entry(
                __field(long,                pages)
        ),
        TP_fast_assign(
                __entry->pages                = pages_written;
        ),
        TP_printk("%ld", __entry->pages)
);

/*
 * Event class for events that only identify a bdi_writeback: records the
 * bdi device name and the cgroup inode number of @wb.
 */
DECLARE_EVENT_CLASS(writeback_class,
        TP_PROTO(struct bdi_writeback *wb),
        TP_ARGS(wb),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * Shorthand for declaring an event of writeback_class; every instance takes
 * a single bdi_writeback pointer.
 */
#define DEFINE_WRITEBACK_EVENT(name) \
DEFINE_EVENT(writeback_class, name, \
        TP_PROTO(struct bdi_writeback *wb), \
        TP_ARGS(wb))

DEFINE_WRITEBACK_EVENT(writeback_wake_background);

/* Records only the device name of @bdi. */
TRACE_EVENT(writeback_bdi_register,
        TP_PROTO(struct backing_dev_info *bdi),
        TP_ARGS(bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
        ),
        TP_printk("bdi %s",
                __entry->name
        )
);

/*
 * Event class that snapshots a writeback_control.
 *
 * Copies nr_to_write, pages_skipped, sync_mode, for_kupdate, for_background
 * and range_cyclic from @wbc; range_start and range_end are stored after a
 * cast to long and printed in hex.  The bdi device name comes from @bdi and
 * the cgroup inode number from @wbc.
 */
DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
        TP_ARGS(wbc, bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_to_write)
                __field(long, pages_skipped)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, for_background)
                __field(int, range_cyclic)
                __field(long, range_start)
                __field(long, range_end)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->for_background        = wbc->for_background;
                __entry->range_cyclic        = wbc->range_cyclic;
                __entry->range_start        = (long)wbc->range_start;
                __entry->range_end        = (long)wbc->range_end;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d bgrd=%d "
                "cyclic=%d start=0x%lx end=0x%lx cgroup_ino=%lu",
                __entry->name,
                __entry->nr_to_write,
                __entry->pages_skipped,
                __entry->sync_mode,
                __entry->for_kupdate,
                __entry->for_background,
                __entry->range_cyclic,
                __entry->range_start,
                __entry->range_end,
                (unsigned long)__entry->cgroup_ino
        )
)

/*
 * Shorthand for declaring an event of wbc_class; every instance shares the
 * (wbc, bdi) prototype.
 */
#define DEFINE_WBC_EVENT(name) \
DEFINE_EVENT(wbc_class, name, \
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
        TP_ARGS(wbc, bdi))
DEFINE_WBC_EVENT(wbc_writepage);

/*
 * Records the bdi device name and cgroup inode number of @wb, the
 * @dirtied_before value as passed in ("older"), the distance from the
 * current jiffies to @dirtied_before converted to milliseconds ("age"),
 * the caller-supplied @moved count (printed as "enqueue") and the reason
 * taken from @work.
 */
TRACE_EVENT(writeback_queue_io,
        TP_PROTO(struct bdi_writeback *wb,
                 struct wb_writeback_work *work,
                 unsigned long dirtied_before,
                 int moved),
        TP_ARGS(wb, work, dirtied_before, moved),
        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(unsigned long,        older)
                __field(long,                age)
                __field(int,                moved)
                __field(int,                reason)
                __field(ino_t,                cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->older        = dirtied_before;
                __entry->age        = (jiffies - dirtied_before) * 1000 / HZ;
                __entry->moved        = moved;
                __entry->reason        = work->reason;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
                __entry->name,
                __entry->older,        /* dirtied_before in jiffies */
                __entry->age,        /* dirtied_before in relative milliseconds */
                __entry->moved,
                __print_symbolic(__entry->reason, WB_WORK_REASON),
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * Snapshot of global dirty/writeback accounting.
 *
 * The two thresholds are passed in by the caller.  The remaining fields are
 * sampled when the event is recorded: the NR_FILE_DIRTY, NR_WRITEBACK,
 * NR_DIRTIED and NR_WRITTEN node counters via global_node_page_state(),
 * and global_wb_domain.dirty_limit.
 */
TRACE_EVENT(global_dirty_state,

        TP_PROTO(unsigned long background_thresh,
                 unsigned long dirty_thresh
        ),

        TP_ARGS(background_thresh,
                dirty_thresh
        ),

        TP_STRUCT__entry(
                __field(unsigned long,        nr_dirty)
                __field(unsigned long,        nr_writeback)
                __field(unsigned long,        background_thresh)
                __field(unsigned long,        dirty_thresh)
                __field(unsigned long,        dirty_limit)
                __field(unsigned long,        nr_dirtied)
                __field(unsigned long,        nr_written)
        ),

        TP_fast_assign(
                __entry->nr_dirty        = global_node_page_state(NR_FILE_DIRTY);
                __entry->nr_writeback        = global_node_page_state(NR_WRITEBACK);
                __entry->nr_dirtied        = global_node_page_state(NR_DIRTIED);
                __entry->nr_written        = global_node_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh        = dirty_thresh;
                __entry->dirty_limit        = global_wb_domain.dirty_limit;
        ),

        TP_printk("dirty=%lu writeback=%lu "
                  "bg_thresh=%lu thresh=%lu limit=%lu "
                  "dirtied=%lu written=%lu",
                  __entry->nr_dirty,
                  __entry->nr_writeback,
                  __entry->background_thresh,
                  __entry->dirty_thresh,
                  __entry->dirty_limit,
                  __entry->nr_dirtied,
                  __entry->nr_written
        )
);

/*
 * Scale @x by 2^(PAGE_SHIFT - 10) with a left shift.  The events below use
 * it on bandwidth and ratelimit values before storing them; presumably this
 * converts a pages-per-second figure to KB per second -- confirm units with
 * the callers.  Assumes PAGE_SHIFT >= 10.
 */
#define KBps(x)                        ((x) << (PAGE_SHIFT - 10))

/*
 * bdi_dirty_ratelimit - bandwidth and ratelimit values of one bdi_writeback.
 *
 * Every numeric value is passed through KBps() before being stored.  The
 * device name is copied into a fixed 32-byte, zero-padded buffer, so longer
 * names are truncated.
 *
 * @wb:             writeback context; supplies the bdi name, write_bandwidth,
 *                  avg_write_bandwidth, dirty_ratelimit and
 *                  balanced_dirty_ratelimit
 * @dirty_rate:     dirty rate supplied by the caller
 * @task_ratelimit: task ratelimit supplied by the caller
 */
TRACE_EVENT(bdi_dirty_ratelimit,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long dirty_rate,
                 unsigned long task_ratelimit),

        TP_ARGS(wb, dirty_rate, task_ratelimit),

        TP_STRUCT__entry(
                __array(char,                bdi, 32)
                __field(unsigned long,        write_bw)
                __field(unsigned long,        avg_write_bw)
                __field(unsigned long,        dirty_rate)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned long,        balanced_dirty_ratelimit)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                /* 32 must match the __array() size declared above. */
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
                __entry->write_bw        = KBps(wb->write_bandwidth);
                __entry->avg_write_bw        = KBps(wb->avg_write_bandwidth);
                __entry->dirty_rate        = KBps(dirty_rate);
                __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
                                        KBps(wb->balanced_dirty_ratelimit);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),

        TP_printk("bdi %s: "
                  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "balanced_dirty_ratelimit=%lu cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->write_bw,                /* write bandwidth */
                  __entry->avg_write_bw,        /* avg write bandwidth */
                  __entry->dirty_rate,                /* bdi dirty rate */
                  __entry->dirty_ratelimit,        /* base ratelimit */
                  __entry->task_ratelimit, /* ratelimit with position control */
                  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
                  (unsigned long)__entry->cgroup_ino /* ino_t cast to match %lu */
        )
);

/*
 * balance_dirty_pages - state of one dirty-throttling decision.
 *
 * Besides copying values from @dtc and the arguments, the event derives:
 *   freerun     = (thresh + bg_thresh) / 2        (local, not stored)
 *   setpoint    = (limit + freerun) / 2
 *   wb_setpoint = setpoint * wb_thresh / (thresh + 1)
 * Ratelimits go through KBps(); jiffies-based durations are converted to
 * milliseconds with "* 1000 / HZ".
 *
 * @wb:              writeback context (bdi name, cgroup)
 * @dtc:             throttle control supplying limit, thresh, bg_thresh,
 *                   dirty, wb_thresh and wb_dirty
 * @dirty_ratelimit: base ratelimit
 * @task_ratelimit:  task ratelimit
 * @dirtied:         dirtied count supplied by the caller
 * @period:          period in jiffies
 * @pause:           pause in jiffies (signed)
 * @start_time:      jiffies value used to compute "paused"
 *
 * NOTE(review): @dirtied is an unsigned long but is stored in an
 * unsigned int field, so large values would be truncated -- confirm this is
 * acceptable for the callers' value range.
 */
TRACE_EVENT(balance_dirty_pages,

        TP_PROTO(struct bdi_writeback *wb,
                 struct dirty_throttle_control *dtc,
                 unsigned long dirty_ratelimit,
                 unsigned long task_ratelimit,
                 unsigned long dirtied,
                 unsigned long period,
                 long pause,
                 unsigned long start_time),

        TP_ARGS(wb, dtc,
                dirty_ratelimit, task_ratelimit,
                dirtied, period, pause, start_time),

        TP_STRUCT__entry(
                __array(         char,        bdi, 32)
                __field(unsigned long,        limit)
                __field(unsigned long,        setpoint)
                __field(unsigned long,        dirty)
                __field(unsigned long,        wb_setpoint)
                __field(unsigned long,        wb_dirty)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned int,        dirtied)
                __field(unsigned int,        dirtied_pause)
                __field(unsigned long,        paused)
                __field(         long,        pause)
                __field(unsigned long,        period)
                __field(         long,        think)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                /* Midpoint between the dirty and background thresholds. */
                unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2;
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);

                __entry->limit                = dtc->limit;
                __entry->setpoint        = (dtc->limit + freerun) / 2;
                __entry->dirty                = dtc->dirty;
                /* The "+ 1" keeps the divisor non-zero when thresh is 0. */
                __entry->wb_setpoint        = __entry->setpoint *
                                                dtc->wb_thresh / (dtc->thresh + 1);
                __entry->wb_dirty        = dtc->wb_dirty;
                __entry->dirty_ratelimit = KBps(dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->dirtied        = dirtied;
                __entry->dirtied_pause        = current->nr_dirtied_pause;
                /* A dirty_paused_when of 0 is reported as a think time of 0. */
                __entry->think                = current->dirty_paused_when == 0 ? 0 :
                         (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
                __entry->period                = period * 1000 / HZ;
                __entry->pause                = pause * 1000 / HZ;
                /* Time elapsed since start_time, in ms. */
                __entry->paused                = (jiffies - start_time) * 1000 / HZ;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),


        TP_printk("bdi %s: "
                  "limit=%lu setpoint=%lu dirty=%lu "
                  "wb_setpoint=%lu wb_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
                  "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
                  __entry->dirty,
                  __entry->wb_setpoint,
                  __entry->wb_dirty,
                  __entry->dirty_ratelimit,
                  __entry->task_ratelimit,
                  __entry->dirtied,
                  __entry->dirtied_pause,
                  __entry->paused,        /* ms */
                  __entry->pause,        /* ms */
                  __entry->period,        /* ms */
                  __entry->think,        /* ms */
                  (unsigned long)__entry->cgroup_ino
          )
);

/*
 * writeback_sb_inodes_requeue - records an inode's identity and dirty state.
 *
 * Stores the bdi device name (32-byte padded copy), inode number, i_state,
 * dirtied_when and the cgroup inode of the inode's bdi_writeback.
 *
 * @inode: inode being reported
 *
 * NOTE(review): "age" is not stored in the record; it is computed from
 * jiffies inside TP_printk(), so it presumably reflects the time the record
 * is formatted rather than the time it was captured -- confirm.
 */
TRACE_EVENT(writeback_sb_inodes_requeue,

        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(inode_to_wb(inode));
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ, /* seconds since dirtied_when */
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_single_inode_template - event class for single-inode writeback.
 *
 * Captures the inode's identity and dirty state, the mapping's
 * writeback_index, the caller-supplied @nr_to_write, and
 * "wrote" = @nr_to_write - wbc->nr_to_write.
 *
 * @inode:       inode being written back
 * @wbc:         writeback control; supplies nr_to_write for the "wrote"
 *               calculation and the cgroup via __trace_wbc_assign_cgroup()
 * @nr_to_write: value compared against wbc->nr_to_write; presumably the
 *               budget before the write started -- confirm at call sites
 *
 * NOTE(review): "age" is computed from jiffies inside TP_printk(), so it
 * presumably reflects formatting time, not capture time -- confirm.
 */
DECLARE_EVENT_CLASS(writeback_single_inode_template,

        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write
        ),

        TP_ARGS(inode, wbc, nr_to_write),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(unsigned long, writeback_index)
                __field(long, nr_to_write)
                __field(unsigned long, wrote)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->nr_to_write        = nr_to_write;
                /* Difference between the passed-in value and what wbc still holds. */
                __entry->wrote                = nr_to_write - wbc->nr_to_write;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
                  "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ, /* seconds since dirtied_when */
                  __entry->writeback_index,
                  __entry->nr_to_write,
                  __entry->wrote,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_single_inode_start - instance of writeback_single_inode_template.
 * Same arguments and record layout as the class.
 */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * writeback_single_inode - instance of writeback_single_inode_template.
 * Same arguments and record layout as the class.
 */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * writeback_inode_template - event class recording basic inode state.
 *
 * Captures the superblock device number, inode number, i_state, i_mode
 * (stored in a __u16 field) and dirtied_when.  The device is printed as
 * "major,minor" and the mode in octal.
 *
 * @inode: inode being reported
 */
DECLARE_EVENT_CLASS(writeback_inode_template,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        state                        )
                __field(        __u16, mode                        )
                __field(unsigned long, dirtied_when                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->state        = inode->i_state;
                __entry->mode        = inode->i_mode;
                __entry->dirtied_when = inode->dirtied_when;
        ),

        /* Note: dirtied_when is printed before state, unlike the struct order. */
        TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long)__entry->ino, __entry->dirtied_when,
                  show_inode_state(__entry->state), __entry->mode)
);

/* writeback_lazytime - instance of writeback_inode_template; takes the inode to report. */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* writeback_lazytime_iput - instance of writeback_inode_template; takes the inode to report. */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* writeback_dirty_inode_enqueue - instance of writeback_inode_template; takes the inode to report. */
DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * Inode writeback list tracking.
 */

/* sb_mark_inode_writeback - instance of writeback_inode_template; takes the inode to report. */
DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

/* sb_clear_inode_writeback - instance of writeback_inode_template; takes the inode to report. */
DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

#endif /* _TRACE_WRITEBACK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




































































































































































































































  311 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  311 








  313 










  314 
  316 
  316 
  315 



  314 











































































































































































































































































































































































































































































































  314 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   13 


































































































































































































































































































































































































































































































































































  318 








  318 

  318 

















































































































































































































































































































































  318 















































































































































  265 






  268 
  266 


























































































  316 






  315 
  315 









  316 


  319 
  316 

  313 





































































































































































































































































































































































































































































































































































































































































































































  312 


















  318 
  317 

  314 

  313 





  318 



    1 













  316 



  314 


    4 




    3 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux hook function implementations.
 *
 *  Authors:  Stephen Smalley, <stephen.smalley.work@gmail.com>
 *              Chris Vance, <cvance@nai.com>
 *              Wayne Salamon, <wsalamon@nai.com>
 *              James Morris <jmorris@redhat.com>
 *
 *  Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 *  Copyright (C) 2003-2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *                                           Eric Paris <eparis@redhat.com>
 *  Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 *                            <dgoeddel@trustedcs.com>
 *  Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P.
 *        Paul Moore <paul@paul-moore.com>
 *  Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
 *                       Yuichi Nakamura <ynakam@hitachisoft.jp>
 *  Copyright (C) 2016 Mellanox Technologies
 */

#include <linux/init.h>
#include <linux/kd.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/lsm_hooks.h>
#include <linux/xattr.h>
#include <linux/capability.h>
#include <linux/unistd.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/dcache.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/tty.h>
#include <net/icmp.h>
#include <net/ip.h>                /* for local_port_range[] */
#include <net/tcp.h>                /* struct or_callable used in sock_rcv_skb */
#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/netlabel.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>        /* for network interface checks */
#include <net/netlink.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <net/sctp/structs.h>
#include <linux/quota.h>
#include <linux/un.h>                /* for Unix socket types */
#include <net/af_unix.h>        /* for Unix socket types */
#include <linux/parser.h>
#include <linux/nfs_mount.h>
#include <net/ipv6.h>
#include <linux/hugetlb.h>
#include <linux/personality.h>
#include <linux/audit.h>
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/posix-timers.h>
#include <linux/syslog.h>
#include <linux/user_namespace.h>
#include <linux/export.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <uapi/linux/shm.h>
#include <linux/bpf.h>
#include <linux/kernfs.h>
#include <linux/stringhash.h>        /* for hashlen_string() */
#include <uapi/linux/mount.h>
#include <linux/fsnotify.h>
#include <linux/fanotify.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/lsm.h>

#include "avc.h"
#include "objsec.h"
#include "netif.h"
#include "netnode.h"
#include "netport.h"
#include "ibpkey.h"
#include "xfrm.h"
#include "netlabel.h"
#include "audit.h"
#include "avc_ss.h"

#define SELINUX_INODE_INIT_XATTRS 1

struct selinux_state selinux_state;

/* SECMARK reference count */
static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);

#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
static int selinux_enforcing_boot __initdata;

/*
 * Handler for the "enforcing=" boot parameter.  A value that kstrtoul()
 * accepts is collapsed to 0 or 1 and stored in selinux_enforcing_boot; a
 * value it rejects leaves the setting untouched.  Always returns 1.
 */
static int __init enforcing_setup(char *str)
{
        unsigned long val;
        int rc;

        rc = kstrtoul(str, 0, &val);
        if (rc == 0)
                selinux_enforcing_boot = (val != 0);

        return 1;
}
__setup("enforcing=", enforcing_setup);
#else
#define selinux_enforcing_boot 1
#endif

int selinux_enabled_boot __initdata = 1;
#ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM
/*
 * Handler for the "selinux=" boot parameter.  A value that kstrtoul()
 * accepts is normalized to 0 or 1 and stored in selinux_enabled_boot; a
 * value it rejects leaves the setting untouched.  Always returns 1.
 */
static int __init selinux_enabled_setup(char *str)
{
        unsigned long val;
        int rc;

        rc = kstrtoul(str, 0, &val);
        if (rc == 0)
                selinux_enabled_boot = !!val;

        return 1;
}
__setup("selinux=", selinux_enabled_setup);
#endif

/*
 * Handler for the "checkreqprot=" boot parameter.  No state is changed:
 * if the value parses and is non-zero an error is logged saying the
 * setting is no longer supported.  Always returns 1.
 */
static int __init checkreqprot_setup(char *str)
{
        unsigned long val;

        /* val is only read when kstrtoul() succeeded (short-circuit). */
        if (kstrtoul(str, 0, &val) == 0 && val != 0)
                pr_err("SELinux: checkreqprot set to 1 via kernel parameter.  This is no longer supported.\n");

        return 1;
}
__setup("checkreqprot=", checkreqprot_setup);

/**
 * selinux_secmark_enabled - Report whether SECMARK is to be treated as enabled
 *
 * Description:
 * SECMARK is considered enabled when the always_check_network policy
 * capability is set or, failing that, when the SECMARK reference counter
 * is non-zero.  Returns 1 if SECMARK is enabled and 0 if it is disabled.
 *
 */
static int selinux_secmark_enabled(void)
{
        /* The policy capability overrides the reference counter. */
        if (selinux_policycap_alwaysnetwork())
                return 1;

        return atomic_read(&selinux_secmark_refcount) != 0;
}

/**
 * selinux_peerlbl_enabled - Report whether peer labeling is to be treated as enabled
 *
 * Description:
 * Peer labeling is considered enabled when the always_check_network policy
 * capability is set, or when NetLabel or labeled IPSEC is enabled.  The
 * checks are made in that order and stop at the first one that succeeds.
 * Returns 1 if peer labeling is enabled and 0 otherwise.
 *
 */
static int selinux_peerlbl_enabled(void)
{
        if (selinux_policycap_alwaysnetwork())
                return 1;

        if (netlbl_enabled())
                return 1;

        return selinux_xfrm_enabled() != 0;
}

/*
 * AVC callback: on AVC_CALLBACK_RESET, run the netif, netnode and netport
 * flush routines and then call synchronize_net().  Any other event is
 * ignored.  Always returns 0.
 */
static int selinux_netcache_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        sel_netif_flush();
        sel_netnode_flush();
        sel_netport_flush();
        synchronize_net();

        return 0;
}

/*
 * AVC callback: on AVC_CALLBACK_RESET, run the InfiniBand pkey flush
 * routine and raise the LSM_POLICY_CHANGE blocking notifier (with no
 * payload).  Any other event is ignored.  Always returns 0.
 */
static int selinux_lsm_notifier_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        sel_ib_pkey_flush();
        call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL);

        return 0;
}

/*
 * Set up the SELinux credential state of the init task: both the sid and
 * the osid of current's real credentials become SECINITSID_KERNEL.
 */
static void cred_init_security(void)
{
        /* NOTE: the lsm framework zeros out the buffer on allocation */
        struct cred_security_struct *init_crsec =
                selinux_cred(unrcu_pointer(current->real_cred));

        init_crsec->sid = SECINITSID_KERNEL;
        init_crsec->osid = SECINITSID_KERNEL;
}

/*
 * get the security ID of a set of credentials
 */
static inline u32 cred_sid(const struct cred *cred)
{
        const struct cred_security_struct *crsec;

        crsec = selinux_cred(cred);
        return crsec->sid;
}

/*
 * Common initializer for network audit data: record @ifindex, @sk and
 * @family in @net, then mark @ad as LSM_AUDIT_DATA_NET and point it at
 * @net.
 */
static void __ad_net_init(struct common_audit_data *ad,
                          struct lsm_network_audit *net,
                          int ifindex, struct sock *sk, u16 family)
{
        /* Populate the network record ... */
        net->family = family;
        net->sk = sk;
        net->netif = ifindex;

        /* ... and hook it into the audit data. */
        ad->u.net = net;
        ad->type = LSM_AUDIT_DATA_NET;
}

static void ad_net_init_from_sk(struct common_audit_data *ad,
                                struct lsm_network_audit *net,
                                struct sock *sk)
{
        __ad_net_init(ad, net, 0, sk, 0);
}

/*
 * Initialise network audit data from an interface index and address
 * family; no socket is recorded (NULL is passed).
 */
static void ad_net_init_from_iif(struct common_audit_data *ad,
                                 struct lsm_network_audit *net,
                                 int ifindex, u16 family)
{
        struct sock *no_sk = NULL;

        __ad_net_init(ad, net, ifindex, no_sk, family);
}

/*
 * Return the SID of @task's credentials as obtained through __task_cred().
 * The credentials are only dereferenced inside an RCU read-side critical
 * section.
 */
static inline u32 task_sid_obj(const struct task_struct *task)
{
        const struct cred *tcred;
        u32 objsid;

        rcu_read_lock();
        tcred = __task_cred(task);
        objsid = cred_sid(tcred);
        rcu_read_unlock();

        return objsid;
}

static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry);

/*
 * Try reloading the security label of an inode whose label is not valid.
 *
 * @inode:     inode whose label should be reloaded
 * @dentry:    a dentry of @inode, or NULL if the caller has none
 * @may_sleep: whether the caller is allowed to sleep
 *
 * Returns 0 without doing anything if SELinux is not initialized yet.
 * Returns -ECHILD if @may_sleep is false, since reloading cannot be done
 * without sleeping.  Otherwise the label reload is attempted and 0 is
 * returned regardless of its outcome.
 */
static int __inode_security_revalidate(struct inode *inode,
                                       struct dentry *dentry,
                                       bool may_sleep)
{
        if (!selinux_initialized())
                return 0;

        if (!may_sleep)
                return -ECHILD;
        might_sleep();

        /*
         * Try to reload the inode security label.  This will fail if
         * @dentry is NULL and no dentry for this inode can be found; the
         * result is deliberately ignored so that the old label keeps
         * being used in that case.
         */
        inode_doinit_with_dentry(inode, dentry);

        return 0;
}

/*
 * Return the SELinux security blob of @inode without checking whether its
 * label has been initialized.
 */
static struct inode_security_struct *inode_security_novalidate(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        return isec;
}

/*
 * Get the security blob of @inode, revalidating the label if it is not
 * marked LABEL_INITIALIZED.  @rcu is inverted and passed as the may_sleep
 * argument of the revalidation.  Returns the blob on success or an
 * ERR_PTR() carrying the revalidation error.
 */
static inline struct inode_security_struct *inode_security_rcu(struct inode *inode,
                                                               bool rcu)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        int err;

        /* check below is racy, but revalidate will recheck with lock held */
        if (!data_race(likely(isec->initialized == LABEL_INITIALIZED))) {
                err = __inode_security_revalidate(inode, NULL, !rcu);
                if (err)
                        return ERR_PTR(err);
        }

        return isec;
}

/*
 * Get the security label of an inode, attempting a (possibly sleeping)
 * revalidation first if the label is not marked LABEL_INITIALIZED.  The
 * blob is returned whether or not the revalidation succeeded.
 */
static inline struct inode_security_struct *inode_security(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        /* check below is racy, but revalidate will recheck with lock held */
        if (!data_race(likely(isec->initialized == LABEL_INITIALIZED)))
                __inode_security_revalidate(inode, NULL, true);

        return isec;
}

/*
 * Return the security blob of the backing inode of @dentry without
 * checking whether its label has been initialized.
 */
static inline struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry)
{
        struct inode *backing = d_backing_inode(dentry);

        return selinux_inode(backing);
}

/*
 * Get the security label of a dentry's backing inode, attempting a
 * (possibly sleeping) revalidation with @dentry first if the label is not
 * marked LABEL_INITIALIZED.  The blob is returned whether or not the
 * revalidation succeeded.
 */
static inline struct inode_security_struct *backing_inode_security(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec = selinux_inode(inode);

        /* check below is racy, but revalidate will recheck with lock held */
        if (!data_race(likely(isec->initialized == LABEL_INITIALIZED)))
                __inode_security_revalidate(inode, dentry, true);

        return isec;
}

/*
 * Detach the security blob of @inode from its superblock's list of inode
 * security structures, if it is on that list.  Does nothing when the inode
 * has no security blob.
 */
static void inode_free_security(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        struct superblock_security_struct *sbsec;

        if (!isec)
                return;

        sbsec = selinux_superblock(inode->i_sb);

        /*
         * Not every inode security structure is on a list, so test for an
         * empty list before taking the lock to avoid locking for nothing.
         *
         * list_del_init() may safely be called more than once.  This
         * function should never run concurrently with list_add(), but
         * list_empty_careful() is used anyway as a defence against future
         * changes in the code.
         */
        if (list_empty_careful(&isec->list))
                return;

        spin_lock(&sbsec->isec_lock);
        list_del_init(&isec->list);
        spin_unlock(&sbsec->isec_lock);
}

/*
 * Parsed SELinux mount options.  Each field holds the SID resolved from the
 * corresponding mount option string; a value of 0 means the option was not
 * given (the structure is zero-allocated and fields are tested for non-zero
 * before use).
 */
struct selinux_mnt_opts {
        u32 fscontext_sid;      /* SID from the "fscontext=" option */
        u32 context_sid;        /* SID from the "context=" option */
        u32 rootcontext_sid;    /* SID from the "rootcontext=" option */
        u32 defcontext_sid;     /* SID from the "defcontext=" option */
};

/*
 * Release a mount options blob previously allocated for SELinux.
 * Passing NULL is harmless since kfree() accepts it.
 */
static void selinux_free_mnt_opts(void *mnt_opts)
{
        struct selinux_mnt_opts *opts = mnt_opts;

        kfree(opts);
}

/*
 * Identifiers for the SELinux mount options.  match_opt_prefix() returns one
 * of these (Opt_error when nothing matches) and selinux_add_opt() dispatches
 * on them.
 */
enum {
        Opt_error = -1,         /* no known option matched */
        Opt_context = 0,
        Opt_defcontext = 1,
        Opt_fscontext = 2,
        Opt_rootcontext = 3,
        Opt_seclabel = 4,       /* takes no argument */
};

/*
 * Table of the recognised SELinux mount option names.  Every entry records
 * the option name, its length (without the terminating NUL), the matching
 * Opt_* identifier and whether the option carries an "=value" argument.
 *
 * The macro parameter is deliberately not called has_arg so that it does
 * not get substituted into the .has_arg designator.
 */
#define A(s, takes_arg) \
        { .name = #s, .len = sizeof(#s) - 1, .opt = Opt_##s, .has_arg = (takes_arg) }
static const struct {
        const char *name;
        int len;
        int opt;
        bool has_arg;
} tokens[] = {
        A(context, true),
        A(fscontext, true),
        A(defcontext, true),
        A(rootcontext, true),
        A(seclabel, false),
};
#undef A

/*
 * Match the option string @s of length @l against the tokens table.
 *
 * An option that takes an argument matches only when its name is followed
 * by '='; in that case *@arg is set to the first character after the '='.
 * An option without argument matches only when @s is exactly its name.
 *
 * Returns the Opt_* identifier of the matching entry, or Opt_error.
 */
static int match_opt_prefix(char *s, int l, char **arg)
{
        unsigned int idx;

        for (idx = 0; idx < ARRAY_SIZE(tokens); idx++) {
                size_t tlen = tokens[idx].len;

                /* the option name must be a prefix of the string */
                if (tlen > l)
                        continue;
                if (memcmp(s, tokens[idx].name, tlen) != 0)
                        continue;

                if (!tokens[idx].has_arg) {
                        /* argument-less options must match completely */
                        if (tlen != l)
                                continue;
                        return tokens[idx].opt;
                }

                if (tlen == l || s[tlen] != '=')
                        continue;
                *arg = s + tlen + 1;
                return tokens[idx].opt;
        }

        return Opt_error;
}

/* Warning logged by selinux_add_opt() when a mount option is repeated or conflicts with another one. */
#define SEL_MOUNT_FAIL_MSG "SELinux:  duplicate or incompatible mount options\n"

/*
 * Check whether @cred may relabel the superblock described by @sbsec to
 * @sid: FILESYSTEM__RELABELFROM is required on the current superblock SID
 * and FILESYSTEM__RELABELTO on the new one.
 *
 * Returns 0 if both checks pass, otherwise the first avc_has_perm() error.
 */
static int may_context_mount_sb_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
{
        const struct cred_security_struct *subj = selinux_cred(cred);
        int err;

        err = avc_has_perm(subj->sid, sbsec->sid, SECCLASS_FILESYSTEM,
                           FILESYSTEM__RELABELFROM, NULL);
        if (err)
                return err;

        return avc_has_perm(subj->sid, sid, SECCLASS_FILESYSTEM,
                            FILESYSTEM__RELABELTO, NULL);
}

/*
 * Check whether @cred may have inodes labeled @sid on the superblock
 * described by @sbsec: the task needs FILESYSTEM__RELABELFROM on the
 * superblock SID, and @sid needs FILESYSTEM__ASSOCIATE with it.
 *
 * Returns 0 if both checks pass, otherwise the first avc_has_perm() error.
 */
static int may_context_mount_inode_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
{
        const struct cred_security_struct *subj = selinux_cred(cred);
        int err;

        err = avc_has_perm(subj->sid, sbsec->sid, SECCLASS_FILESYSTEM,
                           FILESYSTEM__RELABELFROM, NULL);
        if (err)
                return err;

        return avc_has_perm(sid, sbsec->sid, SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE, NULL);
}

/*
 * Return 1 if the filesystem type of @sb is one of the genfs-labeled types
 * that also have an in-core setxattr handler, 0 otherwise.  cgroup/cgroup2
 * and functionfs only qualify when the corresponding policy capability is
 * enabled.
 */
static int selinux_is_genfs_special_handling(struct super_block *sb)
{
        const char *fstype = sb->s_type->name;

        /* Special handling. Genfs but also in-core setxattr handler */
        if (!strcmp(fstype, "sysfs") ||
            !strcmp(fstype, "pstore") ||
            !strcmp(fstype, "debugfs") ||
            !strcmp(fstype, "tracefs") ||
            !strcmp(fstype, "rootfs"))
                return 1;

        if (selinux_policycap_cgroupseclabel() &&
            (!strcmp(fstype, "cgroup") || !strcmp(fstype, "cgroup2")))
                return 1;

        if (selinux_policycap_functionfs_seclabel() &&
            !strcmp(fstype, "functionfs"))
                return 1;

        return 0;
}

/*
 * Decide from the labeling behavior of @sb whether the SBLABEL_MNT flag
 * applies to it.  Returns 1 for xattr, trans, task and native behaviors,
 * the result of selinux_is_genfs_special_handling() for genfs, and 0 for
 * everything else.
 */
static int selinux_is_sblabel_mnt(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        int labelable;

        /*
         * IMPORTANT: Double-check logic in this function when adding a new
         * SECURITY_FS_USE_* definition!
         */
        BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7);

        switch (sbsec->behavior) {
        case SECURITY_FS_USE_GENFS:
                labelable = selinux_is_genfs_special_handling(sb);
                break;

        case SECURITY_FS_USE_XATTR:
        case SECURITY_FS_USE_TRANS:
        case SECURITY_FS_USE_TASK:
        case SECURITY_FS_USE_NATIVE:
                labelable = 1;
                break;

        /* Never allow relabeling on context mounts */
        case SECURITY_FS_USE_MNTPOINT:
        case SECURITY_FS_USE_NONE:
        default:
                labelable = 0;
                break;
        }

        return labelable;
}

/*
 * Verify that the root directory of @sb supports the SELinux xattr.
 *
 * If the root inode has no xattr support, or getxattr reports -EOPNOTSUPP,
 * the superblock is switched to genfs labeling provided that a genfs SID
 * can be found for the filesystem type; otherwise -EOPNOTSUPP is returned.
 * Any other getxattr error except -ENODATA is returned as is.
 *
 * Returns 0 when xattr labeling is usable or the genfs fallback was set up.
 */
static int sb_check_xattr_support(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct dentry *root = sb->s_root;
        struct inode *root_inode = d_backing_inode(root);
        u32 genfs_sid;
        int rc;

        /*
         * Make sure that the xattr handler exists and that no
         * error other than -ENODATA is returned by getxattr on
         * the root directory.  -ENODATA is ok, as this may be
         * the first boot of the SELinux kernel before we have
         * assigned xattr values to the filesystem.
         */
        if (!(root_inode->i_opflags & IOP_XATTR)) {
                pr_warn("SELinux: (dev %s, type %s) has no xattr support\n",
                        sb->s_id, sb->s_type->name);
                goto fallback;
        }

        rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0);
        if (rc >= 0 || rc == -ENODATA)
                return 0;

        if (rc != -EOPNOTSUPP) {
                pr_warn("SELinux: (dev %s, type %s) getxattr errno %d\n",
                        sb->s_id, sb->s_type->name, -rc);
                return rc;
        }

        pr_warn("SELinux: (dev %s, type %s) has no security xattr handler\n",
                sb->s_id, sb->s_type->name);

fallback:
        /* No xattr support - try to fallback to genfs if possible. */
        if (security_genfs_sid(sb->s_type->name, "/", SECCLASS_DIR,
                               &genfs_sid))
                return -EOPNOTSUPP;

        pr_warn("SELinux: (dev %s, type %s) falling back to genfs\n",
                sb->s_id, sb->s_type->name);
        sbsec->behavior = SECURITY_FS_USE_GENFS;
        sbsec->sid = genfs_sid;

        return 0;
}

/*
 * Complete the SELinux setup of superblock @sb once its labeling behavior
 * and SIDs have been decided.
 *
 * For xattr labeling, xattr support is verified first and any error from
 * that check is returned before the superblock is marked initialized.
 * Otherwise the superblock is flagged SE_SBINITIALIZED, SBLABEL_MNT is set
 * or cleared, the root inode label is initialized and every inode queued on
 * sbsec->isec_head is initialized as well.
 *
 * Returns the result of initializing the root inode (or the xattr support
 * error); failures for the queued inodes are not reported.
 */
static int sb_finish_set_opts(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct dentry *root = sb->s_root;
        struct inode *root_inode = d_backing_inode(root);
        int rc = 0;

        if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
                rc = sb_check_xattr_support(sb);
                if (rc)
                        return rc;
        }

        sbsec->flags |= SE_SBINITIALIZED;

        /*
         * Explicitly set or clear SBLABEL_MNT.  It's not sufficient to simply
         * leave the flag untouched because sb_clone_mnt_opts might be handing
         * us a superblock that needs the flag to be cleared.
         */
        if (selinux_is_sblabel_mnt(sb))
                sbsec->flags |= SBLABEL_MNT;
        else
                sbsec->flags &= ~SBLABEL_MNT;

        /* Initialize the root inode. */
        rc = inode_doinit_with_dentry(root_inode, root);

        /* Initialize any other inodes associated with the superblock, e.g.
           inodes created prior to initial policy load or inodes created
           during get_sb by a pseudo filesystem that directly
           populates itself. */
        spin_lock(&sbsec->isec_lock);
        while (!list_empty(&sbsec->isec_head)) {
                struct inode_security_struct *isec =
                                list_first_entry(&sbsec->isec_head,
                                           struct inode_security_struct, list);
                struct inode *inode = isec->inode;
                /* unlink the entry while still holding isec_lock */
                list_del_init(&isec->list);
                /* the lock is dropped around the inode initialization below */
                spin_unlock(&sbsec->isec_lock);
                /* igrab() returns NULL if no reference could be taken; skip then */
                inode = igrab(inode);
                if (inode) {
                        if (!IS_PRIVATE(inode))
                                inode_doinit_with_dentry(inode, NULL);
                        iput(inode);
                }
                /* retake the lock before looking at the list head again */
                spin_lock(&sbsec->isec_lock);
        }
        spin_unlock(&sbsec->isec_lock);
        return rc;
}

/*
 * Check a mount option against the state already recorded in @sbsec.
 *
 * On an initialized superblock the option is bad unless the same option
 * @flag was used before and @old_sid equals @new_sid.  On a superblock that
 * is not yet initialized the option is bad if @flag has already been
 * recorded, i.e. the same option was passed twice (context=a,context=b).
 *
 * Returns 1 if the option is bad, 0 if it is acceptable.
 */
static int bad_option(struct superblock_security_struct *sbsec, char flag,
                      u32 old_sid, u32 new_sid)
{
        char mnt_flags = sbsec->flags & SE_MNTMASK;

        if (sbsec->flags & SE_SBINITIALIZED) {
                /* must repeat exactly what the earlier mount specified */
                if (!(sbsec->flags & flag))
                        return 1;
                return (old_sid != new_sid) ? 1 : 0;
        }

        /* first mount: reject an option that was already seen */
        return (mnt_flags & flag) ? 1 : 0;
}

/*
 * Allow filesystems with binary mount data to explicitly set mount point
 * labeling information.
 *
 * @sb:             superblock being mounted
 * @mnt_opts:       parsed options (struct selinux_mnt_opts) or NULL
 * @kern_flags:     SECURITY_LSM_* flags requested by the caller
 * @set_kern_flags: where granted flags are reported; must be non-NULL when
 *                  @kern_flags is non-zero
 *
 * Runs with sbsec->lock held.  Before SELinux is initialized, only a mount
 * without options is accepted and its setup is deferred.  On an already
 * initialized superblock the options are only compared with the recorded
 * ones.  Otherwise the labeling behavior is determined, the requested
 * contexts are permission-checked and recorded, and sb_finish_set_opts()
 * completes the setup.
 *
 * Returns 0 on success or a negative errno: -EINVAL for invalid flag usage,
 * options given too early, conflicting options on a reused superblock or
 * defcontext on an unsupported behavior; -EACCES for contexts on a
 * non-whitelisted user namespace mount; or the error of a failed helper.
 */
static int selinux_set_mnt_opts(struct super_block *sb,
                                void *mnt_opts,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        const struct cred *cred = current_cred();
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct dentry *root = sb->s_root;
        struct selinux_mnt_opts *opts = mnt_opts;
        struct inode_security_struct *root_isec;
        u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0;
        u32 defcontext_sid = 0;
        int rc = 0;

        /*
         * Specifying internal flags without providing a place to
         * place the results is not allowed
         */
        if (kern_flags && !set_kern_flags)
                return -EINVAL;

        mutex_lock(&sbsec->lock);

        if (!selinux_initialized()) {
                if (!opts) {
                        /* Defer initialization until selinux_complete_init,
                           after the initial policy is loaded and the security
                           server is ready to handle calls. */
                        if (kern_flags & SECURITY_LSM_NATIVE_LABELS) {
                                sbsec->flags |= SE_SBNATIVE;
                                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
                        }
                        goto out;
                }
                rc = -EINVAL;
                pr_warn("SELinux: Unable to set superblock options "
                        "before the security server is initialized\n");
                goto out;
        }

        /*
         * Binary mount data FS will come through this function twice.  Once
         * from an explicit call and once from the generic calls from the vfs.
         * Since the generic VFS calls will not contain any security mount data
         * we need to skip the double mount verification.
         *
         * This does open a hole in which we will not notice if the first
         * mount using this sb set explicit options and a second mount using
         * this sb does not set any security options.  (The first options
         * will be used for both mounts)
         */
        if ((sbsec->flags & SE_SBINITIALIZED) && (sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)
            && !opts)
                goto out;

        root_isec = backing_inode_security_novalidate(root);

        /*
         * parse the mount options, check if they are valid sids.
         * also check if someone is trying to mount the same sb more
         * than once with different security options.
         */
        if (opts) {
                if (opts->fscontext_sid) {
                        fscontext_sid = opts->fscontext_sid;
                        if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid,
                                        fscontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= FSCONTEXT_MNT;
                }
                if (opts->context_sid) {
                        context_sid = opts->context_sid;
                        if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                                        context_sid))
                                goto out_double_mount;
                        sbsec->flags |= CONTEXT_MNT;
                }
                if (opts->rootcontext_sid) {
                        rootcontext_sid = opts->rootcontext_sid;
                        if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                                        rootcontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= ROOTCONTEXT_MNT;
                }
                if (opts->defcontext_sid) {
                        defcontext_sid = opts->defcontext_sid;
                        if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                                        defcontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= DEFCONTEXT_MNT;
                }
        }

        /* superblock already set up: the checks above were all that is needed */
        if (sbsec->flags & SE_SBINITIALIZED) {
                /* previously mounted with options, but not on this attempt? */
                if ((sbsec->flags & SE_MNTMASK) && !opts)
                        goto out_double_mount;
                rc = 0;
                goto out;
        }

        /* record per-filesystem-type flags based on the type name */
        if (strcmp(sb->s_type->name, "proc") == 0)
                sbsec->flags |= SE_SBPROC | SE_SBGENFS;

        if (!strcmp(sb->s_type->name, "debugfs") ||
            !strcmp(sb->s_type->name, "tracefs") ||
            !strcmp(sb->s_type->name, "binder") ||
            !strcmp(sb->s_type->name, "bpf") ||
            !strcmp(sb->s_type->name, "pstore") ||
            !strcmp(sb->s_type->name, "securityfs") ||
            (selinux_policycap_functionfs_seclabel() &&
             !strcmp(sb->s_type->name, "functionfs")))
                sbsec->flags |= SE_SBGENFS;

        if (!strcmp(sb->s_type->name, "sysfs") ||
            !strcmp(sb->s_type->name, "cgroup") ||
            !strcmp(sb->s_type->name, "cgroup2"))
                sbsec->flags |= SE_SBGENFS | SE_SBGENFS_XATTR;

        if (!sbsec->behavior) {
                /*
                 * Determine the labeling behavior to use for this
                 * filesystem type.
                 */
                rc = security_fs_use(sb);
                if (rc) {
                        pr_warn("%s: security_fs_use(%s) returned %d\n",
                                        __func__, sb->s_type->name, rc);
                        goto out;
                }
        }

        /*
         * If this is a user namespace mount and the filesystem type is not
         * explicitly whitelisted, then no contexts are allowed on the command
         * line and security labels must be ignored.
         */
        if (sb->s_user_ns != &init_user_ns &&
            strcmp(sb->s_type->name, "tmpfs") &&
            strcmp(sb->s_type->name, "ramfs") &&
            strcmp(sb->s_type->name, "devpts") &&
            strcmp(sb->s_type->name, "overlay")) {
                if (context_sid || fscontext_sid || rootcontext_sid ||
                    defcontext_sid) {
                        rc = -EACCES;
                        goto out;
                }
                if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
                        /* switch to mountpoint labeling with a computed SID */
                        sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
                        rc = security_transition_sid(current_sid(),
                                                     current_sid(),
                                                     SECCLASS_FILE, NULL,
                                                     &sbsec->mntpoint_sid);
                        if (rc)
                                goto out;
                }
                goto out_set_opts;
        }

        /* sets the context of the superblock for the fs being mounted. */
        if (fscontext_sid) {
                rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred);
                if (rc)
                        goto out;

                sbsec->sid = fscontext_sid;
        }

        /*
         * Switch to using mount point labeling behavior.
         * sets the label used on all file below the mountpoint, and will set
         * the superblock context if not already set.
         */
        if (sbsec->flags & SE_SBNATIVE) {
                /*
                 * This means we are initializing a superblock that has been
                 * mounted before the SELinux was initialized and the
                 * filesystem requested native labeling. We had already
                 * returned SECURITY_LSM_NATIVE_LABELS in *set_kern_flags
                 * in the original mount attempt, so now we just need to set
                 * the SECURITY_FS_USE_NATIVE behavior.
                 */
                sbsec->behavior = SECURITY_FS_USE_NATIVE;
        } else if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) {
                sbsec->behavior = SECURITY_FS_USE_NATIVE;
                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
        }

        if (context_sid) {
                if (!fscontext_sid) {
                        /* no fscontext=: context= also labels the superblock */
                        rc = may_context_mount_sb_relabel(context_sid, sbsec,
                                                          cred);
                        if (rc)
                                goto out;
                        sbsec->sid = context_sid;
                } else {
                        rc = may_context_mount_inode_relabel(context_sid, sbsec,
                                                             cred);
                        if (rc)
                                goto out;
                }
                /* without rootcontext=, the root inode takes the context= SID */
                if (!rootcontext_sid)
                        rootcontext_sid = context_sid;

                sbsec->mntpoint_sid = context_sid;
                sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
        }

        if (rootcontext_sid) {
                rc = may_context_mount_inode_relabel(rootcontext_sid, sbsec,
                                                     cred);
                if (rc)
                        goto out;

                root_isec->sid = rootcontext_sid;
                root_isec->initialized = LABEL_INITIALIZED;
        }

        if (defcontext_sid) {
                if (sbsec->behavior != SECURITY_FS_USE_XATTR &&
                        sbsec->behavior != SECURITY_FS_USE_NATIVE) {
                        rc = -EINVAL;
                        pr_warn("SELinux: defcontext option is "
                               "invalid for this filesystem type\n");
                        goto out;
                }

                /* the permission check is skipped when the default SID is unchanged */
                if (defcontext_sid != sbsec->def_sid) {
                        rc = may_context_mount_inode_relabel(defcontext_sid,
                                                             sbsec, cred);
                        if (rc)
                                goto out;
                }

                sbsec->def_sid = defcontext_sid;
        }

out_set_opts:
        rc = sb_finish_set_opts(sb);
out:
        mutex_unlock(&sbsec->lock);
        return rc;
out_double_mount:
        /* log the conflict, then leave through the common unlock path */
        rc = -EINVAL;
        pr_warn("SELinux: mount invalid.  Same superblock, different "
               "security settings for (dev %s, type %s)\n", sb->s_id,
               sb->s_type->name);
        goto out;
}

/*
 * Compare the SELinux mount settings of two superblocks.
 *
 * The mount option flags must be identical, and for every context option
 * that is set the corresponding SIDs must be equal.  The root inode labels
 * are only looked up when ROOTCONTEXT_MNT is set and everything else
 * already matched.
 *
 * Returns 0 if the settings match; otherwise logs a warning and returns
 * -EBUSY.
 */
static int selinux_cmp_sb_context(const struct super_block *oldsb,
                                    const struct super_block *newsb)
{
        struct superblock_security_struct *oldsec = selinux_superblock(oldsb);
        struct superblock_security_struct *newsec = selinux_superblock(newsb);
        char oldflags = oldsec->flags & SE_MNTMASK;
        char newflags = newsec->flags & SE_MNTMASK;
        bool differs;

        differs = (oldflags != newflags) ||
                  ((oldflags & FSCONTEXT_MNT) &&
                   oldsec->sid != newsec->sid) ||
                  ((oldflags & CONTEXT_MNT) &&
                   oldsec->mntpoint_sid != newsec->mntpoint_sid) ||
                  ((oldflags & DEFCONTEXT_MNT) &&
                   oldsec->def_sid != newsec->def_sid);

        if (!differs && (oldflags & ROOTCONTEXT_MNT)) {
                struct inode_security_struct *oldroot = backing_inode_security(oldsb->s_root);
                struct inode_security_struct *newroot = backing_inode_security(newsb->s_root);

                differs = oldroot->sid != newroot->sid;
        }

        if (!differs)
                return 0;

        pr_warn("SELinux: mount invalid.  Same superblock, "
                            "different security settings for (dev %s, "
                            "type %s)\n", newsb->s_id, newsb->s_type->name);
        return -EBUSY;
}

/*
 * Copy the SELinux mount settings of @oldsb to @newsb.
 *
 * @oldsb:          superblock to copy the settings from
 * @newsb:          superblock to set up
 * @kern_flags:     SECURITY_LSM_* flags requested by the caller
 * @set_kern_flags: where granted flags are reported; must be non-NULL when
 *                  @kern_flags is non-zero
 *
 * Runs with newsbsec->lock held, except for the context comparison done
 * when @newsb is already initialized.  Before SELinux is initialized only
 * the native-labels request is recorded.
 *
 * Returns 0 on success, -EINVAL for invalid flag usage, the result of
 * selinux_cmp_sb_context() when @newsb is already initialized, or the error
 * from security_fs_use().  The result of sb_finish_set_opts() is not
 * propagated.
 */
static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb,
                                        struct super_block *newsb,
                                        unsigned long kern_flags,
                                        unsigned long *set_kern_flags)
{
        int rc = 0;
        const struct superblock_security_struct *oldsbsec =
                                                selinux_superblock(oldsb);
        struct superblock_security_struct *newsbsec = selinux_superblock(newsb);

        /* which context options the old superblock was mounted with */
        int set_fscontext =        (oldsbsec->flags & FSCONTEXT_MNT);
        int set_context =        (oldsbsec->flags & CONTEXT_MNT);
        int set_rootcontext =        (oldsbsec->flags & ROOTCONTEXT_MNT);

        /*
         * Specifying internal flags without providing a place to
         * place the results is not allowed.
         */
        if (kern_flags && !set_kern_flags)
                return -EINVAL;

        mutex_lock(&newsbsec->lock);

        /*
         * if the parent was able to be mounted it clearly had no special lsm
         * mount options.  thus we can safely deal with this superblock later
         */
        if (!selinux_initialized()) {
                if (kern_flags & SECURITY_LSM_NATIVE_LABELS) {
                        newsbsec->flags |= SE_SBNATIVE;
                        *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
                }
                goto out;
        }

        /* how can we clone if the old one wasn't set up?? */
        BUG_ON(!(oldsbsec->flags & SE_SBINITIALIZED));

        /* if fs is reusing a sb, make sure that the contexts match */
        if (newsbsec->flags & SE_SBINITIALIZED) {
                /* the lock is released before the comparison is made */
                mutex_unlock(&newsbsec->lock);
                if ((kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context)
                        *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
                return selinux_cmp_sb_context(oldsb, newsb);
        }

        newsbsec->flags = oldsbsec->flags;

        newsbsec->sid = oldsbsec->sid;
        newsbsec->def_sid = oldsbsec->def_sid;
        newsbsec->behavior = oldsbsec->behavior;

        /*
         * The old superblock used native labeling but it is not requested
         * for the new one: determine the behavior afresh.
         */
        if (newsbsec->behavior == SECURITY_FS_USE_NATIVE &&
                !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) {
                rc = security_fs_use(newsb);
                if (rc)
                        goto out;
        }

        if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !set_context) {
                newsbsec->behavior = SECURITY_FS_USE_NATIVE;
                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
        }

        if (set_context) {
                u32 sid = oldsbsec->mntpoint_sid;

                /* context= also supplies the sb and root SIDs unless overridden */
                if (!set_fscontext)
                        newsbsec->sid = sid;
                if (!set_rootcontext) {
                        struct inode_security_struct *newisec = backing_inode_security(newsb->s_root);
                        newisec->sid = sid;
                }
                newsbsec->mntpoint_sid = sid;
        }
        if (set_rootcontext) {
                const struct inode_security_struct *oldisec = backing_inode_security(oldsb->s_root);
                struct inode_security_struct *newisec = backing_inode_security(newsb->s_root);

                newisec->sid = oldisec->sid;
        }

        /* NOTE: the return value of sb_finish_set_opts() is ignored here */
        sb_finish_set_opts(newsb);
out:
        mutex_unlock(&newsbsec->lock);
        return rc;
}

/*
 * Record one parsed mount option in the SELinux mount options blob.
 *
 * @token:    Opt_* identifier of the option
 * @s:        context string given as the option's argument
 * @mnt_opts: in/out pointer to the options blob; allocated here on first use
 *
 * Opt_seclabel is accepted and ignored.  context= and defcontext= exclude
 * each other, and no option may be given twice.
 *
 * Returns 0 on success, -EINVAL for a missing argument, an option given
 * before SELinux is initialized, a duplicate/incompatible option or an
 * unknown token, -ENOMEM if the blob cannot be allocated, or the error of
 * the context-to-SID conversion.
 *
 * NOTE: the caller is responsible for freeing the memory even if on error.
 */
static int selinux_add_opt(int token, const char *s, void **mnt_opts)
{
        struct selinux_mnt_opts *opts = *mnt_opts;
        bool conflict;
        u32 *dst_sid;
        int rc;

        if (token == Opt_seclabel)
                /* eaten and completely ignored */
                return 0;
        if (!s)
                return -EINVAL;

        if (!selinux_initialized()) {
                pr_warn("SELinux: Unable to set superblock options before the security server is initialized\n");
                return -EINVAL;
        }

        if (!opts) {
                opts = kzalloc(sizeof(*opts), GFP_KERNEL);
                if (!opts)
                        return -ENOMEM;
                *mnt_opts = opts;
        }

        /* pick the destination field and check for duplicates/conflicts */
        switch (token) {
        case Opt_context:
                conflict = opts->context_sid || opts->defcontext_sid;
                dst_sid = &opts->context_sid;
                break;
        case Opt_fscontext:
                conflict = opts->fscontext_sid;
                dst_sid = &opts->fscontext_sid;
                break;
        case Opt_rootcontext:
                conflict = opts->rootcontext_sid;
                dst_sid = &opts->rootcontext_sid;
                break;
        case Opt_defcontext:
                conflict = opts->context_sid || opts->defcontext_sid;
                dst_sid = &opts->defcontext_sid;
                break;
        default:
                WARN_ON(1);
                return -EINVAL;
        }

        if (conflict) {
                pr_warn(SEL_MOUNT_FAIL_MSG);
                return -EINVAL;
        }

        rc = security_context_str_to_sid(s, dst_sid, GFP_KERNEL);
        if (rc)
                pr_warn("SELinux: security_context_str_to_sid (%s) failed with errno=%d\n",
                        s, rc);
        return rc;
}

/*
 * Write "=<context>" for @sid to the seq_file @m.  The context string is
 * escaped for '"', newline and backslash, and wrapped in double quotes when
 * it contains a comma.  Nothing is written if the SID cannot be converted.
 *
 * Returns 0 on success or the error from security_sid_to_context().
 */
static int show_sid(struct seq_file *m, u32 sid)
{
        char *ctx = NULL;
        bool quote;
        u32 ctx_len;
        int rc;

        rc = security_sid_to_context(sid, &ctx, &ctx_len);
        if (rc) {
                kfree(ctx);
                return rc;
        }

        quote = strchr(ctx, ',') != NULL;

        seq_putc(m, '=');
        if (quote)
                seq_putc(m, '\"');
        seq_escape(m, ctx, "\"\n\\");
        if (quote)
                seq_putc(m, '\"');

        kfree(ctx);
        return 0;
}

static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        int rc;

        if (!(sbsec->flags & SE_SBINITIALIZED))
                return 0;

        if (!selinux_initialized())
                return 0;

        if (sbsec->flags & FSCONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, FSCONTEXT_STR);
                rc = show_sid(m, sbsec->sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & CONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, CONTEXT_STR);
                rc = show_sid(m, sbsec->mntpoint_sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & DEFCONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, DEFCONTEXT_STR);
                rc = show_sid(m, sbsec->def_sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & ROOTCONTEXT_MNT) {
                struct dentry *root = sb->s_root;
                struct inode_security_struct *isec = backing_inode_security(root);
                seq_putc(m, ',');
                seq_puts(m, ROOTCONTEXT_STR);
                rc = show_sid(m, isec->sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & SBLABEL_MNT) {
                seq_putc(m, ',');
                seq_puts(m, SECLABEL_STR);
        }
        return 0;
}

/*
 * Translate the file-type bits of an inode mode into the matching
 * SELinux file security class.  Regular files, and any type that is
 * not listed, map to SECCLASS_FILE.
 */
static inline u16 inode_mode_to_security_class(umode_t mode)
{
        u16 sclass;

        switch (mode & S_IFMT) {
        case S_IFDIR:
                sclass = SECCLASS_DIR;
                break;
        case S_IFLNK:
                sclass = SECCLASS_LNK_FILE;
                break;
        case S_IFCHR:
                sclass = SECCLASS_CHR_FILE;
                break;
        case S_IFBLK:
                sclass = SECCLASS_BLK_FILE;
                break;
        case S_IFIFO:
                sclass = SECCLASS_FIFO_FILE;
                break;
        case S_IFSOCK:
                sclass = SECCLASS_SOCK_FILE;
                break;
        case S_IFREG:
        default:
                sclass = SECCLASS_FILE;
                break;
        }

        return sclass;
}

/*
 * Return 1 when @protocol is one of the protocols treated as a TCP-style
 * stream (IPPROTO_IP, IPPROTO_TCP or IPPROTO_MPTCP), 0 otherwise.
 */
static inline int default_protocol_stream(int protocol)
{
        switch (protocol) {
        case IPPROTO_IP:
        case IPPROTO_TCP:
        case IPPROTO_MPTCP:
                return 1;
        default:
                return 0;
        }
}

/*
 * Return 1 when @protocol is one of the protocols treated as a UDP-style
 * datagram protocol (IPPROTO_IP or IPPROTO_UDP), 0 otherwise.
 */
static inline int default_protocol_dgram(int protocol)
{
        switch (protocol) {
        case IPPROTO_IP:
        case IPPROTO_UDP:
                return 1;
        default:
                return 0;
        }
}

/*
 * Map a socket's (family, type, protocol) triple to the SELinux security
 * class used for it.
 *
 * PF_UNIX, PF_INET/PF_INET6, PF_NETLINK, PF_PACKET, PF_KEY and
 * PF_APPLETALK are always mapped by the first switch.  The SCTP and ICMP
 * classes, and every per-family class in the second switch, are only
 * used when selinux_policycap_extsockclass() reports true.  Anything not
 * matched falls back to the generic SECCLASS_SOCKET.
 */
static inline u16 socket_type_to_security_class(int family, int type, int protocol)
{
        bool extsockclass = selinux_policycap_extsockclass();

        switch (family) {
        case PF_UNIX:
                switch (type) {
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
                        return SECCLASS_UNIX_STREAM_SOCKET;
                case SOCK_DGRAM:
                case SOCK_RAW:
                        return SECCLASS_UNIX_DGRAM_SOCKET;
                }
                /* Any other PF_UNIX type ends up as SECCLASS_SOCKET below. */
                break;
        case PF_INET:
        case PF_INET6:
                switch (type) {
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
                        if (default_protocol_stream(protocol))
                                return SECCLASS_TCP_SOCKET;
                        else if (extsockclass && protocol == IPPROTO_SCTP)
                                return SECCLASS_SCTP_SOCKET;
                        else
                                return SECCLASS_RAWIP_SOCKET;
                case SOCK_DGRAM:
                        if (default_protocol_dgram(protocol))
                                return SECCLASS_UDP_SOCKET;
                        else if (extsockclass && (protocol == IPPROTO_ICMP ||
                                                  protocol == IPPROTO_ICMPV6))
                                return SECCLASS_ICMP_SOCKET;
                        else
                                return SECCLASS_RAWIP_SOCKET;
                default:
                        return SECCLASS_RAWIP_SOCKET;
                }
                /* Not reached: every path of the inner switch returns. */
                break;
        case PF_NETLINK:
                /* Netlink sockets are classified by protocol, not by type. */
                switch (protocol) {
                case NETLINK_ROUTE:
                        return SECCLASS_NETLINK_ROUTE_SOCKET;
                case NETLINK_SOCK_DIAG:
                        return SECCLASS_NETLINK_TCPDIAG_SOCKET;
                case NETLINK_NFLOG:
                        return SECCLASS_NETLINK_NFLOG_SOCKET;
                case NETLINK_XFRM:
                        return SECCLASS_NETLINK_XFRM_SOCKET;
                case NETLINK_SELINUX:
                        return SECCLASS_NETLINK_SELINUX_SOCKET;
                case NETLINK_ISCSI:
                        return SECCLASS_NETLINK_ISCSI_SOCKET;
                case NETLINK_AUDIT:
                        return SECCLASS_NETLINK_AUDIT_SOCKET;
                case NETLINK_FIB_LOOKUP:
                        return SECCLASS_NETLINK_FIB_LOOKUP_SOCKET;
                case NETLINK_CONNECTOR:
                        return SECCLASS_NETLINK_CONNECTOR_SOCKET;
                case NETLINK_NETFILTER:
                        return SECCLASS_NETLINK_NETFILTER_SOCKET;
                case NETLINK_DNRTMSG:
                        return SECCLASS_NETLINK_DNRT_SOCKET;
                case NETLINK_KOBJECT_UEVENT:
                        return SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET;
                case NETLINK_GENERIC:
                        return SECCLASS_NETLINK_GENERIC_SOCKET;
                case NETLINK_SCSITRANSPORT:
                        return SECCLASS_NETLINK_SCSITRANSPORT_SOCKET;
                case NETLINK_RDMA:
                        return SECCLASS_NETLINK_RDMA_SOCKET;
                case NETLINK_CRYPTO:
                        return SECCLASS_NETLINK_CRYPTO_SOCKET;
                default:
                        return SECCLASS_NETLINK_SOCKET;
                }
        case PF_PACKET:
                return SECCLASS_PACKET_SOCKET;
        case PF_KEY:
                return SECCLASS_KEY_SOCKET;
        case PF_APPLETALK:
                return SECCLASS_APPLETALK_SOCKET;
        }

        /* Per-family classes that are only used with the extsockclass policycap. */
        if (extsockclass) {
                switch (family) {
                case PF_AX25:
                        return SECCLASS_AX25_SOCKET;
                case PF_IPX:
                        return SECCLASS_IPX_SOCKET;
                case PF_NETROM:
                        return SECCLASS_NETROM_SOCKET;
                case PF_ATMPVC:
                        return SECCLASS_ATMPVC_SOCKET;
                case PF_X25:
                        return SECCLASS_X25_SOCKET;
                case PF_ROSE:
                        return SECCLASS_ROSE_SOCKET;
                case PF_DECnet:
                        return SECCLASS_DECNET_SOCKET;
                case PF_ATMSVC:
                        return SECCLASS_ATMSVC_SOCKET;
                case PF_RDS:
                        return SECCLASS_RDS_SOCKET;
                case PF_IRDA:
                        return SECCLASS_IRDA_SOCKET;
                case PF_PPPOX:
                        return SECCLASS_PPPOX_SOCKET;
                case PF_LLC:
                        return SECCLASS_LLC_SOCKET;
                case PF_CAN:
                        return SECCLASS_CAN_SOCKET;
                case PF_TIPC:
                        return SECCLASS_TIPC_SOCKET;
                case PF_BLUETOOTH:
                        return SECCLASS_BLUETOOTH_SOCKET;
                case PF_IUCV:
                        return SECCLASS_IUCV_SOCKET;
                case PF_RXRPC:
                        return SECCLASS_RXRPC_SOCKET;
                case PF_ISDN:
                        return SECCLASS_ISDN_SOCKET;
                case PF_PHONET:
                        return SECCLASS_PHONET_SOCKET;
                case PF_IEEE802154:
                        return SECCLASS_IEEE802154_SOCKET;
                case PF_CAIF:
                        return SECCLASS_CAIF_SOCKET;
                case PF_ALG:
                        return SECCLASS_ALG_SOCKET;
                case PF_NFC:
                        return SECCLASS_NFC_SOCKET;
                case PF_VSOCK:
                        return SECCLASS_VSOCK_SOCKET;
                case PF_KCM:
                        return SECCLASS_KCM_SOCKET;
                case PF_QIPCRTR:
                        return SECCLASS_QIPCRTR_SOCKET;
                case PF_SMC:
                        return SECCLASS_SMC_SOCKET;
                case PF_XDP:
                        return SECCLASS_XDP_SOCKET;
                case PF_MCTP:
                        return SECCLASS_MCTP_SOCKET;
/*
 * Build-time tripwire: fail the compile when PF_MAX grows so that this
 * table is revisited whenever a new address family is added.
 */
#if PF_MAX > 46
#error New address family defined, please update this function.
#endif
                }
        }

        /* Generic fallback for everything that was not matched above. */
        return SECCLASS_SOCKET;
}

/*
 * Compute the genfs SID for @dentry from its path inside the filesystem.
 *
 * The path is built in a temporary page and passed, together with the
 * filesystem type name and @tclass, to security_genfs_sid().  When
 * @flags contains SE_SBPROC the leading numeric component is stripped
 * first.  A -ENOENT result from the lookup is not an error: *@sid is
 * set to SECINITSID_UNLABELED and 0 is returned.
 *
 * Returns 0 on success, -ENOMEM if the page cannot be allocated, or the
 * error from dentry_path_raw() / security_genfs_sid().
 */
static int selinux_genfs_get_sid(struct dentry *dentry,
                                 u16 tclass,
                                 u16 flags,
                                 u32 *sid)
{
        char *page, *name;
        int err;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        name = dentry_path_raw(dentry, page, PAGE_SIZE);
        if (IS_ERR(name)) {
                err = PTR_ERR(name);
                goto out_free;
        }

        if (flags & SE_SBPROC) {
                /*
                 * Every process has a /proc/PID/ entry; drop the PID so
                 * the remainder is a valid labeling path,
                 * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs
                 */
                for (; name[1] >= '0' && name[1] <= '9'; name++)
                        name[1] = '/';
        }

        err = security_genfs_sid(dentry->d_sb->s_type->name,
                                 name, tclass, sid);
        if (err == -ENOENT) {
                /* Policy has no entry for this path: treat as unlabeled. */
                *sid = SECINITSID_UNLABELED;
                err = 0;
        }

out_free:
        free_page((unsigned long)page);
        return err;
}

/*
 * Read the SELinux xattr of @inode (through @dentry) and convert it to
 * a SID stored in *@sid.
 *
 * A buffer of INITCONTEXTLEN bytes is tried first; if the filesystem
 * answers -ERANGE the required size is queried and the read is retried
 * with a buffer of that size.
 *
 * Return value:
 *   0        - the xattr was read and handed to
 *              security_context_to_sid_default(), or the xattr does not
 *              exist (-ENODATA), in which case *@sid is set to @def_sid.
 *              A failing context conversion is only logged; 0 is still
 *              returned.
 *   -ENOMEM  - a context buffer could not be allocated.
 *   other <0 - error reported by __vfs_getxattr().
 */
static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry,
                                  u32 def_sid, u32 *sid)
{
#define INITCONTEXTLEN 255
        char *context;
        unsigned int len;
        int rc;

        len = INITCONTEXTLEN;
        context = kmalloc(len + 1, GFP_NOFS);
        if (!context)
                return -ENOMEM;

        /* Keep the buffer NUL-terminated so it can be printed with %s. */
        context[len] = '\0';
        rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len);
        if (rc == -ERANGE) {
                kfree(context);

                /* Need a larger buffer.  Query for the right size. */
                rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, NULL, 0);
                if (rc < 0)
                        return rc;

                len = rc;
                context = kmalloc(len + 1, GFP_NOFS);
                if (!context)
                        return -ENOMEM;

                context[len] = '\0';
                rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX,
                                    context, len);
        }
        if (rc < 0) {
                kfree(context);
                if (rc != -ENODATA) {
                        /* i_ino is unsigned: print it with %lu, not %ld. */
                        pr_warn("SELinux: %s:  getxattr returned %d for dev=%s ino=%lu\n",
                                __func__, -rc, inode->i_sb->s_id, inode->i_ino);
                        return rc;
                }
                /* No xattr on this inode: fall back to the default SID. */
                *sid = def_sid;
                return 0;
        }

        /* On success rc is the number of xattr bytes that were read. */
        rc = security_context_to_sid_default(context, rc, sid,
                                             def_sid, GFP_NOFS);
        if (rc) {
                char *dev = inode->i_sb->s_id;
                unsigned long ino = inode->i_ino;

                if (rc == -EINVAL) {
                        pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s.  This indicates you may need to relabel the inode or the filesystem in question.\n",
                                              ino, dev, context);
                } else {
                        pr_warn("SELinux: %s:  context_to_sid(%s) returned %d for dev=%s ino=%lu\n",
                                __func__, context, -rc, dev, ino);
                }
        }
        kfree(context);
        /* Conversion failures are reported above but not propagated. */
        return 0;
}

/*
 * The inode's security attributes must be initialized before first use.
 *
 * Compute and record the SID of @inode according to the labeling
 * behavior of its superblock.  @opt_dentry may be NULL; when a dentry is
 * needed and none was passed, an alias of the inode is looked up.
 *
 * State handling (isec->initialized, protected by isec->lock):
 *  - already LABEL_INITIALIZED: nothing to do, return 0;
 *  - superblock not yet SE_SBINITIALIZED: queue the inode on
 *    sbsec->isec_head and return 0 without labeling;
 *  - otherwise mark LABEL_PENDING, drop the lock, compute the SID, and
 *    commit it only if the state is still LABEL_PENDING.
 *
 * Returns 0 on success (including the deferred and "no dentry" cases) or
 * the error from the SID lookup, in which case the label is marked
 * LABEL_INVALID.
 */
static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry)
{
        struct superblock_security_struct *sbsec = NULL;
        struct inode_security_struct *isec = selinux_inode(inode);
        u32 task_sid, sid = 0;
        u16 sclass;
        struct dentry *dentry;
        int rc = 0;

        /* Unlocked fast path; rechecked below with the lock held. */
        if (isec->initialized == LABEL_INITIALIZED)
                return 0;

        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_INITIALIZED)
                goto out_unlock;

        /* Derive the class from the mode while it is still the default. */
        if (isec->sclass == SECCLASS_FILE)
                isec->sclass = inode_mode_to_security_class(inode->i_mode);

        sbsec = selinux_superblock(inode->i_sb);
        if (!(sbsec->flags & SE_SBINITIALIZED)) {
                /* Defer initialization until selinux_complete_init,
                   after the initial policy is loaded and the security
                   server is ready to handle calls. */
                spin_lock(&sbsec->isec_lock);
                if (list_empty(&isec->list))
                        list_add(&isec->list, &sbsec->isec_head);
                spin_unlock(&sbsec->isec_lock);
                goto out_unlock;
        }

        /*
         * Snapshot the fields needed below and mark the label as being
         * computed; isec->lock is not held across the lookup itself.
         */
        sclass = isec->sclass;
        task_sid = isec->task_sid;
        sid = isec->sid;
        isec->initialized = LABEL_PENDING;
        spin_unlock(&isec->lock);

        switch (sbsec->behavior) {
        /*
         * In case of SECURITY_FS_USE_NATIVE we need to re-fetch the labels
         * via xattr when called from delayed_superblock_init().
         */
        case SECURITY_FS_USE_NATIVE:
        case SECURITY_FS_USE_XATTR:
                /* Without xattr support use the superblock default SID. */
                if (!(inode->i_opflags & IOP_XATTR)) {
                        sid = sbsec->def_sid;
                        break;
                }
                /* Need a dentry, since the xattr API requires one.
                   Life would be simpler if we could just pass the inode. */
                if (opt_dentry) {
                        /* Called from d_instantiate or d_splice_alias. */
                        dentry = dget(opt_dentry);
                } else {
                        /*
                         * Called from selinux_complete_init, try to find a dentry.
                         * Some filesystems really want a connected one, so try
                         * that first.  We could split SECURITY_FS_USE_XATTR in
                         * two, depending upon that...
                         */
                        dentry = d_find_alias(inode);
                        if (!dentry)
                                dentry = d_find_any_alias(inode);
                }
                if (!dentry) {
                        /*
                         * This can be hit on boot when a file is accessed
                         * before the policy is loaded.  When we load policy we
                         * may find inodes that have no dentry on the
                         * sbsec->isec_head list.  No reason to complain as these
                         * will get fixed up the next time we go through
                         * inode_doinit with a dentry, before these inodes could
                         * be used again by userspace.
                         */
                        goto out_invalid;
                }

                rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid,
                                            &sid);
                /* Drop the reference taken by dget()/d_find_*alias(). */
                dput(dentry);
                if (rc)
                        goto out;
                break;
        case SECURITY_FS_USE_TASK:
                sid = task_sid;
                break;
        case SECURITY_FS_USE_TRANS:
                /* Default to the fs SID. */
                sid = sbsec->sid;

                /* Try to obtain a transition SID. */
                rc = security_transition_sid(task_sid, sid,
                                             sclass, NULL, &sid);
                if (rc)
                        goto out;
                break;
        case SECURITY_FS_USE_MNTPOINT:
                sid = sbsec->mntpoint_sid;
                break;
        default:
                /* Default to the fs superblock SID. */
                sid = sbsec->sid;

                /*
                 * genfs labeling applies to everything except symlinks,
                 * unless the genfs_seclabel_symlinks policycap is set.
                 */
                if ((sbsec->flags & SE_SBGENFS) &&
                     (!S_ISLNK(inode->i_mode) ||
                      selinux_policycap_genfs_seclabel_symlinks())) {
                        /* We must have a dentry to determine the label on
                         * procfs inodes */
                        if (opt_dentry) {
                                /* Called from d_instantiate or
                                 * d_splice_alias. */
                                dentry = dget(opt_dentry);
                        } else {
                                /* Called from selinux_complete_init, try to
                                 * find a dentry.  Some filesystems really want
                                 * a connected one, so try that first.
                                 */
                                dentry = d_find_alias(inode);
                                if (!dentry)
                                        dentry = d_find_any_alias(inode);
                        }
                        /*
                         * This can be hit on boot when a file is accessed
                         * before the policy is loaded.  When we load policy we
                         * may find inodes that have no dentry on the
                         * sbsec->isec_head list.  No reason to complain as
                         * these will get fixed up the next time we go through
                         * inode_doinit() with a dentry, before these inodes
                         * could be used again by userspace.
                         */
                        if (!dentry)
                                goto out_invalid;
                        rc = selinux_genfs_get_sid(dentry, sclass,
                                                   sbsec->flags, &sid);
                        if (rc) {
                                dput(dentry);
                                goto out;
                        }

                        /*
                         * With SE_SBGENFS_XATTR an xattr is consulted as
                         * well; the genfs SID just computed is passed as
                         * the default for that lookup.
                         */
                        if ((sbsec->flags & SE_SBGENFS_XATTR) &&
                            (inode->i_opflags & IOP_XATTR)) {
                                rc = inode_doinit_use_xattr(inode, dentry,
                                                            sid, &sid);
                                if (rc) {
                                        dput(dentry);
                                        goto out;
                                }
                        }
                        dput(dentry);
                }
                break;
        }

out:
        /*
         * Commit the result only if the state is still LABEL_PENDING,
         * i.e. nothing changed it while the lock was dropped.
         */
        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_PENDING) {
                if (rc) {
                        isec->initialized = LABEL_INVALID;
                        goto out_unlock;
                }
                isec->initialized = LABEL_INITIALIZED;
                isec->sid = sid;
        }

out_unlock:
        spin_unlock(&isec->lock);
        return rc;

out_invalid:
        /* No dentry was available: mark the label invalid but report success. */
        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_PENDING) {
                isec->initialized = LABEL_INVALID;
                isec->sid = sid;
        }
        spin_unlock(&isec->lock);
        return 0;
}

/*
 * Map a Linux signal number to the process-class permission that
 * guards sending it.
 */
static inline u32 signal_to_av(int sig)
{
        switch (sig) {
        case SIGCHLD:
                /* Commonly granted from child to parent. */
                return PROCESS__SIGCHLD;
        case SIGKILL:
                /* Cannot be caught or ignored */
                return PROCESS__SIGKILL;
        case SIGSTOP:
                /* Cannot be caught or ignored */
                return PROCESS__SIGSTOP;
        default:
                /* Every other signal shares one permission. */
                return PROCESS__SIGNAL;
        }
}

#if CAP_LAST_CAP > 63
#error Fix SELinux to handle capabilities > 63.
#endif

/*
 * Check whether the credentials @cred may use capability @cap.
 *
 * The capability is checked against the task's own SID, in the
 * capability class selected by the capability index and by @initns
 * (initial vs. non-initial user namespace classes).  The decision is
 * audited unless @opts contains CAP_OPT_NOAUDIT.
 *
 * Returns the access decision, or the error from avc_audit() if
 * auditing itself fails.
 */
static int cred_has_capability(const struct cred *cred,
                               int cap, unsigned int opts, bool initns)
{
        const u32 sid = cred_sid(cred);
        const u32 av = CAP_TO_MASK(cap);
        const int idx = CAP_TO_INDEX(cap);
        struct common_audit_data ad;
        struct av_decision avd;
        u16 sclass;
        int rc, audit_rc;

        if (idx == 0) {
                if (initns)
                        sclass = SECCLASS_CAPABILITY;
                else
                        sclass = SECCLASS_CAP_USERNS;
        } else if (idx == 1) {
                if (initns)
                        sclass = SECCLASS_CAPABILITY2;
                else
                        sclass = SECCLASS_CAP2_USERNS;
        } else {
                pr_err("SELinux:  out of range capability %d\n", cap);
                BUG();
                return -EINVAL;
        }

        ad.type = LSM_AUDIT_DATA_CAP;
        ad.u.cap = cap;

        rc = avc_has_perm_noaudit(sid, sid, sclass, av, 0, &avd);
        if (opts & CAP_OPT_NOAUDIT)
                return rc;

        /* An auditing failure takes precedence over the decision itself. */
        audit_rc = avc_audit(sid, sid, sclass, av, &avd, rc, &ad);
        if (audit_rc)
                return audit_rc;

        return rc;
}

/*
 * Check whether @cred holds permissions @perms on @inode.
 * @adp is optional audit data supplied by the caller (e.g. the dentry).
 * Inodes for which IS_PRIVATE() is true are always allowed.
 */
static int inode_has_perm(const struct cred *cred,
                          struct inode *inode,
                          u32 perms,
                          struct common_audit_data *adp)
{
        struct inode_security_struct *target;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        target = selinux_inode(inode);

        return avc_has_perm(cred_sid(cred), target->sid, target->sclass,
                            perms, adp);
}

/*
 * inode_has_perm() on the backing inode of @dentry, with audit data
 * that carries the dentry so the audit code can produce a pathname.
 */
static inline int dentry_has_perm(const struct cred *cred,
                                  struct dentry *dentry,
                                  u32 av)
{
        struct inode *backing = d_backing_inode(dentry);
        struct inode_security_struct *label = selinux_inode(backing);
        struct common_audit_data audit;

        /* check below is racy, but revalidate will recheck with lock held */
        if (data_race(unlikely(label->initialized != LABEL_INITIALIZED)))
                __inode_security_revalidate(backing, dentry, true);

        audit.type = LSM_AUDIT_DATA_DENTRY;
        audit.u.dentry = dentry;

        return inode_has_perm(cred, backing, av, &audit);
}

/*
 * inode_has_perm() on the backing inode of @path's dentry, with audit
 * data that carries the whole path so the audit code can produce a
 * pathname.
 */
static inline int path_has_perm(const struct cred *cred,
                                const struct path *path,
                                u32 av)
{
        struct inode *backing = d_backing_inode(path->dentry);
        struct inode_security_struct *label = selinux_inode(backing);
        struct common_audit_data audit;

        /* check below is racy, but revalidate will recheck with lock held */
        if (data_race(unlikely(label->initialized != LABEL_INITIALIZED)))
                __inode_security_revalidate(backing, path->dentry, true);

        audit.type = LSM_AUDIT_DATA_PATH;
        audit.u.path = *path;

        return inode_has_perm(cred, backing, av, &audit);
}

/*
 * inode_has_perm() on the inode behind @file, with audit data that
 * refers to the file itself.
 */
static inline int file_path_has_perm(const struct cred *cred,
                                     struct file *file,
                                     u32 av)
{
        struct common_audit_data audit;

        audit.u.file = file;
        audit.type = LSM_AUDIT_DATA_FILE;

        return inode_has_perm(cred, file_inode(file), av, &audit);
}

#ifdef CONFIG_BPF_SYSCALL
static int bpf_fd_pass(const struct file *file, u32 sid);
#endif

/*
 * Check whether @cred may use the open file @file to access its inode
 * with permissions @av.
 *
 * The descriptor itself is checked first (fd:use), unless it carries the
 * same SID as @cred, in which case use is granted implicitly.  With
 * CONFIG_BPF_SYSCALL the bpf_fd_pass() check follows.  Finally the inode
 * is checked for @av; when @av is zero only the descriptor is checked,
 * e.g. for operations such as seek that do not touch the file.
 */
static int file_has_perm(const struct cred *cred,
                         struct file *file,
                         u32 av)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode *inode = file_inode(file);
        struct common_audit_data ad;
        u32 sid = cred_sid(cred);
        int rc;

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        if (sid != fsec->sid) {
                rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

#ifdef CONFIG_BPF_SYSCALL
        rc = bpf_fd_pass(file, cred_sid(cred));
        if (rc)
                return rc;
#endif

        /* av is zero if only checking access to the descriptor. */
        if (!av)
                return 0;

        return inode_has_perm(cred, inode, av, &ad);
}

/*
 * Determine the label for an inode that might be unioned.
 *
 * The SID for a new object named @name of class @tclass in @dir is
 * chosen, in order of precedence, from:
 *   1. the mountpoint SID, for an initialized superblock that uses
 *      SECURITY_FS_USE_MNTPOINT;
 *   2. the creator's create_sid, when the superblock has SBLABEL_MNT
 *      and a create_sid is set;
 *   3. a transition computed from the creator SID and the directory SID.
 *
 * Returns 0 with *@_new_isid set, or the error from
 * security_transition_sid().
 */
static int
selinux_determine_inode_label(const struct cred_security_struct *crsec,
                                 struct inode *dir,
                                 const struct qstr *name, u16 tclass,
                                 u32 *_new_isid)
{
        const struct superblock_security_struct *sbsec =
                                                selinux_superblock(dir->i_sb);
        const struct inode_security_struct *dir_sec;

        if ((sbsec->flags & SE_SBINITIALIZED) &&
            (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) {
                *_new_isid = sbsec->mntpoint_sid;
                return 0;
        }

        if ((sbsec->flags & SBLABEL_MNT) && crsec->create_sid) {
                *_new_isid = crsec->create_sid;
                return 0;
        }

        dir_sec = inode_security(dir);
        return security_transition_sid(crsec->sid, dir_sec->sid, tclass,
                                       name, _new_isid);
}

/*
 * Check whether the current task may create an object of class @tclass
 * named by @dentry in directory @dir.
 *
 * Three checks are made in sequence: add_name/search on the directory,
 * create on the SID the new object would receive, and associate
 * between that SID and the filesystem.
 */
static int may_create(struct inode *dir,
                      struct dentry *dentry,
                      u16 tclass)
{
        const struct cred_security_struct *crsec = selinux_cred(current_cred());
        struct inode_security_struct *dir_sec = inode_security(dir);
        struct superblock_security_struct *sbsec = selinux_superblock(dir->i_sb);
        struct common_audit_data ad;
        u32 sid = crsec->sid;
        u32 new_sid;
        int err;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        err = avc_has_perm(sid, dir_sec->sid, SECCLASS_DIR,
                           DIR__ADD_NAME | DIR__SEARCH, &ad);
        if (err)
                return err;

        err = selinux_determine_inode_label(crsec, dir, &dentry->d_name,
                                            tclass, &new_sid);
        if (err)
                return err;

        err = avc_has_perm(sid, new_sid, tclass, FILE__CREATE, &ad);
        if (err)
                return err;

        return avc_has_perm(new_sid, sbsec->sid, SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE, &ad);
}

#define MAY_LINK        0
#define MAY_UNLINK        1
#define MAY_RMDIR        2

/*
 * Check whether the current task may link, unlink or rmdir @dentry in
 * directory @dir; @kind is one of MAY_LINK, MAY_UNLINK or MAY_RMDIR.
 *
 * The directory is checked first (search plus add_name for a link,
 * remove_name otherwise), then the object itself.  An unrecognized
 * @kind is logged and allowed once the directory check has passed.
 */
static int may_link(struct inode *dir,
                    struct dentry *dentry,
                    int kind)
{
        u32 sid = current_sid();
        struct inode_security_struct *dir_sec = inode_security(dir);
        struct inode_security_struct *obj_sec = backing_inode_security(dentry);
        struct common_audit_data ad;
        u32 av;
        int err;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        if (kind)
                av = DIR__SEARCH | DIR__REMOVE_NAME;
        else
                av = DIR__SEARCH | DIR__ADD_NAME;

        err = avc_has_perm(sid, dir_sec->sid, SECCLASS_DIR, av, &ad);
        if (err)
                return err;

        if (kind == MAY_LINK) {
                av = FILE__LINK;
        } else if (kind == MAY_UNLINK) {
                av = FILE__UNLINK;
        } else if (kind == MAY_RMDIR) {
                av = DIR__RMDIR;
        } else {
                pr_warn("SELinux: %s:  unrecognized kind %d\n",
                        __func__, kind);
                return 0;
        }

        return avc_has_perm(sid, obj_sec->sid, obj_sec->sclass, av, &ad);
}

/*
 * Check whether the current task may rename @old_dentry in @old_dir to
 * @new_dentry in @new_dir.
 *
 * Source side: remove_name/search on the old directory, rename on the
 * object, and reparent when a directory moves to a different parent.
 * Target side: add_name/search on the new directory (plus remove_name
 * when the target exists), and rmdir/unlink on an existing target.
 */
static inline int may_rename(struct inode *old_dir,
                             struct dentry *old_dentry,
                             struct inode *new_dir,
                             struct dentry *new_dentry)
{
        struct inode_security_struct *src_dir_sec, *dst_dir_sec;
        struct inode_security_struct *src_sec, *dst_sec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        u32 dir_av;
        int src_is_dir;
        int err;

        src_dir_sec = inode_security(old_dir);
        src_sec = backing_inode_security(old_dentry);
        src_is_dir = d_is_dir(old_dentry);
        dst_dir_sec = inode_security(new_dir);

        ad.type = LSM_AUDIT_DATA_DENTRY;

        /* Checks against the source are audited with the old dentry. */
        ad.u.dentry = old_dentry;
        err = avc_has_perm(sid, src_dir_sec->sid, SECCLASS_DIR,
                           DIR__REMOVE_NAME | DIR__SEARCH, &ad);
        if (err)
                return err;

        err = avc_has_perm(sid, src_sec->sid, src_sec->sclass,
                           FILE__RENAME, &ad);
        if (err)
                return err;

        if (src_is_dir && new_dir != old_dir) {
                err = avc_has_perm(sid, src_sec->sid, src_sec->sclass,
                                   DIR__REPARENT, &ad);
                if (err)
                        return err;
        }

        /* Checks against the target are audited with the new dentry. */
        ad.u.dentry = new_dentry;
        dir_av = DIR__ADD_NAME | DIR__SEARCH;
        if (d_is_positive(new_dentry))
                dir_av |= DIR__REMOVE_NAME;
        err = avc_has_perm(sid, dst_dir_sec->sid, SECCLASS_DIR, dir_av, &ad);
        if (err)
                return err;

        /* Nothing is replaced when the target does not exist. */
        if (!d_is_positive(new_dentry))
                return 0;

        dst_sec = backing_inode_security(new_dentry);
        return avc_has_perm(sid, dst_sec->sid, dst_sec->sclass,
                            d_is_dir(new_dentry) ? DIR__RMDIR : FILE__UNLINK,
                            &ad);
}

/*
 * Check whether @cred holds filesystem-class permissions @perms on the
 * superblock @sb.  @ad is passed through as audit data.
 */
static int superblock_has_perm(const struct cred *cred,
                               const struct super_block *sb,
                               u32 perms,
                               struct common_audit_data *ad)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);

        return avc_has_perm(cred_sid(cred), sbsec->sid, SECCLASS_FILESYSTEM,
                            perms, ad);
}

/*
 * Convert a Linux mode and MAY_* permission mask into an access vector.
 * Directories use the DIR__* permissions; everything else uses FILE__*,
 * where MAY_APPEND takes precedence over MAY_WRITE.
 */
static inline u32 file_mask_to_av(int mode, int mask)
{
        u32 av = 0;

        if (S_ISDIR(mode)) {
                if (mask & MAY_EXEC)
                        av |= DIR__SEARCH;
                if (mask & MAY_WRITE)
                        av |= DIR__WRITE;
                if (mask & MAY_READ)
                        av |= DIR__READ;
                return av;
        }

        if (mask & MAY_EXEC)
                av |= FILE__EXECUTE;
        if (mask & MAY_READ)
                av |= FILE__READ;

        /* An append request never also requests plain write. */
        if (mask & MAY_APPEND)
                av |= FILE__APPEND;
        else if (mask & MAY_WRITE)
                av |= FILE__WRITE;

        return av;
}

/*
 * Derive an access vector from the open mode and flags of @file:
 * read for FMODE_READ, and append or write for FMODE_WRITE depending
 * on O_APPEND.
 */
static inline u32 file_to_av(const struct file *file)
{
        u32 av = 0;

        if (file->f_mode & FMODE_READ)
                av |= FILE__READ;

        if (file->f_mode & FMODE_WRITE)
                av |= (file->f_flags & O_APPEND) ? FILE__APPEND : FILE__WRITE;

        /*
         * Neither readable nor writable: special file opened with
         * flags 3 for ioctl-only use.
         */
        return av ? av : FILE__IOCTL;
}

/*
 * Same as file_to_av(), plus FILE__OPEN when the openperm policy
 * capability is enabled and the file's superblock magic is not
 * SOCKFS_MAGIC.
 */
static inline u32 open_file_to_av(struct file *file)
{
        u32 perms = file_to_av(file);
        struct inode *target = file_inode(file);

        if (!selinux_policycap_openperm())
                return perms;
        if (target->i_sb->s_magic == SOCKFS_MAGIC)
                return perms;

        return perms | FILE__OPEN;
}

/* Hook functions begin here. */

/*
 * Binder hook: check binder:set_context_mgr between the current task's
 * SID and the SID of @mgr.  Returns the avc_has_perm() result.
 */
static int selinux_binder_set_context_mgr(const struct cred *mgr)
{
        int rc;

        rc = avc_has_perm(current_sid(), cred_sid(mgr),
                          SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL);
        return rc;
}

/*
 * Binder hook: check binder:call from the SID of @from to the SID of @to.
 * When the current task's SID differs from the SID of @from,
 * binder:impersonate is checked first and its failure is returned.
 */
static int selinux_binder_transaction(const struct cred *from,
                                      const struct cred *to)
{
        u32 caller = current_sid();
        u32 sender = cred_sid(from);
        u32 receiver = cred_sid(to);

        if (caller != sender) {
                int rc = avc_has_perm(caller, sender, SECCLASS_BINDER,
                                      BINDER__IMPERSONATE, NULL);

                if (rc)
                        return rc;
        }

        return avc_has_perm(sender, receiver, SECCLASS_BINDER,
                            BINDER__CALL, NULL);
}

/*
 * Binder hook: check binder:transfer from the SID of @from to the SID
 * of @to.  Returns the avc_has_perm() result.
 */
static int selinux_binder_transfer_binder(const struct cred *from,
                                          const struct cred *to)
{
        int rc;

        rc = avc_has_perm(cred_sid(from), cred_sid(to),
                          SECCLASS_BINDER, BINDER__TRANSFER, NULL);
        return rc;
}

/*
 * Binder hook: check whether the SID of @to may receive @file.
 *
 * In order: fd:use against the file's own SID (skipped when it equals
 * the receiver's SID), bpf_fd_pass() when CONFIG_BPF_SYSCALL is set, and
 * finally the file_to_av() permissions against the backing inode's
 * label.  Private inodes skip that last check.  The first non-zero
 * result is returned.
 */
static int selinux_binder_transfer_file(const struct cred *from,
                                        const struct cred *to,
                                        const struct file *file)
{
        u32 receiver = cred_sid(to);
        struct file_security_struct *file_sec = selinux_file(file);
        struct dentry *target = file->f_path.dentry;
        struct inode_security_struct *inode_sec;
        struct common_audit_data ad;
        int rc;

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = file->f_path;

        if (receiver != file_sec->sid) {
                rc = avc_has_perm(receiver, file_sec->sid,
                                  SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

#ifdef CONFIG_BPF_SYSCALL
        rc = bpf_fd_pass(file, receiver);
        if (rc)
                return rc;
#endif

        if (unlikely(IS_PRIVATE(d_backing_inode(target))))
                return 0;

        inode_sec = backing_inode_security(target);
        return avc_has_perm(receiver, inode_sec->sid, inode_sec->sclass,
                            file_to_av(file), &ad);
}

/*
 * Check whether the current task may access @child via ptrace.
 * PTRACE_MODE_READ requests are checked as file:read against the
 * child's SID; every other mode is checked as process:ptrace.
 */
static int selinux_ptrace_access_check(struct task_struct *child,
                                       unsigned int mode)
{
        u32 tracer = current_sid();
        u32 tracee = task_sid_obj(child);

        if (!(mode & PTRACE_MODE_READ))
                return avc_has_perm(tracer, tracee, SECCLASS_PROCESS,
                                    PROCESS__PTRACE, NULL);

        return avc_has_perm(tracer, tracee, SECCLASS_FILE, FILE__READ,
                            NULL);
}

/*
 * Check process:ptrace with @parent's SID as the source and the current
 * task's SID as the target.  Returns the avc_has_perm() result.
 */
static int selinux_ptrace_traceme(struct task_struct *parent)
{
        int rc;

        rc = avc_has_perm(task_sid_obj(parent), task_sid_obj(current),
                          SECCLASS_PROCESS, PROCESS__PTRACE, NULL);
        return rc;
}

/*
 * Check process:getcap from the current task's SID to @target's SID.
 * The capability set pointers are not used by this check.
 */
static int selinux_capget(const struct task_struct *target, kernel_cap_t *effective,
                          kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        int rc;

        rc = avc_has_perm(current_sid(), task_sid_obj(target),
                          SECCLASS_PROCESS, PROCESS__GETCAP, NULL);
        return rc;
}

/*
 * Check process:setcap from the SID of @old to the SID of @new.
 * The capability set pointers are not used by this check.
 */
static int selinux_capset(struct cred *new, const struct cred *old,
                          const kernel_cap_t *effective,
                          const kernel_cap_t *inheritable,
                          const kernel_cap_t *permitted)
{
        int rc;

        rc = avc_has_perm(cred_sid(old), cred_sid(new),
                          SECCLASS_PROCESS, PROCESS__SETCAP, NULL);
        return rc;
}

/*
 * (This comment used to live with the selinux_task_setuid hook,
 * which was removed).
 *
 * Since setuid only affects the current process, and since the SELinux
 * controls are not based on the Linux identity attributes, SELinux does not
 * need to control this operation.  However, SELinux does control the use of
 * the CAP_SETUID and CAP_SETGID capabilities using the capable hook.
 */

/*
 * Capability hook: forward to cred_has_capability(), telling it whether
 * @ns is the initial user namespace.
 */
static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
                           int cap, unsigned int opts)
{
        const int initns = (ns == &init_user_ns);

        return cred_has_capability(cred, cap, opts, initns);
}

/*
 * Quota hook: commands that change quota state require
 * filesystem:quotamod on @sb, commands that only read it require
 * filesystem:quotaget.  A NULL @sb and unrecognised commands are
 * allowed (return 0).
 */
static int selinux_quotactl(int cmds, int type, int id, const struct super_block *sb)
{
        const struct cred *cred = current_cred();

        if (!sb)
                return 0;

        switch (cmds) {
        case Q_SYNC:
        case Q_QUOTAON:
        case Q_QUOTAOFF:
        case Q_SETINFO:
        case Q_SETQUOTA:
        case Q_XQUOTAOFF:
        case Q_XQUOTAON:
        case Q_XSETQLIM:
                return superblock_has_perm(cred, sb, FILESYSTEM__QUOTAMOD,
                                           NULL);
        case Q_GETFMT:
        case Q_GETINFO:
        case Q_GETQUOTA:
        case Q_XGETQUOTA:
        case Q_XGETQSTAT:
        case Q_XGETQSTATV:
        case Q_XGETNEXTQUOTA:
                return superblock_has_perm(cred, sb, FILESYSTEM__QUOTAGET,
                                           NULL);
        default:
                /* let the kernel handle invalid cmds */
                return 0;
        }
}

/* Check file:quotaon on @dentry for the current task's credentials. */
static int selinux_quota_on(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__QUOTAON);
}

/*
 * Syslog hook: map the syslog action @type onto one of three system
 * class permissions, checked from the current task's SID against
 * SECINITSID_KERNEL:
 *   - reading the log or its size  -> syslog_read
 *   - console on/off/level         -> syslog_console
 *   - everything else              -> syslog_mod
 */
static int selinux_syslog(int type)
{
        switch (type) {
        /* Read last kernel messages / return size of the log buffer */
        case SYSLOG_ACTION_READ_ALL:
        case SYSLOG_ACTION_SIZE_BUFFER:
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ,
                                    NULL);

        /* Disable/enable console logging, or set the console level */
        case SYSLOG_ACTION_CONSOLE_OFF:
        case SYSLOG_ACTION_CONSOLE_ON:
        case SYSLOG_ACTION_CONSOLE_LEVEL:
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE,
                                    NULL);

        /* All other syslog types */
        default:
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD,
                                    NULL);
        }
}

/*
 * Hook run when a new virtual mapping is allocated.  Returns 0 when the
 * current credentials pass the CAP_SYS_ADMIN check, a negative error
 * code otherwise.  @mm and @pages are not consulted.
 *
 * CAP_OPT_NOAUDIT is passed because this check applies to every process
 * that allocates mappings and must not generate audit records.
 */
static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
{
        int rc;

        rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN,
                                 CAP_OPT_NOAUDIT, true);
        return rc;
}

/* binprm security operations */

/*
 * Return the SID of the task returned by ptrace_parent(current), or 0
 * when there is none.  The lookup runs under rcu_read_lock().
 */
static u32 ptrace_parent_sid(void)
{
        struct task_struct *parent;
        u32 parent_sid;

        rcu_read_lock();
        parent = ptrace_parent(current);
        parent_sid = parent ? task_sid_obj(parent) : 0;
        rcu_read_unlock();

        return parent_sid;
}

/*
 * Decide whether a SID change from @old_crsec to @new_crsec is allowed
 * while exec'ing under no_new_privs and/or from a nosuid mount.
 *
 * Returns 0 when neither restriction applies, when the SID does not
 * change, or when the transition is permitted by either mechanism
 * below.  Otherwise returns -EPERM (NNP) or -EACCES (nosuid only).
 */
static int check_nnp_nosuid(const struct linux_binprm *bprm,
                            const struct cred_security_struct *old_crsec,
                            const struct cred_security_struct *new_crsec)
{
        const int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS);
        const int nosuid = !mnt_may_suid(bprm->file->f_path.mnt);

        /* Neither NNP nor nosuid: nothing to enforce. */
        if (!(nnp || nosuid))
                return 0;

        /* No change in credentials. */
        if (old_crsec->sid == new_crsec->sid)
                return 0;

        /*
         * With the nnp_nosuid_transition policy capability enabled, the
         * transition is allowed if policy grants the matching process2
         * permission(s) between the old and new contexts.
         */
        if (selinux_policycap_nnp_nosuid_transition()) {
                u32 av = 0;

                if (nnp)
                        av |= PROCESS2__NNP_TRANSITION;
                if (nosuid)
                        av |= PROCESS2__NOSUID_TRANSITION;

                if (!avc_has_perm(old_crsec->sid, new_crsec->sid,
                                  SECCLASS_PROCESS2, av, NULL))
                        return 0;
        }

        /*
         * Transitions to bounded SIDs are also allowed, i.e. SIDs that
         * are guaranteed to hold only a subset of the current SID's
         * permissions.
         */
        if (!security_bounded_transition(old_crsec->sid, new_crsec->sid))
                return 0;

        /*
         * Keep the distinct errno values on failure:
         * NNP:     operation not permitted for caller.
         * nosuid:  permission denied to file.
         */
        return nnp ? -EPERM : -EACCES;
}

/*
 * Compute the SELinux label that bprm->cred will carry after exec.
 *
 * The new SID starts out as the current task SID.  It is then replaced
 * by the task's exec_sid when one is set, or otherwise by whatever
 * security_transition_sid() returns for the executable's inode label.
 * The outcome is checked against the AVC: file:execute_no_trans when the
 * SID is unchanged; process:transition, file:entrypoint and, depending
 * on bprm->unsafe, process:share / process:ptrace when it changes.
 *
 * Returns 0 on success, otherwise the non-zero result of the failing
 * check (-EPERM for share and ptrace denials).
 */
static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        const struct cred_security_struct *old_crsec;
        struct cred_security_struct *new_crsec;
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        struct inode *inode = file_inode(bprm->file);
        int rc;

        /* SELinux context only depends on initial program or script and not
         * the script interpreter */

        old_crsec = selinux_cred(current_cred());
        new_crsec = selinux_cred(bprm->cred);
        isec = inode_security(inode);

        /* Default to the current task SID. */
        new_crsec->sid = old_crsec->sid;
        new_crsec->osid = old_crsec->sid;

        /* Reset fs, key, and sock SIDs on execve. */
        new_crsec->create_sid = 0;
        new_crsec->keycreate_sid = 0;
        new_crsec->sockcreate_sid = 0;

        /*
         * Before policy is loaded, label any task outside kernel space
         * as SECINITSID_INIT, so that any userspace tasks surviving from
         * early boot end up with a label different from SECINITSID_KERNEL
         * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL).
         */
        if (!selinux_initialized()) {
                new_crsec->sid = SECINITSID_INIT;
                /* also clear the exec_sid just in case */
                new_crsec->exec_sid = 0;
                return 0;
        }

        if (old_crsec->exec_sid) {
                /* An explicitly requested exec SID overrides the default. */
                new_crsec->sid = old_crsec->exec_sid;
                /* Reset exec SID on execve. */
                new_crsec->exec_sid = 0;

                /* Fail on NNP or nosuid if not an allowed transition. */
                rc = check_nnp_nosuid(bprm, old_crsec, new_crsec);
                if (rc)
                        return rc;
        } else {
                /* Check for a default transition on this program. */
                rc = security_transition_sid(old_crsec->sid,
                                             isec->sid, SECCLASS_PROCESS, NULL,
                                             &new_crsec->sid);
                if (rc)
                        return rc;

                /*
                 * Fallback to old SID on NNP or nosuid if not an allowed
                 * transition.  Unlike the exec_sid case above, the exec
                 * itself is not failed here.
                 */
                rc = check_nnp_nosuid(bprm, old_crsec, new_crsec);
                if (rc)
                        new_crsec->sid = old_crsec->sid;
        }

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = bprm->file;

        if (new_crsec->sid == old_crsec->sid) {
                /* No transition: the old SID must be able to run the file as-is. */
                rc = avc_has_perm(old_crsec->sid, isec->sid,
                                  SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad);
                if (rc)
                        return rc;
        } else {
                /* Check permissions for the transition. */
                rc = avc_has_perm(old_crsec->sid, new_crsec->sid,
                                  SECCLASS_PROCESS, PROCESS__TRANSITION, &ad);
                if (rc)
                        return rc;

                /* The file must be an entrypoint for the new SID. */
                rc = avc_has_perm(new_crsec->sid, isec->sid,
                                  SECCLASS_FILE, FILE__ENTRYPOINT, &ad);
                if (rc)
                        return rc;

                /* Check for shared state */
                if (bprm->unsafe & LSM_UNSAFE_SHARE) {
                        rc = avc_has_perm(old_crsec->sid, new_crsec->sid,
                                          SECCLASS_PROCESS, PROCESS__SHARE,
                                          NULL);
                        if (rc)
                                return -EPERM;
                }

                /* Make sure that anyone attempting to ptrace over a task that
                 * changes its SID has the appropriate permit */
                if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
                        u32 ptsid = ptrace_parent_sid();
                        /* ptsid == 0 means no ptrace parent was found. */
                        if (ptsid != 0) {
                                rc = avc_has_perm(ptsid, new_crsec->sid,
                                                  SECCLASS_PROCESS,
                                                  PROCESS__PTRACE, NULL);
                                if (rc)
                                        return -EPERM;
                        }
                }

                /* Clear any possibly unsafe personality bits on exec: */
                bprm->per_clear |= PER_CLEAR_ON_SETID;

                /* Enable secure mode for SIDs transitions unless
                   the noatsecure permission is granted between
                   the two SIDs, i.e. ahp returns 0. */
                rc = avc_has_perm(old_crsec->sid, new_crsec->sid,
                                  SECCLASS_PROCESS, PROCESS__NOATSECURE,
                                  NULL);
                bprm->secureexec |= !!rc;
        }

        return 0;
}

/*
 * iterate_fd() callback: @p is the credential to check against.
 * Returns @fd + 1 when file_has_perm() reports a non-zero result for the
 * permissions file_to_av() derives from @file, and 0 otherwise.
 */
static int match_file(const void *p, struct file *file, unsigned fd)
{
        if (file_has_perm(p, file, file_to_av(file)))
                return fd + 1;

        return 0;
}

/*
 * Derived from fs/exec.c:flush_old_files.
 *
 * Revalidate, against @cred, the controlling tty and every descriptor in
 * @files.  The controlling tty is dropped via no_tty() when the
 * read/write check on it fails.  Each descriptor for which
 * file_has_perm() fails is replaced, via replace_fd(), with a freshly
 * opened selinux_null file.
 */
static inline void flush_unauthorized_files(const struct cred *cred,
                                            struct files_struct *files)
{
        struct file *file, *devnull = NULL;
        struct tty_struct *tty;
        int drop_tty = 0;
        unsigned n;

        tty = get_current_tty();
        if (tty) {
                spin_lock(&tty->files_lock);
                if (!list_empty(&tty->tty_files)) {
                        struct tty_file_private *file_priv;

                        /* Revalidate access to controlling tty.
                           Use file_path_has_perm on the tty path directly
                           rather than using file_has_perm, as this particular
                           open file may belong to another process and we are
                           only interested in the inode-based check here. */
                        file_priv = list_first_entry(&tty->tty_files,
                                                struct tty_file_private, list);
                        file = file_priv->file;
                        if (file_path_has_perm(cred, file, FILE__READ | FILE__WRITE))
                                drop_tty = 1;
                }
                spin_unlock(&tty->files_lock);
                tty_kref_put(tty);
        }
        /* Reset controlling tty (done after files_lock has been released). */
        if (drop_tty)
                no_tty();

        /*
         * Revalidate access to inherited open files.  match_file() returns
         * fd + 1 for the first descriptor that fails, so n - 1 is that fd.
         */
        n = iterate_fd(files, 0, match_file, cred);
        if (!n) /* none found? */
                return;

        devnull = dentry_open(&selinux_null, O_RDWR, cred);
        /*
         * If the open fails, replace_fd() below is called with a NULL file.
         * NOTE(review): presumably that just closes the descriptor - confirm
         * against replace_fd().
         */
        if (IS_ERR(devnull))
                devnull = NULL;
        /* replace all the matching ones with this */
        do {
                replace_fd(n - 1, devnull, 0);
        } while ((n = iterate_fd(files, n, match_file, cred)) != 0);
        if (devnull)
                fput(devnull);
}

/*
 * Prepare the current task for the credential change that exec is about
 * to commit.  Nothing is done unless the SID in bprm->cred differs from
 * its recorded old SID.
 */
static void selinux_bprm_committing_creds(const struct linux_binprm *bprm)
{
        struct cred_security_struct *crsec = selinux_cred(bprm->cred);
        struct rlimit *cur_lim, *init_lim;
        int idx;

        if (crsec->sid == crsec->osid)
                return;

        /* Close files for which the new task SID is not authorized. */
        flush_unauthorized_files(bprm->cred, current->files);

        /* Always clear parent death signal on SID transitions. */
        current->pdeath_signal = 0;

        /*
         * If the new SID may inherit resource limits from the old SID,
         * there is nothing more to do.
         *
         * Otherwise every soft limit is reset to the lower of the current
         * task's hard limit and the init task's soft limit.  Hard limits
         * are left alone: setting them (even lowering them) can be
         * controlled by the setrlimit check.  The init task's soft limit
         * is part of the computation so that soft limits are not raised
         * above the default where the default is below the hard limit,
         * e.g. RLIMIT_CORE or RLIMIT_STACK.
         */
        if (!avc_has_perm(crsec->osid, crsec->sid, SECCLASS_PROCESS,
                          PROCESS__RLIMITINH, NULL))
                return;

        /* protect against do_prlimit() */
        task_lock(current);
        for (idx = 0; idx < RLIM_NLIMITS; idx++) {
                cur_lim = &current->signal->rlim[idx];
                init_lim = &init_task.signal->rlim[idx];
                cur_lim->rlim_cur = min(cur_lim->rlim_max, init_lim->rlim_cur);
        }
        task_unlock(current);

        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                update_rlimit_cpu(current, rlimit(RLIMIT_CPU));
}

/*
 * Clean up the process immediately after the installation of new credentials
 * due to exec.
 *
 * Does nothing when the task's SID equals its recorded old SID.  Otherwise
 * signal state is reset unless process:siginh is granted from the old SID
 * to the new one, and the parent is woken unconditionally.
 */
static void selinux_bprm_committed_creds(const struct linux_binprm *bprm)
{
        const struct cred_security_struct *crsec = selinux_cred(current_cred());
        u32 osid, sid;
        int rc;

        osid = crsec->osid;
        sid = crsec->sid;

        if (sid == osid)
                return;

        /* Check whether the new SID can inherit signal state from the old SID.
         * If not, clear itimers to avoid subsequent signal generation and
         * flush and unblock signals.
         *
         * This must occur _after_ the task SID has been updated so that any
         * kill done after the flush will be checked against the new SID.
         */
        rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL);
        if (rc) {
                clear_itimer();

                spin_lock_irq(&unrcu_pointer(current->sighand)->siglock);
                /* Signal state is left untouched if a fatal signal is pending. */
                if (!fatal_signal_pending(current)) {
                        flush_sigqueue(&current->pending);
                        flush_sigqueue(&current->signal->shared_pending);
                        flush_signal_handlers(current, 1);
                        sigemptyset(&current->blocked);
                        recalc_sigpending();
                }
                spin_unlock_irq(&unrcu_pointer(current->sighand)->siglock);
        }

        /* Wake up the parent if it is waiting so that it can recheck
         * wait permission to the new task SID. */
        read_lock(&tasklist_lock);
        __wake_up_parent(current, unrcu_pointer(current->real_parent));
        read_unlock(&tasklist_lock);
}

/* superblock security operations */

/*
 * Initialise the SELinux security blob of a newly allocated superblock:
 * default SIDs, the blob mutex, and the inode-security list with its
 * spinlock.  Always returns 0.
 */
static int selinux_sb_alloc_security(struct super_block *sb)
{
        struct superblock_security_struct *blob = selinux_superblock(sb);

        /* Default labels. */
        blob->sid = SECINITSID_UNLABELED;
        blob->mntpoint_sid = SECINITSID_UNLABELED;
        blob->def_sid = SECINITSID_FILE;

        /* Locks and the inode-security list head. */
        mutex_init(&blob->lock);
        spin_lock_init(&blob->isec_lock);
        INIT_LIST_HEAD(&blob->isec_head);

        return 0;
}

/*
 * Return the length of the first option in the option string @s: the
 * number of characters before the first comma that is not inside double
 * quotes, or the full string length when there is no such comma.
 */
static inline int opt_len(const char *s)
{
        bool in_quotes = false;
        int n = 0;

        while (s[n] != '\0') {
                if (s[n] == '"')
                        in_quotes = !in_quotes;
                else if (s[n] == ',' && !in_quotes)
                        break;
                n++;
        }

        return n;
}

/*
 * Split the comma-separated mount option string @options into SELinux
 * options and everything else.
 *
 * Options recognised by match_opt_prefix() are handed to
 * selinux_add_opt() (accumulating in *@mnt_opts) and removed from the
 * string; all other options are compacted, in place, to the front of
 * @options, which is re-terminated before returning.
 *
 * Returns 0 on success.  On failure returns -ENOMEM or the error from
 * selinux_add_opt(), after freeing and clearing *@mnt_opts.
 */
static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        char *from = options;        /* start of the option being examined */
        char *to = options;        /* end of the options kept so far */
        bool first = true;        /* no option has been kept yet */
        int rc;

        while (1) {
                int len = opt_len(from);
                int token;
                char *arg = NULL;

                token = match_opt_prefix(from, len, &arg);

                if (token != Opt_error) {
                        char *p, *q;

                        /* strip quotes (in place, up to the end of this option) */
                        if (arg) {
                                for (p = q = arg; p < from + len; p++) {
                                        char c = *p;
                                        if (c != '"')
                                                *q++ = c;
                                }
                                /* work on a NUL-terminated copy of the value */
                                arg = kmemdup_nul(arg, q - arg, GFP_KERNEL);
                                if (!arg) {
                                        rc = -ENOMEM;
                                        goto free_opt;
                                }
                        }
                        rc = selinux_add_opt(token, arg, mnt_opts);
                        /* the copy is freed here whether or not the call failed */
                        kfree(arg);
                        arg = NULL;
                        if (unlikely(rc)) {
                                goto free_opt;
                        }
                } else {
                        /* not an SELinux option: keep it in the string */
                        if (!first) {        /* copy with preceding comma */
                                from--;
                                len++;
                        }
                        if (to != from)
                                memmove(to, from, len);
                        to += len;
                        first = false;
                }
                /* stop at the terminating NUL, else step over the comma */
                if (!from[len])
                        break;
                from += len + 1;
        }
        *to = '\0';
        return 0;

free_opt:
        if (*mnt_opts) {
                selinux_free_mnt_opts(*mnt_opts);
                *mnt_opts = NULL;
        }
        return rc;
}

/*
 * Report whether the SELinux mount options in @mnt_opts are compatible
 * with the labeling already set up on @sb.  Returns 0 when compatible
 * and 1 when not.
 */
static int selinux_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts)
{
        struct selinux_mnt_opts *wanted = mnt_opts;
        struct superblock_security_struct *sbsec = selinux_superblock(sb);

        /*
         * Superblock not initialized (i.e. no options): compatible only
         * if no options were specified either.
         */
        if (!(sbsec->flags & SE_SBINITIALIZED))
                return !!wanted;

        /*
         * Superblock initialized but no options specified: compatible
         * only if the superblock has no options set.
         */
        if (!wanted)
                return !!(sbsec->flags & SE_MNTMASK);

        /* Each option that was specified must match the superblock. */
        if (wanted->fscontext_sid &&
            bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid,
                       wanted->fscontext_sid))
                return 1;

        if (wanted->context_sid &&
            bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                       wanted->context_sid))
                return 1;

        if (wanted->rootcontext_sid) {
                struct inode_security_struct *root_isec =
                        backing_inode_security(sb->s_root);

                if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                               wanted->rootcontext_sid))
                        return 1;
        }

        if (wanted->defcontext_sid &&
            bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                       wanted->defcontext_sid))
                return 1;

        return 0;
}

/*
 * Remount hook: refuse a remount whose SELinux options differ from the
 * ones already in effect on @sb.  Returns 0 when the superblock is not
 * initialized, when no options were given, or when every given option
 * passes bad_option(); otherwise logs a warning and returns -EINVAL.
 */
static int selinux_sb_remount(struct super_block *sb, void *mnt_opts)
{
        struct selinux_mnt_opts *wanted = mnt_opts;
        struct superblock_security_struct *sbsec = selinux_superblock(sb);

        if (!(sbsec->flags & SE_SBINITIALIZED) || !wanted)
                return 0;

        if (wanted->fscontext_sid &&
            bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid,
                       wanted->fscontext_sid))
                goto reject;

        if (wanted->context_sid &&
            bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                       wanted->context_sid))
                goto reject;

        if (wanted->rootcontext_sid) {
                struct inode_security_struct *root_isec =
                        backing_inode_security(sb->s_root);

                if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                               wanted->rootcontext_sid))
                        goto reject;
        }

        if (wanted->defcontext_sid &&
            bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                       wanted->defcontext_sid))
                goto reject;

        return 0;

reject:
        pr_warn("SELinux: unable to change security options "
               "during remount (dev %s, type=%s)\n", sb->s_id,
               sb->s_type->name);
        return -EINVAL;
}

/*
 * Check filesystem:mount on @sb for the current credentials, auditing
 * against the superblock's root dentry.
 */
static int selinux_sb_kern_mount(const struct super_block *sb)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = sb->s_root;

        return superblock_has_perm(current_cred(), sb, FILESYSTEM__MOUNT,
                                   &ad);
}

/*
 * Check filesystem:getattr on the superblock that owns @dentry for the
 * current credentials, auditing against that superblock's root dentry.
 */
static int selinux_sb_statfs(struct dentry *dentry)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry->d_sb->s_root;

        return superblock_has_perm(current_cred(), dentry->d_sb,
                                   FILESYSTEM__GETATTR, &ad);
}

/*
 * Mount hook.  A remount (MS_REMOUNT in @flags) is checked as
 * filesystem:remount on the superblock under @path; any other mount is
 * checked as file:mounton on @path itself.
 */
static int selinux_mount(const char *dev_name,
                         const struct path *path,
                         const char *type,
                         unsigned long flags,
                         void *data)
{
        const struct cred *cred = current_cred();

        if (!(flags & MS_REMOUNT))
                return path_has_perm(cred, path, FILE__MOUNTON);

        return superblock_has_perm(cred, path->dentry->d_sb,
                                   FILESYSTEM__REMOUNT, NULL);
}

/*
 * Move-mount hook: check file:mounton on the destination @to_path for
 * the current credentials.  @from_path is not consulted.
 */
static int selinux_move_mount(const struct path *from_path,
                              const struct path *to_path)
{
        return path_has_perm(current_cred(), to_path, FILE__MOUNTON);
}

/*
 * Unmount hook: check filesystem:unmount on the superblock of @mnt for
 * the current credentials.  @flags is not consulted.
 */
static int selinux_umount(struct vfsmount *mnt, int flags)
{
        return superblock_has_perm(current_cred(), mnt->mnt_sb,
                                   FILESYSTEM__UNMOUNT, NULL);
}

/*
 * Seed the mount options of a submount's fs_context from the
 * fscontext/context/defcontext settings of the @reference superblock.
 * Returns 0 on success or -ENOMEM if the options cannot be allocated.
 */
static int selinux_fs_context_submount(struct fs_context *fc,
                                   struct super_block *reference)
{
        const struct superblock_security_struct *parent = selinux_superblock(reference);
        struct selinux_mnt_opts *inherited;

        /*
         * Ensure that fc->security remains NULL when no options are set
         * as expected by selinux_set_mnt_opts().
         */
        if (!(parent->flags & (FSCONTEXT_MNT|CONTEXT_MNT|DEFCONTEXT_MNT)))
                return 0;

        inherited = kzalloc(sizeof(*inherited), GFP_KERNEL);
        if (!inherited)
                return -ENOMEM;

        /* Only options flagged on the reference superblock are copied. */
        if (parent->flags & DEFCONTEXT_MNT)
                inherited->defcontext_sid = parent->def_sid;
        if (parent->flags & CONTEXT_MNT)
                inherited->context_sid = parent->mntpoint_sid;
        if (parent->flags & FSCONTEXT_MNT)
                inherited->fscontext_sid = parent->sid;

        fc->security = inherited;
        return 0;
}

/*
 * Duplicate the SELinux mount options of @src_fc into @fc.  Returns 0
 * when there is nothing to copy or the copy succeeds, -ENOMEM when the
 * allocation fails (fc->security is then NULL).
 */
static int selinux_fs_context_dup(struct fs_context *fc,
                                  struct fs_context *src_fc)
{
        const struct selinux_mnt_opts *src = src_fc->security;

        if (!src)
                return 0;

        fc->security = kmemdup(src, sizeof(*src), GFP_KERNEL);
        if (!fc->security)
                return -ENOMEM;

        return 0;
}

/*
 * Mount parameters handled by SELinux, as passed to fs_parse() by
 * selinux_fs_context_parse_param().  The four context parameters are
 * declared with fsparam_string and seclabel with fsparam_flag; the
 * empty entry terminates the table.
 */
static const struct fs_parameter_spec selinux_fs_parameters[] = {
        fsparam_string(CONTEXT_STR,        Opt_context),
        fsparam_string(DEFCONTEXT_STR,        Opt_defcontext),
        fsparam_string(FSCONTEXT_STR,        Opt_fscontext),
        fsparam_string(ROOTCONTEXT_STR,        Opt_rootcontext),
        fsparam_flag  (SECLABEL_STR,        Opt_seclabel),
        {}
};

/*
 * Parse one mount parameter.  fs_parse() matches @param against
 * selinux_fs_parameters; a negative result is returned unchanged,
 * otherwise the option and its string value are recorded in
 * fc->security through selinux_add_opt().
 */
static int selinux_fs_context_parse_param(struct fs_context *fc,
                                          struct fs_parameter *param)
{
        struct fs_parse_result result;
        int token = fs_parse(fc, selinux_fs_parameters, param, &result);

        if (token >= 0)
                return selinux_add_opt(token, param->string, &fc->security);

        return token;
}

/* inode security operations */

/*
 * Initialise the SELinux security blob of a newly allocated inode.  The
 * inode starts out unlabeled with class "file" and is marked
 * LABEL_INVALID; the current task's SID is recorded as task_sid.
 * Always returns 0.
 */
static int selinux_inode_alloc_security(struct inode *inode)
{
        struct inode_security_struct *blob = selinux_inode(inode);
        u32 creator = current_sid();

        spin_lock_init(&blob->lock);
        INIT_LIST_HEAD(&blob->list);

        blob->inode = inode;
        blob->task_sid = creator;
        blob->sclass = SECCLASS_FILE;
        blob->sid = SECINITSID_UNLABELED;
        blob->initialized = LABEL_INVALID;

        return 0;
}

/* Hook wrapper: all of the work is delegated to inode_free_security(). */
static void selinux_inode_free_security(struct inode *inode)
{
        inode_free_security(inode);
}

/*
 * Work out the label a new object named @name of type @mode would get
 * under @dentry's parent for the current credentials, and return it as
 * a security context in @cp.  When @xattr_name is non-NULL it is set to
 * XATTR_NAME_SELINUX.  Returns 0 or the error from the label
 * computation or the SID-to-context conversion.
 */
static int selinux_dentry_init_security(struct dentry *dentry, int mode,
                                        const struct qstr *name,
                                        const char **xattr_name,
                                        struct lsm_context *cp)
{
        u32 label_sid;
        int err;

        err = selinux_determine_inode_label(selinux_cred(current_cred()),
                                            d_inode(dentry->d_parent), name,
                                            inode_mode_to_security_class(mode),
                                            &label_sid);
        if (err)
                return err;

        if (xattr_name)
                *xattr_name = XATTR_NAME_SELINUX;

        cp->id = LSM_ID_SELINUX;

        return security_sid_to_context(label_sid, &cp->context, &cp->len);
}

/*
 * Compute, using the @old credentials, the label a new object named
 * @name of type @mode would get under @dentry's parent, and store it as
 * the create_sid of @new.  Returns 0 or the error from the label
 * computation (in which case @new is left untouched).
 */
static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
                                          const struct qstr *name,
                                          const struct cred *old,
                                          struct cred *new)
{
        struct cred_security_struct *new_crsec;
        u32 label_sid;
        int err;

        err = selinux_determine_inode_label(selinux_cred(old),
                                            d_inode(dentry->d_parent), name,
                                            inode_mode_to_security_class(mode),
                                            &label_sid);
        if (err)
                return err;

        new_crsec = selinux_cred(new);
        new_crsec->create_sid = label_sid;

        return 0;
}

/*
 * Label a freshly created @inode and, when the filesystem supports it,
 * provide the matching SELinux xattr for the caller to store.
 *
 * The xattr slot is reserved up front via lsm_get_xattr_slot().  The new SID
 * is computed from the current credentials, the parent directory @dir, the
 * name @qstr and the class implied by @inode's mode.  The in-core label is
 * only set once the superblock has been marked SE_SBINITIALIZED.
 *
 * Returns 0 on success, -EOPNOTSUPP when SELinux is not initialized or the
 * superblock lacks SBLABEL_MNT, or another negative error from the label or
 * context computation.
 */
static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
                                       const struct qstr *qstr,
                                       struct xattr *xattrs, int *xattr_count)
{
        const struct cred_security_struct *crsec = selinux_cred(current_cred());
        struct xattr *slot = lsm_get_xattr_slot(xattrs, xattr_count);
        struct superblock_security_struct *sbsec = selinux_superblock(dir->i_sb);
        u16 sclass = inode_mode_to_security_class(inode->i_mode);
        u32 sid = crsec->create_sid;
        u32 ctx_len;
        char *ctx;
        int err;

        err = selinux_determine_inode_label(crsec, dir, qstr, sclass, &sid);
        if (err)
                return err;

        /* Possibly defer initialization to selinux_complete_init. */
        if (sbsec->flags & SE_SBINITIALIZED) {
                struct inode_security_struct *isec = selinux_inode(inode);

                isec->sclass = sclass;
                isec->sid = sid;
                isec->initialized = LABEL_INITIALIZED;
        }

        if (!selinux_initialized() || !(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        /* No slot available: nothing to report back. */
        if (!slot)
                return 0;

        err = security_sid_to_context_force(sid, &ctx, &ctx_len);
        if (err)
                return err;

        slot->value = ctx;
        slot->value_len = ctx_len;
        slot->name = XATTR_SELINUX_SUFFIX;

        return 0;
}

/*
 * Label an anonymous inode and check that the current task may create it.
 *
 * With a @context_inode the new inode copies that inode's class and SID
 * (which must already be initialized, otherwise -EACCES).  Without one the
 * inode gets class SECCLASS_ANON_INODE and a SID computed by a transition
 * from the current task's SID using @name.
 *
 * Returns 0 when SELinux is not yet initialized or the creation is allowed,
 * negative values otherwise.
 */
static int selinux_inode_init_security_anon(struct inode *inode,
                                            const struct qstr *name,
                                            const struct inode *context_inode)
{
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        if (unlikely(!selinux_initialized()))
                return 0;

        /*
         * This runs once per ephemeral inode: inode_alloc_security has set
         * it up, but nothing else has touched it yet.
         */
        isec = selinux_inode(inode);

        if (!context_inode) {
                isec->sclass = SECCLASS_ANON_INODE;
                rc = security_transition_sid(sid, sid, isec->sclass, name,
                                             &isec->sid);
                if (rc)
                        return rc;
        } else {
                struct inode_security_struct *src_isec =
                        selinux_inode(context_inode);

                if (src_isec->initialized != LABEL_INITIALIZED) {
                        pr_err("SELinux:  context_inode is not initialized\n");
                        return -EACCES;
                }

                isec->sclass = src_isec->sclass;
                isec->sid = src_isec->sid;
        }

        isec->initialized = LABEL_INITIALIZED;

        /*
         * The label is in place; now decide whether creating this kind of
         * anonymous inode is permitted at all.
         */
        ad.type = LSM_AUDIT_DATA_ANONINODE;
        if (name)
                ad.u.anonclass = (const char *)name->name;
        else
                ad.u.anonclass = "?";

        return avc_has_perm(sid, isec->sid, isec->sclass, FILE__CREATE, &ad);
}

/*
 * Check whether a regular file may be created as @dentry in @dir.
 * Delegates to may_create() with class SECCLASS_FILE; @mode is not consulted.
 */
static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        return may_create(dir, dentry, SECCLASS_FILE);
}

/*
 * Check whether a hard link to @old_dentry may be created in @dir.
 * Delegates to may_link() with MAY_LINK; @new_dentry is not consulted.
 */
static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
{
        return may_link(dir, old_dentry, MAY_LINK);
}

/*
 * Check whether @dentry may be unlinked from @dir.
 * Delegates to may_link() with MAY_UNLINK.
 */
static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        return may_link(dir, dentry, MAY_UNLINK);
}

/*
 * Check whether a symbolic link may be created as @dentry in @dir.
 * Delegates to may_create() with class SECCLASS_LNK_FILE; the link target
 * @name is not consulted.
 */
static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const char *name)
{
        return may_create(dir, dentry, SECCLASS_LNK_FILE);
}

/*
 * Check whether a directory may be created as @dentry in @dir.
 * Delegates to may_create() with class SECCLASS_DIR; @mask is not consulted.
 */
static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask)
{
        return may_create(dir, dentry, SECCLASS_DIR);
}

/*
 * Check whether the directory @dentry may be removed from @dir.
 * Delegates to may_link() with MAY_RMDIR.
 */
static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        return may_link(dir, dentry, MAY_RMDIR);
}

/*
 * Check whether a node may be created as @dentry in @dir.
 * Delegates to may_create() with the security class derived from @mode;
 * @dev is not consulted.
 */
static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        return may_create(dir, dentry, inode_mode_to_security_class(mode));
}

/*
 * Check whether @old_dentry in directory @old_inode may be renamed to
 * @new_dentry in directory @new_inode.  Delegates entirely to may_rename().
 */
static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry,
                                struct inode *new_inode, struct dentry *new_dentry)
{
        return may_rename(old_inode, old_dentry, new_inode, new_dentry);
}

/*
 * Check whether the current task may read the symbolic link @dentry.
 * This is a FILE__READ check against the dentry using the current credentials.
 */
static int selinux_inode_readlink(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__READ);
}

/*
 * Check whether the current task may follow the symbolic link @inode.
 *
 * The inode's security state is fetched with inode_security_rcu(), passing
 * @rcu through; an error pointer from that lookup is returned as an errno.
 * Otherwise FILE__READ is checked against the inode's label, with @dentry
 * recorded as the audit data.
 */
static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
                                     bool rcu)
{
        u32 sid = current_sid();
        struct inode_security_struct *isec;
        struct common_audit_data ad;

        isec = inode_security_rcu(inode, rcu);
        if (IS_ERR(isec))
                return PTR_ERR(isec);

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        return avc_has_perm(sid, isec->sid, isec->sclass, FILE__READ, &ad);
}

/*
 * Emit the audit record for an inode permission decision.
 *
 * Builds inode-typed audit data for @inode and forwards the decision details
 * (@perms, @audited, @denied, @result) to slow_avc_audit() using the current
 * task's SID as the source and the inode's label as the target.
 * Returns whatever slow_avc_audit() returns.
 */
static noinline int audit_inode_permission(struct inode *inode,
                                           u32 perms, u32 audited, u32 denied,
                                           int result)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        struct common_audit_data ad;
        u32 ssid;

        ad.type = LSM_AUDIT_DATA_INODE;
        ad.u.inode = inode;

        ssid = current_sid();

        return slow_avc_audit(ssid, isec->sid, isec->sclass, perms,
                              audited, denied, result, &ad);
}

/**
 * task_avdcache_reset - Reset the task's AVD cache
 * @tsec: the task's security state
 *
 * Clear the task's AVD cache in @tsec and reset it to the current policy's
 * and task's info: every directory entry is zeroed, and the cache is
 * re-stamped with the current task SID and the current AVC policy sequence
 * number.
 */
static inline void task_avdcache_reset(struct task_security_struct *tsec)
{
        memset(&tsec->avdcache.dir, 0, sizeof(tsec->avdcache.dir));
        tsec->avdcache.sid = current_sid();
        tsec->avdcache.seqno = avc_policy_seqno();
        /*
         * Park the cursor on the last slot: task_avdcache_update() advances
         * it by one (with wrap-around) before writing, so the next insert
         * lands in slot 0.
         */
        tsec->avdcache.dir_spot = TSEC_AVDC_DIR_SIZE - 1;
}

/**
 * task_avdcache_search - Search the task's AVD cache
 * @tsec: the task's security state
 * @isec: the inode to search for in the cache
 * @avdc: matching avd cache entry returned to the caller
 *
 * Search @tsec for a AVD cache entry that matches @isec and return it to the
 * caller via @avdc.  Returns 0 if a match is found, negative values otherwise.
 */
static inline int task_avdcache_search(struct task_security_struct *tsec,
                                       struct inode_security_struct *isec,
                                       struct avdc_entry **avdc)
{
        int orig, iter;

        /* focused on path walk optimization, only cache directories */
        if (isec->sclass != SECCLASS_DIR)
                return -ENOENT;

        if (unlikely(current_sid() != tsec->avdcache.sid ||
                     tsec->avdcache.seqno != avc_policy_seqno())) {
                task_avdcache_reset(tsec);
                return -ENOENT;
        }

        orig = iter = tsec->avdcache.dir_spot;
        do {
                if (tsec->avdcache.dir[iter].isid == isec->sid) {
                        /* cache hit */
                        tsec->avdcache.dir_spot = iter;
                        *avdc = &tsec->avdcache.dir[iter];
                        return 0;
                }
                iter = (iter - 1) & (TSEC_AVDC_DIR_SIZE - 1);
        } while (iter != orig);

        return -ENOENT;
}

/**
 * task_avdcache_update - Record a decision in the task's AVD cache
 * @tsec: the task's security state
 * @isec: the inode security state the decision applies to
 * @avd: the access vector decision to cache
 * @audited: the permission audit bitmask to cache
 *
 * Store the @avd and @audited info for @isec in the slot following the
 * current cursor (wrapping around) and move the cursor there.  Only
 * directories are cached; other classes are ignored.  The cache-wide
 * permissive_neveraudit flag is refreshed from @avd->flags on every update.
 */
static inline void task_avdcache_update(struct task_security_struct *tsec,
                                        struct inode_security_struct *isec,
                                        struct av_decision *avd,
                                        u32 audited)
{
        struct avdc_entry *entry;
        int next;

        /* focused on path walk optimization, only cache directories */
        if (isec->sclass != SECCLASS_DIR)
                return;

        /* advance the cursor, then fill the slot it now points at */
        next = (tsec->avdcache.dir_spot + 1) & (TSEC_AVDC_DIR_SIZE - 1);
        tsec->avdcache.dir_spot = next;

        entry = &tsec->avdcache.dir[next];
        entry->isid = isec->sid;
        entry->audited = audited;
        entry->allowed = avd->allowed;
        entry->permissive = avd->flags & AVD_FLAGS_PERMISSIVE;

        tsec->avdcache.permissive_neveraudit =
                (avd->flags == (AVD_FLAGS_PERMISSIVE|AVD_FLAGS_NEVERAUDIT));
}

/**
 * selinux_inode_permission - Check if the current task can access an inode
 * @inode: the inode that is being accessed
 * @requested: the accesses being requested
 *
 * Only the MAY_READ, MAY_WRITE, MAY_EXEC and MAY_APPEND bits of @requested
 * are checked; if none is set the call is treated as an existence test and
 * allowed.  The per-task AVD cache is consulted first and, on a miss, the
 * AVC is queried and the result is cached.  Auditing, when required, is done
 * through audit_inode_permission().
 *
 * Returns 0 if allowed, negative values otherwise.
 */
static int selinux_inode_permission(struct inode *inode, int requested)
{
        struct task_security_struct *tsec;
        struct inode_security_struct *isec;
        struct avdc_entry *cached;
        u32 sid = current_sid();
        u32 perms, audited, denied;
        int mask, rc, audit_rc;

        mask = requested & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);

        /* No permission to check.  Existence test. */
        if (mask == 0)
                return 0;

        tsec = selinux_task(current);
        if (task_avdcache_permnoaudit(tsec, sid))
                return 0;

        isec = inode_security_rcu(inode, requested & MAY_NOT_BLOCK);
        if (IS_ERR(isec))
                return PTR_ERR(isec);
        perms = file_mask_to_av(inode->i_mode, mask);

        rc = task_avdcache_search(tsec, isec, &cached);
        if (unlikely(rc)) {
                struct av_decision avd;

                /* Cache miss: ask the AVC and remember the answer. */
                rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass,
                                          perms, 0, &avd);
                audited = avc_audit_required(perms, &avd, rc,
                        (requested & MAY_ACCESS) ? FILE__AUDIT_ACCESS : 0,
                        &denied);
                task_avdcache_update(tsec, isec, &avd, audited);
        } else {
                /* Cache hit: derive the decision from the cached entry. */
                audited = perms & cached->audited;
                denied = perms & ~cached->allowed;
                if (unlikely(denied && enforcing_enabled() &&
                             !cached->permissive))
                        rc = -EACCES;
        }

        if (likely(!audited))
                return rc;

        /* An audit failure takes precedence over the access decision. */
        audit_rc = audit_inode_permission(inode, perms, audited, denied, rc);

        return audit_rc ? audit_rc : rc;
}

static int selinux_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 struct iattr *iattr)
{
        const struct cred *cred = current_cred();
        struct inode *inode = d_backing_inode(dentry);
        unsigned int ia_valid = iattr->ia_valid;
        u32 av = FILE__WRITE;

        /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */
        if (ia_valid & ATTR_FORCE) {
                ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE |
                              ATTR_FORCE);
                if (!ia_valid)
                        return 0;
        }

        if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID |
                        ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET))
                return dentry_has_perm(cred, dentry, FILE__SETATTR);

        if (selinux_policycap_openperm() &&
            inode->i_sb->s_magic != SOCKFS_MAGIC &&
            (ia_valid & ATTR_SIZE) &&
            !(ia_valid & ATTR_FILE))
                av |= FILE__OPEN;

        return dentry_has_perm(cred, dentry, av);
}

/*
 * Check whether the current task may read the attributes of @path.
 *
 * Allowed outright when task_avdcache_permnoaudit() reports true for the
 * current task; otherwise a FILE__GETATTR check is made against the path.
 */
static int selinux_inode_getattr(const struct path *path)
{
        if (task_avdcache_permnoaudit(selinux_task(current), current_sid()))
                return 0;

        return path_has_perm(current_cred(), path, FILE__GETATTR);
}

/*
 * Report whether the current task holds CAP_MAC_ADMIN.
 *
 * Both the capability check in the initial user namespace and the SELinux
 * capability check must pass.  @audit selects between the auditing
 * (CAP_OPT_NONE) and non-auditing (CAP_OPT_NOAUDIT) form of both checks.
 */
static bool has_cap_mac_admin(bool audit)
{
        const struct cred *cred = current_cred();
        unsigned int opts;

        if (audit)
                opts = CAP_OPT_NONE;
        else
                opts = CAP_OPT_NOAUDIT;

        /* Both checks return non-zero on denial; the second runs only if the first passes. */
        return !cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts) &&
               !cred_has_capability(cred, CAP_MAC_ADMIN, opts, true);
}

/**
 * selinux_inode_xattr_skipcap - Skip the xattr capability checks?
 * @name: name of the xattr
 *
 * Returns 1 when @name is the SELinux xattr, meaning SELinux "owns" the
 * access control for it and the LSM layer should not apply the traditional
 * capability based checks.  Returns 0 for every other xattr, leaving further
 * access controls, including capability based ones, to the LSM layer.
 */
static int selinux_inode_xattr_skipcap(const char *name)
{
        /* only the SELinux xattr is exempt from the capability check */
        return strcmp(name, XATTR_NAME_SELINUX) == 0;
}

/*
 * Check whether the current task may set the xattr @name on @dentry.
 *
 * For any xattr other than the SELinux one this is a plain FILE__SETATTR
 * check.  For the SELinux xattr (a relabel request):
 *   - before SELinux is initialized only ownership/capability is checked;
 *   - the superblock must have SBLABEL_MNT set, otherwise -EOPNOTSUPP;
 *   - the caller must own the inode or be capable, otherwise -EPERM;
 *   - FILE__RELABELFROM is checked against the current label;
 *   - @value is mapped to a SID; a context that is invalid under the current
 *     policy is only accepted (via the _force variant) from a task with
 *     CAP_MAC_ADMIN, otherwise it is audited and rejected;
 *   - FILE__RELABELTO, the type transition validation and the
 *     FILESYSTEM__ASSOCIATE check against the superblock must all pass.
 *
 * Returns 0 if allowed, negative values otherwise.
 */
static int selinux_inode_setxattr(struct mnt_idmap *idmap,
                                  struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec;
        struct superblock_security_struct *sbsec;
        struct common_audit_data ad;
        u32 newsid, sid = current_sid();
        int rc = 0;

        /* if not a selinux xattr, only check the ordinary setattr perm */
        if (strcmp(name, XATTR_NAME_SELINUX))
                return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);

        if (!selinux_initialized())
                return (inode_owner_or_capable(idmap, inode) ? 0 : -EPERM);

        sbsec = selinux_superblock(inode->i_sb);
        if (!(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        if (!inode_owner_or_capable(idmap, inode))
                return -EPERM;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        isec = backing_inode_security(dentry);
        rc = avc_has_perm(sid, isec->sid, isec->sclass,
                          FILE__RELABELFROM, &ad);
        if (rc)
                return rc;

        rc = security_context_to_sid(value, size, &newsid,
                                     GFP_KERNEL);
        if (rc == -EINVAL) {
                if (!has_cap_mac_admin(true)) {
                        struct audit_buffer *ab;
                        size_t audit_size;

                        /*
                         * We strip a nul only if it is at the end, otherwise
                         * the context contains a nul and we should audit
                         * that.  A zero-length value has no last byte to
                         * inspect: looking at str[size - 1] would read
                         * before the buffer and could make audit_size wrap,
                         * so treat it like a missing value.
                         */
                        if (value && size) {
                                const char *str = value;

                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                        audit_size = size;
                        } else {
                                audit_size = 0;
                        }
                        ab = audit_log_start(audit_context(),
                                             GFP_ATOMIC, AUDIT_SELINUX_ERR);
                        if (!ab)
                                return rc;
                        audit_log_format(ab, "op=setxattr invalid_context=");
                        audit_log_n_untrustedstring(ab, value, audit_size);
                        audit_log_end(ab);

                        return rc;
                }
                /* CAP_MAC_ADMIN may install contexts unknown to the policy. */
                rc = security_context_to_sid_force(value,
                                                   size, &newsid);
        }
        if (rc)
                return rc;

        rc = avc_has_perm(sid, newsid, isec->sclass,
                          FILE__RELABELTO, &ad);
        if (rc)
                return rc;

        rc = security_validate_transition(isec->sid, newsid,
                                          sid, isec->sclass);
        if (rc)
                return rc;

        return avc_has_perm(newsid,
                            sbsec->sid,
                            SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE,
                            &ad);
}

/*
 * Check whether the current task may set a POSIX ACL on @dentry.
 * This is a FILE__SETATTR check; @idmap, @acl_name and @kacl are not
 * consulted.
 */
static int selinux_inode_set_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name,
                                 struct posix_acl *kacl)
{
        return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
}

/*
 * Check whether the current task may read a POSIX ACL of @dentry.
 * This is a FILE__GETATTR check; @idmap and @acl_name are not consulted.
 */
static int selinux_inode_get_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/*
 * Check whether the current task may remove a POSIX ACL from @dentry.
 * This is a FILE__SETATTR check; @idmap and @acl_name are not consulted.
 */
static int selinux_inode_remove_acl(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *acl_name)
{
        return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
}

/*
 * Update the in-core label of @dentry's backing inode after the SELinux
 * xattr has been written.
 *
 * Xattrs other than the SELinux one are ignored, as is everything before
 * SELinux has been initialized.  Otherwise @value is mapped to a SID
 * (accepting contexts not defined by the current policy) and the inode's
 * class, SID and initialized state are updated under the isec lock.  A
 * mapping failure is logged and leaves the in-core label unchanged.
 */
static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name,
                                        const void *value, size_t size,
                                        int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec;
        u32 newsid;
        int rc;

        if (strcmp(name, XATTR_NAME_SELINUX)) {
                /* Not an attribute we recognize, so nothing to do. */
                return;
        }

        if (!selinux_initialized()) {
                /* If we haven't even been initialized, then we can't validate
                 * against a policy, so leave the label as invalid. It may
                 * resolve to a valid label on the next revalidation try if
                 * we've since initialized.
                 */
                return;
        }

        rc = security_context_to_sid_force(value, size,
                                           &newsid);
        if (rc) {
                /* Keep a space at the literal boundary so "SID for" is not fused. */
                pr_err("SELinux:  unable to map context to SID "
                       "for (%s, %lu), rc=%d\n",
                       inode->i_sb->s_id, inode->i_ino, -rc);
                return;
        }

        isec = backing_inode_security(dentry);
        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = newsid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);
}

/*
 * Check whether the current task may read an xattr of @dentry.
 * This is a FILE__GETATTR check regardless of @name.
 */
static int selinux_inode_getxattr(struct dentry *dentry, const char *name)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/*
 * Check whether the current task may list the xattrs of @dentry.
 * This is a FILE__GETATTR check using the current credentials.
 */
static int selinux_inode_listxattr(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/*
 * Check whether the current task may remove the xattr @name from @dentry.
 *
 * Ordinary xattrs need FILE__SETATTR.  The SELinux xattr may be removed
 * only while SELinux is not yet initialized; afterwards removal is always
 * refused with -EACCES.
 */
static int selinux_inode_removexattr(struct mnt_idmap *idmap,
                                     struct dentry *dentry, const char *name)
{
        /* if not a selinux xattr, only check the ordinary setattr perm */
        if (strcmp(name, XATTR_NAME_SELINUX) != 0)
                return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);

        /*
         * Once initialized, no one is allowed to remove a SELinux security
         * label.  You can change the label, but all data must be labeled.
         */
        return selinux_initialized() ? -EACCES : 0;
}

/*
 * Check whether the current task may change the file attributes of @dentry.
 * This is a FILE__SETATTR check; the requested attributes in @fa are not
 * consulted.
 */
static int selinux_inode_file_setattr(struct dentry *dentry,
                                      struct file_kattr *fa)
{
        return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
}

/*
 * Check whether the current task may read the file attributes of @dentry.
 * This is a FILE__GETATTR check; @fa is not consulted.
 */
static int selinux_inode_file_getattr(struct dentry *dentry,
                                      struct file_kattr *fa)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/*
 * Check whether the current task may place an fsnotify mark on @path.
 *
 * The base permission depends on the kind of object being watched
 * (@obj_type); superblock watches additionally need FILESYSTEM__WATCH on the
 * superblock.  Marks for permission events add FILE__WATCH_WITH_PERM and
 * marks for read-like events add FILE__WATCH_READS, as selected by @mask.
 *
 * Returns 0 if allowed, -EINVAL for an unknown @obj_type, or another
 * negative value on denial.
 */
static int selinux_path_notify(const struct path *path, u64 mask,
                               unsigned int obj_type)
{
        struct common_audit_data ad;
        u32 perm;
        int rc;

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = *path;

        /* Pick the permission that matches the type of mark being set. */
        switch (obj_type) {
        case FSNOTIFY_OBJ_TYPE_INODE:
                perm = FILE__WATCH;
                break;
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                perm = FILE__WATCH_MOUNT;
                break;
        case FSNOTIFY_OBJ_TYPE_MNTNS:
                perm = FILE__WATCH_MOUNTNS;
                break;
        case FSNOTIFY_OBJ_TYPE_SB:
                /* sb watches get an extra check on the superblock itself */
                rc = superblock_has_perm(current_cred(), path->dentry->d_sb,
                                         FILESYSTEM__WATCH, &ad);
                if (rc)
                        return rc;
                perm = FILE__WATCH_SB;
                break;
        default:
                return -EINVAL;
        }

        /* blocking watches require the file:watch_with_perm permission */
        if (mask & (ALL_FSNOTIFY_PERM_EVENTS))
                perm |= FILE__WATCH_WITH_PERM;

        /* watches on read-like events need the file:watch_reads permission */
        if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS |
                    FS_CLOSE_NOWRITE))
                perm |= FILE__WATCH_READS;

        return path_has_perm(current_cred(), path, perm);
}

/*
 * Produce the security context string of @inode for the SELinux xattr.
 *
 * Permission check is handled by selinux_inode_getxattr hook.
 *
 * Returns the context length on success.  When @alloc is true the context
 * string is handed to the caller through @buffer; otherwise it is freed and
 * only the length is reported.  Returns -EOPNOTSUPP when SELinux is not
 * initialized or @name is not the SELinux suffix, or the error from the
 * SID-to-context conversion.
 */
static int selinux_inode_getsecurity(struct mnt_idmap *idmap,
                                     struct inode *inode, const char *name,
                                     void **buffer, bool alloc)
{
        struct inode_security_struct *isec;
        char *ctx = NULL;
        u32 ctx_len;
        int rc;

        /*
         * If we're not initialized yet, then we can't validate contexts, so
         * just let vfs_getxattr fall back to using the on-disk xattr.
         */
        if (!selinux_initialized() ||
            strcmp(name, XATTR_SELINUX_SUFFIX) != 0)
                return -EOPNOTSUPP;

        /*
         * A caller with CAP_MAC_ADMIN gets the raw context value even if the
         * current policy does not define it; everyone else gets the in-core
         * value under the current policy.  The capability is probed without
         * auditing because unprivileged getxattr calls are common and a
         * missing capability only selects the fallback, it is not a denial.
         */
        isec = inode_security(inode);
        if (has_cap_mac_admin(false))
                rc = security_sid_to_context_force(isec->sid, &ctx, &ctx_len);
        else
                rc = security_sid_to_context(isec->sid, &ctx, &ctx_len);
        if (rc)
                return rc;

        /* Either pass ownership of the string to the caller or drop it. */
        if (alloc)
                *buffer = ctx;
        else
                kfree(ctx);

        return ctx_len;
}

/*
 * Set the in-core label of @inode from the context string in @value.
 *
 * Returns -EOPNOTSUPP when @name is not the SELinux suffix or the superblock
 * lacks SBLABEL_MNT, -EACCES for a missing or empty value, or the error from
 * mapping the context to a SID.  On success the inode's class, SID and
 * initialized state are updated under the isec lock and 0 is returned.
 */
static int selinux_inode_setsecurity(struct inode *inode, const char *name,
                                     const void *value, size_t size, int flags)
{
        struct inode_security_struct *isec = inode_security_novalidate(inode);
        struct superblock_security_struct *sbsec;
        u32 sid;
        int err;

        if (strcmp(name, XATTR_SELINUX_SUFFIX) != 0)
                return -EOPNOTSUPP;

        sbsec = selinux_superblock(inode->i_sb);
        if ((sbsec->flags & SBLABEL_MNT) == 0)
                return -EOPNOTSUPP;

        if (value == NULL || size == 0)
                return -EACCES;

        err = security_context_to_sid(value, size, &sid, GFP_KERNEL);
        if (err)
                return err;

        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);

        return 0;
}

/*
 * Report the SELinux xattr name for xattr listing.
 *
 * Returns 0 while SELinux is not initialized.  Otherwise returns the size of
 * the name including its terminating nul, copying the name into @buffer only
 * when a buffer was supplied and @buffer_size is large enough.
 */
static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
{
        const int needed = sizeof(XATTR_NAME_SELINUX);

        if (!selinux_initialized())
                return 0;

        /* Size query or too-small buffer: report the length only. */
        if (!buffer || buffer_size < needed)
                return needed;

        memcpy(buffer, XATTR_NAME_SELINUX, needed);
        return needed;
}

static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
{
        struct inode_security_struct *isec = inode_security_novalidate(inode);

        prop->selinux.secid = isec->sid;
}

static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
{
        struct lsm_prop prop;
        struct cred_security_struct *crsec;
        struct cred *new_creds = *new;

        if (new_creds == NULL) {
                new_creds = prepare_creds();
                if (!new_creds)
                        return -ENOMEM;
        }

        crsec = selinux_cred(new_creds);
        /* Get label from overlay inode and set it in create_sid */
        selinux_inode_getlsmprop(d_inode(src), &prop);
        crsec->create_sid = prop.selinux.secid;
        *new = new_creds;
        return 0;
}

/*
 * Decide whether the xattr @name should be carried along on copy-up.
 *
 * The copy_up hook already sets the initial context on the new inode, so
 * blindly copying the lower SELinux xattr would overwrite it.  Once SELinux
 * is initialized the SELinux xattr is therefore discarded (-ECANCELED).
 * Every other xattr, and everything before initialization, is not claimed
 * by SELinux (-EOPNOTSUPP).
 */
static int selinux_inode_copy_up_xattr(struct dentry *dentry, const char *name)
{
        if (!selinux_initialized())
                return -EOPNOTSUPP;

        if (strcmp(name, XATTR_NAME_SELINUX) == 0)
                return -ECANCELED; /* Discard */

        return -EOPNOTSUPP;
}

/* kernfs node operations */

/*
 * Label a new kernfs node @kn based on its parent @kn_dir.
 *
 * If the parent carries no SELinux xattr there is nothing to inherit and 0
 * is returned.  Otherwise the parent's context is read and mapped to a SID;
 * the node's SID is the current credentials' create SID if one is set, or
 * else the result of a transition from the task SID and the parent SID for
 * the node's class and name.  The resulting context is stored on @kn as a
 * new SELinux xattr.
 *
 * Returns 0 on success or a negative error.
 */
static int selinux_kernfs_init_security(struct kernfs_node *kn_dir,
                                        struct kernfs_node *kn)
{
        const struct cred_security_struct *crsec = selinux_cred(current_cred());
        u32 parent_sid, newsid, clen;
        char *context;
        int rc;

        /* First call only probes for the size of the parent's xattr. */
        rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, NULL, 0);
        if (rc == -ENODATA)
                return 0;
        if (rc < 0)
                return rc;

        clen = (u32)rc;
        context = kmalloc(clen, GFP_KERNEL);
        if (!context)
                return -ENOMEM;

        rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, context, clen);
        if (rc >= 0)
                rc = security_context_to_sid(context, clen, &parent_sid,
                                             GFP_KERNEL);
        /* The parent's context string is no longer needed either way. */
        kfree(context);
        if (rc)
                return rc;

        if (!crsec->create_sid) {
                u16 secclass = inode_mode_to_security_class(kn->mode);
                const char *kn_name;
                struct qstr q;

                /* kn is fresh and can't be renamed, so its name does not go away */
                kn_name = rcu_dereference_check(kn->name, true);
                q.name = kn_name;
                q.hash_len = hashlen_string(kn_dir, kn_name);

                rc = security_transition_sid(crsec->sid,
                                             parent_sid, secclass, &q,
                                             &newsid);
                if (rc)
                        return rc;
        } else {
                newsid = crsec->create_sid;
        }

        rc = security_sid_to_context_force(newsid, &context, &clen);
        if (rc)
                return rc;

        rc = kernfs_xattr_set(kn, XATTR_NAME_SELINUX, context, clen,
                              XATTR_CREATE);
        kfree(context);

        return rc;
}


/* file security operations */

static int selinux_revalidate_file_permission(struct file *file, int mask)
{
        const struct cred *cred = current_cred();
        struct inode *inode = file_inode(file);

        /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */
        if ((file->f_flags & O_APPEND) && (mask & MAY_WRITE))
                mask |= MAY_APPEND;

        return file_has_perm(cred, file,
                             file_mask_to_av(inode->i_mode, mask));
}

static int selinux_file_permission(struct file *file, int mask)
{
        struct inode *inode = file_inode(file);
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec;
        u32 sid = current_sid();

        if (!mask)
                /* No permission to check.  Existence test. */
                return 0;

        isec = inode_security(inode);
        if (sid == fsec->sid && fsec->isid == isec->sid &&
            fsec->pseqno == avc_policy_seqno())
                /* No change since file_open check. */
                return 0;

        return selinux_revalidate_file_permission(file, mask);
}

/*
 * Initialize the security state of a newly allocated @file: both the file
 * SID and the file-owner SID start out as the current task's SID.
 * Always returns 0.
 */
static int selinux_file_alloc_security(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);
        const u32 creator_sid = current_sid();

        fsec->sid = creator_sid;
        fsec->fown_sid = creator_sid;

        return 0;
}

/*
 * Check whether the task holding @cred has the @requested permission and
 * may issue ioctl command @cmd on the inode behind @file.
 *
 * When the credential SID differs from the SID recorded in the file, FD__USE
 * on the descriptor is checked first.  Private inodes are then allowed
 * without further checks.  Otherwise an extended-permission check is made
 * with the high byte of @cmd as the driver and the low byte as the
 * permission number.
 */
static int ioctl_has_perm(const struct cred *cred, struct file *file,
                u32 requested, u16 cmd)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode *inode = file_inode(file);
        struct inode_security_struct *isec;
        struct lsm_ioctlop_audit ioctl;
        struct common_audit_data ad;
        u32 ssid = cred_sid(cred);
        u8 driver = cmd >> 8;
        u8 xperm = cmd & 0xff;

        ioctl.cmd = cmd;
        ioctl.path = file->f_path;
        ad.type = LSM_AUDIT_DATA_IOCTL_OP;
        ad.u.op = &ioctl;

        if (ssid != fsec->sid) {
                int rc = avc_has_perm(ssid, fsec->sid, SECCLASS_FD, FD__USE,
                                      &ad);

                if (rc)
                        return rc;
        }

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        isec = inode_security(inode);

        return avc_has_extended_perms(ssid, isec->sid, isec->sclass, requested,
                                      driver, AVC_EXT_IOCTL, xperm, &ad);
}

/*
 * Check whether the current task may issue ioctl @cmd on @file.
 *
 * A handful of generic commands are mapped to ordinary file permissions
 * (getattr, setattr, plain descriptor use) or to a capability check.  The
 * close-on-exec commands are allowed without a check when the
 * ioctl_skip_cloexec policy capability is enabled.  Everything else is
 * assumed to reach the file's own ioctl handler and gets the FILE__IOCTL
 * extended-permission check.
 */
static int selinux_file_ioctl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        const struct cred *cred = current_cred();

        switch (cmd) {
        case FIONREAD:
        case FIBMAP:
        case FIGETBSZ:
        case FS_IOC_GETFLAGS:
        case FS_IOC_GETVERSION:
                return file_has_perm(cred, file, FILE__GETATTR);

        case FS_IOC_SETFLAGS:
        case FS_IOC_SETVERSION:
                return file_has_perm(cred, file, FILE__SETATTR);

        /* sys_ioctl() checks */
        case FIONBIO:
        case FIOASYNC:
                return file_has_perm(cred, file, 0);

        case KDSKBENT:
        case KDSKBSENT:
                return cred_has_capability(cred, CAP_SYS_TTY_CONFIG,
                                           CAP_OPT_NONE, true);

        case FIOCLEX:
        case FIONCLEX:
                if (selinux_policycap_ioctl_skip_cloexec())
                        return 0;
                break;

        default:
                break;
        }

        /* The command will go to the file's ioctl() function. */
        return ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd);
}

/*
 * file_ioctl_compat hook.
 *
 * On a 64-bit kernel serving 32-bit userspace the FS_IOC32_* command numbers
 * differ from their native counterparts, so translate them first and then
 * reuse selinux_file_ioctl() for the actual check.  Commands without a
 * 32-bit variant listed here are passed through unchanged.
 */
static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        unsigned int native_cmd = cmd;

        if (cmd == FS_IOC32_GETFLAGS)
                native_cmd = FS_IOC_GETFLAGS;
        else if (cmd == FS_IOC32_SETFLAGS)
                native_cmd = FS_IOC_SETFLAGS;
        else if (cmd == FS_IOC32_GETVERSION)
                native_cmd = FS_IOC_GETVERSION;
        else if (cmd == FS_IOC32_SETVERSION)
                native_cmd = FS_IOC_SETVERSION;

        return selinux_file_ioctl(file, native_cmd, arg);
}

/*
 * When non-zero, file_map_prot_check() and selinux_file_mprotect() perform
 * their additional executable-memory checks (execmem, execheap, execstack,
 * execmod).  Where it is set is not visible in this part of the file.
 */
static int default_noexec __ro_after_init;

static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
{
        const struct cred *cred = current_cred();
        u32 sid = cred_sid(cred);
        int rc = 0;

        if (default_noexec &&
            (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) ||
                                   (!shared && (prot & PROT_WRITE)))) {
                /*
                 * We are making executable an anonymous mapping or a
                 * private file mapping that will also be writable.
                 * This has an additional check.
                 */
                rc = avc_has_perm(sid, sid, SECCLASS_PROCESS,
                                  PROCESS__EXECMEM, NULL);
                if (rc)
                        goto error;
        }

        if (file) {
                /* read access is always possible with a mapping */
                u32 av = FILE__READ;

                /* write access only matters if the mapping is shared */
                if (shared && (prot & PROT_WRITE))
                        av |= FILE__WRITE;

                if (prot & PROT_EXEC)
                        av |= FILE__EXECUTE;

                return file_has_perm(cred, file, av);
        }

error:
        return rc;
}

/*
 * mmap_addr hook: mapping below CONFIG_LSM_MMAP_MIN_ADDR requires the
 * memprotect mmap_zero permission; any other address is always allowed.
 */
static int selinux_mmap_addr(unsigned long addr)
{
        u32 sid;

        if (addr >= CONFIG_LSM_MMAP_MIN_ADDR)
                return 0;

        sid = current_sid();
        return avc_has_perm(sid, sid, SECCLASS_MEMPROTECT,
                            MEMPROTECT__MMAP_ZERO, NULL);
}

/*
 * mmap_file hook.
 *
 * For a file-backed mapping, first require FILE__MAP on the file's inode,
 * then apply the protection-dependent checks of file_map_prot_check().
 * @reqprot is ignored; only the effective @prot is checked.
 */
static int selinux_mmap_file(struct file *file,
                             unsigned long reqprot __always_unused,
                             unsigned long prot, unsigned long flags)
{
        int is_shared = (flags & MAP_TYPE) == MAP_SHARED;

        if (file) {
                struct common_audit_data ad;
                int rc;

                ad.type = LSM_AUDIT_DATA_FILE;
                ad.u.file = file;

                rc = inode_has_perm(current_cred(), file_inode(file),
                                    FILE__MAP, &ad);
                if (rc)
                        return rc;
        }

        return file_map_prot_check(file, prot, is_shared);
}

/*
 * file_mprotect hook.
 *
 * When a region that is not yet executable is being made executable (and
 * default_noexec is set), pick the extra check that matches the kind of
 * region: execheap, execstack, or execmod.  Afterwards the regular mapping
 * checks of file_map_prot_check() apply.  @reqprot is ignored.
 */
static int selinux_file_mprotect(struct vm_area_struct *vma,
                                 unsigned long reqprot __always_unused,
                                 unsigned long prot)
{
        const struct cred *cred = current_cred();
        u32 sid = cred_sid(cred);
        bool gaining_exec = (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC);

        if (default_noexec && gaining_exec) {
                struct mm_struct *mm = vma->vm_mm;
                int rc = 0;

                /*
                 * The heap test is open-coded on purpose.  The
                 * vma_is_initial_heap() helper has a history of problems
                 * and is currently broken on systems where there is no
                 * heap, e.g. brk == start_brk.  Before replacing the
                 * conditional below with vma_is_initial_heap(), or
                 * something similar, ensure the logic is the same as what
                 * is written here, or test every corner case you can
                 * think of.
                 */
                if (vma->vm_start >= mm->start_brk && vma->vm_end <= mm->brk)
                        rc = avc_has_perm(sid, sid, SECCLASS_PROCESS,
                                          PROCESS__EXECHEAP, NULL);
                else if (!vma->vm_file && (vma_is_initial_stack(vma) ||
                                           vma_is_stack_for_current(vma)))
                        rc = avc_has_perm(sid, sid, SECCLASS_PROCESS,
                                          PROCESS__EXECSTACK, NULL);
                else if (vma->vm_file && vma->anon_vma)
                        /*
                         * A file mapping that has had some COW done is
                         * being made executable.  Pages might have been
                         * written, so check the ability to execute the
                         * possibly modified content.  This typically
                         * should only occur for text relocations.
                         */
                        rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD);

                if (rc)
                        return rc;
        }

        return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED);
}

/*
 * file_lock hook: require FILE__LOCK on @file for the current credentials.
 * @cmd is not consulted.
 */
static int selinux_file_lock(struct file *file, unsigned int cmd)
{
        return file_has_perm(current_cred(), file, FILE__LOCK);
}

/*
 * file_fcntl hook: map an fcntl command onto a file permission check.
 *
 * Commands not listed below are allowed without any check (returns 0).
 */
static int selinux_file_fcntl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        const struct cred *cred = current_cred();

        switch (cmd) {
        case F_SETFL:
                /* Dropping O_APPEND from a file that has it needs write. */
                if ((file->f_flags & O_APPEND) && !(arg & O_APPEND))
                        return file_has_perm(cred, file, FILE__WRITE);
                fallthrough;
        case F_SETOWN:
        case F_SETSIG:
        case F_GETFL:
        case F_GETOWN:
        case F_GETSIG:
        case F_GETOWNER_UIDS:
                /* Just check FD__USE permission */
                return file_has_perm(cred, file, 0);

        case F_GETLK:
        case F_SETLK:
        case F_SETLKW:
        case F_OFD_GETLK:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
#if BITS_PER_LONG == 32
        case F_GETLK64:
        case F_SETLK64:
        case F_SETLKW64:
#endif
                return file_has_perm(cred, file, FILE__LOCK);

        default:
                return 0;
        }
}

static void selinux_file_set_fowner(struct file *file)
{
        struct file_security_struct *fsec;

        fsec = selinux_file(file);
        fsec->fown_sid = current_sid();
}

/*
 * file_send_sigiotask hook: check that the recorded file owner SID may
 * deliver @signum to @tsk.  A @signum of zero is treated as SIGIO.
 */
static int selinux_file_send_sigiotask(struct task_struct *tsk,
                                       struct fown_struct *fown, int signum)
{
        /* struct fown_struct is never outside the context of a struct file */
        struct file_security_struct *fsec = selinux_file(fown->file);
        u32 target_sid = task_sid_obj(tsk);
        /* zero means SIGIO, as per send_sigio_to_task */
        u32 perm = signal_to_av(signum ? signum : SIGIO);

        return avc_has_perm(fsec->fown_sid, target_sid,
                            SECCLASS_PROCESS, perm, NULL);
}

static int selinux_file_receive(struct file *file)
{
        const struct cred *cred = current_cred();

        return file_has_perm(cred, file, file_to_av(file));
}

/*
 * file_open hook: snapshot the inode SID and the AVC policy sequence number
 * into the file's security blob, then check the open permissions derived by
 * open_file_to_av() against the opener's credentials (file->f_cred).
 *
 * The snapshot is taken before the permission check on purpose; see the
 * comments in the body.
 */
static int selinux_file_open(struct file *file)
{
        struct file_security_struct *fsec;
        struct inode_security_struct *isec;

        fsec = selinux_file(file);
        isec = inode_security(file_inode(file));
        /*
         * Save inode label and policy sequence number
         * at open-time so that selinux_file_permission
         * can determine whether revalidation is necessary.
         * Task label is already saved in the file security
         * struct as its SID.
         */
        fsec->isid = isec->sid;
        fsec->pseqno = avc_policy_seqno();
        /*
         * Since the inode label or policy seqno may have changed
         * between the selinux_inode_permission check and the saving
         * of state above, recheck that access is still permitted.
         * Otherwise, access might never be revalidated against the
         * new inode label or new policy.
         * This check is not redundant - do not remove.
         */
        return file_path_has_perm(file->f_cred, file, open_file_to_av(file));
}

/* task security operations */

/*
 * task_alloc hook: copy the current task's SELinux task blob into @task,
 * then require PROCESS__FORK of the current SID on itself.
 * @clone_flags is not consulted.
 */
static int selinux_task_alloc(struct task_struct *task,
                              u64 clone_flags)
{
        u32 sid = current_sid();

        /* The copy happens regardless of the outcome of the check below. */
        *selinux_task(task) = *selinux_task(current);

        return avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL);
}

/*
 * cred_prepare hook: prepare a new set of credentials for modification by
 * copying the SELinux credential blob of @old into @new.
 * @gfp is unused; always returns 0.
 */
static int selinux_cred_prepare(struct cred *new, const struct cred *old,
                                gfp_t gfp)
{
        *selinux_cred(new) = *selinux_cred(old);
        return 0;
}

/*
 * transfer the SELinux data to a blank set of creds
 */
static void selinux_cred_transfer(struct cred *new, const struct cred *old)
{
        const struct cred_security_struct *old_crsec = selinux_cred(old);
        struct cred_security_struct *crsec = selinux_cred(new);

        *crsec = *old_crsec;
}

/* cred_getsecid hook: report the SID of credentials @c through @secid. */
static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
{
        u32 sid = cred_sid(c);

        *secid = sid;
}

static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
{
        prop->selinux.secid = cred_sid(c);
}

/*
 * kernel_act_as hook: set the security data for a kernel service.
 *
 * If the current SID holds kernel_service use_as_override on @secid, @new
 * takes @secid as its SID and all of its creation contexts (file, key,
 * socket) are reset to 0, i.e. unlabelled.  On failure @new is untouched.
 */
static int selinux_kernel_act_as(struct cred *new, u32 secid)
{
        struct cred_security_struct *crsec = selinux_cred(new);
        int rc;

        rc = avc_has_perm(current_sid(), secid,
                          SECCLASS_KERNEL_SERVICE,
                          KERNEL_SERVICE__USE_AS_OVERRIDE,
                          NULL);
        if (rc)
                return rc;

        crsec->sid = secid;
        crsec->create_sid = 0;
        crsec->keycreate_sid = 0;
        crsec->sockcreate_sid = 0;
        return 0;
}

/*
 * kernel_create_files_as hook: set the file creation context in @new to the
 * objective context of @inode.
 *
 * Requires kernel_service create_files_as from the current SID on the
 * inode's SID; on failure @new is left unchanged.
 */
static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        struct inode_security_struct *isec = inode_security(inode);
        struct cred_security_struct *crsec = selinux_cred(new);
        int rc;

        rc = avc_has_perm(current_sid(), isec->sid,
                          SECCLASS_KERNEL_SERVICE,
                          KERNEL_SERVICE__CREATE_FILES_AS,
                          NULL);
        if (rc)
                return rc;

        crsec->create_sid = isec->sid;
        return 0;
}

static int selinux_kernel_module_request(char *kmod_name)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_KMOD;
        ad.u.kmod_name = kmod_name;

        return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM,
                            SYSTEM__MODULE_REQUEST, &ad);
}

/*
 * Check that the current task may have the kernel load something using the
 * SECCLASS_SYSTEM permission @requested.
 *
 * With no backing @file the permission is checked on the task's own SID.
 * With a file, fd use is checked first when the file was opened under a
 * different SID, and @requested is then checked against the inode's SID.
 */
static int selinux_kernel_load_from_file(struct file *file, u32 requested)
{
        struct common_audit_data ad;
        struct file_security_struct *fsec;
        struct inode_security_struct *isec;
        u32 sid = current_sid();

        if (!file)
                return avc_has_perm(sid, sid, SECCLASS_SYSTEM, requested, NULL);

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        fsec = selinux_file(file);
        if (fsec->sid != sid) {
                int rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD,
                                      FD__USE, &ad);

                if (rc)
                        return rc;
        }

        isec = inode_security(file_inode(file));
        return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM, requested, &ad);
}

/*
 * kernel_read_file hook: translate the read @id into the matching system
 * class permission and check it against @file.
 *
 * Ids without a mapping below are allowed (returns 0).  @contents is not
 * consulted.
 */
static int selinux_kernel_read_file(struct file *file,
                                    enum kernel_read_file_id id,
                                    bool contents)
{
        u32 requested;

        BUILD_BUG_ON_MSG(READING_MAX_ID > 7,
                         "New kernel_read_file_id introduced; update SELinux!");

        switch (id) {
        case READING_FIRMWARE:
                requested = SYSTEM__FIRMWARE_LOAD;
                break;
        case READING_MODULE:
                requested = SYSTEM__MODULE_LOAD;
                break;
        case READING_KEXEC_IMAGE:
                requested = SYSTEM__KEXEC_IMAGE_LOAD;
                break;
        case READING_KEXEC_INITRAMFS:
                requested = SYSTEM__KEXEC_INITRAMFS_LOAD;
                break;
        case READING_POLICY:
                requested = SYSTEM__POLICY_LOAD;
                break;
        case READING_X509_CERTIFICATE:
                requested = SYSTEM__X509_CERTIFICATE_LOAD;
                break;
        default:
                return 0;
        }

        return selinux_kernel_load_from_file(file, requested);
}

/*
 * kernel_load_data hook: translate the load @id into the matching system
 * class permission and check it with no backing file.
 *
 * Ids without a mapping below are allowed (returns 0).  @contents is not
 * consulted.
 */
static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        u32 requested;

        BUILD_BUG_ON_MSG(LOADING_MAX_ID > 7,
                         "New kernel_load_data_id introduced; update SELinux!");

        switch (id) {
        case LOADING_FIRMWARE:
                requested = SYSTEM__FIRMWARE_LOAD;
                break;
        case LOADING_MODULE:
                requested = SYSTEM__MODULE_LOAD;
                break;
        case LOADING_KEXEC_IMAGE:
                requested = SYSTEM__KEXEC_IMAGE_LOAD;
                break;
        case LOADING_KEXEC_INITRAMFS:
                requested = SYSTEM__KEXEC_INITRAMFS_LOAD;
                break;
        case LOADING_POLICY:
                requested = SYSTEM__POLICY_LOAD;
                break;
        case LOADING_X509_CERTIFICATE:
                requested = SYSTEM__X509_CERTIFICATE_LOAD;
                break;
        default:
                return 0;
        }

        return selinux_kernel_load_from_file(NULL, requested);
}

/*
 * task_setpgid hook: require PROCESS__SETPGID from the current SID on the
 * SID of @p.  @pgid is not consulted.
 */
static int selinux_task_setpgid(struct task_struct *p, pid_t pgid)
{
        u32 subj_sid = current_sid();
        u32 obj_sid = task_sid_obj(p);

        return avc_has_perm(subj_sid, obj_sid, SECCLASS_PROCESS,
                            PROCESS__SETPGID, NULL);
}

static int selinux_task_getpgid(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__GETPGID, NULL);
}

static int selinux_task_getsid(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__GETSESSION, NULL);
}

static void selinux_current_getlsmprop_subj(struct lsm_prop *prop)
{
        prop->selinux.secid = current_sid();
}

static void selinux_task_getlsmprop_obj(struct task_struct *p,
                                        struct lsm_prop *prop)
{
        prop->selinux.secid = task_sid_obj(p);
}

static int selinux_task_setnice(struct task_struct *p, int nice)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_setioprio(struct task_struct *p, int ioprio)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_getioprio(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__GETSCHED, NULL);
}

/*
 * task_prlimit hook: check @cred against @tcred for the rlimit accesses
 * named in @flags (LSM_PRLIMIT_WRITE -> setrlimit, LSM_PRLIMIT_READ ->
 * getrlimit).  With no flags set there is nothing to check.
 */
static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred,
                                unsigned int flags)
{
        u32 av;

        if (!flags)
                return 0;

        av = ((flags & LSM_PRLIMIT_WRITE) ? PROCESS__SETRLIMIT : 0) |
             ((flags & LSM_PRLIMIT_READ) ? PROCESS__GETRLIMIT : 0);

        return avc_has_perm(cred_sid(cred), cred_sid(tcred),
                            SECCLASS_PROCESS, av, NULL);
}

/*
 * task_setrlimit hook.
 *
 * Only a change of the hard limit (lowering or raising it) is controlled,
 * so that the hard limit can later be used as a safe reset point for the
 * soft limit upon context transitions.  See selinux_bprm_committing_creds.
 * A request that leaves rlim_max untouched is always allowed.
 */
static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
                struct rlimit *new_rlim)
{
        struct rlimit *cur_rlim = &p->signal->rlim[resource];

        if (cur_rlim->rlim_max == new_rlim->rlim_max)
                return 0;

        return avc_has_perm(current_sid(), task_sid_obj(p),
                            SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL);
}

static int selinux_task_setscheduler(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_getscheduler(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__GETSCHED, NULL);
}

static int selinux_task_movememory(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

/*
 * task_kill hook: check that the sender may deliver @sig to @p.
 *
 * The sender is @cred when supplied, otherwise the current task.  A @sig of
 * zero is the null signal (existence test) and maps to PROCESS__SIGNULL.
 * @info is not consulted.
 */
static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                                int sig, const struct cred *cred)
{
        u32 perm = sig ? signal_to_av(sig) : PROCESS__SIGNULL;
        u32 sender_sid = cred ? cred_sid(cred) : current_sid();

        return avc_has_perm(sender_sid, task_sid_obj(p), SECCLASS_PROCESS,
                            perm, NULL);
}

/*
 * task_to_inode hook: label @inode after task @p.
 *
 * Sets the inode's security class from its mode and its SID from
 * task_sid_obj(@p), and marks the label as initialized.
 */
static void selinux_task_to_inode(struct task_struct *p,
                                  struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        u32 sid = task_sid_obj(p);

        /* All three label fields are updated together under isec->lock. */
        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);
}

/*
 * userns_create hook: require user_namespace create of the current SID on
 * itself.  The check uses current_sid(); @cred is not consulted.
 */
static int selinux_userns_create(const struct cred *cred)
{
        u32 self_sid = current_sid();

        return avc_has_perm(self_sid, self_sid, SECCLASS_USER_NAMESPACE,
                        USER_NAMESPACE__CREATE, NULL);
}

/*
 * Fill the network audit data in @ad from the IPv4 header of @skb, plus the
 * TCP/UDP/SCTP ports when they can be read.  The IP protocol number is
 * stored in @proto when @proto is non-NULL.
 *
 * Returns error only if unable to parse addresses; a missing or unreadable
 * transport header is not an error.
 */
static int selinux_parse_skb_ipv4(struct sk_buff *skb,
                        struct common_audit_data *ad, u8 *proto)
{
        struct iphdr hdr_buf, *iph;
        int nh_off, hdr_len, l4_off;

        nh_off = skb_network_offset(skb);
        iph = skb_header_pointer(skb, nh_off, sizeof(hdr_buf), &hdr_buf);
        if (iph == NULL)
                return -EINVAL;

        /* Reject a header length smaller than the fixed IPv4 header. */
        hdr_len = iph->ihl * 4;
        if (hdr_len < sizeof(hdr_buf))
                return -EINVAL;

        ad->u.net->v4info.saddr = iph->saddr;
        ad->u.net->v4info.daddr = iph->daddr;

        if (proto)
                *proto = iph->protocol;

        /* With a non-zero fragment offset no ports are parsed. */
        if (ntohs(iph->frag_off) & IP_OFFSET)
                return 0;

        l4_off = nh_off + hdr_len;

        switch (iph->protocol) {
        case IPPROTO_TCP: {
                struct tcphdr tcp_buf, *tcph;

                tcph = skb_header_pointer(skb, l4_off, sizeof(tcp_buf),
                                          &tcp_buf);
                if (tcph != NULL) {
                        ad->u.net->sport = tcph->source;
                        ad->u.net->dport = tcph->dest;
                }
                break;
        }

        case IPPROTO_UDP: {
                struct udphdr udp_buf, *udph;

                udph = skb_header_pointer(skb, l4_off, sizeof(udp_buf),
                                          &udp_buf);
                if (udph != NULL) {
                        ad->u.net->sport = udph->source;
                        ad->u.net->dport = udph->dest;
                }
                break;
        }

#if IS_ENABLED(CONFIG_IP_SCTP)
        case IPPROTO_SCTP: {
                struct sctphdr sctp_buf, *sctph;

                sctph = skb_header_pointer(skb, l4_off, sizeof(sctp_buf),
                                           &sctp_buf);
                if (sctph != NULL) {
                        ad->u.net->sport = sctph->source;
                        ad->u.net->dport = sctph->dest;
                }
                break;
        }
#endif
        default:
                break;
        }

        return 0;
}

#if IS_ENABLED(CONFIG_IPV6)

/*
 * Fill the network audit data in @ad from the IPv6 header of @skb, plus the
 * TCP/UDP/SCTP ports when they can be read.  The upper-layer protocol found
 * after the extension headers is stored in @proto when @proto is non-NULL.
 *
 * Returns error only if unable to parse addresses.
 */
static int selinux_parse_skb_ipv6(struct sk_buff *skb,
                        struct common_audit_data *ad, u8 *proto)
{
        struct ipv6hdr hdr_buf, *ip6h;
        __be16 frag_off;
        u8 nexthdr;
        int l4_off;

        l4_off = skb_network_offset(skb);
        ip6h = skb_header_pointer(skb, l4_off, sizeof(hdr_buf), &hdr_buf);
        if (ip6h == NULL)
                return -EINVAL;

        ad->u.net->v6info.saddr = ip6h->saddr;
        ad->u.net->v6info.daddr = ip6h->daddr;

        nexthdr = ip6h->nexthdr;
        l4_off = ipv6_skip_exthdr(skb, l4_off + sizeof(hdr_buf), &nexthdr,
                                  &frag_off);
        /*
         * The addresses are already recorded, so failing to walk the
         * extension headers is not reported as an error; @proto is left
         * untouched in that case.
         */
        if (l4_off < 0)
                return 0;

        if (proto)
                *proto = nexthdr;

        switch (nexthdr) {
        case IPPROTO_TCP: {
                struct tcphdr tcp_buf, *tcph;

                tcph = skb_header_pointer(skb, l4_off, sizeof(tcp_buf),
                                          &tcp_buf);
                if (tcph != NULL) {
                        ad->u.net->sport = tcph->source;
                        ad->u.net->dport = tcph->dest;
                }
                break;
        }

        case IPPROTO_UDP: {
                struct udphdr udp_buf, *udph;

                udph = skb_header_pointer(skb, l4_off, sizeof(udp_buf),
                                          &udp_buf);
                if (udph != NULL) {
                        ad->u.net->sport = udph->source;
                        ad->u.net->dport = udph->dest;
                }
                break;
        }

#if IS_ENABLED(CONFIG_IP_SCTP)
        case IPPROTO_SCTP: {
                struct sctphdr sctp_buf, *sctph;

                sctph = skb_header_pointer(skb, l4_off, sizeof(sctp_buf),
                                           &sctp_buf);
                if (sctph != NULL) {
                        ad->u.net->sport = sctph->source;
                        ad->u.net->dport = sctph->dest;
                }
                break;
        }
#endif
        /* includes fragments */
        default:
                break;
        }

        return 0;
}

#endif /* IPV6 */

/*
 * Parse @skb into the network audit data of @ad according to
 * ad->u.net->family.
 *
 * When @_addrp is non-NULL it receives a pointer to the parsed source
 * address (@src non-zero) or destination address (@src zero) inside @ad,
 * or NULL for a family that is not parsed here.  @proto is handed to the
 * per-family parser.
 *
 * Returns 0 on success, or the parser's error after logging a warning.
 */
static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad,
                             char **_addrp, int src, u8 *proto)
{
        char *addrp = NULL;
        int ret;

        switch (ad->u.net->family) {
        case PF_INET:
                ret = selinux_parse_skb_ipv4(skb, ad, proto);
                if (ret)
                        goto parse_error;
                addrp = (char *)(src ? &ad->u.net->v4info.saddr :
                                       &ad->u.net->v4info.daddr);
                break;

#if IS_ENABLED(CONFIG_IPV6)
        case PF_INET6:
                ret = selinux_parse_skb_ipv6(skb, ad, proto);
                if (ret)
                        goto parse_error;
                addrp = (char *)(src ? &ad->u.net->v6info.saddr :
                                       &ad->u.net->v6info.daddr);
                break;
#endif        /* IPV6 */
        default:
                /* Other families: nothing to parse, no address to report. */
                break;
        }

        if (_addrp)
                *_addrp = addrp;
        return 0;

parse_error:
        pr_warn(
               "SELinux: failure in selinux_parse_skb(),"
               " unable to parse packet\n");
        return ret;
}

/**
 * selinux_skb_peerlbl_sid - Determine the peer label of a packet
 * @skb: the packet
 * @family: protocol family
 * @sid: the packet's peer label SID
 *
 * Description:
 * Collect the peer label candidates from the labeled-IPsec (xfrm) and
 * NetLabel lookups and let the security server function
 * security_net_peersid_resolve() reconcile them into @sid.  The function
 * returns zero if the value in @sid is valid (although it may be
 * SECSID_NULL) or -EACCES if any lookup fails or the different peer labels
 * cannot be reconciled.
 *
 */
static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
{
        u32 xfrm_sid, nlbl_sid, nlbl_type;

        if (unlikely(selinux_xfrm_skb_sid(skb, &xfrm_sid)))
                return -EACCES;

        if (unlikely(selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type,
                                                  &nlbl_sid)))
                return -EACCES;

        if (unlikely(security_net_peersid_resolve(nlbl_sid, nlbl_type,
                                                  xfrm_sid, sid))) {
                pr_warn(
                       "SELinux: failure in selinux_skb_peerlbl_sid(),"
                       " unable to determine packet's peer label\n");
                return -EACCES;
        }

        return 0;
}

/**
 * selinux_conn_sid - Determine the child socket label for a connection
 * @sk_sid: the parent socket's SID
 * @skb_sid: the packet's SID
 * @conn_sid: the resulting connection SID
 *
 * When @skb_sid is SECSID_NULL, @conn_sid is simply a copy of @sk_sid.
 * Otherwise the user:role:type information from @sk_sid is combined with
 * the MLS information from @skb_sid via security_sid_mls_copy().  Returns
 * zero on success, negative values on failure.
 *
 */
static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid)
{
        if (skb_sid == SECSID_NULL) {
                *conn_sid = sk_sid;
                return 0;
        }

        return security_sid_mls_copy(sk_sid, skb_sid, conn_sid);
}

/* socket security operations */

/*
 * Work out the SID for a socket of class @secclass created under @crsec.
 *
 * An explicitly set sockcreate SID takes precedence; otherwise the SID is
 * computed by security_transition_sid() from the credential's own SID.
 * Returns 0 on success with the result in @socksid.
 */
static int socket_sockcreate_sid(const struct cred_security_struct *crsec,
                                 u16 secclass, u32 *socksid)
{
        u32 preset_sid = crsec->sockcreate_sid;

        if (preset_sid <= SECSID_NULL)
                return security_transition_sid(crsec->sid, crsec->sid,
                                               secclass, NULL, socksid);

        *socksid = preset_sid;
        return 0;
}

/*
 * Tell whether permission checks are skipped for a socket labeled @sid.
 *
 * Kernel-labeled sockets are always skipped.
 *
 * Before POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, sockets that inherited the
 * kernel context from early boot used to be skipped as well, so that
 * behavior is preserved for SECINITSID_INIT unless the capability is set.
 * By setting the capability the policy signals that it is ready for this
 * quirk to be fixed.  Note that sockets created by a kernel thread or a
 * usermode helper executed without a transition will still be skipped
 * regardless of the policycap setting.
 */
static bool sock_skip_has_perm(u32 sid)
{
        return sid == SECINITSID_KERNEL ||
               (!selinux_policycap_userspace_initial_context() &&
                sid == SECINITSID_INIT);
}


/*
 * Check whether the current task holds @perms on socket @sk.
 *
 * Sockets for which sock_skip_has_perm() is true are not checked.
 *
 * Returns 0 when the check is skipped or permitted, otherwise the error
 * from avc_has_perm().
 */
static int sock_has_perm(struct sock *sk, u32 perms)
{
        /*
         * Use the selinux_sock() accessor, as the other socket hooks do,
         * instead of assuming that sk->sk_security points straight at the
         * SELinux socket data.
         */
        struct sk_security_struct *sksec = selinux_sock(sk);
        struct common_audit_data ad;
        struct lsm_network_audit net;

        if (sock_skip_has_perm(sksec->sid))
                return 0;

        ad_net_init_from_sk(&ad, &net, sk);

        return avc_has_perm(current_sid(), sksec->sid, sksec->sclass, perms,
                            &ad);
}

/*
 * socket_create hook: check that the current credentials may create a
 * socket of the class derived from @family, @type and @protocol.
 *
 * Sockets created with @kern set are not checked.
 */
static int selinux_socket_create(int family, int type,
                                 int protocol, int kern)
{
        const struct cred_security_struct *crsec;
        u16 secclass;
        u32 newsid;
        int rc;

        if (kern)
                return 0;

        crsec = selinux_cred(current_cred());
        secclass = socket_type_to_security_class(family, type, protocol);

        rc = socket_sockcreate_sid(crsec, secclass, &newsid);
        if (rc)
                return rc;

        return avc_has_perm(crsec->sid, newsid, secclass, SOCKET__CREATE, NULL);
}

/*
 * socket_post_create hook: label a freshly created socket.
 *
 * Sockets created with @kern set get SECINITSID_KERNEL; others get the SID
 * computed by socket_sockcreate_sid().  The label is written to the socket
 * inode and, when a struct sock is attached, to its security blob as well,
 * followed by the NetLabel post-create step.
 *
 * Returns 0 on success or the error from the SID computation or from
 * selinux_netlbl_socket_post_create().
 */
static int selinux_socket_post_create(struct socket *sock, int family,
                                      int type, int protocol, int kern)
{
        const struct cred_security_struct *crsec = selinux_cred(current_cred());
        struct inode_security_struct *isec;
        struct sk_security_struct *sksec;
        u32 sid = SECINITSID_KERNEL;
        u16 sclass;
        int rc;

        isec = inode_security_novalidate(SOCK_INODE(sock));
        sclass = socket_type_to_security_class(family, type, protocol);

        if (!kern) {
                rc = socket_sockcreate_sid(crsec, sclass, &sid);
                if (rc)
                        return rc;
        }

        isec->sclass = sclass;
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;

        if (!sock->sk)
                return 0;

        sksec = selinux_sock(sock->sk);
        sksec->sclass = sclass;
        sksec->sid = sid;
        /* Allows detection of the first association on this socket */
        if (sclass == SECCLASS_SCTP_SOCKET)
                sksec->sctp_assoc_state = SCTP_ASSOC_UNSET;

        return selinux_netlbl_socket_post_create(sock->sk, family);
}

/*
 * socket_socketpair hook: make each end of the pair record the other end's
 * SID as its peer SID.  Always returns 0.
 */
static int selinux_socket_socketpair(struct socket *socka,
                                     struct socket *sockb)
{
        struct sk_security_struct *first = selinux_sock(socka->sk);
        struct sk_security_struct *second = selinux_sock(sockb->sk);

        first->peer_sid = second->sid;
        second->peer_sid = first->sid;

        return 0;
}

/*
 * Check whether a socket may be bound to the given address.
 *
 * The bind permission on the socket is always checked.  For PF_INET and
 * PF_INET6 sockets two further checks follow:
 *  - a name_bind check against the SID of the requested port.  The range of
 *    port numbers used for automatic binding is looked up to decide whether
 *    this check is needed: it is performed only for a non-zero port for
 *    which inet_port_requires_bind_service() is true or which lies outside
 *    that range;
 *  - a node_bind check against the SID of the requested address.
 *
 * Returns 0 if the bind is permitted.  Otherwise returns -EINVAL for an
 * address that is too short, -EINVAL (SCTP) or -EAFNOSUPPORT (other
 * protocols) for an unacceptable address family, or the error from a failed
 * SID lookup or permission check.
 */

static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 family;
        int err;

        err = sock_has_perm(sk, SOCKET__BIND);
        if (err)
                goto out;

        /* If PF_INET or PF_INET6, check name_bind permission for the port. */
        family = sk->sk_family;
        if (family == PF_INET || family == PF_INET6) {
                char *addrp;
                struct common_audit_data ad;
                struct lsm_network_audit net = {0,};
                struct sockaddr_in *addr4 = NULL;
                struct sockaddr_in6 *addr6 = NULL;
                u16 family_sa;
                unsigned short snum;
                u32 sid, node_perm;

                /*
                 * sctp_bindx(3) calls via selinux_sctp_bind_connect()
                 * that validates multiple binding addresses. Because of this
                 * need to check address->sa_family as it is possible to have
                 * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
                 */
                /* The address must be long enough to hold sa_family. */
                if (addrlen < offsetofend(struct sockaddr, sa_family))
                        return -EINVAL;
                family_sa = address->sa_family;
                switch (family_sa) {
                case AF_UNSPEC:
                case AF_INET:
                        if (addrlen < sizeof(struct sockaddr_in))
                                return -EINVAL;
                        addr4 = (struct sockaddr_in *)address;
                        if (family_sa == AF_UNSPEC) {
                                if (family == PF_INET6) {
                                        /* Length check from inet6_bind_sk() */
                                        if (addrlen < SIN6_LEN_RFC2133)
                                                return -EINVAL;
                                        /* Family check from __inet6_bind() */
                                        goto err_af;
                                }
                                /* see __inet_bind(), we only want to allow
                                 * AF_UNSPEC if the address is INADDR_ANY
                                 */
                                if (addr4->sin_addr.s_addr != htonl(INADDR_ANY))
                                        goto err_af;
                                /* From here on AF_UNSPEC is handled as AF_INET. */
                                family_sa = AF_INET;
                        }
                        snum = ntohs(addr4->sin_port);
                        addrp = (char *)&addr4->sin_addr.s_addr;
                        break;
                case AF_INET6:
                        if (addrlen < SIN6_LEN_RFC2133)
                                return -EINVAL;
                        addr6 = (struct sockaddr_in6 *)address;
                        snum = ntohs(addr6->sin6_port);
                        addrp = (char *)&addr6->sin6_addr.s6_addr;
                        break;
                default:
                        goto err_af;
                }

                /* Audit data shared by the name_bind and node_bind checks. */
                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->sport = htons(snum);
                ad.u.net->family = family_sa;

                /* A port of zero skips the name_bind check entirely. */
                if (snum) {
                        int low, high;

                        inet_get_local_port_range(sock_net(sk), &low, &high);

                        if (inet_port_requires_bind_service(sock_net(sk), snum) ||
                            snum < low || snum > high) {
                                err = sel_netport_sid(sk->sk_protocol,
                                                      snum, &sid);
                                if (err)
                                        goto out;
                                err = avc_has_perm(sksec->sid, sid,
                                                   sksec->sclass,
                                                   SOCKET__NAME_BIND, &ad);
                                if (err)
                                        goto out;
                        }
                }

                /* Pick the node_bind permission matching the socket class. */
                switch (sksec->sclass) {
                case SECCLASS_TCP_SOCKET:
                        node_perm = TCP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_UDP_SOCKET:
                        node_perm = UDP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_SCTP_SOCKET:
                        node_perm = SCTP_SOCKET__NODE_BIND;
                        break;

                default:
                        node_perm = RAWIP_SOCKET__NODE_BIND;
                        break;
                }

                /* sid is reused here for the SID of the address (node). */
                err = sel_netnode_sid(addrp, family_sa, &sid);
                if (err)
                        goto out;

                /* Record the address being bound in the audit data. */
                if (family_sa == AF_INET)
                        ad.u.net->v4info.saddr = addr4->sin_addr.s_addr;
                else
                        ad.u.net->v6info.saddr = addr6->sin6_addr;

                err = avc_has_perm(sksec->sid, sid,
                                   sksec->sclass, node_perm, &ad);
                if (err)
                        goto out;
        }
out:
        /* err is 0 here unless a lookup or permission check failed. */
        return err;
err_af:
        /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */
        if (sk->sk_protocol == IPPROTO_SCTP)
                return -EINVAL;
        return -EAFNOSUPPORT;
}

/*
 * Permission checks shared by connect(2) and the SCTP connect services such
 * as sctp_connectx(3) and sctp_sendmsg(3), as described in
 * Documentation/security/SCTP.rst.
 *
 * The connect permission on the socket is always checked.  For TCP and SCTP
 * sockets connecting to an AF_INET or AF_INET6 address, a name_connect check
 * against the SID of the destination port follows.
 *
 * Returns 0 if permitted, -EINVAL for a too-short address, -EINVAL (SCTP)
 * or -EAFNOSUPPORT (TCP) for an unsupported address family, or the error
 * from a failed SID lookup or permission check.
 */
static int selinux_socket_connect_helper(struct socket *sock,
                                         struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = selinux_sock(sk);
        struct common_audit_data ad;
        struct lsm_network_audit net = {0,};
        unsigned short port;
        u32 port_sid, perm;
        int rc;

        rc = sock_has_perm(sk, SOCKET__CONNECT);
        if (rc)
                return rc;
        if (addrlen < offsetofend(struct sockaddr, sa_family))
                return -EINVAL;

        /* connect(AF_UNSPEC) has special handling, as it is a documented
         * way to disconnect the socket
         */
        if (address->sa_family == AF_UNSPEC)
                return 0;

        /* Only TCP and SCTP sockets get the name_connect port check. */
        if (sksec->sclass != SECCLASS_TCP_SOCKET &&
            sksec->sclass != SECCLASS_SCTP_SOCKET)
                return 0;

        /* sctp_connectx(3) calls via selinux_sctp_bind_connect()
         * that validates multiple connect addresses. Because of this
         * need to check address->sa_family as it is possible to have
         * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
         */
        switch (address->sa_family) {
        case AF_INET:
                if (addrlen < sizeof(struct sockaddr_in))
                        return -EINVAL;
                port = ntohs(((struct sockaddr_in *)address)->sin_port);
                break;
        case AF_INET6:
                if (addrlen < SIN6_LEN_RFC2133)
                        return -EINVAL;
                port = ntohs(((struct sockaddr_in6 *)address)->sin6_port);
                break;
        default:
                /* Note that SCTP services expect -EINVAL, whereas
                 * others expect -EAFNOSUPPORT.
                 */
                return sksec->sclass == SECCLASS_SCTP_SOCKET ?
                        -EINVAL : -EAFNOSUPPORT;
        }

        rc = sel_netport_sid(sk->sk_protocol, port, &port_sid);
        if (rc)
                return rc;

        /* The class is TCP or SCTP at this point, see the guard above. */
        if (sksec->sclass == SECCLASS_TCP_SOCKET)
                perm = TCP_SOCKET__NAME_CONNECT;
        else
                perm = SCTP_SOCKET__NAME_CONNECT;

        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;
        ad.u.net->dport = htons(port);
        ad.u.net->family = address->sa_family;

        return avc_has_perm(sksec->sid, port_sid, sksec->sclass, perm, &ad);
}

/*
 * Supports connect(2): run the checks in selinux_socket_connect_helper()
 * and, if they pass, hand the result of selinux_netlbl_socket_connect()
 * back to the caller.
 */
static int selinux_socket_connect(struct socket *sock,
                                  struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        int rc;

        rc = selinux_socket_connect_helper(sock, address, addrlen);

        return rc ? rc : selinux_netlbl_socket_connect(sk, address);
}

/* Check the listen permission on the socket; @backlog is not examined. */
static int selinux_socket_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__LISTEN);
}

/*
 * Check the accept permission on @sock and, if granted, copy the class and
 * SID from its inode security blob to that of @newsock, marking the new
 * label as initialized.
 *
 * Returns 0 on success or the error from sock_has_perm().
 */
static int selinux_socket_accept(struct socket *sock, struct socket *newsock)
{
        struct inode_security_struct *src;
        struct inode_security_struct *dst;
        u16 sclass;
        u32 sid;
        int rc;

        rc = sock_has_perm(sock->sk, SOCKET__ACCEPT);
        if (rc)
                return rc;

        /* Snapshot the listening socket's label under its lock. */
        src = inode_security_novalidate(SOCK_INODE(sock));
        spin_lock(&src->lock);
        sclass = src->sclass;
        sid = src->sid;
        spin_unlock(&src->lock);

        dst = inode_security_novalidate(SOCK_INODE(newsock));
        dst->sclass = sclass;
        dst->sid = sid;
        dst->initialized = LABEL_INITIALIZED;

        return 0;
}

/* Check the write permission on the socket; @msg and @size are not examined. */
static int selinux_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                  int size)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__WRITE);
}

/*
 * Check the read permission on the socket; @msg, @size and @flags are not
 * examined.
 */
static int selinux_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                                  int size, int flags)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__READ);
}

/* Check the getattr permission on the socket. */
static int selinux_socket_getsockname(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETATTR);
}

/* Check the getattr permission on the socket (same check as getsockname). */
static int selinux_socket_getpeername(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETATTR);
}

/*
 * Check the setopt permission on the socket, then let
 * selinux_netlbl_socket_setsockopt() judge the specific option.
 *
 * Returns 0 if both checks pass, otherwise the first error encountered.
 */
static int selinux_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc = sock_has_perm(sock->sk, SOCKET__SETOPT);

        return rc ? rc : selinux_netlbl_socket_setsockopt(sock, level, optname);
}

/*
 * Check the getopt permission on the socket; @level and @optname are not
 * examined.
 */
static int selinux_socket_getsockopt(struct socket *sock, int level,
                                     int optname)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETOPT);
}

/* Check the shutdown permission on the socket; @how is not examined. */
static int selinux_socket_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__SHUTDOWN);
}

/*
 * Check whether @sock may connect to the UNIX stream socket @other and, if
 * so, label the new socket @newsk and record the peer SIDs on both ends.
 *
 * Returns 0 on success, or the error from the connectto permission check
 * or from security_sid_mls_copy().
 */
static int selinux_socket_unix_stream_connect(struct sock *sock,
                                              struct sock *other,
                                              struct sock *newsk)
{
        struct sk_security_struct *conn = selinux_sock(sock);
        struct sk_security_struct *peer = selinux_sock(other);
        struct sk_security_struct *child = selinux_sock(newsk);
        struct common_audit_data ad;
        struct lsm_network_audit net;
        int rc;

        ad_net_init_from_sk(&ad, &net, other);

        rc = avc_has_perm(conn->sid, peer->sid, peer->sclass,
                          UNIX_STREAM_SOCKET__CONNECTTO, &ad);
        if (rc)
                return rc;

        /* server child socket */
        child->peer_sid = conn->sid;
        rc = security_sid_mls_copy(peer->sid, conn->sid, &child->sid);
        if (rc)
                return rc;

        /* connecting socket */
        conn->peer_sid = child->sid;

        return 0;
}

/*
 * Check the sendto permission between @sock (source SID) and @other
 * (target SID and class).  Returns the result of avc_has_perm().
 */
static int selinux_socket_unix_may_send(struct socket *sock,
                                        struct socket *other)
{
        struct sk_security_struct *sender = selinux_sock(sock->sk);
        struct sk_security_struct *target = selinux_sock(other->sk);
        struct lsm_network_audit net;
        struct common_audit_data ad;

        ad_net_init_from_sk(&ad, &net, other->sk);

        return avc_has_perm(sender->sid, target->sid, target->sclass,
                            SOCKET__SENDTO, &ad);
}

/*
 * Check that @peer_sid may receive through the interface @ifindex (netif
 * ingress) and from the address @addrp (node recvfrom).
 *
 * Returns 0 if both checks pass, otherwise the first error from a SID
 * lookup or permission check.
 */
static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex,
                                    char *addrp, u16 family, u32 peer_sid,
                                    struct common_audit_data *ad)
{
        u32 netif_sid;
        u32 netnode_sid;
        int rc;

        /* Interface check first ... */
        rc = sel_netif_sid(ns, ifindex, &netif_sid);
        if (rc)
                return rc;

        rc = avc_has_perm(peer_sid, netif_sid, SECCLASS_NETIF,
                          NETIF__INGRESS, ad);
        if (rc)
                return rc;

        /* ... then the node (address) check. */
        rc = sel_netnode_sid(addrp, family, &netnode_sid);
        if (rc)
                return rc;

        return avc_has_perm(peer_sid, netnode_sid, SECCLASS_NODE,
                            NODE__RECVFROM, ad);
}

/*
 * Receive-path checks used by selinux_socket_sock_rcv_skb() when the
 * netpeer policy capability is not enabled: an optional secmark packet
 * recv check, followed by the NetLabel and XFRM receive checks.
 *
 * Returns 0 if all checks pass, otherwise the first error encountered.
 */
static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
                                       u16 family)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        u32 sk_sid = sksec->sid;
        struct lsm_network_audit net;
        struct common_audit_data ad;
        char *addrp;
        int rc;

        ad_net_init_from_iif(&ad, &net, skb->skb_iif, family);
        rc = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (rc)
                return rc;

        if (selinux_secmark_enabled()) {
                rc = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
                                  PACKET__RECV, &ad);
                if (rc)
                        return rc;
        }

        rc = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad);
        if (rc)
                return rc;

        return selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
}

/*
 * Check whether the socket may receive the packet @skb.
 *
 * Only PF_INET and PF_INET6 sockets are checked; every other family is
 * allowed.  Without the netpeer policy capability the work is delegated to
 * selinux_sock_rcv_skb_compat().  Otherwise the peer label checks (netif,
 * node and peer recv) and the secmark packet recv check are each applied
 * when the corresponding mechanism is enabled.
 *
 * Returns 0 if the packet is allowed, otherwise the first error encountered.
 */
static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 family = sk->sk_family;
        u32 sk_sid = sksec->sid;
        struct lsm_network_audit net;
        struct common_audit_data ad;
        int rc, check_peer, check_secmark;
        char *addrp;

        switch (family) {
        case PF_INET:
                break;
        case PF_INET6:
                /* Handle mapped IPv4 packets arriving via IPv6 sockets */
                if (skb->protocol == htons(ETH_P_IP))
                        family = PF_INET;
                break;
        default:
                return 0;
        }

        /* If any sort of compatibility mode is enabled then handoff processing
         * to the selinux_sock_rcv_skb_compat() function to deal with the
         * special handling.  We do this in an attempt to keep this function
         * as fast and as clean as possible. */
        if (!selinux_policycap_netpeer())
                return selinux_sock_rcv_skb_compat(sk, skb, family);

        check_secmark = selinux_secmark_enabled();
        check_peer = selinux_peerlbl_enabled();
        if (!check_secmark && !check_peer)
                return 0;

        ad_net_init_from_iif(&ad, &net, skb->skb_iif, family);
        rc = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (rc)
                return rc;

        if (check_peer) {
                u32 peer_sid;

                rc = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
                if (rc)
                        return rc;

                rc = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif,
                                              addrp, family, peer_sid, &ad);
                if (!rc)
                        rc = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
                                          PEER__RECV, &ad);
                /* Either failed check is reported via selinux_netlbl_err(). */
                if (rc) {
                        selinux_netlbl_err(skb, family, rc, 0);
                        return rc;
                }
        }

        if (!check_secmark)
                return 0;

        return avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
                            PACKET__RECV, &ad);
}

/*
 * Copy the peer's security context string to @optval and its length to
 * @optlen.
 *
 * A peer SID is only available for UNIX stream, TCP and SCTP sockets.  The
 * context length is written to @optlen even when the caller's buffer of
 * @len bytes is too small, in which case -ERANGE is returned.
 *
 * Returns 0 on success, -ENOPROTOOPT if there is no peer SID, -ERANGE if
 * the buffer is too small, -EFAULT if a copy fails, or the error from
 * security_sid_to_context().
 */
static int selinux_socket_getpeersec_stream(struct socket *sock,
                                            sockptr_t optval, sockptr_t optlen,
                                            unsigned int len)
{
        struct sk_security_struct *sksec = selinux_sock(sock->sk);
        u32 peer_sid = SECSID_NULL;
        char *ctx = NULL;
        u32 ctx_len;
        int rc;

        switch (sksec->sclass) {
        case SECCLASS_UNIX_STREAM_SOCKET:
        case SECCLASS_TCP_SOCKET:
        case SECCLASS_SCTP_SOCKET:
                peer_sid = sksec->peer_sid;
                break;
        default:
                break;
        }
        if (peer_sid == SECSID_NULL)
                return -ENOPROTOOPT;

        rc = security_sid_to_context(peer_sid, &ctx, &ctx_len);
        if (rc)
                return rc;

        if (ctx_len > len)
                rc = -ERANGE;
        else if (copy_to_sockptr(optval, ctx, ctx_len))
                rc = -EFAULT;

        /* The length is reported on both the success and -ERANGE paths. */
        if (copy_to_sockptr(optlen, &ctx_len, sizeof(ctx_len)))
                rc = -EFAULT;

        kfree(ctx);
        return rc;
}

/*
 * Determine the peer SID for a datagram and store it in @secid.
 *
 * The address family is taken from the packet's protocol when @skb is an
 * IPv4 or IPv6 packet, otherwise from @sock.  For PF_UNIX sockets the SID
 * comes from the socket inode's security blob; otherwise it is looked up
 * from the packet's peer label.
 *
 * Returns 0 on success, -EINVAL if the family cannot be determined, or
 * -ENOPROTOOPT if no peer SID was found.
 */
static int selinux_socket_getpeersec_dgram(struct socket *sock,
                                           struct sk_buff *skb, u32 *secid)
{
        u32 sid = SECSID_NULL;
        u16 family;

        if (skb && skb->protocol == htons(ETH_P_IP)) {
                family = PF_INET;
        } else if (skb && skb->protocol == htons(ETH_P_IPV6)) {
                family = PF_INET6;
        } else if (sock) {
                family = sock->sk->sk_family;
        } else {
                *secid = SECSID_NULL;
                return -EINVAL;
        }

        if (sock && family == PF_UNIX) {
                struct inode_security_struct *isec =
                        inode_security_novalidate(SOCK_INODE(sock));

                sid = isec->sid;
        } else if (skb) {
                /* The return value is not checked; only the SID is used. */
                selinux_skb_peerlbl_sid(skb, family, &sid);
        }

        *secid = sid;

        return sid == SECSID_NULL ? -ENOPROTOOPT : 0;
}

/*
 * Initialize the security blob of a sock: the generic socket class, the
 * unlabeled initial SID for both the sock and its peer, and a reset
 * NetLabel state.  @family and @priority are not used.  Always returns 0.
 */
static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
{
        struct sk_security_struct *blob;

        blob = selinux_sock(sk);
        blob->peer_sid = SECINITSID_UNLABELED;
        blob->sid = SECINITSID_UNLABELED;
        blob->sclass = SECCLASS_SOCKET;

        selinux_netlbl_sk_security_reset(blob);

        return 0;
}

/* Release the NetLabel state held in the sock's security blob. */
static void selinux_sk_free_security(struct sock *sk)
{
        selinux_netlbl_sk_security_free(selinux_sock(sk));
}

/*
 * Copy the SID, peer SID and class from @sk's security blob to @newsk's,
 * then reset the NetLabel state of the copy.
 */
static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk)
{
        struct sk_security_struct *src = selinux_sock(sk);
        struct sk_security_struct *dst = selinux_sock(newsk);

        dst->sid = src->sid;
        dst->peer_sid = src->peer_sid;
        dst->sclass = src->sclass;

        selinux_netlbl_sk_security_reset(dst);
}

/*
 * Store the SID of @sk in @secid; a NULL @sk yields SECINITSID_ANY_SOCKET.
 */
static void selinux_sk_getsecid(const struct sock *sk, u32 *secid)
{
        const struct sk_security_struct *sksec;

        if (!sk) {
                *secid = SECINITSID_ANY_SOCKET;
                return;
        }

        sksec = selinux_sock(sk);
        *secid = sksec->sid;
}

/*
 * Synchronize the labels of @sk and the inode of its @parent socket: for
 * PF_INET, PF_INET6 and PF_UNIX the inode takes the sock's SID, and in all
 * cases the sock takes the inode's class.
 */
static void selinux_sock_graft(struct sock *sk, struct socket *parent)
{
        struct inode_security_struct *isec;
        struct sk_security_struct *sksec;

        isec = inode_security_novalidate(SOCK_INODE(parent));
        sksec = selinux_sock(sk);

        switch (sk->sk_family) {
        case PF_INET:
        case PF_INET6:
        case PF_UNIX:
                isec->sid = sksec->sid;
                break;
        default:
                break;
        }

        sksec->sclass = isec->sclass;
}

/*
 * Determines peer_secid for the asoc and updates socket's peer label
 * if it's the first association on the socket.
 *
 * For later associations whose peer SID differs from the socket's, the
 * SCTP association permission is checked between the two peer SIDs.
 *
 * Returns 0 on success, or the error from the peer label lookup or the
 * permission check.
 */
static int selinux_sctp_process_new_assoc(struct sctp_association *asoc,
                                          struct sk_buff *skb)
{
        struct sock *sk = asoc->base.sk;
        u16 family = sk->sk_family;
        struct sk_security_struct *sksec = selinux_sock(sk);
        struct lsm_network_audit net;
        struct common_audit_data ad;
        int rc;

        /* handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;

        if (!selinux_peerlbl_enabled()) {
                asoc->peer_secid = SECINITSID_UNLABELED;
        } else {
                asoc->peer_secid = SECSID_NULL;

                /* This will return peer_sid = SECSID_NULL if there are
                 * no peer labels, see security_net_peersid_resolve().
                 */
                rc = selinux_skb_peerlbl_sid(skb, family, &asoc->peer_secid);
                if (rc)
                        return rc;

                if (asoc->peer_secid == SECSID_NULL)
                        asoc->peer_secid = SECINITSID_UNLABELED;
        }

        if (sksec->sctp_assoc_state == SCTP_ASSOC_UNSET) {
                sksec->sctp_assoc_state = SCTP_ASSOC_SET;

                /* Here as first association on socket. As the peer SID
                 * was allowed by peer recv (and the netif/node checks),
                 * then it is approved by policy and used as the primary
                 * peer SID for getpeercon(3).
                 */
                sksec->peer_sid = asoc->peer_secid;
                return 0;
        }

        if (sksec->peer_sid == asoc->peer_secid)
                return 0;

        /* Other association peer SIDs are checked to enforce
         * consistency among the peer SIDs.
         */
        ad_net_init_from_sk(&ad, &net, sk);

        return avc_has_perm(sksec->peer_sid, asoc->peer_secid,
                            sksec->sclass, SCTP_SOCKET__ASSOCIATION, &ad);
}

/* Called whenever SCTP receives an INIT or COOKIE ECHO chunk. This
 * happens on an incoming connect(2), sctp_connectx(3) or
 * sctp_sendmsg(3) (with no association already present).
 *
 * Does nothing and returns 0 when the extsockclass policy capability is
 * off; otherwise returns 0 on success or the first error encountered.
 */
static int selinux_sctp_assoc_request(struct sctp_association *asoc,
                                      struct sk_buff *skb)
{
        struct sk_security_struct *sksec = selinux_sock(asoc->base.sk);
        u32 new_sid;
        int rc;

        if (!selinux_policycap_extsockclass())
                return 0;

        rc = selinux_sctp_process_new_assoc(asoc, skb);
        if (rc)
                return rc;

        /* Compute the MLS component for the connection and store
         * the information in asoc. This will be used by SCTP TCP type
         * sockets and peeled off connections as they cause a new
         * socket to be generated. selinux_sctp_sk_clone() will then
         * plug this into the new socket.
         */
        rc = selinux_conn_sid(sksec->sid, asoc->peer_secid, &new_sid);
        if (rc)
                return rc;
        asoc->secid = new_sid;

        /* Set any NetLabel labels including CIPSO/CALIPSO options. */
        return selinux_netlbl_sctp_assoc_request(asoc, skb);
}

/* Called when SCTP receives a COOKIE ACK chunk as the final
 * response to an association request (initiated by us).
 *
 * Does nothing and returns 0 when the extsockclass policy capability is
 * off; otherwise returns the result of selinux_sctp_process_new_assoc().
 */
static int selinux_sctp_assoc_established(struct sctp_association *asoc,
                                          struct sk_buff *skb)
{
        struct sk_security_struct *parent = selinux_sock(asoc->base.sk);

        if (selinux_policycap_extsockclass()) {
                /* Inherit secid from the parent socket - this will be
                 * picked up by selinux_sctp_sk_clone() if the association
                 * gets peeled off into a new socket.
                 */
                asoc->secid = parent->sid;

                return selinux_sctp_process_new_assoc(asoc, skb);
        }

        return 0;
}

/* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting
 * based on their @optname.
 *
 * @address points to @addrlen bytes holding one or more socket addresses
 * laid out back to back.  Each entry is sized by its sa_family: AF_UNSPEC
 * and AF_INET entries as struct sockaddr_in, AF_INET6 entries as
 * struct sockaddr_in6.  Every entry is passed to the bind or the connect
 * checks according to @optname.
 *
 * Returns 0 if every address passes, or if the extsockclass policy
 * capability is off.  Returns -EINVAL for an unsupported address family,
 * an entry that does not fit in @addrlen, or an @optname that is not
 * handled here; otherwise the error from the failing check.
 */
static int selinux_sctp_bind_connect(struct sock *sk, int optname,
                                     struct sockaddr *address,
                                     int addrlen)
{
        int len, err = 0, walk_size = 0;
        void *addr_buf;
        struct sockaddr *addr;
        struct socket *sock;

        if (!selinux_policycap_extsockclass())
                return 0;

        /* Process one or more addresses that may be IPv4 or IPv6 */
        sock = sk->sk_socket;
        addr_buf = address;

        while (walk_size < addrlen) {
                /* There must be room to read this entry's sa_family. */
                if (walk_size + sizeof(sa_family_t) > addrlen)
                        return -EINVAL;

                addr = addr_buf;
                /* The address family determines the size of this entry. */
                switch (addr->sa_family) {
                case AF_UNSPEC:
                case AF_INET:
                        len = sizeof(struct sockaddr_in);
                        break;
                case AF_INET6:
                        len = sizeof(struct sockaddr_in6);
                        break;
                default:
                        return -EINVAL;
                }

                /* The whole entry must lie within the supplied buffer. */
                if (walk_size + len > addrlen)
                        return -EINVAL;

                /* Stays -EINVAL if @optname matches none of the cases below. */
                err = -EINVAL;
                switch (optname) {
                /* Bind checks */
                case SCTP_PRIMARY_ADDR:
                case SCTP_SET_PEER_PRIMARY_ADDR:
                case SCTP_SOCKOPT_BINDX_ADD:
                        err = selinux_socket_bind(sock, addr, len);
                        break;
                /* Connect checks */
                case SCTP_SOCKOPT_CONNECTX:
                case SCTP_PARAM_SET_PRIMARY:
                case SCTP_PARAM_ADD_IP:
                case SCTP_SENDMSG_CONNECT:
                        err = selinux_socket_connect_helper(sock, addr, len);
                        if (err)
                                return err;

                        /* As selinux_sctp_bind_connect() is called by the
                         * SCTP protocol layer, the socket is already locked,
                         * therefore selinux_netlbl_socket_connect_locked()
                         * is called here. The situations handled are:
                         * sctp_connectx(3), sctp_sendmsg(3), sendmsg(2),
                         * whenever a new IP address is added or when a new
                         * primary address is selected.
                         * Note that an SCTP connect(2) call happens before
                         * the SCTP protocol layer and is handled via
                         * selinux_socket_connect().
                         */
                        err = selinux_netlbl_socket_connect_locked(sk, addr);
                        break;
                }

                if (err)
                        return err;

                /* Advance to the next address in the buffer. */
                addr_buf += len;
                walk_size += len;
        }

        return 0;
}

/* Called whenever a new socket is created by accept(2) or sctp_peeloff(3).
 *
 * The new socket takes its SID and peer SID from the association and its
 * class from the original socket.
 */
static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
                                  struct sock *newsk)
{
        struct sk_security_struct *oldsec = selinux_sock(sk);
        struct sk_security_struct *newsec = selinux_sock(newsk);

        /* If policy does not support SECCLASS_SCTP_SOCKET then call
         * the non-sctp clone version.
         */
        if (!selinux_policycap_extsockclass()) {
                selinux_sk_clone_security(sk, newsk);
                return;
        }

        newsec->sid = asoc->secid;
        newsec->peer_sid = asoc->peer_secid;
        newsec->sclass = oldsec->sclass;

        selinux_netlbl_sctp_sk_clone(sk, newsk);
}

/*
 * Give the subflow socket @ssk the class and SID of the socket @sk, then
 * rebuild its NetLabel state from the updated label.
 *
 * Returns the result of selinux_netlbl_socket_post_create().
 */
static int selinux_mptcp_add_subflow(struct sock *sk, struct sock *ssk)
{
        struct sk_security_struct *subflow = selinux_sock(ssk);
        struct sk_security_struct *owner = selinux_sock(sk);

        subflow->sclass = owner->sclass;
        subflow->sid = owner->sid;

        /* replace the existing subflow label deleting the existing one
         * and re-recreating a new label using the updated context
         */
        selinux_netlbl_sk_security_free(subflow);

        return selinux_netlbl_socket_post_create(ssk, ssk->sk_family);
}

/*
 * Compute the labels for an incoming connection request: the peer SID is
 * taken from the packet and the connection SID is derived from the
 * listening socket's SID and that peer SID.  Both are stored in @req.
 *
 * Returns 0 on success, or the first error from the lookups or from
 * selinux_netlbl_inet_conn_request().
 */
static int selinux_inet_conn_request(const struct sock *sk, struct sk_buff *skb,
                                     struct request_sock *req)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 family = req->rsk_ops->family;
        u32 peer_sid;
        u32 conn_sid;
        int rc;

        rc = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
        if (rc)
                return rc;

        rc = selinux_conn_sid(sksec->sid, peer_sid, &conn_sid);
        if (rc)
                return rc;

        req->secid = conn_sid;
        req->peer_secid = peer_sid;

        return selinux_netlbl_inet_conn_request(req, family);
}

/*
 * Label the new socket @newsk with the SID and peer SID stored in the
 * connection request @req, then set up its NetLabel state.
 */
static void selinux_inet_csk_clone(struct sock *newsk,
                                   const struct request_sock *req)
{
        struct sk_security_struct *child;

        child = selinux_sock(newsk);
        child->sid = req->secid;
        child->peer_sid = req->peer_secid;
        /* NOTE: Ideally, we should also get the isec->sid for the
         * new socket in sync, but we don't have the isec available yet.
         * So we will wait until sock_graft to do it, by which
         * time it will have been created and available.
         */

        /* We don't need to take any sort of lock here as we are the only
         * thread with access to newsksec
         */
        selinux_netlbl_inet_csk_clone(newsk, req->rsk_ops->family);
}

/*
 * Record the peer SID derived from @skb's peer label in the socket's
 * security blob.  The lookup's return value is not checked.
 */
static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 family = sk->sk_family;

        /* handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;

        selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
}

/*
 * Check whether the current task's SID holds the packet relabelto
 * permission for @sid.  Returns the result of avc_has_perm().
 */
static int selinux_secmark_relabel_packet(u32 sid)
{
        u32 task_sid = current_sid();

        return avc_has_perm(task_sid, sid, SECCLASS_PACKET,
                            PACKET__RELABELTO, NULL);
}

/* Atomically increment the selinux_secmark_refcount counter. */
static void selinux_secmark_refcount_inc(void)
{
        atomic_inc(&selinux_secmark_refcount);
}

/* Atomically decrement the selinux_secmark_refcount counter. */
static void selinux_secmark_refcount_dec(void)
{
        atomic_dec(&selinux_secmark_refcount);
}

/* Copy the SID stored in the connection request @req into the flow @flic. */
static void selinux_req_classify_flow(const struct request_sock *req,
                                      struct flowi_common *flic)
{
        flic->flowic_secid = req->secid;
}

/*
 * Initialize a TUN device security blob with the current task's SID.
 * Always returns 0.
 */
static int selinux_tun_dev_alloc_security(void *security)
{
        struct tun_security_struct *tunsec;

        tunsec = selinux_tun_dev(security);
        tunsec->sid = current_sid();

        return 0;
}

/*
 * Check TUN_SOCKET__CREATE for the current task against its own SID.
 *
 * The "sockcreate" SID is deliberately not consulted: the socket created
 * here is not a socket in the traditional sense.  It is a private sock,
 * accessible only to the kernel, that represents a wide range of network
 * traffic spanning multiple connections - see the TUN driver for why this
 * socket is special.
 */
static int selinux_tun_dev_create(void)
{
        const u32 self = current_sid();

        return avc_has_perm(self, self, SECCLASS_TUN_SOCKET,
                            TUN_SOCKET__CREATE, NULL);
}

static int selinux_tun_dev_attach_queue(void *security)
{
        struct tun_security_struct *tunsec = selinux_tun_dev(security);

        return avc_has_perm(current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET,
                            TUN_SOCKET__ATTACH_QUEUE, NULL);
}

/*
 * Label @sk as a TUN socket carrying the SID stored in the TUN device
 * blob.  Always returns 0.
 *
 * No NetLabel based labeling is performed here, and it is not clear that
 * it should be: labeling could be applied without the TUN user's
 * cooperation, but the labeled traffic coming back from the other end
 * would almost certainly confuse a TUN user that had no idea network
 * labeling protocols were in use.
 */
static int selinux_tun_dev_attach(struct sock *sk, void *security)
{
        struct tun_security_struct *dev_blob = selinux_tun_dev(security);
        struct sk_security_struct *sock_blob = selinux_sock(sk);

        sock_blob->sclass = SECCLASS_TUN_SOCKET;
        sock_blob->sid = dev_blob->sid;

        return 0;
}

/*
 * Relabel the TUN device blob to the current task's SID.
 *
 * Requires RELABELFROM on the device's present SID and RELABELTO on the
 * task's own SID; the blob is only rewritten when both checks pass.
 * Returns 0 on success or the failing avc_has_perm() result.
 */
static int selinux_tun_dev_open(void *security)
{
        struct tun_security_struct *tunsec = selinux_tun_dev(security);
        u32 self = current_sid();
        int rc;

        rc = avc_has_perm(self, tunsec->sid, SECCLASS_TUN_SOCKET,
                          TUN_SOCKET__RELABELFROM, NULL);
        if (!rc)
                rc = avc_has_perm(self, self, SECCLASS_TUN_SOCKET,
                                  TUN_SOCKET__RELABELTO, NULL);
        if (!rc)
                tunsec->sid = self;

        return rc;
}

#ifdef CONFIG_NETFILTER

/*
 * Netfilter hook for forwarded packets.
 *
 * When the netpeer policy capability is on and at least one of secmark
 * or peer labeling is active, the packet's peer label is checked against
 * the inbound interface/node (peer labeling) and against the packet's
 * secmark (PACKET__FORWARD_IN).  With NetLabel enabled the outgoing
 * label is then applied to the skb.  Any failure drops the packet.
 */
static unsigned int selinux_ip_forward(void *priv, struct sk_buff *skb,
                                       const struct nf_hook_state *state)
{
        struct common_audit_data ad;
        struct lsm_network_audit net;
        char *addrp;
        u32 peer_sid;
        u16 family;
        int ifindex;
        int use_secmark, use_peerlbl;
        int err;

        if (!selinux_policycap_netpeer())
                return NF_ACCEPT;

        use_secmark = selinux_secmark_enabled();
        use_peerlbl = selinux_peerlbl_enabled();
        if (!(use_secmark || use_peerlbl))
                return NF_ACCEPT;

        family = state->pf;
        if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
                return NF_DROP;

        ifindex = state->in->ifindex;
        ad_net_init_from_iif(&ad, &net, ifindex, family);
        if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL))
                return NF_DROP;

        if (use_peerlbl) {
                err = selinux_inet_sys_rcv_skb(state->net, ifindex,
                                               addrp, family, peer_sid, &ad);
                if (err) {
                        selinux_netlbl_err(skb, family, err, 1);
                        return NF_DROP;
                }
        }

        if (use_secmark &&
            avc_has_perm(peer_sid, skb->secmark,
                         SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
                return NF_DROP;

        /* Labeling happens on the FORWARD path rather than POST_ROUTING
         * so that it is in place before IPsec is applied, which lets us
         * leverage AH protection. */
        if (netlbl_enabled() &&
            selinux_netlbl_skbuff_setsid(skb, family, peer_sid))
                return NF_DROP;

        return NF_ACCEPT;
}

static unsigned int selinux_ip_output(void *priv, struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        struct sock *sk;
        u32 sid;

        if (!netlbl_enabled())
                return NF_ACCEPT;

        /* we do this in the LOCAL_OUT path and not the POST_ROUTING path
         * because we want to make sure we apply the necessary labeling
         * before IPsec is applied so we can leverage AH protection */
        sk = skb_to_full_sk(skb);
        if (sk) {
                struct sk_security_struct *sksec;

                if (sk_listener(sk))
                        /* if the socket is the listening state then this
                         * packet is a SYN-ACK packet which means it needs to
                         * be labeled based on the connection/request_sock and
                         * not the parent socket.  unfortunately, we can't
                         * lookup the request_sock yet as it isn't queued on
                         * the parent socket until after the SYN-ACK is sent.
                         * the "solution" is to simply pass the packet as-is
                         * as any IP option based labeling should be copied
                         * from the initial connection request (in the IP
                         * layer).  it is far from ideal, but until we get a
                         * security label in the packet itself this is the
                         * best we can do. */
                        return NF_ACCEPT;

                /* standard practice, label using the parent socket */
                sksec = selinux_sock(sk);
                sid = sksec->sid;
        } else
                sid = SECINITSID_KERNEL;
        if (selinux_netlbl_skbuff_setsid(skb, state->pf, sid) != 0)
                return NF_DROP;

        return NF_ACCEPT;
}


/*
 * Post-routing checks used when the netpeer policy capability is off.
 *
 * Packets without a socket are accepted.  Otherwise the socket's SID is
 * checked against the packet's secmark (PACKET__SEND, only when secmark
 * is enabled) and handed to selinux_xfrm_postroute_last().
 */
static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
                                        const struct nf_hook_state *state)
{
        struct common_audit_data ad;
        struct lsm_network_audit net;
        struct sk_security_struct *sksec;
        struct sock *sk = skb_to_full_sk(skb);
        u8 proto = 0;

        if (!sk)
                return NF_ACCEPT;
        sksec = selinux_sock(sk);

        ad_net_init_from_iif(&ad, &net, state->out->ifindex, state->pf);
        if (selinux_parse_skb(skb, &ad, NULL, 0, &proto) != 0)
                return NF_DROP;

        if (selinux_secmark_enabled() &&
            avc_has_perm(sksec->sid, skb->secmark,
                         SECCLASS_PACKET, PACKET__SEND, &ad))
                return NF_DROP_ERR(-ECONNREFUSED);

        if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto))
                return NF_DROP_ERR(-ECONNREFUSED);

        return NF_ACCEPT;
}

/*
 * Netfilter post-routing hook.
 *
 * Determines the SID that is "sending" the packet (forwarded peer label,
 * kernel SID, regenerated connection SID for SYN-ACKs from a listener, or
 * the owning socket's SID) and then checks it against:
 *   - the packet's secmark, when secmark is enabled
 *     (PACKET__FORWARD_OUT for forwarded packets, PACKET__SEND otherwise);
 *   - the outgoing interface (NETIF__EGRESS) and destination node
 *     (NODE__SENDTO), when peer labeling is enabled.
 *
 * Returns NF_ACCEPT when the packet may pass, NF_DROP when a label or
 * packet lookup fails, and NF_DROP_ERR(-ECONNREFUSED) when an access
 * check denies the packet.
 */
static unsigned int selinux_ip_postroute(void *priv,
                                         struct sk_buff *skb,
                                         const struct nf_hook_state *state)
{
        u16 family;
        u32 secmark_perm;
        u32 peer_sid;
        int ifindex;
        struct sock *sk;
        struct common_audit_data ad;
        struct lsm_network_audit net;
        char *addrp;
        int secmark_active, peerlbl_active;

        /* If any sort of compatibility mode is enabled then hand off
         * processing to the selinux_ip_postroute_compat() function to deal
         * with the special handling.  We do this in an attempt to keep this
         * function as fast and as clean as possible. */
        if (!selinux_policycap_netpeer())
                return selinux_ip_postroute_compat(skb, state);

        /* Nothing to enforce when neither mechanism is in use. */
        secmark_active = selinux_secmark_enabled();
        peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return NF_ACCEPT;

        sk = skb_to_full_sk(skb);

#ifdef CONFIG_XFRM
        /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
         * packet transformation so allow the packet to pass without any checks
         * since we'll have another chance to perform access control checks
         * when the packet is on its final way out.
         * NOTE: there appear to be some IPv6 multicast cases where skb->dst
         *       is NULL, in this case go ahead and apply access control.
         * NOTE: if this is a local socket (skb->sk != NULL) that is in the
         *       TCP listening state we cannot wait until the XFRM processing
         *       is done as we will miss out on the SA label if we do;
         *       unfortunately, this means more work, but it is only once per
         *       connection. */
        if (skb_dst(skb) != NULL && skb_dst(skb)->xfrm != NULL &&
            !(sk && sk_listener(sk)))
                return NF_ACCEPT;
#endif

        family = state->pf;
        if (sk == NULL) {
                /* Without an associated socket the packet is either coming
                 * from the kernel or it is being forwarded; check the packet
                 * to determine which and if the packet is being forwarded
                 * query the packet directly to determine the security label. */
                if (skb->skb_iif) {
                        secmark_perm = PACKET__FORWARD_OUT;
                        if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
                                return NF_DROP;
                } else {
                        secmark_perm = PACKET__SEND;
                        peer_sid = SECINITSID_KERNEL;
                }
        } else if (sk_listener(sk)) {
                /* Locally generated packet but the associated socket is in the
                 * listening state which means this is a SYN-ACK packet.  In
                 * this particular case the correct security label is assigned
                 * to the connection/request_sock but unfortunately we can't
                 * query the request_sock as it isn't queued on the parent
                 * socket until after the SYN-ACK packet is sent; the only
                 * viable choice is to regenerate the label like we do in
                 * selinux_inet_conn_request().  See also selinux_ip_output()
                 * for similar problems. */
                u32 skb_sid;
                struct sk_security_struct *sksec;

                sksec = selinux_sock(sk);
                if (selinux_skb_peerlbl_sid(skb, family, &skb_sid))
                        return NF_DROP;
                /* At this point, if the returned skb peerlbl is SECSID_NULL
                 * and the packet has been through at least one XFRM
                 * transformation then we must be dealing with the "final"
                 * form of labeled IPsec packet; since we've already applied
                 * all of our access controls on this packet we can safely
                 * pass the packet. */
                if (skb_sid == SECSID_NULL) {
                        switch (family) {
                        case PF_INET:
                                if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
                                        return NF_ACCEPT;
                                break;
                        case PF_INET6:
                                if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
                                        return NF_ACCEPT;
                                break;
                        default:
                                /* Neither IPv4 nor IPv6: refuse the packet. */
                                return NF_DROP_ERR(-ECONNREFUSED);
                        }
                }
                /* Combine the listener's SID with the packet's peer label to
                 * obtain the SID used for the checks below. */
                if (selinux_conn_sid(sksec->sid, skb_sid, &peer_sid))
                        return NF_DROP;
                secmark_perm = PACKET__SEND;
        } else {
                /* Locally generated packet, fetch the security label from the
                 * associated socket. */
                struct sk_security_struct *sksec = selinux_sock(sk);
                peer_sid = sksec->sid;
                secmark_perm = PACKET__SEND;
        }

        /* Set up audit data for the outgoing interface and parse the
         * packet; addrp is filled in for the node lookup further down. */
        ifindex = state->out->ifindex;
        ad_net_init_from_iif(&ad, &net, ifindex, family);
        if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL))
                return NF_DROP;

        if (secmark_active)
                if (avc_has_perm(peer_sid, skb->secmark,
                                 SECCLASS_PACKET, secmark_perm, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);

        if (peerlbl_active) {
                u32 if_sid;
                u32 node_sid;

                /* Egress check against the outgoing interface's SID. */
                if (sel_netif_sid(state->net, ifindex, &if_sid))
                        return NF_DROP;
                if (avc_has_perm(peer_sid, if_sid,
                                 SECCLASS_NETIF, NETIF__EGRESS, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);

                /* Send check against the SID of the node at addrp. */
                if (sel_netnode_sid(addrp, family, &node_sid))
                        return NF_DROP;
                if (avc_has_perm(peer_sid, node_sid,
                                 SECCLASS_NODE, NODE__SENDTO, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);
        }

        return NF_ACCEPT;
}
#endif        /* CONFIG_NETFILTER */

/*
 * Check an extended (per message type) netlink permission on @sk.
 *
 * @sk:         netlink socket the message is being sent on
 * @perms:      base permission(s) mapped from the message type
 * @nlmsg_type: netlink message type being sent
 *
 * Returns 0 when sock_skip_has_perm() says the socket's SID needs no
 * check, otherwise the result of avc_has_extended_perms().
 */
static int nlmsg_sock_has_extended_perms(struct sock *sk, u32 perms, u16 nlmsg_type)
{
        /* Use the selinux_sock() accessor, as every other socket hook in
         * this file does, instead of reading sk->sk_security directly.
         * NOTE(review): selinux_sock() presumably locates the SELinux part
         * of a blob shared with other LSMs, in which case the raw pointer
         * would be the wrong structure - confirm against its definition. */
        struct sk_security_struct *sksec = selinux_sock(sk);
        struct common_audit_data ad;
        u8 driver;
        u8 xperm;

        if (sock_skip_has_perm(sksec->sid))
                return 0;

        ad.type = LSM_AUDIT_DATA_NLMSGTYPE;
        ad.u.nlmsg_type = nlmsg_type;

        /* Split the 16-bit message type: the high byte is passed as the
         * "driver", the low byte as the extended permission number. */
        driver = nlmsg_type >> 8;
        xperm = nlmsg_type & 0xff;

        return avc_has_extended_perms(current_sid(), sksec->sid, sksec->sclass,
                                      perms, driver, AVC_EXT_NLMSG, xperm, &ad);
}

/*
 * Check every netlink message contained in @skb against policy.
 *
 * Walks the messages in the skb one by one, maps each message type to a
 * permission for the socket's class via selinux_nlmsg_lookup(), and
 * checks that permission (with extended permissions when the
 * netlink_xperm policy capability is set).
 *
 * Returns 0 when all processed messages are allowed, or when processing
 * stops at a message with an implausible length; otherwise the first
 * error encountered.
 */
static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        int rc = 0;
        unsigned int msg_len;
        unsigned int data_len = skb->len;
        unsigned char *data = skb->data;
        struct nlmsghdr *nlh;
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 sclass = sksec->sclass;
        u32 perm;

        /* Keep going while at least a bare message header remains. */
        while (data_len >= nlmsg_total_size(0)) {
                nlh = (struct nlmsghdr *)data;

                /* NOTE: the nlmsg_len field isn't reliably set by some netlink
                 *       users which means we can't reject skb's with bogus
                 *       length fields; our solution is to follow what
                 *       netlink_rcv_skb() does and simply stop processing at
                 *       messages with length fields that are clearly junk
                 */
                if (nlh->nlmsg_len < NLMSG_HDRLEN || nlh->nlmsg_len > data_len)
                        return 0;

                rc = selinux_nlmsg_lookup(sclass, nlh->nlmsg_type, &perm);
                if (rc == 0) {
                        /* Known message type: enforce the mapped permission. */
                        if (selinux_policycap_netlink_xperm()) {
                                rc = nlmsg_sock_has_extended_perms(
                                        sk, perm, nlh->nlmsg_type);
                        } else {
                                rc = sock_has_perm(sk, perm);
                        }
                        if (rc)
                                return rc;
                } else if (rc == -EINVAL) {
                        /* -EINVAL is a missing msg/perm mapping */
                        pr_warn_ratelimited("SELinux: unrecognized netlink"
                                " message: protocol=%hu nlmsg_type=%hu sclass=%s"
                                " pid=%d comm=%s\n",
                                sk->sk_protocol, nlh->nlmsg_type,
                                secclass_map[sclass - 1].name,
                                task_pid_nr(current), current->comm);
                        /* Only fail when enforcing and the policy does not
                         * allow unknown permissions; otherwise let it pass. */
                        if (enforcing_enabled() &&
                            !security_get_allow_unknown())
                                return rc;
                        rc = 0;
                } else if (rc == -ENOENT) {
                        /* -ENOENT is a missing socket/class mapping, ignore */
                        rc = 0;
                } else {
                        return rc;
                }

                /* move to the next message after applying netlink padding */
                msg_len = NLMSG_ALIGN(nlh->nlmsg_len);
                /* The padded length consuming all remaining data means this
                 * was the last message; it also avoids underflowing data_len. */
                if (msg_len >= data_len)
                        return 0;
                data_len -= msg_len;
                data += msg_len;
        }

        return rc;
}

/*
 * Fill in an IPC security blob: the object takes the current task's SID
 * and the supplied security class.
 */
static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass)
{
        const u32 owner = current_sid();

        isec->sid = owner;
        isec->sclass = sclass;
}

/*
 * Check whether the current task holds @perms on the IPC object
 * @ipc_perms, using the class and SID stored in the object's blob.
 * The object's key is recorded in the audit data.
 */
static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
                        u32 perms)
{
        struct common_audit_data ad;
        u32 subj = current_sid();
        struct ipc_security_struct *obj = selinux_ipc(ipc_perms);

        ad.u.ipc_id = ipc_perms->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(subj, obj->sid, obj->sclass, perms, &ad);
}

/*
 * Start a new message out as unlabeled; selinux_msg_queue_msgsnd()
 * assigns a label to messages still carrying this SID.
 * Always returns 0.
 */
static int selinux_msg_msg_alloc_security(struct msg_msg *msg)
{
        struct msg_security_struct *blob = selinux_msg_msg(msg);

        blob->sid = SECINITSID_UNLABELED;
        return 0;
}

/* message queue security operations */
/*
 * Label a new message queue from the current task and check that the
 * task may create it (MSGQ__CREATE).
 */
static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
{
        struct common_audit_data ad;
        u32 creator = current_sid();
        struct ipc_security_struct *blob = selinux_ipc(msq);

        ipc_init_security(blob, SECCLASS_MSGQ);

        ad.u.ipc_id = msq->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(creator, blob->sid, SECCLASS_MSGQ,
                            MSGQ__CREATE, &ad);
}

/*
 * Check whether the current task may associate with an existing message
 * queue (MSGQ__ASSOCIATE).  @msqflg is not consulted.
 */
static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        struct common_audit_data ad;
        u32 subj = current_sid();
        struct ipc_security_struct *blob = selinux_ipc(msq);

        ad.u.ipc_id = msq->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(subj, blob->sid, SECCLASS_MSGQ,
                            MSGQ__ASSOCIATE, &ad);
}

/*
 * Map a msgctl() command to the permission it needs and check it.
 * Commands not listed here are allowed without a check.
 */
static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case MSG_INFO:
                /* System-wide information, not tied to a specific queue. */
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case IPC_STAT:
        case MSG_STAT:
        case MSG_STAT_ANY:
                return ipc_has_perm(msq, MSGQ__GETATTR | MSGQ__ASSOCIATE);
        case IPC_SET:
                return ipc_has_perm(msq, MSGQ__SETATTR);
        case IPC_RMID:
                return ipc_has_perm(msq, MSGQ__DESTROY);
        default:
                return 0;
        }
}

/*
 * Authorise sending @msg to queue @msq.
 *
 * A message that is still unlabeled first receives a SID computed from
 * the sender and the target queue.  Three checks follow, stopping at the
 * first failure: sender may write the queue, sender may send the
 * message, and the message may be enqueued on the queue.
 */
static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg)
{
        struct common_audit_data ad;
        u32 sender = current_sid();
        struct ipc_security_struct *queue = selinux_ipc(msq);
        struct msg_security_struct *message = selinux_msg_msg(msg);
        int rc;

        /* First pass for this message: derive its label. */
        if (message->sid == SECINITSID_UNLABELED) {
                rc = security_transition_sid(sender, queue->sid,
                                             SECCLASS_MSG, NULL,
                                             &message->sid);
                if (rc)
                        return rc;
        }

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        /* Sender -> queue: write. */
        rc = avc_has_perm(sender, queue->sid, SECCLASS_MSGQ,
                          MSGQ__WRITE, &ad);
        if (rc)
                return rc;

        /* Sender -> message: send. */
        rc = avc_has_perm(sender, message->sid, SECCLASS_MSG,
                          MSG__SEND, &ad);
        if (rc)
                return rc;

        /* Message -> queue: enqueue. */
        return avc_has_perm(message->sid, queue->sid, SECCLASS_MSGQ,
                            MSGQ__ENQUEUE, &ad);
}

/*
 * Authorise @target receiving @msg from queue @msq: the target task must
 * be able to read the queue and to receive the message.  @type and @mode
 * are not consulted.
 */
static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                                    struct task_struct *target,
                                    long type, int mode)
{
        struct common_audit_data ad;
        u32 receiver = task_sid_obj(target);
        struct ipc_security_struct *queue = selinux_ipc(msq);
        struct msg_security_struct *message = selinux_msg_msg(msg);
        int rc;

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        rc = avc_has_perm(receiver, queue->sid,
                          SECCLASS_MSGQ, MSGQ__READ, &ad);
        if (rc)
                return rc;

        return avc_has_perm(receiver, message->sid,
                            SECCLASS_MSG, MSG__RECEIVE, &ad);
}

/* Shared Memory security operations */
/*
 * Label a new shared memory segment from the current task and check
 * that the task may create it (SHM__CREATE).
 */
static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
{
        struct common_audit_data ad;
        u32 creator = current_sid();
        struct ipc_security_struct *blob = selinux_ipc(shp);

        ipc_init_security(blob, SECCLASS_SHM);

        ad.u.ipc_id = shp->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(creator, blob->sid, SECCLASS_SHM,
                            SHM__CREATE, &ad);
}

/*
 * Check whether the current task may associate with an existing shared
 * memory segment (SHM__ASSOCIATE).  @shmflg is not consulted.
 */
static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        struct common_audit_data ad;
        u32 subj = current_sid();
        struct ipc_security_struct *blob = selinux_ipc(shp);

        ad.u.ipc_id = shp->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(subj, blob->sid, SECCLASS_SHM,
                            SHM__ASSOCIATE, &ad);
}

/* Note, at this point, shp is locked down */
/*
 * Map a shmctl() command to the permission it needs and check it.
 * Commands not listed here are allowed without a check.
 */
static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case SHM_INFO:
                /* System-wide information, not tied to a specific segment. */
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case IPC_STAT:
        case SHM_STAT:
        case SHM_STAT_ANY:
                return ipc_has_perm(shp, SHM__GETATTR | SHM__ASSOCIATE);
        case IPC_SET:
                return ipc_has_perm(shp, SHM__SETATTR);
        case SHM_LOCK:
        case SHM_UNLOCK:
                return ipc_has_perm(shp, SHM__LOCK);
        case IPC_RMID:
                return ipc_has_perm(shp, SHM__DESTROY);
        default:
                return 0;
        }
}

/*
 * Check an attach to a shared memory segment: read is always required,
 * write is required unless the attach is SHM_RDONLY.
 */
static int selinux_shm_shmat(struct kern_ipc_perm *shp,
                             char __user *shmaddr, int shmflg)
{
        u32 perms = SHM__READ;

        if (!(shmflg & SHM_RDONLY))
                perms |= SHM__WRITE;

        return ipc_has_perm(shp, perms);
}

/* Semaphore security operations */
/*
 * Label a new semaphore set from the current task and check that the
 * task may create it (SEM__CREATE).
 */
static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
{
        struct common_audit_data ad;
        u32 creator = current_sid();
        struct ipc_security_struct *blob = selinux_ipc(sma);

        ipc_init_security(blob, SECCLASS_SEM);

        ad.u.ipc_id = sma->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(creator, blob->sid, SECCLASS_SEM,
                            SEM__CREATE, &ad);
}

/*
 * Check whether the current task may associate with an existing
 * semaphore set (SEM__ASSOCIATE).  @semflg is not consulted.
 */
static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        struct common_audit_data ad;
        u32 subj = current_sid();
        struct ipc_security_struct *blob = selinux_ipc(sma);

        ad.u.ipc_id = sma->key;
        ad.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(subj, blob->sid, SECCLASS_SEM,
                            SEM__ASSOCIATE, &ad);
}

/* Note, at this point, sma is locked down */
/*
 * Map a semctl() command to the permission it needs and check it.
 * Commands not listed here are allowed without a check.
 */
static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case SEM_INFO:
                /* System-wide information, not tied to a specific set. */
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case GETPID:
        case GETNCNT:
        case GETZCNT:
                return ipc_has_perm(sma, SEM__GETATTR);
        case GETVAL:
        case GETALL:
                return ipc_has_perm(sma, SEM__READ);
        case SETVAL:
        case SETALL:
                return ipc_has_perm(sma, SEM__WRITE);
        case IPC_RMID:
                return ipc_has_perm(sma, SEM__DESTROY);
        case IPC_SET:
                return ipc_has_perm(sma, SEM__SETATTR);
        case IPC_STAT:
        case SEM_STAT:
        case SEM_STAT_ANY:
                return ipc_has_perm(sma, SEM__GETATTR | SEM__ASSOCIATE);
        default:
                return 0;
        }
}

/*
 * Check a semop(): read is always required, write is additionally
 * required when @alter is non-zero.  @sops and @nsops are not consulted.
 */
static int selinux_sem_semop(struct kern_ipc_perm *sma,
                             struct sembuf *sops, unsigned nsops, int alter)
{
        u32 perms = SEM__READ;

        if (alter)
                perms |= SEM__WRITE;

        return ipc_has_perm(sma, perms);
}

/*
 * Translate the read/write mode bits in @flag into IPC__UNIX_READ /
 * IPC__UNIX_WRITE and check them on @ipcp.  With neither kind of bit
 * set there is nothing to check and 0 is returned.
 */
static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        u32 av = 0;

        if (flag & S_IRUGO)
                av |= IPC__UNIX_READ;
        if (flag & S_IWUGO)
                av |= IPC__UNIX_WRITE;

        if (!av)
                return 0;

        return ipc_has_perm(ipcp, av);
}

static void selinux_ipc_getlsmprop(struct kern_ipc_perm *ipcp,
                                   struct lsm_prop *prop)
{
        struct ipc_security_struct *isec = selinux_ipc(ipcp);
        prop->selinux.secid = isec->sid;
}

/*
 * Initialise the inode's security state using the dentry it is being
 * attached to.  Nothing is done when there is no inode.
 */
static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (!inode)
                return;

        inode_doinit_with_dentry(inode, dentry);
}

/*
 * Fetch one SELinux attribute of task @p as a context string.
 *
 * @attr:  LSM_ATTR_* selector
 * @p:     task to query; PROCESS__GETATTR is checked unless @p is current
 * @value: receives the context string from security_sid_to_context(), or
 *         NULL when the selected SID is SECSID_NULL
 *
 * Returns the context length on success, 0 when the attribute is unset,
 * -EOPNOTSUPP for an unknown @attr, or another negative error.
 */
static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p,
                               char **value)
{
        const struct cred_security_struct *crsec;
        u32 sid = SECSID_NULL;
        u32 len;
        int error = 0;

        /* The task's credentials are only read under the RCU read lock;
         * the SID is copied out before the lock is dropped. */
        rcu_read_lock();
        crsec = selinux_cred(__task_cred(p));

        if (p != current)
                error = avc_has_perm(current_sid(), crsec->sid,
                                     SECCLASS_PROCESS, PROCESS__GETATTR, NULL);

        if (!error) {
                switch (attr) {
                case LSM_ATTR_CURRENT:
                        sid = crsec->sid;
                        break;
                case LSM_ATTR_PREV:
                        sid = crsec->osid;
                        break;
                case LSM_ATTR_EXEC:
                        sid = crsec->exec_sid;
                        break;
                case LSM_ATTR_FSCREATE:
                        sid = crsec->create_sid;
                        break;
                case LSM_ATTR_KEYCREATE:
                        sid = crsec->keycreate_sid;
                        break;
                case LSM_ATTR_SOCKCREATE:
                        sid = crsec->sockcreate_sid;
                        break;
                default:
                        error = -EOPNOTSUPP;
                        break;
                }
        }
        rcu_read_unlock();

        if (error)
                return error;

        if (sid == SECSID_NULL) {
                *value = NULL;
                return 0;
        }

        error = security_sid_to_context(sid, value, &len);
        if (error)
                return error;

        return len;
}

/*
 * Set one SELinux attribute of the current task from a context string.
 *
 * @attr:  LSM_ATTR_* selector (EXEC, FSCREATE, KEYCREATE, SOCKCREATE or
 *         CURRENT; anything else yields -EOPNOTSUPP)
 * @value: context string; a single trailing newline is overwritten with
 *         a NUL in place.  An empty value, or one starting with NUL or
 *         newline, leaves the SID at 0, which resets the attribute
 *         (rejected with -EINVAL for LSM_ATTR_CURRENT).
 * @size:  length of @value in bytes
 *
 * Returns @size on success (one less than passed in when a trailing
 * newline was stripped), or a negative error code.  On failure the
 * task's credentials are left unchanged.
 */
static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
{
        struct cred_security_struct *crsec;
        struct cred *new;
        u32 mysid = current_sid(), sid = 0, ptsid;
        int error;
        char *str = value;

        /*
         * Basic control over ability to set these attributes at all.
         */
        switch (attr) {
        case LSM_ATTR_EXEC:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETEXEC, NULL);
                break;
        case LSM_ATTR_FSCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETFSCREATE, NULL);
                break;
        case LSM_ATTR_KEYCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETKEYCREATE, NULL);
                break;
        case LSM_ATTR_SOCKCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETSOCKCREATE, NULL);
                break;
        case LSM_ATTR_CURRENT:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETCURRENT, NULL);
                break;
        default:
                error = -EOPNOTSUPP;
                break;
        }
        if (error)
                return error;

        /* Obtain a SID for the context, if one was specified. */
        if (size && str[0] && str[0] != '\n') {
                /* Strip one trailing newline (modifies the caller's buffer). */
                if (str[size-1] == '\n') {
                        str[size-1] = 0;
                        size--;
                }
                error = security_context_to_sid(value, size,
                                                &sid, GFP_KERNEL);
                /* An invalid fscreate context is only accepted (via the
                 * _force variant below) when has_cap_mac_admin() allows it;
                 * otherwise the attempt is audited and rejected. */
                if (error == -EINVAL && attr == LSM_ATTR_FSCREATE) {
                        if (!has_cap_mac_admin(true)) {
                                struct audit_buffer *ab;
                                size_t audit_size;

                                /* We strip a nul only if it is at the end,
                                 * otherwise the context contains a nul and
                                 * we should audit that */
                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                        audit_size = size;
                                ab = audit_log_start(audit_context(),
                                                     GFP_ATOMIC,
                                                     AUDIT_SELINUX_ERR);
                                if (!ab)
                                        return error;
                                audit_log_format(ab, "op=fscreate invalid_context=");
                                audit_log_n_untrustedstring(ab, value,
                                                            audit_size);
                                audit_log_end(ab);

                                return error;
                        }
                        error = security_context_to_sid_force(value, size,
                                                        &sid);
                }
                if (error)
                        return error;
        }

        /* Work on a private copy of the credentials; it is either
         * committed at the end or discarded at abort_change. */
        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        /* Permission checking based on the specified context is
           performed during the actual operation (execve,
           open/mkdir/...), when we know the full context of the
           operation.  See selinux_bprm_creds_for_exec for the execve
           checks and may_create for the file creation checks. The
           operation will then fail if the context is not permitted. */
        crsec = selinux_cred(new);
        if (attr == LSM_ATTR_EXEC) {
                crsec->exec_sid = sid;
        } else if (attr == LSM_ATTR_FSCREATE) {
                crsec->create_sid = sid;
        } else if (attr == LSM_ATTR_KEYCREATE) {
                /* A non-zero keycreate SID must be one the task may create
                 * keys with. */
                if (sid) {
                        error = avc_has_perm(mysid, sid,
                                             SECCLASS_KEY, KEY__CREATE, NULL);
                        if (error)
                                goto abort_change;
                }
                crsec->keycreate_sid = sid;
        } else if (attr == LSM_ATTR_SOCKCREATE) {
                crsec->sockcreate_sid = sid;
        } else if (attr == LSM_ATTR_CURRENT) {
                /* The current SID cannot be reset to "none". */
                error = -EINVAL;
                if (sid == 0)
                        goto abort_change;

                /* A multi-threaded task may only move to a SID accepted by
                 * security_bounded_transition() for its current SID. */
                if (!current_is_single_threaded()) {
                        error = security_bounded_transition(crsec->sid, sid);
                        if (error)
                                goto abort_change;
                }

                /* Check permissions for the transition. */
                error = avc_has_perm(crsec->sid, sid, SECCLASS_PROCESS,
                                     PROCESS__DYNTRANSITION, NULL);
                if (error)
                        goto abort_change;

                /* Check for ptracing, and update the task SID if ok.
                   Otherwise, leave SID unchanged and fail. */
                ptsid = ptrace_parent_sid();
                if (ptsid != 0) {
                        error = avc_has_perm(ptsid, sid, SECCLASS_PROCESS,
                                             PROCESS__PTRACE, NULL);
                        if (error)
                                goto abort_change;
                }

                crsec->sid = sid;
        } else {
                error = -EINVAL;
                goto abort_change;
        }

        commit_creds(new);
        return size;

abort_change:
        abort_creds(new);
        return error;
}

/**
 * selinux_getselfattr - Get SELinux current task attributes
 * @attr: the requested attribute
 * @ctx: buffer to receive the result
 * @size: buffer size (input), buffer size used (output)
 * @flags: unused
 *
 * Look up the requested attribute of the current task and copy it into
 * the user space @ctx via lsm_fill_user_ctx().
 *
 * Returns the number of attributes on success, an error code otherwise.
 * There will only ever be one attribute.
 */
static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
                               u32 *size, u32 flags)
{
        char *context = NULL;
        int context_len;
        int rc;

        context_len = selinux_lsm_getattr(attr, current, &context);
        if (context_len < 0)
                return context_len;

        rc = lsm_fill_user_ctx(ctx, size, context, context_len,
                               LSM_ID_SELINUX, 0);
        kfree(context);

        /* Exactly one attribute was written when the fill succeeded. */
        if (rc)
                return rc;
        return 1;
}

/*
 * Set an attribute of the current task from the context carried in @ctx.
 * @size and @flags are not used here.
 *
 * A positive result from selinux_lsm_setattr() is reported as 0; zero and
 * negative results are returned unchanged.
 */
static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
                               u32 size, u32 flags)
{
        int ret = selinux_lsm_setattr(attr, ctx->ctx, ctx->ctx_len);

        if (ret <= 0)
                return ret;

        return 0;
}

/*
 * Fetch the attribute called @name for task @p into @value.
 *
 * Names that lsm_name_to_attr() maps to 0, and lookups that fail with
 * -EOPNOTSUPP, are both reported as -EINVAL; any other result of
 * selinux_lsm_getattr() is returned as is.
 */
static int selinux_getprocattr(struct task_struct *p,
                               const char *name, char **value)
{
        unsigned int id = lsm_name_to_attr(name);
        int len;

        if (!id)
                return -EINVAL;

        len = selinux_lsm_getattr(id, p, value);
        if (len == -EOPNOTSUPP)
                return -EINVAL;

        return len;
}

/*
 * Set the attribute called @name from the @size bytes at @value.
 * Returns -EINVAL when lsm_name_to_attr() does not recognise @name,
 * otherwise whatever selinux_lsm_setattr() returns.
 */
static int selinux_setprocattr(const char *name, void *value, size_t size)
{
        int id = lsm_name_to_attr(name);

        if (!id)
                return -EINVAL;

        return selinux_lsm_setattr(id, value, size);
}

/* Return 1 if @name is exactly the SELinux xattr suffix, 0 otherwise. */
static int selinux_ismaclabel(const char *name)
{
        return !strcmp(name, XATTR_SELINUX_SUFFIX);
}

/*
 * Convert @secid to its security context.
 *
 * With a non-NULL @cp the context string and length are stored in @cp and
 * @cp is tagged with LSM_ID_SELINUX. With a NULL @cp only the length is
 * computed. Returns the context length on success or a negative error.
 */
static int selinux_secid_to_secctx(u32 secid, struct lsm_context *cp)
{
        u32 len;
        int err;

        if (!cp) {
                /* Length-only query: no context buffer is requested. */
                err = security_sid_to_context(secid, NULL, &len);
                if (err < 0)
                        return err;
                return len;
        }

        cp->id = LSM_ID_SELINUX;
        err = security_sid_to_context(secid, &cp->context, &cp->len);
        if (err < 0)
                return err;
        return cp->len;
}

static int selinux_lsmprop_to_secctx(struct lsm_prop *prop,
                                     struct lsm_context *cp)
{
        return selinux_secid_to_secctx(prop->selinux.secid, cp);
}

/*
 * Translate the @seclen byte context at @secdata into a SID stored in
 * @secid, using GFP_KERNEL for any allocation done by the lookup.
 */
static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        rc = security_context_to_sid(secdata, seclen, secid, GFP_KERNEL);
        return rc;
}

/*
 * Free the context held in @cp, but only if it is tagged as belonging to
 * SELinux; afterwards the pointer is cleared and the tag reset to
 * LSM_ID_UNDEF. Contexts with any other id are left untouched.
 */
static void selinux_release_secctx(struct lsm_context *cp)
{
        if (cp->id != LSM_ID_SELINUX)
                return;

        kfree(cp->context);
        cp->context = NULL;
        cp->id = LSM_ID_UNDEF;
}

/* Mark the SELinux label of @inode as invalid, under the isec lock. */
static void selinux_inode_invalidate_secctx(struct inode *inode)
{
        struct inode_security_struct *inode_sec;

        inode_sec = selinux_inode(inode);

        spin_lock(&inode_sec->lock);
        inode_sec->initialized = LABEL_INVALID;
        spin_unlock(&inode_sec->lock);
}

/*
 *        called with inode->i_mutex locked
 *
 * Apply the context @ctx (@ctxlen bytes) to @inode through
 * selinux_inode_setsecurity(). An -EOPNOTSUPP result is not treated as an
 * error (labeling is suppressed, SBLABEL_MNT not set) and is reported as 0;
 * every other result is returned unchanged.
 */
static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int err;

        err = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX,
                                        ctx, ctxlen, 0);
        if (err == -EOPNOTSUPP)
                return 0;

        return err;
}

/*
 *        called with inode->i_mutex locked
 *
 * Store @ctx (@ctxlen bytes) as the SELinux xattr of @dentry using
 * __vfs_setxattr_locked() with the nop idmap, flags 0 and a NULL last
 * argument.
 */
static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc;

        rc = __vfs_setxattr_locked(&nop_mnt_idmap, dentry,
                                   XATTR_NAME_SELINUX, ctx, ctxlen, 0, NULL);
        return rc;
}

/*
 * Fetch the SELinux context of @inode into @cp.
 *
 * On success @cp->context and @cp->len are filled in, @cp is tagged with
 * LSM_ID_SELINUX and 0 is returned. On failure the negative value from
 * selinux_inode_getsecurity() is returned and len/id are not written.
 */
static int selinux_inode_getsecctx(struct inode *inode, struct lsm_context *cp)
{
        void **bufp = (void **)&cp->context;
        int ctx_len;

        ctx_len = selinux_inode_getsecurity(&nop_mnt_idmap, inode,
                                            XATTR_SELINUX_SUFFIX, bufp, true);
        if (ctx_len < 0)
                return ctx_len;

        cp->len = ctx_len;
        cp->id = LSM_ID_SELINUX;
        return 0;
}
#ifdef CONFIG_KEYS

/*
 * Label a newly allocated key @k: use the keycreate SID of @cred when it is
 * non-zero, otherwise the SID of @cred itself. @flags is not used.
 * Always returns 0.
 */
static int selinux_key_alloc(struct key *k, const struct cred *cred,
                             unsigned long flags)
{
        const struct cred_security_struct *crsec = selinux_cred(cred);
        struct key_security_struct *ksec = selinux_key(k);

        ksec->sid = crsec->keycreate_sid ? crsec->keycreate_sid : crsec->sid;

        return 0;
}

/*
 * Check whether @cred may perform @need_perm on the key behind @key_ref.
 *
 * KEY_NEED_UNLINK and the override/deferred values are allowed without a
 * check (return 0). An unrecognised @need_perm triggers a WARN and returns
 * -EPERM. Everything else is mapped to the matching KEY__* permission and
 * checked with avc_has_perm() against the key's SID.
 */
static int selinux_key_permission(key_ref_t key_ref,
                                  const struct cred *cred,
                                  enum key_need_perm need_perm)
{
        struct key_security_struct *ksec;
        u32 av;

        switch (need_perm) {
        case KEY_NEED_UNLINK:
        case KEY_SYSADMIN_OVERRIDE:
        case KEY_AUTHTOKEN_OVERRIDE:
        case KEY_DEFER_PERM_CHECK:
                /* Nothing to check for these requests. */
                return 0;
        case KEY_NEED_VIEW:
                av = KEY__VIEW;
                break;
        case KEY_NEED_READ:
                av = KEY__READ;
                break;
        case KEY_NEED_WRITE:
                av = KEY__WRITE;
                break;
        case KEY_NEED_SEARCH:
                av = KEY__SEARCH;
                break;
        case KEY_NEED_LINK:
                av = KEY__LINK;
                break;
        case KEY_NEED_SETATTR:
                av = KEY__SETATTR;
                break;
        default:
                WARN_ON(1);
                return -EPERM;
        }

        ksec = selinux_key(key_ref_to_ptr(key_ref));

        return avc_has_perm(cred_sid(cred), ksec->sid, SECCLASS_KEY, av, NULL);
}

/*
 * Return the security context of @key through @_buffer.
 *
 * @_buffer is always written: it receives the context string produced by
 * security_sid_to_context(), or whatever that call left in the (initially
 * NULL) pointer on failure. Returns the context length when the lookup
 * returns 0, otherwise the lookup's return value.
 */
static int selinux_key_getsecurity(struct key *key, char **_buffer)
{
        struct key_security_struct *ksec = selinux_key(key);
        char *ctx = NULL;
        unsigned ctx_len;
        int err;

        err = security_sid_to_context(ksec->sid, &ctx, &ctx_len);
        *_buffer = ctx;
        if (err)
                return err;

        return ctx_len;
}

#ifdef CONFIG_KEY_NOTIFICATIONS
/* Require KEY__VIEW on @key for the current task before it may watch it. */
static int selinux_watch_key(struct key *key)
{
        struct key_security_struct *ksec;

        ksec = selinux_key(key);

        return avc_has_perm(current_sid(), ksec->sid,
                            SECCLASS_KEY, KEY__VIEW, NULL);
}
#endif
#endif

#ifdef CONFIG_SECURITY_INFINIBAND
/*
 * Check INFINIBAND_PKEY__ACCESS for the InfiniBand security blob @ib_sec
 * on the partition key identified by @subnet_prefix and @pkey_val.
 *
 * The pkey's SID is looked up with sel_ib_pkey_sid(); a lookup failure is
 * returned as is. Otherwise the result of avc_has_perm() is returned, with
 * the subnet prefix and pkey recorded as audit data.
 */
static int selinux_ib_pkey_access(void *ib_sec, u64 subnet_prefix, u16 pkey_val)
{
        struct common_audit_data ad;
        int err;
        u32 sid = 0;
        /*
         * Locate the SELinux part of the blob through the accessor, exactly
         * as selinux_ib_alloc_security() does when it stores the SID, so the
         * read and the write always agree on where the data lives.
         */
        struct ib_security_struct *sec = selinux_ib(ib_sec);
        struct lsm_ibpkey_audit ibpkey;

        err = sel_ib_pkey_sid(subnet_prefix, pkey_val, &sid);
        if (err)
                return err;

        ad.type = LSM_AUDIT_DATA_IBPKEY;
        ibpkey.subnet_prefix = subnet_prefix;
        ibpkey.pkey = pkey_val;
        ad.u.ibpkey = &ibpkey;
        return avc_has_perm(sec->sid, sid,
                            SECCLASS_INFINIBAND_PKEY,
                            INFINIBAND_PKEY__ACCESS, &ad);
}

/*
 * Check INFINIBAND_ENDPORT__MANAGE_SUBNET for the InfiniBand security blob
 * @ib_sec on port @port_num of device @dev_name.
 *
 * The end port's SID is looked up with security_ib_endport_sid(); a lookup
 * failure is returned as is. Otherwise the result of avc_has_perm() is
 * returned, with the device name and port recorded as audit data.
 */
static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name,
                                            u8 port_num)
{
        struct common_audit_data ad;
        int err;
        u32 sid = 0;
        /*
         * Locate the SELinux part of the blob through the accessor, exactly
         * as selinux_ib_alloc_security() does when it stores the SID, so the
         * read and the write always agree on where the data lives.
         */
        struct ib_security_struct *sec = selinux_ib(ib_sec);
        struct lsm_ibendport_audit ibendport;

        err = security_ib_endport_sid(dev_name, port_num,
                                      &sid);

        if (err)
                return err;

        ad.type = LSM_AUDIT_DATA_IBENDPORT;
        ibendport.dev_name = dev_name;
        ibendport.port = port_num;
        ad.u.ibendport = &ibendport;
        return avc_has_perm(sec->sid, sid,
                            SECCLASS_INFINIBAND_ENDPORT,
                            INFINIBAND_ENDPORT__MANAGE_SUBNET, &ad);
}

/*
 * Initialise the SELinux part of the InfiniBand blob @ib_sec with the SID
 * of the current task. Always returns 0.
 */
static int selinux_ib_alloc_security(void *ib_sec)
{
        struct ib_security_struct *ibsec;

        ibsec = selinux_ib(ib_sec);
        ibsec->sid = current_sid();

        return 0;
}
#endif

#ifdef CONFIG_BPF_SYSCALL
/*
 * Check the bpf() command @cmd for the current task.
 *
 * Only BPF_MAP_CREATE and BPF_PROG_LOAD are checked (against the task's own
 * SID); every other command returns 0. @attr, @size and @kernel are not
 * used here.
 */
static int selinux_bpf(int cmd, union bpf_attr *attr,
                       unsigned int size, bool kernel)
{
        u32 sid = current_sid();
        u32 perm;

        switch (cmd) {
        case BPF_MAP_CREATE:
                perm = BPF__MAP_CREATE;
                break;
        case BPF_PROG_LOAD:
                perm = BPF__PROG_LOAD;
                break;
        default:
                return 0;
        }

        return avc_has_perm(sid, sid, SECCLASS_BPF, perm, NULL);
}

/*
 * Translate the read/write bits of @fmode into the matching BPF map
 * permissions. Other bits of @fmode are ignored; the result is 0 when
 * neither FMODE_READ nor FMODE_WRITE is set.
 */
static u32 bpf_map_fmode_to_av(fmode_t fmode)
{
        u32 rd = (fmode & FMODE_READ) ? BPF__MAP_READ : 0;
        u32 wr = (fmode & FMODE_WRITE) ? BPF__MAP_WRITE : 0;

        return rd | wr;
}

/* This function checks a file passed through a unix socket or binder to see
 * whether it is a bpf related object, and applies the corresponding check to
 * the bpf object based on its type. Unlike other files and sockets, bpf maps
 * and programs use a shared anonymous inode inside the kernel as their inode.
 * Checking that inode therefore cannot tell whether the process has the
 * privilege to access the bpf object, which is why this additional check is
 * needed in selinux_file_receive and selinux_binder_transfer_file.
 *
 * Returns 0 for files that are neither bpf maps nor bpf programs, otherwise
 * the result of the permission check for @sid.
 */
static int bpf_fd_pass(const struct file *file, u32 sid)
{
        struct bpf_security_struct *bpfsec;

        if (file->f_op == &bpf_map_fops) {
                struct bpf_map *map = file->private_data;

                /* Maps: required permissions follow the file's open mode. */
                bpfsec = selinux_bpf_map_security(map);
                return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
                                    bpf_map_fmode_to_av(file->f_mode), NULL);
        }

        if (file->f_op == &bpf_prog_fops) {
                struct bpf_prog *prog = file->private_data;

                bpfsec = selinux_bpf_prog_security(prog);
                return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
                                    BPF__PROG_RUN, NULL);
        }

        return 0;
}

/*
 * Check that the current task holds the BPF map permissions implied by
 * @fmode on @map.
 */
static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        u32 tsk_sid = current_sid();
        struct bpf_security_struct *bpfsec = selinux_bpf_map_security(map);
        u32 perms = bpf_map_fmode_to_av(fmode);

        return avc_has_perm(tsk_sid, bpfsec->sid, SECCLASS_BPF, perms, NULL);
}

/* Check that the current task holds BPF__PROG_RUN on @prog. */
static int selinux_bpf_prog(struct bpf_prog *prog)
{
        u32 tsk_sid = current_sid();
        struct bpf_security_struct *bpfsec = selinux_bpf_prog_security(prog);

        return avc_has_perm(tsk_sid, bpfsec->sid, SECCLASS_BPF,
                            BPF__PROG_RUN, NULL);
}

/*
 * Label the new BPF map @map with the SID of the current task.
 * @attr, @token and @kernel are not used here. Always returns 0.
 */
static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
                                  struct bpf_token *token, bool kernel)
{
        struct bpf_security_struct *bpfsec = selinux_bpf_map_security(map);

        bpfsec->sid = current_sid();
        return 0;
}

/*
 * Label the new BPF program @prog with the SID of the current task.
 * @attr, @token and @kernel are not used here. Always returns 0.
 */
static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
                                 struct bpf_token *token, bool kernel)
{
        struct bpf_security_struct *bpfsec = selinux_bpf_prog_security(prog);

        bpfsec->sid = current_sid();
        return 0;
}

/*
 * Label the new BPF token @token with the SID of the current task.
 * @attr and @path are not used here. Always returns 0.
 */
static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
                                    const struct path *path)
{
        struct bpf_security_struct *bpfsec = selinux_bpf_token_security(token);

        bpfsec->sid = current_sid();
        return 0;
}
#endif

/*
 * Per-object security blob requirements of SELinux. Each lbs_* member holds
 * the size of the SELinux structure attached to that kind of object, except
 * lbs_xattr_count, which is a count (SELINUX_INODE_INIT_XATTRS) rather than a
 * size. The three BPF object kinds all use struct bpf_security_struct.
 * NOTE(review): presumably consumed by the LSM framework when laying out the
 * shared blobs - confirm against the framework code.
 */
struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = {
        .lbs_cred = sizeof(struct cred_security_struct),
        .lbs_task = sizeof(struct task_security_struct),
        .lbs_file = sizeof(struct file_security_struct),
        .lbs_inode = sizeof(struct inode_security_struct),
        .lbs_ipc = sizeof(struct ipc_security_struct),
        .lbs_key = sizeof(struct key_security_struct),
        .lbs_msg_msg = sizeof(struct msg_security_struct),
#ifdef CONFIG_PERF_EVENTS
        .lbs_perf_event = sizeof(struct perf_event_security_struct),
#endif
        .lbs_sock = sizeof(struct sk_security_struct),
        .lbs_superblock = sizeof(struct superblock_security_struct),
        .lbs_xattr_count = SELINUX_INODE_INIT_XATTRS,
        .lbs_tun_dev = sizeof(struct tun_security_struct),
        .lbs_ib = sizeof(struct ib_security_struct),
        .lbs_bpf_map = sizeof(struct bpf_security_struct),
        .lbs_bpf_prog = sizeof(struct bpf_security_struct),
        .lbs_bpf_token = sizeof(struct bpf_security_struct),
};

#ifdef CONFIG_PERF_EVENTS
/*
 * Check whether the current task may open a perf event of kind @type.
 *
 * @type is mapped to the matching PERF_EVENT__* permission, which is then
 * checked against the task's own SID. Unknown types return -EINVAL.
 */
static int selinux_perf_event_open(int type)
{
        u32 sid = current_sid();
        u32 perm;

        switch (type) {
        case PERF_SECURITY_OPEN:
                perm = PERF_EVENT__OPEN;
                break;
        case PERF_SECURITY_CPU:
                perm = PERF_EVENT__CPU;
                break;
        case PERF_SECURITY_KERNEL:
                perm = PERF_EVENT__KERNEL;
                break;
        case PERF_SECURITY_TRACEPOINT:
                perm = PERF_EVENT__TRACEPOINT;
                break;
        default:
                return -EINVAL;
        }

        return avc_has_perm(sid, sid, SECCLASS_PERF_EVENT, perm, NULL);
}

/*
 * Label the SELinux part of @event's security blob with the SID of the
 * current task. Always returns 0.
 */
static int selinux_perf_event_alloc(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec =
                selinux_perf_event(event->security);

        perfsec->sid = current_sid();
        return 0;
}

/*
 * Check that the current task holds PERF_EVENT__READ on @event.
 *
 * The SELinux data is located through selinux_perf_event(), the same
 * accessor selinux_perf_event_alloc() uses when it stores the SID, so the
 * check always reads the SID from where it was written.
 */
static int selinux_perf_event_read(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec;
        u32 sid = current_sid();

        perfsec = selinux_perf_event(event->security);

        return avc_has_perm(sid, perfsec->sid,
                            SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
}

/*
 * Check that the current task holds PERF_EVENT__WRITE on @event.
 *
 * The SELinux data is located through selinux_perf_event(), the same
 * accessor selinux_perf_event_alloc() uses when it stores the SID, so the
 * check always reads the SID from where it was written.
 */
static int selinux_perf_event_write(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec;
        u32 sid = current_sid();

        perfsec = selinux_perf_event(event->security);

        return avc_has_perm(sid, perfsec->sid,
                            SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
}
#endif

#ifdef CONFIG_IO_URING
/**
 * selinux_uring_override_creds - check the requested cred override
 * @new: the target creds
 *
 * Check to see if the current task is allowed to override it's credentials
 * to service an io_uring operation.
 */
static int selinux_uring_override_creds(const struct cred *new)
{
        return avc_has_perm(current_sid(), cred_sid(new),
                            SECCLASS_IO_URING, IO_URING__OVERRIDE_CREDS, NULL);
}

/**
 * selinux_uring_sqpoll - check if an io_uring polling thread can be created
 *
 * Check to see if the current task is allowed to create a new io_uring
 * kernel polling thread. The check is made against the task's own SID.
 */
static int selinux_uring_sqpoll(void)
{
        const u32 self = current_sid();

        return avc_has_perm(self, self, SECCLASS_IO_URING,
                            IO_URING__SQPOLL, NULL);
}

/**
 * selinux_uring_cmd - check if IORING_OP_URING_CMD is allowed
 * @ioucmd: the io_uring command structure
 *
 * Check to see if the current domain is allowed to execute an
 * IORING_OP_URING_CMD against the device/file specified in @ioucmd.
 * The file is recorded as audit data for the check.
 */
static int selinux_uring_cmd(struct io_uring_cmd *ioucmd)
{
        struct file *target = ioucmd->file;
        struct inode_security_struct *isec;
        struct common_audit_data ad;

        isec = selinux_inode(file_inode(target));

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = target;

        return avc_has_perm(current_sid(), isec->sid,
                            SECCLASS_IO_URING, IO_URING__CMD, &ad);
}

/**
 * selinux_uring_allowed - check if io_uring_setup() can be called
 *
 * Check to see if the current task is allowed to call io_uring_setup().
 * The check is made against the task's own SID.
 */
static int selinux_uring_allowed(void)
{
        const u32 self = current_sid();

        return avc_has_perm(self, self, SECCLASS_IO_URING,
                            IO_URING__ALLOWED, NULL);
}
#endif /* CONFIG_IO_URING */

/*
 * Identity of the SELinux LSM: its name and numeric id. Passed to
 * audit_cfg_lsm() and security_add_hooks() from selinux_init().
 */
static const struct lsm_id selinux_lsmid = {
        .name = "selinux",
        .id = LSM_ID_SELINUX,
};

/*
 * Table of SELinux hook implementations. selinux_init() hands this array to
 * security_add_hooks() together with selinux_lsmid. Entries inside #ifdef
 * blocks are only present when the corresponding kernel option is enabled.
 *
 * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order:
 * 1. any hooks that don't belong to (2.) or (3.) below,
 * 2. hooks that both access structures allocated by other hooks, and allocate
 *    structures that can be later accessed by other hooks (mostly "cloning"
 *    hooks),
 * 3. hooks that only allocate structures that can be later accessed by other
 *    hooks ("allocating" hooks).
 *
 * Please follow block comment delimiters in the list to keep this order.
 */
static struct security_hook_list selinux_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
        LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
        LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder),
        LSM_HOOK_INIT(binder_transfer_file, selinux_binder_transfer_file),

        LSM_HOOK_INIT(ptrace_access_check, selinux_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, selinux_ptrace_traceme),
        LSM_HOOK_INIT(capget, selinux_capget),
        LSM_HOOK_INIT(capset, selinux_capset),
        LSM_HOOK_INIT(capable, selinux_capable),
        LSM_HOOK_INIT(quotactl, selinux_quotactl),
        LSM_HOOK_INIT(quota_on, selinux_quota_on),
        LSM_HOOK_INIT(syslog, selinux_syslog),
        LSM_HOOK_INIT(vm_enough_memory, selinux_vm_enough_memory),

        LSM_HOOK_INIT(netlink_send, selinux_netlink_send),

        LSM_HOOK_INIT(bprm_creds_for_exec, selinux_bprm_creds_for_exec),
        LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
        LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),

        LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts),
        LSM_HOOK_INIT(sb_mnt_opts_compat, selinux_sb_mnt_opts_compat),
        LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
        LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount),
        LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options),
        LSM_HOOK_INIT(sb_statfs, selinux_sb_statfs),
        LSM_HOOK_INIT(sb_mount, selinux_mount),
        LSM_HOOK_INIT(sb_umount, selinux_umount),
        LSM_HOOK_INIT(sb_set_mnt_opts, selinux_set_mnt_opts),
        LSM_HOOK_INIT(sb_clone_mnt_opts, selinux_sb_clone_mnt_opts),

        LSM_HOOK_INIT(move_mount, selinux_move_mount),

        LSM_HOOK_INIT(dentry_init_security, selinux_dentry_init_security),
        LSM_HOOK_INIT(dentry_create_files_as, selinux_dentry_create_files_as),

        LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security),
        LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security),
        LSM_HOOK_INIT(inode_init_security_anon, selinux_inode_init_security_anon),
        LSM_HOOK_INIT(inode_create, selinux_inode_create),
        LSM_HOOK_INIT(inode_link, selinux_inode_link),
        LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink),
        LSM_HOOK_INIT(inode_symlink, selinux_inode_symlink),
        LSM_HOOK_INIT(inode_mkdir, selinux_inode_mkdir),
        LSM_HOOK_INIT(inode_rmdir, selinux_inode_rmdir),
        LSM_HOOK_INIT(inode_mknod, selinux_inode_mknod),
        LSM_HOOK_INIT(inode_rename, selinux_inode_rename),
        LSM_HOOK_INIT(inode_readlink, selinux_inode_readlink),
        LSM_HOOK_INIT(inode_follow_link, selinux_inode_follow_link),
        LSM_HOOK_INIT(inode_permission, selinux_inode_permission),
        LSM_HOOK_INIT(inode_setattr, selinux_inode_setattr),
        LSM_HOOK_INIT(inode_getattr, selinux_inode_getattr),
        LSM_HOOK_INIT(inode_xattr_skipcap, selinux_inode_xattr_skipcap),
        LSM_HOOK_INIT(inode_setxattr, selinux_inode_setxattr),
        LSM_HOOK_INIT(inode_post_setxattr, selinux_inode_post_setxattr),
        LSM_HOOK_INIT(inode_getxattr, selinux_inode_getxattr),
        LSM_HOOK_INIT(inode_listxattr, selinux_inode_listxattr),
        LSM_HOOK_INIT(inode_removexattr, selinux_inode_removexattr),
        LSM_HOOK_INIT(inode_file_getattr, selinux_inode_file_getattr),
        LSM_HOOK_INIT(inode_file_setattr, selinux_inode_file_setattr),
        LSM_HOOK_INIT(inode_set_acl, selinux_inode_set_acl),
        LSM_HOOK_INIT(inode_get_acl, selinux_inode_get_acl),
        LSM_HOOK_INIT(inode_remove_acl, selinux_inode_remove_acl),
        LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity),
        LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity),
        LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity),
        LSM_HOOK_INIT(inode_getlsmprop, selinux_inode_getlsmprop),
        LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up),
        LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr),
        LSM_HOOK_INIT(path_notify, selinux_path_notify),

        LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security),

        LSM_HOOK_INIT(file_permission, selinux_file_permission),
        LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
        LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
        LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
        LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
        LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
        LSM_HOOK_INIT(file_lock, selinux_file_lock),
        LSM_HOOK_INIT(file_fcntl, selinux_file_fcntl),
        LSM_HOOK_INIT(file_set_fowner, selinux_file_set_fowner),
        LSM_HOOK_INIT(file_send_sigiotask, selinux_file_send_sigiotask),
        LSM_HOOK_INIT(file_receive, selinux_file_receive),

        LSM_HOOK_INIT(file_open, selinux_file_open),

        LSM_HOOK_INIT(task_alloc, selinux_task_alloc),
        LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
        LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
        LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid),
        LSM_HOOK_INIT(cred_getlsmprop, selinux_cred_getlsmprop),
        LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as),
        LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as),
        LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request),
        LSM_HOOK_INIT(kernel_load_data, selinux_kernel_load_data),
        LSM_HOOK_INIT(kernel_read_file, selinux_kernel_read_file),
        LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid),
        LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid),
        LSM_HOOK_INIT(task_getsid, selinux_task_getsid),
        LSM_HOOK_INIT(current_getlsmprop_subj, selinux_current_getlsmprop_subj),
        LSM_HOOK_INIT(task_getlsmprop_obj, selinux_task_getlsmprop_obj),
        LSM_HOOK_INIT(task_setnice, selinux_task_setnice),
        LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio),
        LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio),
        LSM_HOOK_INIT(task_prlimit, selinux_task_prlimit),
        LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit),
        LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler),
        LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler),
        LSM_HOOK_INIT(task_movememory, selinux_task_movememory),
        LSM_HOOK_INIT(task_kill, selinux_task_kill),
        LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode),
        LSM_HOOK_INIT(userns_create, selinux_userns_create),

        LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
        LSM_HOOK_INIT(ipc_getlsmprop, selinux_ipc_getlsmprop),

        LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
        LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
        LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd),
        LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv),

        LSM_HOOK_INIT(shm_associate, selinux_shm_associate),
        LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl),
        LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat),

        LSM_HOOK_INIT(sem_associate, selinux_sem_associate),
        LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl),
        LSM_HOOK_INIT(sem_semop, selinux_sem_semop),

        LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate),

        LSM_HOOK_INIT(getselfattr, selinux_getselfattr),
        LSM_HOOK_INIT(setselfattr, selinux_setselfattr),
        LSM_HOOK_INIT(getprocattr, selinux_getprocattr),
        LSM_HOOK_INIT(setprocattr, selinux_setprocattr),

        LSM_HOOK_INIT(ismaclabel, selinux_ismaclabel),
        LSM_HOOK_INIT(secctx_to_secid, selinux_secctx_to_secid),
        LSM_HOOK_INIT(release_secctx, selinux_release_secctx),
        LSM_HOOK_INIT(inode_invalidate_secctx, selinux_inode_invalidate_secctx),
        LSM_HOOK_INIT(inode_notifysecctx, selinux_inode_notifysecctx),
        LSM_HOOK_INIT(inode_setsecctx, selinux_inode_setsecctx),

        LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect),
        LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send),

        LSM_HOOK_INIT(socket_create, selinux_socket_create),
        LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create),
        LSM_HOOK_INIT(socket_socketpair, selinux_socket_socketpair),
        LSM_HOOK_INIT(socket_bind, selinux_socket_bind),
        LSM_HOOK_INIT(socket_connect, selinux_socket_connect),
        LSM_HOOK_INIT(socket_listen, selinux_socket_listen),
        LSM_HOOK_INIT(socket_accept, selinux_socket_accept),
        LSM_HOOK_INIT(socket_sendmsg, selinux_socket_sendmsg),
        LSM_HOOK_INIT(socket_recvmsg, selinux_socket_recvmsg),
        LSM_HOOK_INIT(socket_getsockname, selinux_socket_getsockname),
        LSM_HOOK_INIT(socket_getpeername, selinux_socket_getpeername),
        LSM_HOOK_INIT(socket_getsockopt, selinux_socket_getsockopt),
        LSM_HOOK_INIT(socket_setsockopt, selinux_socket_setsockopt),
        LSM_HOOK_INIT(socket_shutdown, selinux_socket_shutdown),
        LSM_HOOK_INIT(socket_sock_rcv_skb, selinux_socket_sock_rcv_skb),
        LSM_HOOK_INIT(socket_getpeersec_stream,
                        selinux_socket_getpeersec_stream),
        LSM_HOOK_INIT(socket_getpeersec_dgram, selinux_socket_getpeersec_dgram),
        LSM_HOOK_INIT(sk_free_security, selinux_sk_free_security),
        LSM_HOOK_INIT(sk_clone_security, selinux_sk_clone_security),
        LSM_HOOK_INIT(sk_getsecid, selinux_sk_getsecid),
        LSM_HOOK_INIT(sock_graft, selinux_sock_graft),
        LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request),
        LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone),
        LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect),
        LSM_HOOK_INIT(sctp_assoc_established, selinux_sctp_assoc_established),
        LSM_HOOK_INIT(mptcp_add_subflow, selinux_mptcp_add_subflow),
        LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request),
        LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone),
        LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established),
        LSM_HOOK_INIT(secmark_relabel_packet, selinux_secmark_relabel_packet),
        LSM_HOOK_INIT(secmark_refcount_inc, selinux_secmark_refcount_inc),
        L
        LSM_HOOK_INIT(secmark_refcount_dec, selinux_secmark_refcount_dec),
        LSM_HOOK_INIT(req_classify_flow, selinux_req_classify_flow),
        LSM_HOOK_INIT(tun_dev_create, selinux_tun_dev_create),
        LSM_HOOK_INIT(tun_dev_attach_queue, selinux_tun_dev_attach_queue),
        LSM_HOOK_INIT(tun_dev_attach, selinux_tun_dev_attach),
        LSM_HOOK_INIT(tun_dev_open, selinux_tun_dev_open),
#ifdef CONFIG_SECURITY_INFINIBAND
        LSM_HOOK_INIT(ib_pkey_access, selinux_ib_pkey_access),
        LSM_HOOK_INIT(ib_endport_manage_subnet,
                      selinux_ib_endport_manage_subnet),
#endif
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_free_security, selinux_xfrm_policy_free),
        LSM_HOOK_INIT(xfrm_policy_delete_security, selinux_xfrm_policy_delete),
        LSM_HOOK_INIT(xfrm_state_free_security, selinux_xfrm_state_free),
        LSM_HOOK_INIT(xfrm_state_delete_security, selinux_xfrm_state_delete),
        LSM_HOOK_INIT(xfrm_policy_lookup, selinux_xfrm_policy_lookup),
        LSM_HOOK_INIT(xfrm_state_pol_flow_match,
                        selinux_xfrm_state_pol_flow_match),
        LSM_HOOK_INIT(xfrm_decode_session, selinux_xfrm_decode_session),
#endif

#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_permission, selinux_key_permission),
        LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
#ifdef CONFIG_KEY_NOTIFICATIONS
        LSM_HOOK_INIT(watch_key, selinux_watch_key),
#endif
#endif

#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_known, selinux_audit_rule_known),
        LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match),
        LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free),
#endif

#ifdef CONFIG_BPF_SYSCALL
        LSM_HOOK_INIT(bpf, selinux_bpf),
        LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
        LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
#endif

#ifdef CONFIG_PERF_EVENTS
        LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
        LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
        LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
#endif

#ifdef CONFIG_IO_URING
        LSM_HOOK_INIT(uring_override_creds, selinux_uring_override_creds),
        LSM_HOOK_INIT(uring_sqpoll, selinux_uring_sqpoll),
        LSM_HOOK_INIT(uring_cmd, selinux_uring_cmd),
        LSM_HOOK_INIT(uring_allowed, selinux_uring_allowed),
#endif

        /*
         * PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE
         */
        LSM_HOOK_INIT(fs_context_submount, selinux_fs_context_submount),
        LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup),
        LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param),
        LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts),
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_clone_security, selinux_xfrm_policy_clone),
#endif

        /*
         * PUT "ALLOCATING" HOOKS HERE
         */
        LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security),
        LSM_HOOK_INIT(msg_queue_alloc_security,
                      selinux_msg_queue_alloc_security),
        LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security),
        LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security),
        LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security),
        LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
        LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx),
        LSM_HOOK_INIT(lsmprop_to_secctx, selinux_lsmprop_to_secctx),
        LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
        LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security),
        LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security),
#ifdef CONFIG_SECURITY_INFINIBAND
        LSM_HOOK_INIT(ib_alloc_security, selinux_ib_alloc_security),
#endif
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_alloc_security, selinux_xfrm_policy_alloc),
        LSM_HOOK_INIT(xfrm_state_alloc, selinux_xfrm_state_alloc),
        LSM_HOOK_INIT(xfrm_state_alloc_acquire,
                      selinux_xfrm_state_alloc_acquire),
#endif
#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_alloc, selinux_key_alloc),
#endif
#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
#endif
#ifdef CONFIG_BPF_SYSCALL
        LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create),
        LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
        LSM_HOOK_INIT(bpf_token_create, selinux_bpf_token_create),
#endif
#ifdef CONFIG_PERF_EVENTS
        LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
#endif
};

/*
 * Boot-time initialisation of SELinux, installed as the .init callback of
 * the "selinux" LSM definition.  Resets the global selinux_state, labels
 * the initial task, initialises the helper subsystems called below and
 * registers the SELinux hook table.  Failure to register either AVC reset
 * callback is fatal (panic).  Always returns 0.
 */
static __init int selinux_init(void)
{
        pr_info("SELinux:  Initializing.\n");

        /* Start from a zeroed global state, then apply the boot-time mode. */
        memset(&selinux_state, 0, sizeof(selinux_state));
        enforcing_set(selinux_enforcing_boot);
        selinux_avc_init();
        mutex_init(&selinux_state.status_lock);
        mutex_init(&selinux_state.policy_mutex);

        /* Set the security state for the initial task. */
        cred_init_security();

        /* Inform the audit system that secctx is used */
        audit_cfg_lsm(&selinux_lsmid,
                      AUDIT_CFG_LSM_SECCTX_SUBJECT |
                      AUDIT_CFG_LSM_SECCTX_OBJECT);

        /* default_noexec is true when the default data flags lack VM_EXEC. */
        default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC);
        if (!default_noexec)
                pr_notice("SELinux:  virtual memory is executable by default\n");

        avc_init();

        avtab_cache_init();

        ebitmap_cache_init();

        hashtab_cache_init();

        /* Register the SELinux hook table under the SELinux LSM id. */
        security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks),
                           &selinux_lsmid);

        /* Both callbacks are registered for AVC_CALLBACK_RESET events. */
        if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC netcache callback\n");

        if (avc_add_callback(selinux_lsm_notifier_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC LSM notifier callback\n");

        if (selinux_enforcing_boot)
                pr_debug("SELinux:  Starting in enforcing mode\n");
        else
                pr_debug("SELinux:  Starting in permissive mode\n");

        fs_validate_description("selinux", selinux_fs_parameters);

        return 0;
}

/*
 * iterate_supers() callback used by selinux_complete_init(): applies
 * selinux_set_mnt_opts() to @sb with no mount options (NULL opts, zero
 * flags).  @unused is ignored.
 */
static void delayed_superblock_init(struct super_block *sb, void *unused)
{
        selinux_set_mnt_opts(sb, NULL, 0, NULL);
}

/*
 * Finish SELinux initialisation: walk every existing superblock and run
 * delayed_superblock_init() on it, so that superblocks created before the
 * policy load get set up.
 */
void selinux_complete_init(void)
{
        pr_debug("SELinux:  Completing initialization.\n");

        /* Set up any superblocks initialized prior to the policy load. */
        pr_debug("SELinux:  Setting up existing superblocks.\n");
        iterate_supers(delayed_superblock_init, NULL);
}

/*
 * SELinux requires early initialization in order to label
 * all processes and objects when they are created.
 *
 * LSM definition for SELinux: names the module, points at its boot-time
 * enable flag and blob sizes, and installs selinux_init() as the init
 * callback.
 */
DEFINE_LSM(selinux) = {
        .name = "selinux",
        .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
        .enabled = &selinux_enabled_boot,
        .blobs = &selinux_blob_sizes,
        .init = selinux_init,
};

#if defined(CONFIG_NETFILTER)
/*
 * Netfilter hook table for SELinux.  For IPv4 it installs handlers on
 * POST_ROUTING, FORWARD and LOCAL_OUT; the same three hooks are added for
 * IPv6 when CONFIG_IPV6 is enabled.  POST_ROUTING uses the *_SELINUX_LAST
 * priority, the other two use *_SELINUX_FIRST.
 */
static const struct nf_hook_ops selinux_nf_ops[] = {
        {
                .hook =                selinux_ip_postroute,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_POST_ROUTING,
                .priority =        NF_IP_PRI_SELINUX_LAST,
        },
        {
                .hook =                selinux_ip_forward,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_FORWARD,
                .priority =        NF_IP_PRI_SELINUX_FIRST,
        },
        {
                .hook =                selinux_ip_output,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_LOCAL_OUT,
                .priority =        NF_IP_PRI_SELINUX_FIRST,
        },
#if IS_ENABLED(CONFIG_IPV6)
        {
                .hook =                selinux_ip_postroute,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_POST_ROUTING,
                .priority =        NF_IP6_PRI_SELINUX_LAST,
        },
        {
                .hook =                selinux_ip_forward,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_FORWARD,
                .priority =        NF_IP6_PRI_SELINUX_FIRST,
        },
        {
                .hook =                selinux_ip_output,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_LOCAL_OUT,
                .priority =        NF_IP6_PRI_SELINUX_FIRST,
        },
#endif        /* IPV6 */
};

/*
 * Per-network-namespace init: register every entry of selinux_nf_ops in
 * @net.  Returns the result of nf_register_net_hooks().
 */
static int __net_init selinux_nf_register(struct net *net)
{
        return nf_register_net_hooks(net, selinux_nf_ops,
                                     ARRAY_SIZE(selinux_nf_ops));
}

/*
 * Per-network-namespace exit: unregister every entry of selinux_nf_ops
 * from @net (the counterpart of selinux_nf_register()).
 */
static void __net_exit selinux_nf_unregister(struct net *net)
{
        nf_unregister_net_hooks(net, selinux_nf_ops,
                                ARRAY_SIZE(selinux_nf_ops));
}

/*
 * Pernet operations pairing the netfilter hook registration and
 * unregistration helpers; registered by selinux_nf_ip_init().
 */
static struct pernet_operations selinux_net_ops = {
        .init = selinux_nf_register,
        .exit = selinux_nf_unregister,
};

/*
 * Initcall that registers selinux_net_ops as a pernet subsystem so the
 * SELinux netfilter hooks get installed per network namespace.  Nothing is
 * registered when SELinux is disabled at boot.  A registration failure is
 * fatal (panic).  Always returns 0.
 */
static int __init selinux_nf_ip_init(void)
{
        int rc;

        if (selinux_enabled_boot) {
                pr_debug("SELinux:  Registering netfilter hooks\n");

                rc = register_pernet_subsys(&selinux_net_ops);
                if (rc != 0)
                        panic("SELinux: register_pernet_subsys: error %d\n",
                              rc);
        }

        return 0;
}
__initcall(selinux_nf_ip_init);
#endif /* CONFIG_NETFILTER */
































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TTY_H
#define _LINUX_TTY_H

#include <linux/fs.h>
#include <linux/major.h>
#include <linux/termios.h>
#include <linux/workqueue.h>
#include <linux/tty_driver.h>
#include <linux/tty_ldisc.h>
#include <linux/tty_port.h>
#include <linux/mutex.h>
#include <linux/tty_flags.h>
#include <uapi/linux/tty.h>
#include <linux/rwsem.h>
#include <linux/llist.h>


/*
 * Limits on the number of Unix98 ptys.
 *
 * (Note: the *_driver.minor_start values 1, 64, 128, 192 are
 * hardcoded at present.)
 */
#define NR_UNIX98_PTY_DEFAULT        4096      /* Default maximum for Unix98 ptys */
#define NR_UNIX98_PTY_RESERVE        1024          /* Default reserve for main devpts */
#define NR_UNIX98_PTY_MAX        (1 << MINORBITS) /* Absolute limit */

/*
 * This character is the same as _POSIX_VDISABLE: it cannot be used as
 * a c_cc[] character, but indicates that a particular special character
 * isn't in use (eg VINTR has no character etc)
 */
#define __DISABLED_CHAR '\0'

/*
 * Accessors for the special control characters of a tty: each expands to
 * the matching slot of (tty)->termios.c_cc[].
 */
#define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR])
#define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT])
#define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE])
#define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL])
#define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF])
#define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME])
#define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN])
#define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC])
#define START_CHAR(tty) ((tty)->termios.c_cc[VSTART])
#define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP])
#define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP])
#define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL])
#define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT])
#define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD])
#define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE])
#define LNEXT_CHAR(tty)        ((tty)->termios.c_cc[VLNEXT])
#define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2])

/*
 * Mask helpers: AND flag(s) @f against the input, output, control or local
 * mode word of (tty)->termios.  The result is the masked value, i.e.
 * non-zero when any requested bit is set.
 */
#define _I_FLAG(tty, f)        ((tty)->termios.c_iflag & (f))
#define _O_FLAG(tty, f)        ((tty)->termios.c_oflag & (f))
#define _C_FLAG(tty, f)        ((tty)->termios.c_cflag & (f))
#define _L_FLAG(tty, f)        ((tty)->termios.c_lflag & (f))

/* Input-mode tests: each masks one flag out of termios.c_iflag. */
#define I_IGNBRK(tty)        _I_FLAG((tty), IGNBRK)
#define I_BRKINT(tty)        _I_FLAG((tty), BRKINT)
#define I_IGNPAR(tty)        _I_FLAG((tty), IGNPAR)
#define I_PARMRK(tty)        _I_FLAG((tty), PARMRK)
#define I_INPCK(tty)        _I_FLAG((tty), INPCK)
#define I_ISTRIP(tty)        _I_FLAG((tty), ISTRIP)
#define I_INLCR(tty)        _I_FLAG((tty), INLCR)
#define I_IGNCR(tty)        _I_FLAG((tty), IGNCR)
#define I_ICRNL(tty)        _I_FLAG((tty), ICRNL)
#define I_IUCLC(tty)        _I_FLAG((tty), IUCLC)
#define I_IXON(tty)        _I_FLAG((tty), IXON)
#define I_IXANY(tty)        _I_FLAG((tty), IXANY)
#define I_IXOFF(tty)        _I_FLAG((tty), IXOFF)
#define I_IMAXBEL(tty)        _I_FLAG((tty), IMAXBEL)
#define I_IUTF8(tty)        _I_FLAG((tty), IUTF8)

/*
 * Output-mode tests: each masks one flag (or multi-bit field such as the
 * *DLY masks) out of termios.c_oflag.
 */
#define O_OPOST(tty)        _O_FLAG((tty), OPOST)
#define O_OLCUC(tty)        _O_FLAG((tty), OLCUC)
#define O_ONLCR(tty)        _O_FLAG((tty), ONLCR)
#define O_OCRNL(tty)        _O_FLAG((tty), OCRNL)
#define O_ONOCR(tty)        _O_FLAG((tty), ONOCR)
#define O_ONLRET(tty)        _O_FLAG((tty), ONLRET)
#define O_OFILL(tty)        _O_FLAG((tty), OFILL)
#define O_OFDEL(tty)        _O_FLAG((tty), OFDEL)
#define O_NLDLY(tty)        _O_FLAG((tty), NLDLY)
#define O_CRDLY(tty)        _O_FLAG((tty), CRDLY)
#define O_TABDLY(tty)        _O_FLAG((tty), TABDLY)
#define O_BSDLY(tty)        _O_FLAG((tty), BSDLY)
#define O_VTDLY(tty)        _O_FLAG((tty), VTDLY)
#define O_FFDLY(tty)        _O_FLAG((tty), FFDLY)

/* Control-mode tests: each masks one flag or field out of termios.c_cflag. */
#define C_BAUD(tty)        _C_FLAG((tty), CBAUD)
#define C_CSIZE(tty)        _C_FLAG((tty), CSIZE)
#define C_CSTOPB(tty)        _C_FLAG((tty), CSTOPB)
#define C_CREAD(tty)        _C_FLAG((tty), CREAD)
#define C_PARENB(tty)        _C_FLAG((tty), PARENB)
#define C_PARODD(tty)        _C_FLAG((tty), PARODD)
#define C_HUPCL(tty)        _C_FLAG((tty), HUPCL)
#define C_CLOCAL(tty)        _C_FLAG((tty), CLOCAL)
#define C_CIBAUD(tty)        _C_FLAG((tty), CIBAUD)
#define C_CRTSCTS(tty)        _C_FLAG((tty), CRTSCTS)
#define C_CMSPAR(tty)        _C_FLAG((tty), CMSPAR)

/* Local-mode tests: each masks one flag out of termios.c_lflag. */
#define L_ISIG(tty)        _L_FLAG((tty), ISIG)
#define L_ICANON(tty)        _L_FLAG((tty), ICANON)
#define L_XCASE(tty)        _L_FLAG((tty), XCASE)
#define L_ECHO(tty)        _L_FLAG((tty), ECHO)
#define L_ECHOE(tty)        _L_FLAG((tty), ECHOE)
#define L_ECHOK(tty)        _L_FLAG((tty), ECHOK)
#define L_ECHONL(tty)        _L_FLAG((tty), ECHONL)
#define L_NOFLSH(tty)        _L_FLAG((tty), NOFLSH)
#define L_TOSTOP(tty)        _L_FLAG((tty), TOSTOP)
#define L_ECHOCTL(tty)        _L_FLAG((tty), ECHOCTL)
#define L_ECHOPRT(tty)        _L_FLAG((tty), ECHOPRT)
#define L_ECHOKE(tty)        _L_FLAG((tty), ECHOKE)
#define L_FLUSHO(tty)        _L_FLAG((tty), FLUSHO)
#define L_PENDIN(tty)        _L_FLAG((tty), PENDIN)
#define L_IEXTEN(tty)        _L_FLAG((tty), IEXTEN)
#define L_EXTPROC(tty)        _L_FLAG((tty), EXTPROC)

struct device;
struct signal_struct;
struct tty_operations;

/**
 * struct tty_struct - state associated with a tty while open
 *
 * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero
 *          frees the structure
 * @dev: class device or %NULL (e.g. ptys, serdev)
 * @driver: &struct tty_driver operating this tty
 * @ops: &struct tty_operations of @driver for this tty (open, close, etc.)
 * @index: index of this tty (e.g. to construct @name like tty12)
 * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty
 * @ldisc: the current line discipline for this tty (n_tty by default)
 * @atomic_write_lock: protects against concurrent writers, i.e. locks
 *                       @write_cnt, @write_buf and similar
 * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex),
 *                  protecting several operations on this tty
 * @throttle_mutex: protects against concurrent tty_throttle_safe() and
 *                    tty_unthrottle_safe() (but not tty_unthrottle())
 * @termios_rwsem: protects @termios and @termios_locked
 * @winsize_mutex: protects @winsize
 * @termios: termios for the current tty, copied from/to @driver.termios
 * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS
 *                    ioctls)
 * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3)
 * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ...
 * @count: count of open processes, reaching zero cancels all the work for
 *           this tty and drops a @kref too (but does not free this tty)
 * @winsize: size of the terminal "window" (cf. @winsize_mutex)
 * @flow: flow settings grouped together
 * @flow.lock: lock for @flow members
 * @flow.stopped: tty stopped/started by stop_tty()/start_tty()
 * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has
 *                      precedence over @flow.stopped)
 * @ctrl: control settings grouped together
 * @ctrl.lock: lock for @ctrl members
 * @ctrl.pgrp: process group of this tty (setpgrp(2))
 * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both
 *                  @ctrl.lock and @legacy_mutex, readers must use at least one of
 *                  them.
 * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants)
 * @ctrl.packet: packet mode enabled
 * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS
 *                handling
 * @receive_room: bytes permitted to feed to @ldisc without any being lost
 * @flow_change: controls behavior of throttling, see tty_throttle_safe() and
 *                 tty_unthrottle_safe()
 * @link: link to another pty (master -> slave and vice versa)
 * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper()
 * @write_wait: concurrent writers are waiting in this queue until they are
 *                allowed to write
 * @read_wait: readers wait for data in this queue
 * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while
 *                 freeing the tty, (re)used to release_one_tty()
 * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data)
 * @driver_data: pointer to @driver's private data (e.g. &struct uart_state)
 * @files_lock:        protects @tty_files list
 * @tty_files: list of (re)openers of this tty (i.e. linked &struct
 *               tty_file_private)
 * @closing: when set during close, n_tty processes only START & STOP chars
 * @write_buf: temporary buffer used during tty_write() to copy user data to
 * @write_cnt: count of bytes written in tty_write() to @write_buf
 * @SAK_work: if the tty has a pending do_SAK, it is queued here
 * @port: persistent storage for this device (i.e. &struct tty_port)
 *
 * All of the state associated with a tty while the tty is open. Persistent
 * storage for tty devices is referenced here as @port and is documented in
 * &struct tty_port.
 */
struct tty_struct {
        /* Identity: refcount, index and the driver/port objects behind it. */
        struct kref kref;
        int index;
        struct device *dev;
        struct tty_driver *driver;
        struct tty_port *port;
        const struct tty_operations *ops;

        /* Current line discipline and the semaphore guarding its change. */
        struct tty_ldisc *ldisc;
        struct ld_semaphore ldisc_sem;

        /* Locks, followed by the termios/winsize state they protect. */
        struct mutex atomic_write_lock;
        struct mutex legacy_mutex;
        struct mutex throttle_mutex;
        struct rw_semaphore termios_rwsem;
        struct mutex winsize_mutex;
        struct ktermios termios, termios_locked;
        char name[64];
        unsigned long flags;        /* bits from enum tty_struct_flags */
        int count;
        unsigned int receive_room;
        struct winsize winsize;

        /* Flow-control state; members are guarded by flow.lock. */
        struct {
                spinlock_t lock;
                bool stopped;
                bool tco_stopped;
        } flow;

        /* Job-control and packet-mode state; guarded by ctrl.lock. */
        struct {
                struct pid *pgrp;
                struct pid *session;
                spinlock_t lock;
                unsigned char pktstatus;
                bool packet;
        } ctrl;

        bool hw_stopped;
        bool closing;
        int flow_change;

        struct tty_struct *link;        /* other end of a pty pair */
        struct fasync_struct *fasync;
        wait_queue_head_t write_wait;
        wait_queue_head_t read_wait;
        struct work_struct hangup_work;
        void *disc_data;                /* owned by the line discipline */
        void *driver_data;                /* owned by the driver */
        spinlock_t files_lock;                /* protects tty_files */
        int write_cnt;
        u8 *write_buf;

        struct list_head tty_files;

        struct work_struct SAK_work;
} __randomize_layout;

/* Each of a tty's open files has private_data pointing to tty_file_private */
struct tty_file_private {
        struct tty_struct *tty;                /* the tty this opener refers to */
        struct file *file;                /* the open file itself */
        struct list_head list;                /* link in tty_struct.tty_files */
};

/**
 * enum tty_struct_flags - TTY Struct Flags
 *
 * These bits are used in the :c:member:`tty_struct.flags` field.
 *
 * So that interrupts won't be able to mess up the queues,
 * copy_to_cooked must be atomic with respect to itself, as must
 * tty->write.  Thus, you must use the inline functions set_bit() and
 * clear_bit() to make things atomic.
 *
 * @TTY_THROTTLED:
 *        Driver input is throttled. The ldisc should call
 *        :c:member:`tty_driver.unthrottle()` in order to resume reception when
 *        it is ready to process more data (at threshold min).
 *
 * @TTY_IO_ERROR:
 *        If set, causes all subsequent userspace read/write calls on the tty to
 *        fail, returning -%EIO. (May be no ldisc too.)
 *
 * @TTY_OTHER_CLOSED:
 *        Device is a pty and the other side has closed.
 *
 * @TTY_EXCLUSIVE:
 *        Exclusive open mode (a single opener).
 *
 * @TTY_DO_WRITE_WAKEUP:
 *        If set, causes the driver to call the
 *        :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume
 *        transmission when it can accept more data to transmit.
 *
 * @TTY_LDISC_OPEN:
 *        Indicates that a line discipline is open. For debugging purposes only.
 *
 * @TTY_PTY_LOCK:
 *        A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic.
 *
 * @TTY_NO_WRITE_SPLIT:
 *        Prevent driver from splitting up writes into smaller chunks (preserve
 *        write boundaries to driver).
 *
 * @TTY_HUPPED:
 *        The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`.
 *
 * @TTY_HUPPING:
 *        The TTY is in the process of hanging up to abort potential readers.
 *
 * @TTY_LDISC_CHANGING:
 *        Line discipline for this TTY is being changed. I/O should not block
 *        when this is set. Use tty_io_nonblock() to check.
 *
 * @TTY_LDISC_HALTED:
 *        Line discipline for this TTY was stopped. No work should be queued to
 *        this ldisc.
 */
enum tty_struct_flags {
        /*
         * These are bit numbers, not masks: use them with set_bit(),
         * clear_bit() and test_bit() on tty_struct.flags.
         */
        TTY_THROTTLED,
        TTY_IO_ERROR,
        TTY_OTHER_CLOSED,
        TTY_EXCLUSIVE,
        TTY_DO_WRITE_WAKEUP,
        TTY_LDISC_OPEN,
        TTY_PTY_LOCK,
        TTY_NO_WRITE_SPLIT,
        TTY_HUPPED,
        TTY_HUPPING,
        TTY_LDISC_CHANGING,
        TTY_LDISC_HALTED,
};

/*
 * Report whether I/O on @tty through @file must not block: true when @file
 * was opened with O_NONBLOCK, or when the TTY_LDISC_CHANGING bit is set in
 * @tty's flags.
 */
static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file)
{
        if (file->f_flags & O_NONBLOCK)
                return true;

        return test_bit(TTY_LDISC_CHANGING, &tty->flags);
}

/* Return true when the TTY_IO_ERROR bit is set in @tty's flags. */
static inline bool tty_io_error(struct tty_struct *tty)
{
        return test_bit(TTY_IO_ERROR, &tty->flags);
}

/* Return true when the TTY_THROTTLED bit is set in @tty's flags. */
static inline bool tty_throttled(struct tty_struct *tty)
{
        return test_bit(TTY_THROTTLED, &tty->flags);
}

/*
 * Core tty entry points.  With CONFIG_TTY enabled these are ordinary
 * declarations; without it each one has an inline stub so callers still
 * build: void functions do nothing, pointer getters return NULL,
 * tty_devnum() and tty_init() return 0, tty_name() returns "(none)", the
 * tty_kopen_*() helpers return ERR_PTR(-ENODEV) and
 * tty_dev_name_to_number() returns -ENOTSUPP.
 */
#ifdef CONFIG_TTY
void tty_kref_put(struct tty_struct *tty);
struct pid *tty_get_pgrp(struct tty_struct *tty);
void tty_vhangup_self(void);
void disassociate_ctty(int priv);
dev_t tty_devnum(struct tty_struct *tty);
void proc_clear_tty(struct task_struct *p);
struct tty_struct *get_current_tty(void);
/* tty_io.c */
int __init tty_init(void);
const char *tty_name(const struct tty_struct *tty);
struct tty_struct *tty_kopen_exclusive(dev_t device);
struct tty_struct *tty_kopen_shared(dev_t device);
void tty_kclose(struct tty_struct *tty);
int tty_dev_name_to_number(const char *name, dev_t *number);
#else
static inline void tty_kref_put(struct tty_struct *tty)
{ }
static inline struct pid *tty_get_pgrp(struct tty_struct *tty)
{ return NULL; }
static inline void tty_vhangup_self(void)
{ }
static inline void disassociate_ctty(int priv)
{ }
static inline dev_t tty_devnum(struct tty_struct *tty)
{ return 0; }
static inline void proc_clear_tty(struct task_struct *p)
{ }
static inline struct tty_struct *get_current_tty(void)
{ return NULL; }
/* tty_io.c */
static inline int __init tty_init(void)
{ return 0; }
static inline const char *tty_name(const struct tty_struct *tty)
{ return "(none)"; }
static inline struct tty_struct *tty_kopen_exclusive(dev_t device)
{ return ERR_PTR(-ENODEV); }
/* Mirrors tty_kopen_exclusive(): no tty can be opened without CONFIG_TTY. */
static inline struct tty_struct *tty_kopen_shared(dev_t device)
{ return ERR_PTR(-ENODEV); }
static inline void tty_kclose(struct tty_struct *tty)
{ }
static inline int tty_dev_name_to_number(const char *name, dev_t *number)
{ return -ENOTSUPP; }
#endif

extern struct ktermios tty_std_termios;

int vcs_init(void);

extern const struct class tty_class;

/**
 * tty_kref_get - get a tty reference
 * @tty: tty device (may be %NULL)
 *
 * Takes an extra reference on @tty's kref. A %NULL @tty is passed through
 * untouched.
 *
 * Returns: @tty, now holding one more reference (or %NULL if @tty was %NULL)
 *
 * Locking: The caller must hold sufficient locks/counts to ensure that their
 * existing reference cannot go away.
 */
static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
{
        if (!tty)
                return NULL;

        kref_get(&tty->kref);
        return tty;
}

const char *tty_driver_name(const struct tty_struct *tty);
void tty_wait_until_sent(struct tty_struct *tty, long timeout);
void stop_tty(struct tty_struct *tty);
void start_tty(struct tty_struct *tty);
void tty_write_message(struct tty_struct *tty, char *msg);
int tty_send_xchar(struct tty_struct *tty, u8 ch);
int tty_put_char(struct tty_struct *tty, u8 c);
unsigned int tty_chars_in_buffer(struct tty_struct *tty);
unsigned int tty_write_room(struct tty_struct *tty);
void tty_driver_flush_buffer(struct tty_struct *tty);
void tty_unthrottle(struct tty_struct *tty);
bool tty_throttle_safe(struct tty_struct *tty);
bool tty_unthrottle_safe(struct tty_struct *tty);
int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
int tty_get_icount(struct tty_struct *tty,
                struct serial_icounter_struct *icount);
int tty_get_tiocm(struct tty_struct *tty);
int is_current_pgrp_orphaned(void);
void tty_hangup(struct tty_struct *tty);
void tty_vhangup(struct tty_struct *tty);
int tty_hung_up_p(struct file *filp);
void do_SAK(struct tty_struct *tty);
void __do_SAK(struct tty_struct *tty);
void no_tty(void);
speed_t tty_termios_baud_rate(const struct ktermios *termios);
void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud,
                speed_t obaud);
void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud,
                speed_t obaud);

/**
 * tty_get_baud_rate - get tty bit rates
 * @tty: tty to query
 *
 * Passes @tty's current termios to tty_termios_baud_rate().
 *
 * Returns: the baud rate as an integer for this terminal
 *
 * Locking: The termios lock must be held by the caller.
 */
static inline speed_t tty_get_baud_rate(const struct tty_struct *tty)
{
        const struct ktermios *cur = &tty->termios;

        return tty_termios_baud_rate(cur);
}

unsigned char tty_get_char_size(unsigned int cflag);
unsigned char tty_get_frame_size(unsigned int cflag);

void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old);
bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b);
int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);

void tty_wakeup(struct tty_struct *tty);

int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg);
int tty_perform_flush(struct tty_struct *tty, unsigned long arg);
struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx);
void tty_release_struct(struct tty_struct *tty, int idx);
void tty_init_termios(struct tty_struct *tty);
void tty_save_termios(struct tty_struct *tty);
int tty_standard_install(struct tty_driver *driver,
                struct tty_struct *tty);

extern struct mutex tty_mutex;

/* n_tty.c */
void n_tty_inherit_ops(struct tty_ldisc_ops *ops);
#ifdef CONFIG_TTY
void __init n_tty_init(void);
#else
/* Without CONFIG_TTY, n_tty_init() is an empty inline stub. */
static inline void n_tty_init(void) { }
#endif

/* tty_audit.c */
#ifdef CONFIG_AUDIT
void tty_audit_exit(void);
void tty_audit_fork(struct signal_struct *sig);
int tty_audit_push(void);
#else
/* Without CONFIG_AUDIT the tty audit entry points are no-op stubs. */
static inline void tty_audit_exit(void)
{ }
static inline void tty_audit_fork(struct signal_struct *sig)
{ }
static inline int tty_audit_push(void)
{ return 0; }
#endif

/* tty_ioctl.c */
int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd,
                unsigned long arg);

/* vt.c */

int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg);

long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd,
                unsigned long arg);

/* tty_mutex.c */
/* functions for preparation of BKL removal */
void tty_lock(struct tty_struct *tty);
int  tty_lock_interruptible(struct tty_struct *tty);
void tty_unlock(struct tty_struct *tty);
void tty_lock_slave(struct tty_struct *tty);
void tty_unlock_slave(struct tty_struct *tty);
void tty_set_lock_subclass(struct tty_struct *tty);

#endif






















































































































































































































    4 



































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PTRACE_H
#define _ASM_X86_PTRACE_H

#include <asm/segment.h>
#include <asm/page_types.h>
#include <uapi/asm/ptrace.h>

#ifndef __ASSEMBLER__
#ifdef __i386__

/* Saved register frame for 32-bit x86 (this is the __i386__ variant). */
struct pt_regs {
        /*
         * NB: 32-bit x86 CPUs are inconsistent as what happens in the
         * following cases (where %seg represents a segment register):
         *
         * - pushl %seg: some do a 16-bit write and leave the high
         *   bits alone
         * - movl %seg, [mem]: some do a 16-bit write despite the movl
         * - IDT entry: some (e.g. 486) will leave the high bits of CS
         *   and (if applicable) SS undefined.
         *
         * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
         * so we can just treat all of the segment registers as 16-bit
         * values.
         */
        unsigned long bx;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
        unsigned long bp;
        unsigned long ax;
        /*
         * Each segment register is a 16-bit value followed by a 16-bit
         * __*h member; presumably the latter covers the unreliable high
         * half of the slot described above.
         */
        unsigned short ds;
        unsigned short __dsh;
        unsigned short es;
        unsigned short __esh;
        unsigned short fs;
        unsigned short __fsh;
        /*
         * On interrupt, gs and __gsh store the vector number.  They never
         * store gs any more.
         */
        unsigned short gs;
        unsigned short __gsh;
        /* On interrupt, this is the error code. */
        unsigned long orig_ax;
        unsigned long ip;
        unsigned short cs;
        unsigned short __csh;
        unsigned long flags;
        unsigned long sp;
        unsigned short ss;
        unsigned short __ssh;
};

#else /* __i386__ */

/*
 * Bitfield view of the 64-bit CS slot in struct pt_regs (overlaid there
 * with cs/csx).  The fields, including the unnamed 45-bit remainder, add
 * up to 64 bits.
 */
struct fred_cs {
                /* CS selector */
        u64        cs        : 16,
                /* Stack level at event time */
                sl        :  2,
                /* IBT in WAIT_FOR_ENDBRANCH state */
                wfe        :  1,
                        : 45;
};

/*
 * Bitfield view of the 64-bit SS slot in struct pt_regs (overlaid there
 * with ss/ssx).  Besides the selector it carries event information; the
 * unnamed bitfields are gaps, and all fields together add up to 64 bits.
 */
struct fred_ss {
                /* SS selector */
        u64        ss        : 16,
                /* STI state */
                sti        :  1,
                /* Set if syscall, sysenter or INT n */
                swevent        :  1,
                /* Event is NMI type */
                nmi        :  1,
                        : 13,
                /* Event vector */
                vector        :  8,
                        :  8,
                /* Event type */
                type        :  4,
                        :  4,
                /* Event was incident to enclave execution */
                enclave        :  1,
                /* CPU was in long mode */
                lm        :  1,
                /*
                 * Nested exception during FRED delivery, not set
                 * for #DF.
                 */
                nested        :  1,
                        :  1,
                /*
                 * The length of the instruction causing the event.
                 * Only set for INTO, INT1, INT3, INT n, SYSCALL
                 * and SYSENTER.  0 otherwise.
                 */
                insnlen        :  4;
};

/*
 * Saved register frame used when __i386__ is not defined.
 *
 * Every slot is either an unsigned long or an anonymous union that
 * contains a u64 member, so the CS and SS selectors each occupy a full
 * 64-bit slot and can alternatively be viewed through the FRED bitfield
 * structs.
 */
struct pt_regs {
        /*
         * C ABI says these regs are callee-preserved. They aren't saved on
         * kernel entry unless syscall needs a complete, fully filled
         * "struct pt_regs".
         */
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
        unsigned long bp;
        unsigned long bx;

        /* These regs are callee-clobbered. Always saved on kernel entry. */
        unsigned long r11;
        unsigned long r10;
        unsigned long r9;
        unsigned long r8;
        unsigned long ax;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;

        /*
         * orig_ax is used on entry for:
         * - the syscall number (syscall, sysenter, int80)
         * - error_code stored by the CPU on traps and exceptions
         * - the interrupt number for device interrupts
         *
         * A FRED stack frame starts here:
         *   1) It _always_ includes an error code;
         *
         *   2) The return frame for ERET[US] starts here, but
         *      the content of orig_ax is ignored.
         */
        unsigned long orig_ax;

        /* The IRETQ return frame starts here */
        unsigned long ip;

        /* Three overlapping views of the same 64-bit CS slot. */
        union {
                /* CS selector */
                u16                cs;
                /* The extended 64-bit data slot containing CS */
                u64                csx;
                /* The FRED CS extension */
                struct fred_cs        fred_cs;
        };

        unsigned long flags;
        unsigned long sp;

        /* Three overlapping views of the same 64-bit SS slot. */
        union {
                /* SS selector */
                u16                ss;
                /* The extended 64-bit data slot containing SS */
                u64                ssx;
                /* The FRED SS extension */
                struct fred_ss        fred_ss;
        };

        /*
         * Top of stack on IDT systems, while FRED systems have extra fields
         * defined above for storing exception related information, e.g. CR2 or
         * DR6.
         */
};

#endif /* !__i386__ */

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt_types.h>
#endif

#include <asm/proto.h>

struct cpuinfo_x86;
struct task_struct;

/* Implemented out of line; only the prototype is visible in this header. */
extern unsigned long profile_pc(struct pt_regs *regs);

/* Out-of-line helpers operating on a pt_regs; see their definitions for semantics. */
extern unsigned long
convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code);


/* Read the ax slot of @regs, which this header treats as the return value. */
static inline unsigned long regs_return_value(struct pt_regs *regs)
{
        const unsigned long rv = regs->ax;

        return rv;
}

/* Store @rc into the ax slot of @regs, the counterpart of regs_return_value(). */
static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
{
        unsigned long *ax_slot = &regs->ax;

        *ax_slot = rc;
}

/*
 * user_mode(regs) determines whether a register set came from user
 * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
 * register set was from protected mode with RPL-3 CS value.  This
 * tricky test checks that with one comparison.
 *
 * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
 * the extra check.
 */
static __always_inline int user_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
        /* Fold the CS RPL bits and the EFLAGS VM bit into a single value. */
        const unsigned long rpl_or_vm = (regs->cs & SEGMENT_RPL_MASK) |
                                        (regs->flags & X86_VM_MASK);

        return rpl_or_vm >= USER_RPL;
#else
        /* Any non-zero RPL in CS counts as user mode. */
        return (regs->cs & 3) != 0;
#endif
}

/*
 * Report whether @regs has the VM bit set in its saved flags.  Always 0
 * unless CONFIG_X86_32 is defined.
 */
static __always_inline int v8086_mode(struct pt_regs *regs)
{
        int vm = 0;        /* No V86 mode support in long mode */

#ifdef CONFIG_X86_32
        vm = (regs->flags & X86_VM_MASK);
#endif
        return vm;
}

/*
 * Report whether the CS selector saved in @regs is one of the selectors
 * accepted as 64-bit user mode.  Always false without CONFIG_X86_64.
 */
static inline bool user_64bit_mode(struct pt_regs *regs)
{
#if !defined(CONFIG_X86_64)
        return false;
#elif defined(CONFIG_PARAVIRT_XXL)
        /* Headers are too twisted for this to go in paravirt.h. */
        if (regs->cs == __USER_CS)
                return true;
        return regs->cs == pv_info.extra_user_64bit_cs;
#else
        /*
         * On non-paravirt systems, this is the only long mode CPL 3
         * selector.  We do not allow long mode selectors in the LDT.
         */
        return regs->cs == __USER_CS;
#endif
}

/*
 * Determine whether the register set came from any context that is running in
 * 64-bit mode.
 */
static inline bool any_64bit_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
        /* For user-mode frames the answer depends on the CS selector. */
        if (user_mode(regs))
                return user_64bit_mode(regs);

        /* Every non-user frame counts as 64-bit. */
        return true;
#else
        return false;
#endif
}

#ifdef CONFIG_X86_64
/*
 * Both the native and the compat accessor expand to the sp field of
 * current_pt_regs().
 */
#define current_user_stack_pointer()        current_pt_regs()->sp
#define compat_user_stack_pointer()        current_pt_regs()->sp

/*
 * Check whether the saved instruction pointer lies in one of the
 * half-open [start, end) entry/exit code ranges listed below.  The two
 * compat ranges are only considered with CONFIG_IA32_EMULATION.
 */
static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
{
        const unsigned long ip = regs->ip;

        if (ip >= (unsigned long)entry_SYSCALL_64 &&
            ip <  (unsigned long)entry_SYSCALL_64_safe_stack)
                return true;

        if (ip >= (unsigned long)entry_SYSRETQ_unsafe_stack &&
            ip <  (unsigned long)entry_SYSRETQ_end)
                return true;

#ifdef CONFIG_IA32_EMULATION
        if (ip >= (unsigned long)entry_SYSCALL_compat &&
            ip <  (unsigned long)entry_SYSCALL_compat_safe_stack)
                return true;

        if (ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack &&
            ip <  (unsigned long)entry_SYSRETL_compat_end)
                return true;
#endif

        return false;
}
#endif

/* Return the sp field saved in @regs. */
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
        const unsigned long saved_sp = regs->sp;

        return saved_sp;
}

/* Return the ip field saved in @regs. */
static inline unsigned long instruction_pointer(struct pt_regs *regs)
{
        const unsigned long saved_ip = regs->ip;

        return saved_ip;
}

/* Overwrite the ip field saved in @regs with @val. */
static inline void instruction_pointer_set(struct pt_regs *regs,
                                           unsigned long val)
{
        unsigned long *ip_slot = &regs->ip;

        *ip_slot = val;
}

/* Return the bp field saved in @regs. */
static inline unsigned long frame_pointer(struct pt_regs *regs)
{
        const unsigned long saved_bp = regs->bp;

        return saved_bp;
}

/* Return the sp field saved in @regs. */
static inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
        const unsigned long saved_sp = regs->sp;

        return saved_sp;
}

/* Overwrite the sp field saved in @regs with @val. */
static inline void user_stack_pointer_set(struct pt_regs *regs,
                                          unsigned long val)
{
        unsigned long *sp_slot = &regs->sp;

        *sp_slot = val;
}

/* True when the X86_EFLAGS_IF bit is clear in the flags saved in @regs. */
static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
{
        return (regs->flags & X86_EFLAGS_IF) == 0;
}

/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
extern const char *regs_query_register_name(unsigned int offset);
/* Largest valid register offset: the offset of the ss member of pt_regs. */
#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))

/**
 * regs_get_register() - get register value from its offset
 * @regs:        pt_regs from which register value is gotten.
 * @offset:        offset number of the register.
 *
 * regs_get_register() returns the value of a register. The @offset is the
 * byte offset of the register within the struct pt_regs pointed to by @regs.
 * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
 */
static inline unsigned long regs_get_register(struct pt_regs *regs,
                                              unsigned int offset)
{
        /* Address of the requested slot, kept as an integer. */
        const unsigned long slot = (unsigned long)regs + offset;

        if (unlikely(offset > MAX_REG_OFFSET))
                return 0;
#ifdef CONFIG_X86_32
        /* The selector fields are 16-bit. */
        switch (offset) {
        case offsetof(struct pt_regs, cs):
        case offsetof(struct pt_regs, ss):
        case offsetof(struct pt_regs, ds):
        case offsetof(struct pt_regs, es):
        case offsetof(struct pt_regs, fs):
        case offsetof(struct pt_regs, gs):
                return *(u16 *)slot;
        default:
                break;
        }
#endif
        return *(unsigned long *)slot;
}

/**
 * regs_within_kernel_stack() - check the address in the stack
 * @regs:        pt_regs which contains kernel stack pointer.
 * @addr:        address which is checked.
 *
 * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
 * If @addr is within the kernel stack, it returns true. If not, returns false.
 */
static inline int regs_within_kernel_stack(struct pt_regs *regs,
                                           unsigned long addr)
{
        /* Clears the low bits that index within one THREAD_SIZE region. */
        const unsigned long region_mask = ~(THREAD_SIZE - 1);

        return (addr & region_mask) == (regs->sp & region_mask);
}

/**
 * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
 * @regs:        pt_regs which contains kernel stack pointer.
 * @n:                stack entry number.
 *
 * regs_get_kernel_stack_nth_addr() returns the address of the @n th entry
 * of the kernel stack which is specified by @regs. If the @n th entry is
 * NOT in the kernel stack, this returns NULL.
 */
static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
{
        /* Entry @n counted in unsigned long slots from the saved sp. */
        unsigned long *slot = (unsigned long *)regs->sp + n;

        return regs_within_kernel_stack(regs, (unsigned long)slot) ? slot : NULL;
}

/* To avoid include hell, we can't include uaccess.h */
extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size);

/**
 * regs_get_kernel_stack_nth() - get Nth entry of the stack
 * @regs:        pt_regs which contains kernel stack pointer.
 * @n:                stack entry number.
 *
 * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
 * is specified by @regs. If the @n th entry is NOT in the kernel stack
 * this returns 0.
 */
static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
                                                      unsigned int n)
{
        unsigned long entry;
        unsigned long *slot = regs_get_kernel_stack_nth_addr(regs, n);

        /* Entry is outside the kernel stack. */
        if (!slot)
                return 0;

        /* A non-zero result from the copy is treated as failure. */
        if (copy_from_kernel_nofault(&entry, slot, sizeof(entry)))
                return 0;

        return entry;
}

/**
 * regs_get_kernel_argument() - get Nth function argument in kernel
 * @regs:        pt_regs of that context
 * @n:                function argument number (start from 0)
 *
 * regs_get_kernel_argument() returns the @n th argument of the function call.
 * Note that this chooses the most probable assignment; in some cases
 * it can be incorrect.
 * This is expected to be called from kprobes or ftrace with regs
 * where the top of stack is the return address.
 */
static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
                                                     unsigned int n)
{
        /* pt_regs offsets of the register-passed arguments, in order. */
        static const unsigned int argument_offs[] = {
#ifdef __i386__
                offsetof(struct pt_regs, ax),
                offsetof(struct pt_regs, dx),
                offsetof(struct pt_regs, cx),
#define NR_REG_ARGUMENTS 3
#else
                offsetof(struct pt_regs, di),
                offsetof(struct pt_regs, si),
                offsetof(struct pt_regs, dx),
                offsetof(struct pt_regs, cx),
                offsetof(struct pt_regs, r8),
                offsetof(struct pt_regs, r9),
#define NR_REG_ARGUMENTS 6
#endif
        };

        if (n < NR_REG_ARGUMENTS)
                return regs_get_register(regs, argument_offs[n]);

        /*
         * Remaining arguments are read from the stack.  The index is
         * biased by one because stack entry 0 is the return address.
         */
        return regs_get_kernel_stack_nth(regs, n - (NR_REG_ARGUMENTS - 1));
}

/* Single-stepping is reported as always available. */
#define arch_has_single_step()        (1)
/*
 * Block-stepping is unconditionally available with CONFIG_X86_DEBUGCTLMSR;
 * otherwise it depends on boot_cpu_data.x86 being at least 6.
 */
#ifdef CONFIG_X86_DEBUGCTLMSR
#define arch_has_block_step()        (1)
#else
#define arch_has_block_step()        (boot_cpu_data.x86 >= 6)
#endif

#define ARCH_HAS_USER_SINGLE_STEP_REPORT

struct user_desc;
/* Thread-area accessors implemented out of line. */
extern int do_get_thread_area(struct task_struct *p, int idx,
                              struct user_desc __user *info);
extern int do_set_thread_area(struct task_struct *p, int idx,
                              struct user_desc __user *info, int can_allocate);

/*
 * With CONFIG_X86_64 this forwards to do_arch_prctl_64(); otherwise it
 * evaluates to 0 without touching its arguments.
 */
#ifdef CONFIG_X86_64
# define do_set_thread_area_64(p, s, t)        do_arch_prctl_64(p, s, t)
#else
# define do_set_thread_area_64(p, s, t)        (0)
#endif

#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PTRACE_H */


























































































































































    1 
























































































































































































































































































































































































































































































































































































































































































































    1 
































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_CPUMASK_H
#define __LINUX_CPUMASK_H

/*
 * Cpumasks provide a bitmap suitable for representing the
 * set of CPUs in a system, one bit position per CPU number.  In general,
 * only nr_cpu_ids (<= NR_CPUS) bits are valid.
 */
#include <linux/cleanup.h>
#include <linux/kernel.h>
#include <linux/bitmap.h>
#include <linux/cpumask_types.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/gfp_types.h>
#include <linux/numa.h>

/**
 * cpumask_pr_args - printf args to output a cpumask
 * @maskp: cpumask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
 * Expands to two comma-separated arguments: nr_cpu_ids followed by
 * cpumask_bits(@maskp).
 */
#define cpumask_pr_args(maskp)                nr_cpu_ids, cpumask_bits(maskp)

/*
 * nr_cpu_ids is the compile-time constant NR_CPUS when NR_CPUS is 1 or
 * CONFIG_FORCE_NR_CPUS is set; otherwise it is a variable defined elsewhere.
 */
#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
#define nr_cpu_ids ((unsigned int)NR_CPUS)
#else
extern unsigned int nr_cpu_ids;
#endif

/*
 * Record @nr as the number of CPU ids.  When nr_cpu_ids is a compile-time
 * constant it cannot be assigned, so a mismatching @nr only triggers a
 * warning.
 */
static __always_inline void set_nr_cpu_ids(unsigned int nr)
{
#if (NR_CPUS != 1) && !defined(CONFIG_FORCE_NR_CPUS)
        nr_cpu_ids = nr;
#else
        WARN_ON(nr != nr_cpu_ids);
#endif
}

/*
 * We have several different "preferred sizes" for the cpumask
 * operations, depending on operation.
 *
 * For example, the bitmap scanning and operating operations have
 * optimized routines that work for the single-word case, but only when
 * the size is constant. So if NR_CPUS fits in one single word, we are
 * better off using that small constant, in order to trigger the
 * optimized bit finding. That is 'small_cpumask_size'.
 *
 * The clearing and copying operations will similarly perform better
 * with a constant size, but we limit that size arbitrarily to four
 * words. We call this 'large_cpumask_size'.
 *
 * Finally, some operations just want the exact limit, either because
 * they set bits or just don't have any faster fixed-sized versions. We
 * call this just 'nr_cpumask_bits'.
 *
 * Note that these optional constants are always guaranteed to be at
 * least as big as 'nr_cpu_ids' itself is, and all our cpumask
 * allocations are at least that size (see cpumask_size()). The
 * optimization comes from being able to potentially use a compile-time
 * constant instead of a run-time generated exact number of CPUs.
 */
#if NR_CPUS <= BITS_PER_LONG
  #define small_cpumask_bits ((unsigned int)NR_CPUS)
  #define large_cpumask_bits ((unsigned int)NR_CPUS)
#elif NR_CPUS <= 4*BITS_PER_LONG
  #define small_cpumask_bits nr_cpu_ids
  #define large_cpumask_bits ((unsigned int)NR_CPUS)
#else
  #define small_cpumask_bits nr_cpu_ids
  #define large_cpumask_bits nr_cpu_ids
#endif
#define nr_cpumask_bits nr_cpu_ids

/*
 * The following particular system cpumasks and operations manage
 * possible, present, active and online cpus.
 *
 *     cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
 *     cpu_present_mask - has bit 'cpu' set iff cpu is populated
 *     cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online
 *     cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
 *     cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
 *
 *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
 *
 *  The cpu_possible_mask is fixed at boot time, as the set of CPU IDs
 *  that might possibly ever be plugged in at any time during the
 *  life of that system boot.  The cpu_present_mask is dynamic(*),
 *  representing which CPUs are currently plugged in.  And
 *  cpu_online_mask is the dynamic subset of cpu_present_mask,
 *  indicating those CPUs available for scheduling.
 *
 *  If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
 *  depending on what ACPI reports as currently plugged in, otherwise
 *  cpu_present_mask is just a copy of cpu_possible_mask.
 *
 *  (*) Well, cpu_present_mask is dynamic in the hotplug case.  If not
 *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
 *
 * Subtleties:
 * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
 *    assumption that their single CPU is online.  The UP
 *    cpu_{online,possible,present}_masks are placebos.  Changing them
 *    will have no useful effect on the following num_*_cpus()
 *    and cpu_*() macros in the UP case.  This ugliness is a UP
 *    optimization - don't waste any instructions or memory references
 *    asking if you're online or how many CPUs there are if there is
 *    only one CPU.
 */

/* Underlying mask objects, defined elsewhere. */
extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_enabled_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
extern struct cpumask __cpu_dying_mask;
/* Read-only views: each macro casts the matching object to a const pointer. */
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
#define cpu_enabled_mask   ((const struct cpumask *)&__cpu_enabled_mask)
#define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
#define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)

extern atomic_t __num_online_cpus;

extern cpumask_t cpus_booted_once_mask;

/*
 * Debug aid: with CONFIG_DEBUG_PER_CPU_MAPS, warn once if @cpu is not
 * below @bits.  Without that option the body is empty.
 */
static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        WARN_ON_ONCE(cpu >= bits);
#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
}

/* verify cpu argument to cpumask_* operators */
static __always_inline unsigned int cpumask_check(unsigned int cpu)
{
        const unsigned int limit = small_cpumask_bits;

        /* Only warns; @cpu is handed back unchanged either way. */
        cpu_max_bits_warn(cpu, limit);
        return cpu;
}

/**
 * cpumask_first - get the first cpu in a cpumask
 * @srcp: the cpumask pointer
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
static __always_inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* Lowest set bit within the first small_cpumask_bits bits. */
        return find_first_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_first_zero - get the first unset cpu in a cpumask
 * @srcp: the cpumask pointer
 *
 * Return: >= nr_cpu_ids if all cpus are set.
 */
static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* Lowest clear bit within the first small_cpumask_bits bits. */
        return find_first_zero_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_first_and - return the first cpu from *srcp1 & *srcp2
 * @srcp1: the first input
 * @srcp2: the second input
 *
 * Return: >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
 */
static __always_inline
unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);

        /* Lowest bit set in both bitmaps. */
        return find_first_and_bit(bits1, bits2, small_cpumask_bits);
}

/**
 * cpumask_first_andnot - return the first cpu from *srcp1 & ~*srcp2
 * @srcp1: the first input
 * @srcp2: the second input
 *
 * Return: >= nr_cpu_ids if no such cpu found.
 */
static __always_inline
unsigned int cpumask_first_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2)
{
        const unsigned long *wanted = cpumask_bits(srcp1);
        const unsigned long *excluded = cpumask_bits(srcp2);

        /* Lowest bit set in the first bitmap and clear in the second. */
        return find_first_andnot_bit(wanted, excluded, small_cpumask_bits);
}

/**
 * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3
 * @srcp1: the first input
 * @srcp2: the second input
 * @srcp3: the third input
 *
 * Return: >= nr_cpu_ids if no cpus set in all.
 */
static __always_inline
unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
                                   const struct cpumask *srcp2,
                                   const struct cpumask *srcp3)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);
        const unsigned long *bits3 = cpumask_bits(srcp3);

        /* Lowest bit set in all three bitmaps. */
        return find_first_and_and_bit(bits1, bits2, bits3, small_cpumask_bits);
}

/**
 * cpumask_last - get the last CPU in a cpumask
 * @srcp:        - the cpumask pointer
 *
 * Return:        >= nr_cpumask_bits if no CPUs set.
 */
static __always_inline unsigned int cpumask_last(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* Highest set bit within the first small_cpumask_bits bits. */
        return find_last_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_next - get the next cpu in a cpumask
 * @n: the cpu prior to the place to search (i.e. return will be > @n)
 * @srcp: the cpumask pointer
 *
 * Return: >= nr_cpu_ids if no further cpus set.
 */
static __always_inline
unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* -1 means "start from the beginning" and skips the range check. */
        if (n != -1)
                cpumask_check(n);

        return find_next_bit(bits, small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_zero - get the next unset cpu in a cpumask
 * @n: the cpu prior to the place to search (i.e. return will be > @n)
 * @srcp: the cpumask pointer
 *
 * Return: >= nr_cpu_ids if no further cpus unset.
 */
static __always_inline
unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* -1 means "start from the beginning" and skips the range check. */
        if (n != -1)
                cpumask_check(n);

        return find_next_zero_bit(bits, small_cpumask_bits, n + 1);
}

#if NR_CPUS == 1
/* Uniprocessor: there is only one valid CPU */
static __always_inline
unsigned int cpumask_local_spread(unsigned int i, int node)
{
        /* With NR_CPUS == 1 the answer is CPU 0 whatever @i and @node are. */
        const unsigned int only_cpu = 0;

        return only_cpu;
}

static __always_inline
unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                                        const struct cpumask *src2p)
{
        /* Uniprocessor variant: simply the first CPU set in both masks. */
        const unsigned int cpu = cpumask_first_and(src1p, src2p);

        return cpu;
}

static __always_inline
unsigned int cpumask_any_distribute(const struct cpumask *srcp)
{
        /* Uniprocessor variant: simply the first CPU set in the mask. */
        const unsigned int cpu = cpumask_first(srcp);

        return cpu;
}
#else
/* NR_CPUS != 1: these three are implemented out of line, not in this header. */
unsigned int cpumask_local_spread(unsigned int i, int node);
unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p);
unsigned int cpumask_any_distribute(const struct cpumask *srcp);
#endif /* NR_CPUS */

/**
 * cpumask_next_and - get the next cpu in *src1p & *src2p
 * @n: the cpu prior to the place to search (i.e. return will be > @n)
 * @src1p: the first cpumask pointer
 * @src2p: the second cpumask pointer
 *
 * Return: >= nr_cpu_ids if no further cpus set in both.
 */
static __always_inline
unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        const unsigned long *bits1 = cpumask_bits(src1p);
        const unsigned long *bits2 = cpumask_bits(src2p);

        /* -1 means "start from the beginning" and skips the range check. */
        if (n != -1)
                cpumask_check(n);

        return find_next_and_bit(bits1, bits2, small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_andnot - get the next cpu in *src1p & ~*src2p
 * @n: the cpu prior to the place to search (i.e. return will be > @n)
 * @src1p: the first cpumask pointer
 * @src2p: the second cpumask pointer
 *
 * Return: >= nr_cpu_ids if no further cpus set in *src1p & ~*src2p.
 */
static __always_inline
unsigned int cpumask_next_andnot(int n, const struct cpumask *src1p,
                                 const struct cpumask *src2p)
{
        const unsigned long *wanted = cpumask_bits(src1p);
        const unsigned long *excluded = cpumask_bits(src2p);

        /* -1 means "start from the beginning" and skips the range check. */
        if (n != -1)
                cpumask_check(n);

        return find_next_andnot_bit(wanted, excluded, small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from
 *                           @n+1. If nothing found, wrap around and start from
 *                           the beginning
 * @n: the cpu prior to the place to search (i.e. search starts from @n+1)
 * @src1p: the first cpumask pointer
 * @src2p: the second cpumask pointer
 *
 * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty.
 */
static __always_inline
unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        const unsigned long *bits1 = cpumask_bits(src1p);
        const unsigned long *bits2 = cpumask_bits(src2p);

        /* -1 means "start from the beginning" and skips the range check. */
        if (n != -1)
                cpumask_check(n);

        return find_next_and_bit_wrap(bits1, bits2, small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing
 *                       found, wrap around and start from the beginning
 * @n: the cpu prior to the place to search (i.e. search starts from @n+1)
 * @src: cpumask pointer
 *
 * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty.
 */
static __always_inline
unsigned int cpumask_next_wrap(int n, const struct cpumask *src)
{
        const unsigned long *bits = cpumask_bits(src);

        /* -1 means "start from the beginning" and skips the range check. */
        if (n != -1)
                cpumask_check(n);

        return find_next_bit_wrap(bits, small_cpumask_bits, n + 1);
}

/**
 * cpumask_random - get random cpu in *src.
 * @src: cpumask pointer
 *
 * Return: random set bit, or >= nr_cpu_ids if @src is empty.
 */
static __always_inline
unsigned int cpumask_random(const struct cpumask *src)
{
        const unsigned long *bits = cpumask_bits(src);

        /* Uses the exact limit nr_cpu_ids rather than small_cpumask_bits. */
        return find_random_bit(bits, nr_cpu_ids);
}

/**
 * for_each_cpu - iterate over every cpu in a mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * Expands to for_each_set_bit() over cpumask_bits(@mask), bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu(cpu, mask)                                \
        for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)

/**
 * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 * @start: the start location
 *
 * Expands to for_each_set_bit_wrap() over cpumask_bits(@mask), bounded by
 * small_cpumask_bits.
 *
 * The implementation does not assume any bit in @mask is set (including @start).
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_wrap(cpu, mask, start)                                \
        for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start)

/**
 * for_each_cpu_and - iterate over every cpu in both masks
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_and(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Expands to for_each_and_bit() over both bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_and(cpu, mask1, mask2)                                \
        for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
 *                         those present in another.
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_andnot(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Expands to for_each_andnot_bit() over both bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_andnot(cpu, mask1, mask2)                                \
        for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_or - iterate over every cpu present in either mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_or(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Expands to for_each_or_bit() over both bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_or(cpu, mask1, mask2)                                \
        for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask.
 * @cpu: the (optionally unsigned) integer iterator; its value on entry is
 *       where the iteration starts
 * @mask: the cpumask pointer
 *
 * Expands to for_each_set_bit_from() over cpumask_bits(@mask), bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_from(cpu, mask)                                \
        for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits)

/**
 * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one.
 * @mask: the cpumask to search
 * @cpu: the cpu to ignore.
 *
 * Often used to find any cpu but smp_processor_id() in a mask.
 * If @cpu == -1, the function is equivalent to cpumask_any().
 * Return: >= nr_cpu_ids if no cpus set.
 */
static __always_inline
unsigned int cpumask_any_but(const struct cpumask *mask, int cpu)
{
        unsigned int i;

        /* -1 is a legal arg here. */
        if (cpu != -1)
                cpumask_check(cpu);

        for_each_cpu(i, mask)
                if (i != cpu)
                        break;
        return i;
}

/**
 * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, but not this one.
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 * @cpu: the cpu to ignore; -1 is accepted and is not range-checked
 *
 * If @cpu == -1, the function is equivalent to cpumask_any_and().
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
static __always_inline
unsigned int cpumask_any_and_but(const struct cpumask *mask1,
                                 const struct cpumask *mask2,
                                 int cpu)
{
        unsigned int first;

        /* Only real cpu numbers are validated; -1 means "ignore nothing". */
        if (cpu != -1)
                cpumask_check(cpu);

        first = cpumask_first_and(mask1, mask2);

        /* If the first hit is the excluded cpu, take the next one after it. */
        return first != cpu ? first : cpumask_next_and(cpu, mask1, mask2);
}

/**
 * cpumask_any_andnot_but - pick an arbitrary cpu from *mask1 & ~*mask2, but not this one.
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 * @cpu: the cpu to ignore; -1 is accepted and is not range-checked
 *
 * If @cpu == -1, the function returns the first matching cpu.
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
static __always_inline
unsigned int cpumask_any_andnot_but(const struct cpumask *mask1,
                                    const struct cpumask *mask2,
                                    int cpu)
{
        unsigned int first;

        /* Only real cpu numbers are validated; -1 means "ignore nothing". */
        if (cpu != -1)
                cpumask_check(cpu);

        first = cpumask_first_andnot(mask1, mask2);

        /* If the first hit is the excluded cpu, take the next one after it. */
        return first != cpu ? first : cpumask_next_andnot(cpu, mask1, mask2);
}

/**
 * cpumask_nth - get the Nth cpu in a cpumask
 * @cpu: the Nth cpu to find, starting from 0
 * @srcp: the cpumask pointer
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_nth_bit(bits, small_cpumask_bits, cpumask_check(cpu));
}

/**
 * cpumask_nth_and - get the Nth cpu set in both of 2 cpumasks
 * @cpu: the Nth cpu to find, starting from 0
 * @srcp1: the first cpumask pointer
 * @srcp2: the second cpumask pointer
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);

        return find_nth_and_bit(bits1, bits2, small_cpumask_bits,
                                cpumask_check(cpu));
}

/**
 * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
 * @cpu: the Nth cpu to find, starting from 0
 * @srcp1: the first cpumask pointer (bit must be set)
 * @srcp2: the second cpumask pointer (bit must be set)
 * @srcp3: the third cpumask pointer (bit must be clear)
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2,
                                                        const struct cpumask *srcp3)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);
        const unsigned long *bits3 = cpumask_bits(srcp3);

        return find_nth_and_andnot_bit(bits1, bits2, bits3,
                                       small_cpumask_bits, cpumask_check(cpu));
}

/*
 * Initializer for an NR_CPUS-sized bitmap (an array of
 * BITS_TO_LONGS(NR_CPUS) unsigned longs) with every word set to zero.
 */
#define CPU_BITS_NONE                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL                        \
}

/*
 * Initializer for a bitmap in which only bit 0 (cpu 0) is set; words not
 * named here are zero-initialized by the usual C initializer rules.
 */
#define CPU_BITS_CPU0                                                \
{                                                                \
        [0] =  1UL                                                \
}

/**
 * cpumask_set_cpu - set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 *
 * Implemented with set_bit(); see __cpumask_set_cpu() for the variant built
 * on __set_bit().
 */
static __always_inline
void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        set_bit(cpumask_check(cpu), bits);
}

/* Same as cpumask_set_cpu(), but implemented with __set_bit(). */
static __always_inline
void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        __set_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_clear_cpus - clear a contiguous range of cpus in a cpumask
 * @dstp:  the cpumask pointer
 * @cpu:   first cpu number of the range (< nr_cpu_ids)
 * @ncpus: number of cpus to clear (< nr_cpu_ids)
 *
 * Both ends of the range, @cpu and @cpu + @ncpus - 1, are passed through
 * cpumask_check() before the bits are cleared.
 */
static __always_inline void cpumask_clear_cpus(struct cpumask *dstp,
                                                unsigned int cpu, unsigned int ncpus)
{
        unsigned int last = cpu + ncpus - 1;

        cpumask_check(last);
        bitmap_clear(cpumask_bits(dstp), cpumask_check(cpu), ncpus);
}

/**
 * cpumask_clear_cpu - clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 *
 * Implemented with clear_bit(); see __cpumask_clear_cpu() for the variant
 * built on __clear_bit().
 */
static __always_inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        clear_bit(cpumask_check(cpu), bits);
}

/* Same as cpumask_clear_cpu(), but implemented with __clear_bit(). */
static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        __clear_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_test_cpu - test for a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_bit wrapper for cpumasks.
 *
 * Return: true if @cpu is set in @cpumask, else returns false
 */
static __always_inline
bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
{
        const unsigned long *bits = cpumask_bits(cpumask);

        return test_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_and_set_bit wrapper for cpumasks.
 *
 * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
 */
static __always_inline
bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
        unsigned long *bits = cpumask_bits(cpumask);

        return test_and_set_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_and_clear_bit wrapper for cpumasks.
 *
 * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
 */
static __always_inline
bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
{
        unsigned long *bits = cpumask_bits(cpumask);

        return test_and_clear_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * When small_cpumask_bits is a small compile-time constant the mask is
 * written as one word store; otherwise bitmap_fill() is used.
 */
static __always_inline void cpumask_setall(struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        if (!small_const_nbits(small_cpumask_bits)) {
                bitmap_fill(bits, nr_cpumask_bits);
                return;
        }

        /* Single-word mask: set exactly the low nr_cpumask_bits bits. */
        bits[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits);
}

/**
 * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * Zeroes large_cpumask_bits bits of the mask.
 */
static __always_inline void cpumask_clear(struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        bitmap_zero(bits, large_cpumask_bits);
}

/**
 * cpumask_and - *dstp = *src1p & *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: false if *@dstp is empty, else returns true
 */
static __always_inline
bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p,
                 const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_and(dst, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_or - *dstp = *src1p | *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 */
static __always_inline
void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
                const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        bitmap_or(dst, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_xor - *dstp = *src1p ^ *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 */
static __always_inline
void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p,
                 const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        bitmap_xor(dst, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_andnot - *dstp = *src1p & ~*src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input (its set bits are removed from the result)
 *
 * Return: false if *@dstp is empty, else returns true
 */
static __always_inline
bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p,
                    const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_andnot(dst, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_equal - *src1p == *src2p
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if the cpumasks are equal, false if not
 */
static __always_inline
bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_equal(lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_or_equal - *src1p | *src2p == *src3p
 * @src1p: the first input
 * @src2p: the second input
 * @src3p: the third input
 *
 * Return: true if first cpumask ORed with second cpumask == third cpumask,
 *           otherwise false
 */
static __always_inline
bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p,
                      const struct cpumask *src3p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);
        const unsigned long *expected = cpumask_bits(src3p);

        return bitmap_or_equal(lhs, rhs, expected, small_cpumask_bits);
}

/**
 * cpumask_intersects - (*src1p & *src2p) != 0
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if first cpumask ANDed with second cpumask is non-empty,
 *           otherwise false
 */
static __always_inline
bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_intersects(lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_subset - (*src1p & ~*src2p) == 0
 * @src1p: the candidate subset
 * @src2p: the candidate superset
 *
 * Return: true if *@src1p is a subset of *@src2p, else returns false
 */
static __always_inline
bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p)
{
        const unsigned long *sub = cpumask_bits(src1p);
        const unsigned long *super = cpumask_bits(src2p);

        return bitmap_subset(sub, super, small_cpumask_bits);
}

/**
 * cpumask_empty - *srcp == 0
 * @srcp: the cpumask to check that all cpus < nr_cpu_ids are clear.
 *
 * Return: true if srcp is empty (has no bits set), else false
 */
static __always_inline bool cpumask_empty(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_empty(bits, small_cpumask_bits);
}

/**
 * cpumask_full - *srcp == 0xFFFFFFFF...
 * @srcp: the cpumask to check that all cpus < nr_cpu_ids are set.
 *
 * The check covers nr_cpumask_bits bits.
 *
 * Return: true if srcp is full (has all bits set), else false
 */
static __always_inline bool cpumask_full(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_full(bits, nr_cpumask_bits);
}

/**
 * cpumask_weight - Count of bits in *srcp
 * @srcp: the cpumask to count bits (< nr_cpu_ids) in.
 *
 * Return: count of bits set in *srcp
 */
static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_weight(bits, small_cpumask_bits);
}

/**
 * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2)
 * @srcp1: the cpumask to count bits (< nr_cpu_ids) in.
 * @srcp2: the cpumask to count bits (< nr_cpu_ids) in.
 *
 * Return: count of bits set in both *srcp1 and *srcp2
 */
static __always_inline
unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);

        return bitmap_weight_and(bits1, bits2, small_cpumask_bits);
}

/**
 * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2)
 * @srcp1: the cpumask whose set bits (< nr_cpu_ids) are counted.
 * @srcp2: the cpumask whose set bits (< nr_cpu_ids) are excluded from the count.
 *
 * Return: count of bits that are set in *srcp1 and clear in *srcp2
 */
static __always_inline
unsigned int cpumask_weight_andnot(const struct cpumask *srcp1,
                                   const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);

        return bitmap_weight_andnot(bits1, bits2, small_cpumask_bits);
}

/**
 * cpumask_shift_right - *dstp = *srcp >> n
 * @dstp: the cpumask result
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 *
 * The shift operates on small_cpumask_bits bits.
 */
static __always_inline
void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_shift_right(dst, src, n, small_cpumask_bits);
}

/**
 * cpumask_shift_left - *dstp = *srcp << n
 * @dstp: the cpumask result
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 *
 * The shift operates on nr_cpumask_bits bits.
 */
static __always_inline
void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_shift_left(dst, src, n, nr_cpumask_bits);
}

/**
 * cpumask_copy - *dstp = *srcp
 * @dstp: the result
 * @srcp: the input cpumask
 *
 * Copies large_cpumask_bits bits.
 */
static __always_inline
void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_copy(dst, src, large_cpumask_bits);
}

/**
 * cpumask_any - pick an arbitrary cpu from *srcp
 * @srcp: the input cpumask
 *
 * Currently an alias for cpumask_first().
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any(srcp) cpumask_first(srcp)

/**
 * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 *
 * Currently an alias for cpumask_first_and().
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))

/**
 * cpumask_of - the cpumask containing just a given cpu
 * @cpu: the cpu (<= nr_cpu_ids)
 *
 * Expands to get_cpu_mask(@cpu).
 *
 * NOTE(review): the other helpers in this file document @cpu as
 * "< nr_cpu_ids"; confirm whether "<=" is intended here.
 */
#define cpumask_of(cpu) (get_cpu_mask(cpu))

/**
 * cpumask_parse_user - extract a cpumask from a user string
 * @buf: the (user-space) buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to set.
 *
 * bitmap_parse_user wrapper for cpumasks, parsing nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static __always_inline
int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse_user(buf, len, bits, nr_cpumask_bits);
}

/**
 * cpumask_parselist_user - extract a cpumask from a user string
 * @buf: the (user-space) buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to set.
 *
 * bitmap_parselist_user wrapper for cpumasks, parsing nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static __always_inline
int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist_user(buf, len, bits, nr_cpumask_bits);
}

/**
 * cpumask_parse - extract a cpumask from a string
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * bitmap_parse wrapper for cpumasks; UINT_MAX is passed as the buffer
 * length.
 *
 * Return: -errno, or 0 for success.
 */
static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse(buf, UINT_MAX, bits, nr_cpumask_bits);
}

/**
 * cpulist_parse - extract a cpumask from a user string of ranges
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * bitmap_parselist wrapper for cpumasks, parsing nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist(buf, bits, nr_cpumask_bits);
}

/**
 * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes
 *
 * The size is that of a bitmap holding large_cpumask_bits bits.
 *
 * Return: size to allocate for a &struct cpumask in bytes
 */
static __always_inline unsigned int cpumask_size(void)
{
        unsigned int bytes = bitmap_size(large_cpumask_bits);

        return bytes;
}

#ifdef CONFIG_CPUMASK_OFFSTACK

/* CONFIG_CPUMASK_OFFSTACK=y: the per-cpu accessor uses this_cpu_read(). */
#define this_cpu_cpumask_var_ptr(x)        this_cpu_read(x)
/* CONFIG_CPUMASK_OFFSTACK=y: cpumask_var_t variables get __read_mostly. */
#define __cpumask_var_read_mostly        __read_mostly

/*
 * Declared here, defined elsewhere; the other allocators in this branch
 * are thin wrappers around it.
 */
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);

/*
 * Like alloc_cpumask_var_node(), with __GFP_ZERO added to @flags.
 */
static __always_inline
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return alloc_cpumask_var_node(mask, zeroing_flags, node);
}

/**
 * alloc_cpumask_var - allocate a struct cpumask
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 *
 * This out-of-line-backed version exists only when
 * CONFIG_CPUMASK_OFFSTACK=y; otherwise <linux/cpumask.h> provides a nop
 * that returns a constant true.
 *
 * Calls alloc_cpumask_var_node() with NUMA_NO_NODE as the node.
 *
 * Return: %true if allocation succeeded, %false if not
 */
static __always_inline
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        const int node = NUMA_NO_NODE;

        return alloc_cpumask_var_node(mask, flags, node);
}

/*
 * Like alloc_cpumask_var(), with __GFP_ZERO added to @flags.
 */
static __always_inline
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return alloc_cpumask_var(mask, zeroing_flags);
}

/*
 * CONFIG_CPUMASK_OFFSTACK=y: these are real functions defined elsewhere
 * (the !OFFSTACK branch below supplies empty inline stubs instead).
 */
void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
void free_cpumask_var(cpumask_var_t mask);
void free_bootmem_cpumask_var(cpumask_var_t mask);

/*
 * CONFIG_CPUMASK_OFFSTACK=y: a cpumask_var_t is usable only when it is
 * non-NULL.
 */
static __always_inline bool cpumask_available(cpumask_var_t mask)
{
        if (!mask)
                return false;

        return true;
}

#else

/* CONFIG_CPUMASK_OFFSTACK=n: the per-cpu accessor uses this_cpu_ptr(). */
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
/* CONFIG_CPUMASK_OFFSTACK=n: no section annotation is applied. */
#define __cpumask_var_read_mostly

/*
 * CONFIG_CPUMASK_OFFSTACK=n: nothing is allocated; the call always reports
 * success and ignores @mask and @flags.
 */
static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return true;
}

/*
 * CONFIG_CPUMASK_OFFSTACK=n: nothing is allocated; the call always reports
 * success and ignores @mask, @flags and @node.
 */
static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
                                          int node)
{
        return true;
}

/*
 * CONFIG_CPUMASK_OFFSTACK=n: no allocation takes place; the mask that
 * @mask points at is cleared with cpumask_clear() and success is reported.
 * @flags is ignored.
 */
static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        cpumask_clear(*mask);
        return true;
}

/*
 * CONFIG_CPUMASK_OFFSTACK=n: same as zalloc_cpumask_var() above; @flags and
 * @node are ignored.
 */
static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
                                          int node)
{
        cpumask_clear(*mask);
        return true;
}

/* CONFIG_CPUMASK_OFFSTACK=n: nothing to allocate, so this is a no-op. */
static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
}

/* CONFIG_CPUMASK_OFFSTACK=n: nothing was allocated, so this is a no-op. */
static __always_inline void free_cpumask_var(cpumask_var_t mask)
{
}

/* CONFIG_CPUMASK_OFFSTACK=n: nothing was allocated, so this is a no-op. */
static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask)
{
}

/* CONFIG_CPUMASK_OFFSTACK=n: always reports the mask as available. */
static __always_inline bool cpumask_available(cpumask_var_t mask)
{
        return true;
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

/*
 * Cleanup definition named free_cpumask_var for 'struct cpumask *' values:
 * when the value (_T) is non-NULL, free_cpumask_var() is called on it.
 */
DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T));

/*
 * It's common to want to use cpu_all_mask in struct member initializers,
 * so it has to refer to an address rather than a pointer: cpu_all_bits is
 * an NR_CPUS-bit bitmap defined elsewhere, and cpu_all_mask is that bitmap
 * viewed through to_cpumask().
 */
extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define cpu_all_mask to_cpumask(cpu_all_bits)

/*
 * First bits of cpu_bit_bitmap are in fact unset, so row 0 of that table
 * serves as the empty mask.
 */
#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])

#if NR_CPUS == 1
/*
 * Uniprocessor: the possible/online/enabled/present masks are always "1",
 * so every iterator simply visits cpu 0 once without reading any mask.
 *
 * for_each_enabled_cpu() is provided here as well so that both branches
 * offer the same set of iterators; this matches the uniprocessor
 * cpu_enabled() (true only for cpu 0) and num_enabled_cpus() (1U).
 */
#define for_each_possible_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_online_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_enabled_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_present_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)

/* @start is evaluated (for side effects and type checking) but otherwise unused. */
#define for_each_possible_cpu_wrap(cpu, start)        \
        for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_online_cpu_wrap(cpu, start)        \
        for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++)
#else
/* Multiprocessor: iterate over the corresponding global cpu mask. */
#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
#define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
#define for_each_enabled_cpu(cpu)   for_each_cpu((cpu), cpu_enabled_mask)
#define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)

/* Wrapping variants: begin at @start and wrap around the mask. */
#define for_each_possible_cpu_wrap(cpu, start)        \
        for_each_cpu_wrap((cpu), cpu_possible_mask, (start))
#define for_each_online_cpu_wrap(cpu, start)        \
        for_each_cpu_wrap((cpu), cpu_online_mask, (start))
#endif

/*
 * Wrappers for arch boot code to manipulate normally-constant masks.
 * Declared here, defined elsewhere; each takes the source mask in @src.
 */
void init_cpu_present(const struct cpumask *src);
void init_cpu_possible(const struct cpumask *src);

/*
 * assign_cpu - set or clear @cpu in @mask according to @val
 *
 * Passes @cpu through cpumask_check() and hands the bit to assign_bit().
 */
#define assign_cpu(cpu, mask, val)        \
        assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val))

/* Same as assign_cpu(), but built on __assign_bit(). */
#define __assign_cpu(cpu, mask, val)        \
        __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val))

/*
 * Per-state setters: each one assigns @cpu's bit in the corresponding
 * writable __cpu_*_mask via assign_cpu().
 */
#define set_cpu_possible(cpu, possible)        assign_cpu((cpu), &__cpu_possible_mask, (possible))
#define set_cpu_enabled(cpu, enabled)        assign_cpu((cpu), &__cpu_enabled_mask, (enabled))
#define set_cpu_present(cpu, present)        assign_cpu((cpu), &__cpu_present_mask, (present))
#define set_cpu_active(cpu, active)        assign_cpu((cpu), &__cpu_active_mask, (active))
#define set_cpu_dying(cpu, dying)        assign_cpu((cpu), &__cpu_dying_mask, (dying))

/* Unlike the setters above, the online setter is a real function defined elsewhere. */
void set_cpu_online(unsigned int cpu, bool online);

/**
 * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
 * @bitmap: the bitmap
 *
 * There are a few places where cpumask_var_t isn't appropriate and
 * static cpumasks must be used (eg. very early boot), yet we don't
 * expose the definition of 'struct cpumask'.
 *
 * This does the conversion, and can be used as a constant initializer.
 *
 * The conditional always selects @bitmap; the never-taken arm only exists
 * so that @bitmap is type-checked against __check_is_bitmap()'s
 * 'const unsigned long *' parameter inside an unevaluated sizeof.
 */
#define to_cpumask(bitmap)                                                \
        ((struct cpumask *)(1 ? (bitmap)                                \
                            : (void *)sizeof(__check_is_bitmap(bitmap))))

/*
 * Type-checking helper for to_cpumask(): only the parameter type matters.
 * The argument is never inspected and the result is always 1.
 */
static __always_inline int __check_is_bitmap(const unsigned long *bitmap)
{
        (void)bitmap;

        return 1;
}

/*
 * Special-case data structure for "single bit set only" constant CPU masks.
 *
 * We pre-generate all the 64 (or 32) possible bit positions, with enough
 * padding to the left and the right, and return the constant pointer
 * appropriately offset.
 *
 * The table has BITS_PER_LONG+1 rows of BITS_TO_LONGS(NR_CPUS) words each;
 * get_cpu_mask() indexes rows 1..BITS_PER_LONG, and row 0 is used as
 * cpu_none_mask.
 */
extern const unsigned long
        cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];

/*
 * get_cpu_mask - constant mask with only @cpu set
 *
 * Picks the cpu_bit_bitmap row for @cpu's bit position within a word
 * (row 1 + @cpu % BITS_PER_LONG), then steps the pointer back by
 * @cpu / BITS_PER_LONG words so that the word of the returned mask that
 * holds @cpu lines up with the first word of that row.
 */
static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu)
{
        const unsigned long *row = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
        const unsigned long *start = row - cpu / BITS_PER_LONG;

        return to_cpumask(start);
}

#if NR_CPUS > 1
/**
 * num_online_cpus() - Read the number of online CPUs
 *
 * Although __num_online_cpus is an atomic_t, the value read here is only a
 * momentary snapshot: it is not protected against concurrent CPU hotplug
 * operations unless the caller is inside a cpuhp_lock held region.
 *
 * Return: momentary snapshot of the number of online CPUs
 */
static __always_inline unsigned int num_online_cpus(void)
{
        unsigned int snapshot = raw_atomic_read(&__num_online_cpus);

        return snapshot;
}
/* The remaining counts are computed on demand as the weight of the matching mask. */
#define num_possible_cpus()        cpumask_weight(cpu_possible_mask)
#define num_enabled_cpus()        cpumask_weight(cpu_enabled_mask)
#define num_present_cpus()        cpumask_weight(cpu_present_mask)
#define num_active_cpus()        cpumask_weight(cpu_active_mask)

/*
 * Per-cpu state predicates (NR_CPUS > 1): each one tests @cpu's bit in the
 * corresponding global mask via cpumask_test_cpu().
 */
static __always_inline bool cpu_online(unsigned int cpu)
{
        const struct cpumask *online = cpu_online_mask;

        return cpumask_test_cpu(cpu, online);
}

static __always_inline bool cpu_enabled(unsigned int cpu)
{
        const struct cpumask *enabled = cpu_enabled_mask;

        return cpumask_test_cpu(cpu, enabled);
}

static __always_inline bool cpu_possible(unsigned int cpu)
{
        const struct cpumask *possible = cpu_possible_mask;

        return cpumask_test_cpu(cpu, possible);
}

static __always_inline bool cpu_present(unsigned int cpu)
{
        const struct cpumask *present = cpu_present_mask;

        return cpumask_test_cpu(cpu, present);
}

static __always_inline bool cpu_active(unsigned int cpu)
{
        const struct cpumask *active = cpu_active_mask;

        return cpumask_test_cpu(cpu, active);
}

static __always_inline bool cpu_dying(unsigned int cpu)
{
        const struct cpumask *dying = cpu_dying_mask;

        return cpumask_test_cpu(cpu, dying);
}

#else

/* NR_CPUS <= 1: every count is the constant 1. */
#define num_online_cpus()        1U
#define num_possible_cpus()        1U
#define num_enabled_cpus()        1U
#define num_present_cpus()        1U
#define num_active_cpus()        1U

/*
 * Per-cpu state predicates (NR_CPUS <= 1): no mask is consulted. Cpu 0 is
 * reported as online, possible, enabled, present and active; every other
 * cpu number is reported as none of those. No cpu is ever reported dying.
 */
static __always_inline bool cpu_online(unsigned int cpu)
{
        return !cpu;
}

static __always_inline bool cpu_possible(unsigned int cpu)
{
        return !cpu;
}

static __always_inline bool cpu_enabled(unsigned int cpu)
{
        return !cpu;
}

static __always_inline bool cpu_present(unsigned int cpu)
{
        return !cpu;
}

static __always_inline bool cpu_active(unsigned int cpu)
{
        return !cpu;
}

static __always_inline bool cpu_dying(unsigned int cpu)
{
        (void)cpu;

        return false;
}

#endif /* NR_CPUS > 1 */

/* True when @cpu is not online; the result is annotated with unlikely(). */
#define cpu_is_offline(cpu)        unlikely(!cpu_online(cpu))

/*
 * CPU_BITS_ALL - initializer for an NR_CPUS-bit bitmap with all NR_CPUS
 * bits set. The last word is masked with BITMAP_LAST_WORD_MASK(NR_CPUS) so
 * that bits beyond NR_CPUS stay clear.
 */
#if NR_CPUS <= BITS_PER_LONG
/* The bitmap is a single word: only the (masked) last word is needed. */
#define CPU_BITS_ALL                                                \
{                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}

#else /* NR_CPUS > BITS_PER_LONG */

/* Multi-word bitmap: all leading words are ~0UL, the last word is masked. */
#define CPU_BITS_ALL                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}
#endif /* NR_CPUS > BITS_PER_LONG */

/**
 * cpumap_print_to_pagebuf  - copies the cpumask into the buffer either
 *        as comma-separated list of cpus or hex values of cpumask
 * @list: indicates whether the cpumap must be list
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 *
 * bitmap_print_to_pagebuf wrapper for cpumasks, printing nr_cpu_ids bits.
 *
 * Return: the length of the (null-terminated) @buf string, zero if
 * nothing is copied.
 */
static __always_inline ssize_t
cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_to_pagebuf(list, buf, bits, nr_cpu_ids);
}

/**
 * cpumap_print_bitmask_to_buf  - copies the cpumask into the buffer as
 *        hex values of cpumask
 *
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 * @off: in the string from which we are copying, we copy to @buf
 * @count: the maximum number of bytes to print
 *
 * The function prints the cpumask into the buffer as hex values of
 * cpumask; Typically used by bin_attribute to export cpumask bitmask
 * ABI.
 *
 * The value returned by bitmap_print_bitmask_to_buf() is reduced by one
 * before being handed back to the caller.
 *
 * Return: the length of how many bytes have been copied, excluding
 * terminating '\0'.
 */
static __always_inline
ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
                                    loff_t off, size_t count)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_bitmask_to_buf(buf, bits, nr_cpu_ids, off, count) - 1;
}

/**
 * cpumap_print_list_to_buf  - copies the cpumask into the buffer as
 *        comma-separated list of cpus
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 * @off: in the string from which we are copying, we copy to @buf
 * @count: the maximum number of bytes to print
 *
 * Identical to cpumap_print_bitmask_to_buf() except for the print format;
 * the value returned by bitmap_print_list_to_buf() is likewise reduced by
 * one before being handed back to the caller.
 *
 * Return: the length of how many bytes have been copied, excluding
 * terminating '\0'.
 */
static __always_inline
ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
                                 loff_t off, size_t count)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_list_to_buf(buf, bits, nr_cpu_ids, off, count) - 1;
}

/*
 * CPU_MASK_ALL - cpumask_t compound literal with all NR_CPUS bits set.
 * The last word is masked with BITMAP_LAST_WORD_MASK(NR_CPUS) so that bits
 * beyond NR_CPUS stay clear.
 */
#if NR_CPUS <= BITS_PER_LONG
/* The mask is a single word: only the (masked) last word is needed. */
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#else
/* Multi-word mask: all leading words are ~0UL, the last word is masked. */
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                        \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#endif /* NR_CPUS > BITS_PER_LONG */

/* cpumask_t compound literal with every word set to zero. */
#define CPU_MASK_NONE                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL                                \
} }

/* cpumask_t compound literal with only bit 0 (cpu 0) set. */
#define CPU_MASK_CPU0                                                        \
(cpumask_t) { {                                                                \
        [0] =  1UL                                                        \
} }

/*
 * Provide a valid theoretical max size for cpumap and cpulist sysfs files
 * to avoid breaking userspace which may allocate a buffer based on the size
 * reported by e.g. fstat.
 *
 * For cpumap, NR_CPUS * 9/32 - 1 should be an exact length.
 *
 * For cpulist, 7 is (ceil(log10(NR_CPUS)) + 1), allowing for NR_CPUS to be
 * up to 2 orders of magnitude larger than 8192. We then divide by 2 to
 * cover a worst-case of every other cpu being on one of two nodes for a
 * very large NR_CPUS.
 *
 * Use PAGE_SIZE as a minimum for smaller configurations while avoiding
 * unsigned comparison to -1: the "- 1" is applied only in the arm that is
 * selected after the comparison against PAGE_SIZE, not inside the
 * comparison itself.
 */
#define CPUMAP_FILE_MAX_BYTES  (((NR_CPUS * 9)/32 > PAGE_SIZE) \
                                        ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE)
#define CPULIST_FILE_MAX_BYTES  (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE)

#endif /* __LINUX_CPUMASK_H */

















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Percpu refcounts:
 * (C) 2012 Google, Inc.
 * Author: Kent Overstreet <koverstreet@google.com>
 *
 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
 * atomic_dec_and_test() - but percpu.
 *
 * There's one important difference between percpu refs and normal atomic_t
 * refcounts; you have to keep track of your initial refcount, and then when you
 * start shutting down you call percpu_ref_kill() _before_ dropping the initial
 * refcount.
 *
 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
 * than an atomic_t - this is because of the way shutdown works, see
 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
 *
 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
 * puts the ref back in single atomic_t mode, collecting the per cpu refs and
 * issuing the appropriate barriers, and then marks the ref as shutting down so
 * that percpu_ref_put() will check for the ref hitting 0.  After it returns,
 * it's safe to drop the initial ref.
 *
 * USAGE:
 *
 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
 * is created when userspace calls io_setup(), and destroyed when userspace
 * calls io_destroy() or the process exits.
 *
 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
 * removes the kioctx from the process's table of kioctxs and kills percpu_ref.
 * After that, there can't be any new users of the kioctx (from lookup_ioctx())
 * and it's then safe to drop the initial ref with percpu_ref_put().
 *
 * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
 * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
 * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
 * with RCU protection, it must be done explicitly.
 *
 * Code that does a two stage shutdown like this often needs some kind of
 * explicit synchronization to ensure the initial refcount can only be dropped
 * once - percpu_ref_kill() does this for you, it returns true once and false if
 * someone else already called it. The aio code uses it this way, but it's not
 * necessary if the code has some other mechanism to synchronize teardown.
 */

#ifndef _LINUX_PERCPU_REFCOUNT_H
#define _LINUX_PERCPU_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/gfp.h>

struct percpu_ref;
/*
 * Callback signature used throughout this header (release, confirm_switch
 * and confirm_kill hooks): receives the percpu_ref it is invoked for.
 */
typedef void (percpu_ref_func_t)(struct percpu_ref *);

/* flags set in the lower bits of percpu_ref->percpu_count_ptr */
enum {
        __PERCPU_REF_ATOMIC        = 1LU << 0,        /* operating in atomic mode */
        __PERCPU_REF_DEAD        = 1LU << 1,        /* (being) killed */
        /* both flags together; tested as one mask in __ref_is_percpu() */
        __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,

        /* number of low pointer bits occupied by the flags above */
        __PERCPU_REF_FLAG_BITS        = 2,
};

/* @flags for percpu_ref_init(); each flag occupies its own bit */
enum {
        /*
         * Start w/ ref == 1 in atomic mode.  Can be switched to percpu
         * operation using percpu_ref_switch_to_percpu().  If initialized
         * with this flag, the ref will stay in atomic mode until
         * percpu_ref_switch_to_percpu() is invoked on it.
         * Implies ALLOW_REINIT.
         */
        PERCPU_REF_INIT_ATOMIC        = 1 << 0,

        /*
         * Start dead w/ ref == 0 in atomic mode.  Must be revived with
         * percpu_ref_reinit() before used.  Implies INIT_ATOMIC and
         * ALLOW_REINIT.
         */
        PERCPU_REF_INIT_DEAD        = 1 << 1,

        /*
         * Allow switching from atomic mode to percpu mode.
         */
        PERCPU_REF_ALLOW_REINIT        = 1 << 2,
};

/*
 * Slow-path state of a percpu_ref, reached through percpu_ref->data.
 */
struct percpu_ref_data {
        /* shared counter used by get/put whenever the ref is not in percpu mode */
        atomic_long_t                count;
        /* called by percpu_ref_put_many() when @count drops to zero */
        percpu_ref_func_t        *release;
        /* NOTE(review): presumably the pending confirm_switch/confirm_kill callback - confirm */
        percpu_ref_func_t        *confirm_switch;
        /* NOTE(review): presumably mirror PERCPU_REF_INIT_ATOMIC / PERCPU_REF_ALLOW_REINIT - confirm */
        bool                        force_atomic:1;
        bool                        allow_reinit:1;
        /* NOTE(review): presumably used for RCU-deferred processing of this data - confirm */
        struct rcu_head                rcu;
        /* NOTE(review): presumably points back at the owning percpu_ref - confirm */
        struct percpu_ref        *ref;
};

struct percpu_ref {
        /*
         * Pointer to the percpu counters with the __PERCPU_REF_* flags kept
         * in its low bits.  While no flag bit is set the ref is in percpu
         * mode; once a flag is set, get/put manipulate the atomic
         * data->count instead.
         */
        unsigned long                percpu_count_ptr;

        /*
         * 'percpu_ref' is often embedded into user structure, and only
         * 'percpu_count_ptr' is required in fast path, move other fields
         * into 'percpu_ref_data', so we can reduce memory footprint in
         * fast path.
         */
        struct percpu_ref_data  *data;
};

/*
 * Out-of-line part of the API; this header only declares these functions.
 * percpu_ref_init() takes the release callback, a mask of PERCPU_REF_*
 * init flags and the gfp mask for its allocations; its result must be
 * checked (__must_check).
 */
int __must_check percpu_ref_init(struct percpu_ref *ref,
                                 percpu_ref_func_t *release, unsigned int flags,
                                 gfp_t gfp);
void percpu_ref_exit(struct percpu_ref *ref);
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch);
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);

/**
 * percpu_ref_kill - drop the initial ref
 * @ref: percpu_ref to kill
 *
 * Must be used to drop the initial ref on a percpu refcount; must be called
 * precisely once before shutdown.
 *
 * Switches @ref into atomic mode before gathering up the percpu counters
 * and dropping the initial ref.
 *
 * There are no implied RCU grace periods between kill and release.
 *
 * Shorthand for percpu_ref_kill_and_confirm() without a confirm_kill
 * callback.
 */
static inline void percpu_ref_kill(struct percpu_ref *ref)
{
        percpu_ref_kill_and_confirm(ref, NULL);
}

/*
 * Internal helper.  Don't use outside percpu-refcount proper.  The
 * function doesn't return the pointer and let the caller test it for NULL
 * because doing so forces the compiler to generate two conditional
 * branches as it can't assume that @ref->percpu_count is not NULL.
 *
 * Returns true and stores the percpu counter pointer in *@percpu_countp
 * when neither __PERCPU_REF_ATOMIC nor __PERCPU_REF_DEAD is set; returns
 * false and leaves *@percpu_countp untouched otherwise.
 */
static inline bool __ref_is_percpu(struct percpu_ref *ref,
                                          unsigned long __percpu **percpu_countp)
{
        unsigned long percpu_ptr;

        /*
         * The value of @ref->percpu_count_ptr is tested for
         * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then
         * used as a pointer.  If the compiler generates a separate fetch
         * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in
         * between contaminating the pointer value, meaning that
         * READ_ONCE() is required when fetching it.
         *
         * The dependency ordering from the READ_ONCE() pairs
         * with smp_store_release() in __percpu_ref_switch_to_percpu().
         */
        percpu_ptr = READ_ONCE(ref->percpu_count_ptr);

        /*
         * Theoretically, the following could test just ATOMIC; however,
         * then we'd have to mask off DEAD separately as DEAD may be
         * visible without ATOMIC if we race with percpu_ref_kill().  DEAD
         * implies ATOMIC anyway.  Test them together.
         */
        if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD))
                return false;

        /* no flag bits set, so the value is the bare percpu pointer */
        *percpu_countp = (unsigned long __percpu *)percpu_ptr;
        return true;
}

/**
 * percpu_ref_get_many - increment a percpu refcount
 * @ref: percpu_ref to get
 * @nr: number of references to get
 *
 * Analogous to atomic_long_add().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_add(*percpu_count, nr);
        else
                atomic_long_add(nr, &ref->data->count);

        rcu_read_unlock();
}

/**
 * percpu_ref_get - increment a percpu refcount
 * @ref: percpu_ref to get
 *
 * Analogous to atomic_long_inc().  Shorthand for
 * percpu_ref_get_many(@ref, 1).
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get(struct percpu_ref *ref)
{
        percpu_ref_get_many(ref, 1);
}

/**
 * percpu_ref_tryget_many - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 * @nr: number of references to get
 *
 * Increment a percpu refcount  by @nr unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
                                          unsigned long nr)
{
        unsigned long __percpu *percpu_count;
        bool ret;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count)) {
                this_cpu_add(*percpu_count, nr);
                ret = true;
        } else {
                ret = atomic_long_add_unless(&ref->data->count, nr, 0);
        }

        rcu_read_unlock();

        return ret;
}

/**
 * percpu_ref_tryget - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless its count already reached zero.
 * Returns %true on success; %false on failure.
 * Shorthand for percpu_ref_tryget_many(@ref, 1).
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
        return percpu_ref_tryget_many(ref, 1);
}

/**
 * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the
 * caller is responsible for taking RCU.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        bool ret = false;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (likely(__ref_is_percpu(ref, &percpu_count))) {
                this_cpu_inc(*percpu_count);
                ret = true;
        } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
                ret = atomic_long_inc_not_zero(&ref->data->count);
        }
        return ret;
}

/**
 * percpu_ref_tryget_live - try to increment a live percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless it has already been killed.  Returns
 * %true on success; %false on failure.
 *
 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
 * function will fail.  For such guarantee, percpu_ref_kill_and_confirm()
 * should be used.  After the confirm_kill callback is invoked, it's
 * guaranteed that no new reference will be given out by
 * percpu_ref_tryget_live().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
        bool got;

        /* take the RCU read lock on behalf of the _rcu variant */
        rcu_read_lock();
        got = percpu_ref_tryget_live_rcu(ref);
        rcu_read_unlock();

        return got;
}

/**
 * percpu_ref_put_many - decrement a percpu refcount
 * @ref: percpu_ref to put
 * @nr: number of references to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_sub(*percpu_count, nr);
        else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count)))
                ref->data->release(ref);

        rcu_read_unlock();
}

/**
 * percpu_ref_put - decrement a percpu refcount
 * @ref: percpu_ref to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init()).  Shorthand for percpu_ref_put_many(@ref, 1).
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put(struct percpu_ref *ref)
{
        percpu_ref_put_many(ref, 1);
}

/**
 * percpu_ref_is_dying - test whether a percpu refcount is dying or dead
 * @ref: percpu_ref to test
 *
 * Returns %true if @ref is dying or dead, i.e. if __PERCPU_REF_DEAD is set
 * in @ref->percpu_count_ptr.
 *
 * This function is safe to call as long as @ref is between init and exit
 * and the caller is responsible for synchronizing against state changes.
 */
static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
{
        /* plain (unannotated) read of the flag bits */
        return ref->percpu_count_ptr & __PERCPU_REF_DEAD;
}

#endif

















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _NET_RPS_H
#define _NET_RPS_H

#include <linux/types.h>
#include <linux/static_key.h>
#include <net/sock.h>
#include <net/hotdata.h>

#ifdef CONFIG_RPS

/*
 * Static keys defined outside this header.  rfs_needed is the key tested by
 * rfs_is_needed(), which gates the sock_rps_*() helpers in this file.
 */
extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;

/*
 * This structure holds an RPS map which can be of variable length.  The
 * map is an array of CPUs.
 */
struct rps_map {
        /* NOTE(review): presumably the number of valid entries in cpus[] - confirm */
        unsigned int        len;
        /* NOTE(review): presumably used to free the map after an RCU grace period - confirm */
        struct rcu_head        rcu;
        /* flexible array of CPU numbers */
        u16                cpus[];
};
/* Size in bytes of a struct rps_map holding @_num entries in cpus[]. */
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))

/*
 * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
 * tail pointer for that CPU's input queue at the time of last enqueue, a
 * hardware filter index, and the hash of the flow if aRFS is enabled.
 */
struct rps_dev_flow {
        /* CPU the flow is mapped to */
        u16                cpu;
        /* hardware filter index */
        u16                filter;
        /* input queue tail of @cpu at the time of the last enqueue */
        unsigned int        last_qtail;
#ifdef CONFIG_RFS_ACCEL
        /* hash of the flow; only present with CONFIG_RFS_ACCEL */
        u32                hash;
#endif
};
/* NOTE(review): presumably the @filter value meaning "no hardware filter" - confirm */
#define RPS_NO_FILTER 0xffff

/*
 * The rps_dev_flow_table structure contains a table of flow mappings.
 */
struct rps_dev_flow_table {
        /* NOTE(review): presumably log2 of the number of entries in flows[] - confirm */
        u8                        log;
        /* NOTE(review): presumably used to free the table after an RCU grace period - confirm */
        struct rcu_head                rcu;
        /* flexible array of per-flow entries */
        struct rps_dev_flow        flows[];
};
/* Size in bytes of a struct rps_dev_flow_table holding @_num entries in flows[]. */
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
    ((_num) * sizeof(struct rps_dev_flow)))

/*
 * The rps_sock_flow_table contains mappings of flows to the last CPU
 * on which they were processed by the application (set in recvmsg).
 * Each entry is a 32bit value. Upper part is the high-order bits
 * of flow hash, lower part is CPU number.
 * rps_cpu_mask is used to partition the space, depending on number of
 * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
 * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
 * meaning we use 32-6=26 bits for the hash.
 */
struct rps_sock_flow_table {
        struct rcu_head        rcu;
        /* ANDed with the flow hash to pick an index into ents[] */
        u32                mask;

        /* one 32bit entry per slot, laid out as described above */
        u32                ents[] ____cacheline_aligned_in_smp;
};
/* Size in bytes of a table with @_num entries: the offset of ents[_num]. */
#define        RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))

/* Value stored into an ents[] slot by _sock_rps_delete_flow(). */
#define RPS_NO_CPU 0xffff

/*
 * Store in @table, at the slot selected by @hash, an entry made of the hash
 * bits outside rps_cpu_mask combined with the id of the CPU running this
 * call.  The slot is only written when its content would change.
 */
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
                                        u32 hash)
{
        u32 *slot = &table->ents[hash & table->mask];
        u32 entry = hash & ~net_hotdata.rps_cpu_mask;

        /* We only give a hint, preemption can change CPU under us */
        entry |= raw_smp_processor_id();

        /* The following WRITE_ONCE() is paired with the READ_ONCE()
         * here, and another one in get_rps_cpu().
         */
        if (READ_ONCE(*slot) == entry)
                return;

        WRITE_ONCE(*slot, entry);
}

/*
 * Record @hash in the global sock flow table, if one is installed.
 * A zero @hash is ignored.
 */
static inline void _sock_rps_record_flow_hash(__u32 hash)
{
        struct rps_sock_flow_table *tbl;

        if (hash) {
                /* the table pointer is only stable under the RCU read lock */
                rcu_read_lock();
                tbl = rcu_dereference(net_hotdata.rps_sock_flow_table);
                if (tbl)
                        rps_record_sock_flow(tbl, hash);
                rcu_read_unlock();
        }
}

/*
 * Record the rx hash of @sk in the sock flow table, but only for sockets
 * in the TCP_ESTABLISHED state.
 */
static inline void _sock_rps_record_flow(const struct sock *sk)
{
        /* Reading sk->sk_rxhash might incur an expensive cache line
         * miss.
         *
         * TCP_ESTABLISHED does cover almost all states where RFS
         * might be useful, and is cheaper [1] than testing :
         *        IPv4: inet_sk(sk)->inet_daddr
         *        IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
         * OR        an additional socket flag
         * [1] : sk_state and sk_prot are in the same cache line.
         */
        if (sk->sk_state != TCP_ESTABLISHED)
                return;

        /* This READ_ONCE() is paired with the WRITE_ONCE()
         * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
         */
        _sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
}

/*
 * Reset the sock flow table slot selected by the rx hash of @sk to
 * RPS_NO_CPU.  Does nothing when the hash is zero or no table is installed.
 */
static inline void _sock_rps_delete_flow(const struct sock *sk)
{
        struct rps_sock_flow_table *tbl;
        u32 rxhash = READ_ONCE(sk->sk_rxhash);
        u32 *slot;

        if (!rxhash)
                return;

        rcu_read_lock();
        tbl = rcu_dereference(net_hotdata.rps_sock_flow_table);
        if (tbl) {
                slot = &tbl->ents[rxhash & tbl->mask];
                /* skip the store when the slot is already cleared */
                if (READ_ONCE(*slot) != RPS_NO_CPU)
                        WRITE_ONCE(*slot, RPS_NO_CPU);
        }
        rcu_read_unlock();
}
#endif /* CONFIG_RPS */

/*
 * Report whether the rfs_needed static key is enabled.  Always false when
 * the kernel is built without CONFIG_RPS.
 */
static inline bool rfs_is_needed(void)
{
#ifdef CONFIG_RPS
        return static_branch_unlikely(&rfs_needed);
#else
        return false;
#endif
}

/*
 * Record @hash in the sock flow table when RFS is in use; compiles to
 * nothing without CONFIG_RPS.
 */
static inline void sock_rps_record_flow_hash(__u32 hash)
{
#ifdef CONFIG_RPS
        if (rfs_is_needed())
                _sock_rps_record_flow_hash(hash);
#endif
}

/*
 * Record the flow of @sk in the sock flow table when RFS is in use;
 * compiles to nothing without CONFIG_RPS.
 */
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
        if (rfs_is_needed())
                _sock_rps_record_flow(sk);
#endif
}

/*
 * Clear the sock flow table entry of @sk when RFS is in use; compiles to
 * nothing without CONFIG_RPS.
 */
static inline void sock_rps_delete_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
        if (rfs_is_needed())
                _sock_rps_delete_flow(sk);
#endif
}

/*
 * Increment @sd->input_queue_tail and return the new value.  Without
 * CONFIG_RPS nothing is modified and 0 is returned.
 */
static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        return ++sd->input_queue_tail;
#else
        return 0;
#endif
}

/*
 * Store @tail into *@dest with WRITE_ONCE().  No-op without CONFIG_RPS.
 */
static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
{
#ifdef CONFIG_RPS
        WRITE_ONCE(*dest, tail);
#endif
}

/*
 * Advance @sd->input_queue_head by @val.  The new value is published with
 * WRITE_ONCE(); the current value is read with a plain load.  No-op without
 * CONFIG_RPS.
 */
static inline void rps_input_queue_head_add(struct softnet_data *sd, int val)
{
#ifdef CONFIG_RPS
        WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val);
#endif
}

/* Advance @sd->input_queue_head by one; see rps_input_queue_head_add(). */
static inline void rps_input_queue_head_incr(struct softnet_data *sd)
{
        rps_input_queue_head_add(sd, 1);
}

#endif /* _NET_RPS_H */




































































































    1 





    1 






    1 
    1 




































    1 































































































































































































































































































































































































































































































































































































































































































































































































    6 


    1 
























    5 
    5 

    5 
    6 























































































































































































    4 








    4 





    1 












    1 


















    6 


    6 
    5 

    4 














    5 
    5 




    6 
    5 































    1 















    6 
























































































































































































































































    6 







































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Support for AES-NI and VAES instructions.  This file contains glue code.
 * The real AES implementations are in aesni-intel_asm.S and other .S files.
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Adrian Hoban <adrian.hoban@intel.com>
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Copyright 2024 Google LLC
 */

#include <linux/hardirq.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/aes.h>
#include <crypto/b128ops.h>
#include <crypto/gcm.h>
#include <crypto/xts.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/jump_label.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/static_call.h>


/* Alignment, in bytes, that aes_align_addr() enforces on context pointers. */
#define AESNI_ALIGN        16
/* Attribute requesting AESNI_ALIGN-byte alignment for a declaration. */
#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
/* Mask that clears the bits below AES_BLOCK_SIZE. */
#define AES_BLOCK_MASK        (~(AES_BLOCK_SIZE - 1))
/*
 * NOTE(review): presumably the worst-case padding needed to reach
 * AESNI_ALIGN from a pointer that is only CRYPTO_MINALIGN aligned - confirm.
 */
#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
/* Context sizes: the structure itself plus the AESNI_ALIGN_EXTRA slack. */
#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)

/*
 * Per-transform XTS state: two expanded AES keys, each declared with
 * AESNI_ALIGN-byte alignment.
 */
struct aesni_xts_ctx {
        /* Set from the second half of the XTS key; used to encrypt the IV. */
        struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR;
        /* Set from the first half of the XTS key; used on the data itself. */
        struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR;
};

/*
 * Align a raw context pointer for use by the AES-NI routines.
 *
 * If crypto_tfm_ctx_alignment() reports less than AESNI_ALIGN, @addr is
 * rounded up with PTR_ALIGN(); otherwise it is returned as-is.
 */
static inline void *aes_align_addr(void *addr)
{
        if (crypto_tfm_ctx_alignment() < AESNI_ALIGN)
                addr = PTR_ALIGN(addr, AESNI_ALIGN);

        return addr;
}

/*
 * Prototypes for the AES-NI routines whose implementations live outside this
 * glue file (see the file header: aesni-intel_asm.S and other .S files).
 *
 * Note the argument order: the output buffer comes before the input buffer.
 * Within this file every call to these routines is bracketed by
 * kernel_fpu_begin()/kernel_fpu_end().
 */

/* Expand @in_key (@key_len bytes) into @ctx. */
asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
                              unsigned int key_len);
/* Single-block encrypt/decrypt of @in into @out. */
asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
/* ECB over @len bytes; callers here pass a whole number of blocks. */
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len);
asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len);
/* CBC over @len bytes with chaining value @iv. */
asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);
/* Final ciphertext-stealing step of CTS-CBC. */
asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
                                  const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
                                  const u8 *in, unsigned int len, u8 *iv);

/* XTS encryption; wrapped by aesni_xts_encrypt() below. */
asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);

/* XTS decryption; wrapped by aesni_xts_decrypt() below. */
asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);

#ifdef CONFIG_X86_64
/* CTR mode over whole blocks; only declared for 64-bit builds. */
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);
#endif

/*
 * Convert a raw transform context pointer into the crypto_aes_ctx stored in
 * it, applying the alignment adjustment done by aes_align_addr().
 */
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
        return aes_align_addr(raw_ctx);
}

/*
 * Return the aesni_xts_ctx of an skcipher transform, applying the alignment
 * adjustment done by aes_align_addr() to the transform's context pointer.
 */
static inline struct aesni_xts_ctx *aes_xts_ctx(struct crypto_skcipher *tfm)
{
        return aes_align_addr(crypto_skcipher_ctx(tfm));
}

/*
 * Expand an AES key into @ctx.
 *
 * When SIMD is not usable in the current context the work is delegated to
 * aes_expandkey().  Otherwise the key length is validated with
 * aes_check_keylen() and the expansion is done by aesni_set_key() inside a
 * kernel_fpu_begin()/kernel_fpu_end() section.
 *
 * Returns 0 on success, or the error reported by aes_expandkey() /
 * aes_check_keylen().
 */
static int aes_set_key_common(struct crypto_aes_ctx *ctx,
                              const u8 *in_key, unsigned int key_len)
{
        int ret;

        if (!crypto_simd_usable())
                return aes_expandkey(ctx, in_key, key_len);

        ret = aes_check_keylen(key_len);
        if (ret == 0) {
                kernel_fpu_begin();
                aesni_set_key(ctx, in_key, key_len);
                kernel_fpu_end();
        }

        return ret;
}

/*
 * setkey handler for the single-block cipher: locate the aligned AES context
 * of @tfm and expand @in_key into it.
 */
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
                       unsigned int key_len)
{
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));

        return aes_set_key_common(ctx, in_key, key_len);
}

/*
 * Encrypt one block from @src into @dst.
 *
 * Uses aesni_enc() under kernel_fpu_begin()/kernel_fpu_end() when SIMD is
 * usable, and falls back to aes_encrypt() otherwise.
 */
static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));

        if (crypto_simd_usable()) {
                kernel_fpu_begin();
                aesni_enc(ctx, dst, src);
                kernel_fpu_end();
                return;
        }

        aes_encrypt(ctx, dst, src);
}

/*
 * Decrypt one block from @src into @dst.
 *
 * Uses aesni_dec() under kernel_fpu_begin()/kernel_fpu_end() when SIMD is
 * usable, and falls back to aes_decrypt() otherwise.
 */
static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));

        if (crypto_simd_usable()) {
                kernel_fpu_begin();
                aesni_dec(ctx, dst, src);
                kernel_fpu_end();
                return;
        }

        aes_decrypt(ctx, dst, src);
}

/*
 * setkey handler shared by the single-key skcipher modes: expand @key into
 * the aligned AES context of @tfm.
 */
static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
                                 unsigned int len)
{
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));

        return aes_set_key_common(ctx, key, len);
}

/*
 * ECB encryption of an skcipher request.
 *
 * Each walk step hands the whole-block part of the mapped span to
 * aesni_ecb_enc() and reports the leftover bytes to skcipher_walk_done().
 * Returns the status of the last walk call.
 */
static int ecb_encrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
        struct skcipher_walk walk;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        while (walk.nbytes) {
                unsigned int whole = walk.nbytes & AES_BLOCK_MASK;
                unsigned int left = walk.nbytes & (AES_BLOCK_SIZE - 1);

                kernel_fpu_begin();
                aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
                              whole);
                kernel_fpu_end();

                err = skcipher_walk_done(&walk, left);
        }

        return err;
}

/*
 * ECB decryption of an skcipher request.
 *
 * Each walk step hands the whole-block part of the mapped span to
 * aesni_ecb_dec() and reports the leftover bytes to skcipher_walk_done().
 * Returns the status of the last walk call.
 */
static int ecb_decrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
        struct skcipher_walk walk;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        while (walk.nbytes) {
                unsigned int whole = walk.nbytes & AES_BLOCK_MASK;
                unsigned int left = walk.nbytes & (AES_BLOCK_SIZE - 1);

                kernel_fpu_begin();
                aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
                              whole);
                kernel_fpu_end();

                err = skcipher_walk_done(&walk, left);
        }

        return err;
}

/*
 * CBC encryption of an skcipher request.
 *
 * Each walk step hands the whole-block part of the mapped span, together with
 * the walk's IV, to aesni_cbc_enc() and reports the leftover bytes to
 * skcipher_walk_done().  Returns the status of the last walk call.
 */
static int cbc_encrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
        struct skcipher_walk walk;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        while (walk.nbytes) {
                unsigned int whole = walk.nbytes & AES_BLOCK_MASK;
                unsigned int left = walk.nbytes & (AES_BLOCK_SIZE - 1);

                kernel_fpu_begin();
                aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
                              whole, walk.iv);
                kernel_fpu_end();

                err = skcipher_walk_done(&walk, left);
        }

        return err;
}

/*
 * CBC decryption of an skcipher request.
 *
 * Each walk step hands the whole-block part of the mapped span, together with
 * the walk's IV, to aesni_cbc_dec() and reports the leftover bytes to
 * skcipher_walk_done().  Returns the status of the last walk call.
 */
static int cbc_decrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
        struct skcipher_walk walk;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        while (walk.nbytes) {
                unsigned int whole = walk.nbytes & AES_BLOCK_MASK;
                unsigned int left = walk.nbytes & (AES_BLOCK_SIZE - 1);

                kernel_fpu_begin();
                aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
                              whole, walk.iv);
                kernel_fpu_end();

                err = skcipher_walk_done(&walk, left);
        }

        return err;
}

static int cts_cbc_encrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
        int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
        struct scatterlist *src = req->src, *dst = req->dst;
        struct scatterlist sg_src[2], sg_dst[2];
        struct skcipher_request subreq;
        struct skcipher_walk walk;
        int err;

        skcipher_request_set_tfm(&subreq, tfm);
        skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
                                      NULL, NULL);

        if (req->cryptlen <= AES_BLOCK_SIZE) {
                if (req->cryptlen < AES_BLOCK_SIZE)
                        return -EINVAL;
                cbc_blocks = 1;
        }

        if (cbc_blocks > 0) {
                skcipher_request_set_crypt(&subreq, req->src, req->dst,
                                           cbc_blocks * AES_BLOCK_SIZE,
                                           req->iv);

                err = cbc_encrypt(&subreq);
                if (err)
                        return err;

                if (req->cryptlen == AES_BLOCK_SIZE)
                        return 0;

                dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
                if (req->dst != req->src)
                        dst = scatterwalk_ffwd(sg_dst, req->dst,
                                               subreq.cryptlen);
        }

        /* handle ciphertext stealing */
        skcipher_request_set_crypt(&subreq, src, dst,
                                   req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
                                   req->iv);

        err = skcipher_walk_virt(&walk, &subreq, false);
        if (err)
                return err;

        kernel_fpu_begin();
        aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
                          walk.nbytes, walk.iv);
        kernel_fpu_end();

        return skcipher_walk_done(&walk, 0);
}

static int cts_cbc_decrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
        int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
        struct scatterlist *src = req->src, *dst = req->dst;
        struct scatterlist sg_src[2], sg_dst[2];
        struct skcipher_request subreq;
        struct skcipher_walk walk;
        int err;

        skcipher_request_set_tfm(&subreq, tfm);
        skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
                                      NULL, NULL);

        if (req->cryptlen <= AES_BLOCK_SIZE) {
                if (req->cryptlen < AES_BLOCK_SIZE)
                        return -EINVAL;
                cbc_blocks = 1;
        }

        if (cbc_blocks > 0) {
                skcipher_request_set_crypt(&subreq, req->src, req->dst,
                                           cbc_blocks * AES_BLOCK_SIZE,
                                           req->iv);

                err = cbc_decrypt(&subreq);
                if (err)
                        return err;

                if (req->cryptlen == AES_BLOCK_SIZE)
                        return 0;

                dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
                if (req->dst != req->src)
                        dst = scatterwalk_ffwd(sg_dst, req->dst,
                                               subreq.cryptlen);
        }

        /* handle ciphertext stealing */
        skcipher_request_set_crypt(&subreq, src, dst,
                                   req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
                                   req->iv);

        err = skcipher_walk_virt(&walk, &subreq, false);
        if (err)
                return err;

        kernel_fpu_begin();
        aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
                          walk.nbytes, walk.iv);
        kernel_fpu_end();

        return skcipher_walk_done(&walk, 0);
}

#ifdef CONFIG_X86_64
/*
 * CTR mode, non-AVX version.
 *
 * Whole blocks in each walk step go to aesni_ctr_enc().  A trailing partial
 * block is only processed on the step that covers the rest of the request
 * (walk.nbytes == walk.total): one keystream block is generated from the
 * current IV with aesni_enc(), XORed over the remaining bytes, and the IV is
 * incremented.
 */
static int ctr_crypt_aesni(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
        u8 keystream[AES_BLOCK_SIZE];
        struct skcipher_walk walk;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        while (walk.nbytes > 0) {
                unsigned int whole = walk.nbytes & AES_BLOCK_MASK;
                unsigned int left = walk.nbytes & ~AES_BLOCK_MASK;

                kernel_fpu_begin();
                if (whole)
                        aesni_ctr_enc(ctx, walk.dst.virt.addr,
                                      walk.src.virt.addr, whole, walk.iv);

                if (left > 0 && walk.nbytes == walk.total) {
                        /* Final partial block: XOR with one keystream block. */
                        aesni_enc(ctx, keystream, walk.iv);
                        crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - left,
                                       walk.src.virt.addr + walk.nbytes - left,
                                       keystream, left);
                        crypto_inc(walk.iv, AES_BLOCK_SIZE);
                        left = 0;
                }
                kernel_fpu_end();

                err = skcipher_walk_done(&walk, left);
        }

        return err;
}
#endif

/*
 * setkey handler for XTS.
 *
 * After xts_verify_key() accepts the key, it is split in two equal halves:
 * the first half is expanded into crypt_ctx and the second into tweak_ctx.
 * Returns 0 on success or the first error encountered.
 */
static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
                            unsigned int keylen)
{
        struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
        int err;

        err = xts_verify_key(tfm, key, keylen);
        if (err)
                return err;

        keylen /= 2;

        /* Data-encryption key comes first ... */
        err = aes_set_key_common(&ctx->crypt_ctx, key, keylen);
        if (!err) {
                /* ... followed by the tweak key. */
                err = aes_set_key_common(&ctx->tweak_ctx, key + keylen,
                                         keylen);
        }

        return err;
}

/*
 * Callback types used by xts_crypt() and xts_crypt_slowpath().
 *
 * xts_encrypt_iv_func: encrypts @iv in place with the tweak key.
 * xts_crypt_func: processes @len bytes from @src into @dst with the data key
 * and the running @tweak.  Note the (src, dst) order, which differs from the
 * (out, in) order of the aesni_* routines.
 */
typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
                                    u8 iv[AES_BLOCK_SIZE]);
typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
                               const u8 *src, u8 *dst, int len,
                               u8 tweak[AES_BLOCK_SIZE]);

/*
 * This handles cases where the source and/or destination span pages.
 *
 * @req:        the request to process; xts_crypt() has already encrypted the
 *              tweak in req->iv before calling here.
 * @crypt_func: routine that encrypts or decrypts one contiguous span.
 *
 * Returns 0 on success or the error from the skcipher walk.
 */
static noinline int
xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
        int tail = req->cryptlen % AES_BLOCK_SIZE;
        struct scatterlist sg_src[2], sg_dst[2];
        struct skcipher_request subreq;
        struct skcipher_walk walk;
        struct scatterlist *src, *dst;
        int err;

        /*
         * If the message length isn't divisible by the AES block size, then
         * separate off the last full block and the partial block.  This ensures
         * that they are processed in the same call to the assembly function,
         * which is required for ciphertext stealing.
         */
        if (tail) {
                skcipher_request_set_tfm(&subreq, tfm);
                skcipher_request_set_callback(&subreq,
                                              skcipher_request_flags(req),
                                              NULL, NULL);
                skcipher_request_set_crypt(&subreq, req->src, req->dst,
                                           req->cryptlen - tail - AES_BLOCK_SIZE,
                                           req->iv);
                /* From here on, req refers to the shortened on-stack copy. */
                req = &subreq;
        }

        err = skcipher_walk_virt(&walk, req, false);

        /* Bulk pass: whole blocks only, one mapped span at a time. */
        while (walk.nbytes) {
                kernel_fpu_begin();
                (*crypt_func)(&ctx->crypt_ctx,
                              walk.src.virt.addr, walk.dst.virt.addr,
                              walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv);
                kernel_fpu_end();
                err = skcipher_walk_done(&walk,
                                         walk.nbytes & (AES_BLOCK_SIZE - 1));
        }

        if (err || !tail)
                return err;

        /* Do ciphertext stealing with the last full block and partial block. */

        /* req is subreq here, so req->cryptlen is the length already done. */
        dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
        if (req->dst != req->src)
                dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);

        skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
                                   req->iv);

        err = skcipher_walk_virt(&walk, req, false);
        if (err)
                return err;

        kernel_fpu_begin();
        (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr,
                      walk.nbytes, req->iv);
        kernel_fpu_end();

        return skcipher_walk_done(&walk, 0);
}

/*
 * Common XTS entry point; __always_inline to avoid indirect call in fastpath.
 *
 * @req:        request to process; fails with -EINVAL if shorter than one
 *              AES block.
 * @encrypt_iv: encrypts req->iv in place with the tweak key.
 * @crypt_func: encrypts or decrypts one contiguous span with the data key.
 *
 * Returns 0 from the fast path, otherwise whatever xts_crypt_slowpath()
 * returns.
 */
static __always_inline int
xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv,
          xts_crypt_func crypt_func)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);

        if (unlikely(req->cryptlen < AES_BLOCK_SIZE))
                return -EINVAL;

        kernel_fpu_begin();
        encrypt_iv(&ctx->tweak_ctx, req->iv);

        /*
         * Virtually all real XTS messages are 512 or 4096 bytes long and sit
         * in a single scatterlist element.  That case is served directly
         * below with no walk at all.  The shortcut is restricted to 64-bit
         * builds because it assumes no page mapping is needed.  Everything
         * else is left to the slow path.
         */
        if (!IS_ENABLED(CONFIG_X86_64) ||
            unlikely(req->src->length < req->cryptlen ||
                     req->dst->length < req->cryptlen)) {
                kernel_fpu_end();
                return xts_crypt_slowpath(req, crypt_func);
        }

        crypt_func(&ctx->crypt_ctx, sg_virt(req->src), sg_virt(req->dst),
                   req->cryptlen, req->iv);
        kernel_fpu_end();

        return 0;
}

/*
 * xts_encrypt_iv_func implementation for the plain AES-NI algorithms:
 * encrypts @iv in place with @tweak_key using aesni_enc().
 */
static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
                                 u8 iv[AES_BLOCK_SIZE])
{
        aesni_enc(tweak_key, iv, iv);
}

/*
 * xts_crypt_func adapter around aesni_xts_enc(): swaps the (src, dst)
 * argument order of the callback type into the (out, in) order the
 * aesni_* routine takes.
 */
static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
                              const u8 *src, u8 *dst, int len,
                              u8 tweak[AES_BLOCK_SIZE])
{
        aesni_xts_enc(key, dst, src, len, tweak);
}

/*
 * xts_crypt_func adapter around aesni_xts_dec(): swaps the (src, dst)
 * argument order of the callback type into the (out, in) order the
 * aesni_* routine takes.
 */
static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
                              const u8 *src, u8 *dst, int len,
                              u8 tweak[AES_BLOCK_SIZE])
{
        aesni_xts_dec(key, dst, src, len, tweak);
}

/* .encrypt handler of "xts-aes-aesni": xts_crypt() with the AES-NI helpers. */
static int xts_encrypt_aesni(struct skcipher_request *req)
{
        return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt);
}

/*
 * .decrypt handler of "xts-aes-aesni".  The tweak is still produced by
 * encrypting the IV; only the data routine differs from the encrypt side.
 */
static int xts_decrypt_aesni(struct skcipher_request *req)
{
        return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt);
}

/*
 * Algorithm descriptor for the single-block "aes" cipher, driver name
 * "aes-aesni", backed by aes_set_key(), aesni_encrypt() and aesni_decrypt().
 */
static struct crypto_alg aesni_cipher_alg = {
        .cra_name                       = "aes",
        .cra_driver_name                = "aes-aesni",
        .cra_priority                   = 300,
        .cra_flags                      = CRYPTO_ALG_TYPE_CIPHER,
        .cra_blocksize                  = AES_BLOCK_SIZE,
        .cra_ctxsize                    = CRYPTO_AES_CTX_SIZE,
        .cra_module                     = THIS_MODULE,
        .cra_u.cipher.cia_min_keysize   = AES_MIN_KEY_SIZE,
        .cra_u.cipher.cia_max_keysize   = AES_MAX_KEY_SIZE,
        .cra_u.cipher.cia_setkey        = aes_set_key,
        .cra_u.cipher.cia_encrypt       = aesni_encrypt,
        .cra_u.cipher.cia_decrypt       = aesni_decrypt,
};

/*
 * skcipher descriptors for the plain AES-NI modes: ECB, CBC, CTS-CBC,
 * CTR (64-bit builds only) and XTS, in that order.
 */
static struct skcipher_alg aesni_skciphers[] = {
        {
                .base.cra_name          = "ecb(aes)",
                .base.cra_driver_name   = "ecb-aes-aesni",
                .base.cra_priority      = 400,
                .base.cra_blocksize     = AES_BLOCK_SIZE,
                .base.cra_ctxsize       = CRYPTO_AES_CTX_SIZE,
                .base.cra_module        = THIS_MODULE,
                .min_keysize            = AES_MIN_KEY_SIZE,
                .max_keysize            = AES_MAX_KEY_SIZE,
                .setkey                 = aesni_skcipher_setkey,
                .encrypt                = ecb_encrypt,
                .decrypt                = ecb_decrypt,
        },
        {
                .base.cra_name          = "cbc(aes)",
                .base.cra_driver_name   = "cbc-aes-aesni",
                .base.cra_priority      = 400,
                .base.cra_blocksize     = AES_BLOCK_SIZE,
                .base.cra_ctxsize       = CRYPTO_AES_CTX_SIZE,
                .base.cra_module        = THIS_MODULE,
                .min_keysize            = AES_MIN_KEY_SIZE,
                .max_keysize            = AES_MAX_KEY_SIZE,
                .ivsize                 = AES_BLOCK_SIZE,
                .setkey                 = aesni_skcipher_setkey,
                .encrypt                = cbc_encrypt,
                .decrypt                = cbc_decrypt,
        },
        {
                .base.cra_name          = "cts(cbc(aes))",
                .base.cra_driver_name   = "cts-cbc-aes-aesni",
                .base.cra_priority      = 400,
                .base.cra_blocksize     = AES_BLOCK_SIZE,
                .base.cra_ctxsize       = CRYPTO_AES_CTX_SIZE,
                .base.cra_module        = THIS_MODULE,
                .min_keysize            = AES_MIN_KEY_SIZE,
                .max_keysize            = AES_MAX_KEY_SIZE,
                .ivsize                 = AES_BLOCK_SIZE,
                .walksize               = 2 * AES_BLOCK_SIZE,
                .setkey                 = aesni_skcipher_setkey,
                .encrypt                = cts_cbc_encrypt,
                .decrypt                = cts_cbc_decrypt,
        },
#ifdef CONFIG_X86_64
        {
                .base.cra_name          = "ctr(aes)",
                .base.cra_driver_name   = "ctr-aes-aesni",
                .base.cra_priority      = 400,
                .base.cra_blocksize     = 1,
                .base.cra_ctxsize       = CRYPTO_AES_CTX_SIZE,
                .base.cra_module        = THIS_MODULE,
                .min_keysize            = AES_MIN_KEY_SIZE,
                .max_keysize            = AES_MAX_KEY_SIZE,
                .ivsize                 = AES_BLOCK_SIZE,
                .chunksize              = AES_BLOCK_SIZE,
                .setkey                 = aesni_skcipher_setkey,
                .encrypt                = ctr_crypt_aesni,
                .decrypt                = ctr_crypt_aesni,
        },
#endif
        {
                .base.cra_name          = "xts(aes)",
                .base.cra_driver_name   = "xts-aes-aesni",
                .base.cra_priority      = 401,
                .base.cra_blocksize     = AES_BLOCK_SIZE,
                .base.cra_ctxsize       = XTS_AES_CTX_SIZE,
                .base.cra_module        = THIS_MODULE,
                .min_keysize            = 2 * AES_MIN_KEY_SIZE,
                .max_keysize            = 2 * AES_MAX_KEY_SIZE,
                .ivsize                 = AES_BLOCK_SIZE,
                .walksize               = 2 * AES_BLOCK_SIZE,
                .setkey                 = xts_setkey_aesni,
                .encrypt                = xts_encrypt_aesni,
                .decrypt                = xts_decrypt_aesni,
        },
};

#ifdef CONFIG_X86_64
/*
 * IV-encryption routine defined outside this file.  Its signature matches
 * xts_encrypt_iv_func, and the xts_{en,de}crypt_##suffix helpers generated
 * by DEFINE_AVX_SKCIPHER_ALGS pass it to xts_crypt().
 */
asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
                                   u8 iv[AES_BLOCK_SIZE]);

/*
 * Common CTR implementation; __always_inline to avoid indirect call.
 *
 * @req:        request to process.  req->iv is read as a 128-bit big-endian
 *              counter and is rewritten with the advanced counter on return.
 * @ctr64_func: routine that processes @len bytes starting at the counter in
 *              @le_ctr, where le_ctr[0] is the low 64 bits and le_ctr[1] the
 *              high 64 bits.  Per the comment in the overflow branch, it does
 *              not handle a carry out of the low 64 bits.
 *
 * Returns the status of the last skcipher walk call.
 */
static __always_inline int
ctr_crypt(struct skcipher_request *req,
          void (*ctr64_func)(const struct crypto_aes_ctx *key,
                             const u8 *src, u8 *dst, int len,
                             const u64 le_ctr[2]))
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm));
        unsigned int nbytes, p1_nbytes, nblocks;
        struct skcipher_walk walk;
        u64 le_ctr[2];
        u64 ctr64;
        int err;

        /* Split the big-endian IV into native low/high 64-bit halves. */
        ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]);
        le_ctr[1] = get_unaligned_be64(&req->iv[0]);

        err = skcipher_walk_virt(&walk, req, false);

        while ((nbytes = walk.nbytes) != 0) {
                if (nbytes < walk.total) {
                        /* Not the end yet, so keep the length block-aligned. */
                        nbytes = round_down(nbytes, AES_BLOCK_SIZE);
                        nblocks = nbytes / AES_BLOCK_SIZE;
                } else {
                        /* It's the end, so include any final partial block. */
                        nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
                }
                /* ctr64 now holds the low counter half after this step. */
                ctr64 += nblocks;

                kernel_fpu_begin();
                /* A wrapped sum is smaller than the addend. */
                if (likely(ctr64 >= nblocks)) {
                        /* The low 64 bits of the counter won't overflow. */
                        (*ctr64_func)(key, walk.src.virt.addr,
                                      walk.dst.virt.addr, nbytes, le_ctr);
                } else {
                        /*
                         * The low 64 bits of the counter will overflow.  The
                         * assembly doesn't handle this case, so split the
                         * operation into two at the point where the overflow
                         * will occur.  After the first part, add the carry bit.
                         */
                        /* nblocks - ctr64 is the block count before the wrap. */
                        p1_nbytes = min_t(unsigned int, nbytes,
                                          (nblocks - ctr64) * AES_BLOCK_SIZE);
                        (*ctr64_func)(key, walk.src.virt.addr,
                                      walk.dst.virt.addr, p1_nbytes, le_ctr);
                        le_ctr[0] = 0;
                        le_ctr[1]++;
                        (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes,
                                      walk.dst.virt.addr + p1_nbytes,
                                      nbytes - p1_nbytes, le_ctr);
                }
                kernel_fpu_end();
                /* Resync the low half for the next step. */
                le_ctr[0] = ctr64;

                err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
        }

        /* Store the advanced counter back into the IV, big-endian. */
        put_unaligned_be64(ctr64, &req->iv[8]);
        put_unaligned_be64(le_ctr[1], &req->iv[0]);

        return err;
}

/*
 * Common XCTR implementation; __always_inline to avoid indirect call.
 *
 * @req:       request to process.
 * @xctr_func: routine that processes @len bytes using @iv and the block
 *             counter @ctr.
 *
 * The block counter starts at 1 and advances by the number of blocks handed
 * to @xctr_func on each walk step.  Every step except the one covering the
 * rest of the request is trimmed to whole blocks.  Returns the status of the
 * last skcipher walk call.
 */
static __always_inline int
xctr_crypt(struct skcipher_request *req,
           void (*xctr_func)(const struct crypto_aes_ctx *key,
                             const u8 *src, u8 *dst, int len,
                             const u8 iv[AES_BLOCK_SIZE], u64 ctr))
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm));
        struct skcipher_walk walk;
        u64 ctr = 1;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        while (walk.nbytes != 0) {
                unsigned int chunk = walk.nbytes;

                /* More data follows: stop at a block boundary. */
                if (chunk < walk.total)
                        chunk = round_down(chunk, AES_BLOCK_SIZE);

                kernel_fpu_begin();
                xctr_func(key, walk.src.virt.addr, walk.dst.virt.addr,
                          chunk, req->iv, ctr);
                kernel_fpu_end();

                ctr += DIV_ROUND_UP(chunk, AES_BLOCK_SIZE);
                err = skcipher_walk_done(&walk, walk.nbytes - chunk);
        }

        return err;
}

#define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority)               \
                                                                               \
asmlinkage void                                                                       \
aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,      \
                         u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]);               \
asmlinkage void                                                                       \
aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,      \
                         u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]);               \
                                                                               \
static int xts_encrypt_##suffix(struct skcipher_request *req)                       \
{                                                                               \
        return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix);   \
}                                                                               \
                                                                               \
static int xts_decrypt_##suffix(struct skcipher_request *req)                       \
{                                                                               \
        return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix);   \
}                                                                               \
                                                                               \
asmlinkage void                                                                       \
aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,                       \
                         const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\
                                                                               \
static int ctr_crypt_##suffix(struct skcipher_request *req)                       \
{                                                                               \
        return ctr_crypt(req, aes_ctr64_crypt_##suffix);                       \
}                                                                               \
                                                                               \
asmlinkage void                                                                       \
aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,                       \
                        const u8 *src, u8 *dst, int len,                       \
                        const u8 iv[AES_BLOCK_SIZE], u64 ctr);                       \
                                                                               \
static int xctr_crypt_##suffix(struct skcipher_request *req)                       \
{                                                                               \
        return xctr_crypt(req, aes_xctr_crypt_##suffix);                       \
}                                                                               \
                                                                               \
static struct skcipher_alg skcipher_algs_##suffix[] = {{                       \
        .base.cra_name                = "xts(aes)",                                       \
        .base.cra_driver_name        = "xts-aes-" driver_name_suffix,               \
        .base.cra_priority        = priority,                                       \
        .base.cra_blocksize        = AES_BLOCK_SIZE,                               \
        .base.cra_ctxsize        = XTS_AES_CTX_SIZE,                               \
        .base.cra_module        = THIS_MODULE,                                       \
        .min_keysize                = 2 * AES_MIN_KEY_SIZE,                               \
        .max_keysize                = 2 * AES_MAX_KEY_SIZE,                               \
        .ivsize                        = AES_BLOCK_SIZE,                               \
        .walksize                = 2 * AES_BLOCK_SIZE,                               \
        .setkey                        = xts_setkey_aesni,                               \
        .encrypt                = xts_encrypt_##suffix,                               \
        .decrypt                = xts_decrypt_##suffix,                               \
}, {                                                                               \
        .base.cra_name                = "ctr(aes)",                                       \
        .base.cra_driver_name        = "ctr-aes-" driver_name_suffix,               \
        .base.cra_priority        = priority,                                       \
        .base.cra_blocksize        = 1,                                               \
        .base.cra_ctxsize        = CRYPTO_AES_CTX_SIZE,                               \
        .base.cra_module        = THIS_MODULE,                                       \
        .min_keysize                = AES_MIN_KEY_SIZE,                               \
        .max_keysize                = AES_MAX_KEY_SIZE,                               \
        .ivsize                        = AES_BLOCK_SIZE,                               \
        .chunksize                = AES_BLOCK_SIZE,                               \
        .setkey                        = aesni_skcipher_setkey,                       \
        .encrypt                = ctr_crypt_##suffix,                               \
        .decrypt                = ctr_crypt_##suffix,                               \
}, {                                                                               \
        .base.cra_name                = "xctr(aes)",                                       \
        .base.cra_driver_name        = "xctr-aes-" driver_name_suffix,               \
        .base.cra_priority        = priority,                                       \
        .base.cra_blocksize        = 1,                                               \
        .base.cra_ctxsize        = CRYPTO_AES_CTX_SIZE,                               \
        .base.cra_module        = THIS_MODULE,                                       \
        .min_keysize                = AES_MIN_KEY_SIZE,                               \
        .max_keysize                = AES_MAX_KEY_SIZE,                               \
        .ivsize                        = AES_BLOCK_SIZE,                               \
        .chunksize                = AES_BLOCK_SIZE,                               \
        .setkey                        = aesni_skcipher_setkey,                       \
        .encrypt                = xctr_crypt_##suffix,                               \
        .decrypt                = xctr_crypt_##suffix,                               \
}}

/*
 * Instantiate the XTS, CTR, and XCTR skcipher wrappers and the
 * skcipher_algs_<suffix> table for each AVX-based implementation.  The
 * arguments are the identifier suffix, the driver-name suffix, and the
 * cra_priority that is given to all three algorithms of that implementation.
 */
DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500);
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600);
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800);

/*
 * The common part of the x86_64 AES-GCM key struct.
 *
 * It is embedded as the 'base' member at the start of struct aes_gcm_key_aesni
 * and struct aes_gcm_key_avx10.  The generic helpers pass around a pointer to
 * this common part, and the AES_GCM_KEY_AESNI() / AES_GCM_KEY_AVX10() macros
 * convert it back to the implementation-specific struct.
 */
struct aes_gcm_key {
        /* Expanded AES key and the AES key length in bytes */
        struct crypto_aes_ctx aes_key;

        /*
         * RFC4106 nonce (used only by the rfc4106 algorithms).  gcm_setkey()
         * reads it from the last 4 bytes of the raw key as a big-endian value,
         * so it is stored already byte-swapped and gcm_crypt() can copy it
         * directly into the little-endian counter.
         */
        u32 rfc4106_nonce;
};

/*
 * Key struct used by the AES-NI implementations of AES-GCM.
 *
 * The field offsets are part of the contract with the assembly code;
 * gcm_setkey() checks them at build time.
 */
struct aes_gcm_key_aesni {
        /*
         * Common part of the key.  The assembly code requires 16-byte alignment
         * for the round keys; we get this by them being located at the start of
         * the struct and the whole struct being 16-byte aligned.
         */
        struct aes_gcm_key base;

        /*
         * Powers of the hash key H^8 through H^1.  These are 128-bit values.
         * They all have an extra factor of x^-1 and are byte-reversed.  16-byte
         * alignment is required by the assembly code.
         */
        u64 h_powers[8][2] __aligned(16);

        /*
         * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd
         * together.  It's used for Karatsuba multiplication.  16-byte alignment
         * is required by the assembly code.
         */
        u64 h_powers_xored[8] __aligned(16);

        /*
         * H^1 times x^64 (and also the usual extra factor of x^-1).  16-byte
         * alignment is required by the assembly code.
         */
        u64 h_times_x64[2] __aligned(16);
};
/* Convert a pointer to the common part into the enclosing AES-NI key struct. */
#define AES_GCM_KEY_AESNI(key)        \
        container_of((key), struct aes_gcm_key_aesni, base)
/*
 * Space to reserve for an AES-NI key: the struct itself plus slack so that
 * aes_gcm_key_get() can round the pointer up to a 16-byte boundary.  The slack
 * term evaluates to zero when CRYPTO_MINALIGN is a power of two >= 16.
 * NOTE(review): presumably this is what gets passed as the tfm context size
 * when the algorithms are registered -- confirm at the DEFINE_GCM_ALGS() users.
 */
#define AES_GCM_KEY_AESNI_SIZE        \
        (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))

/*
 * Key struct used by the VAES + AVX10 implementations of AES-GCM.
 *
 * The field offsets are part of the contract with the assembly code;
 * gcm_setkey() checks them at build time.
 */
struct aes_gcm_key_avx10 {
        /*
         * Common part of the key.  The assembly code prefers 16-byte alignment
         * for the round keys; we get this by them being located at the start of
         * the struct and the whole struct being 64-byte aligned.
         */
        struct aes_gcm_key base;

        /*
         * Powers of the hash key H^16 through H^1.  These are 128-bit values.
         * They all have an extra factor of x^-1 and are byte-reversed.  This
         * array is aligned to a 64-byte boundary to make it naturally aligned
         * for 512-bit loads, which can improve performance.  (The assembly code
         * doesn't *need* the alignment; this is just an optimization.)
         */
        u64 h_powers[16][2] __aligned(64);

        /*
         * Three padding blocks required by the assembly code.  The portable
         * path of gcm_setkey() zeroes them explicitly.
         */
        u64 padding[3][2];
};
/* Convert a pointer to the common part into the enclosing AVX10 key struct. */
#define AES_GCM_KEY_AVX10(key)        \
        container_of((key), struct aes_gcm_key_avx10, base)
/*
 * Space to reserve for an AVX10 key: the struct itself plus slack so that
 * aes_gcm_key_get() can round the pointer up to a 64-byte boundary.  The slack
 * term evaluates to zero when CRYPTO_MINALIGN is a power of two >= 64.
 * NOTE(review): presumably this is what gets passed as the tfm context size
 * when the algorithms are registered -- confirm at the DEFINE_GCM_ALGS() users.
 */
#define AES_GCM_KEY_AVX10_SIZE        \
        (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1)))

/*
 * These flags are passed to the AES-GCM helper functions to specify the
 * specific version of AES-GCM (RFC4106 or not), whether it's encryption or
 * decryption, and which assembly functions should be called.  Assembly
 * functions are selected using flags instead of function pointers to avoid
 * indirect calls (which are very expensive on x86) regardless of inlining.
 *
 * When none of FLAG_AVX, FLAG_AVX10_256, and FLAG_AVX10_512 is set, the plain
 * *_aesni functions are called.  The dispatch helpers test FLAG_AVX10_512
 * first, then FLAG_AVX10_256, then FLAG_AVX.
 */
#define FLAG_RFC4106        BIT(0)        /* RFC4106 variant rather than plain GCM */
#define FLAG_ENC        BIT(1)        /* encryption; clear means decryption */
#define FLAG_AVX        BIT(2)        /* use the *_aesni_avx functions */
#define FLAG_AVX10_256        BIT(3)        /* use the VAES+AVX10 functions, 256-bit */
#define FLAG_AVX10_512        BIT(4)        /* use the VAES+AVX10 functions, 512-bit */

/*
 * Return the AES-GCM key stored in the context of @tfm.  The context pointer
 * is rounded up to the alignment of the key struct selected by @flags:
 * 64 bytes for the AVX10 implementations, 16 bytes otherwise.
 */
static inline struct aes_gcm_key *
aes_gcm_key_get(struct crypto_aead *tfm, int flags)
{
        const unsigned int align =
                (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) ? 64 : 16;

        return PTR_ALIGN(crypto_aead_ctx(tfm), align);
}

/* Assembly routines that fill in the precomputed part of the key. */
asmlinkage void aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key);
asmlinkage void aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key);
asmlinkage void
aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key);
asmlinkage void
aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key);

/*
 * Call the key precomputation assembly routine selected by @flags on @key.
 *
 * Both AVX10 implementations share one key format, which keeps the assembly
 * side simpler, so a single routine using 256-bit vectors would be enough.  A
 * 512-bit routine is provided as well because the structure of the assembly
 * code makes that straightforward, and it fits well since the total size of
 * the key powers is a multiple of 512 bits.
 *
 * A similar situation applies to the AES-NI implementations.
 */
static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
{
        if (flags & FLAG_AVX10_512) {
                aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key));
                return;
        }
        if (flags & FLAG_AVX10_256) {
                aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key));
                return;
        }
        if (flags & FLAG_AVX) {
                aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
                return;
        }
        aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key));
}

/* Assembly routines that pass associated data through GHASH. */
asmlinkage void
aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
                         u8 ghash_acc[16], const u8 *aad, int aadlen);
asmlinkage void
aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key,
                             u8 ghash_acc[16], const u8 *aad, int aadlen);
asmlinkage void
aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
                              u8 ghash_acc[16], const u8 *aad, int aadlen);

/*
 * Update @ghash_acc with @aadlen bytes of associated data at @aad, using the
 * assembly routine selected by @flags.  Both AVX10 flags map to the same
 * routine.
 */
static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
                               const u8 *aad, int aadlen, int flags)
{
        if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
                aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key),
                                              ghash_acc, aad, aadlen);
                return;
        }
        if (flags & FLAG_AVX) {
                aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key),
                                             ghash_acc, aad, aadlen);
                return;
        }
        aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key),
                                 ghash_acc, aad, aadlen);
}

/* Assembly routines that encrypt data and update GHASH. */
asmlinkage void
aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key,
                         const u32 le_ctr[4], u8 ghash_acc[16],
                         const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key,
                             const u32 le_ctr[4], u8 ghash_acc[16],
                             const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
                                  const u32 le_ctr[4], u8 ghash_acc[16],
                                  const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
                                  const u32 le_ctr[4], u8 ghash_acc[16],
                                  const u8 *src, u8 *dst, int datalen);

/* Assembly routines that decrypt data and update GHASH. */
asmlinkage void
aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
                         const u32 le_ctr[4], u8 ghash_acc[16],
                         const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key,
                             const u32 le_ctr[4], u8 ghash_acc[16],
                             const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
                                  const u32 le_ctr[4], u8 ghash_acc[16],
                                  const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
                                  const u32 le_ctr[4], u8 ghash_acc[16],
                                  const u8 *src, u8 *dst, int datalen);

/*
 * En/decrypt @datalen bytes from @src to @dst and update @ghash_acc, using the
 * assembly routine selected by @flags.  The implementation is chosen first
 * (FLAG_AVX10_512, then FLAG_AVX10_256, then FLAG_AVX, else plain AES-NI), and
 * FLAG_ENC then picks the direction.
 *
 * __always_inline to optimize out the branches based on @flags
 */
static __always_inline void
aes_gcm_update(const struct aes_gcm_key *key,
               const u32 le_ctr[4], u8 ghash_acc[16],
               const u8 *src, u8 *dst, int datalen, int flags)
{
        const bool enc = flags & FLAG_ENC;

        if (flags & FLAG_AVX10_512) {
                const struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);

                if (enc)
                        aes_gcm_enc_update_vaes_avx10_512(k, le_ctr, ghash_acc,
                                                          src, dst, datalen);
                else
                        aes_gcm_dec_update_vaes_avx10_512(k, le_ctr, ghash_acc,
                                                          src, dst, datalen);
        } else if (flags & FLAG_AVX10_256) {
                const struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);

                if (enc)
                        aes_gcm_enc_update_vaes_avx10_256(k, le_ctr, ghash_acc,
                                                          src, dst, datalen);
                else
                        aes_gcm_dec_update_vaes_avx10_256(k, le_ctr, ghash_acc,
                                                          src, dst, datalen);
        } else if (flags & FLAG_AVX) {
                const struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);

                if (enc)
                        aes_gcm_enc_update_aesni_avx(k, le_ctr, ghash_acc,
                                                     src, dst, datalen);
                else
                        aes_gcm_dec_update_aesni_avx(k, le_ctr, ghash_acc,
                                                     src, dst, datalen);
        } else {
                const struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);

                if (enc)
                        aes_gcm_enc_update_aesni(k, le_ctr, ghash_acc,
                                                 src, dst, datalen);
                else
                        aes_gcm_dec_update_aesni(k, le_ctr, ghash_acc,
                                                 src, dst, datalen);
        }
}

/* Assembly routines that finish computing the auth tag when encrypting. */
asmlinkage void
aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key,
                        const u32 le_ctr[4], u8 ghash_acc[16],
                        u64 total_aadlen, u64 total_datalen);
asmlinkage void
aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key,
                            const u32 le_ctr[4], u8 ghash_acc[16],
                            u64 total_aadlen, u64 total_datalen);
asmlinkage void
aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
                             const u32 le_ctr[4], u8 ghash_acc[16],
                             u64 total_aadlen, u64 total_datalen);

/*
 * Finish an encryption by calling the finalization routine selected by @flags.
 * Both AVX10 flags map to the same routine.  gcm_crypt() reads the resulting
 * auth tag back out of @ghash_acc.
 *
 * __always_inline to optimize out the branches based on @flags
 */
static __always_inline void
aes_gcm_enc_final(const struct aes_gcm_key *key,
                  const u32 le_ctr[4], u8 ghash_acc[16],
                  u64 total_aadlen, u64 total_datalen, int flags)
{
        if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
                aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), le_ctr,
                                             ghash_acc, total_aadlen,
                                             total_datalen);
                return;
        }
        if (flags & FLAG_AVX) {
                aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr,
                                            ghash_acc, total_aadlen,
                                            total_datalen);
                return;
        }
        aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc,
                                total_aadlen, total_datalen);
}

/*
 * Assembly routines that finish computing the auth tag when decrypting and
 * compare it with the transmitted tag.
 */
asmlinkage bool __must_check
aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key,
                        const u32 le_ctr[4], const u8 ghash_acc[16],
                        u64 total_aadlen, u64 total_datalen,
                        const u8 tag[16], int taglen);
asmlinkage bool __must_check
aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key,
                            const u32 le_ctr[4], const u8 ghash_acc[16],
                            u64 total_aadlen, u64 total_datalen,
                            const u8 tag[16], int taglen);
asmlinkage bool __must_check
aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
                             const u32 le_ctr[4], const u8 ghash_acc[16],
                             u64 total_aadlen, u64 total_datalen,
                             const u8 tag[16], int taglen);

/*
 * Finish a decryption by calling the finalization routine selected by @flags
 * and return its result.  Both AVX10 flags map to the same routine.
 * gcm_crypt() treats a false return value as an authentication failure.
 *
 * __always_inline to optimize out the branches based on @flags
 */
static __always_inline bool __must_check
aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
                  u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
                  u8 tag[16], int taglen, int flags)
{
        if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
                const struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);

                return aes_gcm_dec_final_vaes_avx10(k, le_ctr, ghash_acc,
                                                    total_aadlen, total_datalen,
                                                    tag, taglen);
        }
        if (flags & FLAG_AVX) {
                const struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);

                return aes_gcm_dec_final_aesni_avx(k, le_ctr, ghash_acc,
                                                   total_aadlen, total_datalen,
                                                   tag, taglen);
        }
        return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
                                       ghash_acc, total_aadlen, total_datalen,
                                       tag, taglen);
}

/*
 * Validate the requested Integrity Check Value (aka the authentication tag)
 * length for the rfc4106 algorithms.  Only 8, 12, and 16 bytes are accepted.
 *
 * Returns 0 if @authsize is acceptable, -EINVAL otherwise.  @aead is unused.
 */
static int common_rfc4106_set_authsize(struct crypto_aead *aead,
                                       unsigned int authsize)
{
        if (authsize == 8 || authsize == 12 || authsize == 16)
                return 0;

        return -EINVAL;
}

/*
 * Validate the requested authentication tag length for the plain gcm(aes)
 * algorithms.  The accepted lengths are 4, 8, and 12 through 16 bytes.
 *
 * Returns 0 if @authsize is acceptable, -EINVAL otherwise.  @tfm is unused.
 */
static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
                                       unsigned int authsize)
{
        if (authsize == 4 || authsize == 8 ||
            (authsize >= 12 && authsize <= 16))
                return 0;

        return -EINVAL;
}

/*
 * This is the setkey function for the x86_64 implementations of AES-GCM.  It
 * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes
 * powers of the hash key.
 *
 * To comply with the crypto_aead API, this has to be usable in no-SIMD context.
 * For that reason, this function includes a portable C implementation of the
 * needed logic.  However, the portable C implementation is very slow, taking
 * about the same time as encrypting 37 KB of data.  To be ready for users that
 * may set a key even somewhat frequently, we therefore also include a SIMD
 * assembly implementation, expanding the AES key using AES-NI and precomputing
 * the hash key powers using PCLMULQDQ or VPCLMULQDQ.
 */
static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
                      unsigned int keylen, int flags)
{
        struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
        int err;

        if (flags & FLAG_RFC4106) {
                if (keylen < 4)
                        return -EINVAL;
                keylen -= 4;
                key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen);
        }

        /* The assembly code assumes the following offsets. */
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0);
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480);
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496);
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624);
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688);
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0);
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480);
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512);
        BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768);

        if (likely(crypto_simd_usable())) {
                err = aes_check_keylen(keylen);
                if (err)
                        return err;
                kernel_fpu_begin();
                aesni_set_key(&key->aes_key, raw_key, keylen);
                aes_gcm_precompute(key, flags);
                kernel_fpu_end();
        } else {
                static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = {
                        [0] = 0xc2, [15] = 1
                };
                static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = {
                        [7] = 1,
                };
                be128 h1 = {};
                be128 h;
                int i;

                err = aes_expandkey(&key->aes_key, raw_key, keylen);
                if (err)
                        return err;

                /* Encrypt the all-zeroes block to get the hash key H^1 */
                aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1);

                /* Compute H^1 * x^-1 */
                h = h1;
                gf128mul_lle(&h, (const be128 *)x_to_the_minus1);

                /* Compute the needed key powers */
                if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
                        struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);

                        for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
                                k->h_powers[i][0] = be64_to_cpu(h.b);
                                k->h_powers[i][1] = be64_to_cpu(h.a);
                                gf128mul_lle(&h, &h1);
                        }
                        memset(k->padding, 0, sizeof(k->padding));
                } else {
                        struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);

                        for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
                                k->h_powers[i][0] = be64_to_cpu(h.b);
                                k->h_powers[i][1] = be64_to_cpu(h.a);
                                k->h_powers_xored[i] = k->h_powers[i][0] ^
                                                       k->h_powers[i][1];
                                gf128mul_lle(&h, &h1);
                        }
                        gf128mul_lle(&h1, (const be128 *)x_to_the_63);
                        k->h_times_x64[0] = be64_to_cpu(h1.b);
                        k->h_times_x64[1] = be64_to_cpu(h1.a);
                }
        }
        return 0;
}

/*
 * Zero @ghash_acc, then feed all @assoclen bytes of associated data (a.k.a.
 * additional authenticated data) from @sg_src through the GHASH update
 * assembly function.  The caller must already have called kernel_fpu_begin().
 *
 * The assembly function requires that the length of any non-last segment of
 * associated data be a multiple of 16 bytes.  A 16-byte bounce buffer carries
 * partial blocks from one scatterlist step to the next to achieve that.
 */
static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16],
                              struct scatterlist *sg_src, unsigned int assoclen,
                              int flags)
{
        struct scatter_walk walk;
        unsigned int buffered = 0; /* Number of bytes currently held in buf */
        u8 buf[16];

        memset(ghash_acc, 0, 16);
        scatterwalk_start(&walk, sg_src);

        while (assoclen) {
                const unsigned int step = scatterwalk_next(&walk, assoclen);
                unsigned int avail = step;
                unsigned int n;
                const u8 *p = walk.addr;

                if (unlikely(buffered)) {
                        /* Top up the partial block carried over so far. */
                        n = min(avail, 16 - buffered);
                        memcpy(&buf[buffered], p, n);
                        buffered += n;
                        p += n;
                        avail -= n;
                        if (buffered == 16) {
                                aes_gcm_aad_update(key, ghash_acc, buf, 16,
                                                   flags);
                                buffered = 0;
                        }
                }
                /* Skipped while the carried-over block is still incomplete. */
                if (likely(!buffered)) {
                        n = avail;
                        /*
                         * NOTE(review): assoclen has not been decremented for
                         * this step yet, so this test holds on every
                         * iteration.  A trailing partial block is therefore
                         * always buffered and flushed after the loop.
                         */
                        if (unlikely(assoclen)) /* Not the last segment yet? */
                                n = round_down(n, 16);
                        aes_gcm_aad_update(key, ghash_acc, p, n, flags);
                        p += n;
                        avail -= n;
                        if (unlikely(avail)) {
                                memcpy(buf, p, avail);
                                buffered = avail;
                        }
                }

                scatterwalk_done_src(&walk, step);
                /* Briefly leave the FPU section if a reschedule is due. */
                if (need_resched()) {
                        kernel_fpu_end();
                        kernel_fpu_begin();
                }
                assoclen -= step;
        }

        /* Flush whatever is left in the bounce buffer as the last segment. */
        if (unlikely(buffered))
                aes_gcm_aad_update(key, ghash_acc, buf, buffered, flags);
}


/*
 * Common implementation of AES-GCM encryption and decryption for every variant
 * selected by @flags (FLAG_RFC4106, FLAG_ENC, and the FLAG_AVX* bits).
 *
 * Returns 0 on success, -EINVAL if FLAG_RFC4106 is set and req->assoclen is
 * neither 16 nor 20, -EBADMSG if decryption finds that the auth tag doesn't
 * match, or the error returned by the skcipher_walk functions.
 *
 * __always_inline to optimize out the branches based on @flags
 */
static __always_inline int
gcm_crypt(struct aead_request *req, int flags)
{
        struct crypto_aead *tfm = crypto_aead_reqtfm(req);
        const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
        unsigned int assoclen = req->assoclen;
        struct skcipher_walk walk;
        unsigned int nbytes;
        u8 ghash_acc[16]; /* GHASH accumulator */
        u32 le_ctr[4]; /* Counter in little-endian format */
        int taglen;
        int err;

        /* Initialize the counter and determine the associated data length. */
        /*
         * The data blocks start at counter value 2.
         * NOTE(review): presumably counter value 1 is reserved for the auth
         * tag as in standard GCM -- confirm against the assembly code.
         */
        le_ctr[0] = 2;
        if (flags & FLAG_RFC4106) {
                if (unlikely(assoclen != 16 && assoclen != 20))
                        return -EINVAL;
                /*
                 * Only the first req->assoclen - 8 bytes are passed through
                 * GHASH and counted in the finalization.
                 * NOTE(review): presumably the trailing 8 bytes are the IV
                 * that RFC4106 callers place after the real associated data --
                 * confirm against the rfc4106 callers.
                 */
                assoclen -= 8;
                le_ctr[1] = get_unaligned_be32(req->iv + 4);
                le_ctr[2] = get_unaligned_be32(req->iv + 0);
                le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */
        } else {
                le_ctr[1] = get_unaligned_be32(req->iv + 8);
                le_ctr[2] = get_unaligned_be32(req->iv + 4);
                le_ctr[3] = get_unaligned_be32(req->iv + 0);
        }

        /* Begin walking through the plaintext or ciphertext. */
        if (flags & FLAG_ENC)
                err = skcipher_walk_aead_encrypt(&walk, req, false);
        else
                err = skcipher_walk_aead_decrypt(&walk, req, false);
        if (err)
                return err;

        /*
         * Since the AES-GCM assembly code requires that at least three assembly
         * functions be called to process any message (this is needed to support
         * incremental updates cleanly), to reduce overhead we try to do all
         * three calls in the same kernel FPU section if possible.  We close the
         * section and start a new one if there are multiple data segments or if
         * rescheduling is needed while processing the associated data.
         */
        kernel_fpu_begin();

        /* Pass the associated data through GHASH. */
        gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags);

        /* En/decrypt the data and pass the ciphertext through GHASH. */
        while (unlikely((nbytes = walk.nbytes) < walk.total)) {
                /*
                 * Non-last segment.  In this case, the assembly function
                 * requires that the length be a multiple of 16 (AES_BLOCK_SIZE)
                 * bytes.  The needed buffering of up to 16 bytes is handled by
                 * the skcipher_walk.  Here we just need to round down to a
                 * multiple of 16.
                 */
                nbytes = round_down(nbytes, AES_BLOCK_SIZE);
                aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
                               walk.dst.virt.addr, nbytes, flags);
                le_ctr[0] += nbytes / AES_BLOCK_SIZE;
                /*
                 * The FPU section is closed around skcipher_walk_done(); on
                 * error we return with it already closed.
                 */
                kernel_fpu_end();
                err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
                if (err)
                        return err;
                kernel_fpu_begin();
        }
        /* Last segment: process all remaining data. */
        aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
                       walk.dst.virt.addr, nbytes, flags);
        /*
         * The low word of the counter isn't used by the finalize, so there's no
         * need to increment it here.
         */

        /* Finalize */
        taglen = crypto_aead_authsize(tfm);
        if (flags & FLAG_ENC) {
                /* Finish computing the auth tag. */
                aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen,
                                  req->cryptlen, flags);

                /* Store the computed auth tag in the dst scatterlist. */
                scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen +
                                         req->cryptlen, taglen, 1);
        } else {
                /* When decrypting, req->cryptlen includes the auth tag. */
                unsigned int datalen = req->cryptlen - taglen;
                u8 tag[16];

                /* Get the transmitted auth tag from the src scatterlist. */
                scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen,
                                         taglen, 0);
                /*
                 * Finish computing the auth tag and compare it to the
                 * transmitted one.  The assembly function does the actual tag
                 * comparison.  Here, just check the boolean result.
                 */
                if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen,
                                       datalen, tag, taglen, flags))
                        err = -EBADMSG;
        }
        kernel_fpu_end();
        /*
         * Complete the walk for the last segment if it was non-empty.  The
         * return value is ignored, so a tag mismatch recorded in err above is
         * what gets returned.
         */
        if (nbytes)
                skcipher_walk_done(&walk, 0);
        return err;
}

/*
 * DEFINE_GCM_ALGS() - generate the AES-GCM AEAD glue for one implementation
 * @suffix: identifier fragment appended to every generated symbol
 * @flags: flag bits passed through to gcm_setkey() and gcm_crypt()
 * @generic_driver_name: cra_driver_name of the "gcm(aes)" entry
 * @rfc_driver_name: cra_driver_name of the "rfc4106(gcm(aes))" entry
 * @ctxsize: cra_ctxsize of both entries
 * @priority: cra_priority of both entries
 *
 * Expands to six thin wrappers (setkey/encrypt/decrypt, once for plain GCM and
 * once for RFC4106) that forward to gcm_setkey()/gcm_crypt() with @flags,
 * OR-ing in FLAG_RFC4106 and/or FLAG_ENC as appropriate, followed by the
 * two-element array aes_gcm_algs_<suffix> describing both algorithms.  The
 * expansion ends without a semicolon; the invocation supplies it.
 */
#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name,   \
                        ctxsize, priority)                                       \
                                                                               \
static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key,     \
                               unsigned int keylen)                               \
{                                                                               \
        return gcm_setkey(tfm, raw_key, keylen, (flags));                       \
}                                                                               \
                                                                               \
static int gcm_encrypt_##suffix(struct aead_request *req)                       \
{                                                                               \
        return gcm_crypt(req, (flags) | FLAG_ENC);                               \
}                                                                               \
                                                                               \
static int gcm_decrypt_##suffix(struct aead_request *req)                       \
{                                                                               \
        return gcm_crypt(req, (flags));                                               \
}                                                                               \
                                                                               \
static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \
                                   unsigned int keylen)                               \
{                                                                               \
        return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106);       \
}                                                                               \
                                                                               \
static int rfc4106_encrypt_##suffix(struct aead_request *req)                       \
{                                                                               \
        return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC);               \
}                                                                               \
                                                                               \
static int rfc4106_decrypt_##suffix(struct aead_request *req)                       \
{                                                                               \
        return gcm_crypt(req, (flags) | FLAG_RFC4106);                               \
}                                                                               \
                                                                               \
static struct aead_alg aes_gcm_algs_##suffix[] = { {                               \
        .setkey                        = gcm_setkey_##suffix,                               \
        .setauthsize                = generic_gcmaes_set_authsize,                       \
        .encrypt                = gcm_encrypt_##suffix,                               \
        .decrypt                = gcm_decrypt_##suffix,                               \
        .ivsize                        = GCM_AES_IV_SIZE,                               \
        .chunksize                = AES_BLOCK_SIZE,                               \
        .maxauthsize                = 16,                                               \
        .base = {                                                               \
                .cra_name                = "gcm(aes)",                               \
                .cra_driver_name        = generic_driver_name,                       \
                .cra_priority                = (priority),                               \
                .cra_blocksize                = 1,                                       \
                .cra_ctxsize                = (ctxsize),                               \
                .cra_module                = THIS_MODULE,                               \
        },                                                                       \
}, {                                                                               \
        .setkey                        = rfc4106_setkey_##suffix,                       \
        .setauthsize                = common_rfc4106_set_authsize,                       \
        .encrypt                = rfc4106_encrypt_##suffix,                       \
        .decrypt                = rfc4106_decrypt_##suffix,                       \
        .ivsize                        = GCM_RFC4106_IV_SIZE,                               \
        .chunksize                = AES_BLOCK_SIZE,                               \
        .maxauthsize                = 16,                                               \
        .base = {                                                               \
                .cra_name                = "rfc4106(gcm(aes))",                       \
                .cra_driver_name        = rfc_driver_name,                       \
                .cra_priority                = (priority),                               \
                .cra_blocksize                = 1,                                       \
                .cra_ctxsize                = (ctxsize),                               \
                .cra_module                = THIS_MODULE,                               \
        },                                                                       \
} }

/*
 * Instantiate the GCM glue once per implementation.  Each invocation defines
 * the array named in the comment above it; priorities rise from 400 to 800.
 */

/* aes_gcm_algs_aesni: no implementation flags, priority 400 */
DEFINE_GCM_ALGS(aesni, /* no flags */ 0,
                "generic-gcm-aesni", "rfc4106-gcm-aesni",
                AES_GCM_KEY_AESNI_SIZE, 400);

/* aes_gcm_algs_aesni_avx: FLAG_AVX, priority 500 */
DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX,
                "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx",
                AES_GCM_KEY_AESNI_SIZE, 500);

/* aes_gcm_algs_vaes_avx10_256: FLAG_AVX10_256, priority 700 */
DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256,
                "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256",
                AES_GCM_KEY_AVX10_SIZE, 700);

/*
 * aes_gcm_algs_vaes_avx10_512: FLAG_AVX10_512, priority 800 (lowered to 1 at
 * registration time when X86_FEATURE_PREFER_YMM is set)
 */
DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
                "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512",
                AES_GCM_KEY_AVX10_SIZE, 800);

/*
 * Register the AVX-and-later skcipher and AEAD arrays that the boot CPU can
 * use.  Registration proceeds in tiers; the first tier whose feature checks
 * fail ends the function successfully with only the earlier tiers registered.
 *
 * Return: 0 on success (including when nothing was registered), otherwise the
 * error from the first failing crypto_register_*() call.  Arrays registered
 * before the failure are not unregistered here.
 */
static int __init register_avx_algs(void)
{
        int ret;
        int idx;

        /* Tier 1: AVX. */
        if (!boot_cpu_has(X86_FEATURE_AVX))
                return 0;

        ret = crypto_register_skciphers(skcipher_algs_aesni_avx,
                                        ARRAY_SIZE(skcipher_algs_aesni_avx));
        if (ret)
                return ret;

        ret = crypto_register_aeads(aes_gcm_algs_aesni_avx,
                                    ARRAY_SIZE(aes_gcm_algs_aesni_avx));
        if (ret)
                return ret;

        /*
         * Tier 2: VAES with AVX2.
         *
         * Not every algorithm registered from here on strictly needs
         * VPCLMULQDQ, but in practice every CPU with VAES also has
         * VPCLMULQDQ, and assembler support for both arrived at about the
         * same time.  For simplicity VAES and VPCLMULQDQ are always checked
         * together.
         */
        if (!(boot_cpu_has(X86_FEATURE_AVX2) &&
              boot_cpu_has(X86_FEATURE_VAES) &&
              boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&
              boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
              cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)))
                return 0;

        ret = crypto_register_skciphers(skcipher_algs_vaes_avx2,
                                        ARRAY_SIZE(skcipher_algs_vaes_avx2));
        if (ret)
                return ret;

        /* Tier 3: AVX512BW + AVX512VL + BMI2 with AVX-512 state enabled. */
        if (!(boot_cpu_has(X86_FEATURE_AVX512BW) &&
              boot_cpu_has(X86_FEATURE_AVX512VL) &&
              boot_cpu_has(X86_FEATURE_BMI2) &&
              cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
                                XFEATURE_MASK_AVX512, NULL)))
                return 0;

        ret = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256,
                                    ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256));
        if (ret)
                return ret;

        /*
         * On CPUs flagged PREFER_YMM, drop the 512-bit variants to priority 1
         * before registering them.
         */
        if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
                for (idx = 0; idx < ARRAY_SIZE(skcipher_algs_vaes_avx512); idx++)
                        skcipher_algs_vaes_avx512[idx].base.cra_priority = 1;
                for (idx = 0; idx < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); idx++)
                        aes_gcm_algs_vaes_avx10_512[idx].base.cra_priority = 1;
        }

        ret = crypto_register_skciphers(skcipher_algs_vaes_avx512,
                                        ARRAY_SIZE(skcipher_algs_vaes_avx512));
        if (ret)
                return ret;

        return crypto_register_aeads(aes_gcm_algs_vaes_avx10_512,
                                     ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512));
}

/*
 * Unregister an algorithm array only when the refcount of its first element
 * is nonzero, i.e. skip arrays that are not currently registered.
 *
 * Both helpers are wrapped in do { } while (0) so that each expands to exactly
 * one statement.  A bare "if" expansion would silently capture a following
 * "else" when the macro is used as the body of an unbraced if/else.
 *
 * @A must be a true array (not a pointer) because ARRAY_SIZE() is applied to
 * it.
 */
#define unregister_skciphers(A)                                                \
        do {                                                                   \
                if (refcount_read(&(A)[0].base.cra_refcnt) != 0)               \
                        crypto_unregister_skciphers((A), ARRAY_SIZE(A));       \
        } while (0)
#define unregister_aeads(A)                                                    \
        do {                                                                   \
                if (refcount_read(&(A)[0].base.cra_refcnt) != 0)               \
                        crypto_unregister_aeads((A), ARRAY_SIZE(A));           \
        } while (0)

/*
 * Tear down whichever AVX-and-later algorithm arrays are currently
 * registered.  Each helper below checks the refcount of the array's first
 * element and does nothing when it is zero, so this is also used by
 * aesni_init() to clean up after register_avx_algs() fails part-way.
 */
static void unregister_avx_algs(void)
{
        unregister_skciphers(skcipher_algs_aesni_avx);
        unregister_aeads(aes_gcm_algs_aesni_avx);
        unregister_skciphers(skcipher_algs_vaes_avx2);
        unregister_skciphers(skcipher_algs_vaes_avx512);
        unregister_aeads(aes_gcm_algs_vaes_avx10_256);
        unregister_aeads(aes_gcm_algs_vaes_avx10_512);
}
#else /* CONFIG_X86_64 */
/*
 * Zero-length placeholder for builds without CONFIG_X86_64: aesni_init() and
 * aesni_exit() pass this array, with ARRAY_SIZE(), to the AEAD registration
 * and unregistration calls unconditionally.
 */
static struct aead_alg aes_gcm_algs_aesni[0];

/* Stub: no AVX-and-later algorithms exist in this configuration. */
static int __init register_avx_algs(void)
{
        return 0;
}

/* Stub: nothing was registered, so there is nothing to unregister. */
static void unregister_avx_algs(void)
{
}
#endif /* !CONFIG_X86_64 */

/*
 * CPU match table with a single entry for X86_FEATURE_AES, terminated by an
 * empty entry.  aesni_init() refuses to load unless x86_match_cpu() finds a
 * match in it.
 * NOTE(review): the MODULE_DEVICE_TABLE() export presumably enables module
 * autoloading on matching CPUs - confirm.
 */
static const struct x86_cpu_id aesni_cpu_id[] = {
        X86_MATCH_FEATURE(X86_FEATURE_AES, NULL),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);

/*
 * Module entry point.
 *
 * Fails with -ENODEV when the CPU does not match aesni_cpu_id.  Otherwise
 * registers, in order: aesni_cipher_alg, the aesni_skciphers array, the
 * aes_gcm_algs_aesni array, and finally the AVX-and-later algorithms.  If any
 * step fails, everything registered by the earlier steps is unregistered in
 * reverse order and that step's error code is returned.
 */
static int __init aesni_init(void)
{
        int ret;

        if (!x86_match_cpu(aesni_cpu_id))
                return -ENODEV;

        ret = crypto_register_alg(&aesni_cipher_alg);
        if (ret)
                return ret;

        ret = crypto_register_skciphers(aesni_skciphers,
                                        ARRAY_SIZE(aesni_skciphers));
        if (ret)
                goto out_cipher;

        ret = crypto_register_aeads(aes_gcm_algs_aesni,
                                    ARRAY_SIZE(aes_gcm_algs_aesni));
        if (ret)
                goto out_skciphers;

        ret = register_avx_algs();
        if (!ret)
                return 0;

        /* register_avx_algs() may have registered some arrays before failing. */
        unregister_avx_algs();
        crypto_unregister_aeads(aes_gcm_algs_aesni,
                                ARRAY_SIZE(aes_gcm_algs_aesni));
out_skciphers:
        crypto_unregister_skciphers(aesni_skciphers,
                                    ARRAY_SIZE(aesni_skciphers));
out_cipher:
        crypto_unregister_alg(&aesni_cipher_alg);
        return ret;
}

/*
 * Module exit point: unregister everything aesni_init() registered.  The
 * three unconditional registrations are undone first, then
 * unregister_avx_algs() removes whichever AVX-and-later arrays are present.
 */
static void __exit aesni_exit(void)
{
        crypto_unregister_aeads(aes_gcm_algs_aesni,
                                ARRAY_SIZE(aes_gcm_algs_aesni));
        crypto_unregister_skciphers(aesni_skciphers,
                                    ARRAY_SIZE(aesni_skciphers));
        crypto_unregister_alg(&aesni_cipher_alg);
        unregister_avx_algs();
}

/* Module load/unload hooks. */
module_init(aesni_init);
module_exit(aesni_exit);

/* Module metadata. */
MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("aes");




















































































































    3 




































































































    3 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2002 David S. Miller (davem@redhat.com)
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 *
 * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
 * and Nettle, by Niels Möller.
 */

#ifndef _CRYPTO_INTERNAL_CIPHER_H
#define _CRYPTO_INTERNAL_CIPHER_H

#include <crypto/algapi.h>

/*
 * Handle for a single block cipher.  It wraps the generic struct crypto_tfm
 * as its only member, which is what lets __crypto_cipher_cast() and
 * crypto_cipher_tfm() convert between the two pointer types.
 */
struct crypto_cipher {
        struct crypto_tfm base;
};

/**
 * DOC: Single Block Cipher API
 *
 * The single block cipher API is used with the ciphers of type
 * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto).
 *
 * Using the single block cipher API calls, operations with the basic cipher
 * primitive can be implemented. These cipher primitives exclude any block
 * chaining operations including IV handling.
 *
 * The purpose of this single block cipher API is to support the implementation
 * of templates or other concepts that only need to perform the cipher operation
 * on one block at a time. Templates invoke the underlying cipher primitive
 * block-wise and process either the input or the output data of these cipher
 * operations.
 */

/*
 * Reinterpret a generic transform pointer as a cipher handle.  The pointer
 * value is passed through unchanged and is not inspected or dereferenced.
 */
static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm)
{
        struct crypto_cipher *cipher = (struct crypto_cipher *)tfm;

        return cipher;
}

/**
 * crypto_alloc_cipher() - allocate single block cipher handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *             single block cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a handle for a single block cipher. The algorithm-type bits of
 * @type are replaced with CRYPTO_ALG_TYPE_CIPHER and CRYPTO_ALG_TYPE_MASK is
 * added to @mask before the lookup. The returned struct crypto_cipher is the
 * handle required by every subsequent API call for that cipher.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name,
                                                        u32 type, u32 mask)
{
        u32 cipher_type = (type & ~CRYPTO_ALG_TYPE_MASK) | CRYPTO_ALG_TYPE_CIPHER;
        u32 cipher_mask = mask | CRYPTO_ALG_TYPE_MASK;
        struct crypto_tfm *tfm;

        tfm = crypto_alloc_base(alg_name, cipher_type, cipher_mask);

        return __crypto_cipher_cast(tfm);
}

/* Return the generic transform embedded in the cipher handle @tfm. */
static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/**
 * crypto_free_cipher() - zeroize and free the single block cipher handle
 * @tfm: cipher handle to be freed
 *
 * Hands the embedded generic transform to crypto_free_tfm().
 */
static inline void crypto_free_cipher(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        crypto_free_tfm(base);
}

/**
 * crypto_has_cipher() - Search for the availability of a single block cipher
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *             single block cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * The algorithm-type bits of @type are replaced with CRYPTO_ALG_TYPE_CIPHER
 * and CRYPTO_ALG_TYPE_MASK is added to @mask before querying crypto_has_alg().
 *
 * Return: true when the single block cipher is known to the kernel crypto API;
 *           false otherwise
 */
static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask)
{
        u32 cipher_type = (type & ~CRYPTO_ALG_TYPE_MASK) | CRYPTO_ALG_TYPE_CIPHER;
        u32 cipher_mask = mask | CRYPTO_ALG_TYPE_MASK;

        return crypto_has_alg(alg_name, cipher_type, cipher_mask);
}

/**
 * crypto_cipher_blocksize() - obtain block size for cipher
 * @tfm: cipher handle
 *
 * Returns the block size of the single block cipher referenced by @tfm. The
 * caller may use it to size the buffers handed to the encryption or
 * decryption operation.
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        return crypto_tfm_alg_blocksize(base);
}

/* Return the alignment mask reported for the algorithm behind @tfm. */
static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        return crypto_tfm_alg_alignmask(base);
}

/* Return the flags of the generic transform embedded in @tfm. */
static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/* Set @flags on the generic transform embedded in @tfm. */
static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm,
                                           u32 flags)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/* Clear @flags on the generic transform embedded in @tfm. */
static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm,
                                             u32 flags)
{
        struct crypto_tfm *base = crypto_cipher_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_cipher_setkey() - set key for cipher
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller-provided key is set for the single block cipher referenced by
 * the cipher handle.
 *
 * Note that the key length determines the cipher variant. Many block ciphers
 * select a different variant depending on the key size, such as AES-128 vs.
 * AES-192 vs. AES-256. When a 16 byte key is provided for an AES cipher
 * handle, AES-128 is performed.
 *
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_cipher_setkey(struct crypto_cipher *tfm,
                         const u8 *key, unsigned int keylen);

/**
 * crypto_cipher_encrypt_one() - encrypt one block of plaintext
 * @tfm: cipher handle
 * @dst: points to the buffer that will be filled with the ciphertext
 * @src: buffer holding the plaintext to be encrypted
 *
 * Invoke the encryption operation of one block. The caller must ensure that
 * the plaintext and ciphertext buffers are at least one block in size (see
 * crypto_cipher_blocksize()).
 */
void crypto_cipher_encrypt_one(struct crypto_cipher *tfm,
                               u8 *dst, const u8 *src);

/**
 * crypto_cipher_decrypt_one() - decrypt one block of ciphertext
 * @tfm: cipher handle
 * @dst: points to the buffer that will be filled with the plaintext
 * @src: buffer holding the ciphertext to be decrypted
 *
 * Invoke the decryption operation of one block. The caller must ensure that
 * the plaintext and ciphertext buffers are at least one block in size (see
 * crypto_cipher_blocksize()).
 */
void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
                               u8 *dst, const u8 *src);

/*
 * Takes an existing cipher handle and returns a cipher handle.
 * NOTE(review): presumably the result is a new, independent handle for the
 * same algorithm, with failures reported as an ERR_PTR() - the definition is
 * not in this header; confirm before relying on either.
 */
struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher);

/*
 * Cipher-typed wrapper around the generic struct crypto_spawn.  The helpers
 * that follow (crypto_grab_cipher(), crypto_drop_cipher(),
 * crypto_spawn_cipher_alg(), crypto_spawn_cipher()) all operate on the
 * embedded @base.
 */
struct crypto_cipher_spawn {
        struct crypto_spawn base;
};

/*
 * Grab the single block cipher called @name for @spawn on behalf of @inst.
 * The algorithm-type bits of @type are replaced with CRYPTO_ALG_TYPE_CIPHER
 * and CRYPTO_ALG_TYPE_MASK is added to @mask before the request is forwarded
 * to crypto_grab_spawn(), whose return value is passed back unchanged.
 */
static inline int crypto_grab_cipher(struct crypto_cipher_spawn *spawn,
                                     struct crypto_instance *inst,
                                     const char *name, u32 type, u32 mask)
{
        u32 cipher_type = (type & ~CRYPTO_ALG_TYPE_MASK) | CRYPTO_ALG_TYPE_CIPHER;
        u32 cipher_mask = mask | CRYPTO_ALG_TYPE_MASK;

        return crypto_grab_spawn(&spawn->base, inst, name,
                                 cipher_type, cipher_mask);
}

/* Drop the generic spawn embedded in @spawn via crypto_drop_spawn(). */
static inline void crypto_drop_cipher(struct crypto_cipher_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        crypto_drop_spawn(base);
}

static inline struct crypto_alg *crypto_spawn_cipher_alg(
       struct crypto_cipher_spawn *spawn)
{
        return spawn->base.alg;
}

/*
 * Create a transform from @spawn via crypto_spawn_tfm(), requesting type
 * CRYPTO_ALG_TYPE_CIPHER under CRYPTO_ALG_TYPE_MASK, and return it as a
 * cipher handle.  The result of crypto_spawn_tfm() is cast without being
 * inspected.
 */
static inline struct crypto_cipher *crypto_spawn_cipher(
        struct crypto_cipher_spawn *spawn)
{
        struct crypto_tfm *tfm;

        tfm = crypto_spawn_tfm(&spawn->base, CRYPTO_ALG_TYPE_CIPHER,
                               CRYPTO_ALG_TYPE_MASK);

        return __crypto_cipher_cast(tfm);
}

static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm)
{
        return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher;
}

#endif









































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H

#ifdef CONFIG_RSEQ

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Guard class handed to scoped_guard() in rseq_signal_deliver() while it
 * updates rseq_event_mask: "irq" when CONFIG_MEMBARRIER is enabled, "preempt"
 * otherwise.
 * NOTE(review): presumably the membarrier code can touch the event mask from
 * interrupt context, hence the stronger guard - confirm.
 */
#ifdef CONFIG_MEMBARRIER
# define RSEQ_EVENT_GUARD        irq
#else
# define RSEQ_EVENT_GUARD        preempt
#endif

/*
 * Bit numbers of the per-task rseq event mask.  Each one is defined as the
 * matching RSEQ_CS_FLAG_NO_RESTART_ON_*_BIT of the user-space ABI enum
 * rseq_cs_flags, so the event mask can be checked directly against those
 * flags.
 */
enum rseq_event_mask_bits {
        RSEQ_EVENT_PREEMPT_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
        RSEQ_EVENT_SIGNAL_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
        RSEQ_EVENT_MIGRATE_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
};

/* Mask values (1U << bit) corresponding to enum rseq_event_mask_bits. */
enum rseq_event_mask {
        RSEQ_EVENT_PREEMPT        = (1U << RSEQ_EVENT_PREEMPT_BIT),
        RSEQ_EVENT_SIGNAL        = (1U << RSEQ_EVENT_SIGNAL_BIT),
        RSEQ_EVENT_MIGRATE        = (1U << RSEQ_EVENT_MIGRATE_BIT),
};

/*
 * Set TIF_NOTIFY_RESUME on @t, but only when @t has an rseq area registered
 * (t->rseq is non-NULL).
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
        if (!t->rseq)
                return;

        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}

/*
 * Out-of-line part of rseq_handle_notify_resume(); that wrapper calls it only
 * for tasks with a registered rseq area.
 */
void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);

/*
 * Inline fast path: do nothing unless the current task has an rseq area
 * registered, otherwise hand @ksig and @regs to
 * __rseq_handle_notify_resume().
 */
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
        if (!current->rseq)
                return;

        __rseq_handle_notify_resume(ksig, regs);
}

static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
        scoped_guard(RSEQ_EVENT_GUARD)
                __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
        rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a preemption event for @t and request notify-resume processing.
 * The event bit is set with the non-atomic __set_bit(), so the caller must
 * have preemption disabled.
 */
static inline void rseq_preempt(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Record a migration event for @t and request notify-resume processing.
 * The event bit is set with the non-atomic __set_bit(), so the caller must
 * have preemption disabled.
 */
static inline void rseq_migrate(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Initialise the rseq state of the new task @t.
 *
 * Without CLONE_VM the child inherits the parent's (current's) registration:
 * area pointer, length, signature and pending event mask.  With CLONE_VM the
 * child starts out unregistered.
 */
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
        if (!(clone_flags & CLONE_VM)) {
                /* Copy the parent's registration verbatim. */
                t->rseq = current->rseq;
                t->rseq_len = current->rseq_len;
                t->rseq_sig = current->rseq_sig;
                t->rseq_event_mask = current->rseq_event_mask;
                return;
        }

        /* CLONE_VM: no registration is carried over. */
        t->rseq = NULL;
        t->rseq_len = 0;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

/*
 * Drop the rseq registration of @t: clear the area pointer, its length, the
 * signature and any pending event bits.
 */
static inline void rseq_execve(struct task_struct *t)
{
        t->rseq = NULL;
        t->rseq_len = 0;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

#else

/*
 * CONFIG_RSEQ is disabled: provide empty inline stubs with the same
 * signatures so that callers need no #ifdefs.
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
}
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
}
static inline void rseq_preempt(struct task_struct *t)
{
}
static inline void rseq_migrate(struct task_struct *t)
{
}
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
{
}

#endif

/*
 * rseq_syscall() has an out-of-line definition only when CONFIG_DEBUG_RSEQ is
 * enabled; otherwise it is an empty inline stub.
 */
#ifdef CONFIG_DEBUG_RSEQ

void rseq_syscall(struct pt_regs *regs);

#else

static inline void rseq_syscall(struct pt_regs *regs)
{
}

#endif

#endif /* _LINUX_RSEQ_H */






























































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 


    4 



























































































































































































































































































































































  318 

  319 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 






    4 





    4 





























































































    4 





















































































    4 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/pgalloc_tag.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
#include <linux/compiler.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/memremap.h>
#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/rcuwait.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>

struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct user_struct;
struct pt_regs;
struct folio_batch;

void arch_mm_preinit(void);
void mm_core_init(void);
void init_mm_internals(void);

extern atomic_long_t _totalram_pages;
/*
 * Return the current value of the global _totalram_pages counter.
 *
 * The counter is an atomic_long_t; the value read from it is cast to
 * unsigned long for the caller.
 */
static inline unsigned long totalram_pages(void)
{
        return (unsigned long)atomic_long_read(&_totalram_pages);
}

/* Increment the global _totalram_pages counter by one via atomic_long_inc(). */
static inline void totalram_pages_inc(void)
{
        atomic_long_inc(&_totalram_pages);
}

/* Decrement the global _totalram_pages counter by one via atomic_long_dec(). */
static inline void totalram_pages_dec(void)
{
        atomic_long_dec(&_totalram_pages);
}

/*
 * Add @count to the global _totalram_pages counter via atomic_long_add().
 *
 * @count is a signed long, so a negative value lowers the counter.
 */
static inline void totalram_pages_add(long count)
{
        atomic_long_add(count, &_totalram_pages);
}

extern void * high_memory;

/*
 * Convert between a page count and a size in MB.
 *
 * 20 is the shift for 1MB (2^20 bytes = 1MB) and PAGE_SHIFT is the shift for
 * the page size (e.g., 12 for 4KB pages), so shifting by (20 - PAGE_SHIFT)
 * converts between pages and MB.
 *
 * PAGES_TO_MB() shifts right, so a page count that is not a whole number of
 * MB is rounded down. MB_TO_PAGES() shifts left in the (promoted) type of its
 * argument and performs no overflow check. Both expressions rely on
 * PAGE_SHIFT being no larger than 20.
 */
#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT))
#define MB_TO_PAGES(mb)    ((mb) << (20 - PAGE_SHIFT))

#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern int mmap_rnd_bits_max __ro_after_init;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
extern const int mmap_rnd_compat_bits_min;
extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif

/*
 * Fallback definition of DIRECT_MAP_PHYSMEM_END, used only when it has not
 * been defined before this point. When MAX_PHYSMEM_BITS is available the
 * value has exactly that many low bits set; otherwise it is the all-ones
 * phys_addr_t value with bit 63 cleared.
 */
#ifndef DIRECT_MAP_PHYSMEM_END
# ifdef MAX_PHYSMEM_BITS
# define DIRECT_MAP_PHYSMEM_END        ((1ULL << MAX_PHYSMEM_BITS) - 1)
# else
# define DIRECT_MAP_PHYSMEM_END        (((phys_addr_t)-1)&~(1ULL<<63))
# endif
#endif

#include <asm/page.h>
#include <asm/processor.h>

/*
 * Generic fallbacks: each macro below is defined here only when nothing
 * included earlier has already provided it.
 */

/* __pa() of x, with the address first passed through RELOC_HIDE(). */
#ifndef __pa_symbol
#define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif

/* page -> pfn -> physical address (PFN_PHYS) -> virtual address (__va). */
#ifndef page_to_virt
#define page_to_virt(x)        __va(PFN_PHYS(page_to_pfn(x)))
#endif

/* __va() of the physical address that __pa_symbol() reports for x. */
#ifndef lm_alias
#define lm_alias(x)        __va(__pa_symbol(x))
#endif

/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This macro should be defined within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)        (0)
#endif

/*
 * On some architectures it is expensive to call memset() for small sizes.
 * If an architecture decides to implement their own version of
 * mm_zero_struct_page they should wrap the defines below in a #ifndef and
 * define their own version of this macro in <asm/pgtable.h>
 */
#if BITS_PER_LONG == 64
/* This function must be updated when the size of struct page grows above 96
 * or reduces below 56. The idea is that the compiler optimizes out the
 * switch() statement and leaves only move/store instructions. Also, the
 * compiler can combine write statements if they are both assignments and can
 * be reordered; this can result in several of the writes here being dropped.
 */
#define        mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
/*
 * Zero a struct page using one unsigned long store per word.
 *
 * sizeof(struct page) is a compile-time constant, so the switch below has a
 * single entry point; from there control falls through every smaller case,
 * clearing words from the highest index down to word 0.
 */
static inline void __mm_zero_struct_page(struct page *page)
{
        /* View the page as an array of unsigned long words. */
        unsigned long *_pp = (void *)page;

         /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */
        BUILD_BUG_ON(sizeof(struct page) & 7);
        BUILD_BUG_ON(sizeof(struct page) < 56);
        BUILD_BUG_ON(sizeof(struct page) > 96);

        /* Each case clears the last word of that size, then falls through. */
        switch (sizeof(struct page)) {
        case 96:
                _pp[11] = 0;
                fallthrough;
        case 88:
                _pp[10] = 0;
                fallthrough;
        case 80:
                _pp[9] = 0;
                fallthrough;
        case 72:
                _pp[8] = 0;
                fallthrough;
        case 64:
                _pp[7] = 0;
                fallthrough;
        case 56:
                /* Words 0..6 are present for every permitted size. */
                _pp[6] = 0;
                _pp[5] = 0;
                _pp[4] = 0;
                _pp[3] = 0;
                _pp[2] = 0;
                _pp[1] = 0;
                _pp[0] = 0;
        }
}
#else
#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
#endif

/*
 * Default maximum number of active map areas; this limits the number of vmas
 * per mm struct. Users can override this number via sysctl, but there is a
 * problem.
 *
 * When a program's coredump is generated in ELF format, one section is
 * created per vma. In ELF, the number of sections is stored in an unsigned
 * short, which means the number of sections should be smaller than 65535 at
 * coredump time. Because the kernel adds some informative sections to the
 * image of the program when generating a coredump, we need some margin. The
 * number of extra sections is currently 1-3 and depends on the arch. We use
 * "5" as a safe margin here.
 *
 * ELF extended numbering allows more than 65535 sections, so the 16-bit bound
 * is no longer a hard limit, although some userspace tools can be surprised
 * by that.
 */
#define MAPCOUNT_ELF_CORE_MARGIN        (5)
#define DEFAULT_MAX_MAP_COUNT        (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

extern int sysctl_max_map_count;

extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;

/*
 * page_range_contiguous() is only declared as an out-of-line function for
 * CONFIG_SPARSEMEM without CONFIG_SPARSEMEM_VMEMMAP. In every other
 * configuration the inline stub below ignores its arguments and always
 * returns true.
 */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
bool page_range_contiguous(const struct page *page, unsigned long nr_pages);
#else
static inline bool page_range_contiguous(const struct page *page,
                unsigned long nr_pages)
{
        return true;
}
#endif

/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/* to align the pointer to the (prev) page boundary */
#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)

/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr)        IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)

/**
 * folio_page_idx - Return the number of a page in a folio.
 * @folio: The folio.
 * @page: The folio page.
 *
 * This function expects that the page is actually part of the folio.
 * The returned number is relative to the start of the folio.
 */
static inline unsigned long folio_page_idx(const struct folio *folio,
                const struct page *page)
{
        return page - &folio->page;
}

/*
 * Return the folio whose lru list node is the entry immediately before
 * @head, i.e. the one that head->prev points at.
 */
static inline struct folio *lru_to_folio(struct list_head *head)
{
        struct list_head *last = head->prev;

        return list_entry(last, struct folio, lru);
}

void setup_initial_init_mm(void *start_code, void *end_code,
                           void *end_data, void *brk);

/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);

#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 */
#define VM_NONE                0x00000000

#define VM_READ                0x00000001        /* currently active flags */
#define VM_WRITE        0x00000002
#define VM_EXEC                0x00000004
#define VM_SHARED        0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD        0x00000010        /* limits for mprotect() etc */
#define VM_MAYWRITE        0x00000020
#define VM_MAYEXEC        0x00000040
#define VM_MAYSHARE        0x00000080

#define VM_GROWSDOWN        0x00000100        /* general info on the segment */
#ifdef CONFIG_MMU
#define VM_UFFD_MISSING        0x00000200        /* missing pages tracking */
#else /* CONFIG_MMU */
#define VM_MAYOVERLAY        0x00000200        /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
#define VM_UFFD_MISSING        0
#endif /* CONFIG_MMU */
#define VM_PFNMAP        0x00000400        /* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP        0x00001000        /* wrprotect pages tracking */

#define VM_LOCKED        0x00002000
#define VM_IO           0x00004000        /* Memory mapped I/O or similar */

                                        /* Used by sys_madvise() */
#define VM_SEQ_READ        0x00008000        /* App will access data sequentially */
#define VM_RAND_READ        0x00010000        /* App will not benefit from clustered reads */

#define VM_DONTCOPY        0x00020000      /* Do not copy this vma on fork */
#define VM_DONTEXPAND        0x00040000        /* Cannot expand with mremap() */
#define VM_LOCKONFAULT        0x00080000        /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT        0x00100000        /* Is a VM accounted object */
#define VM_NORESERVE        0x00200000        /* should the VM suppress accounting */
#define VM_HUGETLB        0x00400000        /* Huge TLB Page VM */
#define VM_SYNC                0x00800000        /* Synchronous page faults */
#define VM_ARCH_1        0x01000000        /* Architecture-specific flag */
#define VM_WIPEONFORK        0x02000000        /* Wipe VMA contents in child. */
#define VM_DONTDUMP        0x04000000        /* Do not include in the core dump */

#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY        0x08000000        /* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY        0
#endif

#define VM_MIXEDMAP        0x10000000        /* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE        0x20000000        /* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE        0x40000000        /* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE        BIT(31)                /* KSM may merge identical pages */

/*
 * Flag bits 32-38, available only with CONFIG_ARCH_USES_HIGH_VMA_FLAGS.
 * The VM_HIGH_ARCH_BIT_n macros are bit numbers; the VM_HIGH_ARCH_n macros
 * are the matching single-bit masks built with BIT().
 */
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0        32        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1        33        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2        34        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3        35        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4        36        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5        37        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_6        38        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0        BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1        BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2        BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3        BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4        BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5        BIT(VM_HIGH_ARCH_BIT_5)
#define VM_HIGH_ARCH_6        BIT(VM_HIGH_ARCH_BIT_6)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

/*
 * With CONFIG_ARCH_HAS_PKEYS the pkey bits are mapped onto the high arch
 * flags, starting at bit VM_PKEY_SHIFT (VM_HIGH_ARCH_BIT_0). Bits 0-2 are
 * always mapped; VM_PKEY_BIT3 and VM_PKEY_BIT4 are real bits only when
 * CONFIG_ARCH_PKEY_BITS is greater than 3 and 4 respectively, and are 0
 * otherwise.
 */
#ifdef CONFIG_ARCH_HAS_PKEYS
# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0  VM_HIGH_ARCH_0
# define VM_PKEY_BIT1  VM_HIGH_ARCH_1
# define VM_PKEY_BIT2  VM_HIGH_ARCH_2
#if CONFIG_ARCH_PKEY_BITS > 3
# define VM_PKEY_BIT3  VM_HIGH_ARCH_3
#else
# define VM_PKEY_BIT3  0
#endif
#if CONFIG_ARCH_PKEY_BITS > 4
# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4  0
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */

#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * VM_SHADOW_STACK should not be set with VM_SHARED because of a lack of
 * support in core mm.
 *
 * These VMAs will get a single end guard page. This helps userspace protect
 * itself from attacks. A single page is enough for current shadow stack archs
 * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
 * for more details on the guard size.
 */
# define VM_SHADOW_STACK        VM_HIGH_ARCH_5
#endif

#if defined(CONFIG_ARM64_GCS)
/*
 * arm64's Guarded Control Stack implements similar functionality and
 * has similar constraints to shadow stacks.
 */
# define VM_SHADOW_STACK        VM_HIGH_ARCH_6
#endif

/* Neither option above is enabled: the flag is VM_NONE and uses no bit. */
#ifndef VM_SHADOW_STACK
# define VM_SHADOW_STACK        VM_NONE
#endif

/*
 * VM_ARCH_1 gets a configuration-specific name. The branches form a single
 * #if/#elif chain, so only the first matching one takes effect. Two of them
 * (SPARC64 and ARM64) also define VM_ARCH_CLEAR as that flag.
 */
#if defined(CONFIG_PPC64)
# define VM_SAO                VM_ARCH_1        /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP        VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI        VM_ARCH_1        /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR        VM_SPARC_ADI
#elif defined(CONFIG_ARM64)
# define VM_ARM64_BTI        VM_ARCH_1        /* BTI guarded page, a.k.a. GP bit */
# define VM_ARCH_CLEAR        VM_ARM64_BTI
#elif !defined(CONFIG_MMU)
# define VM_MAPPED_COPY        VM_ARCH_1        /* T if mapped copy of data (nommu mmap) */
#endif

/* The MTE flags are real bits only with CONFIG_ARM64_MTE, else VM_NONE. */
#if defined(CONFIG_ARM64_MTE)
# define VM_MTE                VM_HIGH_ARCH_4        /* Use Tagged memory for access control */
# define VM_MTE_ALLOWED        VM_HIGH_ARCH_5        /* Tagged memory permitted */
#else
# define VM_MTE                VM_NONE
# define VM_MTE_ALLOWED        VM_NONE
#endif

/* VM_GROWSUP is VM_NONE unless the CONFIG_PARISC branch above defined it. */
#ifndef VM_GROWSUP
# define VM_GROWSUP        VM_NONE
#endif

/*
 * VM_UFFD_MINOR is bit 41 with CONFIG_HAVE_ARCH_USERFAULTFD_MINOR and
 * VM_NONE otherwise; VM_UFFD_MINOR_BIT exists only in the former case.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
# define VM_UFFD_MINOR_BIT        41
# define VM_UFFD_MINOR                BIT(VM_UFFD_MINOR_BIT)        /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR                VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */

/*
 * This flag is used to connect VFIO to arch-specific KVM code. It
 * indicates that the memory under this VMA is safe for use with any
 * non-cacheable memory type inside KVM. Some VFIO devices, on some
 * platforms, are thought to be unsafe and can cause machine crashes
 * if KVM does not lock down the memory type.
 *
 * The flag is bit 39 with CONFIG_64BIT and VM_NONE otherwise.
 */
#ifdef CONFIG_64BIT
#define VM_ALLOW_ANY_UNCACHED_BIT        39
#define VM_ALLOW_ANY_UNCACHED                BIT(VM_ALLOW_ANY_UNCACHED_BIT)
#else
#define VM_ALLOW_ANY_UNCACHED                VM_NONE
#endif

/*
 * VM_DROPPABLE is bit 40 with CONFIG_64BIT. Without it, CONFIG_PPC32 reuses
 * VM_ARCH_1 for the flag and every other configuration gets VM_NONE.
 */
#ifdef CONFIG_64BIT
#define VM_DROPPABLE_BIT        40
#define VM_DROPPABLE                BIT(VM_DROPPABLE_BIT)
#elif defined(CONFIG_PPC32)
#define VM_DROPPABLE                VM_ARCH_1
#else
#define VM_DROPPABLE                VM_NONE
#endif

/* VM_SEALED is bit 42 with CONFIG_64BIT and VM_NONE otherwise. */
#ifdef CONFIG_64BIT
#define VM_SEALED_BIT        42
#define VM_SEALED        BIT(VM_SEALED_BIT)
#else
#define VM_SEALED        VM_NONE
#endif

/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)

#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)

/* Common data flag combinations */
#define VM_DATA_FLAGS_TSK_EXEC        (VM_READ | VM_WRITE | TASK_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_NON_EXEC        (VM_READ | VM_WRITE | VM_MAYREAD | \
                                 VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_EXEC        (VM_READ | VM_WRITE | VM_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

/* Falls back to the always-executable combination when no override exists. */
#ifndef VM_DATA_DEFAULT_FLAGS                /* arch can override this */
#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
#endif

/* Stack mappings default to the same flags as data mappings. */
#ifndef VM_STACK_DEFAULT_FLAGS                /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

/*
 * Mask of VM_GROWSDOWN and VM_SHADOW_STACK.
 * NOTE(review): presumably the VMA types that need a gap below their start
 * address - confirm against the users of this mask.
 */
#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)

/*
 * VM_STACK is the growth-direction flag of the final stack.  With
 * CONFIG_STACK_GROWSUP, VM_STACK_EARLY is VM_GROWSDOWN; otherwise it is 0
 * and contributes nothing to VM_STACK_INCOMPLETE_SETUP.
 */
#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK        VM_GROWSUP
#define VM_STACK_EARLY        VM_GROWSDOWN
#else
#define VM_STACK        VM_GROWSDOWN
#define VM_STACK_EARLY        0
#endif

/* Growth direction + default stack permissions + VM_ACCOUNT */
#define VM_STACK_FLAGS        (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)

/* VMA basic access permission flags: read, write and execute */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)


/*
 * Special vmas that are non-mergeable, non-mlock()able.
 */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/*
 * A VMA with any of these bits set is not scanned by khugepaged:
 * everything in VM_SPECIAL, plus VM_HUGETLB.
 */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)

/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK        VM_NOHUGEPAGE

/*
 * This mask represents all the VMA flag bits used by mlock:
 * VM_LOCKED and VM_LOCKONFAULT.
 */
#define VM_LOCKED_MASK        (VM_LOCKED | VM_LOCKONFAULT)

/*
 * Arch-specific flags to clear when updating VM flags on protection change.
 * An architecture may predefine VM_ARCH_CLEAR; it defaults to VM_NONE.
 * VM_FLAGS_CLEAR combines it with the protection-key flags.
 */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR        VM_NONE
#endif
#define VM_FLAGS_CLEAR        (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

/*
 * mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask.
 */

/*
 * The default fault flags that should be used by most of the
 * arch-specific page fault handlers: the fault may be retried, and waiting
 * is both killable and interruptible.
 */
#define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
                             FAULT_FLAG_KILLABLE | \
                             FAULT_FLAG_INTERRUPTIBLE)

/**
 * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
 * @flags: Fault flags.
 *
 * This is mostly used for places where we want to try to avoid taking
 * the mmap_lock for too long a time when waiting for another condition
 * to change, in which case we can try to be polite to release the
 * mmap_lock in the first round to avoid potential starvation of other
 * processes that would also want the mmap_lock.
 *
 * Return: true if the page fault allows retry and this is the first
 * attempt of the fault handling; false otherwise.
 */
static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{
        return (flags & FAULT_FLAG_ALLOW_RETRY) &&
            (!(flags & FAULT_FLAG_TRIED));
}

/*
 * Comma-separated list of { flag, "name" } initializers, one per
 * FAULT_FLAG_* bit listed below.
 * NOTE(review): presumably consumed by tracepoint flag printing - confirm
 * against the users of this macro.
 */
#define FAULT_FLAG_TRACE \
        { FAULT_FLAG_WRITE,                "WRITE" }, \
        { FAULT_FLAG_MKWRITE,                "MKWRITE" }, \
        { FAULT_FLAG_ALLOW_RETRY,        "ALLOW_RETRY" }, \
        { FAULT_FLAG_RETRY_NOWAIT,        "RETRY_NOWAIT" }, \
        { FAULT_FLAG_KILLABLE,                "KILLABLE" }, \
        { FAULT_FLAG_TRIED,                "TRIED" }, \
        { FAULT_FLAG_USER,                "USER" }, \
        { FAULT_FLAG_REMOTE,                "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,        "INSTRUCTION" }, \
        { FAULT_FLAG_INTERRUPTIBLE,        "INTERRUPTIBLE" }, \
        { FAULT_FLAG_VMA_LOCK,                "VMA_LOCK" }

/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * MM layer fills up gfp_mask for page allocations but fault handler might
 * alter it if its implementation requires a different allocation context.
 *
 * pgoff should be used in favour of the faulting virtual address
 * ('address'), if possible.
 */
struct vm_fault {
        /*
         * The members of this anonymous struct are const-qualified, so they
         * cannot be assigned through a plain struct vm_fault pointer.
         */
        const struct {
                struct vm_area_struct *vma;        /* Target VMA */
                gfp_t gfp_mask;                        /* gfp mask to be used for allocations */
                pgoff_t pgoff;                        /* Logical page offset based on vma */
                unsigned long address;                /* Faulting virtual address - masked */
                unsigned long real_address;        /* Faulting virtual address - unmasked */
        };
        enum fault_flag flags;                /* FAULT_FLAG_xxx flags
                                         * XXX: should really be 'const' */
        pmd_t *pmd;                        /* Pointer to pmd entry matching
                                         * the 'address' */
        pud_t *pud;                        /* Pointer to pud entry matching
                                         * the 'address'
                                         */
        /* orig_pte and orig_pmd share storage; only one is meaningful. */
        union {
                pte_t orig_pte;                /* Value of PTE at the time of fault */
                pmd_t orig_pmd;                /* Value of PMD at the time of fault,
                                         * used by PMD fault only.
                                         */
        };

        struct page *cow_page;                /* Page handler may use for COW fault */
        struct page *page;                /* ->fault handlers should return a
                                         * page here, unless VM_FAULT_NOPAGE
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
        /* These three entries are valid only while holding ptl lock */
        pte_t *pte;                        /* Pointer to pte entry matching
                                         * the 'address'. NULL if the page
                                         * table hasn't been allocated.
                                         */
        spinlock_t *ptl;                /* Page table lock.
                                         * Protects pte page table if 'pte'
                                         * is not NULL, otherwise pmd.
                                         */
        pgtable_t prealloc_pte;                /* Pre-allocated pte page table.
                                         * vm_ops->map_pages() sets up a page
                                         * table from atomic context.
                                         * do_fault_around() pre-allocates
                                         * page table to avoid allocation from
                                         * atomic context.
                                         */
};

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 *
 * The fault-type callbacks (fault, huge_fault, map_pages, page_mkwrite,
 * pfn_mkwrite) all return vm_fault_t and receive the struct vm_fault
 * describing the fault.
 */
struct vm_operations_struct {
        void (*open)(struct vm_area_struct * area);
        /**
         * @close: Called when the VMA is being removed from the MM.
         * Context: User context.  May sleep.  Caller holds mmap_lock.
         */
        void (*close)(struct vm_area_struct * area);
        /* Called any time before splitting to check if it's allowed */
        int (*may_split)(struct vm_area_struct *area, unsigned long addr);
        int (*mremap)(struct vm_area_struct *area);
        /*
         * Called by mprotect() to make driver-specific permission
         * checks before mprotect() is finalised.   The VMA must not
         * be modified.  Returns 0 if mprotect() can proceed.
         */
        int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, unsigned long newflags);
        vm_fault_t (*fault)(struct vm_fault *vmf);
        /* Variant of ->fault that is additionally passed an order. */
        vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
        /* Operates on the page-offset range [start_pgoff, end_pgoff]. */
        vm_fault_t (*map_pages)(struct vm_fault *vmf,
                        pgoff_t start_pgoff, pgoff_t end_pgoff);
        unsigned long (*pagesize)(struct vm_area_struct * area);

        /* notification that a previously read-only page is about to become
         * writable, if an error is returned it will cause a SIGBUS */
        vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

        /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
        vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

        /* called by access_process_vm when get_user_pages() fails, typically
         * for use by special VMAs. See also generic_access_phys() for a generic
         * implementation useful for any iomem mapping.
         */
        int (*access)(struct vm_area_struct *vma, unsigned long addr,
                      void *buf, int len, int write);

        /* Called by the /proc/PID/maps code to ask the vma whether it
         * has a special name.  Returning non-NULL will also cause this
         * vma to be dumped unconditionally. */
        const char *(*name)(struct vm_area_struct *vma);

        /* The two mempolicy hooks below exist only on CONFIG_NUMA builds. */
#ifdef CONFIG_NUMA
        /*
         * set_policy() op must add a reference to any non-NULL @new mempolicy
         * to hold the policy upon return.  Caller should pass NULL @new to
         * remove a policy and fall back to surrounding context--i.e. do not
         * install a MPOL_DEFAULT policy, nor the task or system default
         * mempolicy.
         */
        int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

        /*
         * get_policy() op must add reference [mpol_get()] to any policy at
         * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
         * in mm/mempolicy.c will do this automatically.
         * get_policy() must NOT add a ref if the policy at (vma,addr) is not
         * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
         * If no [shared/vma] mempolicy exists at the addr, get_policy() op
         * must return NULL--i.e., do not "fallback" to task or system default
         * policy.
         */
        struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                        unsigned long addr, pgoff_t *ilx);
#endif
#ifdef CONFIG_FIND_NORMAL_PAGE
        /*
         * Called by vm_normal_page() for special PTEs in @vma at @addr. This
         * allows for returning a "normal" page from vm_normal_page() even
         * though the PTE indicates that the "struct page" either does not exist
         * or should not be touched: "special".
         *
         * Do not add new users: this really only works when a "normal" page
         * was mapped, but then the PTE got changed to something weird (+
         * marked special) that would not make pte_pfn() identify the originally
         * inserted page.
         */
        struct page *(*find_normal_page)(struct vm_area_struct *vma,
                                         unsigned long addr);
#endif /* CONFIG_FIND_NORMAL_PAGE */
};

#ifdef CONFIG_NUMA_BALANCING
/* A fresh VMA starts out with no NUMA-balancing state attached. */
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
        vma->numab_state = NULL;
}

/* Hand the VMA's NUMA-balancing state (if any) back to kfree(). */
static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
        kfree(vma->numab_state);
}
#else /* !CONFIG_NUMA_BALANCING */
/* Without NUMA balancing there is no per-VMA state: both are no-ops. */
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
}

static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * These must be here rather than mmap_lock.h as dependent on vm_fault type,
 * declared in this header.
 */
#ifdef CONFIG_PER_VMA_LOCK
static inline void release_fault_lock(struct vm_fault *vmf)
{
        if (vmf->flags & FAULT_FLAG_VMA_LOCK)
                vma_end_read(vmf->vma);
        else
                mmap_read_unlock(vmf->vma->vm_mm);
}

static inline void assert_fault_locked(const struct vm_fault *vmf)
{
        if (vmf->flags & FAULT_FLAG_VMA_LOCK)
                vma_assert_locked(vmf->vma);
        else
                mmap_assert_locked(vmf->vma->vm_mm);
}
#else
static inline void release_fault_lock(struct vm_fault *vmf)
{
        mmap_read_unlock(vmf->vma->vm_mm);
}

static inline void assert_fault_locked(const struct vm_fault *vmf)
{
        mmap_assert_locked(vmf->vma->vm_mm);
}
#endif /* CONFIG_PER_VMA_LOCK */

/* Report whether bit @flag is set in @mm's private flags bitmap. */
static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
{
        bool is_set = test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));

        return is_set;
}

/*
 * Set bit @flag in @mm's private flags bitmap and return what
 * test_and_set_bit() reports.
 */
static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
{
        bool was_set;

        was_set = test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
        return was_set;
}

/*
 * Clear bit @flag in @mm's private flags bitmap and return what
 * test_and_clear_bit() reports.
 */
static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm)
{
        bool was_set;

        was_set = test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
        return was_set;
}

/* Set bit @flag in @mm's private flags bitmap via set_bit(). */
static inline void mm_flags_set(int flag, struct mm_struct *mm)
{
        set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
}

/* Clear bit @flag in @mm's private flags bitmap via clear_bit(). */
static inline void mm_flags_clear(int flag, struct mm_struct *mm)
{
        clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
}

/* Zero the first NUM_MM_FLAG_BITS bits of @mm's private flags bitmap. */
static inline void mm_flags_clear_all(struct mm_struct *mm)
{
        bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS);
}

extern const struct vm_operations_struct vma_dummy_vm_ops;

/*
 * Bring @vma into a known initial state: everything zeroed, an empty
 * anon_vma_chain list, the dummy vm_ops installed, owned by @mm, and its
 * lock initialised.
 */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
        /* Wipe first; every later store relies on starting from zero. */
        memset(vma, 0, sizeof(*vma));

        INIT_LIST_HEAD(&vma->anon_vma_chain);
        vma->vm_ops = &vma_dummy_vm_ops;
        vma->vm_mm = mm;

        vma_lock_init(vma, false);
}

/*
 * Use when VMA is not part of the VMA tree and needs no locking.
 *
 * Overwrites the private __vm_flags field with @flags; takes no lock and
 * makes no locking assertion.
 */
static inline void vm_flags_init(struct vm_area_struct *vma,
                                 vm_flags_t flags)
{
        ACCESS_PRIVATE(vma, __vm_flags) = flags;
}

/*
 * Use when VMA is part of the VMA tree and modifications need coordination
 * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
 * it should be locked explicitly beforehand.
 */
static inline void vm_flags_reset(struct vm_area_struct *vma,
                                  vm_flags_t flags)
{
        /* The caller must already hold the VMA write lock. */
        vma_assert_write_locked(vma);

        /* Replace the whole flags word, as vm_flags_init() would. */
        ACCESS_PRIVATE(vma, __vm_flags) = flags;
}

/*
 * Same contract as vm_flags_reset(): the VMA write lock is asserted, not
 * taken.  The new value is stored with WRITE_ONCE().
 */
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
                                       vm_flags_t flags)
{
        vma_assert_write_locked(vma);
        WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}

/* Write-lock @vma, then OR @flags into its flags word. */
static inline void vm_flags_set(struct vm_area_struct *vma,
                                vm_flags_t flags)
{
        vm_flags_t updated;

        vma_start_write(vma);

        /* Read only after the write lock has been taken. */
        updated = vma->vm_flags | flags;
        vm_flags_init(vma, updated);
}

/* Write-lock @vma, then mask @flags out of its flags word. */
static inline void vm_flags_clear(struct vm_area_struct *vma,
                                  vm_flags_t flags)
{
        vm_flags_t remaining;

        vma_start_write(vma);

        /* Read only after the write lock has been taken. */
        remaining = vma->vm_flags & ~flags;
        vm_flags_init(vma, remaining);
}

/*
 * Use only if VMA is not part of the VMA tree or has no other users and
 * therefore needs no locking.
 */
static inline void __vm_flags_mod(struct vm_area_struct *vma,
                                  vm_flags_t set, vm_flags_t clear)
{
        vm_flags_t result = vma->vm_flags;

        /* Set first, then clear: a bit present in both ends up cleared. */
        result |= set;
        result &= ~clear;

        vm_flags_init(vma, result);
}

/*
 * Use only when the order of set/clear operations is unimportant, otherwise
 * use vm_flags_{set|clear} explicitly.
 */
static inline void vm_flags_mod(struct vm_area_struct *vma,
                                vm_flags_t set, vm_flags_t clear)
{
        vma_start_write(vma);

        /* Same update as __vm_flags_mod(), now under the write lock. */
        vm_flags_init(vma, (vma->vm_flags | set) & ~clear);
}

/*
 * Mark @vma as anonymous by clearing its vm_ops pointer; a NULL vm_ops is
 * exactly what vma_is_anonymous() tests for.
 */
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
        vma->vm_ops = NULL;
}

static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
        return !vma->vm_ops;
}

/*
 * Indicate if the VMA is a heap for the given task; for
 * /proc/PID/maps that is the heap of the main task.
 */
static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
{
        return vma->vm_start < vma->vm_mm->brk &&
                vma->vm_end > vma->vm_mm->start_brk;
}

/*
 * Indicate if the VMA is a stack for the given task; for
 * /proc/PID/maps that is the stack of the main task.
 */
static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
{
        /*
         * We make no effort to guess what a given thread considers to be
         * its "stack".  It's not even well-defined for programs written
         * languages like Go.
         */
        return vma->vm_start <= vma->vm_mm->start_stack &&
                vma->vm_end >= vma->vm_mm->start_stack;
}

/*
 * A temporary stack is a growable VMA (VM_GROWSDOWN or VM_GROWSUP) that
 * still carries every bit of VM_STACK_INCOMPLETE_SETUP.
 *
 * The growable test is done directly on the vm_flags_t value rather than
 * through an 'int' temporary, so the result does not depend on the grow
 * bits fitting into an int.
 */
static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma)
{
        if (!(vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP)))
                return false;

        return (vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
                VM_STACK_INCOMPLETE_SETUP;
}

/*
 * A VMA is foreign when the current task has no mm at all, or when the
 * VMA belongs to an mm other than the current task's.
 */
static inline bool vma_is_foreign(const struct vm_area_struct *vma)
{
        return !current->mm || current->mm != vma->vm_mm;
}

/* True when at least one of VM_READ, VM_WRITE or VM_EXEC is set. */
static inline bool vma_is_accessible(const struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_ACCESS_FLAGS) != 0;
}

/* True only when both VM_SHARED and VM_MAYWRITE are set in @vm_flags. */
static inline bool is_shared_maywrite(vm_flags_t vm_flags)
{
        const vm_flags_t required = VM_SHARED | VM_MAYWRITE;

        return (vm_flags & required) == required;
}

static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma)
{
        return is_shared_maywrite(vma->vm_flags);
}

/*
 * Find the next VMA using @max as an exclusive upper bound; mas_find()
 * wants an inclusive limit, hence the "- 1".
 */
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
        unsigned long last = max - 1;

        return mas_find(&vmi->mas, last);
}

static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
{
        struct vm_area_struct *vma;

        /*
         * mas_find() rather than mas_next(): on a freshly started iterator
         * mas_next() could skip the first entry.
         */
        vma = mas_find(&vmi->mas, ULONG_MAX);

        return vma;
}

/* Step the iterator with mas_next_range(), with no upper address limit. */
static inline
struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
{
        struct vm_area_struct *vma = mas_next_range(&vmi->mas, ULONG_MAX);

        return vma;
}


/* Step the iterator backwards with mas_prev(), down to address 0. */
static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
{
        struct vm_area_struct *vma = mas_prev(&vmi->mas, 0);

        return vma;
}

/*
 * Store NULL over [@start, @end) in the tree behind @vmi, allocating with
 * @gfp.  Returns 0 on success, or -ENOMEM if the maple state ends up in an
 * error state.
 */
static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
                        unsigned long start, unsigned long end, gfp_t gfp)
{
        /* @end is exclusive, the maple range is inclusive. */
        __mas_set_range(&vmi->mas, start, end - 1);
        mas_store_gfp(&vmi->mas, NULL, gfp);

        return unlikely(mas_is_err(&vmi->mas)) ? -ENOMEM : 0;
}

/* Free any unused preallocations by calling mas_destroy() on the maple state */
static inline void vma_iter_free(struct vma_iterator *vmi)
{
        mas_destroy(&vmi->mas);
}

/*
 * Store @vma over its own [vm_start, vm_end) range and, on success, mark it
 * attached.  Returns 0 on success, or -ENOMEM if the maple state ends up in
 * an error state (the VMA is then left unmarked).
 */
static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
                                      struct vm_area_struct *vma)
{
        /* vm_end is exclusive, the maple range is inclusive. */
        vmi->mas.index = vma->vm_start;
        vmi->mas.last = vma->vm_end - 1;
        mas_store(&vmi->mas, vma);

        if (likely(!mas_is_err(&vmi->mas))) {
                vma_mark_attached(vma);
                return 0;
        }

        return -ENOMEM;
}

/* Invalidate the iterator by pausing its maple state with mas_pause(). */
static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
        mas_pause(&vmi->mas);
}

/* Reposition the iterator at @addr with mas_set(). */
static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
{
        mas_set(&vmi->mas, addr);
}

/*
 * Loop header: assigns each VMA returned by vma_next() to @__vma until it
 * returns NULL.  @__vmi is the iterator object itself, not a pointer to it.
 */
#define for_each_vma(__vmi, __vma)                                        \
        while (((__vma) = vma_next(&(__vmi))) != NULL)

/*
 * The MM code likes to work with exclusive end addresses.
 *
 * Like for_each_vma(), but walks with vma_find() and stops at the
 * exclusive limit @__end.
 */
#define for_each_vma_range(__vmi, __vma, __end)                                \
        while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)

#ifdef CONFIG_SHMEM
/*
 * vma_is_shmem() is deliberately out of line: only slow paths in
 * userfault call it.
 */
bool vma_is_shmem(const struct vm_area_struct *vma);
bool vma_is_anon_shmem(const struct vm_area_struct *vma);
#else /* !CONFIG_SHMEM */
/* Without shmem support no VMA can be a shmem mapping. */
static inline bool vma_is_shmem(const struct vm_area_struct *vma)
{
        return false;
}

static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma)
{
        return false;
}
#endif /* CONFIG_SHMEM */

int vma_is_stack_for_current(const struct vm_area_struct *vma);

/*
 * flush_tlb_range() takes a vma, not a mm, and can care about flags.
 *
 * Expands to a designated initializer that sets only .vm_mm and .vm_flags.
 */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }

struct mmu_gather;
struct inode;

extern void prep_compound_page(struct page *page, unsigned int order);

/* The order of a large folio is kept in the low 8 bits of _flags_1. */
static inline unsigned int folio_large_order(const struct folio *folio)
{
        return (unsigned int)(folio->_flags_1 & 0xff);
}

/*
 * Number of pages in a large folio.
 *
 * With NR_PAGES_IN_LARGE_FOLIO the count is read from the folio's
 * _nr_pages field; otherwise it is derived from the folio order.
 */
#ifdef NR_PAGES_IN_LARGE_FOLIO
static inline unsigned long folio_large_nr_pages(const struct folio *folio)
{
        return folio->_nr_pages;
}
#else
static inline unsigned long folio_large_nr_pages(const struct folio *folio)
{
        /*
         * Shift an unsigned one so the arithmetic matches the unsigned long
         * return type and never shifts into a sign bit.
         */
        return 1UL << folio_large_order(folio);
}
#endif

/*
 * compound_order() can be called without holding a reference, which means
 * that niceties like page_folio() don't work.  These callers should be
 * prepared to handle wild return values.  For example, PG_head may be
 * set before the order is initialised, or this may be a tail page.
 * See compaction.c for some good examples.
 */
static inline unsigned int compound_order(const struct page *page)
{
        const struct folio *folio = (struct folio *)page;

        if (!test_bit(PG_head, &folio->flags.f))
                return 0;
        return folio_large_order(folio);
}

/**
 * folio_order - The allocation order of a folio.
 * @folio: The folio.
 *
 * A folio is composed of 2^order pages.  See get_order() for the definition
 * of order.
 *
 * Return: The order of the folio.
 */
static inline unsigned int folio_order(const struct folio *folio)
{
        /* Only large folios record an order; everything else is order 0. */
        return folio_test_large(folio) ? folio_large_order(folio) : 0;
}

/**
 * folio_reset_order - Reset the folio order and derived _nr_pages
 * @folio: The folio.
 *
 * Reset the order and derived _nr_pages to 0. Must only be used in the
 * process of splitting large folios.
 */
static inline void folio_reset_order(struct folio *folio)
{
        /* Warn once and leave the folio untouched if it is not large. */
        if (WARN_ON_ONCE(!folio_test_large(folio)))
                return;
        /* The order lives in the low 8 bits of _flags_1. */
        folio->_flags_1 &= ~0xffUL;
#ifdef NR_PAGES_IN_LARGE_FOLIO
        /* Keep the cached page count consistent with the cleared order. */
        folio->_nr_pages = 0;
#endif
}

#include <linux/huge_mm.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users)
 */
static inline int put_page_testzero(struct page *page)
{
        int hit_zero;

        /* Dropping a reference on a page whose count is already 0 is a bug. */
        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);

        hit_zero = page_ref_dec_and_test(page);
        return hit_zero;
}

/* Folio flavour of put_page_testzero(), applied to the folio's page. */
static inline int folio_put_testzero(struct folio *folio)
{
        struct page *page = &folio->page;

        return put_page_testzero(page);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 */
static inline bool get_page_unless_zero(struct page *page)
{
        /* Add one reference unless the count is currently zero. */
        bool got_ref = page_ref_add_unless(page, 1, 0);

        return got_ref;
}

/*
 * Try to take a reference on @page.  On success the page is returned
 * reinterpreted as a folio; NULL is returned when its refcount was zero.
 */
static inline struct folio *folio_get_nontail_page(struct page *page)
{
        if (likely(get_page_unless_zero(page)))
                return (struct folio *)page;

        return NULL;
}

extern int page_is_ram(unsigned long pfn);

/*
 * NOTE(review): presumably the possible results of region_intersects()
 * declared just below - confirm against its implementation.
 */
enum {
        REGION_INTERSECTS,
        REGION_DISJOINT,
        REGION_MIXED,
};

int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
                      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
 */
#ifdef CONFIG_MMU
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
#else
/* !CONFIG_MMU: no address is ever reported as a vmalloc address. */
static inline bool is_vmalloc_addr(const void *x)
{
        return false;
}
/* !CONFIG_MMU: likewise, always 0. */
static inline int is_vmalloc_or_module_addr(const void *x)
{
        return 0;
}
#endif

/*
 * How many times the entire folio is mapped as a single unit (eg by a
 * PMD or PUD entry).  This is probably not what you want, except for
 * debugging purposes or implementation of other core folio_*() primitives.
 */
static inline int folio_entire_mapcount(const struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        /*
         * The counter is read on every 64-bit build, and on 32-bit builds
         * for any order other than 1.  It is stored biased by -1.
         */
        if (IS_ENABLED(CONFIG_64BIT) ||
            likely(folio_large_order(folio) != 1))
                return atomic_read(&folio->_entire_mapcount) + 1;

        /* 32-bit, order-1 folio: reported as never entirely mapped. */
        return 0;
}

/* Mapcount of a large folio; warns if @folio is not large. */
static inline int folio_large_mapcount(const struct folio *folio)
{
        int mapcount;

        VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);

        /* _large_mapcount is stored biased by -1. */
        mapcount = atomic_read(&folio->_large_mapcount) + 1;
        return mapcount;
}

/**
 * folio_mapcount() - Number of mappings of this folio.
 * @folio: The folio.
 *
 * The folio mapcount corresponds to the number of present user page table
 * entries that reference any part of a folio. Each such present user page
 * table entry must be paired with exactly one folio reference.
 *
 * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts
 * exactly once.
 *
 * For hugetlb folios, each abstracted "hugetlb" user page table entry that
 * references the entire folio counts exactly once, even when such special
 * page table entries are comprised of multiple ordinary page table entries.
 *
 * Will report 0 for pages which cannot be mapped into userspace, such as
 * slab, page tables and similar.
 *
 * Return: The number of times this folio is mapped.
 */
static inline int folio_mapcount(const struct folio *folio)
{
        int mapcount;

        /* Large folios keep a separate counter. */
        if (unlikely(folio_test_large(folio)))
                return folio_large_mapcount(folio);

        /* _mapcount is stored biased by -1. */
        mapcount = atomic_read(&folio->_mapcount) + 1;

        /* A value that encodes a page type is not a mapcount: report 0. */
        return page_mapcount_is_type(mapcount) ? 0 : mapcount;
}

/**
 * folio_mapped - Is this folio mapped into userspace?
 * @folio: The folio.
 *
 * Return: True if any page in this folio is referenced by user page tables.
 */
static inline bool folio_mapped(const struct folio *folio)
{
        /* Mapped means at least one mapping is counted. */
        return folio_mapcount(folio) > 0;
}

/*
 * Return true if this page is mapped into pagetables.
 * For compound page it returns true if any sub-page of compound page is mapped,
 * even if this particular sub-page is not itself mapped by any PTE or PMD.
 */
static inline bool page_mapped(const struct page *page)
{
        /* The answer is per folio, not per individual page. */
        const struct folio *folio = page_folio(page);

        return folio_mapped(folio);
}

/* Translate address @x to its page, then return that page's compound head. */
static inline struct page *virt_to_head_page(const void *x)
{
        struct page *pg;

        pg = virt_to_page(x);
        return compound_head(pg);
}

/* Translate address @x to its page, then return the folio containing it. */
static inline struct folio *virt_to_folio(const void *x)
{
        struct page *pg;

        pg = virt_to_page(x);
        return page_folio(pg);
}

void __folio_put(struct folio *folio);

void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);
int folio_mc_copy(struct folio *dst, struct folio *src);

unsigned long nr_free_buffer_pages(void);

/* Returns the number of bytes in this potentially compound page. */
static inline unsigned long page_size(const struct page *page)
{
        unsigned int order = compound_order(page);

        /* PAGE_SIZE scaled by 2^order. */
        return PAGE_SIZE << order;
}

/* Returns the number of bits needed for the number of bytes in a page */
/*
 * PAGE_SHIFT plus the compound order of @page.
 *
 * The page is only inspected, so it is taken by const pointer like
 * page_size(); callers passing a non-const pointer are unaffected.
 */
static inline unsigned int page_shift(const struct page *page)
{
        return PAGE_SHIFT + compound_order(page);
}

/**
 * thp_order - Order of a transparent huge page.
 * @page: Head page of a transparent huge page.
 */
static inline unsigned int thp_order(struct page *page)
{
        unsigned int order;

        /* Callers must not pass a tail page. */
        VM_BUG_ON_PGFLAGS(PageTail(page), page);

        order = compound_order(page);
        return order;
}

/**
 * thp_size - Size of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Return: Number of bytes in this page.
 */
static inline unsigned long thp_size(struct page *page)
{
        unsigned int order = thp_order(page);

        return PAGE_SIZE << order;
}

#ifdef CONFIG_MMU
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        /* Uncommon case: the VMA is not writable, so leave the PTE alone. */
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pte;

        return pte_mkwrite(pte, vma);
}

vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page);
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                struct page *page, unsigned int nr, unsigned long addr);

vm_fault_t finish_fault(struct vm_fault *vmf);
#endif

/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() > 0  means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A folio may belong to an inode's memory mapping. In this case,
 * folio->mapping points to the inode, and folio->index is the file
 * offset of the folio, in units of PAGE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also holds a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->i_pages, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
 */

/*
 * 127: arbitrary random number, small enough to assemble well.
 *
 * Done in unsigned arithmetic, "count + 127 <= 127" holds exactly when the
 * signed count lies in [-127, 0]: it is zero, or has wrapped to a small
 * negative value.
 */
#define folio_ref_zero_or_close_to_overflow(folio) \
        ((unsigned int) folio_ref_count(folio) + 127u <= 127u)

/**
 * folio_get - Increment the reference count on a folio.
 * @folio: The folio.
 *
 * Context: May be called in any context, as long as you know that
 * you have a refcount on the folio.  If you do not already have one,
 * folio_try_get() may be the right interface for you to use.
 */
static inline void folio_get(struct folio *folio)
{
        /*
         * Catch callers that do not already hold a reference (count of zero)
         * or a count that has wrapped to a small negative value.
         */
        VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
        folio_ref_inc(folio);
}

/*
 * Take a reference on the folio containing @page.  Slab and large-kmalloc
 * folios are refused with a one-time warning and left untouched.
 */
static inline void get_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (WARN_ON_ONCE(folio_test_slab(folio)) ||
            WARN_ON_ONCE(folio_test_large_kmalloc(folio)))
                return;

        folio_get(folio);
}

/*
 * Take a reference on the head page of @page unless its reference count is
 * zero or negative, in which case warn once and report failure.
 */
static inline __must_check bool try_get_page(struct page *page)
{
        struct page *head = compound_head(page);

        if (!WARN_ON_ONCE(page_ref_count(head) <= 0)) {
                page_ref_inc(head);
                return true;
        }

        return false;
}

/**
 * folio_put - Decrement the reference count on a folio.
 * @folio: The folio.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put() unless you can be sure that it wasn't the
 * last reference.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put(struct folio *folio)
{
        /* Not the last reference: nothing more to do. */
        if (!folio_put_testzero(folio))
                return;

        __folio_put(folio);
}

/**
 * folio_put_refs - Reduce the reference count on a folio.
 * @folio: The folio.
 * @refs: The amount to subtract from the folio's reference count.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put_refs() unless you can be sure that these weren't
 * the last references.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put_refs(struct folio *folio, int refs)
{
        /* References remain after the subtraction: nothing more to do. */
        if (!folio_ref_sub_and_test(folio, refs))
                return;

        __folio_put(folio);
}

void folios_put_refs(struct folio_batch *folios, unsigned int *refs);

/*
 * union release_pages_arg - an array of pages or folios
 *
 * release_pages() releases a simple array of multiple pages, and
 * accepts various different forms of said page array: either
 * a regular old boring array of pages, an array of folios, or
 * an array of encoded page pointers.
 *
 * The transparent union syntax for this kind of "any of these
 * argument types" is all kinds of ugly, so look away.
 */
typedef union {
        struct page **pages;                    /* plain array of page pointers */
        struct folio **folios;                  /* array of folio pointers */
        struct encoded_page **encoded_pages;    /* array of encoded page pointers */
} release_pages_arg __attribute__ ((__transparent_union__));

/* @nr is presumably the number of entries in the array -- confirm at the definition. */
void release_pages(release_pages_arg, int nr);

/**
 * folios_put - Decrement the reference count on an array of folios.
 * @folios: The folios.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need to
 * reinitialise it.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folios_put(struct folio_batch *folios)
{
        /*
         * No per-folio reference counts are supplied (NULL); presumably this
         * makes folios_put_refs() drop one reference per folio -- confirm
         * against its definition.
         */
        folios_put_refs(folios, NULL);
}

/*
 * Drop a reference on the folio containing @page.  Slab and large-kmalloc
 * folios are silently ignored.
 */
static inline void put_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (!folio_test_slab(folio) && !folio_test_large_kmalloc(folio))
                folio_put(folio);
}

/*
 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
 * the page's refcount so that two separate items are tracked: the original page
 * reference count, and also a new count of how many pin_user_pages() calls were
 * made against the page. ("gup-pinned" is another term for the latter).
 *
 * With this scheme, pin_user_pages() becomes special: such pages are marked as
 * distinct from normal pages. As such, the unpin_user_page() call (and its
 * variants) must be used in order to release gup-pinned pages.
 *
 * Choice of value:
 *
 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 * simpler, due to the fact that adding an even power of two to the page
 * refcount has the effect of using only the upper N bits, for the code that
 * counts up using the bias value. This means that the lower bits are left for
 * the exclusive use of the original code that increments and decrements by one
 * (or at least, by much smaller values than the bias value).
 *
 * Of course, once the lower bits overflow into the upper bits (and this is
 * OK, because subtraction recovers the original values), then visual inspection
 * no longer suffices to directly view the separate counts. However, for normal
 * applications that don't have huge page reference counts, this won't be an
 * issue.
 *
 * Locking: the lockless algorithm described in folio_try_get_rcu()
 * provides safe operation for get_user_pages(), folio_mkclean() and
 * other calls that race to set up page table entries.
 */
/* Each pin adds this power-of-two bias (1024) to the refcount. */
#define GUP_PIN_COUNTING_BIAS (1U << 10)

/*
 * Release-side counterparts of pin_user_pages(): gup-pinned pages and folios
 * must be released through these rather than through put_page().  They are
 * only declared here; the definitions live outside this header.
 */
void unpin_user_page(struct page *page);
void unpin_folio(struct folio *folio);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty);
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                      bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);
void unpin_user_folio(struct folio *folio, unsigned long npages);
void unpin_folios(struct folio **folios, unsigned long nfolios);

/* A mapping is copy-on-write when it may be written but is not shared. */
static inline bool is_cow_mapping(vm_flags_t flags)
{
        const vm_flags_t relevant = VM_SHARED | VM_MAYWRITE;

        return (flags & relevant) == VM_MAYWRITE;
}

#ifndef CONFIG_MMU
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
        /*
         * On NOMMU a "shared" mapping is either an ordinary MAP_SHARED
         * mapping or one of the selected read-only MAP_PRIVATE file mappings
         * that act as a read-only overlay of a file mapping.  A read-only
         * MAP_PRIVATE mapping can still have its underlying memory modified
         * while ptrace is active, so the overlay is only possible when
         * ptrace does not apply.  There is no mprotect() that could upgrade
         * write permissions afterwards.
         */
        return (flags & (VM_MAYSHARE | VM_MAYOVERLAY)) != 0;
}
#endif

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

/*
 * The identification function is mainly used by the buddy allocator for
 * determining if two pages could be buddies. We are not really identifying
 * the zone since we could be using the section number id if we do not have
 * node id available in page flags.
 * We only guarantee that it will return the same value for two combinable
 * pages in a zone.
 */
static inline int page_zone_id(struct page *page)
{
        const unsigned long id_bits = page->flags.f >> ZONEID_PGSHIFT;

        return id_bits & ZONEID_MASK;
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
int memdesc_nid(memdesc_flags_t mdf);
#else
/* Extract the node id field from a set of memory-descriptor flags. */
static inline int memdesc_nid(memdesc_flags_t mdf)
{
        const unsigned long node_bits = mdf.f >> NODES_PGSHIFT;

        return node_bits & NODES_MASK;
}
#endif

static inline int page_to_nid(const struct page *page)
{
        return memdesc_nid(PF_POISONED_CHECK(page)->flags);
}

static inline int folio_nid(const struct folio *folio)
{
        return memdesc_nid(folio->flags);
}

#ifdef CONFIG_NUMA_BALANCING
/* The page access time bits need to hold at least 4 seconds. */
#define PAGE_ACCESS_TIME_MIN_BITS        12
/*
 * When the cpupid field is narrower than PAGE_ACCESS_TIME_MIN_BITS, the
 * shortfall becomes the number of low-order bits dropped from an access time
 * before it is stored (see folio_xchg_access_time()); otherwise nothing is
 * dropped.
 */
#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
#define PAGE_ACCESS_TIME_BUCKETS                                \
        (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
#else
#define PAGE_ACCESS_TIME_BUCKETS        0
#endif

/* The cpupid mask shifted up by the number of dropped low-order bits. */
#define PAGE_ACCESS_TIME_MASK                                \
        (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)

/* Pack the masked CPU number above the masked PID bits into one cpupid. */
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
        const int cpu_field = (cpu & LAST__CPU_MASK) << LAST__PID_SHIFT;
        const int pid_field = pid & LAST__PID_MASK;

        return cpu_field | pid_field;
}

/* Extract the PID bits (the low field) of a cpupid. */
static inline int cpupid_to_pid(int cpupid)
{
        const int pid_field = cpupid & LAST__PID_MASK;

        return pid_field;
}

/* Extract the CPU bits (the field above the PID bits) of a cpupid. */
static inline int cpupid_to_cpu(int cpupid)
{
        const int above_pid = cpupid >> LAST__PID_SHIFT;

        return above_pid & LAST__CPU_MASK;
}

/* Node of the CPU recorded in a cpupid. */
static inline int cpupid_to_nid(int cpupid)
{
        const int cpu = cpupid_to_cpu(cpupid);

        return cpu_to_node(cpu);
}

/* The PID field counts as unset when all of its bits are ones. */
static inline bool cpupid_pid_unset(int cpupid)
{
        const int unset_pid = -1 & LAST__PID_MASK;

        return cpupid_to_pid(cpupid) == unset_pid;
}

/* The CPU field counts as unset when all of its bits are ones. */
static inline bool cpupid_cpu_unset(int cpupid)
{
        const int unset_cpu = -1 & LAST__CPU_MASK;

        return cpupid_to_cpu(cpupid) == unset_cpu;
}

/* Compare the PID bits stored in @cpupid with the same bits of @task_pid. */
static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
{
        const int recorded = cpupid_to_pid(cpupid);

        return recorded == (task_pid & LAST__PID_MASK);
}

/*
 * Test whether @task's PID matches the PID bits recorded in @cpupid.
 * @task is parenthesised so that any pointer-valued expression (for example
 * a conditional expression) can be passed safely.
 */
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid((task)->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
/* Store the masked @cpupid in the folio and return the value it replaced. */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
        const int masked = cpupid & LAST_CPUPID_MASK;

        return xchg(&folio->_last_cpupid, masked);
}

/* Read the cpupid kept in the folio's dedicated _last_cpupid field. */
static inline int folio_last_cpupid(struct folio *folio)
{
        const int last = folio->_last_cpupid;

        return last;
}
/* Reset the page's cpupid field to all ones. */
static inline void page_cpupid_reset_last(struct page *page)
{
        const int all_ones = -1 & LAST_CPUPID_MASK;

        page->_last_cpupid = all_ones;
}
#else
/* Read the cpupid stored inside the folio flags word. */
static inline int folio_last_cpupid(struct folio *folio)
{
        const unsigned long cpupid_bits = folio->flags.f >> LAST_CPUPID_PGSHIFT;

        return cpupid_bits & LAST_CPUPID_MASK;
}

int folio_xchg_last_cpupid(struct folio *folio, int cpupid);

/* Reset the cpupid field inside the page flags word to all ones. */
static inline void page_cpupid_reset_last(struct page *page)
{
        const unsigned long all_ones = LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;

        page->flags.f |= all_ones;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */

/*
 * Store @time in the folio's cpupid field, dropping its low
 * PAGE_ACCESS_TIME_BUCKETS bits, and return the previously stored value
 * scaled back up by the same number of bits.
 */
static inline int folio_xchg_access_time(struct folio *folio, int time)
{
        const int prev = folio_xchg_last_cpupid(folio,
                                                time >> PAGE_ACCESS_TIME_BUCKETS);

        return prev << PAGE_ACCESS_TIME_BUCKETS;
}

/*
 * Hash the current task's PID down to a bit index within one long and make
 * sure that bit is set in pids_active[1] of the VMA's NUMA-balancing state.
 * Nothing happens when the VMA has no such state.
 */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
        const unsigned int pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
        unsigned long *active;

        if (!vma->numab_state)
                return;

        active = &vma->numab_state->pids_active[1];
        /* Skip the store when the bit is already set. */
        if (!test_bit(pid_bit, active))
                __set_bit(pid_bit, active);
}

bool folio_use_access_time(struct folio *folio);
#else /* !CONFIG_NUMA_BALANCING */
/*
 * Fallbacks used when CONFIG_NUMA_BALANCING is disabled.  No cpupid is
 * tracked: the helpers below either return a fixed value or do nothing.
 */

/* Stores nothing; returns the folio's node id instead of a cpupid. */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
        return folio_nid(folio); /* XXX */
}

/* Stores nothing; the previous access time is always reported as 0. */
static inline int folio_xchg_access_time(struct folio *folio, int time)
{
        return 0;
}

/* Returns the folio's node id instead of a cpupid. */
static inline int folio_last_cpupid(struct folio *folio)
{
        return folio_nid(folio); /* XXX */
}

/* Always -1. */
static inline int cpupid_to_nid(int cpupid)
{
        return -1;
}

/* Always -1. */
static inline int cpupid_to_pid(int cpupid)
{
        return -1;
}

/* Always -1. */
static inline int cpupid_to_cpu(int cpupid)
{
        return -1;
}

/*
 * Always -1.
 * NOTE(review): the first parameter is named nid here but cpu in the
 * CONFIG_NUMA_BALANCING variant.
 */
static inline int cpu_pid_to_cpupid(int nid, int pid)
{
        return -1;
}

/* The PID field is always considered unset. */
static inline bool cpupid_pid_unset(int cpupid)
{
        return true;
}

/* Nothing to reset. */
static inline void page_cpupid_reset_last(struct page *page)
{
}

/* Never matches. */
static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
        return false;
}

/* Nothing to record. */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
}
/* Access time is never used. */
static inline bool folio_use_access_time(struct folio *folio)
{
        return false;
}
#endif /* CONFIG_NUMA_BALANCING */

#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)

/*
 * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
 * setting tags for all pages to native kernel tag value 0xff, as the default
 * value 0x00 maps to 0xff.
 */

/*
 * Return the KASAN tag of @page.  With KASAN disabled this is always
 * KASAN_TAG_KERNEL; otherwise the stored field is read from the page flags
 * and un-inverted (tags are kept xor'ed with 0xff).
 */
static inline u8 page_kasan_tag(const struct page *page)
{
        u8 stored;

        if (!kasan_enabled())
                return KASAN_TAG_KERNEL;

        stored = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
        return stored ^ 0xff;
}

/*
 * Store @tag in the KASAN tag field of the page flags word.  Does nothing
 * when KASAN is disabled.
 */
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
        unsigned long old_flags, flags;

        if (!kasan_enabled())
                return;

        /* Tags are stored xor'ed with 0xff. */
        tag ^= 0xff;
        old_flags = READ_ONCE(page->flags.f);
        /*
         * Rebuild the flags word from the last value seen and retry until the
         * compare-and-exchange succeeds.  Presumably try_cmpxchg() refreshes
         * old_flags on failure -- confirm against its definition.
         */
        do {
                flags = old_flags;
                flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
                flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
        } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
}

/* Set the page's KASAN tag back to KASAN_TAG_KERNEL when KASAN is enabled. */
static inline void page_kasan_tag_reset(struct page *page)
{
        if (!kasan_enabled())
                return;

        page_kasan_tag_set(page, KASAN_TAG_KERNEL);
}

#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

/* Without tag-based KASAN every page reports the fixed tag 0xff. */
static inline u8 page_kasan_tag(const struct page *page)
{
        return 0xff;
}

/* No tag is stored without tag-based KASAN: setting and resetting are no-ops. */
static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
static inline void page_kasan_tag_reset(struct page *page) { }

#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

static inline struct zone *page_zone(const struct page *page)
{
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

/* Node data for the node that page_to_nid() reports for @page. */
static inline pg_data_t *page_pgdat(const struct page *page)
{
        return NODE_DATA(page_to_nid(page));
}

/* Node data for the node that folio_nid() reports for @folio. */
static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
        return NODE_DATA(folio_nid(folio));
}

static inline struct zone *folio_zone(const struct folio *folio)
{
        return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)];
}

#ifdef SECTION_IN_PAGE_FLAGS
/* Replace the section field of the page flags word with @section. */
static inline void set_page_section(struct page *page, unsigned long section)
{
        unsigned long f = page->flags.f;

        f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
        f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
        page->flags.f = f;
}

/* Extract the section field from a set of memory-descriptor flags. */
static inline unsigned long memdesc_section(memdesc_flags_t mdf)
{
        const unsigned long section_bits = mdf.f >> SECTIONS_PGSHIFT;

        return section_bits & SECTIONS_MASK;
}
#else /* !SECTION_IN_PAGE_FLAGS */
/* The section number is not kept in the flags in this configuration: always 0. */
static inline unsigned long memdesc_section(memdesc_flags_t mdf)
{
        return 0;
}
#endif /* SECTION_IN_PAGE_FLAGS */

/**
 * folio_pfn - Return the Page Frame Number of a folio.
 * @folio: The folio.
 *
 * A folio may contain multiple pages.  The pages have consecutive
 * Page Frame Numbers.
 *
 * Return: The Page Frame Number of the first page in the folio.
 */
static inline unsigned long folio_pfn(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return page_to_pfn(first);
}

/* Look up the folio that contains the page at page frame number @pfn. */
static inline struct folio *pfn_folio(unsigned long pfn)
{
        struct page *page = pfn_to_page(pfn);

        return page_folio(page);
}

#ifdef CONFIG_MMU
/* Build a PTE for @page with protection bits @pgprot. */
static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot)
{
        const unsigned long pfn = page_to_pfn(page);

        return pfn_pte(pfn, pgprot);
}

/**
 * folio_mk_pte - Create a PTE for this folio
 * @folio: The folio to create a PTE for
 * @pgprot: The page protection bits to use
 *
 * Create a page table entry for the first page of this folio.
 * This is suitable for passing to set_ptes().
 *
 * Return: A page table entry suitable for mapping this folio.
 */
static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot)
{
        const unsigned long first_pfn = folio_pfn(folio);

        return pfn_pte(first_pfn, pgprot);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/**
 * folio_mk_pmd - Create a PMD for this folio
 * @folio: The folio to create a PMD for
 * @pgprot: The page protection bits to use
 *
 * Create a page table entry for the first page of this folio.
 * This is suitable for passing to set_pmd_at().
 *
 * Return: A page table entry suitable for mapping this folio.
 */
static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot)
{
        const pmd_t entry = pfn_pmd(folio_pfn(folio), pgprot);

        /* Mark the entry as a huge mapping. */
        return pmd_mkhuge(entry);
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/**
 * folio_mk_pud - Create a PUD for this folio
 * @folio: The folio to create a PUD for
 * @pgprot: The page protection bits to use
 *
 * Create a page table entry for the first page of this folio.
 * This is suitable for passing to set_pud_at().
 *
 * Return: A page table entry suitable for mapping this folio.
 */
static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot)
{
        const pud_t entry = pfn_pud(folio_pfn(folio), pgprot);

        /* Mark the entry as a huge mapping. */
        return pud_mkhuge(entry);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_MMU */

/*
 * Whether @folio counts pins separately: on 64-bit every large folio does,
 * otherwise only folios of order greater than one.
 */
static inline bool folio_has_pincount(const struct folio *folio)
{
        return IS_ENABLED(CONFIG_64BIT) ? folio_test_large(folio)
                                        : folio_order(folio) > 1;
}

/**
 * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
 * @folio: The folio.
 *
 * This function checks if a folio has been pinned via a call to
 * a function in the pin_user_pages() family.
 *
 * For small folios, the return value is partially fuzzy: false is not fuzzy,
 * because it means "definitely not pinned for DMA", but true means "probably
 * pinned for DMA, but possibly a false positive due to having at least
 * GUP_PIN_COUNTING_BIAS worth of normal folio references".
 *
 * False positives are OK, because: a) it's unlikely for a folio to
 * get that many refcounts, and b) all the callers of this routine are
 * expected to be able to deal gracefully with a false positive.
 *
 * For most large folios, the result will be exactly correct. That's because
 * we have more tracking data available: the _pincount field is used
 * instead of the GUP_PIN_COUNTING_BIAS scheme.
 *
 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 *
 * Return: True, if it is likely that the folio has been "dma-pinned".
 * False, if the folio is definitely not dma-pinned.
 */
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
        unsigned int refs;

        /* Folios with a dedicated pin counter give an exact answer. */
        if (folio_has_pincount(folio))
                return atomic_read(&folio->_pincount) > 0;

        /*
         * folio_ref_count() is signed and goes negative once the refcount
         * overflows, at which point callers stop incrementing it.
         *
         * Viewing the count as unsigned lets the sign bit extend the range
         * a little, so the comparison stays accurate in that overflow case.
         */
        refs = folio_ref_count(folio);
        return refs >= GUP_PIN_COUNTING_BIAS;
}

/*
 * Decide whether COW should be broken immediately for an anon page of the
 * source mm.  This should most likely only be called during fork().
 *
 * The caller has to hold the PT lock and vma->vm_mm->write_protect_seq.
 */
static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
                                          struct folio *folio)
{
        /* The write-protect sequence count must be odd here. */
        VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));

        /* An mm that never had anything pinned cannot need this. */
        return mm_flags_test(MMF_HAS_PINNED, vma->vm_mm) &&
               folio_maybe_dma_pinned(folio);
}

/**
 * is_zero_page - Query if a page is a zero page
 * @page: The page to query
 *
 * This returns true if @page is one of the permanent zero pages.
 */
static inline bool is_zero_page(const struct page *page)
{
        const unsigned long pfn = page_to_pfn(page);

        return is_zero_pfn(pfn);
}

/**
 * is_zero_folio - Query if a folio is a zero page
 * @folio: The folio to query
 *
 * This returns true if @folio is one of the permanent zero pages.
 */
static inline bool is_zero_folio(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return is_zero_page(first);
}

/* MIGRATE_CMA and ZONE_MOVABLE do not allow long-term pinning of folios */
#ifdef CONFIG_MIGRATION
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
#ifdef CONFIG_CMA
        /* CMA and isolated pageblocks are never long-term pinnable. */
        switch (folio_migratetype(folio)) {
        case MIGRATE_CMA:
        case MIGRATE_ISOLATE:
                return false;
        default:
                break;
        }
#endif
        /* The zero page can be "pinned" but gets special handling. */
        if (is_zero_folio(folio))
                return true;

        /*
         * Coherent device memory must always allow eviction, and
         * filesystems can only tolerate transient delays to truncate and
         * hole-punch operations, so fsdax folios are excluded as well.
         */
        if (folio_is_device_coherent(folio) || folio_is_fsdax(folio))
                return false;

        /* Anything left is pinnable unless it sits in the movable zone. */
        return !folio_is_zone_movable(folio);
}
#else
/* Without CONFIG_MIGRATION every folio is reported as long-term pinnable. */
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
        return true;
}
#endif

/* Replace the zone field of the page flags word with @zone. */
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
        unsigned long f = page->flags.f;

        f &= ~(ZONES_MASK << ZONES_PGSHIFT);
        f |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
        page->flags.f = f;
}

/* Replace the node field of the page flags word with @node. */
static inline void set_page_node(struct page *page, unsigned long node)
{
        unsigned long f = page->flags.f;

        f &= ~(NODES_MASK << NODES_PGSHIFT);
        f |= (node & NODES_MASK) << NODES_PGSHIFT;
        page->flags.f = f;
}

/*
 * Record the zone and node of @page in its flags and, when section numbers
 * are kept in the page flags, the section derived from @pfn as well.
 */
static inline void set_page_links(struct page *page, enum zone_type zone,
        unsigned long node, unsigned long pfn)
{
        set_page_zone(page, zone);
        set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
        /* @pfn is only used in this configuration. */
        set_page_section(page, pfn_to_section_nr(pfn));
#endif
}

/**
 * folio_nr_pages - The number of pages in the folio.
 * @folio: The folio.
 *
 * Return: A positive power of two.
 */
static inline unsigned long folio_nr_pages(const struct folio *folio)
{
        /* A folio that is not large consists of exactly one page. */
        return folio_test_large(folio) ? folio_large_nr_pages(folio) : 1;
}

#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS)
/*
 * We don't expect any folios that exceed buddy sizes (and consequently
 * memory sections).
 */
#define MAX_FOLIO_ORDER                MAX_PAGE_ORDER
#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
/*
 * Only pages within a single memory section are guaranteed to be
 * contiguous. By limiting folios to a single memory section, all folio
 * pages are guaranteed to be contiguous.
 */
#define MAX_FOLIO_ORDER                PFN_SECTION_SHIFT
#elif defined(CONFIG_HUGETLB_PAGE)
/*
 * There is no real limit on the folio size. We limit them to the maximum we
 * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
 * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
 */
#define MAX_FOLIO_ORDER                get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
#else
/*
 * Without hugetlb, gigantic folios that are bigger than a single PUD are
 * currently impossible.
 */
#define MAX_FOLIO_ORDER                PUD_ORDER
#endif

#define MAX_FOLIO_NR_PAGES        (1UL << MAX_FOLIO_ORDER)

/*
 * compound_nr() returns the number of pages in this potentially compound
 * page.  compound_nr() can be called on a tail page, and is defined to
 * return 1 in that case.
 */
static inline unsigned long compound_nr(const struct page *page)
{
        const struct folio *folio = (struct folio *)page;

        /* Anything without PG_head set counts as a single page. */
        return test_bit(PG_head, &folio->flags.f) ?
                folio_large_nr_pages(folio) : 1;
}

/**
 * folio_next - Move to the next physical folio.
 * @folio: The folio we're currently operating on.
 *
 * If you have physically contiguous memory which may span more than
 * one folio (eg a &struct bio_vec), use this function to move from one
 * folio to the next.  Do not use it if the memory is only virtually
 * contiguous as the folios are almost certainly not adjacent to each
 * other.  This is the folio equivalent to writing ``page++``.
 *
 * Context: We assume that the folios are refcounted and/or locked at a
 * higher level and do not adjust the reference counts.
 * Return: The next struct folio.
 */
static inline struct folio *folio_next(struct folio *folio)
{
        const unsigned long nr = folio_nr_pages(folio);

        /* The next folio starts at the page just past this folio's last page. */
        return (struct folio *)folio_page(folio, nr);
}

/**
 * folio_shift - The size of the memory described by this folio.
 * @folio: The folio.
 *
 * A folio represents a number of bytes which is a power-of-two in size.
 * This function tells you which power-of-two the folio is.  See also
 * folio_size() and folio_order().
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split.  It is not necessary for the folio to be locked.
 * Return: The base-2 logarithm of the size of this folio.
 */
static inline unsigned int folio_shift(const struct folio *folio)
{
        const unsigned int order = folio_order(folio);

        return PAGE_SHIFT + order;
}

/**
 * folio_size - The number of bytes in a folio.
 * @folio: The folio.
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split.  It is not necessary for the folio to be locked.
 * Return: The number of bytes in this folio.
 */
static inline size_t folio_size(const struct folio *folio)
{
        const unsigned int order = folio_order(folio);

        return PAGE_SIZE << order;
}

/**
 * folio_maybe_mapped_shared - Whether the folio is mapped into the page
 *                               tables of more than one MM
 * @folio: The folio.
 *
 * This function checks if the folio may be currently mapped into more than one
 * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single
 * MM ("mapped exclusively").
 *
 * For KSM folios, this function also returns "mapped shared" when a folio is
 * mapped multiple times into the same MM, because the individual page mappings
 * are independent.
 *
 * For small anonymous folios and anonymous hugetlb folios, the return
 * value will be exactly correct: non-KSM folios can only be mapped at most once
 * into an MM, and they cannot be partially mapped. KSM folios are
 * considered shared even if mapped multiple times into the same MM.
 *
 * For other folios, the result can be fuzzy:
 *    #. For partially-mappable large folios (THP), the return value can wrongly
 *       indicate "mapped shared" (false positive) if a folio was mapped by
 *       more than two MMs at one point in time.
 *    #. For pagecache folios (including hugetlb), the return value can wrongly
 *       indicate "mapped shared" (false positive) when two VMAs in the same MM
 *       cover the same file range.
 *
 * Further, this function only considers current page table mappings that
 * are tracked using the folio mapcount(s).
 *
 * This function does not consider:
 *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
 *       pagecache, temporary unmapping for migration).
 *    #. If the folio is mapped differently (VM_PFNMAP).
 *    #. If hugetlb page table sharing applies. Callers might want to check
 *       hugetlb_pmd_shared().
 *
 * Return: Whether the folio is estimated to be mapped into more than one MM.
 */
static inline bool folio_maybe_mapped_shared(struct folio *folio)
{
        const int nr_mappings = folio_mapcount(folio);
        const bool multiply_mapped = nr_mappings > 1;

        /*
         * Small folios and hugetlb folios are not partially mappable, so
         * the mapcount alone decides.
         */
        if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
                return multiply_mapped;

        /*
         * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ...
         * simply assume "mapped shared", nobody should really care
         * about this for arbitrary kernel allocations.
         */
        if (!IS_ENABLED(CONFIG_MM_ID))
                return true;

        /*
         * At most one mapping means "mapped exclusively" regardless of the
         * folio flag: handling that here is easier than on the RMAP hot
         * path.  Otherwise the shared bit in _mm_ids decides.
         */
        return multiply_mapped &&
               test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
}

/**
 * folio_expected_ref_count - calculate the expected folio refcount
 * @folio: the folio
 *
 * Calculate the expected folio refcount, taking references from the pagecache,
 * swapcache, PG_private and page table mappings into account. Useful in
 * combination with folio_ref_count() to detect unexpected references (e.g.,
 * GUP or other temporary references).
 *
 * Does currently not consider references from the LRU cache. If the folio
 * was isolated from the LRU (which is the case during migration or split),
 * the LRU cache does not apply.
 *
 * Calling this function on an unmapped folio -- !folio_mapped() -- that is
 * locked will return a stable result.
 *
 * Calling this function on a mapped folio will not result in a stable result,
 * because nothing stops additional page table mappings from coming (e.g.,
 * fork()) or going (e.g., munmap()).
 *
 * Calling this function without the folio lock will also not result in a
 * stable result: for example, the folio might get dropped from the swapcache
 * concurrently.
 *
 * However, even when called without the folio lock or on a mapped folio,
 * this function can be used to detect unexpected references early (for example,
 * if it makes sense to even lock the folio and unmap it).
 *
 * The caller must add any reference (e.g., from folio_try_get()) it might be
 * holding itself to the result.
 *
 * Returns the expected folio refcount.
 */
static inline int folio_expected_ref_count(const struct folio *folio)
{
        const int order = folio_order(folio);
        int ref_count = 0;

        /* Typed pages other than hugetlb are not expected here. */
        if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio)))
                return 0;

        if (folio_test_anon(folio)) {
                /* One reference per page from the swapcache. */
                ref_count += folio_test_swapcache(folio) << order;
        } else {
                /* One reference per page from the pagecache. */
                ref_count += !!folio->mapping << order;
                /*
                 * One reference from PG_private or PG_private_2: either
                 * flag, or both together, accounts for a single reference.
                 * Testing PG_private alone would undercount folios that
                 * only have PG_private_2 set.
                 */
                ref_count += folio_has_private(folio);
        }

        /* One reference per page table mapping. */
        return ref_count + folio_mapcount(folio);
}

#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
/*
 * Default used when the architecture does not define
 * HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE: does nothing and returns 0.
 */
static inline int arch_make_folio_accessible(struct folio *folio)
{
        return 0;
}
#endif

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#if defined(WANT_PAGE_VIRTUAL)
/* Return the virtual address recorded in the page itself. */
static inline void *page_address(const struct page *page)
{
        return page->virtual;
}
/* Record @address as the virtual address of @page. */
static inline void set_page_address(struct page *page, void *address)
{
        page->virtual = address;
}
/* Nothing to initialise in this configuration. */
#define page_address_init()  do { } while(0)
#endif

#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(const struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

/*
 * Translate @page to a virtual address using page_to_virt().  This is what
 * page_address() resolves to when neither HASHED_PAGE_VIRTUAL nor
 * WANT_PAGE_VIRTUAL is defined.
 */
static __always_inline void *lowmem_page_address(const struct page *page)
{
        return page_to_virt(page);
}

#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address)  do { } while(0)
#define page_address_init()  do { } while(0)
#endif

/* Virtual address of the first page of @folio, as given by page_address(). */
static inline void *folio_address(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return page_address(first);
}

/*
 * Return true only if the page has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool page_is_pfmemalloc(const struct page *page)
{
        /*
         * lru.next has bit 1 set if the page is allocated from the
         * pfmemalloc reserves.  Callers may simply overwrite it if
         * they do not need to preserve that information.
         */
        return (uintptr_t)page->lru.next & BIT(1);
}

/*
 * Return true only if the folio has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool folio_is_pfmemalloc(const struct folio *folio)
{
        /*
         * lru.next has bit 1 set if the page is allocated from the
         * pfmemalloc reserves.  Callers may simply overwrite it if
         * they do not need to preserve that information.
         */
        return (uintptr_t)folio->lru.next & BIT(1);
}

/*
 * Only to be called by the page allocator on a freshly allocated
 * page.
 */
static inline void set_page_pfmemalloc(struct page *page)
{
        /* Overwrites lru.next with just the marker bit that page_is_pfmemalloc() tests. */
        page->lru.next = (void *)BIT(1);
}

/* Drop the pfmemalloc marker by resetting lru.next to NULL. */
static inline void clear_page_pfmemalloc(struct page *page)
{
        page->lru.next = NULL;
}

/*
 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
 * Declaration only; the definition is not in this header.
 */
extern void pagefault_out_of_memory(void);

/* Offset of address @p within its page: the bits that PAGE_MASK clears. */
#define offset_in_page(p)        ((unsigned long)(p) & ~PAGE_MASK)
/*
 * Offset of address @p within @folio.  The mask form only yields an
 * offset when folio_size(folio) is a power of two.
 */
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 */
struct zap_details {
        struct folio *single_folio;        /* Locked folio to be unmapped */
        bool even_cows;                        /* Zap COWed private pages too? */
        bool reclaim_pt;                /* Need to reclaim page tables? */
        zap_flags_t zap_flags;                /* Extra flags for zapping */
};

/*
 * Bit values of type zap_flags_t.
 *
 * ZAP_FLAG_DROP_MARKER: whether to drop the pte markers, for example, the
 * uffd-wp information for file-backed memory.  This should only be
 * specified when we will completely drop the page in the mm, either by
 * truncation or unmapping of the vma.  By default, the flag is not set.
 */
#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))

#ifdef CONFIG_SCHED_MM_CID
/*
 * CONFIG_SCHED_MM_CID hooks, each taking the affected task.
 * Declarations only; the definitions are not in this header.
 */
void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit_signals(struct task_struct *t);
/* Return the mm_cid value stored in task @t. */
static inline int task_mm_cid(struct task_struct *t)
{
        return t->mm_cid;
}
#else
/* !CONFIG_SCHED_MM_CID: the hooks are empty inline functions. */
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
/* !CONFIG_SCHED_MM_CID variant: @t is unused, the current CPU id is returned. */
static inline int task_mm_cid(struct task_struct *t)
{
        /*
         * Use the processor id as a fall-back when the mm cid feature is
         * disabled. This provides functional per-cpu data structure accesses
         * in user-space, although it won't provide the memory usage benefits.
         */
        return raw_smp_processor_id();
}
#endif

/*
 * can_do_mlock() is an out-of-line function with CONFIG_MMU; without an
 * MMU it is an inline stub that always returns false.
 */
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
static inline bool can_do_mlock(void) { return false; }
#endif
/* Declarations only; parameters are unnamed here (a size and a ucounts). */
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);

/*
 * vm_normal_*() lookups for a pte, pmd or pud value found at @addr in
 * @vma, returning a folio or a page.  Declarations only; see the
 * definitions for exactly which entries yield NULL.
 */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);
struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t pud);

/*
 * Zap helpers operating on @size bytes starting at @address inside @vma.
 * zap_page_range_single() additionally takes an optional zap_details
 * block (zap_vma_pages() passes NULL).  Declarations only.
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                  unsigned long size);
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                           unsigned long size, struct zap_details *details);
/*
 * Zap the whole of @vma: the range [vm_start, vm_end) is handed to
 * zap_page_range_single() with no zap_details.
 */
static inline void zap_vma_pages(struct vm_area_struct *vma)
{
        const unsigned long start = vma->vm_start;
        const unsigned long len = vma->vm_end - start;

        zap_page_range_single(vma, start, len, NULL);
}
/* Declaration only; the definition is not in this header. */
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *start_vma, unsigned long start,
                unsigned long end, unsigned long tree_end, bool mm_wr_locked);

/* Forward declaration so the type can be named by pointer. */
struct mmu_notifier_range;

/* Page-table teardown, fork-time copy and physical-access helpers (declarations only). */
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);

/*
 * Argument/result block shared by follow_pfnmap_start() and
 * follow_pfnmap_end(): the caller fills in the inputs, the outputs are
 * reported in the same structure, and the "internals" fields belong to
 * the implementation.
 */
struct follow_pfnmap_args {
        /**
         * Inputs:
         * @vma: Pointer to @vm_area_struct struct
         * @address: the virtual address to walk
         */
        struct vm_area_struct *vma;
        unsigned long address;
        /**
         * Internals:
         *
         * The caller shouldn't touch any of these.
         */
        spinlock_t *lock;
        pte_t *ptep;
        /**
         * Outputs:
         *
         * @pfn: the PFN of the address
         * @addr_mask: address mask covering pfn
         * @pgprot: the pgprot_t of the mapping
         * @writable: whether the mapping is writable
         * @special: whether the mapping is a special mapping (real PFN maps)
         */
        unsigned long pfn;
        unsigned long addr_mask;
        pgprot_t pgprot;
        bool writable;
        bool special;
};
/* Start/end pair operating on the block above (declarations only). */
int follow_pfnmap_start(struct follow_pfnmap_args *args);
void follow_pfnmap_end(struct follow_pfnmap_args *args);

/* Page-cache truncation helpers taking an inode and file offsets (declarations only). */
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int generic_error_remove_folio(struct address_space *mapping,
                struct folio *folio);

/* Declaration only; the locking contract is described at the definition. */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                unsigned long address, struct pt_regs *regs);

#ifdef CONFIG_MMU
/*
 * CONFIG_MMU: fault handling and mapping-unmap entry points are real
 * functions, declared here.  Note that unmap_mapping_pages() takes
 * @even_cows as bool while unmap_mapping_range() takes it as int.
 */
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                  unsigned long address, unsigned int flags,
                                  struct pt_regs *regs);
extern int fixup_user_fault(struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows);
#else
/*
 * !CONFIG_MMU stub: reaching it is treated as a bug.  The return statement
 * after BUG() supplies a value (VM_FAULT_SIGBUS) for the non-void function.
 */
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                         unsigned long address, unsigned int flags,
                                         struct pt_regs *regs)
{
        /* should never happen if there's no MMU */
        BUG();
        return VM_FAULT_SIGBUS;
}
/*
 * !CONFIG_MMU stub: reaching it is treated as a bug.  The return statement
 * after BUG() supplies a value (-EFAULT) for the non-void function.
 */
static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
                unsigned int fault_flags, bool *unlocked)
{
        /* should never happen if there's no MMU */
        BUG();
        return -EFAULT;
}
/* !CONFIG_MMU: unmapping a mapping range is a no-op. */
static inline void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif

/*
 * Convenience wrapper: unmap_mapping_range() over the given hole with
 * even_cows set to 0.
 */
static inline void unmap_shared_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen)
{
        const int even_cows = 0;

        unmap_mapping_range(mapping, holebegin, holelen, even_cows);
}

/*
 * Forward declaration: vma_lookup() is used by an inline helper below
 * before its definition appears.
 */
static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
                                                unsigned long addr);

/*
 * Accessors for another task's / mm's memory: @len bytes at @addr via
 * @buf, with GUP flags.  Declarations only; transfer direction and return
 * value are defined by the implementations.
 */
extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);

/* Only declared when CONFIG_BPF_SYSCALL is enabled. */
#ifdef CONFIG_BPF_SYSCALL
extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
                              void *buf, int len, unsigned int gup_flags);
#endif

/*
 * Remote (explicit @mm) get/pin variants with identical signatures.
 * Declarations only; get_user_page_vma_remote() in this header treats a
 * negative return from get_user_pages_remote() as an error code.
 */
long get_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked);

/*
 * Fetch exactly one page at @addr in @mm together with the VMA that
 * covers it.  FOLL_NOWAIT is not supported and is rejected with a warning.
 *
 * Returns the page and stores the VMA in *@vmap on success.  On failure
 * an ERR_PTR() is returned and *@vmap is left untouched: the error from
 * get_user_pages_remote() is passed through, and -EINVAL is used for the
 * FOLL_NOWAIT and "no VMA found" cases (the page reference is dropped in
 * the latter).
 */
static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    int gup_flags,
                                                    struct vm_area_struct **vmap)
{
        struct vm_area_struct *found;
        struct page *pinned;
        int ret;

        if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
                return ERR_PTR(-EINVAL);

        ret = get_user_pages_remote(mm, addr, 1, gup_flags, &pinned, NULL);
        if (ret < 0)
                return ERR_PTR(ret);

        found = vma_lookup(mm, addr);
        if (!WARN_ON_ONCE(!found)) {
                *vmap = found;
                return pinned;
        }

        /* A page without a VMA is unexpected: give the reference back. */
        put_page(pinned);
        return ERR_PTR(-EINVAL);
}

/*
 * get/pin user page entry points without an explicit mm argument.
 * Note the parameter order differs between the plain and the _unlocked
 * variants (@gup_flags comes last in the latter).  Declarations only.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
                      struct folio **folios, unsigned int max_folios,
                      pgoff_t *offset);
int folio_add_pins(struct folio *folio, unsigned int pins);

/* The _fast variants take @nr_pages as int and return int. */
int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
void folio_add_pin(struct folio *folio);

/* Locked-memory accounting; @inc selects the direction (declarations only). */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        const struct task_struct *task, bool bypass_rlim);

struct kvec;
struct page *get_dump_page(unsigned long addr, int *locked);

/* Dirty-marking helpers; note set_page_dirty_lock() returns int, the others bool. */
bool folio_mark_dirty(struct folio *folio);
bool folio_mark_dirty_lock(struct folio *folio);
bool set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);

/* Declaration only; @buffer / @buflen describe the caller's output buffer. */
int get_cmdline(struct task_struct *task, char *buffer, int buflen);

/*
 * Flags used by change_protection().  For now we make it a bitmap so
 * that we can pass in multiple flags just like parameters.  However,
 * for now all the callers only use one of the flags at a time.
 */
/*
 * Whether we should manually check if we can map individual PTEs writable,
 * because something (e.g., COW, uffd-wp) blocks that from happening for all
 * PTEs automatically in a writable mapping.
 */
#define  MM_CP_TRY_CHANGE_WRITABLE           (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define  MM_CP_PROT_NUMA                   (1UL << 1)
/* Whether this change is for userfaultfd write-protecting or resolving it */
#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
/* Mask covering both uffd-wp bits above */
#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                            MM_CP_UFFD_WP_RESOLVE)

/*
 * Protection-change entry points (declarations only).  The @cp_flags
 * argument of change_protection() takes the MM_CP_* bits.
 */
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
extern long change_protection(struct mmu_gather *tlb,
                              struct vm_area_struct *vma, unsigned long start,
                              unsigned long end, unsigned long cp_flags);
extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
          struct vm_area_struct *vma, struct vm_area_struct **pprev,
          unsigned long start, unsigned long end, vm_flags_t newflags);

/*
 * Fast-path-only variant: it doesn't attempt to fault pages in and may
 * therefore return short (fewer pages than @nr_pages).
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages);

/*
 * Single-page form of get_user_pages_fast_only(): true only when exactly
 * one page was obtained for @addr and stored through @pagep.
 */
static inline bool get_user_page_fast_only(unsigned long addr,
                        unsigned int gup_flags, struct page **pagep)
{
        const int nr_got = get_user_pages_fast_only(addr, 1, gup_flags, pagep);

        return nr_got == 1;
}
/*
 * per-process(per-mm_struct) statistics.
 */
/* Read rss_stat[@member] of @mm with percpu_counter_read_positive(). */
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
        struct percpu_counter *counter = &mm->rss_stat[member];

        return percpu_counter_read_positive(counter);
}

/* Read rss_stat[@member] of @mm with percpu_counter_sum_positive(). */
static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
{
        struct percpu_counter *counter = &mm->rss_stat[member];

        return percpu_counter_sum_positive(counter);
}

/* Called by the counter update helpers after each change (declaration only). */
void mm_trace_rss_stat(struct mm_struct *mm, int member);

/* Add @value to rss_stat[@member] of @mm, then report it via mm_trace_rss_stat(). */
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
        struct percpu_counter *counter = &mm->rss_stat[member];

        percpu_counter_add(counter, value);
        mm_trace_rss_stat(mm, member);
}

/* Increment rss_stat[@member] of @mm, then report it via mm_trace_rss_stat(). */
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
        struct percpu_counter *counter = &mm->rss_stat[member];

        percpu_counter_inc(counter);
        mm_trace_rss_stat(mm, member);
}

/* Decrement rss_stat[@member] of @mm, then report it via mm_trace_rss_stat(). */
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
        struct percpu_counter *counter = &mm->rss_stat[member];

        percpu_counter_dec(counter);
        mm_trace_rss_stat(mm, member);
}

/*
 * Counter index for a folio already known not to be anon:
 * MM_SHMEMPAGES when it is swap-backed, MM_FILEPAGES otherwise.
 */
static inline int mm_counter_file(struct folio *folio)
{
        return folio_test_swapbacked(folio) ? MM_SHMEMPAGES : MM_FILEPAGES;
}

/*
 * Counter index for any folio: MM_ANONPAGES for anon folios, otherwise
 * whatever mm_counter_file() selects.
 */
static inline int mm_counter(struct folio *folio)
{
        if (!folio_test_anon(folio))
                return mm_counter_file(folio);

        return MM_ANONPAGES;
}

/* RSS of @mm: the sum of its file, anon and shmem page counters. */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
        unsigned long rss = get_mm_counter(mm, MM_FILEPAGES);

        rss += get_mm_counter(mm, MM_ANONPAGES);
        rss += get_mm_counter(mm, MM_SHMEMPAGES);

        return rss;
}

/* Larger of the recorded RSS high-water mark and the current RSS. */
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss_now = get_mm_rss(mm);

        return max(mm->hiwater_rss, rss_now);
}

/* Larger of the recorded VM high-water mark and the current total_vm. */
static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_vm;
        unsigned long current_total = mm->total_vm;

        return max(recorded, current_total);
}

/*
 * Raise mm->hiwater_rss to the current RSS if the current value is higher.
 * Both the read and the write of hiwater_rss are wrapped in data_race(),
 * i.e. they are deliberately performed without synchronization here.
 */
static inline void update_hiwater_rss(struct mm_struct *mm)
{
        unsigned long _rss = get_mm_rss(mm);

        if (data_race(mm->hiwater_rss) < _rss)
                data_race(mm->hiwater_rss = _rss);
}

/* Raise mm->hiwater_vm to mm->total_vm when the latter is larger. */
static inline void update_hiwater_vm(struct mm_struct *mm)
{
        if (mm->hiwater_vm >= mm->total_vm)
                return;

        mm->hiwater_vm = mm->total_vm;
}

/* Restart RSS high-water tracking from the current RSS. */
static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss_now = get_mm_rss(mm);

        mm->hiwater_rss = rss_now;
}

/* Raise *@maxrss to @mm's RSS high-water mark if that mark is larger. */
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
                                         struct mm_struct *mm)
{
        unsigned long peak = get_mm_hiwater_rss(mm);

        if (peak <= *maxrss)
                return;

        *maxrss = peak;
}

/*
 * Fallbacks when CONFIG_ARCH_HAS_PTE_SPECIAL is not set: no pte is ever
 * reported as special and marking one special returns it unchanged.
 */
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
        return 0;
}

static inline pte_t pte_mkspecial(pte_t pte)
{
        return pte;
}
#endif

/*
 * Fallbacks when CONFIG_ARCH_SUPPORTS_PMD_PFNMAP is not set: no pmd is
 * ever reported as special and marking one special returns it unchanged.
 */
#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
static inline bool pmd_special(pmd_t pmd)
{
        return false;
}

static inline pmd_t pmd_mkspecial(pmd_t pmd)
{
        return pmd;
}
#endif        /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */

/*
 * Fallbacks when CONFIG_ARCH_SUPPORTS_PUD_PFNMAP is not set: no pud is
 * ever reported as special and marking one special returns it unchanged.
 */
#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
static inline bool pud_special(pud_t pud)
{
        return false;
}

static inline pud_t pud_mkspecial(pud_t pud)
{
        return pud;
}
#endif        /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */

extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                               spinlock_t **ptl);
/*
 * Thin wrapper returning whatever __get_locked_pte() returns.  The call is
 * placed inside __cond_lock() on *@ptl -- presumably a conditional-lock
 * annotation for static checkers; confirm against its definition.
 */
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                                    spinlock_t **ptl)
{
        pte_t *ptep;
        __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
        return ptep;
}

/*
 * With __PAGETABLE_P4D_FOLDED the allocation helper is an inline stub
 * that always returns 0; otherwise it is an out-of-line function.
 */
#ifdef __PAGETABLE_P4D_FOLDED
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                                                unsigned long address)
{
        return 0;
}
#else
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif

#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
/*
 * PUD level folded or no MMU: __pud_alloc() always returns 0 and the PUD
 * accounting helpers do nothing.
 */
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                                                unsigned long address)
{
        return 0;
}
static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
static inline void mm_dec_nr_puds(struct mm_struct *mm) {}

#else
/* Out-of-line in this configuration; pud_alloc() treats non-zero as failure. */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);

/*
 * Account one PUD table (PTRS_PER_PUD entries) in mm->pgtables_bytes,
 * unless the PUD level is folded for this mm.
 */
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_add(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}

/*
 * Remove one PUD table (PTRS_PER_PUD entries) from mm->pgtables_bytes,
 * unless the PUD level is folded for this mm.
 */
static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}
#endif

#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
/*
 * PMD level folded or no MMU: __pmd_alloc() always returns 0 and the PMD
 * accounting helpers do nothing.
 */
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
                                                unsigned long address)
{
        return 0;
}

static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}

#else
/* Out-of-line in this configuration; pmd_alloc() treats non-zero as failure. */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);

/*
 * Account one PMD table (PTRS_PER_PMD entries) in mm->pgtables_bytes,
 * unless the PMD level is folded for this mm.
 */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}

/*
 * Remove one PMD table (PTRS_PER_PMD entries) from mm->pgtables_bytes,
 * unless the PMD level is folded for this mm.
 */
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}
#endif

#ifdef CONFIG_MMU
/* Start @mm's page-table byte accounting at zero. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->pgtables_bytes, 0);
}

/* Current value of @mm's page-table byte counter. */
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        const long bytes = atomic_long_read(&mm->pgtables_bytes);

        return bytes;
}

/* Account one PTE table (PTRS_PER_PTE entries) in mm->pgtables_bytes. */
static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
        const long table_bytes = PTRS_PER_PTE * sizeof(pte_t);

        atomic_long_add(table_bytes, &mm->pgtables_bytes);
}

/* Remove one PTE table (PTRS_PER_PTE entries) from mm->pgtables_bytes. */
static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
        const long table_bytes = PTRS_PER_PTE * sizeof(pte_t);

        atomic_long_sub(table_bytes, &mm->pgtables_bytes);
}
#else

/*
 * !CONFIG_MMU: there is no page-table accounting; the byte count reads
 * as 0 and the update helpers are empty.
 */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return 0;
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
#endif

/*
 * PTE table allocation back-ends (declarations only).  The pte_alloc*()
 * macros in this header treat a non-zero return as failure.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
int __pte_alloc_kernel(pmd_t *pmd);

#if defined(CONFIG_MMU)

/*
 * Return the p4d entry for @address under @pgd.  __p4d_alloc() is invoked
 * first when *@pgd is none; NULL is returned if it reports failure.
 */
static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                unsigned long address)
{
        if (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address))
                return NULL;

        return p4d_offset(pgd, address);
}

/*
 * Return the pud entry for @address under @p4d.  __pud_alloc() is invoked
 * first when *@p4d is none; NULL is returned if it reports failure.
 */
static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                unsigned long address)
{
        if (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address))
                return NULL;

        return pud_offset(p4d, address);
}

/*
 * Return the pmd entry for @address under @pud.  __pmd_alloc() is invoked
 * first when *@pud is none; NULL is returned if it reports failure.
 */
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
                return NULL;

        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

/* Bit numbers for ptdesc flags; PT_reserved reuses the PG_reserved bit. */
enum pt_flags {
        PT_reserved = PG_reserved,
        /* High bits are used for zone/node/section */
};

/* Page table descriptor of the page that contains kernel virtual address @x. */
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
        struct page *page = virt_to_page(x);

        return page_ptdesc(page);
}

/**
 * ptdesc_address - Virtual address of page table.
 * @pt: Page table descriptor.
 *
 * The descriptor is viewed as a folio and folio_address() is applied.
 *
 * Return: The first byte of the page table described by @pt.
 */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
        const struct folio *folio = ptdesc_folio(pt);

        return folio_address(folio);
}

/* Test the PT_reserved bit in the descriptor's flags word. */
static inline bool pagetable_is_reserved(struct ptdesc *pt)
{
        return test_bit(PT_reserved, &pt->pt_flags.f);
}

/**
 * pagetable_alloc - Allocate pagetables
 * @gfp:    GFP flags
 * @order:  desired pagetable order
 *
 * Allocates memory for page tables together with the page table
 * descriptor describing it.  __GFP_COMP is always added to @gfp.
 *
 * Return: The ptdesc describing the allocated page tables.
 */
static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
        const gfp_t comp_gfp = gfp | __GFP_COMP;
        struct page *table = alloc_pages_noprof(comp_gfp, order);

        return page_ptdesc(table);
}
/* Public name: routes the call through alloc_hooks(). */
#define pagetable_alloc(...)        alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))

/**
 * pagetable_free - Free pagetables
 * @pt:        The page table descriptor
 *
 * Frees the memory of all page tables described by a page table
 * descriptor as well as the memory for the descriptor itself.  The
 * order passed to __free_pages() is read back with compound_order().
 */
static inline void pagetable_free(struct ptdesc *pt)
{
        struct page *table = ptdesc_page(pt);
        unsigned int order = compound_order(table);

        __free_pages(table, order);
}

#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
#if ALLOC_SPLIT_PTLOCKS
/*
 * ALLOC_SPLIT_PTLOCKS: allocation and freeing of the lock are out-of-line,
 * and ptdesc->ptl is itself the pointer handed out by ptlock_ptr().
 */
void __init ptlock_cache_init(void);
bool ptlock_alloc(struct ptdesc *ptdesc);
void ptlock_free(struct ptdesc *ptdesc);

static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
        return ptdesc->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
/*
 * !ALLOC_SPLIT_PTLOCKS: nothing is allocated (ptlock_alloc() always
 * succeeds, cache init and free are empty) and ptlock_ptr() returns the
 * address of the ptl member inside the descriptor.
 */
static inline void ptlock_cache_init(void)
{
}

static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
        return true;
}

static inline void ptlock_free(struct ptdesc *ptdesc)
{
}

static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
        return &ptdesc->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */

/*
 * Split PTE locks: the lock belongs to the descriptor of the page that
 * *@pmd points to.  @mm is not used in this variant.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *ptdesc = page_ptdesc(pmd_page(*pmd));

        return ptlock_ptr(ptdesc);
}

/*
 * Split PTE locks: find the lock from a pte pointer through the
 * descriptor of the page that holds it.  @mm is not used in this variant.
 * The build fails if CONFIG_HIGHPTE is enabled or if a PTE table could
 * exceed one page.
 */
static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
{
        BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE));
        BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE);
        return ptlock_ptr(virt_to_ptdesc(pte));
}

/*
 * Set up the split page table lock of @ptdesc: allocate it via
 * ptlock_alloc() and initialise the spinlock.  Returns false if the
 * allocation fails, true otherwise.
 */
static inline bool ptlock_init(struct ptdesc *ptdesc)
{
        /*
         * prep_new_page() initializes page->private (and therefore page->ptl)
         * with 0. Make sure nobody has started using it in the meantime.
         *
         * That can happen if an arch tries to use slab for page table
         * allocation: slab code uses page->slab_cache, which shares storage
         * with page->ptl.
         */
        VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
        if (!ptlock_alloc(ptdesc))
                return false;
        spin_lock_init(ptlock_ptr(ptdesc));
        return true;
}

#else        /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 * Both lookups therefore ignore their second argument, and the ptlock
 * setup/teardown helpers are no-ops (ptlock_init() always succeeds).
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}
static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
{
        return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */

/* Number of pages behind @ptdesc, as reported by compound_nr(). */
static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc)
{
        const struct page *table = ptdesc_page(ptdesc);

        return compound_nr(table);
}

/*
 * Common constructor step: mark the backing page as a page table and add
 * its page count to the NR_PAGETABLE statistic of the node encoded in
 * the descriptor's flags.
 */
static inline void __pagetable_ctor(struct ptdesc *ptdesc)
{
        pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags));

        __SetPageTable(ptdesc_page(ptdesc));
        mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc));
}

/*
 * Undo the constructor: free the split lock (if any), clear the
 * page-table mark and subtract the page count from the node's
 * NR_PAGETABLE statistic.  The memory itself is not freed here.
 */
static inline void pagetable_dtor(struct ptdesc *ptdesc)
{
        pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags));

        ptlock_free(ptdesc);
        __ClearPageTable(ptdesc_page(ptdesc));
        mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc));
}

/* Run pagetable_dtor() and then release the memory with pagetable_free(). */
static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
{
        pagetable_dtor(ptdesc);
        pagetable_free(ptdesc);
}

static inline bool pagetable_pte_ctor(struct mm_struct *mm,
                                      struct ptdesc *ptdesc)
{
        if (mm != &init_mm && !ptlock_init(ptdesc))
                return false;
        __pagetable_ctor(ptdesc);
        return true;
}

pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
/*
 * Wrapper around ___pte_offset_map() that places the call inside
 * __cond_lock(RCU, ...) -- presumably a conditional-lock annotation for
 * static checkers; confirm against its definition.
 */
static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr,
                        pmd_t *pmdvalp)
{
        pte_t *pte;

        __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp));
        return pte;
}
/* Same as __pte_offset_map() with a NULL @pmdvalp. */
static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
{
        return __pte_offset_map(pmd, addr, NULL);
}

pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp);
/*
 * Wrapper around __pte_offset_map_lock().  The call is nested inside
 * __cond_lock() for both RCU and *@ptlp -- presumably conditional-lock
 * annotations for static checkers; confirm against the definition.
 */
static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp)
{
        pte_t *pte;

        __cond_lock(RCU, __cond_lock(*ptlp,
                        pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)));
        return pte;
}

/*
 * "nolock" mapping variants that report a lock pointer through @ptlp;
 * the _rw_ form additionally takes @pmdvalp.  Declarations only -- see
 * the definitions for the exact locking contract.
 */
pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, spinlock_t **ptlp);
pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, pmd_t *pmdvalp,
                                spinlock_t **ptlp);

/* Release @ptl first, then unmap @pte. */
#define pte_unmap_unlock(pte, ptl)        do {                \
        spin_unlock(ptl);                                \
        pte_unmap(pte);                                        \
} while (0)

/*
 * pte_alloc() is non-zero only when *@pmd was none and __pte_alloc()
 * returned non-zero, i.e. when allocation failed.
 */
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

/* NULL on allocation failure, otherwise the result of pte_offset_map(). */
#define pte_alloc_map(mm, pmd, address)                        \
        (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))

/* NULL on allocation failure, otherwise the result of pte_offset_map_lock(). */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)        \
        (pte_alloc(mm, pmd) ?                        \
                 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/* Kernel variant: uses __pte_alloc_kernel() and pte_offset_kernel(). */
#define pte_alloc_kernel(pmd, address)                        \
        ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
                NULL: pte_offset_kernel(pmd, address))

#if defined(CONFIG_SPLIT_PMD_PTLOCKS)

/*
 * Page holding the PMD table that @pmd points into: the pointer is
 * rounded down to the table size (PTRS_PER_PMD entries) and converted
 * with virt_to_page().
 */
static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
        const unsigned long table_bytes = PTRS_PER_PMD * sizeof(pmd_t);
        const unsigned long table_start = (unsigned long)pmd & ~(table_bytes - 1);

        return virt_to_page((void *)table_start);
}

/* Descriptor of the PMD table that @pmd points into. */
static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
{
        struct page *table = pmd_pgtable_page(pmd);

        return page_ptdesc(table);
}

/*
 * Split PMD locks: the lock belongs to the descriptor of the PMD table
 * containing @pmd.  @mm is not used in this variant.
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *ptdesc = pmd_ptdesc(pmd);

        return ptlock_ptr(ptdesc);
}

/*
 * Split PMD locks: initialise the lock of a PMD table.  With
 * CONFIG_TRANSPARENT_HUGEPAGE the descriptor's pmd_huge_pte field is
 * cleared first.  Returns what ptlock_init() returns.
 */
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        ptdesc->pmd_huge_pte = NULL;
#endif
        return ptlock_init(ptdesc);
}

/* Split PMD locks: pmd_huge_pte lives in the PMD table's descriptor; @mm is unused. */
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)

#else

/*
 * No split PMD locks: mm->page_table_lock is used, lock initialisation
 * always succeeds, and pmd_huge_pte is kept in the mm (@pmd is unused).
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}

static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }

#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

#endif

/*
 * Take the lock that pmd_lockptr() selects for @pmd and return it so the
 * caller can release it later.
 */
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *lock = pmd_lockptr(mm, pmd);

        spin_lock(lock);

        return lock;
}

static inline bool pagetable_pmd_ctor(struct mm_struct *mm,
                                      struct ptdesc *ptdesc)
{
        if (mm != &init_mm && !pmd_ptlock_init(ptdesc))
                return false;
        ptdesc_pmd_pts_init(ptdesc);
        __pagetable_ctor(ptdesc);
        return true;
}

/*
 * No scalability reason to split PUD locks yet, but follow the same pattern
 * as the PMD locks to make it easier if we decide to.  The VM should not be
 * considered ready to switch to split PUD locks yet; there may be places
 * which need to be converted from page_table_lock.
 *
 * Consequently @pud is unused and mm->page_table_lock is always returned.
 */
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
        return &mm->page_table_lock;
}

/*
 * Take the lock that pud_lockptr() selects for @pud and return it so the
 * caller can release it later.
 */
static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *lock = pud_lockptr(mm, pud);

        spin_lock(lock);

        return lock;
}

/*
 * Constructors for PUD, P4D and PGD level tables: each one only performs
 * the common __pagetable_ctor() step and cannot fail.
 */
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
        __pagetable_ctor(ptdesc);
}

static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc)
{
        __pagetable_ctor(ptdesc);
}

static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc)
{
        __pagetable_ctor(ptdesc);
}

/* Declarations only; the definitions are not in this header. */
extern void __init pagecache_init(void);
extern void free_initmem(void);

/*
 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
 * into the buddy system. The freed pages will be poisoned with pattern
 * "poison" if it's within range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
extern unsigned long free_reserved_area(void *start, void *end,
                                        int poison, const char *s);

/* @count is signed: mark_page_reserved() in this header passes -1. */
extern void adjust_managed_page_count(struct page *page, long count);

extern void reserve_bootmem_region(phys_addr_t start,
                                   phys_addr_t end, int nid);

/* Free the reserved page into the buddy system, so it gets managed. */
void free_reserved_page(struct page *page);

/* Set the reserved flag on @page and lower the managed page count by one. */
static inline void mark_page_reserved(struct page *page)
{
        SetPageReserved(page);
        adjust_managed_page_count(page, -1);
}

/* free_reserved_page() applied to the page behind descriptor @pt. */
static inline void free_reserved_ptdesc(struct ptdesc *pt)
{
        struct page *table = ptdesc_page(pt);

        free_reserved_page(table);
}

/*
 * Default method to free all the __init memory into the buddy system.
 * The freed pages will be poisoned with pattern "poison" if it's within
 * range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 *
 * The range handed to free_reserved_area() is delimited by the
 * __init_begin and __init_end symbols.
 */
static inline unsigned long free_initmem_default(int poison)
{
        extern char __init_begin[], __init_end[];

        return free_reserved_area(&__init_begin, &__init_end,
                                  poison, "unused kernel image (initmem)");
}

static inline unsigned long get_num_physpages(void)
{
        int nid;
        unsigned long phys_pages = 0;

        for_each_online_node(nid)
                phys_pages += node_present_pages(nid);

        return phys_pages;
}

/*
 * Using memblock node mappings, an architecture may initialise its
 * zones, allocate the backing mem_map and account for memory holes in an
 * architecture independent manner.
 *
 * An architecture is expected to register range of page frames backed by
 * physical memory with memblock_add[_node]() before calling
 * free_area_init() passing in the PFN each zone ends at. At a basic
 * usage, an architecture is expected to do something like
 *
 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
 *                                                          max_highmem_pfn};
 * for_each_valid_physical_page_range()
 *        memblock_add_node(base, size, nid, MEMBLOCK_NONE)
 * free_area_init(max_zone_pfns);
 */
/*
 * Zone/node initialisation entry points (declarations only).
 * get_pfn_range_for_nid() reports its result through the two pointer
 * arguments.
 */
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn);

/* Without CONFIG_NUMA every pfn is reported as belonging to node 0. */
#ifndef CONFIG_NUMA
static inline int early_pfn_to_nid(unsigned long pfn)
{
        return 0;
}
#else
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif

/* Initialisation hooks; declarations only. */
extern void mem_init(void);
extern void __init mmap_init(void);

extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
/*
 * Default memory report: no flags, no nodemask restriction, and zones up
 * to the last index (MAX_NR_ZONES - 1).
 */
static inline void show_mem(void)
{
        const int last_zone_idx = MAX_NR_ZONES - 1;

        __show_mem(0, NULL, last_zone_idx);
}
/* Memory statistics helpers; only declared here. */
extern long si_mem_available(void);
/* NOTE(review): presumably fill the caller-supplied struct sysinfo - confirm. */
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);

/*
 * __printf(3, 4): @fmt (argument 3) is treated as a printf-style format
 * string whose variadic arguments start at argument 4.
 */
extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);

extern void setup_per_cpu_pageset(void);

/* nommu.c: symbols declared here for other users; defined outside this header */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);

/* interval_tree.c */
/* Interval tree of struct vm_area_struct nodes, rooted at an rb_root_cached. */
void vma_interval_tree_insert(struct vm_area_struct *node,
                              struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
                              struct rb_root_cached *root);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
                                unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
                                unsigned long start, unsigned long last);

/*
 * Loop with @vma set to the result of vma_interval_tree_iter_first() and then
 * of each vma_interval_tree_iter_next(); the loop ends when that is NULL.
 * @start and @last are expanded on every iteration, so they should be free
 * of side effects.
 */
#define vma_interval_tree_foreach(vma, root, start, last)                \
        for (vma = vma_interval_tree_iter_first(root, start, last);        \
             vma; vma = vma_interval_tree_iter_next(vma, start, last))

/* The same interface for struct anon_vma_chain nodes. */
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
        struct anon_vma_chain *node, unsigned long start, unsigned long last);
/* The verify helper is only declared when CONFIG_DEBUG_VM_RB is set. */
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
#endif

/*
 * Loop with @avc set to the result of anon_vma_interval_tree_iter_first() and
 * then of each anon_vma_interval_tree_iter_next(); ends when that is NULL.
 * @start and @last are expanded on every iteration.
 */
#define anon_vma_interval_tree_foreach(avc, root, start, last)                 \
        for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
             avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))

/* mmap.c */
/* Only declared here; see mmap.c for the semantics of each helper. */
extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
                                 unsigned long addr, bool write);

/*
 * Check a data-size limit.
 *
 * Returns -ENOSPC when (@new - @start) + (@end_data - @start_data) exceeds
 * @rlim, and 0 otherwise. A limit of RLIM_INFINITY (or more) always passes.
 */
static inline int check_data_rlimit(unsigned long rlim,
                                    unsigned long new,
                                    unsigned long start,
                                    unsigned long end_data,
                                    unsigned long start_data)
{
        unsigned long first_span, second_span;

        /* An unlimited rlimit can never be exceeded. */
        if (rlim >= RLIM_INFINITY)
                return 0;

        first_span = new - start;
        second_span = end_data - start_data;

        return (first_span + second_span > rlim) ? -ENOSPC : 0;
}

/* Paired take/drop helpers operating on one mm; only declared here. */
extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);

/* Helpers around an mm's (or task's) exe file; only declared here. */
extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern struct file *get_mm_exe_file(struct mm_struct *mm);
extern struct file *get_task_exe_file(struct task_struct *task);

/* Both take an mm, a vm_flags_t value and a page count (@npages). */
extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);

/* Special mappings are described by a struct vm_special_mapping. */
extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
                                   const struct vm_special_mapping *sm);
struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   vm_flags_t vm_flags,
                                   const struct vm_special_mapping *spec);

/* Address randomisation helpers; only declared here. */
unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);

unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                    unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);

/*
 * Same as __get_unmapped_area(), with the trailing vm_flags argument fixed
 * at 0.
 */
static inline unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                  unsigned long pgoff, unsigned long flags)
{
        const vm_flags_t no_vm_flags = 0;

        return __get_unmapped_area(file, addr, len, pgoff, flags, no_vm_flags);
}

/*
 * Mapping, unmapping and madvise primitives; only declared here.
 * NOTE(review): locking requirements are not visible in this header -
 * check the implementations before calling.
 */
extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
        vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
        struct list_head *uf);
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                         unsigned long start, size_t len, struct list_head *uf,
                         bool unlock);
int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                    struct mm_struct *mm, unsigned long start,
                    unsigned long end, struct list_head *uf, bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
                     struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);

#ifndef CONFIG_MMU
/* Without CONFIG_MMU this is an empty stub. */
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#else
extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);

/* Calls __mm_populate() with ignore_errors set and discards its result. */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
        const int ignore_errors = 1;

        (void) __mm_populate(addr, len, ignore_errors);
}
#endif /* CONFIG_MMU */

/* This takes the mm semaphore itself */
/* vm_brk_flags() and vm_mmap() are __must_check: callers must use the result. */
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

/* Defined alongside the flags field below; presumably a value for it. */
#define VM_UNMAPPED_AREA_TOPDOWN 1

/* Request descriptor taken (by pointer) by vm_unmapped_area(). */
struct vm_unmapped_area_info {
        unsigned long flags;
        unsigned long length;
        unsigned long low_limit;
        unsigned long high_limit;
        unsigned long align_mask;
        unsigned long align_offset;
        unsigned long start_gap;
};

/* Takes the request through a struct vm_unmapped_area_info pointer. */
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);

/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
                                       loff_t lstart, loff_t lend);
extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);

/*
 * Gap that stack_guard_start_gap() reports for VM_GROWSDOWN VMAs and that
 * vm_end_gap() adds for VM_GROWSUP VMAs.
 */
extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                             struct vm_area_struct **pprev);

/*
 * Look up the first VMA which intersects the interval [start_addr, end_addr),
 * NULL if none.  Assumes start_addr < end_addr.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
                        unsigned long start_addr, unsigned long end_addr);

/**
 * vma_lookup() - Look up the VMA stored for an address
 * @mm: The process address space whose mm_mt tree is searched.
 * @addr: The user address used as the lookup index.
 *
 * Return: The vm_area_struct that mtree_load() finds for @addr,
 * %NULL otherwise.
 */
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *found = mtree_load(&mm->mm_mt, addr);

        return found;
}

/*
 * Gap kept below @vma: stack_guard_gap for VM_GROWSDOWN, PAGE_SIZE for
 * VM_SHADOW_STACK (VM_GROWSDOWN is checked first), otherwise none.
 */
static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma)
{
        unsigned long gap = 0;

        if (vma->vm_flags & VM_GROWSDOWN)
                gap = stack_guard_gap;
        /* See reasoning around the VM_SHADOW_STACK definition */
        else if (vma->vm_flags & VM_SHADOW_STACK)
                gap = PAGE_SIZE;

        return gap;
}

/*
 * Start address of @vma lowered by stack_guard_start_gap(), clamped to 0
 * when the subtraction would wrap below address 0.
 */
static inline unsigned long vm_start_gap(const struct vm_area_struct *vma)
{
        const unsigned long guard = stack_guard_start_gap(vma);
        const unsigned long start = vma->vm_start;

        /* start - guard underflows exactly when guard > start. */
        return (guard > start) ? 0 : start - guard;
}

/*
 * End address of @vma. For VM_GROWSUP VMAs stack_guard_gap is added on top,
 * and the result is -PAGE_SIZE if that addition wraps.
 */
static inline unsigned long vm_end_gap(const struct vm_area_struct *vma)
{
        const unsigned long end = vma->vm_end;
        unsigned long padded;

        if (!(vma->vm_flags & VM_GROWSUP))
                return end;

        padded = end + stack_guard_gap;
        if (padded < end)
                padded = -PAGE_SIZE;

        return padded;
}

/* Size of @vma (vm_end - vm_start) expressed in pages. */
static inline unsigned long vma_pages(const struct vm_area_struct *vma)
{
        const unsigned long nr_bytes = vma->vm_end - vma->vm_start;

        return nr_bytes >> PAGE_SHIFT;
}

/*
 * Look up the VMA at @vm_start and return it only if it spans exactly
 * [@vm_start, @vm_end); NULL otherwise.
 */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
                                unsigned long vm_start, unsigned long vm_end)
{
        struct vm_area_struct *candidate = vma_lookup(mm, vm_start);

        if (!candidate)
                return NULL;

        if (candidate->vm_start == vm_start && candidate->vm_end == vm_end)
                return candidate;

        return NULL;
}

/*
 * True when @vma is non-NULL and [@start, @end) lies within
 * [vma->vm_start, vma->vm_end].
 */
static inline bool range_in_vma(const struct vm_area_struct *vma,
                                unsigned long start, unsigned long end)
{
        if (!vma)
                return false;

        if (start < vma->vm_start)
                return false;

        return end <= vma->vm_end;
}

#ifndef CONFIG_MMU
/* Without CONFIG_MMU the protection value is always __pgprot(0). */
static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
        return __pgprot(0);
}

/* Store vm_get_page_prot(vma->vm_flags) into vma->vm_page_prot. */
static inline void vma_set_page_prot(struct vm_area_struct *vma)
{
        const pgprot_t prot = vm_get_page_prot(vma->vm_flags);

        vma->vm_page_prot = prot;
}
#else
pgprot_t vm_get_page_prot(vm_flags_t vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#endif /* CONFIG_MMU */

void vma_set_file(struct vm_area_struct *vma, struct file *file);

/* Only declared when CONFIG_NUMA_BALANCING is set. */
#ifdef CONFIG_NUMA_BALANCING
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
#endif

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
                unsigned long addr);
/*
 * Helpers that map PFNs or pages into a VMA. The int-returning ones are
 * consumed as 0 / negative errno by vmf_insert_page() below.
 */
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                        unsigned long pfn, unsigned long size, pgprot_t);
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
/* Variants that return a vm_fault_t code directly. */
vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
                        bool write);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, unsigned long pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);

/*
 * Call vm_insert_page() and translate its result into a fault code:
 * -ENOMEM gives VM_FAULT_OOM, any other negative value except -EBUSY gives
 * VM_FAULT_SIGBUS, everything else (including -EBUSY) VM_FAULT_NOPAGE.
 */
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
                                unsigned long addr, struct page *page)
{
        const int err = vm_insert_page(vma, addr, page);

        switch (err) {
        case -ENOMEM:
                return VM_FAULT_OOM;
        case -EBUSY:
                return VM_FAULT_NOPAGE;
        default:
                break;
        }

        if (err < 0)
                return VM_FAULT_SIGBUS;

        return VM_FAULT_NOPAGE;
}

#ifndef io_remap_pfn_range
/*
 * Default used when no io_remap_pfn_range macro is defined: remap_pfn_range()
 * with the protection passed through pgprot_decrypted().
 */
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
                                     unsigned long addr, unsigned long pfn,
                                     unsigned long size, pgprot_t prot)
{
        const pgprot_t decrypted = pgprot_decrypted(prot);

        return remap_pfn_range(vma, addr, pfn, size, decrypted);
}
#endif /* !io_remap_pfn_range */

/*
 * Map an errno to a fault code: -ENOMEM gives VM_FAULT_OOM, -EHWPOISON gives
 * VM_FAULT_HWPOISON, anything else VM_FAULT_SIGBUS.
 */
static inline vm_fault_t vmf_error(int err)
{
        switch (err) {
        case -ENOMEM:
                return VM_FAULT_OOM;
        case -EHWPOISON:
                return VM_FAULT_HWPOISON;
        default:
                return VM_FAULT_SIGBUS;
        }
}

/*
 * Convert errno to return value for ->page_mkwrite() calls:
 * 0 gives VM_FAULT_LOCKED, -EFAULT/-EAGAIN give VM_FAULT_NOPAGE, -ENOMEM
 * gives VM_FAULT_OOM and every other value VM_FAULT_SIGBUS.
 *
 * This should eventually be merged with vmf_error() above, but will need a
 * careful audit of all vmf_error() callers.
 */
static inline vm_fault_t vmf_fs_error(int err)
{
        switch (err) {
        case 0:
                return VM_FAULT_LOCKED;
        case -EFAULT:
        case -EAGAIN:
                return VM_FAULT_NOPAGE;
        case -ENOMEM:
                return VM_FAULT_OOM;
        default:
                /* -ENOSPC, -EDQUOT, -EIO ... */
                return VM_FAULT_SIGBUS;
        }
}

/*
 * Translate fault bits into an errno, checked in this order: OOM gives
 * -ENOMEM; a HWPOISON bit gives -EHWPOISON if FOLL_HWPOISON is set in
 * @foll_flags, else -EFAULT; SIGBUS/SIGSEGV give -EFAULT. Otherwise 0.
 */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
        if (vm_fault & VM_FAULT_OOM)
                return -ENOMEM;

        if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
                if (foll_flags & FOLL_HWPOISON)
                        return -EHWPOISON;
                return -EFAULT;
        }

        if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                return -EFAULT;

        return 0;
}

/*
 * Indicates whether GUP can follow a PROT_NONE mapped page, or whether
 * a (NUMA hinting) fault is required.
 *
 * Callers that do not set FOLL_HONOR_NUMA_FAULT can always follow, so there
 * is no need to work out whether a NUMA hinting fault would be triggered.
 *
 * Otherwise only inaccessible (PROT_NONE) VMAs can be followed: NUMA hinting
 * faults don't apply there, and requiring a fault would mean that FOLL_FORCE
 * cannot make any progress, because handle_mm_fault() refuses to process
 * NUMA hinting faults in inaccessible VMAs.
 */
static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma,
                                           unsigned int flags)
{
        return !(flags & FOLL_HONOR_NUMA_FAULT) || !vma_is_accessible(vma);
}

/*
 * Callback type taking a PTE pointer, an address and an opaque data pointer,
 * and returning int. Both functions below accept one together with @data.
 */
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
                               unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);

#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
extern bool _page_poisoning_enabled_early;
DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);
/* Reads the plain boolean _page_poisoning_enabled_early (no static key). */
static inline bool page_poisoning_enabled(void)
{
        return _page_poisoning_enabled_early;
}
/*
 * For use in fast paths after mem_debugging_and_hardening_init() has run, or
 * when a false negative result is not harmful when called too early.
 */
static inline bool page_poisoning_enabled_static(void)
{
        return static_branch_unlikely(&_page_poisoning_enabled);
}
/* Poison @numpages pages starting at @page if the static key is enabled. */
static inline void kernel_poison_pages(struct page *page, int numpages)
{
        if (page_poisoning_enabled_static())
                __kernel_poison_pages(page, numpages);
}
/* Unpoison @numpages pages starting at @page if the static key is enabled. */
static inline void kernel_unpoison_pages(struct page *page, int numpages)
{
        if (page_poisoning_enabled_static())
                __kernel_unpoison_pages(page, numpages);
}
#else
/* CONFIG_PAGE_POISONING is off: queries report false, the rest are no-ops. */
static inline bool page_poisoning_enabled(void) { return false; }
static inline bool page_poisoning_enabled_static(void) { return false; }
static inline void __kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
#endif

DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);

/*
 * True when the init_on_alloc static key is enabled, or when @flags carries
 * __GFP_ZERO.
 */
static inline bool want_init_on_alloc(gfp_t flags)
{
        const bool zero_requested = flags & __GFP_ZERO;

        if (!static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                 &init_on_alloc))
                return zero_requested;

        return true;
}

DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);

/* Reports the state of the init_on_free static key. */
static inline bool want_init_on_free(void)
{
        const bool enabled = static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
                                                 &init_on_free);

        return enabled;
}

extern bool _debug_pagealloc_enabled_early;
DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);

/*
 * False unless CONFIG_DEBUG_PAGEALLOC is enabled; otherwise the value of
 * the plain boolean _debug_pagealloc_enabled_early.
 */
static inline bool debug_pagealloc_enabled(void)
{
        if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
                return false;

        return _debug_pagealloc_enabled_early;
}

/*
 * Static-key flavour of debug_pagealloc_enabled().
 *
 * For use in fast paths after mem_debugging_and_hardening_init() has run,
 * or when a false negative result is not harmful when called too early.
 */
static inline bool debug_pagealloc_enabled_static(void)
{
        return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
               static_branch_unlikely(&_debug_pagealloc_enabled);
}

/*
 * To support DEBUG_PAGEALLOC architecture must ensure that
 * __kernel_map_pages() never fails
 */
extern void __kernel_map_pages(struct page *page, int numpages, int enable);

#ifndef CONFIG_DEBUG_PAGEALLOC
/* CONFIG_DEBUG_PAGEALLOC is off: no-ops that report "disabled". */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
}

static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
}

static inline unsigned int debug_guardpage_minorder(void)
{
        return 0;
}

static inline bool debug_guardpage_enabled(void)
{
        return false;
}

static inline bool page_is_guard(const struct page *page)
{
        return false;
}

static inline bool set_page_guard(struct zone *zone, struct page *page,
                                  unsigned int order)
{
        return false;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
                                    unsigned int order)
{
}
#else /* CONFIG_DEBUG_PAGEALLOC */
/* Call __kernel_map_pages() with enable=1 if debug_pagealloc is active. */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
        if (!debug_pagealloc_enabled_static())
                return;

        __kernel_map_pages(page, numpages, 1);
}

/* Call __kernel_map_pages() with enable=0 if debug_pagealloc is active. */
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
        if (!debug_pagealloc_enabled_static())
                return;

        __kernel_map_pages(page, numpages, 0);
}

extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

/* Accessor for _debug_guardpage_minorder. */
static inline unsigned int debug_guardpage_minorder(void)
{
        return _debug_guardpage_minorder;
}

/* Reports the state of the _debug_guardpage_enabled static key. */
static inline bool debug_guardpage_enabled(void)
{
        return static_branch_unlikely(&_debug_guardpage_enabled);
}

/* PageGuard(@page), but always false while guard pages are disabled. */
static inline bool page_is_guard(const struct page *page)
{
        return debug_guardpage_enabled() && PageGuard(page);
}

bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);

/* Forward to __set_page_guard(); false while guard pages are disabled. */
static inline bool set_page_guard(struct zone *zone, struct page *page,
                                  unsigned int order)
{
        if (debug_guardpage_enabled())
                return __set_page_guard(zone, page, order);

        return false;
}

void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);

/* Forward to __clear_page_guard(); nothing while guard pages are disabled. */
static inline void clear_page_guard(struct zone *zone, struct page *page,
                                    unsigned int order)
{
        if (debug_guardpage_enabled())
                __clear_page_guard(zone, page, order);
}
#endif /* CONFIG_DEBUG_PAGEALLOC */

#ifndef __HAVE_ARCH_GATE_AREA
/* No arch gate area: there is no gate VMA and no address lies inside one. */
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
        return NULL;
}

static inline int in_gate_area_no_mm(unsigned long addr)
{
        return 0;
}

static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        return 0;
}
#else
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#endif        /* __HAVE_ARCH_GATE_AREA */

/* Only declared here. NOTE(review): presumably true when @p uses @mm - confirm. */
bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm);

void drop_slab(void);

/* A real variable with CONFIG_MMU, the constant 0 without it. */
#ifdef CONFIG_MMU
extern int randomize_va_space;
#else
#define randomize_va_space 0
#endif /* CONFIG_MMU */

const char * arch_vma_name(struct vm_area_struct *vma);

#ifndef CONFIG_MMU
/* Without CONFIG_MMU this is an empty stub. */
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#else
void print_vma_addr(char *prefix, unsigned long rip);
#endif /* CONFIG_MMU */

/*
 * Forward declarations, placed ahead of their first use so the prototypes
 * below do not rely on an earlier include having declared these structs
 * (a struct tag first seen inside a parameter list is scoped to that
 * prototype only).
 */
struct vmem_altmap;
struct dev_pagemap;

/* Sparse memory section helpers; only declared here. */
void *sparse_buffer_alloc(unsigned long size);
unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
                struct dev_pagemap *pgmap);
/* One populate helper per page-table level, each taking the parent entry. */
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                            struct vmem_altmap *altmap, unsigned long ptpfn,
                            unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
void *vmemmap_alloc_block_buf(unsigned long size, int node,
                              struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
                     unsigned long addr, unsigned long next);
int vmemmap_check_pmd(pmd_t *pmd, int node,
                      unsigned long addr, unsigned long next);
/* Range-based populate entry points over [start, end) style arguments. */
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
                         unsigned long headsize);
int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node,
                     unsigned long headsize);
void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
                          unsigned long headsize);
void vmemmap_populate_print_last(void);
/* vmemmap_free() is only declared with CONFIG_MEMORY_HOTPLUG. */
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
#endif

#ifndef CONFIG_SPARSEMEM_VMEMMAP
/* Without CONFIG_SPARSEMEM_VMEMMAP: the offset is 0 and freeing is a no-op. */
static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
        return 0;
}

static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                                    unsigned long nr_pfns)
{
}
#else
/*
 * Number of pfns from base where pfn_to_page() is valid: reserve + free of
 * @altmap, or 0 when @altmap is NULL.
 */
static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
        return altmap ? altmap->reserve + altmap->free : 0;
}

/* Subtract @nr_pfns from @altmap->alloc. @altmap must not be NULL. */
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                                    unsigned long nr_pfns)
{
        altmap->alloc -= nr_pfns;
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

#define VMEMMAP_RESERVE_NR        2

#ifndef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
/* Without CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP optimisation is never possible. */
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
                                           struct dev_pagemap *pgmap)
{
        return false;
}
#else
/*
 * Generic rule: optimisation needs a @pgmap, a power-of-two
 * sizeof(struct page), no @altmap, and more than VMEMMAP_RESERVE_NR
 * vmemmap pages.
 */
static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
                                          struct dev_pagemap *pgmap)
{
        unsigned long nr_struct_pages;
        unsigned long nr_vmemmap_pages;

        if (!pgmap)
                return false;
        if (!is_power_of_2(sizeof(struct page)))
                return false;

        nr_struct_pages = pgmap_vmemmap_nr(pgmap);
        nr_vmemmap_pages = (nr_struct_pages * sizeof(struct page)) >> PAGE_SHIFT;

        if (altmap)
                return false;

        /*
         * For vmemmap optimization with DAX we need minimum 2 vmemmap
         * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
         */
        return nr_vmemmap_pages > VMEMMAP_RESERVE_NR;
}

/*
 * If we don't have an architecture override, use the generic rule
 */
#ifndef vmemmap_can_optimize
#define vmemmap_can_optimize __vmemmap_can_optimize
#endif

#endif /* CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP */

/* Each enumerator occupies its own bit, so values can be OR-ed together. */
enum mf_flags {
        MF_COUNT_INCREASED      = 1 << 0,
        MF_ACTION_REQUIRED      = 1 << 1,
        MF_MUST_KILL            = 1 << 2,
        MF_SOFT_OFFLINE         = 1 << 3,
        MF_UNPOISON             = 1 << 4,
        MF_SW_SIMULATED         = 1 << 5,
        MF_NO_RETRY             = 1 << 6,
        MF_MEM_PRE_REMOVE       = 1 << 7,
};
/*
 * Memory failure entry points; only declared here.
 * NOTE(review): the int flags parameters presumably take enum mf_flags
 * bits - confirm against the implementations.
 */
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
                      unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
extern int unpoison_memory(unsigned long pfn);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifndef CONFIG_MEMORY_FAILURE
/* CONFIG_MEMORY_FAILURE is off: no-op stubs; the hwpoison query returns 0. */
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
}

static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared)
{
        return 0;
}

static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}

static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
{
}
#else
/*
 * Sysfs entries for memory failure handling statistics.
 */
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
#endif /* CONFIG_MEMORY_FAILURE */

/* Real implementations need both CONFIG_MEMORY_FAILURE and CONFIG_MEMORY_HOTPLUG. */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
#else
/* Otherwise both are empty stubs. */
static inline void memblk_nr_poison_inc(unsigned long pfn) {}
static inline void memblk_nr_poison_sub(unsigned long pfn, long i) {}
#endif /* CONFIG_MEMORY_FAILURE && CONFIG_MEMORY_HOTPLUG */

#ifndef arch_memory_failure
/* Fallback used when no arch_memory_failure macro is defined: -ENXIO. */
static inline int arch_memory_failure(unsigned long pfn, int flags)
{
        return -ENXIO;
}
#endif /* !arch_memory_failure */

#ifndef arch_is_platform_page
/* Fallback used when no arch_is_platform_page macro is defined: false. */
static inline bool arch_is_platform_page(u64 paddr)
{
        return false;
}
#endif /* !arch_is_platform_page */

/*
 * Error handlers for various types of pages.
 */
enum mf_result {
        /* Error: cannot be handled */
        MF_IGNORED = 0,
        /* Error: handling failed */
        MF_FAILED,
        /* Will be handled later */
        MF_DELAYED,
        /* Successfully recovered */
        MF_RECOVERED,
};

/* Sequential identifiers starting at 0; the order fixes the numeric values. */
enum mf_action_page_type {
        MF_MSG_KERNEL = 0,
        MF_MSG_KERNEL_HIGH_ORDER,
        MF_MSG_DIFFERENT_COMPOUND,

        MF_MSG_HUGE,
        MF_MSG_FREE_HUGE,

        MF_MSG_GET_HWPOISON,
        MF_MSG_UNMAP_FAILED,

        MF_MSG_DIRTY_SWAPCACHE,
        MF_MSG_CLEAN_SWAPCACHE,

        MF_MSG_DIRTY_MLOCKED_LRU,
        MF_MSG_CLEAN_MLOCKED_LRU,
        MF_MSG_DIRTY_UNEVICTABLE_LRU,
        MF_MSG_CLEAN_UNEVICTABLE_LRU,
        MF_MSG_DIRTY_LRU,
        MF_MSG_CLEAN_LRU,
        MF_MSG_TRUNCATED_LRU,

        MF_MSG_BUDDY,
        MF_MSG_DAX,
        MF_MSG_UNSPLIT_THP,
        MF_MSG_ALREADY_POISONED,
        MF_MSG_UNKNOWN,
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Large folio zero/copy helpers, declared only when
 * CONFIG_TRANSPARENT_HUGEPAGE or CONFIG_HUGETLBFS is enabled.
 */
void folio_zero_user(struct folio *folio, unsigned long addr_hint);
int copy_user_large_folio(struct folio *dst, struct folio *src,
                          unsigned long addr_hint,
                          struct vm_area_struct *vma);
long copy_folio_from_user(struct folio *dst_folio,
                           const void __user *usr_src,
                           bool allow_pagefault);

/**
 * vma_is_special_huge - Are transhuge page-table entries considered special?
 * @vma: Pointer to the struct vm_area_struct to consider
 *
 * Whether transhuge page-table entries are considered "special" following
 * the definition in vm_normal_page(). That is the case for DAX VMAs, and for
 * file-backed VMAs with VM_PFNMAP or VM_MIXEDMAP set.
 *
 * Return: true if transhuge page-table entries should be considered special,
 * false otherwise.
 */
static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
        if (vma_is_dax(vma))
                return true;

        if (!vma->vm_file)
                return false;

        return vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP);
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

/* Only a real function with more than one possible node; else an empty stub. */
#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
static inline void setup_nr_node_ids(void)
{
}
#endif /* MAX_NUMNODES > 1 */

extern int memcmp_pages(struct page *page1, struct page *page2);

/* Returns 1 when memcmp_pages() reports 0 for the two pages, else 0. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
        return memcmp_pages(page1, page2) == 0;
}

/* Only declared when CONFIG_MAPPING_DIRTY_HELPERS is enabled. */
#ifdef CONFIG_MAPPING_DIRTY_HELPERS
/*
 * NOTE(review): presumably records results in @bitmap and reports a range
 * through @start/@end - confirm against the implementation.
 */
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end);

unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr);
#endif

#ifndef CONFIG_ANON_VMA_NAME
/* Without CONFIG_ANON_VMA_NAME naming is unavailable: always -EINVAL. */
static inline
int set_anon_vma_name(unsigned long addr, unsigned long size,
                      const char __user *uname)
{
        return -EINVAL;
}
#else
int set_anon_vma_name(unsigned long addr, unsigned long size,
                      const char __user *uname);
#endif /* CONFIG_ANON_VMA_NAME */

#ifdef CONFIG_UNACCEPTED_MEMORY

bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size);
void accept_memory(phys_addr_t start, unsigned long size);

#else

static inline bool range_contains_unaccepted_memory(phys_addr_t start,
                                                    unsigned long size)
{
        return false;
}

static inline void accept_memory(phys_addr_t start, unsigned long size)
{
}

#endif

static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
{
        return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE);
}

/* Paired begin/end helpers for a page-table walk of @vma; only declared here. */
void vma_pgtable_walk_begin(struct vm_area_struct *vma);
void vma_pgtable_walk_end(struct vm_area_struct *vma);

/*
 * Named reserved-memory helpers; only declared here.
 * NOTE(review): presumably reports the region via @start/@size - confirm.
 */
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
int reserve_mem_release_by_name(const char *name);

#ifndef CONFIG_64BIT
/* noop on 32 bit: always reports success (0) */
static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
        return 0;
}
#else
int do_mseal(unsigned long start, size_t len_in, unsigned long flags);
#endif /* CONFIG_64BIT */

/*
 * user_alloc_needs_zeroing checks if a user folio from page allocator needs to
 * be zeroed or not.
 *
 * Returns true when the data or instruction cache is aliasing, or when the
 * init_on_alloc static key is not enabled.
 */
static inline bool user_alloc_needs_zeroing(void)
{
        /*
         * for user folios, arch with cache aliasing requires cache flush and
         * arc changes folio->flags to make icache coherent with dcache, so
         * always return true to make caller use
         * clear_user_page()/clear_user_highpage().
         */
        if (cpu_dcache_is_aliasing() || cpu_icache_is_aliasing())
                return true;

        return !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                    &init_on_alloc);
}

int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);


/*
 * mseal of userspace process's system mappings: VM_SEALED_SYSMAP expands to
 * VM_SEALED when CONFIG_MSEAL_SYSTEM_MAPPINGS is enabled and to VM_NONE
 * otherwise.
 */
#ifndef CONFIG_MSEAL_SYSTEM_MAPPINGS
#define VM_SEALED_SYSMAP        VM_NONE
#else
#define VM_SEALED_SYSMAP        VM_SEALED
#endif

/*
 * DMA mapping IDs for page_pool
 *
 * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
 * stashes it in the upper bits of page->pp_magic. We always want to be able to
 * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
 * pages can have arbitrary kernel pointers stored in the same field as pp_magic
 * (since it overlaps with page->lru.next), so we must ensure that we cannot
 * mistake a valid kernel pointer with any of the values we write into this
 * field.
 *
 * On architectures that set POISON_POINTER_DELTA, this is already ensured,
 * since this value becomes part of PP_SIGNATURE; meaning we can just use the
 * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the
 * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
 * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is
 * known at compile-time.
 *
 * If the value of PAGE_OFFSET is not known at compile time, or if it is too
 * small to leave at least 8 bits available above PP_SIGNATURE, we define the
 * number of bits to be 0, which turns off the DMA index tracking altogether
 * (see page_pool_register_dma_index()).
 */
/*
 * First bit position above the PP_SIGNATURE value (taken without
 * POISON_POINTER_DELTA); the DMA index is stored from this bit upwards.
 */
#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
#if POISON_POINTER_DELTA > 0
/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
 * index to not overlap with that if set
 */
#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
#else
/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */
#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8))
/*
 * The width is capped at 32 bits.  It is 0 unless PAGE_OFFSET is a
 * compile-time constant, is at least PP_DMA_INDEX_MIN_OFFSET, and has none of
 * the bits below PP_DMA_INDEX_MIN_OFFSET set.
 */
#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \
                            PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \
                            !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ? \
                              MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0)

#endif

/* Mask spanning PP_DMA_INDEX_BITS bits, starting at bit PP_DMA_INDEX_SHIFT. */
#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
                                  PP_DMA_INDEX_SHIFT)

/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
 * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
 * the head page of compound page and bit 1 for pfmemalloc page, as well as the
 * bits used for the DMA index. page_is_pfmemalloc() is checked in
 * __page_pool_put_page() to avoid recycling the pfmemalloc page.
 *
 * The mask therefore clears bits 0-1 and the PP_DMA_INDEX_MASK bits. The whole
 * expansion is parenthesized so that it stays a single operand wherever the
 * macro is used.
 */
#define PP_MAGIC_MASK (~(PP_DMA_INDEX_MASK | 0x3UL))

/*
 * page_pool_page_is_pp - tell whether @page carries the page_pool signature
 * @page: page to inspect
 *
 * With CONFIG_PAGE_POOL the page's pp_magic field, masked by PP_MAGIC_MASK,
 * is compared against PP_SIGNATURE.  Without it the answer is always false
 * and @page is not dereferenced.
 */
static inline bool page_pool_page_is_pp(const struct page *page)
{
#ifdef CONFIG_PAGE_POOL
        return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
#else
        return false;
#endif
}

/*
 * Bit flags kept in page_snapshot.flags.  PAGE_SNAPSHOT_FAITHFUL is the bit
 * tested by snapshot_page_is_faithful().
 * NOTE(review): PAGE_SNAPSHOT_PG_BUDDY / PAGE_SNAPSHOT_PG_IDLE presumably
 * record the buddy/idle state of the snapshotted page - confirm against
 * snapshot_page().
 */
#define PAGE_SNAPSHOT_FAITHFUL (1 << 0)
#define PAGE_SNAPSHOT_PG_BUDDY (1 << 1)
#define PAGE_SNAPSHOT_PG_IDLE  (1 << 2)

/*
 * struct page_snapshot - by-value copies of a folio and a page plus metadata.
 * NOTE(review): presumably filled in by snapshot_page() - confirm there.
 */
struct page_snapshot {
        /* struct folio held by value, not a pointer to a live folio */
        struct folio folio_snapshot;
        /* struct page held by value, not a pointer to a live page */
        struct page page_snapshot;
        /* NOTE(review): presumably the page frame number of the page - confirm */
        unsigned long pfn;
        /* NOTE(review): meaning not visible here - confirm in snapshot_page() */
        unsigned long idx;
        /* PAGE_SNAPSHOT_* bits */
        unsigned long flags;
};

/*
 * snapshot_page_is_faithful - test the PAGE_SNAPSHOT_FAITHFUL bit of @ps
 * @ps: snapshot whose flags are examined
 *
 * Returns true when PAGE_SNAPSHOT_FAITHFUL is set in ps->flags.
 */
static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps)
{
        const unsigned long faithful = ps->flags & PAGE_SNAPSHOT_FAITHFUL;

        return faithful != 0;
}

void snapshot_page(struct page_snapshot *ps, const struct page *page);

#endif /* _LINUX_MM_H */

















































































































































































































































   39 
   39 






















  154 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef LLIST_H
#define LLIST_H
/*
 * Lock-less NULL terminated single linked list
 *
 * Cases where locking is not needed:
 * If there are multiple producers and multiple consumers, llist_add can be
 * used in producers and llist_del_all can be used in consumers simultaneously
 * without locking. Also a single consumer can use llist_del_first while
 * multiple producers simultaneously use llist_add, without any locking.
 *
 * Cases where locking is needed:
 * If we have multiple consumers with llist_del_first used in one consumer, and
 * llist_del_first or llist_del_all used in other consumers, then a lock is
 * needed.  This is because llist_del_first depends on list->first->next not
 * changing, but without lock protection, there's no way to be sure about that
 * if a preemption happens in the middle of the delete operation and on being
 * preempted back, the list->first is the same as before causing the cmpxchg in
 * llist_del_first to succeed. For example, while a llist_del_first operation
 * is in progress in one consumer, then a llist_del_first, llist_add,
 * llist_add (or llist_del_all, llist_add, llist_add) sequence in another
 * consumer may cause violations.
 *
 * This can be summarized as follows:
 *
 *           |   add    | del_first |  del_all
 * add       |    -     |     -     |     -
 * del_first |          |     L     |     L
 * del_all   |          |           |     -
 *
 * Where, a particular row's operation can happen concurrently with a column's
 * operation, with "-" being no lock needed, while "L" being lock is needed.
 *
 * The list entries deleted via llist_del_all can be traversed with
 * traversing function such as llist_for_each etc.  But the list
 * entries can not be traversed safely before deleted from the list.
 * The order of deleted entries is from the newest to the oldest added
 * one.  If you want to traverse from the oldest to the newest, you
 * must reverse the order by yourself before traversing.
 *
 * The basic atomic operation of this list is cmpxchg on long.  On
 * architectures that don't have NMI-safe cmpxchg implementation, the
 * list can NOT be used in NMI handlers.  So code that uses the list in
 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 */

#include <linux/atomic.h>
#include <linux/container_of.h>
#include <linux/stddef.h>
#include <linux/types.h>

/**
 * struct llist_head - head of a lock-less NULL terminated singly linked list
 * @first:        first node on the list, or NULL when the list is empty
 */
struct llist_head {
        struct llist_node *first;
};

/**
 * struct llist_node - link embedded in every entry of a lock-less list
 * @next:        next node on the list; NULL for the last node.  A node whose
 *                @next points back at itself was marked off-list by
 *                init_llist_node().
 */
struct llist_node {
        struct llist_node *next;
};

/* Static initializer for an empty lock-less list head; @name is not used. */
#define LLIST_HEAD_INIT(name)        { .first = NULL }
/* Define a struct llist_head variable called @name that starts out empty. */
#define LLIST_HEAD(name)                                \
        struct llist_head name = LLIST_HEAD_INIT(name)

/**
 * init_llist_head - initialize lock-less list head
 * @list:        the head for your lock-less list
 *
 * Sets @list->first to NULL, i.e. the list starts out empty.
 */
static inline void init_llist_head(struct llist_head *list)
{
        list->first = NULL;
}

/**
 * init_llist_node - initialize lock-less list node
 * @node:        the node to be initialised
 *
 * In cases where there is a need to test if a node is on
 * a list or not, this initialises the node to clearly
 * not be on any list.
 */
static inline void init_llist_node(struct llist_node *node)
{
        /* A ->next that points back at the node is the off-list marker. */
        WRITE_ONCE(node->next, node);
}

/**
 * llist_on_list - test if a lock-less list node is on a list
 * @node:        the node to test
 *
 * When a node is on a list the ->next pointer will be NULL or
 * some other node.  It can never point to itself.  We use that
 * in init_llist_node() to record that a node is not on any list,
 * and here to test whether it is on any list.
 *
 * Return: true if @node->next does not point at @node itself.
 */
static inline bool llist_on_list(const struct llist_node *node)
{
        return READ_ONCE(node->next) != node;
}

/**
 * llist_entry - map an embedded llist_node back to its containing struct
 * @ptr:        pointer to the &struct llist_node member
 * @type:        type of the struct that embeds the node
 * @member:        name of the llist_node member inside @type
 *
 * Plain wrapper around container_of().
 */
#define llist_entry(ptr, type, member) container_of(ptr, type, member)

/**
 * member_address_is_nonnull - check whether a member's address is not NULL
 * @ptr:        pointer to the object (the struct type that embeds the
 *                llist_node)
 * @member:        name of the llist_node member inside that struct
 *
 * Conceptually this is
 *        &ptr->member != NULL
 * but the address is computed with integer arithmetic instead, because a
 * compiler may decide that the address of a member can never be NULL.
 *
 * A real object that starts at a high address and has a member at NULL is
 * unlikely to exist, but container_of() may well hand back such a pointer.
 */
#define member_address_is_nonnull(ptr, member)                                \
        (((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member)) != 0)

/**
 * llist_for_each - iterate over some deleted entries of a lock-less list
 * @pos:        the &struct llist_node to use as a loop cursor
 * @node:        the first entry of deleted list entries
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being deleted from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The loop ends when @pos becomes NULL.  Every use of @pos in the expansion
 * is parenthesized.
 */
#define llist_for_each(pos, node)                        \
        for ((pos) = (node); (pos); (pos) = (pos)->next)

/**
 * llist_for_each_safe - walk deleted entries of a lock-less list while
 *                         tolerating removal of the current entry
 * @pos:        &struct llist_node cursor for the loop
 * @n:                second &struct llist_node holding the successor of @pos
 * @node:        first of the deleted list entries
 *
 * The successor of @pos is saved in @n before the loop body runs, so the
 * body may unlink or free @pos.
 *
 * Entries of a lock-less list can, in general, only be walked safely once
 * they have been deleted from the list, which is why the walk starts at an
 * entry rather than at the list head.
 *
 * Entries taken directly off a lock-less list are visited newest first.
 * Reverse the order yourself beforehand if an oldest-first walk is needed.
 */
#define llist_for_each_safe(pos, n, node)                        \
        for ((pos) = (node);                                        \
             (pos) && ((n) = (pos)->next, true);                \
             (pos) = (n))

/**
 * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
 * @pos:        the type * to use as a loop cursor.
 * @node:        the first entry of deleted list entries.
 * @member:        the name of the llist_node within the struct.
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being removed from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The loop stops once member_address_is_nonnull() reports that the
 * llist_node address inside @pos is NULL.
 */
#define llist_for_each_entry(pos, node, member)                                \
        for ((pos) = llist_entry((node), typeof(*(pos)), member);        \
             member_address_is_nonnull(pos, member);                        \
             (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))

/**
 * llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type
 *                               safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @node:        the first entry of deleted list entries.
 * @member:        the name of the llist_node within the struct.
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being removed from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The successor of @pos is stored in @n before the loop body runs, so the
 * body may remove @pos.  Every use of @pos and @n in the expansion is
 * parenthesized, matching llist_for_each_entry().
 */
#define llist_for_each_entry_safe(pos, n, node, member)                               \
        for ((pos) = llist_entry((node), typeof(*(pos)), member);                   \
             member_address_is_nonnull(pos, member) &&                               \
                ((n) = llist_entry((pos)->member.next, typeof(*(n)), member), true); \
             (pos) = (n))

/**
 * llist_empty - tests whether a lock-less list is empty
 * @head:        the list to test
 *
 * Not guaranteed to be accurate or up to date.  Just a quick way to
 * test whether the list is empty without deleting something from the
 * list.
 *
 * Return: true if @head->first was observed to be NULL.
 */
static inline bool llist_empty(const struct llist_head *head)
{
        return READ_ONCE(head->first) == NULL;
}

/**
 * llist_next - read the successor of a lock-less list node
 * @node:        the node whose ->next pointer is read
 *
 * Return: the value of @node->next, read with READ_ONCE().
 */
static inline struct llist_node *llist_next(struct llist_node *node)
{
        return READ_ONCE(node->next);
}

/**
 * llist_add_batch - add several linked entries in batch
 * @new_first:        first entry in batch to be added
 * @new_last:        last entry in batch to be added
 * @head:        the head for your lock-less list
 *
 * Return whether list is empty before adding.
 */
static inline bool llist_add_batch(struct llist_node *new_first,
                                   struct llist_node *new_last,
                                   struct llist_head *head)
{
        struct llist_node *first = READ_ONCE(head->first);

        /*
         * Link the tail of the batch to the head value last observed, then
         * try to publish @new_first as the new head.  Repeat until the
         * try_cmpxchg() succeeds.
         * NOTE(review): relies on try_cmpxchg() refreshing 'first' with the
         * current head->first on failure - confirm against its definition.
         */
        do {
                new_last->next = first;
        } while (!try_cmpxchg(&head->first, &first, new_first));

        /* 'first' is the head that was replaced; NULL means the list was empty. */
        return !first;
}

/**
 * __llist_add_batch - add several linked entries using plain loads and stores
 * @new_first:        first entry in batch to be added
 * @new_last:        last entry in batch to be added
 * @head:        the head of the list to add to
 *
 * Same list update as llist_add_batch(), but without READ_ONCE() or
 * try_cmpxchg().
 *
 * Return: true if the list was empty before the batch was added.
 */
static inline bool __llist_add_batch(struct llist_node *new_first,
                                     struct llist_node *new_last,
                                     struct llist_head *head)
{
        struct llist_node *old_first = head->first;

        /* Chain the old list behind the batch, then make the batch the head. */
        new_last->next = old_first;
        head->first = new_first;

        return !new_last->next;
}

/**
 * llist_add - add a new entry
 * @new:        new entry to be added
 * @head:        the head for your lock-less list
 *
 * Returns true if the list was empty prior to adding this entry.
 */
static inline bool llist_add(struct llist_node *new, struct llist_head *head)
{
        /* A single entry is a batch whose first and last node are the same. */
        return llist_add_batch(new, new, head);
}

/**
 * __llist_add - add a new entry through __llist_add_batch()
 * @new:        new entry to be added
 * @head:        the head of the list to add to
 *
 * Passes @new as both the first and the last entry of a one-element batch.
 *
 * Return: whatever __llist_add_batch() returns for that batch.
 */
static inline bool __llist_add(struct llist_node *new, struct llist_head *head)
{
        return __llist_add_batch(new, new, head);
}

/**
 * llist_del_all - delete all entries from lock-less list
 * @head:        the head of lock-less list to delete all entries
 *
 * If list is empty, return NULL, otherwise, delete all entries and
 * return the pointer to the first entry.  The order of entries
 * deleted is from the newest to the oldest added one.
 */
static inline struct llist_node *llist_del_all(struct llist_head *head)
{
        /* Swap NULL into head->first and hand back what was there before. */
        return xchg(&head->first, NULL);
}

/**
 * __llist_del_all - detach all entries using plain loads and stores
 * @head:        the head of the list to empty
 *
 * Same result as llist_del_all(), but the head is read and cleared with
 * ordinary accesses instead of xchg().
 *
 * Return: the former first entry, or NULL if the list was already empty.
 */
static inline struct llist_node *__llist_del_all(struct llist_head *head)
{
        struct llist_node *detached;

        detached = head->first;
        head->first = NULL;

        return detached;
}

extern struct llist_node *llist_del_first(struct llist_head *head);

/**
 * llist_del_first_init - delete first entry from lock-less list and mark it
 *                          as being off-list
 * @head:        the head of lock-less list to delete from.
 *
 * This behaves the same as llist_del_first() except that init_llist_node()
 * is called on the returned node, so that llist_on_list() will report false
 * for it.
 *
 * Return: the node that llist_del_first() returned; NULL is passed through
 * untouched.
 */
static inline struct llist_node *llist_del_first_init(struct llist_head *head)
{
        struct llist_node *node = llist_del_first(head);

        if (!node)
                return NULL;

        init_llist_node(node);
        return node;
}

extern bool llist_del_first_this(struct llist_head *head,
                                 struct llist_node *this);

struct llist_node *llist_reverse_order(struct llist_node *head);

#endif /* LLIST_H */


























































































































































































































































    6 

    6 

    6 

    6 
    6 


















































































































































































































































































  314 


































































































































































































  320 


  320 

  312 

  318 



  319 



  314 
  317 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  319 









    6 


  317 


  319 
  314 

  320 
    6 



  310 



  310 



  318 


  317 

  316 
    6 





  318 



  317 



  319 





  316 















  316 

  319 









  315 













  320 










  317 


















  317 
  317 
  318 
  318 
  317 








  315 

















  317 






  315 
  320 





  316 

















  320 


  319 



  319 



  318 
  318 

  315 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET                An implementation of the SOCKET network access protocol.
 *
 * Version:        @(#)socket.c        1.1.93        18/02/95
 *
 * Authors:        Orest Zborowski, <obz@Kodak.COM>
 *                Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *                Anonymous        :        NOTSOCK/BADF cleanup. Error fix in
 *                                        shutdown()
 *                Alan Cox        :        verify_area() fixes
 *                Alan Cox        :        Removed DDI
 *                Jonathan Kamens        :        SOCK_DGRAM reconnect bug
 *                Alan Cox        :        Moved a load of checks to the very
 *                                        top level.
 *                Alan Cox        :        Move address structures to/from user
 *                                        mode above the protocol layers.
 *                Rob Janssen        :        Allow 0 length sends.
 *                Alan Cox        :        Asynchronous I/O support (cribbed from the
 *                                        tty drivers).
 *                Niibe Yutaka        :        Asynchronous I/O for writes (4.4BSD style)
 *                Jeff Uphoff        :        Made max number of sockets command-line
 *                                        configurable.
 *                Matti Aarnio        :        Made the number of sockets dynamic,
 *                                        to be allocated when needed, and mr.
 *                                        Uphoff's max is used as max to be
 *                                        allowed to allocate.
 *                Linus                :        Argh. removed all the socket allocation
 *                                        altogether: it's in the inode now.
 *                Alan Cox        :        Made sock_alloc()/sock_release() public
 *                                        for NetROM and future kernel nfsd type
 *                                        stuff.
 *                Alan Cox        :        sendmsg/recvmsg basics.
 *                Tom Dyas        :        Export net symbols.
 *                Marcin Dalecki        :        Fixed problems with CONFIG_NET="n".
 *                Alan Cox        :        Added thread locking to sys_* calls
 *                                        for sockets. May have errors at the
 *                                        moment.
 *                Kevin Buhr        :        Fixed the dumb errors in the above.
 *                Andi Kleen        :        Some small cleanups, optimizations,
 *                                        and fixed a copy_from_user() bug.
 *                Tigran Aivazian        :        sys_send(args) calls sys_sendto(args, NULL, 0)
 *                Tigran Aivazian        :        Made listen(2) backlog sanity checks
 *                                        protocol-independent
 *
 *        This module is effectively the top level interface to the BSD socket
 *        paradigm.
 *
 *        Based upon Swansea University Computer Society NET3.039
 */

#include <linux/bpf-cgroup.h>
#include <linux/ethtool.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/thread_info.h>
#include <linux/rcupdate.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/if_bridge.h>
#include <linux/if_vlan.h>
#include <linux/ptp_classify.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/kmod.h>
#include <linux/audit.h>
#include <linux/wireless.h>
#include <linux/nsproxy.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/nospec.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/io_uring/net.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>

#include <net/compat.h>
#include <net/wext.h>
#include <net/cls_cgroup.h>

#include <net/sock.h>
#include <linux/netfilter.h>

#include <linux/if_tun.h>
#include <linux/ipv6_route.h>
#include <linux/route.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <net/busy_poll.h>
#include <linux/errqueue.h>
#include <linux/ptp_clock_kernel.h>
#include <trace/events/sock.h>

#include "core/dev.h"

/*
 * Global tunables for RX busy polling; they only exist when
 * CONFIG_NET_RX_BUSY_POLL is enabled.
 * NOTE(review): going by the names these back the busy_read/busy_poll
 * sysctls - the registration is not in this part of the file, confirm there.
 */
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
#endif

/*
 * Forward declarations of the socket file callbacks. They are referenced by
 * the socket_file_ops table below before their definitions appear.
 */
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to);
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
static int sock_mmap(struct file *file, struct vm_area_struct *vma);

static int sock_close(struct inode *inode, struct file *file);
static __poll_t sock_poll(struct file *file,
                              struct poll_table_struct *wait);
static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* The compat ioctl entry point only exists on CONFIG_COMPAT builds. */
#ifdef CONFIG_COMPAT
static long compat_sock_ioctl(struct file *file,
                              unsigned int cmd, unsigned long arg);
#endif
static int sock_fasync(int fd, struct file *filp, int on);
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
                                struct pipe_inode_info *pipe, size_t len,
                                unsigned int flags);
static void sock_splice_eof(struct file *file);

#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() for socket files: forward the request to the protocol's
 * own show_fdinfo hook when it provides one, otherwise print nothing.
 * The ops pointer is sampled once with READ_ONCE().
 */
static void sock_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct socket *owner = f->private_data;
        const struct proto_ops *pops = READ_ONCE(owner->ops);

        if (!pops->show_fdinfo)
                return;

        pops->show_fdinfo(m, owner);
}
#else
/* Without procfs there is no fdinfo; leave the file_operations slot empty. */
#define sock_show_fdinfo NULL
#endif

/*
 *        Socket files have a set of 'special' operations as well as the
 *        generic file ones. These don't appear in the operation structures
 *        but are done directly via the socketcall() multiplexor.
 *
 *        This table holds the generic file operations of a socket file:
 *        sock_alloc_file() installs it on every file it creates, and
 *        sock_from_file() recognises a socket file by comparing f_op
 *        against the address of this table.
 */

static const struct file_operations socket_file_ops = {
        .owner =        THIS_MODULE,
        .read_iter =        sock_read_iter,
        .write_iter =        sock_write_iter,
        .poll =                sock_poll,
        .unlocked_ioctl = sock_ioctl,
        /* 32-bit compat ioctl path, only wired up on CONFIG_COMPAT builds. */
#ifdef CONFIG_COMPAT
        .compat_ioctl = compat_sock_ioctl,
#endif
        .uring_cmd =    io_uring_cmd_sock,
        .mmap =                sock_mmap,
        .release =        sock_close,
        .fasync =        sock_fasync,
        .splice_write = splice_to_socket,
        .splice_read =        sock_splice_read,
        .splice_eof =        sock_splice_eof,
        /* Expands to NULL when CONFIG_PROC_FS is off (see the #define above). */
        .show_fdinfo =        sock_show_fdinfo,
};

/*
 * Printable names of the protocol families, indexed by PF_* number.
 * Designated initializers are used, so any index inside the array that has
 * no entry here is left NULL. Two slots deliberately name both aliases of
 * the family ("PF_UNIX/PF_LOCAL" and "PF_NETLINK/PF_ROUTE").
 */
static const char * const pf_family_names[] = {
        [PF_UNSPEC]        = "PF_UNSPEC",
        [PF_UNIX]        = "PF_UNIX/PF_LOCAL",
        [PF_INET]        = "PF_INET",
        [PF_AX25]        = "PF_AX25",
        [PF_IPX]        = "PF_IPX",
        [PF_APPLETALK]        = "PF_APPLETALK",
        [PF_NETROM]        = "PF_NETROM",
        [PF_BRIDGE]        = "PF_BRIDGE",
        [PF_ATMPVC]        = "PF_ATMPVC",
        [PF_X25]        = "PF_X25",
        [PF_INET6]        = "PF_INET6",
        [PF_ROSE]        = "PF_ROSE",
        [PF_DECnet]        = "PF_DECnet",
        [PF_NETBEUI]        = "PF_NETBEUI",
        [PF_SECURITY]        = "PF_SECURITY",
        [PF_KEY]        = "PF_KEY",
        [PF_NETLINK]        = "PF_NETLINK/PF_ROUTE",
        [PF_PACKET]        = "PF_PACKET",
        [PF_ASH]        = "PF_ASH",
        [PF_ECONET]        = "PF_ECONET",
        [PF_ATMSVC]        = "PF_ATMSVC",
        [PF_RDS]        = "PF_RDS",
        [PF_SNA]        = "PF_SNA",
        [PF_IRDA]        = "PF_IRDA",
        [PF_PPPOX]        = "PF_PPPOX",
        [PF_WANPIPE]        = "PF_WANPIPE",
        [PF_LLC]        = "PF_LLC",
        [PF_IB]                = "PF_IB",
        [PF_MPLS]        = "PF_MPLS",
        [PF_CAN]        = "PF_CAN",
        [PF_TIPC]        = "PF_TIPC",
        [PF_BLUETOOTH]        = "PF_BLUETOOTH",
        [PF_IUCV]        = "PF_IUCV",
        [PF_RXRPC]        = "PF_RXRPC",
        [PF_ISDN]        = "PF_ISDN",
        [PF_PHONET]        = "PF_PHONET",
        [PF_IEEE802154]        = "PF_IEEE802154",
        [PF_CAIF]        = "PF_CAIF",
        [PF_ALG]        = "PF_ALG",
        [PF_NFC]        = "PF_NFC",
        [PF_VSOCK]        = "PF_VSOCK",
        [PF_KCM]        = "PF_KCM",
        [PF_QIPCRTR]        = "PF_QIPCRTR",
        [PF_SMC]        = "PF_SMC",
        [PF_XDP]        = "PF_XDP",
        [PF_MCTP]        = "PF_MCTP",
};

/*
 *        The protocol list. Each protocol is registered in here.
 *
 *        net_families[] has one slot per protocol family (NPROTO entries)
 *        and its pointers are __rcu annotated.
 *        NOTE(review): presumably net_family_lock serialises updates while
 *        lookups go through RCU; the users are not in this part of the
 *        file - confirm there.
 */

static DEFINE_SPINLOCK(net_family_lock);
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;

/*
 * Support routines.
 * Move socket addresses back and forth across the kernel/user
 * divide and look after the messy bits.
 */

/**
 *        move_addr_to_kernel        -        copy a socket address into kernel space
 *        @uaddr: Address in user space
 *        @ulen: Length of the user space address, in bytes
 *        @kaddr: Destination buffer in kernel space
 *
 *        Copies @ulen bytes from @uaddr into @kaddr. A negative length, or one
 *        larger than struct sockaddr_storage, yields -EINVAL. A zero length
 *        succeeds immediately without touching @kaddr. A faulting user copy
 *        yields -EFAULT. Otherwise the result of audit_sockaddr() is
 *        returned, which is 0 on success.
 */

int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
{
        /* Reject lengths that cannot describe a valid sockaddr. */
        if (ulen < 0)
                return -EINVAL;
        if (ulen > sizeof(struct sockaddr_storage))
                return -EINVAL;

        /* Nothing to copy and nothing to audit. */
        if (!ulen)
                return 0;

        if (copy_from_user(kaddr, uaddr, ulen) != 0)
                return -EFAULT;

        return audit_sockaddr(ulen, kaddr);
}

/**
 *        move_addr_to_user        -        copy an address to user space
 *        @kaddr: kernel space address
 *        @klen: length of address in kernel
 *        @uaddr: user space address
 *        @ulen: pointer to user length field
 *
 *        The value pointed to by ulen on entry is the buffer length available.
 *        If that length is not negative it is overwritten with @klen, the
 *        true (untruncated) length of the address. At most the user-supplied
 *        length is copied to @uaddr: a buffer larger than @klen is simply
 *        clamped to @klen, and a length of zero copies nothing.
 *
 *        -EINVAL is returned for a negative buffer size (the length field is
 *        left unmodified in that case). -EFAULT is returned if either the
 *        buffer or the length field are not accessible. -ENOMEM is returned
 *        if audit_sockaddr() fails. Zero is returned for a success.
 *
 *        @klen larger than struct sockaddr_storage is a kernel bug (BUG_ON).
 */

static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
                             void __user *uaddr, int __user *ulen)
{
        int len;

        BUG_ON(klen > sizeof(struct sockaddr_storage));

        /*
         * Open a user-access window over the length word, either through the
         * masked variant (which hands back the pointer to use from here on)
         * or through a checked window of 4 bytes at @ulen.
         */
        if (can_do_masked_user_access())
                ulen = masked_user_access_begin(ulen);
        else if (!user_access_begin(ulen, 4))
                return -EFAULT;

        /* Faults inside the window jump to efault_end, which closes it. */
        unsafe_get_user(len, ulen, efault_end);

        /* Never copy more than the kernel address actually holds. */
        if (len > klen)
                len = klen;
        /*
         *      "fromlen shall refer to the value before truncation.."
         *                      1003.1g
         */
        if (len >= 0)
                unsafe_put_user(klen, ulen, efault_end);

        user_access_end();

        /* len == 0: the length was reported above, there is nothing to copy. */
        if (len) {
                if (len < 0)
                        return -EINVAL;
                if (audit_sockaddr(klen, kaddr))
                        return -ENOMEM;
                if (copy_to_user(uaddr, kaddr, len))
                        return -EFAULT;
        }
        return 0;

        /* Fault target of unsafe_get_user()/unsafe_put_user(): window still open. */
efault_end:
        user_access_end();
        return -EFAULT;
}

/*
 * Slab cache for struct socket_alloc objects. Created once by
 * init_inodecache(); used by sock_alloc_inode() and sock_free_inode().
 */
static struct kmem_cache *sock_inode_cachep __ro_after_init;

/*
 * ->alloc_inode() for sockfs. Takes a socket_alloc object from
 * sock_inode_cachep (GFP_KERNEL), resets the embedded socket and its wait
 * queue to an unconnected, unbound state and returns the embedded VFS inode.
 * Returns NULL when the allocation fails.
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
        struct socket_alloc *ei = alloc_inode_sb(sb, sock_inode_cachep,
                                                 GFP_KERNEL);

        if (ei == NULL)
                return NULL;

        /* Wait queue / async notification state. */
        init_waitqueue_head(&ei->socket.wq.wait);
        ei->socket.wq.fasync_list = NULL;
        ei->socket.wq.flags = 0;

        /* The socket itself: no protocol, no sock, no file yet. */
        ei->socket.state = SS_UNCONNECTED;
        ei->socket.flags = 0;
        ei->socket.ops = NULL;
        ei->socket.sk = NULL;
        ei->socket.file = NULL;

        return &ei->vfs_inode;
}

/*
 * ->free_inode() for sockfs: map the VFS inode back to its enclosing
 * socket_alloc object and return that object to sock_inode_cachep.
 */
static void sock_free_inode(struct inode *inode)
{
        kmem_cache_free(sock_inode_cachep,
                        container_of(inode, struct socket_alloc, vfs_inode));
}

/*
 * Slab constructor for sock_inode_cachep (see init_inodecache()): runs the
 * one-time initialisation of the VFS inode embedded in each socket_alloc.
 */
static void init_once(void *foo)
{
        struct socket_alloc *obj = foo;

        inode_init_once(&obj->vfs_inode);
}

/*
 * Create the "sock_inode_cache" slab cache that backs every sockfs inode.
 * Objects are struct socket_alloc sized and constructed by init_once().
 * Failure to create the cache is fatal (BUG).
 */
static void init_inodecache(void)
{
        sock_inode_cachep =
                kmem_cache_create("sock_inode_cache",
                                  sizeof(struct socket_alloc), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
                                  SLAB_ACCOUNT,
                                  init_once);

        BUG_ON(!sock_inode_cachep);
}

/*
 * Superblock operations of sockfs: inodes are allocated and freed through
 * sock_alloc_inode()/sock_free_inode(), statfs uses the generic
 * simple_statfs(). Installed by sockfs_init_fs_context().
 */
static const struct super_operations sockfs_ops = {
        .alloc_inode        = sock_alloc_inode,
        .free_inode        = sock_free_inode,
        .statfs                = simple_statfs,
};

/*
 * sockfs_dname() is called from d_path().
 *
 * Formats the name of a socket dentry as "socket:[<inode number>]" into
 * @buffer (of @buflen bytes) via dynamic_dname() and returns its result.
 */
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        unsigned long ino = d_inode(dentry)->i_ino;

        return dynamic_dname(buffer, buflen, "socket:[%lu]", ino);
}

/*
 * Dentry operations of sockfs; only ->d_dname is provided. Installed by
 * sockfs_init_fs_context().
 */
static const struct dentry_operations sockfs_dentry_operations = {
        .d_dname  = sockfs_dname,
};

/*
 * ->get() of sockfs_xattr_handler. The attribute value is the dentry name
 * including its terminating NUL.
 *
 * With @value == NULL only the required size is returned (size probe).
 * Otherwise the name is copied into @value, or -ERANGE is returned when
 * @size is too small. @handler, @inode and @suffix are not used.
 */
static int sockfs_xattr_get(const struct xattr_handler *handler,
                            struct dentry *dentry, struct inode *inode,
                            const char *suffix, void *value, size_t size)
{
        /* Name length plus the trailing NUL. */
        unsigned int need = dentry->d_name.len + 1;

        if (!value)
                return need;

        if (need > size)
                return -ERANGE;

        memcpy(value, dentry->d_name.name, need);
        return need;
}

/*
 * Name of the extended attribute served by sockfs_xattr_get():
 * XATTR_SYSTEM_PREFIX followed by "sockprotoname". The _LEN variant is the
 * length of that string without the terminating NUL (sizeof minus one).
 */
#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)

/*
 * Handler for the single attribute named XATTR_NAME_SOCKPROTONAME. Only a
 * ->get callback is provided; no ->set is defined here.
 */
static const struct xattr_handler sockfs_xattr_handler = {
        .name = XATTR_NAME_SOCKPROTONAME,
        .get = sockfs_xattr_get,
};

/*
 * ->set() of sockfs_security_xattr_handler. It stores nothing itself: every
 * argument is ignored and -EAGAIN is returned unconditionally.
 * NOTE(review): per the inline comment the attribute is handled by the LSM;
 * how the xattr core reacts to -EAGAIN is outside this file - confirm there.
 */
static int sockfs_security_xattr_set(const struct xattr_handler *handler,
                                     struct mnt_idmap *idmap,
                                     struct dentry *dentry, struct inode *inode,
                                     const char *suffix, const void *value,
                                     size_t size, int flags)
{
        /* Handled by LSM. */
        return -EAGAIN;
}

/*
 * Handler matching every attribute under XATTR_SECURITY_PREFIX. Only a
 * ->set callback is provided (sockfs_security_xattr_set()); no ->get is
 * defined here.
 */
static const struct xattr_handler sockfs_security_xattr_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .set = sockfs_security_xattr_set,
};

/*
 * NULL-terminated list of the xattr handlers of sockfs; handed to the
 * pseudo filesystem context in sockfs_init_fs_context().
 */
static const struct xattr_handler * const sockfs_xattr_handlers[] = {
        &sockfs_xattr_handler,
        &sockfs_security_xattr_handler,
        NULL
};

static int sockfs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, SOCKFS_MAGIC);
        if (!ctx)
                return -ENOMEM;
        ctx->ops = &sockfs_ops;
        ctx->dops = &sockfs_dentry_operations;
        ctx->xattr = sockfs_xattr_handlers;
        return 0;
}

/*
 * The sockfs mount. sock_alloc() takes new inodes from its superblock and
 * sock_alloc_file() creates the socket files on it.
 */
static struct vfsmount *sock_mnt __read_mostly;

/*
 * Filesystem type of "sockfs": contexts are initialised by
 * sockfs_init_fs_context() and superblocks are torn down with
 * kill_anon_super().
 */
static struct file_system_type sock_fs_type = {
        .name =                "sockfs",
        .init_fs_context = sockfs_init_fs_context,
        .kill_sb =        kill_anon_super,
};

/*
 *        Obtains the first available file descriptor and sets it up for use.
 *
 *        These functions create file structures and map them into the fd
 *        space of the current process. On success the file descriptor is
 *        returned and the file struct is implicitly stored in sock->file.
 *        Note that another thread may close the file descriptor before we
 *        return from this function. We rely on the fact that we do not
 *        refer to the socket after mapping. If one day we need to, this
 *        function will have to increment the ref. count on the file by 1.
 *
 *        In any case the returned fd MAY NOT be valid!
 *        This race condition is unavoidable with shared fd spaces; we
 *        cannot solve it inside the kernel, but we still take care of
 *        internal coherence.
 */

/**
 *        sock_alloc_file - Bind a &socket to a &file
 *        @sock: socket
 *        @flags: file status flags; only O_NONBLOCK is honoured
 *        @dname: protocol name
 *
 *        Creates a pseudo file on the sockfs mount for @sock, opened O_RDWR
 *        with socket_file_ops, and links the two: the file is stored in
 *        sock->file and @sock in file->private_data. If @dname is %NULL the
 *        name of the protocol that created sock->sk is used, or "" when the
 *        socket has no sk.
 *
 *        Returns the new &file. On failure @sock is released, and an ERR
 *        pointer is returned.
 *
 *        This function uses GFP_KERNEL internally.
 */

struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
        struct file *filp;

        /* Pick a default name for the dentry when the caller gave none. */
        if (!dname) {
                if (sock->sk)
                        dname = sock->sk->sk_prot_creator->name;
                else
                        dname = "";
        }

        filp = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
                                 O_RDWR | (flags & O_NONBLOCK),
                                 &socket_file_ops);
        if (IS_ERR(filp)) {
                /* The socket does not outlive a failed file allocation. */
                sock_release(sock);
                return filp;
        }

        filp->f_mode |= FMODE_NOWAIT;
        sock->file = filp;
        filp->private_data = sock;
        stream_open(SOCK_INODE(sock), filp);

        /*
         * Disable permission and pre-content events, but enable legacy
         * inotify events for legacy users.
         */
        file_set_fsnotify_mode(filp, FMODE_NONOTIFY_PERM);

        return filp;
}
EXPORT_SYMBOL(sock_alloc_file);

/*
 * Give @sock a file and a file descriptor in the current process.
 *
 * Reserves an unused fd (using @flags), creates the socket file through
 * sock_alloc_file() and installs it. Returns the fd on success or a
 * negative errno. On either failure path @sock has been released: directly
 * here when no fd is available, or by sock_alloc_file() when the file
 * cannot be created (the reserved fd is then given back).
 */
static int sock_map_fd(struct socket *sock, int flags)
{
        struct file *filp;
        int fd;

        fd = get_unused_fd_flags(flags);
        if (unlikely(fd < 0)) {
                sock_release(sock);
                return fd;
        }

        filp = sock_alloc_file(sock, flags, NULL);
        if (IS_ERR(filp)) {
                put_unused_fd(fd);
                return PTR_ERR(filp);
        }

        fd_install(fd, filp);
        return fd;
}

/**
 *        sock_from_file - Return the &socket bound to @file.
 *        @file: file
 *
 *        A file is a socket file exactly when its f_op is socket_file_ops.
 *        Returns the socket stored in file->private_data in that case and
 *        %NULL otherwise.
 */

struct socket *sock_from_file(struct file *file)
{
        if (unlikely(file->f_op != &socket_file_ops))
                return NULL;

        /* private_data was set in sock_alloc_file */
        return file->private_data;
}
EXPORT_SYMBOL(sock_from_file);

/**
 *        sockfd_lookup - Go from a file number to its socket slot
 *        @fd: file handle
 *        @err: pointer to an error code return
 *
 *        Looks up @fd with fget(), which takes a reference on the file, and
 *        returns the socket bound to it. On success that file reference is
 *        kept and @err is left untouched.
 *
 *        On failure %NULL is returned and @err is set to -EBADF when @fd is
 *        not an open file, or to -ENOTSOCK when the file is not a socket (the
 *        file reference is dropped again in that case).
 */

struct socket *sockfd_lookup(int fd, int *err)
{
        struct socket *sock;
        struct file *file = fget(fd);

        if (!file) {
                *err = -EBADF;
                return NULL;
        }

        sock = sock_from_file(file);
        if (sock)
                return sock;

        /* Not a socket: report it and give the file reference back. */
        *err = -ENOTSOCK;
        fput(file);
        return NULL;
}
EXPORT_SYMBOL(sockfd_lookup);

/*
 * ->listxattr() for socket inodes. The list consists of whatever
 * security_inode_listsecurity() reports, followed by the NUL-terminated
 * name XATTR_NAME_SOCKPROTONAME.
 *
 * With @buffer == NULL only the total size needed is returned. Otherwise
 * the names are written to @buffer and -ERANGE is returned if @size is too
 * small. A negative result of the security hook is passed through.
 */
static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
                                size_t size)
{
        /* Our own attribute name, including its terminating NUL. */
        const ssize_t name_len = XATTR_NAME_SOCKPROTONAME_LEN + 1;
        ssize_t used;

        used = security_inode_listsecurity(d_inode(dentry), buffer, size);
        if (used < 0)
                return used;

        if (buffer) {
                if (size < used)
                        return -ERANGE;
                /* Continue right behind the security names. */
                buffer += used;
        }

        used += name_len;
        if (buffer) {
                if (size < used)
                        return -ERANGE;
                memcpy(buffer, XATTR_NAME_SOCKPROTONAME, name_len);
        }

        return used;
}

/*
 * ->setattr() for socket inodes. Applies @iattr with simple_setattr()
 * (always using nop_mnt_idmap; @idmap itself is not used) and, when the
 * owner was changed, mirrors the new uid into the attached sock.
 *
 * Returns the error of simple_setattr(), -ENOENT if ATTR_UID was requested
 * but the socket has no sk, and 0 otherwise.
 */
static int sockfs_setattr(struct mnt_idmap *idmap,
                          struct dentry *dentry, struct iattr *iattr)
{
        struct socket *sock;
        int err;

        err = simple_setattr(&nop_mnt_idmap, dentry, iattr);
        if (err)
                return err;

        /* Only a uid change needs to be propagated. */
        if (!(iattr->ia_valid & ATTR_UID))
                return 0;

        sock = SOCKET_I(d_inode(dentry));
        if (!sock->sk)
                return -ENOENT;

        /* Paired with READ_ONCE() in sk_uid() */
        WRITE_ONCE(sock->sk->sk_uid, iattr->ia_uid);
        return 0;
}

/*
 * Inode operations of socket inodes; assigned to inode->i_op in
 * sock_alloc(). Only listxattr and setattr are provided.
 */
static const struct inode_operations sockfs_inode_ops = {
        .listxattr = sockfs_listxattr,
        .setattr = sockfs_setattr,
};

/**
 *        sock_alloc - allocate a socket
 *
 *        Allocates a new inode on the sockfs superblock together with the
 *        socket object embedded next to it. The inode gets a fresh inode
 *        number, mode S_IFSOCK | S_IRWXUGO, the caller's fsuid/fsgid and
 *        sockfs_inode_ops. The socket is then returned. If we are out of
 *        inodes NULL is returned. This function uses GFP_KERNEL internally.
 */

struct socket *sock_alloc(void)
{
        struct inode *inode = new_inode_pseudo(sock_mnt->mnt_sb);

        if (!inode)
                return NULL;

        inode->i_ino = get_next_ino();
        inode->i_op = &sockfs_inode_ops;
        inode->i_mode = S_IFSOCK | S_IRWXUGO;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();

        /* The socket lives in the same allocation as the inode. */
        return SOCKET_I(inode);
}
EXPORT_SYMBOL(sock_alloc);

/*
 * Tear down @sock.
 *
 * If the socket still has protocol ops, the protocol's ->release() is
 * called (under the inode lock when @inode is given), sock->sk and
 * sock->ops are cleared and the reference on the protocol module is
 * dropped. A non-empty fasync list at this point is reported with pr_err().
 *
 * Finally, a socket that has a file only gets sock->file cleared; a socket
 * without a file drops its inode reference with iput() instead.
 */
static void __sock_release(struct socket *sock, struct inode *inode)
{
        const struct proto_ops *pops = READ_ONCE(sock->ops);

        if (pops) {
                /* Remember the module: ->ops is cleared before the put. */
                struct module *mod = pops->owner;

                if (inode)
                        inode_lock(inode);
                pops->release(sock);
                sock->sk = NULL;
                if (inode)
                        inode_unlock(inode);
                sock->ops = NULL;
                module_put(mod);
        }

        if (sock->wq.fasync_list)
                pr_err("%s: fasync list not empty!\n", __func__);

        if (sock->file) {
                sock->file = NULL;
                return;
        }

        iput(SOCK_INODE(sock));
}

/**
 *        sock_release - close a socket
 *        @sock: socket to close
 *
 *        The socket is released from the protocol stack if it has a release
 *        callback, and the inode is then released if the socket is bound to
 *        an inode not a file.
 *
 *        This is __sock_release() called without an inode, so the protocol's
 *        release callback runs without the inode lock being taken here.
 */
void sock_release(struct socket *sock)
{
        __sock_release(sock, NULL);
}
EXPORT_SYMBOL(sock_release);

/*
 * Translate the SOF_TIMESTAMPING_TX_* request bits in @tsflags into the
 * matching SKBTX_* bits and OR them into *@tx_flags. Bits that are already
 * set in *@tx_flags are preserved.
 *
 *   SOF_TIMESTAMPING_TX_HARDWARE   -> SKBTX_HW_TSTAMP_NOBPF
 *   SOF_TIMESTAMPING_TX_SOFTWARE   -> SKBTX_SW_TSTAMP
 *   SOF_TIMESTAMPING_TX_SCHED      -> SKBTX_SCHED_TSTAMP
 *   SOF_TIMESTAMPING_TX_COMPLETION -> SKBTX_COMPLETION_TSTAMP
 */
void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags)
{
        u8 flags = *tx_flags;

        flags |= (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) ?
                        SKBTX_HW_TSTAMP_NOBPF : 0;
        flags |= (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) ?
                        SKBTX_SW_TSTAMP : 0;
        flags |= (tsflags & SOF_TIMESTAMPING_TX_SCHED) ?
                        SKBTX_SCHED_TSTAMP : 0;
        flags |= (tsflags & SOF_TIMESTAMPING_TX_COMPLETION) ?
                        SKBTX_COMPLETION_TSTAMP : 0;

        *tx_flags = flags;
}
EXPORT_SYMBOL(__sock_tx_timestamp);

/*
 * Prototypes of the two sendmsg implementations that sock_sendmsg_nosec()
 * names as candidates in its INDIRECT_CALL_INET() dispatch.
 */
INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *,
                                           size_t));
INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *,
                                            size_t));

/*
 * Out-of-line (noinline) wrapper that emits the sock_send_length trace event
 * for @sk with result @ret.
 * NOTE(review): @flags is accepted but not used - the event is always
 * emitted with a flags value of 0. Confirm this is intended before relying
 * on the parameter.
 */
static noinline void call_trace_sock_send_length(struct sock *sk, int ret,
                                                 int flags)
{
        trace_sock_send_length(sk, ret, 0);
}

static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
        int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg,
                                     inet_sendmsg, sock, msg,
                                     msg_data_left(msg));
        BUG_ON(ret == -EIOCBQUEUED);

        if (trace_sock_send_length_enabled())
                call_trace_sock_send_length(sock->sk, ret, 0);
        return ret;
}

/* Run the LSM sendmsg hook and, only if it allows the operation, pass
 * @msg on to sock_sendmsg_nosec(). A non-zero hook result is returned
 * as-is.
 */
static int __sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
        int denied;

        denied = security_socket_sendmsg(sock, msg, msg_data_left(msg));
        if (denied)
                return denied;

        return sock_sendmsg_nosec(sock, msg);
}

/**
 *        sock_sendmsg - send a message through @sock
 *        @sock: socket
 *        @msg: message to send
 *
 *        Sends @msg through @sock, passing through LSM.
 *        Returns the number of bytes sent, or an error code.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
        struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name;
        struct sockaddr_storage address;
        int save_len = msg->msg_namelen;
        int ret;

        if (msg->msg_name) {
                memcpy(&address, msg->msg_name, msg->msg_namelen);
                msg->msg_name = &address;
        }

        ret = __sock_sendmsg(sock, msg);
        msg->msg_name = save_addr;
        msg->msg_namelen = save_len;

        return ret;
}
EXPORT_SYMBOL(sock_sendmsg);

/**
 *        kernel_sendmsg - send a message through @sock (kernel-space)
 *        @sock: socket
 *        @msg: message header
 *        @vec: kernel vec
 *        @num: vec array length
 *        @size: total message data size
 *
 *        Builds the message data with @vec and sends it through @sock.
 *        Returns the number of bytes sent, or an error code.
 */

int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
                   struct kvec *vec, size_t num, size_t size)
{
        /* Point the message iterator at the kvec array as a data source. */
        iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size);
        return sock_sendmsg(sock, msg);
}
EXPORT_SYMBOL(kernel_sendmsg);

/* Tell whether @skb is an error-queue skb, judged solely by its pkt_type
 * being PACKET_OUTGOING.
 */
static bool skb_is_err_queue(const struct sk_buff *skb)
{
        /* pkt_type of skbs enqueued on the error queue are set to
         * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do
         * in recvmsg, since skbs received on a local socket will never
         * have a pkt_type of PACKET_OUTGOING.
         */
        return skb->pkt_type == PACKET_OUTGOING;
}

/* On transmit, software and hardware timestamps are returned independently.
 * As the two skb clones share the hardware timestamp, which may be updated
 * before the software timestamp is received, a hardware TX timestamp may be
 * returned only if there is no software TX timestamp. Ignore false software
 * timestamps, which may be made in the __sock_recv_timestamp() call when the
 * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a
 * hardware timestamp.
 */
static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp)
{
        return skb->tstamp && !false_tstamp && skb_is_err_queue(skb);
}

/* Fetch the hardware timestamp to report for @skb.
 *
 * The originating device is looked up by the skb's NAPI id under RCU. When
 * it is found, *@if_index is set to its ifindex and the timestamp comes
 * from netdev_get_tstamp(); otherwise the hwtstamp stored with the skb is
 * returned and *@if_index is left untouched.
 */
static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index)
{
        struct skb_shared_hwtstamps *hwts = skb_hwtstamps(skb);
        bool want_cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC;
        struct net_device *dev;
        ktime_t stamp;

        rcu_read_lock();
        dev = dev_get_by_napi_id(skb_napi_id(skb));
        if (!dev) {
                stamp = hwts->hwtstamp;
        } else {
                *if_index = dev->ifindex;
                stamp = netdev_get_tstamp(dev, hwts, want_cycles);
        }
        rcu_read_unlock();

        return stamp;
}

static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb,
                           int if_index)
{
        struct scm_ts_pktinfo ts_pktinfo;
        struct net_device *orig_dev;

        if (!skb_mac_header_was_set(skb))
                return;

        memset(&ts_pktinfo, 0, sizeof(ts_pktinfo));

        if (!if_index) {
                rcu_read_lock();
                orig_dev = dev_get_by_napi_id(skb_napi_id(skb));
                if (orig_dev)
                        if_index = orig_dev->ifindex;
                rcu_read_unlock();
        }
        ts_pktinfo.if_index = if_index;

        ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb);
        put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
                 sizeof(ts_pktinfo), &ts_pktinfo);
}

/* Report whether @skb carries a TX timestamp that @sk asked for.
 *
 * The skb must be a timestamping notification (ee_errno ENOMSG with
 * origin SO_EE_ORIGIN_TIMESTAMPING). It then qualifies if either a
 * software timestamp is present and SOF_TIMESTAMPING_SOFTWARE is set, or a
 * hardware timestamp is present and SOF_TIMESTAMPING_RAW_HARDWARE is set.
 */
bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk)
{
        const struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        u32 tsflags = READ_ONCE(sk->sk_tsflags);

        if (serr->ee.ee_errno != ENOMSG)
                return false;
        if (serr->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING)
                return false;

        /* software time stamp available and wanted */
        if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && skb->tstamp)
                return true;

        /* hardware time stamps available and wanted */
        if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))
                return false;

        return skb_hwtstamps(skb)->hwtstamp != 0;
}

/* Extract the TX timestamp of @skb into @ts according to @sk's
 * timestamping flags.
 *
 * A software timestamp is preferred when SOF_TIMESTAMPING_SOFTWARE is set
 * and one can be converted. Otherwise a hardware timestamp is used when
 * SOF_TIMESTAMPING_RAW_HARDWARE is set and the skb does not hold a software
 * TX timestamp.
 *
 * Returns SOF_TIMESTAMPING_TX_SOFTWARE or SOF_TIMESTAMPING_TX_HARDWARE to
 * say which kind was stored in @ts, or -ENOENT when none is available.
 */
int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
                          struct timespec64 *ts)
{
        u32 tsflags = READ_ONCE(sk->sk_tsflags);
        int ifindex = 0;
        ktime_t hw;

        if (tsflags & SOF_TIMESTAMPING_SOFTWARE) {
                if (ktime_to_timespec64_cond(skb->tstamp, ts))
                        return SOF_TIMESTAMPING_TX_SOFTWARE;
        }

        if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))
                return -ENOENT;
        if (skb_is_swtx_tstamp(skb, false))
                return -ENOENT;

        if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
                hw = get_timestamp(sk, skb, &ifindex);
        else
                hw = skb_hwtstamps(skb)->hwtstamp;

        if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
                hw = ptp_convert_timestamp(&hw, READ_ONCE(sk->sk_bind_phc));

        if (ktime_to_timespec64_cond(hw, ts))
                return SOF_TIMESTAMPING_TX_HARDWARE;

        return -ENOENT;
}

/*
 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 *
 * Attaches timestamp control messages for @skb to @msg:
 *  - the SO_TIMESTAMP / SO_TIMESTAMPNS cmsg (old or new layout) when
 *    SOCK_RCVTSTAMP is set on @sk;
 *  - an SCM_TIMESTAMPING cmsg when a software and/or hardware timestamp
 *    selected by sk->sk_tsflags could be converted, optionally accompanied
 *    by packet info and, for error-queue skbs, by the opt_stats payload.
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
        struct sk_buff *skb)
{
        int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
        int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
        struct scm_timestamping_internal tss;
        int empty = 1, false_tstamp = 0;
        struct skb_shared_hwtstamps *shhwtstamps =
                skb_hwtstamps(skb);
        int if_index;
        ktime_t hwtstamp;
        u32 tsflags;

        /* Race occurred between timestamp enabling and packet
           receiving.  Fill in the current time for now. */
        if (need_software_tstamp && skb->tstamp == 0) {
                __net_timestamp(skb);
                /* Remember that this stamp was made up here; it is passed
                 * to skb_is_swtx_tstamp() below.
                 */
                false_tstamp = 1;
        }

        /* Pick one of four cmsg layouts: timeval vs. timespec
         * (SOCK_RCVTSTAMPNS) crossed with old vs. new (SOCK_TSTAMP_NEW).
         */
        if (need_software_tstamp) {
                if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
                        if (new_tstamp) {
                                struct __kernel_sock_timeval tv;

                                skb_get_new_timestamp(skb, &tv);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
                                         sizeof(tv), &tv);
                        } else {
                                struct __kernel_old_timeval tv;

                                skb_get_timestamp(skb, &tv);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
                                         sizeof(tv), &tv);
                        }
                } else {
                        if (new_tstamp) {
                                struct __kernel_timespec ts;

                                skb_get_new_timestampns(skb, &ts);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
                                         sizeof(ts), &ts);
                        } else {
                                struct __kernel_old_timespec ts;

                                skb_get_timestampns(skb, &ts);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
                                         sizeof(ts), &ts);
                        }
                }
        }

        memset(&tss, 0, sizeof(tss));
        tsflags = READ_ONCE(sk->sk_tsflags);
        /* Software timestamp goes into tss.ts[0]. It is reported when
         * SOF_TIMESTAMPING_SOFTWARE is set and at least one of these holds:
         * RX software stamping is requested, the skb is from the error
         * queue, or OPT_RX_FILTER is not set.
         */
        if ((tsflags & SOF_TIMESTAMPING_SOFTWARE &&
             (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
              skb_is_err_queue(skb) ||
              !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
            ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
                empty = 0;
        /* Hardware timestamp goes into tss.ts[2], gated the same way on the
         * RAW_HARDWARE / RX_HARDWARE flags, and skipped when the skb holds
         * a (genuine) software TX timestamp.
         */
        if (shhwtstamps &&
            (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
             (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
              skb_is_err_queue(skb) ||
              !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
            !skb_is_swtx_tstamp(skb, false_tstamp)) {
                if_index = 0;
                if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
                        hwtstamp = get_timestamp(sk, skb, &if_index);
                else
                        hwtstamp = shhwtstamps->hwtstamp;

                if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
                        hwtstamp = ptp_convert_timestamp(&hwtstamp,
                                                         READ_ONCE(sk->sk_bind_phc));

                if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) {
                        empty = 0;

                        /* Packet info is only added for skbs that are not
                         * from the error queue.
                         */
                        if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
                            !skb_is_err_queue(skb))
                                put_ts_pktinfo(msg, skb, if_index);
                }
        }
        /* Emit SCM_TIMESTAMPING only if at least one slot was filled. */
        if (!empty) {
                if (sock_flag(sk, SOCK_TSTAMP_NEW))
                        put_cmsg_scm_timestamping64(msg, &tss);
                else
                        put_cmsg_scm_timestamping(msg, &tss);

                /* For error-queue skbs flagged opt_stats, the skb data
                 * itself is copied out as SCM_TIMESTAMPING_OPT_STATS.
                 */
                if (skb_is_err_queue(skb) && skb->len &&
                    SKB_EXT_ERR(skb)->opt_stats)
                        put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
                                 skb->len, skb->data);
        }
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);

#ifdef CONFIG_WIRELESS
/* Deliver the wifi ack status of @skb as an SCM_WIFI_STATUS control
 * message. Nothing is emitted unless SOCK_WIFI_STATUS is set on @sk and
 * the skb's wifi_acked field is marked valid.
 */
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
        struct sk_buff *skb)
{
        int acked;

        if (!sock_flag(sk, SOCK_WIFI_STATUS) || !skb->wifi_acked_valid)
                return;

        /* Copy the bit-field into a full int for the cmsg payload. */
        acked = skb->wifi_acked;
        put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(acked), &acked);
}
EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
#endif

/* Emit an SO_RXQ_OVFL control message carrying the drop count stored in
 * @skb's control block. Skipped when SOCK_RXQ_OVFL is not set, when @skb
 * is NULL, or when the drop count is zero.
 */
static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
                                   struct sk_buff *skb)
{
        if (!sock_flag(sk, SOCK_RXQ_OVFL) || !skb)
                return;
        if (!SOCK_SKB_CB(skb)->dropcount)
                return;

        put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
                 sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount);
}

/* Emit an SO_MARK control message with @skb's mark when SOCK_RCVMARK is
 * set on @sk and an skb was supplied.
 */
static void sock_recv_mark(struct msghdr *msg, struct sock *sk,
                           struct sk_buff *skb)
{
        __u32 mark;

        if (!sock_flag(sk, SOCK_RCVMARK) || !skb)
                return;

        /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */
        mark = skb->mark;
        put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark);
}

/* Emit an SO_PRIORITY control message with @skb's priority when
 * SOCK_RCVPRIORITY is set on @sk and an skb was supplied.
 */
static void sock_recv_priority(struct msghdr *msg, struct sock *sk,
                               struct sk_buff *skb)
{
        __u32 prio;

        if (!sock_flag(sk, SOCK_RCVPRIORITY) || !skb)
                return;

        /* The value is copied out from a local rather than from the skb. */
        prio = skb->priority;
        put_cmsg(msg, SOL_SOCKET, SO_PRIORITY, sizeof(__u32), &prio);
}

/* Attach the generic receive-side control messages for @skb to @msg, in
 * this order: timestamp, drop count, mark, priority. Each helper decides
 * on its own whether it has anything to emit.
 */
void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
                       struct sk_buff *skb)
{
        sock_recv_timestamp(msg, sk, skb);
        sock_recv_drops(msg, sk, skb);
        sock_recv_mark(msg, sk, skb);
        sock_recv_priority(msg, sk, skb);
}
EXPORT_SYMBOL_GPL(__sock_recv_cmsgs);

/* Forward declarations of the two recvmsg handlers that
 * sock_recvmsg_nosec() names as direct-call candidates.
 */
INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *,
                                           size_t, int));
INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
                                            size_t, int));

/* Out-of-line (noinline) wrapper that fires the sock_recv_length
 * tracepoint with @sk, @ret and @flags passed through unchanged.
 */
static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags)
{
        trace_sock_recv_length(sk, ret, flags);
}

static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                                     int flags)
{
        int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg,
                                     inet6_recvmsg,
                                     inet_recvmsg, sock, msg,
                                     msg_data_left(msg), flags);
        if (trace_sock_recv_length_enabled())
                call_trace_sock_recv_length(sock->sk, ret, flags);
        return ret;
}

/**
 *        sock_recvmsg - receive a message from @sock
 *        @sock: socket
 *        @msg: message to receive
 *        @flags: message flags
 *
 *        Receives @msg from @sock after the LSM receive hook has allowed it.
 *        Returns the total number of bytes received, or an error (including
 *        any non-zero value returned by the LSM hook).
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
        int denied;

        denied = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
        if (denied)
                return denied;

        return sock_recvmsg_nosec(sock, msg, flags);
}
EXPORT_SYMBOL(sock_recvmsg);

/**
 *        kernel_recvmsg - Receive a message from a socket (kernel space)
 *        @sock: The socket to receive the message from
 *        @msg: Received message
 *        @vec: Input s/g array for message data
 *        @num: Size of input s/g array
 *        @size: Number of bytes to read
 *        @flags: Message flags (MSG_DONTWAIT, etc...)
 *
 *        On return the msg structure contains the scatter/gather array passed in the
 *        vec argument. The array is modified so that it consists of the unfilled
 *        portion of the original array.
 *
 *        The returned value is the total number of bytes received, or an error.
 */

int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
                   struct kvec *vec, size_t num, size_t size, int flags)
{
        /* Mark the control buffer as not being a user-space pointer. */
        msg->msg_control_is_user = false;
        /* Point the message iterator at the kvec array as a destination. */
        iov_iter_kvec(&msg->msg_iter, ITER_DEST, vec, num, size);
        return sock_recvmsg(sock, msg, flags);
}
EXPORT_SYMBOL(kernel_recvmsg);

/* splice_read file operation for sockets: use the protocol's splice_read
 * handler when it has one, otherwise fall back to copy_splice_read().
 */
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
                                struct pipe_inode_info *pipe, size_t len,
                                unsigned int flags)
{
        struct socket *sock = file->private_data;
        const struct proto_ops *ops = READ_ONCE(sock->ops);

        if (likely(ops->splice_read))
                return ops->splice_read(sock, ppos, pipe, len, flags);

        return copy_splice_read(file, ppos, pipe, len, flags);
}

/* splice_eof file operation for sockets: forward to the protocol's
 * splice_eof handler if it provides one; otherwise do nothing.
 */
static void sock_splice_eof(struct file *file)
{
        struct socket *sock = file->private_data;
        const struct proto_ops *ops = READ_ONCE(sock->ops);

        if (!ops->splice_eof)
                return;

        ops->splice_eof(sock);
}

/* read_iter file operation for sockets.
 *
 * Rejects a non-zero file position with -ESPIPE, returns 0 for an empty
 * destination, and otherwise receives into @to via sock_recvmsg(). The
 * receive is non-blocking (MSG_DONTWAIT) when the file is O_NONBLOCK or
 * the iocb carries IOCB_NOWAIT. The advanced iterator is copied back to
 * @to before returning the result of the receive.
 */
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct socket *sock = file->private_data;
        struct msghdr msg = {
                .msg_iter = *to,
                .msg_iocb = iocb,
        };
        ssize_t copied;

        if (iocb->ki_pos != 0)
                return -ESPIPE;

        if (!iov_iter_count(to))        /* Match SYS5 behaviour */
                return 0;

        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
                msg.msg_flags = MSG_DONTWAIT;

        copied = sock_recvmsg(sock, &msg, msg.msg_flags);
        *to = msg.msg_iter;

        return copied;
}

/* write_iter file operation for sockets.
 *
 * Rejects a non-zero file position with -ESPIPE, then sends the data in
 * @from via __sock_sendmsg(). Message flags are derived as follows:
 * MSG_DONTWAIT for O_NONBLOCK files or IOCB_NOWAIT requests, MSG_EOR for
 * SOCK_SEQPACKET sockets, MSG_NOSIGNAL for IOCB_NOSIGNAL requests. The
 * advanced iterator is copied back to @from before returning.
 */
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct socket *sock = file->private_data;
        struct msghdr msg = {
                .msg_iter = *from,
                .msg_iocb = iocb,
        };
        ssize_t sent;

        if (iocb->ki_pos != 0)
                return -ESPIPE;

        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
                msg.msg_flags = MSG_DONTWAIT;
        if (sock->type == SOCK_SEQPACKET)
                msg.msg_flags |= MSG_EOR;
        if (iocb->ki_flags & IOCB_NOSIGNAL)
                msg.msg_flags |= MSG_NOSIGNAL;

        sent = __sock_sendmsg(sock, &msg);
        *from = msg.msg_iter;

        return sent;
}

/*
 * Atomic setting of ioctl hooks to avoid race
 * with module unload.
 *
 * br_ioctl_hook is written by brioctl_set() and invoked by br_ioctl_call(),
 * both while holding br_ioctl_mutex.
 */

static DEFINE_MUTEX(br_ioctl_mutex);
static int (*br_ioctl_hook)(struct net *net, unsigned int cmd,
                            void __user *uarg);

/* Install @hook as the bridge ioctl handler. The pointer is stored under
 * br_ioctl_mutex, the same mutex br_ioctl_call() holds while invoking it.
 * @hook is stored as given, without any NULL check.
 */
void brioctl_set(int (*hook)(struct net *net, unsigned int cmd,
                             void __user *uarg))
{
        mutex_lock(&br_ioctl_mutex);
        br_ioctl_hook = hook;
        mutex_unlock(&br_ioctl_mutex);
}
EXPORT_SYMBOL(brioctl_set);

/* Dispatch a bridge ioctl to the registered hook.
 *
 * If no hook is registered yet, loading of the "bridge" module is
 * requested first. The hook is then re-checked and invoked under
 * br_ioctl_mutex. Returns the hook's result, or -ENOPKG when there is
 * still no hook.
 */
int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg)
{
        int ret;

        if (!br_ioctl_hook)
                request_module("bridge");

        mutex_lock(&br_ioctl_mutex);
        ret = br_ioctl_hook ? br_ioctl_hook(net, cmd, uarg) : -ENOPKG;
        mutex_unlock(&br_ioctl_mutex);

        return ret;
}

/* VLAN ioctl hook; written by vlan_ioctl_set() and invoked from
 * sock_ioctl(), both under vlan_ioctl_mutex.
 */
static DEFINE_MUTEX(vlan_ioctl_mutex);
static int (*vlan_ioctl_hook) (struct net *, void __user *arg);

/* Install @hook as the VLAN ioctl handler. The pointer is stored under
 * vlan_ioctl_mutex; @hook is stored as given, without any NULL check.
 */
void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
{
        mutex_lock(&vlan_ioctl_mutex);
        vlan_ioctl_hook = hook;
        mutex_unlock(&vlan_ioctl_mutex);
}
EXPORT_SYMBOL(vlan_ioctl_set);

static long sock_do_ioctl(struct net *net, struct socket *sock,
                          unsigned int cmd, unsigned long arg)
{
        const struct proto_ops *ops = READ_ONCE(sock->ops);
        struct ifreq ifr;
        bool need_copyout;
        int err;
        void __user *argp = (void __user *)arg;
        void __user *data;

        err = ops->ioctl(sock, cmd, arg);

        /*
         * If this ioctl is unknown try to hand it down
         * to the NIC driver.
         */
        if (err != -ENOIOCTLCMD)
                return err;

        if (!is_socket_ioctl_cmd(cmd))
                return -ENOTTY;

        if (get_user_ifreq(&ifr, &data, argp))
                return -EFAULT;
        err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
        if (!err && need_copyout)
                if (put_user_ifreq(&ifr, argp))
                        return -EFAULT;

        return err;
}

/*
 *        With an ioctl, arg may well be a user mode pointer, but we don't know
 *        what to do with it - that's up to the protocol still.
 *
 *        Dispatch order: device-private commands (SIOCDEVPRIVATE range) go
 *        straight to dev_ioctl(); wireless extension commands go to
 *        wext_handle_ioctl() when CONFIG_WEXT_CORE is enabled; everything
 *        else is handled by the switch below, whose default case defers to
 *        sock_do_ioctl().
 */

static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        const struct proto_ops  *ops;
        struct socket *sock;
        struct sock *sk;
        void __user *argp = (void __user *)arg;
        int pid, err;
        struct net *net;

        sock = file->private_data;
        ops = READ_ONCE(sock->ops);
        sk = sock->sk;
        net = sock_net(sk);
        if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
                struct ifreq ifr;
                void __user *data;
                bool need_copyout;
                if (get_user_ifreq(&ifr, &data, argp))
                        return -EFAULT;
                err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
                /* Copy the ifreq back only on success and when requested. */
                if (!err && need_copyout)
                        if (put_user_ifreq(&ifr, argp))
                                return -EFAULT;
        } else
#ifdef CONFIG_WEXT_CORE
        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
                err = wext_handle_ioctl(net, cmd, argp);
        } else
#endif
                switch (cmd) {
                case FIOSETOWN:
                case SIOCSPGRP:
                        /* Set the owner of the socket's file from a
                         * user-supplied int.
                         */
                        err = -EFAULT;
                        if (get_user(pid, (int __user *)argp))
                                break;
                        err = f_setown(sock->file, pid, 1);
                        break;
                case FIOGETOWN:
                case SIOCGPGRP:
                        err = put_user(f_getown(sock->file),
                                       (int __user *)argp);
                        break;
                case SIOCGIFBR:
                case SIOCSIFBR:
                case SIOCBRADDBR:
                case SIOCBRDELBR:
                case SIOCBRADDIF:
                case SIOCBRDELIF:
                        err = br_ioctl_call(net, cmd, argp);
                        break;
                case SIOCGIFVLAN:
                case SIOCSIFVLAN:
                        /* Request the 8021q module if no hook is registered
                         * yet, then call the hook under its mutex; -ENOPKG
                         * if there is still none.
                         */
                        err = -ENOPKG;
                        if (!vlan_ioctl_hook)
                                request_module("8021q");

                        mutex_lock(&vlan_ioctl_mutex);
                        if (vlan_ioctl_hook)
                                err = vlan_ioctl_hook(net, argp);
                        mutex_unlock(&vlan_ioctl_mutex);
                        break;
                case SIOCGSKNS:
                        /* Requires CAP_NET_ADMIN in the user namespace
                         * owning the socket's net namespace.
                         */
                        err = -EPERM;
                        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                                break;

                        err = open_related_ns(&net->ns, get_net_ns);
                        break;
                case SIOCGSTAMP_OLD:
                case SIOCGSTAMPNS_OLD:
                        if (!ops->gettstamp) {
                                err = -ENOIOCTLCMD;
                                break;
                        }
                        /* Third argument is true for the SIOCGSTAMP variant
                         * of the pair; the fourth is true only on
                         * non-64-bit kernels.
                         */
                        err = ops->gettstamp(sock, argp,
                                             cmd == SIOCGSTAMP_OLD,
                                             !IS_ENABLED(CONFIG_64BIT));
                        break;
                case SIOCGSTAMP_NEW:
                case SIOCGSTAMPNS_NEW:
                        if (!ops->gettstamp) {
                                err = -ENOIOCTLCMD;
                                break;
                        }
                        err = ops->gettstamp(sock, argp,
                                             cmd == SIOCGSTAMP_NEW,
                                             false);
                        break;

                case SIOCGIFCONF:
                        err = dev_ifconf(net, argp);
                        break;

                default:
                        err = sock_do_ioctl(net, sock, cmd, arg);
                        break;
                }
        return err;
}

/**
 *        sock_create_lite - creates a socket
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        Allocates a socket, sets its type and runs the LSM create and
 *        post-create hooks; no protocol family create callback is invoked.
 *        The new socket initialization is not complete, see kernel_accept().
 *        Returns 0 or an error. @res is always written: it receives the new
 *        socket on success and %NULL on failure.
 *        This function internally uses GFP_KERNEL.
 */

int sock_create_lite(int family, int type, int protocol, struct socket **res)
{
        struct socket *sock = NULL;
        int err;

        err = security_socket_create(family, type, protocol, 1);
        if (err)
                goto done;

        sock = sock_alloc();
        if (!sock) {
                err = -ENOMEM;
                goto done;
        }

        sock->type = type;
        err = security_socket_post_create(sock, family, type, protocol, 1);
        if (err) {
                /* Drop the half-built socket and report NULL to the caller. */
                sock_release(sock);
                sock = NULL;
        }

done:
        *res = sock;
        return err;
}
EXPORT_SYMBOL(sock_create_lite);

/* No kernel lock held - perfect
 *
 * poll file operation for sockets. Returns 0 when the protocol has no
 * poll handler. Otherwise returns the handler's mask, with POLL_BUSY_LOOP
 * added when the socket is able to busy poll.
 */
static __poll_t sock_poll(struct file *file, poll_table *wait)
{
        struct socket *sock = file->private_data;
        const struct proto_ops *ops = READ_ONCE(sock->ops);
        __poll_t requested = poll_requested_events(wait);
        __poll_t busy_flag = 0;

        if (!ops->poll)
                return 0;

        if (sk_can_busy_loop(sock->sk)) {
                /* poll once if requested by the syscall */
                if (requested & POLL_BUSY_LOOP)
                        sk_busy_loop(sock->sk, 1);

                /* if this socket can poll_ll, tell the system call */
                busy_flag = POLL_BUSY_LOOP;
        }

        return ops->poll(file, sock, wait) | busy_flag;
}

/* mmap file operation for sockets: forwards to the mmap handler of the
 * socket's proto_ops. The handler pointer is called without a NULL check.
 */
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct socket *sock = file->private_data;

        return READ_ONCE(sock->ops)->mmap(file, sock, vma);
}

/* release file operation for sockets: releases the socket that belongs to
 * @inode, passing the inode along to __sock_release(). Always returns 0;
 * @filp is not used.
 */
static int sock_close(struct inode *inode, struct file *filp)
{
        __sock_release(SOCKET_I(inode), inode);
        return 0;
}

/*
 *        Update the socket async list
 *
 *        Fasync_list locking strategy.
 *
 *        1. fasync_list is modified only under process context socket lock
 *           i.e. under semaphore.
 *        2. fasync_list is used under read_lock(&sk->sk_callback_lock)
 *           or under socket lock
 *
 *        Returns -EINVAL if the socket has no sock attached, a negative
 *        error from fasync_helper() if the list update failed, and 0
 *        otherwise.
 */

static int sock_fasync(int fd, struct file *filp, int on)
{
        struct socket *sock = filp->private_data;
        struct sock *sk = sock->sk;
        struct socket_wq *wq = &sock->wq;
        int err;

        if (sk == NULL)
                return -EINVAL;

        lock_sock(sk);
        err = fasync_helper(fd, filp, on, &wq->fasync_list);

        /* Derive SOCK_FASYNC from the list itself, so the flag stays
         * consistent even when fasync_helper() failed.
         */
        if (!wq->fasync_list)
                sock_reset_flag(sk, SOCK_FASYNC);
        else
                sock_set_flag(sk, SOCK_FASYNC);

        release_sock(sk);

        /* Do not hide a failed registration from the caller. Non-negative
         * results from fasync_helper() are still reported as plain 0, as
         * before.
         */
        return err < 0 ? err : 0;
}

/* This function may be called only under rcu_lock
 *
 * Send SIGIO or SIGURG to the async listeners registered on @wq, depending
 * on the wake-up reason @how. Returns -1 when there is no @wq or no
 * listener, 0 otherwise (whether or not a signal was actually sent).
 */

int sock_wake_async(struct socket_wq *wq, int how, int band)
{
        if (!wq || !wq->fasync_list)
                return -1;

        switch (how) {
        case SOCK_WAKE_WAITD:
                /* No signal while SOCKWQ_ASYNC_WAITDATA is set. */
                if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
                        break;
                goto call_kill;
        case SOCK_WAKE_SPACE:
                /* Signal only if SOCKWQ_ASYNC_NOSPACE was set; the bit is
                 * cleared in the same step.
                 */
                if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
                        break;
                fallthrough;
        case SOCK_WAKE_IO:
call_kill:
                kill_fasync(&wq->fasync_list, SIGIO, band);
                break;
        case SOCK_WAKE_URG:
                kill_fasync(&wq->fasync_list, SIGURG, band);
        }

        return 0;
}
EXPORT_SYMBOL(sock_wake_async);

/**
 *        __sock_create - creates a socket
 *        @net: net namespace
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *        @kern: boolean for kernel space sockets
 *
 *        Creates a new socket and assigns it to @res, passing through LSM.
 *        Returns 0 or an error. @res is written only on success; on failure
 *        this function does not store anything to it, so callers must not
 *        rely on its value after an error. @kern must
 *        be set to true if the socket resides in kernel space.
 *        This function internally uses GFP_KERNEL.
 */

int __sock_create(struct net *net, int family, int type, int protocol,
                         struct socket **res, int kern)
{
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;

        /*
         *      Check protocol is in range
         */
        if (family < 0 || family >= NPROTO)
                return -EAFNOSUPPORT;
        if (type < 0 || type >= SOCK_MAX)
                return -EINVAL;

        /* Compatibility.

           This ugly hack was moved from the INET layer to here to avoid
           deadlock in module load.
         */
        if (family == PF_INET && type == SOCK_PACKET) {
                pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                             current->comm);
                family = PF_PACKET;
        }

        err = security_socket_create(family, type, protocol, kern);
        if (err)
                return err;

        /*
         *        Allocate the socket and allow the family to set things up. if
         *        the protocol is 0, the family is instructed to select an appropriate
         *        default.
         */
        sock = sock_alloc();
        if (!sock) {
                net_warn_ratelimited("socket: no more sockets\n");
                return -ENFILE;        /* Not exactly a match, but its the
                                   closest posix thing */
        }

        sock->type = type;

#ifdef CONFIG_MODULES
        /* Attempt to load a protocol module if the find failed.
         *
         * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
         * requested real, full-featured networking support upon configuration.
         * Otherwise module support will break!
         */
        if (rcu_access_pointer(net_families[family]) == NULL)
                request_module("net-pf-%d", family);
#endif

        /* Look the family up under RCU; the read lock is held until a
         * module reference has been taken (or the lookup failed).
         */
        rcu_read_lock();
        pf = rcu_dereference(net_families[family]);
        err = -EAFNOSUPPORT;
        if (!pf)
                goto out_release;

        /*
         * We will call the ->create function, that possibly is in a loadable
         * module, so we have to bump that loadable module refcnt first.
         */
        if (!try_module_get(pf->owner))
                goto out_release;

        /* Now protected by module ref count */
        rcu_read_unlock();

        err = pf->create(net, sock, protocol, kern);
        if (err < 0) {
                /* ->create should release the allocated sock->sk object on error
                 * and make sure sock->sk is set to NULL to avoid use-after-free
                 */
                DEBUG_NET_WARN_ONCE(sock->sk,
                                    "%ps must clear sock->sk on failure, family: %d, type: %d, protocol: %d\n",
                                    pf->create, family, type, protocol);
                goto out_module_put;
        }

        /*
         * Now bump the refcnt of the [loadable] module that owns this
         * socket; it is decremented again at sock_release time.
         */
        if (!try_module_get(sock->ops->owner))
                goto out_module_busy;

        /*
         * Now that we're done with the ->create function, the [loadable]
         * module can have its refcnt decremented
         */
        module_put(pf->owner);
        err = security_socket_post_create(sock, family, type, protocol, kern);
        if (err)
                goto out_sock_release;
        *res = sock;

        return 0;

out_module_busy:
        err = -EAFNOSUPPORT;
out_module_put:
        /* Clear ops before releasing the socket on these two paths. */
        sock->ops = NULL;
        module_put(pf->owner);
out_sock_release:
        sock_release(sock);
        return err;

out_release:
        /* Lookup failed while still inside the RCU read-side section. */
        rcu_read_unlock();
        goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);

/**
 *        sock_create - creates a socket
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        A wrapper around __sock_create().
 *        Returns 0 or an error. This function internally uses GFP_KERNEL.
 */

int sock_create(int family, int type, int protocol, struct socket **res)
{
        /* Use the calling task's net namespace and pass kern == 0. */
        return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);

/**
 *        sock_create_kern - creates a socket (kernel space)
 *        @net: net namespace
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        A wrapper around __sock_create().
 *        Returns 0 or an error. This function internally uses GFP_KERNEL.
 */

int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)
{
        /* Same as __sock_create() with kern fixed to 1. */
        return __sock_create(net, family, type, protocol, res, 1);
}
EXPORT_SYMBOL(sock_create_kern);

/*
 * Check the flag bits carried in @type, strip them and create the socket
 * through sock_create(). Returns the socket or an ERR_PTR() encoded error.
 */
static struct socket *__sys_socket_create(int family, int type, int protocol)
{
        const int allowed_flags = SOCK_CLOEXEC | SOCK_NONBLOCK;
        struct socket *newsock;
        int err;

        /* Check the SOCK_* constants for consistency.  */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        /* Every bit outside the type mask has to be one of the known flags. */
        if ((type & ~SOCK_TYPE_MASK) & ~allowed_flags)
                return ERR_PTR(-EINVAL);

        err = sock_create(family, type & SOCK_TYPE_MASK, protocol, &newsock);
        if (err < 0)
                return ERR_PTR(err);

        return newsock;
}

/*
 * Create a socket and wrap it in a struct file without installing a
 * descriptor. Returns the file or an ERR_PTR() encoded error.
 */
struct file *__sys_socket_file(int family, int type, int protocol)
{
        int file_flags = type & ~SOCK_TYPE_MASK;
        struct socket *newsock;

        newsock = __sys_socket_create(family, type, protocol);
        if (IS_ERR(newsock))
                return ERR_CAST(newsock);

        /* Translate SOCK_NONBLOCK where its value differs from O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (file_flags & SOCK_NONBLOCK)) {
                file_flags &= ~SOCK_NONBLOCK;
                file_flags |= O_NONBLOCK;
        }

        return sock_alloc_file(newsock, file_flags, NULL);
}

/*        A hook for bpf progs to attach to and update socket protocol.
 *
 *        A static noinline declaration here could cause the compiler to
 *        optimize away the function. A global noinline declaration will
 *        keep the definition, but may optimize away the callsite.
 *        Therefore, __weak is needed to ensure that the call is still
 *        emitted, by telling the compiler that we don't know what the
 *        function might eventually be.
 */

__bpf_hook_start();

__weak noinline int update_socket_protocol(int family, int type, int protocol)
{
        /* Default implementation: hand the requested protocol back unchanged. */
        return protocol;
}

__bpf_hook_end();

/*
 * Create a socket and bind it to a new file descriptor.
 * Returns the descriptor or a negative error code.
 */
int __sys_socket(int family, int type, int protocol)
{
        struct socket *newsock;
        int fd_flags;

        /* Run the protocol through the update hook before creating anything. */
        protocol = update_socket_protocol(family, type, protocol);

        newsock = __sys_socket_create(family, type, protocol);
        if (IS_ERR(newsock))
                return PTR_ERR(newsock);

        fd_flags = type & ~SOCK_TYPE_MASK;
        if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK)) {
                fd_flags &= ~SOCK_NONBLOCK;
                fd_flags |= O_NONBLOCK;
        }

        return sock_map_fd(newsock, fd_flags & (O_CLOEXEC | O_NONBLOCK));
}

/* socket system call: forwards its three arguments to __sys_socket(). */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
        return __sys_socket(family, type, protocol);
}

/*
 *        Create a pair of connected sockets.
 */

int __sys_socketpair(int family, int type, int protocol, int __user *usockvec)
{
        struct socket *sock1, *sock2;
        int fd1, fd2, err;
        struct file *newfile1, *newfile2;
        int flags;

        /* Split the flag bits off @type; only CLOEXEC and NONBLOCK are accepted. */
        flags = type & ~SOCK_TYPE_MASK;
        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;
        type &= SOCK_TYPE_MASK;

        /* Translate SOCK_NONBLOCK where its value differs from O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

        /*
         * reserve descriptors and make sure we won't fail
         * to return them to userland.
         */
        fd1 = get_unused_fd_flags(flags);
        if (unlikely(fd1 < 0))
                return fd1;

        fd2 = get_unused_fd_flags(flags);
        if (unlikely(fd2 < 0)) {
                put_unused_fd(fd1);
                return fd2;
        }

        /*
         * The descriptor numbers are written to user space before any socket
         * exists, so a faulting @usockvec is detected while only the two
         * reserved descriptors need to be given back.
         */
        err = put_user(fd1, &usockvec[0]);
        if (err)
                goto out;

        err = put_user(fd2, &usockvec[1]);
        if (err)
                goto out;

        /*
         * Obtain the first socket and check if the underlying protocol
         * supports the socketpair call.
         */

        err = sock_create(family, type, protocol, &sock1);
        if (unlikely(err < 0))
                goto out;

        err = sock_create(family, type, protocol, &sock2);
        if (unlikely(err < 0)) {
                sock_release(sock1);
                goto out;
        }

        err = security_socket_socketpair(sock1, sock2);
        if (unlikely(err)) {
                sock_release(sock2);
                sock_release(sock1);
                goto out;
        }

        /* Let the protocol connect the two sockets to each other. */
        err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2);
        if (unlikely(err < 0)) {
                sock_release(sock2);
                sock_release(sock1);
                goto out;
        }

        newfile1 = sock_alloc_file(sock1, flags, NULL);
        if (IS_ERR(newfile1)) {
                err = PTR_ERR(newfile1);
                /*
                 * NOTE(review): only sock2 is released here; presumably
                 * sock_alloc_file() already disposed of sock1 on failure --
                 * confirm against its definition.
                 */
                sock_release(sock2);
                goto out;
        }

        newfile2 = sock_alloc_file(sock2, flags, NULL);
        if (IS_ERR(newfile2)) {
                err = PTR_ERR(newfile2);
                /*
                 * NOTE(review): sock1 is now reached only through newfile1,
                 * so it is dropped with fput(); sock2 is presumably handled
                 * by the failed sock_alloc_file() -- confirm.
                 */
                fput(newfile1);
                goto out;
        }

        audit_fd_pair(fd1, fd2);

        /* Both files exist: publish them under the reserved descriptors. */
        fd_install(fd1, newfile1);
        fd_install(fd2, newfile2);
        return 0;

out:
        /* Common failure exit: hand back both reserved descriptors. */
        put_unused_fd(fd2);
        put_unused_fd(fd1);
        return err;
}

/* socketpair system call: forwards its arguments to __sys_socketpair(). */
SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
                int __user *, usockvec)
{
        return __sys_socketpair(family, type, protocol, usockvec);
}

/*
 * Bind @sock to an address that already lives in kernel memory: ask the
 * security hook first, then call the protocol's bind operation.
 */
int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
                      int addrlen)
{
        struct sockaddr *addr = (struct sockaddr *)address;
        int ret;

        ret = security_socket_bind(sock, addr, addrlen);
        if (ret)
                return ret;

        return READ_ONCE(sock->ops)->bind(sock, addr, addrlen);
}

/*
 *        Bind a name to a socket. Nothing much to do here since it's
 *        the protocol's responsibility to handle the local address.
 *
 *        We move the socket address to kernel space before we call
 *        the protocol layer (having also checked the address is ok).
 */

int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
        struct sockaddr_storage kaddr;
        struct socket *sock;
        CLASS(fd, f)(fd);
        int ret;

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        /* Copy the user address into kernel memory before binding. */
        ret = move_addr_to_kernel(umyaddr, addrlen, &kaddr);
        if (unlikely(ret))
                return ret;

        return __sys_bind_socket(sock, &kaddr, addrlen);
}

/* bind system call: forwards its arguments to __sys_bind(). */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
        return __sys_bind(fd, umyaddr, addrlen);
}

/*
 *        Perform a listen. Basically, we allow the protocol to do anything
 *        necessary for a listen, and if that works, we mark the socket as
 *        ready for listening.
 */
int __sys_listen_socket(struct socket *sock, int backlog)
{
        int limit = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
        int ret;

        /*
         * Clamp the backlog to the sysctl limit. The comparison is done
         * unsigned, so a negative backlog converts to a large value.
         */
        if ((unsigned int)backlog > limit)
                backlog = limit;

        ret = security_socket_listen(sock, backlog);
        if (ret)
                return ret;

        return READ_ONCE(sock->ops)->listen(sock, backlog);
}

/* Look up the socket behind @fd and put it into listening state. */
int __sys_listen(int fd, int backlog)
{
        CLASS(fd, f)(fd);
        struct socket *sock;

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        return __sys_listen_socket(sock, backlog);
}

/* listen system call: forwards its arguments to __sys_listen(). */
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
        return __sys_listen(fd, backlog);
}

/*
 * Accept a connection on the socket behind @file.
 *
 * A new socket is allocated with the listener's type and ops, wrapped in a
 * struct file created with @flags, and handed to the protocol's accept
 * operation. When @upeer_sockaddr is non-NULL the peer address is copied out
 * to user space through @upeer_sockaddr / @upeer_addrlen.
 *
 * Returns the new file, or an ERR_PTR() encoded error. No file descriptor
 * is installed here.
 */
struct file *do_accept(struct file *file, struct proto_accept_arg *arg,
                       struct sockaddr __user *upeer_sockaddr,
                       int __user *upeer_addrlen, int flags)
{
        struct socket *sock, *newsock;
        struct file *newfile;
        int err, len;
        struct sockaddr_storage address;
        const struct proto_ops *ops;

        sock = sock_from_file(file);
        if (!sock)
                return ERR_PTR(-ENOTSOCK);

        newsock = sock_alloc();
        if (!newsock)
                return ERR_PTR(-ENFILE);
        /* Read the ops pointer once and use that snapshot throughout. */
        ops = READ_ONCE(sock->ops);

        newsock->type = sock->type;
        newsock->ops = ops;

        /*
         * We don't need try_module_get here, as the listening socket (sock)
         * has the protocol module (sock->ops->owner) held.
         */
        __module_get(ops->owner);

        /*
         * NOTE(review): newsock is not released on this failure path;
         * presumably sock_alloc_file() disposes of it when it fails --
         * confirm against its definition.
         */
        newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
        if (IS_ERR(newfile))
                return newfile;

        /* From here on every failure is undone by dropping newfile. */
        err = security_socket_accept(sock, newsock);
        if (err)
                goto out_fd;

        /* Pass the listening file's flags on to the protocol's accept. */
        arg->flags |= sock->file->f_flags;
        err = ops->accept(sock, newsock, arg);
        if (err < 0)
                goto out_fd;

        if (upeer_sockaddr) {
                /*
                 * NOTE(review): the third getname() argument is 2 here, while
                 * the getsockname/getpeername paths pass 0 and 1; its exact
                 * meaning is defined by the protocol -- confirm.
                 */
                len = ops->getname(newsock, (struct sockaddr *)&address, 2);
                if (len < 0) {
                        err = -ECONNABORTED;
                        goto out_fd;
                }
                err = move_addr_to_user(&address,
                                        len, upeer_sockaddr, upeer_addrlen);
                if (err < 0)
                        goto out_fd;
        }

        /* File flags are not inherited via accept(), unlike on other OSes. */
        return newfile;
out_fd:
        fput(newfile);
        return ERR_PTR(err);
}

/*
 * Accept a connection on @file and install the resulting file under a new
 * descriptor. Returns that descriptor or a negative error code.
 */
static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr,
                              int __user *upeer_addrlen, int flags)
{
        struct proto_accept_arg arg = { };
        struct file *accepted;
        int fd;

        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;

        /* Translate SOCK_NONBLOCK where its value differs from O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) {
                flags &= ~SOCK_NONBLOCK;
                flags |= O_NONBLOCK;
        }

        fd = get_unused_fd_flags(flags);
        if (unlikely(fd < 0))
                return fd;

        accepted = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags);
        if (!IS_ERR(accepted)) {
                fd_install(fd, accepted);
                return fd;
        }

        /* Accept failed: give the reserved descriptor back. */
        put_unused_fd(fd);
        return PTR_ERR(accepted);
}

/*
 *        For accept, we attempt to create a new socket, set up the link
 *        with the client, wake up the client, then return the new
 *        connected fd. We collect the address of the connector in kernel
 *        space and move it to user at the very end. This is unclean because
 *        we open the socket then return an error.
 *
 *        1003.1g adds the ability for recvmsg() to query the connection
 *        pending status. We need to add that support in a way that's
 *        clean when we restructure accept as well.
 */

int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
                  int __user *upeer_addrlen, int flags)
{
        CLASS(fd, f)(fd);

        /* With a valid descriptor, the file-based helper does the work. */
        if (!fd_empty(f))
                return __sys_accept4_file(fd_file(f), upeer_sockaddr,
                                          upeer_addrlen, flags);

        return -EBADF;
}

/* accept4 system call: forwards its arguments to __sys_accept4(). */
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
                int __user *, upeer_addrlen, int, flags)
{
        return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags);
}

/* accept system call: same as accept4 with a flags value of 0. */
SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
                int __user *, upeer_addrlen)
{
        return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
}

/*
 *        Attempt to connect to a socket with the server address.  The address
 *        is in user space so we verify it is OK and move it to kernel space.
 *
 *        For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
 *        break bindings
 *
 *        NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
 *        other SEQPACKET protocols that take time to connect() as it doesn't
 *        include the -EINPROGRESS status for such sockets.
 */

int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
                       int addrlen, int file_flags)
{
        struct sockaddr *addr = (struct sockaddr *)address;
        struct socket *sock = sock_from_file(file);
        int ret;

        if (!sock)
                return -ENOTSOCK;

        ret = security_socket_connect(sock, addr, addrlen);
        if (ret)
                return ret;

        /* The protocol sees the file's own flags plus the caller's extras. */
        return READ_ONCE(sock->ops)->connect(sock, addr, addrlen,
                                             sock->file->f_flags | file_flags);
}

int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
        struct sockaddr_storage kaddr;
        CLASS(fd, f)(fd);
        int err;

        if (fd_empty(f))
                return -EBADF;

        /* Copy the user-supplied address into kernel memory first. */
        err = move_addr_to_kernel(uservaddr, addrlen, &kaddr);
        if (err)
                return err;

        return __sys_connect_file(fd_file(f), &kaddr, addrlen, 0);
}

/* connect system call: forwards its arguments to __sys_connect(). */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
                int, addrlen)
{
        return __sys_connect(fd, uservaddr, addrlen);
}

/*
 *        Get the local address ('name') of a socket object. Move the obtained
 *        name to user space.
 */

int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
                      int __user *usockaddr_len)
{
        struct sockaddr_storage kaddr;
        struct socket *sock;
        CLASS(fd, f)(fd);
        int ret, len;

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        ret = security_socket_getsockname(sock);
        if (ret)
                return ret;

        /* getname() yields the address length, or a negative error. */
        len = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&kaddr, 0);
        if (len < 0)
                return len;

        return move_addr_to_user(&kaddr, len, usockaddr, usockaddr_len);
}

/* getsockname system call: forwards its arguments to __sys_getsockname(). */
SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
                int __user *, usockaddr_len)
{
        return __sys_getsockname(fd, usockaddr, usockaddr_len);
}

/*
 *        Get the remote address ('name') of a socket object. Move the obtained
 *        name to user space.
 */

int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
                      int __user *usockaddr_len)
{
        struct sockaddr_storage kaddr;
        struct socket *sock;
        CLASS(fd, f)(fd);
        int ret, len;

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        ret = security_socket_getpeername(sock);
        if (ret)
                return ret;

        /* getname() yields the address length, or a negative error. */
        len = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&kaddr, 1);
        if (len < 0)
                return len;

        return move_addr_to_user(&kaddr, len, usockaddr, usockaddr_len);
}

/* getpeername system call: forwards its arguments to __sys_getpeername(). */
SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
                int __user *, usockaddr_len)
{
        return __sys_getpeername(fd, usockaddr, usockaddr_len);
}

/*
 *        Send a datagram to a given address. We move the address into kernel
 *        space and check the user space data area is readable before invoking
 *        the protocol.
 */
int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
                 struct sockaddr __user *addr,  int addr_len)
{
        struct sockaddr_storage kaddr;
        struct socket *sock;
        struct msghdr msg;
        int ret;

        /* Set up the iterator over the user buffer before touching the fd. */
        ret = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter);
        if (unlikely(ret))
                return ret;

        CLASS(fd, f)(fd);
        if (fd_empty(f))
                return -EBADF;
        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        /* This path carries no control data and no msg_ubuf. */
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_ubuf = NULL;

        if (addr) {
                ret = move_addr_to_kernel(addr, addr_len, &kaddr);
                if (ret < 0)
                        return ret;
                msg.msg_name = (struct sockaddr *)&kaddr;
                msg.msg_namelen = addr_len;
        } else {
                msg.msg_name = NULL;
                msg.msg_namelen = 0;
        }

        /* Strip internal-only flags; a non-blocking file implies DONTWAIT. */
        flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;
        msg.msg_flags = flags;

        return __sock_sendmsg(sock, &msg);
}

/* sendto system call: forwards its arguments to __sys_sendto(). */
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
                unsigned int, flags, struct sockaddr __user *, addr,
                int, addr_len)
{
        return __sys_sendto(fd, buff, len, flags, addr, addr_len);
}

/*
 *        Send a datagram down a socket.
 */

SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
                unsigned int, flags)
{
        /* Same as sendto with a NULL destination address of length 0. */
        return __sys_sendto(fd, buff, len, flags, NULL, 0);
}

/*
 *        Receive a frame from the socket and optionally record the address of the
 *        sender. We verify the buffers are writable and if needed move the
 *        sender address from kernel to user space.
 */
int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
                   struct sockaddr __user *addr, int __user *addr_len)
{
        struct sockaddr_storage kaddr;
        struct msghdr msg = {
                /* Save some cycles and don't copy the address if not needed */
                .msg_name = addr ? (struct sockaddr *)&kaddr : NULL,
        };
        struct socket *sock;
        int ret, copy_err;

        ret = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter);
        if (unlikely(ret))
                return ret;

        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;
        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;

        ret = sock_recvmsg(sock, &msg, flags);
        if (ret < 0 || addr == NULL)
                return ret;

        /* Data was received; a failed address copy-out replaces the result. */
        copy_err = move_addr_to_user(&kaddr, msg.msg_namelen, addr, addr_len);
        if (copy_err < 0)
                return copy_err;

        return ret;
}

/* recvfrom system call: forwards its arguments to __sys_recvfrom(). */
SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
                unsigned int, flags, struct sockaddr __user *, addr,
                int __user *, addr_len)
{
        return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len);
}

/*
 *        Receive a datagram from a socket.
 */

SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
                unsigned int, flags)
{
        /* Same as recvfrom without asking for the sender's address. */
        return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
}

/* Report whether the SOCK_CUSTOM_SOCKOPT bit is set in @sock's flags. */
static bool sock_use_custom_sol_socket(const struct socket *sock)
{
        const unsigned long *sock_flags = &sock->flags;

        return test_bit(SOCK_CUSTOM_SOCKOPT, sock_flags);
}

/*
 * Set a socket option on @sock.
 *
 * The security hook runs first. For non-compat callers the cgroup BPF
 * setsockopt hook follows; a positive result from it ends the call with 0,
 * and it may supply a kernel buffer that replaces @optval. SOL_SOCKET
 * options go to sock_setsockopt() unless the socket has
 * SOCK_CUSTOM_SOCKOPT set; everything else goes to the protocol.
 */
int do_sock_setsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, int optlen)
{
        const struct proto_ops *ops;
        char *kernel_optval = NULL;
        int err;

        if (optlen < 0)
                return -EINVAL;

        err = security_socket_setsockopt(sock, level, optname);
        if (err)
                return err;

        if (!compat)
                err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname,
                                                     optval, &optlen,
                                                     &kernel_optval);
        if (err < 0)
                return err;
        if (err > 0)
                return 0;

        if (kernel_optval)
                optval = KERNEL_SOCKPTR(kernel_optval);

        ops = READ_ONCE(sock->ops);
        if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock))
                err = sock_setsockopt(sock, level, optname, optval, optlen);
        else if (unlikely(!ops->setsockopt))
                err = -EOPNOTSUPP;
        else
                err = ops->setsockopt(sock, level, optname, optval, optlen);

        kfree(kernel_optval);
        return err;
}
EXPORT_SYMBOL(do_sock_setsockopt);

/* Set a socket option. Because we don't know the option lengths we have
 * to pass the user mode parameter for the protocols to sort out.
 */
int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval,
                     int optlen)
{
        struct socket *sock;
        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        /* The option value is passed on as a user-space sockptr. */
        return do_sock_setsockopt(sock, in_compat_syscall(), level, optname,
                                  USER_SOCKPTR(user_optval), optlen);
}

/* setsockopt system call: forwards its arguments to __sys_setsockopt(). */
SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
                char __user *, optval, int, optlen)
{
        return __sys_setsockopt(fd, level, optname, optval, optlen);
}

/*
 * Declaration of tcp_bpf_bypass_getsockopt().
 * NOTE(review): nothing in the nearby code names this function directly;
 * presumably it is referenced from the expansion of
 * BPF_CGROUP_RUN_PROG_GETSOCKOPT() -- confirm.
 */
INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
                                                         int optname));

/*
 * Read a socket option from @sock.
 *
 * SOL_SOCKET options are served by sk_getsockopt(); other levels go to the
 * protocol's getsockopt operation, which only accepts user-space pointers.
 * For non-compat callers the result is then passed through the cgroup BPF
 * getsockopt hook together with the option length read beforehand.
 */
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, sockptr_t optlen)
{
        int max_optlen __maybe_unused = 0;
        const struct proto_ops *ops;
        int ret;

        ret = security_socket_getsockopt(sock, level, optname);
        if (ret)
                return ret;

        /* Record the caller's buffer size before the handler can change it. */
        if (!compat)
                copy_from_sockptr(&max_optlen, optlen, sizeof(int));

        ops = READ_ONCE(sock->ops);
        if (level == SOL_SOCKET) {
                ret = sk_getsockopt(sock->sk, level, optname, optval, optlen);
        } else if (unlikely(!ops->getsockopt)) {
                ret = -EOPNOTSUPP;
        } else {
                /* Kernel pointers cannot be handed to the protocol handler. */
                if (WARN_ONCE(optval.is_kernel || optlen.is_kernel,
                              "Invalid argument type"))
                        return -EOPNOTSUPP;

                ret = ops->getsockopt(sock, level, optname, optval.user,
                                      optlen.user);
        }

        if (compat)
                return ret;

        ret = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
                                             optval, optlen, max_optlen,
                                             ret);
        return ret;
}
EXPORT_SYMBOL(do_sock_getsockopt);

/*
 *        Get a socket option. Because we don't know the option lengths we have
 *        to pass a user mode parameter for the protocols to sort out.
 */
int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
                int __user *optlen)
{
        sockptr_t uval = USER_SOCKPTR(optval);
        sockptr_t ulen = USER_SOCKPTR(optlen);
        bool compat = in_compat_syscall();
        struct socket *sock;
        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        return do_sock_getsockopt(sock, compat, level, optname, uval, ulen);
}

/* getsockopt system call: forwards its arguments to __sys_getsockopt(). */
SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
                char __user *, optval, int __user *, optlen)
{
        return __sys_getsockopt(fd, level, optname, optval, optlen);
}

/*
 *        Shutdown a socket.
 */

/* Shut down @sock: ask the security hook, then call the protocol. */
int __sys_shutdown_sock(struct socket *sock, int how)
{
        int ret = security_socket_shutdown(sock, how);

        if (ret)
                return ret;

        return READ_ONCE(sock->ops)->shutdown(sock, how);
}

/* Look up the socket behind @fd and shut it down as requested by @how. */
int __sys_shutdown(int fd, int how)
{
        struct socket *sock;
        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        return __sys_shutdown_sock(sock, how);
}

/* shutdown system call: forwards its arguments to __sys_shutdown(). */
SYSCALL_DEFINE2(shutdown, int, fd, int, how)
{
        return __sys_shutdown(fd, how);
}

/* A couple of helpful macros for getting the address of the 32/64 bit
 * fields which are the same type (int / unsigned) on our platforms.
 *
 * COMPAT_MSG() is not self-contained: it expands to a reference to a
 * variable named "flags" and to two pointers named <msg> and <msg>_compat,
 * all of which must be in scope where the macro is used. The _compat
 * pointer is chosen when MSG_CMSG_COMPAT is set in flags.
 */
#define COMPAT_MSG(msg, member)        ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
#define COMPAT_NAMELEN(msg)        COMPAT_MSG(msg, msg_namelen)
#define COMPAT_FLAGS(msg)        COMPAT_MSG(msg, msg_flags)

/*
 * Cache of the destination address of the last successful send, used by
 * ____sys_sendmsg() to recognise a repeated destination across the messages
 * of one sendmmsg() call.
 */
struct used_address {
        /* Copy of the destination address. */
        struct sockaddr_storage name;
        /* Length of the address in @name; compared against msg_namelen. */
        unsigned int name_len;
};

/*
 * Fill the kernel message header @kmsg from @msg, a kernel copy of the
 * user's header.
 *
 * When @save_addr is non-NULL the user address pointer is stored there and
 * the address itself is not copied in; otherwise the address is copied into
 * the buffer @kmsg->msg_name already points at. Returns 0 or a negative
 * error code.
 */
int __copy_msghdr(struct msghdr *kmsg,
                  struct user_msghdr *msg,
                  struct sockaddr __user **save_addr)
{
        int ret;

        /* The control buffer is still a user pointer at this stage. */
        kmsg->msg_control_is_user = true;
        kmsg->msg_get_inq = 0;
        kmsg->msg_control_user = msg->msg_control;
        kmsg->msg_controllen = msg->msg_controllen;
        kmsg->msg_flags = msg->msg_flags;

        /* A length without an address pointer counts as no address. */
        kmsg->msg_namelen = msg->msg_name ? msg->msg_namelen : 0;
        if (kmsg->msg_namelen < 0)
                return -EINVAL;

        /* Never accept more than a sockaddr_storage can hold. */
        if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
                kmsg->msg_namelen = sizeof(struct sockaddr_storage);

        if (save_addr)
                *save_addr = msg->msg_name;

        if (!msg->msg_name || !kmsg->msg_namelen) {
                kmsg->msg_name = NULL;
                kmsg->msg_namelen = 0;
        } else if (!save_addr) {
                ret = move_addr_to_kernel(msg->msg_name,
                                          kmsg->msg_namelen,
                                          kmsg->msg_name);
                if (ret < 0)
                        return ret;
        }

        if (msg->msg_iovlen > UIO_MAXIOV)
                return -EMSGSIZE;

        kmsg->msg_iocb = NULL;
        kmsg->msg_ubuf = NULL;
        return 0;
}

/*
 * Copy a user_msghdr from user space, translate it into @kmsg and import
 * its iovec array. The iterator direction is ITER_DEST when @save_addr is
 * given and ITER_SOURCE otherwise. Returns 0 or a negative error code.
 */
static int copy_msghdr_from_user(struct msghdr *kmsg,
                                 struct user_msghdr __user *umsg,
                                 struct sockaddr __user **save_addr,
                                 struct iovec **iov)
{
        struct user_msghdr hdr;
        ssize_t ret;

        if (copy_from_user(&hdr, umsg, sizeof(*umsg)))
                return -EFAULT;

        ret = __copy_msghdr(kmsg, &hdr, save_addr);
        if (ret)
                return ret;

        ret = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE,
                           hdr.msg_iov, hdr.msg_iovlen,
                           UIO_FASTIOV, iov, &kmsg->msg_iter);
        if (ret < 0)
                return ret;

        /* A non-negative import result is reported as plain success. */
        return 0;
}

/*
 * Send the already-populated kernel message header @msg_sys on @sock.
 *
 * Brings the control data into kernel memory, finalises the message flags
 * and sends the message. @used_address, when non-NULL, caches the last
 * destination that was sent to successfully so that a repeated destination
 * can be sent through sock_sendmsg_nosec(). @allowed_msghdr_flags selects
 * which bits of msg_sys->msg_flags are merged into @flags.
 *
 * Returns the result of the send, or a negative error code.
 */
static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
                           unsigned int flags, struct used_address *used_address,
                           unsigned int allowed_msghdr_flags)
{
        unsigned char ctl[sizeof(struct cmsghdr) + 20]
                                __aligned(sizeof(__kernel_size_t));
        /* 20 is size of ipv6_pktinfo */
        unsigned char *ctl_buf = ctl;
        int ctl_len;
        ssize_t err;

        /* -ENOBUFS covers both an oversized length and a failed allocation. */
        err = -ENOBUFS;

        if (msg_sys->msg_controllen > INT_MAX)
                goto out;
        flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
        ctl_len = msg_sys->msg_controllen;
        if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
                err =
                    cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
                                                     sizeof(ctl));
                if (err)
                        goto out;
                /*
                 * NOTE(review): the buffer and length are re-read from
                 * msg_sys, so the helper presumably stores the converted
                 * control data there (ctl or its own allocation) -- confirm.
                 */
                ctl_buf = msg_sys->msg_control;
                ctl_len = msg_sys->msg_controllen;
        } else if (ctl_len) {
                BUILD_BUG_ON(sizeof(struct cmsghdr) !=
                             CMSG_ALIGN(sizeof(struct cmsghdr)));
                /* Use the on-stack buffer when it fits, else allocate. */
                if (ctl_len > sizeof(ctl)) {
                        ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
                        if (ctl_buf == NULL)
                                goto out;
                }
                err = -EFAULT;
                if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len))
                        goto out_freectl;
                /* The control data now lives in kernel memory. */
                msg_sys->msg_control = ctl_buf;
                msg_sys->msg_control_is_user = false;
        }
        flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
        msg_sys->msg_flags = flags;

        if (sock->file->f_flags & O_NONBLOCK)
                msg_sys->msg_flags |= MSG_DONTWAIT;
        /*
         * If this is sendmmsg() and current destination address is same as
         * previously succeeded address, omit asking LSM's decision.
         * used_address->name_len is initialized to UINT_MAX so that the first
         * destination address never matches.
         */
        if (used_address && msg_sys->msg_name &&
            used_address->name_len == msg_sys->msg_namelen &&
            !memcmp(&used_address->name, msg_sys->msg_name,
                    used_address->name_len)) {
                err = sock_sendmsg_nosec(sock, msg_sys);
                goto out_freectl;
        }
        err = __sock_sendmsg(sock, msg_sys);
        /*
         * If this is sendmmsg() and sending to current destination address was
         * successful, remember it.
         */
        if (used_address && err >= 0) {
                used_address->name_len = msg_sys->msg_namelen;
                if (msg_sys->msg_name)
                        memcpy(&used_address->name, msg_sys->msg_name,
                               used_address->name_len);
        }

out_freectl:
        /* Free the control buffer only when it is not the on-stack one. */
        if (ctl_buf != ctl)
                sock_kfree_s(sock->sk, ctl_buf, ctl_len);
out:
        return err;
}

/*
 * Read the message header for a send from user space, using the compat
 * layout when MSG_CMSG_COMPAT is set in @flags. No address pointer is saved
 * on this path. Returns 0 or a negative error code.
 */
static int sendmsg_copy_msghdr(struct msghdr *msg,
                               struct user_msghdr __user *umsg, unsigned flags,
                               struct iovec **iov)
{
        struct compat_msghdr __user *msg_compat;
        int ret;

        if (!(flags & MSG_CMSG_COMPAT)) {
                ret = copy_msghdr_from_user(msg, umsg, NULL, iov);
        } else {
                msg_compat = (struct compat_msghdr __user *) umsg;
                ret = get_compat_msghdr(msg, msg_compat, NULL, iov);
        }

        return ret < 0 ? ret : 0;
}

/*
 * Copy one message header in from user space and send it on @sock.
 * The destination address is staged in an on-stack sockaddr_storage, and
 * the iovec array starts out in an on-stack array of UIO_FASTIOV entries.
 */
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
                         struct msghdr *msg_sys, unsigned int flags,
                         struct used_address *used_address,
                         unsigned int allowed_msghdr_flags)
{
        struct iovec fast_iov[UIO_FASTIOV];
        struct iovec *iov = fast_iov;
        struct sockaddr_storage kaddr;
        ssize_t ret;

        msg_sys->msg_name = &kaddr;

        ret = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov);
        if (ret >= 0) {
                ret = ____sys_sendmsg(sock, msg_sys, flags, used_address,
                                      allowed_msghdr_flags);
                /* iov is only freed once the header copy has succeeded. */
                kfree(iov);
        }

        return ret;
}

/*
 *        BSD sendmsg interface
 */
long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
                        unsigned int flags)
{
        /* Nothing from msg->msg_flags is merged in; no address cache. */
        const unsigned int allowed_msghdr_flags = 0;

        return ____sys_sendmsg(sock, msg, flags, NULL, allowed_msghdr_flags);
}

/*
 * Send one message described by the user-space header @msg on the socket
 * behind @fd. MSG_CMSG_COMPAT in @flags is rejected with -EINVAL when
 * @forbid_cmsg_compat is set.
 */
long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
                   bool forbid_cmsg_compat)
{
        struct socket *sock;
        struct msghdr kmsg;

        if ((flags & MSG_CMSG_COMPAT) && forbid_cmsg_compat)
                return -EINVAL;

        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        return ___sys_sendmsg(sock, msg, &kmsg, flags, NULL, 0);
}

/* sendmsg system call: calls __sys_sendmsg() with MSG_CMSG_COMPAT forbidden. */
SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags)
{
        return __sys_sendmsg(fd, msg, flags, true);
}

/*
 *        Linux sendmmsg interface
 */

/*
 * Send up to @vlen messages from the user-space array @mmsg on the socket
 * behind @fd.
 *
 * Returns the number of messages sent when at least one was sent; otherwise
 * the error of the first failing step (0 when @vlen is 0).  Fails up front
 * with -EINVAL when @forbid_cmsg_compat is set and @flags contains
 * MSG_CMSG_COMPAT, with -EBADF when @fd does not resolve to a file, and with
 * -ENOTSOCK when the file is not a socket.
 */
int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
                   unsigned int flags, bool forbid_cmsg_compat)
{
        int err, datagrams;
        struct socket *sock;
        struct mmsghdr __user *entry;
        struct compat_mmsghdr __user *compat_entry;
        struct msghdr msg_sys;
        struct used_address used_address;
        /* Caller's flags, restored for the final message of the batch. */
        unsigned int oflags = flags;

        if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
                return -EINVAL;

        /* Cap the batch at UIO_MAXIOV entries. */
        if (vlen > UIO_MAXIOV)
                vlen = UIO_MAXIOV;

        datagrams = 0;

        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;
        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        /*
         * NOTE(review): UINT_MAX presumably marks the cached address as
         * "nothing cached yet" - confirm against ____sys_sendmsg().
         */
        used_address.name_len = UINT_MAX;
        /*
         * entry and compat_entry alias the same user array; only the one
         * matching the MSG_CMSG_COMPAT mode is used and advanced below.
         */
        entry = mmsg;
        compat_entry = (struct compat_mmsghdr __user *)mmsg;
        err = 0;
        /* Every message except the last one is sent with MSG_BATCH set. */
        flags |= MSG_BATCH;

        while (datagrams < vlen) {
                /* Last message: go back to the flags the caller passed. */
                if (datagrams == vlen - 1)
                        flags = oflags;

                if (MSG_CMSG_COMPAT & flags) {
                        err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
                                             &msg_sys, flags, &used_address, MSG_EOR);
                        if (err < 0)
                                break;
                        /* Store the non-negative send result in msg_len. */
                        err = __put_user(err, &compat_entry->msg_len);
                        ++compat_entry;
                } else {
                        err = ___sys_sendmsg(sock,
                                             (struct user_msghdr __user *)entry,
                                             &msg_sys, flags, &used_address, MSG_EOR);
                        if (err < 0)
                                break;
                        /* Store the non-negative send result in msg_len. */
                        err = put_user(err, &entry->msg_len);
                        ++entry;
                }

                /* Writing msg_len back to user space failed. */
                if (err)
                        break;
                ++datagrams;
                /* Stop batching once a message still has data left over. */
                if (msg_data_left(&msg_sys))
                        break;
                cond_resched();
        }

        /* We only return an error if no datagrams were able to be sent */
        if (datagrams != 0)
                return datagrams;

        return err;
}

/* sendmmsg system call: forwards to __sys_sendmmsg() with MSG_CMSG_COMPAT forbidden. */
SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags)
{
        const bool forbid_cmsg_compat = true;

        return __sys_sendmmsg(fd, mmsg, vlen, flags, forbid_cmsg_compat);
}

/*
 * Copy a user-space message header into @msg for a receive operation.
 *
 * With MSG_CMSG_COMPAT set in @flags, @umsg is treated as a
 * struct compat_msghdr and imported via get_compat_msghdr(); otherwise
 * copy_msghdr_from_user() is used.  @uaddr and @iov are handed through to
 * the chosen helper.
 *
 * Returns 0 on success or the helper's negative error code.
 */
static int recvmsg_copy_msghdr(struct msghdr *msg,
                               struct user_msghdr __user *umsg, unsigned flags,
                               struct sockaddr __user **uaddr,
                               struct iovec **iov)
{
        ssize_t ret;

        if (flags & MSG_CMSG_COMPAT)
                ret = get_compat_msghdr(msg,
                                        (struct compat_msghdr __user *) umsg,
                                        uaddr, iov);
        else
                ret = copy_msghdr_from_user(msg, umsg, uaddr, iov);

        /* Non-negative helper results are collapsed to plain success. */
        return ret < 0 ? ret : 0;
}

/*
 * Receive one message on @sock into the already-imported header @msg_sys and
 * write the results back to the user-space header @msg.
 *
 * @uaddr: user buffer for the source address, or NULL to skip the copy-out
 * @nosec: when non-zero, sock_recvmsg_nosec() is used instead of
 *         sock_recvmsg()
 *
 * On success returns the non-negative value returned by the receive call.
 * A negative error from the receive call, from move_addr_to_user() or a
 * non-zero result from __put_user() is returned instead.
 *
 * NOTE(review): COMPAT_NAMELEN() and COMPAT_FLAGS() are defined outside this
 * function; they may rely on the locals declared here by name - confirm
 * before renaming anything.
 */
static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys,
                           struct user_msghdr __user *msg,
                           struct sockaddr __user *uaddr,
                           unsigned int flags, int nosec)
{
        struct compat_msghdr __user *msg_compat =
                                        (struct compat_msghdr __user *) msg;
        int __user *uaddr_len = COMPAT_NAMELEN(msg);
        struct sockaddr_storage addr;
        unsigned long cmsg_ptr;
        int len;
        ssize_t err;

        /* The source address is received into the on-stack buffer. */
        msg_sys->msg_name = &addr;
        /* Remember where the control buffer started, to compute its used length later. */
        cmsg_ptr = (unsigned long)msg_sys->msg_control;
        /* Only these two flags are carried into msg_flags before receiving. */
        msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);

        /* We assume all kernel code knows the size of sockaddr_storage */
        msg_sys->msg_namelen = 0;

        /* A socket file opened with O_NONBLOCK implies MSG_DONTWAIT. */
        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;

        if (unlikely(nosec))
                err = sock_recvmsg_nosec(sock, msg_sys, flags);
        else
                err = sock_recvmsg(sock, msg_sys, flags);

        if (err < 0)
                goto out;
        /* Keep the receive result; err is reused for the copy-out steps. */
        len = err;

        if (uaddr != NULL) {
                err = move_addr_to_user(&addr,
                                        msg_sys->msg_namelen, uaddr,
                                        uaddr_len);
                if (err < 0)
                        goto out;
        }
        /* MSG_CMSG_COMPAT is masked out of the flags reported to user space. */
        err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
                         COMPAT_FLAGS(msg));
        if (err)
                goto out;
        /*
         * Report how far msg_control advanced from its starting value,
         * using the header layout that matches the compat mode.
         */
        if (MSG_CMSG_COMPAT & flags)
                err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
                                 &msg_compat->msg_controllen);
        else
                err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
                                 &msg->msg_controllen);
        if (err)
                goto out;
        err = len;
out:
        return err;
}

/*
 * Import the user-space header @msg into @msg_sys and receive one message
 * through ____sys_recvmsg().
 *
 * A failure while importing the header is returned as-is and nothing is
 * received.  Otherwise the result of ____sys_recvmsg() is returned, after
 * kfree() has been called on the iovec pointer that recvmsg_copy_msghdr()
 * left behind.
 */
static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
                         struct msghdr *msg_sys, unsigned int flags, int nosec)
{
        struct iovec fast_iov[UIO_FASTIOV];
        struct iovec *iov = fast_iov;
        /* user mode address pointer, filled in by recvmsg_copy_msghdr() */
        struct sockaddr __user *user_addr;
        ssize_t ret;

        ret = recvmsg_copy_msghdr(msg_sys, msg, flags, &user_addr, &iov);
        if (ret >= 0) {
                ret = ____sys_recvmsg(sock, msg_sys, msg, user_addr, flags,
                                      nosec);
                kfree(iov);
        }

        return ret;
}

/*
 *        BSD recvmsg interface
 */

long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
                        struct user_msghdr __user *umsg,
                        struct sockaddr __user *uaddr, unsigned int flags)
{
        /*
         * The header is already a kernel msghdr; receive through
         * ____sys_recvmsg() with nosec == 0, i.e. via sock_recvmsg().
         */
        const int nosec = 0;

        return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, nosec);
}

/*
 * Look up the socket behind @fd and receive one message into the user-space
 * header @msg.
 *
 * Returns -EINVAL when @forbid_cmsg_compat is set and @flags contains
 * MSG_CMSG_COMPAT, -EBADF when @fd does not resolve to a file, -ENOTSOCK
 * when the file is not a socket, and otherwise whatever ___sys_recvmsg()
 * returns.
 */
long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
                   bool forbid_cmsg_compat)
{
        struct socket *sock;
        struct msghdr kmsg;

        if ((flags & MSG_CMSG_COMPAT) && forbid_cmsg_compat)
                return -EINVAL;

        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;

        sock = sock_from_file(fd_file(f));
        if (unlikely(sock == NULL))
                return -ENOTSOCK;

        return ___sys_recvmsg(sock, msg, &kmsg, flags, 0);
}

/* recvmsg system call: forwards to __sys_recvmsg() with MSG_CMSG_COMPAT forbidden. */
SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
                unsigned int, flags)
{
        const bool forbid_cmsg_compat = true;

        return __sys_recvmsg(fd, msg, flags, forbid_cmsg_compat);
}

/*
 *     Linux recvmmsg interface
 */

/*
 * Receive up to @vlen messages on the socket behind @fd into the user-space
 * array @mmsg.
 *
 * @timeout: optional kernel-side timeout.  When non-NULL it is converted to
 *           an end time first and, after every received message, rewritten
 *           with the time remaining (clamped to 0/0 once it has passed).
 *
 * Returns the number of messages received when the loop ended without an
 * error or when at least one message was received; otherwise the error.
 * When an error other than -EAGAIN follows at least one received message,
 * it is stored in sock->sk->sk_err and the message count is returned.
 */
static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
                          unsigned int vlen, unsigned int flags,
                          struct timespec64 *timeout)
{
        int err = 0, datagrams;
        struct socket *sock;
        struct mmsghdr __user *entry;
        struct compat_mmsghdr __user *compat_entry;
        struct msghdr msg_sys;
        struct timespec64 end_time;
        struct timespec64 timeout64;

        /* A timeout that cannot be converted to an end time is invalid. */
        if (timeout &&
            poll_select_set_timeout(&end_time, timeout->tv_sec,
                                    timeout->tv_nsec))
                return -EINVAL;

        datagrams = 0;

        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;
        sock = sock_from_file(fd_file(f));
        if (unlikely(!sock))
                return -ENOTSOCK;

        /* Unless the error queue is being read, report a pending socket error first. */
        if (likely(!(flags & MSG_ERRQUEUE))) {
                err = sock_error(sock->sk);
                if (err)
                        return err;
        }

        /*
         * entry and compat_entry alias the same user array; only the one
         * matching the MSG_CMSG_COMPAT mode is used and advanced below.
         */
        entry = mmsg;
        compat_entry = (struct compat_mmsghdr __user *)mmsg;

        while (datagrams < vlen) {
                /*
                 * No need to ask LSM for more than the first datagram.
                 * (datagrams is passed as the nosec argument, so it is
                 * non-zero from the second message on.)
                 */
                if (MSG_CMSG_COMPAT & flags) {
                        err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry,
                                             &msg_sys, flags & ~MSG_WAITFORONE,
                                             datagrams);
                        if (err < 0)
                                break;
                        /* Store the non-negative receive result in msg_len. */
                        err = __put_user(err, &compat_entry->msg_len);
                        ++compat_entry;
                } else {
                        err = ___sys_recvmsg(sock,
                                             (struct user_msghdr __user *)entry,
                                             &msg_sys, flags & ~MSG_WAITFORONE,
                                             datagrams);
                        if (err < 0)
                                break;
                        /* Store the non-negative receive result in msg_len. */
                        err = put_user(err, &entry->msg_len);
                        ++entry;
                }

                /* Writing msg_len back to user space failed. */
                if (err)
                        break;
                ++datagrams;

                /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */
                if (flags & MSG_WAITFORONE)
                        flags |= MSG_DONTWAIT;

                if (timeout) {
                        /* Recompute the time left until end_time. */
                        ktime_get_ts64(&timeout64);
                        *timeout = timespec64_sub(end_time, timeout64);
                        /* Already past the end time: report zero time left. */
                        if (timeout->tv_sec < 0) {
                                timeout->tv_sec = timeout->tv_nsec = 0;
                                break;
                        }

                        /* Timeout, return less than vlen datagrams */
                        if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
                                break;
                }

                /* Out of band data, return right away */
                if (msg_sys.msg_flags & MSG_OOB)
                        break;
                cond_resched();
        }

        if (err == 0)
                return datagrams;

        if (datagrams == 0)
                return err;

        /*
         * We may return fewer entries than requested (vlen) if the
         * sock is non-blocking and there aren't enough datagrams...
         */
        if (err != -EAGAIN) {
                /*
                 * ... or if recvmsg returns an error after we
                 * received some datagrams, where we record the
                 * error to return on the next call or if the
                 * app asks about it using getsockopt(SO_ERROR).
                 */
                WRITE_ONCE(sock->sk->sk_err, -err);
        }
        return datagrams;
}

/*
 * Common back end of the recvmmsg entry points.
 *
 * At most one of @timeout (64-bit layout) and @timeout32 (old 32-bit layout)
 * is expected to be non-NULL.  A supplied timeout is read into a kernel
 * timespec64, handed to do_recvmmsg(), and - when at least one message was
 * received - written back in the same layout it came in.
 *
 * Returns the result of do_recvmmsg(), or -EFAULT when a timeout could not
 * be read from or written back to user space.
 */
int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
                   unsigned int vlen, unsigned int flags,
                   struct __kernel_timespec __user *timeout,
                   struct old_timespec32 __user *timeout32)
{
        struct timespec64 ts;
        int ret;

        if (timeout && get_timespec64(&ts, timeout))
                return -EFAULT;

        if (timeout32 && get_old_timespec32(&ts, timeout32))
                return -EFAULT;

        /* No timeout supplied in either layout. */
        if (!(timeout || timeout32))
                return do_recvmmsg(fd, mmsg, vlen, flags, NULL);

        ret = do_recvmmsg(fd, mmsg, vlen, flags, &ts);
        if (ret <= 0)
                return ret;

        /* Hand the updated timeout back to the caller. */
        if (timeout && put_timespec64(&ts, timeout))
                ret = -EFAULT;

        if (timeout32 && put_old_timespec32(&ts, timeout32))
                ret = -EFAULT;

        return ret;
}

/*
 * recvmmsg system call taking a struct __kernel_timespec timeout.
 * MSG_CMSG_COMPAT in @flags is rejected with -EINVAL.
 */
SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags,
                struct __kernel_timespec __user *, timeout)
{
        if (!(flags & MSG_CMSG_COMPAT))
                return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL);

        return -EINVAL;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * recvmmsg system call variant taking a struct old_timespec32 timeout.
 * MSG_CMSG_COMPAT in @flags is rejected with -EINVAL.
 */
SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags,
                struct old_timespec32 __user *, timeout)
{
        if (!(flags & MSG_CMSG_COMPAT))
                return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout);

        return -EINVAL;
}
#endif

#ifdef __ARCH_WANT_SYS_SOCKETCALL
/*
 * Argument list sizes for sys_socketcall.
 *
 * nargs[] is indexed by the socketcall call number.  Each entry is the size
 * in bytes of that call's argument block, i.e. the argument count times
 * sizeof(unsigned long) as computed by AL().  Entry 0 is AL(0).
 */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nargs[21] = {
        AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
        AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
        AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
        AL(4), AL(5), AL(4)
};

/* AL() is only needed to build the table above. */
#undef AL

/*
 *        System call vectors.
 *
 *        Multiplexes the individual socket operations behind one system
 *        call: @call selects the operation and @args points to a user-space
 *        array of unsigned long arguments for it.
 *
 *        Returns -EINVAL for a call number outside 1..SYS_SENDMMSG or an
 *        argument block larger than the local buffer, -EFAULT when the
 *        arguments cannot be copied in, a non-zero audit_socketcall()
 *        result, or otherwise the result of the selected operation.
 *
 *        Argument checking cleaned up. Saved 20% in size.
 *        This function doesn't need to set the kernel lock because
 *        it is set by the callees.
 */

SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
        unsigned long a[AUDITSC_ARGS];
        unsigned long a0, a1;
        int err;
        unsigned int len;

        if (call < 1 || call > SYS_SENDMMSG)
                return -EINVAL;
        /* Clamp the index under speculation before it is used on nargs[]. */
        call = array_index_nospec(call, SYS_SENDMMSG + 1);

        /* Size in bytes of this call's argument block. */
        len = nargs[call];
        if (len > sizeof(a))
                return -EINVAL;

        /* copy_from_user should be SMP safe. */
        if (copy_from_user(a, args, len))
                return -EFAULT;

        /* audit_socketcall() takes the argument count, not the byte size. */
        err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
        if (err)
                return err;

        a0 = a[0];
        a1 = a[1];

        /* Dispatch to the in-kernel helper for the selected operation. */
        switch (call) {
        case SYS_SOCKET:
                err = __sys_socket(a0, a1, a[2]);
                break;
        case SYS_BIND:
                err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
                break;
        case SYS_CONNECT:
                err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
                break;
        case SYS_LISTEN:
                err = __sys_listen(a0, a1);
                break;
        case SYS_ACCEPT:
                /* Plain accept is accept4 with flags == 0. */
                err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                                    (int __user *)a[2], 0);
                break;
        case SYS_GETSOCKNAME:
                err =
                    __sys_getsockname(a0, (struct sockaddr __user *)a1,
                                      (int __user *)a[2]);
                break;
        case SYS_GETPEERNAME:
                err =
                    __sys_getpeername(a0, (struct sockaddr __user *)a1,
                                      (int __user *)a[2]);
                break;
        case SYS_SOCKETPAIR:
                err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
                break;
        case SYS_SEND:
                /* send is sendto without a destination address. */
                err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                                   NULL, 0);
                break;
        case SYS_SENDTO:
                err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                                   (struct sockaddr __user *)a[4], a[5]);
                break;
        case SYS_RECV:
                /* recv is recvfrom without a source address buffer. */
                err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                                     NULL, NULL);
                break;
        case SYS_RECVFROM:
                err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                                     (struct sockaddr __user *)a[4],
                                     (int __user *)a[5]);
                break;
        case SYS_SHUTDOWN:
                err = __sys_shutdown(a0, a1);
                break;
        case SYS_SETSOCKOPT:
                err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3],
                                       a[4]);
                break;
        case SYS_GETSOCKOPT:
                err =
                    __sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                                     (int __user *)a[4]);
                break;
        case SYS_SENDMSG:
                err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1,
                                    a[2], true);
                break;
        case SYS_SENDMMSG:
                err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2],
                                     a[3], true);
                break;
        case SYS_RECVMSG:
                err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1,
                                    a[2], true);
                break;
        case SYS_RECVMMSG:
                /*
                 * The timeout pointer is interpreted as a
                 * __kernel_timespec on 64-bit kernels and as an
                 * old_timespec32 otherwise.
                 */
                if (IS_ENABLED(CONFIG_64BIT))
                        err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                                             a[2], a[3],
                                             (struct __kernel_timespec __user *)a[4],
                                             NULL);
                else
                        err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                                             a[2], a[3], NULL,
                                             (struct old_timespec32 __user *)a[4]);
                break;
        case SYS_ACCEPT4:
                err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                                    (int __user *)a[2], a[3]);
                break;
        default:
                err = -EINVAL;
                break;
        }
        return err;
}

#endif                                /* __ARCH_WANT_SYS_SOCKETCALL */

/**
 *        sock_register - add a socket protocol handler
 *        @ops: description of protocol
 *
 *        This function is called by a protocol handler that wants to
 *        advertise its address family, and have it linked into the
 *        socket interface. The value ops->family corresponds to the
 *        socket system call protocol family.
 *
 *        Returns 0 on success, -ENOBUFS if ops->family is not below
 *        NPROTO, or -EEXIST if a handler is already registered for that
 *        family. The "Registered" message is only logged on success.
 */
int sock_register(const struct net_proto_family *ops)
{
        int err;

        if (ops->family >= NPROTO) {
                pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
                return -ENOBUFS;
        }

        /* net_family_lock serialises updates of the net_families[] slots. */
        spin_lock(&net_family_lock);
        if (rcu_dereference_protected(net_families[ops->family],
                                      lockdep_is_held(&net_family_lock)))
                err = -EEXIST;
        else {
                rcu_assign_pointer(net_families[ops->family], ops);
                err = 0;
        }
        spin_unlock(&net_family_lock);

        /*
         * Only announce the family when the slot was actually claimed;
         * logging "Registered" on -EEXIST would be misleading.
         */
        if (!err)
                pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]);
        return err;
}
EXPORT_SYMBOL(sock_register);

/**
 *        sock_unregister - remove a protocol handler
 *        @family: protocol family to remove
 *
 *        Called by a protocol handler that wants to withdraw its address
 *        family so that it is no longer used for new socket creation.
 *        The family's slot is cleared under net_family_lock and the
 *        function waits for an RCU grace period before logging the
 *        removal. An out-of-range @family triggers BUG_ON().
 *
 *        If protocol handler is a module, then it can use module reference
 *        counts to protect against new references. If protocol handler is not
 *        a module then it needs to provide its own protection in
 *        the ops->create routine.
 */
void sock_unregister(int family)
{
        const bool valid_family = family >= 0 && family < NPROTO;

        BUG_ON(!valid_family);

        spin_lock(&net_family_lock);
        RCU_INIT_POINTER(net_families[family], NULL);
        spin_unlock(&net_family_lock);

        synchronize_rcu();

        pr_info("NET: Unregistered %s protocol family\n", pf_family_names[family]);
}
EXPORT_SYMBOL(sock_unregister);

/*
 * Report whether a protocol handler is currently registered for @family.
 *
 * @family is signed, so both bounds are checked before it is used as an
 * index: a negative value must not reach net_families[].  Out-of-range
 * families are reported as not registered.
 */
bool sock_is_registered(int family)
{
        return family >= 0 && family < NPROTO &&
               rcu_access_pointer(net_families[family]);
}

/*
 * Boot-time setup of the socket layer, run as a core initcall.
 *
 * Steps, in order: network sysctl infrastructure, skbuff cache, socket
 * inode cache, registration and kernel mount of sock_fs_type, netfilter
 * (only with CONFIG_NETFILTER) and the PTP classifier.
 *
 * Returns 0 on success or the error of the first failing step.  A failed
 * kern_mount() unregisters the filesystem again; a failed netfilter_init()
 * returns its error without undoing the earlier steps.
 */
static int __init sock_init(void)
{
        int rc;

        /* Initialize the network sysctl infrastructure. */
        rc = net_sysctl_init();
        if (rc)
                return rc;

        /* Initialize skbuff SLAB cache. */
        skb_init();

        /* Initialize the protocols module. */
        init_inodecache();

        rc = register_filesystem(&sock_fs_type);
        if (rc)
                return rc;

        sock_mnt = kern_mount(&sock_fs_type);
        if (IS_ERR(sock_mnt)) {
                rc = PTR_ERR(sock_mnt);
                unregister_filesystem(&sock_fs_type);
                return rc;
        }

        /* The real protocol initialization is performed in later initcalls. */

#ifdef CONFIG_NETFILTER
        rc = netfilter_init();
        if (rc)
                return rc;
#endif

        ptp_classifier_init();

        return rc;
}

core_initcall(sock_init);        /* early initcall */

#ifdef CONFIG_PROC_FS
/*
 * Print the "sockets: used N" line to @seq, where N is what
 * sock_inuse_get() reports for seq->private.
 */
void socket_seq_show(struct seq_file *seq)
{
        int in_use = sock_inuse_get(seq->private);

        seq_printf(seq, "sockets: used %d\n", in_use);
}
#endif                                /* CONFIG_PROC_FS */

/* Handle the fact that while struct ifreq has the same *layout* on
 * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
 * which are handled elsewhere, it still has different *size* due to
 * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
 * resulting in struct ifreq being 32 and 40 bytes respectively).
 * As a result, if the struct happens to be at the end of a page and
 * the next page isn't readable/writable, we get a fault. To prevent
 * that, copy back and forth to the full size.
 */
/*
 * Copy a struct ifreq from user space at @arg into @ifr.
 *
 * In a compat syscall only sizeof(struct compat_ifreq) bytes are read and
 * the rest of @ifr is zeroed first.  When @ifrdata is non-NULL it receives
 * the ifr_data pointer, converted with compat_ptr() in the compat case.
 *
 * Returns 0 on success or -EFAULT when the user copy fails.
 */
int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg)
{
        struct compat_ifreq *ifr32;

        if (!in_compat_syscall()) {
                /* Native layout: copy the structure at its full size. */
                if (copy_from_user(ifr, arg, sizeof(*ifr)))
                        return -EFAULT;

                if (ifrdata)
                        *ifrdata = ifr->ifr_data;

                return 0;
        }

        /* Compat layout: the smaller structure is read into the start of @ifr. */
        ifr32 = (struct compat_ifreq *)ifr;

        memset(ifr, 0, sizeof(*ifr));
        if (copy_from_user(ifr32, arg, sizeof(*ifr32)))
                return -EFAULT;

        if (ifrdata)
                *ifrdata = compat_ptr(ifr32->ifr_data);

        return 0;
}
EXPORT_SYMBOL(get_user_ifreq);

/*
 * Copy @ifr back to user space at @arg.
 *
 * sizeof(struct compat_ifreq) bytes are written in a compat syscall,
 * sizeof(struct ifreq) bytes otherwise.  Returns 0 on success or -EFAULT
 * when the user copy fails.
 */
int put_user_ifreq(struct ifreq *ifr, void __user *arg)
{
        size_t nbytes = in_compat_syscall() ? sizeof(struct compat_ifreq)
                                            : sizeof(*ifr);

        return copy_to_user(arg, ifr, nbytes) ? -EFAULT : 0;
}
EXPORT_SYMBOL(put_user_ifreq);

#ifdef CONFIG_COMPAT
/*
 * Compat handler for SIOCWANDEV.
 *
 * The ifreq is read from @uifr32 and the 32-bit ifs_ifsu pointer is widened
 * with compat_ptr() before dev_ioctl() is called.  When dev_ioctl()
 * succeeds, the pointer value held before the widening is put back and the
 * ifreq is written to user space again.
 *
 * Returns -EFAULT on a failed user access, otherwise the dev_ioctl() result.
 */
static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
{
        struct ifreq kifr;
        compat_uptr_t user_ptr32;
        void __user *orig_ptr;
        int ret;

        if (get_user_ifreq(&kifr, NULL, uifr32))
                return -EFAULT;

        if (get_user(user_ptr32, &uifr32->ifr_settings.ifs_ifsu))
                return -EFAULT;

        orig_ptr = kifr.ifr_settings.ifs_ifsu.raw_hdlc;
        kifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(user_ptr32);

        ret = dev_ioctl(net, SIOCWANDEV, &kifr, NULL, NULL);
        if (ret)
                return ret;

        kifr.ifr_settings.ifs_ifsu.raw_hdlc = orig_ptr;
        if (put_user_ifreq(&kifr, uifr32))
                return -EFAULT;

        return 0;
}

/*
 * Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted.
 *
 * Returns -ENOTTY when @cmd is not a socket ioctl command, -EFAULT when the
 * ifreq cannot be read from user space, and otherwise the dev_ioctl() result.
 */
static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
                                 struct compat_ifreq __user *u_ifreq32)
{
        struct ifreq kifr;
        void __user *udata;

        if (!is_socket_ioctl_cmd(cmd))
                return -ENOTTY;

        if (get_user_ifreq(&kifr, &udata, u_ifreq32))
                return -EFAULT;

        /* Store the data pointer reported by get_user_ifreq() in the ifreq. */
        kifr.ifr_data = udata;

        return dev_ioctl(net, cmd, &kifr, udata, NULL);
}

/*
 * Route a compat socket ioctl to the handler that fits its argument layout.
 *
 * Returns the result of the chosen handler, or -ENOIOCTLCMD when @cmd is not
 * listed here (or, for the old timestamp ioctls, when the socket's ops have
 * no gettstamp callback).
 */
static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
                         unsigned int cmd, unsigned long arg)
{
        /* @arg converted from a compat pointer value. */
        void __user *argp = compat_ptr(arg);
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        const struct proto_ops *ops;

        /* The 16 device-private ioctls go to sock_ioctl() with the converted pointer. */
        if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
                return sock_ioctl(file, cmd, (unsigned long)argp);

        switch (cmd) {
        case SIOCWANDEV:
                return compat_siocwandev(net, argp);
        case SIOCGSTAMP_OLD:
        case SIOCGSTAMPNS_OLD:
                /* Old timestamp ioctls are served by the protocol's gettstamp hook. */
                ops = READ_ONCE(sock->ops);
                if (!ops->gettstamp)
                        return -ENOIOCTLCMD;
                return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD,
                                      !COMPAT_USE_64BIT_TIME);

        /* Commands that only need the ifreq and its ifr_data pointer converted. */
        case SIOCETHTOOL:
        case SIOCBONDSLAVEINFOQUERY:
        case SIOCBONDINFOQUERY:
        case SIOCSHWTSTAMP:
        case SIOCGHWTSTAMP:
                return compat_ifr_data_ioctl(net, cmd, argp);

        /* Commands forwarded to sock_ioctl() with @arg unchanged. */
        case FIOSETOWN:
        case SIOCSPGRP:
        case FIOGETOWN:
        case SIOCGPGRP:
        case SIOCBRADDBR:
        case SIOCBRDELBR:
        case SIOCBRADDIF:
        case SIOCBRDELIF:
        case SIOCGIFVLAN:
        case SIOCSIFVLAN:
        case SIOCGSKNS:
        case SIOCGSTAMP_NEW:
        case SIOCGSTAMPNS_NEW:
        case SIOCGIFCONF:
        case SIOCSIFBR:
        case SIOCGIFBR:
                return sock_ioctl(file, cmd, arg);

        /* Commands forwarded to sock_do_ioctl() with @arg unchanged. */
        case SIOCGIFFLAGS:
        case SIOCSIFFLAGS:
        case SIOCGIFMAP:
        case SIOCSIFMAP:
        case SIOCGIFMETRIC:
        case SIOCSIFMETRIC:
        case SIOCGIFMTU:
        case SIOCSIFMTU:
        case SIOCGIFMEM:
        case SIOCSIFMEM:
        case SIOCGIFHWADDR:
        case SIOCSIFHWADDR:
        case SIOCADDMULTI:
        case SIOCDELMULTI:
        case SIOCGIFINDEX:
        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCSIFHWBROADCAST:
        case SIOCDIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCSIFPFLAGS:
        case SIOCGIFPFLAGS:
        case SIOCGIFTXQLEN:
        case SIOCSIFTXQLEN:
        case SIOCGIFNAME:
        case SIOCSIFNAME:
        case SIOCGMIIPHY:
        case SIOCGMIIREG:
        case SIOCSMIIREG:
        case SIOCBONDENSLAVE:
        case SIOCBONDRELEASE:
        case SIOCBONDSETHWADDR:
        case SIOCBONDCHANGEACTIVE:
        case SIOCSARP:
        case SIOCGARP:
        case SIOCDARP:
        case SIOCOUTQ:
        case SIOCOUTQNSD:
        case SIOCATMARK:
                return sock_do_ioctl(net, sock, cmd, arg);
        }

        /* Not a command this translation layer knows about. */
        return -ENOIOCTLCMD;
}

/*
 * Compat ioctl entry point for socket files.
 *
 * Handlers are tried in order for as long as the result is -ENOIOCTLCMD:
 * the protocol's own compat_ioctl hook (if present), the wireless-extension
 * handler for commands in SIOCIWFIRST..SIOCIWLAST, and finally
 * compat_sock_ioctl_trans().
 */
static long compat_sock_ioctl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct socket *sock = file->private_data;
        const struct proto_ops *ops = READ_ONCE(sock->ops);
        struct net *net = sock_net(sock->sk);
        int ret;

        ret = ops->compat_ioctl ? ops->compat_ioctl(sock, cmd, arg)
                                : -ENOIOCTLCMD;

        if (ret == -ENOIOCTLCMD && cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
                ret = compat_wext_handle_ioctl(net, cmd, arg);

        if (ret == -ENOIOCTLCMD)
                ret = compat_sock_ioctl_trans(file, sock, cmd, arg);

        return ret;
}
#endif

/**
 *        kernel_bind - bind an address to a socket (kernel space)
 *        @sock: socket
 *        @addr: address
 *        @addrlen: length of address
 *
 *        The address is copied into a local sockaddr_storage before it is
 *        handed to the protocol's bind operation, so @addrlen must be
 *        between 0 and sizeof(struct sockaddr_storage).
 *
 *        Returns 0 or an error; -EINVAL if @addrlen is out of range.
 */
int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
{
        struct sockaddr_storage address;

        /* Refuse lengths that would overrun the on-stack copy below. */
        if (addrlen < 0 || (size_t)addrlen > sizeof(address))
                return -EINVAL;

        memcpy(&address, addr, addrlen);

        return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address,
                                          addrlen);
}
EXPORT_SYMBOL(kernel_bind);

/**
 *        kernel_listen - move socket to listening state (kernel space)
 *        @sock: socket
 *        @backlog: pending connections queue size
 *
 *        Returns 0 or an error.
 */

int kernel_listen(struct socket *sock, int backlog)
{
        return READ_ONCE(sock->ops)->listen(sock, backlog);
}
EXPORT_SYMBOL(kernel_listen);

/**
 *        kernel_accept - accept a connection (kernel space)
 *        @sock: listening socket
 *        @newsock: new connected socket
 *        @flags: flags
 *
 *        @flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0.
 *        If it fails, @newsock is guaranteed to be %NULL.
 *        Returns 0 or an error.
 *
 *        On success the new socket is given the listening socket's
 *        proto_ops and a reference on the module that owns them is taken.
 */
int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
{
        const struct proto_ops *ops = READ_ONCE(sock->ops);
        struct sock *listen_sk = sock->sk;
        struct proto_accept_arg arg = {
                .flags = flags,
                .kern = true,
        };
        int ret;

        ret = sock_create_lite(listen_sk->sk_family, listen_sk->sk_type,
                               listen_sk->sk_protocol, newsock);
        if (ret < 0)
                return ret;

        ret = ops->accept(sock, *newsock, &arg);
        if (ret < 0) {
                /* Drop the half-built socket and leave nothing dangling. */
                sock_release(*newsock);
                *newsock = NULL;
                return ret;
        }

        (*newsock)->ops = ops;
        __module_get(ops->owner);

        return ret;
}
EXPORT_SYMBOL(kernel_accept);

/**
 *        kernel_connect - connect a socket (kernel space)
 *        @sock: socket
 *        @addr: address
 *        @addrlen: address length
 *        @flags: flags (O_NONBLOCK, ...)
 *
 *        For datagram sockets, @addr is the address to which datagrams are sent
 *        by default, and the only address from which datagrams are received.
 *        For stream sockets, attempts to connect to @addr.
 *
 *        The address is copied into a local sockaddr_storage before it is
 *        handed to the protocol's connect operation, so @addrlen must be
 *        between 0 and sizeof(struct sockaddr_storage).
 *
 *        Returns 0 or an error code; -EINVAL if @addrlen is out of range.
 */
int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
                   int flags)
{
        struct sockaddr_storage address;

        /* Refuse lengths that would overrun the on-stack copy below. */
        if (addrlen < 0 || (size_t)addrlen > sizeof(address))
                return -EINVAL;

        memcpy(&address, addr, addrlen);

        return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address,
                                             addrlen, flags);
}
EXPORT_SYMBOL(kernel_connect);

/**
 *        kernel_getsockname - get the address which the socket is bound (kernel space)
 *        @sock: socket
 *        @addr: address holder
 *
 *         Fills the @addr pointer with the address which the socket is bound.
 *        Returns the length of the address in bytes or an error code.
 */

int kernel_getsockname(struct socket *sock, struct sockaddr *addr)
{
        return READ_ONCE(sock->ops)->getname(sock, addr, 0);
}
EXPORT_SYMBOL(kernel_getsockname);

/**
 *        kernel_getpeername - get the address which the socket is connected (kernel space)
 *        @sock: socket
 *        @addr: address holder
 *
 *         Fills the @addr pointer with the address which the socket is connected.
 *        Returns the length of the address in bytes or an error code.
 */

int kernel_getpeername(struct socket *sock, struct sockaddr *addr)
{
        return READ_ONCE(sock->ops)->getname(sock, addr, 1);
}
EXPORT_SYMBOL(kernel_getpeername);

/**
 *        kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space)
 *        @sock: socket
 *        @how: connection part
 *
 *        Returns 0 or an error.
 */

int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
{
        return READ_ONCE(sock->ops)->shutdown(sock, how);
}
EXPORT_SYMBOL(kernel_sock_shutdown);

/**
 *        kernel_sock_ip_overhead - returns the IP overhead imposed by a socket
 *        @sk: socket
 *
 *        This routine returns the IP overhead imposed by a socket i.e.
 *        the length of the underlying IP header, depending on whether
 *        this is an IPv4 or IPv6 socket and the length from IP options turned
 *        on at the socket. Assumes that the caller has a lock on the socket.
 */

u32 kernel_sock_ip_overhead(struct sock *sk)
{
        struct inet_sock *inet;
        struct ip_options_rcu *opt;
        u32 overhead = 0;
#if IS_ENABLED(CONFIG_IPV6)
        struct ipv6_pinfo *np;
        struct ipv6_txoptions *optv6 = NULL;
#endif /* IS_ENABLED(CONFIG_IPV6) */

        if (!sk)
                return overhead;

        switch (sk->sk_family) {
        case AF_INET:
                inet = inet_sk(sk);
                overhead += sizeof(struct iphdr);
                opt = rcu_dereference_protected(inet->inet_opt,
                                                sock_owned_by_user(sk));
                if (opt)
                        overhead += opt->opt.optlen;
                return overhead;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                np = inet6_sk(sk);
                overhead += sizeof(struct ipv6hdr);
                if (np)
                        optv6 = rcu_dereference_protected(np->opt,
                                                          sock_owned_by_user(sk));
                if (optv6)
                        overhead += (optv6->opt_flen + optv6->opt_nflen);
                return overhead;
#endif /* IS_ENABLED(CONFIG_IPV6) */
        default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */
                return overhead;
        }
}
EXPORT_SYMBOL(kernel_sock_ip_overhead);


































  317 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/fault-inject.h>
#include <linux/fault-inject-usercopy.h>

/*
 * Fault-injection state for user-copy failures. The single fault_attr
 * starts from FAULT_ATTR_INITIALIZER; elsewhere in this file it is
 * configured from the "fail_usercopy=" boot parameter, optionally exposed
 * through debugfs, and consulted by should_fail_usercopy().
 */
static struct {
        struct fault_attr attr;
} fail_usercopy = {
        .attr = FAULT_ATTR_INITIALIZER,
};

/*
 * Boot-parameter handler: hand the option string to setup_fault_attr()
 * so it can configure the fail_usercopy attributes, and pass its result
 * back unchanged.
 */
static int __init setup_fail_usercopy(char *str)
{
        struct fault_attr *attr = &fail_usercopy.attr;

        return setup_fault_attr(attr, str);
}
__setup("fail_usercopy=", setup_fail_usercopy);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Register the "fail_usercopy" fault attributes with debugfs (no parent
 * directory given). Returns 0 on success, or the error encoded in the
 * returned pointer.
 */
static int __init fail_usercopy_debugfs(void)
{
        return PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_usercopy", NULL,
                                                         &fail_usercopy.attr));
}

late_initcall(fail_usercopy_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

/*
 * Ask the fault-injection core whether a failure should be injected,
 * using the fail_usercopy attributes and a size argument of 1.
 */
bool should_fail_usercopy(void)
{
        struct fault_attr *attr = &fail_usercopy.attr;

        return should_fail(attr, 1);
}
EXPORT_SYMBOL_GPL(should_fail_usercopy);





















































































































































































































   22 







   10 
    8 
   10 
   24 
   10 
    6 







   25 
   15 
   15 
   15 
   14 
   14 





   25 

   25 

   25 

   25 
   15 
















































































































































































   32 


   32 

   32 












   32 















   15 

   15 












    6 













    6 


    5 


    6 
    4 


    6 

    6 






   16 







   52 


   53 





   20 





   31 




   25 






   25 

   25 


   14 
   25 













   28 

   28 
   28 




























































































































   16 


















    2 




































   15 









   15 

   14 
   15 

   15 


    6 
    6 
    6 







    6 



    6 





   15 




   15 
   15 


    8 


    5 
    9 

   15 
    6 








   15 










   10 





    9 










    8 












   19 

    5 
   19 
    5 

    1 
    4 

    1 



   13 
    1 


   14 
   13 


    4 



    1 

    1 
    4 












    1 






    1 



    1 


    1 

    1 
    1 
    1 

    1 
    1 
    1 




    1 


    1 









    1 







    1 
    1 




    1 

    1 
















    5 






    1 





    1 


    1 


    1 
    1 
    1 


    1 



    1 







    1 



    1 
    1 
















    1 


    1 



    1 




    1 



   11 




    4 
    8 



    6 





    8 
    8 

    2 

    1 


    5 
    2 



    6 




    2 








    1 
    1 







    1 












    8 
    3 



    9 


   12 
    4 


    4 



    4 


    4 


    1 
    4 






    1 


    4 
    4 






    4 



    4 


    4 

    1 














    1 

    1 

    1 


   15 






   15 

   15 
   15 

    3 


   12 








    2 

   12 



    9 










    2 

    9 
    3 

    2 









   15 
   12 



   15 






   15 
   15 


   15 

   15 
   14 





    3 





   15 


   15 





   14 


   15 































































































































    3 







   16 






   16 







    3 
   11 
   16 

   12 
   12 




   12 
   16 

    2 








   14 
   16 












    3 


















    3 




   12 



   45 
















   36 







   28 



   27 



   27 
   25 


   33 
   21 















   18 
   15 

   13 
    6 
    6 

    6 




   15 
   15 
   13 

    6 



   13 
   18 




   18 





   18 









   15 


   15 

   18 








    3 

   10 

   16 



   16 
   16 



   16 
   16 

    3 
   16 
   16 
   14 

   18 
   14 









    6 
    7 

    7 
    6 

    5 
    4 
    2 
    4 



    6 





    7 
    1 






    7 



    2 

    2 

    1 










    2 


    2 



    2 

    1 







    7 
    3 
    1 

    1 











    7 


    2 









    7 
    5 
    7 





    7 
    4 
    1 
    1 
    1 












    4 

    3 


    4 









    3 
    3 
    3 

















































    3 





    3 




    3 
    3 
    1 

    3 
    3 











    3 




    3 

    2 


















































    2 






    2 

    2 




    2 
    2 


    2 


    2 



    1 

    2 

    2 





    2 
    2 
    2 



    2 









    1 





    1 











   25 




   25 
   25 
   25 
   17 


   20 
   25 







   15 




   15 

   15 
   15 





   11 
   11 
    6 
    8 

   15 

   15 





    3 














   15 


   15 













   15 


    3 

   15 
   15 




   30 









   30 

   30 







    2 

    2 
    2 
    2 










    3 


   13 


   13 


   12 


   12 

    2 





   11 






   20 














   13 
   22 


   11 
   13 






















   28 
   12 
   28 
    3 

   28 


   28 
   28 



   26 
   23 

   17 










   24 


   28 
   13 

   27 

   22 


   28 
   11 




   27 
    3 

   28 

   24 
   28 

   28 

















































































    6 
   15 


   15 



   16 






   16 






   16 



   16 

   16 
   16 



    6 






    5 
    6 
    6 


    6 


    6 
    6 












    1 





    1 

    1 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   17 





   18 


   18 





    3 





    3 


    3 







   29 









   12 

   11 






   16 

   15 






   29 


   28 

   27 
   20 
   19 

   20 
   28 

   28 
    1 


    9 



    8 
    6 
    5 



    1 
    4 
    3 

































   20 
   12 
   10 


    1 




    1 





    1 






   20 















   34 
   30 
    3 




   17 
    5 
    3 
    4 

    3 
    2 
    1 

    1 













   34 








   34 

    4 

   29 
   16 
   34 





   21 
   20 
   20 




   20 
   20 




   10 









   22 
    2 
   28 








    2 

    2 

    2 





   20 


   20 
    1 


   12 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xfrm_policy.c
 *
 * Changes:
 *        Mitsuru KANDA @USAGI
 *         Kazunori MIYAZAWA @USAGI
 *         Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *                 IPv6 support
 *         Kazunori MIYAZAWA @USAGI
 *         YOSHIFUJI Hideaki
 *                 Split up af-specific portion
 *        Derek Atkins <derek@ihtfp.com>                Add the post_input processor
 *
 */

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
#include <linux/rhashtable.h>
#include <linux/if_tunnel.h>
#include <linux/icmp.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/gre.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif
#ifdef CONFIG_XFRM_ESPINTCP
#include <net/espintcp.h>
#endif
#include <net/inet_dscp.h>

#include "xfrm_hash.h"

/* Lower and upper bound for a queue timeout, expressed in jiffies
 * (HZ/10 and 60*HZ), plus a maximum queue length.
 * NOTE(review): the names suggest these govern the per-policy hold queue;
 * the users are outside this chunk -- confirm there.
 */
#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
#define XFRM_MAX_QUEUE_LEN        100

/* Pairs an original dst entry with a flags byte.
 * NOTE(review): the users of this struct are outside this chunk -- confirm
 * the meaning of @flags at the use sites.
 */
struct xfrm_flo {
        struct dst_entry *dst_orig;
        u8 flags;
};

/* Per-family prefix length thresholds: an address whose prefix length is
 * smaller than this is handled like a wildcard address (see
 * xfrm_pol_inexact_addr_use_any_list()) and is stored in lists, not trees.
 */
#define INEXACT_PREFIXLEN_IPV4        16
#define INEXACT_PREFIXLEN_IPV6        48

/* One node of an inexact-policy search tree, keyed by address/prefixlen. */
struct xfrm_pol_inexact_node {
        /* linkage into the rb tree this node is stored in */
        struct rb_node node;
        union {
                /* address this node is keyed by (compared up to prefixlen) */
                xfrm_address_t addr;
                /* used when the node is released via kfree_rcu() */
                struct rcu_head rcu;
        };
        /* number of leading bits of addr that are significant */
        u8 prefixlen;

        /* nested tree below this node (saddr tree under a daddr node) */
        struct rb_root root;

        /* the policies matching this node, can be empty list */
        struct hlist_head hhead;
};

/* xfrm inexact policy search tree:
 * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
 *  |
 * +---- root_d: sorted by daddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 +- root: sorted by saddr/prefix
 * |                 |              |
 * |                 |         xfrm_pol_inexact_node
 * |                 |              |
 * |                 |              + root: unused
 * |                 |              |
 * |                 |              + hhead: saddr:daddr policies
 * |                 |
 * |                 +- coarse policies and all any:daddr policies
 * |
 * +---- root_s: sorted by saddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 + root: unused
 * |                 |
 * |                 + hhead: saddr:any policies
 * |
 * +---- coarse policies and all any:any policies
 *
 * Lookups return four candidate lists:
 * 1. any:any list from top-level xfrm_pol_inexact_bin
 * 2. any:daddr list from daddr tree
 * 3. saddr:daddr list from 2nd level daddr tree
 * 4. saddr:any list from saddr tree
 *
 * This result set then needs to be searched for the policy with
 * the lowest priority.  If two candidates have the same priority, the
 * struct xfrm_policy pos member with the lower number is used.
 *
 * This replicates previous single-list-search algorithm which would
 * return first matching policy in the (ordered-by-priority) list.
 */

/* Lookup key of an inexact bin: network namespace, interface id, address
 * family, policy direction and policy type.
 */
struct xfrm_pol_inexact_key {
        possible_net_t net;
        u32 if_id;
        u16 family;
        u8 dir, type;
};

/* Container for all inexact policies sharing one xfrm_pol_inexact_key.
 * Bins are stored in the xfrm_policy_inexact_table rhashtable.
 */
struct xfrm_pol_inexact_bin {
        /* hash key of this bin */
        struct xfrm_pol_inexact_key k;
        /* rhashtable linkage */
        struct rhash_head head;
        /* list containing '*:*' policies */
        struct hlist_head hhead;

        /* sequence counter, associated with the netns xfrm_policy_lock */
        seqcount_spinlock_t count;
        /* tree sorted by daddr/prefix */
        struct rb_root root_d;

        /* tree sorted by saddr/prefix */
        struct rb_root root_s;

        /* slow path below */
        /* linkage on the per-netns net->xfrm.inexact_bins list */
        struct list_head inexact_bins;
        /* NOTE(review): presumably used for RCU-deferred freeing -- confirm */
        struct rcu_head rcu;
};

/* Indexes into xfrm_pol_inexact_candidates.res[].
 * NOTE(review): judging by the names, the slots correspond to the four
 * candidate lists described in the search-tree comment above (saddr:daddr,
 * saddr:any, any:daddr, any:any) -- confirm against the code filling res[].
 */
enum xfrm_pol_inexact_candidate_type {
        XFRM_POL_CAND_BOTH,
        XFRM_POL_CAND_SADDR,
        XFRM_POL_CAND_DADDR,
        XFRM_POL_CAND_ANY,

        /* number of candidate slots, not a slot itself */
        XFRM_POL_CAND_MAX,
};

/* Result of an inexact lookup: one policy list head pointer per
 * enum xfrm_pol_inexact_candidate_type value.
 */
struct xfrm_pol_inexact_candidates {
        struct hlist_head *res[XFRM_POL_CAND_MAX];
};

/* Aggregate of flow dissector key structures: basic, control, addresses
 * (IPv4 or IPv6), ip, icmp, ports and a GRE key id.
 * NOTE(review): presumably the dissection target used together with
 * xfrm_session_dissector -- confirm at the use site.
 */
struct xfrm_flow_keys {
        struct flow_dissector_key_basic basic;
        struct flow_dissector_key_control control;
        union {
                struct flow_dissector_key_ipv4_addrs ipv4;
                struct flow_dissector_key_ipv6_addrs ipv6;
        } addrs;
        struct flow_dissector_key_ip ip;
        struct flow_dissector_key_icmp icmp;
        struct flow_dissector_key_ports ports;
        struct flow_dissector_key_keyid gre;
};

/* Flow dissector instance; only the storage is defined here. */
static struct flow_dissector xfrm_session_dissector __ro_after_init;

/* RCU-managed pointer to the xfrm interface callbacks, read through
 * xfrm_if_get_cb().
 * NOTE(review): xfrm_if_cb_lock presumably serialises updates of the
 * pointer -- confirm at the register/unregister sites.
 */
static DEFINE_SPINLOCK(xfrm_if_cb_lock);
static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;

/* Per-address-family policy operations, indexed by family (0..AF_INET6) and
 * read under RCU by xfrm_policy_get_afinfo().
 * NOTE(review): xfrm_policy_afinfo_lock presumably serialises updates of
 * the array -- confirm at the register/unregister sites.
 */
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
                                                __read_mostly;

/* Slab cache pointer; NOTE(review): presumably for xfrm dst entries. */
static struct kmem_cache *xfrm_dst_cache __ro_after_init;

/* Hash table of xfrm_pol_inexact_bin objects and its parameters. */
static struct rhashtable xfrm_policy_inexact_table;
static const struct rhashtable_params xfrm_pol_inexact_params;

/* Forward declarations of static helpers that are used before their
 * definitions later in this file.
 */
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
static void xfrm_policy_queue_process(struct timer_list *t);

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
                           u32 if_id);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net,
                               u8 type, u16 family, u8 dir, u32 if_id);
static struct xfrm_policy *
xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
                        bool excl);

static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
                                    struct xfrm_pol_inexact_bin *b,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr);

/* Try to take a reference on @policy.
 *
 * Returns true if the reference count was non-zero and has been incremented,
 * false if it had already dropped to zero.
 */
static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
{
        bool held = refcount_inc_not_zero(&policy->refcnt);

        return held;
}

static inline bool
__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
        const struct flowi4 *fl4 = &fl->u.ip4;

        return  addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
                addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
                !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
                !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
                (fl4->flowi4_proto == sel->proto || !sel->proto) &&
                (fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
}

static inline bool
__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
        const struct flowi6 *fl6 = &fl->u.ip6;

        return  addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
                addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
                !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
                !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
                (fl6->flowi6_proto == sel->proto || !sel->proto) &&
                (fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
}

/* Match flow @fl against selector @sel for the given address @family.
 *
 * Dispatches to the IPv4 or IPv6 matcher; any other family never matches.
 */
bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
                         unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_selector_match(sel, fl);
        if (family == AF_INET6)
                return __xfrm6_selector_match(sel, fl);

        return false;
}

/* Look up the per-family policy operations for @family.
 *
 * On success the afinfo pointer is returned with rcu_read_lock() still
 * held, and the caller must issue the matching rcu_read_unlock(). On
 * failure (family out of range or nothing registered) NULL is returned and
 * no RCU read lock is held.
 */
static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
        const struct xfrm_policy_afinfo *afinfo;

        if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
                return NULL;

        rcu_read_lock();
        afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
        if (likely(afinfo))
                return afinfo;

        rcu_read_unlock();
        return NULL;
}

/* Fetch the xfrm interface callbacks pointer.
 * Must be called with rcu_read_lock() held.
 */
static const struct xfrm_if_cb *xfrm_if_get_cb(void)
{
        const struct xfrm_if_cb *cb = rcu_dereference(xfrm_if_cb);

        return cb;
}

/* Perform a dst lookup through the per-family dst_lookup operation.
 *
 * Returns whatever the family's dst_lookup() returns, or
 * ERR_PTR(-EAFNOSUPPORT) when no operations are available for @family.
 */
struct dst_entry *__xfrm_dst_lookup(int family,
                                    const struct xfrm_dst_lookup_params *params)
{
        const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct dst_entry *result;

        if (unlikely(!afinfo))
                return ERR_PTR(-EAFNOSUPPORT);

        /* afinfo came back with rcu_read_lock() held; drop it after use. */
        result = afinfo->dst_lookup(params);
        rcu_read_unlock();

        return result;
}
EXPORT_SYMBOL(__xfrm_dst_lookup);

/* Look up the dst entry for state @x.
 *
 * The lookup normally uses the state's own addresses (x->props.saddr and
 * x->id.daddr). If the state type carries XFRM_TYPE_LOCAL_COADDR or
 * XFRM_TYPE_REMOTE_COADDR, x->coaddr replaces one side and the other side
 * is taken from @prev_daddr / @prev_saddr.
 *
 * @dscp, @oif and @mark are passed through to the lookup. The lookup
 * protocol is x->id.proto, unless the state uses UDP or TCP encapsulation,
 * in which case the encapsulation protocol and ports are used.
 *
 * On success, @prev_saddr and @prev_daddr are updated to the addresses
 * used for this lookup. Returns the result of __xfrm_dst_lookup(), which
 * may be an ERR_PTR().
 */
static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
                                                dscp_t dscp, int oif,
                                                xfrm_address_t *prev_saddr,
                                                xfrm_address_t *prev_daddr,
                                                int family, u32 mark)
{
        struct xfrm_dst_lookup_params params;
        struct net *net = xs_net(x);
        xfrm_address_t *saddr = &x->props.saddr;
        xfrm_address_t *daddr = &x->id.daddr;
        struct dst_entry *dst;

        if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
                saddr = x->coaddr;
                daddr = prev_daddr;
        }
        if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
                saddr = prev_saddr;
                daddr = x->coaddr;
        }

        /* Start from an all-zero parameter block: ->uli is only filled in
         * for the encapsulation cases below, and the whole structure is
         * handed to the per-family lookup, so nothing in it may be left as
         * uninitialised stack contents.
         */
        memset(&params, 0, sizeof(params));
        params.net = net;
        params.saddr = saddr;
        params.daddr = daddr;
        params.dscp = dscp;
        params.oif = oif;
        params.mark = mark;
        params.ipproto = x->id.proto;
        if (x->encap) {
                switch (x->encap->encap_type) {
                case UDP_ENCAP_ESPINUDP:
                        params.ipproto = IPPROTO_UDP;
                        params.uli.ports.sport = x->encap->encap_sport;
                        params.uli.ports.dport = x->encap->encap_dport;
                        break;
                case TCP_ENCAP_ESPINTCP:
                        params.ipproto = IPPROTO_TCP;
                        params.uli.ports.sport = x->encap->encap_sport;
                        params.uli.ports.dport = x->encap->encap_dport;
                        break;
                }
        }

        dst = __xfrm_dst_lookup(family, &params);

        if (!IS_ERR(dst)) {
                /* Remember the addresses used, skipping self-copies. */
                if (prev_saddr != saddr)
                        memcpy(prev_saddr, saddr,  sizeof(*prev_saddr));
                if (prev_daddr != daddr)
                        memcpy(prev_daddr, daddr,  sizeof(*prev_daddr));
        }

        return dst;
}

/* Convert @secs to jiffies, clamping the result to
 * MAX_SCHEDULE_TIMEOUT - 1 when the multiplication by HZ would reach or
 * exceed that limit.
 */
static inline unsigned long make_jiffies(long secs)
{
        const long limit = MAX_SCHEDULE_TIMEOUT - 1;

        if (secs < limit / HZ)
                return secs * HZ;

        return limit;
}

/* Lifetime timer callback of a policy.
 *
 * Evaluates the four configured lifetimes (hard/soft, each counted from
 * add time or from last use time) against the current wall-clock seconds:
 *  - a hard lifetime that has run out deletes the policy and reports the
 *    expiry via km_policy_expired(xp, dir, 1, 0);
 *  - a soft lifetime that has run out reports via
 *    km_policy_expired(xp, dir, 0, 0) and schedules a re-check after
 *    XFRM_KM_TIMEOUT;
 *  - otherwise the timer is re-armed for the nearest remaining timeout.
 *
 * A dead policy (walk.dead set) is left alone. Every exit path ends with
 * xfrm_pol_put(xp).
 */
static void xfrm_policy_timer(struct timer_list *t)
{
        struct xfrm_policy *xp = timer_container_of(xp, t, timer);
        time64_t now = ktime_get_real_seconds();
        /* smallest remaining timeout in seconds; TIME64_MAX means "none" */
        time64_t next = TIME64_MAX;
        int warn = 0;
        int dir;

        read_lock(&xp->lock);

        if (unlikely(xp->walk.dead))
                goto out;

        dir = xfrm_policy_id2dir(xp->index);

        /* hard limit, counted from the time the policy was added */
        if (xp->lft.hard_add_expires_seconds) {
                time64_t tmo = xp->lft.hard_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        /* hard limit, counted from last use (add time if never used) */
        if (xp->lft.hard_use_expires_seconds) {
                time64_t tmo = xp->lft.hard_use_expires_seconds +
                        (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        /* soft limit, counted from the time the policy was added */
        if (xp->lft.soft_add_expires_seconds) {
                time64_t tmo = xp->lft.soft_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }
        /* soft limit, counted from last use (add time if never used) */
        if (xp->lft.soft_use_expires_seconds) {
                time64_t tmo = xp->lft.soft_use_expires_seconds +
                        (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }

        if (warn)
                km_policy_expired(xp, dir, 0, 0);
        /* Re-arm; take a reference when mod_timer() reports that the timer
         * was not already pending.
         */
        if (next != TIME64_MAX &&
            !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
                xfrm_pol_hold(xp);

out:
        read_unlock(&xp->lock);
        xfrm_pol_put(xp);
        return;

expired:
        /* Hard expiry: notify only if the delete call returned 0. */
        read_unlock(&xp->lock);
        if (!xfrm_policy_delete(xp, dir))
                km_policy_expired(xp, dir, 1, 0);
        xfrm_pol_put(xp);
}

/* Allocate a zeroed xfrm_policy for @net using allocation flags @gfp.
 *
 * The new policy has its list/hash linkage, lock, hold queue and both
 * timers initialised and starts with a reference count of one. Returns
 * NULL if the allocation fails.
 *
 * Not used in this file; it is meant for SPD users such as pfkeyv2.
 */

struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
{
        struct xfrm_policy *pol = kzalloc(sizeof(*pol), gfp);

        if (!pol)
                return NULL;

        write_pnet(&pol->xp_net, net);
        INIT_LIST_HEAD(&pol->walk.all);
        INIT_HLIST_HEAD(&pol->state_cache_list);
        INIT_HLIST_NODE(&pol->bydst);
        INIT_HLIST_NODE(&pol->byidx);
        rwlock_init(&pol->lock);
        refcount_set(&pol->refcnt, 1);
        skb_queue_head_init(&pol->polq.hold_queue);
        timer_setup(&pol->timer, xfrm_policy_timer, 0);
        timer_setup(&pol->polq.hold_timer, xfrm_policy_queue_process, 0);

        return pol;
}
EXPORT_SYMBOL(xfrm_policy_alloc);

/* RCU callback: release the security context of the policy embedding
 * @head and free the policy itself.
 */
static void xfrm_policy_destroy_rcu(struct rcu_head *head)
{
        struct xfrm_policy *pol;

        pol = container_of(head, struct xfrm_policy, rcu);
        security_xfrm_policy_free(pol->security);
        kfree(pol);
}

/* Destroy xfrm_policy: descendant resources must already have been
 * released. The policy must be marked dead and neither of its timers may
 * still be pending; the memory is freed from an RCU callback.
 */

void xfrm_policy_destroy(struct xfrm_policy *policy)
{
        BUG_ON(!policy->walk.dead);

        /* A timer that is still pending here is a bug. */
        if (timer_delete(&policy->timer))
                BUG();
        if (timer_delete(&policy->polq.hold_timer))
                BUG();

        xfrm_dev_policy_free(policy);
        call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);

/* Rule must be locked. Release descendant resources and mark the
 * entry dead. The rule must already be unlinked from the lists.
 */

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
        struct net *net = xp_net(policy);
        struct xfrm_state *x;

        xfrm_dev_policy_delete(policy);

        /* Mark the policy dead under its own lock. */
        write_lock_bh(&policy->lock);
        policy->walk.dead = 1;
        write_unlock_bh(&policy->lock);

        atomic_inc(&policy->genid);

        /* Stop the hold-queue timer; if it was still pending, drop a
         * reference for it. Then discard any packets left on the queue.
         */
        if (timer_delete(&policy->polq.hold_timer))
                xfrm_pol_put(policy);
        skb_queue_purge(&policy->polq.hold_queue);

        /* Same for the lifetime timer. */
        if (timer_delete(&policy->timer))
                xfrm_pol_put(policy);

        /* XXX: Flush state cache */
        /* Unlink every state from this policy's state cache list. */
        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        hlist_for_each_entry_rcu(x, &policy->state_cache_list, state_cache) {
                hlist_del_init_rcu(&x->state_cache);
        }
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        /* NOTE(review): presumably drops the reference owned by the caller
         * on behalf of the (now unlinked) policy -- confirm with callers.
         */
        xfrm_pol_put(policy);
}

/* Upper bound for policy hash table growth: a table is only resized while
 * its bucket count (hmask + 1) is below this value. Also reported through
 * xfrm_spd_getinfo().
 */
static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;

/* Hash policy @index using the current by-index hash mask of @net. */
static inline unsigned int idx_hash(struct net *net, u32 index)
{
        unsigned int mask = net->xfrm.policy_idx_hmask;

        return __idx_hash(index, mask);
}

/* calculate policy hash thresholds */
/* Fetch the policy hash thresholds configured for @family in direction
 * @dir: *@dbits and *@sbits receive the destination and source threshold
 * bits. Both are set to 0 for a family other than AF_INET / AF_INET6.
 */
static void __get_hash_thresh(struct net *net,
                              unsigned short family, int dir,
                              u8 *dbits, u8 *sbits)
{
        if (family == AF_INET) {
                *dbits = net->xfrm.policy_bydst[dir].dbits4;
                *sbits = net->xfrm.policy_bydst[dir].sbits4;
                return;
        }

        if (family == AF_INET6) {
                *dbits = net->xfrm.policy_bydst[dir].dbits6;
                *sbits = net->xfrm.policy_bydst[dir].sbits6;
                return;
        }

        *dbits = 0;
        *sbits = 0;
}

static struct hlist_head *policy_hash_bysel(struct net *net,
                                            const struct xfrm_selector *sel,
                                            unsigned short family, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int hash;
        u8 dbits;
        u8 sbits;

        __get_hash_thresh(net, family, dir, &dbits, &sbits);
        hash = __sel_hash(sel, family, hmask, dbits, sbits);

        if (hash == hmask + 1)
                return NULL;

        return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
                     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

static struct hlist_head *policy_hash_direct(struct net *net,
                                             const xfrm_address_t *daddr,
                                             const xfrm_address_t *saddr,
                                             unsigned short family, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int hash;
        u8 dbits;
        u8 sbits;

        __get_hash_thresh(net, family, dir, &dbits, &sbits);
        hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);

        return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
                     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

/* Move every policy on the old bydst chain @list into the new table
 * @ndsttable (hash mask @nhashmask), rehashing by selector addresses.
 *
 * Each pass over @list puts the first remaining policy at the head of its
 * new bucket and then appends, in their original order, the later policies
 * that hash to the same bucket. Policies hashing to a different bucket are
 * skipped and handled by a later pass ("redo") until @list is empty.
 * Policies of type XFRM_DEV_OFFLOAD_PACKET are always added at the head of
 * their bucket.
 */
static void xfrm_dst_hash_transfer(struct net *net,
                                   struct hlist_head *list,
                                   struct hlist_head *ndsttable,
                                   unsigned int nhashmask,
                                   int dir)
{
        /* entry0: last node moved in this pass; h0: the bucket it went to */
        struct hlist_node *tmp, *entry0 = NULL;
        struct xfrm_policy *pol;
        unsigned int h0 = 0;
        u8 dbits;
        u8 sbits;

redo:
        hlist_for_each_entry_safe(pol, tmp, list, bydst) {
                unsigned int h;

                __get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
                h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
                                pol->family, nhashmask, dbits, sbits);
                if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
                        hlist_del_rcu(&pol->bydst);
                        hlist_add_head_rcu(&pol->bydst, ndsttable + h);
                        h0 = h;
                } else {
                        /* different bucket: leave it for a later pass */
                        if (h != h0)
                                continue;
                        hlist_del_rcu(&pol->bydst);
                        hlist_add_behind_rcu(&pol->bydst, entry0);
                }
                entry0 = &pol->bydst;
        }
        if (!hlist_empty(list)) {
                entry0 = NULL;
                goto redo;
        }
}

/* Add every policy found on the old by-index chain @list to the new table
 * @nidxtable, rehashed with @nhashmask. Entries are not unlinked from the
 * old chain first; their linkage is simply overwritten by the insertion.
 */
static void xfrm_idx_hash_transfer(struct hlist_head *list,
                                   struct hlist_head *nidxtable,
                                   unsigned int nhashmask)
{
        struct xfrm_policy *pol;
        struct hlist_node *next;

        hlist_for_each_entry_safe(pol, next, list, byidx) {
                struct hlist_head *bucket;

                bucket = nidxtable + __idx_hash(pol->index, nhashmask);
                hlist_add_head(&pol->byidx, bucket);
        }
}

/* Return the hash mask of a table twice the size of the one described by
 * @old_hmask, i.e. 2 * old_hmask + 1 (computed in unsigned int).
 */
static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
        unsigned int doubled = (old_hmask << 1) | 1U;

        return doubled;
}

/* Double the size of the bydst policy hash table of @net for direction
 * @dir. Does nothing if the new table cannot be allocated.
 *
 * The policies are moved and the new table is published under
 * xfrm_policy_lock, inside a write section of
 * xfrm_policy_hash_generation. The old table is freed only after
 * synchronize_rcu().
 */
static void xfrm_bydst_resize(struct net *net, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int nhashmask = xfrm_new_hash_mask(hmask);
        unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
        struct hlist_head *ndst = xfrm_hash_alloc(nsize);
        struct hlist_head *odst;
        int i;

        if (!ndst)
                return;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

        odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
                                lockdep_is_held(&net->xfrm.xfrm_policy_lock));

        /* rehash every old bucket, last bucket first */
        for (i = hmask; i >= 0; i--)
                xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);

        rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
        net->xfrm.policy_bydst[dir].hmask = nhashmask;

        write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* wait for an RCU grace period before freeing the old table */
        synchronize_rcu();

        xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}

/* Double the size of the by-index policy hash table of @net. Does nothing
 * if the new table cannot be allocated.
 *
 * The policies are re-added to the new table and the table pointer and
 * mask are switched under xfrm_policy_lock; the old table is freed right
 * after the lock is dropped.
 */
static void xfrm_byidx_resize(struct net *net)
{
        unsigned int hmask = net->xfrm.policy_idx_hmask;
        unsigned int nhashmask = xfrm_new_hash_mask(hmask);
        unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
        struct hlist_head *oidx = net->xfrm.policy_byidx;
        struct hlist_head *nidx = xfrm_hash_alloc(nsize);
        int i;

        if (!nidx)
                return;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        /* rehash every old bucket, last bucket first */
        for (i = hmask; i >= 0; i--)
                xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);

        net->xfrm.policy_byidx = nidx;
        net->xfrm.policy_idx_hmask = nhashmask;

        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}

/* Decide whether the bydst table for direction @dir should grow.
 *
 * If @total is non-NULL, the policy count of @dir is added to *@total.
 * Returns 1 when the table is still below xfrm_policy_hashmax buckets and
 * holds more policies than its hash mask, 0 otherwise.
 */
static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int cnt = net->xfrm.policy_count[dir];

        if (total)
                *total += cnt;

        return (hmask + 1) < xfrm_policy_hashmax && cnt > hmask;
}

/* Decide whether the by-index table should grow.
 *
 * Returns 1 when the table is still below xfrm_policy_hashmax buckets and
 * @total exceeds its hash mask, 0 otherwise.
 */
static inline int xfrm_byidx_should_resize(struct net *net, int total)
{
        unsigned int hmask = net->xfrm.policy_idx_hmask;

        return (hmask + 1) < xfrm_policy_hashmax && total > hmask;
}

/* Fill @si with SPD statistics of @net: the policy counts per direction
 * (the second set is read at the direction index offset by
 * XFRM_POLICY_MAX), the current by-index hash mask and the hash size limit.
 */
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
        /* inbound */
        si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
        si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];

        /* outbound */
        si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
        si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];

        /* forward */
        si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
        si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];

        /* hash table information */
        si->spdhcnt = net->xfrm.policy_idx_hmask;
        si->spdhmcnt = xfrm_policy_hashmax;
}
EXPORT_SYMBOL(xfrm_spd_getinfo);

/* Serialises runs of xfrm_hash_resize(). */
static DEFINE_MUTEX(hash_resize_mutex);

/* Work handler: grow the policy hash tables of the netns embedding @work.
 *
 * Each direction's bydst table is resized if needed, then the by-index
 * table is checked against the summed policy count of those directions.
 */
static void xfrm_hash_resize(struct work_struct *work)
{
        struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
        int total = 0;
        int dir;

        mutex_lock(&hash_resize_mutex);

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                if (!xfrm_bydst_should_resize(net, dir, &total))
                        continue;
                xfrm_bydst_resize(net, dir);
        }

        if (xfrm_byidx_should_resize(net, total))
                xfrm_byidx_resize(net);

        mutex_unlock(&hash_resize_mutex);
}

/* Make sure *pol can be inserted into fastbin.
 * Useful to check that later insert requests will be successful
 * (provided xfrm_policy_lock is held throughout).
 *
 * Looks up the inexact bin matching @pol (family, type, if_id, netns) and
 * @dir, creating and registering a new one if none exists. Must be called
 * with xfrm_policy_lock held. Returns the bin, or NULL if the allocation
 * or the hash table insertion failed.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
{
        struct xfrm_pol_inexact_bin *bin, *prev;
        struct xfrm_pol_inexact_key k = {
                .family = pol->family,
                .type = pol->type,
                .dir = dir,
                .if_id = pol->if_id,
        };
        struct net *net = xp_net(pol);

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        write_pnet(&k.net, net);
        /* fast path: the bin already exists */
        bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
                                     xfrm_pol_inexact_params);
        if (bin)
                return bin;

        bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
        if (!bin)
                return NULL;

        bin->k = k;
        INIT_HLIST_HEAD(&bin->hhead);
        bin->root_d = RB_ROOT;
        bin->root_s = RB_ROOT;
        seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock);

        /* NULL result: our bin was inserted. Otherwise @prev is either an
         * already present bin with the same key or an ERR_PTR().
         */
        prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
                                                &bin->k, &bin->head,
                                                xfrm_pol_inexact_params);
        if (!prev) {
                list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
                return bin;
        }

        /* our bin was not inserted; discard it */
        kfree(bin);

        return IS_ERR(prev) ? NULL : prev;
}

/* Decide whether @addr/@prefixlen is to be treated as a wildcard address.
 *
 * True for the "any" address of @family and for prefixes shorter than the
 * per-family INEXACT_PREFIXLEN_* threshold; false otherwise.
 */
static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
                                               int family, u8 prefixlen)
{
        if (xfrm_addr_any(addr, family))
                return true;

        switch (family) {
        case AF_INET6:
                return prefixlen < INEXACT_PREFIXLEN_IPV6;
        case AF_INET:
                return prefixlen < INEXACT_PREFIXLEN_IPV4;
        default:
                return false;
        }
}

/* Return true when both the source and the destination address of
 * @policy's selector count as wildcard addresses according to
 * xfrm_pol_inexact_addr_use_any_list().
 */
static bool
xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
{
        bool src_any, dst_any;

        src_any = xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
                                                     policy->family,
                                                     policy->selector.prefixlen_s);
        dst_any = xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
                                                     policy->family,
                                                     policy->selector.prefixlen_d);

        return src_any && dst_any;
}

/* Set the key of @node: a copy of @addr together with @prefixlen. */
static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
                                       const xfrm_address_t *addr, u8 prefixlen)
{
        node->prefixlen = prefixlen;
        node->addr = *addr;
}

/* Allocate (GFP_ATOMIC) a zeroed tree node keyed by @addr/@prefixlen.
 * Returns NULL if the allocation fails.
 */
static struct xfrm_pol_inexact_node *
xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
{
        struct xfrm_pol_inexact_node *n = kzalloc(sizeof(*n), GFP_ATOMIC);

        if (!n)
                return NULL;

        xfrm_pol_inexact_node_init(n, addr, prefixlen);
        return n;
}

/* Compare the leading @prefixlen bits of addresses @a and @b.
 *
 * Returns 0 when the prefixes are equal, a negative value when @a sorts
 * before @b and a positive value when it sorts after. For IPv6 a
 * difference within the whole 32-bit words of the prefix is reported with
 * memcmp()'s return value. Families other than AF_INET / AF_INET6 always
 * compare equal.
 */
static int xfrm_policy_addr_delta(const xfrm_address_t *a,
                                  const xfrm_address_t *b,
                                  u8 prefixlen, u16 family)
{
        unsigned int full_words, rem_bits;
        u32 mask, va, vb;
        int cmp;

        if (family == AF_INET) {
                if (!prefixlen)
                        return 0;

                mask = ~0U << (32 - prefixlen);
                va = ntohl(a->a4) & mask;
                vb = ntohl(b->a4) & mask;
                if (va < vb)
                        return -1;
                return va > vb ? 1 : 0;
        }

        if (family != AF_INET6)
                return 0;

        /* whole 32-bit words covered by the prefix, then the leftover bits */
        full_words = prefixlen >> 5;
        rem_bits = prefixlen & 0x1f;

        if (full_words) {
                cmp = memcmp(a->a6, b->a6, full_words << 2);
                if (cmp)
                        return cmp;
        }

        if (!rem_bits)
                return 0;

        mask = ~0U << (32 - rem_bits);
        va = ntohl(a->a6[full_words]) & mask;
        vb = ntohl(b->a6[full_words]) & mask;
        if (va < vb)
                return -1;
        return va > vb ? 1 : 0;
}

/* Re-insert all policies flagged with bydst_reinsert into the policy list
 * of tree node @n.
 *
 * Walks the per-netns policy_all list in reverse, skipping dead and
 * unflagged policies. Each flagged policy has its flag cleared and is
 * linked into n->hhead, which is kept ordered by ascending priority and,
 * for equal priority, ascending pos. Policies of type
 * XFRM_DEV_OFFLOAD_PACKET are always added at the head. The remainder of
 * the loop body consists of sanity checks that only emit warnings.
 */
static void xfrm_policy_inexact_list_reinsert(struct net *net,
                                              struct xfrm_pol_inexact_node *n,
                                              u16 family)
{
        unsigned int matched_s, matched_d;
        struct xfrm_policy *policy, *p;

        matched_s = 0;
        matched_d = 0;

        list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
                /* last list entry that must stay in front of @policy */
                struct hlist_node *newpos = NULL;
                bool matches_s, matches_d;

                if (policy->walk.dead || !policy->bydst_reinsert)
                        continue;

                WARN_ON_ONCE(policy->family != family);

                policy->bydst_reinsert = false;
                hlist_for_each_entry(p, &n->hhead, bydst) {
                        if (policy->priority > p->priority)
                                newpos = &p->bydst;
                        else if (policy->priority == p->priority &&
                                 policy->pos > p->pos)
                                newpos = &p->bydst;
                        else
                                break;
                }

                if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                        hlist_add_behind_rcu(&policy->bydst, newpos);
                else
                        hlist_add_head_rcu(&policy->bydst, &n->hhead);

                /* paranoia checks follow.
                 * Check that the reinserted policy matches at least
                 * saddr or daddr for current node prefix.
                 *
                 * Matching both is fine, matching saddr in one policy
                 * (but not daddr) and then matching only daddr in another
                 * is a bug.
                 */
                matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
                                                   &n->addr,
                                                   n->prefixlen,
                                                   family) == 0;
                matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
                                                   &n->addr,
                                                   n->prefixlen,
                                                   family) == 0;
                if (matches_s && matches_d)
                        continue;

                WARN_ON_ONCE(!matches_s && !matches_d);
                if (matches_s)
                        matched_s++;
                if (matches_d)
                        matched_d++;
                WARN_ON_ONCE(matched_s && matched_d);
        }
}

/* Insert the detached node @n into the rb-tree @new.
 *
 * The tree is walked comparing @n's address with each visited node over the
 * shorter of the two prefix lengths.  If no visited node compares equal, @n
 * is linked in as a new leaf.  If one does, every policy on @n's list is
 * flagged with bydst_reinsert and unlinked, the existing node takes the
 * shorter prefix length, xfrm_policy_inexact_list_reinsert() is called for
 * the existing node, and @n is released with kfree_rcu().  When the two
 * prefix lengths differed, the existing node is additionally erased from
 * @new and the walk restarts with it as the node to insert.
 */
static void xfrm_policy_inexact_node_reinsert(struct net *net,
                                              struct xfrm_pol_inexact_node *n,
                                              struct rb_root *new,
                                              u16 family)
{
        struct xfrm_pol_inexact_node *node;
        struct rb_node **p, *parent;

        /* we should not have another subtree here */
        WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
restart:
        parent = NULL;
        p = &new->rb_node;
        while (*p) {
                u8 prefixlen;
                int delta;

                parent = *p;
                node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

                /* compare only the bits both prefixes have in common */
                prefixlen = min(node->prefixlen, n->prefixlen);

                delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
                                               prefixlen, family);
                if (delta < 0) {
                        p = &parent->rb_left;
                } else if (delta > 0) {
                        p = &parent->rb_right;
                } else {
                        /* sample before node->prefixlen is overwritten below */
                        bool same_prefixlen = node->prefixlen == n->prefixlen;
                        struct xfrm_policy *tmp;

                        /* flag each policy on n for reinsertion and take it
                         * off n's list
                         */
                        hlist_for_each_entry(tmp, &n->hhead, bydst) {
                                tmp->bydst_reinsert = true;
                                hlist_del_rcu(&tmp->bydst);
                        }

                        node->prefixlen = prefixlen;

                        xfrm_policy_inexact_list_reinsert(net, node, family);

                        /* node kept its prefix length, so it can stay where
                         * it is in the tree; only n has to go
                         */
                        if (same_prefixlen) {
                                kfree_rcu(n, rcu);
                                return;
                        }

                        /* node's prefix length changed: take it out of the
                         * tree (*p is node here) and insert it afresh
                         */
                        rb_erase(*p, new);
                        kfree_rcu(n, rcu);
                        n = node;
                        goto restart;
                }
        }

        /* no equal node found: p is the empty slot, parent the node above it */
        rb_link_node_rcu(&n->node, parent, p);
        rb_insert_color(&n->node, new);
}

/* Merge node v into node n.
 *
 * Every node of v's subtree is erased from v->root and re-inserted below
 * n->root.  The policies on v's own list are flagged with bydst_reinsert,
 * unlinked, and then handed to xfrm_policy_inexact_list_reinsert() for n.
 * v itself is not freed here.
 */
static void xfrm_policy_inexact_node_merge(struct net *net,
                                           struct xfrm_pol_inexact_node *v,
                                           struct xfrm_pol_inexact_node *n,
                                           u16 family)
{
        struct xfrm_policy *pol;
        struct rb_node *first;

        /* The to-be-merged node v may carry a subtree of its own: take it
         * apart one node at a time, always removing the current first node.
         */
        for (first = rb_first(&v->root); first; first = rb_first(&v->root)) {
                struct xfrm_pol_inexact_node *child;

                child = rb_entry(first, struct xfrm_pol_inexact_node, node);
                rb_erase(&child->node, &v->root);
                xfrm_policy_inexact_node_reinsert(net, child, &n->root,
                                                  family);
        }

        /* Detach v's own policies and mark them for reinsertion. */
        hlist_for_each_entry(pol, &v->hhead, bydst) {
                pol->bydst_reinsert = true;
                hlist_del_rcu(&pol->bydst);
        }

        xfrm_policy_inexact_list_reinsert(net, n, family);
}

/* Find or create the node for @addr/@prefixlen in the rb-tree @root.
 *
 * If a visited node compares equal to @addr over that node's own prefix
 * length and @prefixlen is not shorter than the node's, that node is
 * returned unchanged.  Nodes with a longer prefix length that compare equal
 * to @addr over @prefixlen are erased from the tree: the first such node is
 * re-initialised with @addr/@prefixlen and reused as the result, every
 * later one is merged into it and freed.  If nothing could be reused a new
 * node is allocated.
 *
 * Returns the node, or NULL if xfrm_pol_inexact_node_alloc() fails.
 * @dir is not referenced in this function.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_inexact_insert_node(struct net *net,
                                struct rb_root *root,
                                xfrm_address_t *addr,
                                u16 family, u8 prefixlen, u8 dir)
{
        /* node taken out of the tree for reuse with the new, shorter prefix */
        struct xfrm_pol_inexact_node *cached = NULL;
        struct rb_node **p, *parent = NULL;
        struct xfrm_pol_inexact_node *node;

        p = &root->rb_node;
        while (*p) {
                int delta;

                parent = *p;
                node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

                /* compare over the existing node's prefix length */
                delta = xfrm_policy_addr_delta(addr, &node->addr,
                                               node->prefixlen,
                                               family);
                if (delta == 0 && prefixlen >= node->prefixlen) {
                        WARN_ON_ONCE(cached); /* ipsec policies got lost */
                        return node;
                }

                /* choose the descent direction before the subnet check, so a
                 * plain 'continue' below keeps walking down the tree
                 */
                if (delta < 0)
                        p = &parent->rb_left;
                else
                        p = &parent->rb_right;

                if (prefixlen < node->prefixlen) {
                        /* compare again, this time over the new prefix length */
                        delta = xfrm_policy_addr_delta(addr, &node->addr,
                                                       prefixlen,
                                                       family);
                        if (delta)
                                continue;

                        /* This node is a subnet of the new prefix. It needs
                         * to be removed and re-inserted with the smaller
                         * prefix and all nodes that are now also covered
                         * by the reduced prefixlen.
                         */
                        rb_erase(&node->node, root);

                        if (!cached) {
                                xfrm_pol_inexact_node_init(node, addr,
                                                           prefixlen);
                                cached = node;
                        } else {
                                /* This node also falls within the new
                                 * prefixlen. Merge the to-be-reinserted
                                 * node and this one.
                                 */
                                xfrm_policy_inexact_node_merge(net, node,
                                                               cached, family);
                                kfree_rcu(node, rcu);
                        }

                        /* restart */
                        p = &root->rb_node;
                        parent = NULL;
                }
        }

        /* reuse the node pulled out of the tree, else allocate a new one */
        node = cached;
        if (!node) {
                node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
                if (!node)
                        return NULL;
        }

        rb_link_node_rcu(&node->node, parent, p);
        rb_insert_color(&node->node, root);

        return node;
}

/* Free every node of tree @r that holds neither policies nor a subtree.
 *
 * Subtrees are collected first (recursively), so a node whose subtree
 * became empty can be released in the same pass.  With @rm set, a node
 * that is still in use triggers a one-time warning; it is kept either way.
 */
static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
{
        struct rb_node *cur, *next;

        for (cur = rb_first(r); cur; cur = next) {
                struct xfrm_pol_inexact_node *entry;
                bool in_use;

                entry = rb_entry(cur, struct xfrm_pol_inexact_node, node);

                xfrm_policy_inexact_gc_tree(&entry->root, rm);

                /* fetch the successor before the node may be erased */
                next = rb_next(cur);

                in_use = !hlist_empty(&entry->hhead) ||
                         !RB_EMPTY_ROOT(&entry->root);
                if (in_use) {
                        WARN_ON_ONCE(rm);
                        continue;
                }

                rb_erase(&entry->node, r);
                kfree_rcu(entry, rcu);
        }
}

/* Garbage-collect both trees of bin @b and release the bin itself once it
 * holds nothing at all.
 *
 * The tree collection runs inside a b->count write section.  If either
 * tree or the bin's own list is still populated the bin is kept (with a
 * one-time warning when @net_exit is set).  Otherwise the bin is removed
 * from the rhashtable and, if that removal succeeded, unlinked from the
 * inexact_bins list and freed after a grace period.
 */
static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
{
        bool still_used;

        write_seqcount_begin(&b->count);
        xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
        xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
        write_seqcount_end(&b->count);

        still_used = !RB_EMPTY_ROOT(&b->root_d) ||
                     !RB_EMPTY_ROOT(&b->root_s) ||
                     !hlist_empty(&b->hhead);
        if (still_used) {
                WARN_ON_ONCE(net_exit);
                return;
        }

        if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
                                   xfrm_pol_inexact_params) != 0)
                return;

        list_del(&b->inexact_bins);
        kfree_rcu(b, rcu);
}

/* Locked wrapper around __xfrm_policy_inexact_prune_bin(): takes the policy
 * lock of the netns recorded in the bin's key and prunes with net_exit
 * cleared.
 */
static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
{
        struct net *owner;

        /* Look the netns up first: the lock is released through this
         * pointer, not through b.
         */
        owner = read_pnet(&b->k.net);

        spin_lock_bh(&owner->xfrm.xfrm_policy_lock);
        __xfrm_policy_inexact_prune_bin(b, false);
        spin_unlock_bh(&owner->xfrm.xfrm_policy_lock);
}

static void __xfrm_policy_inexact_flush(struct net *net)
{
        struct xfrm_pol_inexact_bin *bin, *t;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
                __xfrm_policy_inexact_prune_bin(bin, false);
}

/* Pick the list inside @bin that @policy has to be linked into, creating
 * tree nodes as needed.  Caller must hold the policy lock.
 *
 * Candidates, in the order they are tried:
 *   - the bin's own list, when the policy qualifies for the any-list;
 *   - a node in the source tree (root_s), when daddr qualifies for the
 *     any-list;
 *   - a node in the destination tree (root_d), when saddr qualifies for
 *     the any-list;
 *   - a source node in the subtree of the destination node otherwise.
 *
 * Each tree update runs inside its own bin->count write section.
 * Returns the list head, or NULL if a node could not be obtained.
 */
static struct hlist_head *
xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
                                struct xfrm_policy *policy, u8 dir)
{
        struct xfrm_selector *sel = &policy->selector;
        struct xfrm_pol_inexact_node *dnode, *snode;
        struct net *net = xp_net(policy);

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        if (xfrm_policy_inexact_insert_use_any_list(policy))
                return &bin->hhead;

        if (xfrm_pol_inexact_addr_use_any_list(&sel->daddr, policy->family,
                                               sel->prefixlen_d)) {
                /* daddr is wildcard: key on saddr only */
                write_seqcount_begin(&bin->count);
                snode = xfrm_policy_inexact_insert_node(net, &bin->root_s,
                                                        &sel->saddr,
                                                        policy->family,
                                                        sel->prefixlen_s,
                                                        dir);
                write_seqcount_end(&bin->count);

                return snode ? &snode->hhead : NULL;
        }

        /* daddr is fixed */
        write_seqcount_begin(&bin->count);
        dnode = xfrm_policy_inexact_insert_node(net, &bin->root_d,
                                                &sel->daddr, policy->family,
                                                sel->prefixlen_d, dir);
        write_seqcount_end(&bin->count);
        if (!dnode)
                return NULL;

        /* saddr is wildcard */
        if (xfrm_pol_inexact_addr_use_any_list(&sel->saddr, policy->family,
                                               sel->prefixlen_s))
                return &dnode->hhead;

        /* both addresses are fixed: descend into the daddr node's subtree */
        write_seqcount_begin(&bin->count);
        snode = xfrm_policy_inexact_insert_node(net, &dnode->root,
                                                &sel->saddr, policy->family,
                                                sel->prefixlen_s, dir);
        write_seqcount_end(&bin->count);

        return snode ? &snode->hhead : NULL;
}

/* Insert @policy into the inexact bin matching it.  Caller must hold the
 * policy lock.
 *
 * Returns NULL when the policy was added without replacing anything, the
 * policy reported by xfrm_policy_insert_list() when one is to be replaced,
 * ERR_PTR(-EEXIST) when such a policy exists and @excl is set, or
 * ERR_PTR(-ENOMEM) when the bin or chain could not be obtained.  The bin is
 * pruned on every path except the plain-success and bin-allocation-failure
 * ones.
 */
static struct xfrm_policy *
xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
{
        struct xfrm_pol_inexact_bin *bin;
        struct hlist_head *chain;
        struct xfrm_policy *old;
        struct net *net;

        bin = xfrm_policy_inexact_alloc_bin(policy, dir);
        if (!bin)
                return ERR_PTR(-ENOMEM);

        net = xp_net(policy);
        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
        if (!chain) {
                old = ERR_PTR(-ENOMEM);
                goto prune;
        }

        old = xfrm_policy_insert_list(chain, policy, excl);
        if (!old)
                return NULL;

        /* something matched: in exclusive mode that is always an error */
        if (excl)
                old = ERR_PTR(-EEXIST);
prune:
        __xfrm_policy_inexact_prune_bin(bin, false);
        return old;
}

static bool xfrm_policy_is_dead_or_sk(const struct xfrm_policy *policy)
{
        int dir;

        if (policy->walk.dead)
                return true;

        dir = xfrm_policy_id2dir(policy->index);
        return dir >= XFRM_POLICY_MAX;
}

/* Work handler for net->xfrm.policy_hthresh.work.
 *
 * Applies the selector prefix-length thresholds stored in
 * net->xfrm.policy_hthresh and re-links every policy that is neither dead
 * nor a socket policy.  The steps, all under hash_resize_mutex:
 *   1. read the four threshold values consistently under the hthresh
 *      seqlock;
 *   2. under the policy lock and inside a hash-generation write section,
 *      make sure an inexact bin and chain can be obtained for the policies
 *      selected by the prefix-length check below; on failure jump to the
 *      exit path without having changed thresholds or moved any policy;
 *   3. store the thresholds into policy_bydst[] for every direction;
 *   4. unlink and re-insert each policy, either into the chain returned by
 *      policy_hash_bysel() or, when that returns no chain, through
 *      xfrm_policy_inexact_insert();
 *   5. prune the inexact bins.
 */
static void xfrm_hash_rebuild(struct work_struct *work)
{
        struct net *net = container_of(work, struct net,
                                       xfrm.policy_hthresh.work);
        struct xfrm_policy *pol;
        struct xfrm_policy *policy;
        struct hlist_head *chain;
        struct hlist_node *newpos;
        int dir;
        unsigned seq;
        u8 lbits4, rbits4, lbits6, rbits6;

        mutex_lock(&hash_resize_mutex);

        /* read selector prefixlen thresholds */
        do {
                seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

                lbits4 = net->xfrm.policy_hthresh.lbits4;
                rbits4 = net->xfrm.policy_hthresh.rbits4;
                lbits6 = net->xfrm.policy_hthresh.lbits6;
                rbits6 = net->xfrm.policy_hthresh.rbits6;
        } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

        /* make sure that we can insert the indirect policies again before
         * we start with destructive action.
         */
        list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
                struct xfrm_pol_inexact_bin *bin;
                u8 dbits, sbits;

                if (xfrm_policy_is_dead_or_sk(policy))
                        continue;

                /* map local/remote thresholds onto dst/src for this
                 * policy's direction and address family
                 */
                dir = xfrm_policy_id2dir(policy->index);
                if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
                        if (policy->family == AF_INET) {
                                dbits = rbits4;
                                sbits = lbits4;
                        } else {
                                dbits = rbits6;
                                sbits = lbits6;
                        }
                } else {
                        if (policy->family == AF_INET) {
                                dbits = lbits4;
                                sbits = rbits4;
                        } else {
                                dbits = lbits6;
                                sbits = rbits6;
                        }
                }

                /* nothing is pre-allocated for a policy with a prefix length
                 * below the new threshold
                 */
                if (policy->selector.prefixlen_d < dbits ||
                    policy->selector.prefixlen_s < sbits)
                        continue;

                bin = xfrm_policy_inexact_alloc_bin(policy, dir);
                if (!bin)
                        goto out_unlock;

                if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
                        goto out_unlock;
        }

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
                        /* dir out => dst = remote, src = local */
                        net->xfrm.policy_bydst[dir].dbits4 = rbits4;
                        net->xfrm.policy_bydst[dir].sbits4 = lbits4;
                        net->xfrm.policy_bydst[dir].dbits6 = rbits6;
                        net->xfrm.policy_bydst[dir].sbits6 = lbits6;
                } else {
                        /* dir in/fwd => dst = local, src = remote */
                        net->xfrm.policy_bydst[dir].dbits4 = lbits4;
                        net->xfrm.policy_bydst[dir].sbits4 = rbits4;
                        net->xfrm.policy_bydst[dir].dbits6 = lbits6;
                        net->xfrm.policy_bydst[dir].sbits6 = rbits6;
                }
        }

        /* re-insert all policies by order of creation */
        list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
                if (xfrm_policy_is_dead_or_sk(policy))
                        continue;

                hlist_del_rcu(&policy->bydst);

                newpos = NULL;
                dir = xfrm_policy_id2dir(policy->index);
                chain = policy_hash_bysel(net, &policy->selector,
                                          policy->family, dir);

                /* no hash chain for this selector: use the inexact path */
                if (!chain) {
                        void *p = xfrm_policy_inexact_insert(policy, dir, 0);

                        WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
                        continue;
                }

                /* find the last entry whose priority value does not exceed
                 * this policy's
                 */
                hlist_for_each_entry(pol, chain, bydst) {
                        if (policy->priority >= pol->priority)
                                newpos = &pol->bydst;
                        else
                                break;
                }
                /* packet-offload policies always go to the chain head */
                if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                        hlist_add_behind_rcu(&policy->bydst, newpos);
                else
                        hlist_add_head_rcu(&policy->bydst, chain);
        }

out_unlock:
        /* reached on success and on allocation failure alike */
        __xfrm_policy_inexact_flush(net);
        write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        mutex_unlock(&hash_resize_mutex);
}

/* Schedule the hash-threshold rebuild work item of @net. */
void xfrm_policy_hash_rebuild(struct net *net)
{
        struct work_struct *rebuild_work = &net->xfrm.policy_hthresh.work;

        schedule_work(rebuild_work);
}
EXPORT_SYMBOL(xfrm_policy_hash_rebuild);

/* Generate a new policy index.  KAME seems to generate them ordered by
 * cost, at the price of a completely unpredictable ordering of rules.
 * This will not pass.
 *
 * A non-zero @index is tried first.  After that, candidates are built from
 * the per-netns generator OR-ed with @dir, the generator advancing by 8 for
 * each candidate.  A candidate of 0 is replaced by 8.  The first candidate
 * not present in the byidx hash is returned.
 */
static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
{
        struct hlist_head *bucket;
        struct xfrm_policy *pol;
        u32 candidate;
        bool in_use;

        do {
                if (index) {
                        /* caller-supplied value: only tried once */
                        candidate = index;
                        index = 0;
                } else {
                        candidate = (net->xfrm.idx_generator | dir);
                        net->xfrm.idx_generator += 8;
                }

                if (candidate == 0)
                        candidate = 8;

                bucket = net->xfrm.policy_byidx + idx_hash(net, candidate);

                in_use = false;
                hlist_for_each_entry(pol, bucket, byidx) {
                        if (pol->index == candidate) {
                                in_use = true;
                                break;
                        }
                }
        } while (in_use);

        return candidate;
}

static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
{
        u32 *p1 = (u32 *) s1;
        u32 *p2 = (u32 *) s2;
        int len = sizeof(struct xfrm_selector) / sizeof(u32);
        int i;

        for (i = 0; i < len; i++) {
                if (p1[i] != p2[i])
                        return 1;
        }

        return 0;
}

static void xfrm_policy_requeue(struct xfrm_policy *old,
                                struct xfrm_policy *new)
{
        struct xfrm_policy_queue *pq = &old->polq;
        struct sk_buff_head list;

        if (skb_queue_empty(&pq->hold_queue))
                return;

        __skb_queue_head_init(&list);

        spin_lock_bh(&pq->hold_queue.lock);
        skb_queue_splice_init(&pq->hold_queue, &list);
        if (timer_delete(&pq->hold_timer))
                xfrm_pol_put(old);
        spin_unlock_bh(&pq->hold_queue.lock);

        pq = &new->polq;

        spin_lock_bh(&pq->hold_queue.lock);
        skb_queue_splice(&list, &pq->hold_queue);
        pq->timeout = XFRM_QUEUE_TMO_MIN;
        if (!mod_timer(&pq->hold_timer, jiffies))
                xfrm_pol_hold(new);
        spin_unlock_bh(&pq->hold_queue.lock);
}

static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark,
                                          struct xfrm_policy *pol)
{
        return mark->v == pol->mark.v && mark->m == pol->mark.m;
}

/* rhashtable hash function over a struct xfrm_pol_inexact_key.
 *
 * type, dir and family are packed into one word; that word, if_id and the
 * hash mix of the key's netns are fed to jhash_3words() with @seed.
 * @len is not used.
 */
static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
{
        const struct xfrm_pol_inexact_key *key = data;
        u32 packed, netmix;

        packed = (key->type << 24) | (key->dir << 16) | key->family;
        netmix = net_hash_mix(read_pnet(&key->net));

        return jhash_3words(packed, key->if_id, netmix, seed);
}

/* rhashtable object hash: hash a bin by the key embedded in it.
 * @len is ignored; 0 is passed on as the key length.
 */
static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
{
        const struct xfrm_pol_inexact_bin *bin = data;
        const struct xfrm_pol_inexact_key *key = &bin->k;

        return xfrm_pol_bin_key(key, 0, seed);
}

/* rhashtable compare function for inexact bins.
 *
 * Returns 0 when the bin at @ptr has the same netns, dir, type, family and
 * if_id as the lookup key in @arg.  A netns mismatch yields -1; any other
 * mismatch yields the XOR of the first differing field pair, checked in the
 * order dir, type, family, if_id.
 */
static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
                            const void *ptr)
{
        const struct xfrm_pol_inexact_key *key = arg->key;
        const struct xfrm_pol_inexact_bin *b = ptr;
        int diff;

        if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
                return -1;

        diff = b->k.dir ^ key->dir;
        if (!diff)
                diff = b->k.type ^ key->type;
        if (!diff)
                diff = b->k.family ^ key->family;
        if (!diff)
                diff = b->k.if_id ^ key->if_id;

        return diff;
}

/* rhashtable parameters for the table of inexact bins. */
static const struct rhashtable_params xfrm_pol_inexact_params = {
        /* hashing of lookup keys and of stored bins */
        .hashfn                        = xfrm_pol_bin_key,
        .obj_hashfn                = xfrm_pol_bin_obj,
        /* key <-> bin comparison */
        .obj_cmpfn                = xfrm_pol_bin_cmp,
        /* where the rhash_head sits inside a bin */
        .head_offset                = offsetof(struct xfrm_pol_inexact_bin, head),
        .automatic_shrinking        = true,
};

/* Link @policy into @chain and report the policy it replaces, if any.
 *
 * An entry on @chain counts as a duplicate of @policy when type, if_id,
 * selector, mark and security context all match.
 *
 * Returns:
 *   ERR_PTR(-EEXIST)  a duplicate exists and @excl is set; @policy has not
 *                     been linked.
 *   the duplicate     a duplicate exists and @excl is not set; @policy has
 *                     been linked.  This function does not unlink the
 *                     duplicate.
 *   NULL              no duplicate; @policy has been linked.
 */
static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
                                                   struct xfrm_policy *policy,
                                                   bool excl)
{
        struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;

        hlist_for_each_entry(pol, chain, bydst) {
                /* The trailing !WARN_ON(delpol) warns if a second duplicate
                 * shows up and makes that entry take the non-duplicate
                 * branch below.
                 */
                if (pol->type == policy->type &&
                    pol->if_id == policy->if_id &&
                    !selector_cmp(&pol->selector, &policy->selector) &&
                    xfrm_policy_mark_match(&policy->mark, pol) &&
                    xfrm_sec_ctx_match(pol->security, policy->security) &&
                    !WARN_ON(delpol)) {
                        if (excl)
                                return ERR_PTR(-EEXIST);
                        delpol = pol;
                        /* insert position may lie further down the chain */
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority) {
                        /* remember the last entry to insert behind */
                        newpos = pol;
                        continue;
                }
                /* past the insert position: stop once the duplicate is known */
                if (delpol)
                        break;
        }

        if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
        else
                /* Packet offload policies enter to the head
                 * to speed-up lookups.
                 */
                hlist_add_head_rcu(&policy->bydst, chain);

        return delpol;
}

/* Insert @policy for direction @dir into its netns' policy database.
 *
 * The policy goes into the chain returned by policy_hash_bysel(), or
 * through the inexact path when no chain is returned.  If the insert
 * reports an existing policy to replace, the new policy takes over its
 * index and held packets, and the old policy is unlinked and killed.
 *
 * Returns 0 on success, or the negative errno carried by the insert helper
 * (the helpers visible here produce -EEXIST and -ENOMEM); on error nothing
 * has been linked.
 */
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
        struct net *net = xp_net(policy);
        struct xfrm_policy *delpol;
        struct hlist_head *chain;

        /* Sanitize mark before store */
        policy->mark.v &= policy->mark.m;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
        if (chain)
                delpol = xfrm_policy_insert_list(chain, policy, excl);
        else
                delpol = xfrm_policy_inexact_insert(policy, dir, excl);

        if (IS_ERR(delpol)) {
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                return PTR_ERR(delpol);
        }

        __xfrm_policy_link(policy, dir);

        /* After previous checking, family can either be AF_INET or AF_INET6 */
        if (policy->family == AF_INET)
                rt_genid_bump_ipv4(net);
        else
                rt_genid_bump_ipv6(net);

        if (delpol) {
                xfrm_policy_requeue(delpol, policy);
                __xfrm_policy_unlink(delpol, dir);
        }
        /* a replacement inherits the replaced policy's index */
        policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
        hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
        policy->curlft.add_time = ktime_get_real_seconds();
        policy->curlft.use_time = 0;
        /* take a reference when mod_timer() reports 0 */
        if (!mod_timer(&policy->timer, jiffies + HZ))
                xfrm_pol_hold(policy);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* the replaced policy is killed only after the lock is dropped */
        if (delpol)
                xfrm_policy_kill(delpol);
        else if (xfrm_bydst_should_resize(net, dir, NULL))
                schedule_work(&net->xfrm.policy_hash_work);

        return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);

/* Return the first policy on @chain whose type, if_id, mark, selector and
 * security context all match the arguments, or NULL when @chain is NULL or
 * holds no such policy.  @dir is not used.
 */
static struct xfrm_policy *
__xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark,
                        u32 if_id, u8 type, int dir, struct xfrm_selector *sel,
                        struct xfrm_sec_ctx *ctx)
{
        struct xfrm_policy *cur;

        if (!chain)
                return NULL;

        hlist_for_each_entry(cur, chain, bydst) {
                if (cur->type != type)
                        continue;
                if (cur->if_id != if_id)
                        continue;
                if (!xfrm_policy_mark_match(mark, cur))
                        continue;
                if (selector_cmp(sel, &cur->selector))
                        continue;
                if (!xfrm_sec_ctx_match(ctx, cur->security))
                        continue;

                return cur;
        }

        return NULL;
}

/* Look up a policy by selector and security context, optionally deleting it.
 *
 * The policy is searched in the chain returned by policy_hash_bysel() or,
 * when there is no such chain, among the inexact candidate lists of the
 * matching bin; of several inexact matches the one with the lowest pos is
 * chosen.
 *
 * *err is set to 0, or to the error returned by
 * security_xfrm_policy_delete() when @delete is set and that check fails.
 *
 * Returns the policy with a reference taken by xfrm_pol_hold(), or NULL
 * when nothing matched.  When @delete is set and the security check
 * succeeds, the returned policy has been unlinked and killed.  When the
 * security check fails, the policy is returned still linked, with the
 * reference held and *err set.
 */
struct xfrm_policy *
xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id,
                      u8 type, int dir, struct xfrm_selector *sel,
                      struct xfrm_sec_ctx *ctx, int delete, int *err)
{
        /* stays NULL on the hash-chain path; set on the inexact path */
        struct xfrm_pol_inexact_bin *bin = NULL;
        struct xfrm_policy *pol, *ret = NULL;
        struct hlist_head *chain;

        *err = 0;
        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        chain = policy_hash_bysel(net, sel, sel->family, dir);
        if (!chain) {
                struct xfrm_pol_inexact_candidates cand;
                int i;

                bin = xfrm_policy_inexact_lookup(net, type,
                                                 sel->family, dir, if_id);
                if (!bin) {
                        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                        return NULL;
                }

                if (!xfrm_policy_find_inexact_candidates(&cand, bin,
                                                         &sel->saddr,
                                                         &sel->daddr)) {
                        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                        return NULL;
                }

                /* check every candidate list; keep the match with the
                 * lowest pos
                 */
                pol = NULL;
                for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
                        struct xfrm_policy *tmp;

                        tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
                                                      if_id, type, dir,
                                                      sel, ctx);
                        if (!tmp)
                                continue;

                        if (!pol || tmp->pos < pol->pos)
                                pol = tmp;
                }
        } else {
                pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
                                              sel, ctx);
        }

        if (pol) {
                xfrm_pol_hold(pol);
                if (delete) {
                        *err = security_xfrm_policy_delete(pol->security);
                        if (*err) {
                                /* deletion refused: return the policy still
                                 * linked, reference held
                                 */
                                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                                return pol;
                        }
                        __xfrm_policy_unlink(pol, dir);
                }
                ret = pol;
        }
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        if (ret && delete)
                xfrm_policy_kill(ret);
        /* the bin may have become empty; this takes the policy lock itself */
        if (bin && delete)
                xfrm_policy_inexact_prune_bin(bin);
        return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);

/* Look up a policy by index, optionally deleting it.
 *
 * *err is set to -ENOENT when the direction encoded in @id differs from
 * @dir (NULL is returned), otherwise to 0, or to the error returned by
 * security_xfrm_policy_delete() when @delete is set and that check fails.
 *
 * Returns the first policy in the byidx bucket matching @type, @id, @if_id
 * and @mark, with a reference taken by xfrm_pol_hold(), or NULL when there
 * is none.  When @delete is set and the security check succeeds, the
 * returned policy has been unlinked and killed.  When the security check
 * fails, the policy is returned still linked, with the reference held.
 */
struct xfrm_policy *
xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id,
                 u8 type, int dir, u32 id, int delete, int *err)
{
        struct xfrm_policy *pol, *ret;
        struct hlist_head *chain;

        *err = -ENOENT;
        if (xfrm_policy_id2dir(id) != dir)
                return NULL;

        *err = 0;
        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        chain = net->xfrm.policy_byidx + idx_hash(net, id);
        ret = NULL;
        hlist_for_each_entry(pol, chain, byidx) {
                if (pol->type == type && pol->index == id &&
                    pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) {
                        xfrm_pol_hold(pol);
                        if (delete) {
                                *err = security_xfrm_policy_delete(
                                                                pol->security);
                                if (*err) {
                                        /* deletion refused: return the policy
                                         * still linked, reference held
                                         */
                                        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                                        return pol;
                                }
                                __xfrm_policy_unlink(pol, dir);
                        }
                        ret = pol;
                        break;
                }
        }
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* the policy is killed only after the lock is dropped */
        if (ret && delete)
                xfrm_policy_kill(ret);
        return ret;
}
EXPORT_SYMBOL(xfrm_policy_byid);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/* Ask the security layer whether every live policy of @type in @net may be
 * deleted.  Dead policies and those whose index encodes a direction of
 * XFRM_POLICY_MAX or above are skipped.
 *
 * Returns 0 when all checks pass.  On the first refusal an audit record is
 * emitted for that policy and the error is returned.
 */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
        struct xfrm_policy *pol;
        int err;

        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                if (pol->walk.dead)
                        continue;
                if (xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX)
                        continue;
                if (pol->type != type)
                        continue;

                err = security_xfrm_policy_delete(pol->security);
                if (!err)
                        continue;

                xfrm_audit_policy_delete(pol, 0, task_valid);
                return err;
        }

        return 0;
}

/* Ask the security layer whether every live policy bound to @dev in @net
 * may be deleted.  Dead policies and those whose index encodes a direction
 * of XFRM_POLICY_MAX or above are skipped.
 *
 * Returns 0 when all checks pass.  On the first refusal an audit record is
 * emitted for that policy and the error is returned.
 */
static inline int xfrm_dev_policy_flush_secctx_check(struct net *net,
                                                     struct net_device *dev,
                                                     bool task_valid)
{
        struct xfrm_policy *pol;
        int err;

        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                if (pol->walk.dead)
                        continue;
                if (xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX)
                        continue;
                if (pol->xdo.dev != dev)
                        continue;

                err = security_xfrm_policy_delete(pol->security);
                if (!err)
                        continue;

                xfrm_audit_policy_delete(pol, 0, task_valid);
                return err;
        }

        return 0;
}
#else
/* Stub used when CONFIG_SECURITY_NETWORK_XFRM is not set: performs no check
 * and always returns 0.
 */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
        return 0;
}

/* Stub used when CONFIG_SECURITY_NETWORK_XFRM is not set: performs no check
 * and always returns 0.
 */
static inline int xfrm_dev_policy_flush_secctx_check(struct net *net,
                                                     struct net_device *dev,
                                                     bool task_valid)
{
        return 0;
}
#endif

/* Remove every live policy of @type from @net.
 *
 * xfrm_policy_flush_secctx_check() is run first; if it fails nothing is
 * removed and its error is returned.  Otherwise each matching policy is
 * unlinked under the policy lock, then audited and killed with the lock
 * dropped.  If at least one policy was removed the inexact bins are pruned.
 *
 * Returns 0 on success, the secctx check error, or -ESRCH when no policy
 * was removed.
 */
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
        int dir, err = 0, cnt = 0;
        struct xfrm_policy *pol;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        err = xfrm_policy_flush_secctx_check(net, type, task_valid);
        if (err)
                goto out;

again:
        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                if (pol->walk.dead)
                        continue;

                dir = xfrm_policy_id2dir(pol->index);
                if (dir >= XFRM_POLICY_MAX ||
                    pol->type != type)
                        continue;

                __xfrm_policy_unlink(pol, dir);
                /* audit and kill run without the policy lock */
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                cnt++;
                xfrm_audit_policy_delete(pol, 1, task_valid);
                xfrm_policy_kill(pol);
                spin_lock_bh(&net->xfrm.xfrm_policy_lock);
                /* the lock was dropped: restart the walk from the list head */
                goto again;
        }
        if (cnt)
                __xfrm_policy_inexact_flush(net);
        else
                err = -ESRCH;
out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);

/*
 * Remove every policy in @net whose xdo.dev is @dev.
 *
 * @task_valid is only forwarded to the security pre-check and to the
 * audit call.
 *
 * Returns 0 if at least one policy was removed, -ESRCH if no policy
 * matched, or the non-zero result of xfrm_dev_policy_flush_secctx_check(),
 * in which case nothing is removed.
 */
int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
                          bool task_valid)
{
        int dir, err = 0, cnt = 0;
        struct xfrm_policy *pol;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        /* The pre-check runs with the policy lock held. */
        err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid);
        if (err)
                goto out;

again:
        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                /* Entries flagged dead (walk cursors set this) are skipped. */
                if (pol->walk.dead)
                        continue;

                /* Skip entries whose index maps to a direction at or above
                 * XFRM_POLICY_MAX (presumably socket policies, which are
                 * linked with XFRM_POLICY_MAX + dir) and policies bound to
                 * a different device.
                 */
                dir = xfrm_policy_id2dir(pol->index);
                if (dir >= XFRM_POLICY_MAX ||
                    pol->xdo.dev != dev)
                        continue;

                /* Unlink under the lock, then drop the lock around the audit
                 * and kill calls. After retaking it the walk restarts from
                 * the list head instead of continuing from this entry.
                 */
                __xfrm_policy_unlink(pol, dir);
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                cnt++;
                xfrm_audit_policy_delete(pol, 1, task_valid);
                xfrm_policy_kill(pol);
                spin_lock_bh(&net->xfrm.xfrm_policy_lock);
                goto again;
        }
        /* Only flush the inexact state when something was actually removed. */
        if (cnt)
                __xfrm_policy_inexact_flush(net);
        else
                err = -ESRCH;
out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return err;
}
EXPORT_SYMBOL(xfrm_dev_policy_flush);

/*
 * Resumable iteration over @net's policy_all list.
 *
 * @func is called, with the policy lock held, for every entry that is not
 * flagged dead and whose type matches walk->type (XFRM_POLICY_TYPE_ANY
 * matches all). It receives the policy, its direction, the current
 * walk->seq and @data.
 *
 * Returns:
 *   -EINVAL  walk->type is neither a valid type nor XFRM_POLICY_TYPE_ANY;
 *   0        immediately, if the cursor is unlinked but walk->seq is
 *            non-zero (an earlier call already reached the end);
 *   non-zero value returned by @func; the cursor is then parked in the
 *            list so that a later call resumes at the same entry;
 *   -ENOENT  the end was reached and @func never succeeded (seq == 0);
 *   0        the end was reached after at least one successful callback.
 */
int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
                     int (*func)(struct xfrm_policy *, int, int, void*),
                     void *data)
{
        struct xfrm_policy *pol;
        struct xfrm_policy_walk_entry *x;
        int error = 0;

        if (walk->type >= XFRM_POLICY_TYPE_MAX &&
            walk->type != XFRM_POLICY_TYPE_ANY)
                return -EINVAL;

        if (list_empty(&walk->walk.all) && walk->seq != 0)
                return 0;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        /* Fresh walk: start at the list head. Resumed walk: start at the
         * entry that follows the parked cursor.
         */
        if (list_empty(&walk->walk.all))
                x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
        else
                x = list_first_entry(&walk->walk.all,
                                     struct xfrm_policy_walk_entry, all);

        list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
                /* Walk cursors are initialised with dead = 1, so cursors
                 * parked in the list are skipped here as well.
                 */
                if (x->dead)
                        continue;
                pol = container_of(x, struct xfrm_policy, walk);
                if (walk->type != XFRM_POLICY_TYPE_ANY &&
                    walk->type != pol->type)
                        continue;
                error = func(pol, xfrm_policy_id2dir(pol->index),
                             walk->seq, data);
                if (error) {
                        /* Park the cursor directly in front of the entry
                         * that was not consumed; the next call picks it up.
                         */
                        list_move_tail(&walk->walk.all, &x->all);
                        goto out;
                }
                walk->seq++;
        }
        if (walk->seq == 0) {
                error = -ENOENT;
                goto out;
        }
        /* End reached: take the cursor out of the list. */
        list_del_init(&walk->walk.all);
out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);

/* Prepare @walk for a fresh traversal restricted to policy @type. */
void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
{
        walk->type = type;
        walk->seq = 0;

        /* The embedded list entry is flagged dead so that list walkers
         * which skip dead entries never treat this cursor as a policy.
         */
        walk->walk.dead = 1;
        INIT_LIST_HEAD(&walk->walk.all);
}
EXPORT_SYMBOL(xfrm_policy_walk_init);

/* Finish a walk: if the cursor is still parked in a list, remove it under
 * the policy lock of @net. A cursor that is not linked needs no work.
 * The namespace is supplied by the caller through @net, which answers the
 * older FIXME note asking where it should come from.
 */
void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
{
        if (!list_empty(&walk->walk.all)) {
                spin_lock_bh(&net->xfrm.xfrm_policy_lock);
                list_del(&walk->walk.all);
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        }
}
EXPORT_SYMBOL(xfrm_policy_walk_done);

/*
 * Check whether policy @pol applies to flow @fl.
 *
 * The cheap attribute comparisons (family, if_id, mark, type) come first,
 * then the selector match, and only then the security lookup.
 *
 * Returns -ESRCH when the policy does not apply; otherwise the result of
 * security_xfrm_policy_lookup() (0 for a usable policy, else an -errno).
 */
static int xfrm_policy_match(const struct xfrm_policy *pol,
                             const struct flowi *fl,
                             u8 type, u16 family, u32 if_id)
{
        if (pol->family != family)
                return -ESRCH;

        if (pol->if_id != if_id)
                return -ESRCH;

        if ((fl->flowi_mark & pol->mark.m) != pol->mark.v)
                return -ESRCH;

        if (pol->type != type)
                return -ESRCH;

        if (!xfrm_selector_match(&pol->selector, fl, family))
                return -ESRCH;

        return security_xfrm_policy_lookup(pol->security, fl->flowi_secid);
}

/*
 * Search the rb-tree rooted at @r for the node matching @addr.
 *
 * The descent is lockless. A hit is returned immediately; a miss is only
 * trusted if the sequence counter @count did not change during the
 * descent, otherwise the search is repeated.
 *
 * Returns the matching node, or NULL if there is none.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
                                seqcount_spinlock_t *count,
                                const xfrm_address_t *addr, u16 family)
{
        const struct rb_node *cur;
        int seq;

        do {
                seq = read_seqcount_begin(count);

                cur = rcu_dereference_raw(r->rb_node);
                while (cur) {
                        struct xfrm_pol_inexact_node *candidate;
                        int delta;

                        candidate = rb_entry(cur, struct xfrm_pol_inexact_node,
                                             node);

                        delta = xfrm_policy_addr_delta(addr, &candidate->addr,
                                                       candidate->prefixlen,
                                                       family);
                        if (!delta)
                                return candidate;

                        if (delta < 0)
                                cur = rcu_dereference_raw(cur->rb_left);
                        else
                                cur = rcu_dereference_raw(cur->rb_right);
                }
        } while (read_seqcount_retry(count, seq));

        return NULL;
}

/*
 * Fill @cand with the candidate chains of bin @b for a flow going from
 * @saddr to @daddr.
 *
 * @cand is zeroed first. The bin's own chain always becomes the ANY
 * candidate; the DADDR, BOTH and SADDR slots are set only when the
 * matching tree lookup finds a node. BOTH is searched inside the tree of
 * the node found for @daddr.
 *
 * Returns false (leaving @cand untouched) when @b is NULL, true otherwise.
 */
static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
                                    struct xfrm_pol_inexact_bin *b,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr)
{
        struct xfrm_pol_inexact_node *by_daddr, *by_both, *by_saddr;
        u16 family;

        if (!b)
                return false;

        family = b->k.family;
        memset(cand, 0, sizeof(*cand));
        cand->res[XFRM_POL_CAND_ANY] = &b->hhead;

        by_daddr = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count,
                                                   daddr, family);
        if (by_daddr) {
                cand->res[XFRM_POL_CAND_DADDR] = &by_daddr->hhead;

                by_both = xfrm_policy_lookup_inexact_addr(&by_daddr->root,
                                                          &b->count, saddr,
                                                          family);
                if (by_both)
                        cand->res[XFRM_POL_CAND_BOTH] = &by_both->hhead;
        }

        by_saddr = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count,
                                                   saddr, family);
        if (by_saddr)
                cand->res[XFRM_POL_CAND_SADDR] = &by_saddr->hhead;

        return true;
}

/*
 * Look up the inexact bin for (@net, @type, @family, @dir, @if_id) in
 * xfrm_policy_inexact_table.
 *
 * Both callers visible in this file invoke it inside an RCU read-side
 * critical section. Returns whatever rhashtable_lookup() returns; callers
 * treat a NULL result as "no bin".
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
                               u8 dir, u32 if_id)
{
        /* Members not named here are zero-initialised by the initializer. */
        struct xfrm_pol_inexact_key k = {
                .family = family,
                .type = type,
                .dir = dir,
                .if_id = if_id,
        };

        /* The namespace member is set through its accessor. */
        write_pnet(&k.net, net);

        return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
                                 xfrm_pol_inexact_params);
}

/*
 * Variant of xfrm_policy_inexact_lookup_rcu() for callers that hold
 * net->xfrm.xfrm_policy_lock (asserted via lockdep). The RCU read-side
 * section is entered and left here around the lookup.
 *
 * NOTE(review): the bin pointer is returned after rcu_read_unlock();
 * presumably the policy lock held by the caller keeps it valid - confirm.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
                           u8 dir, u32 if_id)
{
        struct xfrm_pol_inexact_bin *bin;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        rcu_read_lock();
        bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
        rcu_read_unlock();

        return bin;
}

/*
 * Scan one candidate @chain for a policy that matches @fl and is at least
 * as good as @prefer (which may be NULL).
 *
 * Scanning stops at the first entry whose priority value exceeds the
 * priority of @prefer (or ~0u without @prefer); this relies on the chain
 * being ordered by ascending priority value, presumably maintained at
 * insertion time. On equal priority the entry with the smaller pos wins.
 *
 * Returns the chosen policy (possibly @prefer itself), NULL if the chain
 * offers nothing better, or an ERR_PTR() for match errors other than
 * -ESRCH.
 */
static struct xfrm_policy *
__xfrm_policy_eval_candidates(struct hlist_head *chain,
                              struct xfrm_policy *prefer,
                              const struct flowi *fl,
                              u8 type, u16 family, u32 if_id)
{
        u32 limit = prefer ? prefer->priority : ~0u;
        struct xfrm_policy *entry;

        if (!chain)
                return NULL;

        hlist_for_each_entry_rcu(entry, chain, bydst) {
                int rc;

                if (entry->priority > limit)
                        return NULL;

                rc = xfrm_policy_match(entry, fl, type, family, if_id);
                if (rc == -ESRCH)
                        continue;
                if (rc)
                        return ERR_PTR(rc);

                /* Match found. Keep @prefer if it ties on priority and
                 * carries the smaller position value.
                 */
                if (prefer && entry->priority == limit &&
                    prefer->pos < entry->pos)
                        return prefer;

                return entry;
        }

        return NULL;
}

/*
 * Evaluate every candidate chain in @cand in array order, carrying the
 * best policy found so far (initially @prefer) into each evaluation.
 *
 * Returns the final best policy (may be @prefer or NULL), or the first
 * ERR_PTR() reported by a chain evaluation.
 */
static struct xfrm_policy *
xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
                            struct xfrm_policy *prefer,
                            const struct flowi *fl,
                            u8 type, u16 family, u32 if_id)
{
        struct xfrm_policy *best = prefer;
        struct xfrm_policy *hit;
        int idx;

        for (idx = 0; idx < ARRAY_SIZE(cand->res); idx++) {
                hit = __xfrm_policy_eval_candidates(cand->res[idx], best,
                                                    fl, type, family, if_id);
                if (IS_ERR(hit))
                        return hit;

                if (hit)
                        best = hit;
        }

        return best;
}

/*
 * Find the policy of @type that applies to flow @fl in direction @dir.
 *
 * The direct hash chain is searched first, then the inexact candidates
 * are evaluated against that result. The whole lookup runs inside an RCU
 * read-side section and is repeated if the policy hash generation changed
 * meanwhile or if a reference on the result could not be taken.
 *
 * Returns:
 *   a policy on which xfrm_pol_hold_rcu() succeeded,
 *   NULL if nothing matched or the flow addresses are unavailable,
 *   an ERR_PTR() if a match attempt failed with an error other than
 *   -ESRCH (no reference is taken in that case).
 */
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
                                                     const struct flowi *fl,
                                                     u16 family, u8 dir,
                                                     u32 if_id)
{
        struct xfrm_pol_inexact_candidates cand;
        const xfrm_address_t *daddr, *saddr;
        struct xfrm_pol_inexact_bin *bin;
        struct xfrm_policy *pol, *ret;
        struct hlist_head *chain;
        unsigned int sequence;
        int err;

        daddr = xfrm_flowi_daddr(fl, family);
        saddr = xfrm_flowi_saddr(fl, family);
        if (unlikely(!daddr || !saddr))
                return NULL;

        rcu_read_lock();
 retry:
        /* Pick the direct hash chain under a stable hash generation. */
        do {
                sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
                chain = policy_hash_direct(net, daddr, saddr, family, dir);
        } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));

        /* First match in the direct chain wins; -ESRCH means "not this one". */
        ret = NULL;
        hlist_for_each_entry_rcu(pol, chain, bydst) {
                err = xfrm_policy_match(pol, fl, type, family, if_id);
                if (err) {
                        if (err == -ESRCH)
                                continue;
                        else {
                                ret = ERR_PTR(err);
                                goto fail;
                        }
                } else {
                        ret = pol;
                        break;
                }
        }
        /* A direct hit with packet offload type bypasses the inexact pass. */
        if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET)
                goto skip_inexact;

        bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
        if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
                                                         daddr))
                goto skip_inexact;

        /* The direct result (possibly NULL) is the policy to beat. */
        pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
                                          family, if_id);
        if (pol) {
                ret = pol;
                if (IS_ERR(pol))
                        goto fail;
        }

skip_inexact:
        /* Redo everything if the hash generation changed meanwhile. */
        if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence))
                goto retry;

        /* Failing to take a reference also triggers a fresh lookup. */
        if (ret && !xfrm_pol_hold_rcu(ret))
                goto retry;
fail:
        rcu_read_unlock();

        return ret;
}

/*
 * Policy lookup entry point for a flow.
 *
 * With CONFIG_XFRM_SUB_POLICY a XFRM_POLICY_TYPE_SUB lookup is tried
 * first and any non-NULL result is returned as is (as far as this code
 * shows, that includes an ERR_PTR()). Otherwise, or when the sub lookup
 * yields NULL, the XFRM_POLICY_TYPE_MAIN lookup result is returned.
 */
static struct xfrm_policy *xfrm_policy_lookup(struct net *net,
                                              const struct flowi *fl,
                                              u16 family, u8 dir, u32 if_id)
{
#ifdef CONFIG_XFRM_SUB_POLICY
        struct xfrm_policy *pol;

        pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family,
                                        dir, if_id);
        if (pol != NULL)
                return pol;
#endif
        return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family,
                                         dir, if_id);
}

/*
 * Look up the per-socket policy of @sk for direction @dir and check it
 * against flow @fl.
 *
 * Returns:
 *   the socket policy with a reference taken, if it applies;
 *   NULL if the socket has no policy for @dir, or if family, selector,
 *   mark, if_id or the security lookup (-ESRCH) rule it out;
 *   an ERR_PTR() for any other security lookup error.
 *
 * If the reference cannot be taken, the pointer is read again and the
 * checks are repeated.
 */
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
                                                 const struct flowi *fl,
                                                 u16 family, u32 if_id)
{
        struct xfrm_policy *pol;
        int err;

        rcu_read_lock();
 again:
        pol = rcu_dereference(sk->sk_policy[dir]);
        if (!pol)
                goto out;

        if (pol->family != family ||
            !xfrm_selector_match(&pol->selector, fl, family))
                goto no_match;

        if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v ||
            pol->if_id != if_id)
                goto no_match;

        err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid);
        if (err == -ESRCH)
                goto no_match;

        if (err)
                pol = ERR_PTR(err);
        else if (!xfrm_pol_hold_rcu(pol))
                goto again;

        goto out;

no_match:
        pol = NULL;
out:
        rcu_read_unlock();
        return pol;
}

/*
 * Renumber the pos field of every policy in @net for which
 * xfrm_policy_is_dead_or_sk() is false, walking from the tail of
 * policy_all (the oldest entry) towards the head and assigning 1, 2, ...
 *
 * Returns the last number handed out (0 if no policy was renumbered).
 */
static u32 xfrm_gen_pos_slow(struct net *net)
{
        struct xfrm_policy *entry;
        u32 last = 0;

        list_for_each_entry_reverse(entry, &net->xfrm.policy_all, walk.all) {
                if (xfrm_policy_is_dead_or_sk(entry))
                        continue;

                entry->pos = ++last;
        }

        return last;
}

/*
 * Compute a position value based on the newest eligible policy in @net.
 *
 * New policies are added at the head of policy_all, so the first entry
 * for which xfrm_policy_is_dead_or_sk() is false carries the highest
 * position. The result is that position plus one, or 0 when there is no
 * such entry. If the position is already UINT_MAX the slow renumbering
 * path runs instead and its return value is used.
 */
static u32 xfrm_gen_pos(struct net *net)
{
        const struct xfrm_policy *newest;

        list_for_each_entry(newest, &net->xfrm.policy_all, walk.all) {
                if (xfrm_policy_is_dead_or_sk(newest))
                        continue;

                if (newest->pos == UINT_MAX)
                        return xfrm_gen_pos_slow(net);

                return newest->pos + 1;
        }

        return 0;
}

/*
 * Add @pol at the head of its namespace's policy_all list, bump the
 * per-direction policy counter and take a reference on the policy.
 *
 * A position is generated only for the IN, FWD and OUT directions; any
 * other @dir value (socket policies pass XFRM_POLICY_MAX + dir) leaves
 * pol->pos untouched.
 */
static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
        struct net *net = xp_net(pol);

        if (dir == XFRM_POLICY_IN ||
            dir == XFRM_POLICY_FWD ||
            dir == XFRM_POLICY_OUT)
                pol->pos = xfrm_gen_pos(net);

        list_add(&pol->walk.all, &net->xfrm.policy_all);
        net->xfrm.policy_count[dir]++;
        xfrm_pol_hold(pol);
}

/*
 * Take @pol out of the policy_all list and, if it is hashed, out of the
 * bydst and byidx hash lists, then decrement the policy counter for @dir.
 *
 * Returns @pol on success, or NULL if the policy was not linked (its
 * walk.all list entry is empty), in which case nothing is changed.
 * The reference taken by __xfrm_policy_link() is not dropped here.
 */
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir)
{
        struct net *net = xp_net(pol);

        if (list_empty(&pol->walk.all))
                return NULL;

        /* Socket policies are not hashed. */
        if (!hlist_unhashed(&pol->bydst)) {
                /* bydst is removed with the RCU-aware helper; byidx is not. */
                hlist_del_rcu(&pol->bydst);
                hlist_del(&pol->byidx);
        }

        /* list_del_init() leaves walk.all empty, so a repeated unlink
         * hits the early return above.
         */
        list_del_init(&pol->walk.all);
        net->xfrm.policy_count[dir]--;

        return pol;
}

/* Link a socket policy: same as __xfrm_policy_link(), but the direction is
 * offset by XFRM_POLICY_MAX so it is accounted separately from the
 * IN/FWD/OUT directions.
 */
static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir)
{
        __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir);
}

/* Unlink a socket policy using the XFRM_POLICY_MAX-offset direction that
 * xfrm_sk_policy_link() uses. The return value of __xfrm_policy_unlink()
 * is ignored.
 */
static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir)
{
        __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir);
}

/*
 * Unlink @pol (direction @dir) under the policy lock and, if it was
 * linked, kill it after the lock has been released.
 *
 * Returns 0 on success, -ENOENT if the policy was not linked.
 */
int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
        struct net *net = xp_net(pol);
        struct xfrm_policy *unlinked;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        unlinked = __xfrm_policy_unlink(pol, dir);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        if (!unlinked)
                return -ENOENT;

        xfrm_policy_kill(unlinked);
        return 0;
}
EXPORT_SYMBOL(xfrm_policy_delete);

/*
 * Install @pol as the policy of socket @sk for direction @dir, replacing
 * any previous one. Passing a NULL @pol removes the current policy.
 *
 * Returns 0, or -EINVAL when CONFIG_XFRM_SUB_POLICY is enabled and @pol is
 * not of type XFRM_POLICY_TYPE_MAIN.
 */
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
        struct net *net = sock_net(sk);
        struct xfrm_policy *old_pol;

#ifdef CONFIG_XFRM_SUB_POLICY
        if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
                return -EINVAL;
#endif

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        old_pol = rcu_dereference_protected(sk->sk_policy[dir],
                                lockdep_is_held(&net->xfrm.xfrm_policy_lock));
        /* Stamp, index and link the new policy before it is published. */
        if (pol) {
                pol->curlft.add_time = ktime_get_real_seconds();
                pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
                xfrm_sk_policy_link(pol, dir);
        }
        /* Publish the new pointer (possibly NULL) to RCU readers. */
        rcu_assign_pointer(sk->sk_policy[dir], pol);
        if (old_pol) {
                if (pol)
                        xfrm_policy_requeue(old_pol, pol);

                /* Unlinking succeeds always. This is the only function
                 * allowed to delete or replace socket policy.
                 */
                xfrm_sk_policy_unlink(old_pol, dir);
        }
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* The old policy is killed only after the lock has been dropped. */
        if (old_pol) {
                xfrm_policy_kill(old_pol);
        }
        return 0;
}

/*
 * Create a copy of socket policy @old and link it as a socket policy for
 * direction @dir.
 *
 * Returns the new policy, or NULL if the allocation or the cloning of the
 * security context failed.
 */
static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
{
        struct net *net = xp_net(old);
        struct xfrm_policy *copy;

        copy = xfrm_policy_alloc(net, GFP_ATOMIC);
        if (!copy)
                return NULL;

        copy->selector = old->selector;
        if (security_xfrm_policy_clone(old->security, &copy->security)) {
                kfree(copy);
                return NULL;  /* ENOMEM */
        }

        copy->lft = old->lft;
        copy->curlft = old->curlft;
        copy->mark = old->mark;
        copy->if_id = old->if_id;
        copy->action = old->action;
        copy->flags = old->flags;
        copy->xfrm_nr = old->xfrm_nr;
        copy->index = old->index;
        copy->type = old->type;
        copy->family = old->family;
        memcpy(copy->xfrm_vec, old->xfrm_vec,
               copy->xfrm_nr * sizeof(struct xfrm_tmpl));

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        xfrm_sk_policy_link(copy, dir);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* Linking took its own reference; drop one here (presumably the
         * one obtained from the allocation).
         */
        xfrm_pol_put(copy);

        return copy;
}

/*
 * Clone the socket policies in slots 0 and 1 of @osk into @sk.
 *
 * Empty slots are skipped. Returns 0 on success or -ENOMEM as soon as one
 * clone fails; slots processed before the failure stay assigned.
 */
int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
        int dir, ret = 0;

        rcu_read_lock();
        for (dir = 0; dir < 2; dir++) {
                const struct xfrm_policy *src;
                struct xfrm_policy *dup;

                src = rcu_dereference(osk->sk_policy[dir]);
                if (!src)
                        continue;

                dup = clone_policy(src, dir);
                if (unlikely(!dup)) {
                        ret = -ENOMEM;
                        break;
                }

                rcu_assign_pointer(sk->sk_policy[dir], dup);
        }
        rcu_read_unlock();

        return ret;
}

/*
 * Ask the address-family specific get_saddr() handler to fill @saddr
 * according to @params.
 *
 * Returns -EINVAL if no afinfo is available for @family, otherwise the
 * handler's return value.
 */
static int
xfrm_get_saddr(unsigned short family, xfrm_address_t *saddr,
               const struct xfrm_dst_lookup_params *params)
{
        int err;
        const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

        if (unlikely(afinfo == NULL))
                return -EINVAL;
        err = afinfo->get_saddr(saddr, params);
        /* NOTE(review): there is no rcu_read_lock() in this function;
         * presumably xfrm_policy_get_afinfo() returns with the RCU read
         * lock held when it succeeds, which would also explain why the
         * NULL path above does not unlock - confirm against its definition.
         */
        rcu_read_unlock();
        return err;
}

/* Resolve list of templates for the flow, given policy.
 *
 * For each template of @policy a state is looked up with
 * xfrm_state_find() and, if valid, stored in @xfrm[].
 *
 * Returns the number of states stored (>= 0), or a negative errno. On
 * failure every state stored so far has been released again.
 */

static int
xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
                      struct xfrm_state **xfrm, unsigned short family)
{
        struct net *net = xp_net(policy);
        int nx;
        int i, error;
        xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
        xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
        xfrm_address_t tmp;

        for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
                struct xfrm_state *x;
                xfrm_address_t *remote = daddr;
                xfrm_address_t *local  = saddr;
                struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

                /* Tunnel, IPTFS and BEET templates carry their own
                 * endpoints. If the template's local address is "any", a
                 * source address is obtained via xfrm_get_saddr().
                 */
                if (tmpl->mode == XFRM_MODE_TUNNEL ||
                    tmpl->mode == XFRM_MODE_IPTFS ||
                    tmpl->mode == XFRM_MODE_BEET) {
                        remote = &tmpl->id.daddr;
                        local = &tmpl->saddr;
                        if (xfrm_addr_any(local, tmpl->encap_family)) {
                                struct xfrm_dst_lookup_params params;

                                memset(&params, 0, sizeof(params));
                                params.net = net;
                                params.oif = fl->flowi_oif;
                                params.daddr = remote;
                                error = xfrm_get_saddr(tmpl->encap_family, &tmp,
                                                       &params);
                                if (error)
                                        goto fail;
                                local = &tmp;
                        }
                }

                x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
                                    family, policy->if_id);
                /* A state with a direction set must be an outbound one. */
                if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR);
                        xfrm_state_put(x);
                        error = -EINVAL;
                        goto fail;
                }

                /* Valid state: keep it; later templates are resolved
                 * relative to this template's endpoints.
                 */
                if (x && x->km.state == XFRM_STATE_VALID) {
                        xfrm[nx++] = x;
                        daddr = remote;
                        saddr = local;
                        continue;
                }
                /* Unusable state: drop it and pick the error. A lookup
                 * that reported -ESRCH is turned into -EAGAIN.
                 */
                if (x) {
                        error = (x->km.state == XFRM_STATE_ERROR ?
                                 -EINVAL : -EAGAIN);
                        xfrm_state_put(x);
                } else if (error == -ESRCH) {
                        error = -EAGAIN;
                }

                /* Only optional templates may be left unresolved. */
                if (!tmpl->optional)
                        goto fail;
        }
        return nx;

fail:
        /* Release the states collected before the failure. */
        for (nx--; nx >= 0; nx--)
                xfrm_state_put(xfrm[nx]);
        return error;
}

/*
 * Resolve the templates of all @npols policies in @pols into states.
 *
 * With a single policy the states are written straight into @xfrm. With
 * several policies they are collected in a local array first and then
 * passed through xfrm_state_sort(), which writes into @xfrm.
 *
 * Returns the total number of states, -ENOBUFS if the running total plus
 * a policy's template count reaches XFRM_MAX_DEPTH, or the error from
 * xfrm_tmpl_resolve_one(). On failure all collected states are released.
 */
static int
xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
                  struct xfrm_state **xfrm, unsigned short family)
{
        struct xfrm_state *tp[XFRM_MAX_DEPTH];
        struct xfrm_state **out = (npols > 1) ? tp : xfrm;
        int total = 0;
        int err;
        int idx;

        for (idx = 0; idx < npols; idx++) {
                int got;

                if (total + pols[idx]->xfrm_nr >= XFRM_MAX_DEPTH) {
                        err = -ENOBUFS;
                        goto unwind;
                }

                got = xfrm_tmpl_resolve_one(pols[idx], fl, &out[total], family);
                if (got < 0) {
                        err = got;
                        goto unwind;
                }

                total += got;
        }

        /* found states are sorted for outbound processing */
        if (npols > 1)
                xfrm_state_sort(xfrm, out, total, family);

        return total;

unwind:
        while (--total >= 0)
                xfrm_state_put(out[total]);

        return err;
}

/* Return the DSCP value stored in the IPv4 part of @fl when @family is
 * AF_INET, and 0 for every other family.
 */
static dscp_t xfrm_get_dscp(const struct flowi *fl, int family)
{
        if (family != AF_INET)
                return 0;

        return fl->u.ip4.flowi4_dscp;
}

/*
 * Allocate an xfrm_dst for @family from the matching per-namespace
 * dst_ops.
 *
 * Returns the new entry, ERR_PTR(-EINVAL) if no afinfo exists for
 * @family, or ERR_PTR(-ENOBUFS) if dst_alloc() fails. Any family other
 * than AF_INET (and AF_INET6 when IPv6 is enabled) hits BUG().
 */
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
        const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct dst_ops *dst_ops;
        struct xfrm_dst *xdst;

        if (!afinfo)
                return ERR_PTR(-EINVAL);

        switch (family) {
        case AF_INET:
                dst_ops = &net->xfrm.xfrm4_dst_ops;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                dst_ops = &net->xfrm.xfrm6_dst_ops;
                break;
#endif
        default:
                BUG();
        }
        xdst = dst_alloc(dst_ops, NULL, DST_OBSOLETE_NONE, 0);

        if (likely(xdst)) {
                /* Zero the part of the structure that follows u.dst. */
                memset_after(xdst, 0, u.dst);
        } else
                xdst = ERR_PTR(-ENOBUFS);

        /* NOTE(review): afinfo is only used as an existence check here.
         * The unlock below has no matching rcu_read_lock() in this
         * function; presumably xfrm_policy_get_afinfo() returns with the
         * RCU read lock held on success - confirm against its definition.
         */
        rcu_read_unlock();

        return xdst;
}

/* For an IPv6 @dst, record its route cookie and @nfheader_len in @path.
 * Entries of any other family are left untouched.
 */
static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
                           int nfheader_len)
{
        if (dst->ops->family != AF_INET6)
                return;

        path->path_cookie = rt6_get_cookie(dst_rt6_info(dst));
        path->u.rt6.rt6i_nfheader_len = nfheader_len;
}

/*
 * Run the address-family specific fill_dst() handler on @xdst for device
 * @dev and flow @fl. The family is taken from the dst_ops of @xdst.
 *
 * Returns -EINVAL if no afinfo exists for that family, otherwise the
 * handler's return value.
 */
static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
                                const struct flowi *fl)
{
        const struct xfrm_policy_afinfo *afinfo =
                xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
        int err;

        if (!afinfo)
                return -EINVAL;

        err = afinfo->fill_dst(xdst, dev, fl);

        /* NOTE(review): no matching rcu_read_lock() in this function;
         * presumably xfrm_policy_get_afinfo() returns with the RCU read
         * lock held on success - confirm against its definition.
         */
        rcu_read_unlock();

        return err;
}


/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... Shortly, bundle a bundle.
 *
 * One xfrm_dst is allocated per state in @xfrm[0..@nx-1] and stored in
 * @bundle[]; the entries are chained as children of one another, and the
 * final route becomes the child of the last entry.
 *
 * Returns the first dst of the chain, or an ERR_PTR() on failure.
 */

static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
                                            struct xfrm_state **xfrm,
                                            struct xfrm_dst **bundle,
                                            int nx,
                                            const struct flowi *fl,
                                            struct dst_entry *dst)
{
        const struct xfrm_state_afinfo *afinfo;
        const struct xfrm_mode *inner_mode;
        struct net *net = xp_net(policy);
        unsigned long now = jiffies;
        struct net_device *dev;
        struct xfrm_dst *xdst_prev = NULL;
        struct xfrm_dst *xdst0 = NULL;
        int i = 0;
        int err;
        int header_len = 0;
        int nfheader_len = 0;
        int trailer_len = 0;
        int family = policy->selector.family;
        xfrm_address_t saddr, daddr;
        dscp_t dscp;

        xfrm_flowi_addr_get(fl, &saddr, &daddr, family);

        dscp = xfrm_get_dscp(fl, family);

        /* Hold the incoming route; each loop iteration stores the current
         * route in xdst->route and then either replaces it with a freshly
         * looked-up route or takes another hold on it.
         */
        dst_hold(dst);

        for (; i < nx; i++) {
                struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
                struct dst_entry *dst1 = &xdst->u.dst;

                err = PTR_ERR(xdst);
                if (IS_ERR(xdst)) {
                        dst_release(dst);
                        goto put_states;
                }

                /* The first entry is the bundle head; later ones are
                 * chained as child of their predecessor.
                 */
                bundle[i] = xdst;
                if (!xdst_prev)
                        xdst0 = xdst;
                else
                        /* Ref count is taken during xfrm_alloc_dst()
                         * No need to do dst_clone() on dst1
                         */
                        xfrm_dst_set_child(xdst_prev, &xdst->u.dst);

                /* A state whose selector family is AF_UNSPEC gets its inner
                 * mode derived from the current family.
                 */
                if (xfrm[i]->sel.family == AF_UNSPEC) {
                        inner_mode = xfrm_ip2inner_mode(xfrm[i],
                                                        xfrm_af2proto(family));
                        if (!inner_mode) {
                                err = -EAFNOSUPPORT;
                                dst_release(dst);
                                goto put_states;
                        }
                } else
                        inner_mode = &xfrm[i]->inner_mode;

                xdst->route = dst;
                dst_copy_metrics(dst1, dst);

                /* Non-transport modes look up a new route for the next
                 * level (saddr/daddr are passed by address to the lookup);
                 * transport mode keeps the current route and takes another
                 * reference on it.
                 */
                if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
                        __u32 mark = 0;
                        int oif;

                        if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
                                mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);

                        if (xfrm[i]->xso.type != XFRM_DEV_OFFLOAD_PACKET)
                                family = xfrm[i]->props.family;

                        oif = fl->flowi_oif ? : fl->flowi_l3mdev;
                        dst = xfrm_dst_lookup(xfrm[i], dscp, oif, &saddr,
                                              &daddr, family, mark);
                        err = PTR_ERR(dst);
                        if (IS_ERR(dst))
                                goto put_states;
                } else
                        dst_hold(dst);

                /* From here on the state is attached to this dst entry. */
                dst1->xfrm = xfrm[i];
                xdst->xfrm_genid = xfrm[i]->genid;

                dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
                dst1->lastuse = now;

                dst1->input = dst_discard;

                /* Output handler: mode callback if the state provides one,
                 * else the afinfo output for the inner mode's family, else
                 * discard.
                 */
                if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) {
                        dst1->output = xfrm[i]->mode_cbs->output;
                } else {
                        rcu_read_lock();
                        afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
                        if (likely(afinfo))
                                dst1->output = afinfo->output;
                        else
                                dst1->output = dst_discard_out;
                        rcu_read_unlock();
                }

                xdst_prev = xdst;

                /* Accumulate header/trailer space over all states. */
                header_len += xfrm[i]->props.header_len;
                if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
                        nfheader_len += xfrm[i]->props.header_len;
                trailer_len += xfrm[i]->props.trailer_len;
        }

        /* Terminate the chain with the final route.
         * NOTE(review): xdst_prev and xdst0 are dereferenced without a
         * NULL check, so this assumes nx >= 1 - confirm against callers.
         */
        xfrm_dst_set_child(xdst_prev, dst);
        xdst0->path = dst;

        err = -ENODEV;
        dev = dst->dev;
        if (!dev)
                goto free_dst;

        xfrm_init_path(xdst0, dst, nfheader_len);
        xfrm_init_pmtu(bundle, nx);

        /* Walk the chain from the head to the final route. Each entry is
         * filled in and given the header/trailer totals that remain at its
         * level; its own state's share is subtracted before moving on.
         */
        for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
             xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
                err = xfrm_fill_dst(xdst_prev, dev, fl);
                if (err)
                        goto free_dst;

                xdst_prev->u.dst.header_len = header_len;
                xdst_prev->u.dst.trailer_len = trailer_len;
                header_len -= xdst_prev->u.dst.xfrm->props.header_len;
                trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
        }

        return &xdst0->u.dst;

put_states:
        /* Release the states from index i onwards. Earlier states were
         * already attached to a dst entry (dst1->xfrm) and are presumably
         * released together with the bundle below - confirm.
         */
        for (; i < nx; i++)
                xfrm_state_put(xfrm[i]);
free_dst:
        if (xdst0)
                dst_release_immediate(&xdst0->u.dst);

        return ERR_PTR(err);
}

/*
 * Post-process a policy lookup result held in @pols[0] / *@num_pols.
 *
 * - No policy (*@num_pols == 0 or @pols[0] NULL): both counters are set
 *   to 0 and 0 is returned.
 * - @pols[0] is an ERR_PTR(): *@num_pols is set to 0 and the encoded
 *   error is returned.
 * - Otherwise *@num_xfrms becomes the template count of @pols[0]. With
 *   CONFIG_XFRM_SUB_POLICY, an allowing non-main @pols[0] triggers an
 *   extra main-policy lookup into @pols[1]; if found, both counters are
 *   extended, and if that lookup fails the policies are put and its
 *   error is returned.
 * - If any resulting policy's action is not XFRM_POLICY_ALLOW,
 *   *@num_xfrms is set to -1.
 *
 * Returns 0 or a negative errno as described above.
 */
static int xfrm_expand_policies(const struct flowi *fl, u16 family,
                                struct xfrm_policy **pols,
                                int *num_pols, int *num_xfrms)
{
        int i;

        if (*num_pols == 0 || !pols[0]) {
                *num_pols = 0;
                *num_xfrms = 0;
                return 0;
        }
        if (IS_ERR(pols[0])) {
                *num_pols = 0;
                return PTR_ERR(pols[0]);
        }

        *num_xfrms = pols[0]->xfrm_nr;

#ifdef CONFIG_XFRM_SUB_POLICY
        if (pols[0]->action == XFRM_POLICY_ALLOW &&
            pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
                /* The main policy is looked up for the OUT direction with
                 * the if_id of the first policy.
                 */
                pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
                                                    XFRM_POLICY_TYPE_MAIN,
                                                    fl, family,
                                                    XFRM_POLICY_OUT,
                                                    pols[0]->if_id);
                if (pols[1]) {
                        if (IS_ERR(pols[1])) {
                                /* *num_pols still counts only the entries
                                 * before pols[1] at this point.
                                 */
                                xfrm_pols_put(pols, *num_pols);
                                *num_pols = 0;
                                return PTR_ERR(pols[1]);
                        }
                        (*num_pols)++;
                        (*num_xfrms) += pols[1]->xfrm_nr;
                }
        }
#endif
        /* -1 signals that at least one policy does not allow the flow. */
        for (i = 0; i < *num_pols; i++) {
                if (pols[i]->action != XFRM_POLICY_ALLOW) {
                        *num_xfrms = -1;
                        break;
                }
        }

        return 0;

}

/*
 * Resolve the templates of @pols into states and build a bundle from them.
 *
 * Returns NULL when template resolution produced no states, an ERR_PTR()
 * when resolution or bundle creation failed, and otherwise the new bundle
 * with its policy array, counters and policy genid filled in.
 */
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
                               const struct flowi *fl, u16 family,
                               struct dst_entry *dst_orig)
{
        struct net *net = xp_net(pols[0]);
        struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
        struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
        struct xfrm_dst *xdst;
        struct dst_entry *dst;
        int nx;

        /* Number of resolved states, or a negative errno. */
        nx = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
        if (nx == 0)
                return NULL;
        if (nx < 0) {
                /* -EAGAIN is not accounted as a policy error. */
                if (nx != -EAGAIN)
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
                return ERR_PTR(nx);
        }

        dst = xfrm_bundle_create(pols[0], xfrm, bundle, nx, fl, dst_orig);
        if (IS_ERR(dst)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
                return ERR_CAST(dst);
        }

        xdst = (struct xfrm_dst *)dst;
        xdst->num_pols = num_pols;
        xdst->num_xfrms = nx;
        memcpy(xdst->pols, pols, num_pols * sizeof(struct xfrm_policy *));
        xdst->policy_genid = atomic_read(&pols[0]->genid);

        return xdst;
}

/*
 * Timer callback for a policy's hold queue (polq.hold_timer).
 *
 * Peeks at the first held skb, rebuilds its flow with the policy mark
 * temporarily applied, and repeats the lookup with XFRM_LOOKUP_QUEUE:
 *  - lookup error: the hold queue is purged;
 *  - result still flagged DST_XFRM_QUEUE: the timeout is doubled and the
 *    timer re-armed, or the queue is purged once the timeout has reached
 *    XFRM_QUEUE_TMO_MAX;
 *  - otherwise: every held skb is looked up again on its own and handed to
 *    dst_output(), or freed if its lookup fails.
 *
 * Every exit path calls xfrm_pol_put() once on the policy (presumably
 * balancing the reference taken when the timer was armed).
 */
static void xfrm_policy_queue_process(struct timer_list *t)
{
        struct sk_buff *skb;
        struct sock *sk;
        struct dst_entry *dst;
        struct xfrm_policy *pol = timer_container_of(pol, t, polq.hold_timer);
        struct net *net = xp_net(pol);
        struct xfrm_policy_queue *pq = &pol->polq;
        struct flowi fl;
        struct sk_buff_head list;
        __u32 skb_mark;

        /* Probe with the head of the queue; nothing to do if it is empty. */
        spin_lock(&pq->hold_queue.lock);
        skb = skb_peek(&pq->hold_queue);
        if (!skb) {
                spin_unlock(&pq->hold_queue.lock);
                goto out;
        }
        dst = skb_dst(skb);
        sk = skb->sk;

        /* Fixup the mark to support VTI. */
        skb_mark = skb->mark;
        skb->mark = pol->mark.v;
        xfrm_decode_session(net, skb, &fl, dst->ops->family);
        skb->mark = skb_mark;
        spin_unlock(&pq->hold_queue.lock);

        /* The extra hold is taken because the path dst is handed to
         * xfrm_lookup() as its dst_orig argument.
         */
        dst_hold(xfrm_dst_path(dst));
        dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE);
        if (IS_ERR(dst))
                goto purge_queue;

        /* Still a queueing dst: back off exponentially or give up. */
        if (dst->flags & DST_XFRM_QUEUE) {
                dst_release(dst);

                if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
                        goto purge_queue;

                pq->timeout = pq->timeout << 1;
                /* A zero return from mod_timer() means the timer was not
                 * pending, so the newly armed timer gets its own reference.
                 */
                if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
                        xfrm_pol_hold(pol);
                goto out;
        }

        /* The probe result is only a test; each skb is looked up below. */
        dst_release(dst);

        __skb_queue_head_init(&list);

        /* Move all held packets to a private list under the queue lock. */
        spin_lock(&pq->hold_queue.lock);
        pq->timeout = 0;
        skb_queue_splice_init(&pq->hold_queue, &list);
        spin_unlock(&pq->hold_queue.lock);

        while (!skb_queue_empty(&list)) {
                skb = __skb_dequeue(&list);

                /* Fixup the mark to support VTI. */
                skb_mark = skb->mark;
                skb->mark = pol->mark.v;
                xfrm_decode_session(net, skb, &fl, skb_dst(skb)->ops->family);
                skb->mark = skb_mark;

                dst_hold(xfrm_dst_path(skb_dst(skb)));
                dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
                if (IS_ERR(dst)) {
                        kfree_skb(skb);
                        continue;
                }

                /* Replace the queueing dst with the freshly resolved one. */
                nf_reset_ct(skb);
                skb_dst_drop(skb);
                skb_dst_set(skb, dst);

                dst_output(net, skb_to_full_sk(skb), skb);
        }

out:
        xfrm_pol_put(pol);
        return;

purge_queue:
        /* Drop everything that is still held and reset the back-off. */
        pq->timeout = 0;
        skb_queue_purge(&pq->hold_queue);
        xfrm_pol_put(pol);
}

/*
 * Output handler installed on queueing dummy bundles (see
 * xfrm_create_dummy_bundle()): instead of transmitting, park the skb on the
 * hold queue of the bundle's first policy and make sure the hold timer runs.
 *
 * Returns 0 when the skb was queued, or when it was dropped because
 * skb_fclone_busy() reported it busy; returns -EAGAIN (skb freed) when the
 * hold queue already holds more than XFRM_MAX_QUEUE_LEN packets.
 */
static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned long sched_next;
        struct dst_entry *dst = skb_dst(skb);
        struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
        struct xfrm_policy *pol = xdst->pols[0];
        struct xfrm_policy_queue *pq = &pol->polq;

        if (unlikely(skb_fclone_busy(sk, skb))) {
                kfree_skb(skb);
                return 0;
        }

        /* Queue length is checked before hold_queue.lock is taken. */
        if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
                kfree_skb(skb);
                return -EAGAIN;
        }

        skb_dst_force(skb);

        spin_lock_bh(&pq->hold_queue.lock);

        /* A zero timeout means no back-off is in progress: start at minimum. */
        if (!pq->timeout)
                pq->timeout = XFRM_QUEUE_TMO_MIN;

        sched_next = jiffies + pq->timeout;

        /* If a timer was pending, keep whichever expiry comes first and drop
         * the policy reference that went with the cancelled timer.
         */
        if (timer_delete(&pq->hold_timer)) {
                if (time_before(pq->hold_timer.expires, sched_next))
                        sched_next = pq->hold_timer.expires;
                xfrm_pol_put(pol);
        }

        __skb_queue_tail(&pq->hold_queue, skb);
        /* A zero return from mod_timer() means the timer was not pending, so
         * the armed timer takes a reference on the policy.
         */
        if (!mod_timer(&pq->hold_timer, sched_next))
                xfrm_pol_hold(pol);

        spin_unlock_bh(&pq->hold_queue.lock);

        return 0;
}

/*
 * Allocate a placeholder bundle for a flow whose policies could not be
 * turned into a real bundle.
 *
 * The freshly allocated xfrm_dst is returned untouched unless all of the
 * following hold: the caller asked for queueing (XFRM_LOOKUP_QUEUE),
 * sysctl_larval_drop is off, and the policies carry templates
 * (num_xfrms > 0).  In that case the dst is turned into a queueing dst:
 * it is flagged DST_XFRM_QUEUE, routed over xflo->dst_orig and outputs
 * through xdst_queue_output().
 *
 * Returns the xfrm_dst, or an ERR_PTR() on allocation failure, a missing
 * device on the original route (-ENODEV) or an xfrm_fill_dst() error.
 */
static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
                                                 struct xfrm_flo *xflo,
                                                 const struct flowi *fl,
                                                 int num_xfrms,
                                                 u16 family)
{
        struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
        struct dst_entry *orig, *qdst;
        struct net_device *dev;
        int err;

        if (IS_ERR(xdst))
                return xdst;

        /* Not a queueing candidate: hand back the bare dst. */
        if (!(xflo->flags & XFRM_LOOKUP_QUEUE))
                return xdst;
        if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0)
                return xdst;

        orig = xflo->dst_orig;
        qdst = &xdst->u.dst;

        /* One reference for ->route ... */
        dst_hold(orig);
        xdst->route = orig;

        dst_copy_metrics(qdst, orig);

        qdst->obsolete = DST_OBSOLETE_FORCE_CHK;
        qdst->flags |= DST_XFRM_QUEUE;
        qdst->lastuse = jiffies;

        qdst->input = dst_discard;
        qdst->output = xdst_queue_output;

        /* ... and one for the child link. */
        dst_hold(orig);
        xfrm_dst_set_child(xdst, orig);
        xdst->path = orig;

        xfrm_init_path((struct xfrm_dst *)qdst, orig, 0);

        dev = orig->dev;
        if (!dev) {
                err = -ENODEV;
                goto free_dst;
        }

        err = xfrm_fill_dst(xdst, dev, fl);
        if (err)
                goto free_dst;

        return xdst;

free_dst:
        dst_release(qdst);
        return ERR_PTR(err);
}

/*
 * Look up the policies for @fl and return a bundle for them.
 *
 * Returns:
 *  - NULL when no policy matches, or when bundle creation reported
 *    -EREMOTE (the policy references are dropped first);
 *  - a real bundle from xfrm_resolve_and_create_bundle();
 *  - a dummy bundle from xfrm_create_dummy_bundle() when the policies
 *    block, carry no templates, resolve to no states, or resolution
 *    returned -EAGAIN; the policies and both counters are copied into it;
 *  - an ERR_PTR() for any other failure, with the policy references
 *    dropped.
 */
static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
                                           const struct flowi *fl,
                                           u16 family, u8 dir,
                                           struct xfrm_flo *xflo, u32 if_id)
{
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int num_pols = 0, num_xfrms = 0, err;
        struct xfrm_dst *xdst;

        /* Resolve policies to use if we couldn't get them from
         * previous cache entry */
        num_pols = 1;
        pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
        err = xfrm_expand_policies(fl, family, pols,
                                           &num_pols, &num_xfrms);
        if (err < 0)
                goto inc_error;
        if (num_pols == 0)
                return NULL;
        /* Blocking policy (-1) or no templates (0): nothing to resolve. */
        if (num_xfrms <= 0)
                goto make_dummy_bundle;

        xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
                                              xflo->dst_orig);
        if (IS_ERR(xdst)) {
                err = PTR_ERR(xdst);
                /* -EREMOTE is reported to the caller as "no bundle". */
                if (err == -EREMOTE) {
                        xfrm_pols_put(pols, num_pols);
                        return NULL;
                }

                /* Only -EAGAIN falls back to a dummy bundle. */
                if (err != -EAGAIN)
                        goto error;
                goto make_dummy_bundle;
        } else if (xdst == NULL) {
                /* Templates resolved to no states at all. */
                num_xfrms = 0;
                goto make_dummy_bundle;
        }

        return xdst;

make_dummy_bundle:
        /* We found policies, but there's no bundles to instantiate:
         * either because the policy blocks, has no transformations or
         * we could not build template (no xfrm_states).*/
        xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family);
        if (IS_ERR(xdst)) {
                xfrm_pols_put(pols, num_pols);
                return ERR_CAST(xdst);
        }
        xdst->num_pols = num_pols;
        xdst->num_xfrms = num_xfrms;
        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);

        return xdst;

inc_error:
        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
        /* On the inc_error path xfrm_expand_policies() has already reset
         * num_pols to 0, so this put releases nothing there.
         */
        xfrm_pols_put(pols, num_pols);
        return ERR_PTR(err);
}

/*
 * Ask the address family's policy afinfo to build a blackhole route for
 * @dst_orig.
 *
 * Returns whatever ->blackhole_route() returns.  If no afinfo is registered
 * for @family, @dst_orig is released and ERR_PTR(-EINVAL) is returned.
 */
static struct dst_entry *make_blackhole(struct net *net, u16 family,
                                        struct dst_entry *dst_orig)
{
        const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct dst_entry *blackhole;

        if (!afinfo) {
                dst_release(dst_orig);
                return ERR_PTR(-EINVAL);
        }

        blackhole = afinfo->blackhole_route(net, dst_orig);
        /* NOTE(review): presumably pairs with an rcu_read_lock() taken by
         * xfrm_policy_get_afinfo() on success; confirm against that helper.
         */
        rcu_read_unlock();

        return blackhole;
}

/* Finds/creates a bundle for given flow and if_id
 *
 * @net:      namespace whose policies, sysctls and statistics are used
 * @dst_orig: route already resolved for the flow; its ops->family selects
 *            the address family of the lookup
 * @fl:       flow to match against policies
 * @sk:       originating socket or NULL; converted with
 *            sk_const_to_full_sk() and consulted for a per-socket output
 *            policy before the global lookup
 * @flags:    XFRM_LOOKUP_* flags (XFRM_LOOKUP_ICMP and
 *            XFRM_LOOKUP_KEEP_DST_REF are acted on here; the flags are also
 *            passed on to xfrm_bundle_lookup())
 * @if_id:    interface id passed to the policy lookups
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 *
 * Returns the dst to use: the bundle's dst when the flow is transformed
 * (the reference on @dst_orig is dropped), or @dst_orig itself when no
 * policy applies or the policy has no transformations.  On failure an
 * ERR_PTR() is returned and @dst_orig is released unless
 * XFRM_LOOKUP_KEEP_DST_REF is set.  Error codes produced here are -EPERM
 * (blocking policy or blocking default policy), -ENOENT (XFRM_LOOKUP_ICMP
 * lookups that find no policy flagged XFRM_POLICY_ICMP) and
 * -EREMOTE / -EAGAIN (policy present but no route on the bundle, with and
 * without sysctl_larval_drop respectively); errors from the policy and
 * bundle helpers are passed through.
 *
 * xfrm_lookup uses an if_id of 0 by default, and is provided for
 * compatibility
 */
struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                                        struct dst_entry *dst_orig,
                                        const struct flowi *fl,
                                        const struct sock *sk,
                                        int flags, u32 if_id)
{
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        struct xfrm_dst *xdst;
        struct dst_entry *dst, *route;
        u16 family = dst_orig->ops->family;
        u8 dir = XFRM_POLICY_OUT;
        /* drop_pols: how many entries of pols[] are put on exit.  It is only
         * set on the socket-policy paths that end up without a bundle.
         */
        int i, err, num_pols, num_xfrms = 0, drop_pols = 0;

        dst = NULL;
        xdst = NULL;
        route = NULL;

        /* Per-socket output policy takes the first shot. */
        sk = sk_const_to_full_sk(sk);
        if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
                num_pols = 1;
                pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
                                                if_id);
                err = xfrm_expand_policies(fl, family, pols,
                                           &num_pols, &num_xfrms);
                if (err < 0)
                        goto dropdst;

                if (num_pols) {
                        /* Blocking policy or no templates: no bundle needed. */
                        if (num_xfrms <= 0) {
                                drop_pols = num_pols;
                                goto no_transform;
                        }

                        xdst = xfrm_resolve_and_create_bundle(
                                        pols, num_pols, fl,
                                        family, dst_orig);

                        if (IS_ERR(xdst)) {
                                xfrm_pols_put(pols, num_pols);
                                err = PTR_ERR(xdst);
                                /* -EREMOTE is handled like "no policy". */
                                if (err == -EREMOTE)
                                        goto nopol;

                                goto dropdst;
                        } else if (xdst == NULL) {
                                /* Templates resolved to no states. */
                                num_xfrms = 0;
                                drop_pols = num_pols;
                                goto no_transform;
                        }

                        route = xdst->route;
                }
        }

        /* No bundle from a socket policy: fall back to the global lookup. */
        if (xdst == NULL) {
                struct xfrm_flo xflo;

                xflo.dst_orig = dst_orig;
                xflo.flags = flags;

                /* To accelerate a bit...  */
                if (!if_id && ((dst_orig->flags & DST_NOXFRM) ||
                               !net->xfrm.policy_count[XFRM_POLICY_OUT]))
                        goto nopol;

                xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
                if (xdst == NULL)
                        goto nopol;
                if (IS_ERR(xdst)) {
                        err = PTR_ERR(xdst);
                        goto dropdst;
                }

                /* Work on the policies recorded in the bundle. */
                num_pols = xdst->num_pols;
                num_xfrms = xdst->num_xfrms;
                memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
                route = xdst->route;
        }

        dst = &xdst->u.dst;
        if (route == NULL && num_xfrms > 0) {
                /* The only case when xfrm_bundle_lookup() returns a
                 * bundle with null route, is when the template could
                 * not be resolved. It means policies are there, but
                 * bundle could not be created, since we don't yet
                 * have the xfrm_state's. We need to wait for KM to
                 * negotiate new SA's or bail out with error.*/
                if (net->xfrm.sysctl_larval_drop) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
                        err = -EREMOTE;
                        goto error;
                }

                err = -EAGAIN;

                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
                goto error;
        }

no_transform:
        if (num_pols == 0)
                goto nopol;

        /* ICMP lookups only accept policies flagged XFRM_POLICY_ICMP. */
        if ((flags & XFRM_LOOKUP_ICMP) &&
            !(pols[0]->flags & XFRM_POLICY_ICMP)) {
                err = -ENOENT;
                goto error;
        }

        /* Record the time of use on every policy involved. */
        for (i = 0; i < num_pols; i++)
                WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds());

        if (num_xfrms < 0) {
                /* Prohibit the flow */
                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
                err = -EPERM;
                goto error;
        } else if (num_xfrms > 0) {
                /* Flow transformed */
                dst_release(dst_orig);
        } else {
                /* Flow passes untransformed */
                dst_release(dst);
                dst = dst_orig;
        }

ok:
        xfrm_pols_put(pols, drop_pols);
        /* Flag dsts whose outermost state is in tunnel or IPTFS mode. */
        if (dst->xfrm &&
            (dst->xfrm->props.mode == XFRM_MODE_TUNNEL ||
             dst->xfrm->props.mode == XFRM_MODE_IPTFS))
                dst->flags |= DST_XFRM_TUNNEL;
        return dst;

nopol:
        /* No policy: the default block policy applies unless the original
         * route has a device and that device is a loopback one.
         */
        if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) &&
            net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
                err = -EPERM;
                goto error;
        }
        /* Ordinary lookups simply continue on the original route. */
        if (!(flags & XFRM_LOOKUP_ICMP)) {
                dst = dst_orig;
                goto ok;
        }
        err = -ENOENT;
error:
        dst_release(dst);
dropdst:
        if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
                dst_release(dst_orig);
        xfrm_pols_put(pols, drop_pols);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(xfrm_lookup_with_ifid);

/* Main function: finds/creates a bundle for given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, const struct sock *sk,
                              int flags)
{
        return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
}
EXPORT_SYMBOL(xfrm_lookup);

/* Callers of xfrm_lookup_route() must ensure a call to dst_output().
 * Otherwise we may send out blackholed packets.
 *
 * Runs xfrm_lookup() with XFRM_LOOKUP_QUEUE and XFRM_LOOKUP_KEEP_DST_REF
 * added to @flags.  Because the lookup then keeps the reference on
 * @dst_orig when it fails, this function disposes of it itself: a -EREMOTE
 * failure hands @dst_orig to make_blackhole() and returns its result, any
 * other failure releases @dst_orig and returns the error pointer.
 */
struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                                    const struct flowi *fl,
                                    const struct sock *sk, int flags)
{
        struct dst_entry *dst;

        dst = xfrm_lookup(net, dst_orig, fl, sk,
                          flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF);
        if (!IS_ERR(dst))
                return dst;

        if (PTR_ERR(dst) == -EREMOTE)
                return make_blackhole(net, dst_orig->ops->family, dst_orig);

        dst_release(dst_orig);
        return dst;
}
EXPORT_SYMBOL(xfrm_lookup_route);

/*
 * Let the type of the secpath state at index @idx reject the packet through
 * its ->reject() hook.
 *
 * Returns 0 when the skb has no secpath, @idx is outside the secpath, or
 * the type has no reject hook; otherwise returns the hook's result.
 */
static inline int
xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
{
        struct sec_path *sp = skb_sec_path(skb);
        struct xfrm_state *x;

        if (!sp)
                return 0;
        if (idx < 0 || idx >= sp->len)
                return 0;

        x = sp->xvec[idx];
        if (x->type->reject)
                return x->type->reject(x, skb, fl);

        return 0;
}

/* When skb is transformed back to its "native" form, we have to
 * check policy restrictions. At the moment we make this in maximally
 * stupid way. Shame on me. :-) Of course, connected sockets must
 * have policy cached at them.
 */

/*
 * Check whether state @x satisfies template @tmpl.
 *
 * For states for which xfrm_state_kern() is true, the template must be
 * optional and the addresses must compare equal for the template's
 * encap_family.
 *
 * For all other states every one of these must hold:
 *  - same protocol;
 *  - same SPI, unless the template SPI is 0;
 *  - same reqid, unless the template reqid is 0;
 *  - same mode;
 *  - the template accepts all algorithms, or its aalgos mask includes the
 *    state's aalgo, or the template protocol does not match
 *    IPSEC_PROTO_ANY;
 *  - in any mode other than transport, the addresses compare equal for
 *    @family;
 *  - @if_id is 0 or equals the state's if_id.
 *
 * Returns non-zero on a match, 0 otherwise.
 */
static inline int
xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
              unsigned short family, u32 if_id)
{
        if (xfrm_state_kern(x))
                return tmpl->optional &&
                       !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);

        if (x->id.proto != tmpl->id.proto)
                return 0;

        /* A zero SPI or reqid in the template acts as a wildcard. */
        if (tmpl->id.spi && x->id.spi != tmpl->id.spi)
                return 0;
        if (tmpl->reqid && x->props.reqid != tmpl->reqid)
                return 0;

        if (x->props.mode != tmpl->mode)
                return 0;

        if (!tmpl->allalgs &&
            !(tmpl->aalgos & (1<<x->props.aalgo)) &&
            xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))
                return 0;

        /* Addresses are only compared outside transport mode. */
        if (x->props.mode != XFRM_MODE_TRANSPORT &&
            xfrm_state_addr_cmp(tmpl, x, family))
                return 0;

        return if_id == 0 || if_id == x->if_id;
}

/*
 * Match template @tmpl against the secpath entries of @sp from index
 * @start onwards.
 *
 * Return value:
 *  - 0 or more: validation succeeded.  This is either @start itself (an
 *    optional transport-mode template is bypassed, or an optional template
 *    found no match) or the index following the secpath state that matched
 *    the template.
 *  - -1: the template is required and no matching state was found.
 *  - "-2 - errored_index": the template is required and the search stopped
 *    at a non-transport secpath entry that neither matched nor had been
 *    verified before.
 */
static inline int
xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
               unsigned short family, u32 if_id)
{
        int result = start;
        int idx;

        if (tmpl->optional) {
                if (tmpl->mode == XFRM_MODE_TRANSPORT)
                        return start;
        } else {
                /* A required template fails unless a state matches. */
                result = -1;
        }

        for (idx = start; idx < sp->len; idx++) {
                const struct xfrm_state *x = sp->xvec[idx];

                if (xfrm_state_ok(tmpl, x, family, if_id))
                        return idx + 1;

                /* Non-matching transport-mode states are skipped. */
                if (x->props.mode == XFRM_MODE_TRANSPORT)
                        continue;

                /* Secpath entry previously verified, consider optional and
                 * continue searching
                 */
                if (idx < sp->verified_cnt)
                        continue;

                if (result == -1)
                        result = -2 - idx;
                break;
        }

        return result;
}

/*
 * Fill the IPv4 part of @fl from dissected flow keys.
 *
 * The flowi4 is zeroed first.  With @reverse set, source and destination
 * addresses and ports are swapped.  The GRE key (for IPPROTO_GRE) or the
 * ICMP type and code (for IPPROTO_ICMP) are copied, followed by the
 * protocol and the DSCP derived from the dissected TOS byte.
 */
static void
decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse)
{
        struct flowi4 *fl4 = &fl->u.ip4;

        memset(fl4, 0, sizeof(*fl4));

        fl4->saddr = reverse ? flkeys->addrs.ipv4.dst : flkeys->addrs.ipv4.src;
        fl4->daddr = reverse ? flkeys->addrs.ipv4.src : flkeys->addrs.ipv4.dst;
        fl4->fl4_sport = reverse ? flkeys->ports.dst : flkeys->ports.src;
        fl4->fl4_dport = reverse ? flkeys->ports.src : flkeys->ports.dst;

        if (flkeys->basic.ip_proto == IPPROTO_GRE) {
                fl4->fl4_gre_key = flkeys->gre.keyid;
        } else if (flkeys->basic.ip_proto == IPPROTO_ICMP) {
                fl4->fl4_icmp_type = flkeys->icmp.type;
                fl4->fl4_icmp_code = flkeys->icmp.code;
        }

        fl4->flowi4_proto = flkeys->basic.ip_proto;
        fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Fill the IPv6 part of @fl from dissected flow keys.
 *
 * The flowi6 is zeroed first.  With @reverse set, source and destination
 * addresses and ports are swapped.  The GRE key (for IPPROTO_GRE) or the
 * ICMPv6 type and code (for IPPROTO_ICMPV6) are copied, followed by the
 * protocol.
 */
static void
decode_session6(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse)
{
        struct flowi6 *fl6 = &fl->u.ip6;

        memset(fl6, 0, sizeof(*fl6));

        fl6->saddr = reverse ? flkeys->addrs.ipv6.dst : flkeys->addrs.ipv6.src;
        fl6->daddr = reverse ? flkeys->addrs.ipv6.src : flkeys->addrs.ipv6.dst;
        fl6->fl6_sport = reverse ? flkeys->ports.dst : flkeys->ports.src;
        fl6->fl6_dport = reverse ? flkeys->ports.src : flkeys->ports.dst;

        if (flkeys->basic.ip_proto == IPPROTO_GRE) {
                fl6->fl6_gre_key = flkeys->gre.keyid;
        } else if (flkeys->basic.ip_proto == IPPROTO_ICMPV6) {
                fl6->fl6_icmp_type = flkeys->icmp.type;
                fl6->fl6_icmp_code = flkeys->icmp.code;
        }

        fl6->flowi6_proto = flkeys->basic.ip_proto;
}
#endif

/*
 * Build the flow description @fl for @skb.
 *
 * The packet is dissected with xfrm_session_dissector (stopping at
 * encapsulation) and the per-family decoder fills the address, port and
 * protocol fields; a non-zero @reverse swaps source and destination.  The
 * flow mark is taken from skb->mark.  The flow's oif is skb->skb_iif for a
 * reverse decode, otherwise the ifindex of the skb's dst device, or 0 if
 * the skb has no dst or the dst has no device.
 *
 * Returns -EAFNOSUPPORT for a family that is not handled here, otherwise
 * the result of security_xfrm_decode_session().
 */
int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl,
                          unsigned int family, int reverse)
{
        struct xfrm_flow_keys flkeys;

        memset(&flkeys, 0, sizeof(flkeys));
        __skb_flow_dissect(net, skb, &xfrm_session_dissector, &flkeys,
                           NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

        switch (family) {
        case AF_INET:
                decode_session4(&flkeys, fl, reverse);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                decode_session6(&flkeys, fl, reverse);
                break;
#endif
        default:
                return -EAFNOSUPPORT;
        }

        fl->flowi_mark = skb->mark;

        if (reverse)
                fl->flowi_oif = skb->skb_iif;
        else if (skb_dst(skb) && skb_dst(skb)->dev)
                fl->flowi_oif = skb_dst(skb)->dev->ifindex;
        else
                fl->flowi_oif = 0;

        return security_xfrm_decode_session(skb, &fl->flowi_secid);
}
EXPORT_SYMBOL(__xfrm_decode_session);

/*
 * Scan the secpath from index @k for the first state whose mode is not
 * XFRM_MODE_TRANSPORT.
 *
 * Returns 1 and stores the index of that state in *@idxp if one exists;
 * returns 0 and leaves *@idxp untouched otherwise.
 */
static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
{
        int pos;

        for (pos = k; pos < sp->len; pos++) {
                if (sp->xvec[pos]->props.mode == XFRM_MODE_TRANSPORT)
                        continue;

                *idxp = pos;
                return 1;
        }

        return 0;
}

/*
 * Tell whether the flow describes one of the ICMP error messages handled
 * here: destination unreachable or time exceeded for IPv4, and destination
 * unreachable, packet too big or time exceeded for IPv6 (the IPv6 check is
 * compiled in only with CONFIG_IPV6).
 */
static bool icmp_err_packet(const struct flowi *fl, unsigned short family)
{
        if (family == AF_INET) {
                const struct flowi4 *fl4 = &fl->u.ip4;

                if (fl4->flowi4_proto != IPPROTO_ICMP)
                        return false;

                switch (fl4->fl4_icmp_type) {
                case ICMP_DEST_UNREACH:
                case ICMP_TIME_EXCEEDED:
                        return true;
                default:
                        return false;
                }
        }

#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6) {
                const struct flowi6 *fl6 = &fl->u.ip6;

                if (fl6->flowi6_proto != IPPROTO_ICMPV6)
                        return false;

                switch (fl6->fl6_icmp_type) {
                case ICMPV6_DEST_UNREACH:
                case ICMPV6_PKT_TOOBIG:
                case ICMPV6_TIME_EXCEED:
                        return true;
                default:
                        return false;
                }
        }
#endif
        return false;
}

/*
 * Decode the flow of the packet embedded in an ICMP error message.
 *
 * Works on a clone of @skb: the outer IP (or IPv6) header and the ICMP
 * (or ICMPv6) header are pulled, the remainder is decoded in the reverse
 * direction into @fl1, and oif, mark and dscp are then copied from the
 * outer flow @fl before nf_nat_decode_session() is applied.
 *
 * Note the inverted sense of the result: returns false when @fl1 has been
 * filled in, and true when the clone, the pull or the decode failed.
 */
static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family,
                                  const struct flowi *fl, struct flowi *fl1)
{
        struct sk_buff *inner = skb_clone(skb, GFP_ATOMIC);
        bool failed = true;
        int hdrlen;

        if (family == AF_INET)
                hdrlen = sizeof(struct iphdr) + sizeof(struct icmphdr);
        else
                hdrlen = sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr);

        if (!inner)
                return true;

        if (!pskb_pull(inner, hdrlen))
                goto free_clone;

        skb_reset_network_header(inner);

        if (xfrm_decode_session_reverse(dev_net(skb->dev), inner, fl1, family) < 0)
                goto free_clone;

        fl1->flowi_oif = fl->flowi_oif;
        fl1->flowi_mark = fl->flowi_mark;
        fl1->flowi_dscp = fl->flowi_dscp;
        nf_nat_decode_session(inner, fl1, family);
        failed = false;

free_clone:
        consume_skb(inner);
        return failed;
}

/*
 * For an ICMP error message, test selector @sel against the flow of the
 * packet embedded in the error rather than against the error itself.
 *
 * Returns false when @fl is not one of the handled ICMP errors or when the
 * inner flow cannot be decoded; otherwise returns the result of
 * xfrm_selector_match() on the inner flow.
 */
static bool xfrm_selector_inner_icmp_match(struct sk_buff *skb, unsigned short family,
                                           const struct xfrm_selector *sel,
                                           const struct flowi *fl)
{
        struct flowi fl1;

        if (!icmp_err_packet(fl, family))
                return false;

        /* A true return means the inner flow could not be decoded. */
        if (xfrm_icmp_flow_decode(skb, family, fl, &fl1))
                return false;

        return xfrm_selector_match(sel, &fl1, family);
}

/*
 * For an ICMP error message, look up the forward policy that matches the
 * flow of the packet embedded in the error.
 *
 * Returns the policy found, or NULL when @fl is not one of the handled
 * ICMP errors, the inner flow cannot be decoded, no policy matches, or the
 * lookup returned an error pointer.
 */
static inline struct xfrm_policy *
xfrm_in_fwd_icmp(struct sk_buff *skb,
                 const struct flowi *fl, unsigned short family,
                 u32 if_id)
{
        struct xfrm_policy *pol;
        struct flowi fl1;
        struct net *net;

        if (!icmp_err_packet(fl, family))
                return NULL;

        net = dev_net(skb->dev);

        /* A true return means the inner flow could not be decoded. */
        if (xfrm_icmp_flow_decode(skb, family, fl, &fl1))
                return NULL;

        pol = xfrm_policy_lookup(net, &fl1, family, XFRM_POLICY_FWD, if_id);

        return IS_ERR(pol) ? NULL : pol;
}

/*
 * For an ICMP error message, try to find a transforming dst for the flow of
 * the packet embedded in the error.
 *
 * Returns @dst unchanged when @fl is not one of the handled ICMP errors,
 * the inner flow cannot be decoded, the lookup fails, or the lookup result
 * carries no xfrm state (that result is released).  Otherwise the reference
 * on @dst is dropped and the looked-up dst is returned in its place.
 */
static inline struct dst_entry *
xfrm_out_fwd_icmp(struct sk_buff *skb, struct flowi *fl,
                  unsigned short family, struct dst_entry *dst)
{
        struct dst_entry *dst2;
        struct flowi fl1;
        struct net *net;

        if (!icmp_err_packet(fl, family))
                return dst;

        net = dev_net(skb->dev);

        /* A true return means the inner flow could not be decoded. */
        if (xfrm_icmp_flow_decode(skb, family, fl, &fl1))
                return dst;

        /* Extra reference for the dst_orig argument of xfrm_lookup(). */
        dst_hold(dst);

        dst2 = xfrm_lookup(net, dst, &fl1, NULL, (XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_ICMP));
        if (IS_ERR(dst2))
                return dst;

        if (!dst2->xfrm) {
                dst_release(dst2);
                return dst;
        }

        dst_release(dst);
        return dst2;
}

int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
                        unsigned short family)
{
        struct net *net = dev_net(skb->dev);
        struct xfrm_policy *pol;
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int npols = 0;
        int xfrm_nr;
        int pi;
        int reverse;
        struct flowi fl;
        int xerr_idx = -1;
        const struct xfrm_if_cb *ifcb;
        struct sec_path *sp;
        u32 if_id = 0;

        rcu_read_lock();
        ifcb = xfrm_if_get_cb();

        if (ifcb) {
                struct xfrm_if_decode_session_result r;

                if (ifcb->decode_session(skb, family, &r)) {
                        if_id = r.if_id;
                        net = r.net;
                }
        }
        rcu_read_unlock();

        reverse = dir & ~XFRM_POLICY_MASK;
        dir &= XFRM_POLICY_MASK;

        if (__xfrm_decode_session(net, skb, &fl, family, reverse) < 0) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
                return 0;
        }

        nf_nat_decode_session(skb, &fl, family);

        /* First, check used SA against their selectors. */
        sp = skb_sec_path(skb);
        if (sp) {
                int i;

                for (i = sp->len - 1; i >= 0; i--) {
                        struct xfrm_state *x = sp->xvec[i];
                        int ret = 0;

                        if (!xfrm_selector_match(&x->sel, &fl, family)) {
                                ret = 1;
                                if (x->props.flags & XFRM_STATE_ICMP &&
                                    xfrm_selector_inner_icmp_match(skb, family, &x->sel, &fl))
                                        ret = 0;
                                if (ret) {
                                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
                                        return 0;
                                }
                        }
                }
        }

        pol = NULL;
        sk = sk_to_full_sk(sk);
        if (sk && sk->sk_policy[dir]) {
                pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
                if (IS_ERR(pol)) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                        return 0;
                }
        }

        if (!pol)
                pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);

        if (IS_ERR(pol)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                return 0;
        }

        if (!pol && dir == XFRM_POLICY_FWD)
                pol = xfrm_in_fwd_icmp(skb, &fl, family, if_id);

        if (!pol) {
                const bool is_crypto_offload = sp &&
                        (xfrm_input_state(skb)->xso.type == XFRM_DEV_OFFLOAD_CRYPTO);

                if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
                        return 0;
                }

                if (sp && secpath_has_nontransport(sp, 0, &xerr_idx) && !is_crypto_offload) {
                        xfrm_secpath_reject(xerr_idx, skb, &fl);
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
                        return 0;
                }
                return 1;
        }

        /* This lockless write can happen from different cpus. */
        WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds());

        pols[0] = pol;
        npols++;
#ifdef CONFIG_XFRM_SUB_POLICY
        if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
                pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
                                                    &fl, family,
                                                    XFRM_POLICY_IN, if_id);
                if (pols[1]) {
                        if (IS_ERR(pols[1])) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                                xfrm_pol_put(pols[0]);
                                return 0;
                        }
                        /* This write can happen from different cpus. */
                        WRITE_ONCE(pols[1]->curlft.use_time,
                                   ktime_get_real_seconds());
                        npols++;
                }
        }
#endif

        if (pol->action == XFRM_POLICY_ALLOW) {
                static struct sec_path dummy;
                struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
                struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
                struct xfrm_tmpl **tpp = tp;
                int ti = 0;
                int i, k;

                sp = skb_sec_path(skb);
                if (!sp)
                        sp = &dummy;

                for (pi = 0; pi < npols; pi++) {
                        if (pols[pi] != pol &&
                            pols[pi]->action != XFRM_POLICY_ALLOW) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
                                goto reject;
                        }
                        if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
                                goto reject_error;
                        }
                        for (i = 0; i < pols[pi]->xfrm_nr; i++)
                                tpp[ti++] = &pols[pi]->xfrm_vec[i];
                }
                xfrm_nr = ti;

                if (npols > 1) {
                        xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
                        tpp = stp;
                }

                /* For each tunnel xfrm, find the first matching tmpl.
                 * For each tmpl before that, find corresponding xfrm.
                 * Order is _important_. Later we will implement
                 * some barriers, but at the moment barriers
                 * are implied between each two transformations.
                 * Upon success, marks secpath entries as having been
                 * verified to allow them to be skipped in future policy
                 * checks (e.g. nested tunnels).
                 */
                for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
                        k = xfrm_policy_ok(tpp[i], sp, k, family, if_id);
                        if (k < 0) {
                                if (k < -1)
                                        /* "-2 - errored_index" returned */
                                        xerr_idx = -(2+k);
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
                                goto reject;
                        }
                }

                if (secpath_has_nontransport(sp, k, &xerr_idx)) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
                        goto reject;
                }

                xfrm_pols_put(pols, npols);
                sp->verified_cnt = k;

                return 1;
        }
        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);

reject:
        xfrm_secpath_reject(xerr_idx, skb, &fl);
reject_error:
        xfrm_pols_put(pols, npols);
        return 0;
}
EXPORT_SYMBOL(__xfrm_policy_check);

/*
 * Run a forwarded packet through the xfrm output lookup.
 *
 * The flow of @skb is decoded for @family, the dst currently attached to
 * the skb is handed to xfrm_lookup(), and whatever dst results is attached
 * to the skb again.
 *
 * Returns 1 on success.  Returns 0 when the session cannot be decoded,
 * when the skb has no dst after skb_dst_force(), or when xfrm_lookup()
 * returns an error pointer (the skb dst is then set to NULL).
 */
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct net *net = dev_net(skb->dev);
        struct dst_entry *dst;
        struct flowi fl;

        if (xfrm_decode_session(net, skb, &fl, family) < 0)
                goto hdr_error;

        skb_dst_force(skb);
        dst = skb_dst(skb);
        if (!dst)
                goto hdr_error;

        /* The result of skb_dstref_steal() is deliberately ignored:
         * xfrm_lookup() drops the refcnt itself when that is needed.
         */
        skb_dstref_steal(skb);

        dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE);
        if (IS_ERR(dst)) {
                skb_dst_set(skb, NULL);
                return 0;
        }

        if (dst && !dst->xfrm)
                dst = xfrm_out_fwd_icmp(skb, &fl, family, dst);

        skb_dst_set(skb, dst);
        return 1;

hdr_error:
        XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
        return 0;
}
EXPORT_SYMBOL(__xfrm_route_forward);

/* Optimize later using cookies and generation ids. */

/*
 * dst_ops->check handler used for XFRM dsts.
 *
 * Returns @dst when it may still be used and NULL otherwise.  @cookie is
 * not consulted.
 */
static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
        /* Code such as __xfrm4_bundle_create() stores
         * DST_OBSOLETE_FORCE_CHK in dst->obsolete so that every XFRM
         * destination is validated through dst_ops->check on each use.
         * This is needed because, when a normal route referenced by an
         * XFRM dst becomes obsolete, nobody walks the parent XFRM dsts
         * to invalidate them; that would be too much work.  The
         * validation is done here instead.  For example:
         *
         *        XFRM dst A --> IPv4 dst X
         *
         * X is the "xdst->route" of A (and also the "dst->path" of A in
         * this example).  If X is marked obsolete, A does not notice by
         * itself; the stale_bundle() check below is what catches it.
         *
         * When a dst is removed from the fib tree it is marked
         * DST_OBSOLETE_DEAD, which makes stale_bundle() fail for every
         * xdst bundle that links to it.
         */
        if (READ_ONCE(dst->obsolete) >= 0)
                return NULL;

        return stale_bundle(dst) ? NULL : dst;
}

/* Returns nonzero when the bundle headed by @dst fails xfrm_bundle_ok(). */
static int stale_bundle(struct dst_entry *dst)
{
        struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

        return !xfrm_bundle_ok(xdst);
}

/*
 * Detach the children of @dst from @dev.
 *
 * Walks the child chain of @dst for as long as the child carries an xfrm
 * state and still points at @dev.  Each such child is re-pointed at
 * blackhole_netdev; a reference is taken on the new device and one
 * reference on @dev is dropped.
 */
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
        for (dst = xfrm_dst_child(dst);
             dst && dst->xfrm && dst->dev == dev;
             dst = xfrm_dst_child(dst)) {
                dst->dev = blackhole_netdev;
                dev_hold(dst->dev);
                dev_put(dev);
        }
}
EXPORT_SYMBOL(xfrm_dst_ifdown);

/* dst_ops->link_failure handler installed by xfrm_policy_register_afinfo().
 * Intentionally empty; @skb is not touched.
 */
static void xfrm_link_failure(struct sk_buff *skb)
{
        /* Impossible: such a dst must be popped before it reaches the point of failure. */
}

/*
 * dst_ops->negative_advice handler: reset the dst cached on @sk, but only
 * when @dst has a nonzero obsolete marker.
 */
static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst)
{
        if (!READ_ONCE(dst->obsolete))
                return;

        sk_dst_reset(sk);
}

/*
 * Initialise the MTU metric of every entry in @bundle.
 *
 * The array is processed from index @nr - 1 down to 0.  For each entry the
 * child MTU and the route MTU are cached in the xfrm_dst, and RTAX_MTU is
 * set to the state-adjusted child MTU, capped at the route MTU.
 */
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
{
        while (nr--) {
                struct xfrm_dst *xdst = bundle[nr];
                struct dst_entry *dst = &xdst->u.dst;
                u32 child_mtu = dst_mtu(xfrm_dst_child(dst));
                u32 state_mtu, route_mtu;

                xdst->child_mtu_cached = child_mtu;
                state_mtu = xfrm_state_mtu(dst->xfrm, child_mtu);

                route_mtu = dst_mtu(xdst->route);
                xdst->route_mtu_cached = route_mtu;

                /* never advertise more than the underlying route allows */
                dst_metric_set(dst, RTAX_MTU,
                               state_mtu > route_mtu ? route_mtu : state_mtu);
        }
}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 *
 * Returns 1 when the bundle headed by @first may still be used and 0 when
 * it is stale.  As a side effect the cached child/route MTUs of the
 * visited entries are refreshed and, if any of them changed, RTAX_MTU is
 * recomputed for the affected entries.
 */

static int xfrm_bundle_ok(struct xfrm_dst *first)
{
        struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
        struct dst_entry *dst = &first->u.dst;
        struct xfrm_dst *xdst;
        int start_from, nr;
        u32 mtu;

        /* The path dst must still pass dst_check(), and the device (if
         * one is set) must be running.
         */
        if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) ||
            (dst->dev && !netif_running(dst->dev)))
                return 0;

        /* Bundles flagged DST_XFRM_QUEUE are accepted without walking the chain. */
        if (dst->flags & DST_XFRM_QUEUE)
                return 1;

        /* start_from stays 0 unless a cached MTU turned out to be outdated;
         * it then holds the 1-based position of the deepest such entry.
         */
        start_from = nr = 0;
        do {
                /* shadows the function-scope xdst, which is only used after the loop */
                struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

                /* The state must be valid and neither the state nor the
                 * first policy may have changed generation since the
                 * values recorded in the xfrm_dst.
                 */
                if (dst->xfrm->km.state != XFRM_STATE_VALID)
                        return 0;
                if (xdst->xfrm_genid != dst->xfrm->genid)
                        return 0;
                if (xdst->num_pols > 0 &&
                    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
                        return 0;

                bundle[nr++] = xdst;

                mtu = dst_mtu(xfrm_dst_child(dst));
                if (xdst->child_mtu_cached != mtu) {
                        start_from = nr;
                        xdst->child_mtu_cached = mtu;
                }

                if (!dst_check(xdst->route, xdst->route_cookie))
                        return 0;
                mtu = dst_mtu(xdst->route);
                if (xdst->route_mtu_cached != mtu) {
                        start_from = nr;
                        xdst->route_mtu_cached = mtu;
                }

                dst = xfrm_dst_child(dst);
        } while (dst->xfrm);

        /* No cached MTU changed: nothing to recompute. */
        if (likely(!start_from))
                return 1;

        /* Recompute RTAX_MTU from bundle[start_from - 1] back to bundle[0].
         * Each result is stored as the child_mtu_cached of the entry that
         * precedes it in the array.
         */
        xdst = bundle[start_from - 1];
        mtu = xdst->child_mtu_cached;
        while (start_from--) {
                dst = &xdst->u.dst;

                mtu = xfrm_state_mtu(dst->xfrm, mtu);
                if (mtu > xdst->route_mtu_cached)
                        mtu = xdst->route_mtu_cached;
                dst_metric_set(dst, RTAX_MTU, mtu);
                if (!start_from)
                        break;

                xdst = bundle[start_from - 1];
                xdst->child_mtu_cached = mtu;
        }

        return 1;
}

/* dst_ops->default_advmss handler: report the advmss metric of the path dst of @dst. */
static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
{
        const struct dst_entry *path = xfrm_dst_path(dst);

        return dst_metric_advmss(path);
}

/*
 * dst_ops->mtu handler: return the raw RTAX_MTU metric of @dst when it is
 * nonzero, otherwise the MTU of the path dst.
 */
static unsigned int xfrm_mtu(const struct dst_entry *dst)
{
        unsigned int cached = dst_metric_raw(dst, RTAX_MTU);

        if (cached)
                return cached;

        return dst_mtu(xfrm_dst_path(dst));
}

/*
 * Work out the next-hop address for @dst.
 *
 * Walks the child chain while an xfrm state is attached.  Transport-mode
 * states leave the address alone.  For the others the address becomes the
 * state's care-of address when the type has XFRM_TYPE_REMOTE_COADDR set,
 * or the state's id.daddr unless XFRM_TYPE_LOCAL_COADDR is set.
 *
 * Returns the last address selected, or @daddr when none was.
 */
static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
                                        const void *daddr)
{
        const struct xfrm_state *x;

        for (x = dst->xfrm; x; x = dst->xfrm) {
                /* step to the child first; x still refers to the parent's state */
                dst = xfrm_dst_child(dst);

                if (x->props.mode == XFRM_MODE_TRANSPORT)
                        continue;

                if (x->type->flags & XFRM_TYPE_REMOTE_COADDR)
                        daddr = x->coaddr;
                else if (!(x->type->flags & XFRM_TYPE_LOCAL_COADDR))
                        daddr = &x->id.daddr;
        }

        return daddr;
}

/*
 * dst_ops->neigh_lookup handler: delegate to the neigh_lookup op of the
 * path dst.  When no @skb is supplied, @daddr is first translated with
 * xfrm_get_dst_nexthop(); with an @skb it is passed through unchanged.
 */
static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct dst_entry *path = xfrm_dst_path(dst);
        const void *nexthop = skb ? daddr : xfrm_get_dst_nexthop(dst, daddr);

        return path->ops->neigh_lookup(path, skb, nexthop);
}

/*
 * dst_ops->confirm_neigh handler: translate @daddr with
 * xfrm_get_dst_nexthop() and forward the confirmation to the path dst.
 */
static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct dst_entry *path = xfrm_dst_path(dst);
        const void *nexthop = xfrm_get_dst_nexthop(dst, daddr);

        path->ops->confirm_neigh(path, nexthop);
}

/*
 * Register @afinfo as the policy address-family backend for @family.
 *
 * Any dst_ops hook of @afinfo that is still NULL is filled in with the
 * xfrm default before the table entry is published with
 * rcu_assign_pointer().
 *
 * Returns 0 on success, -EAFNOSUPPORT (with a warning) when @family does
 * not fit the table, and -EEXIST when the slot is already taken.
 */
int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
{
        struct dst_ops *dst_ops;
        int err = -EEXIST;

        if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
                return -EAFNOSUPPORT;

        spin_lock(&xfrm_policy_afinfo_lock);
        if (unlikely(xfrm_policy_afinfo[family] != NULL))
                goto unlock;

        dst_ops = afinfo->dst_ops;
        if (likely(!dst_ops->kmem_cachep))
                dst_ops->kmem_cachep = xfrm_dst_cache;
        if (likely(!dst_ops->check))
                dst_ops->check = xfrm_dst_check;
        if (likely(!dst_ops->default_advmss))
                dst_ops->default_advmss = xfrm_default_advmss;
        if (likely(!dst_ops->mtu))
                dst_ops->mtu = xfrm_mtu;
        if (likely(!dst_ops->negative_advice))
                dst_ops->negative_advice = xfrm_negative_advice;
        if (likely(!dst_ops->link_failure))
                dst_ops->link_failure = xfrm_link_failure;
        if (likely(!dst_ops->neigh_lookup))
                dst_ops->neigh_lookup = xfrm_neigh_lookup;
        if (likely(!dst_ops->confirm_neigh))
                dst_ops->confirm_neigh = xfrm_confirm_neigh;

        rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
        err = 0;

unlock:
        spin_unlock(&xfrm_policy_afinfo_lock);
        return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);

/*
 * Remove @afinfo from the address-family table.
 *
 * The first table slot that points at @afinfo is cleared, an RCU grace
 * period is waited for, and then the kmem_cachep, check, negative_advice
 * and link_failure members of its dst_ops are reset to NULL.
 */
void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
{
        struct dst_ops *dst_ops = afinfo->dst_ops;
        int slot;

        for (slot = 0; slot < ARRAY_SIZE(xfrm_policy_afinfo); slot++) {
                if (xfrm_policy_afinfo[slot] == afinfo) {
                        RCU_INIT_POINTER(xfrm_policy_afinfo[slot], NULL);
                        break;
                }
        }

        synchronize_rcu();

        dst_ops->kmem_cachep = NULL;
        dst_ops->check = NULL;
        dst_ops->negative_advice = NULL;
        dst_ops->link_failure = NULL;
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);

/* Publish @ifcb as the global xfrm interface callback table.
 * The pointer is stored with rcu_assign_pointer() while holding
 * xfrm_if_cb_lock.
 */
void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
{
        spin_lock(&xfrm_if_cb_lock);
        rcu_assign_pointer(xfrm_if_cb, ifcb);
        spin_unlock(&xfrm_if_cb_lock);
}
EXPORT_SYMBOL(xfrm_if_register_cb);

/* Clear the global xfrm interface callback pointer and wait for an RCU
 * grace period before returning.  xfrm_if_cb_lock is not taken here.
 */
void xfrm_if_unregister_cb(void)
{
        RCU_INIT_POINTER(xfrm_if_cb, NULL);
        synchronize_rcu();
}
EXPORT_SYMBOL(xfrm_if_unregister_cb);

#ifdef CONFIG_XFRM_STATISTICS
/*
 * Allocate the per-cpu xfrm MIB of @net and call xfrm_proc_init().
 *
 * Returns -ENOMEM when the allocation fails; otherwise the result of
 * xfrm_proc_init().  The counters are freed again when that result is
 * negative.
 */
static int __net_init xfrm_statistics_init(struct net *net)
{
        int rv;

        net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib);
        if (!net->mib.xfrm_statistics)
                return -ENOMEM;

        rv = xfrm_proc_init(net);
        if (rv >= 0)
                return rv;

        free_percpu(net->mib.xfrm_statistics);
        return rv;
}

/* Undo xfrm_statistics_init(): call xfrm_proc_fini(), then free the counters. */
static void xfrm_statistics_fini(struct net *net)
{
        xfrm_proc_fini(net);
        free_percpu(net->mib.xfrm_statistics);
}
#else
/* CONFIG_XFRM_STATISTICS is off: nothing to set up, always succeeds. */
static int __net_init xfrm_statistics_init(struct net *net)
{
        return 0;
}

/* CONFIG_XFRM_STATISTICS is off: nothing to tear down. */
static void xfrm_statistics_fini(struct net *net)
{
}
#endif

/*
 * Set up the policy database of @net.
 *
 * Allocates the by-index hash and one by-destination hash per direction
 * (eight buckets each), resets the policy counters, sets the hash
 * thresholds to full prefix lengths (32 for IPv4, 128 for IPv6) and
 * initialises the lists and work items.  While the initial namespace is
 * being set up, the global dst cache and the inexact-policy rhashtable
 * are created as well.
 *
 * Returns 0 on success or -ENOMEM when a hash table cannot be allocated;
 * tables allocated up to that point are freed again.
 */
static int __net_init xfrm_policy_init(struct net *net)
{
        const unsigned int hmask = 8 - 1;
        const unsigned int sz = (hmask + 1) * sizeof(struct hlist_head);
        struct xfrm_policy_hash *htab;
        int dir;

        if (net_eq(net, &init_net)) {
                int err;

                xfrm_dst_cache = KMEM_CACHE(xfrm_dst, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
                err = rhashtable_init(&xfrm_policy_inexact_table,
                                      &xfrm_pol_inexact_params);
                BUG_ON(err);
        }

        net->xfrm.policy_byidx = xfrm_hash_alloc(sz);
        if (!net->xfrm.policy_byidx)
                return -ENOMEM;
        net->xfrm.policy_idx_hmask = hmask;

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                net->xfrm.policy_count[dir] = 0;
                net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;

                htab = &net->xfrm.policy_bydst[dir];
                htab->table = xfrm_hash_alloc(sz);
                if (!htab->table)
                        goto free_tables;

                htab->hmask = hmask;
                htab->dbits4 = 32;
                htab->sbits4 = 32;
                htab->dbits6 = 128;
                htab->sbits6 = 128;
        }

        net->xfrm.policy_hthresh.lbits4 = 32;
        net->xfrm.policy_hthresh.rbits4 = 32;
        net->xfrm.policy_hthresh.lbits6 = 128;
        net->xfrm.policy_hthresh.rbits6 = 128;

        seqlock_init(&net->xfrm.policy_hthresh.lock);

        INIT_LIST_HEAD(&net->xfrm.policy_all);
        INIT_LIST_HEAD(&net->xfrm.inexact_bins);
        INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
        INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
        return 0;

free_tables:
        /* only the directions before the failing one own a table */
        while (--dir >= 0)
                xfrm_hash_free(net->xfrm.policy_bydst[dir].table, sz);
        xfrm_hash_free(net->xfrm.policy_byidx, sz);
        return -ENOMEM;
}

/*
 * Tear down the policy database of @net.
 *
 * Waits for the pending hash-resize work, flushes all policies (sub
 * policies first when they are configured), frees the per-direction and
 * by-index hash tables and finally prunes every inexact bin under the
 * policy lock.  A warning is emitted if a list or table is unexpectedly
 * non-empty at the point it is checked.
 */
static void xfrm_policy_fini(struct net *net)
{
        struct xfrm_pol_inexact_bin *bin, *next;
        struct xfrm_policy_hash *htab;
        int dir;

        flush_work(&net->xfrm.policy_hash_work);
#ifdef CONFIG_XFRM_SUB_POLICY
        xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false);
#endif
        xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false);

        WARN_ON(!list_empty(&net->xfrm.policy_all));

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                htab = &net->xfrm.policy_bydst[dir];
                WARN_ON(!hlist_empty(htab->table));
                xfrm_hash_free(htab->table,
                               (htab->hmask + 1) * sizeof(struct hlist_head));
        }

        WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
        xfrm_hash_free(net->xfrm.policy_byidx,
                       (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head));

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        list_for_each_entry_safe(bin, next, &net->xfrm.inexact_bins, inexact_bins)
                __xfrm_policy_inexact_prune_bin(bin, true);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}

/* Per-namespace initialisation of the xfrm subsystem for @net.
 *
 * Initialises the locks, sets the default policy of every direction to
 * XFRM_USERPOLICY_ACCEPT and then brings up statistics, state, policy,
 * sysctl and NAT keepalive support, in that order.
 *
 * Returns 0 on success.  On failure the negative value of the failing
 * step is returned after the steps that already succeeded have been
 * undone in reverse order.
 */
static int __net_init xfrm_net_init(struct net *net)
{
        int rv;

        /* Initialize the per-net locks here */
        spin_lock_init(&net->xfrm.xfrm_state_lock);
        spin_lock_init(&net->xfrm.xfrm_policy_lock);
        seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock);
        mutex_init(&net->xfrm.xfrm_cfg_mutex);
        /* Every direction accepts traffic by default. */
        net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT;
        net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT;
        net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT;

        rv = xfrm_statistics_init(net);
        if (rv < 0)
                goto out_statistics;
        rv = xfrm_state_init(net);
        if (rv < 0)
                goto out_state;
        rv = xfrm_policy_init(net);
        if (rv < 0)
                goto out_policy;
        rv = xfrm_sysctl_init(net);
        if (rv < 0)
                goto out_sysctl;

        rv = xfrm_nat_keepalive_net_init(net);
        if (rv < 0)
                goto out_nat_keepalive;

        return 0;

        /* Each label undoes the steps that completed before the failing one. */
out_nat_keepalive:
        xfrm_sysctl_fini(net);
out_sysctl:
        xfrm_policy_fini(net);
out_policy:
        xfrm_state_fini(net);
out_state:
        xfrm_statistics_fini(net);
out_statistics:
        return rv;
}

/* Per-namespace teardown for @net: undoes the steps of xfrm_net_init() in
 * reverse order (NAT keepalive, sysctl, policy, state, statistics).
 */
static void __net_exit xfrm_net_exit(struct net *net)
{
        xfrm_nat_keepalive_net_fini(net);
        xfrm_sysctl_fini(net);
        xfrm_policy_fini(net);
        xfrm_state_fini(net);
        xfrm_statistics_fini(net);
}

/* Per-network-namespace init/exit hooks; registered from xfrm_init() with
 * register_pernet_subsys().
 */
static struct pernet_operations __net_initdata xfrm_net_ops = {
        .init = xfrm_net_init,
        .exit = xfrm_net_exit,
};

/* Keys handed to skb_flow_dissector_init() for xfrm_session_dissector.
 * Each entry pairs a dissector key id with the offset of the matching
 * member inside struct xfrm_flow_keys.
 */
static const struct flow_dissector_key xfrm_flow_dissector_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct xfrm_flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct xfrm_flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct xfrm_flow_keys, addrs.ipv4),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct xfrm_flow_keys, addrs.ipv6),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct xfrm_flow_keys, ports),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
                .offset = offsetof(struct xfrm_flow_keys, gre),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IP,
                .offset = offsetof(struct xfrm_flow_keys, ip),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_ICMP,
                .offset = offsetof(struct xfrm_flow_keys, icmp),
        },
};

/* Boot-time initialisation of the xfrm subsystem.
 *
 * Sets up the session flow dissector, registers the per-namespace
 * operations and then initialises the device, input, (optionally)
 * ESP-in-TCP, BPF state and NAT keepalive parts.  None of the return
 * values of these calls is checked here.
 */
void __init xfrm_init(void)
{
        skb_flow_dissector_init(&xfrm_session_dissector,
                                xfrm_flow_dissector_keys,
                                ARRAY_SIZE(xfrm_flow_dissector_keys));

        register_pernet_subsys(&xfrm_net_ops);
        xfrm_dev_init();
        xfrm_input_init();

#ifdef CONFIG_XFRM_ESPINTCP
        espintcp_init();
#endif

        register_xfrm_state_bpf();
        xfrm_nat_keepalive_init(AF_INET);
}

#ifdef CONFIG_AUDITSYSCALL
static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
                                         struct audit_buffer *audit_buf)
{
        struct xfrm_sec_ctx *ctx = xp->security;
        struct xfrm_selector *sel = &xp->selector;

        if (ctx)
                audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
                                 ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);

        switch (sel->family) {
        case AF_INET:
                audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4);
                if (sel->prefixlen_s != 32)
                        audit_log_format(audit_buf, " src_prefixlen=%d",
                                         sel->prefixlen_s);
                audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4);
                if (sel->prefixlen_d != 32)
                        audit_log_format(audit_buf, " dst_prefixlen=%d",
                                         sel->prefixlen_d);
                break;
        case AF_INET6:
                audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
                if (sel->prefixlen_s != 128)
                        audit_log_format(audit_buf, " src_prefixlen=%d",
                                         sel->prefixlen_s);
                audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
                if (sel->prefixlen_d != 128)
                        audit_log_format(audit_buf, " dst_prefixlen=%d",
                                         sel->prefixlen_d);
                break;
        }
}

/*
 * Emit an "SPD-add" audit record for @xp carrying @result.
 * Nothing is logged when no audit buffer can be started.
 */
void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid)
{
        struct audit_buffer *ab = xfrm_audit_start("SPD-add");

        if (!ab)
                return;

        xfrm_audit_helper_usrinfo(task_valid, ab);
        audit_log_format(ab, " res=%u", result);
        xfrm_audit_common_policyinfo(xp, ab);
        audit_log_end(ab);
}
EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);

/*
 * Emit an "SPD-delete" audit record for @xp carrying @result.
 * Nothing is logged when no audit buffer can be started.
 */
void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                              bool task_valid)
{
        struct audit_buffer *ab = xfrm_audit_start("SPD-delete");

        if (!ab)
                return;

        xfrm_audit_helper_usrinfo(task_valid, ab);
        audit_log_format(ab, " res=%u", result);
        xfrm_audit_common_policyinfo(xp, ab);
        audit_log_end(ab);
}
EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
#endif

#ifdef CONFIG_XFRM_MIGRATE
/*
 * Find the policy a MIGRATE request refers to.
 *
 * A flow is built from selector @sel: protocol and addresses always, and
 * additionally the interface index and ports unless the selector protocol
 * is IPSEC_ULPROTO_ANY.  The flow is then looked up with
 * xfrm_policy_lookup_bytype() under rcu_read_lock().
 *
 * Returns the policy with a reference taken by xfrm_pol_hold_rcu(), NULL
 * when nothing matches or the reference cannot be taken, the error
 * pointer returned by the lookup, or ERR_PTR(-EAFNOSUPPORT) when the
 * selector family is neither AF_INET nor AF_INET6.
 */
static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
                                                    u8 dir, u8 type, struct net *net, u32 if_id)
{
        struct xfrm_policy *pol;
        struct flowi fl;

        memset(&fl, 0, sizeof(fl));

        fl.flowi_proto = sel->proto;

        switch (sel->family) {
        case AF_INET:
                fl.u.ip4.saddr = sel->saddr.a4;
                fl.u.ip4.daddr = sel->daddr.a4;
                if (sel->proto == IPSEC_ULPROTO_ANY)
                        break;
                fl.u.flowi4_oif = sel->ifindex;
                fl.u.ip4.fl4_sport = sel->sport;
                fl.u.ip4.fl4_dport = sel->dport;
                break;
        case AF_INET6:
                fl.u.ip6.saddr = sel->saddr.in6;
                fl.u.ip6.daddr = sel->daddr.in6;
                if (sel->proto == IPSEC_ULPROTO_ANY)
                        break;
                fl.u.flowi6_oif = sel->ifindex;
                /* use the flowi6 accessor names on the IPv6 flow */
                fl.u.ip6.fl6_sport = sel->sport;
                fl.u.ip6.fl6_dport = sel->dport;
                break;
        default:
                return ERR_PTR(-EAFNOSUPPORT);
        }

        rcu_read_lock();

        pol = xfrm_policy_lookup_bytype(net, type, &fl, sel->family, dir, if_id);
        if (IS_ERR_OR_NULL(pol))
                goto out_unlock;

        /* the policy may be going away; treat a failed hold as "not found" */
        if (!xfrm_pol_hold_rcu(pol))
                pol = NULL;
out_unlock:
        rcu_read_unlock();
        return pol;
}

/*
 * Does template @t correspond to migrate entry @m?
 *
 * Mode and protocol must be equal, and the reqid must be equal unless the
 * entry's reqid is 0.  For tunnel, BEET and IPTFS templates both old
 * endpoint addresses must match as well; transport templates store no
 * addresses, so mode and protocol decide alone.  Any other mode never
 * matches.
 *
 * Returns 1 on a match and 0 otherwise.
 */
static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
{
        if (t->mode != m->mode || t->id.proto != m->proto)
                return 0;
        if (m->reqid != 0 && t->reqid != m->reqid)
                return 0;

        switch (t->mode) {
        case XFRM_MODE_TUNNEL:
        case XFRM_MODE_BEET:
        case XFRM_MODE_IPTFS:
                return xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
                                       m->old_family) &&
                       xfrm_addr_equal(&t->saddr, &m->old_saddr,
                                       m->old_family);
        case XFRM_MODE_TRANSPORT:
                return 1;
        default:
                return 0;
        }
}

/* update endpoint address(es) of template(s) */
static int xfrm_policy_migrate(struct xfrm_policy *pol,
                               struct xfrm_migrate *m, int num_migrate,
                               struct netlink_ext_ack *extack)
{
        struct xfrm_migrate *mp;
        int i, j, n = 0;

        write_lock_bh(&pol->lock);
        if (unlikely(pol->walk.dead)) {
                /* target policy has been deleted */
                NL_SET_ERR_MSG(extack, "Target policy not found");
                write_unlock_bh(&pol->lock);
                return -ENOENT;
        }

        for (i = 0; i < pol->xfrm_nr; i++) {
                for (j = 0, mp = m; j < num_migrate; j++, mp++) {
                        if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i]))
                                continue;
                        n++;
                        if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
                            pol->xfrm_vec[i].mode != XFRM_MODE_BEET &&
                            pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS)
                                continue;
                        /* update endpoints */
                        memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
                               sizeof(pol->xfrm_vec[i].id.daddr));
                        memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr,
                               sizeof(pol->xfrm_vec[i].saddr));
                        pol->xfrm_vec[i].encap_family = mp->new_family;
                        /* flush bundles */
                        atomic_inc(&pol->genid);
                }
        }

        write_unlock_bh(&pol->lock);

        if (!n)
                return -ENODATA;

        return 0;
}

/*
 * Sanity-check a MIGRATE request.
 *
 * @num_migrate must lie in 1..XFRM_MAX_DEPTH, no entry may carry an
 * all-zero new source or destination address, and no two entries may
 * agree in old addresses, protocol, mode, reqid and old family.
 *
 * Returns 0 when the list is acceptable, otherwise -EINVAL with an
 * explanatory extack message.
 */
static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate,
                              struct netlink_ext_ack *extack)
{
        int i, j;

        if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) {
                NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)");
                return -EINVAL;
        }

        for (i = 0; i < num_migrate; i++) {
                const struct xfrm_migrate *cur = &m[i];

                if (xfrm_addr_any(&cur->new_daddr, cur->new_family) ||
                    xfrm_addr_any(&cur->new_saddr, cur->new_family)) {
                        NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null");
                        return -EINVAL;
                }

                /* reject the list if a later entry duplicates this one */
                for (j = i + 1; j < num_migrate; j++) {
                        const struct xfrm_migrate *other = &m[j];

                        if (memcmp(&cur->old_daddr, &other->old_daddr,
                                   sizeof(cur->old_daddr)) ||
                            memcmp(&cur->old_saddr, &other->old_saddr,
                                   sizeof(cur->old_saddr)) ||
                            cur->proto != other->proto ||
                            cur->mode != other->mode ||
                            cur->reqid != other->reqid ||
                            cur->old_family != other->old_family)
                                continue;

                        NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique");
                        return -EINVAL;
                }
        }

        return 0;
}

/* Migrate the endpoints of a policy and of the states that belong to it.
 *
 * @sel, @dir, @type and @if_id identify the policy; @m holds @num_migrate
 * old/new endpoint descriptions.  @k and @encap are passed on to
 * km_migrate(), @encap and @xuo to xfrm_state_migrate().
 *
 * Returns 0 on success.  Otherwise returns the negative error from the
 * sanity check, -EINVAL for an invalid direction, -ENOENT or the lookup
 * error when no policy is found, -ENODATA when a state cannot be
 * migrated, or the error from xfrm_policy_migrate().
 */
int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                 struct xfrm_migrate *m, int num_migrate,
                 struct xfrm_kmaddress *k, struct net *net,
                 struct xfrm_encap_tmpl *encap, u32 if_id,
                 struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo)
{
        int i, err, nx_cur = 0, nx_new = 0;
        struct xfrm_policy *pol = NULL;
        struct xfrm_state *x, *xc;
        /* x_cur collects the states found for migration, x_new their
         * migrated replacements; both hold at most XFRM_MAX_DEPTH
         * entries, which xfrm_migrate_check() enforces on num_migrate.
         */
        struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
        struct xfrm_state *x_new[XFRM_MAX_DEPTH];
        struct xfrm_migrate *mp;

        /* Stage 0 - sanity checks */
        err = xfrm_migrate_check(m, num_migrate, extack);
        if (err < 0)
                goto out;

        if (dir >= XFRM_POLICY_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid policy direction");
                err = -EINVAL;
                goto out;
        }

        /* Stage 1 - find policy */
        pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id);
        if (IS_ERR_OR_NULL(pol)) {
                NL_SET_ERR_MSG(extack, "Target policy not found");
                err = IS_ERR(pol) ? PTR_ERR(pol) : -ENOENT;
                goto out;
        }

        /* Stage 2 - find and update state(s) */
        for (i = 0, mp = m; i < num_migrate; i++, mp++) {
                /* entries without a matching state are skipped */
                if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
                        x_cur[nx_cur] = x;
                        nx_cur++;
                        xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack);
                        if (xc) {
                                x_new[nx_new] = xc;
                                nx_new++;
                        } else {
                                err = -ENODATA;
                                goto restore_state;
                        }
                }
        }

        /* Stage 3 - update policy */
        err = xfrm_policy_migrate(pol, m, num_migrate, extack);
        if (err < 0)
                goto restore_state;

        /* Stage 4 - delete old state(s) */
        if (nx_cur) {
                xfrm_states_put(x_cur, nx_cur);
                xfrm_states_delete(x_cur, nx_cur);
        }

        /* Stage 5 - announce */
        km_migrate(sel, dir, type, m, num_migrate, k, encap);

        xfrm_pol_put(pol);

        return 0;
out:
        /* nothing has been acquired yet on this path */
        return err;

restore_state:
        /* Roll back: drop the policy reference, release the states that
         * were looked up and delete the replacements created so far.
         */
        if (pol)
                xfrm_pol_put(pol);
        if (nx_cur)
                xfrm_states_put(x_cur, nx_cur);
        if (nx_new)
                xfrm_states_delete(x_new, nx_new);

        return err;
}
EXPORT_SYMBOL(xfrm_migrate);
#endif

















































































  313 
  312 











































  314 

  316 

  317 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
 */

#include <linux/capability.h>
#include <linux/audit.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/ptrace.h>
#include <linux/xattr.h>
#include <linux/hugetlb.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/securebits.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/personality.h>
#include <linux/mnt_idmapping.h>
#include <uapi/linux/lsm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/capability.h>

/*
 * If a non-root user executes a setuid-root binary in
 * !secure(SECURE_NOROOT) mode, then we raise capabilities.
 * However if fE is also set, then the intent is for only
 * the file capabilities to be applied, and the setuid-root
 * bit is left on either to change the uid (plausible) or
 * to get full privilege on a kernel without file capabilities
 * support.  So in that case we do not raise capabilities.
 *
 * Warn if that happens, once per boot.
 */
static void warn_setuid_and_fcaps_mixed(const char *fname)
{
        /* Latched after the first report so the message is printed only once. */
        static int warned;

        if (warned)
                return;

        printk(KERN_INFO "warning: `%s' has both setuid-root and"
                " effective capabilities. Therefore not raising all"
                " capabilities.\n", fname);
        warned = 1;
}

/**
 * cap_capable_helper - Test credentials for an effective capability
 * @cred: The credentials to use
 * @target_ns:  The user namespace of the resource being accessed
 * @cred_ns:  The user namespace of the credentials
 * @cap: The capability to check for
 *
 * Starting at @target_ns and moving up through its parents, look for the
 * namespace the credentials belong to and decide whether @cred holds @cap
 * there.  Returns 0 if it does, -EPERM if it does not.
 *
 * See cap_capable for more details.
 */
static inline int cap_capable_helper(const struct cred *cred,
                                     struct user_namespace *target_ns,
                                     const struct user_namespace *cred_ns,
                                     int cap)
{
        struct user_namespace *cur;

        /*
         * Each pass looks at one namespace; moving on to the parent reflects
         * that a capability held in a parent user ns also applies to all of
         * its children.
         */
        for (cur = target_ns; ; cur = cur->parent) {
                /* Reached the credentials' own namespace: consult pE. */
                if (likely(cur == cred_ns)) {
                        if (cap_raised(cred->cap_effective, cap))
                                return 0;
                        return -EPERM;
                }

                /*
                 * Already at or above the level of the credentials'
                 * namespace without meeting it: nothing further up can
                 * match, so stop searching.
                 */
                if (cur->level <= cred_ns->level)
                        return -EPERM;

                /*
                 * The owner of a user namespace, acting from that
                 * namespace's parent, has all caps in it.
                 */
                if (cur->parent == cred_ns && uid_eq(cur->owner, cred->euid))
                        return 0;
        }

        /* The loop only exits via return. */
}

/**
 * cap_capable - Determine whether a task has a particular effective capability
 * @cred: The credentials to use
 * @target_ns:  The user namespace of the resource being accessed
 * @cap: The capability to check for
 * @opts: Bitmask of options defined in include/linux/security.h (unused)
 *
 * Determine whether the nominated task has the specified capability amongst
 * its effective set, returning 0 if it does, -ve if it does not.
 *
 * NOTE WELL: cap_capable() has reverse semantics to the capable() call
 * and friends. That is cap_capable() returns an int 0 when a task has
 * a capability, while the kernel's capable(), has_ns_capability(),
 * has_ns_capability_noaudit(), and has_capability_noaudit() return a
 * bool true (1) for this case.
 */
int cap_capable(const struct cred *cred, struct user_namespace *target_ns,
                int cap, unsigned int opts)
{
        const struct user_namespace *cred_ns;
        int rc;

        cred_ns = cred->user_ns;
        rc = cap_capable_helper(cred, target_ns, cred_ns, cap);

        /* Hand the outcome, granted or denied, to the tracepoint. */
        trace_cap_capable(cred, target_ns, cred_ns, cap, rc);

        return rc;
}

/**
 * cap_settime - Determine whether the current process may set the system clock
 * @ts: The time to set
 * @tz: The timezone to set
 *
 * Determine whether the current process may set the system clock and timezone
 * information, returning 0 if permission granted, -ve if denied.
 */
int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
{
        /* Neither @ts nor @tz is inspected here; only CAP_SYS_TIME counts. */
        return capable(CAP_SYS_TIME) ? 0 : -EPERM;
}

/**
 * cap_ptrace_access_check - Determine whether the current process may access
 *                           another
 * @child: The process to be accessed
 * @mode: The mode of attachment.
 *
 * If we are in the same or an ancestor user_ns and have all the target
 * task's capabilities, then ptrace access is allowed.
 * If we have the ptrace capability to the target user_ns, then ptrace
 * access is allowed.
 * Else denied.
 *
 * Determine whether a process may access another, returning 0 if permission
 * granted, -ve if denied.
 */
int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        int ret = 0;
        const struct cred *cred, *child_cred;
        const kernel_cap_t *caller_caps;

        rcu_read_lock();
        cred = current_cred();
        child_cred = __task_cred(child);
        if (mode & PTRACE_MODE_FSCREDS)
                caller_caps = &cred->cap_effective;
        else
                caller_caps = &cred->cap_permitted;
        if (cred->user_ns == child_cred->user_ns &&
            cap_issubset(child_cred->cap_permitted, *caller_caps))
                goto out;
        if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
                goto out;
        ret = -EPERM;
out:
        rcu_read_unlock();
        return ret;
}

/**
 * cap_ptrace_traceme - Determine whether another process may trace the current
 * @parent: The task proposed to be the tracer
 *
 * If parent is in the same or an ancestor user_ns and has all current's
 * capabilities, then ptrace access is allowed.
 * If parent has the ptrace capability to current's user_ns, then ptrace
 * access is allowed.
 * Else denied.
 *
 * Determine whether the nominated task is permitted to trace the current
 * process, returning 0 if permission is granted, -ve if denied.
 */
int cap_ptrace_traceme(struct task_struct *parent)
{
        int ret = 0;
        const struct cred *cred, *child_cred;

        rcu_read_lock();
        cred = __task_cred(parent);
        child_cred = current_cred();
        if (cred->user_ns == child_cred->user_ns &&
            cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
                goto out;
        if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
                goto out;
        ret = -EPERM;
out:
        rcu_read_unlock();
        return ret;
}

/**
 * cap_capget - Retrieve a task's capability sets
 * @target: The task from which to retrieve the capability sets
 * @effective: The place to record the effective set
 * @inheritable: The place to record the inheritable set
 * @permitted: The place to record the permitted set
 *
 * This function retrieves the capabilities of the nominated task and returns
 * them to the caller.
 */
int cap_capget(const struct task_struct *target, kernel_cap_t *effective,
               kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        const struct cred *tcred;

        /* Derived from kernel/capability.c:sys_capget. */
        rcu_read_lock();

        /* Copy all three sets out while the RCU read lock is held. */
        tcred = __task_cred(target);
        *permitted   = tcred->cap_permitted;
        *inheritable = tcred->cap_inheritable;
        *effective   = tcred->cap_effective;

        rcu_read_unlock();

        /* This hook always succeeds. */
        return 0;
}

/*
 * Determine whether the inheritable capabilities are limited to the old
 * permitted set.  Returns 1 if they are limited, 0 if they are not.
 */
static inline int cap_inh_is_capped(void)
{
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
        if (cap_capable(current_cred(), current_cred()->user_ns,
                        CAP_SETPCAP, CAP_OPT_NONE) == 0)
                return 0;
        return 1;
}

/**
 * cap_capset - Validate and apply proposed changes to current's capabilities
 * @new: The proposed new credentials; alterations should be made here
 * @old: The current task's current credentials
 * @effective: A pointer to the proposed new effective capabilities set
 * @inheritable: A pointer to the proposed new inheritable capabilities set
 * @permitted: A pointer to the proposed new permitted capabilities set
 *
 * This function validates and applies a proposed mass change to the current
 * process's capability sets.  The changes are made to the proposed new
 * credentials, and assuming no error, will be committed by the caller of LSM.
 */
int cap_capset(struct cred *new,
               const struct cred *old,
               const kernel_cap_t *effective,
               const kernel_cap_t *inheritable,
               const kernel_cap_t *permitted)
{
        kernel_cap_t bounded;

        /*
         * Unless CAP_SETPCAP is held, the new inheritable set may only be
         * drawn from the old inheritable and old permitted sets.
         */
        if (cap_inh_is_capped()) {
                const kernel_cap_t allowed = cap_combine(old->cap_inheritable,
                                                         old->cap_permitted);

                if (!cap_issubset(*inheritable, allowed)) {
                        /* incapable of using this inheritable set */
                        return -EPERM;
                }
        }

        /* no new pI capabilities outside bounding set */
        bounded = cap_combine(old->cap_inheritable, old->cap_bset);
        if (!cap_issubset(*inheritable, bounded)) {
                return -EPERM;
        }

        /* the new Permitted set may not exceed the old Permitted set */
        if (!cap_issubset(*permitted, old->cap_permitted)) {
                return -EPERM;
        }

        /* the new Effective set must lie within the new Permitted set */
        if (!cap_issubset(*effective, *permitted)) {
                return -EPERM;
        }

        /* All checks passed: record the requested sets in @new. */
        new->cap_permitted   = *permitted;
        new->cap_inheritable = *inheritable;
        new->cap_effective   = *effective;

        /*
         * Ambient bits survive only where they are still both permitted
         * and inheritable.
         */
        new->cap_ambient = cap_intersect(new->cap_ambient,
                                         cap_intersect(*permitted,
                                                       *inheritable));

        if (WARN_ON(!cap_ambient_invariant_ok(new))) {
                return -EINVAL;
        }

        return 0;
}

/**
 * cap_inode_need_killpriv - Determine if inode change affects privileges
 * @dentry: The inode/dentry being changed with change marked ATTR_KILL_PRIV
 *
 * Determine whether an inode that is having a change applied that's marked
 * ATTR_KILL_PRIV carries the security.capability marking, and therefore
 * whether inode_killpriv() should be invoked or the change rejected.
 *
 * Return: 1 if security.capability has a value, meaning inode_killpriv()
 * is required, 0 otherwise, meaning inode_killpriv() is not required.
 */
int cap_inode_need_killpriv(struct dentry *dentry)
{
        /*
         * No buffer is supplied; only the sign of the result matters.
         * Errors and a zero result both count as "not required".
         */
        return __vfs_getxattr(dentry, d_backing_inode(dentry),
                              XATTR_NAME_CAPS, NULL, 0) > 0;
}

/**
 * cap_inode_killpriv - Erase the security markings on an inode
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        The inode/dentry to alter
 *
 * Erase the privilege-enhancing security markings on an inode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if successful, -ve on error.
 */
int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry)
{
        int rc = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS);

        /* -EOPNOTSUPP from the removal is reported to the caller as success. */
        return rc == -EOPNOTSUPP ? 0 : rc;
}

/*
 * Report whether @rootvfsuid maps to uid 0 in the current user namespace or
 * in any of its ancestors, up to and including init_user_ns.  An invalid
 * @rootvfsuid never matches.
 */
static bool rootid_owns_currentns(vfsuid_t rootvfsuid)
{
        struct user_namespace *cursor;
        kuid_t kroot;

        if (!vfsuid_valid(rootvfsuid))
                return false;

        kroot = vfsuid_into_kuid(rootvfsuid);
        cursor = current_user_ns();

        /* Climb towards init_user_ns until some namespace sees kroot as 0. */
        while (from_kuid(cursor, kroot) != 0) {
                if (cursor == &init_user_ns)
                        return false;
                cursor = cursor->parent;
        }

        return true;
}

/* Return @m with VFS_CAP_FLAGS_EFFECTIVE cleared; other bits are untouched. */
static __u32 sansflags(__u32 m)
{
        const __u32 keep = ~VFS_CAP_FLAGS_EFFECTIVE;

        return m & keep;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_2 and the magic (ignoring the
 * effective flag) is VFS_CAP_REVISION_2.
 */
static bool is_v2header(int size, const struct vfs_cap_data *cap)
{
        /* The size test comes first, so @cap is only read on a size match. */
        return size == XATTR_CAPS_SZ_2 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_3 and the magic (ignoring the
 * effective flag) is VFS_CAP_REVISION_3.
 */
static bool is_v3header(int size, const struct vfs_cap_data *cap)
{
        /* The size test comes first, so @cap is only read on a size match. */
        return size == XATTR_CAPS_SZ_3 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}

/*
 * getsecurity: We are called for security.* before any attempt to read the
 * xattr from the inode itself.
 *
 * This gives us a chance to read the on-disk value and convert it.  If we
 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 *
 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 * by the integrity subsystem, which really wants the unconverted values -
 * so that's good.
 *
 * Only the name "capability" is handled; any other name yields -EOPNOTSUPP.
 * The stored xattr must have a v2 or v3 header, otherwise -EINVAL is
 * returned.  The root uid recorded in it (0 for v2) is mapped through the
 * filesystem's user namespace and @idmap, and then:
 *
 *  - if it maps to a valid, non-zero uid in the current user namespace, the
 *    result is presented in v3 form with that uid as rootid;
 *  - otherwise, if rootid_owns_currentns() accepts it, the result is
 *    presented in v2 form;
 *  - otherwise -EOVERFLOW is returned.
 *
 * On success the return value is the size of the presented form.  *@buffer
 * is only written when @alloc is true, and then points to a kmalloc'ed
 * object (either freshly allocated for a conversion, or the buffer obtained
 * from vfs_getxattr_alloc()).
 * NOTE(review): presumably the caller is responsible for freeing *@buffer -
 * confirm against the callers.
 */
int cap_inode_getsecurity(struct mnt_idmap *idmap,
                          struct inode *inode, const char *name, void **buffer,
                          bool alloc)
{
        int size;
        kuid_t kroot;
        vfsuid_t vfsroot;
        u32 nsmagic, magic;
        uid_t root, mappedroot;
        char *tmpbuf = NULL;
        struct vfs_cap_data *cap;
        struct vfs_ns_cap_data *nscap = NULL;
        struct dentry *dentry;
        struct user_namespace *fs_ns;

        if (strcmp(name, "capability") != 0)
                return -EOPNOTSUPP;

        dentry = d_find_any_alias(inode);
        if (!dentry)
                return -EINVAL;
        /* Read the raw xattr into tmpbuf; the dentry is not needed after. */
        size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf,
                                  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
        dput(dentry);
        /* gcc11 complains if we don't check for !tmpbuf */
        if (size < 0 || !tmpbuf)
                goto out_free;

        fs_ns = inode->i_sb->s_user_ns;
        cap = (struct vfs_cap_data *) tmpbuf;
        /* nscap stays NULL for v2 data; later code uses that to tell apart. */
        if (is_v2header(size, cap)) {
                root = 0;
        } else if (is_v3header(size, cap)) {
                nscap = (struct vfs_ns_cap_data *) tmpbuf;
                root = le32_to_cpu(nscap->rootid);
        } else {
                size = -EINVAL;
                goto out_free;
        }

        kroot = make_kuid(fs_ns, root);

        /* If this is an idmapped mount shift the kuid. */
        vfsroot = make_vfsuid(idmap, fs_ns, kroot);

        /* If the root kuid maps to a valid uid in current ns, then return
         * this as a nscap. */
        mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot));
        if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
                size = sizeof(struct vfs_ns_cap_data);
                if (alloc) {
                        if (!nscap) {
                                /* v2 -> v3 conversion */
                                nscap = kzalloc(size, GFP_ATOMIC);
                                if (!nscap) {
                                        size = -ENOMEM;
                                        goto out_free;
                                }
                                nsmagic = VFS_CAP_REVISION_3;
                                magic = le32_to_cpu(cap->magic_etc);
                                if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                                        nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
                                memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                                nscap->magic_etc = cpu_to_le32(nsmagic);
                        } else {
                                /*
                                 * use allocated v3 buffer: nscap aliases
                                 * tmpbuf, so clear tmpbuf to keep out_free
                                 * from freeing what is handed to the caller
                                 */
                                tmpbuf = NULL;
                        }
                        /* Present the rootid as seen from the current ns. */
                        nscap->rootid = cpu_to_le32(mappedroot);
                        *buffer = nscap;
                }
                goto out_free;
        }

        if (!rootid_owns_currentns(vfsroot)) {
                size = -EOVERFLOW;
                goto out_free;
        }

        /* This comes from a parent namespace.  Return as a v2 capability */
        size = sizeof(struct vfs_cap_data);
        if (alloc) {
                if (nscap) {
                        /* v3 -> v2 conversion */
                        cap = kzalloc(size, GFP_ATOMIC);
                        if (!cap) {
                                size = -ENOMEM;
                                goto out_free;
                        }
                        magic = VFS_CAP_REVISION_2;
                        nsmagic = le32_to_cpu(nscap->magic_etc);
                        if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
                                magic |= VFS_CAP_FLAGS_EFFECTIVE;
                        memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                        cap->magic_etc = cpu_to_le32(magic);
                } else {
                        /*
                         * use unconverted v2: cap aliases tmpbuf, so clear
                         * tmpbuf to keep out_free from freeing what is
                         * handed to the caller
                         */
                        tmpbuf = NULL;
                }
                *buffer = cap;
        }
out_free:
        /* tmpbuf is NULL here whenever its memory was passed on via *buffer. */
        kfree(tmpbuf);
        return size;
}

/**
 * rootid_from_xattr - translate root uid of vfs caps
 *
 * @value:        vfs caps value to read the root uid from
 * @size:        size of @value
 * @task_ns:        user namespace of the caller
 *
 * Return: the root uid of @value, interpreted in @task_ns, as a vfsuid.  The
 * root uid is the stored rootid when @size is XATTR_CAPS_SZ_3 and 0 for any
 * other size.
 */
static vfsuid_t rootid_from_xattr(const void *value, size_t size,
                                  struct user_namespace *task_ns)
{
        const struct vfs_ns_cap_data *nscap = value;
        uid_t rootid;

        /* Only a v3-sized value is read for a rootid; otherwise use uid 0. */
        rootid = (size == XATTR_CAPS_SZ_3) ? le32_to_cpu(nscap->rootid) : 0;

        return VFSUIDT_INIT(make_kuid(task_ns, rootid));
}

/* Accept @cap only if its size and magic match the v2 or the v3 layout. */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
        if (is_v2header(size, cap))
                return true;

        return is_v3header(size, cap);
}

/**
 * cap_convert_nscap - check vfs caps
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        used to retrieve inode to check permissions on
 * @ivalue:        vfs caps value which may be modified by this function
 * @size:        size of @ivalue
 *
 * User requested a write of security.capability.  If needed, update the
 * xattr to change from v2 to v3, or to fixup the v3 rootid.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: On success, return the new size; on error, return < 0.
 */
int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
                      const void **ivalue, size_t size)
{
        const struct vfs_cap_data *cap = *ivalue;
        struct inode *inode = d_backing_inode(dentry);
        struct user_namespace *task_ns = current_user_ns();
        struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
        struct vfs_ns_cap_data *nscap;
        size_t newsize = sizeof(struct vfs_ns_cap_data);
        __u32 nsmagic = VFS_CAP_REVISION_3;
        vfsuid_t vfsrootid;
        kuid_t rootid;
        uid_t nsrootid;

        /* Reject a missing value or one without a v2/v3 header. */
        if (!*ivalue || !validheader(size, cap))
                return -EINVAL;

        if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
                return -EPERM;

        /*
         * A v2 value written through a non-idmapped mount by a caller with
         * CAP_SETFCAP over the filesystem's user namespace is stored as is:
         * user is privileged, just write the v2.
         */
        if (size == XATTR_CAPS_SZ_2 && idmap == &nop_mnt_idmap &&
            ns_capable(fs_ns, CAP_SETFCAP))
                return size;

        /* Interpret the supplied rootid in the caller's user namespace... */
        vfsrootid = rootid_from_xattr(*ivalue, size, task_ns);
        if (!vfsuid_valid(vfsrootid))
                return -EINVAL;

        /* ...undo the mount's idmapping... */
        rootid = from_vfsuid(idmap, fs_ns, vfsrootid);
        if (!uid_valid(rootid))
                return -EINVAL;

        /* ...and express it as a uid in the filesystem's user namespace. */
        nsrootid = from_kuid(fs_ns, rootid);
        if (nsrootid == -1)
                return -EINVAL;

        nscap = kmalloc(newsize, GFP_ATOMIC);
        if (!nscap)
                return -ENOMEM;

        /* Build the v3 value, carrying over the effective flag and the data. */
        if (le32_to_cpu(cap->magic_etc) & VFS_CAP_FLAGS_EFFECTIVE)
                nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
        nscap->magic_etc = cpu_to_le32(nsmagic);
        memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
        nscap->rootid = cpu_to_le32(nsrootid);

        /* The caller's pointer is redirected to the newly allocated value. */
        *ivalue = nscap;
        return newsize;
}

/*
 * Calculate the new process capability sets from the capability sets attached
 * to a file.
 */
static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
                                          struct linux_binprm *bprm,
                                          bool *effective,
                                          bool *has_fcap)
{
        struct cred *new = bprm->cred;

        /* Both out-flags are only ever raised here, never cleared. */
        if (caps->magic_etc & VFS_CAP_REVISION_MASK)
                *has_fcap = true;

        if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
                *effective = true;

        /*
         * pP' = (X & fP) | (pI & fI)
         * The addition of pA' is handled later.
         */
        new->cap_permitted.val =
                (new->cap_inheritable.val & caps->inheritable.val) |
                (new->cap_bset.val & caps->permitted.val);

        /*
         * Only legacy apps (fE set), which have no internal support for
         * recognizing that they lack capabilities, get an error when some
         * "forced" (aka file-permitted) capabilities are missing.
         */
        if (!*effective)
                return 0;

        /* insufficient to execute correctly if any fP bit did not reach pP' */
        if (caps->permitted.val & ~new->cap_permitted.val)
                return -EPERM;

        return 0;
}

/**
 * get_vfs_caps_from_disk - retrieve vfs caps from disk
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        dentry from which @inode is retrieved
 * @cpu_caps:        vfs capabilities
 *
 * Extract the on-exec-apply capability sets for an executable file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * @cpu_caps is zeroed up front, before any early return.
 *
 * Return: 0 on success.  -ENODATA if the dentry has no backing inode, the
 * xattr is absent or unsupported, the root id does not map to a valid vfsuid,
 * or rootid_owns_currentns() rejects it.  -EINVAL if the xattr is shorter
 * than the magic word, has an unknown revision, or has the wrong size for its
 * revision.  Any other negative value from __vfs_getxattr() is passed through.
 */
int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
                           const struct dentry *dentry,
                           struct cpu_vfs_cap_data *cpu_caps)
{
        struct inode *inode = d_backing_inode(dentry);
        __u32 magic_etc;
        int size;
        /* caps and nscaps are two typed views of the same on-stack buffer. */
        struct vfs_ns_cap_data data, *nscaps = &data;
        struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
        kuid_t rootkuid;
        vfsuid_t rootvfsuid;
        struct user_namespace *fs_ns;

        memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

        if (!inode)
                return -ENODATA;

        /* User namespace recorded on the inode's superblock. */
        fs_ns = inode->i_sb->s_user_ns;
        size = __vfs_getxattr((struct dentry *)dentry, inode,
                              XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
        if (size == -ENODATA || size == -EOPNOTSUPP)
                /* no data, that's ok */
                return -ENODATA;

        if (size < 0)
                return size;

        /* The magic word must be fully present before it can be decoded. */
        if (size < sizeof(magic_etc))
                return -EINVAL;

        cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);

        /*
         * Default root id is uid 0 in the filesystem's user namespace;
         * only revision 3 replaces it with the rootid stored in the xattr.
         */
        rootkuid = make_kuid(fs_ns, 0);
        switch (magic_etc & VFS_CAP_REVISION_MASK) {
        case VFS_CAP_REVISION_1:
                if (size != XATTR_CAPS_SZ_1)
                        return -EINVAL;
                break;
        case VFS_CAP_REVISION_2:
                if (size != XATTR_CAPS_SZ_2)
                        return -EINVAL;
                break;
        case VFS_CAP_REVISION_3:
                if (size != XATTR_CAPS_SZ_3)
                        return -EINVAL;
                rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
                break;

        default:
                /* Unknown (or zero) revision bits. */
                return -EINVAL;
        }

        /* Translate the root id through the mount's idmap. */
        rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
        if (!vfsuid_valid(rootvfsuid))
                return -ENODATA;

        /* Limit the caps to the mounter of the filesystem
         * or the more limited uid specified in the xattr.
         */
        if (!rootid_owns_currentns(rootvfsuid))
                return -ENODATA;

        /* Low 32 bits of each set come from data[0]. */
        cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
        cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);

        /*
         * Rev1 had just a single 32-bit word, later expanded
         * to a second one for the high bits
         */
        if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
                cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
                cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
        }

        /* Discard any bits outside CAP_VALID_MASK. */
        cpu_caps->permitted.val &= CAP_VALID_MASK;
        cpu_caps->inheritable.val &= CAP_VALID_MASK;

        cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid);

        return 0;
}

/*
 * Read the on-exec capability sets from the executable's xattrs and, when
 * they exist, apply them to the credentials execve() is constructing.
 *
 * The proposed permitted set is cleared before anything else.  File caps
 * are ignored (return 0) when they are disabled, when the mount may not
 * grant suid-style privilege, or when the caller is outside the
 * superblock's user namespace.  A file without capability data is not an
 * error.  On any error the proposed permitted set is cleared again and the
 * negative errno is returned.
 */
static int get_file_caps(struct linux_binprm *bprm, const struct file *file,
                         bool *effective, bool *has_fcap)
{
        struct cpu_vfs_cap_data vcaps;
        int rc;

        cap_clear(bprm->cred->cap_permitted);

        if (!file_caps_enabled || !mnt_may_suid(file->f_path.mnt))
                return 0;

        /*
         * Redundant with mnt_may_suid(), but kept so it stays explicit that
         * capability bits are limited to s_user_ns and its descendants.
         */
        if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
                return 0;

        rc = get_vfs_caps_from_disk(file_mnt_idmap(file),
                                    file->f_path.dentry, &vcaps);

        /* No capability data on the file: nothing to apply. */
        if (rc == -ENODATA)
                return 0;

        if (rc < 0) {
                if (rc == -EINVAL)
                        printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
                                        bprm->filename);
                cap_clear(bprm->cred->cap_permitted);
                return rc;
        }

        rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);
        if (rc)
                cap_clear(bprm->cred->cap_permitted);

        return rc;
}

/* Root keeps its traditional privilege unless SECURE_NOROOT is set. */
static inline bool root_privileged(void)
{
        return !issecure(SECURE_NOROOT);
}

/* Is @uid the real uid recorded in @cred? */
static inline bool __is_real(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->uid, uid);
}

/* Is @uid the effective uid recorded in @cred? */
static inline bool __is_eff(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->euid, uid);
}

/* Set-uid to @uid: @uid is the effective uid of @cred but not its real uid. */
static inline bool __is_suid(kuid_t uid, struct cred *cred)
{
        if (__is_real(uid, cred))
                return false;

        return __is_eff(uid, cred);
}

/*
 * handle_privileged_root - Handle case of privileged root
 * @bprm: The execution parameters, including the proposed creds
 * @has_fcap: Are any file capabilities set?
 * @effective: Do we have effective root privilege?
 * @root_uid: This namespace's root UID WRT initial USER namespace
 *
 * Applies only while root is privileged, i.e. SECURE_NOROOT is not set.
 * A set-UID-root binary that also carries file capabilities is left alone
 * (after a warning): the two mechanisms are not combined.  Otherwise, if the
 * new real or effective UID is root, cap_permitted is rebuilt from the old
 * bounding and inheritable sets, and if the new effective UID is root the
 * effective bit is set.
 */
static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
                                   bool *effective, kuid_t root_uid)
{
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool euid_is_root, ruid_is_root;

        if (!root_privileged())
                return;

        euid_is_root = __is_eff(root_uid, new);
        ruid_is_root = __is_real(root_uid, new);

        /*
         * If the legacy file capability is set, then don't set privs
         * for a setuid root binary run by a non-root user.  Do set it
         * for a root user just to cause least surprise to an admin.
         */
        if (has_fcap && euid_is_root && !ruid_is_root) {
                warn_setuid_and_fcaps_mixed(bprm->filename);
                return;
        }

        /*
         * To support inheritance of root-permissions and suid-root
         * executables under compatibility mode, we override the
         * capability sets for the file:
         *   pP' = (cap_bset & ~0) | (pI & ~0)
         */
        if (euid_is_root || ruid_is_root)
                new->cap_permitted = cap_combine(old->cap_bset,
                                                 old->cap_inheritable);

        /* If only the real uid is 0, we do not set the effective bit. */
        if (euid_is_root)
                *effective = true;
}

/*
 * Capability-set comparison helpers.  Each argument is pasted into a
 * cap_<name> member access, so pass bare set names (e.g. permitted) and
 * pointers to structures that have those members.
 *
 * __cap_gained(field, target, source): target's cap_<field> is not a subset
 *	of source's cap_<field>.
 * __cap_grew(target, source, cred): within one cred, cap_<target> is not a
 *	subset of cap_<source>.
 * __cap_full(field, cred): CAP_FULL_SET is a subset of cred's cap_<field>.
 */
#define __cap_gained(field, target, source) \
        !cap_issubset(target->cap_##field, source->cap_##field)
#define __cap_grew(target, source, cred) \
        !cap_issubset(cred->cap_##target, cred->cap_##source)
#define __cap_full(field, cred) \
        cap_issubset(CAP_FULL_SET, cred->cap_##field)

/*
 * Decide whether this exec is worth an audit record.
 *
 * 1) Audit candidate if the new cap_effective is not covered by cap_ambient.
 *
 * We do not bother to audit if 3 things are true:
 *   1) cap_effective has all caps
 *   2) we became root *OR* were already root
 *   3) root is supposed to have all caps (SECURE_NOROOT is not set)
 * Since this is just a normal root execing a process.
 *
 * Number 1 above might fail if you don't have a full bset, but I think
 * that is interesting information to audit.
 *
 * A number of other conditions require logging:
 * 2) something prevented setuid root getting all caps
 * 3) non-setuid root gets fcaps
 * 4) non-setuid root gets ambient
 */
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
                                     kuid_t root, bool has_fcap)
{
        /* 1) pE grew past pA, and this is not plain privileged root. */
        if (__cap_grew(effective, ambient, new) &&
            !(__cap_full(effective, new) &&
              (__is_eff(root, new) || __is_real(root, new)) &&
              root_privileged()))
                return true;

        /* 2) setuid root did not end up with every capability. */
        if (root_privileged() &&
            __is_suid(root, new) &&
            !__cap_full(effective, new))
                return true;

        /* 3) and 4) euid unchanged, yet fcaps or ambient raised a set. */
        if (uid_eq(new->euid, old->euid) &&
            ((has_fcap && __cap_gained(permitted, new, old)) ||
             __cap_gained(ambient, new, old)))
                return true;

        return false;
}

/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred is altered,
 * which won't take effect immediately.
 *
 * Besides the credentials, this may also add PER_CLEAR_ON_SETID to
 * @bprm->per_clear and set @bprm->secureexec.
 *
 * Return: 0 if successful, -ve on error.  -EPERM is returned if the ambient
 * invariant check fails on the old or new creds; errors from get_file_caps()
 * and audit_log_bprm_fcaps() are passed through.
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
        /* Process setpcap binaries and capabilities for uid 0 */
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool effective = false, has_fcap = false, id_changed;
        int ret;
        kuid_t root_uid;

        if (WARN_ON(!cap_ambient_invariant_ok(old)))
                return -EPERM;

        /* Start from the file's capability xattr, if any. */
        ret = get_file_caps(bprm, file, &effective, &has_fcap);
        if (ret < 0)
                return ret;

        /* uid 0 as seen from the new creds' user namespace. */
        root_uid = make_kuid(new->user_ns, 0);

        handle_privileged_root(bprm, has_fcap, &effective, root_uid);

        /* if we have fs caps, clear dangerous personality flags */
        if (__cap_gained(permitted, new, old))
                bprm->per_clear |= PER_CLEAR_ON_SETID;

        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
         * credentials unless they have the appropriate permit.
         *
         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
        id_changed = !uid_eq(new->euid, old->euid) || !in_group_p(new->egid);

        if ((id_changed || __cap_gained(permitted, new, old)) &&
            ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
             !ptracer_capable(current, new->user_ns))) {
                /* downgrade; they get no more than they had, and maybe less */
                if (!ns_capable(new->user_ns, CAP_SETUID) ||
                    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
                        new->euid = new->uid;
                        new->egid = new->gid;
                }
                new->cap_permitted = cap_intersect(new->cap_permitted,
                                                   old->cap_permitted);
        }

        /* Saved and filesystem ids follow the (possibly downgraded) eids. */
        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        /* File caps or setid cancels ambient. */
        if (has_fcap || id_changed)
                cap_clear(new->cap_ambient);

        /*
         * Now that we've computed pA', update pP' to give:
         *   pP' = (X & fP) | (pI & fI) | pA'
         */
        new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

        /*
         * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
         * this is the same as pE' = (fE ? pP' : 0) | pA'.
         */
        if (effective)
                new->cap_effective = new->cap_permitted;
        else
                new->cap_effective = new->cap_ambient;

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Emit an audit record when nonroot_raised_pE() asks for one. */
        if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
                ret = audit_log_bprm_fcaps(bprm, new, old);
                if (ret < 0)
                        return ret;
        }

        /* SECURE_KEEP_CAPS is always cleared in the new creds. */
        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Check for privilege-elevated exec. */
        if (id_changed ||
            !uid_eq(new->euid, old->uid) ||
            !gid_eq(new->egid, old->gid) ||
            (!__is_real(root_uid, new) &&
             (effective ||
              __cap_grew(permitted, ambient, new))))
                bprm->secureexec = 1;

        return 0;
}

/**
 * cap_inode_setxattr - Determine whether an xattr may be altered
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 * @value: The value that the xattr will be changed to
 * @size: The size of value
 * @flags: The replacement flag
 *
 * Guards security.* xattrs against being set or updated by callers lacking
 * CAP_SYS_ADMIN in the user namespace of the dentry's superblock.  Only
 * @dentry and @name are examined here.
 *
 * Return: 0 if permission is granted, -EPERM if denied.
 */
int cap_inode_setxattr(struct dentry *dentry, const char *name,
                       const void *value, size_t size, int flags)
{
        struct user_namespace *fs_userns = dentry->d_sb->s_user_ns;

        /* Anything outside the security.* prefix is not our business. */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN) != 0)
                return 0;

        /*
         * For XATTR_NAME_CAPS the check will be done in
         * cap_convert_nscap(), called by setxattr()
         */
        if (strcmp(name, XATTR_NAME_CAPS) == 0)
                return 0;

        return ns_capable(fs_userns, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/**
 * cap_inode_removexattr - Determine whether an xattr may be removed
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        The inode/dentry being altered
 * @name:        The name of the xattr to be changed
 *
 * Guards security.* xattrs against removal by unprivileged callers.
 * Removing security.capability needs CAP_SETFCAP with respect to the inode;
 * removing any other security.* xattr needs CAP_SYS_ADMIN in the user
 * namespace of the dentry's superblock.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if permission is granted, -EPERM if denied, -EINVAL if
 * security.capability is named but @dentry has no backing inode.
 */
int cap_inode_removexattr(struct mnt_idmap *idmap,
                          struct dentry *dentry, const char *name)
{
        struct user_namespace *fs_userns = dentry->d_sb->s_user_ns;
        struct inode *inode;

        /* Anything outside the security.* prefix is not our business. */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN) != 0)
                return 0;

        if (strcmp(name, XATTR_NAME_CAPS) != 0)
                return ns_capable(fs_userns, CAP_SYS_ADMIN) ? 0 : -EPERM;

        /* security.capability gets namespaced */
        inode = d_backing_inode(dentry);
        if (!inode)
                return -EINVAL;

        return capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP) ?
                0 : -EPERM;
}

/*
 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
 * a process after a call to setuid, setreuid, or setresuid.
 *
 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
 *  {r,e,s}uid != 0, the permitted and effective capabilities are
 *  cleared.
 *
 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
 *  capabilities of the process are cleared.
 *
 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
 *  capabilities are set to the permitted capabilities.
 *
 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
 *  never happen.
 *
 *  -astor
 *
 * cevans - New behaviour, Oct '99
 * A process may, via prctl(), elect to keep its capabilities when it
 * calls setuid() and switches away from uid==0. Both permitted and
 * effective sets will be retained.
 * Without this change, it was impossible for a daemon to drop only some
 * of its privilege. The call to setuid(!=0) would drop all privileges!
 * Keeping uid 0 is not an option because uid 0 owns too many vital
 * files..
 * Thanks to Olaf Kirch and Peter Benie for spotting this.
 */
static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
{
        kuid_t root_uid = make_kuid(old->user_ns, 0);

        if ((uid_eq(old->uid, root_uid) ||
             uid_eq(old->euid, root_uid) ||
             uid_eq(old->suid, root_uid)) &&
            (!uid_eq(new->uid, root_uid) &&
             !uid_eq(new->euid, root_uid) &&
             !uid_eq(new->suid, root_uid))) {
                if (!issecure(SECURE_KEEP_CAPS)) {
                        cap_clear(new->cap_permitted);
                        cap_clear(new->cap_effective);
                }

                /*
                 * Pre-ambient programs expect setresuid to nonroot followed
                 * by exec to drop capabilities.  We should make sure that
                 * this remains the case.
                 */
                cap_clear(new->cap_ambient);
        }
        if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
                cap_clear(new->cap_effective);
        if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
                new->cap_effective = new->cap_permitted;
}

/**
 * cap_task_fix_setuid - Fix up the results of setuid() call
 * @new: The proposed credentials
 * @old: The current task's current credentials
 * @flags: Indications of what has changed
 *
 * Fix up the results of setuid() call before the credential changes are
 * actually applied.
 *
 * Return: 0 to grant the changes, -ve to deny them.
 */
int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
{
        switch (flags) {
        case LSM_SETID_RE:
        case LSM_SETID_ID:
        case LSM_SETID_RES:
                /* juggle the capabilities to follow [RES]UID changes unless
                 * otherwise suppressed */
                if (!issecure(SECURE_NO_SETUID_FIXUP))
                        cap_emulate_setxuid(new, old);
                break;

        case LSM_SETID_FS:
                /* juggle the capabilities to follow FSUID changes, unless
                 * otherwise suppressed
                 *
                 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
                 *          if not, we might be a bit too harsh here.
                 */
                if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                        kuid_t root_uid = make_kuid(old->user_ns, 0);
                        if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
                                new->cap_effective =
                                        cap_drop_fs_set(new->cap_effective);

                        if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
                                new->cap_effective =
                                        cap_raise_fs_set(new->cap_effective,
                                                         new->cap_permitted);
                }
                break;

        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * Rationale: code calling task_setscheduler, task_setioprio, and
 * task_setnice, assumes that
 *   . if capable(cap_sys_nice), then those actions should be allowed
 *   . if not capable(cap_sys_nice), but acting on your own processes,
 *           then those actions should be allowed
 * This is insufficient now, since code can run without suid and yet
 * still have increased caps.
 * So we check for increased caps on the target process.
 */
static int cap_safe_nice(struct task_struct *p)
{
        int rc = 0;

        /* The target's credentials are only read inside the RCU section. */
        rcu_read_lock();

        /*
         * Allowed when the target's permitted set is contained in ours;
         * otherwise CAP_SYS_NICE in the target's user namespace is needed.
         */
        if (!cap_issubset(__task_cred(p)->cap_permitted,
                          current_cred()->cap_permitted) &&
            !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
                rc = -EPERM;

        rcu_read_unlock();

        return rc;
}

/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task.  The decision is delegated entirely to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined by this function)
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task.  The decision is delegated entirely to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined by this function)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task.  The decision is delegated entirely to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
        return cap_safe_nice(p);
}

/*
 * Implement PR_CAPBSET_DROP.  Attempt to remove the specified capability from
 * the current task's bounding set.  Returns 0 on success, -ve on error.
 */
static int cap_prctl_drop(unsigned long cap)
{
        struct cred *new;

        if (!ns_capable(current_user_ns(), CAP_SETPCAP))
                return -EPERM;
        if (!cap_valid(cap))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        cap_lower(new->cap_bset, cap);
        return commit_creds(new);
}

/**
 * cap_task_prctl - Implement process control functions for this security module
 * @option: The process control function requested
 * @arg2: The argument data for this function
 * @arg3: The argument data for this function
 * @arg4: The argument data for this function
 * @arg5: The argument data for this function
 *
 * Allow process control functions (sys_prctl()) to alter capabilities; may
 * also deny access to other functions not otherwise implemented here.
 *
 * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
 * modules will consider performing the function.
 */
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                   unsigned long arg4, unsigned long arg5)
{
        const struct cred *old = current_cred();
        struct cred *new;

        switch (option) {
        /* Query whether capability arg2 is in the bounding set (0 or 1). */
        case PR_CAPBSET_READ:
                if (!cap_valid(arg2))
                        return -EINVAL;
                return !!cap_raised(old->cap_bset, arg2);

        /* Drop capability arg2 from the bounding set. */
        case PR_CAPBSET_DROP:
                return cap_prctl_drop(arg2);

        /*
         * The next four prctl's remain to assist with transitioning a
         * system from legacy UID=0 based privilege (when filesystem
         * capabilities are not in use) to a system using filesystem
         * capabilities only - as the POSIX.1e draft intended.
         *
         * Note:
         *
         *  PR_SET_SECUREBITS =
         *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
         *    | issecure_mask(SECURE_NOROOT)
         *    | issecure_mask(SECURE_NOROOT_LOCKED)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
         *
         * will ensure that the current process and all of its
         * children will be locked into a pure
         * capability-based-privilege environment.
         */
        case PR_SET_SECUREBITS:
                /*
                 * The ">> 1" in [1] lines each lock bit up with the bit
                 * directly below it, so a set lock masks changes to that bit.
                 */
                if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
                     & (old->securebits ^ arg2))                        /*[1]*/
                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))        /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))        /*[3]*/
                        /*
                         * [1] no changing of bits that are locked
                         * [2] no unlocking of locks
                         * [3] no setting of unsupported bits
                         */
                    )
                        /* cannot change a locked bit */
                        return -EPERM;

                /*
                 * Doing anything requires privilege (go read about the
                 * "sendmail capabilities bug"), except for unprivileged bits.
                 * Indeed, the SECURE_ALL_UNPRIVILEGED bits are not
                 * restrictions enforced by the kernel but by user space on
                 * itself.
                 */
                if (cap_capable(current_cred(), current_cred()->user_ns,
                                CAP_SETPCAP, CAP_OPT_NONE) != 0) {
                        /* Unprivileged bits plus the bit one above each. */
                        const unsigned long unpriv_and_locks =
                                SECURE_ALL_UNPRIVILEGED |
                                SECURE_ALL_UNPRIVILEGED << 1;
                        const unsigned long changed = old->securebits ^ arg2;

                        /* For legacy reason, denies non-change. */
                        if (!changed)
                                return -EPERM;

                        /* Denies privileged changes. */
                        if (changed & ~unpriv_and_locks)
                                return -EPERM;
                }

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                new->securebits = arg2;
                return commit_creds(new);

        case PR_GET_SECUREBITS:
                return old->securebits;

        case PR_GET_KEEPCAPS:
                return !!issecure(SECURE_KEEP_CAPS);

        case PR_SET_KEEPCAPS:
                if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
                        return -EINVAL;
                if (issecure(SECURE_KEEP_CAPS_LOCKED))
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                if (arg2)
                        new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
                else
                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
                return commit_creds(new);

        case PR_CAP_AMBIENT:
                /* CLEAR_ALL takes no further arguments; all must be zero. */
                if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
                        if (arg3 | arg4 | arg5)
                                return -EINVAL;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        cap_clear(new->cap_ambient);
                        return commit_creds(new);
                }

                /*
                 * Remaining sub-operations act on capability arg3: it must
                 * be valid, and arg4/arg5 must be zero (bitwise OR of all
                 * three failure indicators).
                 */
                if (((!cap_valid(arg3)) | arg4 | arg5))
                        return -EINVAL;

                if (arg2 == PR_CAP_AMBIENT_IS_SET) {
                        return !!cap_raised(current_cred()->cap_ambient, arg3);
                } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
                           arg2 != PR_CAP_AMBIENT_LOWER) {
                        return -EINVAL;
                } else {
                        /*
                         * Raising needs the capability in both the permitted
                         * and inheritable sets, and is refused while
                         * SECURE_NO_CAP_AMBIENT_RAISE is set.  Lowering has
                         * no such precondition.
                         */
                        if (arg2 == PR_CAP_AMBIENT_RAISE &&
                            (!cap_raised(current_cred()->cap_permitted, arg3) ||
                             !cap_raised(current_cred()->cap_inheritable,
                                         arg3) ||
                             issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
                                return -EPERM;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        if (arg2 == PR_CAP_AMBIENT_RAISE)
                                cap_raise(new->cap_ambient, arg3);
                        else
                                cap_lower(new->cap_ambient, arg3);
                        return commit_creds(new);
                }

        default:
                /* No functionality available - continue with default */
                return -ENOSYS;
        }
}

/**
 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
 * @mm: The VM space in which the new mapping is to be made
 * @pages: The size of the mapping
 *
 * Determine whether the allocation of a new virtual mapping by the current
 * task is permitted.
 *
 * Neither @mm nor @pages is examined here: the result is simply whether the
 * current credentials hold CAP_SYS_ADMIN in the initial user namespace,
 * checked with CAP_OPT_NOAUDIT.
 *
 * Return: 0 if permission granted, negative error code if not.
 */
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
        return cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN,
                           CAP_OPT_NOAUDIT);
}

/**
 * cap_mmap_addr - check if able to map given addr
 * @addr: address attempting to be mapped
 *
 * Mapping memory below dac_mmap_min_addr requires CAP_SYS_RAWIO in the
 * initial user namespace.  Addresses at or above that limit are always
 * allowed by this check.
 *
 * Return: 0 if this mapping should be allowed, otherwise the error returned
 * by cap_capable().
 */
int cap_mmap_addr(unsigned long addr)
{
        int rc;

        if (addr >= dac_mmap_min_addr)
                return 0;

        rc = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
                         CAP_OPT_NONE);

        /* set PF_SUPERPRIV if it turns out we allow the low mmap */
        if (rc == 0)
                current->flags |= PF_SUPERPRIV;

        return rc;
}

#ifdef CONFIG_SECURITY

/* Identity (name and id) passed to security_add_hooks() for this module. */
static const struct lsm_id capability_lsmid = {
        .name = "capability",
        .id = LSM_ID_CAPABILITY,
};

/*
 * Table pairing each LSM hook name with the cap_* function implementing it.
 * capability_init() hands the whole table to security_add_hooks().
 */
static struct security_hook_list capability_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(capable, cap_capable),
        LSM_HOOK_INIT(settime, cap_settime),
        LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
        LSM_HOOK_INIT(capget, cap_capget),
        LSM_HOOK_INIT(capset, cap_capset),
        LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
        LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
        LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
        LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
        LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
        LSM_HOOK_INIT(task_prctl, cap_task_prctl),
        LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
        LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
        LSM_HOOK_INIT(task_setnice, cap_task_setnice),
        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

/*
 * Init callback named in DEFINE_LSM(capability): registers every entry of
 * capability_hooks under capability_lsmid.  Always returns 0.
 */
static int __init capability_init(void)
{
        security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
                           &capability_lsmid);
        return 0;
}

/*
 * Declare the "capability" LSM: capability_init() is its init callback and
 * its order is LSM_ORDER_FIRST.
 */
DEFINE_LSM(capability) = {
        .name = "capability",
        .order = LSM_ORDER_FIRST,
        .init = capability_init,
};

#endif /* CONFIG_SECURITY */










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UIDGID_H
#define _LINUX_UIDGID_H

/*
 * A set of types for the internal kernel types representing uids and gids.
 *
 * The types defined in this header allow distinguishing which uids and gids in
 * the kernel are values used by userspace and which uid and gid values are
 * the internal kernel values.  With the addition of user namespaces the values
 * can be different.  Using the type system makes it possible for the compiler
 * to detect when we overlook these differences.
 *
 */
#include <linux/uidgid_types.h>
#include <linux/highuid.h>

struct user_namespace;
extern struct user_namespace init_user_ns;
struct uid_gid_map;

/* Build a kuid_t / kgid_t compound literal whose first member is @value. */
#define KUIDT_INIT(value) (kuid_t){ value }
#define KGIDT_INIT(value) (kgid_t){ value }

/*
 * Raw-value accessors for kuid_t / kgid_t.
 *
 * With CONFIG_MULTIUSER they return the wrapped .val member; without it
 * they ignore their argument and always return 0.
 */
#ifdef CONFIG_MULTIUSER
static inline uid_t __kuid_val(kuid_t uid)
{
        return uid.val;
}

static inline gid_t __kgid_val(kgid_t gid)
{
        return gid.val;
}
#else
static inline uid_t __kuid_val(kuid_t uid)
{
        return 0;
}

static inline gid_t __kgid_val(kgid_t gid)
{
        return 0;
}
#endif

/* kuid_t / kgid_t constants wrapping the raw value 0. */
#define GLOBAL_ROOT_UID KUIDT_INIT(0)
#define GLOBAL_ROOT_GID KGIDT_INIT(0)

/*
 * kuid_t / kgid_t constants wrapping the raw value -1, the same value that
 * uid_valid() / gid_valid() compare against.
 */
#define INVALID_UID KUIDT_INIT(-1)
#define INVALID_GID KGIDT_INIT(-1)

/* True when @left and @right carry the same raw uid value. */
static inline bool uid_eq(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs == rhs;
}

/* True when @left and @right carry the same raw gid value. */
static inline bool gid_eq(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs == rhs;
}

/* True when the raw uid value of @left is strictly greater than @right's. */
static inline bool uid_gt(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs > rhs;
}

/* True when the raw gid value of @left is strictly greater than @right's. */
static inline bool gid_gt(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs > rhs;
}

/* True when the raw uid value of @left is greater than or equal to @right's. */
static inline bool uid_gte(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs >= rhs;
}

/* True when the raw gid value of @left is greater than or equal to @right's. */
static inline bool gid_gte(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs >= rhs;
}

/* True when the raw uid value of @left is strictly less than @right's. */
static inline bool uid_lt(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs < rhs;
}

/* True when the raw gid value of @left is strictly less than @right's. */
static inline bool gid_lt(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs < rhs;
}

/* True when the raw uid value of @left is less than or equal to @right's. */
static inline bool uid_lte(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs <= rhs;
}

/* True when the raw gid value of @left is less than or equal to @right's. */
static inline bool gid_lte(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs <= rhs;
}

/* True unless the raw value of @uid is (uid_t)-1. */
static inline bool uid_valid(kuid_t uid)
{
        const uid_t raw = __kuid_val(uid);

        return raw != (uid_t) -1;
}

/* True unless the raw value of @gid is (gid_t)-1. */
static inline bool gid_valid(kgid_t gid)
{
        const gid_t raw = __kgid_val(gid);

        return raw != (gid_t) -1;
}

#ifdef CONFIG_USER_NS

/*
 * CONFIG_USER_NS: conversions between namespace-relative uid_t/gid_t values
 * and kernel-internal kuid_t/kgid_t.  Only declared here; the definitions
 * live out of line.
 */
extern kuid_t make_kuid(struct user_namespace *from, uid_t uid);
extern kgid_t make_kgid(struct user_namespace *from, gid_t gid);

/*
 * A (uid_t)-1 / (gid_t)-1 result from from_kuid() / from_kgid() is what
 * kuid_has_mapping() / kgid_has_mapping() treat as "no mapping".
 * NOTE(review): the _munged variants presumably substitute the overflow id
 * in that case, as the !CONFIG_USER_NS inlines in this header do - confirm
 * against the out-of-line definitions.
 */
extern uid_t from_kuid(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid(struct user_namespace *to, kgid_t gid);
extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid);

/* True when from_kuid() yields something other than (uid_t)-1 for @uid in @ns. */
static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
{
        const uid_t mapped = from_kuid(ns, uid);

        return mapped != (uid_t) -1;
}

/* True when from_kgid() yields something other than (gid_t)-1 for @gid in @ns. */
static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
{
        const gid_t mapped = from_kgid(ns, gid);

        return mapped != (gid_t) -1;
}

/*
 * CONFIG_USER_NS: raw id translation through a struct uid_gid_map,
 * implemented out of line.
 * NOTE(review): the exact meaning of "down" vs. "up" and the failure value
 * are not visible from this header - see the definitions.
 */
u32 map_id_down(struct uid_gid_map *map, u32 id);
u32 map_id_up(struct uid_gid_map *map, u32 id);
u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count);

#else

/* !CONFIG_USER_NS: @from is ignored; @uid is wrapped unchanged into a kuid_t. */
static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
{
        return KUIDT_INIT(uid);
}

/* !CONFIG_USER_NS: @from is ignored; @gid is wrapped unchanged into a kgid_t. */
static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid)
{
        return KGIDT_INIT(gid);
}

/* !CONFIG_USER_NS: @to is ignored; returns the raw value of @kuid. */
static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid)
{
        return __kuid_val(kuid);
}

/* !CONFIG_USER_NS: @to is ignored; returns the raw value of @kgid. */
static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid)
{
        return __kgid_val(kgid);
}

/*
 * !CONFIG_USER_NS: like from_kuid(), except that a (uid_t)-1 result is
 * replaced by overflowuid.
 */
static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid)
{
        const uid_t mapped = from_kuid(to, kuid);

        if (mapped != (uid_t)-1)
                return mapped;

        return overflowuid;
}

/*
 * !CONFIG_USER_NS: like from_kgid(), except that a (gid_t)-1 result is
 * replaced by overflowgid.
 */
static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid)
{
        const gid_t mapped = from_kgid(to, kgid);

        if (mapped != (gid_t)-1)
                return mapped;

        return overflowgid;
}

/* !CONFIG_USER_NS: @ns is ignored; a uid "has a mapping" iff uid_valid(). */
static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
{
        return uid_valid(uid);
}

/* !CONFIG_USER_NS: @ns is ignored; a gid "has a mapping" iff gid_valid(). */
static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
{
        return gid_valid(gid);
}

/* !CONFIG_USER_NS stub: @map is ignored and @id is returned unchanged. */
static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
{
        return id;
}

/*
 * !CONFIG_USER_NS stub: @map and @count are ignored and @id is returned
 * unchanged.
 */
static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count)
{
        return id;
}

/* !CONFIG_USER_NS stub: @map is ignored and @id is returned unchanged. */
static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
{
        return id;
}
#endif /* CONFIG_USER_NS */

#endif /* _LINUX_UIDGID_H */






























































   42 








    1 








































   12 

   38 




    2 






    2 









































   41 



   42 

   42 







   42 
   42 































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  

  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This avoids the use of callbacks, which would dramatically reduce
  performance.  I know it's not the cleanest way, but in C (not in C++)
  that is the price of getting both performance and genericity...

  See Documentation/core-api/rbtree.rst for documentation and samples.
*/

#ifndef        _LINUX_RBTREE_H
#define        _LINUX_RBTREE_H

#include <linux/container_of.h>
#include <linux/rbtree_types.h>

#include <linux/stddef.h>
#include <linux/rcupdate.h>

/* Parent pointer of @r: __rb_parent_color with its two low bits masked off. */
#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))

/* Map a pointer to the embedded rb_node @member back to its containing @type. */
#define        rb_entry(ptr, type, member) container_of(ptr, type, member)

/* True when @root has no root node; the pointer is read with READ_ONCE(). */
#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)

/*
 * 'empty' nodes are nodes that are known not to be inserted in an rbtree.
 * The marker is a node whose __rb_parent_color holds the node's own address:
 * RB_CLEAR_NODE() stores that value and RB_EMPTY_NODE() tests for it.
 */
#define RB_EMPTY_NODE(node)  \
        ((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node)  \
        ((node)->__rb_parent_color = (unsigned long)(node))


/*
 * Core tree operations, implemented out of line.  The insertion helpers in
 * this header call rb_insert_color() right after rb_link_node(), and the
 * cached-tree helpers wrap rb_insert_color(), rb_erase() and
 * rb_replace_node().
 */
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);


/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
                            struct rb_root *root);
/* NOTE(review): presumably the RCU-safe flavour of rb_replace_node() - confirm. */
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
                                struct rb_root *root);

/*
 * Hook @node into the tree at the slot @rb_link, below @parent.
 *
 * The node's __rb_parent_color is set to the @parent pointer and both child
 * pointers are cleared before the node is published through *@rb_link.
 * No rebalancing happens here; the insertion helpers in this header follow
 * up with rb_insert_color().
 */
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
                                struct rb_node **rb_link)
{
        node->__rb_parent_color = (unsigned long)parent;
        node->rb_right = NULL;
        node->rb_left = NULL;

        *rb_link = node;
}

/*
 * Same as rb_link_node(), except that the final store into *@rb_link is done
 * with rcu_assign_pointer(), after the node's own fields have been set up.
 */
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
                                    struct rb_node **rb_link)
{
        node->__rb_parent_color = (unsigned long)parent;
        node->rb_right = NULL;
        node->rb_left = NULL;

        rcu_assign_pointer(*rb_link, node);
}

/*
 * NULL-tolerant rb_entry(): @ptr is evaluated once into a temporary; the
 * result is NULL when it is NULL and rb_entry(@ptr, @type, @member)
 * otherwise.
 */
#define rb_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? rb_entry(____ptr, type, member) : NULL; \
        })

/**
 * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
 * given type allowing the backing memory of @pos to be invalidated
 *
 * @pos:        the 'type *' to use as a loop cursor.
 * @n:                another 'type *' to use as temporary storage
 * @root:        'rb_root *' of the rbtree.
 * @field:        the name of the rb_node field within 'type'.
 *
 * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
 * list_for_each_entry_safe() and allows the iteration to continue independent
 * of changes to @pos by the body of the loop.
 *
 * Note, however, that it cannot handle other modifications that re-order the
 * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
 * rb_erase() may rebalance the tree, causing us to miss some nodes.
 *
 * The successor is fetched into @n inside the loop condition, i.e. before
 * the body runs.  @pos and @n are expanded several times, so they must be
 * plain lvalues without side effects.
 */
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
        for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
             pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
                        typeof(*pos), field); 1; }); \
             pos = n)

/*
 * Same as rb_first(), but O(1): it simply reads the rb_leftmost pointer
 * cached in the rb_root_cached @root.
 */
#define rb_first_cached(root) (root)->rb_leftmost

/*
 * rb_insert_color() for a leftmost-cached tree.
 *
 * When the caller signals through @leftmost that @node is the new smallest
 * entry, the cached rb_leftmost pointer is updated first; the node is then
 * handed to rb_insert_color() on the embedded rb_root.
 */
static inline void rb_insert_color_cached(struct rb_node *node,
                                          struct rb_root_cached *root,
                                          bool leftmost)
{
        struct rb_root *plain_root = &root->rb_root;

        if (leftmost)
                root->rb_leftmost = node;

        rb_insert_color(node, plain_root);
}


/*
 * rb_erase() for a leftmost-cached tree.
 *
 * If @node is the cached leftmost entry, the cache is advanced to
 * rb_next(@node) before the node is erased.
 *
 * Returns the value newly stored in rb_leftmost when the cache was updated,
 * or NULL when @node was not the cached leftmost entry.
 */
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
        struct rb_node *new_leftmost = NULL;

        if (node == root->rb_leftmost) {
                new_leftmost = rb_next(node);
                root->rb_leftmost = new_leftmost;
        }

        rb_erase(node, &root->rb_root);

        return new_leftmost;
}

/*
 * rb_replace_node() for a leftmost-cached tree: if @victim is the cached
 * leftmost entry the cache is redirected to @new, then the replacement is
 * carried out on the embedded rb_root.
 */
static inline void rb_replace_node_cached(struct rb_node *victim,
                                          struct rb_node *new,
                                          struct rb_root_cached *root)
{
        struct rb_root *plain_root = &root->rb_root;

        if (victim == root->rb_leftmost)
                root->rb_leftmost = new;

        rb_replace_node(victim, new, plain_root);
}

/*
 * The below helper functions use 2 operators with 3 different
 * calling conventions. The operators are related like:
 *
 *        comp(a->key,b) < 0  := less(a,b)
 *        comp(a->key,b) > 0  := less(b,a)
 *        comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
 *
 * If these operators define a partial order on the elements we make no
 * guarantee on which of the elements matching the key is found. See
 * rb_find().
 *
 * The reason for this is to allow the find() interface without requiring an
 * on-stack dummy object, which might not be feasible due to object size.
 */

/**
 * rb_add_cached() - insert @node into the leftmost cached tree @tree
 * @node: node to insert
 * @tree: leftmost cached tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * Walks down from the root, going left whenever @less(@node, current) holds
 * and right otherwise.  @node stays "leftmost" only if the walk never turned
 * right.
 *
 * Returns @node when it is the new leftmost, or NULL.
 */
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
              bool (*less)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *above = NULL;
        bool is_leftmost = true;

        for (; *slot; ) {
                above = *slot;

                if (!less(node, above)) {
                        /* one right turn is enough to lose "leftmost" */
                        is_leftmost = false;
                        slot = &above->rb_right;
                        continue;
                }

                slot = &above->rb_left;
        }

        rb_link_node(node, above, slot);
        rb_insert_color_cached(node, tree, is_leftmost);

        if (!is_leftmost)
                return NULL;

        return node;
}

/**
 * rb_add() - insert @node into @tree
 * @node: node to insert
 * @tree: tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * Descends left while @less(@node, current) holds and right otherwise, then
 * links @node into the empty slot that was reached and rebalances.
 */
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
       bool (*less)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *above = NULL;

        for (; *slot; ) {
                above = *slot;
                slot = less(node, above) ? &above->rb_left : &above->rb_right;
        }

        rb_link_node(node, above, slot);
        rb_insert_color(node, tree);
}

/**
 * rb_find_add_cached() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * @cmp(@node, existing) < 0 descends left, > 0 descends right, and 0 ends
 * the search with a match.  The cached leftmost pointer is updated when the
 * descent never turned right.
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree,
            int (*cmp)(const struct rb_node *new, const struct rb_node *exist))
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *above = NULL;
        bool is_leftmost = true;

        while (*slot) {
                int order;

                above = *slot;
                order = cmp(node, above);

                if (!order)
                        return above;

                if (order < 0) {
                        slot = &above->rb_left;
                } else {
                        is_leftmost = false;
                        slot = &above->rb_right;
                }
        }

        rb_link_node(node, above, slot);
        rb_insert_color_cached(node, tree, is_leftmost);

        return NULL;
}

/**
 * rb_find_add() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * @cmp(@node, existing) < 0 descends left, > 0 descends right, and 0 ends
 * the search with a match.
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
            int (*cmp)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *above = NULL;

        while (*slot) {
                int order;

                above = *slot;
                order = cmp(node, above);

                if (!order)
                        return above;

                slot = (order < 0) ? &above->rb_left : &above->rb_right;
        }

        rb_link_node(node, above, slot);
        rb_insert_color(node, tree);

        return NULL;
}

/**
 * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * Identical to rb_find_add() except that the new node is linked with
 * rb_link_node_rcu(), which adds a Store-Release for link_node.
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
                int (*cmp)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *above = NULL;

        while (*slot) {
                int order;

                above = *slot;
                order = cmp(node, above);

                if (!order)
                        return above;

                slot = (order < 0) ? &above->rb_left : &above->rb_right;
        }

        rb_link_node_rcu(node, above, slot);
        rb_insert_color(node, tree);

        return NULL;
}

/**
 * rb_find() - find @key in tree @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining the node order
 *
 * @cmp(@key, node) < 0 continues in the left subtree, > 0 in the right
 * subtree, and 0 means @node matches.
 *
 * Returns the rb_node matching @key or NULL.
 */
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
        int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cursor;
        int order;

        for (cursor = tree->rb_node; cursor;
             cursor = (order < 0) ? cursor->rb_left : cursor->rb_right) {
                order = cmp(key, cursor);
                if (!order)
                        return cursor;
        }

        return NULL;
}

/**
 * rb_find_rcu() - find @key in tree @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining the node order
 *
 * Same descent as rb_find(), but the child pointers are loaded with
 * rcu_dereference_raw().
 *
 * Notably, tree descent vs concurrent tree rotations is unsound and can result
 * in false-negatives.
 *
 * Returns the rb_node matching @key or NULL.
 */
static __always_inline struct rb_node *
rb_find_rcu(const void *key, const struct rb_root *tree,
            int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cursor = tree->rb_node;

        while (cursor) {
                const int order = cmp(key, cursor);

                if (!order)
                        return cursor;

                if (order < 0)
                        cursor = rcu_dereference_raw(cursor->rb_left);
                else
                        cursor = rcu_dereference_raw(cursor->rb_right);
        }

        return NULL;
}

/**
 * rb_find_first() - find the first @key in @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Unlike rb_find(), the descent does not stop at the first match: every
 * match is remembered and the search continues to its left, so the last
 * match recorded is the leftmost one.
 *
 * Returns the leftmost node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
              int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cursor = tree->rb_node;
        struct rb_node *best = NULL;

        while (cursor) {
                const int order = cmp(key, cursor);

                if (order > 0) {
                        cursor = cursor->rb_right;
                        continue;
                }

                /* order <= 0: remember an exact hit, then keep looking left */
                if (order == 0)
                        best = cursor;
                cursor = cursor->rb_left;
        }

        return best;
}

/**
 * rb_next_match() - find the next @key in @tree
 * @key: key to match
 * @node: node to continue from; its rb_next() successor is examined
 * @cmp: operator defining node order
 *
 * Returns the successor of @node when @cmp(@key, successor) is 0, or NULL
 * when there is no successor or it does not match.
 */
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
              int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *succ = rb_next(node);

        if (!succ)
                return NULL;

        return cmp(key, succ) ? NULL : succ;
}

/**
 * rb_for_each() - iterates a subtree matching @key
 * @node: iterator
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Starts at rb_find_first() and advances with rb_next_match() until that
 * returns NULL.  @node, @key and @cmp are expanded more than once, so they
 * should be free of side effects.
 */
#define rb_for_each(node, key, tree, cmp) \
        for ((node) = rb_find_first((key), (tree), (cmp)); \
             (node); (node) = rb_next_match((key), (node), (cmp)))

#endif        /* _LINUX_RBTREE_H */



































































































































    1 











    1 










    1 




    1 


    1 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MM_PERCPU_INTERNAL_H
#define _MM_PERCPU_INTERNAL_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/memcontrol.h>

/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks.
 * All units are in terms of bits.
 *
 * struct pcpu_chunk embeds one instance (chunk_md) and points to an array
 * of them (md_blocks).
 *
 * The scan hint is the largest known contiguous area before the contig hint.
 * It is not necessarily the actual largest contig hint though.  There is an
 * invariant that the scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint.  This is necessary because when scanning forward,
 * we don't know if a new contig hint would be better than the current one.
 */
struct pcpu_block_md {
        int                        scan_hint;        /* scan hint for block */
        int                        scan_hint_start; /* block relative starting
                                                    position of the scan hint */
        int                     contig_hint;    /* contig hint for block */
        int                     contig_hint_start; /* block relative starting
                                                      position of the contig hint */
        int                     left_free;      /* size of free space along
                                                   the left side of the block */
        int                     right_free;     /* size of free space along
                                                   the right side of the block */
        int                     first_free;     /* block position of first free */
        int                        nr_bits;        /* total bits responsible for */
};

/*
 * Per-object extension record; struct pcpu_chunk::obj_exts points to a
 * vector of these.  Each member exists only when its config option is
 * enabled, so the struct is empty when neither is set.
 */
struct pcpuobj_ext {
#ifdef CONFIG_MEMCG
        struct obj_cgroup        *cgroup;
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        union codetag_ref        tag;
#endif
};

/*
 * Defined when struct pcpuobj_ext has at least one member; it gates the
 * obj_exts pointer in struct pcpu_chunk.
 */
#if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING)
#define NEED_PCPUOBJ_EXT
#endif

/*
 * Descriptor of one percpu chunk: free-space accounting, the allocation and
 * boundary bitmaps with their per-block metadata, and page-population state.
 * The structure ends in the flexible populated[] bitmap, so its storage must
 * be sized by whoever allocates it.
 */
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
        int                        nr_alloc;        /* # of allocations */
        size_t                        max_alloc_size; /* largest allocation size */
#endif

        struct list_head        list;                /* linked to pcpu_slot lists */
        int                        free_bytes;        /* free bytes in the chunk */
        struct pcpu_block_md        chunk_md;
        unsigned long                *bound_map;        /* boundary map */

        /*
         * base_addr is the base address of this chunk.
         * To reduce false sharing, current layout is optimized to make sure
         * base_addr locate in the different cacheline with free_bytes and
         * chunk_md.
         */
        void                        *base_addr ____cacheline_aligned_in_smp;

        unsigned long                *alloc_map;        /* allocation map */
        struct pcpu_block_md        *md_blocks;        /* metadata blocks */

        void                        *data;                /* chunk data */
        bool                        immutable;        /* no [de]population allowed */
        bool                        isolated;        /* isolated from active chunk
                                                   slots */
        int                        start_offset;        /* the overlap with the previous
                                                   region to have a page aligned
                                                   base_addr */
        int                        end_offset;        /* additional area required to
                                                   have the region end page
                                                   aligned */
#ifdef NEED_PCPUOBJ_EXT
        struct pcpuobj_ext        *obj_exts;        /* vector of object cgroups */
#endif

        int                        nr_pages;        /* # of pages served by this chunk */
        int                        nr_populated;        /* # of populated pages */
        int                     nr_empty_pop_pages; /* # of empty populated pages */
        unsigned long                populated[];        /* populated bitmap */
};

/*
 * Report whether per-object extension data is needed: always when
 * CONFIG_MEM_ALLOC_PROFILING is enabled, otherwise only while
 * mem_cgroup_kmem_disabled() returns false.
 */
static inline bool need_pcpuobj_ext(void)
{
        return IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING) ||
               !mem_cgroup_kmem_disabled();
}

/*
 * Allocator-wide lock.  The CONFIG_PERCPU_STATS helpers in this header
 * either assert that it is held or take it with interrupts disabled.
 */
extern spinlock_t pcpu_lock;

/*
 * Chunk slot lists and related slot indices / counters, defined elsewhere.
 * NOTE(review): the exact meaning of each slot index is not visible from
 * this header - see the defining file.
 */
extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_sidelined_slot;
extern int pcpu_to_depopulate_slot;
extern int pcpu_nr_empty_pop_pages;

/* The first and the reserved chunk, defined elsewhere. */
extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;

/**
 * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
 * @chunk: chunk of interest
 *
 * This conversion is from the number of physical pages that the chunk
 * serves to the number of bitmap blocks used.
 *
 * Return: chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE, evaluated
 * left to right (multiply first, then divide) and converted to int.
 */
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
{
        return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
}

/**
 * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
 * @pages: number of physical pages
 *
 * This conversion is from physical pages to the number of bits
 * required in the bitmap.
 *
 * Return: @pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE, i.e. one bit per
 * PCPU_MIN_ALLOC_SIZE bytes, converted to int.
 */
static inline int pcpu_nr_pages_to_map_bits(int pages)
{
        return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
 * @chunk: chunk of interest
 *
 * Feeds the chunk's nr_pages through pcpu_nr_pages_to_map_bits(), turning
 * the number of physical pages the chunk serves into the number of bits in
 * its bitmap.
 */
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
{
        const int pages = chunk->nr_pages;

        return pcpu_nr_pages_to_map_bits(pages);
}

/**
 * pcpu_obj_full_size - helper to calculate size of each accounted object
 * @size: size of area to allocate in bytes
 *
 * The charge is @size for every possible CPU.  With CONFIG_MEMCG, and as
 * long as kmemcg is not disabled, one obj_cgroup pointer per
 * PCPU_MIN_ALLOC_SIZE unit of @size is charged on top, covering the space
 * used to store obj_cgroup membership.
 */
static inline size_t pcpu_obj_full_size(size_t size)
{
        size_t cgroup_bytes = 0;

#ifdef CONFIG_MEMCG
        if (!mem_cgroup_kmem_disabled()) {
                cgroup_bytes = size / PCPU_MIN_ALLOC_SIZE *
                               sizeof(struct obj_cgroup *);
        }
#endif

        return cgroup_bytes + size * num_possible_cpus();
}

#ifdef CONFIG_PERCPU_STATS

#include <linux/spinlock.h>

/*
 * Allocator-wide statistics, only built with CONFIG_PERCPU_STATS.
 * The single instance is pcpu_stats; the pcpu_stats_*() helpers in this
 * header update it.
 */
struct percpu_stats {
        u64 nr_alloc;                /* lifetime # of allocations */
        u64 nr_dealloc;                /* lifetime # of deallocations */
        u64 nr_cur_alloc;        /* current # of allocations */
        u64 nr_max_alloc;        /* max # of live allocations */
        u32 nr_chunks;                /* current # of live chunks */
        u32 nr_max_chunks;        /* max # of live chunks */
        size_t min_alloc_size;        /* min allocation size */
        size_t max_alloc_size;        /* max allocation size */
};

/* Global statistics instance and the copy filled in by pcpu_stats_save_ai(). */
extern struct percpu_stats pcpu_stats;
extern struct pcpu_alloc_info pcpu_stats_ai;

/*
 * Keep a copy of the allocation info for debug purposes.
 *
 * Only the fixed-size part of struct pcpu_alloc_info is copied; we don't
 * care about the flexible array.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
        memcpy(&pcpu_stats_ai, ai, sizeof(pcpu_stats_ai));

        /* seed the running minimum with unit_size from the saved copy */
        pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
}

/*
 * pcpu_stats_area_alloc - increment area allocation stats
 * @chunk: the location of the area being allocated
 * @size: size of area to allocate in bytes
 *
 * Bumps the lifetime and current allocation counters, then raises or lowers
 * the global and per-chunk high/low water marks where @size or the new
 * current count exceeds them.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
        lockdep_assert_held(&pcpu_lock);

        pcpu_stats.nr_alloc += 1;
        pcpu_stats.nr_cur_alloc += 1;

        if (pcpu_stats.nr_cur_alloc > pcpu_stats.nr_max_alloc)
                pcpu_stats.nr_max_alloc = pcpu_stats.nr_cur_alloc;
        if (size < pcpu_stats.min_alloc_size)
                pcpu_stats.min_alloc_size = size;
        if (size > pcpu_stats.max_alloc_size)
                pcpu_stats.max_alloc_size = size;

        chunk->nr_alloc += 1;
        if (size > chunk->max_alloc_size)
                chunk->max_alloc_size = size;
}

/*
 * pcpu_stats_area_dealloc - decrement allocation stats
 * @chunk: the location of the area being deallocated
 *
 * Counts one more lifetime deallocation and one fewer live allocation, both
 * globally and for @chunk.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        pcpu_stats.nr_dealloc += 1;
        pcpu_stats.nr_cur_alloc -= 1;

        chunk->nr_alloc -= 1;
}

/*
 * pcpu_stats_chunk_alloc - increment chunk stats
 *
 * Takes pcpu_lock with interrupts disabled, counts one more live chunk and
 * raises the live-chunk high-water mark if the new count exceeds it.
 */
static inline void pcpu_stats_chunk_alloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        pcpu_stats.nr_chunks += 1;
        if (pcpu_stats.nr_chunks > pcpu_stats.nr_max_chunks)
                pcpu_stats.nr_max_chunks = pcpu_stats.nr_chunks;

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

/*
 * pcpu_stats_chunk_dealloc - decrement chunk stats
 *
 * Takes pcpu_lock with interrupts disabled and counts one fewer live chunk.
 */
static inline void pcpu_stats_chunk_dealloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);
        pcpu_stats.nr_chunks -= 1;
        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

#else

/*
 * !CONFIG_PERCPU_STATS: empty stand-ins with the same signatures as the
 * statistics helpers above, so callers need no #ifdef of their own.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
}

static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
}

static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
}

static inline void pcpu_stats_chunk_alloc(void)
{
}

static inline void pcpu_stats_chunk_dealloc(void)
{
}

#endif /* !CONFIG_PERCPU_STATS */

#endif











































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kmem

#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KMEM_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * kmem_cache_alloc - tracepoint describing one object handed out by a
 * kmem_cache.
 *
 * Recorded fields: the caller address (call_site), the object pointer (ptr),
 * the cache name (s->name), s->object_size as bytes_req, s->size as
 * bytes_alloc, the GFP flags and the node argument.
 *
 * "accounted" is evaluated when the event is recorded: it is true only if
 * CONFIG_MEMCG is enabled and either __GFP_ACCOUNT is set in gfp_flags or
 * SLAB_ACCOUNT is set in the cache flags.
 */
TRACE_EVENT(kmem_cache_alloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 struct kmem_cache *s,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, s, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __string(        name,                s->name                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                node                )
                __field(        bool,                accounted        )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __assign_str(name);
                __entry->bytes_req        = s->object_size;
                __entry->bytes_alloc        = s->size;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->node                = node;
                __entry->accounted        = IS_ENABLED(CONFIG_MEMCG) ?
                                          ((gfp_flags & __GFP_ACCOUNT) ||
                                          (s->flags & SLAB_ACCOUNT)) : false;
        ),

        TP_printk("call_site=%pS ptr=%p name=%s bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __get_str(name),
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node,
                __entry->accounted ? "true" : "false")
);

/*
 * kmalloc - tracepoint describing one kmalloc-style allocation.
 *
 * Recorded fields: the caller address, the resulting pointer, the requested
 * and the actually allocated size in bytes, the GFP flags and the node.
 *
 * Unlike kmem_cache_alloc, no "accounted" field is stored; the value shown
 * in the output is derived at print time from the recorded gfp_flags
 * (CONFIG_MEMCG enabled and __GFP_ACCOUNT set).
 */
TRACE_EVENT(kmalloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 size_t bytes_req,
                 size_t bytes_alloc,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                node                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = bytes_req;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->node                = node;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node,
                (IS_ENABLED(CONFIG_MEMCG) &&
                 (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false")
);

/*
 * kfree - tracepoint recording the caller address and the pointer being
 * freed.  Nothing else about the object is captured.
 */
TRACE_EVENT(kfree,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
        ),

        TP_printk("call_site=%pS ptr=%p",
                  (void *)__entry->call_site, __entry->ptr)
);

/*
 * kmem_cache_free - tracepoint recording the caller address, the pointer
 * being freed and the name of the kmem_cache (s->name) it is returned to.
 */
TRACE_EVENT(kmem_cache_free,

        TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s),

        TP_ARGS(call_site, ptr, s),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __string(        name,                s->name                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __assign_str(name);
        ),

        TP_printk("call_site=%pS ptr=%p name=%s",
                  (void *)__entry->call_site, __entry->ptr, __get_str(name))
);

/*
 * mm_page_free - tracepoint recording a page free together with its order.
 *
 * Only the page frame number is stored in the ring buffer; the struct page
 * pointer shown in the output is recomputed from it with pfn_to_page() when
 * the event is printed.  The page argument is converted unconditionally, so
 * it is expected to be non-NULL.
 */
TRACE_EVENT(mm_page_free,

        TP_PROTO(struct page *page, unsigned int order),

        TP_ARGS(page, order),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
                __entry->order                = order;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn,
                        __entry->order)
);

/*
 * mm_page_free_batched - tracepoint recording a single page by its pfn.
 *
 * No order is recorded; the output always reports "order=0" because that
 * text is part of the format string.  The struct page pointer is
 * recomputed from the stored pfn at print time.
 */
TRACE_EVENT(mm_page_free_batched,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
        ),

        TP_printk("page=%p pfn=0x%lx order=0",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn)
);

/*
 * mm_page_alloc - tracepoint recording the result of a page allocation:
 * pfn, order, GFP flags and migratetype.
 *
 * A NULL page is accepted: it is stored as the sentinel pfn -1UL, and the
 * print side maps that sentinel back to a NULL page pointer and pfn 0.
 */
TRACE_EVENT(mm_page_alloc,

        TP_PROTO(struct page *page, unsigned int order,
                        gfp_t gfp_flags, int migratetype),

        TP_ARGS(page, order, gfp_flags, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * mm_page - event class shared by page events that report pfn, order,
 * migratetype and a percpu_refill value.
 *
 * A NULL page is stored as the sentinel pfn -1UL and printed as a NULL page
 * pointer with pfn 0.
 */
DECLARE_EVENT_CLASS(mm_page,

        TP_PROTO(struct page *page, unsigned int order, int migratetype,
                 int percpu_refill),

        TP_ARGS(page, order, migratetype, percpu_refill),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
                __field(        int,                percpu_refill        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
                __entry->percpu_refill        = percpu_refill;
        ),

        TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                __entry->percpu_refill)
);

/*
 * mm_page_alloc_zone_locked - event instance of the mm_page class; it adds
 * no fields of its own and reuses the class record layout and format.
 */
DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,

        TP_PROTO(struct page *page, unsigned int order, int migratetype,
                 int percpu_refill),

        TP_ARGS(page, order, migratetype, percpu_refill)
);

/*
 * mm_page_pcpu_drain - tracepoint recording a page by pfn together with its
 * order and migratetype.
 *
 * A NULL page is stored as the sentinel pfn -1UL.  The print side maps the
 * sentinel back to a NULL page pointer and pfn 0, exactly like the
 * mm_page_alloc event and the mm_page event class do, instead of feeding
 * -1UL to pfn_to_page() and printing the raw sentinel.
 */
TRACE_EVENT(mm_page_pcpu_drain,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order, __entry->migratetype)
);

/*
 * mm_page_alloc_extfrag - tracepoint recording an allocation that fell back
 * to a different order/migratetype than the one requested.
 *
 * Stored: pfn, the requested and fallback orders, the requested and
 * fallback migratetypes, and change_ownership, which is 1 when the
 * requested migratetype equals the pageblock migratetype of the page at
 * record time.
 *
 * pageblock_order and "fragmenting" (fallback_order < pageblock_order) are
 * not stored; they are evaluated when the event is printed.
 */
TRACE_EVENT(mm_page_alloc_extfrag,

        TP_PROTO(struct page *page,
                int alloc_order, int fallback_order,
                int alloc_migratetype, int fallback_migratetype),

        TP_ARGS(page,
                alloc_order, fallback_order,
                alloc_migratetype, fallback_migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                        )
                __field(        int,                alloc_order                )
                __field(        int,                fallback_order                )
                __field(        int,                alloc_migratetype        )
                __field(        int,                fallback_migratetype        )
                __field(        int,                change_ownership        )
        ),

        TP_fast_assign(
                __entry->pfn                        = page_to_pfn(page);
                __entry->alloc_order                = alloc_order;
                __entry->fallback_order                = fallback_order;
                __entry->alloc_migratetype        = alloc_migratetype;
                __entry->fallback_migratetype        = fallback_migratetype;
                __entry->change_ownership        = (alloc_migratetype ==
                                        get_pageblock_migratetype(page));
        ),

        TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
                pfn_to_page(__entry->pfn),
                __entry->pfn,
                __entry->alloc_order,
                __entry->fallback_order,
                pageblock_order,
                __entry->alloc_migratetype,
                __entry->fallback_migratetype,
                __entry->fallback_order < pageblock_order,
                __entry->change_ownership)
);

/*
 * mm_setup_per_zone_wmarks - tracepoint snapshotting a zone's watermarks.
 *
 * Records the node id (zone->zone_pgdat->node_id), the zone name and the
 * WMARK_MIN, WMARK_LOW, WMARK_HIGH and WMARK_PROMO entries of
 * zone->_watermark[] as they are at the time of the event.
 */
TRACE_EVENT(mm_setup_per_zone_wmarks,

        TP_PROTO(struct zone *zone),

        TP_ARGS(zone),

        TP_STRUCT__entry(
                __field(int, node_id)
                __string(name, zone->name)
                __field(unsigned long, watermark_min)
                __field(unsigned long, watermark_low)
                __field(unsigned long, watermark_high)
                __field(unsigned long, watermark_promo)
        ),

        TP_fast_assign(
                __entry->node_id = zone->zone_pgdat->node_id;
                __assign_str(name);
                __entry->watermark_min = zone->_watermark[WMARK_MIN];
                __entry->watermark_low = zone->_watermark[WMARK_LOW];
                __entry->watermark_high = zone->_watermark[WMARK_HIGH];
                __entry->watermark_promo = zone->_watermark[WMARK_PROMO];
        ),

        TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu",
                  __entry->node_id,
                  __get_str(name),
                  __entry->watermark_min,
                  __entry->watermark_low,
                  __entry->watermark_high,
                  __entry->watermark_promo)
);

/*
 * mm_setup_per_zone_lowmem_reserve - tracepoint recording a lowmem reserve
 * value for a (zone, upper_zone) pair.
 *
 * Records the node id of @zone, the names of both zones and the
 * lowmem_reserve value passed in (printed as lowmem_reserve_pages).
 */
TRACE_EVENT(mm_setup_per_zone_lowmem_reserve,

        TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve),

        TP_ARGS(zone, upper_zone, lowmem_reserve),

        TP_STRUCT__entry(
                __field(int, node_id)
                __string(name, zone->name)
                __string(upper_name, upper_zone->name)
                __field(long, lowmem_reserve)
        ),

        TP_fast_assign(
                __entry->node_id = zone->zone_pgdat->node_id;
                __assign_str(name);
                __assign_str(upper_name);
                __entry->lowmem_reserve = lowmem_reserve;
        ),

        TP_printk("node_id=%d zone name=%s upper_zone name=%s lowmem_reserve_pages=%ld",
                  __entry->node_id,
                  __get_str(name),
                  __get_str(upper_name),
                  __entry->lowmem_reserve)
);

/*
 * mm_calculate_totalreserve_pages - tracepoint recording the single
 * totalreserve_pages value passed by the caller.
 */
TRACE_EVENT(mm_calculate_totalreserve_pages,

        TP_PROTO(unsigned long totalreserve_pages),

        TP_ARGS(totalreserve_pages),

        TP_STRUCT__entry(
                __field(unsigned long, totalreserve_pages)
        ),

        TP_fast_assign(
                __entry->totalreserve_pages = totalreserve_pages;
        ),

        TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages)
);


/*
 * Turn an mm pointer into an opaque identifier for the rss_stat tracepoint,
 * so that an mm can be told apart from others without exposing its address.
 *
 * Returns the hash of @ptr truncated to 32 bits, or 0 when
 * ptr_to_hashval() reports an error.  The __PTR_TO_HASHVAL guard keeps the
 * helper from being defined twice when this header is read more than once.
 */
#ifndef __PTR_TO_HASHVAL
static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
{
        unsigned long hash;

        if (ptr_to_hashval(ptr, &hash) != 0)
                return 0;

        /* Only the low 32 bits of the hashed value are meaningful. */
        return (unsigned int)hash;
}
#define __PTR_TO_HASHVAL
#endif

/*
 * List of the rss counter members that rss_stat can report.  Each entry is
 * wrapped in EM() except the last, which uses EMe(); the list is expanded
 * twice below with different definitions of those two helpers.
 */
#define TRACE_MM_PAGES                \
        EM(MM_FILEPAGES)        \
        EM(MM_ANONPAGES)        \
        EM(MM_SWAPENTS)                \
        EMe(MM_SHMEMPAGES)

#undef EM
#undef EMe

/* First expansion: emit a TRACE_DEFINE_ENUM() for every member. */
#define EM(a)        TRACE_DEFINE_ENUM(a);
#define EMe(a)        TRACE_DEFINE_ENUM(a);

TRACE_MM_PAGES

#undef EM
#undef EMe

/*
 * Second form: { value, "name" } pairs for __print_symbolic().  EMe() omits
 * the trailing comma because it marks the final entry of the list.
 */
#define EM(a)        { a, #a },
#define EMe(a)        { a, #a }

/*
 * rss_stat - tracepoint reporting the current value of one rss counter of
 * an mm.
 *
 * mm_id is the hashed mm pointer produced by mm_ptr_to_hash(), curr is 1
 * when @mm is the mm of the current task, and member selects the counter.
 * size is the positive sum of mm->rss_stat[member] shifted left by
 * PAGE_SHIFT, i.e. converted from pages to bytes (the format prints it
 * with a "B" suffix).  The member is printed symbolically using the
 * TRACE_MM_PAGES table.
 */
TRACE_EVENT(rss_stat,

        TP_PROTO(struct mm_struct *mm,
                int member),

        TP_ARGS(mm, member),

        TP_STRUCT__entry(
                __field(unsigned int, mm_id)
                __field(unsigned int, curr)
                __field(int, member)
                __field(long, size)
        ),

        TP_fast_assign(
                __entry->mm_id = mm_ptr_to_hash(mm);
                __entry->curr = !!(current->mm == mm);
                __entry->member = member;
                __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
                                                            << PAGE_SHIFT);
        ),

        TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
                __entry->mm_id,
                __entry->curr,
                __print_symbolic(__entry->member, TRACE_MM_PAGES),
                __entry->size)
        );
#endif /* _TRACE_KMEM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































































  319 



  318 





































































































































































































































  305 

































































































































































  319 




  302 
  304 




  319 





  303 
  299 


  302 
  304 

  305 
  305 







  318 











































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given ID's on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario when all but one out of 1 million PIDs possible are
 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
#include <linux/seqlock.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>

/*
 * Statically initialised struct pid: one reference, three empty per-type
 * task lists, level 0, and number 0 in init_pid_ns.
 * NOTE(review): presumably the pid attached to the boot/init task; its
 * users are outside this view.
 */
struct pid init_struct_pid = {
        .count                = REFCOUNT_INIT(1),
        .tasks                = {
                { .first = NULL },
                { .first = NULL },
                { .first = NULL },
        },
        .level                = 0,
        .numbers        = { {
                .nr                = 0,
                .ns                = &init_pid_ns,
        }, }
};

/*
 * Lower and upper bound values for pid_max.
 * NOTE(review): presumably consumed by the pid_max sysctl handler further
 * down in the file; that use is not visible from here.
 */
static int pid_max_min = RESERVED_PIDS + 1;
static int pid_max_max = PID_MAX_LIMIT;

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 *
 * NOTE(review): the paragraph above describes bitmap pages, while the
 * initializer below sets up an IDR (.idr = IDR_INIT(...)); the wording
 * looks like a leftover from an older allocator - confirm and refresh.
 *
 * init_pid_ns is the level-0 pid namespace: owned by init_user_ns, reaped
 * by init_task, with allocation enabled (PIDNS_ADDING) and pid_max set to
 * PID_MAX_DEFAULT.
 */
struct pid_namespace init_pid_ns = {
        .ns.__ns_ref = REFCOUNT_INIT(2),
        .idr = IDR_INIT(init_pid_ns.idr),
        .pid_allocated = PIDNS_ADDING,
        .level = 0,
        .child_reaper = &init_task,
        .user_ns = &init_user_ns,
        .ns.inum = ns_init_inum(&init_pid_ns),
#ifdef CONFIG_PID_NS
        .ns.ops = &pidns_operations,
#endif
        .pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
        .ns.ns_type = ns_common_type(&init_pid_ns),
};
EXPORT_SYMBOL_GPL(init_pid_ns);

/*
 * pidmap_lock is taken in this file around every change to a namespace's
 * IDR and to its pid_allocated counter (alloc_pid(), free_pid(),
 * disable_pid_allocation()).  pidmap_lock_seq is a sequence counter
 * associated with that lock; it is non-static, so its users are presumably
 * in other files.
 */
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);

void put_pid(struct pid *pid)
{
        struct pid_namespace *ns;

        if (!pid)
                return;

        ns = pid->numbers[pid->level].ns;
        if (refcount_dec_and_test(&pid->count)) {
                pidfs_free_pid(pid);
                kmem_cache_free(ns->pid_cachep, pid);
                put_pid_ns(ns);
        }
}
EXPORT_SYMBOL_GPL(put_pid);

/*
 * RCU callback queued by free_pid(): recover the struct pid that embeds
 * @rhp and drop a reference on it.
 */
static void delayed_put_pid(struct rcu_head *rhp)
{
        put_pid(container_of(rhp, struct pid, rcu));
}

/*
 * free_pid - release the numbers held by @pid in every namespace level.
 *
 * Under pidmap_lock, for each level from 0 up to pid->level the namespace's
 * pid_allocated counter is decremented and the number is removed from that
 * namespace's IDR.  The pid is then removed from pidfs, and the final
 * put_pid() is deferred through call_rcu().
 *
 * Must not be called with tasklist_lock held (checked by lockdep).
 */
void free_pid(struct pid *pid)
{
        int i;

        lockdep_assert_not_held(&tasklist_lock);

        spin_lock(&pidmap_lock);
        for (i = 0; i <= pid->level; i++) {
                struct upid *upid = pid->numbers + i;
                struct pid_namespace *ns = upid->ns;
                /* Act on the counter value left after this pid is gone. */
                switch (--ns->pid_allocated) {
                case 2:
                case 1:
                        /* When all that is left in the pid namespace
                         * is the reaper wake up the reaper.  The reaper
                         * may be sleeping in zap_pid_ns_processes().
                         */
                        wake_up_process(ns->child_reaper);
                        break;
                case PIDNS_ADDING:
                        /* Handle a fork failure of the first process */
                        WARN_ON(ns->child_reaper);
                        /* Clears PIDNS_ADDING too, so alloc_pid() will refuse this ns. */
                        ns->pid_allocated = 0;
                        break;
                }

                idr_remove(&ns->idr, upid->nr);
        }
        pidfs_remove_pid(pid);
        spin_unlock(&pidmap_lock);

        /* The reference is dropped from the RCU callback, not here. */
        call_rcu(&pid->rcu, delayed_put_pid);
}

/*
 * free_pids - free every non-NULL entry of a per-type pid array.
 * @pids: array with PIDTYPE_MAX slots, indexed by pid type.
 *
 * Slots are visited from the highest type down to 0.
 */
void free_pids(struct pid **pids)
{
        int type = PIDTYPE_MAX;

        /*
         * Each free_pid() call takes pidmap_lock on its own; the locking
         * could be batched here.
         */
        while (--type >= 0) {
                struct pid *victim = pids[type];

                if (victim)
                        free_pid(victim);
        }
}

/*
 * alloc_pid - allocate a struct pid with a number in @ns and in each of its
 * ancestor namespaces.
 * @ns:           innermost namespace the new pid lives in.
 * @set_tid:      optional array of requested numbers; set_tid[0] applies to
 *                @ns, set_tid[1] to its parent, and so on.
 * @set_tid_size: number of valid entries in @set_tid (0 for none).
 *
 * Numbers are first reserved in each namespace's IDR with a NULL pointer so
 * that lookups cannot observe a half-initialised pid; once the structure is
 * set up, the NULL entries are replaced by the pid under pidmap_lock.
 *
 * Returns the new pid, or an ERR_PTR():
 *   -EINVAL  @set_tid_size exceeds ns->level + 1, a requested number is out
 *            of range, or a number other than 1 is requested in a namespace
 *            without a child reaper;
 *   -EPERM   the caller is not checkpoint/restore capable in the user
 *            namespace owning the level where a number was requested;
 *   -EEXIST  a requested number is already taken;
 *   -EAGAIN  no free number was found in a namespace;
 *   -ENOMEM  memory allocation failed, or @ns no longer accepts new pids
 *            (PIDNS_ADDING cleared);
 *   any other negative value returned by the IDR allocation.
 */
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                      size_t set_tid_size)
{
        struct pid *pid;
        enum pid_type type;
        int i, nr;
        struct pid_namespace *tmp;
        struct upid *upid;
        int retval = -ENOMEM;

        /*
         * set_tid_size contains the size of the set_tid array. Starting at
         * the most nested currently active PID namespace it tells alloc_pid()
         * which PID to set for a process in that most nested PID namespace
         * up to set_tid_size PID namespaces. It does not have to set the PID
         * for a process in all nested PID namespaces but set_tid_size must
         * never be greater than the current ns->level + 1.
         */
        if (set_tid_size > ns->level + 1)
                return ERR_PTR(-EINVAL);

        pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
        if (!pid)
                return ERR_PTR(retval);

        tmp = ns;
        pid->level = ns->level;

        /* Reserve a number at every level, innermost namespace first. */
        for (i = ns->level; i >= 0; i--) {
                int tid = 0;
                int pid_max = READ_ONCE(tmp->pid_max);

                if (set_tid_size) {
                        tid = set_tid[ns->level - i];

                        retval = -EINVAL;
                        if (tid < 1 || tid >= pid_max)
                                goto out_free;
                        /*
                         * Also fail if a PID != 1 is requested and
                         * no PID 1 exists.
                         */
                        if (tid != 1 && !tmp->child_reaper)
                                goto out_free;
                        retval = -EPERM;
                        if (!checkpoint_restore_ns_capable(tmp->user_ns))
                                goto out_free;
                        set_tid_size--;
                }

                idr_preload(GFP_KERNEL);
                spin_lock(&pidmap_lock);

                if (tid) {
                        /* Range [tid, tid + 1) requests exactly this number. */
                        nr = idr_alloc(&tmp->idr, NULL, tid,
                                       tid + 1, GFP_ATOMIC);
                        /*
                         * If ENOSPC is returned it means that the PID is
                         * already in use. Return EEXIST in that case.
                         */
                        if (nr == -ENOSPC)
                                nr = -EEXIST;
                } else {
                        int pid_min = 1;
                        /*
                         * init really needs pid 1, but after reaching the
                         * maximum wrap back to RESERVED_PIDS
                         */
                        if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
                                pid_min = RESERVED_PIDS;

                        /*
                         * Store a null pointer so find_pid_ns does not find
                         * a partially initialized PID (see below).
                         */
                        nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
                                              pid_max, GFP_ATOMIC);
                }
                spin_unlock(&pidmap_lock);
                idr_preload_end();

                if (nr < 0) {
                        retval = (nr == -ENOSPC) ? -EAGAIN : nr;
                        goto out_free;
                }

                pid->numbers[i].nr = nr;
                pid->numbers[i].ns = tmp;
                tmp = tmp->parent;
        }

        /*
         * ENOMEM is not the most obvious choice especially for the case
         * where the child subreaper has already exited and the pid
         * namespace denies the creation of any new processes. But ENOMEM
         * is what we have exposed to userspace for a long time and it is
         * documented behavior for pid namespaces. So we can't easily
         * change it even if there were an error code better suited.
         */
        retval = -ENOMEM;

        get_pid_ns(ns);
        refcount_set(&pid->count, 1);
        spin_lock_init(&pid->lock);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);

        init_waitqueue_head(&pid->wait_pidfd);
        INIT_HLIST_HEAD(&pid->inodes);

        upid = pid->numbers + ns->level;
        idr_preload(GFP_KERNEL);
        spin_lock(&pidmap_lock);
        /* Refuse to publish once allocation in this namespace is disabled. */
        if (!(ns->pid_allocated & PIDNS_ADDING))
                goto out_unlock;
        pidfs_add_pid(pid);
        for ( ; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                upid->ns->pid_allocated++;
        }
        spin_unlock(&pidmap_lock);
        idr_preload_end();

        return pid;

out_unlock:
        spin_unlock(&pidmap_lock);
        idr_preload_end();
        put_pid_ns(ns);

out_free:
        /*
         * i is the level that failed (or -1 if every level succeeded), so
         * levels i + 1 .. ns->level are exactly the ones holding a
         * reserved number that has to be given back.
         */
        spin_lock(&pidmap_lock);
        while (++i <= ns->level) {
                upid = pid->numbers + i;
                idr_remove(&upid->ns->idr, upid->nr);
        }

        /* On failure to allocate the first pid, reset the state */
        if (ns->pid_allocated == PIDNS_ADDING)
                idr_set_cursor(&ns->idr, 0);

        spin_unlock(&pidmap_lock);

        kmem_cache_free(ns->pid_cachep, pid);
        return ERR_PTR(retval);
}

/*
 * disable_pid_allocation - stop @ns from handing out new pids.
 *
 * Clears PIDNS_ADDING in ns->pid_allocated under pidmap_lock; alloc_pid()
 * checks that flag under the same lock and fails once it is cleared.
 */
void disable_pid_allocation(struct pid_namespace *ns)
{
        spin_lock(&pidmap_lock);
        ns->pid_allocated &= ~PIDNS_ADDING;
        spin_unlock(&pidmap_lock);
}

/*
 * find_pid_ns - look up number @nr in the IDR of namespace @ns.
 *
 * Returns whatever idr_find() yields.  alloc_pid() stores NULL for a number
 * until the pid is fully set up, so such an entry is reported as NULL.
 */
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
        return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);

/*
 * find_vpid - look up number @nr in the active pid namespace of the
 * current task.
 */
struct pid *find_vpid(int nr)
{
        return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);

/*
 * Return the address of the slot that holds @task's pid of the given type:
 * PIDTYPE_PID is kept in the task itself (thread_pid), every other type in
 * the task's signal struct.
 */
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
        if (type != PIDTYPE_PID)
                return &task->signal->pids[type];

        return &task->thread_pid;
}

/*
 * attach_pid - link @task onto the task list of the pid it currently holds
 * for @type.
 *
 * The caller must hold tasklist_lock for writing (checked by lockdep).
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
        struct hlist_head *head;

        lockdep_assert_held_write(&tasklist_lock);

        head = &(*task_pid_ptr(task, type))->tasks[type];
        hlist_add_head_rcu(&task->pid_links[type], head);
}

/*
 * Replace the pid stored in @task's @type slot with @new.
 *
 * @pids: array indexed by pid type; pids[@type] receives the old pid once
 *        no task of any type is attached to it any more
 * @task: task whose pid slot is rewritten
 * @type: which pid slot and task list to operate on
 * @new:  value stored in the slot (detach_pid() passes NULL)
 *
 * Must be called with tasklist_lock write-held. @task is only unlinked from
 * the old pid's list here; linking it onto @new's list is left to the caller
 * (see change_pid()).
 */
static void __change_pid(struct pid **pids, struct task_struct *task,
                         enum pid_type type, struct pid *new)
{
        struct pid **pid_ptr, *pid;
        int tmp;

        lockdep_assert_held_write(&tasklist_lock);

        /* Remember the old pid before its slot is overwritten. */
        pid_ptr = task_pid_ptr(task, type);
        pid = *pid_ptr;

        hlist_del_rcu(&task->pid_links[type]);
        *pid_ptr = new;

        /* Done if the old pid still has a task of any type attached. */
        for (tmp = PIDTYPE_MAX; --tmp >= 0; )
                if (pid_has_task(pid, tmp))
                        return;

        /*
         * No task uses the old pid any more: hand it back through @pids.
         * NOTE(review): presumably so the caller can release it later -
         * confirm against the callers.
         */
        WARN_ON(pids[type]);
        pids[type] = pid;
}

/*
 * Detach @task from its @type pid, leaving the task's slot empty (NULL).
 * See __change_pid() for the meaning of @pids.
 */
void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
        struct pid *none = NULL;

        __change_pid(pids, task, type, none);
}

/*
 * Move @task from its current @type pid over to @pid.
 * See __change_pid() for the meaning of @pids.
 */
void change_pid(struct pid **pids, struct task_struct *task,
                enum pid_type type, struct pid *pid)
{
        /* Unlink from the old pid and store @pid in the task's slot ... */
        __change_pid(pids, task, type, pid);

        /* ... then link the task onto the list of the pid now in the slot. */
        attach_pid(task, type);
}

/*
 * Swap the thread pids of @left and @right.
 *
 * Must be called with tasklist_lock write-held.
 */
void exchange_tids(struct task_struct *left, struct task_struct *right)
{
        struct pid *left_pid = left->thread_pid;
        struct pid *right_pid = right->thread_pid;
        struct hlist_head *left_head = &left_pid->tasks[PIDTYPE_PID];
        struct hlist_head *right_head = &right_pid->tasks[PIDTYPE_PID];

        lockdep_assert_held_write(&tasklist_lock);

        /* The tid lists hold a single entry each: swap them wholesale. */
        hlists_swap_heads_rcu(left_head, right_head);

        /* Point each task_struct at the other task's pid. */
        rcu_assign_pointer(left->thread_pid, right_pid);
        rcu_assign_pointer(right->thread_pid, left_pid);

        /* Refresh the numeric value cached in each task_struct. */
        WRITE_ONCE(left->pid, pid_nr(right_pid));
        WRITE_ONCE(right->pid, pid_nr(left_pid));
}

/*
 * transfer_pid is an optimization of attach_pid(new), detach_pid(old):
 * @new's pid_links[@type] node takes the place of @old's node in the list.
 *
 * Must be called with tasklist_lock write-held; warns (once) if used with
 * PIDTYPE_PID.
 */
void transfer_pid(struct task_struct *old, struct task_struct *new,
                  enum pid_type type)
{
        struct hlist_node *from = &old->pid_links[type];
        struct hlist_node *to = &new->pid_links[type];

        WARN_ON_ONCE(type == PIDTYPE_PID);
        lockdep_assert_held_write(&tasklist_lock);
        hlist_replace_rcu(from, to);
}

/*
 * Return the first task on @pid's tasks[@type] list, or NULL when @pid is
 * NULL or the list is empty.
 *
 * The list head is read with rcu_dereference_check(), which also accepts
 * tasklist_lock being held.
 */
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
        struct hlist_node *first;

        if (!pid)
                return NULL;

        first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                      lockdep_tasklist_lock_is_held());
        if (!first)
                return NULL;

        return hlist_entry(first, struct task_struct, pid_links[(type)]);
}
EXPORT_SYMBOL(pid_task);

/*
 * Find the task whose PIDTYPE_PID pid has number @nr in namespace @ns.
 *
 * Must be called under rcu_read_lock(); a lockdep warning is raised
 * otherwise. Returns NULL when pid_task() finds no task.
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        struct pid *pid;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "find_task_by_pid_ns() needs rcu_read_lock() protection");

        pid = find_pid_ns(nr, ns);
        return pid_task(pid, PIDTYPE_PID);
}

/*
 * find_task_by_pid_ns() for the pid namespace that task_active_pid_ns()
 * reports for the current task. The same rcu_read_lock() rule applies.
 */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
        struct pid_namespace *ns = task_active_pid_ns(current);

        return find_task_by_pid_ns(vnr, ns);
}

/*
 * Look up the task with number @nr via find_task_by_vpid() and take a
 * reference on it with get_task_struct(). The lookup and the reference grab
 * both happen inside one rcu_read_lock() section.
 *
 * Returns the referenced task, or NULL if none was found.
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
        struct task_struct *found;

        rcu_read_lock();
        found = find_task_by_vpid(nr);
        if (found)
                get_task_struct(found);
        rcu_read_unlock();

        return found;
}

/*
 * Read @task's pid of the given @type under rcu_read_lock() and pass it
 * through get_pid() before returning it.
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
        struct pid *pid;

        rcu_read_lock();
        pid = rcu_dereference(*task_pid_ptr(task, type));
        pid = get_pid(pid);
        rcu_read_unlock();

        return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);

/*
 * Resolve @pid to a task of the given @type with pid_task() and take a
 * reference on it, all within one rcu_read_lock() section.
 *
 * Returns the referenced task, or NULL if pid_task() found none.
 */
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *task;

        rcu_read_lock();
        task = pid_task(pid, type);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        return task;
}
EXPORT_SYMBOL_GPL(get_pid_task);

/*
 * Look up @nr with find_vpid() and pass the result through get_pid(), all
 * within one rcu_read_lock() section.
 */
struct pid *find_get_pid(pid_t nr)
{
        struct pid *pid;

        rcu_read_lock();
        pid = find_vpid(nr);
        pid = get_pid(pid);
        rcu_read_unlock();

        return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);

/*
 * Return the number @pid has in namespace @ns.
 *
 * 0 is returned when either argument is NULL, when @ns is deeper than the
 * pid's own level, or when the pid's entry at @ns->level belongs to a
 * different namespace.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
        struct upid *upid;

        if (!pid || !ns || ns->level > pid->level)
                return 0;

        upid = &pid->numbers[ns->level];
        if (upid->ns != ns)
                return 0;

        return upid->nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

/*
 * pid_nr_ns() for the pid namespace that task_active_pid_ns() reports for
 * the current task.
 */
pid_t pid_vnr(struct pid *pid)
{
        struct pid_namespace *ns = task_active_pid_ns(current);

        return pid_nr_ns(pid, ns);
}
EXPORT_SYMBOL_GPL(pid_vnr);

/*
 * Return the number of @task's pid of the given @type as seen from @ns.
 *
 * A NULL @ns selects the current task's active pid namespace. If that is
 * NULL as well, or pid_nr_ns() cannot resolve the pid, 0 is returned.
 * The pid pointer is read under rcu_read_lock().
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
                        struct pid_namespace *ns)
{
        struct pid_namespace *target;
        pid_t nr = 0;

        rcu_read_lock();
        target = ns ? ns : task_active_pid_ns(current);
        if (target) {
                struct pid *pid = rcu_dereference(*task_pid_ptr(task, type));

                nr = pid_nr_ns(pid, target);
        }
        rcu_read_unlock();

        return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

/*
 * Return ns_of_pid() applied to @tsk's task_pid().
 */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
        struct pid *pid = task_pid(tsk);

        return ns_of_pid(pid);
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid whose number is greater than or equal
 * to @nr in @ns.
 *
 * When a pid exists at exactly @nr the result is the same as find_pid_ns().
 * The id that idr_get_next() settles on is not reported back to the caller.
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
        int id = nr;

        return idr_get_next(&ns->idr, &id);
}
EXPORT_SYMBOL_GPL(find_ge_pid);

/*
 * Resolve file descriptor @fd to the struct pid behind it.
 *
 * On success get_pid() has been called on the returned pid and *@flags holds
 * the f_flags of the file. On failure an ERR_PTR() is returned (-EBADF for
 * an empty fd, otherwise whatever pidfd_pid() reported) and *@flags is left
 * untouched.
 */
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
        CLASS(fd, f)(fd);
        struct pid *pid;

        if (fd_empty(f))
                return ERR_PTR(-EBADF);

        pid = pidfd_pid(fd_file(f));
        if (IS_ERR(pid))
                return pid;

        get_pid(pid);
        *flags = fd_file(f)->f_flags;

        return pid;
}

/**
 * pidfd_get_task() - Get the task associated with a pidfd
 *
 * @pidfd: pidfd for which to get the task, or one of PIDFD_SELF_THREAD /
 *         PIDFD_SELF_THREAD_GROUP to refer to the current task
 * @flags: on success, set to the f_flags of the pidfd's file, or to 0 when
 *         a PIDFD_SELF_* value was passed; left untouched on error
 *
 * Return the task associated with @pidfd. The function takes a reference on
 * the returned task. The caller is responsible for releasing that reference.
 *
 * Return: On success, the task_struct associated with the pidfd.
 *         On error, an ERR_PTR()-encoded negative errno: the error reported
 *         by pidfd_get_pid(), or -ESRCH when the pid has no matching task.
 */
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
        unsigned int f_flags = 0;
        enum pid_type type;
        struct task_struct *task;
        struct pid *pid;

        if (pidfd == PIDFD_SELF_THREAD || pidfd == PIDFD_SELF_THREAD_GROUP) {
                /* Self references resolve against the current task. */
                type = (pidfd == PIDFD_SELF_THREAD) ? PIDTYPE_PID : PIDTYPE_TGID;
                pid = get_task_pid(current, type);
        } else {
                pid = pidfd_get_pid(pidfd, &f_flags);
                if (IS_ERR(pid))
                        return ERR_CAST(pid);
                type = PIDTYPE_TGID;
        }

        /* The pid reference is only needed for the task lookup itself. */
        task = get_pid_task(pid, type);
        put_pid(pid);
        if (!task)
                return ERR_PTR(-ESRCH);

        *flags = f_flags;
        return task;
}

/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 * The descriptor and its file are obtained from pidfd_prepare() and the
 * file is then installed into the fd table with fd_install().
 *
 * Note, that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
        struct file *pidfd_file;
        int pidfd = pidfd_prepare(pid, flags, &pidfd_file);

        /* Only a successfully prepared descriptor gets its file installed. */
        if (pidfd >= 0)
                fd_install(pidfd, pidfd_file);

        return pidfd;
}

/**
 * sys_pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass; only PIDFD_NONBLOCK and PIDFD_THREAD are accepted
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the task identified by @pid. Without PIDFD_THREAD flag the target task
 * must be a thread-group leader.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned: -EINVAL for
 *         unknown flags or a non-positive @pid, -ESRCH if @pid cannot be
 *         found, otherwise the error from pidfd_create().
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
        struct pid *p;
        int fd;

        /* Unknown flag bits and non-positive pids are both invalid. */
        if ((flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) || pid <= 0)
                return -EINVAL;

        p = find_get_pid(pid);
        if (!p)
                return -ESRCH;

        fd = pidfd_create(p, flags);
        put_pid(p);

        return fd;
}

#ifdef CONFIG_SYSCTL
static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
{
        return &task_active_pid_ns(current)->set;
}

static int set_is_seen(struct ctl_table_set *set)
{
        return &task_active_pid_ns(current)->set == set;
}

/*
 * ->permissions callback of pid_table_root.
 *
 * Picks one rwx triplet out of @table->mode depending on who is asking,
 * relative to the user namespace owning the pid namespace of @head->set:
 *  - the owner bits if the caller has CAP_SYS_ADMIN there or its euid is
 *    that namespace's uid 0,
 *  - the group bits if the caller is in that namespace's gid 0,
 *  - otherwise only the "other" read bit.
 * The chosen bits are then replicated into all three positions.
 */
static int pid_table_root_permissions(struct ctl_table_header *head,
                                      const struct ctl_table *table)
{
        struct pid_namespace *pidns =
                container_of(head->set, struct pid_namespace, set);
        int perm;

        if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
            uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
                perm = (table->mode & S_IRWXU) >> 6;
        else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
                perm = (table->mode & S_IRWXG) >> 3;
        else
                perm = table->mode & S_IROTH;

        return (perm << 6) | (perm << 3) | perm;
}

/*
 * ->set_ownership callback of pid_table_root.
 *
 * Maps id 0 of the user namespace owning the pid namespace of @head->set to
 * a kuid/kgid and stores each into *@uid / *@gid, but only when the mapping
 * is valid; an invalid mapping leaves the corresponding output untouched.
 */
static void pid_table_root_set_ownership(struct ctl_table_header *head,
                                         kuid_t *uid, kgid_t *gid)
{
        struct pid_namespace *pidns =
                container_of(head->set, struct pid_namespace, set);
        kuid_t root_uid = make_kuid(pidns->user_ns, 0);
        kgid_t root_gid;

        if (uid_valid(root_uid))
                *uid = root_uid;

        root_gid = make_kgid(pidns->user_ns, 0);
        if (gid_valid(root_gid))
                *gid = root_gid;
}

/*
 * sysctl table root used for the per pid namespace sysctl sets; it is handed
 * to setup_sysctl_set() in register_pidns_sysctls().
 */
static struct ctl_table_root pid_table_root = {
        .lookup                = pid_table_root_lookup,
        .permissions        = pid_table_root_permissions,
        .set_ownership        = pid_table_root_set_ownership,
};

/*
 * sysctl handler for "cad_pid".
 *
 * The integer I/O is delegated to proc_dointvec() on a private copy of
 * @table whose data points at a local holding pid_vnr(cad_pid). For reads
 * (or on error) the proc_dointvec() result is returned as is. For writes the
 * number written is looked up with find_get_pid() and, if found, swapped
 * into cad_pid; the reference on the previous cad_pid is dropped.
 *
 * Return: 0 on success, -ESRCH if the written pid does not exist, or the
 * error returned by proc_dointvec().
 */
static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        struct ctl_table shadow = *table;
        pid_t nr = pid_vnr(cad_pid);
        struct pid *replacement;
        int err;

        shadow.data = &nr;

        err = proc_dointvec(&shadow, write, buffer, lenp, ppos);
        if (err || !write)
                return err;

        replacement = find_get_pid(nr);
        if (!replacement)
                return -ESRCH;

        put_pid(xchg(&cad_pid, replacement));
        return 0;
}

/*
 * Template of the pid related sysctls. register_pidns_sysctls() duplicates
 * this table for each pid namespace, registers the copy under "kernel" and
 * redirects the data pointer of the first entry to that namespace's pid_max.
 */
static const struct ctl_table pid_table[] = {
        {
                /* Must stay the first entry: its ->data is patched per namespace. */
                .procname        = "pid_max",
                .data                = &init_pid_ns.pid_max,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = &pid_max_min,
                .extra2                = &pid_max_max,
        },
#ifdef CONFIG_PROC_SYSCTL
        {
                /* No ->data: proc_do_cad_pid() supplies its own buffer. */
                .procname        = "cad_pid",
                .maxlen                = sizeof(int),
                .mode                = 0600,
                .proc_handler        = proc_do_cad_pid,
        },
#endif
};
#endif

/*
 * Register the pid sysctls for @pidns.
 *
 * With CONFIG_SYSCTL, a private copy of pid_table is registered under
 * "kernel" in the namespace's own sysctl set, with the first entry pointing
 * at @pidns->pid_max. @pidns->pid_max is raised to at least
 * PIDS_PER_CPU_DEFAULT * num_possible_cpus() and capped at pid_max_max.
 * Without CONFIG_SYSCTL this is a no-op.
 *
 * Return: 0 on success, -ENOMEM if the table copy cannot be allocated or
 * registered.
 */
int register_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
        struct ctl_table *copy;

        setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);

        copy = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        /* Entry 0 is "pid_max": make it operate on this namespace's value. */
        copy[0].data = &pidns->pid_max;
        pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
                             PIDS_PER_CPU_DEFAULT * num_possible_cpus()));

        pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", copy,
                                                 ARRAY_SIZE(pid_table));
        if (!pidns->sysctls) {
                /* Registration failed: undo the copy and the set setup. */
                kfree(copy);
                retire_sysctl_set(&pidns->set);
                return -ENOMEM;
        }
#endif
        return 0;
}

/*
 * Undo register_pidns_sysctls() for @pidns: unregister the table, retire the
 * sysctl set and free the table copy. No-op without CONFIG_SYSCTL.
 */
void unregister_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
        /* Fetch the table pointer before the header is unregistered. */
        const struct ctl_table *table = pidns->sysctls->ctl_table_arg;

        unregister_sysctl_table(pidns->sysctls);
        retire_sysctl_set(&pidns->set);
        kfree(table);
#endif
}

/*
 * Boot-time setup of the initial pid namespace: scale pid_max and
 * pid_max_min with the number of possible CPUs, initialise the idr and
 * create the "pid" slab cache (sized for a struct pid with one numbers[]
 * entry).
 */
void __init pid_idr_init(void)
{
        unsigned int cpus = num_possible_cpus();

        /* Verify no one has done anything silly: */
        BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

        /* bump default and minimum pid_max based on number of cpus */
        init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
                                  PIDS_PER_CPU_DEFAULT * cpus));
        pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * cpus);
        pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);

        idr_init(&init_pid_ns.idr);

        init_pid_ns.pid_cachep = kmem_cache_create("pid",
                        struct_size_t(struct pid, numbers, 1),
                        __alignof__(struct pid),
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        NULL);
}

/*
 * Initcall registering the pid sysctls of the initial pid namespace.
 * A registration failure is treated as fatal (BUG_ON). Always returns 0.
 */
static __init int pid_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
        int err;

        /* "kernel" directory will have already been initialized. */
        err = register_pidns_sysctls(&init_pid_ns);
        BUG_ON(err);
#endif
        return 0;
}
subsys_initcall(pid_namespace_sysctl_init);

/*
 * Fetch file descriptor @fd from @task's file table.
 *
 * The lookup is done with @task's exec_update_lock held for reading and only
 * if ptrace_may_access() grants PTRACE_MODE_ATTACH_REALCREDS access.
 *
 * Return: the file on success, otherwise an ERR_PTR(): the error from
 * down_read_killable(), -EPERM when access is denied, -ESRCH when no file
 * was found and the task is exiting, -EBADF when no file was found otherwise.
 */
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
        struct file *file;
        int ret;

        ret = down_read_killable(&task->signal->exec_update_lock);
        if (ret)
                return ERR_PTR(ret);

        if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
                file = ERR_PTR(-EPERM);
        else
                file = fget_task(task, fd);

        up_read(&task->signal->exec_update_lock);

        if (file)
                return file;

        /*
         * No file came back. The target thread may be exiting, in which
         * case it is in one of these states:
         * 1. before exit_signals(): the lookup yields a real fd
         * 2. before exit_files() takes the task_lock(): still a real fd
         * 3. after exit_files() releases task_lock(): ->files is NULL and
         *    PF_EXITING is set, since exit_signals() set it earlier
         * Only case 3 can land here for an exiting task. A plain lookup
         * failure would read as EBADF, but the task is going away and has
         * already freed its files struct, so report ESRCH for it instead.
         */
        if (task->flags & PF_EXITING)
                return ERR_PTR(-ESRCH);

        return ERR_PTR(-EBADF);
}

/*
 * Duplicate file descriptor @fd of the task behind @pid into the caller.
 *
 * The task reference is dropped as soon as the file has been fetched, and
 * the file reference is dropped once receive_fd() has run.
 *
 * Return: the value of receive_fd() (called with O_CLOEXEC), -ESRCH if @pid
 * has no PIDTYPE_PID task, or the error encoded by __pidfd_fget().
 */
static int pidfd_getfd(struct pid *pid, int fd)
{
        struct task_struct *target;
        struct file *file;
        int new_fd;

        target = get_pid_task(pid, PIDTYPE_PID);
        if (!target)
                return -ESRCH;

        file = __pidfd_fget(target, fd);
        put_task_struct(target);
        if (IS_ERR(file))
                return PTR_ERR(file);

        new_fd = receive_fd(file, NULL, O_CLOEXEC);
        fput(file);

        return new_fd;
}

/**
 * sys_pidfd_getfd() - Get a file descriptor from another process
 *
 * @pidfd:        the pidfd file descriptor of the process
 * @fd:                the file descriptor number to get
 * @flags:        flags on how to get the fd (reserved, must be 0)
 *
 * This syscall gets a copy of a file descriptor from another process
 * based on the pidfd, and file descriptor number. It requires that
 * the calling process has the ability to ptrace the process represented
 * by the pidfd. The process which is having its file descriptor copied
 * is otherwise unaffected.
 *
 * Return: On success, a cloexec file descriptor is returned.
 *         On error, a negative errno number will be returned: -EINVAL for
 *         non-zero @flags, -EBADF if @pidfd is not an open descriptor, the
 *         error from pidfd_pid(), or the error from pidfd_getfd().
 */
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
                unsigned int, flags)
{
        struct pid *target;

        /* flags is currently unused - make sure it's unset */
        if (flags != 0)
                return -EINVAL;

        CLASS(fd, f)(pidfd);
        if (fd_empty(f))
                return -EBADF;

        target = pidfd_pid(fd_file(f));
        if (IS_ERR(target))
                return PTR_ERR(target);

        return pidfd_getfd(target, fd);
}






















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_STACK_H
#define _LINUX_SCHED_TASK_STACK_H

/*
 * task->stack (kernel stack) handling interfaces:
 */

#include <linux/sched.h>
#include <linux/magic.h>
#include <linux/refcount.h>
#include <linux/kasan.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK

/*
 * Return @task's stack pointer (task->stack) without taking any reference.
 *
 * For a non-current task that might exit, use try_get_task_stack() instead:
 * the pointer returned here could get freed out from under you.
 */
static __always_inline void *task_stack_page(const struct task_struct *task)
{
        void *stack = task->stack;

        return stack;
}

/*
 * With CONFIG_THREAD_INFO_IN_TASK this expands to an empty statement; the
 * !CONFIG_THREAD_INFO_IN_TASK variant below copies thread_info instead.
 */
#define setup_thread_stack(new,old)        do { } while(0)

/*
 * Return the address of the last usable long on @task's stack: the highest
 * long within THREAD_SIZE when the stack grows up, task->stack itself
 * otherwise.
 */
static __always_inline unsigned long *end_of_stack(const struct task_struct *task)
{
#ifdef CONFIG_STACK_GROWSUP
        unsigned long limit = (unsigned long)task->stack + THREAD_SIZE;

        return (unsigned long *)limit - 1;
#else
        return task->stack;
#endif
}

#else

/* !CONFIG_THREAD_INFO_IN_TASK: the stack page is simply task->stack. */
#define task_stack_page(task)        ((void *)(task)->stack)

static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
        *task_thread_info(p) = *task_thread_info(org);
        task_thread_info(p)->task = p;
}

/*
 * Return the address of the last usable long on the stack.
 *
 * With a downward-growing stack that is the first long right after the
 * thread_info struct; going any lower would corrupt the thread_info.
 *
 * With an upward-growing stack it is the highest long of the stack area;
 * going beyond it would corrupt data on the next page.
 */
static inline unsigned long *end_of_stack(const struct task_struct *p)
{
#ifdef CONFIG_STACK_GROWSUP
        unsigned long limit = (unsigned long)task_thread_info(p) + THREAD_SIZE;

        return (unsigned long *)limit - 1;
#else
        return (unsigned long *)(task_thread_info(p) + 1);
#endif
}

#endif

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Take a reference on @tsk's stack unless its stack_refcount has already
 * dropped to zero.
 *
 * Returns the stack page on success, NULL if no reference could be taken.
 */
static inline void *try_get_task_stack(struct task_struct *tsk)
{
        if (!refcount_inc_not_zero(&tsk->stack_refcount))
                return NULL;

        return task_stack_page(tsk);
}

extern void put_task_stack(struct task_struct *tsk);
#else
/*
 * !CONFIG_THREAD_INFO_IN_TASK variant: no reference counting is done here,
 * the stack page is returned as is.
 */
static inline void *try_get_task_stack(struct task_struct *tsk)
{
        void *stack = task_stack_page(tsk);

        return stack;
}

/* !CONFIG_THREAD_INFO_IN_TASK: try_get_task_stack() took nothing, so no-op. */
static inline void put_task_stack(struct task_struct *tsk) {}
#endif

void exit_task_stack_account(struct task_struct *tsk);

/*
 * True when the long at end_of_stack(@task) no longer holds
 * STACK_END_MAGIC.
 */
#define task_stack_end_corrupted(task) \
                (*(end_of_stack(task)) != STACK_END_MAGIC)

/*
 * Return 1 if @obj points into the current task's stack, i.e. into
 * [stack page, stack page + THREAD_SIZE), and 0 otherwise. The pointer is
 * passed through kasan_reset_tag() before it is compared.
 */
static inline int object_is_on_stack(const void *obj)
{
        void *base = task_stack_page(current);

        obj = kasan_reset_tag(obj);
        if (obj < base)
                return 0;

        return obj < (base + THREAD_SIZE);
}

extern void thread_stack_cache_init(void);

#ifdef CONFIG_DEBUG_STACK_USAGE
unsigned long stack_not_used(struct task_struct *p);
#else
/* Stub used when CONFIG_DEBUG_STACK_USAGE is off: always reports 0. */
static inline unsigned long stack_not_used(struct task_struct *p)
{
        return 0UL;
}
#endif
extern void set_task_stack_end_magic(struct task_struct *tsk);

/*
 * Reliable end of stack detection: some APM bios versions misalign the
 * stack.
 *
 * Returns 1 when (@addr + sizeof(void *) - 1) has none of the bits of
 * (THREAD_SIZE - sizeof(void *)) set, 0 otherwise.
 * NOTE(review): presumably this means @addr sits at, or less than one
 * pointer below, a THREAD_SIZE boundary - confirm.
 */
static inline int kstack_end(void *addr)
{
        unsigned long probe = (unsigned long)addr + sizeof(void *) - 1;
        unsigned long mask = THREAD_SIZE - sizeof(void *);

        return !(probe & mask);
}

#endif /* _LINUX_SCHED_TASK_STACK_H */


















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *        Routines to manage notifier chains for passing status changes to any
 *        interested routines. We need this instead of hard coded call lists so
 *        that modules can poke their nose into the innards. The network devices
 *        needed them so here they are for the rest of you.
 *
 *                                Alan Cox <Alan.Cox@linux.org>
 */
 
#ifndef _LINUX_NOTIFIER_H
#define _LINUX_NOTIFIER_H
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/srcu.h>

/*
 * Notifier chains are of four types:
 *
 *        Atomic notifier chains: Chain callbacks run in interrupt/atomic
 *                context. Callouts are not allowed to block.
 *        Blocking notifier chains: Chain callbacks run in process context.
 *                Callouts are allowed to block.
 *        Raw notifier chains: There are no restrictions on callbacks,
 *                registration, or unregistration.  All locking and protection
 *                must be provided by the caller.
 *        SRCU notifier chains: A variant of blocking notifier chains, with
 *                the same restrictions.
 *
 * atomic_notifier_chain_register() may be called from an atomic context,
 * but blocking_notifier_chain_register() and srcu_notifier_chain_register()
 * must be called from a process context.  Ditto for the corresponding
 * _unregister() routines.
 *
 * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(),
 * and srcu_notifier_chain_unregister() _must not_ be called from within
 * the call chain.
 *
 * SRCU notifier chains are an alternative form of blocking notifier chains.
 * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for
 * protection of the chain links.  This means there is _very_ low overhead
 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
 * SRCU notifier chains should be used when the chain will be called very
 * often but notifier_blocks will seldom be removed.
 */

struct notifier_block;

/*
 * Callback type of a notifier: receives the notifier_block it is stored in
 * plus an action code and a data pointer.
 * NOTE(review): @action and @data presumably carry the val/v arguments of
 * the *_notifier_call_chain() functions - confirm in kernel/notifier.c.
 */
typedef        int (*notifier_fn_t)(struct notifier_block *nb,
                        unsigned long action, void *data);

/* One entry of a notifier chain. */
struct notifier_block {
        /* Callback invoked for this entry. */
        notifier_fn_t notifier_call;
        /* Next entry of the chain (RCU-annotated pointer). */
        struct notifier_block __rcu *next;
        /* NOTE(review): presumably the ordering key within a chain - confirm. */
        int priority;
};

/*
 * Head of an atomic notifier chain (callbacks run in interrupt/atomic
 * context and may not block).
 */
struct atomic_notifier_head {
        /* NOTE(review): presumably serialises chain updates - confirm. */
        spinlock_t lock;
        /* First entry of the chain, NULL when empty. */
        struct notifier_block __rcu *head;
};

/*
 * Head of a blocking notifier chain (callbacks run in process context and
 * may block).
 */
struct blocking_notifier_head {
        /* rw-semaphore protecting the chain links. */
        struct rw_semaphore rwsem;
        /* First entry of the chain, NULL when empty. */
        struct notifier_block __rcu *head;
};

/*
 * Head of a raw notifier chain: carries no lock of its own, all locking and
 * protection must be provided by the caller.
 */
struct raw_notifier_head {
        /* First entry of the chain, NULL when empty. */
        struct notifier_block __rcu *head;
};

/*
 * Head of an SRCU notifier chain: a variant of the blocking chain that uses
 * SRCU instead of an rw-semaphore to protect the chain links.
 */
struct srcu_notifier_head {
        /* NOTE(review): presumably serialises chain updates - confirm. */
        struct mutex mutex;
        /* SRCU state backing this chain (see SRCU_NOTIFIER_INIT()). */
        struct srcu_usage srcuu;
        struct srcu_struct srcu;
        /* First entry of the chain, NULL when empty. */
        struct notifier_block __rcu *head;
};

/*
 * Run-time initialisers for notifier heads. @name is a pointer to the head;
 * each macro initialises the head's lock (if it has one) and empties the
 * chain.
 */
#define ATOMIC_INIT_NOTIFIER_HEAD(name) do {        \
                spin_lock_init(&(name)->lock);        \
                (name)->head = NULL;                \
        } while (0)
#define BLOCKING_INIT_NOTIFIER_HEAD(name) do {        \
                init_rwsem(&(name)->rwsem);        \
                (name)->head = NULL;                \
        } while (0)
#define RAW_INIT_NOTIFIER_HEAD(name) do {        \
                (name)->head = NULL;                \
        } while (0)

/*
 * srcu_notifier_heads must be cleaned up dynamically.
 *
 * srcu_cleanup_notifier_head() takes a pointer to the head and releases its
 * srcu_struct. Note that the expansion already ends in a semicolon.
 */
extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
#define srcu_cleanup_notifier_head(name)        \
                cleanup_srcu_struct(&(name)->srcu);

/*
 * Static initialisers for notifier heads. @name is the head variable itself
 * (not a pointer); every initialiser starts out with an empty chain.
 */
#define ATOMIC_NOTIFIER_INIT(name) {                                \
                .lock = __SPIN_LOCK_UNLOCKED(name.lock),        \
                .head = NULL }
#define BLOCKING_NOTIFIER_INIT(name) {                                \
                .rwsem = __RWSEM_INITIALIZER((name).rwsem),        \
                .head = NULL }
#define RAW_NOTIFIER_INIT(name)        {                                \
                .head = NULL }

/* @pcpu is handed through to __SRCU_STRUCT_INIT() as its last argument. */
#define SRCU_NOTIFIER_INIT(name, pcpu)                                \
        {                                                        \
                .mutex = __MUTEX_INITIALIZER(name.mutex),        \
                .head = NULL,                                        \
                .srcuu = __SRCU_USAGE_INIT(name.srcuu),                \
                .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
        }

/*
 * Define a notifier head variable called @name, statically initialised with
 * the matching *_NOTIFIER_INIT() initialiser.
 */
#define ATOMIC_NOTIFIER_HEAD(name)                                \
        struct atomic_notifier_head name =                        \
                ATOMIC_NOTIFIER_INIT(name)
#define BLOCKING_NOTIFIER_HEAD(name)                                \
        struct blocking_notifier_head name =                        \
                BLOCKING_NOTIFIER_INIT(name)
#define RAW_NOTIFIER_HEAD(name)                                        \
        struct raw_notifier_head name =                                \
                RAW_NOTIFIER_INIT(name)

/*
 * Define an SRCU notifier head called @name; @mod is placed in front of the
 * definition as its storage-class specifier (may be empty). With
 * CONFIG_TREE_SRCU a static per-CPU srcu_data called <name>_head_srcu_data
 * is defined alongside and passed to the initialiser.
 */
#ifdef CONFIG_TREE_SRCU
#define _SRCU_NOTIFIER_HEAD(name, mod)                                \
        static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \
        mod struct srcu_notifier_head name =                        \
                        SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)

#else
#define _SRCU_NOTIFIER_HEAD(name, mod)                                \
        mod struct srcu_notifier_head name =                        \
                        SRCU_NOTIFIER_INIT(name, name)

#endif

/* Define an SRCU notifier head called @name with external linkage. */
#define SRCU_NOTIFIER_HEAD(name)                                \
        _SRCU_NOTIFIER_HEAD(name, /* not static */)

/* Same as SRCU_NOTIFIER_HEAD(), but the head is defined static. */
#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
        _SRCU_NOTIFIER_HEAD(name, static)

#ifdef __KERNEL__

extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
                struct notifier_block *nb);
extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *nb);
extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *nb);
extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
                struct notifier_block *nb);

extern int atomic_notifier_chain_register_unique_prio(
                struct atomic_notifier_head *nh, struct notifier_block *nb);
extern int blocking_notifier_chain_register_unique_prio(
                struct blocking_notifier_head *nh, struct notifier_block *nb);

extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
                struct notifier_block *nb);
extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *nb);
extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
                struct notifier_block *nb);
extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
                struct notifier_block *nb);

extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                unsigned long val, void *v);
extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v);
extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
                unsigned long val, void *v);
extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                unsigned long val, void *v);

extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v);
extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v);

extern bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh);

#define NOTIFY_DONE                0x0000                /* Don't care */
#define NOTIFY_OK                0x0001                /* Suits me */
#define NOTIFY_STOP_MASK        0x8000                /* Don't call further */
#define NOTIFY_BAD                (NOTIFY_STOP_MASK|0x0002)
                                                /* Bad/Veto action */
/*
 * Clean way to return from the notifier and stop further calls.
 */
#define NOTIFY_STOP                (NOTIFY_OK|NOTIFY_STOP_MASK)

/*
 * Encode a (negative) errno value as a notifier return value.
 *
 * 0 maps to NOTIFY_OK. Any other value is stored as NOTIFY_OK - @err with
 * NOTIFY_STOP_MASK set, so that -EPERM comes out as NOTIFY_BAD.
 */
static inline int notifier_from_errno(int err)
{
        if (!err)
                return NOTIFY_OK;

        return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
}

/*
 * Decode a notifier return value back into a (negative) errno value.
 *
 * NOTIFY_STOP_MASK is ignored. Values up to NOTIFY_OK carry no error and
 * yield 0; anything above yields NOTIFY_OK minus the value.
 */
static inline int notifier_to_errno(int ret)
{
        int code = ret & ~NOTIFY_STOP_MASK;

        if (code <= NOTIFY_OK)
                return 0;

        return NOTIFY_OK - code;
}

/*
 *        Declared notifiers so far. I can imagine quite a few more chains
 *        over time (eg laptop power reset chains, reboot chain (to clean 
 *        device units up), device [un]mount chain, module load/unload chain,
 *        low memory chain, screenblank chain (for plug in modular screenblankers) 
 *        VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
 */
 
/* CPU notifiers are defined in include/linux/cpu.h. */

/* netdevice notifiers are defined in include/linux/netdevice.h */

/* reboot notifiers are defined in include/linux/reboot.h. */

/* Hibernation and suspend events are defined in include/linux/suspend.h. */

/* Virtual Terminal events are defined in include/linux/vt.h. */

#define NETLINK_URELEASE        0x0001        /* Unicast netlink socket released */

/* Console keyboard events.
 * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and
 * KBD_KEYSYM. */
#define KBD_KEYCODE                0x0001 /* Keyboard keycode, called before any other */
#define KBD_UNBOUND_KEYCODE        0x0002 /* Keyboard keycode which is not bound to any other */
#define KBD_UNICODE                0x0003 /* Keyboard unicode */
#define KBD_KEYSYM                0x0004 /* Keyboard keysym */
#define KBD_POST_KEYSYM                0x0005 /* Called after keyboard keysym interpretation */

#endif /* __KERNEL__ */
#endif /* _LINUX_NOTIFIER_H */


































































































































































































































    1 







   64 
   59 































   13 

    1 







































   64 





   58 





   76 







   58 
















































































































































   12 





















    9 
   19 
































































   58 

































   95 
   50 

   39 












  109 
    8 

















































































































    1 







   39 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FORTIFY_STRING_H_
#define _LINUX_FORTIFY_STRING_H_

#include <linux/bitfield.h>
#include <linux/bug.h>
#include <linux/const.h>
#include <linux/limits.h>

/* Attribute set applied to every fortified wrapper defined in this header. */
#define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable
/* Attach the assembler-level name @x to the declaration it follows. */
#define __RENAME(x) __asm__(#x)

/*
 * A fortify "reason" packs two fields into a single value:
 *   bit 0    - access direction (FORTIFY_READ or FORTIFY_WRITE)
 *   bits 7:1 - function identifier (a FORTIFY_FUNC_* value)
 * FORTIFY_REASON() builds the value; FORTIFY_REASON_DIR() and
 * FORTIFY_REASON_FUNC() extract the individual fields again.
 */
#define FORTIFY_REASON_DIR(r)                FIELD_GET(BIT(0), r)
#define FORTIFY_REASON_FUNC(r)                FIELD_GET(GENMASK(7, 1), r)
#define FORTIFY_REASON(func, write)        (FIELD_PREP(BIT(0), write) | \
                                         FIELD_PREP(GENMASK(7, 1), func))

/*
 * Overridden by KUnit tests.
 *
 * The default fortify_panic() packs @func and @write into a reason value and
 * hands it to __fortify_panic() together with @avail and @size; @retfail is
 * not used by this default definition. The default fortify_warn_once()
 * forwards all of its arguments to WARN_ONCE().
 */
#ifndef fortify_panic
# define fortify_panic(func, write, avail, size, retfail)        \
         __fortify_panic(FORTIFY_REASON(func, write), avail, size)
#endif
#ifndef fortify_warn_once
# define fortify_warn_once(x...)        WARN_ONCE(x)
#endif

/* Direction values passed as the @write argument of fortify_panic(). */
#define FORTIFY_READ                 0
#define FORTIFY_WRITE                 1

/*
 * List of every function name that has a fortify identifier. Each name is
 * passed through @macro, producing a comma-separated sequence; the final
 * entry is the catch-all UNKNOWN.
 */
#define EACH_FORTIFY_FUNC(macro)        \
        macro(strncpy),                        \
        macro(strnlen),                        \
        macro(strlen),                        \
        macro(strscpy),                        \
        macro(strlcat),                        \
        macro(strcat),                        \
        macro(strncat),                        \
        macro(memset),                        \
        macro(memcpy),                        \
        macro(memmove),                        \
        macro(memscan),                        \
        macro(memcmp),                        \
        macro(memchr),                        \
        macro(memchr_inv),                \
        macro(kmemdup),                        \
        macro(strcpy),                        \
        macro(UNKNOWN),

/* Turn a listed name into its enumerator, e.g. memcpy -> FORTIFY_FUNC_memcpy. */
#define MAKE_FORTIFY_FUNC(func)        FORTIFY_FUNC_##func

/*
 * One FORTIFY_FUNC_* enumerator per entry of EACH_FORTIFY_FUNC(), in list
 * order. These are the @func values handed to fortify_panic().
 */
enum fortify_func {
        EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC)
};

/*
 * Run-time reporting entry points. Both take a packed reason value (see
 * FORTIFY_REASON()) plus the available and requested sizes;
 * __fortify_panic() is additionally declared __cold and __noreturn.
 */
void __fortify_report(const u8 reason, const size_t avail, const size_t size);
void __fortify_panic(const u8 reason, const size_t avail, const size_t size) __cold __noreturn;
/*
 * Compile-time diagnostics. Each declaration below is tagged with
 * __compiletime_error() or __compiletime_warning() carrying the quoted
 * message; the fortified wrappers call them on paths where an overflow has
 * been proven from constant arguments.
 */
void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)");
void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)");
void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?");
void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)");
void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?");

/*
 * __compiletime_strlen - length of @p when it is provable at compile time
 *
 * Evaluates to __builtin_strlen(@p) only when all of the following hold:
 *  - __member_size(@p) is known (i.e. not SIZE_MAX),
 *  - the first character of @p is a compile-time constant, and
 *  - the last byte of the object (index __member_size(@p) - 1) is a
 *    compile-time constant equal to NUL.
 * In every other case the result is SIZE_MAX, meaning "unknown".
 */
#define __compiletime_strlen(p)                                        \
({                                                                \
        char *__p = (char *)(p);                                \
        size_t __ret = SIZE_MAX;                                \
        const size_t __p_size = __member_size(p);                \
        if (__p_size != SIZE_MAX &&                                \
            __builtin_constant_p(*__p)) {                        \
                size_t __p_len = __p_size - 1;                        \
                if (__builtin_constant_p(__p[__p_len]) &&        \
                    __p[__p_len] == '\0')                        \
                        __ret = __builtin_strlen(__p);                \
        }                                                        \
        __ret;                                                        \
})

/*
 * Select the unfortified implementations (__underlying_*) that the wrappers
 * in this header forward to once their checks have passed.
 *
 * With __SANITIZE_ADDRESS__ defined they are extern declarations bound to
 * named symbols via __RENAME(); otherwise they are macros that expand to
 * compiler builtins (or to the __msan_* helpers for memcpy/memmove/memset
 * when __SANITIZE_MEMORY__ is defined).
 */
#if defined(__SANITIZE_ADDRESS__)

/*
 * Pick the symbol family for memset/memmove/memcpy: the plain names, the
 * __asan_* names for CONFIG_KASAN_GENERIC, or the __hwasan_* names.
 */
#if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY)
extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
#elif defined(CONFIG_KASAN_GENERIC)
extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__asan_memset);
extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memmove);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memcpy);
#else /* CONFIG_KASAN_SW_TAGS */
extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__hwasan_memset);
extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memmove);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memcpy);
#endif

/* The remaining helpers always bind to the plain symbol names here. */
extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);

#else

#if defined(__SANITIZE_MEMORY__)
/*
 * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the
 * corresponding __msan_XXX functions.
 */
#include <linux/kmsan_string.h>
#define __underlying_memcpy        __msan_memcpy
#define __underlying_memmove        __msan_memmove
#define __underlying_memset        __msan_memset
#else
/* No sanitizer involvement: forward straight to the compiler builtins. */
#define __underlying_memcpy        __builtin_memcpy
#define __underlying_memmove        __builtin_memmove
#define __underlying_memset        __builtin_memset
#endif

/* These helpers map to compiler builtins in every non-ASAN configuration. */
#define __underlying_memchr        __builtin_memchr
#define __underlying_memcmp        __builtin_memcmp
#define __underlying_strcat        __builtin_strcat
#define __underlying_strcpy        __builtin_strcpy
#define __underlying_strlen        __builtin_strlen
#define __underlying_strncat        __builtin_strncat
#define __underlying_strncpy        __builtin_strncpy

#endif

/**
 * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking
 *
 * @dst: Destination memory address to write to
 * @src: Source memory address to read from
 * @bytes: How many bytes to write to @dst from @src
 * @justification: Free-form text or comment describing why the use is needed
 *
 * This should be used for corner cases where the compiler cannot do the
 * right thing, or during transitions between APIs, etc. It should be used
 * very rarely, and includes a place for justification detailing where bounds
 * checking has happened, and why existing solutions cannot be employed.
 *
 * The expansion forwards @dst, @src and @bytes directly to
 * __underlying_memcpy(). @justification does not appear in the expansion;
 * it exists purely so the call site documents its own rationale.
 */
#define unsafe_memcpy(dst, src, bytes, justification)                \
        __underlying_memcpy(dst, src, bytes)

/*
 * Clang's use of __builtin_*object_size() within inlines needs hinting via
 * __pass_*object_size(). The preference is to only ever use type 1 (member
 * size, rather than struct size), but there remain some stragglers using
 * type 0 that will be converted in the future.
 *
 * POS annotates a pointer parameter with the type 1 hint and POS0 with the
 * type 0 hint. The dynamic variant is chosen whenever the compiler reports
 * support for __builtin_dynamic_object_size.
 */
#if __has_builtin(__builtin_dynamic_object_size)
#define POS                        __pass_dynamic_object_size(1)
#define POS0                        __pass_dynamic_object_size(0)
#else
#define POS                        __pass_object_size(1)
#define POS0                        __pass_object_size(0)
#endif

/*
 * True only when the compiler can evaluate "@bounds < @length" as a constant
 * expression and that expression is true. When the comparison cannot be
 * resolved at compile time the result is false, so callers never act on a
 * condition that is only known at run time.
 */
#define __compiletime_lessthan(bounds, length)        (        \
        __builtin_constant_p((bounds) < (length)) &&        \
        (bounds) < (length)                                \
)

/**
 * strncpy - Copy a string to memory with non-guaranteed NUL padding
 *
 * @p: pointer to destination of copy
 * @q: pointer to NUL-terminated source string to copy
 * @size: bytes to write at @p
 *
 * If strlen(@q) >= @size, the copy of @q will stop after @size bytes,
 * and @p will NOT be NUL-terminated
 *
 * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes
 * will be written to @p until @size total bytes have been written.
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * over-reads of @q, it cannot defend against writing unterminated
 * results to @p. Using strncpy() remains ambiguous and fragile.
 * Instead, please choose an alternative, so that the expectation
 * of @p's contents is unambiguous:
 *
 * +--------------------+--------------------+------------+
 * | **p** needs to be: | padded to **size** | not padded |
 * +====================+====================+============+
 * |     NUL-terminated | strscpy_pad()      | strscpy()  |
 * +--------------------+--------------------+------------+
 * | not NUL-terminated | strtomem_pad()     | strtomem() |
 * +--------------------+--------------------+------------+
 *
 * Note strscpy*()'s differing return values for detecting truncation,
 * and strtomem*()'s expectation that the destination is marked with
 * __nonstring when it is a character array.
 *
 */
__FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3)
char *strncpy(char * const POS p, const char *q, __kernel_size_t size)
{
        /* Size of the destination as reported by __member_size(). */
        const size_t p_size = __member_size(p);

        /* Provable at compile time that @size exceeds the destination. */
        if (__compiletime_lessthan(p_size, size))
                __write_overflow();
        /* Same condition checked at run time. */
        if (p_size < size)
                fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p_size, size, p);
        /* Checks passed: perform the actual copy. */
        return __underlying_strncpy(p, q, size);
}

extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
/**
 * strnlen - Return bounded count of characters in a NUL-terminated string
 *
 * @p: pointer to NUL-terminated string to count.
 * @maxlen: maximum number of characters to count.
 *
 * Returns number of characters in @p (NOT including the final NUL), or
 * @maxlen, if no NUL has been found up to there.
 *
 */
__FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen)
{
        /* Known size of the object @p points into, or SIZE_MAX if unknown. */
        const size_t p_size = __member_size(p);
        /* Compile-time length of @p, or SIZE_MAX when it cannot be proven. */
        const size_t p_len = __compiletime_strlen(p);
        size_t ret;

        /* We can take compile-time actions when maxlen is const. */
        if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) {
                /* If p is const, we can use its compile-time-known len. */
                if (maxlen >= p_size)
                        return p_len;
        }

        /* Do not check characters beyond the end of p. */
        ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
        /*
         * ret reaching p_size means no NUL was found inside the object. That
         * is only an over-read when the caller's limit extended past the
         * object, i.e. when maxlen differs from the clamped result.
         */
        if (p_size <= ret && maxlen != ret)
                fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, p_size, ret + 1, ret);
        return ret;
}

/*
 * Defined after fortified strnlen to reuse it. However, it must still be
 * possible for strlen() to be used on compile-time strings for use in
 * static initializers (i.e. as a constant expression).
 */
/**
 * strlen - Return count of characters in a NUL-terminated string
 *
 * @p: pointer to NUL-terminated string to count.
 *
 * Do not use this function unless the string length is known at
 * compile-time. When @p is unterminated, this function may crash
 * or return unexpected counts that could lead to memory content
 * exposures. Prefer strnlen().
 *
 * Returns number of characters in @p (NOT including the final NUL).
 *
 */
#define strlen(p)                                                        \
        __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)),        \
                __builtin_strlen(p), __fortify_strlen(p))
/*
 * Run-time half of strlen(): used by the macro above whenever
 * __builtin_strlen(@p) is not a constant expression. The scan is bounded by
 * the known size of @p, and the function panics when no NUL terminator is
 * found within that size.
 */
__FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1)
__kernel_size_t __fortify_strlen(const char * const POS p)
{
        const size_t p_size = __member_size(p);
        __kernel_size_t ret;

        /* Give up if we don't know how large p is. */
        if (p_size == SIZE_MAX)
                return __underlying_strlen(p);
        /* Bounded scan via the fortified strnlen(). */
        ret = strnlen(p, p_size);
        /* A result of p_size means the object holds no NUL terminator. */
        if (p_size <= ret)
                fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, p_size, ret + 1, ret);
        return ret;
}

/* Defined after fortified strnlen() to reuse it. */
extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(sized_strscpy);
/*
 * sized_strscpy - fortified front end for the out-of-line sized_strscpy()
 *
 * @p: destination buffer
 * @q: source string
 * @size: number of bytes the caller allows to be written to @p
 *
 * Adds compile-time and run-time overflow checks in front of the real
 * implementation (reached through __real_strscpy()). When the source length
 * is provable at compile time and fits, the copy is done inline with
 * __underlying_memcpy() and the copied length is returned; otherwise the
 * return value is whatever the real implementation returns.
 */
__FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const POS q, size_t size)
{
        /* Use string size rather than possible enclosing struct size. */
        const size_t p_size = __member_size(p);
        const size_t q_size = __member_size(q);
        size_t len;

        /* If we cannot get size of p and q default to call strscpy. */
        if (p_size == SIZE_MAX && q_size == SIZE_MAX)
                return __real_strscpy(p, q, size);

        /*
         * If size can be known at compile time and is greater than
         * p_size, generate a compile time write overflow error.
         */
        if (__compiletime_lessthan(p_size, size))
                __write_overflow();

        /* Short-circuit for compile-time known-safe lengths. */
        if (__compiletime_lessthan(p_size, SIZE_MAX)) {
                len = __compiletime_strlen(q);

                /* Source length is known and leaves room for the NUL. */
                if (len < SIZE_MAX && __compiletime_lessthan(len, size)) {
                        __underlying_memcpy(p, q, len + 1);
                        return len;
                }
        }

        /*
         * This call protects from read overflow, because len will default to q
         * length if it is smaller than size.
         */
        len = strnlen(q, size);
        /*
         * If len equals size, we will copy only size bytes which leads to
         * -E2BIG being returned.
         * Otherwise we will copy len + 1 because of the final '\0'.
         */
        len = len == size ? size : len + 1;

        /*
         * Generate a runtime write overflow error if len is greater than
         * p_size.
         */
        if (p_size < len)
                fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, p_size, len, -E2BIG);

        /*
         * We can now safely call vanilla strscpy because we are protected from:
         * 1. Read overflow thanks to call to strnlen().
         * 2. Write overflow thanks to above ifs.
         */
        return __real_strscpy(p, q, len);
}

/* Defined after fortified strlen() to reuse it. */
extern size_t __real_strlcat(char *p, const char *q, size_t avail) __RENAME(strlcat);
/**
 * strlcat - Append a string to an existing string
 *
 * @p: pointer to %NUL-terminated string to append to
 * @q: pointer to %NUL-terminated string to append from
 * @avail: Maximum bytes available in @p
 *
 * Appends %NUL-terminated string @q after the %NUL-terminated
 * string at @p, but will not write beyond @avail bytes total,
 * potentially truncating the copy from @q. @p will stay
 * %NUL-terminated only if a %NUL already existed within
 * the @avail bytes of @p. If so, the resulting number of
 * bytes copied from @q will be at most "@avail - strlen(@p) - 1".
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * read and write overflows, this is only possible when the sizes
 * of @p and @q are known to the compiler. Prefer building the
 * string with formatting, via scnprintf(), seq_buf, or similar.
 *
 * Returns total bytes that _would_ have been contained by @p
 * regardless of truncation, similar to snprintf(). If return
 * value is >= @avail, the string has been truncated.
 *
 */
__FORTIFY_INLINE
size_t strlcat(char * const POS p, const char * const POS q, size_t avail)
{
        const size_t p_size = __member_size(p);
        const size_t q_size = __member_size(q);
        size_t p_len, copy_len;
        /*
         * wanted: length the result would have without truncation.
         * actual: length the result will really have (adjusted below).
         */
        size_t actual, wanted;

        /* Give up immediately if both buffer sizes are unknown. */
        if (p_size == SIZE_MAX && q_size == SIZE_MAX)
                return __real_strlcat(p, q, avail);

        /* Existing length of @p, capped at @avail; full length of @q. */
        p_len = strnlen(p, avail);
        copy_len = strlen(q);
        wanted = actual = p_len + copy_len;

        /* Cannot append any more: report truncation. */
        if (avail <= p_len)
                return wanted;

        /* Give up if string is already overflowed. */
        if (p_size <= p_len)
                fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, p_size, p_len + 1, wanted);

        /* Shorten the copy so the result plus its NUL fits in @avail. */
        if (actual >= avail) {
                copy_len = avail - p_len - 1;
                actual = p_len + copy_len;
        }

        /* Give up if copy will overflow. */
        if (p_size <= actual)
                fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, p_size, actual + 1, wanted);
        /* Append the (possibly truncated) source and terminate the result. */
        __underlying_memcpy(p + p_len, q, copy_len);
        p[actual] = '\0';

        return wanted;
}

/* Defined after fortified strlcat() to reuse it. */
/**
 * strcat - Append a string to an existing string
 *
 * @p: pointer to NUL-terminated string to append to
 * @q: pointer to NUL-terminated source string to append from
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * read and write overflows, this is only possible when the
 * destination buffer size is known to the compiler. Prefer
 * building the string with formatting, via scnprintf() or similar.
 * At the very least, use strncat().
 *
 * Returns @p.
 *
 */
__FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2)
char *strcat(char * const POS p, const char *q)
{
        const size_t p_size = __member_size(p);
        /*
         * Do the append through the fortified strlcat(), using the known
         * destination size as the limit. Its return value is the length the
         * full result would have had.
         */
        const size_t wanted = strlcat(p, q, p_size);

        /* The untruncated result plus its NUL would not fit in @p. */
        if (p_size <= wanted)
                fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p_size, wanted + 1, p);
        return p;
}

/**
 * strncat - Append a string to an existing string
 *
 * @p: pointer to NUL-terminated string to append to
 * @q: pointer to source string to append from
 * @count: Maximum bytes to read from @q
 *
 * Appends at most @count bytes from @q (stopping at the first
 * NUL byte) after the NUL-terminated string at @p. @p will be
 * NUL-terminated.
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * read and write overflows, this is only possible when the sizes
 * of @p and @q are known to the compiler. Prefer building the
 * string with formatting, via scnprintf() or similar.
 *
 * Returns @p.
 *
 */
/* Defined after fortified strlen() and strnlen() to reuse them. */
__FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3)
char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count)
{
        const size_t p_size = __member_size(p);
        const size_t q_size = __member_size(q);
        size_t p_len, copy_len, total;

        /* Nothing can be checked when neither buffer size is known. */
        if (p_size == SIZE_MAX && q_size == SIZE_MAX)
                return __underlying_strncat(p, q, count);
        /* Both lengths come from the fortified helpers defined earlier. */
        p_len = strlen(p);
        copy_len = strnlen(q, count);
        /* Bytes needed in @p: existing text, appended text and the NUL. */
        total = p_len + copy_len + 1;
        if (p_size < total)
                fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p_size, total, p);
        /* Append by hand and terminate the result explicitly. */
        __underlying_memcpy(p + p_len, q, copy_len);
        p[p_len + copy_len] = '\0';
        return p;
}

/*
 * fortify_memset_chk - bounds checks performed ahead of a memset()
 *
 * @size: number of bytes about to be written
 * @p_size: size of the destination's enclosing object (the memset() wrapper
 *          passes __struct_size() here); SIZE_MAX when unknown
 * @p_size_field: size of the destination field itself (the memset() wrapper
 *                passes __member_size() here)
 *
 * Emits compile-time diagnostics when @size is constant and provably too
 * large, and calls fortify_panic() at run time when @size exceeds a known
 * @p_size. Every return statement in this function yields false.
 */
__FORTIFY_INLINE bool fortify_memset_chk(__kernel_size_t size,
                                         const size_t p_size,
                                         const size_t p_size_field)
{
        if (__builtin_constant_p(size)) {
                /*
                 * Length argument is a constant expression, so we
                 * can perform compile-time bounds checking where
                 * buffer sizes are also known at compile time.
                 */

                /* Error when size is larger than enclosing struct. */
                if (__compiletime_lessthan(p_size_field, p_size) &&
                    __compiletime_lessthan(p_size, size))
                        __write_overflow();

                /* Warn when write size is larger than dest field. */
                if (__compiletime_lessthan(p_size_field, size))
                        __write_overflow_field(p_size_field, size);
        }
        /*
         * At this point, length argument may not be a constant expression,
         * so run-time bounds checking can be done where buffer sizes are
         * known. (This is not an "else" because the above checks may only
         * be compile-time warnings, and we want to still warn for run-time
         * overflows.)
         */

        /*
         * Always stop accesses beyond the struct that contains the
         * field, when the buffer's remaining size is known.
         * (The SIZE_MAX test is to optimize away checks where the buffer
         * lengths are unknown.)
         */
        if (p_size != SIZE_MAX && p_size < size)
                fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, p_size, size, true);
        return false;
}

/*
 * Evaluate @size once into a local, run fortify_memset_chk() on it, then
 * perform the write with __underlying_memset(). The value of the whole
 * expression is the result of __underlying_memset().
 */
#define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({        \
        size_t __fortify_size = (size_t)(size);                                \
        fortify_memset_chk(__fortify_size, p_size, p_size_field),        \
        __underlying_memset(p, c, __fortify_size);                        \
})

/*
 * __struct_size() vs __member_size() must be captured here to avoid
 * evaluating argument side-effects further into the macro layers.
 *
 * The fortified memset() is only installed when CONFIG_KMSAN is not defined.
 */
#ifndef CONFIG_KMSAN
#define memset(p, c, s) __fortify_memset_chk(p, c, s,                        \
                __struct_size(p), __member_size(p))
#endif

/*
 * To make sure the compiler can enforce protection against buffer overflows,
 * memcpy(), memmove(), and memset() must not be used beyond individual
 * struct members. If you need to copy across multiple members, please use
 * struct_group() to create a named mirror of an anonymous struct union.
 * (e.g. see struct sk_buff.) Read overflow checking is currently only
 * done when a write overflow is also present, or when building with W=1.
 *
 * Mitigation coverage matrix
 *                                        Bounds checking at:
 *                                        +-------+-------+-------+-------+
 *                                        | Compile time  |   Run time    |
 * memcpy() argument sizes:                | write | read  | write | read  |
 *        dest     source   length      +-------+-------+-------+-------+
 * memcpy(known,   known,   constant)        |   y   |   y   |  n/a  |  n/a  |
 * memcpy(known,   unknown, constant)        |   y   |   n   |  n/a  |   V   |
 * memcpy(known,   known,   dynamic)        |   n   |   n   |   B   |   B   |
 * memcpy(known,   unknown, dynamic)        |   n   |   n   |   B   |   V   |
 * memcpy(unknown, known,   constant)        |   n   |   y   |   V   |  n/a  |
 * memcpy(unknown, unknown, constant)        |   n   |   n   |   V   |   V   |
 * memcpy(unknown, known,   dynamic)        |   n   |   n   |   V   |   B   |
 * memcpy(unknown, unknown, dynamic)        |   n   |   n   |   V   |   V   |
 *                                        +-------+-------+-------+-------+
 *
 * y = perform deterministic compile-time bounds checking
 * n = cannot perform deterministic compile-time bounds checking
 * n/a = no run-time bounds checking needed since compile-time deterministic
 * B = can perform run-time bounds checking (currently unimplemented)
 * V = vulnerable to run-time overflow (will need refactoring to solve)
 *
 */
/*
 * fortify_memcpy_chk - bounds checks shared by the fortified copy wrappers
 *
 * @size: number of bytes about to be copied
 * @p_size: size of the destination's enclosing object, SIZE_MAX if unknown
 * @q_size: size of the source's enclosing object, SIZE_MAX if unknown
 * @p_size_field: size of the destination field, SIZE_MAX if unknown
 * @q_size_field: size of the source field
 * @func: FORTIFY_FUNC_* identifier forwarded to fortify_panic()
 *
 * Emits compile-time diagnostics when @size is constant and provably too
 * large, and calls fortify_panic() at run time when @size exceeds a known
 * @p_size (write) or, failing that, a known @q_size (read).
 *
 * Returns true when @size exceeds a known destination field size that
 * differs from @p_size, so the caller can warn about a field-spanning write;
 * returns false otherwise.
 */
__FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
                                         const size_t p_size,
                                         const size_t q_size,
                                         const size_t p_size_field,
                                         const size_t q_size_field,
                                         const u8 func)
{
        if (__builtin_constant_p(size)) {
                /*
                 * Length argument is a constant expression, so we
                 * can perform compile-time bounds checking where
                 * buffer sizes are also known at compile time.
                 */

                /* Error when size is larger than enclosing struct. */
                if (__compiletime_lessthan(p_size_field, p_size) &&
                    __compiletime_lessthan(p_size, size))
                        __write_overflow();
                if (__compiletime_lessthan(q_size_field, q_size) &&
                    __compiletime_lessthan(q_size, size))
                        __read_overflow2();

                /* Warn when write size argument larger than dest field. */
                if (__compiletime_lessthan(p_size_field, size))
                        __write_overflow_field(p_size_field, size);
                /*
                 * Warn for source field over-read when building with W=1
                 * or when an over-write happened, so both can be fixed at
                 * the same time.
                 */
                if ((IS_ENABLED(KBUILD_EXTRA_WARN1) ||
                     __compiletime_lessthan(p_size_field, size)) &&
                    __compiletime_lessthan(q_size_field, size))
                        __read_overflow2_field(q_size_field, size);
        }
        /*
         * At this point, length argument may not be a constant expression,
         * so run-time bounds checking can be done where buffer sizes are
         * known. (This is not an "else" because the above checks may only
         * be compile-time warnings, and we want to still warn for run-time
         * overflows.)
         */

        /*
         * Always stop accesses beyond the struct that contains the
         * field, when the buffer's remaining size is known.
         * (The SIZE_MAX test is to optimize away checks where the buffer
         * lengths are unknown.)
         */
        if (p_size != SIZE_MAX && p_size < size)
                fortify_panic(func, FORTIFY_WRITE, p_size, size, true);
        else if (q_size != SIZE_MAX && q_size < size)
                fortify_panic(func, FORTIFY_READ, q_size, size, true);

        /*
         * Warn when writing beyond destination field size.
         *
         * Note the implementation of __builtin_*object_size() behaves
         * like sizeof() when not directly referencing a flexible
         * array member, which means there will be many bounds checks
         * that will appear at run-time, without a way for them to be
         * detected at compile-time (as can be done when the destination
         * is specifically the flexible array member).
         * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832
         */
        if (p_size_field != SIZE_MAX &&
            p_size != p_size_field && p_size_field < size)
                return true;

        return false;
}

/*
 * To work around what seems to be an optimizer bug, the macro arguments
 * need to have const copies or the values end up changed by the time they
 * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture
 * __bos() results in const temp vars") for more details.
 *
 * Sequence of the expansion:
 *  1. Capture @size and the four object/field sizes in const locals.
 *  2. Run fortify_memcpy_chk() and pass its result to fortify_warn_once() as
 *     the condition for the "field-spanning write" message, which names the
 *     destination expression and the call site.
 *  3. When the size is not a compile-time constant, pass the mutable copy
 *     through OPTIMIZER_HIDE_VAR().
 *  4. Perform the copy with __underlying_<op>(); its result is the value of
 *     the whole expression.
 */
#define __fortify_memcpy_chk(p, q, size, p_size, q_size,                \
                             p_size_field, q_size_field, op) ({                \
        const size_t __fortify_size = (size_t)(size);                        \
        const size_t __p_size = (p_size);                                \
        const size_t __q_size = (q_size);                                \
        const size_t __p_size_field = (p_size_field);                        \
        const size_t __q_size_field = (q_size_field);                        \
        /* Keep a mutable version of the size for the final copy. */        \
        size_t __copy_size = __fortify_size;                                \
        fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size,        \
                                     __q_size, __p_size_field,                \
                                     __q_size_field, FORTIFY_FUNC_ ##op), \
                  #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
                  __fortify_size,                                        \
                  "field \"" #p "\" at " FILE_LINE,                        \
                  __p_size_field);                                        \
        /* Hide only the run-time size from value range tracking to */        \
        /* silence compile-time false positive bounds warnings. */        \
        if (!__builtin_constant_p(__copy_size))                                \
                OPTIMIZER_HIDE_VAR(__copy_size);                        \
        __underlying_##op(p, q, __copy_size);                                \
})

/*
 * Notes about compile-time buffer size detection:
 *
 * With these types...
 *
 *        struct middle {
 *                u16 a;
 *                u8 middle_buf[16];
 *                int b;
 *        };
 *        struct end {
 *                u16 a;
 *                u8 end_buf[16];
 *        };
 *        struct flex {
 *                int a;
 *                u8 flex_buf[];
 *        };
 *
 *        void func(TYPE *ptr) { ... }
 *
 * Cases where destination size cannot be currently detected:
 * - the size of ptr's object (seemingly by design, gcc & clang fail):
 *        __builtin_object_size(ptr, 1) == SIZE_MAX
 * - the size of flexible arrays in ptr's obj (by design, dynamic size):
 *        __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX
 * - the size of ANY array at the end of ptr's obj (gcc and clang bug):
 *        __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX
 *        https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836
 *
 * Cases where destination size is currently detected:
 * - the size of non-array members within ptr's object:
 *        __builtin_object_size(ptr->a, 1) == 2
 * - the size of non-flexible-array in the middle of ptr's obj:
 *        __builtin_object_size(ptr->middle_buf, 1) == 16
 *
 */

/*
 * Fortified memcpy()/memmove(): both expand to __fortify_memcpy_chk(), with
 * __struct_size() supplying the p_size/q_size arguments and __member_size()
 * supplying the p_size_field/q_size_field arguments; the final token selects
 * which operation is checked and performed.
 *
 * __struct_size() vs __member_size() must be captured here to avoid
 * evaluating argument side-effects further into the macro layers.
 */
#define memcpy(p, q, s)  __fortify_memcpy_chk(p, q, s,                        \
                __struct_size(p), __struct_size(q),                        \
                __member_size(p), __member_size(q),                        \
                memcpy)
#define memmove(p, q, s)  __fortify_memcpy_chk(p, q, s,                        \
                __struct_size(p), __struct_size(q),                        \
                __member_size(p), __member_size(q),                        \
                memmove)

/*
 * Fortified memscan(): refuse to scan more bytes than __struct_size()
 * reports for @p, then defer to the real implementation.
 *
 * NOTE(review): fortify_panic()'s last argument (NULL here) looks like the
 * value returned when the panic is intercepted -- confirm against its
 * definition.
 */
extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
__FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size)
{
        const size_t avail = __struct_size(p);

        /* Build-time diagnosis when the overrun is provable by the compiler. */
        if (__compiletime_lessthan(avail, size))
                __read_overflow();
        /* Run-time refusal of a read past the known bound. */
        if (size > avail)
                fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, avail, size, NULL);
        return __real_memscan(p, c, size);
}

/*
 * Fortified memcmp(): both operands are read, so each one is bounded by its
 * own __struct_size() before the underlying comparison runs.
 */
__FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3)
int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size)
{
        const size_t lhs_avail = __struct_size(p);
        const size_t rhs_avail = __struct_size(q);

        /* Build-time diagnostics are only attempted for a constant length. */
        if (__builtin_constant_p(size)) {
                if (__compiletime_lessthan(lhs_avail, size))
                        __read_overflow();
                if (__compiletime_lessthan(rhs_avail, size))
                        __read_overflow2();
        }
        /* Run-time check; the first operand is tested before the second. */
        if (size > lhs_avail)
                fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, lhs_avail, size, INT_MIN);
        else if (size > rhs_avail)
                fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, rhs_avail, size, INT_MIN);
        return __underlying_memcmp(p, q, size);
}

/*
 * Fortified memchr(): bound the search length by __struct_size() of @p,
 * then hand over to the underlying memchr().
 */
__FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3)
void *memchr(const void * const POS0 p, int c, __kernel_size_t size)
{
        const size_t avail = __struct_size(p);

        /* Build-time diagnosis when the overrun is provable by the compiler. */
        if (__compiletime_lessthan(avail, size))
                __read_overflow();
        /* Run-time refusal of a read past the known bound. */
        if (size > avail)
                fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, avail, size, NULL);
        return __underlying_memchr(p, c, size);
}

/*
 * Fortified memchr_inv(): bound the search length by __struct_size() of @p,
 * then hand over to the real memchr_inv().
 */
void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
__FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size)
{
        const size_t avail = __struct_size(p);

        /* Build-time diagnosis when the overrun is provable by the compiler. */
        if (__compiletime_lessthan(avail, size))
                __read_overflow();
        /* Run-time refusal of a read past the known bound. */
        if (size > avail)
                fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, avail, size, NULL);
        return __real_memchr_inv(p, c, size);
}

/*
 * Fortified kmemdup(): the source buffer is read, so the requested length is
 * bounded by __struct_size() of @p before the real duplication is performed.
 * The fallback expression handed to fortify_panic() is a zero-length
 * duplication of @p.
 */
extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof)
                                                                    __realloc_size(2);
__FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp)
{
        const size_t src_avail = __struct_size(p);

        /* Build-time diagnosis when the overrun is provable by the compiler. */
        if (__compiletime_lessthan(src_avail, size))
                __read_overflow();
        /* Run-time refusal of a read past the known bound. */
        if (size > src_avail)
                fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, src_avail, size,
                              __real_kmemdup(p, 0, gfp));
        return __real_kmemdup(p, size, gfp);
}
/* Public entry point: routes the call through alloc_hooks(). */
#define kmemdup(...)        alloc_hooks(kmemdup_noprof(__VA_ARGS__))

/**
 * strcpy - Copy a NUL-terminated string into a destination buffer
 *
 * @p: destination buffer
 * @q: NUL-terminated source string
 *
 * Avoid this function. FORTIFY_SOURCE can only catch an overflow here when
 * the compiler knows the sizes of @p and @q. strscpy() is the preferred
 * replacement; be aware that its return values differ, because they are
 * used to report truncation.
 *
 * Returns @p.
 */
/* Placed after the fortified strlen() so that it can be reused here. */
__FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2)
char *strcpy(char * const POS p, const char * const POS q)
{
        const size_t dst_avail = __member_size(p);
        const size_t src_avail = __member_size(q);
        size_t need;

        /* With both sizes unknown there is nothing to check against. */
        if (__builtin_constant_p(dst_avail) &&
            __builtin_constant_p(src_avail) &&
            dst_avail == SIZE_MAX && src_avail == SIZE_MAX)
                return __underlying_strcpy(p, q);
        /* Bytes to copy, terminating NUL included. */
        need = strlen(q) + 1;
        /* Build-time diagnosis for a provable destination overflow. */
        if (__compiletime_lessthan(dst_avail, need))
                __write_overflow();
        /* Run-time refusal when the string does not fit. */
        if (need > dst_avail)
                fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, dst_avail, need, p);
        /* Length is already known, so a plain byte copy suffices. */
        __underlying_memcpy(p, q, need);
        return p;
}

/* Don't use these outside the FORTIFY_SOURCE implementation */
#undef __underlying_memchr
#undef __underlying_memcmp
#undef __underlying_strcat
#undef __underlying_strcpy
#undef __underlying_strlen
#undef __underlying_strncat
#undef __underlying_strncpy

#undef POS
#undef POS0

#endif /* _LINUX_FORTIFY_STRING_H_ */








































































































































































































































































































































































































































































































































  320 

























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_H
#define _ASM_X86_UACCESS_H
/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/instrumented.h>
#include <linux/kasan-checks.h>
#include <linux/mm_types.h>
#include <linux/string.h>
#include <linux/mmap_lock.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_X86_32
# include <asm/uaccess_32.h>
#else
# include <asm/uaccess_64.h>
#endif

#include <asm-generic/access_ok.h>

/*
 * Out-of-line get_user helpers. In this header they are reached through the
 * "call __<fn>_<size>" inline asm in do_get_user_call(), not through these
 * C prototypes: the asm constraints there pass the user pointer in the AX
 * register, receive the error code in AX and the loaded value in the DX
 * register. The numeric suffix is sizeof(*(ptr)) in bytes.
 *
 * __get_user_bad() is what __get_user_size() calls for an unsupported size.
 */
extern int __get_user_1(void);
extern int __get_user_2(void);
extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_nocheck_1(void);
extern int __get_user_nocheck_2(void);
extern int __get_user_nocheck_4(void);
extern int __get_user_nocheck_8(void);
extern int __get_user_bad(void);

/*
 * Bracket a region of direct user-memory accesses.
 *
 * __uaccess_begin() and __uaccess_end() are plain aliases for stac() and
 * clac(). __uaccess_begin_nospec() additionally issues barrier_nospec()
 * after stac().
 * NOTE(review): stac()/clac() presumably open/close the SMAP access window
 * (they come from <asm/smap.h>) -- confirm there.
 */
#define __uaccess_begin() stac()
#define __uaccess_end()   clac()
#define __uaccess_begin_nospec()        \
({                                        \
        stac();                                \
        barrier_nospec();                \
})

/*
 * This is the smallest unsigned integer type that can fit a value
 * (up to 'long long')
 *
 * The nested __typefits() calls try char, short, int and long in that
 * order and fall back to the type of 0ULL (unsigned long long) when none
 * of them is large enough; __typeof__ then turns the chosen expression
 * into a type.
 */
#define __inttype(x) __typeof__(                \
        __typefits(x,char,                        \
          __typefits(x,short,                        \
            __typefits(x,int,                        \
              __typefits(x,long,0ULL)))))

/*
 * Yields (unsigned type)0 when sizeof(x) <= sizeof(type), otherwise the
 * expression 'not'. The selection is made by __builtin_choose_expr(), so
 * the result keeps the type of whichever expression is chosen.
 */
#define __typefits(x,type,not) \
        __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not)

/*
 * This is used for both get_user() and __get_user() to expand to
 * the proper special function call that has odd calling conventions
 * due to returning both a value and an error, and that depends on
 * the size of the pointer passed in.
 *
 * Careful: we have to cast the result to the type of the pointer
 * for sign reasons.
 *
 * The use of _ASM_DX as the register specifier is a bit of a
 * simplification, as gcc only cares about it as the starting point
 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
 * (%ecx being the next register in gcc's x86 register sequence), and
 * %rdx on 64 bits.
 *
 * Clang/LLVM cares about the size of the register, but still wants
 * the base register for something that ends up being a pair.
 *
 * Arguments:
 *   fn  - helper family name; the asm calls "__<fn>_<sizeof(*(ptr))>"
 *   x   - lvalue that receives the loaded value
 *   ptr - user pointer; its pointee type selects size and result type
 *
 * The pointer goes in through the AX register (tied to output operand 0)
 * and the helper's status comes back in the same register as __ret_gu.
 * (x) is assigned from __val_gu unconditionally, whatever the status is.
 * The value of the statement expression is __ret_gu, annotated as
 * expected to be 0.
 */
#define do_get_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_gu;                                                        \
        register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);                \
        __chk_user_ptr(ptr);                                                \
        asm volatile("call __" #fn "_%c[size]"                                \
                     : "=a" (__ret_gu), "=r" (__val_gu),                \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (ptr), [size] "i" (sizeof(*(ptr))));                \
        instrument_get_user(__val_gu);                                        \
        (x) = (__force __typeof__(*(ptr))) __val_gu;                        \
        __builtin_expect(__ret_gu, 0);                                        \
})

/**
 * get_user - Get a simple variable from user space.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Implemented as might_fault() followed by do_get_user_call() with the
 * "get_user" helper family, i.e. a call to __get_user_<size>.
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })

/**
 * __get_user - Get a simple variable from user space, with less checking.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Implemented as do_get_user_call() with the "get_user_nocheck" helper
 * family, i.e. a call to __get_user_nocheck_<size>. Unlike get_user(),
 * the expansion contains no might_fault() annotation.
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr)


/*
 * __put_user_goto_u64 - store a 64-bit value to user memory, asm-goto style.
 *
 * 32-bit build: the value is supplied through the "A" constraint and written
 * as two 32-bit stores, %eax to offset 0 and %edx to offset 4 of the target
 * address. Each store has its own _ASM_EXTABLE_UA() entry pointing at %l2,
 * which is the asm-goto label operand (operands 0 and 1 are x and addr).
 *
 * 64-bit build: a single store delegated to __put_user_goto() with the "q"
 * instruction suffix and the "er" operand constraint.
 */
#ifdef CONFIG_X86_32
#define __put_user_goto_u64(x, addr, label)                        \
        asm goto("\n"                                        \
                     "1:        movl %%eax,0(%1)\n"                \
                     "2:        movl %%edx,4(%1)\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                        \
                     _ASM_EXTABLE_UA(2b, %l2)                        \
                     : : "A" (x), "r" (addr)                        \
                     : : label)

#else
#define __put_user_goto_u64(x, ptr, label) \
        __put_user_goto(x, ptr, "q", "er", label)
#endif

/* Called by __put_user_size() when the access size is not 1, 2, 4 or 8. */
extern void __put_user_bad(void);

/*
 * Strange magic calling convention: pointer in %ecx,
 * value in %eax(:%edx), return value in %ecx. clobbers %rbx
 *
 * In this header these helpers are reached through the
 * "call __<fn>_<size>" inline asm in do_put_user_call(), whose operand
 * constraints implement the convention above; the C prototypes below do
 * not describe it.
 */
extern void __put_user_1(void);
extern void __put_user_2(void);
extern void __put_user_4(void);
extern void __put_user_8(void);
extern void __put_user_nocheck_1(void);
extern void __put_user_nocheck_2(void);
extern void __put_user_nocheck_4(void);
extern void __put_user_nocheck_8(void);

/*
 * do_put_user_call - common expansion for put_user() and __put_user().
 *
 * Arguments:
 *   fn  - helper family name; the asm calls "__<fn>_<sizeof(*(ptr))>"
 *   x   - value to store; evaluated once into __x
 *   ptr - user pointer; evaluated once into __ptr
 *
 * The pointer is passed in the CX register (tied to output operand 0, which
 * also returns the status as __ret_pu), the value in the AX register, and
 * "ebx" is declared clobbered. instrument_put_user() is invoked after the
 * call. The value of the statement expression is __ret_pu, annotated as
 * expected to be 0.
 *
 * ptr must be evaluated and assigned to the temporary __ptr_pu before
 * the assignment of x to __val_pu, to avoid any function calls
 * involved in the ptr expression (possibly implicitly generated due
 * to KASAN) from clobbering %ax.
 */
#define do_put_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_pu;                                                        \
        void __user *__ptr_pu;                                                \
        register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX);                \
        __typeof__(*(ptr)) __x = (x); /* eval x once */                        \
        __typeof__(ptr) __ptr = (ptr); /* eval ptr once */                \
        __chk_user_ptr(__ptr);                                                \
        __ptr_pu = __ptr;                                                \
        __val_pu = __x;                                                        \
        asm volatile("call __" #fn "_%c[size]"                                \
                     : "=c" (__ret_pu),                                        \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (__ptr_pu),                                        \
                       "r" (__val_pu),                                        \
                       [size] "i" (sizeof(*(ptr)))                        \
                     :"ebx");                                                \
        instrument_put_user(__x, __ptr, sizeof(*(ptr)));                \
        __builtin_expect(__ret_pu, 0);                                        \
})

/**
 * put_user - Write a simple value into user space.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Implemented as might_fault() followed by do_put_user_call() with the
 * "put_user" helper family, i.e. a call to __put_user_<size>.
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })

/**
 * __put_user - Write a simple value into user space, with less checking.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Implemented as do_put_user_call() with the "put_user_nocheck" helper
 * family, i.e. a call to __put_user_nocheck_<size>. Unlike put_user(),
 * the expansion contains no might_fault() annotation.
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr)

/*
 * __put_user_size - store a value of a given byte size to user memory.
 *
 * Arguments:
 *   x     - value to store; evaluated once into __x
 *   ptr   - user pointer; evaluated once into __ptr
 *   size  - access size in bytes; 1, 2 and 4 map to __put_user_goto() with
 *           the "b", "w" and "l" suffixes, 8 maps to __put_user_goto_u64(),
 *           anything else calls __put_user_bad()
 *   label - handed to the store helpers as their asm-goto target
 *
 * instrument_put_user() is invoked after the switch, i.e. only on the path
 * where the store did not branch away to @label.
 */
#define __put_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __typeof__(*(ptr)) __x = (x); /* eval x once */                        \
        __typeof__(ptr) __ptr = (ptr); /* eval ptr once */                \
        __chk_user_ptr(__ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __put_user_goto(__x, __ptr, "b", "iq", label);                \
                break;                                                        \
        case 2:                                                                \
                __put_user_goto(__x, __ptr, "w", "ir", label);                \
                break;                                                        \
        case 4:                                                                \
                __put_user_goto(__x, __ptr, "l", "ir", label);                \
                break;                                                        \
        case 8:                                                                \
                __put_user_goto_u64(__x, __ptr, label);                        \
                break;                                                        \
        default:                                                        \
                __put_user_bad();                                        \
        }                                                                \
        instrument_put_user(__x, __ptr, size);                                \
} while (0)

#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * __get_user_asm_u64 (asm-goto-output variant) - load a 64-bit value from
 * user memory, branching to @label through __get_user_asm().
 *
 * 32-bit build: @ptr is viewed as a pointer to unsigned int and read as two
 * 32-bit loads (element 0 is the low half, element 1 the high half), which
 * are then combined into (x).
 *
 * 64-bit build: a single load with the "q" suffix into an "=r" output.
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, label) do {                                \
        unsigned int __gu_low, __gu_high;                                \
        const unsigned int __user *__gu_ptr;                                \
        __gu_ptr = (const void __user *)(ptr);                                \
        __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label);                \
        __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label);        \
        (x) = ((unsigned long long)__gu_high << 32) | __gu_low;                \
} while (0)
#else
#define __get_user_asm_u64(x, ptr, label)                                \
        __get_user_asm(x, ptr, "q", "=r", label)
#endif

/*
 * __get_user_size (asm-goto-output variant) - load a value of a given byte
 * size from user memory into (x).
 *
 * Arguments:
 *   x     - lvalue that receives the value
 *   ptr   - user pointer
 *   size  - access size in bytes; 1, 2 and 4 use __get_user_asm() with the
 *           "b", "w" and "l" suffixes, 8 uses __get_user_asm_u64(), anything
 *           else assigns the result of __get_user_bad()
 *   label - handed to the load helpers as their asm-goto target
 *
 * The one-byte case loads into a local unsigned char (with the "=q"
 * constraint) and then assigns that to (x). instrument_get_user() is
 * invoked after the switch.
 */
#define __get_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:        {                                                        \
                unsigned char x_u8__;                                        \
                __get_user_asm(x_u8__, ptr, "b", "=q", label);                \
                (x) = x_u8__;                                                \
                break;                                                        \
        }                                                                \
        case 2:                                                                \
                __get_user_asm(x, ptr, "w", "=r", label);                \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, "l", "=r", label);                \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, label);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
        instrument_get_user(x);                                                \
} while (0)

/*
 * __get_user_asm (asm-goto-output variant) - one "mov" from user memory.
 *
 * Arguments:
 *   x     - output lvalue, bound with the constraint string @ltype
 *   addr  - user address, turned into a memory operand through __m()
 *   itype - instruction size suffix appended to "mov" ("b", "w", "l", "q")
 *   ltype - output constraint string for @x
 *   label - asm-goto target; %l2 in the exception-table entry refers to it
 *           (operand 0 is the output, operand 1 the memory input)
 */
#define __get_user_asm(x, addr, itype, ltype, label)                        \
        asm_goto_output("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                                \
                     : [output] ltype(x)                                \
                     : [umem] "m" (__m(addr))                                \
                     : : label)

#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * __get_user_asm_u64 (variant without asm-goto outputs) - load a 64-bit
 * value from user memory, reporting failure through @retval.
 *
 * 32-bit build: the low word is loaded into %eax and the high word into
 * %edx, and the pair is bound to (x) through the early-clobber "=&A"
 * output. Both loads carry an exception-table entry of type
 * EX_TYPE_EFAULT_REG | EX_FLAG_CLEAR_AX_DX that names the @retval register
 * and resumes at local label 3. @retval is both an input (tied via "0")
 * and an output, so its incoming value is kept when nothing faults.
 *
 * 64-bit build: delegates to __get_user_asm() with the "q" suffix.
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, retval)                                \
({                                                                        \
        __typeof__(ptr) __ptr = (ptr);                                        \
        asm volatile("\n"                                                \
                     "1:        movl %[lowbits],%%eax\n"                \
                     "2:        movl %[highbits],%%edx\n"                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG |        \
                                           EX_FLAG_CLEAR_AX_DX,                \
                                           %[errout])                        \
                     _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG |        \
                                           EX_FLAG_CLEAR_AX_DX,                \
                                           %[errout])                        \
                     : [errout] "=r" (retval),                                \
                       [output] "=&A"(x)                                \
                     : [lowbits] "m" (__m(__ptr)),                        \
                       [highbits] "m" __m(((u32 __user *)(__ptr)) + 1),        \
                       "0" (retval));                                        \
})

#else
#define __get_user_asm_u64(x, ptr, retval) \
         __get_user_asm(x, ptr, retval, "q")
#endif

/*
 * __get_user_size (variant without asm-goto outputs) - load a value of a
 * given byte size from user memory into (x), reporting failure in @retval.
 *
 * Arguments:
 *   x      - lvalue that receives the value
 *   ptr    - user pointer
 *   size   - access size in bytes; 1, 2 and 4 use __get_user_asm() with the
 *            "b", "w" and "l" suffixes, 8 uses __get_user_asm_u64(),
 *            anything else assigns the result of __get_user_bad()
 *   retval - lvalue set to 0 up front and then passed to the load helper as
 *            its error output
 *
 * The one-byte case loads into the local x_u8__ and then assigns that to (x).
 */
#define __get_user_size(x, ptr, size, retval)                                \
do {                                                                        \
        unsigned char x_u8__;                                                \
                                                                        \
        retval = 0;                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __get_user_asm(x_u8__, ptr, retval, "b");                \
                (x) = x_u8__;                                                \
                break;                                                        \
        case 2:                                                                \
                __get_user_asm(x, ptr, retval, "w");                        \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, retval, "l");                        \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, retval);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
} while (0)

/*
 * __get_user_asm (variant without asm-goto outputs) - one "mov" from user
 * memory with the error reported in a register.
 *
 * Arguments:
 *   x     - output lvalue, bound to the AX register ("=a")
 *   addr  - user address, turned into a memory operand through __m()
 *   err   - error lvalue; both input (tied via "0") and output, so its
 *           incoming value is kept when the load does not fault
 *   itype - instruction size suffix appended to "mov" ("b", "w", "l", "q")
 *
 * The load carries an exception-table entry of type
 * EX_TYPE_EFAULT_REG | EX_FLAG_CLEAR_AX that names the @err register and
 * resumes at local label 2.
 */
#define __get_user_asm(x, addr, err, itype)                                \
        asm volatile("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
                                           EX_FLAG_CLEAR_AX,                \
                                           %[errout])                        \
                     : [errout] "=r" (err),                                \
                       [output] "=a" (x)                                \
                     : [umem] "m" (__m(addr)),                                \
                       "0" (err))

#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * __try_cmpxchg_user_asm (tied asm-goto output variant) - LOCK-prefixed
 * compare-and-exchange on a user address.
 *
 * Arguments:
 *   itype - instruction size suffix appended to "cmpxchg"
 *   ltype - constraint string for the new value
 *   _ptr  - user pointer to the target location
 *   _pold - pointer to the expected value; cast to the type of @_ptr
 *   _new  - replacement value
 *   label - asm-goto target named in the exception-table entry
 *
 * The expected value is loaded from *_pold into the AX register ("+a").
 * "success" is taken from the zero flag ("=@ccz"). When the exchange did
 * not succeed, the value left in AX is written back through _pold. The
 * statement expression evaluates to likely(success).
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_goto_output("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : "=@ccz" (success),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

/*
 * __try_cmpxchg64_user_asm (32-bit, tied asm-goto output variant) -
 * LOCK-prefixed cmpxchg8b on a user address.
 *
 * The expected value travels in the register pair selected by "+A"; the
 * new value is split into its low 32 bits ("b" constraint) and high 32 bits
 * ("c" constraint). "success" is taken from the zero flag ("=@ccz"). When
 * the exchange did not succeed, the value left in the "+A" pair is written
 * back through _pold. @label is the asm-goto target named in the
 * exception-table entry. The statement expression evaluates to
 * likely(success).
 */
#ifdef CONFIG_X86_32
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_goto_output("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : "=@ccz" (success),                                \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })
#endif // CONFIG_X86_32
#else  // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * Variant of __try_cmpxchg_user_asm() used when
 * CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT is not defined: a plain asm statement
 * is used instead of asm goto.
 *
 * @itype: instruction suffix pasted directly after "cmpxchg"
 * @ltype: constraint string for the new-value operand
 * @_ptr:  pointer to the location to update
 * @_pold: pointer to the expected value; overwritten with the observed value
 *         when the comparison fails
 * @_new:  replacement value
 * @label: jumped to when the access reported an error
 *
 * __err starts at 0 and is passed to the exception-table entry as %[errout];
 * a non-zero __err after the asm sends control to @label.  Otherwise the
 * macro evaluates to likely(success), where success is the zero flag.
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        int __err = 0;                                                        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,        \
                                           %[errout])                        \
                     : "=@ccz" (success),                                \
                       [errout] "+r" (__err),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory");                                        \
        if (unlikely(__err))                                                \
                goto label;                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

#ifdef CONFIG_X86_32
/*
 * Unlike the normal CMPXCHG, use output GPR for both success/fail and error.
 * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
 * hardcoded by CMPXCHG8B, leaving only ESI and EDI.  If the compiler uses
 * both ESI and EDI for the memory operand, compilation will fail if the error
 * is an input+output as there will be no register available for input.
 *
 * Encoding of __result after the asm:
 *   - the "mov $0" / "setz" pair leaves 1 when the exchange happened and 0
 *     when the comparison failed;
 *   - a negative value is treated as a fault and jumps to @label.
 *     NOTE(review): the negative value is presumably written by the
 *     EX_TYPE_EFAULT_REG fixup, which resumes at 2: -- confirm against the
 *     extable handler.
 *
 * On comparison failure the observed value is written back through @_pold.
 */
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        int __result;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     "mov $0, %[result]\n\t"                                \
                     "setz %b[result]\n"                                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,        \
                                           %[result])                        \
                     : [result] "=q" (__result),                        \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory", "cc");                                        \
        if (unlikely(__result < 0))                                        \
                goto label;                                                \
        if (unlikely(!__result))                                        \
                *_old = __old;                                                \
        likely(__result);                                        })
#endif // CONFIG_X86_32
#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT

/* FIXME: this hack is definitely wrong -AK */
/*
 * __m(x) casts the address x to a pointer to a dummy structure of 100
 * unsigned longs and dereferences it, producing an lvalue that inline asm
 * can take as an "m" operand (see __put_user_goto()).
 * NOTE(review): the oversized dummy type presumably makes the compiler treat
 * a wide region at x as referenced -- confirm.
 */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))

/*
 * Emit a single "mov" that stores @x to the user address @addr.
 *
 * @x:     value to store (operand %0)
 * @addr:  destination address, wrapped with __m() to form operand %1
 * @itype: suffix pasted after "mov"
 * @ltype: constraint string for @x
 * @label: exception-table target for a fault on the store at 1:
 *
 * Tell gcc we read from memory instead of writing: this is because
 * we do not write to any memory gcc knows about, so there are no
 * aliasing issues.  Both operands are therefore listed as inputs.
 */
#define __put_user_goto(x, addr, itype, ltype, label)                        \
        asm goto("\n"                                                        \
                "1:        mov"itype" %0,%1\n"                                \
                _ASM_EXTABLE_UA(1b, %l2)                                \
                : : ltype(x), "m" (__m(addr))                                \
                : : label)

extern unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);

extern __must_check long strnlen_user(const char __user *str, long n);

#ifdef CONFIG_ARCH_HAS_COPY_MC
unsigned long __must_check
copy_mc_to_kernel(void *to, const void *from, unsigned len);
#define copy_mc_to_kernel copy_mc_to_kernel

unsigned long __must_check
copy_mc_to_user(void __user *to, const void *from, unsigned len);
#endif

/*
 * movsl can be slow when source and dest are not both 8-byte aligned
 */
#ifdef CONFIG_X86_INTEL_USERCOPY
extern struct movsl_mask {
        int mask;
} ____cacheline_aligned_in_smp movsl_mask;
#endif

#define ARCH_HAS_NOCACHE_UACCESS 1

/*
 * The "unsafe" user accesses aren't really "unsafe", but the naming
 * is a big fat warning: you have to not only do the access_ok()
 * checking before using them, but you have to surround them with the
 * user_access_begin/end() pair.
 */
/*
 * Validate [ptr, ptr + len) with access_ok() and, only if that succeeds,
 * open the user-access section via __uaccess_begin_nospec().
 *
 * Returns true when the section was opened, false when the range check
 * failed (in which case nothing was opened).
 */
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
{
        const bool range_ok = access_ok(ptr, len);

        /* A failed range check is the unexpected path. */
        if (unlikely(!range_ok))
                return range_ok;

        __uaccess_begin_nospec();
        return range_ok;
}
/*
 * Self-referential define.
 * NOTE(review): presumably lets other code detect this implementation with
 * #ifdef user_access_begin -- confirm.
 */
#define user_access_begin(a,b)        user_access_begin(a,b)
/* Counterpart of user_access_begin(): expands to __uaccess_end(). */
#define user_access_end()        __uaccess_end()

/* Save/restore wrappers that map directly onto smap_save()/smap_restore(). */
#define user_access_save()        smap_save()
#define user_access_restore(x)        smap_restore(x)

/*
 * Store x, cast to the pointee type of ptr, through ptr using
 * __put_user_size(); the store size is sizeof(*(ptr)) and label is passed
 * through as the final argument.
 */
#define unsafe_put_user(x, ptr, label)        \
        __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)

/*
 * unsafe_get_user(x, ptr, err_label): load *(ptr) into x via
 * __get_user_size(), going through a temporary of type __inttype(*(ptr)).
 *
 * With CONFIG_CC_HAS_ASM_GOTO_OUTPUT, err_label is handed straight to
 * __get_user_size().  Without it, an int error variable is passed instead
 * and the macro jumps to err_label when it is non-zero; note that in this
 * variant x is assigned before the error check.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
} while (0)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        int __gu_err;                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
        if (unlikely(__gu_err)) goto err_label;                                        \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * Called from the default: case of unsafe_try_cmpxchg_user().
 * NOTE(review): presumably declared but never defined, so an unsupported
 * operand size is caught at link time -- confirm.
 */
extern void __try_cmpxchg_user_wrong_size(void);

/*
 * Outside CONFIG_X86_32 the 8-byte case is the generic helper with the "q"
 * instruction suffix and an "r" constraint for the new value.
 */
#ifndef CONFIG_X86_32
#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label)                \
        __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
#endif

/*
 * Size-dispatching compare-and-exchange on a user pointer.
 *
 * @_ptr:   user pointer; sizeof(*(_ptr)) selects the 1/2/4/8-byte helper
 * @_oldp:  pointer to the expected value, forwarded to the helper
 * @_nval:  replacement value
 * @_label: forwarded to the helper as its fault/error target
 *
 * Evaluates to the helper's bool result.  Any other operand size reaches
 * __try_cmpxchg_user_wrong_size() and leaves __ret unassigned.
 *
 * Force the pointer to u<size> to match the size expected by the asm helper.
 * clang/LLVM compiles all cases and only discards the unused paths after
 * processing errors, which breaks i386 if the pointer is an 8-byte value.
 */
#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({                        \
        bool __ret;                                                                \
        __chk_user_ptr(_ptr);                                                        \
        switch (sizeof(*(_ptr))) {                                                \
        case 1:        __ret = __try_cmpxchg_user_asm("b", "q",                        \
                                               (__force u8 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 2:        __ret = __try_cmpxchg_user_asm("w", "r",                        \
                                               (__force u16 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 4:        __ret = __try_cmpxchg_user_asm("l", "r",                        \
                                               (__force u32 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 8:        __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
                                                 (_nval), _label);                \
                break;                                                                \
        default: __try_cmpxchg_user_wrong_size();                                \
        }                                                                        \
        __ret;                                                })

/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label)        ({                \
        int __ret = -EFAULT;                                                \
        __uaccess_begin_nospec();                                        \
        __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label);        \
_label:                                                                        \
        __uaccess_end();                                                \
        __ret;                                                                \
                                                        })

/*
 * We want the unsafe accessors to always be inlined and use
 * the error labels - thus the macro games.
 *
 * unsafe_copy_loop() stores `type`-sized chunks with unsafe_put_user() for
 * as long as at least sizeof(type) bytes remain.  It advances dst and src
 * and decrements len in place, so all three must be modifiable lvalues.
 */
#define unsafe_copy_loop(dst, src, len, type, label)                                \
        while (len >= sizeof(type)) {                                                \
                unsafe_put_user(*(type *)(src),(type __user *)(dst),label);        \
                dst += sizeof(type);                                                \
                src += sizeof(type);                                                \
                len -= sizeof(type);                                                \
        }

/*
 * Copy _len bytes from _src to the user buffer _dst.  The arguments are
 * captured once in local variables, then copied in u64 chunks first and the
 * remainder in u32, u16 and finally u8 chunks.  label is handed to every
 * unsafe_put_user() call.
 */
#define unsafe_copy_to_user(_dst,_src,_len,label)                        \
do {                                                                        \
        char __user *__ucu_dst = (_dst);                                \
        const char *__ucu_src = (_src);                                        \
        size_t __ucu_len = (_len);                                        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);        \
} while (0)

/*
 * __get_kernel_nofault(dst, src, type, err_label): read one `type` from src
 * into *dst by reusing __get_user_size(), with src force-cast to a __user
 * pointer.  As with unsafe_get_user(), the asm-goto-output build passes
 * err_label directly, while the fallback collects an int error code and
 * jumps to err_label when it is non-zero.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label)                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), err_label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label)                        \
do {                                                                        \
        int __kr_err;                                                        \
                                                                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), __kr_err);                        \
        if (unlikely(__kr_err))                                                \
                goto err_label;                                                \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * Write one `type` read from *src to dst by reusing __put_user_size(), with
 * dst force-cast to a __user pointer; err_label is passed through.
 */
#define __put_kernel_nofault(dst, src, type, err_label)                        \
        __put_user_size(*((type *)(src)), (__force type __user *)(dst),        \
                        sizeof(type), err_label)

#endif /* _ASM_X86_UACCESS_H */


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UTS_NAMESPACE_H
#define _LINUX_UTS_NAMESPACE_H

#include <linux/ns_common.h>
#include <uapi/linux/utsname.h>

struct user_namespace;
extern struct user_namespace init_user_ns;

/*
 * One UTS namespace instance.  The ns member is the embedded common
 * namespace part; to_uts_ns() converts a pointer to it back into the
 * containing structure.
 */
struct uts_namespace {
        struct new_utsname name;        /* the utsname data of this namespace */
        struct user_namespace *user_ns; /* NOTE(review): presumably the owning user namespace -- confirm */
        struct ucounts *ucounts;        /* NOTE(review): presumably accounting for namespace creation -- confirm */
        struct ns_common ns;            /* embedded common namespace part */
} __randomize_layout;

extern struct uts_namespace init_uts_ns;

#ifdef CONFIG_UTS_NS
/* Map an embedded ns_common back to the uts_namespace that contains it. */
static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
{
        struct uts_namespace *uts_ns;

        uts_ns = container_of(ns, struct uts_namespace, ns);
        return uts_ns;
}

/* Take a reference on @ns via ns_ref_inc(). */
static inline void get_uts_ns(struct uts_namespace *ns)
{
        ns_ref_inc(ns);
}

extern struct uts_namespace *copy_utsname(u64 flags,
        struct user_namespace *user_ns, struct uts_namespace *old_ns);
extern void free_uts_ns(struct uts_namespace *ns);

/*
 * Drop a reference on @ns.  When ns_ref_put() reports a non-zero result the
 * namespace is handed to free_uts_ns().
 */
static inline void put_uts_ns(struct uts_namespace *ns)
{
        if (!ns_ref_put(ns))
                return;

        free_uts_ns(ns);
}

void uts_ns_init(void);
#else
/* !CONFIG_UTS_NS: taking a reference is a no-op. */
static inline void get_uts_ns(struct uts_namespace *ns)
{
}

/* !CONFIG_UTS_NS: dropping a reference is a no-op. */
static inline void put_uts_ns(struct uts_namespace *ns)
{
}

/*
 * !CONFIG_UTS_NS stub: a request carrying CLONE_NEWUTS is rejected with
 * ERR_PTR(-EINVAL); otherwise @old_ns is returned unchanged.  @user_ns is
 * not used.
 */
static inline struct uts_namespace *copy_utsname(u64 flags,
        struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
        if (!(flags & CLONE_NEWUTS))
                return old_ns;

        return ERR_PTR(-EINVAL);
}

/* !CONFIG_UTS_NS: nothing to initialise. */
static inline void uts_ns_init(void)
{
}
#endif

#endif /* _LINUX_UTS_NAMESPACE_H */



















































































































































































































































































































































































































































    4 

















    4 


    4 




    4 




    4 



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_DEFS_H
#define _ASM_X86_PGTABLE_DEFS_H

#include <linux/const.h>
#include <linux/mem_encrypt.h>

#include <asm/page_types.h>

/*
 * Bit positions of the individual page-table entry flags.  The _PAGE_<name>
 * masks are built by shifting 1 left by these values.  Bit 7 has two names:
 * _PAGE_BIT_PSE and _PAGE_BIT_PAT (the latter for 4KB pages).
 */
#define _PAGE_BIT_PRESENT        0        /* is present */
#define _PAGE_BIT_RW                1        /* writeable */
#define _PAGE_BIT_USER                2        /* userspace addressable */
#define _PAGE_BIT_PWT                3        /* page write through */
#define _PAGE_BIT_PCD                4        /* page cache disabled */
#define _PAGE_BIT_ACCESSED        5        /* was accessed (raised by CPU) */
#define _PAGE_BIT_DIRTY                6        /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE                7        /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT                7        /* on 4KB pages */
#define _PAGE_BIT_GLOBAL        8        /* Global TLB entry PPro+ */
#define _PAGE_BIT_SOFTW1        9        /* available for programmer */
#define _PAGE_BIT_SOFTW2        10        /* " */
#define _PAGE_BIT_SOFTW3        11        /* " */
#define _PAGE_BIT_PAT_LARGE        12        /* On 2MB or 1GB pages */
#define _PAGE_BIT_SOFTW4        57        /* available for programmer */
#define _PAGE_BIT_SOFTW5        58        /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0        59        /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1        60        /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2        61        /* Protection Keys, bit 3/4 */
#define _PAGE_BIT_PKEY_BIT3        62        /* Protection Keys, bit 4/4 */
#define _PAGE_BIT_NX                63        /* No execute: only valid after cpuid check */

/*
 * Software-defined flags mapped onto the _PAGE_BIT_SOFTWn positions.
 * Several names share a position: SPECIAL and CPA_TEST both use SOFTW1,
 * SOFT_DIRTY and KERNEL_4K both use SOFTW3.
 */
#define _PAGE_BIT_SPECIAL        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_UFFD_WP        _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY        _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_KERNEL_4K        _PAGE_BIT_SOFTW3 /* page must not be converted to large */

/*
 * SAVED_DIRTY and NOPTISHADOW share one position: SOFTW5 on 64-bit builds,
 * SOFTW2 otherwise.
 */
#ifdef CONFIG_X86_64
#define _PAGE_BIT_SAVED_DIRTY        _PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */
#define _PAGE_BIT_NOPTISHADOW        _PAGE_BIT_SOFTW5 /* No PTI shadow (root PGD) */
#else
/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */
#define _PAGE_BIT_SAVED_DIRTY        _PAGE_BIT_SOFTW2 /* Saved Dirty bit (leaf) */
#define _PAGE_BIT_NOPTISHADOW        _PAGE_BIT_SOFTW2 /* No PTI shadow (root PGD) */
#endif

/*
 * If _PAGE_BIT_PRESENT is clear, we use these:
 * - if the user mapped it with PROT_NONE; pte_present gives true.
 * The PROT_NONE marker reuses the position of the Global bit.
 */
#define _PAGE_BIT_PROTNONE        _PAGE_BIT_GLOBAL

/* Single-bit pteval_t masks derived from the _PAGE_BIT_* positions. */
#define _PAGE_PRESENT        (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW        (_AT(pteval_t, 1) << _PAGE_BIT_RW)
#define _PAGE_USER        (_AT(pteval_t, 1) << _PAGE_BIT_USER)
#define _PAGE_PWT        (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD        (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED        (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE        (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL        (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_SOFTW3        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT        (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL        (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST        (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
#define _PAGE_KERNEL_4K        (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
/* Protection-key masks collapse to 0 when the feature is not configured. */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
#else
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 0))
#endif

/* All four protection-key bits combined. */
#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
                         _PAGE_PKEY_BIT1 | \
                         _PAGE_PKEY_BIT2 | \
                         _PAGE_PKEY_BIT3)

/*
 * Dirty|Accessed on 64-bit and PAE builds, 0 otherwise.
 * NOTE(review): the name suggests a Knights Landing erratum workaround; the
 * users of this mask are outside this header -- confirm.
 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif

/* Soft-dirty mask; 0 when CONFIG_MEM_SOFT_DIRTY is off. */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * Tracking the soft dirty bit when a page goes to swap is tricky.
 * We need a bit which can be stored in the pte _and_ does not conflict
 * with the swap entry format. On x86 bits 1-4 are *not* involved
 * in swap entry computation, but bit 7 is used for thp migration,
 * so we borrow bit 1 (_PAGE_RW) for soft dirty tracking.
 *
 * Please note that this bit must be treated as the swap dirty page
 * mark if and only if the PTE/PMD has the present bit clear!
 */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SWP_SOFT_DIRTY        _PAGE_RW
#else
#define _PAGE_SWP_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * userfaultfd write-protect masks: a software bit for present entries and
 * _PAGE_USER for the swap form.  Both are 0 when the feature is off.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define _PAGE_UFFD_WP                (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
#define _PAGE_SWP_UFFD_WP        _PAGE_USER
#else
#define _PAGE_UFFD_WP                (_AT(pteval_t, 0))
#define _PAGE_SWP_UFFD_WP        (_AT(pteval_t, 0))
#endif

/*
 * _PAGE_NX and _PAGE_SOFTW4 sit at bits 63 and 57, so they are only non-zero
 * on 64-bit and PAE builds.
 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX        (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#define _PAGE_SOFTW4        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
#else
#define _PAGE_NX        (_AT(pteval_t, 0))
#define _PAGE_SOFTW4        (_AT(pteval_t, 0))
#endif

/*
 * The hardware requires shadow stack to be Write=0,Dirty=1. However,
 * there are valid cases where the kernel might create read-only PTEs that
 * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In
 * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit,
 * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have
 * (Write=0,SavedDirty=1,Dirty=0) set.
 */
#define _PAGE_SAVED_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY)

#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY)

#define _PAGE_PROTNONE        (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)

#define _PAGE_NOPTISHADOW (_AT(pteval_t, 1) << _PAGE_BIT_NOPTISHADOW)

/*
 * Set of bits not changed in pte_modify.  The pte's
 * protection key is treated like _PAGE_RW, for
 * instance, and is *not* included in this mask since
 * pte_modify() does modify it.
 *
 * _PAGE_CHG_MASK adds _PAGE_PAT to the common set; _HPAGE_CHG_MASK adds
 * _PAGE_PSE and _PAGE_PAT_LARGE instead.
 */
#define _COMMON_PAGE_CHG_MASK        (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |        \
                                 _PAGE_SPECIAL | _PAGE_ACCESSED |        \
                                 _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY |        \
                                 _PAGE_CC | _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK        (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)

/*
 * The cache modes defined here are used to translate between pure SW usage
 * and the HW defined cache mode bits and/or PAT entries.
 *
 * The resulting bits for PWT, PCD and PAT should be chosen in a way
 * to have the WB mode at index 0 (all bits clear). This is the default
 * right now and likely would break too much if changed.
 *
 * Six modes are named (values 0-5); _PAGE_CACHE_MODE_NUM is 8, so values
 * 6 and 7 have no enumerator.
 */
#ifndef __ASSEMBLER__
enum page_cache_mode {
        _PAGE_CACHE_MODE_WB       = 0,
        _PAGE_CACHE_MODE_WC       = 1,
        _PAGE_CACHE_MODE_UC_MINUS = 2,
        _PAGE_CACHE_MODE_UC       = 3,
        _PAGE_CACHE_MODE_WT       = 4,
        _PAGE_CACHE_MODE_WP       = 5,

        _PAGE_CACHE_MODE_NUM      = 8
};
#endif

/* Masks taken from cc_get_mask() and sme_me_mask respectively. */
#define _PAGE_CC                (_AT(pteval_t, cc_get_mask()))
#define _PAGE_ENC                (_AT(pteval_t, sme_me_mask))

/* Cache-attribute bits for regular and large pages. */
#define _PAGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
#define _PAGE_LARGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)

/* Protection bits for the UC and WP cache modes, via cachemode2protval(). */
#define _PAGE_NOCACHE                (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP                (cachemode2protval(_PAGE_CACHE_MODE_WP))

/* Four-character aliases used to write the protection tables in columns. */
#define __PP _PAGE_PRESENT
#define __RW _PAGE_RW
#define _USR _PAGE_USER
#define ___A _PAGE_ACCESSED
#define ___D _PAGE_DIRTY
#define ___G _PAGE_GLOBAL
#define __NX _PAGE_NX

#define _ENC _PAGE_ENC
#define __WP _PAGE_CACHE_WP
#define __NC _PAGE_NOCACHE
#define _PSE _PAGE_PSE

/* pgprot_t accessors: read the raw value / wrap a raw value. */
#define pgprot_val(x)                ((x).pgprot)
#define __pgprot(x)                ((pgprot_t) { (x) } )
#define __pg(x)                        __pgprot(x)

/*
 * User-mapping protections.  PAGE_NONE has the present bit clear and sets
 * ___G, which is the same bit as _PAGE_PROTNONE.  PAGE_COPY, PAGE_COPY_NOEXEC
 * and PAGE_READONLY expand to the same value.
 */
#define PAGE_NONE             __pg(   0|   0|   0|___A|   0|   0|   0|___G)
#define PAGE_SHARED             __pg(__PP|__RW|_USR|___A|__NX|   0|   0|   0)
#define PAGE_SHARED_EXEC     __pg(__PP|__RW|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY_NOEXEC     __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_COPY_EXEC             __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY             __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY             __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY_EXEC   __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)

/*
 * Page tables need to have Write=1 in order for any lower PTEs to be
 * writable. This includes shadow stack memory (Write=0, Dirty=1).
 *
 * The _NOENC variants omit _ENC; the _PAGE_TABLE variants add _USR.
 */
#define _KERNPG_TABLE_NOENC         (__PP|__RW|   0|___A|   0|___D|   0|   0)
#define _KERNPG_TABLE                 (__PP|__RW|   0|___A|   0|___D|   0|   0| _ENC)
#define _PAGE_TABLE_NOENC         (__PP|__RW|_USR|___A|   0|___D|   0|   0)
#define _PAGE_TABLE                 (__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)

/* Raw kernel-mapping flag sets (plain values, not wrapped in pgprot_t). */
#define __PAGE_KERNEL_RO         (__PP|   0|   0|___A|__NX|   0|   0|___G)
#define __PAGE_KERNEL_ROX         (__PP|   0|   0|___A|   0|   0|   0|___G)
#define __PAGE_KERNEL                 (__PP|__RW|   0|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_EXEC         (__PP|__RW|   0|___A|   0|___D|   0|___G)
#define __PAGE_KERNEL_NOCACHE         (__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
#define __PAGE_KERNEL_VVAR         (__PP|   0|_USR|___A|__NX|   0|   0|___G)
#define __PAGE_KERNEL_LARGE         (__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW|   0|___A|   0|___D|_PSE|___G)
#define __PAGE_KERNEL_WP         (__PP|__RW|   0|___A|__NX|___D|   0|___G| __WP)


/* The I/O flag sets are plain aliases of the regular kernel ones. */
#define __PAGE_KERNEL_IO                __PAGE_KERNEL
#define __PAGE_KERNEL_IO_NOCACHE        __PAGE_KERNEL_NOCACHE


#ifndef __ASSEMBLER__

/* Raw kernel flag sets with and without the encryption mask. */
#define __PAGE_KERNEL_ENC        (__PAGE_KERNEL    | _ENC)
#define __PAGE_KERNEL_ENC_WP        (__PAGE_KERNEL_WP | _ENC)
#define __PAGE_KERNEL_NOENC        (__PAGE_KERNEL    |    0)
#define __PAGE_KERNEL_NOENC_WP        (__PAGE_KERNEL_WP |    0)

/* Wrap a raw value in pgprot_t after masking with __default_kernel_pte_mask. */
#define __pgprot_mask(x)        __pgprot((x) & __default_kernel_pte_mask)

/* pgprot_t kernel protections; the _NOENC forms leave out _ENC. */
#define PAGE_KERNEL                __pgprot_mask(__PAGE_KERNEL            | _ENC)
#define PAGE_KERNEL_NOENC        __pgprot_mask(__PAGE_KERNEL            |    0)
#define PAGE_KERNEL_RO                __pgprot_mask(__PAGE_KERNEL_RO         | _ENC)
#define PAGE_KERNEL_EXEC        __pgprot_mask(__PAGE_KERNEL_EXEC       | _ENC)
#define PAGE_KERNEL_EXEC_NOENC        __pgprot_mask(__PAGE_KERNEL_EXEC       |    0)
#define PAGE_KERNEL_ROX                __pgprot_mask(__PAGE_KERNEL_ROX        | _ENC)
#define PAGE_KERNEL_NOCACHE        __pgprot_mask(__PAGE_KERNEL_NOCACHE    | _ENC)
#define PAGE_KERNEL_LARGE        __pgprot_mask(__PAGE_KERNEL_LARGE      | _ENC)
#define PAGE_KERNEL_LARGE_EXEC        __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
#define PAGE_KERNEL_VVAR        __pgprot_mask(__PAGE_KERNEL_VVAR       | _ENC)

/* The I/O protections do not add _ENC. */
#define PAGE_KERNEL_IO                __pgprot_mask(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE        __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE)

#endif        /* __ASSEMBLER__ */

/*
 * Early identity mapping pte attribute macros.  64-bit builds reuse
 * __PAGE_KERNEL_LARGE_EXEC; other builds use literal attribute values.
 */
#ifdef CONFIG_X86_64
#define __PAGE_KERNEL_IDENT_LARGE_EXEC        __PAGE_KERNEL_LARGE_EXEC
#else
#define PTE_IDENT_ATTR         0x003                /* PRESENT+RW */
#define PDE_IDENT_ATTR         0x063                /* PRESENT+RW+DIRTY+ACCESSED */
#define PGD_IDENT_ATTR         0x001                /* PRESENT (no other attributes) */
#endif

#ifdef CONFIG_X86_32
# include <asm/pgtable_32_types.h>
#else
# include <asm/pgtable_64_types.h>
#endif

#ifndef __ASSEMBLER__

#include <linux/types.h>

/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK                ((pteval_t)PHYSICAL_PAGE_MASK)

/*
 *  Extracts the flags from a (pte|pmd|pud|pgd)val_t
 *  This includes the protection key value.
 *  Defined as the complement of PTE_PFN_MASK.
 */
#define PTE_FLAGS_MASK                (~PTE_PFN_MASK)

/* Wrapper struct for a protection value; accessed through pgprot_val(). */
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;

/* Wrapper struct for a top-level (PGD) entry value. */
typedef struct { pgdval_t pgd; } pgd_t;

/* Return a copy of @prot with _PAGE_NX additionally set. */
static inline pgprot_t pgprot_nx(pgprot_t prot)
{
        pgprot_t nx_prot = __pgprot(pgprot_val(prot) | _PAGE_NX);

        return nx_prot;
}
#define pgprot_nx pgprot_nx

/*
 * PGD_ALLOWED_BITS is ANDed into every value passing through
 * native_make_pgd() and native_pgd_val().
 */
#ifdef CONFIG_X86_PAE

/*
 * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
 * use it here.
 */

/*
 * NOTE(review): the (signed long) cast presumably sign-extends PAGE_MASK
 * when it is combined with the 64-bit physical mask below -- confirm.
 */
#define PGD_PAE_PAGE_MASK        ((signed long)PAGE_MASK)
#define PGD_PAE_PHYS_MASK        (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)

/*
 * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
 * All other bits are Reserved MBZ
 */
#define PGD_ALLOWED_BITS        (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
                                 _PAGE_PWT | _PAGE_PCD | \
                                 _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)

#else
/* No need to mask any bits for !PAE */
#define PGD_ALLOWED_BITS        (~0ULL)
#endif

/* Build a pgd_t from a raw value, keeping only the bits in PGD_ALLOWED_BITS. */
static inline pgd_t native_make_pgd(pgdval_t val)
{
        pgd_t entry = { .pgd = val & PGD_ALLOWED_BITS };

        return entry;
}

/* Raw value of a pgd_t, restricted to the bits in PGD_ALLOWED_BITS. */
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
        pgdval_t raw = pgd.pgd;

        return raw & PGD_ALLOWED_BITS;
}

/* Flag bits of a PGD entry: its native value masked with PTE_FLAGS_MASK. */
static inline pgdval_t pgd_flags(pgd_t pgd)
{
        pgdval_t raw = native_pgd_val(pgd);

        return raw & PTE_FLAGS_MASK;
}

#if CONFIG_PGTABLE_LEVELS > 4
/* Wrapper struct for a P4D entry value (more than four page-table levels). */
typedef struct { p4dval_t p4d; } p4d_t;

/*
 * Wrap a raw value in a p4d_t without any masking.
 * NOTE(review): the parameter is typed pudval_t although it initialises a
 * p4dval_t member -- confirm this is intentional.
 */
static inline p4d_t native_make_p4d(pudval_t val)
{
        p4d_t entry = { .p4d = val };

        return entry;
}

/* Return the raw value stored in @p4d (CONFIG_PGTABLE_LEVELS > 4 variant). */
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        return p4d.p4d;
}
#else
#include <asm-generic/pgtable-nop4d.h>

/*
 * Folded-p4d variant: the p4d_t only wraps a pgd_t, so build it through
 * native_make_pgd().
 */
static inline p4d_t native_make_p4d(pudval_t val)
{
        p4d_t entry;

        entry.pgd = native_make_pgd((pgdval_t)val);
        return entry;
}

/* Folded-p4d variant: read the value through the embedded pgd_t. */
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        pgd_t inner = p4d.pgd;

        return native_pgd_val(inner);
}
#endif

#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;

/*
 * Wrap @val in a pud_t (CONFIG_PGTABLE_LEVELS > 3 variant).
 * NOTE(review): the parameter is typed pmdval_t rather than pudval_t.
 */
static inline pud_t native_make_pud(pmdval_t val)
{
        pud_t entry = { .pud = val };

        return entry;
}

/* Return the raw value stored in @pud (CONFIG_PGTABLE_LEVELS > 3 variant). */
static inline pudval_t native_pud_val(pud_t pud)
{
        return pud.pud;
}
#else
#include <asm-generic/pgtable-nopud.h>

/*
 * Folded-pud variant: the value lives in the innermost pgd_t, so build it
 * through native_make_pgd().
 */
static inline pud_t native_make_pud(pudval_t val)
{
        pud_t entry;

        entry.p4d.pgd = native_make_pgd(val);
        return entry;
}

/* Folded-pud variant: read the value through the innermost pgd_t. */
static inline pudval_t native_pud_val(pud_t pud)
{
        pgd_t inner = pud.p4d.pgd;

        return native_pgd_val(inner);
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2
/* Wrap @val in a pmd_t (CONFIG_PGTABLE_LEVELS > 2 variant). */
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t entry = { .pmd = val };

        return entry;
}

/* Return the raw value stored in @pmd (CONFIG_PGTABLE_LEVELS > 2 variant). */
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        return pmd.pmd;
}
#else
#include <asm-generic/pgtable-nopmd.h>

/*
 * Folded-pmd variant: the value lives in the innermost pgd_t, so build it
 * through native_make_pgd().
 */
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t entry;

        entry.pud.p4d.pgd = native_make_pgd(val);
        return entry;
}

/* Folded-pmd variant: read the value through the innermost pgd_t. */
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pgd_t inner = pmd.pud.p4d.pgd;

        return native_pgd_val(inner);
}
#endif

/*
 * Mask selecting the PFN bits of a p4d entry. @p4d is not inspected: the
 * result is always PTE_PFN_MASK.
 */
static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
{
        /* No 512 GiB huge pages yet */
        return PTE_PFN_MASK;
}

/* Mask selecting the flag bits of @p4d: the complement of its PFN mask. */
static inline p4dval_t p4d_flags_mask(p4d_t p4d)
{
        p4dval_t pfn_bits = p4d_pfn_mask(p4d);

        return ~pfn_bits;
}

/* Return only the flag bits of @p4d. */
static inline p4dval_t p4d_flags(p4d_t p4d)
{
        p4dval_t flag_bits = p4d_flags_mask(p4d);

        return native_p4d_val(p4d) & flag_bits;
}

/*
 * Mask selecting the PFN bits of @pud: PHYSICAL_PUD_PAGE_MASK when the
 * entry has _PAGE_PSE set, PTE_PFN_MASK otherwise.
 */
static inline pudval_t pud_pfn_mask(pud_t pud)
{
        if (!(native_pud_val(pud) & _PAGE_PSE))
                return PTE_PFN_MASK;

        return PHYSICAL_PUD_PAGE_MASK;
}

/* Mask selecting the flag bits of @pud: the complement of its PFN mask. */
static inline pudval_t pud_flags_mask(pud_t pud)
{
        pudval_t pfn_bits = pud_pfn_mask(pud);

        return ~pfn_bits;
}

/* Return only the flag bits of @pud. */
static inline pudval_t pud_flags(pud_t pud)
{
        pudval_t flag_bits = pud_flags_mask(pud);

        return native_pud_val(pud) & flag_bits;
}

/*
 * Mask selecting the PFN bits of @pmd: PHYSICAL_PMD_PAGE_MASK when the
 * entry has _PAGE_PSE set, PTE_PFN_MASK otherwise.
 */
static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
{
        if (!(native_pmd_val(pmd) & _PAGE_PSE))
                return PTE_PFN_MASK;

        return PHYSICAL_PMD_PAGE_MASK;
}

/* Mask selecting the flag bits of @pmd: the complement of its PFN mask. */
static inline pmdval_t pmd_flags_mask(pmd_t pmd)
{
        pmdval_t pfn_bits = pmd_pfn_mask(pmd);

        return ~pfn_bits;
}

/* Return only the flag bits of @pmd. */
static inline pmdval_t pmd_flags(pmd_t pmd)
{
        pmdval_t flag_bits = pmd_flags_mask(pmd);

        return native_pmd_val(pmd) & flag_bits;
}

/* Wrap the raw value @val in a pte_t. */
static inline pte_t native_make_pte(pteval_t val)
{
        pte_t entry = { .pte = val };

        return entry;
}

/* Return the raw value stored in @pte. */
static inline pteval_t native_pte_val(pte_t pte)
{
        return pte.pte;
}

/* Return the bits of @pte selected by PTE_FLAGS_MASK. */
static inline pteval_t pte_flags(pte_t pte)
{
        pteval_t val = native_pte_val(pte);

        return val & PTE_FLAGS_MASK;
}

/*
 * __pte2cm_idx() packs three bits of the protection value @cb into a 3-bit
 * index: the _PAGE_BIT_PAT bit becomes bit 2, the _PAGE_BIT_PCD bit becomes
 * bit 1 and the _PAGE_BIT_PWT bit becomes bit 0.
 * __cm_idx2pte() is the inverse: it spreads the three bits of index @i back
 * to the PAT, PCD and PWT bit positions.
 */
#define __pte2cm_idx(cb)                                \
        ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) |                \
         (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) |                \
         (((cb) >> _PAGE_BIT_PWT) & 1))
#define __cm_idx2pte(i)                                        \
        ((((i) & 4) << (_PAGE_BIT_PAT - 2)) |                \
         (((i) & 2) << (_PAGE_BIT_PCD - 1)) |                \
         (((i) & 1) << _PAGE_BIT_PWT))

unsigned long cachemode2protval(enum page_cache_mode pcm);

/*
 * Convert a protection value from the 4k layout to the large-page layout:
 * the bit at _PAGE_PAT is moved up to the _PAGE_PAT_LARGE position and both
 * positions are otherwise cleared.
 */
static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
{
        pgprotval_t others = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);
        pgprotval_t pat = val & _PAGE_PAT;

        return others | (pat << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
/* pgprot_t wrapper around protval_4k_2_large(). */
static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
{
        pgprotval_t converted = protval_4k_2_large(pgprot_val(pgprot));

        return __pgprot(converted);
}
/*
 * Convert a protection value from the large-page layout to the 4k layout:
 * the bit at _PAGE_PAT_LARGE is moved down to the _PAGE_PAT position and
 * both positions are otherwise cleared.
 */
static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
{
        pgprotval_t others = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);
        pgprotval_t pat_large = val & _PAGE_PAT_LARGE;

        return others | (pat_large >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
/* pgprot_t wrapper around protval_large_2_4k(). */
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
{
        pgprotval_t converted = protval_large_2_4k(pgprot_val(pgprot));

        return __pgprot(converted);
}


typedef struct page *pgtable_t;

extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;

#define pgprot_writecombine        pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);

#define pgprot_writethrough        pgprot_writethrough
extern pgprot_t pgprot_writethrough(pgprot_t prot);

/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING

#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                              unsigned long size, pgprot_t vma_prot);

/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);

#ifdef CONFIG_X86_32
extern void native_pagetable_init(void);
#else
#define native_pagetable_init        paging_init
#endif

/*
 * Page-table level identifiers, as reported e.g. through the @level output
 * of lookup_address(). The names suggest the size mapped by one entry at
 * that level (NOTE(review): confirm against the users). PG_LEVEL_NUM is the
 * last enumerator and therefore equals the number of values before it.
 */
enum pg_level {
        PG_LEVEL_NONE,
        PG_LEVEL_4K,
        PG_LEVEL_2M,
        PG_LEVEL_1G,
        PG_LEVEL_512G,
        PG_LEVEL_256T,
        PG_LEVEL_NUM
};

/*
 * update_page_count() is only implemented when CONFIG_PROC_FS is enabled;
 * otherwise it is an empty inline stub.
 */
#ifdef CONFIG_PROC_FS
extern void update_page_count(int level, unsigned long pages);
#else
static inline void update_page_count(int level, unsigned long pages) { }
#endif

/*
 * Helper function that returns the kernel pagetable entry controlling
 * the virtual address 'address'. NULL means no pagetable entry present.
 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
 * as a pte too.
 */
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
                                    unsigned int *level);
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
                                  unsigned int *level, bool *nx, bool *rw);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
                                          unsigned long address,
                                          unsigned numpages,
                                          unsigned long page_flags);
extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
                                            unsigned long numpages);
#endif        /* !__ASSEMBLER__ */

#endif /* _ASM_X86_PGTABLE_DEFS_H */















































































    9 











    9 


    9 
    9 
    1 

    8 



    8 

    8 


    8 
    6 


    6 

    6 
    6 







    8 



    8 

    8 







    8 



    8 













    8 


    8 








    1 

    1 

    8 



















































    9 
    9 

    9 







    9 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Create default crypto algorithm instances.
 *
 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/aead.h>
#include <linux/completion.h>
#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/string.h>

#include "internal.h"

/*
 * Parameters handed to the cryptomgr_probe() thread. Filled in by
 * cryptomgr_schedule_probe() and freed by the thread when it is done.
 */
struct cryptomgr_param {
        /*
         * Attribute table passed to the template's ->create(): slot 0 points
         * at @type, the following slots at the used @attrs entries, and the
         * slot after the last one is NULL.
         */
        struct rtattr *tb[CRYPTO_MAX_ATTRS + 2];

        /* CRYPTOA_TYPE attribute carrying the requested type and mask. */
        struct {
                struct rtattr attr;
                struct crypto_attr_type data;
        } type;

        /* One CRYPTOA_ALG attribute per parsed template argument. */
        struct {
                struct rtattr attr;
                struct crypto_attr_alg data;
        } attrs[CRYPTO_MAX_ATTRS];

        /* Template name: the part of the algorithm name before the '('. */
        char template[CRYPTO_MAX_ALG_NAME];

        /* Larval this probe is for; a reference is held on its alg. */
        struct crypto_larval *larval;

        /* Copies of the larval's cra_flags and mask as originally requested. */
        u32 otype;
        u32 omask;
};

/*
 * Parameters handed to the cryptomgr_test() thread; filled in by
 * cryptomgr_schedule_test() from the algorithm being registered.
 */
struct crypto_test_param {
        char driver[CRYPTO_MAX_ALG_NAME];       /* copy of cra_driver_name */
        char alg[CRYPTO_MAX_ALG_NAME];          /* copy of cra_name */
        u32 type;                               /* copy of cra_flags */
};

static int cryptomgr_probe(void *data)
{
        struct cryptomgr_param *param = data;
        struct crypto_template *tmpl;
        int err = -ENOENT;

        tmpl = crypto_lookup_template(param->template);
        if (!tmpl)
                goto out;

        do {
                err = tmpl->create(tmpl, param->tb);
        } while (err == -EAGAIN && !signal_pending(current));

        crypto_tmpl_put(tmpl);

out:
        param->larval->adult = ERR_PTR(err);
        param->larval->alg.cra_flags |= CRYPTO_ALG_DEAD;
        complete_all(&param->larval->completion);
        crypto_alg_put(&param->larval->alg);
        kfree(param);
        module_put_and_kthread_exit(0);
}

/*
 * Handle CRYPTO_MSG_ALG_REQUEST (see cryptomgr_notify()): parse the requested
 * name in larval->alg.cra_name, which must look like "tmpl(arg[,arg...])",
 * into a cryptomgr_param and start a "cryptomgr_probe" kthread to
 * instantiate it.
 *
 * Returns NOTIFY_STOP once the probe thread has been started, or NOTIFY_OK
 * if taking the module reference, the allocation, the parsing or the thread
 * creation failed.
 */
static int cryptomgr_schedule_probe(struct crypto_larval *larval)
{
        struct task_struct *thread;
        struct cryptomgr_param *param;
        const char *name = larval->alg.cra_name;
        const char *p;
        unsigned int len;
        int i;

        /* Reference is dropped on the error path below or by the thread. */
        if (!try_module_get(THIS_MODULE))
                goto err;

        /* Zeroed, so the names copied in below end up NUL-terminated. */
        param = kzalloc(sizeof(*param), GFP_KERNEL);
        if (!param)
                goto err_put_module;

        /* Template name: leading run of [A-Za-z0-9_-] ... */
        for (p = name; isalnum(*p) || *p == '-' || *p == '_'; p++)
                ;

        /* ... which must be non-empty and followed by '('. */
        len = p - name;
        if (!len || *p != '(')
                goto err_free_param;

        memcpy(param->template, name, len);

        /* Parse the comma-separated arguments up to the closing ')'. */
        i = 0;
        for (;;) {
                /* Step over the '(' or ',' that precedes this argument. */
                name = ++p;

                for (; isalnum(*p) || *p == '-' || *p == '_'; p++)
                        ;

                /*
                 * The argument may itself be a template instance: skip its
                 * balanced parenthesised part, failing if the string ends
                 * before the matching ')'.
                 */
                if (*p == '(') {
                        int recursion = 0;

                        for (;;) {
                                if (!*++p)
                                        goto err_free_param;
                                if (*p == '(')
                                        recursion++;
                                else if (*p == ')' && !recursion--)
                                        break;
                        }

                        p++;
                }

                /* Empty arguments are rejected. */
                len = p - name;
                if (!len)
                        goto err_free_param;

                param->attrs[i].attr.rta_len = sizeof(param->attrs[i]);
                param->attrs[i].attr.rta_type = CRYPTOA_ALG;
                memcpy(param->attrs[i].data.name, name, len);

                /* tb[0] is reserved for the type attribute set up below. */
                param->tb[i + 1] = &param->attrs[i].attr;
                i++;

                /*
                 * Checked after the increment, so a name with
                 * CRYPTO_MAX_ATTRS or more arguments is rejected.
                 */
                if (i >= CRYPTO_MAX_ATTRS)
                        goto err_free_param;

                if (*p == ')')
                        break;

                if (*p != ',')
                        goto err_free_param;
        }

        /* NULL-terminate the attribute table. */
        param->tb[i + 1] = NULL;

        /* Type attribute: requested flags and mask without CRYPTO_ALG_TESTED. */
        param->type.attr.rta_len = sizeof(param->type);
        param->type.attr.rta_type = CRYPTOA_TYPE;
        param->type.data.type = larval->alg.cra_flags & ~CRYPTO_ALG_TESTED;
        param->type.data.mask = larval->mask & ~CRYPTO_ALG_TESTED;
        param->tb[0] = &param->type.attr;

        param->otype = larval->alg.cra_flags;
        param->omask = larval->mask;

        /* The thread owns this reference and drops it in cryptomgr_probe(). */
        crypto_alg_get(&larval->alg);
        param->larval = larval;

        thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe");
        if (IS_ERR(thread))
                goto err_put_larval;

        return NOTIFY_STOP;

err_put_larval:
        crypto_alg_put(&larval->alg);
err_free_param:
        kfree(param);
err_put_module:
        module_put(THIS_MODULE);
err:
        return NOTIFY_OK;
}

/*
 * Thread body started by cryptomgr_schedule_test(): run alg_test() on the
 * algorithm described by @data and report the result through
 * crypto_alg_tested(). Does not return; the thread exits through
 * module_put_and_kthread_exit().
 */
static int cryptomgr_test(void *data)
{
        struct crypto_test_param *param = data;
        int err;

        err = alg_test(param->driver, param->alg, param->type,
                       CRYPTO_ALG_TESTED);
        crypto_alg_tested(param->driver, err);

        kfree(param);
        module_put_and_kthread_exit(0);
}

/*
 * Handle CRYPTO_MSG_ALG_REGISTER: snapshot @alg's names and flags and start
 * a "cryptomgr_test" kthread for it.
 *
 * Returns NOTIFY_DONE when CONFIG_CRYPTO_SELFTESTS is disabled, NOTIFY_STOP
 * once the test thread has been started, and NOTIFY_OK if the module
 * reference, the allocation or the thread creation failed.
 */
static int cryptomgr_schedule_test(struct crypto_alg *alg)
{
        struct crypto_test_param *param;
        struct task_struct *thread;

        if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS))
                return NOTIFY_DONE;

        if (!try_module_get(THIS_MODULE))
                return NOTIFY_OK;

        param = kzalloc(sizeof(*param), GFP_KERNEL);
        if (!param) {
                module_put(THIS_MODULE);
                return NOTIFY_OK;
        }

        memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver));
        memcpy(param->alg, alg->cra_name, sizeof(param->alg));
        param->type = alg->cra_flags;

        /* On success the thread owns @param and the module reference. */
        thread = kthread_run(cryptomgr_test, param, "cryptomgr_test");
        if (IS_ERR(thread)) {
                kfree(param);
                module_put(THIS_MODULE);
                return NOTIFY_OK;
        }

        return NOTIFY_STOP;
}

/*
 * Notifier callback: dispatch algorithm requests to the probe scheduler and
 * registrations to the test scheduler. Every other message, including
 * CRYPTO_MSG_ALG_LOADED, yields NOTIFY_DONE.
 */
static int cryptomgr_notify(struct notifier_block *this, unsigned long msg,
                            void *data)
{
        if (msg == CRYPTO_MSG_ALG_REQUEST)
                return cryptomgr_schedule_probe(data);

        if (msg == CRYPTO_MSG_ALG_REGISTER)
                return cryptomgr_schedule_test(data);

        return NOTIFY_DONE;
}

/* Notifier block (un)registered by cryptomgr_init()/cryptomgr_exit(). */
static struct notifier_block cryptomgr_notifier = {
        .notifier_call = cryptomgr_notify,
};

/* Module init: register the notifier and return the registration result. */
static int __init cryptomgr_init(void)
{
        return crypto_register_notifier(&cryptomgr_notifier);
}

/* Module exit: unregister the notifier; a failure to do so is fatal. */
static void __exit cryptomgr_exit(void)
{
        int err = crypto_unregister_notifier(&cryptomgr_notifier);
        BUG_ON(err);
}

module_init(cryptomgr_init);
module_exit(cryptomgr_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Crypto Algorithm Manager");















































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_NOTIFY_H
#define _LINUX_FS_NOTIFY_H

/*
 * include/linux/fsnotify.h - generic hooks for filesystem notification, to
 * reduce in-source duplication from both dnotify and inotify.
 *
 * We don't compile any of this away in some complicated menagerie of ifdefs.
 * Instead, we rely on the code inside to optimize away as needed.
 *
 * (C) Copyright 2005 Robert Love
 */

#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/bug.h>

/* Does @sb have any watched inode/mount/sb object at priority @prio or above? */
static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb,
                                                     int prio)
{
        struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);

        /* No sbinfo: no mark was ever added to any object on this sb. */
        return sbinfo && atomic_long_read(&sbinfo->watched_objects[prio]);
}

/*
 * Are there any inode/mount/sb objects that are being watched at all?
 * Implemented as the priority check with priority 0.
 */
static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
{
        return fsnotify_sb_has_priority_watchers(sb, 0);
}

/*
 * Tell the @dir inode that one of its directory entries changed: the entry
 * may have become positive or negative, or may now refer to a different
 * inode (e.g. it was renamed over).
 *
 * In contrast to fsnotify_parent(), the event is reported regardless of the
 * FS_EVENT_ON_CHILD mask on the parent inode, and it is not reported when
 * only the child, and not the parent, is interested.
 *
 * Returns 0 without calling fsnotify() when nothing on @dir's superblock is
 * watched; otherwise returns the result of fsnotify().
 */
static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
                                struct inode *dir, const struct qstr *name,
                                u32 cookie)
{
        if (fsnotify_sb_has_watchers(dir->i_sb))
                return fsnotify(mask, data, data_type, dir, name, NULL,
                                cookie);

        return 0;
}

/*
 * Report @mask on @dir for the entry @dentry, using @dentry itself as the
 * event data and @dentry->d_name as the entry name. The result of
 * fsnotify_name() is discarded.
 */
static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
                                   __u32 mask)
{
        fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0);
}

/*
 * Report @mask on @inode itself (no parent, no name). FS_ISDIR is added to
 * the mask when @inode is a directory. Nothing happens when no object on
 * the inode's superblock is watched.
 */
static inline void fsnotify_inode(struct inode *inode, __u32 mask)
{
        if (fsnotify_sb_has_watchers(inode->i_sb)) {
                if (S_ISDIR(inode->i_mode))
                        mask |= FS_ISDIR;

                fsnotify(mask, inode, FSNOTIFY_EVENT_INODE, NULL, NULL,
                         inode, 0);
        }
}

/*
 * Report an event on @dentry's inode, going through the parent where that
 * is possible and wanted; otherwise only the child inode is notified.
 * Returns 0 early when nothing on the superblock is watched.
 */
static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        struct inode *inode = d_inode(dentry);
        bool via_parent = true;

        if (!fsnotify_sb_has_watchers(inode->i_sb))
                return 0;

        if (S_ISDIR(inode->i_mode)) {
                mask |= FS_ISDIR;

                /* sb/mount marks are not interested in name of directory */
                if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
                        via_parent = false;
        }

        /* disconnected dentry cannot notify parent */
        if (via_parent && IS_ROOT(dentry))
                via_parent = false;

        if (via_parent)
                return __fsnotify_parent(dentry, mask, data, data_type);

        return fsnotify(mask, data, data_type, NULL, NULL, inode, 0);
}

/*
 * Simple wrappers to consolidate calls to fsnotify_parent() when an event
 * is on a file/dentry.
 */
/* Dentry flavour: @dentry is also the event data; the result is discarded. */
static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
{
        fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
}

/* Path flavour: @path is the event data; returns fsnotify_parent()'s result. */
static inline int fsnotify_path(const struct path *path, __u32 mask)
{
        return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}

/*
 * Report @mask for @file through its path, unless the file's mode says that
 * it generates no events at all, in which case 0 is returned.
 */
static inline int fsnotify_file(struct file *file, __u32 mask)
{
        /*
         * Files with FMODE_NONOTIFY are fds created by fanotify itself and
         * must not generate new events. FMODE_PATH fds are skipped as well:
         * their open & close are merely handle creation / destruction and
         * not "real" file events.
         */
        if (!FMODE_FSNOTIFY_NONE(file->f_mode))
                return fsnotify_path(&file->f_path, mask);

        return 0;
}

#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS

int fsnotify_open_perm_and_set_mode(struct file *file);

/*
 * fsnotify_file_area_perm - permission hook before access to file range
 *
 * Returns 0 when access may proceed, or the non-zero result of the
 * pre-content / FS_ACCESS_PERM notification.
 */
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
                                          const loff_t *ppos, size_t count)
{
        int ret;

        /*
         * Permission events may lead to filesystem modification (e.g. an
         * HSM filling the file on access), hence sb freeze protection must
         * not be held here.
         */
        lockdep_assert_once(file_write_not_started(file));

        if ((perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS)) == 0)
                return 0;

        /* Any of read()/write()/other access produces a pre-content event. */
        if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
                ret = fsnotify_pre_content(&file->f_path, ppos, count);
                if (ret)
                        return ret;
        }

        /*
         * On top of that, read() produces the legacy FS_ACCESS_PERM event so
         * that content scanners can inspect what the pre-content event
         * filled in.
         */
        if ((perm_mask & MAY_READ) &&
            unlikely(FMODE_FSNOTIFY_ACCESS_PERM(file->f_mode)))
                return fsnotify_path(&file->f_path, FS_ACCESS_PERM);

        return 0;
}

/*
 * fsnotify_mmap_perm - permission hook before mmap of file range
 *
 * Only pre-content events are generated for mmap(). A NULL @file, or a file
 * without HSM notification mode, yields 0. @prot is not used.
 */
static inline int fsnotify_mmap_perm(struct file *file, int prot,
                                     const loff_t off, size_t len)
{
        if (file && unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
                return fsnotify_pre_content(&file->f_path, &off, len);

        return 0;
}

/*
 * fsnotify_truncate_perm - permission hook before file truncate
 *
 * A pre-content event (offset @length, count 0) is generated only when the
 * superblock has SB_I_ALLOW_HSM set and has watchers at pre-content
 * priority; otherwise 0 is returned.
 */
static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
{
        struct super_block *sb = d_inode(path->dentry)->i_sb;

        if ((sb->s_iflags & SB_I_ALLOW_HSM) &&
            fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))
                return fsnotify_pre_content(path, &length, 0);

        return 0;
}

/*
 * fsnotify_file_perm - permission hook before file access (unknown range)
 *
 * Same as fsnotify_file_area_perm() with a NULL position and a count of 0.
 */
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
        return fsnotify_file_area_perm(file, perm_mask, NULL, 0);
}

#else
/*
 * Stubs for !CONFIG_FANOTIFY_ACCESS_PERMISSIONS: every permission hook
 * ignores its arguments and returns 0.
 */
static inline int fsnotify_open_perm_and_set_mode(struct file *file)
{
        return 0;
}

static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
                                          const loff_t *ppos, size_t count)
{
        return 0;
}

static inline int fsnotify_mmap_perm(struct file *file, int prot,
                                     const loff_t off, size_t len)
{
        return 0;
}

static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
{
        return 0;
}

static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
        return 0;
}
#endif

/*
 * fsnotify_link_count - inode's link count changed
 *
 * Reported as an FS_ATTRIB event on @inode.
 */
static inline void fsnotify_link_count(struct inode *inode)
{
        fsnotify_inode(inode, FS_ATTRIB);
}

/*
 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
 *
 * @moved is the dentry after the move (its d_name is the new name), @target
 * is the inode that was replaced by the move, if any, and @isdir selects
 * whether FS_ISDIR is added to the directory events.
 */
static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                                 const struct qstr *old_name,
                                 int isdir, struct inode *target,
                                 struct dentry *moved)
{
        const __u32 dir_flag = isdir ? FS_ISDIR : 0;
        const struct qstr *new_name = &moved->d_name;
        struct inode *source = moved->d_inode;
        /* One cookie ties the MOVED_FROM and MOVED_TO events together. */
        u32 fs_cookie = fsnotify_get_cookie();

        /* Event with information about both old and new parent+name */
        fsnotify_name(FS_RENAME | dir_flag, moved, FSNOTIFY_EVENT_DENTRY,
                      old_dir, old_name, 0);

        fsnotify_name(FS_MOVED_FROM | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      old_dir, old_name, fs_cookie);
        fsnotify_name(FS_MOVED_TO | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      new_dir, new_name, fs_cookie);

        /* An overwritten target lost a link. */
        if (target)
                fsnotify_link_count(target);

        fsnotify_inode(source, FS_MOVE_SELF);
        audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
}

/*
 * fsnotify_inode_delete - an inode is being evicted from cache, clean up is
 * needed
 *
 * Thin wrapper around __fsnotify_inode_delete().
 */
static inline void fsnotify_inode_delete(struct inode *inode)
{
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed
 *
 * Thin wrapper around __fsnotify_vfsmount_delete().
 */
static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        __fsnotify_vfsmount_delete(mnt);
}

/*
 * fsnotify_mntns_delete - thin wrapper around __fsnotify_mntns_delete()
 * NOTE(review): presumably called when a mount namespace goes away, by
 * analogy with the vfsmount variant above - confirm against callers.
 */
static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns)
{
        __fsnotify_mntns_delete(mntns);
}

/*
 * fsnotify_inoderemove - an inode is going away
 *
 * Reports FS_DELETE_SELF on @inode first and then performs the same
 * clean up as fsnotify_inode_delete().
 */
static inline void fsnotify_inoderemove(struct inode *inode)
{
        fsnotify_inode(inode, FS_DELETE_SELF);
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_create - 'name' was linked in
 *
 * Records the new child with audit and then reports FS_CREATE on @dir.
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 */
static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
{
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(dir, dentry, FS_CREATE);
}

/*
 * fsnotify_link - new hardlink to 'inode' was created in directory 'dir'
 *
 * Reports the link count change on @inode, records the new child with
 * audit, and then reports FS_CREATE on @dir with @inode as the event data.
 *
 * Caller must make sure that new_dentry->d_name is stable.
 * Note: We have to pass also the linked inode ptr as some filesystems leave
 *   new_dentry->d_inode NULL and instantiate inode pointer later
 */
static inline void fsnotify_link(struct inode *dir, struct inode *inode,
                                 struct dentry *new_dentry)
{
        fsnotify_link_count(inode);
        audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE,
                      dir, &new_dentry->d_name, 0);
}

/*
 * fsnotify_delete - @dentry was unlinked and unhashed
 *
 * Reports FS_DELETE on @dir, with FS_ISDIR added when @inode is a directory.
 *
 * Caller must make sure that dentry->d_name is stable.
 *
 * Note: in contrast to fsnotify_unlink(), the unlinked inode has to be passed
 * in explicitly because this may run after d_delete(), when the dentry may
 * already be negative.
 */
static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
                                   struct dentry *dentry)
{
        __u32 mask = S_ISDIR(inode->i_mode) ? (FS_DELETE | FS_ISDIR)
                                            : FS_DELETE;

        fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir,
                      &dentry->d_name, 0);
}

/**
 * d_delete_notify - delete a dentry and call fsnotify_delete()
 * @dir: The directory inode that the delete event is reported on
 * @dentry: The dentry to delete
 *
 * This helper is used to guarantee that the unlinked inode cannot be found
 * by lookup of this name after fsnotify_delete() event has been delivered.
 */
static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        /* Hold the inode across d_delete(); it is still needed for the event. */
        ihold(inode);
        d_delete(dentry);
        fsnotify_delete(dir, inode, dentry);
        iput(inode);
}

/*
 * fsnotify_unlink - 'name' was unlinked
 *
 * Warns once and reports nothing if @dentry is already negative; otherwise
 * forwards to fsnotify_delete() with the dentry's inode.
 *
 * Caller must make sure that dentry->d_name is stable.
 */
static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
{
        if (WARN_ON_ONCE(d_is_negative(dentry)))
                return;

        fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_mkdir - directory 'name' was created
 *
 * Records the new child with audit and then reports FS_CREATE | FS_ISDIR
 * on @dir.
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 */
static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
{
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR);
}

/*
 * fsnotify_rmdir - directory 'name' was removed
 *
 * Warns once and reports nothing if @dentry is already negative; otherwise
 * forwards to fsnotify_delete() with the dentry's inode.
 *
 * Caller must make sure that dentry->d_name is stable.
 */
static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (WARN_ON_ONCE(d_is_negative(dentry)))
                return;

        fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_access - file was read
 *
 * Reports FS_ACCESS through fsnotify_file(); the result is discarded.
 */
static inline void fsnotify_access(struct file *file)
{
        fsnotify_file(file, FS_ACCESS);
}

/*
 * fsnotify_modify - file was modified
 *
 * Reports FS_MODIFY through fsnotify_file(); the result is discarded.
 */
static inline void fsnotify_modify(struct file *file)
{
        fsnotify_file(file, FS_MODIFY);
}

/*
 * fsnotify_open - file was opened
 *
 * Reports FS_OPEN, plus FS_OPEN_EXEC when the file was opened with
 * __FMODE_EXEC in its f_flags.
 */
static inline void fsnotify_open(struct file *file)
{
        __u32 mask = (file->f_flags & __FMODE_EXEC) ? (FS_OPEN | FS_OPEN_EXEC)
                                                    : FS_OPEN;

        fsnotify_file(file, mask);
}

/*
 * fsnotify_close - file was closed
 *
 * Reports FS_CLOSE_WRITE for files opened with FMODE_WRITE and
 * FS_CLOSE_NOWRITE for all others.
 */
static inline void fsnotify_close(struct file *file)
{
        __u32 mask;

        if (file->f_mode & FMODE_WRITE)
                mask = FS_CLOSE_WRITE;
        else
                mask = FS_CLOSE_NOWRITE;

        fsnotify_file(file, mask);
}

/*
 * fsnotify_xattr - extended attributes were changed
 *
 * Reported as an FS_ATTRIB event on @dentry.
 */
static inline void fsnotify_xattr(struct dentry *dentry)
{
        fsnotify_dentry(dentry, FS_ATTRIB);
}

/*
 * fsnotify_change - notify_change event.  file was modified and/or metadata
 * was changed.
 *
 * Translates the ATTR_* bits in @ia_valid into an event mask and reports it
 * on @dentry. Nothing is reported when no relevant bit is set.
 */
static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
{
        const unsigned int both_times = ATTR_ATIME | ATTR_MTIME;
        unsigned int times = ia_valid & both_times;
        __u32 mask = 0;

        /* Owner, group and mode changes are all attribute changes. */
        if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
                mask |= FS_ATTRIB;

        if (ia_valid & ATTR_SIZE)
                mask |= FS_MODIFY;

        /* both times implies a utime(s) call */
        if (times == both_times)
                mask |= FS_ATTRIB;
        else if (times == ATTR_ATIME)
                mask |= FS_ACCESS;
        else if (times == ATTR_MTIME)
                mask |= FS_MODIFY;

        if (!mask)
                return;

        fsnotify_dentry(dentry, mask);
}

/*
 * Report an FS_ERROR event carrying @error, the affected @inode and @sb.
 * Returns the result of fsnotify().
 */
static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
                                    int error)
{
        struct fs_error_report report = {
                .sb = sb,
                .inode = inode,
                .error = error,
        };
        int ret;

        ret = fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
                       NULL, NULL, NULL, 0);
        return ret;
}

/* Report FS_MNT_ATTACH for @mnt in namespace @ns via fsnotify_mnt(). */
static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt)
{
        fsnotify_mnt(FS_MNT_ATTACH, ns, mnt);
}

/*
 * fsnotify_mnt_detach - report an FS_MNT_DETACH event for @mnt in @ns
 */
static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt)
{
        fsnotify_mnt(FS_MNT_DETACH, ns, mnt);
}

/*
 * fsnotify_mnt_move - report an FS_MNT_MOVE event for @mnt in @ns
 */
static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt)
{
        fsnotify_mnt(FS_MNT_MOVE, ns, mnt);
}

#endif        /* _LINUX_FS_NOTIFY_H */













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    8 


















    8 







    8 


































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                       Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"

/* Forward declaration; called from __blkg_release() below. */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

/* The root blkcg; blkg creation walks down the hierarchy starting from it. */
struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

/* Constant pointer to the css embedded in blkcg_root. */
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

/* Policy table indexed by policy id; unused slots are NULL. */
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);                /* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;

/* Serializes blkg stat updates (see the note in __blkg_release()). */
static DEFINE_RAW_SPINLOCK(blkg_stat_lock);

/*
 * Number of blkgs blkg_destroy_all() destroys before it drops queue_lock
 * and calls cond_resched().
 */
#define BLKG_DESTROY_BATCH_SIZE  64

/*
 * Lockless lists for tracking IO stats update
 *
 * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
 * There are multiple blkg's (one for each block device) attached to each
 * blkcg. The rstat code keeps track of which cpu has IO stats updated,
 * but it doesn't know which blkg has the updated stats. If there are many
 * block devices in a system, the cost of iterating all the blkg's to flush
 * out the IO stats can be high. To reduce such overhead, a set of percpu
 * lockless lists (lhead) per blkcg are used to track the set of recently
 * updated iostat_cpu's since the last flush. An iostat_cpu will be put
 * onto the lockless list on the update side [blk_cgroup_bio_start()] if
 * not there yet and then removed when being flushed [blkcg_rstat_flush()].
 * References to blkg are gotten and then put back in the process to
 * protect against blkg removal.
 *
 * Return: 0 if successful or -ENOMEM if allocation fails.
 */
static int init_blkcg_llists(struct blkcg *blkcg)
{
        int cpu;

        blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
        if (!blkcg->lhead)
                return -ENOMEM;

        for_each_possible_cpu(cpu)
                init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
        return 0;
}

/**
 * blkcg_css - find the current css
 *
 * Prefer the css returned by kthread_blkcg(); when that is %NULL fall back
 * to the io css of the current task.  The result may be a dying css, so
 * callers must use tryget logic to make sure it is still alive.
 */
static struct cgroup_subsys_state *blkcg_css(void)
{
        struct cgroup_subsys_state *kthread_css = kthread_blkcg();

        if (!kthread_css)
                return task_css(current, io_cgrp_id);

        return kthread_css;
}

/*
 * blkg_free_workfn - deferred teardown of a blkg
 * @work: the free_work item embedded in the blkg being freed
 *
 * Scheduled by blkg_free().  Frees all attached policy data, drops the
 * reference on the parent blkg (if any), unlinks the blkg from the queue's
 * blkg list and finally releases the queue reference, the percpu stats,
 * the refcnt and the blkg itself.
 */
static void blkg_free_workfn(struct work_struct *work)
{
        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
                                             free_work);
        struct request_queue *q = blkg->q;
        int i;

        /*
         * pd_free_fn() can also be called from blkcg_deactivate_policy(),
         * in order to make sure pd_free_fn() is called in order, the deletion
         * of the list blkg->q_node is delayed to here from blkg_destroy(), and
         * blkcg_mutex is used to synchronize blkg_free_workfn() and
         * blkcg_deactivate_policy().
         */
        mutex_lock(&q->blkcg_mutex);
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkg->pd[i])
                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
        /* drop the parent reference taken in blkg_create() */
        if (blkg->parent)
                blkg_put(blkg->parent);
        /* q_node is only unlinked here, under queue_lock */
        spin_lock_irq(&q->queue_lock);
        list_del_init(&blkg->q_node);
        spin_unlock_irq(&q->queue_lock);
        mutex_unlock(&q->blkcg_mutex);

        /* pairs with blk_get_queue() in blkg_alloc() */
        blk_put_queue(q);
        free_percpu(blkg->iostat_cpu);
        percpu_ref_exit(&blkg->refcnt);
        kfree(blkg);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free, may be %NULL
 *
 * Free @blkg which may be partially allocated.  The actual teardown is
 * done asynchronously by blkg_free_workfn(); a %NULL @blkg is ignored.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
        if (blkg) {
                /*
                 * ->pd_free_fn() and the request queue's release handler
                 * may both sleep, so hand the freeing over to a work item.
                 */
                INIT_WORK(&blkg->free_work, blkg_free_workfn);
                schedule_work(&blkg->free_work);
        }
}

/*
 * __blkg_release - RCU callback finishing the release of a blkg
 * @rcu: the rcu_head embedded in the blkg
 *
 * Queued by blkg_release() through call_rcu().  Flushes the pending percpu
 * stats of the owning blkcg, drops the blkcg css reference and schedules
 * the actual freeing via blkg_free().
 */
static void __blkg_release(struct rcu_head *rcu)
{
        struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
        struct blkcg *blkcg = blkg->blkcg;
        int cpu;

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        /* no punted bios may still be queued on a blkg being released */
        WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
        /*
         * Flush all the non-empty percpu lockless lists before releasing
         * us, given these stat belongs to us.
         *
         * blkg_stat_lock is for serializing blkg stat update
         */
        for_each_possible_cpu(cpu)
                __blkcg_rstat_flush(blkcg, cpu);

        /*
         * Release the blkcg reference this blkg has been holding.  The
         * parent blkg reference is dropped later, in blkg_free_workfn().
         */
        css_put(&blkg->blkcg->css);
        blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 *
 * blkg_release() is the release callback passed to percpu_ref_init() for
 * blkg->refcnt in blkg_alloc(); it defers the rest of the teardown to
 * __blkg_release() via call_rcu().
 */
static void blkg_release(struct percpu_ref *ref)
{
        struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

        call_rcu(&blkg->rcu_head, __blkg_release);
}

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
static struct workqueue_struct *blkcg_punt_bio_wq;

static void blkg_async_bio_workfn(struct work_struct *work)
{
        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
                                             async_bio_work);
        struct bio_list bios = BIO_EMPTY_LIST;
        struct bio *bio;
        struct blk_plug plug;
        bool need_plug = false;

        /* as long as there are pending bios, @blkg can't go away */
        spin_lock(&blkg->async_bio_lock);
        bio_list_merge_init(&bios, &blkg->async_bios);
        spin_unlock(&blkg->async_bio_lock);

        /* start plug only when bio_list contains at least 2 bios */
        if (bios.head && bios.head->bi_next) {
                need_plug = true;
                blk_start_plug(&plug);
        }
        while ((bio = bio_list_pop(&bios)))
                submit_bio(bio);
        if (need_plug)
                blk_finish_plug(&plug);
}

/*
 * A shared kthread that issues a bio on behalf of a cgroup synchronously
 * may get stuck waiting for that cgroup, which leads to priority
 * inversions.  This helper is meant to be used in place of submit_bio():
 * it queues the bio on the blkg and lets a dedicated per-blkcg work item
 * do the actual issuing.  Bios of a blkg without a parent are submitted
 * directly.
 */
void blkcg_punt_bio_submit(struct bio *bio)
{
        struct blkcg_gq *blkg = bio->bi_blkg;

        /* never bounce for the root cgroup */
        if (!blkg->parent) {
                submit_bio(bio);
                return;
        }

        spin_lock(&blkg->async_bio_lock);
        bio_list_add(&blkg->async_bios, bio);
        spin_unlock(&blkg->async_bio_lock);

        queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
}
EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);

/*
 * Allocate the workqueue used by blkcg_punt_bio_submit().
 *
 * Return: 0 on success, -ENOMEM if the workqueue cannot be allocated.
 */
static int __init blkcg_punt_bio_init(void)
{
        blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
                                            WQ_MEM_RECLAIM | WQ_FREEZABLE |
                                            WQ_UNBOUND | WQ_SYSFS, 0);

        return blkcg_punt_bio_wq ? 0 : -ENOMEM;
}
subsys_initcall(blkcg_punt_bio_init);
#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */

/**
 * bio_blkcg_css - return the blkcg CSS associated with a bio
 * @bio: target bio, may be %NULL
 *
 * Return: the CSS of the blkcg @bio is associated with, or %NULL when @bio
 * is %NULL or has no blkg attached.  Callers either cope with %NULL or know
 * that the association has already been made.
 */
struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
{
        if (bio && bio->bi_blkg)
                return &bio->bi_blkg->blkcg->css;

        return NULL;
}
EXPORT_SYMBOL_GPL(bio_blkcg_css);

/**
 * blkcg_parent - get the parent of a blkcg
 * @blkcg: blkcg of interest
 *
 * Converts the parent css of @blkcg into its blkcg.  Can be called anytime.
 */
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
        struct cgroup_subsys_state *parent_css = blkcg->css.parent;

        return css_to_blkcg(parent_css);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @disk: gendisk the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @disk.  The blkg holds a
 * reference on @disk->queue and carries policy data for every policy that
 * is enabled on that queue; the policy data is allocated but not yet
 * marked online.
 *
 * Return: the new blkg, or %NULL if an allocation or blk_get_queue() fails.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
                                   gfp_t gfp_mask)
{
        struct blkcg_gq *blkg;
        int i, cpu;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node);
        if (!blkg)
                return NULL;
        if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
                goto out_free_blkg;
        blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
        if (!blkg->iostat_cpu)
                goto out_exit_refcnt;
        /* the queue reference is dropped again in blkg_free_workfn() */
        if (!blk_get_queue(disk->queue))
                goto out_free_iostat;

        blkg->q = disk->queue;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
        blkg->iostat.blkg = blkg;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        spin_lock_init(&blkg->async_bio_lock);
        bio_list_init(&blkg->async_bios);
        INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
#endif

        /* init the sync state and back pointer of every stat set */
        u64_stats_init(&blkg->iostat.sync);
        for_each_possible_cpu(cpu) {
                u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
                per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
        }

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(disk->queue, pol))
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask);
                if (!pd)
                        goto out_free_pds;
                blkg->pd[i] = pd;
                pd->blkg = blkg;
                pd->plid = i;
                pd->online = false;
        }

        return blkg;

        /* unwind in reverse order of acquisition */
out_free_pds:
        /* @i is the policy that failed; free what was attached before it */
        while (--i >= 0)
                if (blkg->pd[i])
                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
        blk_put_queue(disk->queue);
out_free_iostat:
        free_percpu(blkg->iostat_cpu);
out_exit_refcnt:
        percpu_ref_exit(&blkg->refcnt);
out_free_blkg:
        kfree(blkg);
        return NULL;
}

/*
 * blkg_create - create and link the blkg for a @blkcg - @disk pair
 *
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 *
 * Must be called with @disk->queue->queue_lock held.  On success the blkg
 * holds a reference on @blkcg's css and, for a non-root @blkcg, on its
 * parent blkg, and has been inserted into @blkcg's radix tree and list as
 * well as the queue's blkg list.
 *
 * Returns the new blkg, or an ERR_PTR(): -ENODEV if the queue is dying,
 * the css reference cannot be taken or the parent blkg is missing,
 * -ENOMEM if the allocation fails, or the error from radix_tree_insert().
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
                                    struct blkcg_gq *new_blkg)
{
        struct blkcg_gq *blkg;
        int i, ret;

        lockdep_assert_held(&disk->queue->queue_lock);

        /* request_queue is dying, do not create/recreate a blkg */
        if (blk_queue_dying(disk->queue)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_css;
                }
        }
        blkg = new_blkg;

        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -ENODEV;
                        goto err_put_css;
                }
                /* dropped again in blkg_free_workfn() */
                blkg_get(blkg->parent);
        }

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && pol->pd_init_fn)
                        pol->pd_init_fn(blkg->pd[i]);
        }

        /* insert */
        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
        if (likely(!ret)) {
                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
                list_add(&blkg->q_node, &disk->queue->blkg_list);

                /* bring the attached policy data online */
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkg->pd[i]) {
                                if (pol->pd_online_fn)
                                        pol->pd_online_fn(blkg->pd[i]);
                                blkg->pd[i]->online = true;
                        }
                }
        }
        /* set even if the insert failed; such a blkg is put right below */
        blkg->online = true;
        spin_unlock(&blkcg->lock);

        if (!ret)
                return blkg;

        /* @blkg failed fully initialized, use the usual release path */
        blkg_put(blkg);
        return ERR_PTR(ret);

err_put_css:
        css_put(&blkcg->css);
err_free_blkg:
        /* @new_blkg is consumed on failure as well */
        if (new_blkg)
                blkg_free(new_blkg);
        return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @disk: gendisk of interest
 *
 * Lookup blkg for the @blkcg - @disk pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @disk->queue->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        unsigned long flags;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* fast path: the blkg already exists, no need for queue_lock */
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                return blkg;

        /* look again under queue_lock before trying to create anything */
        spin_lock_irqsave(&q->queue_lock, flags);
        blkg = blkg_lookup(blkcg, q);
        if (blkg) {
                /* refresh the lookup hint of non-root blkcgs */
                if (blkcg != &blkcg_root &&
                    blkg != rcu_dereference(blkcg->blkg_hint))
                        rcu_assign_pointer(blkcg->blkg_hint, blkg);
                goto found;
        }

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.  Returns the closest
         * blkg to the intended blkg should blkg_create() fail.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent = blkcg_parent(blkcg);
                struct blkcg_gq *ret_blkg = q->root_blkg;

                /*
                 * Walk up from @blkcg until an ancestor with a blkg is
                 * found; @pos ends up as the topmost blkcg on that path
                 * which does not have one yet.
                 */
                while (parent) {
                        blkg = blkg_lookup(parent, q);
                        if (blkg) {
                                /* remember closest blkg */
                                ret_blkg = blkg;
                                break;
                        }
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                blkg = blkg_create(pos, disk, NULL);
                if (IS_ERR(blkg)) {
                        /* fall back to the closest existing blkg */
                        blkg = ret_blkg;
                        break;
                }
                /* done once the blkg of @blkcg itself has been created */
                if (pos == blkcg)
                        break;
        }

found:
        spin_unlock_irqrestore(&q->queue_lock, flags);
        return blkg;
}

/*
 * blkg_destroy - take a blkg offline and unlink it from its blkcg
 * @blkg: blkg to destroy
 *
 * Must be called with both @blkg->q->queue_lock and @blkg->blkcg->lock
 * held.  Offlines all online policy data, removes @blkg from the blkcg's
 * radix tree and list, clears the lookup hint if it points to @blkg and
 * kills the refcnt.  Calling it again on an already destroyed blkg is a
 * no-op.
 */
static void blkg_destroy(struct blkcg_gq *blkg)
{
        struct blkcg *blkcg = blkg->blkcg;
        int i;

        lockdep_assert_held(&blkg->q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /*
         * blkg stays on the queue list until blkg_free_workfn(), see details in
         * blkg_free_workfn(), hence this function can be called from
         * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before
         * blkg_free_workfn().
         */
        if (hlist_unhashed(&blkg->blkcg_node))
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                /* clear the online flag before calling the offline hook */
                if (blkg->pd[i] && blkg->pd[i]->online) {
                        blkg->pd[i]->online = false;
                        if (pol->pd_offline_fn)
                                pol->pd_offline_fn(blkg->pd[i]);
                }
        }

        blkg->online = false;

        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Both setting lookup hint to and clearing it from @blkg are done
         * under queue_lock.  If it's not pointing to @blkg now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        percpu_ref_kill(&blkg->refcnt);
}

/*
 * Destroy every blkg that is still hashed on @disk's queue, then clear the
 * queue's policy bits and its root blkg pointer.
 */
static void blkg_destroy_all(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        int count = BLKG_DESTROY_BATCH_SIZE;
        int i;

restart:
        spin_lock_irq(&q->queue_lock);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                /* Already unhashed from its blkcg: skip it. */
                if (hlist_unhashed(&blkg->blkcg_node))
                        continue;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);

                if (--count)
                        continue;

                /*
                 * A full batch has been destroyed.  Release the queue lock
                 * so it is not held for too long, then walk the list again
                 * from the start.
                 */
                count = BLKG_DESTROY_BATCH_SIZE;
                spin_unlock_irq(&q->queue_lock);
                cond_resched();
                goto restart;
        }

        /*
         * Policy offline has been done and the free is scheduled, so mark
         * every registered policy as deactivated on this queue; a later
         * blkcg_deactivate_policy() can then be bypassed.
         */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                if (blkcg_policy[i])
                        __clear_bit(blkcg_policy[i]->plid, q->blkcg_pols);
        }

        q->root_blkg = NULL;
        spin_unlock_irq(&q->queue_lock);
}

/* Copy every byte and I/O counter of @src into @dst. */
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] = src->bytes[idx];
                dst->ios[idx] = src->ios[idx];
                idx++;
        }
}

static void __blkg_clear_stat(struct blkg_iostat_set *bis)
{
        struct blkg_iostat cur = {0};
        unsigned long flags;

        flags = u64_stats_update_begin_irqsave(&bis->sync);
        blkg_iostat_set(&bis->cur, &cur);
        blkg_iostat_set(&bis->last, &cur);
        u64_stats_update_end_irqrestore(&bis->sync, flags);
}

static void blkg_clear_stat(struct blkcg_gq *blkg)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);

                __blkg_clear_stat(s);
        }
        __blkg_clear_stat(&blkg->iostat);
}

/*
 * Write handler of the legacy "reset_stats" file: clear the iostat of every
 * blkg of the blkcg behind @css and invoke each policy's pd_reset_stats_fn
 * where one exists.  @val is ignored.  Always returns 0.
 */
static int blkcg_reset_stats(struct cgroup_subsys_state *css,
                             struct cftype *cftype, u64 val)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
        int plid;

        pr_info_once("blkio.%s is deprecated\n", cftype->name);
        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Resetting is racy: nothing here synchronizes against concurrent
         * stat updates.  This is a debug feature which shouldn't exist
         * anyway, so whoever gets hit by a race should simply retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                blkg_clear_stat(blkg);
                for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                        struct blkcg_policy *pol = blkcg_policy[plid];

                        if (!blkg->pd[plid] || !pol->pd_reset_stats_fn)
                                continue;

                        pol->pd_reset_stats_fn(blkg->pd[plid]);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);
        return 0;
}

/*
 * Return the bdi device name of the disk behind @blkg's queue, or NULL when
 * the queue has no disk attached.
 */
const char *blkg_dev_name(struct blkcg_gq *blkg)
{
        return blkg->q->disk ? bdi_dev_name(blkg->q->disk->bdi) : NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * Walk the blkgs of @blkcg and call @prfill for each one whose queue has
 * @pol enabled.  @prfill receives @sf, the blkg's policy data for @pol and
 * @data, and runs with the matching queue lock held.  When @show_total is
 * %true, the sum of all @prfill return values is printed at the end with a
 * "Total" label.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total)
{
        struct blkcg_gq *blkg;
        u64 sum = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                spin_lock_irq(&blkg->q->queue_lock);
                if (blkcg_policy_enabled(blkg->q, pol))
                        sum += prfill(sf, blkg->pd[pol->plid], data);
                spin_unlock_irq(&blkg->q->queue_lock);
        }
        rcu_read_unlock();

        if (!show_total)
                return;

        seq_printf(sf, "Total %llu\n", (unsigned long long)sum);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Emit "<device name> <@v>" to @sf for the device associated with @pd.
 * Returns @v, or 0 without printing anything when the device has no name.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *name = blkg_dev_name(pd->blkg);

        if (name) {
                seq_printf(sf, "%s %llu\n", name, (unsigned long long)v);
                return v;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * blkg_conf_init - initialize a blkg_conf_ctx
 * @ctx: blkg_conf_ctx to initialize
 * @input: input string
 *
 * Initialize @ctx which can be used to parse blkg config input string @input.
 * Once initialized, @ctx can be used with blkg_conf_open_bdev() and
 * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
 */
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
{
        *ctx = (struct blkg_conf_ctx){ .input = input };
}
EXPORT_SYMBOL_GPL(blkg_conf_init);

/**
 * blkg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from
 * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
 * set to point past the device node prefix.
 *
 * On success the bdev's queue rq_qos_mutex is left held; it is released, and
 * the bdev reference dropped, by blkg_conf_exit().
 *
 * This function may be called multiple times on @ctx and the extra calls become
 * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
 * explicitly if bdev access is needed without resolving the blkcg / policy part
 * of @ctx->input.
 *
 * Return: 0 on success, -EINVAL if the MAJ:MIN prefix is malformed or is not
 * followed by whitespace, -ENODEV if the device does not exist, is a
 * partition, or its disk is not live.
 */
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
{
        char *input = ctx->input;
        unsigned int major, minor;
        struct block_device *bdev;
        int key_len;

        /* Already opened by an earlier call. */
        if (ctx->bdev)
                return 0;

        /* %n records how many characters the "MAJ:MIN" prefix consumed. */
        if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;

        input += key_len;
        if (!isspace(*input))
                return -EINVAL;
        input = skip_spaces(input);

        bdev = blkdev_get_no_open(MKDEV(major, minor), false);
        if (!bdev)
                return -ENODEV;
        if (bdev_is_partition(bdev)) {
                blkdev_put_no_open(bdev);
                return -ENODEV;
        }

        mutex_lock(&bdev->bd_queue->rq_qos_mutex);
        if (!disk_live(bdev->bd_disk)) {
                /*
                 * Unlock before dropping the reference: the mutex is reached
                 * through @bdev, which must not be dereferenced once our
                 * reference is gone.  blkg_conf_exit() uses the same order.
                 */
                mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
                blkdev_put_no_open(bdev);
                return -ENODEV;
        }

        ctx->body = input;
        ctx->bdev = bdev;
        return 0;
}
/*
 * Similar to blkg_conf_open_bdev(), but additionally freezes the queue and
 * ensures the correct locking order between freezing the queue and taking
 * q->rq_qos_mutex.
 *
 * @ctx must not have a bdev opened yet; -EINVAL is returned otherwise.
 *
 * This function returns a negative error on failure (as a negative errno
 * converted to unsigned long). On success it returns memflags which must be
 * saved and later passed to blkg_conf_exit_frozen() for restoring the
 * memalloc scope.
 */
unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
{
        int ret;
        unsigned long memflags;

        if (ctx->bdev)
                return -EINVAL;

        ret = blkg_conf_open_bdev(ctx);
        if (ret < 0)
                return ret;
        /*
         * At this point, we haven't started protecting anything related to
         * QoS, so we release q->rq_qos_mutex here, which was first acquired in
         * blkg_conf_open_bdev(). Later, we re-acquire q->rq_qos_mutex after
         * freezing the queue to maintain the correct locking order.
         */
        mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);

        memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
        mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);

        return memflags;
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse per-blkg config update from @ctx->input and initialize @ctx
 * accordingly. On success, @ctx->body points to the part of @ctx->input
 * following MAJ:MIN, @ctx->bdev points to the target block device and
 * @ctx->blkg to the blkg being configured.
 *
 * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
 * function returns with queue lock held and must be followed by
 * blkg_conf_exit().
 *
 * Return: 0 on success; the error from blkg_conf_open_bdev() if opening the
 * device fails; -EOPNOTSUPP if @pol is not enabled on the queue; -ENOMEM if
 * a blkg or the radix tree preload cannot be allocated; or the error
 * reported by blkg_create().  A -EBUSY result is turned into a syscall
 * restart after a short sleep.
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   struct blkg_conf_ctx *ctx)
        __acquires(&bdev->bd_queue->queue_lock)
{
        struct gendisk *disk;
        struct request_queue *q;
        struct blkcg_gq *blkg;
        int ret;

        /* No-op when the caller already opened the bdev. */
        ret = blkg_conf_open_bdev(ctx);
        if (ret)
                return ret;

        disk = ctx->bdev->bd_disk;
        q = disk->queue;

        /* Prevent concurrent with blkcg_deactivate_policy() */
        mutex_lock(&q->blkcg_mutex);
        spin_lock_irq(&q->queue_lock);

        if (!blkcg_policy_enabled(q, pol)) {
                ret = -EOPNOTSUPP;
                goto fail_unlock;
        }

        /* Fast path: the blkg for (@blkcg, @q) already exists. */
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                goto success;

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent;
                struct blkcg_gq *new_blkg;

                /*
                 * Find the topmost ancestor of @blkcg (or @blkcg itself) that
                 * has no blkg on @q yet; that is the one created this round.
                 */
                parent = blkcg_parent(blkcg);
                while (parent && !blkg_lookup(parent, q)) {
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                /*
                 * Drop the queue lock so that the allocations below are not
                 * done under a spinlock.  q->blkcg_mutex stays held.
                 */
                spin_unlock_irq(&q->queue_lock);

                new_blkg = blkg_alloc(pos, disk, GFP_NOIO);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto fail_exit;
                }

                if (radix_tree_preload(GFP_KERNEL)) {
                        blkg_free(new_blkg);
                        ret = -ENOMEM;
                        goto fail_exit;
                }

                spin_lock_irq(&q->queue_lock);

                /* Re-check: the queue lock was dropped in between. */
                if (!blkcg_policy_enabled(q, pol)) {
                        blkg_free(new_blkg);
                        ret = -EOPNOTSUPP;
                        goto fail_preloaded;
                }

                /*
                 * Look up again before inserting; if a blkg for @pos showed
                 * up while the lock was dropped, use it and discard ours.
                 */
                blkg = blkg_lookup(pos, q);
                if (blkg) {
                        blkg_free(new_blkg);
                } else {
                        blkg = blkg_create(pos, disk, new_blkg);
                        if (IS_ERR(blkg)) {
                                ret = PTR_ERR(blkg);
                                goto fail_preloaded;
                        }
                }

                radix_tree_preload_end();

                /* Done once the blkg for the target @blkcg itself exists. */
                if (pos == blkcg)
                        goto success;
        }
success:
        /* Queue lock stays held for the caller; blkg_conf_exit() drops it. */
        mutex_unlock(&q->blkcg_mutex);
        ctx->blkg = blkg;
        return 0;

fail_preloaded:
        radix_tree_preload_end();
fail_unlock:
        spin_unlock_irq(&q->queue_lock);
fail_exit:
        /* Reached with the queue lock already released. */
        mutex_unlock(&q->blkcg_mutex);
        /*
         * If queue was bypassing, we should retry.  Do so after a
         * short msleep().  It isn't strictly necessary but queue
         * can be bypassing for some time and it's always nice to
         * avoid busy looping.
         */
        if (ret == -EBUSY) {
                msleep(10);
                ret = restart_syscall();
        }
        return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_exit - clean up per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Undo whatever blkg_conf_open_bdev() / blkg_conf_prep() set up on @ctx:
 * release the queue lock if a blkg was resolved, then release rq_qos_mutex
 * and drop the bdev reference if a bdev was opened.  This function must be
 * called on all blkg_conf_ctx's initialized with blkg_conf_init().
 */
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
        __releases(&ctx->bdev->bd_queue->queue_lock)
        __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
        if (ctx->blkg) {
                spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
                ctx->blkg = NULL;
        }

        if (!ctx->bdev)
                return;

        mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
        blkdev_put_no_open(ctx->bdev);
        ctx->body = NULL;
        ctx->bdev = NULL;
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);

/*
 * Counterpart of blkg_conf_open_bdev_frozen(): does what blkg_conf_exit()
 * does and then unfreezes the queue with @memflags.  Nothing happens when
 * @ctx has no bdev opened.
 */
void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
{
        struct request_queue *q;

        if (!ctx->bdev)
                return;

        /* Grab the queue first: blkg_conf_exit() clears ctx->bdev. */
        q = ctx->bdev->bd_queue;

        blkg_conf_exit(ctx);
        blk_mq_unfreeze_queue(q, memflags);
}

/* Add every byte and I/O counter of @src onto the matching one in @dst. */
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] += src->bytes[idx];
                dst->ios[idx] += src->ios[idx];
                idx++;
        }
}

/* Subtract every byte and I/O counter of @src from the matching one in @dst. */
static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] -= src->bytes[idx];
                dst->ios[idx] -= src->ios[idx];
                idx++;
        }
}

static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
                                struct blkg_iostat *last)
{
        struct blkg_iostat delta;
        unsigned long flags;

        /* propagate percpu delta to global */
        flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
        blkg_iostat_set(&delta, cur);
        blkg_iostat_sub(&delta, last);
        blkg_iostat_add(&blkg->iostat.cur, &delta);
        blkg_iostat_add(last, &delta);
        u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}

/*
 * Flush the iostat sets queued on @blkcg's lockless list for @cpu.
 *
 * Every queued per-cpu set has its delta folded into the owning blkg's
 * aggregated iostat, and the blkg's own delta is then propagated to its
 * parent blkg unless that parent is the root.  A parent that received a
 * delta is queued on its own blkcg's list for @cpu so that a later flush of
 * that blkcg can carry the delta further up.
 */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
        struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
        struct llist_node *lnode;
        struct blkg_iostat_set *bisc, *next_bisc;
        unsigned long flags;

        rcu_read_lock();

        /* Detach the whole list at once; nothing to do if it is empty. */
        lnode = llist_del_all(lhead);
        if (!lnode)
                goto out;

        /*
         * For covering concurrent parent blkg update from blkg_release().
         *
         * When flushing from cgroup, the subsystem rstat lock is always held,
         * so this lock won't cause contention most of time.
         */
        raw_spin_lock_irqsave(&blkg_stat_lock, flags);

        /*
         * Iterate only the iostat_cpu's queued in the lockless list.
         */
        llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
                struct blkcg_gq *blkg = bisc->blkg;
                struct blkcg_gq *parent = blkg->parent;
                struct blkg_iostat cur;
                unsigned int seq;

                /*
                 * Order assignment of `next_bisc` from `bisc->lnode.next` in
                 * llist_for_each_entry_safe and clearing `bisc->lqueued` for
                 * avoiding to assign `next_bisc` with new next pointer added
                 * in blk_cgroup_bio_start() in case of re-ordering.
                 *
                 * The pair barrier is implied in llist_add() in blk_cgroup_bio_start().
                 */
                smp_mb();

                WRITE_ONCE(bisc->lqueued, false);
                /*
                 * The entry is the blkg's aggregated iostat itself (queued
                 * by the parent-propagation step below), not a per-cpu set.
                 */
                if (bisc == &blkg->iostat)
                        goto propagate_up; /* propagate up to parent only */

                /* fetch the current per-cpu values */
                do {
                        seq = u64_stats_fetch_begin(&bisc->sync);
                        blkg_iostat_set(&cur, &bisc->cur);
                } while (u64_stats_fetch_retry(&bisc->sync, seq));

                blkcg_iostat_update(blkg, &cur, &bisc->last);

propagate_up:
                /* propagate global delta to parent (unless that's root) */
                if (parent && parent->parent) {
                        blkcg_iostat_update(parent, &blkg->iostat.cur,
                                            &blkg->iostat.last);
                        /*
                         * Queue parent->iostat to its blkcg's lockless
                         * list to propagate up to the grandparent if the
                         * iostat hasn't been queued yet.
                         */
                        if (!parent->iostat.lqueued) {
                                struct llist_head *plhead;

                                plhead = per_cpu_ptr(parent->blkcg->lhead, cpu);
                                llist_add(&parent->iostat.lnode, plhead);
                                parent->iostat.lqueued = true;
                        }
                }
        }
        raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
out:
        rcu_read_unlock();
}

/*
 * css_rstat_flush callback of the io controller: flush @cpu's queued iostat
 * for the blkcg behind @css, except for the root cgroup.
 */
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
        /* Root-level stats are sourced from system-wide IO stats */
        if (!cgroup_parent(css->cgroup))
                return;

        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
}

/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined. For that reason, css_rstat_flush in
 * blkcg_print_stat does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible. For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
 */
static void blkcg_fill_root_iostats(void)
{
        struct class_dev_iter iter;
        struct device *dev;

        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
        while ((dev = class_dev_iter_next(&iter))) {
                struct block_device *bdev = dev_to_bdev(dev);
                struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
                struct blkg_iostat tmp;
                int cpu;
                unsigned long flags;

                memset(&tmp, 0, sizeof(tmp));
                for_each_possible_cpu(cpu) {
                        struct disk_stats *cpu_dkstats;

                        cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
                        tmp.ios[BLKG_IOSTAT_READ] +=
                                cpu_dkstats->ios[STAT_READ];
                        tmp.ios[BLKG_IOSTAT_WRITE] +=
                                cpu_dkstats->ios[STAT_WRITE];
                        tmp.ios[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->ios[STAT_DISCARD];
                        // convert sectors to bytes
                        tmp.bytes[BLKG_IOSTAT_READ] +=
                                cpu_dkstats->sectors[STAT_READ] << 9;
                        tmp.bytes[BLKG_IOSTAT_WRITE] +=
                                cpu_dkstats->sectors[STAT_WRITE] << 9;
                        tmp.bytes[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->sectors[STAT_DISCARD] << 9;
                }

                flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
                blkg_iostat_set(&blkg->iostat.cur, &tmp);
                u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
        }
        class_dev_iter_exit(&iter);
}

/*
 * Print one line of statistics for @blkg to @s: the device name, the iostat
 * counters (only when any read/write counter is non-zero), optional delay
 * debug info, and whatever each policy's pd_stat_fn adds.  Nothing is
 * printed for an offline blkg or one whose device has no name.
 */
static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
{
        struct blkg_iostat_set *bis = &blkg->iostat;
        u64 rbytes, wbytes, rios, wios, dbytes, dios;
        const char *dname;
        unsigned seq;
        int plid;

        if (!blkg->online)
                return;

        dname = blkg_dev_name(blkg);
        if (!dname)
                return;

        seq_printf(s, "%s ", dname);

        /* Take a consistent snapshot of the counters. */
        do {
                seq = u64_stats_fetch_begin(&bis->sync);

                rios = bis->cur.ios[BLKG_IOSTAT_READ];
                wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
                dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
                rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
                wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
                dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
        } while (u64_stats_fetch_retry(&bis->sync, seq));

        if (rbytes || wbytes || rios || wios)
                seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
                        rbytes, wbytes, rios, wios,
                        dbytes, dios);

        if (blkcg_debug_stats && atomic_read(&blkg->use_delay))
                seq_printf(s, " use_delay=%d delay_nsec=%llu",
                        atomic_read(&blkg->use_delay),
                        atomic64_read(&blkg->delay_nsec));

        /* Let every policy with data on this blkg append its own stats. */
        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                struct blkcg_policy *pol = blkcg_policy[plid];

                if (blkg->pd[plid] && pol->pd_stat_fn)
                        pol->pd_stat_fn(blkg->pd[plid], s);
        }

        seq_puts(s, "\n");
}

/*
 * seq_show handler of the "stat" file: bring the stats up to date and print
 * one line per blkg of the blkcg behind @sf.  Always returns 0.
 */
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
        struct blkcg_gq *blkg;

        /* The root cgroup is filled from disk stats rather than flushed. */
        if (seq_css(sf)->parent)
                css_rstat_flush(&blkcg->css);
        else
                blkcg_fill_root_iostats();

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                spin_lock_irq(&blkg->q->queue_lock);
                blkcg_print_one_stat(blkg, sf);
                spin_unlock_irq(&blkg->q->queue_lock);
        }
        rcu_read_unlock();

        return 0;
}

/* Control files installed as io_cgrp_subsys.dfl_cftypes. */
static struct cftype blkcg_files[] = {
        {
                /* "stat" is rendered by blkcg_print_stat(). */
                .name = "stat",
                .seq_show = blkcg_print_stat,
        },
        { }        /* terminate */
};

/* Control files installed as io_cgrp_subsys.legacy_cftypes. */
static struct cftype blkcg_legacy_files[] = {
        {
                /* Writing "reset_stats" invokes blkcg_reset_stats(). */
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
        },
        { }        /* terminate */
};

#ifdef CONFIG_CGROUP_WRITEBACK
/* Return the cgwb list head embedded in the blkcg behind @css. */
struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
{
        struct list_head *cgwbs = &css_to_blkcg(css)->cgwb_list;

        return cgwbs;
}
#endif

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * May sleep (it calls cond_resched() between attempts).
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
        might_sleep();

        spin_lock_irq(&blkcg->lock);

        /* Keep taking the first blkg until the list has been emptied. */
        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkcg_gq, blkcg_node);
                struct request_queue *q = blkg->q;

                /*
                 * The queue lock is only tried, never waited for, because
                 * it is being taken in the reverse of the usual order.
                 */
                if (need_resched() || !spin_trylock(&q->queue_lock)) {
                        /*
                         * Given that the system can accumulate a huge number
                         * of blkgs in pathological cases, check to see if we
                         * need to reschedule to avoid a softlockup.
                         */
                        spin_unlock_irq(&blkcg->lock);
                        cond_resched();
                        spin_lock_irq(&blkcg->lock);
                        continue;
                }

                blkg_destroy(blkg);
                spin_unlock(&q->queue_lock);
        }

        spin_unlock_irq(&blkcg->lock);
}

/**
 * blkcg_pin_online - pin online state
 * @blkcg_css: blkcg of interest
 *
 * While pinned, a blkcg is kept online.  This is primarily used to
 * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
 * while an associated cgwb is still active.
 */
void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
{
        refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
}

/**
 * blkcg_unpin_online - unpin online state
 * @blkcg_css: blkcg of interest
 *
 * This is primarily used to impedance-match blkg and cgwb lifetimes so
 * that blkg doesn't go offline while an associated cgwb is still active.
 * Once the pin count of a blkcg drops to zero, all active cgwbs have
 * finished, so its destruction continues with blkcg_destroy_blkgs() and
 * the pin on its parent is dropped in turn, walking up towards the root.
 */
void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
{
        struct blkcg *blkcg = css_to_blkcg(blkcg_css);

        for (;;) {
                struct blkcg *parent;

                /* Other pins remain: this blkcg stays online. */
                if (!refcount_dec_and_test(&blkcg->online_pin))
                        return;

                /* Look up the parent before the blkgs are torn down. */
                parent = blkcg_parent(blkcg);
                blkcg_destroy_blkgs(blkcg);

                blkcg = parent;
                if (!blkcg)
                        return;
        }
}

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see the three-stage destruction description
 * above blkcg_destroy_blkgs()).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
        /* this prevents anyone from attaching or migrating to this blkcg */
        wb_blkcg_offline(css);

        /*
         * put the base online pin (set to 1 in blkcg_css_alloc()) allowing
         * step 2 to be triggered
         */
        blkcg_unpin_online(css);
}

/*
 * css_free callback: unlink the blkcg behind @css from the global list,
 * free its per-policy cgroup data, its per-cpu list heads and finally the
 * blkcg itself.
 */
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        int plid;

        mutex_lock(&blkcg_pol_mutex);

        list_del(&blkcg->all_blkcgs_node);

        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                if (!blkcg->cpd[plid])
                        continue;

                blkcg_policy[plid]->cpd_free_fn(blkcg->cpd[plid]);
        }

        mutex_unlock(&blkcg_pol_mutex);

        free_percpu(blkcg->lhead);
        kfree(blkcg);
}

/*
 * css_alloc callback: set up the blkcg for a new cgroup.
 *
 * The root cgroup (no @parent_css) uses the static blkcg_root; every other
 * cgroup gets a freshly zero-allocated blkcg.  Per-policy cgroup data is
 * allocated for each registered policy that provides cpd_alloc_fn.
 *
 * Returns the css embedded in the blkcg, or ERR_PTR(-ENOMEM) if any
 * allocation fails.
 */
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct blkcg *blkcg;
        int i;

        mutex_lock(&blkcg_pol_mutex);

        if (!parent_css) {
                blkcg = &blkcg_root;
        } else {
                blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
                if (!blkcg)
                        goto unlock;
        }

        if (init_blkcg_llists(blkcg))
                goto free_blkcg;

        for (i = 0; i < BLKCG_MAX_POLS ; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkcg_policy_data *cpd;

                /*
                 * If the policy hasn't been attached yet, wait for it
                 * to be attached before doing anything else. Otherwise,
                 * check if the policy requires any specific per-cgroup
                 * data: if it does, allocate and initialize it.
                 */
                if (!pol || !pol->cpd_alloc_fn)
                        continue;

                cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd)
                        goto free_pd_blkcg;

                blkcg->cpd[i] = cpd;
                cpd->blkcg = blkcg;
                cpd->plid = i;
        }

        spin_lock_init(&blkcg->lock);
        /* Base pin; dropped by blkcg_css_offline(). */
        refcount_set(&blkcg->online_pin, 1);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

        mutex_unlock(&blkcg_pol_mutex);
        return &blkcg->css;

free_pd_blkcg:
        /* Free only the cpds allocated before the failing index. */
        for (i--; i >= 0; i--)
                if (blkcg->cpd[i])
                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
        free_percpu(blkcg->lhead);
free_blkcg:
        /* blkcg_root is static and must not be handed to kfree(). */
        if (blkcg != &blkcg_root)
                kfree(blkcg);
unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ERR_PTR(-ENOMEM);
}

/*
 * css_online callback: pin the parent blkcg, if there is one, online.
 * Always returns 0.
 */
static int blkcg_css_online(struct cgroup_subsys_state *css)
{
        struct blkcg *parent = blkcg_parent(css_to_blkcg(css));

        /* Nothing to pin when there is no parent. */
        if (!parent)
                return 0;

        /*
         * blkcg_pin_online() delays blkcg offline so that blkgs don't go
         * offline while cgwbs are still active on them.  Pinning the parent
         * makes offline always happen towards the root.
         */
        blkcg_pin_online(&parent->css);
        return 0;
}

/* Initialize the blkcg related fields of @q: its blkcg mutex and blkg list. */
void blkg_init_queue(struct request_queue *q)
{
        mutex_init(&q->blkcg_mutex);
        INIT_LIST_HEAD(&q->blkg_list);
}

/*
 * Create the root blkg for @disk and record it in the queue.
 *
 * Returns 0 on success, -ENOMEM if the blkg cannot be allocated, or the
 * error reported by blkg_create().
 */
int blkcg_init_disk(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *new_blkg, *blkg;
        bool preloaded;
        int ret = 0;

        new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
        if (!new_blkg)
                return -ENOMEM;

        /* A failed preload is tolerated; just remember whether to end it. */
        preloaded = !radix_tree_preload(GFP_KERNEL);

        /*
         * Make sure the root blkg exists.  spin_lock_irq can serve as RCU
         * read-side critical section.
         */
        spin_lock_irq(&q->queue_lock);
        blkg = blkg_create(&blkcg_root, disk, new_blkg);
        if (IS_ERR(blkg))
                ret = PTR_ERR(blkg);
        else
                q->root_blkg = blkg;
        spin_unlock_irq(&q->queue_lock);

        if (preloaded)
                radix_tree_preload_end();

        return ret;
}

/*
 * Tear down the blkcg state of @disk: destroy all of its blkgs first, then
 * let blk-throttle clean up.
 */
void blkcg_exit_disk(struct gendisk *disk)
{
        blkg_destroy_all(disk);
        blk_throtl_exit(disk);
}

/*
 * Task exit callback of the io controller: drop the disk reference held in
 * @tsk->throttle_disk, if any, and leave the field NULL.
 */
static void blkcg_exit(struct task_struct *tsk)
{
        if (tsk->throttle_disk) {
                put_disk(tsk->throttle_disk);
                tsk->throttle_disk = NULL;
        }
}

/*
 * cgroup subsystem descriptor for the io controller (legacy name "blkio").
 * Wires up the blkcg css life-cycle callbacks, the rstat flush hook, the
 * task exit hook and the interface file tables for both hierarchies.
 */
struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_online = blkcg_css_online,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .css_rstat_flush = blkcg_rstat_flush,
        .dfl_cftypes = blkcg_files,
        .legacy_cftypes = blkcg_legacy_files,
        .legacy_name = "blkio",
        .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
         * together on the default hierarchy so that the owner cgroup can
         * be retrieved from writeback pages.
         */
        .depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @disk.  Requires %GFP_KERNEL context.  @disk goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @disk bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success (including when @pol is already
 * enabled on @disk), -EINVAL if @pol has no pd_alloc_fn/pd_free_fn, and
 * -ENOMEM if a policy data allocation fails.
 */
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
        struct request_queue *q = disk->queue;
        struct blkg_policy_data *pd_prealloc = NULL;
        struct blkcg_gq *blkg, *pinned_blkg = NULL;
        unsigned int memflags;
        int ret;

        if (blkcg_policy_enabled(q, pol))
                return 0;

        /*
         * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn,
         * for example, ioprio. Such policy will work on blkcg level, not disk
         * level, and don't need to be activated.
         */
        if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
                return -EINVAL;

        /* memflags is only written here and only read back under the same test */
        if (queue_is_mq(q))
                memflags = blk_mq_freeze_queue(q);
retry:
        spin_lock_irq(&q->queue_lock);

        /* blkg_list is pushed at the head, reverse walk to initialize parents first */
        list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
                struct blkg_policy_data *pd;

                /* already populated, e.g. by an earlier pass before a retry */
                if (blkg->pd[pol->plid])
                        continue;

                /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
                if (blkg == pinned_blkg) {
                        pd = pd_prealloc;
                        pd_prealloc = NULL;
                } else {
                        pd = pol->pd_alloc_fn(disk, blkg->blkcg,
                                              GFP_NOWAIT);
                }

                if (!pd) {
                        /*
                         * GFP_NOWAIT failed.  Free the existing one and
                         * prealloc for @blkg w/ GFP_KERNEL.
                         */
                        if (pinned_blkg)
                                blkg_put(pinned_blkg);
                        /* hold a ref on @blkg while it is used outside queue_lock */
                        blkg_get(blkg);
                        pinned_blkg = blkg;

                        spin_unlock_irq(&q->queue_lock);

                        if (pd_prealloc)
                                pol->pd_free_fn(pd_prealloc);
                        pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
                                                       GFP_KERNEL);
                        if (pd_prealloc)
                                goto retry;
                        else
                                goto enomem;
                }

                spin_lock(&blkg->blkcg->lock);

                pd->blkg = blkg;
                pd->plid = pol->plid;
                blkg->pd[pol->plid] = pd;

                if (pol->pd_init_fn)
                        pol->pd_init_fn(pd);

                if (pol->pd_online_fn)
                        pol->pd_online_fn(pd);
                pd->online = true;

                spin_unlock(&blkg->blkcg->lock);
        }

        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;

        spin_unlock_irq(&q->queue_lock);
out:
        /* common exit: undo the freeze and drop whatever is still held */
        if (queue_is_mq(q))
                blk_mq_unfreeze_queue(q, memflags);
        if (pinned_blkg)
                blkg_put(pinned_blkg);
        if (pd_prealloc)
                pol->pd_free_fn(pd_prealloc);
        return ret;

enomem:
        /* alloc failed, take down everything */
        spin_lock_irq(&q->queue_lock);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;
                struct blkg_policy_data *pd;

                spin_lock(&blkcg->lock);
                pd = blkg->pd[pol->plid];
                if (pd) {
                        if (pd->online && pol->pd_offline_fn)
                                pol->pd_offline_fn(pd);
                        pd->online = false;
                        pol->pd_free_fn(pd);
                        blkg->pd[pol->plid] = NULL;
                }
                spin_unlock(&blkcg->lock);
        }
        spin_unlock_irq(&q->queue_lock);
        ret = -ENOMEM;
        goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @disk: clear the policy's enabled bit on the queue and
 * offline/free the policy data of every blkg on the queue.  Does nothing if
 * @pol is not enabled on @disk.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct gendisk *disk,
                             const struct blkcg_policy *pol)
{
        struct request_queue *q = disk->queue;
        const int plid = pol->plid;
        struct blkcg_gq *blkg;
        unsigned int memflags;

        if (!blkcg_policy_enabled(q, pol))
                return;

        if (queue_is_mq(q))
                memflags = blk_mq_freeze_queue(q);

        mutex_lock(&q->blkcg_mutex);
        spin_lock_irq(&q->queue_lock);

        __clear_bit(plid, q->blkcg_pols);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                /* each blkg's policy data is torn down under its blkcg lock */
                spin_lock(&blkcg->lock);
                if (blkg->pd[plid]) {
                        if (blkg->pd[plid]->online && pol->pd_offline_fn)
                                pol->pd_offline_fn(blkg->pd[plid]);
                        pol->pd_free_fn(blkg->pd[plid]);
                        blkg->pd[plid] = NULL;
                }
                spin_unlock(&blkcg->lock);
        }

        spin_unlock_irq(&q->queue_lock);
        mutex_unlock(&q->blkcg_mutex);

        if (queue_is_mq(q))
                blk_mq_unfreeze_queue(q, memflags);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

static void blkcg_free_all_cpd(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;

        list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                if (blkcg->cpd[pol->plid]) {
                        pol->cpd_free_fn(blkcg->cpd[pol->plid]);
                        blkcg->cpd[pol->plid] = NULL;
                }
        }
}

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure:
 * -EINVAL if the alloc/free callbacks are not paired, -ENOSPC if no policy
 * slot is free, -ENOMEM if a cpd allocation fails.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;
        int i, ret;

        /*
         * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
         * without pd_alloc_fn/pd_free_fn can't be activated.
         */
        if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
            (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
                return -EINVAL;

        /* register_mutex is the outer lock and is held until the very end */
        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkcg_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS) {
                pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
                ret = -ENOSPC;
                goto err_unlock;
        }

        /* register @pol */
        pol->plid = i;
        blkcg_policy[pol->plid] = pol;

        /* allocate and install cpd's */
        if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                        struct blkcg_policy_data *cpd;

                        cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                        if (!cpd) {
                                ret = -ENOMEM;
                                goto err_free_cpds;
                        }

                        blkcg->cpd[pol->plid] = cpd;
                        cpd->blkcg = blkcg;
                        cpd->plid = pol->plid;
                }
        }

        /* blkcg_pol_mutex is dropped before the interface files are added */
        mutex_unlock(&blkcg_pol_mutex);

        /* everything is in place, add intf files for the new policy */
        if (pol->dfl_cftypes == pol->legacy_cftypes) {
                WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys,
                                           pol->dfl_cftypes));
        } else {
                WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
                                               pol->dfl_cftypes));
                WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
                                                  pol->legacy_cftypes));
        }
        mutex_unlock(&blkcg_pol_register_mutex);
        return 0;

err_free_cpds:
        /* undo the cpds installed so far and release the policy slot */
        if (pol->cpd_free_fn)
                blkcg_free_all_cpd(pol);

        blkcg_policy[pol->plid] = NULL;
err_unlock:
        mutex_unlock(&blkcg_pol_mutex);
        mutex_unlock(&blkcg_pol_register_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.  Warns and does nothing
 * if @pol is not the policy registered in slot @pol->plid.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
        mutex_lock(&blkcg_pol_register_mutex);

        if (!WARN_ON(blkcg_policy[pol->plid] != pol)) {
                /* kill the intf files first */
                if (pol->dfl_cftypes)
                        cgroup_rm_cftypes(pol->dfl_cftypes);
                if (pol->legacy_cftypes)
                        cgroup_rm_cftypes(pol->legacy_cftypes);

                /* remove cpds and unregister */
                mutex_lock(&blkcg_pol_mutex);

                if (pol->cpd_free_fn)
                        blkcg_free_all_cpd(pol);

                blkcg_policy[pol->plid] = NULL;

                mutex_unlock(&blkcg_pol_mutex);
        }

        mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 *
 * @blkg: blkg whose delay accounting is scaled
 * @now: current time in nanoseconds, compared against blkg->delay_start
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
        u64 old = atomic64_read(&blkg->delay_start);

        /* negative use_delay means no scaling, see blkcg_set_delay() */
        if (atomic_read(&blkg->use_delay) < 0)
                return;

        /*
         * We only want to scale down every second.  The idea here is that we
         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
         * time window.  We only want to throttle tasks for recent delay that
         * has occurred, in 1 second time windows since that's the maximum
         * things can be throttled.  We save the current delay window in
         * blkg->last_delay so we know what amount is still left to be charged
         * to the blkg from this point onward.  blkg->last_use keeps track of
         * the use_delay counter.  The idea is if we're unthrottling the blkg we
         * are ok with whatever is happening now, and we can take away more of
         * the accumulated delay as we've already throttled enough that
         * everybody is happy with their IO latencies.
         *
         * Only the caller whose cmpxchg moves delay_start from @old to @now
         * performs the scale-down below.
         */
        if (time_before64(old + NSEC_PER_SEC, now) &&
            atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
                u64 cur = atomic64_read(&blkg->delay_nsec);
                u64 sub = min_t(u64, blkg->last_delay, now - old);
                int cur_use = atomic_read(&blkg->use_delay);

                /*
                 * We've been unthrottled, subtract a larger chunk of our
                 * accumulated delay.
                 */
                if (cur_use < blkg->last_use)
                        sub = max_t(u64, sub, blkg->last_delay >> 1);

                /*
                 * This shouldn't happen, but handle it anyway.  Our delay_nsec
                 * should only ever be growing except here where we subtract out
                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
                 * rather not end up with negative numbers.
                 */
                if (unlikely(cur < sub)) {
                        atomic64_set(&blkg->delay_nsec, 0);
                        blkg->last_delay = 0;
                } else {
                        atomic64_sub(sub, &blkg->delay_nsec);
                        blkg->last_delay = cur - sub;
                }
                blkg->last_use = cur_use;
        }
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 *
 * @blkg: blkg to start the upward walk from
 * @use_memdelay: when true, the sleep is bracketed by psi_memstall_enter()
 *                and psi_memstall_leave()
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
        unsigned long pflags;
        /*
         * clamp is only read after the !delay_nsec check below, and every
         * assignment to delay_nsec in the loop also assigns clamp.
         */
        bool clamp;
        u64 now = blk_time_get_ns();
        u64 exp;
        u64 delay_nsec = 0;
        int tok;

        /* visit every blkg that has a parent, keeping the largest delay seen */
        while (blkg->parent) {
                int use_delay = atomic_read(&blkg->use_delay);

                if (use_delay) {
                        u64 this_delay;

                        blkcg_scale_delay(blkg, now);
                        this_delay = atomic64_read(&blkg->delay_nsec);
                        if (this_delay > delay_nsec) {
                                delay_nsec = this_delay;
                                clamp = use_delay > 0;
                        }
                }
                blkg = blkg->parent;
        }

        if (!delay_nsec)
                return;

        /*
         * Let's not sleep for all eternity if we've amassed a huge delay.
         * Swapping or metadata IO can accumulate 10's of seconds worth of
         * delay, and we want userspace to be able to do _something_ so cap the
         * delays at 0.25s. If there's 10's of seconds worth of delay then the
         * tasks will be delayed for 0.25 second for every syscall. If
         * blkcg_set_delay() was used as indicated by negative use_delay, the
         * caller is responsible for regulating the range.
         */
        if (clamp)
                delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

        if (use_memdelay)
                psi_memstall_enter(&pflags);

        /* sleep until the absolute expiry unless a fatal signal is pending */
        exp = ktime_add_ns(now, delay_nsec);
        tok = io_schedule_prepare();
        do {
                __set_current_state(TASK_KILLABLE);
                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
                        break;
        } while (!fatal_signal_pending(current));
        io_schedule_finish(tok);

        if (use_memdelay)
                psi_memstall_leave(&pflags);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_disk is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 *
 * The disk reference stored in current->throttle_disk by
 * blkcg_schedule_throttle() is taken over here and is dropped on every path,
 * whether or not the task ends up being throttled.
 */
void blkcg_maybe_throttle_current(void)
{
        struct gendisk *disk = current->throttle_disk;
        struct blkcg *blkcg;
        struct blkcg_gq *blkg;
        bool use_memdelay = current->use_memdelay;

        if (!disk)
                return;

        /* From here on the reference held through throttle_disk is ours. */
        current->throttle_disk = NULL;
        current->use_memdelay = false;

        rcu_read_lock();
        blkcg = css_to_blkcg(blkcg_css());
        if (!blkcg)
                goto out;
        blkg = blkg_lookup(blkcg, disk->queue);
        if (!blkg)
                goto out;
        if (!blkg_tryget(blkg))
                goto out;
        rcu_read_unlock();

        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
        blkg_put(blkg);
        put_disk(disk);
        return;
out:
        rcu_read_unlock();
        /*
         * Nothing to throttle, but the disk reference must still be released;
         * throttle_disk was cleared above so nobody else will put it.
         */
        put_disk(disk);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @disk: disk to throttle
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a block_device at that point.  This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * re-set at the next time we see the task.
 *
 * Kernel threads (PF_KTHREAD) are never scheduled for throttling, and a disk
 * marked GD_DEAD is not recorded.
 */
void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{
        struct gendisk *prev;

        if (unlikely(current->flags & PF_KTHREAD))
                return;

        prev = current->throttle_disk;
        if (prev != disk) {
                if (test_bit(GD_DEAD, &disk->state))
                        return;

                /* take the new reference before dropping the old one */
                get_device(disk_to_dev(disk));
                if (prev)
                        put_disk(prev);
                current->throttle_disk = disk;
        }

        /* a request for memdelay is sticky: it is only ever turned on here */
        if (use_memdelay)
                current->use_memdelay = true;
        set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.  Warns
 * once and charges nothing if @blkg->use_delay is negative.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
        int use_delay = atomic_read(&blkg->use_delay);

        if (WARN_ON_ONCE(use_delay < 0))
                return;

        /* age the existing accumulation first, then charge the new delay */
        blkcg_scale_delay(blkg, now);
        atomic64_add(delta, &blkg->delay_nsec);
}

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid.  This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
                struct cgroup_subsys_state *css)
{
        struct blkcg_gq *blkg;

        rcu_read_lock();
        /*
         * Start at the blkg for (@css, disk of @bio) and climb towards the
         * root until a reference can be taken; the loop leaves @blkg NULL
         * when no blkg on the path could be pinned.
         */
        for (blkg = blkg_lookup_create(css_to_blkcg(css),
                                       bio->bi_bdev->bd_disk);
             blkg; blkg = blkg->parent) {
                if (blkg_tryget(blkg))
                        break;
        }
        rcu_read_unlock();

        return blkg;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking up
 * the blkg tree.  Therefore, the blkg associated can be anything between @blkg
 * and q->root_blkg.  This situation only happens when a cgroup is dying and
 * then the remaining bios will spill to the closest alive blkg.
 *
 * A %NULL @css, or a css without a parent, associates @bio with the queue's
 * root blkg directly.  Any blkg previously attached to @bio is put first.
 *
 * A reference will be taken on the blkg and will be released when @bio is
 * freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
                                 struct cgroup_subsys_state *css)
{
        if (bio->bi_blkg)
                blkg_put(bio->bi_blkg);

        if (!css || !css->parent) {
                blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
                bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
                return;
        }

        bio->bi_blkg = blkg_tryget_closest(bio, css);
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If a blkg is already associated, its css is reused and the association is
 * redone as the request_queue may have changed; otherwise the css returned
 * by blkcg_css() is used.  Passthrough bios are left untouched.
 */
void bio_associate_blkg(struct bio *bio)
{
        struct cgroup_subsys_state *css;

        if (blk_op_is_passthrough(bio->bi_opf))
                return;

        rcu_read_lock();
        css = bio->bi_blkg ? bio_blkcg_css(bio) : blkcg_css();
        bio_associate_blkg_from_css(bio, css);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 *
 * Does nothing when @src has no blkg; otherwise @dst is associated using the
 * css of @src.
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
        if (!src->bi_blkg)
                return;

        bio_associate_blkg_from_css(dst, bio_blkcg_css(src));
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

/*
 * Map @bio's operation to a BLKG_IOSTAT_* index.  Discard is tested before
 * write; anything that is neither is counted as a read.
 */
static int blk_cgroup_io_type(struct bio *bio)
{
        int type = BLKG_IOSTAT_READ;

        if (op_is_discard(bio->bi_opf))
                type = BLKG_IOSTAT_DISCARD;
        else if (op_is_write(bio->bi_opf))
                type = BLKG_IOSTAT_WRITE;

        return type;
}

/*
 * Account the start of @bio in the per-cpu iostat of its blkg.
 *
 * bio->bi_blkg is dereferenced unconditionally, so @bio must already be
 * associated with a blkg.  Nothing is accounted when the io controller is
 * not on the default hierarchy or when the blkg belongs to the root cgroup.
 */
void blk_cgroup_bio_start(struct bio *bio)
{
        struct blkcg *blkcg = bio->bi_blkg->blkcg;
        int rwd = blk_cgroup_io_type(bio), cpu;
        struct blkg_iostat_set *bis;
        unsigned long flags;

        if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
                return;

        /* Root-level stats are sourced from system-wide IO stats */
        if (!cgroup_parent(blkcg->css.cgroup))
                return;

        /* get_cpu()/put_cpu() bracket all accesses to this cpu's counters */
        cpu = get_cpu();
        bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
        flags = u64_stats_update_begin_irqsave(&bis->sync);

        /*
         * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
         * bio and we would have already accounted for the size of the bio.
         */
        if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
                bio_set_flag(bio, BIO_CGROUP_ACCT);
                bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
        }
        /* the io count, unlike the byte count, is bumped on every call */
        bis->cur.ios[rwd]++;

        /*
         * If the iostat_cpu isn't in a lockless list, put it into the
         * list to indicate that a stat update is pending.
         */
        if (!READ_ONCE(bis->lqueued)) {
                struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);

                llist_add(&bis->lnode, lhead);
                WRITE_ONCE(bis->lqueued, true);
        }

        u64_stats_update_end_irqrestore(&bis->sync, flags);
        css_rstat_updated(&blkcg->css, cpu);
        put_cpu();
}

/*
 * Report whether the blkcg returned by blkcg_css(), or any of its ancestors,
 * has a non-zero congestion_count.  The walk runs under rcu_read_lock().
 */
bool blk_cgroup_congested(void)
{
        struct blkcg *blkcg;
        bool congested = false;

        rcu_read_lock();
        blkcg = css_to_blkcg(blkcg_css());
        while (blkcg) {
                if (atomic_read(&blkcg->congestion_count)) {
                        congested = true;
                        break;
                }
                blkcg = blkcg_parent(blkcg);
        }
        rcu_read_unlock();

        return congested;
}

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");




















































































































































































































































































    4 

    4 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H

#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>

#include <trace/events/tlb.h>

#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/gsseg.h>
#include <asm/desc.h>

extern atomic64_t last_mm_ctx_id;

#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
void cr4_update_pce(void *ignored);
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
 * ldt_structs can be allocated, used, and freed, but they are never
 * modified while live.
 */
struct ldt_struct {
        /*
         * Xen requires page-aligned LDTs with special permissions.  This is
         * needed to prevent us from installing evil descriptors such as
         * call gates.  On native, we could merge the ldt_struct and LDT
         * allocations, but it's not worth trying to optimize.
         */
        struct desc_struct        *entries;
        /* number of descriptors in @entries */
        unsigned int                nr_entries;

        /*
         * If PTI is in use, then the entries array is not mapped while we're
         * in user mode.  The whole array will be aliased at the address
         * given by ldt_slot_va(slot).  We use two slots so that we can allocate
         * and map, and enable a new LDT without invalidating the mapping
         * of an older, still-in-use LDT.
         *
         * slot will be -1 if this LDT doesn't have an alias mapping.
         */
        int                        slot;
};

/*
 * Put the LDT part of a new mm's context into its initial state: no LDT
 * installed and a freshly initialised ldt_usr_sem.
 */
static inline void init_new_context_ldt(struct mm_struct *mm)
{
        init_rwsem(&mm->context.ldt_usr_sem);
        mm->context.ldt = NULL;
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else        /* CONFIG_MODIFY_LDT_SYSCALL */
/* No-op stand-ins used when CONFIG_MODIFY_LDT_SYSCALL is not enabled. */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
        /* nothing to duplicate, so this always reports success */
        return 0;
}
static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
extern void load_mm_ldt(struct mm_struct *mm);
extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
#else
/* Fallback without CONFIG_MODIFY_LDT_SYSCALL: @mm is ignored, the LDT is cleared. */
static inline void load_mm_ldt(struct mm_struct *mm)
{
        clear_LDT();
}
/*
 * Fallback without CONFIG_MODIFY_LDT_SYSCALL: nothing is switched; the only
 * effect is the DEBUG_LOCKS_WARN_ON() check that we are not preemptible.
 */
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
        DEBUG_LOCKS_WARN_ON(preemptible());
}
#endif

#ifdef CONFIG_ADDRESS_MASKING
/* Return a single READ_ONCE() snapshot of @mm's LAM CR3 mask. */
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
        unsigned long mask;

        /*
         * When switch_mm_irqs_off() is called for a kthread, it may race with
         * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two
         * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it
         * reads a single value for both.
         */
        mask = READ_ONCE(mm->context.lam_cr3_mask);

        return mask;
}

/* Copy the LAM state (untag mask and CR3 mask) from @oldmm into @mm. */
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
        mm->context.untag_mask = oldmm->context.untag_mask;
        mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
}

#define mm_untag_mask mm_untag_mask
/* Return the untag mask currently stored in @mm's context. */
static inline unsigned long mm_untag_mask(struct mm_struct *mm)
{
        unsigned long mask = mm->context.untag_mask;

        return mask;
}

/* Reset @mm's untag mask to all ones. */
static inline void mm_reset_untag_mask(struct mm_struct *mm)
{
        mm->context.untag_mask = ~0UL;
}

#define arch_pgtable_dma_compat arch_pgtable_dma_compat
static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
{
        return !mm_lam_cr3_mask(mm) ||
                test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
}
#else

/* Stubs for builds without CONFIG_ADDRESS_MASKING: the LAM mask is always 0. */
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
        return 0;
}

/* No LAM state to copy in this configuration. */
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
}

/* No untag mask to reset in this configuration. */
static inline void mm_reset_untag_mask(struct mm_struct *mm)
{
}
#endif

#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);

#define mm_init_global_asid mm_init_global_asid
extern void mm_init_global_asid(struct mm_struct *mm);

extern void mm_free_global_asid(struct mm_struct *mm);

/*
 * Init a new mm.  Used on mm copies, like at fork()
 * and on mm's that are brand-new, like at execve().
 *
 * @tsk is not used here.  Always returns 0.
 */
#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
{
        mutex_init(&mm->context.lock);

        /* each mm gets the next value of the global last_mm_ctx_id counter */
        mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
        atomic64_set(&mm->context.tlb_gen, 0);
        mm->context.next_trim_cpumask = jiffies + HZ;

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* pkey 0 is the default and allocated implicitly */
                mm->context.pkey_allocation_map = 0x1;
                /* -1 means unallocated or invalid */
                mm->context.execute_only_pkey = -1;
        }
#endif

        mm_init_global_asid(mm);
        mm_reset_untag_mask(mm);
        init_new_context_ldt(mm);
        return 0;
}

/*
 * Tear down the arch context of @mm: the LDT state first, then the global
 * ASID via mm_free_global_asid().
 */
#define destroy_context destroy_context
static inline void destroy_context(struct mm_struct *mm)
{
        destroy_context_ldt(mm);
        mm_free_global_asid(mm);
}

extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                      struct task_struct *tsk);

extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                               struct task_struct *tsk);
#define switch_mm_irqs_off switch_mm_irqs_off

/*
 * activate_mm - notify paravirt of @next and switch from @prev to @next
 * via switch_mm_irqs_off() with a NULL task.
 *
 * The body is a plain do { } while (0) with no trailing semicolon, so the
 * macro expands to exactly one statement once the caller adds its own ';'
 * and can be used as the body of an unbraced if/else.
 */
#define activate_mm(prev, next)                        \
do {                                                \
        paravirt_enter_mmap(next);                \
        switch_mm_irqs_off((prev), (next), NULL);        \
} while (0)

/*
 * deactivate_mm - reset per-task segment state; @mm is unused in both
 * variants.  The 32-bit variant loads 0 into gs.  The other variant calls
 * shstk_free(@tsk), then loads 0 into the gs index and into fs.
 */
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        loadsegment(gs, 0);                        \
} while (0)
#else
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        shstk_free(tsk);                        \
        load_gs_index(0);                        \
        loadsegment(fs, 0);                        \
} while (0)
#endif

/*
 * Copy the protection-key bookkeeping (allocation map and execute-only
 * pkey) from @oldmm into @mm.
 *
 * Does nothing when protection-key support is not built in or when the
 * OSPKE CPU feature is not enabled.
 */
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* Mirror the pkey state of the source mm. */
                mm->context.execute_only_pkey   = oldmm->context.execute_only_pkey;
                mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
        }
#endif
}

/*
 * Arch hook for duplicating an address space from @oldmm into @mm.
 *
 * Copies the pkey state, calls paravirt_enter_mmap() on the new mm,
 * duplicates the LAM state and finally the LDT context.
 *
 * Returns whatever ldt_dup_context() returns.
 */
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        int ldt_ret;

        arch_dup_pkeys(oldmm, mm);
        paravirt_enter_mmap(mm);
        dup_lam(oldmm, mm);

        /* The LDT copy is the only step here that reports a result. */
        ldt_ret = ldt_dup_context(oldmm, mm);
        return ldt_ret;
}

/*
 * Arch hook for tearing down the mappings of @mm: calls the paravirt
 * exit-mmap hook and then the LDT exit-mmap hook, in that order.
 */
static inline void arch_exit_mmap(struct mm_struct *mm)
{
        paravirt_arch_exit_mmap(mm);
        ldt_arch_exit_mmap(mm);
}

#ifdef CONFIG_X86_64
/*
 * Report whether @mm is a 64-bit mm.
 *
 * On a 64-bit kernel an mm counts as 64-bit unless IA32 emulation is
 * built in and the mm has its MM_CONTEXT_UPROBE_IA32 flag set.
 */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        /* Without IA32 emulation there is nothing but 64-bit mms. */
        if (!IS_ENABLED(CONFIG_IA32_EMULATION))
                return true;

        return !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
}
#else
/* On a 32-bit kernel no mm is 64-bit. */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        return false;
}
#endif

/* Return true if the MM_CONTEXT_NOTRACK flag is set in @mm's context flags. */
static inline bool is_notrack_mm(struct mm_struct *mm)
{
        return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags);
}

/*
 * Set the MM_CONTEXT_NOTRACK flag in @mm's context flags.  No helper for
 * clearing the flag is visible in this part of the header.
 */
static inline void set_notrack_mm(struct mm_struct *mm)
{
        set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags);
}

/*
 * Decide whether protection keys permit the requested access to @vma.
 *
 * Protection keys are only enforced for the current process: there is
 * effectively no access to another process's PKRU, nor any way to tell
 * *which* PKRU of a threaded process would apply.  Consequently nothing
 * is enforced when the VMA does not belong to the current mm or when
 * running in a kernel thread.
 *
 * @vma:     the VMA being accessed
 * @write:   true for a write access
 * @execute: true for an instruction fetch
 * @foreign: true if the caller already knows the access is a foreign one
 *
 * Returns true if the access is allowed.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
                bool write, bool execute, bool foreign)
{
        /*
         * Nothing to enforce for instruction fetches (pkeys never affect
         * them) or for VMAs that are not one of this process's own.
         */
        if (execute || foreign || vma_is_foreign(vma))
                return true;

        return __pkru_allows_pkey(vma_pkey(vma), write);
}

/*
 * Defined out of line.
 * NOTE(review): presumably returns the current CR3 value -- confirm at
 * the definition.
 */
unsigned long __get_current_cr3_fast(void);

#include <asm-generic/mmu_context.h>

/*
 * Temporary-mm helpers, defined out of line.  use_temporary_mm() takes the
 * mm to switch to and returns an mm pointer; unuse_temporary_mm() takes a
 * previous mm.
 * NOTE(review): presumably the pointer returned by use_temporary_mm() is
 * the previously loaded mm and is meant to be handed back to
 * unuse_temporary_mm() -- confirm at the definitions.
 */
extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm);
extern void unuse_temporary_mm(struct mm_struct *prev_mm);

#endif /* _ASM_X86_MMU_CONTEXT_H */







































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sched

#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SCHED_H

#include <linux/kthread.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>

/*
 * Tracepoint for calling kthread_stop, performed to end a kthread:
 *
 * Records the comm and pid of the task @t passed to the tracepoint.
 */
TRACE_EVENT(sched_kthread_stop,

        TP_PROTO(struct task_struct *t),

        TP_ARGS(t),

        TP_STRUCT__entry(
                __string(        comm,        t->comm                )
                __field(        pid_t,        pid                )
        ),

        TP_fast_assign(
                __assign_str(comm);
                __entry->pid        = t->pid;
        ),

        TP_printk("comm=%s pid=%d", __get_str(comm), __entry->pid)
);

/*
 * Tracepoint for the return value of the kthread stopping:
 *
 * Records only the integer @ret handed to the tracepoint.
 */
TRACE_EVENT(sched_kthread_stop_ret,

        TP_PROTO(int ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(        int,        ret        )
        ),

        TP_fast_assign(
                __entry->ret        = ret;
        ),

        TP_printk("ret=%d", __entry->ret)
);

/**
 * sched_kthread_work_queue_work - called when a work gets queued
 * @worker:        pointer to the kthread_worker
 * @work:        pointer to struct kthread_work
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued (ie: once the delay has been
 * reached).
 *
 * The record stores the @work and @worker pointers together with the
 * work's callback (work->func), which is printed symbolically via %ps.
 */
TRACE_EVENT(sched_kthread_work_queue_work,

        TP_PROTO(struct kthread_worker *worker,
                 struct kthread_work *work),

        TP_ARGS(worker, work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
                __field( void *,        worker)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
                __entry->worker                = worker;
        ),

        TP_printk("work struct=%p function=%ps worker=%p",
                  __entry->work, __entry->function, __entry->worker)
);

/**
 * sched_kthread_work_execute_start - called immediately before the work callback
 * @work:        pointer to struct kthread_work
 *
 * Allows to track kthread work execution.
 *
 * The record stores the @work pointer and its callback, read from
 * work->func at the time the event fires.
 */
TRACE_EVENT(sched_kthread_work_execute_start,

        TP_PROTO(struct kthread_work *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * sched_kthread_work_execute_end - called immediately after the work callback
 * @work:        pointer to struct kthread_work
 * @function:   pointer to worker function
 *
 * Allows to track kthread work execution.
 *
 * Unlike the execute_start event, the callback is passed in explicitly as
 * @function; this event does not dereference @work.
 */
TRACE_EVENT(sched_kthread_work_execute_end,

        TP_PROTO(struct kthread_work *work, kthread_work_func_t function),

        TP_ARGS(work, function),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = function;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/*
 * Tracepoint for waking up a task:
 *
 * Event class shared by sched_waking, sched_wakeup and sched_wakeup_new.
 * Records the comm (fixed TASK_COMM_LEN array), pid and prio of task @p
 * and the CPU reported by task_cpu(p) as target_cpu.
 *
 * NOTE(review): __perf_task(p) presumably tells perf which task the
 * sample belongs to -- confirm against the perf trace macros.
 */
DECLARE_EVENT_CLASS(sched_wakeup_template,

        TP_PROTO(struct task_struct *p),

        TP_ARGS(__perf_task(p)),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
                __field(        int,        target_cpu                )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
                __entry->target_cpu        = task_cpu(p);
        ),

        TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d",
                  __entry->comm, __entry->pid, __entry->prio,
                  __entry->target_cpu)
);

/*
 * Tracepoint called when waking a task; this tracepoint is guaranteed to be
 * called from the waking context.
 *
 * Uses the sched_wakeup_template record (comm, pid, prio, target_cpu).
 */
DEFINE_EVENT(sched_wakeup_template, sched_waking,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING.
 * It is not always called from the waking context.
 *
 * Uses the sched_wakeup_template record (comm, pid, prio, target_cpu).
 */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint for waking up a new task:
 *
 * Uses the sched_wakeup_template record (comm, pid, prio, target_cpu).
 */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

#ifdef CREATE_TRACE_POINTS
/*
 * Compute the prev_state value stored by the sched_switch tracepoint.
 *
 * @preempt:    true if the switch is a preemption
 * @prev_state: state of the outgoing task
 * @p:          the outgoing task; must be current
 *
 * Returns TASK_REPORT_MAX for a preemption, 0 for a running task, and
 * otherwise the single state bit matching the reported state index.
 */
static inline long __trace_sched_switch_state(bool preempt,
                                              unsigned int prev_state,
                                              struct task_struct *p)
{
        unsigned int idx;

        BUG_ON(p != current);

        /*
         * A preempted task is always RUNNING, whatever its state field
         * says: it would not still be queued otherwise.
         */
        if (preempt)
                return TASK_REPORT_MAX;

        /*
         * The state index is fls()-based: 0 stands for TASK_RUNNING and
         * any other value is one more than the bit position of the state.
         */
        idx = __task_state_index(prev_state, p->exit_state);
        if (!idx)
                return idx;

        /* Turn the index back into the matching task->state bit. */
        return (unsigned int)(1 << (idx - 1));
}
#endif /* CREATE_TRACE_POINTS */

/*
 * Tracepoint for task switches, performed by the scheduler:
 *
 * Records comm, pid and prio for both the outgoing (@prev) and incoming
 * (@next) task, plus the outgoing task's state as computed by
 * __trace_sched_switch_state().
 *
 * In the printed form the state is shown as flag letters, or "R" when
 * none of the bits below TASK_REPORT_MAX are set; a trailing "+" is added
 * when the TASK_REPORT_MAX bit is set, which is the value stored for a
 * preemption.
 */
TRACE_EVENT(sched_switch,

        TP_PROTO(bool preempt,
                 struct task_struct *prev,
                 struct task_struct *next,
                 unsigned int prev_state),

        TP_ARGS(preempt, prev, next, prev_state),

        TP_STRUCT__entry(
                __array(        char,        prev_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        prev_pid                        )
                __field(        int,        prev_prio                        )
                __field(        long,        prev_state                        )
                __array(        char,        next_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        next_pid                        )
                __field(        int,        next_prio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                __entry->prev_pid        = prev->pid;
                __entry->prev_prio        = prev->prio;
                __entry->prev_state        = __trace_sched_switch_state(preempt, prev_state, prev);
                memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                __entry->next_pid        = next->pid;
                __entry->next_prio        = next->prio;
                /* XXX SCHED_DEADLINE */
        ),

        TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
                __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,

                /* State letters, or "R" when no reportable bit is set. */
                (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
                  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
                                { TASK_INTERRUPTIBLE, "S" },
                                { TASK_UNINTERRUPTIBLE, "D" },
                                { __TASK_STOPPED, "T" },
                                { __TASK_TRACED, "t" },
                                { EXIT_DEAD, "X" },
                                { EXIT_ZOMBIE, "Z" },
                                { TASK_PARKED, "P" },
                                { TASK_DEAD, "I" }) :
                  "R",

                /* "+" marks the TASK_REPORT_MAX bit (preempted). */
                __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
                __entry->next_comm, __entry->next_pid, __entry->next_prio)
);

/*
 * Tracepoint for a task being migrated:
 *
 * Records the comm, pid and prio of @p, the CPU reported by task_cpu(p)
 * when the event fires as orig_cpu, and the @dest_cpu argument.
 */
TRACE_EVENT(sched_migrate_task,

        TP_PROTO(struct task_struct *p, int dest_cpu),

        TP_ARGS(p, dest_cpu),

        TP_STRUCT__entry(
                __string(        comm,        p->comm                )
                __field(        pid_t,        pid                )
                __field(        int,        prio                )
                __field(        int,        orig_cpu        )
                __field(        int,        dest_cpu        )
        ),

        TP_fast_assign(
                __assign_str(comm);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
                __entry->orig_cpu        = task_cpu(p);
                __entry->dest_cpu        = dest_cpu;
        ),

        TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d",
                  __get_str(comm), __entry->pid, __entry->prio,
                  __entry->orig_cpu, __entry->dest_cpu)
);

/*
 * Event class for per-task process events; used below by
 * sched_process_free and sched_wait_task.
 *
 * Records the comm, pid and prio of task @p.
 */
DECLARE_EVENT_CLASS(sched_process_template,

        TP_PROTO(struct task_struct *p),

        TP_ARGS(p),

        TP_STRUCT__entry(
                __string(        comm,        p->comm                )
                __field(        pid_t,        pid                )
                __field(        int,        prio                )
        ),

        TP_fast_assign(
                __assign_str(comm);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
        ),

        TP_printk("comm=%s pid=%d prio=%d",
                  __get_str(comm), __entry->pid, __entry->prio)
);

/*
 * Tracepoint for freeing a task:
 *
 * Uses the sched_process_template record (comm, pid, prio).
 */
DEFINE_EVENT(sched_process_template, sched_process_free,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint for a task exiting.
 * Note, it's a superset of sched_process_template and should be kept
 * compatible as much as possible. sched_process_exit has an extra
 * `group_dead` argument, so sched_process_template can't be used,
 * unfortunately, just like sched_migrate_task above.
 *
 * Records the comm (fixed TASK_COMM_LEN array), pid and prio of @p plus
 * the @group_dead flag, printed as "true" or "false".
 */
TRACE_EVENT(sched_process_exit,

        TP_PROTO(struct task_struct *p, bool group_dead),

        TP_ARGS(p, group_dead),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
                __field(        bool,        group_dead                )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
                __entry->group_dead        = group_dead;
        ),

        TP_printk("comm=%s pid=%d prio=%d group_dead=%s",
                  __entry->comm, __entry->pid, __entry->prio,
                  __entry->group_dead ? "true" : "false"
        )
);

/*
 * Tracepoint for waiting on task to unschedule:
 *
 * Uses the sched_process_template record (comm, pid, prio) of the task
 * @p handed to the tracepoint.
 */
DEFINE_EVENT(sched_process_template, sched_wait_task,
        TP_PROTO(struct task_struct *p),
        TP_ARGS(p));

/*
 * Tracepoint for a waiting task:
 *
 * The comm and prio fields describe the current task (the one doing the
 * wait), while the pid field is pid_nr() of the @pid argument.
 */
TRACE_EVENT(sched_process_wait,

        TP_PROTO(struct pid *pid),

        TP_ARGS(pid),

        TP_STRUCT__entry(
                __string(        comm,        current->comm                )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
        ),

        TP_fast_assign(
                __assign_str(comm);
                __entry->pid                = pid_nr(pid);
                __entry->prio                = current->prio; /* XXX SCHED_DEADLINE */
        ),

        TP_printk("comm=%s pid=%d prio=%d",
                  __get_str(comm), __entry->pid, __entry->prio)
);

/*
 * Tracepoint for kernel_clone:
 *
 * Records the comm and pid of both @parent and @child.  In the printed
 * form the parent's fields appear as plain "comm"/"pid" and the child's
 * as "child_comm"/"child_pid".
 */
TRACE_EVENT(sched_process_fork,

        TP_PROTO(struct task_struct *parent, struct task_struct *child),

        TP_ARGS(parent, child),

        TP_STRUCT__entry(
                __string(        parent_comm,        parent->comm        )
                __field(        pid_t,                parent_pid        )
                __string(        child_comm,        child->comm        )
                __field(        pid_t,                child_pid        )
        ),

        TP_fast_assign(
                __assign_str(parent_comm);
                __entry->parent_pid        = parent->pid;
                __assign_str(child_comm);
                __entry->child_pid        = child->pid;
        ),

        TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d",
                __get_str(parent_comm), __entry->parent_pid,
                __get_str(child_comm), __entry->child_pid)
);

/*
 * Tracepoint for exec:
 *
 * Records the filename from @bprm, the pid of @p and the @old_pid value
 * supplied by the caller.
 */
TRACE_EVENT(sched_process_exec,

        TP_PROTO(struct task_struct *p, pid_t old_pid,
                 struct linux_binprm *bprm),

        TP_ARGS(p, old_pid, bprm),

        TP_STRUCT__entry(
                __string(        filename,        bprm->filename        )
                __field(        pid_t,                pid                )
                __field(        pid_t,                old_pid                )
        ),

        TP_fast_assign(
                __assign_str(filename);
                __entry->pid                = p->pid;
                __entry->old_pid        = old_pid;
        ),

        TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename),
                  __entry->pid, __entry->old_pid)
);

/**
 * sched_prepare_exec - called before setting up new exec
 * @task:        pointer to the current task
 * @bprm:        pointer to linux_binprm used for new exec
 *
 * Called before flushing the old exec, where @task is still unchanged, but at
 * the point of no return during switching to the new exec. At the point it is
 * called the exec will either succeed, or on failure terminate the task. Also
 * see the "sched_process_exec" tracepoint, which is called right after @task
 * has successfully switched to the new exec.
 *
 * The record stores the interp and filename strings from @bprm and the
 * pid and comm of @task.
 */
TRACE_EVENT(sched_prepare_exec,

        TP_PROTO(struct task_struct *task, struct linux_binprm *bprm),

        TP_ARGS(task, bprm),

        TP_STRUCT__entry(
                __string(        interp,                bprm->interp        )
                __string(        filename,        bprm->filename        )
                __field(        pid_t,                pid                )
                __string(        comm,                task->comm        )
        ),

        TP_fast_assign(
                __assign_str(interp);
                __assign_str(filename);
                __entry->pid = task->pid;
                __assign_str(comm);
        ),

        TP_printk("interp=%s filename=%s pid=%d comm=%s",
                  __get_str(interp), __get_str(filename),
                  __entry->pid, __get_str(comm))
);

/*
 * The sched_stat_* events below are declared through these wrappers.
 * With CONFIG_SCHEDSTATS they are the regular event macros; without it
 * they map to the _NOP variants.
 */
#ifdef CONFIG_SCHEDSTATS
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS
#else
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP
#endif

/*
 * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
 *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
 *
 * Event class shared by sched_stat_wait, sched_stat_sleep,
 * sched_stat_iowait and sched_stat_blocked.  Records the comm and pid of
 * @tsk and the @delay value, which is printed with an "[ns]" suffix.
 */
DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template,

        TP_PROTO(struct task_struct *tsk, u64 delay),

        TP_ARGS(__perf_task(tsk), __perf_count(delay)),

        TP_STRUCT__entry(
                __string( comm,        tsk->comm        )
                __field(  pid_t,        pid        )
                __field(  u64,                delay        )
        ),

        TP_fast_assign(
                __assign_str(comm);
                __entry->pid        = tsk->pid;
                __entry->delay        = delay;
        ),

        TP_printk("comm=%s pid=%d delay=%Lu [ns]",
                        __get_str(comm), __entry->pid,
                        (unsigned long long)__entry->delay)
);

/*
 * Tracepoint for accounting wait time (time the task is runnable
 * but not actually running due to scheduler contention).
 *
 * Uses the sched_stat_template record (comm, pid, delay).
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting sleep time (time the task is not runnable,
 * including iowait, see below).
 *
 * Uses the sched_stat_template record (comm, pid, delay).
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting iowait time (time the task is not runnable
 * due to waiting on IO to complete).
 *
 * Uses the sched_stat_template record (comm, pid, delay).
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting blocked time (time the task is in the
 * uninterruptible state).
 *
 * Uses the sched_stat_template record (comm, pid, delay).
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting runtime (time the task is executing
 * on a CPU).
 *
 * Declared with the plain DECLARE_EVENT_CLASS/DEFINE_EVENT macros rather
 * than the _SCHEDSTAT wrappers, so it does not depend on
 * CONFIG_SCHEDSTATS.  Records the comm and pid of @tsk and the @runtime
 * value, which is printed with an "[ns]" suffix.
 */
DECLARE_EVENT_CLASS(sched_stat_runtime,

        TP_PROTO(struct task_struct *tsk, u64 runtime),

        TP_ARGS(tsk, __perf_count(runtime)),

        TP_STRUCT__entry(
                __string( comm,                tsk->comm        )
                __field(  pid_t,        pid                )
                __field(  u64,                runtime                )
        ),

        TP_fast_assign(
                __assign_str(comm);
                __entry->pid                = tsk->pid;
                __entry->runtime        = runtime;
        ),

        TP_printk("comm=%s pid=%d runtime=%Lu [ns]",
                        __get_str(comm), __entry->pid,
                        (unsigned long long)__entry->runtime)
);

/* The only event of the class above; it shares the class name. */
DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
             TP_PROTO(struct task_struct *tsk, u64 runtime),
             TP_ARGS(tsk, runtime));

/*
 * Tracepoint for showing priority inheritance modifying a task's
 * priority.
 *
 * Records the comm and pid of @tsk, its current prio as oldprio, and a
 * newprio computed here: the minimum of tsk->normal_prio and
 * pi_task->prio when @pi_task is non-NULL, otherwise tsk->normal_prio.
 */
TRACE_EVENT(sched_pi_setprio,

        TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),

        TP_ARGS(tsk, pi_task),

        TP_STRUCT__entry(
                __string( comm,                tsk->comm        )
                __field(  pid_t,        pid                )
                __field(  int,                oldprio                )
                __field(  int,                newprio                )
        ),

        TP_fast_assign(
                __assign_str(comm);
                __entry->pid                = tsk->pid;
                __entry->oldprio        = tsk->prio;
                __entry->newprio        = pi_task ?
                                min(tsk->normal_prio, pi_task->prio) :
                                tsk->normal_prio;
                /* XXX SCHED_DEADLINE bits missing */
        ),

        TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
                        __get_str(comm), __entry->pid,
                        __entry->oldprio, __entry->newprio)
);

#ifdef CONFIG_DETECT_HUNG_TASK
/*
 * Tracepoint that records the comm and pid of task @tsk.  Only built with
 * CONFIG_DETECT_HUNG_TASK.
 * NOTE(review): presumably emitted by the hung-task detector for the task
 * it reports -- confirm at the call site.
 */
TRACE_EVENT(sched_process_hang,
        TP_PROTO(struct task_struct *tsk),
        TP_ARGS(tsk),

        TP_STRUCT__entry(
                __string( comm,                tsk->comm        )
                __field(  pid_t,        pid                )
        ),

        TP_fast_assign(
                __assign_str(comm);
                __entry->pid = tsk->pid;
        ),

        TP_printk("comm=%s pid=%d", __get_str(comm), __entry->pid)
);
#endif /* CONFIG_DETECT_HUNG_TASK */

#ifdef CONFIG_NUMA_BALANCING
/*
 * Tracks migration of tasks from one runqueue to another. Can be used to
 * detect if automatic NUMA balancing is bouncing between nodes.
 *
 * Records the pid, tgid and NUMA group id of @tsk together with the
 * source and destination CPUs and the nodes cpu_to_node() maps them to.
 */
TRACE_EVENT(sched_move_numa,

        TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),

        TP_ARGS(tsk, src_cpu, dst_cpu),

        TP_STRUCT__entry(
                __field( pid_t,        pid                        )
                __field( pid_t,        tgid                        )
                __field( pid_t,        ngid                        )
                __field( int,        src_cpu                        )
                __field( int,        src_nid                        )
                __field( int,        dst_cpu                        )
                __field( int,        dst_nid                        )
        ),

        TP_fast_assign(
                __entry->pid                = task_pid_nr(tsk);
                __entry->tgid                = task_tgid_nr(tsk);
                __entry->ngid                = task_numa_group_id(tsk);
                __entry->src_cpu        = src_cpu;
                __entry->src_nid        = cpu_to_node(src_cpu);
                __entry->dst_cpu        = dst_cpu;
                __entry->dst_nid        = cpu_to_node(dst_cpu);
        ),

        TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d",
                        __entry->pid, __entry->tgid, __entry->ngid,
                        __entry->src_cpu, __entry->src_nid,
                        __entry->dst_cpu, __entry->dst_nid)
);

/*
 * Event class for NUMA events that involve a source and a destination
 * task/CPU pair; used by sched_stick_numa and sched_swap_numa.
 *
 * The destination side is optional: a NULL @dst_tsk is recorded as pid,
 * tgid and ngid 0, and a negative @dst_cpu is recorded with dst_nid -1.
 * The source task and CPU are always dereferenced.
 */
DECLARE_EVENT_CLASS(sched_numa_pair_template,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu),

        TP_STRUCT__entry(
                __field( pid_t,        src_pid                        )
                __field( pid_t,        src_tgid                )
                __field( pid_t,        src_ngid                )
                __field( int,        src_cpu                        )
                __field( int,        src_nid                        )
                __field( pid_t,        dst_pid                        )
                __field( pid_t,        dst_tgid                )
                __field( pid_t,        dst_ngid                )
                __field( int,        dst_cpu                        )
                __field( int,        dst_nid                        )
        ),

        TP_fast_assign(
                __entry->src_pid        = task_pid_nr(src_tsk);
                __entry->src_tgid        = task_tgid_nr(src_tsk);
                __entry->src_ngid        = task_numa_group_id(src_tsk);
                __entry->src_cpu        = src_cpu;
                __entry->src_nid        = cpu_to_node(src_cpu);
                /* The destination task may be absent. */
                __entry->dst_pid        = dst_tsk ? task_pid_nr(dst_tsk) : 0;
                __entry->dst_tgid        = dst_tsk ? task_tgid_nr(dst_tsk) : 0;
                __entry->dst_ngid        = dst_tsk ? task_numa_group_id(dst_tsk) : 0;
                __entry->dst_cpu        = dst_cpu;
                /* A negative dst_cpu has no node to look up. */
                __entry->dst_nid        = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1;
        ),

        TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
                        __entry->src_pid, __entry->src_tgid, __entry->src_ngid,
                        __entry->src_cpu, __entry->src_nid,
                        __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
                        __entry->dst_cpu, __entry->dst_nid)
);

/*
 * sched_stick_numa: uses the sched_numa_pair_template record.
 * NOTE(review): the firing condition is not visible here -- confirm at
 * the call site.
 */
DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
);

/*
 * sched_swap_numa: uses the sched_numa_pair_template record.
 * NOTE(review): the firing condition is not visible here -- confirm at
 * the call site.
 */
DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
);

/*
 * Table of NUMA balancing VMA skip reasons.  Each EM()/EMe() entry pairs
 * an enum value with the string used when printing it.  The table is
 * expanded several times with different definitions of EM/EMe; EMe marks
 * the final entry.
 */
#define NUMAB_SKIP_REASON                                        \
        EM( NUMAB_SKIP_UNSUITABLE,                "unsuitable" )        \
        EM( NUMAB_SKIP_SHARED_RO,                "shared_ro" )        \
        EM( NUMAB_SKIP_INACCESSIBLE,                "inaccessible" )        \
        EM( NUMAB_SKIP_SCAN_DELAY,                "scan_delay" )        \
        EM( NUMAB_SKIP_PID_INACTIVE,                "pid_inactive" )        \
        EM( NUMAB_SKIP_IGNORE_PID,                "ignore_pid_inactive" )                \
        EMe(NUMAB_SKIP_SEQ_COMPLETED,                "seq_completed" )

/*
 * Redefine for export: expanding NUMAB_SKIP_REASON now emits one
 * TRACE_DEFINE_ENUM() per entry (the string argument is ignored).
 */
#undef EM
#undef EMe
#define EM(a, b)        TRACE_DEFINE_ENUM(a);
#define EMe(a, b)        TRACE_DEFINE_ENUM(a);

NUMAB_SKIP_REASON

/*
 * Redefine for symbolic printing: each entry becomes a { value, "name" }
 * initializer.  EM() appends a comma, EMe() (the last entry) does not, so
 * the expansion can be used as the pair list of __print_symbolic().
 */
#undef EM
#undef EMe
#define EM(a, b)        { a, b },
#define EMe(a, b)        { a, b }

/*
 * Tracepoint carrying a VMA skip reason.  It records the mm's
 * numa_scan_offset, the VMA's [vm_start, vm_end) addresses and the reason,
 * which is printed by name through the NUMAB_SKIP_REASON table.
 */
TRACE_EVENT(sched_skip_vma_numa,

        TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
                 enum numa_vmaskip_reason reason),

        TP_ARGS(mm, vma, reason),

        TP_STRUCT__entry(
                __field(unsigned long, numa_scan_offset)
                __field(unsigned long, vm_start)
                __field(unsigned long, vm_end)
                __field(enum numa_vmaskip_reason, reason)
        ),

        TP_fast_assign(
                __entry->numa_scan_offset        = mm->numa_scan_offset;
                __entry->vm_start                = vma->vm_start;
                __entry->vm_end                        = vma->vm_end;
                __entry->reason                        = reason;
        ),

        TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
                  __entry->numa_scan_offset,
                  __entry->vm_start,
                  __entry->vm_end,
                  __print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
);

/*
 * Tracepoint recording a task (comm, pid, tgid, NUMA group id) together
 * with a copy of the node mask passed in @mem_allowed_ptr.
 *
 * The mask is stored as an array of BITS_TO_LONGS(MAX_NUMNODES) longs.
 * The BUILD_BUG_ON() in the assignment section checks at build time that
 * nodemask_t has exactly that size, so the memcpy() of
 * sizeof(__entry->mem_allowed) bytes copies the whole mask.  The mask is
 * printed as a bit list (%*pbl) of MAX_NUMNODES bits.
 */
TRACE_EVENT(sched_skip_cpuset_numa,

        TP_PROTO(struct task_struct *tsk, nodemask_t *mem_allowed_ptr),

        TP_ARGS(tsk, mem_allowed_ptr),

        TP_STRUCT__entry(
                __array( char,                comm,                TASK_COMM_LEN                )
                __field( pid_t,                pid                                        )
                __field( pid_t,                tgid                                        )
                __field( pid_t,                ngid                                        )
                __array( unsigned long, mem_allowed, BITS_TO_LONGS(MAX_NUMNODES))
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid                 = task_pid_nr(tsk);
                __entry->tgid                 = task_tgid_nr(tsk);
                __entry->ngid                 = task_numa_group_id(tsk);
                BUILD_BUG_ON(sizeof(nodemask_t) != \
                             BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long));
                memcpy(__entry->mem_allowed, mem_allowed_ptr->bits,
                       sizeof(__entry->mem_allowed));
        ),

        TP_printk("comm=%s pid=%d tgid=%d ngid=%d mem_nodes_allowed=%*pbl",
                  __entry->comm,
                  __entry->pid,
                  __entry->tgid,
                  __entry->ngid,
                  MAX_NUMNODES, __entry->mem_allowed)
);
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Tracepoint for waking a polling cpu without an IPI.
 *
 * The only recorded field is the cpu number passed by the caller.
 */
TRACE_EVENT(sched_wake_idle_without_ipi,

        TP_PROTO(int cpu),

        TP_ARGS(cpu),

        TP_STRUCT__entry(
                __field(        int,        cpu        )
        ),

        TP_fast_assign(
                __entry->cpu        = cpu;
        ),

        TP_printk("cpu=%d", __entry->cpu)
);

/*
 * The following tracepoints are not exported in tracefs and provide
 * hooking mechanisms only for testing and debugging purposes.
 *
 * They are declared with DECLARE_TRACE() instead of TRACE_EVENT(), so no
 * record layout or print format is defined here; each declaration only
 * fixes the tracepoint name and the arguments handed to attached probes.
 */
DECLARE_TRACE(pelt_cfs,
        TP_PROTO(struct cfs_rq *cfs_rq),
        TP_ARGS(cfs_rq));

DECLARE_TRACE(pelt_rt,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_dl,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_hw,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_irq,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_se,
        TP_PROTO(struct sched_entity *se),
        TP_ARGS(se));

DECLARE_TRACE(sched_cpu_capacity,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(sched_overutilized,
        TP_PROTO(struct root_domain *rd, bool overutilized),
        TP_ARGS(rd, overutilized));

DECLARE_TRACE(sched_util_est_cfs,
        TP_PROTO(struct cfs_rq *cfs_rq),
        TP_ARGS(cfs_rq));

DECLARE_TRACE(sched_util_est_se,
        TP_PROTO(struct sched_entity *se),
        TP_ARGS(se));

DECLARE_TRACE(sched_update_nr_running,
        TP_PROTO(struct rq *rq, int change),
        TP_ARGS(rq, change));

DECLARE_TRACE(sched_compute_energy,
        TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy,
                 unsigned long max_util, unsigned long busy_time),
        TP_ARGS(p, dst_cpu, energy, max_util, busy_time));

DECLARE_TRACE(sched_entry,
        TP_PROTO(bool preempt),
        TP_ARGS(preempt));

DECLARE_TRACE(sched_exit,
        TP_PROTO(bool is_switch),
        TP_ARGS(is_switch));

/*
 * Conditional bare tracepoint: the condition compares the task's current
 * __state and the new @state after reducing both to zero/non-zero, so it
 * only holds when the state changes between zero and non-zero.
 */
DECLARE_TRACE_CONDITION(sched_set_state,
        TP_PROTO(struct task_struct *tsk, int state),
        TP_ARGS(tsk, state),
        TP_CONDITION(!!(tsk->__state) != !!state));

/* Bare tracepoint taking a task, a cpu number and a tif value. */
DECLARE_TRACE(sched_set_need_resched,
        TP_PROTO(struct task_struct *tsk, int cpu, int tif),
        TP_ARGS(tsk, cpu, tif));

#endif /* _TRACE_SCHED_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



















































































































































   40 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMER_H
#define _LINUX_TIMER_H

#include <linux/list.h>
#include <linux/ktime.h>
#include <linux/stddef.h>
#include <linux/debugobjects.h>
#include <linux/stringify.h>
#include <linux/timer_types.h>

#ifdef CONFIG_LOCKDEP
/*
 * NB: because we have to copy the lockdep_map, setting the lockdep_map key
 * (second argument) here is required, otherwise it could be initialised to
 * the copy of the lockdep_map later! The string _kn ("<file>:<line>" when
 * used via __TIMER_INITIALIZER()) is the name of the lockdep_map, and the
 * pointer to that string is its key.
 *
 * Without CONFIG_LOCKDEP the macro expands to nothing.
 */
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)                                \
        .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn),
#else
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
#endif

/*
 * @TIMER_DEFERRABLE: A deferrable timer will work normally when the
 * system is busy, but will not cause a CPU to come out of idle just
 * to service it; instead, the timer will be serviced when the CPU
 * eventually wakes up with a subsequent non-deferrable timer.
 *
 * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and
 * it's safe to wait for the completion of the running instance from
 * IRQ handlers, for example, by calling timer_delete_sync().
 *
 * Note: The irq disabled callback execution is a special case for
 * workqueue locking issues. It's not meant for executing random crap
 * with interrupts disabled. Abuse is monitored!
 *
 * @TIMER_PINNED: A pinned timer will always expire on the CPU on which the
 * timer was enqueued. When a particular CPU is required, add_timer_on()
 * has to be used. Enqueue via mod_timer() and add_timer() is always done
 * on the local CPU.
 *
 * Bit layout of the 32-bit flags word, as given by the values below:
 *   bits  0-17  TIMER_CPUMASK
 *   bit   18    TIMER_MIGRATING
 *   bit   19    TIMER_DEFERRABLE
 *   bit   20    TIMER_PINNED
 *   bit   21    TIMER_IRQSAFE
 *   bits 22-31  TIMER_ARRAYMASK (field starts at TIMER_ARRAYSHIFT)
 *
 * TIMER_INIT_FLAGS collects the three flags described above;
 * TIMER_TRACE_FLAGMASK additionally includes TIMER_MIGRATING.
 */
#define TIMER_CPUMASK                0x0003FFFF
#define TIMER_MIGRATING                0x00040000
#define TIMER_BASEMASK                (TIMER_CPUMASK | TIMER_MIGRATING)
#define TIMER_DEFERRABLE        0x00080000
#define TIMER_PINNED                0x00100000
#define TIMER_IRQSAFE                0x00200000
#define TIMER_INIT_FLAGS        (TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
#define TIMER_ARRAYSHIFT        22
#define TIMER_ARRAYMASK                0xFFC00000

#define TIMER_TRACE_FLAGMASK        (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)

/*
 * Static initializer for a struct timer_list: marks the list entry with
 * TIMER_ENTRY_STATIC, stores the callback and the flags, and (with
 * lockdep) initializes the lockdep map using FILE_LINE as its name.
 */
#define __TIMER_INITIALIZER(_function, _flags) {                \
                .entry = { .next = TIMER_ENTRY_STATIC },        \
                .function = (_function),                        \
                .flags = (_flags),                                \
                __TIMER_LOCKDEP_MAP_INITIALIZER(FILE_LINE)        \
        }

/*
 * Define a struct timer_list variable called _name, statically
 * initialized with callback _function and no flags set.
 */
#define DEFINE_TIMER(_name, _function)                                \
        struct timer_list _name =                                \
                __TIMER_INITIALIZER(_function, 0)

/*
 * LOCKDEP and DEBUG timer interfaces.
 *
 * timer_init_key() takes the timer, its callback, the TIMER_* flags and
 * a lockdep name/key pair.  timer_init_key_on_stack() has the same
 * signature: with CONFIG_DEBUG_OBJECTS_TIMERS it is a separate
 * out-of-line function, otherwise it simply forwards to timer_init_key().
 */
void timer_init_key(struct timer_list *timer,
                    void (*func)(struct timer_list *), unsigned int flags,
                    const char *name, struct lock_class_key *key);

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void timer_init_key_on_stack(struct timer_list *timer,
                                    void (*func)(struct timer_list *),
                                    unsigned int flags, const char *name,
                                    struct lock_class_key *key);
#else
static inline void timer_init_key_on_stack(struct timer_list *timer,
                                           void (*func)(struct timer_list *),
                                           unsigned int flags,
                                           const char *name,
                                           struct lock_class_key *key)
{
        /* No debug-objects tracking: same as the regular initializer. */
        timer_init_key(timer, func, flags, name, key);
}
#endif

/*
 * Internal helpers behind timer_setup() and timer_setup_on_stack().
 *
 * With CONFIG_LOCKDEP every expansion declares its own static
 * lock_class_key and passes it, together with the stringified _timer
 * argument as the name, to the init function.  Without lockdep both the
 * name and the key are NULL.
 */
#ifdef CONFIG_LOCKDEP
#define __timer_init(_timer, _fn, _flags)                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                timer_init_key((_timer), (_fn), (_flags), #_timer, &__key);\
        } while (0)

#define __timer_init_on_stack(_timer, _fn, _flags)                        \
        do {                                                                \
                static struct lock_class_key __key;                        \
                timer_init_key_on_stack((_timer), (_fn), (_flags),        \
                                        #_timer, &__key);                 \
        } while (0)
#else
#define __timer_init(_timer, _fn, _flags)                                \
        timer_init_key((_timer), (_fn), (_flags), NULL, NULL)
#define __timer_init_on_stack(_timer, _fn, _flags)                        \
        timer_init_key_on_stack((_timer), (_fn), (_flags), NULL, NULL)
#endif

/**
 * timer_setup - prepare a timer for first use
 * @timer: the timer in question
 * @callback: the function to call when timer expires
 * @flags: any TIMER_* flags
 *
 * Regular timer initialization should use either DEFINE_TIMER() above,
 * or timer_setup(). For timers on the stack, timer_setup_on_stack() must
 * be used and must be balanced with a call to timer_destroy_on_stack().
 */
#define timer_setup(timer, callback, flags)                        \
        __timer_init((timer), (callback), (flags))

/*
 * Same arguments as timer_setup(), for a timer that lives on the stack.
 * Must be balanced with a call to timer_destroy_on_stack().
 */
#define timer_setup_on_stack(timer, callback, flags)                \
        __timer_init_on_stack((timer), (callback), (flags))

/*
 * Counterpart of timer_setup_on_stack().  Only does work when
 * CONFIG_DEBUG_OBJECTS_TIMERS is enabled; otherwise it is an empty
 * inline function.
 */
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void timer_destroy_on_stack(struct timer_list *timer);
#else
static inline void timer_destroy_on_stack(struct timer_list *timer) { }
#endif

/*
 * container_of() wrapper: given the timer pointer @callback_timer, return
 * the enclosing object of type typeof(*var) in which the timer is the
 * member named @timer_fieldname.  @var is used only for its type.
 */
#define timer_container_of(var, callback_timer, timer_fieldname)        \
        container_of(callback_timer, typeof(*var), timer_fieldname)

/**
 * timer_pending - is a timer pending?
 * @timer: the timer in question
 *
 * Reports whether @timer is currently pending by checking, without
 * taking any lock, whether its list entry is hashed.  Callers must
 * ensure serialization wrt. other operations done to this timer, eg.
 * interrupt contexts, or other CPUs on SMP.
 *
 * Returns: 1 if the timer is pending, 0 if not.
 */
static inline int timer_pending(const struct timer_list * timer)
{
        const struct hlist_node *node = &timer->entry;

        return hlist_unhashed_lockless(node) ? 0 : 1;
}

extern void add_timer_on(struct timer_list *timer, int cpu);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
extern int timer_reduce(struct timer_list *timer, unsigned long expires);

/*
 * The jiffies value which is added to now, when there is no timer
 * in the timer wheel (2^30 - 1):
 */
#define TIMER_NEXT_MAX_DELTA        ((1UL << 30) - 1)

extern void add_timer(struct timer_list *timer);
extern void add_timer_local(struct timer_list *timer);
extern void add_timer_global(struct timer_list *timer);

extern int timer_delete_sync_try(struct timer_list *timer);
extern int timer_delete_sync(struct timer_list *timer);
extern int timer_delete(struct timer_list *timer);
extern int timer_shutdown_sync(struct timer_list *timer);
extern int timer_shutdown(struct timer_list *timer);

extern void timers_init(void);
struct hrtimer;
extern enum hrtimer_restart it_real_fn(struct hrtimer *);

unsigned long __round_jiffies_relative(unsigned long j, int cpu);
unsigned long round_jiffies(unsigned long j);
unsigned long round_jiffies_relative(unsigned long j);

unsigned long __round_jiffies_up_relative(unsigned long j, int cpu);
unsigned long round_jiffies_up(unsigned long j);
unsigned long round_jiffies_up_relative(unsigned long j);

/*
 * Per-CPU timer callbacks taking a cpu number.  Without
 * CONFIG_HOTPLUG_CPU both names are defined as NULL, so they can still be
 * used where a callback pointer is expected.
 */
#ifdef CONFIG_HOTPLUG_CPU
int timers_prepare_cpu(unsigned int cpu);
int timers_dead_cpu(unsigned int cpu);
#else
#define timers_prepare_cpu        NULL
#define timers_dead_cpu                NULL
#endif

#endif





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SWITCH_TO_H
#define _ASM_X86_SWITCH_TO_H

#include <linux/sched/task_stack.h>

struct task_struct; /* one of the stranger aspects of C forward declarations */

struct task_struct *__switch_to_asm(struct task_struct *prev,
                                    struct task_struct *next);

__visible struct task_struct *__switch_to(struct task_struct *prev,
                                          struct task_struct *next);

asmlinkage void ret_from_fork_asm(void);
__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
                             int (*fn)(void *), void *fn_arg);

/*
 * This is the structure pointed to by thread.sp for an inactive task.  The
 * order of the fields must match the code in __switch_to_asm().
 *
 * The leading members differ by configuration: r15..r12 on CONFIG_X86_64,
 * flags/si/di otherwise.  bx, bp and ret_addr are common to both.
 */
struct inactive_task_frame {
#ifdef CONFIG_X86_64
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
#else
        unsigned long flags;
        unsigned long si;
        unsigned long di;
#endif
        unsigned long bx;

        /*
         * These two fields must be together.  They form a stack frame header,
         * needed by get_frame_pointer().
         */
        unsigned long bp;
        unsigned long ret_addr;
};

/*
 * An inactive_task_frame immediately followed by a struct pt_regs.
 * NOTE(review): presumably the initial stack layout of a newly forked
 * task - confirm against the users of this struct.
 */
struct fork_frame {
        struct inactive_task_frame frame;
        struct pt_regs regs;
};

/*
 * Call __switch_to_asm(prev, next) and store the task pointer it returns
 * in @last.
 */
#define switch_to(prev, next, last)                                        \
do {                                                                        \
        ((last) = __switch_to_asm((prev), (next)));                        \
} while (0)

#ifdef CONFIG_X86_32
#include <asm/msr.h>

/*
 * Bring the per-CPU TSS ss1 slot and the SYSENTER_CS MSR in line with
 * @thread->sysenter_cs.  Nothing is written when the cached ss1 value
 * already equals it.
 */
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
        /* Only happens when SEP is enabled, no need to test "SEP"arately: */
        if (likely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) != thread->sysenter_cs)) {
                this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
                wrmsrq(MSR_IA32_SYSENTER_CS, thread->sysenter_cs);
        }
}
#endif

/*
 * Used when switching tasks or entering/exiting vm86 mode.
 *
 * sp0 always points to the entry trampoline stack, which is constant, so
 * it is not rewritten here except in the Xen PV case below.
 */
static inline void update_task_stack(struct task_struct *task)
{
#ifdef CONFIG_X86_32
        /* 32-bit: publish the task's sp0 value in the TSS sp1 slot. */
        this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
        if (cpu_feature_enabled(X86_FEATURE_FRED))
                return;

        if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
                /* Xen PV enters the kernel on the thread stack. */
                load_sp0(task_top_of_stack(task));
        }
#endif
}

/*
 * Store a thread function and its argument in an inactive task frame:
 * @fun goes into the bx slot, @arg into di (CONFIG_X86_32) or r12
 * (otherwise).
 */
static inline void kthread_frame_init(struct inactive_task_frame *frame,
                                      int (*fun)(void *), void *arg)
{
        const unsigned long fn_val = (unsigned long)fun;
        const unsigned long arg_val = (unsigned long)arg;

#ifdef CONFIG_X86_32
        frame->di = arg_val;
#else
        frame->r12 = arg_val;
#endif
        frame->bx = fn_val;
}

#endif /* _ASM_X86_SWITCH_TO_H */



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __IEEE802154_CORE_H
#define __IEEE802154_CORE_H

#include <net/cfg802154.h>

/*
 * Bookkeeping for one registered device.  The public struct wpan_phy is
 * embedded as the last member; wpan_phy_to_rdev() maps a wpan_phy pointer
 * back to this structure with container_of().
 */
struct cfg802154_registered_device {
        const struct cfg802154_ops *ops;
        struct list_head list;

        /* wpan_phy index, internal only */
        int wpan_phy_idx;

        /* also protected by devlist_mtx */
        int opencount;
        wait_queue_head_t dev_wait;

        /* protected by RTNL only */
        int num_running_ifaces;

        /* associated wpan interfaces, protected by rtnl or RCU */
        struct list_head wpan_dev_list;
        int devlist_generation, wpan_dev_id;

        /* must be last because of the way we do wpan_phy_priv(),
         * and it should at least be aligned to NETDEV_ALIGN
         */
        struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN);
};

/*
 * Map an embedded wpan_phy back to the cfg802154_registered_device that
 * contains it.  A NULL @wpan_phy triggers BUG_ON().
 */
static inline struct cfg802154_registered_device *
wpan_phy_to_rdev(struct wpan_phy *wpan_phy)
{
        struct cfg802154_registered_device *rdev;

        BUG_ON(!wpan_phy);

        rdev = container_of(wpan_phy, struct cfg802154_registered_device,
                            wpan_phy);
        return rdev;
}

extern struct list_head cfg802154_rdev_list;
extern int cfg802154_rdev_list_generation;

int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
                           struct net *net);
/* free object */
void cfg802154_dev_free(struct cfg802154_registered_device *rdev);
struct cfg802154_registered_device *
cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx);
struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx);

#endif /* __IEEE802154_CORE_H */























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * descriptor table internals; you almost certainly want file.h instead.
 */

#ifndef __LINUX_FDTABLE_H
#define __LINUX_FDTABLE_H

#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/nospec.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>

#include <linux/atomic.h>

/*
 * The default fd array needs to be at least BITS_PER_LONG,
 * as this is the granularity returned by copy_fdset().
 */
#define NR_OPEN_DEFAULT BITS_PER_LONG

/*
 * One descriptor table: an RCU-protected array of file pointers plus
 * per-descriptor bitmaps.
 */
struct fdtable {
        unsigned int max_fds;        /* bound used when indexing fd[] */
        struct file __rcu **fd;      /* current fd array */
        unsigned long *close_on_exec;        /* bitmap tested by close_on_exec() */
        unsigned long *open_fds;        /* NOTE(review): presumably bitmap of open fds */
        unsigned long *full_fds_bits;        /* NOTE(review): meaning not visible here */
        struct rcu_head rcu;
};

/*
 * Open file table structure
 *
 * @fdt is the RCU-protected pointer to the table currently in use; @fdtab
 * is a table embedded in this structure.  The *_init bitmaps and fd_array
 * are embedded storage sized for NR_OPEN_DEFAULT descriptors.
 */
struct files_struct {
  /*
   * read mostly part
   */
        atomic_t count;
        bool resize_in_progress;
        wait_queue_head_t resize_wait;

        struct fdtable __rcu *fdt;
        struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
        spinlock_t file_lock ____cacheline_aligned_in_smp;
        unsigned int next_fd;
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
        struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct file_operations;
struct vfsmount;
struct dentry;

/*
 * Dereference the RCU-protected pointer @fdtfd; for lockdep purposes the
 * access is also considered legal when @files->file_lock is held.
 */
#define rcu_dereference_check_fdtable(files, fdtfd) \
        rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))

/* Fetch the current fdtable of @files under the rule described above. */
#define files_fdtable(files) \
        rcu_dereference_check_fdtable((files), (files)->fdt)

/*
 * Look up the file pointer stored for @fd in @files without any lockdep
 * checking.  Returns NULL when @fd is out of bounds (>= max_fds).
 *
 * The caller must ensure that fd table isn't shared or hold rcu or file lock
 */
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds);
        struct file *needs_masking;

        /*
         * 'mask' is zero for an out-of-bounds fd, all ones for ok.
         * 'fd&mask' is 'fd' for ok, or 0 for out of bounds.
         *
         * Accessing fdt->fd[0] is ok, but needs masking of the result.
         */
        needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]);
        /* Out-of-bounds: the slot-0 value read above is masked to NULL. */
        return (struct file *)(mask & (unsigned long)needs_masking);
}

/*
 * Variant of files_lookup_fd_raw() for callers holding files->file_lock;
 * emits a lockdep warning when the lock is not held.
 */
static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
{
        struct file *file;

        RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
                           "suspicious rcu_dereference_check() usage");

        file = files_lookup_fd_raw(files, fd);
        return file;
}

static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
{
        return test_bit(fd, files_fdtable(files)->close_on_exec);
}

struct task_struct;

void put_files_struct(struct files_struct *fs);
int unshare_files(void);
/*
 * A pair of descriptor numbers, passed by pointer to dup_fd().
 * NOTE(review): whether the bounds are inclusive is not visible here.
 */
struct fd_range {
        unsigned int from, to;
};
struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
                int (*)(const void *, struct file *, unsigned),
                const void *);

extern int close_fd(unsigned int fd);
extern struct file *file_close_fd(unsigned int fd);

extern struct kmem_cache *files_cachep;

#endif /* __LINUX_FDTABLE_H */














































































































  268 





  319 





  304 




































  316 

  304 






  301 


















    7 





































































































  319 













  303 




























  305 





















  304 
  302 





  319 
  313 













  318 







  305 























  301 



  305 




















































































































































































   14 








  319 

  319 
  319 

  319 
   25 
  319 







  307 




  308 
  308 






















  319 

  319 






















  316 
  319 
  318 


















































  305 
  301 
    1 
  303 

  305 


  305 

  302 

  305 

  296 


  304 



  198 


  198 


























  197 



  303 






  305 










  305 

    7 
  302 



  305 

  303 


    6 



























































































































































  300 

  305 
  304 


  301 

















    6 

    6 






















    4 














































































    4 




























    4 




































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <linux/bit_spinlock.h>

#include <linux/rhashtable-types.h>
/*
 * Objects in an rhashtable have an embedded struct rhash_head
 * which is linked into a hash chain from the hash table - or one
 * of two or more hash tables when the rhashtable is being resized.
 * The end of the chain is marked with a special nulls mark which has
 * the least significant bit set but otherwise stores the address of
 * the hash bucket.  This allows us to be sure we've found the end
 * of the right list.
 * The value stored in the hash bucket has BIT(0) used as a lock bit.
 * This bit must be atomically set before any changes are made to
 * the chain.  To avoid dereferencing this pointer without clearing
 * the bit first, we use an opaque 'struct rhash_lock_head *' for the
 * pointer stored in the bucket.  This struct needs to be defined so
 * that rcu_dereference() works on it, but it has no content so a
 * cast is needed for it to be useful.  This ensures it isn't
 * used by mistake without clearing the lock bit first.
 */
struct rhash_lock_head {};

/* Maximum chain length before rehash
 *
 * The maximum (not average) chain length grows with the size of the hash
 * table, at a rate of (log N)/(log log N).
 *
 * The value of 16 is selected so that even if the hash table grew to
 * 2^32 you would not expect the maximum chain length to exceed it
 * unless we are under attack (or extremely unlucky).
 *
 * As this limit is only to detect attacks, we don't need to set it to a
 * lower value as you'd need the chain length to vastly exceed 16 to have
 * any real effect on the system.
 *
 * __rhashtable_insert_fast() starts a counter at this value, decrements it
 * for every chain entry it walks, and hands the insertion over to
 * rhashtable_insert_slow() once the counter has dropped to zero or below.
 */
#define RHT_ELASTICITY        16u

/**
 * struct bucket_table - Table of hash buckets
 * @size: Number of hash buckets
 * @nest: Number of bits of first-level nested table.
 * @hash_rnd: Random seed to fold into hash
 * @walkers: List of active walkers
 * @rcu: RCU structure for freeing the table
 * @future_tbl: Table under construction during rehashing
 * @dep_map: lockdep map acquired and released around the per-bucket bit lock
 * @buckets: size * hash buckets
 */
struct bucket_table {
        unsigned int                size;
        unsigned int                nest;
        u32                        hash_rnd;
        struct list_head        walkers;
        struct rcu_head                rcu;

        struct bucket_table __rcu *future_tbl;

        struct lockdep_map        dep_map;

        struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};

/*
 * NULLS_MARKER() expects a hash value with the low
 * bits most likely to be significant, and it discards
 * the msb.
 * We give it an address, in which the bottom bit is
 * always 0, and the msb might be significant.
 * So we shift the address down one bit to align with
 * expectations and avoid losing a significant bit.
 *
 * We never store the NULLS_MARKER in the hash table
 * itself as we need the lsb for locking.
 * Instead we store a NULL; __rht_ptr() turns that NULL back into
 * RHT_NULLS_MARKER(bucket) when the bucket is read.
 */
#define        RHT_NULLS_MARKER(ptr)        \
        ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
#define INIT_RHT_NULLS_HEAD(ptr)        \
        ((ptr) = NULL)

/* True when @ptr is an end-of-chain nulls marker, i.e. has bit 0 set. */
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
        const unsigned long raw = (unsigned long)ptr;

        return (raw & 1UL) != 0;
}

/*
 * Convert a pointer to the embedded rhash_head back into a pointer to the
 * start of the containing object by subtracting the configured head offset.
 */
static inline void *rht_obj(const struct rhashtable *ht,
                            const struct rhash_head *he)
{
        char *head_addr = (char *)he;

        return head_addr - ht->p.head_offset;
}

/* Reduce a full hash value to a bucket index by masking with (size - 1). */
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
                                            unsigned int hash)
{
        const unsigned int mask = tbl->size - 1;

        return hash & mask;
}

/*
 * Hash @key with seed @hash_rnd.
 *
 * Three cases, picked from what is known about params.key_len:
 *  - not a compile-time constant: use the table's own hashfn and key_len;
 *  - constant and non-zero: use params.hashfn if given, otherwise jhash2()
 *    for lengths that are a multiple of sizeof(u32) and jhash() for the rest;
 *  - constant zero: take the length from ht->p.key_len and use
 *    params.hashfn if given, otherwise jhash().
 */
static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht,
        const void *key, const struct rhashtable_params params,
        unsigned int hash_rnd)
{
        unsigned int len;

        /* params must be equal to ht->p if it isn't constant. */
        if (!__builtin_constant_p(params.key_len))
                return ht->p.hashfn(key, ht->key_len, hash_rnd);

        if (!params.key_len) {
                len = ht->p.key_len;

                if (params.hashfn)
                        return params.hashfn(key, len, hash_rnd);

                return jhash(key, len, hash_rnd);
        }

        len = params.key_len;

        if (params.hashfn)
                return params.hashfn(key, len, hash_rnd);

        if (len & (sizeof(u32) - 1))
                return jhash(key, len, hash_rnd);

        return jhash2(key, len / sizeof(u32), hash_rnd);
}

/* Hash @key with @tbl's seed and reduce the result to a bucket index. */
static __always_inline unsigned int rht_key_hashfn(
        struct rhashtable *ht, const struct bucket_table *tbl,
        const void *key, const struct rhashtable_params params)
{
        return rht_bucket_index(tbl,
                                rht_key_get_hash(ht, key, params,
                                                 tbl->hash_rnd));
}

/*
 * Compute the bucket index of the object that embeds @he.
 *
 * With an obj_hashfn the whole object is hashed (length params.key_len, or
 * ht->p.key_len when that is zero); otherwise the key found at
 * params.key_offset inside the object is hashed via rht_key_hashfn().
 */
static __always_inline unsigned int rht_head_hashfn(
        struct rhashtable *ht, const struct bucket_table *tbl,
        const struct rhash_head *he, const struct rhashtable_params params)
{
        const char *obj = rht_obj(ht, he);
        unsigned int len;

        if (unlikely(!params.obj_hashfn))
                return rht_key_hashfn(ht, tbl, obj + params.key_offset,
                                      params);

        len = params.key_len ?: ht->p.key_len;

        return rht_bucket_index(tbl,
                                params.obj_hashfn(obj, len, tbl->hash_rnd));
}

/**
 * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Also requires that the table may still grow: either no max_size is
 * configured or the table is smaller than it.
 */
static inline bool rht_grow_above_75(const struct rhashtable *ht,
                                     const struct bucket_table *tbl)
{
        const unsigned int threshold = tbl->size / 4 * 3;

        /* Not yet past 75% load: nothing to do. */
        if (atomic_read(&ht->nelems) <= threshold)
                return false;

        return !ht->p.max_size || tbl->size < ht->p.max_size;
}

/**
 * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Also requires that the table is still larger than the configured
 * min_size.
 */
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
                                       const struct bucket_table *tbl)
{
        const unsigned int threshold = tbl->size * 3 / 10;

        /* Still at or above 30% load: keep the current size. */
        if (atomic_read(&ht->nelems) >= threshold)
                return false;

        return tbl->size > ht->p.min_size;
}

/**
 * rht_grow_above_100 - returns true if nelems > table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Also requires that the table may still grow: either no max_size is
 * configured or the table is smaller than it.
 */
static inline bool rht_grow_above_100(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
{
        /* Fewer elements than buckets (or exactly as many): no growth. */
        if (atomic_read(&ht->nelems) <= tbl->size)
                return false;

        return !ht->p.max_size || tbl->size < ht->p.max_size;
}

/**
 * rht_grow_above_max - returns true if table is above maximum
 * @ht:                hash table
 * @tbl:        current table (not examined by this helper)
 *
 * True once the element count has reached ht->max_elems; note that the
 * comparison is ">=", so the limit itself already counts as "above".
 */
static inline bool rht_grow_above_max(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
{
        return atomic_read(&ht->nelems) >= ht->max_elems;
}

/*
 * Lock-held predicates used as the condition argument of the
 * rht_dereference*() macros below.
 */
#ifdef CONFIG_PROVE_LOCKING
/* Only declared here; the definitions live outside this header. */
int lockdep_rht_mutex_is_held(struct rhashtable *ht);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
#else
/* Without CONFIG_PROVE_LOCKING every query reports the lock as held. */
static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
        return 1;
}

static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
                                             u32 hash)
{
        return 1;
}
#endif /* CONFIG_PROVE_LOCKING */

/*
 * Out-of-line insertion path.  __rhashtable_insert_fast() falls back to it
 * when a future table exists, the walked chain exhausted RHT_ELASTICITY, or
 * the table holds more elements than buckets.
 */
void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
                             struct rhash_head *obj);

/* Table walker interface; only declared in this header. */
void rhashtable_walk_enter(struct rhashtable *ht,
                           struct rhashtable_iter *iter);
void rhashtable_walk_exit(struct rhashtable_iter *iter);
/* Annotated as acquiring RCU; pair with rhashtable_walk_stop(). */
int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);

/*
 * Start a walk while deliberately discarding the status returned by
 * rhashtable_walk_start_check(); call that function directly when the
 * return value matters.
 */
static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
{
        (void)rhashtable_walk_start_check(iter);
}

/* Remaining walker entry points; rhashtable_walk_stop() releases RCU. */
void *rhashtable_walk_next(struct rhashtable_iter *iter);
void *rhashtable_walk_peek(struct rhashtable_iter *iter);
void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);

/* Table teardown; @free_fn receives each element pointer together with @arg. */
void rhashtable_free_and_destroy(struct rhashtable *ht,
                                 void (*free_fn)(void *ptr, void *arg),
                                 void *arg);
void rhashtable_destroy(struct rhashtable *ht);

/*
 * Bucket accessors for nested tables.  rht_bucket(), rht_bucket_var() and
 * rht_bucket_insert() below dispatch to these when tbl->nest is non-zero.
 */
struct rhash_lock_head __rcu **rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **__rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **rht_bucket_nested_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);

/*
 * RCU dereference helpers.  Each one passes a lockdep predicate as the
 * condition argument of the underlying rcu_dereference_*() primitive.
 */

/* Update-side dereference, justified by lockdep_rht_mutex_is_held(ht). */
#define rht_dereference(p, ht) \
        rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))

/* Reader-side dereference that additionally accepts the table mutex. */
#define rht_dereference_rcu(p, ht) \
        rcu_dereference_all_check(p, lockdep_rht_mutex_is_held(ht))

/* Update-side dereference, justified by the lock of bucket @hash in @tbl. */
#define rht_dereference_bucket(p, tbl, hash) \
        rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash))

/* Reader-side dereference that additionally accepts the bucket lock. */
#define rht_dereference_bucket_rcu(p, tbl, hash) \
        rcu_dereference_all_check(p, lockdep_rht_bucket_is_held(tbl, hash))

/*
 * Set @tpos to the object containing @pos.  Always evaluates to 1 so that
 * it can be chained with && inside a loop condition.
 */
#define rht_entry(tpos, pos, member) \
        ({ tpos = container_of(pos, typeof(*tpos), member); 1; })

/*
 * Read-only accessor for bucket @hash of @tbl.  Nested tables are resolved
 * through rht_bucket_nested(); flat tables index buckets[] directly.
 */
static inline struct rhash_lock_head __rcu *const *rht_bucket(
        const struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return rht_bucket_nested(tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * Writable accessor for bucket @hash of @tbl.  Nested tables are resolved
 * through __rht_bucket_nested(); flat tables index buckets[] directly.
 */
static inline struct rhash_lock_head __rcu **rht_bucket_var(
        struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return __rht_bucket_nested(tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * Writable accessor used on the insert path.  Nested tables are resolved
 * through rht_bucket_nested_insert(); callers check the result for NULL.
 */
static inline struct rhash_lock_head __rcu **rht_bucket_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return rht_bucket_nested_insert(ht, tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * We lock a bucket by setting BIT(0) in the pointer - this is always
 * zero in real pointers.  The NULLS mark is never stored in the bucket,
 * rather we store NULL if the bucket is empty.
 * bit_spin_locks do not handle contention well, but the whole point
 * of the hashtable design is to achieve minimum per-bucket contention.
 * A nested hash table might not have a bucket pointer.  In that case
 * we cannot get a lock.  For remove and replace the bucket cannot be
 * interesting and doesn't need locking.
 * For insert we allocate the bucket if this is the last bucket_table,
 * and then take the lock.
 * Sometimes we unlock a bucket by writing a new pointer there.  In that
 * case we don't need to unlock, but we do need to reset state such as
 * preemption and the saved IRQ flags.  For that we have rht_assign_unlock().
 * As rcu_assign_pointer() provides the same release semantics that
 * bit_spin_unlock() provides, this is safe.
 * When we write to a bucket without unlocking, we use rht_assign_locked().
 */

/*
 * Lock bucket @bkt of @tbl.
 *
 * Saves and disables local interrupts, takes the bit spinlock on bit 0 of
 * the bucket pointer, then records the acquisition in tbl->dep_map.
 * Returns the saved IRQ flags, which the caller must hand back to
 * rht_unlock() or rht_assign_unlock().
 */
static inline unsigned long rht_lock(struct bucket_table *tbl,
                                     struct rhash_lock_head __rcu **bkt)
{
        unsigned long flags;

        local_irq_save(flags);
        bit_spin_lock(0, (unsigned long *)bkt);
        lock_map_acquire(&tbl->dep_map);
        return flags;
}

/*
 * Same as rht_lock(), but the lockdep acquisition is recorded with an
 * explicit @subclass via lock_acquire_exclusive().
 * Returns the saved IRQ flags for the matching unlock.
 */
static inline unsigned long rht_lock_nested(struct bucket_table *tbl,
                                        struct rhash_lock_head __rcu **bucket,
                                        unsigned int subclass)
{
        unsigned long flags;

        local_irq_save(flags);
        bit_spin_lock(0, (unsigned long *)bucket);
        lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
        return flags;
}

/*
 * Unlock bucket @bkt of @tbl, undoing rht_lock() in reverse order: drop the
 * lockdep record, release the bit spinlock, then restore the IRQ state
 * saved in @flags.
 */
static inline void rht_unlock(struct bucket_table *tbl,
                              struct rhash_lock_head __rcu **bkt,
                              unsigned long flags)
{
        lock_map_release(&tbl->dep_map);
        bit_spin_unlock(0, (unsigned long *)bkt);
        local_irq_restore(flags);
}

/*
 * Turn the raw bucket value @p into a chain head: strip the lock bit and,
 * if nothing is left (empty bucket), substitute the nulls marker for @bkt.
 */
static inline struct rhash_head *__rht_ptr(
        struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
{
        unsigned long head = (unsigned long)p & ~BIT(0);

        if (!head)
                head = (unsigned long)RHT_NULLS_MARKER(bkt);

        return (struct rhash_head *)head;
}

/*
 * Where 'bkt' is a bucket and might be locked:
 *   rht_ptr_rcu() dereferences that pointer and clears the lock bit.
 *   rht_ptr() dereferences in a context where the bucket is locked.
 *   rht_ptr_exclusive() dereferences in a context where exclusive
 *            access is guaranteed, such as when destroying the table.
 */
/* Fetch the chain head of @bkt on the RCU read side, lock bit cleared. */
static inline struct rhash_head *rht_ptr_rcu(
        struct rhash_lock_head __rcu *const *bkt)
{
        struct rhash_lock_head *raw = rcu_dereference_all(*bkt);

        return __rht_ptr(raw, bkt);
}

/*
 * Fetch the chain head of @bkt in a context where bucket @hash of @tbl is
 * locked (checked through rht_dereference_bucket()), lock bit cleared.
 */
static inline struct rhash_head *rht_ptr(
        struct rhash_lock_head __rcu *const *bkt,
        struct bucket_table *tbl,
        unsigned int hash)
{
        struct rhash_lock_head *raw = rht_dereference_bucket(*bkt, tbl, hash);

        return __rht_ptr(raw, bkt);
}

/*
 * Fetch the chain head of @bkt when the caller has exclusive access; the
 * dereference is unconditionally marked as protected.
 */
static inline struct rhash_head *rht_ptr_exclusive(
        struct rhash_lock_head __rcu *const *bkt)
{
        struct rhash_lock_head *raw = rcu_dereference_protected(*bkt, 1);

        return __rht_ptr(raw, bkt);
}

/*
 * Publish @obj as the new head of @bkt while keeping the bucket locked:
 * the stored value always has BIT(0) set.  A nulls marker is stored as a
 * NULL address (plus the lock bit) rather than as the marker itself.
 */
static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
                                     struct rhash_head *obj)
{
        unsigned long head = rht_is_a_nulls(obj) ? 0UL : (unsigned long)obj;

        rcu_assign_pointer(*bkt, (void *)(head | BIT(0)));
}

/*
 * Publish @obj as the new head of @bkt and unlock the bucket in one step.
 *
 * A nulls marker is stored as NULL.  Because the stored pointer has bit 0
 * clear, the assignment itself drops the bit lock, so bit_spin_unlock() is
 * not called.  The remaining steps restore the state rht_lock() changed:
 * the lockdep record is dropped first and the IRQ flags are restored last.
 */
static inline void rht_assign_unlock(struct bucket_table *tbl,
                                     struct rhash_lock_head __rcu **bkt,
                                     struct rhash_head *obj,
                                     unsigned long flags)
{
        if (rht_is_a_nulls(obj))
                obj = NULL;
        lock_map_release(&tbl->dep_map);
        rcu_assign_pointer(*bkt, (void *)obj);
        /*
         * NOTE(review): presumably balances the preemption disable and the
         * sparse __acquire() done inside bit_spin_lock() - confirm against
         * <linux/bit_spinlock.h>.
         */
        preempt_enable();
        __release(bitlock);
        local_irq_restore(flags);
}

/**
 * rht_for_each_from - iterate over hash chain from given head
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * The walk stops at the first nulls marker.  Each ->next is read through
 * rht_dereference_bucket(), i.e. it is justified by the lock of bucket
 * @hash in @tbl rather than by an RCU read-side section.
 */
#define rht_for_each_from(pos, head, tbl, hash) \
        for (pos = head;                        \
             !rht_is_a_nulls(pos);                \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))

/**
 * rht_for_each - iterate over hash chain
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * Same as rht_for_each_from(), starting at the head of bucket @hash as
 * returned by rht_ptr().
 */
#define rht_for_each(pos, tbl, hash) \
        rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash),  \
                          tbl, hash)

/**
 * rht_for_each_entry_from - iterate over hash chain from given head
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * @tpos is refreshed from @pos by rht_entry() in the loop condition, so it
 * is only assigned for entries that are not the nulls marker.
 */
#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)        \
        for (pos = head;                                                \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);        \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))

/**
 * rht_for_each_entry - iterate over hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * Same as rht_for_each_entry_from(), starting at the head of bucket @hash
 * as returned by rht_ptr().
 */
#define rht_for_each_entry(tpos, pos, tbl, hash, member)                \
        rht_for_each_entry_from(tpos, pos,                                \
                                rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
                                tbl, hash, member)

/**
 * rht_for_each_entry_safe - safely iterate over hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @nxt:        the &struct rhash_head to use as next in loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive allows for the looped code to
 * remove the loop cursor from the list.
 *
 * The lookahead parameter is deliberately not called "next": a macro
 * parameter of that name is also substituted into the ->next member access,
 * so the macro would only compile when the caller's variable happened to be
 * named "next".  @pos is parenthesized before ->next so that any lvalue
 * expression works as the cursor.
 */
#define rht_for_each_entry_safe(tpos, pos, nxt, tbl, hash, member)              \
        for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash),                      \
             nxt = !rht_is_a_nulls(pos) ?                                      \
                       rht_dereference_bucket((pos)->next, tbl, hash) : NULL; \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);              \
             pos = nxt,                                                              \
             nxt = !rht_is_a_nulls(pos) ?                                      \
                       rht_dereference_bucket((pos)->next, tbl, hash) : NULL)

/**
 * rht_for_each_rcu_from - iterate over rcu hash chain from given head
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table (not used by the expansion)
 * @hash:        the hash value / bucket index (not used by the expansion)
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * @pos is parenthesized before ->next so that any lvalue expression can
 * serve as the cursor, matching rht_for_each_from().
 */
#define rht_for_each_rcu_from(pos, head, tbl, hash)                        \
        for (({barrier(); }),                                                \
             pos = head;                                                \
             !rht_is_a_nulls(pos);                                        \
             pos = rcu_dereference_all((pos)->next))

/**
 * rht_for_each_rcu - iterate over rcu hash chain
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * The walk starts at the head of bucket @hash as returned by rht_ptr_rcu().
 * @pos is parenthesized before ->next so that any lvalue expression can
 * serve as the cursor, matching rht_for_each_from().
 */
#define rht_for_each_rcu(pos, tbl, hash)                        \
        for (({barrier(); }),                                        \
             pos = rht_ptr_rcu(rht_bucket(tbl, hash));                \
             !rht_is_a_nulls(pos);                                \
             pos = rcu_dereference_all((pos)->next))

/**
 * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * @pos is parenthesized before ->next so that any lvalue expression can
 * serve as the cursor, matching rht_for_each_entry_from().
 */
#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
        for (({barrier(); }),                                                    \
             pos = head;                                                    \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);            \
             pos = rht_dereference_bucket_rcu((pos)->next, tbl, hash))

/**
 * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * Same as rht_for_each_entry_rcu_from(), starting at the head of bucket
 * @hash as returned by rht_ptr_rcu().
 */
#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)                   \
        rht_for_each_entry_rcu_from(tpos, pos,                                   \
                                    rht_ptr_rcu(rht_bucket(tbl, hash)),           \
                                    tbl, hash, member)

/**
 * rhl_for_each_rcu - iterate over rcu hash table list
 * @pos:        the &struct rhlist_head to use as a loop cursor.
 * @list:        the head of the list
 *
 * This hash chain list-traversal primitive should be used on the
 * list returned by rhltable_lookup.
 *
 * The list is NULL-terminated (there is no nulls marker here).  @pos is
 * parenthesized before ->next so that any lvalue expression can serve as
 * the cursor.
 */
#define rhl_for_each_rcu(pos, list)                                        \
        for (pos = list; pos; pos = rcu_dereference_all((pos)->next))

/**
 * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhlist_head to use as a loop cursor.
 * @list:        the head of the list
 * @member:        name of the &struct rhlist_head within the hashable struct.
 *
 * This hash chain list-traversal primitive should be used on the
 * list returned by rhltable_lookup.
 *
 * The list is NULL-terminated (there is no nulls marker here).  @pos is
 * parenthesized before ->next so that any lvalue expression can serve as
 * the cursor.
 */
#define rhl_for_each_entry_rcu(tpos, pos, list, member)                        \
        for (pos = list; pos && rht_entry(tpos, pos, member);                \
             pos = rcu_dereference_all((pos)->next))

static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
                                     const void *obj)
{
        struct rhashtable *ht = arg->ht;
        const char *ptr = obj;

        return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
}

/* Internal function, do not use.
 *
 * Looks @key up in the current bucket table and, if it is not found there,
 * in each table reachable through ->future_tbl.  Returns the rhash_head of
 * the first entry whose key compares equal, or NULL if no table holds one.
 * The compare function (params.obj_cmpfn or rhashtable_compare()) signals a
 * match by returning zero.
 */
static __always_inline struct rhash_head *__rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        struct rhash_lock_head __rcu *const *bkt;
        struct bucket_table *tbl;
        struct rhash_head *he;
        unsigned int hash;

        tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
        /* The hash depends on tbl->hash_rnd, so recompute it per table. */
        hash = rht_key_hashfn(ht, tbl, key, params);
        bkt = rht_bucket(tbl, hash);
        do {
                rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
                        /* Non-zero means "different key": keep walking. */
                        if (params.obj_cmpfn ?
                            params.obj_cmpfn(&arg, rht_obj(ht, he)) :
                            rhashtable_compare(&arg, rht_obj(ht, he)))
                                continue;
                        return he;
                }
                /* An object might have been moved to a different hash chain,
                 * while we walk along it - better check and retry.
                 */
        } while (he != RHT_NULLS_MARKER(bkt));

        /* Ensure we see any new tables. */
        smp_rmb();

        tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (unlikely(tbl))
                goto restart;

        return NULL;
}

/**
 * rhashtable_lookup - search hash table
 * @ht:                hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Hashes @key and walks the corresponding bucket chain for an entry with an
 * identical key, stopping at the first match.
 *
 * This must only be called under the RCU read lock.
 *
 * Returns the first object for which the compare function reported a match
 * (by returning zero), or NULL if there is none.
 */
static __always_inline void *rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        struct rhash_head *match = __rhashtable_lookup(ht, key, params);

        if (!match)
                return NULL;

        return rht_obj(ht, match);
}

/**
 * rhashtable_lookup_fast - search hash table, without RCU read lock
 * @ht:                hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Wraps rhashtable_lookup() in its own rcu_read_lock()/rcu_read_unlock()
 * pair, so the caller does not need to hold the RCU read lock.
 *
 * Only use this function when you have other mechanisms guaranteeing
 * that the object won't go away after the RCU read lock is released.
 *
 * Returns the first object for which the compare function reported a match
 * (by returning zero), or NULL if there is none.
 */
static __always_inline void *rhashtable_lookup_fast(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        void *found;

        rcu_read_lock();
        found = rhashtable_lookup(ht, key, params);
        rcu_read_unlock();

        return found;
}

/**
 * rhltable_lookup - search hash list table
 * @hlt:        hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Hashes @key and walks the corresponding bucket chain for an entry with an
 * identical key.  All matching entries are returned in a list, which can be
 * walked with rhl_for_each_rcu() or rhl_for_each_entry_rcu().
 *
 * This must only be called under the RCU read lock.
 *
 * Returns the list of entries that match the given key, or NULL if there
 * is no match.
 */
static __always_inline struct rhlist_head *rhltable_lookup(
        struct rhltable *hlt, const void *key,
        const struct rhashtable_params params)
{
        struct rhash_head *match = __rhashtable_lookup(&hlt->ht, key, params);

        if (!match)
                return NULL;

        return container_of(match, struct rhlist_head, rhead);
}

/* Internal function, please use rhashtable_insert_fast() instead.
 *
 * Return value:
 *  - NULL when @obj was inserted;
 *  - the existing object when an entry with the same key is already present
 *    and @rhlist is false (nothing is inserted);
 *  - ERR_PTR(-ENOMEM) when no bucket could be obtained;
 *  - ERR_PTR(-E2BIG) when rht_grow_above_max() reports the table as full;
 *  - whatever rhashtable_insert_slow() returns when the slow path is taken.
 *
 * With a NULL @key no comparison is performed, so an existing entry is
 * never reported.  With @rhlist true, an object whose key is already
 * present takes the existing entry's place in the bucket chain and the
 * existing entry becomes the next element of its rhlist.
 */
static __always_inline void *__rhashtable_insert_fast(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params, bool rhlist)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct bucket_table *tbl;
        struct rhash_head *head;
        unsigned long flags;
        unsigned int hash;
        int elasticity;
        void *data;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);
        hash = rht_head_hashfn(ht, tbl, obj, params);
        elasticity = RHT_ELASTICITY;
        bkt = rht_bucket_insert(ht, tbl, hash);
        data = ERR_PTR(-ENOMEM);
        if (!bkt)
                goto out;
        pprev = NULL;
        flags = rht_lock(tbl, bkt);

        /*
         * A future table exists: leave the insertion to the slow path.
         * The slow_path label is also jumped to from below, always with
         * the bucket lock and the RCU read lock still held.
         */
        if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
slow_path:
                rht_unlock(tbl, bkt, flags);
                rcu_read_unlock();
                return rhashtable_insert_slow(ht, key, obj);
        }

        rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *plist;
                struct rhlist_head *list;

                elasticity--;
                /* No key, or a different key: remember the link and move on. */
                if (!key ||
                    (params.obj_cmpfn ?
                     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
                        pprev = &head->next;
                        continue;
                }

                /* Same key found: report the existing object. */
                data = rht_obj(ht, head);

                if (!rhlist)
                        goto out_unlock;


                /*
                 * rhlist: @obj replaces the matching entry in the bucket
                 * chain and the old entry becomes @obj's list successor.
                 */
                list = container_of(obj, struct rhlist_head, rhead);
                plist = container_of(head, struct rhlist_head, rhead);

                RCU_INIT_POINTER(list->next, plist);
                head = rht_dereference_bucket(head->next, tbl, hash);
                RCU_INIT_POINTER(list->rhead.next, head);
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
                        rht_unlock(tbl, bkt, flags);
                } else
                        rht_assign_unlock(tbl, bkt, obj, flags);
                data = NULL;
                goto out;
        }

        /* The chain walked was at least RHT_ELASTICITY entries long. */
        if (elasticity <= 0)
                goto slow_path;

        data = ERR_PTR(-E2BIG);
        if (unlikely(rht_grow_above_max(ht, tbl)))
                goto out_unlock;

        if (unlikely(rht_grow_above_100(ht, tbl)))
                goto slow_path;

        /* Inserting at head of list makes unlocking free. */
        head = rht_ptr(bkt, tbl, hash);

        RCU_INIT_POINTER(obj->next, head);
        if (rhlist) {
                struct rhlist_head *list;

                list = container_of(obj, struct rhlist_head, rhead);
                RCU_INIT_POINTER(list->next, NULL);
        }

        atomic_inc(&ht->nelems);
        rht_assign_unlock(tbl, bkt, obj, flags);

        /* Past 75% load: schedule the deferred work item ht->run_work. */
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);

        data = NULL;
out:
        rcu_read_unlock();

        return data;

out_unlock:
        rht_unlock(tbl, bkt, flags);
        goto out;
}

/**
 * rhashtable_insert_fast - insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * No key is passed down, so no check for an existing entry with the same
 * key is made.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns zero on success, a negative errno if the insertion failed, or
 * -EEXIST if the internal helper handed back an existing object.
 */
static __always_inline int rhashtable_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *old = __rhashtable_insert_fast(ht, NULL, obj, params, false);

        if (IS_ERR(old))
                return PTR_ERR(old);

        return old ? -EEXIST : 0;
}

/**
 * rhltable_insert_key - insert object into hash list table
 * @hlt:        hash list table
 * @key:        the pointer to the key
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns PTR_ERR() of the internal helper's result: zero when it returned
 * NULL (inserted), the negative errno when it returned an ERR_PTR().
 */
static __always_inline int rhltable_insert_key(
        struct rhltable *hlt, const void *key, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        void *ret = __rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
                                             params, true);

        return PTR_ERR(ret);
}

/**
 * rhltable_insert - insert object into hash list table
 * @hlt:        hash list table
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Derives the key from the object itself (at params.key_offset) and passes
 * it to rhltable_insert_key().
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 */
static __always_inline int rhltable_insert(
        struct rhltable *hlt, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        const char *object = rht_obj(&hlt->ht, &list->rhead);

        return rhltable_insert_key(hlt, object + params.key_offset, list,
                                   params);
}

/**
 * rhashtable_lookup_insert_fast - lookup and insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * This lookup function may only be used for fixed key hash table (key_len
 * parameter set). It will BUG() if the table was set up with an
 * obj_hashfn.
 *
 * The key is read from the object itself at ht->p.key_offset.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns zero on success, -EEXIST if an object with the same key is
 * already present, or another negative errno if the insertion failed.
 */
static __always_inline int rhashtable_lookup_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        const char *object = rht_obj(ht, obj);
        void *old;

        BUG_ON(ht->p.obj_hashfn);

        old = __rhashtable_insert_fast(ht, object + ht->p.key_offset, obj,
                                       params, false);
        if (IS_ERR(old))
                return PTR_ERR(old);

        return old ? -EEXIST : 0;
}

/**
 * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Same operation as rhashtable_lookup_insert_fast(), except that the raw
 * result is passed through: the already present object if there is one,
 * NULL if there was none and the insertion succeeded, and an ERR_PTR
 * otherwise.
 */
static __always_inline void *rhashtable_lookup_get_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        const char *obj_base = rht_obj(ht, obj);
        const void *key;

        BUG_ON(ht->p.obj_hashfn);

        /* The key lives inside the object at the table's key offset. */
        key = obj_base + ht->p.key_offset;

        return __rhashtable_insert_fast(ht, key, obj, params, false);
}

/**
 * rhashtable_lookup_insert_key - search and insert object to hash table
 *                                  with explicit key
 * @ht:                hash table
 * @key:        key
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Lookups may occur in parallel with hashtable mutations and resizing.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 70%.
 *
 * Will BUG() if the table has no obj_hashfn or if @key is NULL.
 *
 * Returns zero on success, -EEXIST if the insert helper handed back an
 * existing object instead, or the negative error code carried by an
 * ERR_PTR result.
 */
static __always_inline int rhashtable_lookup_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *found;

        BUG_ON(!(ht->p.obj_hashfn && key));

        found = __rhashtable_insert_fast(ht, key, obj, params, false);
        if (IS_ERR(found))
                return PTR_ERR(found);
        if (found)
                return -EEXIST;

        return 0;
}

/**
 * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
 * @ht:                hash table
 * @key:        key
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Same operation as rhashtable_lookup_insert_key(), except that the raw
 * result is passed through: the already present object if there is one,
 * NULL if there was none and the insertion succeeded, and an ERR_PTR
 * otherwise.
 */
static __always_inline void *rhashtable_lookup_get_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *found;

        /* An explicit key requires both an obj_hashfn and a non-NULL key. */
        BUG_ON(!(ht->p.obj_hashfn && key));

        found = __rhashtable_insert_fast(ht, key, obj, params, false);

        return found;
}

/*
 * Internal function, please use rhashtable_remove_fast() instead.
 *
 * Attempts to unlink @obj from the single bucket table @tbl while holding
 * that bucket's lock.  Returns 0 if @obj was unlinked and -ENOENT if the
 * bucket does not exist or @obj was not found in it.
 *
 * Internally err == 1 marks "an entry left the bucket chain itself"; that
 * case decrements ht->nelems (and may schedule a shrink) before being
 * reported to the caller as 0.  Removals that only touch an rhlist
 * sub-list set err to 0 directly and leave ht->nelems untouched.
 */
static __always_inline int __rhashtable_remove_fast_one(
        struct rhashtable *ht, struct bucket_table *tbl,
        struct rhash_head *obj, const struct rhashtable_params params,
        bool rhlist)
{
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
        unsigned long flags;
        unsigned int hash;
        int err = -ENOENT;

        hash = rht_head_hashfn(ht, tbl, obj, params);
        bkt = rht_bucket_var(tbl, hash);
        if (!bkt)
                return -ENOENT;
        /* pprev stays NULL for as long as we are looking at the bucket head. */
        pprev = NULL;
        flags = rht_lock(tbl, bkt);

        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *list;

                list = container_of(he, struct rhlist_head, rhead);

                if (he != obj) {
                        struct rhlist_head __rcu **lpprev;

                        /* Remember the link that leads to the next chain entry. */
                        pprev = &he->next;

                        if (!rhlist)
                                continue;

                        /*
                         * rhlist mode: @obj may hang off this chain entry's
                         * sub-list.  Walk it, tracking the link to patch.
                         */
                        do {
                                lpprev = &list->next;
                                list = rht_dereference_bucket(list->next,
                                                              tbl, hash);
                        } while (list && obj != &list->rhead);

                        if (!list)
                                continue;

                        /* Found in the sub-list: bypass it there; chain unchanged. */
                        list = rht_dereference_bucket(list->next, tbl, hash);
                        RCU_INIT_POINTER(*lpprev, list);
                        err = 0;
                        break;
                }

                /*
                 * @obj is the chain entry itself.  From here on 'obj' is
                 * reused to hold whatever should take its place in the chain,
                 * initially its chain successor.
                 */
                obj = rht_dereference_bucket(obj->next, tbl, hash);
                err = 1;

                if (rhlist) {
                        /*
                         * If the removed entry has a further sub-list member,
                         * that member inherits the chain successor and becomes
                         * the replacement, so the chain keeps its length.
                         */
                        list = rht_dereference_bucket(list->next, tbl, hash);
                        if (list) {
                                RCU_INIT_POINTER(list->rhead.next, obj);
                                obj = &list->rhead;
                                err = 0;
                        }
                }

                if (pprev) {
                        /* Not the first entry: patch the predecessor's link. */
                        rcu_assign_pointer(*pprev, obj);
                        rht_unlock(tbl, bkt, flags);
                } else {
                        /* First entry: the replacement goes into the bucket head. */
                        rht_assign_unlock(tbl, bkt, obj, flags);
                }
                goto unlocked;
        }

        /* Reached when the loop ended without removing a chain entry. */
        rht_unlock(tbl, bkt, flags);
unlocked:
        if (err > 0) {
                /* A chain entry disappeared: account for it, maybe shrink. */
                atomic_dec(&ht->nelems);
                if (unlikely(ht->p.automatic_shrinking &&
                             rht_shrink_below_30(ht, tbl)))
                        schedule_work(&ht->run_work);
                err = 0;
        }

        return err;
}

/*
 * Internal function, please use rhashtable_remove_fast() instead.
 *
 * Tries the removal on ht->tbl and then on each table reachable through
 * ->future_tbl, stopping at the first table where it succeeds.  Returns
 * the result of the last attempt.
 */
static __always_inline int __rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params, bool rhlist)
{
        struct bucket_table *tbl;
        int err;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);

        /* Because we have already taken (and released) the bucket
         * lock in old_tbl, if we find that future_tbl is not yet
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
        do {
                err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
                                                   rhlist);
                if (!err)
                        break;

                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        } while (tbl);

        rcu_read_unlock();

        return err;
}

/**
 * rhashtable_remove_fast - remove object from hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Since the hash chain is single linked, the removal operation needs to
 * walk the bucket chain upon removal. The removal operation is thus
 * considerably slower if the hash table is not correctly sized.
 *
 * Will automatically shrink the table if permitted when residency drops
 * below 30%.
 *
 * Returns zero on success, -ENOENT if the entry could not be found.
 */
static __always_inline int rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        int err;

        /* Plain hash table: rhlist handling is switched off. */
        err = __rhashtable_remove_fast(ht, obj, params, false);

        return err;
}

/**
 * rhltable_remove - remove object from hash list table
 * @hlt:        hash list table
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Since the hash chain is single linked, the removal operation needs to
 * walk the bucket chain upon removal. The removal operation is thus
 * considerably slower if the hash table is not correctly sized.
 *
 * Will automatically shrink the table if permitted when residency drops
 * below 30%.
 *
 * Returns zero on success, -ENOENT if the entry could not be found.
 */
static __always_inline int rhltable_remove(
        struct rhltable *hlt, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        struct rhashtable *ht = &hlt->ht;
        struct rhash_head *obj = &list->rhead;

        /* Hash list table: rhlist handling is switched on. */
        return __rhashtable_remove_fast(ht, obj, params, true);
}

/* Internal function, please use rhashtable_replace_fast() instead */
static __always_inline int __rhashtable_replace_fast(
        struct rhashtable *ht, struct bucket_table *tbl,
        struct rhash_head *obj_old, struct rhash_head *obj_new,
        const struct rhashtable_params params)
{
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
        unsigned long flags;
        unsigned int hash;
        int err = -ENOENT;

        /* Minimally, the old and new objects must have same hash
         * (which should mean identifiers are the same).
         */
        hash = rht_head_hashfn(ht, tbl, obj_old, params);
        if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
                return -EINVAL;

        bkt = rht_bucket_var(tbl, hash);
        if (!bkt)
                return -ENOENT;

        pprev = NULL;
        flags = rht_lock(tbl, bkt);

        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                if (he != obj_old) {
                        pprev = &he->next;
                        continue;
                }

                rcu_assign_pointer(obj_new->next, obj_old->next);
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj_new);
                        rht_unlock(tbl, bkt, flags);
                } else {
                        rht_assign_unlock(tbl, bkt, obj_new, flags);
                }
                err = 0;
                goto unlocked;
        }

        rht_unlock(tbl, bkt, flags);

unlocked:
        return err;
}

/**
 * rhashtable_replace_fast - replace an object in hash table
 * @ht:                hash table
 * @obj_old:        pointer to hash head inside object being replaced
 * @obj_new:        pointer to hash head inside object which is new
 * @params:        hash table parameters
 *
 * Replacing an object doesn't affect the number of elements in the hash table
 * or bucket, so we don't need to worry about shrinking or expanding the
 * table here.
 *
 * The replacement is attempted on ht->tbl and then on each table reachable
 * through ->future_tbl until one attempt succeeds or no table is left.
 *
 * Returns zero on success, -ENOENT if the entry could not be found,
 * -EINVAL if hash is not the same for the old and new objects.
 */
static __always_inline int rhashtable_replace_fast(
        struct rhashtable *ht, struct rhash_head *obj_old,
        struct rhash_head *obj_new,
        const struct rhashtable_params params)
{
        struct bucket_table *tbl;
        int err;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);

        /* Because we have already taken (and released) the bucket
         * lock in old_tbl, if we find that future_tbl is not yet
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
        do {
                err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new,
                                                params);
                if (!err)
                        break;

                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        } while (tbl);

        rcu_read_unlock();

        return err;
}

/**
 * rhltable_walk_enter - Initialise an iterator
 * @hlt:        Table to walk over
 * @iter:        Hash table Iterator
 *
 * Prepares a walk over the hash table embedded in @hlt by forwarding to
 * rhashtable_walk_enter().
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice.  Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptable context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
static inline void rhltable_walk_enter(struct rhltable *hlt,
                                       struct rhashtable_iter *iter)
{
        struct rhashtable *ht = &hlt->ht;

        rhashtable_walk_enter(ht, iter);
}

/**
 * rhltable_free_and_destroy - free elements and destroy hash list table
 * @hlt:        the hash list table to destroy
 * @free_fn:        callback to release resources of element
 * @arg:        pointer passed to free_fn
 *
 * Forwards to rhashtable_free_and_destroy() on the table embedded in @hlt;
 * see the documentation of that function.
 */
static inline void rhltable_free_and_destroy(struct rhltable *hlt,
                                             void (*free_fn)(void *ptr,
                                                             void *arg),
                                             void *arg)
{
        struct rhashtable *ht = &hlt->ht;

        rhashtable_free_and_destroy(ht, free_fn, arg);
}

static inline void rhltable_destroy(struct rhltable *hlt)
{
        rhltable_free_and_destroy(hlt, NULL, NULL);
}

#endif /* _LINUX_RHASHTABLE_H */






























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_NULLS_H
#define _LINUX_RCULIST_NULLS_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list_nulls.h>
#include <linux/rcupdate.h>

/**
 * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Does nothing if @n is already unhashed.  Otherwise the node is unlinked
 * and its pprev pointer is cleared, so hlist_nulls_unhashed() on the node
 * returns true afterwards.  This is useful for RCU based read lockfree
 * traversal if the writer side must know if the list entry is still
 * hashed or already unhashed.
 *
 * In particular, it means that we can not poison the forward pointers
 * that may still be used for walking the hash list; only the pprev
 * pointer is zeroed.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
 * hlist_nulls_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
{
        if (hlist_nulls_unhashed(n))
                return;

        __hlist_nulls_del(n);
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * hlist_nulls_first_rcu - returns the first element of the hash list.
 * @head: the head of the list.
 *
 * Expands to the @head->first slot itself, viewed as an __rcu pointer, so
 * the result can be handed to the RCU accessors both for reading
 * (rcu_dereference_raw()) and for publishing (rcu_assign_pointer()).
 */
#define hlist_nulls_first_rcu(head) \
        (*((struct hlist_nulls_node __rcu __force **)&(head)->first))

/**
 * hlist_nulls_next_rcu - returns the element of the list after @node.
 * @node: element of the list.
 *
 * Expands to the @node->next slot itself, viewed as an __rcu pointer, so
 * the result can be handed to the RCU accessors both for reading
 * (rcu_dereference_raw()) and for publishing (rcu_assign_pointer()).
 */
#define hlist_nulls_next_rcu(node) \
        (*((struct hlist_nulls_node __rcu __force **)&(node)->next))

/**
 * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_nulls_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
{
        __hlist_nulls_del(n);
        /* Only the back pointer is poisoned; n->next is left as it is. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_nulls_add_head_rcu - add an element at the head of the hash list
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_nulls,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
                                        struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *first = h->first;

        /* Fill in both links of @n before it is published as the new head. */
        WRITE_ONCE(n->next, first);
        WRITE_ONCE(n->pprev, &h->first);
        rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
        /* The old head, if it was a real node, now hangs off @n. */
        if (!is_a_nulls(first))
                WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_nulls_add_tail_rcu - add an element at the tail of the hash list
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_nulls,
 * while permitting racing traversals.  The list is walked to find its
 * last real node; @n takes over that node's ->next value (the nulls
 * end marker) and is linked in behind it.  An empty list falls back to
 * hlist_nulls_add_head_rcu().
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
                                            struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *cur = h->first;
        struct hlist_nulls_node *tail = NULL;

        /* Note: write side code, so rcu accessors are not needed. */
        while (!is_a_nulls(cur)) {
                tail = cur;
                cur = cur->next;
        }

        if (!tail) {
                hlist_nulls_add_head_rcu(n, h);
                return;
        }

        WRITE_ONCE(n->next, tail->next);
        n->pprev = &tail->next;
        rcu_assign_pointer(hlist_nulls_next_rcu(tail), n);
}

/*
 * Turn @n into a self-contained one-node "list": pprev points at the
 * node's own next field and next is the nulls marker built from NULL.
 * After that hlist_nulls_del will work.
 */
static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
{
        n->pprev = &n->next;
        n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
}

/**
 * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head of the list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * The loop ends when @pos is a nulls marker; @tpos is only assigned for
 * real nodes.
 *
 * The barrier() is needed to make sure compiler doesn't cache first element [1],
 * as this loop can be restarted [2]
 * [1] Documentation/memory-barriers.txt around line 1533
 * [2] Documentation/RCU/rculist_nulls.rst around line 146
 */
#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)                        \
        for (({barrier();}),                                                        \
             pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));                \
                (!is_a_nulls(pos)) &&                                                \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))

/**
 * hlist_nulls_for_each_entry_safe -
 *   iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head of the list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * @pos is advanced to the following node inside the loop condition, i.e.
 * before the loop body runs, so the body never needs @tpos's own next
 * pointer to continue.  Inside the body @pos therefore refers to the node
 * after @tpos, not to @tpos itself.
 */
#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member)                \
        for (({barrier();}),                                                        \
             pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));                \
                (!is_a_nulls(pos)) &&                                                \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member);        \
                   pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
#endif
#endif





















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_NS_H
#define _LINUX_PID_NS_H

#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/idr.h>

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

struct fs_pin;

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/* modes for vm.memfd_noexec sysctl */
#define MEMFD_NOEXEC_SCOPE_EXEC                        0 /* MFD_EXEC implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL                1 /* MFD_NOEXEC_SEAL implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED        2 /* same as 1, except MFD_EXEC rejected */
#endif

/*
 * State of one PID namespace.  Namespaces are chained through ->parent;
 * pidns_memfd_noexec_scope() follows that chain until it reaches NULL.
 * The field order is randomized (__randomize_layout), so nothing may
 * depend on it.
 */
struct pid_namespace {
        struct idr idr;
        struct rcu_head rcu;
        unsigned int pid_allocated;
        struct task_struct *child_reaper;
        struct kmem_cache *pid_cachep;
        unsigned int level;        /* presumably nesting depth, cf. MAX_PID_NS_LEVEL - TODO confirm */
        int pid_max;
        struct pid_namespace *parent;        /* next namespace up the chain */
#ifdef CONFIG_BSD_PROCESS_ACCT
        struct fs_pin *bacct;
#endif
        struct user_namespace *user_ns;
        struct ucounts *ucounts;
        int reboot;        /* group exit code if this pidns was rebooted */
        struct ns_common ns;        /* embedded; to_pid_ns() maps it back to this struct */
        struct work_struct        work;
#ifdef CONFIG_SYSCTL
        struct ctl_table_set        set;
        struct ctl_table_header *sysctls;
#if defined(CONFIG_MEMFD_CREATE)
        /* Read with READ_ONCE() by pidns_memfd_noexec_scope(). */
        int memfd_noexec_scope;
#endif
#endif
} __randomize_layout;

extern struct pid_namespace init_pid_ns;

#define PIDNS_ADDING (1U << 31)

#ifdef CONFIG_PID_NS
/* Map a pointer to the embedded ns_common back to its pid_namespace. */
static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
{
        struct pid_namespace *pid_ns;

        pid_ns = container_of(ns, struct pid_namespace, ns);

        return pid_ns;
}

/*
 * Take a reference on @ns and return it.  No reference is taken when
 * @ns is init_pid_ns; the pointer is returned unchanged in both cases.
 */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        if (ns == &init_pid_ns)
                return ns;

        ns_ref_inc(ns);

        return ns;
}

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/*
 * Return the largest memfd_noexec_scope value found on @ns or any of its
 * ancestors, starting from MEMFD_NOEXEC_SCOPE_EXEC.  A NULL @ns yields
 * MEMFD_NOEXEC_SCOPE_EXEC.
 */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        struct pid_namespace *cur = ns;
        int strictest = MEMFD_NOEXEC_SCOPE_EXEC;

        while (cur) {
                strictest = max(strictest, READ_ONCE(cur->memfd_noexec_scope));
                cur = cur->parent;
        }

        return strictest;
}
#else
/*
 * Variant used when CONFIG_SYSCTL and CONFIG_MEMFD_CREATE are not both
 * enabled: always reports 0 (the MEMFD_NOEXEC_SCOPE_* names are not
 * defined in this configuration).
 */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        return 0;
}
#endif

extern struct pid_namespace *copy_pid_ns(u64 flags,
        struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
extern void put_pid_ns(struct pid_namespace *ns);

extern bool pidns_is_ancestor(struct pid_namespace *child,
                              struct pid_namespace *ancestor);

#else /* !CONFIG_PID_NS */
#include <linux/err.h>

/* !CONFIG_PID_NS: no reference counting, @ns is returned unchanged. */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        return ns;
}

/* !CONFIG_PID_NS: always reports 0. */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        return 0;
}

/*
 * !CONFIG_PID_NS: a request for a new PID namespace (CLONE_NEWPID set in
 * @flags) fails with ERR_PTR(-EINVAL); otherwise @ns is returned as is.
 */
static inline struct pid_namespace *copy_pid_ns(u64 flags,
        struct user_namespace *user_ns, struct pid_namespace *ns)
{
        if (flags & CLONE_NEWPID)
                return ERR_PTR(-EINVAL);

        return ns;
}

/* !CONFIG_PID_NS: nothing to release. */
static inline void put_pid_ns(struct pid_namespace *ns)
{
}

/* !CONFIG_PID_NS: reaching this stub is treated as a bug. */
static inline void zap_pid_ns_processes(struct pid_namespace *ns)
{
        BUG();
}

/* !CONFIG_PID_NS: does nothing and returns 0. */
static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
        return 0;
}

/* !CONFIG_PID_NS: always reports false. */
static inline bool pidns_is_ancestor(struct pid_namespace *child,
                                     struct pid_namespace *ancestor)
{
        return false;
}
#endif /* CONFIG_PID_NS */

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);
int register_pidns_sysctls(struct pid_namespace *pidns);
void unregister_pidns_sysctls(struct pid_namespace *pidns);

/* True when the active PID namespace of @tsk is init_pid_ns. */
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{
        struct pid_namespace *active = task_active_pid_ns(tsk);

        return active == &init_pid_ns;
}

#endif /* _LINUX_PID_NS_H */

















































































































































































































































































































































































    4 
























































    4 


    4 



    4 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

/*
 * Out-of-line, exported wrapper that fires the mmap_lock_start_locking
 * tracepoint with @mm and @write.
 */
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

/*
 * Out-of-line, exported wrapper that fires the mmap_lock_acquire_returned
 * tracepoint with @mm, @write and @success.
 */
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success)
{
        trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

/*
 * Out-of-line, exported wrapper that fires the mmap_lock_released
 * tracepoint with @mm and @write.
 */
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
/*
 * Add VMA_LOCK_OFFSET to vma->vm_refcnt and wait, uninterruptibly, until
 * the count has dropped to the expected value: VMA_LOCK_OFFSET alone when
 * @detaching, VMA_LOCK_OFFSET + 1 otherwise.
 *
 * Returns false, without waiting, when vm_refcnt was zero so the offset
 * could not be added; returns true once the wait has completed.
 */
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
        /* An attached vma keeps one additional refcnt. */
        unsigned int tgt_refcnt = detaching ? VMA_LOCK_OFFSET
                                            : VMA_LOCK_OFFSET + 1;

        /*
         * If vma is detached then only vma_mark_attached() can raise the
         * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
         */
        if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
                return false;

        rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
        rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
                           refcount_read(&vma->vm_refcnt) == tgt_refcnt,
                           TASK_UNINTERRUPTIBLE);
        lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

        return true;
}

/*
 * Counterpart of __vma_enter_locked(): subtract VMA_LOCK_OFFSET from
 * vma->vm_refcnt again.  *@detached is set to true when that subtraction
 * brought the count down to zero.
 */
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
        *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
        rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

/*
 * Store @mm_lock_seq into vma->vm_lock_seq.  For an attached vma the store
 * happens between __vma_enter_locked() and __vma_exit_locked(); for a vma
 * that is not attached only the store is performed.
 */
void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
        bool detached;
        bool entered;

        /*
         * __vma_enter_locked() returns false immediately if the vma is not
         * attached, otherwise it waits until refcnt is indicating that vma
         * is attached with no readers.
         */
        entered = __vma_enter_locked(vma, false);

        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

        if (!entered)
                return;

        __vma_exit_locked(vma, &detached);
        WARN_ON_ONCE(detached); /* vma should remain attached */
}
EXPORT_SYMBOL_GPL(__vma_start_write);

/*
 * Drop the "attached" reference of a write-locked vma and make sure no
 * reader reference is left behind before returning.
 */
void vma_mark_detached(struct vm_area_struct *vma)
{
        bool detached;

        vma_assert_write_locked(vma);
        vma_assert_attached(vma);

        /*
         * We are the only writer, so vma_refcount_put() is not needed.
         * Normally this decrement reaches zero: the vma is already
         * write-locked, so a reader can only raise vm_refcnt briefly before
         * it checks vm_lock_seq, sees the lock and drops the count again.
         */
        if (likely(refcount_dec_and_test(&vma->vm_refcnt)))
                return;

        /* A reader is in that narrow window: wait until it is gone. */
        if (!__vma_enter_locked(vma, true))
                return;

        __vma_exit_locked(vma, &detached);
        WARN_ON_ONCE(!detached);
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 * False locked result is possible if mm_lock_seq overflows or if vma gets
 * reused and attached to a different mm before we lock it.
 *
 * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN) if
 * the vma got detached (its vm_refcnt was observed as zero).
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 *            (any return value other than the vma itself) IT IS RELEASED.
 *            The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
{
        struct mm_struct *other_mm;
        int oldcnt;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
        /*
         * Check before locking. A race might cause false locked result.
         * We can use READ_ONCE() for the mm_lock_seq here, and don't need
         * ACQUIRE semantics, because this is just a lockless check whose result
         * we don't rely on for anything - the mm_lock_seq read against which we
         * need ordering is below.
         */
        if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
                vma = NULL;
                goto err;
        }

        /*
         * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
         * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
         * Acquire fence is required here to avoid reordering against later
         * vm_lock_seq check and checks inside lock_vma_under_rcu().
         */
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VMA_REF_LIMIT))) {
                /* return EAGAIN if vma got detached from under us */
                vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
                goto err;
        }

        rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

        /* The vma may have been reused by a different mm since the lookup. */
        if (unlikely(vma->vm_mm != mm))
                goto err_unstable;

        /*
         * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
         * False unlocked result is impossible because we modify and check
         * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
         * modification invalidates all existing locks.
         *
         * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
         * racing with vma_end_write_all(), we only start reading from the VMA
         * after it has been unlocked.
         * This pairs with RELEASE semantics in vma_end_write_all().
         */
        if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
                vma_refcount_put(vma);
                vma = NULL;
                goto err;
        }

        /* Success: the reference is kept and the RCU read lock is still held. */
        return vma;
err:
        /* Failure without a reference held: vma is NULL or ERR_PTR(-EAGAIN). */
        rcu_read_unlock();

        return vma;
err_unstable:
        /*
         * If vma got attached to another mm from under us, that mm is not
         * stable and can be freed in the narrow window after vma->vm_refcnt
         * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
         * releasing vma->vm_refcnt.
         */
        other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

        /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
        rcu_read_unlock();
        mmgrab(other_mm);
        vma_refcount_put(vma);
        mmdrop(other_mm);

        return NULL;
}

/*
 * Find the VMA covering @address in @mm and read-lock it, using only RCU
 * protection for the lookup. The returned VMA is guaranteed to be stable
 * and not isolated. NULL is returned when no VMA is found, when the VMA is
 * being modified, or when the locked VMA does not contain @address.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address)
{
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

        for (;;) {
                rcu_read_lock();
                vma = mas_walk(&mas);
                if (!vma) {
                        rcu_read_unlock();
                        goto inval;
                }

                vma = vma_start_read(mm, vma);
                if (!IS_ERR_OR_NULL(vma))
                        break;

                /* Anything but -EAGAIN means we failed to lock the VMA. */
                if (PTR_ERR(vma) != -EAGAIN)
                        goto inval;

                /*
                 * The VMA got isolated after we found it: the area was
                 * replaced with another one, so look it up again.
                 */
                count_vm_vma_lock_event(VMA_LOCK_MISS);
                mas_set(&mas, address);
        }

        /*
         * We now hold a stable reference: the VMA is locked and has not been
         * isolated, so its fields can be accessed without worrying about
         * which of them are safe for RCU readers.
         */
        rcu_read_unlock();

        /* Make sure the VMA we locked is the right one. */
        if (likely(address >= vma->vm_start && address < vma->vm_end))
                return vma;

        vma_end_read(vma);

inval:
        count_vm_vma_lock_event(VMA_LOCK_ABORT);
        return NULL;
}

/*
 * Slow path for lock_next_vma(): take mmap_lock for reading (killable),
 * look up the next vma starting from @from_addr and read-lock it.
 *
 * Returns the locked vma, NULL if there is no further vma, ERR_PTR(-EAGAIN)
 * if vma_start_read_locked() fails, or the ERR_PTR()-encoded error from
 * mmap_read_lock_killable().
 */
static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
                                                            struct vma_iterator *vmi,
                                                            unsigned long from_addr)
{
        struct vm_area_struct *next;
        int err = mmap_read_lock_killable(mm);

        if (err)
                return ERR_PTR(err);

        /* Repeat the lookup from the last position, now under mmap_read_lock. */
        vma_iter_set(vmi, from_addr);
        next = vma_next(vmi);

        /* Failure here is the very unlikely vma->vm_refcnt overflow case. */
        if (next && unlikely(!vma_start_read_locked(next)))
                next = ERR_PTR(-EAGAIN);

        mmap_read_unlock(mm);

        return next;
}

/*
 * Advance @vmi to the next vma of @mm and read-lock it.
 *
 * Must be called with the RCU read lock held; the RCU read lock is also held
 * on return (it is re-taken on the paths that had to drop it).
 *
 * Returns the read-locked vma, NULL when the iterator yields no further vma,
 * or whatever lock_next_vma_under_mmap_lock() returns when the lockless
 * attempt has to fall back to mmap_lock (which may be an ERR_PTR() value).
 */
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
                                     struct vma_iterator *vmi,
                                     unsigned long from_addr)
{
        struct vm_area_struct *vma;
        unsigned int mm_wr_seq;
        bool mmap_unlocked;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
        /* Start mmap_lock speculation in case we need to verify the vma later */
        mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
        vma = vma_next(vmi);
        if (!vma)
                return NULL;

        /* On any failure vma_start_read() has already dropped the RCU read lock. */
        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /*
                 * Retry immediately if the vma gets detached from under us.
                 * Infinite loop should not happen because the vma we find will
                 * have to be constantly knocked out from under us.
                 */
                if (PTR_ERR(vma) == -EAGAIN) {
                        /* reset to search from the last address */
                        rcu_read_lock();
                        vma_iter_set(vmi, from_addr);
                        goto retry;
                }

                goto fallback;
        }

        /* Verify the vma is not behind the last search position. */
        if (unlikely(from_addr >= vma->vm_end))
                goto fallback_unlock;

        /*
         * vma can be ahead of the last search position but we need to verify
         * it was not shrunk after we found it and another vma has not been
         * installed ahead of it. Otherwise we might observe a gap that should
         * not be there.
         */
        if (from_addr < vma->vm_start) {
                /* Verify only if the address space might have changed since vma lookup. */
                if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
                        vma_iter_set(vmi, from_addr);
                        if (vma != vma_next(vmi))
                                goto fallback_unlock;
                }
        }

        return vma;

fallback_unlock:
        /* We hold both the RCU read lock and the vma read lock here; drop both. */
        rcu_read_unlock();
        vma_end_read(vma);
fallback:
        /* Reached with the RCU read lock not held. */
        vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
        rcu_read_lock();
        /* Reinitialize the iterator after re-entering rcu read section */
        vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

        return vma;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

/*
 * Take mmap_lock for reading on behalf of a fault handler.
 *
 * The trylock is attempted first. If it fails and the fault came from kernel
 * mode (@regs set and !user_mode(@regs)), we only go on to block when the
 * faulting instruction has an exception table entry. The blocking
 * acquisition is killable.
 *
 * Returns true with the lock held for reading, false otherwise.
 */
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        if (likely(mmap_read_trylock(mm)))
                return true;

        if (regs && !user_mode(regs) &&
            !search_exception_tables(exception_ip(regs)))
                return false;

        return mmap_read_lock_killable(mm) == 0;
}

/*
 * Placeholder for an atomic read -> write upgrade of mmap_lock.
 *
 * The operation does not exist yet, so this always reports failure. It
 * should be easy enough to add: it is basically an
 *    atomic_long_try_cmpxchg_acquire()
 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but it also needs the
 * proper lockdep magic etc.
 */
static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
        return false;
}

/*
 * Swap a held mmap read lock for the write lock, non-atomically.
 *
 * The read lock is always released first. For a kernel-mode fault (@regs set
 * and !user_mode(@regs)) whose instruction has no exception table entry we
 * give up at that point. Otherwise the write lock is taken killably.
 *
 * Returns true with the write lock held, false with no lock held.
 */
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        mmap_read_unlock(mm);

        if (regs && !user_mode(regs) &&
            !search_exception_tables(exception_ip(regs)))
                return false;

        return mmap_write_lock_killable(mm) == 0;
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 *
 * Returns the vma covering @addr with mmap_lock held for reading
 * (a write lock taken for stack expansion is downgraded before
 * returning), or NULL with mmap_lock not held.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;

        /* Fast path: a vma already covers addr; return with the read lock held. */
        vma = find_vma(mm, addr);
        if (likely(vma && (vma->vm_start <= addr)))
                return vma;

        /*
         * Well, dang. We might still be successful, but only
         * if we can extend a vma to do so.
         */
        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /*
         * We can try to upgrade the mmap lock atomically,
         * in which case we can continue to use the vma
         * we already looked up.
         *
         * Otherwise we'll have to drop the mmap lock and
         * re-take it, and also look up the vma again,
         * re-checking it.
         */
        if (!mmap_upgrade_trylock(mm)) {
                /* On failure the helper has already dropped the read lock. */
                if (!upgrade_mmap_lock_carefully(mm, regs))
                        return NULL;

                vma = find_vma(mm, addr);
                if (!vma)
                        goto fail;
                if (vma->vm_start <= addr)
                        goto success;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto fail;
        }

        /* mmap_lock is held for writing from here on. */
        if (expand_stack_locked(vma, addr))
                goto fail;

success:
        mmap_write_downgrade(mm);
        return vma;

fail:
        mmap_write_unlock(mm);
        return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
/*
 * No-MMU variant: take mmap_lock for reading and look up the vma containing
 * @addr. Returns the vma with the lock held, or NULL with the lock released.
 * @regs is not used.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *found;

        mmap_read_lock(mm);
        found = vma_lookup(mm, addr);
        if (found)
                return found;

        mmap_read_unlock(mm);
        return NULL;
}

#endif /* CONFIG_MMU */




































































































    3 






   20 

























































































   19 
















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Hash algorithms.
 * 
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_INTERNAL_HASH_H
#define _CRYPTO_INTERNAL_HASH_H

#include <crypto/algapi.h>
#include <crypto/hash.h>

/*
 * Hash-algorithm specific flag bits. crypto_hash_no_export_core() tests
 * CRYPTO_AHASH_ALG_NO_EXPORT_CORE against the algorithm's cra_flags;
 * presumably the other bits live in the same field (confirm against users).
 */

/* Set this bit to handle partial blocks in the API. */
#define CRYPTO_AHASH_ALG_BLOCK_ONLY        0x01000000

/* Set this bit if final requires at least one byte. */
#define CRYPTO_AHASH_ALG_FINAL_NONZERO        0x02000000

/* Set this bit if finup can deal with multiple blocks. */
#define CRYPTO_AHASH_ALG_FINUP_MAX        0x04000000

/* This bit is set by the Crypto API if export_core is not supported. */
#define CRYPTO_AHASH_ALG_NO_EXPORT_CORE        0x08000000

/*
 * Declare an on-stack fallback request.
 *
 * Expands to two declarations: a CRYPTO_MINALIGN_ATTR char buffer called
 * __<name>_req, sized sizeof(struct ahash_request) + MAX_SYNC_HASH_REQSIZE,
 * and a pointer <name> initialised by ahash_fbreq_on_stack_init() from that
 * buffer and the existing request @req.
 *
 * Because it expands to declarations it must be used where declarations are
 * allowed.
 */
#define HASH_FBREQ_ON_STACK(name, req) \
        char __##name##_req[sizeof(struct ahash_request) + \
                            MAX_SYNC_HASH_REQSIZE] CRYPTO_MINALIGN_ATTR; \
        struct ahash_request *name = ahash_fbreq_on_stack_init( \
                __##name##_req, (req))

struct ahash_request;
struct scatterlist;

/*
 * Iteration state used by crypto_hash_walk_first(), crypto_hash_walk_done()
 * and crypto_hash_walk_last().
 *
 * NOTE(review): the field descriptions below are inferred from the field
 * names and from crypto_hash_walk_last(); confirm against the
 * implementation of the walk functions.
 */
struct crypto_hash_walk {
        /* presumably the currently mapped chunk of input data */
        const char *data;

        /* presumably the offset of the current chunk within its page/entry */
        unsigned int offset;
        unsigned int flags;

        /* presumably the page backing the current chunk */
        struct page *pg;
        /* crypto_hash_walk_last() requires this to be zero (together with total) */
        unsigned int entrylen;

        /* crypto_hash_walk_last() requires this to be zero (together with entrylen) */
        unsigned int total;
        /* presumably the scatterlist entry currently being walked */
        struct scatterlist *sg;
};

/*
 * Template instance of an ahash algorithm.
 *
 * The anonymous union lets the same storage be viewed either as a full
 * struct ahash_alg (alg) or as a struct crypto_instance (s.base): the head
 * padding is exactly offsetof(struct ahash_alg, halg.base) bytes, so s.base
 * starts at the same offset as alg.halg.base.
 */
struct ahash_instance {
        /* Destructor callback for this instance. */
        void (*free)(struct ahash_instance *inst);
        union {
                struct {
                        /* Padding up to the offset of alg.halg.base. */
                        char head[offsetof(struct ahash_alg, halg.base)];
                        struct crypto_instance base;
                } s;
                struct ahash_alg alg;
        };
};

/*
 * Template instance of an shash algorithm.
 *
 * The anonymous union lets the same storage be viewed either as a full
 * struct shash_alg (alg) or as a struct crypto_instance (s.base): the head
 * padding is exactly offsetof(struct shash_alg, base) bytes, so s.base
 * starts at the same offset as alg.base.
 */
struct shash_instance {
        /* Destructor callback for this instance. */
        void (*free)(struct shash_instance *inst);
        union {
                struct {
                        /* Padding up to the offset of alg.base. */
                        char head[offsetof(struct shash_alg, base)];
                        struct crypto_instance base;
                } s;
                struct shash_alg alg;
        };
};

/*
 * Typed wrapper around a generic crypto_spawn, used by the ahash spawn
 * helpers in this header (crypto_grab_ahash(), crypto_drop_ahash(),
 * crypto_spawn_ahash(), crypto_spawn_ahash_alg()).
 */
struct crypto_ahash_spawn {
        struct crypto_spawn base;
};

/*
 * Typed wrapper around a generic crypto_spawn, used by the shash spawn
 * helpers in this header (crypto_grab_shash(), crypto_drop_shash(),
 * crypto_spawn_shash(), crypto_spawn_shash_alg()).
 */
struct crypto_shash_spawn {
        struct crypto_spawn base;
};

int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err);
int crypto_hash_walk_first(struct ahash_request *req,
                           struct crypto_hash_walk *walk);

/* Return 1 when both walk->entrylen and walk->total are zero, 0 otherwise. */
static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk)
{
        return walk->entrylen == 0 && walk->total == 0;
}

int crypto_register_ahash(struct ahash_alg *alg);
void crypto_unregister_ahash(struct ahash_alg *alg);
int crypto_register_ahashes(struct ahash_alg *algs, int count);
void crypto_unregister_ahashes(struct ahash_alg *algs, int count);
int ahash_register_instance(struct crypto_template *tmpl,
                            struct ahash_instance *inst);
void ahash_free_singlespawn_instance(struct ahash_instance *inst);

int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
                    unsigned int keylen);

/* True unless the algorithm's setkey hook is the shash_no_setkey() stub. */
static inline bool crypto_shash_alg_has_setkey(struct shash_alg *alg)
{
        if (alg->setkey == shash_no_setkey)
                return false;

        return true;
}

bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg);

/*
 * True when the shash algorithm has a real setkey hook and is not flagged
 * CRYPTO_ALG_OPTIONAL_KEY.
 */
static inline bool crypto_shash_alg_needs_key(struct shash_alg *alg)
{
        if (!crypto_shash_alg_has_setkey(alg))
                return false;

        return (alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY) == 0;
}

/*
 * True when crypto_hash_alg_has_setkey() holds for @alg and the algorithm is
 * not flagged CRYPTO_ALG_OPTIONAL_KEY.
 */
static inline bool crypto_hash_alg_needs_key(struct hash_alg_common *alg)
{
        if (!crypto_hash_alg_has_setkey(alg))
                return false;

        return (alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY) == 0;
}

/* True when the algorithm behind @tfm has CRYPTO_AHASH_ALG_NO_EXPORT_CORE set. */
static inline bool crypto_hash_no_export_core(struct crypto_ahash *tfm)
{
        return (crypto_hash_alg_common(tfm)->base.cra_flags &
                CRYPTO_AHASH_ALG_NO_EXPORT_CORE) != 0;
}

int crypto_grab_ahash(struct crypto_ahash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask);

/* Hand the generic spawn embedded in @spawn to crypto_drop_spawn(). */
static inline void crypto_drop_ahash(struct crypto_ahash_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        crypto_drop_spawn(base);
}

/* Return the hash_alg_common wrapping the algorithm recorded in @spawn. */
static inline struct hash_alg_common *crypto_spawn_ahash_alg(
        struct crypto_ahash_spawn *spawn)
{
        struct crypto_alg *alg = spawn->base.alg;

        return __crypto_hash_alg_common(alg);
}

int crypto_register_shash(struct shash_alg *alg);
void crypto_unregister_shash(struct shash_alg *alg);
int crypto_register_shashes(struct shash_alg *algs, int count);
void crypto_unregister_shashes(struct shash_alg *algs, int count);
int shash_register_instance(struct crypto_template *tmpl,
                            struct shash_instance *inst);
void shash_free_singlespawn_instance(struct shash_instance *inst);

int crypto_grab_shash(struct crypto_shash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask);

/* Hand the generic spawn embedded in @spawn to crypto_drop_spawn(). */
static inline void crypto_drop_shash(struct crypto_shash_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        crypto_drop_spawn(base);
}

/* Return the shash_alg wrapping the algorithm recorded in @spawn. */
static inline struct shash_alg *crypto_spawn_shash_alg(
        struct crypto_shash_spawn *spawn)
{
        struct crypto_alg *alg = spawn->base.alg;

        return __crypto_shash_alg(alg);
}

int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc);
int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc);
int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc);

/* Return the context area of the crypto_tfm underlying @tfm. */
static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_ctx(base);
}

/* Like crypto_ahash_ctx(), but goes through crypto_tfm_ctx_dma(). */
static inline void *crypto_ahash_ctx_dma(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = crypto_ahash_tfm(tfm);

        return crypto_tfm_ctx_dma(base);
}

/* Map a generic crypto_alg to the ahash_alg that embeds it (via halg). */
static inline struct ahash_alg *__crypto_ahash_alg(struct crypto_alg *alg)
{
        struct hash_alg_common *common = __crypto_hash_alg_common(alg);

        return container_of(common, struct ahash_alg, halg);
}

/* Return the ahash_alg whose halg member is the common alg of @hash. */
static inline struct ahash_alg *crypto_ahash_alg(struct crypto_ahash *hash)
{
        struct hash_alg_common *common = crypto_hash_alg_common(hash);

        return container_of(common, struct ahash_alg, halg);
}

/* Record @size as the statesize of @tfm. */
static inline void crypto_ahash_set_statesize(struct crypto_ahash *tfm,
                                              unsigned int size)
{
        tfm->statesize = size;
}

/* Record @reqsize as the reqsize of @tfm. */
static inline void crypto_ahash_set_reqsize(struct crypto_ahash *tfm,
                                            unsigned int reqsize)
{
        tfm->reqsize = reqsize;
}

static inline bool crypto_ahash_tested(struct crypto_ahash *tfm)
{
        struct crypto_tfm *tfm_base = crypto_ahash_tfm(tfm);

        return tfm_base->__crt_alg->cra_flags & CRYPTO_ALG_TESTED;
}

/*
 * Set the reqsize of @ahash to @reqsize plus extra room derived from the DMA
 * alignment: crypto_dma_align() with the bits below
 * crypto_tfm_ctx_alignment() masked off. ahash_request_ctx_dma() aligns the
 * request context pointer accordingly.
 */
static inline void crypto_ahash_set_reqsize_dma(struct crypto_ahash *ahash,
                                                unsigned int reqsize)
{
        unsigned int pad = crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1);

        ahash->reqsize = reqsize + pad;
}

/* Return the crypto_instance view (s.base) of an ahash instance. */
static inline struct crypto_instance *ahash_crypto_instance(
        struct ahash_instance *inst)
{
        struct crypto_instance *base = &inst->s.base;

        return base;
}

/* Inverse of ahash_crypto_instance(): recover the enclosing ahash_instance. */
static inline struct ahash_instance *ahash_instance(
        struct crypto_instance *inst)
{
        struct ahash_instance *outer;

        outer = container_of(inst, struct ahash_instance, s.base);
        return outer;
}

/* Return the ahash_instance that the algorithm of @ahash belongs to. */
static inline struct ahash_instance *ahash_alg_instance(
        struct crypto_ahash *ahash)
{
        struct crypto_instance *inst = crypto_tfm_alg_instance(&ahash->base);

        return ahash_instance(inst);
}

/* Return the context area of the crypto_instance embedded in @inst. */
static inline void *ahash_instance_ctx(struct ahash_instance *inst)
{
        struct crypto_instance *base = ahash_crypto_instance(inst);

        return crypto_instance_ctx(base);
}

/*
 * Return the request context of @req aligned to crypto_dma_align(). When the
 * DMA alignment does not exceed crypto_tfm_ctx_alignment(), the pointer is
 * aligned to 1, i.e. returned as is.
 */
static inline void *ahash_request_ctx_dma(struct ahash_request *req)
{
        unsigned int dma_align = crypto_dma_align();
        unsigned int align = dma_align > crypto_tfm_ctx_alignment() ? dma_align : 1;

        return PTR_ALIGN(ahash_request_ctx(req), align);
}

/* Complete @req with status @err via crypto_request_complete(). */
static inline void ahash_request_complete(struct ahash_request *req, int err)
{
        struct crypto_async_request *base = &req->base;

        crypto_request_complete(base, err);
}

/* Return the flags of @req with the CRYPTO_AHASH_REQ_PRIVATE bits cleared. */
static inline u32 ahash_request_flags(struct ahash_request *req)
{
        u32 flags = crypto_request_flags(&req->base);

        return flags & ~CRYPTO_AHASH_REQ_PRIVATE;
}

/* Return what crypto_spawn_tfm2() produces for @spawn, as a crypto_ahash. */
static inline struct crypto_ahash *crypto_spawn_ahash(
        struct crypto_ahash_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        return crypto_spawn_tfm2(base);
}

/* Enqueue @request on @queue; returns crypto_enqueue_request()'s result. */
static inline int ahash_enqueue_request(struct crypto_queue *queue,
                                             struct ahash_request *request)
{
        struct crypto_async_request *base = &request->base;

        return crypto_enqueue_request(queue, base);
}

/* Dequeue the next request from @queue and cast it to an ahash_request. */
static inline struct ahash_request *ahash_dequeue_request(
        struct crypto_queue *queue)
{
        struct crypto_async_request *base = crypto_dequeue_request(queue);

        return ahash_request_cast(base);
}

/* Return the context area of the crypto_tfm embedded in @tfm. */
static inline void *crypto_shash_ctx(struct crypto_shash *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return crypto_tfm_ctx(base);
}

/* Return the crypto_instance view (s.base) of an shash instance. */
static inline struct crypto_instance *shash_crypto_instance(
        struct shash_instance *inst)
{
        struct crypto_instance *base = &inst->s.base;

        return base;
}

/* Inverse of shash_crypto_instance(): recover the enclosing shash_instance. */
static inline struct shash_instance *shash_instance(
        struct crypto_instance *inst)
{
        struct shash_instance *outer;

        outer = container_of(inst, struct shash_instance, s.base);
        return outer;
}

/* Return the shash_instance that the algorithm of @shash belongs to. */
static inline struct shash_instance *shash_alg_instance(
        struct crypto_shash *shash)
{
        struct crypto_instance *inst = crypto_tfm_alg_instance(&shash->base);

        return shash_instance(inst);
}

/* Return the context area of the crypto_instance embedded in @inst. */
static inline void *shash_instance_ctx(struct shash_instance *inst)
{
        struct crypto_instance *base = shash_crypto_instance(inst);

        return crypto_instance_ctx(base);
}

/* Return what crypto_spawn_tfm2() produces for @spawn, as a crypto_shash. */
static inline struct crypto_shash *crypto_spawn_shash(
        struct crypto_shash_spawn *spawn)
{
        struct crypto_spawn *base = &spawn->base;

        return crypto_spawn_tfm2(base);
}

/* Recover the crypto_shash whose base member is @tfm. */
static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm)
{
        struct crypto_shash *shash;

        shash = container_of(tfm, struct crypto_shash, base);
        return shash;
}

/* True when @req has CRYPTO_AHASH_REQ_VIRT set in its flags. */
static inline bool ahash_request_isvirt(struct ahash_request *req)
{
        return (req->base.flags & CRYPTO_AHASH_REQ_VIRT) != 0;
}

/* Forward to crypto_tfm_req_virt() for the crypto_tfm embedded in @tfm. */
static inline bool crypto_ahash_req_virt(struct crypto_ahash *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return crypto_tfm_req_virt(base);
}

static inline struct crypto_ahash *crypto_ahash_fb(struct crypto_ahash *tfm)
{
        return __crypto_ahash_cast(crypto_ahash_tfm(tfm)->fb);
}

/*
 * Initialise the request stored in @buf as a copy of @old that targets the
 * fb transform of @old's tfm (see crypto_ahash_fb()).
 *
 * No completion callback is installed. The CRYPTO_AHASH_REQ_PRIVATE flag
 * bits, src, result and nbytes are taken over from @old. Returns @buf as an
 * ahash_request.
 */
static inline struct ahash_request *ahash_fbreq_on_stack_init(
        char *buf, struct ahash_request *old)
{
        struct ahash_request *fbreq = (void *)buf;
        struct crypto_ahash *fb = crypto_ahash_fb(crypto_ahash_reqtfm(old));

        crypto_stack_request_init(&fbreq->base, crypto_ahash_tfm(fb));
        ahash_request_set_callback(fbreq, ahash_request_flags(old), NULL, NULL);

        /* Replace the private flag bits with the ones from the old request. */
        fbreq->base.flags = (fbreq->base.flags & ~CRYPTO_AHASH_REQ_PRIVATE) |
                            (old->base.flags & CRYPTO_AHASH_REQ_PRIVATE);

        fbreq->nbytes = old->nbytes;
        fbreq->result = old->result;
        fbreq->src = old->src;

        return fbreq;
}

/*
 * State size of a block-only algorithm without its partial block:
 * statesize minus (blocksize + 1).
 */
static inline unsigned int crypto_shash_coresize(struct crypto_shash *tfm)
{
        unsigned int partial = crypto_shash_blocksize(tfm) + 1;

        return crypto_shash_statesize(tfm) - partial;
}

/*
 * Wipe the on-stack buffer __<name>_req with memzero_explicit(). The buffer
 * name matches the one declared by HASH_FBREQ_ON_STACK(name, ...).
 *
 * This can only be used if the request was never cloned.
 */
#define HASH_REQUEST_ZERO(name) \
        memzero_explicit(__##name##_req, sizeof(__##name##_req))

/**
 * crypto_ahash_export_core() - extract core state for message digest
 * @req: reference to the ahash_request handle whose state is exported
 * @out: output buffer of sufficient size that can hold the hash state
 *
 * Export the hash state without the partial block buffer.
 *
 * Context: Softirq or process context.
 * Return: 0 if the export creation was successful; < 0 if an error occurred
 */
int crypto_ahash_export_core(struct ahash_request *req, void *out);

/**
 * crypto_ahash_import_core() - import core state
 * @req: reference to ahash_request handle the state is imported into
 * @in: buffer holding the state
 *
 * Import the hash state without the partial block buffer.
 *
 * Context: Softirq or process context.
 * Return: 0 if the import was successful; < 0 if an error occurred
 */
int crypto_ahash_import_core(struct ahash_request *req, const void *in);

/**
 * crypto_shash_export_core() - extract core state for message digest
 * @desc: reference to the operational state handle whose state is exported
 * @out: output buffer of sufficient size that can hold the hash state
 *
 * Export the hash state without the partial block buffer.
 *
 * Context: Softirq or process context.
 * Return: 0 if the export creation was successful; < 0 if an error occurred
 */
int crypto_shash_export_core(struct shash_desc *desc, void *out);

/**
 * crypto_shash_import_core() - import core state
 * @desc: reference to the operational state handle the state is imported into
 * @in: buffer holding the state
 *
 * Import the hash state without the partial block buffer.
 *
 * Context: Softirq or process context.
 * Return: 0 if the import was successful; < 0 if an error occurred
 */
int crypto_shash_import_core(struct shash_desc *desc, const void *in);

#endif        /* _CRYPTO_INTERNAL_HASH_H */






































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_64_H
#define _ASM_X86_PGTABLE_64_H

#include <linux/const.h>
#include <asm/pgtable_64_types.h>

#ifndef __ASSEMBLER__

/*
 * This file contains the functions and defines necessary to modify and use
 * the x86-64 page table tree.
 */
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
#include <asm/fixmap.h>

extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];

#define swapper_pg_dir init_top_pgt

extern void paging_init(void);
/* Intentionally empty in this header: there is nothing to do here. */
static inline void sync_initial_page_table(void)
{
}

/*
 * Diagnostics for bad page-table entries, one macro per level. Each logs,
 * through pr_err(), the source file and line of the call site, the address
 * of the entry expression @e and its raw value as 16 hex digits.
 * @e must be an lvalue because its address is taken.
 */
#define pte_ERROR(e)                                        \
        pr_err("%s:%d: bad pte %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e)                                        \
        pr_err("%s:%d: bad pmd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e)                                        \
        pr_err("%s:%d: bad pud %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pud_val(e))

#define p4d_ERROR(e)                                        \
        pr_err("%s:%d: bad p4d %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), p4d_val(e))

#define pgd_ERROR(e)                                        \
        pr_err("%s:%d: bad pgd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pgd_val(e))

struct mm_struct;

#define mm_p4d_folded mm_p4d_folded
/*
 * Report whether the p4d level is folded: true exactly when
 * pgtable_l5_enabled() is false. @mm is not consulted.
 */
static inline bool mm_p4d_folded(struct mm_struct *mm)
{
        if (pgtable_l5_enabled())
                return false;

        return true;
}

void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);

/* Store @pte into the entry at @ptep using WRITE_ONCE(). */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
        WRITE_ONCE(*ptep, pte);
}

/* Write an all-zero pte to @ptep. @mm and @addr are not used. */
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
                                    pte_t *ptep)
{
        pte_t zero = native_make_pte(0);

        native_set_pte(ptep, zero);
}

/* Same as native_set_pte(): simply forwards to it. */
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        native_set_pte(ptep, pte);
}

/* Store @pmd into the entry at @pmdp using WRITE_ONCE(). */
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        WRITE_ONCE(*pmdp, pmd);
}

/* Clear the PMD at @pmd by storing an all-zero entry via native_set_pmd(). */
static inline void native_pmd_clear(pmd_t *pmd)
{
        pmd_t none = native_make_pmd(0);

        native_set_pmd(pmd, none);
}

/*
 * Fetch the PTE at @xp and leave the slot cleared; the previous value is
 * returned.  With CONFIG_SMP the fetch-and-clear is a single xchg() with 0;
 * otherwise it is a plain load followed by native_pte_clear().
 */
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pte(xchg(&xp->pte, 0));
#else
        /* native_local_ptep_get_and_clear,
           but duplicated because of cyclic dependency */
        pte_t ret = *xp;
        native_pte_clear(NULL, 0, xp);
        return ret;
#endif
}

/*
 * Fetch the PMD at @xp and leave the slot cleared; the previous value is
 * returned.  With CONFIG_SMP the fetch-and-clear is a single xchg() with 0;
 * otherwise it is a plain load followed by native_pmd_clear().
 */
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pmd(xchg(&xp->pmd, 0));
#else
        /* native_local_pmdp_get_and_clear,
           but duplicated because of cyclic dependency */
        pmd_t ret = *xp;
        native_pmd_clear(xp);
        return ret;
#endif
}

/* Store @pud into the entry at @pudp with a single WRITE_ONCE(). */
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
        WRITE_ONCE(*pudp, pud);
}

/* Clear the PUD at @pud by storing an all-zero entry via native_set_pud(). */
static inline void native_pud_clear(pud_t *pud)
{
        pud_t none = native_make_pud(0);

        native_set_pud(pud, none);
}

/*
 * Fetch the PUD at @xp and leave the slot cleared; the previous value is
 * returned.  With CONFIG_SMP the fetch-and-clear is a single xchg() with 0;
 * otherwise it is a plain load followed by native_pud_clear().
 */
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pud(xchg(&xp->pud, 0));
#else
        /* native_local_pudp_get_and_clear,
         * but duplicated because of cyclic dependency
         */
        pud_t ret = *xp;

        native_pud_clear(xp);
        return ret;
#endif
}

/*
 * Install @p4d into the entry at @p4dp.
 *
 * When 5-level paging is enabled, or page-table isolation is not built in,
 * this is a plain WRITE_ONCE().  Otherwise the entry is handled as a pgd:
 * the value is passed through pti_set_user_pgtbl() and whatever that helper
 * returns is what gets stored.
 */
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
        if (!pgtable_l5_enabled() &&
            IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) {
                pgd_t as_pgd = native_make_pgd(native_p4d_val(p4d));

                as_pgd = pti_set_user_pgtbl((pgd_t *)p4dp, as_pgd);
                WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(as_pgd)));
                return;
        }

        WRITE_ONCE(*p4dp, p4d);
}

/* Clear the P4D at @p4d by storing an all-zero entry via native_set_p4d(). */
static inline void native_p4d_clear(p4d_t *p4d)
{
        p4d_t none = native_make_p4d(0);

        native_set_p4d(p4d, none);
}

/*
 * Install a PGD entry.  The value actually written to @pgdp is whatever
 * pti_set_user_pgtbl(pgdp, pgd) returns, stored with WRITE_ONCE().
 */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
}

/* Clear the PGD at @pgd by passing an all-zero entry to native_set_pgd(). */
static inline void native_pgd_clear(pgd_t *pgd)
{
        pgd_t none = native_make_pgd(0);

        native_set_pgd(pgd, none);
}

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */

/* PGD - Level 4 access */

/* PUD - Level 3 access */

/* PMD - Level 2 access */

/* PTE - Level 1 access */

/*
 * Encode and de-code a swap entry
 *
 * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
 * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
 * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| E|F|SD|0| <- swp entry
 *
 * G (8) is aliased and used as a PROT_NONE indicator for
 * !present ptes.  We need to start storing swap entries above
 * there.  We also need to avoid using A and D because of an
 * erratum where they can be incorrectly set by hardware on
 * non-present PTEs.
 *
 * Bits 1-4 are not used in the non-present format and are available for
 * the special uses described below:
 *
 * SD (1) in swp entry is used to store soft dirty bit, which helps us
 * remember soft dirty over page migration
 *
 * F (2) in swp entry is used to record when a pagetable is
 * writeprotected by userfaultfd WP support.
 *
 * E (3) in swp entry is used to remember PG_anon_exclusive.
 *
 * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
 * but also L and G.
 *
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
/* Number of top bits of a swap entry that hold the swap type. */
#define SWP_TYPE_BITS                5

/* Lowest bit available to the offset: the one just above _PAGE_BIT_PROTNONE. */
#define SWP_OFFSET_FIRST_BIT        (_PAGE_BIT_PROTNONE + 1)

/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT        (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)

/* Build-time check that MAX_SWAPFILES_SHIFT fits within SWP_TYPE_BITS. */
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)

/* Extract the high bits for type */
#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))

/* Shift up (to get rid of type), then down to get value */
#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)

/*
 * Shift the offset up "too far" by TYPE bits, then down again
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
#define __swp_entry(type, offset) ((swp_entry_t) { \
        (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
        | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })

/*
 * A swap entry and the pte/pmd that carries it share the same raw value;
 * these conversions only re-wrap that value in the other type.
 */
#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd)                ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x)                (__pte((x).val))
#define __swp_entry_to_pmd(x)                (__pmd((x).val))

extern void cleanup_highmap(void);

/*
 * Feature-test macros consumed outside this header.
 * NOTE(review): by their names they advertise architecture-specific
 * unmapped-area lookups -- confirm against the users.
 */
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN

/* PAGE_AGP is the kernel no-cache protection. */
#define PAGE_AGP    PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1

/* fs/proc/kcore.c */
/*
 * kc_vaddr_to_offset() keeps only the bits covered by __VIRTUAL_MASK;
 * kc_offset_to_vaddr() sets every bit outside that mask.
 */
#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
#define        kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)

#define __HAVE_ARCH_PTE_SAME

/* vmemmap is VMEMMAP_START viewed as a pointer to struct page. */
#define vmemmap ((struct page *)VMEMMAP_START)

/*
 * Defined elsewhere.  NOTE(review): the _uc/_wb suffixes presumably stand
 * for uncached and write-back mappings of [phys, phys + size) -- confirm.
 */
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);

/*
 * gup_fast_permitted - decide whether a range qualifies for the fast path.
 *
 * Only @end is examined: the range is refused when @end has any bit set at
 * or above __VIRTUAL_MASK_SHIFT.  @start is not consulted.
 */
#define gup_fast_permitted gup_fast_permitted
static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
{
        return (end >> __VIRTUAL_MASK_SHIFT) == 0;
}

#include <asm/pgtable-invert.h>

#else /* __ASSEMBLER__ */

/* l4_index(): 9-bit table index taken from bits 39-47 of x. */
#define l4_index(x)        (((x) >> 39) & 511)
/* pud_index(): index of x within a PUD table (PTRS_PER_PUD entries). */
#define pud_index(x)        (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))

/* Assembler symbols holding the table indices of well-known addresses. */
L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
L4_START_KERNEL = l4_index(__START_KERNEL_map)

L3_START_KERNEL = pud_index(__START_KERNEL_map)

/* Open a global symbol aligned to PAGE_SIZE. */
#define SYM_DATA_START_PAGE_ALIGNED(name)                        \
        SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)

/* Automate the creation of 1 to 1 mapping pmd entries */
/*
 * Emits COUNT quads; entry i is START + (i << PMD_SHIFT) + PERM.
 * The assembler symbol 'i' is overwritten by every expansion.
 */
#define PMDS(START, PERM, COUNT)                        \
        i = 0 ;                                                \
        .rept (COUNT) ;                                        \
        .quad        (START) + (i << PMD_SHIFT) + (PERM) ;        \
        i = i + 1 ;                                        \
        .endr

#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PGTABLE_64_H */





































































































































































































































































































































   10 






























































































































































































































































































































































































































































































































































































   16 
   20 




   23 

   41 
    5 













   51 
    2 

   47 

   93 
    9 




   51 














   13 


   18 




   23 


















   19 





    1 





   13 
    1 


    1 

   15 








   22 





    1 

























    8 























































































































































   39 

   29 



























































































































































































































































   28 













   12 














































































    4 





















   12 
   36 
   10 


























































































































































































































































































































































































    2 







   18 


    3 

   27 













































    3 




    4 









    4 
















































    1 






























































































































  143 
   20 



  144 
  124 

   37 



   36 

   25 
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_XFRM_H
#define _NET_XFRM_H

#include <linux/compiler.h>
#include <linux/xfrm.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
#include <linux/in6.h>
#include <linux/mutex.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/refcount.h>
#include <linux/sockptr.h>

#include <net/sock.h>
#include <net/dst.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/flow.h>
#include <net/gro_cells.h>

#include <linux/interrupt.h>

#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif

/*
 * Protocol identifiers used by xfrm.  The first five are spelled as plain
 * numbers; the last three reuse the corresponding IPPROTO_* constants.
 */
#define XFRM_PROTO_ESP                50
#define XFRM_PROTO_AH                51
#define XFRM_PROTO_COMP                108
#define XFRM_PROTO_IPIP                4
#define XFRM_PROTO_IPV6                41
#define XFRM_PROTO_IPTFS        IPPROTO_AGGFRAG
#define XFRM_PROTO_ROUTING        IPPROTO_ROUTING
#define XFRM_PROTO_DSTOPTS        IPPROTO_DSTOPTS

/* Round len up to the next multiple of 4 (resp. 8). */
#define XFRM_ALIGN4(len)        (((len) + 3) & ~3)
#define XFRM_ALIGN8(len)        (((len) + 7) & ~7)
/*
 * Module aliases of the form "xfrm-mode-<family>-<encap>",
 * "xfrm-type-<family>-<proto>" and "xfrm-offload-<family>-<proto>";
 * the arguments are stringified after macro expansion.
 */
#define MODULE_ALIAS_XFRM_MODE(family, encap) \
        MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap))
#define MODULE_ALIAS_XFRM_TYPE(family, proto) \
        MODULE_ALIAS("xfrm-type-" __stringify(family) "-" __stringify(proto))
#define MODULE_ALIAS_XFRM_OFFLOAD_TYPE(family, proto) \
        MODULE_ALIAS("xfrm-offload-" __stringify(family) "-" __stringify(proto))

/*
 * Statistics helpers.  With CONFIG_XFRM_STATISTICS they update
 * (net)->mib.xfrm_statistics through SNMP_INC_STATS()/SNMP_ADD_STATS().
 * Without it they only evaluate 'net' (cast to void); 'field' and 'val'
 * are not evaluated at all.
 */
#ifdef CONFIG_XFRM_STATISTICS
#define XFRM_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.xfrm_statistics, field)
#define XFRM_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.xfrm_statistics, field, val)
#else
#define XFRM_INC_STATS(net, field)        ((void)(net))
#define XFRM_ADD_STATS(net, field, val) ((void)(net))
#endif


/* Organization of SPD aka "XFRM rules"
   ------------------------------------

   Basic objects:
   - policy rule, struct xfrm_policy (=SPD entry)
   - bundle of transformations, struct dst_entry == struct xfrm_dst (=SA bundle)
   - instance of a transformer, struct xfrm_state (=SA)
   - template to clone xfrm_state, struct xfrm_tmpl

   SPD is organized as hash table (for policies that meet minimum address prefix
   length setting, net->xfrm.policy_hthresh).  Other policies are stored in
   lists, sorted into rbtree ordered by destination and source address networks.
   See net/xfrm/xfrm_policy.c for details.

   (To be compatible with existing pfkeyv2 implementations,
   many rules with priority of 0x7fffffff are allowed to exist and
   such rules are ordered in an unpredictable way, thanks to bsd folks.)

   If "action" is "block", then we prohibit the flow, otherwise:
   if "xfrms_nr" is zero, the flow passes untransformed. Otherwise,
   policy entry has list of up to XFRM_MAX_DEPTH transformations,
   described by templates xfrm_tmpl. Each template is resolved
   to a complete xfrm_state (see below) and we pack bundle of transformations
   to a dst_entry returned to requester.

   dst -. xfrm  .-> xfrm_state #1
    |---. child .-> dst -. xfrm .-> xfrm_state #2
                     |---. child .-> dst -. xfrm .-> xfrm_state #3
                                      |---. child .-> NULL


   Resolution of xfrm_tmpl
   -----------------------
   Template contains:
   1. ->mode                Mode: transport or tunnel
   2. ->id.proto        Protocol: AH/ESP/IPCOMP
   3. ->id.daddr        Remote tunnel endpoint, ignored for transport mode.
      Q: allow to resolve security gateway?
   4. ->id.spi          If not zero, static SPI.
   5. ->saddr                Local tunnel endpoint, ignored for transport mode.
   6. ->algos                List of allowed algos. Plain bitmask now.
      Q: ealgos, aalgos, calgos. What a mess...
   7. ->share                Sharing mode.
      Q: how to implement private sharing mode? To add struct sock* to
      flow id?

   Having this template we search through SAD searching for entries
   with appropriate mode/proto/algo, permitted by selector.
   If no appropriate entry found, it is requested from key manager.

   PROBLEMS:
   Q: How to find all the bundles referring to a physical path for
      PMTU discovery? Seems, dst should contain list of all parents...
      and enter to infinite locking hierarchy disaster.
      No! It is easier, we will not search for them, let them find us.
      We add genid to each dst plus pointer to genid of raw IP route,
      pmtu disc will update pmtu on raw IP route and increase its genid.
      dst_check() will see this for top level and trigger resyncing
      metrics. Plus, it will be made via sk->sk_dst_cache. Solved.
 */

/*
 * Bookkeeping that struct xfrm_state embeds as its 'km' member
 * ("key manager bits").
 * NOTE(review): 'state' presumably holds one of the XFRM_STATE_* values and
 * 'all' presumably links the entry into a list of all states -- confirm
 * against the users.
 */
struct xfrm_state_walk {
        struct list_head        all;
        u8                        state;
        u8                        dying;
        u8                        proto;
        u32                        seq;
        struct xfrm_address_filter *filter;
};

/*
 * Offload direction.  Numbering starts at 1, so 0 is not a named value.
 * NOTE(review): presumably stored in xfrm_dev_offload.dir -- confirm.
 */
enum {
        XFRM_DEV_OFFLOAD_IN = 1,
        XFRM_DEV_OFFLOAD_OUT,
        XFRM_DEV_OFFLOAD_FWD,
};

/*
 * Offload type; UNSPECIFIED is 0.
 * NOTE(review): presumably stored in xfrm_dev_offload.type -- confirm.
 */
enum {
        XFRM_DEV_OFFLOAD_UNSPECIFIED,
        XFRM_DEV_OFFLOAD_CRYPTO,
        XFRM_DEV_OFFLOAD_PACKET,
};

/*
 * Offload flag bits.
 * NOTE(review): presumably stored in xfrm_dev_offload.flags -- confirm.
 */
enum {
        XFRM_DEV_OFFLOAD_FLAG_ACQ = 1,
};

/*
 * Device-offload state; struct xfrm_state embeds one as its 'xso' member.
 * dir, type and flags are 2-bit fields packed into a single u8.
 */
struct xfrm_dev_offload {
        /* The device for this offload.
         * Device drivers should not use this directly, as that will prevent
         * them from working with bonding device. Instead, the device passed
         * to the add/delete callbacks should be used.
         */
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;
        /* This is a private pointer used by the bonding driver (and eventually
         * should be moved there). Device drivers should not use it.
         * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases,
         * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock
         * is held.
         */
        struct net_device        *real_dev;
        unsigned long                offload_handle;
        u8                        dir : 2;
        u8                        type : 2;
        u8                        flags : 2;
};

/*
 * Description of one mode.  struct xfrm_state carries three of these:
 * inner_mode, inner_mode_iaf and outer_mode.  'flags' takes the
 * XFRM_MODE_FLAG_* bits defined below.
 */
struct xfrm_mode {
        u8 encap;
        u8 family;
        u8 flags;
};

/* Flags for xfrm_mode. */
enum {
        XFRM_MODE_FLAG_TUNNEL = 1,
};

/* Replay detection mode; this is the type of xfrm_state.repl_mode. */
enum xfrm_replay_mode {
        XFRM_REPLAY_MODE_LEGACY,
        XFRM_REPLAY_MODE_BMP,
        XFRM_REPLAY_MODE_ESN,
};

/* Full description of state of transformer. */
struct xfrm_state {
        /* Network namespace reference; read it through xs_net(). */
        possible_net_t                xs_net;
        /* gclist shares storage with bydst, and dev_gclist with bysrc. */
        union {
                struct hlist_node        gclist;
                struct hlist_node        bydst;
        };
        union {
                struct hlist_node        dev_gclist;
                struct hlist_node        bysrc;
        };
        struct hlist_node        byspi;
        struct hlist_node        byseq;
        struct hlist_node        state_cache;
        struct hlist_node        state_cache_input;

        refcount_t                refcnt;
        spinlock_t                lock;

        u32                        pcpu_num;
        struct xfrm_id                id;
        struct xfrm_selector        sel;
        struct xfrm_mark        mark;
        u32                        if_id;
        u32                        tfcpad;

        u32                        genid;

        /* Key manager bits */
        struct xfrm_state_walk        km;

        /* Parameters of this state. */
        struct {
                u32                reqid;
                u8                mode;
                u8                replay_window;
                u8                aalgo, ealgo, calgo;
                u8                flags;
                u16                family;
                xfrm_address_t        saddr;
                int                header_len;
                int                enc_hdr_len;
                int                trailer_len;
                u32                extra_flags;
                struct xfrm_mark        smark;
        } props;

        struct xfrm_lifetime_cfg lft;

        /* Data for transformer */
        struct xfrm_algo_auth        *aalg;
        struct xfrm_algo        *ealg;
        struct xfrm_algo        *calg;
        struct xfrm_algo_aead        *aead;
        const char                *geniv;

        /* mapping change rate limiting */
        __be16 new_mapping_sport;
        u32 new_mapping;        /* seconds */
        u32 mapping_maxage;        /* seconds for input SA */

        /* Data for encapsulator */
        struct xfrm_encap_tmpl        *encap;

        /* NAT keepalive */
        u32                        nat_keepalive_interval; /* seconds */
        time64_t                nat_keepalive_expiration;

        /* Data for care-of address */
        xfrm_address_t        *coaddr;

        /* IPComp needs an IPIP tunnel for handling uncompressed packets */
        struct xfrm_state        *tunnel;

        /* If a tunnel, number of users + 1 */
        atomic_t                tunnel_users;

        /* State for replay detection */
        struct xfrm_replay_state replay;
        struct xfrm_replay_state_esn *replay_esn;

        /* Replay detection state at the time we sent the last notification */
        struct xfrm_replay_state preplay;
        struct xfrm_replay_state_esn *preplay_esn;

        /* replay detection mode */
        enum xfrm_replay_mode    repl_mode;
        /* internal flag that only holds state for delayed aevent at the
         * moment (see the XFRM_TIME_DEFER / XFRM_SOFT_EXPIRE xflags values)
         */
        u32                        xflags;

        /* Replay detection notification settings */
        u32                        replay_maxage;
        u32                        replay_maxdiff;

        /* Replay detection notification timer */
        struct timer_list        rtimer;

        /* Statistics */
        struct xfrm_stats        stats;

        struct xfrm_lifetime_cur curlft;
        struct hrtimer                mtimer;

        struct xfrm_dev_offload xso;

        /* used to fix curlft->add_time when changing date */
        long                saved_tmo;

        /* Last used time */
        time64_t                lastused;

        struct page_frag xfrag;

        /* Reference to data common to all the instances of this
         * transformer. */
        const struct xfrm_type        *type;
        struct xfrm_mode        inner_mode;
        struct xfrm_mode        inner_mode_iaf;
        struct xfrm_mode        outer_mode;

        const struct xfrm_type_offload        *type_offload;

        /* Security context */
        struct xfrm_sec_ctx        *security;

        /* Private data of this transformer, format is opaque,
         * interpreted by xfrm_type methods. */
        void                        *data;
        u8                        dir;

        /* NOTE(review): mode_data is presumably private to mode_cbs,
         * mirroring the data/type pair above -- confirm.
         */
        const struct xfrm_mode_cbs        *mode_cbs;
        void                                *mode_data;
};

/* Return the network namespace recorded in @x's xs_net member. */
static inline struct net *xs_net(struct xfrm_state *x)
{
        struct net *net = read_pnet(&x->xs_net);

        return net;
}

/* xflags - make enum if more show up */
/* Distinct bits (1 and 2), so both can be set in xfrm_state.xflags. */
#define XFRM_TIME_DEFER        1
#define XFRM_SOFT_EXPIRE 2

/*
 * State values, numbered from 0 (XFRM_STATE_VOID) upwards.
 * NOTE(review): presumably what xfrm_state.km.state holds -- confirm.
 */
enum {
        XFRM_STATE_VOID,
        XFRM_STATE_ACQ,
        XFRM_STATE_VALID,
        XFRM_STATE_ERROR,
        XFRM_STATE_EXPIRED,
        XFRM_STATE_DEAD
};

/* callback structure passed from either netlink or pfkey */
/*
 * 'data' is a union of u32 members, so all of them alias the same 32 bits.
 * NOTE(review): which member is meaningful presumably depends on 'event'
 * -- confirm against the notifiers.
 */
struct km_event {
        union {
                u32 hard;
                u32 proto;
                u32 byid;
                u32 aevent;
                u32 type;
        } data;

        u32        seq;
        u32        portid;
        u32        event;
        struct net *net;
};

/* Output of xfrm_if_cb.decode_session(): a namespace and an if_id. */
struct xfrm_if_decode_session_result {
        struct net *net;
        u32 if_id;
};

/*
 * Callback table installed with xfrm_if_register_cb().
 * decode_session() receives the skb and address family and fills in *res.
 * NOTE(review): the bool result presumably tells whether *res was filled
 * in -- confirm against the implementations.
 */
struct xfrm_if_cb {
        bool (*decode_session)(struct sk_buff *skb,
                               unsigned short family,
                               struct xfrm_if_decode_session_result *res);
};

/*
 * Registration takes the table to install; unregistration takes no
 * argument.
 */
void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb);
void xfrm_if_unregister_cb(void);

/*
 * Argument bundle for the dst_lookup() and get_saddr() callbacks of
 * struct xfrm_policy_afinfo.
 */
struct xfrm_dst_lookup_params {
        struct net *net;
        dscp_t dscp;
        int oif;
        xfrm_address_t *saddr;
        xfrm_address_t *daddr;
        u32 mark;
        __u8 ipproto;
        union flowi_uli uli;
};

struct net_device;
struct xfrm_type;
struct xfrm_dst;
/*
 * Policy operations table; instances are registered together with an
 * address family through xfrm_policy_register_afinfo().
 * dst_lookup() and get_saddr() take their inputs as a
 * struct xfrm_dst_lookup_params.
 */
struct xfrm_policy_afinfo {
        struct dst_ops                *dst_ops;
        struct dst_entry        *(*dst_lookup)(const struct xfrm_dst_lookup_params *params);
        int                        (*get_saddr)(xfrm_address_t *saddr,
                                             const struct xfrm_dst_lookup_params *params);
        int                        (*fill_dst)(struct xfrm_dst *xdst,
                                            struct net_device *dev,
                                            const struct flowi *fl);
        struct dst_entry        *(*blackhole_route)(struct net *net, struct dst_entry *orig);
};

/* Register / unregister a struct xfrm_policy_afinfo (defined out of line). */
int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family);
void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo);
/* Notification entry points; both take a const struct km_event describing the event. */
void km_policy_notify(struct xfrm_policy *xp, int dir,
                      const struct km_event *c);
void km_state_notify(struct xfrm_state *x, const struct km_event *c);

/* struct xfrm_tmpl is defined further down; only a pointer is needed here. */
struct xfrm_tmpl;
int km_query(struct xfrm_state *x, struct xfrm_tmpl *t,
             struct xfrm_policy *pol);
void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
int __xfrm_state_delete(struct xfrm_state *x);

/*
 * State-side handler table: a family/proto pair, one xfrm_type pointer
 * slot per transform kind (plus one ESP offload type), and output /
 * transport_finish / local_error callbacks. Registered with
 * xfrm_state_register_afinfo().
 */
struct xfrm_state_afinfo {
        u8                                family;
        u8                                proto;

        const struct xfrm_type_offload *type_offload_esp;

        const struct xfrm_type                *type_esp;
        const struct xfrm_type                *type_ipip;
        const struct xfrm_type                *type_ipip6;
        const struct xfrm_type                *type_comp;
        const struct xfrm_type                *type_ah;
        const struct xfrm_type                *type_routing;
        const struct xfrm_type                *type_dstopts;

        int                        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
        int                        (*transport_finish)(struct sk_buff *skb,
                                                    int async);
        void                        (*local_error)(struct sk_buff *skb, u32 mtu);
};

/*
 * Registration and lookup of struct xfrm_state_afinfo. Lookups are keyed
 * by address family.
 * NOTE(review): the _rcu suffix suggests the caller must be in an RCU
 * read-side section - confirm against the definition.
 */
int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo);
int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo);
struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family);

/*
 * Input-side handler: a family, an is_ipip flag, and a callback that
 * receives the skb, a protocol number and an error code. Registered with
 * xfrm_input_register_afinfo().
 */
struct xfrm_input_afinfo {
        u8                        family;
        bool                        is_ipip;
        int                        (*callback)(struct sk_buff *skb, u8 protocol,
                                            int err);
};

/* Register / unregister a struct xfrm_input_afinfo (defined out of line). */
int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo);
int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo);

/* Defined out of line; takes no arguments and returns nothing. */
void xfrm_flush_gc(void);

/*
 * Transform type descriptor: owning module, protocol number, flags
 * (XFRM_TYPE_* values defined inline below), and the per-state
 * init/destroy and per-packet input/output/reject hooks.
 */
struct xfrm_type {
        struct module                *owner;
        u8                        proto;
        u8                        flags;
#define XFRM_TYPE_NON_FRAGMENT        1
#define XFRM_TYPE_REPLAY_PROT        2
#define XFRM_TYPE_LOCAL_COADDR        4
#define XFRM_TYPE_REMOTE_COADDR        8

        int                        (*init_state)(struct xfrm_state *x,
                                              struct netlink_ext_ack *extack);
        void                        (*destructor)(struct xfrm_state *);
        int                        (*input)(struct xfrm_state *, struct sk_buff *skb);
        int                        (*output)(struct xfrm_state *, struct sk_buff *pskb);
        int                        (*reject)(struct xfrm_state *, struct sk_buff *,
                                          const struct flowi *);
};

/* Register / unregister a struct xfrm_type for the given address family. */
int xfrm_register_type(const struct xfrm_type *type, unsigned short family);
void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family);

/*
 * Offload counterpart of struct xfrm_type: owning module, protocol number
 * and the encap / input_tail / xmit hooks.
 */
struct xfrm_type_offload {
        struct module        *owner;
        u8                proto;
        void                (*encap)(struct xfrm_state *, struct sk_buff *pskb);
        int                (*input_tail)(struct xfrm_state *x, struct sk_buff *skb);
        int                (*xmit)(struct xfrm_state *, struct sk_buff *pskb, netdev_features_t features);
};

/* Register / unregister a struct xfrm_type_offload for an address family. */
int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
/* Counterpart of xfrm_unset_type_offload(); defined out of line. */
void xfrm_set_type_offload(struct xfrm_state *x, bool try_load);
static inline void xfrm_unset_type_offload(struct xfrm_state *x)
{
        if (!x->type_offload)
                return;

        module_put(x->type_offload->owner);
        x->type_offload = NULL;
}

/**
 * struct xfrm_mode_cbs - XFRM mode callbacks
 * @owner: module owner or NULL
 * @init_state: Add/init mode specific state in `xfrm_state *x`
 * @clone_state: Copy mode specific values from `orig` to new state `x`
 * @destroy_state: Cleanup mode specific state from `xfrm_state *x`
 * @user_init: Process mode specific netlink attributes from user
 * @copy_to_user: Add netlink attributes to `attrs` based on state in `x`
 * @sa_len: Return space required to store mode specific netlink attributes
 * @get_inner_mtu: Return avail payload space after removing encap overhead
 * @input: Process received packet from SA using mode
 * @output: Output given packet using mode
 * @prepare_output: Add mode specific encapsulation to packet in skb. On return
 *        `transport_header` should point at ESP header, `network_header` should
 *        point at outer IP header and `mac_header` should point at the
 *        protocol/nexthdr field of the outer IP.
 *
 * One should examine and understand the specific uses of these callbacks in
 * xfrm for further detail on how and when these functions are called. RTSL.
 */
struct xfrm_mode_cbs {
        struct module        *owner;
        int        (*init_state)(struct xfrm_state *x);
        int        (*clone_state)(struct xfrm_state *x, struct xfrm_state *orig);
        void        (*destroy_state)(struct xfrm_state *x);
        int        (*user_init)(struct net *net, struct xfrm_state *x,
                             struct nlattr **attrs,
                             struct netlink_ext_ack *extack);
        int        (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb);
        unsigned int (*sa_len)(const struct xfrm_state *x);
        u32        (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu);
        int        (*input)(struct xfrm_state *x, struct sk_buff *skb);
        int        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
        int        (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb);
};

/* Register / unregister the struct xfrm_mode_cbs for a u8 mode value. */
int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs);
void xfrm_unregister_mode_cbs(u8 mode);

/*
 * Map an address family to an IP protocol number:
 * AF_INET -> IPPROTO_IPIP, AF_INET6 -> IPPROTO_IPV6, anything else -> 0.
 */
static inline int xfrm_af2proto(unsigned int family)
{
        if (family == AF_INET)
                return IPPROTO_IPIP;
        if (family == AF_INET6)
                return IPPROTO_IPV6;

        return 0;
}

static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
{
        if ((x->sel.family != AF_UNSPEC) ||
            (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
            (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
                return &x->inner_mode;
        else
                return &x->inner_mode_iaf;
}

struct xfrm_tmpl {
/* id in template is interpreted as:
 * daddr - destination of tunnel, may be zero for transport mode.
 * spi   - zero to acquire spi. Not zero if spi is static, then
 *           daddr must be fixed too.
 * proto - AH/ESP/IPCOMP
 */
        struct xfrm_id                id;

/* Source address of tunnel. Ignored, if it is not a tunnel. */
        xfrm_address_t                saddr;

        unsigned short                encap_family;

        u32                        reqid;

/* Mode: transport, tunnel etc. */
        u8                        mode;

/* Sharing mode: unique, this session only, this user only etc. */
        u8                        share;

/* May skip this transformation if no SA is found */
        u8                        optional;

/* Skip aalgos/ealgos/calgos checks. */
        u8                        allalgs;

/* Bit mask of algos allowed for acquisition */
        u32                        aalgos;
        u32                        ealgos;
        u32                        calgos;
};

/*
 * Array bounds: XFRM_MAX_DEPTH sizes xfrm_policy::xfrm_vec[] and
 * sec_path::xvec[]; XFRM_MAX_OFFLOAD_DEPTH sizes sec_path::ovec[].
 */
#define XFRM_MAX_DEPTH                6
#define XFRM_MAX_OFFLOAD_DEPTH        1

/*
 * List linkage plus a "dead" marker. Embedded both in struct xfrm_policy
 * and in struct xfrm_policy_walk.
 */
struct xfrm_policy_walk_entry {
        struct list_head        all;
        u8                        dead;
};

/*
 * Walker state: an embedded xfrm_policy_walk_entry plus a type and a
 * sequence number.
 * NOTE(review): presumably used as a cursor while dumping policies -
 * confirm against the users of this struct.
 */
struct xfrm_policy_walk {
        struct xfrm_policy_walk_entry walk;
        u8 type;
        u32 seq;
};

/*
 * Per-policy packet hold queue (embedded as xfrm_policy::polq): the queued
 * skbs, a timer and a timeout value.
 */
struct xfrm_policy_queue {
        struct sk_buff_head        hold_queue;
        struct timer_list        hold_timer;
        unsigned long                timeout;
};

/**
 *        struct xfrm_policy - xfrm policy
 *        @xp_net: network namespace the policy lives in
 *        @bydst: hlist node for SPD hash table or rbtree list
 *        @byidx: hlist node for index hash table
 *        @state_cache_list: hlist head for policy cached xfrm states
 *        @lock: serialize changes to policy structure members
 *        @refcnt: reference count, freed once it reaches 0
 *        @pos: kernel internal tie-breaker to determine age of policy
 *        @timer: timer
 *        @genid: generation, used to invalidate old policies
 *        @priority: priority, set by userspace
 *        @index:  policy index (autogenerated)
 *        @if_id: virtual xfrm interface id
 *        @mark: packet mark
 *        @selector: selector
 *        @lft: lifetime configuration data
 *        @curlft: lifetime state
 *        @walk: list head on pernet policy list
 *        @polq: queue to hold packets while acquire operation in progress
 *        @bydst_reinsert: policy tree node needs to be merged
 *        @type: XFRM_POLICY_TYPE_MAIN or _SUB
 *        @action: XFRM_POLICY_ALLOW or _BLOCK
 *        @flags: XFRM_POLICY_LOCALOK, XFRM_POLICY_ICMP
 *        @xfrm_nr: number of used templates in @xfrm_vec
 *        @family: protocol family
 *        @security: SELinux security label
 *        @xfrm_vec: array of templates to resolve state
 *        @rcu: rcu head, used to defer memory release
 *        @xdo: hardware offload state
 */
struct xfrm_policy {
        possible_net_t                xp_net;
        struct hlist_node        bydst;
        struct hlist_node        byidx;

        struct hlist_head        state_cache_list;

        /* This lock only affects elements except for entry. */
        rwlock_t                lock;
        refcount_t                refcnt;
        u32                        pos;
        struct timer_list        timer;

        atomic_t                genid;
        u32                        priority;
        u32                        index;
        u32                        if_id;
        struct xfrm_mark        mark;
        struct xfrm_selector        selector;
        struct xfrm_lifetime_cfg lft;
        struct xfrm_lifetime_cur curlft;
        struct xfrm_policy_walk_entry walk;
        struct xfrm_policy_queue polq;
        bool                    bydst_reinsert;
        u8                        type;
        u8                        action;
        u8                        flags;
        u8                        xfrm_nr;
        u16                        family;
        struct xfrm_sec_ctx        *security;
        struct xfrm_tmpl               xfrm_vec[XFRM_MAX_DEPTH];
        struct rcu_head                rcu;

        struct xfrm_dev_offload xdo;
};

/* Fetch the struct net stored in @xp's xp_net member via read_pnet(). */
static inline struct net *xp_net(const struct xfrm_policy *xp)
{
        struct net *net = read_pnet(&xp->xp_net);

        return net;
}

/*
 * A local/remote address pair with its address family. Passed (const) as
 * the @k argument of xfrm_mgr::migrate().
 */
struct xfrm_kmaddress {
        xfrm_address_t          local;
        xfrm_address_t          remote;
        u32                        reserved;
        u16                        family;
};

/*
 * One migration record: old and new source/destination addresses with
 * their families, plus proto, mode and reqid. Passed (const) as the @m
 * argument of xfrm_mgr::migrate().
 */
struct xfrm_migrate {
        xfrm_address_t                old_daddr;
        xfrm_address_t                old_saddr;
        xfrm_address_t                new_daddr;
        xfrm_address_t                new_saddr;
        u8                        proto;
        u8                        mode;
        u16                        reserved;
        u32                        reqid;
        u16                        old_family;
        u16                        new_family;
};

/* NOTE(review): unit is not visible in this header - presumably seconds; confirm. */
#define XFRM_KM_TIMEOUT                30
/* what happened: replay event names, aliased to the XFRM_AE_* values */
#define XFRM_REPLAY_UPDATE        XFRM_AE_CR
#define XFRM_REPLAY_TIMEOUT        XFRM_AE_CE

/* default aevent timeout in units of 100ms */
#define XFRM_AE_ETIME                        10
/* Async Event timer multiplier */
#define XFRM_AE_ETH_M                        10
/* default seq threshold size */
#define XFRM_AE_SEQT_SIZE                2

/*
 * Key manager callback table, linked through @list and registered with
 * xfrm_register_km(). Each member is a hook for one kind of event; see
 * the callers for when each hook is invoked.
 */
struct xfrm_mgr {
        struct list_head        list;
        int                        (*notify)(struct xfrm_state *x, const struct km_event *c);
        int                        (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp);
        struct xfrm_policy        *(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir);
        int                        (*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
        int                        (*notify_policy)(struct xfrm_policy *x, int dir, const struct km_event *c);
        int                        (*report)(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr);
        int                        (*migrate)(const struct xfrm_selector *sel,
                                           u8 dir, u8 type,
                                           const struct xfrm_migrate *m,
                                           int num_bundles,
                                           const struct xfrm_kmaddress *k,
                                           const struct xfrm_encap_tmpl *encap);
        bool                        (*is_alive)(const struct km_event *c);
};

/* Register / unregister a struct xfrm_mgr (defined out of line). */
void xfrm_register_km(struct xfrm_mgr *km);
void xfrm_unregister_km(struct xfrm_mgr *km);

/*
 * Common leading part of the xfrm skb control-block layouts: the IPv4 or
 * IPv6 skb parm area, followed by a pointer to the IPv4 or IPv6 tunnel.
 * The other xfrm_*_skb_cb structs below embed this as their first member.
 */
struct xfrm_tunnel_skb_cb {
        union {
                struct inet_skb_parm h4;
                struct inet6_skb_parm h6;
        } header;

        union {
                struct ip_tunnel *ip4;
                struct ip6_tnl *ip6;
        } tunnel;
};

/* View the start of skb->cb[] as a struct xfrm_tunnel_skb_cb. */
#define XFRM_TUNNEL_SKB_CB(__skb) ((struct xfrm_tunnel_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used for the duration where packets are being
 * transformed by IPsec.  As soon as the packet leaves IPsec the
 * area beyond the generic IP part may be overwritten.
 */
struct xfrm_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        /*
         * Sequence number for replay protection. The output view holds
         * __u32 halves, the input view holds __be32 halves.
         */
        union {
                struct {
                        __u32 low;
                        __u32 hi;
                } output;
                struct {
                        __be32 low;
                        __be32 hi;
                } input;
        } seq;
};

/* View the start of skb->cb[] as a struct xfrm_skb_cb. */
#define XFRM_SKB_CB(__skb) ((struct xfrm_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used by the afinfo prepare_input/prepare_output functions
 * to transmit header information to the mode input/output functions.
 */
struct xfrm_mode_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        /* Copied from header for IPv4, always set to zero and DF for IPv6. */
        __be16 id;
        __be16 frag_off;

        /* IP header length (excluding options or extension headers). */
        u8 ihl;

        /* TOS for IPv4, class for IPv6. */
        u8 tos;

        /* TTL for IPv4, hop limit for IPv6. */
        u8 ttl;

        /* Protocol for IPv4, NH for IPv6. */
        u8 protocol;

        /* Option length for IPv4, zero for IPv6. */
        u8 optlen;

        /* Used by IPv6 only, zero for IPv4. */
        u8 flow_lbl[3];
};

/* View the start of skb->cb[] as a struct xfrm_mode_skb_cb. */
#define XFRM_MODE_SKB_CB(__skb) ((struct xfrm_mode_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used by the input processing to locate the SPI and
 * related information.
 */
struct xfrm_spi_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        unsigned int daddroff;
        unsigned int family;
        __be32 seq;
};

/* View the start of skb->cb[] as a struct xfrm_spi_skb_cb. */
#define XFRM_SPI_SKB_CB(__skb) ((struct xfrm_spi_skb_cb *)&((__skb)->cb[0]))

#ifdef CONFIG_AUDITSYSCALL
/*
 * Open an AUDIT_MAC_IPSEC_EVENT audit record and write "op=<op>" into it.
 *
 * Returns the audit buffer, or NULL when auditing is off (audit_enabled ==
 * AUDIT_OFF) or audit_log_start() returned no buffer.
 */
static inline struct audit_buffer *xfrm_audit_start(const char *op)
{
        struct audit_buffer *ab;

        if (audit_enabled == AUDIT_OFF)
                return NULL;

        ab = audit_log_start(audit_context(), GFP_ATOMIC,
                             AUDIT_MAC_IPSEC_EVENT);
        if (ab)
                audit_log_format(ab, "op=%s", op);

        return ab;
}

/*
 * Append " auid=<n> ses=<n>" and the task context to @audit_buf.
 *
 * When @task_valid is true both values are read from the current task.
 * Otherwise INVALID_UID (mapped through init_user_ns) and AUDIT_SID_UNSET
 * are logged.
 */
static inline void xfrm_audit_helper_usrinfo(bool task_valid,
                                             struct audit_buffer *audit_buf)
{
        unsigned int auid;
        unsigned int ses;

        if (task_valid) {
                auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
                ses = audit_get_sessionid(current);
        } else {
                auid = from_kuid(&init_user_ns, INVALID_UID);
                ses = AUDIT_SID_UNSET;
        }

        audit_log_format(audit_buf, " auid=%u ses=%u", auid, ses);
        audit_log_task_context(audit_buf);
}

/*
 * Out-of-line audit hooks, declared only when CONFIG_AUDITSYSCALL is set.
 * The #else branch below provides empty inline versions with the same
 * signatures.
 */
void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid);
void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                              bool task_valid);
void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid);
void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid);
void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
                                      struct sk_buff *skb);
void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb,
                             __be32 net_seq);
void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family);
void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi,
                               __be32 net_seq);
void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb,
                              u8 proto);
#else

/*
 * CONFIG_AUDITSYSCALL is not set: every audit hook below is an empty
 * inline function, so callers need no #ifdef of their own.
 */
static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
                                         bool task_valid)
{
}

static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                                            bool task_valid)
{
}

static inline void xfrm_audit_state_add(struct xfrm_state *x, int result,
                                        bool task_valid)
{
}

static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result,
                                           bool task_valid)
{
}

static inline void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
                                             struct sk_buff *skb)
{
}

static inline void xfrm_audit_state_replay(struct xfrm_state *x,
                                           struct sk_buff *skb, __be32 net_seq)
{
}

static inline void xfrm_audit_state_notfound_simple(struct sk_buff *skb,
                                      u16 family)
{
}

static inline void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
                                      __be32 net_spi, __be32 net_seq)
{
}

static inline void xfrm_audit_state_icvfail(struct xfrm_state *x,
                                     struct sk_buff *skb, u8 proto)
{
}
#endif /* CONFIG_AUDITSYSCALL */

/* Take one reference on @policy. A NULL @policy is accepted and ignored. */
static inline void xfrm_pol_hold(struct xfrm_policy *policy)
{
        if (unlikely(policy == NULL))
                return;

        refcount_inc(&policy->refcnt);
}

void xfrm_policy_destroy(struct xfrm_policy *policy);

static inline void xfrm_pol_put(struct xfrm_policy *policy)
{
        if (refcount_dec_and_test(&policy->refcnt))
                xfrm_policy_destroy(policy);
}

/*
 * Drop one reference on each of the first @npols entries of @pols,
 * walking from the last entry down to the first.
 */
static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
{
        int idx = npols;

        while (--idx >= 0)
                xfrm_pol_put(pols[idx]);
}

void __xfrm_state_destroy(struct xfrm_state *);

static inline void __xfrm_state_put(struct xfrm_state *x)
{
        refcount_dec(&x->refcnt);
}

/*
 * Drop one reference on @x and call __xfrm_state_destroy() when that was
 * the last one.
 */
static inline void xfrm_state_put(struct xfrm_state *x)
{
        if (!refcount_dec_and_test(&x->refcnt))
                return;

        __xfrm_state_destroy(x);
}

static inline void xfrm_state_hold(struct xfrm_state *x)
{
        refcount_inc(&x->refcnt);
}

/*
 * Compare the first @prefixlen bits of two addresses stored in network
 * byte order.
 *
 * @token1, @token2: addresses, read as arrays of __be32 words
 * @prefixlen: number of leading bits that must be equal
 *
 * Whole 32-bit words are compared with memcmp(); a trailing partial word
 * is compared under a network-order mask. Returns true when the prefixes
 * are equal (always true for @prefixlen == 0).
 */
static inline bool addr_match(const void *token1, const void *token2,
                              unsigned int prefixlen)
{
        const __be32 *lhs = token1;
        const __be32 *rhs = token2;
        const unsigned int words = prefixlen / 32;        /* whole u32s in prefix */
        const unsigned int bits = prefixlen % 32;        /* bits in the partial u32 */

        if (words && memcmp(lhs, rhs, words * 4) != 0)
                return false;

        if (!bits)
                return true;

        /* bits is 1..31 here, so the shift count stays below 32. */
        return !((lhs[words] ^ rhs[words]) & htonl((0xffffffff) << (32 - bits)));
}

/*
 * Compare the first @prefixlen bits of two IPv4 addresses given in network
 * byte order. Returns true when they are equal.
 */
static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen)
{
        __be32 mask;

        /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
        if (sizeof(long) == 4 && prefixlen == 0)
                return true;

        mask = htonl(~0UL << (32 - prefixlen));

        return ((a1 ^ a2) & mask) == 0;
}

/*
 * Derive the "source port" selector value for a flow, by protocol:
 *   TCP/UDP/UDP-Lite/SCTP - the real source port
 *   ICMP/ICMPv6           - the ICMP type
 *   MH                    - the mobility header type
 *   GRE                   - the upper 16 bits of the GRE key
 *   anything else         - 0
 * The result is in network byte order.
 */
static __inline__
__be16 xfrm_flowi_sport(const struct flowi *fl, const union flowi_uli *uli)
{
        switch (fl->flowi_proto) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_SCTP:
                return uli->ports.sport;
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                return htons(uli->icmpt.type);
        case IPPROTO_MH:
                return htons(uli->mht.type);
        case IPPROTO_GRE:
                return htons(ntohl(uli->gre_key) >> 16);
        default:
                return 0;        /*XXX*/
        }
}

/*
 * Derive the "destination port" selector value for a flow, by protocol:
 *   TCP/UDP/UDP-Lite/SCTP - the real destination port
 *   ICMP/ICMPv6           - the ICMP code
 *   GRE                   - the lower 16 bits of the GRE key
 *   anything else         - 0
 * The result is in network byte order.
 */
static __inline__
__be16 xfrm_flowi_dport(const struct flowi *fl, const union flowi_uli *uli)
{
        switch (fl->flowi_proto) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_SCTP:
                return uli->ports.dport;
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                return htons(uli->icmpt.code);
        case IPPROTO_GRE:
                return htons(ntohl(uli->gre_key) & 0xffff);
        default:
                return 0;        /*XXX*/
        }
}

/* Match a flow against a selector for the given family (defined out of line). */
bool xfrm_selector_match(const struct xfrm_selector *sel,
                         const struct flowi *fl, unsigned short family);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/*        If neither has a context --> match
 *         Otherwise, both must have a context and the sids, doi, alg must match
 */
static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
{
        return ((!s1 && !s2) ||
                (s1 && s2 &&
                 (s1->ctx_sid == s2->ctx_sid) &&
                 (s1->ctx_doi == s2->ctx_doi) &&
                 (s1->ctx_alg == s2->ctx_alg)));
}
#else
/* CONFIG_SECURITY_NETWORK_XFRM is not set: contexts always match. */
static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
{
        return true;
}
#endif

/* A struct encoding bundle of transformations to apply to some set of flow.
 *
 * xdst->child points to the next element of bundle.
 * dst->xfrm  points to an instance of transformer.
 *
 * Due to unfortunate limitations of current routing cache, which we
 * have no time to fix, it mirrors struct rtable and bound to the same
 * routing key, including saddr,daddr. However, we can have many of
 * bundles differing by session id. All the bundles grow from a parent
 * policy rule.
 */
struct xfrm_dst {
        /* Must stay first: code casts between dst_entry and xfrm_dst pointers. */
        union {
                struct dst_entry        dst;
                struct rtable                rt;
                struct rt6_info                rt6;
        } u;
        struct dst_entry *route;
        struct dst_entry *child;
        struct dst_entry *path;
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int num_pols, num_xfrms;
        u32 xfrm_genid;
        u32 policy_genid;
        u32 route_mtu_cached;
        u32 child_mtu_cached;
        u32 route_cookie;
        u32 path_cookie;
};

/*
 * Return the ->path entry of an xfrm bundle dst.
 *
 * With CONFIG_XFRM, a dst that has ->xfrm set or carries DST_XFRM_QUEUE is
 * treated as a struct xfrm_dst and its ->path is returned. Any other dst,
 * and every dst when CONFIG_XFRM is off, is returned unchanged with the
 * const qualifier dropped.
 */
static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
{
#ifdef CONFIG_XFRM
        if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE))
                return ((const struct xfrm_dst *) dst)->path;
#endif
        return (struct dst_entry *) dst;
}

/*
 * Return the ->child entry of an xfrm bundle dst.
 *
 * With CONFIG_XFRM, a dst that has ->xfrm set or carries DST_XFRM_QUEUE is
 * treated as a struct xfrm_dst and its ->child is returned. Any other dst,
 * and every dst when CONFIG_XFRM is off, yields NULL.
 */
static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
{
#ifdef CONFIG_XFRM
        if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE))
                return ((struct xfrm_dst *) dst)->child;
#endif
        return NULL;
}

#ifdef CONFIG_XFRM
/* Store @child in xdst->child; plain assignment, no reference counting here. */
static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child)
{
        xdst->child = child;
}

/*
 * Release what @xdst holds: its policy references, its route, and the
 * xfrm state attached to the embedded dst (when one is set).
 */
static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
{
        struct xfrm_state *x;

        xfrm_pols_put(xdst->pols, xdst->num_pols);
        dst_release(xdst->route);

        x = xdst->u.dst.xfrm;
        if (likely(x))
                xfrm_state_put(x);
}
#endif

/* Defined out of line; takes a dst entry and a net_device. */
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);

/* Interface parameters; embedded in struct xfrm_if as its @p member. */
struct xfrm_if_parms {
        int link;                /* ifindex of underlying L2 interface */
        u32 if_id;                /* interface identifier */
        bool collect_md;
};

/* One xfrm interface; instances are chained through the __rcu @next pointer. */
struct xfrm_if {
        struct xfrm_if __rcu *next;        /* next interface in list */
        struct net_device *dev;                /* virtual device associated with interface */
        struct net *net;                /* netns for packet i/o */
        struct xfrm_if_parms p;                /* interface parms */

        struct gro_cells gro_cells;
};

/*
 * Per-packet offload record; stored in sec_path::ovec[]. The #defines
 * placed after @flags and @status are the values used with those fields.
 */
struct xfrm_offload {
        /* Output sequence number for replay protection on offloading. */
        struct {
                __u32 low;
                __u32 hi;
        } seq;

        __u32                        flags;
#define        SA_DELETE_REQ                1
#define        CRYPTO_DONE                2
#define        CRYPTO_NEXT_DONE        4
#define        CRYPTO_FALLBACK                8
#define        XFRM_GSO_SEGMENT        16
#define        XFRM_GRO                32
/* 64 is free */
#define        XFRM_DEV_RESUME                128
#define        XFRM_XMIT                256

        __u32                        status;
#define CRYPTO_SUCCESS                                1
#define CRYPTO_GENERIC_ERROR                        2
#define CRYPTO_TRANSPORT_AH_AUTH_FAILED                4
#define CRYPTO_TRANSPORT_ESP_AUTH_FAILED        8
#define CRYPTO_TUNNEL_AH_AUTH_FAILED                16
#define CRYPTO_TUNNEL_ESP_AUTH_FAILED                32
#define CRYPTO_INVALID_PACKET_SYNTAX                64
#define CRYPTO_INVALID_PROTOCOL                        128

        /* Used to keep whole l2 header for transport mode GRO */
        __u32                        orig_mac_len;

        __u8                        proto;
        __u8                        inner_ipproto;
};

/*
 * Security path attached to an skb: @len entries of @xvec and @olen
 * entries of @ovec are in use (xfrm_input_state() reads xvec[len - 1],
 * xfrm_offload() reads ovec[olen - 1]).
 */
struct sec_path {
        int                        len;
        int                        olen;
        int                        verified_cnt;

        struct xfrm_state        *xvec[XFRM_MAX_DEPTH];
        struct xfrm_offload        ovec[XFRM_MAX_OFFLOAD_DEPTH];
};

/* Defined out of line; returns a sec_path for @skb. */
struct sec_path *secpath_set(struct sk_buff *skb);

/*
 * Remove the SKB_EXT_SEC_PATH extension from @skb.
 * Compiles to nothing when CONFIG_XFRM is off.
 */
static inline void
secpath_reset(struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        skb_ext_del(skb, SKB_EXT_SEC_PATH);
#endif
}

/*
 * Test whether @addr is the all-zero ("any") address for @family.
 * Returns non-zero for an any-address and 0 otherwise, including for
 * families other than AF_INET / AF_INET6.
 */
static inline int
xfrm_addr_any(const xfrm_address_t *addr, unsigned short family)
{
        if (family == AF_INET)
                return addr->a4 == 0;
        if (family == AF_INET6)
                return ipv6_addr_any(&addr->in6);

        return 0;
}

static inline int
__xfrm4_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x)
{
        return        (tmpl->saddr.a4 &&
                 tmpl->saddr.a4 != x->props.saddr.a4);
}

/*
 * IPv6: returns 1 when the template names a source address (not the
 * any-address) and it differs from the state's source address; 0
 * otherwise.
 */
static inline int
__xfrm6_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x)
{
        const struct in6_addr *wanted = (const struct in6_addr *)&tmpl->saddr;
        const struct in6_addr *actual = (const struct in6_addr *)&x->props.saddr;

        if (ipv6_addr_any(wanted))
                return 0;

        return !ipv6_addr_equal(wanted, actual);
}

/*
 * Family dispatcher for the template/state source address comparison.
 * Returns 0 when the state is acceptable for the template and non-zero
 * otherwise. An unknown @family always yields 1.
 */
static inline int
xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_state_addr_cmp(tmpl, x);
        if (family == AF_INET6)
                return __xfrm6_state_addr_cmp(tmpl, x);

        return 1;
}

#ifdef CONFIG_XFRM
/*
 * Return the last state recorded in @skb's secpath (xvec[len - 1]).
 * The secpath pointer is dereferenced without a NULL check, so the skb
 * must already carry one.
 */
static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
{
        struct sec_path *sp = skb_sec_path(skb);
        int last = sp->len - 1;

        return sp->xvec[last];
}
#endif

/*
 * Return the last offload record of @skb's secpath (ovec[olen - 1]).
 *
 * Returns NULL when the skb has no secpath, when olen is zero, or when
 * len and olen differ. Always NULL when CONFIG_XFRM is off.
 */
static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        struct sec_path *sp = skb_sec_path(skb);

        if (sp && sp->olen && sp->len == sp->olen)
                return &sp->ovec[sp->olen - 1];
#endif
        return NULL;
}

#ifdef CONFIG_XFRM
/* Full policy check, defined out of line; the inline wrappers below fall back to it. */
int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb,
                        unsigned short family);

/*
 * Fast path for the "nothing configured" case: when @net has no policies
 * for @dir and @skb carries no secpath, the result is whether the default
 * policy for @dir is XFRM_USERPOLICY_ACCEPT. Otherwise returns false.
 */
static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb,
                                         int dir)
{
        if (net->xfrm.policy_count[dir] || secpath_exists(skb))
                return false;

        return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT;
}

/*
 * Test the "no policy" marking for @skb.
 *
 * For IPv4 in any direction other than XFRM_POLICY_OUT the marking is
 * IPSKB_NOPOLICY in the skb's IP control block. In every other case it is
 * DST_NOPOLICY on the skb's dst, if the skb has one.
 */
static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb,
                                             int dir, unsigned short family)
{
        const bool use_skb_flag = dir != XFRM_POLICY_OUT && family == AF_INET;

        if (!use_skb_flag)
                return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY);

        /* same dst may be used for traffic originating from
         * devices with different policy settings.
         */
        return IPCB(skb)->flags & IPSKB_NOPOLICY;
}

/*
 * Inline front end of the policy check.
 *
 * @sk:      socket, may be NULL
 * @dir:     policy direction
 * @skb:     packet being checked
 * @family:  address family
 * @reverse: non-zero sets the extra (XFRM_POLICY_MASK + 1) bit in the
 *           direction passed to __xfrm_policy_check()
 *
 * Returns non-zero when the packet passes, 0 otherwise.
 */
static inline int __xfrm_policy_check2(struct sock *sk, int dir,
                                       struct sk_buff *skb,
                                       unsigned int family, int reverse)
{
        struct net *net = dev_net(skb->dev);
        int ndir = dir | (reverse ? XFRM_POLICY_MASK + 1 : 0);
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct xfrm_state *x;

        /* A socket with its own inbound policy always takes the full check. */
        if (sk && sk->sk_policy[XFRM_POLICY_IN])
                return __xfrm_policy_check(sk, ndir, skb, family);

        if (xo) {
                x = xfrm_input_state(skb);
                if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) {
                        /* Pass only if hardware reports crypto done and successful. */
                        bool check = (xo->flags & CRYPTO_DONE) &&
                                     (xo->status & CRYPTO_SUCCESS);

                        /* The packets here are plain ones and secpath was
                         * needed to indicate that hardware already handled
                         * them and there is no need to do nothing in addition.
                         *
                         * Consume secpath which was set by drivers.
                         */
                        secpath_reset(skb);
                        return check;
                }
        }

        /* Cheap checks first; the full check runs only when both say no. */
        return __xfrm_check_nopolicy(net, skb, dir) ||
               __xfrm_check_dev_nopolicy(skb, dir, family) ||
               __xfrm_policy_check(sk, ndir, skb, family);
}

/* Non-reversed policy check for an arbitrary address family. */
static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
{
        const int reverse = 0;

        return __xfrm_policy_check2(sk, dir, skb, family, reverse);
}

/* xfrm_policy_check() with the family fixed to AF_INET. */
static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        const unsigned short family = AF_INET;

        return xfrm_policy_check(sk, dir, skb, family);
}

/* xfrm_policy_check() with the family fixed to AF_INET6. */
static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        const unsigned short family = AF_INET6;

        return xfrm_policy_check(sk, dir, skb, family);
}

/* Reversed policy check (reverse = 1) for AF_INET. */
static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        const int reverse = 1;

        return __xfrm_policy_check2(sk, dir, skb, AF_INET, reverse);
}

/* Reversed policy check (reverse = 1) for AF_INET6. */
static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        const int reverse = 1;

        return __xfrm_policy_check2(sk, dir, skb, AF_INET6, reverse);
}

/*
 * Out-of-line session decoder; only declared in this header. The inline
 * wrappers xfrm_decode_session() and xfrm_decode_session_reverse() call it
 * with reverse set to 0 and 1 respectively.
 */
int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl,
                          unsigned int family, int reverse);

/* Decode the session of @skb into @fl without reversing (reverse = 0). */
static inline int xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl,
                                      unsigned int family)
{
        const int reverse = 0;

        return __xfrm_decode_session(net, skb, fl, family, reverse);
}

/* Decode the session of @skb into @fl with the reverse flag set. */
static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb,
                                              struct flowi *fl,
                                              unsigned int family)
{
        const int reverse = 1;

        return __xfrm_decode_session(net, skb, fl, family, reverse);
}

/*
 * Out-of-line part of xfrm_route_forward(); only declared in this header.
 * The inline caller reaches it only after its own shortcut checks fail.
 */
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);

/*
 * Decide whether a forwarded packet may proceed.
 *
 * Returns non-zero immediately when the namespace has no XFRM_POLICY_OUT
 * policies and its default for that direction is XFRM_USERPOLICY_ACCEPT,
 * or when the skb's dst carries DST_NOXFRM. Otherwise the result of
 * __xfrm_route_forward() decides.
 */
static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct net *net = dev_net(skb->dev);
        bool need_lookup;

        need_lookup = net->xfrm.policy_count[XFRM_POLICY_OUT] ||
                      net->xfrm.policy_default[XFRM_POLICY_OUT] != XFRM_USERPOLICY_ACCEPT;
        if (!need_lookup)
                return 1;

        if (skb_dst(skb)->flags & DST_NOXFRM)
                return 1;

        return !!__xfrm_route_forward(skb, family);
}

/* xfrm_route_forward() with the family fixed to AF_INET. */
static inline int xfrm4_route_forward(struct sk_buff *skb)
{
        const unsigned short family = AF_INET;

        return xfrm_route_forward(skb, family);
}

/* xfrm_route_forward() with the family fixed to AF_INET6. */
static inline int xfrm6_route_forward(struct sk_buff *skb)
{
        const unsigned short family = AF_INET6;

        return xfrm_route_forward(skb, family);
}

/*
 * Out-of-line policy clone; only declared in this header. The inline
 * xfrm_sk_clone_policy() calls it only when @osk has at least one policy.
 */
int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk);

/*
 * Set up the policy slots of @sk from @osk.
 *
 * Does nothing and returns 0 when @osk is not a full socket. Otherwise both
 * slots of @sk are cleared first, and __xfrm_sk_clone_policy() is called
 * only if @osk has a policy in at least one slot.
 */
static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
        int slot;

        if (!sk_fullsock(osk))
                return 0;

        for (slot = 0; slot < 2; slot++)
                sk->sk_policy[slot] = NULL;

        return unlikely(osk->sk_policy[0] || osk->sk_policy[1]) ?
               __xfrm_sk_clone_policy(sk, osk) : 0;
}

/* Declared here, defined elsewhere; used by xfrm_sk_free_policy() below. */
int xfrm_policy_delete(struct xfrm_policy *pol, int dir);

/*
 * Delete both per-socket policies of @sk, if present, and clear the slots.
 *
 * Slot 0 is deleted with direction XFRM_POLICY_MAX and slot 1 with
 * XFRM_POLICY_MAX + 1.
 */
static inline void xfrm_sk_free_policy(struct sock *sk)
{
        int slot;

        for (slot = 0; slot < 2; slot++) {
                struct xfrm_policy *pol;

                pol = rcu_dereference_protected(sk->sk_policy[slot], 1);
                if (unlikely(pol != NULL)) {
                        xfrm_policy_delete(pol, XFRM_POLICY_MAX + slot);
                        sk->sk_policy[slot] = NULL;
                }
        }
}

#else

/*
 * Stub versions for the #else branch of the conditional opened above this
 * part of the file (NOTE(review): the controlling #if is not visible here;
 * presumably CONFIG_XFRM - confirm). The policy checks and route-forward
 * helpers always report success (1), socket policy handling is a no-op, and
 * reverse session decoding fails with -ENOSYS.
 */
static inline void xfrm_sk_free_policy(struct sock *sk) {}
static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; }
static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }
static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; }
static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        return 1;
}
static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        return 1;
}
static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
{
        return 1;
}
/* No decoder is available in this configuration. */
static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb,
                                              struct flowi *fl,
                                              unsigned int family)
{
        return -ENOSYS;
}
static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        return 1;
}
static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        return 1;
}
#endif

/*
 * Return a pointer to the destination address stored in @fl for @family,
 * viewed as an xfrm_address_t. Returns NULL for any family other than
 * AF_INET and AF_INET6.
 */
static __inline__
xfrm_address_t *xfrm_flowi_daddr(const struct flowi *fl, unsigned short family)
{
        if (family == AF_INET)
                return (xfrm_address_t *)&fl->u.ip4.daddr;
        if (family == AF_INET6)
                return (xfrm_address_t *)&fl->u.ip6.daddr;

        return NULL;
}

/*
 * Return a pointer to the source address stored in @fl for @family, viewed
 * as an xfrm_address_t. Returns NULL for any family other than AF_INET and
 * AF_INET6.
 */
static __inline__
xfrm_address_t *xfrm_flowi_saddr(const struct flowi *fl, unsigned short family)
{
        if (family == AF_INET)
                return (xfrm_address_t *)&fl->u.ip4.saddr;
        if (family == AF_INET6)
                return (xfrm_address_t *)&fl->u.ip6.saddr;

        return NULL;
}

/*
 * Copy the source and destination addresses of @fl into @saddr and @daddr.
 *
 * For AF_INET only the a4 member is written; for AF_INET6 the in6 member is
 * assigned. Any other family leaves both outputs untouched.
 */
static __inline__
void xfrm_flowi_addr_get(const struct flowi *fl,
                         xfrm_address_t *saddr, xfrm_address_t *daddr,
                         unsigned short family)
{
        if (family == AF_INET) {
                memcpy(&saddr->a4, &fl->u.ip4.saddr, sizeof(saddr->a4));
                memcpy(&daddr->a4, &fl->u.ip4.daddr, sizeof(daddr->a4));
        } else if (family == AF_INET6) {
                saddr->in6 = fl->u.ip6.saddr;
                daddr->in6 = fl->u.ip6.daddr;
        }
}

/*
 * IPv4 address match between a state and a (daddr, saddr) pair.
 *
 * Returns 1 when the destination equals the state's id.daddr and the source
 * either equals the state's props.saddr or one of the two sources is zero;
 * returns 0 otherwise.
 */
static __inline__ int
__xfrm4_state_addr_check(const struct xfrm_state *x,
                         const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
        if (daddr->a4 != x->id.daddr.a4)
                return 0;
        if (saddr->a4 == x->props.saddr.a4)
                return 1;

        /* A zero source on either side matches anything. */
        return !saddr->a4 || !x->props.saddr.a4;
}

/*
 * IPv6 address match between a state and a (daddr, saddr) pair.
 *
 * Returns 1 when the destination equals the state's id.daddr and the source
 * either equals the state's props.saddr or one of the two sources is the
 * "any" address; returns 0 otherwise.
 */
static __inline__ int
__xfrm6_state_addr_check(const struct xfrm_state *x,
                         const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
        struct in6_addr *pkt_dst = (struct in6_addr *)daddr;
        struct in6_addr *pkt_src = (struct in6_addr *)saddr;
        struct in6_addr *sa_dst = (struct in6_addr *)&x->id.daddr;
        struct in6_addr *sa_src = (struct in6_addr *)&x->props.saddr;

        if (!ipv6_addr_equal(pkt_dst, sa_dst))
                return 0;

        return ipv6_addr_equal(pkt_src, sa_src) ||
               ipv6_addr_any(pkt_src) ||
               ipv6_addr_any(sa_src);
}

/*
 * Family dispatcher for the state address checks: AF_INET and AF_INET6 go
 * to their respective helpers, any other family never matches (returns 0).
 */
static __inline__ int
xfrm_state_addr_check(const struct xfrm_state *x,
                      const xfrm_address_t *daddr, const xfrm_address_t *saddr,
                      unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_state_addr_check(x, daddr, saddr);
        if (family == AF_INET6)
                return __xfrm6_state_addr_check(x, daddr, saddr);

        return 0;
}

/*
 * Same as xfrm_state_addr_check(), but the address pair is taken from the
 * flow @fl (u.ip4 for AF_INET, u.ip6 for AF_INET6). Any other family
 * returns 0.
 */
static __inline__ int
xfrm_state_addr_flow_check(const struct xfrm_state *x, const struct flowi *fl,
                           unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_state_addr_check(x,
                                                (const xfrm_address_t *)&fl->u.ip4.daddr,
                                                (const xfrm_address_t *)&fl->u.ip4.saddr);
        if (family == AF_INET6)
                return __xfrm6_state_addr_check(x,
                                                (const xfrm_address_t *)&fl->u.ip6.daddr,
                                                (const xfrm_address_t *)&fl->u.ip6.saddr);

        return 0;
}

/* Return the current value of the state's tunnel_users counter. */
static inline int xfrm_state_kern(const struct xfrm_state *x)
{
        int users = atomic_read(&x->tunnel_users);

        return users;
}

/*
 * Return true when @proto is one of IPPROTO_AH, IPPROTO_ESP, IPPROTO_COMP
 * or, with CONFIG_IPV6 enabled, IPPROTO_ROUTING or IPPROTO_DSTOPTS.
 */
static inline bool xfrm_id_proto_valid(u8 proto)
{
        if (proto == IPPROTO_AH || proto == IPPROTO_ESP ||
            proto == IPPROTO_COMP)
                return true;
#if IS_ENABLED(CONFIG_IPV6)
        if (proto == IPPROTO_ROUTING || proto == IPPROTO_DSTOPTS)
                return true;
#endif
        return false;
}

/*
 * Match @proto against a user-supplied selector @userproto.
 *
 * A @userproto of 0 matches every protocol, an exact value matches itself,
 * and IPSEC_PROTO_ANY matches only the three IPsec protocols (AH, ESP,
 * COMP). Returns 1 on match, 0 otherwise.
 */
static inline int xfrm_id_proto_match(u8 proto, u8 userproto)
{
        if (!userproto || proto == userproto)
                return 1;
        if (userproto != IPSEC_PROTO_ANY)
                return 0;

        return proto == IPPROTO_AH || proto == IPPROTO_ESP ||
               proto == IPPROTO_COMP;
}

/*
 * xfrm algorithm information
 */
/*
 * AEAD-specific member of xfrm_algo_desc.uinfo.
 * NOTE(review): field meanings are inferred from their names only
 * (geniv: presumably an IV generator name; icv_truncbits: presumably an
 * ICV length in bits) - confirm against the algorithm tables.
 */
struct xfrm_algo_aead_info {
        char *geniv;
        u16 icv_truncbits;
};

/*
 * Authentication-specific member of xfrm_algo_desc.uinfo.
 * NOTE(review): presumably truncated and full ICV sizes in bits - confirm.
 */
struct xfrm_algo_auth_info {
        u16 icv_truncbits;
        u16 icv_fullbits;
};

/*
 * Encryption-specific member of xfrm_algo_desc.uinfo.
 * NOTE(review): presumably IV generator name, block size in bits and
 * default key size in bits - confirm.
 */
struct xfrm_algo_encr_info {
        char *geniv;
        u16 blockbits;
        u16 defkeybits;
};

/*
 * Compression-specific member of xfrm_algo_desc.uinfo.
 * NOTE(review): the unit and use of threshold are not visible here.
 */
struct xfrm_algo_comp_info {
        u16 threshold;
};

/*
 * Descriptor of one algorithm: two name strings, two one-bit flags, a
 * per-kind info union and an embedded struct sadb_alg.
 * NOTE(review): which uinfo member is valid for a given entry cannot be
 * told from this definition - confirm against the users of the tables.
 */
struct xfrm_algo_desc {
        char *name;
        char *compat;
        u8 available:1;
        u8 pfkey_supported:1;
        union {
                struct xfrm_algo_aead_info aead;
                struct xfrm_algo_auth_info auth;
                struct xfrm_algo_encr_info encr;
                struct xfrm_algo_comp_info comp;
        } uinfo;
        struct sadb_alg desc;
};

/* XFRM protocol handlers.  */
/*
 * IPv4 protocol handler entry: a set of callbacks, an RCU-annotated link to
 * the next entry and a priority. Registered and removed through
 * xfrm4_protocol_register() / xfrm4_protocol_deregister().
 * NOTE(review): presumably entries are chained in priority order - confirm.
 */
struct xfrm4_protocol {
        int (*handler)(struct sk_buff *skb);
        int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi,
                             int encap_type);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, u32 info);

        struct xfrm4_protocol __rcu *next;
        int priority;
};

/*
 * IPv6 counterpart of struct xfrm4_protocol. It differs only in the
 * err_handler signature, which takes the inet6 skb parameters plus
 * type/code/offset and a big-endian info word.
 */
struct xfrm6_protocol {
        int (*handler)(struct sk_buff *skb);
        int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi,
                             int encap_type);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
                           u8 type, u8 code, int offset, __be32 info);

        struct xfrm6_protocol __rcu *next;
        int priority;
};

/* XFRM tunnel handlers.  */
/*
 * Tunnel handler entry with receive, callback and error hooks, an
 * RCU-annotated link to the next entry and a priority. This is the type
 * taken by xfrm4_tunnel_register() / xfrm4_tunnel_deregister().
 */
struct xfrm_tunnel {
        int (*handler)(struct sk_buff *skb);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, u32 info);

        struct xfrm_tunnel __rcu *next;
        int priority;
};

/*
 * IPv6 tunnel handler entry; same shape as struct xfrm_tunnel except for
 * the IPv6-style err_handler signature. This is the type taken by
 * xfrm6_tunnel_register() / xfrm6_tunnel_deregister().
 */
struct xfrm6_tunnel {
        int (*handler)(struct sk_buff *skb);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
                           u8 type, u8 code, int offset, __be32 info);
        struct xfrm6_tunnel __rcu *next;
        int priority;
};

/*
 * Initialisation / teardown entry points. They are only declared in this
 * header; xfrm_state_init() and xfrm_state_fini() take a network namespace.
 */
void xfrm_init(void);
void xfrm4_init(void);
int xfrm_state_init(struct net *net);
void xfrm_state_fini(struct net *net);
void xfrm4_state_init(void);
void xfrm4_protocol_init(void);
#ifdef CONFIG_XFRM
/*
 * IPv6 init/fini entry points, declared only when CONFIG_XFRM is set.
 * The #else branch supplies stubs for xfrm6_init() and xfrm6_fini() only.
 */
int xfrm6_init(void);
void xfrm6_fini(void);
int xfrm6_state_init(void);
void xfrm6_state_fini(void);
int xfrm6_protocol_init(void);
void xfrm6_protocol_fini(void);
#else
/* Stub used when CONFIG_XFRM is not set: nothing to initialise, report success. */
static inline int xfrm6_init(void)
{
        return 0;
}
/* Stub used when CONFIG_XFRM is not set: nothing to tear down. */
static inline void xfrm6_fini(void)
{
}
#endif

#ifdef CONFIG_XFRM_STATISTICS
/* Per-namespace proc setup/teardown; declared only with CONFIG_XFRM_STATISTICS. */
int xfrm_proc_init(struct net *net);
void xfrm_proc_fini(struct net *net);
#endif

/*
 * Per-namespace sysctl setup/teardown. xfrm_sysctl_init() is declared
 * unconditionally; xfrm_sysctl_fini() becomes an empty inline stub when
 * CONFIG_SYSCTL is not set.
 */
int xfrm_sysctl_init(struct net *net);
#ifdef CONFIG_SYSCTL
void xfrm_sysctl_fini(struct net *net);
#else
static inline void xfrm_sysctl_fini(struct net *net)
{
}
#endif

/*
 * Prototypes for walking, allocating, freeing and looking up
 * struct xfrm_state objects. Declarations only.
 * NOTE(review): reference-counting and locking rules of the lookup
 * functions are not visible in this header - consult the definitions.
 */
void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
                          struct xfrm_address_filter *filter);
int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
                    int (*func)(struct xfrm_state *, int, void*), void *);
void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net);
struct xfrm_state *xfrm_state_alloc(struct net *net);
void xfrm_state_free(struct xfrm_state *x);
struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr,
                                   const xfrm_address_t *saddr,
                                   const struct flowi *fl,
                                   struct xfrm_tmpl *tmpl,
                                   struct xfrm_policy *pol, int *err,
                                   unsigned short family, u32 if_id);
struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
                                       xfrm_address_t *daddr,
                                       xfrm_address_t *saddr,
                                       unsigned short family,
                                       u8 mode, u8 proto, u32 reqid);
struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
                                              unsigned short family);
int xfrm_state_check_expire(struct xfrm_state *x);
void xfrm_state_update_stats(struct net *net);
#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Invoke the offload device's xdo_dev_state_update_stats() callback for @x.
 *
 * The device pointer is sampled once with READ_ONCE(); nothing happens when
 * there is no device, no xfrmdev_ops or no such callback.
 */
static inline void xfrm_dev_state_update_stats(struct xfrm_state *x)
{
        struct net_device *dev = READ_ONCE(x->xso.dev);

        if (!dev || !dev->xfrmdev_ops)
                return;
        if (!dev->xfrmdev_ops->xdo_dev_state_update_stats)
                return;

        dev->xfrmdev_ops->xdo_dev_state_update_stats(x);
}
#else
/* Without CONFIG_XFRM_OFFLOAD there is no device to query: no-op. */
static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) {}
#endif
/*
 * Prototypes for inserting/updating states and for looking them up by
 * (mark, daddr, spi, proto, family) or by address pair. Declarations only.
 * NOTE(review): how xfrm_input_state_lookup() differs from
 * xfrm_state_lookup() is not visible here; the two share a signature.
 */
void xfrm_state_insert(struct xfrm_state *x);
int xfrm_state_add(struct xfrm_state *x);
int xfrm_state_update(struct xfrm_state *x);
struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark,
                                     const xfrm_address_t *daddr, __be32 spi,
                                     u8 proto, unsigned short family);
struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark,
                                           const xfrm_address_t *daddr,
                                           __be32 spi, u8 proto,
                                           unsigned short family);
struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark,
                                            const xfrm_address_t *daddr,
                                            const xfrm_address_t *saddr,
                                            u8 proto,
                                            unsigned short family);
/*
 * Template/state sorting helpers. With CONFIG_XFRM_SUB_POLICY they are real
 * functions defined elsewhere; without it they are empty inline stubs that
 * leave the destination array untouched.
 */
#ifdef CONFIG_XFRM_SUB_POLICY
void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
                    unsigned short family);
void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
                     unsigned short family);
#else
static inline void xfrm_tmpl_sort(struct xfrm_tmpl **d, struct xfrm_tmpl **s,
                                  int n, unsigned short family)
{
}

static inline void xfrm_state_sort(struct xfrm_state **d, struct xfrm_state **s,
                                   int n, unsigned short family)
{
}
#endif

/* Counters filled in by xfrm_sad_getinfo(). */
struct xfrmk_sadinfo {
        u32 sadhcnt; /* current hash bkts */
        u32 sadhmcnt; /* max allowed hash bkts */
        u32 sadcnt; /* current running count */
};

/*
 * Counters filled in by xfrm_spd_getinfo().
 * NOTE(review): presumably in/out/fwd are per-direction policy counts, the
 * *scnt members their per-socket counterparts and spdh* hash bucket
 * counts - inferred from names only, confirm against the definition.
 */
struct xfrmk_spdinfo {
        u32 incnt;
        u32 outcnt;
        u32 fwdcnt;
        u32 inscnt;
        u32 outscnt;
        u32 fwdscnt;
        u32 spdhcnt;
        u32 spdhmcnt;
};

/*
 * Prototypes for state deletion/flush, SAD/SPD info retrieval, replay and
 * state initialisation, and the input/output entry points.
 * Declarations only; definitions are outside this header.
 */
struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num);
int xfrm_state_delete(struct xfrm_state *x);
int xfrm_state_flush(struct net *net, u8 proto, bool task_valid);
int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid);
int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
                          bool task_valid);
void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si);
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack);
u32 xfrm_state_mtu(struct xfrm_state *x, int mtu);
int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack);
int xfrm_init_state(struct xfrm_state *x);
int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
/* Both queueing helpers take a completion callback, @finish. */
int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
                         int (*finish)(struct net *, struct sock *,
                                       struct sk_buff *));
int xfrm_trans_queue(struct sk_buff *skb,
                     int (*finish)(struct net *, struct sock *,
                                   struct sk_buff *));
int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err);
int xfrm_output(struct sock *sk, struct sk_buff *skb);
int xfrm4_tunnel_check_size(struct sk_buff *skb);
/*
 * IPv6 tunnel size check. Without CONFIG_IPV6 the stub unconditionally
 * fails with -EMSGSIZE.
 */
#if IS_ENABLED(CONFIG_IPV6)
int xfrm6_tunnel_check_size(struct sk_buff *skb);
#else
static inline int xfrm6_tunnel_check_size(struct sk_buff *skb)
{
        return -EMSGSIZE;
}
#endif

#if IS_ENABLED(CONFIG_NET_PKTGEN)
/* Declared only when CONFIG_NET_PKTGEN is enabled. */
int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb);
#endif

/* IPv4 receive-side prototypes plus xfrm_local_error(); declarations only. */
void xfrm_local_error(struct sk_buff *skb, int mtu);
int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
                    int encap_type);
int xfrm4_transport_finish(struct sk_buff *skb, int async);
int xfrm4_rcv(struct sk_buff *skb);

/*
 * Prepare the skb control block for IPv4 input and hand the packet to
 * xfrm_input() with an encap_type of 0.
 *
 * The IPv4 tunnel pointer is cleared, the family is set to AF_INET and the
 * destination-address offset is set to that of iphdr.daddr.
 */
static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
{
        const int encap_type = 0;

        XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
        XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
        XFRM_SPI_SKB_CB(skb)->family = AF_INET;

        return xfrm_input(skb, nexthdr, spi, encap_type);
}

/*
 * IPv4/IPv6 output, protocol and tunnel (de)registration, local error and
 * IPv6 receive prototypes. Declarations only.
 * NOTE(review): the tunnel (de)register functions take a "family" argument
 * while the protocol ones take a "protocol" number; the accepted values are
 * not visible here.
 */
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol);
int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol);
int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family);
int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family);
void xfrm4_local_error(struct sk_buff *skb, u32 mtu);
int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
                  struct ip6_tnl *t);
int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
                    int encap_type);
int xfrm6_transport_finish(struct sk_buff *skb, int async);
int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t);
int xfrm6_rcv(struct sk_buff *skb);
int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
                     xfrm_address_t *saddr, u8 proto);
void xfrm6_local_error(struct sk_buff *skb, u32 mtu);
int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol);
int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol);
int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family);
int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family);
__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr);
__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr);
int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb);

/*
 * UDP-encapsulation receive hooks and the socket-option policy setter.
 * Without CONFIG_XFRM only xfrm_user_policy() exists, as a stub that
 * rejects the option with -ENOPROTOOPT.
 */
#ifdef CONFIG_XFRM
void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu);
int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb);
int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb);
struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
                                        struct sk_buff *skb);
struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
                                        struct sk_buff *skb);
int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval,
                     int optlen);
#else
static inline int xfrm_user_policy(struct sock *sk, int optname,
                                   sockptr_t optval, int optlen)
{
         return -ENOPROTOOPT;
}
#endif

/*
 * Prototypes for dst lookup, policy allocation/walk/insert/lookup/flush,
 * SPI allocation and acquire-state lookup. Declarations only.
 * NOTE(review): ownership of returned policy/state pointers is not visible
 * in this header - consult the definitions.
 */
struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params);

struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp);

void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type);
int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
                     int (*func)(struct xfrm_policy *, int, int, void*),
                     void *);
void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net);
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
/* Both lookups below take a "delete" flag and report errors through @err. */
struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net,
                                          const struct xfrm_mark *mark,
                                          u32 if_id, u8 type, int dir,
                                          struct xfrm_selector *sel,
                                          struct xfrm_sec_ctx *ctx, int delete,
                                          int *err);
struct xfrm_policy *xfrm_policy_byid(struct net *net,
                                     const struct xfrm_mark *mark, u32 if_id,
                                     u8 type, int dir, u32 id, int delete,
                                     int *err);
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid);
void xfrm_policy_hash_rebuild(struct net *net);
u32 xfrm_get_acqseq(void);
int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack);
int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi,
                   struct netlink_ext_ack *extack);
struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark,
                                 u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto,
                                 const xfrm_address_t *daddr,
                                 const xfrm_address_t *saddr, int create,
                                 unsigned short family);
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);

#ifdef CONFIG_XFRM_MIGRATE
/*
 * Migration prototypes, declared only when CONFIG_XFRM_MIGRATE is set.
 * No stubs are provided for the disabled case.
 */
int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
               const struct xfrm_migrate *m, int num_bundles,
               const struct xfrm_kmaddress *k,
               const struct xfrm_encap_tmpl *encap);
struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net,
                                                u32 if_id);
struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
                                      struct xfrm_migrate *m,
                                      struct xfrm_encap_tmpl *encap,
                                      struct net *net,
                                      struct xfrm_user_offload *xuo,
                                      struct netlink_ext_ack *extack);
int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                 struct xfrm_migrate *m, int num_bundles,
                 struct xfrm_kmaddress *k, struct net *net,
                 struct xfrm_encap_tmpl *encap, u32 if_id,
                 struct netlink_ext_ack *extack,
                 struct xfrm_user_offload *xuo);
#endif

/*
 * Key-manager notification, input setup and algorithm-table lookup
 * prototypes. Declarations only.
 * NOTE(review): the lookup-by-name functions take a "probe" flag whose
 * effect is not visible in this header.
 */
int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid);
int km_report(struct net *net, u8 proto, struct xfrm_selector *sel,
              xfrm_address_t *addr);

void xfrm_input_init(void);
int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq);

void xfrm_probe_algs(void);
int xfrm_count_pfkey_auth_supported(void);
int xfrm_count_pfkey_enc_supported(void);
struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx);
struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx);
struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len,
                                            int probe);

/* Compare two xfrm addresses as IPv6 addresses using ipv6_addr_equal(). */
static inline bool xfrm6_addr_equal(const xfrm_address_t *a,
                                    const xfrm_address_t *b)
{
        const struct in6_addr *lhs = (const struct in6_addr *)a;
        const struct in6_addr *rhs = (const struct in6_addr *)b;

        return ipv6_addr_equal(lhs, rhs);
}

/*
 * Compare two xfrm addresses for @family.
 *
 * AF_INET6 compares the full IPv6 address. AF_INET, and any other family,
 * compares only the 32-bit a4 member.
 */
static inline bool xfrm_addr_equal(const xfrm_address_t *a,
                                   const xfrm_address_t *b,
                                   sa_family_t family)
{
        if (family == AF_INET6)
                return xfrm6_addr_equal(a, b);

        /* AF_INET and every unrecognised family end up here. */
        return (__force u32)a->a4 == (__force u32)b->a4;
}

/* Extract the direction from a policy index: its low three bits. */
static inline int xfrm_policy_id2dir(u32 index)
{
        const u32 dir_mask = 7;

        return index & dir_mask;
}

#ifdef CONFIG_XFRM
/* Replay-window prototypes, declared only when CONFIG_XFRM is set. */
void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq);
int xfrm_replay_check(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq);
void xfrm_replay_notify(struct xfrm_state *x, int event);
int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb);
int xfrm_replay_recheck(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq);

/*
 * Report whether anything listens on the XFRMNLGRP_AEVENTS netlink group
 * of @net.
 *
 * The namespace's xfrm netlink socket is looked up under rcu_read_lock().
 * Returns 0 when there is no socket, otherwise the result of
 * netlink_has_listeners().
 */
static inline int xfrm_aevent_is_on(struct net *net)
{
        struct sock *nlsk;
        int listeners;

        rcu_read_lock();
        nlsk = rcu_dereference(net->xfrm.nlsk);
        listeners = nlsk ? netlink_has_listeners(nlsk, XFRMNLGRP_AEVENTS) : 0;
        rcu_read_unlock();

        return listeners;
}

/*
 * Report whether anything listens on the XFRMNLGRP_ACQUIRE netlink group
 * of @net.
 *
 * The namespace's xfrm netlink socket is looked up under rcu_read_lock().
 * Returns 0 when there is no socket, otherwise the result of
 * netlink_has_listeners().
 */
static inline int xfrm_acquire_is_on(struct net *net)
{
        struct sock *nlsk;
        int listeners;

        rcu_read_lock();
        nlsk = rcu_dereference(net->xfrm.nlsk);
        listeners = nlsk ? netlink_has_listeners(nlsk, XFRMNLGRP_ACQUIRE) : 0;
        rcu_read_unlock();

        return listeners;
}
#endif

/*
 * Total size of an AEAD algorithm blob: the fixed header plus alg_key_len
 * divided by 8, rounded up.
 */
static inline unsigned int aead_len(struct xfrm_algo_aead *alg)
{
        unsigned int key_bytes = (alg->alg_key_len + 7) / 8;

        return sizeof(*alg) + key_bytes;
}

/*
 * Total size of an algorithm blob: the fixed header plus alg_key_len
 * divided by 8, rounded up.
 */
static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg)
{
        unsigned int key_bytes = (alg->alg_key_len + 7) / 8;

        return sizeof(*alg) + key_bytes;
}

/*
 * Total size of an authentication algorithm blob: the fixed header plus
 * alg_key_len divided by 8, rounded up.
 */
static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg)
{
        unsigned int key_bytes = (alg->alg_key_len + 7) / 8;

        return sizeof(*alg) + key_bytes;
}

/*
 * Total size of an ESN replay state: the fixed header plus bmp_len 32-bit
 * bitmap words.
 */
static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn)
{
        size_t bitmap_bytes = replay_esn->bmp_len * sizeof(__u32);

        return sizeof(*replay_esn) + bitmap_bytes;
}

#ifdef CONFIG_XFRM_MIGRATE
/*
 * Duplicate the replay_esn and preplay_esn blobs of @orig into @x.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails. When the
 * second allocation fails, x->replay_esn is left allocated.
 * NOTE(review): presumably the caller releases it when dropping @x - confirm.
 */
static inline int xfrm_replay_clone(struct xfrm_state *x,
                                     struct xfrm_state *orig)
{
        unsigned int len;

        len = xfrm_replay_state_esn_len(orig->replay_esn);
        x->replay_esn = kmemdup(orig->replay_esn, len, GFP_KERNEL);
        if (!x->replay_esn)
                return -ENOMEM;

        len = xfrm_replay_state_esn_len(orig->preplay_esn);
        x->preplay_esn = kmemdup(orig->preplay_esn, len, GFP_KERNEL);
        if (!x->preplay_esn)
                return -ENOMEM;

        return 0;
}

/* Duplicate an AEAD algorithm blob with GFP_KERNEL; the result of kmemdup() is returned as is. */
static inline struct xfrm_algo_aead *xfrm_algo_aead_clone(struct xfrm_algo_aead *orig)
{
        unsigned int size = aead_len(orig);

        return kmemdup(orig, size, GFP_KERNEL);
}


/* Duplicate an algorithm blob with GFP_KERNEL; the result of kmemdup() is returned as is. */
static inline struct xfrm_algo *xfrm_algo_clone(struct xfrm_algo *orig)
{
        unsigned int size = xfrm_alg_len(orig);

        return kmemdup(orig, size, GFP_KERNEL);
}

/* Duplicate an authentication algorithm blob with GFP_KERNEL; the result of kmemdup() is returned as is. */
static inline struct xfrm_algo_auth *xfrm_algo_auth_clone(struct xfrm_algo_auth *orig)
{
        unsigned int size = xfrm_alg_auth_len(orig);

        return kmemdup(orig, size, GFP_KERNEL);
}

/* Call xfrm_state_put() on the first @n entries of @states, in order. */
static inline void xfrm_states_put(struct xfrm_state **states, int n)
{
        int idx = 0;

        while (idx < n) {
                xfrm_state_put(states[idx]);
                idx++;
        }
}

/*
 * Call xfrm_state_delete() on the first @n entries of @states, in order.
 * The individual return values are ignored.
 */
static inline void xfrm_states_delete(struct xfrm_state **states, int n)
{
        int idx = 0;

        while (idx < n) {
                xfrm_state_delete(states[idx]);
                idx++;
        }
}
#endif

/* Init-section entry point, declared regardless of CONFIG_XFRM_OFFLOAD. */
void __init xfrm_dev_init(void);

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Device-offload prototypes, declared only with CONFIG_XFRM_OFFLOAD.
 * The #else branch further down provides inline stubs of the same names.
 */
void xfrm_dev_resume(struct sk_buff *skb);
void xfrm_dev_backlog(struct softnet_data *sd);
struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again);
int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
                       struct xfrm_user_offload *xuo,
                       struct netlink_ext_ack *extack);
int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
                        struct xfrm_user_offload *xuo, u8 dir,
                        struct netlink_ext_ack *extack);
bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
void xfrm_dev_state_delete(struct xfrm_state *x);
void xfrm_dev_state_free(struct xfrm_state *x);

/*
 * Invoke the offload device's xdo_dev_state_advance_esn() callback for @x.
 *
 * The device pointer is sampled once with READ_ONCE(). Nothing happens when
 * there is no device, the device has no xfrmdev_ops, or the ops table has no
 * such callback. The xfrmdev_ops NULL check mirrors the other offload
 * helpers in this header (xfrm_dev_state_update_stats(),
 * xfrm_dev_policy_delete(), xfrm_dev_policy_free()), so a device without an
 * ops table cannot cause a NULL dereference here.
 */
static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
{
        struct xfrm_dev_offload *xso = &x->xso;
        struct net_device *dev = READ_ONCE(xso->dev);

        if (dev && dev->xfrmdev_ops &&
            dev->xfrmdev_ops->xdo_dev_state_advance_esn)
                dev->xfrmdev_ops->xdo_dev_state_advance_esn(x);
}

/*
 * Decide whether the state attached to @dst qualifies for offload.
 *
 * Requires a state with type_offload set and a child dst that carries no
 * further xfrm state. If the state has an offload handle, its device must
 * additionally equal the device of the dst path.
 */
static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
{
        struct xfrm_state *x = dst->xfrm;
        struct xfrm_dst *xdst;

        if (!x || !x->type_offload)
                return false;

        xdst = (struct xfrm_dst *) dst;

        /* No offload handle: only the child dst matters. */
        if (!x->xso.offload_handle)
                return !xdst->child->xfrm;

        if (x->xso.dev != xfrm_dst_path(dst)->dev)
                return false;

        return !xdst->child->xfrm;
}

/*
 * Invoke the offload device's xdo_dev_policy_delete() callback for policy
 * @x, if the policy has a device with an ops table providing it.
 */
static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
{
        struct net_device *dev = x->xdo.dev;

        if (!dev || !dev->xfrmdev_ops)
                return;

        if (dev->xfrmdev_ops->xdo_dev_policy_delete)
                dev->xfrmdev_ops->xdo_dev_policy_delete(x);
}

/*
 * Release the offload device association of policy @x.
 *
 * When the policy has a device with an ops table, the optional
 * xdo_dev_policy_free() callback runs first, then the device pointer is
 * cleared and the tracked device reference is dropped with netdev_put().
 * Nothing happens without a device or without xfrmdev_ops.
 */
static inline void xfrm_dev_policy_free(struct xfrm_policy *x)
{
        struct xfrm_dev_offload *xdo = &x->xdo;
        struct net_device *dev = xdo->dev;

        if (!dev || !dev->xfrmdev_ops)
                return;

        if (dev->xfrmdev_ops->xdo_dev_policy_free)
                dev->xfrmdev_ops->xdo_dev_policy_free(x);

        xdo->dev = NULL;
        netdev_put(dev, &xdo->dev_tracker);
}
#else
/*
 * Stubs used when CONFIG_XFRM_OFFLOAD is not set. The void helpers do
 * nothing, the *_add() helpers report success (0), the *_ok() predicates
 * report false, and validate_xmit_xfrm() returns the skb unchanged.
 */
static inline void xfrm_dev_resume(struct sk_buff *skb)
{
}

static inline void xfrm_dev_backlog(struct softnet_data *sd)
{
}

/* Pass-through: the skb is returned as is and @again is not written. */
static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again)
{
        return skb;
}

static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack)
{
        return 0;
}

static inline void xfrm_dev_state_delete(struct xfrm_state *x)
{
}

static inline void xfrm_dev_state_free(struct xfrm_state *x)
{
}

static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
                                      struct xfrm_user_offload *xuo, u8 dir,
                                      struct netlink_ext_ack *extack)
{
        return 0;
}

static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
{
}

static inline void xfrm_dev_policy_free(struct xfrm_policy *x)
{
}

static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
{
        return false;
}

static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
{
}

static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
{
        return false;
}
#endif

/*
 * Fill @m from the XFRMA_MARK attribute in @attrs, or zero it when the
 * attribute is absent. Returns the mark value masked by its mask
 * (m->v & m->m).
 */
static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m)
{
        if (!attrs[XFRMA_MARK]) {
                m->m = 0;
                m->v = 0;
        } else {
                memcpy(m, nla_data(attrs[XFRMA_MARK]), sizeof(struct xfrm_mark));
        }

        return m->v & m->m;
}

/*
 * Append @m to @skb as an XFRMA_MARK attribute, unless both value and mask
 * are zero. Returns 0 when nothing is emitted, otherwise the result of
 * nla_put().
 */
static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m)
{
        if (!(m->m | m->v))
                return 0;

        return nla_put(skb, XFRMA_MARK, sizeof(struct xfrm_mark), m);
}

/*
 * Combine @mark with the smark stored in @x->props: bits covered by the
 * smark mask come from the smark value, all other bits come from @mark.
 */
static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x)
{
        const struct xfrm_mark *smark = &x->props.smark;
        __u32 from_state = smark->v & smark->m;
        __u32 from_caller = mark & ~smark->m;

        return from_state | from_caller;
}

/*
 * Append @if_id to @skb as a u32 XFRMA_IF_ID attribute.
 *
 * An @if_id of zero is not emitted. Returns 0 in that case, otherwise
 * whatever nla_put_u32() returns.
 */
static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id)
{
        if (!if_id)
                return 0;

        return nla_put_u32(skb, XFRMA_IF_ID, if_id);
}

/*
 * Check that @skb's tunnel control block and @x's outer mode agree.
 *
 * Returns -EINVAL when the tunnel field matching @family (ip4 for AF_INET,
 * ip6 for AF_INET6) is set in the skb's tunnel control block but
 * @x->outer_mode lacks XFRM_MODE_FLAG_TUNNEL. Returns 0 otherwise,
 * including for any other @family.
 */
static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
                                    unsigned int family)
{
        bool via_tunnel;

        if (family == AF_INET)
                via_tunnel = !!XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
        else if (family == AF_INET6)
                via_tunnel = !!XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
        else
                via_tunnel = false;

        if (!via_tunnel)
                return 0;

        return (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) ? 0 : -EINVAL;
}

extern const int xfrm_msg_min[XFRM_NR_MSGTYPES];
extern const struct nla_policy xfrma_policy[XFRMA_MAX+1];

/*
 * Callback table for translating between 32-bit (compat) and 64-bit
 * message layouts; see the per-member comments below.
 */
struct xfrm_translator {
        /* Allocate frag_list and put compat translation there */
        int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src);

        /* Allocate nlmsg with 64-bit translation of received 32-bit message */
        struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh,
                        int maxtype, const struct nla_policy *policy,
                        struct netlink_ext_ack *extack);

        /* Translate 32-bit user_policy from sockptr */
        int (*xlate_user_policy_sockptr)(u8 **pdata32, int optlen);

        /* Module that provides these callbacks */
        struct module *owner;
};

#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT)
extern int xfrm_register_translator(struct xfrm_translator *xtr);
extern int xfrm_unregister_translator(struct xfrm_translator *xtr);
extern struct xfrm_translator *xfrm_get_translator(void);
extern void xfrm_put_translator(struct xfrm_translator *xtr);
#else
/*
 * Stub used when CONFIG_XFRM_USER_COMPAT is not enabled: there is never a
 * translator, so this always returns NULL.
 */
static inline struct xfrm_translator *xfrm_get_translator(void)
{
        return NULL;
}
/* Stub used when CONFIG_XFRM_USER_COMPAT is not enabled: does nothing. */
static inline void xfrm_put_translator(struct xfrm_translator *xtr)
{
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Report the DONTFRAG bit of an IPv6 UDP or RAW socket.
 *
 * Returns false for a NULL @sk, a socket whose family is not AF_INET6, or
 * any protocol other than IPPROTO_UDP / IPPROTO_RAW. Otherwise returns the
 * result of inet6_test_bit(DONTFRAG, sk).
 */
static inline bool xfrm6_local_dontfrag(const struct sock *sk)
{
        if (!sk)
                return false;
        if (sk->sk_family != AF_INET6)
                return false;

        switch (sk->sk_protocol) {
        case IPPROTO_UDP:
        case IPPROTO_RAW:
                return inet6_test_bit(DONTFRAG, sk);
        default:
                return false;
        }
}
#endif

#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))

extern struct metadata_dst __percpu *xfrm_bpf_md_dst;

int register_xfrm_interface_bpf(void);

#else

/*
 * Stub for configurations that do not satisfy the #if condition above:
 * nothing is registered and 0 is returned.
 */
static inline int register_xfrm_interface_bpf(void)
{
        return 0;
}

#endif

#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF)
int register_xfrm_state_bpf(void);
#else
/*
 * Stub used when CONFIG_DEBUG_INFO_BTF is not enabled: nothing is
 * registered and 0 is returned.
 */
static inline int register_xfrm_state_bpf(void)
{
        return 0;
}
#endif

int xfrm_nat_keepalive_init(unsigned short family);
void xfrm_nat_keepalive_fini(unsigned short family);
int xfrm_nat_keepalive_net_init(struct net *net);
int xfrm_nat_keepalive_net_fini(struct net *net);
void xfrm_nat_keepalive_state_updated(struct xfrm_state *x);

#endif        /* _NET_XFRM_H */

















































































































































































































































































































































  316 





  310 
  313 

















  313 










































































  311 
    1 

    1 
  309 


  309 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/capability.c
 *
 * Copyright (C) 1997  Andrew Main <zefram@fysh.org>
 *
 * Integrated into 2.1.97+,  Andrew G. Morgan <morgan@kernel.org>
 * 30 May 2002:        Cleanup, Robert M. Love <rml@tech9.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>

int file_caps_enabled = 1;

/*
 * Handler registered via __setup() for "no_file_caps": clears
 * file_caps_enabled. @str is unused. Always returns 1.
 */
static int __init file_caps_disable(char *str)
{
        file_caps_enabled = 0;
        return 1;
}
__setup("no_file_caps", file_caps_disable);

#ifdef CONFIG_MULTIUSER
/*
 * More recent versions of libcap are available from:
 *
 *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
 */

/*
 * Log a pr_info_once() notice naming the current task (current->comm) as a
 * user of the legacy 32-bit capability interface.
 */
static void warn_legacy_capability_use(void)
{
        pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
                     current->comm);
}

/*
 * Version 2 capabilities worked fine, but the linux/capability.h file
 * that accompanied their introduction encouraged their use without
 * the necessary user-space source code changes. As such, we have
 * created a version 3 with equivalent functionality to version 2, but
 * with a header change to protect legacy source code from using
 * version 2 when it wanted to use version 1. If your system has code
 * that trips the following warning, it is using version 2 specific
 * capabilities and may be doing so insecurely.
 *
 * The remedy is to either upgrade your version of libcap (to 2.10+,
 * if the application is linked against it), or recompile your
 * application with modern kernel headers and this warning will go
 * away.
 */

/*
 * Log a pr_info_once() notice naming the current task (current->comm) as a
 * user of the deprecated version 2 capability interface.
 */
static void warn_deprecated_v2(void)
{
        pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
                     current->comm);
}

/*
 * Version check. On success, store the number of u32s in each capability
 * flag array in *tocopy and return 0; return a negative value on error.
 */
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
        __u32 version;

        if (get_user(version, &header->version))
                return -EFAULT;

        if (version == _LINUX_CAPABILITY_VERSION_1) {
                warn_legacy_capability_use();
                *tocopy = _LINUX_CAPABILITY_U32S_1;
                return 0;
        }

        if (version == _LINUX_CAPABILITY_VERSION_2 ||
            version == _LINUX_CAPABILITY_VERSION_3) {
                /* v3 is otherwise equivalent to v2; only v2 draws a warning. */
                if (version == _LINUX_CAPABILITY_VERSION_2)
                        warn_deprecated_v2();
                *tocopy = _LINUX_CAPABILITY_U32S_3;
                return 0;
        }

        /*
         * Unrecognised version: write the kernel's own version back into
         * the user header before rejecting the request.
         */
        if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
                return -EFAULT;

        return -EINVAL;
}

/*
 * The only thing that can change the capabilities of the current
 * process is the current process. As such, we can't be in this code
 * at the same time as we are in the process of setting capabilities
 * in this process. The net result is that we can limit our use of
 * locks to when we are reading the caps of another process.
 */
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
{
        const struct task_struct *target;
        int ret;

        /* pid 0 or the caller's own pid: query current, no lookup needed. */
        if (!pid || pid == task_pid_vnr(current))
                return security_capget(current, pEp, pIp, pPp);

        /* Another task: look it up and read its caps under the RCU read lock. */
        rcu_read_lock();
        target = find_task_by_vpid(pid);
        ret = target ? security_capget(target, pEp, pIp, pPp) : -ESRCH;
        rcu_read_unlock();

        return ret;
}

/**
 * sys_capget - get the capabilities of a given process.
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @dataptr: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities that are returned
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
        struct __user_cap_data_struct kdata[2];
        kernel_cap_t pE, pI, pP;
        unsigned tocopy;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);

        /*
         * With a NULL dataptr nothing is copied out. An unsupported version
         * (-EINVAL, for which cap_validate_magic() has already written the
         * kernel's version into the header) is then reported as success;
         * any other result is passed through unchanged.
         */
        if (dataptr == NULL)
                return (ret == -EINVAL) ? 0 : ret;
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        if (pid < 0)
                return -EINVAL;

        ret = cap_get_target_pid(pid, &pE, &pI, &pP);
        if (ret)
                return ret;

        /*
         * The legacy user format exposes each 64-bit capability set as two
         * 32-bit fields: element 0 takes the low word, element 1 the high
         * word.
         */
        kdata[0].effective   = pE.val;
        kdata[0].permitted   = pP.val;
        kdata[0].inheritable = pI.val;
        kdata[1].effective   = pE.val >> 32;
        kdata[1].permitted   = pP.val >> 32;
        kdata[1].inheritable = pI.val >> 32;

        /*
         * When tocopy < _KERNEL_CAPABILITY_U32S the upper capabilities are
         * silently dropped here, so an older libcap implicitly drops the
         * upper capability bits when it performs a capget/modify/capset
         * sequence.
         *
         * This is considered fail-safe behavior; upgrading the application
         * to a newer libcap gives access to the newer capabilities.
         *
         * Returning an error (-ERANGE) instead would make legacy
         * applications fail unexpectedly: the capget/modify/capset sequence
         * would abort before any modification is attempted.
         */
        return copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])) ?
                -EFAULT : 0;
}

/*
 * Build a kernel_cap_t from two 32-bit halves: @low supplies bits 0-31 and
 * @high bits 32-63. The result is masked with CAP_VALID_MASK.
 */
static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
{
        u64 combined = ((u64)high << 32) | low;
        kernel_cap_t cap = { combined & CAP_VALID_MASK };

        return cap;
}

/**
 * sys_capset - set capabilities for a process or (*) a group of processes
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @data: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities
 *
 * Set capabilities for the current process only.  The ability to affect any
 * other process(es) has been deprecated and removed.
 *
 * The restrictions on setting capabilities are specified as:
 *
 * I: any raised capabilities must be a subset of the old permitted
 * P: any raised capabilities must be a subset of the old permitted
 * E: must be set to a subset of new permitted
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
        /*
         * Starts zeroed, so elements the caller does not supply contribute
         * no capability bits.
         */
        struct __user_cap_data_struct kdata[2] = { { 0, }, };
        kernel_cap_t inheritable, permitted, effective;
        unsigned tocopy, copybytes;
        struct cred *new;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (ret)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        /* Only the caller itself (pid 0 or its own pid) may be targeted. */
        if (pid && pid != task_pid_vnr(current))
                return -EPERM;

        /* Never copy more than the on-stack buffer can hold. */
        copybytes = tocopy * sizeof(struct __user_cap_data_struct);
        if (copybytes > sizeof(kdata))
                return -EFAULT;

        if (copy_from_user(&kdata, data, copybytes))
                return -EFAULT;

        /* Reassemble each 64-bit set from its low (0) and high (1) words. */
        effective   = mk_kernel_cap(kdata[0].effective,   kdata[1].effective);
        permitted   = mk_kernel_cap(kdata[0].permitted,   kdata[1].permitted);
        inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        ret = security_capset(new, current_cred(),
                              &effective, &inheritable, &permitted);
        if (ret < 0) {
                /* Rejected: discard the prepared credentials. */
                abort_creds(new);
                return ret;
        }

        audit_log_capset(new, current_cred());

        return commit_creds(new);
}

/**
 * has_ns_capability - Does a task have a capability in a specific user ns
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to the specified user namespace, false if not.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability(struct task_struct *t,
                       struct user_namespace *ns, int cap)
{
        bool granted;

        /* __task_cred() is used under the RCU read lock. */
        rcu_read_lock();
        granted = !security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
        rcu_read_unlock();

        return granted;
}

/**
 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
 * in a specific user ns.
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to the specified user namespace, false if not.
 * Do not write an audit message for the check.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability_noaudit(struct task_struct *t,
                               struct user_namespace *ns, int cap)
{
        bool granted;

        /* __task_cred() is used under the RCU read lock. */
        rcu_read_lock();
        granted = !security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
        rcu_read_unlock();

        return granted;
}

/**
 * has_capability_noaudit - Does a task have a capability (unaudited) in the
 * initial user ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to init_user_ns, false if not.  Don't write an
 * audit message for the check.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability_noaudit(struct task_struct *t, int cap)
{
        /* Same check as has_ns_capability_noaudit(), against init_user_ns. */
        struct user_namespace *init_ns = &init_user_ns;

        return has_ns_capability_noaudit(t, init_ns, cap);
}
EXPORT_SYMBOL(has_capability_noaudit);

/*
 * Shared implementation behind ns_capable(), ns_capable_noaudit() and
 * ns_capable_setid().
 *
 * @ns:   user namespace the capability is wanted in
 * @cap:  capability to test; an invalid value is logged and hits BUG()
 * @opts: CAP_OPT_* flags passed through to security_capable()
 *
 * Returns true and sets PF_SUPERPRIV on the current task when
 * security_capable() returns 0 for current's credentials; returns false
 * otherwise.
 */
static bool ns_capable_common(struct user_namespace *ns,
                              int cap,
                              unsigned int opts)
{
        int rc;

        if (unlikely(!cap_valid(cap))) {
                /*
                 * cap is a signed int, so print it with %d: an invalid
                 * negative value then shows up as itself rather than as a
                 * large unsigned number.
                 */
                pr_crit("capable() called with invalid cap=%d\n", cap);
                BUG();
        }

        rc = security_capable(current_cred(), ns, cap, opts);
        if (rc != 0)
                return false;

        current->flags |= PF_SUPERPRIV;
        return true;
}

/**
 * ns_capable - Determine if the current task has a superior capability in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable(struct user_namespace *ns, int cap)
{
        /* Plain check: no extra capability options. */
        const unsigned int opts = CAP_OPT_NONE;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable);

/**
 * ns_capable_noaudit - Determine if the current task has a superior capability
 * (unaudited) in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        /* Same check as ns_capable(), with CAP_OPT_NOAUDIT requested. */
        const unsigned int opts = CAP_OPT_NOAUDIT;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_noaudit);

/**
 * ns_capable_setid - Determine if the current task has a superior capability
 * in effect, while signalling that this check is being done from within a
 * setid or setgroups syscall.
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        /* Same check as ns_capable(), flagged with CAP_OPT_INSETID. */
        const unsigned int opts = CAP_OPT_INSETID;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_setid);

/**
 * capable - Determine if the current task has a superior capability in effect
 * @cap: The capability to be tested for
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool capable(int cap)
{
        /* Same check as ns_capable(), against init_user_ns. */
        struct user_namespace *init_ns = &init_user_ns;

        return ns_capable(init_ns, cap);
}
EXPORT_SYMBOL(capable);
#endif /* CONFIG_MULTIUSER */

/**
 * file_ns_capable - Determine if the file's opener had a capability in effect
 * @file:  The file we want to check
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Return true if task that opened the file had a capability in effect
 * when the file was opened.
 *
 * This does not set PF_SUPERPRIV because the caller may not
 * actually be privileged.
 */
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
                     int cap)
{
        /* An invalid capability number warns once and is treated as denied. */
        if (WARN_ON_ONCE(!cap_valid(cap)))
                return false;

        /* Check the credentials stored in the file (file->f_cred). */
        return security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0;
}
EXPORT_SYMBOL(file_ns_capable);

/**
 * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
 * @ns: The user namespace in question
 * @idmap: idmap of the mount @inode was found from
 * @inode: The inode in question
 *
 * Return true if the inode uid and gid are within the namespace.
 */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
                                 struct mnt_idmap *idmap,
                                 const struct inode *inode)
{
        return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
               vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}

/**
 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
 * @idmap: idmap of the mount @inode was found from
 * @inode: The inode in question
 * @cap: The capability in question
 *
 * Return true if the current task has the given capability targeted at
 * its own user namespace and that the given inode's uid and gid are
 * mapped into the current user namespace.
 */
bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
                              const struct inode *inode, int cap)
{
        struct user_namespace *ns = current_user_ns();

        return ns_capable(ns, cap) &&
               privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);

/**
 * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
 * @tsk: The task that may be ptraced
 * @ns: The user namespace to search for CAP_SYS_PTRACE in
 *
 * Return true if the task that is ptracing @tsk had CAP_SYS_PTRACE in the
 * specified user namespace, or if @tsk has no tracer credentials recorded.
 */
bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
        int ret = 0;  /* An absent tracer adds no restrictions */
        const struct cred *cred;

        rcu_read_lock();
        cred = rcu_dereference(tsk->ptracer_cred);
        if (cred)
                ret = security_capable(cred, ns, CAP_SYS_PTRACE,
                                       CAP_OPT_NOAUDIT);
        rcu_read_unlock();
        return (ret == 0);
}








































































  319 











  313 













  319 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NET_SCM_H
#define __LINUX_NET_SCM_H

#include <linux/limits.h>
#include <linux/net.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/security.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/sched/signal.h>
#include <net/compat.h>

/* Well, we should have at least one descriptor open
 * to accept passed FDs 8)
 */
#define SCM_MAX_FD        253

/*
 * Credentials attached to a message: a numeric pid (scm_set_cred() stores
 * pid_vnr() of the sender's struct pid here) plus kernel uid and gid.
 */
struct scm_creds {
        u32        pid;
        kuid_t        uid;
        kgid_t        gid;
};

#ifdef CONFIG_UNIX
struct unix_edge;
#endif

/*
 * Set of files passed along with a message; fp[] has room for at most
 * SCM_MAX_FD file pointers.
 *
 * NOTE(review): count/count_unix/max presumably track how many fp[] slots
 * are used, how many of those are unix sockets, and the allowed maximum;
 * confirm against the code that fills this structure.
 */
struct scm_fp_list {
        short                        count;
        short                        count_unix;
        short                        max;
#ifdef CONFIG_UNIX
        /* Only present when CONFIG_UNIX is enabled. */
        bool                        inflight;
        bool                        dead;
        struct list_head        vertices;
        struct unix_edge        *edges;
#endif
        struct user_struct        *user;
        struct file                *fp[SCM_MAX_FD];
};

/*
 * Per-message control state: sender credentials, passed files and, when
 * CONFIG_SECURITY_NETWORK is enabled, a security ID.
 */
struct scm_cookie {
        struct pid                *pid;                /* Skb credentials: pid reference taken by scm_set_cred() */
        struct scm_fp_list        *fp;                /* Passed files                */
        struct scm_creds        creds;                /* Skb credentials: numeric pid, uid, gid */
#ifdef CONFIG_SECURITY_NETWORK
        u32                        secid;                /* Passed security ID         */
#endif
};

void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm);
void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm);
void __scm_destroy(struct scm_cookie *scm);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl);

#ifdef CONFIG_SECURITY_NETWORK
/*
 * Ask security_socket_getpeersec_dgram() to fill scm->secid for @sock;
 * NULL is passed as its second argument.
 */
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{
        security_socket_getpeersec_dgram(sock, NULL, &scm->secid);
}
#else
/* Without CONFIG_SECURITY_NETWORK there is no secid field: nothing to do. */
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * Record sender credentials in @scm: store the result of get_pid(@pid) in
 * scm->pid, and fill scm->creds with pid_vnr(@pid), @uid and @gid.
 */
static __inline__ void scm_set_cred(struct scm_cookie *scm,
                                    struct pid *pid, kuid_t uid, kgid_t gid)
{
        struct scm_creds *creds = &scm->creds;

        scm->pid = get_pid(pid);

        creds->pid = pid_vnr(pid);
        creds->uid = uid;
        creds->gid = gid;
}

/* Hand scm->pid to put_pid() and then clear the pointer. */
static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
{
        struct pid *held = scm->pid;

        put_pid(held);
        scm->pid = NULL;
}

/*
 * Tear down @scm: always drop the pid via scm_destroy_cred(), then call
 * __scm_destroy() only if a file list is attached.
 */
static __inline__ void scm_destroy(struct scm_cookie *scm)
{
        scm_destroy_cred(scm);

        if (!scm->fp)
                return;

        __scm_destroy(scm);
}

/*
 * Initialise @scm for sending @msg on @sock.
 *
 * The cookie is zeroed and its uid/gid set to INVALID_UID/INVALID_GID.
 * With @forcecreds, the current task's tgid, uid and gid are recorded via
 * scm_set_cred(). unix_get_peersec_dgram() is then called for @sock.
 *
 * Returns 0 when msg->msg_controllen <= 0, otherwise the result of
 * __scm_send().
 */
static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
                               struct scm_cookie *scm, bool forcecreds)
{
        memset(scm, 0, sizeof(*scm));
        scm->creds.gid = INVALID_GID;
        scm->creds.uid = INVALID_UID;

        if (forcecreds)
                scm_set_cred(scm, task_tgid(current), current_uid(), current_gid());

        unix_get_peersec_dgram(sock, scm);

        return (msg->msg_controllen <= 0) ? 0 : __scm_send(sock, msg, scm);
}

void scm_recv(struct socket *sock, struct msghdr *msg,
              struct scm_cookie *scm, int flags);
void scm_recv_unix(struct socket *sock, struct msghdr *msg,
                   struct scm_cookie *scm, int flags);

/*
 * Pass @f, @ufd and @flags on to receive_fd().
 *
 * Returns -EFAULT without calling receive_fd() when @ufd is NULL, otherwise
 * whatever receive_fd() returns.
 */
static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
                                  unsigned int flags)
{
        return ufd ? receive_fd(f, ufd, flags) : -EFAULT;
}

#endif /* __LINUX_NET_SCM_H */





























   39 







   38 





   39 




   46 





   45 















   38 
   39 

   39 


   39 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
 */

#include <linux/preempt.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/kprobes.h>

#include "internal.h"

/* Context where printk messages are never suppressed */
static atomic_t force_con;

/* Atomically increment force_con; paired with printk_force_console_exit(). */
void printk_force_console_enter(void)
{
        atomic_inc(&force_con);
}

/* Atomically decrement force_con; paired with printk_force_console_enter(). */
void printk_force_console_exit(void)
{
        atomic_dec(&force_con);
}

/* Return true while the force_con counter is non-zero. */
bool is_printk_force_console(void)
{
        return atomic_read(&force_con) != 0;
}

static DEFINE_PER_CPU(int, printk_context);

/*
 * Increment this CPU's printk_context counter.
 *
 * Can be preempted by NMI.
 */
void __printk_safe_enter(void)
{
        this_cpu_inc(printk_context);
}

/*
 * Decrement this CPU's printk_context counter.
 *
 * Can be preempted by NMI.
 */
void __printk_safe_exit(void)
{
        this_cpu_dec(printk_context);
}

/*
 * Call cant_migrate() and then increment this CPU's printk_context via
 * __printk_safe_enter().
 * NOTE(review): cant_migrate() presumably asserts that CPU migration is
 * already disabled by the caller; confirm.
 */
void __printk_deferred_enter(void)
{
        cant_migrate();
        __printk_safe_enter();
}

/*
 * Call cant_migrate() and then decrement this CPU's printk_context via
 * __printk_safe_exit().
 * NOTE(review): cant_migrate() presumably asserts that CPU migration is
 * already disabled by the caller; confirm.
 */
void __printk_deferred_exit(void)
{
        cant_migrate();
        __printk_safe_exit();
}

bool is_printk_legacy_deferred(void)
{
        /*
         * The per-CPU variable @printk_context can be read safely in any
         * context. CPU migration is always disabled when set.
         *
         * A context holding the printk_cpu_sync must not spin waiting for
         * another CPU. For legacy printing, it could be the console_lock
         * or the port lock.
         */
        return (force_legacy_kthread() ||
                this_cpu_read(printk_context) ||
                in_nmi() ||
                is_printk_cpu_sync_owner());
}

/*
 * Exported printk entry point taking a va_list.
 *
 * With CONFIG_KGDB_KDB, the message is routed to vkdb_printf() when
 * kdb_trap_printk is set and kdb_printf_cpu is negative. In every other
 * case it is passed to vprintk_default(). Returns the value of whichever
 * function was called.
 */
asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
        /* Allow to pass printk() to kdb but avoid a recursion. */
        if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
                return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
#endif
        return vprintk_default(fmt, args);
}
EXPORT_SYMBOL(vprintk);





















































































































































































































































































  316 









  317 





  317 


































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * net/dst.h        Protocol independent destination cache definitions.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#ifndef _NET_DST_H
#define _NET_DST_H

#include <net/dst_ops.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <linux/rcuref.h>
#include <net/neighbour.h>
#include <asm/processor.h>
#include <linux/indirect_call_wrapper.h>

struct sk_buff;

/*
 * struct dst_entry - protocol independent destination cache entry.
 *
 * The member order is deliberate: read the cache-line notes on __rcuref
 * and rt_uncached below before moving or adding fields.
 */
struct dst_entry {
        /* One pointer, two views: plain (dev) and __rcu-annotated (dev_rcu). */
        union {
                struct net_device       *dev;
                struct net_device __rcu *dev_rcu;
        };
        struct  dst_ops                *ops;
        /* Metrics pointer with DST_METRICS_FLAGS carried in its low bits. */
        unsigned long                _metrics;
        /* 0 is treated as "not set" by dst_set_expires(). */
        unsigned long           expires;
#ifdef CONFIG_XFRM
        struct xfrm_state        *xfrm;
#else
        /* Occupies the xfrm slot when CONFIG_XFRM is off. */
        void                        *__pad1;
#endif
        int                        (*input)(struct sk_buff *);
        int                        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);

        /* Bitmask of the DST_* values defined right below. */
        unsigned short                flags;
#define DST_NOXFRM                0x0002
#define DST_NOPOLICY                0x0004
#define DST_NOCOUNT                0x0008
#define DST_FAKE_RTABLE                0x0010
#define DST_XFRM_TUNNEL                0x0020
#define DST_XFRM_QUEUE                0x0040
#define DST_METADATA                0x0080

        /* A non-zero value of dst->obsolete forces by-hand validation
         * of the route entry.  Positive values are set by the generic
         * dst layer to indicate that the entry has been forcefully
         * destroyed.
         *
         * Negative values are used by the implementation layer code to
         * force invocation of the dst_ops->check() method.
         */
        short                        obsolete;
#define DST_OBSOLETE_NONE        0
#define DST_OBSOLETE_DEAD        2
#define DST_OBSOLETE_FORCE_CHK        -1
#define DST_OBSOLETE_KILL        -2
        unsigned short                header_len;        /* more space at head required */
        unsigned short                trailer_len;        /* space to reserve at tail */

        /*
         * __rcuref wants to be on a different cache line from
         * input/output/ops or performance tanks badly
         */
#ifdef CONFIG_64BIT
        rcuref_t                __rcuref;        /* 64-bit offset 64 */
#endif
        /* __use and lastuse are updated together by dst_use_noref(). */
        int                        __use;
        unsigned long                lastuse;
        struct rcu_head                rcu_head;
        short                        error;
        short                        __pad;
        __u32                        tclassid;
#ifndef CONFIG_64BIT
        struct lwtunnel_state   *lwtstate;
        rcuref_t                __rcuref;        /* 32-bit offset 64 */
#endif
        netdevice_tracker        dev_tracker;

        /*
         * Used by rtable and rt6_info. Moves lwtstate into the next cache
         * line on 64bit so that lwtstate does not cause false sharing with
         * __rcuref under contention of __rcuref. This also puts the
         * frequently accessed members of rtable and rt6_info out of the
         * __rcuref cache line.
         */
        struct list_head        rt_uncached;
        struct uncached_list        *rt_uncached_list;
#ifdef CONFIG_64BIT
        struct lwtunnel_state   *lwtstate;
#endif
};

/*
 * struct dst_metrics - an RTAX_MAX-entry metrics array plus a refcount.
 *
 * The 4-byte alignment keeps the two low bits of any pointer to this
 * structure clear, which is where DST_METRICS_FLAGS are stored.
 */
struct dst_metrics {
        u32                metrics[RTAX_MAX];
        refcount_t        refcnt;
} __aligned(4);                /* Low pointer bits contain DST_METRICS_FLAGS */
extern const struct dst_metrics dst_default_metrics;

u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Flag bits kept in the low bits of dst_entry::_metrics, and helpers that
 * strip them again to recover the u32 metrics array pointer.
 */
#define DST_METRICS_READ_ONLY                0x1UL
#define DST_METRICS_REFCOUNTED                0x2UL
/* Union of the two flag bits above; masked off to obtain the pointer. */
#define DST_METRICS_FLAGS                0x3UL
/* @Y is a raw metrics word (unsigned long), not a dst_entry pointer. */
#define __DST_METRICS_PTR(Y)        \
        ((u32 *)((Y) & ~DST_METRICS_FLAGS))
/* @X is a dst_entry pointer; its _metrics word is decoded. */
#define DST_METRICS_PTR(X)        __DST_METRICS_PTR((X)->_metrics)

/* Tell whether the metrics word of @dst has DST_METRICS_READ_ONLY set. */
static inline bool dst_metrics_read_only(const struct dst_entry *dst)
{
        const unsigned long word = dst->_metrics;

        return (word & DST_METRICS_READ_ONLY) != 0;
}

void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Pass the metrics word of @dst to __dst_destroy_metrics_generic(),
 * except when it is flagged read-only, in which case nothing happens.
 */
static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
{
        unsigned long word = dst->_metrics;

        if (word & DST_METRICS_READ_ONLY)
                return;

        __dst_destroy_metrics_generic(dst, word);
}

/*
 * Return a metrics array of @dst that may be written to.
 *
 * A read-only metrics word is handed to ops->cow_metrics() and whatever
 * that returns is passed on; callers in this file check it for NULL.
 * A zero metrics word is a BUG.
 */
static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
{
        unsigned long word = dst->_metrics;

        BUG_ON(!word);

        if (!(word & DST_METRICS_READ_ONLY))
                return __DST_METRICS_PTR(word);

        return dst->ops->cow_metrics(dst, word);
}

/* This may only be invoked before the entry has reached global
 * visibility.
 */
static inline void dst_init_metrics(struct dst_entry *dst,
                                    const u32 *src_metrics,
                                    bool read_only)
{
        /* Pointer bits first, then the optional read-only tag bit. */
        unsigned long word = (unsigned long) src_metrics;

        if (read_only)
                word |= DST_METRICS_READ_ONLY;

        dst->_metrics = word;
}

/*
 * Copy all RTAX_MAX metrics of @src into the writable metrics of @dest.
 * Nothing is copied when no writable array could be obtained for @dest.
 */
static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
{
        u32 *to = dst_metrics_write_ptr(dest);
        u32 *from;

        if (!to)
                return;

        from = DST_METRICS_PTR(src);
        memcpy(to, from, RTAX_MAX * sizeof(u32));
}

/* Return the metrics array of @dst with the flag bits stripped off. */
static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
{
        u32 *metrics = DST_METRICS_PTR(dst);

        return metrics;
}

/*
 * Read metric number @metric of @dst without any sanity checking.
 * Metric numbers are 1-based, the backing array is 0-based.
 */
static inline u32
dst_metric_raw(const struct dst_entry *dst, const int metric)
{
        const u32 *table = DST_METRICS_PTR(dst);

        return table[metric - 1];
}

static inline u32
dst_metric(const struct dst_entry *dst, const int metric)
{
        WARN_ON_ONCE(metric == RTAX_HOPLIMIT ||
                     metric == RTAX_ADVMSS ||
                     metric == RTAX_MTU);
        return dst_metric_raw(dst, metric);
}

/*
 * Return the RTAX_ADVMSS metric of @dst, or the value supplied by
 * ops->default_advmss() when the stored metric is zero.
 */
static inline u32
dst_metric_advmss(const struct dst_entry *dst)
{
        u32 stored = dst_metric_raw(dst, RTAX_ADVMSS);

        return stored ? stored : dst->ops->default_advmss(dst);
}

/*
 * Store @val as metric number @metric (1-based) of @dst.  Silently does
 * nothing when no writable metrics array is available.
 */
static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
{
        u32 *table = dst_metrics_write_ptr(dst);

        if (!table)
                return;

        table[metric - 1] = val;
}

/* Kernel-internal feature bits that are unallocated in user space. */
#define DST_FEATURE_ECN_CA        (1U << 31)

/* All kernel-internal feature bits; at present only DST_FEATURE_ECN_CA. */
#define DST_FEATURE_MASK        (DST_FEATURE_ECN_CA)
/* DST_FEATURE_ECN_CA combined with RTAX_FEATURE_ECN. */
#define DST_FEATURE_ECN_MASK        (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)

/* Return the bits of @feature that are set in the RTAX_FEATURES metric. */
static inline u32
dst_feature(const struct dst_entry *dst, u32 feature)
{
        u32 enabled = dst_metric(dst, RTAX_FEATURES);

        return enabled & feature;
}

INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *));
INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *));
static inline u32 dst_mtu(const struct dst_entry *dst)
{
        unsigned int mtu;

        /* Ask the dst_ops; ip6_mtu/ipv4_mtu are the direct-call candidates. */
        mtu = INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst);

        return mtu;
}

/* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric)
{
        return msecs_to_jiffies(dst_metric(dst, metric));
}

static inline int
dst_metric_locked(const struct dst_entry *dst, int metric)
{
        return dst_metric(dst, RTAX_LOCK) & (1 << metric);
}

static inline void dst_hold(struct dst_entry *dst)
{
        /*
         * If your kernel compilation stops here, please check
         * the placement of __rcuref in struct dst_entry
         */
        BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);
        WARN_ON(!rcuref_get(&dst->__rcuref));
}

/*
 * Account one use of @dst at @time without taking a reference.  The
 * counters are only touched when @time differs from the recorded lastuse.
 */
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
        if (likely(time == READ_ONCE(dst->lastuse)))
                return;

        dst->__use++;
        WRITE_ONCE(dst->lastuse, time);
}

static inline struct dst_entry *dst_clone(struct dst_entry *dst)
{
        if (dst)
                dst_hold(dst);
        return dst;
}

void dst_release(struct dst_entry *dst);

void dst_release_immediate(struct dst_entry *dst);

/*
 * Release the dst encoded in @refdst, unless the SKB_DST_NOREF bit says
 * that no reference was taken for it.
 */
static inline void refdst_drop(unsigned long refdst)
{
        struct dst_entry *dst;

        if (refdst & SKB_DST_NOREF)
                return;

        dst = (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
        dst_release(dst);
}

/**
 * skb_dst_drop - drops skb dst
 * @skb: buffer
 *
 * Drops dst reference count if a reference was taken.
 */
static inline void skb_dst_drop(struct sk_buff *skb)
{
        unsigned long refdst = skb->_skb_refdst;

        /* Nothing attached, nothing to do. */
        if (!refdst)
                return;

        refdst_drop(refdst);
        skb->_skb_refdst = 0UL;
}

/*
 * Attach the raw dst word @refdst to @nskb.  slow_gro is raised when
 * @refdst is non-zero, and a reference is taken via dst_clone() unless
 * @refdst carries SKB_DST_NOREF.
 */
static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
{
        nskb->slow_gro |= !!refdst;
        nskb->_skb_refdst = refdst;

        if (refdst & SKB_DST_NOREF)
                return;

        dst_clone(skb_dst(nskb));
}

/* Give @nskb the same dst word as @oskb, via __skb_dst_copy(). */
static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
{
        unsigned long refdst = oskb->_skb_refdst;

        __skb_dst_copy(nskb, refdst);
}

/**
 * dst_hold_safe - Take a reference on a dst if possible
 * @dst: pointer to dst entry
 *
 * This helper returns false if it could not safely
 * take a reference on a dst.
 */
static inline bool dst_hold_safe(struct dst_entry *dst)
{
        return rcuref_get(&dst->__rcuref);
}

/**
 * skb_dst_force - makes sure skb dst is refcounted
 * @skb: buffer
 *
 * If dst is not yet refcounted and not destroyed, grab a ref on it.
 * Returns: true if dst is refcounted.
 */
static inline bool skb_dst_force(struct sk_buff *skb)
{
        if (skb_dst_is_noref(skb)) {
                struct dst_entry *held = skb_dst(skb);

                WARN_ON(!rcu_read_lock_held());

                /* No reference obtained: the skb ends up with no dst. */
                held = dst_hold_safe(held) ? held : NULL;

                skb->_skb_refdst = (unsigned long)held;
                skb->slow_gro |= !!held;
        }

        return skb->_skb_refdst != 0UL;
}


/**
 *        __skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *        so make some cleanups. (no accounting done)
 */
static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                   struct net *net)
{
        skb->dev = dev;

        /*
         * Clear hash so that we can recalculate the hash for the
         * encapsulated packet, unless we have already determine the hash
         * over the L4 4-tuple.
         */
        skb_clear_hash_if_not_l4(skb);
        skb_set_queue_mapping(skb, 0);
        skb_scrub_packet(skb, !net_eq(net, dev_net(dev)));
}

/**
 *        skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *        so make some cleanups, and perform accounting.
 *        Note: this accounting is not SMP safe.
 */
static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                 struct net *net)
{
        /* Account one packet of skb->len bytes on @dev ... */
        DEV_STATS_INC(dev, rx_packets);
        DEV_STATS_ADD(dev, rx_bytes, skb->len);

        /* ... then do the accounting-free cleanup. */
        __skb_tunnel_rx(skb, dev, net);
}

/*
 * Return the tclassid of the dst attached to @skb.  Yields 0 when no dst
 * is attached or when CONFIG_IP_ROUTE_CLASSID is not enabled.
 */
static inline u32 dst_tclassid(const struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
        const struct dst_entry *dst = skb_dst(skb);

        if (dst)
                return dst->tclassid;
#endif
        return 0;
}

int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
/* dst_discard_out() for @skb, using init_net and the skb's own socket. */
static inline int dst_discard(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        return dst_discard_out(&init_net, sk, skb);
}
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
                int initial_obsolete, unsigned short flags);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_obsolete,
              unsigned short flags);
void dst_dev_put(struct dst_entry *dst);

/* No-op: the body is intentionally empty and @dst is not touched. */
static inline void dst_confirm(struct dst_entry *dst)
{
}

/*
 * dst_neigh_lookup - look up the neighbour entry for @daddr through @dst
 * @dst: destination entry whose ops->neigh_lookup() method is used
 * @daddr: address handed to the method (the skb argument is NULL)
 *
 * Returns the neighbour produced by ops->neigh_lookup(), or NULL when the
 * dst_ops provides no neigh_lookup method or the method returned an
 * ERR_PTR value.
 */
static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
        struct neighbour *n;

        /* Same guard as dst_neigh_lookup_skb(): never call a missing method. */
        if (WARN_ON_ONCE(!dst->ops->neigh_lookup))
                return NULL;

        n = dst->ops->neigh_lookup(dst, NULL, daddr);

        return IS_ERR(n) ? NULL : n;
}

/*
 * Look up the neighbour entry for @skb through @dst (daddr is passed as
 * NULL).  Returns NULL, after a one-time warning, when the dst_ops has no
 * neigh_lookup method, and NULL when the method returns an ERR_PTR.
 */
static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst,
                                                     struct sk_buff *skb)
{
        struct neighbour *neigh;

        if (WARN_ON_ONCE(!dst->ops->neigh_lookup))
                return NULL;

        neigh = dst->ops->neigh_lookup(dst, skb, NULL);
        if (IS_ERR(neigh))
                return NULL;

        return neigh;
}

/* Call ops->confirm_neigh(@dst, @daddr) when the dst_ops provides it. */
static inline void dst_confirm_neigh(const struct dst_entry *dst,
                                     const void *daddr)
{
        if (!dst->ops->confirm_neigh)
                return;

        dst->ops->confirm_neigh(dst, daddr);
}

/*
 * Invoke the link_failure method of the dst attached to @skb.  Any
 * missing piece (dst, ops, or the method itself) makes this a no-op.
 */
static inline void dst_link_failure(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops || !dst->ops->link_failure)
                return;

        dst->ops->link_failure(skb);
}

/*
 * Set the expiry of @dst to jiffies + @timeout, but only if no expiry is
 * recorded yet or the new one is earlier than the recorded one.
 */
static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
        unsigned long expires = jiffies + timeout;
        unsigned long old;

        /* 0 is read back as "no expiry recorded", so never store it. */
        if (!expires)
                expires = 1;

        old = READ_ONCE(dst->expires);
        if (old && !time_before(expires, old))
                return;

        WRITE_ONCE(dst->expires, expires);
}

/*
 * Link-layer headroom to account for: LL_RESERVED_SPACE() of the dst's
 * device when @dst is given, otherwise the mac_len recorded in @skb.
 */
static inline unsigned int dst_dev_overhead(struct dst_entry *dst,
                                            struct sk_buff *skb)
{
        if (unlikely(!dst))
                return skb->mac_len;

        return LL_RESERVED_SPACE(dst->dev);
}

INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *,
                                         struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
                                         struct sk_buff *));
/* Output packet to network from transport.  */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output),
                                  ip6_output, ip_output,
                                  net, sk, skb);
}

INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
/* Input packet from network to transport.  */
static inline int dst_input(struct sk_buff *skb)
{
        return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input),
                                  ip6_input, ip_local_deliver, skb);
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
                                                          u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));
/*
 * Validate @dst against @cookie.  An entry whose obsolete field is zero
 * is returned as-is; otherwise the result of ops->check() is returned.
 */
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
        if (!READ_ONCE(dst->obsolete))
                return dst;

        return INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check,
                                  ipv4_dst_check, dst, cookie);
}

/* Flags for xfrm_lookup flags argument. */
enum {
        /* Each value is a distinct single bit, so they can be OR-ed together. */
        XFRM_LOOKUP_ICMP = 1 << 0,
        XFRM_LOOKUP_QUEUE = 1 << 1,
        XFRM_LOOKUP_KEEP_DST_REF = 1 << 2,
};

struct flowi;
#ifndef CONFIG_XFRM
/*
 * Stub for builds without CONFIG_XFRM: returns @dst_orig unchanged and
 * ignores every other argument.
 */
static inline struct dst_entry *xfrm_lookup(struct net *net,
                                            struct dst_entry *dst_orig,
                                            const struct flowi *fl,
                                            const struct sock *sk,
                                            int flags)
{
        return dst_orig;
}

/*
 * Stub for builds without CONFIG_XFRM: returns @dst_orig unchanged and
 * ignores every other argument, including @if_id.
 */
static inline struct dst_entry *
xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig,
                      const struct flowi *fl, const struct sock *sk,
                      int flags, u32 if_id)
{
        return dst_orig;
}

/*
 * Stub for builds without CONFIG_XFRM: returns @dst_orig unchanged and
 * ignores every other argument.
 */
static inline struct dst_entry *xfrm_lookup_route(struct net *net,
                                                  struct dst_entry *dst_orig,
                                                  const struct flowi *fl,
                                                  const struct sock *sk,
                                                  int flags)
{
        return dst_orig;
}

/* Stub for builds without CONFIG_XFRM: there is never an xfrm state. */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
        return NULL;
}

#else
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, const struct sock *sk,
                              int flags);

struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                                        struct dst_entry *dst_orig,
                                        const struct flowi *fl,
                                        const struct sock *sk, int flags,
                                        u32 if_id);

struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                                    const struct flowi *fl, const struct sock *sk,
                                    int flags);

/* skb attached with this dst needs transformation if dst->xfrm is valid */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
        struct xfrm_state *state = dst->xfrm;

        return state;
}
#endif

/*
 * Report a new path MTU @mtu to the dst attached to @skb, asking for the
 * neighbour to be confirmed as well (last argument true).  No-op when the
 * skb has no dst or the dst_ops has no update_pmtu method.
 */
static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops->update_pmtu)
                return;

        dst->ops->update_pmtu(dst, NULL, skb, mtu, true);
}

/*
 * Same as skb_dst_update_pmtu(), but without confirming the neighbour
 * (last argument false).
 */
static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops->update_pmtu)
                return;

        dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}

/* Read dst->dev exactly once and return it. */
static inline struct net_device *dst_dev(const struct dst_entry *dst)
{
        struct net_device *dev = READ_ONCE(dst->dev);

        return dev;
}

/* Fetch the device of @dst through rcu_dereference() on dev_rcu. */
static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst)
{
        struct net_device *dev = rcu_dereference(dst->dev_rcu);

        return dev;
}

/* Network namespace of the device of @dst, both obtained the RCU way. */
static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst)
{
        struct net_device *dev = dst_dev_rcu(dst);

        return dev_net_rcu(dev);
}

/* dst_dev() of the dst attached to @skb; the dst is not NULL-checked. */
static inline struct net_device *skb_dst_dev(const struct sk_buff *skb)
{
        const struct dst_entry *dst = skb_dst(skb);

        return dst_dev(dst);
}

/* dst_dev_rcu() of the dst attached to @skb; the dst is not NULL-checked. */
static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb)
{
        const struct dst_entry *dst = skb_dst(skb);

        return dst_dev_rcu(dst);
}

/* Network namespace of the device behind the dst attached to @skb. */
static inline struct net *skb_dst_dev_net(const struct sk_buff *skb)
{
        struct net_device *dev = skb_dst_dev(skb);

        return dev_net(dev);
}

/* RCU flavour of skb_dst_dev_net(): uses skb_dst_dev_rcu()/dev_net_rcu(). */
static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb)
{
        struct net_device *dev = skb_dst_dev_rcu(skb);

        return dev_net_rcu(dev);
}

struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu, bool confirm_neigh);
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                            struct sk_buff *skb);
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old);
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
                                             struct sk_buff *skb,
                                             const void *daddr);
unsigned int dst_blackhole_mtu(const struct dst_entry *dst);

#endif /* _NET_DST_H */














































    4 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM maple_tree

#if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MM_H


#include <linux/tracepoint.h>

struct ma_state;

/*
 * ma_op - trace event taking a caller-supplied name and a maple state.
 * @fn:  name string; only the pointer is stored in the event record, so
 *       it presumably has to stay valid until the trace is read (e.g. a
 *       string literal) - confirm against callers.
 * @mas: maple state whose min, max, index, last and node are recorded.
 */
TRACE_EVENT(ma_op,

        TP_PROTO(const char *fn, struct ma_state *mas),

        TP_ARGS(fn, mas),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->node                = mas->node;
        ),

        TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last
        )
)
/*
 * ma_read - trace event with the same arguments, recorded fields and
 * output format as ma_op.
 * @fn:  name string; only the pointer is stored in the event record.
 * @mas: maple state whose min, max, index, last and node are recorded.
 */
TRACE_EVENT(ma_read,

        TP_PROTO(const char *fn, struct ma_state *mas),

        TP_ARGS(fn, mas),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->node                = mas->node;
        ),

        TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last
        )
)

/*
 * ma_write - trace event for a write; records the maple state like
 * ma_op/ma_read plus two extra caller-supplied values.
 * @fn:  name string; only the pointer is stored in the event record.
 * @mas: maple state whose min, max, index, last and node are recorded.
 * @piv: recorded and printed as "piv (...)".
 * @val: pointer recorded and printed as "val ...".
 *
 * Note that the output format differs slightly from ma_op/ma_read
 * ("Node" without a colon, "range:" without a following space).
 */
TRACE_EVENT(ma_write,

        TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv,
                 void *val),

        TP_ARGS(fn, mas, piv, val),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(unsigned long, piv)
                        __field(void *, val)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->piv                = piv;
                        __entry->val                = val;
                        __entry->node                = mas->node;
        ),

        TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last,
                  (unsigned long) __entry->piv,
                  (void *) __entry->val
        )
)
#endif /* _TRACE_MM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
/* SPDX-License-Identifier: GPL-2.0
 *
 *        Network memory
 *
 *        Author:        Mina Almasry <almasrymina@google.com>
 */

#ifndef _NET_NETMEM_H
#define _NET_NETMEM_H

#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <net/net_debug.h>

/* These fields in struct page are used by the page_pool and net stack:
 *
 *        struct {
 *                unsigned long pp_magic;
 *                struct page_pool *pp;
 *                unsigned long _pp_mapping_pad;
 *                unsigned long dma_addr;
 *                atomic_long_t pp_ref_count;
 *        };
 *
 * We mirror the page_pool fields here so the page_pool can access these
 * fields without worrying whether the underlying fields belong to a
 * page or netmem_desc.
 *
 * CAUTION: Do not update the fields in netmem_desc without also
 * updating the anonymous aliasing union in struct net_iov.
 */
struct netmem_desc {
        /* Sits at the offset of page->flags (asserted right after this struct). */
        unsigned long _flags;
        unsigned long pp_magic;
        struct page_pool *pp;
        unsigned long _pp_mapping_pad;
        unsigned long dma_addr;
        atomic_long_t pp_ref_count;
};

/*
 * Compile-time check that field @desc of struct netmem_desc lives at the
 * same offset as field @pg of struct page.  The helper macro is dropped
 * again once all fields have been checked.
 */
#define NETMEM_DESC_ASSERT_OFFSET(pg, desc)        \
        static_assert(offsetof(struct page, pg) == \
                      offsetof(struct netmem_desc, desc))
NETMEM_DESC_ASSERT_OFFSET(flags, _flags);
NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic);
NETMEM_DESC_ASSERT_OFFSET(pp, pp);
NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad);
NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr);
NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
#undef NETMEM_DESC_ASSERT_OFFSET

/*
 * Since struct netmem_desc uses the space in struct page, the size
 * should be checked, until struct netmem_desc has its own instance from
 * slab, to avoid conflicting with other members within struct page.
 */
static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount));

/* net_iov */

DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);

/*  We overload the LSB of the struct page pointer to indicate whether it's
 *  a page or net_iov.
 */
#define NET_IOV 0x01UL

/* Kinds of memory a net_iov can describe; stored in net_iov::type. */
enum net_iov_type {
        NET_IOV_DMABUF,
        NET_IOV_IOURING,

        /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass.
         */
        NET_IOV_MAX = ULONG_MAX
};

/* A memory descriptor representing abstract networking I/O vectors,
 * generally for non-pages memory that doesn't have its corresponding
 * struct page and needs to be explicitly allocated through slab.
 *
 * net_iovs are allocated and used by networking code, and the size of
 * the chunk is PAGE_SIZE.
 *
 * This memory can be any form of non-struct paged memory.  Examples
 * include imported dmabuf memory and imported io_uring memory.  See
 * net_iov_type for all the supported types.
 *
 * @pp_magic:        pp field, similar to the one in struct page/struct
 *                netmem_desc.
 * @pp:                the pp this net_iov belongs to, if any.
 * @dma_addr:        the DMA address of the net_iov. Needed for the network
 *                card to send/receive this net_iov.
 * @pp_ref_count: the pp ref count of this net_iov, exactly the same
 *                usage as struct page/struct netmem_desc.
 * @owner:        the net_iov_area this net_iov belongs to, if any.
 * @type:        the type of the memory.  Different types of net_iovs are
 *                supported.
 */
struct net_iov {
        /*
         * The same six words, reachable either as niov->desc.<field> or
         * directly as niov-><field> through the anonymous struct.
         */
        union {
                struct netmem_desc desc;

                /* XXX: The following part should be removed once all
                 * the references to them are converted so as to be
                 * accessed via netmem_desc e.g. niov->desc.pp instead
                 * of niov->pp.
                 */
                struct {
                        unsigned long _flags;
                        unsigned long pp_magic;
                        struct page_pool *pp;
                        unsigned long _pp_mapping_pad;
                        unsigned long dma_addr;
                        atomic_long_t pp_ref_count;
                };
        };
        /* Area this net_iov belongs to; net_iov_idx() dereferences it. */
        struct net_iov_area *owner;
        enum net_iov_type type;
};

/*
 * struct net_iov_area - a group of net_iovs referenced through
 * net_iov::owner.  num_niovs is presumably the number of entries in
 * niovs - confirm against the code that allocates the area.
 */
struct net_iov_area {
        /* Array of net_iovs for this area. */
        struct net_iov *niovs;
        size_t num_niovs;

        /* Offset into the dma-buf where this chunk starts.  */
        unsigned long base_virtual;
};

/* net_iov is union'ed with struct netmem_desc mirroring struct page, so
 * the page_pool can access these fields without worrying whether the
 * underlying fields are accessed via netmem_desc or directly via
 * net_iov, until all the references to them are converted so as to be
 * accessed via netmem_desc e.g. niov->desc.pp instead of niov->pp.
 *
 * The non-net stack fields of struct page are private to the mm stack
 * and must never be mirrored to net_iov.
 */
/*
 * @desc: field name in struct netmem_desc
 * @iov:  field name in struct net_iov that must sit at the same offset
 */
#define NET_IOV_ASSERT_OFFSET(desc, iov)                    \
        static_assert(offsetof(struct netmem_desc, desc) == \
                      offsetof(struct net_iov, iov))
NET_IOV_ASSERT_OFFSET(_flags, _flags);
NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic);
NET_IOV_ASSERT_OFFSET(pp, pp);
NET_IOV_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad);
NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
#undef NET_IOV_ASSERT_OFFSET

/* Return the area that @niov belongs to (its owner field). */
static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov)
{
        struct net_iov_area *area = niov->owner;

        return area;
}

static inline unsigned int net_iov_idx(const struct net_iov *niov)
{
        return niov - net_iov_owner(niov)->niovs;
}

/* netmem */

/**
 * typedef netmem_ref - a nonexistent type marking a reference to generic
 * network memory.
 *
 * A netmem_ref can be a struct page* or a struct net_iov* underneath.
 *
 * Use the supplied helpers to obtain the underlying memory pointer and fields.
 */
typedef unsigned long __bitwise netmem_ref;

static inline bool netmem_is_net_iov(const netmem_ref netmem)
{
        return (__force unsigned long)netmem & NET_IOV;
}

/**
 * __netmem_to_page - unsafely get pointer to the &page backing @netmem
 * @netmem: netmem reference to convert
 *
 * Unsafe version of netmem_to_page(). When @netmem is always page-backed,
 * e.g. when it's a header buffer, performs faster and generates smaller
 * object code (no check for the LSB, no WARN). When @netmem points to IOV,
 * provokes undefined behaviour.
 *
 * Return: pointer to the &page (garbage if @netmem is not page-backed).
 */
static inline struct page *__netmem_to_page(netmem_ref netmem)
{
        /* Plain reinterpretation of the reference; no NET_IOV check here. */
        struct page *page = (__force struct page *)netmem;

        return page;
}

/* Checked conversion to struct page: warns once and returns NULL when
 * @netmem is a net_iov, otherwise returns the page pointer.
 */
static inline struct page *netmem_to_page(netmem_ref netmem)
{
        struct page *page = NULL;

        if (!WARN_ON_ONCE(netmem_is_net_iov(netmem)))
                page = __netmem_to_page(netmem);

        return page;
}

/* Checked conversion to struct net_iov: returns NULL (after a debug-only
 * warning) when @netmem does not carry the NET_IOV bit.
 */
static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem)
{
        if (!netmem_is_net_iov(netmem)) {
                DEBUG_NET_WARN_ON_ONCE(true);
                return NULL;
        }

        /* Strip the NET_IOV bit to recover the pointer value. */
        return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV);
}

static inline netmem_ref net_iov_to_netmem(struct net_iov *niov)
{
        return (__force netmem_ref)((unsigned long)niov | NET_IOV);
}

/* Cast a struct page pointer to a netmem_ref. _Generic keeps the result
 * const-qualified when @p points to a const page; there is no default
 * association, so any other argument type fails to compile.
 */
#define page_to_netmem(p)        (_Generic((p),                        \
        const struct page * :        (__force const netmem_ref)(p),        \
        struct page * :                (__force netmem_ref)(p)))

/**
 * virt_to_netmem - convert virtual memory pointer to a netmem reference
 * @data: host memory pointer to convert
 *
 * Return: netmem reference to the &page backing this virtual address.
 */
static inline netmem_ref virt_to_netmem(const void *data)
{
        /* Resolve the address to its page, then wrap the page. */
        struct page *page = virt_to_page(data);

        return page_to_netmem(page);
}

/* Return the non-page-pool reference count of @netmem. */
static inline int netmem_ref_count(netmem_ref netmem)
{
        if (!netmem_is_net_iov(netmem))
                return page_ref_count(netmem_to_page(netmem));

        /* The non-pp refcount of net_iov is always 1. On net_iov, we only
         * support pp refcounting which uses the pp_ref_count field.
         */
        return 1;
}

/* Return the page frame number of a page-backed @netmem, or 0 for a net_iov. */
static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
{
        return netmem_is_net_iov(netmem) ?
                0 : page_to_pfn(netmem_to_page(netmem));
}

/**
 * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing
 * @netmem
 * @netmem: netmem reference to convert
 *
 * Unsafe version that can be used only when @netmem is always backed by
 * system memory, performs faster and generates smaller object code (no
 * check for the LSB, no WARN). When @netmem points to IOV, provokes
 * undefined behaviour.
 *
 * Return: pointer to the &netmem_desc (garbage if @netmem is not backed
 * by system memory).
 */
static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem)
{
        /* Plain reinterpretation of the reference; no NET_IOV check here. */
        struct netmem_desc *desc = (__force struct netmem_desc *)netmem;

        return desc;
}

/* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to
 * common fields.
 * @netmem: netmem reference to extract as net_iov.
 *
 * All the sub types of netmem_ref (page, net_iov) have the same pp, pp_magic,
 * dma_addr, and pp_ref_count fields at the same offsets. Thus, we can access
 * these fields without a type check to make sure that the underlying mem is
 * net_iov or page.
 *
 * The resulting value of this function can only be used to access the fields
 * that are NET_IOV_ASSERT_OFFSET'd. Accessing any other fields will result in
 * undefined behavior.
 *
 * Return: the netmem_ref cast to net_iov* regardless of its underlying type.
 */
static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
{
        unsigned long addr = (__force unsigned long)netmem;

        /* Drop the NET_IOV bit whether or not it was set. */
        return (struct net_iov *)(addr & ~NET_IOV);
}

/* View a struct page pointer as a struct netmem_desc pointer by a direct
 * cast, preserving const-ness of @p via _Generic.
 *
 * XXX: How to extract netmem_desc from page must be changed, once
 * netmem_desc no longer overlays on page and will be allocated through
 * slab.
 */
#define __pp_page_to_nmdesc(p)        (_Generic((p),                                \
        const struct page * :        (const struct netmem_desc *)(p),        \
        struct page * :                (struct netmem_desc *)(p)))

/* Same conversion as __pp_page_to_nmdesc(), preceded by a debug-only
 * warning when page_pool_page_is_pp(@p) is false. Note that @p is
 * expanded twice.
 *
 * CAUTION: Check if the page is a pp page before calling this helper or
 * know it's a pp page.
 */
#define pp_page_to_nmdesc(p)                                                \
({                                                                        \
        DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p));                \
        __pp_page_to_nmdesc(p);                                                \
})

/**
 * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem
 * @netmem: netmem reference to get the pointer from
 *
 * Unsafe version of netmem_get_pp(). When @netmem is always page-backed,
 * e.g. when it's a header buffer, performs faster and generates smaller
 * object code (avoids clearing the LSB). When @netmem points to IOV,
 * provokes invalid memory access.
 *
 * Return: pointer to the &page_pool (garbage if @netmem is not page-backed).
 */
static inline struct page_pool *__netmem_get_pp(netmem_ref netmem)
{
        struct netmem_desc *desc = __netmem_to_nmdesc(netmem);

        return desc->pp;
}

/* Return the pp field of @netmem, whichever type backs it. */
static inline struct page_pool *netmem_get_pp(netmem_ref netmem)
{
        struct net_iov *niov = __netmem_clear_lsb(netmem);

        return niov->pp;
}

/* Return the address of @netmem's pp_ref_count field. */
static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem)
{
        struct net_iov *niov = __netmem_clear_lsb(netmem);

        return &niov->pp_ref_count;
}

/* Return true when @netmem is acceptable for NUMA node @pref_nid. */
static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid)
{
        /* NUMA node preference only makes sense if we're allocating
         * system memory. Memory providers (which give us net_iovs)
         * choose for us, so a net_iov always counts as preferred.
         */
        return netmem_is_net_iov(netmem) ||
               page_to_nid(netmem_to_page(netmem)) == pref_nid;
}

/* Return the netmem_ref of the compound head page; a net_iov is returned
 * unchanged.
 */
static inline netmem_ref netmem_compound_head(netmem_ref netmem)
{
        struct page *head;

        /* niov are never compounded */
        if (netmem_is_net_iov(netmem))
                return netmem;

        head = compound_head(netmem_to_page(netmem));
        return page_to_netmem(head);
}

/**
 * __netmem_address - unsafely get pointer to the memory backing @netmem
 * @netmem: netmem reference to get the pointer for
 *
 * Unsafe version of netmem_address(). When @netmem is always page-backed,
 * e.g. when it's a header buffer, performs faster and generates smaller
 * object code (no check for the LSB). When @netmem points to IOV, provokes
 * undefined behaviour.
 *
 * Return: pointer to the memory (garbage if @netmem is not page-backed).
 */
static inline void *__netmem_address(netmem_ref netmem)
{
        struct page *page = __netmem_to_page(netmem);

        return page_address(page);
}

/* Return the address of the memory behind a page-backed @netmem, or NULL
 * when @netmem is a net_iov.
 */
static inline void *netmem_address(netmem_ref netmem)
{
        return netmem_is_net_iov(netmem) ? NULL : __netmem_address(netmem);
}

/**
 * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure
 * @netmem: netmem reference to check
 *
 * Return: true if @netmem is page-backed and the page was allocated under
 * memory pressure, false otherwise.
 */
static inline bool netmem_is_pfmemalloc(netmem_ref netmem)
{
        /* A net_iov never reports pfmemalloc; pages defer to the mm helper. */
        return !netmem_is_net_iov(netmem) &&
               page_is_pfmemalloc(netmem_to_page(netmem));
}

/* Return the dma_addr field of @netmem, whichever type backs it. */
static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
{
        struct net_iov *niov = __netmem_clear_lsb(netmem);

        return niov->dma_addr;
}

/* Defined out of line. NOTE(review): presumably these take and drop a
 * reference on @netmem - confirm against the implementation.
 */
void get_netmem(netmem_ref netmem);
void put_netmem(netmem_ref netmem);

/* Record a DMA unmap address for @NETMEM through dma_unmap_addr_set():
 * page-backed netmem stores @VAL, net_iov-backed netmem stores 0 instead.
 * netmem_dma_unmap_page_attrs() returns early for an address of 0.
 */
#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL)   \
        do {                                                     \
                if (!netmem_is_net_iov(NETMEM))                  \
                        dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \
                else                                             \
                        dma_unmap_addr_set(PTR, ADDR_NAME, 0);   \
        } while (0)

/* Unmap @addr via dma_unmap_page_attrs() unless @addr is 0, in which case
 * nothing is done.
 */
static inline void netmem_dma_unmap_page_attrs(struct device *dev,
                                               dma_addr_t addr, size_t size,
                                               enum dma_data_direction dir,
                                               unsigned long attrs)
{
        if (addr)
                dma_unmap_page_attrs(dev, addr, size, dir, attrs);
}

#endif /* _NET_NETMEM_H */




















































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  NET  is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Ethernet handlers.
 *
 * Version:        @(#)eth.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *                Relocated to include/linux where it belongs by Alan Cox
 *                                                        <gw4pts@gw4pts.ampr.org>
 */
#ifndef _LINUX_ETHERDEVICE_H
#define _LINUX_ETHERDEVICE_H

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/random.h>
#include <linux/crc32.h>
#include <linux/unaligned.h>
#include <asm/bitsperlong.h>

#ifdef __KERNEL__
/* Forward declarations: only pointers to these types are used below. */
struct device;
struct fwnode_handle;

/* MAC address lookup helpers, all defined out of line. */
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
int platform_get_ethdev_address(struct device *dev, struct net_device *netdev);
unsigned char *arch_get_platform_mac_address(void);
int nvmem_get_mac_address(struct device *dev, void *addrbuf);
int device_get_mac_address(struct device *dev, char *addr);
int device_get_ethdev_address(struct device *dev, struct net_device *netdev);
int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr);

u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
extern const struct header_ops eth_header_ops;

/* Ethernet header and MAC address handling routines, defined out of line.
 * NOTE(review): presumably the eth_header_* functions are the callbacks
 * behind eth_header_ops - confirm against the implementation.
 */
int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
               const void *daddr, const void *saddr, unsigned len);
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh,
                     __be16 type);
void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev,
                             const unsigned char *haddr);
__be16 eth_header_parse_protocol(const struct sk_buff *skb);
int eth_prepare_mac_addr_change(struct net_device *dev, void *p);
void eth_commit_mac_addr_change(struct net_device *dev, void *p);
int eth_mac_addr(struct net_device *dev, void *p);
int eth_validate_addr(struct net_device *dev);

/* Allocation entry point; the two macros below forward to it with the
 * same count for TX and RX queues (alloc_etherdev() uses a count of 1).
 */
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
                                            unsigned int rxqs);
#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)

/* Variant taking a struct device; devm_alloc_etherdev() requests one TX
 * and one RX queue.
 */
struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
                                           unsigned int txqs,
                                           unsigned int rxqs);
#define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)

/* GRO hooks for Ethernet, defined out of line. */
struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb);
int eth_gro_complete(struct sk_buff *skb, int nhoff);

/* Reserved Ethernet Addresses per IEEE 802.1Q.
 * Base value 01:80:c2:00:00:00; eth_stp_addr is an alias for the same array.
 * All three tables are 2-byte aligned so they can be read as 16-bit words.
 */
static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
#define eth_stp_addr eth_reserved_addr_base

/* Base value 01:00:5e:00:00:00, matched by ether_addr_is_ipv4_mcast(). */
static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };

/* Base value 33:33:00:00:00:00, matched by ether_addr_is_ipv6_mcast(). */
static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) =
{ 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 };

/**
 * is_link_local_ether_addr - Determine if given Ethernet address is link-local
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per
 * IEEE 802.1Q 8.6.3 Frame filtering.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_link_local_ether_addr(const u8 *addr)
{
        /* View the address as three 16-bit words; b points at the reserved
         * base address 01:80:c2:00:00:00.
         */
        __be16 *a = (__be16 *)addr;
        static const __be16 *b = (const __be16 *)eth_reserved_addr_base;
        /* 0xfff0 in big-endian order: the low four bits of the final
         * address byte are excluded from the comparison.
         */
        static const __be16 m = cpu_to_be16(0xfff0);

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* Bytes 0-3 compared with one 32-bit load, then the masked last word. */
        return (((*(const u32 *)addr) ^ (*(const u32 *)b)) |
                (__force int)((a[2] ^ b[2]) & m)) == 0;
#else
        /* Word-by-word comparison; only the last word is masked. */
        return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
#endif
}

/**
 * is_zero_ether_addr - Determine if given Ethernet address is all zeros.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return: true if the address is all zeroes.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_zero_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* Bytes 0-3 as one 32-bit load, bytes 4-5 as one 16-bit load. */
        const u32 head = *(const u32 *)addr;
        const u16 tail = *(const u16 *)(addr + 4);

        return (head | tail) == 0;
#else
        /* Three 16-bit loads; the address is zero when no bit survives the OR. */
        const u16 *word = (const u16 *)addr;

        return (word[0] | word[1] | word[2]) == 0;
#endif
}

/**
 * is_multicast_ether_addr - Determine if the Ethernet address is a multicast.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return: true if the address is a multicast address.
 * By definition the broadcast address is also a multicast address.
 */
static inline bool is_multicast_ether_addr(const u8 *addr)
{
        /* Load width depends on the configuration; only the lowest bit of
         * the first address byte is tested below.
         */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        u32 word = *(const u32 *)addr;
#else
        u16 word = *(const u16 *)addr;
#endif

#ifdef __BIG_ENDIAN
        /* The first byte is the most significant byte of the loaded word. */
        return (word >> ((sizeof(word) * 8) - 8)) & 0x01;
#else
        /* The first byte is the least significant byte of the loaded word. */
        return word & 0x01;
#endif
}

/* Multicast test that may read 8 bytes starting at @addr on 64-bit
 * configurations with efficient unaligned access; otherwise it falls back
 * to is_multicast_ether_addr().
 */
static inline bool is_multicast_ether_addr_64bits(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const u64 word = *(const u64 *)addr;

#ifdef __BIG_ENDIAN
        return (word >> 56) & 0x01;
#else
        return word & 0x01;
#endif
#else
        return is_multicast_ether_addr(addr);
#endif
}

/**
 * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802).
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return: true if the address is a local address.
 */
static inline bool is_local_ether_addr(const u8 *addr)
{
        /* Bit 0x02 of the first byte marks a locally assigned address. */
        return (addr[0] & 0x02) != 0;
}

/**
 * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return: true if the address is the broadcast address.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_broadcast_ether_addr(const u8 *addr)
{
        const u16 *word = (const u16 *)addr;

        /* All three 16-bit words must have every bit set. */
        return (word[0] & word[1] & word[2]) == 0xffff;
}

/**
 * is_unicast_ether_addr - Determine if the Ethernet address is unicast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return: true if the address is a unicast address.
 */
static inline bool is_unicast_ether_addr(const u8 *addr)
{
        bool mcast = is_multicast_ether_addr(addr);

        return !mcast;
}

/**
 * is_valid_ether_addr - Determine if the given Ethernet address is valid
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not
 * a multicast address, and is not FF:FF:FF:FF:FF:FF.
 *
 * Return: true if the address is valid.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_valid_ether_addr(const u8 *addr)
{
        /* The broadcast address FF:FF:FF:FF:FF:FF already counts as
         * multicast, so it needs no separate test.
         */
        return !(is_multicast_ether_addr(addr) || is_zero_ether_addr(addr));
}

/**
 * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol
 * @proto: Ethertype/length value to be tested
 *
 * Check that the value from the Ethertype/length field is a valid Ethertype.
 *
 * Return: true if the value is an 802.3 supported Ethertype.
 */
static inline bool eth_proto_is_802_3(__be16 proto)
{
        const u16 min = (__force u16)htons(ETH_P_802_3_MIN);

#ifndef __BIG_ENDIAN
        /* On a little-endian CPU, mask off the bits holding the LSB of
         * the wire value so they cannot affect the comparison.
         */
        proto &= htons(0xFF00);
#endif
        /* Compare both sides as plain u16 since the LSB can be ignored. */
        return (__force u16)proto >= min;
}

/**
 * eth_random_addr - Generate software assigned random Ethernet address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Generate a random Ethernet address (MAC) that is not multicast
 * and has the local assigned bit set.
 */
static inline void eth_random_addr(u8 *addr)
{
        get_random_bytes(addr, ETH_ALEN);

        /* Clear the multicast bit (0x01) and set the local assignment
         * bit (0x02, IEEE802) in the first byte.
         */
        addr[0] = (addr[0] & 0xfe) | 0x02;
}

/**
 * eth_broadcast_addr - Assign broadcast address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Assign the broadcast address to the given address array.
 */
static inline void eth_broadcast_addr(u8 *addr)
{
        /* Writes exactly ETH_ALEN bytes of 0xff starting at @addr. */
        memset(addr, 0xff, ETH_ALEN);
}

/**
 * eth_zero_addr - Assign zero address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Assign the zero address to the given address array.
 */
static inline void eth_zero_addr(u8 *addr)
{
        /* Writes exactly ETH_ALEN zero bytes starting at @addr. */
        memset(addr, 0x00, ETH_ALEN);
}

/**
 * eth_hw_addr_random - Generate software assigned random Ethernet and
 * set device flag
 * @dev: pointer to net_device structure
 *
 * Generate a random Ethernet address (MAC) to be used by a net device
 * and set addr_assign_type so the state can be read by sysfs and be
 * used by userspace.
 */
static inline void eth_hw_addr_random(struct net_device *dev)
{
        u8 mac[ETH_ALEN];

        /* Generate into a stack buffer, then install it on the device. */
        eth_random_addr(mac);
        __dev_addr_set(dev, mac, ETH_ALEN);

        /* Record that this address was randomly generated. */
        dev->addr_assign_type = NET_ADDR_RANDOM;
}

/**
 * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr
 * @ha: pointer to hardware address
 *
 * Calculate CRC from a hardware address as basis for filter hashes.
 */
static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha)
{
        const unsigned char *mac = ha->addr;

        /* CRC over the first ETH_ALEN bytes of the stored address. */
        return ether_crc(ETH_ALEN, mac);
}

/**
 * ether_addr_copy - Copy an Ethernet address
 * @dst: Pointer to a six-byte array Ethernet address destination
 * @src: Pointer to a six-byte array Ethernet address source
 *
 * Please note: dst & src must both be aligned to u16.
 */
static inline void ether_addr_copy(u8 *dst, const u8 *src)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        u32 *dst_head = (u32 *)dst;
        u16 *dst_tail = (u16 *)(dst + 4);

        /* Bytes 0-3 are copied first, then bytes 4-5. */
        *dst_head = *(const u32 *)src;
        *dst_tail = *(const u16 *)(src + 4);
#else
        u16 *out = (u16 *)dst;
        const u16 *in = (const u16 *)src;
        unsigned int i;

        /* Three 16-bit words make up the six address bytes. */
        for (i = 0; i < 3; i++)
                out[i] = in[i];
#endif
}

/**
 * eth_hw_addr_set - Assign Ethernet address to a net_device
 * @dev: pointer to net_device structure
 * @addr: address to assign
 *
 * Assign given address to the net_device, addr_assign_type is not changed.
 */
static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr)
{
        /* Forwards to __dev_addr_set() with the fixed Ethernet length. */
        __dev_addr_set(dev, addr, ETH_ALEN);
}

/**
 * eth_hw_addr_inherit - Copy dev_addr from another net_device
 * @dst: pointer to net_device to copy dev_addr to
 * @src: pointer to net_device to copy dev_addr from
 *
 * Copy the Ethernet address from one net_device to another along with
 * the address attributes (addr_assign_type).
 */
static inline void eth_hw_addr_inherit(struct net_device *dst,
                                       struct net_device *src)
{
        const unsigned char *src_mac = src->dev_addr;

        /* Carry over the assignment type first, then the address itself. */
        dst->addr_assign_type = src->addr_assign_type;
        eth_hw_addr_set(dst, src_mac);
}

/**
 * ether_addr_equal - Compare two Ethernet addresses
 * @addr1: Pointer to a six-byte array containing the Ethernet address
 * @addr2: Pointer to another six-byte array containing the Ethernet address
 *
 * Compare two Ethernet addresses, returns true if equal
 *
 * Please note: addr1 & addr2 must both be aligned to u16.
 */
static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* XOR bytes 0-3 and bytes 4-5 separately; any set bit is a mismatch. */
        const u32 head_diff = *(const u32 *)addr1 ^ *(const u32 *)addr2;
        const u16 tail_diff = *(const u16 *)(addr1 + 4) ^
                              *(const u16 *)(addr2 + 4);

        return !(head_diff | tail_diff);
#else
        const u16 *lhs = (const u16 *)addr1;
        const u16 *rhs = (const u16 *)addr2;

        /* Branch-free comparison of the three 16-bit words. */
        return !((lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]) | (lhs[2] ^ rhs[2]));
#endif
}

/**
 * ether_addr_equal_64bits - Compare two Ethernet addresses
 * @addr1: Pointer to an array of 8 bytes
 * @addr2: Pointer to another array of 8 bytes
 *
 * Compare two Ethernet addresses, returns true if equal, false otherwise.
 *
 * The function doesn't need any conditional branches and possibly uses
 * word memory accesses on CPU allowing cheap unaligned memory reads.
 * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 }
 *
 * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits.
 */

static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const u64 diff = *(const u64 *)addr1 ^ *(const u64 *)addr2;

        /* Shift the two padding bytes out so only the six address bytes
         * decide the result.
         */
#ifdef __BIG_ENDIAN
        return !(diff >> 16);
#else
        return !(diff << 16);
#endif
#else
        return ether_addr_equal(addr1, addr2);
#endif
}

/**
 * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses
 * @addr1: Pointer to a six-byte array containing the Ethernet address
 * @addr2: Pointer to another six-byte array containing the Ethernet address
 *
 * Compare two Ethernet addresses, returns true if equal
 *
 * Please note: Use only when any Ethernet address may not be u16 aligned.
 */
static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* Unaligned loads are cheap here, so the word-based compare is used. */
        return ether_addr_equal(addr1, addr2);
#else
        /* Byte-wise compare makes no alignment assumption. */
        return !memcmp(addr1, addr2, ETH_ALEN);
#endif
}

/**
 * ether_addr_equal_masked - Compare two Ethernet addresses with a mask
 * @addr1: Pointer to a six-byte array containing the 1st Ethernet address
 * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address
 * @mask: Pointer to a six-byte array containing the Ethernet address bitmask
 *
 * Compare two Ethernet addresses with a mask, returns true if for every bit
 * set in the bitmask the equivalent bits in the ethernet addresses are equal.
 * Using a mask with all bits set is a slower ether_addr_equal.
 */
static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2,
                                           const u8 *mask)
{
        int idx = 0;

        while (idx < ETH_ALEN) {
                /* Bits that differ and are selected by the mask. */
                u8 diff = (addr1[idx] ^ addr2[idx]) & mask[idx];

                if (diff)
                        return false;
                idx++;
        }

        return true;
}

/* Return true when the first 25 bits of @addr match
 * eth_ipv4_mcast_addr_base.
 */
static inline bool ether_addr_is_ipv4_mcast(const u8 *addr)
{
        u8 prefix_mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 };
        bool match;

        match = ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base,
                                        prefix_mask);
        return match;
}

/* Return true when the first two bytes of @addr match
 * eth_ipv6_mcast_addr_base.
 */
static inline bool ether_addr_is_ipv6_mcast(const u8 *addr)
{
        u8 prefix_mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 };
        bool match;

        match = ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base,
                                        prefix_mask);
        return match;
}

/* Return true when @addr passes either the IPv4 or the IPv6 multicast
 * prefix test; the IPv6 test only runs if the IPv4 one fails.
 */
static inline bool ether_addr_is_ip_mcast(const u8 *addr)
{
        bool is_mcast = ether_addr_is_ipv4_mcast(addr);

        if (!is_mcast)
                is_mcast = ether_addr_is_ipv6_mcast(addr);

        return is_mcast;
}

/**
 * ether_addr_to_u64 - Convert an Ethernet address into a u64 value.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return: a u64 value of the address
 */
static inline u64 ether_addr_to_u64(const u8 *addr)
{
        const u8 *end = addr + ETH_ALEN;
        u64 value = 0;

        /* addr[0] ends up in the most significant of the six bytes. */
        while (addr < end)
                value = (value << 8) | *addr++;

        return value;
}

/**
 * u64_to_ether_addr - Convert a u64 to an Ethernet address.
 * @u: u64 to convert to an Ethernet MAC address
 * @addr: Pointer to a six-byte array to contain the Ethernet address
 */
static inline void u64_to_ether_addr(u64 u, u8 *addr)
{
        int idx = ETH_ALEN;

        /* Fill from the last byte backwards, taking the low 8 bits each time. */
        while (idx-- > 0) {
                addr[idx] = u & 0xff;
                u >>= 8;
        }
}

/**
 * eth_addr_dec - Decrement the given MAC address
 *
 * @addr: Pointer to a six-byte array containing Ethernet address to decrement
 */
static inline void eth_addr_dec(u8 *addr)
{
        /* Convert to an integer, subtract one, write the bytes back. */
        u64_to_ether_addr(ether_addr_to_u64(addr) - 1, addr);
}

/**
 * eth_addr_inc() - Increment the given MAC address.
 * @addr: Pointer to a six-byte array containing Ethernet address to increment.
 */
static inline void eth_addr_inc(u8 *addr)
{
        /* Convert to an integer, add one, write the bytes back. */
        u64_to_ether_addr(ether_addr_to_u64(addr) + 1, addr);
}

/**
 * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address.
 *
 * @addr: Pointer to a six-byte array containing Ethernet address to adjust.
 * @offset: Offset to add; a negative value subtracts.
 */
static inline void eth_addr_add(u8 *addr, long offset)
{
        /* The sum is formed in u64 arithmetic and written back as six bytes. */
        u64_to_ether_addr(ether_addr_to_u64(addr) + offset, addr);
}

/**
 * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
 * @dev: Pointer to a device structure
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Compare passed address with all addresses of the device. Return true if the
 * address is one of the device addresses.
 *
 * Note that this function calls ether_addr_equal_64bits() so take care of
 * the right padding.
 */
static inline bool is_etherdev_addr(const struct net_device *dev,
                                    const u8 addr[6 + 2])
{
        struct netdev_hw_addr *entry;
        bool match = false;

        /* The address list is walked inside an RCU read-side section. */
        rcu_read_lock();
        for_each_dev_addr(dev, entry) {
                match = ether_addr_equal_64bits(addr, entry->addr);
                if (!match)
                        continue;
                /* First hit settles it. */
                break;
        }
        rcu_read_unlock();

        return match;
}
#endif        /* __KERNEL__ */

/**
 * compare_ether_header - Compare two Ethernet headers
 * @a: Pointer to Ethernet header
 * @b: Pointer to Ethernet header
 *
 * Compare two Ethernet headers, returns 0 if equal.
 * This assumes that the network header (i.e., IP header) is 4-byte
 * aligned OR the platform can handle unaligned access.  This is the
 * case for all packets coming into netif_receive_skb or similar
 * entry points.
 */

static inline unsigned long compare_ether_header(const void *a, const void *b)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        unsigned long fold;

        /*
         * We want to compare 14 bytes:
         *  [a0 ... a13] ^ [b0 ... b13]
         * Use two long XOR, ORed together, with an overlap of two bytes.
         *  [a0  a1  a2  a3  a4  a5  a6  a7 ] ^ [b0  b1  b2  b3  b4  b5  b6  b7 ] |
         *  [a6  a7  a8  a9  a10 a11 a12 a13] ^ [b6  b7  b8  b9  b10 b11 b12 b13]
         * This means the [a6 a7] ^ [b6 b7] part is done two times.
         */
        fold = *(unsigned long *)a ^ *(unsigned long *)b;
        /* "a + 6" is arithmetic on a void pointer (byte-sized steps). */
        fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6);
        return fold;
#else
        /* Bytes 0-1 are compared as one u16; bytes 2-13 as three u32 words
         * starting at offset 2. The result is non-zero on any difference.
         */
        u32 *a32 = (u32 *)((u8 *)a + 2);
        u32 *b32 = (u32 *)((u8 *)b + 2);

        return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) |
               (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
#endif
}

/**
 * eth_hw_addr_gen - Generate and assign Ethernet address to a port
 * @dev: pointer to port's net_device structure
 * @base_addr: base Ethernet address
 * @id: offset to add to the base address
 *
 * Generate a MAC address using a base address and an offset and assign it
 * to a net_device. Commonly used by switch drivers which need to compute
 * addresses for all their ports. addr_assign_type is not changed.
 */
static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr,
                                   unsigned int id)
{
        /* Base address as an integer, advanced by the port offset. */
        u64 value = ether_addr_to_u64(base_addr) + id;
        u8 mac[ETH_ALEN];

        u64_to_ether_addr(value, mac);
        eth_hw_addr_set(dev, mac);
}

/**
 * eth_skb_pkt_type - Assign packet type if destination address does not match
 * @skb: Assigned a packet type if address does not match @dev address
 * @dev: Network device used to compare packet address against
 *
 * If the destination MAC address of the packet does not match the network
 * device address, assign an appropriate packet type.
 */
static inline void eth_skb_pkt_type(struct sk_buff *skb,
                                    const struct net_device *dev)
{
        const struct ethhdr *hdr = eth_hdr(skb);
        const unsigned char *dest = hdr->h_dest;

        /* Destination equals the device address: pkt_type is left untouched. */
        if (!unlikely(!ether_addr_equal_64bits(dest, dev->dev_addr)))
                return;

        if (unlikely(is_multicast_ether_addr_64bits(dest))) {
                /* Broadcast is the multicast case matching dev->broadcast. */
                if (ether_addr_equal_64bits(dest, dev->broadcast))
                        skb->pkt_type = PACKET_BROADCAST;
                else
                        skb->pkt_type = PACKET_MULTICAST;
                return;
        }

        skb->pkt_type = PACKET_OTHERHOST;
}

/*
 * Pull ETH_HLEN bytes off the front of @skb and return a pointer to where
 * the data started before the pull, typed as an Ethernet header.
 */
static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb)
{
        void *mac = skb->data;

        skb_pull_inline(skb, ETH_HLEN);

        return mac;
}

/**
 * eth_skb_pad - Grow a short buffer to the minimum Ethernet frame length
 * @skb: Buffer to pad
 *
 * An Ethernet frame should be at least 60 bytes long. Shorter frames are
 * padded with zeros up to that limit (ETH_ZLEN) via skb_put_padto().
 *
 * Return: whatever skb_put_padto() returns.
 */
static inline int eth_skb_pad(struct sk_buff *skb)
{
        int ret;

        ret = skb_put_padto(skb, ETH_ZLEN);

        return ret;
}

#endif        /* _LINUX_ETHERDEVICE_H */
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 
    5 






    5 







    1 








    5 

    4 















    6 
    3 



    3 





    5 



    2 






    6 






    6 

    2 
    6 







    6 







    2 
    1 


    1 


    2 








    1 




    4 
    5 




    5 


    6 
    4 







   13 

    5 

    8 
    2 

    2 



   11 
    4 

    8 



    3 

    2 
    7 










    8 


    2 

    1 



    1 












    7 



   13 
    3 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C)2002 USAGI/WIDE Project
 *
 * Authors
 *
 *        Mitsuru KANDA @USAGI       : IPv6 Support
 *        Kazunori MIYAZAWA @USAGI   :
 *        Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *
 *        This file is derived from net/ipv4/esp.c
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <crypto/aead.h>
#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
#include <linux/scatterlist.h>
#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <net/ip6_checksum.h>
#include <net/ip6_route.h>
#include <net/icmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/udp.h>
#include <linux/icmpv6.h>
#include <net/tcp.h>
#include <net/espintcp.h>
#include <net/inet6_hashtables.h>
#include <linux/skbuff_ref.h>

#include <linux/highmem.h>

/*
 * ESP's private view of the skb control block; accessed through
 * ESP_SKB_CB(), which casts the start of skb->cb to this type.
 */
struct esp_skb_cb {
        /* generic xfrm control-block data, kept as the first member */
        struct xfrm_skb_cb xfrm;
        /*
         * Scratch buffer of the in-flight crypto request.  esp6_output_tail()
         * stores the esp_alloc_tmp() allocation here so the completion
         * handlers can find and free it.
         */
        void *tmp;
};

/*
 * Extra output state kept at the front of the scratch buffer when the
 * state has XFRM_STATE_ESN set.
 */
struct esp_output_extra {
        /*
         * The 4 bytes located just before the ESP header, saved by
         * esp_output_set_esn() and written back by esp_restore_header().
         */
        __be32 seqhi;
        /* offset of the ESP header from the transport header */
        u32 esphoff;
};

/* Interpret the start of an skb's control block (skb->cb) as struct esp_skb_cb. */
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))

/*
 * Allocate the per-request scratch buffer: room for an AEAD request plus
 * the IV and the scatterlist entries.
 *
 * Layout, front to back: the optional extra area of @seqihlen bytes (the
 * upper 32 bits of the sequence number live here, first for alignment
 * reasons), the IV, the aead_request with its transform-specific context,
 * and finally @nfrags scatterlist entries.  Padding is added so that each
 * part can be aligned by the esp_tmp_*() / esp_req_sg() accessors.
 *
 * Returns the kmalloc(GFP_ATOMIC) result, i.e. NULL on allocation failure.
 *
 * TODO: Use spare space in skb for this where possible.
 */
static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen)
{
        unsigned int size = seqihlen + crypto_aead_ivsize(aead);

        /* Only pad for IV alignment when something precedes the request. */
        if (size) {
                size += crypto_aead_alignmask(aead) &
                        ~(crypto_tfm_ctx_alignment() - 1);
                size = ALIGN(size, crypto_tfm_ctx_alignment());
        }

        size += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
        size = ALIGN(size, __alignof__(struct scatterlist));
        size += sizeof(struct scatterlist) * nfrags;

        return kmalloc(size, GFP_ATOMIC);
}

/* Return the suitably aligned struct esp_output_extra area at the front of @tmp. */
static inline void *esp_tmp_extra(void *tmp)
{
        struct esp_output_extra *extra;

        extra = PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));

        return extra;
}

/*
 * Locate the IV inside the scratch buffer @tmp.  It follows the first
 * @seqhilen bytes and, when the transform actually uses an IV, is aligned
 * to the transform's alignment mask + 1.
 */
static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
{
        u8 *pos = (u8 *)tmp + seqhilen;

        if (!crypto_aead_ivsize(aead))
                return pos;

        return PTR_ALIGN(pos, crypto_aead_alignmask(aead) + 1);
}

/*
 * Locate the aead_request that follows the IV in the scratch buffer and
 * bind it to @aead.  Note that this sets the request's transform on every
 * call, not only on first use.
 */
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
{
        u8 *iv_end = iv + crypto_aead_ivsize(aead);
        struct aead_request *req;

        req = (void *)PTR_ALIGN(iv_end, crypto_tfm_ctx_alignment());
        aead_request_set_tfm(req, aead);

        return req;
}

/*
 * Locate the scatterlist array in the scratch buffer: it starts at the
 * first scatterlist-aligned address after @req and its transform context.
 */
static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
                                             struct aead_request *req)
{
        unsigned long ctx_end;

        ctx_end = (unsigned long)(req + 1) + crypto_aead_reqsize(aead);

        return (void *)ALIGN(ctx_end, __alignof__(struct scatterlist));
}

/*
 * Drop the page references held through the source scatterlist of the
 * request stored in @tmp.  Only does anything when the request used
 * distinct source and destination lists.
 */
static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
{
        struct crypto_aead *aead = x->data;
        struct aead_request *req;
        struct scatterlist *sg;
        int extralen;

        /* The request's position depends on whether the ESN extra area exists. */
        extralen = (x->props.flags & XFRM_STATE_ESN) ?
                   sizeof(struct esp_output_extra) : 0;

        req = esp_tmp_req(aead, esp_tmp_iv(aead, tmp, extralen));

        if (req->src == req->dst)
                return;

        /* Unref skb_frag_pages in the src scatterlist.
         * Skip the first sg which comes from skb->data.
         */
        sg = sg_next(req->src);
        while (sg) {
                skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle);
                sg = sg_next(sg);
        }
}

#ifdef CONFIG_INET6_ESPINTCP
/*
 * Find the established TCP socket used for ESP-in-TCP encapsulation of @x.
 *
 * The encapsulation ports are sampled under x->lock.  On success the
 * socket is returned and the caller is expected to release it with
 * sock_put().  Returns ERR_PTR(-ENOENT) when the lookup finds nothing and
 * ERR_PTR(-EINVAL), after dropping the socket, when the socket found is
 * not an espintcp ULP socket.
 */
static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
{
        struct net *net = xs_net(x);
        struct xfrm_encap_tmpl *encap = x->encap;
        __be16 sport, dport;
        struct sock *sk;

        spin_lock_bh(&x->lock);
        sport = encap->encap_sport;
        dport = encap->encap_dport;
        spin_unlock_bh(&x->lock);

        sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport,
                                        &x->props.saddr.in6, ntohs(sport), 0, 0);
        if (!sk)
                return ERR_PTR(-ENOENT);

        if (tcp_is_ulp_esp(sk))
                return sk;

        sock_put(sk);

        return ERR_PTR(-EINVAL);
}

/*
 * Hand an encrypted packet to the espintcp socket belonging to @x.
 *
 * If no usable socket is found the skb is freed and the lookup error is
 * returned.  Otherwise the skb is queued when the socket is currently
 * owned by a user context, or pushed directly when it is not, and the
 * result of that call is returned.
 */
static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
{
        struct sock *sk;
        int err;

        rcu_read_lock();

        sk = esp6_find_tcp_sk(x);
        err = PTR_ERR_OR_ZERO(sk);
        if (!err) {
                bh_lock_sock(sk);
                err = sock_owned_by_user(sk) ? espintcp_queue_out(sk, skb) :
                                               espintcp_push_skb(sk, skb);
                bh_unlock_sock(sk);

                sock_put(sk);
        } else {
                kfree_skb(skb);
        }

        rcu_read_unlock();

        return err;
}

static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
                                   struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct xfrm_state *x = dst->xfrm;

        return esp_output_tcp_finish(x, skb);
}

/*
 * Defer the final TCP-encapsulated transmit of @skb by queueing it with
 * xfrm_trans_queue_net(), with bottom halves disabled around the call.
 *
 * Returns the queueing error if there was one, -EINPROGRESS otherwise.
 */
static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
{
        int err;

        local_bh_disable();
        err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
        local_bh_enable();

        if (err)
                return err;

        /* EINPROGRESS just happens to do the right thing.  It
         * actually means that the skb has been consumed and
         * isn't coming back.
         */
        return -EINPROGRESS;
}
#else
/*
 * Variant built when CONFIG_INET6_ESPINTCP is not defined: emits a warning
 * and reports -EOPNOTSUPP without touching @x or @skb.
 */
static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
{
        WARN_ON(1);
        return -EOPNOTSUPP;
}
#endif

/*
 * Fill in the UDP checksum of a UDP-encapsulated ESP packet.  Does nothing
 * unless the byte at the mac header position says IPPROTO_UDP.
 */
static void esp_output_encap_csum(struct sk_buff *skb)
{
        struct ipv6hdr *ip6h;
        struct udphdr *uh;
        unsigned int offset;
        __wsum csum;
        int len;

        /* UDP encap with IPv6 requires a valid checksum */
        if (*skb_mac_header(skb) != IPPROTO_UDP)
                return;

        uh = udp_hdr(skb);
        ip6h = ipv6_hdr(skb);
        len = ntohs(uh->len);
        offset = skb_transport_offset(skb);

        /* Checksum everything from the transport header to the end. */
        csum = skb_checksum(skb, offset, skb->len - offset, 0);
        uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
                                    len, IPPROTO_UDP, csum);

        /* A computed checksum of zero is sent as CSUM_MANGLED_0 instead. */
        if (uh->check == 0)
                uh->check = CSUM_MANGLED_0;
}

/*
 * Completion handler for an encryption request (also called from
 * esp_output_done_esn()).  @data is the skb, @err the crypto result.
 *
 * Releases the scratch buffer and source page references, fixes up the
 * UDP encapsulation checksum, then continues output: via xfrm_dev_resume()
 * for the XFRM_DEV_RESUME offload case, via the espintcp path for
 * successful TCP-encapsulated packets, or via xfrm_output_resume().
 */
static void esp_output_done(void *data, int err)
{
        struct sk_buff *skb = data;
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct xfrm_state *x;
        void *tmp;

        /* The state lives in the secpath for device resume, in the dst otherwise. */
        if (xo && (xo->flags & XFRM_DEV_RESUME)) {
                struct sec_path *sp = skb_sec_path(skb);

                x = sp->xvec[sp->len - 1];
        } else {
                x = skb_dst(skb)->xfrm;
        }

        tmp = ESP_SKB_CB(skb)->tmp;
        esp_ssg_unref(x, tmp, skb);
        kfree(tmp);

        esp_output_encap_csum(skb);

        if (!xo || !(xo->flags & XFRM_DEV_RESUME)) {
                if (err || !x->encap ||
                    x->encap->encap_type != TCP_ENCAP_ESPINTCP)
                        xfrm_output_resume(skb_to_full_sk(skb), skb, err);
                else
                        esp_output_tail_tcp(x, skb);
                return;
        }

        if (err) {
                XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
                kfree_skb(skb);
                return;
        }

        /* Expose the data back to the mac header before resuming. */
        skb_push(skb, skb->data - skb_mac_header(skb));
        secpath_reset(skb);
        xfrm_dev_resume(skb);
}

/*
 * Move the ESP header back into place after it was shifted by 4 bytes for
 * ESN: the value sitting in the spi slot becomes seq_no again and the
 * spi slot gets the 4 bytes saved at the front of the scratch buffer.
 * @offset is where the shifted header starts, relative to skb->data.
 */
static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
{
        struct ip_esp_hdr *shifted = (void *)(skb->data + offset);
        __be32 *saved = esp_tmp_extra(ESP_SKB_CB(skb)->tmp);

        shifted->seq_no = shifted->spi;
        shifted->spi = *saved;
}

/*
 * Output-side wrapper around esp_restore_header(): the shifted header is
 * found 4 bytes before the ESP header offset recorded in the scratch
 * buffer's extra area.
 */
static void esp_output_restore_header(struct sk_buff *skb)
{
        struct esp_output_extra *extra = esp_tmp_extra(ESP_SKB_CB(skb)->tmp);
        unsigned int offset;

        offset = skb_transport_offset(skb) + extra->esphoff - sizeof(__be32);
        esp_restore_header(skb, offset);
}

/*
 * Prepare the ESP header for encryption and return the header pointer to
 * use from here on.
 *
 * With ESN, the header is moved forward (towards the packet start) by 4
 * bytes to accommodate the high sequence bits; the 4 bytes that get
 * overwritten and the original header offset are stashed in @extra so the
 * header can be moved back after encryption.  In all cases the SPI of @x
 * is written into the (possibly shifted) header.
 */
static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
                                             struct xfrm_state *x,
                                             struct ip_esp_hdr *esph,
                                             struct esp_output_extra *extra)
{
        if (x->props.flags & XFRM_STATE_ESN) {
                struct xfrm_offload *xo = xfrm_offload(skb);
                __u32 seqhi;

                /* The high bits come from the offload data when present. */
                seqhi = xo ? xo->seq.hi : XFRM_SKB_CB(skb)->seq.output.hi;

                extra->esphoff = (unsigned char *)esph -
                                 skb_transport_header(skb);

                esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
                extra->seqhi = esph->spi;
                esph->seq_no = htonl(seqhi);
        }

        esph->spi = x->id.spi;

        return esph;
}

/*
 * Completion handler used when the state has ESN enabled: put the ESP
 * header back first, then run the common completion path.
 */
static void esp_output_done_esn(void *data, int err)
{
        esp_output_restore_header(data);
        esp_output_done(data, err);
}

/*
 * Write a UDP encapsulation header at esp->esph and mark the packet as UDP
 * at the mac header position.  The checksum field is left at zero here.
 *
 * Returns the position right after the UDP header, where the ESP header
 * goes, or ERR_PTR(-EMSGSIZE) if the resulting length does not fit in 16
 * bits.  @encap_type is not used.
 */
static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb,
                                               int encap_type,
                                               struct esp_info *esp,
                                               __be16 sport,
                                               __be16 dport)
{
        struct udphdr *uh = (struct udphdr *)esp->esph;
        unsigned int len;

        /* Length from the transport header to the end, trailer included. */
        len = skb->len + esp->tailen - skb_transport_offset(skb);
        if (len > U16_MAX)
                return ERR_PTR(-EMSGSIZE);

        uh->source = sport;
        uh->dest = dport;
        uh->len = htons(len);
        uh->check = 0;

        *skb_mac_header(skb) = IPPROTO_UDP;

        return (struct ip_esp_hdr *)&uh[1];
}

#ifdef CONFIG_INET6_ESPINTCP
/*
 * Write the 2-byte length prefix used for ESP-in-TCP at esp->esph, after
 * checking that an espintcp socket exists for @x.
 *
 * Returns the position right after the length field, where the ESP header
 * goes, ERR_PTR(-EMSGSIZE) if the length exceeds IP_MAX_MTU, or the error
 * from the socket lookup.
 */
static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
                                                struct sk_buff *skb,
                                                struct esp_info *esp)
{
        __be16 *lenp = (void *)esp->esph;
        unsigned int len;
        struct sock *sk;

        len = skb->len + esp->tailen - skb_transport_offset(skb);
        if (len > IP_MAX_MTU)
                return ERR_PTR(-EMSGSIZE);

        /* The socket is only probed here; the reference is dropped again. */
        rcu_read_lock();
        sk = esp6_find_tcp_sk(x);
        rcu_read_unlock();

        if (IS_ERR(sk))
                return ERR_CAST(sk);

        sock_put(sk);

        *lenp = htons(len);

        return (struct ip_esp_hdr *)(lenp + 1);
}
#else
/*
 * Variant built when CONFIG_INET6_ESPINTCP is not defined: TCP
 * encapsulation is unavailable, so always fail with ERR_PTR(-EOPNOTSUPP).
 */
static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
                                                struct sk_buff *skb,
                                                struct esp_info *esp)
{
        return ERR_PTR(-EOPNOTSUPP);
}
#endif

/*
 * Write the encapsulation header selected by x->encap and advance
 * esp->esph past it.  The caller must have checked that x->encap is set.
 *
 * The encap template is sampled under x->lock.  TCP_ENCAP_ESPINTCP uses
 * the TCP length prefix; every other type, including
 * UDP_ENCAP_ESPINUDP, is handled as UDP encapsulation.
 *
 * Returns 0 on success or the negative errno from the encap helper.
 */
static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb,
                            struct esp_info *esp)
{
        struct xfrm_encap_tmpl *encap = x->encap;
        struct ip_esp_hdr *esph;
        __be16 sport, dport;
        int encap_type;

        spin_lock_bh(&x->lock);
        sport = encap->encap_sport;
        dport = encap->encap_dport;
        encap_type = encap->encap_type;
        spin_unlock_bh(&x->lock);

        if (encap_type == TCP_ENCAP_ESPINTCP)
                esph = esp6_output_tcp_encap(x, skb, esp);
        else
                esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport);

        if (IS_ERR(esph))
                return PTR_ERR(esph);

        esp->esph = esph;

        return 0;
}

/*
 * esp6_output_head - write the encapsulation header and append the ESP trailer
 * @x: xfrm state; its encap template, page-frag cache (x->xfrag) and lock
 *     are used here
 * @skb: packet being transformed
 * @esp: per-packet ESP parameters; esp->esph and esp->inplace may be updated
 *
 * If @x has an encapsulation template, esp6_output_encap() runs first.
 * Then room for esp->tailen trailer bytes is found with one of three
 * strategies:
 *  - the linear tailroom of an uncloned skb,
 *  - a fresh page fragment from x->xfrag appended as a new skb frag
 *    (this sets esp->inplace to false),
 *  - skb_cow_data() as the fallback.
 * The trailer is filled in with esp_output_fill_trailer() in every case.
 *
 * Returns a negative errno from esp6_output_encap() or skb_cow_data(),
 * otherwise a non-negative count that esp6_output() stores in esp->nfrags.
 */
int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
        u8 *tail;
        int nfrags;
        int esph_offset;
        struct page *page;
        struct sk_buff *trailer;
        int tailen = esp->tailen;

        if (x->encap) {
                int err = esp6_output_encap(x, skb, esp);

                if (err < 0)
                        return err;
        }

        /*
         * The page-fragment paths allocate at most these cache-line aligned
         * sizes from a page frag; anything larger than a page goes straight
         * to the skb_cow_data() fallback.
         */
        if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
            ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
                goto cow;

        if (!skb_cloned(skb)) {
                if (tailen <= skb_tailroom(skb)) {
                        /* Trailer fits in the linear tailroom of this skb. */
                        nfrags = 1;
                        trailer = skb;
                        tail = skb_tail_pointer(trailer);

                        goto skip_cow;
                } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
                           && !skb_has_frag_list(skb)) {
                        int allocsize;
                        struct sock *sk = skb->sk;
                        struct page_frag *pfrag = &x->xfrag;

                        /* Source and destination buffers will differ. */
                        esp->inplace = false;

                        allocsize = ALIGN(tailen, L1_CACHE_BYTES);

                        /* x->xfrag is shared per state, so guard it with x->lock. */
                        spin_lock_bh(&x->lock);

                        if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
                                spin_unlock_bh(&x->lock);
                                goto cow;
                        }

                        page = pfrag->page;
                        get_page(page);

                        tail = page_address(page) + pfrag->offset;

                        esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);

                        nfrags = skb_shinfo(skb)->nr_frags;

                        /* Attach the trailer page as one more frag of the skb. */
                        __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
                                             tailen);
                        skb_shinfo(skb)->nr_frags = ++nfrags;

                        pfrag->offset = pfrag->offset + allocsize;

                        spin_unlock_bh(&x->lock);

                        /*
                         * Returned count is the new frag count plus one;
                         * presumably the extra entry is for the linear
                         * skb->data area -- TODO confirm.
                         */
                        nfrags++;

                        /* Account the added bytes by hand; no pskb_put() on this path. */
                        skb->len += tailen;
                        skb->data_len += tailen;
                        skb->truesize += tailen;
                        if (sk && sk_fullsock(sk))
                                refcount_add(tailen, &sk->sk_wmem_alloc);

                        goto out;
                }
        }

cow:
        /*
         * Remember esph as an offset and recompute it after skb_cow_data(),
         * presumably because that call may move the header data.
         */
        esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);

        nfrags = skb_cow_data(skb, tailen, &trailer);
        if (nfrags < 0)
                goto out;
        tail = skb_tail_pointer(trailer);
        esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);

skip_cow:
        esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
        pskb_put(skb, trailer, tailen);

out:
        return nfrags;
}
EXPORT_SYMBOL_GPL(esp6_output_head);

/*
 * esp6_output_tail - encrypt a packet prepared by esp6_output_head()
 * @x: xfrm state supplying the AEAD transform (x->data), flags and encap info
 * @skb: packet with the ESP header space and trailer already in place
 * @esp: per-packet ESP parameters (esph, nfrags, clen, seqno, inplace)
 *
 * Allocates the scratch buffer, builds the source scatterlist (and a
 * separate destination list when !esp->inplace), derives the IV from
 * esp->seqno and calls crypto_aead_encrypt().
 *
 * Returns 0 when encryption completed synchronously; -EINPROGRESS when the
 * request completes later through esp_output_done()/esp_output_done_esn(),
 * which then free the scratch buffer, or when the packet was handed to the
 * espintcp path; NET_XMIT_DROP when the cipher returned -ENOSPC; otherwise
 * the error that stopped processing.
 */
int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
        u8 *iv;
        int alen;
        void *tmp;
        int ivlen;
        int assoclen;
        int extralen;
        struct page *page;
        struct ip_esp_hdr *esph;
        struct aead_request *req;
        struct crypto_aead *aead;
        struct scatterlist *sg, *dsg;
        struct esp_output_extra *extra;
        int err = -ENOMEM;

        assoclen = sizeof(struct ip_esp_hdr);
        extralen = 0;

        /* ESN adds the extra save area and 4 more bytes of associated data. */
        if (x->props.flags & XFRM_STATE_ESN) {
                extralen += sizeof(*extra);
                assoclen += sizeof(__be32);
        }

        aead = x->data;
        alen = crypto_aead_authsize(aead);
        ivlen = crypto_aead_ivsize(aead);

        tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
        if (!tmp)
                goto error;

        /* Carve the individual areas out of the scratch buffer. */
        extra = esp_tmp_extra(tmp);
        iv = esp_tmp_iv(aead, tmp, extralen);
        req = esp_tmp_req(aead, iv);
        sg = esp_req_sg(aead, req);

        /* A separate destination list lives right after the source entries. */
        if (esp->inplace)
                dsg = sg;
        else
                dsg = &sg[esp->nfrags];

        esph = esp_output_set_esn(skb, x, esp->esph, extra);
        esp->esph = esph;

        sg_init_table(sg, esp->nfrags);
        err = skb_to_sgvec(skb, sg,
                           (unsigned char *)esph - skb->data,
                           assoclen + ivlen + esp->clen + alen);
        if (unlikely(err < 0))
                goto error_free;

        if (!esp->inplace) {
                int allocsize;
                struct page_frag *pfrag = &x->xfrag;

                allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);

                spin_lock_bh(&x->lock);
                if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
                        spin_unlock_bh(&x->lock);
                        /*
                         * NOTE(review): err still holds the non-negative
                         * result of the skb_to_sgvec() call above, so this
                         * path does not return a negative errno -- confirm
                         * callers treat any non-zero value as failure.
                         */
                        goto error_free;
                }

                skb_shinfo(skb)->nr_frags = 1;

                page = pfrag->page;
                get_page(page);
                /* replace page frags in skb with new page */
                __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
                pfrag->offset = pfrag->offset + allocsize;
                spin_unlock_bh(&x->lock);

                /* Build the destination list from the skb's new layout. */
                sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
                err = skb_to_sgvec(skb, dsg,
                                   (unsigned char *)esph - skb->data,
                                   assoclen + ivlen + esp->clen + alen);
                if (unlikely(err < 0))
                        goto error_free;
        }

        /* With ESN the completion handler must also move the header back. */
        if ((x->props.flags & XFRM_STATE_ESN))
                aead_request_set_callback(req, 0, esp_output_done_esn, skb);
        else
                aead_request_set_callback(req, 0, esp_output_done, skb);

        aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
        aead_request_set_ad(req, assoclen);

        /*
         * Zero the IV, then copy the trailing min(ivlen, 8) bytes of
         * esp->seqno into the end of the IV.
         */
        memset(iv, 0, ivlen);
        memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
               min(ivlen, 8));

        /* Make the scratch buffer reachable from the completion handlers. */
        ESP_SKB_CB(skb)->tmp = tmp;
        err = crypto_aead_encrypt(req);

        switch (err) {
        case -EINPROGRESS:
                /* Skips kfree(tmp): the completion handler frees it. */
                goto error;

        case -ENOSPC:
                err = NET_XMIT_DROP;
                break;

        case 0:
                if ((x->props.flags & XFRM_STATE_ESN))
                        esp_output_restore_header(skb);
                esp_output_encap_csum(skb);
        }

        /* Drop the source page references when a separate destination was used. */
        if (sg != dsg)
                esp_ssg_unref(x, tmp, skb);

        if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
                err = esp_output_tail_tcp(x, skb);

error_free:
        kfree(tmp);
error:
        return err;
}
EXPORT_SYMBOL_GPL(esp6_output_tail);

/*
 * Transform an outgoing packet with ESP: compute the padding and trailer
 * sizes, append the trailer with esp6_output_head(), fill in the SPI and
 * sequence number, and encrypt with esp6_output_tail().
 *
 * Returns the negative value from esp6_output_head() on failure, otherwise
 * whatever esp6_output_tail() returns.
 */
static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
{
        struct crypto_aead *aead = x->data;
        int alen = crypto_aead_authsize(aead);
        struct esp_info esp;
        int blksize;

        esp.inplace = true;

        /* Remember the payload protocol, then label the packet as ESP. */
        esp.proto = *skb_mac_header(skb);
        *skb_mac_header(skb) = IPPROTO_ESP;

        /* skb is pure payload to encrypt */

        /* Extra padding up to x->tfcpad, capped by the state MTU. */
        esp.tfclen = 0;
        if (x->tfcpad) {
                struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
                u32 padto;

                padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
                if (skb->len < padto)
                        esp.tfclen = padto - skb->len;
        }

        /* Payload + tfc padding + 2 trailer bytes, rounded up to the block size. */
        blksize = ALIGN(crypto_aead_blocksize(aead), 4);
        esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
        esp.plen = esp.clen - skb->len - esp.tfclen;
        esp.tailen = esp.tfclen + esp.plen + alen;

        esp.esph = ip_esp_hdr(skb);

        esp.nfrags = esp6_output_head(x, skb, &esp);
        if (esp.nfrags < 0)
                return esp.nfrags;

        /* esp6_output_head() may have moved esp.esph; use the updated value. */
        esp.esph->spi = x->id.spi;
        esp.esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);

        esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
                                ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));

        skb_push(skb, -skb_network_offset(skb));

        return esp6_output_tail(x, skb, &esp);
}

/*
 * Strip the ESP trailer (padding, pad length, next header and ICV) from a
 * decrypted packet, adjusting skb->csum for CHECKSUM_COMPLETE packets.
 *
 * Returns the next-header value from the trailer on success, -EINVAL when
 * the pad length is inconsistent with the packet size, or the error from
 * pskb_trim().
 */
static inline int esp_remove_trailer(struct sk_buff *skb)
{
        struct xfrm_state *x = xfrm_input_state(skb);
        struct crypto_aead *aead = x->data;
        int alen = crypto_aead_authsize(aead);
        int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
        int elen = skb->len - hlen;
        int padlen, trimlen;
        u8 tail[2];
        int err;

        /* The two bytes before the ICV: pad length, then next header. */
        err = skb_copy_bits(skb, skb->len - alen - 2, tail, 2);
        BUG_ON(err);

        padlen = tail[0];
        if (padlen + 2 + alen >= elen) {
                net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
                                    padlen + 2, elen - alen);
                return -EINVAL;
        }

        trimlen = alen + padlen + 2;
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                __wsum csumdiff = skb_checksum(skb, skb->len - trimlen,
                                               trimlen, 0);

                skb->csum = csum_block_sub(skb->csum, csumdiff,
                                           skb->len - trimlen);
        }

        err = pskb_trim(skb, skb->len - trimlen);
        if (unlikely(err))
                return err;

        return tail[1];
}

/*
 * esp6_input_done2 - finish inbound ESP processing after decryption
 * @skb: the decrypted packet
 * @err: result of the decryption step
 *
 * Frees the scratch buffer (unless the offload data has CRYPTO_DONE set),
 * strips the ESP trailer, notifies the key manager when the source address
 * or port of an encapsulated packet changed, pulls the ESP header and IV,
 * and positions the transport header according to the state's mode.
 *
 * Returns the next-header value taken from the ESP trailer on success or a
 * negative errno; a next header of IPPROTO_NONE is reported as -EINVAL.
 */
int esp6_input_done2(struct sk_buff *skb, int err)
{
        struct xfrm_state *x = xfrm_input_state(skb);
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct crypto_aead *aead = x->data;
        int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
        int hdr_len = skb_network_header_len(skb);

        if (!xo || !(xo->flags & CRYPTO_DONE))
                kfree(ESP_SKB_CB(skb)->tmp);

        if (unlikely(err))
                goto out;

        /* On success err now carries the next-header value of the trailer. */
        err = esp_remove_trailer(skb);
        if (unlikely(err < 0))
                goto out;

        if (x->encap) {
                const struct ipv6hdr *ip6h = ipv6_hdr(skb);
                int offset = skb_network_offset(skb) + sizeof(*ip6h);
                struct xfrm_encap_tmpl *encap = x->encap;
                u8 nexthdr = ip6h->nexthdr;
                __be16 frag_off, source;
                struct udphdr *uh;
                struct tcphdr *th;

                /* Skip any IPv6 extension headers to reach the encap header. */
                offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
                if (offset == -1) {
                        err = -EINVAL;
                        goto out;
                }

                /* Same location viewed as UDP or TCP; encap_type picks one below. */
                uh = (void *)(skb->data + offset);
                th = (void *)(skb->data + offset);
                hdr_len += offset;

                switch (x->encap->encap_type) {
                case TCP_ENCAP_ESPINTCP:
                        source = th->source;
                        break;
                case UDP_ENCAP_ESPINUDP:
                        source = uh->source;
                        break;
                default:
                        WARN_ON_ONCE(1);
                        err = -EINVAL;
                        goto out;
                }

                /*
                 * 1) if the NAT-T peer's IP or port changed then
                 *    advertise the change to the keying daemon.
                 *    This is an inbound SA, so just compare
                 *    SRC ports.
                 */
                if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) ||
                    source != encap->encap_sport) {
                        xfrm_address_t ipaddr;

                        memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6));
                        km_new_mapping(x, &ipaddr, source);

                        /* XXX: perhaps add an extra
                         * policy check here, to see
                         * if we should allow or
                         * reject a packet from a
                         * different source
                         * address/port.
                         */
                }

                /*
                 * 2) ignore UDP/TCP checksums in case
                 *    of NAT-T in Transport Mode, or
                 *    perform other post-processing fixes
                 *    as per draft-ietf-ipsec-udp-encaps-06,
                 *    section 3.1.2
                 */
                if (x->props.mode == XFRM_MODE_TRANSPORT)
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
        }

        /* Remove the network header from the receive checksum, then pull ESP header + IV. */
        skb_postpull_rcsum(skb, skb_network_header(skb),
                           skb_network_header_len(skb));
        skb_pull_rcsum(skb, hlen);
        /*
         * Tunnel/IPTFS: the transport header is the current data start.
         * Otherwise it is placed hdr_len bytes before the current data.
         */
        if (x->props.mode == XFRM_MODE_TUNNEL ||
            x->props.mode == XFRM_MODE_IPTFS)
                skb_reset_transport_header(skb);
        else
                skb_set_transport_header(skb, -hdr_len);

        /* RFC4303: Drop dummy packets without any error */
        if (err == IPPROTO_NONE)
                err = -EINVAL;

out:
        return err;
}
EXPORT_SYMBOL_GPL(esp6_input_done2);

/*
 * Completion callback registered on the decrypt request (non-ESN case).
 * @data is the skb that was passed to aead_request_set_callback(); the
 * result of esp6_input_done2() is forwarded to xfrm_input_resume().
 */
static void esp_input_done(void *data, int err)
{
        struct sk_buff *skb = data;
        int status;

        status = esp6_input_done2(skb, err);
        xfrm_input_resume(skb, status);
}

/*
 * Counterpart of esp_input_set_header(): call esp_restore_header() with
 * offset 0, then pull the 4 bytes that were pushed in front of the ESP
 * header for the ESN high sequence bits.
 */
static void esp_input_restore_header(struct sk_buff *skb)
{
        const unsigned int esn_shift = 4;

        esp_restore_header(skb, 0);
        __skb_pull(skb, esn_shift);
}

/*
 * Prepare the ESP header for decryption when extended sequence numbers
 * (XFRM_STATE_ESN) are in use; does nothing otherwise.
 *
 * The header is moved forward by 4 bytes to make room for the high
 * sequence bits and is moved back after decryption.  The 4 bytes that
 * now sit under the shifted header's spi field are saved in *@seqhi,
 * the SPI is copied into that slot, and the high sequence bits taken
 * from the skb control block are written where the SPI used to be.
 */
static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
{
        struct xfrm_state *state = xfrm_input_state(skb);
        struct ip_esp_hdr *hdr;

        if (!(state->props.flags & XFRM_STATE_ESN))
                return;

        hdr = skb_push(skb, 4);

        *seqhi = hdr->spi;
        hdr->spi = hdr->seq_no;
        hdr->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
}

/*
 * Completion callback registered on the decrypt request when ESN is
 * enabled: undo the ESN header shift first, then continue exactly like
 * esp_input_done().
 */
static void esp_input_done_esn(void *data, int err)
{
        struct sk_buff *skb = data;

        esp_input_restore_header(skb);
        esp_input_done(skb, err);
}

/*
 * esp6_input - build and submit the AEAD decrypt request for an ESP packet.
 * @x:   xfrm state; x->data is used as the crypto_aead transform
 * @skb: received packet
 *
 * Returns -EINVAL when the packet is too short or cannot be made
 * writable, -ENOMEM when the scratch buffer cannot be allocated, the
 * negative value from skb_to_sgvec() when scatterlist mapping fails,
 * -EINPROGRESS when decryption completes asynchronously (the registered
 * callback then finishes the work), and otherwise the return value of
 * esp6_input_done2().
 */
static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
{
        struct crypto_aead *aead = x->data;
        struct aead_request *req;
        struct sk_buff *trailer;
        int ivlen = crypto_aead_ivsize(aead);
        /* Bytes that remain once the ESP header and the IV are accounted for. */
        int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
        int nfrags;
        int assoclen;
        int seqhilen;
        int ret = 0;
        void *tmp;
        __be32 *seqhi;
        u8 *iv;
        struct scatterlist *sg;

        if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) {
                ret = -EINVAL;
                goto out;
        }

        /* Nothing left to decrypt after header + IV. */
        if (elen <= 0) {
                ret = -EINVAL;
                goto out;
        }

        assoclen = sizeof(struct ip_esp_hdr);
        seqhilen = 0;

        /* With ESN the associated data grows by one extra 32-bit word. */
        if (x->props.flags & XFRM_STATE_ESN) {
                seqhilen += sizeof(__be32);
                assoclen += seqhilen;
        }

        /*
         * An uncloned skb that is either linear or has page fragments but
         * no frag list is used as-is; every other case goes through
         * skb_cow_data() below.
         */
        if (!skb_cloned(skb)) {
                if (!skb_is_nonlinear(skb)) {
                        nfrags = 1;

                        goto skip_cow;
                } else if (!skb_has_frag_list(skb)) {
                        /* One entry per page fragment plus one more. */
                        nfrags = skb_shinfo(skb)->nr_frags;
                        nfrags++;

                        goto skip_cow;
                }
        }

        nfrags = skb_cow_data(skb, 0, &trailer);
        if (nfrags < 0) {
                ret = -EINVAL;
                goto out;
        }

skip_cow:
        /* Preset the error so the allocation failure path can just jump. */
        ret = -ENOMEM;
        tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
        if (!tmp)
                goto out;

        /* The scratch area is remembered in the skb control block. */
        ESP_SKB_CB(skb)->tmp = tmp;
        /* seqhi, iv, req and sg are all obtained from the scratch area. */
        seqhi = esp_tmp_extra(tmp);
        iv = esp_tmp_iv(aead, tmp, seqhilen);
        req = esp_tmp_req(aead, iv);
        sg = esp_req_sg(aead, req);

        /* For ESN this shifts the header by 4 bytes, so skb->len grows. */
        esp_input_set_header(skb, seqhi);

        sg_init_table(sg, nfrags);
        ret = skb_to_sgvec(skb, sg, 0, skb->len);
        if (unlikely(ret < 0)) {
                kfree(tmp);
                goto out;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /* The ESN callback additionally restores the shifted header. */
        if ((x->props.flags & XFRM_STATE_ESN))
                aead_request_set_callback(req, 0, esp_input_done_esn, skb);
        else
                aead_request_set_callback(req, 0, esp_input_done, skb);

        /* In-place operation: the same scatterlist is source and destination. */
        aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
        aead_request_set_ad(req, assoclen);

        ret = crypto_aead_decrypt(req);
        /* Asynchronous completion: the callback set above takes over. */
        if (ret == -EINPROGRESS)
                goto out;

        /* Synchronous completion: do here what the callbacks would do. */
        if ((x->props.flags & XFRM_STATE_ESN))
                esp_input_restore_header(skb);

        ret = esp6_input_done2(skb, ret);

out:
        return ret;
}

/*
 * ICMPv6 error handler for ESP.
 *
 * Only ICMPV6_PKT_TOOBIG and NDISC_REDIRECT are acted upon, and only if
 * an xfrm state matches the destination address and SPI found in the
 * packet at @offset.  A redirect is passed to ip6_redirect(), a
 * packet-too-big to ip6_update_pmtu() with @info.  Always returns 0.
 */
static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                    u8 type, u8 code, int offset, __be32 info)
{
        struct net *net = dev_net(skb->dev);
        const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
        struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + offset);
        struct xfrm_state *state;

        switch (type) {
        case ICMPV6_PKT_TOOBIG:
        case NDISC_REDIRECT:
                break;
        default:
                return 0;
        }

        state = xfrm_state_lookup(net, skb->mark,
                                  (const xfrm_address_t *)&iph->daddr,
                                  esph->spi, IPPROTO_ESP, AF_INET6);
        if (!state)
                return 0;

        if (type == ICMPV6_PKT_TOOBIG)
                ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
        else
                ip6_redirect(skb, net, skb->dev->ifindex, 0,
                             sock_net_uid(net, NULL));

        /* Drop the reference obtained by the lookup above. */
        xfrm_state_put(state);

        return 0;
}

/*
 * State destructor: free the AEAD transform stored in x->data, if any.
 */
static void esp6_destroy(struct xfrm_state *x)
{
        struct crypto_aead *tfm = x->data;

        if (tfm)
                crypto_free_aead(tfm);
}

/*
 * Allocate and configure the AEAD transform for a state that carries
 * x->aead.
 *
 * The transform name is formatted as "<geniv>(<alg_name>)".  On success
 * the transform is stored in x->data and 0 is returned.  On failure a
 * negative errno is returned and an extack message is set; a transform
 * already stored in x->data is not freed here.
 */
static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        char aead_name[CRYPTO_MAX_ALG_NAME];
        struct crypto_aead *tfm;
        int namelen;
        int err;

        namelen = snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
                           x->geniv, x->aead->alg_name);
        if (namelen >= CRYPTO_MAX_ALG_NAME) {
                NL_SET_ERR_MSG(extack, "Algorithm name is too long");
                return -ENAMETOOLONG;
        }

        tfm = crypto_alloc_aead(aead_name, 0, 0);
        if (IS_ERR(tfm)) {
                err = PTR_ERR(tfm);
                goto fail;
        }

        x->data = tfm;

        /* alg_key_len is rounded up to whole groups of 8 (presumably bits to bytes). */
        err = crypto_aead_setkey(tfm, x->aead->alg_key,
                                 (x->aead->alg_key_len + 7) / 8);
        if (!err)
                err = crypto_aead_setauthsize(tfm, x->aead->alg_icv_len / 8);
        if (!err)
                return 0;

fail:
        NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
        return err;
}

static int esp_init_authenc(struct xfrm_state *x,
                            struct netlink_ext_ack *extack)
{
        struct crypto_aead *aead;
        struct crypto_authenc_key_param *param;
        struct rtattr *rta;
        char *key;
        char *p;
        char authenc_name[CRYPTO_MAX_ALG_NAME];
        unsigned int keylen;
        int err;

        err = -ENAMETOOLONG;

        if ((x->props.flags & XFRM_STATE_ESN)) {
                if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
                             "%s%sauthencesn(%s,%s)%s",
                             x->geniv ?: "", x->geniv ? "(" : "",
                             x->aalg ? x->aalg->alg_name : "digest_null",
                             x->ealg->alg_name,
                             x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
                        NL_SET_ERR_MSG(extack, "Algorithm name is too long");
                        goto error;
                }
        } else {
                if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
                             "%s%sauthenc(%s,%s)%s",
                             x->geniv ?: "", x->geniv ? "(" : "",
                             x->aalg ? x->aalg->alg_name : "digest_null",
                             x->ealg->alg_name,
                             x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
                        NL_SET_ERR_MSG(extack, "Algorithm name is too long");
                        goto error;
                }
        }

        aead = crypto_alloc_aead(authenc_name, 0, 0);
        err = PTR_ERR(aead);
        if (IS_ERR(aead)) {
                NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                goto error;
        }

        x->data = aead;

        keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
                 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
        err = -ENOMEM;
        key = kmalloc(keylen, GFP_KERNEL);
        if (!key)
                goto error;

        p = key;
        rta = (void *)p;
        rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
        rta->rta_len = RTA_LENGTH(sizeof(*param));
        param = RTA_DATA(rta);
        p += RTA_SPACE(sizeof(*param));

        if (x->aalg) {
                struct xfrm_algo_desc *aalg_desc;

                memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
                p += (x->aalg->alg_key_len + 7) / 8;

                aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
                BUG_ON(!aalg_desc);

                err = -EINVAL;
                if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
                    crypto_aead_authsize(aead)) {
                        NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                        goto free_key;
                }

                err = crypto_aead_setauthsize(
                        aead, x->aalg->alg_trunc_len / 8);
                if (err) {
                        NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                        goto free_key;
                }
        }

        param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
        memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);

        err = crypto_aead_setkey(aead, key, keylen);

free_key:
        kfree(key);

error:
        return err;
}

/*
 * Initialise an ESP xfrm state: set up the AEAD transform and compute
 * the header and trailer lengths in x->props.
 *
 * Either x->aead or x->ealg must be set; otherwise -EINVAL is returned
 * with an extack message.  header_len starts as ESP header + IV and is
 * extended according to the mode (tunnel, or BEET with a non-IPv6
 * selector family) and to the encapsulation type when x->encap is set.
 * An unknown encapsulation type yields -EINVAL.
 *
 * Returns 0 on success or a negative errno.
 */
static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        struct crypto_aead *tfm;
        u32 blk_align;
        int err;

        x->data = NULL;

        if (x->aead) {
                err = esp_init_aead(x, extack);
        } else if (x->ealg) {
                err = esp_init_authenc(x, extack);
        } else {
                NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
                return -EINVAL;
        }
        if (err)
                return err;

        tfm = x->data;

        x->props.header_len = sizeof(struct ip_esp_hdr) +
                              crypto_aead_ivsize(tfm);

        /* Every mode other than tunnel and BEET adds nothing here. */
        if (x->props.mode == XFRM_MODE_TUNNEL) {
                x->props.header_len += sizeof(struct ipv6hdr);
        } else if (x->props.mode == XFRM_MODE_BEET &&
                   x->sel.family != AF_INET6) {
                x->props.header_len += IPV4_BEET_PHMAXLEN +
                                       (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
        }

        if (x->encap) {
                switch (x->encap->encap_type) {
                case UDP_ENCAP_ESPINUDP:
                        x->props.header_len += sizeof(struct udphdr);
                        break;
#ifdef CONFIG_INET6_ESPINTCP
                case TCP_ENCAP_ESPINTCP:
                        /* only the length field, TCP encap is done by
                         * the socket
                         */
                        x->props.header_len += 2;
                        break;
#endif
                default:
                        NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
                        return -EINVAL;
                }
        }

        /* Trailer: block size rounded up to 4, plus 1, plus the auth size. */
        blk_align = ALIGN(crypto_aead_blocksize(tfm), 4);
        x->props.trailer_len = blk_align + 1 + crypto_aead_authsize(tfm);

        return 0;
}

/*
 * Receive callback installed as esp6_protocol.cb_handler.  Both arguments
 * are ignored and 0 is always returned.
 */
static int esp6_rcv_cb(struct sk_buff *skb, int err)
{
        return 0;
}

/*
 * xfrm type descriptor for IPPROTO_ESP.  esp6_init() registers it for
 * AF_INET6 and esp6_fini() unregisters it.
 */
static const struct xfrm_type esp6_type = {
        .owner                = THIS_MODULE,
        .proto                = IPPROTO_ESP,
        .flags                = XFRM_TYPE_REPLAY_PROT,
        .init_state        = esp6_init_state,
        .destructor        = esp6_destroy,
        .input                = esp6_input,
        .output                = esp6_output,
};

/*
 * Protocol hooks that esp6_init() registers for IPPROTO_ESP and
 * esp6_fini() deregisters.
 */
static struct xfrm6_protocol esp6_protocol = {
        .handler        =        xfrm6_rcv,
        .input_handler        =        xfrm_input,
        .cb_handler        =        esp6_rcv_cb,
        .err_handler        =        esp6_err,
        .priority        =        0,
};

/*
 * Module init: register the ESP xfrm type for AF_INET6, then the
 * IPPROTO_ESP protocol hooks.  If the second step fails the type is
 * unregistered again.  Returns 0 on success, -EAGAIN on either failure.
 */
static int __init esp6_init(void)
{
        int rc;

        rc = xfrm_register_type(&esp6_type, AF_INET6);
        if (rc < 0) {
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }

        rc = xfrm6_protocol_register(&esp6_protocol, IPPROTO_ESP);
        if (rc >= 0)
                return 0;

        pr_info("%s: can't add protocol\n", __func__);
        xfrm_unregister_type(&esp6_type, AF_INET6);
        return -EAGAIN;
}

/*
 * Module exit: undo esp6_init() in reverse order.  A failure to
 * deregister the protocol hooks is only logged; the xfrm type is
 * unregistered regardless.
 */
static void __exit esp6_fini(void)
{
        int rc = xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP);

        if (rc < 0)
                pr_info("%s: can't remove protocol\n", __func__);

        xfrm_unregister_type(&esp6_type, AF_INET6);
}

module_init(esp6_init);
module_exit(esp6_fini);

MODULE_DESCRIPTION("IPv6 ESP transformation helpers");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP);





























  317 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Supervisor Mode Access Prevention support
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: H. Peter Anvin <hpa@linux.intel.com>
 */

#ifndef _ASM_X86_SMAP_H
#define _ASM_X86_SMAP_H

#include <asm/nops.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

#ifdef __ASSEMBLER__

/*
 * Variants for assembly sources (__ASSEMBLER__): each expands to an
 * ALTERNATIVE whose first choice is empty and whose second choice is the
 * "clac" / "stac" instruction, keyed on X86_FEATURE_SMAP.
 */
#define ASM_CLAC \
        ALTERNATIVE "", "clac", X86_FEATURE_SMAP

#define ASM_STAC \
        ALTERNATIVE "", "stac", X86_FEATURE_SMAP

#else /* __ASSEMBLER__ */

/*
 * clac - emit an alternative that is empty by default and is the "clac"
 * instruction when keyed on X86_FEATURE_SMAP.
 */
static __always_inline void clac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", "clac", X86_FEATURE_SMAP);
}

/*
 * stac - emit an alternative that is empty by default and is the "stac"
 * instruction when keyed on X86_FEATURE_SMAP.
 */
static __always_inline void stac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", "stac", X86_FEATURE_SMAP);
}

/*
 * smap_save - capture the flags register and then execute "clac".
 *
 * The X86_FEATURE_SMAP alternative pushes the flags, pops them into the
 * output operand and executes "clac".  The default alternative is empty,
 * so in that case the asm does not write the output operand and the
 * returned value is whatever the compiler happened to assign to it.
 *
 * Returns the saved flags, intended to be handed to smap_restore().
 */
static __always_inline unsigned long smap_save(void)
{
        unsigned long flags;

        /* "=rm": the popped value may land in a register or in memory. */
        asm volatile ("# smap_save\n\t"
                      ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
                                  "", "pushf; pop %0; clac",
                                  X86_FEATURE_SMAP)
                      : "=rm" (flags) : : "memory", "cc");

        return flags;
}

/*
 * smap_restore - load @flags back into the flags register.
 *
 * The X86_FEATURE_SMAP alternative pushes @flags and pops it into the
 * flags register with "popf".  The default alternative is empty, in which
 * case @flags is not used.
 */
static __always_inline void smap_restore(unsigned long flags)
{
        /* "g": @flags may be passed as a register, memory or immediate operand. */
        asm volatile ("# smap_restore\n\t"
                      ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
                                  "", "push %0; popf",
                                  X86_FEATURE_SMAP)
                      : : "g" (flags) : "memory", "cc");
}

/*
 * These macros can be used in asm() statements.  Each expands to an
 * ALTERNATIVE() that is empty by default and is "clac" / "stac" when
 * keyed on X86_FEATURE_SMAP.
 */
#define ASM_CLAC \
        ALTERNATIVE("", "clac", X86_FEATURE_SMAP)
#define ASM_STAC \
        ALTERNATIVE("", "stac", X86_FEATURE_SMAP)

/*
 * Same as above, except that the replacement instruction is prefixed with
 * ANNOTATE_IGNORE_ALTERNATIVE.
 * NOTE(review): the annotation is presumably consumed by build-time
 * tooling; its definition is not visible in this header - confirm.
 */
#define ASM_CLAC_UNSAFE \
        ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "clac", X86_FEATURE_SMAP)
#define ASM_STAC_UNSAFE \
        ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "stac", X86_FEATURE_SMAP)

#endif /* __ASSEMBLER__ */

#endif /* _ASM_X86_SMAP_H */











































































































































































































































































































  316 



  314 
  317 






  319 


  315 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Kernel Probes (KProbes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct        Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *                Probes initial implementation (includes suggestions from
 *                Rusty Russell).
 * 2004-Aug        Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
 *                hlists and exceptions notifier as suggested by Andi Kleen.
 * 2004-July        Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *                interface to access function arguments.
 * 2004-Sep        Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
 *                exceptions notifier to be first on the priority list.
 * 2005-May        Hien Nguyen <hien@us.ibm.com>, Jim Keniston
 *                <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *                <prasanna@in.ibm.com> added function-return probes.
 */

#define pr_fmt(fmt) "kprobes: " fmt

#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sysctl.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
#include <linux/static_call.h>
#include <linux/perf_event.h>
#include <linux/execmem.h>
#include <linux/cleanup.h>

#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <linux/uaccess.h>

/* 'kprobe_table' has 2^KPROBE_HASH_BITS buckets, indexed by hash_ptr() of the probe address. */
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)

/* Unless both OPTPROBES and SYSCTL are configured, sysctl registration is a no-op. */
#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL)
#define kprobe_sysctls_init() do { } while (0)
#endif

static int kprobes_initialized;
/*
 * 'kprobe_table' can be accessed in either of two ways:
 * - normal hlist traversal and RCU add/del while 'kprobe_mutex' is held, or
 * - RCU hlist traversal with preemption disabled (breakpoint handlers).
 */
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];

/* NOTE: change this value only with 'kprobe_mutex' held */
static bool kprobes_all_disarmed;

/* This protects 'kprobe_table' and 'optimizing_list' */
static DEFINE_MUTEX(kprobe_mutex);
/* Per-CPU kprobe pointer, written by set_kprobe_instance()/reset_kprobe_instance(). */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance);

/*
 * Default (weak) symbol resolver: returns the result of
 * kallsyms_lookup_name() for 'name', cast to an opcode pointer.
 * The second parameter is not used by this implementation.
 */
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
                                        unsigned int __unused)
{
        return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
}

/*
 * Blacklist -- list of 'struct kprobe_blacklist_entry' storing information
 * about where kprobes cannot probe.
 */
static LIST_HEAD(kprobe_blacklist);

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
 * 'kprobe::ainsn.insn' points to the copy of the instruction to be
 * single-stepped. x86_64, POWER4 and above have no-exec support and
 * stepping on the instruction on a vmalloced/kmalloced/data page
 * is a recipe for disaster.
 *
 * One 'struct kprobe_insn_page' describes a single page of instruction
 * slots together with the per-slot state array that trails the struct.
 */
struct kprobe_insn_page {
        struct list_head list;                /* Link in the owning cache's 'pages' list */
        kprobe_opcode_t *insns;                /* Page of instruction slots */
        struct kprobe_insn_cache *cache;        /* Cache this page belongs to */
        int nused;                        /* Slots not yet collected (SLOT_USED or SLOT_DIRTY) */
        int ngarbage;                        /* Slots marked SLOT_DIRTY since the last collection */
        char slot_used[];                /* One 'enum kprobe_slot_state' value per slot */
};

static int slots_per_page(struct kprobe_insn_cache *c)
{
        return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
}

/* Per-slot state stored in 'kprobe_insn_page::slot_used[]'. */
enum kprobe_slot_state {
        SLOT_CLEAN,        /* 0: free, may be handed out */
        SLOT_DIRTY,        /* 1: released, waiting for garbage collection */
        SLOT_USED,        /* 2: currently allocated */
};

/*
 * Default (weak) allocator for one page of instruction slots.
 * Returns the result of execmem_alloc() for a PAGE_SIZE EXECMEM_KPROBES
 * allocation.
 */
void __weak *alloc_insn_page(void)
{
        /*
         * Use execmem_alloc() so this page is within +/- 2GB of where the
         * kernel image and loaded module images reside. This is required
         * for most of the architectures.
         * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
         */
        return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
}

/* Release an instruction-slot page by handing it to execmem_free(). */
static void free_insn_page(void *page)
{
        execmem_free(page);
}

/*
 * Instruction-slot cache for regular kprobes: pages come from
 * alloc_insn_page()/free_insn_page() and each slot holds MAX_INSN_SIZE
 * opcodes. It starts with an empty page list and no garbage.
 */
struct kprobe_insn_cache kprobe_insn_slots = {
        .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
        .sym = KPROBE_INSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
        .insn_size = MAX_INSN_SIZE,
        .nr_garbage = 0,
};
/* Defined below; needed by __get_insn_slot(). */
static int collect_garbage_slots(struct kprobe_insn_cache *c);

/**
 * __get_insn_slot - Find a slot on an executable page for an instruction.
 * @c: Pointer to kprobe instruction cache
 *
 * Description: Locates available slot on existing executable pages,
 *              allocates an executable page if there's no room on existing ones.
 *              The cache mutex is held for the whole call. If no clean slot
 *              is found and the cache has garbage slots, they are collected
 *              and the scan is repeated before a new page is allocated.
 * Return: Pointer to instruction slot on success, NULL on failure
 *         (allocation of the bookkeeping struct or of the page failed).
 */
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
        struct kprobe_insn_page *kip;

        /* Since the slot array is not protected by rcu, we need a mutex */
        guard(mutex)(&c->mutex);
        do {
                /*
                 * The RCU guard is scoped to this loop body only, so it is
                 * not held when collect_garbage_slots() runs in the loop
                 * condition below.
                 */
                guard(rcu)();
                list_for_each_entry_rcu(kip, &c->pages, list) {
                        if (kip->nused < slots_per_page(c)) {
                                int i;

                                /* The page claims free room: take the first clean slot. */
                                for (i = 0; i < slots_per_page(c); i++) {
                                        if (kip->slot_used[i] == SLOT_CLEAN) {
                                                kip->slot_used[i] = SLOT_USED;
                                                kip->nused++;
                                                return kip->insns + (i * c->insn_size);
                                        }
                                }
                                /* kip->nused is broken. Fix it. */
                                kip->nused = slots_per_page(c);
                                WARN_ON(1);
                        }
                }
        /* If there are any garbage slots, collect it and try again. */
        } while (c->nr_garbage && collect_garbage_slots(c) == 0);

        /* All out of space.  Need to allocate a new page. */
        kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL);
        if (!kip)
                return NULL;

        kip->insns = c->alloc();
        if (!kip->insns) {
                kfree(kip);
                return NULL;
        }
        INIT_LIST_HEAD(&kip->list);
        /* All slots start clean; slot 0 is handed to the caller right away. */
        memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
        kip->slot_used[0] = SLOT_USED;
        kip->nused = 1;
        kip->ngarbage = 0;
        kip->cache = c;
        list_add_rcu(&kip->list, &c->pages);

        /* Record the perf ksymbol register event after adding the page */
        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
                           PAGE_SIZE, false, c->sym);

        return kip->insns;
}

/* Return true if all garbages are collected, otherwise false. */
static bool collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
        kip->slot_used[idx] = SLOT_CLEAN;
        kip->nused--;
        if (kip->nused != 0)
                return false;

        /*
         * Page is no longer in use.  Free it unless
         * it's the last one.  We keep the last one
         * so as not to have to set it up again the
         * next time somebody inserts a probe.
         */
        if (!list_is_singular(&kip->list)) {
                /*
                 * Record perf ksymbol unregister event before removing
                 * the page.
                 */
                perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
                                   (unsigned long)kip->insns, PAGE_SIZE, true,
                                   kip->cache->sym);
                list_del_rcu(&kip->list);
                synchronize_rcu();
                kip->cache->free(kip->insns);
                kfree(kip);
        }
        return true;
}

/*
 * Reclaim every SLOT_DIRTY slot on every page of cache 'c' and reset the
 * cache's garbage counter. Always returns 0.
 */
static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
        struct kprobe_insn_page *page, *tmp;
        int slot;

        /* Make sure nobody is still interrupted on a garbage slot. */
        synchronize_rcu();

        list_for_each_entry_safe(page, tmp, &c->pages, list) {
                if (!page->ngarbage)
                        continue;

                /* Every dirty slot of this page is collected below. */
                page->ngarbage = 0;
                for (slot = 0; slot < slots_per_page(c); slot++) {
                        if (page->slot_used[slot] != SLOT_DIRTY)
                                continue;
                        /* A true return means the page may be gone: stop touching it. */
                        if (collect_one_slot(page, slot))
                                break;
                }
        }
        c->nr_garbage = 0;

        return 0;
}

/*
 * Locate the page of cache 'c' that contains 'slot'.
 *
 * On success stores the page in '*pkip' and returns the slot index within
 * that page. If no page matches, warns, stores NULL in '*pkip' and
 * returns -1.
 */
static long __find_insn_page(struct kprobe_insn_cache *c,
        kprobe_opcode_t *slot, struct kprobe_insn_page **pkip)
{
        struct kprobe_insn_page *page;
        long idx;

        guard(rcu)();
        list_for_each_entry_rcu(page, &c->pages, list) {
                long delta = (long)slot - (long)page->insns;

                idx = delta / (c->insn_size * sizeof(kprobe_opcode_t));
                if (idx < 0 || idx >= slots_per_page(c))
                        continue;

                *pkip = page;
                return idx;
        }

        /* Could not find this slot. */
        WARN_ON(1);
        *pkip = NULL;
        return -1;
}

/*
 * Give 'slot' back to cache 'c'.
 *
 * With 'dirty' set the slot is only marked SLOT_DIRTY and counted as
 * garbage; a collection pass runs once the cache's garbage count exceeds
 * the number of slots per page. Otherwise the slot is collected right away.
 * Nothing is done if the slot does not belong to any page of the cache.
 */
void __free_insn_slot(struct kprobe_insn_cache *c,
                      kprobe_opcode_t *slot, int dirty)
{
        struct kprobe_insn_page *page = NULL;
        long slot_idx;

        guard(mutex)(&c->mutex);
        slot_idx = __find_insn_page(c, slot, &page);
        if (!page)
                return;

        /* Mark and sweep: this may sleep */

        /* Check double free */
        WARN_ON(page->slot_used[slot_idx] != SLOT_USED);

        if (!dirty) {
                collect_one_slot(page, slot_idx);
                return;
        }

        page->slot_used[slot_idx] = SLOT_DIRTY;
        page->ngarbage++;
        if (++c->nr_garbage > slots_per_page(c))
                collect_garbage_slots(c);
}

/*
 * Tell whether 'addr' lies inside one of the instruction-slot pages of
 * cache 'c'. Used to decide whether an address found on a stack belongs
 * to a text area.
 */
bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
{
        struct kprobe_insn_page *page;
        bool found = false;

        rcu_read_lock();
        list_for_each_entry_rcu(page, &c->pages, list) {
                unsigned long start = (unsigned long)page->insns;

                if (addr < start || addr >= start + PAGE_SIZE)
                        continue;

                found = true;
                break;
        }
        rcu_read_unlock();

        return found;
}

/*
 * Report the '*symnum'-th page of cache 'c' as a symbol.
 *
 * '*symnum' is decremented once for every page visited, including the
 * matching one. On a match the cache's symbol name is copied to 'sym',
 * '*type' is set to 't', '*value' to the page address, and 0 is returned.
 * Returns -ERANGE if the cache has too few pages.
 */
int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
                             unsigned long *value, char *type, char *sym)
{
        struct kprobe_insn_page *page;
        int err = -ERANGE;

        rcu_read_lock();
        list_for_each_entry_rcu(page, &c->pages, list) {
                if ((*symnum)-- == 0) {
                        strscpy(sym, c->sym, KSYM_NAME_LEN);
                        *type = 't';
                        *value = (unsigned long)page->insns;
                        err = 0;
                        break;
                }
        }
        rcu_read_unlock();

        return err;
}

#ifdef CONFIG_OPTPROBES
/* Default (weak) page allocator for optimized-probe buffers: same as alloc_insn_page(). */
void __weak *alloc_optinsn_page(void)
{
        return alloc_insn_page();
}

/* Default (weak) page release for optimized-probe buffers: same as free_insn_page(). */
void __weak free_optinsn_page(void *page)
{
        free_insn_page(page);
}

/*
 * Instruction-slot cache for optimized_kprobe buffers. Unlike
 * 'kprobe_insn_slots', '.insn_size' is left unset in this initializer.
 */
struct kprobe_insn_cache kprobe_optinsn_slots = {
        .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
        .alloc = alloc_optinsn_page,
        .free = free_optinsn_page,
        .sym = KPROBE_OPTINSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
        /* .insn_size is initialized later */
        .nr_garbage = 0,
};
#endif /* CONFIG_OPTPROBES */
#endif /* __ARCH_WANT_KPROBES_INSN_SLOT */

/*
 * Record 'kp' in this CPU's 'kprobe_instance' slot.
 * We have preemption disabled, so it is safe to use the __ versions.
 */
static inline void set_kprobe_instance(struct kprobe *kp)
{
        __this_cpu_write(kprobe_instance, kp);
}

/* Clear this CPU's 'kprobe_instance' slot (set it to NULL). */
static inline void reset_kprobe_instance(void)
{
        __this_cpu_write(kprobe_instance, NULL);
}

/*
 * Look up the kprobe registered at 'addr' in 'kprobe_table'.
 * Returns the matching probe, or NULL if there is none.
 *
 * Callers are either:
 *        - holding 'kprobe_mutex' (kprobe_[un]register()), or
 *        - running with preemption disabled (architecture specific code).
 */
struct kprobe *get_kprobe(void *addr)
{
        struct hlist_head *bucket = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
        struct kprobe *kp;

        hlist_for_each_entry_rcu(kp, bucket, hlist,
                                 lockdep_is_held(&kprobe_mutex)) {
                if (kp->addr != addr)
                        continue;
                return kp;
        }

        return NULL;
}
NOKPROBE_SYMBOL(get_kprobe);

static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);

/*
 * Return true if 'p' is an aggregator, i.e. its pre_handler is
 * aggr_pre_handler().
 */
static inline bool kprobe_aggrprobe(struct kprobe *p)
{
        return p->pre_handler == aggr_pre_handler;
}

/* Return true if 'p' is unused */
static inline bool kprobe_unused(struct kprobe *p)
{
        return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
               list_empty(&p->list);
}

/*
 * Keep all fields in the kprobe consistent: copy the saved opcode and the
 * arch-specific instruction data from 'ap' into 'p'.
 */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
        memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
        memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
}

#ifdef CONFIG_OPTPROBES
/*
 * Global switch for jump optimization.
 * NOTE: This is protected by 'kprobe_mutex'.
 */
static bool kprobes_allow_optimization;

/*
 * Call every 'kprobe::pre_handler' on the list of 'p', ignoring the return
 * values. Probes without a pre_handler and disabled probes are skipped.
 * This must be called from the arch-dependent optimized caller.
 */
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        struct kprobe *kp;

        list_for_each_entry_rcu(kp, &p->list, list) {
                if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
                        /* Publish the running probe for the duration of its handler. */
                        set_kprobe_instance(kp);
                        kp->pre_handler(kp, regs);
                }
                /* Cleared on every iteration, whether or not a handler ran. */
                reset_kprobe_instance();
        }
}
NOKPROBE_SYMBOL(opt_pre_handler);

/*
 * Tear down the optimized_kprobe that embeds 'p': release its optimized
 * instructions, its arch kprobe state, and then the structure itself.
 */
static void free_aggr_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op = container_of(p, struct optimized_kprobe, kp);

        arch_remove_optimized_kprobe(op);
        arch_remove_kprobe(p);
        kfree(op);
}

/*
 * Return non-zero if the kprobe is ready for optimization: it must be an
 * aggregator whose optimized instructions have been prepared.
 */
static inline int kprobe_optready(struct kprobe *p)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p))
                return 0;

        op = container_of(p, struct optimized_kprobe, kp);

        return arch_prepared_optinsn(&op->optinsn);
}

/* Return true if the kprobe is disarmed. Note: p must be on hash list */
bool kprobe_disarmed(struct kprobe *p)
{
        struct optimized_kprobe *op;

        /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
        if (!kprobe_aggrprobe(p))
                return kprobe_disabled(p);

        op = container_of(p, struct optimized_kprobe, kp);

        return kprobe_disabled(p) && list_empty(&op->list);
}

/*
 * Return true if the probe is an aggregator currently sitting on one of
 * the (un)optimizing lists.
 */
static bool kprobe_queued(struct kprobe *p)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p))
                return false;

        op = container_of(p, struct optimized_kprobe, kp);

        return !list_empty(&op->list);
}

/*
 * Find an optimized kprobe whose optimizing code replaces the
 * instructions that include 'addr' (the breakpoint address itself is
 * excluded). Returns that probe, or NULL if there is none.
 */
static struct kprobe *get_optimized_kprobe(kprobe_opcode_t *addr)
{
        struct optimized_kprobe *op;
        struct kprobe *found = NULL;
        int off;

        /* Offset 0 is skipped on purpose: that is the breakpoint case. */
        for (off = 1; off < MAX_OPTIMIZED_LENGTH / sizeof(kprobe_opcode_t); off++) {
                found = get_kprobe(addr - off);
                if (found)
                        break;
        }

        if (!found || !kprobe_optready(found))
                return NULL;

        op = container_of(found, struct optimized_kprobe, kp);
        if (!arch_within_optimized_kprobe(op, addr))
                return NULL;

        return found;
}

/*
 * Optimization staging lists, protected by 'kprobe_mutex':
 * probes waiting to be optimized, waiting to be unoptimized, and waiting
 * to be freed by the optimizer work.
 */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);

static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/* Delay handed to schedule_delayed_work() when kicking the optimizer. */
#define OPTIMIZE_DELAY 5

/*
 * Optimize (replace a breakpoint with a jump) the kprobes queued on
 * 'optimizing_list'. Does nothing while all kprobes are disarmed, while
 * optimization is disallowed, or when the list is empty.
 */
static void do_optimize_kprobes(void)
{
        lockdep_assert_held(&text_mutex);
        /*
         * Optimization/unoptimization reads 'online_cpus' through
         * stop_machine(), and cpu-hotplug modifies 'online_cpus'.
         * 'text_mutex' is taken both by cpu-hotplug and here, so the two
         * can deadlock: cpu-hotplug waits for 'text_mutex' while
         * stop_machine() cannot finish because 'online_cpus' has changed.
         * The caller must therefore have blocked cpu-hotplug before
         * taking 'text_mutex'.
         */
        lockdep_assert_cpus_held();

        /* Optimization is never done while disarmed. */
        if (kprobes_all_disarmed)
                return;
        if (!kprobes_allow_optimization)
                return;
        if (list_empty(&optimizing_list))
                return;

        arch_optimize_kprobes(&optimizing_list);
}

/*
 * Unoptimize (replace a jump with a breakpoint, and remove the breakpoint
 * if needed) the kprobes queued on 'unoptimizing_list'.
 */
static void do_unoptimize_kprobes(void)
{
        struct optimized_kprobe *op, *next;

        lockdep_assert_held(&text_mutex);
        /* See comment in do_optimize_kprobes() */
        lockdep_assert_cpus_held();

        if (!list_empty(&unoptimizing_list))
                arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);

        /* Walk 'freeing_list' to disarm probes and drop unused ones from the hash list. */
        list_for_each_entry_safe(op, next, &freeing_list, list) {
                struct kprobe *kp = &op->kp;

                /* Switching from detour code to origin */
                kp->flags &= ~KPROBE_FLAG_OPTIMIZED;

                /* Disarm probes if marked disabled and not gone */
                if (kprobe_disabled(kp) && !kprobe_gone(kp))
                        arch_disarm_kprobe(kp);

                if (!kprobe_unused(kp)) {
                        /* Still in use: just take it off 'freeing_list'. */
                        list_del_init(&op->list);
                        continue;
                }

                /*
                 * Remove unused probes from the hash list. They are
                 * reclaimed after waiting for synchronization
                 * (reclaiming is done by do_free_cleaned_kprobes()).
                 */
                hlist_del_rcu(&kp->hlist);
        }
}

/*
 * Reclaim all kprobes on 'freeing_list'. Each entry is taken off the list;
 * entries that are unexpectedly still in use trigger a one-time warning
 * and are not freed.
 */
static void do_free_cleaned_kprobes(void)
{
        struct optimized_kprobe *op, *next;

        list_for_each_entry_safe(op, next, &freeing_list, list) {
                list_del_init(&op->list);

                /*
                 * A probe still in use here must not happen; if it does,
                 * leave it on the kprobes hash list instead of freeing it.
                 */
                if (!WARN_ON_ONCE(!kprobe_unused(&op->kp)))
                        free_aggr_kprobe(&op->kp);
        }
}

/* Schedule 'optimizing_work' to run after OPTIMIZE_DELAY has passed. */
static void kick_kprobe_optimizer(void)
{
        schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}

/*
 * Kprobe jump optimizer: work function of 'optimizing_work'.
 *
 * Holds 'kprobe_mutex' for the whole run; steps 1-4 additionally run under
 * the CPU hotplug read lock and 'text_mutex'. Re-queues itself if either
 * staging list is still non-empty afterwards.
 */
static void kprobe_optimizer(struct work_struct *work)
{
        guard(mutex)(&kprobe_mutex);

        scoped_guard(cpus_read_lock) {
                guard(mutex)(&text_mutex);

                /*
                 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
                 * kprobes before waiting for quiescence period.
                 */
                do_unoptimize_kprobes();

                /*
                 * Step 2: Wait for quiescence period to ensure all potentially
                 * preempted tasks to have normally scheduled. Because optprobe
                 * may modify multiple instructions, there is a chance that Nth
                 * instruction is preempted. In that case, such tasks can return
                 * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
                 * Note that on non-preemptive kernel, this is transparently converted
                 * to synchronize_sched() to wait for all interrupts to have completed.
                 */
                synchronize_rcu_tasks();

                /* Step 3: Optimize kprobes after quiescence period */
                do_optimize_kprobes();

                /* Step 4: Free cleaned kprobes after quiescence period */
                do_free_cleaned_kprobes();
        }

        /* Step 5: Kick optimizer again if needed */
        if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
                kick_kprobe_optimizer();
}

/*
 * Wait until both 'optimizing_list' and 'unoptimizing_list' are empty.
 *
 * Must be called with 'kprobe_mutex' held. The mutex is dropped while
 * flushing the optimizer work (which takes 'kprobe_mutex' itself) and is
 * held again on return.
 */
static void wait_for_kprobe_optimizer_locked(void)
{
        lockdep_assert_held(&kprobe_mutex);

        while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
                mutex_unlock(&kprobe_mutex);

                /* This will also make 'optimizing_work' execute immediately */
                flush_delayed_work(&optimizing_work);
                /* 'optimizing_work' might not have been queued yet, relax */
                cpu_relax();

                mutex_lock(&kprobe_mutex);
        }
}

/*
 * Wait for completing optimization and unoptimization.
 * Takes 'kprobe_mutex' itself; see wait_for_kprobe_optimizer_locked().
 */
void wait_for_kprobe_optimizer(void)
{
        guard(mutex)(&kprobe_mutex);

        wait_for_kprobe_optimizer_locked();
}

/* Return true if 'op' is currently queued on 'unoptimizing_list'. */
bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
        struct optimized_kprobe *pos;

        list_for_each_entry(pos, &unoptimizing_list, list) {
                if (pos != op)
                        continue;
                return true;
        }

        return false;
}

/*
 * Optimize kprobe if p is ready to be optimized.
 *
 * Nothing is done when 'p' is not ready, optimization is disallowed, the
 * probe is disabled, all kprobes are disarmed, the probe has a
 * post_handler, or the arch check finds a conflict. Otherwise the probe
 * is either rescued from a pending unoptimization or queued on
 * 'optimizing_list' and the optimizer work is kicked.
 */
static void optimize_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;

        /* Check if the kprobe is disabled or not ready for optimization. */
        if (!kprobe_optready(p) || !kprobes_allow_optimization ||
            (kprobe_disabled(p) || kprobes_all_disarmed))
                return;

        /* kprobes with 'post_handler' can not be optimized */
        if (p->post_handler)
                return;

        op = container_of(p, struct optimized_kprobe, kp);

        /* Check there is no other kprobes at the optimized instructions */
        if (arch_check_optimized_kprobe(op) < 0)
                return;

        /* Check if it is already optimized. */
        if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
                if (optprobe_queued_unopt(op)) {
                        /* This is under unoptimizing. Just dequeue the probe */
                        list_del_init(&op->list);
                }
                return;
        }
        op->kp.flags |= KPROBE_FLAG_OPTIMIZED;

        /*
         * On the 'unoptimizing_list' and 'optimizing_list',
         * 'op' must have OPTIMIZED flag
         */
        if (WARN_ON_ONCE(!list_empty(&op->list)))
                return;

        list_add(&op->list, &optimizing_list);
        kick_kprobe_optimizer();
}

/*
 * Short cut to direct unoptimizing: unoptimize 'op' immediately through
 * the arch code and clear its OPTIMIZED flag. The caller must hold the
 * CPU hotplug lock.
 */
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
        lockdep_assert_cpus_held();
        arch_unoptimize_kprobe(op);
        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
}

/*
 * Unoptimize a kprobe if p is optimized.
 *
 * With 'force' set the code is rewritten right away; otherwise the probe
 * is queued on 'unoptimizing_list' for the optimizer work. A probe that is
 * still waiting on 'optimizing_list' is simply dequeued.
 */
static void unoptimize_kprobe(struct kprobe *p, bool force)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
                return; /* This is not an optprobe nor optimized */

        op = container_of(p, struct optimized_kprobe, kp);
        if (!kprobe_optimized(p))
                return;

        /* The probe is already queued on one of the staging lists. */
        if (!list_empty(&op->list)) {
                if (optprobe_queued_unopt(op)) {
                        /* Queued in unoptimizing queue */
                        if (force) {
                                /*
                                 * Forcibly unoptimize the kprobe here, and queue it
                                 * in the freeing list for release afterwards.
                                 */
                                force_unoptimize_kprobe(op);
                                list_move(&op->list, &freeing_list);
                        }
                } else {
                        /* Dequeue from the optimizing queue */
                        list_del_init(&op->list);
                        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
                }
                return;
        }

        /* Optimized kprobe case */
        if (force) {
                /* Forcibly update the code: this is a special case */
                force_unoptimize_kprobe(op);
        } else {
                list_add(&op->list, &unoptimizing_list);
                kick_kprobe_optimizer();
        }
}

/*
 * Cancel unoptimizing for reusing.
 *
 * Re-enables the unused aggregator 'ap' and tries to optimize it again.
 * Returns 0 on success, or -EINVAL if 'ap' is not ready for optimization
 * (the DISABLED flag has already been cleared by then).
 */
static int reuse_unused_kprobe(struct kprobe *ap)
{
        struct optimized_kprobe *op;

        /*
         * Unused kprobe MUST be on the way of delayed unoptimizing (means
         * there is still a relative jump) and disabled.
         */
        op = container_of(ap, struct optimized_kprobe, kp);
        WARN_ON_ONCE(list_empty(&op->list));
        /* Enable the probe again */
        ap->flags &= ~KPROBE_FLAG_DISABLED;
        /* Optimize it again. (remove from 'op->list') */
        if (!kprobe_optready(ap))
                return -EINVAL;

        optimize_kprobe(ap);
        return 0;
}

/*
 * Remove optimized instructions.
 *
 * Dequeues the optimized_kprobe embedding 'p' from any staging list,
 * clears its OPTIMIZED flag and releases its arch-specific optimized
 * instructions without touching the probed code.
 */
static void kill_optimized_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;

        op = container_of(p, struct optimized_kprobe, kp);
        if (!list_empty(&op->list))
                /* Dequeue from the (un)optimization queue */
                list_del_init(&op->list);
        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;

        if (kprobe_unused(p)) {
                /*
                 * Unused kprobe is on unoptimizing or freeing list. We move it
                 * to freeing_list and let the kprobe_optimizer() remove it from
                 * the kprobe hash list and free it.
                 *
                 * NOTE(review): 'op' was dequeued just above, so the
                 * optprobe_queued_unopt() check below looks like it can
                 * never succeed here -- confirm the intended behavior.
                 */
                if (optprobe_queued_unopt(op))
                        list_move(&op->list, &freeing_list);
        }

        /* Don't touch the code, because it is already freed. */
        arch_remove_optimized_kprobe(op);
}

/*
 * Ask the arch code to prepare optimized instructions for 'op' from 'p',
 * unless 'p' is an ftrace-based kprobe.
 */
static inline
void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
{
        if (!kprobe_ftrace(p))
                arch_prepare_optimized_kprobe(op, p);
}

/*
 * Try to prepare optimized instructions for the optimized_kprobe that
 * embeds 'p'.
 */
static void prepare_optimized_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;

        op = container_of(p, struct optimized_kprobe, kp);
        __prepare_optimized_kprobe(op, p);
}

/*
 * Allocate a zeroed optimized_kprobe for the address probed by 'p' and
 * try to prepare its optimized instructions.
 * Returns the embedded kprobe, or NULL if the allocation fails.
 */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op = kzalloc(sizeof(*op), GFP_KERNEL);

        if (!op)
                return NULL;

        INIT_LIST_HEAD(&op->list);
        op->kp.addr = p->addr;
        __prepare_optimized_kprobe(op, p);

        return &op->kp;
}

static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);

/*
 * Prepare an optimized_kprobe and optimize it.
 * NOTE: 'p' must be a normal registered kprobe.
 *
 * Silently falls back to the plain kprobe when 'p' is ftrace-based, the
 * allocation fails, or the arch code could not prepare the optimized
 * instructions.
 */
static void try_to_optimize_kprobe(struct kprobe *p)
{
        struct kprobe *ap;
        struct optimized_kprobe *op;

        /* Impossible to optimize ftrace-based kprobe. */
        if (kprobe_ftrace(p))
                return;

        /* For preparing optimization, jump_label_text_reserved() is called. */
        guard(cpus_read_lock)();
        guard(jump_label_lock)();
        guard(mutex)(&text_mutex);

        ap = alloc_aggr_kprobe(p);
        if (!ap)
                return;

        op = container_of(ap, struct optimized_kprobe, kp);
        if (!arch_prepared_optinsn(&op->optinsn)) {
                /* If failed to setup optimizing, fallback to kprobe. */
                arch_remove_optimized_kprobe(op);
                kfree(op);
                return;
        }

        init_aggr_kprobe(ap, p);
        optimize_kprobe(ap);        /* This just kicks optimizer thread. */
}

static void optimize_all_kprobes(void)
{
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i;

        guard(mutex)(&kprobe_mutex);
        /* If optimization is already allowed, just return. */
        if (kprobes_allow_optimization)
                return;

        cpus_read_lock();
        kprobes_allow_optimization = true;
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry(p, head, hlist)
                        if (!kprobe_disabled(p))
                                optimize_kprobe(p);
        }
        cpus_read_unlock();
        pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n");
}

#ifdef CONFIG_SYSCTL
/*
 * Prohibit jump optimization, queue every enabled kprobe in 'kprobe_table'
 * for unoptimization and wait until the optimizer has processed them.
 * Returns early if optimization is already prohibited.
 */
static void unoptimize_all_kprobes(void)
{
        struct kprobe *kp;
        unsigned int bucket;

        guard(mutex)(&kprobe_mutex);
        /* Nothing to do if optimization is already prohibited. */
        if (!kprobes_allow_optimization)
                return;

        cpus_read_lock();
        kprobes_allow_optimization = false;
        for (bucket = 0; bucket < KPROBE_TABLE_SIZE; bucket++) {
                hlist_for_each_entry(kp, &kprobe_table[bucket], hlist) {
                        if (kprobe_disabled(kp))
                                continue;
                        unoptimize_kprobe(kp, false);
                }
        }
        cpus_read_unlock();

        /* Wait for unoptimizing completion. */
        wait_for_kprobe_optimizer_locked();
        pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n");
}

/* Serializes the sysctl handler below. */
static DEFINE_MUTEX(kprobe_sysctl_mutex);
/* Integer mirror of 'kprobes_allow_optimization' exposed through sysctl. */
static int sysctl_kprobes_optimization;

/*
 * sysctl handler for "kprobes-optimization".
 *
 * Loads the current setting into the sysctl mirror, lets
 * proc_dointvec_minmax() read or update it, then switches optimization on
 * or off according to the resulting value. Returns the result of
 * proc_dointvec_minmax().
 */
static int proc_kprobes_optimization_handler(const struct ctl_table *table,
                                             int write, void *buffer,
                                             size_t *length, loff_t *ppos)
{
        int err;

        guard(mutex)(&kprobe_sysctl_mutex);

        if (kprobes_allow_optimization)
                sysctl_kprobes_optimization = 1;
        else
                sysctl_kprobes_optimization = 0;

        err = proc_dointvec_minmax(table, write, buffer, length, ppos);

        if (!sysctl_kprobes_optimization)
                unoptimize_all_kprobes();
        else
                optimize_all_kprobes();

        return err;
}

/*
 * sysctl table with the single "kprobes-optimization" entry: an int backed
 * by 'sysctl_kprobes_optimization', mode 0644, bounded by SYSCTL_ZERO and
 * SYSCTL_ONE.
 */
static const struct ctl_table kprobe_sysctls[] = {
        {
                .procname        = "kprobes-optimization",
                .data                = &sysctl_kprobes_optimization,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_kprobes_optimization_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/* Register 'kprobe_sysctls' under the "debug" sysctl directory. */
static void __init kprobe_sysctls_init(void)
{
        register_sysctl_init("debug", kprobe_sysctls);
}
#endif /* CONFIG_SYSCTL */

/*
 * Put a breakpoint for a probe.
 *
 * Requires 'text_mutex'. An optimized kprobe whose jump overlaps
 * 'p->addr' is first forced back to a breakpoint; after arming, 'p'
 * itself is offered for optimization.
 */
static void __arm_kprobe(struct kprobe *p)
{
        struct kprobe *_p;

        lockdep_assert_held(&text_mutex);

        /* Find the overlapping optimized kprobes. */
        _p = get_optimized_kprobe(p->addr);
        if (unlikely(_p))
                /* Fallback to unoptimized kprobe */
                unoptimize_kprobe(_p, true);

        arch_arm_kprobe(p);
        optimize_kprobe(p);        /* Try to optimize (add kprobe to a list) */
}

/*
 * Remove the breakpoint of a probe.
 *
 * Requires 'text_mutex'. The probe is unoptimized first (forcibly when all
 * kprobes are disarmed). If it is not left queued for the optimizer, it is
 * disarmed right away and, when 'reopt' is set, an overlapping optimized
 * kprobe is offered for optimization again.
 */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
        struct kprobe *_p;

        lockdep_assert_held(&text_mutex);

        /* Try to unoptimize */
        unoptimize_kprobe(p, kprobes_all_disarmed);

        if (!kprobe_queued(p)) {
                arch_disarm_kprobe(p);
                /* If another kprobe was blocked, re-optimize it. */
                _p = get_optimized_kprobe(p->addr);
                if (unlikely(_p) && reopt)
                        optimize_kprobe(_p);
        }
        /*
         * TODO: Since unoptimization and real disarming will be done by
         * the worker thread, we cannot check here whether other probes were
         * unoptimized because of this probe. They should be re-optimized
         * by the worker thread.
         */
}

#else /* !CONFIG_OPTPROBES */

/*
 * Without CONFIG_OPTPROBES the optimization hooks expand to empty
 * statements, arming/disarming map directly onto the arch helpers,
 * "disarmed" is the same as "disabled", and waiting for the optimizer
 * only asserts that 'kprobe_mutex' is held.
 */
#define optimize_kprobe(p)                        do {} while (0)
#define unoptimize_kprobe(p, f)                        do {} while (0)
#define kill_optimized_kprobe(p)                do {} while (0)
#define prepare_optimized_kprobe(p)                do {} while (0)
#define try_to_optimize_kprobe(p)                do {} while (0)
#define __arm_kprobe(p)                                arch_arm_kprobe(p)
#define __disarm_kprobe(p, o)                        arch_disarm_kprobe(p)
#define kprobe_disarmed(p)                        kprobe_disabled(p)
#define wait_for_kprobe_optimizer_locked()                        \
        lockdep_assert_held(&kprobe_mutex)

/*
 * !CONFIG_OPTPROBES variant: reusing an unused aggregator is never expected,
 * so this warns once and always returns -EINVAL.
 */
static int reuse_unused_kprobe(struct kprobe *ap)
{
        /*
         * If the optimized kprobe is NOT supported, the aggr kprobe is
         * released at the same time that the last aggregated kprobe is
         * unregistered.
         * Thus there should be no chance to reuse unused kprobe.
         */
        WARN_ON_ONCE(1);
        return -EINVAL;
}

/* Release the arch-specific state of the aggregator 'p', then free it. */
static void free_aggr_kprobe(struct kprobe *p)
{
        arch_remove_kprobe(p);
        kfree(p);
}

/*
 * Allocate a zero-filled kprobe to serve as an aggregator. 'p' is not
 * consulted in this variant. Returns NULL when the allocation fails.
 */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
        struct kprobe *ap;

        ap = kzalloc(sizeof(*ap), GFP_KERNEL);
        return ap;
}
#endif /* CONFIG_OPTPROBES */

#ifdef CONFIG_KPROBES_ON_FTRACE
/* ftrace_ops used for ftrace-based kprobes that have no post_handler. */
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
        .func = kprobe_ftrace_handler,
        .flags = FTRACE_OPS_FL_SAVE_REGS,
};

/* ftrace_ops used for ftrace-based kprobes that have a post_handler. */
static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
        .func = kprobe_ftrace_handler,
        .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};

/* Number of armed probes on each of the two ftrace_ops above. */
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
/* Set to true by kprobe_ftrace_kill(). */
bool kprobe_ftrace_disabled;

static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
                               int *cnt)
{
        int ret;

        lockdep_assert_held(&kprobe_mutex);

        ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
        if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
                return ret;

        if (*cnt == 0) {
                ret = register_ftrace_function(ops);
                if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) {
                        /*
                         * At this point, sinec ops is not registered, we should be sefe from
                         * registering empty filter.
                         */
                        ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
                        return ret;
                }
        }

        (*cnt)++;
        return ret;
}

/*
 * Arm an ftrace-based kprobe. Probes with a post_handler use the IPMODIFY
 * ops and its counter; all others use the plain kprobe ftrace ops.
 */
static int arm_kprobe_ftrace(struct kprobe *p)
{
        if (p->post_handler != NULL)
                return __arm_kprobe_ftrace(p, &kprobe_ipmodify_ops,
                                           &kprobe_ipmodify_enabled);

        return __arm_kprobe_ftrace(p, &kprobe_ftrace_ops,
                                   &kprobe_ftrace_enabled);
}

static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
                                  int *cnt)
{
        int ret;

        lockdep_assert_held(&kprobe_mutex);

        if (*cnt == 1) {
                ret = unregister_ftrace_function(ops);
                if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret))
                        return ret;
        }

        (*cnt)--;

        ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
        WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (error %d)\n",
                  p->addr, ret);
        return ret;
}

/*
 * Disarm an ftrace-based kprobe, picking the same ops/counter pair that
 * arm_kprobe_ftrace() selects for it (IPMODIFY when a post_handler is set).
 */
static int disarm_kprobe_ftrace(struct kprobe *p)
{
        if (p->post_handler != NULL)
                return __disarm_kprobe_ftrace(p, &kprobe_ipmodify_ops,
                                              &kprobe_ipmodify_enabled);

        return __disarm_kprobe_ftrace(p, &kprobe_ftrace_ops,
                                      &kprobe_ftrace_enabled);
}

/* Mark ftrace-based kprobes as disabled by setting 'kprobe_ftrace_disabled'. */
void kprobe_ftrace_kill(void)
{
        kprobe_ftrace_disabled = true;
}
#else        /* !CONFIG_KPROBES_ON_FTRACE */
/* !CONFIG_KPROBES_ON_FTRACE stub: arming via ftrace always fails with -ENODEV. */
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
        return -ENODEV;
}

/* !CONFIG_KPROBES_ON_FTRACE stub: disarming via ftrace always fails with -ENODEV. */
static inline int disarm_kprobe_ftrace(struct kprobe *p)
{
        return -ENODEV;
}
#endif

/*
 * Run the arch-specific preparation for 'p', using the ftrace variant
 * when the probe is flagged as ftrace-based.
 */
static int prepare_kprobe(struct kprobe *p)
{
        if (!kprobe_ftrace(p))
                return arch_prepare_kprobe(p);

        /* Must ensure p->addr is really on ftrace */
        return arch_prepare_kprobe_ftrace(p);
}

/*
 * Arm 'kp'. ftrace-based probes are delegated to arm_kprobe_ftrace(), whose
 * result is returned; all other probes are armed under the CPU hotplug read
 * lock and 'text_mutex', and 0 is returned.
 */
static int arm_kprobe(struct kprobe *kp)
{
        if (likely(!kprobe_ftrace(kp))) {
                guard(cpus_read_lock)();
                guard(mutex)(&text_mutex);
                __arm_kprobe(kp);
                return 0;
        }

        return arm_kprobe_ftrace(kp);
}

/*
 * Disarm 'kp'. ftrace-based probes are delegated to disarm_kprobe_ftrace(),
 * whose result is returned; all other probes are disarmed under the CPU
 * hotplug read lock and 'text_mutex', and 0 is returned. 'reopt' is passed
 * through to __disarm_kprobe().
 */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
        if (likely(!kprobe_ftrace(kp))) {
                guard(cpus_read_lock)();
                guard(mutex)(&text_mutex);
                __disarm_kprobe(kp, reopt);
                return 0;
        }

        return disarm_kprobe_ftrace(kp);
}

/*
 * Aggregate handlers for multiple kprobes support - these handlers
 * take care of invoking the individual kprobe handlers on p->list
 *
 * aggr_pre_handler() runs the pre_handler of every enabled probe on
 * 'p->list'. It returns 1 as soon as one of them returns non-zero,
 * otherwise 0 after the whole list has been walked.
 */
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        struct kprobe *kp;

        list_for_each_entry_rcu(kp, &p->list, list) {
                if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
                        set_kprobe_instance(kp);
                        /* Note: the instance is left set on this early return. */
                        if (kp->pre_handler(kp, regs))
                                return 1;
                }
                /* Reset after every entry, whether or not a handler ran. */
                reset_kprobe_instance();
        }
        return 0;
}
NOKPROBE_SYMBOL(aggr_pre_handler);

/*
 * Run the post_handler of every enabled probe on 'p->list', marking each
 * one as the current kprobe instance for the duration of its handler.
 */
static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
                              unsigned long flags)
{
        struct kprobe *child;

        list_for_each_entry_rcu(child, &p->list, list) {
                /* Skip probes without a post_handler and disabled ones. */
                if (!child->post_handler || unlikely(kprobe_disabled(child)))
                        continue;

                set_kprobe_instance(child);
                child->post_handler(child, regs, flags);
                reset_kprobe_instance();
        }
}
NOKPROBE_SYMBOL(aggr_post_handler);

/* Walks the list and increments 'nmissed' if 'p' has child probes. */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
        struct kprobe *kp;

        if (!kprobe_aggrprobe(p)) {
                p->nmissed++;
        } else {
                list_for_each_entry_rcu(kp, &p->list, list)
                        kp->nmissed++;
        }
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);

/*
 * Placeholder kprobe that kprobe_busy_begin() installs as 'current_kprobe';
 * its address is that of get_kprobe().
 */
static struct kprobe kprobe_busy = {
        .addr = (void *) get_kprobe,
};

void kprobe_busy_begin(void)
{
        struct kprobe_ctlblk *kcb;

        preempt_disable();
        __this_cpu_write(current_kprobe, &kprobe_busy);
        kcb = get_kprobe_ctlblk();
        kcb->kprobe_status = KPROBE_HIT_ACTIVE;
}

/*
 * Leave a "kprobe busy" section: clear this CPU's current kprobe and
 * re-enable preemption. Counterpart of kprobe_busy_begin().
 */
void kprobe_busy_end(void)
{
        __this_cpu_write(current_kprobe, NULL);
        preempt_enable();
}

/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
        if (p->post_handler)
                unoptimize_kprobe(ap, true);        /* Fall back to normal kprobe */

        list_add_rcu(&p->list, &ap->list);
        if (p->post_handler && !ap->post_handler)
                ap->post_handler = aggr_post_handler;

        return 0;
}

/*
 * Fill in the required fields of the aggregator kprobe. Replace the
 * earlier kprobe in the hlist with the aggregator kprobe.
 *
 * @ap: freshly allocated aggregator to initialize
 * @p:  existing probe that becomes the first child of @ap
 */
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
        /* Copy the insn slot of 'p' to 'ap'. */
        copy_kprobe(p, ap);
        flush_insn_slot(ap);
        ap->addr = p->addr;
        /* Inherit the flags of 'p' except the optimized bit. */
        ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
        ap->pre_handler = aggr_pre_handler;
        /* We don't care the kprobe which has gone. */
        if (p->post_handler && !kprobe_gone(p))
                ap->post_handler = aggr_post_handler;

        INIT_LIST_HEAD(&ap->list);
        INIT_HLIST_NODE(&ap->hlist);

        /* Link 'p' under 'ap' first, then swap 'ap' into the hash list. */
        list_add_rcu(&p->list, &ap->list);
        hlist_replace_rcu(&p->hlist, &ap->hlist);
}

/*
 * This registers the second or subsequent kprobe at the same address.
 *
 * @orig_p: probe (or aggregator) already registered at the address
 * @p:      new probe to add
 *
 * Returns 0 on success or a negative error code. On an arming failure the
 * new probe is unlinked from the aggregator again.
 */
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
        int ret = 0;
        struct kprobe *ap = orig_p;

        scoped_guard(cpus_read_lock) {
                /* For preparing optimization, jump_label_text_reserved() is called */
                guard(jump_label_lock)();
                guard(mutex)(&text_mutex);

                if (!kprobe_aggrprobe(orig_p)) {
                        /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
                        ap = alloc_aggr_kprobe(orig_p);
                        if (!ap)
                                return -ENOMEM;
                        init_aggr_kprobe(ap, orig_p);
                } else if (kprobe_unused(ap)) {
                        /* This probe is going to die. Rescue it */
                        ret = reuse_unused_kprobe(ap);
                        if (ret)
                                return ret;
                }

                if (kprobe_gone(ap)) {
                        /*
                         * Attempting to insert new probe at the same location that
                         * had a probe in the module vaddr area which already
                         * freed. So, the instruction slot has already been
                         * released. We need a new slot for the new probe.
                         */
                        ret = arch_prepare_kprobe(ap);
                        if (ret)
                                /*
                                 * Even if fail to allocate new slot, don't need to
                                 * free the 'ap'. It will be used next time, or
                                 * freed by unregister_kprobe().
                                 */
                                return ret;

                        /* Prepare optimized instructions if possible. */
                        prepare_optimized_kprobe(ap);

                        /*
                         * Clear gone flag to prevent allocating new slot again, and
                         * set disabled flag because it is not armed yet.
                         */
                        ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
                                        | KPROBE_FLAG_DISABLED;
                }

                /* Copy the insn slot of 'p' to 'ap'. */
                copy_kprobe(ap, p);
                ret = add_new_kprobe(ap, p);
        }

        /*
         * The locks taken in the scope above are released here. If the
         * aggregator was disabled but the new probe is enabled, enable the
         * aggregator and arm it unless all kprobes are disarmed.
         */
        if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
                ap->flags &= ~KPROBE_FLAG_DISABLED;
                if (!kprobes_all_disarmed) {
                        /* Arm the breakpoint again. */
                        ret = arm_kprobe(ap);
                        if (ret) {
                                /* Roll back: disable 'ap' again and unlink 'p'. */
                                ap->flags |= KPROBE_FLAG_DISABLED;
                                list_del_rcu(&p->list);
                                synchronize_rcu();
                        }
                }
        }
        return ret;
}

/*
 * Default (weak) check: report whether 'addr' lies inside the
 * [__kprobes_text_start, __kprobes_text_end) range.
 */
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
        const unsigned long start = (unsigned long)__kprobes_text_start;
        const unsigned long end = (unsigned long)__kprobes_text_end;

        /* The '__kprobes' functions and entry code must not be probed. */
        return start <= addr && addr < end;
}

/*
 * Return true when 'addr' is rejected by the arch blacklist check or
 * falls inside any [start_addr, end_addr) entry of 'kprobe_blacklist'.
 */
static bool __within_kprobe_blacklist(unsigned long addr)
{
        struct kprobe_blacklist_entry *entry;

        if (arch_within_kprobe_blacklist(addr))
                return true;

        /*
         * If 'kprobe_blacklist' is defined, check the address and
         * reject any probe registration in the prohibited area.
         */
        list_for_each_entry(entry, &kprobe_blacklist, list) {
                if (entry->start_addr <= addr && addr < entry->end_addr)
                        return true;
        }

        return false;
}

/*
 * Return true when 'addr' is blacklisted, either directly or, for a symbol
 * whose name contains a '.' suffix, through the symbol named by the part
 * before the first '.'.
 */
bool within_kprobe_blacklist(unsigned long addr)
{
        char symname[KSYM_NAME_LEN];
        unsigned long base;
        char *dot;

        if (__within_kprobe_blacklist(addr))
                return true;

        /* Check if the address is on a suffixed-symbol */
        if (lookup_symbol_name(addr, symname))
                return false;

        dot = strchr(symname, '.');
        if (!dot)
                return false;

        /* Cut the suffix off and look the bare name up. */
        *dot = '\0';
        base = (unsigned long)kprobe_lookup_name(symname, 0);

        return base && __within_kprobe_blacklist(base);
}

/*
 * arch_adjust_kprobe_addr - adjust the address
 * @addr: symbol base address
 * @offset: offset within the symbol
 * @on_func_entry: was this @addr+@offset on the function entry
 *
 * Typically returns @addr + @offset, except for special cases where the
 * function might be prefixed by a CFI landing pad, in that case any offset
 * inside the landing pad is mapped to the first 'real' instruction of the
 * symbol.
 *
 * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
 * instruction at +0.
 *
 * This weak default returns @addr + @offset and reports function entry
 * exactly when @offset is zero.
 */
kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
                                                unsigned long offset,
                                                bool *on_func_entry)
{
        unsigned long target = addr + offset;

        *on_func_entry = (offset == 0);
        return (kprobe_opcode_t *)target;
}

/*
 * If 'symbol_name' is specified, look it up and add the 'offset'
 * to it. This way, we can specify a relative address to a symbol.
 * This returns encoded errors if it fails to look up symbol or invalid
 * combination of parameters.
 *
 * Exactly one of 'addr' and 'symbol_name' must be given, otherwise
 * ERR_PTR(-EINVAL) is returned. ERR_PTR(-ENOENT) is returned when the
 * symbol or the resulting address cannot be resolved. '*on_func_entry'
 * is filled in by arch_adjust_kprobe_addr().
 */
static kprobe_opcode_t *
_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
             unsigned long offset, bool *on_func_entry)
{
        if ((symbol_name && addr) || (!symbol_name && !addr))
                return ERR_PTR(-EINVAL);

        if (symbol_name) {
                /*
                 * Input: @sym + @offset
                 * Output: @addr + @offset
                 *
                 * NOTE: kprobe_lookup_name() does *NOT* fold the offset
                 *       argument into its output!
                 */
                addr = kprobe_lookup_name(symbol_name, offset);
                if (!addr)
                        return ERR_PTR(-ENOENT);
        }

        /*
         * So here we have @addr + @offset, displace it into a new
         * @addr' + @offset' where @addr' is the symbol start address.
         */
        addr = (void *)addr + offset;
        /* 'offset' is overwritten with the offset from the symbol start. */
        if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
                return ERR_PTR(-ENOENT);
        addr = (void *)addr - offset;

        /*
         * Then ask the architecture to re-combine them, taking care of
         * magical function entry details while telling us if this was indeed
         * at the start of the function.
         */
        addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
        if (!addr)
                return ERR_PTR(-EINVAL);

        return addr;
}

/*
 * Resolve the probe address of 'p' from its addr/symbol_name/offset fields.
 * Returns the address or an ERR_PTR() value; the function-entry
 * information is discarded.
 */
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
        bool on_func_entry;

        return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
}

/*
 * Check the 'p' is valid and return the aggregator kprobe
 * at the same address.
 */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
        struct kprobe *ap, *list_p;

        lockdep_assert_held(&kprobe_mutex);

        ap = get_kprobe(p->addr);
        if (unlikely(!ap))
                return NULL;

        if (p == ap)
                return ap;

        list_for_each_entry(list_p, &ap->list, list)
                if (list_p == p)
                /* kprobe p is a valid probe */
                        return ap;

        return NULL;
}

/*
 * Warn and return error if the kprobe is being re-registered since
 * there must be a software bug.
 *
 * Takes 'kprobe_mutex' for the lookup. Returns -EINVAL when 'p' is
 * already registered, 0 otherwise.
 */
static inline int warn_kprobe_rereg(struct kprobe *p)
{
        guard(mutex)(&kprobe_mutex);

        return WARN_ON_ONCE(__get_valid_kprobe(p)) ? -EINVAL : 0;
}

/*
 * Handle probes placed exactly on an ftrace location: with
 * CONFIG_KPROBES_ON_FTRACE the probe is flagged KPROBE_FLAG_FTRACE,
 * without it the probe is rejected with -EINVAL. Any other address
 * yields 0 without touching the flags.
 */
static int check_ftrace_location(struct kprobe *p)
{
        unsigned long addr = (unsigned long)p->addr;

        if (ftrace_location(addr) != addr)
                return 0;

#ifdef CONFIG_KPROBES_ON_FTRACE
        p->flags |= KPROBE_FLAG_FTRACE;
        return 0;
#else
        return -EINVAL;
#endif
}

static bool is_cfi_preamble_symbol(unsigned long addr)
{
        char symbuf[KSYM_NAME_LEN];

        if (lookup_symbol_name(addr, symbuf))
                return false;

        return str_has_prefix(symbuf, "__cfi_") ||
                str_has_prefix(symbuf, "__pfx_");
}

/*
 * Validate that 'p->addr' may be probed.
 *
 * @p:          probe to check; may get KPROBE_FLAG_FTRACE set
 * @probed_mod: set to the module containing the address, or NULL for core
 *              kernel text
 *
 * On success (return 0) a reference on '*probed_mod' is held when it is
 * non-NULL. On every error path that runs after the reference was taken,
 * it is dropped again before returning.
 */
static int check_kprobe_address_safe(struct kprobe *p,
                                     struct module **probed_mod)
{
        int ret;

        ret = check_ftrace_location(p);
        if (ret)
                return ret;

        guard(jump_label_lock)();

        /* Ensure the address is in a text area, and find a module if exists. */
        *probed_mod = NULL;
        if (!core_kernel_text((unsigned long) p->addr)) {
                /* The module lookup is done inside an RCU read-side section. */
                guard(rcu)();
                *probed_mod = __module_text_address((unsigned long) p->addr);
                if (!(*probed_mod))
                        return -EINVAL;

                /*
                 * We must hold a refcount of the probed module while updating
                 * its code to prohibit unexpected unloading.
                 */
                if (unlikely(!try_module_get(*probed_mod)))
                        return -ENOENT;
        }
        /* Ensure it is not in reserved area. */
        if (in_gate_area_no_mm((unsigned long) p->addr) ||
            within_kprobe_blacklist((unsigned long) p->addr) ||
            jump_label_text_reserved(p->addr, p->addr) ||
            static_call_text_reserved(p->addr, p->addr) ||
            find_bug((unsigned long)p->addr) ||
            is_cfi_preamble_symbol((unsigned long)p->addr)) {
                /*
                 * NOTE(review): '*probed_mod' is NULL here for core kernel
                 * text; this relies on module_put() accepting NULL.
                 */
                module_put(*probed_mod);
                return -EINVAL;
        }

        /*
         * Reject __init functions for loaded modules (the module refcount
         * was already taken above).
         */
        if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) {
                /*
                 * If the module freed '.init.text', we couldn't insert
                 * kprobes in there.
                 */
                if (within_module_init((unsigned long)p->addr, *probed_mod) &&
                    !module_is_coming(*probed_mod)) {
                        module_put(*probed_mod);
                        return -ENOENT;
                }
        }

        return 0;
}

/*
 * Insert 'p' into the kprobe hash table and arm it if needed.
 *
 * If a probe already exists at 'p->addr', 'p' is added to it through
 * register_aggr_kprobe(). Otherwise the probe is prepared, published in
 * 'kprobe_table' and, unless it is disabled or all kprobes are disarmed,
 * armed.
 *
 * Runs with 'kprobe_mutex' held for its whole duration.
 *
 * Returns 0 on success or a negative error code. When arming fails, the
 * probe is removed from the hash table again and the arming error is
 * returned to the caller.
 */
static int __register_kprobe(struct kprobe *p)
{
        int ret;
        struct kprobe *old_p;

        guard(mutex)(&kprobe_mutex);

        old_p = get_kprobe(p->addr);
        if (old_p)
                /* Since this may unoptimize 'old_p', locking 'text_mutex'. */
                return register_aggr_kprobe(old_p, p);

        scoped_guard(cpus_read_lock) {
                /* Prevent text modification */
                guard(mutex)(&text_mutex);
                ret = prepare_kprobe(p);
                if (ret)
                        return ret;
        }

        INIT_HLIST_NODE(&p->hlist);
        hlist_add_head_rcu(&p->hlist,
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);

        if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
                ret = arm_kprobe(p);
                if (ret) {
                        /*
                         * Undo the publication and report the failure.
                         * The probe must not be optimized or reported as
                         * registered once it is off the hash table.
                         */
                        hlist_del_rcu(&p->hlist);
                        synchronize_rcu();
                        return ret;
                }
        }

        /* Try to optimize kprobe */
        try_to_optimize_kprobe(p);
        return 0;
}

/*
 * Register the kprobe 'p'.
 *
 * The probe address is resolved from addr/symbol_name/offset and stored
 * back into 'p->addr', re-registration is rejected, the flags are reduced
 * to KPROBE_FLAG_DISABLED (plus KPROBE_FLAG_ON_FUNC_ENTRY when applicable),
 * the address is checked for safety and the probe is finally inserted by
 * __register_kprobe().
 *
 * Returns 0 on success or a negative error code.
 */
int register_kprobe(struct kprobe *p)
{
        int ret;
        struct module *probed_mod;
        kprobe_opcode_t *addr;
        bool on_func_entry;

        /* Canonicalize probe address from symbol */
        addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
        if (IS_ERR(addr))
                return PTR_ERR(addr);
        p->addr = addr;

        ret = warn_kprobe_rereg(p);
        if (ret)
                return ret;

        /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
        p->flags &= KPROBE_FLAG_DISABLED;
        if (on_func_entry)
                p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
        p->nmissed = 0;
        INIT_LIST_HEAD(&p->list);

        ret = check_kprobe_address_safe(p, &probed_mod);
        if (ret)
                return ret;

        ret = __register_kprobe(p);

        /* Drop the module reference taken by check_kprobe_address_safe(). */
        if (probed_mod)
                module_put(probed_mod);

        return ret;
}
EXPORT_SYMBOL_GPL(register_kprobe);

/* Check if all probes on the 'ap' are disabled. */
static bool aggr_kprobe_disabled(struct kprobe *ap)
{
        struct kprobe *kp;

        lockdep_assert_held(&kprobe_mutex);

        list_for_each_entry(kp, &ap->list, list)
                if (!kprobe_disabled(kp))
                        /*
                         * Since there is an active probe on the list,
                         * we can't disable this 'ap'.
                         */
                        return false;

        return true;
}

/*
 * Disable 'p' and, when it is a standalone probe or the last enabled child
 * of its aggregator, disarm and disable the registered probe as well.
 *
 * Returns the probe registered at the address of 'p' (either 'p' itself
 * or its aggregator), ERR_PTR(-EINVAL) when 'p' is not registered, or
 * ERR_PTR() of the disarm error. The caller must hold 'kprobe_mutex'.
 */
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
        struct kprobe *orig_p;
        int ret;

        lockdep_assert_held(&kprobe_mutex);

        /* Get an original kprobe for return */
        orig_p = __get_valid_kprobe(p);
        if (unlikely(orig_p == NULL))
                return ERR_PTR(-EINVAL);

        /* Already disabled: nothing more to do. */
        if (kprobe_disabled(p))
                return orig_p;

        /* Disable probe if it is a child probe */
        if (p != orig_p)
                p->flags |= KPROBE_FLAG_DISABLED;

        /* Try to disarm and disable this/parent probe */
        if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
                /*
                 * Don't be lazy here.  Even if 'kprobes_all_disarmed'
                 * is false, 'orig_p' might not have been armed yet.
                 * Note arm_all_kprobes() __tries__ to arm all kprobes
                 * on the best effort basis.
                 */
                if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
                        ret = disarm_kprobe(orig_p, true);
                        if (ret) {
                                /* Disarming failed: undo the disable of 'p'. */
                                p->flags &= ~KPROBE_FLAG_DISABLED;
                                return ERR_PTR(ret);
                        }
                }
                orig_p->flags |= KPROBE_FLAG_DISABLED;
        }

        return orig_p;
}

/*
 * Unregister a kprobe without a scheduler synchronization.
 *
 * This is the first phase of unregistration: the probe is disabled and
 * unlinked from the hash table or from its aggregator's list. Returns 0 on
 * success or the error reported by __disable_kprobe().
 */
static int __unregister_kprobe_top(struct kprobe *p)
{
        struct kprobe *ap, *list_p;

        /* Disable kprobe. This will disarm it if needed. */
        ap = __disable_kprobe(p);
        if (IS_ERR(ap))
                return PTR_ERR(ap);

        WARN_ON(ap != p && !kprobe_aggrprobe(ap));

        /*
         * If the probe is an independent(and non-optimized) kprobe
         * (not an aggrprobe), the last kprobe on the aggrprobe, or
         * kprobe is already disarmed, just remove from the hash list.
         */
        if (ap == p ||
                (list_is_singular(&ap->list) && kprobe_disarmed(ap))) {
                /*
                 * !disarmed could be happen if the probe is under delayed
                 * unoptimizing.
                 */
                hlist_del_rcu(&ap->hlist);
                return 0;
        }

        /* If disabling probe has special handlers, update aggrprobe */
        if (p->post_handler && !kprobe_gone(p)) {
                /* Look for any other child that still has a post_handler. */
                list_for_each_entry(list_p, &ap->list, list) {
                        if ((list_p != p) && (list_p->post_handler))
                                break;
                }
                /* No other probe has post_handler */
                if (list_entry_is_head(list_p, &ap->list, list)) {
                        /*
                         * For the kprobe-on-ftrace case, we keep the
                         * post_handler setting to identify this aggrprobe
                         * armed with kprobe_ipmodify_ops.
                         */
                        if (!kprobe_ftrace(ap))
                                ap->post_handler = NULL;
                }
        }

        /*
         * Remove from the aggrprobe: this path will do nothing in
         * __unregister_kprobe_bottom().
         */
        list_del_rcu(&p->list);
        if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
                /*
                 * Try to optimize this probe again, because post
                 * handler may have been changed.
                 */
                optimize_kprobe(ap);
        return 0;

}

/*
 * Second phase of unregistration: release the arch state of an independent
 * probe, or free the aggregator when 'p' was its last child.
 */
static void __unregister_kprobe_bottom(struct kprobe *p)
{
        struct kprobe *ap;

        if (list_empty(&p->list)) {
                /* This is an independent kprobe */
                arch_remove_kprobe(p);
                return;
        }

        /* Neither empty nor singular: do nothing. */
        if (!list_is_singular(&p->list))
                return;

        /* This is the last child of an aggrprobe */
        ap = list_entry(p->list.next, struct kprobe, list);
        list_del(&p->list);
        free_aggr_kprobe(ap);
}

/*
 * Register 'num' kprobes from the array 'kps'.
 *
 * Returns -EINVAL when 'num' is not positive. When one registration fails,
 * the probes registered so far are unregistered again and the error of the
 * failing registration is returned. Otherwise the result of the last
 * register_kprobe() call is returned.
 */
int register_kprobes(struct kprobe **kps, int num)
{
        int idx;
        int ret = 0;

        if (num <= 0)
                return -EINVAL;

        for (idx = 0; idx < num; idx++) {
                ret = register_kprobe(kps[idx]);
                if (ret >= 0)
                        continue;

                /* Roll back everything registered before the failure. */
                if (idx > 0)
                        unregister_kprobes(kps, idx);
                return ret;
        }

        return ret;
}
EXPORT_SYMBOL_GPL(register_kprobes);

/* Unregister a single kprobe; a one-element wrapper around unregister_kprobes(). */
void unregister_kprobe(struct kprobe *p)
{
        unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);

/*
 * Unregister 'num' kprobes from the array 'kps'.
 *
 * Works in two phases separated by synchronize_rcu(): first every probe is
 * unlinked under 'kprobe_mutex', then the remaining cleanup is done for
 * each probe. A probe whose first phase failed gets its 'addr' cleared and
 * is skipped in the second phase. Does nothing when 'num' is not positive.
 */
void unregister_kprobes(struct kprobe **kps, int num)
{
        int i;

        if (num <= 0)
                return;
        scoped_guard(mutex, &kprobe_mutex) {
                for (i = 0; i < num; i++)
                        if (__unregister_kprobe_top(kps[i]) < 0)
                                /* Mark as failed so the bottom half skips it. */
                                kps[i]->addr = NULL;
        }
        synchronize_rcu();
        for (i = 0; i < num; i++)
                if (kps[i]->addr)
                        __unregister_kprobe_bottom(kps[i]);
}
EXPORT_SYMBOL_GPL(unregister_kprobes);

/*
 * Default (weak) exception notifier callback: ignores its arguments and
 * returns NOTIFY_DONE.
 */
int __weak kprobe_exceptions_notify(struct notifier_block *self,
                                        unsigned long val, void *data)
{
        return NOTIFY_DONE;
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);

/* Notifier block that routes to kprobe_exceptions_notify() at the highest priority. */
static struct notifier_block kprobe_exceptions_nb = {
        .notifier_call = kprobe_exceptions_notify,
        .priority = 0x7fffffff /* we need to be notified first */
};

#ifdef CONFIG_KRETPROBES

#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)

/* callbacks for objpool of kretprobe instances */

/*
 * Per-object init callback: treat 'nod' as a kretprobe_instance and store
 * 'context' in its 'rph' field. Always returns 0.
 */
static int kretprobe_init_inst(void *nod, void *context)
{
        struct kretprobe_instance *ri = nod;

        ri->rph = context;
        return 0;
}
/*
 * Pool release callback: frees 'context' (presumably the kretprobe_holder
 * that owns the pool - confirm against the objpool_init() caller).
 * Always returns 0.
 */
static int kretprobe_fini_pool(struct objpool_head *head, void *context)
{
        kfree(context);
        return 0;
}

/*
 * RCU callback: drop the kretprobe_instance that embeds 'head' from the
 * object pool of its holder.
 */
static void free_rp_inst_rcu(struct rcu_head *head)
{
        struct kretprobe_instance *ri;

        ri = container_of(head, struct kretprobe_instance, rcu);
        objpool_drop(ri, &ri->rph->pool);
}
NOKPROBE_SYMBOL(free_rp_inst_rcu);

/*
 * Give a kretprobe_instance back: push it onto the pool of its kretprobe
 * when get_kretprobe() returns one, otherwise hand it to
 * free_rp_inst_rcu() through call_rcu().
 */
static void recycle_rp_inst(struct kretprobe_instance *ri)
{
        struct kretprobe *rp = get_kretprobe(ri);

        if (unlikely(!rp)) {
                call_rcu(&ri->rcu, free_rp_inst_rcu);
                return;
        }

        objpool_push(ri, &rp->rph->pool);
}
NOKPROBE_SYMBOL(recycle_rp_inst);

/*
 * This function is called from delayed_put_task_struct() when a task is
 * dead and cleaned up to recycle any kretprobe instances associated with
 * this task. These left over instances represent probed functions that
 * have been called but will never return.
 */
void kprobe_flush_task(struct task_struct *tk)
{
        struct llist_node *pos;

        /* Early boot, not yet initialized. */
        if (unlikely(!kprobes_initialized))
                return;

        kprobe_busy_begin();

        pos = __llist_del_all(&tk->kretprobe_instances);
        while (pos) {
                /* Fetch the successor before the instance is recycled. */
                struct llist_node *next = pos->next;

                recycle_rp_inst(container_of(pos, struct kretprobe_instance,
                                             llist));
                pos = next;
        }

        kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);

/*
 * Detach the holder from 'rp' and finalize its object pool. Does nothing
 * when 'rp' has no holder.
 */
static inline void free_rp_inst(struct kretprobe *rp)
{
        struct kretprobe_holder *holder = rp->rph;

        if (holder) {
                rp->rph = NULL;
                objpool_fini(&holder->pool);
        }
}

/*
 * This assumes 'tsk' is the current task or a task that is not running.
 *
 * Walk the kretprobe instances of 'tsk', starting after '*cur' (or from
 * the head when '*cur' is NULL), and return the first saved return address
 * that is not the kretprobe trampoline. '*cur' is updated to the matching
 * node. Returns NULL when no such entry is left.
 */
static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
                                                  struct llist_node **cur)
{
        struct kretprobe_instance *ri;
        struct llist_node *pos;

        pos = *cur ? (*cur)->next : tsk->kretprobe_instances.first;

        for (; pos; pos = pos->next) {
                ri = container_of(pos, struct kretprobe_instance, llist);
                if (ri->ret_addr == kretprobe_trampoline_addr())
                        continue;

                *cur = pos;
                return ri->ret_addr;
        }

        return NULL;
}
NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);

/**
 * kretprobe_find_ret_addr -- Find correct return address modified by kretprobe
 * @tsk: Target task
 * @fp: A frame pointer
 * @cur: a storage of the loop cursor llist_node pointer for next call
 *
 * Find the correct return address modified by a kretprobe on @tsk in unsigned
 * long type. If it finds the return address, this returns that address value,
 * or this returns 0.
 * The @tsk must be 'current' or a task which is not running. @fp is a hint
 * to get the correct return address - which is compared with the
 * kretprobe_instance::fp field. The @cur is a loop cursor for searching the
 * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
 * first call, but '@cur' itself must NOT be NULL.
 */
unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
                                      struct llist_node **cur)
{
        struct kretprobe_instance *ri;
        kprobe_opcode_t *found;

        if (WARN_ON_ONCE(!cur))
                return 0;

        for (;;) {
                found = __kretprobe_find_ret_addr(tsk, cur);
                if (!found)
                        break;

                /* Accept the entry only when its frame pointer matches. */
                ri = container_of(*cur, struct kretprobe_instance, llist);
                if (ri->fp == fp)
                        break;
        }

        return (unsigned long)found;
}
NOKPROBE_SYMBOL(kretprobe_find_ret_addr);

/*
 * Default (weak) hook with an empty body; architectures may override it
 * to rewrite the fake return address with 'correct_ret_addr'.
 */
void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
                                        kprobe_opcode_t *correct_ret_addr)
{
        /*
         * Do nothing by default. Please fill this to update the fake return
         * address on the stack with the correct one on each arch if possible.
         */
}

/*
 * Common part of the kretprobe trampoline.
 *
 * Finds the real return address for the returning frame, runs the user
 * handler of each kretprobe instance on current's list up to and including
 * the node that carries that address, then unlinks and recycles those
 * instances. Returns the real return address.
 */
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
                                             void *frame_pointer)
{
        struct kretprobe_instance *ri = NULL;
        struct llist_node *first, *node = NULL;
        kprobe_opcode_t *correct_ret_addr;
        struct kretprobe *rp;

        /* Find correct address and all nodes for this frame. */
        correct_ret_addr = __kretprobe_find_ret_addr(current, &node);
        if (!correct_ret_addr) {
                pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n");
                BUG_ON(1);
        }

        /*
         * Set the return address as the instruction pointer, because if the
         * user handler calls stack_trace_save_regs() with this 'regs',
         * the stack trace will start from the instruction pointer.
         */
        instruction_pointer_set(regs, (unsigned long)correct_ret_addr);

        /* Run the user handler of the nodes. */
        first = current->kretprobe_instances.first;
        while (first) {
                ri = container_of(first, struct kretprobe_instance, llist);

                /* Every instance handled here is expected to share this frame. */
                if (WARN_ON_ONCE(ri->fp != frame_pointer))
                        break;

                rp = get_kretprobe(ri);
                if (rp && rp->handler) {
                        /* Save and restore the per-CPU current_kprobe around the call. */
                        struct kprobe *prev = kprobe_running();

                        __this_cpu_write(current_kprobe, &rp->kp);
                        ri->ret_addr = correct_ret_addr;
                        rp->handler(ri, regs);
                        __this_cpu_write(current_kprobe, prev);
                }
                /* 'node' is the last instance belonging to this frame. */
                if (first == node)
                        break;

                first = first->next;
        }

        arch_kretprobe_fixup_return(regs, correct_ret_addr);

        /* Unlink all nodes for this frame. */
        first = current->kretprobe_instances.first;
        current->kretprobe_instances.first = node->next;
        node->next = NULL;

        /* Recycle free instances. */
        while (first) {
                ri = container_of(first, struct kretprobe_instance, llist);
                /* Advance before recycling; 'ri' must not be touched afterwards. */
                first = first->next;

                recycle_rp_inst(ri);
        }

        return (unsigned long)correct_ret_addr;
}
NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)

/*
 * Entry-side kprobe pre_handler shared by every kretprobe. On a probe hit
 * it takes a free instance from the pool and sets up the return probe.
 */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
        struct kretprobe *rp = container_of(p, struct kretprobe, kp);
        struct kretprobe_holder *holder = rp->rph;
        struct kretprobe_instance *inst = objpool_pop(&holder->pool);

        if (!inst) {
                /* No instance available: account the missed hit. */
                rp->nmissed++;
                return 0;
        }

        if (!rp->entry_handler || !rp->entry_handler(inst, regs)) {
                arch_prepare_kretprobe(inst, regs);
                __llist_add(&inst->llist, &current->kretprobe_instances);
        } else {
                /* Entry handler returned non-zero: hand the instance back. */
                objpool_push(inst, &holder->pool);
        }

        return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
#else /* CONFIG_KRETPROBE_ON_RETHOOK */
/*
 * Entry-side kprobe pre_handler shared by every kretprobe (rethook flavour).
 * On a probe hit it grabs a rethook node and hooks the function return.
 */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
        struct kretprobe *rp = container_of(p, struct kretprobe, kp);
        struct rethook_node *hook = rethook_try_get(rp->rh);
        struct kretprobe_instance *inst;

        if (!hook) {
                /* No node available: account the missed hit. */
                rp->nmissed++;
                return 0;
        }

        inst = container_of(hook, struct kretprobe_instance, node);

        if (rp->entry_handler && rp->entry_handler(inst, regs)) {
                /* Entry handler returned non-zero: recycle the node unused. */
                rethook_recycle(hook);
                return 0;
        }

        rethook_hook(hook, regs, kprobe_ftrace(p));
        return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);

/*
 * rethook callback for kretprobes: runs the user return handler of the
 * kretprobe passed in @data with current_kprobe set for its duration.
 */
static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
                                      unsigned long ret_addr,
                                      struct pt_regs *regs)
{
        struct kretprobe *rp = (struct kretprobe *)data;
        struct kretprobe_instance *inst;
        struct kprobe_ctlblk *ctl;

        /* A NULL @data means the rethook data structure is broken. */
        if (WARN_ON_ONCE(!data))
                return;

        /* Nothing to run without a return handler. */
        if (!rp->handler)
                return;

        __this_cpu_write(current_kprobe, &rp->kp);
        ctl = get_kprobe_ctlblk();
        ctl->kprobe_status = KPROBE_HIT_ACTIVE;

        inst = container_of(rh, struct kretprobe_instance, node);
        rp->handler(inst, regs);

        __this_cpu_write(current_kprobe, NULL);
}
NOKPROBE_SYMBOL(kretprobe_rethook_handler);

#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */

/**
 * kprobe_on_func_entry() -- check whether given address is function entry
 * @addr: Target address
 * @sym:  Target symbol name
 * @offset: The offset from the symbol or the address
 *
 * Check whether @addr+@offset or @sym+@offset lies on a function entry
 * address.
 * Returns 0 if it is the function entry, or -EINVAL if it is not.
 * Returns -ENOENT if the symbol or address lookup fails.
 * The caller must pass either @addr or @sym (exactly one of them must be
 * NULL); otherwise this returns -EINVAL.
 */
int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
        bool at_entry;
        kprobe_opcode_t *resolved;

        resolved = _kprobe_addr(addr, sym, offset, &at_entry);
        if (IS_ERR(resolved))
                return PTR_ERR(resolved);

        return at_entry ? 0 : -EINVAL;
}

/*
 * Register a kretprobe.
 *
 * Validates that the probe point is a function entry and is not on the
 * kretprobe blacklist, allocates the instance storage and registers the
 * entry kprobe with pre_handler_kretprobe as its pre_handler.
 *
 * Returns 0 on success or a negative errno: -EINVAL for a re-registration
 * or a blacklisted address, -E2BIG if rp->data_size exceeds
 * KRETPROBE_MAX_DATA_SIZE, -ENOMEM on allocation failure, or whatever
 * kprobe_on_func_entry(), kprobe_addr(), rethook_alloc() or
 * register_kprobe() reported.
 */
int register_kretprobe(struct kretprobe *rp)
{
        int ret;
        int i;
        void *addr;

        ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
        if (ret)
                return ret;

        /* If only 'rp->kp.addr' is specified, check reregistering kprobes */
        if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
                return -EINVAL;

        /* Reject probe points whose address is on the kretprobe blacklist. */
        if (kretprobe_blacklist_size) {
                addr = kprobe_addr(&rp->kp);
                if (IS_ERR(addr))
                        return PTR_ERR(addr);

                for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
                        if (kretprobe_blacklist[i].addr == addr)
                                return -EINVAL;
                }
        }

        if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
                return -E2BIG;

        rp->kp.pre_handler = pre_handler_kretprobe;
        rp->kp.post_handler = NULL;

        /* Pre-allocate memory for max kretprobe instances */
        if (rp->maxactive <= 0)
                rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());

#ifdef CONFIG_KRETPROBE_ON_RETHOOK
        rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler,
                                sizeof(struct kretprobe_instance) +
                                rp->data_size, rp->maxactive);
        if (IS_ERR(rp->rh))
                return PTR_ERR(rp->rh);

        rp->nmissed = 0;
        /* Establish function entry probe point */
        ret = register_kprobe(&rp->kp);
        if (ret != 0) {
                /* Registration failed: release the rethook allocated above. */
                rethook_free(rp->rh);
                rp->rh = NULL;
        }
#else        /* !CONFIG_KRETPROBE_ON_RETHOOK */
        rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
        if (!rp->rph)
                return -ENOMEM;

        /* Each pool object holds an instance followed by rp->data_size bytes. */
        if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size +
                        sizeof(struct kretprobe_instance), GFP_KERNEL,
                        rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) {
                kfree(rp->rph);
                rp->rph = NULL;
                return -ENOMEM;
        }
        rcu_assign_pointer(rp->rph->rp, rp);
        rp->nmissed = 0;
        /* Establish function entry probe point */
        ret = register_kprobe(&rp->kp);
        if (ret != 0)
                free_rp_inst(rp);
#endif
        return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);

/*
 * Register @num kretprobes from the array @rps. If one registration fails,
 * the probes registered so far are unregistered again and the failing
 * return value is passed back. Returns -EINVAL when @num is not positive.
 */
int register_kretprobes(struct kretprobe **rps, int num)
{
        int idx, err = 0;

        if (num <= 0)
                return -EINVAL;

        for (idx = 0; idx < num; idx++) {
                err = register_kretprobe(rps[idx]);
                if (err >= 0)
                        continue;

                /* Roll back the probes registered before the failure. */
                if (idx > 0)
                        unregister_kretprobes(rps, idx);
                return err;
        }

        return err;
}
EXPORT_SYMBOL_GPL(register_kretprobes);

/* Unregister a single kretprobe; a one-element call to unregister_kretprobes(). */
void unregister_kretprobe(struct kretprobe *rp)
{
        unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);

/*
 * Unregister @num kretprobes from the array @rps.
 *
 * Works in two passes separated by synchronize_rcu(): the first pass
 * detaches each probe, the second finishes the unregistration and (without
 * rethook) frees the instance storage. Does nothing if @num is not positive.
 */
void unregister_kretprobes(struct kretprobe **rps, int num)
{
        int i;

        if (num <= 0)
                return;
        for (i = 0; i < num; i++) {
                /* kprobe_mutex is taken through guard() inside the loop body. */
                guard(mutex)(&kprobe_mutex);

                /* A NULL addr marks this probe to be skipped by the second pass. */
                if (__unregister_kprobe_top(&rps[i]->kp) < 0)
                        rps[i]->kp.addr = NULL;
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
                rethook_free(rps[i]->rh);
#else
                rcu_assign_pointer(rps[i]->rph->rp, NULL);
#endif
        }

        synchronize_rcu();
        for (i = 0; i < num; i++) {
                if (rps[i]->kp.addr) {
                        __unregister_kprobe_bottom(&rps[i]->kp);
#ifndef CONFIG_KRETPROBE_ON_RETHOOK
                        free_rp_inst(rps[i]);
#endif
                }
        }
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);

#else /* CONFIG_KRETPROBES */
/* Stub for builds without CONFIG_KRETPROBES: always fails with -EOPNOTSUPP. */
int register_kretprobe(struct kretprobe *rp)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobe);

/* Stub for builds without CONFIG_KRETPROBES: always fails with -EOPNOTSUPP. */
int register_kretprobes(struct kretprobe **rps, int num)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobes);

/* Stub for builds without CONFIG_KRETPROBES: nothing to unregister. */
void unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);

/* Stub for builds without CONFIG_KRETPROBES: nothing to unregister. */
void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);

/*
 * Stub for builds without CONFIG_KRETPROBES: does nothing and returns 0.
 * NOTE(review): presumably kept so that code comparing a pre_handler against
 * this symbol (e.g. report_probe()) still builds -- confirm.
 */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
        return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);

#endif /* CONFIG_KRETPROBES */

/* Set the kprobe gone and remove its instruction buffer. */
static void kill_kprobe(struct kprobe *p)
{
        struct kprobe *kp;

        lockdep_assert_held(&kprobe_mutex);

        /*
         * The module is going away. We should disarm the kprobe which
         * is using ftrace, because ftrace framework is still available at
         * 'MODULE_STATE_GOING' notification.
         */
        if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
                disarm_kprobe_ftrace(p);

        p->flags |= KPROBE_FLAG_GONE;
        if (kprobe_aggrprobe(p)) {
                /*
                 * If this is an aggr_kprobe, we have to list all the
                 * chained probes and mark them GONE.
                 */
                list_for_each_entry(kp, &p->list, list)
                        kp->flags |= KPROBE_FLAG_GONE;
                p->post_handler = NULL;
                kill_optimized_kprobe(p);
        }
        /*
         * Here, we can remove insn_slot safely, because no thread calls
         * the original probed function (which will be freed soon) any more.
         */
        arch_remove_kprobe(p);
}

/*
 * Disable one kprobe.
 * Returns 0 on success or the error encoded by __disable_kprobe().
 */
int disable_kprobe(struct kprobe *kp)
{
        struct kprobe *res;

        guard(mutex)(&kprobe_mutex);

        res = __disable_kprobe(kp);
        if (IS_ERR(res))
                return PTR_ERR(res);

        return 0;
}
EXPORT_SYMBOL_GPL(disable_kprobe);

/*
 * Enable one kprobe.
 * Returns 0 on success, -EINVAL if @kp is not a valid registered probe or
 * has gone, or the error from arm_kprobe() if arming failed.
 */
int enable_kprobe(struct kprobe *kp)
{
        struct kprobe *top;
        int err;

        guard(mutex)(&kprobe_mutex);

        /* Make sure the given probe is valid. */
        top = __get_valid_kprobe(kp);
        if (unlikely(!top))
                return -EINVAL;

        /* A probe that has gone cannot be enabled. */
        if (kprobe_gone(kp))
                return -EINVAL;

        if (top != kp)
                kp->flags &= ~KPROBE_FLAG_DISABLED;

        /* Nothing to arm if everything is disarmed or 'top' is already enabled. */
        if (kprobes_all_disarmed || !kprobe_disabled(top))
                return 0;

        top->flags &= ~KPROBE_FLAG_DISABLED;
        err = arm_kprobe(top);
        if (!err)
                return 0;

        /* Arming failed: put the DISABLED flags back. */
        top->flags |= KPROBE_FLAG_DISABLED;
        if (top != kp)
                kp->flags |= KPROBE_FLAG_DISABLED;

        return err;
}
EXPORT_SYMBOL_GPL(enable_kprobe);

/*
 * Print the symbol name, offset and address of @kp at error log level.
 * Callers must NOT use this on the usual path; it is only for critical cases.
 */
void dump_kprobe(struct kprobe *kp)
{
        pr_err("Dump kprobe:\n.symbol_name = %s, .offset = %x, .addr = %pS\n",
               kp->symbol_name, kp->offset, kp->addr);
}
NOKPROBE_SYMBOL(dump_kprobe);

/*
 * Add the symbol containing @entry to the kprobe blacklist.
 *
 * The new range is [@entry, @entry + symbol size). Returns the symbol size
 * (as int) on success, -EINVAL if @entry is not a kernel text address or
 * the symbol lookup fails, and -ENOMEM on allocation failure.
 */
int kprobe_add_ksym_blacklist(unsigned long entry)
{
        unsigned long sym_size = 0, sym_off = 0;
        struct kprobe_blacklist_entry *ent;

        if (!kernel_text_address(entry))
                return -EINVAL;
        if (!kallsyms_lookup_size_offset(entry, &sym_size, &sym_off))
                return -EINVAL;

        ent = kmalloc(sizeof(*ent), GFP_KERNEL);
        if (!ent)
                return -ENOMEM;

        INIT_LIST_HEAD(&ent->list);
        ent->start_addr = entry;
        ent->end_addr = entry + sym_size;
        list_add_tail(&ent->list, &kprobe_blacklist);

        return (int)sym_size;
}

/*
 * Add all symbols in the area [@start, @end) to the kprobe blacklist.
 * Returns 0 on success or the first negative error from
 * kprobe_add_ksym_blacklist().
 */
int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
{
        unsigned long cursor = start;

        while (cursor < end) {
                int step = kprobe_add_ksym_blacklist(cursor);

                if (step < 0)
                        return step;

                /* A zero size (alias symbol) must still advance the cursor. */
                if (step == 0)
                        step = 1;

                cursor += step;
        }

        return 0;
}

/*
 * Weak default for architecture-provided kprobe kallsyms entries: reports
 * nothing and always returns -ERANGE.
 */
int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
                                   char *type, char *sym)
{
        return -ERANGE;
}

/*
 * Look up kprobe-owned symbol number @symnum.
 *
 * Tries, in order, the kprobe instruction slot cache, the optimized
 * instruction slot cache (each only when configured) and finally the
 * architecture hook. Returns 0 as soon as one of them succeeds, otherwise
 * -ERANGE.
 */
int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                       char *sym)
{
        int err;

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
        err = kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym);
        if (err == 0)
                return 0;
#ifdef CONFIG_OPTPROBES
        err = kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym);
        if (err == 0)
                return 0;
#endif
#endif
        err = arch_kprobe_get_kallsym(&symnum, value, type, sym);

        return err ? -ERANGE : 0;
}

/* Weak default for architecture-specific blacklist entries: adds none, returns 0. */
int __init __weak arch_populate_kprobe_blacklist(void)
{
        return 0;
}

/*
 * Look up and populate the kprobe_blacklist.
 *
 * In contrast to the kretprobe blacklist, the whole address range of each
 * listed function has to be determined, because a kprobe does not have to
 * sit at the beginning of a function.
 *
 * Returns 0 on success or the first negative error encountered; entries
 * rejected with -EINVAL in the [@start, @end) table are skipped.
 */
static int __init populate_kprobe_blacklist(unsigned long *start,
                                             unsigned long *end)
{
        unsigned long *slot;
        unsigned long sym_addr;
        int err;

        for (slot = start; slot < end; slot++) {
                sym_addr = (unsigned long)dereference_symbol_descriptor((void *)*slot);
                err = kprobe_add_ksym_blacklist(sym_addr);
                if (err < 0 && err != -EINVAL)
                        return err;
        }

        /* Symbols in '__kprobes_text' are blacklisted */
        err = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
                                        (unsigned long)__kprobes_text_end);
        if (err)
                return err;

        /* Symbols in 'noinstr' section are blacklisted */
        err = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start,
                                        (unsigned long)__noinstr_text_end);
        if (err)
                return err;

        return arch_populate_kprobe_blacklist();
}

#ifdef CONFIG_MODULES
/*
 * Remove from the kprobe blacklist every entry whose start address lies in
 * [@start, @end), freeing the entry.
 */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
        struct kprobe_blacklist_entry *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, &kprobe_blacklist, list) {
                if (cur->start_addr >= start && cur->start_addr < end) {
                        list_del(&cur->list);
                        kfree(cur);
                }
        }
}

/*
 * Remove the blacklist entry that starts exactly at @entry, by removing the
 * one-byte area [@entry, @entry + 1).
 */
static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
        kprobe_remove_area_blacklist(entry, entry + 1);
}

/*
 * Blacklist everything @mod asks for: its explicit blacklist table, its
 * kprobes text area and its noinstr text area. Errors from the individual
 * additions are not reported.
 */
static void add_module_kprobe_blacklist(struct module *mod)
{
        unsigned long base;
        int idx;

        if (mod->kprobe_blacklist) {
                for (idx = 0; idx < mod->num_kprobe_blacklist; idx++)
                        kprobe_add_ksym_blacklist(mod->kprobe_blacklist[idx]);
        }

        base = (unsigned long)mod->kprobes_text_start;
        if (base)
                kprobe_add_area_blacklist(base, base + mod->kprobes_text_size);

        base = (unsigned long)mod->noinstr_text_start;
        if (base)
                kprobe_add_area_blacklist(base, base + mod->noinstr_text_size);
}

/*
 * Drop the blacklist entries of @mod: its explicit blacklist table, its
 * kprobes text area and its noinstr text area.
 */
static void remove_module_kprobe_blacklist(struct module *mod)
{
        unsigned long base;
        int idx;

        if (mod->kprobe_blacklist) {
                for (idx = 0; idx < mod->num_kprobe_blacklist; idx++)
                        kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[idx]);
        }

        base = (unsigned long)mod->kprobes_text_start;
        if (base)
                kprobe_remove_area_blacklist(base, base + mod->kprobes_text_size);

        base = (unsigned long)mod->noinstr_text_start;
        if (base)
                kprobe_remove_area_blacklist(base, base + mod->noinstr_text_size);
}

/*
 * Module notifier callback, checking kprobes on the module.
 *
 * On MODULE_STATE_COMING the module's blacklist is added. On
 * MODULE_STATE_LIVE and MODULE_STATE_GOING, kprobes placed in module text
 * that is about to be freed are killed; on MODULE_STATE_GOING the module's
 * blacklist is removed as well. Always returns NOTIFY_DONE.
 */
static int kprobes_module_callback(struct notifier_block *nb,
                                   unsigned long val, void *data)
{
        struct module *mod = data;
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i;
        /* Core text is only checked when the whole module goes away. */
        int checkcore = (val == MODULE_STATE_GOING);

        guard(mutex)(&kprobe_mutex);

        if (val == MODULE_STATE_COMING)
                add_module_kprobe_blacklist(mod);

        if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
                return NOTIFY_DONE;

        /*
         * When 'MODULE_STATE_GOING' was notified, both of module '.text' and
         * '.init.text' sections would be freed. When 'MODULE_STATE_LIVE' was
         * notified, only '.init.text' section would be freed. We need to
         * disable kprobes which have been inserted in the sections.
         */
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry(p, head, hlist)
                        if (within_module_init((unsigned long)p->addr, mod) ||
                            (checkcore &&
                             within_module_core((unsigned long)p->addr, mod))) {
                                /*
                                 * The vaddr this probe is installed will soon
                                 * be vfreed but not synced to disk. Hence,
                                 * disarming the breakpoint isn't needed.
                                 *
                                 * Note, this will also move any optimized probes
                                 * that are pending to be removed from their
                                 * corresponding lists to the 'freeing_list' and
                                 * will not be touched by the delayed
                                 * kprobe_optimizer() work handler.
                                 */
                                kill_kprobe(p);
                        }
        }
        if (val == MODULE_STATE_GOING)
                remove_module_kprobe_blacklist(mod);
        return NOTIFY_DONE;
}

/* Notifier block that routes module state changes to kprobes_module_callback(). */
static struct notifier_block kprobe_module_nb = {
        .notifier_call = kprobes_module_callback,
        .priority = 0
};

/* Register kprobe_module_nb as a module notifier; returns the registration result. */
static int kprobe_register_module_notifier(void)
{
        return register_module_notifier(&kprobe_module_nb);
}
#else
/* Without CONFIG_MODULES there is no notifier to register; report success. */
static int kprobe_register_module_notifier(void)
{
        return 0;
}
#endif /* CONFIG_MODULES */

/*
 * Kill every kprobe whose address lies in [__init_begin, __init_end),
 * because the probed init code has been freed.
 */
void kprobe_free_init_mem(void)
{
        void *init_lo = (void *)(&__init_begin);
        void *init_hi = (void *)(&__init_end);
        struct kprobe *p;
        int bucket;

        guard(mutex)(&kprobe_mutex);

        for (bucket = 0; bucket < KPROBE_TABLE_SIZE; bucket++) {
                struct hlist_head *head = &kprobe_table[bucket];

                hlist_for_each_entry(p, head, hlist) {
                        void *where = (void *)p->addr;

                        if (where < init_lo || where >= init_hi)
                                continue;

                        kill_kprobe(p);
                }
        }
}

/*
 * Early initialization of the kprobes subsystem.
 *
 * Initializes the probe hash table, populates the kprobe and kretprobe
 * blacklists, runs the architecture init and registers the die and module
 * notifiers. kprobes_initialized is set only if those last three steps all
 * succeeded. Returns 0 or the first error from those steps; a blacklist
 * population failure is only logged.
 */
static int __init init_kprobes(void)
{
        int i, err;

        /* FIXME allocate the probe table, currently defined statically */
        /* initialize all list heads */
        for (i = 0; i < KPROBE_TABLE_SIZE; i++)
                INIT_HLIST_HEAD(&kprobe_table[i]);

        err = populate_kprobe_blacklist(__start_kprobe_blacklist,
                                        __stop_kprobe_blacklist);
        if (err)
                pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err);

        if (kretprobe_blacklist_size) {
                /* lookup the function address from its name */
                for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
                        kretprobe_blacklist[i].addr =
                                kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
                        if (!kretprobe_blacklist[i].addr)
                                pr_err("Failed to lookup symbol '%s' for kretprobe blacklist. Maybe the target function is removed or renamed.\n",
                                       kretprobe_blacklist[i].name);
                }
        }

        /* By default, kprobes are armed */
        kprobes_all_disarmed = false;

#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
        /* Init 'kprobe_optinsn_slots' for allocation */
        kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
#endif

        /* Each step below runs only if all previous ones succeeded. */
        err = arch_init_kprobes();
        if (!err)
                err = register_die_notifier(&kprobe_exceptions_nb);
        if (!err)
                err = kprobe_register_module_notifier();

        kprobes_initialized = (err == 0);
        /* The sysctl setup runs regardless of 'err'. */
        kprobe_sysctls_init();
        return err;
}
early_initcall(init_kprobes);

#if defined(CONFIG_OPTPROBES)
/* Deferred (subsys_initcall) start of kprobe optimization; always returns 0. */
static int __init init_optprobes(void)
{
        /*
         * Enable kprobe optimization - this kicks the optimizer which
         * depends on synchronize_rcu_tasks() and ksoftirqd, that is
         * not spawned in early initcall. So delay the optimization.
         */
        optimize_all_kprobes();

        return 0;
}
subsys_initcall(init_optprobes);
#endif

#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
                const char *sym, int offset, char *modname, struct kprobe *pp)
{
        char *kprobe_type;
        void *addr = p->addr;

        if (p->pre_handler == pre_handler_kretprobe)
                kprobe_type = "r";
        else
                kprobe_type = "k";

        if (!kallsyms_show_value(pi->file->f_cred))
                addr = NULL;

        if (sym)
                seq_printf(pi, "%px  %s  %s+0x%x  %s ",
                        addr, kprobe_type, sym, offset,
                        (modname ? modname : " "));
        else        /* try to use %pS */
                seq_printf(pi, "%px  %s  %pS ",
                        addr, kprobe_type, p->addr);

        if (!pp)
                pp = p;
        seq_printf(pi, "%s%s%s%s\n",
                (kprobe_gone(p) ? "[GONE]" : ""),
                ((kprobe_disabled(p) && !kprobe_gone(p)) ?  "[DISABLED]" : ""),
                (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
                (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}

/* seq_file start: @pos indexes a kprobe_table bucket; stop past the last one. */
static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
{
        if (*pos < KPROBE_TABLE_SIZE)
                return pos;

        return NULL;
}

/* seq_file next: advance to the following bucket, or end the iteration. */
static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
        *pos += 1;

        return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
}

/* seq_file stop: kprobe_seq_start() acquires nothing, so nothing is released. */
static void kprobe_seq_stop(struct seq_file *f, void *v)
{
        /* Nothing to do */
}

/*
 * seq_file show: report every probe in the kprobe_table bucket selected by
 * @v. For an aggregated probe each chained probe gets its own line.
 */
static int show_kprobe_addr(struct seq_file *pi, void *v)
{
        unsigned int bucket = *(loff_t *) v;
        struct hlist_head *head = &kprobe_table[bucket];
        char namebuf[KSYM_NAME_LEN];
        unsigned long offset = 0;
        struct kprobe *p, *kp;
        const char *sym;
        char *modname;

        preempt_disable();
        hlist_for_each_entry_rcu(p, head, hlist) {
                sym = kallsyms_lookup((unsigned long)p->addr, NULL,
                                        &offset, &modname, namebuf);

                if (!kprobe_aggrprobe(p)) {
                        report_probe(pi, p, sym, offset, modname, NULL);
                        continue;
                }

                list_for_each_entry_rcu(kp, &p->list, list)
                        report_probe(pi, kp, sym, offset, modname, p);
        }
        preempt_enable();

        return 0;
}

/*
 * seq_file operations that iterate over the kprobe_table buckets.
 * NOTE(review): presumably picked up by DEFINE_SEQ_ATTRIBUTE(kprobes) -- confirm.
 */
static const struct seq_operations kprobes_sops = {
        .start = kprobe_seq_start,
        .next  = kprobe_seq_next,
        .stop  = kprobe_seq_stop,
        .show  = show_kprobe_addr
};

DEFINE_SEQ_ATTRIBUTE(kprobes);

/* kprobes/blacklist -- shows which functions can not be probed */
/*
 * seq_file start for the blacklist: takes kprobe_mutex (dropped again in
 * kprobe_blacklist_seq_stop()) and positions the iterator at *@pos.
 */
static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
{
        mutex_lock(&kprobe_mutex);
        return seq_list_start(&kprobe_blacklist, *pos);
}

/* seq_file next for the blacklist: step to the following kprobe_blacklist entry. */
static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
        return seq_list_next(v, &kprobe_blacklist, pos);
}

/*
 * seq_file show for the blacklist: print one entry as "start-end\tsymbol".
 * The numeric range is printed as NULL pointers when kernel addresses must
 * not be shown to the reader.
 */
static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
{
        struct kprobe_blacklist_entry *ent =
                list_entry(v, struct kprobe_blacklist_entry, list);
        void *range_lo = NULL;
        void *range_hi = NULL;

        /*
         * If '/proc/kallsyms' is not showing kernel address, we won't
         * show them here either.
         */
        if (kallsyms_show_value(m->file->f_cred)) {
                range_lo = (void *)ent->start_addr;
                range_hi = (void *)ent->end_addr;
        }

        seq_printf(m, "0x%px-0x%px\t%ps\n", range_lo, range_hi,
                   (void *)ent->start_addr);
        return 0;
}

/* seq_file stop for the blacklist: drop kprobe_mutex taken in kprobe_blacklist_seq_start(). */
static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v)
{
        mutex_unlock(&kprobe_mutex);
}

/*
 * seq_file operations that iterate over the kprobe_blacklist list.
 * NOTE(review): presumably picked up by DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist) -- confirm.
 */
static const struct seq_operations kprobe_blacklist_sops = {
        .start = kprobe_blacklist_seq_start,
        .next  = kprobe_blacklist_seq_next,
        .stop  = kprobe_blacklist_seq_stop,
        .show  = kprobe_blacklist_seq_show,
};
DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist);

/*
 * Globally arm kprobes.
 *
 * Arms every probe that is not individually disabled, on a best-effort
 * basis. Returns 0 if kprobes were already armed or all probes were armed,
 * otherwise the error of the last probe that failed to arm.
 */
static int arm_all_kprobes(void)
{
        unsigned int bucket, total = 0, errors = 0;
        struct hlist_head *head;
        struct kprobe *p;
        int err, ret = 0;

        guard(mutex)(&kprobe_mutex);

        /* Already armed: nothing to do. */
        if (!kprobes_all_disarmed)
                return 0;

        /*
         * arm_kprobe() calls optimize_kprobe(), which checks
         * kprobes_all_disarmed, so the flag has to be cleared before
         * arming anything.
         */
        kprobes_all_disarmed = false;

        /* Arming kprobes doesn't optimize kprobe itself */
        for (bucket = 0; bucket < KPROBE_TABLE_SIZE; bucket++) {
                head = &kprobe_table[bucket];

                /* Best effort: a failure does not stop the walk. */
                hlist_for_each_entry(p, head, hlist) {
                        if (kprobe_disabled(p))
                                continue;

                        err = arm_kprobe(p);
                        if (err) {
                                errors++;
                                ret = err;
                        }
                        total++;
                }
        }

        if (!errors)
                pr_info("Kprobes globally enabled\n");
        else
                pr_warn("Kprobes globally enabled, but failed to enable %d out of %d probes. Please check which kprobes are kept disabled via debugfs.\n",
                        errors, total);

        return ret;
}

/*
 * Globally disarm kprobes.
 *
 * Disarms every probe that is neither an arch trampoline probe nor
 * individually disabled, on a best-effort basis, then waits for the
 * optimizer. Returns 0 if kprobes were already disarmed or all probes were
 * disarmed, otherwise the error of the last probe that failed to disarm.
 */
static int disarm_all_kprobes(void)
{
        unsigned int bucket, total = 0, errors = 0;
        struct hlist_head *head;
        struct kprobe *p;
        int err, ret = 0;

        guard(mutex)(&kprobe_mutex);

        /* Already disarmed: nothing to do. */
        if (kprobes_all_disarmed)
                return 0;

        kprobes_all_disarmed = true;

        for (bucket = 0; bucket < KPROBE_TABLE_SIZE; bucket++) {
                head = &kprobe_table[bucket];

                /* Best effort: a failure does not stop the walk. */
                hlist_for_each_entry(p, head, hlist) {
                        if (arch_trampoline_kprobe(p) || kprobe_disabled(p))
                                continue;

                        err = disarm_kprobe(p, false);
                        if (err) {
                                errors++;
                                ret = err;
                        }
                        total++;
                }
        }

        if (!errors)
                pr_info("Kprobes globally disabled\n");
        else
                pr_warn("Kprobes globally disabled, but failed to disable %d out of %d probes. Please check which kprobes are kept enabled via debugfs.\n",
                        errors, total);

        /* Wait for disarming all kprobes by optimizer */
        wait_for_kprobe_optimizer_locked();

        return ret;
}

/*
 * XXX: The debugfs bool file interface doesn't allow for callbacks
 * when the bool state is switched. We can reuse that facility when
 * available
 */
/*
 * Read handler of the "enabled" file: yields "1\n" while kprobes are armed
 * and "0\n" while they are globally disarmed.
 */
static ssize_t read_enabled_file_bool(struct file *file,
               char __user *user_buf, size_t count, loff_t *ppos)
{
        char buf[3] = { kprobes_all_disarmed ? '0' : '1', '\n', 0x00 };

        /* Only the two payload bytes are exposed, not the terminator. */
        return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}

/*
 * Write handler of the "enabled" file: parses a boolean from user space and
 * arms or disarms all kprobes accordingly. Returns @count on success or a
 * negative error from parsing or from the (dis)arming.
 */
static ssize_t write_enabled_file_bool(struct file *file,
               const char __user *user_buf, size_t count, loff_t *ppos)
{
        bool enable;
        int err;

        err = kstrtobool_from_user(user_buf, count, &enable);
        if (err)
                return err;

        if (enable)
                err = arm_all_kprobes();
        else
                err = disarm_all_kprobes();

        if (err)
                return err;

        return count;
}

/* File operations of the debugfs "enabled" file (global arm/disarm switch). */
static const struct file_operations fops_kp = {
        .read =         read_enabled_file_bool,
        .write =        write_enabled_file_bool,
        .llseek =        default_llseek,
};

/*
 * Create the debugfs "kprobes" directory with its "list", "enabled" and
 * "blacklist" files. Always returns 0.
 */
static int __init debugfs_kprobe_init(void)
{
        struct dentry *kp_dir = debugfs_create_dir("kprobes", NULL);

        /* Registered probes. */
        debugfs_create_file("list", 0400, kp_dir, NULL, &kprobes_fops);

        /* Global arm/disarm switch. */
        debugfs_create_file("enabled", 0600, kp_dir, NULL, &fops_kp);

        /* Functions that can not be probed. */
        debugfs_create_file("blacklist", 0400, kp_dir, NULL,
                            &kprobe_blacklist_fops);

        return 0;
}

late_initcall(debugfs_kprobe_init);
#endif /* CONFIG_DEBUG_FS */


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  318 

  319 

  319 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  319 










  314 



  318 









  319 



























































































































  315 










  314 


  319 




























































  315 














  314 














  319 











  314 


















































  316 



  314 





  316 


  319 





  316 



  319 

  318 










  314 

  318 
  319 
  319 
  319 






  317 



  319 
  319 


  319 
  318 












  317 

  312 












































































  316 






















  315 



  316 
  318 







  315 



  319 


















  314 






























































































  318 




































































































































  316 

  316 
  317 






































































































































  317 

























































































  317 











































































  317 







  317 











































  317 


  316 



  318 

  317 
  319 

  319 







  317 










  315 

  316 



  319 
  314 






  317 


  316 
  319 
























  317 
















  319 

  319 

  318 












  317 











  318 




  319 
  317 

  318 

  319 
  316 

  316 














  316 
  316 





  319 






  315 

  318 




  316 
  316 






















  319 
  316 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
13000
13001
13002
13003
13004
13005
13006
13007
13008
13009
13010
13011
13012
13013
13014
13015
13016
13017
13018
13019
13020
13021
13022
13023
13024
13025
13026
13027
13028
13029
13030
13031
13032
13033
13034
13035
13036
13037
13038
13039
13040
13041
13042
13043
13044
13045
13046
13047
13048
13049
13050
13051
13052
13053
13054
13055
13056
13057
13058
13059
13060
13061
13062
13063
13064
13065
13066
13067
13068
13069
13070
13071
13072
13073
13074
13075
13076
13077
13078
13079
13080
13081
13082
13083
13084
13085
13086
13087
13088
13089
13090
13091
13092
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      NET3    Protocol independent device support routines.
 *
 *        Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Ross Biro
 *                                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *        Additional Authors:
 *                Florian la Roche <rzsfl@rz.uni-sb.de>
 *                Alan Cox <gw4pts@gw4pts.ampr.org>
 *                David Hinds <dahinds@users.sourceforge.net>
 *                Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *                Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *        Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *                Alan Cox        :        device private ioctl copies fields back.
 *                Alan Cox        :        Transmit queue code does relevant
 *                                        stunts to keep the queue safe.
 *                Alan Cox        :        Fixed double lock.
 *                Alan Cox        :        Fixed promisc NULL pointer trap
 *                ????????        :        Support the full private ioctl range
 *                Alan Cox        :        Moved ioctl permission check into
 *                                        drivers
 *                Tim Kordas        :        SIOCADDMULTI/SIOCDELMULTI
 *                Alan Cox        :        100 backlog just doesn't cut it when
 *                                        you start doing multicast video 8)
 *                Alan Cox        :        Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *                Alan Cox        :        Took out transmit every packet pass
 *                                        Saved a few bytes in the ioctl handler
 *                Alan Cox        :        Network driver sets packet type before
 *                                        calling netif_rx. Saves a function
 *                                        call a packet.
 *                Alan Cox        :        Hashed net_bh()
 *                Richard Kooijman:        Timestamp fixes.
 *                Alan Cox        :        Wrong field in SIOCGIFDSTADDR
 *                Alan Cox        :        Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                        changes.
 *                Rudi Cilibrasi        :        Pass the right thing to
 *                                        set_mac_address()
 *                Dave Miller        :        32bit quantity for the device lock to
 *                                        make it work out on a Sparc.
 *                Bjorn Ekwall        :        Added KERNELD hack.
 *                Alan Cox        :        Cleaned up the backlog initialise.
 *                Craig Metz        :        SIOCGIFCONF fix if space for under
 *                                        1 device.
 *            Thomas Bogendoerfer :        Return ENODEV for dev_open, if there
 *                                        is no device open function.
 *                Andi Kleen        :        Fix error reporting for SIOCGIFCONF
 *            Michael Chastain        :        Fix signed/unsigned for SIOCGIFCONF
 *                Cyrus Durgin        :        Cleaned for KMOD
 *                Adam Sulmicki   :        Bug Fix : Network Device Unload
 *                                        A network device unload needs to purge
 *                                        the backlog queue.
 *        Paul Rusty Russell        :        SIOCSIFNAME
 *              Pekka Riikonen  :        Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                        - netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitmap.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/isolation.h>
#include <linux/sched/mm.h>
#include <linux/smpboot.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/tcx.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <trace/events/qdisc.h>
#include <trace/events/xdp.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_netdev.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/rps.h>
#include <linux/phy_link_topology.h>

#include "dev.h"
#include "devmem.h"
#include "net-sysfs.h"

/* NOTE(review): presumably serialises updates to the ptype lists — confirm against its users. */
static DEFINE_SPINLOCK(ptype_lock);
/* Array of PTYPE_HASH_SIZE list heads, placed in the read-mostly data section. */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

/* Prototypes for static helpers that are referenced before their definitions. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_extack(unsigned long val,
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack);

/* NOTE(review): by its name this guards interface alias updates — confirm against its users. */
static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* Initial value is NR_CPUS; see napi_hash_lock above for the locking rule. */
static unsigned int napi_gen_id = NR_CPUS;
/* Read-mostly hashtable declared with 8 hash bits. */
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

/*
 * Advance @net's dev_base_seq by one.  The value 0 is never stored: when
 * the increment wraps to zero, 1 is written instead.
 */
static inline void dev_base_seq_inc(struct net *net)
{
        unsigned int next_seq = net->dev_base_seq + 1;

        if (!next_seq)
                next_seq = 1;

        WRITE_ONCE(net->dev_base_seq, next_seq);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

/*
 * Return the bucket of @net's ifindex hash table that @ifindex maps to.
 * The bucket is selected by masking @ifindex with NETDEV_HASHENTRIES - 1.
 */
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        int slot = ifindex & (NETDEV_HASHENTRIES - 1);

        return &net->dev_index_head[slot];
}

#ifdef CONFIG_PREEMPT_RT

/* With CONFIG_PREEMPT_RT, backlog threads are unconditionally reported as in use. */
static bool use_backlog_threads(void)
{
        return true;
}

#else /* !CONFIG_PREEMPT_RT */

/* Static key, false by default; enabled by the "thread_backlog_napi" early parameter. */
static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);

/* Early-param handler: @arg is ignored, the mere presence of the option enables the key. */
static int __init setup_backlog_napi_threads(char *arg)
{
        static_branch_enable(&use_backlog_threads_key);
        return 0;
}
early_param("thread_backlog_napi", setup_backlog_napi_threads);

/* Report whether backlog threads were requested on the command line. */
static bool use_backlog_threads(void)
{
        return static_branch_unlikely(&use_backlog_threads_key);
}

#endif /* CONFIG_PREEMPT_RT */

/*
 * Enter the backlog critical section, saving the interrupt state in @flags.
 *
 * If CONFIG_RPS is enabled or backlog threads are in use, the
 * sd->input_pkt_queue spinlock is taken with interrupts saved/disabled;
 * otherwise only local interrupts are saved and disabled.
 */
static inline void backlog_lock_irq_save(struct softnet_data *sd,
                                         unsigned long *flags)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_save(*flags);
                return;
        }

        spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
}

/*
 * Enter the backlog critical section without saving the interrupt state.
 *
 * If CONFIG_RPS is enabled or backlog threads are in use, the
 * sd->input_pkt_queue spinlock is taken with interrupts disabled;
 * otherwise only local interrupts are disabled.
 */
static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_disable();
                return;
        }

        spin_lock_irq(&sd->input_pkt_queue.lock);
}

/*
 * Leave the backlog critical section and restore the interrupt state held
 * in @flags.  Selects between the sd->input_pkt_queue spinlock and plain
 * local-IRQ restore using the same condition as backlog_lock_irq_save().
 */
static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
                                              unsigned long *flags)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_restore(*flags);
                return;
        }

        spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
}

/*
 * Leave the backlog critical section and re-enable interrupts.  Selects
 * between the sd->input_pkt_queue spinlock and plain local-IRQ enable
 * using the same condition as backlog_lock_irq_disable().
 */
static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_enable();
                return;
        }

        spin_unlock_irq(&sd->input_pkt_queue.lock);
}

/*
 * Allocate a name node (GFP_KERNEL) bound to @dev and pointing at @name.
 *
 * @name is stored by pointer, not copied.  Only the hlist linkage, the
 * device pointer and the name pointer are initialised here.
 *
 * Returns the new node, or NULL if the allocation failed.
 */
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
                                                       const char *name)
{
        struct netdev_name_node *node = kmalloc(sizeof(*node), GFP_KERNEL);

        if (node) {
                INIT_HLIST_NODE(&node->hlist);
                node->dev = dev;
                node->name = name;
        }

        return node;
}

/*
 * Allocate the name node for @dev's own name (dev->name) and initialise
 * its ->list member as an empty list head.
 *
 * Returns the new node, or NULL if the allocation failed.
 */
static struct netdev_name_node *
netdev_name_node_head_alloc(struct net_device *dev)
{
        struct netdev_name_node *head = netdev_name_node_alloc(dev, dev->name);

        if (head)
                INIT_LIST_HEAD(&head->list);

        return head;
}

/*
 * Release the memory of @name_node itself.  The string that
 * name_node->name points to is not freed here.
 */
static void netdev_name_node_free(struct netdev_name_node *name_node)
{
        kfree(name_node);
}

/*
 * Insert @name_node at the head of the bucket of @net's name hash table
 * selected by name_node->name, using the RCU-aware hlist insertion.
 */
static void netdev_name_node_add(struct net *net,
                                 struct netdev_name_node *name_node)
{
        struct hlist_head *bucket = dev_name_hash(net, name_node->name);

        hlist_add_head_rcu(&name_node->hlist, bucket);
}

/*
 * Unhash @name_node from the name hash table using the RCU-aware hlist
 * removal.  The node itself is not freed.
 */
static void netdev_name_node_del(struct netdev_name_node *name_node)
{
        hlist_del_rcu(&name_node->hlist);
}

static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
                                                        const char *name)
{
        struct hlist_head *head = dev_name_hash(net, name);
        struct netdev_name_node *name_node;

        hlist_for_each_entry(name_node, head, hlist)
                if (!strcmp(name_node->name, name))
                        return name_node;
        return NULL;
}

static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
                                                            const char *name)
{
        struct hlist_head *head = dev_name_hash(net, name);
        struct netdev_name_node *name_node;

        hlist_for_each_entry_rcu(name_node, head, hlist)
                if (!strcmp(name_node->name, name))
                        return name_node;
        return NULL;
}

bool netdev_name_in_use(struct net *net, const char *name)
{
        return netdev_name_node_lookup(net, name);
}
EXPORT_SYMBOL(netdev_name_in_use);

/*
 * Register @name as an alternative name of @dev.
 *
 * @name is stored by pointer and later released with kfree() by
 * netdev_name_node_alt_free(), so on success it belongs to the new node.
 * On failure this function does not free @name.
 *
 * Returns 0 on success, -EEXIST if @name is already hashed in the device's
 * namespace, or -ENOMEM if the node could not be allocated.
 */
int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
        struct net *net = dev_net(dev);
        struct netdev_name_node *alt;

        if (netdev_name_node_lookup(net, name))
                return -EEXIST;

        alt = netdev_name_node_alloc(dev, name);
        if (!alt)
                return -ENOMEM;

        netdev_name_node_add(net, alt);

        /* The node that holds dev->name acts as a head of per-device list. */
        list_add_tail_rcu(&alt->list, &dev->name_node->list);

        return 0;
}

/*
 * Free an alternative-name node given its embedded rcu_head: first the
 * name string it points to, then the node itself.  The signature matches
 * an RCU callback; it is also invoked directly with &node->rcu.
 */
static void netdev_name_node_alt_free(struct rcu_head *head)
{
        struct netdev_name_node *alt;

        alt = container_of(head, struct netdev_name_node, rcu);

        kfree(alt->name);
        netdev_name_node_free(alt);
}

/*
 * Tear down one alternative-name node: unhash it, unlink it from its
 * device's altname list, and hand it to call_rcu() so that
 * netdev_name_node_alt_free() runs after a grace period.
 */
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
        /* Remove from the name hash table. */
        netdev_name_node_del(name_node);
        /* Remove from the per-device list of alternative names. */
        list_del(&name_node->list);
        /* Defer freeing of the name string and the node. */
        call_rcu(&name_node->rcu, netdev_name_node_alt_free);
}

/*
 * Remove the alternative name @name from @dev.
 *
 * Returns 0 on success, -ENOENT if @name is not hashed in the device's
 * namespace, or -EINVAL if @name is the device's primary name or belongs
 * to a different device.
 */
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
        struct netdev_name_node *victim;

        victim = netdev_name_node_lookup(dev_net(dev), name);
        if (!victim)
                return -ENOENT;

        /* The lookup may have hit another device's name or our primary name. */
        if (victim->dev != dev || victim == dev->name_node)
                return -EINVAL;

        __netdev_name_node_alt_destroy(victim);

        return 0;
}

static void netdev_name_node_alt_flush(struct net_device *dev)
{
        struct netdev_name_node *name_node, *tmp;

        list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
                list_del(&name_node->list);
                netdev_name_node_alt_free(&name_node->rcu);
        }
}

/* Device list insertion
 *
 * Makes @dev reachable through the lookup structures of its network
 * namespace: the dev_base_head list, the name hash (primary name and all
 * alternative names), the ifindex hash and the dev_by_index xarray.
 * Finally bumps the namespace's dev_base_seq.  Asserts that RTNL is held.
 */
static void list_netdevice(struct net_device *dev)
{
        struct netdev_name_node *name_node;
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        /* Link into the namespace-wide device list and the two hash tables. */
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        netdev_name_node_add(net, dev->name_node);
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));

        /* Hash every alternative name as well. */
        netdev_for_each_altname(dev, name_node)
                netdev_name_node_add(net, name_node);

        /* We reserved the ifindex, this can't fail */
        WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));

        dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect an RCU grace period before freeing/reusing dev
 *
 * Removes @dev from the dev_by_index xarray, unhashes all of its
 * alternative names, unlinks it from the device list, the name hash and
 * the ifindex hash, then bumps the namespace's dev_base_seq.
 * Asserts that RTNL is held.
 */
static void unlist_netdevice(struct net_device *dev)
{
        struct netdev_name_node *name_node;
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        xa_erase(&net->dev_by_index, dev->ifindex);

        /* Alternative names are only unhashed here; the nodes are not freed. */
        netdev_for_each_altname(dev, name_node)
                netdev_name_node_del(name_node);

        /* Unlink dev from the device chain */
        list_del_rcu(&dev->dev_list);
        netdev_name_node_del(dev->name_node);
        hlist_del_rcu(&dev->index_hlist);

        dev_base_seq_inc(dev_net(dev));
}

/*
 *        Our notifier list
 *
 *        A raw notifier chain head with file-local (static) linkage.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *        Device drivers call our routines to queue packets here. We empty the
 *        queue in the local softnet handler.
 *
 *        One instance exists per CPU; its process_queue_bh_lock member is
 *        statically initialised with INIT_LOCAL_LOCK().
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
        .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
};
EXPORT_PER_CPU_SYMBOL(softnet_data);

/* Page_pool has a lockless array/stack to alloc/recycle pages.
 * PP consumers must pay attention to run APIs in the appropriate context
 * (e.g. NAPI context).
 *
 * Per-CPU instance whose bh_lock member is statically initialised with
 * INIT_LOCAL_LOCK().
 */
DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the position
 * of an ARPHRD_* value in the first selects the class name in the second as
 * well as the slot in netdev_xmit_lock_key[] and netdev_addr_lock_key[].
 * Entries must therefore stay in the same order in both tables. The last
 * slot (ARPHRD_NONE) is also what netdev_lock_pos() falls back to for types
 * that are not listed.
 */
static const unsigned short netdev_lock_type[] = {
         ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-aligned with netdev_lock_type[]. */
static const char *const netdev_lock_name[] = {
        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table slot, separately for xmit and addr-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

/* Map @dev_type to its slot in netdev_lock_type[]; unknown types share the
 * final slot.
 */
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        /* the last key is used by default */
        unsigned short pos = ARRAY_SIZE(netdev_lock_type) - 1;
        unsigned short idx;

        for (idx = 0; idx < ARRAY_SIZE(netdev_lock_type); idx++) {
                if (netdev_lock_type[idx] == dev_type) {
                        pos = idx;
                        break;
                }
        }

        return pos;
}

/* Assign @lock the xmit lockdep class key and name that belong to @dev_type. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        const int pos = netdev_lock_pos(dev_type);

        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[pos],
                                   netdev_lock_name[pos]);
}

/* Assign dev->addr_list_lock the addr lockdep class key and name that belong
 * to dev->type.
 */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        const int pos = netdev_lock_pos(dev->type);

        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[pos],
                                   netdev_lock_name[pos]);
}
#else
/* !CONFIG_LOCKDEP: no lock classes to assign, so this is an empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}

/* !CONFIG_LOCKDEP: empty stub, see above. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************
 *
 *                Protocol management and registration routines
 *
 *******************************************************************************/


/*
 *        Add a protocol ID to the list. Now that the input handler is
 *        smarter we can dispense with all the messy stuff that used to be
 *        here.
 *
 *        BEWARE!!! Protocol handlers, mangling input packets,
 *        MUST BE last in hash buckets and checking protocol handlers
 *        MUST start from promiscuous ptype_all chain in net_bh.
 *        It is true now, do not change it.
 *        Explanation follows: if protocol handler, mangling packet, will
 *        be the first on list, it is not able to sense, that packet
 *        is cloned and should be copied-on-write, so that it will
 *        change it and subsequent readers will get broken packet.
 *                                                        --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
        if (pt->type == htons(ETH_P_ALL)) {
                if (!pt->af_packet_net && !pt->dev)
                        return NULL;

                return pt->dev ? &pt->dev->ptype_all :
                                 &pt->af_packet_net->ptype_all;
        }

        if (pt->dev)
                return &pt->dev->ptype_specific;

        return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *        dev_add_pack - add packet handler
 *        @pt: packet type declaration
 *
 *        Link @pt into the list chosen by ptype_head() while holding
 *        ptype_lock. The passed &packet_type stays on kernel lists and may
 *        not be freed until it has been removed from them again.
 *
 *        If ptype_head() has no list for @pt, a one-time warning is emitted
 *        and nothing is registered.
 *
 *        This call does not sleep, therefore it cannot guarantee that CPUs
 *        which are in the middle of receiving packets will see the new
 *        packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
        struct list_head *head;

        head = ptype_head(pt);
        if (WARN_ON_ONCE(!head))
                return;

        spin_lock(&ptype_lock);
        list_add_rcu(&pt->list, head);
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *        __dev_remove_pack         - remove packet handler
 *        @pt: packet type declaration
 *
 *        Remove a protocol handler that was previously added to the kernel
 *        protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *        from the kernel lists and can be freed or reused once this function
 *        returns.
 *
 *      The packet type might still be in use by receivers
 *        and must not be freed until after all the CPU's have gone
 *        through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;

        if (!head)
                return;

        spin_lock(&ptype_lock);

        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
                        list_del_rcu(&pt->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_pack: %p not found\n", pt);
out:
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *        dev_remove_pack         - remove packet handler
 *        @pt: packet type declaration
 *
 *        Remove a protocol handler that was previously added to the kernel
 *        protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *        from the kernel lists and can be freed or reused once this function
 *        returns.
 *
 *        This call sleeps to guarantee that no CPU is looking at the packet
 *        type after return: it unlinks via __dev_remove_pack() and then
 *        waits in synchronize_net().
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/*******************************************************************************
 *
 *                            Device Interface Subroutines
 *
 *******************************************************************************/

/**
 *        dev_get_iflink        - get 'iflink' value of a interface
 *        @dev: targeted interface
 *
 *        Indicates the ifindex the interface is linked to.
 *        Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
                return dev->netdev_ops->ndo_get_iflink(dev);

        return READ_ONCE(dev->ifindex);
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *        dev_fill_metadata_dst - Retrieve tunnel egress information.
 *        @dev: targeted interface
 *        @skb: The packet.
 *
 *        For better visibility of tunnel traffic OVS needs to retrieve
 *        egress tunnel information for a packet. Following API allows
 *        user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info;

        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
                return -EINVAL;

        info = skb_tunnel_info_unclone(skb);
        if (!info)
                return -ENOMEM;
        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
                return -EINVAL;

        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/* Claim the next slot of @stack. The slot counter is advanced even when the
 * stack is already full; in that case a one-time warning fires and NULL is
 * returned.
 */
static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
{
        int slot = stack->num_paths;

        stack->num_paths = slot + 1;
        if (WARN_ON_ONCE(slot >= NET_DEVICE_PATH_STACK_MAX))
                return NULL;

        return &stack->path[slot];
}

/* Build the forwarding path for @daddr starting at @dev into @stack.
 *
 * While the current device provides ndo_fill_forward_path(), a zeroed path
 * slot is filled by that callback, which is expected to advance ctx.dev. If
 * a device remains once no callback is available, it is recorded as a final
 * DEV_PATH_ETHERNET entry.
 *
 * Returns -1 when the stack overflows, a callback fails or a callback does
 * not advance ctx.dev; otherwise the last callback's return value (0 if no
 * callback ran).
 */
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
                          struct net_device_path_stack *stack)
{
        struct net_device_path_ctx ctx = {
                .dev        = dev,
        };
        struct net_device_path *hop;
        int ret = 0;

        memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
        stack->num_paths = 0;

        while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
                const struct net_device *prev = ctx.dev;

                hop = dev_fwd_path(stack);
                if (!hop)
                        return -1;

                memset(hop, 0, sizeof(*hop));
                ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, hop);
                if (ret < 0)
                        return -1;

                /* A callback that leaves ctx.dev unchanged would loop forever. */
                if (WARN_ON_ONCE(prev == ctx.dev))
                        return -1;
        }

        if (ctx.dev) {
                hop = dev_fwd_path(stack);
                if (!hop)
                        return -1;
                hop->type = DEV_PATH_ETHERNET;
                hop->dev = ctx.dev;
        }

        return ret;
}
EXPORT_SYMBOL_GPL(dev_fill_forward_path);

/* must be called under rcu_read_lock(), as we dont take a reference */
static struct napi_struct *napi_by_id(unsigned int napi_id)
{
        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
        struct napi_struct *napi;

        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
                if (napi->napi_id == napi_id)
                        return napi;

        return NULL;
}

/* Like napi_by_id(), but only returns a NAPI whose device lives in @net.
 * Must be called under rcu_read_lock(), as we don't take a reference.
 */
static struct napi_struct *
netdev_napi_by_id(struct net *net, unsigned int napi_id)
{
        struct napi_struct *napi = napi_by_id(napi_id);

        if (!napi)
                return NULL;

        /* A hashed NAPI without a device is unexpected; warn once. */
        if (WARN_ON_ONCE(!napi->dev))
                return NULL;

        return net_eq(net, dev_net(napi->dev)) ? napi : NULL;
}

/**
 *        netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
 *        @net: the applicable net namespace
 *        @napi_id: ID of a NAPI of a target device
 *
 *        Find a NAPI instance with @napi_id. Lock its device.
 *        The device must be in %NETREG_REGISTERED state for lookup to succeed.
 *        netdev_unlock() must be called to release it.
 *
 *        Return: pointer to NAPI, its device with lock held, NULL if not found.
 */
struct napi_struct *
netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
{
        struct napi_struct *napi;
        struct net_device *dev;

        /* Step 1: lockless lookup; pin the device before leaving RCU. */
        rcu_read_lock();
        napi = netdev_napi_by_id(net, napi_id);
        if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
                rcu_read_unlock();
                return NULL;
        }

        dev = napi->dev;
        dev_hold(dev);
        rcu_read_unlock();

        /* Step 2: trade the reference for the instance lock; NULL means the
         * device is no longer usable and the reference is already dropped.
         */
        dev = __netdev_put_lock(dev, net);
        if (!dev)
                return NULL;

        /* Step 3: with the lock held, check that @napi_id still resolves to
         * a NAPI of the very device we locked.
         */
        rcu_read_lock();
        napi = netdev_napi_by_id(net, napi_id);
        if (napi && napi->dev != dev)
                napi = NULL;
        rcu_read_unlock();

        if (!napi)
                netdev_unlock(dev);
        return napi;
}

/**
 *        __dev_get_by_name        - find a device by its name
 *        @net: the applicable net namespace
 *        @name: name to find
 *
 *        Find an interface by name. Must be called under RTNL semaphore.
 *        If the name is found a pointer to the device is returned.
 *        If the name is not found then %NULL is returned. The
 *        reference counters are not incremented so the caller must be
 *        careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct netdev_name_node *node;

        node = netdev_name_node_lookup(net, name);
        if (!node)
                return NULL;

        return node->dev;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 * dev_get_by_name_rcu        - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name.
 * If the name is found a pointer to the device is returned.
 * If the name is not found then %NULL is returned.
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct netdev_name_node *node;

        node = netdev_name_node_lookup_rcu(net, name);
        if (!node)
                return NULL;

        return node->dev;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/* Deprecated for new users, call netdev_get_by_name() instead
 *
 * Looks the device up by name inside an RCU read-side section and calls
 * dev_hold() on the result before leaving it. Returns the device, or NULL
 * when the lookup fails.
 * NOTE(review): dev_hold() is called even when the lookup returned NULL, so
 * this relies on dev_hold() accepting NULL - confirm.
 */
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

/**
 *        netdev_get_by_name() - find a device by its name
 *        @net: the applicable net namespace
 *        @name: name to find
 *        @tracker: tracking object for the acquired reference
 *        @gfp: allocation flags for the tracker
 *
 *        Find an interface by name. This can be called from any
 *        context and does its own locking. The returned handle has
 *        the usage count incremented and the caller must use netdev_put() to
 *        release it when it is no longer needed. %NULL is returned if no
 *        matching device is found.
 */
struct net_device *netdev_get_by_name(struct net *net, const char *name,
                                      netdevice_tracker *tracker, gfp_t gfp)
{
        struct net_device *dev = dev_get_by_name(net, name);

        if (!dev)
                return NULL;

        /* Attach the caller's tracker to the reference taken by the lookup. */
        netdev_tracker_alloc(dev, tracker, gfp);
        return dev;
}
EXPORT_SYMBOL(netdev_get_by_name);

/**
 *        __dev_get_by_index - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *
 *        Search for an interface by index. Returns %NULL if the device
 *        is not found or a pointer to the device. The device has not
 *        had its reference counter increased so the caller must be careful
 *        about locking. The caller must hold the RTNL semaphore.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *        dev_get_by_index_rcu - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *
 *        Search for an interface by index. Returns %NULL if the device
 *        is not found or a pointer to the device. The device has not
 *        had its reference counter increased so the caller must be careful
 *        about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/* Deprecated for new users, call netdev_get_by_index() instead
 *
 * Looks the device up by ifindex inside an RCU read-side section and calls
 * dev_hold() on the result before leaving it. Returns the device, or NULL
 * when the lookup fails.
 * NOTE(review): dev_hold() is called even when the lookup returned NULL, so
 * this relies on dev_hold() accepting NULL - confirm.
 */
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *        netdev_get_by_index() - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *        @tracker: tracking object for the acquired reference
 *        @gfp: allocation flags for the tracker
 *
 *        Search for an interface by index. Returns NULL if the device
 *        is not found or a pointer to the device. The device returned has
 *        had a reference added and the pointer is safe until the user calls
 *        netdev_put() to indicate they have finished with it.
 */
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
                                       netdevice_tracker *tracker, gfp_t gfp)
{
        struct net_device *dev = dev_get_by_index(net, ifindex);

        if (!dev)
                return NULL;

        /* Attach the caller's tracker to the reference taken by the lookup. */
        netdev_tracker_alloc(dev, tracker, gfp);
        return dev;
}
EXPORT_SYMBOL(netdev_get_by_index);

/**
 *        dev_get_by_napi_id - find a device by napi_id
 *        @napi_id: ID of the NAPI struct
 *
 *        Search for an interface by NAPI ID. Returns %NULL if the device
 *        is not found or a pointer to the device. The device has not had
 *        its reference counter increased so the caller must be careful
 *        about locking. The caller must hold RCU lock.
 */
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
        struct napi_struct *napi;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* IDs rejected by napi_id_valid() are never looked up. */
        if (!napi_id_valid(napi_id))
                return NULL;

        napi = napi_by_id(napi_id);
        if (!napi)
                return NULL;

        return napi->dev;
}

/* Release the held reference on the net_device, and if the net_device
 * is still registered try to lock the instance lock. If device is being
 * unregistered NULL will be returned (but the reference has been released,
 * either way!)
 *
 * This helper is intended for locking net_device after it has been looked up
 * using a lockless lookup helper. Lock prevents the instance from going away.
 */
struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
{
        netdev_lock(dev);
        if (dev->reg_state > NETREG_REGISTERED ||
            dev->moving_ns || !net_eq(dev_net(dev), net)) {
                netdev_unlock(dev);
                dev_put(dev);
                return NULL;
        }
        dev_put(dev);
        return dev;
}

static struct net_device *
__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net)
{
        netdev_lock_ops_compat(dev);
        if (dev->reg_state > NETREG_REGISTERED ||
            dev->moving_ns || !net_eq(dev_net(dev), net)) {
                netdev_unlock_ops_compat(dev);
                dev_put(dev);
                return NULL;
        }
        dev_put(dev);
        return dev;
}

/**
 *        netdev_get_by_index_lock() - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *
 *        Search for an interface by index. If a valid device
 *        with @ifindex is found it will be returned with netdev->lock held.
 *        netdev_unlock() must be called to release it.
 *
 *        Return: pointer to a device with lock held, NULL if not found.
 */
struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
{
        struct net_device *dev = dev_get_by_index(net, ifindex);

        /* The reference taken by the lookup is consumed by __netdev_put_lock(). */
        return dev ? __netdev_put_lock(dev, net) : NULL;
}

/* Variant of netdev_get_by_index_lock() that takes the lock through
 * __netdev_put_lock_ops_compat(). Returns the locked device, or NULL if no
 * usable device with @ifindex exists in @net.
 */
struct net_device *
netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
{
        struct net_device *dev = dev_get_by_index(net, ifindex);

        /* The reference taken by the lookup is consumed by the helper. */
        return dev ? __netdev_put_lock_ops_compat(dev, net) : NULL;
}

/* Iteration step over net->dev_by_index that returns devices locked.
 *
 * Unlocks @dev if it is non-NULL, then searches the xarray from *@index
 * onwards. A candidate is pinned under RCU and passed to
 * __netdev_put_lock(); candidates that cannot be locked are skipped by
 * incrementing *@index and searching again.
 *
 * Returns the next device with its lock held, or NULL when the xarray has
 * no further present entry.
 */
struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev,
                    unsigned long *index)
{
        struct net_device *cand;

        if (dev)
                netdev_unlock(dev);

        for (;;) {
                rcu_read_lock();
                cand = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
                if (cand)
                        dev_hold(cand);
                rcu_read_unlock();

                if (!cand)
                        return NULL;

                cand = __netdev_put_lock(cand, net);
                if (cand)
                        return cand;

                (*index)++;
        }
}

/* Iteration step over net->dev_by_index, using the ops_compat lock helpers.
 *
 * Unlocks @dev with netdev_unlock_ops_compat() if it is non-NULL, then
 * searches the xarray from *@index onwards. A candidate is pinned under RCU
 * and passed to __netdev_put_lock_ops_compat(); candidates that cannot be
 * locked are skipped by incrementing *@index and searching again.
 *
 * Returns the next usable device, or NULL when the xarray has no further
 * present entry.
 */
struct net_device *
netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
                               unsigned long *index)
{
        struct net_device *cand;

        if (dev)
                netdev_unlock_ops_compat(dev);

        for (;;) {
                rcu_read_lock();
                cand = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
                if (cand)
                        dev_hold(cand);
                rcu_read_unlock();

                if (!cand)
                        return NULL;

                cand = __netdev_put_lock_ops_compat(cand, net);
                if (cand)
                        return cand;

                (*index)++;
        }
}

/* Sequence lock that readers of dev->name use to detect a concurrent rename. */
static DEFINE_SEQLOCK(netdev_rename_lock);

/* Copy dev->name into @name (at most IFNAMSIZ bytes, via strscpy()).
 * The copy is repeated until a full pass completes without
 * netdev_rename_lock reporting an intervening writer.
 */
void netdev_copy_name(struct net_device *dev, char *name)
{
        unsigned int seq;

        do {
                seq = read_seqbegin(&netdev_rename_lock);
                strscpy(name, dev->name, IFNAMSIZ);
        } while (read_seqretry(&netdev_rename_lock, seq));
}

/**
 *        netdev_get_name - get a netdevice name, knowing its ifindex.
 *        @net: network namespace
 *        @name: a pointer to the buffer where the name will be stored.
 *        @ifindex: the ifindex of the interface to get the name from.
 *
 *        Return: 0 on success, -ENODEV if no device with @ifindex exists
 *        in @net (in which case @name is left untouched).
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
        struct net_device *dev;
        int ret = -ENODEV;

        rcu_read_lock();

        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev) {
                netdev_copy_name(dev, name);
                ret = 0;
        }

        rcu_read_unlock();
        return ret;
}

static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
                         const char *ha)
{
        return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
}

/**
 *        dev_getbyhwaddr_rcu - find a device by its hardware address
 *        @net: the applicable net namespace
 *        @type: media type of device
 *        @ha: hardware address
 *
 *        Search for an interface by MAC address. Returns NULL if the device
 *        is not found or a pointer to the device.
 *        The caller must hold RCU.
 *        The returned device has not had its ref count increased
 *        and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *ha)
{
        struct net_device *dev;

        for_each_netdev_rcu(net, dev)
                if (dev_addr_cmp(dev, type, ha))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

/**
 * dev_getbyhwaddr() - find a device by its hardware address
 * @net: the applicable net namespace
 * @type: media type of device
 * @ha: hardware address
 *
 * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
 * rtnl_lock.
 *
 * Context: rtnl_lock() must be held.
 * Return: pointer to the net_device, or NULL if not found
 */
struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
                                   const char *ha)
{
        struct net_device *dev;

        ASSERT_RTNL();
        for_each_netdev(net, dev)
                if (dev_addr_cmp(dev, type, ha))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 * netdev_get_by_flags_rcu - find any device with given flags
 * @net: the applicable net namespace
 * @tracker: tracking object for the acquired reference
 * @if_flags: IFF_* values
 * @mask: bitmask of bits in if_flags to check
 *
 * Search for any interface with the given flags. The first device whose
 * flags agree with @if_flags on every bit set in @mask is returned with a
 * reference taken via netdev_hold() using @tracker.
 *
 * Context: rcu_read_lock() must be held.
 * Return: NULL if a device is not found or a pointer to the device.
 */
struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
                                           unsigned short if_flags, unsigned short mask)
{
        struct net_device *cur;

        for_each_netdev_rcu(net, cur) {
                /* Any differing bit inside @mask disqualifies the device. */
                if ((READ_ONCE(cur->flags) ^ if_flags) & mask)
                        continue;

                netdev_hold(cur, tracker, GFP_ATOMIC);
                return cur;
        }

        return NULL;
}
EXPORT_IPV6_MOD(netdev_get_by_flags_rcu);

/**
 *        dev_valid_name - check if name is okay for network device
 *        @name: name string
 *
 *        Network device names need to be valid file names to
 *        allow sysfs to work.  We also disallow any kind of
 *        whitespace.
 *
 *        Rejected are: the empty string, names that do not fit in
 *        IFNAMSIZ including the terminator, "." and "..", and names
 *        containing '/', ':' or whitespace.
 */
bool dev_valid_name(const char *name)
{
        const char *p;

        if (*name == '\0')
                return false;
        if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
                return false;
        if (!strcmp(name, ".") || !strcmp(name, ".."))
                return false;

        for (p = name; *p; p++) {
                if (*p == '/' || *p == ':' || isspace(*p))
                        return false;
        }

        return true;
}
EXPORT_SYMBOL(dev_valid_name);

/* Mark the unit number embedded in @cur as taken in @inuse, provided @cur is
 * exactly what the "%d" template @fmt produces for a unit in [0, @limit).
 */
static void __dev_alloc_name_mark(const char *fmt, const char *cur,
                                  unsigned long *inuse, int limit)
{
        char roundtrip[IFNAMSIZ];
        int unit = 0;

        if (!sscanf(cur, fmt, &unit))
                return;
        if (unit < 0 || unit >= limit)
                return;

        /* avoid cases where sscanf is not exact inverse of printf */
        snprintf(roundtrip, IFNAMSIZ, fmt, unit);
        if (!strncmp(roundtrip, cur, IFNAMSIZ))
                __set_bit(unit, inuse);
}

/**
 *        __dev_alloc_name - allocate a name for a device
 *        @net: network namespace to allocate the device name in
 *        @name: name format string
 *        @res: result name string
 *
 *        Passed a format string - eg "lt%d" it will try and find a suitable
 *        id. It scans list of devices to build up a free map, then chooses
 *        the first empty slot. The caller must hold the dev_base or rtnl lock
 *        while allocating the name and adding the device in order to avoid
 *        duplicates.
 *        Both primary names and alternative names count as in use.
 *        Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *        Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
        const int max_netdevices = 8*PAGE_SIZE;
        struct netdev_name_node *alt;
        unsigned long *inuse;
        struct net_device *d;
        char buf[IFNAMSIZ];
        const char *pct;
        int unit;

        /* Verify the string as this thing may have come from the user.
         * There must be one "%d" and no other "%" characters.
         */
        pct = strchr(name, '%');
        if (!pct || pct[1] != 'd' || strchr(pct + 2, '%'))
                return -EINVAL;

        /* Use one page as a bit array of possible slots */
        inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
        if (!inuse)
                return -ENOMEM;

        for_each_netdev(net, d) {
                netdev_for_each_altname(d, alt)
                        __dev_alloc_name_mark(name, alt->name, inuse,
                                              max_netdevices);
                __dev_alloc_name_mark(name, d->name, inuse, max_netdevices);
        }

        unit = find_first_zero_bit(inuse, max_netdevices);
        bitmap_free(inuse);
        if (unit == max_netdevices)
                return -ENFILE;

        /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
        strscpy(buf, name, IFNAMSIZ);
        snprintf(res, IFNAMSIZ, buf, unit);
        return unit;
}

/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
static int dev_prep_valid_name(struct net *net, struct net_device *dev,
                               const char *want_name, char *out_name,
                               int dup_errno)
{
        if (!dev_valid_name(want_name))
                return -EINVAL;

        if (strchr(want_name, '%'))
                return __dev_alloc_name(net, want_name, out_name);

        if (netdev_name_in_use(net, want_name))
                return -dup_errno;
        if (out_name != want_name)
                strscpy(out_name, want_name, IFNAMSIZ);
        return 0;
}

/**
 *        dev_alloc_name - allocate a name for a device
 *        @dev: device
 *        @name: name format string
 *
 *        Passed a format string - eg "lt%d" it will try and find a suitable
 *        id. It scans list of devices to build up a free map, then chooses
 *        the first empty slot. The caller must hold the dev_base or rtnl lock
 *        while allocating the name and adding the device in order to avoid
 *        duplicates.
 *        The result is written to @dev->name. A @name without '%' is used
 *        literally and yields -ENFILE if it is already taken.
 *        Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *        Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        struct net *net = dev_net(dev);

        return dev_prep_valid_name(net, dev, name, dev->name, ENFILE);
}
EXPORT_SYMBOL(dev_alloc_name);

/*
 * Like dev_prep_valid_name() writing into dev->name, but with -EEXIST for
 * a name that is already taken and with any non-negative result (an
 * allocated unit id) collapsed to 0.
 */
static int dev_get_valid_name(struct net *net, struct net_device *dev,
                              const char *name)
{
        int unit = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);

        if (unit < 0)
                return unit;

        return 0;
}

/*
 * Rename @dev to @newname and announce the change.
 *
 * Returns 0 when the name is already @newname or the rename succeeded.
 * Otherwise returns the error from name validation, from device_rename(),
 * or from a NETDEV_CHANGENAME notifier. When a notifier rejects the new
 * name, the old one is restored by running the rename sequence a second
 * time with the two names swapped.
 */
int netif_change_name(struct net_device *dev, const char *newname)
{
        struct net *net = dev_net(dev);
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;

        ASSERT_RTNL_NET(net);

        /* Nothing to do if the device already carries this name. */
        if (!strncmp(newname, dev->name, IFNAMSIZ))
                return 0;

        /* Keep the current name for logging and for a possible rollback. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* dev_get_valid_name() writes the validated name into dev->name. */
        write_seqlock_bh(&netdev_rename_lock);
        err = dev_get_valid_name(net, dev, newname);
        write_sequnlock_bh(&netdev_rename_lock);

        if (err < 0)
                return err;

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s%s\n", oldname,
                            dev->flags & IFF_UP ? " (while UP)" : "");

        old_assign_type = dev->name_assign_type;
        WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);

        /*
         * Reached once for the forward rename and, if a notifier vetoes it,
         * a second time to propagate the restored name.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                /* Put the previous name and assign type back. */
                write_seqlock_bh(&netdev_rename_lock);
                memcpy(dev->name, oldname, IFNAMSIZ);
                write_sequnlock_bh(&netdev_rename_lock);
                WRITE_ONCE(dev->name_assign_type, old_assign_type);
                return ret;
        }

        netdev_adjacent_rename_links(dev, oldname);

        /* Remove the name node, wait in synchronize_net(), then re-add it. */
        netdev_name_node_del(dev->name_node);

        synchronize_net();

        netdev_name_node_add(net, dev->name_node);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First failure: remember it, swap old and new names
                         * and assign types, and redo the rename sequence.
                         */
                        err = ret;
                        write_seqlock_bh(&netdev_rename_lock);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        write_sequnlock_bh(&netdev_rename_lock);
                        memcpy(oldname, newname, IFNAMSIZ);
                        WRITE_ONCE(dev->name_assign_type, old_assign_type);
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* A notifier failed during the rollback pass too. */
                        netdev_err(dev, "name change rollback failed: %d\n",
                                   ret);
                }
        }

        return err;
}

/*
 * Replace the alias of @dev with the first @len bytes of @alias.
 *
 * A @len of zero removes the alias. The previous alias, if any, is freed
 * with kfree_rcu() once it has been swapped out.
 *
 * Returns @len on success, -EINVAL when @len is IFALIASZ or more, or
 * -ENOMEM when the new alias cannot be allocated.
 */
int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        struct dev_ifalias *fresh = NULL;
        struct dev_ifalias *stale;

        if (len >= IFALIASZ)
                return -EINVAL;

        if (len > 0) {
                /* Room for the header, the text and its terminating NUL. */
                fresh = kmalloc(sizeof(*fresh) + len + 1, GFP_KERNEL);
                if (!fresh)
                        return -ENOMEM;

                memcpy(fresh->ifalias, alias, len);
                fresh->ifalias[len] = '\0';
        }

        mutex_lock(&ifalias_mutex);
        stale = rcu_replace_pointer(dev->ifalias, fresh,
                                    mutex_is_locked(&ifalias_mutex));
        mutex_unlock(&ifalias_mutex);

        if (stale)
                kfree_rcu(stale, rcuhead);

        return len;
}

/**
 *        dev_get_alias - get ifalias of a device
 *        @dev: device
 *        @name: buffer to store name of ifalias
 *        @len: size of buffer
 *
 *        Copy the ifalias of a device into @name. Caller must make sure
 *        dev cannot go away, e.g. rcu read lock or own a reference count
 *        to device.
 *
 *        Returns the snprintf() result for the alias, or 0 when the device
 *        has no alias (in which case @name is left untouched).
 */
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
{
        const struct dev_ifalias *alias;
        int ret;

        rcu_read_lock();
        alias = rcu_dereference(dev->ifalias);
        ret = alias ? snprintf(name, len, "%s", alias->ifalias) : 0;
        rcu_read_unlock();

        return ret;
}

/**
 *        netdev_features_change - device changes features
 *        @dev: device to cause notification
 *
 *        Called to indicate a device has changed features. Sends
 *        NETDEV_FEAT_CHANGE through call_netdevice_notifiers(); the
 *        notifier result is not reported back to the caller.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/*
 * Announce a state change of @dev: when the device is up, run the
 * NETDEV_CHANGE notifiers and send an RTM_NEWLINK message. Does nothing
 * beyond the locking assertion for a device that is not up.
 */
void netif_state_change(struct net_device *dev)
{
        struct netdev_notifier_change_info change_info = {
                .info.dev = dev,
        };

        netdev_ops_assert_locked_or_invisible(dev);

        if (!(dev->flags & IFF_UP))
                return;

        call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
        rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}

/**
 * __netdev_notify_peers - notify network peers about existence of @dev,
 * to be called when rtnl lock is already held.
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Implemented by sending the NETDEV_NOTIFY_PEERS and NETDEV_RESEND_IGMP
 * notifications, in that order.
 */
void __netdev_notify_peers(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
EXPORT_SYMBOL(__netdev_notify_peers);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Takes the rtnl lock around __netdev_notify_peers(), so it must not be
 * called with that lock already held.
 */
void netdev_notify_peers(struct net_device *dev)
{
        rtnl_lock();
        __netdev_notify_peers(dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int napi_threaded_poll(void *data);

/*
 * Start the polling kthread for @n, named "napi/<dev name>-<napi id>".
 *
 * Returns 0 on success. On failure the error is logged, n->thread is reset
 * to NULL and the error code from kthread_run() is returned.
 */
static int napi_kthread_create(struct napi_struct *n)
{
        int err;

        /* Create and wake up the kthread once to put it in
         * TASK_INTERRUPTIBLE mode to avoid the blocked task
         * warning and work with loadavg.
         */
        n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
                                n->dev->name, n->napi_id);
        if (!IS_ERR(n->thread))
                return 0;

        err = PTR_ERR(n->thread);
        pr_err("kthread_run failed with err %d\n", err);
        n->thread = NULL;

        return err;
}

/*
 * Bring @dev up: run the NETDEV_PRE_UP notifiers, mark the link as started,
 * then call the driver's ndo_validate_addr() and ndo_open() hooks.
 *
 * Netpoll is disabled for the duration of the attempt and re-enabled on
 * every exit path taken after it was disabled, including a veto from a
 * NETDEV_PRE_UP notifier.
 *
 * Must be called with RTNL held. @extack is passed on to the NETDEV_PRE_UP
 * notifiers.
 *
 * Returns 0 on success, -ENODEV when the device is not present, or the
 * error reported by a notifier or by one of the driver hooks.
 */
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int ret;

        ASSERT_RTNL();
        dev_addr_check(dev);

        if (!netif_device_present(dev)) {
                /* may be detached because parent is runtime-suspended */
                if (dev->dev.parent)
                        pm_runtime_resume(dev->dev.parent);
                if (!netif_device_present(dev))
                        return -ENODEV;
        }

        /* Block netpoll from trying to do any rx path servicing.
         * If we don't do this there is a chance ndo_poll_controller
         * or ndo_poll may be running while we open the device
         */
        netpoll_poll_disable(dev);

        ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
        ret = notifier_to_errno(ret);
        if (ret) {
                /* Balance netpoll_poll_disable() above before bailing out. */
                netpoll_poll_enable(dev);
                return ret;
        }

        set_bit(__LINK_STATE_START, &dev->state);

        netdev_ops_assert_locked(dev);

        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);

        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);

        netpoll_poll_enable(dev);

        if (ret) {
                /* The open failed: the link never started. */
                clear_bit(__LINK_STATE_START, &dev->state);
        } else {
                netif_set_up(dev, true);
                dev_set_rx_mode(dev);
                dev_activate(dev);
                add_device_randomness(dev->dev_addr, dev->addr_len);
        }

        return ret;
}

/*
 * Open @dev unless it is already up.
 *
 * After a successful __dev_open() an RTM_NEWLINK message is sent and the
 * NETDEV_UP notifiers are run. Returns 0 if the device was already up,
 * otherwise the result of __dev_open().
 */
int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
        int ret;

        if (dev->flags & IFF_UP)
                return 0;

        ret = __dev_open(dev, extack);
        if (ret >= 0) {
                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING,
                             GFP_KERNEL, 0, NULL);
                call_netdevice_notifiers(NETDEV_UP, dev);
        }

        return ret;
}

/*
 * Close every device linked on @head through its close_list member.
 *
 * Runs in three passes: first each device gets netpoll disabled, the
 * NETDEV_GOING_DOWN notification and its __LINK_STATE_START bit cleared;
 * then dev_deactivate_many() is called on the whole list; finally each
 * driver's ndo_stop() is called, the device is marked down and netpoll is
 * re-enabled. Must be called with RTNL held and may sleep.
 */
static void __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *        Call the device specific close. This cannot fail.
                 *        Only if device is UP
                 *
                 *        We allow it to be called even after a DETACH hot-plug
                 *        event.
                 */

                netdev_ops_assert_locked(dev);

                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                netif_set_up(dev, false);
                /* Pairs with netpoll_poll_disable() in the first pass. */
                netpoll_poll_enable(dev);
        }
}

/*
 * Close a single device by putting it on a temporary one-entry list and
 * handing that list to __dev_close_many(). No IFF_UP check is made here.
 */
static void __dev_close(struct net_device *dev)
{
        LIST_HEAD(single);

        list_add(&dev->close_list, &single);
        __dev_close_many(&single);
        /* Unlink the on-stack list head before it goes out of scope. */
        list_del(&single);
}

/*
 * Close all devices linked on @head via close_list.
 *
 * Devices that are not up are removed from the list first. The remaining
 * ones are closed with __dev_close_many(), after which each gets an
 * RTM_NEWLINK message and a NETDEV_DOWN notification. When @unlink is
 * true the closed devices are also taken off the list.
 */
void netif_close_many(struct list_head *head, bool unlink)
{
        struct net_device *dev, *next;

        /* Remove the devices that don't need to be closed */
        list_for_each_entry_safe(dev, next, head, close_list) {
                if (dev->flags & IFF_UP)
                        continue;
                list_del_init(&dev->close_list);
        }

        __dev_close_many(head);

        list_for_each_entry_safe(dev, next, head, close_list) {
                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING,
                             GFP_KERNEL, 0, NULL);
                call_netdevice_notifiers(NETDEV_DOWN, dev);
                if (!unlink)
                        continue;
                list_del_init(&dev->close_list);
        }
}
EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL");

/*
 * Close @dev if it is up, by running netif_close_many() on a temporary
 * one-entry list. A device that is not up is left alone.
 */
void netif_close(struct net_device *dev)
{
        LIST_HEAD(single);

        if (!(dev->flags & IFF_UP))
                return;

        list_add(&dev->close_list, &single);
        netif_close_many(&single, true);
        list_del(&single);
}
EXPORT_SYMBOL(netif_close);

/*
 * Clear NETIF_F_LRO from @dev's wanted features, re-evaluate its feature
 * set, and warn if LRO is still enabled afterwards. The same is then done
 * recursively for every lower device, each under its own ops lock.
 */
void netif_disable_lro(struct net_device *dev)
{
        struct net_device *lower_dev;
        struct list_head *iter;

        dev->wanted_features &= ~NETIF_F_LRO;
        netdev_update_features(dev);

        if (unlikely(dev->features & NETIF_F_LRO))
                netdev_WARN(dev, "failed to disable LRO!\n");

        netdev_for_each_lower_dev(dev, lower_dev, iter) {
                netdev_lock_ops(lower_dev);
                netif_disable_lro(lower_dev);
                netdev_unlock_ops(lower_dev);
        }
}
EXPORT_IPV6_MOD(netif_disable_lro);

/**
 *        dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 *        @dev: device
 *
 *        Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 *        called under RTNL.  This is needed if Generic XDP is installed on
 *        the device.
 *
 *        Emits a warning if the feature is still set after the feature
 *        update. Unlike netif_disable_lro(), lower devices are not visited.
 */
static void dev_disable_gro_hw(struct net_device *dev)
{
        dev->wanted_features &= ~NETIF_F_GRO_HW;
        netdev_update_features(dev);

        if (unlikely(dev->features & NETIF_F_GRO_HW))
                netdev_WARN(dev, "failed to disable GRO_HW!\n");
}

/*
 * Map a netdev notifier event to its name as a string literal, e.g.
 * NETDEV_UP yields "NETDEV_UP". Events without a case below yield
 * "UNKNOWN_NETDEV_EVENT".
 */
const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
/* N(FOO) expands to: case NETDEV_FOO: return "NETDEV_FOO"; */
#define N(val)                                                 \
        case NETDEV_##val:                                \
                return "NETDEV_" __stringify(val);
        switch (cmd) {
        N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
        N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
        N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
        N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
        N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
        N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
        N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
        N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
        N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
        N(XDP_FEAT_CHANGE)
        }
#undef N
        return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);

/*
 * Invoke a single notifier block @nb directly (not through a chain) with
 * event @val for @dev. Returns whatever the notifier callback returns.
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
                                   struct net_device *dev)
{
        struct netdev_notifier_info info = { .dev = dev };

        return nb->notifier_call(nb, val, &info);
}

/*
 * Replay the registration of @dev to notifier @nb: send NETDEV_REGISTER
 * and, when the device is up, NETDEV_UP as well.
 *
 * Returns the errno derived from the NETDEV_REGISTER result, or 0. The
 * result of the NETDEV_UP call is ignored.
 */
static int call_netdevice_register_notifiers(struct notifier_block *nb,
                                             struct net_device *dev)
{
        int err;

        err = notifier_to_errno(call_netdevice_notifier(nb, NETDEV_REGISTER,
                                                        dev));
        if (err)
                return err;

        if (dev->flags & IFF_UP)
                call_netdevice_notifier(nb, NETDEV_UP, dev);

        return 0;
}

static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
                                                struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                        dev);
                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
        }
        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}

/*
 * Replay registration events for every device of @net to notifier @nb,
 * taking each device's ops lock around its replay.
 *
 * If one device fails, the devices visited before it get the matching
 * unregister events in reverse order and the error is returned.
 * Returns 0 when every device was replayed successfully.
 */
static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
                                                 struct net *net)
{
        struct net_device *dev;
        int err;

        for_each_netdev(net, dev) {
                netdev_lock_ops(dev);
                err = call_netdevice_register_notifiers(nb, dev);
                netdev_unlock_ops(dev);
                if (!err)
                        continue;

                /* Walk back from the failing device and undo the replay. */
                for_each_netdev_continue_reverse(net, dev)
                        call_netdevice_unregister_notifiers(nb, dev);
                return err;
        }

        return 0;
}

/*
 * Send the synthesized teardown events (see
 * call_netdevice_unregister_notifiers()) for every device of @net to @nb.
 */
static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
                                                    struct net *net)
{
        struct net_device *dev;

        for_each_netdev(net, dev)
                call_netdevice_unregister_notifiers(nb, dev);
}

/*
 * Initialised to 1. While it is non-zero, the notifier registration paths
 * return right after linking the notifier into its chain, without
 * replaying NETDEV_REGISTER/NETDEV_UP events to it.
 * NOTE(review): presumably cleared once boot-time initialisation has
 * finished; the write is outside this part of the file.
 */
static int dev_boot_phase = 1;

/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 *
 * The replay is skipped while dev_boot_phase is set. If the replay fails
 * for one namespace, the namespaces visited before it are unwound and the
 * notifier is removed from the chain again.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net *net;
        int err;

        /* Close race with setup_net() and cleanup_net() */
        down_write(&pernet_ops_rwsem);

        /* When RTNL is removed, we need protection for netdev_chain. */
        rtnl_lock();

        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* During the boot phase there is nothing to replay: err is 0 here. */
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                __rtnl_net_lock(net);
                err = call_netdevice_register_net_notifiers(nb, net);
                __rtnl_net_unlock(net);
                if (err)
                        goto rollback;
        }

unlock:
        rtnl_unlock();
        up_write(&pernet_ops_rwsem);
        return err;

rollback:
        /*
         * The failing namespace already unwound its own devices; undo the
         * namespaces that were fully replayed before it.
         */
        for_each_net_continue_reverse(net) {
                __rtnl_net_lock(net);
                call_netdevice_unregister_net_notifiers(nb, net);
                __rtnl_net_unlock(net);
        }

        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
        struct net *net;
        int err;

        /* Close race with setup_net() and cleanup_net() */
        down_write(&pernet_ops_rwsem);
        rtnl_lock();
        err = raw_notifier_chain_unregister(&netdev_chain, nb);
        if (err)
                goto unlock;

        /* Synthesize the teardown events in every namespace. */
        for_each_net(net) {
                __rtnl_net_lock(net);
                call_netdevice_unregister_net_notifiers(nb, net);
                __rtnl_net_unlock(net);
        }

unlock:
        rtnl_unlock();
        up_write(&pernet_ops_rwsem);
        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/*
 * Link @nb into the notifier chain of @net and, outside the boot phase,
 * replay the registration events of the devices in @net to it.
 *
 * A replay failure unlinks @nb again and is returned to the caller,
 * unless @ignore_call_fail is set, in which case @nb stays registered and
 * 0 is returned. A chain registration failure is always returned.
 */
static int __register_netdevice_notifier_net(struct net *net,
                                             struct notifier_block *nb,
                                             bool ignore_call_fail)
{
        int err;

        err = raw_notifier_chain_register(&net->netdev_chain, nb);
        if (err)
                return err;

        if (dev_boot_phase)
                return 0;

        err = call_netdevice_register_net_notifiers(nb, net);
        if (!err || ignore_call_fail)
                return 0;

        raw_notifier_chain_unregister(&net->netdev_chain, nb);
        return err;
}

/*
 * Unlink @nb from the notifier chain of @net and, if that succeeded, send
 * it the synthesized teardown events for every device in @net.
 * Returns the result of the chain unregistration.
 */
static int __unregister_netdevice_notifier_net(struct net *net,
                                               struct notifier_block *nb)
{
        int err = raw_notifier_chain_unregister(&net->netdev_chain, nb);

        if (!err)
                call_netdevice_unregister_net_notifiers(nb, net);

        return err;
}

/**
 * register_netdevice_notifier_net - register a per-netns network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 *
 * The per-netns RTNL lock of @net is taken for the duration of the call.
 */

int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
        int err;

        rtnl_net_lock(net);
        /* false: a failed replay undoes the registration. */
        err = __register_netdevice_notifier_net(net, nb, false);
        rtnl_net_unlock(net);

        return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);

/**
 * unregister_netdevice_notifier_net - unregister a per-netns
 *                                     network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier_net(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 *
 * The per-netns RTNL lock of @net is taken for the duration of the call.
 */

int unregister_netdevice_notifier_net(struct net *net,
                                      struct notifier_block *nb)
{
        int err;

        rtnl_net_lock(net);
        err = __unregister_netdevice_notifier_net(net, nb);
        rtnl_net_unlock(net);

        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);

/*
 * Move notifier @nb from @src_net to @dst_net: unregister it from the
 * source namespace, then register it in the destination one. The results
 * of both steps are ignored, and the registration is done with
 * ignore_call_fail set so a failed event replay leaves @nb registered.
 */
static void __move_netdevice_notifier_net(struct net *src_net,
                                          struct net *dst_net,
                                          struct notifier_block *nb)
{
        __unregister_netdevice_notifier_net(src_net, nb);
        __register_netdevice_notifier_net(dst_net, nb, true);
}

/*
 * Take the per-netns RTNL lock of the namespace @dev belongs to, together
 * with a passive reference on that namespace.
 *
 * With CONFIG_NET_NS, the namespace is re-checked once the lock is held;
 * if the device has moved in the meantime, the lock and the reference are
 * dropped and the whole sequence is retried. Undone by
 * rtnl_net_dev_unlock().
 */
static void rtnl_net_dev_lock(struct net_device *dev)
{
        bool again;

        do {
                struct net *net;

                again = false;

                /* netns might be being dismantled. */
                rcu_read_lock();
                net = dev_net_rcu(dev);
                net_passive_inc(net);
                rcu_read_unlock();

                rtnl_net_lock(net);

#ifdef CONFIG_NET_NS
                /* dev might have been moved to another netns. */
                if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
                        rtnl_net_unlock(net);
                        net_passive_dec(net);
                        again = true;
                }
#endif
        } while (again);
}

/*
 * Counterpart of rtnl_net_dev_lock(): release the per-netns RTNL lock of
 * @dev's namespace and drop the passive reference on it.
 */
static void rtnl_net_dev_unlock(struct net_device *dev)
{
        struct net *net = dev_net(dev);

        rtnl_net_unlock(net);
        net_passive_dec(net);
}

/*
 * Register @nb in the namespace @dev currently belongs to and, on
 * success, record it in @nn and link @nn on the device's
 * net_notifier_list. Returns 0 or the registration error; on error @nn is
 * not touched.
 */
int register_netdevice_notifier_dev_net(struct net_device *dev,
                                        struct notifier_block *nb,
                                        struct netdev_net_notifier *nn)
{
        int err;

        rtnl_net_dev_lock(dev);

        err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
        if (err)
                goto out_unlock;

        nn->nb = nb;
        list_add(&nn->list, &dev->net_notifier_list);

out_unlock:
        rtnl_net_dev_unlock(dev);

        return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);

/*
 * Undo register_netdevice_notifier_dev_net(): unlink @nn from its list
 * and unregister @nb from the namespace @dev currently belongs to.
 * @nn is unlinked before the unregistration is attempted, so it is
 * removed even when the unregistration returns an error.
 */
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
                                          struct notifier_block *nb,
                                          struct netdev_net_notifier *nn)
{
        int err;

        rtnl_net_dev_lock(dev);
        list_del(&nn->list);
        err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
        rtnl_net_dev_unlock(dev);

        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);

/*
 * Move every notifier recorded on @dev's net_notifier_list from the
 * device's current namespace to @net.
 * NOTE(review): no lock is taken here; presumably the caller already
 * holds what is needed while moving the device - confirm at the call site.
 */
static void move_netdevice_notifiers_dev_net(struct net_device *dev,
                                             struct net *net)
{
        struct netdev_net_notifier *nn;

        list_for_each_entry(nn, &dev->net_notifier_list, list)
                __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
}

/**
 *        call_netdevice_notifiers_info - call all network notifier blocks
 *        @val: value passed unmodified to notifier function
 *        @info: notifier information data
 *
 *        Call all network notifier blocks.  Parameters and return value
 *        are as for raw_notifier_call_chain().
 *
 *        The chain of the namespace owning @info->dev runs first. The
 *        global chain is skipped when the per-netns result has
 *        NOTIFY_STOP_MASK set.
 */
int call_netdevice_notifiers_info(unsigned long val,
                                  struct netdev_notifier_info *info)
{
        struct net *net = dev_net(info->dev);
        int ret;

        ASSERT_RTNL();

        /* Run per-netns notifier block chain first, then run the global one.
         * Hopefully, one day, the global one is going to be removed after
         * all notifier block registrators get converted to be per-netns.
         */
        ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
        if (!(ret & NOTIFY_STOP_MASK))
                ret = raw_notifier_call_chain(&netdev_chain, val, info);

        return ret;
}

/**
 *        call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 *                                               and roll back on error
 *        @val_up: value passed unmodified to notifier function
 *        @val_down: value passed unmodified to the notifier function when
 *                   recovering from an error on @val_up
 *        @info: notifier information data
 *
 *        Call all per-netns network notifier blocks, but not notifier blocks on
 *        the global notifier chain. Parameters and return value are as for
 *        raw_notifier_call_chain_robust().
 */

static int
call_netdevice_notifiers_info_robust(unsigned long val_up,
                                     unsigned long val_down,
                                     struct netdev_notifier_info *info)
{
        struct net *net = dev_net(info->dev);

        ASSERT_RTNL();

        return raw_notifier_call_chain_robust(&net->netdev_chain,
                                              val_up, val_down, info);
}

/*
 * Build a netdev_notifier_info carrying @dev and @extack and pass it to
 * call_netdevice_notifiers_info() with event @val. Returns that
 * function's result.
 */
static int call_netdevice_notifiers_extack(unsigned long val,
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_info info = {
                .dev = dev,
                .extack = extack,
        };

        return call_netdevice_notifiers_info(val, &info);
}

/**
 *        call_netdevice_notifiers - call all network notifier blocks
 *        @val: value passed unmodified to notifier function
 *        @dev: net_device pointer passed unmodified to notifier function
 *
 *        Call all network notifier blocks.  Parameters and return value
 *        are as for raw_notifier_call_chain().
 *
 *        Equivalent to call_netdevice_notifiers_extack() with a NULL extack.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/**
 *        call_netdevice_notifiers_mtu - call all network notifier blocks
 *        @val: value passed unmodified to notifier function
 *        @dev: net_device pointer passed unmodified to notifier function
 *        @arg: additional u32 argument passed to the notifier function
 *
 *        Call all network notifier blocks.  Parameters and return value
 *        are as for raw_notifier_call_chain().
 *
 *        @arg is delivered in the ext.mtu member of a
 *        struct netdev_notifier_info_ext wrapping the plain info structure.
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
                                        struct net_device *dev, u32 arg)
{
        struct netdev_notifier_info_ext info = {
                .info.dev = dev,
                .ext.mtu = arg,
        };

        /* The plain info must be the first member of the extended one. */
        BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

        return call_netdevice_notifiers_info(val, &info.info);
}

#ifdef CONFIG_NET_INGRESS
/*
 * Static key, initially false, adjusted by net_inc_ingress_queue() and
 * net_dec_ingress_queue().
 */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Increment ingress_needed_key. */
void net_inc_ingress_queue(void)
{
        static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Decrement ingress_needed_key. */
void net_dec_ingress_queue(void)
{
        static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
/*
 * Static key, initially false, adjusted by net_inc_egress_queue() and
 * net_dec_egress_queue().
 */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Increment egress_needed_key. */
void net_inc_egress_queue(void)
{
        static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Decrement egress_needed_key. */
void net_dec_egress_queue(void)
{
        static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

#ifdef CONFIG_NET_CLS_ACT
/*
 * Exported static key, initially false. It is only defined here; its
 * users are outside this part of the file.
 */
DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
EXPORT_SYMBOL(tcf_sw_enabled_key);
#endif

/*
 * Static key tested by net_timestamp_set() and net_timestamp_check() to
 * decide whether packets get a receive timestamp.
 */
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
/* Enable/disable requests not yet folded into netstamp_wanted. */
static atomic_t netstamp_needed_deferred;
/* Net number of timestamp users applied so far. */
static atomic_t netstamp_wanted;
/*
 * Work handler: fold the deferred requests into netstamp_wanted and switch
 * netstamp_needed_key on or off according to the resulting total.
 */
static void netstamp_clear(struct work_struct *work)
{
        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
        int wanted;

        wanted = atomic_add_return(deferred, &netstamp_wanted);
        if (wanted > 0)
                static_branch_enable(&netstamp_needed_key);
        else
                static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

/*
 * Register one more user of packet timestamps.
 *
 * With CONFIG_JUMP_LABEL: if netstamp_wanted is already positive it is
 * incremented directly with a compare-and-exchange loop. Otherwise the
 * request is recorded in netstamp_needed_deferred and netstamp_work is
 * scheduled to apply it and update the static key.
 * Without CONFIG_JUMP_LABEL the static key is incremented directly.
 */
void net_enable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
        int wanted = atomic_read(&netstamp_wanted);

        /* Fast path: the key is already on, just count the new user. */
        while (wanted > 0) {
                if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
                        return;
        }
        atomic_inc(&netstamp_needed_deferred);
        schedule_work(&netstamp_work);
#else
        static_branch_inc(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

/*
 * Drop one user of packet timestamps.
 *
 * With CONFIG_JUMP_LABEL: while more than one user remains,
 * netstamp_wanted is decremented directly with a compare-and-exchange
 * loop. Dropping the last user is instead recorded in
 * netstamp_needed_deferred and left to netstamp_work, which also updates
 * the static key.
 * Without CONFIG_JUMP_LABEL the static key is decremented directly.
 */
void net_disable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
        int wanted = atomic_read(&netstamp_wanted);

        /* Fast path: other users remain, so the key state cannot change. */
        while (wanted > 1) {
                if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
                        return;
        }
        atomic_dec(&netstamp_needed_deferred);
        schedule_work(&netstamp_work);
#else
        static_branch_dec(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp = 0;
        skb->tstamp_type = SKB_CLOCK_REALTIME;
        if (static_branch_unlikely(&netstamp_needed_key))
                skb->tstamp = ktime_get_real();
}

/*
 * net_timestamp_check(COND, SKB) - when netstamp_needed_key is enabled and
 * COND holds, stamp SKB with the current real time unless it already
 * carries a non-zero timestamp.
 *
 * NOTE(review): this expands to a bare if statement rather than a
 * do { } while (0) block, so take care when using it as the unbraced body
 * of an if/else.
 */
#define net_timestamp_check(COND, SKB)                                \
        if (static_branch_unlikely(&netstamp_needed_key)) {        \
                if ((COND) && !(SKB)->tstamp)                        \
                        (SKB)->tstamp = ktime_get_real();        \
        }                                                        \

/*
 * Exported wrapper around __is_skb_forwardable() with its third argument
 * set to true.
 * NOTE(review): presumably that flag requests the MTU/length check -
 * confirm against __is_skb_forwardable().
 */
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
        return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

/*
 * Prepare @skb for reception on @dev: run ____dev_forward_skb() and, when
 * that succeeds, set the protocol with eth_type_trans() and adjust the
 * receive checksum for the pulled Ethernet header.
 *
 * Returns 0 on success or the non-zero result of ____dev_forward_skb().
 */
static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
                              bool check_mtu)
{
        int ret = ____dev_forward_skb(dev, skb, check_mtu);

        if (unlikely(ret))
                return ret;

        skb->protocol = eth_type_trans(skb, dev);
        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

        return 0;
}

/*
 * Exported variant of __dev_forward_skb2() with check_mtu set to true.
 * The skb is only prepared here, not queued for reception.
 */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        return __dev_forward_skb2(dev, skb, true);
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *        NET_RX_SUCCESS        (no congestion)
 *        NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int err = __dev_forward_skb(dev, skb);

        /* Only hand the skb to the receive path if preparation succeeded. */
        if (err)
                return err;

        return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

/* Same as dev_forward_skb(), except that __dev_forward_skb2() is called
 * with check_mtu == false.
 */
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
{
        int ret = __dev_forward_skb2(dev, skb, false);

        if (ret)
                return ret;

        return netif_rx_internal(skb);
}

/* Hand @skb to the handler of @pt_prev. Returns -ENOMEM when
 * skb_orphan_frags_rx() fails; otherwise an extra reference is taken on
 * the skb and the result of pt_prev->func() is returned.
 */
static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
        int err = skb_orphan_frags_rx(skb, GFP_ATOMIC);

        if (unlikely(err))
                return -ENOMEM;

        refcount_inc(&skb->users);

        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

/* Walk @ptype_list and deliver @skb to the handlers whose type equals
 * @type. Delivery lags one entry behind: the most recent match is only
 * remembered and handed back through @pt, so the caller decides how the
 * last handler is invoked. On entry *@pt may already hold a pending
 * handler from an earlier list.
 */
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
                                          struct packet_type **pt,
                                          struct net_device *orig_dev,
                                          __be16 type,
                                          struct list_head *ptype_list)
{
        struct packet_type *cur, *pending = *pt;

        list_for_each_entry_rcu(cur, ptype_list, list) {
                if (cur->type == type) {
                        if (pending)
                                deliver_skb(skb, pending, orig_dev);
                        pending = cur;
                }
        }
        *pt = pending;
}

/* Tell whether @skb originated from the socket behind @ptype. Without an
 * af_packet_priv or an skb->sk there is nothing to match; otherwise the
 * id_match() callback decides when present, else the socket pointers are
 * compared directly.
 */
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
        if (ptype->af_packet_priv && skb->sk) {
                if (ptype->id_match)
                        return ptype->id_match(ptype, skb->sk);

                return (struct sock *)ptype->af_packet_priv == skb->sk;
        }

        return false;
}

/**
 * dev_nit_active_rcu - return true if any network interface taps are in use
 *
 * The caller must hold the RCU lock
 *
 * @dev: network device to check for the presence of taps
 */
bool dev_nit_active_rcu(const struct net_device *dev)
{
        /* Callers may hold either RCU or RCU BH lock */
        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());

        return !list_empty(&dev_net(dev)->ptype_all) ||
               !list_empty(&dev->ptype_all);
}
EXPORT_SYMBOL_GPL(dev_nit_active_rcu);

/*
 *        Support routine. Sends outgoing frames to any network
 *        taps currently in use.
 *
 *        Walks the ptype_all list of the device's network namespace and
 *        then dev->ptype_all. @skb is cloned once, when the first eligible
 *        tap is found, and every eligible tap receives that clone; @skb
 *        itself is never passed to a handler here.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        struct packet_type *ptype, *pt_prev = NULL;
        struct list_head *ptype_list;
        struct sk_buff *skb2 = NULL;

        rcu_read_lock();
        /* First pass: the ptype_all list of the network namespace */
        ptype_list = &dev_net_rcu(dev)->ptype_all;
again:
        list_for_each_entry_rcu(ptype, ptype_list, list) {
                /* Skip taps whose ignore_outgoing flag is set */
                if (READ_ONCE(ptype->ignore_outgoing))
                        continue;

                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if (skb_loop_sk(ptype, skb))
                        continue;

                /* A tap is already pending, so the clone exists: deliver it
                 * to the pending tap and remember the current one instead.
                 */
                if (pt_prev) {
                        deliver_skb(skb2, pt_prev, skb->dev);
                        pt_prev = ptype;
                        continue;
                }

                /* need to clone skb, done only once */
                skb2 = skb_clone(skb, GFP_ATOMIC);
                /* pt_prev is still NULL here, so nothing gets delivered */
                if (!skb2)
                        goto out_unlock;

                net_timestamp_set(skb2);

                /* skb->nh should be correctly
                 * set by sender, so that the second statement is
                 * just protection against buggy protocols.
                 */
                skb_reset_mac_header(skb2);

                if (skb_network_header(skb2) < skb2->data ||
                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
                                             ntohs(skb2->protocol),
                                             dev->name);
                        skb_reset_network_header(skb2);
                }

                skb2->transport_header = skb2->network_header;
                skb2->pkt_type = PACKET_OUTGOING;
                pt_prev = ptype;
        }

        /* Second pass: the device's own ptype_all list */
        if (ptype_list != &dev->ptype_all) {
                ptype_list = &dev->ptype_all;
                goto again;
        }
out_unlock:
        /* The last pending tap gets the clone directly, without the extra
         * reference deliver_skb() takes; if orphaning the frags fails the
         * clone is freed instead.
         */
        if (pt_prev) {
                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
                else
                        kfree_skb(skb2);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * A change of real_num_tx_queues can leave tc mappings pointing at queues
 * that are no longer in use. Each mapping is verified here; a priority
 * whose traffic class no longer fits within @txq is remapped to TC0. If
 * TC0 itself is invalid nothing can be done, so priority mappings are
 * disabled altogether. It is expected that drivers will fix this mapping
 * if they can before calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
        int prio;

        /* If TC0 is invalidated disable TC mapping */
        if (tc->offset + tc->count > txq) {
                netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
                dev->num_tc = 0;
                return;
        }

        /* Invalidated prio to tc mappings set to TC0 */
        for (prio = 1; prio <= TC_BITMASK; prio++) {
                int q = netdev_get_prio_tc_map(dev, prio);

                tc = &dev->tc_to_txq[q];
                if (tc->offset + tc->count <= txq)
                        continue;

                netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
                            prio, q);
                netdev_set_prio_tc_map(dev, prio, 0);
        }
}

/* Map tx queue @txq of @dev to its traffic class.
 *
 * Returns 0 when the device has no traffic classes configured, the index
 * of the first class whose [offset, offset + count) range contains @txq,
 * or -1 when no class matches.
 */
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
{
        struct netdev_tc_txq *classes;
        int idx;

        if (!dev->num_tc)
                return 0;

        classes = &dev->tc_to_txq[0];

        /* walk through the TCs and see if it falls into any of them */
        for (idx = 0; idx < TC_MAX_QUEUE; idx++) {
                if ((txq - classes[idx].offset) < classes[idx].count)
                        return idx;
        }

        /* didn't find it, just return -1 to indicate no match */
        return -1;
}
EXPORT_SYMBOL(netdev_txq_to_tc);

#ifdef CONFIG_XPS
/* Static keys: xps_needed is incremented in __netif_set_xps_queue() when a
 * device gets its first map of a type and decremented in reset_xps_maps();
 * xps_rxqs_needed is handled the same way, but only for XPS_RXQS maps.
 */
static struct static_key xps_needed __read_mostly;
static struct static_key xps_rxqs_needed __read_mostly;
/* Held by the functions below while XPS maps are inspected for update or
 * modified.
 */
static DEFINE_MUTEX(xps_map_mutex);
/* Update-side dereference of an RCU-protected XPS pointer; lockdep checks
 * that xps_map_mutex is held.
 */
#define xmap_dereference(P)                \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

/* Remove tx queue @index from the map stored at @dev_maps->attr_map[@tci].
 *
 * Returns false when the slot holds no map, or when @index was the only
 * entry left: in that case the slot is cleared in @dev_maps (and in
 * @old_maps when one is given) and the map is freed via kfree_rcu().
 * Returns true when a map is still installed afterwards, whether or not
 * @index was found in it.
 */
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
                             struct xps_dev_maps *old_maps, int tci, u16 index)
{
        struct xps_map *map = NULL;
        int pos;

        map = xmap_dereference(dev_maps->attr_map[tci]);
        if (!map)
                return false;

        /* Scan from the last entry down to the first */
        for (pos = map->len; pos--;) {
                if (map->queues[pos] != index)
                        continue;

                /* More than one entry: overwrite the match with the last
                 * entry and shrink the map by one.
                 */
                if (map->len > 1) {
                        map->queues[pos] = map->queues[--map->len];
                        break;
                }

                /* Last remaining entry: drop the whole map */
                if (old_maps)
                        RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                kfree_rcu(map, rcu);
                return false;
        }

        return true;
}

/* For id @cpu, remove the tx queues @offset .. @offset + @count - 1 from
 * the map of every traffic class in @dev_maps.
 *
 * Returns true if, for at least one traffic class, every call to
 * remove_xps_queue() reported that a map is still installed. @dev is not
 * used in this function.
 */
static bool remove_xps_queue_cpu(struct net_device *dev,
                                 struct xps_dev_maps *dev_maps,
                                 int cpu, u16 offset, u16 count)
{
        int num_tc = dev_maps->num_tc;
        bool active = false;
        int tci;

        /* Slots of one id are contiguous: cpu * num_tc .. + num_tc - 1 */
        for (tci = cpu * num_tc; num_tc--; tci++) {
                int i, j;

                /* Stop early once the map for this slot is gone */
                for (i = count, j = offset; i--; j++) {
                        if (!remove_xps_queue(dev_maps, NULL, tci, j))
                                break;
                }

                /* i only reaches -1 when the loop ran to completion, i.e.
                 * no removal returned false.
                 */
                active |= i < 0;
        }

        return active;
}

/* Drop the XPS maps of @type from @dev: decrement the xps_needed static key
 * (and xps_rxqs_needed for XPS_RXQS), clear dev->xps_maps[@type] and free
 * @dev_maps via kfree_rcu().
 *
 * NOTE(review): the _cpuslocked static key variants suggest the caller must
 * hold cpus_read_lock(); netif_reset_xps_queues() takes it and
 * __netif_set_xps_queue() documents it - confirm for any new caller.
 */
static void reset_xps_maps(struct net_device *dev,
                           struct xps_dev_maps *dev_maps,
                           enum xps_map_type type)
{
        static_key_slow_dec_cpuslocked(&xps_needed);
        if (type == XPS_RXQS)
                static_key_slow_dec_cpuslocked(&xps_rxqs_needed);

        RCU_INIT_POINTER(dev->xps_maps[type], NULL);

        kfree_rcu(dev_maps, rcu);
}

/* Remove the tx queues @offset .. @offset + @count - 1 from the XPS maps of
 * @type on @dev. When no id keeps an active map afterwards the whole
 * xps_dev_maps is released through reset_xps_maps(). For XPS_CPUS the NUMA
 * node of each affected tx queue is additionally reset to NUMA_NO_NODE.
 * Does nothing when the device has no maps of @type.
 */
static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
                           u16 offset, u16 count)
{
        struct xps_dev_maps *dev_maps;
        bool active = false;
        int id;

        dev_maps = xmap_dereference(dev->xps_maps[type]);
        if (!dev_maps)
                return;

        for (id = 0; id < dev_maps->nr_ids; id++) {
                if (remove_xps_queue_cpu(dev, dev_maps, id, offset, count))
                        active = true;
        }

        if (!active)
                reset_xps_maps(dev, dev_maps, type);

        if (type != XPS_CPUS)
                return;

        /* Highest queue first, down to @offset */
        while (count--)
                netdev_queue_numa_node_write(
                        netdev_get_tx_queue(dev, offset + count),
                        NUMA_NO_NODE);
}

/* Remove the tx queues @offset .. @offset + @count - 1 of @dev from its XPS
 * maps. Returns immediately unless the xps_needed static key is enabled.
 * The cleanup runs under cpus_read_lock() and xps_map_mutex; the XPS_RXQS
 * maps are only cleaned when xps_rxqs_needed is enabled, the XPS_CPUS maps
 * always.
 */
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
                                   u16 count)
{
        if (!static_key_false(&xps_needed))
                return;

        cpus_read_lock();
        mutex_lock(&xps_map_mutex);

        if (static_key_false(&xps_rxqs_needed))
                clean_xps_maps(dev, XPS_RXQS, offset, count);

        clean_xps_maps(dev, XPS_CPUS, offset, count);

        mutex_unlock(&xps_map_mutex);
        cpus_read_unlock();
}

/* Reset the XPS state of every tx queue of @dev starting at @index, i.e.
 * the dev->num_tx_queues - @index queues from @index upwards.
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}

/* Return a map with room for tx queue @index; @index itself is not added
 * here.
 *
 * @map is returned unchanged when it already contains @index or still has
 * a free slot. Otherwise a new zeroed map is allocated - XPS_MIN_MAP_ALLOC
 * entries when @map is NULL, twice the old alloc_len otherwise - and the
 * existing entries are copied over. The old map is not freed here.
 * For CPU maps (@is_rxqs_map false) the allocation is made on the NUMA node
 * of CPU @attr_index.
 *
 * Returns NULL if the allocation fails.
 */
static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
                                      u16 index, bool is_rxqs_map)
{
        struct xps_map *new_map;
        int alloc_len = XPS_MIN_MAP_ALLOC;
        int i, pos;

        /* Already present: nothing to expand */
        for (pos = 0; map && pos < map->len; pos++) {
                if (map->queues[pos] != index)
                        continue;
                return map;
        }

        /* Need to add tx-queue to this CPU's/rx-queue's existing map */
        if (map) {
                /* pos equals map->len here; a free slot is still left */
                if (pos < map->alloc_len)
                        return map;

                alloc_len = map->alloc_len * 2;
        }

        /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
         *  map
         */
        if (is_rxqs_map)
                new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
        else
                new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
                                       cpu_to_node(attr_index));
        if (!new_map)
                return NULL;

        /* pos is 0 when @map is NULL, so @map is not dereferenced then */
        for (i = 0; i < pos; i++)
                new_map->queues[i] = map->queues[i];
        new_map->alloc_len = alloc_len;
        new_map->len = pos;

        return new_map;
}

/* Copy the per-traffic-class map pointers of id @index from @dev_maps into
 * @new_dev_maps. When @skip_tc is set the slot of traffic class @tc is left
 * untouched in @new_dev_maps.
 */
static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
                              struct xps_dev_maps *new_dev_maps, int index,
                              int tc, bool skip_tc)
{
        int base = index * dev_maps->num_tc;
        int cls;

        /* copy maps belonging to foreign traffic classes */
        for (cls = 0; cls < dev_maps->num_tc; cls++) {
                struct xps_map *map;

                if (skip_tc && cls == tc)
                        continue;

                /* fill in the new device map from the old device map */
                map = xmap_dereference(dev_maps->attr_map[base + cls]);
                RCU_INIT_POINTER(new_dev_maps->attr_map[base + cls], map);
        }
}

/* Set the XPS map of @type for tx queue @index of @dev.
 *
 * @mask is a bitmap of ids (CPUs for XPS_CPUS, rx queues for XPS_RXQS) that
 * should use tx queue @index; for XPS_CPUS it is additionally restricted to
 * online CPUs when more than one CPU is possible. If at least one id is
 * selected, a new xps_dev_maps is built, published with
 * rcu_assign_pointer(), and the old maps are released with kfree_rcu().
 * @index is then removed from every id/traffic class slot that @mask does
 * not select, and the maps are dropped entirely if nothing stays active.
 *
 * Returns 0 on success, -EINVAL if @dev is a subordinate device (negative
 * num_tc) or @index does not fall into any traffic class, and -ENOMEM on
 * allocation failure.
 *
 * Must be called under cpus_read_lock
 */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
                          u16 index, enum xps_map_type type)
{
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
        const unsigned long *online_mask = NULL;
        bool active = false, copy = false;
        /* numa_node_id: -2 means no CPU seen yet, -1 means the selected CPUs
         * sit on different nodes (see the CONFIG_NUMA section below).
         */
        int i, j, tci, numa_node_id = -2;
        int maps_sz, num_tc = 1, tc = 0;
        struct xps_map *map, *new_map;
        unsigned int nr_ids;

        WARN_ON_ONCE(index >= dev->num_tx_queues);

        if (dev->num_tc) {
                /* Do not allow XPS on subordinate device directly */
                num_tc = dev->num_tc;
                if (num_tc < 0)
                        return -EINVAL;

                /* If queue belongs to subordinate dev use its map */
                dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

                tc = netdev_txq_to_tc(dev, index);
                if (tc < 0)
                        return -EINVAL;
        }

        mutex_lock(&xps_map_mutex);

        dev_maps = xmap_dereference(dev->xps_maps[type]);
        if (type == XPS_RXQS) {
                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
                nr_ids = dev->num_rx_queues;
        } else {
                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
                if (num_possible_cpus() > 1)
                        online_mask = cpumask_bits(cpu_online_mask);
                nr_ids = nr_cpu_ids;
        }

        if (maps_sz < L1_CACHE_BYTES)
                maps_sz = L1_CACHE_BYTES;

        /* The old dev_maps could be larger or smaller than the one we're
         * setting up now, as dev->num_tc or nr_ids could have been updated in
         * between. We could try to be smart, but let's be safe instead and only
         * copy foreign traffic classes if the two map sizes match.
         */
        if (dev_maps &&
            dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
                copy = true;

        /* allocate memory for queue storage */
        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
             j < nr_ids;) {
                /* new_dev_maps is allocated lazily, on the first selected id */
                if (!new_dev_maps) {
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                        if (!new_dev_maps) {
                                mutex_unlock(&xps_map_mutex);
                                return -ENOMEM;
                        }

                        new_dev_maps->nr_ids = nr_ids;
                        new_dev_maps->num_tc = num_tc;
                }

                tci = j * num_tc + tc;
                map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;

                /* Either the old map (it still has room or already holds
                 * @index) or a newly allocated, larger copy.
                 */
                map = expand_xps_map(map, j, index, type == XPS_RXQS);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
        }

        /* No id selected: only remove @index from the existing maps */
        if (!new_dev_maps)
                goto out_no_new_maps;

        if (!dev_maps) {
                /* Increment static keys at most once per type */
                static_key_slow_inc_cpuslocked(&xps_needed);
                if (type == XPS_RXQS)
                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
        }

        for (j = 0; j < nr_ids; j++) {
                bool skip_tc = false;

                tci = j * num_tc + tc;
                if (netif_attr_test_mask(j, mask, nr_ids) &&
                    netif_attr_test_online(j, online_mask, nr_ids)) {
                        /* add tx-queue to CPU/rx-queue maps */
                        int pos = 0;

                        /* This slot was filled by the allocation loop above,
                         * so it must not be overwritten by the copy below.
                         */
                        skip_tc = true;

                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (type == XPS_CPUS) {
                                if (numa_node_id == -2)
                                        numa_node_id = cpu_to_node(j);
                                else if (numa_node_id != cpu_to_node(j))
                                        numa_node_id = -1;
                        }
#endif
                }

                if (copy)
                        xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
                                          skip_tc);
        }

        rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);

        /* Cleanup old maps */
        if (!dev_maps)
                goto out_no_old_maps;

        for (j = 0; j < dev_maps->nr_ids; j++) {
                for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
                        map = xmap_dereference(dev_maps->attr_map[tci]);
                        if (!map)
                                continue;

                        /* Maps carried over into new_dev_maps stay alive */
                        if (copy) {
                                new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
                                if (map == new_map)
                                        continue;
                        }

                        RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                        kfree_rcu(map, rcu);
                }
        }

        old_dev_maps = dev_maps;

out_no_old_maps:
        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        if (type == XPS_CPUS)
                /* update Tx queue numa node */
                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                             (numa_node_id >= 0) ?
                                             numa_node_id : NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* removes tx-queue from unused CPUs/rx-queues */
        for (j = 0; j < dev_maps->nr_ids; j++) {
                tci = j * dev_maps->num_tc;

                for (i = 0; i < dev_maps->num_tc; i++, tci++) {
                        /* Slots selected by @mask for this tc keep @index */
                        if (i == tc &&
                            netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
                            netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
                                continue;

                        active |= remove_xps_queue(dev_maps,
                                                   copy ? old_dev_maps : NULL,
                                                   tci, index);
                }
        }

        if (old_dev_maps)
                kfree_rcu(old_dev_maps, rcu);

        /* free map if not active */
        if (!active)
                reset_xps_maps(dev, dev_maps, type);

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for (j = 0; j < nr_ids; j++) {
                for (i = num_tc, tci = j * num_tc; i--; tci++) {
                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        map = copy ?
                              xmap_dereference(dev_maps->attr_map[tci]) :
                              NULL;
                        /* Only maps allocated by this call are freed; maps
                         * shared with the old dev_maps are left in place.
                         */
                        if (new_map && new_map != map)
                                kfree(new_map);
                }
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);

/* Set the CPU-based (XPS_CPUS) XPS map of tx queue @index on @dev from the
 * cpumask @mask. Wraps __netif_set_xps_queue() in cpus_read_lock() /
 * cpus_read_unlock() and returns its result.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index)
{
        const unsigned long *bits = cpumask_bits(mask);
        int err;

        cpus_read_lock();
        err = __netif_set_xps_queue(dev, bits, index, XPS_CPUS);
        cpus_read_unlock();

        return err;
}
EXPORT_SYMBOL(netif_set_xps_queue);

#endif
/* Walk the tx queues of @dev from the last one down to the first and
 * unbind every subordinate device still attached to a queue.
 */
static void netdev_unbind_all_sb_channels(struct net_device *dev)
{
        unsigned int i = dev->num_tx_queues;

        /* Unbind any subordinate channels */
        while (i--) {
                struct netdev_queue *txq = &dev->_tx[i];

                if (txq->sb_dev)
                        netdev_unbind_sb_channel(dev, txq->sb_dev);
        }
}

/* Reset the traffic class configuration of @dev: with CONFIG_XPS the XPS
 * state of all tx queues is reset first, then all subordinate channels are
 * unbound, num_tc is set to 0 and the tc_to_txq and prio_tc_map tables are
 * zeroed.
 */
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, 0);
#endif
        netdev_unbind_all_sb_channels(dev);

        /* Reset TC configuration of device */
        dev->num_tc = 0;
        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);

/* Assign the tx queue range [@offset, @offset + @count) to traffic class
 * @tc of @dev. With CONFIG_XPS the XPS state of those queues is reset
 * before the new range is recorded.
 *
 * Returns 0 on success or -EINVAL when @tc is not below dev->num_tc.
 */
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
{
        if (tc >= dev->num_tc)
                return -EINVAL;

#ifdef CONFIG_XPS
        netif_reset_xps_queues(dev, offset, count);
#endif
        dev->tc_to_txq[tc].count = count;
        dev->tc_to_txq[tc].offset = offset;
        return 0;
}
EXPORT_SYMBOL(netdev_set_tc_queue);

/* Set the number of traffic classes of @dev to @num_tc. With CONFIG_XPS the
 * XPS state of all tx queues is reset first, and all subordinate channels
 * are unbound before the new value is stored.
 *
 * Returns 0 on success or -EINVAL when @num_tc exceeds TC_MAX_QUEUE.
 */
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
        if (num_tc > TC_MAX_QUEUE)
                return -EINVAL;

#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, 0);
#endif
        netdev_unbind_all_sb_channels(dev);

        dev->num_tc = num_tc;
        return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);

/* Detach the subordinate device @sb_dev from @dev: with CONFIG_XPS the XPS
 * state of @sb_dev is reset, its tc_to_txq and prio_tc_map tables are
 * zeroed, and every tx queue of @dev that points at @sb_dev has its sb_dev
 * pointer cleared.
 */
void netdev_unbind_sb_channel(struct net_device *dev,
                              struct net_device *sb_dev)
{
        unsigned int i = dev->num_tx_queues;

#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(sb_dev, 0);
#endif
        memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
        memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));

        /* Last queue first, down to queue 0 */
        while (i--) {
                struct netdev_queue *txq = &dev->_tx[i];

                if (txq->sb_dev == sb_dev)
                        txq->sb_dev = NULL;
        }
}
EXPORT_SYMBOL(netdev_unbind_sb_channel);

/* Bind the tx queues [@offset, @offset + @count) of @dev to the subordinate
 * device @sb_dev for traffic class @tc.
 *
 * Returns 0 on success, or -EINVAL when @sb_dev has a non-negative num_tc,
 * when @tc is not below dev->num_tc, or when the requested range reaches
 * past dev->real_num_tx_queues.
 */
int netdev_bind_sb_channel_queue(struct net_device *dev,
                                 struct net_device *sb_dev,
                                 u8 tc, u16 count, u16 offset)
{
        unsigned int i;

        /* Make certain the sb_dev and dev are already configured */
        if (sb_dev->num_tc >= 0)
                return -EINVAL;
        if (tc >= dev->num_tc)
                return -EINVAL;

        /* We cannot hand out queues we don't have */
        if ((offset + count) > dev->real_num_tx_queues)
                return -EINVAL;

        /* Record the mapping */
        sb_dev->tc_to_txq[tc].count = count;
        sb_dev->tc_to_txq[tc].offset = offset;

        /* Provide a way for Tx queue to find the tc_to_txq map or
         * XPS map for itself.
         */
        for (i = count; i--;)
                netdev_get_tx_queue(dev, offset + i)->sb_dev = sb_dev;

        return 0;
}
EXPORT_SYMBOL(netdev_bind_sb_channel_queue);

/* Mark @dev as subordinate channel @channel by storing the negated channel
 * number in dev->num_tc; a @channel of 0 resets num_tc to 0.
 *
 * Returns 0 on success, -ENODEV when @dev is a multiqueue device, or
 * -EINVAL when @channel exceeds S16_MAX.
 */
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
        /* Do not use a multiqueue device to represent a subordinate channel */
        if (netif_is_multiqueue(dev))
                return -ENODEV;

        /* We allow channels 1 - 32767 to be used for subordinate channels.
         * Channel 0 is meant to be "native" mode and used only to represent
         * the main root device. We allow writing 0 to reset the device back
         * to normal mode after being used as a subordinate channel.
         */
        if (channel > S16_MAX)
                return -EINVAL;

        /* A negative num_tc is what marks a device as subordinate; see the
         * num_tc checks in netdev_bind_sb_channel_queue() and
         * __netif_set_xps_queue().
         */
        dev->num_tc = -channel;

        return 0;
}
EXPORT_SYMBOL(netdev_set_sb_channel);

/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 *
 * Returns -EINVAL when @txq is zero or larger than dev->num_tx_queues, the
 * error from netdev_queue_update_kobjects() if that fails, and 0 otherwise.
 * For a device that is neither registered nor unregistering only
 * dev->real_num_tx_queues is updated.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
        bool disabling;
        int rc;

        /* Shrinking is detected up front, before the counter is updated */
        disabling = txq < dev->real_num_tx_queues;

        if (txq < 1 || txq > dev->num_tx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED ||
            dev->reg_state == NETREG_UNREGISTERING) {
                netdev_ops_assert_locked(dev);

                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
                                                  txq);
                if (rc)
                        return rc;

                /* Revalidate tc mappings against the new queue count */
                if (dev->num_tc)
                        netif_setup_tc(dev, txq);

                net_shaper_set_real_num_tx_queues(dev, txq);

                dev_qdisc_change_real_num_tx(dev, txq);

                dev->real_num_tx_queues = txq;

                /* Runs after the new count has been stored: wait in
                 * synchronize_net(), then reset the qdiscs through
                 * qdisc_reset_all_tx_gt() and the XPS entries of the
                 * queues from @txq upwards.
                 */
                if (disabling) {
                        synchronize_net();
                        qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
                        netif_reset_xps_queues_gt(dev, txq);
#endif
                }
        } else {
                dev->real_num_tx_queues = txq;
        }

        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

/**
 *        netif_set_real_num_rx_queues - set actual number of RX queues used
 *        @dev: Network device
 *        @rxq: Actual number of RX queues
 *
 *        This must be called either with the rtnl_lock held or before
 *        registration of the net device.  For a registered device the rx
 *        queue kobjects are updated first and a failure there is returned
 *        without changing the queue count.
 *
 *        Returns 0 on success, or a negative error code; -EINVAL if @rxq
 *        is zero or exceeds dev->num_rx_queues.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
        if (!rxq || rxq > dev->num_rx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED) {
                int err;

                netdev_ops_assert_locked(dev);

                err = net_rx_queue_update_kobjects(dev,
                                                   dev->real_num_rx_queues,
                                                   rxq);
                if (err)
                        return err;
        }

        dev->real_num_rx_queues = rxq;

        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);

/**
 *        netif_set_real_num_queues - set actual number of RX and TX queues used
 *        @dev: Network device
 *        @txq: Actual number of TX queues
 *        @rxq: Actual number of RX queues
 *
 *        Set the real number of both TX and RX queues.
 *        Does nothing if the number of queues is already correct.
 *
 *        Returns 0 on success, -EINVAL if either count is zero or exceeds
 *        the number of queues of @dev, or the error returned by a failed
 *        increase. If the TX increase fails after RX was changed, RX is
 *        set back to its previous value.
 */
int netif_set_real_num_queues(struct net_device *dev,
                              unsigned int txq, unsigned int rxq)
{
        /* Remembered so that an RX change can be undone */
        unsigned int old_rxq = dev->real_num_rx_queues;
        int err;

        if (txq < 1 || txq > dev->num_tx_queues ||
            rxq < 1 || rxq > dev->num_rx_queues)
                return -EINVAL;

        /* Start from increases, so the error path only does decreases -
         * decreases can't fail.
         */
        if (rxq > dev->real_num_rx_queues) {
                err = netif_set_real_num_rx_queues(dev, rxq);
                if (err)
                        return err;
        }
        if (txq > dev->real_num_tx_queues) {
                err = netif_set_real_num_tx_queues(dev, txq);
                if (err)
                        goto undo_rx;
        }
        /* Decreases are not expected to fail, hence only a WARN_ON */
        if (rxq < dev->real_num_rx_queues)
                WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
        if (txq < dev->real_num_tx_queues)
                WARN_ON(netif_set_real_num_tx_queues(dev, txq));

        return 0;
undo_rx:
        WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
        return err;
}
EXPORT_SYMBOL(netif_set_real_num_queues);

/**
 * netif_set_tso_max_size() - set the max size of TSO frames supported
 * @dev:        netdev to update
 * @size:        max skb->len of a TSO frame
 *
 * Set the limit on the size of TSO super-frames the device can handle.
 * Unless explicitly set the stack will assume the value of
 * %GSO_LEGACY_MAX_SIZE.
 *
 * The stored limit is capped at %GSO_MAX_SIZE. The device's gso_max_size
 * and gso_ipv4_max_size are lowered to @size when they currently exceed it.
 */
void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
{
        dev->tso_max_size = min(GSO_MAX_SIZE, size);
        /* NOTE(review): the comparisons below use the raw @size, not the
         * capped value stored above.
         */
        if (size < READ_ONCE(dev->gso_max_size))
                netif_set_gso_max_size(dev, size);
        if (size < READ_ONCE(dev->gso_ipv4_max_size))
                netif_set_gso_ipv4_max_size(dev, size);
}
EXPORT_SYMBOL(netif_set_tso_max_size);

/**
 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
 * @dev:        netdev to update
 * @segs:        max number of TCP segments
 *
 * Set the limit on the number of TCP segments the device can generate from
 * a single TSO super-frame.
 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
 *
 * The device's gso_max_segs is lowered to @segs when it currently exceeds
 * it.
 */
void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
{
        dev->tso_max_segs = segs;
        if (segs < READ_ONCE(dev->gso_max_segs))
                netif_set_gso_max_segs(dev, segs);
}
EXPORT_SYMBOL(netif_set_tso_max_segs);

/**
 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
 * @to:                netdev to update
 * @from:        netdev from which to copy the limits
 *
 * Applies @from's tso_max_size and tso_max_segs to @to through
 * netif_set_tso_max_size() and netif_set_tso_max_segs(), so the GSO limits
 * of @to are lowered as those helpers do.
 */
void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
{
        netif_set_tso_max_size(to, from->tso_max_size);
        netif_set_tso_max_segs(to, from->tso_max_segs);
}
EXPORT_SYMBOL(netif_inherit_tso_max);

/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * Online CPUs are counted once per group of topology siblings. The default
 * is that count if it is 1 or 2, or half of it (rounded up) if it is
 * larger. Returns 1 in a kdump kernel or when the temporary cpumask cannot
 * be allocated.
 */
int netif_get_num_default_rss_queues(void)
{
        cpumask_var_t cpus;
        int cpu, cores = 0;

        if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
                return 1;

        cpumask_copy(cpus, cpu_online_mask);
        for_each_cpu(cpu, cpus) {
                cores++;
                /* Drop this CPU's siblings so they are not counted again */
                cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
        }
        free_cpumask_var(cpus);

        if (cores <= 2)
                return cores;

        return DIV_ROUND_UP(cores, 2);
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);

/* Append @q to the tail of this CPU's softnet_data output queue and raise
 * NET_TX_SOFTIRQ. Local interrupts are disabled around the list update.
 */
static void __netif_reschedule(struct Qdisc *q)
{
        struct softnet_data *sd;
        unsigned long flags;

        local_irq_save(flags);
        sd = this_cpu_ptr(&softnet_data);
        /* @q becomes the new tail: terminate it, link it where the tail
         * pointer currently points, then advance the tail pointer.
         */
        q->next_sched = NULL;
        *sd->output_queue_tailp = q;
        sd->output_queue_tailp = &q->next_sched;
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}

/* Queue @q through __netif_reschedule() unless its __QDISC_STATE_SCHED bit
 * was already set; the atomic test-and-set ensures that only the caller
 * that sets the bit queues the qdisc.
 */
void __netif_schedule(struct Qdisc *q)
{
        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
                __netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

/* Layout overlaid on skb->cb by get_kfree_skb_cb(); holds the drop reason
 * recorded by dev_kfree_skb_irq_reason().
 */
struct dev_kfree_skb_cb {
        enum skb_drop_reason reason;
};

/* Return the control buffer (skb->cb) of @skb viewed as a
 * struct dev_kfree_skb_cb.
 */
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
        return (struct dev_kfree_skb_cb *)skb->cb;
}

/* Schedule the qdisc attached to @txq unless netif_xmit_stopped() reports
 * the queue as stopped. The qdisc pointer is read under rcu_read_lock().
 */
void netif_schedule_queue(struct netdev_queue *txq)
{
        rcu_read_lock();
        if (!netif_xmit_stopped(txq))
                __netif_schedule(rcu_dereference(txq->qdisc));
        rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/* Clear the __QUEUE_STATE_DRV_XOFF bit of @dev_queue and, if it was set,
 * schedule the queue's qdisc. Nothing happens when the bit was already
 * clear.
 */
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
        if (!test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state))
                return;

        rcu_read_lock();
        __netif_schedule(rcu_dereference(dev_queue->qdisc));
        rcu_read_unlock();
}
EXPORT_SYMBOL(netif_tx_wake_queue);

/* Drop one reference to @skb without freeing it in place. When the last
 * reference goes away, @reason is stored in the skb's control buffer, the
 * skb is pushed onto this CPU's softnet_data.completion_queue and
 * NET_TX_SOFTIRQ is raised; the actual free is presumably done by the
 * softirq handler (not visible here). A NULL @skb is ignored.
 */
void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        unsigned long flags;

        if (unlikely(!skb))
                return;

        /* Sole owner: set the count to zero directly instead of doing an
         * atomic decrement. Otherwise drop one reference and stop here
         * unless it was the last one.
         */
        if (likely(refcount_read(&skb->users) == 1)) {
                smp_rmb();
                refcount_set(&skb->users, 0);
        } else if (likely(!refcount_dec_and_test(&skb->users))) {
                return;
        }
        get_kfree_skb_cb(skb)->reason = reason;
        /* Push onto the head of the per-CPU completion list with local
         * interrupts disabled.
         */
        local_irq_save(flags);
        skb->next = __this_cpu_read(softnet_data.completion_queue);
        __this_cpu_write(softnet_data.completion_queue, skb);
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(dev_kfree_skb_irq_reason);

/* Free @skb with drop @reason from any context: outside hard IRQ and with
 * interrupts enabled it is freed directly via kfree_skb_reason(); otherwise
 * the free is handed to dev_kfree_skb_irq_reason().
 */
void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        if (!in_hardirq() && !irqs_disabled()) {
                kfree_skb_reason(skb, reason);
                return;
        }

        dev_kfree_skb_irq_reason(skb, reason);
}
EXPORT_SYMBOL(dev_kfree_skb_any_reason);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Clear the device's present bit. If the bit was set and the device is
 * running, stop all of its TX queues.
 */
void netif_device_detach(struct net_device *dev)
{
        if (!test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state))
                return;

        if (netif_running(dev))
                netif_tx_stop_all_queues(dev);
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Set the device's present bit. If the bit was previously clear and the
 * device is running, wake all of its TX queues and bring the watchdog up.
 */
void netif_device_attach(struct net_device *dev)
{
        if (test_and_set_bit(__LINK_STATE_PRESENT, &dev->state))
                return;
        if (!netif_running(dev))
                return;

        netif_tx_wake_all_queues(dev);
        netdev_watchdog_up(dev);
}
EXPORT_SYMBOL(netif_device_attach);

/*
 * Returns a Tx queue index for the given packet descriptor, using the
 * number of Tx queues (of the packet's traffic class, if any) as the
 * distribution range.
 */
static u16 skb_tx_hash(const struct net_device *dev,
                       const struct net_device *sb_dev,
                       struct sk_buff *skb)
{
        u16 qcount = dev->real_num_tx_queues;
        u16 qoffset = 0;
        u32 queue;

        /* With traffic classes, restrict the range to the class's queues. */
        if (dev->num_tc) {
                const u8 tc = netdev_get_prio_tc_map(dev, skb->priority);

                qoffset = sb_dev->tc_to_txq[tc].offset;
                qcount = sb_dev->tc_to_txq[tc].count;
                if (unlikely(!qcount)) {
                        net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
                                             sb_dev->name, qoffset, tc);
                        /* Fall back to the full range of real Tx queues. */
                        qoffset = 0;
                        qcount = dev->real_num_tx_queues;
                }
        }

        /* No recorded Rx queue: scale the flow hash into the range. */
        if (!skb_rx_queue_recorded(skb))
                return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;

        /* Otherwise fold the recorded Rx queue into [qoffset, qoffset + qcount). */
        DEBUG_NET_WARN_ON_ONCE(qcount == 0);
        queue = skb_get_rx_queue(skb);
        if (queue >= qoffset)
                queue -= qoffset;
        while (unlikely(queue >= qcount))
                queue -= qcount;
        return queue + qoffset;
}

/* Rate-limited diagnostic for an skb with unexpected offload state: dumps
 * the skb and emits a WARN naming the driver (or device) together with the
 * device features and the socket's route capabilities. A zeroed feature set
 * is printed when the skb has no device or no socket.
 */
void skb_warn_bad_offload(const struct sk_buff *skb)
{
        static const netdev_features_t null_features;
        const netdev_features_t *dev_caps = &null_features;
        const netdev_features_t *sk_caps = &null_features;
        struct net_device *dev = skb->dev;
        const char *name = "";

        if (!net_ratelimit())
                return;

        if (dev) {
                name = dev->dev.parent ? dev_driver_string(dev->dev.parent) :
                                         netdev_name(dev);
                dev_caps = &dev->features;
        }
        if (skb->sk)
                sk_caps = &skb->sk->sk_route_caps;

        skb_dump(KERN_WARNING, skb, false);
        WARN(1, "%s: caps=(%pNF, %pNF)\n",
             name, dev_caps, sk_caps);
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, in which case skb->ip_summed is set to
 * CHECKSUM_NONE (this also applies to the CHECKSUM_COMPLETE shortcut).
 * Returns -EINVAL for GSO skbs or when the checksum offsets fall outside
 * the linear head, -EFAULT when the frags are not readable, or the error
 * from __skb_linearize() / skb_ensure_writable().
 */
int skb_checksum_help(struct sk_buff *skb)
{
        __wsum csum;
        int ret = 0, offset;

        /* Nothing to compute; just reset ip_summed. */
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                goto out_set_summed;

        if (unlikely(skb_is_gso(skb))) {
                skb_warn_bad_offload(skb);
                return -EINVAL;
        }

        if (!skb_frags_readable(skb)) {
                return -EFAULT;
        }

        /* Before computing a checksum, we should make sure no frag could
         * be modified by an external entity : checksum could be wrong.
         */
        if (skb_has_shared_frag(skb)) {
                ret = __skb_linearize(skb);
                if (ret)
                        goto out;
        }

        /* The checksum start must lie inside the linear head. */
        offset = skb_checksum_start_offset(skb);
        ret = -EINVAL;
        if (unlikely(offset >= skb_headlen(skb))) {
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
                WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
                          offset, skb_headlen(skb));
                goto out;
        }
        /* Checksum everything from the start offset to the end of the skb. */
        csum = skb_checksum(skb, offset, skb->len - offset, 0);

        /* The 16-bit checksum field must also lie inside the linear head. */
        offset += skb->csum_offset;
        if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
                WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
                          offset + sizeof(__sum16), skb_headlen(skb));
                goto out;
        }
        ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
        if (ret)
                goto out;

        /* Store the folded checksum; a zero result is replaced by
         * CSUM_MANGLED_0.
         */
        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
        skb->ip_summed = CHECKSUM_NONE;
out:
        return ret;
}
EXPORT_SYMBOL(skb_checksum_help);

#ifdef CONFIG_NET_CRC32C
/* Compute the CRC32c of a CHECKSUM_PARTIAL skb in software and store it in
 * the checksum field of the SCTP header located at the checksum start.
 *
 * Returns 0 on success, and also when there is nothing to do (the skb is
 * not CHECKSUM_PARTIAL, or it is GSO). Returns -EINVAL when the checksum
 * field starts beyond the linear head, or the error reported by
 * __skb_linearize() / skb_ensure_writable().
 */
int skb_crc32c_csum_help(struct sk_buff *skb)
{
        int ret, offset, start;
        u32 crc;

        if (skb->ip_summed != CHECKSUM_PARTIAL)
                return 0;

        if (unlikely(skb_is_gso(skb)))
                return 0;

        /* Before computing a checksum, we should make sure no frag could
         * be modified by an external entity : checksum could be wrong.
         */
        if (unlikely(skb_has_shared_frag(skb))) {
                ret = __skb_linearize(skb);
                if (ret)
                        return ret;
        }

        start = skb_checksum_start_offset(skb);
        offset = start + offsetof(struct sctphdr, checksum);
        if (WARN_ON_ONCE(offset >= skb_headlen(skb)))
                return -EINVAL;

        ret = skb_ensure_writable(skb, offset + sizeof(__le32));
        if (ret)
                return ret;

        /* CRC covers everything from the checksum start to the end. */
        crc = ~skb_crc32c(skb, start, skb->len - start, ~0);
        *(__le32 *)(skb->data + offset) = cpu_to_le32(crc);
        skb_reset_csum_not_inet(skb);

        return 0;
}
EXPORT_SYMBOL(skb_crc32c_csum_help);
#endif /* CONFIG_NET_CRC32C */

/* Resolve the network protocol of @skb via vlan_get_protocol_and_depth(),
 * passing @depth through to it.
 *
 * When skb->protocol is ETH_P_TEB, the protocol is first taken from the
 * Ethernet header at skb->data; 0 is returned if that header cannot be
 * pulled.
 */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
        __be16 type = skb->protocol;

        /* Tunnel gso handlers can set protocol to ethernet. */
        if (type == htons(ETH_P_TEB)) {
                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
                        return 0;

                type = ((struct ethhdr *)skb->data)->h_proto;
        }

        return vlan_get_protocol_and_depth(skb, type, depth);
}


/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
/* Report a hardware RX checksum failure on @dev: log an error, dump @skb
 * through skb_dump() and print the current stack trace.
 */
static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
        netdev_err(dev, "hw csum failure\n");
        skb_dump(KERN_ERR, skb, true);
        dump_stack();
}

/* Exported entry point for reporting an RX checksum fault. The report is
 * routed through DO_ONCE_LITE(), so do_netdev_rx_csum_fault() is presumably
 * invoked only on the first call.
 */
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
        DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Returns 1 if @skb has a page fragment in high memory while @dev lacks
 * NETIF_F_HIGHDMA, 0 otherwise. Always 0 without CONFIG_HIGHMEM.
 *
 * XXX: check that highmem exists at all on the given machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int i;

        /* Device can DMA from highmem: nothing to check. */
        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                struct page *page = skb_frag_page(&skb_shinfo(skb)->frags[i]);

                if (page && PageHighMem(page))
                        return 1;
        }
#endif
        return 0;
}

/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 *
 * Returns @features, masked with skb->dev->mpls_features when @type is an
 * MPLS ethertype and CONFIG_NET_MPLS_GSO is enabled.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
                                           netdev_features_t features,
                                           __be16 type)
{
        if (eth_p_mpls(type))
                features &= skb->dev->mpls_features;

        return features;
}
#else
/* MPLS GSO support is compiled out: @features pass through unchanged. */
static netdev_features_t net_mpls_features(struct sk_buff *skb,
                                           netdev_features_t features,
                                           __be16 type)
{
        return features;
}
#endif

/* Final adjustment of @features for @skb:
 *  - apply the MPLS feature mask for the skb's network protocol,
 *  - drop checksum and GSO features when the skb carries checksum state
 *    (ip_summed != CHECKSUM_NONE) that @features cannot handle for that
 *    protocol,
 *  - drop scatter-gather when illegal_highdma() flags the skb.
 */
static netdev_features_t harmonize_features(struct sk_buff *skb,
        netdev_features_t features)
{
        __be16 type = skb_network_protocol(skb, NULL);

        features = net_mpls_features(skb, features, type);

        if (skb->ip_summed != CHECKSUM_NONE &&
            !can_checksum_protocol(features, type))
                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);

        if (illegal_highdma(skb->dev, skb))
                features &= ~NETIF_F_SG;

        return features;
}

/* Features check that accepts @features unchanged; @skb and @dev are not
 * examined. Presumably meant to be used as a device's ndo_features_check
 * implementation - confirm against callers.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features)
{
        return features;
}
EXPORT_SYMBOL(passthru_features_check);

/* Default features check, used by netif_skb_features() when the device
 * provides no ndo_features_check: defers to vlan_features_check(). @dev is
 * not used.
 */
static netdev_features_t dflt_features_check(struct sk_buff *skb,
                                             struct net_device *dev,
                                             netdev_features_t features)
{
        return vlan_features_check(skb, features);
}

/* Narrow @features for a GSO @skb about to be sent on @dev.
 *
 * All GSO features are removed when the skb exceeds the device's segment
 * count or size limits, or when it has no gso_type (which also triggers
 * skb_warn_bad_offload()). Otherwise individual features are stripped as
 * described by the comments below. Returns the resulting feature set.
 */
static netdev_features_t gso_features_check(const struct sk_buff *skb,
                                            struct net_device *dev,
                                            netdev_features_t features)
{
        u16 gso_segs = skb_shinfo(skb)->gso_segs;

        /* Too many segments for the device. */
        if (gso_segs > READ_ONCE(dev->gso_max_segs))
                return features & ~NETIF_F_GSO_MASK;

        /* Packet at or above the maximum GSO size for this device/skb. */
        if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
                return features & ~NETIF_F_GSO_MASK;

        if (!skb_shinfo(skb)->gso_type) {
                skb_warn_bad_offload(skb);
                return features & ~NETIF_F_GSO_MASK;
        }

        /* Support for GSO partial features requires software
         * intervention before we can actually process the packets
         * so we need to strip support for any partial features now
         * and we can pull them back in after we have partially
         * segmented the frame.
         */
        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
                features &= ~dev->gso_partial_features;

        /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header
         * has the potential to be fragmented so that TSO does not generate
         * segments with the same ID. For encapsulated packets, the ID mangling
         * feature is guaranteed not to use the same ID for the outer IPv4
         * headers of the generated segments if the headers have the potential
         * to be fragmented, so there is no need to clear the IPv4 ID mangling
         * feature (see the section about NETIF_F_TSO_MANGLEID in
         * segmentation-offloads.rst).
         */
        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
                struct iphdr *iph = skb->encapsulation ?
                                    inner_ip_hdr(skb) : ip_hdr(skb);

                if (!(iph->frag_off & htons(IP_DF)))
                        features &= ~NETIF_F_TSO_MANGLEID;
        }

        /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
         * so neither does TSO that depends on it.
         *
         * A network header length different from sizeof(struct ipv6hdr) is
         * taken as the sign of extension headers, unless
         * ipv6_has_hopopt_jumbo() accepts the skb.
         */
        if (features & NETIF_F_IPV6_CSUM &&
            (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 ||
             (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
              vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
            skb_transport_header_was_set(skb) &&
            skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
            !ipv6_has_hopopt_jumbo(skb))
                features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);

        return features;
}

/* Compute the set of device features usable for transmitting @skb on
 * skb->dev. Starting from dev->features, the set is successively narrowed
 * by the GSO checks, the encapsulation and VLAN feature masks, the driver's
 * (or the default) features check and finally harmonize_features().
 */
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        netdev_features_t features = dev->features;
        netdev_features_t allowed;

        if (skb_is_gso(skb))
                features = gso_features_check(skb, dev, features);

        /* If encapsulation offload request, verify we are testing
         * hardware encapsulation features instead of standard
         * features for the netdev
         */
        if (skb->encapsulation)
                features &= dev->hw_enc_features;

        if (skb_vlan_tagged(skb)) {
                netdev_features_t vlan_mask = dev->vlan_features |
                                              NETIF_F_HW_VLAN_CTAG_TX |
                                              NETIF_F_HW_VLAN_STAG_TX;

                features = netdev_intersect_features(features, vlan_mask);
        }

        /* Driver hook if present, default check otherwise. */
        allowed = dev->netdev_ops->ndo_features_check ?
                  dev->netdev_ops->ndo_features_check(skb, dev, features) :
                  dflt_features_check(skb, dev, features);
        features &= allowed;

        return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);

/* Transmit a single @skb on @txq of @dev and return the result of
 * netdev_start_xmit(). @more is passed through to netdev_start_xmit().
 *
 * If dev_nit_active_rcu() reports active taps, the skb is first handed to
 * dev_queue_xmit_nit().
 */
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
                    struct netdev_queue *txq, bool more)
{
        unsigned int len;
        int rc;

        if (dev_nit_active_rcu(dev))
                dev_queue_xmit_nit(skb, dev);

        /* Sample the length before the transmit call; the saved value is
         * what gets traced afterwards - presumably because the skb contents
         * should not be relied on once the driver has been given the skb.
         */
        len = skb->len;
        trace_net_dev_start_xmit(skb, dev);
        rc = netdev_start_xmit(skb, dev, txq, more);
        trace_net_dev_xmit(skb, rc, dev, len);

        return rc;
}

/* Transmit the skb list starting at @first on @txq of @dev, one skb at a
 * time.
 *
 * The status of the last attempt is stored in *@ret. Returns NULL when the
 * whole list was sent; otherwise returns the first skb that was not sent,
 * still linked to the remainder of the list. Transmission stops early when
 * an xmit does not complete, or when the queue is found stopped while skbs
 * remain (in which case *@ret is NETDEV_TX_BUSY).
 */
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret)
{
        struct sk_buff *cur = first;
        struct sk_buff *rest;
        int status = NETDEV_TX_OK;

        for (; cur; cur = rest) {
                rest = cur->next;

                skb_mark_not_on_list(cur);
                status = xmit_one(cur, dev, txq, rest != NULL);
                if (unlikely(!dev_xmit_complete(status))) {
                        /* Not sent: relink it in front of the remainder. */
                        cur->next = rest;
                        break;
                }

                if (netif_tx_queue_stopped(txq) && rest) {
                        status = NETDEV_TX_BUSY;
                        cur = rest;
                        break;
                }
        }

        *ret = status;
        return cur;
}

/* If @skb carries a VLAN tag that @features cannot offload for its
 * vlan_proto, push the tag into the packet via
 * __vlan_hwaccel_push_inside() and return that function's result.
 * Otherwise return @skb untouched.
 */
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
                                          netdev_features_t features)
{
        if (!skb_vlan_tag_present(skb))
                return skb;

        if (vlan_hw_offload_capable(features, skb->vlan_proto))
                return skb;

        return __vlan_hwaccel_push_inside(skb);
}

int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features)
{
        if (unlikely(skb_csum_is_sctp(skb)))
                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
                        skb_crc32c_csum_help(skb);

        if (features & NETIF_F_HW_CSUM)
                return 0;

        if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
                if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
                    skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
                    !ipv6_has_hopopt_jumbo(skb))
                        goto sw_checksum;

                switch (skb->csum_offset) {
                case offsetof(struct tcphdr, check):
                case offsetof(struct udphdr, check):
                        return 0;
                }
        }

sw_checksum:
        return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);

/* Checks if this SKB belongs to an HW offloaded socket and whether any SW
 * fallbacks are required based on dev.
 *
 * A full socket contributes its sk_validate_xmit_skb hook and an inet
 * TIME_WAIT socket its tw_validate_xmit_skb hook; when a hook is found its
 * result is returned. Without a hook, an skb still marked as decrypted
 * (the mark is checked in case skb_orphan() cleared the socket) is dropped
 * and NULL is returned. With CONFIG_SOCK_VALIDATE_XMIT disabled, @skb is
 * returned as is.
 */
static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
                                            struct net_device *dev)
{
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev,
                                       struct sk_buff *skb) = NULL;
        struct sock *sk = skb->sk;

        if (sk && sk_fullsock(sk))
                sk_validate = sk->sk_validate_xmit_skb;
        else if (sk && sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT)
                sk_validate = inet_twsk(sk)->tw_validate_xmit_skb;

        if (sk_validate)
                return sk_validate(sk, dev, skb);

        if (unlikely(skb_is_decrypted(skb))) {
                pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
                kfree_skb(skb);
                return NULL;
        }
#endif

        return skb;
}

/* Gatekeeper for skbs whose frags are not readable.
 *
 * Readable skbs pass through. An unreadable skb is freed (NULL returned)
 * when @dev does not have netmem_tx set, or when its first frag is a devmem
 * net_iov whose binding belongs to a different device.
 */
static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
                                                    struct net_device *dev)
{
        struct skb_shared_info *shinfo;
        struct net_iov *niov;

        if (likely(skb_frags_readable(skb)))
                return skb;

        if (!dev->netmem_tx)
                goto drop;

        shinfo = skb_shinfo(skb);
        if (shinfo->nr_frags == 0)
                return skb;

        /* Only the first frag is inspected. */
        niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
        if (net_is_devmem_iov(niov) &&
            net_devmem_iov_binding(niov)->dev != dev)
                goto drop;

        return skb;

drop:
        kfree_skb(skb);
        return NULL;
}

/* Prepare a single @skb for transmission on @dev.
 *
 * Runs, in order: the unreadable-frag check, feature computation, VLAN tag
 * handling, the socket validation hook, then either software GSO
 * segmentation or linearization plus checksum completion, and finally
 * validate_xmit_xfrm() (the only user of @again here).
 *
 * Returns the skb (or segment list) to transmit. When a step before
 * validate_xmit_xfrm() fails, NULL is returned and the device's tx_dropped
 * core stat is incremented; the skb is freed here or by the failing helper.
 * The return value of validate_xmit_xfrm() is passed through unchanged.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
        netdev_features_t features;

        skb = validate_xmit_unreadable_skb(skb, dev);
        if (unlikely(!skb))
                goto out_null;

        features = netif_skb_features(skb);
        skb = validate_xmit_vlan(skb, features);
        if (unlikely(!skb))
                goto out_null;

        skb = sk_validate_xmit_skb(skb, dev);
        if (unlikely(!skb))
                goto out_null;

        if (netif_needs_gso(skb, features)) {
                struct sk_buff *segs;

                /* On success with a non-NULL result, the original skb is
                 * released and replaced by the segment list; a NULL result
                 * leaves skb as it is.
                 */
                segs = skb_gso_segment(skb, features);
                if (IS_ERR(segs)) {
                        goto out_kfree_skb;
                } else if (segs) {
                        consume_skb(skb);
                        skb = segs;
                }
        } else {
                if (skb_needs_linearize(skb, features) &&
                    __skb_linearize(skb))
                        goto out_kfree_skb;

                /* If packet is not checksummed and device does not
                 * support checksumming for this protocol, complete
                 * checksumming here.
                 */
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        /* Point the (inner) transport header at the
                         * checksum start before calling the helper.
                         */
                        if (skb->encapsulation)
                                skb_set_inner_transport_header(skb,
                                                               skb_checksum_start_offset(skb));
                        else
                                skb_set_transport_header(skb,
                                                         skb_checksum_start_offset(skb));
                        if (skb_csum_hwoffload_help(skb, features))
                                goto out_kfree_skb;
                }
        }

        skb = validate_xmit_xfrm(skb, features, again);

        return skb;

out_kfree_skb:
        kfree_skb(skb);
out_null:
        dev_core_stats_tx_dropped_inc(dev);
        return NULL;
}

/* Run validate_xmit_skb() on every skb of the list starting at @skb and
 * chain the survivors (including any segment lists produced) into a new
 * list. Returns the head of that list, or NULL if nothing survived.
 */
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
{
        /* tail is only read once head is set, by which point it has been
         * assigned at the end of an earlier iteration.
         */
        struct sk_buff *next, *head = NULL, *tail;

        for (; skb != NULL; skb = next) {
                /* Detach the skb from the input list before validating. */
                next = skb->next;
                skb_mark_not_on_list(skb);

                /* in case skb won't be segmented, point to itself */
                skb->prev = skb;

                skb = validate_xmit_skb(skb, dev, again);
                if (!skb)
                        continue;

                if (!head)
                        head = skb;
                else
                        tail->next = skb;
                /* If skb was segmented, skb->prev points to
                 * the last segment. If not, it still contains skb.
                 */
                tail = skb->prev;
        }
        return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);

/* Initialise qdisc_skb_cb(skb)->pkt_len.
 *
 * The value starts as skb->len. For a GSO skb with the transport header
 * set, (gso_segs - 1) * hdr_len is added, where hdr_len is the size of the
 * headers up to and including the transport header. For SKB_GSO_DODGY skbs
 * gso_segs is recomputed from the payload length, and pkt_len is left at
 * skb->len if the payload is not positive.
 */
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);

        qdisc_skb_cb(skb)->pkt_len = skb->len;

        /* To get more precise estimation of bytes sent on wire,
         * we add to pkt_len the headers size of all segments
         */
        if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
                u16 gso_segs = shinfo->gso_segs;
                unsigned int hdr_len;

                /* mac layer + network layer */
                if (!skb->encapsulation)
                        hdr_len = skb_transport_offset(skb);
                else
                        hdr_len = skb_inner_transport_offset(skb);

                /* + transport layer */
                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
                        const struct tcphdr *th;
                        struct tcphdr _tcphdr;

                        /* If the TCP header cannot be read, hdr_len is left
                         * without the transport part.
                         */
                        th = skb_header_pointer(skb, hdr_len,
                                                sizeof(_tcphdr), &_tcphdr);
                        if (likely(th))
                                hdr_len += __tcp_hdrlen(th);
                } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
                        struct udphdr _udphdr;

                        /* Only checks that a UDP header is present. */
                        if (skb_header_pointer(skb, hdr_len,
                                               sizeof(_udphdr), &_udphdr))
                                hdr_len += sizeof(struct udphdr);
                }

                /* Do not rely on the recorded gso_segs for DODGY skbs;
                 * derive it from the payload length and gso_size instead.
                 */
                if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
                        int payload = skb->len - hdr_len;

                        /* Malicious packet. */
                        if (payload <= 0)
                                return;
                        gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
                }
                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
        }
}

/* Enqueue @skb on qdisc @q through its enqueue hook and return the result
 * masked with NET_XMIT_MASK. The qdisc_enqueue tracepoint fires only when
 * the masked result is NET_XMIT_SUCCESS. Skbs the qdisc wants freed are
 * collected on @to_free.
 */
static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
                             struct sk_buff **to_free,
                             struct netdev_queue *txq)
{
        const int verdict = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;

        if (verdict == NET_XMIT_SUCCESS)
                trace_qdisc_enqueue(q, txq, skb);

        return verdict;
}

/* Hand @skb to qdisc @q of @txq on @dev: transmit it directly when the
 * qdisc allows bypass and is idle, otherwise enqueue it and run the qdisc.
 *
 * Returns NET_XMIT_SUCCESS on the direct-transmit paths, NET_XMIT_DROP when
 * the skb is dropped here, or the masked result of the qdisc's enqueue.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        struct sk_buff *to_free = NULL;
        bool contended;
        int rc;

        qdisc_calculate_pkt_len(skb, q);

        tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);

        /* TCQ_F_NOLOCK qdiscs: this path does not take root_lock. */
        if (q->flags & TCQ_F_NOLOCK) {
                if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
                    qdisc_run_begin(q)) {
                        /* Retest nolock_qdisc_is_empty() within the protection
                         * of q->seqlock to protect from racing with requeuing.
                         */
                        if (unlikely(!nolock_qdisc_is_empty(q))) {
                                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                                __qdisc_run(q);
                                qdisc_run_end(q);

                                goto no_lock_out;
                        }

                        /* Qdisc is empty: bypass it and transmit directly. */
                        qdisc_bstats_cpu_update(q, skb);
                        if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
                            !nolock_qdisc_is_empty(q))
                                __qdisc_run(q);

                        qdisc_run_end(q);
                        return NET_XMIT_SUCCESS;
                }

                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                qdisc_run(q);

no_lock_out:
                if (unlikely(to_free))
                        kfree_skb_list_reason(to_free,
                                              tcf_get_drop_reason(to_free));
                return rc;
        }

        /* q->owner is set to the current CPU only around the
         * dev_qdisc_enqueue() call further down, so a match here means this
         * function was re-entered from within that enqueue: drop the skb.
         */
        if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
                kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
                return NET_XMIT_DROP;
        }
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits qdisc->running owner to get the lock more
         * often and dequeue packets faster.
         * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
         * and then other tasks will only enqueue packets. The packets will be
         * sent after the qdisc owner is scheduled again. To prevent this
         * scenario the task always serialize on the lock.
         */
        contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                /* Deactivated qdisc: queue the skb on to_free and report a drop. */
                __qdisc_drop(skb, &to_free);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */

                qdisc_bstats_update(q, skb);

                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
                        /* Release busylock before running the qdisc. */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }

                qdisc_run_end(q);
                rc = NET_XMIT_SUCCESS;
        } else {
                /* Mark this CPU as owner for the duration of the enqueue;
                 * see the q->owner check above.
                 */
                WRITE_ONCE(q->owner, smp_processor_id());
                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                WRITE_ONCE(q->owner, -1);
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                        qdisc_run_end(q);
                }
        }
        spin_unlock(root_lock);
        /* Free dropped skbs only after root_lock has been released. */
        if (unlikely(to_free))
                kfree_skb_list_reason(to_free,
                                      tcf_get_drop_reason(to_free));
        /* busylock is still held if no branch above released it. */
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
/* Assign skb->priority from the device's priomap, indexed by the prioidx
 * of the skb's full socket. Nothing is changed when the skb already has a
 * non-zero priority, the device has no priomap, the skb has no full socket,
 * or the index lies outside the map.
 */
static void skb_update_prio(struct sk_buff *skb)
{
        const struct netprio_map *map;
        const struct sock *sk;
        unsigned int prioidx;

        if (skb->priority)
                return;
        map = rcu_dereference_bh(skb->dev->priomap);
        if (!map)
                return;
        sk = skb_to_full_sk(skb);
        if (!sk)
                return;

        prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);

        if (prioidx < map->priomap_len)
                skb->priority = map->priomap[prioidx];
}
#else
/* CONFIG_CGROUP_NET_PRIO disabled: expands to nothing. */
#define skb_update_prio(skb)
#endif

/**
 *        dev_loopback_xmit - loop back @skb
 *        @net: network namespace this loopback is happening in
 *        @sk:  sk needed to be a netfilter okfn
 *        @skb: buffer to transmit
 *
 *        Resets the MAC header, pulls the skb up to its network header,
 *        marks it PACKET_LOOPBACK and feeds it to netif_rx(). @net and @sk
 *        are not used by the function body.
 *
 *        Return: always 0.
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb_reset_mac_header(skb);
        __skb_pull(skb, skb_network_offset(skb));
        skb->pkt_type = PACKET_LOOPBACK;
        /* A packet with no checksum state is marked as not needing one. */
        if (skb->ip_summed == CHECKSUM_NONE)
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        /* A dst is expected here; only debug builds warn when it is missing. */
        DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
        skb_dst_force(skb);
        netif_rx(skb);
        return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);

#ifdef CONFIG_NET_EGRESS
/* Return the TX queue of @dev selected by the queue mapping stored in
 * @skb, after the mapping has been passed through netdev_cap_txqueue().
 */
static struct netdev_queue *
netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
{
        int qm;

        qm = skb_get_queue_mapping(skb);
        qm = netdev_cap_txqueue(dev, qm);

        return netdev_get_tx_queue(dev, qm);
}

/* Accessors for the "skip txqueue" flag. Without CONFIG_PREEMPT_RT the flag
 * is kept in this CPU's softnet_data.xmit; with CONFIG_PREEMPT_RT it is
 * kept in current->net_xmit.
 */
#ifndef CONFIG_PREEMPT_RT
/* Read the flag from this CPU's softnet_data. */
static bool netdev_xmit_txqueue_skipped(void)
{
        return __this_cpu_read(softnet_data.xmit.skip_txqueue);
}

/* Store @skip into this CPU's softnet_data. */
void netdev_xmit_skip_txqueue(bool skip)
{
        __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);

#else
/* Read the flag from the current task. */
static bool netdev_xmit_txqueue_skipped(void)
{
        return current->net_xmit.skip_txqueue;
}

/* Store @skip into the current task. */
void netdev_xmit_skip_txqueue(bool skip)
{
        current->net_xmit.skip_txqueue = skip;
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif
#endif /* CONFIG_NET_EGRESS */

#ifdef CONFIG_NET_XGRESS
/* Classify @skb with the filters of the mini qdisc attached to @entry.
 *
 * Returns TC_ACT_UNSPEC when CONFIG_NET_CLS_ACT is disabled, when no mini
 * qdisc is attached, or when software classification is bypassed (globally
 * or for the block); otherwise returns the verdict of tcf_classify().
 *
 * *@drop_reason seeds the skb's drop reason before classification and, on
 * TC_ACT_SHOT, is updated from the skb afterwards.
 */
static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
                  enum skb_drop_reason *drop_reason)
{
        int ret = TC_ACT_UNSPEC;
#ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
        struct tcf_result res;

        if (!miniq)
                return ret;

        /* Global bypass */
        if (!static_branch_likely(&tcf_sw_enabled_key))
                return ret;

        /* Block-wise bypass */
        if (tcf_block_bypass_sw(miniq->block))
                return ret;

        /* Reset per-skb tc state before classifying. */
        tc_skb_cb(skb)->mru = 0;
        tc_skb_cb(skb)->post_ct = false;
        tcf_set_drop_reason(skb, *drop_reason);

        mini_qdisc_bstats_cpu_update(miniq, skb);
        ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
        /* Only tcf related quirks below. */
        switch (ret) {
        case TC_ACT_SHOT:
                /* Report the drop reason and count the drop. */
                *drop_reason = tcf_get_drop_reason(skb);
                mini_qdisc_qstats_cpu_drop(miniq);
                break;
        case TC_ACT_OK:
        case TC_ACT_RECLASSIFY:
                /* Record the minor part of the matched class id. */
                skb->tc_index = TC_H_MIN(res.classid);
                break;
        }
#endif /* CONFIG_NET_CLS_ACT */
        return ret;
}

/* Static key, initially false, adjusted by tcx_inc() and tcx_dec(). */
static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);

/* Take one reference on tcx_needed_key. */
void tcx_inc(void)
{
        static_branch_inc(&tcx_needed_key);
}

/* Drop one reference on tcx_needed_key. */
void tcx_dec(void)
{
        static_branch_dec(&tcx_needed_key);
}

/* Run the BPF programs of @entry, as iterated by bpf_mprog_foreach_prog(),
 * on @skb, stopping at the first program whose return value is not
 * TCX_NEXT. The last return value (TCX_NEXT if no program ran) is
 * translated by tcx_action_code().
 *
 * When @needs_mac is set, the skb is pushed by skb->mac_len before the
 * programs run and pulled back by skb->mac_len afterwards.
 */
static __always_inline enum tcx_action_base
tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
        const bool needs_mac)
{
        const struct bpf_mprog_fp *fp;
        const struct bpf_prog *prog;
        int ret = TCX_NEXT;

        if (needs_mac)
                __skb_push(skb, skb->mac_len);
        bpf_mprog_foreach_prog(entry, fp, prog) {
                /* Recomputed before every program run. */
                bpf_compute_data_pointers(skb);
                ret = bpf_prog_run(prog, skb);
                if (ret != TCX_NEXT)
                        break;
        }
        if (needs_mac)
                __skb_pull(skb, skb->mac_len);
        return tcx_action_code(skb, ret);
}

/* Ingress hook: run the tcx programs and/or the tc classifier attached to
 * skb->dev on @skb and act on the resulting verdict.
 *
 * @skb:      packet being received
 * @pt_prev:  pending packet_type handler; when *@pt_prev is set the skb is
 *            delivered to it first and *@pt_prev is reset to NULL
 * @ret:      receives the deliver_skb() result when *@pt_prev was pending,
 *            and NET_RX_SUCCESS / NET_RX_DROP when the skb is taken over here
 * @orig_dev: passed through to deliver_skb()
 * @another:  set to true when skb_do_redirect() returns -EAGAIN
 *
 * Returns @skb when the caller should keep processing it, or NULL when the
 * skb was redirected, dropped or consumed by this function.
 */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev, bool *another)
{
        struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int sch_ret;

        /* Nothing attached on ingress for this device. */
        if (!entry)
                return skb;

        /* Every exit path below must pair this with bpf_net_ctx_clear(). */
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        qdisc_skb_cb(skb)->pkt_len = skb->len;
        tcx_set_ingress(skb, true);

        /* tcx programs run first; any verdict other than TC_ACT_UNSPEC
         * bypasses tc_run().
         */
        if (static_branch_unlikely(&tcx_needed_key)) {
                sch_ret = tcx_run(entry, skb, true);
                if (sch_ret != TC_ACT_UNSPEC)
                        goto ingress_verdict;
        }
        sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
ingress_verdict:
        switch (sch_ret) {
        case TC_ACT_REDIRECT:
                /* skb_mac_header check was done by BPF, so we can safely
                 * push the L2 header back before redirecting to another
                 * netdev.
                 */
                __skb_push(skb, skb->mac_len);
                if (skb_do_redirect(skb) == -EAGAIN) {
                        /* Undo the push, flag *another and hand the skb
                         * back to the caller.
                         */
                        __skb_pull(skb, skb->mac_len);
                        *another = true;
                        break;
                }
                *ret = NET_RX_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        case TC_ACT_SHOT:
                kfree_skb_reason(skb, drop_reason);
                *ret = NET_RX_DROP;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        /* used by tc_run */
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
        case TC_ACT_TRAP:
                consume_skb(skb);
                fallthrough;
        case TC_ACT_CONSUMED:
                /* For TC_ACT_CONSUMED the skb is not released here. */
                *ret = NET_RX_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        }
        /* Any other verdict (and the -EAGAIN redirect case): continue. */
        bpf_net_ctx_clear(bpf_net_ctx);

        return skb;
}

/* Egress hook: run the tcx programs and/or the tc classifier attached to
 * @dev on @skb and act on the resulting verdict.
 *
 * @skb: packet being transmitted
 * @ret: receives NET_XMIT_SUCCESS / NET_XMIT_DROP when the skb is taken over
 *       here; left untouched otherwise
 * @dev: device whose tcx_egress entry is consulted
 *
 * Returns @skb when the caller should keep transmitting it, or NULL when the
 * skb was redirected, dropped or consumed by this function.
 */
static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
        struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int sch_ret;

        /* Nothing attached on egress for this device. */
        if (!entry)
                return skb;

        /* Every exit path below must pair this with bpf_net_ctx_clear(). */
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
         * already set by the caller.
         */
        if (static_branch_unlikely(&tcx_needed_key)) {
                /* Any verdict other than TC_ACT_UNSPEC bypasses tc_run(). */
                sch_ret = tcx_run(entry, skb, false);
                if (sch_ret != TC_ACT_UNSPEC)
                        goto egress_verdict;
        }
        sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
egress_verdict:
        switch (sch_ret) {
        case TC_ACT_REDIRECT:
                /* No need to push/pop skb's mac_header here on egress! */
                /* The skb_do_redirect() result is not inspected here. */
                skb_do_redirect(skb);
                *ret = NET_XMIT_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        case TC_ACT_SHOT:
                kfree_skb_reason(skb, drop_reason);
                *ret = NET_XMIT_DROP;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        /* used by tc_run */
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
        case TC_ACT_TRAP:
                consume_skb(skb);
                fallthrough;
        case TC_ACT_CONSUMED:
                /* For TC_ACT_CONSUMED the skb is not released here. */
                *ret = NET_XMIT_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        }
        /* Any other verdict: let the caller continue with the skb. */
        bpf_net_ctx_clear(bpf_net_ctx);

        return skb;
}
#else
/* Stub for builds without CONFIG_NET_XGRESS: there is no ingress hook, so
 * @skb is returned untouched and none of the other arguments are used.
 */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev, bool *another)
{
        return skb;
}

/* Stub for builds without CONFIG_NET_XGRESS: there is no egress hook, so
 * @skb is returned untouched and @ret / @dev are not used.
 */
static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
        return skb;
}
#endif /* CONFIG_NET_XGRESS */

#ifdef CONFIG_XPS
/* Look up a TX queue for @skb in @dev_maps.
 *
 * @dev:      device whose priority-to-tc mapping and TX queue count are used
 * @skb:      packet being steered (priority and flow hash are read)
 * @dev_maps: XPS maps to search
 * @tci:      row index into the maps
 *
 * Returns the selected queue index, or -1 when the traffic class or @tci is
 * out of range, no map is installed for the slot, or the selected queue is
 * not below dev->real_num_tx_queues.
 */
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
                               struct xps_dev_maps *dev_maps, unsigned int tci)
{
        int tc = netdev_get_prio_tc_map(dev, skb->priority);
        struct xps_map *qmap;
        int txq;

        if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
                return -1;

        /* attr_map[] is indexed as row * num_tc + traffic class. */
        tci = tci * dev_maps->num_tc + tc;

        qmap = rcu_dereference(dev_maps->attr_map[tci]);
        if (!qmap)
                return -1;

        /* A single-entry map needs no hashing. */
        if (qmap->len == 1)
                txq = qmap->queues[0];
        else
                txq = qmap->queues[reciprocal_scale(skb_get_hash(skb),
                                                    qmap->len)];

        if (unlikely(txq >= dev->real_num_tx_queues))
                return -1;

        return txq;
}
#endif

/* Select a TX queue for @skb from the XPS maps of @sb_dev.
 *
 * With CONFIG_XPS, the receive-queue based map (XPS_RXQS) is tried first when
 * xps_rxqs_needed is enabled; if that yields nothing, the CPU based map
 * (XPS_CPUS) is tried with index skb->sender_cpu - 1. The lookups are done
 * under rcu_read_lock().
 *
 * Returns the queue index, or -1 when XPS is compiled out, xps_needed is
 * disabled, or no map produced a usable queue.
 */
static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
                         struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct sock *sk = skb->sk;
        int txq = -1;

        if (!static_key_false(&xps_needed))
                return -1;

        rcu_read_lock();

        if (static_key_false(&xps_rxqs_needed)) {
                maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
                if (maps) {
                        int rxq = sk_rx_queue_get(sk);

                        if (rxq >= 0)
                                txq = __get_xps_queue_idx(dev, skb, maps, rxq);
                }
        }

        if (txq < 0) {
                maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
                if (maps) {
                        unsigned int cpu_slot = skb->sender_cpu - 1;

                        txq = __get_xps_queue_idx(dev, skb, maps, cpu_slot);
                }
        }

        rcu_read_unlock();

        return txq;
#else
        return -1;
#endif
}

/* Queue selector that always returns queue 0; none of the arguments are
 * examined.
 */
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev)
{
        return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);

/* Choose a TX queue index for @skb on @dev.
 *
 * The queue cached on the socket (sk_tx_queue_get()) is reused when it is
 * valid, below dev->real_num_tx_queues, and skb->ooo_okay is not set.
 * Otherwise a queue is picked via get_xps_queue(), falling back to
 * skb_tx_hash(), and stored back on the socket when it differs from the
 * cached one and the socket is a full socket with a dst cache entry.
 *
 * @sb_dev may be NULL, in which case @dev is used in its place.
 */
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev)
{
        struct sock *sk = skb->sk;
        int cached = sk_tx_queue_get(sk);
        int picked;

        if (!sb_dev)
                sb_dev = dev;

        /* Fast path: the cached queue is still usable. */
        if (cached >= 0 && !skb->ooo_okay &&
            cached < dev->real_num_tx_queues)
                return cached;

        picked = get_xps_queue(dev, sb_dev, skb);
        if (picked < 0)
                picked = skb_tx_hash(dev, sb_dev, skb);

        if (cached != picked && sk && sk_fullsock(sk) &&
            rcu_access_pointer(sk->sk_dst_cache))
                sk_tx_queue_set(sk, picked);

        return picked;
}
EXPORT_SYMBOL(netdev_pick_tx);

/* Pick the TX queue for @skb on @dev, record it in the skb's queue mapping
 * and return the matching netdev_queue.
 *
 * Devices with exactly one real TX queue always use queue 0. Otherwise the
 * driver's ndo_select_queue() is used when present, netdev_pick_tx() when
 * not, and the result is passed through netdev_cap_txqueue().
 */
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb,
                                         struct net_device *sb_dev)
{
        int txq_idx = 0;

#ifdef CONFIG_XPS
        u32 cpu_slot = skb->sender_cpu - 1;

        /* sender_cpu stores cpu + 1; when sender_cpu - 1 is not below
         * NR_CPUS, record the current CPU instead.
         */
        if (cpu_slot >= (u32)NR_CPUS)
                skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

        if (dev->real_num_tx_queues != 1) {
                const struct net_device_ops *ops = dev->netdev_ops;

                txq_idx = ops->ndo_select_queue ?
                          ops->ndo_select_queue(dev, skb, sb_dev) :
                          netdev_pick_tx(dev, skb, sb_dev);
                txq_idx = netdev_cap_txqueue(dev, txq_idx);
        }

        skb_set_queue_mapping(skb, txq_idx);
        return netdev_get_tx_queue(dev, txq_idx);
}

/**
 * __dev_queue_xmit() - transmit a buffer
 * @skb:        buffer to transmit
 * @sb_dev:        subordinate device used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * When calling this method, interrupts MUST be enabled. This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method. (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * Return:
 * * 0                                - buffer successfully transmitted
 * * positive qdisc return code        - NET_XMIT_DROP etc.
 * * negative errno                - other errors
 */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq = NULL;
        struct Qdisc *q;
        int rc = -ENOMEM;
        bool again = false;

        skb_reset_mac_header(skb);
        skb_assert_len(skb);

        if (unlikely(skb_shinfo(skb)->tx_flags &
                     (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
                __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        qdisc_pkt_len_init(skb);
        tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
        if (static_branch_unlikely(&egress_needed_key)) {
                /* Netfilter egress hook first; a NULL skb means it was
                 * taken over and rc has been filled in.
                 */
                if (nf_hook_egress_active()) {
                        skb = nf_hook_egress(skb, &rc, dev);
                        if (!skb)
                                goto out;
                }

                netdev_xmit_skip_txqueue(false);

                /* nf_skip_egress() is set around the tc/tcx egress hook and
                 * cleared again once the skb comes back.
                 */
                nf_skip_egress(skb, true);
                skb = sch_handle_egress(skb, &rc, dev);
                if (!skb)
                        goto out;
                nf_skip_egress(skb, false);

                /* If the egress hook asked to skip txqueue selection, use
                 * the queue mapping already present on the skb.
                 */
                if (netdev_xmit_txqueue_skipped())
                        txq = netdev_tx_queue_mapping(dev, skb);
        }
#endif
        /* If device/qdisc don't need skb->dst, release it right now while
         * its hot in this cpu cache.
         */
        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                skb_dst_drop(skb);
        else
                skb_dst_force(skb);

        if (!txq)
                txq = netdev_core_pick_tx(dev, skb, sb_dev);

        q = rcu_dereference_bh(txq->qdisc);

        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue op takes the packet from here. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
         * loopback, all the sorts of tunnels...
         *
         * Really, it is unlikely that netif_tx_lock protection is necessary
         * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
         * counters.)
         * However, it is possible, that they rely on protection
         * made by us here.
         *
         * Check this and shot the lock. It is not prone from deadlocks.
         * Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Other cpus might concurrently change txq->xmit_lock_owner
                 * to -1 or to their cpu id, but not to our id.
                 */
                if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
                        if (dev_xmit_recursion())
                                goto recursion_alert;

                        skb = validate_xmit_skb(skb, dev, &again);
                        if (!skb)
                                goto out;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_xmit_stopped(txq)) {
                                /* Track nesting depth around the driver call. */
                                dev_xmit_recursion_inc();
                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
                                dev_xmit_recursion_dec();
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        /* Queue stopped or transmit not complete: warn and
                         * fall through to the drop path below.
                         */
                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
                                             dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
                                             dev->name);
                }
        }

        /* Drop path: device not up, recursion detected, or the queueless
         * transmit above did not complete.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb_list(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
EXPORT_SYMBOL(__dev_queue_xmit);

/* Transmit @skb directly on TX queue @queue_id of skb->dev; no qdisc is
 * consulted in this function.
 *
 * Returns the netdev_start_xmit() result, NETDEV_TX_BUSY when the queue is
 * frozen or stopped by the driver (the skb is not freed in that case), or
 * NET_XMIT_DROP when the device is not running, has no carrier, or
 * validation did not return the original skb (the skb list is freed and
 * tx_dropped is incremented).
 */
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
        struct net_device *dev = skb->dev;
        struct sk_buff *orig_skb = skb;
        struct netdev_queue *txq;
        int ret = NETDEV_TX_BUSY;
        bool again = false;

        if (unlikely(!netif_running(dev) ||
                     !netif_carrier_ok(dev)))
                goto drop;

        /* Anything other than the original skb coming back (NULL or a
         * different skb) is treated as a drop.
         */
        skb = validate_xmit_skb_list(skb, dev, &again);
        if (skb != orig_skb)
                goto drop;

        skb_set_queue_mapping(skb, queue_id);
        txq = skb_get_tx_queue(dev, skb);

        /* BHs stay disabled across the TX lock and the driver call. */
        local_bh_disable();

        dev_xmit_recursion_inc();
        HARD_TX_LOCK(dev, txq, smp_processor_id());
        if (!netif_xmit_frozen_or_drv_stopped(txq))
                ret = netdev_start_xmit(skb, dev, txq, false);
        HARD_TX_UNLOCK(dev, txq);
        dev_xmit_recursion_dec();

        local_bh_enable();
        return ret;
drop:
        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb_list(skb);
        return NET_XMIT_DROP;
}
EXPORT_SYMBOL(__dev_direct_xmit);

/*************************************************************************
 *                        Receiver routines
 *************************************************************************/
/* Per-CPU task pointer. ____napi_schedule() compares it with napi->thread
 * when use_backlog_threads() is true.
 */
static DEFINE_PER_CPU(struct task_struct *, backlog_napi);

int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */

/* Called with irq disabled.
 *
 * Schedule @napi for polling. A threaded NAPI whose thread exists is handed
 * to that thread (marked NAPI_STATE_SCHED_THREADED and woken), unless backlog
 * threads are in use and the thread is this CPU's backlog_napi. In every
 * other case @napi is appended to @sd->poll_list and NET_RX_SOFTIRQ is raised
 * when not already running inside net_rx_action().
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
        struct task_struct *thread;

        lockdep_assert_irqs_disabled();

        if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
                /* Paired with smp_mb__before_atomic() in
                 * napi_enable()/netif_set_threaded().
                 * Use READ_ONCE() to guarantee a complete
                 * read on napi->thread. Only call
                 * wake_up_process() when it's not NULL.
                 */
                thread = READ_ONCE(napi->thread);
                if (thread) {
                        /* This CPU's own backlog thread: use the local
                         * poll list instead of waking the thread.
                         */
                        if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
                                goto use_local_napi;

                        set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
                        wake_up_process(thread);
                        return;
                }
        }

use_local_napi:
        DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
        list_add_tail(&napi->poll_list, &sd->poll_list);
        WRITE_ONCE(napi->list_owner, smp_processor_id());
        /* If not called from net_rx_action()
         * we have to raise NET_RX_SOFTIRQ.
         */
        if (!sd->in_net_rx_action)
                raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* Static branch keys for RPS and RFS, exported for use outside this file.
 * NOTE(review): the code that enables/disables them is not in this part of
 * the file - confirm where they are toggled.
 */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);

/* Map @hash to a slot index of @flow_table: hash_32() reduced to
 * flow_table->log bits.
 */
static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
{
        return hash_32(hash, flow_table->log);
}

#ifdef CONFIG_RFS_ACCEL
/**
 * rps_flow_is_active - tell whether a flow has seen traffic recently.
 * @rflow: flow entry to examine.
 * @flow_table: per-queue flow table containing @rflow.
 * @cpu: CPU recorded in @rflow.
 *
 * The flow's saved queue tail is compared with the current input queue head
 * of @cpu. Once the head has advanced by 10 times the table size
 * (10 << flow_table->log) or more past the saved tail, the flow is treated
 * as stale. A @cpu that is not below nr_cpu_ids is never active.
 *
 * Return: true if the flow was recently active.
 */
static bool rps_flow_is_active(struct rps_dev_flow *rflow,
                               struct rps_dev_flow_table *flow_table,
                               unsigned int cpu)
{
        unsigned int head;
        unsigned int last_seen;
        int age;
        int limit;

        if (cpu >= nr_cpu_ids)
                return false;

        head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
        last_seen = READ_ONCE(rflow->last_qtail);

        /* Signed difference so that counter wrap-around is tolerated. */
        age = (int)(head - last_seen);
        limit = (int)(10 << flow_table->log);

        return age < limit;
}
#endif

/* Record @next_cpu as the CPU for a flow and, with CONFIG_RFS_ACCEL, try to
 * steer the flow to a matching hardware RX queue.
 *
 * @dev:      receiving device
 * @skb:      packet belonging to the flow
 * @rflow:    flow entry currently used for this flow
 * @next_cpu: CPU the flow should move to (may be >= nr_cpu_ids)
 * @hash:     flow hash of @skb
 * @flow_id:  slot index of the flow in the flow table
 *
 * Returns the flow entry that now describes the flow: @rflow itself, or the
 * entry in the target RX queue's table when hardware steering succeeded.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
            struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
            u32 flow_id)
{
        if (next_cpu < nr_cpu_ids) {
                u32 head;
#ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
                struct rps_dev_flow_table *flow_table;
                struct rps_dev_flow *old_rflow;
                struct rps_dev_flow *tmp_rflow;
                unsigned int tmp_cpu;
                u16 rxq_index;
                int rc;

                /* Should we steer this flow to a different hardware queue? */
                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
                    !(dev->features & NETIF_F_NTUPLE))
                        goto out;
                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                /* Already arriving on the queue mapped to next_cpu. */
                if (rxq_index == skb_get_rx_queue(skb))
                        goto out;

                rxqueue = dev->_rx + rxq_index;
                flow_table = rcu_dereference(rxqueue->rps_flow_table);
                if (!flow_table)
                        goto out;

                tmp_rflow = &flow_table->flows[flow_id];
                tmp_cpu = READ_ONCE(tmp_rflow->cpu);

                /* If the target slot holds a filter for a flow that is still
                 * active, leave it alone when it belongs to a different hash
                 * or already targets next_cpu.
                 */
                if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
                        if (rps_flow_is_active(tmp_rflow, flow_table,
                                               tmp_cpu)) {
                                if (hash != READ_ONCE(tmp_rflow->hash) ||
                                    next_cpu == tmp_cpu)
                                        goto out;
                        }
                }

                /* A negative result means the driver did not install a
                 * filter; otherwise rc is the filter id.
                 */
                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                                                        rxq_index, flow_id);
                if (rc < 0)
                        goto out;

                old_rflow = rflow;
                rflow = tmp_rflow;
                WRITE_ONCE(rflow->filter, rc);
                WRITE_ONCE(rflow->hash, hash);

                /* The previous entry carried the same filter id: clear it. */
                if (old_rflow->filter == rc)
                        WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
        out:
#endif
                /* Save next_cpu's current input queue head as the flow's
                 * last queue tail.
                 */
                head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
                rps_input_queue_tail_save(&rflow->last_qtail, head);
        }

        WRITE_ONCE(rflow->cpu, next_cpu);
        return rflow;
}

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * @dev:    receiving device
 * @skb:    received packet (its network header is reset and its flow hash
 *          computed here when RPS/RFS is active for the queue)
 * @rflowp: set to the selected flow entry when the RFS flow table chose the
 *          CPU; left untouched otherwise
 *
 * Returns the target CPU, or -1 when no steering applies (recorded RX queue
 * out of range, no flow table and no map on the queue, zero hash, or no
 * online CPU found).
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                       struct rps_dev_flow **rflowp)
{
        const struct rps_sock_flow_table *sock_flow_table;
        struct netdev_rx_queue *rxqueue = dev->_rx;
        struct rps_dev_flow_table *flow_table;
        struct rps_map *map;
        int cpu = -1;
        u32 flow_id;
        u32 tcpu;
        u32 hash;

        if (skb_rx_queue_recorded(skb)) {
                u16 index = skb_get_rx_queue(skb);

                if (unlikely(index >= dev->real_num_rx_queues)) {
                        /* Only warn for devices with more than one RX queue. */
                        WARN_ONCE(dev->real_num_rx_queues > 1,
                                  "%s received packet on queue %u, but number "
                                  "of RX queues is %u\n",
                                  dev->name, index, dev->real_num_rx_queues);
                        goto done;
                }
                rxqueue += index;
        }

        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */

        flow_table = rcu_dereference(rxqueue->rps_flow_table);
        map = rcu_dereference(rxqueue->rps_map);
        if (!flow_table && !map)
                goto done;

        skb_reset_network_header(skb);
        hash = skb_get_hash(skb);
        if (!hash)
                goto done;

        sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
        if (flow_table && sock_flow_table) {
                struct rps_dev_flow *rflow;
                u32 next_cpu;
                u32 ident;

                /* First check into global flow table if there is a match.
                 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
                 */
                ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
                /* The entry keeps hash bits outside rps_cpu_mask; a mismatch
                 * there means it belongs to another flow.
                 */
                if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
                        goto try_rps;

                next_cpu = ident & net_hotdata.rps_cpu_mask;

                /* OK, now we know there is a match,
                 * we can look at the local (per receive queue) flow table
                 */
                flow_id = rfs_slot(hash, flow_table);
                rflow = &flow_table->flows[flow_id];
                tcpu = rflow->cpu;

                /*
                 * If the desired CPU (where last recvmsg was done) is
                 * different from current CPU (one in the rx-queue flow
                 * table entry), switch if one of the following holds:
                 *   - Current CPU is unset (>= nr_cpu_ids).
                 *   - Current CPU is offline.
                 *   - The current CPU's queue tail has advanced beyond the
                 *     last packet that was enqueued using this table entry.
                 *     This guarantees that all previous packets for the flow
                 *     have been dequeued, thus preserving in order delivery.
                 */
                if (unlikely(tcpu != next_cpu) &&
                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
                     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
                      rflow->last_qtail)) >= 0)) {
                        tcpu = next_cpu;
                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
                                            flow_id);
                }

                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
                        *rflowp = rflow;
                        cpu = tcpu;
                        goto done;
                }
        }

try_rps:

        /* Plain RPS: pick a CPU from the queue's map by scaling the hash. */
        if (map) {
                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
                if (cpu_online(tcpu)) {
                        cpu = tcpu;
                        goto done;
                }
        }

done:
        return cpu;
}

#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 *
 * Return: %false only when the RX queue has a flow table, @flow_id lies
 * inside it, the entry still carries @filter_id and the flow is recently
 * active according to rps_flow_is_active(); %true otherwise.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
                         u32 flow_id, u16 filter_id)
{
        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
        struct rps_dev_flow_table *table;
        bool in_use = false;

        rcu_read_lock();
        table = rcu_dereference(rxqueue->rps_flow_table);
        if (table && flow_id < (1UL << table->log)) {
                struct rps_dev_flow *rflow = &table->flows[flow_id];
                unsigned int cpu = READ_ONCE(rflow->cpu);

                in_use = READ_ONCE(rflow->filter) == filter_id &&
                         rps_flow_is_active(rflow, table, cpu);
        }
        rcu_read_unlock();

        return !in_use;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */

/* Called from hardirq (IPI) context.
 *
 * @data is the struct softnet_data whose backlog NAPI is to be scheduled;
 * its received_rps counter is incremented afterwards.
 */
static void rps_trigger_softirq(void *data)
{
        struct softnet_data *sd = data;

        ____napi_schedule(sd, &sd->backlog);
        /* Pairs with READ_ONCE() in softnet_seq_show() */
        WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
}

#endif /* CONFIG_RPS */

/* Called from hardirq (IPI) context.
 *
 * Raise NET_RX_SOFTIRQ on this CPU, then clear @data's defer_ipi_scheduled
 * flag with release semantics. kick_defer_list_purge() sets that flag with
 * cmpxchg() before sending the IPI.
 */
static void trigger_rx_softirq(void *data)
{
        struct softnet_data *sd = data;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        smp_store_release(&sd->defer_ipi_scheduled, 0);
}

/*
 * After we queued a packet into sd->input_pkt_queue,
 * we need to make sure this queue is serviced soon.
 *
 * - If this is another cpu queue, link it to our rps_ipi_list,
 *   and make sure we will process rps_ipi_list from net_rx_action().
 *   When backlog threads are in use, the remote backlog NAPI is scheduled
 *   directly instead.
 *
 * - If this is our own queue, NAPI schedule our backlog.
 *   Note that this also raises NET_RX_SOFTIRQ.
 */
static void napi_schedule_rps(struct softnet_data *sd)
{
        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

#ifdef CONFIG_RPS
        if (sd != mysd) {
                if (use_backlog_threads()) {
                        __napi_schedule_irqoff(&sd->backlog);
                        return;
                }

                /* Push sd onto the head of this CPU's rps_ipi_list. */
                sd->rps_ipi_next = mysd->rps_ipi_list;
                mysd->rps_ipi_list = sd;

                /* If not called from net_rx_action() or napi_threaded_poll()
                 * we have to raise NET_RX_SOFTIRQ.
                 */
                if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
                        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
                return;
        }
#endif /* CONFIG_RPS */
        __napi_schedule_irqoff(&mysd->backlog);
}

/* Make @cpu service its softnet_data.
 * NOTE(review): the name suggests this is for purging the deferred skb free
 * list; that list is not touched in this function - confirm against callers.
 *
 * With backlog threads, @cpu's backlog NAPI is scheduled under the backlog
 * lock unless NAPI_STATE_SCHED was already set. Otherwise an async IPI is
 * sent via sd->defer_csd, but only by the caller that flips
 * defer_ipi_scheduled from 0 to 1.
 */
void kick_defer_list_purge(unsigned int cpu)
{
        struct softnet_data *sd = &per_cpu(softnet_data, cpu);
        unsigned long flags;

        if (use_backlog_threads()) {
                backlog_lock_irq_save(sd, &flags);

                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
                        __napi_schedule_irqoff(&sd->backlog);

                backlog_unlock_irq_restore(sd, &flags);

        } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
                smp_call_function_single_async(cpu, &sd->defer_csd);
        }
}

#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 1 << 12 (4096). */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

/* Decide whether @skb should be refused because its flow dominates the
 * recent backlog history of this CPU.
 *
 * @skb:  packet about to be queued
 * @qlen: current length of the backlog queue
 *
 * Returns true when the packet should be dropped. Always returns false when
 * CONFIG_NET_FLOW_LIMIT is off, when @qlen is below half of
 * net_hotdata.max_backlog, or when this CPU has no flow_limit state.
 */
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
        struct sd_flow_limit *fl;
        struct softnet_data *sd;
        unsigned int old_flow, new_flow;

        if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
                return false;

        sd = this_cpu_ptr(&softnet_data);

        rcu_read_lock();
        fl = rcu_dereference(sd->flow_limit);
        if (fl) {
                /* Replace the oldest history entry with this packet's
                 * bucket.
                 */
                new_flow = hash_32(skb_get_hash(skb), fl->log_buckets);
                old_flow = fl->history[fl->history_head];
                fl->history[fl->history_head] = new_flow;

                /* Advance the ring index; the mask assumes
                 * FLOW_LIMIT_HISTORY is a power of two - TODO confirm.
                 */
                fl->history_head++;
                fl->history_head &= FLOW_LIMIT_HISTORY - 1;

                /* The evicted entry no longer counts towards its bucket. */
                if (likely(fl->buckets[old_flow]))
                        fl->buckets[old_flow]--;

                /* Limit once a bucket holds more than half the history. */
                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
                        /* Pairs with READ_ONCE() in softnet_seq_show() */
                        WRITE_ONCE(fl->count, fl->count + 1);
                        rcu_read_unlock();
                        return true;
                }
        }
        rcu_read_unlock();
#endif
        return false;
}

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * @skb:   packet to queue
 * @cpu:   CPU whose softnet_data backlog receives the packet
 * @qtail: receives the queue tail value after a successful enqueue
 *
 * Returns NET_RX_SUCCESS when queued. Otherwise the skb is freed, the
 * device's rx_dropped counter is incremented and NET_RX_DROP is returned:
 * with SKB_DROP_REASON_DEV_READY when the device is not running, or
 * SKB_DROP_REASON_CPU_BACKLOG when the queue is over max_backlog or
 * skb_flow_limit() refuses the packet.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                              unsigned int *qtail)
{
        enum skb_drop_reason reason;
        struct softnet_data *sd;
        unsigned long flags;
        unsigned int qlen;
        int max_backlog;
        u32 tail;

        reason = SKB_DROP_REASON_DEV_READY;
        if (!netif_running(skb->dev))
                goto bad_dev;

        reason = SKB_DROP_REASON_CPU_BACKLOG;
        sd = &per_cpu(softnet_data, cpu);

        /* Lockless pre-check so an over-full queue is rejected without
         * taking the backlog lock; the length is re-read under the lock.
         */
        qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
        max_backlog = READ_ONCE(net_hotdata.max_backlog);
        if (unlikely(qlen > max_backlog))
                goto cpu_backlog_drop;
        backlog_lock_irq_save(sd, &flags);
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
                if (!qlen) {
                        /* Schedule NAPI for backlog device. We can use
                         * non atomic operation as we own the queue lock.
                         */
                        if (!__test_and_set_bit(NAPI_STATE_SCHED,
                                                &sd->backlog.state))
                                napi_schedule_rps(sd);
                }
                __skb_queue_tail(&sd->input_pkt_queue, skb);
                tail = rps_input_queue_tail_incr(sd);
                backlog_unlock_irq_restore(sd, &flags);

                /* save the tail outside of the critical section */
                rps_input_queue_tail_save(qtail, tail);
                return NET_RX_SUCCESS;
        }

        backlog_unlock_irq_restore(sd, &flags);

cpu_backlog_drop:
        /* Only backlog drops are counted in sd->drop_counters; the
         * bad_dev path jumps past this.
         */
        numa_drop_add(&sd->drop_counters, 1);
bad_dev:
        dev_core_stats_rx_dropped_inc(skb->dev);
        kfree_skb_reason(skb, reason);
        return NET_RX_DROP;
}

/*
 * Return the RX queue of skb->dev that matches the queue index recorded
 * in the skb. The first RX queue is returned when no index was recorded
 * or when the recorded index is not below dev->real_num_rx_queues.
 */
static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_rx_queue *first = dev->_rx;
        u16 index;

        if (!skb_rx_queue_recorded(skb))
                return first;

        index = skb_get_rx_queue(skb);
        if (unlikely(index >= dev->real_num_rx_queues)) {
                /* Only warn when the device really has several queues. */
                WARN_ONCE(dev->real_num_rx_queues > 1,
                          "%s received packet on queue %u, but number "
                          "of RX queues is %u\n",
                          dev->name, index, dev->real_num_rx_queues);

                return first;
        }

        return first + index;
}

/*
 * bpf_prog_run_generic_xdp - run an XDP program on top of an skb
 * @skb: packet to run the program on
 * @xdp: xdp_buff that is initialised here to describe the skb head area
 * @xdp_prog: program to run
 *
 * Sets up @xdp so that the program sees the packet starting at the MAC
 * header, runs @xdp_prog, and then copies the effects of the program
 * (head/tail adjustment, frag size, Ethernet header changes, metadata)
 * back into @skb.
 *
 * Returns the action code produced by the program. The skb is never
 * freed here; see the comment above the final switch.
 */
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                             const struct bpf_prog *xdp_prog)
{
        void *orig_data, *orig_data_end, *hard_start;
        struct netdev_rx_queue *rxqueue;
        bool orig_bcast, orig_host;
        u32 mac_len, frame_sz;
        __be16 orig_eth_type;
        struct ethhdr *eth;
        u32 metalen, act;
        int off;

        /* The XDP program wants to see the packet starting at the MAC
         * header.
         */
        mac_len = skb->data - skb_mac_header(skb);
        hard_start = skb->data - skb_headroom(skb);

        /* SKB "head" area always have tailroom for skb_shared_info */
        frame_sz = (void *)skb_end_pointer(skb) - hard_start;
        frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        rxqueue = netif_get_rxqueue(skb);
        xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
        xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
                         skb_headlen(skb) + mac_len, true);
        if (skb_is_nonlinear(skb)) {
                skb_shinfo(skb)->xdp_frags_size = skb->data_len;
                xdp_buff_set_frags_flag(xdp);
        } else {
                xdp_buff_clear_frags_flag(xdp);
        }

        /* Snapshot the pre-run state so changes made by the program can
         * be detected afterwards.
         */
        orig_data_end = xdp->data_end;
        orig_data = xdp->data;
        eth = (struct ethhdr *)xdp->data;
        orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
        orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
        orig_eth_type = eth->h_proto;

        act = bpf_prog_run_xdp(xdp_prog, xdp);

        /* check if bpf_xdp_adjust_head was used */
        off = xdp->data - orig_data;
        if (off) {
                if (off > 0)
                        __skb_pull(skb, off);
                else if (off < 0)
                        __skb_push(skb, -off);

                skb->mac_header += off;
                skb_reset_network_header(skb);
        }

        /* check if bpf_xdp_adjust_tail was used */
        off = xdp->data_end - orig_data_end;
        if (off != 0) {
                skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
                skb->len += off; /* positive on grow, negative on shrink */
        }

        /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
         * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
         */
        if (xdp_buff_has_frags(xdp))
                skb->data_len = skb_shinfo(skb)->xdp_frags_size;
        else
                skb->data_len = 0;

        /* check if XDP changed eth hdr such SKB needs update */
        eth = (struct ethhdr *)xdp->data;
        if ((orig_eth_type != eth->h_proto) ||
            (orig_host != ether_addr_equal_64bits(eth->h_dest,
                                                  skb->dev->dev_addr)) ||
            (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
                /* Re-derive pkt_type and protocol from the new header. */
                __skb_push(skb, ETH_HLEN);
                skb->pkt_type = PACKET_HOST;
                skb->protocol = eth_type_trans(skb, skb->dev);
        }

        /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
         * before calling us again on redirect path. We do not call do_redirect
         * as we leave that up to the caller.
         *
         * Caller is responsible for managing lifetime of skb (i.e. calling
         * kfree_skb in response to actions it cannot handle/XDP_DROP).
         */
        switch (act) {
        case XDP_REDIRECT:
        case XDP_TX:
                __skb_push(skb, mac_len);
                break;
        case XDP_PASS:
                /* Record the metadata length only when the program left
                 * a non-empty area between data_meta and data.
                 */
                metalen = xdp->data - xdp->data_meta;
                if (metalen)
                        skb_metadata_set(skb, metalen);
                break;
        }

        return act;
}

/*
 * Make the skb behind @pskb usable for generic XDP.
 *
 * skb_cow_data_for_xdp() is tried first, using this CPU's system page
 * pool under its bh lock. If that fails, the skb head is expanded once
 * with enough head and tail room and the skb is then linearized.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int
netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
{
        struct sk_buff *skb = *pskb;
        int head_grow = 0, tail_grow = 0;
        int err, hroom, troom;

        local_lock_nested_bh(&system_page_pool.bh_lock);
        err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
        local_unlock_nested_bh(&system_page_pool.bh_lock);
        if (!err)
                return 0;

        /* We have to expand and also linearize: size the head so that
         * pskb_expand_head() only needs to run once.
         */
        hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
        if (hroom > 0)
                head_grow = ALIGN(hroom, NET_SKB_PAD);

        troom = skb->tail + skb->data_len - skb->end;
        if (troom > 0)
                tail_grow = troom + 128;

        err = pskb_expand_head(skb, head_grow, tail_grow, GFP_ATOMIC);
        if (err)
                return err;

        return skb_linearize(skb);
}

/*
 * netif_receive_generic_xdp - prepare an skb and run generic XDP on it
 * @pskb: skb pointer, passed by reference to netif_skb_check_for_xdp()
 *        and re-read through *pskb afterwards
 * @xdp: xdp_buff handed to bpf_prog_run_generic_xdp()
 * @xdp_prog: program to run
 *
 * Returns XDP_PASS without running the program for redirected skbs.
 * Otherwise returns the action of the program. The skb is freed here for
 * XDP_DROP, XDP_ABORTED and unknown actions, and also when preparing the
 * skb fails (XDP_DROP is returned in that case). For XDP_PASS, XDP_TX
 * and XDP_REDIRECT the skb is left to the caller.
 */
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
                                     struct xdp_buff *xdp,
                                     const struct bpf_prog *xdp_prog)
{
        struct sk_buff *skb = *pskb;
        u32 mac_len, act = XDP_DROP;

        /* Reinjected packets coming from act_mirred or similar should
         * not get XDP generic processing.
         */
        if (skb_is_redirected(skb))
                return XDP_PASS;

        /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
         * bytes. This is the guarantee that also native XDP provides,
         * thus we need to do it here as well.
         */
        mac_len = skb->data - skb_mac_header(skb);
        /* Expose the MAC header while the headroom is checked/fixed up;
         * it is pulled again below.
         */
        __skb_push(skb, mac_len);

        if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
                if (netif_skb_check_for_xdp(pskb, xdp_prog))
                        goto do_drop;
        }

        /* From here on *pskb is used instead of the local skb copy. */
        __skb_pull(*pskb, mac_len);

        act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
        switch (act) {
        case XDP_REDIRECT:
        case XDP_TX:
        case XDP_PASS:
                break;
        default:
                bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
                trace_xdp_exception((*pskb)->dev, xdp_prog, act);
                fallthrough;
        case XDP_DROP:
        do_drop:
                /* Also reached by goto with act still set to XDP_DROP. */
                kfree_skb(*pskb);
                break;
        }

        return act;
}

/* Transmit an skb straight to the driver on behalf of generic XDP.
 *
 * Generic XDP has to bypass the qdisc layer and the network taps in
 * order to match in-driver-XDP behavior. This also means that XDP
 * packets are able to starve other packets going through a qdisc, and
 * DDOS attacks will be more effective. In-driver-XDP use dedicated TX
 * queues, so they do not have this starvation issue.
 *
 * If the queue is frozen/stopped or the transmit does not complete, the
 * skb is freed and counted as a TX drop.
 */
void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        bool sent = false;
        int cpu, rc;

        txq = netdev_core_pick_tx(dev, skb, NULL);
        cpu = smp_processor_id();
        HARD_TX_LOCK(dev, txq, cpu);
        if (!netif_xmit_frozen_or_drv_stopped(txq)) {
                rc = netdev_start_xmit(skb, dev, txq, 0);
                sent = dev_xmit_complete(rc);
        }
        HARD_TX_UNLOCK(dev, txq);

        if (sent)
                return;

        trace_xdp_exception(dev, xdp_prog, XDP_TX);
        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb(skb);
}

/* Static key guarding the generic XDP hook in __netif_receive_skb_core().
 * generic_xdp_install() increments it when a program is attached to a
 * device that had none and decrements it when the program is removed.
 */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);

/*
 * do_xdp_generic - run generic XDP for one skb and act on the verdict
 * @xdp_prog: program to run, may be NULL
 * @pskb: skb pointer, passed by reference to the XDP helpers
 *
 * Returns XDP_PASS when there is no program or the program passed the
 * packet; the caller keeps processing *pskb in that case. Returns
 * XDP_DROP for every other outcome, meaning the skb has been transmitted,
 * redirected or freed and must not be used by the caller anymore.
 */
int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        struct xdp_buff xdp;
        int verdict = XDP_DROP;
        u32 act;
        int err;

        if (!xdp_prog)
                return XDP_PASS;

        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
        act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
        switch (act) {
        case XDP_PASS:
                verdict = XDP_PASS;
                break;
        case XDP_REDIRECT:
                err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
                                              &xdp, xdp_prog);
                if (err)
                        goto out_redir;
                break;
        case XDP_TX:
                generic_xdp_tx(*pskb, xdp_prog);
                break;
        }
        bpf_net_ctx_clear(bpf_net_ctx);
        return verdict;

out_redir:
        /* The redirect failed, so the skb is still ours to free. */
        bpf_net_ctx_clear(bpf_net_ctx);
        kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
        return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);

/*
 * netif_rx_internal - enqueue path shared by netif_rx() and __netif_rx()
 * @skb: buffer to queue
 *
 * Timestamps the skb if net_hotdata.tstamp_prequeue asks for it and then
 * queues it on a per-CPU backlog. With CONFIG_RPS and rps_needed enabled
 * the CPU is chosen by get_rps_cpu(), falling back to the current CPU
 * when that returns a negative value; otherwise the current CPU is used.
 *
 * Returns the result of enqueue_to_backlog().
 */
static int netif_rx_internal(struct sk_buff *skb)
{
        int ret;

        net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        trace_netif_rx(skb);

#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                /* voidflow gives rflow a valid target for last_qtail
                 * before get_rps_cpu() is called with &rflow.
                 */
                struct rps_dev_flow voidflow, *rflow = &voidflow;
                int cpu;

                rcu_read_lock();

                cpu = get_rps_cpu(skb->dev, skb, &rflow);
                if (cpu < 0)
                        cpu = smp_processor_id();

                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

                rcu_read_unlock();
        } else
#endif
        {
                /* The saved queue tail is not used on this path. */
                unsigned int qtail;

                ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
        }
        return ret;
}

/**
 * __netif_rx - Slightly optimized version of netif_rx
 * @skb: buffer to post
 *
 * Same as netif_rx() but bottom halves are not disabled here. For that
 * reason this function may only be invoked from interrupt context
 * (either hard or soft interrupt), which is asserted via lockdep.
 *
 * Return: the value returned by netif_rx_internal().
 */
int __netif_rx(struct sk_buff *skb)
{
        int rc;

        lockdep_assert_once(hardirq_count() | softirq_count());

        trace_netif_rx_entry(skb);
        rc = netif_rx_internal(skb);
        trace_netif_rx_exit(rc);

        return rc;
}
EXPORT_SYMBOL(__netif_rx);

/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process via the backlog NAPI device. It
 * always succeeds. The buffer may be dropped during processing for
 * congestion control or by the protocol layers.
 * The network buffer is passed via the backlog NAPI device. Modern NIC
 * drivers should use NAPI and GRO.
 *
 * This function can be used from interrupt and from process context. A
 * caller from process context must not disable interrupts before
 * invoking this function.
 *
 * Return:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_DROP    (packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
{
        /* Outside of hard/soft interrupt context, bottom halves are
         * disabled around the enqueue.
         */
        const bool bh_off_needed = !(hardirq_count() | softirq_count());
        int rc;

        if (bh_off_needed)
                local_bh_disable();

        trace_netif_rx_entry(skb);
        rc = netif_rx_internal(skb);
        trace_netif_rx_exit(rc);

        if (bh_off_needed)
                local_bh_enable();

        return rc;
}
EXPORT_SYMBOL(netif_rx);

/*
 * net_tx_action - deferred transmit-side work for the current CPU
 *
 * Operates on this CPU's softnet_data. First every skb parked on
 * completion_queue is released, then every qdisc chained on output_queue
 * is run, and finally xfrm_dev_backlog() is called. Both lists are
 * detached with local interrupts disabled and processed afterwards with
 * interrupts enabled again.
 */
static __latent_entropy void net_tx_action(void)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);

        if (sd->completion_queue) {
                struct sk_buff *clist;

                /* Detach the whole list in one go. */
                local_irq_disable();
                clist = sd->completion_queue;
                sd->completion_queue = NULL;
                local_irq_enable();

                while (clist) {
                        struct sk_buff *skb = clist;

                        clist = clist->next;

                        /* skbs on this list are expected to have no users left. */
                        WARN_ON(refcount_read(&skb->users));
                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
                                trace_consume_skb(skb, net_tx_action);
                        else
                                trace_kfree_skb(skb, net_tx_action,
                                                get_kfree_skb_cb(skb)->reason, NULL);

                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
                                __kfree_skb(skb);
                        else
                                __napi_kfree_skb(skb,
                                                 get_kfree_skb_cb(skb)->reason);
                }
        }

        if (sd->output_queue) {
                struct Qdisc *head;

                /* Detach the list and reset the tail pointer to the now
                 * empty list head.
                 */
                local_irq_disable();
                head = sd->output_queue;
                sd->output_queue = NULL;
                sd->output_queue_tailp = &sd->output_queue;
                local_irq_enable();

                rcu_read_lock();

                while (head) {
                        struct Qdisc *q = head;
                        spinlock_t *root_lock = NULL;

                        head = head->next_sched;

                        /* We need to make sure head->next_sched is read
                         * before clearing __QDISC_STATE_SCHED
                         */
                        smp_mb__before_atomic();

                        if (!(q->flags & TCQ_F_NOLOCK)) {
                                root_lock = qdisc_lock(q);
                                spin_lock(root_lock);
                        } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
                                                     &q->state))) {
                                /* There is a synchronize_net() between
                                 * STATE_DEACTIVATED flag being set and
                                 * qdisc_reset()/some_qdisc_is_busy() in
                                 * dev_deactivate(), so we can safely bail out
                                 * early here to avoid data race between
                                 * qdisc_deactivate() and some_qdisc_is_busy()
                                 * for lockless qdisc.
                                 */
                                clear_bit(__QDISC_STATE_SCHED, &q->state);
                                continue;
                        }

                        clear_bit(__QDISC_STATE_SCHED, &q->state);
                        qdisc_run(q);
                        /* root_lock is only set for qdiscs without TCQ_F_NOLOCK. */
                        if (root_lock)
                                spin_unlock(root_lock);
                }

                rcu_read_unlock();
        }

        xfrm_dev_backlog(sd);
}

#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* Function-pointer hook defined here for ATM LANE. It is only built when
 * both CONFIG_BRIDGE and CONFIG_ATM_LANE are enabled and is exported to
 * GPL modules. NOTE(review): presumably assigned by the bridge code and
 * called by ATM LANE - confirm against the users of this symbol.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/**
 *        netdev_is_rx_handler_busy - check if receive handler is registered
 *        @dev: device to check
 *
 *        Check if a receive handler is already registered for a given device.
 *        Return true if there one.
 *
 *        The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
        ASSERT_RTNL();
        return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);

/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Register a receive handler for a device. This handler will then be
 * called from __netif_receive_skb.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 *
 * Return: 0 on success, -EBUSY if a handler is already registered,
 * -EINVAL if the device has IFF_NO_RX_HANDLER set.
 */
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data)
{
        int err = 0;

        if (netdev_is_rx_handler_busy(dev))
                err = -EBUSY;
        else if (dev->priv_flags & IFF_NO_RX_HANDLER)
                err = -EINVAL;

        if (err)
                return err;

        /* Publish the data before the handler: rx_handler_data must be
         * set before rx_handler.
         */
        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
        rcu_assign_pointer(dev->rx_handler, rx_handler);

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 *        netdev_rx_handler_unregister - unregister receive handler
 *        @dev: device to unregister a handler from
 *
 *        Unregister a receive handler from a device. The handler pointer is
 *        cleared first and the handler data only after synchronize_net()
 *        has returned.
 *
 *        The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

        ASSERT_RTNL();
        RCU_INIT_POINTER(dev->rx_handler, NULL);
        /* A reader that sees a non-NULL rx_handler inside a rcu_read_lock()
         * section is guaranteed to see a non-NULL rx_handler_data as well:
         * the data is only cleared after synchronize_net() below.
         */
        synchronize_net();
        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);

/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
        switch (skb->protocol) {
        case htons(ETH_P_ARP):
        case htons(ETH_P_IP):
        case htons(ETH_P_IPV6):
        case htons(ETH_P_8021Q):
        case htons(ETH_P_8021AD):
                return true;
        default:
                return false;
        }
}

/*
 * Run the netfilter ingress hook for @skb when it is active.
 *
 * A pending *pt_prev is delivered first (result stored in *ret) and then
 * cleared. Returns 0 when the hook is not active, otherwise the value
 * returned by nf_hook_ingress().
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
                             int *ret, struct net_device *orig_dev)
{
        int verdict;

        if (!nf_hook_ingress_active(skb))
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        rcu_read_lock();
        verdict = nf_hook_ingress(skb);
        rcu_read_unlock();

        return verdict;
}

/*
 * __netif_receive_skb_core - core receive processing for a single skb
 * @pskb: skb pointer, in/out. It is written back on return because the
 *        skb may have been replaced or consumed (NULL) on the way.
 * @pfmemalloc: true when the skb has to be handled under the PFMEMALLOC
 *              rules: taps are skipped and only protocols accepted by
 *              skb_pfmemalloc_protocol() are let through.
 * @ppt_prev: out. Set to the last matching packet_type; the final
 *            delivery to it is left to the caller. Not written when
 *            there is no such packet_type.
 *
 * In order: generic XDP, VLAN untagging, taps (ptype_all), ingress
 * hooks, VLAN receive, the device rx_handler and finally the protocol
 * handler lists. Every matching handler except the last one is called
 * from here through deliver_skb().
 *
 * Returns NET_RX_DROP when the skb was dropped here, NET_RX_SUCCESS when
 * the rx_handler consumed it, otherwise the value produced by the last
 * delivery step that ran.
 */
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
                                    struct packet_type **ppt_prev)
{
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct sk_buff *skb = *pskb;
        struct net_device *orig_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        trace_netif_receive_skb(skb);

        orig_dev = skb->dev;

        skb_reset_network_header(skb);
#if !defined(CONFIG_DEBUG_NET)
        /* We plan to no longer reset the transport header here.
         * Give some time to fuzzers and dev build to catch bugs
         * in network stacks.
         */
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
#endif
        skb_reset_mac_len(skb);

        pt_prev = NULL;

another_round:
        /* Re-entered whenever skb->dev may have changed. */
        skb->skb_iif = skb->dev->ifindex;

        __this_cpu_inc(softnet_data.processed);

        if (static_branch_unlikely(&generic_xdp_needed_key)) {
                int ret2;

                migrate_disable();
                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
                                      &skb);
                migrate_enable();

                /* Anything but XDP_PASS means the skb is gone. */
                if (ret2 != XDP_PASS) {
                        ret = NET_RX_DROP;
                        goto out;
                }
        }

        if (eth_type_vlan(skb->protocol)) {
                skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

        if (skb_skip_tc_classify(skb))
                goto skip_classify;

        if (pfmemalloc)
                goto skip_taps;

        /* Taps: delivery always lags one entry behind, so that the last
         * match stays in pt_prev.
         */
        list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
                                list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

skip_taps:
#ifdef CONFIG_NET_INGRESS
        if (static_branch_unlikely(&ingress_needed_key)) {
                bool another = false;

                nf_skip_egress(skb, true);
                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
                                         &another);
                if (another)
                        goto another_round;
                if (!skb)
                        goto out;

                nf_skip_egress(skb, false);
                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
                        goto out;
        }
#endif
        skb_reset_redirect(skb);
skip_classify:
        if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
                drop_reason = SKB_DROP_REASON_PFMEMALLOC;
                goto drop;
        }

        if (skb_vlan_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
                else if (unlikely(!skb))
                        goto out;
        }

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        ret = NET_RX_SUCCESS;
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        break;
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
                if (skb_vlan_tag_get_id(skb)) {
                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
                         * find vlan device.
                         */
                        skb->pkt_type = PACKET_OTHERHOST;
                } else if (eth_type_vlan(skb->protocol)) {
                        /* Outer header is 802.1P with vlan 0, inner header is
                         * 802.1Q or 802.1AD and vlan_do_receive() above could
                         * not find vlan dev for vlan id 0.
                         */
                        __vlan_hwaccel_clear_tag(skb);
                        skb = skb_vlan_untag(skb);
                        if (unlikely(!skb))
                                goto out;
                        if (vlan_do_receive(&skb))
                                /* After stripping off 802.1P header with vlan 0
                                 * vlan dev is found for inner header.
                                 */
                                goto another_round;
                        else if (unlikely(!skb))
                                goto out;
                        else
                                /* We have stripped outer 802.1P vlan 0 header.
                                 * But could not find vlan dev.
                                 * check again for vlan id to set OTHERHOST.
                                 */
                                goto check_vlan_id;
                }
                /* Note: we might in the future use prio bits
                 * and set skb->priority like in vlan_do_receive()
                 * For the time being, just ignore Priority Code Point
                 */
                __vlan_hwaccel_clear_tag(skb);
        }

        type = skb->protocol;

        /* deliver only exact match when indicated */
        if (likely(!deliver_exact)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &ptype_base[ntohs(type) &
                                                   PTYPE_HASH_MASK]);

                /* orig_dev and skb->dev could belong to different netns;
                 * Even in such case we need to traverse only the list
                 * coming from skb->dev, as the ptype owner (packet socket)
                 * will use dev_net(skb->dev) to do namespace filtering.
                 */
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &dev_net_rcu(skb->dev)->ptype_specific);
        }

        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                               &orig_dev->ptype_specific);

        if (unlikely(skb->dev != orig_dev)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &skb->dev->ptype_specific);
        }

        if (pt_prev) {
                /* The skb is about to be handed to the final protocol
                 * handler: frags that must not be held on to across the
                 * receive path have to be orphaned first. Drop the skb
                 * if that fails.
                 */
                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                        goto drop;
                *ppt_prev = pt_prev;
        } else {
drop:
                if (!deliver_exact)
                        dev_core_stats_rx_dropped_inc(skb->dev);
                else
                        dev_core_stats_rx_nohandler_inc(skb->dev);

                kfree_skb_reason(skb, drop_reason);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        /* The invariant here is that if *ppt_prev is not NULL
         * then skb should also be non-NULL.
         *
         * Apparently *ppt_prev assignment above holds this invariant due to
         * skb dereferencing near it.
         */
        *pskb = skb;
        return ret;
}

/*
 * Run the core receive processing for one skb and, when it left a final
 * packet_type behind, deliver the skb to it. Returns the result of that
 * delivery, or of __netif_receive_skb_core() when there was none.
 */
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
        /* Sampled up front: the core may change skb and skb->dev. */
        struct net_device *orig_dev = skb->dev;
        struct packet_type *last_pt = NULL;
        int ret;

        ret = __netif_receive_skb_core(&skb, pfmemalloc, &last_pt);
        if (!last_pt)
                return ret;

        return INDIRECT_CALL_INET(last_pt->func, ipv6_rcv, ip_rcv, skb,
                                  skb->dev, last_pt, orig_dev);
}

/**
 * netif_receive_skb_core - special purpose version of netif_receive_skb
 * @skb: buffer to process
 *
 * More direct receive version of netif_receive_skb(). It should
 * only be used by callers that have a need to skip RPS and Generic XDP.
 * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return: (usually ignored)
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_core(struct sk_buff *skb)
{
        int rc;

        /* The receive core runs inside an RCU read-side section. */
        rcu_read_lock();
        rc = __netif_receive_skb_one_core(skb, false);
        rcu_read_unlock();

        return rc;
}
EXPORT_SYMBOL(netif_receive_skb_core);

/*
 * Deliver a list of skbs to one packet_type. The list handler of the
 * packet_type is used when it has one, otherwise every skb is unlinked
 * and passed to ->func() on its own. Nothing is done for a NULL
 * @pt_prev or an empty list.
 */
static inline void __netif_receive_skb_list_ptype(struct list_head *head,
                                                  struct packet_type *pt_prev,
                                                  struct net_device *orig_dev)
{
        struct sk_buff *skb, *tmp;

        if (!pt_prev || list_empty(head))
                return;

        if (pt_prev->list_func) {
                INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
                                   ip_list_rcv, head, pt_prev, orig_dev);
                return;
        }

        list_for_each_entry_safe(skb, tmp, head, list) {
                skb_list_del_init(skb);
                pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        }
}

/*
 * Run the core receive processing on every skb of @head and batch the
 * final delivery.
 *
 * Fast-path assumptions:
 * - There is no RX handler.
 * - Only one packet_type matches.
 * If either of these fails, we will end up doing some per-packet
 * processing in-line, then handling the 'last ptype' for the whole
 * sublist. This can't cause out-of-order delivery to any single ptype,
 * because the 'last ptype' must be constant across the sublist, and all
 * other ptypes are handled per-packet.
 */
static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
{
        /* packet_type and orig_dev shared by everything in the batch */
        struct packet_type *batch_pt = NULL;
        struct net_device *batch_dev = NULL;
        struct sk_buff *skb, *tmp;
        LIST_HEAD(batch);

        list_for_each_entry_safe(skb, tmp, head, list) {
                struct net_device *orig_dev = skb->dev;
                struct packet_type *last_pt = NULL;

                skb_list_del_init(skb);
                __netif_receive_skb_core(&skb, pfmemalloc, &last_pt);
                if (!last_pt)
                        continue;

                if (last_pt == batch_pt && orig_dev == batch_dev) {
                        list_add_tail(&skb->list, &batch);
                        continue;
                }

                /* Key changed: flush the old batch and start a new one. */
                __netif_receive_skb_list_ptype(&batch, batch_pt, batch_dev);
                INIT_LIST_HEAD(&batch);
                batch_pt = last_pt;
                batch_dev = orig_dev;
                list_add_tail(&skb->list, &batch);
        }

        /* Flush whatever is left. */
        __netif_receive_skb_list_ptype(&batch, batch_pt, batch_dev);
}

/*
 * Receive one skb, taking the PFMEMALLOC path when memalloc sockets
 * exist and the skb is a pfmemalloc one.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        unsigned int noreclaim_flag;
        int ret;

        if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
                return __netif_receive_skb_one_core(skb, false);

        /*
         * PFMEMALLOC skbs are special, they should
         * - be delivered to SOCK_MEMALLOC sockets only
         * - stay away from userspace
         * - have bounded memory usage
         *
         * Use PF_MEMALLOC as this saves us from propagating the allocation
         * context down to all allocation sites.
         */
        noreclaim_flag = memalloc_noreclaim_save();
        ret = __netif_receive_skb_one_core(skb, true);
        memalloc_noreclaim_restore(noreclaim_flag);

        return ret;
}

/*
 * Process a list of skbs in runs that share the same pfmemalloc
 * classification (sk_memalloc_socks() && skb_pfmemalloc()).
 *
 * Each time the classification flips, the skbs seen so far are cut off
 * @head and passed to __netif_receive_skb_list_core(), and the noreclaim
 * allocation context is entered or left to match the next run.  Whatever
 * remains on @head at the end is processed as the final run.
 */
static void __netif_receive_skb_list(struct list_head *head)
{
        /* Same type as used with memalloc_noreclaim_save() in __netif_receive_skb */
        unsigned int noreclaim_flag = 0;
        struct sk_buff *skb, *next;
        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */

        list_for_each_entry_safe(skb, next, head, list) {
                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
                        struct list_head sublist;

                        /* Handle the previous sublist */
                        list_cut_before(&sublist, head, &skb->list);
                        if (!list_empty(&sublist))
                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
                        pfmemalloc = !pfmemalloc;
                        /* See comments in __netif_receive_skb */
                        if (pfmemalloc)
                                noreclaim_flag = memalloc_noreclaim_save();
                        else
                                memalloc_noreclaim_restore(noreclaim_flag);
                }
        }
        /* Handle the remaining sublist */
        if (!list_empty(head))
                __netif_receive_skb_list_core(head, pfmemalloc);
        /* Leave the noreclaim context if the last run entered it */
        if (pfmemalloc)
                memalloc_noreclaim_restore(noreclaim_flag);
}

/*
 * Install, replace or remove the generic XDP program of @dev.
 *
 * Only XDP_SETUP_PROG is handled; any other command yields -EINVAL.
 * The previous program (if any) is released with bpf_prog_put(), and
 * generic_xdp_needed_key is adjusted when the device goes from "no program"
 * to "program" or back.  Attaching the first program also disables LRO and
 * hardware GRO on the device.
 */
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
{
        struct bpf_prog *prev_prog = rtnl_dereference(dev->xdp_prog);
        struct bpf_prog *next_prog = xdp->prog;

        if (xdp->command != XDP_SETUP_PROG)
                return -EINVAL;

        rcu_assign_pointer(dev->xdp_prog, next_prog);
        if (prev_prog)
                bpf_prog_put(prev_prog);

        if (prev_prog && !next_prog) {
                /* last program removed */
                static_branch_dec(&generic_xdp_needed_key);
        } else if (next_prog && !prev_prog) {
                /* first program attached */
                static_branch_inc(&generic_xdp_needed_key);
                netif_disable_lro(dev);
                dev_disable_gro_hw(dev);
        }

        return 0;
}

/*
 * Receive one skb: timestamp it if required, let skb_defer_rx_timestamp()
 * take it over if it wants to, then either queue it to another CPU's
 * backlog (RPS) or process it locally, all under rcu_read_lock().
 */
static int netif_receive_skb_internal(struct sk_buff *skb)
{
        int rc;

        net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        /* skb was taken over for deferred timestamping */
        if (skb_defer_rx_timestamp(skb))
                return NET_RX_SUCCESS;

        rcu_read_lock();
#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                struct rps_dev_flow dummy_flow, *flow = &dummy_flow;
                int target_cpu = get_rps_cpu(skb->dev, skb, &flow);

                if (target_cpu >= 0) {
                        rc = enqueue_to_backlog(skb, target_cpu,
                                                &flow->last_qtail);
                        rcu_read_unlock();
                        return rc;
                }
        }
#endif
        rc = __netif_receive_skb(skb);
        rcu_read_unlock();
        return rc;
}

void netif_receive_skb_list_internal(struct list_head *head)
{
        struct sk_buff *skb, *next;
        LIST_HEAD(sublist);

        list_for_each_entry_safe(skb, next, head, list) {
                net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
                                    skb);
                skb_list_del_init(skb);
                if (!skb_defer_rx_timestamp(skb))
                        list_add_tail(&skb->list, &sublist);
        }
        list_splice_init(&sublist, head);

        rcu_read_lock();
#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                list_for_each_entry_safe(skb, next, head, list) {
                        struct rps_dev_flow voidflow, *rflow = &voidflow;
                        int cpu = get_rps_cpu(skb->dev, skb, &rflow);

                        if (cpu >= 0) {
                                /* Will be handled, remove from list */
                                skb_list_del_init(skb);
                                enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
                        }
                }
        }
#endif
        __netif_receive_skb_list(head);
        rcu_read_unlock();
}

/**
 * netif_receive_skb - process receive buffer from network
 * @skb: buffer to process
 *
 * Main entry point for received data.  The call itself always succeeds;
 * the buffer may still be dropped during processing, either for
 * congestion control or by the protocol layers.
 *
 * Context: softirq only, with interrupts enabled.
 *
 * Return: (usually ignored)
 * * NET_RX_SUCCESS - no congestion
 * * NET_RX_DROP    - packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        int rc;

        trace_netif_receive_skb_entry(skb);
        rc = netif_receive_skb_internal(skb);
        trace_netif_receive_skb_exit(rc);

        return rc;
}
EXPORT_SYMBOL(netif_receive_skb);

/**
 * netif_receive_skb_list - process many receive buffers from network
 * @head: list of skbs to process.
 *
 * The return value of netif_receive_skb() is normally ignored and would
 * not be meaningful for a whole list, so this function returns void.
 * An empty list is a no-op (no tracepoints fire).
 *
 * Context: softirq only, with interrupts enabled.
 */
void netif_receive_skb_list(struct list_head *head)
{
        struct sk_buff *pos;

        if (list_empty(head))
                return;

        /* Only walk the list for tracing when the tracepoint is live. */
        if (trace_netif_receive_skb_list_entry_enabled())
                list_for_each_entry(pos, head, list)
                        trace_netif_receive_skb_list_entry(pos);

        netif_receive_skb_list_internal(head);
        trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);

/*
 * Network device is going away, flush any packets still pending.
 *
 * Work function (queued per CPU by flush_all_backlogs()).  It operates on
 * the softnet_data of the CPU it runs on: every skb found in
 * input_pkt_queue or process_queue whose device is in the
 * NETREG_UNREGISTERING state is moved to a private list, which is purged
 * with SKB_DROP_REASON_DEV_READY once all locks have been dropped.
 * @work itself is not used.
 */
static void flush_backlog(struct work_struct *work)
{
        struct sk_buff *skb, *tmp;
        struct sk_buff_head list;
        struct softnet_data *sd;

        __skb_queue_head_init(&list);
        local_bh_disable();
        sd = this_cpu_ptr(&softnet_data);

        /* input_pkt_queue is walked under the backlog lock */
        backlog_lock_irq_disable(sd);
        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
                if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->input_pkt_queue);
                        __skb_queue_tail(&list, skb);
                        rps_input_queue_head_incr(sd);
                }
        }
        backlog_unlock_irq_enable(sd);

        /* process_queue is walked under its own nested-BH local lock */
        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
                if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->process_queue);
                        __skb_queue_tail(&list, skb);
                        rps_input_queue_head_incr(sd);
                }
        }
        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
        local_bh_enable();

        /* free the collected skbs outside of any lock / BH-disabled section */
        __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
}

/*
 * Tell whether @cpu's backlog may hold packets and therefore needs a
 * flush_backlog() run.  Without RPS the answer is unconditionally true.
 */
static bool flush_required(int cpu)
{
#if IS_ENABLED(CONFIG_RPS)
        struct softnet_data *sd = &per_cpu(softnet_data, cpu);
        bool pending;

        backlog_lock_irq_disable(sd);

        /* as insertion into process_queue happens with the rps lock held,
         * process_queue access may race only with dequeue
         */
        pending = !skb_queue_empty(&sd->input_pkt_queue);
        if (!pending)
                pending = !skb_queue_empty_lockless(&sd->process_queue);

        backlog_unlock_irq_enable(sd);

        return pending;
#else
        /* without RPS we can't safely check input_pkt_queue: during a
         * concurrent remote skb_queue_splice() we can detect as empty both
         * input_pkt_queue and process_queue even if the latter could end-up
         * containing a lot of packets.
         */
        return true;
#endif
}

/* Per-invocation state of flush_all_backlogs(). */
struct flush_backlogs {
        /* CPUs on which a flush work item was queued and must be waited for */
        cpumask_t                flush_cpus;
        /* one work item per CPU, indexed by CPU number (nr_cpu_ids entries) */
        struct work_struct        w[];
};

static struct flush_backlogs *flush_backlogs_alloc(void)
{
        return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
                       GFP_KERNEL);
}

/*
 * Shared buffer used by flush_all_backlogs() when its own allocation fails;
 * flush_backlogs_mutex is held for as long as the fallback is in use.
 * NOTE(review): the fallback is presumably allocated during init elsewhere
 * in this file - confirm.
 */
static struct flush_backlogs *flush_backlogs_fallback;
static DEFINE_MUTEX(flush_backlogs_mutex);

/*
 * Run flush_backlog() on every online CPU for which flush_required()
 * returns true, and wait for all of those work items to finish.
 *
 * A private struct flush_backlogs is allocated for the run; if that fails
 * the shared fallback is used instead, serialised by flush_backlogs_mutex.
 */
static void flush_all_backlogs(void)
{
        struct flush_backlogs *fb = flush_backlogs_alloc();
        unsigned int cpu;

        if (!fb) {
                /* allocation failed: borrow the shared buffer */
                mutex_lock(&flush_backlogs_mutex);
                fb = flush_backlogs_fallback;
        }
        cpumask_clear(&fb->flush_cpus);

        cpus_read_lock();

        for_each_online_cpu(cpu) {
                if (!flush_required(cpu))
                        continue;

                INIT_WORK(&fb->w[cpu], flush_backlog);
                queue_work_on(cpu, system_highpri_wq, &fb->w[cpu]);
                __cpumask_set_cpu(cpu, &fb->flush_cpus);
        }

        /* we can have in flight packet[s] on the cpus we are not flushing,
         * synchronize_net() in unregister_netdevice_many() will take care of
         * them.
         */
        for_each_cpu(cpu, &fb->flush_cpus)
                flush_work(&fb->w[cpu]);

        cpus_read_unlock();

        if (fb == flush_backlogs_fallback)
                mutex_unlock(&flush_backlogs_mutex);
        else
                kfree(fb);
}

/*
 * Walk the rps_ipi_next chain starting at @remsd and kick each entry whose
 * CPU is online with an asynchronous single-CPU function call.
 * Does nothing when CONFIG_RPS is off.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
        struct softnet_data *next;

        for (; remsd; remsd = next) {
                /* fetch the link before the IPI is issued */
                next = remsd->rps_ipi_next;

                if (cpu_online(remsd->cpu))
                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
        }
#endif
}

/*
 * net_rps_action_and_irq_enable - send any pending RPS IPIs.
 *
 * Must be entered with local irqs disabled; local irqs are enabled on
 * every return path.  When backlog threads are in use, or nothing is
 * pending, only the irq enable happens.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *remsd = sd->rps_ipi_list;

        if (!use_backlog_threads() && remsd) {
                /* detach the pending list while irqs are still off */
                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Send pending IPI's to kick RPS processing on remote cpus. */
                net_rps_send_ipi(remsd);
                return;
        }
#endif
        local_irq_enable();
}

/*
 * True when @sd has RPS IPIs queued and backlog threads are not in use.
 * Always false without CONFIG_RPS.
 */
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        if (use_backlog_threads())
                return false;

        return !!sd->rps_ipi_list;
#else
        return false;
#endif
}

/*
 * NAPI poll callback of the per-CPU backlog.
 *
 * @napi:  backlog napi, embedded in a struct softnet_data
 * @quota: maximum number of skbs to process in this call
 *
 * Drains sd->process_queue through __netif_receive_skb() and refills it
 * from sd->input_pkt_queue each time it runs empty.  Stops when @quota is
 * reached or when both queues are empty.  Returns the number of skbs
 * processed; that count is also reported via rps_input_queue_head_add().
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
        bool again = true;
        int work = 0;

        /* Check if we have pending IPIs; it's better to send them now
         * than to wait for the end of net_rx_action().
         */
        if (sd_has_rps_ipi_waiting(sd)) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }

        napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
        while (again) {
                struct sk_buff *skb;

                /* The process_queue lock is held only around the dequeue;
                 * it is dropped while each skb is being processed.
                 */
                local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
                        rcu_read_lock();
                        __netif_receive_skb(skb);
                        rcu_read_unlock();
                        if (++work >= quota) {
                                /* quota exhausted: return with the lock released */
                                rps_input_queue_head_add(sd, work);
                                return work;
                        }

                        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                }
                local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);

                backlog_lock_irq_disable(sd);
                if (skb_queue_empty(&sd->input_pkt_queue)) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * Only the current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set
                         * on backlog.
                         * We can use a plain write instead of clear_bit(),
                         * and we don't need an smp_mb() memory barrier.
                         */
                        napi->state &= NAPIF_STATE_THREADED;
                        again = false;
                } else {
                        /* move everything queued so far over to process_queue */
                        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);
                        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
                }
                backlog_unlock_irq_enable(sd);
        }

        if (work)
                rps_input_queue_head_add(sd, work);
        return work;
}

/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * Queues @n on the current CPU's softnet_data via ____napi_schedule(),
 * with local interrupts saved and disabled around the call.
 * If hard irqs are already masked, consider __napi_schedule_irqoff().
 */
void __napi_schedule(struct napi_struct *n)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
        local_irq_restore(irqflags);
}
EXPORT_SYMBOL(__napi_schedule);

/**
 *        napi_schedule_prep - check if napi can be scheduled
 *        @n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable to
 * ensure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 *
 * Return: true if this call set NAPIF_STATE_SCHED (it was previously
 * clear); false if a disable is pending or the instance was already
 * scheduled, in which case NAPIF_STATE_MISSED has been set.
 */
bool napi_schedule_prep(struct napi_struct *n)
{
        unsigned long new, val = READ_ONCE(n->state);

        /* val is refreshed by try_cmpxchg() on failure, so each retry
         * recomputes new from the latest observed state.
         */
        do {
                if (unlikely(val & NAPIF_STATE_DISABLE))
                        return false;
                new = val | NAPIF_STATE_SCHED;

                /* Sets STATE_MISSED bit if STATE_SCHED was already set
                 * This was suggested by Alexander Duyck, as compiler
                 * emits better code than :
                 * if (val & NAPIF_STATE_SCHED)
                 *     new |= NAPIF_STATE_MISSED;
                 */
                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
                                                   NAPIF_STATE_MISSED;
        } while (!try_cmpxchg(&n->state, &val, new));

        return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);

/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Like __napi_schedule(), but the caller guarantees hard irqs are masked.
 *
 * PREEMPT_RT kernels cannot rely on that guarantee (force-threaded
 * interrupts, spinlock substitution), so there this simply calls
 * __napi_schedule().
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                __napi_schedule(n);
                return;
        }

        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);

/**
 * napi_complete_done - finish a NAPI poll round
 * @n: NAPI context being completed
 * @work_done: work reported for this round; when non-zero the
 *             deferred-hard-irq counter is reloaded
 *
 * Return: false when @n is flagged NPSVC or IN_BUSY_POLL (nothing is done),
 * when NAPIF_STATE_MISSED was set (the instance is rescheduled here), or
 * when a hard-irq deferral with a non-zero GRO flush timeout is in effect;
 * true otherwise.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
        unsigned long flags, val, new, timeout = 0;
        bool ret = true;

        /*
         * 1) Don't let napi dequeue from the cpu poll list
         *    just in case its running on a different cpu.
         * 2) If we are busy polling, do nothing here, we have
         *    the guarantee we will be called later.
         */
        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
                                 NAPIF_STATE_IN_BUSY_POLL)))
                return false;

        if (work_done) {
                /* only arm the flush timer if GRO still holds packets */
                if (n->gro.bitmask)
                        timeout = napi_get_gro_flush_timeout(n);
                n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
        }
        if (n->defer_hard_irqs_count > 0) {
                /* consume one deferral; report "not complete" if a timer
                 * will be armed below
                 */
                n->defer_hard_irqs_count--;
                timeout = napi_get_gro_flush_timeout(n);
                if (timeout)
                        ret = false;
        }

        /*
         * When the NAPI instance uses a timeout and keeps postponing
         * it, we need to bound somehow the time packets are kept in
         * the GRO layer.
         */
        gro_flush_normal(&n->gro, !!timeout);

        if (unlikely(!list_empty(&n->poll_list))) {
                /* If n->poll_list is not empty, we need to mask irqs */
                local_irq_save(flags);
                list_del_init(&n->poll_list);
                local_irq_restore(flags);
        }
        WRITE_ONCE(n->list_owner, -1);

        val = READ_ONCE(n->state);
        do {
                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
                              NAPIF_STATE_SCHED_THREADED |
                              NAPIF_STATE_PREFER_BUSY_POLL);

                /* If STATE_MISSED was set, leave STATE_SCHED set,
                 * because we will call napi->poll() one more time.
                 * This C code was suggested by Alexander Duyck to help gcc.
                 */
                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
                                                    NAPIF_STATE_SCHED;
        } while (!try_cmpxchg(&n->state, &val, new));

        /* a schedule attempt arrived while we were polling: run again */
        if (unlikely(val & NAPIF_STATE_MISSED)) {
                __napi_schedule(n);
                return false;
        }

        if (timeout)
                hrtimer_start(&n->timer, ns_to_ktime(timeout),
                              HRTIMER_MODE_REL_PINNED);
        return ret;
}
EXPORT_SYMBOL(napi_complete_done);

/*
 * For every node, take the whole deferred-free list of this CPU's
 * skb_defer_node for that node and release each skb with
 * napi_consume_skb().  The defer counter is reset before the list is
 * detached; nodes with an empty list are skipped.
 */
static void skb_defer_free_flush(void)
{
        struct skb_defer_node *defer;
        struct llist_node *batch;
        struct sk_buff *skb, *tmp;
        int nid;

        for_each_node(nid) {
                defer = this_cpu_ptr(net_hotdata.skb_defer_nodes) + nid;
                if (llist_empty(&defer->defer_list))
                        continue;

                atomic_long_set(&defer->defer_count, 0);
                batch = llist_del_all(&defer->defer_list);

                llist_for_each_entry_safe(skb, tmp, batch, ll_node)
                        napi_consume_skb(skb, 1);
        }
}

#if defined(CONFIG_NET_RX_BUSY_POLL)

/*
 * Final step of leaving busy poll.  With @skip_schedule set, GRO is
 * flushed and NAPI_STATE_SCHED is cleared; otherwise the GRO normal list
 * is delivered and the napi is rescheduled.
 */
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
        if (skip_schedule) {
                /* Flush too old packets. If HZ < 1000, flush all packets */
                gro_flush_normal(&napi->gro, HZ >= 1000);

                clear_bit(NAPI_STATE_SCHED, &napi->state);
                return;
        }

        gro_normal_list(&napi->gro);
        __napi_schedule(napi);
}

/* Flag bits for the 'flags' argument of __napi_busy_loop()/busy_poll_stop(). */
enum {
        /* caller asked for prefer-busy-poll behaviour */
        NAPI_F_PREFER_BUSY_POLL        = 1,
        /* leave the busy loop as soon as need_resched() is seen */
        NAPI_F_END_ON_RESCHED        = 2,
};

/*
 * Leave busy-poll ownership of @napi.
 *
 * @napi:           instance currently owned by the busy poller
 * @have_poll_lock: cookie from netpoll_poll_lock(), released here
 * @flags:          NAPI_F_* bits
 * @budget:         budget for the one extra napi->poll() call made here
 *
 * Clears MISSED and IN_BUSY_POLL, optionally arms the napi timer
 * (prefer-busy-poll with both a deferral count and a timeout configured),
 * polls once more, and, if that poll used the whole budget, finishes via
 * __busy_poll_stop().
 */
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
                           unsigned flags, u16 budget)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        bool skip_schedule = false;
        unsigned long timeout;
        int rc;

        /* Busy polling means there is a high chance device driver hard irq
         * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
         * set in napi_schedule_prep().
         * Since we are about to call napi->poll() once more, we can safely
         * clear NAPI_STATE_MISSED.
         *
         * Note: x86 could use a single "lock and ..." instruction
         * to perform these two clear_bit()
         */
        clear_bit(NAPI_STATE_MISSED, &napi->state);
        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

        local_bh_disable();
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        if (flags & NAPI_F_PREFER_BUSY_POLL) {
                napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
                timeout = napi_get_gro_flush_timeout(napi);
                /* with a timer armed, __busy_poll_stop() must not reschedule */
                if (napi->defer_hard_irqs_count && timeout) {
                        hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
                        skip_schedule = true;
                }
        }

        /* All we really want here is to re-enable device interrupts.
         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
         */
        rc = napi->poll(napi, budget);
        /* We can't gro_normal_list() here, because napi->poll() might have
         * rearmed the napi (napi_complete_done()) in which case it could
         * already be running on another CPU.
         */
        trace_napi_poll(napi, rc, budget);
        netpoll_poll_unlock(have_poll_lock);
        /* only finish here when the poll used its whole budget */
        if (rc == budget)
                __busy_poll_stop(napi, skip_schedule);
        bpf_net_ctx_clear(bpf_net_ctx);
        local_bh_enable();
}

/*
 * Busy poll the NAPI instance identified by @napi_id.
 *
 * @napi_id:      id looked up with napi_by_id(); nothing happens if absent
 * @loop_end:     optional predicate, called with @loop_end_arg and the loop
 *                start time; the loop ends when it returns true.  With a
 *                NULL @loop_end a single iteration is performed.
 * @loop_end_arg: opaque argument for @loop_end
 * @flags:        NAPI_F_* bits
 * @budget:       budget passed to each poll call
 *
 * Must be called under rcu_read_lock() (a warning fires otherwise).  Unless
 * NAPI_F_END_ON_RESCHED is set, the RCU read lock is temporarily dropped
 * around cond_resched() when a reschedule is needed.
 */
static void __napi_busy_loop(unsigned int napi_id,
                      bool (*loop_end)(void *, unsigned long),
                      void *loop_end_arg, unsigned flags, u16 budget)
{
        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
        int (*napi_poll)(struct napi_struct *napi, int budget);
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        void *have_poll_lock = NULL;
        struct napi_struct *napi;

        WARN_ON_ONCE(!rcu_read_lock_held());

restart:
        /* napi_poll == NULL means we do not own the napi (yet) */
        napi_poll = NULL;

        napi = napi_by_id(napi_id);
        if (!napi)
                return;

        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        for (;;) {
                int work = 0;

                local_bh_disable();
                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
                if (!napi_poll) {
                        unsigned long val = READ_ONCE(napi->state);

                        /* If multiple threads are competing for this napi,
                         * we avoid dirtying napi->state as much as we can.
                         */
                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
                                   NAPIF_STATE_IN_BUSY_POLL)) {
                                if (flags & NAPI_F_PREFER_BUSY_POLL)
                                        set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
                        }
                        /* try to take ownership; losing the race is handled
                         * exactly like finding the napi busy above
                         */
                        if (cmpxchg(&napi->state, val,
                                    val | NAPIF_STATE_IN_BUSY_POLL |
                                          NAPIF_STATE_SCHED) != val) {
                                if (flags & NAPI_F_PREFER_BUSY_POLL)
                                        set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
                        }
                        have_poll_lock = netpoll_poll_lock(napi);
                        napi_poll = napi->poll;
                }
                work = napi_poll(napi, budget);
                trace_napi_poll(napi, work, budget);
                gro_normal_list(&napi->gro);
count:
                /* reached with work == 0 when the napi could not be taken */
                if (work > 0)
                        __NET_ADD_STATS(dev_net(napi->dev),
                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
                skb_defer_free_flush();
                bpf_net_ctx_clear(bpf_net_ctx);
                local_bh_enable();

                if (!loop_end || loop_end(loop_end_arg, start_time))
                        break;

                if (unlikely(need_resched())) {
                        if (flags & NAPI_F_END_ON_RESCHED)
                                break;
                        /* give up ownership, reschedule, then start over
                         * with a fresh lookup of the napi
                         */
                        if (napi_poll)
                                busy_poll_stop(napi, have_poll_lock, flags, budget);
                        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                                preempt_enable();
                        rcu_read_unlock();
                        cond_resched();
                        rcu_read_lock();
                        /* loop_end is non-NULL here: a NULL one broke out above */
                        if (loop_end(loop_end_arg, start_time))
                                return;
                        goto restart;
                }
                cpu_relax();
        }
        if (napi_poll)
                busy_poll_stop(napi, have_poll_lock, flags, budget);
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
}

/*
 * Busy poll @napi_id from a context that already holds rcu_read_lock().
 * The loop ends as soon as a reschedule is needed (NAPI_F_END_ON_RESCHED),
 * so the RCU read lock is never dropped on the caller's behalf.
 */
void napi_busy_loop_rcu(unsigned int napi_id,
                        bool (*loop_end)(void *, unsigned long),
                        void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
        unsigned loop_flags = prefer_busy_poll ?
                (NAPI_F_END_ON_RESCHED | NAPI_F_PREFER_BUSY_POLL) :
                NAPI_F_END_ON_RESCHED;

        __napi_busy_loop(napi_id, loop_end, loop_end_arg, loop_flags, budget);
}

/*
 * Busy poll @napi_id, taking rcu_read_lock() around the loop.
 * @prefer_busy_poll selects NAPI_F_PREFER_BUSY_POLL; the remaining
 * arguments are passed through to __napi_busy_loop() unchanged.
 */
void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
                    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
        unsigned loop_flags = 0;

        if (prefer_busy_poll)
                loop_flags |= NAPI_F_PREFER_BUSY_POLL;

        rcu_read_lock();
        __napi_busy_loop(napi_id, loop_end, loop_end_arg, loop_flags, budget);
        rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);

/*
 * Arm the timer of the NAPI instance @napi_id with its irq-suspend
 * timeout.  Nothing happens if the id is unknown or the timeout is zero.
 */
void napi_suspend_irqs(unsigned int napi_id)
{
        struct napi_struct *napi;
        unsigned long timeout;

        rcu_read_lock();

        napi = napi_by_id(napi_id);
        if (!napi)
                goto unlock;

        timeout = napi_get_irq_suspend_timeout(napi);
        if (timeout)
                hrtimer_start(&napi->timer, ns_to_ktime(timeout),
                              HRTIMER_MODE_REL_PINNED);
unlock:
        rcu_read_unlock();
}

/*
 * Schedule the NAPI instance @napi_id (with BHs disabled around the call)
 * if it has a non-zero irq-suspend timeout.  Unknown ids are ignored.
 */
void napi_resume_irqs(unsigned int napi_id)
{
        struct napi_struct *napi;

        rcu_read_lock();
        napi = napi_by_id(napi_id);

        /* If irq_suspend_timeout is set to 0 between the call to
         * napi_suspend_irqs and now, the original value still
         * determines the safety timeout as intended and napi_watchdog
         * will resume irq processing.
         */
        if (napi && napi_get_irq_suspend_timeout(napi)) {
                local_bh_disable();
                napi_schedule(napi);
                local_bh_enable();
        }
        rcu_read_unlock();
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * Record @napi_id in @napi (both the GRO cached copy and napi->napi_id)
 * and link @napi into the napi_hash bucket selected by that id.
 * The callers in this file hold napi_hash_lock around this.
 */
static void __napi_hash_add_with_id(struct napi_struct *napi,
                                    unsigned int napi_id)
{
        unsigned int slot;

        napi->gro.cached_napi_id = napi_id;
        WRITE_ONCE(napi->napi_id, napi_id);

        slot = napi->napi_id % HASH_SIZE(napi_hash);
        hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[slot]);
}

/*
 * Add @napi to the hash under a caller-chosen @napi_id, taking
 * napi_hash_lock.  Warns once if the id is already present, but adds
 * the entry regardless.
 */
static void napi_hash_add_with_id(struct napi_struct *napi,
                                  unsigned int napi_id)
{
        unsigned long irqflags;

        spin_lock_irqsave(&napi_hash_lock, irqflags);

        WARN_ON_ONCE(napi_by_id(napi_id));
        __napi_hash_add_with_id(napi, napi_id);

        spin_unlock_irqrestore(&napi_hash_lock, irqflags);
}

/*
 * Give @napi a freshly generated id and add it to the hash.
 * Instances flagged NAPI_STATE_NO_BUSY_POLL are left out entirely.
 */
static void napi_hash_add(struct napi_struct *napi)
{
        unsigned long irqflags;

        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
                return;

        spin_lock_irqsave(&napi_hash_lock, irqflags);

        /* 0..NR_CPUS range is reserved for sender_cpu use */
        for (;;) {
                /* advance the generator, wrapping to MIN_NAPI_ID */
                if (unlikely(!napi_id_valid(++napi_gen_id)))
                        napi_gen_id = MIN_NAPI_ID;

                /* stop at the first id nobody is using */
                if (!napi_by_id(napi_gen_id))
                        break;
        }

        __napi_hash_add_with_id(napi, napi_gen_id);

        spin_unlock_irqrestore(&napi_hash_lock, irqflags);
}

/*
 * Unlink @napi from the napi hash, under napi_hash_lock.
 *
 * Warning: the caller is responsible for making sure an RCU grace period
 * elapses before the memory containing @napi is freed.
 */
static void napi_hash_del(struct napi_struct *napi)
{
        unsigned long irqflags;

        spin_lock_irqsave(&napi_hash_lock, irqflags);
        hlist_del_init_rcu(&napi->napi_hash_node);
        spin_unlock_irqrestore(&napi_hash_lock, irqflags);
}

/*
 * hrtimer callback of napi->timer: schedule the napi unless a disable is
 * pending or it is already scheduled.  The timer is never restarted from
 * here.
 */
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
        struct napi_struct *napi =
                container_of(timer, struct napi_struct, timer);

        if (napi_disable_pending(napi))
                return HRTIMER_NORESTART;

        /* Note : we use a relaxed variant of napi_schedule_prep() not setting
         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
         */
        if (test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
                return HRTIMER_NORESTART;

        clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
        __napi_schedule_irqoff(napi);

        return HRTIMER_NORESTART;
}

/*
 * Take @napi out of threaded mode and stop its kthread.
 *
 * Step 1 clears NAPIF_STATE_THREADED once the napi is either idle or owned
 * by its kthread, step 2 waits for the kthread to drop SCHED_THREADED, and
 * only then is the thread stopped.  Sleeps (msleep) while waiting.
 */
static void napi_stop_kthread(struct napi_struct *napi)
{
        unsigned long val, new;

        /* Wait until the napi STATE_THREADED is unset. */
        for (;;) {
                val = READ_ONCE(napi->state);

                /* Scheduled, but not by the kthread: someone else is
                 * polling this napi, so try again later.
                 */
                if (!(val & NAPIF_STATE_SCHED_THREADED) &&
                    (val & NAPIF_STATE_SCHED)) {
                        msleep(20);
                        continue;
                }

                /* If napi kthread own this napi or the napi is idle,
                 * STATE_THREADED can be unset here.
                 */
                new = val & (~NAPIF_STATE_THREADED);
                if (try_cmpxchg(&napi->state, &val, new))
                        break;
        }

        /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by
         * the kthread.
         */
        while (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
                msleep(20);

        kthread_stop(napi->thread);
        napi->thread = NULL;
}

/*
 * Switch a single NAPI instance into or out of threaded mode.
 *
 * Enabling creates the kthread first if there is none; the requested mode
 * is also stored in napi->config when a config is attached.
 *
 * Returns 0, or the error from napi_kthread_create().
 */
int napi_set_threaded(struct napi_struct *napi,
                      enum netdev_napi_threaded threaded)
{
        int err;

        if (threaded && !napi->thread) {
                err = napi_kthread_create(napi);
                if (err)
                        return err;
        }

        if (napi->config)
                napi->config->threaded = threaded;

        /* Setting/unsetting threaded mode on a napi might not immediately
         * take effect, if the current napi instance is actively being
         * polled. In this case, the switch between threaded mode and
         * softirq mode will happen in the next round of napi_schedule().
         * This should not cause hiccups/stalls to the live traffic.
         */
        if (!threaded && napi->thread) {
                napi_stop_kthread(napi);
                return 0;
        }

        /* Make sure kthread is created before THREADED bit is set. */
        smp_mb__before_atomic();
        assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);

        return 0;
}

/*
 * Set the threaded-NAPI mode of @dev and of all its NAPI instances.
 *
 * When enabling, kthreads are created up front for every listed napi that
 * lacks one; if any creation fails, the whole device falls back to
 * NETDEV_NAPI_THREADED_DISABLED and that error is returned.  The resulting
 * mode is written to dev->threaded, to each listed napi and to every
 * napi_config slot.
 */
int netif_set_threaded(struct net_device *dev,
                       enum netdev_napi_threaded threaded)
{
        struct napi_struct *napi;
        int idx, err = 0;

        netdev_assert_locked_or_invisible(dev);

        if (threaded) {
                list_for_each_entry(napi, &dev->napi_list, dev_list) {
                        if (napi->thread)
                                continue;

                        err = napi_kthread_create(napi);
                        if (err) {
                                threaded = NETDEV_NAPI_THREADED_DISABLED;
                                break;
                        }
                }
        }

        WRITE_ONCE(dev->threaded, threaded);

        /* The error should not occur as the kthreads are already created. */
        list_for_each_entry(napi, &dev->napi_list, dev_list)
                WARN_ON_ONCE(napi_set_threaded(napi, threaded));

        /* Override the config for all NAPIs even if currently not listed */
        for (idx = 0; idx < dev->num_napi_configs; idx++)
                dev->napi_config[idx].threaded = threaded;

        return err;
}

/**
 * netif_threaded_enable() - enable threaded NAPIs
 * @dev: net_device instance
 *
 * Enable threaded mode for the NAPI instances of the device. This may be useful
 * for devices where multiple NAPI instances get scheduled by a single
 * interrupt. Threaded NAPI allows moving the NAPI processing to cores other
 * than the core where IRQ is mapped.
 *
 * This function should be called before @dev is registered.
 */
void netif_threaded_enable(struct net_device *dev)
{
        int err;

        /* Failing to switch to threaded mode is only reported, not returned. */
        err = netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED);
        WARN_ON_ONCE(err);
}
EXPORT_SYMBOL(netif_threaded_enable);

/**
 * netif_queue_set_napi - Associate queue with the napi
 * @dev: device to which NAPI and queue belong
 * @queue_index: Index of queue
 * @type: queue type as RX or TX
 * @napi: NAPI context, pass NULL to clear previously set NAPI
 *
 * Set queue with its corresponding napi context. This should be done after
 * registering the NAPI handler for the queue-vector and the queues have been
 * mapped to the corresponding interrupt vector.
 */
void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
                          enum netdev_queue_type type, struct napi_struct *napi)
{
        /* A non-NULL NAPI must already be attached to a device. */
        if (WARN_ON_ONCE(napi && !napi->dev))
                return;
        netdev_ops_assert_locked_or_invisible(dev);

        if (type == NETDEV_QUEUE_TYPE_RX) {
                struct netdev_rx_queue *rx = __netif_get_rx_queue(dev, queue_index);

                rx->napi = napi;
        } else if (type == NETDEV_QUEUE_TYPE_TX) {
                struct netdev_queue *tx = netdev_get_tx_queue(dev, queue_index);

                tx->napi = napi;
        }
        /* Any other queue type is silently ignored. */
}
EXPORT_SYMBOL(netif_queue_set_napi);

static void
netif_napi_irq_notify(struct irq_affinity_notify *notify,
                      const cpumask_t *mask)
{
        struct napi_struct *napi =
                container_of(notify, struct napi_struct, notify);
#ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
        int err;
#endif

        if (napi->config && napi->dev->irq_affinity_auto)
                cpumask_copy(&napi->config->affinity_mask, mask);

#ifdef CONFIG_RFS_ACCEL
        if (napi->dev->rx_cpu_rmap_auto) {
                err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
                if (err)
                        netdev_warn(napi->dev, "RMAP update failed (%d)\n",
                                    err);
        }
#endif
}

#ifdef CONFIG_RFS_ACCEL
static void netif_napi_affinity_release(struct kref *ref)
{
        struct napi_struct *napi =
                container_of(ref, struct napi_struct, notify.kref);
        struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;

        netdev_assert_locked(napi->dev);
        WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
                                   &napi->state));

        if (!napi->dev->rx_cpu_rmap_auto)
                return;
        rmap->obj[napi->napi_rmap_idx] = NULL;
        napi->napi_rmap_idx = -1;
        cpu_rmap_put(rmap);
}

/* Allocate an IRQ CPU rmap sized for @num_irqs and mark it as automatically
 * managed for @dev. Returns 0 on success or when already enabled, -ENOMEM if
 * the allocation fails.
 */
int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
{
        if (!dev->rx_cpu_rmap_auto) {
                dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
                if (!dev->rx_cpu_rmap)
                        return -ENOMEM;

                dev->rx_cpu_rmap_auto = true;
        }

        return 0;
}
EXPORT_SYMBOL(netif_enable_cpu_rmap);

/* Drop the device's reference on an automatically managed CPU rmap and
 * return the device to the "no automatic rmap" state. Does nothing when the
 * rmap is not automatically managed.
 */
static void netif_del_cpu_rmap(struct net_device *dev)
{
        struct cpu_rmap *map = dev->rx_cpu_rmap;

        if (dev->rx_cpu_rmap_auto) {
                cpu_rmap_put(map);
                dev->rx_cpu_rmap = NULL;
                dev->rx_cpu_rmap_auto = false;
        }
}

#else
/* Stub used when CONFIG_RFS_ACCEL is not set: there is nothing to release. */
static void netif_napi_affinity_release(struct kref *ref)
{
}

/* Stub used when CONFIG_RFS_ACCEL is not set: no rmap is allocated and
 * success (0) is always reported.
 */
int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
{
        return 0;
}
EXPORT_SYMBOL(netif_enable_cpu_rmap);

/* Stub used when CONFIG_RFS_ACCEL is not set: there is no rmap to free. */
static void netif_del_cpu_rmap(struct net_device *dev)
{
}
#endif

/* Turn on automatic IRQ affinity handling for @dev.
 *
 * For every queue index below max(num_tx_queues, num_rx_queues) one CPU,
 * picked by cpumask_local_spread() for the device's NUMA node, is added to the
 * affinity mask of the matching napi_config entry. irq_affinity_auto is then
 * set on the device.
 */
void netif_set_affinity_auto(struct net_device *dev)
{
        unsigned int i, maxqs;
        /* dev_to_node() yields a signed node id (NUMA_NO_NODE is negative) and
         * cpumask_local_spread() takes a signed node, so keep it signed.
         */
        int numa;

        maxqs = max(dev->num_tx_queues, dev->num_rx_queues);
        numa = dev_to_node(&dev->dev);

        for (i = 0; i < maxqs; i++)
                cpumask_set_cpu(cpumask_local_spread(i, numa),
                                &dev->napi_config[i].affinity_mask);

        dev->irq_affinity_auto = true;
}
EXPORT_SYMBOL(netif_set_affinity_auto);

/* Record @irq as the interrupt of @napi and, when the device uses automatic
 * rmap or automatic IRQ affinity, register an affinity notifier on it.
 *
 * A negative @irq only detaches: any previously registered notifier is
 * removed and no new one is installed. Failures are not reported to the
 * caller; the function simply returns without a notifier in place.
 */
void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
{
        int rc;

        netdev_assert_locked_or_invisible(napi->dev);

        /* Nothing to do when the IRQ does not change. */
        if (napi->irq == irq)
                return;

        /* Remove existing resources */
        if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
                irq_set_affinity_notifier(napi->irq, NULL);

        napi->irq = irq;
        /* A notifier is only needed for a valid IRQ on a device that asked
         * for automatic rmap or automatic affinity management.
         */
        if (irq < 0 ||
            (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
                return;

        /* Abort for buggy drivers */
        if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
                return;

#ifdef CONFIG_RFS_ACCEL
        if (napi->dev->rx_cpu_rmap_auto) {
                /* Add this NAPI to the rmap; give up silently on failure. */
                rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
                if (rc < 0)
                        return;

                /* Take an rmap reference and remember the index returned
                 * by cpu_rmap_add().
                 */
                cpu_rmap_get(napi->dev->rx_cpu_rmap);
                napi->napi_rmap_idx = rc;
        }
#endif

        /* Use core IRQ notifier */
        napi->notify.notify = netif_napi_irq_notify;
        napi->notify.release = netif_napi_affinity_release;
        rc = irq_set_affinity_notifier(irq, &napi->notify);
        if (rc) {
                netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
                            rc);
                goto put_rmap;
        }

        set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
        return;

put_rmap:
        /* Notifier registration failed: undo the rmap bookkeeping done above
         * and clear the callbacks again.
         */
#ifdef CONFIG_RFS_ACCEL
        if (napi->dev->rx_cpu_rmap_auto) {
                napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
                cpu_rmap_put(napi->dev->rx_cpu_rmap);
                napi->napi_rmap_idx = -1;
        }
#endif
        napi->notify.notify = NULL;
        napi->notify.release = NULL;
}
EXPORT_SYMBOL(netif_napi_set_irq_locked);

/* Load the settings saved in n->config back into @n and put the NAPI into
 * the hash, reusing the stored NAPI ID when there is one. The caller
 * guarantees n->config is set.
 */
static void napi_restore_config(struct napi_struct *n)
{
        n->defer_hard_irqs = n->config->defer_hard_irqs;
        n->gro_flush_timeout = n->config->gro_flush_timeout;
        n->irq_suspend_timeout = n->config->irq_suspend_timeout;

        /* Re-apply the remembered affinity only if a notifier is installed. */
        if (n->dev->irq_affinity_auto) {
                if (test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
                        irq_set_affinity(n->irq, &n->config->affinity_mask);
        }

        /* Without a stored ID let napi_hash_add() pick one and remember it,
         * otherwise hash the NAPI under the stored ID.
         */
        if (!n->config->napi_id) {
                napi_hash_add(n);
                n->config->napi_id = n->napi_id;
        } else {
                napi_hash_add_with_id(n, n->config->napi_id);
        }

        WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded));
}

/* Copy the tunables of @n into n->config, then remove the NAPI from the
 * hash. The caller guarantees n->config is set.
 */
static void napi_save_config(struct napi_struct *n)
{
        n->config->irq_suspend_timeout = n->irq_suspend_timeout;
        n->config->gro_flush_timeout = n->gro_flush_timeout;
        n->config->defer_hard_irqs = n->defer_hard_irqs;

        napi_hash_del(n);
}

/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
 * inherit an existing ID try to insert it at the right position.
 */
static void
netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
{
        struct list_head *after = &dev->napi_list;
        unsigned int want = UINT_MAX;
        struct napi_struct *cur;

        /* An ID inherited through the config decides the position; with no
         * such ID the NAPI sorts as UINT_MAX.
         */
        if (napi->config && napi->config->napi_id)
                want = napi->config->napi_id;

        list_for_each_entry(cur, &dev->napi_list, dev_list) {
                unsigned int cur_id = UINT_MAX;

                if (napi_id_valid(cur->napi_id))
                        cur_id = cur->napi_id;
                else if (cur->config)
                        cur_id = cur->config->napi_id;

                /* Stop at the first entry whose ID does not exceed ours. */
                if (cur_id <= want)
                        break;
                after = &cur->dev_list;
        }

        /* list_add_rcu() inserts right behind @after. */
        list_add_rcu(&napi->dev_list, after);
}

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
static void napi_get_frags_check(struct napi_struct *napi)
{
        struct sk_buff *probe;

        local_bh_disable();

        /* Grab one skb, complain if its head is a page fragment, free it. */
        probe = napi_get_frags(napi);
        WARN_ON_ONCE(probe && probe->head_frag);
        napi_free_frags(napi);

        local_bh_enable();
}

/* Initialise @napi, attach it to @dev and add it to the device's NAPI list.
 *
 * @poll is stored as the poll callback and @weight as the poll budget; a
 * weight above NAPI_POLL_WEIGHT is reported once but still used. The instance
 * is left with NAPI_STATE_SCHED and NAPI_STATE_NPSVC set and with no IRQ
 * (-1) assigned.
 */
void netif_napi_add_weight_locked(struct net_device *dev,
                                  struct napi_struct *napi,
                                  int (*poll)(struct napi_struct *, int),
                                  int weight)
{
        netdev_assert_locked(dev);
        /* Adding an instance that is already listed is a caller bug. */
        if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
                return;

        INIT_LIST_HEAD(&napi->poll_list);
        INIT_HLIST_NODE(&napi->napi_hash_node);
        hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
        gro_init(&napi->gro);
        napi->skb = NULL;
        napi->poll = poll;
        if (weight > NAPI_POLL_WEIGHT)
                netdev_err_once(dev, "%s() called with weight %d\n", __func__,
                                weight);
        napi->weight = weight;
        napi->dev = dev;
#ifdef CONFIG_NETPOLL
        napi->poll_owner = -1;
#endif
        napi->list_owner = -1;
        /* Both bits stay set until napi_enable_locked() clears them. */
        set_bit(NAPI_STATE_SCHED, &napi->state);
        set_bit(NAPI_STATE_NPSVC, &napi->state);
        netif_napi_dev_list_add(dev, napi);

        /* default settings from sysfs are applied to all NAPIs. any per-NAPI
         * configuration will be loaded in napi_enable
         */
        napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
        napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));

        napi_get_frags_check(napi);
        /* Create kthread for this napi if dev->threaded is set.
         * Clear dev->threaded if kthread creation failed so that
         * threaded mode will not be enabled in napi_enable().
         */
        if (napi_get_threaded_config(dev, napi))
                if (napi_kthread_create(napi))
                        dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
        netif_napi_set_irq_locked(napi, -1);
}
EXPORT_SYMBOL(netif_napi_add_weight_locked);

/* Disable @n with the netdev instance lock already held. May sleep.
 *
 * Waits until neither SCHED nor NPSVC is set, then claims both bits itself
 * and clears THREADED and PREFER_BUSY_POLL in a single atomic update. After
 * that the timer is cancelled and the NAPI leaves the hash, saving its
 * settings first when a config is attached.
 */
void napi_disable_locked(struct napi_struct *n)
{
        unsigned long val, new;

        might_sleep();
        netdev_assert_locked(n->dev);

        /* DISABLE stays set for the whole duration of the teardown below. */
        set_bit(NAPI_STATE_DISABLE, &n->state);

        val = READ_ONCE(n->state);
        do {
                /* Sleep-wait for whoever currently owns the instance. */
                while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
                        usleep_range(20, 200);
                        val = READ_ONCE(n->state);
                }

                new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
                new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
                /* On failure try_cmpxchg() refreshes val and we re-check. */
        } while (!try_cmpxchg(&n->state, &val, new));

        hrtimer_cancel(&n->timer);

        if (n->config)
                napi_save_config(n);
        else
                napi_hash_del(n);

        clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable_locked);

/**
 * napi_disable() - prevent NAPI from scheduling
 * @n: NAPI context
 *
 * Stop NAPI from being scheduled on this context.
 * Waits till any outstanding processing completes.
 * Takes netdev_lock() for associated net_device.
 */
void napi_disable(struct napi_struct *n)
{
        struct net_device *dev = n->dev;

        /* Locked wrapper around napi_disable_locked(). */
        netdev_lock(dev);
        napi_disable_locked(n);
        netdev_unlock(dev);
}
EXPORT_SYMBOL(napi_disable);

/* Enable @n: restore its saved configuration (or just hash it when it has no
 * config) and atomically clear SCHED and NPSVC so it can be scheduled.
 * THREADED is set in the same update when the device is in threaded mode and
 * the NAPI has a kthread.
 */
void napi_enable_locked(struct napi_struct *n)
{
        unsigned long new, val = READ_ONCE(n->state);

        if (n->config)
                napi_restore_config(n);
        else
                napi_hash_add(n);

        do {
                /* SCHED must be set here; enabling a NAPI that does not have
                 * it set is treated as a fatal bug.
                 */
                BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));

                new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
                if (n->dev->threaded && n->thread)
                        new |= NAPIF_STATE_THREADED;
                /* On failure try_cmpxchg() refreshes val and we retry. */
        } while (!try_cmpxchg(&n->state, &val, new));
}
EXPORT_SYMBOL(napi_enable_locked);

/**
 * napi_enable() - enable NAPI scheduling
 * @n: NAPI context
 *
 * Enable scheduling of a NAPI instance.
 * Must be paired with napi_disable().
 * Takes netdev_lock() for associated net_device.
 */
void napi_enable(struct napi_struct *n)
{
        struct net_device *dev = n->dev;

        /* Locked wrapper around napi_enable_locked(). */
        netdev_lock(dev);
        napi_enable_locked(n);
        netdev_unlock(dev);
}
EXPORT_SYMBOL(napi_enable);

/* Must be called in process context */
/* Detach @napi from its device: drop the IRQ notifier, forget the config,
 * unlink it from the device's NAPI list, release GRO state and stop its
 * kthread. Does nothing if the NAPI is not currently listed.
 */
void __netif_napi_del_locked(struct napi_struct *napi)
{
        netdev_assert_locked(napi->dev);

        /* Not listed (never added or already deleted): nothing to undo. */
        if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
                return;

        /* Make sure NAPI is disabled (or was never enabled). */
        WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));

        if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
                irq_set_affinity_notifier(napi->irq, NULL);

        /* Break the association with the persistent config slot. */
        if (napi->config) {
                napi->index = -1;
                napi->config = NULL;
        }

        list_del_rcu(&napi->dev_list);
        napi_free_frags(napi);

        gro_cleanup(&napi->gro);

        if (napi->thread) {
                kthread_stop(napi->thread);
                napi->thread = NULL;
        }
}
EXPORT_SYMBOL(__netif_napi_del_locked);

/* Run one poll round on @n with its configured weight as budget.
 *
 * Returns the amount of work reported by ->poll() (0 if the NAPI was not
 * scheduled). *@repoll is set to true only when the whole budget was used,
 * no disable is pending, busy polling is not preferred and the NAPI has not
 * been put back on a poll list; it is left untouched otherwise.
 */
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
        int work, weight;

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi().  Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call.  Therefore we avoid
         * accidentally calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (napi_is_scheduled(n)) {
                work = n->poll(n, weight);
                trace_napi_poll(n, work, weight);

                xdp_do_check_flushed(n);
        }

        /* A poll callback must never report more work than its budget. */
        if (unlikely(work > weight))
                netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
                                n->poll, work, weight);

        /* Budget not exhausted: nothing more to do for this round. */
        if (likely(work < weight))
                return work;

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(napi_disable_pending(n))) {
                napi_complete(n);
                return work;
        }

        /* The NAPI context has more processing work, but busy-polling
         * is preferred. Exit early.
         */
        if (napi_prefer_busy_poll(n)) {
                if (napi_complete_done(n, work)) {
                        /* If timeout is not set, we need to make sure
                         * that the NAPI is re-scheduled.
                         */
                        napi_schedule(n);
                }
                return work;
        }

        /* Flush too old packets. If HZ < 1000, flush all packets */
        gro_flush_normal(&n->gro, HZ >= 1000);

        /* Some drivers may have called napi_schedule
         * prior to exhausting their budget.
         */
        if (unlikely(!list_empty(&n->poll_list))) {
                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                             n->dev ? n->dev->name : "backlog");
                return work;
        }

        /* Full budget consumed and still owned by us: ask for another round. */
        *repoll = true;

        return work;
}

/* Take @n off its poll list, poll it once under the netpoll poll lock and,
 * if __napi_poll() asks for it, queue it at the tail of @repoll. Returns the
 * work done.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
        bool again = false;
        void *cookie;
        int done;

        list_del_init(&n->poll_list);

        cookie = netpoll_poll_lock(n);
        done = __napi_poll(n, &again);
        if (!again)
                goto unlock;

#if defined(CONFIG_DEBUG_NET)
        if (unlikely(!napi_is_scheduled(n)))
                pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
                        n->dev->name, n->poll);
#endif
        list_add_tail(&n->poll_list, repoll);

unlock:
        netpoll_poll_unlock(cookie);

        return done;
}

/* Sleep until the NAPI kthread has work or is asked to stop.
 *
 * Returns 0 once NAPI_STATE_SCHED_THREADED is set on @napi, or -1 when
 * kthread_should_stop() becomes true. The task is TASK_RUNNING on return in
 * both cases.
 */
static int napi_thread_wait(struct napi_struct *napi)
{
        /* Task state is set before the condition is tested on every pass. */
        set_current_state(TASK_INTERRUPTIBLE);

        while (!kthread_should_stop()) {
                /* Testing SCHED_THREADED bit here to make sure the current
                 * kthread owns this napi and could poll on this napi.
                 * Testing SCHED bit is not enough because SCHED bit might be
                 * set by some other busy poll thread or by napi_disable().
                 */
                if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
                        WARN_ON(!list_empty(&napi->poll_list));
                        __set_current_state(TASK_RUNNING);
                        return 0;
                }

                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);

        return -1;
}

/* Poll @napi from its kthread until __napi_poll() no longer requests a
 * repoll. Each round runs with bottom halves disabled and a BPF net context
 * installed; between rounds the loop reports an RCU quiescent state
 * periodically and offers to reschedule.
 */
static void napi_threaded_poll_loop(struct napi_struct *napi)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        struct softnet_data *sd;
        unsigned long last_qs = jiffies;

        for (;;) {
                bool repoll = false;
                void *have;

                local_bh_disable();
                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

                /* Flag this CPU's softnet_data as being inside a threaded
                 * poll for the duration of the poll call.
                 */
                sd = this_cpu_ptr(&softnet_data);
                sd->in_napi_threaded_poll = true;

                have = netpoll_poll_lock(napi);
                __napi_poll(napi, &repoll);
                netpoll_poll_unlock(have);

                sd->in_napi_threaded_poll = false;
                barrier();

                /* Handle RPS IPIs that were queued up while polling. */
                if (sd_has_rps_ipi_waiting(sd)) {
                        local_irq_disable();
                        net_rps_action_and_irq_enable(sd);
                }
                skb_defer_free_flush();
                bpf_net_ctx_clear(bpf_net_ctx);
                local_bh_enable();

                if (!repoll)
                        break;

                rcu_softirq_qs_periodic(last_qs);
                cond_resched();
        }
}

/* kthread body for a threaded NAPI: alternate between waiting for work and
 * polling until napi_thread_wait() reports that the thread should stop.
 */
static int napi_threaded_poll(void *data)
{
        struct napi_struct *napi = data;

        for (;;) {
                if (napi_thread_wait(napi))
                        break;

                napi_threaded_poll_loop(napi);
        }

        return 0;
}

/* NET_RX softirq handler: poll the NAPI instances queued on this CPU's
 * softnet_data until the list is drained or the packet budget
 * (netdev_budget) or the time budget (netdev_budget_usecs) runs out.
 * Instances that still have work are put back on sd->poll_list and the
 * softirq is raised again.
 */
static __latent_entropy void net_rx_action(void)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
        unsigned long time_limit = jiffies +
                usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int budget = READ_ONCE(net_hotdata.netdev_budget);
        LIST_HEAD(list);
        LIST_HEAD(repoll);

        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
        sd->in_net_rx_action = true;
        /* Move the pending instances to a private list with IRQs off. */
        local_irq_disable();
        list_splice_init(&sd->poll_list, &list);
        local_irq_enable();

        for (;;) {
                struct napi_struct *n;

                skb_defer_free_flush();

                if (list_empty(&list)) {
                        if (list_empty(&repoll)) {
                                sd->in_net_rx_action = false;
                                barrier();
                                /* We need to check if ____napi_schedule()
                                 * had refilled poll_list while
                                 * sd->in_net_rx_action was true.
                                 */
                                if (!list_empty(&sd->poll_list))
                                        goto start;
                                if (!sd_has_rps_ipi_waiting(sd))
                                        goto end;
                        }
                        break;
                }

                n = list_first_entry(&list, struct napi_struct, poll_list);
                budget -= napi_poll(n, &repoll);

                /* If the softirq window is exhausted then punt.
                 * Allowing this to run for 2 jiffies gives an average
                 * latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 ||
                             time_after_eq(jiffies, time_limit))) {
                        /* Pairs with READ_ONCE() in softnet_seq_show() */
                        WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
                        break;
                }
        }

        local_irq_disable();

        /* Rebuild sd->poll_list: instances not yet polled first, then those
         * scheduled in the meantime, then the ones that asked for a repoll.
         */
        list_splice_tail_init(&sd->poll_list, &list);
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);
        if (!list_empty(&sd->poll_list))
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        else
                sd->in_net_rx_action = false;

        net_rps_action_and_irq_enable(sd);
end:
        bpf_net_ctx_clear(bpf_net_ctx);
}

/* One entry of a device's adjacency list (dev->adj_list.upper or
 * dev->adj_list.lower), describing a neighbouring device.
 */
struct netdev_adjacent {
        /* the neighbouring device this entry refers to */
        struct net_device *dev;
        /* NOTE(review): presumably tracks the reference held on @dev - confirm */
        netdevice_tracker dev_tracker;

        /* upper master flag, there can only be one master device per list */
        bool master;

        /* lookup ignore flag */
        bool ignore;

        /* counter for the number of times this device was added to us */
        u16 ref_nr;

        /* private field for the users */
        void *private;

        /* linkage into the owning device's adj_list.upper / adj_list.lower */
        struct list_head list;
        /* NOTE(review): presumably used for RCU-deferred freeing - confirm */
        struct rcu_head rcu;
};

static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
                                                 struct list_head *adj_list)
{
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, adj_list, list) {
                if (adj->dev == adj_dev)
                        return adj;
        }
        return NULL;
}

/* Walker callback: non-zero when @upper_dev is the device stored in
 * priv->data, which makes the walk stop at the first match.
 */
static int ____netdev_has_upper_dev(struct net_device *upper_dev,
                                    struct netdev_nested_priv *priv)
{
        const struct net_device *wanted = priv->data;

        return upper_dev == wanted;
}

/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that the lookup is done with
 * netdev_walk_all_upper_dev_rcu(), so the complete stack of upper devices is
 * searched, not only the immediate ones. The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
                          struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = { .data = (void *)upper_dev };
        int found;

        ASSERT_RTNL();

        /* The walk stops with a non-zero value as soon as upper_dev is met. */
        found = netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
                                              &priv);
        return found;
}
EXPORT_SYMBOL(netdev_has_upper_dev);

/**
 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks the entire upper device chain.
 * The caller must hold rcu lock.
 */

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = { .data = (void *)upper_dev };
        int found;

        /* The walk stops with a non-zero value as soon as upper_dev is met. */
        found = netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
                                              &priv);
        return found != 0;
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);

/**
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to an upper device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
bool netdev_has_any_upper_dev(struct net_device *dev)
{
        ASSERT_RTNL();

        return !list_empty(&dev->adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_any_upper_dev);

/**
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RTNL lock.
 */
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
        struct netdev_adjacent *first;

        ASSERT_RTNL();

        if (list_empty(&dev->adj_list.upper))
                return NULL;

        /* Only the first entry of the upper list is examined. */
        first = list_first_entry(&dev->adj_list.upper,
                                 struct netdev_adjacent, list);

        return likely(first->master) ? first->dev : NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);

/* Like netdev_master_upper_dev_get(), but an entry flagged ->ignore is
 * treated as if there were no master.
 */
static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
{
        struct netdev_adjacent *first;

        ASSERT_RTNL();

        if (list_empty(&dev->adj_list.upper))
                return NULL;

        first = list_first_entry(&dev->adj_list.upper,
                                 struct netdev_adjacent, list);
        if (unlikely(!first->master) || first->ignore)
                return NULL;

        return first->dev;
}

/**
 * netdev_has_any_lower_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to a lower device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_lower_dev(struct net_device *dev)
{
        ASSERT_RTNL();

        return !list_empty(&dev->adj_list.lower);
}

/* Return the ->private pointer of the netdev_adjacent entry that embeds
 * @adj_list as its list member.
 */
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
        return list_entry(adj_list, struct netdev_adjacent, list)->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);

/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
                                                 struct list_head **iter)
{
        struct netdev_adjacent *adj;

        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

        adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

        /* Advance the cursor unless we wrapped around to the list head. */
        if (&adj->list != &dev->adj_list.upper) {
                *iter = &adj->list;
                return adj->dev;
        }

        return NULL;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);

/* Step *@iter to the next entry of @dev's upper list. Returns that entry's
 * device and stores its ignore flag in *@ignore, or returns NULL (leaving
 * *@iter and *@ignore untouched) once the list head is reached.
 */
static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
                                                  struct list_head **iter,
                                                  bool *ignore)
{
        struct list_head *nxt = (*iter)->next;
        struct netdev_adjacent *adj;

        if (nxt == &dev->adj_list.upper)
                return NULL;

        adj = list_entry(nxt, struct netdev_adjacent, list);
        *iter = nxt;
        *ignore = adj->ignore;

        return adj->dev;
}

/* Step *@iter to the next entry of @dev's upper list and return its device,
 * or NULL (leaving *@iter untouched) once the list head is reached.
 */
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
                                                    struct list_head **iter)
{
        struct netdev_adjacent *adj;

        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

        adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&adj->list != &dev->adj_list.upper) {
                *iter = &adj->list;
                return adj->dev;
        }

        return NULL;
}

/* Walk every upper device reachable from @dev, depth first and without
 * recursion, skipping links whose ignore flag is set.
 *
 * @fn is called for each device visited other than @dev itself. A non-zero
 * return from @fn stops the walk and is passed back to the caller; 0 is
 * returned once all reachable upper devices have been visited.
 */
static int __netdev_walk_all_upper_dev(struct net_device *dev,
                                       int (*fn)(struct net_device *dev,
                                         struct netdev_nested_priv *priv),
                                       struct netdev_nested_priv *priv)
{
        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;
        bool ignore;

        now = dev;
        iter = &dev->adj_list.upper;

        while (1) {
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                while (1) {
                        udev = __netdev_next_upper_dev(now, &iter, &ignore);
                        if (!udev)
                                break;
                        if (ignore)
                                continue;

                        /* Descend into udev; remember where to resume in
                         * the current device's list afterwards.
                         */
                        next = udev;
                        niter = &udev->adj_list.upper;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                if (!next) {
                        /* List exhausted: pop back to the parent, or finish
                         * when the stack is empty.
                         */
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        /* not reached: the loop above only exits through return */
        return 0;
}

/* Walk every upper device reachable from @dev, depth first and without
 * recursion, using the RCU list accessor.
 *
 * @fn is called for each device visited other than @dev itself. A non-zero
 * return from @fn stops the walk and is passed back to the caller; 0 is
 * returned once all reachable upper devices have been visited.
 */
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv)
{
        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;

        now = dev;
        iter = &dev->adj_list.upper;

        while (1) {
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                while (1) {
                        udev = netdev_next_upper_dev_rcu(now, &iter);
                        if (!udev)
                                break;

                        /* Descend into udev; remember where to resume in
                         * the current device's list afterwards.
                         */
                        next = udev;
                        niter = &udev->adj_list.upper;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                if (!next) {
                        /* List exhausted: pop back to the parent, or finish
                         * when the stack is empty.
                         */
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        /* not reached: the loop above only exits through return */
        return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);

/* True when @upper_dev is reachable from @dev through upper links that are
 * not flagged as ignored. The caller must hold the RTNL lock.
 */
static bool __netdev_has_upper_dev(struct net_device *dev,
                                   struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = { .data = (void *)upper_dev };
        int found;

        ASSERT_RTNL();

        found = __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
                                            &priv);
        return found;
}

/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *                                   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must either hold the
 * RTNL lock or its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter)
{
        struct netdev_adjacent *lower;

        lower = list_entry(*iter, struct netdev_adjacent, list);

        if (&lower->list == &dev->adj_list.lower)
                return NULL;

        *iter = lower->list.next;

        return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *                                       lower neighbour list, RCU
 *                                       variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Looks at the entry following *@iter in @dev's lower neighbour list. If
 * that entry is the list head, NULL is returned and *@iter is left alone;
 * otherwise *@iter is moved onto that entry and its ->private is returned.
 * The caller must hold RCU read lock (a warning is emitted once otherwise).
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
                                        struct list_head **iter)
{
        struct list_head *head = &dev->adj_list.lower;
        struct netdev_adjacent *adj;

        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());

        adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&adj->list != head) {
                *iter = &adj->list;
                return adj->private;
        }

        return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *                         list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RTNL lock or
 * its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
        struct netdev_adjacent *lower;

        lower = list_entry(*iter, struct netdev_adjacent, list);

        if (&lower->list == &dev->adj_list.lower)
                return NULL;

        *iter = lower->list.next;

        return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);

/* Step *@iter to the next entry of @dev's lower neighbour list and return
 * that entry's device. Returns NULL without moving *@iter when the next
 * entry is the list head.
 */
static struct net_device *netdev_next_lower_dev(struct net_device *dev,
                                                struct list_head **iter)
{
        struct list_head *candidate = (*iter)->next;
        struct netdev_adjacent *adj;

        if (candidate == &dev->adj_list.lower)
                return NULL;

        adj = list_entry(candidate, struct netdev_adjacent, list);
        *iter = candidate;

        return adj->dev;
}

/* Like netdev_next_lower_dev(), but additionally copies the entry's
 * ->ignore flag into *@ignore. *@ignore is only written when a device is
 * returned; it is left untouched when the end of the list is reached.
 */
static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
                                                  struct list_head **iter,
                                                  bool *ignore)
{
        struct list_head *candidate = (*iter)->next;
        struct netdev_adjacent *adj;

        if (candidate == &dev->adj_list.lower)
                return NULL;

        adj = list_entry(candidate, struct netdev_adjacent, list);
        *iter = candidate;
        *ignore = adj->ignore;

        return adj->dev;
}

/* Iterative depth-first walk over every device stacked below @dev.
 *
 * @fn is invoked with @priv each time the walk arrives at a device other
 * than @dev itself - both when first descending into it and again each time
 * the walk comes back to it after finishing one of its children. The first
 * non-zero value returned by @fn stops the walk and is returned; otherwise
 * 0 is returned once the whole hierarchy has been visited.
 *
 * The explicit stacks hold at most MAX_NEST_DEV + 1 levels.
 */
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv)
{
        struct net_device *parents[MAX_NEST_DEV + 1];
        struct list_head *resume[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *cur_dev = dev;
        struct net_device *child;
        int depth = 0;
        int ret;

        for (;;) {
                if (cur_dev != dev) {
                        ret = fn(cur_dev, priv);
                        if (ret)
                                return ret;
                }

                child = netdev_next_lower_dev(cur_dev, &pos);
                if (child) {
                        /* Remember where to resume, then descend. */
                        parents[depth] = cur_dev;
                        resume[depth] = pos;
                        depth++;

                        cur_dev = child;
                        pos = &child->adj_list.lower;
                        continue;
                }

                /* No more children here: climb back up, or finish. */
                if (!depth)
                        return 0;

                depth--;
                cur_dev = parents[depth];
                pos = resume[depth];
        }
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);

/* Iterative depth-first walk over the devices stacked below @dev that skips
 * adjacency entries whose ->ignore flag is set.
 *
 * @fn is invoked with @priv each time the walk arrives at a device other
 * than @dev itself - both when first descending into it and again each time
 * the walk comes back to it after finishing one of its children. The first
 * non-zero value returned by @fn stops the walk and is returned; otherwise
 * 0 is returned once the walk is complete.
 *
 * The explicit stacks hold at most MAX_NEST_DEV + 1 levels.
 */
static int __netdev_walk_all_lower_dev(struct net_device *dev,
                                       int (*fn)(struct net_device *dev,
                                         struct netdev_nested_priv *priv),
                                       struct netdev_nested_priv *priv)
{
        struct net_device *parents[MAX_NEST_DEV + 1];
        struct list_head *resume[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *cur_dev = dev;
        struct net_device *child;
        int depth = 0;
        bool skip;
        int ret;

        for (;;) {
                if (cur_dev != dev) {
                        ret = fn(cur_dev, priv);
                        if (ret)
                                return ret;
                }

                /* Advance past ignored entries to the next usable child. */
                do {
                        child = __netdev_next_lower_dev(cur_dev, &pos, &skip);
                } while (child && skip);

                if (child) {
                        /* Remember where to resume, then descend. */
                        parents[depth] = cur_dev;
                        resume[depth] = pos;
                        depth++;

                        cur_dev = child;
                        pos = &child->adj_list.lower;
                        continue;
                }

                /* No more children here: climb back up, or finish. */
                if (!depth)
                        return 0;

                depth--;
                cur_dev = parents[depth];
                pos = resume[depth];
        }
}

/* RCU flavour of the lower-list stepping helper: fetch the entry following
 * *@iter with list_entry_rcu(). If it is the head of @dev's lower neighbour
 * list, return NULL and leave *@iter alone; otherwise move *@iter onto the
 * entry and return its device.
 */
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
                                             struct list_head **iter)
{
        struct list_head *head = &dev->adj_list.lower;
        struct netdev_adjacent *adj;

        adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&adj->list != head) {
                *iter = &adj->list;
                return adj->dev;
        }

        return NULL;
}
EXPORT_SYMBOL(netdev_next_lower_dev_rcu);

/* Return the largest ->upper_level found among @dev's direct upper
 * neighbours, not counting adjacency entries flagged as ignored. Returns 0
 * when there is no such neighbour.
 */
static u8 __netdev_upper_depth(struct net_device *dev)
{
        struct list_head *pos = &dev->adj_list.upper;
        struct net_device *upper;
        u8 deepest = 0;
        bool skip;

        while ((upper = __netdev_next_upper_dev(dev, &pos, &skip))) {
                if (skip)
                        continue;
                if (deepest < upper->upper_level)
                        deepest = upper->upper_level;
        }

        return deepest;
}

/* Return the largest ->lower_level found among @dev's direct lower
 * neighbours, not counting adjacency entries flagged as ignored. Returns 0
 * when there is no such neighbour.
 */
static u8 __netdev_lower_depth(struct net_device *dev)
{
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *lower;
        u8 deepest = 0;
        bool skip;

        while ((lower = __netdev_next_lower_dev(dev, &pos, &skip))) {
                if (skip)
                        continue;
                if (deepest < lower->lower_level)
                        deepest = lower->lower_level;
        }

        return deepest;
}

static int __netdev_update_upper_level(struct net_device *dev,
                                       struct netdev_nested_priv *__unused)
{
        dev->upper_level = __netdev_upper_depth(dev) + 1;
        return 0;
}

#ifdef CONFIG_LOCKDEP
static LIST_HEAD(net_unlink_list);

/* Append @dev to net_unlink_list unless its unlink_list node is already
 * linked somewhere (i.e. list_empty() on the node is false).
 */
static void net_unlink_todo(struct net_device *dev)
{
        if (!list_empty(&dev->unlink_list))
                return;

        list_add_tail(&dev->unlink_list, &net_unlink_list);
}
#endif

/* Walker callback: recompute @dev->lower_level as one more than the deepest
 * non-ignored direct lower neighbour.
 *
 * With CONFIG_LOCKDEP and a non-NULL @priv, @priv->flags additionally
 * selects:
 *   NESTED_SYNC_IMM  - set dev->nested_level to lower_level - 1 right away;
 *   NESTED_SYNC_TODO - queue @dev via net_unlink_todo().
 *
 * Always returns 0.
 */
static int __netdev_update_lower_level(struct net_device *dev,
                                       struct netdev_nested_priv *priv)
{
        dev->lower_level = __netdev_lower_depth(dev) + 1;

#ifdef CONFIG_LOCKDEP
        if (priv) {
                if (priv->flags & NESTED_SYNC_IMM)
                        dev->nested_level = dev->lower_level - 1;
                if (priv->flags & NESTED_SYNC_TODO)
                        net_unlink_todo(dev);
        }
#endif
        return 0;
}

/* Iterative depth-first walk over every device stacked below @dev, stepping
 * through the lower neighbour lists with netdev_next_lower_dev_rcu().
 *
 * @fn is invoked with @priv each time the walk arrives at a device other
 * than @dev itself - both when first descending into it and again each time
 * the walk comes back to it after finishing one of its children. The first
 * non-zero value returned by @fn stops the walk and is returned; otherwise
 * 0 is returned once the whole hierarchy has been visited.
 *
 * The explicit stacks hold at most MAX_NEST_DEV + 1 levels.
 */
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv)
{
        struct net_device *parents[MAX_NEST_DEV + 1];
        struct list_head *resume[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *cur_dev = dev;
        struct net_device *child;
        int depth = 0;
        int ret;

        for (;;) {
                if (cur_dev != dev) {
                        ret = fn(cur_dev, priv);
                        if (ret)
                                return ret;
                }

                child = netdev_next_lower_dev_rcu(cur_dev, &pos);
                if (child) {
                        /* Remember where to resume, then descend. */
                        parents[depth] = cur_dev;
                        resume[depth] = pos;
                        depth++;

                        cur_dev = child;
                        pos = &child->adj_list.lower;
                        continue;
                }

                /* No more children here: climb back up, or finish. */
                if (!depth)
                        return 0;

                depth--;
                cur_dev = parents[depth];
                pos = resume[depth];
        }
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);

/**
 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 *                                       lower neighbour list, RCU
 *                                       variant
 * @dev: device
 *
 * Returns the ->private of the first netdev_adjacent on the dev's lower
 * neighbour list, or NULL when the list is empty. The caller must hold RCU
 * read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
        struct netdev_adjacent *first;

        first = list_first_or_null_rcu(&dev->adj_list.lower,
                                       struct netdev_adjacent, list);

        return first ? first->private : NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Only the first entry of @dev's upper neighbour list is examined: its
 * device is returned when that entry is flagged as master, otherwise (or
 * when the list is empty) NULL is returned. The caller must hold the RCU
 * read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
        struct netdev_adjacent *first;

        first = list_first_or_null_rcu(&dev->adj_list.upper,
                                       struct netdev_adjacent, list);
        if (!first)
                return NULL;

        if (likely(first->master))
                return first->dev;

        return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);

/* Create a sysfs symlink under @dev's kobject that points at @adj_dev's
 * kobject. The link is named "upper_<name>" when @dev_list is @dev's upper
 * adjacency list and "lower_<name>" otherwise, <name> being @adj_dev->name.
 *
 * Returns the result of sysfs_create_link().
 */
static int netdev_adjacent_sysfs_add(struct net_device *dev,
                              struct net_device *adj_dev,
                              struct list_head *dev_list)
{
        char linkname[IFNAMSIZ+7];

        /* Bounded formatting: never write past the end of linkname. */
        snprintf(linkname, sizeof(linkname),
                 dev_list == &dev->adj_list.upper ?
                 "upper_%s" : "lower_%s", adj_dev->name);
        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
                                 linkname);
}
/* Remove the "upper_<name>" / "lower_<name>" sysfs symlink from @dev's
 * kobject. The prefix is "upper_" when @dev_list is @dev's upper adjacency
 * list and "lower_" otherwise; <name> is the @name string passed in.
 */
static void netdev_adjacent_sysfs_del(struct net_device *dev,
                               char *name,
                               struct list_head *dev_list)
{
        char linkname[IFNAMSIZ+7];

        /* Bounded formatting: never write past the end of linkname. */
        snprintf(linkname, sizeof(linkname),
                 dev_list == &dev->adj_list.upper ?
                 "upper_%s" : "lower_%s", name);
        sysfs_remove_link(&(dev->dev.kobj), linkname);
}

static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
                                                 struct net_device *adj_dev,
                                                 struct list_head *dev_list)
{
        return (dev_list == &dev->adj_list.upper ||
                dev_list == &dev->adj_list.lower) &&
                net_eq(dev_net(dev), dev_net(adj_dev));
}

/* Record @adj_dev as an adjacency of @dev on @dev_list.
 *
 * If __netdev_find_adj() already returns an entry for @adj_dev on
 * @dev_list, only that entry's ref_nr is incremented. Otherwise a new
 * netdev_adjacent is allocated, a tracked reference on @adj_dev is taken,
 * the sysfs links are created and the entry is added to the list.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
 * sysfs link creation (in which case the reference and allocation are
 * released again).
 */
static int __netdev_adjacent_dev_insert(struct net_device *dev,
                                        struct net_device *adj_dev,
                                        struct list_head *dev_list,
                                        void *private, bool master)
{
        struct netdev_adjacent *adj;
        int ret;

        adj = __netdev_find_adj(adj_dev, dev_list);

        /* Existing adjacency: just account for one more user. */
        if (adj) {
                adj->ref_nr += 1;
                pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
                         dev->name, adj_dev->name, adj->ref_nr);

                return 0;
        }

        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
        if (!adj)
                return -ENOMEM;

        adj->dev = adj_dev;
        adj->master = master;
        adj->ref_nr = 1;
        adj->private = private;
        adj->ignore = false;
        /* Paired with netdev_put() on the error path below. */
        netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);

        pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
                 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);

        /* The upper_/lower_ symlink is only created for @dev's own
         * adjacency lists and when both devices share a namespace.
         */
        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
                if (ret)
                        goto free_adj;
        }

        /* Ensure that master link is always the first item in list. */
        if (master) {
                ret = sysfs_create_link(&(dev->dev.kobj),
                                        &(adj_dev->dev.kobj), "master");
                if (ret)
                        goto remove_symlinks;

                list_add_rcu(&adj->list, dev_list);
        } else {
                list_add_tail_rcu(&adj->list, dev_list);
        }

        return 0;

        /* Unwind in reverse order of setup. */
remove_symlinks:
        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
        netdev_put(adj_dev, &adj->dev_tracker);
        kfree(adj);

        return ret;
}

/* Drop @ref_nr references from the adjacency of @adj_dev on @dev_list.
 *
 * If the entry holds more than @ref_nr references, only its counter is
 * decreased. Otherwise the sysfs links are removed, the entry is unlinked
 * from the list, the tracked reference on @adj_dev is released and the
 * entry is freed via kfree_rcu().
 *
 * A missing entry is reported with pr_err() and WARN_ON(); nothing else is
 * done in that case.
 */
static void __netdev_adjacent_dev_remove(struct net_device *dev,
                                         struct net_device *adj_dev,
                                         u16 ref_nr,
                                         struct list_head *dev_list)
{
        struct netdev_adjacent *adj;

        pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
                 dev->name, adj_dev->name, ref_nr);

        adj = __netdev_find_adj(adj_dev, dev_list);

        if (!adj) {
                pr_err("Adjacency does not exist for device %s from %s\n",
                       dev->name, adj_dev->name);
                WARN_ON(1);
                return;
        }

        /* Other users remain: only adjust the count. */
        if (adj->ref_nr > ref_nr) {
                pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
                         dev->name, adj_dev->name, ref_nr,
                         adj->ref_nr - ref_nr);
                adj->ref_nr -= ref_nr;
                return;
        }

        /* Last user gone: tear down sysfs links before unlinking. */
        if (adj->master)
                sysfs_remove_link(&(dev->dev.kobj), "master");

        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);

        list_del_rcu(&adj->list);
        pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
                 adj_dev->name, dev->name, adj_dev->name);
        netdev_put(adj_dev, &adj->dev_tracker);
        /* Entry was removed with list_del_rcu(); free it via RCU. */
        kfree_rcu(adj, rcu);
}

/* Link @dev and @upper_dev in both directions: @upper_dev is inserted into
 * @up_list on behalf of @dev (honouring @master), then @dev is inserted
 * into @down_list on behalf of @upper_dev (never as master). Both entries
 * carry @private.
 *
 * Returns 0 on success. If the second insertion fails, the first one is
 * removed again and the error is returned.
 */
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
                                            struct net_device *upper_dev,
                                            struct list_head *up_list,
                                            struct list_head *down_list,
                                            void *private, bool master)
{
        int err;

        err = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
                                           private, master);
        if (err)
                return err;

        err = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
                                           private, false);
        if (!err)
                return 0;

        /* Undo the first half so no one-sided link is left behind. */
        __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);

        return err;
}

/* Drop @ref_nr references from both halves of the @dev <-> @upper_dev
 * adjacency: first @upper_dev's entry on @up_list, then @dev's entry on
 * @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                               struct net_device *upper_dev,
                                               u16 ref_nr,
                                               struct list_head *up_list,
                                               struct list_head *down_list)
{
        /* dev -> upper_dev half */
        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);

        /* upper_dev -> dev half */
        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}

/* Link @dev and @upper_dev as direct neighbours, using @dev's upper list
 * and @upper_dev's lower list. Returns the result of
 * __netdev_adjacent_dev_link_lists().
 */
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
                                                struct net_device *upper_dev,
                                                void *private, bool master)
{
        struct list_head *up_list = &dev->adj_list.upper;
        struct list_head *down_list = &upper_dev->adj_list.lower;

        return __netdev_adjacent_dev_link_lists(dev, upper_dev, up_list,
                                                down_list, private, master);
}

/* Drop one reference from the direct-neighbour adjacency between @dev and
 * @upper_dev, using @dev's upper list and @upper_dev's lower list.
 */
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
                                                   struct net_device *upper_dev)
{
        struct list_head *up_list = &dev->adj_list.upper;
        struct list_head *down_list = &upper_dev->adj_list.lower;

        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, up_list,
                                           down_list);
}

/* Common implementation for linking @upper_dev above @dev.
 *
 * @master selects a master link; @upper_priv is stored in the adjacency
 * entries; @upper_info and @extack are handed to the notifiers; @priv is
 * forwarded to __netdev_update_lower_level(). RTNL must be held.
 *
 * Returns 0 on success or a negative errno:
 *   -EBUSY  @dev == @upper_dev, @dev is already an upper device of
 *           @upper_dev, or (master case) @dev has a different master;
 *   -EMLINK the combined nesting depth would exceed MAX_NEST_DEV;
 *   -EEXIST the requested link already exists;
 *   or whatever a notifier / the adjacency insertion reports.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
                                   struct net_device *upper_dev, bool master,
                                   void *upper_priv, void *upper_info,
                                   struct netdev_nested_priv *priv,
                                   struct netlink_ext_ack *extack)
{
        struct netdev_notifier_changeupper_info changeupper_info = {
                .info = {
                        .dev = dev,
                        .extack = extack,
                },
                .upper_dev = upper_dev,
                .master = master,
                .linking = true,
                .upper_info = upper_info,
        };
        struct net_device *master_dev;
        int ret = 0;

        ASSERT_RTNL();

        if (dev == upper_dev)
                return -EBUSY;

        /* To prevent loops, check if dev is not upper device to upper_dev. */
        if (__netdev_has_upper_dev(upper_dev, dev))
                return -EBUSY;

        /* Refuse links that would make the stack deeper than allowed. */
        if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
                return -EMLINK;

        if (!master) {
                if (__netdev_has_upper_dev(dev, upper_dev))
                        return -EEXIST;
        } else {
                master_dev = __netdev_master_upper_dev_get(dev);
                if (master_dev)
                        return master_dev == upper_dev ? -EEXIST : -EBUSY;
        }

        /* Give notifier subscribers a chance to veto before any change. */
        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
                                            &changeupper_info.info);
        ret = notifier_to_errno(ret);
        if (ret)
                return ret;

        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
                                                   master);
        if (ret)
                return ret;

        /* The link exists now; a failure here has to undo it. */
        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
                                            &changeupper_info.info);
        ret = notifier_to_errno(ret);
        if (ret)
                goto rollback;

        /* Refresh upper_level for dev and everything below it ... */
        __netdev_update_upper_level(dev, NULL);
        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);

        /* ... and lower_level for upper_dev and everything above it. */
        __netdev_update_lower_level(upper_dev, priv);
        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
                                    priv);

        return 0;

rollback:
        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

        return ret;
}

/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @extack: netlink extended ack
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
                          struct net_device *upper_dev,
                          struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
                .data = NULL,
        };

        return __netdev_upper_dev_link(dev, upper_dev, false,
                                       NULL, NULL, &priv, extack);
}
EXPORT_SYMBOL(netdev_upper_dev_link);

/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @upper_priv: upper device private
 * @upper_info: upper info to be passed down via notifier
 * @extack: netlink extended ack
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
                .data = NULL,
        };

        return __netdev_upper_dev_link(dev, upper_dev, true,
                                       upper_priv, upper_info, &priv, extack);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

/* Common implementation for removing the link between @dev and its upper
 * device @upper_dev. @priv is forwarded to __netdev_update_lower_level().
 * RTNL must be held.
 *
 * Notifier return values are not checked here.
 */
static void __netdev_upper_dev_unlink(struct net_device *dev,
                                      struct net_device *upper_dev,
                                      struct netdev_nested_priv *priv)
{
        struct netdev_notifier_changeupper_info changeupper_info = {
                .info = {
                        .dev = dev,
                },
                .upper_dev = upper_dev,
                .linking = false,
        };

        ASSERT_RTNL();

        /* Must be sampled before the adjacency is removed below. */
        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;

        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
                                      &changeupper_info.info);

        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
                                      &changeupper_info.info);

        /* Refresh upper_level for dev and everything below it ... */
        __netdev_update_upper_level(dev, NULL);
        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);

        /* ... and lower_level for upper_dev and everything above it. */
        __netdev_update_lower_level(upper_dev, priv);
        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
                                    priv);
}

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_TODO,
                .data = NULL,
        };

        __netdev_upper_dev_unlink(dev, upper_dev, &priv);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);

/* Set the ->ignore flag to @val on both halves of the adjacency between
 * @upper_dev and @lower_dev: the entry for @lower_dev on @upper_dev's lower
 * list and the entry for @upper_dev on @lower_dev's upper list. A half that
 * is not found is silently skipped.
 */
static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
                                      struct net_device *lower_dev,
                                      bool val)
{
        struct netdev_adjacent *down_entry;
        struct netdev_adjacent *up_entry;

        down_entry = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
        if (down_entry)
                down_entry->ignore = val;

        up_entry = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
        if (up_entry)
                up_entry->ignore = val;
}

static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
                                        struct net_device *lower_dev)
{
        __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
}

static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
                                       struct net_device *lower_dev)
{
        __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
}

int netdev_adjacent_change_prepare(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev,
                                   struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .flags = 0,
                .data = NULL,
        };
        int err;

        if (!new_dev)
                return 0;

        if (old_dev && new_dev != old_dev)
                netdev_adjacent_dev_disable(dev, old_dev);
        err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
                                      extack);
        if (err) {
                if (old_dev && new_dev != old_dev)
                        netdev_adjacent_dev_enable(dev, old_dev);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL(netdev_adjacent_change_prepare);

void netdev_adjacent_change_commit(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
                .data = NULL,
        };

        if (!new_dev || !old_dev)
                return;

        if (new_dev == old_dev)
                return;

        netdev_adjacent_dev_enable(dev, old_dev);
        __netdev_upper_dev_unlink(old_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_commit);

void netdev_adjacent_change_abort(struct net_device *old_dev,
                                  struct net_device *new_dev,
                                  struct net_device *dev)
{
        struct netdev_nested_priv priv = {
                .flags = 0,
                .data = NULL,
        };

        if (!new_dev)
                return;

        if (old_dev && new_dev != old_dev)
                netdev_adjacent_dev_enable(dev, old_dev);

        __netdev_upper_dev_unlink(new_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_abort);

/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
                                struct netdev_bonding_info *bonding_info)
{
        struct netdev_notifier_bonding_info info = {
                .info.dev = dev,
        };

        memcpy(&info.bonding_info, bonding_info,
               sizeof(struct netdev_bonding_info));
        call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
                                      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);

static int netdev_offload_xstats_enable_l3(struct net_device *dev,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_info info = {
                .info.dev = dev,
                .info.extack = extack,
                .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
        };
        int err;
        int rc;

        dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
                                         GFP_KERNEL);
        if (!dev->offload_xstats_l3)
                return -ENOMEM;

        rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
                                                  NETDEV_OFFLOAD_XSTATS_DISABLE,
                                                  &info.info);
        err = notifier_to_errno(rc);
        if (err)
                goto free_stats;

        return 0;

free_stats:
        kfree(dev->offload_xstats_l3);
        dev->offload_xstats_l3 = NULL;
        return err;
}

/* Enable offload xstats of the given @type on @dev. RTNL must be held.
 *
 * Returns -EALREADY if that type is already enabled, the result of the
 * type-specific enable helper otherwise, or -EINVAL (with a warning) for a
 * type the switch below does not handle.
 */
int netdev_offload_xstats_enable(struct net_device *dev,
                                 enum netdev_offload_xstats_type type,
                                 struct netlink_ext_ack *extack)
{
        int err;

        ASSERT_RTNL();

        if (netdev_offload_xstats_enabled(dev, type))
                return -EALREADY;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                err = netdev_offload_xstats_enable_l3(dev, extack);
                return err;
        }

        /* Only reached for a type without a case above. */
        WARN_ON(1);
        return -EINVAL;
}
EXPORT_SYMBOL(netdev_offload_xstats_enable);

static void netdev_offload_xstats_disable_l3(struct net_device *dev)
{
        struct netdev_notifier_offload_xstats_info info = {
                .info.dev = dev,
                .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
        };

        call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
                                      &info.info);
        kfree(dev->offload_xstats_l3);
        dev->offload_xstats_l3 = NULL;
}

int netdev_offload_xstats_disable(struct net_device *dev,
                                  enum netdev_offload_xstats_type type)
{
        ASSERT_RTNL();

        if (!netdev_offload_xstats_enabled(dev, type))
                return -EALREADY;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                netdev_offload_xstats_disable_l3(dev);
                return 0;
        }

        WARN_ON(1);
        return -EINVAL;
}
EXPORT_SYMBOL(netdev_offload_xstats_disable);

static void netdev_offload_xstats_disable_all(struct net_device *dev)
{
        netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
}

/* Map an offload xstats @type to the stats storage pointer kept on @dev.
 * Returns NULL (with a warning) for a type the switch below does not
 * handle; the returned pointer itself may also be NULL.
 */
static struct rtnl_hw_stats64 *
netdev_offload_xstats_get_ptr(const struct net_device *dev,
                              enum netdev_offload_xstats_type type)
{
        struct rtnl_hw_stats64 *stats;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                stats = dev->offload_xstats_l3;
                return stats;
        }

        /* Only reached for a type without a case above. */
        WARN_ON(1);
        return NULL;
}

bool netdev_offload_xstats_enabled(const struct net_device *dev,
                                   enum netdev_offload_xstats_type type)
{
        ASSERT_RTNL();

        return netdev_offload_xstats_get_ptr(dev, type);
}
EXPORT_SYMBOL(netdev_offload_xstats_enabled);

/* "Report used" payload: netdev_offload_xstats_get_used() points the
 * .report_used field of the notifier info at one of these when sending
 * NETDEV_OFFLOAD_XSTATS_REPORT_USED.
 */
struct netdev_notifier_offload_xstats_ru {
        /* Copied to the caller's *p_used after the notifier call. */
        bool used;
};

/* "Report delta" payload: netdev_offload_xstats_get_stats() points the
 * .report_delta field of the notifier info at one of these when sending
 * NETDEV_OFFLOAD_XSTATS_REPORT_DELTA.
 */
struct netdev_notifier_offload_xstats_rd {
        /* Added onto the device's cached stats after the notifier call. */
        struct rtnl_hw_stats64 stats;
        /* Copied to the caller's *p_used after the notifier call. */
        bool used;
};

/* Accumulate every counter of @src onto the matching counter of @dest. */
static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
                                  const struct rtnl_hw_stats64 *src)
{
        /* Receive side */
        dest->rx_packets += src->rx_packets;
        dest->rx_bytes += src->rx_bytes;
        dest->rx_errors += src->rx_errors;
        dest->rx_dropped += src->rx_dropped;

        /* Transmit side */
        dest->tx_packets += src->tx_packets;
        dest->tx_bytes += src->tx_bytes;
        dest->tx_errors += src->tx_errors;
        dest->tx_dropped += src->tx_dropped;

        dest->multicast += src->multicast;
}

/* Send NETDEV_OFFLOAD_XSTATS_REPORT_USED for @dev / @type and store the
 * resulting "used" flag in *@p_used. Warns if the stats type is not enabled
 * on @dev. Returns the errno derived from the notifier result; *@p_used is
 * written regardless of that result.
 */
static int netdev_offload_xstats_get_used(struct net_device *dev,
                                          enum netdev_offload_xstats_type type,
                                          bool *p_used,
                                          struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_ru usage = {};
        struct netdev_notifier_offload_xstats_info notif = {
                .info.dev = dev,
                .info.extack = extack,
                .type = type,
                .report_used = &usage,
        };
        int notifier_rc;

        WARN_ON(!netdev_offload_xstats_enabled(dev, type));

        notifier_rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
                                                    &notif.info);
        *p_used = usage.used;

        return notifier_to_errno(notifier_rc);
}

/* Fire NETDEV_OFFLOAD_XSTATS_REPORT_DELTA for @dev/@type, fold the
 * reported delta into the stats stored on the device, and copy the
 * resulting totals to @p_stats (when non-NULL) and the "used" flag to
 * @p_used.
 *
 * Returns -EINVAL (with a warning) when no stats storage exists for
 * @type, otherwise the notifier result converted to an errno.
 */
static int netdev_offload_xstats_get_stats(struct net_device *dev,
                                           enum netdev_offload_xstats_type type,
                                           struct rtnl_hw_stats64 *p_stats,
                                           bool *p_used,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_rd delta = {};
        struct netdev_notifier_offload_xstats_info info = {
                .info.dev = dev,
                .info.extack = extack,
                .type = type,
                .report_delta = &delta,
        };
        struct rtnl_hw_stats64 *cached;
        int notifier_rc;

        cached = netdev_offload_xstats_get_ptr(dev, type);
        if (WARN_ON(!cached))
                return -EINVAL;

        notifier_rc =
                call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
                                              &info.info);

        /* The delta is accumulated even when a notifier failed, so that
         * the portions that were retrieved successfully are not lost.
         */
        netdev_hw_stats64_add(cached, &delta.stats);

        if (p_stats)
                *p_stats = *cached;
        *p_used = delta.used;

        return notifier_to_errno(notifier_rc);
}

/* Entry point for querying offload xstats of @type on @dev.
 *
 * With a non-NULL @p_stats the statistics are fetched (and @p_used filled
 * in); with a NULL @p_stats only the "used" state is queried.
 * RTNL must be held (asserted).
 */
int netdev_offload_xstats_get(struct net_device *dev,
                              enum netdev_offload_xstats_type type,
                              struct rtnl_hw_stats64 *p_stats, bool *p_used,
                              struct netlink_ext_ack *extack)
{
        ASSERT_RTNL();

        if (!p_stats)
                return netdev_offload_xstats_get_used(dev, type, p_used,
                                                      extack);

        return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used,
                                               extack);
}
EXPORT_SYMBOL(netdev_offload_xstats_get);

/* Add @stats into the delta accumulated in @report_delta and flag the
 * report as used.
 */
void
netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
                                   const struct rtnl_hw_stats64 *stats)
{
        netdev_hw_stats64_add(&report_delta->stats, stats);
        report_delta->used = true;
}
EXPORT_SYMBOL(netdev_offload_xstats_report_delta);

/* Flag @report_used as used. */
void
netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
{
        report_used->used = true;
}
EXPORT_SYMBOL(netdev_offload_xstats_report_used);

/* Add the delta in @p_stats to the stats of @type stored on @dev.
 *
 * Warns and does nothing when no stats storage exists for @type.
 * RTNL must be held (asserted).
 */
void netdev_offload_xstats_push_delta(struct net_device *dev,
                                      enum netdev_offload_xstats_type type,
                                      const struct rtnl_hw_stats64 *p_stats)
{
        struct rtnl_hw_stats64 *cached;

        ASSERT_RTNL();

        cached = netdev_offload_xstats_get_ptr(dev, type);
        if (!WARN_ON(!cached))
                netdev_hw_stats64_add(cached, p_stats);
}
EXPORT_SYMBOL(netdev_offload_xstats_push_delta);

/**
 * netdev_get_xmit_slave - Get the xmit slave of master device
 * @dev: device
 * @skb: The packet
 * @all_slaves: assume all the slaves are active
 *
 * Forwards the query to the device's ndo_get_xmit_slave callback.
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 * %NULL is returned if the device has no such callback or if no slave
 * is found.
 */
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
                                         bool all_slaves)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (ops->ndo_get_xmit_slave)
                return ops->ndo_get_xmit_slave(dev, skb, all_slaves);

        return NULL;
}
EXPORT_SYMBOL(netdev_get_xmit_slave);

/* Ask @dev's ndo_sk_get_lower_dev callback for the lower device used by
 * @sk; NULL when the callback is not implemented.
 */
static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
                                                  struct sock *sk)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (ops->ndo_sk_get_lower_dev)
                return ops->ndo_sk_get_lower_dev(dev, sk);

        return NULL;
}

/**
 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
 * @dev: device
 * @sk: the socket
 *
 * Follows netdev_sk_get_lower_dev() repeatedly until it yields %NULL.
 *
 * Return: the last device reached; @dev itself if it has no lower device
 * for @sk.
 */
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
                                            struct sock *sk)
{
        struct net_device *next;

        for (next = netdev_sk_get_lower_dev(dev, sk); next;
             next = netdev_sk_get_lower_dev(dev, sk))
                dev = next;

        return dev;
}
EXPORT_SYMBOL(netdev_sk_get_lowest_dev);

/* For each upper and lower neighbour of @dev that is in the same network
 * namespace, call netdev_adjacent_sysfs_add() in both directions.
 */
static void netdev_adjacent_add_links(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                struct net_device *upper = adj->dev;

                if (net_eq(net, dev_net(upper))) {
                        netdev_adjacent_sysfs_add(upper, dev,
                                                  &upper->adj_list.lower);
                        netdev_adjacent_sysfs_add(dev, upper,
                                                  &dev->adj_list.upper);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                struct net_device *lower = adj->dev;

                if (net_eq(net, dev_net(lower))) {
                        netdev_adjacent_sysfs_add(lower, dev,
                                                  &lower->adj_list.upper);
                        netdev_adjacent_sysfs_add(dev, lower,
                                                  &dev->adj_list.lower);
                }
        }
}

/* For each upper and lower neighbour of @dev that is in the same network
 * namespace, call netdev_adjacent_sysfs_del() in both directions.
 */
static void netdev_adjacent_del_links(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                struct net_device *upper = adj->dev;

                if (net_eq(net, dev_net(upper))) {
                        netdev_adjacent_sysfs_del(upper, dev->name,
                                                  &upper->adj_list.lower);
                        netdev_adjacent_sysfs_del(dev, upper->name,
                                                  &dev->adj_list.upper);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                struct net_device *lower = adj->dev;

                if (net_eq(net, dev_net(lower))) {
                        netdev_adjacent_sysfs_del(lower, dev->name,
                                                  &lower->adj_list.upper);
                        netdev_adjacent_sysfs_del(dev, lower->name,
                                                  &dev->adj_list.lower);
                }
        }
}

/* On each same-namespace neighbour of @dev, replace the entry filed
 * under @oldname with one for @dev: netdev_adjacent_sysfs_del() with
 * @oldname followed by netdev_adjacent_sysfs_add() with @dev.
 */
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
        struct net *net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                struct net_device *upper = adj->dev;

                if (net_eq(net, dev_net(upper))) {
                        netdev_adjacent_sysfs_del(upper, oldname,
                                                  &upper->adj_list.lower);
                        netdev_adjacent_sysfs_add(upper, dev,
                                                  &upper->adj_list.lower);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                struct net_device *lower = adj->dev;

                if (net_eq(net, dev_net(lower))) {
                        netdev_adjacent_sysfs_del(lower, oldname,
                                                  &lower->adj_list.upper);
                        netdev_adjacent_sysfs_add(lower, dev,
                                                  &lower->adj_list.upper);
                }
        }
}

/* Return the private pointer stored on the adjacency entry that links
 * @lower_dev into @dev's lower list, or NULL when @lower_dev is NULL or
 * no such entry is found.
 */
void *netdev_lower_dev_get_private(struct net_device *dev,
                                   struct net_device *lower_dev)
{
        struct netdev_adjacent *adj;

        if (!lower_dev)
                return NULL;

        adj = __netdev_find_adj(lower_dev, &dev->adj_list.lower);

        return adj ? adj->private : NULL;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);


/**
 * netdev_lower_state_changed - Dispatch event about lower device state change
 * @lower_dev: device
 * @lower_state_info: state to dispatch
 *
 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_lower_state_changed(struct net_device *lower_dev,
                                void *lower_state_info)
{
        struct netdev_notifier_changelowerstate_info changelowerstate_info = {
                .info.dev = lower_dev,
        };

        ASSERT_RTNL();
        changelowerstate_info.lower_state_info = lower_state_info;
        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
                                      &changelowerstate_info.info);
}
EXPORT_SYMBOL(netdev_lower_state_changed);

/* Pass @flags to the driver's ndo_change_rx_flags callback, if any. */
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_change_rx_flags)
                return;

        ops->ndo_change_rx_flags(dev, flags);
}

/* Adjust the promiscuity counter of @dev by @inc and keep IFF_PROMISC in
 * dev->flags in step with it: the flag is set while the counter is
 * non-zero and cleared when it drops to zero.
 *
 * When the flag actually changes, the transition is logged, reported to
 * the audit subsystem (if enabled) and passed to the driver through
 * dev_change_rx_flags(). When @notify is true, __dev_notify_flags() is
 * called at the end with IFF_PROMISC as the changed global flag.
 *
 * Returns 0 on success, or -EOVERFLOW when a positive @inc wraps the
 * counter around to zero; in that case neither the counter nor the flags
 * are modified. RTNL must be held (asserted).
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
        unsigned int old_flags = dev->flags;
        unsigned int promiscuity, flags;
        kuid_t uid;
        kgid_t gid;

        ASSERT_RTNL();

        promiscuity = dev->promiscuity + inc;
        if (promiscuity == 0) {
                /*
                 * Avoid overflow.
                 * If inc causes overflow, untouch promisc and return error.
                 */
                if (unlikely(inc > 0)) {
                        netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
                        return -EOVERFLOW;
                }
                /* Counter legitimately reached zero: leave promiscuous mode. */
                flags = old_flags & ~IFF_PROMISC;
        } else {
                flags = old_flags | IFF_PROMISC;
        }
        WRITE_ONCE(dev->promiscuity, promiscuity);
        if (flags != old_flags) {
                WRITE_ONCE(dev->flags, flags);
                netdev_info(dev, "%s promiscuous mode\n",
                            dev->flags & IFF_PROMISC ? "entered" : "left");
                if (audit_enabled) {
                        current_uid_gid(&uid, &gid);
                        audit_log(audit_context(), GFP_ATOMIC,
                                  AUDIT_ANOM_PROMISCUOUS,
                                  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
                                  dev->name, (dev->flags & IFF_PROMISC),
                                  (old_flags & IFF_PROMISC),
                                  from_kuid(&init_user_ns, audit_get_loginuid(current)),
                                  from_kuid(&init_user_ns, uid),
                                  from_kgid(&init_user_ns, gid),
                                  audit_get_sessionid(current));
                }

                /* Let the driver react to the IFF_PROMISC transition. */
                dev_change_rx_flags(dev, IFF_PROMISC);
        }
        if (notify) {
                /* The ops lock is only required to ensure consistent locking
                 * for `NETDEV_CHANGE` notifiers. This function is sometimes
                 * called without the lock, even for devices that are ops
                 * locked, such as in `dev_uc_sync_multiple` when using
                 * bonding or teaming.
                 */
                netdev_ops_assert_locked(dev);
                __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
        }
        return 0;
}

/* Adjust @dev's promiscuity count by @inc with notification enabled, and
 * refresh the RX mode if dev->flags changed as a result.
 *
 * Returns the (negative) error from __dev_set_promiscuity(), else its
 * non-negative result.
 */
int netif_set_promiscuity(struct net_device *dev, int inc)
{
        unsigned int flags_before = dev->flags;
        int rc;

        rc = __dev_set_promiscuity(dev, inc, true);
        if (rc < 0)
                return rc;

        if (flags_before != dev->flags)
                dev_set_rx_mode(dev);

        return rc;
}

/* Adjust the allmulti counter of @dev by @inc and keep IFF_ALLMULTI in
 * dev->flags in step with it: the flag is set while the counter is
 * non-zero and cleared when it drops to zero.
 *
 * When the flag actually changes, the transition is logged, the driver is
 * told through dev_change_rx_flags(), the RX mode is refreshed and, if
 * @notify is true, __dev_notify_flags() is called.
 *
 * Returns 0 on success, or -EOVERFLOW when a positive @inc wraps the
 * counter around to zero; in that case neither the counter nor the flags
 * are modified. RTNL must be held (asserted).
 */
int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
{
        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
        unsigned int allmulti, flags;

        ASSERT_RTNL();

        allmulti = dev->allmulti + inc;
        if (allmulti == 0) {
                /*
                 * Avoid overflow.
                 * If inc causes overflow, untouch allmulti and return error.
                 */
                if (unlikely(inc > 0)) {
                        netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
                        return -EOVERFLOW;
                }
                /* Counter legitimately reached zero: leave allmulticast mode. */
                flags = old_flags & ~IFF_ALLMULTI;
        } else {
                flags = old_flags | IFF_ALLMULTI;
        }
        WRITE_ONCE(dev->allmulti, allmulti);
        if (flags != old_flags) {
                WRITE_ONCE(dev->flags, flags);
                netdev_info(dev, "%s allmulticast mode\n",
                            dev->flags & IFF_ALLMULTI ? "entered" : "left");
                dev_change_rx_flags(dev, IFF_ALLMULTI);
                dev_set_rx_mode(dev);
                /* gchanges is whatever differs in gflags since entry. */
                if (notify)
                        __dev_notify_flags(dev, old_flags,
                                           dev->gflags ^ old_gflags, 0, NULL);
        }
        return 0;
}

/*
 *        Upload unicast and multicast address lists to device and
 *        configure RX filtering. When the device doesn't support unicast
 *        filtering (IFF_UNICAST_FLT not set) it is put in promiscuous mode
 *        while unicast addresses are present, and taken out of it again
 *        once the unicast list is empty.
 *
 *        Nothing is done while the device is down or not present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        /* dev_open will call this function so the list will stay sane. */
        if (!(dev->flags & IFF_UP) || !netif_device_present(dev))
                return;

        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
                /* Unicast addresses changes may only happen under the rtnl,
                 * therefore calling __dev_set_promiscuity here is safe.
                 */
                if (!dev->uc_promisc) {
                        if (!netdev_uc_empty(dev)) {
                                __dev_set_promiscuity(dev, 1, false);
                                dev->uc_promisc = true;
                        }
                } else if (netdev_uc_empty(dev)) {
                        __dev_set_promiscuity(dev, -1, false);
                        dev->uc_promisc = false;
                }
        }

        if (!ops->ndo_set_rx_mode)
                return;

        ops->ndo_set_rx_mode(dev);
}

/* Run __dev_set_rx_mode() on @dev between netif_addr_lock_bh() and
 * netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}

/**
 * netif_get_flags() - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 * IFF_PROMISC and IFF_ALLMULTI are taken from dev->gflags rather than
 * dev->flags; IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are derived from
 * the current device state and only reported while the device is running.
 *
 * Return: the assembled flag word.
 */
unsigned int netif_get_flags(const struct net_device *dev)
{
        const unsigned int recomputed = IFF_PROMISC | IFF_ALLMULTI |
                                        IFF_RUNNING | IFF_LOWER_UP |
                                        IFF_DORMANT;
        const unsigned int from_gflags = IFF_PROMISC | IFF_ALLMULTI;
        unsigned int flags;

        flags = READ_ONCE(dev->flags) & ~recomputed;
        flags |= READ_ONCE(dev->gflags) & from_gflags;

        if (!netif_running(dev))
                return flags;

        if (netif_oper_up(dev))
                flags |= IFF_RUNNING;
        if (netif_carrier_ok(dev))
                flags |= IFF_LOWER_UP;
        if (netif_dormant(dev))
                flags |= IFF_DORMANT;

        return flags;
}
EXPORT_SYMBOL(netif_get_flags);

/* Apply the requested flag word @flags to @dev without sending
 * notifications.
 *
 * Steps, in order: copy the directly settable bits into dev->flags,
 * refresh the RX configuration, open or close the device if IFF_UP is
 * being toggled, then synchronise IFF_PROMISC and IFF_ALLMULTI through
 * dev->gflags and the promiscuity/allmulti counters.
 *
 * Returns the result of __dev_open() when the device is being brought
 * up, 0 otherwise. RTNL must be held (asserted).
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack)
{
        unsigned int old_flags = dev->flags;
        int ret;

        ASSERT_RTNL();

        /*
         *        Set the flags on our device.
         *        Only the bits in the first mask are taken from the
         *        request; the bits in the second mask keep their
         *        current value.
         */

        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
                               IFF_AUTOMEDIA)) |
                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
                                    IFF_ALLMULTI));

        /*
         *        Load in the correct multicast list now the flags have changed.
         */

        if ((old_flags ^ flags) & IFF_MULTICAST)
                dev_change_rx_flags(dev, IFF_MULTICAST);

        dev_set_rx_mode(dev);

        /*
         *        Have we downed the interface. We handle IFF_UP ourselves
         *        according to user attempts to set it, rather than blindly
         *        setting it.
         */

        ret = 0;
        if ((old_flags ^ flags) & IFF_UP) {
                if (old_flags & IFF_UP)
                        __dev_close(dev);
                else
                        ret = __dev_open(dev, extack);
        }

        /* The request is compared against gflags, not against dev->flags. */
        if ((flags ^ dev->gflags) & IFF_PROMISC) {
                int inc = (flags & IFF_PROMISC) ? 1 : -1;
                old_flags = dev->flags;

                dev->gflags ^= IFF_PROMISC;

                if (__dev_set_promiscuity(dev, inc, false) >= 0)
                        if (dev->flags != old_flags)
                                dev_set_rx_mode(dev);
        }

        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
         * is important. Some (broken) drivers set IFF_PROMISC, when
         * IFF_ALLMULTI is requested not asking us and not reporting.
         */
        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

                dev->gflags ^= IFF_ALLMULTI;
                netif_set_allmulti(dev, inc, false);
        }

        return ret;
}

/* Send the notifications that follow a flag change on @dev.
 *
 * @old_flags is the value of dev->flags before the change; @gchanges is
 * the set of changed bits to announce via an RTM_NEWLINK message (no
 * message is sent when it is zero). NETDEV_UP/NETDEV_DOWN is raised when
 * IFF_UP toggled, and NETDEV_CHANGE is raised when the device is up and
 * any bit other than IFF_UP, IFF_PROMISC, IFF_ALLMULTI or IFF_VOLATILE
 * changed.
 */
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
                        unsigned int gchanges, u32 portid,
                        const struct nlmsghdr *nlh)
{
        const unsigned int no_change_event = IFF_UP | IFF_PROMISC |
                                             IFF_ALLMULTI | IFF_VOLATILE;
        unsigned int changes = dev->flags ^ old_flags;

        if (gchanges)
                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);

        if (changes & IFF_UP)
                call_netdevice_notifiers(dev->flags & IFF_UP ?
                                         NETDEV_UP : NETDEV_DOWN, dev);

        if ((dev->flags & IFF_UP) && (changes & ~no_change_event)) {
                struct netdev_notifier_change_info change_info = {
                        .info.dev = dev,
                        .flags_changed = changes,
                };

                call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
        }
}

/* Apply @flags to @dev via __dev_change_flags() and, unless that failed,
 * send notifications for every bit that changed in dev->flags or
 * dev->gflags.
 *
 * Returns the result of __dev_change_flags().
 */
int netif_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack)
{
        unsigned int flags_before = dev->flags;
        unsigned int gflags_before = dev->gflags;
        unsigned int changed;
        int rc;

        rc = __dev_change_flags(dev, flags, extack);
        if (rc < 0)
                return rc;

        changed = (flags_before ^ dev->flags) | (gflags_before ^ dev->gflags);
        __dev_notify_flags(dev, flags_before, changed, 0, NULL);

        return rc;
}

/* Set the MTU of @dev to @new_mtu without validation or notification.
 *
 * A driver that provides ndo_change_mtu handles the change itself and its
 * return value is passed on; otherwise dev->mtu is updated here and 0 is
 * returned.
 */
int __netif_set_mtu(struct net_device *dev, int new_mtu)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_change_mtu) {
                /* Pairs with all the lockless reads of dev->mtu in the stack */
                WRITE_ONCE(dev->mtu, new_mtu);
                return 0;
        }

        return ops->ndo_change_mtu(dev, new_mtu);
}
EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL");

int dev_validate_mtu(struct net_device *dev, int new_mtu,
                     struct netlink_ext_ack *extack)
{
        /* MTU must be positive, and in range */
        if (new_mtu < 0 || new_mtu < dev->min_mtu) {
                NL_SET_ERR_MSG(extack, "mtu less than device minimum");
                return -EINVAL;
        }

        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
                NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
                return -EINVAL;
        }
        return 0;
}

/**
 * netif_set_mtu_ext() - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 * @extack: netlink extended ack
 *
 * Change the maximum transfer size of the network device. The new value
 * is validated, NETDEV_PRECHANGEMTU listeners may veto it, and
 * NETDEV_CHANGEMTU is sent once it has been applied. If a
 * NETDEV_CHANGEMTU listener fails, the old MTU is restored and the
 * listeners are notified again.
 *
 * Return: 0 on success, -errno on failure.
 */
int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
                      struct netlink_ext_ack *extack)
{
        int err, orig_mtu;

        netdev_ops_assert_locked(dev);

        if (new_mtu == dev->mtu)
                return 0;

        err = dev_validate_mtu(dev, new_mtu, extack);
        if (err)
                return err;

        if (!netif_device_present(dev))
                return -ENODEV;

        err = notifier_to_errno(call_netdevice_notifiers(NETDEV_PRECHANGEMTU,
                                                         dev));
        if (err)
                return err;

        orig_mtu = dev->mtu;
        err = __netif_set_mtu(dev, new_mtu);
        if (err)
                return err;

        err = notifier_to_errno(call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU,
                                                             dev, orig_mtu));
        if (!err)
                return 0;

        /* setting mtu back and notifying everyone again,
         * so that they have a chance to revert changes.
         */
        __netif_set_mtu(dev, orig_mtu);
        call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu);

        return err;
}

/* Change the MTU of @dev to @new_mtu using a local extack; on failure
 * any message left in it is logged (rate limited) with the device name.
 *
 * Returns the result of netif_set_mtu_ext().
 */
int netif_set_mtu(struct net_device *dev, int new_mtu)
{
        struct netlink_ext_ack extack;
        int rc;

        memset(&extack, 0, sizeof(extack));

        rc = netif_set_mtu_ext(dev, new_mtu, &extack);
        if (!rc || !extack._msg)
                return rc;

        net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
        return rc;
}
EXPORT_SYMBOL(netif_set_mtu);

/* Change dev->tx_queue_len to @new_len.
 *
 * Returns -ERANGE when @new_len does not fit in an unsigned int and 0
 * when the length is unchanged or was changed successfully. If a
 * NETDEV_CHANGE_TX_QUEUE_LEN listener or dev_qdisc_change_tx_queue_len()
 * fails, the previous length is restored and that error is returned.
 */
int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
        unsigned int orig_len = dev->tx_queue_len;
        int res;

        if (new_len != (unsigned int)new_len)
                return -ERANGE;

        if (new_len == orig_len)
                return 0;

        WRITE_ONCE(dev->tx_queue_len, new_len);

        res = notifier_to_errno(call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN,
                                                         dev));
        if (!res)
                res = dev_qdisc_change_tx_queue_len(dev);
        if (!res)
                return 0;

        /* Somebody objected: put the old length back. */
        netdev_err(dev, "refused to change device tx_queue_len\n");
        WRITE_ONCE(dev->tx_queue_len, orig_len);
        return res;
}

/* Store @new_group in dev->group. */
void netif_set_group(struct net_device *dev, int new_group)
{
        dev->group = new_group;
}

/**
 * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR.
 * @dev: device
 * @addr: new address
 * @extack: netlink extended ack
 *
 * Return: 0 on success, -errno on failure.
 */
int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
                                struct netlink_ext_ack *extack)
{
        struct netdev_notifier_pre_changeaddr_info info = {
                .info.dev = dev,
                .info.extack = extack,
                .dev_addr = addr,
        };
        int rc;

        rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
        return notifier_to_errno(rc);
}
EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL");

/* Set the hardware address of @dev to the one carried in @ss.
 *
 * Returns -EOPNOTSUPP when the driver has no ndo_set_mac_address,
 * -EINVAL when ss->ss_family differs from dev->type, -ENODEV when the
 * device is not present, or the error from the NETDEV_PRE_CHANGEADDR
 * notifiers or from the driver. On success the address is marked
 * NET_ADDR_SET, NETDEV_CHANGEADDR is sent and 0 is returned.
 */
int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
                          struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int err;

        if (!ops->ndo_set_mac_address)
                return -EOPNOTSUPP;

        if (ss->ss_family != dev->type)
                return -EINVAL;

        if (!netif_device_present(dev))
                return -ENODEV;

        err = netif_pre_changeaddr_notify(dev, ss->__data, extack);
        if (err)
                return err;

        /* The driver is only called when the address actually differs. */
        if (memcmp(dev->dev_addr, ss->__data, dev->addr_len) != 0) {
                err = ops->ndo_set_mac_address(dev, ss);
                if (err)
                        return err;
        }

        dev->addr_assign_type = NET_ADDR_SET;
        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
        add_device_randomness(dev->dev_addr, dev->addr_len);

        return 0;
}

/* Taken for reading by netif_get_mac_address() while it copies a
 * device's hardware address.
 */
DECLARE_RWSEM(dev_addr_sem);

/* Copy the hardware address of the device named @dev_name in @net into
 * @sa and set sa->sa_family to the device type.
 *
 * "sa" is a true struct sockaddr with limited "sa_data" member: at most
 * sizeof(sa->sa_data_min) bytes are written, and that many bytes are
 * zeroed when the device has no address.
 *
 * Returns 0 on success or -ENODEV when no such device exists.
 */
int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
        size_t size = sizeof(sa->sa_data_min);
        struct net_device *dev;
        int ret = 0;

        down_read(&dev_addr_sem);
        rcu_read_lock();

        dev = dev_get_by_name_rcu(net, dev_name);
        if (dev) {
                if (dev->addr_len)
                        memcpy(sa->sa_data, dev->dev_addr,
                               min_t(size_t, size, dev->addr_len));
                else
                        memset(sa->sa_data, 0, size);
                sa->sa_family = dev->type;
        } else {
                ret = -ENODEV;
        }

        rcu_read_unlock();
        up_read(&dev_addr_sem);
        return ret;
}
EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL");

/* Ask the driver of @dev to change the carrier to @new_carrier.
 *
 * Returns -EOPNOTSUPP when the driver has no ndo_change_carrier,
 * -ENODEV when the device is not present, otherwise the driver's result.
 */
int netif_change_carrier(struct net_device *dev, bool new_carrier)
{
        const struct net_device_ops *nd_ops = dev->netdev_ops;

        if (!nd_ops->ndo_change_carrier)
                return -EOPNOTSUPP;

        if (!netif_device_present(dev))
                return -ENODEV;

        return nd_ops->ndo_change_carrier(dev, new_carrier);
}

/**
 *        dev_get_phys_port_id - Get device physical port ID
 *        @dev: device
 *        @ppid: port ID
 *
 *        Get device physical port ID from the driver's
 *        ndo_get_phys_port_id callback. Returns -EOPNOTSUPP when the
 *        driver does not provide one, otherwise the callback's result.
 */
int dev_get_phys_port_id(struct net_device *dev,
                         struct netdev_phys_item_id *ppid)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (ops->ndo_get_phys_port_id)
                return ops->ndo_get_phys_port_id(dev, ppid);

        return -EOPNOTSUPP;
}

/**
 *        dev_get_phys_port_name - Get device physical port name
 *        @dev: device
 *        @name: port name
 *        @len: limit of bytes to copy to name
 *
 *        Get device physical port name. The driver's
 *        ndo_get_phys_port_name callback is tried first; when it is
 *        missing or returns -EOPNOTSUPP, the result of
 *        devlink_compat_phys_port_name_get() is returned instead.
 */
int dev_get_phys_port_name(struct net_device *dev,
                           char *name, size_t len)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int err = -EOPNOTSUPP;

        if (ops->ndo_get_phys_port_name)
                err = ops->ndo_get_phys_port_name(dev, name, len);

        if (err != -EOPNOTSUPP)
                return err;

        return devlink_compat_phys_port_name_get(dev, name, len);
}

/**
 * netif_get_port_parent_id() - Get the device's port parent identifier
 * @dev: network device
 * @ppid: pointer to a storage for the port's parent identifier
 * @recurse: allow/disallow recursion to lower devices
 *
 * Get the devices's port parent identifier. The driver callback is tried
 * first, then devlink_compat_switch_id_get(). If both report -EOPNOTSUPP
 * and @recurse is set, the lower devices are queried recursively and must
 * all report the same identifier.
 *
 * Return: 0 on success, -errno on failure.
 */
int netif_get_port_parent_id(struct net_device *dev,
                             struct netdev_phys_item_id *ppid, bool recurse)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        struct netdev_phys_item_id first = { };
        struct net_device *lower_dev;
        struct list_head *iter;
        int err = -EOPNOTSUPP;

        if (ops->ndo_get_port_parent_id)
                err = ops->ndo_get_port_parent_id(dev, ppid);
        if (err != -EOPNOTSUPP)
                return err;

        err = devlink_compat_switch_id_get(dev, ppid);
        if (!recurse || err != -EOPNOTSUPP)
                return err;

        netdev_for_each_lower_dev(dev, lower_dev, iter) {
                err = netif_get_port_parent_id(lower_dev, ppid, true);
                if (err)
                        break;

                if (first.id_len) {
                        /* Lower devices disagree on the parent id. */
                        if (memcmp(&first, ppid, sizeof(*ppid)))
                                return -EOPNOTSUPP;
                } else {
                        first = *ppid;
                }
        }

        return err;
}
EXPORT_SYMBOL(netif_get_port_parent_id);

/**
 *        netdev_port_same_parent_id - Indicate if two network devices have
 *        the same port parent identifier
 *        @a: first network device
 *        @b: second network device
 *
 *        Returns false when the identifier of either device cannot be
 *        obtained (lookup recurses into lower devices), otherwise the
 *        result of comparing the two identifiers.
 */
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
{
        struct netdev_phys_item_id a_id = { };
        struct netdev_phys_item_id b_id = { };

        if (netif_get_port_parent_id(a, &a_id, true))
                return false;

        if (netif_get_port_parent_id(b, &b_id, true))
                return false;

        return netdev_phys_item_id_same(&a_id, &b_id);
}
EXPORT_SYMBOL(netdev_port_same_parent_id);

/* Set the proto_down state of @dev: the carrier is switched off when
 * @proto_down is true and on when it is false, then dev->proto_down is
 * updated.
 *
 * Returns -EOPNOTSUPP when dev->change_proto_down is not set, -ENODEV
 * when the device is not present, 0 otherwise.
 */
int netif_change_proto_down(struct net_device *dev, bool proto_down)
{
        if (!dev->change_proto_down)
                return -EOPNOTSUPP;

        if (!netif_device_present(dev))
                return -ENODEV;

        if (!proto_down)
                netif_carrier_on(dev);
        else
                netif_carrier_off(dev);

        WRITE_ONCE(dev->proto_down, proto_down);

        return 0;
}

/**
 *        netdev_change_proto_down_reason_locked - proto down reason
 *
 *        @dev: device
 *        @mask: proto down mask
 *        @value: proto down value
 *
 *        With a zero @mask, @value replaces dev->proto_down_reason
 *        entirely. Otherwise only the bits selected by the low 32 bits of
 *        @mask are copied from @value; all other bits keep their current
 *        state.
 */
void netdev_change_proto_down_reason_locked(struct net_device *dev,
                                            unsigned long mask, u32 value)
{
        u32 proto_down_reason;
        int b;

        if (!mask) {
                proto_down_reason = value;
        } else {
                proto_down_reason = dev->proto_down_reason;
                for_each_set_bit(b, &mask, 32) {
                        /* BIT() is unsigned, so testing bit 31 does not
                         * shift into the sign bit of an int.
                         */
                        if (value & BIT(b))
                                proto_down_reason |= BIT(b);
                        else
                                proto_down_reason &= ~BIT(b);
                }
        }
        WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
}

/* XDP bpf_link wrapper; a pointer to one is kept per XDP mode in
 * dev->xdp_state[mode].link (see dev_xdp_link()).
 */
struct bpf_xdp_link {
        struct bpf_link link;
        struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
        int flags;
};

/* Map XDP attach @flags to an XDP mode. An explicit mode flag wins, in
 * the order HW, DRV, SKB; without one, the mode is DRV when the driver
 * implements ndo_bpf and SKB otherwise.
 */
static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
{
        if (flags & XDP_FLAGS_HW_MODE)
                return XDP_MODE_HW;
        else if (flags & XDP_FLAGS_DRV_MODE)
                return XDP_MODE_DRV;
        else if (flags & XDP_FLAGS_SKB_MODE)
                return XDP_MODE_SKB;

        if (dev->netdev_ops->ndo_bpf)
                return XDP_MODE_DRV;

        return XDP_MODE_SKB;
}

/* Return the install handler for @mode: generic_xdp_install for SKB
 * mode, the driver's ndo_bpf (possibly NULL) for DRV and HW modes, and
 * NULL for any other value.
 */
static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
{
        if (mode == XDP_MODE_SKB)
                return generic_xdp_install;

        if (mode == XDP_MODE_DRV || mode == XDP_MODE_HW)
                return dev->netdev_ops->ndo_bpf;

        return NULL;
}

/* Return the link pointer stored in @dev's XDP state for @mode. */
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
                                         enum bpf_xdp_mode mode)
{
        return dev->xdp_state[mode].link;
}

/* Return the program recorded for @mode on @dev: the link's program
 * when a link is stored, otherwise the stored prog pointer.
 */
static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
                                     enum bpf_xdp_mode mode)
{
        struct bpf_xdp_link *xdp_link = dev_xdp_link(dev, mode);

        return xdp_link ? xdp_link->link.prog : dev->xdp_state[mode].prog;
}

/* Count the XDP modes of @dev that have a prog or a link recorded. */
u8 dev_xdp_prog_count(struct net_device *dev)
{
        u8 in_use = 0;
        int mode;

        for (mode = 0; mode < __MAX_XDP_MODE; mode++) {
                if (!dev->xdp_state[mode].prog && !dev->xdp_state[mode].link)
                        continue;
                in_use++;
        }

        return in_use;
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);

u8 dev_xdp_sb_prog_count(struct net_device *dev)
{
        u8 count = 0;
        int i;

        for (i = 0; i < __MAX_XDP_MODE; i++)
                if (dev->xdp_state[i].prog &&
                    !dev->xdp_state[i].prog->aux->xdp_has_frags)
                        count++;
        return count;
}

int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
{
        if (!dev->netdev_ops->ndo_bpf)
                return -EOPNOTSUPP;

        if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
            bpf->command == XDP_SETUP_PROG &&
            bpf->prog && !bpf->prog->aux->xdp_has_frags) {
                NL_SET_ERR_MSG(bpf->extack,
                               "unable to propagate XDP to device using tcp-data-split");
                return -EBUSY;
        }

        if (dev_get_min_mp_channel_count(dev)) {
                NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
                return -EBUSY;
        }

        return dev->netdev_ops->ndo_bpf(dev, bpf);
}
EXPORT_SYMBOL_GPL(netif_xdp_propagate);

/* Return the id of the effective XDP program attached in @mode, or 0 when
 * nothing is attached.
 */
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
        struct bpf_prog *attached = dev_xdp_prog(dev, mode);

        if (!attached)
                return 0;

        return attached->aux->id;
}

/* Record @link as the attachment for @mode and clear the direct program
 * pointer, so the slot never holds both. Passing a NULL @link empties the
 * slot entirely.
 */
static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
                             struct bpf_xdp_link *link)
{
        dev->xdp_state[mode].link = link;
        dev->xdp_state[mode].prog = NULL;
}

/* Record @prog as the directly attached program for @mode and clear the
 * link pointer, so the slot never holds both. Passing a NULL @prog empties
 * the slot entirely.
 */
static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
                             struct bpf_prog *prog)
{
        dev->xdp_state[mode].link = NULL;
        dev->xdp_state[mode].prog = prog;
}

/* Hand @prog to the driver through @bpf_op for the given @mode; a NULL
 * @prog asks the driver to remove the current program.
 *
 * Returns -EBUSY when the device uses tcp-data-split and @prog lacks frags
 * support, or when a memory provider is bound to one of its queues;
 * otherwise 0 or the error reported by @bpf_op.
 */
static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
                           bpf_op_t bpf_op, struct netlink_ext_ack *extack,
                           u32 flags, struct bpf_prog *prog)
{
        struct netdev_bpf xdp;
        int err;

        netdev_ops_assert_locked(dev);

        if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
            prog && !prog->aux->xdp_has_frags) {
                NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
                return -EBUSY;
        }

        if (dev_get_min_mp_channel_count(dev) != 0) {
                NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
                return -EBUSY;
        }

        memset(&xdp, 0, sizeof(xdp));
        if (mode == XDP_MODE_HW)
                xdp.command = XDP_SETUP_PROG_HW;
        else
                xdp.command = XDP_SETUP_PROG;
        xdp.prog = prog;
        xdp.flags = flags;
        xdp.extack = extack;

        /* The driver treats the prog pointer as "moved" into it: it does
         * not take its own reference, yet it drops one when the program is
         * detached or replaced. The net_device owns the link/prog as well,
         * so take an extra reference here to keep the driver from
         * underflowing the refcount.
         */
        if (prog)
                bpf_prog_inc(prog);

        err = bpf_op(dev, &xdp);
        if (!err) {
                if (mode != XDP_MODE_HW)
                        bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
                return 0;
        }

        /* driver refused: give back the reference taken above */
        if (prog)
                bpf_prog_put(prog);

        return err;
}

/* Remove every XDP attachment from @dev, mode by mode. Links are marked as
 * auto-detached by clearing their dev pointer; directly attached programs
 * have their reference dropped. Requires RTNL.
 */
static void dev_xdp_uninstall(struct net_device *dev)
{
        enum bpf_xdp_mode mode;

        ASSERT_RTNL();

        for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
                struct bpf_prog *attached = dev_xdp_prog(dev, mode);
                struct bpf_xdp_link *xdp_link;
                bpf_op_t op;

                /* nothing to do for empty slots or modes without an op */
                op = attached ? dev_xdp_bpf_op(dev, mode) : NULL;
                if (!op)
                        continue;

                WARN_ON(dev_xdp_install(dev, mode, op, NULL, 0, NULL));

                /* auto-detach link from net device */
                xdp_link = dev_xdp_link(dev, mode);
                if (!xdp_link)
                        bpf_prog_put(attached);
                else
                        xdp_link->dev = NULL;

                dev_xdp_set_link(dev, mode, NULL);
        }
}

/* Attach, replace or detach the XDP program of @dev in the mode selected
 * by @flags.
 *
 * Either @link is given (link-based attachment, program taken from
 * link->link.prog) or @new_prog/@old_prog are used (direct attachment);
 * mixing both is rejected. A NULL effective program removes the current
 * one. The driver is only called when the effective program changes.
 *
 * On success the slot in dev->xdp_state[] is updated and the reference on
 * the previously attached program, if any, is dropped. On failure a
 * negative errno is returned and, for most checks, a message is stored in
 * @extack. Requires RTNL.
 */
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
                          struct bpf_xdp_link *link, struct bpf_prog *new_prog,
                          struct bpf_prog *old_prog, u32 flags)
{
        /* number of XDP mode bits set in @flags */
        unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
        struct bpf_prog *cur_prog;
        struct net_device *upper;
        struct list_head *iter;
        enum bpf_xdp_mode mode;
        bpf_op_t bpf_op;
        int err;

        ASSERT_RTNL();

        /* either link or prog attachment, never both */
        if (link && (new_prog || old_prog))
                return -EINVAL;
        /* link supports only XDP mode flags */
        if (link && (flags & ~XDP_FLAGS_MODES)) {
                NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
                return -EINVAL;
        }
        /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
        if (num_modes > 1) {
                NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
                return -EINVAL;
        }
        /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
        if (!num_modes && dev_xdp_prog_count(dev) > 1) {
                NL_SET_ERR_MSG(extack,
                               "More than one program loaded, unset mode is ambiguous");
                return -EINVAL;
        }
        /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
        if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
                NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
                return -EINVAL;
        }

        mode = dev_xdp_mode(dev, flags);
        /* can't replace attached link */
        if (dev_xdp_link(dev, mode)) {
                NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
                return -EBUSY;
        }

        /* don't allow if an upper device already has a program */
        netdev_for_each_upper_dev_rcu(dev, upper, iter) {
                if (dev_xdp_prog_count(upper) > 0) {
                        NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
                        return -EEXIST;
                }
        }

        /* no link is attached in this mode (checked above), so this is the
         * directly attached program, if any
         */
        cur_prog = dev_xdp_prog(dev, mode);
        /* can't replace attached prog with link */
        if (link && cur_prog) {
                NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
                return -EBUSY;
        }
        if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
                NL_SET_ERR_MSG(extack, "Active program does not match expected");
                return -EEXIST;
        }

        /* put effective new program into new_prog */
        if (link)
                new_prog = link->link.prog;

        /* checks that only apply when something is being attached */
        if (new_prog) {
                bool offload = mode == XDP_MODE_HW;
                /* the non-offload mode that must stay empty for this attach */
                enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
                                               ? XDP_MODE_DRV : XDP_MODE_SKB;

                if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
                        NL_SET_ERR_MSG(extack, "XDP program already attached");
                        return -EBUSY;
                }
                if (!offload && dev_xdp_prog(dev, other_mode)) {
                        NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
                        return -EEXIST;
                }
                if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
                        NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
                        return -EINVAL;
                }
                if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
                        NL_SET_ERR_MSG(extack, "Program bound to different device");
                        return -EINVAL;
                }
                if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
                        NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
                        return -EINVAL;
                }
                if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
                        NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
                        return -EINVAL;
                }
                if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
                        NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
                        return -EINVAL;
                }
        }

        /* don't call drivers if the effective program didn't change */
        if (new_prog != cur_prog) {
                bpf_op = dev_xdp_bpf_op(dev, mode);
                if (!bpf_op) {
                        NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
                        return -EOPNOTSUPP;
                }

                err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
                if (err)
                        return err;
        }

        /* record the new attachment; each setter clears the other field */
        if (link)
                dev_xdp_set_link(dev, mode, link);
        else
                dev_xdp_set_prog(dev, mode, new_prog);
        /* the device no longer references the previous program */
        if (cur_prog)
                bpf_prog_put(cur_prog);

        return 0;
}

/* Attach @link to @dev using the flags stored in the link. This is the
 * link-only form of dev_xdp_attach(): no direct new/old program is passed.
 */
static int dev_xdp_attach_link(struct net_device *dev,
                               struct netlink_ext_ack *extack,
                               struct bpf_xdp_link *link)
{
        return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
}

/* Detach @link from @dev: ask the driver to remove the program and empty
 * the slot for the link's mode. Returns -EINVAL when @link is not the link
 * currently recorded for that mode, 0 otherwise. Requires RTNL.
 */
static int dev_xdp_detach_link(struct net_device *dev,
                               struct netlink_ext_ack *extack,
                               struct bpf_xdp_link *link)
{
        enum bpf_xdp_mode mode;
        bpf_op_t op;

        ASSERT_RTNL();

        mode = dev_xdp_mode(dev, link->flags);
        if (dev_xdp_link(dev, mode) == link) {
                op = dev_xdp_bpf_op(dev, mode);
                WARN_ON(dev_xdp_install(dev, mode, op, NULL, 0, NULL));
                dev_xdp_set_link(dev, mode, NULL);
                return 0;
        }

        return -EINVAL;
}

/* Link release callback: detach the link from its device, if it still has
 * one, under rtnl_lock and the device's ops lock.
 */
static void bpf_xdp_link_release(struct bpf_link *link)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        struct net_device *dev;

        rtnl_lock();

        /* When racing with the net_device's tear down the dev pointer may
         * already be NULL; the link was auto-detached in that case and
         * there is nothing left to undo.
         */
        dev = xdp_link->dev;
        if (dev) {
                netdev_lock_ops(dev);
                WARN_ON(dev_xdp_detach_link(dev, NULL, xdp_link));
                netdev_unlock_ops(dev);
                xdp_link->dev = NULL;
        }

        rtnl_unlock();
}

/* Link detach callback: performs the same work as the release callback and
 * always reports success.
 */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
        bpf_xdp_link_release(link);
        return 0;
}

/* Link dealloc callback: free the bpf_xdp_link wrapper embedding @link
 * (allocated with kzalloc() in bpf_xdp_link_attach()).
 */
static void bpf_xdp_link_dealloc(struct bpf_link *link)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);

        kfree(xdp_link);
}

/* fdinfo callback: print the ifindex of the device the link is attached
 * to, or 0 when the link has no device.
 */
static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
                                     struct seq_file *seq)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        u32 ifindex;

        /* the dev pointer is only stable under rtnl_lock */
        rtnl_lock();
        ifindex = xdp_link->dev ? xdp_link->dev->ifindex : 0;
        rtnl_unlock();

        seq_printf(seq, "ifindex:\t%u\n", ifindex);
}

/* link-info callback: report the ifindex of the device the link is
 * attached to, or 0 when the link has no device. Always returns 0.
 */
static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
                                       struct bpf_link_info *info)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        u32 ifindex;

        /* the dev pointer is only stable under rtnl_lock */
        rtnl_lock();
        ifindex = xdp_link->dev ? xdp_link->dev->ifindex : 0;
        rtnl_unlock();

        info->xdp.ifindex = ifindex;

        return 0;
}

/* Link update_prog callback: replace the program carried by @link with
 * @new_prog, all under rtnl_lock.
 *
 * Returns -ENOLINK when the link no longer has a device, -EPERM when
 * @old_prog is given but is not the link's current program, -EINVAL when
 * the program type or expected attach type differ, or the error from
 * dev_xdp_install(). Returns 0 on success and when @new_prog is already
 * the link's program.
 */
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
                               struct bpf_prog *old_prog)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        enum bpf_xdp_mode mode;
        bpf_op_t bpf_op;
        int err = 0;

        rtnl_lock();

        /* link might have been auto-released already, so fail */
        if (!xdp_link->dev) {
                err = -ENOLINK;
                goto out_unlock;
        }

        /* caller named an expected program and it is not the current one */
        if (old_prog && link->prog != old_prog) {
                err = -EPERM;
                goto out_unlock;
        }
        /* from here on old_prog is the link's current program */
        old_prog = link->prog;
        if (old_prog->type != new_prog->type ||
            old_prog->expected_attach_type != new_prog->expected_attach_type) {
                err = -EINVAL;
                goto out_unlock;
        }

        if (old_prog == new_prog) {
                /* no-op, don't disturb drivers */
                /* NOTE(review): presumably drops the reference the caller
                 * took on new_prog, since the link already holds one - confirm
                 */
                bpf_prog_put(new_prog);
                goto out_unlock;
        }

        netdev_lock_ops(xdp_link->dev);
        mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
        bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
        err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
                              xdp_link->flags, new_prog);
        netdev_unlock_ops(xdp_link->dev);
        if (err)
                goto out_unlock;

        /* publish the new program on the link, then release the old one */
        old_prog = xchg(&link->prog, new_prog);
        bpf_prog_put(old_prog);

out_unlock:
        rtnl_unlock();
        return err;
}

/* Operations for XDP links; installed on each link by bpf_link_init() in
 * bpf_xdp_link_attach().
 */
static const struct bpf_link_ops bpf_xdp_link_lops = {
        .release = bpf_xdp_link_release,
        .dealloc = bpf_xdp_link_dealloc,
        .detach = bpf_xdp_link_detach,
        .show_fdinfo = bpf_xdp_link_show_fdinfo,
        .fill_link_info = bpf_xdp_link_fill_link_info,
        .update_prog = bpf_xdp_link_update,
};

/* Create an XDP BPF link for @prog and attach it to the device named by
 * attr->link_create.target_ifindex in the caller's network namespace.
 *
 * Returns the result of bpf_link_settle() on success. Returns -EINVAL when
 * the device does not exist, -ENOMEM when the link cannot be allocated, or
 * the error from bpf_link_prime() / dev_xdp_attach_link().
 */
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        struct net *net = current->nsproxy->net_ns;
        struct bpf_link_primer link_primer;
        struct netlink_ext_ack extack = {};
        struct bpf_xdp_link *link;
        struct net_device *dev;
        int err, fd;

        rtnl_lock();
        /* takes a device reference; dropped with dev_put() on every path */
        dev = dev_get_by_index(net, attr->link_create.target_ifindex);
        if (!dev) {
                rtnl_unlock();
                return -EINVAL;
        }

        link = kzalloc(sizeof(*link), GFP_USER);
        if (!link) {
                err = -ENOMEM;
                goto unlock;
        }

        bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog,
                      attr->link_create.attach_type);
        link->dev = dev;
        link->flags = attr->link_create.flags;

        err = bpf_link_prime(&link->link, &link_primer);
        if (err) {
                /* priming failed: the link is freed directly here */
                kfree(link);
                goto unlock;
        }

        netdev_lock_ops(dev);
        err = dev_xdp_attach_link(dev, &extack, link);
        netdev_unlock_ops(dev);
        rtnl_unlock();

        if (err) {
                /* NOTE(review): presumably cleared so that a later release
                 * callback, which checks xdp_link->dev, skips the detach -
                 * confirm against bpf_link_cleanup()
                 */
                link->dev = NULL;
                bpf_link_cleanup(&link_primer);
                trace_bpf_xdp_link_attach_failed(extack._msg);
                goto out_put_dev;
        }

        fd = bpf_link_settle(&link_primer);
        /* link itself doesn't hold dev's refcnt to not complicate shutdown */
        dev_put(dev);
        return fd;

unlock:
        rtnl_unlock();

out_put_dev:
        dev_put(dev);
        return err;
}

/**
 *        dev_change_xdp_fd - set or clear a bpf program for a device rx path
 *        @dev: device
 *        @extack: netlink extended ack
 *        @fd: new program fd or negative value to clear
 *        @expected_fd: old program fd that userspace expects to replace or clear
 *        @flags: xdp-related flags
 *
 *        Resolve @fd and @expected_fd to XDP programs and pass them to
 *        dev_xdp_attach(). The reference on the new program is kept only
 *        when the attach succeeds; the reference on the expected program is
 *        always dropped. Must be called under RTNL.
 *
 *        Return: 0 on success or a negative errno.
 */
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                      int fd, int expected_fd, u32 flags)
{
        enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
        struct bpf_prog *new_prog = NULL, *old_prog = NULL;
        bool attach_drv = mode != XDP_MODE_SKB;
        int err;

        ASSERT_RTNL();

        if (fd >= 0) {
                new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
                                                 attach_drv);
                if (IS_ERR(new_prog))
                        return PTR_ERR(new_prog);
        }

        if (expected_fd >= 0) {
                old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
                                                 attach_drv);
                if (IS_ERR(old_prog)) {
                        err = PTR_ERR(old_prog);
                        /* nothing was attached: undo the lookup of @fd */
                        if (new_prog)
                                bpf_prog_put(new_prog);
                        return err;
                }
        }

        err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);

        if (err && new_prog)
                bpf_prog_put(new_prog);
        if (old_prog)
                bpf_prog_put(old_prog);

        return err;
}

/* Return the smallest RX channel count that still covers every queue with
 * a memory provider bound: the highest such queue index plus one, or 0
 * when no real RX queue has one. Requires the device ops lock.
 */
u32 dev_get_min_mp_channel_count(const struct net_device *dev)
{
        int chan;

        netdev_ops_assert_locked(dev);

        /* scan from the top so the first hit is the highest bound queue */
        chan = dev->real_num_rx_queues - 1;
        while (chan >= 0) {
                if (dev->_rx[chan].mp_params.mp_priv)
                        return chan + 1;
                chan--;
        }

        return 0;
}

/**
 * dev_index_reserve() - allocate an ifindex in a namespace
 * @net: the applicable net namespace
 * @ifindex: requested ifindex, pass %0 to get one allocated
 *
 * Reserve an ifindex for a new device. The caller must either use the
 * ifindex to store the device (via list_netdevice()) or call
 * dev_index_release() to give the index up.
 *
 * Return: a suitable unique value for a new device interface number or -errno.
 */
static int dev_index_reserve(struct net *net, u32 ifindex)
{
        int err;

        if (ifindex > INT_MAX) {
                DEBUG_NET_WARN_ON_ONCE(1);
                return -EINVAL;
        }

        if (ifindex)
                /* caller asked for a specific index: claim exactly that one */
                err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
        else
                err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
                                      xa_limit_31b, &net->ifindex, GFP_KERNEL);

        return err < 0 ? err : (int)ifindex;
}

/* Give up an ifindex obtained from dev_index_reserve() that was never used
 * to list a device. Warns if the index turns out to hold an entry.
 */
static void dev_index_release(struct net *net, int ifindex)
{
        /* Expect only unused indexes, unlist_netdevice() removes the used */
        WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}

/* Tell whether the current task is the one recorded in cleanup_net_task.
 * Always false when network namespaces are compiled out.
 */
static bool from_cleanup_net(void)
{
        bool in_cleanup = false;

#ifdef CONFIG_NET_NS
        in_cleanup = (current == READ_ONCE(cleanup_net_task));
#endif
        return in_cleanup;
}

/* Delayed registration/unregistration */
/* devices queued by net_set_todo() */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
atomic_t dev_unreg_count = ATOMIC_INIT(0);

/* Queue @dev at the tail of net_todo_list. */
static void net_set_todo(struct net_device *dev)
{
        list_add_tail(&dev->todo_list, &net_todo_list);
}

/* Strip from @features every NETIF_F_UPPER_DISABLES bit that @upper does
 * not want, and return the reduced set for @lower.
 */
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
        struct net_device *upper, netdev_features_t features)
{
        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
        netdev_features_t feature;
        int feature_bit;

        for_each_netdev_feature(upper_disables, feature_bit) {
                feature = __NETIF_F_BIT(feature_bit);

                /* upper keeps it on, or we never had it: nothing to drop */
                if (upper->wanted_features & feature)
                        continue;
                if (!(features & feature))
                        continue;

                netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
                           &feature, upper->name);
                features &= ~feature;
        }

        return features;
}

/* For each NETIF_F_UPPER_DISABLES bit that is off in @features but still
 * on in @lower, clear it from @lower's wanted set and re-evaluate @lower's
 * features under its ops lock. Warns if the bit cannot be turned off,
 * otherwise announces the change.
 */
static void netdev_sync_lower_features(struct net_device *upper,
        struct net_device *lower, netdev_features_t features)
{
        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
        netdev_features_t feature;
        int feature_bit;

        for_each_netdev_feature(upper_disables, feature_bit) {
                feature = __NETIF_F_BIT(feature_bit);

                /* still enabled above, or already off below: skip */
                if (features & feature)
                        continue;
                if (!(lower->features & feature))
                        continue;

                netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
                           &feature, lower->name);

                netdev_lock_ops(lower);
                lower->wanted_features &= ~feature;
                __netdev_update_features(lower);

                if (likely(!(lower->features & feature)))
                        netdev_features_change(lower);
                else
                        netdev_WARN(upper, "failed to disable %pNF on %s!\n",
                                    &feature, lower->name);
                netdev_unlock_ops(lower);
        }
}

/* True when @features offers TX checksum offload for both IPv4 and IPv6:
 * either the generic HW_CSUM bit, or IP_CSUM and IPV6_CSUM together.
 */
static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
{
        const netdev_features_t both_ip = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;

        if (features & NETIF_F_HW_CSUM)
                return true;

        return (features & both_ip) == both_ip;
}

/* Enforce the core's feature dependency rules on @features and return the
 * corrected set. Rules are applied in order, so a bit dropped by an earlier
 * rule is seen as off by the later ones.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
        netdev_features_t features)
{
        const netdev_features_t ip_csums = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;

        /* HW_CSUM and the per-protocol checksum bits must not be mixed */
        if ((features & NETIF_F_HW_CSUM) && (features & ip_csums)) {
                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
                features &= ~ip_csums;
        }

        /* Every TSO flavour needs scatter-gather. */
        if (!(features & NETIF_F_SG) && (features & NETIF_F_ALL_TSO)) {
                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
                features &= ~NETIF_F_ALL_TSO;
        }

        /* IPv4 TSO (and with it TSO ECN) needs HW_CSUM or IP_CSUM */
        if ((features & NETIF_F_TSO) &&
            !(features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))) {
                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
                features &= ~(NETIF_F_TSO | NETIF_F_TSO_ECN);
        }

        /* IPv6 TSO needs HW_CSUM or IPV6_CSUM */
        if ((features & NETIF_F_TSO6) &&
            !(features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM))) {
                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
                features &= ~NETIF_F_TSO6;
        }

        /* IPv4 ID mangling is only meaningful with IPv4 TSO enabled */
        if (!(features & NETIF_F_TSO))
                features &= ~NETIF_F_TSO_MANGLEID;

        /* TSO ECN on its own, without any other TSO bit, is pointless */
        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
                features &= ~NETIF_F_TSO_ECN;

        /* Software GSO needs scatter-gather too. */
        if (!(features & NETIF_F_SG) && (features & NETIF_F_GSO)) {
                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
                features &= ~NETIF_F_GSO;
        }

        /* Partially supported GSO types depend on GSO partial itself */
        if (!(features & NETIF_F_GSO_PARTIAL) &&
            (features & dev->gso_partial_features)) {
                netdev_dbg(dev,
                           "Dropping partially supported GSO features since no GSO partial.\n");
                features &= ~dev->gso_partial_features;
        }

        /* NETIF_F_GRO_HW implies RXCSUM: a packet merged by hardware must
         * have had its checksum verified by hardware. If the user turned
         * RXCSUM off, GRO_HW has to go as well.
         */
        if (!(features & NETIF_F_RXCSUM) && (features & NETIF_F_GRO_HW)) {
                netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
                features &= ~NETIF_F_GRO_HW;
        }

        /* Neither LRO nor HW-GRO can be combined with RX-FCS */
        if ((features & NETIF_F_RXFCS) && (features & NETIF_F_LRO)) {
                netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
                features &= ~NETIF_F_LRO;
        }
        if ((features & NETIF_F_RXFCS) && (features & NETIF_F_GRO_HW)) {
                netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
                features &= ~NETIF_F_GRO_HW;
        }

        /* HW-GRO takes precedence over LRO */
        if ((features & NETIF_F_LRO) && (features & NETIF_F_GRO_HW)) {
                netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
                features &= ~NETIF_F_LRO;
        }

        if (!netdev_has_ip_or_hw_csum(features) && (features & NETIF_F_HW_TLS_TX)) {
                netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
                features &= ~NETIF_F_HW_TLS_TX;
        }

        if (!(features & NETIF_F_RXCSUM) && (features & NETIF_F_HW_TLS_RX)) {
                netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
                features &= ~NETIF_F_HW_TLS_RX;
        }

        if (!netdev_has_ip_or_hw_csum(features) && (features & NETIF_F_GSO_UDP_L4)) {
                netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
                features &= ~NETIF_F_GSO_UDP_L4;
        }

        return features;
}

/* Recompute the feature set of @dev and apply it.
 *
 * The wanted features are passed through the driver's ndo_fix_features (if
 * any), the core dependency rules and the restrictions of all upper
 * devices. If the result differs from dev->features it is handed to
 * ndo_set_features. Lower devices are then synchronised, and dev->features
 * is updated when the driver returned 0.
 *
 * Requires RTNL and the device ops lock.
 *
 * Return: 0 when the features did not change (or a VLAN filter sync left a
 * negative value in err), 1 when they may have changed, -1 when
 * ndo_set_features failed. Callers treat any non-zero value as "send a
 * notification".
 */
int __netdev_update_features(struct net_device *dev)
{
        struct net_device *upper, *lower;
        netdev_features_t features;
        struct list_head *iter;
        /* stays -1 when nothing changed, which yields a return value of 0 */
        int err = -1;

        ASSERT_RTNL();
        netdev_ops_assert_locked(dev);

        features = netdev_get_wanted_features(dev);

        if (dev->netdev_ops->ndo_fix_features)
                features = dev->netdev_ops->ndo_fix_features(dev, features);

        /* driver might be less strict about feature dependencies */
        features = netdev_fix_features(dev, features);

        /* some features can't be enabled if they're off on an upper device */
        netdev_for_each_upper_dev_rcu(dev, upper, iter)
                features = netdev_sync_upper_features(dev, upper, features);

        if (dev->features == features)
                goto sync_lower;

        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
                &dev->features, &features);

        if (dev->netdev_ops->ndo_set_features)
                err = dev->netdev_ops->ndo_set_features(dev, features);
        else
                err = 0;

        if (unlikely(err < 0)) {
                netdev_err(dev,
                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
                        err, &features, &dev->features);
                /* return non-0 since some features might have changed and
                 * it's better to fire a spurious notification than miss it
                 */
                return -1;
        }

sync_lower:
        /* some features must be disabled on lower devices when disabled
         * on an upper device (think: bonding master or bridge)
         */
        netdev_for_each_lower_dev(dev, lower, iter)
                netdev_sync_lower_features(dev, lower, features);

        /* only a 0 return from the driver lets the core store the new set */
        if (!err) {
                netdev_features_t diff = features ^ dev->features;

                if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
                        /* udp_tunnel_{get,drop}_rx_info both need
                         * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
                         * device, or they won't do anything.
                         * Thus we need to update dev->features
                         * *before* calling udp_tunnel_get_rx_info,
                         * but *after* calling udp_tunnel_drop_rx_info.
                         */
                        udp_tunnel_nic_lock(dev);
                        if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
                                dev->features = features;
                                udp_tunnel_get_rx_info(dev);
                        } else {
                                udp_tunnel_drop_rx_info(dev);
                        }
                        udp_tunnel_nic_unlock(dev);
                }

                /* same ordering as above: store features before "get",
                 * after "drop"
                 */
                if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
                        if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
                                dev->features = features;
                                err |= vlan_get_rx_ctag_filter_info(dev);
                        } else {
                                vlan_drop_rx_ctag_filter_info(dev);
                        }
                }

                if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
                        if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
                                dev->features = features;
                                err |= vlan_get_rx_stag_filter_info(dev);
                        } else {
                                vlan_drop_rx_stag_filter_info(dev);
                        }
                }

                dev->features = features;
        }

        return err < 0 ? 0 : 1;
}

/**
 *        netdev_update_features - recalculate device features
 *        @dev: the device to check
 *
 *        Recompute the dev->features set and notify listeners only when
 *        __netdev_update_features() reports a change. Call this after
 *        driver or hardware dependent conditions that influence the
 *        features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        if (!__netdev_update_features(dev))
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);

/**
 *        netdev_change_features - recalculate device features
 *        @dev: the device to check
 *
 *        Recalculate dev->features set and send notifications even
 *        if they have not changed. Should be called instead of
 *        netdev_update_features() if also dev->vlan_features might
 *        have changed to allow the changes to be propagated to stacked
 *        VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* result deliberately ignored: the notification is unconditional */
        __netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);

/**
 *        netif_stacked_transfer_operstate - transfer operstate
 *        @rootdev: the root or lower level device to transfer state from
 *        @dev: the device to transfer operstate to
 *
 *        Mirror the dormant, testing and carrier state of @rootdev onto
 *        @dev. This is normally called when a stacking relationship exists
 *        between the root device and the device (a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
                                        struct net_device *dev)
{
        if (rootdev->operstate != IF_OPER_DORMANT)
                netif_dormant_off(dev);
        else
                netif_dormant_on(dev);

        if (rootdev->operstate != IF_OPER_TESTING)
                netif_testing_off(dev);
        else
                netif_testing_on(dev);

        if (!netif_carrier_ok(rootdev))
                netif_carrier_off(dev);
        else
                netif_carrier_on(dev);
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);

/* Allocate the RX queue array for @dev (dev->num_rx_queues entries) and
 * register XDP RX-queue info for each entry. On failure everything set up
 * so far is undone and dev->_rx is left NULL.
 *
 * Returns 0, -ENOMEM, or the error from xdp_rxq_info_reg().
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int i, count = dev->num_rx_queues;
        struct netdev_rx_queue *rx;
        int err;

        BUG_ON(count < 1);

        rx = kvzalloc(count * sizeof(*rx), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!rx)
                return -ENOMEM;

        dev->_rx = rx;

        for (i = 0; i < count; i++) {
                rx[i].dev = dev;

                /* XDP RX-queue setup */
                err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
                if (err >= 0)
                        continue;

                /* Roll back the registrations that succeeded, then free */
                while (i--)
                        xdp_rxq_info_unreg(&rx[i].xdp_rxq);
                kvfree(dev->_rx);
                dev->_rx = NULL;
                return err;
        }

        return 0;
}

/*
 * Unregister the XDP rxq info of all dev->num_rx_queues entries of
 * dev->_rx and free the array.  Does nothing when dev->_rx is NULL, which
 * is the state netif_alloc_rx_queues() leaves behind after a failure.
 */
static void netif_free_rx_queues(struct net_device *dev)
{
        unsigned int nr_queues = dev->num_rx_queues;
        unsigned int idx = 0;

        if (!dev->_rx)
                return;

        while (idx < nr_queues) {
                xdp_rxq_info_unreg(&dev->_rx[idx].xdp_rxq);
                idx++;
        }

        kvfree(dev->_rx);
}

/*
 * Bring a single netdev_queue into its initial state: back pointer to
 * @dev, NUMA node set to NUMA_NO_NODE, transmit lock initialised with its
 * owner field set to -1 and, with CONFIG_BQL, the dql state initialised.
 * @_unused is ignored; the signature matches the callback expected by
 * netdev_for_each_tx_queue().
 */
static void netdev_init_one_queue(struct net_device *dev,
                                  struct netdev_queue *queue, void *_unused)
{
        queue->dev = dev;
        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);

        /* Transmit lock and its lockdep class (keyed by dev->type) */
        spin_lock_init(&queue->_xmit_lock);
        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
        queue->xmit_lock_owner = -1;

#ifdef CONFIG_BQL
        dql_init(&queue->dql, HZ);
#endif
}

/* Release the TX queue array stored in dev->_tx. */
static void netif_free_tx_queues(struct net_device *dev)
{
        struct netdev_queue *txqs = dev->_tx;

        kvfree(txqs);
}

/*
 * Allocate dev->_tx with dev->num_tx_queues zeroed entries, initialise
 * each of them via netdev_init_one_queue() and set up tx_global_lock.
 *
 * Returns 0 on success, -EINVAL when the queue count is zero or larger
 * than 0xffff, -ENOMEM when the allocation fails.
 */
static int netif_alloc_netdev_queues(struct net_device *dev)
{
        unsigned int nr_queues = dev->num_tx_queues;
        struct netdev_queue *queues;

        if (!nr_queues || nr_queues > 0xffff)
                return -EINVAL;

        queues = kvzalloc(nr_queues * sizeof(*queues),
                          GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!queues)
                return -ENOMEM;

        dev->_tx = queues;

        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
        spin_lock_init(&dev->tx_global_lock);

        return 0;
}

/* Call netif_tx_stop_queue() on each of the dev->num_tx_queues TX queues. */
void netif_tx_stop_all_queues(struct net_device *dev)
{
        unsigned int idx = 0;

        while (idx < dev->num_tx_queues) {
                netif_tx_stop_queue(netdev_get_tx_queue(dev, idx));
                idx++;
        }
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);

static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
{
        void __percpu *v;

        /* Drivers implementing ndo_get_peer_dev must support tstat
         * accounting, so that skb_do_redirect() can bump the dev's
         * RX stats upon network namespace switch.
         */
        if (dev->netdev_ops->ndo_get_peer_dev &&
            dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
                return -EOPNOTSUPP;

        switch (dev->pcpu_stat_type) {
        case NETDEV_PCPU_STAT_NONE:
                return 0;
        case NETDEV_PCPU_STAT_LSTATS:
                v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
                break;
        case NETDEV_PCPU_STAT_TSTATS:
                v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
                break;
        case NETDEV_PCPU_STAT_DSTATS:
                v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
                break;
        default:
                return -EINVAL;
        }

        return v ? 0 : -ENOMEM;
}

/*
 * Free the per-cpu statistics that match dev->pcpu_stat_type.  Nothing is
 * freed for NETDEV_PCPU_STAT_NONE or for any other unlisted type.
 */
static void netdev_do_free_pcpu_stats(struct net_device *dev)
{
        if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_LSTATS)
                free_percpu(dev->lstats);
        else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS)
                free_percpu(dev->tstats);
        else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS)
                free_percpu(dev->dstats);
}

/*
 * Destroy and free dev->link_topo, then clear the pointer.  Only acts when
 * CONFIG_PHYLIB is enabled and a topology is actually attached.
 */
static void netdev_free_phy_link_topology(struct net_device *dev)
{
        struct phy_link_topology *link_topo = dev->link_topo;

        if (!IS_ENABLED(CONFIG_PHYLIB) || !link_topo)
                return;

        xa_destroy(&link_topo->phys);
        kfree(link_topo);
        dev->link_topo = NULL;
}

/**
 * register_netdevice() - register a network device
 * @dev: device to register
 *
 * Take a prepared network device structure and make it externally accessible.
 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 * Callers must hold the rtnl lock - you may want register_netdev()
 * instead of this.
 *
 * Return: 0 on success. On failure the error reported by the failing step
 * is returned; a positive ndo_init() result is mapped to -EIO.
 */
int register_netdevice(struct net_device *dev)
{
        int ret;
        struct net *net = dev_net(dev);

        BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
                     NETDEV_FEATURE_COUNT);
        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        might_sleep();

        /* When net_device's are persistent, this will be fatal. */
        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
        BUG_ON(!net);

        /* Nothing has been set up yet, so a plain return is sufficient. */
        ret = ethtool_check_ops(dev->ethtool_ops);
        if (ret)
                return ret;

        /* rss ctx ID 0 is reserved for the default context, start from 1 */
        xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
        mutex_init(&dev->ethtool->rss_lock);

        spin_lock_init(&dev->addr_list_lock);
        netdev_set_addr_lockdep_class(dev);

        ret = dev_get_valid_name(net, dev, dev->name);
        if (ret < 0)
                goto out;

        ret = -ENOMEM;
        dev->name_node = netdev_name_node_head_alloc(dev);
        if (!dev->name_node)
                goto out;

        /* Init, if this function is available */
        if (dev->netdev_ops->ndo_init) {
                ret = dev->netdev_ops->ndo_init(dev);
                if (ret) {
                        /* Callers only ever see negative errors from here */
                        if (ret > 0)
                                ret = -EIO;
                        goto err_free_name;
                }
        }

        /* Advertising VLAN CTAG filtering requires both the add_vid and
         * the kill_vid callbacks.
         */
        if (((dev->hw_features | dev->features) &
             NETIF_F_HW_VLAN_CTAG_FILTER) &&
            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
                ret = -EINVAL;
                goto err_uninit;
        }

        ret = netdev_do_alloc_pcpu_stats(dev);
        if (ret)
                goto err_uninit;

        /* dev_index_reserve() returns the ifindex to use, or a negative
         * error.
         */
        ret = dev_index_reserve(net, dev->ifindex);
        if (ret < 0)
                goto err_free_pcpu;
        dev->ifindex = ret;

        /* Transfer changeable features to wanted_features and enable
         * software offloads (GSO and GRO).
         */
        dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
        dev->features |= NETIF_F_SOFT_FEATURES;

        if (dev->udp_tunnel_nic_info) {
                dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
                dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
        }

        dev->wanted_features = dev->features & dev->hw_features;

        /* Added after wanted_features was computed, so NOCACHE_COPY is
         * made changeable here without being requested by default.
         */
        if (!(dev->flags & IFF_LOOPBACK))
                dev->hw_features |= NETIF_F_NOCACHE_COPY;

        /* If IPv4 TCP segmentation offload is supported we should also
         * allow the device to enable segmenting the frame with the option
         * of ignoring a static IP ID value.  This doesn't enable the
         * feature itself but allows the user to enable it later.
         */
        if (dev->hw_features & NETIF_F_TSO)
                dev->hw_features |= NETIF_F_TSO_MANGLEID;
        if (dev->vlan_features & NETIF_F_TSO)
                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
        if (dev->mpls_features & NETIF_F_TSO)
                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
        if (dev->hw_enc_features & NETIF_F_TSO)
                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
         */
        dev->vlan_features |= NETIF_F_HIGHDMA;

        /* Make NETIF_F_SG inheritable to tunnel devices.
         */
        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

        /* Make NETIF_F_SG inheritable to MPLS.
         */
        dev->mpls_features |= NETIF_F_SG;

        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                goto err_ifindex_release;

        ret = netdev_register_kobject(dev);

        /* Publish the outcome under the netdev lock: REGISTERED on success,
         * UNREGISTERED when the kobject registration failed.
         */
        netdev_lock(dev);
        WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
        netdev_unlock(dev);

        if (ret)
                goto err_uninit_notify;

        netdev_lock_ops(dev);
        __netdev_update_features(dev);
        netdev_unlock_ops(dev);

        /*
         *        Default initial state at registry is that the
         *        device is present.
         */

        set_bit(__LINK_STATE_PRESENT, &dev->state);

        linkwatch_init_dev(dev);

        dev_init_scheduler(dev);

        netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
        list_netdevice(dev);

        add_device_randomness(dev->dev_addr, dev->addr_len);

        /* If the device has permanent device address, driver should
         * set dev_addr and also addr_assign_type should be set to
         * NET_ADDR_PERM (default value).
         */
        if (dev->addr_assign_type == NET_ADDR_PERM)
                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

        /* Notify protocols, that a new device appeared. */
        netdev_lock_ops(dev);
        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
        netdev_unlock_ops(dev);
        ret = notifier_to_errno(ret);
        if (ret) {
                /* The device is already listed at this point, so it is torn
                 * down through unregister_netdevice_queue() instead of the
                 * err_* labels below.
                 */
                /* Expect explicit free_netdev() on failure */
                dev->needs_free_netdev = false;
                unregister_netdevice_queue(dev, NULL);
                goto out;
        }
        /*
         *        Prevent userspace races by waiting until the network
         *        device is fully setup before sending notifications.
         */
        if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

out:
        return ret;

        /* Error unwinding: each label undoes the steps that had succeeded
         * before the corresponding failure, in reverse order, and then
         * leaves through "out" with ret holding the error.
         */
err_uninit_notify:
        call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_ifindex_release:
        dev_index_release(net, dev->ifindex);
err_free_pcpu:
        netdev_do_free_pcpu_stats(dev);
err_uninit:
        if (dev->netdev_ops->ndo_uninit)
                dev->netdev_ops->ndo_uninit(dev);
        if (dev->priv_destructor)
                dev->priv_destructor(dev);
err_free_name:
        netdev_name_node_free(dev->name_node);
        goto out;
}
EXPORT_SYMBOL(register_netdevice);

/* Set up the core state of a dummy net device.
 * Dummy netdevs never pass through register_netdevice(), so the few
 * pieces of state they need are established here instead.
 */
static void init_dummy_netdev(struct net_device *dev)
{
        /* NETREG_DUMMY makes the standard register/unregister code
         * paths BUG if they are ever reached with this device.
         */
        dev->reg_state = NETREG_DUMMY;

        /* A dummy interface counts as started and present by default. */
        set_bit(__LINK_STATE_START, &dev->state);
        set_bit(__LINK_STATE_PRESENT, &dev->state);

        /* Note: no pcpu_refcnt is allocated for dummy devices, because
         * users of this 'device' don't need to change its refcount.
         */
}

/**
 *        register_netdev        - register a network device
 *        @dev: device to register
 *
 *        Locked wrapper around register_netdevice(): the rtnl lock of the
 *        device's network namespace is taken with
 *        rtnl_net_lock_killable(), register_netdevice() is called and the
 *        lock is dropped again.
 *
 *        Returns the result of register_netdevice(), or -EINTR when
 *        rtnl_net_lock_killable() reports that the lock was not taken.
 */
int register_netdev(struct net_device *dev)
{
        struct net *netns = dev_net(dev);
        int rc = -EINTR;

        if (!rtnl_net_lock_killable(netns)) {
                rc = register_netdevice(dev);
                rtnl_net_unlock(netns);
        }

        return rc;
}
EXPORT_SYMBOL(register_netdev);

/*
 * Read the current reference count of @dev.  With CONFIG_PCPU_DEV_REFCNT
 * the value is the sum of the per-cpu counters over all possible CPUs,
 * otherwise it is read from dev->dev_refcnt.
 */
int netdev_refcnt_read(const struct net_device *dev)
{
#ifdef CONFIG_PCPU_DEV_REFCNT
        int total = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                total += *per_cpu_ptr(dev->pcpu_refcnt, cpu);
        }

        return total;
#else
        return refcount_read(&dev->dev_refcnt);
#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);

/* Seconds between two "waiting for %s to become free" warnings emitted by
 * netdev_wait_allrefs_any().
 */
int netdev_unregister_timeout_secs __read_mostly = 10;

/* Bounds, in milliseconds, of the sleep between two refcount polls in
 * netdev_wait_allrefs_any(): the sleep starts at MIN and doubles up to MAX.
 */
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
 * netdev_wait_allrefs_any - wait until all references are gone.
 * @list: list of net_devices to wait on
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * Return: the first device found on @list whose netdev_refcnt_read()
 * value is 1. The function does not return until such a device exists.
 */
static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
        unsigned long rebroadcast_time, warning_time;
        struct net_device *dev;
        int wait = 0;

        rebroadcast_time = warning_time = jiffies;

        /* Fast path: a device may already be down to a count of 1. */
        list_for_each_entry(dev, list, todo_list)
                if (netdev_refcnt_read(dev) == 1)
                        return dev;

        while (true) {
                /* More than one second (1 * HZ) since the last rebroadcast */
                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                        rtnl_lock();

                        /* Rebroadcast unregister notification */
                        list_for_each_entry(dev, list, todo_list)
                                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

                        /* The lock is dropped around rcu_barrier() and then
                         * taken again for the linkwatch check.
                         */
                        __rtnl_unlock();
                        rcu_barrier();
                        rtnl_lock();

                        list_for_each_entry(dev, list, todo_list)
                                if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
                                             &dev->state)) {
                                        /* We must not have linkwatch events
                                         * pending on unregister. If this
                                         * happens, we simply run the queue
                                         * unscheduled, resulting in a noop
                                         * for this device.
                                         */
                                        linkwatch_run_queue();
                                        break;
                                }

                        __rtnl_unlock();

                        rebroadcast_time = jiffies;
                }

                rcu_barrier();

                /* The first pass through the loop does not sleep; later
                 * passes sleep for 'wait' ms and double it, capped at
                 * WAIT_REFS_MAX_MSECS.
                 */
                if (!wait) {
                        wait = WAIT_REFS_MIN_MSECS;
                } else {
                        msleep(wait);
                        wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
                }

                list_for_each_entry(dev, list, todo_list)
                        if (netdev_refcnt_read(dev) == 1)
                                return dev;

                /* Still waiting: once netdev_unregister_timeout_secs have
                 * passed since the last warning, report every device left
                 * on the list together with its reference tracker state.
                 */
                if (time_after(jiffies, warning_time +
                               READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
                        list_for_each_entry(dev, list, todo_list) {
                                pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
                                         dev->name, netdev_refcnt_read(dev));
                                ref_tracker_dir_print(&dev->refcnt_tracker, 10);
                        }

                        warning_time = jiffies;
                }
        }
}

/* The sequence is:
 *
 *        rtnl_lock();
 *        ...
 *        register_netdevice(x1);
 *        register_netdevice(x2);
 *        ...
 *        unregister_netdevice(y1);
 *        unregister_netdevice(y2);
 *      ...
 *        rtnl_unlock();
 *        free_netdev(y1);
 *        free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * The function is entered with the rtnl lock held and releases it itself
 * via __rtnl_unlock() after taking a private snapshot of net_todo_list.
 */
void netdev_run_todo(void)
{
        struct net_device *dev, *tmp;
        struct list_head list;
        int cnt;
#ifdef CONFIG_LOCKDEP
        struct list_head unlink_list;

        /* Drain net_unlink_list and recompute each device's nested_level
         * from its lower_level.
         */
        list_replace_init(&net_unlink_list, &unlink_list);

        while (!list_empty(&unlink_list)) {
                dev = list_first_entry(&unlink_list, struct net_device,
                                       unlink_list);
                list_del_init(&dev->unlink_list);
                dev->nested_level = dev->lower_level - 1;
        }
#endif

        /* Snapshot list, allow later requests */
        list_replace_init(&net_todo_list, &list);

        __rtnl_unlock();

        /* Wait for rcu callbacks to finish before next phase */
        if (!list_empty(&list))
                rcu_barrier();

        /* Phase 1: move every queued device from UNREGISTERING to
         * UNREGISTERED. Devices in any other state are warned about and
         * dropped from the snapshot without further processing.
         */
        list_for_each_entry_safe(dev, tmp, &list, todo_list) {
                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
                        netdev_WARN(dev, "run_todo but not unregistering\n");
                        list_del(&dev->todo_list);
                        continue;
                }

                netdev_lock(dev);
                WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
                netdev_unlock(dev);
                linkwatch_sync_dev(dev);
        }

        /* Phase 2: release the devices one at a time, in whatever order
         * netdev_wait_allrefs_any() finds them down to a count of 1.
         */
        cnt = 0;
        while (!list_empty(&list)) {
                dev = netdev_wait_allrefs_any(&list);
                list_del(&dev->todo_list);

                /* paranoia */
                BUG_ON(netdev_refcnt_read(dev) != 1);
                BUG_ON(!list_empty(&dev->ptype_all));
                BUG_ON(!list_empty(&dev->ptype_specific));
                WARN_ON(rcu_access_pointer(dev->ip_ptr));
                WARN_ON(rcu_access_pointer(dev->ip6_ptr));

                netdev_do_free_pcpu_stats(dev);
                if (dev->priv_destructor)
                        dev->priv_destructor(dev);
                /* NOTE(review): dev is still dereferenced below after
                 * free_netdev(); presumably free_netdev() only drops a
                 * device reference for an UNREGISTERED device rather than
                 * freeing the memory outright - confirm against
                 * free_netdev().
                 */
                if (dev->needs_free_netdev)
                        free_netdev(dev);

                cnt++;

                /* Free network device */
                kobject_put(&dev->dev.kobj);
        }
        /* Account for the released devices and wake waiters once
         * dev_unreg_count reaches zero.
         */
        if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
                wake_up(&netdev_unregistering_wq);
}

/* Sum up per-cpu dstats counters
 *
 * For every possible CPU, take a consistent snapshot of the rx/tx packet,
 * byte and drop counters in @dstats and add it to the matching fields of
 * @s. @s is accumulated into, never reset.
 */
static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
                             const struct pcpu_dstats __percpu *dstats)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct pcpu_dstats *pcpu = per_cpu_ptr(dstats, cpu);
                u64 rx_pkts, rx_octets, rx_dropped;
                u64 tx_pkts, tx_octets, tx_dropped;
                unsigned int seq;

                /* Retry until the snapshot was not torn by a writer */
                do {
                        seq = u64_stats_fetch_begin(&pcpu->syncp);
                        rx_pkts    = u64_stats_read(&pcpu->rx_packets);
                        rx_octets  = u64_stats_read(&pcpu->rx_bytes);
                        rx_dropped = u64_stats_read(&pcpu->rx_drops);
                        tx_pkts    = u64_stats_read(&pcpu->tx_packets);
                        tx_octets  = u64_stats_read(&pcpu->tx_bytes);
                        tx_dropped = u64_stats_read(&pcpu->tx_drops);
                } while (u64_stats_fetch_retry(&pcpu->syncp, seq));

                s->rx_packets += rx_pkts;
                s->rx_bytes   += rx_octets;
                s->rx_dropped += rx_dropped;
                s->tx_packets += tx_pkts;
                s->tx_bytes   += tx_octets;
                s->tx_dropped += tx_dropped;
        }
}

/* ndo_get_stats64 implementation for dtstats-based accounting.
 *
 * Populate @s from dev->stats and dev->dstats. This is used internally by the
 * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
 */
static void dev_get_dstats64(const struct net_device *dev,
                             struct rtnl_link_stats64 *s)
{
        netdev_stats_to_stats64(s, &dev->stats);
        dev_fetch_dstats(s, dev->dstats);
}

/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 * all the same fields in the same order as net_device_stats, with only
 * the type differing, but rtnl_link_stats64 may have additional fields
 * at the end for newer counters.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
                             const struct net_device_stats *netdev_stats)
{
        size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
        const atomic_long_t *src = (atomic_long_t *)netdev_stats;
        u64 *dst = (u64 *)stats64;

        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
        for (i = 0; i < n; i++)
                dst[i] = (unsigned long)atomic_long_read(&src[i]);
        /* zero out counters that only exist in rtnl_link_stats64 */
        memset((char *)stats64 + n * sizeof(u64), 0,
               sizeof(*stats64) - n * sizeof(u64));
}
EXPORT_SYMBOL(netdev_stats_to_stats64);

/*
 * Allocate the per-cpu core stats of @dev on first use.
 *
 * A fresh per-cpu area is allocated with GFP_ATOMIC | __GFP_NOWARN and
 * installed with cmpxchg() only if dev->core_stats is still NULL; if the
 * cmpxchg() finds a non-NULL pointer, the fresh area is freed again.
 * Returns whatever dev->core_stats holds afterwards, which is NULL when
 * the allocation failed and nothing had been installed.
 */
static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
                struct net_device *dev)
{
        struct net_device_core_stats __percpu *fresh;

        fresh = alloc_percpu_gfp(struct net_device_core_stats,
                                 GFP_ATOMIC | __GFP_NOWARN);
        if (fresh) {
                /* A non-NULL old value means our area was not installed */
                if (cmpxchg(&dev->core_stats, NULL, fresh))
                        free_percpu(fresh);
        }

        /* This READ_ONCE() pairs with the cmpxchg() above */
        return READ_ONCE(dev->core_stats);
}

/*
 * Increment, on the current CPU, the unsigned long counter located
 * @offset bytes into @dev's per-cpu core stats. The stats area is
 * allocated on demand; if that fails the increment is silently dropped.
 */
noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
{
        struct net_device_core_stats __percpu *stats;
        unsigned long __percpu *counter;

        /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
        stats = READ_ONCE(dev->core_stats);
        if (unlikely(!stats)) {
                stats = netdev_core_stats_alloc(dev);
                if (!stats)
                        return;
        }

        counter = (unsigned long __percpu *)((void __percpu *)stats + offset);
        this_cpu_inc(*counter);
}
EXPORT_SYMBOL_GPL(netdev_core_stats_inc);

/**
 *        dev_get_stats        - get network device statistics
 *        @dev: device to get statistics from
 *        @storage: place to store stats
 *
 *        Get network statistics from device. Return @storage.
 *        The device driver may provide its own method by setting
 *        dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *        otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        const struct net_device_core_stats __percpu *p;

        /*
         * IPv{4,6} and udp tunnels share common stat helpers and use
         * different stat type (NETDEV_PCPU_STAT_TSTATS vs
         * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
         */
        BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
                     offsetof(struct pcpu_dstats, rx_bytes));
        BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
                     offsetof(struct pcpu_dstats, rx_packets));
        BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
                     offsetof(struct pcpu_dstats, tx_bytes));
        BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
                     offsetof(struct pcpu_dstats, tx_packets));

        if (ops->ndo_get_stats64) {
                memset(storage, 0, sizeof(*storage));
                ops->ndo_get_stats64(dev, storage);
        } else if (ops->ndo_get_stats) {
                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
        } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
                dev_get_tstats64(dev, storage);
        } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
                dev_get_dstats64(dev, storage);
        } else {
                netdev_stats_to_stats64(storage, &dev->stats);
        }

        /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
        p = READ_ONCE(dev->core_stats);
        if (p) {
                const struct net_device_core_stats *core_stats;
                int i;

                for_each_possible_cpu(i) {
                        core_stats = per_cpu_ptr(p, i);
                        storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
                        storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
                        storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
                        storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
                }
        }
        return storage;
}
EXPORT_SYMBOL(dev_get_stats);

/**
 *        dev_fetch_sw_netstats - sum up per-cpu network device statistics
 *        @s: place to store stats
 *        @netstats: per-cpu network stats to read from
 *
 *        For every possible CPU, take a consistent snapshot of the rx/tx
 *        packet and byte counters in @netstats and add it to the matching
 *        fields of @s. @s is accumulated into, never reset.
 */
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
                           const struct pcpu_sw_netstats __percpu *netstats)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct pcpu_sw_netstats *pcpu = per_cpu_ptr(netstats, cpu);
                u64 rx_pkts, rx_octets, tx_pkts, tx_octets;
                unsigned int seq;

                /* Retry until the snapshot was not torn by a writer */
                do {
                        seq = u64_stats_fetch_begin(&pcpu->syncp);
                        rx_pkts   = u64_stats_read(&pcpu->rx_packets);
                        rx_octets = u64_stats_read(&pcpu->rx_bytes);
                        tx_pkts   = u64_stats_read(&pcpu->tx_packets);
                        tx_octets = u64_stats_read(&pcpu->tx_bytes);
                } while (u64_stats_fetch_retry(&pcpu->syncp, seq));

                s->rx_packets += rx_pkts;
                s->rx_bytes   += rx_octets;
                s->tx_packets += tx_pkts;
                s->tx_bytes   += tx_octets;
        }
}
EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);

/**
 *        dev_get_tstats64 - ndo_get_stats64 implementation
 *        @dev: device to get statistics from
 *        @s: place to store stats
 *
 *        Populate @s from dev->stats and dev->tstats. Can be used as
 *        ndo_get_stats64() callback.
 */
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
{
        netdev_stats_to_stats64(s, &dev->stats);
        dev_fetch_sw_netstats(s, dev->tstats);
}
EXPORT_SYMBOL_GPL(dev_get_tstats64);

/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc as both qdisc and
 * qdisc_sleeping, and published in dev->ingress_queue; NULL is returned
 * if that allocation fails. Without CONFIG_NET_CLS_ACT the result of
 * dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!ingress) {
                ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
                if (ingress) {
                        netdev_init_one_queue(dev, ingress, NULL);
                        RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
                        RCU_INIT_POINTER(ingress->qdisc_sleeping, &noop_qdisc);
                        /* Publish only once fully initialised */
                        rcu_assign_pointer(dev->ingress_queue, ingress);
                }
        }
#endif
        return ingress;
}

/* Empty ops table; its address marks "no ethtool_ops chosen by the driver". */
static const struct ethtool_ops default_ethtool_ops;

/*
 * Install @ops as the ethtool_ops of @dev, but only if the device still
 * points at default_ethtool_ops. Ops that were set to anything else are
 * left untouched.
 */
void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops)
{
        if (dev->ethtool_ops != &default_ethtool_ops)
                return;

        dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);

/**
 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
 * @dev: netdev to enable the IRQ coalescing on
 *
 * Sets a conservative default for SW IRQ coalescing (gro_flush_timeout of
 * 20000 and defer_hard_irqs of 1). Nothing is changed on CONFIG_PREEMPT_RT
 * kernels. Users can use sysfs attributes to override the default values.
 *
 * Warns if @dev is already in the NETREG_REGISTERED state.
 */
void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
{
        WARN_ON(dev->reg_state == NETREG_REGISTERED);

        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                return;

        netdev_set_gro_flush_timeout(dev, 20000);
        netdev_set_defer_hard_irqs(dev, 1);
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);

/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
 *
 * Return: the new device, or NULL if @txqs or @rxqs is zero or if any of
 * the allocations fails.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                unsigned char name_assign_type,
                void (*setup)(struct net_device *),
                unsigned int txqs, unsigned int rxqs)
{
        struct net_device *dev;
        size_t napi_config_sz;
        unsigned int maxqs;

        /* @name, including its terminating NUL, must fit into dev->name */
        BUG_ON(strlen(name) >= sizeof(dev->name));

        if (txqs < 1) {
                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
                return NULL;
        }

        if (rxqs < 1) {
                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
                return NULL;
        }

        /* Used further down to size the napi_config array */
        maxqs = max(txqs, rxqs);

        /* The private area is the trailing priv[] member of the device */
        dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
                       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!dev)
                return NULL;

        dev->priv_len = sizeof_priv;

        /* The device starts out with a reference count of one */
        ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev");
#ifdef CONFIG_PCPU_DEV_REFCNT
        dev->pcpu_refcnt = alloc_percpu(int);
        if (!dev->pcpu_refcnt)
                goto free_dev;
        __dev_hold(dev);
#else
        refcount_set(&dev->dev_refcnt, 1);
#endif

        if (dev_addr_init(dev))
                goto free_pcpu;

        dev_mc_init(dev);
        dev_uc_init(dev);

        dev_net_set(dev, &init_net);

        /* Defaults that are in place before setup() runs */
        dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
        dev->xdp_zc_max_segs = 1;
        dev->gso_max_segs = GSO_MAX_SEGS;
        dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
        dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
        dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
        dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
        dev->tso_max_segs = TSO_MAX_SEGS;
        dev->upper_level = 1;
        dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
        dev->nested_level = 0;
        INIT_LIST_HEAD(&dev->unlink_list);
#endif

        INIT_LIST_HEAD(&dev->napi_list);
        INIT_LIST_HEAD(&dev->unreg_list);
        INIT_LIST_HEAD(&dev->close_list);
        INIT_LIST_HEAD(&dev->link_watch_list);
        INIT_LIST_HEAD(&dev->adj_list.upper);
        INIT_LIST_HEAD(&dev->adj_list.lower);
        INIT_LIST_HEAD(&dev->ptype_all);
        INIT_LIST_HEAD(&dev->ptype_specific);
        INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
        hash_init(dev->qdisc_hash);
#endif

        mutex_init(&dev->lock);

        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
        setup(dev);

        /* setup() left tx_queue_len at zero: flag the device IFF_NO_QUEUE
         * and fall back to the default length.
         */
        if (!dev->tx_queue_len) {
                dev->priv_flags |= IFF_NO_QUEUE;
                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
        }

        /* From here on failures are unwound through free_netdev() */
        dev->num_tx_queues = txqs;
        dev->real_num_tx_queues = txqs;
        if (netif_alloc_netdev_queues(dev))
                goto free_all;

        dev->num_rx_queues = rxqs;
        dev->real_num_rx_queues = rxqs;
        if (netif_alloc_rx_queues(dev))
                goto free_all;
        dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
        if (!dev->ethtool)
                goto free_all;

        /* cfg and cfg_pending initially refer to the same object */
        dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
        if (!dev->cfg)
                goto free_all;
        dev->cfg_pending = dev->cfg;

        /* One napi_config slot per queue index, for the larger queue count */
        dev->num_napi_configs = maxqs;
        napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
        dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
        if (!dev->napi_config)
                goto free_all;

        strscpy(dev->name, name);
        dev->name_assign_type = name_assign_type;
        dev->group = INIT_NETDEV_GROUP;
        /* Keep ethtool_ops that setup() may have installed */
        if (!dev->ethtool_ops)
                dev->ethtool_ops = &default_ethtool_ops;

        nf_hook_netdev_init(dev);

        return dev;

free_all:
        free_netdev(dev);
        return NULL;

        /* Early failures: free the refcount (if per-cpu) and the device
         * directly.
         */
free_pcpu:
#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
free_dev:
#endif
        kvfree(dev);
        return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);

/* Delete every NAPI instance still linked on @dev->napi_list and free the
 * device's napi_config array.
 */
static void netdev_napi_exit(struct net_device *dev)
{
        struct napi_struct *napi, *next;

        /* Nothing linked: skip the locking and the synchronize_net() call. */
        if (list_empty(&dev->napi_list))
                goto free_config;

        netdev_lock(dev);
        list_for_each_entry_safe(napi, next, &dev->napi_list, dev_list)
                __netif_napi_del_locked(napi);
        netdev_unlock(dev);

        synchronize_net();

free_config:
        kvfree(dev->napi_config);
}

/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released. If this
 * is the last reference then it will be freed. Must be called in process
 * context.
 *
 * If the device is still in NETREG_UNREGISTERING the free is deferred by
 * setting @dev->needs_free_netdev. Devices in NETREG_UNINITIALIZED or
 * NETREG_DUMMY state are released directly with kvfree(). Any other state
 * must be NETREG_UNREGISTERED (BUG otherwise); the device is then marked
 * NETREG_RELEASED and the final free happens through put_device().
 */
void free_netdev(struct net_device *dev)
{
        might_sleep();

        /* When called immediately after register_netdevice() failed the unwind
         * handling may still be dismantling the device. Handle that case by
         * deferring the free.
         */
        if (dev->reg_state == NETREG_UNREGISTERING) {
                ASSERT_RTNL();
                dev->needs_free_netdev = true;
                return;
        }

        /* Only dev->cfg is freed below, so cfg_pending is expected to point
         * at the same object.
         */
        WARN_ON(dev->cfg != dev->cfg_pending);
        kfree(dev->cfg);
        kfree(dev->ethtool);
        netif_free_tx_queues(dev);
        netif_free_rx_queues(dev);

        kfree(rcu_dereference_protected(dev->ingress_queue, 1));

        /* Flush device addresses */
        dev_addr_flush(dev);

        /* Delete remaining NAPI instances and free napi_config */
        netdev_napi_exit(dev);

        netif_del_cpu_rmap(dev);

        ref_tracker_dir_exit(&dev->refcnt_tracker);
        /* Release per-CPU allocations and clear the stale pointers */
#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;
#endif
        free_percpu(dev->core_stats);
        dev->core_stats = NULL;
        free_percpu(dev->xdp_bulkq);
        dev->xdp_bulkq = NULL;

        netdev_free_phy_link_topology(dev);

        mutex_destroy(&dev->lock);

        /*  Compatibility with error handling in drivers */
        if (dev->reg_state == NETREG_UNINITIALIZED ||
            dev->reg_state == NETREG_DUMMY) {
                kvfree(dev);
                return;
        }

        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
        WRITE_ONCE(dev->reg_state, NETREG_RELEASED);

        /* will free via device release */
        put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

/**
 * alloc_netdev_dummy - Allocate and initialize a dummy net device.
 * @sizeof_priv: size of private data to allocate space for
 *
 * Thin wrapper around alloc_netdev() using the "dummy#" name,
 * NET_NAME_UNKNOWN and init_dummy_netdev() as the setup callback.
 *
 * Return: the allocated net_device on success, NULL otherwise
 */
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
        struct net_device *dummy;

        dummy = alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
                             init_dummy_netdev);
        return dummy;
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);

/**
 *        synchronize_net -  Synchronize with packet receive processing
 *
 *        Wait for packets currently being received to be done.
 *        Does not block later packets from starting.
 *
 *        May sleep. Uses synchronize_rcu_expedited() when from_cleanup_net()
 *        or rtnl_is_locked() reports true, synchronize_rcu() otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!from_cleanup_net() && !rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);

static void netdev_rss_contexts_free(struct net_device *dev)
{
        struct ethtool_rxfh_context *ctx;
        unsigned long context;

        mutex_lock(&dev->ethtool->rss_lock);
        xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
                xa_erase(&dev->ethtool->rss_ctx, context);
                dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL);
                kfree(ctx);
        }
        xa_destroy(&dev->ethtool->rss_ctx);
        mutex_unlock(&dev->ethtool->rss_lock);
}

/**
 *        unregister_netdevice_queue - remove device from the kernel
 *        @dev: device
 *        @head: list
 *
 *        This function shuts down a device interface and removes it
 *        from the kernel tables.
 *        If @head is not NULL, the device is only moved onto @head so it
 *        can be unregistered later; otherwise it is unregistered right
 *        away through a temporary single-entry list.
 *
 *        Callers must hold the rtnl semaphore.  You may want
 *        unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
        LIST_HEAD(single);

        ASSERT_RTNL();

        /* Deferred mode: just queue the device for the caller's batch. */
        if (head) {
                list_move_tail(&dev->unreg_list, head);
                return;
        }

        list_add(&dev->unreg_list, &single);
        unregister_netdevice_many(&single);
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/* Call the memory provider's ->uninstall() hook, when one is set, for each
 * of @dev's first real_num_rx_queues RX queues.
 */
static void dev_memory_provider_uninstall(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->real_num_rx_queues; qidx++) {
                struct netdev_rx_queue *rxq = &dev->_rx[qidx];
                struct pp_memory_provider_params *params = &rxq->mp_params;

                /* No provider ops or no uninstall hook on this queue. */
                if (!params->mp_ops || !params->mp_ops->uninstall)
                        continue;

                params->mp_ops->uninstall(params->mp_priv, rxq);
        }
}

/* devices must be UP and netdev_lock()'d */
static void netif_close_many_and_unlock(struct list_head *close_head)
{
        struct net_device *dev, *tmp;

        netif_close_many(close_head, false);

        /* ... now unlock them */
        list_for_each_entry_safe(dev, tmp, close_head, close_list) {
                netdev_unlock(dev);
                list_del_init(&dev->close_list);
        }
}

/* Under CONFIG_LOCKDEP, close and unlock the devices batched on @close_head
 * once the current task holds more than MAX_LOCK_DEPTH / 2 locks. Without
 * CONFIG_LOCKDEP this is a no-op.
 */
static void netif_close_many_and_unlock_cond(struct list_head *close_head)
{
#ifdef CONFIG_LOCKDEP
        /* Only up to MAX_LOCK_DEPTH locks can be tracked per task; keep half
         * of the slots in reserve for locks possibly taken by notifiers and
         * (soft)irqs.
         */
        const unsigned int max_held = MAX_LOCK_DEPTH / 2;

        if (lockdep_depth(current) <= max_held)
                return;

        netif_close_many_and_unlock(close_head);
#endif
}

/* Unregister all devices queued on @head through their ->unreg_list.
 *
 * @head:   list of devices; unlinked with list_del() once processing is done
 * @portid: forwarded to rtmsg_ifinfo_build_skb() and rtmsg_ifinfo_send()
 * @nlh:    forwarded to rtmsg_ifinfo_build_skb() and rtmsg_ifinfo_send()
 *
 * Caller must hold RTNL; calling during dev_boot_phase is a BUG.
 * Returns early, leaving @head untouched, when the list is empty.
 */
void unregister_netdevice_many_notify(struct list_head *head,
                                      u32 portid, const struct nlmsghdr *nlh)
{
        struct net_device *dev, *tmp;
        LIST_HEAD(close_head);
        int cnt = 0;

        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        if (list_empty(head))
                return;

        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
                /* Some devices call without registering
                 * for initialization unwind. Remove those
                 * devices and proceed with the remaining.
                 */
                if (dev->reg_state == NETREG_UNINITIALIZED) {
                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
                                 dev->name, dev);

                        WARN_ON(1);
                        list_del(&dev->unreg_list);
                        continue;
                }
                dev->dismantle = true;
                /* Anything left on the list must be fully registered. */
                BUG_ON(dev->reg_state != NETREG_REGISTERED);
        }

        /* If device is running, close it first. Start with ops locked... */
        list_for_each_entry(dev, head, unreg_list) {
                if (!(dev->flags & IFF_UP))
                        continue;
                if (netdev_need_ops_lock(dev)) {
                        list_add_tail(&dev->close_list, &close_head);
                        netdev_lock(dev);
                }
                /* With CONFIG_LOCKDEP this may close and unlock the batch
                 * early to bound the number of locks held at once.
                 */
                netif_close_many_and_unlock_cond(&close_head);
        }
        netif_close_many_and_unlock(&close_head);
        /* ... now go over the rest. */
        list_for_each_entry(dev, head, unreg_list) {
                if (!netdev_need_ops_lock(dev))
                        list_add_tail(&dev->close_list, &close_head);
        }
        netif_close_many(&close_head, true);

        list_for_each_entry(dev, head, unreg_list) {
                /* And unlink it from device chain. */
                unlist_netdevice(dev);
                /* The state change is published under the instance lock. */
                netdev_lock(dev);
                WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
                netdev_unlock(dev);
        }
        flush_all_backlogs();

        synchronize_net();

        list_for_each_entry(dev, head, unreg_list) {
                struct sk_buff *skb = NULL;

                /* Shutdown queueing discipline. */
                netdev_lock_ops(dev);
                dev_shutdown(dev);
                dev_tcx_uninstall(dev);
                dev_xdp_uninstall(dev);
                dev_memory_provider_uninstall(dev);
                netdev_unlock_ops(dev);
                bpf_dev_bound_netdev_unregister(dev);

                netdev_offload_xstats_disable_all(dev);

                /* Notify protocols, that we are about to destroy
                 * this device. They should clean all the things.
                 */
                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

                /* Build the RTM_DELLINK message now; it is only sent after
                 * ndo_uninit below. Skipped while an rtnl_link device is
                 * still flagged rtnl_link_initializing.
                 */
                if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
                                                     GFP_KERNEL, NULL, 0,
                                                     portid, nlh);

                /*
                 *        Flush the unicast and multicast chains
                 */
                dev_uc_flush(dev);
                dev_mc_flush(dev);

                netdev_name_node_alt_flush(dev);
                netdev_name_node_free(dev->name_node);

                netdev_rss_contexts_free(dev);

                call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);

                if (dev->netdev_ops->ndo_uninit)
                        dev->netdev_ops->ndo_uninit(dev);

                /* rss_lock was last taken in netdev_rss_contexts_free() */
                mutex_destroy(&dev->ethtool->rss_lock);

                net_shaper_flush_netdev(dev);

                if (skb)
                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);

                /* Notifier chain MUST detach us all upper devices. */
                WARN_ON(netdev_has_any_upper_dev(dev));
                WARN_ON(netdev_has_any_lower_dev(dev));

                /* Remove entries from kobject tree */
                netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
                /* Remove XPS queueing entries */
                netif_reset_xps_queues_gt(dev, 0);
#endif
        }

        synchronize_net();

        /* Drop the reference tracked by dev_registered_tracker, hand each
         * device to net_set_todo() and account for it in dev_unreg_count.
         */
        list_for_each_entry(dev, head, unreg_list) {
                netdev_put(dev, &dev->dev_registered_tracker);
                net_set_todo(dev);
                cnt++;
        }
        atomic_add(cnt, &dev_unreg_count);

        /* Detach the (often stack allocated) list head from the devices. */
        list_del(head);
}

/**
 *        unregister_netdevice_many - unregister many devices
 *        @head: list of devices
 *
 *  Calls unregister_netdevice_many_notify() with a portid of 0 and a
 *  NULL nlh.
 *
 *  Note: As most callers use a stack allocated list_head,
 *  we force a list_del() to make sure stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
        unregister_netdevice_many_notify(head, 0, NULL);
}
EXPORT_SYMBOL(unregister_netdevice_many);

/**
 *        unregister_netdev - remove device from the kernel
 *        @dev: device
 *
 *        This function shuts down a device interface and removes it
 *        from the kernel tables.
 *
 *        This is just a wrapper for unregister_netdevice() that brackets
 *        the call with rtnl_net_dev_lock() / rtnl_net_dev_unlock().  In
 *        general you want to use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_net_dev_lock(dev);
        unregister_netdevice(dev);
        rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);

/* Move @dev from its current network namespace into @net.
 *
 * @dev:         device to move; must be NETREG_REGISTERED and must not be
 *               netns_immutable
 * @net:         destination namespace
 * @pat:         name pattern handed to dev_prep_valid_name() when @dev's
 *               current name is already in use in @net; may be NULL, in
 *               which case a name clash fails the move
 * @new_ifindex: ifindex to reserve in @net, or 0 to try the current ifindex
 *               first and fall back to a newly assigned one
 * @extack:      netlink extended ack that receives the error message
 *
 * Caller must hold RTNL.
 *
 * Return: 0 on success or when @dev already lives in @net; -EINVAL if the
 * device is immutable or not registered; -EEXIST when the name or an altname
 * clashes in @net; otherwise the negative error returned by
 * dev_prep_valid_name() or dev_index_reserve().
 */
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
                               const char *pat, int new_ifindex,
                               struct netlink_ext_ack *extack)
{
        struct netdev_name_node *name_node;
        struct net *net_old = dev_net(dev);
        /* Stays empty unless a replacement name has to be generated */
        char new_name[IFNAMSIZ] = {};
        int err, new_nsid;

        ASSERT_RTNL();

        /* Don't allow namespace local devices to be moved. */
        err = -EINVAL;
        if (dev->netns_immutable) {
                NL_SET_ERR_MSG(extack, "The interface netns is immutable");
                goto out;
        }

        /* Ensure the device has been registered */
        if (dev->reg_state != NETREG_REGISTERED) {
                NL_SET_ERR_MSG(extack, "The interface isn't registered");
                goto out;
        }

        /* Get out if there is nothing todo */
        err = 0;
        if (net_eq(net_old, net))
                goto out;

        /* Pick the destination device name, and ensure
         * we can use it in the destination network namespace.
         */
        err = -EEXIST;
        if (netdev_name_in_use(net, dev->name)) {
                /* We get here if we can't use the current device name */
                if (!pat) {
                        NL_SET_ERR_MSG(extack,
                                       "An interface with the same name exists in the target netns");
                        goto out;
                }
                err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
                if (err < 0) {
                        NL_SET_ERR_MSG_FMT(extack,
                                           "Unable to use '%s' for the new interface name in the target netns",
                                           pat);
                        goto out;
                }
        }
        /* Check that none of the altnames conflicts. */
        err = -EEXIST;
        netdev_for_each_altname(dev, name_node) {
                if (netdev_name_in_use(net, name_node->name)) {
                        NL_SET_ERR_MSG_FMT(extack,
                                           "An interface with the altname %s exists in the target netns",
                                           name_node->name);
                        goto out;
                }
        }

        /* Check that new_ifindex isn't used yet. */
        if (new_ifindex) {
                err = dev_index_reserve(net, new_ifindex);
                if (err < 0) {
                        NL_SET_ERR_MSG_FMT(extack,
                                           "The ifindex %d is not available in the target netns",
                                           new_ifindex);
                        goto out;
                }
        } else {
                /* If there is an ifindex conflict assign a new one */
                err = dev_index_reserve(net, dev->ifindex);
                if (err == -EBUSY)
                        err = dev_index_reserve(net, 0);
                if (err < 0) {
                        NL_SET_ERR_MSG(extack,
                                       "Unable to allocate a new ifindex in the target netns");
                        goto out;
                }
                /* A non-negative result is the reserved ifindex */
                new_ifindex = err;
        }

        /*
         * And now a mini version of register_netdevice unregister_netdevice.
         */

        netdev_lock_ops(dev);
        /* If device is running close it first. */
        netif_close(dev);
        /* And unlink it from device chain */
        unlist_netdevice(dev);

        /* NOTE(review): presumably netdev_lock_ops() only takes the instance
         * lock for devices that need the ops lock, hence the explicit
         * netdev_lock() for the others so moving_ns is always written under
         * the lock - confirm against the helper.
         */
        if (!netdev_need_ops_lock(dev))
                netdev_lock(dev);
        dev->moving_ns = true;
        netdev_unlock(dev);

        synchronize_net();

        /* Shutdown queueing discipline. */
        netdev_lock_ops(dev);
        dev_shutdown(dev);
        netdev_unlock_ops(dev);

        /* Notify protocols, that we are about to destroy
         * this device. They should clean all the things.
         *
         * Note that dev->reg_state stays at NETREG_REGISTERED.
         * This is wanted because this way 8021q and macvlan know
         * the device is just moving and can keep their slaves up.
         */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
        rcu_barrier();

        new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);

        /* RTM_DELLINK for the old namespace carries the new nsid/ifindex */
        rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
                            new_ifindex);

        /*
         *        Flush the unicast and multicast chains
         */
        dev_uc_flush(dev);
        dev_mc_flush(dev);

        /* Send a netdev-removed uevent to the old namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
        netdev_adjacent_del_links(dev);

        /* Move per-net netdevice notifiers that are following the netdevice */
        move_netdevice_notifiers_dev_net(dev, net);

        /* Actually switch the network namespace */
        netdev_lock(dev);
        dev_net_set(dev, net);
        netdev_unlock(dev);
        dev->ifindex = new_ifindex;

        if (new_name[0]) {
                /* Rename the netdev to prepared name */
                write_seqlock_bh(&netdev_rename_lock);
                strscpy(dev->name, new_name, IFNAMSIZ);
                write_sequnlock_bh(&netdev_rename_lock);
        }

        /* Fixup kobjects */
        dev_set_uevent_suppress(&dev->dev, 1);
        err = device_rename(&dev->dev, dev->name);
        dev_set_uevent_suppress(&dev->dev, 0);
        WARN_ON(err);

        /* Send a netdev-add uevent to the new namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
        netdev_adjacent_add_links(dev);

        /* Adapt owner in case owning user namespace of target network
         * namespace is different from the original one.
         */
        err = netdev_change_owner(dev, net_old, net);
        WARN_ON(err);

        /* Mirror of the moving_ns = true sequence above: the lock is kept
         * only for devices that need the ops lock and is dropped by
         * netdev_unlock_ops() after the REGISTER notification.
         */
        netdev_lock(dev);
        dev->moving_ns = false;
        if (!netdev_need_ops_lock(dev))
                netdev_unlock(dev);

        /* Add the device back in the hashes */
        list_netdevice(dev);
        /* Notify protocols, that a new device appeared. */
        call_netdevice_notifiers(NETDEV_REGISTER, dev);
        netdev_unlock_ops(dev);

        /*
         *        Prevent userspace races by waiting until the network
         *        device is fully setup before sending notifications.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

        synchronize_net();
        /* Errors from device_rename()/netdev_change_owner() were only
         * warned about above; the move itself is reported as successful.
         */
        err = 0;
out:
        return err;
}

/* Take over the networking work left on @oldcpu's softnet_data.
 *
 * With interrupts disabled, the completion queue, the output queue and the
 * scheduled NAPI instances of @oldcpu are appended to the softnet_data of
 * the CPU running this function. Afterwards pending RPS IPIs are sent (when
 * backlog threads are not in use) and the packets left in @oldcpu's
 * process_queue and input_pkt_queue are re-injected through netif_rx().
 *
 * Always returns 0.
 */
static int dev_cpu_dead(unsigned int oldcpu)
{
        struct sk_buff **list_skb;
        struct sk_buff *skb;
        unsigned int cpu;
        struct softnet_data *sd, *oldsd, *remsd = NULL;

        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Append output queue from offline CPU. */
        if (oldsd->output_queue) {
                *sd->output_queue_tailp = oldsd->output_queue;
                sd->output_queue_tailp = oldsd->output_queue_tailp;
                /* Leave the offline CPU with an empty queue whose tail
                 * pointer refers back to its own head.
                 */
                oldsd->output_queue = NULL;
                oldsd->output_queue_tailp = &oldsd->output_queue;
        }
        /* Append NAPI poll list from offline CPU, with one exception :
         * process_backlog() must be called by cpu owning percpu backlog.
         * We properly handle process_queue & input_pkt_queue later.
         */
        while (!list_empty(&oldsd->poll_list)) {
                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
                                                            struct napi_struct,
                                                            poll_list);

                list_del_init(&napi->poll_list);
                if (napi->poll == process_backlog)
                        /* Keep only the THREADED bit; all other state bits
                         * are cleared.
                         */
                        napi->state &= NAPIF_STATE_THREADED;
                else
                        ____napi_schedule(sd, napi);
        }

        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        if (!use_backlog_threads()) {
#ifdef CONFIG_RPS
                remsd = oldsd->rps_ipi_list;
                oldsd->rps_ipi_list = NULL;
#endif
                /* send out pending IPI's on offline CPU */
                net_rps_send_ipi(remsd);
        }

        /* Process offline CPU's process_queue, then its input_pkt_queue */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                netif_rx(skb);
                rps_input_queue_head_incr(oldsd);
        }
        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
                rps_input_queue_head_incr(oldsd);
        }

        return 0;
}

/**
 *        netdev_increment_features - increment feature set by one
 *        @all: current feature set
 *        @one: new feature set
 *        @mask: mask feature set
 *
 *        Computes a new feature set after adding a device with feature set
 *        @one to the master device with current feature set @all.  Will not
 *        enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask)
{
        netdev_features_t gained;

        /* HW_CSUM in the mask admits every bit of the checksum mask. */
        if (mask & NETIF_F_HW_CSUM)
                mask |= NETIF_F_CSUM_MASK;
        /* VLAN_CHALLENGED is always let through. */
        mask |= NETIF_F_VLAN_CHALLENGED;

        /* ONE_FOR_ALL and checksum bits of @one that the mask admits. */
        gained = one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
        all |= gained;

        /* ALL_FOR_ALL bits survive only if @one has them as well. */
        all &= one | ~NETIF_F_ALL_FOR_ALL;

        /* If one device supports hw checksumming, set for all. */
        if (all & NETIF_F_HW_CSUM)
                all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);

        return all;
}
EXPORT_SYMBOL(netdev_increment_features);

/* Allocate a table of NETDEV_HASHENTRIES hlist heads and initialize each
 * bucket. Returns the table, or NULL when the allocation fails.
 */
static struct hlist_head * __net_init netdev_create_hash(void)
{
        struct hlist_head *table;
        int bucket;

        table = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*table), GFP_KERNEL);
        if (!table)
                return NULL;

        for (bucket = 0; bucket < NETDEV_HASHENTRIES; bucket++)
                INIT_HLIST_HEAD(&table[bucket]);

        return table;
}

/* Initialize per network namespace state: the device list, the name and
 * ifindex hash tables, the by-index XArray and the notifier chain.
 *
 * Returns 0 on success or -ENOMEM when a hash table cannot be allocated.
 */
static int __net_init netdev_init(struct net *net)
{
        BUILD_BUG_ON(GRO_HASH_BUCKETS >
                     BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));

        INIT_LIST_HEAD(&net->dev_base_head);

        net->dev_name_head = netdev_create_hash();
        if (!net->dev_name_head)
                return -ENOMEM;

        net->dev_index_head = netdev_create_hash();
        if (!net->dev_index_head) {
                /* Undo the name table allocated just above. */
                kfree(net->dev_name_head);
                return -ENOMEM;
        }

        xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
        RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);

        return 0;
}

/**
 *        netdev_drivername - network driver for the device
 *        @dev: network device
 *
 *        Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
        const struct device_driver *driver;
        const struct device *parent;
        const char *empty = "";

        parent = dev->dev.parent;
        if (!parent)
                return empty;

        driver = parent->driver;
        if (driver && driver->name)
                return driver->name;
        return empty;
}

/* Common backend of the netdev_*() printers.
 *
 * @level: printk level string; level[1] - '0' is passed to dev_printk_emit()
 *         as the numeric level (presumably the KERN_<LEVEL> digit)
 * @dev:   device the message is about, may be NULL
 * @vaf:   format and arguments, printed through %pV
 *
 * With a parent device the message goes through dev_printk_emit() and is
 * prefixed with the parent's driver string and name; otherwise plain
 * printk() is used.
 */
static void __netdev_printk(const char *level, const struct net_device *dev,
                            struct va_format *vaf)
{
        const struct device *parent;

        if (!dev) {
                printk("%s(NULL net_device): %pV", level, vaf);
                return;
        }

        parent = dev->dev.parent;
        if (!parent) {
                printk("%s%s%s: %pV",
                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
                return;
        }

        dev_printk_emit(level[1] - '0',
                        parent,
                        "%s %s %s%s: %pV",
                        dev_driver_string(parent),
                        dev_name(parent),
                        netdev_name(dev), netdev_reg_state(dev),
                        vaf);
}

void netdev_printk(const char *level, const struct net_device *dev,
                   const char *format, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, format);

        vaf.fmt = format;
        vaf.va = &args;

        __netdev_printk(level, dev, &vaf);

        va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)                        \
void func(const struct net_device *dev, const char *fmt, ...)        \
{                                                                \
        struct va_format vaf;                                        \
        va_list args;                                                \
                                                                \
        va_start(args, fmt);                                        \
                                                                \
        vaf.fmt = fmt;                                                \
        vaf.va = &args;                                                \
                                                                \
        __netdev_printk(level, dev, &vaf);                        \
                                                                \
        va_end(args);                                                \
}                                                                \
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

/* Free the per-namespace lookup tables set up by netdev_init() and warn
 * once if a namespace other than init_net still has devices listed.
 */
static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
        xa_destroy(&net->dev_by_index);

        /* The leftover-device check does not apply to init_net. */
        if (net == &init_net)
                return;

        WARN_ON_ONCE(!list_empty(&net->dev_base_head));
}

/* pernet_operations instance wiring netdev_init() and netdev_exit() as the
 * .init and .exit callbacks.
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};

/* Push all migratable network devices of @net back to the initial network
 * namespace. Must be called with RTNL held; a failed move is fatal (BUG).
 */
static void __net_exit default_device_exit_net(struct net *net)
{
        struct netdev_name_node *alt, *alt_next;
        struct net_device *dev, *dev_next;

        ASSERT_RTNL();
        for_each_netdev_safe(net, dev, dev_next) {
                char fb_name[IFNAMSIZ];
                int err;

                /* Ignore unmoveable devices (i.e. loopback) and leave
                 * virtual devices without netns_refund for the generic
                 * cleanup.
                 */
                if (dev->netns_immutable ||
                    (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund))
                        continue;

                /* Fallback name is "dev<ifindex>"; when init_net already
                 * uses it, pass the "dev%d" pattern instead.
                 */
                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
                if (netdev_name_in_use(&init_net, fb_name))
                        snprintf(fb_name, IFNAMSIZ, "dev%%d");

                /* Destroy altnames that are already in use in init_net. */
                netdev_for_each_altname_safe(dev, alt, alt_next) {
                        if (netdev_name_in_use(&init_net, alt->name))
                                __netdev_name_node_alt_destroy(alt);
                }

                /* Push remaining network devices to init_net */
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (!err)
                        continue;

                pr_emerg("%s: failed to move %s to init_net: %d\n",
                         __func__, dev->name, err);
                BUG();
        }
}

/* At exit all network devices must be removed from a network namespace.
 * Do this in the reverse order of registration, and across as many
 * network namespaces as possible to improve batching efficiency.
 *
 * Takes and releases RTNL around the whole batch.
 */
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
        struct net_device *dev;
        struct net *net;
        LIST_HEAD(kill_list);

        rtnl_lock();

        /* Pass 1: hand migratable devices back to init_net. */
        list_for_each_entry(net, net_list, exit_list) {
                default_device_exit_net(net);
                cond_resched();
        }

        /* Pass 2: queue whatever is left, newest registration first. */
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (!dev->rtnl_link_ops ||
                            !dev->rtnl_link_ops->dellink)
                                unregister_netdevice_queue(dev, &kill_list);
                        else
                                dev->rtnl_link_ops->dellink(dev, &kill_list);
                }
        }

        unregister_netdevice_many(&kill_list);
        rtnl_unlock();
}

/* pernet_operations instance whose only callback is the batched exit
 * handler default_device_exit_batch().
 */
static struct pernet_operations __net_initdata default_device_ops = {
        .exit_batch = default_device_exit_batch,
};

/* Layout assertions for the hot-path cacheline groups of struct net_device
 * (net_device_read_tx, net_device_read_txrx and net_device_read_rx). Each
 * CACHELINE_ASSERT_GROUP_MEMBER() names a field expected in a group and each
 * CACHELINE_ASSERT_GROUP_SIZE() pins the size expected for that group.
 *
 * NOTE(review): presumably these expand to compile-time checks - confirm
 * against the macro definitions.
 */
static void __init net_dev_struct_check(void)
{
        /* TX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
        /* Members below exist only with the matching config option */
#ifdef CONFIG_XPS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
#endif
#ifdef CONFIG_NET_XGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
#endif
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);

        /* TXRX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);

        /* RX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
#ifdef CONFIG_NETPOLL
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
#endif
#ifdef CONFIG_NET_XGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
}

/*
 *        Initialize the DEV module. At boot time this walks the device list and
 *        unhooks any devices that fail to initialise (normally hardware not
 *        present) and leaves us with a valid list of present and active devices.
 *
 */

/* Size, in pages, used for each per-CPU system page pool: 1 MiB worth of
 * pages, i.e. 256 pages for each CPU if PAGE_SHIFT is 12.
 */
#define SYSTEM_PERCPU_PAGE_POOL_SIZE        ((1 << 20) / PAGE_SIZE)

/*
 * Create the system page pool for @cpuid and publish it in that CPU's
 * system_page_pool slot.
 *
 * The pool is sized by SYSTEM_PERCPU_PAGE_POOL_SIZE, flagged
 * PP_FLAG_SYSTEM_POOL, bound to the node returned by cpu_to_mem(@cpuid)
 * and registered through xdp_reg_page_pool().  When CONFIG_PAGE_POOL is
 * disabled the function does nothing and reports success.
 *
 * Returns 0 on success or a negative errno.  On failure the pool (if it
 * was created) is destroyed again and the per-CPU slot is left untouched.
 */
static int net_page_pool_create(int cpuid)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
        struct page_pool_params page_pool_params = {
                .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
                .flags = PP_FLAG_SYSTEM_POOL,
                .nid = cpu_to_mem(cpuid),
        };
        struct page_pool *pp_ptr;
        int err;

        pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
        /* Hand back the real cause rather than assuming -ENOMEM. */
        if (IS_ERR(pp_ptr))
                return PTR_ERR(pp_ptr);

        err = xdp_reg_page_pool(pp_ptr);
        if (err) {
                /* Registration failed: do not leak the freshly made pool. */
                page_pool_destroy(pp_ptr);
                return err;
        }

        per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
#endif
        return 0;
}

/*
 * .thread_should_run callback of backlog_threads: returns the
 * NAPI_STATE_SCHED_THREADED bit of @cpu's backlog NAPI state.
 */
static int backlog_napi_should_run(unsigned int cpu)
{
        struct napi_struct *backlog = &per_cpu_ptr(&softnet_data, cpu)->backlog;

        return test_bit(NAPI_STATE_SCHED_THREADED, &backlog->state);
}

/*
 * .thread_fn callback of backlog_threads: run the threaded NAPI poll loop
 * on @cpu's backlog NAPI instance.
 */
static void run_backlog_napi(unsigned int cpu)
{
        napi_threaded_poll_loop(&per_cpu_ptr(&softnet_data, cpu)->backlog);
}

static void backlog_napi_setup(unsigned int cpu)
{
        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
        struct napi_struct *napi = &sd->backlog;

        napi->thread = this_cpu_read(backlog_napi);
        set_bit(NAPI_STATE_THREADED, &napi->state);
}

/*
 * Per-CPU hotplug thread descriptor for the backlog NAPI threads.  It is
 * registered from net_dev_init() when use_backlog_threads() is true.
 */
static struct smp_hotplug_thread backlog_threads = {
        .store = &backlog_napi,
        .thread_should_run = backlog_napi_should_run,
        .thread_fn = run_backlog_napi,
        .thread_comm = "backlog_napi/%u",
        .setup = backlog_napi_setup,
};

/*
 *       Boot-time initialisation of the core network device layer; hooked
 *       up through subsys_initcall().
 *
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success.  Every early failure jumps to "out" with rc
 *       still holding its initial value of -ENOMEM.
 */
static int __init net_dev_init(void)
{
        /* rc stays -ENOMEM until all fallible steps below have succeeded. */
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        net_dev_struct_check();

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *        Initialise the packet receive queues.
         */

        flush_backlogs_fallback = flush_backlogs_alloc();
        if (!flush_backlogs_fallback)
                goto out;

        /*
         * Per-CPU softnet_data setup: packet queues, poll list, output
         * queue tail, IPI descriptors, the backlog NAPI instance and the
         * system page pool.
         */
        for_each_possible_cpu(i) {
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
                skb_queue_head_init(&sd->xfrm_backlog);
#endif
                INIT_LIST_HEAD(&sd->poll_list);
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
                sd->cpu = i;
#endif
                INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);

                gro_init(&sd->backlog.gro);
                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
                INIT_LIST_HEAD(&sd->backlog.poll_list);

                if (net_page_pool_create(i))
                        goto out;
        }
        /* Per-CPU array with nr_node_ids skb_defer_node entries. */
        net_hotdata.skb_defer_nodes =
                 __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
                                __alignof__(struct skb_defer_node));
        if (!net_hotdata.skb_defer_nodes)
                goto out;
        if (use_backlog_threads())
                smpboot_register_percpu_thread(&backlog_threads);

        dev_boot_phase = 0;

        /* The loopback device is special if any other network devices
         * is present in a network namespace the loopback device must
         * be present. Since we now dynamically allocate and free the
         * loopback device ensure this invariant is maintained by
         * keeping the loopback device as the first device on the
         * list of network devices.  Ensuring the loopback devices
         * is the first device that appears and the last network device
         * that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        /*
         * A failure to install the CPU-dead hotplug callback is only
         * warned about; initialisation is still reported as successful.
         */
        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
                                       NULL, dev_cpu_dead);
        WARN_ON(rc < 0);
        rc = 0;

        /* avoid static key IPIs to isolated CPUs */
        if (housekeeping_enabled(HK_TYPE_MISC))
                net_enable_timestamp();
out:
        /*
         * Failure path: unregister and destroy every per-CPU system page
         * pool that was created before the error.
         * NOTE(review): nothing else acquired above (pernet registrations,
         * flush_backlogs_fallback, skb_defer_nodes) is released here.
         */
        if (rc < 0) {
                for_each_possible_cpu(i) {
                        struct page_pool *pp_ptr;

                        pp_ptr = per_cpu(system_page_pool.pool, i);
                        if (!pp_ptr)
                                continue;

                        xdp_unreg_page_pool(pp_ptr);
                        page_pool_destroy(pp_ptr);
                        per_cpu(system_page_pool.pool, i) = NULL;
                }
        }

        return rc;
}

subsys_initcall(net_dev_init);












































































































































































































































































































































































































































































































































































































































































































    7 




    7 
    2 




    6 



    6 





    5 

    1 












    5 





    5 




    2 







    1 


    4 

    7 



    2 
    3 


    4 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C)2002 USAGI/WIDE Project
 *
 * Authors
 *
 *        Mitsuru KANDA @USAGI       : IPv6 Support
 *        Kazunori MIYAZAWA @USAGI   :
 *        Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *
 *        This file is derived from net/ipv4/ah.c.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <crypto/hash.h>
#include <crypto/utils.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/ah.h>
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
#include <net/ip6_route.h>
#include <net/icmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/xfrm.h>

#define IPV6HDR_BASELEN 8

/*
 * Saved copy of the parts of an outgoing packet that ah6_output() rewrites
 * for ICV computation when extension headers are present: the destination
 * address (plus the source address when CONFIG_IPV6_MIP6 is enabled),
 * followed by the raw extension headers.  Filled by ah6_save_hdrs() and
 * written back by ah6_restore_hdrs().
 */
struct tmp_ext {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
                struct in6_addr saddr;
#endif
                struct in6_addr daddr;
                /* Extension headers that follow the fixed IPv6 header. */
                char hdrs[];
};

/*
 * AH view of the skb control buffer: the common xfrm part followed by a
 * pointer to the scratch buffer from ah_alloc_tmp().  The input/output
 * paths store the buffer here so that the asynchronous completion handlers
 * can find it and free it.
 */
struct ah_skb_cb {
        struct xfrm_skb_cb xfrm;
        void *tmp;
};

/* Interpret the start of @__skb's control buffer as a struct ah_skb_cb. */
#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))

/*
 * Stash the IPv6 destination address (and, with CONFIG_IPV6_MIP6, the source
 * address) together with the extension headers that follow @top_iph into
 * @iph_ext.  @extlen is sizeof(*iph_ext) plus the extension header length,
 * or 0 when the packet has no extension headers, in which case nothing is
 * copied.
 */
static inline void ah6_save_hdrs(struct tmp_ext *iph_ext,
                                 struct ipv6hdr *top_iph, int extlen)
{
        if (extlen) {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
                iph_ext->saddr = top_iph->saddr;
#endif
                iph_ext->daddr = top_iph->daddr;
                memcpy(&iph_ext->hdrs, top_iph + 1,
                       extlen - sizeof(*iph_ext));
        }
}

/*
 * Inverse of ah6_save_hdrs(): write the addresses and extension headers held
 * in @iph_ext back behind @top_iph.  Does nothing when @extlen is 0.
 */
static inline void ah6_restore_hdrs(struct ipv6hdr *top_iph,
                                    struct tmp_ext *iph_ext, int extlen)
{
        if (extlen) {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
                top_iph->saddr = iph_ext->saddr;
#endif
                top_iph->daddr = iph_ext->daddr;
                memcpy(top_iph + 1, &iph_ext->hdrs,
                       extlen - sizeof(*iph_ext));
        }
}

/*
 * Allocate, with GFP_ATOMIC, one scratch buffer that holds in this order:
 * @size caller-defined bytes, the digest of @ahash, a suitably aligned
 * ahash_request including its request context, and a suitably aligned
 * array of @nfrags scatterlist entries.
 *
 * Returns the kmalloc() result; callers check it for NULL.
 */
static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
                          unsigned int size)
{
        unsigned int total = size + crypto_ahash_digestsize(ahash);

        /* Room for the aligned hash request and its context. */
        total = ALIGN(total, crypto_tfm_ctx_alignment());
        total += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);

        /* Room for the aligned scatterlist array. */
        total = ALIGN(total, __alignof__(struct scatterlist));
        total += sizeof(struct scatterlist) * nfrags;

        return kmalloc(total, GFP_ATOMIC);
}

static inline struct tmp_ext *ah_tmp_ext(void *base)
{
        return base + IPV6HDR_BASELEN;
}

/* Address of the saved authentication data, @offset bytes into @tmp. */
static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset)
{
        return &tmp[offset];
}

static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset)
{
        return tmp + offset;
}

/*
 * Locate the ahash_request placed behind the ICV in the scratch buffer (the
 * first crypto_tfm_ctx_alignment()-aligned address past the digest) and bind
 * it to @ahash.
 */
static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
                                               u8 *icv)
{
        u8 *past_icv = icv + crypto_ahash_digestsize(ahash);
        struct ahash_request *req;

        req = (void *)PTR_ALIGN(past_icv, crypto_tfm_ctx_alignment());
        ahash_request_set_tfm(req, ahash);

        return req;
}

/*
 * Locate the scatterlist array that follows @req and its request context,
 * rounded up to the alignment of struct scatterlist.
 */
static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
                                             struct ahash_request *req)
{
        unsigned long past_req = (unsigned long)(req + 1) +
                                 crypto_ahash_reqsize(ahash);

        return (void *)ALIGN(past_req, __alignof__(struct scatterlist));
}

/*
 * Walk the TLV options of the options header @opthdr and zero the data of
 * every option whose type has bit 0x20 set.  Pad1 options occupy a single
 * byte; every other option is a type byte, a length byte and that many data
 * bytes.
 *
 * Returns true when the options exactly fill the header, false when an
 * option would run past its end.
 */
static bool zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr)
{
        u8 *opt = (u8 *)opthdr;
        /* Skip the two fixed bytes at the start of the options header. */
        int remaining = ipv6_optlen(opthdr) - 2;
        int off = 2;

        while (remaining > 0) {
                int optlen;

                if (opt[off] == IPV6_TLV_PAD1) {
                        optlen = 1;
                } else {
                        if (remaining < 2)
                                return false;
                        optlen = opt[off + 1] + 2;
                        if (remaining < optlen)
                                return false;
                        if (opt[off] & 0x20)
                                memset(&opt[off + 2], 0, opt[off + 1]);
                }

                off += optlen;
                remaining -= optlen;
        }

        return remaining == 0;
}

#if IS_ENABLED(CONFIG_IPV6_MIP6)
/**
 *        ipv6_rearrange_destopt - rearrange IPv6 destination options header
 *        @iph: IPv6 header
 *        @destopt: destination options header
 *
 *        Walk the options in @destopt and, for each well-formed home address
 *        option, swap its address with the source address in @iph.  The walk
 *        stops silently at the first malformed option.
 */
static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt)
{
        u8 *opt = (u8 *)destopt;
        /* Skip the two fixed bytes at the start of the options header. */
        int remaining = ipv6_optlen(destopt) - 2;
        int off = 2;

        while (remaining > 0) {
                struct ipv6_destopt_hao *hao;
                int optlen;

                if (opt[off] == IPV6_TLV_PAD1) {
                        optlen = 1;
                } else {
                        if (remaining < 2)
                                return;
                        optlen = opt[off + 1] + 2;
                        if (remaining < optlen)
                                return;

                        /* Rearrange the source address in @iph and the
                         * addresses in home address option for final source.
                         * See 11.3.2 of RFC 3775 for details.
                         */
                        if (opt[off] == IPV6_TLV_HAO) {
                                hao = (struct ipv6_destopt_hao *)&opt[off];
                                if (hao->length != sizeof(hao->addr)) {
                                        net_warn_ratelimited("destopt hao: invalid header length: %u\n",
                                                             hao->length);
                                        return;
                                }
                                swap(hao->addr, iph->saddr);
                        }
                }

                off += optlen;
                remaining -= optlen;
        }
        /* Note: ending with nothing remaining is the normal case. */
}
#else
/* Without CONFIG_IPV6_MIP6 there is nothing to rearrange. */
static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) {}
#endif

/**
 *        ipv6_rearrange_rthdr - rearrange IPv6 routing header
 *        @iph: IPv6 header
 *        @rthdr: routing header
 *
 *        Rearrange the destination address in @iph and the addresses in @rthdr
 *        so that they appear in the order they will at the final destination.
 *        See Appendix A2 of RFC 2402 for details.
 */
static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr)
{
        int segments, segments_left;
        struct in6_addr *addrs;
        struct in6_addr final_addr;

        segments_left = rthdr->segments_left;
        if (segments_left == 0)
                return;
        rthdr->segments_left = 0;

        /* The value of rthdr->hdrlen has been verified either by the system
         * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming
         * packets.  So we can assume that it is even and that segments is
         * greater than or equal to segments_left.
         *
         * For the same reason we can assume that this option is of type 0.
         */
        segments = rthdr->hdrlen >> 1;

        addrs = ((struct rt0_hdr *)rthdr)->addr;
        final_addr = addrs[segments - 1];

        addrs += segments - segments_left;
        memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs));

        addrs[0] = iph->daddr;
        iph->daddr = final_addr;
}

/*
 * Walk the extension headers that follow @iph, looking at no more than @len
 * bytes counted from the start of @iph, and bring them into the form used
 * for ICV computation:
 *  - hop-by-hop and destination options go through zero_out_mutable_opts();
 *  - destination options are first passed to ipv6_rearrange_destopt(), but
 *    only when @dir is XFRM_POLICY_OUT;
 *  - routing headers go through ipv6_rearrange_rthdr().
 * The walk ends at the first header of any other type.
 *
 * Returns 0 on success or -EINVAL when an options header fails to parse.
 */
static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
{
        /* One cursor, viewed as whichever header type is being handled. */
        union {
                struct ipv6hdr *iph;
                struct ipv6_opt_hdr *opth;
                struct ipv6_rt_hdr *rth;
                char *raw;
        } exthdr = { .iph = iph };
        char *end = exthdr.raw + len;
        int nexthdr = iph->nexthdr;

        /* Step over the fixed IPv6 header to the first extension header. */
        exthdr.iph++;

        while (exthdr.raw < end) {
                switch (nexthdr) {
                case NEXTHDR_DEST:
                        if (dir == XFRM_POLICY_OUT)
                                ipv6_rearrange_destopt(iph, exthdr.opth);
                        fallthrough;
                case NEXTHDR_HOP:
                        if (!zero_out_mutable_opts(exthdr.opth)) {
                                net_dbg_ratelimited("overrun %sopts\n",
                                                    nexthdr == NEXTHDR_HOP ?
                                                    "hop" : "dest");
                                return -EINVAL;
                        }
                        break;

                case NEXTHDR_ROUTING:
                        ipv6_rearrange_rthdr(iph, exthdr.rth);
                        break;

                default:
                        return 0;
                }

                /* Advance to the next header using this header's length. */
                nexthdr = exthdr.opth->nexthdr;
                exthdr.raw += ipv6_optlen(exthdr.opth);
        }

        return 0;
}

static void ah6_output_done(void *data, int err)
{
        int extlen;
        u8 *iph_base;
        u8 *icv;
        struct sk_buff *skb = data;
        struct xfrm_state *x = skb_dst(skb)->xfrm;
        struct ah_data *ahp = x->data;
        struct ipv6hdr *top_iph = ipv6_hdr(skb);
        struct ip_auth_hdr *ah = ip_auth_hdr(skb);
        struct tmp_ext *iph_ext;

        extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
        if (extlen)
                extlen += sizeof(*iph_ext);

        iph_base = AH_SKB_CB(skb)->tmp;
        iph_ext = ah_tmp_ext(iph_base);
        icv = ah_tmp_icv(iph_ext, extlen);

        memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
        memcpy(top_iph, iph_base, IPV6HDR_BASELEN);

        ah6_restore_hdrs(top_iph, iph_ext, extlen);

        kfree(AH_SKB_CB(skb)->tmp);
        xfrm_output_resume(skb->sk, skb, err);
}

/*
 * xfrm output handler for IPv6 AH: fill in the AH header of @skb and compute
 * its ICV with the ahash transform of state @x.
 *
 * The parts of the packet that are zeroed or rearranged for the ICV
 * computation (the first IPV6HDR_BASELEN bytes of the IPv6 header and, when
 * extension headers are present, the addresses and extension headers) are
 * saved in a scratch buffer first and put back once the digest is known.
 *
 * Scratch buffer layout (see ah_alloc_tmp() and the pointer setup below):
 *   [IPV6HDR_BASELEN bytes][tmp_ext: extlen bytes][seqhi: ESN only][icv]
 *   [ahash_request + context][scatterlist x (nfrags + sglists)]
 *
 * Returns 0 when the digest completed synchronously, -EINPROGRESS when it
 * was queued (ah6_output_done() then finishes the packet and frees the
 * buffer), NET_XMIT_DROP when the digest returned -ENOSPC, or a negative
 * errno on any other failure.
 */
static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
{
        int err;
        int nfrags;
        int extlen;
        u8 *iph_base;
        u8 *icv;
        u8 nexthdr;
        struct sk_buff *trailer;
        struct crypto_ahash *ahash;
        struct ahash_request *req;
        struct scatterlist *sg;
        struct ipv6hdr *top_iph;
        struct ip_auth_hdr *ah;
        struct ah_data *ahp;
        struct tmp_ext *iph_ext;
        int seqhi_len = 0;
        __be32 *seqhi;
        int sglists = 0;
        struct scatterlist *seqhisg;

        ahp = x->data;
        ahash = ahp->ahash;

        /* A non-negative return value is used as the scatterlist size. */
        err = skb_cow_data(skb, 0, &trailer);
        if (err < 0)
                goto out;
        nfrags = err;

        skb_push(skb, -skb_network_offset(skb));
        /* extlen is 0 without extension headers, else their length plus
         * the size of the saved-address part of struct tmp_ext.
         */
        extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
        if (extlen)
                extlen += sizeof(*iph_ext);

        /* ESN adds one extra scatterlist entry carrying the high 32 bits. */
        if (x->props.flags & XFRM_STATE_ESN) {
                sglists = 1;
                seqhi_len = sizeof(*seqhi);
        }
        err = -ENOMEM;
        iph_base = ah_alloc_tmp(ahash, nfrags + sglists, IPV6HDR_BASELEN +
                                extlen + seqhi_len);
        if (!iph_base)
                goto out;

        /* Carve the scratch buffer into its sub-areas. */
        iph_ext = ah_tmp_ext(iph_base);
        seqhi = (__be32 *)((char *)iph_ext + extlen);
        icv = ah_tmp_icv(seqhi, seqhi_len);
        req = ah_tmp_req(ahash, icv);
        sg = ah_req_sg(ahash, req);
        seqhisg = sg + nfrags;

        /* The ICV field itself is hashed as zeroes. */
        ah = ip_auth_hdr(skb);
        memset(ah->auth_data, 0, ahp->icv_trunc_len);

        top_iph = ipv6_hdr(skb);
        top_iph->payload_len = htons(skb->len - sizeof(*top_iph));

        /* NOTE(review): skb_mac_header() appears to be used here as a
         * pointer to the preceding header's next-header byte; confirm
         * against the xfrm6 output path.
         */
        nexthdr = *skb_mac_header(skb);
        *skb_mac_header(skb) = IPPROTO_AH;

        /* When there are no extension headers, we only need to save the first
         * 8 bytes of the base IP header.
         */
        memcpy(iph_base, top_iph, IPV6HDR_BASELEN);

        ah6_save_hdrs(iph_ext, top_iph, extlen);
        if (extlen) {
                err = ipv6_clear_mutable_options(top_iph,
                                                 extlen - sizeof(*iph_ext) +
                                                 sizeof(*top_iph),
                                                 XFRM_POLICY_OUT);
                if (err)
                        goto out_free;
        }

        ah->nexthdr = nexthdr;

        /* Zero these base header fields for hashing; the saved copy of the
         * first IPV6HDR_BASELEN bytes is written back afterwards.
         */
        top_iph->priority    = 0;
        top_iph->flow_lbl[0] = 0;
        top_iph->flow_lbl[1] = 0;
        top_iph->flow_lbl[2] = 0;
        top_iph->hop_limit   = 0;

        ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;

        ah->reserved = 0;
        ah->spi = x->id.spi;
        ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);

        sg_init_table(sg, nfrags + sglists);
        err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
        if (unlikely(err < 0))
                goto out_free;

        if (x->props.flags & XFRM_STATE_ESN) {
                /* Attach seqhi sg right after packet payload */
                *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
                sg_set_buf(seqhisg, seqhi, seqhi_len);
        }
        ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
        ahash_request_set_callback(req, 0, ah6_output_done, skb);

        /* Let the completion callback find (and free) the scratch buffer. */
        AH_SKB_CB(skb)->tmp = iph_base;

        err = crypto_ahash_digest(req);
        if (err) {
                /* Queued: the buffer now belongs to ah6_output_done(). */
                if (err == -EINPROGRESS)
                        goto out;

                if (err == -ENOSPC)
                        err = NET_XMIT_DROP;
                goto out_free;
        }

        /* Synchronous completion: publish the ICV and restore the headers. */
        memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
        memcpy(top_iph, iph_base, IPV6HDR_BASELEN);

        ah6_restore_hdrs(top_iph, iph_ext, extlen);

out_free:
        kfree(iph_base);
out:
        return err;
}

static void ah6_input_done(void *data, int err)
{
        u8 *auth_data;
        u8 *icv;
        u8 *work_iph;
        struct sk_buff *skb = data;
        struct xfrm_state *x = xfrm_input_state(skb);
        struct ah_data *ahp = x->data;
        struct ip_auth_hdr *ah = ip_auth_hdr(skb);
        int hdr_len = skb_network_header_len(skb);
        int ah_hlen = ipv6_authlen(ah);

        if (err)
                goto out;

        work_iph = AH_SKB_CB(skb)->tmp;
        auth_data = ah_tmp_auth(work_iph, hdr_len);
        icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);

        err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
        if (err)
                goto out;

        err = ah->nexthdr;

        skb->network_header += ah_hlen;
        memcpy(skb_network_header(skb), work_iph, hdr_len);
        __skb_pull(skb, ah_hlen + hdr_len);
        if (x->props.mode == XFRM_MODE_TUNNEL)
                skb_reset_transport_header(skb);
        else
                skb_set_transport_header(skb, -hdr_len);
out:
        kfree(AH_SKB_CB(skb)->tmp);
        xfrm_input_resume(skb, err);
}



/*
 * xfrm input handler for IPv6 AH: verify the ICV of @skb with the ahash
 * transform of state @x and, on success, remove the AH header.
 *
 * Scratch buffer layout (see ah_alloc_tmp() and the pointer setup below):
 *   [saved headers: hdr_len bytes][received ICV: icv_trunc_len bytes]
 *   [seqhi: ESN only][computed icv][ahash_request + context]
 *   [scatterlist x (nfrags + sglists)]
 *
 * Returns the next-header value taken from the AH header when verification
 * completed synchronously, -EINPROGRESS when the digest was queued
 * (ah6_input_done() then finishes the packet and frees the buffer),
 * -EBADMSG on an ICV mismatch, or another negative errno on failure.
 */
static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
{
        /*
         * Before AH processing:
         * [IPv6][Ext1][Ext2][AH][Dest][Payload]
         * |<-------------->| hdr_len
         *
         * To erase AH:
         * Keep a copy of the headers as received (before the mutable fields
         * are cleared). After AH processing, advance skb->network_header by
         * the AH header length, then copy the saved hdr_len bytes back to
         * the new network header position, directly in front of whatever
         * followed AH (e.g. a destination options header).
         *
         * |<>|[IPv6][Ext1][Ext2][Dest][Payload]
         * After processing, a gap the size of the AH header is left in
         * front of the IPv6 header.
         */

        u8 *auth_data;
        u8 *icv;
        u8 *work_iph;
        struct sk_buff *trailer;
        struct crypto_ahash *ahash;
        struct ahash_request *req;
        struct scatterlist *sg;
        struct ip_auth_hdr *ah;
        struct ipv6hdr *ip6h;
        struct ah_data *ahp;
        u16 hdr_len;
        u16 ah_hlen;
        int nexthdr;
        int nfrags;
        int err = -ENOMEM;
        int seqhi_len = 0;
        __be32 *seqhi;
        int sglists = 0;
        struct scatterlist *seqhisg;

        if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
                goto out;

        /* We are going to _remove_ AH header to keep sockets happy,
         * so... Later this can change. */
        if (skb_unclone(skb, GFP_ATOMIC))
                goto out;

        skb->ip_summed = CHECKSUM_NONE;

        hdr_len = skb_network_header_len(skb);
        ah = (struct ip_auth_hdr *)skb->data;
        ahp = x->data;
        ahash = ahp->ahash;

        nexthdr = ah->nexthdr;
        ah_hlen = ipv6_authlen(ah);

        /* Only AH headers sized for the full or the truncated ICV are
         * accepted.
         */
        if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
            ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
                goto out;

        if (!pskb_may_pull(skb, ah_hlen))
                goto out;

        /* A non-negative return value is used as the scatterlist size. */
        err = skb_cow_data(skb, 0, &trailer);
        if (err < 0)
                goto out;
        nfrags = err;

        /* Reload the header pointers after the calls above. */
        ah = (struct ip_auth_hdr *)skb->data;
        ip6h = ipv6_hdr(skb);

        skb_push(skb, hdr_len);

        /* ESN adds one extra scatterlist entry carrying the high 32 bits. */
        if (x->props.flags & XFRM_STATE_ESN) {
                sglists = 1;
                seqhi_len = sizeof(*seqhi);
        }

        work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len +
                                ahp->icv_trunc_len + seqhi_len);
        if (!work_iph) {
                err = -ENOMEM;
                goto out;
        }

        /* Carve the scratch buffer into its sub-areas. */
        auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len);
        seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
        icv = ah_tmp_icv(seqhi, seqhi_len);
        req = ah_tmp_req(ahash, icv);
        sg = ah_req_sg(ahash, req);
        seqhisg = sg + nfrags;

        /* Save the headers and the received ICV, then hash the packet with
         * the ICV field zeroed.
         */
        memcpy(work_iph, ip6h, hdr_len);
        memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
        memset(ah->auth_data, 0, ahp->icv_trunc_len);

        err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN);
        if (err)
                goto out_free;

        /* Zero these base header fields for hashing; the saved copy is
         * written back after verification.
         */
        ip6h->priority    = 0;
        ip6h->flow_lbl[0] = 0;
        ip6h->flow_lbl[1] = 0;
        ip6h->flow_lbl[2] = 0;
        ip6h->hop_limit   = 0;

        sg_init_table(sg, nfrags + sglists);
        err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
        if (unlikely(err < 0))
                goto out_free;

        if (x->props.flags & XFRM_STATE_ESN) {
                /* Attach seqhi sg right after packet payload */
                *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
                sg_set_buf(seqhisg, seqhi, seqhi_len);
        }

        ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
        ahash_request_set_callback(req, 0, ah6_input_done, skb);

        /* Let the completion callback find (and free) the scratch buffer. */
        AH_SKB_CB(skb)->tmp = work_iph;

        err = crypto_ahash_digest(req);
        if (err) {
                /* Queued: the buffer now belongs to ah6_input_done(). */
                if (err == -EINPROGRESS)
                        goto out;

                goto out_free;
        }

        err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
        if (err)
                goto out_free;

        /* Drop the AH header and put the saved headers in front of the
         * payload.
         */
        skb->network_header += ah_hlen;
        memcpy(skb_network_header(skb), work_iph, hdr_len);
        __skb_pull(skb, ah_hlen + hdr_len);

        if (x->props.mode == XFRM_MODE_TUNNEL)
                skb_reset_transport_header(skb);
        else
                skb_set_transport_header(skb, -hdr_len);

        err = nexthdr;

out_free:
        kfree(work_iph);
out:
        return err;
}

/*
 * ICMPv6 error handler for AH (err_handler of ah6_protocol).
 *
 * Only ICMPV6_PKT_TOOBIG and NDISC_REDIRECT are acted upon, and only when an
 * xfrm state matches the destination address and SPI found in the packet at
 * skb->data / skb->data + @offset.  A redirect is handed to ip6_redirect(),
 * a packet-too-big to ip6_update_pmtu() with @info.  Always returns 0.
 */
static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                   u8 type, u8 code, int offset, __be32 info)
{
        struct net *net = dev_net(skb->dev);
        struct ipv6hdr *iph = (struct ipv6hdr *)skb->data;
        struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+offset);
        struct xfrm_state *x;

        switch (type) {
        case ICMPV6_PKT_TOOBIG:
        case NDISC_REDIRECT:
                break;
        default:
                return 0;
        }

        x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr,
                              ah->spi, IPPROTO_AH, AF_INET6);
        if (!x)
                return 0;

        if (type == ICMPV6_PKT_TOOBIG)
                ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
        else
                ip6_redirect(skb, net, skb->dev->ifindex, 0,
                             sock_net_uid(net, NULL));
        xfrm_state_put(x);

        return 0;
}

/*
 * init_state hook of ah6_type: validate the xfrm state @x for AH use,
 * allocate and key the ahash transform, and derive the ICV lengths and the
 * header length stored in x->props.header_len.
 *
 * On success x->data points to the new struct ah_data and 0 is returned.
 * Returns -ENOMEM when the ah_data allocation fails and -EINVAL for every
 * other failure; for most of those a message is recorded in @extack.
 */
static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        struct ah_data *ahp = NULL;
        struct xfrm_algo_desc *aalg_desc;
        struct crypto_ahash *ahash;

        if (!x->aalg) {
                NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
                goto error;
        }

        if (x->encap) {
                NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
                goto error;
        }

        ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
        if (!ahp)
                return -ENOMEM;

        ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
        if (IS_ERR(ahash)) {
                NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                goto error;
        }

        ahp->ahash = ahash;
        /* NOTE(review): alg_key_len looks like a bit count that is rounded
         * up to whole bytes here; confirm against struct xfrm_algo_auth.
         */
        if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
                               (x->aalg->alg_key_len + 7) / 8)) {
                NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                goto error;
        }

        /*
         * Lookup the algorithm description maintained by xfrm_algo,
         * verify crypto transform properties, and store information
         * we need for AH processing.  This lookup cannot fail here
         * after a successful crypto_alloc_ahash().
         */
        aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
        BUG_ON(!aalg_desc);

        /* The described full ICV size must match the transform's digest. */
        if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
            crypto_ahash_digestsize(ahash)) {
                NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
                goto error;
        }

        ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
        ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;

        x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
                                          ahp->icv_trunc_len);
        switch (x->props.mode) {
        case XFRM_MODE_BEET:
        case XFRM_MODE_TRANSPORT:
                break;
        case XFRM_MODE_TUNNEL:
                /* Tunnel mode also accounts for an IPv6 header. */
                x->props.header_len += sizeof(struct ipv6hdr);
                break;
        default:
                NL_SET_ERR_MSG(extack, "Invalid mode requested for AH, must be one of TRANSPORT, TUNNEL, BEET");
                goto error;
        }
        x->data = ahp;

        return 0;

error:
        /* ahp is still NULL when the checks before the allocation failed;
         * ahp->ahash is still NULL (from kzalloc) when the transform could
         * not be allocated.
         */
        if (ahp) {
                crypto_free_ahash(ahp->ahash);
                kfree(ahp);
        }
        return -EINVAL;
}

/*
 * ah6_destroy - release the AH private data attached to an xfrm state.
 * @x: state whose ->data points at the struct ah_data (may be NULL)
 *
 * Frees the ahash transform first and then the ah_data container. A state
 * that has no private data attached is left untouched.
 */
static void ah6_destroy(struct xfrm_state *x)
{
        struct ah_data *priv = x->data;

        if (priv) {
                crypto_free_ahash(priv->ahash);
                kfree(priv);
        }
}

/*
 * ah6_rcv_cb - receive callback installed as ah6_protocol.cb_handler.
 * @skb: packet being processed (not inspected)
 * @err: status handed in by the caller (not inspected)
 *
 * There is no AH-specific work to do here, so both arguments are ignored.
 *
 * Return: always 0.
 */
static int ah6_rcv_cb(struct sk_buff *skb, int err)
{
        const int status = 0;

        return status;
}

/*
 * xfrm type descriptor for IPPROTO_AH: identifies the owning module and
 * protocol, and wires up the state setup/teardown and packet input/output
 * handlers defined in this file.
 */
static const struct xfrm_type ah6_type = {
        /* identity */
        .owner = THIS_MODULE,
        .proto = IPPROTO_AH,
        .flags = XFRM_TYPE_REPLAY_PROT,

        /* state lifetime */
        .init_state = ah6_init_state,
        .destructor = ah6_destroy,

        /* data path */
        .input = ah6_input,
        .output = ah6_output,
};

/*
 * xfrm6 protocol handlers registered for IPPROTO_AH by ah6_init() and
 * removed again by ah6_fini().
 */
static struct xfrm6_protocol ah6_protocol = {
        .handler = xfrm6_rcv,
        .input_handler = xfrm_input,
        .cb_handler = ah6_rcv_cb,
        .err_handler = ah6_err,
        .priority = 0,
};

/*
 * ah6_init - module entry point.
 *
 * Registers the AH xfrm type for AF_INET6 and then the xfrm6 protocol
 * handlers for IPPROTO_AH. If the second step fails, the type registration
 * is undone before returning.
 *
 * Return: 0 on success, -EAGAIN if either registration fails.
 */
static int __init ah6_init(void)
{
        int rc;

        rc = xfrm_register_type(&ah6_type, AF_INET6);
        if (rc < 0) {
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }

        rc = xfrm6_protocol_register(&ah6_protocol, IPPROTO_AH);
        if (rc >= 0)
                return 0;

        /* Protocol registration failed: roll back the type registration. */
        pr_info("%s: can't add protocol\n", __func__);
        xfrm_unregister_type(&ah6_type, AF_INET6);
        return -EAGAIN;
}

/*
 * ah6_fini - module exit point.
 *
 * Undoes ah6_init() in reverse order: the protocol handlers are removed
 * first (a failure is only logged), then the xfrm type is unregistered.
 */
static void __exit ah6_fini(void)
{
        int rc = xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH);

        if (rc < 0)
                pr_info("%s: can't remove protocol\n", __func__);

        xfrm_unregister_type(&ah6_type, AF_INET6);
}

/* Use ah6_init()/ah6_fini() as the module's init and exit routines. */
module_init(ah6_init);
module_exit(ah6_fini);

/* Module metadata: description, license and the AF_INET6 AH xfrm-type alias. */
MODULE_DESCRIPTION("IPv6 AH transformation helpers");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH);















    5 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_SIMD_H
#define _ASM_SIMD_H

#include <asm/fpu/api.h>
#include <linux/compiler_attributes.h>
#include <linux/types.h>

/**
 * may_use_simd() - check whether SIMD may be used at this point
 *
 * Tells the caller whether it is currently allowable to issue SIMD
 * instructions or to access the SIMD register file. The answer is taken
 * directly from irq_fpu_usable().
 *
 * Return: true if SIMD use is allowed right now, false otherwise.
 */
static __must_check inline bool may_use_simd(void)
{
        bool usable = irq_fpu_usable();

        return usable;
}

#endif        /* _ASM_SIMD_H */


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   39 


































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-long.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_LONG_H
#define _LINUX_ATOMIC_LONG_H

#include <linux/compiler.h>
#include <asm/types.h>

/*
 * atomic_long_t is an alias for atomic64_t when CONFIG_64BIT is set and for
 * atomic_t otherwise. The static initializer and the cond_read helpers are
 * mapped onto the matching atomic64_* or atomic_* names in the same way.
 */
#ifdef CONFIG_64BIT
typedef atomic64_t atomic_long_t;
#define ATOMIC_LONG_INIT(i)                ATOMIC64_INIT(i)
#define atomic_long_cond_read_acquire        atomic64_cond_read_acquire
#define atomic_long_cond_read_relaxed        atomic64_cond_read_relaxed
#else
typedef atomic_t atomic_long_t;
#define ATOMIC_LONG_INIT(i)                ATOMIC_INIT(i)
#define atomic_long_cond_read_acquire        atomic_cond_read_acquire
#define atomic_long_cond_read_relaxed        atomic_cond_read_relaxed
#endif

/**
 * raw_atomic_long_read() - atomic load with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Performs a relaxed-ordering atomic load of @v, using raw_atomic64_read()
 * when CONFIG_64BIT is set and raw_atomic_read() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_read() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
raw_atomic_long_read(const atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_read(v);
#else
        return raw_atomic64_read(v);
#endif
}

/**
 * raw_atomic_long_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Performs an acquire-ordering atomic load of @v, using
 * raw_atomic64_read_acquire() when CONFIG_64BIT is set and
 * raw_atomic_read_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
raw_atomic_long_read_acquire(const atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_read_acquire(v);
#else
        return raw_atomic64_read_acquire(v);
#endif
}

/**
 * raw_atomic_long_set() - atomic set with relaxed ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically stores @i into @v with relaxed ordering, using
 * raw_atomic64_set() when CONFIG_64BIT is set and raw_atomic_set()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_set() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_set(atomic_long_t *v, long i)
{
#ifndef CONFIG_64BIT
        raw_atomic_set(v, i);
#else
        raw_atomic64_set(v, i);
#endif
}

/**
 * raw_atomic_long_set_release() - atomic set with release ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically stores @i into @v with release ordering, using
 * raw_atomic64_set_release() when CONFIG_64BIT is set and
 * raw_atomic_set_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_set_release(atomic_long_t *v, long i)
{
#ifndef CONFIG_64BIT
        raw_atomic_set_release(v, i);
#else
        raw_atomic64_set_release(v, i);
#endif
}

/**
 * raw_atomic_long_add() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using relaxed ordering. Uses
 * raw_atomic64_add() when CONFIG_64BIT is set and raw_atomic_add()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_add(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_add(i, v);
#else
        raw_atomic64_add(i, v);
#endif
}

/**
 * raw_atomic_long_add_return() - atomic add with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using full ordering. Uses
 * raw_atomic64_add_return() when CONFIG_64BIT is set and
 * raw_atomic_add_return() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_add_return(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_return(i, v);
#else
        return raw_atomic64_add_return(i, v);
#endif
}

/**
 * raw_atomic_long_add_return_acquire() - atomic add with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using acquire ordering. Uses
 * raw_atomic64_add_return_acquire() when CONFIG_64BIT is set and
 * raw_atomic_add_return_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_add_return_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_return_acquire(i, v);
#else
        return raw_atomic64_add_return_acquire(i, v);
#endif
}

/**
 * raw_atomic_long_add_return_release() - atomic add with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using release ordering. Uses
 * raw_atomic64_add_return_release() when CONFIG_64BIT is set and
 * raw_atomic_add_return_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_add_return_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_return_release(i, v);
#else
        return raw_atomic64_add_return_release(i, v);
#endif
}

/**
 * raw_atomic_long_add_return_relaxed() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using relaxed ordering. Uses
 * raw_atomic64_add_return_relaxed() when CONFIG_64BIT is set and
 * raw_atomic_add_return_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_return_relaxed(i, v);
#else
        return raw_atomic64_add_return_relaxed(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_add() - atomic add with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using full ordering. Uses
 * raw_atomic64_fetch_add() when CONFIG_64BIT is set and
 * raw_atomic_fetch_add() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add(i, v);
#else
        return raw_atomic64_fetch_add(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_add_acquire() - atomic add with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using acquire ordering. Uses
 * raw_atomic64_fetch_add_acquire() when CONFIG_64BIT is set and
 * raw_atomic_fetch_add_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add_acquire(i, v);
#else
        return raw_atomic64_fetch_add_acquire(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_add_release() - atomic add with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using release ordering. Uses
 * raw_atomic64_fetch_add_release() when CONFIG_64BIT is set and
 * raw_atomic_fetch_add_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add_release(i, v);
#else
        return raw_atomic64_fetch_add_release(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i) using relaxed ordering. Uses
 * raw_atomic64_fetch_add_relaxed() when CONFIG_64BIT is set and
 * raw_atomic_fetch_add_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add_relaxed(i, v);
#else
        return raw_atomic64_fetch_add_relaxed(i, v);
#endif
}

/**
 * raw_atomic_long_sub() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using relaxed ordering. Uses
 * raw_atomic64_sub() when CONFIG_64BIT is set and raw_atomic_sub()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_sub(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_sub(i, v);
#else
        raw_atomic64_sub(i, v);
#endif
}

/**
 * raw_atomic_long_sub_return() - atomic subtract with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using full ordering. Uses
 * raw_atomic64_sub_return() when CONFIG_64BIT is set and
 * raw_atomic_sub_return() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_sub_return(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_return(i, v);
#else
        return raw_atomic64_sub_return(i, v);
#endif
}

/**
 * raw_atomic_long_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using acquire ordering. Uses
 * raw_atomic64_sub_return_acquire() when CONFIG_64BIT is set and
 * raw_atomic_sub_return_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_return_acquire(i, v);
#else
        return raw_atomic64_sub_return_acquire(i, v);
#endif
}

/**
 * raw_atomic_long_sub_return_release() - atomic subtract with release ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using release ordering. Uses
 * raw_atomic64_sub_return_release() when CONFIG_64BIT is set and
 * raw_atomic_sub_return_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_sub_return_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_return_release(i, v);
#else
        return raw_atomic64_sub_return_release(i, v);
#endif
}

/**
 * raw_atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using relaxed ordering. Uses
 * raw_atomic64_sub_return_relaxed() when CONFIG_64BIT is set and
 * raw_atomic_sub_return_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_return_relaxed(i, v);
#else
        return raw_atomic64_sub_return_relaxed(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_sub() - atomic subtract with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using full ordering. Uses
 * raw_atomic64_fetch_sub() when CONFIG_64BIT is set and
 * raw_atomic_fetch_sub() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_sub() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_sub(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_sub(i, v);
#else
        return raw_atomic64_fetch_sub(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using acquire ordering. Uses
 * raw_atomic64_fetch_sub_acquire() when CONFIG_64BIT is set and
 * raw_atomic_fetch_sub_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_sub_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_sub_acquire(i, v);
#else
        return raw_atomic64_fetch_sub_acquire(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_sub_release() - atomic subtract with release ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using release ordering. Uses
 * raw_atomic64_fetch_sub_release() when CONFIG_64BIT is set and
 * raw_atomic_fetch_sub_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_sub_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_sub_release(i, v);
#else
        return raw_atomic64_fetch_sub_release(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i) using relaxed ordering. Uses
 * raw_atomic64_fetch_sub_relaxed() when CONFIG_64BIT is set and
 * raw_atomic_fetch_sub_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_sub_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_sub_relaxed(i, v);
#else
        return raw_atomic64_fetch_sub_relaxed(i, v);
#endif
}

/**
 * raw_atomic_long_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using relaxed ordering. Uses
 * raw_atomic64_inc() when CONFIG_64BIT is set and raw_atomic_inc()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_inc(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_inc(v);
#else
        raw_atomic64_inc(v);
#endif
}

/**
 * raw_atomic_long_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using full ordering. Uses
 * raw_atomic64_inc_return() when CONFIG_64BIT is set and
 * raw_atomic_inc_return() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_inc_return(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_return(v);
#else
        return raw_atomic64_inc_return(v);
#endif
}

/**
 * raw_atomic_long_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using acquire ordering. Uses
 * raw_atomic64_inc_return_acquire() when CONFIG_64BIT is set and
 * raw_atomic_inc_return_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_inc_return_acquire(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_return_acquire(v);
#else
        return raw_atomic64_inc_return_acquire(v);
#endif
}

/**
 * raw_atomic_long_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using release ordering. Uses
 * raw_atomic64_inc_return_release() when CONFIG_64BIT is set and
 * raw_atomic_inc_return_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_inc_return_release(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_return_release(v);
#else
        return raw_atomic64_inc_return_release(v);
#endif
}

/**
 * raw_atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using relaxed ordering. Uses
 * raw_atomic64_inc_return_relaxed() when CONFIG_64BIT is set and
 * raw_atomic_inc_return_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_inc_return_relaxed(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_return_relaxed(v);
#else
        return raw_atomic64_inc_return_relaxed(v);
#endif
}

/**
 * raw_atomic_long_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using full ordering. Uses
 * raw_atomic64_fetch_inc() when CONFIG_64BIT is set and
 * raw_atomic_fetch_inc() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_inc() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_inc(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_inc(v);
#else
        return raw_atomic64_fetch_inc(v);
#endif
}

/**
 * raw_atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using acquire ordering. Uses
 * raw_atomic64_fetch_inc_acquire() when CONFIG_64BIT is set and
 * raw_atomic_fetch_inc_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_inc_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_inc_acquire(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_inc_acquire(v);
#else
        return raw_atomic64_fetch_inc_acquire(v);
#endif
}

/**
 * raw_atomic_long_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using release ordering. Uses
 * raw_atomic64_fetch_inc_release() when CONFIG_64BIT is set and
 * raw_atomic_fetch_inc_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_inc_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_inc_release(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_inc_release(v);
#else
        return raw_atomic64_fetch_inc_release(v);
#endif
}

/**
 * raw_atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1) using relaxed ordering. Uses
 * raw_atomic64_fetch_inc_relaxed() when CONFIG_64BIT is set and
 * raw_atomic_fetch_inc_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_inc_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_inc_relaxed(v);
#else
        return raw_atomic64_fetch_inc_relaxed(v);
#endif
}

/**
 * raw_atomic_long_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1) using relaxed ordering. Uses
 * raw_atomic64_dec() when CONFIG_64BIT is set and raw_atomic_dec()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_dec(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_dec(v);
#else
        raw_atomic64_dec(v);
#endif
}

/**
 * raw_atomic_long_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1) using full ordering. Uses
 * raw_atomic64_dec_return() when CONFIG_64BIT is set and
 * raw_atomic_dec_return() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_dec_return(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_return(v);
#else
        return raw_atomic64_dec_return(v);
#endif
}

/**
 * raw_atomic_long_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1) using acquire ordering. Uses
 * raw_atomic64_dec_return_acquire() when CONFIG_64BIT is set and
 * raw_atomic_dec_return_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_dec_return_acquire(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_return_acquire(v);
#else
        return raw_atomic64_dec_return_acquire(v);
#endif
}

/**
 * raw_atomic_long_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1) using release ordering. Uses
 * raw_atomic64_dec_return_release() when CONFIG_64BIT is set and
 * raw_atomic_dec_return_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_dec_return_release(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_return_release(v);
#else
        return raw_atomic64_dec_return_release(v);
#endif
}

/**
 * raw_atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1) using relaxed ordering. Uses
 * raw_atomic64_dec_return_relaxed() when CONFIG_64BIT is set and
 * raw_atomic_dec_return_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_dec_return_relaxed(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_return_relaxed(v);
#else
        return raw_atomic64_dec_return_relaxed(v);
#endif
}

/**
 * raw_atomic_long_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1) using full ordering. Uses
 * raw_atomic64_fetch_dec() when CONFIG_64BIT is set and
 * raw_atomic_fetch_dec() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_dec() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_dec(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_dec(v);
#else
        return raw_atomic64_fetch_dec(v);
#endif
}

/**
 * raw_atomic_long_fetch_dec_acquire() - decrement an atomic_long_t and fetch the old value, acquire ordering
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v - 1) into @v, using acquire ordering.
 *
 * Dispatches to raw_atomic_fetch_dec_acquire() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_dec_acquire() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_dec_acquire()
 * is the preferred interface.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline long
raw_atomic_long_fetch_dec_acquire(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_dec_acquire(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_dec_acquire(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_dec_release() - decrement an atomic_long_t and fetch the old value, release ordering
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v - 1) into @v, using release ordering.
 *
 * Dispatches to raw_atomic_fetch_dec_release() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_dec_release() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_dec_release()
 * is the preferred interface.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline long
raw_atomic_long_fetch_dec_release(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_dec_release(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_dec_release(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_dec_relaxed() - decrement an atomic_long_t and fetch the old value, relaxed ordering
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v - 1) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_fetch_dec_relaxed() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_dec_relaxed() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_dec_relaxed()
 * is the preferred interface.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline long
raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_dec_relaxed(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_dec_relaxed(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_and() - bitwise AND into an atomic_long_t, relaxed ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & @i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_and() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_and() is used.
 * Usable from noinstr code; everywhere else atomic_long_and()
 * is the preferred interface.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_and(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_and(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_and(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and() - bitwise AND into an atomic_long_t and fetch the old value, full ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & @i) into @v, using full ordering.
 *
 * Dispatches to raw_atomic_fetch_and() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_and() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_and()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_and(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_and(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and_acquire() - bitwise AND into an atomic_long_t and fetch the old value, acquire ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & @i) into @v, using acquire ordering.
 *
 * Dispatches to raw_atomic_fetch_and_acquire() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_and_acquire() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_and_acquire()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_and_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_and_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and_release() - bitwise AND into an atomic_long_t and fetch the old value, release ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & @i) into @v, using release ordering.
 *
 * Dispatches to raw_atomic_fetch_and_release() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_and_release() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_and_release()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_and_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_and_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and_relaxed() - bitwise AND into an atomic_long_t and fetch the old value, relaxed ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & @i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_fetch_and_relaxed() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_and_relaxed() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_and_relaxed()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_and_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_and_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_andnot() - bitwise AND NOT into an atomic_long_t, relaxed ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & ~@i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_andnot() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_andnot() is used.
 * Usable from noinstr code; everywhere else atomic_long_andnot()
 * is the preferred interface.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_andnot(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_andnot(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_andnot(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot() - bitwise AND NOT into an atomic_long_t and fetch the old value, full ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & ~@i) into @v, using full ordering.
 *
 * Dispatches to raw_atomic_fetch_andnot() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_andnot() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_andnot()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_andnot(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_andnot(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot_acquire() - bitwise AND NOT into an atomic_long_t and fetch the old value, acquire ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & ~@i) into @v, using acquire ordering.
 *
 * Dispatches to raw_atomic_fetch_andnot_acquire() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_fetch_andnot_acquire() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_andnot_acquire()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_andnot_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_andnot_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot_release() - bitwise AND NOT into an atomic_long_t and fetch the old value, release ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & ~@i) into @v, using release ordering.
 *
 * Dispatches to raw_atomic_fetch_andnot_release() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_fetch_andnot_release() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_andnot_release()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_andnot_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_andnot_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot_relaxed() - bitwise AND NOT into an atomic_long_t and fetch the old value, relaxed ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v & ~@i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_fetch_andnot_relaxed() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_fetch_andnot_relaxed() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_andnot_relaxed()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_andnot_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_andnot_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_or() - bitwise OR into an atomic_long_t, relaxed ordering
 * @i: long mask to OR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v | @i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_or() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_or() is used.
 * Usable from noinstr code; everywhere else atomic_long_or()
 * is the preferred interface.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_or(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_or(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_or(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or() - bitwise OR into an atomic_long_t and fetch the old value, full ordering
 * @i: long mask to OR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v | @i) into @v, using full ordering.
 *
 * Dispatches to raw_atomic_fetch_or() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_or() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_or()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_or(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_or(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_or(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_acquire() - bitwise OR into an atomic_long_t and fetch the old value, acquire ordering
 * @i: long mask to OR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v | @i) into @v, using acquire ordering.
 *
 * Dispatches to raw_atomic_fetch_or_acquire() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_or_acquire() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_or_acquire()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_or_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_or_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_release() - bitwise OR into an atomic_long_t and fetch the old value, release ordering
 * @i: long mask to OR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v | @i) into @v, using release ordering.
 *
 * Dispatches to raw_atomic_fetch_or_release() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_or_release() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_or_release()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_or_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_or_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_or_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_relaxed() - bitwise OR into an atomic_long_t and fetch the old value, relaxed ordering
 * @i: long mask to OR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v | @i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_fetch_or_relaxed() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_or_relaxed() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_or_relaxed()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_or_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_or_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xor() - bitwise XOR into an atomic_long_t, relaxed ordering
 * @i: long mask to XOR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v ^ @i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_xor() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_xor() is used.
 * Usable from noinstr code; everywhere else atomic_long_xor()
 * is the preferred interface.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_xor(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_xor(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_xor(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor() - bitwise XOR into an atomic_long_t and fetch the old value, full ordering
 * @i: long mask to XOR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v ^ @i) into @v, using full ordering.
 *
 * Dispatches to raw_atomic_fetch_xor() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_xor() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_xor()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_xor(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_xor(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_xor(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_acquire() - bitwise XOR into an atomic_long_t and fetch the old value, acquire ordering
 * @i: long mask to XOR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v ^ @i) into @v, using acquire ordering.
 *
 * Dispatches to raw_atomic_fetch_xor_acquire() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_xor_acquire() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_xor_acquire()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_xor_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_xor_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_release() - bitwise XOR into an atomic_long_t and fetch the old value, release ordering
 * @i: long mask to XOR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v ^ @i) into @v, using release ordering.
 *
 * Dispatches to raw_atomic_fetch_xor_release() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_xor_release() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_xor_release()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_xor_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_xor_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_relaxed() - bitwise XOR into an atomic_long_t and fetch the old value, relaxed ordering
 * @i: long mask to XOR in
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v ^ @i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_fetch_xor_relaxed() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_xor_relaxed() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_xor_relaxed()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_xor_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_xor_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg() - exchange the value of an atomic_long_t, full ordering
 * @v: atomic_long_t to operate on
 * @new: long value to store
 *
 * Atomically stores @new into @v, using full ordering.
 *
 * Dispatches to raw_atomic_xchg() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_xchg() is used.
 * Usable from noinstr code; everywhere else atomic_long_xchg()
 * is the preferred interface.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline long
raw_atomic_long_xchg(atomic_long_t *v, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_xchg(v, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_xchg(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_acquire() - exchange the value of an atomic_long_t, acquire ordering
 * @v: atomic_long_t to operate on
 * @new: long value to store
 *
 * Atomically stores @new into @v, using acquire ordering.
 *
 * Dispatches to raw_atomic_xchg_acquire() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_xchg_acquire() is used.
 * Usable from noinstr code; everywhere else atomic_long_xchg_acquire()
 * is the preferred interface.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline long
raw_atomic_long_xchg_acquire(atomic_long_t *v, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_xchg_acquire(v, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_xchg_acquire(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_release() - exchange the value of an atomic_long_t, release ordering
 * @v: atomic_long_t to operate on
 * @new: long value to store
 *
 * Atomically stores @new into @v, using release ordering.
 *
 * Dispatches to raw_atomic_xchg_release() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_xchg_release() is used.
 * Usable from noinstr code; everywhere else atomic_long_xchg_release()
 * is the preferred interface.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline long
raw_atomic_long_xchg_release(atomic_long_t *v, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_xchg_release(v, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_xchg_release(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_relaxed() - exchange the value of an atomic_long_t, relaxed ordering
 * @v: atomic_long_t to operate on
 * @new: long value to store
 *
 * Atomically stores @new into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_xchg_relaxed() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_xchg_relaxed() is used.
 * Usable from noinstr code; everywhere else atomic_long_xchg_relaxed()
 * is the preferred interface.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline long
raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_xchg_relaxed(v, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_xchg_relaxed(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg() - compare and exchange an atomic_long_t, full ordering
 * @v: atomic_long_t to operate on
 * @old: long value @v is expected to hold
 * @new: long value to store
 *
 * If (@v == @old), atomically stores @new into @v, using full ordering.
 * If the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * Dispatches to raw_atomic_cmpxchg() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_cmpxchg() is used.
 * Usable from noinstr code; everywhere else atomic_long_cmpxchg()
 * is the preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_cmpxchg(v, old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_cmpxchg(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg_acquire() - compare and exchange an atomic_long_t, acquire ordering
 * @v: atomic_long_t to operate on
 * @old: long value @v is expected to hold
 * @new: long value to store
 *
 * If (@v == @old), atomically stores @new into @v, using acquire ordering.
 * If the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * Dispatches to raw_atomic_cmpxchg_acquire() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_cmpxchg_acquire() is used.
 * Usable from noinstr code; everywhere else atomic_long_cmpxchg_acquire()
 * is the preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_cmpxchg_acquire(v, old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_cmpxchg_acquire(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg_release() - compare and exchange an atomic_long_t, release ordering
 * @v: atomic_long_t to operate on
 * @old: long value @v is expected to hold
 * @new: long value to store
 *
 * If (@v == @old), atomically stores @new into @v, using release ordering.
 * If the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * Dispatches to raw_atomic_cmpxchg_release() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_cmpxchg_release() is used.
 * Usable from noinstr code; everywhere else atomic_long_cmpxchg_release()
 * is the preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_cmpxchg_release(v, old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_cmpxchg_release(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg_relaxed() - compare and exchange an atomic_long_t, relaxed ordering
 * @v: atomic_long_t to operate on
 * @old: long value @v is expected to hold
 * @new: long value to store
 *
 * If (@v == @old), atomically stores @new into @v, using relaxed ordering.
 * If the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * Dispatches to raw_atomic_cmpxchg_relaxed() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_cmpxchg_relaxed() is used.
 * Usable from noinstr code; everywhere else atomic_long_cmpxchg_relaxed()
 * is the preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_cmpxchg_relaxed(v, old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_cmpxchg_relaxed(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_try_cmpxchg() - try to compare and exchange an atomic_long_t, full ordering
 * @v: atomic_long_t to operate on
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store
 *
 * If (@v == @old), atomically stores @new into @v, using full ordering.
 * If the comparison fails, @v is left untouched, the current value of @v is
 * written back through @old, and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_try_cmpxchg() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_try_cmpxchg() is used; @old is cast to
 * (int *) or (s64 *) respectively to match the underlying op.
 * Usable from noinstr code; everywhere else atomic_long_try_cmpxchg()
 * is the preferred interface.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_try_cmpxchg(v, (int *)old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_try_cmpxchg(v, (s64 *)old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_try_cmpxchg_acquire() - try to compare and exchange an atomic_long_t, acquire ordering
 * @v: atomic_long_t to operate on
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store
 *
 * If (@v == @old), atomically stores @new into @v, using acquire ordering.
 * If the comparison fails, @v is left untouched, the current value of @v is
 * written back through @old, and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_try_cmpxchg_acquire() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_try_cmpxchg_acquire() is used; @old is
 * cast to (int *) or (s64 *) respectively to match the underlying op.
 * Usable from noinstr code; everywhere else atomic_long_try_cmpxchg_acquire()
 * is the preferred interface.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_try_cmpxchg_acquire(v, (int *)old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_try_cmpxchg_release() - try to compare and exchange an atomic_long_t, release ordering
 * @v: atomic_long_t to operate on
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store
 *
 * If (@v == @old), atomically stores @new into @v, using release ordering.
 * If the comparison fails, @v is left untouched, the current value of @v is
 * written back through @old, and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_try_cmpxchg_release() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_try_cmpxchg_release() is used; @old is
 * cast to (int *) or (s64 *) respectively to match the underlying op.
 * Usable from noinstr code; everywhere else atomic_long_try_cmpxchg_release()
 * is the preferred interface.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_try_cmpxchg_release(v, (int *)old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_try_cmpxchg_relaxed() - try to compare and exchange an atomic_long_t, relaxed ordering
 * @v: atomic_long_t to operate on
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store
 *
 * If (@v == @old), atomically stores @new into @v, using relaxed ordering.
 * If the comparison fails, @v is left untouched, the current value of @v is
 * written back through @old, and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_try_cmpxchg_relaxed() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_try_cmpxchg_relaxed() is used; @old is
 * cast to (int *) or (s64 *) respectively to match the underlying op.
 * Usable from noinstr code; everywhere else atomic_long_try_cmpxchg_relaxed()
 * is the preferred interface.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_sub_and_test() - subtract from an atomic_long_t and test for zero, full ordering
 * @i: long value to subtract
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v - @i) into @v, using full ordering.
 *
 * Dispatches to raw_atomic_sub_and_test() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_sub_and_test() is used.
 * Usable from noinstr code; everywhere else atomic_long_sub_and_test()
 * is the preferred interface.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_sub_and_test(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_and_test(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_sub_and_test(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_and_test() - decrement an atomic_long_t and test for zero, full ordering
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v - 1) into @v, using full ordering.
 *
 * Dispatches to raw_atomic_dec_and_test() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_dec_and_test() is used.
 * Usable from noinstr code; everywhere else atomic_long_dec_and_test()
 * is the preferred interface.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_dec_and_test(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_and_test(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_dec_and_test(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_and_test() - increment an atomic_long_t and test for zero, full ordering
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v + 1) into @v, using full ordering.
 *
 * Dispatches to raw_atomic_inc_and_test() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_inc_and_test() is used.
 * Usable from noinstr code; everywhere else atomic_long_inc_and_test()
 * is the preferred interface.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_and_test(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_and_test(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_inc_and_test(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_negative() - add to an atomic_long_t and test for negative, full ordering
 * @i: long value to add
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v + @i) into @v, using full ordering.
 *
 * Dispatches to raw_atomic_add_negative() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_add_negative() is used.
 * Usable from noinstr code; everywhere else atomic_long_add_negative()
 * is the preferred interface.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_negative(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_negative(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_negative_acquire() - add to an atomic_long_t and test for negative, acquire ordering
 * @i: long value to add
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v + @i) into @v, using acquire ordering.
 *
 * Dispatches to raw_atomic_add_negative_acquire() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_add_negative_acquire() is used.
 * Usable from noinstr code; everywhere else atomic_long_add_negative_acquire()
 * is the preferred interface.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_negative_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_negative_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_negative_release() - add to an atomic_long_t and test for negative, release ordering
 * @i: long value to add
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v + @i) into @v, using release ordering.
 *
 * Dispatches to raw_atomic_add_negative_release() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_add_negative_release() is used.
 * Usable from noinstr code; everywhere else atomic_long_add_negative_release()
 * is the preferred interface.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_negative_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_negative_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_negative_relaxed() - add to an atomic_long_t and test for negative, relaxed ordering
 * @i: long value to add
 * @v: atomic_long_t to operate on
 *
 * Atomically stores (@v + @i) into @v, using relaxed ordering.
 *
 * Dispatches to raw_atomic_add_negative_relaxed() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_add_negative_relaxed() is used.
 * Usable from noinstr code; everywhere else atomic_long_add_negative_relaxed()
 * is the preferred interface.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_negative_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_negative_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_add_unless() - add to an atomic_long_t unless it holds a given value, full ordering
 * @v: atomic_long_t to operate on
 * @a: long value to add
 * @u: long value that suppresses the addition
 *
 * If (@v != @u), atomically stores (@v + @a) into @v, using full ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_fetch_add_unless() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_fetch_add_unless() is used.
 * Usable from noinstr code; everywhere else atomic_long_fetch_add_unless()
 * is the preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add_unless(v, a, u);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_add_unless(v, a, u);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_unless() - add to an atomic_long_t unless it holds a given value, full ordering
 * @v: atomic_long_t to operate on
 * @a: long value to add
 * @u: long value that suppresses the addition
 *
 * If (@v != @u), atomically stores (@v + @a) into @v, using full ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_add_unless() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_add_unless() is used.
 * Usable from noinstr code; everywhere else atomic_long_add_unless()
 * is the preferred interface.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_unless(atomic_long_t *v, long a, long u)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_unless(v, a, u);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_unless(v, a, u);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_not_zero() - increment an atomic_long_t unless it is zero, full ordering
 * @v: atomic_long_t to operate on
 *
 * If (@v != 0), atomically stores (@v + 1) into @v, using full ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_inc_not_zero() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_inc_not_zero() is used.
 * Usable from noinstr code; everywhere else atomic_long_inc_not_zero()
 * is the preferred interface.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_not_zero(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_not_zero(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_inc_not_zero(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_unless_negative() - increment an atomic_long_t unless it is negative, full ordering
 * @v: atomic_long_t to operate on
 *
 * If (@v >= 0), atomically stores (@v + 1) into @v, using full ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_inc_unless_negative() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_inc_unless_negative() is used.
 * Usable from noinstr code; everywhere else atomic_long_inc_unless_negative()
 * is the preferred interface.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_unless_negative(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_unless_negative(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_inc_unless_negative(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_unless_positive() - decrement an atomic_long_t unless it is positive, full ordering
 * @v: atomic_long_t to operate on
 *
 * If (@v <= 0), atomically stores (@v - 1) into @v, using full ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_dec_unless_positive() unless CONFIG_64BIT is
 * defined, in which case raw_atomic64_dec_unless_positive() is used.
 * Usable from noinstr code; everywhere else atomic_long_dec_unless_positive()
 * is the preferred interface.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_dec_unless_positive(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_unless_positive(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_dec_unless_positive(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_if_positive() - decrement an atomic_long_t if it is positive, full ordering
 * @v: atomic_long_t to operate on
 *
 * If (@v > 0), atomically stores (@v - 1) into @v, using full ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * Dispatches to raw_atomic_dec_if_positive() unless CONFIG_64BIT is defined,
 * in which case raw_atomic64_dec_if_positive() is used.
 * Usable from noinstr code; everywhere else atomic_long_dec_if_positive()
 * is the preferred interface.
 *
 * Return: The old value of @v minus one, regardless of whether @v was updated.
 */
static __always_inline long
raw_atomic_long_dec_if_positive(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_if_positive(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_dec_if_positive(v);
#endif /* CONFIG_64BIT */
}

#endif /* _LINUX_ATOMIC_LONG_H */
// eadf183c3600b8b92b91839dd3be6bcc560c752d




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __NET_CFG80211_H
#define __NET_CFG80211_H
/*
 * 802.11 device and configuration interface
 *
 * Copyright 2006-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014 Intel Mobile Communications GmbH
 * Copyright 2015-2017        Intel Deutschland GmbH
 * Copyright (C) 2018-2025 Intel Corporation
 */

#include <linux/ethtool.h>
#include <uapi/linux/rfkill.h>
#include <linux/netdevice.h>
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/bug.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <linux/nl80211.h>
#include <linux/if_ether.h>
#include <linux/ieee80211.h>
#include <linux/net.h>
#include <linux/rfkill.h>
#include <net/regulatory.h>

/**
 * DOC: Introduction
 *
 * cfg80211 is the configuration API for 802.11 devices in Linux. It bridges
 * userspace and drivers, and offers some utility functionality associated
 * with 802.11. cfg80211 must, directly or indirectly via mac80211, be used
 * by all modern wireless drivers in Linux, so that they offer a consistent
 * API through nl80211. For backward compatibility, cfg80211 also offers
 * wireless extensions to userspace, but hides them from drivers completely.
 *
 * Additionally, cfg80211 contains code to help enforce regulatory spectrum
 * use restrictions.
 */


/**
 * DOC: Device registration
 *
 * In order for a driver to use cfg80211, it must register the hardware device
 * with cfg80211. This happens through a number of hardware capability structs
 * described below.
 *
 * The fundamental structure for each device is the 'wiphy', of which each
 * instance describes a physical wireless device connected to the system. Each
 * such wiphy can have zero, one, or many virtual interfaces associated with
 * it, which need to be identified as such by pointing the network interface's
 * @ieee80211_ptr pointer to a &struct wireless_dev which further describes
 * the wireless part of the interface. Normally this struct is embedded in the
 * network interface's private data area. Drivers can optionally allow creating
 * or destroying virtual interfaces on the fly, but without at least one
 * interface, or the ability to create some, the wireless device isn't useful.
 *
 * Each wiphy structure contains device capability information, and also has
 * a pointer to the various operations the driver offers. The definitions and
 * structures here describe these capabilities in detail.
 */

struct wiphy;

/*
 * wireless hardware capability structures
 */

/**
 * enum ieee80211_channel_flags - channel flags
 *
 * Channel flags set by the regulatory control code.
 *
 * @IEEE80211_CHAN_DISABLED: This channel is disabled.
 * @IEEE80211_CHAN_NO_IR: do not initiate radiation, this includes
 *        sending probe requests or beaconing.
 * @IEEE80211_CHAN_PSD: Power spectral density (in dBm) is set for this
 *        channel.
 * @IEEE80211_CHAN_RADAR: Radar detection is required on this channel.
 * @IEEE80211_CHAN_NO_HT40PLUS: extension channel above this channel
 *        is not permitted.
 * @IEEE80211_CHAN_NO_HT40MINUS: extension channel below this channel
 *        is not permitted.
 * @IEEE80211_CHAN_NO_OFDM: OFDM is not allowed on this channel.
 * @IEEE80211_CHAN_NO_80MHZ: If the driver supports 80 MHz on the band,
 *        this flag indicates that an 80 MHz channel cannot use this
 *        channel as the control or any of the secondary channels.
 *        This may be due to the driver or due to regulatory bandwidth
 *        restrictions.
 * @IEEE80211_CHAN_NO_160MHZ: If the driver supports 160 MHz on the band,
 *        this flag indicates that an 160 MHz channel cannot use this
 *        channel as the control or any of the secondary channels.
 *        This may be due to the driver or due to regulatory bandwidth
 *        restrictions.
 * @IEEE80211_CHAN_INDOOR_ONLY: see %NL80211_FREQUENCY_ATTR_INDOOR_ONLY
 * @IEEE80211_CHAN_IR_CONCURRENT: see %NL80211_FREQUENCY_ATTR_IR_CONCURRENT
 * @IEEE80211_CHAN_NO_20MHZ: 20 MHz bandwidth is not permitted
 *        on this channel.
 * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted
 *        on this channel.
 * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel.
 * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band,
 *        this flag indicates that a 320 MHz channel cannot use this
 *        channel as the control or any of the secondary channels.
 *        This may be due to the driver or due to regulatory bandwidth
 *        restrictions.
 * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel.
 * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT
 * @IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT: Client connection with VLP AP
 *        not permitted using this channel
 * @IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT: Client connection with AFC AP
 *        not permitted using this channel
 * @IEEE80211_CHAN_CAN_MONITOR: This channel can be used for monitor
 *        mode even in the presence of other (regulatory) restrictions,
 *        even if it is otherwise disabled.
 * @IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP: Allow using this channel for AP operation
 *        with very low power (VLP), even if otherwise set to NO_IR.
 * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel,
 *        even if otherwise set to NO_IR.
 * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G
 *        primary channel. Does not prevent the wider operating channel
 *        described by the chandef from being used. In order for a 2MHz primary
 *        to be used, both 1MHz subchannels shall not contain this flag.
 * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel.
 * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel.
 * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel.
 */
enum ieee80211_channel_flags {
        IEEE80211_CHAN_DISABLED                        = BIT(0),
        IEEE80211_CHAN_NO_IR                        = BIT(1),
        IEEE80211_CHAN_PSD                        = BIT(2),
        IEEE80211_CHAN_RADAR                        = BIT(3),
        IEEE80211_CHAN_NO_HT40PLUS                = BIT(4),
        IEEE80211_CHAN_NO_HT40MINUS                = BIT(5),
        IEEE80211_CHAN_NO_OFDM                        = BIT(6),
        IEEE80211_CHAN_NO_80MHZ                        = BIT(7),
        IEEE80211_CHAN_NO_160MHZ                = BIT(8),
        IEEE80211_CHAN_INDOOR_ONLY                = BIT(9),
        IEEE80211_CHAN_IR_CONCURRENT                = BIT(10),
        IEEE80211_CHAN_NO_20MHZ                        = BIT(11),
        IEEE80211_CHAN_NO_10MHZ                        = BIT(12),
        IEEE80211_CHAN_NO_HE                        = BIT(13),
        /* bits 14-18 are currently unassigned and can be used for new flags */
        IEEE80211_CHAN_NO_320MHZ                = BIT(19),
        IEEE80211_CHAN_NO_EHT                        = BIT(20),
        IEEE80211_CHAN_DFS_CONCURRENT                = BIT(21),
        IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT        = BIT(22),
        IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT        = BIT(23),
        IEEE80211_CHAN_CAN_MONITOR                = BIT(24),
        IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP        = BIT(25),
        IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY     = BIT(26),
        IEEE80211_CHAN_S1G_NO_PRIMARY                = BIT(27),
        IEEE80211_CHAN_NO_4MHZ                        = BIT(28),
        IEEE80211_CHAN_NO_8MHZ                        = BIT(29),
        IEEE80211_CHAN_NO_16MHZ                        = BIT(30),
};

/* Mask combining both HT40 restriction flags (HT40PLUS and HT40MINUS). */
#define IEEE80211_CHAN_NO_HT40 \
        (IEEE80211_CHAN_NO_HT40PLUS | IEEE80211_CHAN_NO_HT40MINUS)

/*
 * Minimum DFS timings, in milliseconds: 60000 ms (60 seconds) for CAC and
 * 30 * 60 * 1000 ms (30 minutes) for NOP.
 * NOTE(review): NOP presumably is the DFS non-occupancy period - confirm.
 */
#define IEEE80211_DFS_MIN_CAC_TIME_MS                60000
#define IEEE80211_DFS_MIN_NOP_TIME_MS                (30 * 60 * 1000)

/**
 * struct ieee80211_channel - channel definition
 *
 * This structure describes a single channel for use
 * with cfg80211.
 *
 * @center_freq: center frequency in MHz
 * @freq_offset: offset from @center_freq, in kHz
 * @hw_value: hardware-specific value for the channel
 * @flags: channel flags from &enum ieee80211_channel_flags.
 * @orig_flags: channel flags at registration time, used by regulatory
 *        code to support devices with additional restrictions
 * @band: band this channel belongs to.
 * @max_antenna_gain: maximum antenna gain in dBi
 * @max_power: maximum transmission power (in dBm)
 * @max_reg_power: maximum regulatory transmission power (in dBm)
 * @beacon_found: helper for the regulatory code to indicate when a beacon
 *        has been found on this channel. Use regulatory_hint_found_beacon()
 *        to enable this; it is only useful on the 5 GHz band.
 * @orig_mag: internal use
 * @orig_mpwr: internal use
 * @dfs_state: current state of this channel. Only relevant if radar is required
 *        on this channel.
 * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered.
 * @dfs_cac_ms: DFS CAC time in milliseconds; this is valid for DFS channels.
 * @psd: power spectral density (in dBm)
 */
struct ieee80211_channel {
        enum nl80211_band band;
        u32 center_freq;
        u16 freq_offset;
        u16 hw_value;
        u32 flags;
        int max_antenna_gain;
        int max_power;
        int max_reg_power;
        bool beacon_found;
        u32 orig_flags;
        int orig_mag, orig_mpwr;
        enum nl80211_dfs_state dfs_state;
        unsigned long dfs_state_entered;
        unsigned int dfs_cac_ms;
        s8 psd;
};

/**
 * enum ieee80211_rate_flags - rate flags
 *
 * Hardware/specification flags for rates. These are structured
 * in a way that allows using the same bitrate structure for
 * different bands/PHY modes.
 *
 * @IEEE80211_RATE_SHORT_PREAMBLE: Hardware can send with short
 *        preamble on this bitrate; only relevant in the 2.4 GHz band and
 *        with CCK rates.
 * @IEEE80211_RATE_MANDATORY_A: This bitrate is a mandatory rate
 *        when used with 802.11a (on the 5 GHz band); filled by the
 *        core code when registering the wiphy.
 * @IEEE80211_RATE_MANDATORY_B: This bitrate is a mandatory rate
 *        when used with 802.11b (on the 2.4 GHz band); filled by the
 *        core code when registering the wiphy.
 * @IEEE80211_RATE_MANDATORY_G: This bitrate is a mandatory rate
 *        when used with 802.11g (on the 2.4 GHz band); filled by the
 *        core code when registering the wiphy.
 * @IEEE80211_RATE_ERP_G: This is an ERP rate in 802.11g mode.
 * @IEEE80211_RATE_SUPPORTS_5MHZ: Rate can be used in 5 MHz mode
 * @IEEE80211_RATE_SUPPORTS_10MHZ: Rate can be used in 10 MHz mode
 */
enum ieee80211_rate_flags {
        IEEE80211_RATE_SHORT_PREAMBLE        = BIT(0),
        IEEE80211_RATE_MANDATORY_A        = BIT(1),
        IEEE80211_RATE_MANDATORY_B        = BIT(2),
        IEEE80211_RATE_MANDATORY_G        = BIT(3),
        IEEE80211_RATE_ERP_G                = BIT(4),
        IEEE80211_RATE_SUPPORTS_5MHZ        = BIT(5),
        IEEE80211_RATE_SUPPORTS_10MHZ        = BIT(6),
};

/**
 * enum ieee80211_bss_type - BSS type filter
 *
 * The enumerators carry no explicit values, so they are numbered
 * consecutively starting from 0.
 *
 * @IEEE80211_BSS_TYPE_ESS: Infrastructure BSS
 * @IEEE80211_BSS_TYPE_PBSS: Personal BSS
 * @IEEE80211_BSS_TYPE_IBSS: Independent BSS
 * @IEEE80211_BSS_TYPE_MBSS: Mesh BSS
 * @IEEE80211_BSS_TYPE_ANY: Wildcard value for matching any BSS type
 */
enum ieee80211_bss_type {
        IEEE80211_BSS_TYPE_ESS,
        IEEE80211_BSS_TYPE_PBSS,
        IEEE80211_BSS_TYPE_IBSS,
        IEEE80211_BSS_TYPE_MBSS,
        IEEE80211_BSS_TYPE_ANY
};

/**
 * enum ieee80211_privacy - BSS privacy filter
 *
 * Use IEEE80211_PRIVACY() to map a truth value to the ON/OFF values.
 *
 * @IEEE80211_PRIVACY_ON: privacy bit set
 * @IEEE80211_PRIVACY_OFF: privacy bit clear
 * @IEEE80211_PRIVACY_ANY: Wildcard value for matching any privacy setting
 */
enum ieee80211_privacy {
        IEEE80211_PRIVACY_ON,
        IEEE80211_PRIVACY_OFF,
        IEEE80211_PRIVACY_ANY
};

/*
 * Map a truth value to a privacy filter value: a non-zero @x yields
 * IEEE80211_PRIVACY_ON, zero yields IEEE80211_PRIVACY_OFF.
 */
#define IEEE80211_PRIVACY(x)        \
        ((x) ? IEEE80211_PRIVACY_ON : IEEE80211_PRIVACY_OFF)

/**
 * struct ieee80211_rate - bitrate definition
 *
 * This structure describes a bitrate that an 802.11 PHY can
 * operate with. The two values @hw_value and @hw_value_short
 * are only for driver use when pointers to this structure are
 * passed around.
 *
 * @flags: rate-specific flags from &enum ieee80211_rate_flags
 * @bitrate: bitrate in units of 100 kbps
 * @hw_value: driver/hardware value for this rate
 * @hw_value_short: driver/hardware value for this rate when
 *        short preamble is used
 */
struct ieee80211_rate {
        u32 flags;
        u16 bitrate;
        u16 hw_value, hw_value_short;
};

/**
 * struct ieee80211_he_obss_pd - AP settings for spatial reuse
 *
 * @enable: whether the feature is enabled.
 * @sr_ctrl: The SR Control field of SRP element.
 * @non_srg_max_offset: non-SRG maximum tx power offset
 * @min_offset: minimal tx power offset an associated station shall use
 * @max_offset: maximum tx power offset an associated station shall use
 * @bss_color_bitmap: bitmap (8 bytes, i.e. 64 bits) that indicates the
 *        BSS color values used by members of the SRG
 * @partial_bssid_bitmap: bitmap (8 bytes, i.e. 64 bits) that indicates
 *        the partial BSSID values used by members of the SRG
 */
struct ieee80211_he_obss_pd {
        bool enable;
        u8 sr_ctrl;
        u8 non_srg_max_offset;
        u8 min_offset;
        u8 max_offset;
        u8 bss_color_bitmap[8];
        u8 partial_bssid_bitmap[8];
};

/**
 * struct cfg80211_he_bss_color - AP settings for BSS coloring
 *
 * @color: the current color.
 * @enabled: whether HE BSS color is used
 * @partial: define the AID equation.
 *        NOTE(review): the exact meaning is not visible here - confirm
 *        against the HE operation element definition.
 */
struct cfg80211_he_bss_color {
        u8 color;
        bool enabled;
        bool partial;
};

/**
 * struct ieee80211_sta_ht_cap - STA's HT capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11n HT capabilities for a STA.
 *
 * @ht_supported: is HT supported by the STA
 * @cap: HT capabilities map as described in 802.11n spec
 *        (use the IEEE80211_HT_CAP_* values)
 * @ampdu_factor: Maximum A-MPDU length factor
 * @ampdu_density: Minimum A-MPDU spacing
 * @mcs: Supported MCS rates
 */
struct ieee80211_sta_ht_cap {
        u16 cap; /* use IEEE80211_HT_CAP_ */
        bool ht_supported;
        u8 ampdu_factor;
        u8 ampdu_density;
        struct ieee80211_mcs_info mcs;
};

/**
 * struct ieee80211_sta_vht_cap - STA's VHT capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11ac VHT capabilities for a STA.
 *
 * @vht_supported: is VHT supported by the STA
 * @cap: VHT capabilities map as described in 802.11ac spec
 *        (use the IEEE80211_VHT_CAP_* values)
 * @vht_mcs: Supported VHT MCS rates
 */
struct ieee80211_sta_vht_cap {
        bool vht_supported;
        u32 cap; /* use IEEE80211_VHT_CAP_ */
        struct ieee80211_vht_mcs_info vht_mcs;
};

/* Size, in bytes, of the ppe_thres buffer in struct ieee80211_sta_he_cap. */
#define IEEE80211_HE_PPE_THRES_MAX_LEN                25

/**
 * struct ieee80211_sta_he_cap - STA's HE capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11ax HE capabilities for a STA.
 *
 * @has_he: true iff HE data is valid.
 * @he_cap_elem: Fixed portion of the HE capabilities element.
 * @he_mcs_nss_supp: The supported NSS/MCS combinations.
 * @ppe_thres: Holds the PPE Thresholds data (at most
 *        %IEEE80211_HE_PPE_THRES_MAX_LEN bytes).
 */
struct ieee80211_sta_he_cap {
        bool has_he;
        struct ieee80211_he_cap_elem he_cap_elem;
        struct ieee80211_he_mcs_nss_supp he_mcs_nss_supp;
        u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN];
};

/**
 * struct ieee80211_eht_mcs_nss_supp - EHT max supported NSS per MCS
 *
 * See P802.11be_D1.3 Table 9-401k - "Subfields of the Supported EHT-MCS
 * and NSS Set field"
 *
 * @only_20mhz and @bw are members of the same anonymous union, so they
 * share storage and only one of the two representations is meaningful
 * at a time.
 *
 * @only_20mhz: MCS/NSS support for 20 MHz-only STA.
 * @bw: MCS/NSS support for 80, 160 and 320 MHz
 * @bw._80: MCS/NSS support for BW <= 80 MHz
 * @bw._160: MCS/NSS support for BW = 160 MHz
 * @bw._320: MCS/NSS support for BW = 320 MHz
 */
struct ieee80211_eht_mcs_nss_supp {
        union {
                struct ieee80211_eht_mcs_nss_supp_20mhz_only only_20mhz;
                struct {
                        struct ieee80211_eht_mcs_nss_supp_bw _80;
                        struct ieee80211_eht_mcs_nss_supp_bw _160;
                        struct ieee80211_eht_mcs_nss_supp_bw _320;
                } __packed bw;
        } __packed;
} __packed;

/* Size, in bytes, of the eht_ppe_thres buffer in struct ieee80211_sta_eht_cap. */
#define IEEE80211_EHT_PPE_THRES_MAX_LEN                32

/**
 * struct ieee80211_sta_eht_cap - STA's EHT capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11be EHT capabilities for a STA.
 *
 * @has_eht: true iff EHT data is valid.
 * @eht_cap_elem: Fixed portion of the EHT capabilities element.
 * @eht_mcs_nss_supp: The supported NSS/MCS combinations.
 * @eht_ppe_thres: Holds the PPE Thresholds data (at most
 *        %IEEE80211_EHT_PPE_THRES_MAX_LEN bytes).
 */
struct ieee80211_sta_eht_cap {
        bool has_eht;
        struct ieee80211_eht_cap_elem_fixed eht_cap_elem;
        struct ieee80211_eht_mcs_nss_supp eht_mcs_nss_supp;
        u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN];
};

/* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */
#ifdef __CHECKER__
/*
 * This is used to mark the sband->iftype_data pointer, which is supposed
 * to be an array with special access semantics (per iftype). A lot of
 * code got this wrong in the past, so with this marking sparse will be
 * noisy when the pointer is used directly.
 */
# define __iftd                __attribute__((noderef, address_space(__iftype_data)))
#else
/* Not checking with sparse: the annotation expands to nothing. */
# define __iftd
#endif /* __CHECKER__ */

/**
 * struct ieee80211_sband_iftype_data - sband data per interface type
 *
 * This structure encapsulates sband data that is relevant for the
 * interface types defined in @types_mask.  Each type in the
 * @types_mask must be unique across all instances of iftype_data.
 *
 * @types_mask: interface types mask; ieee80211_get_sband_iftype_data()
 *        tests it against BIT(iftype)
 * @he_cap: holds the HE capabilities
 * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a
 *        6 GHz band channel (and 0 may be a valid value).
 * @eht_cap: STA's EHT capabilities
 * @vendor_elems: vendor element(s) to advertise
 * @vendor_elems.data: vendor element(s) data
 * @vendor_elems.len: vendor element(s) length
 */
struct ieee80211_sband_iftype_data {
        u16 types_mask;
        struct ieee80211_sta_he_cap he_cap;
        struct ieee80211_he_6ghz_capa he_6ghz_capa;
        struct ieee80211_sta_eht_cap eht_cap;
        struct {
                const u8 *data;
                unsigned int len;
        } vendor_elems;
};

/**
 * enum ieee80211_edmg_bw_config - allowed channel bandwidth configurations
 *
 * Each enumerator's numeric value equals the number in its name (4..15).
 *
 * @IEEE80211_EDMG_BW_CONFIG_4: 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_5: 2.16GHz and 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_6: 2.16GHz, 4.32GHz and 6.48GHz
 * @IEEE80211_EDMG_BW_CONFIG_7: 2.16GHz, 4.32GHz, 6.48GHz and 8.64GHz
 * @IEEE80211_EDMG_BW_CONFIG_8: 2.16GHz and 2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_9: 2.16GHz, 4.32GHz and 2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_10: 2.16GHz, 4.32GHz, 6.48GHz and
 *        2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_11: 2.16GHz, 4.32GHz, 6.48GHz, 8.64GHz and
 *        2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_12: 2.16GHz, 2.16GHz + 2.16GHz and
 *        4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_13: 2.16GHz, 4.32GHz, 2.16GHz + 2.16GHz and
 *        4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_14: 2.16GHz, 4.32GHz, 6.48GHz, 2.16GHz + 2.16GHz
 *        and 4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_15: 2.16GHz, 4.32GHz, 6.48GHz, 8.64GHz,
 *        2.16GHz + 2.16GHz and 4.32GHz + 4.32GHz
 */
enum ieee80211_edmg_bw_config {
        IEEE80211_EDMG_BW_CONFIG_4        = 4,
        IEEE80211_EDMG_BW_CONFIG_5        = 5,
        IEEE80211_EDMG_BW_CONFIG_6        = 6,
        IEEE80211_EDMG_BW_CONFIG_7        = 7,
        IEEE80211_EDMG_BW_CONFIG_8        = 8,
        IEEE80211_EDMG_BW_CONFIG_9        = 9,
        IEEE80211_EDMG_BW_CONFIG_10        = 10,
        IEEE80211_EDMG_BW_CONFIG_11        = 11,
        IEEE80211_EDMG_BW_CONFIG_12        = 12,
        IEEE80211_EDMG_BW_CONFIG_13        = 13,
        IEEE80211_EDMG_BW_CONFIG_14        = 14,
        IEEE80211_EDMG_BW_CONFIG_15        = 15,
};

/**
 * struct ieee80211_edmg - EDMG configuration
 *
 * This structure describes the most essential parameters needed
 * to describe an 802.11ay EDMG configuration.
 *
 * @channels: bitmap that indicates the 2.16 GHz channel(s)
 *        that are allowed to be used for transmissions.
 *        Bit 0 indicates channel 1, bit 1 indicates channel 2, etc.
 *        Set to 0 to indicate that EDMG is not supported.
 * @bw_config: Channel BW Configuration subfield; encodes
 *        the allowed channel bandwidth configurations
 */
struct ieee80211_edmg {
        u8 channels;
        enum ieee80211_edmg_bw_config bw_config;
};

/**
 * struct ieee80211_sta_s1g_cap - STA's S1G capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11ah S1G capabilities for a STA.
 *
 * @s1g: is STA an S1G STA
 * @cap: S1G capabilities information (10 bytes; use the S1G_CAPAB_* values)
 * @nss_mcs: Supported NSS MCS set (5 bytes)
 */
struct ieee80211_sta_s1g_cap {
        bool s1g;
        u8 cap[10]; /* use S1G_CAPAB_ */
        u8 nss_mcs[5];
};

/**
 * struct ieee80211_supported_band - frequency band definition
 *
 * This structure describes a frequency band a wiphy
 * is able to operate in.
 *
 * @channels: Array of channels the hardware can operate with
 *        in this band.
 * @band: the band this structure represents
 * @n_channels: Number of channels in @channels
 * @bitrates: Array of bitrates the hardware can operate with
 *        in this band. Must be sorted to give a valid "supported
 *        rates" IE, i.e. CCK rates first, then OFDM.
 * @n_bitrates: Number of bitrates in @bitrates
 * @ht_cap: HT capabilities in this band
 * @vht_cap: VHT capabilities in this band
 * @s1g_cap: S1G capabilities in this band (S1G band only, of course)
 * @edmg_cap: EDMG capabilities in this band
 * @n_iftype_data: number of iftype data entries
 * @iftype_data: interface type data entries.  Note that the bits in
 *        @types_mask inside this structure cannot overlap (i.e. only
 *        one occurrence of each type is allowed across all instances of
 *        iftype_data).
 */
struct ieee80211_supported_band {
        struct ieee80211_channel *channels;
        struct ieee80211_rate *bitrates;
        enum nl80211_band band;
        int n_channels;
        int n_bitrates;
        struct ieee80211_sta_ht_cap ht_cap;
        struct ieee80211_sta_vht_cap vht_cap;
        struct ieee80211_sta_s1g_cap s1g_cap;
        struct ieee80211_edmg edmg_cap;
        u16 n_iftype_data;
        const struct ieee80211_sband_iftype_data __iftd *iftype_data;
};

/**
 * _ieee80211_set_sband_iftype_data - set sband iftype data array
 * @sband: the sband to initialize
 * @iftd: pointer to the first entry of the iftype data array
 * @n_iftd: number of entries in the iftype data array
 *
 * Store the iftype data array and its length in @sband. This variant
 * is meant for callers that cannot derive the length from ARRAY_SIZE()
 * of the argument; prefer ieee80211_set_sband_iftype_data() whenever
 * the array itself is available.
 */
static inline void
_ieee80211_set_sband_iftype_data(struct ieee80211_supported_band *sband,
                                 const struct ieee80211_sband_iftype_data *iftd,
                                 u16 n_iftd)
{
        sband->n_iftype_data = n_iftd;
        /* the __force cast attaches the __iftd annotation for sparse */
        sband->iftype_data = (const void __iftd __force *)iftd;
}

/**
 * ieee80211_set_sband_iftype_data - set sband iftype data array
 * @sband: the sband to initialize
 * @iftd: the iftype data array; its length is taken with ARRAY_SIZE(),
 *        so this must be an actual array and not a pointer
 */
#define ieee80211_set_sband_iftype_data(sband, iftd)        \
        _ieee80211_set_sband_iftype_data(sband, iftd, ARRAY_SIZE(iftd))

/**
 * for_each_sband_iftype_data - iterate sband iftype data entries
 * @sband: the sband whose iftype_data array to iterate
 * @i: iterator counter
 * @iftd: iftype data pointer to set
 *
 * @i runs from 0 up to (but not including) @sband->n_iftype_data and
 * @iftd is set to the address of entry @i before each iteration.
 * @sband, @i and @iftd are each expanded several times, so the arguments
 * must not have side effects. The __force cast drops the __iftd
 * annotation so that @iftd can be an ordinary pointer.
 */
#define for_each_sband_iftype_data(sband, i, iftd)                                \
        for (i = 0, iftd = (const void __force *)&(sband)->iftype_data[i];        \
             i < (sband)->n_iftype_data;                                        \
             i++, iftd = (const void __force *)&(sband)->iftype_data[i])

/**
 * ieee80211_get_sband_iftype_data - return sband data for a given iftype
 * @sband: the sband to search for the iftype on
 * @iftype: enum nl80211_iftype
 *
 * Return: pointer to the first struct ieee80211_sband_iftype_data whose
 * types_mask contains @iftype, or NULL if none is found (or if @iftype
 * is out of range, which also triggers a warning)
 */
static inline const struct ieee80211_sband_iftype_data *
ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
                                u8 iftype)
{
        const struct ieee80211_sband_iftype_data *entry;
        int idx;

        if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
                return NULL;

        /* AP_VLAN lookups are served by the AP entry */
        if (iftype == NL80211_IFTYPE_AP_VLAN)
                iftype = NL80211_IFTYPE_AP;

        for_each_sband_iftype_data(sband, idx, entry) {
                if (!(entry->types_mask & BIT(iftype)))
                        continue;

                return entry;
        }

        return NULL;
}

/**
 * ieee80211_get_he_iftype_cap - return HE capabilities for an sband's iftype
 * @sband: the sband to search for the iftype on
 * @iftype: enum nl80211_iftype
 *
 * Return: pointer to the struct ieee80211_sta_he_cap of the matching
 * iftype data, or NULL if there is no matching entry or its HE data is
 * not marked valid
 */
static inline const struct ieee80211_sta_he_cap *
ieee80211_get_he_iftype_cap(const struct ieee80211_supported_band *sband,
                            u8 iftype)
{
        const struct ieee80211_sband_iftype_data *data;

        data = ieee80211_get_sband_iftype_data(sband, iftype);
        if (!data || !data->he_cap.has_he)
                return NULL;

        return &data->he_cap;
}

/**
 * ieee80211_get_he_6ghz_capa - return HE 6 GHz capabilities
 * @sband: the sband to search for the iftype on
 * @iftype: the iftype to search for
 *
 * Warns and returns 0 if there is no iftype data for @iftype or if that
 * entry has no valid HE data.
 *
 * Return: the 6 GHz capabilities
 */
static inline __le16
ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband,
                           enum nl80211_iftype iftype)
{
        const struct ieee80211_sband_iftype_data *data;

        data = ieee80211_get_sband_iftype_data(sband, iftype);
        if (WARN_ON(!data || !data->he_cap.has_he))
                return 0;

        return data->he_6ghz_capa.capa;
}

/**
 * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype
 * @sband: the sband to search for the iftype on
 * @iftype: enum nl80211_iftype
 *
 * Return: pointer to the struct ieee80211_sta_eht_cap of the matching
 * iftype data, or NULL if there is no matching entry or its EHT data is
 * not marked valid
 */
static inline const struct ieee80211_sta_eht_cap *
ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband,
                             enum nl80211_iftype iftype)
{
        const struct ieee80211_sband_iftype_data *data;

        data = ieee80211_get_sband_iftype_data(sband, iftype);
        if (!data || !data->eht_cap.has_eht)
                return NULL;

        return &data->eht_cap;
}

/**
 * wiphy_read_of_freq_limits - read frequency limits from device tree
 *
 * @wiphy: the wireless device to get extra limits for
 *
 * Some devices may have extra limitations specified in DT. This may be useful
 * for chipsets that normally support more bands but are limited due to board
 * design (e.g. by antennas or external power amplifier).
 *
 * This function reads info from DT and uses it to *modify* channels (disable
 * unavailable ones). It's usually a *bad* idea to use it in drivers with
 * shared channel data as DT limitations are device specific. You should make
 * sure to call it only if channels in wiphy are copied and can be modified
 * without affecting other devices.
 *
 * As this function accesses the device node, it has to be called after
 * set_wiphy_dev. It also modifies channels, so they have to be set first.
 * If using this helper, call it before wiphy_register().
 */
#ifdef CONFIG_OF
void wiphy_read_of_freq_limits(struct wiphy *wiphy);
#else /* CONFIG_OF */
/* Without CONFIG_OF this is an empty stub, so callers need no #ifdef. */
static inline void wiphy_read_of_freq_limits(struct wiphy *wiphy)
{
}
#endif /* !CONFIG_OF */


/*
 * Wireless hardware/device configuration structures and methods
 */

/**
 * DOC: Actions and configuration
 *
 * Each wireless device and each virtual interface offer a set of configuration
 * operations and other actions that are invoked by userspace. Each of these
 * actions is described in the operations structure, and the parameters these
 * operations use are described separately.
 *
 * Additionally, some operations are asynchronous and expect to get status
 * information via some functions that drivers need to call.
 *
 * Scanning and BSS list handling with its associated functionality is described
 * in a separate chapter.
 */

/*
 * Sum of WLAN_MEMBERSHIP_LEN and WLAN_USER_POSITION_LEN.
 * NOTE(review): presumably the size of the buffer that
 * vif_params.vht_mumimo_groups points to - confirm against the users.
 */
#define VHT_MUMIMO_GROUPS_DATA_LEN (WLAN_MEMBERSHIP_LEN +\
                                    WLAN_USER_POSITION_LEN)

/**
 * struct vif_params - describes virtual interface parameters
 * @flags: monitor interface flags, unchanged if 0, otherwise
 *        %MONITOR_FLAG_CHANGED will be set
 * @use_4addr: use 4-address frames
 * @macaddr: address to use for this virtual interface.
 *        If this parameter is set to the zero address, the driver may
 *        determine the address as needed.
 *        This feature is only fully supported by drivers that enable the
 *        %NL80211_FEATURE_MAC_ON_CREATE flag.  Others may support creating
 *        only p2p devices with specified MAC.
 * @vht_mumimo_groups: MU-MIMO groupID, used for monitoring MU-MIMO packets
 *        belonging to that MU-MIMO groupID; %NULL if not changed
 * @vht_mumimo_follow_addr: MU-MIMO follow address, used for monitoring
 *        MU-MIMO packets going to the specified station; %NULL if not changed
 */
struct vif_params {
        u32 flags;
        int use_4addr;
        u8 macaddr[ETH_ALEN];
        const u8 *vht_mumimo_groups;
        const u8 *vht_mumimo_follow_addr;
};

/**
 * struct key_params - key information
 *
 * Information about a key
 *
 * @key: key material
 * @key_len: length of the key material in @key
 * @cipher: cipher suite selector
 * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used
 *        with the get_key() callback, must be in little endian,
 *        length given by @seq_len.
 * @seq_len: length of @seq.
 * @vlan_id: VLAN ID for a VLAN group key (if nonzero)
 * @mode: key install mode (RX_TX, NO_TX or SET_TX)
 */
struct key_params {
        const u8 *key;
        const u8 *seq;
        int key_len;
        int seq_len;
        u16 vlan_id;
        u32 cipher;
        enum nl80211_key_mode mode;
};

/**
 * struct cfg80211_chan_def - channel definition
 * @chan: the (control) channel
 * @width: channel width
 * @center_freq1: center frequency of first segment
 * @center_freq2: center frequency of second segment
 *        (only with 80+80 MHz)
 * @edmg: define the EDMG channels configuration.
 *        If edmg is requested (i.e. the .channels member is non-zero),
 *        chan will define the primary channel and all other
 *        parameters are ignored.
 * @freq1_offset: offset from @center_freq1, in kHz
 * @punctured: mask of the punctured 20 MHz subchannels, with
 *        bits turned on being disabled (punctured); numbered
 *        from lower to higher frequency (like in the spec)
 * @s1g_primary_2mhz: Indicates if the control channel pointed to
 *        by @chan exists as a 1 MHz primary subchannel within an
 *        S1G 2 MHz primary channel.
 */
struct cfg80211_chan_def {
        struct ieee80211_channel *chan;
        enum nl80211_chan_width width;
        u32 center_freq1;
        u32 center_freq2;
        struct ieee80211_edmg edmg;
        u16 freq1_offset;
        u16 punctured;
        bool s1g_primary_2mhz;
};

/*
 * cfg80211_bitrate_mask - masks for bitrate control
 *
 * Holds one control[] entry per band (NUM_NL80211_BANDS entries). Each
 * entry carries a legacy rate mask, the HT MCS mask bytes, VHT/HE/EHT MCS
 * masks sized by the respective NL80211_*_NSS_MAX constants, and the
 * gi/ltf selections (see the corresponding nl80211 enums).
 */
struct cfg80211_bitrate_mask {
        struct {
                u32 legacy;
                u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN];
                u16 vht_mcs[NL80211_VHT_NSS_MAX];
                u16 he_mcs[NL80211_HE_NSS_MAX];
                u16 eht_mcs[NL80211_EHT_NSS_MAX];
                enum nl80211_txrate_gi gi;
                enum nl80211_he_gi he_gi;
                enum nl80211_eht_gi eht_gi;
                enum nl80211_he_ltf he_ltf;
                enum nl80211_eht_ltf eht_ltf;
        } control[NUM_NL80211_BANDS];
};


/**
 * struct cfg80211_tid_cfg - TID specific configuration
 * @config_override: Flag to notify driver to reset TID configuration
 *        of the peer.
 * @tids: bitmap of TIDs to modify
 * @mask: bitmap of attributes indicating which parameter changed,
 *        similar to &nl80211_tid_config_supp.
 * @noack: noack configuration value for the TID
 * @retry_long: long retry count value
 * @retry_short: short retry count value
 * @ampdu: Enable/Disable MPDU aggregation
 * @rtscts: Enable/Disable RTS/CTS
 * @amsdu: Enable/Disable MSDU aggregation
 * @txrate_type: Tx bitrate mask type
 * @txrate_mask: Tx bitrate to be applied for the TID
 */
struct cfg80211_tid_cfg {
        bool config_override;
        u8 tids;
        u64 mask;
        enum nl80211_tid_config noack;
        u8 retry_long, retry_short;
        enum nl80211_tid_config ampdu;
        enum nl80211_tid_config rtscts;
        enum nl80211_tid_config amsdu;
        enum nl80211_tx_rate_setting txrate_type;
        struct cfg80211_bitrate_mask txrate_mask;
};

/**
 * struct cfg80211_tid_config - TID configuration
 * @peer: Station's MAC address
 * @n_tid_conf: Number of TID specific configurations to be applied,
 *        i.e. the number of entries in @tid_conf
 * @tid_conf: Configuration change info (flexible array with
 *        @n_tid_conf entries)
 */
struct cfg80211_tid_config {
        const u8 *peer;
        u32 n_tid_conf;
        struct cfg80211_tid_cfg tid_conf[] __counted_by(n_tid_conf);
};

/**
 * struct cfg80211_fils_aad - FILS AAD data
 * @macaddr: STA MAC address
 * @kek: FILS KEK
 * @kek_len: length of the FILS KEK in @kek
 * @snonce: STA Nonce
 * @anonce: AP Nonce
 */
struct cfg80211_fils_aad {
        const u8 *macaddr;
        const u8 *kek;
        u8 kek_len;
        const u8 *snonce;
        const u8 *anonce;
};

/**
 * struct cfg80211_set_hw_timestamp - enable/disable HW timestamping
 * @macaddr: peer MAC address. %NULL to enable/disable HW timestamping for
 *        all addresses.
 * @enable: if set, enable HW timestamping for the specified MAC address;
 *        otherwise disable HW timestamping for the specified MAC address.
 */
struct cfg80211_set_hw_timestamp {
        const u8 *macaddr;
        bool enable;
};

/**
 * cfg80211_get_chandef_type - return old channel type from chandef
 * @chandef: the channel definition
 *
 * Return: The old channel type (NOHT, HT20, HT40+/-) from a given
 * chandef, which must have a bandwidth allowing this conversion.
 */
static inline enum nl80211_channel_type
cfg80211_get_chandef_type(const struct cfg80211_chan_def *chandef)
{
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_20_NOHT:
                return NL80211_CHAN_NO_HT;
        case NL80211_CHAN_WIDTH_20:
                return NL80211_CHAN_HT20;
        case NL80211_CHAN_WIDTH_40:
                if (chandef->center_freq1 > chandef->chan->center_freq)
                        return NL80211_CHAN_HT40PLUS;
                return NL80211_CHAN_HT40MINUS;
        default:
                WARN_ON(1);
                return NL80211_CHAN_NO_HT;
        }
}

/**
 * cfg80211_chandef_create - create channel definition using channel type
 * @chandef: the channel definition struct to fill
 * @channel: the control channel
 * @chantype: the channel type
 *
 * Given a control channel and a channel type, fill in @chandef.
 */
void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
                             struct ieee80211_channel *channel,
                             enum nl80211_channel_type chantype);

/**
 * cfg80211_chandef_identical - check if two channel definitions are identical
 * @chandef1: first channel definition
 * @chandef2: second channel definition
 *
 * Compares the control channel pointer, the width, both center
 * frequencies, the frequency offset, the puncturing bitmap and the S1G
 * 2 MHz primary indication. The EDMG configuration is not compared.
 *
 * Return: %true if the channels defined by the channel definitions are
 * identical, %false otherwise.
 */
static inline bool
cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1,
                           const struct cfg80211_chan_def *chandef2)
{
        /*
         * s1g_primary_2mhz must match as well: two S1G chandefs that only
         * differ in whether the primary channel is 1 MHz or 2 MHz wide
         * describe different channels.
         */
        return (chandef1->chan == chandef2->chan &&
                chandef1->width == chandef2->width &&
                chandef1->center_freq1 == chandef2->center_freq1 &&
                chandef1->freq1_offset == chandef2->freq1_offset &&
                chandef1->center_freq2 == chandef2->center_freq2 &&
                chandef1->punctured == chandef2->punctured &&
                chandef1->s1g_primary_2mhz == chandef2->s1g_primary_2mhz);
}

/**
 * cfg80211_chandef_is_edmg - check if chandef represents an EDMG channel
 *
 * @chandef: the channel definition
 *
 * Return: %true if EDMG defined, %false otherwise.
 */
static inline bool
cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef)
{
        return chandef->edmg.channels || chandef->edmg.bw_config;
}

/**
 * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel
 * @chandef: the channel definition
 *
 * Return: %true if S1G.
 */
static inline bool
cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef)
{
        return chandef->chan->band == NL80211_BAND_S1GHZ;
}

/**
 * cfg80211_chandef_compatible - check if two channel definitions are compatible
 * @chandef1: first channel definition
 * @chandef2: second channel definition
 *
 * Return: %NULL if the given channel definitions are incompatible,
 * @chandef1 or @chandef2 otherwise.
 */
const struct cfg80211_chan_def *
cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1,
                            const struct cfg80211_chan_def *chandef2);

/**
 * nl80211_chan_width_to_mhz - get the channel width in MHz
 * @chan_width: the channel width from &enum nl80211_chan_width
 *
 * Return: channel width in MHz if @chan_width is a valid
 * &enum nl80211_chan_width value, -1 otherwise.
 */
int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width);

/**
 * cfg80211_chandef_get_width - look up the width of a chandef in MHz
 * @c: chandef whose bandwidth is wanted
 *
 * Passes the width of @c to nl80211_chan_width_to_mhz() and hands back
 * its result unchanged.
 *
 * Return: channel width in MHz for the given chandef; note that it returns
 *        80 for 80+80 configurations
 */
static inline int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
{
        int width_mhz = nl80211_chan_width_to_mhz(c->width);

        return width_mhz;
}

/**
 * cfg80211_chandef_valid - check if a channel definition is valid
 * @chandef: the channel definition to check
 *
 * Return: %true if the channel definition is valid, %false otherwise.
 */
bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef);

/**
 * cfg80211_chandef_usable - check if secondary channels can be used
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 * @prohibited_flags: the regulatory channel flags that must not be set
 *
 * Return: %true if secondary channels are usable, %false otherwise.
 */
bool cfg80211_chandef_usable(struct wiphy *wiphy,
                             const struct cfg80211_chan_def *chandef,
                             u32 prohibited_flags);

/**
 * cfg80211_chandef_dfs_required - checks if radar detection is required
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 * @iftype: the interface type as specified in &enum nl80211_iftype
 *
 * Return: 1 if radar detection is required, 0 if it is not, < 0 on error
 */
int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
                                  const struct cfg80211_chan_def *chandef,
                                  enum nl80211_iftype iftype);

/**
 * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable and we
 *                                 can/need to start CAC on such a channel
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 *
 * Return: %true if all channels are available and at least
 *           one channel requires CAC (NL80211_DFS_USABLE)
 */
bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
                                 const struct cfg80211_chan_def *chandef);

/**
 * cfg80211_chandef_dfs_cac_time - get the DFS CAC time (in ms) for given
 *                                   channel definition
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 *
 * Return: DFS CAC time (in ms) which applies for this channel definition
 */
unsigned int
cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
                              const struct cfg80211_chan_def *chandef);

/**
 * cfg80211_chandef_primary - calculate primary 40/80/160 MHz freq
 * @chandef: chandef to calculate for
 * @primary_chan_width: primary channel width to calculate center for
 * @punctured: punctured sub-channel bitmap, will be recalculated
 *        according to the new bandwidth, can be %NULL
 *
 * Return: the primary 40/80/160 MHz channel center frequency, or -1
 *        for errors, updating the punctured bitmap
 */
int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef,
                             enum nl80211_chan_width primary_chan_width,
                             u16 *punctured);

/**
 * nl80211_send_chandef - sends the channel definition.
 * @msg: the message used to send the channel definition
 * @chandef: the channel definition to send
 *
 * Return: 0 if the channel definition was sent to @msg, < 0 on error
 */
int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef);

/**
 * ieee80211_chandef_max_power - maximum transmission power for the chandef
 *
 * In some regulations, the transmit power may depend on the configured channel
 * bandwidth which may be defined as dBm/MHz. This function returns the actual
 * max_power for non-standard (20 MHz) channels.
 *
 * @chandef: channel definition for the channel
 *
 * Returns: maximum allowed transmission power in dBm for the chandef
 */
static inline int
ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef)
{
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_5:
                return min(chandef->chan->max_reg_power - 6,
                           chandef->chan->max_power);
        case NL80211_CHAN_WIDTH_10:
                return min(chandef->chan->max_reg_power - 3,
                           chandef->chan->max_power);
        default:
                break;
        }
        return chandef->chan->max_power;
}

/**
 * cfg80211_any_usable_channels - check for usable channels
 * @wiphy: the wiphy to check for
 * @band_mask: which bands to check on
 * @prohibited_flags: which channels to not consider usable,
 *        %IEEE80211_CHAN_DISABLED is always taken into account
 *
 * Return: %true if usable channels are found, %false otherwise
 */
bool cfg80211_any_usable_channels(struct wiphy *wiphy,
                                  unsigned long band_mask,
                                  u32 prohibited_flags);

/**
 * enum survey_info_flags - survey information flags
 *
 * @SURVEY_INFO_NOISE_DBM: noise (in dBm) was filled in
 * @SURVEY_INFO_IN_USE: channel is currently being used
 * @SURVEY_INFO_TIME: active time (in ms) was filled in
 * @SURVEY_INFO_TIME_BUSY: busy time was filled in
 * @SURVEY_INFO_TIME_EXT_BUSY: extension channel busy time was filled in
 * @SURVEY_INFO_TIME_RX: receive time was filled in
 * @SURVEY_INFO_TIME_TX: transmit time was filled in
 * @SURVEY_INFO_TIME_SCAN: scan time was filled in
 * @SURVEY_INFO_TIME_BSS_RX: local BSS receive time was filled in
 *
 * Used by the driver to indicate which info in &struct survey_info
 * it has filled in during get_survey(). Each flag is a distinct bit so
 * that several can be combined in &survey_info.filled.
 */
enum survey_info_flags {
        SURVEY_INFO_NOISE_DBM                = BIT(0),
        SURVEY_INFO_IN_USE                = BIT(1),
        SURVEY_INFO_TIME                = BIT(2),
        SURVEY_INFO_TIME_BUSY                = BIT(3),
        SURVEY_INFO_TIME_EXT_BUSY        = BIT(4),
        SURVEY_INFO_TIME_RX                = BIT(5),
        SURVEY_INFO_TIME_TX                = BIT(6),
        SURVEY_INFO_TIME_SCAN                = BIT(7),
        SURVEY_INFO_TIME_BSS_RX                = BIT(8),
};

/**
 * struct survey_info - channel survey response
 *
 * @channel: the channel this survey record reports, may be %NULL for a single
 *        record to report global statistics
 * @filled: bitflag of flags from &enum survey_info_flags
 * @noise: channel noise in dBm. This and all following fields are
 *        optional
 * @time: amount of time in ms the radio was turned on (on the channel)
 * @time_busy: amount of time the primary channel was sensed busy
 * @time_ext_busy: amount of time the extension channel was sensed busy
 * @time_rx: amount of time the radio spent receiving data
 * @time_tx: amount of time the radio spent transmitting data
 * @time_scan: amount of time the radio spent for scanning
 * @time_bss_rx: amount of time the radio spent receiving data on a local BSS
 *
 * Used by dump_survey() to report back per-channel survey information.
 *
 * This structure can later be expanded with things like
 * channel duty cycle etc.
 */
struct survey_info {
        struct ieee80211_channel *channel;
        u64 time;
        u64 time_busy;
        u64 time_ext_busy;
        u64 time_rx;
        u64 time_tx;
        u64 time_scan;
        u64 time_bss_rx;
        u32 filled;
        s8 noise;
};

/* Capacity of the akm_suites array in struct cfg80211_crypto_settings. */
#define CFG80211_MAX_NUM_AKM_SUITES        10

/**
 * struct cfg80211_crypto_settings - Crypto settings
 * @wpa_versions: indicates which, if any, WPA versions are enabled
 *        (from enum nl80211_wpa_versions)
 * @cipher_group: group key cipher suite (or 0 if unset)
 * @n_ciphers_pairwise: number of AP supported unicast ciphers
 * @ciphers_pairwise: unicast key cipher suites; the array holds up to
 *        %NL80211_MAX_NR_CIPHER_SUITES entries
 * @n_akm_suites: number of AKM suites
 * @akm_suites: AKM suites; the array holds up to
 *        %CFG80211_MAX_NUM_AKM_SUITES entries
 * @control_port: Whether user space controls IEEE 802.1X port, i.e.,
 *        sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is
 *        required to assume that the port is unauthorized until authorized by
 *        user space. Otherwise, port is marked authorized by default.
 * @control_port_ethertype: the control port protocol that should be
 *        allowed through even on unauthorized ports
 * @control_port_no_encrypt: TRUE to prevent encryption of control port
 *        protocol frames.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 * @control_port_no_preauth: disables pre-auth rx over the nl80211 control
 *        port for mac80211
 * @psk: PSK (for devices supporting 4-way-handshake offload)
 * @sae_pwd: password for SAE authentication (for devices supporting SAE
 *        offload)
 * @sae_pwd_len: length of SAE password (for devices supporting SAE offload)
 * @sae_pwe: The mechanisms allowed for SAE PWE derivation:
 *
 *        NL80211_SAE_PWE_UNSPECIFIED
 *          Not-specified, used to indicate userspace did not specify any
 *          preference. The driver should follow its internal policy in
 *          such a scenario.
 *
 *        NL80211_SAE_PWE_HUNT_AND_PECK
 *          Allow hunting-and-pecking loop only
 *
 *        NL80211_SAE_PWE_HASH_TO_ELEMENT
 *          Allow hash-to-element only
 *
 *        NL80211_SAE_PWE_BOTH
 *          Allow either hunting-and-pecking loop or hash-to-element
 */
struct cfg80211_crypto_settings {
        u32 wpa_versions;
        u32 cipher_group;
        int n_ciphers_pairwise;
        u32 ciphers_pairwise[NL80211_MAX_NR_CIPHER_SUITES];
        int n_akm_suites;
        u32 akm_suites[CFG80211_MAX_NUM_AKM_SUITES];
        bool control_port;
        __be16 control_port_ethertype;
        bool control_port_no_encrypt;
        bool control_port_over_nl80211;
        bool control_port_no_preauth;
        const u8 *psk;
        const u8 *sae_pwd;
        u8 sae_pwd_len;
        enum nl80211_sae_pwe_mechanism sae_pwe;
};

/**
 * struct cfg80211_mbssid_config - AP settings for multiple BSSID
 *
 * @tx_wdev: pointer to the transmitted interface in the MBSSID set
 * @tx_link_id: link ID of the transmitted profile in an MLD.
 * @index: index of this AP in the multiple BSSID group.
 * @ema: set to true if the beacons should be sent out in EMA mode.
 */
struct cfg80211_mbssid_config {
        struct wireless_dev *tx_wdev;
        u8 tx_link_id;
        u8 index;
        bool ema;
};

/**
 * struct cfg80211_mbssid_elems - Multiple BSSID elements
 *
 * @cnt: Number of elements in array @elem.
 *
 * @elem: Array of multiple BSSID element(s) to be added into Beacon frames.
 * @elem.data: Data for multiple BSSID elements.
 * @elem.len: Length of data.
 */
struct cfg80211_mbssid_elems {
        u8 cnt;
        struct {
                const u8 *data;
                size_t len;
        } elem[] __counted_by(cnt);
};

/**
 * struct cfg80211_rnr_elems - Reduced neighbor report (RNR) elements
 *
 * @cnt: Number of elements in array @elem.
 *
 * @elem: Array of RNR element(s) to be added into Beacon frames.
 * @elem.data: Data for RNR elements.
 * @elem.len: Length of data.
 */
struct cfg80211_rnr_elems {
        u8 cnt;
        struct {
                const u8 *data;
                size_t len;
        } elem[] __counted_by(cnt);
};

/**
 * struct cfg80211_beacon_data - beacon data
 * @link_id: the link ID for the AP MLD link sending this beacon
 * @head: head portion of beacon (before TIM IE)
 *        or %NULL if not changed
 * @tail: tail portion of beacon (after TIM IE)
 *        or %NULL if not changed
 * @head_len: length of @head
 * @tail_len: length of @tail
 * @beacon_ies: extra information element(s) to add into Beacon frames or %NULL
 * @beacon_ies_len: length of @beacon_ies in octets
 * @proberesp_ies: extra information element(s) to add into Probe Response
 *        frames or %NULL
 * @proberesp_ies_len: length of @proberesp_ies in octets
 * @assocresp_ies: extra information element(s) to add into (Re)Association
 *        Response frames or %NULL
 * @assocresp_ies_len: length of @assocresp_ies in octets
 * @probe_resp_len: length of probe response template (@probe_resp)
 * @probe_resp: probe response template (AP mode only)
 * @mbssid_ies: multiple BSSID elements
 * @rnr_ies: reduced neighbor report elements
 * @ftm_responder: enable FTM responder functionality; -1 for no change
 *        (which also implies no change in LCI/civic location data)
 * @lci: Measurement Report element content, starting with Measurement Token
 *        (measurement type 8)
 * @civicloc: Measurement Report element content, starting with Measurement
 *        Token (measurement type 11)
 * @lci_len: LCI data length
 * @civicloc_len: Civic location data length
 * @he_bss_color: BSS Color settings
 * @he_bss_color_valid: indicates whether the BSS color
 *        attribute is present in beacon data or not.
 */
struct cfg80211_beacon_data {
        unsigned int link_id;

        const u8 *head, *tail;
        const u8 *beacon_ies;
        const u8 *proberesp_ies;
        const u8 *assocresp_ies;
        const u8 *probe_resp;
        const u8 *lci;
        const u8 *civicloc;
        struct cfg80211_mbssid_elems *mbssid_ies;
        struct cfg80211_rnr_elems *rnr_ies;
        s8 ftm_responder;

        size_t head_len, tail_len;
        size_t beacon_ies_len;
        size_t proberesp_ies_len;
        size_t assocresp_ies_len;
        size_t probe_resp_len;
        size_t lci_len;
        size_t civicloc_len;
        struct cfg80211_he_bss_color he_bss_color;
        bool he_bss_color_valid;
};

/**
 * struct mac_address - a single MAC address
 *
 * @addr: the address bytes (%ETH_ALEN octets)
 */
struct mac_address {
        u8 addr[ETH_ALEN];
};

/**
 * struct cfg80211_acl_data - Access control list data
 *
 * @acl_policy: ACL policy to be applied on the station's
 *        entry specified by @mac_addrs
 * @n_acl_entries: Number of MAC address entries passed
 * @mac_addrs: List of MAC addresses of stations to be used for ACL
 */
struct cfg80211_acl_data {
        enum nl80211_acl_policy acl_policy;
        int n_acl_entries;

        /* Keep it last */
        struct mac_address mac_addrs[] __counted_by(n_acl_entries);
};

/**
 * struct cfg80211_fils_discovery - FILS discovery parameters from
 * IEEE Std 802.11ai-2016, Annex C.3 MIB detail.
 *
 * @update: Set to true if the feature configuration should be updated.
 * @min_interval: Minimum packet interval in TUs (0 - 10000)
 * @max_interval: Maximum packet interval in TUs (0 - 10000)
 * @tmpl_len: Length of @tmpl
 * @tmpl: Template data for FILS discovery frame including the action
 *        frame headers.
 */
struct cfg80211_fils_discovery {
        bool update;
        u32 min_interval;
        u32 max_interval;
        size_t tmpl_len;
        const u8 *tmpl;
};

/**
 * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe
 *        response parameters in 6 GHz.
 *
 * @update: Set to true if the feature configuration should be updated.
 * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned
 *        in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive
 *        scanning
 * @tmpl_len: Length of @tmpl
 * @tmpl: Template data for probe response
 */
struct cfg80211_unsol_bcast_probe_resp {
        bool update;
        u32 interval;
        size_t tmpl_len;
        const u8 *tmpl;
};

/**
 * struct cfg80211_s1g_short_beacon - S1G short beacon data.
 *
 * @update: Set to true if the feature configuration should be updated.
 * @short_head: Short beacon head.
 * @short_tail: Short beacon tail.
 * @short_head_len: Length of @short_head.
 * @short_tail_len: Length of @short_tail.
 */
struct cfg80211_s1g_short_beacon {
        bool update;
        const u8 *short_head;
        const u8 *short_tail;
        size_t short_head_len;
        size_t short_tail_len;
};

/**
 * struct cfg80211_ap_settings - AP configuration
 *
 * Used to configure an AP interface.
 *
 * @chandef: defines the channel to use
 * @beacon: beacon data
 * @beacon_interval: beacon interval
 * @dtim_period: DTIM period
 * @ssid: SSID to be used in the BSS (note: may be %NULL if not provided from
 *        user space)
 * @ssid_len: length of @ssid
 * @hidden_ssid: whether to hide the SSID in Beacon/Probe Response frames
 * @crypto: crypto settings
 * @privacy: the BSS uses privacy
 * @auth_type: Authentication type (algorithm)
 * @inactivity_timeout: time in seconds to determine station's inactivity.
 * @p2p_ctwindow: P2P CT Window
 * @p2p_opp_ps: P2P opportunistic PS
 * @acl: ACL configuration used by the drivers which has support for
 *        MAC address based access control
 * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
 *        networks.
 * @beacon_rate: bitrate to be used for beacons
 * @ht_cap: HT capabilities (or %NULL if HT isn't enabled)
 * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled)
 * @he_cap: HE capabilities (or %NULL if HE isn't enabled)
 * @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled)
 * @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled)
 * @ht_required: stations must support HT
 * @vht_required: stations must support VHT
 * @twt_responder: Enable Target Wake Time (TWT) responder
 * @he_required: stations must support HE
 * @sae_h2e_required: stations must support direct H2E technique in SAE
 * @flags: flags, as defined in &enum nl80211_ap_settings_flags
 * @he_obss_pd: OBSS Packet Detection settings
 * @he_oper: HE operation IE (or %NULL if HE isn't enabled)
 * @fils_discovery: FILS discovery transmission parameters
 * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
 * @mbssid_config: AP settings for multiple bssid
 * @s1g_long_beacon_period: S1G long beacon period
 * @s1g_short_beacon: S1G short beacon data
 */
struct cfg80211_ap_settings {
        struct cfg80211_chan_def chandef;

        struct cfg80211_beacon_data beacon;

        int beacon_interval, dtim_period;
        const u8 *ssid;
        size_t ssid_len;
        enum nl80211_hidden_ssid hidden_ssid;
        struct cfg80211_crypto_settings crypto;
        bool privacy;
        enum nl80211_auth_type auth_type;
        int inactivity_timeout;
        u8 p2p_ctwindow;
        bool p2p_opp_ps;
        const struct cfg80211_acl_data *acl;
        bool pbss;
        struct cfg80211_bitrate_mask beacon_rate;

        const struct ieee80211_ht_cap *ht_cap;
        const struct ieee80211_vht_cap *vht_cap;
        const struct ieee80211_he_cap_elem *he_cap;
        const struct ieee80211_he_operation *he_oper;
        const struct ieee80211_eht_cap_elem *eht_cap;
        const struct ieee80211_eht_operation *eht_oper;
        bool ht_required, vht_required, he_required, sae_h2e_required;
        bool twt_responder;
        u32 flags;
        struct ieee80211_he_obss_pd he_obss_pd;
        struct cfg80211_fils_discovery fils_discovery;
        struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
        struct cfg80211_mbssid_config mbssid_config;
        u8 s1g_long_beacon_period;
        struct cfg80211_s1g_short_beacon s1g_short_beacon;
};


/**
 * struct cfg80211_ap_update - AP configuration update
 *
 * Subset of &struct cfg80211_ap_settings, for updating a running AP.
 *
 * @beacon: beacon data, see &struct cfg80211_beacon_data
 * @fils_discovery: FILS discovery transmission parameters
 * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
 * @s1g_short_beacon: S1G short beacon data
 */
struct cfg80211_ap_update {
        struct cfg80211_beacon_data beacon;
        struct cfg80211_fils_discovery fils_discovery;
        struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
        struct cfg80211_s1g_short_beacon s1g_short_beacon;
};

/**
 * struct cfg80211_csa_settings - channel switch settings
 *
 * Used for channel switch
 *
 * @chandef: defines the channel to use after the switch
 * @beacon_csa: beacon data while performing the switch
 * @counter_offsets_beacon: offsets of the counters within the beacon (tail)
 * @counter_offsets_presp: offsets of the counters within the probe response
 * @n_counter_offsets_beacon: number of CSA counters in the beacon (tail)
 * @n_counter_offsets_presp: number of CSA counters in the probe response
 * @beacon_after: beacon data to be used on the new channel
 * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
 * @radar_required: whether radar detection is required on the new channel
 * @block_tx: whether transmissions should be blocked while changing
 * @count: number of beacons until switch
 * @link_id: defines the link on which channel switch is expected during
 *        MLO. 0 in case of non-MLO.
 */
struct cfg80211_csa_settings {
        struct cfg80211_chan_def chandef;
        struct cfg80211_beacon_data beacon_csa;
        const u16 *counter_offsets_beacon;
        const u16 *counter_offsets_presp;
        unsigned int n_counter_offsets_beacon;
        unsigned int n_counter_offsets_presp;
        struct cfg80211_beacon_data beacon_after;
        struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
        bool radar_required;
        bool block_tx;
        u8 count;
        u8 link_id;
};

/**
 * struct cfg80211_color_change_settings - color change settings
 *
 * Used for bss color change
 *
 * @beacon_color_change: beacon data while performing the color countdown
 * @counter_offset_beacon: offset of the counter within the beacon (tail)
 * @counter_offset_presp: offset of the counter within the probe response
 * @beacon_next: beacon data to be used after the color change
 * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
 * @count: number of beacons until the color change
 * @color: the color used after the change
 * @link_id: defines the link on which color change is expected during MLO.
 *        0 in case of non-MLO.
 */
struct cfg80211_color_change_settings {
        struct cfg80211_beacon_data beacon_color_change;
        u16 counter_offset_beacon;
        u16 counter_offset_presp;
        struct cfg80211_beacon_data beacon_next;
        struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
        u8 count;
        u8 color;
        u8 link_id;
};

/**
 * struct iface_combination_params - input parameters for interface combinations
 *
 * Used to pass interface combination parameters
 *
 * @radio_idx: wiphy radio index or -1 for global
 * @num_different_channels: the number of different channels we want
 *        to use for verification
 * @radar_detect: a bitmap where each bit corresponds to a channel
 *        width where radar detection is needed, as in the definition of
 *        &struct ieee80211_iface_combination.@radar_detect_widths
 * @iftype_num: array with the number of interfaces of each interface
 *        type. The index is the interface type as specified in &enum
 *        nl80211_iftype; the array has %NUM_NL80211_IFTYPES entries.
 * @new_beacon_int: set this to the beacon interval of a new interface
 *        that's not operating yet, if such is to be checked as part of
 *        the verification
 */
struct iface_combination_params {
        int radio_idx;
        int num_different_channels;
        u8 radar_detect;
        int iftype_num[NUM_NL80211_IFTYPES];
        u32 new_beacon_int;
};

/**
 * enum station_parameters_apply_mask - station parameter values to apply
 * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp)
 * @STATION_PARAM_APPLY_CAPABILITY: apply new capability
 * @STATION_PARAM_APPLY_PLINK_STATE: apply new plink state
 *
 * Not all station parameters have in-band "no change" signalling;
 * for those that don't, these flags are used.
 */
enum station_parameters_apply_mask {
        STATION_PARAM_APPLY_UAPSD = BIT(0),
        STATION_PARAM_APPLY_CAPABILITY = BIT(1),
        STATION_PARAM_APPLY_PLINK_STATE = BIT(2),
};

/**
 * struct sta_txpwr - station txpower configuration
 *
 * Used to configure txpower for station.
 *
 * @power: tx power (in dBm) to be used for sending data traffic. If tx power
 *        is not provided, the default per-interface tx power setting is used
 *        instead. The driver should pick the lowest tx power, either tx
 *        power per-interface or per-station.
 * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power
 *        will be less than or equal to specified from userspace, whereas if TPC
 *        %type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power.
 *        NL80211_TX_POWER_FIXED is not a valid configuration option for
 *        per peer TPC.
 */
struct sta_txpwr {
        s16 power;
        enum nl80211_tx_power_setting type;
};

/**
 * struct link_station_parameters - link station parameters
 *
 * Used to change and create a new link station.
 *
 * @mld_mac: MAC address of the station
 * @link_id: the link id (-1 for non-MLD station)
 * @link_mac: MAC address of the link
 * @supported_rates: supported rates in IEEE 802.11 format
 *        (or NULL for no change)
 * @supported_rates_len: number of supported rates
 * @ht_capa: HT capabilities of station
 * @vht_capa: VHT capabilities of station
 * @opmode_notif: operating mode field from Operating Mode Notification
 * @opmode_notif_used: information if operating mode field is used
 * @he_capa: HE capabilities of station
 * @he_capa_len: the length of the HE capabilities
 * @txpwr: transmit power for an associated station
 * @txpwr_set: whether the @txpwr field is set
 * @he_6ghz_capa: HE 6 GHz Band capabilities of station
 * @eht_capa: EHT capabilities of station
 * @eht_capa_len: the length of the EHT capabilities
 * @s1g_capa: S1G capabilities of station
 */
struct link_station_parameters {
        const u8 *mld_mac;
        int link_id;
        const u8 *link_mac;
        const u8 *supported_rates;
        u8 supported_rates_len;
        const struct ieee80211_ht_cap *ht_capa;
        const struct ieee80211_vht_cap *vht_capa;
        u8 opmode_notif;
        bool opmode_notif_used;
        const struct ieee80211_he_cap_elem *he_capa;
        u8 he_capa_len;
        struct sta_txpwr txpwr;
        bool txpwr_set;
        const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
        const struct ieee80211_eht_cap_elem *eht_capa;
        u8 eht_capa_len;
        const struct ieee80211_s1g_cap *s1g_capa;
};

/**
 * struct link_station_del_parameters - link station deletion parameters
 *
 * Used to delete a link station entry (or all stations).
 *
 * @mld_mac: MAC address of the station
 * @link_id: the link ID
 */
struct link_station_del_parameters {
        const u8 *mld_mac;
        u32 link_id;
};

/**
 * struct cfg80211_ttlm_params - TID to link mapping parameters
 *
 * Used for setting a TID to link mapping.
 *
 * @dlink: Downlink TID to link mapping, as defined in section 9.4.2.314
 *     (TID-To-Link Mapping element) in Draft P802.11be_D4.0.
 * @ulink: Uplink TID to link mapping, as defined in section 9.4.2.314
 *     (TID-To-Link Mapping element) in Draft P802.11be_D4.0.
 */
struct cfg80211_ttlm_params {
        u16 dlink[8];
        u16 ulink[8];
};

/**
 * struct station_parameters - station parameters
 *
 * Used to change and create a new station.
 *
 * @vlan: vlan interface station should belong to
 * @sta_flags_mask: station flags that changed
 *        (bitmask of BIT(%NL80211_STA_FLAG_...))
 * @sta_flags_set: station flags values
 *        (bitmask of BIT(%NL80211_STA_FLAG_...))
 * @listen_interval: listen interval or -1 for no change
 * @aid: AID or zero for no change
 * @vlan_id: VLAN ID for station (if nonzero)
 * @peer_aid: mesh peer AID or zero for no change
 * @plink_action: plink action to take
 * @plink_state: set the peer link state for a station
 * @uapsd_queues: bitmap of queues configured for uapsd. same format
 *        as the AC bitmap in the QoS info field
 * @max_sp: max Service Period. same format as the MAX_SP in the
 *        QoS info field (but already shifted down)
 * @sta_modify_mask: bitmap indicating which parameters changed
 *        (for those that don't have a natural "no change" value),
 *        see &enum station_parameters_apply_mask
 * @local_pm: local link-specific mesh power save mode (no change when set
 *        to unknown)
 * @capability: station capability
 * @ext_capab: extended capabilities of the station
 * @ext_capab_len: number of extended capabilities
 * @supported_channels: supported channels in IEEE 802.11 format
 * @supported_channels_len: number of supported channels
 * @supported_oper_classes: supported oper classes in IEEE 802.11 format
 * @supported_oper_classes_len: number of supported operating classes
 * @support_p2p_ps: information if station supports P2P PS mechanism
 * @airtime_weight: airtime scheduler weight for this station
 * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is
 *        present/updated
 * @eml_cap: EML capabilities of this station
 * @link_sta_params: link-related parameters, see
 *        &struct link_station_parameters
 */
struct station_parameters {
        struct net_device *vlan;
        u32 sta_flags_mask, sta_flags_set;
        u32 sta_modify_mask;
        int listen_interval;
        u16 aid;
        u16 vlan_id;
        u16 peer_aid;
        u8 plink_action;
        u8 plink_state;
        u8 uapsd_queues;
        u8 max_sp;
        enum nl80211_mesh_power_mode local_pm;
        u16 capability;
        const u8 *ext_capab;
        u8 ext_capab_len;
        const u8 *supported_channels;
        u8 supported_channels_len;
        const u8 *supported_oper_classes;
        u8 supported_oper_classes_len;
        int support_p2p_ps;
        u16 airtime_weight;
        bool eml_cap_present;
        u16 eml_cap;
        struct link_station_parameters link_sta_params;
};

/**
 * struct station_del_parameters - station deletion parameters
 *
 * Used to delete a station entry (or all stations).
 *
 * @mac: MAC address of the station to remove or %NULL to remove all stations
 * @subtype: Management frame subtype to use for indicating removal
 *        (10 = Disassociation, 12 = Deauthentication)
 * @reason_code: Reason code for the Disassociation/Deauthentication frame
 * @link_id: Link ID indicating a link that stations to be flushed must be
 *        using; valid only for MLO, but can also be -1 for MLO to really
 *        remove all stations.
 */
struct station_del_parameters {
        const u8 *mac;
        u8 subtype;
        u16 reason_code;
        int link_id;
};

/**
 * enum cfg80211_station_type - the type of station being modified
 * @CFG80211_STA_AP_CLIENT: client of an AP interface
 * @CFG80211_STA_AP_CLIENT_UNASSOC: client of an AP interface that is still
 *        unassociated (updating properties for this type of client is
 *        permitted)
 * @CFG80211_STA_AP_MLME_CLIENT: client of an AP interface that has
 *        the AP MLME in the device
 * @CFG80211_STA_AP_STA: AP station on managed interface
 * @CFG80211_STA_IBSS: IBSS station
 * @CFG80211_STA_TDLS_PEER_SETUP: TDLS peer on managed interface (dummy entry
 *        while TDLS setup is in progress, it moves out of this state when
 *        being marked authorized; use this only if TDLS with external setup is
 *        supported/used)
 * @CFG80211_STA_TDLS_PEER_ACTIVE: TDLS peer on managed interface (active
 *        entry that is operating, has been marked authorized by userspace)
 * @CFG80211_STA_MESH_PEER_KERNEL: peer on mesh interface (kernel managed)
 * @CFG80211_STA_MESH_PEER_USER: peer on mesh interface (user managed)
 */
enum cfg80211_station_type {
        CFG80211_STA_AP_CLIENT,
        CFG80211_STA_AP_CLIENT_UNASSOC,
        CFG80211_STA_AP_MLME_CLIENT,
        CFG80211_STA_AP_STA,
        CFG80211_STA_IBSS,
        CFG80211_STA_TDLS_PEER_SETUP,
        CFG80211_STA_TDLS_PEER_ACTIVE,
        CFG80211_STA_MESH_PEER_KERNEL,
        CFG80211_STA_MESH_PEER_USER,
};

/**
 * cfg80211_check_station_change - validate parameter changes
 * @wiphy: the wiphy this operates on
 * @params: the new parameters for a station
 * @statype: the type of station being modified
 *
 * Utility function for the @change_station driver method. Call this function
 * with the appropriate station type after looking up the station (and checking
 * that it exists). It will verify whether the station change is acceptable.
 *
 * Return: 0 if the change is acceptable, otherwise an error code. Note that
 * it may modify the parameters for backward compatibility reasons, so don't
 * use them before calling this.
 */
int cfg80211_check_station_change(struct wiphy *wiphy,
                                  struct station_parameters *params,
                                  enum cfg80211_station_type statype);

/**
 * enum rate_info_flags - bitrate info flags
 *
 * Used by the driver to indicate the specific rate transmission
 * type (HT, VHT, HE, EHT, DMG/EDMG or S1G, see the individual flags).
 * Each flag is a distinct bit so that flags can be combined in the
 * @flags member of &struct rate_info.
 *
 * @RATE_INFO_FLAGS_MCS: mcs field filled with HT MCS
 * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS
 * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval
 * @RATE_INFO_FLAGS_DMG: 60GHz MCS
 * @RATE_INFO_FLAGS_HE_MCS: HE MCS information
 * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode
 * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS
 * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information
 * @RATE_INFO_FLAGS_S1G_MCS: MCS field filled with S1G MCS
 */
enum rate_info_flags {
        RATE_INFO_FLAGS_MCS                        = BIT(0),
        RATE_INFO_FLAGS_VHT_MCS                        = BIT(1),
        RATE_INFO_FLAGS_SHORT_GI                = BIT(2),
        RATE_INFO_FLAGS_DMG                        = BIT(3),
        RATE_INFO_FLAGS_HE_MCS                        = BIT(4),
        RATE_INFO_FLAGS_EDMG                        = BIT(5),
        RATE_INFO_FLAGS_EXTENDED_SC_DMG                = BIT(6),
        RATE_INFO_FLAGS_EHT_MCS                        = BIT(7),
        RATE_INFO_FLAGS_S1G_MCS                        = BIT(8),
};

/**
 * enum rate_info_bw - rate bandwidth information
 *
 * Used by the driver to indicate the rate bandwidth.
 *
 * Note that %RATE_INFO_BW_20 has the value 0, so a zero-initialized
 * bandwidth field means 20 MHz. The numeric order of the enumerators
 * is the declaration order below, not the order of this list.
 *
 * @RATE_INFO_BW_5: 5 MHz bandwidth
 * @RATE_INFO_BW_10: 10 MHz bandwidth
 * @RATE_INFO_BW_20: 20 MHz bandwidth
 * @RATE_INFO_BW_40: 40 MHz bandwidth
 * @RATE_INFO_BW_80: 80 MHz bandwidth
 * @RATE_INFO_BW_160: 160 MHz bandwidth
 * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
 * @RATE_INFO_BW_320: 320 MHz bandwidth
 * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation
 * @RATE_INFO_BW_1: 1 MHz bandwidth
 * @RATE_INFO_BW_2: 2 MHz bandwidth
 * @RATE_INFO_BW_4: 4 MHz bandwidth
 * @RATE_INFO_BW_8: 8 MHz bandwidth
 * @RATE_INFO_BW_16: 16 MHz bandwidth
 */
enum rate_info_bw {
        RATE_INFO_BW_20 = 0,
        RATE_INFO_BW_5,
        RATE_INFO_BW_10,
        RATE_INFO_BW_40,
        RATE_INFO_BW_80,
        RATE_INFO_BW_160,
        RATE_INFO_BW_HE_RU,
        RATE_INFO_BW_320,
        RATE_INFO_BW_EHT_RU,
        RATE_INFO_BW_1,
        RATE_INFO_BW_2,
        RATE_INFO_BW_4,
        RATE_INFO_BW_8,
        RATE_INFO_BW_16,
};

/**
 * struct rate_info - bitrate information
 *
 * Information about a receiving or transmitting bitrate
 *
 * @flags: bitmask of flags from &enum rate_info_flags
 * @legacy: bitrate in 100kbit/s for 802.11abg
 * @mcs: MCS index if struct describes an HT/VHT/HE/EHT/S1G rate
 * @nss: number of streams (VHT & HE only)
 * @bw: bandwidth (from &enum rate_info_bw)
 * @he_gi: HE guard interval (from &enum nl80211_he_gi)
 * @he_dcm: HE DCM value
 * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc,
 *        only valid if bw is %RATE_INFO_BW_HE_RU)
 * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4)
 * @eht_gi: EHT guard interval (from &enum nl80211_eht_gi)
 * @eht_ru_alloc: EHT RU allocation (from &enum nl80211_eht_ru_alloc,
 *        only valid if bw is %RATE_INFO_BW_EHT_RU)
 */
struct rate_info {
        u16 flags;
        u16 legacy;
        u8 mcs;
        u8 nss;
        u8 bw;
        u8 he_gi;
        u8 he_dcm;
        u8 he_ru_alloc;
        u8 n_bonded_ch;
        u8 eht_gi;
        u8 eht_ru_alloc;
};

/**
 * enum bss_param_flags - BSS parameter flags
 *
 * Flags for the @flags member of &struct sta_bss_parameters, indicating
 * which of the BSS features below are enabled.
 *
 * @BSS_PARAM_FLAGS_CTS_PROT: whether CTS protection is enabled
 * @BSS_PARAM_FLAGS_SHORT_PREAMBLE: whether short preamble is enabled
 * @BSS_PARAM_FLAGS_SHORT_SLOT_TIME: whether short slot time is enabled
 */
enum bss_param_flags {
        BSS_PARAM_FLAGS_CTS_PROT        = BIT(0),
        BSS_PARAM_FLAGS_SHORT_PREAMBLE        = BIT(1),
        BSS_PARAM_FLAGS_SHORT_SLOT_TIME        = BIT(2),
};

/**
 * struct sta_bss_parameters - BSS parameters for the attached station
 *
 * Information about the currently associated BSS
 *
 * @flags: bitmask of flags from &enum bss_param_flags
 * @dtim_period: DTIM period for the BSS
 * @beacon_interval: beacon interval for the BSS
 */
struct sta_bss_parameters {
        u8 flags;
        u8 dtim_period;
        u16 beacon_interval;
};

/**
 * struct cfg80211_txq_stats - TXQ statistics for this TID
 * @filled: bitmap of flags using the bits of &enum nl80211_txq_stats to
 *        indicate which values in this struct are filled
 * @backlog_bytes: total number of bytes currently backlogged
 * @backlog_packets: total number of packets currently backlogged
 * @flows: number of new flows seen
 * @drops: total number of packets dropped
 * @ecn_marks: total number of packets marked with ECN CE
 * @overlimit: number of drops due to queue space overflow
 * @overmemory: number of drops due to memory limit overflow
 * @collisions: number of hash collisions
 * @tx_bytes: total number of bytes dequeued
 * @tx_packets: total number of packets dequeued
 * @max_flows: maximum number of flows supported
 */
struct cfg80211_txq_stats {
        u32 filled;
        u32 backlog_bytes;
        u32 backlog_packets;
        u32 flows;
        u32 drops;
        u32 ecn_marks;
        u32 overlimit;
        u32 overmemory;
        u32 collisions;
        u32 tx_bytes;
        u32 tx_packets;
        u32 max_flows;
};

/**
 * struct cfg80211_tid_stats - per-TID statistics
 * @filled: bitmap of flags using the bits of &enum nl80211_tid_stats to
 *        indicate which values in this struct are filled
 * @rx_msdu: number of received MSDUs
 * @tx_msdu: number of (attempted) transmitted MSDUs
 * @tx_msdu_retries: number of retries (not counting the first) for
 *        transmitted MSDUs
 * @tx_msdu_failed: number of failed transmitted MSDUs
 * @txq_stats: TXQ statistics, see &struct cfg80211_txq_stats
 */
struct cfg80211_tid_stats {
        u32 filled;
        u64 rx_msdu;
        u64 tx_msdu;
        u64 tx_msdu_retries;
        u64 tx_msdu_failed;
        struct cfg80211_txq_stats txq_stats;
};

/* Number of entries in the per-chain signal arrays (chain_signal[], chain_signal_avg[]) */
#define IEEE80211_MAX_CHAINS        4

/**
 * struct link_station_info - link station information
 *
 * Link station information filled by driver for get_station() and
 *        dump_station().
 * @filled: bitmask of flags using the bits of &enum nl80211_sta_info to
 *        indicate which values in this struct are filled
 * @connected_time: time (in secs) since a link of station is last connected
 * @inactive_time: time since last activity for link station (tx/rx)
 *        in milliseconds
 * @assoc_at: boottime (ns) of the last association of link of station
 * @rx_bytes: bytes (size of MPDUs) received from this link of station
 * @tx_bytes: bytes (size of MPDUs) transmitted to this link of station
 * @signal: The signal strength, type depends on the wiphy's signal_type.
 *        For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
 * @signal_avg: Average signal strength, type depends on the wiphy's
 *        signal_type. For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_
 * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg
 * @chain_signal: per-chain signal strength of last received packet in dBm
 * @chain_signal_avg: per-chain signal strength average in dBm
 * @txrate: current unicast bitrate from this link of station
 * @rxrate: current unicast bitrate to this link of station
 * @rx_packets: packets (MSDUs & MMPDUs) received from this link of station
 * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this link of station
 * @tx_retries: cumulative retry counts (MPDUs) for this link of station
 * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK)
 * @rx_dropped_misc: Dropped for unspecified reason.
 * @bss_param: current BSS parameters
 * @beacon_loss_count: Number of times beacon loss event has triggered.
 * @expected_throughput: expected throughput in kbps (including 802.11 headers)
 *        towards this station.
 * @rx_beacon: number of beacons received from this peer
 * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
 *        from this peer
 * @rx_duration: aggregate PPDU duration (usecs) for all the frames from a peer
 * @tx_duration: aggregate PPDU duration (usecs) for all the frames to a peer
 * @airtime_weight: current airtime scheduling weight
 * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
 *        (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
 *        Note that this doesn't use the @filled bit, but is used if non-NULL.
 * @ack_signal: signal strength (in dBm) of the last ACK frame.
 * @avg_ack_signal: average RSSI value of the ACK packets for the MSDUs that
 *        have been sent.
 * @rx_mpdu_count: number of MPDUs received from this station
 * @fcs_err_count: number of packets (MPDUs) received from this station with
 *        an FCS error. This counter should be incremented only when TA of the
 *        received packet with an FCS error matches the peer MAC address.
 * @addr: For MLO STA connection, filled with address of the link of station.
 */
struct link_station_info {
        u64 filled;
        u32 connected_time;
        u32 inactive_time;
        u64 assoc_at;
        u64 rx_bytes;
        u64 tx_bytes;
        s8 signal;
        s8 signal_avg;

        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];
        s8 chain_signal_avg[IEEE80211_MAX_CHAINS];

        struct rate_info txrate;
        struct rate_info rxrate;
        u32 rx_packets;
        u32 tx_packets;
        u32 tx_retries;
        u32 tx_failed;
        u32 rx_dropped_misc;
        struct sta_bss_parameters bss_param;

        u32 beacon_loss_count;

        u32 expected_throughput;

        u64 tx_duration;
        u64 rx_duration;
        u64 rx_beacon;
        /*
         * NOTE(review): documented as dBm but stored in an unsigned type,
         * unlike the s8 signal members above - confirm the representation
         * expected by consumers.
         */
        u8 rx_beacon_signal_avg;

        u16 airtime_weight;

        s8 ack_signal;
        s8 avg_ack_signal;
        struct cfg80211_tid_stats *pertid;

        u32 rx_mpdu_count;
        u32 fcs_err_count;

        u8 addr[ETH_ALEN] __aligned(2);
};

/**
 * struct station_info - station information
 *
 * Station information filled by driver for get_station() and dump_station().
 *
 * @filled: bitmask of flags using the bits of &enum nl80211_sta_info to
 *        indicate which values in this struct are filled
 * @connected_time: time (in secs) since a station is last connected
 * @inactive_time: time since last station activity (tx/rx) in milliseconds
 * @assoc_at: boottime (ns) of the last association
 * @rx_bytes: bytes (size of MPDUs) received from this station
 * @tx_bytes: bytes (size of MPDUs) transmitted to this station
 * @signal: The signal strength, type depends on the wiphy's signal_type.
 *        For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
 * @signal_avg: Average signal strength, type depends on the wiphy's signal_type.
 *        For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
 * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg
 * @chain_signal: per-chain signal strength of last received packet in dBm
 * @chain_signal_avg: per-chain signal strength average in dBm
 * @txrate: current unicast bitrate from this station
 * @rxrate: current unicast bitrate to this station
 * @rx_packets: packets (MSDUs & MMPDUs) received from this station
 * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this station
 * @tx_retries: cumulative retry counts (MPDUs)
 * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK)
 * @rx_dropped_misc: Dropped for unspecified reason.
 * @bss_param: current BSS parameters
 * @generation: generation number for nl80211 dumps.
 *        This number should increase every time the list of stations
 *        changes, i.e. when a station is added or removed, so that
 *        userspace can tell whether it got a consistent snapshot.
 * @beacon_loss_count: Number of times beacon loss event has triggered.
 * @assoc_req_ies: IEs from (Re)Association Request.
 *        This is used only when in AP mode with drivers that do not use
 *        user space MLME/SME implementation. The information is provided for
 *        the cfg80211_new_sta() calls to notify user space of the IEs.
 * @assoc_req_ies_len: Length of @assoc_req_ies buffer in octets.
 * @sta_flags: station flags mask & values
 * @t_offset: Time offset of the station relative to this host.
 * @llid: mesh local link id
 * @plid: mesh peer link id
 * @plink_state: mesh peer link state
 * @connected_to_gate: true if mesh STA has a path to mesh gate
 * @connected_to_as: true if mesh STA has a path to authentication server
 * @airtime_link_metric: mesh airtime link metric.
 * @local_pm: local mesh STA power save mode
 * @peer_pm: peer mesh STA power save mode
 * @nonpeer_pm: non-peer mesh STA power save mode
 * @expected_throughput: expected throughput in kbps (including 802.11 headers)
 *        towards this station.
 * @rx_beacon: number of beacons received from this peer
 * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
 *        from this peer
 * @rx_duration: aggregate PPDU duration (usecs) for all the frames from a peer
 * @tx_duration: aggregate PPDU duration (usecs) for all the frames to a peer
 * @airtime_weight: current airtime scheduling weight
 * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
 *        (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
 *        Note that this doesn't use the @filled bit, but is used if non-NULL.
 * @ack_signal: signal strength (in dBm) of the last ACK frame.
 * @avg_ack_signal: average RSSI value of the ACK packets for the MSDUs that
 *        have been sent.
 * @rx_mpdu_count: number of MPDUs received from this station
 * @fcs_err_count: number of packets (MPDUs) received from this station with
 *        an FCS error. This counter should be incremented only when TA of the
 *        received packet with an FCS error matches the peer MAC address.
 * @mlo_params_valid: Indicates @assoc_link_id and @mld_addr fields are filled
 *        by driver. Drivers use this only in cfg80211_new_sta() calls when AP
 *        MLD's MLME/SME is offloaded to the driver. Drivers won't fill this
 *        information in cfg80211_del_sta_sinfo(), get_station() and
 *        dump_station() callbacks.
 * @assoc_link_id: Indicates MLO link ID of the AP, with which the station
 *        completed (re)association. This information is filled for both MLO
 *        and non-MLO STA connections when the AP is affiliated with an MLD.
 * @mld_addr: For MLO STA connection, filled with MLD address of the station.
 *        For non-MLO STA connection, filled with all zeros.
 * @assoc_resp_ies: IEs from (Re)Association Response.
 *        This is used only when in AP mode with drivers that do not use user
 *        space MLME/SME implementation. The information is provided only for the
 *        cfg80211_new_sta() calls to notify user space of the IEs. Drivers won't
 *        fill this information in cfg80211_del_sta_sinfo(), get_station() and
 *        dump_station() callbacks. User space needs this information to determine
 *        the accepted and rejected affiliated links of the connected station.
 * @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets.
 * @valid_links: bitmap of valid links, or 0 for non-MLO. Drivers fill this
 *        information in cfg80211_new_sta(), cfg80211_del_sta_sinfo(),
 *        get_station() and dump_station() callbacks.
 * @links: reference to Link sta entries for MLO STA, all link specific
 *        information is accessed through links[link_id].
 */
struct station_info {
        u64 filled;
        u32 connected_time;
        u32 inactive_time;
        u64 assoc_at;
        u64 rx_bytes;
        u64 tx_bytes;
        s8 signal;
        s8 signal_avg;

        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];
        s8 chain_signal_avg[IEEE80211_MAX_CHAINS];

        struct rate_info txrate;
        struct rate_info rxrate;
        u32 rx_packets;
        u32 tx_packets;
        u32 tx_retries;
        u32 tx_failed;
        u32 rx_dropped_misc;
        struct sta_bss_parameters bss_param;
        struct nl80211_sta_flag_update sta_flags;

        int generation;

        u32 beacon_loss_count;

        const u8 *assoc_req_ies;
        size_t assoc_req_ies_len;

        s64 t_offset;
        u16 llid;
        u16 plid;
        u8 plink_state;
        u8 connected_to_gate;
        u8 connected_to_as;
        u32 airtime_link_metric;
        enum nl80211_mesh_power_mode local_pm;
        enum nl80211_mesh_power_mode peer_pm;
        enum nl80211_mesh_power_mode nonpeer_pm;

        u32 expected_throughput;

        u16 airtime_weight;

        s8 ack_signal;
        s8 avg_ack_signal;
        struct cfg80211_tid_stats *pertid;

        u64 tx_duration;
        u64 rx_duration;
        u64 rx_beacon;
        /*
         * NOTE(review): documented as dBm but stored in an unsigned type,
         * unlike the s8 signal members above - confirm the representation
         * expected by consumers.
         */
        u8 rx_beacon_signal_avg;

        u32 rx_mpdu_count;
        u32 fcs_err_count;

        bool mlo_params_valid;
        u8 assoc_link_id;
        u8 mld_addr[ETH_ALEN] __aligned(2);
        const u8 *assoc_resp_ies;
        size_t assoc_resp_ies_len;

        u16 valid_links;
        struct link_station_info *links[IEEE80211_MLD_MAX_NUM_LINKS];
};

/**
 * struct cfg80211_sar_sub_specs - sub specs limit
 * @power: power limitation in units of 0.25 dBm
 * @freq_range_index: index the power limitation applies to
 */
struct cfg80211_sar_sub_specs {
        s32 power;
        u32 freq_range_index;
};

/**
 * struct cfg80211_sar_specs - sar limit specs
 * @type: SAR limit type (&enum nl80211_sar_type); it's set with power in
 *        units of 0.25 dBm or other types
 * @num_sub_specs: number of sar sub specs
 * @sub_specs: flexible array holding the @num_sub_specs sar sub specs
 */
struct cfg80211_sar_specs {
        enum nl80211_sar_type type;
        u32 num_sub_specs;
        struct cfg80211_sar_sub_specs sub_specs[] __counted_by(num_sub_specs);
};


/**
 * struct cfg80211_sar_freq_ranges - sar frequency ranges
 * @start_freq: start edge frequency of the range
 * @end_freq: end edge frequency of the range
 */
struct cfg80211_sar_freq_ranges {
        u32 start_freq;
        u32 end_freq;
};

/**
 * struct cfg80211_sar_capa - sar limit capability
 * @type: SAR limit type (&enum nl80211_sar_type); it's set via power in
 *        units of 0.25 dBm or other types
 * @num_freq_ranges: number of frequency ranges
 * @freq_ranges: pointer to the frequency ranges
 *        (&struct cfg80211_sar_freq_ranges).
 *
 * Note: WLAN driver may append new ranges or split an existing
 * range to small ones and then append them.
 */
struct cfg80211_sar_capa {
        enum nl80211_sar_type type;
        u32 num_freq_ranges;
        const struct cfg80211_sar_freq_ranges *freq_ranges;
};

#if IS_ENABLED(CONFIG_CFG80211)
/**
 * cfg80211_get_station - retrieve information about a given station
 * @dev: the device where the station is supposed to be connected to
 * @mac_addr: the mac address of the station of interest
 * @sinfo: pointer to the structure to fill with the information
 *
 * Return: 0 on success and sinfo is filled with the available information
 * otherwise returns a negative error code and the content of sinfo has to be
 * considered undefined.
 *
 * When CONFIG_CFG80211 is not enabled, an inline stub is provided instead
 * that always returns -ENOENT and leaves sinfo untouched.
 */
int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
                         struct station_info *sinfo);
#else
/* cfg80211 is not built: no station can be looked up, always fail */
static inline int cfg80211_get_station(struct net_device *dev,
                                       const u8 *mac_addr,
                                       struct station_info *sinfo)
{
        return -ENOENT;
}
#endif

/**
 * enum monitor_flags - monitor flags
 *
 * Monitor interface configuration flags. Note that these must be the bits
 * according to the nl80211 flags: each flag is the bit at the position given
 * by the corresponding NL80211_MNTR_FLAG_* value, and %MONITOR_FLAG_CHANGED
 * uses the bit at position __NL80211_MNTR_FLAG_INVALID.
 *
 * @MONITOR_FLAG_CHANGED: set if the flags were changed
 * @MONITOR_FLAG_FCSFAIL: pass frames with bad FCS
 * @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP
 * @MONITOR_FLAG_CONTROL: pass control frames
 * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering
 * @MONITOR_FLAG_COOK_FRAMES: deprecated, will unconditionally be refused
 * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address
 * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames
 */
enum monitor_flags {
        MONITOR_FLAG_CHANGED                = BIT(__NL80211_MNTR_FLAG_INVALID),
        MONITOR_FLAG_FCSFAIL                = BIT(NL80211_MNTR_FLAG_FCSFAIL),
        MONITOR_FLAG_PLCPFAIL                = BIT(NL80211_MNTR_FLAG_PLCPFAIL),
        MONITOR_FLAG_CONTROL                = BIT(NL80211_MNTR_FLAG_CONTROL),
        MONITOR_FLAG_OTHER_BSS                = BIT(NL80211_MNTR_FLAG_OTHER_BSS),
        MONITOR_FLAG_COOK_FRAMES        = BIT(NL80211_MNTR_FLAG_COOK_FRAMES),
        MONITOR_FLAG_ACTIVE                = BIT(NL80211_MNTR_FLAG_ACTIVE),
        MONITOR_FLAG_SKIP_TX                = BIT(NL80211_MNTR_FLAG_SKIP_TX),
};

/**
 * enum mpath_info_flags - mesh path information flags
 *
 * Used by the driver to indicate which info in &struct mpath_info it has filled
 * in during get_mpath() or dump_mpath().
 *
 * @MPATH_INFO_FRAME_QLEN: @frame_qlen filled
 * @MPATH_INFO_SN: @sn filled
 * @MPATH_INFO_METRIC: @metric filled
 * @MPATH_INFO_EXPTIME: @exptime filled
 * @MPATH_INFO_DISCOVERY_TIMEOUT: @discovery_timeout filled
 * @MPATH_INFO_DISCOVERY_RETRIES: @discovery_retries filled
 * @MPATH_INFO_FLAGS: @flags filled
 * @MPATH_INFO_HOP_COUNT: @hop_count filled
 * @MPATH_INFO_PATH_CHANGE: @path_change_count filled
 */
enum mpath_info_flags {
        MPATH_INFO_FRAME_QLEN                = BIT(0),
        MPATH_INFO_SN                        = BIT(1),
        MPATH_INFO_METRIC                = BIT(2),
        MPATH_INFO_EXPTIME                = BIT(3),
        MPATH_INFO_DISCOVERY_TIMEOUT        = BIT(4),
        MPATH_INFO_DISCOVERY_RETRIES        = BIT(5),
        MPATH_INFO_FLAGS                = BIT(6),
        MPATH_INFO_HOP_COUNT                = BIT(7),
        MPATH_INFO_PATH_CHANGE                = BIT(8),
};

/**
 * struct mpath_info - mesh path information
 *
 * Mesh path information filled by driver for get_mpath() and dump_mpath().
 * The driver uses @filled to indicate which of the other members it has
 * filled in (see &enum mpath_info_flags).
 *
 * @filled: bitfield of flags from &enum mpath_info_flags
 * @frame_qlen: number of queued frames for this destination
 * @sn: target sequence number
 * @metric: metric (cost) of this mesh path
 * @exptime: expiration time for the mesh path from now, in msecs
 * @flags: mesh path flags from &enum mesh_path_flags
 * @discovery_timeout: total mesh path discovery timeout, in msecs
 * @discovery_retries: mesh path discovery retries
 * @generation: generation number for nl80211 dumps.
 *        This number should increase every time the list of mesh paths
 *        changes, i.e. when a station is added or removed, so that
 *        userspace can tell whether it got a consistent snapshot.
 * @hop_count: hops to destination
 * @path_change_count: total number of path changes to destination
 */
struct mpath_info {
        u32 filled;
        u32 frame_qlen;
        u32 sn;
        u32 metric;
        u32 exptime;
        u32 discovery_timeout;
        u8 discovery_retries;
        u8 flags;
        u8 hop_count;
        u32 path_change_count;

        int generation;
};

/**
 * enum wiphy_bss_param_flags - bit flags for supported BSS parameters.
 *
 * Each value is a single-bit mask (not a bit index), so values can be
 * OR-ed together.
 *
 * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection.
 * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage.
 * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage.
 * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates.
 * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation.
 * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode.
 * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow.
 * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save.
 */
enum wiphy_bss_param_flags {
        WIPHY_BSS_PARAM_CTS_PROT = BIT(0),
        WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1),
        WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2),
        WIPHY_BSS_PARAM_BASIC_RATES = BIT(3),
        WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4),
        WIPHY_BSS_PARAM_HT_OPMODE = BIT(5),
        WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6),
        WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7),
};

/**
 * struct bss_parameters - BSS parameters
 *
 * Used to change BSS parameters (mainly for AP mode).
 *
 * @link_id: link ID, or -1 for non-MLD
 * @use_cts_prot: Whether to use CTS protection
 *        (0 = no, 1 = yes, -1 = do not change)
 * @use_short_preamble: Whether the use of short preambles is allowed
 *        (0 = no, 1 = yes, -1 = do not change)
 * @use_short_slot_time: Whether the use of short slot time is allowed
 *        (0 = no, 1 = yes, -1 = do not change)
 * @basic_rates: basic rates in IEEE 802.11 format
 *        (or NULL for no change)
 * @basic_rates_len: number of basic rates
 * @ap_isolate: do not forward packets between connected stations
 *        (0 = no, 1 = yes, -1 = do not change)
 * @ht_opmode: HT Operation mode
 *        (u16 = opmode, -1 = do not change)
 * @p2p_ctwindow: P2P CT Window (-1 = no change)
 * @p2p_opp_ps: P2P opportunistic PS (-1 = no change)
 */
struct bss_parameters {
        int link_id;
        int use_cts_prot;
        int use_short_preamble;
        int use_short_slot_time;
        const u8 *basic_rates;
        u8 basic_rates_len;
        int ap_isolate;
        int ht_opmode;
        s8 p2p_ctwindow, p2p_opp_ps;
};

/**
 * struct mesh_config - 802.11s mesh configuration
 *
 * These parameters can be changed while the mesh is active.
 *
 * @dot11MeshRetryTimeout: the initial retry timeout in millisecond units used
 *        by the Mesh Peering Open message
 * @dot11MeshConfirmTimeout: the initial confirm timeout in millisecond units
 *        used by the mesh peering management
 * @dot11MeshHoldingTimeout: the holding timeout in millisecond units used by
 *        the mesh peering management to close a mesh peering
 * @dot11MeshMaxPeerLinks: the maximum number of peer links allowed on this
 *        mesh interface
 * @dot11MeshMaxRetries: the maximum number of peer link open retries that can
 *        be sent to establish a new peer link instance in a mesh
 * @dot11MeshTTL: the value of TTL field set at a source mesh STA
 * @element_ttl: the value of TTL field set at a mesh STA for path selection
 *        elements
 * @auto_open_plinks: whether we should automatically open peer links when we
 *        detect compatible mesh peers
 * @dot11MeshNbrOffsetMaxNeighbor: the maximum number of neighbors to
 *        synchronize to for 11s default synchronization method
 * @dot11MeshHWMPmaxPREQretries: the number of action frames containing a PREQ
 *        that an originator mesh STA can send to a particular path target
 * @path_refresh_time: how frequently to refresh mesh paths in milliseconds
 * @min_discovery_timeout: the minimum length of time to wait until giving up on
 *        a path discovery in milliseconds
 * @dot11MeshHWMPactivePathTimeout: the time (in TUs) for which mesh STAs
 *        receiving a PREQ shall consider the forwarding information from the
 *        root to be valid. (TU = time unit)
 * @dot11MeshHWMPpreqMinInterval: the minimum interval of time (in TUs) during
 *        which a mesh STA can send only one action frame containing a PREQ
 *        element
 * @dot11MeshHWMPperrMinInterval: the minimum interval of time (in TUs) during
 *        which a mesh STA can send only one Action frame containing a PERR
 *        element
 * @dot11MeshHWMPnetDiameterTraversalTime: the interval of time (in TUs) that
 *        it takes for an HWMP information element to propagate across the mesh
 * @dot11MeshHWMPRootMode: the configuration of a mesh STA as root mesh STA
 * @dot11MeshHWMPRannInterval: the interval of time (in TUs) between root
 *        announcements are transmitted
 * @dot11MeshGateAnnouncementProtocol: whether to advertise that this mesh
 *        station has access to a broader network beyond the MBSS. (This is
 *        misnamed in draft 12.0: dot11MeshGateAnnouncementProtocol set to true
 *        only means that the station will announce others it's a mesh gate, but
 *        not necessarily using the gate announcement protocol. Still keeping the
 *        same nomenclature to be in sync with the spec)
 * @dot11MeshForwarding: whether the Mesh STA is forwarding or non-forwarding
 *        entity (default is TRUE - forwarding entity)
 * @rssi_threshold: the threshold for average signal strength of candidate
 *        station to establish a peer link
 * @ht_opmode: mesh HT protection mode
 *
 * @dot11MeshHWMPactivePathToRootTimeout: The time (in TUs) for which mesh STAs
 *        receiving a proactive PREQ shall consider the forwarding information to
 *        the root mesh STA to be valid.
 *
 * @dot11MeshHWMProotInterval: The interval of time (in TUs) between proactive
 *        PREQs are transmitted.
 * @dot11MeshHWMPconfirmationInterval: The minimum interval of time (in TUs)
 *        during which a mesh STA can send only one Action frame containing
 *        a PREQ element for root path confirmation.
 * @power_mode: The default mesh power save mode which will be the initial
 *        setting for new peer links.
 * @dot11MeshAwakeWindowDuration: The duration in TUs the STA will remain awake
 *        after transmitting its beacon.
 * @plink_timeout: If no tx activity is seen from a STA we've established
 *        peering with for longer than this time (in seconds), then remove it
 *        from the STA's list of peers.  Default is 30 minutes.
 * @dot11MeshConnectedToAuthServer: if set to true then this mesh STA
 *        will advertise that it is connected to an authentication server
 *        in the mesh formation field.
 * @dot11MeshConnectedToMeshGate: if set to true, advertise that this STA is
 *        connected to a mesh gate in mesh formation info.  If false, the
 *        value in mesh formation is determined by the presence of root paths
 *        in the mesh path table
 * @dot11MeshNolearn: Try to avoid multi-hop path discovery (e.g. PREQ/PREP
 *        for HWMP) if the destination is a direct neighbor. Note that this might
 *        not be the optimal decision as a multi-hop route might be better. So
 *        if using this setting you will likely also want to disable
 *        dot11MeshForwarding and use another mesh routing protocol on top.
 */
struct mesh_config {
        u16 dot11MeshRetryTimeout;
        u16 dot11MeshConfirmTimeout;
        u16 dot11MeshHoldingTimeout;
        u16 dot11MeshMaxPeerLinks;
        u8 dot11MeshMaxRetries;
        u8 dot11MeshTTL;
        u8 element_ttl;
        bool auto_open_plinks;
        u32 dot11MeshNbrOffsetMaxNeighbor;
        u8 dot11MeshHWMPmaxPREQretries;
        u32 path_refresh_time;
        u16 min_discovery_timeout;
        u32 dot11MeshHWMPactivePathTimeout;
        u16 dot11MeshHWMPpreqMinInterval;
        u16 dot11MeshHWMPperrMinInterval;
        u16 dot11MeshHWMPnetDiameterTraversalTime;
        u8 dot11MeshHWMPRootMode;
        bool dot11MeshConnectedToMeshGate;
        bool dot11MeshConnectedToAuthServer;
        u16 dot11MeshHWMPRannInterval;
        bool dot11MeshGateAnnouncementProtocol;
        bool dot11MeshForwarding;
        s32 rssi_threshold;
        u16 ht_opmode;
        u32 dot11MeshHWMPactivePathToRootTimeout;
        u16 dot11MeshHWMProotInterval;
        u16 dot11MeshHWMPconfirmationInterval;
        enum nl80211_mesh_power_mode power_mode;
        u16 dot11MeshAwakeWindowDuration;
        u32 plink_timeout;
        bool dot11MeshNolearn;
};

/**
 * struct mesh_setup - 802.11s mesh setup configuration
 * @chandef: defines the channel to use
 * @mesh_id: the mesh ID
 * @mesh_id_len: length of the mesh ID, at least 1 and at most 32 bytes
 * @sync_method: which synchronization method to use
 * @path_sel_proto: which path selection protocol to use
 * @path_metric: which metric to use
 * @auth_id: which authentication method this mesh is using
 * @ie: vendor information elements (optional)
 * @ie_len: length of vendor information elements
 * @is_authenticated: this mesh requires authentication
 * @is_secure: this mesh uses security
 * @user_mpm: userspace handles all MPM functions
 * @dtim_period: DTIM period to use
 * @beacon_interval: beacon interval to use
 * @mcast_rate: multicast rate for Mesh Node [6Mbps is the default for 802.11a];
 *        array with one entry per band (NUM_NL80211_BANDS entries)
 * @basic_rates: basic rates to use when creating the mesh
 * @beacon_rate: bitrate to be used for beacons
 * @userspace_handles_dfs: whether user space controls DFS operation, i.e.
 *        changes the channel when a radar is detected. This is required
 *        to operate on DFS channels.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 *
 * These parameters are fixed when the mesh is created.
 */
struct mesh_setup {
        struct cfg80211_chan_def chandef;
        const u8 *mesh_id;
        u8 mesh_id_len;
        u8 sync_method;
        u8 path_sel_proto;
        u8 path_metric;
        u8 auth_id;
        const u8 *ie;
        u8 ie_len;
        bool is_authenticated;
        bool is_secure;
        bool user_mpm;
        u8 dtim_period;
        u16 beacon_interval;
        int mcast_rate[NUM_NL80211_BANDS];
        u32 basic_rates;
        struct cfg80211_bitrate_mask beacon_rate;
        bool userspace_handles_dfs;
        bool control_port_over_nl80211;
};

/**
 * struct ocb_setup - 802.11p OCB mode setup configuration
 * @chandef: defines the channel to use
 *
 * These parameters are fixed when connecting to the network.
 */
struct ocb_setup {
        struct cfg80211_chan_def chandef;
};

/**
 * struct ieee80211_txq_params - TX queue parameters
 * @ac: AC identifier, see &enum nl80211_ac
 * @txop: Maximum burst time in units of 32 usecs, 0 meaning disabled
 * @cwmin: Minimum contention window [a value of the form 2^n-1 in the range
 *        1..32767]
 * @cwmax: Maximum contention window [a value of the form 2^n-1 in the range
 *        1..32767]
 * @aifs: Arbitration interframe space [0..255]
 * @link_id: link ID, or -1 for non-MLD
 */
struct ieee80211_txq_params {
        enum nl80211_ac ac;
        u16 txop;
        u16 cwmin;
        u16 cwmax;
        u8 aifs;
        int link_id;
};

/**
 * DOC: Scanning and BSS list handling
 *
 * The scanning process itself is fairly simple, but cfg80211 offers quite
 * a bit of helper functionality. To start a scan, the scan operation will
 * be invoked with a scan definition. This scan definition contains the
 * channels to scan, and the SSIDs to send probe requests for (including the
 * wildcard, if desired). A passive scan is indicated by having no SSIDs to
 * probe. Additionally, a scan request may contain extra information elements
 * that should be added to the probe request. The IEs are guaranteed to be
 * well-formed, and will not exceed the maximum length the driver advertised
 * in the wiphy structure.
 *
 * When scanning finds a BSS, cfg80211 needs to be notified of that, because
 * it is responsible for maintaining the BSS list; the driver should not
 * maintain a list itself. For this notification, various functions exist.
 *
 * Since drivers do not maintain a BSS list, there are also a number of
 * functions to search for a BSS and obtain information about it from the
 * BSS structure cfg80211 maintains. The BSS list is also made available
 * to userspace.
 */

/**
 * struct cfg80211_ssid - SSID description
 * @ssid: the SSID; the buffer holds up to IEEE80211_MAX_SSID_LEN octets
 * @ssid_len: length of @ssid
 */
struct cfg80211_ssid {
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        u8 ssid_len;
};

/**
 * struct cfg80211_scan_info - information about completed scan
 * @scan_start_tsf: scan start time in terms of the TSF of the BSS that the
 *        wireless device that requested the scan is connected to. If this
 *        information is not available, this field is left zero.
 * @tsf_bssid: the BSSID according to which @scan_start_tsf is set.
 * @aborted: set to true if the scan was aborted for any reason,
 *        userspace will be notified of that
 */
struct cfg80211_scan_info {
        u64 scan_start_tsf;
        u8 tsf_bssid[ETH_ALEN] __aligned(2);
        bool aborted;
};

/**
 * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only
 *
 * @short_ssid: short ssid to scan for
 * @bssid: bssid to scan for
 * @channel_idx: index of the channel in the channel array in the scan request
 *        which the above info is relevant to
 * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU
 * @short_ssid_valid: @short_ssid is valid and can be used
 * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait
 *        20 TUs before starting to send probe requests.
 * @psd_20: The AP's 20 MHz PSD value.
 */
struct cfg80211_scan_6ghz_params {
        u32 short_ssid;
        u32 channel_idx;
        u8 bssid[ETH_ALEN];
        bool unsolicited_probe;
        bool short_ssid_valid;
        bool psc_no_listen;
        s8 psd_20;
};

/**
 * struct cfg80211_scan_request - scan request description
 *
 * @ssids: SSIDs to scan for (active scan only)
 * @n_ssids: number of SSIDs
 * @channels: channels to scan on; this is a flexible array and must remain
 *        the last member, see @n_channels for the number of channels.
 * @n_channels: total number of channels to scan
 * @ie: optional information element(s) to add into Probe Request or %NULL
 * @ie_len: length of @ie in octets
 * @duration: how long to listen on each channel, in TUs. If
 *        @duration_mandatory is not set, this is the maximum dwell time and
 *        the actual dwell time may be shorter.
 * @duration_mandatory: if set, the scan duration must be as specified by the
 *        @duration field.
 * @flags: control flags from &enum nl80211_scan_flags
 * @rates: bitmap of rates to advertise for each band
 * @wiphy: the wiphy this was for
 * @scan_start: time (in jiffies) when the scan started
 * @wdev: the wireless device to scan for
 * @no_cck: used to send probe requests at non CCK rate in 2GHz band
 * @mac_addr: MAC address used with randomisation
 * @mac_addr_mask: MAC address mask used with randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @scan_6ghz: relevant for split scan request only,
 *        true if this is a 6 GHz scan request
 * @first_part: %true if this is the first part of a split scan request or a
 *        scan that was not split. May be %true for a @scan_6ghz scan if no other
 *        channels were requested
 * @n_6ghz_params: number of 6 GHz params
 * @scan_6ghz_params: 6 GHz params
 * @bssid: BSSID to scan for (most commonly, the wildcard BSSID)
 * @tsf_report_link_id: for MLO, indicates the link ID of the BSS that should be
 *        used for TSF reporting. Can be set to -1 to indicate no preference.
 */
struct cfg80211_scan_request {
        struct cfg80211_ssid *ssids;
        int n_ssids;
        u32 n_channels;
        const u8 *ie;
        size_t ie_len;
        u16 duration;
        bool duration_mandatory;
        u32 flags;

        u32 rates[NUM_NL80211_BANDS];

        struct wireless_dev *wdev;

        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);
        u8 bssid[ETH_ALEN] __aligned(2);
        struct wiphy *wiphy;
        unsigned long scan_start;
        bool no_cck;
        bool scan_6ghz;
        bool first_part;
        u32 n_6ghz_params;
        struct cfg80211_scan_6ghz_params *scan_6ghz_params;
        s8 tsf_report_link_id;

        /* keep last */
        struct ieee80211_channel *channels[];
};

/**
 * get_random_mask_addr - build a partially randomised MAC address
 * @buf: output buffer; ETH_ALEN bytes are written
 * @addr: address supplying the bits that are to be kept
 * @mask: bits that are 1 are taken from @addr, bits that are 0 stay random
 *
 * @buf is first filled with ETH_ALEN random bytes. Then, one byte at a time,
 * the bits selected by @mask are cleared in @buf and replaced with the
 * corresponding bits of @addr.
 */
static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
{
        u8 *octet = buf;
        u8 *const end = buf + ETH_ALEN;

        get_random_bytes(buf, ETH_ALEN);
        while (octet != end) {
                /* drop the random bits that the mask pins to @addr ... */
                *octet &= ~*mask;
                /* ... and fill them in from the fixed address */
                *octet |= *addr & *mask;
                octet++;
                addr++;
                mask++;
        }
}

/**
 * struct cfg80211_match_set - sets of attributes to match
 *
 * @ssid: SSID to be matched; may be zero-length in case of BSSID match
 *        or no match (RSSI only)
 * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match
 *        or no match (RSSI only)
 * @rssi_thold: don't report scan results below this threshold (in dBm,
 *        stored as s32)
 */
struct cfg80211_match_set {
        struct cfg80211_ssid ssid;
        u8 bssid[ETH_ALEN];
        s32 rssi_thold;
};

/**
 * struct cfg80211_sched_scan_plan - scan plan for scheduled scan
 *
 * @interval: interval between scheduled scan iterations, in seconds.
 * @iterations: number of scan iterations in this scan plan. Zero means
 *        infinite loop.
 *        The last scan plan will always have this parameter set to zero,
 *        all other scan plans will have a finite number of iterations.
 */
struct cfg80211_sched_scan_plan {
        u32 interval;
        u32 iterations;
};

/**
 * struct cfg80211_bss_select_adjust - BSS selection with RSSI adjustment.
 *
 * @band: band of BSS which should match for RSSI level adjustment.
 * @delta: value of RSSI level adjustment (signed; described as a delta in dB
 *        where this struct is used by &struct cfg80211_sched_scan_request).
 */
struct cfg80211_bss_select_adjust {
        enum nl80211_band band;
        s8 delta;
};

/**
 * struct cfg80211_sched_scan_request - scheduled scan request description
 *
 * @reqid: identifies this request.
 * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans)
 * @n_ssids: number of SSIDs
 * @n_channels: total number of channels to scan
 * @ie: optional information element(s) to add into Probe Request or %NULL
 * @ie_len: length of @ie in octets
 * @flags: control flags from &enum nl80211_scan_flags
 * @match_sets: sets of parameters to be matched for a scan result
 *        entry to be considered valid and to be passed to the host
 *        (others are filtered out).
 *        If omitted, all results are passed.
 * @n_match_sets: number of match sets
 * @report_results: indicates that results were reported for this request
 * @wiphy: the wiphy this was for
 * @dev: the interface
 * @scan_start: start time of the scheduled scan
 * @channels: channels to scan; flexible array annotated as counted by
 *        @n_channels, must remain the last member
 * @min_rssi_thold: for drivers only supporting a single threshold, this
 *        contains the minimum over all matchsets
 * @mac_addr: MAC address used with randomisation
 * @mac_addr_mask: MAC address mask used with randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @scan_plans: scan plans to be executed in this scheduled scan. Lowest
 *        index must be executed first.
 * @n_scan_plans: number of scan plans, at least 1.
 * @rcu_head: RCU callback used to free the struct
 * @owner_nlportid: netlink portid of owner (if this is a request
 *        owned by a particular socket)
 * @nl_owner_dead: netlink owner socket was closed - this request is to be
 *        freed
 * @list: for keeping list of requests.
 * @delay: delay in seconds to use before starting the first scan
 *        cycle.  The driver may ignore this parameter and start
 *        immediately (or at any other time), if this feature is not
 *        supported.
 * @relative_rssi_set: Indicates whether @relative_rssi is set or not.
 * @relative_rssi: Relative RSSI threshold in dB to restrict scan result
 *        reporting in connected state to cases where a matching BSS is determined
 *        to have better or slightly worse RSSI than the current connected BSS.
 *        The relative RSSI threshold values are ignored in disconnected state.
 * @rssi_adjust: delta dB of RSSI preference to be given to the BSSs that belong
 *        to the specified band while deciding whether a better BSS is reported
 *        using @relative_rssi. If delta is a negative number, the BSSs that
 *        belong to the specified band will be penalized by delta dB in relative
 *        comparisons.
 */
struct cfg80211_sched_scan_request {
        u64 reqid;
        struct cfg80211_ssid *ssids;
        int n_ssids;
        u32 n_channels;
        const u8 *ie;
        size_t ie_len;
        u32 flags;
        struct cfg80211_match_set *match_sets;
        int n_match_sets;
        s32 min_rssi_thold;
        u32 delay;
        struct cfg80211_sched_scan_plan *scan_plans;
        int n_scan_plans;

        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);

        bool relative_rssi_set;
        s8 relative_rssi;
        struct cfg80211_bss_select_adjust rssi_adjust;

        /* internal */
        struct wiphy *wiphy;
        struct net_device *dev;
        unsigned long scan_start;
        bool report_results;
        struct rcu_head rcu_head;
        u32 owner_nlportid;
        bool nl_owner_dead;
        struct list_head list;

        /* keep last */
        struct ieee80211_channel *channels[] __counted_by(n_channels);
};

/**
 * enum cfg80211_signal_type - how a reported signal strength is expressed
 *
 * @CFG80211_SIGNAL_TYPE_NONE: no signal strength information available
 * @CFG80211_SIGNAL_TYPE_MBM: signal strength in mBm (100*dBm)
 * @CFG80211_SIGNAL_TYPE_UNSPEC: signal strength, increasing from 0 through 100
 */
enum cfg80211_signal_type {
        CFG80211_SIGNAL_TYPE_NONE   = 0,
        CFG80211_SIGNAL_TYPE_MBM    = 1,
        CFG80211_SIGNAL_TYPE_UNSPEC = 2,
};

/**
 * struct cfg80211_inform_bss - BSS inform data
 * @chan: channel the frame was received on
 * @signal: signal strength value, according to the wiphy's
 *        signal type
 * @boottime_ns: timestamp (CLOCK_BOOTTIME) when the information was
 *        received; should match the time when the frame was actually
 *        received by the device (not just by the host, in case it was
 *        buffered on the device) and be accurate to about 10ms.
 *        If the frame isn't buffered, just passing the return value of
 *        ktime_get_boottime_ns() is likely appropriate.
 * @parent_tsf: the time at the start of reception of the first octet of the
 *        timestamp field of the frame. The time is the TSF of the BSS specified
 *        by @parent_bssid.
 * @parent_bssid: the BSS according to which @parent_tsf is set. This is set to
 *        the BSS that requested the scan in which the beacon/probe was received.
 * @chains: bitmask for filled values in @chain_signal.
 * @chain_signal: per-chain signal strength of last received BSS in dBm.
 * @restrict_use: restrict usage, if not set, assume @use_for is
 *        %NL80211_BSS_USE_FOR_NORMAL.
 * @use_for: bitmap of possible usage for this BSS, see
 *        &enum nl80211_bss_use_for
 * @cannot_use_reasons: the reasons (bitmap) for not being able to connect,
 *        if @restrict_use is set and @use_for is zero (empty); may be 0 for
 *        unspecified reasons; see &enum nl80211_bss_cannot_use_reasons
 * @drv_data: Data to be passed through to @inform_bss
 */
struct cfg80211_inform_bss {
        struct ieee80211_channel *chan;
        s32 signal;
        u64 boottime_ns;
        u64 parent_tsf;
        u8 parent_bssid[ETH_ALEN] __aligned(2);
        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];

        /* one flag bit plus a 7-bit usage bitmap packed into a single u8 */
        u8 restrict_use:1, use_for:7;
        u8 cannot_use_reasons;

        void *drv_data;
};

/**
 * struct cfg80211_bss_ies - BSS entry IE data
 * @tsf: TSF contained in the frame that carried these IEs
 * @rcu_head: internal use, for freeing
 * @len: length of the IEs held in @data
 * @from_beacon: these IEs are known to come from a beacon
 * @data: IE data (flexible array, must remain the last member)
 */
struct cfg80211_bss_ies {
        u64 tsf;
        struct rcu_head rcu_head;
        int len;
        bool from_beacon;
        u8 data[];
};

/**
 * struct cfg80211_bss - BSS description
 *
 * This structure describes a BSS (which may also be a mesh network)
 * for use in scan results and similar.
 *
 * @channel: channel this BSS is on
 * @bssid: BSSID of the BSS
 * @beacon_interval: the beacon interval as from the frame
 * @capability: the capability field in host byte order
 * @ies: the information elements (Note that there is no guarantee that these
 *        are well-formed!); this is a pointer to either the beacon_ies or
 *        proberesp_ies depending on whether Probe Response frame has been
 *        received. It is always non-%NULL.
 * @beacon_ies: the information elements from the last Beacon frame
 *        (implementation note: if @hidden_beacon_bss is set this struct doesn't
 *        own the beacon_ies, but they're just pointers to the ones from the
 *        @hidden_beacon_bss struct)
 * @proberesp_ies: the information elements from the last Probe Response frame
 * @proberesp_ecsa_stuck: ECSA element is stuck in the Probe Response frame,
 *        cannot rely on it having valid data
 * @hidden_beacon_bss: in case this BSS struct represents a probe response from
 *        a BSS that hides the SSID in its beacon, this points to the BSS struct
 *        that holds the beacon data. @beacon_ies is still valid, of course, and
 *        points to the same data as hidden_beacon_bss->beacon_ies in that case.
 * @transmitted_bss: pointer to the transmitted BSS, if this is a
 *        non-transmitted one (multi-BSSID support)
 * @nontrans_list: list of non-transmitted BSS, if this is a transmitted one
 *        (multi-BSSID support)
 * @signal: signal strength value (type depends on the wiphy's signal_type)
 * @ts_boottime: timestamp of the last BSS update in nanoseconds since boot
 * @chains: bitmask for filled values in @chain_signal.
 * @chain_signal: per-chain signal strength of last received BSS in dBm.
 * @bssid_index: index in the multiple BSS set
 * @max_bssid_indicator: max number of members in the BSS set
 * @use_for: bitmap of possible usage for this BSS, see
 *        &enum nl80211_bss_use_for
 * @cannot_use_reasons: the reasons (bitmap) for not being able to connect,
 *        if restrict_use (a member of &struct cfg80211_inform_bss, not of
 *        this struct) is set and @use_for is zero (empty); may be 0 for
 *        unspecified reasons; see &enum nl80211_bss_cannot_use_reasons
 * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes
 */
struct cfg80211_bss {
        struct ieee80211_channel *channel;

        const struct cfg80211_bss_ies __rcu *ies;
        const struct cfg80211_bss_ies __rcu *beacon_ies;
        const struct cfg80211_bss_ies __rcu *proberesp_ies;

        struct cfg80211_bss *hidden_beacon_bss;
        struct cfg80211_bss *transmitted_bss;
        struct list_head nontrans_list;

        s32 signal;

        u64 ts_boottime;

        u16 beacon_interval;
        u16 capability;

        u8 bssid[ETH_ALEN];
        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];

        u8 proberesp_ecsa_stuck:1;

        u8 bssid_index;
        u8 max_bssid_indicator;

        u8 use_for;
        u8 cannot_use_reasons;

        /* flexible array, pointer-size aligned; must remain the last member */
        u8 priv[] __aligned(sizeof(void *));
};

/**
 * ieee80211_bss_get_elem - find element with given ID
 * @bss: the bss to search
 * @id: the element ID
 *
 * Note that the return value is an RCU-protected pointer, so
 * rcu_read_lock() must be held when calling this function.
 * Return: the element with the given ID, or %NULL if not found.
 */
const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id);

/**
 * ieee80211_bss_get_ie - look up an IE by element ID, as a raw byte pointer
 * @bss: the bss to search
 * @id: the element ID
 *
 * Thin wrapper around ieee80211_bss_get_elem() that hands back the very same
 * pointer, typed as bytes instead of as a &struct element.
 *
 * Note that the return value is an RCU-protected pointer, so
 * rcu_read_lock() must be held when calling this function.
 * Return: %NULL if not found.
 */
static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id)
{
        const struct element *elem = ieee80211_bss_get_elem(bss, id);

        return (const void *)elem;
}


/**
 * struct cfg80211_auth_request - Authentication request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * authentication.
 *
 * @bss: The BSS to authenticate with; the callee must obtain a reference
 *        to it if it needs to keep it.
 * @supported_selectors: List of selectors that should be assumed to be
 *        supported by the station.
 *        SAE_H2E must be assumed supported if set to %NULL.
 * @supported_selectors_len: Length of @supported_selectors in octets.
 * @auth_type: Authentication type (algorithm)
 * @ie: Extra IEs to add to Authentication frame or %NULL
 * @ie_len: Length of @ie buffer in octets
 * @key_len: length of WEP key for shared key authentication
 * @key_idx: index of WEP key for shared key authentication
 * @key: WEP key for shared key authentication
 * @auth_data: Fields and elements in Authentication frames. This contains
 *        the authentication frame body (non-IE and IE data), excluding the
 *        Authentication algorithm number, i.e., starting at the Authentication
 *        transaction sequence number field.
 * @auth_data_len: Length of @auth_data buffer in octets
 * @link_id: if >= 0, indicates authentication should be done as an MLD,
 *        the interface address is included as the MLD address and the
 *        necessary link (with the given link_id) will be created (and
 *        given an MLD address) by the driver
 * @ap_mld_addr: AP MLD address in case of authentication request with
 *        an AP MLD, valid iff @link_id >= 0
 */
struct cfg80211_auth_request {
        struct cfg80211_bss *bss;
        const u8 *ie;
        size_t ie_len;
        const u8 *supported_selectors;
        u8 supported_selectors_len;
        enum nl80211_auth_type auth_type;
        const u8 *key;
        u8 key_len;
        s8 key_idx;
        const u8 *auth_data;
        size_t auth_data_len;
        s8 link_id;
        const u8 *ap_mld_addr;
};

/**
 * struct cfg80211_assoc_link - per-link information for MLO association
 * @bss: the BSS pointer, see also &struct cfg80211_assoc_request::bss;
 *        if this is %NULL for a link, that link is not requested
 * @elems: extra elements for the per-STA profile for this link
 * @elems_len: length of @elems
 * @disabled: If set, this link should be included during association etc.,
 *        but it should not be used until enabled by the AP MLD.
 * @error: per-link error code, must be <= 0. If there is an error, then the
 *        operation as a whole must fail.
 */
struct cfg80211_assoc_link {
        struct cfg80211_bss *bss;
        const u8 *elems;
        size_t elems_len;
        bool disabled;
        int error;
};

/**
 * struct cfg80211_ml_reconf_req - MLO link reconfiguration request
 * @add_links: data for links to add, see &struct cfg80211_assoc_link;
 *        holds IEEE80211_MLD_MAX_NUM_LINKS entries (presumably indexed by
 *        link ID - confirm against the callers)
 * @rem_links: bitmap of links to remove
 * @ext_mld_capa_ops: extended MLD capabilities and operations set by
 *        userspace for the ML reconfiguration action frame
 */
struct cfg80211_ml_reconf_req {
        struct cfg80211_assoc_link add_links[IEEE80211_MLD_MAX_NUM_LINKS];
        u16 rem_links;
        u16 ext_mld_capa_ops;
};

/**
 * enum cfg80211_assoc_req_flags - Over-ride default behaviour in association.
 *
 * Each value is a distinct single bit, as used in the @flags member of
 * &struct cfg80211_assoc_request.
 *
 * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n)
 * @ASSOC_REQ_DISABLE_VHT: Disable VHT
 * @ASSOC_REQ_USE_RRM: Declare RRM capability in this association
 * @CONNECT_REQ_EXTERNAL_AUTH_SUPPORT: User space indicates external
 *        authentication capability. Drivers can offload authentication to
 *        userspace if this flag is set. Only applicable for cfg80211_connect()
 *        request (connect callback).
 * @ASSOC_REQ_DISABLE_HE: Disable HE
 * @ASSOC_REQ_DISABLE_EHT: Disable EHT
 * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links.
 *        Drivers shall disable MLO features for the current association if this
 *        flag is not set.
 * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any)
 */
enum cfg80211_assoc_req_flags {
        ASSOC_REQ_DISABLE_HT                        = BIT(0),
        ASSOC_REQ_DISABLE_VHT                        = BIT(1),
        ASSOC_REQ_USE_RRM                        = BIT(2),
        CONNECT_REQ_EXTERNAL_AUTH_SUPPORT        = BIT(3),
        ASSOC_REQ_DISABLE_HE                        = BIT(4),
        ASSOC_REQ_DISABLE_EHT                        = BIT(5),
        CONNECT_REQ_MLO_SUPPORT                        = BIT(6),
        ASSOC_REQ_SPP_AMSDU                        = BIT(7),
};

/**
 * struct cfg80211_assoc_request - (Re)Association request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * (re)association.
 * @bss: The BSS to associate with. If the call is successful the driver is
 *        given a reference that it must give back to cfg80211_send_rx_assoc()
 *        or to cfg80211_assoc_timeout(). To ensure proper refcounting, new
 *        association requests while already associating must be rejected.
 *        This also applies to the @links.bss parameter, which is used instead
 *        of this one (it is %NULL) for MLO associations.
 * @ie: Extra IEs to add to (Re)Association Request frame or %NULL
 * @ie_len: Length of @ie buffer in octets
 * @use_mfp: Use management frame protection (IEEE 802.11w) in this association
 * @crypto: crypto settings
 * @prev_bssid: previous BSSID, if not %NULL use reassociate frame. This is used
 *        to indicate a request to reassociate within the ESS instead of a request
 *        to do the initial association with the ESS. When included, this is set
 *        to the BSSID of the current association, i.e., to the value that is
 *        included in the Current AP address field of the Reassociation Request
 *        frame.
 * @flags: See &enum cfg80211_assoc_req_flags
 * @supported_selectors: supported BSS selectors in IEEE 802.11 format
 *        (or %NULL for no change).
 *        If %NULL, then support for SAE_H2E should be assumed.
 * @supported_selectors_len: number of supported BSS selectors
 * @ht_capa: HT Capabilities over-rides.  Values set in ht_capa_mask
 *        will be used in ht_capa.  Un-supported values will be ignored.
 * @ht_capa_mask: The bits of ht_capa which are to be used.
 * @vht_capa: VHT capability override
 * @vht_capa_mask: VHT capability mask indicating which fields to use
 * @fils_kek: FILS KEK for protecting (Re)Association Request/Response frame or
 *        %NULL if FILS is not used.
 * @fils_kek_len: Length of @fils_kek in octets
 * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association
 *        Request/Response frame or %NULL if FILS is not used. This field starts
 *        with 16 octets of STA Nonce followed by 16 octets of AP Nonce.
 * @s1g_capa: S1G capability override
 * @s1g_capa_mask: S1G capability override mask
 * @links: per-link information for MLO connections
 * @link_id: >= 0 for MLO connections, where links are given, and indicates
 *        the link on which the association request should be sent
 * @ap_mld_addr: AP MLD address in case of MLO association request,
 *        valid iff @link_id >= 0
 * @ext_mld_capa_ops: extended MLD capabilities and operations set by
 *        userspace for the association
 */
struct cfg80211_assoc_request {
        struct cfg80211_bss *bss;
        const u8 *ie, *prev_bssid;
        size_t ie_len;
        struct cfg80211_crypto_settings crypto;
        bool use_mfp;
        u32 flags;
        const u8 *supported_selectors;
        u8 supported_selectors_len;
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct ieee80211_vht_cap vht_capa, vht_capa_mask;
        const u8 *fils_kek;
        size_t fils_kek_len;
        const u8 *fils_nonces;
        struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask;
        struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS];
        const u8 *ap_mld_addr;
        s8 link_id;
        u16 ext_mld_capa_ops;
};

/**
 * struct cfg80211_deauth_request - Deauthentication request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * deauthentication.
 *
 * @bssid: the BSSID or AP MLD address to deauthenticate from
 * @ie: Extra IEs to add to Deauthentication frame or %NULL
 * @ie_len: Length of @ie buffer in octets
 * @reason_code: The reason code for the deauthentication
 * @local_state_change: if set, change local state only and
 *        do not send a deauth frame
 */
struct cfg80211_deauth_request {
        const u8 *bssid;
        const u8 *ie;
        size_t ie_len;
        u16 reason_code;
        bool local_state_change;
};

/**
 * struct cfg80211_disassoc_request - Disassociation request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * disassociation.
 *
 * @ap_addr: the BSSID or AP MLD address to disassociate from
 * @ie: Extra IEs to add to Disassociation frame or %NULL
 * @ie_len: Length of @ie buffer in octets
 * @reason_code: The reason code for the disassociation
 * @local_state_change: This is a request for a local state change only, i.e.,
 *        no Disassociation frame is to be transmitted.
 */
struct cfg80211_disassoc_request {
        const u8 *ap_addr;
        const u8 *ie;
        size_t ie_len;
        u16 reason_code;
        bool local_state_change;
};

/**
 * struct cfg80211_ibss_params - IBSS parameters
 *
 * This structure defines the IBSS parameters for the join_ibss()
 * method.
 *
 * @ssid: The SSID, will always be non-null.
 * @ssid_len: The length of the SSID, will always be non-zero.
 * @bssid: Fixed BSSID requested, may be %NULL; if set, do not
 *        search for IBSSs with a different BSSID.
 * @chandef: defines the channel to use if no other IBSS to join can be found
 * @channel_fixed: The channel should be fixed -- do not search for
 *        IBSSs to join on other channels.
 * @ie: information element(s) to include in the beacon
 * @ie_len: length of @ie (stored as a u8)
 * @beacon_interval: beacon interval to use
 * @privacy: this is a protected network, keys will be configured
 *        after joining
 * @control_port: whether user space controls IEEE 802.1X port, i.e.,
 *        sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is
 *        required to assume that the port is unauthorized until authorized by
 *        user space. Otherwise, port is marked authorized by default.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 * @userspace_handles_dfs: whether user space controls DFS operation, i.e.
 *        changes the channel when a radar is detected. This is required
 *        to operate on DFS channels.
 * @basic_rates: bitmap of basic rates to use when creating the IBSS
 * @mcast_rate: per-band multicast rate index + 1 (0: disabled)
 * @ht_capa: HT Capabilities over-rides.  Values set in ht_capa_mask
 *        will be used in ht_capa.  Un-supported values will be ignored.
 * @ht_capa_mask: The bits of ht_capa which are to be used.
 * @wep_keys: static WEP keys, if not NULL points to an array of
 *        CFG80211_MAX_WEP_KEYS WEP keys
 * @wep_tx_key: key index (0..3) of the default TX static WEP key
 */
struct cfg80211_ibss_params {
        const u8 *ssid;
        const u8 *bssid;
        struct cfg80211_chan_def chandef;
        const u8 *ie;
        u8 ssid_len, ie_len;
        u16 beacon_interval;
        u32 basic_rates;
        bool channel_fixed;
        bool privacy;
        bool control_port;
        bool control_port_over_nl80211;
        bool userspace_handles_dfs;
        int mcast_rate[NUM_NL80211_BANDS];
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct key_params *wep_keys;
        int wep_tx_key;
};

/**
 * struct cfg80211_bss_selection - connection parameters for BSS selection.
 *
 * @behaviour: requested BSS selection behaviour.
 * @param: parameters for requestion behaviour.
 * @param.band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF.
 * @param.adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST.
 */
struct cfg80211_bss_selection {
        enum nl80211_bss_select_attr behaviour;
        /*
         * Behaviour-specific parameters: band_pref belongs to
         * NL80211_BSS_SELECT_ATTR_BAND_PREF, adjust belongs to
         * NL80211_BSS_SELECT_ATTR_RSSI_ADJUST.
         */
        union {
                enum nl80211_band band_pref;
                struct cfg80211_bss_select_adjust adjust;
        } param;
};

/**
 * struct cfg80211_connect_params - Connection parameters
 *
 * This structure provides information needed to complete IEEE 802.11
 * authentication and association.
 *
 * @channel: The channel to use or %NULL if not specified (auto-select based
 *        on scan results)
 * @channel_hint: The channel of the recommended BSS for initial connection or
 *        %NULL if not specified
 * @bssid: The AP BSSID or %NULL if not specified (auto-select based on scan
 *        results)
 * @bssid_hint: The recommended AP BSSID for initial connection to the BSS or
 *        %NULL if not specified. Unlike the @bssid parameter, the driver is
 *        allowed to ignore this @bssid_hint if it has knowledge of a better BSS
 *        to use.
 * @ssid: SSID
 * @ssid_len: Length of ssid in octets
 * @auth_type: Authentication type (algorithm)
 * @ie: IEs for association request
 * @ie_len: Length of @ie in octets
 * @privacy: indicates whether privacy-enabled APs should be used
 * @mfp: indicate whether management frame protection is used
 * @crypto: crypto settings
 * @key_len: length of WEP key for shared key authentication
 * @key_idx: index of WEP key for shared key authentication
 * @key: WEP key for shared key authentication
 * @flags:  See &enum cfg80211_assoc_req_flags
 * @bg_scan_period:  Background scan period in seconds
 *        or -1 to indicate that default value is to be used.
 * @ht_capa:  HT Capabilities over-rides.  Values set in ht_capa_mask
 *        will be used in ht_capa.  Un-supported values will be ignored.
 * @ht_capa_mask:  The bits of ht_capa which are to be used.
 * @vht_capa:  VHT Capability overrides
 * @vht_capa_mask: The bits of vht_capa which are to be used.
 * @pbss: if set, connect to a PCP instead of AP. Valid for DMG
 *        networks.
 * @bss_select: criteria to be used for BSS selection.
 * @prev_bssid: previous BSSID, if not %NULL use reassociate frame. This is used
 *        to indicate a request to reassociate within the ESS instead of a request
 *        to do the initial association with the ESS. When included, this is set
 *        to the BSSID of the current association, i.e., to the value that is
 *        included in the Current AP address field of the Reassociation Request
 *        frame.
 * @fils_erp_username: EAP re-authentication protocol (ERP) username part of the
 *        NAI or %NULL if not specified. This is used to construct FILS wrapped
 *        data IE.
 * @fils_erp_username_len: Length of @fils_erp_username in octets.
 * @fils_erp_realm: EAP re-authentication protocol (ERP) realm part of NAI or
 *        %NULL if not specified. This specifies the domain name of ER server and
 *        is used to construct FILS wrapped data IE.
 * @fils_erp_realm_len: Length of @fils_erp_realm in octets.
 * @fils_erp_next_seq_num: The next sequence number to use in the FILS ERP
 *        messages. This is also used to construct FILS wrapped data IE.
 * @fils_erp_rrk: ERP re-authentication Root Key (rRK) used to derive additional
 *        keys in FILS or %NULL if not specified.
 * @fils_erp_rrk_len: Length of @fils_erp_rrk in octets.
 * @want_1x: indicates user-space supports and wants to use 802.1X driver
 *        offload of 4-way handshake.
 * @edmg: define the EDMG channels.
 *        This may specify multiple channels and bonding options for the driver
 *        to choose from, based on BSS configuration.
 */
struct cfg80211_connect_params {
        /* Channel/BSSID: fixed values or hints, NULL when not specified */
        struct ieee80211_channel *channel;
        struct ieee80211_channel *channel_hint;
        const u8 *bssid;
        const u8 *bssid_hint;

        const u8 *ssid;
        size_t ssid_len;
        enum nl80211_auth_type auth_type;

        /* IEs for the association request */
        const u8 *ie;
        size_t ie_len;

        bool privacy;
        enum nl80211_mfp mfp;
        struct cfg80211_crypto_settings crypto;

        /* WEP key for shared key authentication */
        const u8 *key;
        u8 key_len;
        u8 key_idx;

        /* See enum cfg80211_assoc_req_flags */
        u32 flags;
        /* Background scan period in seconds, -1 selects the default */
        int bg_scan_period;

        /* HT and VHT capability overrides, each followed by its mask */
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct ieee80211_vht_cap vht_capa;
        struct ieee80211_vht_cap vht_capa_mask;

        bool pbss;
        struct cfg80211_bss_selection bss_select;
        /* If not NULL, a reassociate frame is used */
        const u8 *prev_bssid;

        /* FILS ERP parameters; pointers are NULL when not specified */
        const u8 *fils_erp_username;
        size_t fils_erp_username_len;
        const u8 *fils_erp_realm;
        size_t fils_erp_realm_len;
        u16 fils_erp_next_seq_num;
        const u8 *fils_erp_rrk;
        size_t fils_erp_rrk_len;

        bool want_1x;
        struct ieee80211_edmg edmg;
};

/**
 * enum cfg80211_connect_params_changed - Connection parameters being updated
 *
 * This enum provides information of all connect parameters that
 * have to be updated as part of update_connect_params() call.
 *
 * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated
 * @UPDATE_FILS_ERP_INFO: Indicates that FILS connection parameters (realm,
 *        username, erp sequence number and rrk) are updated
 * @UPDATE_AUTH_TYPE: Indicates that authentication type is updated
 */
enum cfg80211_connect_params_changed {
        /* Each value is a distinct bit (bits 0..2). */
        UPDATE_ASSOC_IES                = BIT(0),
        UPDATE_FILS_ERP_INFO                = BIT(1),
        UPDATE_AUTH_TYPE                = BIT(2),
};

/**
 * enum wiphy_params_flags - set_wiphy_params bitfield values
 * @WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed
 * @WIPHY_PARAM_RETRY_LONG: wiphy->retry_long has changed
 * @WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed
 * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed
 * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed
 * @WIPHY_PARAM_DYN_ACK: dynack has been enabled
 * @WIPHY_PARAM_TXQ_LIMIT: TXQ packet limit has been changed
 * @WIPHY_PARAM_TXQ_MEMORY_LIMIT: TXQ memory limit has been changed
 * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum
 */
enum wiphy_params_flags {
        /* Each value is a distinct bit (bits 0..8). */
        WIPHY_PARAM_RETRY_SHORT                = BIT(0),
        WIPHY_PARAM_RETRY_LONG                = BIT(1),
        WIPHY_PARAM_FRAG_THRESHOLD        = BIT(2),
        WIPHY_PARAM_RTS_THRESHOLD        = BIT(3),
        WIPHY_PARAM_COVERAGE_CLASS        = BIT(4),
        WIPHY_PARAM_DYN_ACK                = BIT(5),
        WIPHY_PARAM_TXQ_LIMIT                = BIT(6),
        WIPHY_PARAM_TXQ_MEMORY_LIMIT        = BIT(7),
        WIPHY_PARAM_TXQ_QUANTUM                = BIT(8),
};

/* Default airtime weight */
#define IEEE80211_DEFAULT_AIRTIME_WEIGHT 256

/*
 * Default per-TXQ device queue limits, expressed in airtime.
 * NOTE(review): the _L/_H suffixes look like the lower/higher limit - confirm.
 */
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000

/* Per-interface airtime threshold for switching to the lower queue limit */
#define IEEE80211_AQL_THRESHOLD 24000

/**
 * struct cfg80211_pmksa - PMK Security Association
 *
 * This structure is passed to the set/del_pmksa() method for PMKSA
 * caching.
 *
 * @bssid: The AP's BSSID (may be %NULL).
 * @pmkid: The identifier to refer a PMKSA.
 * @pmk: The PMK for the PMKSA identified by @pmkid. This is used for key
 *        derivation by a FILS STA. Otherwise, %NULL.
 * @pmk_len: Length of the @pmk. The length of @pmk can differ depending on
 *        the hash algorithm used to generate this.
 * @ssid: SSID to specify the ESS within which a PMKSA is valid when using FILS
 *        cache identifier (may be %NULL).
 * @ssid_len: Length of the @ssid in octets.
 * @cache_id: 2-octet cache identifier advertized by a FILS AP identifying the
 *        scope of PMKSA. This is valid only if @ssid_len is non-zero (may be
 *        %NULL).
 * @pmk_lifetime: Maximum lifetime for PMKSA in seconds
 *        (dot11RSNAConfigPMKLifetime) or 0 if not specified.
 *        The configured PMKSA must not be used for PMKSA caching after
 *        expiration and any keys derived from this PMK become invalid on
 *        expiration, i.e., the current association must be dropped if the PMK
 *        used for it expires.
 * @pmk_reauth_threshold: Threshold time for reauthentication (percentage of
 *        PMK lifetime, dot11RSNAConfigPMKReauthThreshold) or 0 if not specified.
 *        Drivers are expected to trigger a full authentication instead of using
 *        this PMKSA for caching when reassociating to a new BSS after this
 *        threshold to generate a new PMK before the current one expires.
 */
struct cfg80211_pmksa {
        /* AP's BSSID (may be NULL) and the identifier of the PMKSA. */
        const u8 *bssid;
        const u8 *pmkid;
        /* PMK used for key derivation by a FILS STA; otherwise NULL. */
        const u8 *pmk;
        size_t pmk_len;
        /* FILS scope: ESS SSID (may be NULL) and the AP's cache identifier. */
        const u8 *ssid;
        size_t ssid_len;
        const u8 *cache_id;
        /* Lifetime in seconds / reauth threshold in percent; 0 = unspecified. */
        u32 pmk_lifetime;
        u8 pmk_reauth_threshold;
};

/**
 * struct cfg80211_pkt_pattern - packet pattern
 * @mask: bitmask where to match pattern and where to ignore bytes,
 *        one bit per byte, in same format as nl80211
 * @pattern: bytes to match where bitmask is 1
 * @pattern_len: length of pattern (in bytes)
 * @pkt_offset: packet offset (in bytes)
 *
 * Internal note: @mask and @pattern are allocated in one chunk of
 * memory, free @mask only!
 */
struct cfg80211_pkt_pattern {
        /*
         * @mask and @pattern live in one allocation that starts at @mask,
         * so only @mask may be freed.
         */
        const u8 *mask;
        const u8 *pattern;
        /* Both values are in bytes */
        int pattern_len;
        int pkt_offset;
};

/**
 * struct cfg80211_wowlan_tcp - TCP connection parameters
 *
 * @sock: (internal) socket for source port allocation
 * @src: source IP address
 * @dst: destination IP address
 * @dst_mac: destination MAC address
 * @src_port: source port
 * @dst_port: destination port
 * @payload_len: data payload length
 * @payload: data payload buffer
 * @payload_seq: payload sequence stamping configuration
 * @data_interval: interval at which to send data packets
 * @wake_len: wakeup payload match length
 * @wake_data: wakeup payload match data
 * @wake_mask: wakeup payload match mask
 * @tokens_size: length of the tokens buffer
 * @payload_tok: payload token usage configuration
 */
struct cfg80211_wowlan_tcp {
        /* (internal) socket for source port allocation */
        struct socket *sock;

        /* Connection endpoints */
        __be32 src;
        __be32 dst;
        u16 src_port;
        u16 dst_port;
        u8 dst_mac[ETH_ALEN];

        /* Data payload and how it is sent */
        int payload_len;
        const u8 *payload;
        struct nl80211_wowlan_tcp_data_seq payload_seq;
        u32 data_interval;

        /* Wakeup payload match: length, data and mask */
        u32 wake_len;
        const u8 *wake_data;
        const u8 *wake_mask;

        u32 tokens_size;
        /* must be last, variable member */
        struct nl80211_wowlan_tcp_data_token payload_tok;
};

/**
 * struct cfg80211_wowlan - Wake on Wireless-LAN support info
 *
 * This structure defines the enabled WoWLAN triggers for the device.
 * @any: wake up on any activity -- special trigger if device continues
 *        operating as normal during suspend
 * @disconnect: wake up if getting disconnected
 * @magic_pkt: wake up on receiving magic packet
 * @patterns: wake up on receiving packet matching a pattern
 * @n_patterns: number of patterns
 * @gtk_rekey_failure: wake up on GTK rekey failure
 * @eap_identity_req: wake up on EAP identity request packet
 * @four_way_handshake: wake up on 4-way handshake
 * @rfkill_release: wake up when rfkill is released
 * @tcp: TCP connection establishment/wakeup parameters, see nl80211.h.
 *        NULL if not configured.
 * @nd_config: configuration for the scan to be used for net detect wake.
 */
struct cfg80211_wowlan {
        /* Enabled wakeup triggers */
        bool any;
        bool disconnect;
        bool magic_pkt;
        bool gtk_rekey_failure;
        bool eap_identity_req;
        bool four_way_handshake;
        bool rfkill_release;

        /* Packet patterns that trigger a wakeup; count is in n_patterns */
        struct cfg80211_pkt_pattern *patterns;
        /* TCP connection/wakeup parameters, NULL if not configured */
        struct cfg80211_wowlan_tcp *tcp;
        int n_patterns;
        /* Scan configuration used for net detect wake */
        struct cfg80211_sched_scan_request *nd_config;
};

/**
 * struct cfg80211_coalesce_rules - Coalesce rule parameters
 *
 * This structure defines coalesce rule for the device.
 * @delay: maximum coalescing delay in msecs.
 * @condition: condition for packet coalescence.
 *        see &enum nl80211_coalesce_condition.
 * @patterns: array of packet patterns
 * @n_patterns: number of patterns
 */
struct cfg80211_coalesce_rules {
        /* Maximum coalescing delay, in msecs. */
        int delay;
        /* See enum nl80211_coalesce_condition. */
        enum nl80211_coalesce_condition condition;
        /* Array of packet patterns; its length is stored in @n_patterns. */
        struct cfg80211_pkt_pattern *patterns;
        int n_patterns;
};

/**
 * struct cfg80211_coalesce - Packet coalescing settings
 *
 * This structure defines coalescing settings.
 * @rules: array of coalesce rules
 * @n_rules: number of rules
 */
struct cfg80211_coalesce {
        int n_rules;
        /* Flexible array member; annotated as counted by @n_rules. */
        struct cfg80211_coalesce_rules rules[] __counted_by(n_rules);
};

/**
 * struct cfg80211_wowlan_nd_match - information about the match
 *
 * @ssid: SSID of the match that triggered the wake up
 * @n_channels: Number of channels where the match occurred.  This
 *        value may be zero if the driver can't report the channels.
 * @channels: center frequencies of the channels where a match
 *        occurred (in MHz)
 */
struct cfg80211_wowlan_nd_match {
        struct cfg80211_ssid ssid;
        /* May be zero if the driver can't report the channels. */
        int n_channels;
        /* Flexible array of center frequencies (MHz), counted by @n_channels. */
        u32 channels[] __counted_by(n_channels);
};

/**
 * struct cfg80211_wowlan_nd_info - net detect wake up information
 *
 * @n_matches: Number of match information instances provided in
 *        @matches.  This value may be zero if the driver can't provide
 *        match information.
 * @matches: Array of pointers to matches containing information about
 *        the matches that triggered the wake up.
 */
struct cfg80211_wowlan_nd_info {
        /* May be zero if the driver can't provide match information. */
        int n_matches;
        /* Flexible array of pointers to matches, counted by @n_matches. */
        struct cfg80211_wowlan_nd_match *matches[] __counted_by(n_matches);
};

/**
 * struct cfg80211_wowlan_wakeup - wakeup report
 * @disconnect: woke up by getting disconnected
 * @magic_pkt: woke up by receiving magic packet
 * @gtk_rekey_failure: woke up by GTK rekey failure
 * @eap_identity_req: woke up by EAP identity request packet
 * @four_way_handshake: woke up by 4-way handshake
 * @rfkill_release: woke up by rfkill being released
 * @pattern_idx: pattern that caused wakeup, -1 if not due to pattern
 * @packet_present_len: length of the copied wakeup packet data
 * @packet_len: original wakeup packet length
 * @packet: The packet causing the wakeup, if any.
 * @packet_80211:  For pattern match, magic packet and other data
 *        frame triggers an 802.3 frame should be reported, for
 *        disconnect due to deauth 802.11 frame. This indicates which
 *        it is.
 * @tcp_match: TCP wakeup packet received
 * @tcp_connlost: TCP connection lost or failed to establish
 * @tcp_nomoretokens: TCP data ran out of tokens
 * @net_detect: if not %NULL, woke up because of net detect
 * @unprot_deauth_disassoc: woke up due to unprotected deauth or
 *        disassoc frame (in MFP).
 */
struct cfg80211_wowlan_wakeup {
        /* Wakeup reasons */
        bool disconnect;
        bool magic_pkt;
        bool gtk_rekey_failure;
        bool eap_identity_req;
        bool four_way_handshake;
        bool rfkill_release;
        /* Tells whether @packet is reported as an 802.3 or an 802.11 frame */
        bool packet_80211;
        /* TCP wakeup events */
        bool tcp_match;
        bool tcp_connlost;
        bool tcp_nomoretokens;
        bool unprot_deauth_disassoc;

        /* Pattern that caused the wakeup, -1 if not due to a pattern */
        s32 pattern_idx;
        u32 packet_present_len;
        u32 packet_len;
        /* The packet causing the wakeup, if any */
        const void *packet;
        /* Not NULL when the wakeup was caused by net detect */
        struct cfg80211_wowlan_nd_info *net_detect;
};

/**
 * struct cfg80211_gtk_rekey_data - rekey data
 * @kek: key encryption key (@kek_len bytes)
 * @kck: key confirmation key (@kck_len bytes)
 * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes)
 * @kek_len: length of kek
 * @kck_len: length of kck
 * @akm: akm (oui, id)
 */
struct cfg80211_gtk_rekey_data {
        /* Key encryption key, @kek_len bytes */
        const u8 *kek;
        /* Key confirmation key, @kck_len bytes */
        const u8 *kck;
        /* Replay counter, NL80211_REPLAY_CTR_LEN bytes */
        const u8 *replay_ctr;
        /* AKM (oui, id) */
        u32 akm;
        u8 kek_len;
        u8 kck_len;
};

/**
 * struct cfg80211_update_ft_ies_params - FT IE Information
 *
 * This structure provides information needed to update the fast transition IE
 *
 * @md: The Mobility Domain ID, 2 Octet value
 * @ie: Fast Transition IEs
 * @ie_len: Length of @ie in octets
 */
struct cfg80211_update_ft_ies_params {
        /* Mobility Domain ID (2-octet value). */
        u16 md;
        /* Fast Transition IEs and their length in octets. */
        const u8 *ie;
        size_t ie_len;
};

/**
 * struct cfg80211_mgmt_tx_params - mgmt tx parameters
 *
 * This structure provides information needed to transmit a mgmt frame
 *
 * @chan: channel to use
 * @offchan: indicates whether off channel operation is required
 * @wait: duration for ROC
 * @buf: buffer to transmit
 * @len: buffer length
 * @no_cck: don't use cck rates for this frame
 * @dont_wait_for_ack: tells the low level not to wait for an ack
 * @n_csa_offsets: length of csa_offsets array
 * @csa_offsets: array of all the csa offsets in the frame
 * @link_id: for MLO, the link ID to transmit on, -1 if not given; note
 *        that the link ID isn't validated (much), it's in range but the
 *        link might not exist (or be used by the receiver STA)
 */
struct cfg80211_mgmt_tx_params {
        struct ieee80211_channel *chan;
        /* Whether off channel operation is required. */
        bool offchan;
        /* Duration for ROC. */
        unsigned int wait;
        /* Frame to transmit: buffer and its length. */
        const u8 *buf;
        size_t len;
        bool no_cck;
        bool dont_wait_for_ack;
        /* @n_csa_offsets is the length of the @csa_offsets array. */
        int n_csa_offsets;
        const u16 *csa_offsets;
        /* MLO link ID to transmit on, -1 if not given. */
        int link_id;
};

/**
 * struct cfg80211_dscp_exception - DSCP exception
 *
 * @dscp: DSCP value that does not adhere to the user priority range definition
 * @up: user priority value to which the corresponding DSCP value belongs
 */
struct cfg80211_dscp_exception {
        /* DSCP value that does not adhere to the user priority range. */
        u8 dscp;
        /* User priority this DSCP value belongs to. */
        u8 up;
};

/**
 * struct cfg80211_dscp_range - DSCP range definition for user priority
 *
 * @low: lowest DSCP value of this user priority range, inclusive
 * @high: highest DSCP value of this user priority range, inclusive
 */
struct cfg80211_dscp_range {
        /* Both bounds of the range are inclusive. */
        u8 low;
        u8 high;
};

/*
 * QoS Map Set element length limits, as defined in
 * IEEE Std 802.11-2012, 8.4.2.97
 */
#define IEEE80211_QOS_MAP_MAX_EX 21
#define IEEE80211_QOS_MAP_LEN_MIN 16
/* Minimum length plus 2 for each of the IEEE80211_QOS_MAP_MAX_EX exceptions */
#define IEEE80211_QOS_MAP_LEN_MAX \
        (IEEE80211_QOS_MAP_LEN_MIN + (2 * IEEE80211_QOS_MAP_MAX_EX))

/**
 * struct cfg80211_qos_map - QoS Map Information
 *
 * This struct defines the Interworking QoS map setting for DSCP values
 *
 * @num_des: number of DSCP exceptions (0..21)
 * @dscp_exception: optionally up to maximum of 21 DSCP exceptions from
 *        the user priority DSCP range definition
 * @up: DSCP range definition for a particular user priority
 */
struct cfg80211_qos_map {
        /* Number of DSCP exceptions (0..21). */
        u8 num_des;
        /* Sized for the maximum of IEEE80211_QOS_MAP_MAX_EX exceptions. */
        struct cfg80211_dscp_exception dscp_exception[IEEE80211_QOS_MAP_MAX_EX];
        /* DSCP range definition per user priority; the array has 8 entries. */
        struct cfg80211_dscp_range up[8];
};

/**
 * struct cfg80211_nan_band_config - NAN band specific configuration
 *
 * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used
 *        for NAN operations on this band. For 2.4 GHz band, this is always
 *        channel 6. For 5 GHz band, the channel is either 44 or 149, according
 *        to the regulatory constraints. If chan pointer is NULL the entire band
 *        configuration entry is considered invalid and should not be used.
 * @rssi_close: RSSI close threshold used for NAN state transition algorithm
 *        as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State
 *        Transition" of Wi-Fi Aware Specification v4.0. If not
 *        specified (set to 0), default device value is used. The value should
 *        be greater than -60 dBm.
 * @rssi_middle: RSSI middle threshold used for NAN state transition algorithm
 *        as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State
 *        Transition" of Wi-Fi Aware Specification v4.0. If not
 *        specified (set to 0), default device value is used. The value should be
 *        greater than -75 dBm and less than rssi_close.
 * @awake_dw_interval: Committed DW interval. Valid values range: 0-5. 0
 *        indicates no wakeup for DW and can't be used on 2.4GHz band, otherwise
 *        2^(n-1).
 * @disable_scan: If true, the device will not scan this band for cluster
 *         merge. Disabling scan on 2.4 GHz band is not allowed.
 */
struct cfg80211_nan_band_config {
        /* A NULL @chan marks the whole band configuration entry as invalid. */
        struct ieee80211_channel *chan;
        /* RSSI thresholds; 0 means the default device value is used. */
        s8 rssi_close;
        s8 rssi_middle;
        /* Committed DW interval, valid values 0-5. */
        u8 awake_dw_interval;
        /* If true, this band is not scanned for cluster merge. */
        bool disable_scan;
};

/**
 * struct cfg80211_nan_conf - NAN configuration
 *
 * This struct defines NAN configuration parameters
 *
 * @master_pref: master preference (1 - 255)
 * @bands: operating bands, a bitmap of &enum nl80211_band values.
 *        For instance, for NL80211_BAND_2GHZ, bit 0 would be set
 *        (i.e. BIT(NL80211_BAND_2GHZ)).
 * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address
 *        that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF.
 *        If NULL, the device will pick a random Cluster ID.
 * @scan_period: period (in seconds) between NAN scans.
 * @scan_dwell_time: dwell time (in milliseconds) for NAN scans.
 * @discovery_beacon_interval: interval (in TUs) for discovery beacons.
 * @enable_dw_notification: flag to enable/disable discovery window
 *        notifications.
 * @band_cfgs: array of band specific configurations, indexed by
 *        &enum nl80211_band values.
 * @extra_nan_attrs: pointer to additional NAN attributes.
 * @extra_nan_attrs_len: length of the additional NAN attributes.
 * @vendor_elems: pointer to vendor-specific elements.
 * @vendor_elems_len: length of the vendor-specific elements.
 */
struct cfg80211_nan_conf {
        u8 master_pref;
        /* Bitmap of enum nl80211_band values, e.g. BIT(NL80211_BAND_2GHZ). */
        u8 bands;
        /* If NULL, the device will pick a random Cluster ID. */
        const u8 *cluster_id;
        /* @scan_period is in seconds, @scan_dwell_time in milliseconds. */
        u16 scan_period;
        u16 scan_dwell_time;
        /* Interval in TUs. */
        u8 discovery_beacon_interval;
        bool enable_dw_notification;
        /* Indexed by enum nl80211_band values. */
        struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS];
        /* Each pointer below is followed by the length of its data. */
        const u8 *extra_nan_attrs;
        u16 extra_nan_attrs_len;
        const u8 *vendor_elems;
        u16 vendor_elems_len;
};

/**
 * enum cfg80211_nan_conf_changes - indicates changed fields in NAN
 * configuration
 *
 * @CFG80211_NAN_CONF_CHANGED_PREF: master preference
 * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands
 * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration.
 *        When this flag is set, it indicates that some additional attribute(s)
 *        (other than master_pref and bands) have been changed. In this case,
 *        all the unchanged attributes will be properly configured to their
 *        previous values. The driver doesn't need to store any
 *        previous configuration besides master_pref and bands.
 */
enum cfg80211_nan_conf_changes {
        /* Each value is a distinct bit (bits 0..2). */
        CFG80211_NAN_CONF_CHANGED_PREF = BIT(0),
        CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1),
        CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2),
};

/**
 * struct cfg80211_nan_func_filter - a NAN function Rx / Tx filter
 *
 * @filter: the content of the filter
 * @len: the length of the filter
 */
struct cfg80211_nan_func_filter {
        /* Content of the filter; @len holds its length. */
        const u8 *filter;
        u8 len;
};

/**
 * struct cfg80211_nan_func - a NAN function
 *
 * @type: &enum nl80211_nan_function_type
 * @service_id: the service ID of the function
 * @publish_type: &enum nl80211_nan_publish_type
 * @close_range: if true, the range should be limited. Threshold is
 *        implementation specific.
 * @publish_bcast: if true, the solicited publish should be broadcast
 * @subscribe_active: if true, the subscribe is active
 * @followup_id: the instance ID for follow up
 * @followup_reqid: the requester instance ID for follow up
 * @followup_dest: MAC address of the recipient of the follow up
 * @ttl: time to live counter in DW.
 * @serv_spec_info: Service Specific Info
 * @serv_spec_info_len: Service Specific Info length
 * @srf_include: if true, SRF is inclusive
 * @srf_bf: Bloom Filter
 * @srf_bf_len: Bloom Filter length
 * @srf_bf_idx: Bloom Filter index
 * @srf_macs: SRF MAC addresses
 * @srf_num_macs: number of MAC addresses in SRF
 * @rx_filters: rx filters that are matched with corresponding peer's tx_filter
 * @tx_filters: filters that should be transmitted in the SDF.
 * @num_rx_filters: length of &rx_filters.
 * @num_tx_filters: length of &tx_filters.
 * @instance_id: driver allocated id of the function.
 * @cookie: unique NAN function identifier.
 */
struct cfg80211_nan_func {
        enum nl80211_nan_function_type type;
        u8 service_id[NL80211_NAN_FUNC_SERVICE_ID_LEN];
        u8 publish_type;
        bool close_range;
        bool publish_bcast;
        bool subscribe_active;
        /* Follow up: instance ID, requester instance ID, recipient MAC. */
        u8 followup_id;
        u8 followup_reqid;
        struct mac_address followup_dest;
        /* Time to live counter, in DW. */
        u32 ttl;
        /* Service Specific Info and its length. */
        const u8 *serv_spec_info;
        u8 serv_spec_info_len;
        /* SRF: inclusive flag, Bloom Filter (data, length, index), MACs. */
        bool srf_include;
        const u8 *srf_bf;
        u8 srf_bf_len;
        u8 srf_bf_idx;
        struct mac_address *srf_macs;
        int srf_num_macs;
        /* Filter arrays; note the counts are declared Tx first, then Rx. */
        struct cfg80211_nan_func_filter *rx_filters;
        struct cfg80211_nan_func_filter *tx_filters;
        u8 num_tx_filters;
        u8 num_rx_filters;
        /* Driver allocated id of the function. */
        u8 instance_id;
        /* Unique NAN function identifier. */
        u64 cookie;
};

/**
 * struct cfg80211_pmk_conf - PMK configuration
 *
 * @aa: authenticator address
 * @pmk_len: PMK length in bytes.
 * @pmk: the PMK material
 * @pmk_r0_name: PMK-R0 Name. NULL if not applicable (i.e., the PMK
 *        is not PMK-R0). When pmk_r0_name is not NULL, the pmk field
 *        holds PMK-R0.
 */
struct cfg80211_pmk_conf {
        /* Authenticator address. */
        const u8 *aa;
        /* PMK length in bytes, followed by the PMK material itself. */
        u8 pmk_len;
        const u8 *pmk;
        /* NULL if not applicable; when not NULL, @pmk holds PMK-R0. */
        const u8 *pmk_r0_name;
};

/**
 * struct cfg80211_external_auth_params - Trigger External authentication.
 *
 * Commonly used across the external auth request and event interfaces.
 *
 * @action: action type / trigger for external authentication. Only significant
 *        for the authentication request event interface (driver to user space).
 * @bssid: BSSID of the peer with which the authentication has
 *        to happen. Used by both the authentication request event and
 *        authentication response command interface.
 * @ssid: SSID of the AP.  Used by both the authentication request event and
 *        authentication response command interface.
 * @key_mgmt_suite: AKM suite of the respective authentication. Used by the
 *        authentication request event interface.
 * @status: status code, %WLAN_STATUS_SUCCESS for successful authentication,
 *        use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space cannot give you
 *        the real status code for failures. Used only for the authentication
 *        response command interface (user space to driver).
 * @pmkid: The identifier to refer a PMKSA.
 * @mld_addr: MLD address of the peer. Used by the authentication request event
 *        interface. Driver indicates this to enable MLO during the authentication
 *        offload to user space. Driver shall look at %NL80211_ATTR_MLO_SUPPORT
 *        flag capability in NL80211_CMD_CONNECT to know whether the user space
 *        supports enabling MLO during the authentication offload.
 *        User space should use the address of the interface (on which the
 *        authentication request event reported) as self MLD address. User space
 *        and driver should use MLD addresses in RA, TA and BSSID fields of
 *        authentication frames sent or received via cfg80211. The driver
 *        translates the MLD addresses to/from link addresses based on the link
 *        chosen for the authentication.
 */
struct cfg80211_external_auth_params {
        /* Only significant for the request event (driver to user space). */
        enum nl80211_external_auth_action action;
        /* Both address arrays in this struct are declared __aligned(2). */
        u8 bssid[ETH_ALEN] __aligned(2);
        struct cfg80211_ssid ssid;
        /* AKM suite; used by the authentication request event interface. */
        unsigned int key_mgmt_suite;
        /* Used only for the response command (user space to driver). */
        u16 status;
        const u8 *pmkid;
        /* MLD address of the peer; used by the request event interface. */
        u8 mld_addr[ETH_ALEN] __aligned(2);
};

/**
 * struct cfg80211_ftm_responder_stats - FTM responder statistics
 *
 * @filled: bitflag of flags using the bits of &enum nl80211_ftm_stats to
 *        indicate the relevant values in this struct for them
 * @success_num: number of FTM sessions in which all frames were successfully
 *        answered
 * @partial_num: number of FTM sessions in which part of frames were
 *        successfully answered
 * @failed_num: number of failed FTM sessions
 * @asap_num: number of ASAP FTM sessions
 * @non_asap_num: number of non-ASAP FTM sessions
 * @total_duration_ms: total sessions durations - gives an indication
 *        of how much time the responder was busy
 * @unknown_triggers_num: number of unknown FTM triggers - triggers from
 *        initiators that didn't finish successfully the negotiation phase with
 *        the responder
 * @reschedule_requests_num: number of FTM reschedule requests - initiator asks
 *        for a new scheduling although it already has scheduled FTM slot
 * @out_of_window_triggers_num: total FTM triggers out of scheduled window
 */
struct cfg80211_ftm_responder_stats {
        /* Flags (bits of enum nl80211_ftm_stats) marking the relevant values. */
        u32 filled;
        /* FTM session counters. */
        u32 success_num;
        u32 partial_num;
        u32 failed_num;
        u32 asap_num;
        u32 non_asap_num;
        /* Total duration of the sessions; the only u64 member here. */
        u64 total_duration_ms;
        /* Trigger and reschedule request counters. */
        u32 unknown_triggers_num;
        u32 reschedule_requests_num;
        u32 out_of_window_triggers_num;
};

/**
 * struct cfg80211_pmsr_ftm_result - FTM result
 * @failure_reason: if this measurement failed (PMSR status is
 *        %NL80211_PMSR_STATUS_FAILURE), this gives a more precise
 *        reason than just "failure"
 * @burst_index: if reporting partial results, this is the index
 *        in [0 .. num_bursts-1] of the burst that's being reported
 * @num_ftmr_attempts: number of FTM request frames transmitted
 * @num_ftmr_successes: number of FTM request frames acked
 * @busy_retry_time: if failure_reason is %NL80211_PMSR_FTM_FAILURE_PEER_BUSY,
 *        fill this to indicate in how many seconds a retry is deemed possible
 *        by the responder
 * @num_bursts_exp: actual number of bursts exponent negotiated
 * @burst_duration: actual burst duration negotiated
 * @ftms_per_burst: actual FTMs per burst negotiated
 * @lci_len: length of LCI information (if present)
 * @civicloc_len: length of civic location information (if present)
 * @lci: LCI data (may be %NULL)
 * @civicloc: civic location data (may be %NULL)
 * @rssi_avg: average RSSI over FTM action frames reported
 * @rssi_spread: spread of the RSSI over FTM action frames reported
 * @tx_rate: bitrate for transmitted FTM action frame response
 * @rx_rate: bitrate of received FTM action frame
 * @rtt_avg: average of RTTs measured (must have either this or @dist_avg)
 * @rtt_variance: variance of RTTs measured (note that standard deviation is
 *        the square root of the variance)
 * @rtt_spread: spread of the RTTs measured
 * @dist_avg: average of distances (mm) measured
 *        (must have either this or @rtt_avg)
 * @dist_variance: variance of distances measured (see also @rtt_variance)
 * @dist_spread: spread of distances measured (see also @rtt_spread)
 * @num_ftmr_attempts_valid: @num_ftmr_attempts is valid
 * @num_ftmr_successes_valid: @num_ftmr_successes is valid
 * @rssi_avg_valid: @rssi_avg is valid
 * @rssi_spread_valid: @rssi_spread is valid
 * @tx_rate_valid: @tx_rate is valid
 * @rx_rate_valid: @rx_rate is valid
 * @rtt_avg_valid: @rtt_avg is valid
 * @rtt_variance_valid: @rtt_variance is valid
 * @rtt_spread_valid: @rtt_spread is valid
 * @dist_avg_valid: @dist_avg is valid
 * @dist_variance_valid: @dist_variance is valid
 * @dist_spread_valid: @dist_spread is valid
 */
struct cfg80211_pmsr_ftm_result {
        /* LCI / civic location data (may be NULL) and their lengths */
        const u8 *lci;
        const u8 *civicloc;
        unsigned int lci_len;
        unsigned int civicloc_len;

        /* More precise reason when the measurement failed */
        enum nl80211_peer_measurement_ftm_failure_reasons failure_reason;
        /* FTM request frames transmitted / acked */
        u32 num_ftmr_attempts;
        u32 num_ftmr_successes;
        s16 burst_index;
        u8 busy_retry_time;

        /* Values actually negotiated */
        u8 num_bursts_exp;
        u8 burst_duration;
        u8 ftms_per_burst;

        s32 rssi_avg;
        s32 rssi_spread;
        struct rate_info tx_rate;
        struct rate_info rx_rate;

        /* RTT and distance statistics; one of the two averages is required */
        s64 rtt_avg;
        s64 rtt_variance;
        s64 rtt_spread;
        s64 dist_avg;
        s64 dist_variance;
        s64 dist_spread;

        /* Validity bits for the correspondingly named values above */
        u16 num_ftmr_attempts_valid:1,
            num_ftmr_successes_valid:1,
            rssi_avg_valid:1,
            rssi_spread_valid:1,
            tx_rate_valid:1,
            rx_rate_valid:1,
            rtt_avg_valid:1,
            rtt_variance_valid:1,
            rtt_spread_valid:1,
            dist_avg_valid:1,
            dist_variance_valid:1,
            dist_spread_valid:1;
};

/**
 * struct cfg80211_pmsr_result - peer measurement result
 * @addr: address of the peer
 * @host_time: host time (use ktime_get_boottime(), adjusted to the time when
 *        the measurement was made)
 * @ap_tsf: AP's TSF at measurement time
 * @status: status of the measurement
 * @final: if reporting partial results, mark this as the last one; if not
 *        reporting partial results always set this flag
 * @ap_tsf_valid: indicates the @ap_tsf value is valid
 * @type: type of the measurement reported, note that we only support reporting
 *        one type at a time, but you can report multiple results separately and
 *        they're all aggregated for userspace.
 * @ftm: FTM result
 */
struct cfg80211_pmsr_result {
        u64 host_time, ap_tsf;
        enum nl80211_peer_measurement_status status;

        u8 addr[ETH_ALEN];

        u8 final:1,
           ap_tsf_valid:1;

        enum nl80211_peer_measurement_type type;

        union {
                struct cfg80211_pmsr_ftm_result ftm;
        };
};

/**
 * struct cfg80211_pmsr_ftm_request_peer - FTM request data
 * @preamble: frame preamble to use
 * @burst_period: burst period to use
 * @requested: indicates FTM is requested
 * @asap: indicates to use ASAP mode
 * @request_lci: request LCI information
 * @request_civicloc: request civic location information
 * @trigger_based: use trigger based ranging for the measurement.
 *        EDCA based ranging is used when neither @trigger_based nor
 *        @non_trigger_based is set.
 * @non_trigger_based: use non trigger based ranging for the measurement.
 *        EDCA based ranging is used when neither @trigger_based nor
 *        @non_trigger_based is set.
 * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either
 *        @trigger_based or @non_trigger_based is set.
 * @num_bursts_exp: number of bursts exponent
 * @burst_duration: burst duration
 * @ftms_per_burst: number of FTMs per burst
 * @ftmr_retries: number of retries for FTM request
 * @bss_color: the BSS color of the responder. Optional. Set to zero to
 *        indicate the driver should set the BSS color. Only valid if
 *        @non_trigger_based or @trigger_based is set.
 *
 * See also nl80211 for the respective attribute documentation.
 */
struct cfg80211_pmsr_ftm_request_peer {
        enum nl80211_preamble preamble;
        u16 burst_period;
        u8 requested:1;
        u8 asap:1;
        u8 request_lci:1;
        u8 request_civicloc:1;
        u8 trigger_based:1;
        u8 non_trigger_based:1;
        u8 lmr_feedback:1;
        u8 num_bursts_exp;
        u8 burst_duration;
        u8 ftms_per_burst;
        u8 ftmr_retries;
        u8 bss_color;
};

/**
 * struct cfg80211_pmsr_request_peer - peer data for a peer measurement request
 * @addr: MAC address (%ETH_ALEN bytes)
 * @chandef: channel to use
 * @report_ap_tsf: report the associated AP's TSF (single-bit flag)
 * @ftm: FTM data, see &struct cfg80211_pmsr_ftm_request_peer
 */
struct cfg80211_pmsr_request_peer {
        u8 addr[ETH_ALEN];
        struct cfg80211_chan_def chandef;
        u8 report_ap_tsf:1;
        struct cfg80211_pmsr_ftm_request_peer ftm;
};

/**
 * struct cfg80211_pmsr_request - peer measurement request
 * @cookie: cookie, set by cfg80211
 * @nl_portid: netlink portid - used by cfg80211
 * @drv_data: driver data for this request, if required for aborting;
 *        cfg80211 does not free it or do anything else with it
 * @mac_addr: MAC address used for (randomised) request
 * @mac_addr_mask: MAC address mask used for randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @list: used by cfg80211 to hold on to the request
 * @timeout: timeout (in milliseconds) for the whole operation, if
 *        zero it means there's no timeout
 * @n_peers: number of peers to do measurements with; this is also the
 *        element count of the @peers flexible array
 * @peers: per-peer measurement request data (flexible array member,
 *        bounded by @n_peers)
 */
struct cfg80211_pmsr_request {
        u64 cookie;
        void *drv_data;
        u32 n_peers;
        u32 nl_portid;

        u32 timeout;

        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);

        struct list_head list;

        struct cfg80211_pmsr_request_peer peers[] __counted_by(n_peers);
};

/**
 * struct cfg80211_update_owe_info - OWE Information
 *
 * This structure provides the information needed by drivers to offload OWE
 * (Opportunistic Wireless Encryption) processing to user space.
 *
 * Commonly used across update_owe_info request and event interfaces.
 *
 * @peer: MAC address of the peer device for which the OWE processing
 *        has to be done.
 * @status: status code, %WLAN_STATUS_SUCCESS for successful OWE info
 *        processing, use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space
 *        cannot give you the real status code for failures. Used only for
 *        OWE update request command interface (user space to driver).
 * @ie: IEs obtained from the peer or constructed by user space. These are
 *        the IEs of the remote peer in the event from the host driver and
 *        the IEs constructed by user space in the request interface.
 * @ie_len: Length of IEs in octets.
 * @assoc_link_id: MLO link ID of the AP with which (re)association is
 *        requested by the peer. This will be filled by the driver for both
 *        MLO and non-MLO station connections when the AP is affiliated with
 *        an MLD. For non-MLD AP mode, it will be -1. Used only with OWE
 *        update event (driver to user space).
 * @peer_mld_addr: For MLO connection, MLD address of the peer. For non-MLO
 *        connection, it will be all zeros. This is applicable only when
 *        @assoc_link_id is not -1, i.e., the AP is affiliated with an MLD.
 *        Used only with OWE update event (driver to user space).
 */
struct cfg80211_update_owe_info {
        u8 peer[ETH_ALEN] __aligned(2);
        u16 status;
        const u8 *ie;
        size_t ie_len;
        int assoc_link_id;
        u8 peer_mld_addr[ETH_ALEN] __aligned(2);
};

/**
 * struct mgmt_frame_regs - management frame registrations data
 * @global_stypes: bitmap of the management frame subtypes that are
 *        registered for the entire device
 * @interface_stypes: bitmap of the management frame subtypes that are
 *        registered for the given interface
 * @global_mcast_stypes: subtypes for which mcast RX is needed globally
 * @interface_mcast_stypes: subtypes for which mcast RX is needed on
 *        this interface
 */
struct mgmt_frame_regs {
        u32 global_stypes;
        u32 interface_stypes;
        u32 global_mcast_stypes;
        u32 interface_mcast_stypes;
};

/**
 * struct cfg80211_ops - backend description for wireless configuration
 *
 * This struct is registered by fullmac card drivers and/or wireless stacks
 * in order to handle configuration requests on their interfaces.
 *
 * All callbacks except where otherwise noted should return 0
 * on success or a negative error code.
 *
 * All operations are invoked with the wiphy mutex held. The RTNL may be
 * held in addition (due to wireless extensions) but this cannot be relied
 * upon except in cases where documented below. Note that due to ordering,
 * the RTNL also cannot be acquired in any handlers.
 *
 * @suspend: wiphy device needs to be suspended. The variable @wow will
 *        be %NULL or contain the enabled Wake-on-Wireless triggers that are
 *        configured for the device.
 * @resume: wiphy device needs to be resumed
 * @set_wakeup: Called when WoWLAN is enabled/disabled, use this callback
 *        to call device_set_wakeup_enable() to enable/disable wakeup from
 *        the device.
 *
 * @add_virtual_intf: create a new virtual interface with the given name,
 *        must set the struct wireless_dev's iftype. Beware: You must create
 *        the new netdev in the wiphy's network namespace! Returns the struct
 *        wireless_dev, or an ERR_PTR. For P2P device wdevs, the driver must
 *        also set the address member in the wdev.
 *        This additionally holds the RTNL to be able to do netdev changes.
 *
 * @del_virtual_intf: remove the virtual interface
 *        This additionally holds the RTNL to be able to do netdev changes.
 *
 * @change_virtual_intf: change type/configuration of virtual interface,
 *        keep the struct wireless_dev's iftype updated.
 *        This additionally holds the RTNL to be able to do netdev changes.
 *
 * @add_intf_link: Add a new MLO link to the given interface. Note that
 *        the wdev->link[] data structure has been updated, so the new link
 *        address is available.
 * @del_intf_link: Remove an MLO link from the given interface.
 *
 * @add_key: add a key with the given parameters. @mac_addr will be %NULL
 *        when adding a group key. @link_id will be -1 for non-MLO connection.
 *        For MLO connection, @link_id will be >= 0 for group key and -1 for
 *        pairwise key, @mac_addr will be peer's MLD address for MLO pairwise key.
 *
 * @get_key: get information about the key with the given parameters.
 *        @mac_addr will be %NULL when requesting information for a group
 *        key. All pointers given to the @callback function need not be valid
 *        after it returns. This function should return an error if it is
 *        not possible to retrieve the key, -ENOENT if it doesn't exist.
 *        @link_id will be -1 for non-MLO connection. For MLO connection,
 *        @link_id will be >= 0 for group key and -1 for pairwise key, @mac_addr
 *        will be peer's MLD address for MLO pairwise key.
 *
 * @del_key: remove a key given the @mac_addr (%NULL for a group key)
 *        and @key_index, return -ENOENT if the key doesn't exist. @link_id will
 *        be -1 for non-MLO connection. For MLO connection, @link_id will be >= 0
 *        for group key and -1 for pairwise key, @mac_addr will be peer's MLD
 *        address for MLO pairwise key.
 *
 * @set_default_key: set the default key on an interface. @link_id will be >= 0
 *        for MLO connection and -1 for non-MLO connection.
 *
 * @set_default_mgmt_key: set the default management frame key on an interface.
 *        @link_id will be >= 0 for MLO connection and -1 for non-MLO connection.
 *
 * @set_default_beacon_key: set the default Beacon frame key on an interface.
 *        @link_id will be >= 0 for MLO connection and -1 for non-MLO connection.
 *
 * @set_rekey_data: give the data necessary for GTK rekeying to the driver
 *
 * @start_ap: Start acting in AP mode defined by the parameters.
 * @change_beacon: Change the beacon parameters for an access point mode
 *        interface. This should reject the call when AP mode wasn't started.
 * @stop_ap: Stop being an AP, including stopping beaconing.
 *
 * @add_station: Add a new station.
 * @del_station: Remove a station
 * @change_station: Modify a given station. Note that flags changes are not much
 *        validated in cfg80211, in particular the auth/assoc/authorized flags
 *        might come to the driver in invalid combinations -- make sure to check
 *        them, also against the existing state! Drivers must call
 *        cfg80211_check_station_change() to validate the information.
 * @get_station: get station information for the station identified by @mac
 * @dump_station: dump station callback -- resume dump at index @idx
 *
 * @add_mpath: add a fixed mesh path
 * @del_mpath: delete a given mesh path
 * @change_mpath: change a given mesh path
 * @get_mpath: get a mesh path for the given parameters
 * @dump_mpath: dump mesh path callback -- resume dump at index @idx
 * @get_mpp: get a mesh proxy path for the given parameters
 * @dump_mpp: dump mesh proxy path callback -- resume dump at index @idx
 * @join_mesh: join the mesh network with the specified parameters
 *        (invoked with the wireless_dev mutex held)
 * @leave_mesh: leave the current mesh network
 *        (invoked with the wireless_dev mutex held)
 *
 * @get_mesh_config: Get the current mesh configuration
 *
 * @update_mesh_config: Update mesh parameters on a running mesh.
 *        The mask is a bitfield which tells us which parameters to
 *        set, and which to leave alone.
 *
 * @change_bss: Modify parameters for a given BSS.
 *
 * @inform_bss: Called by cfg80211 while being informed about new BSS data
 *        for every BSS found within the reported data or frame. This is called
 *        from within the cfg80211 inform_bss handlers while holding the bss_lock.
 *        The data parameter is passed through from drv_data inside
 *        struct cfg80211_inform_bss.
 *        The new IE data for the BSS is explicitly passed.
 *
 * @set_txq_params: Set TX queue parameters
 *
 * @libertas_set_mesh_channel: Only for backward compatibility for libertas,
 *        as it doesn't implement join_mesh and needs to set the channel to
 *        join the mesh instead.
 *
 * @set_monitor_channel: Set the monitor mode channel for the device. If other
 *        interfaces are active this callback should reject the configuration.
 *        If no interfaces are active or the device is down, the channel should
 *        be stored for when a monitor interface becomes active.
 *
 * @scan: Request to do a scan. If returning zero, the scan request is given
 *        the driver, and will be valid until passed to cfg80211_scan_done().
 *        For scan results, call cfg80211_inform_bss(); you can call this outside
 *        the scan/scan_done bracket too.
 * @abort_scan: Tell the driver to abort an ongoing scan. The driver shall
 *        indicate the status of the scan through cfg80211_scan_done().
 *
 * @auth: Request to authenticate with the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @assoc: Request to (re)associate with the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @deauth: Request to deauthenticate from the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @disassoc: Request to disassociate from the specified peer
 *        (invoked with the wireless_dev mutex held)
 *
 * @connect: Connect to the ESS with the specified parameters. When connected,
 *        call cfg80211_connect_result()/cfg80211_connect_bss() with status code
 *        %WLAN_STATUS_SUCCESS. If the connection fails for some reason, call
 *        cfg80211_connect_result()/cfg80211_connect_bss() with the status code
 *        from the AP or cfg80211_connect_timeout() if no frame with status code
 *        was received.
 *        The driver is allowed to roam to other BSSes within the ESS when the
 *        other BSS matches the connect parameters. When such roaming is initiated
 *        by the driver, the driver is expected to verify that the target matches
 *        the configured security parameters and to use Reassociation Request
 *        frame instead of Association Request frame.
 *        The connect function can also be used to request the driver to perform a
 *        specific roam when connected to an ESS. In that case, the prev_bssid
 *        parameter is set to the BSSID of the currently associated BSS as an
 *        indication of requesting reassociation.
 *        In both the driver-initiated and new connect() call initiated roaming
 *        cases, the result of roaming is indicated with a call to
 *        cfg80211_roamed(). (invoked with the wireless_dev mutex held)
 * @update_connect_params: Update the connect parameters while connected to a
 *        BSS. The updated parameters can be used by driver/firmware for
 *        subsequent BSS selection (roaming) decisions and to form the
 *        Authentication/(Re)Association Request frames. This call does not
 *        request an immediate disassociation or reassociation with the current
 *        BSS, i.e., this impacts only subsequent (re)associations. The bits in
 *        changed are defined in &enum cfg80211_connect_params_changed.
 *        (invoked with the wireless_dev mutex held)
 * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if
 *      connection is in progress. Once done, call cfg80211_disconnected() in
 *      case connection was already established (invoked with the
 *      wireless_dev mutex held), otherwise call cfg80211_connect_timeout().
 *
 * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call
 *        cfg80211_ibss_joined(), also call that function when changing BSSID due
 *        to a merge.
 *        (invoked with the wireless_dev mutex held)
 * @leave_ibss: Leave the IBSS.
 *        (invoked with the wireless_dev mutex held)
 *
 * @set_mcast_rate: Set the specified multicast rate (only if vif is in ADHOC or
 *        MESH mode)
 *
 * @set_wiphy_params: Notify that wiphy parameters have changed;
 *        @changed bitfield (see &enum wiphy_params_flags) describes which values
 *        have changed. The actual parameter values are available in
 *        struct wiphy. If returning an error, no value should be changed.
 *
 * @set_tx_power: set the transmit power according to the parameters,
 *        the power passed is in mBm, to get dBm use MBM_TO_DBM(). The
 *        wdev may be %NULL if power was set for the wiphy, and will
 *        always be %NULL unless the driver supports per-vif TX power
 *        (as advertised by the nl80211 feature flag.)
 * @get_tx_power: store the current TX power into the dbm variable;
 *        return 0 if successful
 *
 * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting
 *        functions to adjust rfkill hw state
 *
 * @dump_survey: get site survey information.
 *
 * @remain_on_channel: Request the driver to remain awake on the specified
 *        channel for the specified duration to complete an off-channel
 *        operation (e.g., public action frame exchange). When the driver is
 *        ready on the requested channel, it must indicate this with an event
 *        notification by calling cfg80211_ready_on_channel().
 * @cancel_remain_on_channel: Cancel an on-going remain-on-channel operation.
 *        This allows the operation to be terminated prior to timeout based on
 *        the duration value.
 * @mgmt_tx: Transmit a management frame.
 * @mgmt_tx_cancel_wait: Cancel the wait time from transmitting a management
 *        frame on another channel
 *
 * @testmode_cmd: run a test mode command; @wdev may be %NULL
 * @testmode_dump: Implement a test mode dump. The cb->args[2] and up may be
 *        used by the function, but 0 and 1 must not be touched. Additionally,
 *        return error codes other than -ENOBUFS and -ENOENT will terminate the
 *        dump and return to userspace with an error, so be careful. If any data
 *        was passed in from userspace then the data/len arguments will be present
 *        and point to the data contained in %NL80211_ATTR_TESTDATA.
 *
 * @set_bitrate_mask: set the bitrate mask configuration
 *
 * @set_pmksa: Cache a PMKID for a BSSID. This is mostly useful for fullmac
 *        devices running firmwares capable of generating the (re) association
 *        RSN IE. It allows for faster roaming between WPA2 BSSIDs.
 * @del_pmksa: Delete a cached PMKID.
 * @flush_pmksa: Flush all cached PMKIDs.
 * @set_power_mgmt: Configure WLAN power management. A timeout value of -1
 *        allows the driver to adjust the dynamic ps timeout value.
 * @set_cqm_rssi_config: Configure connection quality monitor RSSI threshold.
 *        After configuration, the driver should (soon) send an event indicating
 *        the current level is above/below the configured threshold; this may
 *        need some care when the configuration is changed (without first being
 *        disabled.)
 * @set_cqm_rssi_range_config: Configure two RSSI thresholds in the
 *        connection quality monitor.  An event is to be sent only when the
 *        signal level is found to be outside the two values.  The driver should
 *        set %NL80211_EXT_FEATURE_CQM_RSSI_LIST if this method is implemented.
 *        If it is provided then there's no point providing @set_cqm_rssi_config.
 * @set_cqm_txe_config: Configure connection quality monitor TX error
 *        thresholds.
 * @sched_scan_start: Tell the driver to start a scheduled scan.
 * @sched_scan_stop: Tell the driver to stop an ongoing scheduled scan with
 *        given request id. This call must stop the scheduled scan and be ready
 *        for starting a new one before it returns, i.e. @sched_scan_start may be
 *        called immediately after that again and should not fail in that case.
 *        The driver should not call cfg80211_sched_scan_stopped() for a requested
 *        stop (when this method returns 0).
 *
 * @update_mgmt_frame_registrations: Notify the driver that management frame
 *        registrations were updated. The callback is allowed to sleep.
 *
 * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device.
 *        Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may
 *        reject TX/RX mask combinations they cannot support by returning -EINVAL
 *        (also see nl80211.h @NL80211_ATTR_WIPHY_ANTENNA_TX).
 *
 * @get_antenna: Get current antenna configuration from device (tx_ant, rx_ant).
 *
 * @tdls_mgmt: Transmit a TDLS management frame.
 * @tdls_oper: Perform a high-level TDLS operation (e.g. TDLS link setup).
 *
 * @probe_client: probe an associated client, must return a cookie that it
 *        later passes to cfg80211_probe_status().
 *
 * @set_noack_map: Set the NoAck Map for the TIDs.
 *
 * @get_channel: Get the current operating channel for the virtual interface.
 *        For monitor interfaces, it should return %NULL unless there's a single
 *        current monitoring channel.
 *
 * @start_p2p_device: Start the given P2P device.
 * @stop_p2p_device: Stop the given P2P device.
 *
 * @set_mac_acl: Sets MAC address control list in AP and P2P GO mode.
 *        Parameters include ACL policy, an array of MAC address of stations
 *        and the number of MAC addresses. If there is already a list in driver
 *        this new list replaces the existing one. Driver has to clear its ACL
 *        when number of MAC addresses entries is passed as 0. Drivers which
 *        advertise the support for MAC based ACL have to implement this callback.
 *
 * @start_radar_detection: Start radar detection in the driver.
 *
 * @end_cac: End running CAC, probably because a related CAC
 *        was finished on another phy.
 *
 * @update_ft_ies: Provide updated Fast BSS Transition information to the
 *        driver. If the SME is in the driver/firmware, this information can be
 *        used in building Authentication and Reassociation Request frames.
 *
 * @crit_proto_start: Indicates a critical protocol needs more link reliability
 *        for a given duration (milliseconds). The protocol is provided so the
 *        driver can take the most appropriate actions.
 * @crit_proto_stop: Indicates critical protocol no longer needs increased link
 *        reliability. This operation can not fail.
 * @set_coalesce: Set coalesce parameters.
 *
 * @channel_switch: initiate channel-switch procedure (with CSA). Driver is
 *        responsible for verifying if the switch is possible. Since this is
 *        inherently tricky driver may decide to disconnect an interface later
 *        with cfg80211_stop_iface(). This doesn't mean driver can accept
 *        everything. It should do its best to verify requests and reject them
 *        as soon as possible.
 *
 * @set_qos_map: Set QoS mapping information to the driver
 *
 * @set_ap_chanwidth: Set the AP (including P2P GO) mode channel width for the
 *        given interface. This is used e.g. for dynamic HT 20/40 MHz channel
 *        width changes during the lifetime of the BSS.
 *
 * @add_tx_ts: validate (if admitted_time is 0) or add a TX TS to the device
 *        with the given parameters; action frame exchange has been handled by
 *        userspace so this just has to modify the TX path to take the TS into
 *        account.
 *        If the admitted time is 0 just validate the parameters to make sure
 *        the session can be created at all; it is valid to just always return
 *        success for that but that may result in inefficient behaviour (handshake
 *        with the peer followed by immediate teardown when the addition is later
 *        rejected)
 * @del_tx_ts: remove an existing TX TS
 *
 * @join_ocb: join the OCB network with the specified parameters
 *        (invoked with the wireless_dev mutex held)
 * @leave_ocb: leave the current OCB network
 *        (invoked with the wireless_dev mutex held)
 *
 * @tdls_channel_switch: Start channel-switching with a TDLS peer. The driver
 *        is responsible for continually initiating channel-switching operations
 *        and returning to the base channel for communication with the AP.
 * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both
 *        peers must be on the base channel when the call completes.
 * @start_nan: Start the NAN interface.
 * @stop_nan: Stop the NAN interface.
 * @add_nan_func: Add a NAN function. Returns negative value on failure.
 *        On success @nan_func ownership is transferred to the driver and
 *        it may access it outside of the scope of this function. The driver
 *        should free the @nan_func when no longer needed by calling
 *        cfg80211_free_nan_func().
 *        On success the driver should assign an instance_id in the
 *        provided @nan_func.
 * @del_nan_func: Delete a NAN function.
 * @nan_change_conf: changes NAN configuration. The changed parameters must
 *        be specified in @changes (using &enum cfg80211_nan_conf_changes);
 *        All other parameters must be ignored.
 *
 * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS
 *
 * @get_txq_stats: Get TXQ stats for interface or phy. If wdev is %NULL, this
 *      function should return phy stats, and interface stats otherwise.
 *
 * @set_pmk: configure the PMK to be used for offloaded 802.1X 4-Way handshake.
 *        If not deleted through @del_pmk the PMK remains valid until disconnect
 *        upon which the driver should clear it.
 *        (invoked with the wireless_dev mutex held)
 * @del_pmk: delete the previously configured PMK for the given authenticator.
 *        (invoked with the wireless_dev mutex held)
 *
 * @external_auth: indicates result of offloaded authentication processing from
 *     user space
 *
 * @tx_control_port: TX a control port frame (EAPoL).  The noencrypt parameter
 *        tells the driver that the frame should not be encrypted.
 *
 * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available.
 *        Statistics should be cumulative, currently no way to reset is provided.
 * @start_pmsr: start peer measurement (e.g. FTM)
 * @abort_pmsr: abort peer measurement
 *
 * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME
 *        but offloading OWE processing to the user space will get the updated
 *        DH IE through this interface.
 *
 * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame
 *        and overrule HWMP path selection algorithm.
 * @set_tid_config: TID specific configuration, this can be peer or BSS specific
 *        This callback may sleep.
 * @reset_tid_config: Reset TID specific configuration for the peer, for the
 *        given TIDs. This callback may sleep.
 *
 * @set_sar_specs: Update the SAR (TX power) settings.
 *
 * @color_change: Initiate a color change.
 *
 * @set_fils_aad: Set FILS AAD data to the AP driver so that the driver can use
 *        those to decrypt (Re)Association Request and encrypt (Re)Association
 *        Response frame.
 *
 * @set_radar_background: Configure dedicated offchannel chain available for
 *        radar/CAC detection on some hw. This chain can't be used to transmit
 *        or receive frames and it is bound to a running wdev.
 *        Background radar/CAC detection allows to avoid the CAC downtime
 *        switching to a different channel during CAC detection on the selected
 *        radar channel.
 *        The caller is expected to set chandef pointer to NULL in order to
 *        disable background CAC/radar detection.
 * @add_link_station: Add a link to a station.
 * @mod_link_station: Modify a link of a station.
 * @del_link_station: Remove a link of a station.
 *
 * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames.
 * @set_ttlm: set the TID to link mapping.
 * @set_epcs: Enable/Disable EPCS for station mode.
 * @get_radio_mask: get bitmask of radios in use.
 *        (invoked with the wiphy mutex held)
 * @assoc_ml_reconf: Request a non-AP MLO connection to perform ML
 *        reconfiguration, i.e., add and/or remove links to/from the
 *        association using ML reconfiguration action frames. Successfully added
 *        links will be added to the set of valid links. Successfully removed
 *        links will be removed from the set of valid links. The driver must
 *        indicate removed links by calling cfg80211_links_removed() and added
 *        links by calling cfg80211_mlo_reconf_add_done(). When calling
 *        cfg80211_mlo_reconf_add_done() the bss pointer must be given for each
 *        link for which MLO reconfiguration 'add' operation was requested.
 */
struct cfg80211_ops {
        /* Suspend/resume (suspend takes a WoWLAN configuration) and wakeup enablement. */
        int        (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
        int        (*resume)(struct wiphy *wiphy);
        void        (*set_wakeup)(struct wiphy *wiphy, bool enabled);

        /* Virtual interfaces: add, delete, change interface type. */
        struct wireless_dev * (*add_virtual_intf)(struct wiphy *wiphy,
                                                  const char *name,
                                                  unsigned char name_assign_type,
                                                  enum nl80211_iftype type,
                                                  struct vif_params *params);
        int        (*del_virtual_intf)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev);
        int        (*change_virtual_intf)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       enum nl80211_iftype type,
                                       struct vif_params *params);

        /* Per-interface links, addressed by link_id. */
        int        (*add_intf_link)(struct wiphy *wiphy,
                                 struct wireless_dev *wdev,
                                 unsigned int link_id);
        void        (*del_intf_link)(struct wiphy *wiphy,
                                 struct wireless_dev *wdev,
                                 unsigned int link_id);

        /* Key add/get/delete and default-key selection. */
        int        (*add_key)(struct wiphy *wiphy, struct net_device *netdev,
                           int link_id, u8 key_index, bool pairwise,
                           const u8 *mac_addr, struct key_params *params);
        int        (*get_key)(struct wiphy *wiphy, struct net_device *netdev,
                           int link_id, u8 key_index, bool pairwise,
                           const u8 *mac_addr, void *cookie,
                           void (*callback)(void *cookie, struct key_params*));
        int        (*del_key)(struct wiphy *wiphy, struct net_device *netdev,
                           int link_id, u8 key_index, bool pairwise,
                           const u8 *mac_addr);
        int        (*set_default_key)(struct wiphy *wiphy,
                                   struct net_device *netdev, int link_id,
                                   u8 key_index, bool unicast, bool multicast);
        int        (*set_default_mgmt_key)(struct wiphy *wiphy,
                                        struct net_device *netdev, int link_id,
                                        u8 key_index);
        int        (*set_default_beacon_key)(struct wiphy *wiphy,
                                          struct net_device *netdev,
                                          int link_id,
                                          u8 key_index);

        /* AP operation: start, beacon change, stop. */
        int        (*start_ap)(struct wiphy *wiphy, struct net_device *dev,
                            struct cfg80211_ap_settings *settings);
        int        (*change_beacon)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_ap_update *info);
        int        (*stop_ap)(struct wiphy *wiphy, struct net_device *dev,
                           unsigned int link_id);


        /* Station entries: add, delete, change, get by MAC, dump by index. */
        int        (*add_station)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *mac,
                               struct station_parameters *params);
        int        (*del_station)(struct wiphy *wiphy, struct net_device *dev,
                               struct station_del_parameters *params);
        int        (*change_station)(struct wiphy *wiphy, struct net_device *dev,
                                  const u8 *mac,
                                  struct station_parameters *params);
        int        (*get_station)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *mac, struct station_info *sinfo);
        int        (*dump_station)(struct wiphy *wiphy, struct net_device *dev,
                                int idx, u8 *mac, struct station_info *sinfo);

        /* Mesh path (mpath/mpp) entries, mesh configuration, mesh join/leave. */
        int        (*add_mpath)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *dst, const u8 *next_hop);
        int        (*del_mpath)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *dst);
        int        (*change_mpath)(struct wiphy *wiphy, struct net_device *dev,
                                  const u8 *dst, const u8 *next_hop);
        int        (*get_mpath)(struct wiphy *wiphy, struct net_device *dev,
                             u8 *dst, u8 *next_hop, struct mpath_info *pinfo);
        int        (*dump_mpath)(struct wiphy *wiphy, struct net_device *dev,
                              int idx, u8 *dst, u8 *next_hop,
                              struct mpath_info *pinfo);
        int        (*get_mpp)(struct wiphy *wiphy, struct net_device *dev,
                           u8 *dst, u8 *mpp, struct mpath_info *pinfo);
        int        (*dump_mpp)(struct wiphy *wiphy, struct net_device *dev,
                            int idx, u8 *dst, u8 *mpp,
                            struct mpath_info *pinfo);
        int        (*get_mesh_config)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct mesh_config *conf);
        int        (*update_mesh_config)(struct wiphy *wiphy,
                                      struct net_device *dev, u32 mask,
                                      const struct mesh_config *nconf);
        int        (*join_mesh)(struct wiphy *wiphy, struct net_device *dev,
                             const struct mesh_config *conf,
                             const struct mesh_setup *setup);
        int        (*leave_mesh)(struct wiphy *wiphy, struct net_device *dev);

        /* OCB join/leave. */
        int        (*join_ocb)(struct wiphy *wiphy, struct net_device *dev,
                            struct ocb_setup *setup);
        int        (*leave_ocb)(struct wiphy *wiphy, struct net_device *dev);

        /* BSS parameters and BSS information hook. */
        int        (*change_bss)(struct wiphy *wiphy, struct net_device *dev,
                              struct bss_parameters *params);

        void        (*inform_bss)(struct wiphy *wiphy, struct cfg80211_bss *bss,
                              const struct cfg80211_bss_ies *ies, void *data);

        int        (*set_txq_params)(struct wiphy *wiphy, struct net_device *dev,
                                  struct ieee80211_txq_params *params);

        /* Channel selection for specific interface modes. */
        int        (*libertas_set_mesh_channel)(struct wiphy *wiphy,
                                             struct net_device *dev,
                                             struct ieee80211_channel *chan);

        int        (*set_monitor_channel)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       struct cfg80211_chan_def *chandef);

        /* Scan request and scan abort. */
        int        (*scan)(struct wiphy *wiphy,
                        struct cfg80211_scan_request *request);
        void        (*abort_scan)(struct wiphy *wiphy, struct wireless_dev *wdev);

        /* Individual auth/assoc/deauth/disassoc requests. */
        int        (*auth)(struct wiphy *wiphy, struct net_device *dev,
                        struct cfg80211_auth_request *req);
        int        (*assoc)(struct wiphy *wiphy, struct net_device *dev,
                         struct cfg80211_assoc_request *req);
        int        (*deauth)(struct wiphy *wiphy, struct net_device *dev,
                          struct cfg80211_deauth_request *req);
        int        (*disassoc)(struct wiphy *wiphy, struct net_device *dev,
                            struct cfg80211_disassoc_request *req);

        /* Combined connect/disconnect requests. */
        int        (*connect)(struct wiphy *wiphy, struct net_device *dev,
                           struct cfg80211_connect_params *sme);
        int        (*update_connect_params)(struct wiphy *wiphy,
                                         struct net_device *dev,
                                         struct cfg80211_connect_params *sme,
                                         u32 changed);
        int        (*disconnect)(struct wiphy *wiphy, struct net_device *dev,
                              u16 reason_code);

        /* IBSS join/leave. */
        int        (*join_ibss)(struct wiphy *wiphy, struct net_device *dev,
                             struct cfg80211_ibss_params *params);
        int        (*leave_ibss)(struct wiphy *wiphy, struct net_device *dev);

        /* rate[] carries one entry per band (NUM_NL80211_BANDS). */
        int        (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev,
                                  int rate[NUM_NL80211_BANDS]);

        /* wiphy-wide parameters and TX power, qualified by radio_idx. */
        int        (*set_wiphy_params)(struct wiphy *wiphy, int radio_idx,
                                    u32 changed);

        int        (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                int radio_idx,
                                enum nl80211_tx_power_setting type, int mbm);
        int        (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                int radio_idx, unsigned int link_id, int *dbm);

        void        (*rfkill_poll)(struct wiphy *wiphy);

        /* Testmode hooks exist in the table only with CONFIG_NL80211_TESTMODE. */
#ifdef CONFIG_NL80211_TESTMODE
        int        (*testmode_cmd)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                void *data, int len);
        int        (*testmode_dump)(struct wiphy *wiphy, struct sk_buff *skb,
                                 struct netlink_callback *cb,
                                 void *data, int len);
#endif

        int        (*set_bitrate_mask)(struct wiphy *wiphy,
                                    struct net_device *dev,
                                    unsigned int link_id,
                                    const u8 *peer,
                                    const struct cfg80211_bitrate_mask *mask);

        int        (*dump_survey)(struct wiphy *wiphy, struct net_device *netdev,
                        int idx, struct survey_info *info);

        /* PMKSA entries: set, delete, flush. */
        int        (*set_pmksa)(struct wiphy *wiphy, struct net_device *netdev,
                             struct cfg80211_pmksa *pmksa);
        int        (*del_pmksa)(struct wiphy *wiphy, struct net_device *netdev,
                             struct cfg80211_pmksa *pmksa);
        int        (*flush_pmksa)(struct wiphy *wiphy, struct net_device *netdev);

        /* Remain-on-channel; the request writes a cookie that cancel takes back. */
        int        (*remain_on_channel)(struct wiphy *wiphy,
                                     struct wireless_dev *wdev,
                                     struct ieee80211_channel *chan,
                                     unsigned int duration,
                                     u64 *cookie);
        int        (*cancel_remain_on_channel)(struct wiphy *wiphy,
                                            struct wireless_dev *wdev,
                                            u64 cookie);

        /* Management frame TX; mgmt_tx writes a cookie that the cancel op takes back. */
        int        (*mgmt_tx)(struct wiphy *wiphy, struct wireless_dev *wdev,
                           struct cfg80211_mgmt_tx_params *params,
                           u64 *cookie);
        int        (*mgmt_tx_cancel_wait)(struct wiphy *wiphy,
                                       struct wireless_dev *wdev,
                                       u64 cookie);

        int        (*set_power_mgmt)(struct wiphy *wiphy, struct net_device *dev,
                                  bool enabled, int timeout);

        /* CQM configuration: RSSI threshold/hysteresis, RSSI range, TXE. */
        int        (*set_cqm_rssi_config)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       s32 rssi_thold, u32 rssi_hyst);

        int        (*set_cqm_rssi_range_config)(struct wiphy *wiphy,
                                             struct net_device *dev,
                                             s32 rssi_low, s32 rssi_high);

        int        (*set_cqm_txe_config)(struct wiphy *wiphy,
                                      struct net_device *dev,
                                      u32 rate, u32 pkts, u32 intvl);

        void        (*update_mgmt_frame_registrations)(struct wiphy *wiphy,
                                                   struct wireless_dev *wdev,
                                                   struct mgmt_frame_regs *upd);

        /* TX/RX antenna settings, qualified by radio_idx. */
        int        (*set_antenna)(struct wiphy *wiphy, int radio_idx,
                               u32 tx_ant, u32 rx_ant);
        int        (*get_antenna)(struct wiphy *wiphy, int radio_idx,
                               u32 *tx_ant, u32 *rx_ant);

        /* Scheduled scan start/stop; stop is keyed by reqid. */
        int        (*sched_scan_start)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct cfg80211_sched_scan_request *request);
        int        (*sched_scan_stop)(struct wiphy *wiphy, struct net_device *dev,
                                   u64 reqid);

        int        (*set_rekey_data)(struct wiphy *wiphy, struct net_device *dev,
                                  struct cfg80211_gtk_rekey_data *data);

        /* TDLS management frames and TDLS operations. */
        int        (*tdls_mgmt)(struct wiphy *wiphy, struct net_device *dev,
                             const u8 *peer, int link_id,
                             u8 action_code, u8 dialog_token, u16 status_code,
                             u32 peer_capability, bool initiator,
                             const u8 *buf, size_t len);
        int        (*tdls_oper)(struct wiphy *wiphy, struct net_device *dev,
                             const u8 *peer, enum nl80211_tdls_operation oper);

        int        (*probe_client)(struct wiphy *wiphy, struct net_device *dev,
                                const u8 *peer, u64 *cookie);

        int        (*set_noack_map)(struct wiphy *wiphy,
                                  struct net_device *dev,
                                  u16 noack_map);

        int        (*get_channel)(struct wiphy *wiphy,
                               struct wireless_dev *wdev,
                               unsigned int link_id,
                               struct cfg80211_chan_def *chandef);

        /* P2P device start/stop. */
        int        (*start_p2p_device)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev);
        void        (*stop_p2p_device)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev);

        int        (*set_mac_acl)(struct wiphy *wiphy, struct net_device *dev,
                               const struct cfg80211_acl_data *params);

        /* Radar detection / CAC, FT IEs, critical protocol window, coalesce. */
        int        (*start_radar_detection)(struct wiphy *wiphy,
                                         struct net_device *dev,
                                         struct cfg80211_chan_def *chandef,
                                         u32 cac_time_ms, int link_id);
        void        (*end_cac)(struct wiphy *wiphy,
                           struct net_device *dev, unsigned int link_id);
        int        (*update_ft_ies)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_update_ft_ies_params *ftie);
        int        (*crit_proto_start)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev,
                                    enum nl80211_crit_proto_id protocol,
                                    u16 duration);
        void        (*crit_proto_stop)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev);
        int        (*set_coalesce)(struct wiphy *wiphy,
                                struct cfg80211_coalesce *coalesce);

        int        (*channel_switch)(struct wiphy *wiphy,
                                  struct net_device *dev,
                                  struct cfg80211_csa_settings *params);

        int     (*set_qos_map)(struct wiphy *wiphy,
                               struct net_device *dev,
                               struct cfg80211_qos_map *qos_map);

        int        (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev,
                                    unsigned int link_id,
                                    struct cfg80211_chan_def *chandef);

        /* Traffic streams, identified by tsid and peer. */
        int        (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev,
                             u8 tsid, const u8 *peer, u8 user_prio,
                             u16 admitted_time);
        int        (*del_tx_ts)(struct wiphy *wiphy, struct net_device *dev,
                             u8 tsid, const u8 *peer);

        /* TDLS channel switch, then NAN start/stop, functions and config. */
        int        (*tdls_channel_switch)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       const u8 *addr, u8 oper_class,
                                       struct cfg80211_chan_def *chandef);
        void        (*tdls_cancel_channel_switch)(struct wiphy *wiphy,
                                              struct net_device *dev,
                                              const u8 *addr);
        int        (*start_nan)(struct wiphy *wiphy, struct wireless_dev *wdev,
                             struct cfg80211_nan_conf *conf);
        void        (*stop_nan)(struct wiphy *wiphy, struct wireless_dev *wdev);
        int        (*add_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                struct cfg80211_nan_func *nan_func);
        void        (*del_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
                               u64 cookie);
        int        (*nan_change_conf)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev,
                                   struct cfg80211_nan_conf *conf,
                                   u32 changes);

        int        (*set_multicast_to_unicast)(struct wiphy *wiphy,
                                            struct net_device *dev,
                                            const bool enabled);

        int        (*get_txq_stats)(struct wiphy *wiphy,
                                 struct wireless_dev *wdev,
                                 struct cfg80211_txq_stats *txqstats);

        /* PMK set/delete and external authentication. */
        int        (*set_pmk)(struct wiphy *wiphy, struct net_device *dev,
                           const struct cfg80211_pmk_conf *conf);
        int        (*del_pmk)(struct wiphy *wiphy, struct net_device *dev,
                           const u8 *aa);
        int     (*external_auth)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_external_auth_params *params);

        int        (*tx_control_port)(struct wiphy *wiphy,
                                   struct net_device *dev,
                                   const u8 *buf, size_t len,
                                   const u8 *dest, const __be16 proto,
                                   const bool noencrypt, int link_id,
                                   u64 *cookie);

        int        (*get_ftm_responder_stats)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct cfg80211_ftm_responder_stats *ftm_stats);

        /* Peer measurement (PMSR) start/abort and the remaining ops. */
        int        (*start_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
                              struct cfg80211_pmsr_request *request);
        void        (*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
                              struct cfg80211_pmsr_request *request);
        int        (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev,
                                   struct cfg80211_update_owe_info *owe_info);
        int        (*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev,
                                   const u8 *buf, size_t len);
        int     (*set_tid_config)(struct wiphy *wiphy, struct net_device *dev,
                                  struct cfg80211_tid_config *tid_conf);
        /* Per the struct documentation, reset_tid_config may sleep. */
        int        (*reset_tid_config)(struct wiphy *wiphy, struct net_device *dev,
                                    const u8 *peer, u8 tids);
        int        (*set_sar_specs)(struct wiphy *wiphy,
                                 struct cfg80211_sar_specs *sar);
        int        (*color_change)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct cfg80211_color_change_settings *params);
        int     (*set_fils_aad)(struct wiphy *wiphy, struct net_device *dev,
                                struct cfg80211_fils_aad *fils_aad);
        /* A NULL chandef disables background radar/CAC detection. */
        int        (*set_radar_background)(struct wiphy *wiphy,
                                        struct cfg80211_chan_def *chandef);
        /* Station links: add, modify, remove. */
        int        (*add_link_station)(struct wiphy *wiphy, struct net_device *dev,
                                    struct link_station_parameters *params);
        int        (*mod_link_station)(struct wiphy *wiphy, struct net_device *dev,
                                    struct link_station_parameters *params);
        int        (*del_link_station)(struct wiphy *wiphy, struct net_device *dev,
                                    struct link_station_del_parameters *params);
        int        (*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev,
                                    struct cfg80211_set_hw_timestamp *hwts);
        int        (*set_ttlm)(struct wiphy *wiphy, struct net_device *dev,
                            struct cfg80211_ttlm_params *params);
        /* Returns a bitmask of radios in use; invoked with the wiphy mutex held. */
        u32        (*get_radio_mask)(struct wiphy *wiphy, struct net_device *dev);
        int     (*assoc_ml_reconf)(struct wiphy *wiphy, struct net_device *dev,
                                   struct cfg80211_ml_reconf_req *req);
        int        (*set_epcs)(struct wiphy *wiphy, struct net_device *dev,
                            bool val);
};

/*
 * wireless hardware and networking interfaces structures
 * and registration/helper functions
 */

/**
 * enum wiphy_flags - wiphy capability flags
 *
 * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set, the scan request will be split
 *        into two, first for legacy bands and second for 6 GHz.
 * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this
 *        wiphy at all
 * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set, powersave will be enabled
 *        by default -- this flag will be set depending on the kernel's default
 *        on wiphy_new(), but can be changed by the driver if it has a good
 *        reason to override the default
 * @WIPHY_FLAG_4ADDR_AP: supports 4addr mode even on AP (with a single station
 *        on a VLAN interface). This flag also serves an extra purpose of
 *        supporting 4ADDR AP mode on devices which do not support AP/VLAN iftype.
 * @WIPHY_FLAG_4ADDR_STATION: supports 4addr mode even as a station
 * @WIPHY_FLAG_CONTROL_PORT_PROTOCOL: This device supports setting the
 *        control port protocol ethertype. The device also honours the
 *        control_port_no_encrypt flag.
 * @WIPHY_FLAG_IBSS_RSN: The device supports IBSS RSN.
 * @WIPHY_FLAG_MESH_AUTH: The device supports mesh authentication by routing
 *        auth frames to userspace. See @NL80211_MESH_SETUP_USERSPACE_AUTH.
 * @WIPHY_FLAG_SUPPORTS_FW_ROAM: The device supports roaming feature in the
 *        firmware.
 * @WIPHY_FLAG_AP_UAPSD: The device supports uapsd on AP.
 * @WIPHY_FLAG_SUPPORTS_TDLS: The device supports TDLS (802.11z) operation.
 * @WIPHY_FLAG_TDLS_EXTERNAL_SETUP: The device does not handle TDLS (802.11z)
 *        link setup/discovery operations internally. Setup, discovery and
 *        teardown packets should be sent through the @NL80211_CMD_TDLS_MGMT
 *        command. When this flag is not set, @NL80211_CMD_TDLS_OPER should be
 *        used for asking the driver/firmware to perform a TDLS operation.
 * @WIPHY_FLAG_HAVE_AP_SME: device integrates AP SME
 * @WIPHY_FLAG_REPORTS_OBSS: the device will report beacons from other BSSes
 *        when there are virtual interfaces in AP mode by calling
 *        cfg80211_report_obss_beacon().
 * @WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD: When operating as an AP, the device
 *        responds to probe-requests in hardware.
 * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX.
 * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call.
 * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels.
 * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in
 *        beaconing mode (AP, IBSS, Mesh, ...).
 * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports larger KEK and KCK
 *        keys.
 * @WIPHY_FLAG_SUPPORTS_MLO: This is a temporary flag gating the MLO APIs,
 *        in order to not have them reachable in normal drivers, until we have
 *        complete feature/interface combinations/etc. advertisement. No driver
 *        should set this flag for now.
 * @WIPHY_FLAG_SUPPORTS_EXT_KCK_32: The device supports 32-byte KCK keys.
 * @WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER: The device could handle reg notify for
 *        NL80211_REGDOM_SET_BY_DRIVER.
 * @WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON: reg_call_notifier() is called if driver
 *        set this flag to update channels on beacon hints.
 * @WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY: support connection to non-primary link
 *        of an NSTR mobile AP MLD.
 * @WIPHY_FLAG_DISABLE_WEXT: disable wireless extensions for this device
 */
enum wiphy_flags {
        /*
         * One distinct bit per flag (bits 0..25). The enumerators below are
         * listed in bit order, which differs from the order of the
         * descriptions in the comment above.
         */
        WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK                = BIT(0),
        WIPHY_FLAG_SUPPORTS_MLO                        = BIT(1),
        WIPHY_FLAG_SPLIT_SCAN_6GHZ                = BIT(2),
        WIPHY_FLAG_NETNS_OK                        = BIT(3),
        WIPHY_FLAG_PS_ON_BY_DEFAULT                = BIT(4),
        WIPHY_FLAG_4ADDR_AP                        = BIT(5),
        WIPHY_FLAG_4ADDR_STATION                = BIT(6),
        WIPHY_FLAG_CONTROL_PORT_PROTOCOL        = BIT(7),
        WIPHY_FLAG_IBSS_RSN                        = BIT(8),
        WIPHY_FLAG_DISABLE_WEXT                        = BIT(9),
        WIPHY_FLAG_MESH_AUTH                        = BIT(10),
        WIPHY_FLAG_SUPPORTS_EXT_KCK_32          = BIT(11),
        WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY        = BIT(12),
        WIPHY_FLAG_SUPPORTS_FW_ROAM                = BIT(13),
        WIPHY_FLAG_AP_UAPSD                        = BIT(14),
        WIPHY_FLAG_SUPPORTS_TDLS                = BIT(15),
        WIPHY_FLAG_TDLS_EXTERNAL_SETUP                = BIT(16),
        WIPHY_FLAG_HAVE_AP_SME                        = BIT(17),
        WIPHY_FLAG_REPORTS_OBSS                        = BIT(18),
        WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD        = BIT(19),
        WIPHY_FLAG_OFFCHAN_TX                        = BIT(20),
        WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL        = BIT(21),
        WIPHY_FLAG_SUPPORTS_5_10_MHZ                = BIT(22),
        WIPHY_FLAG_HAS_CHANNEL_SWITCH                = BIT(23),
        WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER        = BIT(24),
        WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON     = BIT(25),
};

/**
 * struct ieee80211_iface_limit - limit on certain interface types
 * @max: maximum number of interfaces of these types
 * @types: bitmask of the interface types this limit applies to, built from
 *        BIT(NL80211_IFTYPE_*) values, e.g. BIT(NL80211_IFTYPE_STATION)
 */
struct ieee80211_iface_limit {
        u16 max;
        u16 types;
};

/**
 * struct ieee80211_iface_combination - possible interface combination
 *
 * With this structure the driver can describe which interface
 * combinations it supports concurrently. When set in a struct wiphy_radio,
 * the combinations refer to combinations of interfaces currently active on
 * that radio.
 *
 * Examples:
 *
 * 1. Allow #STA <= 1, #AP <= 1, matching BI, channels = 1, 2 total:
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits1[] = {
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), },
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_AP), },
 *        };
 *        struct ieee80211_iface_combination combination1 = {
 *                .limits = limits1,
 *                .n_limits = ARRAY_SIZE(limits1),
 *                .max_interfaces = 2,
 *                .num_different_channels = 1,
 *                .beacon_int_infra_match = true,
 *        };
 *
 *
 * 2. Allow #{AP, P2P-GO} <= 8, channels = 1, 8 total:
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits2[] = {
 *                { .max = 8, .types = BIT(NL80211_IFTYPE_AP) |
 *                                     BIT(NL80211_IFTYPE_P2P_GO), },
 *        };
 *        struct ieee80211_iface_combination combination2 = {
 *                .limits = limits2,
 *                .n_limits = ARRAY_SIZE(limits2),
 *                .max_interfaces = 8,
 *                .num_different_channels = 1,
 *        };
 *
 *
 * 3. Allow #STA <= 1, #{P2P-client,P2P-GO} <= 3 on two channels, 4 total.
 *
 *    This allows for an infrastructure connection and three P2P connections.
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits3[] = {
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), },
 *                { .max = 3, .types = BIT(NL80211_IFTYPE_P2P_GO) |
 *                                     BIT(NL80211_IFTYPE_P2P_CLIENT), },
 *        };
 *        struct ieee80211_iface_combination combination3 = {
 *                .limits = limits3,
 *                .n_limits = ARRAY_SIZE(limits3),
 *                .max_interfaces = 4,
 *                .num_different_channels = 2,
 *        };
 *
 */
struct ieee80211_iface_combination {
        /**
         * @limits:
         * limits for the given interface types; array of @n_limits entries
         */
        const struct ieee80211_iface_limit *limits;

        /**
         * @num_different_channels:
         * can use up to this many different channels
         */
        u32 num_different_channels;

        /**
         * @max_interfaces:
         * maximum number of interfaces in total allowed in this group
         */
        u16 max_interfaces;

        /**
         * @n_limits:
         * number of limitations, i.e. number of entries in @limits
         */
        u8 n_limits;

        /**
         * @beacon_int_infra_match:
         * In this combination, the beacon intervals between infrastructure
         * and AP types must match. This is required only in special cases.
         */
        bool beacon_int_infra_match;

        /**
         * @radar_detect_widths:
         * bitmap of channel widths supported for radar detection
         */
        u8 radar_detect_widths;

        /**
         * @radar_detect_regions:
         * bitmap of regions supported for radar detection
         */
        u8 radar_detect_regions;

        /**
         * @beacon_int_min_gcd:
         * This interface combination supports different beacon intervals.
         *
         * = 0
         *   all beacon intervals for different interfaces must be the same.
         * > 0
         *   any beacon interval for the interface part of this combination AND
         *   GCD of all beacon intervals from beaconing interfaces of this
         *   combination must be greater than or equal to this value.
         */
        u32 beacon_int_min_gcd;
};

/*
 * Pair of 16-bit "stypes" values, one for the transmit direction and one for
 * the receive direction.
 * NOTE(review): presumably each value is a bitmask of frame subtypes; confirm
 * against the users of this struct.
 */
struct ieee80211_txrx_stypes {
        u16 tx;
        u16 rx;
};

/**
 * enum wiphy_wowlan_support_flags - WoWLAN support flags
 *
 * Each flag is a distinct bit; the flags are combined in
 * &struct wiphy_wowlan_support.@flags.
 *
 * @WIPHY_WOWLAN_ANY: supports wakeup for the special "any"
 *        trigger that keeps the device operating as-is and
 *        wakes up the host on any activity, for example a
 *        received packet that passed filtering; note that the
 *        packet should be preserved in that case
 * @WIPHY_WOWLAN_MAGIC_PKT: supports wakeup on magic packet
 *        (see nl80211.h)
 * @WIPHY_WOWLAN_DISCONNECT: supports wakeup on disconnect
 * @WIPHY_WOWLAN_SUPPORTS_GTK_REKEY: supports GTK rekeying while asleep
 * @WIPHY_WOWLAN_GTK_REKEY_FAILURE: supports wakeup on GTK rekey failure
 * @WIPHY_WOWLAN_EAP_IDENTITY_REQ: supports wakeup on EAP identity request
 * @WIPHY_WOWLAN_4WAY_HANDSHAKE: supports wakeup on 4-way handshake failure
 * @WIPHY_WOWLAN_RFKILL_RELEASE: supports wakeup on RF-kill release
 * @WIPHY_WOWLAN_NET_DETECT: supports wakeup on network detection
 */
enum wiphy_wowlan_support_flags {
        WIPHY_WOWLAN_ANY                = BIT(0),
        WIPHY_WOWLAN_MAGIC_PKT                = BIT(1),
        WIPHY_WOWLAN_DISCONNECT                = BIT(2),
        WIPHY_WOWLAN_SUPPORTS_GTK_REKEY        = BIT(3),
        WIPHY_WOWLAN_GTK_REKEY_FAILURE        = BIT(4),
        WIPHY_WOWLAN_EAP_IDENTITY_REQ        = BIT(5),
        WIPHY_WOWLAN_4WAY_HANDSHAKE        = BIT(6),
        WIPHY_WOWLAN_RFKILL_RELEASE        = BIT(7),
        WIPHY_WOWLAN_NET_DETECT                = BIT(8),
};

/*
 * struct wiphy_wowlan_tcp_support - WoWLAN TCP wakeup support data
 *
 * Referenced through the @tcp member of &struct wiphy_wowlan_support.
 *
 * NOTE(review): the member meanings below are inferred from their names and
 * types only - confirm against the nl80211 WoWLAN TCP definitions.
 *   tok:               token feature description
 *   data_payload_max:  presumably the largest supported data payload
 *   data_interval_max: presumably the largest supported data interval
 *   wake_payload_max:  presumably the largest supported wake payload
 *   seq:               presumably whether sequence handling is supported
 */
struct wiphy_wowlan_tcp_support {
        const struct nl80211_wowlan_tcp_data_token_feature *tok;
        u32 data_payload_max;
        u32 data_interval_max;
        u32 wake_payload_max;
        bool seq;
};

/**
 * struct wiphy_wowlan_support - WoWLAN support data
 * @flags: bitmask of &enum wiphy_wowlan_support_flags values
 * @n_patterns: number of supported wakeup patterns
 *        (see nl80211.h for the pattern definition)
 * @pattern_max_len: maximum length of each pattern
 * @pattern_min_len: minimum length of each pattern
 * @max_pkt_offset: maximum Rx packet offset
 * @max_nd_match_sets: maximum number of matchsets for net-detect,
 *        similar, but not necessarily identical, to max_match_sets for
 *        scheduled scans.
 *        See &struct cfg80211_sched_scan_request.@match_sets for more
 *        details.
 * @tcp: TCP wakeup support information, see &struct wiphy_wowlan_tcp_support
 *
 * Referenced through the @wowlan member of &struct wiphy, which only exists
 * when CONFIG_PM is enabled.
 */
struct wiphy_wowlan_support {
        u32 flags;
        int n_patterns;
        int pattern_max_len;
        int pattern_min_len;
        int max_pkt_offset;
        int max_nd_match_sets;
        const struct wiphy_wowlan_tcp_support *tcp;
};

/**
 * struct wiphy_coalesce_support - coalesce support data
 * @n_rules: maximum number of coalesce rules
 * @max_delay: maximum supported coalescing delay in msecs
 * @n_patterns: number of supported patterns in a rule
 *        (see nl80211.h for the pattern definition)
 * @pattern_max_len: maximum length of each pattern
 * @pattern_min_len: minimum length of each pattern
 * @max_pkt_offset: maximum Rx packet offset
 *
 * Referenced through the @coalesce member of &struct wiphy.
 */
struct wiphy_coalesce_support {
        int n_rules;
        int max_delay;
        int n_patterns;
        int pattern_max_len;
        int pattern_min_len;
        int max_pkt_offset;
};

/**
 * enum wiphy_vendor_command_flags - validation flags for vendor commands
 * @WIPHY_VENDOR_CMD_NEED_WDEV: vendor command requires wdev
 * @WIPHY_VENDOR_CMD_NEED_NETDEV: vendor command requires netdev
 * @WIPHY_VENDOR_CMD_NEED_RUNNING: interface/wdev must be up & running
 *        (must be combined with %WIPHY_VENDOR_CMD_NEED_WDEV or
 *        %WIPHY_VENDOR_CMD_NEED_NETDEV)
 *
 * These are bit flags; they are stored in the @flags member of
 * &struct wiphy_vendor_command.
 */
enum wiphy_vendor_command_flags {
        WIPHY_VENDOR_CMD_NEED_WDEV = BIT(0),
        WIPHY_VENDOR_CMD_NEED_NETDEV = BIT(1),
        WIPHY_VENDOR_CMD_NEED_RUNNING = BIT(2),
};

/**
 * enum wiphy_opmode_flag - Station's HT/VHT operation mode information flags
 *
 * @STA_OPMODE_MAX_BW_CHANGED: Max Bandwidth changed
 * @STA_OPMODE_SMPS_MODE_CHANGED: SMPS mode changed
 * @STA_OPMODE_N_SS_CHANGED: max N_SS (number of spatial streams) changed
 *
 * Bit flags used in the @changed member of &struct sta_opmode_info.
 */
enum wiphy_opmode_flag {
        STA_OPMODE_MAX_BW_CHANGED    = BIT(0),
        STA_OPMODE_SMPS_MODE_CHANGED = BIT(1),
        STA_OPMODE_N_SS_CHANGED      = BIT(2),
};

/**
 * struct sta_opmode_info - Station's HT/VHT operation mode information
 * @changed: bitmask of &enum wiphy_opmode_flag values
 * @smps_mode: New SMPS mode value from &enum nl80211_smps_mode of a station
 * @bw: new max bandwidth value from &enum nl80211_chan_width of a station
 * @rx_nss: new rx_nss value of a station
 *
 * NOTE(review): presumably only the members whose flag is set in @changed
 * carry a meaningful new value - confirm against the users of this struct.
 */

struct sta_opmode_info {
        u32 changed;
        enum nl80211_smps_mode smps_mode;
        enum nl80211_chan_width bw;
        u8 rx_nss;
};

/*
 * Sentinel for the @policy member of &struct wiphy_vendor_command: the
 * negative error value -ENODATA cast to a policy pointer. It is a marker,
 * not a pointer to a real policy, and must not be dereferenced. Per the
 * @policy documentation it is used when no policy can be given and the
 * attribute is just raw data.
 */
#define VENDOR_CMD_RAW_DATA ((const struct nla_policy *)(long)(-ENODATA))

/**
 * struct wiphy_vendor_command - vendor command definition
 * @info: vendor command identifying information, as used in nl80211
 * @flags: flags, see &enum wiphy_vendor_command_flags
 * @doit: callback for the operation, note that wdev is %NULL if the
 *        flags didn't ask for a wdev and non-%NULL otherwise; the data
 *        pointer may be %NULL if userspace provided no data at all
 * @dumpit: dump callback, for transferring bigger/multiple items. The
 *        @storage points to cb->args[5], ie. is preserved over the multiple
 *        dumpit calls.
 * @policy: policy pointer for attributes within %NL80211_ATTR_VENDOR_DATA.
 *        Set this to %VENDOR_CMD_RAW_DATA if no policy can be given and the
 *        attribute is just raw data (e.g. a firmware command).
 * @maxattr: highest attribute number in policy
 *
 * It's recommended to not have the same sub command with both @doit and
 * @dumpit, so that userspace can assume certain ones are get and others
 * are used with dump requests.
 */
struct wiphy_vendor_command {
        struct nl80211_vendor_cmd_info info;
        u32 flags;
        int (*doit)(struct wiphy *wiphy, struct wireless_dev *wdev,
                    const void *data, int data_len);
        int (*dumpit)(struct wiphy *wiphy, struct wireless_dev *wdev,
                      struct sk_buff *skb, const void *data, int data_len,
                      unsigned long *storage);
        const struct nla_policy *policy;
        unsigned int maxattr;
};

/**
 * struct wiphy_iftype_ext_capab - extended capabilities per interface type
 * @iftype: interface type
 * @extended_capabilities: extended capabilities supported by the driver,
 *        additional capabilities might be supported by userspace; these are the
 *        802.11 extended capabilities ("Extended Capabilities element") and are
 *        in the same format as in the information element. See IEEE Std
 *        802.11-2012 8.4.2.29 for the defined fields.
 * @extended_capabilities_mask: mask of the valid values
 * @extended_capabilities_len: length of the extended capabilities
 * @eml_capabilities: EML capabilities (for MLO)
 * @mld_capa_and_ops: MLD capabilities and operations (for MLO)
 *
 * Entries of this type form the @iftype_ext_capab array of &struct wiphy;
 * cfg80211_get_iftype_ext_capa() looks one up by interface type.
 */
struct wiphy_iftype_ext_capab {
        enum nl80211_iftype iftype;
        const u8 *extended_capabilities;
        const u8 *extended_capabilities_mask;
        u8 extended_capabilities_len;
        u16 eml_capabilities;
        u16 mld_capa_and_ops;
};

/**
 * cfg80211_get_iftype_ext_capa - lookup interface type extended capability
 * @wiphy: the wiphy to look up from
 * @type: the interface type to look up
 *
 * Return: The extended capability for the given interface @type, may be %NULL
 *
 * NOTE(review): %NULL presumably means that @wiphy has no per-interface-type
 * entry for @type - confirm in the implementation.
 */
const struct wiphy_iftype_ext_capab *
cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type);

/**
 * struct cfg80211_pmsr_capabilities - cfg80211 peer measurement capabilities
 * @max_peers: maximum number of peers in a single measurement
 * @report_ap_tsf: can report assoc AP's TSF for radio resource measurement
 * @randomize_mac_addr: can randomize MAC address for measurement
 * @ftm: FTM measurement data
 * @ftm.supported: FTM measurement is supported
 * @ftm.asap: ASAP-mode is supported
 * @ftm.non_asap: non-ASAP-mode is supported
 * @ftm.request_lci: can request LCI data
 * @ftm.request_civicloc: can request civic location data
 * @ftm.preambles: bitmap of preambles supported (&enum nl80211_preamble)
 * @ftm.bandwidths: bitmap of bandwidths supported (&enum nl80211_chan_width)
 * @ftm.max_bursts_exponent: maximum burst exponent supported
 *        (set to -1 if not limited; note that setting this will necessarily
 *        forbid using the value 15 to let the responder pick)
 * @ftm.max_ftms_per_burst: maximum FTMs per burst supported (set to 0 if
 *        not limited)
 * @ftm.trigger_based: trigger based ranging measurement is supported
 * @ftm.non_trigger_based: non trigger based ranging measurement is supported
 *
 * Referenced through the @pmsr_capa member of &struct wiphy. The boolean
 * capabilities are stored as single-bit bitfields.
 */
struct cfg80211_pmsr_capabilities {
        unsigned int max_peers;
        u8 report_ap_tsf:1,
           randomize_mac_addr:1;

        struct {
                u32 preambles;
                u32 bandwidths;
                s8 max_bursts_exponent;
                u8 max_ftms_per_burst;
                u8 supported:1,
                   asap:1,
                   non_asap:1,
                   request_lci:1,
                   request_civicloc:1,
                   trigger_based:1,
                   non_trigger_based:1;
        } ftm;
};

/**
 * struct wiphy_iftype_akm_suites - supported AKM suites per interface type
 *
 * This structure encapsulates supported akm suites for interface types
 * defined in @iftypes_mask. Each type in the @iftypes_mask must be unique
 * across all instances of iftype_akm_suites.
 *
 * @iftypes_mask: bitmask of interfaces types
 * @akm_suites: points to an array of supported akm suites
 * @n_akm_suites: number of supported AKM suites
 */
struct wiphy_iftype_akm_suites {
        u16 iftypes_mask;
        const u32 *akm_suites;
        int n_akm_suites;
};

/**
 * struct wiphy_radio_cfg - physical radio config of a wiphy
 *
 * This structure describes the configurations of a physical radio in a
 * wiphy. It is used to denote per-radio attributes belonging to a wiphy.
 *
 * @rts_threshold: RTS threshold (dot11RTSThreshold);
 *        -1 (default) = RTS/CTS disabled. The member is a u32, so -1 is
 *        stored as the all-ones value (u32)-1.
 */
struct wiphy_radio_cfg {
        u32 rts_threshold;
};

/**
 * struct wiphy_radio_freq_range - wiphy frequency range
 * @start_freq: start range edge frequency (kHz)
 * @end_freq: end range edge frequency (kHz)
 *
 * Entries of this type form the @freq_range array of &struct wiphy_radio.
 */
struct wiphy_radio_freq_range {
        u32 start_freq;
        u32 end_freq;
};


/**
 * struct wiphy_radio - physical radio of a wiphy
 *
 * This structure describes a physical radio belonging to a wiphy.
 * It is used to describe concurrent-channel capabilities. Only one channel
 * can be active on the radio described by struct wiphy_radio.
 *
 * @freq_range: frequency range that the radio can operate on.
 * @n_freq_range: number of elements in @freq_range
 *
 * @iface_combinations: Valid interface combinations array, should not
 *        list single interface types.
 * @n_iface_combinations: number of entries in @iface_combinations array.
 *
 * @antenna_mask: bitmask of antennas connected to this radio.
 *
 * Entries of this type form the @radio array of &struct wiphy, whose length
 * is given by @n_radio there.
 */
struct wiphy_radio {
        const struct wiphy_radio_freq_range *freq_range;
        int n_freq_range;

        const struct ieee80211_iface_combination *iface_combinations;
        int n_iface_combinations;

        u32 antenna_mask;
};

/**
 * enum wiphy_nan_flags - NAN capability flags
 *
 * @WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC: configurable NAN synchronization is
 *     supported by the device.
 * @WIPHY_NAN_FLAGS_USERSPACE_DE: DE offload is not supported by the device.
 *
 * Bit flags used in the @flags member of &struct wiphy_nan_capa.
 */
enum wiphy_nan_flags {
        WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC = BIT(0),
        WIPHY_NAN_FLAGS_USERSPACE_DE      = BIT(1),
};

/**
 * struct wiphy_nan_capa - NAN capabilities
 *
 * This structure describes the NAN capabilities of a wiphy. It is embedded
 * in &struct wiphy as the @nan_capa member.
 *
 * @flags: NAN capabilities flags, see &enum wiphy_nan_flags
 * @op_mode: NAN operation mode, as defined in Wi-Fi Aware (TM) specification
 *     Table 81.
 * @n_antennas: number of antennas supported by the device for Tx/Rx. Lower
 *     nibble indicates the number of TX antennas and upper nibble indicates the
 *     number of RX antennas. Value 0 indicates the information is not
 *     available.
 * @max_channel_switch_time: maximum channel switch time in milliseconds.
 * @dev_capabilities: NAN device capabilities as defined in Wi-Fi Aware (TM)
 *     specification Table 79 (Capabilities field).
 */
struct wiphy_nan_capa {
        u32 flags;
        u8 op_mode;
        u8 n_antennas;
        u16 max_channel_switch_time;
        u8 dev_capabilities;
};

/*
 * Special value for the u16 @hw_timestamp_max_peers member of &struct wiphy:
 * per that member's documentation it indicates that the driver supports
 * enabling HW timestamping for all peers (no MAC address has to be given).
 */
#define CFG80211_HW_TIMESTAMP_ALL_PEERS        0xffff

/**
 * struct wiphy - wireless hardware description
 * @mtx: mutex for the data (structures) of this device
 * @reg_notifier: the driver's regulatory notification callback,
 *        note that if your driver uses wiphy_apply_custom_regulatory()
 *        the reg_notifier's request can be passed as NULL
 * @regd: the driver's regulatory domain, if one was requested via
 *        the regulatory_hint() API. This can be used by the driver
 *        on the reg_notifier() if it chooses to ignore future
 *        regulatory domain changes caused by other drivers.
 * @signal_type: signal type reported in &struct cfg80211_bss.
 * @cipher_suites: supported cipher suites
 * @n_cipher_suites: number of supported cipher suites
 * @akm_suites: supported AKM suites. These are the default AKMs supported if
 *        the supported AKMs are not advertised for a specific interface type
 *        in iftype_akm_suites.
 * @n_akm_suites: number of supported AKM suites
 * @iftype_akm_suites: array of supported akm suites info per interface type.
 *        Note that the bits in @iftypes_mask inside this structure cannot
 *        overlap (i.e. only one occurrence of each type is allowed across all
 *        instances of iftype_akm_suites).
 * @num_iftype_akm_suites: number of interface types for which supported akm
 *        suites are specified separately.
 * @retry_short: Retry limit for short frames (dot11ShortRetryLimit)
 * @retry_long: Retry limit for long frames (dot11LongRetryLimit)
 * @frag_threshold: Fragmentation threshold (dot11FragmentationThreshold);
 *        -1 = fragmentation disabled, only odd values >= 256 used
 * @rts_threshold: RTS threshold (dot11RTSThreshold); -1 = RTS/CTS disabled
 * @_net: the network namespace this wiphy currently lives in
 * @perm_addr: permanent MAC address of this device
 * @addr_mask: If the device supports multiple MAC addresses by masking,
 *        set this to a mask with variable bits set to 1, e.g. if the last
 *        four bits are variable then set it to 00-00-00-00-00-0f. The actual
 *        variable bits shall be determined by the interfaces added, with
 *        interfaces not matching the mask being rejected to be brought up.
 * @n_addresses: number of addresses in @addresses.
 * @addresses: If the device has more than one address, set this pointer
 *        to a list of addresses (6 bytes each). The first one will be used
 *        by default for perm_addr. In this case, the mask should be set to
 *        all-zeroes. In this case it is assumed that the device can handle
 *        the same number of arbitrary MAC addresses.
 * @registered: protects ->resume and ->suspend sysfs callbacks against
 *        unregister hardware
 * @debugfsdir: debugfs directory used for this wiphy (ieee80211/<wiphyname>).
 *        It will be renamed automatically on wiphy renames
 * @dev: (virtual) struct device for this wiphy. The item in
 *        /sys/class/ieee80211/ points to this. You need to use
 *        set_wiphy_dev() (see below).
 * @wext: wireless extension handlers
 * @priv: driver private data (sized according to wiphy_new() parameter)
 * @interface_modes: bitmask of interfaces types valid for this wiphy,
 *        must be set by driver
 * @iface_combinations: Valid interface combinations array, should not
 *        list single interface types.
 * @n_iface_combinations: number of entries in @iface_combinations array.
 * @software_iftypes: bitmask of software interface types, these are not
 *        subject to any restrictions since they are purely managed in SW.
 * @flags: wiphy flags, see &enum wiphy_flags
 * @regulatory_flags: wiphy regulatory flags, see
 *        &enum ieee80211_regulatory_flags
 * @features: features advertised to nl80211, see &enum nl80211_feature_flags.
 * @ext_features: extended features advertised to nl80211, see
 *        &enum nl80211_ext_feature_index.
 * @bss_priv_size: each BSS struct has private data allocated with it,
 *        this variable determines its size
 * @max_scan_ssids: maximum number of SSIDs the device can scan for in
 *        any given scan
 * @max_sched_scan_reqs: maximum number of scheduled scan requests that
 *        the device can run concurrently.
 * @max_sched_scan_ssids: maximum number of SSIDs the device can scan
 *        for in any given scheduled scan
 * @max_match_sets: maximum number of match sets the device can handle
 *        when performing a scheduled scan, 0 if filtering is not
 *        supported.
 * @max_scan_ie_len: maximum length of user-controlled IEs device can
 *        add to probe request frames transmitted during a scan, must not
 *        include fixed IEs like supported rates
 * @max_sched_scan_ie_len: same as max_scan_ie_len, but for scheduled
 *        scans
 * @max_sched_scan_plans: maximum number of scan plans (scan interval and number
 *        of iterations) for scheduled scan supported by the device.
 * @max_sched_scan_plan_interval: maximum interval (in seconds) for a
 *        single scan plan supported by the device.
 * @max_sched_scan_plan_iterations: maximum number of iterations for a single
 *        scan plan supported by the device.
 * @coverage_class: current coverage class
 * @fw_version: firmware version for ethtool reporting
 * @hw_version: hardware version for ethtool reporting
 * @max_num_pmkids: maximum number of PMKIDs supported by device
 * @privid: a pointer that drivers can use to identify if an arbitrary
 *        wiphy is theirs, e.g. in global notifiers
 * @bands: information about bands/channels supported by this device
 *
 * @mgmt_stypes: bitmasks of frame subtypes that can be subscribed to or
 *        transmitted through nl80211, points to an array indexed by interface
 *        type
 *
 * @available_antennas_tx: bitmap of antennas which are available to be
 *        configured as TX antennas. Antenna configuration commands will be
 *        rejected unless this or @available_antennas_rx is set.
 *
 * @available_antennas_rx: bitmap of antennas which are available to be
 *        configured as RX antennas. Antenna configuration commands will be
 *        rejected unless this or @available_antennas_tx is set.
 *
 * @probe_resp_offload:
 *         Bitmap of supported protocols for probe response offloading.
 *         See &enum nl80211_probe_resp_offload_support_attr. Only valid
 *         when the wiphy flag @WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD is set.
 *
 * @max_remain_on_channel_duration: Maximum time a remain-on-channel operation
 *        may request, if implemented.
 *
 * @wowlan: WoWLAN support information
 * @wowlan_config: current WoWLAN configuration; this should usually not be
 *        used since access to it is necessarily racy, use the parameter passed
 *        to the suspend() operation instead.
 *
 * @ap_sme_capa: AP SME capabilities, flags from &enum nl80211_ap_sme_features.
 * @ht_capa_mod_mask:  Specify what ht_cap values can be over-ridden.
 *        If null, then none can be over-ridden.
 * @vht_capa_mod_mask:  Specify what VHT capabilities can be over-ridden.
 *        If null, then none can be over-ridden.
 *
 * @wdev_list: the list of associated (virtual) interfaces; this list must
 *        not be modified by the driver, but can be read with RTNL/RCU protection.
 *
 * @max_acl_mac_addrs: Maximum number of MAC addresses that the device
 *        supports for ACL.
 *
 * @extended_capabilities: extended capabilities supported by the driver,
 *        additional capabilities might be supported by userspace; these are
 *        the 802.11 extended capabilities ("Extended Capabilities element")
 *        and are in the same format as in the information element. See
 *        802.11-2012 8.4.2.29 for the defined fields. These are the default
 *        extended capabilities to be used if the capabilities are not specified
 *        for a specific interface type in iftype_ext_capab.
 * @extended_capabilities_mask: mask of the valid values
 * @extended_capabilities_len: length of the extended capabilities
 * @iftype_ext_capab: array of extended capabilities per interface type
 * @num_iftype_ext_capab: number of interface types for which extended
 *        capabilities are specified separately.
 * @coalesce: packet coalescing support information
 *
 * @vendor_commands: array of vendor commands supported by the hardware
 * @n_vendor_commands: number of vendor commands
 * @vendor_events: array of vendor events supported by the hardware
 * @n_vendor_events: number of vendor events
 *
 * @max_ap_assoc_sta: maximum number of associated stations supported in AP mode
 *        (including P2P GO) or 0 to indicate no such limit is advertised. The
 *        driver is allowed to advertise a theoretical limit that it can reach in
 *        some cases, but may not always reach.
 *
 * @max_num_csa_counters: Number of supported csa_counters in beacons
 *        and probe responses.  This value should be set if the driver
 *        wishes to limit the number of csa counters. Default (0) means
 *        infinite.
 * @bss_param_support: bitmask indicating which bss_parameters as defined in
 *        &struct bss_parameters the driver can actually handle in the
 *        .change_bss() callback. The bit positions are defined in &enum
 *        wiphy_bss_param_flags.
 *
 * @bss_select_support: bitmask indicating the BSS selection criteria supported
 *        by the driver in the .connect() callback. The bit position maps to the
 *        attribute indices defined in &enum nl80211_bss_select_attr.
 *
 * @nan_supported_bands: bands supported by the device in NAN mode, a
 *        bitmap of &enum nl80211_band values.  For instance, for
 *        NL80211_BAND_2GHZ, bit 0 would be set
 *        (i.e. BIT(NL80211_BAND_2GHZ)).
 * @nan_capa: NAN capabilities
 *
 * @txq_limit: configuration of internal TX queue frame limit
 * @txq_memory_limit: configuration internal TX queue memory limit
 * @txq_quantum: configuration of internal TX queue scheduler quantum
 *
 * @tx_queue_len: allow setting transmit queue len for drivers not using
 *        wake_tx_queue
 *
 * @support_mbssid: can HW support association with nontransmitted AP
 * @support_only_he_mbssid: don't parse MBSSID elements if it is not
 *        HE AP, in order to avoid compatibility issues.
 *        @support_mbssid must be set for this to have any effect.
 *
 * @pmsr_capa: peer measurement capabilities
 *
 * @tid_config_support: describes the per-TID config support that the
 *        device has
 * @tid_config_support.vif: bitmap of attributes (configurations)
 *        supported by the driver for each vif
 * @tid_config_support.peer: bitmap of attributes (configurations)
 *        supported by the driver for each peer
 * @tid_config_support.max_retry: maximum supported retry count for
 *        long/short retry configuration
 *
 * @max_data_retry_count: maximum supported per TID retry count for
 *        configuration through the %NL80211_TID_CONFIG_ATTR_RETRY_SHORT and
 *        %NL80211_TID_CONFIG_ATTR_RETRY_LONG attributes
 * @sar_capa: SAR control capabilities
 * @rfkill: a pointer to the rfkill structure
 *
 * @mbssid_max_interfaces: maximum number of interfaces supported by the driver
 *        in a multiple BSSID set. This field must be set to a non-zero value
 *        by the driver to advertise MBSSID support.
 * @ema_max_profile_periodicity: maximum profile periodicity supported by
 *        the driver. Setting this field to a non-zero value indicates that the
 *        driver supports enhanced multi-BSSID advertisements (EMA AP).
 * @max_num_akm_suites: maximum number of AKM suites allowed for
 *        configuration through %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and
 *        %NL80211_CMD_START_AP. Set to NL80211_MAX_NR_AKM_SUITES if not set by
 *        driver. If set by driver minimum allowed value is
 *        NL80211_MAX_NR_AKM_SUITES in order to avoid compatibility issues with
 *        legacy userspace and maximum allowed value is
 *        CFG80211_MAX_NUM_AKM_SUITES.
 *
 * @hw_timestamp_max_peers: maximum number of peers that the driver supports
 *        enabling HW timestamping for concurrently. Setting this field to a
 *        non-zero value indicates that the driver supports HW timestamping.
 *        A value of %CFG80211_HW_TIMESTAMP_ALL_PEERS indicates the driver
 *        supports enabling HW timestamping for all peers (i.e. no need to
 *        specify a mac address).
 *
 * @radio_cfg: configuration of radios belonging to a multi-radio wiphy. This
 *        struct contains a list of all radio specific attributes and should be
 *        used only for multi-radio wiphy.
 *
 * @radio: radios belonging to this wiphy
 * @n_radio: number of radios
 */
struct wiphy {
        struct mutex mtx;

        /* assign these fields before you register the wiphy */

        u8 perm_addr[ETH_ALEN];
        u8 addr_mask[ETH_ALEN];

        struct mac_address *addresses;

        const struct ieee80211_txrx_stypes *mgmt_stypes;

        const struct ieee80211_iface_combination *iface_combinations;
        int n_iface_combinations;
        u16 software_iftypes;

        u16 n_addresses;

        /* Supported interface modes, OR together BIT(NL80211_IFTYPE_...) */
        u16 interface_modes;

        u16 max_acl_mac_addrs;

        u32 flags, regulatory_flags, features;
        /* bit array sized for NUM_NL80211_EXT_FEATURES bits, in whole bytes */
        u8 ext_features[DIV_ROUND_UP(NUM_NL80211_EXT_FEATURES, 8)];

        u32 ap_sme_capa;

        enum cfg80211_signal_type signal_type;

        int bss_priv_size;
        u8 max_scan_ssids;
        u8 max_sched_scan_reqs;
        u8 max_sched_scan_ssids;
        u8 max_match_sets;
        u16 max_scan_ie_len;
        u16 max_sched_scan_ie_len;
        u32 max_sched_scan_plans;
        u32 max_sched_scan_plan_interval;
        u32 max_sched_scan_plan_iterations;

        int n_cipher_suites;
        const u32 *cipher_suites;

        int n_akm_suites;
        const u32 *akm_suites;

        const struct wiphy_iftype_akm_suites *iftype_akm_suites;
        unsigned int num_iftype_akm_suites;

        u8 retry_short;
        u8 retry_long;
        u32 frag_threshold;
        u32 rts_threshold;
        u8 coverage_class;

        char fw_version[ETHTOOL_FWVERS_LEN];
        u32 hw_version;

        /* the WoWLAN members only exist when CONFIG_PM is enabled */
#ifdef CONFIG_PM
        const struct wiphy_wowlan_support *wowlan;
        struct cfg80211_wowlan *wowlan_config;
#endif

        u16 max_remain_on_channel_duration;

        u8 max_num_pmkids;

        u32 available_antennas_tx;
        u32 available_antennas_rx;

        u32 probe_resp_offload;

        const u8 *extended_capabilities, *extended_capabilities_mask;
        u8 extended_capabilities_len;

        const struct wiphy_iftype_ext_capab *iftype_ext_capab;
        unsigned int num_iftype_ext_capab;

        const void *privid;

        struct ieee80211_supported_band *bands[NUM_NL80211_BANDS];

        void (*reg_notifier)(struct wiphy *wiphy,
                             struct regulatory_request *request);

        struct wiphy_radio_cfg *radio_cfg;

        /* fields below are read-only, assigned by cfg80211 */

        const struct ieee80211_regdomain __rcu *regd;

        struct device dev;

        bool registered;

        struct dentry *debugfsdir;

        const struct ieee80211_ht_cap *ht_capa_mod_mask;
        const struct ieee80211_vht_cap *vht_capa_mod_mask;

        struct list_head wdev_list;

        possible_net_t _net;

        /* the wireless extension handlers only exist with CONFIG_CFG80211_WEXT */
#ifdef CONFIG_CFG80211_WEXT
        const struct iw_handler_def *wext;
#endif

        const struct wiphy_coalesce_support *coalesce;

        const struct wiphy_vendor_command *vendor_commands;
        const struct nl80211_vendor_cmd_info *vendor_events;
        int n_vendor_commands, n_vendor_events;

        u16 max_ap_assoc_sta;

        u8 max_num_csa_counters;

        u32 bss_param_support;
        u32 bss_select_support;

        u8 nan_supported_bands;
        struct wiphy_nan_capa nan_capa;

        u32 txq_limit;
        u32 txq_memory_limit;
        u32 txq_quantum;

        unsigned long tx_queue_len;

        u8 support_mbssid:1,
           support_only_he_mbssid:1;

        const struct cfg80211_pmsr_capabilities *pmsr_capa;

        struct {
                u64 peer, vif;
                u8 max_retry;
        } tid_config_support;

        u8 max_data_retry_count;

        const struct cfg80211_sar_capa *sar_capa;

        struct rfkill *rfkill;

        u8 mbssid_max_interfaces;
        u8 ema_max_profile_periodicity;
        u16 max_num_akm_suites;

        u16 hw_timestamp_max_peers;

        int n_radio;
        const struct wiphy_radio *radio;

        /*
         * Driver private area, handed out by wiphy_priv(). This is a
         * flexible array member, so it has to remain the last member.
         */
        char priv[] __aligned(NETDEV_ALIGN);
};

/* Read the network namespace stored in the _net member of @wiphy. */
static inline struct net *wiphy_net(struct wiphy *wiphy)
{
        struct net *net = read_pnet(&wiphy->_net);

        return net;
}

static inline void wiphy_net_set(struct wiphy *wiphy, struct net *net)
{
        write_pnet(&wiphy->_net, net);
}

/**
 * wiphy_priv - get the driver private area of a wiphy
 *
 * @wiphy: the wiphy whose private area is wanted; must not be %NULL
 *        (checked with BUG_ON())
 *
 * Return: The address of the priv member of @wiphy.
 */
static inline void *wiphy_priv(struct wiphy *wiphy)
{
        void *priv;

        BUG_ON(!wiphy);

        priv = &wiphy->priv;
        return priv;
}

/**
 * priv_to_wiphy - map a private area pointer back to its wiphy
 *
 * @priv: a pointer previously returned by wiphy_priv; must not be %NULL
 *        (checked with BUG_ON())
 *
 * Return: The &struct wiphy whose priv member is located at @priv.
 */
static inline struct wiphy *priv_to_wiphy(void *priv)
{
        struct wiphy *owner;

        BUG_ON(!priv);

        owner = container_of(priv, struct wiphy, priv);
        return owner;
}

/**
 * set_wiphy_dev - set device pointer for wiphy
 *
 * @wiphy: The wiphy whose device to bind
 * @dev: The device to parent it to
 */
static inline void set_wiphy_dev(struct wiphy *wiphy, struct device *dev)
{
        wiphy->dev.parent = dev;
}

/**
 * wiphy_dev - get the parent device of a wiphy
 *
 * @wiphy: The wiphy whose device struct to look up
 *
 * Return: The parent of the struct device embedded in @wiphy, i.e. the
 * pointer stored by set_wiphy_dev().
 */
static inline struct device *wiphy_dev(struct wiphy *wiphy)
{
        struct device *parent = wiphy->dev.parent;

        return parent;
}

/**
 * wiphy_name - get the name of a wiphy
 *
 * @wiphy: The wiphy whose name to return
 *
 * Return: The result of dev_name() on the struct device embedded in @wiphy.
 */
static inline const char *wiphy_name(const struct wiphy *wiphy)
{
        const struct device *self = &wiphy->dev;

        return dev_name(self);
}

/**
 * wiphy_new_nm - create a new wiphy for use with cfg80211
 *
 * @ops: The configuration operations for this device
 * @sizeof_priv: The size of the private area to allocate
 * @requested_name: Request a particular name.
 *        NULL is valid value, and means use the default phy%d naming.
 *
 * Create a new wiphy and associate the given operations with it.
 * @sizeof_priv bytes are allocated for private use.
 *
 * Return: A pointer to the new wiphy. This pointer must be
 * assigned to each netdev's ieee80211_ptr for proper operation.
 *
 * NOTE(review): the return value on failure is not stated here (presumably
 * %NULL) - confirm in the implementation before relying on it.
 */
struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
                           const char *requested_name);

/**
 * wiphy_new - create a new wiphy for use with cfg80211
 *
 * @ops: The configuration operations for this device
 * @sizeof_priv: The size of the private area to allocate
 *
 * Convenience wrapper around wiphy_new_nm() that passes %NULL as the
 * requested name. @sizeof_priv bytes are allocated for private use.
 *
 * Return: Whatever wiphy_new_nm() returns: a pointer to the new wiphy,
 * which must be assigned to each netdev's ieee80211_ptr for proper
 * operation.
 */
static inline struct wiphy *wiphy_new(const struct cfg80211_ops *ops,
                                      int sizeof_priv)
{
        const char *requested_name = NULL;

        return wiphy_new_nm(ops, sizeof_priv, requested_name);
}

/**
 * wiphy_register - register a wiphy with cfg80211
 *
 * @wiphy: The wiphy to register.
 *
 * The members of &struct wiphy that are marked as driver-assigned ("assign
 * these fields before you register the wiphy") must be set up before this
 * is called.
 *
 * Return: A non-negative wiphy index or a negative error code.
 */
int wiphy_register(struct wiphy *wiphy);

/*
 * Assert (via lockdep_assert_held()) that the mutex of @wiphy is held.
 * This is a define for better error reporting (file/line).
 */
#define lockdep_assert_wiphy(wiphy) lockdep_assert_held(&(wiphy)->mtx)

/**
 * rcu_dereference_wiphy - rcu_dereference with debug checking
 * @wiphy: the wiphy to check the locking on
 * @p: The pointer to read, prior to dereferencing
 *
 * Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
 * or the mutex of @wiphy. Note: Please prefer wiphy_dereference() or
 * rcu_dereference().
 *
 * @wiphy is parenthesized in the expansion so that any pointer expression
 * (not just a plain identifier) can be passed.
 */
#define rcu_dereference_wiphy(wiphy, p)                                \
        rcu_dereference_check(p, lockdep_is_held(&(wiphy)->mtx))

/**
 * wiphy_dereference - fetch RCU pointer when updates are prevented by wiphy mtx
 * @wiphy: the wiphy to check the locking on
 * @p: The pointer to read, prior to dereferencing
 *
 * @wiphy is parenthesized in the expansion so that any pointer expression
 * (not just a plain identifier) can be passed.
 *
 * Return: the value of the specified RCU-protected pointer, but omit the
 * READ_ONCE(), because caller holds the wiphy mutex used for updates.
 */
#define wiphy_dereference(wiphy, p)                                \
        rcu_dereference_protected(p, lockdep_is_held(&(wiphy)->mtx))

/**
 * get_wiphy_regdom - get custom regdomain for the given wiphy
 * @wiphy: the wiphy to get the regdomain from
 *
 * Context: Requires any of RTNL, wiphy mutex or RCU protection.
 *
 * NOTE(review): the @regd member of &struct wiphy is __rcu annotated, which
 * is presumably what this returns and why protection is required - confirm
 * in the implementation.
 *
 * Return: pointer to the regulatory domain associated with the wiphy
 */
const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy);

/**
 * wiphy_unregister - deregister a wiphy from cfg80211
 *
 * @wiphy: The wiphy to unregister.
 *
 * After this call, no more requests can be made with this priv
 * pointer, but the call may sleep to wait for an outstanding
 * request that is being handled.
 */
void wiphy_unregister(struct wiphy *wiphy);

/**
 * wiphy_free - free wiphy
 *
 * @wiphy: The wiphy to free
 */
void wiphy_free(struct wiphy *wiphy);

/* internal structs */
struct cfg80211_conn;
struct cfg80211_internal_bss;
struct cfg80211_cached_keys;
struct cfg80211_cqm_config;

/**
 * wiphy_lock - lock the wiphy
 * @wiphy: the wiphy to lock
 *
 * This is needed around registering and unregistering netdevs that
 * aren't created through cfg80211 calls, since that requires locking
 * in cfg80211 when the notifier is called, but that cannot
 * differentiate which way it's called.
 *
 * It can also be used by drivers for their own purposes.
 *
 * When cfg80211 ops are called, the wiphy is already locked.
 *
 * Note that this makes sure that no workers that have been queued
 * with wiphy_work_queue() are running.
 */
static inline void wiphy_lock(struct wiphy *wiphy)
        __acquires(&wiphy->mtx)
{
        /* Take the wiphy mutex. */
        mutex_lock(&wiphy->mtx);
        /*
         * Annotation matching the __acquires() on the signature
         * (presumably for static lock-context checking - confirm).
         */
        __acquire(&wiphy->mtx);
}

/**
 * wiphy_unlock - unlock the wiphy again
 * @wiphy: the wiphy to unlock
 */
static inline void wiphy_unlock(struct wiphy *wiphy)
        __releases(&wiphy->mtx)
{
        /*
         * Annotation matching the __releases() on the signature
         * (presumably for static lock-context checking - confirm).
         */
        __release(&wiphy->mtx);
        /* Drop the wiphy mutex. */
        mutex_unlock(&wiphy->mtx);
}

/*
 * Define a guard named "wiphy" for struct wiphy pointers whose two
 * expressions lock and unlock the wiphy's mtx (presumably usable as
 * guard(wiphy)(ptr) for scope-based locking - confirm against the
 * DEFINE_GUARD() definition). Note that the expressions call
 * mutex_lock()/mutex_unlock() directly rather than
 * wiphy_lock()/wiphy_unlock().
 */
DEFINE_GUARD(wiphy, struct wiphy *,
             mutex_lock(&_T->mtx),
             mutex_unlock(&_T->mtx))

struct wiphy_work;
/* Callback type of a wiphy work item: receives the wiphy and the item. */
typedef void (*wiphy_work_func_t)(struct wiphy *, struct wiphy_work *);

/* A work item for a wiphy; set up with wiphy_work_init(). */
struct wiphy_work {
        /* list linkage, initialised by wiphy_work_init() */
        struct list_head entry;
        /* callback stored by wiphy_work_init() */
        wiphy_work_func_t func;
};

/*
 * Prepare a wiphy work item: store the callback @func and initialise the
 * item's list entry.
 */
static inline void wiphy_work_init(struct wiphy_work *work,
                                   wiphy_work_func_t func)
{
        /* The two initialisations touch different members. */
        work->func = func;
        INIT_LIST_HEAD(&work->entry);
}

/**
 * wiphy_work_queue - queue work for the wiphy
 * @wiphy: the wiphy to queue for
 * @work: the work item
 *
 * This is useful for work that must be done asynchronously, and work
 * queued here has the special property that the wiphy mutex will be
 * held as if wiphy_lock() was called, and that it cannot be running
 * after wiphy_lock() was called. Therefore, wiphy_work_cancel() can
 * use just cancel_work() instead of cancel_work_sync(); it requires
 * being in a section protected by wiphy_lock().
 */
void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work);

/**
 * wiphy_work_cancel - cancel previously queued work
 * @wiphy: the wiphy, for debug purposes
 * @work: the work to cancel
 *
 * Cancel the work *without* waiting for it, this assumes being
 * called under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work);

/**
 * wiphy_work_flush - flush previously queued work
 * @wiphy: the wiphy, for debug purposes
 * @work: the work to flush, this can be %NULL to flush all work
 *
 * Flush the work (i.e. run it if pending). This must be called
 * under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work);

/* A wiphy work item combined with a timer_list timer. */
struct wiphy_delayed_work {
        /* the embedded work item, set up by wiphy_delayed_work_init() */
        struct wiphy_work work;
        /* not set by wiphy_delayed_work_init() */
        struct wiphy *wiphy;
        /* set up with wiphy_delayed_work_timer() as its callback */
        struct timer_list timer;
};

void wiphy_delayed_work_timer(struct timer_list *t);

/*
 * Prepare a delayed wiphy work item: initialise the embedded work with
 * @func and set up the timer with wiphy_delayed_work_timer() as callback.
 */
static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork,
                                           wiphy_work_func_t func)
{
        /* The work item and the timer are separate members. */
        wiphy_work_init(&dwork->work, func);
        timer_setup(&dwork->timer, wiphy_delayed_work_timer, 0);
}

/**
 * wiphy_delayed_work_queue - queue delayed work for the wiphy
 * @wiphy: the wiphy to queue for
 * @dwork: the delayable worker
 * @delay: number of jiffies to wait before queueing
 *
 * This is useful for work that must be done asynchronously, and work
 * queued here has the special property that the wiphy mutex will be
 * held as if wiphy_lock() was called, and that it cannot be running
 * after wiphy_lock() was called. Therefore, wiphy_delayed_work_cancel()
 * can use just cancel_work() instead of cancel_work_sync(); it requires
 * being in a section protected by wiphy_lock().
 *
 * Note that these are scheduled with a timer where the accuracy
 * becomes less the longer in the future the scheduled timer is. Use
 * wiphy_hrtimer_work_queue() if the timer must not be late by more
 * than approximately 10 percent.
 */
void wiphy_delayed_work_queue(struct wiphy *wiphy,
                              struct wiphy_delayed_work *dwork,
                              unsigned long delay);

/**
 * wiphy_delayed_work_cancel - cancel previously queued delayed work
 * @wiphy: the wiphy, for debug purposes
 * @dwork: the delayed work to cancel
 *
 * Cancel the work *without* waiting for it, this assumes being
 * called under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_delayed_work_cancel(struct wiphy *wiphy,
                               struct wiphy_delayed_work *dwork);

/**
 * wiphy_delayed_work_flush - flush previously queued delayed work
 * @wiphy: the wiphy, for debug purposes
 * @dwork: the delayed work to flush
 *
 * Flush the work (i.e. run it if pending). This must be called
 * under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_delayed_work_flush(struct wiphy *wiphy,
                              struct wiphy_delayed_work *dwork);

/**
 * wiphy_delayed_work_pending - Find out whether a wiphy delayable
 * work item is currently pending.
 *
 * @wiphy: the wiphy, for debug purposes
 * @dwork: the delayed work in question
 *
 * Return: true if timer is pending, false otherwise
 *
 * How wiphy_delayed_work_queue() works is by setting a timer which
 * when it expires calls wiphy_work_queue() to queue the wiphy work.
 * Because wiphy_delayed_work_queue() uses mod_timer(), if it is
 * called twice and the second call happens before the first call
 * deadline, the work will be rescheduled for the second deadline and
 * won't run before that.
 *
 * wiphy_delayed_work_pending() can be used to detect if calling
 * wiphy_delayed_work_queue() would start a new work schedule
 * or delay a previous one. As seen below it cannot be used to
 * detect precisely if the work has finished to execute nor if it
 * is currently executing.
 *
 *      CPU0                                CPU1
 * wiphy_delayed_work_queue(wk)
 *  mod_timer(wk->timer)
 *                                     wiphy_delayed_work_pending(wk) -> true
 *
 * [...]
 * expire_timers(wk->timer)
 *  detach_timer(wk->timer)
 *                                     wiphy_delayed_work_pending(wk) -> false
 *  wk->timer->function()                          |
 *   wiphy_work_queue(wk)                          | delayed work pending
 *    list_add_tail()                              | returns false but
 *    queue_work(cfg80211_wiphy_work)              | wk->func() has not
 *                                                 | been run yet
 * [...]                                           |
 *  cfg80211_wiphy_work()                          |
 *   wk->func()                                    V
 *
 */
bool wiphy_delayed_work_pending(struct wiphy *wiphy,
                                struct wiphy_delayed_work *dwork);

/* A wiphy work item combined with a high resolution timer. */
struct wiphy_hrtimer_work {
        /* the embedded work item, set up by wiphy_hrtimer_work_init() */
        struct wiphy_work work;
        /* not set by wiphy_hrtimer_work_init() */
        struct wiphy *wiphy;
        /* set up with wiphy_hrtimer_work_timer() as its callback */
        struct hrtimer timer;
};

enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t);

/*
 * Prepare an hrtimer based wiphy work item: initialise the embedded work
 * with @func and set up the hrtimer (CLOCK_BOOTTIME, relative mode) with
 * wiphy_hrtimer_work_timer() as callback.
 */
static inline void wiphy_hrtimer_work_init(struct wiphy_hrtimer_work *hrwork,
                                           wiphy_work_func_t func)
{
        /* The work item and the timer are separate members. */
        wiphy_work_init(&hrwork->work, func);
        hrtimer_setup(&hrwork->timer, wiphy_hrtimer_work_timer,
                      CLOCK_BOOTTIME, HRTIMER_MODE_REL);
}

/**
 * wiphy_hrtimer_work_queue - queue hrtimer work for the wiphy
 * @wiphy: the wiphy to queue for
 * @hrwork: the high resolution timer worker
 * @delay: the delay given as a ktime_t
 *
 * Please refer to wiphy_delayed_work_queue(). The difference is that
 * the hrtimer work uses a high resolution timer for scheduling. This
 * may be needed if timeouts might be scheduled further in the future
 * and the accuracy of the normal timer is not sufficient.
 *
 * Expect a delay of a few milliseconds as the timer is scheduled
 * with some slack and some more time may pass between queueing the
 * work and its start.
 */
void wiphy_hrtimer_work_queue(struct wiphy *wiphy,
                              struct wiphy_hrtimer_work *hrwork,
                              ktime_t delay);

/**
 * wiphy_hrtimer_work_cancel - cancel previously queued hrtimer work
 * @wiphy: the wiphy, for debug purposes
 * @hrtimer: the hrtimer work to cancel
 *
 * Cancel the work *without* waiting for it, this assumes being
 * called under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_hrtimer_work_cancel(struct wiphy *wiphy,
                               struct wiphy_hrtimer_work *hrtimer);

/**
 * wiphy_hrtimer_work_flush - flush previously queued hrtimer work
 * @wiphy: the wiphy, for debug purposes
 * @hrwork: the hrtimer work to flush
 *
 * Flush the work (i.e. run it if pending). This must be called
 * under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_hrtimer_work_flush(struct wiphy *wiphy,
                              struct wiphy_hrtimer_work *hrwork);

/**
 * wiphy_hrtimer_work_pending - Find out whether a wiphy hrtimer
 * work item is currently pending.
 *
 * @wiphy: the wiphy, for debug purposes
 * @hrwork: the hrtimer work in question
 *
 * Return: true if timer is pending, false otherwise
 *
 * Please refer to the wiphy_delayed_work_pending() documentation as
 * this is the equivalent function for hrtimer based delayed work
 * items.
 */
bool wiphy_hrtimer_work_pending(struct wiphy *wiphy,
                                struct wiphy_hrtimer_work *hrwork);

/**
 * enum ieee80211_ap_reg_power - regulatory power for an Access Point
 *
 * @IEEE80211_REG_UNSET_AP: Access Point has no regulatory power mode
 * @IEEE80211_REG_LPI_AP: Indoor Access Point
 * @IEEE80211_REG_SP_AP: Standard power Access Point
 * @IEEE80211_REG_VLP_AP: Very low power Access Point
 */
enum ieee80211_ap_reg_power {
        /* values spelled out; they match the implicit numbering */
        IEEE80211_REG_UNSET_AP = 0,
        IEEE80211_REG_LPI_AP   = 1,
        IEEE80211_REG_SP_AP    = 2,
        IEEE80211_REG_VLP_AP   = 3,
};

/**
 * struct wireless_dev - wireless device state
 *
 * For netdevs, this structure must be allocated by the driver
 * that uses the ieee80211_ptr field in struct net_device (this
 * is intentional so it can be allocated along with the netdev.)
 * It need not be registered then as netdev registration will
 * be intercepted by cfg80211 to see the new wireless device,
 * however, drivers must lock the wiphy before registering or
 * unregistering netdevs if they pre-create any netdevs (in ops
 * called from cfg80211, the wiphy is already locked.)
 *
 * For non-netdev uses, it must also be allocated by the driver
 * in response to the cfg80211 callbacks that require it, as
 * there's no netdev registration in that case it may not be
 * allocated outside of callback operations that return it.
 *
 * @wiphy: pointer to hardware description
 * @iftype: interface type
 * @registered: is this wdev already registered with cfg80211
 * @registering: indicates we're doing registration under wiphy lock
 *        for the notifier
 * @list: (private) Used to collect the interfaces
 * @netdev: (private) Used to reference back to the netdev, may be %NULL
 * @identifier: (private) Identifier used in nl80211 to identify this
 *        wireless device if it has no netdev
 * @u: union containing data specific to @iftype
 * @connected: indicates if connected or not (STA mode)
 * @wext: (private) Used by the internal wireless extensions compat code
 * @wext.ibss: (private) IBSS data part of wext handling
 * @wext.connect: (private) connection handling data
 * @wext.keys: (private) (WEP) key data
 * @wext.ie: (private) extra elements for association
 * @wext.ie_len: (private) length of extra elements
 * @wext.bssid: (private) selected network BSSID
 * @wext.ssid: (private) selected network SSID
 * @wext.default_key: (private) selected default key index
 * @wext.default_mgmt_key: (private) selected default management key index
 * @wext.prev_bssid: (private) previous BSSID for reassociation
 * @wext.prev_bssid_valid: (private) previous BSSID validity
 * @use_4addr: indicates 4addr mode is used on this interface, must be
 *        set by driver (if supported) on add_interface BEFORE registering the
 *        netdev and may otherwise be used by driver read-only, will be updated
 *        by cfg80211 on change_interface
 * @mgmt_registrations: list of registrations for management frames
 * @mgmt_registrations_need_update: mgmt registrations were updated,
 *        need to propagate the update to the driver
 * @address: The address for this device, valid only if @netdev is %NULL
 * @is_running: true if this is a non-netdev device that has been started, e.g.
 *        the P2P Device.
 * @ps: powersave mode is enabled
 * @ps_timeout: dynamic powersave timeout
 * @ap_unexpected_nlportid: (private) netlink port ID of application
 *        registered for unexpected class 3 frames (AP mode)
 * @conn: (private) cfg80211 software SME connection state machine data
 * @connect_keys: (private) keys to set after connection is established
 * @conn_bss_type: connecting/connected BSS type
 * @conn_owner_nlportid: (private) connection owner socket port ID
 * @disconnect_wk: (private) auto-disconnect work
 * @disconnect_bssid: (private) the BSSID to use for auto-disconnect
 * @event_list: (private) list for internal event processing
 * @event_lock: (private) lock for event list
 * @owner_nlportid: (private) owner socket port ID
 * @nl_owner_dead: (private) owner socket went away
 * @cqm_rssi_work: (private) CQM RSSI reporting work
 * @cqm_config: (private) nl80211 RSSI monitor state
 * @pmsr_list: (private) peer measurement requests
 * @pmsr_lock: (private) peer measurements requests/results lock
 * @pmsr_free_wk: (private) peer measurements cleanup work
 * @unprot_beacon_reported: (private) timestamp of last
 *        unprotected beacon report
 * @links: array of %IEEE80211_MLD_MAX_NUM_LINKS elements containing @addr
 *        @ap and @client for each link
 * @links.cac_started: true if DFS channel availability check has been
 *        started
 * @links.cac_start_time: timestamp (jiffies) when the dfs state was
 *        entered.
 * @links.cac_time_ms: CAC time in ms
 * @valid_links: bitmap describing what elements of @links are valid
 * @radio_mask: Bitmask of radios that this interface is allowed to operate on.
 */
struct wireless_dev {
        /* hardware description and interface type */
        struct wiphy *wiphy;
        enum nl80211_iftype iftype;

        /* the remainder of this struct should be private to cfg80211 */
        struct list_head list;
        /* may be NULL; see wdev_address() and wdev_running() */
        struct net_device *netdev;

        /* identifies this wireless device in nl80211 if it has no netdev */
        u32 identifier;

        /* management frame registrations and their "needs update" flag */
        struct list_head mgmt_registrations;
        u8 mgmt_registrations_need_update:1;

        bool use_4addr, is_running, registered, registering;

        /* valid only if netdev is NULL; returned by wdev_address() then */
        u8 address[ETH_ALEN] __aligned(sizeof(u16));

        /* currently used for IBSS and SME - might be rearranged later */
        struct cfg80211_conn *conn;
        struct cfg80211_cached_keys *connect_keys;
        enum ieee80211_bss_type conn_bss_type;
        u32 conn_owner_nlportid;

        /* auto-disconnect work and the BSSID it uses */
        struct work_struct disconnect_wk;
        u8 disconnect_bssid[ETH_ALEN];

        /* internal event processing list and its lock */
        struct list_head event_list;
        spinlock_t event_lock;

        /* connected or not (STA mode) */
        u8 connected:1;

        /* powersave mode enabled, and the dynamic powersave timeout */
        bool ps;
        int ps_timeout;

        /* netlink port registered for unexpected class 3 frames (AP mode) */
        u32 ap_unexpected_nlportid;

        /* owner socket port ID and whether that socket went away */
        u32 owner_nlportid;
        bool nl_owner_dead;

#ifdef CONFIG_CFG80211_WEXT
        /* wext data */
        struct {
                struct cfg80211_ibss_params ibss;
                struct cfg80211_connect_params connect;
                struct cfg80211_cached_keys *keys;
                const u8 *ie;
                size_t ie_len;
                u8 bssid[ETH_ALEN];
                u8 prev_bssid[ETH_ALEN];
                u8 ssid[IEEE80211_MAX_SSID_LEN];
                s8 default_key, default_mgmt_key;
                bool prev_bssid_valid;
        } wext;
#endif

        /* CQM RSSI reporting work and the RCU-annotated monitor state */
        struct wiphy_work cqm_rssi_work;
        struct cfg80211_cqm_config __rcu *cqm_config;

        /* peer measurement requests, their lock and the cleanup work */
        struct list_head pmsr_list;
        spinlock_t pmsr_lock;
        struct work_struct pmsr_free_wk;

        /* timestamp of the last unprotected beacon report */
        unsigned long unprot_beacon_reported;

        /* data specific to iftype */
        union {
                struct {
                        u8 connected_addr[ETH_ALEN] __aligned(2);
                        u8 ssid[IEEE80211_MAX_SSID_LEN];
                        u8 ssid_len;
                } client;
                struct {
                        int beacon_interval;
                        struct cfg80211_chan_def preset_chandef;
                        struct cfg80211_chan_def chandef;
                        u8 id[IEEE80211_MAX_MESH_ID_LEN];
                        u8 id_len, id_up_len;
                } mesh;
                struct {
                        struct cfg80211_chan_def preset_chandef;
                        u8 ssid[IEEE80211_MAX_SSID_LEN];
                        u8 ssid_len;
                } ap;
                struct {
                        struct cfg80211_internal_bss *current_bss;
                        struct cfg80211_chan_def chandef;
                        int beacon_interval;
                        u8 ssid[IEEE80211_MAX_SSID_LEN];
                        u8 ssid_len;
                } ibss;
                struct {
                        struct cfg80211_chan_def chandef;
                } ocb;
                struct {
                        u8 cluster_id[ETH_ALEN] __aligned(2);
                } nan;
        } u;

        /* per-link data; valid_links says which elements are valid */
        struct {
                u8 addr[ETH_ALEN] __aligned(2);
                union {
                        struct {
                                unsigned int beacon_interval;
                                struct cfg80211_chan_def chandef;
                        } ap;
                        struct {
                                struct cfg80211_internal_bss *current_bss;
                        } client;
                };

                /* DFS channel availability check (CAC) state */
                bool cac_started;
                unsigned long cac_start_time;
                unsigned int cac_time_ms;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
        /* bitmap of valid links[] elements; see for_each_valid_link() */
        u16 valid_links;

        /* bitmask of radios this interface is allowed to operate on */
        u32 radio_mask;
};

/*
 * Return the address of @wdev: the netdev's dev_addr when a netdev is
 * attached, otherwise the wdev's own address field.
 */
static inline const u8 *wdev_address(struct wireless_dev *wdev)
{
        return wdev->netdev ? wdev->netdev->dev_addr : wdev->address;
}

/*
 * Report whether @wdev is running: netif_running() of the netdev when one
 * is attached, otherwise the wdev's is_running flag.
 */
static inline bool wdev_running(struct wireless_dev *wdev)
{
        return wdev->netdev ? netif_running(wdev->netdev) : wdev->is_running;
}

/**
 * wdev_priv - return wiphy priv from wireless_dev
 *
 * @wdev: The wireless device whose wiphy's priv pointer to return
 * Return: The wiphy priv of @wdev.
 */
static inline void *wdev_priv(struct wireless_dev *wdev)
{
        struct wiphy *owner;

        /* A NULL wdev is a caller bug; checked before it is dereferenced. */
        BUG_ON(!wdev);

        owner = wdev->wiphy;
        return wiphy_priv(owner);
}

/**
 * wdev_chandef - return chandef pointer from wireless_dev
 * @wdev: the wdev
 * @link_id: the link ID for MLO
 *
 * Return: The chandef depending on the mode, or %NULL.
 */
struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev,
                                       unsigned int link_id);

static inline void WARN_INVALID_LINK_ID(struct wireless_dev *wdev,
                                        unsigned int link_id)
{
        WARN_ON(link_id && !wdev->valid_links);
        WARN_ON(wdev->valid_links &&
                !(wdev->valid_links & BIT(link_id)));
}

/*
 * for_each_valid_link - iterate over the valid link IDs of @link_info
 * @link_info: pointer to an object with valid_links and links members
 * @link_id: lvalue used as the loop cursor
 *
 * When valid_links is zero the body runs exactly once, with @link_id == 0.
 * Otherwise it runs for every index of links[] whose bit is set in
 * valid_links.
 *
 * The filter is written as "if (skip) {} else" so that the macro cannot
 * capture an "else" that follows the loop in the caller.
 */
#define for_each_valid_link(link_info, link_id)                        \
        for ((link_id) = 0;                                        \
             (link_id) < ((link_info)->valid_links ?                \
                        ARRAY_SIZE((link_info)->links) : 1);        \
             (link_id)++)                                        \
                if ((link_info)->valid_links &&                        \
                    !((link_info)->valid_links & BIT(link_id))) {} else

/**
 * DOC: Utility functions
 *
 * cfg80211 offers a number of utility functions that can be useful.
 */

/**
 * ieee80211_channel_equal - compare two struct ieee80211_channel
 *
 * @a: 1st struct ieee80211_channel
 * @b: 2nd struct ieee80211_channel
 * Return: true if center frequency and frequency offset of @a and @b match
 */
static inline bool
ieee80211_channel_equal(struct ieee80211_channel *a,
                        struct ieee80211_channel *b)
{
        return (a->center_freq == b->center_freq &&
                a->freq_offset == b->freq_offset);
}

/**
 * ieee80211_channel_to_khz - convert ieee80211_channel to frequency in KHz
 * @chan: struct ieee80211_channel to convert
 * Return: The corresponding frequency (in KHz)
 */
static inline u32
ieee80211_channel_to_khz(const struct ieee80211_channel *chan)
{
        /* center_freq converted via MHZ_TO_KHZ(), then the offset added */
        u32 khz = MHZ_TO_KHZ(chan->center_freq);

        khz += chan->freq_offset;
        return khz;
}

/**
 * ieee80211_channel_to_freq_khz - convert channel number to frequency
 * @chan: channel number
 * @band: band, necessary due to channel number overlap
 * Return: The corresponding frequency (in KHz), or 0 if the conversion failed.
 */
u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band);

/**
 * ieee80211_channel_to_frequency - convert channel number to frequency
 * @chan: channel number
 * @band: band, necessary due to channel number overlap
 * Return: The corresponding frequency (in MHz), or 0 if the conversion failed.
 */
static inline int
ieee80211_channel_to_frequency(int chan, enum nl80211_band band)
{
        return KHZ_TO_MHZ(ieee80211_channel_to_freq_khz(chan, band));
}

/**
 * ieee80211_freq_khz_to_channel - convert frequency to channel number
 * @freq: center frequency in KHz
 * Return: The corresponding channel, or 0 if the conversion failed.
 */
int ieee80211_freq_khz_to_channel(u32 freq);

/**
 * ieee80211_frequency_to_channel - convert frequency to channel number
 * @freq: center frequency in MHz
 * Return: The corresponding channel, or 0 if the conversion failed.
 */
static inline int
ieee80211_frequency_to_channel(int freq)
{
        return ieee80211_freq_khz_to_channel(MHZ_TO_KHZ(freq));
}

/**
 * ieee80211_get_channel_khz - get channel struct from wiphy for specified
 * frequency
 * @wiphy: the struct wiphy to get the channel for
 * @freq: the center frequency (in KHz) of the channel
 * Return: The channel struct from @wiphy at @freq.
 */
struct ieee80211_channel *
ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq);

/**
 * ieee80211_get_channel - get channel struct from wiphy for specified frequency
 *
 * @wiphy: the struct wiphy to get the channel for
 * @freq: the center frequency (in MHz) of the channel
 * Return: The channel struct from @wiphy at @freq.
 */
static inline struct ieee80211_channel *
ieee80211_get_channel(struct wiphy *wiphy, int freq)
{
        return ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(freq));
}

/**
 * cfg80211_channel_is_psc - Check if the channel is a 6 GHz PSC
 * @chan: control channel to check
 *
 * The Preferred Scanning Channels (PSC) are defined in
 * Draft IEEE P802.11ax/D5.0, 26.17.2.3.3
 *
 * Return: %true if channel is a PSC, %false otherwise
 */
static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan)
{
        /*
         * Only 6 GHz channels qualify; among those, the channel number
         * must be 5 modulo 16. The channel lookup is skipped for other
         * bands.
         */
        return chan->band == NL80211_BAND_6GHZ &&
               ieee80211_frequency_to_channel(chan->center_freq) % 16 == 5;
}

/**
 * cfg80211_radio_chandef_valid - Check if the radio supports the chandef
 *
 * @radio: wiphy radio
 * @chandef: chandef for current channel
 *
 * Return: whether or not the given chandef is valid for the given radio
 */
bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio,
                                  const struct cfg80211_chan_def *chandef);

/**
 * cfg80211_wdev_channel_allowed - Check if the wdev may use the channel
 *
 * @wdev: the wireless device
 * @chan: channel to check
 *
 * Return: whether or not the wdev may use the channel
 */
bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev,
                                   struct ieee80211_channel *chan);

/**
 * ieee80211_get_response_rate - get basic rate for a given rate
 *
 * @sband: the band to look for rates in
 * @basic_rates: bitmap of basic rates
 * @bitrate: the bitrate for which to find the basic rate
 *
 * Return: The basic rate corresponding to a given bitrate, that
 * is the next lower bitrate contained in the basic rate map,
 * which is, for this function, given as a bitmap of indices of
 * rates in the band's bitrate table.
 */
const struct ieee80211_rate *
ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
                            u32 basic_rates, int bitrate);

/**
 * ieee80211_mandatory_rates - get mandatory rates for a given band
 * @sband: the band to look for rates in
 *
 * Return: a bitmap of the mandatory rates for the given band, bits
 * are set according to the rate position in the bitrates array.
 */
u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband);

/*
 * Radiotap parsing functions -- for controlled injection support
 *
 * Implemented in net/wireless/radiotap.c
 * Documentation in Documentation/networking/radiotap-headers.rst
 */

/* Alignment and size of one radiotap field, 4 bits each. */
struct radiotap_align_size {
        uint8_t align:4;
        uint8_t size:4;
};

/* Description of one radiotap namespace. */
struct ieee80211_radiotap_namespace {
        /* presumably one align/size entry per bit, n_bits in total - confirm */
        const struct radiotap_align_size *align_size;
        int n_bits;
        /* presumably the vendor OUI and sub-namespace selecting this entry */
        uint32_t oui;
        uint8_t subns;
};

/* A set of radiotap namespace descriptions. */
struct ieee80211_radiotap_vendor_namespaces {
        /* presumably an array of n_ns namespace entries - confirm */
        const struct ieee80211_radiotap_namespace *ns;
        int n_ns;
};

/**
 * struct ieee80211_radiotap_iterator - tracks walk thru present radiotap args
 * @this_arg_index: index of current arg, valid after each successful call
 *        to ieee80211_radiotap_iterator_next()
 * @this_arg: pointer to current radiotap arg; it is valid after each
 *        call to ieee80211_radiotap_iterator_next() but also after
 *        ieee80211_radiotap_iterator_init() where it will point to
 *        the beginning of the actual data portion
 * @this_arg_size: length of the current arg, for convenience
 * @current_namespace: pointer to the current namespace definition
 *        (or internally %NULL if the current namespace is unknown)
 * @is_radiotap_ns: indicates whether the current namespace is the default
 *        radiotap namespace or not
 *
 * @_rtheader: pointer to the radiotap header we are walking through
 * @_max_length: length of radiotap header in cpu byte ordering
 * @_arg_index: next argument index
 * @_arg: next argument pointer
 * @_next_bitmap: internal pointer to next present u32
 * @_bitmap_shifter: internal shifter for curr u32 bitmap, b0 set == arg present
 * @_vns: vendor namespace definitions
 * @_next_ns_data: beginning of the next namespace's data
 * @_reset_on_ext: internal; reset the arg index to 0 when going to the
 *        next bitmap word
 *
 * Describes the radiotap parser state. Fields prefixed with an underscore
 * must not be used by users of the parser, only by the parser internally.
 */

struct ieee80211_radiotap_iterator {
        /* header being walked, vendor namespace definitions, current ns */
        struct ieee80211_radiotap_header *_rtheader;
        const struct ieee80211_radiotap_vendor_namespaces *_vns;
        const struct ieee80211_radiotap_namespace *current_namespace;

        /* next argument pointer, start of the next namespace's data */
        unsigned char *_arg, *_next_ns_data;
        /* next present u32 */
        __le32 *_next_bitmap;

        /* current argument: pointer, index and length */
        unsigned char *this_arg;
        int this_arg_index;
        int this_arg_size;

        /* whether the current namespace is the default radiotap one */
        int is_radiotap_ns;

        /* parser-internal state (underscore prefix): not for parser users */
        int _max_length;
        int _arg_index;
        uint32_t _bitmap_shifter;
        int _reset_on_ext;
};

int
ieee80211_radiotap_iterator_init(struct ieee80211_radiotap_iterator *iterator,
                                 struct ieee80211_radiotap_header *radiotap_header,
                                 int max_length,
                                 const struct ieee80211_radiotap_vendor_namespaces *vns);

int
ieee80211_radiotap_iterator_next(struct ieee80211_radiotap_iterator *iterator);


extern const unsigned char rfc1042_header[6];
extern const unsigned char bridge_tunnel_header[6];

/**
 * ieee80211_get_hdrlen_from_skb - get header length from data
 *
 * @skb: the frame
 *
 * Given an skb with a raw 802.11 header at the data pointer this function
 * returns the 802.11 header length.
 *
 * Return: The 802.11 header length in bytes (not including encryption
 * headers). Or 0 if the data in the sk_buff is too short to contain a valid
 * 802.11 header.
 */
unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb);

/**
 * ieee80211_hdrlen - get header length in bytes from frame control
 * @fc: frame control field in little-endian format
 * Return: The header length in bytes.
 */
unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc);

/**
 * ieee80211_get_mesh_hdrlen - get mesh extension header length
 * @meshhdr: the mesh extension header, only the flags field
 *        (first byte) will be accessed
 * Return: The length of the extension header, which is always at
 * least 6 bytes and at most 18 if address 5 and 6 are present.
 */
unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr);

/**
 * DOC: Data path helpers
 *
 * In addition to generic utilities, cfg80211 also offers
 * functions that help implement the data path for devices
 * that do not do the 802.11/802.3 conversion on the device.
 */

/**
 * ieee80211_data_to_8023_exthdr - convert an 802.11 data frame to 802.3
 * @skb: the 802.11 data frame
 * @ehdr: pointer to a &struct ethhdr that will get the header, instead
 *        of it being pushed into the SKB
 * @addr: the device MAC address
 * @iftype: the virtual interface type
 * @data_offset: offset of payload after the 802.11 header
 * @is_amsdu: true if the 802.11 header is A-MSDU
 * Return: 0 on success. Non-zero on error.
 */
int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
                                  const u8 *addr, enum nl80211_iftype iftype,
                                  u8 data_offset, bool is_amsdu);

/**
 * ieee80211_data_to_8023 - convert an 802.11 data frame to 802.3
 * @skb: the 802.11 data frame
 * @addr: the device MAC address
 * @iftype: the virtual interface type
 * Return: 0 on success. Non-zero on error.
 */
static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
                                         enum nl80211_iftype iftype)
{
        return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype, 0, false);
}

/**
 * ieee80211_is_valid_amsdu - check if subframe lengths of an A-MSDU are valid
 *
 * This is used to detect non-standard A-MSDU frames, e.g. the ones generated
 * by ath10k and ath11k, where the subframe length includes the length of the
 * mesh control field.
 *
 * @skb: The input A-MSDU frame without any headers.
 * @mesh_hdr: the type of mesh header to test
 *        0: non-mesh A-MSDU length field
 *        1: big-endian mesh A-MSDU length field
 *        2: little-endian mesh A-MSDU length field
 * Returns: true if subframe header lengths are valid for the @mesh_hdr mode
 */
bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr);

/**
 * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame
 *
 * Decode an IEEE 802.11 A-MSDU and convert it to a list of 802.3 frames.
 * The @list will be empty if the decode fails. The @skb must be fully
 * header-less before being passed in here; it is freed in this function.
 *
 * @skb: The input A-MSDU frame without any headers.
 * @list: The output list of 802.3 frames. It must be allocated and
 *        initialized by the caller.
 * @addr: The device MAC address.
 * @iftype: The device interface type.
 * @extra_headroom: The hardware extra headroom for SKBs in the @list.
 * @check_da: DA to check in the inner ethernet header, or NULL
 * @check_sa: SA to check in the inner ethernet header, or NULL
 * @mesh_control: see mesh_hdr in ieee80211_is_valid_amsdu
 */
void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
                              const u8 *addr, enum nl80211_iftype iftype,
                              const unsigned int extra_headroom,
                              const u8 *check_da, const u8 *check_sa,
                              u8 mesh_control);

/**
 * ieee80211_get_8023_tunnel_proto - get RFC1042 or bridge tunnel encap protocol
 *
 * Check for RFC1042 or bridge tunnel header and fetch the encapsulated
 * protocol.
 *
 * @hdr: pointer to the MSDU payload
 * @proto: destination pointer to store the protocol
 * Return: true if encapsulation was found
 */
bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto);

/**
 * ieee80211_strip_8023_mesh_hdr - strip mesh header from converted 802.3 frames
 *
 * Strip the mesh header, which was left in by ieee80211_data_to_8023 as part
 * of the MSDU data. Also move any source/destination addresses from the mesh
 * header to the ethernet header (if present).
 *
 * @skb: The 802.3 frame with embedded mesh header
 *
 * Return: 0 on success. Non-zero on error.
 */
int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb);

/**
 * cfg80211_classify8021d - determine the 802.1p/1d tag for a data frame
 * @skb: the data frame
 * @qos_map: Interworking QoS mapping or %NULL if not in use
 * Return: The 802.1p/1d tag.
 */
unsigned int cfg80211_classify8021d(struct sk_buff *skb,
                                    struct cfg80211_qos_map *qos_map);

/**
 * cfg80211_find_elem_match - match information element and byte array in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 * @match: byte array to match
 * @match_len: number of bytes in the match array
 * @match_offset: offset in the IE data where the byte array should match.
 *        Note the difference to cfg80211_find_ie_match() which considers
 *        the offset to start from the element ID byte, but here we take
 *        the data portion instead.
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data) or if the byte array doesn't match; otherwise return the
 * requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data and being large enough for the
 * byte array to match.
 */
const struct element *
cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len,
                         const u8 *match, unsigned int match_len,
                         unsigned int match_offset);

/**
 * cfg80211_find_ie_match - match information element and byte array in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 * @match: byte array to match
 * @match_len: number of bytes in the match array
 * @match_offset: offset in the IE where the byte array should match.
 *        If match_len is zero, this must also be set to zero.
 *        Otherwise this must be set to 2 or more, because the first
 *        byte is the element id, which is already compared to eid, and
 *        the second byte is the IE length.
 *
 * This is a wrapper around cfg80211_find_elem_match(): @match_offset is
 * counted from the element ID byte here, so 2 is subtracted before it is
 * handed on as an offset into the element data. An inconsistent
 * @match_len/@match_offset combination triggers a WARN_ON() and yields %NULL.
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data) or if the byte array doesn't match, or a pointer to the first
 * byte of the requested element, that is the byte containing the
 * element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data and being large enough for the
 * byte array to match.
 */
static inline const u8 *
cfg80211_find_ie_match(u8 eid, const u8 *ies, unsigned int len,
                       const u8 *match, unsigned int match_len,
                       unsigned int match_offset)
{
        unsigned int data_offset;
        bool bad_offset;

        /* With something to match the offset has to skip the ID and length
         * bytes; with nothing to match the offset has to be zero too.
         */
        if (match_len)
                bad_offset = match_offset < 2;
        else
                bad_offset = match_offset != 0;

        if (WARN_ON(bad_offset))
                return NULL;

        /* translate from "offset in the IE" to "offset in the IE data" */
        data_offset = match_offset ? match_offset - 2 : 0;

        return (const void *)cfg80211_find_elem_match(eid, ies, len,
                                                      match, match_len,
                                                      data_offset);
}

/**
 * cfg80211_find_elem - find information element in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Calls cfg80211_find_elem_match() with an empty match array (%NULL,
 * length 0, offset 0). @len is an int here but is passed on as the
 * unsigned int length of cfg80211_find_elem_match(), so callers must not
 * pass a negative value.
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data); otherwise return the requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const struct element *
cfg80211_find_elem(u8 eid, const u8 *ies, int len)
{
        const struct element *elem;

        elem = cfg80211_find_elem_match(eid, ies, len, NULL, 0, 0);

        return elem;
}

/**
 * cfg80211_find_ie - find information element in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Calls cfg80211_find_ie_match() with an empty match array (%NULL,
 * length 0, offset 0). @len is an int here but is passed on as an
 * unsigned int length, so callers must not pass a negative value.
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data), or a pointer to the first byte of the requested
 * element, that is the byte containing the element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
{
        const u8 *ie;

        ie = cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0);

        return ie;
}

/**
 * cfg80211_find_ext_elem - find information element with EID Extension in data
 *
 * @ext_eid: element ID Extension
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Looks for a %WLAN_EID_EXTENSION element via cfg80211_find_elem_match(),
 * using @ext_eid as a one-byte match array at offset 0 of the element data.
 * @len is an int here but is passed on as an unsigned int length, so
 * callers must not pass a negative value.
 *
 * Return: %NULL if the extended element could not be found or if
 * the element is invalid (claims to be longer than the given
 * data) or if the extension ID byte doesn't match; otherwise return the
 * requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const struct element *
cfg80211_find_ext_elem(u8 ext_eid, const u8 *ies, int len)
{
        const u8 *ext_id_match = &ext_eid;

        return cfg80211_find_elem_match(WLAN_EID_EXTENSION, ies, len,
                                        ext_id_match, sizeof(ext_eid), 0);
}

/**
 * cfg80211_find_ext_ie - find information element with EID Extension in data
 *
 * @ext_eid: element ID Extension
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Looks for a %WLAN_EID_EXTENSION element via cfg80211_find_ie_match(),
 * using @ext_eid as a one-byte match array at offset 2 from the element ID
 * byte, i.e. right behind the ID and length bytes. @len is an int here but
 * is passed on as an unsigned int length, so callers must not pass a
 * negative value.
 *
 * Return: %NULL if the extended element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data), or a pointer to the first byte of the requested
 * element, that is the byte containing the element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const u8 *cfg80211_find_ext_ie(u8 ext_eid, const u8 *ies, int len)
{
        const u8 *ext_id_match = &ext_eid;

        return cfg80211_find_ie_match(WLAN_EID_EXTENSION, ies, len,
                                      ext_id_match, sizeof(ext_eid), 2);
}

/**
 * cfg80211_find_vendor_elem - find vendor specific information element in data
 *
 * @oui: vendor OUI
 * @oui_type: vendor-specific OUI type (must be < 0xff), negative means any
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Return: %NULL if the vendor specific element ID could not be found or if the
 * element is invalid (claims to be longer than the given data); otherwise
 * return the element structure for the requested element.
 *
 * Note: There are no checks on the element length other than having to fit into
 * the given data.
 */
const struct element *cfg80211_find_vendor_elem(unsigned int oui, int oui_type,
                                                const u8 *ies,
                                                unsigned int len);

/**
 * cfg80211_find_vendor_ie - find vendor specific information element in data
 *
 * @oui: vendor OUI
 * @oui_type: vendor-specific OUI type (must be < 0xff), negative means any
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Same lookup as cfg80211_find_vendor_elem(), with the result handed back
 * as a raw byte pointer instead of a &struct element pointer.
 *
 * Return: %NULL if the vendor specific element ID could not be found or if the
 * element is invalid (claims to be longer than the given data), or a pointer to
 * the first byte of the requested element, that is the byte containing the
 * element ID.
 *
 * Note: There are no checks on the element length other than having to fit into
 * the given data.
 */
static inline const u8 *
cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
                        const u8 *ies, unsigned int len)
{
        const struct element *elem;

        elem = cfg80211_find_vendor_elem(oui, oui_type, ies, len);

        return (const void *)elem;
}

/**
 * enum cfg80211_rnr_iter_ret - reduced neighbor report iteration state
 * @RNR_ITER_CONTINUE: continue iterating with the next entry
 * @RNR_ITER_BREAK: break iteration and return success
 * @RNR_ITER_ERROR: break iteration and return error
 *
 * These are the values returned by the iteration function that is passed
 * to cfg80211_iter_rnr().
 */
enum cfg80211_rnr_iter_ret {
        RNR_ITER_CONTINUE,
        RNR_ITER_BREAK,
        RNR_ITER_ERROR,
};

/**
 * cfg80211_iter_rnr - iterate reduced neighbor report entries
 * @elems: the frame elements to iterate RNR elements and then
 *        their entries in
 * @elems_len: length of the elements
 * @iter: iteration function, see also &enum cfg80211_rnr_iter_ret
 *        for the return value
 * @iter_data: additional data passed to the iteration function
 * Return: %true on success (after successfully iterating all entries
 *        or if the iteration function returned %RNR_ITER_BREAK),
 *        %false on error (iteration function returned %RNR_ITER_ERROR
 *        or elements were malformed.)
 */
bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len,
                       enum cfg80211_rnr_iter_ret
                       (*iter)(void *data, u8 type,
                               const struct ieee80211_neighbor_ap_info *info,
                               const u8 *tbtt_info, u8 tbtt_info_len),
                       void *iter_data);

/**
 * cfg80211_defragment_element - Defrag the given element data into a buffer
 *
 * @elem: the element to defragment
 * @ies: elements where @elem is contained
 * @ieslen: length of @ies
 * @data: buffer to store element data, or %NULL to just determine size
 * @data_len: length of @data, or 0
 * @frag_id: the element ID of fragments
 *
 * Return: length of @data, or -EINVAL on error
 *
 * Copy out all data from an element that may be fragmented into @data, while
 * skipping all headers.
 *
 * The function uses memmove() internally. It is acceptable to defragment an
 * element in-place.
 */
ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies,
                                    size_t ieslen, u8 *data, size_t data_len,
                                    u8 frag_id);

/**
 * cfg80211_send_layer2_update - send layer 2 update frame
 *
 * @dev: network device
 * @addr: STA MAC address
 *
 * Wireless drivers can use this function to update forwarding tables in bridge
 * devices upon STA association.
 */
void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr);

/**
 * DOC: Regulatory enforcement infrastructure
 *
 * TODO
 */

/**
 * regulatory_hint - driver hint to the wireless core a regulatory domain
 * @wiphy: the wireless device giving the hint (used only for reporting
 *        conflicts)
 * @alpha2: the ISO/IEC 3166 alpha2 the driver claims its regulatory domain
 *        should be in. If @rd is set this should be NULL. Note that if you
 *        set this to NULL you should still set rd->alpha2 to some accepted
 *        alpha2.
 *
 * Wireless drivers can use this function to hint to the wireless core
 * what it believes should be the current regulatory domain by
 * giving it an ISO/IEC 3166 alpha2 country code it knows its regulatory
 * domain should be in or by providing a completely built regulatory domain.
 * If the driver provides an ISO/IEC 3166 alpha2 userspace will be queried
 * for a regulatory domain structure for the respective country.
 *
 * The wiphy must have been registered to cfg80211 prior to this call.
 * For cfg80211 drivers this means you must first use wiphy_register(),
 * for mac80211 drivers you must first use ieee80211_register_hw().
 *
 * Drivers should check the return value, it's possible you can get
 * an -ENOMEM.
 *
 * Return: 0 on success. -ENOMEM.
 */
int regulatory_hint(struct wiphy *wiphy, const char *alpha2);

/**
 * regulatory_set_wiphy_regd - set regdom info for self managed drivers
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @rd: the regulatory domain information to use for this wiphy
 *
 * Set the regulatory domain information for self-managed wiphys, only they
 * may use this function. See %REGULATORY_WIPHY_SELF_MANAGED for more
 * information.
 *
 * Return: 0 on success. -EINVAL, -EPERM
 */
int regulatory_set_wiphy_regd(struct wiphy *wiphy,
                              struct ieee80211_regdomain *rd);

/**
 * regulatory_set_wiphy_regd_sync - set regdom for self-managed drivers
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @rd: the regulatory domain information to use for this wiphy
 *
 * This function requires the RTNL and the wiphy mutex to be held and
 * applies the new regdomain synchronously to this wiphy. For more details
 * see regulatory_set_wiphy_regd().
 *
 * Return: 0 on success. -EINVAL, -EPERM
 */
int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy,
                                   struct ieee80211_regdomain *rd);

/**
 * wiphy_apply_custom_regulatory - apply a custom driver regulatory domain
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @regd: the custom regulatory domain to use for this wiphy
 *
 * Drivers can sometimes have custom regulatory domains which do not apply
 * to a specific country. Drivers can use this to apply such custom regulatory
 * domains. This routine must be called prior to wiphy registration. The
 * custom regulatory domain will be trusted completely and as such previous
 * default channel settings will be disregarded. If no rule is found for a
 * channel on the regulatory domain the channel will be disabled.
 * Drivers using this for a wiphy should also set the wiphy flag
 * REGULATORY_CUSTOM_REG or cfg80211 will set it for the wiphy
 * that called this helper.
 */
void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
                                   const struct ieee80211_regdomain *regd);

/**
 * freq_reg_info - get regulatory information for the given frequency
 * @wiphy: the wiphy for which we want to process this rule
 * @center_freq: Frequency in kHz for which we want regulatory information
 *
 * Use this function to get the regulatory rule for a specific frequency on
 * a given wireless device. If the device has a specific regulatory domain
 * it wants to follow we respect that unless a country IE has been received
 * and processed already.
 *
 * Return: A valid pointer, or, when an error occurs, for example if no rule
 * can be found, the return value is encoded using ERR_PTR(). Use IS_ERR() to
 * check and PTR_ERR() to obtain the numeric return value. The numeric return
 * value will be -ERANGE if we determine the given center_freq does not even
 * have a regulatory rule for a frequency range in the center_freq's band.
 * See freq_in_rule_band() for our current definition of a band -- this is
 * purely subjective and right now it's 802.11 specific.
 */
const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
                                               u32 center_freq);

/**
 * reg_initiator_name - map regulatory request initiator enum to name
 * @initiator: the regulatory request initiator
 *
 * You can use this to map the regulatory request initiator enum to a
 * proper string representation.
 *
 * Return: pointer to string representation of the initiator
 */
const char *reg_initiator_name(enum nl80211_reg_initiator initiator);

/**
 * regulatory_pre_cac_allowed - check if pre-CAC allowed in the current regdom
 * @wiphy: wiphy for which pre-CAC capability is checked.
 *
 * Pre-CAC is allowed only in some regdomains (notably ETSI).
 *
 * Return: %true if allowed, %false otherwise
 */
bool regulatory_pre_cac_allowed(struct wiphy *wiphy);

/**
 * DOC: Internal regulatory db functions
 *
 */

/**
 * reg_query_regdb_wmm - Query internal regulatory db for wmm rule
 * Regulatory self-managed drivers can use it to proactively query wmm rules.
 *
 * @alpha2: the ISO/IEC 3166 alpha2 wmm rule to be queried.
 * @freq: the frequency (in MHz) to be queried.
 * @rule: pointer to store the wmm rule from the regulatory db.
 *
 * Self-managed wireless drivers can use this function to query
 * the internal regulatory database to check whether the given
 * ISO/IEC 3166 alpha2 country and freq have wmm rule limitations.
 *
 * Drivers should check the return value, it's possible you can get
 * an -ENODATA.
 *
 * Return: 0 on success. -ENODATA.
 */
int reg_query_regdb_wmm(char *alpha2, int freq,
                        struct ieee80211_reg_rule *rule);

/*
 * callbacks for asynchronous cfg80211 methods, notification
 * functions and BSS handling helpers
 */

/**
 * cfg80211_scan_done - notify that scan finished
 *
 * @request: the corresponding scan request
 * @info: information about the completed scan
 */
void cfg80211_scan_done(struct cfg80211_scan_request *request,
                        struct cfg80211_scan_info *info);

/**
 * cfg80211_sched_scan_results - notify that new scan results are available
 *
 * @wiphy: the wiphy which got scheduled scan results
 * @reqid: identifier for the related scheduled scan request
 */
void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_sched_scan_stopped - notify that the scheduled scan has stopped
 *
 * @wiphy: the wiphy on which the scheduled scan stopped
 * @reqid: identifier for the related scheduled scan request
 *
 * The driver can call this function to inform cfg80211 that the
 * scheduled scan had to be stopped, for whatever reason.  The driver
 * is then called back via the sched_scan_stop operation when done.
 */
void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_sched_scan_stopped_locked - notify that the scheduled scan has stopped
 *
 * @wiphy: the wiphy on which the scheduled scan stopped
 * @reqid: identifier for the related scheduled scan request
 *
 * The driver can call this function to inform cfg80211 that the
 * scheduled scan had to be stopped, for whatever reason.  The driver
 * is then called back via the sched_scan_stop operation when done.
 * This function should be called with the wiphy mutex held.
 */
void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_inform_bss_frame_data - inform cfg80211 of a received BSS frame
 * @wiphy: the wiphy reporting the BSS
 * @data: the BSS metadata
 * @mgmt: the management frame (probe response or beacon)
 * @len: length of the management frame
 * @gfp: context flags
 *
 * This informs cfg80211 that BSS information was found and
 * the BSS should be updated/added.
 *
 * Return: A referenced struct, must be released with cfg80211_put_bss()!
 * Or %NULL on error.
 */
struct cfg80211_bss * __must_check
cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
                               struct cfg80211_inform_bss *data,
                               struct ieee80211_mgmt *mgmt, size_t len,
                               gfp_t gfp);

/**
 * cfg80211_inform_bss_frame - inform cfg80211 of a received BSS frame
 * @wiphy: the wiphy reporting the BSS
 * @rx_channel: stored as the channel (.chan) of the BSS metadata
 * @mgmt: the management frame (probe response or beacon)
 * @len: length of the management frame
 * @signal: stored as the signal (.signal) of the BSS metadata
 * @gfp: context flags
 *
 * Builds a &struct cfg80211_inform_bss holding only @rx_channel and @signal
 * (all other fields zero-initialized) and passes it on to
 * cfg80211_inform_bss_frame_data().
 *
 * Return: A referenced struct, must be released with cfg80211_put_bss()!
 * Or %NULL on error.
 */
static inline struct cfg80211_bss * __must_check
cfg80211_inform_bss_frame(struct wiphy *wiphy,
                          struct ieee80211_channel *rx_channel,
                          struct ieee80211_mgmt *mgmt, size_t len,
                          s32 signal, gfp_t gfp)
{
        struct cfg80211_inform_bss data = { .chan = rx_channel, .signal = signal };
        struct cfg80211_bss *bss;

        bss = cfg80211_inform_bss_frame_data(wiphy, &data, mgmt, len, gfp);

        return bss;
}

/**
 * cfg80211_gen_new_bssid - generate a nontransmitted BSSID for multi-BSSID
 * @bssid: transmitter BSSID
 * @max_bssid: max BSSID indicator, taken from Multiple BSSID element
 * @mbssid_index: BSSID index, taken from Multiple BSSID index element
 * @new_bssid: calculated nontransmitted BSSID
 *
 * The low @max_bssid bits of @bssid are replaced by the sum of those bits
 * and @mbssid_index, truncated to @max_bssid bits; all higher bits are kept
 * as they are. The result is written to @new_bssid.
 *
 * A @max_bssid of 0 leaves the address unchanged and a @max_bssid of 64 or
 * more selects the whole value, so no input causes an out-of-range shift.
 */
static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid,
                                          u8 mbssid_index, u8 *new_bssid)
{
        u64 bssid_u64 = ether_addr_to_u64(bssid);
        /* Low max_bssid bits set. Built by hand rather than with
         * GENMASK_ULL(max_bssid - 1, 0), which would shift by an
         * out-of-range count for max_bssid == 0 or max_bssid > 64.
         */
        u64 mask = max_bssid >= 64 ? ~0ULL : (1ULL << max_bssid) - 1;
        u64 new_bssid_u64;

        /* keep the part of the address that is common to all BSSIDs */
        new_bssid_u64 = bssid_u64 & ~mask;

        /* add the index to the variable part, wrapping within the mask */
        new_bssid_u64 |= ((bssid_u64 & mask) + mbssid_index) & mask;

        u64_to_ether_addr(new_bssid_u64, new_bssid);
}

/**
 * cfg80211_is_element_inherited - returns if element ID should be inherited
 * @element: element to check
 * @non_inherit_element: non inheritance element
 *
 * Return: %true if should be inherited, %false otherwise
 */
bool cfg80211_is_element_inherited(const struct element *element,
                                   const struct element *non_inherit_element);

/**
 * cfg80211_merge_profile - merges a MBSSID profile if it is split between IEs
 * @ie: ies
 * @ielen: length of IEs
 * @mbssid_elem: current MBSSID element
 * @sub_elem: current MBSSID subelement (profile)
 * @merged_ie: location of the merged profile
 * @max_copy_len: max merged profile length
 *
 * Return: the number of bytes merged
 */
size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
                              const struct element *mbssid_elem,
                              const struct element *sub_elem,
                              u8 *merged_ie, size_t max_copy_len);

/**
 * enum cfg80211_bss_frame_type - frame type that the BSS data came from
 * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is
 *        from a beacon or probe response
 * @CFG80211_BSS_FTYPE_BEACON: data comes from a beacon
 * @CFG80211_BSS_FTYPE_PRESP: data comes from a probe response
 * @CFG80211_BSS_FTYPE_S1G_BEACON: data comes from an S1G beacon
 *
 * This is the type of the ftype argument of cfg80211_inform_bss_data()
 * and cfg80211_inform_bss().
 */
enum cfg80211_bss_frame_type {
        CFG80211_BSS_FTYPE_UNKNOWN,
        CFG80211_BSS_FTYPE_BEACON,
        CFG80211_BSS_FTYPE_PRESP,
        CFG80211_BSS_FTYPE_S1G_BEACON,
};

/**
 * cfg80211_get_ies_channel_number - returns the channel number from ies
 * @ie: IEs
 * @ielen: length of IEs
 * @band: enum nl80211_band of the channel
 *
 * Return: the channel number, or -1 if none could be determined.
 */
int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen,
                                    enum nl80211_band band);

/**
 * cfg80211_ssid_eq - compare two SSIDs
 * @a: first SSID
 * @b: second SSID
 *
 * Two SSIDs are equal when they have the same length and the same
 * bytes over that length. Passing a %NULL pointer for either argument
 * triggers a WARN_ON() and is reported as "not equal".
 *
 * Return: %true if SSIDs are equal, %false otherwise.
 */
static inline bool
cfg80211_ssid_eq(struct cfg80211_ssid *a, struct cfg80211_ssid *b)
{
        if (WARN_ON(!a || !b))
                return false;

        /* lengths are compared first, so memcmp() only sees equal sizes */
        return a->ssid_len == b->ssid_len &&
               !memcmp(a->ssid, b->ssid, a->ssid_len);
}

/**
 * cfg80211_inform_bss_data - inform cfg80211 of a new BSS
 *
 * @wiphy: the wiphy reporting the BSS
 * @data: the BSS metadata
 * @ftype: frame type (if known)
 * @bssid: the BSSID of the BSS
 * @tsf: the TSF sent by the peer in the beacon/probe response (or 0)
 * @capability: the capability field sent by the peer
 * @beacon_interval: the beacon interval announced by the peer
 * @ie: additional IEs sent by the peer
 * @ielen: length of the additional IEs
 * @gfp: context flags
 *
 * This informs cfg80211 that BSS information was found and
 * the BSS should be updated/added.
 *
 * Return: A referenced struct, must be released with cfg80211_put_bss()!
 * Or %NULL on error.
 */
struct cfg80211_bss * __must_check
cfg80211_inform_bss_data(struct wiphy *wiphy,
                         struct cfg80211_inform_bss *data,
                         enum cfg80211_bss_frame_type ftype,
                         const u8 *bssid, u64 tsf, u16 capability,
                         u16 beacon_interval, const u8 *ie, size_t ielen,
                         gfp_t gfp);

/**
 * cfg80211_inform_bss - inform cfg80211 of a new BSS
 *
 * @wiphy: the wiphy reporting the BSS
 * @rx_channel: stored as the channel (.chan) of the BSS metadata
 * @ftype: frame type (if known)
 * @bssid: the BSSID of the BSS
 * @tsf: the TSF sent by the peer in the beacon/probe response (or 0)
 * @capability: the capability field sent by the peer
 * @beacon_interval: the beacon interval announced by the peer
 * @ie: additional IEs sent by the peer
 * @ielen: length of the additional IEs
 * @signal: stored as the signal (.signal) of the BSS metadata
 * @gfp: context flags
 *
 * Builds a &struct cfg80211_inform_bss holding only @rx_channel and @signal
 * (all other fields zero-initialized) and passes it on to
 * cfg80211_inform_bss_data().
 *
 * Return: A referenced struct, must be released with cfg80211_put_bss()!
 * Or %NULL on error.
 */
static inline struct cfg80211_bss * __must_check
cfg80211_inform_bss(struct wiphy *wiphy,
                    struct ieee80211_channel *rx_channel,
                    enum cfg80211_bss_frame_type ftype,
                    const u8 *bssid, u64 tsf, u16 capability,
                    u16 beacon_interval, const u8 *ie, size_t ielen,
                    s32 signal, gfp_t gfp)
{
        struct cfg80211_inform_bss data = { .chan = rx_channel, .signal = signal };
        struct cfg80211_bss *bss;

        bss = cfg80211_inform_bss_data(wiphy, &data, ftype, bssid, tsf,
                                       capability, beacon_interval, ie, ielen,
                                       gfp);

        return bss;
}

/**
 * __cfg80211_get_bss - get a BSS reference
 * @wiphy: the wiphy this BSS struct belongs to
 * @channel: the channel to search on (or %NULL)
 * @bssid: the desired BSSID (or %NULL)
 * @ssid: the desired SSID (or %NULL)
 * @ssid_len: length of the SSID (or 0)
 * @bss_type: type of BSS, see &enum ieee80211_bss_type
 * @privacy: privacy filter, see &enum ieee80211_privacy
 * @use_for: indicates which use is intended
 *
 * Return: Reference-counted BSS on success. %NULL on error.
 */
struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy,
                                        struct ieee80211_channel *channel,
                                        const u8 *bssid,
                                        const u8 *ssid, size_t ssid_len,
                                        enum ieee80211_bss_type bss_type,
                                        enum ieee80211_privacy privacy,
                                        u32 use_for);

/**
 * cfg80211_get_bss - get a BSS reference
 * @wiphy: the wiphy this BSS struct belongs to
 * @channel: the channel to search on (or %NULL)
 * @bssid: the desired BSSID (or %NULL)
 * @ssid: the desired SSID (or %NULL)
 * @ssid_len: length of the SSID (or 0)
 * @bss_type: type of BSS, see &enum ieee80211_bss_type
 * @privacy: privacy filter, see &enum ieee80211_privacy
 *
 * Same as __cfg80211_get_bss() with the intended use fixed to regular
 * usage, %NL80211_BSS_USE_FOR_NORMAL.
 *
 * Return: Reference-counted BSS on success. %NULL on error.
 */
static inline struct cfg80211_bss *
cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel,
                 const u8 *bssid, const u8 *ssid, size_t ssid_len,
                 enum ieee80211_bss_type bss_type,
                 enum ieee80211_privacy privacy)
{
        const u32 use_for = NL80211_BSS_USE_FOR_NORMAL;

        return __cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len,
                                  bss_type, privacy, use_for);
}

/**
 * cfg80211_get_ibss - get a reference to an IBSS-type BSS
 * @wiphy: the wiphy this BSS struct belongs to
 * @channel: passed on as the channel to search on
 * @ssid: passed on as the desired SSID
 * @ssid_len: length of the SSID
 *
 * Calls cfg80211_get_bss() with no BSSID (%NULL), the BSS type fixed to
 * %IEEE80211_BSS_TYPE_IBSS and the privacy filter %IEEE80211_PRIVACY_ANY.
 *
 * Return: the result of cfg80211_get_bss(): Reference-counted BSS on
 * success. %NULL on error.
 */
static inline struct cfg80211_bss *
cfg80211_get_ibss(struct wiphy *wiphy,
                  struct ieee80211_channel *channel,
                  const u8 *ssid, size_t ssid_len)
{
        struct cfg80211_bss *ibss;

        ibss = cfg80211_get_bss(wiphy, channel, NULL, ssid, ssid_len,
                                IEEE80211_BSS_TYPE_IBSS,
                                IEEE80211_PRIVACY_ANY);

        return ibss;
}

/**
 * cfg80211_ref_bss - reference BSS struct
 * @wiphy: the wiphy this BSS struct belongs to
 * @bss: the BSS struct to reference
 *
 * Increments the refcount of the given BSS struct.
 */
void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_put_bss - unref BSS struct
 * @wiphy: the wiphy this BSS struct belongs to
 * @bss: the BSS struct
 *
 * Decrements the refcount of the given BSS struct.
 */
void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_unlink_bss - unlink BSS from internal data structures
 * @wiphy: the wiphy
 * @bss: the bss to remove
 *
 * This function removes the given BSS from the internal data structures
 * thereby making it no longer show up in scan results etc. Use this
 * function when you detect a BSS is gone. Normally BSSes will also time
 * out, so it is not necessary to use this function at all.
 */
void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_bss_iter - iterate all BSS entries
 *
 * This function iterates over the BSS entries associated with the given wiphy
 * and calls the callback for the iterated BSS. The iterator function is not
 * allowed to call functions that might modify the internal state of the BSS DB.
 *
 * @wiphy: the wiphy
 * @chandef: if given, the iterator function will be called only if the channel
 *     of the currently iterated BSS is a subset of the given channel.
 * @iter: the iterator function to call
 * @iter_data: an argument to the iterator function
 */
void cfg80211_bss_iter(struct wiphy *wiphy,
                       struct cfg80211_chan_def *chandef,
                       void (*iter)(struct wiphy *wiphy,
                                    struct cfg80211_bss *bss,
                                    void *data),
                       void *iter_data);

/**
 * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame
 * @dev: network device
 * @buf: authentication frame (header + body)
 * @len: length of the frame data
 *
 * This function is called whenever an authentication, disassociation or
 * deauthentication frame has been received and processed in station mode.
 * After being asked to authenticate via cfg80211_ops::auth() the driver must
 * call either this function or cfg80211_auth_timeout().
 * After being asked to associate via cfg80211_ops::assoc() the driver must
 * call either this function or cfg80211_auth_timeout().
 * While connected, the driver must call this for received and processed
 * disassociation and deauthentication frames. If the frame couldn't be used
 * because it was unprotected, the driver must call the function
 * cfg80211_rx_unprot_mlme_mgmt() instead.
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len);

/**
 * cfg80211_auth_timeout - notification of timed out authentication
 * @dev: network device
 * @addr: The MAC address of the device with which the authentication timed out
 *
 * This function may sleep. The caller must hold the corresponding wdev's
 * mutex.
 */
void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);

/**
 * struct cfg80211_rx_assoc_resp_data - association response data
 * @buf: (Re)Association Response frame (header + body)
 * @len: length of the frame data
 * @uapsd_queues: bitmap of queues configured for uapsd. Same format
 *        as the AC bitmap in the QoS info field
 * @req_ies: information elements from the (Re)Association Request frame
 * @req_ies_len: length of req_ies data
 * @ap_mld_addr: AP MLD address (in case of MLO)
 * @links: per-link information indexed by link ID, use links[0] for
 *        non-MLO connections
 * @links.addr: MAC address for this link, 2-byte aligned
 *        (NOTE(review): presumably the local link address for MLO
 *        connections - confirm against the users of this struct)
 * @links.bss: the BSS that association was requested with, ownership of the
 *      pointer moves to cfg80211 in the call to cfg80211_rx_assoc_resp()
 * @links.status: Set this (along with a BSS pointer) for links that
 *        were rejected by the AP.
 *
 * This is the data argument of cfg80211_rx_assoc_resp().
 */
struct cfg80211_rx_assoc_resp_data {
        const u8 *buf;
        size_t len;
        const u8 *req_ies;
        size_t req_ies_len;
        int uapsd_queues;
        const u8 *ap_mld_addr;
        struct {
                u8 addr[ETH_ALEN] __aligned(2);
                struct cfg80211_bss *bss;
                u16 status;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
};

/**
 * cfg80211_rx_assoc_resp - notification of processed association response
 * @dev: network device
 * @data: association response data, &struct cfg80211_rx_assoc_resp_data
 *
 * After being asked to associate via cfg80211_ops::assoc() the driver must
 * call either this function or cfg80211_auth_timeout().
 *
 * NOTE(review): cfg80211_auth_timeout() is documented as the notification
 * for a timed out authentication; for a failed association
 * cfg80211_assoc_failure() looks like the intended counterpart - confirm.
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_rx_assoc_resp(struct net_device *dev,
                            const struct cfg80211_rx_assoc_resp_data *data);

/**
 * struct cfg80211_assoc_failure - association failure data
 * @ap_mld_addr: AP MLD address, or %NULL
 * @bss: array of BSSes with %IEEE80211_MLD_MAX_NUM_LINKS entries, must use
 *        entry 0 for non-MLO connections (@ap_mld_addr is %NULL)
 * @timeout: indicates the association failed due to timeout, otherwise
 *        the association was abandoned for a reason reported through some
 *        other API (e.g. deauth RX)
 */
struct cfg80211_assoc_failure {
        const u8 *ap_mld_addr;
        struct cfg80211_bss *bss[IEEE80211_MLD_MAX_NUM_LINKS];
        bool timeout;
};

/**
 * cfg80211_assoc_failure - notification of association failure
 * @dev: network device
 * @data: data describing the association failure
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_assoc_failure(struct net_device *dev,
                            struct cfg80211_assoc_failure *data);

/**
 * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame
 * @dev: network device
 * @buf: 802.11 frame (header + body)
 * @len: length of the frame data
 * @reconnect: immediate reconnect is desired (include the nl80211 attribute)
 *
 * This function is called whenever deauthentication has been processed in
 * station mode. This includes both received deauthentication frames and
 * locally generated ones. This function may sleep. The caller must hold the
 * corresponding wdev's mutex.
 */
void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len,
                           bool reconnect);

/**
 * cfg80211_rx_unprot_mlme_mgmt - notification of unprotected mlme mgmt frame
 * @dev: network device
 * @buf: received management frame (header + body)
 * @len: length of the frame data
 *
 * This function is called whenever a received deauthentication or disassoc
 * frame has been dropped in station mode because of MFP being used but the
 * frame was not protected. This is also used to notify reception of a Beacon
 * frame that was dropped because it did not include a valid MME MIC while
 * beacon protection was enabled (BIGTK configured in station mode).
 *
 * This function may sleep.
 */
void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev,
                                  const u8 *buf, size_t len);

/**
 * cfg80211_michael_mic_failure - notification of Michael MIC failure (TKIP)
 * @dev: network device
 * @addr: The source MAC address of the frame
 * @key_type: The key type that the received frame used
 * @key_id: Key identifier (0..3). Can be -1 if missing.
 * @tsc: The TSC value of the frame that generated the MIC failure (6 octets)
 * @gfp: allocation flags
 *
 * This function is called whenever the local MAC detects a MIC failure in a
 * received frame. This matches with MLME-MICHAELMICFAILURE.indication()
 * primitive.
 */
void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr,
                                  enum nl80211_key_type key_type, int key_id,
                                  const u8 *tsc, gfp_t gfp);

/**
 * cfg80211_ibss_joined - notify cfg80211 that device joined an IBSS
 *
 * @dev: network device
 * @bssid: the BSSID of the IBSS joined
 * @channel: the channel of the IBSS joined
 * @gfp: allocation flags
 *
 * This function notifies cfg80211 that the device joined an IBSS or
 * switched to a different BSSID. Before this function can be called,
 * either a beacon has to have been received from the IBSS, or one of
 * the cfg80211_inform_bss{,_frame} functions must have been called
 * with the locally generated beacon -- this guarantees that there is
 * always a scan result for this IBSS. cfg80211 will handle the rest.
 */
void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
                          struct ieee80211_channel *channel, gfp_t gfp);

/**
 * cfg80211_notify_new_peer_candidate - notify cfg80211 of a new mesh peer
 *                                         candidate
 *
 * @dev: network device
 * @macaddr: the MAC address of the new candidate
 * @ie: information elements advertised by the peer candidate
 * @ie_len: length of the information elements buffer
 * @sig_dbm: signal level in dBm
 * @gfp: allocation flags
 *
 * This function notifies cfg80211 that the mesh peer candidate has been
 * detected, most likely via a beacon or, less likely, via a probe response.
 * cfg80211 then sends a notification to userspace.
 */
void cfg80211_notify_new_peer_candidate(struct net_device *dev,
                const u8 *macaddr, const u8 *ie, u8 ie_len,
                int sig_dbm, gfp_t gfp);

/**
 * DOC: RFkill integration
 *
 * RFkill integration in cfg80211 is almost invisible to drivers,
 * as cfg80211 automatically registers an rfkill instance for each
 * wireless device it knows about. Soft kill is also translated
 * into disconnecting and turning all interfaces off. Drivers are
 * expected to turn off the device when all interfaces are down.
 *
 * However, devices may have a hard RFkill line, in which case they
 * also need to interact with the rfkill subsystem, via cfg80211.
 * They can do this with a few helper functions documented here.
 */

/**
 * wiphy_rfkill_set_hw_state_reason - notify cfg80211 about hw block state
 * @wiphy: the wiphy
 * @blocked: block status
 * @reason: one of reasons in &enum rfkill_hard_block_reasons
 */
void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked,
                                      enum rfkill_hard_block_reasons reason);

/**
 * wiphy_rfkill_set_hw_state - notify cfg80211 about hw block state
 * @wiphy: the wiphy
 * @blocked: block status
 *
 * Shorthand for wiphy_rfkill_set_hw_state_reason() with the reason fixed
 * to %RFKILL_HARD_BLOCK_SIGNAL.
 */
static inline void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked)
{
        const enum rfkill_hard_block_reasons reason = RFKILL_HARD_BLOCK_SIGNAL;

        wiphy_rfkill_set_hw_state_reason(wiphy, blocked, reason);
}

/**
 * wiphy_rfkill_start_polling - start polling rfkill
 * @wiphy: the wiphy
 */
void wiphy_rfkill_start_polling(struct wiphy *wiphy);

/**
 * wiphy_rfkill_stop_polling - stop polling rfkill
 * @wiphy: the wiphy
 *
 * Implemented by handing the wiphy's rfkill instance to
 * rfkill_pause_polling().
 */
static inline void wiphy_rfkill_stop_polling(struct wiphy *wiphy)
{
        struct rfkill *rfkill = wiphy->rfkill;

        rfkill_pause_polling(rfkill);
}

/**
 * DOC: Vendor commands
 *
 * Occasionally, there are special protocol or firmware features that
 * can't be implemented very openly. For this and similar cases, the
 * vendor command functionality allows implementing the features with
 * (typically closed-source) userspace and firmware, using nl80211 as
 * the configuration mechanism.
 *
 * A driver supporting vendor commands must register them as an array
 * in struct wiphy, with handlers for each one. Each command has an
 * OUI and sub command ID to identify it.
 *
 * Note that this feature should not be (ab)used to implement protocol
 * features that could openly be shared across drivers. In particular,
 * it must never be required to use vendor commands to implement any
 * "normal" functionality that higher-level userspace like connection
 * managers etc. need.
 */

/*
 * Low-level helpers behind the static inline vendor command and testmode
 * wrappers in this header. The wrappers supply the nl80211 command (@cmd)
 * and the attribute (@attr) that the payload is placed in.
 */

/* Allocate a reply skb for @cmd; @approxlen bounds the expected payload. */
struct sk_buff *__cfg80211_alloc_reply_skb(struct wiphy *wiphy,
                                           enum nl80211_commands cmd,
                                           enum nl80211_attrs attr,
                                           int approxlen);

/*
 * Allocate an event skb for @cmd. The wrappers pass @portid 0 for
 * multicast events and a receiver port ID for unicast events; the
 * testmode wrapper passes @vendor_event_idx -1 and a %NULL @wdev.
 */
struct sk_buff *__cfg80211_alloc_event_skb(struct wiphy *wiphy,
                                           struct wireless_dev *wdev,
                                           enum nl80211_commands cmd,
                                           enum nl80211_attrs attr,
                                           unsigned int portid,
                                           int vendor_event_idx,
                                           int approxlen, gfp_t gfp);

/* Send an event skb obtained from __cfg80211_alloc_event_skb(). */
void __cfg80211_send_event_skb(struct sk_buff *skb, gfp_t gfp);

/**
 * cfg80211_vendor_cmd_alloc_reply_skb - allocate vendor command reply
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 *
 * Allocates and pre-fills an skb used to answer a vendor command. As the
 * skb is meant for a reply, it is invalid to call this outside of a vendor
 * command's doit() operation.
 *
 * The skb is pre-filled with some identifying data such that anything
 * appended to it (with skb_put(), nla_put() or similar) lands inside the
 * %NL80211_ATTR_VENDOR_DATA attribute, from which the corresponding
 * userspace tool can read it back. Apart from adding that data, you must
 * not modify the skb.
 *
 * Once filled, pass the skb to cfg80211_vendor_cmd_reply() and return its
 * error code as the result of the doit() operation.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_cmd_alloc_reply_skb(struct wiphy *wiphy, int approxlen)
{
        struct sk_buff *reply;

        reply = __cfg80211_alloc_reply_skb(wiphy, NL80211_CMD_VENDOR,
                                           NL80211_ATTR_VENDOR_DATA,
                                           approxlen);
        return reply;
}

/**
 * cfg80211_vendor_cmd_reply - send the reply skb
 * @skb: The skb, must have been allocated with
 *        cfg80211_vendor_cmd_alloc_reply_skb()
 *
 * Since calling this function will usually be the last thing
 * before returning from the vendor command doit() you should
 * return the error code.  Note that this function consumes the
 * skb regardless of the return value.
 *
 * Return: An error code or 0 on success.
 */
int cfg80211_vendor_cmd_reply(struct sk_buff *skb);

/**
 * cfg80211_vendor_cmd_get_sender - get the current sender netlink ID
 * @wiphy: the wiphy
 *
 * Return: the current netlink port ID in a vendor command handler.
 *
 * Context: May only be called from a vendor command handler
 */
unsigned int cfg80211_vendor_cmd_get_sender(struct wiphy *wiphy);

/**
 * cfg80211_vendor_event_alloc - allocate vendor-specific event skb
 * @wiphy: the wiphy
 * @wdev: the wireless device
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @event_idx: index of the vendor event in the wiphy's vendor_events
 * @gfp: allocation flags
 *
 * Allocates and pre-fills an skb for an event on the vendor-specific
 * multicast group.
 *
 * When @wdev is not %NULL, both the ifindex and identifier of that
 * wireless device are added to the event message before the vendor data
 * attribute.
 *
 * After filling the skb, send it with cfg80211_vendor_event().
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_event_alloc(struct wiphy *wiphy, struct wireless_dev *wdev,
                             int approxlen, int event_idx, gfp_t gfp)
{
        /* Multicast variant: no receiver port ID is given. */
        const unsigned int portid = 0;
        struct sk_buff *event;

        event = __cfg80211_alloc_event_skb(wiphy, wdev, NL80211_CMD_VENDOR,
                                           NL80211_ATTR_VENDOR_DATA, portid,
                                           event_idx, approxlen, gfp);
        return event;
}

/**
 * cfg80211_vendor_event_alloc_ucast - alloc unicast vendor-specific event skb
 * @wiphy: the wiphy
 * @wdev: the wireless device
 * @portid: port ID of the receiver
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @event_idx: index of the vendor event in the wiphy's vendor_events
 * @gfp: allocation flags
 *
 * Allocates and pre-fills an skb for an event that is sent to one specific
 * (userland) socket. That socket would previously have been obtained by
 * cfg80211_vendor_cmd_get_sender(), and the caller MUST take care to
 * register a netlink notifier to see when the socket closes.
 *
 * When @wdev is not %NULL, both the ifindex and identifier of that
 * wireless device are added to the event message before the vendor data
 * attribute.
 *
 * After filling the skb, send it with cfg80211_vendor_event().
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_event_alloc_ucast(struct wiphy *wiphy,
                                  struct wireless_dev *wdev,
                                  unsigned int portid, int approxlen,
                                  int event_idx, gfp_t gfp)
{
        struct sk_buff *event;

        event = __cfg80211_alloc_event_skb(wiphy, wdev, NL80211_CMD_VENDOR,
                                           NL80211_ATTR_VENDOR_DATA, portid,
                                           event_idx, approxlen, gfp);
        return event;
}

/**
 * cfg80211_vendor_event - send the event
 * @skb: The skb, must have been allocated with cfg80211_vendor_event_alloc()
 *        or cfg80211_vendor_event_alloc_ucast()
 * @gfp: allocation flags
 *
 * Sends @skb as an event by passing it on to __cfg80211_send_event_skb().
 * The skb is always consumed.
 */
static inline void cfg80211_vendor_event(struct sk_buff *skb, gfp_t gfp)
{
        /* Shares its send path with cfg80211_testmode_event(). */
        __cfg80211_send_event_skb(skb, gfp);
}

#ifdef CONFIG_NL80211_TESTMODE
/**
 * DOC: Test mode
 *
 * Test mode is a set of utility functions to allow drivers to
 * interact with driver-specific tools to aid, for instance,
 * factory programming.
 *
 * This chapter describes how drivers interact with it. For more
 * information see the nl80211 book's chapter on it.
 */

/**
 * cfg80211_testmode_alloc_reply_skb - allocate testmode reply
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 *
 * Allocates and pre-fills an skb used to answer the testmode command. As
 * the skb is meant for a reply, it is invalid to call this outside of the
 * @testmode_cmd operation.
 *
 * The skb is pre-filled with the wiphy index and arranged such that
 * anything appended to it (with skb_put(), nla_put() or similar) lands
 * inside the %NL80211_ATTR_TESTDATA attribute, from which the
 * corresponding userspace tool can read it back. Apart from adding that
 * data, you must not modify the skb.
 *
 * Once filled, pass the skb to cfg80211_testmode_reply() and return its
 * error code as the result of the @testmode_cmd operation.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_testmode_alloc_reply_skb(struct wiphy *wiphy, int approxlen)
{
        struct sk_buff *reply;

        reply = __cfg80211_alloc_reply_skb(wiphy, NL80211_CMD_TESTMODE,
                                           NL80211_ATTR_TESTDATA,
                                           approxlen);
        return reply;
}

/**
 * cfg80211_testmode_reply - send the reply skb
 * @skb: The skb, must have been allocated with
 *        cfg80211_testmode_alloc_reply_skb()
 *
 * Calling this is usually the last step before returning from the
 * @testmode_cmd operation, so its error code should be returned from
 * there. The skb is consumed regardless of the return value.
 *
 * The reply is sent through cfg80211_vendor_cmd_reply().
 *
 * Return: An error code or 0 on success.
 */
static inline int cfg80211_testmode_reply(struct sk_buff *skb)
{
        int err = cfg80211_vendor_cmd_reply(skb);

        return err;
}

/**
 * cfg80211_testmode_alloc_event_skb - allocate testmode event
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @gfp: allocation flags
 *
 * Allocates and pre-fills an skb for an event on the testmode multicast
 * group.
 *
 * The skb is set up like the one from cfg80211_testmode_alloc_reply_skb(),
 * but prepared for an event: data you add ends up in the
 * %NL80211_ATTR_TESTDATA attribute, and you must not modify the skb in any
 * other way.
 *
 * After filling the skb, send it with cfg80211_testmode_event().
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_testmode_alloc_event_skb(struct wiphy *wiphy, int approxlen, gfp_t gfp)
{
        /* Testmode events carry no wdev, no port ID and no vendor event. */
        const unsigned int portid = 0;
        const int vendor_event_idx = -1;
        struct sk_buff *event;

        event = __cfg80211_alloc_event_skb(wiphy, NULL, NL80211_CMD_TESTMODE,
                                           NL80211_ATTR_TESTDATA, portid,
                                           vendor_event_idx, approxlen, gfp);
        return event;
}

/**
 * cfg80211_testmode_event - send the event
 * @skb: The skb, must have been allocated with
 *        cfg80211_testmode_alloc_event_skb()
 * @gfp: allocation flags
 *
 * Sends @skb as an event by passing it on to __cfg80211_send_event_skb().
 * The skb is always consumed.
 */
static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
{
        /* Shares its send path with cfg80211_vendor_event(). */
        __cfg80211_send_event_skb(skb, gfp);
}

/*
 * Designated-initializer fragments for the .testmode_cmd and .testmode_dump
 * members. The trailing comma is part of the expansion, so none must be
 * written after the macro. Without CONFIG_NL80211_TESTMODE both expand to
 * nothing.
 */
#define CFG80211_TESTMODE_CMD(cmd)        .testmode_cmd = (cmd),
#define CFG80211_TESTMODE_DUMP(cmd)        .testmode_dump = (cmd),
#else
#define CFG80211_TESTMODE_CMD(cmd)
#define CFG80211_TESTMODE_DUMP(cmd)
#endif

/**
 * struct cfg80211_fils_resp_params - FILS connection response params
 * @kek: KEK derived from a successful FILS connection (may be %NULL)
 * @kek_len: Length of @kek in octets
 * @update_erp_next_seq_num: Boolean value to specify whether the value in
 *        @erp_next_seq_num is valid.
 * @erp_next_seq_num: The next sequence number to use in ERP message in
 *        FILS Authentication. This value should be specified irrespective of the
 *        status for a FILS connection.
 * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL).
 * @pmk_len: Length of @pmk in octets
 * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID
 *        used for this FILS connection (may be %NULL).
 */
struct cfg80211_fils_resp_params {
        const u8 *kek;
        size_t kek_len;
        bool update_erp_next_seq_num;
        u16 erp_next_seq_num;
        const u8 *pmk;
        size_t pmk_len;
        const u8 *pmkid;
};

/**
 * struct cfg80211_connect_resp_params - Connection response params
 * @status: Status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures. If this call is used to report a
 *        failure due to a timeout (e.g., not receiving an Authentication frame
 *        from the AP) instead of an explicit rejection by the AP, -1 is used to
 *        indicate that this is a failure, but without a status code.
 *        @timeout_reason is used to report the reason for the timeout in that
 *        case.
 * @req_ie: Association request IEs (may be %NULL)
 * @req_ie_len: Association request IEs length
 * @resp_ie: Association response IEs (may be %NULL)
 * @resp_ie_len: Association response IEs length
 * @fils: FILS connection response parameters.
 * @timeout_reason: Reason for connection timeout. This is used when the
 *        connection fails due to a timeout instead of an explicit rejection from
 *        the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is
 *        not known. This value is used only if @status < 0 to indicate that the
 *        failure is due to a timeout and not due to explicit rejection by the AP.
 *        This value is ignored in other cases (@status >= 0).
 * @valid_links: For MLO connection, BIT mask of the valid link ids. Otherwise
 *        zero.
 * @ap_mld_addr: For MLO connection, MLD address of the AP. Otherwise %NULL.
 * @links: For MLO connection, contains link info for the valid links indicated
 *        using @valid_links. For non-MLO connection, links[0] contains the
 *        connected AP info.
 * @links.addr: For MLO connection, MAC address of the STA link. Otherwise
 *        %NULL.
 * @links.bssid: For MLO connection, MAC address of the AP link. For non-MLO
 *        connection, links[0].bssid points to the BSSID of the AP (may be %NULL).
 * @links.bss: For MLO connection, entry of bss to which STA link is connected.
 *        For non-MLO connection, links[0].bss points to entry of bss to which STA
 *        is connected. It can be obtained through cfg80211_get_bss() (may be
 *        %NULL). It is recommended to store the bss from the connect_request and
 *        hold a reference to it and return through this param to avoid a warning
 *        if the bss is expired during the connection, esp. for those drivers
 *        implementing connect op. Only one parameter among @links.bssid and
 *        @links.bss needs to be specified.
 * @links.status: per-link status code, to report a status code that's not
 *        %WLAN_STATUS_SUCCESS for a given link, it must also be in the
 *        @valid_links bitmap and may have a BSS pointer (which is then released)
 */
struct cfg80211_connect_resp_params {
        int status;
        const u8 *req_ie;
        size_t req_ie_len;
        const u8 *resp_ie;
        size_t resp_ie_len;
        struct cfg80211_fils_resp_params fils;
        enum nl80211_timeout_reason timeout_reason;

        const u8 *ap_mld_addr;
        u16 valid_links;
        struct {
                const u8 *addr;
                const u8 *bssid;
                struct cfg80211_bss *bss;
                u16 status;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
};

/**
 * cfg80211_connect_done - notify cfg80211 of connection result
 *
 * @dev: network device
 * @params: connection response parameters
 * @gfp: allocation flags
 *
 * It should be called by the underlying driver once execution of the connection
 * request from connect() has been completed. This is similar to
 * cfg80211_connect_bss(), but takes a structure pointer for connection response
 * parameters. Only one of the functions among cfg80211_connect_bss(),
 * cfg80211_connect_result(), cfg80211_connect_timeout(),
 * and cfg80211_connect_done() should be called.
 */
void cfg80211_connect_done(struct net_device *dev,
                           struct cfg80211_connect_resp_params *params,
                           gfp_t gfp);

/**
 * cfg80211_connect_bss - notify cfg80211 of connection result
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @bss: Entry of bss to which STA got connected to, can be obtained through
 *        cfg80211_get_bss() (may be %NULL). But it is recommended to store the
 *        bss from the connect_request and hold a reference to it and return
 *        through this param to avoid a warning if the bss is expired during the
 *        connection, esp. for those drivers implementing connect op.
 *        Only one parameter among @bssid and @bss needs to be specified.
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @status: status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures. If this call is used to report a
 *        failure due to a timeout (e.g., not receiving an Authentication frame
 *        from the AP) instead of an explicit rejection by the AP, -1 is used to
 *        indicate that this is a failure, but without a status code.
 *        @timeout_reason is used to report the reason for the timeout in that
 *        case.
 * @gfp: allocation flags
 * @timeout_reason: reason for connection timeout. This is used when the
 *        connection fails due to a timeout instead of an explicit rejection from
 *        the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is
 *        not known. This value is used only if @status < 0 to indicate that the
 *        failure is due to a timeout and not due to explicit rejection by the AP.
 *        This value is ignored in other cases (@status >= 0).
 *
 * To be called by the underlying driver once the connection request from
 * connect() has run to completion. It behaves like cfg80211_connect_result()
 * but additionally lets the driver identify the exact bss entry of the
 * connection. The arguments are packed into a zeroed
 * &struct cfg80211_connect_resp_params, with @bssid and @bss stored in
 * links[0], and passed to cfg80211_connect_done().
 *
 * Only one of the functions among cfg80211_connect_bss(),
 * cfg80211_connect_result(), cfg80211_connect_timeout(), and
 * cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
                     struct cfg80211_bss *bss, const u8 *req_ie,
                     size_t req_ie_len, const u8 *resp_ie,
                     size_t resp_ie_len, int status, gfp_t gfp,
                     enum nl80211_timeout_reason timeout_reason)
{
        struct cfg80211_connect_resp_params resp;

        /* Everything not set below (FILS, MLO fields, ...) stays zero/NULL. */
        memset(&resp, 0, sizeof(resp));

        /* The single AP of this connection is described by links[0]. */
        resp.links[0].bss = bss;
        resp.links[0].bssid = bssid;

        resp.status = status;
        resp.timeout_reason = timeout_reason;

        resp.req_ie_len = req_ie_len;
        resp.req_ie = req_ie;
        resp.resp_ie_len = resp_ie_len;
        resp.resp_ie = resp_ie;

        cfg80211_connect_done(dev, &resp, gfp);
}

/**
 * cfg80211_connect_result - notify cfg80211 of connection result
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @status: status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures.
 * @gfp: allocation flags
 *
 * To be called by the underlying driver once the connection request from
 * connect() has run to completion. This forwards to cfg80211_connect_bss()
 * with a %NULL bss entry and %NL80211_TIMEOUT_UNSPECIFIED as timeout reason;
 * use cfg80211_connect_bss() directly to specify the exact bss entry.
 *
 * Only one of the functions among cfg80211_connect_bss(),
 * cfg80211_connect_result(), cfg80211_connect_timeout(), and
 * cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
                        const u8 *req_ie, size_t req_ie_len,
                        const u8 *resp_ie, size_t resp_ie_len,
                        u16 status, gfp_t gfp)
{
        cfg80211_connect_bss(dev, bssid, NULL,
                             req_ie, req_ie_len,
                             resp_ie, resp_ie_len,
                             status, gfp, NL80211_TIMEOUT_UNSPECIFIED);
}

/**
 * cfg80211_connect_timeout - notify cfg80211 of connection timeout
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @gfp: allocation flags
 * @timeout_reason: reason for connection timeout.
 *
 * To be called by the underlying driver whenever connect() has failed in a
 * sequence where no explicit authentication/association rejection was
 * received from the AP, e.g. because the Authentication or Association
 * Request frame could not be sent out, or because waiting for the response
 * timed out. This forwards to cfg80211_connect_bss() with a status of -1,
 * no bss entry and no response IEs.
 *
 * Only one of the functions among cfg80211_connect_bss(),
 * cfg80211_connect_result(), cfg80211_connect_timeout(), and
 * cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
                         const u8 *req_ie, size_t req_ie_len, gfp_t gfp,
                         enum nl80211_timeout_reason timeout_reason)
{
        /* -1: a failure without a status code, see cfg80211_connect_bss(). */
        const int no_status = -1;

        cfg80211_connect_bss(dev, bssid, NULL, req_ie, req_ie_len,
                             NULL, 0, no_status, gfp, timeout_reason);
}

/**
 * struct cfg80211_roam_info - driver initiated roaming information
 *
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @fils: FILS related roaming information.
 * @valid_links: For MLO roaming, BIT mask of the new valid links is set.
 *        Otherwise zero.
 * @ap_mld_addr: For MLO roaming, MLD address of the new AP. Otherwise %NULL.
 * @links: For MLO roaming, contains new link info for the valid links set in
 *        @valid_links. For non-MLO roaming, links[0] contains the new AP info.
 * @links.addr: For MLO roaming, MAC address of the STA link. Otherwise %NULL.
 * @links.bssid: For MLO roaming, MAC address of the new AP link. For non-MLO
 *        roaming, links[0].bssid points to the BSSID of the new AP. May be
 *        %NULL if @links.bss is set.
 * @links.channel: the channel of the new AP.
 * @links.bss: For MLO roaming, entry of new bss to which STA link got
 *        roamed. For non-MLO roaming, links[0].bss points to entry of bss to
 *        which STA got roamed (may be %NULL if @links.bssid is set)
 */
struct cfg80211_roam_info {
        const u8 *req_ie;
        size_t req_ie_len;
        const u8 *resp_ie;
        size_t resp_ie_len;
        struct cfg80211_fils_resp_params fils;

        const u8 *ap_mld_addr;
        u16 valid_links;
        struct {
                const u8 *addr;
                const u8 *bssid;
                struct ieee80211_channel *channel;
                struct cfg80211_bss *bss;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
};

/**
 * cfg80211_roamed - notify cfg80211 of roaming
 *
 * @dev: network device
 * @info: information about the new BSS. struct &cfg80211_roam_info.
 * @gfp: allocation flags
 *
 * This function may be called with the driver passing either the BSSID of the
 * new AP or passing the bss entry to avoid a race in timeout of the bss entry.
 * It should be called by the underlying driver whenever it roamed from one AP
 * to another while connected. Drivers which have roaming implemented in
 * firmware should pass the bss entry to avoid a race in bss entry timeout where
 * the bss entry of the new AP is seen in the driver, but gets timed out by the
 * time it is accessed in __cfg80211_roamed() due to delay in scheduling
 * rdev->event_work. In case of any failures, the reference is released
 * either in cfg80211_roamed() or in __cfg80211_roamed(). Otherwise, it will be
 * released while disconnecting from the current bss.
 */
void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
                     gfp_t gfp);

/**
 * cfg80211_port_authorized - notify cfg80211 of successful security association
 *
 * @dev: network device
 * @peer_addr: BSSID of the AP/P2P GO in case of STA/GC or STA/GC MAC address
 *        in case of AP/P2P GO
 * @td_bitmap: transition disable policy
 * @td_bitmap_len: Length of transition disable policy
 * @gfp: allocation flags
 *
 * This function should be called by a driver that supports 4 way handshake
 * offload after a security association was successfully established (i.e.,
 * the 4 way handshake was completed successfully). The call to this function
 * should be preceded with a call to cfg80211_connect_result(),
 * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to
 * indicate the 802.11 association.
 * This function can also be called by AP/P2P GO driver that supports
 * authentication offload. In this case the @peer_addr passed is that of
 * associated STA/GC.
 */
void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr,
                              const u8* td_bitmap, u8 td_bitmap_len, gfp_t gfp);

/**
 * cfg80211_disconnected - notify cfg80211 that connection was dropped
 *
 * @dev: network device
 * @reason: reason code for the disconnection, set it to 0 if unknown
 * @ie: information elements of the deauth/disassoc frame (may be %NULL)
 * @ie_len: length of IEs
 * @locally_generated: disconnection was requested locally
 * @gfp: allocation flags
 *
 * After it calls this function, the driver should enter an idle state
 * and not try to connect to any AP any more.
 */
void cfg80211_disconnected(struct net_device *dev, u16 reason,
                           const u8 *ie, size_t ie_len,
                           bool locally_generated, gfp_t gfp);

/**
 * cfg80211_ready_on_channel - notification of remain_on_channel start
 * @wdev: wireless device
 * @cookie: the request cookie
 * @chan: The current channel (from remain_on_channel request)
 * @duration: Duration in milliseconds that the driver intends to remain on the
 *        channel
 * @gfp: allocation flags
 */
void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie,
                               struct ieee80211_channel *chan,
                               unsigned int duration, gfp_t gfp);

/**
 * cfg80211_remain_on_channel_expired - remain_on_channel duration expired
 * @wdev: wireless device
 * @cookie: the request cookie
 * @chan: The current channel (from remain_on_channel request)
 * @gfp: allocation flags
 *
 * The start of a remain_on_channel period is reported with
 * cfg80211_ready_on_channel().
 */
void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
                                        struct ieee80211_channel *chan,
                                        gfp_t gfp);

/**
 * cfg80211_tx_mgmt_expired - tx_mgmt duration expired
 * @wdev: wireless device
 * @cookie: the request cookie
 * @chan: The current channel (from tx_mgmt request)
 * @gfp: allocation flags
 */
void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
                              struct ieee80211_channel *chan, gfp_t gfp);

/**
 * cfg80211_sinfo_alloc_tid_stats - allocate per-tid statistics for a station
 *
 * @sinfo: the station information
 * @gfp: allocation flags
 *
 * Return: 0 on success. Non-zero on error.
 */
int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_link_sinfo_alloc_tid_stats - allocate per-tid statistics for a link
 *
 * @link_sinfo: the link station information
 * @gfp: allocation flags
 *
 * Return: 0 on success. Non-zero on error.
 */
int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo,
                                        gfp_t gfp);

/**
 * cfg80211_sinfo_release_content - release contents of station info
 * @sinfo: the station information
 *
 * Frees @sinfo->pertid and, for every populated entry of @sinfo->links,
 * that entry's pertid followed by the entry itself. The station info
 * struct itself is not freed (since it's typically on the stack.)
 */
static inline void cfg80211_sinfo_release_content(struct station_info *sinfo)
{
        int i;

        kfree(sinfo->pertid);

        for (i = 0; i < ARRAY_SIZE(sinfo->links); i++) {
                /* Unpopulated link slot, nothing to free. */
                if (!sinfo->links[i])
                        continue;

                kfree(sinfo->links[i]->pertid);
                kfree(sinfo->links[i]);
        }
}

/**
 * cfg80211_new_sta - notify userspace about a new station
 *
 * @dev: the netdev
 * @mac_addr: the station's address
 * @sinfo: the station information
 * @gfp: allocation flags
 */
void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
                      struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_del_sta_sinfo - notify userspace about deletion of a station
 * @dev: the netdev
 * @mac_addr: the station's address. For MLD station, MLD address is used.
 * @sinfo: the station information/statistics
 * @gfp: allocation flags
 *
 * cfg80211_del_sta() is a wrapper that calls this function with a %NULL
 * @sinfo.
 */
void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
                            struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_del_sta - notify userspace about deletion of a station
 *
 * @dev: the netdev
 * @mac_addr: the station's address. For MLD station, MLD address is used.
 * @gfp: allocation flags
 *
 * Same as cfg80211_del_sta_sinfo(), called with a %NULL station info.
 */
static inline void
cfg80211_del_sta(struct net_device *dev, const u8 *mac_addr, gfp_t gfp)
{
        struct station_info *no_sinfo = NULL;

        cfg80211_del_sta_sinfo(dev, mac_addr, no_sinfo, gfp);
}

/**
 * cfg80211_conn_failed - connection request failed notification
 *
 * @dev: the netdev
 * @mac_addr: the station's address
 * @reason: the reason for connection failure
 * @gfp: allocation flags
 *
 * Whenever a station tries to connect to an AP and the station
 * could not connect because the AP has rejected the connection
 * for some reason, this function is called.
 *
 * The reason for connection failure can be any of the values from
 * &enum nl80211_connect_failed_reason.
 */
void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
                          enum nl80211_connect_failed_reason reason,
                          gfp_t gfp);

/**
 * struct cfg80211_rx_info - received management frame info
 *
 * @freq: Frequency on which the frame was received in kHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 * @have_link_id: indicates the frame was received on a link of
 *        an MLD, i.e. the @link_id field is valid
 * @link_id: the ID of the link the frame was received on
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @flags: flags, as defined in &enum nl80211_rxmgmt_flags
 * @rx_tstamp: Hardware timestamp of frame RX in nanoseconds
 * @ack_tstamp: Hardware timestamp of ack TX in nanoseconds
 *
 * Passed to cfg80211_rx_mgmt_ext().
 */
struct cfg80211_rx_info {
        int freq;
        int sig_dbm;
        bool have_link_id;
        u8 link_id;
        const u8 *buf;
        size_t len;
        u32 flags;
        u64 rx_tstamp;
        u64 ack_tstamp;
};

/**
 * cfg80211_rx_mgmt_ext - management frame notification with extended info
 * @wdev: wireless device receiving the frame
 * @info: RX info as defined in struct cfg80211_rx_info
 *
 * This function is called whenever an Action frame is received for a station
 * mode interface, but is not processed in kernel.
 *
 * cfg80211_rx_mgmt() and cfg80211_rx_mgmt_khz() are wrappers that build
 * @info from individual arguments.
 *
 * Return: %true if a user space application has registered for this frame.
 * For action frames, that makes it responsible for rejecting unrecognized
 * action frames; %false otherwise, in which case for action frames the
 * driver is responsible for rejecting the frame.
 */
bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev,
                          struct cfg80211_rx_info *info);

/**
 * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame
 * @wdev: wireless device receiving the frame
 * @freq: Frequency on which the frame was received in KHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @flags: flags, as defined in enum nl80211_rxmgmt_flags
 *
 * This function is called whenever an Action frame is received for a station
 * mode interface, but is not processed in kernel.
 *
 * It hands the frame to cfg80211_rx_mgmt_ext() in a &struct cfg80211_rx_info
 * where only the fields matching the arguments above are set; all other
 * fields are zero.
 *
 * Return: %true if a user space application has registered for this frame.
 * For action frames, that makes it responsible for rejecting unrecognized
 * action frames; %false otherwise, in which case for action frames the
 * driver is responsible for rejecting the frame.
 */
static inline bool
cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm,
                     const u8 *buf, size_t len, u32 flags)
{
        struct cfg80211_rx_info rx = { 0 };

        rx.freq = freq;
        rx.sig_dbm = sig_dbm;
        rx.buf = buf;
        rx.len = len;
        rx.flags = flags;

        return cfg80211_rx_mgmt_ext(wdev, &rx);
}

/**
 * cfg80211_rx_mgmt - notification of received, unprocessed management frame
 * @wdev: wireless device receiving the frame
 * @freq: Frequency on which the frame was received in MHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @flags: flags, as defined in enum nl80211_rxmgmt_flags
 *
 * This function is called whenever an Action frame is received for a station
 * mode interface, but is not processed in kernel.
 *
 * This is the MHz flavour of cfg80211_rx_mgmt_khz(): @freq is converted
 * with MHZ_TO_KHZ() and everything else is passed through unchanged.
 *
 * Return: %true if a user space application has registered for this frame.
 * For action frames, that makes it responsible for rejecting unrecognized
 * action frames; %false otherwise, in which case for action frames the
 * driver is responsible for rejecting the frame.
 */
static inline bool
cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm,
                 const u8 *buf, size_t len, u32 flags)
{
        return cfg80211_rx_mgmt_khz(wdev, MHZ_TO_KHZ(freq), sig_dbm,
                                    buf, len, flags);
}

/**
 * struct cfg80211_tx_status - TX status for management frame information
 *
 * @cookie: Cookie returned by cfg80211_ops::mgmt_tx()
 * @tx_tstamp: hardware TX timestamp in nanoseconds
 * @ack_tstamp: hardware ack RX timestamp in nanoseconds
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @ack: Whether frame was acknowledged
 *
 * Passed to cfg80211_mgmt_tx_status_ext(). The cfg80211_mgmt_tx_status()
 * wrapper fills in only @cookie, @buf, @len and @ack and leaves both
 * timestamps zero.
 */
struct cfg80211_tx_status {
        u64 cookie;
        u64 tx_tstamp;
        u64 ack_tstamp;
        const u8 *buf;
        size_t len;
        bool ack;
};

/**
 * cfg80211_mgmt_tx_status_ext - TX status notification with extended info
 * @wdev: wireless device receiving the frame
 * @status: TX status data, see &struct cfg80211_tx_status
 * @gfp: context flags
 *
 * This function is called whenever a management frame was requested to be
 * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the
 * transmission attempt with extended info.
 */
void cfg80211_mgmt_tx_status_ext(struct wireless_dev *wdev,
                                 struct cfg80211_tx_status *status, gfp_t gfp);

/**
 * cfg80211_mgmt_tx_status - notification of TX status for management frame
 * @wdev: wireless device receiving the frame
 * @cookie: Cookie returned by cfg80211_ops::mgmt_tx()
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @ack: Whether frame was acknowledged
 * @gfp: context flags
 *
 * This function is called whenever a management frame was requested to be
 * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the
 * transmission attempt.
 *
 * The report is forwarded to cfg80211_mgmt_tx_status_ext() with both
 * timestamps of the &struct cfg80211_tx_status left at zero.
 */
static inline void
cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf,
                        size_t len, bool ack, gfp_t gfp)
{
        struct cfg80211_tx_status report = { 0 };

        report.cookie = cookie;
        report.buf = buf;
        report.len = len;
        report.ack = ack;

        cfg80211_mgmt_tx_status_ext(wdev, &report, gfp);
}

/**
 * cfg80211_control_port_tx_status - notification of TX status for control
 *                                   port frames
 * @wdev: wireless device receiving the frame
 * @cookie: Cookie returned by cfg80211_ops::tx_control_port()
 * @buf: Data frame (header + body)
 * @len: length of the frame data
 * @ack: Whether the frame was acknowledged
 * @gfp: context flags
 *
 * This function is called whenever a control port frame was requested to be
 * transmitted with cfg80211_ops::tx_control_port() to report the TX status of
 * the transmission attempt.
 */
void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
                                     const u8 *buf, size_t len, bool ack,
                                     gfp_t gfp);

/**
 * cfg80211_rx_control_port - notification about a received control port frame
 * @dev: The device the frame matched to
 * @skb: The skb with the control port frame.  It is assumed that the skb
 *        is 802.3 formatted (with 802.3 header).  The skb can be non-linear.
 *        This function does not take ownership of the skb, so the caller is
 *        responsible for any cleanup.  The caller must also ensure that
 *        skb->protocol is set appropriately.
 * @unencrypted: Whether the frame was received unencrypted
 * @link_id: the link the frame was received on, -1 if not applicable or unknown
 *
 * This function is used to inform userspace about a received control port
 * frame.  It should only be used if userspace indicated it wants to receive
 * control port frames over nl80211.
 *
 * The frame is the data portion of the 802.3 or 802.11 data frame with all
 * network layer headers removed (e.g. the raw EAPoL frame).
 *
 * Return: %true if the frame was passed to userspace
 */
bool cfg80211_rx_control_port(struct net_device *dev, struct sk_buff *skb,
                              bool unencrypted, int link_id);

/**
 * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event
 * @dev: network device
 * @rssi_event: the triggered RSSI event
 * @rssi_level: new RSSI level value or 0 if not available
 * @gfp: context flags
 *
 * This function is called when a configured connection quality monitoring
 * RSSI threshold is reached.
 */
void cfg80211_cqm_rssi_notify(struct net_device *dev,
                              enum nl80211_cqm_rssi_threshold_event rssi_event,
                              s32 rssi_level, gfp_t gfp);

/**
 * cfg80211_cqm_pktloss_notify - notify userspace about packet loss to peer
 * @dev: network device
 * @peer: peer's MAC address
 * @num_packets: how many packets were lost -- should be a fixed threshold
 *        but probably no less than maybe 50, or maybe a throughput dependent
 *        threshold (to account for temporary interference)
 * @gfp: context flags
 */
void cfg80211_cqm_pktloss_notify(struct net_device *dev,
                                 const u8 *peer, u32 num_packets, gfp_t gfp);

/**
 * cfg80211_cqm_txe_notify - TX error rate event
 * @dev: network device
 * @peer: peer's MAC address
 * @num_packets: how many packets were lost
 * @rate: % of packets which failed transmission
 * @intvl: interval (in s) over which the TX failure threshold was breached.
 * @gfp: context flags
 *
 * Notify userspace when the configured % of TX failures over a number of
 * packets in a given interval is exceeded.
 */
void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer,
                             u32 num_packets, u32 rate, u32 intvl, gfp_t gfp);

/**
 * cfg80211_cqm_beacon_loss_notify - beacon loss event
 * @dev: network device
 * @gfp: context flags
 *
 * Notify userspace about the loss of beacons from the connected AP.
 */
void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp);

/**
 * __cfg80211_radar_event - radar detection event
 * @wiphy: the wiphy
 * @chandef: chandef for the current channel
 * @offchan: the radar has been detected on the offchannel chain
 * @gfp: context flags
 *
 * This function is called when a radar is detected on the current channel.
 *
 * cfg80211_radar_event() and cfg80211_background_radar_event() are wrappers
 * that pass %false and %true for @offchan respectively.
 */
void __cfg80211_radar_event(struct wiphy *wiphy,
                            struct cfg80211_chan_def *chandef,
                            bool offchan, gfp_t gfp);

/**
 * cfg80211_radar_event - radar detection event
 * @wiphy: the wiphy
 * @chandef: chandef for the current channel
 * @gfp: context flags
 *
 * Calls __cfg80211_radar_event() with its offchan argument set to %false.
 */
static inline void cfg80211_radar_event(struct wiphy *wiphy,
                                        struct cfg80211_chan_def *chandef,
                                        gfp_t gfp)
{
        const bool offchan = false;

        __cfg80211_radar_event(wiphy, chandef, offchan, gfp);
}

/**
 * cfg80211_background_radar_event - radar detection event, offchannel chain
 * @wiphy: the wiphy
 * @chandef: chandef for the current channel
 * @gfp: context flags
 *
 * Calls __cfg80211_radar_event() with its offchan argument set to %true.
 */
static inline void
cfg80211_background_radar_event(struct wiphy *wiphy,
                                struct cfg80211_chan_def *chandef, gfp_t gfp)
{
        const bool offchan = true;

        __cfg80211_radar_event(wiphy, chandef, offchan, gfp);
}

/**
 * cfg80211_sta_opmode_change_notify - STA's ht/vht operation mode change event
 * @dev: network device
 * @mac: MAC address of the station whose opmode got modified
 * @sta_opmode: station's current opmode value
 * @gfp: context flags
 *
 * The driver should call this function when a station's opmode is modified
 * via an action frame.
 */
void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac,
                                       struct sta_opmode_info *sta_opmode,
                                       gfp_t gfp);

/**
 * cfg80211_cac_event - Channel Availability Check (CAC) event
 * @netdev: network device
 * @chandef: chandef for the current channel
 * @event: type of event
 * @gfp: context flags
 * @link_id: valid link_id for MLO operation or 0 otherwise.
 *
 * This function is called when a Channel Availability Check (CAC) is finished
 * or aborted. This must be called to notify the completion of a CAC process,
 * also by full-MAC drivers.
 */
void cfg80211_cac_event(struct net_device *netdev,
                        const struct cfg80211_chan_def *chandef,
                        enum nl80211_radar_event event, gfp_t gfp,
                        unsigned int link_id);

/**
 * cfg80211_background_cac_abort - Channel Availability Check offchan abort event
 * @wiphy: the wiphy
 *
 * This function is called by the driver when a Channel Availability Check
 * (CAC) is aborted by an offchannel dedicated chain.
 */
void cfg80211_background_cac_abort(struct wiphy *wiphy);

/**
 * cfg80211_gtk_rekey_notify - notify userspace about driver rekeying
 * @dev: network device
 * @bssid: BSSID of the AP (to avoid races)
 * @replay_ctr: new replay counter
 * @gfp: allocation flags
 */
void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid,
                               const u8 *replay_ctr, gfp_t gfp);

/**
 * cfg80211_pmksa_candidate_notify - notify about PMKSA caching candidate
 * @dev: network device
 * @index: candidate index (the smaller the index, the higher the priority)
 * @bssid: BSSID of the AP
 * @preauth: Whether the AP advertises support for RSN pre-authentication
 * @gfp: allocation flags
 */
void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
                                     const u8 *bssid, bool preauth, gfp_t gfp);

/**
 * cfg80211_rx_spurious_frame - inform userspace about a spurious frame
 * @dev: The device the frame matched to
 * @addr: the transmitter address
 * @link_id: the link the frame was received on, -1 if not applicable or unknown
 * @gfp: context flags
 *
 * This function is used in AP mode (only!) to inform userspace that
 * a spurious class 3 frame was received, to be able to deauth the
 * sender.
 *
 * Return: %true if the frame was passed to userspace (or this failed
 * for a reason other than not having a subscription.)
 */
bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr,
                                int link_id, gfp_t gfp);

/**
 * cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame
 * @dev: The device the frame matched to
 * @addr: the transmitter address
 * @link_id: the link the frame was received on, -1 if not applicable or unknown
 * @gfp: context flags
 *
 * This function is used in AP mode (only!) to inform userspace that
 * an associated station sent a 4addr frame but that wasn't expected.
 * It is allowed and desirable to send this event only once for each
 * station to avoid event flooding.
 *
 * Return: %true if the frame was passed to userspace (or this failed
 * for a reason other than not having a subscription.)
 */
bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr,
                                        int link_id, gfp_t gfp);

/**
 * cfg80211_probe_status - notify userspace about probe status
 * @dev: the device the probe was sent on
 * @addr: the address of the peer
 * @cookie: the cookie filled in @probe_client previously
 * @acked: indicates whether probe was acked or not
 * @ack_signal: signal strength (in dBm) of the ACK frame, see
 *        @is_valid_ack_signal.
 * @is_valid_ack_signal: indicates whether @ack_signal is valid or not.
 * @gfp: allocation flags
 */
void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
                           u64 cookie, bool acked, s32 ack_signal,
                           bool is_valid_ack_signal, gfp_t gfp);

/**
 * cfg80211_report_obss_beacon_khz - report beacon from other APs
 * @wiphy: The wiphy that received the beacon
 * @frame: the frame
 * @len: length of the frame
 * @freq: frequency the frame was received on in KHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 *
 * Use this function to report to userspace when a beacon was
 * received. It is not useful to call this when there is no
 * netdev that is in AP/GO mode.
 *
 * cfg80211_report_obss_beacon() is the variant taking the frequency in MHz.
 */
void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame,
                                     size_t len, int freq, int sig_dbm);

/**
 * cfg80211_report_obss_beacon - report beacon from other APs
 * @wiphy: The wiphy that received the beacon
 * @frame: the frame
 * @len: length of the frame
 * @freq: frequency the frame was received on, in MHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 *
 * Use this function to report to userspace when a beacon was
 * received. It is not useful to call this when there is no
 * netdev that is in AP/GO mode.
 *
 * @freq is converted with MHZ_TO_KHZ() and the report is passed on to
 * cfg80211_report_obss_beacon_khz().
 */
static inline void
cfg80211_report_obss_beacon(struct wiphy *wiphy, const u8 *frame, size_t len,
                            int freq, int sig_dbm)
{
        int freq_khz = MHZ_TO_KHZ(freq);

        cfg80211_report_obss_beacon_khz(wiphy, frame, len, freq_khz, sig_dbm);
}

/**
 * struct cfg80211_beaconing_check_config - beacon check configuration
 * @iftype: the interface type to check for
 * @reg_power: &enum ieee80211_ap_reg_power value indicating the
 *        advertised/used 6 GHz regulatory power setting
 * @relax: allow IR-relaxation conditions to apply (e.g. another
 *        interface connected already on the same channel)
 *        NOTE: If this is set, wiphy mutex must be held.
 *
 * Passed to cfg80211_reg_check_beaconing().
 */
struct cfg80211_beaconing_check_config {
        enum nl80211_iftype iftype;
        enum ieee80211_ap_reg_power reg_power;
        bool relax;
};

/**
 * cfg80211_reg_check_beaconing - check if beaconing is allowed
 * @wiphy: the wiphy
 * @chandef: the channel definition
 * @cfg: additional parameters for the checking, see
 *        &struct cfg80211_beaconing_check_config
 *
 * cfg80211_reg_can_beacon() and cfg80211_reg_can_beacon_relax() are
 * wrappers that only take an interface type.
 *
 * Return: %true if there is no secondary channel or the secondary channel(s)
 * can be used for beaconing (i.e. is not a radar channel etc.)
 */
bool cfg80211_reg_check_beaconing(struct wiphy *wiphy,
                                  struct cfg80211_chan_def *chandef,
                                  struct cfg80211_beaconing_check_config *cfg);

/**
 * cfg80211_reg_can_beacon - check if beaconing is allowed
 * @wiphy: the wiphy
 * @chandef: the channel definition
 * @iftype: interface type
 *
 * Return: %true if there is no secondary channel or the secondary channel(s)
 * can be used for beaconing (i.e. is not a radar channel etc.)
 */
static inline bool
cfg80211_reg_can_beacon(struct wiphy *wiphy,
                        struct cfg80211_chan_def *chandef,
                        enum nl80211_iftype iftype)
{
        struct cfg80211_beaconing_check_config config = {
                .iftype = iftype,
        };

        return cfg80211_reg_check_beaconing(wiphy, chandef, &config);
}

/**
 * cfg80211_reg_can_beacon_relax - check if beaconing is allowed with relaxation
 * @wiphy: the wiphy
 * @chandef: the channel definition
 * @iftype: interface type
 *
 * Return: %true if there is no secondary channel or the secondary channel(s)
 * can be used for beaconing (i.e. is not a radar channel etc.). This version
 * also checks if IR-relaxation conditions apply, to allow beaconing under
 * more permissive conditions.
 *
 * Context: Requires the wiphy mutex to be held.
 */
static inline bool
cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
                              struct cfg80211_chan_def *chandef,
                              enum nl80211_iftype iftype)
{
        struct cfg80211_beaconing_check_config config = {
                .iftype = iftype,
                .relax = true,
        };

        return cfg80211_reg_check_beaconing(wiphy, chandef, &config);
}

/**
 * cfg80211_ch_switch_notify - update wdev channel and notify userspace
 * @dev: the device which switched channels
 * @chandef: the new channel definition
 * @link_id: the link ID for MLO, must be 0 for non-MLO
 *
 * Context: Caller must hold the wiphy mutex, therefore this must only be
 * called from sleepable driver context!
 */
void cfg80211_ch_switch_notify(struct net_device *dev,
                               struct cfg80211_chan_def *chandef,
                               unsigned int link_id);

/**
 * cfg80211_ch_switch_started_notify - notify channel switch start
 * @dev: the device on which the channel switch started
 * @chandef: the future channel definition
 * @link_id: the link ID for MLO, must be 0 for non-MLO
 * @count: the number of TBTTs until the channel switch happens
 * @quiet: whether or not immediate quiet was requested by the AP
 *
 * Inform userspace about the channel switch that has just
 * started, so that it can take appropriate actions (e.g. starting
 * channel switch on other vifs), if necessary.
 */
void cfg80211_ch_switch_started_notify(struct net_device *dev,
                                       struct cfg80211_chan_def *chandef,
                                       unsigned int link_id, u8 count,
                                       bool quiet);

/**
 * ieee80211_operating_class_to_band - convert operating class to band
 *
 * @operating_class: the operating class to convert
 * @band: pointer to the band to fill in
 *
 * Return: %true if the conversion was successful, %false otherwise.
 */
bool ieee80211_operating_class_to_band(u8 operating_class,
                                       enum nl80211_band *band);

/**
 * ieee80211_operating_class_to_chandef - convert operating class to chandef
 *
 * @operating_class: the operating class to convert
 * @chan: the ieee80211_channel to convert
 * @chandef: a pointer to the resulting chandef
 *
 * Return: %true if the conversion was successful, %false otherwise.
 */
bool ieee80211_operating_class_to_chandef(u8 operating_class,
                                          struct ieee80211_channel *chan,
                                          struct cfg80211_chan_def *chandef);

/**
 * ieee80211_chandef_to_operating_class - convert chandef to operating class
 *
 * @chandef: the chandef to convert
 * @op_class: a pointer to the resulting operating class
 *
 * Return: %true if the conversion was successful, %false otherwise.
 */
bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
                                          u8 *op_class);

/**
 * ieee80211_chandef_to_khz - convert chandef to frequency in KHz
 *
 * @chandef: the chandef to convert
 *
 * Converts @chandef->center_freq1 with MHZ_TO_KHZ() and adds
 * @chandef->freq1_offset to it.
 *
 * Return: the center frequency of chandef (1st segment) in KHz.
 */
static inline u32
ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef)
{
        u32 khz = MHZ_TO_KHZ(chandef->center_freq1);

        khz += chandef->freq1_offset;

        return khz;
}

/**
 * cfg80211_tdls_oper_request - request userspace to perform TDLS operation
 * @dev: the device on which the operation is requested
 * @peer: the MAC address of the peer device
 * @oper: the requested TDLS operation (NL80211_TDLS_SETUP or
 *        NL80211_TDLS_TEARDOWN)
 * @reason_code: the reason code for teardown request
 * @gfp: allocation flags
 *
 * This function is used to request userspace to perform a TDLS operation that
 * requires knowledge of keys, i.e., link setup or teardown when the AP
 * connection uses encryption. This is an optional mechanism for the driver to
 * use if it can automatically determine when a TDLS link could be useful
 * (e.g., based on traffic and signal strength for a peer).
 */
void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
                                enum nl80211_tdls_operation oper,
                                u16 reason_code, gfp_t gfp);

/**
 * cfg80211_calculate_bitrate - calculate actual bitrate (in 100Kbps units)
 * @rate: given rate_info to calculate bitrate from
 *
 * Return: calculated bitrate, in units of 100 Kbps
 */
u32 cfg80211_calculate_bitrate(struct rate_info *rate);

/**
 * cfg80211_unregister_wdev - remove the given wdev
 * @wdev: struct wireless_dev to remove
 *
 * This function removes the device so it can no longer be used. It is necessary
 * to call this function even when cfg80211 requests the removal of the device
 * by calling the del_virtual_intf() callback. The function must also be called
 * when the driver wishes to unregister the wdev, e.g. when the hardware device
 * is unbound from the driver.
 *
 * For a netdev, cfg80211_unregister_netdevice() calls this function on the
 * netdev's ieee80211_ptr.
 *
 * Context: Requires the RTNL and wiphy mutex to be held.
 */
void cfg80211_unregister_wdev(struct wireless_dev *wdev);

/**
 * cfg80211_register_netdevice - register the given netdev
 * @dev: the netdev to register
 *
 * Note: In contexts coming from cfg80211 callbacks, you must call this rather
 * than register_netdevice(), register_netdev() is impossible as the RTNL is
 * held. Otherwise, both register_netdevice() and register_netdev() are usable
 * instead as well.
 *
 * Context: Requires the RTNL and wiphy mutex to be held.
 *
 * Return: 0 on success. Non-zero on error.
 */
int cfg80211_register_netdevice(struct net_device *dev);

/**
 * cfg80211_unregister_netdevice - unregister the given netdev
 * @dev: the netdev to unregister
 *
 * Note: In contexts coming from cfg80211 callbacks, you must call this rather
 * than unregister_netdevice(), unregister_netdev() is impossible as the RTNL
 * is held. Otherwise, both unregister_netdevice() and unregister_netdev() are
 * usable instead as well.
 *
 * This calls cfg80211_unregister_wdev() on @dev->ieee80211_ptr; when
 * CONFIG_CFG80211 is not enabled the function body is empty.
 *
 * Context: Requires the RTNL and wiphy mutex to be held.
 */
static inline void cfg80211_unregister_netdevice(struct net_device *dev)
{
#if IS_ENABLED(CONFIG_CFG80211)
        cfg80211_unregister_wdev(dev->ieee80211_ptr);
#endif
}

/**
 * struct cfg80211_ft_event_params - FT Information Elements
 * @ies: FT IEs
 * @ies_len: length of the FT IEs in bytes
 * @target_ap: target AP's MAC address
 * @ric_ies: RIC IEs
 * @ric_ies_len: length of the RIC IEs in bytes
 *
 * Passed to cfg80211_ft_event().
 */
struct cfg80211_ft_event_params {
        const u8 *ies;
        size_t ies_len;
        const u8 *target_ap;
        const u8 *ric_ies;
        size_t ric_ies_len;
};

/**
 * cfg80211_ft_event - notify userspace about FT IE and RIC IE
 * @netdev: network device
 * @ft_event: IE information, see &struct cfg80211_ft_event_params
 */
void cfg80211_ft_event(struct net_device *netdev,
                       struct cfg80211_ft_event_params *ft_event);

/**
 * cfg80211_get_p2p_attr - find and copy a P2P attribute from IE buffer
 * @ies: the input IE buffer
 * @len: the input length
 * @attr: the attribute ID to find
 * @buf: output buffer, can be %NULL if the data isn't needed, e.g.
 *        if the function is only called to get the needed buffer size
 * @bufsize: size of the output buffer
 *
 * The function finds a given P2P attribute in the (vendor) IEs and
 * copies its contents to the given buffer.
 *
 * Return: A negative error code (-%EILSEQ if the data is malformed,
 * -%ENOENT if the attribute can't be found), or the length of the
 * found attribute (which can be zero).
 */
int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len,
                          enum ieee80211_p2p_attr_id attr,
                          u8 *buf, unsigned int bufsize);

/**
 * ieee80211_ie_split_ric - split an IE buffer according to ordering (with RIC)
 * @ies: the IE buffer
 * @ielen: the length of the IE buffer
 * @ids: an array with element IDs that are allowed before
 *        the split. A WLAN_EID_EXTENSION value means that the next
 *        EID in the list is a sub-element of the EXTENSION IE.
 * @n_ids: the size of the element ID array
 * @after_ric: array of IE types that come after the RIC element
 * @n_after_ric: size of the @after_ric array
 * @offset: offset where to start splitting in the buffer
 *
 * This function determines where an IE buffer should be split,
 * starting at @offset. @offset is passed by value, so the split
 * location is reported through the return value.
 *
 * It assumes that the given IE buffer is well-formed, this
 * has to be guaranteed by the caller!
 *
 * It also assumes that the IEs in the buffer are ordered
 * correctly, if not the result of using this function will not
 * be ordered correctly either, i.e. it does no reordering.
 *
 * Return: The offset where the next part of the buffer starts, which
 * may be @ielen if the entire (remainder) of the buffer should be
 * used.
 */
size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
                              const u8 *ids, int n_ids,
                              const u8 *after_ric, int n_after_ric,
                              size_t offset);

/**
 * ieee80211_ie_split - split an IE buffer according to ordering
 * @ies: the IE buffer
 * @ielen: the length of the IE buffer
 * @ids: an array with element IDs that are allowed before
 *        the split. A WLAN_EID_EXTENSION value means that the next
 *        EID in the list is a sub-element of the EXTENSION IE.
 * @n_ids: the size of the element ID array
 * @offset: offset where to start splitting in the buffer
 *
 * This function determines where an IE buffer should be split,
 * starting at @offset. It is ieee80211_ie_split_ric() called with
 * a %NULL after-RIC array of size 0.
 *
 * It assumes that the given IE buffer is well-formed, this
 * has to be guaranteed by the caller!
 *
 * It also assumes that the IEs in the buffer are ordered
 * correctly, if not the result of using this function will not
 * be ordered correctly either, i.e. it does no reordering.
 *
 * Return: The offset where the next part of the buffer starts, which
 * may be @ielen if the entire (remainder) of the buffer should be
 * used.
 */
static inline size_t
ieee80211_ie_split(const u8 *ies, size_t ielen, const u8 *ids, int n_ids,
                   size_t offset)
{
        size_t split;

        /* No RIC handling: pass an empty after-RIC list. */
        split = ieee80211_ie_split_ric(ies, ielen, ids, n_ids, NULL, 0,
                                       offset);

        return split;
}

/**
 * ieee80211_fragment_element - fragment the last element in skb
 * @skb: The skbuf that the element was added to
 * @len_pos: Pointer to length of the element to fragment
 * @frag_id: The element ID to use for fragments
 *
 * This function fragments all data after @len_pos, adding fragmentation
 * elements with the given ID as appropriate. The SKB will grow in size
 * accordingly.
 */
void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id);

/**
 * cfg80211_report_wowlan_wakeup - report wakeup from WoWLAN
 * @wdev: the wireless device reporting the wakeup
 * @wakeup: the wakeup report
 * @gfp: allocation flags
 *
 * This function reports that the given device woke up. If it
 * caused the wakeup, report the reason(s), otherwise you may
 * pass %NULL as the @wakeup parameter to advertise that something
 * else caused the wakeup.
 */
void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
                                   struct cfg80211_wowlan_wakeup *wakeup,
                                   gfp_t gfp);

/**
 * cfg80211_crit_proto_stopped() - indicate critical protocol stopped by driver.
 *
 * @wdev: the wireless device for which critical protocol is stopped.
 * @gfp: allocation flags
 *
 * This function can be called by the driver to indicate it has reverted
 * operation back to normal. One reason could be that the duration given
 * by .crit_proto_start() has expired.
 */
void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp);

/**
 * ieee80211_get_num_supported_channels - get number of channels device has
 * @wiphy: the wiphy
 *
 * Return: the number of channels supported by the device.
 */
unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy);

/**
 * cfg80211_check_combinations - check interface combinations
 *
 * @wiphy: the wiphy
 * @params: the interface combinations parameter
 *
 * This function can be called by the driver to check whether a
 * combination of interfaces and their types are allowed according to
 * the interface combinations.
 *
 * Return: 0 if combinations are allowed. Non-zero on error.
 */
int cfg80211_check_combinations(struct wiphy *wiphy,
                                struct iface_combination_params *params);

/**
 * cfg80211_iter_combinations - iterate over matching combinations
 *
 * @wiphy: the wiphy
 * @params: the interface combinations parameter
 * @iter: function to call for each matching combination
 * @data: pointer to pass to iter function
 *
 * This function can be called by the driver to check what possible
 * combinations it fits in at a given moment, e.g. for channel switching
 * purposes.
 *
 * Return: 0 on success. Non-zero on error.
 */
int cfg80211_iter_combinations(struct wiphy *wiphy,
                               struct iface_combination_params *params,
                               void (*iter)(const struct ieee80211_iface_combination *c,
                                            void *data),
                               void *data);
/**
 * cfg80211_get_radio_idx_by_chan - get the radio index by the channel
 *
 * @wiphy: the wiphy
 * @chan: channel for which the supported radio index is required
 *
 * Return: radio index on success or -EINVAL otherwise
 */
int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy,
                                   const struct ieee80211_channel *chan);


/**
 * cfg80211_stop_iface - trigger interface disconnection
 *
 * @wiphy: the wiphy
 * @wdev: wireless device
 * @gfp: context flags
 *
 * Trigger interface to be stopped as if AP was stopped, IBSS/mesh left, STA
 * disconnected.
 *
 * Note: This doesn't need any locks and is asynchronous.
 */
void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
                         gfp_t gfp);

/**
 * cfg80211_shutdown_all_interfaces - shut down all interfaces for a wiphy
 * @wiphy: the wiphy to shut down
 *
 * This function shuts down all interfaces belonging to this wiphy by
 * calling dev_close() (and treating non-netdev interfaces as needed).
 * It shouldn't really be used unless there are some fatal device errors
 * that really can't be recovered in any other way.
 *
 * Callers must hold the RTNL and be able to deal with callbacks into
 * the driver while the function is running.
 */
void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy);

/**
 * wiphy_ext_feature_set - mark an extended feature flag as set
 *
 * @wiphy: the wiphy to modify.
 * @ftidx: extended feature bit index.
 *
 * The extended features are flagged in multiple bytes (see
 * &struct wiphy.@ext_features): the flag for @ftidx is bit
 * (@ftidx % 8) of byte (@ftidx / 8).
 */
static inline void wiphy_ext_feature_set(struct wiphy *wiphy,
                                         enum nl80211_ext_feature_index ftidx)
{
        wiphy->ext_features[ftidx / 8] |= BIT(ftidx % 8);
}

/**
 * wiphy_ext_feature_isset - check the extended feature flag
 *
 * @wiphy: the wiphy to query.
 * @ftidx: extended feature bit index.
 *
 * The extended features are flagged in multiple bytes (see
 * &struct wiphy.@ext_features): the flag for @ftidx is bit
 * (@ftidx % 8) of byte (@ftidx / 8).
 *
 * Return: %true if extended feature flag is set, %false otherwise
 */
static inline bool
wiphy_ext_feature_isset(struct wiphy *wiphy,
                        enum nl80211_ext_feature_index ftidx)
{
        return !!(wiphy->ext_features[ftidx / 8] & BIT(ftidx % 8));
}

/**
 * cfg80211_free_nan_func - free NAN function
 * @f: NAN function that should be freed
 *
 * Frees the NAN function and all its allocated members.
 */
void cfg80211_free_nan_func(struct cfg80211_nan_func *f);

/**
 * struct cfg80211_nan_match_params - NAN match parameters
 * @type: the type of the function that triggered a match. If it is
 *         %NL80211_NAN_FUNC_SUBSCRIBE it means that we replied to a subscriber.
 *         If it is %NL80211_NAN_FUNC_PUBLISH, it means that we got a discovery
 *         result.
 *         If it is %NL80211_NAN_FUNC_FOLLOW_UP, we received a follow up.
 * @inst_id: the local instance id
 * @peer_inst_id: the instance id of the peer's function
 * @addr: the MAC address of the peer
 * @info_len: the length of @info (stored in a u8, so at most 255)
 * @info: the Service Specific Info from the peer (if any)
 * @cookie: unique identifier of the corresponding function
 */
struct cfg80211_nan_match_params {
        enum nl80211_nan_function_type type;
        u8 inst_id;
        u8 peer_inst_id;
        const u8 *addr;
        u8 info_len;
        const u8 *info;
        u64 cookie;
};

/**
 * cfg80211_nan_match - report a match for a NAN function.
 * @wdev: the wireless device reporting the match
 * @match: match notification parameters
 * @gfp: allocation flags
 *
 * This function reports that a NAN function had a match. This
 * can be a subscribe that had a match or a solicited publish that
 * was sent. It can also be a follow up that was received.
 */
void cfg80211_nan_match(struct wireless_dev *wdev,
                        struct cfg80211_nan_match_params *match, gfp_t gfp);

/**
 * cfg80211_nan_func_terminated - notify about NAN function termination.
 *
 * @wdev: the wireless device reporting the termination
 * @inst_id: the local instance id
 * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*)
 * @cookie: unique NAN function identifier
 * @gfp: allocation flags
 *
 * This function reports that a NAN function is terminated.
 */
void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
                                  u8 inst_id,
                                  enum nl80211_nan_func_term_reason reason,
                                  u64 cookie, gfp_t gfp);

/* ethtool helper */
void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info);

/**
 * cfg80211_external_auth_request - userspace request for authentication
 * @netdev: network device
 * @params: External authentication parameters
 * @gfp: allocation flags
 *
 * Return: 0 on success, < 0 on error
 */
int cfg80211_external_auth_request(struct net_device *netdev,
                                   struct cfg80211_external_auth_params *params,
                                   gfp_t gfp);

/**
 * cfg80211_pmsr_report - report peer measurement result data
 * @wdev: the wireless device reporting the measurement
 * @req: the original measurement request
 * @result: the result data
 * @gfp: allocation flags
 */
void cfg80211_pmsr_report(struct wireless_dev *wdev,
                          struct cfg80211_pmsr_request *req,
                          struct cfg80211_pmsr_result *result,
                          gfp_t gfp);

/**
 * cfg80211_pmsr_complete - report peer measurement completed
 * @wdev: the wireless device reporting the measurement
 * @req: the original measurement request
 * @gfp: allocation flags
 *
 * Report that the entire measurement completed, after this
 * the request pointer will no longer be valid.
 */
void cfg80211_pmsr_complete(struct wireless_dev *wdev,
                            struct cfg80211_pmsr_request *req,
                            gfp_t gfp);

/**
 * cfg80211_iftype_allowed - check whether the interface can be allowed
 * @wiphy: the wiphy
 * @iftype: interface type
 * @is_4addr: use_4addr flag, must be '0' when check_swif is '1'
 * @check_swif: check iftype against software interfaces
 *
 * Check whether the interface is allowed to operate; additionally, this API
 * can be used to check iftype against the software interfaces when
 * check_swif is '1'.
 *
 * Return: %true if allowed, %false otherwise
 */
bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype,
                             bool is_4addr, u8 check_swif);


/**
 * cfg80211_assoc_comeback - notification of association that was
 * temporarily rejected with a comeback
 * @netdev: network device
 * @ap_addr: AP (MLD) address that rejected the association
 * @timeout: timeout interval value in TUs.
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_assoc_comeback(struct net_device *netdev,
                             const u8 *ap_addr, u32 timeout);

/* Logging, debugging and troubleshooting/diagnostic helpers. */

/*
 * wiphy_printk helpers, similar to dev_printk: each one forwards to the
 * dev_*() macro of the same level, passing the struct device embedded in
 * the wiphy.
 */

#define wiphy_printk(level, wiphy, format, ...)                        \
        dev_printk(level, &(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_emerg(wiphy, format, ...)                                \
        dev_emerg(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_alert(wiphy, format, ...)                                \
        dev_alert(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_crit(wiphy, format, ...)                                \
        dev_crit(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_err(wiphy, format, ...)                                \
        dev_err(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_warn(wiphy, format, ...)                                \
        dev_warn(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_notice(wiphy, format, ...)                        \
        dev_notice(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_info(wiphy, format, ...)                                \
        dev_info(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_info_once(wiphy, format, ...)                        \
        dev_info_once(&(wiphy)->dev, format, ##__VA_ARGS__)

/* Rate-limited variants. */
#define wiphy_err_ratelimited(wiphy, format, ...)                \
        dev_err_ratelimited(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_warn_ratelimited(wiphy, format, ...)                \
        dev_warn_ratelimited(&(wiphy)->dev, format, ##__VA_ARGS__)

/* Goes through wiphy_printk() at KERN_DEBUG level. */
#define wiphy_debug(wiphy, format, ...)                                \
        wiphy_printk(KERN_DEBUG, wiphy, format, ##__VA_ARGS__)

/* Goes through dev_dbg(). */
#define wiphy_dbg(wiphy, format, ...)                                \
        dev_dbg(&(wiphy)->dev, format, ##__VA_ARGS__)

#ifndef VERBOSE_DEBUG
/*
 * Without VERBOSE_DEBUG the message is never printed: the wiphy_printk()
 * call sits behind "if (0)", so it is still compiled but never executed,
 * and the whole expression evaluates to 0.
 */
#define wiphy_vdbg(wiphy, format, ...)                                        \
({                                                                        \
        if (0)                                                                \
                wiphy_printk(KERN_DEBUG, wiphy, format, ##__VA_ARGS__);        \
        0;                                                                \
})
#else
/* With VERBOSE_DEBUG, verbose messages are plain wiphy_dbg() messages. */
#define wiphy_vdbg        wiphy_dbg
#endif

/*
 * wiphy_WARN() acts like wiphy_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * The message is prefixed with "wiphy: <name>\n", where <name> comes from
 * wiphy_name(). The condition passed to WARN() is always 1.
 *
 * The expansion deliberately carries no trailing semicolon, so the macro
 * can be used as an expression and as the unbraced body of an if/else.
 */
#define wiphy_WARN(wiphy, format, args...)                        \
        WARN(1, "wiphy: %s\n" format, wiphy_name(wiphy), ##args)

/**
 * cfg80211_update_owe_info_event - Notify the peer's OWE info to user space
 * @netdev: network device
 * @owe_info: peer's owe info
 * @gfp: allocation flags
 */
void cfg80211_update_owe_info_event(struct net_device *netdev,
                                    struct cfg80211_update_owe_info *owe_info,
                                    gfp_t gfp);

/**
 * cfg80211_bss_flush - resets all the scan entries
 * @wiphy: the wiphy
 */
void cfg80211_bss_flush(struct wiphy *wiphy);

/**
 * cfg80211_bss_color_notify - notify about bss color event
 * @dev: network device
 * @cmd: the actual event we want to notify
 * @count: the number of TBTTs until the color change happens
 * @color_bitmap: representations of the colors that the local BSS is aware of
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Return: 0 on success. Non-zero on error.
 */
int cfg80211_bss_color_notify(struct net_device *dev,
                              enum nl80211_commands cmd, u8 count,
                              u64 color_bitmap, u8 link_id);

/**
 * cfg80211_obss_color_collision_notify - notify about bss color collision
 * @dev: network device
 * @color_bitmap: representations of the colors that the local BSS is aware of
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Wrapper around cfg80211_bss_color_notify() using the
 * %NL80211_CMD_OBSS_COLOR_COLLISION command and a count of 0.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int cfg80211_obss_color_collision_notify(struct net_device *dev,
                                                       u64 color_bitmap,
                                                       u8 link_id)
{
        /* A collision report passes a count of 0. */
        const u8 count = 0;

        return cfg80211_bss_color_notify(dev, NL80211_CMD_OBSS_COLOR_COLLISION,
                                         count, color_bitmap, link_id);
}

/**
 * cfg80211_color_change_started_notify - notify color change start
 * @dev: the device on which the color is switched
 * @count: the number of TBTTs until the color change happens
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Inform the userspace about the color change that has started. This is
 * cfg80211_bss_color_notify() with %NL80211_CMD_COLOR_CHANGE_STARTED and
 * a color bitmap of 0.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int cfg80211_color_change_started_notify(struct net_device *dev,
                                                       u8 count, u8 link_id)
{
        /* The start notification passes a color bitmap of 0. */
        const u64 color_bitmap = 0;

        return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_STARTED,
                                         count, color_bitmap, link_id);
}

/**
 * cfg80211_color_change_aborted_notify - notify color change abort
 * @dev: the device on which the color is switched
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Inform the userspace about the color change that has aborted. This is
 * cfg80211_bss_color_notify() with %NL80211_CMD_COLOR_CHANGE_ABORTED, a
 * count of 0 and a color bitmap of 0.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int cfg80211_color_change_aborted_notify(struct net_device *dev,
                                                       u8 link_id)
{
        const u8 count = 0;
        const u64 color_bitmap = 0;

        return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_ABORTED,
                                         count, color_bitmap, link_id);
}

/**
 * cfg80211_color_change_notify - notify color change completion
 * @dev: the device on which the color was switched
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Inform the userspace about the color change that has completed. This is
 * cfg80211_bss_color_notify() with %NL80211_CMD_COLOR_CHANGE_COMPLETED, a
 * count of 0 and a color bitmap of 0.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int cfg80211_color_change_notify(struct net_device *dev,
                                               u8 link_id)
{
        const u8 count = 0;
        const u64 color_bitmap = 0;

        return cfg80211_bss_color_notify(dev,
                                         NL80211_CMD_COLOR_CHANGE_COMPLETED,
                                         count, color_bitmap, link_id);
}

/**
 * cfg80211_links_removed - Notify about removed STA MLD setup links.
 * @dev: network device.
 * @link_mask: BIT mask of removed STA MLD setup link IDs.
 *
 * Inform cfg80211 and the userspace about removed STA MLD setup links due to
 * AP MLD removing the corresponding affiliated APs with Multi-Link
 * reconfiguration. Note that it's not valid to remove all links, in this
 * case disconnect instead.
 * Also note that the wdev mutex must be held.
 */
void cfg80211_links_removed(struct net_device *dev, u16 link_mask);

/**
 * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data
 * @buf: MLO Reconfiguration Response frame (header + body)
 * @len: length of the frame data
 * @driver_initiated: Indicates whether the add links request is initiated by
 *        driver. This is set to true when the link reconfiguration request
 *        initiated by driver due to AP link recommendation requests
 *        (Ex: BTM (BSS Transition Management) request) handling offloaded to
 *        driver.
 * @added_links: BIT mask of links successfully added to the association
 * @links: per-link information indexed by link ID
 * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of
 *      the pointer moves to cfg80211 in the call to
 *      cfg80211_mlo_reconf_add_done().
 * @links.addr: presumably the address associated with this link; the exact
 *      meaning (and whether it may be %NULL) is not visible here -
 *      TODO confirm against the users of this struct.
 *
 * The BSS pointer must be set for each link for which 'add' operation was
 * requested in the assoc_ml_reconf callback.
 */
struct cfg80211_mlo_reconf_done_data {
        const u8 *buf;
        size_t len;
        bool driver_initiated;
        u16 added_links;
        struct {
                struct cfg80211_bss *bss;
                u8 *addr;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
};

/**
 * cfg80211_mlo_reconf_add_done - Notify about MLO reconfiguration result
 * @dev: network device.
 * @data: MLO reconfiguration done data, &struct cfg80211_mlo_reconf_done_data
 *
 * Inform cfg80211 and the userspace that processing of ML reconfiguration
 * request to add links to the association is done.
 */
void cfg80211_mlo_reconf_add_done(struct net_device *dev,
                                  struct cfg80211_mlo_reconf_done_data *data);

/**
 * cfg80211_schedule_channels_check - schedule regulatory check if needed
 * @wdev: the wireless device to check
 *
 * In case the device supports NO_IR or DFS relaxations, schedule regulatory
 * channels check, as previous concurrent operation conditions may not
 * hold anymore.
 */
void cfg80211_schedule_channels_check(struct wireless_dev *wdev);

/**
 * cfg80211_epcs_changed - Notify about a change in EPCS state
 * @netdev: the wireless device whose EPCS state changed
 * @enabled: set to true if EPCS was enabled, otherwise set to false.
 */
void cfg80211_epcs_changed(struct net_device *netdev, bool enabled);

/**
 * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW)
 * @wdev: Pointer to the wireless device structure
 * @chan: DW channel (6, 44 or 149)
 * @gfp: Memory allocation flags
 */
void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev,
                                struct ieee80211_channel *chan, gfp_t gfp);

/**
 * cfg80211_nan_cluster_joined - Notify about NAN cluster join
 * @wdev: Pointer to the wireless device structure
 * @cluster_id: Cluster ID of the NAN cluster that was joined or started
 * @new_cluster: Indicates if this is a new cluster or an existing one
 * @gfp: Memory allocation flags
 *
 * This function is used to notify user space when a NAN cluster has been
 * joined, providing the cluster ID and a flag whether it is a new cluster.
 */
void cfg80211_nan_cluster_joined(struct wireless_dev *wdev,
                                 const u8 *cluster_id, bool new_cluster,
                                 gfp_t gfp);

#ifdef CONFIG_CFG80211_DEBUGFS
/**
 * wiphy_locked_debugfs_read - do a locked read in debugfs
 * @wiphy: the wiphy to use
 * @file: the file being read
 * @buf: the buffer to fill and then read from
 * @bufsize: size of the buffer
 * @userbuf: the user buffer to copy to
 * @count: read count
 * @ppos: read position
 * @handler: the read handler to call (under wiphy lock)
 * @data: additional data to pass to the read handler
 *
 * Return: the number of characters read, or a negative errno
 */
ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file,
                                  char *buf, size_t bufsize,
                                  char __user *userbuf, size_t count,
                                  loff_t *ppos,
                                  ssize_t (*handler)(struct wiphy *wiphy,
                                                     struct file *file,
                                                     char *buf,
                                                     size_t bufsize,
                                                     void *data),
                                  void *data);

/**
 * wiphy_locked_debugfs_write - do a locked write in debugfs
 * @wiphy: the wiphy to use
 * @file: the file being written to
 * @buf: the buffer to copy the user data to
 * @bufsize: size of the buffer
 * @userbuf: the user buffer to copy from
 * @count: write count
 * @handler: the write handler to call (under wiphy lock)
 * @data: additional data to pass to the write handler
 *
 * Return: the number of characters written, or a negative errno
 */
ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file,
                                   char *buf, size_t bufsize,
                                   const char __user *userbuf, size_t count,
                                   ssize_t (*handler)(struct wiphy *wiphy,
                                                      struct file *file,
                                                      char *buf,
                                                      size_t count,
                                                      void *data),
                                   void *data);
#endif

/**
 * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency
 * @chandef: the chandef to use
 *
 * The result is the chandef's center frequency (center_freq1 converted to
 * kHz plus freq1_offset) minus half of the channel width, plus 500 kHz.
 *
 * Return: the chandef's starting frequency in kHz
 */
static inline u32
cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef)
{
        const u32 width_mhz = cfg80211_chandef_get_width(chandef);
        const u32 centre_khz = MHZ_TO_KHZ(chandef->center_freq1) +
                               chandef->freq1_offset;
        /* Half of the width: width_mhz * 1000 / 2 kHz. */
        const u32 half_width_khz = width_mhz * 500;

        return centre_khz - half_width_khz + 500;
}

/**
 * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency
 * @chandef: the chandef to use
 *
 * The result is the chandef's center frequency (center_freq1 converted to
 * kHz plus freq1_offset) plus half of the channel width, minus 500 kHz.
 *
 * Return: the chandef's ending frequency in kHz
 */
static inline u32
cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef)
{
        const u32 width_mhz = cfg80211_chandef_get_width(chandef);
        const u32 centre_khz = MHZ_TO_KHZ(chandef->center_freq1) +
                               chandef->freq1_offset;
        /* Half of the width: width_mhz * 1000 / 2 kHz. */
        const u32 half_width_khz = width_mhz * 500;

        return centre_khz + half_width_khz - 500;
}

/**
 * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel
 *        for an S1G chandef using a 2MHz primary channel.
 * @wiphy: wiphy the channel belongs to
 * @chandef: the chandef to use
 *
 * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz
 * primary channel. The 1MHz subchannel designated by the primary channel
 * location exists within chandef::chan, whilst the 'sibling' is denoted as
 * being the other 1MHz subchannel that makes up the 2MHz primary channel.
 *
 * %NULL is returned right away when chandef::s1g_primary_2mhz is not set
 * or when the chandef width is below 2MHz.
 *
 * Return: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure.
 */
static inline struct ieee80211_channel *
cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy,
                                 const struct cfg80211_chan_def *chandef)
{
        int width_mhz = cfg80211_chandef_get_width(chandef);
        u32 primary_khz, base_khz, slot, sibling_khz;

        if (!chandef->s1g_primary_2mhz)
                return NULL;
        if (width_mhz < 2)
                return NULL;

        primary_khz = ieee80211_channel_to_khz(chandef->chan);
        base_khz = cfg80211_s1g_get_start_freq_khz(chandef);

        /*
         * slot is the position of the primary 1 MHz subchannel counted in
         * 1 MHz (1000 kHz) steps from the lowest 1 MHz center frequency.
         * Toggling its lowest bit picks the other member of the even/odd
         * pair; that slot is then converted back into a frequency.
         */
        slot = (primary_khz - base_khz) / 1000;
        sibling_khz = base_khz + ((slot ^ 1) * 1000);

        return ieee80211_get_channel_khz(wiphy, sibling_khz);
}

#endif /* __NET_CFG80211_H */































































































































































































































































































































































































































































































































   19 
   17 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H

#include <linux/fs.h>
#include <linux/khugepaged.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
#include <linux/tracepoint-defs.h>

/* Internal core VMA manipulation functions. */
#include "vma.h"

struct folio_batch;

/*
 * Maintains state across a page table move. The operation assumes both source
 * and destination VMAs already exist and are specified by the user.
 *
 * Partial moves are permitted, but the old and new ranges must both reside
 * within a VMA.
 *
 * mmap lock must be held in write and VMA write locks must be held on any VMA
 * that is visible.
 *
 * Use the PAGETABLE_MOVE() macro to initialise this struct.
 *
 * The old_addr and new_addr fields are updated as the page table move is
 * executed.
 *
 * NOTE: The page table move is affected by reading from [old_addr, old_end),
 * and old_addr may be updated for better page table alignment, so len_in
 * represents the length of the range being copied as specified by the user.
 */
struct pagetable_move_control {
        struct vm_area_struct *old; /* Source VMA. */
        struct vm_area_struct *new; /* Destination VMA. */
        unsigned long old_addr; /* Address from which the move begins. */
        unsigned long old_end; /* Exclusive address at which old range ends. */
        unsigned long new_addr; /* Address to move page tables to. */
        unsigned long len_in; /* Bytes to remap specified by user. */

        /*
         * The flags below are not named in the PAGETABLE_MOVE() initialiser,
         * so they start out as false until a caller assigns them.
         */
        bool need_rmap_locks; /* Do rmap locks need to be taken? */
        bool for_stack; /* Is this an early temp stack being moved? */
};

/*
 * Declare a struct pagetable_move_control called @name and initialise it.
 *
 * old_end is derived as old_addr_ + len_, so old_addr_ and len_ are each
 * expanded twice: pass expressions without side effects. Members that are
 * not named in the initialiser are left zero-initialised.
 */
#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_)        \
        struct pagetable_move_control name = {                                \
                .old = old_,                                                \
                .new = new_,                                                \
                .old_addr = old_addr_,                                        \
                .old_end = (old_addr_) + (len_),                        \
                .new_addr = new_addr_,                                        \
                .len_in = len_,                                                \
        }

/*
 * The set of flags that only affect watermark checking and reclaim
 * behaviour. This is used by the MM to obey the caller constraints
 * about IO, FS and watermark checking while ignoring placement
 * hints such as HIGHMEM usage.
 */
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
                        __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
                        __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
                        __GFP_NOLOCKDEP)

/*
 * The GFP flags allowed during early boot: every defined GFP bit except
 * the reclaim, IO and FS ones.
 */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))

/* Control allocation cpuset and node placement constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/*
 * Do not use these with a slab allocator: DMA32, HIGHMEM, and any bit that
 * lies outside __GFP_BITS_MASK.
 */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

/*
 * Different from WARN_ON_ONCE(), no warning will be issued
 * when we specify __GFP_NOWARN.
 *
 * The expression evaluates to the truth value of @cond, whether or not a
 * warning was emitted. Each expansion site has its own "warned" latch, so a
 * given site warns at most once.
 *
 * @gfp is parenthesised in the expansion so that a compound argument such as
 * "a | b" is tested against __GFP_NOWARN as a whole rather than being split
 * by operator precedence.
 */
#define WARN_ON_ONCE_GFP(cond, gfp)        ({                                \
        static bool __section(".data..once") __warned;                        \
        int __ret_warn_once = !!(cond);                                        \
                                                                        \
        if (unlikely(!((gfp) & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
                __warned = true;                                        \
                WARN_ON(1);                                                \
        }                                                                \
        unlikely(__ret_warn_once);                                        \
})

void page_writeback_init(void);

/*
 * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
 * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
 * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
 * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
 */
#define ENTIRELY_MAPPED                0x800000
/* Mask of every bit below ENTIRELY_MAPPED. */
#define FOLIO_PAGES_MAPPED        (ENTIRELY_MAPPED - 1)

/*
 * Flags passed to __show_mem() and show_free_areas() to suppress output in
 * various contexts.
 */
#define SHOW_MEM_FILTER_NODES                (0x0001u)        /* disallowed nodes */

/*
 * Number of individual pages in @folio that have an elevated _mapcount.
 * The folio's entire_mapcount is excluded.
 *
 * Returns -1 when CONFIG_NO_PAGE_MAPCOUNT is enabled.
 *
 * Don't use this function outside of debugging code.
 */
static inline int folio_nr_pages_mapped(const struct folio *folio)
{
        int nr_mapped;

        if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
                return -1;

        /* Keep only the bits below ENTIRELY_MAPPED. */
        nr_mapped = atomic_read(&folio->_nr_pages_mapped);
        return nr_mapped & FOLIO_PAGES_MAPPED;
}

/*
 * Given a swap entry that lies somewhere within @folio, compute the folio's
 * first swap entry by aligning the entry value down to folio_nr_pages().
 * folio->swap cannot be relied upon as there is no guarantee that it has
 * been initialized. Used for calling arch_swap_restore()
 */
static inline swp_entry_t folio_swap(swp_entry_t entry,
                const struct folio *folio)
{
        return (swp_entry_t) {
                .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
        };
}

/* Return folio->mapping with the FOLIO_MAPPING_FLAGS bits masked off. */
static inline void *folio_raw_mapping(const struct folio *folio)
{
        return (void *)((unsigned long)folio->mapping & ~FOLIO_MAPPING_FLAGS);
}

/*
 * Invoke the mmap hook of the file backing a mapping that is about to be
 * memory mapped, and safely handle error conditions. On error, the VMA hooks
 * are replaced with vma_dummy_vm_ops.
 *
 * @file: File which backs the mapping.
 * @vma:  VMA which we are mapping.
 *
 * Returns: 0 if success, error otherwise.
 */
static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
{
        const int err = vfs_mmap(file, vma);

        if (unlikely(err)) {
                /*
                 * The file hook for mmap() was called but reported an error.
                 * The mapping is in an inconsistent state and we must not
                 * invoke any further hooks on it.
                 */
                vma->vm_ops = &vma_dummy_vm_ops;
        }

        return err;
}

/*
 * Call the VMA's close hook, if it has one. Closing might leave the VMA in an
 * inconsistent state which makes the use of any hooks suspect, so afterwards
 * the hooks are replaced with the dummy empty ones.
 */
static inline void vma_close(struct vm_area_struct *vma)
{
        /* Nothing to do without a close hook. */
        if (!vma->vm_ops || !vma->vm_ops->close)
                return;

        vma->vm_ops->close(vma);

        /*
         * The mapping is in an inconsistent state, and no further hooks
         * may be invoked upon it.
         */
        vma->vm_ops = &vma_dummy_vm_ops;
}

#ifdef CONFIG_MMU

/*
 * Flags for folio_pte_batch_flags().
 *
 * The FPB_RESPECT_* flags select which PTE bits take part in the comparison
 * (see __pte_batch_clear_ignored()); the FPB_MERGE_* flags select which bits
 * of the batch are folded back into the caller's copy of the first PTE.
 */
typedef int __bitwise fpb_t;

/* Compare PTEs respecting the dirty bit. */
#define FPB_RESPECT_DIRTY                ((__force fpb_t)BIT(0))

/* Compare PTEs respecting the soft-dirty bit. */
#define FPB_RESPECT_SOFT_DIRTY                ((__force fpb_t)BIT(1))

/* Compare PTEs respecting the writable bit. */
#define FPB_RESPECT_WRITE                ((__force fpb_t)BIT(2))

/*
 * Merge PTE write bits: if any PTE in the batch is writable, modify the
 * PTE at @ptentp to be writable.
 */
#define FPB_MERGE_WRITE                        ((__force fpb_t)BIT(3))

/*
 * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty,
 * modify the PTE at @ptentp to be young or dirty, respectively.
 */
#define FPB_MERGE_YOUNG_DIRTY                ((__force fpb_t)BIT(4))

/*
 * Normalise @pte for batch comparison: clear the dirty, soft-dirty and
 * writable bits unless the matching FPB_RESPECT_* flag asks for them to be
 * compared, and always clear the young bit.
 */
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
        const bool ignore_dirty = !(flags & FPB_RESPECT_DIRTY);
        const bool ignore_soft_dirty = !(flags & FPB_RESPECT_SOFT_DIRTY);
        const bool ignore_write = !(flags & FPB_RESPECT_WRITE);

        if (ignore_dirty)
                pte = pte_mkclean(pte);
        if (likely(ignore_soft_dirty))
                pte = pte_clear_soft_dirty(pte);
        if (likely(ignore_write))
                pte = pte_wrprotect(pte);

        return pte_mkold(pte);
}

/**
 * folio_pte_batch_flags - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
 * @ptep: Page table pointer for the first entry.
 * @ptentp: Pointer to a COPY of the first page table entry whose flags this
 *            function updates based on @flags if appropriate.
 * @max_nr: The maximum number of table entries to consider.
 * @flags: Flags to modify the PTE batch semantics.
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio in a single VMA and a single page table.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit (unless FPB_RESPECT_WRITE is set), dirty bit
 * (unless FPB_RESPECT_DIRTY is set) and soft-dirty bit (unless
 * FPB_RESPECT_SOFT_DIRTY is set).
 *
 * @ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single VMA and
 * a single page table.
 *
 * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
 * be updated: it's crucial that a pointer to a COPY of the first
 * page table entry, obtained through ptep_get(), is provided as @ptentp.
 *
 * This function will be inlined to optimize based on the input parameters;
 * consider using folio_pte_batch() instead if applicable.
 *
 * Return: the number of table entries in the batch.
 */
static inline unsigned int folio_pte_batch_flags(struct folio *folio,
                struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp,
                unsigned int max_nr, fpb_t flags)
{
        bool any_writable = false, any_young = false, any_dirty = false;
        pte_t expected_pte, pte = *ptentp;
        unsigned int nr, cur_nr;

        VM_WARN_ON_FOLIO(!pte_present(pte), folio);
        VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
        VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
        /*
         * Ensure this is a pointer to a copy not a pointer into a page table.
         * If this is a stack value, it won't be a valid virtual address, but
         * that's fine because it also cannot be pointing into the page table.
         */
        VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));

        /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
        max_nr = min_t(unsigned long, max_nr,
                       folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));

        /*
         * Start past the entries that pte_batch_hint() reports for the first
         * PTE; the PTE expected next is the first one advanced by that many
         * PFNs, with the ignored bits cleared.
         */
        nr = pte_batch_hint(ptep, pte);
        expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
        ptep = ptep + nr;

        while (nr < max_nr) {
                pte = ptep_get(ptep);

                if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte))
                        break;

                /* Collect the bits to merge from the unmodified PTE value. */
                if (flags & FPB_MERGE_WRITE)
                        any_writable |= pte_write(pte);
                if (flags & FPB_MERGE_YOUNG_DIRTY) {
                        any_young |= pte_young(pte);
                        any_dirty |= pte_dirty(pte);
                }

                cur_nr = pte_batch_hint(ptep, pte);
                expected_pte = pte_advance_pfn(expected_pte, cur_nr);
                ptep += cur_nr;
                nr += cur_nr;
        }

        /* Fold the collected bits into the caller's copy of the first PTE. */
        if (any_writable)
                *ptentp = pte_mkwrite(*ptentp, vma);
        if (any_young)
                *ptentp = pte_mkyoung(*ptentp);
        if (any_dirty)
                *ptentp = pte_mkdirty(*ptentp);

        /* nr advances in steps of the batch hint, so it may overshoot max_nr. */
        return min(nr, max_nr);
}

unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
                unsigned int max_nr);

/**
 * pte_move_swp_offset - Shift the swap entry offset field of a swap pte
 *         by delta
 * @pte: The initial pte state; is_swap_pte(pte) must be true and
 *         non_swap_entry() must be false.
 * @delta: Signed distance to move the offset by; a positive delta moves
 *         forward, a negative delta moves backward
 *
 * Only the swap offset changes: the swap type and any swp pte bits are
 * maintained. The resulting pte is returned.
 */
static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
{
        const swp_entry_t entry = pte_to_swp_entry(pte);
        pte_t moved;

        /* Rebuild the entry with the same type and the shifted offset. */
        moved = __swp_entry_to_pte(__swp_entry(swp_type(entry),
                                               (swp_offset(entry) + delta)));

        /* Re-apply the swp pte bits that were set on the original pte. */
        if (pte_swp_soft_dirty(pte))
                moved = pte_swp_mksoft_dirty(moved);
        if (pte_swp_exclusive(pte))
                moved = pte_swp_mkexclusive(moved);
        if (pte_swp_uffd_wp(pte))
                moved = pte_swp_mkuffd_wp(moved);

        return moved;
}


/**
 * pte_next_swp_offset - Step a swap pte forward by one swap offset.
 * @pte: The initial pte state; is_swap_pte(pte) must be true and
 *         non_swap_entry() must be false.
 *
 * Equivalent to pte_move_swp_offset() with a delta of 1: the swap offset is
 * incremented while the swap type and any swp pte bits are maintained.
 * The resulting pte is returned.
 */
static inline pte_t pte_next_swp_offset(pte_t pte)
{
        const long one_slot_forward = 1;

        return pte_move_swp_offset(pte, one_slot_forward);
}

/**
 * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
 * @start_ptep: Page table pointer for the first entry.
 * @max_nr: The maximum number of table entries to consider.
 * @pte: Page table entry for the first entry.
 *
 * A batch is a run of consecutive (non-present) PTEs holding swap entries
 * with consecutive offsets, the same swap type, matching swp pte bits and
 * the same swap cgroup id as the first entry.
 *
 * max_nr must be at least one and must be limited by the caller so scanning
 * cannot exceed a single page table.
 *
 * Return: the number of table entries in the batch.
 */
static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
        pte_t expected_pte = pte_next_swp_offset(pte);
        const pte_t *end_ptep = start_ptep + max_nr;
        swp_entry_t entry = pte_to_swp_entry(pte);
        unsigned short cgroup_id;
        pte_t *cur;

        VM_WARN_ON(max_nr < 1);
        VM_WARN_ON(!is_swap_pte(pte));
        VM_WARN_ON(non_swap_entry(entry));

        cgroup_id = lookup_swap_cgroup_id(entry);

        /* The first entry is part of the batch by definition; scan the rest. */
        for (cur = start_ptep + 1; cur < end_ptep; cur++) {
                pte = ptep_get(cur);

                if (!pte_same(pte, expected_pte) ||
                    lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
                        break;

                expected_pte = pte_next_swp_offset(expected_pte);
        }

        return cur - start_ptep;
}
#endif /* CONFIG_MMU */

void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
                                                int nr_throttled);
/*
 * Call __acct_reclaim_writeback() for @folio, but only when its node's
 * nr_writeback_throttled counter is non-zero.
 */
static inline void acct_reclaim_writeback(struct folio *folio)
{
        pg_data_t *pgdat = folio_pgdat(folio);
        int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);

        if (!nr_throttled)
                return;

        __acct_reclaim_writeback(pgdat, folio, nr_throttled);
}

/*
 * Wake up the waiters on @pgdat's VMSCAN_THROTTLE_ISOLATED reclaim wait
 * queue, if that queue is active.
 */
static inline void wake_throttle_isolated(pg_data_t *pgdat)
{
        wait_queue_head_t *wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];

        if (!waitqueue_active(wqh))
                return;

        wake_up(wqh);
}

vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf);
/*
 * Wrapper around __vmf_anon_prepare(): when the result has VM_FAULT_RETRY
 * set, vma_end_read() is called on the faulting VMA before the result is
 * handed back. The result itself is returned unchanged.
 */
static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
{
        const vm_fault_t ret = __vmf_anon_prepare(vmf);

        if (likely(!(ret & VM_FAULT_RETRY)))
                return ret;

        vma_end_read(vmf->vma);
        return ret;
}

vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
void folio_activate(struct folio *folio);

void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *start_vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);

struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details);
void zap_page_range_single_batched(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long addr,
                unsigned long size, struct zap_details *details);
int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
                           gfp_t gfp);

void page_cache_ra_order(struct readahead_control *, struct file_ra_state *);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
/*
 * Convenience wrapper around force_page_cache_ra(): sets up a readahead
 * control named ractl with DEFINE_READAHEAD() from @file, @file's f_ra
 * state, @mapping and @index, then requests @nr_to_read from it.
 *
 * NOTE(review): @file is used to reach f_ra, so callers presumably must pass
 * a non-NULL file - confirm against callers.
 */
static inline void force_page_cache_readahead(struct address_space *mapping,
                struct file *file, pgoff_t index, unsigned long nr_to_read)
{
        DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
        force_page_cache_ra(&ractl, nr_to_read);
}

unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
                loff_t end);
long mapping_evict_folio(struct address_space *mapping, struct folio *folio);
unsigned long mapping_try_invalidate(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_failed);

/**
 * folio_evictable - Test whether a folio is evictable.
 * @folio: The folio to test.
 *
 * Test whether @folio is evictable -- i.e., should be placed on
 * active/inactive lists vs unevictable list.
 *
 * Reasons folio might not be evictable:
 * 1. folio's mapping marked unevictable
 * 2. One of the pages in the folio is part of an mlocked VMA
 */
static inline bool folio_evictable(struct folio *folio)
{
        bool ret;

        /* Prevent address_space of inode and swap cache from being freed */
        rcu_read_lock();
        ret = !mapping_unevictable(folio_mapping(folio)) &&
                        !folio_test_mlocked(folio);
        rcu_read_unlock();
        return ret;
}

/*
 * Turn a non-refcounted page (->_refcount == 0) into refcounted with
 * a count of one.
 *
 * With VM debugging checks enabled, it is a bug to call this on a tail page
 * or on a page whose reference count is already non-zero.
 */
static inline void set_page_refcounted(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(page_ref_count(page), page);
        set_page_count(page, 1);
}

/*
 * Return true if a folio needs ->release_folio() calling upon it.
 */
static inline bool folio_needs_release(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        return folio_has_private(folio) ||
                (mapping && mapping_release_always(mapping));
}

extern unsigned long highest_memmap_pfn;

/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
#define MAX_RECLAIM_RETRIES 16

/*
 * in mm/vmscan.c:
 */
bool folio_isolate_lru(struct folio *folio);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
#ifdef CONFIG_NUMA
int user_proactive_reclaim(char *buf,
                           struct mem_cgroup *memcg, pg_data_t *pgdat);
#else
/* Stub for !CONFIG_NUMA builds: ignores its arguments and returns 0. */
static inline int user_proactive_reclaim(char *buf,
                           struct mem_cgroup *memcg, pg_data_t *pgdat)
{
        return 0;
}
#endif

/*
 * in mm/rmap.c:
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);

/*
 * in mm/page_alloc.c
 */
/* Scale a page count to KiB: shifts x left by (PAGE_SHIFT - 10). */
#define K(x) ((x) << (PAGE_SHIFT-10))

extern char * const zone_names[MAX_NR_ZONES];

/* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);

extern int min_free_kbytes;
extern int defrag_mode;

void setup_per_zone_wmarks(void);
void calculate_min_free_kbytes(void);
int __meminit init_per_zone_wmark_min(void);
void page_alloc_sysctl_init(void);

/*
 * Structure for holding the mostly immutable allocation parameters passed
 * between functions involved in allocations, including the alloc_pages*
 * family of functions.
 *
 * nodemask, migratetype and highest_zoneidx are initialized only once in
 * __alloc_pages() and then never change.
 *
 * zonelist, preferred_zoneref and highest_zoneidx are set first in
 * __alloc_pages() for the fast path, and might be later changed
 * in __alloc_pages_slowpath(). All other functions pass the whole structure
 * by a const pointer.
 */
struct alloc_context {
        struct zonelist *zonelist;
        nodemask_t *nodemask;
        struct zoneref *preferred_zoneref;
        int migratetype;

        /*
         * highest_zoneidx represents highest usable zone index of
         * the allocation request. Due to the nature of the zone,
         * memory on lower zone than the highest_zoneidx will be
         * protected by lowmem_reserve[highest_zoneidx].
         *
         * highest_zoneidx is also used by reclaim/compaction to limit
         * the target zone since higher zone than this index cannot be
         * usable for this allocation request.
         */
        enum zone_type highest_zoneidx;
        /* NOTE(review): not described above; semantics set by the allocator. */
        bool spread_dirty_pages;
};

/*
 * This function returns the order of a free page in the buddy system. In
 * general, page_zone(page)->lock must be held by the caller to prevent the
 * page from being allocated in parallel and returning garbage as the order.
 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
 * page cannot be allocated or merged in parallel. Alternatively, it must
 * handle invalid values gracefully, and use buddy_order_unsafe() below.
 */
static inline unsigned int buddy_order(struct page *page)
{
        unsigned int order;

        /* PageBuddy() must be checked by the caller */
        order = page_private(page);

        return order;
}

/*
 * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
 * PageBuddy() should be checked first by the caller to minimize race window,
 * and invalid values must be handled gracefully.
 *
 * READ_ONCE is used so that if the caller assigns the result into a local
 * variable and e.g. tests it for valid range before using, the compiler cannot
 * decide to remove the variable and inline the page_private(page) multiple
 * times, potentially observing different values in the tests and the actual
 * use of the result.
 */
#define buddy_order_unsafe(page)        READ_ONCE(page_private(page))

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set PageBuddy.
 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline bool page_is_buddy(struct page *page, struct page *buddy,
                                 unsigned int order)
{
        /* The buddy must be either a guard page or in the buddy system. */
        if (!(page_is_guard(buddy) || PageBuddy(buddy)))
                return false;

        /*
         * Orders must match and both pages must share a zone. The zone
         * check is done late to avoid uselessly calculating zone/node ids
         * for pages that could never merge.
         */
        if (buddy_order(buddy) != order ||
            page_zone_id(page) != page_zone_id(buddy))
                return false;

        VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

        return true;
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
 */
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
        /*
         * Flip bit @order of the pfn to get its order-@order buddy. The
         * shift is done in unsigned long, the type of the pfn, so that it
         * is neither an overflowing signed int shift nor sign-extended when
         * combined with page_pfn.
         */
        return page_pfn ^ (1UL << order);
}

/*
 * Find the buddy of @page and validate it.
 * @page: The input page
 * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
 *       function is used in the performance-critical __free_one_page().
 * @order: The order of the page
 * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
 *             page_to_pfn().
 *
 * The found buddy can be a non PageBuddy, out of @page's zone, or its order is
 * not the same as @page. The validation is necessary before using it.
 *
 * Return: the found buddy page or NULL if not found.
 */
static inline struct page *find_buddy_page_pfn(struct page *page,
                        unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
{
        const unsigned long candidate_pfn = __find_buddy_pfn(pfn, order);
        struct page *buddy = page + (candidate_pfn - pfn);

        /* The candidate pfn is reported even if validation fails below. */
        if (buddy_pfn)
                *buddy_pfn = candidate_pfn;

        return page_is_buddy(page, buddy, order) ? buddy : NULL;
}

extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone);

/*
 * Return the struct page for @start_pfn directly when @zone is marked
 * contiguous; otherwise defer to __pageblock_pfn_to_page().
 */
static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone)
{
        if (!zone->contiguous)
                return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);

        return pfn_to_page(start_pfn);
}

void set_zone_contiguous(struct zone *zone);
bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
                           unsigned long nr_pages);

/*
 * Clear @zone's contiguous flag. While it is clear, pageblock_pfn_to_page()
 * goes through __pageblock_pfn_to_page() instead of using pfn_to_page()
 * directly.
 */
static inline void clear_zone_contiguous(struct zone *zone)
{
        zone->contiguous = false;
}

extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
                                    int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
                                        unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order,
                enum meminit_context context);

/*
 * Record @order in the low byte of @folio's _flags_1, and the matching page
 * count in _nr_pages when NR_PAGES_IN_LARGE_FOLIO is defined.
 *
 * This will have no effect, other than possibly generating a warning, if
 * @order is 0 or the caller passes in a non-large folio.
 */
static inline void folio_set_order(struct folio *folio, unsigned int order)
{
        if (WARN_ON_ONCE(!(order && folio_test_large(folio))))
                return;
        VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER);

        /* Replace the low byte in a single store. */
        folio->_flags_1 = order | (folio->_flags_1 & ~0xffUL);
#ifdef NR_PAGES_IN_LARGE_FOLIO
        folio->_nr_pages = 1U << order;
#endif
}

bool __folio_unqueue_deferred_split(struct folio *folio);
/*
 * Fast-path wrapper around __folio_unqueue_deferred_split(). Returns false
 * without calling it when the folio is of order <= 1, is not large
 * rmappable, or has an empty _deferred_list.
 */
static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
        if (!(folio_order(folio) > 1 && folio_test_large_rmappable(folio)))
                return false;

        /*
         * At this point, there is no one trying to add the folio to
         * deferred_list. If folio is not in deferred_list, it's safe
         * to check without acquiring the split_queue_lock.
         */
        if (!data_race(list_empty(&folio->_deferred_list)))
                return __folio_unqueue_deferred_split(folio);

        return false;
}

/*
 * Reinterpret @page as a folio and, if it is non-NULL and large, mark it
 * large rmappable. The folio pointer (possibly NULL) is returned.
 */
static inline struct folio *page_rmappable_folio(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        if (!folio || !folio_test_large(folio))
                return folio;

        folio_set_large_rmappable(folio);
        return folio;
}

/*
 * Initialise the folio-level state of a compound page of the given @order,
 * treating @page (its head page) as a struct folio.
 */
static inline void prep_compound_head(struct page *page, unsigned int order)
{
        struct folio *folio = (struct folio *)page;

        folio_set_order(folio, order);
        /* The mapcount-style counters below start at -1. */
        atomic_set(&folio->_large_mapcount, -1);
        if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
                atomic_set(&folio->_nr_pages_mapped, 0);
        if (IS_ENABLED(CONFIG_MM_ID)) {
                folio->_mm_ids = 0;
                folio->_mm_id_mapcount[0] = -1;
                folio->_mm_id_mapcount[1] = -1;
        }
        /*
         * NOTE(review): these two are skipped for order-1 folios on 32-bit;
         * presumably the fields are not available there - confirm against
         * the struct folio layout.
         */
        if (IS_ENABLED(CONFIG_64BIT) || order > 1) {
                atomic_set(&folio->_pincount, 0);
                atomic_set(&folio->_entire_mapcount, -1);
        }
        /* Matches folio_unqueue_deferred_split(), which ignores order <= 1. */
        if (order > 1)
                INIT_LIST_HEAD(&folio->_deferred_list);
}

/*
 * Initialise the tail page at index @tail_idx of the compound page headed by
 * @head: set its mapping to TAIL_MAPPING, link it to @head and zero its
 * private field.
 */
static inline void prep_compound_tail(struct page *head, int tail_idx)
{
        struct page *tail = &head[tail_idx];

        tail->mapping = TAIL_MAPPING;
        set_compound_head(tail, head);
        set_page_private(tail, 0);
}

void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);

extern int user_min_free_kbytes;

struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid,
                nodemask_t *);
#define __alloc_frozen_pages(...) \
        alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
void free_frozen_pages(struct page *page, unsigned int order);
void free_unref_folios(struct folio_batch *fbatch);

#ifdef CONFIG_NUMA
struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order);
#else
/*
 * !CONFIG_NUMA variant: forwards to __alloc_frozen_pages_noprof() using
 * numa_node_id() as the node and no nodemask.
 */
static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order)
{
        return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL);
}
#endif

#define alloc_frozen_pages(...) \
        alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))

struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
#define alloc_frozen_pages_nolock(...) \
        alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))

extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
extern void zone_pcp_init(struct zone *zone);

extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
                          phys_addr_t min_addr,
                          int nid, bool exact_nid);

void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
                unsigned long, enum meminit_context, struct vmem_altmap *, int,
                bool);

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

/*
 * in mm/compaction.c
 */
/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
struct compact_control {
        struct list_head freepages[NR_PAGE_ORDERS];        /* List of free pages to migrate to */
        struct list_head migratepages;        /* List of pages being migrated */
        unsigned int nr_freepages;        /* Number of isolated free pages */
        unsigned int nr_migratepages;        /* Number of pages to migrate */
        unsigned long free_pfn;                /* isolate_freepages search base */
        /*
         * Acts as an in/out parameter to page isolation for migration.
         * isolate_migratepages uses it as a search base.
         * isolate_migratepages_block will update the value to the next pfn
         * after the last isolated one.
         */
        unsigned long migrate_pfn;
        unsigned long fast_start_pfn;        /* a pfn to start linear scan from */
        /* NOTE(review): presumably the zone being compacted -- confirm in mm/compaction.c */
        struct zone *zone;
        /*
         * NOTE(review): presumably running totals of pages examined by the
         * migration and free scanners respectively -- confirm in
         * mm/compaction.c
         */
        unsigned long total_migrate_scanned;
        unsigned long total_free_scanned;
        unsigned short fast_search_fail;/* failures to use free list searches */
        short search_order;                /* order to start a fast search at */
        const gfp_t gfp_mask;                /* gfp mask of a direct compactor */
        int order;                        /* order a direct compactor needs */
        int migratetype;                /* migratetype of direct compactor */
        const unsigned int alloc_flags;        /* alloc flags of a direct compactor */
        const int highest_zoneidx;        /* zone index of a direct compactor */
        enum migrate_mode mode;                /* Async or sync migration mode */
        bool ignore_skip_hint;                /* Scan blocks even if marked skip */
        bool no_set_skip_hint;                /* Don't mark blocks for skipping */
        bool ignore_block_suitable;        /* Scan blocks considered unsuitable */
        bool direct_compaction;                /* False from kcompactd or /proc/... */
        bool proactive_compaction;        /* kcompactd proactive compaction */
        bool whole_zone;                /* Whole zone should/has been scanned */
        bool contended;                        /* Signal lock contention */
        bool finish_pageblock;                /* Scan the remainder of a pageblock. Used
                                         * when there are potentially transient
                                         * isolation or migration failures to
                                         * ensure forward progress.
                                         */
        bool alloc_contig;                /* alloc_contig_range allocation */
};

/*
 * Used in direct compaction when a page should be taken from the freelists
 * immediately when one is created during the free path.
 */
struct capture_control {
        /* NOTE(review): presumably the compact_control of the direct compactor -- confirm */
        struct compact_control *cc;
        /* NOTE(review): presumably the page taken from the free path, if any -- confirm */
        struct page *page;
};

unsigned long
isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn);
int
isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);

/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);

#endif /* CONFIG_COMPACTION || CONFIG_CMA */

struct cma;

#ifdef CONFIG_CMA
void *cma_reserve_early(struct cma *cma, unsigned long size);
void init_cma_pageblock(struct page *page);
#else
/* !CONFIG_CMA: there is nothing to reserve from, so always report NULL. */
static inline void *cma_reserve_early(struct cma *cma, unsigned long size) { return NULL; }

/* !CONFIG_CMA: no-op. */
static inline void init_cma_pageblock(struct page *page) { }
#endif


int find_suitable_fallback(struct free_area *area, unsigned int order,
                           int migratetype, bool claimable);

/* True when @area has no free pages queued on the list for @migratetype. */
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
        struct list_head *head = &area->free_list[migratetype];

        return list_empty(head);
}

/* mm/util.c */
struct anon_vma *folio_anon_vma(const struct folio *folio);

#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, bool write, int *locked);
bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
                unsigned long bytes);

/*
 * NOTE: This function can't tell whether the folio is "fully mapped" in the
 * range.
 * "Fully mapped" means that all the pages of the folio are associated with
 * the page table of the range, while this function only checks whether the
 * folio range is within the range [start, end). The caller needs to do the
 * page table check itself if it cares about the page table association.
 *
 * Typical usage (like mlock or madvise) is:
 * The caller knows that at least 1 page of the folio is associated with the
 * page table of the VMA and that the range [start, end) intersects the VMA
 * range. The caller wants to know whether the folio is fully associated with
 * the range. It first calls this function to check whether the folio is in
 * the range, then checks the page table to learn whether the folio is fully
 * mapped to the range.
 */
static inline bool
folio_within_range(struct folio *folio, struct vm_area_struct *vma,
                unsigned long start, unsigned long end)
{
        unsigned long nr_vma_pages = vma_pages(vma);
        pgoff_t index, folio_addr;

        VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);

        /* An inverted range cannot contain anything. */
        if (end < start)
                return false;

        /* Clamp [start, end) so that it does not extend past the VMA. */
        start = (start < vma->vm_start) ? vma->vm_start : start;
        end = (end > vma->vm_end) ? vma->vm_end : end;

        /* The folio's page offset must be one that the VMA covers. */
        index = folio_pgoff(folio);
        if (!in_range(index, vma->vm_pgoff, nr_vma_pages))
                return false;

        /* Address within the VMA that corresponds to the folio's offset. */
        folio_addr = vma->vm_start + ((index - vma->vm_pgoff) << PAGE_SHIFT);

        /* The whole folio has to fit between folio_addr and end. */
        return folio_addr >= start && end - folio_addr >= folio_size(folio);
}

/* folio_within_range() applied to the VMA's own [vm_start, vm_end) span. */
static inline bool
folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
{
        unsigned long first = vma->vm_start;
        unsigned long last = vma->vm_end;

        return folio_within_range(folio, vma, first, last);
}

/*
 * mlock_vma_folio() and munlock_vma_folio():
 * should be called with vma's mmap_lock held for read or write,
 * under page table lock for the pte/pmd being added or removed.
 *
 * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at
 * the end of folio_remove_rmap_*(); but new anon folios are managed by
 * folio_add_lru_vma() calling mlock_new_folio().
 */
void mlock_folio(struct folio *folio);
static inline void mlock_vma_folio(struct folio *folio,
                                struct vm_area_struct *vma)
{
        /*
         * Only act when VM_LOCKED is set and no VM_SPECIAL bit is. Testing
         * VM_SPECIAL as well does two jobs:
         * 1) the VM_IO bit keeps migration from double-counting during mlock;
         * 2) mmap_region() and mlock_fixup() never leave VM_LOCKED on a
         *    VM_SPECIAL vma, but while file->f_op->mmap() is using
         *    vm_insert_page(s) there is a window in which VM_LOCKED may still
         *    be set as VM_SPECIAL bits get added: skip the folio in that case.
         */
        if (!unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
                return;

        mlock_folio(folio);
}

void munlock_folio(struct folio *folio);
static inline void munlock_vma_folio(struct folio *folio,
                                        struct vm_area_struct *vma)
{
        /* Nothing to undo unless the VMA is VM_LOCKED. */
        if (!unlikely(vma->vm_flags & VM_LOCKED))
                return;

        /*
         * Munlock unconditionally. Ideally this would happen only when a
         * page of the folio is unmapped from the VMA and the folio is
         * therefore no longer fully mapped, but that is not easy to
         * confirm here; page reclaim corrects the state if this turns out
         * to be wrong.
         */
        munlock_folio(folio);
}

void mlock_new_folio(struct folio *folio);
bool need_mlock_drain(int cpu);
void mlock_drain_local(void);
void mlock_drain_remote(int cpu);

extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

/**
 * vma_address - Find the virtual address a page range is mapped at
 * @vma: The vma which maps this object.
 * @pgoff: The page offset within its object.
 * @nr_pages: The number of pages to consider.
 *
 * Return: the first address in @vma at which any page of the range
 * [@pgoff, @pgoff + @nr_pages) appears, or -EFAULT (cast to unsigned long)
 * if no page of the range is mapped by @vma.
 */
static inline unsigned long vma_address(const struct vm_area_struct *vma,
                pgoff_t pgoff, unsigned long nr_pages)
{
        unsigned long address;

        if (pgoff < vma->vm_pgoff) {
                /*
                 * The range begins before the VMA: it is visible only if its
                 * last page reaches vm_pgoff, and then it shows up at
                 * vm_start. The pgoff test above avoids the possibility of
                 * a wrap to 0 on 32-bit.
                 */
                if (pgoff + nr_pages - 1 >= vma->vm_pgoff)
                        return vma->vm_start;
                return -EFAULT;
        }

        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Reject an address beyond the vma (or one that wrapped through 0). */
        if (address < vma->vm_start || address >= vma->vm_end)
                return -EFAULT;

        return address;
}

/*
 * Counterpart of vma_address(): the user virtual address from which none of
 * the range can be found in the vma any more. Assumes vma_address() already
 * produced a good starting address.
 */
static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
{
        struct vm_area_struct *vma = pvmw->vma;
        unsigned long end_addr;
        pgoff_t end_pgoff;

        /* Common case; also needed because ->pgoff is invalid for KSM. */
        if (pvmw->nr_pages == 1)
                return pvmw->address + PAGE_SIZE;

        end_pgoff = pvmw->pgoff + pvmw->nr_pages;
        end_addr = vma->vm_start + ((end_pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Keep the result only if it lies within the vma and did not wrap. */
        if (end_addr >= vma->vm_start && end_addr <= vma->vm_end)
                return end_addr;

        return vma->vm_end;
}

/*
 * Returns @fpin unchanged when it is already set. Otherwise, on a first
 * attempt that allows retry and is not RETRY_NOWAIT, takes a reference on the
 * faulting VMA's file, releases the fault lock and returns that file.
 */
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
                                                    struct file *fpin)
{
        int flags = vmf->flags;

        /* A file is already pinned: nothing more to do. */
        if (fpin)
                return fpin;

        /* Pin and unlock only on the first attempt that permits a retry. */
        if (!fault_flag_allow_retry_first(flags))
                return fpin;

        /*
         * FAULT_FLAG_RETRY_NOWAIT means the caller does not want to wait on
         * page locks or anything else, so leave the lock alone.
         */
        if (flags & FAULT_FLAG_RETRY_NOWAIT)
                return fpin;

        fpin = get_file(vmf->vma->vm_file);
        release_fault_lock(vmf);
        return fpin;
}
#else /* !CONFIG_MMU */
/* !CONFIG_MMU stand-ins: each one does nothing. */
static inline void unmap_mapping_folio(struct folio *folio)
{
}

static inline void mlock_new_folio(struct folio *folio)
{
}

/* There is never anything to drain without an MMU. */
static inline bool need_mlock_drain(int cpu)
{
        return false;
}

static inline void mlock_drain_local(void)
{
}

static inline void mlock_drain_remote(int cpu)
{
}

static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { }
#endif /* !CONFIG_MMU */

/* Memory initialisation debug and verification */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
DECLARE_STATIC_KEY_TRUE(deferred_pages);

bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

void init_deferred_page(unsigned long pfn, int nid);

/*
 * Levels accepted by mminit_dprintk(). With CONFIG_DEBUG_MEMORY_INIT a
 * message is emitted only when its level is below mminit_loglevel;
 * MMINIT_WARNING goes out via pr_warn(), the higher levels via KERN_DEBUG.
 */
enum mminit_level {
        MMINIT_WARNING = 0,
        MMINIT_VERIFY = 1,
        MMINIT_TRACE = 2,
};

#ifdef CONFIG_DEBUG_MEMORY_INIT

extern int mminit_loglevel;

/*
 * mminit_dprintk - conditionally print a memory-init diagnostic.
 * @level:  message level; printed only when it is below mminit_loglevel.
 * @prefix: string literal inserted after "mminit::".
 * @fmt:    printf-style format string literal, followed by its arguments.
 *
 * Levels up to MMINIT_WARNING are printed with pr_warn(), anything higher
 * with printk(KERN_DEBUG ...).
 *
 * @level is parenthesized in the expansion so that an expression argument
 * (e.g. a conditional) is compared as a whole. Note that it is still
 * evaluated up to twice, so it must not have side effects.
 */
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
        if ((level) < mminit_loglevel) { \
                if ((level) <= MMINIT_WARNING) \
                        pr_warn("mminit::" prefix " " fmt, ##arg);        \
                else \
                        printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
        } \
} while (0)

extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else

/* !CONFIG_DEBUG_MEMORY_INIT: the diagnostics below all compile to nothing. */
static inline void mminit_dprintk(enum mminit_level level,
                                const char *prefix, const char *fmt, ...) { }

static inline void mminit_verify_pageflags_layout(void) { }

static inline void mminit_verify_zonelist(void) { }
#endif /* CONFIG_DEBUG_MEMORY_INIT */

/*
 * node_reclaim() result codes. NODE_RECLAIM_NOSCAN is also what the
 * !CONFIG_NUMA node_reclaim() stub returns.
 */
#define NODE_RECLAIM_NOSCAN        (-2)
#define NODE_RECLAIM_FULL        (-1)
#define NODE_RECLAIM_SOME        (0)
#define NODE_RECLAIM_SUCCESS        (1)

#ifdef CONFIG_NUMA
extern int node_reclaim_mode;

extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
#define node_reclaim_mode 0

/* !CONFIG_NUMA: node reclaim never scans anything. */
static inline int
node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order)
{
        return NODE_RECLAIM_NOSCAN;
}

/* !CONFIG_NUMA: there is no other node to pick. */
static inline int
find_next_best_node(int node, nodemask_t *used_node_mask)
{
        return NUMA_NO_NODE;
}
#endif

/*
 * True when node_reclaim_mode has at least one of RECLAIM_ZONE,
 * RECLAIM_WRITE or RECLAIM_UNMAP set.
 */
static inline bool node_reclaim_enabled(void)
{
        return (node_reclaim_mode &
                (RECLAIM_ZONE | RECLAIM_WRITE | RECLAIM_UNMAP)) != 0;
}

/*
 * mm/memory-failure.c
 */
#ifdef CONFIG_MEMORY_FAILURE
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
typedef int hwpoison_filter_func_t(struct page *p);
void hwpoison_filter_register(hwpoison_filter_func_t *filter);
void hwpoison_filter_unregister(void);

#define MAGIC_HWPOISON        0x48575053U        /* HWPS */
void SetPageHWPoisonTakenOff(struct page *page);
void ClearPageHWPoisonTakenOff(struct page *page);
bool take_page_off_buddy(struct page *page);
bool put_page_back_buddy(struct page *page);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
                     struct vm_area_struct *vma, struct list_head *to_kill,
                     unsigned long ksm_addr);
unsigned long page_mapped_in_vma(const struct page *page,
                struct vm_area_struct *vma);

#else
/* !CONFIG_MEMORY_FAILURE: unmapping is not available, always -EBUSY. */
static inline int
unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
        const int err = -EBUSY;

        return err;
}
#endif

extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

extern void set_pageblock_order(void);
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
                                            struct list_head *folio_list);
/*
 * Allocator flags.
 *
 * The ALLOC_WMARK_* values double as an index into zone->watermark.
 */
#define ALLOC_WMARK_MIN                WMARK_MIN
#define ALLOC_WMARK_LOW                WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH

/* Don't check watermarks at all. */
#define ALLOC_NO_WATERMARKS        0x04

/* Mask that extracts the watermark bits. */
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS - 1)

/*
 * Async oom victim reclaim (the oom_reaper) exists only on MMU archs, so
 * on !MMU a reduced access to memory reserves cannot be assumed to be
 * sufficient.
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM                0x08
#else
#define ALLOC_OOM                ALLOC_NO_WATERMARKS
#endif

/*
 * Caller cannot block. Allow access to 25% of the min watermark, or 62.5%
 * if __GFP_HIGH is set.
 */
#define ALLOC_NON_BLOCK                0x10

/* __GFP_HIGH set. Allow access to 50% of the min watermark. */
#define ALLOC_MIN_RESERVE        0x20

/* Check for correct cpuset. */
#define ALLOC_CPUSET                0x40

/* Allow allocations from CMA areas. */
#define ALLOC_CMA                0x80

/* Avoid mixing pageblock types; compiled out without CONFIG_ZONE_DMA32. */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT        0x100
#else
#define ALLOC_NOFRAGMENT        0x0
#endif

/* Allows access to MIGRATE_HIGHATOMIC. */
#define ALLOC_HIGHATOMIC        0x200

/* Only use spin_trylock in the allocation path. */
#define ALLOC_TRYLOCK                0x400

/* Allow waking of kswapd; __GFP_KSWAPD_RECLAIM set. */
#define ALLOC_KSWAPD                0x800

/* Flags that allow allocations below the min watermark. */
#define ALLOC_RESERVES \
        (ALLOC_NON_BLOCK | ALLOC_MIN_RESERVE | ALLOC_HIGHATOMIC | ALLOC_OOM)

enum ttu_flags;
struct tlbflush_unmap_batch;


/*
 * only for MM internal work items which do not depend on
 * any allocations or locks which might depend on allocations
 */
extern struct workqueue_struct *mm_percpu_wq;

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
/* !CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH: all three helpers are no-ops. */
static inline void try_to_unmap_flush(void) { }
static inline void try_to_unmap_flush_dirty(void) { }
static inline void flush_tlb_batched_pending(struct mm_struct *mm) { }
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];

void setup_zone_pageset(struct zone *zone);

struct migration_target_control {
        int nid;                /* preferred node id */
        /* NOTE(review): presumably the nodes allowed as migration targets -- confirm */
        nodemask_t *nmask;
        /* NOTE(review): presumably the gfp flags used to allocate the target -- confirm */
        gfp_t gfp_mask;
        /* NOTE(review): presumably the reason this migration was requested -- confirm */
        enum migrate_reason reason;
};

/*
 * mm/filemap.c
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
                              struct folio *folio, loff_t fpos, size_t size);

/*
 * mm/vmalloc.c
 */
#ifdef CONFIG_MMU
void __init vmalloc_init(void);
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift);
unsigned int get_vm_area_page_order(struct vm_struct *vm);
#else
/* !CONFIG_MMU: nothing to initialise. */
static inline void vmalloc_init(void) { }

/* !CONFIG_MMU: mapping pages is not possible, always fails with -EINVAL. */
static inline int __must_check
vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        return -EINVAL;
}
#endif

int __must_check __vmap_pages_range_noflush(unsigned long addr,
                               unsigned long end, pgprot_t prot,
                               struct page **pages, unsigned int page_shift);

void vunmap_range_noflush(unsigned long start, unsigned long end);

void __vunmap_range_noflush(unsigned long start, unsigned long end);

int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
                      unsigned long addr, int *flags, bool writable,
                      int *last_cpupid);

void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_folio(struct folio *folio);

struct vm_struct *__get_vm_area_node(unsigned long size,
                                     unsigned long align, unsigned long shift,
                                     unsigned long vm_flags, unsigned long start,
                                     unsigned long end, int node, gfp_t gfp_mask,
                                     const void *caller);

/*
 * mm/gup.c
 */
int __must_check try_grab_folio(struct folio *folio, int refs,
                                unsigned int flags);

/*
 * mm/huge_memory.c
 */
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
               pud_t *pud, bool write);
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
               pmd_t *pmd, bool write);

/*
 * Convert a size string (mem suffixes allowed, as understood by memparse())
 * into a page order. Handy for kernel parameters.
 *
 * Returns the order, or -EINVAL when the size is not a power of two or when
 * the bit for the resulting order is not set in @valid_orders.
 */
static inline int get_order_from_str(const char *size_str,
                                     unsigned long valid_orders)
{
        char *rest;
        unsigned long bytes = memparse(size_str, &rest);
        int order;

        if (!is_power_of_2(bytes))
                return -EINVAL;

        order = get_order(bytes);

        return (BIT(order) & ~valid_orders) ? -EINVAL : order;
}

/* GUP flags occupying bits 16..22; collected in INTERNAL_GUP_FLAGS below. */
enum {
        FOLL_TOUCH         = 1 << 16, /* mark page accessed */
        FOLL_TRIED         = 1 << 17, /* a retry, previous pass started an IO */
        FOLL_REMOTE        = 1 << 18, /* we are working on non-current tsk/mm */
        FOLL_PIN           = 1 << 19, /* pages must be released via unpin_user_page */
        FOLL_FAST_ONLY     = 1 << 20, /* gup_fast: prevent fall-back to slow gup */
        FOLL_UNLOCKABLE    = 1 << 21, /* allow unlocking the mmap lock */
        /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
        FOLL_MADV_POPULATE = 1 << 22,
};

/* Union of every flag declared in the enum above. */
#define INTERNAL_GUP_FLAGS \
        (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | FOLL_FAST_ONLY | \
         FOLL_UNLOCKABLE | FOLL_MADV_POPULATE)

/*
 * Indicates for which pages that are write-protected in the page table,
 * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
 * GUP pin will remain consistent with the pages mapped into the page tables
 * of the MM.
 *
 * Temporary unmapping of PageAnonExclusive() pages or clearing of
 * PageAnonExclusive() has to protect against concurrent GUP:
 * * Ordinary GUP: Using the PT lock
 * * GUP-fast and fork(): mm->write_protect_seq
 * * GUP-fast and KSM or temporary unmapping (swap, migration): see
 *    folio_try_share_anon_rmap_*()
 *
 * Must be called with the (sub)page that's actually referenced via the
 * page table entry, which might not necessarily be the head page for a
 * PTE-mapped THP.
 *
 * If the vma is NULL, we're coming from the GUP-fast path and might have
 * to fallback to the slow path just to lookup the vma.
 */
static inline bool gup_must_unshare(struct vm_area_struct *vma,
                                    unsigned int flags, struct page *page)
{
        const unsigned int pin_mode = flags & (FOLL_WRITE | FOLL_PIN);

        /*
         * Only a read-only pin can need unsharing. FOLL_WRITE is handled
         * implicitly: the page table entry has to be writable, and if it
         * references (part of) an anonymous folio, that part is required to
         * be marked exclusive.
         */
        if (pin_mode != FOLL_PIN)
                return false;

        /*
         * Note: PageAnon(page) is stable until the page is actually getting
         * freed.
         */
        if (PageAnon(page)) {
                /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
                if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                        smp_rmb();

                /*
                 * KSM pages cannot be exclusive and consequently cannot get
                 * pinned.
                 */
                return !PageAnonExclusive(page);
        }

        /*
         * Not anonymous. Only R/O long-term pinning matters here: R/O
         * short-term pinning does not have the semantics to observe
         * successive changes through the process page tables.
         */
        if (!(flags & FOLL_LONGTERM))
                return false;

        /* The vma is required for the final decision ... */
        if (!vma)
                return true;

        /*
         * ... because only writable private ("COW") mappings need their COW
         * broken early.
         */
        return is_cow_mapping(vma->vm_flags);
}

extern bool mirrored_kernelcore;
bool memblock_has_mirror(void);
void memblock_free_all(void);

/* Store @start, @end and @pgoff into the VMA's vm_start, vm_end and vm_pgoff. */
static __always_inline void vma_set_range(struct vm_area_struct *vma,
                                          unsigned long start, unsigned long end,
                                          pgoff_t pgoff)
{
        vma->vm_pgoff = pgoff;
        vma->vm_start = start;
        vma->vm_end = end;
}

static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
        /*
         * The config check has to come first: when soft-dirty is not
         * compiled in, VM_SOFTDIRTY is defined as 0x0 and
         * !(vm_flags & VM_SOFTDIRTY) would be constantly true.
         *
         * Soft-dirty is special in that tracking is enabled while the vma
         * flag is NOT set.
         */
        return IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&
               !(vma->vm_flags & VM_SOFTDIRTY);
}

/* True when soft-dirty tracking is enabled for @vma and @pmd is not soft-dirty. */
static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd)
{
        if (!vma_soft_dirty_enabled(vma))
                return false;

        return !pmd_soft_dirty(pmd);
}

/* True when soft-dirty tracking is enabled for @vma and @pte is not soft-dirty. */
static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte)
{
        if (!vma_soft_dirty_enabled(vma))
                return false;

        return !pte_soft_dirty(pte);
}

void __meminit __init_single_page(struct page *page, unsigned long pfn,
                                unsigned long zone, int nid);
void __meminit __init_page_from_nid(unsigned long pfn, int nid);

/* shrinker related functions */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
                          int priority);

#ifdef CONFIG_SHRINKER_DEBUG
/*
 * Format the shrinker's name with kvasprintf_const() and store it in
 * shrinker->name. Returns 0 on success, -ENOMEM when that yields NULL.
 */
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
                        struct shrinker *shrinker, const char *fmt, va_list ap)
{
        shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
        if (!shrinker->name)
                return -ENOMEM;

        return 0;
}

/* Release shrinker->name with kfree_const() and reset the pointer to NULL. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
        kfree_const(shrinker->name);

        /* Clear the now-stale pointer. */
        shrinker->name = NULL;
}

extern int shrinker_debugfs_add(struct shrinker *shrinker);
extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
                                              int *debugfs_id);
extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
                                    int debugfs_id);
#else /* CONFIG_SHRINKER_DEBUG */
/* !CONFIG_SHRINKER_DEBUG: there is no debugfs state, so every helper is trivial. */
static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; }

static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
                                              const char *fmt, va_list ap) { return 0; }

static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { }

/* Reports "no entry": *debugfs_id is set to -1 and NULL is returned. */
static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
                                                     int *debugfs_id)
{
        struct dentry *none = NULL;

        *debugfs_id = -1;
        return none;
}

static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
                                           int debugfs_id) { }
#endif /* CONFIG_SHRINKER_DEBUG */

/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;
/*
 * For a mapping that is neither DAX nor shmem, install
 * workingset_update_node() as the xa_state update callback and point its
 * lru at shadow_nodes. Other mappings leave @xas untouched.
 */
#define mapping_set_update(xas, mapping) do {                        \
        if (!(dax_mapping(mapping) || shmem_mapping(mapping))) {        \
                xas_set_update(xas, workingset_update_node);        \
                xas_set_lru(xas, &shadow_nodes);                \
        }                                                        \
} while (0)

/* mremap.c */
unsigned long move_page_tables(struct pagetable_move_control *pmc);

#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
#else /* CONFIG_UNACCEPTED_MEMORY */
/* !CONFIG_UNACCEPTED_MEMORY: no-op. */
static inline void accept_page(struct page *page) { }
#endif /* CONFIG_UNACCEPTED_MEMORY */

/* pagewalk.c */
int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private);
int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
                          unsigned long end, const struct mm_walk_ops *ops,
                          pgd_t *pgd, void *private);

/* pt_reclaim.c */
bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
              pmd_t pmdval);
void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
                     struct mmu_gather *tlb);

#ifdef CONFIG_PT_RECLAIM
bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
                           struct zap_details *details);
#else
/* !CONFIG_PT_RECLAIM: page table reclaim is never enabled. */
static inline bool
reclaim_pt_is_enabled(unsigned long start, unsigned long end,
                      struct zap_details *details)
{
        const bool enabled = false;

        return enabled;
}
#endif /* CONFIG_PT_RECLAIM */

void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);

#endif        /* __MM_INTERNAL_H */






























































































































































































































































































































































































































































































































































































































































































































































































































    4 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_MAPLE_TREE_H
#define _LINUX_MAPLE_TREE_H
/*
 * Maple Tree - An RCU-safe adaptive tree for storing ranges
 * Copyright (c) 2018-2022 Oracle
 * Authors:     Liam R. Howlett <Liam.Howlett@Oracle.com>
 *              Matthew Wilcox <willy@infradead.org>
 */

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
/* #define CONFIG_MAPLE_RCU_DISABLED */

/*
 * Allocated nodes are mutable until they have been inserted into the tree,
 * at which time they cannot change their type until they have been removed
 * from the tree and an RCU grace period has passed.
 *
 * Removed nodes have their ->parent set to point to themselves.  RCU readers
 * check ->parent before relying on the value that they loaded from the
 * slots array.  This lets us reuse the slots array for the RCU head.
 *
 * Nodes in the tree point to their parent unless bit 0 is set.
 */
/*
 * Slot counts for each node layout.  The counts differ between 64bit and
 * 32bit builds; the byte totals noted beside them are the same in both.
 */
#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64)
/* 64bit sizes */
#define MAPLE_NODE_SLOTS        31        /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS        16        /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS        10        /* 240 bytes */
#define MAPLE_ALLOC_SLOTS        (MAPLE_NODE_SLOTS - 1)        /* length of maple_alloc's slot[] */
#else
/* 32bit sizes */
#define MAPLE_NODE_SLOTS        63        /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS        32        /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS        21        /* 240 bytes */
#define MAPLE_ALLOC_SLOTS        (MAPLE_NODE_SLOTS - 2)        /* length of maple_alloc's slot[] */
#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */

/* 255 == 0xFF: covers the 8 low bits of a value. */
#define MAPLE_NODE_MASK                255UL

/*
 * The node->parent of the root node has bit 0 set and the rest of the pointer
 * is a pointer to the tree itself.  No more bits are available in this pointer
 * (on m68k, the data structure may only be 2-byte aligned).
 *
 * Internal non-root nodes can only have maple_range_* nodes as parents.  The
 * parent pointer is 256B aligned like all other tree nodes.  When storing 32
 * or 64 bit values, the offset can fit into 4 bits.  The 16 bit values need an
 * extra bit to store the offset.  This extra bit comes from a reuse of the last
 * bit in the node type.  This is possible by using bit 1 to indicate if bit 2
 * is part of the type or the slot.
 *
 * Once the type is decided, the decision of an allocation range type or a
 * range type is done by examining the immutable tree flag for the
 * MT_FLAGS_ALLOC_RANGE flag.
 *
 *  Node types:
 *   0b??1 = Root
 *   0b?00 = 16 bit nodes
 *   0b010 = 32 bit nodes
 *   0b110 = 64 bit nodes
 *
 *  Slot size and location in the parent pointer:
 *   type  : slot location
 *   0b??1 : Root
 *   0b?00 : 16 bit values, type in 0-1, slot in 2-6
 *   0b010 : 32 bit values, type in 0-2, slot in 3-6
 *   0b110 : 64 bit values, type in 0-2, slot in 3-6
 */

/*
 * This metadata is used to optimize the gap updating code and in reverse
 * searching for gaps or any other code that needs to find the end of the data.
 */
/*
 * Two single-byte fields.  Embedded as ->meta in struct maple_range_64 and
 * struct maple_arange_64.
 */
struct maple_metadata {
        unsigned char end;        /* end of data */
        unsigned char gap;        /* offset of largest gap */
};

/*
 * Leaf nodes do not store pointers to nodes, they store user data.  Users may
 * store almost any bit pattern.  As noted above, the optimisation of storing an
 * entry at 0 in the root pointer cannot be done for data which have the bottom
 * two bits set to '10'.  We also reserve values with the bottom two bits set to
 * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use.  Some APIs
 * return errnos as a negative errno shifted right by two bits and the bottom
 * two bits set to '10', and while choosing to store these values in the array
 * is not an error, it may lead to confusion if you're testing for an error with
 * mas_is_err().
 *
 * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits
 * 3-6), bit 2 is reserved.  That leaves bits 0-1 unused for now.
 *
 * In regular B-Tree terms, pivots are called keys.  The term pivot is used to
 * indicate that the tree is specifying ranges.  Pivots may appear in the
 * subtree with an entry attached to the value whereas keys are unique to a
 * specific position of a B-tree.  Pivot values are inclusive of the slot with
 * the same index.
 */

/*
 * Range node layout: a parent pointer, MAPLE_RANGE64_SLOTS - 1 pivots and
 * MAPLE_RANGE64_SLOTS slots.  The anonymous union lets ->meta share storage
 * with the final slot: ->pad covers every slot except the last one.
 */
struct maple_range_64 {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_RANGE64_SLOTS - 1];
        union {
                void __rcu *slot[MAPLE_RANGE64_SLOTS];
                struct {
                        /* Placeholder for slot[0 .. MAPLE_RANGE64_SLOTS - 2] */
                        void __rcu *pad[MAPLE_RANGE64_SLOTS - 1];
                        /* Overlays the start of the last slot */
                        struct maple_metadata meta;
                };
        };
};

/*
 * At tree creation time, the user can specify that they're willing to trade off
 * storing fewer entries in a tree in return for storing more information in
 * each node.
 *
 * The maple tree supports recording the largest range of NULL entries available
 * in this node, also called gaps.  This optimises the tree for allocating a
 * range.
 */
/*
 * Allocation-range node layout.  Compared with struct maple_range_64 it has
 * fewer slots, adds one gap value per slot and keeps ->meta in its own field
 * rather than overlaying a slot.
 */
struct maple_arange_64 {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1];
        void __rcu *slot[MAPLE_ARANGE64_SLOTS];
        unsigned long gap[MAPLE_ARANGE64_SLOTS];        /* one gap entry per slot */
        struct maple_metadata meta;
};

/*
 * View of a node used to hold other allocated nodes (it is the ->alloc member
 * of the union in struct maple_node).
 */
struct maple_alloc {
        unsigned long total;                /* total number of nodes allocated */
        unsigned char node_count;        /* number of allocated nodes in this node */
        unsigned int request_count;        /* NOTE(review): presumably nodes still requested - confirm */
        struct maple_alloc *slot[MAPLE_ALLOC_SLOTS];        /* further allocated nodes */
};

/*
 * Two-pointer view of a node: ->parent followed by a ->next link stored where
 * the range layouts keep their first pivot.
 * NOTE(review): presumably used to chain nodes tracked by struct ma_topiary -
 * confirm against the users.
 */
struct maple_topiary {
        struct maple_pnode *parent;
        struct maple_enode *next; /* Overlaps the pivot */
};

/*
 * Node types.  The comments in this header state that the type of the node
 * pointed to is stored in bits 3-6 of a non-leaf entry (see
 * MAPLE_NODE_TYPE_SHIFT and MAPLE_NODE_TYPE_MASK).
 */
enum maple_type {
        maple_dense,                /* 0 */
        maple_leaf_64,                /* 1 */
        maple_range_64,                /* 2; node layout: struct maple_range_64 */
        maple_arange_64,        /* 3; node layout: struct maple_arange_64 */
};

/*
 * The kind of store needed for a write operation; recorded in
 * ma_state.store_type.  MA_STATE() starts every state at wr_invalid.
 * NOTE(review): the meaning of the individual values is defined by the write
 * path, which is not visible in this header.
 */
enum store_type {
        wr_invalid,
        wr_new_root,
        wr_store_root,
        wr_exact_fit,
        wr_spanning_store,
        wr_split_store,
        wr_rebalance,
        wr_append,
        wr_node_store,
        wr_slot_store,
};

/**
 * DOC: Maple tree flags
 *
 * * MT_FLAGS_ALLOC_RANGE        - Track gaps in this tree
 * * MT_FLAGS_USE_RCU                - Operate in RCU mode
 * * MT_FLAGS_HEIGHT_OFFSET        - The position of the tree height in the flags
 * * MT_FLAGS_HEIGHT_MASK        - The mask for the maple tree height value
 * * MT_FLAGS_LOCK_MASK                - How the mt_lock is used
 * * MT_FLAGS_LOCK_IRQ                - Acquired irq-safe
 * * MT_FLAGS_LOCK_BH                - Acquired bh-safe
 * * MT_FLAGS_LOCK_EXTERN        - mt_lock is not used
 *
 * MAPLE_HEIGHT_MAX        The largest height that can be stored
 */
#define MT_FLAGS_ALLOC_RANGE        0x01
#define MT_FLAGS_USE_RCU        0x02
/* A bit position (shift count), not a mask: the height mask starts at bit 2. */
#define MT_FLAGS_HEIGHT_OFFSET        0x02
#define MT_FLAGS_HEIGHT_MASK        0x7C
#define MT_FLAGS_LOCK_MASK        0x300
#define MT_FLAGS_LOCK_IRQ        0x100
#define MT_FLAGS_LOCK_BH        0x200
#define MT_FLAGS_LOCK_EXTERN        0x300
/*
 * NOTE(review): not described in the DOC list above; presumably records that
 * a cyclic allocation has wrapped - confirm and document.
 */
#define MT_FLAGS_ALLOC_WRAPPED        0x0800

/* 0x7C >> 2: the largest value that fits in the five height bits. */
#define MAPLE_HEIGHT_MAX        31


/* The node type is a 4-bit value (mask 0x0F) kept at bits 3-6 (shift 3). */
#define MAPLE_NODE_TYPE_MASK        0x0F
#define MAPLE_NODE_TYPE_SHIFT        0x03

/* Values below this with the bottom two bits '10' are reserved for internal use. */
#define MAPLE_RESERVED_RANGE        4096

/*
 * Lock-assertion helpers.
 *
 * With CONFIG_LOCKDEP:
 *  - mt_lock_is_held() / mt_write_lock_is_held() evaluate to true when no
 *    external lock map is recorded, otherwise they ask lockdep about it.
 *    NOTE(review): the 0 passed to lock_is_held_type() presumably selects
 *    "held for write" - confirm against the lockdep API.
 *  - mt_set_external_lock() records &lock->dep_map in the tree; @mt and @lock
 *    are both pointers.
 *  - mt_on_stack() clears the recorded map; note that it takes the tree
 *    itself (it uses '.'), not a pointer.
 *
 * Without CONFIG_LOCKDEP the checks are constant 1 and the setters expand to
 * empty statements.
 */
#ifdef CONFIG_LOCKDEP
#define mt_lock_is_held(mt)                                             \
        (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock))

#define mt_write_lock_is_held(mt)                                        \
        (!(mt)->ma_external_lock ||                                        \
         lock_is_held_type((mt)->ma_external_lock, 0))

#define mt_set_external_lock(mt, lock)                                        \
        (mt)->ma_external_lock = &(lock)->dep_map

#define mt_on_stack(mt)                        (mt).ma_external_lock = NULL
#else
#define mt_lock_is_held(mt)                1
#define mt_write_lock_is_held(mt)        1
#define mt_set_external_lock(mt, lock)        do { } while (0)
#define mt_on_stack(mt)                        do { } while (0)
#endif

/*
 * If the tree contains a single entry at index 0, it is usually stored in
 * tree->ma_root.  To optimise for the page cache, an entry which ends in '00',
 * '01' or '11' is stored in the root, but an entry which ends in '10' will be
 * stored in a node.  Bits 3-6 are used to store enum maple_type.
 *
 * The flags are used both to store some immutable information about this tree
 * (set at tree creation time) and dynamic information set under the spinlock.
 *
 * Another use of flags is to indicate global states of the tree.  This is the
 * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in
 * RCU mode.  This mode was added to allow the tree to reuse nodes instead of
 * re-allocating and RCU freeing nodes when there is a single user.
 */
struct maple_tree {
        /*
         * Either the tree's own spinlock or, with CONFIG_LOCKDEP, a pointer to
         * the lockdep map of an external lock.  The two share storage.
         */
        union {
                spinlock_t                ma_lock;
#ifdef CONFIG_LOCKDEP
                struct lockdep_map        *ma_external_lock;
#endif
        };
        unsigned int        ma_flags;        /* MT_FLAGS_* bits */
        void __rcu      *ma_root;        /* NULL when the tree is empty, see mtree_empty() */
};

/**
 * MTREE_INIT() - Initialize a maple tree
 * @name: The maple tree name
 * @__flags: The maple tree flags
 *
 * Expands to a brace-enclosed initializer for a struct maple_tree: the
 * internal spinlock starts unlocked, ->ma_flags is set to @__flags and the
 * root pointer starts out NULL.  @__flags is parenthesised, as it is in
 * MTREE_INIT_EXT().
 */
#define MTREE_INIT(name, __flags) {                                        \
        .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock),                \
        .ma_flags = (__flags),                                                \
        .ma_root = NULL,                                                \
}

/**
 * MTREE_INIT_EXT() - Initialize a maple tree with an external lock.
 * @name: The tree name
 * @__flags: The maple tree flags
 * @__lock: The external lock
 *
 * With CONFIG_LOCKDEP the initializer records the address of @__lock's
 * dep_map in ->ma_external_lock (which shares storage with ->ma_lock).
 * Without CONFIG_LOCKDEP @__lock is ignored and this is MTREE_INIT().
 */
#ifdef CONFIG_LOCKDEP
#define MTREE_INIT_EXT(name, __flags, __lock) {                                \
        .ma_external_lock = &(__lock).dep_map,                                \
        .ma_flags = (__flags),                                                \
        .ma_root = NULL,                                                \
}
#else
#define MTREE_INIT_EXT(name, __flags, __lock)        MTREE_INIT(name, __flags)
#endif

/* Define a struct maple_tree called @name, initialised by MTREE_INIT() with no flags set. */
#define DEFINE_MTREE(name)                                                \
        struct maple_tree name = MTREE_INIT(name, 0)

/*
 * Take and release the tree's internal spinlock (->ma_lock).  @mt is a
 * pointer to the struct maple_tree; @subclass is handed straight to
 * spin_lock_nested().
 *
 * mtree_lock_nested() names its first parameter @mt, matching the body.  The
 * previous spelling (@mas) left 'mt' unbound in the expansion, so the macro
 * only compiled when the caller happened to have a variable called 'mt'.
 */
#define mtree_lock(mt)                spin_lock((&(mt)->ma_lock))
#define mtree_lock_nested(mt, subclass) \
                spin_lock_nested((&(mt)->ma_lock), subclass)
#define mtree_unlock(mt)        spin_unlock((&(mt)->ma_lock))

/*
 * The Maple Tree squeezes various bits in at various points which aren't
 * necessarily obvious.  Usually, this is done by observing that pointers are
 * N-byte aligned and thus the bottom log_2(N) bits are available for use.  We
 * don't use the high bits of pointers to store additional information because
 * we don't know what bits are unused on any given architecture.
 *
 * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8
 * low bits for our own purposes.  Nodes are currently of 4 types:
 * 1. Single pointer (Range is 0-0)
 * 2. Non-leaf Allocation Range nodes
 * 3. Non-leaf Range nodes
 * 4. Leaf Range nodes
 *
 * All nodes consist of a number of node slots, pivots, and a parent pointer.
 */

/*
 * A node is a union of several views of the same storage; which view is
 * valid depends on how the node is currently being used.
 */
struct maple_node {
        union {
                /* Generic view: parent pointer followed by the slot array */
                struct {
                        struct maple_pnode *parent;
                        void __rcu *slot[MAPLE_NODE_SLOTS];
                };
                /*
                 * Bookkeeping view: ->pad lines up with ->parent, and the
                 * remaining fields (starting with the RCU head) reuse the
                 * storage of the slot array.
                 */
                struct {
                        void *pad;
                        struct rcu_head rcu;
                        struct maple_enode *piv_parent;
                        unsigned char parent_slot;
                        enum maple_type type;
                        unsigned char slot_len;
                        unsigned int ma_flags;
                };
                struct maple_range_64 mr64;        /* range node layout */
                struct maple_arange_64 ma64;        /* allocation-range node layout */
                struct maple_alloc alloc;        /* holder of other allocated nodes */
        };
};

/*
 * More complicated stores can cause two nodes to become one or three and
 * potentially alter the height of the tree.  Either half of the tree may need
 * to be rebalanced against the other.  The ma_topiary struct is used to track
 * which nodes have been 'cut' from the tree so that the change can be done
 * safely at a later date.  This is done to support RCU.
 */
/* Initialise with MA_TOPIARY(), which sets ->head and ->tail to NULL. */
struct ma_topiary {
        struct maple_enode *head;        /* NOTE(review): presumably first cut node - confirm */
        struct maple_enode *tail;        /* NOTE(review): presumably last cut node - confirm */
        struct maple_tree *mtree;        /* the tree passed to MA_TOPIARY() */
};

/*
 * Entry points that operate directly on a struct maple_tree (no ma_state is
 * passed in).  Only the prototypes are visible in this header.
 */
void *mtree_load(struct maple_tree *mt, unsigned long index);

int mtree_insert(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp);
int mtree_insert_range(struct maple_tree *mt, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp);
int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp);
int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp);
int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp);

int mtree_store_range(struct maple_tree *mt, unsigned long first,
                      unsigned long last, void *entry, gfp_t gfp);
int mtree_store(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp);
void *mtree_erase(struct maple_tree *mt, unsigned long index);

/* NOTE(review): @mt is presumably the source and @new the destination - confirm. */
int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);

void mtree_destroy(struct maple_tree *mt);
void __mt_destroy(struct maple_tree *mt);

/**
 * mtree_empty() - Check whether a maple tree holds no entries.
 * @mt: Maple Tree.
 *
 * Only the root pointer of @mt is inspected.
 *
 * Context: Any context.
 * Return: %true if the tree contains only NULL pointers.
 */
static inline bool mtree_empty(const struct maple_tree *mt)
{
        return !mt->ma_root;
}

/* Advanced API */

/*
 * Maple State Status
 * ma_active means the maple state is pointing to a node and offset and can
 * continue operating on the tree.
 * ma_start means we have not searched the tree.
 * ma_root means we have searched the tree and the entry we found lives in
 * the root of the tree (ie it has index 0, length 1 and is the only entry in
 * the tree).
 * ma_none means we have searched the tree and there is no node in the
 * tree for this entry.  For example, we searched for index 1 in an empty
 * tree.  Or we have a tree which points to a full leaf node and we
 * searched for an entry which is larger than can be contained in that
 * leaf node.
 * ma_pause means the data within the maple state may be stale, restart the
 * operation
 * ma_overflow means the search has reached the upper limit of the search
 * ma_underflow means the search has reached the lower limit of the search
 * ma_error means there was an error, check the node for the error number.
 */
enum maple_status {
        ma_active,        /* pointing at a node and offset; can continue operating */
        ma_start,        /* the tree has not been searched yet */
        ma_root,        /* the entry found lives in the root of the tree */
        ma_none,        /* searched; no node in the tree holds this entry */
        ma_pause,        /* state may be stale; restart the operation */
        ma_overflow,        /* search reached its upper limit */
        ma_underflow,        /* search reached its lower limit */
        ma_error,        /* an error occurred; the node holds the error number */
};

/*
 * The maple state is defined in the struct ma_state and is used to keep track
 * of information during operations, and even between operations when using the
 * advanced API.
 *
 * If state->node has bit 0 set then it references a tree location which is not
 * a node (eg the root).  If bit 1 is set, the rest of the bits are a negative
 * errno.  Bit 2 (the 'unallocated slots' bit) is clear.  Bits 3-6 indicate the
 * node type.
 *
 * state->alloc either has a request number of nodes or an allocated node.  If
 * state->alloc has a requested number of nodes, the first bit will be set (0x1)
 * and the remaining bits are the value.  If state->alloc is a node, then the
 * node will be of type maple_alloc.  maple_alloc has MAPLE_NODE_SLOTS - 1 for
 * storing more allocated nodes, a total number of nodes allocated, and the
 * node_count in this node.  node_count is the number of allocated nodes in this
 * node.  The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further
 * nodes into state->alloc->slot[0]'s node.  Nodes are taken from state->alloc
 * by removing a node from the state->alloc node until state->alloc->node_count
 * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted
 * to state->alloc.  Nodes are pushed onto state->alloc by putting the current
 * state->alloc into the pushed node's slot[0].
 *
 * The state also contains the implied min/max of the state->node, the depth of
 * this search, and the offset. The implied min/max are either from the parent
 * node or are 0-oo for the root node.  The depth is incremented or decremented
 * every time a node is walked down or up.  The offset is the slot/pivot of
 * interest in the node - either for reading or writing.
 *
 * When returning a value the maple state index and last respectively contain
 * the start and end of the range for the entry.  Ranges are inclusive in the
 * Maple Tree.
 *
 * The status of the state is used to determine how the next action should treat
 * the state.  For instance, if the status is ma_start then the next action
 * should start at the root of the tree and walk down.  If the status is
 * ma_pause then the node may be stale data and should be discarded.  If the
 * status is ma_overflow, then the last action hit the upper limit.
 *
 */
struct ma_state {
        struct maple_tree *tree;        /* The tree we're operating in */
        unsigned long index;                /* The index we're operating on - range start */
        unsigned long last;                /* The last index we're operating on - range end */
        struct maple_enode *node;        /* The node containing this entry */
        unsigned long min;                /* The minimum index of this node - implied pivot min */
        unsigned long max;                /* The maximum index of this node - implied pivot max */
        /*
         * NOTE(review): the comment preceding this struct describes ->alloc as
         * either a tagged request count or a chain of maple_alloc nodes, while
         * the fields below keep the request in ->node_request and the nodes in
         * ->sheaf / ->alloc.  Confirm that the comment is still current.
         */
        struct slab_sheaf *sheaf;        /* Allocated nodes for this operation */
        struct maple_node *alloc;        /* A single allocated node for fast path writes */
        unsigned long node_request;        /* The number of nodes to allocate for this operation */
        enum maple_status status;        /* The status of the state (active, start, none, etc) */
        unsigned char depth;                /* depth of tree descent during write */
        unsigned char offset;                /* The slot/pivot of interest in the node */
        unsigned char mas_flags;        /* NOTE(review): use not visible in this header; MA_STATE() sets it to 0 */
        unsigned char end;                /* The end of the node */
        enum store_type store_type;        /* The type of store needed for this operation */
};

/*
 * State for a write: wraps the caller's struct ma_state and carries the entry
 * being written together with decoded details of the node being modified.
 * Set up with MA_WR_STATE().
 */
struct ma_wr_state {
        struct ma_state *mas;                /* The maple state this write operates on */
        struct maple_node *node;        /* Decoded mas->node */
        unsigned long r_min;                /* range min */
        unsigned long r_max;                /* range max */
        enum maple_type type;                /* mas->node type */
        unsigned char offset_end;        /* The offset where the write ends */
        unsigned long *pivots;                /* mas->node->pivots pointer */
        unsigned long end_piv;                /* The pivot at the offset end */
        void __rcu **slots;                /* mas->node->slots pointer */
        void *entry;                        /* The entry to write */
        void *content;                        /* The existing entry that is being overwritten */
        unsigned char vacant_height;        /* Height of lowest node with free space */
        unsigned char sufficient_height;/* Height of lowest node with min sufficiency + 1 nodes */
};

/*
 * Take and release the internal spinlock of the tree that @mas points at
 * (mas->tree->ma_lock).  @mas is a pointer to a struct ma_state.
 */
#define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
#define mas_lock_nested(mas, subclass) \
                spin_lock_nested(&((mas)->tree->ma_lock), subclass)
#define mas_unlock(mas)         spin_unlock(&((mas)->tree->ma_lock))

/*
 * Special values for ma_state.node.
 * MA_ERROR represents an errno.  After dropping the lock and attempting
 * to resolve the error, the walk would have to be restarted from the
 * top of the tree as the tree may have been modified.
 *
 * The value is @err shifted left by two bits with the bottom two bits set to
 * '10'.  @err is parenthesised so that the cast applies to the whole argument
 * even when a compound expression (for example a conditional) is passed.
 */
#define MA_ERROR(err) \
                ((struct maple_enode *)(((unsigned long)(err) << 2) | 2UL))

/*
 * MA_STATE() - Declare and initialise a struct ma_state.
 * @name:  name of the variable to declare
 * @mt:    pointer to the tree to operate on
 * @first: start of the range (->index)
 * @end:   end of the range (->last)
 *
 * The state starts in ma_start with no node, implied limits of 0..ULONG_MAX,
 * no allocations and store type wr_invalid.  Members not listed in the
 * initializer (->depth, ->offset, ->end) are zero-initialised by the compiler.
 *
 * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs
 */
#define MA_STATE(name, mt, first, end)                                        \
        struct ma_state name = {                                        \
                .tree = mt,                                                \
                .index = first,                                                \
                .last = end,                                                \
                .node = NULL,                                                \
                .status = ma_start,                                        \
                .min = 0,                                                \
                .max = ULONG_MAX,                                        \
                .sheaf = NULL,                                                \
                .alloc = NULL,                                                \
                .node_request = 0,                                        \
                .mas_flags = 0,                                                \
                .store_type = wr_invalid,                                \
        }

/*
 * MA_WR_STATE() - Declare and initialise a struct ma_wr_state.
 * @name:     name of the variable to declare
 * @ma_state: pointer to the struct ma_state this write uses
 * @wr_entry: the entry to write
 *
 * ->content starts as NULL and both height fields start at 0; members not
 * listed are zero-initialised by the compiler.
 */
#define MA_WR_STATE(name, ma_state, wr_entry)                                \
        struct ma_wr_state name = {                                        \
                .mas = ma_state,                                        \
                .content = NULL,                                        \
                .entry = wr_entry,                                        \
                .vacant_height = 0,                                        \
                .sufficient_height = 0                                        \
        }

/*
 * MA_TOPIARY() - Declare a struct ma_topiary called @name for @tree, with
 * ->head and ->tail both NULL.
 */
#define MA_TOPIARY(name, tree)                                                \
        struct ma_topiary name = {                                        \
                .head = NULL,                                                \
                .tail = NULL,                                                \
                .mtree = tree,                                                \
        }

/*
 * Advanced API entry points.  Each takes the caller's struct ma_state; only
 * the prototypes are visible in this header.
 *
 * The reverse-direction helpers (mas_find_rev(), mas_find_range_rev(),
 * mas_prev(), mas_prev_range()) name their limit @min; the forward helpers
 * name theirs @max.
 */
void *mas_walk(struct ma_state *mas);
void *mas_store(struct ma_state *mas, void *entry);
void *mas_erase(struct ma_state *mas);
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
void mas_store_prealloc(struct ma_state *mas, void *entry);
void *mas_find(struct ma_state *mas, unsigned long max);
void *mas_find_range(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
void *mas_find_range_rev(struct ma_state *mas, unsigned long min);
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp);

bool mas_nomem(struct ma_state *mas, gfp_t gfp);
void mas_pause(struct ma_state *mas);
void maple_tree_init(void);
void mas_destroy(struct ma_state *mas);
int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);

void *mas_prev(struct ma_state *mas, unsigned long min);
void *mas_prev_range(struct ma_state *mas, unsigned long min);
void *mas_next(struct ma_state *mas, unsigned long max);
void *mas_next_range(struct ma_state *mas, unsigned long max);

int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
                   unsigned long size);
/*
 * This finds an empty area from the highest address to the lowest.
 * AKA "Topdown" version.
 */
int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
                       unsigned long max, unsigned long size);

/*
 * mas_init() - Set up a maple state for a single index.
 * @mas:  the maple state to initialise
 * @tree: the tree the state will operate on
 * @addr: the index; stored as both the start and the end of the range
 *
 * The whole state is cleared first, so every member not assigned below is
 * zero.  The state is left in ma_start with an upper limit of ULONG_MAX.
 */
static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
                            unsigned long addr)
{
        memset(mas, 0, sizeof(*mas));

        mas->node = NULL;
        mas->status = ma_start;
        mas->tree = tree;
        mas->index = addr;
        mas->last = addr;
        mas->max = ULONG_MAX;
}

static inline bool mas_is_active(struct ma_state *mas)
{
        return mas->status == ma_active;
}

/*
 * mas_is_err() - Test whether the maple state status is ma_error.
 * @mas: Maple Tree operation state.
 *
 * Return: true if @mas->status is ma_error, false otherwise.
 */
static inline bool mas_is_err(struct ma_state *mas)
{
        if (mas->status != ma_error)
                return false;

        return true;
}

/**
 * mas_reset() - Return a Maple Tree operation state to its starting point.
 * @mas: Maple Tree operation state.
 *
 * Clears the node and puts the status back to ma_start, discarding any error
 * or walk position, so the next walk begins at the root.  Call this before
 * reusing a ma_state after the lock has been dropped.
 *
 * Context: Any context.
 */
static __always_inline void mas_reset(struct ma_state *mas)
{
        mas->node = NULL;
        mas->status = ma_start;
}

/**
 * mas_for_each() - Iterate over a range of the maple tree.
 * @__mas: Maple Tree operation state (maple_state)
 * @__entry: Entry retrieved from the tree
 * @__max: maximum index to retrieve from the tree
 *
 * Expands to a while loop that assigns the result of mas_find() to @__entry
 * on every pass and stops at the first NULL result.
 *
 * When returned, mas->index and mas->last will hold the entire range for the
 * entry.
 *
 * Note: may return the zero entry.
 */
#define mas_for_each(__mas, __entry, __max) \
        while (((__entry) = mas_find((__mas), (__max))) != NULL)

/**
 * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order.
 * @__mas: Maple Tree operation state (maple_state)
 * @__entry: Entry retrieved from the tree
 * @__min: minimum index to retrieve from the tree
 *
 * Expands to a while loop that assigns the result of mas_find_rev() to
 * @__entry on every pass and stops at the first NULL result.
 *
 * When returned, mas->index and mas->last will hold the entire range for the
 * entry.
 *
 * Note: may return the zero entry.
 */
#define mas_for_each_rev(__mas, __entry, __min) \
        while (((__entry) = mas_find_rev((__mas), (__min))) != NULL)

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Output format selector passed to mt_dump().
 * NOTE(review): presumably decimal vs hexadecimal number formatting - confirm.
 */
enum mt_dump_format {
        mt_dump_dec,
        mt_dump_hex,
};

/*
 * Counters maintained by the debug *_BUG_ON() / *_WARN_ON() macros in this
 * header: every check increments _run, every check whose condition is false
 * increments _passed.
 */
extern atomic_t maple_tree_tests_run;
extern atomic_t maple_tree_tests_passed;

/* Debug helpers; only the prototypes are visible in this header. */
void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
void mas_dump(const struct ma_state *mas);
void mas_wr_dump(const struct ma_wr_state *wr_mas);
void mt_validate(struct maple_tree *mt);
void mt_cache_shrink(void);
/*
 * MT_BUG_ON() - Debug check on a tree.
 * @__tree: tree to dump when the check fails
 * @__x:    condition; true means failure
 *
 * Counts the check in maple_tree_tests_run.  On failure it logs the location,
 * dumps the tree in hex, logs the pass/run counters and prints a stack trace;
 * otherwise it counts a pass.  Nothing in this expansion halts execution.
 *
 * NOTE: @__x is expanded twice (once for the test, once for the log message),
 * so an argument with side effects is evaluated again on the failure path.
 */
#define MT_BUG_ON(__tree, __x) do {                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mt_dump(__tree, mt_dump_hex);                                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MAS_BUG_ON() - Debug check on a maple state.
 * @__mas: maple state to dump (along with its tree) when the check fails
 * @__x:   condition; true means failure
 *
 * Counts the check in maple_tree_tests_run.  On failure it logs the location,
 * dumps the state and its tree, logs the pass/run counters and prints a stack
 * trace; otherwise it counts a pass.  Nothing in this expansion halts
 * execution.
 *
 * NOTE: @__x is expanded twice (once for the test, once for the log message),
 * so an argument with side effects is evaluated again on the failure path.
 */
#define MAS_BUG_ON(__mas, __x) do {                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_dump(__mas);                                        \
                mt_dump((__mas)->tree, mt_dump_hex);                        \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MAS_WR_BUG_ON() - Debug check on a maple write state.
 * @__wrmas: write state to dump (along with its maple state and tree) when
 *           the check fails
 * @__x:     condition; true means failure
 *
 * Counts the check in maple_tree_tests_run.  On failure it logs the location,
 * dumps the write state, its maple state and the tree, logs the pass/run
 * counters and prints a stack trace; otherwise it counts a pass.  Nothing in
 * this expansion halts execution.
 *
 * NOTE: @__x is expanded twice (once for the test, once for the log message),
 * so an argument with side effects is evaluated again on the failure path.
 */
#define MAS_WR_BUG_ON(__wrmas, __x) do {                                \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_wr_dump(__wrmas);                                        \
                mas_dump((__wrmas)->mas);                                \
                mt_dump((__wrmas)->mas->tree, mt_dump_hex);                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MT_WARN_ON() - Debug warning check on a tree.
 * @__tree: tree to dump when the check fails
 * @__x:    condition; true means failure
 *
 * Counts the check in maple_tree_tests_run.  On failure it logs the location,
 * dumps the tree in hex, logs the pass/run counters and prints a stack trace;
 * otherwise it counts a pass.  Evaluates to the truth value of @__x (0 or 1).
 *
 * The local result is named __mt_warn_ret rather than 'ret' so that a caller
 * expression mentioning its own 'ret' is not captured by the declaration
 * (which would read the uninitialised local instead).
 *
 * NOTE: @__x is expanded a second time in the log message, so an argument
 * with side effects is evaluated again on the failure path.
 */
#define MT_WARN_ON(__tree, __x)  ({                                        \
        int __mt_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mt_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mt_dump(__tree, mt_dump_hex);                                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mt_warn_ret);                                        \
})

/*
 * MAS_WARN_ON() - Debug warning check on a maple state.
 * @__mas: maple state to dump (along with its tree) when the check fails
 * @__x:   condition; true means failure
 *
 * Counts the check in maple_tree_tests_run.  On failure it logs the location,
 * dumps the state and its tree, logs the pass/run counters and prints a stack
 * trace; otherwise it counts a pass.  Evaluates to the truth value of @__x
 * (0 or 1).
 *
 * The local result is named __mas_warn_ret rather than 'ret' so that a caller
 * expression mentioning its own 'ret' is not captured by the declaration
 * (which would read the uninitialised local instead).
 *
 * NOTE: @__x is expanded a second time in the log message, so an argument
 * with side effects is evaluated again on the failure path.
 */
#define MAS_WARN_ON(__mas, __x) ({                                        \
        int __mas_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mas_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_dump(__mas);                                        \
                mt_dump((__mas)->tree, mt_dump_hex);                        \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mas_warn_ret);                                        \
})

/*
 * MAS_WR_WARN_ON() - Debug-build warning check tied to a maple write state.
 * @__wrmas: Maple Tree write state that is dumped when the check fires.
 * @__x: Condition to test; a non-zero value counts as a failure.
 *
 * Increments maple_tree_tests_run on every evaluation.  When the condition
 * holds, the location, the write state, its maple state, the tree and the
 * pass/run counters are printed, followed by a stack dump; otherwise
 * maple_tree_tests_passed is incremented.  Evaluates to the normalised (0/1)
 * condition.
 *
 * The local uses a reserved-style name so that a condition mentioning a
 * caller variable called "ret" is not captured by the macro.  @__x is
 * expanded a second time for the message when the check fires, so it must
 * be free of side effects.
 */
#define MAS_WR_WARN_ON(__wrmas, __x) ({                                        \
        int __mas_wr_warn_ret = !!(__x);                                \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mas_wr_warn_ret) {                                        \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, (__x));                                \
                mas_wr_dump(__wrmas);                                        \
                mas_dump((__wrmas)->mas);                                \
                mt_dump((__wrmas)->mas->tree, mt_dump_hex);                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mas_wr_warn_ret);                                        \
})
#else
/*
 * Fallbacks for the #else branch of the CONFIG_DEBUG_MAPLE_TREE conditional:
 * each check maps directly onto BUG_ON()/WARN_ON().  The tree / maple state
 * argument is ignored, so nothing is dumped and no test counters are touched.
 */
#define MT_BUG_ON(__tree, __x)                BUG_ON(__x)
#define MAS_BUG_ON(__mas, __x)                BUG_ON(__x)
#define MAS_WR_BUG_ON(__mas, __x)        BUG_ON(__x)
#define MT_WARN_ON(__tree, __x)                WARN_ON(__x)
#define MAS_WARN_ON(__mas, __x)                WARN_ON(__x)
#define MAS_WR_WARN_ON(__mas, __x)        WARN_ON(__x)
#endif /* CONFIG_DEBUG_MAPLE_TREE */

/**
 * __mas_set_range() - Narrow the Maple Tree operation state to a sub-range
 * of the location it currently refers to.
 * @mas: Maple Tree operation state.
 * @start: New start of range in the Maple Tree.
 * @last: New end of range in the Maple Tree.
 *
 * Only the index and last fields of the state are rewritten; the state is
 * not reset.  If the state is active and @start lies outside its current
 * [index, last] span, a warning is raised before the fields are updated.
 * Use mas_set_range() instead if you do not know where you are in the tree.
 */
static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
                                   unsigned long last)
{
        /* An active state must already cover @start in its current slot. */
        MAS_WARN_ON(mas, mas_is_active(mas) &&
                   (mas->index > start || mas->last < start));

        mas->last = last;
        mas->index = start;
}

/**
 * mas_set_range() - Point the Maple Tree operation state at a different range.
 * @mas: Maple Tree operation state.
 * @start: New start of range in the Maple Tree.
 * @last: New end of range in the Maple Tree.
 *
 * The state is reset first and then given the new [@start, @last] range, so
 * the next operation starts a walk from the top of the tree.  See mas_next()
 * for moving to an adjacent index instead.
 */
static inline void mas_set_range(struct ma_state *mas, unsigned long start,
                                 unsigned long last)
{
        /* Drop the current position before installing the new range. */
        mas_reset(mas);
        __mas_set_range(mas, start, last);
}

/**
 * mas_set() - Point the Maple Tree operation state at a different index.
 * @mas: Maple Tree operation state.
 * @index: New index into the Maple Tree.
 *
 * Equivalent to mas_set_range() with a range that starts and ends at @index:
 * the state is reset, so the next operation starts a walk from the top of
 * the tree.  See mas_next() for moving to an adjacent index instead.
 */
static inline void mas_set(struct ma_state *mas, unsigned long index)
{
        /* A single index is the degenerate range [index, index]. */
        mas_set_range(mas, index, index);
}

/* Return true when the lock-type bits of @mt's flags select MT_FLAGS_LOCK_EXTERN. */
static inline bool mt_external_lock(const struct maple_tree *mt)
{
        const unsigned int lock_type = mt->ma_flags & MT_FLAGS_LOCK_MASK;

        return lock_type == MT_FLAGS_LOCK_EXTERN;
}

/**
 * mt_init_flags() - Initialise an empty maple tree with flags.
 * @mt: Maple Tree
 * @flags: maple tree flags.
 *
 * If you need to initialise a Maple Tree with special flags (eg, an
 * allocation tree), use this function.
 *
 * Context: Any context.
 */
static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags)
{
        mt->ma_flags = flags;
        if (!mt_external_lock(mt))
                spin_lock_init(&mt->ma_lock);
        rcu_assign_pointer(mt->ma_root, NULL);
}

/**
 * mt_init() - Initialise an empty maple tree.
 * @mt: Maple Tree
 *
 * Shorthand for mt_init_flags() with no flags set.
 *
 * Context: Any context.
 */
static inline void mt_init(struct maple_tree *mt)
{
        const unsigned int no_flags = 0;

        mt_init_flags(mt, no_flags);
}

/*
 * Return true when @mt has MT_FLAGS_USE_RCU set.  Always false when the
 * kernel is built with CONFIG_MAPLE_RCU_DISABLED.
 */
static inline bool mt_in_rcu(struct maple_tree *mt)
{
#ifdef CONFIG_MAPLE_RCU_DISABLED
        return false;
#else
        return !!(mt->ma_flags & MT_FLAGS_USE_RCU);
#endif
}

/**
 * mt_clear_in_rcu() - Switch the tree to non-RCU mode.
 * @mt: The Maple Tree
 *
 * Does nothing if the tree is not in RCU mode.  For an internally locked
 * tree the flag is cleared under the tree lock; for an externally locked
 * tree the caller is expected to hold the lock, which is checked with a
 * warning.
 */
static inline void mt_clear_in_rcu(struct maple_tree *mt)
{
        if (!mt_in_rcu(mt))
                return;

        if (!mt_external_lock(mt)) {
                /* Internal lock: take it around the flag update. */
                mtree_lock(mt);
                mt->ma_flags &= ~MT_FLAGS_USE_RCU;
                mtree_unlock(mt);
                return;
        }

        /* External lock: the caller must already hold it. */
        WARN_ON(!mt_lock_is_held(mt));
        mt->ma_flags &= ~MT_FLAGS_USE_RCU;
}

/**
 * mt_set_in_rcu() - Switch the tree to RCU safe mode.
 * @mt: The Maple Tree
 *
 * Does nothing if the tree is already in RCU mode.  For an internally locked
 * tree the flag is set under the tree lock; for an externally locked tree
 * the caller is expected to hold the lock, which is checked with a warning.
 */
static inline void mt_set_in_rcu(struct maple_tree *mt)
{
        if (mt_in_rcu(mt))
                return;

        if (!mt_external_lock(mt)) {
                /* Internal lock: take it around the flag update. */
                mtree_lock(mt);
                mt->ma_flags |= MT_FLAGS_USE_RCU;
                mtree_unlock(mt);
                return;
        }

        /* External lock: the caller must already hold it. */
        WARN_ON(!mt_lock_is_held(mt));
        mt->ma_flags |= MT_FLAGS_USE_RCU;
}

/* Extract the height field that is packed into @mt's flags word. */
static inline unsigned int mt_height(const struct maple_tree *mt)
{
        const unsigned int height_bits = mt->ma_flags & MT_FLAGS_HEIGHT_MASK;

        return height_bits >> MT_FLAGS_HEIGHT_OFFSET;
}

/*
 * Out-of-line lookup helpers (implemented outside this header).  mt_find()
 * and mt_find_after() take the index by pointer and are the building blocks
 * of the mt_for_each() iterator below.
 */
void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max);
void *mt_find_after(struct maple_tree *mt, unsigned long *index,
                    unsigned long max);
void *mt_prev(struct maple_tree *mt, unsigned long index,  unsigned long min);
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);

/**
 * mt_for_each - Iterate over each entry starting at index until max.
 * @__tree: The Maple Tree
 * @__entry: The current entry
 * @__index: The index to start the search from. Subsequently used as iterator.
 * @__max: The maximum limit for @__index
 *
 * The first entry comes from mt_find(), every following one from
 * mt_find_after(); the loop ends as soon as a lookup yields NULL.
 *
 * This iterator skips all entries which resolve to a NULL pointer,
 * e.g. entries which have been reserved with XA_ZERO_ENTRY.
 */
#define mt_for_each(__tree, __entry, __index, __max)                        \
        for (__entry = mt_find(__tree, &(__index), __max);                \
             __entry;                                                        \
             __entry = mt_find_after(__tree, &(__index), __max))

#endif /*_LINUX_MAPLE_TREE_H */




































































































   67 
   68 





















































































































































































































































































































































































































    8 





    8 






    8 




























    8 

    8 
    8 





























    8 






































































































   69 



   69 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
 *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
 *   Copyright (C) 2009 Red Hat, Inc.
 *
 * Creation is done via kthreadd, so that we get a clean environment
 * even if we're invoked from userspace (think modprobe, hotplug cpu,
 * etc.).
 */
#include <uapi/linux/sched/types.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>


/*
 * Pending creation requests.  __kthread_create_on_node() appends to
 * kthread_create_list under kthread_create_lock and then wakes kthreadd_task
 * (presumably kthreadd drains the list; its loop is outside this view).
 */
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;

/*
 * kthreads linked in by kthread_affine_node() and unlinked in kthread_exit();
 * list modifications happen under kthreads_hotplug_lock.
 */
static LIST_HEAD(kthreads_hotplug);
static DEFINE_MUTEX(kthreads_hotplug_lock);

/*
 * One kthread creation request.  Allocated by __kthread_create_on_node() and
 * freed by whichever side loses the xchg() on @done: normally the creator,
 * or kthreadd / the new thread if the creator was killed while waiting.
 */
struct kthread_create_info
{
        /* Information passed to kthread() from kthreadd. */
        char *full_name;        /* kvasprintf()'d name; handed over to struct kthread */
        int (*threadfn)(void *data);        /* function the new thread will run */
        void *data;                /* argument passed to threadfn */
        int node;                /* copied to current->pref_node_fork under CONFIG_NUMA */

        /* Result passed back to kthread_create() from kthreadd. */
        struct task_struct *result;        /* new task, or ERR_PTR() on failure */
        struct completion *done;        /* creator's on-stack completion; NULL once claimed */

        struct list_head list;        /* link on kthread_create_list */
};

/*
 * Per-kthread state, allocated by set_kthread_struct() and reachable through
 * task_struct::worker_private for tasks with PF_KTHREAD set.
 */
struct kthread {
        unsigned long flags;        /* bit numbers come from enum KTHREAD_BITS */
        unsigned int cpu;        /* NOTE(review): not used in the visible code; presumably the CPU of a per-CPU kthread */
        unsigned int node;        /* value of tsk_fork_get_node() at allocation time */
        int started;                /* set to 1 in kthread() after the first wakeup */
        int result;                /* value stored by kthread_exit() */
        int (*threadfn)(void *);        /* thread function, see kthread_func() */
        void *data;                /* thread function argument, see kthread_data() */
        struct completion parked;        /* completed by __kthread_parkme() */
        struct completion exited;        /* task_struct::vfork_done points here */
#ifdef CONFIG_BLK_CGROUP
        struct cgroup_subsys_state *blkcg_css;        /* expected to be NULL when the struct is freed */
#endif
        /* To store the full name if task comm is truncated. */
        char *full_name;
        struct task_struct *task;        /* owning task */
        struct list_head hotplug_node;        /* link on kthreads_hotplug */
        struct cpumask *preferred_affinity;        /* optional mask; kfree()d in kthread_exit() */
};

/* Bit numbers within kthread->flags, used with test_bit() and BIT(). */
enum KTHREAD_BITS {
        KTHREAD_IS_PER_CPU = 0,        /* NOTE(review): not referenced in the visible code; presumably marks per-CPU kthreads */
        KTHREAD_SHOULD_STOP,        /* tested by kthread_should_stop() */
        KTHREAD_SHOULD_PARK,        /* tested by kthread_should_park() and __kthread_parkme() */
};

/*
 * Return the struct kthread stored in @k->worker_private.  Warns when @k does
 * not have PF_KTHREAD set, but returns the pointer regardless.
 */
static inline struct kthread *to_kthread(struct task_struct *k)
{
        struct kthread *kthread = k->worker_private;

        WARN_ON(!(k->flags & PF_KTHREAD));
        return kthread;
}

/*
 * Variant of to_kthread() that doesn't assume @p is a kthread.
 *
 * Once "(p->flags & PF_KTHREAD)" is set the task is a kthread and stays one,
 * and its p->worker_private always points to a struct kthread.  Tasks that
 * are not kthreads use p->worker_private for other things, so the pointer
 * must not be interpreted for them.
 *
 * Return NULL for any task that is not a kthread.
 */
static inline struct kthread *__to_kthread(struct task_struct *p)
{
        void *priv = p->worker_private;

        /* A non-NULL pointer only counts when the task really is a kthread. */
        return (priv && !(p->flags & PF_KTHREAD)) ? NULL : priv;
}

/*
 * Copy the name of kthread @tsk into @buf (at most @buf_size bytes).  The
 * stored full name is used when there is one; otherwise tsk->comm is copied.
 */
void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
        struct kthread *kthread = to_kthread(tsk);

        if (kthread && kthread->full_name) {
                strscpy_pad(buf, kthread->full_name, buf_size);
                return;
        }

        /* No struct kthread or no full name recorded: fall back to comm. */
        strscpy(buf, tsk->comm, buf_size);
}

/*
 * Allocate and attach a zeroed struct kthread to @p.
 *
 * Returns false (with a one-time warning) if @p already has one, false if
 * the allocation fails, and true once @p->worker_private points at the new
 * structure.  @p->vfork_done is pointed at the structure's "exited"
 * completion.
 */
bool set_kthread_struct(struct task_struct *p)
{
        struct kthread *kt;

        if (WARN_ON_ONCE(to_kthread(p)))
                return false;

        kt = kzalloc(sizeof(*kt), GFP_KERNEL);
        if (!kt)
                return false;

        kt->task = p;
        kt->node = tsk_fork_get_node(current);
        INIT_LIST_HEAD(&kt->hotplug_node);
        init_completion(&kt->parked);
        init_completion(&kt->exited);

        /* Hook the fully initialised structure up to the task last. */
        p->vfork_done = &kt->exited;
        p->worker_private = kt;
        return true;
}

/*
 * Detach and free the struct kthread of @k, including its full name.
 * Does nothing when @k has no struct kthread attached.
 */
void free_kthread_struct(struct task_struct *k)
{
        /* Can be NULL if the allocation in set_kthread_struct() failed. */
        struct kthread *kthread = to_kthread(k);

        if (!kthread)
                return;

#ifdef CONFIG_BLK_CGROUP
        WARN_ON_ONCE(kthread->blkcg_css);
#endif
        /* Clear the task's pointer before the memory goes away. */
        k->worker_private = NULL;
        kfree(kthread->full_name);
        kfree(kthread);
}

/**
 * kthread_should_stop - should this kthread return now?
 *
 * When someone calls kthread_stop() on your kthread, it will be woken
 * and this will return true.  You should then return, and your return
 * value will be passed through to kthread_stop().
 */
bool kthread_should_stop(void)
{
        return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);

static bool __kthread_should_park(struct task_struct *k)
{
        return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}

/**
 * kthread_should_park - should this kthread park now?
 *
 * Returns true once KTHREAD_SHOULD_PARK is set for the current kthread.
 * When someone calls kthread_park() on your kthread, it will be woken
 * and this will return true.  You should then do the necessary
 * cleanup and call kthread_parkme().
 *
 * Similar to kthread_should_stop(), but this keeps the thread alive
 * and in a park position. kthread_unpark() "restarts" the thread and
 * calls the thread function again.
 */
bool kthread_should_park(void)
{
        struct task_struct *self = current;

        return __kthread_should_park(self);
}
EXPORT_SYMBOL_GPL(kthread_should_park);

/*
 * Return true when the current task is a kthread that has either
 * KTHREAD_SHOULD_STOP or KTHREAD_SHOULD_PARK set.  Unlike
 * kthread_should_stop(), this is safe to call from a task that is not a
 * kthread; false is returned in that case.
 */
bool kthread_should_stop_or_park(void)
{
        const unsigned long stop_or_park = BIT(KTHREAD_SHOULD_STOP) |
                                           BIT(KTHREAD_SHOULD_PARK);
        struct kthread *kthread = __to_kthread(current);

        return kthread && (kthread->flags & stop_or_park);
}

/**
 * kthread_freezable_should_stop - should this freezable kthread return now?
 * @was_frozen: optional out parameter, indicates whether %current was frozen
 *
 * kthread_should_stop() for freezable kthreads, which will enter
 * refrigerator if necessary.  This function is safe from kthread_stop() /
 * freezer deadlock and freezable kthreads should use this function instead
 * of calling try_to_freeze() directly.
 *
 * May sleep.  Returns the result of kthread_should_stop().
 */
bool kthread_freezable_should_stop(bool *was_frozen)
{
        bool frozen;

        might_sleep();

        /* Only enter the refrigerator when a freeze is actually pending. */
        frozen = unlikely(freezing(current)) && __refrigerator(true);

        if (was_frozen)
                *was_frozen = frozen;

        return kthread_should_stop();
}
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);

/**
 * kthread_func - return the function specified on kthread creation
 * @task: kthread task in question
 *
 * Returns the thread function recorded in the task's struct kthread, or
 * NULL if the task is not a kthread.
 */
void *kthread_func(struct task_struct *task)
{
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return NULL;

        return kthread->threadfn;
}
EXPORT_SYMBOL_GPL(kthread_func);

/**
 * kthread_data - return data value specified on kthread creation
 * @task: kthread task in question
 *
 * Return the data value specified when kthread @task was created.
 * The caller is responsible for ensuring the validity of @task when
 * calling this function.
 */
void *kthread_data(struct task_struct *task)
{
        return to_kthread(task)->data;
}
EXPORT_SYMBOL_GPL(kthread_data);

/**
 * kthread_probe_data - speculative version of kthread_data()
 * @task: possible kthread task in question
 *
 * @task could be a kthread task.  Return the data value specified when it
 * was created if accessible.  If @task isn't a kthread task or its data is
 * inaccessible for any reason, %NULL is returned.  This function requires
 * that @task itself is safe to dereference.
 */
void *kthread_probe_data(struct task_struct *task)
{
        void *data = NULL;
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return NULL;

        /* Non-faulting read; the result is deliberately not checked. */
        copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
        return data;
}

/*
 * Park the calling kthread for as long as KTHREAD_SHOULD_PARK is set in
 * @self->flags.  Each pass signals @self->parked and then schedules away in
 * TASK_PARKED; the flag is re-checked after every wakeup.  The task state is
 * TASK_RUNNING again on return.
 */
static void __kthread_parkme(struct kthread *self)
{
        for (;;) {
                /*
                 * TASK_PARKED is a special state; we must serialize against
                 * possible pending wakeups to avoid store-store collisions on
                 * task->state.
                 *
                 * Such a collision might possibly result in the task state
                 * changing from TASK_PARKED and us failing the
                 * wait_task_inactive() in kthread_park().
                 */
                set_special_state(TASK_PARKED);
                /* The state is set before the flag test, so the flag is re-read on every pass. */
                if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                        break;

                /*
                 * Thread is going to call schedule(), do not preempt it,
                 * or the caller of kthread_park() may spend more time in
                 * wait_task_inactive().
                 */
                preempt_disable();
                complete(&self->parked);
                schedule_preempt_disabled();
                preempt_enable();
        }
        __set_current_state(TASK_RUNNING);
}

/* Park the current kthread; see __kthread_parkme() for the details. */
void kthread_parkme(void)
{
        struct kthread *self = to_kthread(current);

        __kthread_parkme(self);
}
EXPORT_SYMBOL_GPL(kthread_parkme);

/**
 * kthread_exit - Cause the current kthread to return @result to kthread_stop().
 * @result: The integer value to return to kthread_stop().
 *
 * While kthread_exit can be called directly, it exists so that
 * functions which do some additional work in non-modular code such as
 * module_put_and_kthread_exit can be implemented.
 *
 * @result is stored in the kthread's own structure; the task itself always
 * exits through do_exit(0).  A kthread that is linked on kthreads_hotplug is
 * unlinked first and its preferred affinity mask is released.
 *
 * Does not return.
 */
void __noreturn kthread_exit(long result)
{
        struct kthread *kthread = to_kthread(current);

        kthread->result = result;

        /* Only kthreads that were put on kthreads_hotplug need unlinking. */
        if (!list_empty(&kthread->hotplug_node)) {
                mutex_lock(&kthreads_hotplug_lock);
                list_del(&kthread->hotplug_node);
                mutex_unlock(&kthreads_hotplug_lock);

                /* kfree() accepts NULL, so no guard is needed. */
                kfree(kthread->preferred_affinity);
                kthread->preferred_affinity = NULL;
        }
        do_exit(0);
}
EXPORT_SYMBOL(kthread_exit);

/**
 * kthread_complete_and_exit - Exit the current kthread.
 * @comp: Completion to complete
 * @code: The integer value to return to kthread_stop().
 *
 * If @comp is non-NULL it is completed first; the kthread then exits through
 * kthread_exit() with @code.
 *
 * A kernel thread whose module may be removed after the completion of
 * @comp can use this function to exit safely.
 *
 * Does not return.
 */
void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
{
        /* The completion is optional. */
        if (comp)
                complete(comp);

        kthread_exit(code);
}
EXPORT_SYMBOL(kthread_complete_and_exit);

/*
 * Compute the CPU mask @kthread should run on and store it in @cpumask.
 *
 * The preferred mask (explicit preference, else the CPUs of the kthread's
 * node) is intersected with the HK_TYPE_KTHREAD housekeeping mask.  If the
 * intersection is empty, the whole housekeeping mask is used instead.
 * @cpumask is left untouched (with a one-time warning) when there is neither
 * a preference nor a node.
 */
static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
{
        const struct cpumask *pref = kthread->preferred_affinity;

        if (!pref) {
                /* No explicit preference: fall back to the CPUs of the node. */
                if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
                        return;
                pref = cpumask_of_node(kthread->node);
        }

        cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
        if (!cpumask_empty(cpumask))
                return;

        /* Nothing left after filtering: use every housekeeping CPU. */
        cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
}

/*
 * Apply the default affinity to the current kthread.
 *
 * Without a node the thread is simply affined to the HK_TYPE_KTHREAD
 * housekeeping CPUs.  With a node the thread is linked on kthreads_hotplug
 * and given the mask computed by kthread_fetch_affinity(), both under
 * kthreads_hotplug_lock.  If the temporary mask cannot be allocated, a
 * one-time warning is raised and the affinity is left unchanged.
 */
static void kthread_affine_node(void)
{
        struct kthread *self = to_kthread(current);
        cpumask_var_t affinity;

        WARN_ON_ONCE(kthread_is_per_cpu(current));

        if (self->node == NUMA_NO_NODE) {
                housekeeping_affine(current, HK_TYPE_KTHREAD);
                return;
        }

        if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
                WARN_ON_ONCE(1);
                return;
        }

        mutex_lock(&kthreads_hotplug_lock);
        WARN_ON_ONCE(!list_empty(&self->hotplug_node));
        list_add_tail(&self->hotplug_node, &kthreads_hotplug);
        /*
         * The node cpumask is racy when read from kthread() but:
         * - a racing CPU going down will either fail on the subsequent
         *   call to set_cpus_allowed_ptr() or be migrated to housekeepers
         *   afterwards by the scheduler.
         * - a racing CPU going up will be handled by kthreads_online_cpu()
         */
        kthread_fetch_affinity(self, affinity);
        set_cpus_allowed_ptr(current, affinity);
        mutex_unlock(&kthreads_hotplug_lock);

        free_cpumask_var(affinity);
}

/*
 * Entry point of every new kernel thread; @_create is the
 * struct kthread_create_info queued by __kthread_create_on_node().
 *
 * The thread claims the creator's completion, publishes itself through
 * create->result, signals the creator and sleeps until it is woken.  After
 * the wakeup it applies the default affinity where applicable and, unless it
 * was asked to stop in the meantime, runs threadfn(data).  It always ends in
 * kthread_exit(), with -EINTR when the thread function was never run.
 */
static int kthread(void *_create)
{
        static const struct sched_param param = { .sched_priority = 0 };
        /*
         * Copy threadfn/data out of @create up front: once complete(done) has
         * run, the creator may kfree() the structure.
         */
        struct kthread_create_info *create = _create;
        int (*threadfn)(void *data) = create->threadfn;
        void *data = create->data;
        struct completion *done;
        struct kthread *self;
        int ret;

        self = to_kthread(current);

        /* Release the structure when caller killed by a fatal signal. */
        done = xchg(&create->done, NULL);
        if (!done) {
                /* The creator already gave up: this thread owns and frees @create. */
                kfree(create->full_name);
                kfree(create);
                kthread_exit(-EINTR);
        }

        /* Ownership of the name moves to struct kthread (freed in free_kthread_struct()). */
        self->full_name = create->full_name;
        self->threadfn = threadfn;
        self->data = data;

        /*
         * The new thread inherited kthreadd's priority and CPU mask. Reset
         * back to default in case they have been changed.
         */
        sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);

        /* OK, tell user we're spawned, wait for stop or wakeup */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        create->result = current;
        /*
         * Thread is going to call schedule(), do not preempt it,
         * or the creator may spend more time in wait_task_inactive().
         */
        preempt_disable();
        complete(done);
        /* @create must not be touched from here on. */
        schedule_preempt_disabled();
        preempt_enable();

        self->started = 1;

        /* Default affinity only when none was requested and changing it is allowed. */
        if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
                kthread_affine_node();

        /* -EINTR is reported when a stop request arrived before threadfn ran. */
        ret = -EINTR;
        if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
                cgroup_kthread_ready();
                __kthread_parkme(self);
                ret = threadfn(data);
        }
        kthread_exit(ret);
}

/*
 * Called from kernel_clone() to get node information for the task that is
 * about to be created.  Under CONFIG_NUMA, kthreadd reports the node stashed
 * in its pref_node_fork; every other task (and every !CONFIG_NUMA build)
 * gets NUMA_NO_NODE.
 */
int tsk_fork_get_node(struct task_struct *tsk)
{
        int node = NUMA_NO_NODE;

#ifdef CONFIG_NUMA
        if (tsk == kthreadd_task)
                node = tsk->pref_node_fork;
#endif
        return node;
}

/*
 * Spawn the thread described by @create.
 *
 * On success nothing else is done here: the new thread runs kthread() and
 * reports back through @create itself.  If kernel_thread() fails, the name
 * is freed and either the error is handed to the waiting creator through
 * create->result, or - when the creator has already given up (create->done
 * is NULL) - @create is freed here.
 */
static void create_kthread(struct kthread_create_info *create)
{
        int pid;

#ifdef CONFIG_NUMA
        /* Stash the requested node where tsk_fork_get_node() reads it back. */
        current->pref_node_fork = create->node;
#endif
        /* We want our own signal handler (we take no signals by default). */
        pid = kernel_thread(kthread, create, create->full_name,
                            CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid < 0) {
                /* Release the structure when caller killed by a fatal signal. */
                struct completion *done = xchg(&create->done, NULL);

                kfree(create->full_name);
                if (!done) {
                        /* Nobody is waiting any more: @create is ours to free. */
                        kfree(create);
                        return;
                }
                /* The creator is still waiting and will free @create itself. */
                create->result = ERR_PTR(pid);
                complete(done);
        }
}

/*
 * Queue a creation request for kthreadd and wait for the outcome.
 *
 * @threadfn: function the new thread will run
 * @data:     argument for @threadfn
 * @node:     node recorded in the request (see create_kthread())
 * @namefmt:  printf-style format for the thread name
 * @args:     arguments for @namefmt
 *
 * Returns create->result as filled in by the new thread or by kthreadd (a
 * task pointer, or an ERR_PTR() if spawning failed), ERR_PTR(-ENOMEM) if the
 * request or its name cannot be allocated, or ERR_PTR(-EINTR) if the caller
 * received a fatal signal before anyone claimed the completion.
 */
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
                                                    void *data, int node,
                                                    const char namefmt[],
                                                    va_list args)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct task_struct *task;
        struct kthread_create_info *create = kmalloc(sizeof(*create),
                                                     GFP_KERNEL);

        if (!create)
                return ERR_PTR(-ENOMEM);
        create->threadfn = threadfn;
        create->data = data;
        create->node = node;
        create->done = &done;
        create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
        if (!create->full_name) {
                task = ERR_PTR(-ENOMEM);
                goto free_create;
        }

        /* Hand the request over and wake kthreadd. */
        spin_lock(&kthread_create_lock);
        list_add_tail(&create->list, &kthread_create_list);
        spin_unlock(&kthread_create_lock);

        wake_up_process(kthreadd_task);
        /*
         * Wait for completion in killable state, for I might be chosen by
         * the OOM killer while kthreadd is trying to allocate memory for
         * new kernel thread.
         */
        if (unlikely(wait_for_completion_killable(&done))) {
                /*
                 * If I was killed by a fatal signal before kthreadd (or new
                 * kernel thread) calls complete(), leave the cleanup of this
                 * structure to that thread.
                 */
                if (xchg(&create->done, NULL))
                        return ERR_PTR(-EINTR);
                /*
                 * kthreadd (or new kernel thread) will call complete()
                 * shortly.
                 */
                wait_for_completion(&done);
        }
        /* The other side has completed @done, so @create is ours to read and free. */
        task = create->result;
free_create:
        kfree(create);
        return task;
}

/**
 * kthread_create_on_node - create and name a kernel thread.
 * @threadfn: function the new thread runs once it has been woken up.
 * @data: opaque argument handed to @threadfn.
 * @node: NUMA node on which the task and thread structures are allocated,
 *        or NUMA_NO_NODE for no preference.
 * @namefmt: printf-style format for the thread name; its arguments follow.
 *
 * Description: The thread is created but left stopped; start it with
 * wake_up_process() (see also kthread_run()).  It has SCHED_NORMAL policy
 * and is affine to all CPUs.
 *
 * If the thread is going to be bound to a particular cpu, pass that cpu's
 * node in @node so that the kthread stack gets NUMA affinity.
 *
 * Once woken, the thread calls @threadfn() with @data as its argument.
 * @threadfn() may return directly if it is a standalone thread that nobody
 * will kthread_stop(), or return once 'kthread_should_stop()' is true
 * (meaning kthread_stop() has been called).  The return value should be
 * zero or a negative error number; it is passed on to kthread_stop().
 *
 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
 */
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           void *data, int node,
                                           const char namefmt[],
                                           ...)
{
        struct task_struct *new_task;
        va_list ap;

        /* Forward the variadic name arguments to the va_list backend. */
        va_start(ap, namefmt);
        new_task = __kthread_create_on_node(threadfn, data, node, namefmt, ap);
        va_end(ap);

        return new_task;
}
EXPORT_SYMBOL(kthread_create_on_node);

/*
 * Restrict @p to @mask and set PF_NO_SETAFFINITY on it.
 *
 * @p must become inactive in @state first; if wait_task_inactive() reports
 * that it did not, a warning is emitted and nothing is changed.
 */
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
        unsigned long irq_flags;

        if (wait_task_inactive(p, state)) {
                /* It's safe because the task is inactive. */
                raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
                do_set_cpus_allowed(p, mask);
                p->flags |= PF_NO_SETAFFINITY;
                raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
                return;
        }

        WARN_ON(1);
}

/* Single-CPU flavour of __kthread_bind_mask(): bind @p to @cpu only. */
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
{
        const struct cpumask *only_cpu = cpumask_of(cpu);

        __kthread_bind_mask(p, only_cpu, state);
}

/*
 * Bind the kthread @p to the CPUs in @mask. @p is expected to be inactive
 * in TASK_UNINTERRUPTIBLE and not yet marked as started; a warning is
 * emitted if it has already been started.
 */
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
        struct kthread *kt = to_kthread(p);

        __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
        WARN_ON_ONCE(kt->started);
}

/**
 * kthread_bind - bind a just-created kthread to a cpu.
 * @p: thread created by kthread_create().
 * @cpu: cpu (might not be online, must be possible) for @p to run on.
 *
 * Description: This function is equivalent to set_cpus_allowed(),
 * except that @cpu doesn't need to be online, and the thread must be
 * stopped (i.e., just returned from kthread_create()).  A warning is
 * emitted when @p has already been started.
 */
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
        struct kthread *kt = to_kthread(p);

        __kthread_bind_mask(p, cpumask_of(cpu), TASK_UNINTERRUPTIBLE);
        WARN_ON_ONCE(kt->started);
}
EXPORT_SYMBOL(kthread_bind);

/**
 * kthread_create_on_cpu - Create a cpu bound kthread
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @cpu: The cpu on which the thread should be bound,
 * @namefmt: printf-style name for the thread. Format is restricted
 *             to "name.*%u". Code fills in cpu number.
 *
 * Description: This helper function creates and names a kernel thread
 * on the NUMA node of @cpu and binds it to @cpu.
 *
 * Returns the new task, or the ERR_PTR() from kthread_create_on_node().
 */
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                          void *data, unsigned int cpu,
                                          const char *namefmt)
{
        struct task_struct *task;

        task = kthread_create_on_node(threadfn, data, cpu_to_node(cpu),
                                      namefmt, cpu);
        if (!IS_ERR(task)) {
                kthread_bind(task, cpu);
                /* CPU hotplug need to bind once again when unparking the thread. */
                to_kthread(task)->cpu = cpu;
        }

        return task;
}
EXPORT_SYMBOL(kthread_create_on_cpu);

/*
 * Mark or unmark @k as a per-CPU kthread.
 *
 * A negative @cpu clears KTHREAD_IS_PER_CPU. Otherwise @cpu is recorded in
 * the kthread and the flag is set. @k is expected to already have
 * PF_NO_SETAFFINITY set; a warning is emitted if it does not. Nothing is
 * done when @k has no struct kthread.
 */
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
        struct kthread *kt = to_kthread(k);

        if (!kt)
                return;

        WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));

        if (cpu >= 0) {
                kt->cpu = cpu;
                set_bit(KTHREAD_IS_PER_CPU, &kt->flags);
        } else {
                clear_bit(KTHREAD_IS_PER_CPU, &kt->flags);
        }
}

/*
 * Tell whether @p has KTHREAD_IS_PER_CPU set. Returns false when
 * __to_kthread() finds no struct kthread for @p.
 */
bool kthread_is_per_cpu(struct task_struct *p)
{
        struct kthread *kt = __to_kthread(p);

        return kt && test_bit(KTHREAD_IS_PER_CPU, &kt->flags);
}

/**
 * kthread_unpark - unpark a thread created by kthread_create().
 * @k:                thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return false and wakes it from
 * TASK_PARKED. If the thread is marked percpu then it is bound to its
 * cpu again first. Does nothing when @k has not been asked to park.
 */
void kthread_unpark(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /* Not asked to park: nothing to undo. */
        if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))
                return;
        /*
         * Newly created kthread was parked when the CPU was offline.
         * The binding was lost and we need to set it again.
         */
        if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
                __kthread_bind(k, kthread->cpu, TASK_PARKED);

        clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /*
         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
         */
        wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);

/**
 * kthread_park - park a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return true, wakes it, and
 * waits for it to return. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will park without
 * calling threadfn().
 *
 * Returns 0 if the thread is parked, -ENOSYS if the thread exited,
 * -EBUSY if the thread has already been asked to park.
 * If called by the kthread itself just the park bit is set.
 */
int kthread_park(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        if (WARN_ON(k->flags & PF_EXITING))
                return -ENOSYS;

        /* A park request is already pending; do not stack a second one. */
        if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
                return -EBUSY;

        set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        if (k != current) {
                wake_up_process(k);
                /*
                 * Wait for __kthread_parkme() to complete(), this means we
                 * _will_ have TASK_PARKED and are about to call schedule().
                 */
                wait_for_completion(&kthread->parked);
                /*
                 * Now wait for that schedule() to complete and the task to
                 * get scheduled out.
                 */
                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kthread_park);

/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_stop() for @k to return true, wakes it, and
 * waits for it to exit. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will exit without
 * calling threadfn().
 *
 * If threadfn() may call kthread_exit() itself, the caller must ensure
 * task_struct can't go away.
 *
 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
 * was never called.
 */
int kthread_stop(struct task_struct *k)
{
        struct kthread *kthread;
        int ret;

        trace_sched_kthread_stop(k);

        /* Hold a reference on @k until its result has been read below. */
        get_task_struct(k);
        kthread = to_kthread(k);
        set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
        /* A parked thread has to be released so that it can see the stop request. */
        kthread_unpark(k);
        set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
        wake_up_process(k);
        wait_for_completion(&kthread->exited);
        ret = kthread->result;
        put_task_struct(k);

        trace_sched_kthread_stop_ret(ret);
        return ret;
}
EXPORT_SYMBOL(kthread_stop);

/**
 * kthread_stop_put - stop a thread and put its task struct
 * @k: thread created by kthread_create().
 *
 * Calls kthread_stop() on @k and then drops one task_struct reference.
 * Only use when holding an extra task struct reference obtained by
 * calling get_task_struct().
 *
 * Returns the value returned by kthread_stop().
 */
int kthread_stop_put(struct task_struct *k)
{
        int result = kthread_stop(k);

        put_task_struct(k);
        return result;
}
EXPORT_SYMBOL(kthread_stop_put);

/*
 * Body of the kthreadd task: create the kernel threads requested through
 * kthread_create_list.
 *
 * After setting up its own context it loops forever, sleeping while the
 * list is empty and calling create_kthread() for every queued request.
 * The trailing return is never reached.
 */
int kthreadd(void *unused)
{
        static const char comm[TASK_COMM_LEN] = "kthreadd";
        struct task_struct *tsk = current;

        /* Setup a clean context for our children to inherit. */
        set_task_comm(tsk, comm);
        ignore_signals(tsk);
        set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
        set_mems_allowed(node_states[N_MEMORY]);

        current->flags |= PF_NOFREEZE;
        cgroup_init_kthreadd();

        for (;;) {
                /*
                 * Change the task state before checking the list, so that a
                 * wake_up_process() issued after a request was queued is
                 * not lost.
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                if (list_empty(&kthread_create_list))
                        schedule();
                __set_current_state(TASK_RUNNING);

                spin_lock(&kthread_create_lock);
                while (!list_empty(&kthread_create_list)) {
                        struct kthread_create_info *create;

                        create = list_entry(kthread_create_list.next,
                                            struct kthread_create_info, list);
                        list_del_init(&create->list);
                        /* The lock is not held across create_kthread(). */
                        spin_unlock(&kthread_create_lock);

                        create_kthread(create);

                        spin_lock(&kthread_create_lock);
                }
                spin_unlock(&kthread_create_lock);
        }

        return 0;
}

/*
 * Record @mask as the preferred affinity of the not-yet-started kthread @p
 * and apply the resulting affinity right away.
 *
 * @p must be inactive in TASK_UNINTERRUPTIBLE and must not have been
 * started yet. A copy of @mask is stored in kthread->preferred_affinity and
 * the kthread is added to the kthreads_hotplug list, which
 * kthreads_online_cpu() walks when a CPU comes online.
 *
 * Returns 0 on success, -EINVAL if @p is not inactive or was already
 * started, -ENOMEM if a cpumask cannot be allocated.
 */
int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
        struct kthread *kthread = to_kthread(p);
        cpumask_var_t affinity;
        unsigned long flags;
        int ret = 0;

        if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
                WARN_ON(1);
                return -EINVAL;
        }

        /* A preference is meant to be set only once per kthread. */
        WARN_ON_ONCE(kthread->preferred_affinity);

        /* Scratch mask for the affinity computed by kthread_fetch_affinity(). */
        if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
                return -ENOMEM;

        kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
        if (!kthread->preferred_affinity) {
                ret = -ENOMEM;
                goto out;
        }

        mutex_lock(&kthreads_hotplug_lock);
        cpumask_copy(kthread->preferred_affinity, mask);
        WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
        list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
        kthread_fetch_affinity(kthread, affinity);

        /* It's safe because the task is inactive. */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        do_set_cpus_allowed(p, affinity);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);

        mutex_unlock(&kthreads_hotplug_lock);
out:
        free_cpumask_var(affinity);

        return ret;
}
EXPORT_SYMBOL_GPL(kthread_affine_preferred);

/*
 * Re-affine kthreads according to their preferences
 * and the newly online CPU. The CPU down part is handled
 * by select_fallback_rq() which default re-affines to
 * housekeepers from other nodes in case the preferred
 * affinity doesn't apply anymore.
 *
 * Registered as a CPU hotplug callback by kthreads_init(). Returns 0 on
 * success, -ENOMEM if the scratch cpumask cannot be allocated, or -EINVAL
 * if at least one kthread on the list had to be skipped.
 */
static int kthreads_online_cpu(unsigned int cpu)
{
        cpumask_var_t affinity;
        struct kthread *k;
        int ret;

        /* Scope-based guard: the mutex is dropped on every return path. */
        guard(mutex)(&kthreads_hotplug_lock);

        if (list_empty(&kthreads_hotplug))
                return 0;

        if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
                return -ENOMEM;

        ret = 0;

        list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
                /*
                 * Kthreads with a fixed binding are not expected on this
                 * list; warn, remember the error and keep processing the
                 * remaining entries.
                 */
                if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
                                 kthread_is_per_cpu(k->task))) {
                        ret = -EINVAL;
                        continue;
                }
                kthread_fetch_affinity(k, affinity);
                set_cpus_allowed_ptr(k->task, affinity);
        }

        free_cpumask_var(affinity);

        return ret;
}

/*
 * Early initcall: register kthreads_online_cpu() as the online callback of
 * the CPUHP_AP_KTHREADS_ONLINE hotplug state. No offline callback is
 * installed.
 */
static int kthreads_init(void)
{
        int err;

        err = cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
                                kthreads_online_cpu, NULL);
        return err;
}
early_initcall(kthreads_init);

/*
 * Bring @worker into its initial state: everything zeroed, the lock
 * initialized and given the lockdep class @key with @name, and both work
 * lists empty.
 */
void __kthread_init_worker(struct kthread_worker *worker,
                                const char *name,
                                struct lock_class_key *key)
{
        memset(worker, 0, sizeof(*worker));

        raw_spin_lock_init(&worker->lock);
        lockdep_set_class_and_name(&worker->lock, key, name);

        INIT_LIST_HEAD(&worker->delayed_work_list);
        INIT_LIST_HEAD(&worker->work_list);
}
EXPORT_SYMBOL_GPL(__kthread_init_worker);

/**
 * kthread_worker_fn - kthread function to process kthread_worker
 * @worker_ptr: pointer to initialized kthread_worker
 *
 * This function implements the main cycle of kthread worker. It processes
 * work_list until it is stopped with kthread_stop(). It sleeps when the queue
 * is empty.
 *
 * The works are not allowed to keep any locks, disable preemption or interrupts
 * when they finish. There is defined a safe point for freezing when one work
 * finishes and before a new one is started.
 *
 * Also the works must not be handled by more than one worker at the same time,
 * see also kthread_queue_work().
 *
 * Return: 0 once kthread_should_stop() is true.
 */
int kthread_worker_fn(void *worker_ptr)
{
        struct kthread_worker *worker = worker_ptr;
        struct kthread_work *work;

        /*
         * FIXME: Update the check and remove the assignment when all kthread
         * worker users are created using kthread_create_worker*() functions.
         */
        WARN_ON(worker->task && worker->task != current);
        worker->task = current;

        if (worker->flags & KTW_FREEZABLE)
                set_freezable();

repeat:
        set_current_state(TASK_INTERRUPTIBLE);        /* mb paired w/ kthread_stop */

        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                /* Detach from the worker so that no further wakeups target this task. */
                raw_spin_lock_irq(&worker->lock);
                worker->task = NULL;
                raw_spin_unlock_irq(&worker->lock);
                return 0;
        }

        /* Dequeue the first pending work, if any, and publish it as current_work. */
        work = NULL;
        raw_spin_lock_irq(&worker->lock);
        if (!list_empty(&worker->work_list)) {
                work = list_first_entry(&worker->work_list,
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
        worker->current_work = work;
        raw_spin_unlock_irq(&worker->lock);

        if (work) {
                /* Saved up front: @work must not be dereferenced after it has run. */
                kthread_work_func_t func = work->func;
                __set_current_state(TASK_RUNNING);
                trace_sched_kthread_work_execute_start(work);
                work->func(work);
                /*
                 * Avoid dereferencing work after this point.  The trace
                 * event only cares about the address.
                 */
                trace_sched_kthread_work_execute_end(work, func);
        } else if (!freezing(current)) {
                /* Nothing to do: sleep until a work is queued or we are stopped. */
                schedule();
        } else {
                /*
                 * Handle the case where the current remains
                 * TASK_INTERRUPTIBLE. try_to_freeze() expects
                 * the current to be TASK_RUNNING.
                 */
                __set_current_state(TASK_RUNNING);
        }

        try_to_freeze();
        cond_resched();
        goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);

/*
 * Allocate and initialize a kthread_worker, then create the kthread that
 * will run kthread_worker_fn() for it on @node.
 *
 * Returns the worker, ERR_PTR(-ENOMEM) if the worker cannot be allocated,
 * or the error from __kthread_create_on_node(); the worker is freed again
 * in the latter case.
 */
static __printf(3, 0) struct kthread_worker *
__kthread_create_worker_on_node(unsigned int flags, int node,
                                const char namefmt[], va_list args)
{
        struct kthread_worker *new_worker;
        struct task_struct *worker_task;

        new_worker = kzalloc(sizeof(*new_worker), GFP_KERNEL);
        if (!new_worker)
                return ERR_PTR(-ENOMEM);

        kthread_init_worker(new_worker);

        worker_task = __kthread_create_on_node(kthread_worker_fn, new_worker,
                                               node, namefmt, args);
        if (IS_ERR(worker_task)) {
                kfree(new_worker);
                return ERR_CAST(worker_task);
        }

        new_worker->flags = flags;
        new_worker->task = worker_task;

        return new_worker;
}

/**
 * kthread_create_worker_on_node - create a kthread worker
 * @flags: flags modifying the default behavior of the worker
 * @node: task structure for the thread is allocated on this node
 * @namefmt: printf-style name for the kthread worker (task); its
 *           arguments follow.
 *
 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the caller was killed by a fatal signal.
 */
struct kthread_worker *
kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...)
{
        struct kthread_worker *new_worker;
        va_list ap;

        /* Forward the variadic name arguments to the va_list backend. */
        va_start(ap, namefmt);
        new_worker = __kthread_create_worker_on_node(flags, node, namefmt, ap);
        va_end(ap);

        return new_worker;
}
EXPORT_SYMBOL(kthread_create_worker_on_node);

/**
 * kthread_create_worker_on_cpu - create a kthread worker and bind it
 *        to a given CPU and the associated NUMA node.
 * @cpu: CPU number
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the thread. Format is restricted
 *             to "name.*%u". Code fills in cpu number.
 *
 * Use a valid CPU number if you want to bind the kthread worker
 * to the given CPU and the associated NUMA node.
 *
 * A good practice is to add the cpu number also into the worker name.
 * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%u");
 * the cpu number is supplied for the format by this function.
 *
 * CPU hotplug:
 * The kthread worker API is simple and generic. It just provides a way
 * to create, use, and destroy workers.
 *
 * It is up to the API user how to handle CPU hotplug. They have to decide
 * how to handle pending work items, prevent queuing new ones, and
 * restore the functionality when the CPU goes off and on. There are a
 * few catches:
 *
 *    - CPU affinity gets lost when it is scheduled on an offline CPU.
 *
 *    - The worker might not exist when the CPU was off when the user
 *      created the workers.
 *
 * Good practice is to implement two CPU hotplug callbacks and to
 * destroy/create the worker when the CPU goes down/up.
 *
 * Return:
 * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the caller was killed by a fatal signal.
 */
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
                             const char namefmt[])
{
        struct kthread_worker *new_worker;

        new_worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu),
                                                   namefmt, cpu);
        if (IS_ERR(new_worker))
                return new_worker;

        kthread_bind(new_worker->task, cpu);
        return new_worker;
}
EXPORT_SYMBOL(kthread_create_worker_on_cpu);

/*
 * Returns true when the work could not be queued at the moment.
 * It happens when it is already pending in a worker list
 * or when it is being cancelled.
 */
static inline bool queuing_blocked(struct kthread_worker *worker,
                                   struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);

        return !list_empty(&work->node) || work->canceling;
}

/*
 * Debug checks run before @work is put on a list of @worker: the lock must
 * be held, @work must not be on any list yet, and it must not already
 * belong to a different worker. Violations only trigger warnings.
 */
static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
                                             struct kthread_work *work)
{
        struct kthread_worker *owner = work->worker;

        lockdep_assert_held(&worker->lock);
        WARN_ON_ONCE(!list_empty(&work->node));
        /* Do not use a work with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(owner && owner != worker);
}

/*
 * Link @work into @worker's queue immediately before @pos and wake the
 * worker task, unless it is currently processing a work or has no task.
 */
static void kthread_insert_work(struct kthread_worker *worker,
                                struct kthread_work *work,
                                struct list_head *pos)
{
        kthread_insert_work_sanity_check(worker, work);
        trace_sched_kthread_work_queue_work(worker, work);

        list_add_tail(&work->node, pos);
        work->worker = worker;

        /* A worker that is processing a work looks at the list again by itself. */
        if (worker->current_work)
                return;

        if (likely(worker->task))
                wake_up_process(worker->task);
}

/**
 * kthread_queue_work - queue a kthread_work
 * @worker: target kthread_worker
 * @work: kthread_work to queue
 *
 * Queue @work to @worker for async execution.  @worker must have been
 * created with kthread_create_worker().  Returns %true if @work was
 * successfully queued, %false if it was already pending or is being
 * canceled.
 *
 * Reinitialize the work if it needs to be used by another worker.
 * For example, when the worker was stopped and started again.
 */
bool kthread_queue_work(struct kthread_worker *worker,
                        struct kthread_work *work)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);
        queued = !queuing_blocked(worker, work);
        if (queued)
                kthread_insert_work(worker, work, &worker->work_list);
        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);

/**
 * kthread_delayed_work_timer_fn - callback that queues the associated kthread
 *        delayed work when the timer expires.
 * @t: pointer to the expired timer
 *
 * The format of the function is defined by struct timer_list.
 * It should have been called from irqsafe timer with irq already off.
 *
 * The work is taken off worker->delayed_work_list and, unless it is being
 * canceled, appended to worker->work_list.
 */
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
        struct kthread_delayed_work *dwork = timer_container_of(dwork, t,
                                                                timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
        unsigned long flags;

        /*
         * This might happen when a pending work is reinitialized.
         * It means that the work is being used incorrectly.
         */
        if (WARN_ON_ONCE(!worker))
                return;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /* Move the work from worker->delayed_work_list. */
        WARN_ON_ONCE(list_empty(&work->node));
        list_del_init(&work->node);
        /* A work that is being canceled is dropped instead of queued. */
        if (!work->canceling)
                kthread_insert_work(worker, work, &worker->work_list);

        raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);

/*
 * Queue @dwork on @worker, either right away (@delay == 0) or by arming
 * its timer to fire after @delay jiffies. The timer of @dwork is expected
 * to use kthread_delayed_work_timer_fn() as its callback; a warning is
 * emitted otherwise.
 */
static void __kthread_queue_delayed_work(struct kthread_worker *worker,
                                         struct kthread_delayed_work *dwork,
                                         unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        struct timer_list *timer = &dwork->timer;

        WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);

        if (delay) {
                /* Be paranoid and try to detect possible races already now. */
                kthread_insert_work_sanity_check(worker, work);

                list_add(&work->node, &worker->delayed_work_list);
                work->worker = worker;
                timer->expires = jiffies + delay;
                add_timer(timer);
                return;
        }

        /*
         * If @delay is 0, queue @dwork->work immediately.  This is for
         * both optimization and correctness.  The earliest @timer can
         * expire is on the closest next tick and delayed_work users depend
         * on that there's no such delay when @delay is 0.
         */
        kthread_insert_work(worker, work, &worker->work_list);
}

/**
 * kthread_queue_delayed_work - queue the associated kthread work
 *        after a delay.
 * @worker: target kthread_worker
 * @dwork: kthread_delayed_work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If the work has not been pending it starts a timer that will queue
 * the work after the given @delay. If @delay is zero, it queues the
 * work immediately.
 *
 * Return: %false if the @work has already been pending. It means that
 * either the timer was running or the work was queued. It returns %true
 * otherwise.
 */
bool kthread_queue_delayed_work(struct kthread_worker *worker,
                                struct kthread_delayed_work *dwork,
                                unsigned long delay)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);
        queued = !queuing_blocked(worker, &dwork->work);
        if (queued)
                __kthread_queue_delayed_work(worker, dwork, delay);
        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);

/*
 * Helper item that kthread_flush_work() queues behind the work it flushes;
 * the flusher sleeps on @done until the worker has processed it.
 */
struct kthread_flush_work {
        /* Work item whose callback is kthread_flush_work_fn(). */
        struct kthread_work        work;
        /* Completed by kthread_flush_work_fn(). */
        struct completion        done;
};

/* Work callback of a kthread_flush_work: signal the waiting flusher. */
static void kthread_flush_work_fn(struct kthread_work *work)
{
        struct kthread_flush_work *barrier;

        barrier = container_of(work, struct kthread_flush_work, work);
        complete(&barrier->done);
}

/**
 * kthread_flush_work - flush a kthread_work
 * @work: work to flush
 *
 * If @work is queued or executing, wait for it to finish execution.
 * Returns immediately when @work has never been assigned to a worker or is
 * neither queued nor executing.
 */
void kthread_flush_work(struct kthread_work *work)
{
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };
        struct kthread_worker *worker;
        bool noop = false;

        worker = work->worker;
        if (!worker)
                return;

        raw_spin_lock_irq(&worker->lock);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /*
         * kthread_insert_work() inserts before the given position: the
         * flush item goes right behind @work when @work is on a list, or
         * to the head of work_list when @work is the one being executed.
         */
        if (!list_empty(&work->node))
                kthread_insert_work(worker, &fwork.work, work->node.next);
        else if (worker->current_work == work)
                kthread_insert_work(worker, &fwork.work,
                                    worker->work_list.next);
        else
                noop = true;

        raw_spin_unlock_irq(&worker->lock);

        /* fwork lives on this stack, so wait here until the worker is done with it. */
        if (!noop)
                wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_work);

/*
 * Make sure that the timer is neither set nor running and could
 * not manipulate the work list_head any longer.
 *
 * The function is called under worker->lock. The lock is temporarily
 * released but the timer can't be set again in the meantime.
 *
 * @flags points to the caller's saved IRQ flags; it is used to drop the
 * lock and is updated when the lock is taken again.
 */
static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
                                              unsigned long *flags)
{
        struct kthread_delayed_work *dwork =
                container_of(work, struct kthread_delayed_work, work);
        struct kthread_worker *worker = work->worker;

        /*
         * timer_delete_sync() must be called to make sure that the timer
         * callback is not running. The lock must be temporarily released
         * to avoid a deadlock with the callback. In the meantime,
         * any queuing is blocked by setting the canceling counter.
         */
        work->canceling++;
        raw_spin_unlock_irqrestore(&worker->lock, *flags);
        timer_delete_sync(&dwork->timer);
        raw_spin_lock_irqsave(&worker->lock, *flags);
        work->canceling--;
}

/*
 * Take @work off whichever worker list it is on.
 *
 * Called under worker->lock. The caller must make sure that the timer
 * used by delayed work is not running, e.g. by calling
 * kthread_cancel_delayed_work_timer().
 *
 * The work might still be in use when this function finishes; see
 * current_work, which is the work being processed by the worker.
 *
 * Return: %true if @work was pending and successfully canceled,
 *        %false if @work was not pending
 */
static bool __kthread_cancel_work(struct kthread_work *work)
{
        /* Not on worker->work_list nor worker->delayed_work_list. */
        if (list_empty(&work->node))
                return false;

        list_del_init(&work->node);
        return true;
}

/**
 * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
 * @worker: kthread worker to use
 * @dwork: kthread delayed work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
 * modify @dwork's timer so that it expires after @delay. If @delay is zero,
 * @work is guaranteed to be queued immediately.
 *
 * Return: %false if @dwork was idle and queued, %true otherwise.
 *
 * A special case is when the work is being canceled in parallel.
 * It might be caused either by the real kthread_cancel_delayed_work_sync()
 * or yet another kthread_mod_delayed_work() call. We let the other command
 * win and return %true here. The return value can be used for reference
 * counting and the number of queued works stays the same. Anyway, the caller
 * is supposed to synchronize these operations a reasonable way.
 *
 * This function is safe to call from any context including IRQ handler.
 * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
 * for details.
 */
bool kthread_mod_delayed_work(struct kthread_worker *worker,
                              struct kthread_delayed_work *dwork,
                              unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        unsigned long flags;
        int ret = false;

        raw_spin_lock_irqsave(&worker->lock, flags);

        /*
         * A work that was never queued has no worker assigned; there is
         * nothing to cancel, so go straight to queuing and report %false.
         */
        if (work->worker) {
                /* Work must not be used with >1 worker, see kthread_queue_work() */
                WARN_ON_ONCE(work->worker != worker);

                /*
                 * Temporarily cancel the work, but yield to any other
                 * command that is canceling it at the same time (another
                 * mod_delayed_work() or cancel_delayed_work() caller).
                 *
                 * Order matters: the timer is canceled first because that
                 * step may release worker->lock.  Only afterwards, and only
                 * if nobody else is canceling, is the work taken off its
                 * list - at that point it is certain to be queued again, so
                 * the return value stays usable for reference counting.
                 */
                kthread_cancel_delayed_work_timer(work, &flags);
                if (work->canceling) {
                        /* The number of works in the queue does not change. */
                        ret = true;
                        goto unlock;
                }
                ret = __kthread_cancel_work(work);
        }

        __kthread_queue_delayed_work(worker, dwork, delay);
unlock:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);

/*
 * Common implementation of the synchronous cancel helpers.
 *
 * Removes @work from its worker's list (canceling the delayed-work timer
 * first when @is_dwork is set) and, if the worker is currently executing
 * @work, waits for that execution to finish.
 *
 * Return: the result of __kthread_cancel_work(), i.e. %true if @work was
 * on a list and got removed; %false otherwise, including the case where
 * @work has no worker assigned.
 */
static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
{
        struct kthread_worker *worker = work->worker;
        unsigned long flags;
        int ret;

        /* No worker assigned: nothing to cancel or wait for. */
        if (!worker)
                return false;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        if (is_dwork)
                kthread_cancel_delayed_work_timer(work, &flags);

        ret = __kthread_cancel_work(work);

        if (worker->current_work == work) {
                /*
                 * The worker is running this work right now.  Waiting has
                 * to happen with the lock dropped, so raise the canceling
                 * counter for the duration to keep anybody from queuing
                 * the work again in the meantime.
                 */
                work->canceling++;
                raw_spin_unlock_irqrestore(&worker->lock, flags);
                kthread_flush_work(work);
                raw_spin_lock_irqsave(&worker->lock, flags);
                work->canceling--;
        }

        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}

/**
 * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
 * @work: the kthread work to cancel
 *
 * Cancel @work and wait for its execution to finish.  This function
 * can be used even if the work re-queues itself. On return from this
 * function, @work is guaranteed to be not pending or executing on any CPU.
 *
 * kthread_cancel_work_sync(&delayed_work->work) must not be used for
 * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
 *
 * This calls __kthread_cancel_work_sync() with is_dwork == false, so no
 * delayed-work timer is canceled on this path.
 *
 * The caller must ensure that the worker on which @work was last
 * queued can't be destroyed before this function returns.
 *
 * Return: %true if @work was pending, %false otherwise.
 */
bool kthread_cancel_work_sync(struct kthread_work *work)
{
        return __kthread_cancel_work_sync(work, false);
}
EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);

/**
 * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
 *        wait for it to finish.
 * @dwork: the kthread delayed work to cancel
 *
 * This is kthread_cancel_work_sync() for delayed works.
 *
 * It calls __kthread_cancel_work_sync() on the embedded &dwork->work with
 * is_dwork == true, so the delayed-work timer is canceled before the work
 * is removed from its list.
 *
 * Return: %true if @dwork was pending, %false otherwise.
 */
bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
{
        return __kthread_cancel_work_sync(&dwork->work, true);
}
EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);

/**
 * kthread_flush_worker - flush all current works on a kthread_worker
 * @worker: worker to flush
 *
 * Wait until all currently executing or pending works on @worker are
 * finished.
 *
 * Implemented by queuing a private flush work on @worker and blocking
 * until its completion is signalled.
 */
void kthread_flush_worker(struct kthread_worker *worker)
{
        /*
         * The flush work lives on this function's stack; this function does
         * not return until wait_for_completion() below has returned.
         */
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };

        kthread_queue_work(worker, &fwork.work);
        /*
         * NOTE(review): presumably kthread_flush_work_fn() completes
         * fwork.done once the worker gets to it - it is defined outside
         * this part of the file, confirm there.
         */
        wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_worker);

/**
 * kthread_destroy_worker - destroy a kthread worker
 * @worker: worker to be destroyed
 *
 * Flush and destroy @worker.  The simple flush is enough because the kthread
 * worker API is used only in trivial scenarios.  There are no multi-step state
 * machines needed.
 *
 * Note that this function is not responsible for handling delayed work, so
 * the caller is responsible for queuing or canceling all delayed work items
 * before invoking this function.
 */
void kthread_destroy_worker(struct kthread_worker *worker)
{
        struct task_struct *task = worker->task;

        /* A worker without a task cannot be flushed or stopped; just warn. */
        if (WARN_ON(!task))
                return;

        /* Let everything that is already queued run, then stop the thread. */
        kthread_flush_worker(worker);
        kthread_stop(task);

        /* Both lists are expected to be empty by now; complain if not. */
        WARN_ON(!list_empty(&worker->delayed_work_list));
        WARN_ON(!list_empty(&worker->work_list));

        kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);

/**
 * kthread_use_mm - make the calling kthread operate on an address space
 * @mm: address space to operate on
 *
 * Takes a reference on @mm with mmgrab(), installs @mm as both current->mm
 * and current->active_mm, switches to it, and finally drops the lazy-TLB
 * reference on the previous active_mm.  Warns once if current is not a
 * kthread (PF_KTHREAD clear) or already has an mm.
 */
void kthread_use_mm(struct mm_struct *mm)
{
        struct mm_struct *active_mm;
        struct task_struct *tsk = current;

        /* Sanity checks only: execution continues even if they trigger. */
        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(tsk->mm);

        /*
         * It is possible for mm to be the same as tsk->active_mm, but
         * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
         * because these references are not equivalent.
         */
        mmgrab(mm);

        task_lock(tsk);
        /* Hold off tlb flush IPIs while switching mm's */
        local_irq_disable();
        /* Remember the old active_mm; its lazy-TLB reference is dropped below. */
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
        membarrier_update_current_mm(mm);
        switch_mm_irqs_off(active_mm, mm, tsk);
        local_irq_enable();
        task_unlock(tsk);
        /* Optional arch hook, only invoked where the architecture defines it. */
#ifdef finish_arch_post_lock_switch
        finish_arch_post_lock_switch();
#endif

        /*
         * When a kthread starts operating on an address space, the loop
         * in membarrier_{private,global}_expedited() may not observe
         * that tsk->mm, and not issue an IPI. Membarrier requires a
         * memory barrier after storing to tsk->mm, before accessing
         * user-space memory. A full memory barrier for membarrier
         * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
         * mmdrop_lazy_tlb().
         */
        mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);

/**
 * kthread_unuse_mm - reverse the effect of kthread_use_mm()
 * @mm: address space to operate on
 *
 * Clears current->mm, takes a lazy-TLB reference on @mm (current->active_mm
 * keeps pointing at it), enters lazy-TLB mode and then drops a reference on
 * @mm with mmdrop(), which pairs with the mmgrab() in kthread_use_mm().
 * Warns once if current is not a kthread or has no mm set.
 */
void kthread_unuse_mm(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        /* Sanity checks only: execution continues even if they trigger. */
        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(!tsk->mm);

        task_lock(tsk);
        /*
         * When a kthread stops operating on an address space, the loop
         * in membarrier_{private,global}_expedited() may not observe
         * that tsk->mm, and not issue an IPI. Membarrier requires a
         * memory barrier after accessing user-space memory, before
         * clearing tsk->mm.
         */
        smp_mb__after_spinlock();
        local_irq_disable();
        tsk->mm = NULL;
        membarrier_update_current_mm(NULL);
        /* The lazy-TLB reference backs the active_mm pointer that stays set. */
        mmgrab_lazy_tlb(mm);
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
        local_irq_enable();
        task_unlock(tsk);

        mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);

#ifdef CONFIG_BLK_CGROUP
/**
 * kthread_associate_blkcg - associate blkcg to current kthread
 * @css: the cgroup info
 *
 * Current thread must be a kthread. The thread is running jobs on behalf of
 * other threads. In some cases, we expect the jobs attach cgroup info of
 * original threads instead of that of current thread. This function stores
 * original thread's cgroup info in current kthread context for later
 * retrieval.
 */
void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *old;
        struct kthread *kthread = NULL;

        /* Only kernel threads carry a struct kthread to store the css in. */
        if (current->flags & PF_KTHREAD)
                kthread = to_kthread(current);
        if (!kthread)
                return;

        /* Release whatever css was associated before. */
        old = kthread->blkcg_css;
        if (old) {
                css_put(old);
                kthread->blkcg_css = NULL;
        }

        /* A NULL @css just clears the association. */
        if (!css)
                return;

        /* Hold a reference for as long as the css stays associated. */
        css_get(css);
        kthread->blkcg_css = css;
}
EXPORT_SYMBOL(kthread_associate_blkcg);

/**
 * kthread_blkcg - get associated blkcg css of current kthread
 *
 * Current thread must be a kthread.
 */
struct cgroup_subsys_state *kthread_blkcg(void)
{
        struct kthread *kthread;

        /* Non-kthreads never have an associated css. */
        if (!(current->flags & PF_KTHREAD))
                return NULL;

        kthread = to_kthread(current);
        if (!kthread)
                return NULL;

        return kthread->blkcg_css;
}
#endif





























































































   12 



   13 
   13 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   13 






   13 
   13 
   13 

   13 












   13 



   13 


   12 

   13 







   13 


   13 

   13 






   13 



   13 


   13 

   13 











   13 


   13 


   13 






   13 


   13 









   13 
   13 






   13 




   13 
   13 


   12 

   13 

























   13 


   12 














   13 



















   13 




   13 
   13 

   12 

   13 




   13 
   13 















   13 





































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Linux INET6 implementation
 *        Forwarding Information Database
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *
 *        Changes:
 *        Yuji SEKIYA @USAGI:        Support default route on router node;
 *                                remove ip6_null_entry from the top of
 *                                routing table.
 *        Ville Nuorvala:                Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>

#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>

static struct kmem_cache *fib6_node_kmem __read_mostly;

/*
 * A fib6_walker bundled with a per-route callback and its context.
 *
 * NOTE(review): the code that fills in and consumes these fields is outside
 * this part of the file; the field notes below only restate what the
 * declarations show - confirm the semantics against the users.
 */
struct fib6_cleaner {
        /* Embedded walker; first member of the structure. */
        struct fib6_walker w;
        struct net *net;
        /* Callback taking a route entry and the opaque @arg below. */
        int (*func)(struct fib6_info *, void *arg);
        int sernum;
        /* Opaque pointer, presumably handed to @func - TODO confirm. */
        void *arg;
        bool skip_notify;
};

#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
#define FWS_INIT FWS_L
#endif

static struct fib6_info *fib6_find_prefix(struct net *net,
                                         struct fib6_table *table,
                                         struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net,
                                          struct fib6_table *table,
                                          struct fib6_node *fn);
static int fib6_walk(struct net *net, struct fib6_walker *w);
static int fib6_walk_continue(struct fib6_walker *w);

/*
 *        A routing update causes an increase of the serial number on the
 *        affected subtree. This allows for cached routes to be asynchronously
 *        tested when modifications are made to the destination cache as a
 *        result of redirects, path MTU changes, etc.
 */

static void fib6_gc_timer_cb(struct timer_list *t);

#define FOR_WALKERS(net, w) \
        list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)

/*
 * Add walker @w at the head of @net's list of FIB6 walkers.  The list is
 * modified under the write side of fib6_walker_lock with bottom halves
 * disabled.
 */
static void fib6_walker_link(struct net *net, struct fib6_walker *w)
{
        write_lock_bh(&net->ipv6.fib6_walker_lock);
        list_add(&w->lh, &net->ipv6.fib6_walkers);
        write_unlock_bh(&net->ipv6.fib6_walker_lock);
}

/*
 * Remove walker @w from @net's list of FIB6 walkers, under the write side
 * of fib6_walker_lock with bottom halves disabled.  @w must currently be
 * on the list: list_del() is applied unconditionally.
 */
static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
{
        write_lock_bh(&net->ipv6.fib6_walker_lock);
        list_del(&w->lh);
        write_unlock_bh(&net->ipv6.fib6_walker_lock);
}

/*
 * Atomically advance @net's FIB6 serial number and return the new value.
 * The counter is incremented by one, except that INT_MAX wraps to 1
 * rather than overflowing.
 */
static int fib6_new_sernum(struct net *net)
{
        int old = atomic_read(&net->ipv6.fib6_sernum);
        int new;

        /*
         * On a failed exchange atomic_try_cmpxchg() refreshes 'old' with
         * the current counter value, so the successor is recomputed on
         * every attempt.
         */
        do {
                if (old < INT_MAX)
                        new = old + 1;
                else
                        new = 1;
        } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));

        return new;
}

/*
 * Serial number value 0.  fib6_new_sernum() wraps from INT_MAX back to 1,
 * so a counter that starts out positive never yields this value.
 * NOTE(review): presumably used as a "leave the serial number alone"
 * marker - the users are outside this part of the file, confirm there.
 */
enum {
        FIB6_NO_SERNUM_CHANGE = 0,
};

/*
 * Stamp the tree node that holds route @f6i with a freshly allocated
 * serial number.  Does nothing when the route is not attached to a node.
 * The lockdep annotation expects the route's table lock (tb6_lock) to be
 * held by the caller.
 */
void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
        struct fib6_node *fn = rcu_dereference_protected(f6i->fib6_node,
                        lockdep_is_held(&f6i->fib6_table->tb6_lock));

        if (!fn)
                return;

        WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
}

/*
 *        Auxiliary address test functions for the radix tree.
 *
 *        These assume a 32bit processor (although it will work on
 *        64bit processors)
 */

/*
 *        test bit
 */
#if defined(__LITTLE_ENDIAN)
# define BITOP_BE32_SWIZZLE        (0x1F & ~7)
#else
# define BITOP_BE32_SWIZZLE        0
#endif

/*
 * Test one bit of a big-endian address.
 *
 * @token:  address, accessed as an array of __be32 words
 * @fn_bit: bit index; bits are counted from the most significant bit of
 *          the address, 32 per word (word index is fn_bit >> 5)
 *
 * Return: non-zero (the masked bit, still in big-endian layout) if the
 * bit is set, zero otherwise.
 */
static __be32 addr_bit_set(const void *token, int fn_bit)
{
        const __be32 *addr = token;
        /*
         * Here,
         *        1U << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
         * is optimized version of
         *        htonl(1U << ((~fn_bit)&0x1F))
         * See include/asm-generic/bitops/le.h.
         *
         * The constant is unsigned because the shift count can be 31, and
         * shifting a signed 1 into the sign bit is undefined in ISO C.
         */
        return (__force __be32)(1U << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
               addr[fn_bit >> 5];
}

/*
 * Allocate a zeroed fib6_info.
 *
 * @gfp_flags:    allocation flags passed to kzalloc()
 * @with_fib6_nh: when true, the allocation is enlarged by
 *                sizeof(struct fib6_nh)
 *
 * Return: the new entry with its reference count set to 1 and its list
 * linkage initialized, or NULL if the allocation failed.
 */
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
        size_t extra = with_fib6_nh ? sizeof(struct fib6_nh) : 0;
        struct fib6_info *f6i;

        f6i = kzalloc(sizeof(*f6i) + extra, gfp_flags);
        if (f6i) {
                /* fib6_siblings is a union with nh_list, so this initializes both */
                INIT_LIST_HEAD(&f6i->fib6_siblings);
                refcount_set(&f6i->fib6_ref, 1);
                INIT_HLIST_NODE(&f6i->gc_link);
        }

        return f6i;
}

/*
 * RCU callback that tears down a fib6_info.
 *
 * @head: the rcu_head embedded in the entry being destroyed
 *
 * Warns if the entry is still attached to a tree node, releases its
 * nexthop state and metrics, and frees the entry.
 */
void fib6_info_destroy_rcu(struct rcu_head *head)
{
        struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);

        WARN_ON(f6i->fib6_node);

        /*
         * Without a nexthop object the entry uses its own fib6_nh, which
         * is released here; otherwise drop the reference on the nexthop
         * object.
         */
        if (!f6i->nh)
                fib6_nh_release(f6i->fib6_nh);
        else
                nexthop_put(f6i->nh);

        ip_fib_metrics_put(f6i->fib6_metrics);
        kfree(f6i);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);

static struct fib6_node *node_alloc(struct net *net)
{
        struct fib6_node *fn;

        fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
        if (fn)
                net->ipv6.rt6_stats->fib_nodes++;

        return fn;
}

/*
 * Return @fn to the fib6_node slab cache right away, without going through
 * RCU (compare node_free()), and drop it from @net's fib_nodes statistic.
 */
static void node_free_immediate(struct net *net, struct fib6_node *fn)
{
        kmem_cache_free(fib6_node_kmem, fn);
        net->ipv6.rt6_stats->fib_nodes--;
}

/*
 * Hand @fn to kfree_rcu() for deferred freeing (using its 'rcu' member)
 * and drop it from @net's fib_nodes statistic.  The counter is decremented
 * immediately, not when the memory is actually released.
 */
static void node_free(struct net *net, struct fib6_node *fn)
{
        kfree_rcu(fn, rcu);
        net->ipv6.rt6_stats->fib_nodes--;
}

/*
 * Destroy @table: invalidate its inet_peer tree (tb6_peers), then free the
 * table structure itself.
 */
static void fib6_free_table(struct fib6_table *table)
{
        inetpeer_invalidate_tree(&table->tb6_peers);
        kfree(table);
}

/*
 * Initialize @tb's lock and publish the table in @net's table hash, in the
 * bucket selected by the low bits of its id.
 */
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
        unsigned int h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);

        /*
         * The table lock is initialized in this one place so that lockdep
         * gets a single key for it; a table is not visible to anybody
         * before it has been linked to the list.
         */
        spin_lock_init(&tb->tb6_lock);

        /*
         * No protection necessary, this is the only list mutation
         * operation, tables never disappear once they exist.
         */
        hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
}

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Allocate and initialize (but do not link) a routing table with id @id.
 *
 * The root node's leaf is set to the netns null entry and flagged as a
 * top-level root carrying route info; the peer base and GC list are
 * initialized empty.
 *
 * Returns the table, or NULL if the GFP_ATOMIC allocation failed.
 */
static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
{
        struct fib6_table *tbl = kzalloc(sizeof(*tbl), GFP_ATOMIC);

        if (!tbl)
                return NULL;

        tbl->tb6_id = id;
        rcu_assign_pointer(tbl->tb6_root.leaf, net->ipv6.fib6_null_entry);
        tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&tbl->tb6_peers);
        INIT_HLIST_HEAD(&tbl->tb6_gc_hlist);

        return tbl;
}

/* Return the table with id @id, creating and linking it if it does not
 * exist yet. An @id of 0 selects RT6_TABLE_MAIN.
 *
 * The new table is allocated before taking fib_table_hash_lock; the
 * lookup is then repeated under the lock, and if another context linked
 * the same id in the meantime the fresh allocation is discarded in favour
 * of the existing table.
 *
 * Returns the table, or NULL if allocation failed.
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
        struct fib6_table *existing, *fresh;

        if (!id)
                id = RT6_TABLE_MAIN;

        existing = fib6_get_table(net, id);
        if (existing)
                return existing;

        fresh = fib6_alloc_table(net, id);
        if (!fresh)
                return NULL;

        spin_lock_bh(&net->ipv6.fib_table_hash_lock);

        /* Re-check under the lock: somebody may have beaten us to it. */
        existing = fib6_get_table(net, id);
        if (!existing)
                fib6_link_table(net, fresh);

        spin_unlock_bh(&net->ipv6.fib_table_hash_lock);

        if (unlikely(existing)) {
                kfree(fresh);
                return existing;
        }

        return fresh;
}
EXPORT_SYMBOL_GPL(fib6_new_table);

/* Look up the table with id @id in @net's table hash. An @id of 0 selects
 * RT6_TABLE_MAIN.
 *
 * Returns the matching table, or NULL if none is linked under that id.
 */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
        u32 wanted = id ? id : RT6_TABLE_MAIN;
        struct hlist_head *bucket;
        struct fib6_table *entry;

        bucket = &net->ipv6.fib_table_hash[wanted & (FIB6_TABLE_HASHSZ - 1)];

        /* See comment in fib6_link_table().  RCU is not required,
         * but rcu_dereference_raw() is used to avoid data-race.
         */
        hlist_for_each_entry_rcu(entry, bucket, tb6_hlist, true) {
                if (entry->tb6_id == wanted)
                        return entry;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(fib6_get_table);

/* Multiple-tables build: link the netns main table, then its local table,
 * into the table hash.
 */
static void __net_init fib6_tables_init(struct net *net)
{
        struct fib6_table *main_tbl = net->ipv6.fib6_main_tbl;
        struct fib6_table *local_tbl = net->ipv6.fib6_local_tbl;

        fib6_link_table(net, main_tbl);
        fib6_link_table(net, local_tbl);
}
#else

/* Single-table build: nothing is ever created, so this simply defers to
 * fib6_get_table().
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
        struct fib6_table *tb = fib6_get_table(net, id);

        return tb;
}

/* Single-table build: @id is ignored and the netns main table is always
 * returned.
 */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
        struct fib6_table *main_tbl = net->ipv6.fib6_main_tbl;

        return main_tbl;
}

/* Single-table build: run @lookup against the main table.
 *
 * If the result carries dst.error == -EAGAIN it is put back and replaced
 * by the netns ip6_null_entry; a reference on the null entry is taken
 * unless the caller asked for RT6_LOOKUP_F_DST_NOREF.
 *
 * Returns the dst embedded in the selected route.
 */
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb,
                                   int flags, pol_lookup_t lookup)
{
        struct fib6_table *main_tbl = net->ipv6.fib6_main_tbl;
        struct rt6_info *route;

        route = pol_lookup_func(lookup, net, main_tbl, fl6, skb, flags);
        if (route->dst.error != -EAGAIN)
                return &route->dst;

        ip6_rt_put_flags(route, flags);

        route = net->ipv6.ip6_null_entry;
        if (!(flags & RT6_LOOKUP_F_DST_NOREF))
                dst_hold(&route->dst);

        return &route->dst;
}

/* Single-table build: look @fl6 up in the main table and fill in @res.
 *
 * Called with rcu lock held; no reference is taken on the fib6_info.
 */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
                struct fib6_result *res, int flags)
{
        struct fib6_table *main_tbl = net->ipv6.fib6_main_tbl;

        return fib6_table_lookup(net, main_tbl, oif, fl6, res, flags);
}

/* Single-table build: only the netns main table is linked into the hash. */
static void __net_init fib6_tables_init(struct net *net)
{
        struct fib6_table *main_tbl = net->ipv6.fib6_main_tbl;

        fib6_link_table(net, main_tbl);
}

#endif

unsigned int fib6_tables_seq_read(const struct net *net)
{
        unsigned int h, fib_seq = 0;

        rcu_read_lock();
        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                const struct hlist_head *head = &net->ipv6.fib_table_hash[h];
                const struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, head, tb6_hlist)
                        fib_seq += READ_ONCE(tb->fib_seq);
        }
        rcu_read_unlock();

        return fib_seq;
}

static int call_fib6_entry_notifier(struct notifier_block *nb,
                                    enum fib_event_type event_type,
                                    struct fib6_info *rt,
                                    struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
        };

        return call_fib6_notifier(nb, event_type, &info.info);
}

static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
                                              enum fib_event_type event_type,
                                              struct fib6_info *rt,
                                              unsigned int nsiblings,
                                              struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
                .nsiblings = nsiblings,
        };

        return call_fib6_notifier(nb, event_type, &info.info);
}

int call_fib6_entry_notifiers(struct net *net,
                              enum fib_event_type event_type,
                              struct fib6_info *rt,
                              struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
        };

        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, event_type, &info.info);
}

int call_fib6_multipath_entry_notifiers(struct net *net,
                                        enum fib_event_type event_type,
                                        struct fib6_info *rt,
                                        unsigned int nsiblings,
                                        struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
                .nsiblings = nsiblings,
        };

        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, event_type, &info.info);
}

int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
{
        struct fib6_entry_notifier_info info = {
                .rt = rt,
                .nsiblings = rt->fib6_nsiblings,
        };

        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
}

/* Context for the notifier replay walk: fib6_tables_dump() fills one in
 * and hands it to the walker callback through fib6_walker::args.
 */
struct fib6_dump_arg {
        /* netns being dumped; fib6_rt_dump() uses it to skip the null entry */
        struct net *net;
        /* notifier block that receives the replayed entries */
        struct notifier_block *nb;
        /* extack forwarded to each notifier call */
        struct netlink_ext_ack *extack;
};

/* Replay one route to the notifier block in @arg as a
 * FIB_EVENT_ENTRY_REPLACE.
 *
 * A NULL @rt and the netns null entry are silently skipped. The sibling
 * count is sampled once with READ_ONCE() and selects between the
 * multipath and the single-route notifier helper.
 *
 * Returns 0 when skipped, otherwise the notifier helper's result.
 */
static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
{
        unsigned int sibling_cnt;

        if (!rt)
                return 0;
        if (rt == arg->net->ipv6.fib6_null_entry)
                return 0;

        sibling_cnt = READ_ONCE(rt->fib6_nsiblings);
        if (!sibling_cnt)
                return call_fib6_entry_notifier(arg->nb,
                                                FIB_EVENT_ENTRY_REPLACE,
                                                rt, arg->extack);

        return call_fib6_multipath_entry_notifier(arg->nb,
                                                  FIB_EVENT_ENTRY_REPLACE,
                                                  rt, sibling_cnt,
                                                  arg->extack);
}

/* Walker callback for the notifier replay: dump the walker's current leaf
 * and clear it afterwards, whatever the outcome.
 */
static int fib6_node_dump(struct fib6_walker *w)
{
        int ret = fib6_rt_dump(w->leaf, w->args);

        w->leaf = NULL;

        return ret;
}

/* Walk the whole tree of table @tb with walker @w, holding the table lock
 * (BH disabled) for the duration of the walk.
 *
 * Returns the result of fib6_walk().
 */
static int fib6_table_dump(struct net *net, struct fib6_table *tb,
                           struct fib6_walker *w)
{
        int ret;

        w->root = &tb->tb6_root;

        spin_lock_bh(&tb->tb6_lock);
        ret = fib6_walk(net, w);
        spin_unlock_bh(&tb->tb6_lock);

        return ret;
}

/* Called with rcu_read_lock() */
int fib6_tables_dump(struct net *net, struct notifier_block *nb,
                     struct netlink_ext_ack *extack)
{
        struct fib6_dump_arg arg;
        struct fib6_walker *w;
        unsigned int h;
        int err = 0;

        w = kzalloc(sizeof(*w), GFP_ATOMIC);
        if (!w)
                return -ENOMEM;

        w->func = fib6_node_dump;
        arg.net = net;
        arg.nb = nb;
        arg.extack = extack;
        w->args = &arg;

        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                struct hlist_head *head = &net->ipv6.fib_table_hash[h];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
                        err = fib6_table_dump(net, tb, w);
                        if (err)
                                goto out;
                }
        }

out:
        kfree(w);

        /* The tree traversal function should never return a positive value. */
        return err > 0 ? -EINVAL : err;
}

/* Walker callback for the netlink route dump: emit the routes hanging off
 * the walker's current node.
 *
 * Returns 1 when the dump has to be suspended (the walker's leaf and
 * skip_in_node record where to resume), or 0 once every route of the node
 * has been handled, in which case the walker's leaf is cleared.
 */
static int fib6_dump_node(struct fib6_walker *w)
{
        int res;
        struct fib6_info *rt;

        /* NOTE(review): for_each_fib6_walker_rt() is defined outside this
         * chunk; it evidently iterates through the local 'rt' - confirm
         * before renaming either variable.
         */
        for_each_fib6_walker_rt(w) {
                res = rt6_dump_route(rt, w->args, w->skip_in_node);
                if (res >= 0) {
                        /* Frame is full, suspend walking */
                        w->leaf = rt;

                        /* We'll restart from this node, so if some routes were
                         * already dumped, skip them next time.
                         */
                        w->skip_in_node += res;

                        return 1;
                }
                /* This route was handled in full; nothing to skip for the next one. */
                w->skip_in_node = 0;

                /* Multipath routes are dumped in one route with the
                 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
                 * last sibling of this route (no need to dump the
                 * sibling routes again)
                 */
                if (rt->fib6_nsiblings)
                        rt = list_last_entry(&rt->fib6_siblings,
                                             struct fib6_info,
                                             fib6_siblings);
        }
        w->leaf = NULL;
        return 0;
}

/* Tear down the per-dump state kept in @cb.
 *
 * If a walker is stored in cb->args[2] it is unlinked (only when
 * cb->args[4] marks a walk as in progress) and freed. The original
 * ->done handler saved in cb->args[3] is then restored and cb->args[1]
 * is set to 3.
 */
static void fib6_dump_end(struct netlink_callback *cb)
{
        struct fib6_walker *walker = (void *)cb->args[2];
        struct net *net = sock_net(cb->skb->sk);

        if (walker) {
                if (cb->args[4]) {
                        cb->args[4] = 0;
                        fib6_walker_unlink(net, walker);
                }

                cb->args[2] = 0;
                kfree(walker);
        }

        cb->done = (void *)cb->args[3];
        cb->args[1] = 3;
}

/* ->done handler installed by inet6_dump_fib(): clean up the dump state,
 * which also restores the previous ->done, then chain to that handler if
 * there is one.
 */
static int fib6_dump_done(struct netlink_callback *cb)
{
        fib6_dump_end(cb);

        if (!cb->done)
                return 0;

        return cb->done(cb);
}

/* Dump (or continue dumping) one table into @skb.
 *
 * State kept in @cb across invocations, as used here:
 *   args[2] - the walker;
 *   args[4] - non-zero while a walk of this table is suspended;
 *   args[5] - root fn_sernum sampled when the walk was last (re)started.
 *
 * Returns the walk result: > 0 when the walk was suspended and must be
 * resumed, <= 0 when it finished or failed.
 */
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
                           struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct fib6_walker *w;
        int res;

        w = (void *)cb->args[2];
        w->root = &table->tb6_root;

        if (cb->args[4] == 0) {
                /* Fresh walk of this table: reset the progress counters. */
                w->count = 0;
                w->skip = 0;
                w->skip_in_node = 0;

                spin_lock_bh(&table->tb6_lock);
                res = fib6_walk(net, w);
                spin_unlock_bh(&table->tb6_lock);
                if (res > 0) {
                        /* Suspended: remember that, and the tree version seen. */
                        cb->args[4] = 1;
                        cb->args[5] = READ_ONCE(w->root->fn_sernum);
                }
        } else {
                int sernum = READ_ONCE(w->root->fn_sernum);
                if (cb->args[5] != sernum) {
                        /* Begin at the root if the tree changed */
                        cb->args[5] = sernum;
                        w->state = FWS_INIT;
                        w->node = w->root;
                        /* Skip as many entries as were counted so far. */
                        w->skip = w->count;
                        w->skip_in_node = 0;
                } else
                        w->skip = 0;

                spin_lock_bh(&table->tb6_lock);
                res = fib6_walk_continue(w);
                spin_unlock_bh(&table->tb6_lock);
                if (res <= 0) {
                        /* Walk over (done or error): detach the walker. */
                        fib6_walker_unlink(net, w);
                        cb->args[4] = 0;
                }
        }

        return res;
}

/* Netlink dump handler for IPv6 routes.
 *
 * Runs inside an RCU read-side section. Resume state lives in @cb->args:
 *   args[0] - hash bucket to resume from; in single-table (filtered) mode
 *             it is set to 1 once that table has been dumped completely;
 *   args[1] - index of the table within the bucket to resume from;
 *   args[2] - the walker, allocated on the first invocation;
 *   args[3] - the ->done handler that was installed before ours;
 *   args[4], args[5] - owned by fib6_dump_table().
 *
 * Returns a positive value while more data remains (as reported by
 * fib6_dump_table()), otherwise 0 or a negative error; in the latter two
 * cases the dump state is torn down via fib6_dump_end().
 */
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rt6_rtnl_dump_arg arg = {
                .filter.dump_exceptions = true,
                .filter.dump_routes = true,
                .filter.rtnl_held = false,
        };
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        unsigned int e = 0, s_e;
        struct hlist_head *head;
        struct fib6_walker *w;
        struct fib6_table *tb;
        unsigned int h, s_h;
        int err = 0;

        rcu_read_lock();
        if (cb->strict_check) {
                err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
                if (err < 0)
                        goto unlock;
        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
                /* Non-strict request: only RTM_F_PREFIX is taken from the header. */
                struct rtmsg *rtm = nlmsg_data(nlh);

                if (rtm->rtm_flags & RTM_F_PREFIX)
                        arg.filter.flags = RTM_F_PREFIX;
        }

        w = (void *)cb->args[2];
        if (!w) {
                /* New dump:
                 *
                 * 1. allocate and initialize walker.
                 */
                w = kzalloc(sizeof(*w), GFP_ATOMIC);
                if (!w) {
                        err = -ENOMEM;
                        goto unlock;
                }
                w->func = fib6_dump_node;
                cb->args[2] = (long)w;

                /* 2. hook callback destructor.
                 */
                cb->args[3] = (long)cb->done;
                cb->done = fib6_dump_done;

        }

        /* 'arg' lives on this stack frame, so the walker is re-pointed at
         * it on every invocation.
         */
        arg.skb = skb;
        arg.cb = cb;
        arg.net = net;
        w->args = &arg;

        if (arg.filter.table_id) {
                /* Filtered dump: exactly one table is of interest. */
                tb = fib6_get_table(net, arg.filter.table_id);
                if (!tb) {
                        /* Request family is not PF_INET6: end the dump
                         * quietly, err is still 0 here.
                         */
                        if (rtnl_msg_family(cb->nlh) != PF_INET6)
                                goto unlock;

                        NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
                        err = -ENOENT;
                        goto unlock;
                }

                if (!cb->args[0]) {
                        err = fib6_dump_table(tb, skb, cb);
                        if (!err)
                                cb->args[0] = 1;
                }
                goto unlock;
        }

        s_h = cb->args[0];
        s_e = cb->args[1];

        /* s_e only applies to the first bucket visited; it is reset to 0
         * when moving on to the next one.
         */
        for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
                e = 0;
                head = &net->ipv6.fib_table_hash[h];
                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
                        if (e < s_e)
                                goto next;
                        err = fib6_dump_table(tb, skb, cb);
                        if (err != 0)
                                goto out;
next:
                        e++;
                }
        }
out:
        /* Record where to pick up on the next invocation. */
        cb->args[1] = e;
        cb->args[0] = h;

unlock:
        rcu_read_unlock();
        if (err <= 0)
                fib6_dump_end(cb);
        return err;
}

/* Store @val as metric number @metric (1-based) of @f6i.
 *
 * While the entry still points at the shared dst_default_metrics, a
 * zeroed private dst_metrics with a reference count of 1 is allocated
 * (GFP_ATOMIC) and installed first. A NULL @f6i or a failed allocation
 * makes the call a silent no-op.
 */
void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
{
        bool uses_default;

        if (!f6i)
                return;

        uses_default = (f6i->fib6_metrics == &dst_default_metrics);
        if (uses_default) {
                struct dst_metrics *own;

                own = kzalloc(sizeof(*own), GFP_ATOMIC);
                if (!own)
                        return;

                refcount_set(&own->refcnt, 1);
                f6i->fib6_metrics = own;
        }

        f6i->fib6_metrics->metrics[metric - 1] = val;
}

/*
 *        Routing Table
 *
 *        return the appropriate node for a routing tree "add" operation
 *        by either creating and inserting or by returning an existing
 *        node.
 *
 *        @root:             node the descent starts from
 *        @addr, @plen:      prefix being added and its length in bits
 *        @offset:           byte offset of the struct rt6key to compare
 *                           against inside each node's leaf fib6_info
 *        @allow_create:     when zero, creating a node only triggers a
 *                           warning, or fails if @replace_required is set
 *        @replace_required: when set together with !@allow_create, a
 *                           missing match is an error
 *
 *        The rcu_dereference_protected() calls below assert that
 *        table->tb6_lock is held.
 *
 *        Returns the node for the prefix, ERR_PTR(-ENOENT) when a
 *        replace was required but nothing matched, or ERR_PTR(-ENOMEM)
 *        when a node could not be allocated.
 */

static struct fib6_node *fib6_add_1(struct net *net,
                                    struct fib6_table *table,
                                    struct fib6_node *root,
                                    struct in6_addr *addr, int plen,
                                    int offset, int allow_create,
                                    int replace_required,
                                    struct netlink_ext_ack *extack)
{
        struct fib6_node *fn, *in, *ln;
        struct fib6_node *pn = NULL;
        struct rt6key *key;
        int        bit;
        __be32        dir = 0;

        /* insert node in tree */

        fn = root;

        do {
                struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                /* Locate the key to compare inside this node's leaf. */
                key = (struct rt6key *)((u8 *)leaf + offset);

                /*
                 *        Prefix match
                 */
                if (plen < fn->fn_bit ||
                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
                        if (!allow_create) {
                                if (replace_required) {
                                        NL_SET_ERR_MSG(extack,
                                                       "Can not replace route - no match found");
                                        pr_warn("Can't replace route, no match found\n");
                                        return ERR_PTR(-ENOENT);
                                }
                                /* Warn only; the node is created regardless. */
                                pr_warn("NLM_F_CREATE should be set when creating new route\n");
                        }
                        goto insert_above;
                }

                /*
                 *        Exact match ?
                 */

                if (plen == fn->fn_bit) {
                        /* clean up an intermediate node */
                        if (!(fn->fn_flags & RTN_RTINFO)) {
                                RCU_INIT_POINTER(fn->leaf, NULL);
                                fib6_info_release(leaf);
                        /* remove null_entry in the root node */
                        } else if (fn->fn_flags & RTN_TL_ROOT &&
                                   rcu_access_pointer(fn->leaf) ==
                                   net->ipv6.fib6_null_entry) {
                                RCU_INIT_POINTER(fn->leaf, NULL);
                        }

                        return fn;
                }

                /*
                 *        We have more bits to go
                 */

                /* Try to walk down on tree. */
                dir = addr_bit_set(addr, fn->fn_bit);
                pn = fn;
                fn = dir ?
                     rcu_dereference_protected(fn->right,
                                        lockdep_is_held(&table->tb6_lock)) :
                     rcu_dereference_protected(fn->left,
                                        lockdep_is_held(&table->tb6_lock));
        } while (fn);

        if (!allow_create) {
                /* We should not create new node because
                 * NLM_F_REPLACE was specified without NLM_F_CREATE
                 * I assume it is safe to require NLM_F_CREATE when
                 * REPLACE flag is used! Later we may want to remove the
                 * check for replace_required, because according
                 * to netlink specification, NLM_F_CREATE
                 * MUST be specified if new route is created.
                 * That would keep IPv6 consistent with IPv4
                 */
                if (replace_required) {
                        NL_SET_ERR_MSG(extack,
                                       "Can not replace route - no match found");
                        pr_warn("Can't replace route, no match found\n");
                        return ERR_PTR(-ENOENT);
                }
                pr_warn("NLM_F_CREATE should be set when creating new route\n");
        }
        /*
         *        We walked to the bottom of tree.
         *        Create new leaf node without children.
         */

        ln = node_alloc(net);

        if (!ln)
                return ERR_PTR(-ENOMEM);
        ln->fn_bit = plen;
        RCU_INIT_POINTER(ln->parent, pn);

        /* Publish the new node in the slot the descent fell off from. */
        if (dir)
                rcu_assign_pointer(pn->right, ln);
        else
                rcu_assign_pointer(pn->left, ln);

        return ln;


insert_above:
        /*
         * split since we don't have a common prefix anymore or
         * we have a less significant route.
         * we've to insert an intermediate node on the list
         * this new node will point to the one we need to create
         * and the current
         */

        pn = rcu_dereference_protected(fn->parent,
                                       lockdep_is_held(&table->tb6_lock));

        /* find 1st bit in difference between the 2 addrs.

           See comment in __ipv6_addr_diff: bit may be an invalid value,
           but if it is >= plen, the value is ignored in any case.
         */

        bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));

        /*
         *                (intermediate)[in]
         *                  /           \
         *        (new leaf node)[ln] (old node)[fn]
         */
        if (plen > bit) {
                in = node_alloc(net);
                ln = node_alloc(net);

                /* Both nodes are needed; undo whichever one succeeded. */
                if (!in || !ln) {
                        if (in)
                                node_free_immediate(net, in);
                        if (ln)
                                node_free_immediate(net, ln);
                        return ERR_PTR(-ENOMEM);
                }

                /*
                 * new intermediate node.
                 * RTN_RTINFO will
                 * be off since that an address that chooses one of
                 * the branches would not match less specific routes
                 * in the other branch
                 */

                in->fn_bit = bit;

                RCU_INIT_POINTER(in->parent, pn);
                /* The intermediate node shares fn's leaf and takes its own
                 * reference on it.
                 */
                in->leaf = fn->leaf;
                fib6_info_hold(rcu_dereference_protected(in->leaf,
                                lockdep_is_held(&table->tb6_lock)));

                /* update parent pointer */
                if (dir)
                        rcu_assign_pointer(pn->right, in);
                else
                        rcu_assign_pointer(pn->left, in);

                ln->fn_bit = plen;

                RCU_INIT_POINTER(ln->parent, in);
                rcu_assign_pointer(fn->parent, in);

                /* The new prefix's value at the differing bit picks its side. */
                if (addr_bit_set(addr, bit)) {
                        rcu_assign_pointer(in->right, ln);
                        rcu_assign_pointer(in->left, fn);
                } else {
                        rcu_assign_pointer(in->left, ln);
                        rcu_assign_pointer(in->right, fn);
                }
        } else { /* plen <= bit */

                /*
                 *                (new leaf node)[ln]
                 *                  /           \
                 *             (old node)[fn] NULL
                 */

                ln = node_alloc(net);

                if (!ln)
                        return ERR_PTR(-ENOMEM);

                ln->fn_bit = plen;

                RCU_INIT_POINTER(ln->parent, pn);

                /* The old node goes on the side selected by bit 'plen' of
                 * its own key.
                 */
                if (addr_bit_set(&key->addr, plen))
                        RCU_INIT_POINTER(ln->right, fn);
                else
                        RCU_INIT_POINTER(ln->left, fn);

                rcu_assign_pointer(fn->parent, ln);

                if (dir)
                        rcu_assign_pointer(pn->right, ln);
                else
                        rcu_assign_pointer(pn->left, ln);
        }
        return ln;
}

/* For every possible CPU, look at the per-cpu cached route of @fib6_nh
 * and, if its 'from' pointer refers to @match, clear that pointer and
 * release the reference it held.
 *
 * Does nothing when the nexthop has no per-cpu route storage.
 */
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
                                  const struct fib6_info *match)
{
        int cpu;

        if (!fib6_nh->rt6i_pcpu)
                return;

        rcu_read_lock();
        /* release the reference to this fib entry from
         * all of its cached pcpu routes
         */
        for_each_possible_cpu(cpu) {
                struct rt6_info **ppcpu_rt;
                struct rt6_info *pcpu_rt;

                ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);

                /* Paired with xchg() in rt6_get_pcpu_route() */
                pcpu_rt = READ_ONCE(*ppcpu_rt);

                /* only dropping the 'from' reference if the cached route
                 * is using 'match'. The cached pcpu_rt->from only changes
                 * from a fib6_info to NULL (ip6_dst_destroy); it can never
                 * change from one fib6_info reference to another
                 */
                if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
                        struct fib6_info *from;

                        /* Release whatever xchg() actually returned, not
                         * the value observed by the check above.
                         */
                        from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
                        fib6_info_release(from);
                }
        }
        rcu_read_unlock();
}

/* Per-nexthop callback adapter for nexthop_for_each_fib6_nh(): @_arg is
 * the fib6_info whose per-cpu 'from' references should be dropped from
 * @nh. Always returns 0.
 */
static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
{
        const struct fib6_info *match = _arg;

        __fib6_drop_pcpu_from(nh, match);

        return 0;
}

/* Drop the 'from' references that cached per-cpu routes hold on @f6i.
 *
 * The entry is flagged as being destroyed first, followed by a full
 * barrier. After that either every fib6_nh of the shared nexthop object
 * is visited (under RCU), or just the entry's own fib6_nh.
 */
static void fib6_drop_pcpu_from(struct fib6_info *f6i)
{
        /* Make sure rt6_make_pcpu_route() wont add other percpu routes
         * while we are cleaning them here.
         */
        f6i->fib6_destroying = 1;
        mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */

        if (!f6i->nh) {
                __fib6_drop_pcpu_from(f6i->fib6_nh, f6i);
                return;
        }

        rcu_read_lock();
        nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i);
        rcu_read_unlock();
}

/* Strip the references other objects hold on route @rt.
 *
 * In order: flush its exception table, drop the per-cpu 'from'
 * references, unhook it from its nexthop's list, replace it wherever it
 * serves as the placeholder leaf of an intermediate node between @fn and
 * the root, and finally clear its expiry and remove it from the GC list.
 *
 * The rcu_dereference_protected() calls assert that the table's tb6_lock
 * is held.
 */
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
                          struct net *net)
{
        struct fib6_table *table = rt->fib6_table;

        /* Flush all cached dst in exception table */
        rt6_flush_exceptions(rt);
        fib6_drop_pcpu_from(rt);

        if (rt->nh) {
                spin_lock(&rt->nh->lock);

                /* Unhook from the nexthop's list if still on it. */
                if (!list_empty(&rt->nh_list))
                        list_del_init(&rt->nh_list);

                spin_unlock(&rt->nh->lock);
        }

        if (refcount_read(&rt->fib6_ref) != 1) {
                /* This route is used as dummy address holder in some split
                 * nodes. It is not leaked, but it still holds other resources,
                 * which must be released in time. So, scan ascendant nodes
                 * and replace dummy references to this route with references
                 * to still alive ones.
                 */
                while (fn) {
                        struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                        struct fib6_info *new_leaf;
                        /* Only nodes without RTN_RTINFO use the leaf as a placeholder. */
                        if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
                                new_leaf = fib6_find_prefix(net, table, fn);
                                fib6_info_hold(new_leaf);

                                rcu_assign_pointer(fn->leaf, new_leaf);
                                /* Drop the reference this node held on 'rt'. */
                                fib6_info_release(rt);
                        }
                        fn = rcu_dereference_protected(fn->parent,
                                    lockdep_is_held(&table->tb6_lock));
                }
        }

        fib6_clean_expires(rt);
        fib6_remove_gc_list(rt);
}

/*
 *        Insert routing information in a node.
 *
 *        @fn:         node whose leaf chain receives @rt
 *        @rt:         route being inserted
 *        @info:       request context; the flags in info->nlh (NLM_F_REPLACE,
 *                     NLM_F_CREATE, NLM_F_EXCL, NLM_F_APPEND) select the
 *                     behaviour, and a NULL info->nlh is treated as "create"
 *        @extack:     handed to the fib6 entry notifiers
 *        @purge_list: routes unlinked by a replace are queued here through
 *                     their purge_link; they are not purged in this function
 *
 *        The leaf chain is walked from fn->leaf and the walk stops at the
 *        first entry whose metric is larger than @rt's, so @rt is linked in
 *        front of that entry.
 *
 *        Returns 0 on success, -EEXIST when NLM_F_EXCL is set and an entry
 *        with the same metric exists or when a non-replace request duplicates
 *        an existing nexthop, -ENOENT when NLM_F_REPLACE (without
 *        NLM_F_CREATE) finds nothing to replace, or the error returned by
 *        call_fib6_entry_notifiers().
 *
 *        The rcu_dereference_protected() annotations expect
 *        rt->fib6_table->tb6_lock to be held.
 */

static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
                            struct nl_info *info, struct netlink_ext_ack *extack,
                            struct list_head *purge_list)
{
        struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
        struct fib6_info *iter = NULL;
        struct fib6_info __rcu **ins;
        struct fib6_info __rcu **fallback_ins = NULL;
        int replace = (info->nlh &&
                       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
        int add = (!info->nlh ||
                   (info->nlh->nlmsg_flags & NLM_F_CREATE));
        int found = 0;
        bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
        bool notify_sibling_rt = false;
        u16 nlflags = NLM_F_EXCL;
        int err;

        if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
                nlflags |= NLM_F_APPEND;

        /* ins is the link that will end up pointing at rt */
        ins = &fn->leaf;

        for (iter = leaf; iter;
             iter = rcu_dereference_protected(iter->fib6_next,
                                lockdep_is_held(&rt->fib6_table->tb6_lock))) {
                /*
                 *        Search for duplicates
                 */

                if (iter->fib6_metric == rt->fib6_metric) {
                        /*
                         *        Same priority level
                         */
                        if (info->nlh &&
                            (info->nlh->nlmsg_flags & NLM_F_EXCL))
                                return -EEXIST;

                        nlflags &= ~NLM_F_EXCL;
                        if (replace) {
                                /* Prefer replacing an entry with the same
                                 * ECMP eligibility as rt; otherwise remember
                                 * the first same-metric slot as a fallback.
                                 * The jump skips the metric check below.
                                 */
                                if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
                                        found++;
                                        break;
                                }
                                fallback_ins = fallback_ins ?: ins;
                                goto next_iter;
                        }

                        if (rt6_duplicate_nexthop(iter, rt)) {
                                /* Discard any sibling count accumulated so
                                 * far: rt is not going to be inserted.
                                 */
                                if (rt->fib6_nsiblings)
                                        WRITE_ONCE(rt->fib6_nsiblings, 0);
                                if (!(iter->fib6_flags & RTF_EXPIRES))
                                        return -EEXIST;
                                /* The existing entry expires: refresh its
                                 * expiry state (and MTU metric below) from
                                 * rt before reporting the duplicate.
                                 */
                                if (!(rt->fib6_flags & RTF_EXPIRES)) {
                                        fib6_clean_expires(iter);
                                        fib6_remove_gc_list(iter);
                                } else {
                                        fib6_set_expires(iter, rt->expires);
                                        fib6_add_gc_list(iter);
                                }

                                if (rt->fib6_pmtu)
                                        fib6_metric_set(iter, RTAX_MTU,
                                                        rt->fib6_pmtu);
                                return -EEXIST;
                        }
                        /* If we have the same destination and the same metric,
                         * but not the same gateway, then the route we try to
                         * add is a sibling of this route: increment our
                         * counter of siblings, and later we will add our
                         * route to the list.
                         * Only static routes (which don't have flag
                         * RTF_EXPIRES) are used for ECMPv6.
                         *
                         * To avoid a long list, we only add siblings if the
                         * route has a gateway.
                         */
                        if (rt_can_ecmp &&
                            rt6_qualify_for_ecmp(iter))
                                WRITE_ONCE(rt->fib6_nsiblings,
                                           rt->fib6_nsiblings + 1);
                }

                if (iter->fib6_metric > rt->fib6_metric)
                        break;

next_iter:
                ins = &iter->fib6_next;
        }

        if (fallback_ins && !found) {
                /* No matching route with same ecmp-able-ness found, replace
                 * first matching route
                 */
                ins = fallback_ins;
                iter = rcu_dereference_protected(*ins,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                found++;
        }

        /* Reset round-robin state, if necessary */
        if (ins == &fn->leaf)
                fn->rr_ptr = NULL;

        /* Link this route to the other routes of the same multipath set. */
        if (rt->fib6_nsiblings) {
                unsigned int fib6_nsiblings;
                struct fib6_info *sibling, *temp_sibling;

                /* Find the first ECMP-capable route that has the same metric.
                 * notify_sibling_rt stays true only when that route is the
                 * first one in the node (i.e. the loop never advanced).
                 */
                sibling = leaf;
                notify_sibling_rt = true;
                while (sibling) {
                        if (sibling->fib6_metric == rt->fib6_metric &&
                            rt6_qualify_for_ecmp(sibling)) {
                                list_add_tail_rcu(&rt->fib6_siblings,
                                                  &sibling->fib6_siblings);
                                break;
                        }
                        sibling = rcu_dereference_protected(sibling->fib6_next,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                        notify_sibling_rt = false;
                }
                /* For each sibling in the list, increment the counter of
                 * siblings. BUG() if the counters do not match: the list of
                 * siblings is broken!
                 */
                fib6_nsiblings = 0;
                list_for_each_entry_safe(sibling, temp_sibling,
                                         &rt->fib6_siblings, fib6_siblings) {
                        WRITE_ONCE(sibling->fib6_nsiblings,
                                   sibling->fib6_nsiblings + 1);
                        BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
                        fib6_nsiblings++;
                }
                BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
                rcu_read_lock();
                rt6_multipath_rebalance(temp_sibling);
                rcu_read_unlock();
        }

        /*
         *        insert node
         */
        if (!replace) {
                if (!add)
                        pr_warn("NLM_F_CREATE should be set when creating new route\n");

                /* Also entered from the replace branch below when nothing was
                 * found to replace but creation is allowed.
                 */
add:
                nlflags |= NLM_F_CREATE;

                /* The route should only be notified if it is the first
                 * route in the node or if it is added as a sibling
                 * route to the first route in the node.
                 */
                if (!info->skip_notify_kernel &&
                    (notify_sibling_rt || ins == &fn->leaf)) {
                        enum fib_event_type fib_event;

                        if (notify_sibling_rt)
                                fib_event = FIB_EVENT_ENTRY_APPEND;
                        else
                                fib_event = FIB_EVENT_ENTRY_REPLACE;
                        err = call_fib6_entry_notifiers(info->nl_net,
                                                        fib_event, rt,
                                                        extack);
                        if (err) {
                                struct fib6_info *sibling, *next_sibling;

                                /* If the route has siblings, then it first
                                 * needs to be unlinked from them.
                                 */
                                if (!rt->fib6_nsiblings)
                                        return err;

                                /* Undo the sibling accounting done above */
                                list_for_each_entry_safe(sibling, next_sibling,
                                                         &rt->fib6_siblings,
                                                         fib6_siblings)
                                        WRITE_ONCE(sibling->fib6_nsiblings,
                                                   sibling->fib6_nsiblings - 1);
                                WRITE_ONCE(rt->fib6_nsiblings, 0);
                                list_del_rcu(&rt->fib6_siblings);
                                rcu_read_lock();
                                rt6_multipath_rebalance(next_sibling);
                                rcu_read_unlock();
                                return err;
                        }
                }

                /* Fill in rt's own links first, then publish it through *ins */
                rcu_assign_pointer(rt->fib6_next, iter);
                fib6_info_hold(rt);
                rcu_assign_pointer(rt->fib6_node, fn);
                rcu_assign_pointer(*ins, rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
                info->nl_net->ipv6.rt6_stats->fib_rt_entries++;

                if (!(fn->fn_flags & RTN_RTINFO)) {
                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                        fn->fn_flags |= RTN_RTINFO;
                }

        } else {
                int nsiblings;

                if (!found) {
                        if (add)
                                goto add;
                        pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
                        return -ENOENT;
                }

                if (!info->skip_notify_kernel && ins == &fn->leaf) {
                        err = call_fib6_entry_notifiers(info->nl_net,
                                                        FIB_EVENT_ENTRY_REPLACE,
                                                        rt, extack);
                        if (err)
                                return err;
                }

                /* rt takes iter's place in the chain; iter is queued on
                 * purge_list for the caller.
                 */
                fib6_info_hold(rt);
                rcu_assign_pointer(rt->fib6_node, fn);
                rt->fib6_next = iter->fib6_next;
                rcu_assign_pointer(*ins, rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
                if (!(fn->fn_flags & RTN_RTINFO)) {
                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                        fn->fn_flags |= RTN_RTINFO;
                }
                nsiblings = iter->fib6_nsiblings;
                iter->fib6_node = NULL;
                list_add(&iter->purge_link, purge_list);
                if (rcu_access_pointer(fn->rr_ptr) == iter)
                        fn->rr_ptr = NULL;

                if (nsiblings) {
                        /* Replacing an ECMP route, remove all siblings */
                        ins = &rt->fib6_next;
                        iter = rcu_dereference_protected(*ins,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                        while (iter) {
                                if (iter->fib6_metric > rt->fib6_metric)
                                        break;
                                if (rt6_qualify_for_ecmp(iter)) {
                                        /* Unlink in place: ins keeps pointing
                                         * at the same slot, which now holds
                                         * iter's successor.
                                         */
                                        *ins = iter->fib6_next;
                                        iter->fib6_node = NULL;
                                        list_add(&iter->purge_link, purge_list);
                                        if (rcu_access_pointer(fn->rr_ptr) == iter)
                                                fn->rr_ptr = NULL;
                                        nsiblings--;
                                        info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
                                } else {
                                        ins = &iter->fib6_next;
                                }
                                iter = rcu_dereference_protected(*ins,
                                        lockdep_is_held(&rt->fib6_table->tb6_lock));
                        }
                        /* Every counted sibling should have been unlinked */
                        WARN_ON(nsiblings != 0);
                }
        }

        return 0;
}

/*
 * Variant of fib6_add_rt2node() for routes that use a nexthop object.
 * The insertion runs under rt->nh->lock and is refused with -EINVAL if
 * the nexthop is already marked dead; on success the route is also put
 * on the nexthop's f6i_list.
 */
static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt,
                               struct nl_info *info, struct netlink_ext_ack *extack,
                               struct list_head *purge_list)
{
        int err = -EINVAL;

        spin_lock(&rt->nh->lock);

        if (rt->nh->dead) {
                NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
                goto unlock;
        }

        err = fib6_add_rt2node(fn, rt, info, extack, purge_list);
        if (err == 0)
                list_add(&rt->nh_list, &rt->nh->f6i_list);

unlock:
        spin_unlock(&rt->nh->lock);

        return err;
}

/*
 * Arm the per-netns fib timer ip6_rt_gc_interval jiffies from now, but
 * only when it is not already pending and @rt carries RTF_EXPIRES.
 */
static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
        if (timer_pending(&net->ipv6.ip6_fib_timer))
                return;

        if (!(rt->fib6_flags & RTF_EXPIRES))
                return;

        mod_timer(&net->ipv6.ip6_fib_timer,
                  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}

/*
 * Arm the per-netns fib timer ip6_rt_gc_interval jiffies from now
 * unless it is already pending.
 */
void fib6_force_start_gc(struct net *net)
{
        if (timer_pending(&net->ipv6.ip6_fib_timer))
                return;

        mod_timer(&net->ipv6.ip6_fib_timer,
                  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}

/*
 * Store @sernum in the node @rt is attached to and in each of that
 * node's ancestors.  The rcu_dereference_protected() annotations expect
 * rt->fib6_table->tb6_lock to be held.
 */
static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
                                           int sernum)
{
        struct fib6_node *node;

        node = rcu_dereference_protected(rt->fib6_node,
                                lockdep_is_held(&rt->fib6_table->tb6_lock));

        /* paired with smp_rmb() in fib6_get_cookie_safe() */
        smp_wmb();

        for (; node;
             node = rcu_dereference_protected(node->parent,
                                lockdep_is_held(&rt->fib6_table->tb6_lock)))
                WRITE_ONCE(node->fn_sernum, sernum);
}

/*
 * Take a fresh serial number for @net and stamp it on @rt's node and
 * all of its ancestors.
 */
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
{
        int sernum = fib6_new_sernum(net);

        __fib6_update_sernum_upto_root(rt, sernum);
}

/* allow ipv4 to update sernum via ipv6_stub */
void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
{
        spin_lock_bh(&f6i->fib6_table->tb6_lock);
        fib6_update_sernum_upto_root(net, f6i);
        spin_unlock_bh(&f6i->fib6_table->tb6_lock);
}

/*
 *        Add routing information to the routing tree.
 *        <destination addr>/<source addr>
 *        with source addr info in sub-trees
 *        Need to own table->tb6_lock
 *
 *        @root:   root of the tree to insert into
 *        @rt:     route to add; its fib6_dst (and, with CONFIG_IPV6_SUBTREES,
 *                 fib6_src) prefix selects the node
 *        @info:   request context; NLM_F_CREATE / NLM_F_REPLACE in info->nlh
 *                 decide whether nodes may be created or must already exist.
 *                 Without info->nlh, creation is allowed and replace is not
 *                 required.
 *        @extack: extended ack passed down to the helpers
 *
 *        Returns 0 on success or a negative errno.  On success, routes that
 *        were displaced by a replace are purged and released here, the
 *        serial number is refreshed up to the root and GC is started for
 *        expiring routes.  On failure the tree is repaired if the target
 *        node was left without route information.
 */

int fib6_add(struct fib6_node *root, struct fib6_info *rt,
             struct nl_info *info, struct netlink_ext_ack *extack)
{
        struct fib6_table *table = rt->fib6_table;
        LIST_HEAD(purge_list);
        struct fib6_node *fn;
#ifdef CONFIG_IPV6_SUBTREES
        struct fib6_node *pn = NULL;
#endif
        int err = -ENOMEM;
        int allow_create = 1;
        int replace_required = 0;

        if (info->nlh) {
                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
                        allow_create = 0;
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        replace_required = 1;
        }
        if (!allow_create && !replace_required)
                pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");

        /* Locate (or create) the node for the destination prefix */
        fn = fib6_add_1(info->nl_net, table, root,
                        &rt->fib6_dst.addr, rt->fib6_dst.plen,
                        offsetof(struct fib6_info, fib6_dst), allow_create,
                        replace_required, extack);
        if (IS_ERR(fn)) {
                err = PTR_ERR(fn);
                /* No node to repair on this path (pn is still NULL too) */
                fn = NULL;
                goto out;
        }

#ifdef CONFIG_IPV6_SUBTREES
        /* pn remembers the main-tree node; fn may move into its subtree */
        pn = fn;

        if (rt->fib6_src.plen) {
                struct fib6_node *sn;

                if (!rcu_access_pointer(fn->subtree)) {
                        struct fib6_node *sfn;

                        /*
                         * Create subtree.
                         *
                         *                fn[main tree]
                         *                |
                         *                sfn[subtree root]
                         *                   \
                         *                    sn[new leaf node]
                         */

                        /* Create subtree root node; err is still -ENOMEM
                         * if the allocation fails.
                         */
                        sfn = node_alloc(info->nl_net);
                        if (!sfn)
                                goto failure;

                        fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
                        rcu_assign_pointer(sfn->leaf,
                                           info->nl_net->ipv6.fib6_null_entry);
                        sfn->fn_flags = RTN_ROOT;

                        /* Now add the first leaf node to new subtree */

                        sn = fib6_add_1(info->nl_net, table, sfn,
                                        &rt->fib6_src.addr, rt->fib6_src.plen,
                                        offsetof(struct fib6_info, fib6_src),
                                        allow_create, replace_required, extack);

                        if (IS_ERR(sn)) {
                                /* If that failed, discard the just-allocated
                                   root, and then (in failure) the stale node
                                   in the main tree.
                                 */
                                node_free_immediate(info->nl_net, sfn);
                                err = PTR_ERR(sn);
                                goto failure;
                        }

                        /* Now link new subtree to main tree */
                        rcu_assign_pointer(sfn->parent, fn);
                        rcu_assign_pointer(fn->subtree, sfn);
                } else {
                        sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
                                        &rt->fib6_src.addr, rt->fib6_src.plen,
                                        offsetof(struct fib6_info, fib6_src),
                                        allow_create, replace_required, extack);

                        if (IS_ERR(sn)) {
                                err = PTR_ERR(sn);
                                goto failure;
                        }
                }

                /* Give the main-tree node a leaf if it has none */
                if (!rcu_access_pointer(fn->leaf)) {
                        if (fn->fn_flags & RTN_TL_ROOT) {
                                /* put back null_entry for root node */
                                rcu_assign_pointer(fn->leaf,
                                            info->nl_net->ipv6.fib6_null_entry);
                        } else {
                                fib6_info_hold(rt);
                                rcu_assign_pointer(fn->leaf, rt);
                        }
                }
                fn = sn;
        }
#endif

        /* Routes with a nexthop object are inserted under that nexthop's lock */
        if (rt->nh)
                err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list);
        else
                err = fib6_add_rt2node(fn, rt, info, extack, &purge_list);
        if (!err) {
                struct fib6_info *iter, *next;

                /* Purge and release the routes the insertion displaced */
                list_for_each_entry_safe(iter, next, &purge_list, purge_link) {
                        list_del(&iter->purge_link);
                        fib6_purge_rt(iter, fn, info->nl_net);
                        fib6_info_release(iter);
                }

                __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));

                if (rt->fib6_flags & RTF_EXPIRES)
                        fib6_add_gc_list(rt);

                fib6_start_gc(info->nl_net, rt);
        }

out:
        if (err) {
#ifdef CONFIG_IPV6_SUBTREES
                /*
                 * If fib6_add_1 has cleared the old leaf pointer in the
                 * super-tree leaf node we have to find a new one for it.
                 */
                if (pn != fn) {
                        struct fib6_info *pn_leaf =
                                rcu_dereference_protected(pn->leaf,
                                    lockdep_is_held(&table->tb6_lock));
                        /* Drop the placeholder reference taken on rt above */
                        if (pn_leaf == rt) {
                                pn_leaf = NULL;
                                RCU_INIT_POINTER(pn->leaf, NULL);
                                fib6_info_release(rt);
                        }
                        if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
                                pn_leaf = fib6_find_prefix(info->nl_net, table,
                                                           pn);
                                if (!pn_leaf)
                                        pn_leaf =
                                            info->nl_net->ipv6.fib6_null_entry;
                                fib6_info_hold(pn_leaf);
                                rcu_assign_pointer(pn->leaf, pn_leaf);
                        }
                }
#endif
                goto failure;
        } else if (fib6_requires_src(rt)) {
                fib6_routes_require_src_inc(info->nl_net);
        }
        return err;

failure:
        /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
         * 1. fn is an intermediate node and we failed to add the new
         * route to it, in both the subtree creation failure and the
         * fib6_add_rt2node() failure case.
         * 2. fn is the root node in the table and we fail to add the first
         * default route to it.
         */
        if (fn &&
            (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
             (fn->fn_flags & RTN_TL_ROOT &&
              !rcu_access_pointer(fn->leaf))))
                fib6_repair_tree(info->nl_net, table, fn);
        return err;
}

/*
 *        Routing tree lookup
 *
 */

/*
 * One stage of a fib6_node_lookup_1() search.  Stages are passed as an
 * array and consumed in order; an entry whose offset is 0 terminates it.
 */
struct lookup_args {
        int                        offset;                /* key offset on fib6_info */
        const struct in6_addr        *addr;                /* search key                        */
};

/*
 * Find the deepest node under @root that carries route info and whose
 * leaf key (located at args->offset inside the fib6_info) matches
 * args->addr.  With CONFIG_IPV6_SUBTREES, a matching node that has a
 * subtree is searched recursively using the next entry of @args.
 *
 * Returns the node, or NULL if the stage is the sentinel (offset 0) or
 * nothing matches up to the tree root.  Uses rcu_dereference().
 */
static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
                                            struct lookup_args *args)
{
        struct fib6_node *fn = root;
        struct fib6_node *child;

        if (unlikely(args->offset == 0))
                return NULL;

        /*
         *        Descend on a tree, following the address bits, until there
         *        is no child in the chosen direction.
         */
        for (;;) {
                if (addr_bit_set(args->addr, fn->fn_bit))
                        child = rcu_dereference(fn->right);
                else
                        child = rcu_dereference(fn->left);

                if (!child)
                        break;
                fn = child;
        }

        /*
         *        Walk back up, testing each candidate node on the way.
         */
        for (; fn; fn = rcu_dereference(fn->parent)) {
                struct fib6_node *subtree = FIB6_SUBTREE(fn);
                struct fib6_info *leaf;
                struct rt6key *key;

                if (!subtree && !(fn->fn_flags & RTN_RTINFO))
                        goto backtrack;

                leaf = rcu_dereference(fn->leaf);
                if (!leaf)
                        goto backtrack;

                key = (struct rt6key *) ((u8 *)leaf + args->offset);
                if (!ipv6_prefix_equal(&key->addr, args->addr, key->plen))
                        goto backtrack;

#ifdef CONFIG_IPV6_SUBTREES
                if (subtree) {
                        struct fib6_node *sfn;

                        sfn = fib6_node_lookup_1(subtree, args + 1);
                        if (!sfn)
                                goto backtrack;
                        fn = sfn;
                }
#endif
                if (fn->fn_flags & RTN_RTINFO)
                        return fn;

backtrack:
                if (fn->fn_flags & RTN_ROOT)
                        break;
        }

        return NULL;
}

/* Look up the node for @daddr (and, with CONFIG_IPV6_SUBTREES, @saddr)
 * under @root.  A NULL @daddr skips the destination stage.  Falls back
 * to @root when nothing is found or the match is the top-level root.
 *
 * called with rcu_read_lock() held
 */
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
                                   const struct in6_addr *daddr,
                                   const struct in6_addr *saddr)
{
        struct lookup_args args[] = {
                {
                        .offset = offsetof(struct fib6_info, fib6_dst),
                        .addr = daddr,
                },
#ifdef CONFIG_IPV6_SUBTREES
                {
                        .offset = offsetof(struct fib6_info, fib6_src),
                        .addr = saddr,
                },
#endif
                {
                        .offset = 0,        /* sentinel */
                }
        };
        struct lookup_args *first = args;
        struct fib6_node *fn;

        if (!daddr)
                first = args + 1;

        fn = fib6_node_lookup_1(root, first);
        if (fn && !(fn->fn_flags & RTN_TL_ROOT))
                return fn;

        return root;
}

/*
 *        Get the node for a given prefix (@addr/@plen), comparing against
 *        the key found at @offset inside each node's leaf.
 *
 *        exact_match == true: return only a node whose fn_bit equals @plen,
 *        otherwise NULL.
 *        exact_match == false: if there is no exact node, return the last
 *        node with route info seen on the way down, i.e. the longest
 *        matching prefix. This is useful for finding the fn for a cached
 *        route, as it will be stored in the exception table under the
 *        node with the longest prefix length.
 */


static struct fib6_node *fib6_locate_1(struct fib6_node *root,
                                       const struct in6_addr *addr,
                                       int plen, int offset,
                                       bool exact_match)
{
        struct fib6_node *node = root;
        struct fib6_node *best = NULL;

        while (node) {
                struct fib6_info *leaf = rcu_dereference(node->leaf);

                if (leaf) {
                        struct rt6key *key;

                        key = (struct rt6key *)((u8 *)leaf + offset);

                        /*
                         *        Prefix match
                         */
                        if (plen < node->fn_bit ||
                            !ipv6_prefix_equal(&key->addr, addr, node->fn_bit))
                                break;

                        if (plen == node->fn_bit)
                                return node;

                        if (node->fn_flags & RTN_RTINFO)
                                best = node;
                } else if (plen <= node->fn_bit) {
                        /* This node is being deleted and is already at or
                         * past the wanted length: stop here.
                         */
                        break;
                }

                /*
                 *        We have more bits to go
                 */
                node = addr_bit_set(addr, node->fn_bit) ?
                        rcu_dereference(node->right) :
                        rcu_dereference(node->left);
        }

        return exact_match ? NULL : best;
}

/*
 * Locate the node for @daddr/@dst_len under @root and, with
 * CONFIG_IPV6_SUBTREES and a non-zero @src_len, continue into that
 * node's subtree for @saddr/@src_len.  @exact_match is passed through
 * to fib6_locate_1().
 *
 * Returns the node only if it carries route info, otherwise NULL.
 */
struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
                              const struct in6_addr *saddr, int src_len,
                              bool exact_match)
{
        struct fib6_node *fn = fib6_locate_1(root, daddr, dst_len,
                                             offsetof(struct fib6_info, fib6_dst),
                                             exact_match);

#ifdef CONFIG_IPV6_SUBTREES
        if (src_len) {
                struct fib6_node *subtree;

                WARN_ON(saddr == NULL);

                subtree = fn ? FIB6_SUBTREE(fn) : NULL;
                if (subtree)
                        fn = fib6_locate_1(subtree, saddr, src_len,
                                           offsetof(struct fib6_info, fib6_src),
                                           exact_match);
        }
#endif

        if (!fn || !(fn->fn_flags & RTN_RTINFO))
                return NULL;

        return fn;
}


/*
 *        Deletion
 *
 */

/*
 * Pick a route that can serve as the leaf of @fn: the null entry for a
 * root node, otherwise the leaf of the left child, else of the right
 * child; if @fn has no children the search moves on to its subtree.
 * Returns NULL when nothing is found.
 *
 * The rcu_dereference_protected() annotations expect table->tb6_lock
 * to be held.
 */
static struct fib6_info *fib6_find_prefix(struct net *net,
                                         struct fib6_table *table,
                                         struct fib6_node *fn)
{
        if (fn->fn_flags & RTN_ROOT)
                return net->ipv6.fib6_null_entry;

        for (; fn; fn = FIB6_SUBTREE(fn)) {
                struct fib6_node *child;

                child = rcu_dereference_protected(fn->left,
                                    lockdep_is_held(&table->tb6_lock));
                if (!child)
                        child = rcu_dereference_protected(fn->right,
                                    lockdep_is_held(&table->tb6_lock));

                if (child)
                        return rcu_dereference_protected(child->leaf,
                                        lockdep_is_held(&table->tb6_lock));
        }

        return NULL;
}

/*
 *        Called to trim the tree of intermediate nodes when possible. "fn"
 *        is the node we want to try and remove.
 *        Need to own table->tb6_lock
 */

static struct fib6_node *fib6_repair_tree(struct net *net,
                                          struct fib6_table *table,
                                          struct fib6_node *fn)
{
        int children;
        int nstate;
        struct fib6_node *child;
        struct fib6_walker *w;
        int iter = 0;

        /* Set fn->leaf to null_entry for root node. */
        if (fn->fn_flags & RTN_TL_ROOT) {
                rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
                return fn;
        }

        for (;;) {
                struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn = rcu_dereference_protected(fn->parent,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *new_fn_leaf;

                pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
                iter++;

                WARN_ON(fn->fn_flags & RTN_RTINFO);
                WARN_ON(fn->fn_flags & RTN_TL_ROOT);
                WARN_ON(fn_leaf);

                children = 0;
                child = NULL;
                if (fn_r) {
                        child = fn_r;
                        children |= 1;
                }
                if (fn_l) {
                        child = fn_l;
                        children |= 2;
                }

                if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
                    /* Subtree root (i.e. fn) may have one child */
                    || (children && fn->fn_flags & RTN_ROOT)
#endif
                    ) {
                        new_fn_leaf = fib6_find_prefix(net, table, fn);
#if RT6_DEBUG >= 2
                        if (!new_fn_leaf) {
                                WARN_ON(!new_fn_leaf);
                                new_fn_leaf = net->ipv6.fib6_null_entry;
                        }
#endif
                        fib6_info_hold(new_fn_leaf);
                        rcu_assign_pointer(fn->leaf, new_fn_leaf);
                        return pn;
                }

#ifdef CONFIG_IPV6_SUBTREES
                if (FIB6_SUBTREE(pn) == fn) {
                        WARN_ON(!(fn->fn_flags & RTN_ROOT));
                        RCU_INIT_POINTER(pn->subtree, NULL);
                        nstate = FWS_L;
                } else {
                        WARN_ON(fn->fn_flags & RTN_ROOT);
#endif
                        if (pn_r == fn)
                                rcu_assign_pointer(pn->right, child);
                        else if (pn_l == fn)
                                rcu_assign_pointer(pn->left, child);
#if RT6_DEBUG >= 2
                        else
                                WARN_ON(1);
#endif
                        if (child)
                                rcu_assign_pointer(child->parent, pn);
                        nstate = FWS_R;
#ifdef CONFIG_IPV6_SUBTREES
                }
#endif

                read_lock(&net->ipv6.fib6_walker_lock);
                FOR_WALKERS(net, w) {
                        if (!child) {
                                if (w->node == fn) {
                                        pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
                                                 w, w->state, nstate);
                                        w->node = pn;
                                        w->state = nstate;
                                }
                        } else {
                                if (w->node == fn) {
                                        w->node = child;
                                        if (children&2) {
                                                pr_debug("W %p adjusted by delnode 2, s=%d\n",
                                                         w, w->state);
                                                w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
                                        } else {
                                                pr_debug("W %p adjusted by delnode 2, s=%d\n",
                                                         w, w->state);
                                                w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
                                        }
                                }
                        }
                }
                read_unlock(&net->ipv6.fib6_walker_lock);

                node_free(net, fn);
                if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
                        return pn;

                RCU_INIT_POINTER(pn->leaf, NULL);
                fib6_info_release(pn_leaf);
                fn = pn;
        }
}

/* Unlink the route referenced by *rtp from node fn, then release a
 * reference on it.
 *
 * @table: table owning fn; its tb6_lock is what the lockdep annotations
 *         below expect to be held
 * @fn:    node whose route list contains the route
 * @rtp:   the link (fn->leaf or a predecessor's fib6_next) that currently
 *         points at the route being removed
 * @info:  supplies the netns (nl_net) and the skip_notify and
 *         skip_notify_kernel flags
 *
 * Along the way this updates the per-netns route statistics, clears
 * fn->rr_ptr if it referenced the route, detaches the route from its
 * sibling list, moves on any walker parked on the route, calls
 * fib6_repair_tree() when fn is left without routes, and emits the
 * notifications.
 */
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
                           struct fib6_info __rcu **rtp, struct nl_info *info)
{
        struct fib6_info *leaf, *replace_rt = NULL;
        struct fib6_walker *w;
        struct fib6_info *rt = rcu_dereference_protected(*rtp,
                                    lockdep_is_held(&table->tb6_lock));
        struct net *net = info->nl_net;
        bool notify_del = false;

        /* If the deleted route is the first in the node and it is not part of
         * a multipath route, then we need to replace it with the next route
         * in the node, if exists.
         */
        leaf = rcu_dereference_protected(fn->leaf,
                                         lockdep_is_held(&table->tb6_lock));
        if (leaf == rt && !rt->fib6_nsiblings) {
                if (rcu_access_pointer(rt->fib6_next))
                        replace_rt = rcu_dereference_protected(rt->fib6_next,
                                            lockdep_is_held(&table->tb6_lock));
                else
                        notify_del = true;
        }

        /* Unlink it */
        *rtp = rt->fib6_next;
        rt->fib6_node = NULL;
        net->ipv6.rt6_stats->fib_rt_entries--;
        net->ipv6.rt6_stats->fib_discarded_routes++;

        /* Reset round-robin state, if necessary */
        if (rcu_access_pointer(fn->rr_ptr) == rt)
                fn->rr_ptr = NULL;

        /* Remove this entry from other siblings */
        if (rt->fib6_nsiblings) {
                struct fib6_info *sibling, *next_sibling;

                /* The route is deleted from a multipath route. If this
                 * multipath route is the first route in the node, then we need
                 * to emit a delete notification. Otherwise, we need to skip
                 * the notification.
                 */
                if (rt->fib6_metric == leaf->fib6_metric &&
                    rt6_qualify_for_ecmp(leaf))
                        notify_del = true;
                /* Every remaining sibling now has one sibling fewer. */
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings, fib6_siblings)
                        WRITE_ONCE(sibling->fib6_nsiblings,
                                   sibling->fib6_nsiblings - 1);
                WRITE_ONCE(rt->fib6_nsiblings, 0);
                list_del_rcu(&rt->fib6_siblings);
                rt6_multipath_rebalance(next_sibling);
        }

        /* Adjust walkers */
        read_lock(&net->ipv6.fib6_walker_lock);
        FOR_WALKERS(net, w) {
                /* A walker parked on rt is advanced to the next route; if
                 * there is none it is switched to FWS_U.
                 */
                if (w->state == FWS_C && w->leaf == rt) {
                        pr_debug("walker %p adjusted by delroute\n", w);
                        w->leaf = rcu_dereference_protected(rt->fib6_next,
                                            lockdep_is_held(&table->tb6_lock));
                        if (!w->leaf)
                                w->state = FWS_U;
                }
        }
        read_unlock(&net->ipv6.fib6_walker_lock);

        /* If it was last route, call fib6_repair_tree() to:
         * 1. For root node, put back null_entry as how the table was created.
         * 2. For other nodes, expunge its radix tree node.
         */
        if (!rcu_access_pointer(fn->leaf)) {
                if (!(fn->fn_flags & RTN_TL_ROOT)) {
                        fn->fn_flags &= ~RTN_RTINFO;
                        net->ipv6.rt6_stats->fib_route_nodes--;
                }
                /* From here on fn is whatever node fib6_repair_tree() returned. */
                fn = fib6_repair_tree(net, table, fn);
        }

        fib6_purge_rt(rt, fn, net);

        /* In-kernel notifier chain: either a delete or a replace event,
         * never both.
         */
        if (!info->skip_notify_kernel) {
                if (notify_del)
                        call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
                                                  rt, NULL);
                else if (replace_rt)
                        call_fib6_entry_notifiers_replace(net, replace_rt);
        }
        if (!info->skip_notify)
                inet6_rt_notify(RTM_DELROUTE, rt, info, 0);

        fib6_info_release(rt);
}

/* Need to own table->tb6_lock */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
        struct net *net = info->nl_net;
        struct fib6_info __rcu **rtp;
        struct fib6_info __rcu **rtp_next;
        struct fib6_table *table;
        struct fib6_node *fn;

        if (rt == net->ipv6.fib6_null_entry)
                return -ENOENT;

        table = rt->fib6_table;
        fn = rcu_dereference_protected(rt->fib6_node,
                                       lockdep_is_held(&table->tb6_lock));
        if (!fn)
                return -ENOENT;

        WARN_ON(!(fn->fn_flags & RTN_RTINFO));

        /*
         *        Walk the leaf entries looking for ourself
         */

        for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
                struct fib6_info *cur = rcu_dereference_protected(*rtp,
                                        lockdep_is_held(&table->tb6_lock));
                if (rt == cur) {
                        if (fib6_requires_src(cur))
                                fib6_routes_require_src_dec(info->nl_net);
                        fib6_del_route(table, fn, rtp, info);
                        return 0;
                }
                rtp_next = &cur->fib6_next;
        }
        return -ENOENT;
}

/*
 *        Tree traversal function.
 *
 *        Certainly, it is not interrupt safe.
 *        However, it is internally reentrant with respect to itself and
 *        fib6_add/fib6_del. This means that the tree can be modified while
 *        it is being walked, so this function can be used for garbage
 *        collection, clone pruning, cleaning the tree when a device goes
 *        down, etc.
 *
 *        It guarantees that every node will be traversed,
 *        and that it will be traversed only once.
 *
 *        Callback function w->func may return:
 *        0 -> continue walking.
 *        positive value -> walking is suspended (used by tree dumps,
 *        and probably by gc, if it is ever split into several slices)
 *        negative value -> terminate walking.
 *
 *        The function itself returns:
 *        0   -> walk is complete.
 *        >0  -> walk is incomplete (i.e. suspended)
 *        <0  -> walk is terminated by an error.
 *
 *        This function is called with tb6_lock held.
 */

static int fib6_walk_continue(struct fib6_walker *w)
{
        struct fib6_node *fn, *pn, *left, *right;

        /* w->root should always be table->tb6_root */
        WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));

        /* Each iteration resumes from the (node, state) pair stored in w, so
         * a suspended walk picks up where it stopped on the next call.
         */
        for (;;) {
                fn = w->node;
                if (!fn)
                        return 0;

                switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
                case FWS_S:
                        /* Enter fn's subtree first, if it has one; w->state
                         * is left unchanged for the subtree root.
                         */
                        if (FIB6_SUBTREE(fn)) {
                                w->node = FIB6_SUBTREE(fn);
                                continue;
                        }
                        w->state = FWS_L;
                        fallthrough;
#endif
                case FWS_L:
                        /* Descend to the left child, if any. */
                        left = rcu_dereference_protected(fn->left, 1);
                        if (left) {
                                w->node = left;
                                w->state = FWS_INIT;
                                continue;
                        }
                        w->state = FWS_R;
                        fallthrough;
                case FWS_R:
                        /* Descend to the right child, if any. */
                        right = rcu_dereference_protected(fn->right, 1);
                        if (right) {
                                w->node = right;
                                w->state = FWS_INIT;
                                continue;
                        }
                        /* No children left to visit: handle fn itself. */
                        w->state = FWS_C;
                        w->leaf = rcu_dereference_protected(fn->leaf, 1);
                        fallthrough;
                case FWS_C:
                        /* Only nodes flagged RTN_RTINFO with a non-NULL
                         * w->leaf are handed to the callback.
                         */
                        if (w->leaf && fn->fn_flags & RTN_RTINFO) {
                                int err;

                                /* Consume one unit of w->skip instead of
                                 * calling the callback for this node.
                                 */
                                if (w->skip) {
                                        w->skip--;
                                        goto skip;
                                }

                                /* Any non-zero result ends this call and is
                                 * returned unchanged.
                                 */
                                err = w->func(w);
                                if (err)
                                        return err;

                                /* State is still FWS_C, so the same node is
                                 * examined again; the callback is presumably
                                 * expected to advance or clear w->leaf.
                                 */
                                w->count++;
                                continue;
                        }
skip:
                        w->state = FWS_U;
                        fallthrough;
                case FWS_U:
                        /* Back at the walk's root: the walk is complete. */
                        if (fn == w->root)
                                return 0;
                        /* Move up to the parent; the next state depends on
                         * which of the parent's links leads to fn.
                         */
                        pn = rcu_dereference_protected(fn->parent, 1);
                        left = rcu_dereference_protected(pn->left, 1);
                        right = rcu_dereference_protected(pn->right, 1);
                        w->node = pn;
#ifdef CONFIG_IPV6_SUBTREES
                        if (FIB6_SUBTREE(pn) == fn) {
                                WARN_ON(!(fn->fn_flags & RTN_ROOT));
                                w->state = FWS_L;
                                continue;
                        }
#endif
                        if (left == fn) {
                                w->state = FWS_R;
                                continue;
                        }
                        if (right == fn) {
                                w->state = FWS_C;
                                w->leaf = rcu_dereference_protected(w->node->leaf, 1);
                                continue;
                        }
#if RT6_DEBUG >= 2
                        /* fn is not reachable from its recorded parent. */
                        WARN_ON(1);
#endif
                }
        }
}

/* Start a fresh walk from w->root.
 *
 * The walker is linked into the netns walker list for the duration of the
 * walk. It is unlinked again unless fib6_walk_continue() returns a positive
 * value (walk suspended), in which case it stays linked.
 *
 * Returns whatever fib6_walk_continue() returned.
 */
static int fib6_walk(struct net *net, struct fib6_walker *w)
{
        int ret;

        w->node = w->root;
        w->state = FWS_INIT;

        fib6_walker_link(net, w);
        ret = fib6_walk_continue(w);
        if (ret > 0)
                return ret;

        fib6_walker_unlink(net, w);
        return ret;
}

/* Walker callback installed by fib6_clean_tree().
 *
 * Stamps the current node with c->sernum (unless that is
 * FIB6_NO_SERNUM_CHANGE) and, when a per-route function was supplied, runs
 * it on the node's routes starting at w->leaf:
 *   -1 -> the route is deleted with fib6_del()
 *   -2 -> the remaining siblings of a multipath route are skipped
 *    0 -> move on to the next route
 * Any other value triggers a WARN_ON.
 *
 * Always returns 0, so it never suspends or aborts the walk.
 */
static int fib6_clean_node(struct fib6_walker *w)
{
        int res;
        struct fib6_info *rt;
        struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
        struct nl_info info = {
                .nl_net = c->net,
                .skip_notify = c->skip_notify,
        };

        if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
            READ_ONCE(w->node->fn_sernum) != c->sernum)
                WRITE_ONCE(w->node->fn_sernum, c->sernum);

        /* Serial-number-only pass: nothing to do per route. */
        if (!c->func) {
                WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
                w->leaf = NULL;
                return 0;
        }

        for_each_fib6_walker_rt(w) {
                res = c->func(rt, c->arg);
                if (res == -1) {
                        /* Park the walker on rt before deleting it, so that
                         * fib6_del_route() moves w->leaf on to the next
                         * route; returning 0 lets the walker call us again
                         * from there.
                         */
                        w->leaf = rt;
                        res = fib6_del(rt, &info);
                        if (res) {
#if RT6_DEBUG >= 2
                                pr_debug("%s: del failed: rt=%p@%p err=%d\n",
                                         __func__, rt,
                                         rcu_access_pointer(rt->fib6_node),
                                         res);
#endif
                                continue;
                        }
                        return 0;
                } else if (res == -2) {
                        if (WARN_ON(!rt->fib6_nsiblings))
                                continue;
                        /* Jump to the last sibling so the iteration resumes
                         * after the whole multipath group.
                         */
                        rt = list_last_entry(&rt->fib6_siblings,
                                             struct fib6_info, fib6_siblings);
                        continue;
                }
                WARN_ON(res != 0);
        }
        /* Record where the iteration stopped (presumably NULL once the
         * iterator has run off the end of the list - confirm against
         * for_each_fib6_walker_rt()).
         */
        w->leaf = rt;
        return 0;
}

/*
 *        Convenient frontend to the tree walker.
 *
 *        Walks the tree rooted at @root with fib6_clean_node() as the walker
 *        callback; @func is called on each route and may return:
 *                -2 -> skip multipath route.
 *                -1 -> delete this route.
 *                0  -> continue walking
 *
 *        @sernum, @arg and @skip_notify are passed through to
 *        fib6_clean_node() via the cleaner.
 */

static void fib6_clean_tree(struct net *net, struct fib6_node *root,
                            int (*func)(struct fib6_info *, void *arg),
                            int sernum, void *arg, bool skip_notify)
{
        struct fib6_cleaner cleaner;
        struct fib6_walker *walker = &cleaner.w;

        /* Cleaner-specific state read by fib6_clean_node(). */
        cleaner.net = net;
        cleaner.func = func;
        cleaner.arg = arg;
        cleaner.sernum = sernum;
        cleaner.skip_notify = skip_notify;

        /* Generic walker state; node and state are set by fib6_walk(). */
        walker->root = root;
        walker->func = fib6_clean_node;
        walker->count = 0;
        walker->skip = 0;
        walker->skip_in_node = 0;

        fib6_walk(net, walker);
}

/* Run fib6_clean_tree() on every table of @net.
 *
 * The table hash is traversed under rcu_read_lock(); each table's tb6_lock
 * is taken (BH-disabling) around the walk of that table.
 */
static void __fib6_clean_all(struct net *net,
                             int (*func)(struct fib6_info *, void *),
                             int sernum, void *arg, bool skip_notify)
{
        unsigned int bucket;

        rcu_read_lock();
        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[bucket];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, chain, tb6_hlist) {
                        spin_lock_bh(&tb->tb6_lock);
                        fib6_clean_tree(net, &tb->tb6_root, func, sernum,
                                        arg, skip_notify);
                        spin_unlock_bh(&tb->tb6_lock);
                }
        }
        rcu_read_unlock();
}

void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
                    void *arg)
{
        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
}

void fib6_clean_all_skip_notify(struct net *net,
                                int (*func)(struct fib6_info *, void *),
                                void *arg)
{
        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
}

/* Stamp the nodes of all tables of @net with a freshly allocated serial
 * number. No per-route function is passed, so fib6_clean_node() only
 * updates fn_sernum.
 */
static void fib6_flush_trees(struct net *net)
{
        __fib6_clean_all(net, NULL, fib6_new_sernum(net), NULL, false);
}

/*
 *        Garbage collection
 */

static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
{
        unsigned long now = jiffies;

        /*
         *        check addrconf expiration here.
         *        Routes are expired even if they are in use.
         */

        if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
                if (time_after(now, rt->expires)) {
                        pr_debug("expiring %p\n", rt);
                        return -1;
                }
                gc_args->more++;
        }

        /*        Also age clones in the exception table.
         *        Note, that clones are aged out
         *        only if they are not in use now.
         */
        rt6_age_exceptions(rt, gc_args, now);

        return 0;
}

static void fib6_gc_table(struct net *net,
                          struct fib6_table *tb6,
                          struct fib6_gc_args *gc_args)
{
        struct fib6_info *rt;
        struct hlist_node *n;
        struct nl_info info = {
                .nl_net = net,
                .skip_notify = false,
        };

        hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
                if (fib6_age(rt, gc_args) == -1)
                        fib6_del(rt, &info);
}

/* Run fib6_gc_table() on every table of @net.
 *
 * The table hash is traversed under rcu_read_lock(); each table's tb6_lock
 * is held (BH-disabling) while that table is collected.
 */
static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
{
        unsigned int bucket;

        rcu_read_lock();
        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[bucket];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, chain, tb6_hlist) {
                        spin_lock_bh(&tb->tb6_lock);
                        fib6_gc_table(net, tb, gc_args);
                        spin_unlock_bh(&tb->tb6_lock);
                }
        }
        rcu_read_unlock();
}

/* Run one garbage-collection pass over every FIB table of @net.
 *
 * @expires: if non-zero, used (cast to int) as gc_args.timeout; otherwise
 *           the ip6_rt_gc_interval sysctl value is used
 * @net:     network namespace to collect
 * @force:   when true, wait for fib6_gc_lock; when false, only try the lock
 *           and, if it is busy, re-arm the GC timer for jiffies + HZ and
 *           return without collecting
 *
 * After the pass the timer is re-armed when gc_args.more is non-zero and
 * deleted otherwise.
 */
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
        struct fib6_gc_args gc_args;
        unsigned long now;

        if (!force) {
                if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
                        mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
                        return;
                }
        } else {
                spin_lock_bh(&net->ipv6.fib6_gc_lock);
        }

        gc_args.more = 0;
        if (expires)
                gc_args.timeout = (int)expires;
        else
                gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval;

        fib6_gc_all(net, &gc_args);

        now = jiffies;
        net->ipv6.ip6_rt_last_gc = now;

        if (!gc_args.more)
                timer_delete(&net->ipv6.ip6_fib_timer);
        else
                mod_timer(&net->ipv6.ip6_fib_timer,
                          round_jiffies(now +
                                        net->ipv6.sysctl.ip6_rt_gc_interval));

        spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}

/* Timer callback for ipv6.ip6_fib_timer: recover the owning netns from the
 * embedded timer and run a forced GC pass with the default timeout.
 */
static void fib6_gc_timer_cb(struct timer_list *t)
{
        struct net *net = timer_container_of(net, t, ipv6.ip6_fib_timer);

        fib6_run_gc(0, net, true);
}

/* Per-netns initialisation of the IPv6 FIB.
 *
 * Sets up the notifier, the GC lock and timer, the walker list and its
 * lock, then allocates the route statistics, the table hash and the main
 * table (plus the local table with CONFIG_IPV6_MULTIPLE_TABLES). Each table
 * root starts out pointing at the netns null entry.
 *
 * Returns 0 on success, the error from fib6_notifier_init(), or -ENOMEM
 * when an allocation fails; on failure everything set up so far is undone.
 */
static int __net_init fib6_net_init(struct net *net)
{
        size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
        int err;

        err = fib6_notifier_init(net);
        if (err)
                return err;

        /* Default to 3-tuple */
        net->ipv6.sysctl.multipath_hash_fields =
                FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;

        spin_lock_init(&net->ipv6.fib6_gc_lock);
        rwlock_init(&net->ipv6.fib6_walker_lock);
        INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
        timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);

        net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
        if (!net->ipv6.rt6_stats)
                goto out_notifier;

        /* Avoid false sharing : Use at least a full cache line */
        size = max_t(size_t, size, L1_CACHE_BYTES);

        net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
        if (!net->ipv6.fib_table_hash)
                goto out_rt6_stats;

        spin_lock_init(&net->ipv6.fib_table_hash_lock);

        net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
                                          GFP_KERNEL);
        if (!net->ipv6.fib6_main_tbl)
                goto out_fib_table_hash;

        /* The main table's root initially holds only the null entry. */
        net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
        rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
                           net->ipv6.fib6_null_entry);
        net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
        INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        /* Same setup for the local table. */
        net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
                                           GFP_KERNEL);
        if (!net->ipv6.fib6_local_tbl)
                goto out_fib6_main_tbl;
        net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
        rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
                           net->ipv6.fib6_null_entry);
        net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
        INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
#endif
        fib6_tables_init(net);

        return 0;

        /* Error unwinding: each label frees what was allocated before the
         * failing step, in reverse order of allocation.
         */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_fib6_main_tbl:
        kfree(net->ipv6.fib6_main_tbl);
#endif
out_fib_table_hash:
        kfree(net->ipv6.fib_table_hash);
out_rt6_stats:
        kfree(net->ipv6.rt6_stats);
out_notifier:
        fib6_notifier_exit(net);
        return -ENOMEM;
}

/* Per-netns teardown: stop the GC timer (synchronously), unhash and free
 * every table, then free the table hash and the statistics and tear down
 * the notifier.
 */
static void fib6_net_exit(struct net *net)
{
        unsigned int bucket;

        timer_delete_sync(&net->ipv6.ip6_fib_timer);

        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[bucket];
                struct hlist_node *next;
                struct fib6_table *table;

                /* Entries are removed while iterating, hence the "safe"
                 * iterator.
                 */
                hlist_for_each_entry_safe(table, next, chain, tb6_hlist) {
                        hlist_del(&table->tb6_hlist);
                        fib6_free_table(table);
                }
        }

        kfree(net->ipv6.fib_table_hash);
        kfree(net->ipv6.rt6_stats);
        fib6_notifier_exit(net);
}

/* Per-netns init/exit hooks; registered by fib6_init() and unregistered by
 * fib6_gc_cleanup().
 */
static struct pernet_operations fib6_net_ops = {
        .init = fib6_net_init,
        .exit = fib6_net_exit,
};

/* rtnetlink handlers registered by fib6_init(): PF_INET6 RTM_GETROUTE dump
 * requests are served by inet6_dump_fib().
 */
static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = {
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
         .dumpit = inet6_dump_fib,
         .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
};

/* Subsystem initialisation: create the fib6_node cache, register the
 * per-netns operations and the rtnetlink handlers, and publish
 * fib6_flush_trees() through __fib6_flush_trees.
 *
 * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
 * error from the failing registration (earlier steps are undone).
 */
int __init fib6_init(void)
{
        int ret;

        fib6_node_kmem = KMEM_CACHE(fib6_node,
                                    SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
        if (!fib6_node_kmem)
                return -ENOMEM;

        ret = register_pernet_subsys(&fib6_net_ops);
        if (ret)
                goto err_destroy_cache;

        ret = rtnl_register_many(fib6_rtnl_msg_handlers);
        if (ret)
                goto err_unregister_subsys;

        __fib6_flush_trees = fib6_flush_trees;
        return 0;

err_unregister_subsys:
        unregister_pernet_subsys(&fib6_net_ops);
err_destroy_cache:
        kmem_cache_destroy(fib6_node_kmem);
        return ret;
}

/* Undo fib6_init()'s registrations that own resources: unregister the
 * per-netns operations, then destroy the fib6_node cache.
 */
void fib6_gc_cleanup(void)
{
        unregister_pernet_subsys(&fib6_net_ops);
        kmem_cache_destroy(fib6_node_kmem);
}

#ifdef CONFIG_PROC_FS
/* Emit one line for the route @v: destination prefix, source prefix (all
 * zeroes without CONFIG_IPV6_SUBTREES), gateway (all zeroes when there is
 * none), metric, reference count, a constant 0, flags and device name.
 *
 * The next hop comes from rt->nh when set, otherwise from the route's own
 * fib6_nh. Clears the iterator's w.leaf and always returns 0.
 */
static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
{
        struct ipv6_route_iter *iter = seq->private;
        struct fib6_info *rt = v;
        unsigned int flags = rt->fib6_flags;
        const struct net_device *dev;
        struct fib6_nh *nh;

        if (rt->nh)
                nh = nexthop_fib6_nh(rt->nh);
        else
                nh = rt->fib6_nh;

        seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
        seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
#else
        seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
        if (!nh->fib_nh_gw_family) {
                seq_puts(seq, "00000000000000000000000000000000");
        } else {
                /* A next hop with a gateway is reported with RTF_GATEWAY. */
                flags |= RTF_GATEWAY;
                seq_printf(seq, "%pi6", &nh->fib_nh_gw6);
        }

        dev = nh->fib_nh_dev;
        seq_printf(seq, " %08x %08x %08x %08x %8s\n",
                   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
                   flags, dev ? dev->name : "");

        iter->w.leaf = NULL;
        return 0;
}

/* Walker callback for the seq_file iterator.
 *
 * With nothing left to skip, returns 1 at once so the walk is suspended on
 * the current w.leaf. Otherwise advances w.leaf along the node's route list,
 * consuming one unit of iter->skip per step: returns 1 when the skip count
 * reaches zero on an existing route, or 0 when the list runs out first.
 */
static int ipv6_route_yield(struct fib6_walker *w)
{
        struct ipv6_route_iter *iter = w->args;

        if (!iter->skip)
                return 1;

        for (;;) {
                iter->w.leaf = rcu_dereference_protected(
                                iter->w.leaf->fib6_next,
                                lockdep_is_held(&iter->tbl->tb6_lock));
                iter->skip--;

                if (!iter->w.leaf)
                        return 0;
                if (!iter->skip)
                        return 1;
        }
}

static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
                                      struct net *net)
{
        memset(&iter->w, 0, sizeof(iter->w));
        iter->w.func = ipv6_route_yield;
        iter->w.root = &iter->tbl->tb6_root;
        iter->w.state = FWS_INIT;
        iter->w.node = iter->w.root;
        iter->w.args = iter;
        iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
        INIT_LIST_HEAD(&iter->w.lh);
        fib6_walker_link(net, &iter->w);
}

/* Return the table that follows @tbl in @net's table hash, or the first
 * table when @tbl is NULL. Returns NULL when no further table exists.
 */
static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
                                                    struct net *net)
{
        struct hlist_node *node = NULL;
        unsigned int h = 0;

        if (tbl) {
                /* Continue with the rest of tbl's chain, then with the
                 * buckets after the one tbl hashes to.
                 */
                node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist));
                h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
        }

        for (; !node && h < FIB6_TABLE_HASHSZ; h++)
                node = rcu_dereference(
                        hlist_first_rcu(&net->ipv6.fib_table_hash[h]));

        return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
}

/* If the root's serial number differs from the one recorded in the
 * iterator, restart the walk from the root and arrange for the entries
 * already counted (w.count) to be skipped.
 */
static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
{
        int current_sernum = READ_ONCE(iter->w.root->fn_sernum);

        if (iter->sernum == current_sernum)
                return;

        iter->sernum = current_sernum;
        iter->w.state = FWS_INIT;
        iter->w.node = iter->w.root;
        WARN_ON(iter->w.skip);
        iter->w.skip = iter->w.count;
}

/* seq_file ->next: return the route after @v, or NULL at the end.
 *
 * The next route on the same node's list is returned directly. Otherwise
 * the walker is resumed under the table lock; when a table is exhausted the
 * walker is unlinked and set up again on the next table. A negative walker
 * result also unlinks the walker and ends the iteration.
 */
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ipv6_route_iter *iter = seq->private;
        struct net *net = seq_file_net(seq);
        int r;

        ++(*pos);

        if (v) {
                struct fib6_info *next;

                next = rcu_dereference(((struct fib6_info *)v)->fib6_next);
                if (next)
                        return next;
        }

        for (;;) {
                ipv6_route_check_sernum(iter);

                spin_lock_bh(&iter->tbl->tb6_lock);
                r = fib6_walk_continue(&iter->w);
                spin_unlock_bh(&iter->tbl->tb6_lock);

                /* Suspended on a route: hand it out, walker stays linked. */
                if (r > 0)
                        return iter->w.leaf;

                fib6_walker_unlink(net, &iter->w);
                if (r < 0)
                        return NULL;

                /* This table is done; move on to the next one, if any. */
                iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
                if (!iter->tbl)
                        return NULL;

                ipv6_route_seq_setup_walk(iter, net);
        }
}

/* seq_file ->start: take the RCU read lock, position the iterator on the
 * first table with *pos entries to skip, and return the first route to
 * show (NULL when there is no table). The RCU read lock is held on return
 * in either case.
 */
static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct ipv6_route_iter *iter = seq->private;
        struct net *net = seq_file_net(seq);
        loff_t scratch_pos = 0;

        rcu_read_lock();
        iter->skip = *pos;
        iter->tbl = ipv6_route_seq_next_table(NULL, net);
        if (!iter->tbl)
                return NULL;

        ipv6_route_seq_setup_walk(iter, net);
        /* The position bump done by ->next goes to a scratch variable. */
        return ipv6_route_seq_next(seq, NULL, &scratch_pos);
}

static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
{
        struct fib6_walker *w = &iter->w;
        return w->node && !(w->state == FWS_U && w->node == w->root);
}

/* seq_file ->stop: unlink the walker if the iteration is still active and
 * drop the RCU read lock taken in ->start.
 */
static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        struct ipv6_route_iter *iter = seq->private;
        struct net *net = seq_file_net(seq);

        if (ipv6_route_iter_active(iter))
                fib6_walker_unlink(net, &iter->w);

        rcu_read_unlock();
}

#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
/* Run the BPF iterator program @prog on route @v (which may be NULL) and
 * return the result of bpf_iter_run_prog().
 */
static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
                                    struct bpf_iter_meta *meta,
                                    void *v)
{
        struct bpf_iter__ipv6_route ctx;

        /* Only these two context fields are filled in before the run. */
        ctx.meta = meta;
        ctx.rt = v;
        return bpf_iter_run_prog(prog, &ctx);
}

/* seq_file ->show: if a BPF iterator program is attached, run it on @v and
 * clear the walker's leaf; otherwise fall back to the native text output.
 */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
        struct ipv6_route_iter *iter = seq->private;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;
        int ret;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);
        if (prog) {
                ret = ipv6_route_prog_seq_show(prog, &meta, v);
                iter->w.leaf = NULL;
                return ret;
        }

        return ipv6_route_native_seq_show(seq, v);
}

/* seq_file ->stop: when called with a NULL @v, give an attached BPF
 * iterator program one last run with that NULL route (its result is
 * ignored), then perform the native stop handling.
 */
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
        if (!v) {
                struct bpf_iter_meta meta;
                struct bpf_prog *prog;

                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)ipv6_route_prog_seq_show(prog, &meta, v);
        }

        ipv6_route_native_seq_stop(seq, v);
}
#else
/* seq_file ->show when the BPF iterator variant is not compiled in: plain
 * pass-through to the native implementation.
 */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
        return ipv6_route_native_seq_show(seq, v);
}

/* seq_file ->stop when the BPF iterator variant is not compiled in: plain
 * pass-through to the native implementation.
 */
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
        ipv6_route_native_seq_stop(seq, v);
}
#endif

/* seq_file operations for iterating over the IPv6 routes of a netns; the
 * show/stop entries resolve to the BPF-aware or the native-only variants
 * depending on the build configuration.
 */
const struct seq_operations ipv6_route_seq_ops = {
        .start        = ipv6_route_seq_start,
        .next        = ipv6_route_seq_next,
        .stop        = ipv6_route_seq_stop,
        .show        = ipv6_route_seq_show
};
#endif /* CONFIG_PROC_FS */










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/bit_spinlock.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
        struct anon_vma *root;                /* Root of this anon_vma tree */
        /*
         * W: modification, R: walking the list.
         * Note that the anon_vma_{lock,trylock,unlock}_{read,write}() helpers
         * in this header always operate on root->rwsem, not on the rwsem of
         * the anon_vma they are handed.
         */
        struct rw_semaphore rwsem;
        /*
         * The refcount is taken on an anon_vma when there is no
         * guarantee that the vma of page tables will exist for
         * the duration of the operation. A caller that takes
         * the reference is responsible for cleaning up the
         * anon_vma if they are the last user on release.
         * See get_anon_vma() / put_anon_vma().
         */
        atomic_t refcount;

        /*
         * Count of child anon_vmas. Equal to the number of anon_vmas that
         * have ->parent pointing to this one, including itself.
         *
         * This counter is used to decide whether to reuse an anon_vma
         * instead of forking a new one. See the comments in anon_vma_clone().
         */
        unsigned long num_children;
        /* Count of VMAs whose ->anon_vma pointer points to this object. */
        unsigned long num_active_vmas;

        struct anon_vma *parent;        /* Parent of this anon_vma */

        /*
         * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
         * rb_root must only be read/written after taking the above lock
         * to be sure to see a valid next pointer. The LSB bit itself
         * is serialized by a system wide lock only visible to
         * mm_take_all_locks() (mm_all_locks_mutex).
         */

        /* Interval tree of private "related" vmas */
        struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
        struct vm_area_struct *vma;        /* The VMA end of this link */
        struct anon_vma *anon_vma;        /* The anon_vma end of this link */
        struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
        struct rb_node rb;                        /* locked by anon_vma->rwsem */
        /* NOTE(review): presumably interval-tree augmentation data - confirm */
        unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
        /* Only present in CONFIG_DEBUG_VM_RB builds. */
        unsigned long cached_vma_start, cached_vma_last;
#endif
};

/*
 * Bit flags (each enumerator is a distinct power of two, so they can be
 * OR-ed together).  NOTE(review): presumably consumed by the try_to_unmap()
 * family, going by the TTU_ prefix - confirm against mm/rmap.c.
 */
enum ttu_flags {
        TTU_SPLIT_HUGE_PMD        = 0x4,        /* split huge PMD if any */
        TTU_IGNORE_MLOCK        = 0x8,        /* ignore mlock */
        TTU_SYNC                = 0x10,        /* avoid racy checks with PVMW_SYNC */
        TTU_HWPOISON                = 0x20,        /* do convert pte to hwpoison entry */
        TTU_BATCH_FLUSH                = 0x40,        /* Batch TLB flushes where possible
                                         * and caller guarantees they will
                                         * do a final flush if necessary */
        TTU_RMAP_LOCKED                = 0x80,        /* do not grab rmap lock:
                                         * caller holds it */
};

#ifdef CONFIG_MMU
/*
 * Take an additional reference on @anon_vma by incrementing its refcount.
 * Pair with put_anon_vma().
 */
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
        atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

/*
 * Drop one reference on @anon_vma.  If this was the last reference (the
 * refcount reached zero), hand the object to __put_anon_vma().
 */
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
        /* Other holders remain: nothing more to do. */
        if (!atomic_dec_and_test(&anon_vma->refcount))
                return;

        __put_anon_vma(anon_vma);
}

/*
 * Acquire, for writing, the rwsem of the root of @anon_vma's tree (not the
 * rwsem embedded in @anon_vma itself).
 */
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
        down_write(&anon_vma->root->rwsem);
}

/*
 * Try to acquire the root anon_vma's rwsem for writing without blocking.
 * Returns whatever down_write_trylock() returns (per the rwsem API, nonzero
 * when the lock was obtained).
 */
static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
        return down_write_trylock(&anon_vma->root->rwsem);
}

/* Release the write lock on the root anon_vma's rwsem. */
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
        up_write(&anon_vma->root->rwsem);
}

/*
 * Acquire, for reading, the rwsem of the root of @anon_vma's tree (not the
 * rwsem embedded in @anon_vma itself).
 */
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
        down_read(&anon_vma->root->rwsem);
}

/*
 * Try to acquire the root anon_vma's rwsem for reading without blocking.
 * Returns whatever down_read_trylock() returns (per the rwsem API, nonzero
 * when the lock was obtained).
 */
static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
        return down_read_trylock(&anon_vma->root->rwsem);
}

/* Release the read lock on the root anon_vma's rwsem. */
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
        up_read(&anon_vma->root->rwsem);
}


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);        /* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

/*
 * Make sure @vma has an anon_vma.  The common case - one is already
 * attached - returns 0 immediately; otherwise the work is delegated to
 * __anon_vma_prepare() and its result is returned.
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
        /* Slow path: no anon_vma attached yet. */
        if (unlikely(!vma->anon_vma))
                return __anon_vma_prepare(vma);

        return 0;
}

/*
 * Called with two VMAs that share the same anon_vma (asserted below):
 * unlinks the anon_vma chains of @next.  @vma is only used for the
 * assertion and its diagnostic output.
 */
static inline void anon_vma_merge(struct vm_area_struct *vma,
                                  struct vm_area_struct *next)
{
        VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
        unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(const struct folio *folio);

#ifdef CONFIG_MM_ID
/*
 * Take the bit spinlock that lives in bit FOLIO_MM_IDS_LOCK_BITNUM of
 * folio->_mm_ids.  The large-mapcount add/sub helpers below hold it while
 * updating the large mapcount and the per-MM slots.
 */
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
{
        bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

/*
 * Release the bit spinlock taken by folio_lock_large_mapcount().  Note that
 * this uses the double-underscore unlock variant.
 */
static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
{
        __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
{
        VM_WARN_ON_ONCE(idx != 0 && idx != 1);
        return folio->_mm_id[idx] & MM_ID_MASK;
}

/*
 * Store MM ID @id in slot @idx (0 or 1) of @folio.  Bits of the slot that
 * lie outside MM_ID_MASK are preserved.
 */
static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
{
        VM_WARN_ON_ONCE(idx != 0 && idx != 1);

        /* Clear the old ID, then merge in the new one. */
        folio->_mm_id[idx] = (folio->_mm_id[idx] & ~MM_ID_MASK) | id;
}

/*
 * Debug-only consistency checks shared by the large-mapcount helpers.
 * Nothing is modified; every check is a VM_WARN_ON_ONCE().
 *
 * @folio: must be a large, non-hugetlb folio
 * @diff:  number of mappings about to be added/removed; must be positive
 *         and no larger than the number of pages in the folio
 * @mm_id: ID of the MM the mappings belong to; must lie in
 *         [MM_ID_MIN, MM_ID_MAX]
 */
static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
                int diff, mm_id_t mm_id)
{
        VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
        VM_WARN_ON_ONCE(diff <= 0);
        VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);

        /*
         * Make sure we can detect at least one complete PTE mapping of the
         * folio in a single MM as "exclusively mapped". This is primarily
         * a check on 32bit, where we currently reduce the size of the per-MM
         * mapcount to a short.
         */
        VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
        VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);

        /*
         * Slot invariants: a free slot (MM_ID_DUMMY) must carry a per-MM
         * mapcount of -1, and an occupied slot must carry a non-negative one.
         */
        VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
                        folio->_mm_id_mapcount[0] != -1);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
                        folio->_mm_id_mapcount[0] < 0);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
                        folio->_mm_id_mapcount[1] != -1);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
                        folio->_mm_id_mapcount[1] < 0);
        /* An unmapped folio must not be flagged "mapped shared". */
        VM_WARN_ON_ONCE(!folio_mapped(folio) &&
                        test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids));
}

static __always_inline void folio_set_large_mapcount(struct folio *folio,
                int mapcount, struct vm_area_struct *vma)
{
        __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);

        VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);

        /* Note: mapcounts start at -1. */
        atomic_set(&folio->_large_mapcount, mapcount - 1);
        folio->_mm_id_mapcount[0] = mapcount - 1;
        folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
}

/*
 * Add @diff mappings, belonging to the MM of @vma, to the large mapcount of
 * @folio and update the per-MM tracking slots, all under the large-mapcount
 * bit spinlock.
 *
 * Slot handling, in priority order:
 *  - the MM already owns slot 0 or slot 1: bump that slot's mapcount;
 *  - otherwise, slot 0 is free: claim it;
 *  - otherwise, slot 1 is free: claim it and mark the folio "mapped shared";
 *  - otherwise: no per-MM state is touched.
 *
 * Returns the folio's large mapcount after the addition (the stored value
 * plus one, because stored mapcounts start at -1).
 */
static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        const mm_id_t mm_id = vma->vm_mm->mm_id;
        int new_mapcount_val;

        folio_lock_large_mapcount(folio);
        __folio_large_mapcount_sanity_checks(folio, diff, mm_id);

        /* Plain read + set: the update is done under the lock taken above. */
        new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
        atomic_set(&folio->_large_mapcount, new_mapcount_val);

        /*
         * If a folio is mapped more than once into an MM on 32bit, we
         * can in theory overflow the per-MM mapcount (although only for
         * fairly large folios), turning it negative. In that case, just
         * free up the slot and mark the folio "mapped shared", otherwise
         * we might be in trouble when unmapping pages later.
         */
        if (folio_mm_id(folio, 0) == mm_id) {
                folio->_mm_id_mapcount[0] += diff;
                if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
                        folio->_mm_id_mapcount[0] = -1;
                        folio_set_mm_id(folio, 0, MM_ID_DUMMY);
                        folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
                }
        } else if (folio_mm_id(folio, 1) == mm_id) {
                folio->_mm_id_mapcount[1] += diff;
                if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
                        folio->_mm_id_mapcount[1] = -1;
                        folio_set_mm_id(folio, 1, MM_ID_DUMMY);
                        folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
                }
        } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
                folio_set_mm_id(folio, 0, mm_id);
                folio->_mm_id_mapcount[0] = diff - 1;
                /* We might have other mappings already. */
                if (new_mapcount_val != diff - 1)
                        folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
        } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
                folio_set_mm_id(folio, 1, mm_id);
                folio->_mm_id_mapcount[1] = diff - 1;
                /* Slot 0 certainly has mappings as well. */
                folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
        }
        folio_unlock_large_mapcount(folio);
        return new_mapcount_val + 1;
}
#define folio_add_large_mapcount folio_add_return_large_mapcount

/*
 * Remove @diff mappings, belonging to the MM of @vma, from the large
 * mapcount of @folio and update the per-MM tracking slots, all under the
 * large-mapcount bit spinlock.
 *
 * If the MM owns a slot and still has mappings left in it afterwards, the
 * "mapped shared" state is left untouched (goto out).  Otherwise the slot,
 * if any, is freed and the shared bit is cleared when one slot now accounts
 * for every remaining mapping.
 *
 * Returns the folio's large mapcount after the removal (the stored value
 * plus one, because stored mapcounts start at -1).
 */
static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        const mm_id_t mm_id = vma->vm_mm->mm_id;
        int new_mapcount_val;

        folio_lock_large_mapcount(folio);
        __folio_large_mapcount_sanity_checks(folio, diff, mm_id);

        /* Plain read + set: the update is done under the lock taken above. */
        new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
        atomic_set(&folio->_large_mapcount, new_mapcount_val);

        /*
         * There are valid corner cases where we might underflow a per-MM
         * mapcount (some mappings added when no slot was free, some mappings
         * added once a slot was free), so we always set it to -1 once we go
         * negative.
         */
        if (folio_mm_id(folio, 0) == mm_id) {
                folio->_mm_id_mapcount[0] -= diff;
                if (folio->_mm_id_mapcount[0] >= 0)
                        goto out;
                folio->_mm_id_mapcount[0] = -1;
                folio_set_mm_id(folio, 0, MM_ID_DUMMY);
        } else if (folio_mm_id(folio, 1) == mm_id) {
                folio->_mm_id_mapcount[1] -= diff;
                if (folio->_mm_id_mapcount[1] >= 0)
                        goto out;
                folio->_mm_id_mapcount[1] = -1;
                folio_set_mm_id(folio, 1, MM_ID_DUMMY);
        }

        /*
         * If one MM slot owns all mappings, the folio is mapped exclusively.
         * Note that if the folio is now unmapped (new_mapcount_val == -1), both
         * slots must be free (mapcount == -1), and we'll also mark it as
         * exclusive.
         */
        if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
            folio->_mm_id_mapcount[1] == new_mapcount_val)
                folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
out:
        folio_unlock_large_mapcount(folio);
        return new_mapcount_val + 1;
}
#define folio_sub_large_mapcount folio_sub_return_large_mapcount
#else /* !CONFIG_MM_ID */
/*
 * See __folio_rmap_sanity_checks(), we might map large folios even without
 * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
 */
/*
 * !CONFIG_MM_ID variant: only the folio-wide large mapcount is initialised;
 * @vma is accepted for interface parity and not used.
 */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
                struct vm_area_struct *vma)
{
        /* Note: mapcounts start at -1. */
        const int stored_val = mapcount - 1;

        atomic_set(&folio->_large_mapcount, stored_val);
}

/*
 * !CONFIG_MM_ID variant: atomically add @diff to the folio-wide large
 * mapcount.  No per-MM tracking exists in this configuration, so @vma is
 * not used and nothing is returned.
 */
static inline void folio_add_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        atomic_add(diff, &folio->_large_mapcount);
}

/*
 * !CONFIG_MM_ID placeholder: exists only so that callers compile.  The body
 * is BUILD_BUG(), so presumably the build breaks if a call to it is not
 * optimised away - i.e. it must not be reachable in this configuration.
 */
static inline int folio_add_return_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        BUILD_BUG();
}

/*
 * !CONFIG_MM_ID variant: atomically subtract @diff from the folio-wide large
 * mapcount.  No per-MM tracking exists in this configuration, so @vma is
 * not used and nothing is returned.
 */
static inline void folio_sub_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        atomic_sub(diff, &folio->_large_mapcount);
}

/*
 * !CONFIG_MM_ID placeholder: exists only so that callers compile.  The body
 * is BUILD_BUG(), so presumably the build breaks if a call to it is not
 * optimised away - i.e. it must not be reachable in this configuration.
 */
static inline int folio_sub_return_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        BUILD_BUG();
}
#endif /* CONFIG_MM_ID */

/*
 * Single-mapping conveniences: adjust the large mapcount by exactly one.
 * The *_return_* forms expand to the value-returning helpers.
 */
#define folio_inc_large_mapcount(folio, vma) \
        folio_add_large_mapcount(folio, 1, vma)
#define folio_inc_return_large_mapcount(folio, vma) \
        folio_add_return_large_mapcount(folio, 1, vma)
#define folio_dec_large_mapcount(folio, vma) \
        folio_sub_large_mapcount(folio, 1, vma)
#define folio_dec_return_large_mapcount(folio, vma) \
        folio_sub_return_large_mapcount(folio, 1, vma)

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE                ((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE                ((__force rmap_t)BIT(0))

/*
 * Debug-only consistency checks shared by the rmap add/dup helpers.
 * Nothing is modified; every check is a VM_WARN_ON*().
 *
 * @folio:    the folio being (un)mapped; must not be hugetlb or the zero folio
 * @page:     first page of the range; must belong to @folio
 * @nr_pages: number of pages in the range; must be positive and the last
 *            page of the range must still belong to @folio
 * @level:    page table level of the mapping; expected to be a constant at
 *            every call site, since an unknown level hits BUILD_BUG()
 */
static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio,
                const struct page *page, int nr_pages, enum pgtable_level level)
{
        /* hugetlb folios are handled separately. */
        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

        /* When (un)mapping zeropages, we should never touch ref+mapcount. */
        VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

        /*
         * TODO: we get driver-allocated folios that have nothing to do with
         * the rmap using vm_insert_page(); therefore, we cannot assume that
         * folio_test_large_rmappable() holds for large folios. We should
         * handle any desired mapcount+stats accounting for these folios in
         * VM_MIXEDMAP VMAs separately, and then sanity-check here that
         * we really only get rmappable folios.
         */

        VM_WARN_ON_ONCE(nr_pages <= 0);
        VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
        VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

        switch (level) {
        case PGTABLE_LEVEL_PTE:
                break;
        case PGTABLE_LEVEL_PMD:
                /*
                 * We don't support folios larger than a single PMD yet. So
                 * when PGTABLE_LEVEL_PMD is set, we assume that we are creating
                 * a single "entire" mapping of the folio.
                 */
                VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
                VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
                break;
        case PGTABLE_LEVEL_PUD:
                /*
                 * Assume that we are creating a single "entire" mapping of the
                 * folio.
                 */
                VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
                VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
                break;
        default:
                BUILD_BUG();
        }

        /*
         * Anon folios must have an associated live anon_vma as long as they're
         * mapped into userspace.
         * Note that the atomic_read() mainly does two things:
         *
         * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
         *    check that the associated anon_vma has not yet been freed (subject
         *    to KASAN's usual limitations). This check will pass if the
         *    anon_vma's refcount has already dropped to 0 but an RCU grace
         *    period hasn't passed since then.
         * 2. If the anon_vma has not yet been freed, it checks that the
         *    anon_vma still has a nonzero refcount (as opposed to being in the
         *    middle of an RCU delay for getting freed).
         */
        if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
                unsigned long mapping = (unsigned long)folio->mapping;
                struct anon_vma *anon_vma;

                /* Strip the anon tag from ->mapping to get the anon_vma. */
                anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
                VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
        }
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
        folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
        folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_add_file_rmap_pud(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
        folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_remove_rmap_pud(struct folio *, struct page *,
                struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address);

/*
 * hugetlb counterpart of folio_try_dup_anon_rmap_*().
 *
 * If the folio's head page is marked AnonExclusive, the flag is cleared,
 * unless folio_needs_cow_for_dma() says the folio may be pinned, in which
 * case nothing is changed and -EBUSY is returned.  On success both the
 * entire mapcount and the large mapcount are incremented and 0 is returned.
 */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
                struct vm_area_struct *vma)
{
        struct page *head = &folio->page;

        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        if (PageAnonExclusive(head)) {
                if (unlikely(folio_needs_cow_for_dma(vma, folio)))
                        return -EBUSY;
                ClearPageAnonExclusive(head);
        }

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
        return 0;
}

/*
 * hugetlb counterpart of folio_try_share_anon_rmap_*().
 *
 * Clears AnonExclusive on the folio's head page (which is expected to be
 * set on entry) unless the folio may be DMA-pinned.
 *
 * Returns 0 on success, or -EBUSY - with the flag left untouched - if
 * folio_maybe_dma_pinned() reports a possible pin.
 */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        ClearPageAnonExclusive(&folio->page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/*
 * Account one more mapping of a non-anonymous hugetlb folio: increments both
 * the entire mapcount and the large mapcount.
 */
static inline void hugetlb_add_file_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
}

/*
 * Account the removal of one mapping of a hugetlb folio: decrements both the
 * entire mapcount and the large mapcount.  Unlike hugetlb_add_file_rmap(),
 * there is no anon/file check here.
 */
static inline void hugetlb_remove_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        atomic_dec(&folio->_entire_mapcount);
        atomic_dec(&folio->_large_mapcount);
}

/*
 * Common implementation behind folio_dup_file_rmap_{pte,ptes,pmd}().
 *
 * @folio:    the folio whose mappings are duplicated
 * @page:     first page of the duplicated range
 * @nr_pages: number of pages in the range
 * @dst_vma:  destination VMA; passed on to the large-mapcount helpers
 * @level:    page table level of the mapping being duplicated; expected to
 *            be a constant at every call site, since an unknown level hits
 *            BUILD_BUG()
 *
 * PTE level: a small folio gets a single _mapcount increment; a large folio
 * gets its large mapcount raised by @nr_pages and, with CONFIG_PAGE_MAPCOUNT,
 * a _mapcount increment on every page of the range.
 * PMD/PUD level: the entire mapcount and the large mapcount are each raised
 * by one.
 */
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
                enum pgtable_level level)
{
        /* nr_pages is consumed by the per-page loop below; keep the total. */
        const int orig_nr_pages = nr_pages;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        switch (level) {
        case PGTABLE_LEVEL_PTE:
                if (!folio_test_large(folio)) {
                        atomic_inc(&folio->_mapcount);
                        break;
                }

                if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
                        do {
                                atomic_inc(&page->_mapcount);
                        } while (page++, --nr_pages > 0);
                }
                folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
                break;
        case PGTABLE_LEVEL_PMD:
        case PGTABLE_LEVEL_PUD:
                atomic_inc(&folio->_entire_mapcount);
                folio_inc_large_mapcount(folio, dst_vma);
                break;
        default:
                BUILD_BUG();
        }
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:        The folio to duplicate the mappings of
 * @page:        The first page to duplicate the mappings of
 * @nr_pages:        The number of pages of which the mapping will be duplicated
 * @dst_vma:        The destination vm area
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 *
 * This cannot fail; it only adjusts mapcounts via __folio_dup_file_rmap().
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
        __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE);
}

/*
 * Single-page form of folio_dup_file_rmap_ptes(): duplicates the PTE mapping
 * of exactly one page (@page) of @folio.
 */
static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
                struct page *page, struct vm_area_struct *dst_vma)
{
        __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:        The folio to duplicate the mapping of
 * @page:        The first page to duplicate the mapping of
 * @dst_vma:        The destination vm area
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 *
 * The duplicate is accounted as one "entire" mapping of the folio (entire
 * mapcount and large mapcount each raised by one), matching what
 * folio_try_dup_anon_rmap_pmd() does for anonymous folios.  Without
 * CONFIG_TRANSPARENT_HUGEPAGE this must not be called and only warns.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
                struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /*
         * Must be PGTABLE_LEVEL_PMD: passing the PTE level here would account
         * the PMD mapping as HPAGE_PMD_NR individual PTE mappings and never
         * touch the entire mapcount.
         */
        __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
#endif
}

/*
 * Common implementation behind folio_try_dup_anon_rmap_{pte,ptes,pmd}().
 *
 * @folio:    the anonymous folio whose mappings are duplicated
 * @page:     first page of the duplicated range
 * @nr_pages: number of pages in the range
 * @dst_vma:  destination VMA; passed on to the large-mapcount helpers
 * @src_vma:  source VMA; used for the "may be pinned" decision
 * @level:    page table level of the mapping being duplicated; expected to
 *            be a constant at every call site, since an unknown level hits
 *            BUILD_BUG()
 *
 * Returns 0 on success.  Returns -EBUSY, without having modified anything,
 * if the folio may be pinned and a page in the range (PTE level) or @page
 * (PMD/PUD level) is AnonExclusive.  On success, AnonExclusive is cleared on
 * the affected pages and the mapcounts are raised.
 */
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, enum pgtable_level level)
{
        /* nr_pages is consumed by the per-page loop below; keep the total. */
        const int orig_nr_pages = nr_pages;
        bool maybe_pinned;
        int i;

        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /*
         * If this folio may have been pinned by the parent process,
         * don't allow to duplicate the mappings but instead require to e.g.,
         * copy the subpage immediately for the child so that we'll always
         * guarantee the pinned folio won't be randomly replaced in the
         * future on write faults.
         */
        maybe_pinned = likely(!folio_is_device_private(folio)) &&
                       unlikely(folio_needs_cow_for_dma(src_vma, folio));

        /*
         * No need to check+clear for already shared PTEs/PMDs of the
         * folio. But if any page is PageAnonExclusive, we must fallback to
         * copying if the folio maybe pinned.
         */
        switch (level) {
        case PGTABLE_LEVEL_PTE:
                /* Scan the whole range first so failure leaves no side effects. */
                if (unlikely(maybe_pinned)) {
                        for (i = 0; i < nr_pages; i++)
                                if (PageAnonExclusive(page + i))
                                        return -EBUSY;
                }

                if (!folio_test_large(folio)) {
                        if (PageAnonExclusive(page))
                                ClearPageAnonExclusive(page);
                        atomic_inc(&folio->_mapcount);
                        break;
                }

                do {
                        if (PageAnonExclusive(page))
                                ClearPageAnonExclusive(page);
                        if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
                                atomic_inc(&page->_mapcount);
                } while (page++, --nr_pages > 0);
                folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
                break;
        case PGTABLE_LEVEL_PMD:
        case PGTABLE_LEVEL_PUD:
                if (PageAnonExclusive(page)) {
                        if (unlikely(maybe_pinned))
                                return -EBUSY;
                        ClearPageAnonExclusive(page);
                }
                atomic_inc(&folio->_entire_mapcount);
                folio_inc_large_mapcount(folio, dst_vma);
                break;
        default:
                BUILD_BUG();
        }
        return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *                                  of a folio
 * @folio:        The folio to duplicate the mappings of
 * @page:        The first page to duplicate the mappings of
 * @nr_pages:        The number of pages of which the mapping will be duplicated
 * @dst_vma:        The destination vm area
 * @src_vma:        The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma)
{
        int ret;

        /* PTE-level duplication of the range [page, page + nr_pages). */
        ret = __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, src_vma,
                                        PGTABLE_LEVEL_PTE);
        return ret;
}

static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
                struct page *page, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma)
{
        int ret;

        /* Single-page flavour: PTE-level duplication with a page count of 1. */
        ret = __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
                                        PGTABLE_LEVEL_PTE);
        return ret;
}

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *                                 of a folio
 * @folio:        The folio to duplicate the mapping of
 * @page:        The first page to duplicate the mapping of
 * @dst_vma:        The destination vm area
 * @src_vma:        The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
                struct page *page, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* THP is compiled out: warn once and report failure. */
        WARN_ON_ONCE(true);
        return -EBUSY;
#else
        /* Duplicate the HPAGE_PMD_NR-page range at PMD level. */
        return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
                                         src_vma, PGTABLE_LEVEL_PMD);
#endif
}

/*
 * __folio_try_share_anon_rmap - common helper behind the
 * folio_try_share_anon_rmap_*() variants: try to clear PageAnonExclusive
 * on @page.
 * @folio:        The folio to share a mapping of; warned about if not anonymous
 * @page:        The page whose PageAnonExclusive flag gets cleared; warned about
 *                if the flag is not set on entry
 * @nr_pages:        Number of pages; only forwarded to the sanity checks
 * @level:        Page table level; only forwarded to the sanity checks
 *
 * Returns 0 after clearing PageAnonExclusive on @page. Returns -EBUSY, leaving
 * the flag untouched, if folio_maybe_dma_pinned() reports that the folio may
 * be pinned. Device private folios skip the pin check and the memory barriers
 * and always succeed.
 */
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum pgtable_level level)
{
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /* device private folios cannot get pinned via GUP. */
        if (unlikely(folio_is_device_private(folio))) {
                ClearPageAnonExclusive(page);
                return 0;
        }

        /*
         * We have to make sure that when we clear PageAnonExclusive, that
         * the page is not pinned and that concurrent GUP-fast won't succeed in
         * concurrently pinning the page.
         *
         * Conceptually, PageAnonExclusive clearing consists of:
         * (A1) Clear PTE
         * (A2) Check if the page is pinned; back off if so.
         * (A3) Clear PageAnonExclusive
         * (A4) Restore PTE (optional, but certainly not writable)
         *
         * When clearing PageAnonExclusive, we cannot possibly map the page
         * writable again, because anon pages that may be shared must never
         * be writable. So in any case, if the PTE was writable it cannot
         * be writable anymore afterwards and there would be a PTE change. Only
         * if the PTE wasn't writable, there might not be a PTE change.
         *
         * Conceptually, GUP-fast pinning of an anon page consists of:
         * (B1) Read the PTE
         * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
         * (B3) Pin the mapped page
         * (B4) Check if the PTE changed by re-reading it; back off if so.
         * (B5) If the original PTE is not writable, check if
         *        PageAnonExclusive is not set; back off if so.
         *
         * If the PTE was writable, we only have to make sure that GUP-fast
         * observes a PTE change and properly backs off.
         *
         * If the PTE was not writable, we have to make sure that GUP-fast either
         * detects a (temporary) PTE change or that PageAnonExclusive is cleared
         * and properly backs off.
         *
         * Consequently, when clearing PageAnonExclusive(), we have to make
         * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
         * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
         * and (B5) happen in the right memory order.
         *
         * We assume that there might not be a memory barrier after
         * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
         * so we use explicit ones here.
         */

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        /* Step (A2): back off without touching the flag if possibly pinned. */
        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        /* Step (A3). */
        ClearPageAnonExclusive(page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *                                   mapped by a PTE possibly shared to prepare
 *                                   for KSM or temporary unmapping
 * @folio:        The folio to share a mapping of
 * @page:        The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail for them.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
                struct page *page)
{
        int ret;

        /* PTE flavour: exactly one page is involved. */
        ret = __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE);
        return ret;
}

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *                                   range mapped by a PMD possibly shared to
 *                                   prepare for temporary unmapping
 * @folio:        The folio to share the mapping of
 * @page:        The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail for them.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
                struct page *page)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* THP is compiled out: warn once and report failure. */
        WARN_ON_ONCE(true);
        return -EBUSY;
#else
        /* PMD flavour: the range covers HPAGE_PMD_NR pages. */
        return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
                                           PGTABLE_LEVEL_PMD);
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out.
 */
int folio_referenced(struct folio *folio, int is_locked,
                     struct mem_cgroup *memcg, vm_flags_t *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *folio, enum ttu_flags flags);

struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
                                   void *owner, struct folio **foliop);

/* Avoid racy checks */
#define PVMW_SYNC                (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION                (1 << 1)

/* Result flags */

/* The page is mapped across page table boundary */
#define PVMW_PGTABLE_CROSSED        (1 << 16)

/*
 * Walk state passed to page_vma_mapped_walk(). DEFINE_FOLIO_VMA_WALK() fills
 * in pfn, nr_pages, pgoff, vma, address and flags; pmd, pte and ptl are left
 * zero-initialized by it and are cleaned up by page_vma_mapped_walk_done() or
 * page_vma_mapped_walk_restart().
 */
struct page_vma_mapped_walk {
        /* folio_pfn() of the folio when set up via DEFINE_FOLIO_VMA_WALK() */
        unsigned long pfn;
        /* folio_nr_pages() of the folio when set up via the macro */
        unsigned long nr_pages;
        /* folio_pgoff() of the folio when set up via the macro */
        pgoff_t pgoff;
        /* the vma being walked */
        struct vm_area_struct *vma;
        /* address in @vma; a restarted walk continues from this address */
        unsigned long address;
        /* page table pointers; both reset to NULL on restart */
        pmd_t *pmd;
        pte_t *pte;
        /* page table lock; released with spin_unlock() when non-NULL */
        spinlock_t *ptl;
        /* PVMW_* flags (see above) */
        unsigned int flags;
};

/*
 * Declare a struct page_vma_mapped_walk called @name, describing @_folio
 * (pfn, number of pages and pgoff are taken from the folio) within @_vma at
 * @_address, using the PVMW_* bits in @_flags. The members that are not
 * listed here (pmd, pte, ptl) are zero-initialized.
 */
#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)        \
        struct page_vma_mapped_walk name = {                                \
                .pfn = folio_pfn(_folio),                                \
                .nr_pages = folio_nr_pages(_folio),                        \
                .pgoff = folio_pgoff(_folio),                                \
                .vma = _vma,                                                \
                .address = _address,                                        \
                .flags = _flags,                                        \
        }

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
        /*
         * Drop the PTE mapping first. For HugeTLB, ->pte is set to the
         * relevant page table entry without being pte-mapped, so there is
         * nothing to unmap in that case.
         */
        if (pvmw->pte) {
                if (!is_vm_hugetlb_page(pvmw->vma))
                        pte_unmap(pvmw->pte);
        }

        /* Then release the page table lock, if one was recorded. */
        if (pvmw->ptl)
                spin_unlock(pvmw->ptl);
}

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
        /* Warn once if the previous walk recorded neither a pmd nor a pte. */
        WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

        /* The page table lock is expected to be held; warn once if not. */
        if (unlikely(!pvmw->ptl))
                WARN_ON_ONCE(1);
        else
                spin_unlock(pvmw->ptl);

        /* Forget all page table state so that a new walk can begin. */
        pvmw->pte = NULL;
        pvmw->pmd = NULL;
        pvmw->ptl = NULL;
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
unsigned long page_address_in_vma(const struct folio *folio,
                                  const struct page *page,
                                  const struct vm_area_struct *vma);

/*
 * Cleans the PTEs of shared mappings. Since clean PTEs should also be
 * read-only, this write-protects them too.
 *
 * Returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *folio);

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
                            unsigned long pfn, unsigned long nr_pages);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
                      pgoff_t pgoff, struct vm_area_struct *vma);

/*
 * Bit flags with the RMP_ prefix.
 * NOTE(review): presumably these are the values accepted by the @flags
 * argument of remove_migration_ptes() below - confirm against callers.
 */
enum rmp_flags {
        RMP_LOCKED                = 1 << 0,
        RMP_USE_SHARED_ZEROPAGE        = 1 << 1,
};

void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);

/*
 * rmap_walk_control: lets the caller tailor an rmap traversal to its needs
 *
 * arg: opaque pointer passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicates that the rmap traversal bailed out due to lock
 *            contention
 * rmap_one: executed on each vma where the page is mapped
 * done: checks whether the traversal can be terminated
 * anon_lock: optimized way of taking the anon_vma lock, used instead of the
 *            default one
 * invalid_vma: used to skip vmas the caller is not interested in
 */
struct rmap_walk_control {
        void *arg;
        bool try_lock;
        bool contended;
        /*
         * Return false if page table scanning in rmap_walk should be stopped.
         * Otherwise, return true.
         */
        bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
                                        unsigned long addr, void *arg);
        int (*done)(struct folio *folio);
        struct anon_vma *(*anon_lock)(const struct folio *folio,
                                      struct rmap_walk_control *rwc);
        bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
/* Has the signature required by the ->anon_lock hook above. */
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
                                          struct rmap_walk_control *rwc);

#else        /* !CONFIG_MMU */

/*
 * !CONFIG_MMU stand-ins: anon_vma_init() expands to an empty statement and
 * anon_vma_prepare() always evaluates to 0.
 */
#define anon_vma_init()                do {} while (0)
#define anon_vma_prepare(vma)        (0)

static inline int folio_referenced(struct folio *folio, int is_locked,
                                  struct mem_cgroup *memcg,
                                  vm_flags_t *vm_flags)
{
        /* !CONFIG_MMU stub: report zero references and clear *vm_flags. */
        const int nr_referenced = 0;

        *vm_flags = 0;
        return nr_referenced;
}

/* !CONFIG_MMU stub: intentionally does nothing. */
static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

/* !CONFIG_MMU stub: always returns 0 without touching @folio. */
static inline int folio_mkclean(struct folio *folio)
{
        return 0;
}
#endif        /* CONFIG_MMU */

#endif        /* _LINUX_RMAP_H */
















































































































    1 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * RNG: Random Number Generator  algorithms under the crypto API
 *
 * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com>
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_RNG_H
#define _CRYPTO_RNG_H

#include <linux/atomic.h>
#include <linux/container_of.h>
#include <linux/crypto.h>

struct crypto_rng;

/**
 * struct rng_alg - random number generator definition
 *
 * @generate:        The function defined by this variable obtains a
 *                random number. The random number generator transform
 *                must generate the random number out of the context
 *                provided with this call, plus any additional data
 *                if provided to the call.
 * @seed:        Seed or reseed the random number generator.  With the
 *                invocation of this function call, the random number
 *                generator shall become ready for generation.  If the
 *                random number generator requires a seed for setting
 *                up a new state, the seed must be provided by the
 *                consumer while invoking this function. The required
 *                size of the seed is defined with @seedsize.
 * @set_ent:        Set entropy that would otherwise be obtained from
 *                entropy source.  Internal use only.
 * @seedsize:        The seed size required for a random number generator
 *                initialization defined with this variable. Some
 *                random number generators do not require a seed
 *                as the seeding is implemented internally without
 *                the need of support by the consumer. In this case,
 *                the seed size is set to zero.
 * @base:        Common crypto API algorithm data structure.
 */
struct rng_alg {
        int (*generate)(struct crypto_rng *tfm,
                        const u8 *src, unsigned int slen,
                        u8 *dst, unsigned int dlen);
        int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen);
        void (*set_ent)(struct crypto_rng *tfm, const u8 *data,
                        unsigned int len);

        unsigned int seedsize;

        struct crypto_alg base;
};

/* RNG handle: a thin wrapper that embeds the generic struct crypto_tfm. */
struct crypto_rng {
        struct crypto_tfm base;
};

/*
 * Default RNG handle, defined elsewhere.
 * NOTE(review): presumably only to be used between crypto_get_default_rng()
 * and crypto_put_default_rng() - confirm against the implementation.
 */
extern struct crypto_rng *crypto_default_rng;

int crypto_get_default_rng(void);
void crypto_put_default_rng(void);

/**
 * DOC: Random number generator API
 *
 * The random number generator API is used with the ciphers of type
 * CRYPTO_ALG_TYPE_RNG (listed as type "rng" in /proc/crypto)
 */

/**
 * crypto_alloc_rng() -- allocate RNG handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              random number generator
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for a random number generator. The returned struct
 * crypto_rng is the cipher handle that is required for any subsequent
 * API invocation for that random number generator.
 *
 * For all random number generators, this call creates a new private copy of
 * the random number generator that does not share a state with other
 * instances. The only exception is the "krng" random number generator which
 * is a kernel crypto API use case for the get_random_bytes() function of the
 * /dev/random driver.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask);

static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm)
{
        /* The generic transform is the embedded 'base' member. */
        struct crypto_tfm *generic = &tfm->base;

        return generic;
}

static inline struct rng_alg *__crypto_rng_alg(struct crypto_alg *alg)
{
        /* @alg is treated as the 'base' member embedded in a struct rng_alg. */
        struct rng_alg *rng = container_of(alg, struct rng_alg, base);

        return rng;
}

/**
 * crypto_rng_alg() - obtain 'struct rng_alg' pointer from RNG handle
 * @tfm: RNG handle
 *
 * Return: Pointer to 'struct rng_alg', derived from @tfm RNG handle
 */
static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm)
{
        return __crypto_rng_alg(crypto_rng_tfm(tfm)->__crt_alg);
}

/**
 * crypto_free_rng() - zeroize and free RNG handle
 * @tfm: cipher handle to be freed
 *
 * If @tfm is a NULL or error pointer, this function does nothing.
 */
static inline void crypto_free_rng(struct crypto_rng *tfm)
{
        /* Hand both the handle and its embedded generic tfm to the destructor. */
        struct crypto_tfm *generic = crypto_rng_tfm(tfm);

        crypto_destroy_tfm(tfm, generic);
}

/**
 * crypto_rng_generate() - get random number
 * @tfm: cipher handle
 * @src: Input buffer holding additional data, may be NULL
 * @slen: Length of additional data
 * @dst: output buffer holding the random numbers
 * @dlen: length of the output buffer
 *
 * This function fills the caller-allocated buffer with random
 * numbers using the random number generator referenced by the
 * cipher handle.
 *
 * Return: 0 function was successful; < 0 if an error occurred
 */
static inline int crypto_rng_generate(struct crypto_rng *tfm,
                                      const u8 *src, unsigned int slen,
                                      u8 *dst, unsigned int dlen)
{
        /* Dispatch to the ->generate() hook of the algorithm behind @tfm. */
        struct rng_alg *alg = crypto_rng_alg(tfm);

        return alg->generate(tfm, src, slen, dst, dlen);
}

/**
 * crypto_rng_get_bytes() - get random number
 * @tfm: cipher handle
 * @rdata: output buffer holding the random numbers
 * @dlen: length of the output buffer
 *
 * This function fills the caller-allocated buffer with random numbers using the
 * random number generator referenced by the cipher handle.
 *
 * Return: 0 function was successful; < 0 if an error occurred
 */
static inline int crypto_rng_get_bytes(struct crypto_rng *tfm,
                                       u8 *rdata, unsigned int dlen)
{
        /* No additional input data: NULL source buffer of length zero. */
        const u8 *no_src = NULL;

        return crypto_rng_generate(tfm, no_src, 0, rdata, dlen);
}

/**
 * crypto_rng_reset() - re-initialize the RNG
 * @tfm: cipher handle
 * @seed: seed input data
 * @slen: length of the seed input data
 *
 * The reset function completely re-initializes the random number generator
 * referenced by the cipher handle by clearing the current state. The new state
 * is initialized with the caller provided seed or automatically, depending
 * on the random number generator type (the ANSI X9.31 RNG requires
 * caller-provided seed, the SP800-90A DRBGs perform an automatic seeding).
 * The seed is provided as a parameter to this function call. The provided seed
 * should have the length of the seed size defined for the random number
 * generator as defined by crypto_rng_seedsize.
 *
 * Return: 0 if the reset was successful; < 0 if an error occurred
 */
int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed,
                     unsigned int slen);

/**
 * crypto_rng_seedsize() - obtain seed size of RNG
 * @tfm: cipher handle
 *
 * The function returns the seed size for the random number generator
 * referenced by the cipher handle. This value may be zero if the random
 * number generator does not implement or require a reseeding. For example,
 * the SP800-90A DRBGs implement an automated reseeding after reaching a
 * pre-defined threshold.
 *
 * Return: seed size for the random number generator
 */
static inline int crypto_rng_seedsize(struct crypto_rng *tfm)
{
        return crypto_rng_alg(tfm)->seedsize;
}

#endif





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  308 




  312 
  313 



















































































































































































































































































































































































































































































































































































































































































































































































































  267 


  268 

  266 
  263 
  268 


  265 
  267 
  266 







































































































































































  303 


  302 






  304 



  304 



  302 


  300 
  305 








































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Generic socket support routines. Memory allocators, socket lock/release
 *                handler for protocols to use and generic option handler.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *                Alan Cox        :         Numerous verify_area() problems
 *                Alan Cox        :        Connecting on a connecting socket
 *                                        now returns an error for tcp.
 *                Alan Cox        :        sock->protocol is set correctly.
 *                                        and is not sometimes left as 0.
 *                Alan Cox        :        connect handles icmp errors on a
 *                                        connect properly. Unfortunately there
 *                                        is a restart syscall nasty there. I
 *                                        can't match BSD without hacking the C
 *                                        library. Ideas urgently sought!
 *                Alan Cox        :        Disallow bind() to addresses that are
 *                                        not ours - especially broadcast ones!!
 *                Alan Cox        :        Socket 1024 _IS_ ok for users. (fencepost)
 *                Alan Cox        :        sock_wfree/sock_rfree don't destroy sockets,
 *                                        instead they leave that for the DESTROY timer.
 *                Alan Cox        :        Clean up error flag in accept
 *                Alan Cox        :        TCP ack handling is buggy, the DESTROY timer
 *                                        was buggy. Put a remove_sock() in the handler
 *                                        for memory when we hit 0. Also altered the timer
 *                                        code. The ACK stuff can wait and needs major
 *                                        TCP layer surgery.
 *                Alan Cox        :        Fixed TCP ack bug, removed remove sock
 *                                        and fixed timer/inet_bh race.
 *                Alan Cox        :        Added zapped flag for TCP
 *                Alan Cox        :        Move kfree_skb into skbuff.c and tidied up surplus code
 *                Alan Cox        :        for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *                Alan Cox        :        kfree_s calls now are kfree_skbmem so we can track skb resources
 *                Alan Cox        :        Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *                Alan Cox        :        Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *                Rick Sladkey        :        Relaxed UDP rules for matching packets.
 *                C.E.Hawkins        :        IFF_PROMISC/SIOCGHWADDR support
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Fixed connect() taking signals I think.
 *                Alan Cox        :        SO_LINGER supported
 *                Alan Cox        :        Error reporting fixes
 *                Anonymous        :        inet_create tidied up (sk->reuse setting)
 *                Alan Cox        :        inet sockets don't set sk->type!
 *                Alan Cox        :        Split socket option code
 *                Alan Cox        :        Callbacks
 *                Alan Cox        :        Nagle flag for Charles & Johannes stuff
 *                Alex                :        Removed restriction on inet fioctl
 *                Alan Cox        :        Splitting INET from NET core
 *                Alan Cox        :        Fixed bogus SO_TYPE handling in getsockopt()
 *                Adam Caldwell        :        Missing return in SO_DONTROUTE/SO_DEBUG code
 *                Alan Cox        :        Split IP from generic code
 *                Alan Cox        :        New kfree_skbmem()
 *                Alan Cox        :        Make SO_DEBUG superuser only.
 *                Alan Cox        :        Allow anyone to clear SO_DEBUG
 *                                        (compatibility fix)
 *                Alan Cox        :        Added optimistic memory grabbing for AF_UNIX throughput.
 *                Alan Cox        :        Allocator for a socket is settable.
 *                Alan Cox        :        SO_ERROR includes soft errors.
 *                Alan Cox        :        Allow NULL arguments on some SO_ opts
 *                Alan Cox        :         Generic socket allocation to make hooks
 *                                        easier (suggested by Craig Metz).
 *                Michael Pall        :        SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *                Jay Schulist        :        Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *                Andi Kleen        :        Add sock_kmalloc()/sock_kfree_s()
 *                Andi Kleen        :        Fix write_space callback
 *                Chris Evans        :        Security fixes - signedness again
 *                Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>

#include <uapi/linux/pidfd.h>

#include "dev.h"

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had when the socket was
 * created and the current process has the capability @cap in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap)
{
        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Same test as sk_ns_capable(), evaluated against the initial user
 * namespace (&init_user_ns).
 *
 * Return: the result of sk_ns_capable() for @sk, &init_user_ns and @cap.
 */
bool sk_capable(const struct sock *sk, int cap)
{
        struct user_namespace *root_ns = &init_user_ns;

        return sk_ns_capable(sk, root_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Same test as sk_ns_capable(), evaluated against the user namespace
 * recorded in the network namespace the socket is a member of
 * (sock_net(sk)->user_ns).
 *
 * Return: the result of sk_ns_capable() for that namespace and @cap.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
        struct user_namespace *net_user_ns = sock_net(sk)->user_ns;

        return sk_ns_capable(sk, net_user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Lock class keys, AF_MAX entries per array.
 *
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal
 * (the "_kern_" arrays) and userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * _sock_locks(x) expands to a comma-separated list of string literals
 * meant to be used as an array initializer.  @x must itself be a string
 * literal: it is placed directly in front of every family name, so the
 * compiler's adjacent-literal concatenation yields e.g. "slock-AF_INET".
 * The entries at positions 27 and 28 carry their bare index as a name,
 * and the list is terminated by an "AF_MAX" entry.
 */

#define _sock_locks(x)                                                  \
  x "AF_UNSPEC",        x "AF_UNIX"     ,        x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,        x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,        x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,        x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,        x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,        x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,        x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,        x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,        x "AF_LLC"      , \
  x "27"       ,        x "28"          ,        x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",        x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,        x "AF_PHONET"   , \
  x "AF_IEEE802154",        x "AF_CAIF"        ,        x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,        x "AF_KCM"      , \
  x "AF_QIPCRTR",        x "AF_SMC"        ,        x "AF_XDP"        , \
  x "AF_MCTP"  , \
  x "AF_MAX"

/*
 * Name tables generated with _sock_locks().  Every table has AF_MAX+1
 * entries and differs from the others only in the prefix prepended to
 * the family names.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

/* "k-" prefixed names: the af_family_kern_* counterparts of the tables above. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
/*
 * Names for the rlock/wlock/elock classes (presumably the per-socket
 * receive, write and error queue locks; confirm at the use sites).
 */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key.  Each array below
 * has AF_MAX entries.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
/* NOTE(review): looks like the callback keys for kernel-internal sockets; confirm. */
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/*
 * Run time adjustable parameters.
 *
 * Both maxima start at 4 << 20 (4194304; presumably bytes, confirm
 * against the sysctl documentation).  The defaults come from
 * SK_WMEM_DEFAULT and SK_RMEM_DEFAULT.
 */
__u32 sysctl_wmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;

/*
 * Static key, initially false.  sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Marks @sk with %SOCK_MEMALLOC, adds __GFP_MEMALLOC to the socket's
 * allocation mask so it may use emergency reserves, and bumps
 * memalloc_socks_key.  It's the responsibility of the admin to adjust
 * min_free_kbytes to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
        sock_set_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation = sk->sk_allocation | __GFP_MEMALLOC;
        static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

/**
 * sk_clear_memalloc - clears %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Undoes sk_set_memalloc(): drops the flag, removes __GFP_MEMALLOC from
 * the socket's allocation mask, decrements memalloc_socks_key and then
 * reclaims socket memory.
 */
void sk_clear_memalloc(struct sock *sk)
{
        sock_reset_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation = sk->sk_allocation & ~__GFP_MEMALLOC;
        static_branch_dec(&memalloc_socks_key);

        /*
         * While SOCK_MEMALLOC is set the socket may ignore rmem limits so
         * that swapping keeps making forward progress.  The flag can be
         * cleared (last swapfile deactivated) while such rmem allocations
         * are still outstanding, which risks leaving the socket unusable
         * because it exceeds the rmem limits.  Reclaim the reserves so the
         * limits are obeyed again.
         */
        sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

/*
 * Run the socket's sk_backlog_rcv handler for @skb between
 * memalloc_noreclaim_save() and memalloc_noreclaim_restore().
 * Only valid for SOCK_MEMALLOC sockets; anything else is a BUG.
 * Returns whatever the handler returns.
 */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        unsigned int saved_flags;
        int rc;

        /* these should have been dropped before queueing */
        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

        saved_flags = memalloc_noreclaim_save();
        rc = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
                                tcp_v6_do_rcv,
                                tcp_v4_do_rcv,
                                sk, skb);
        memalloc_noreclaim_restore(saved_flags);

        return rc;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

/*
 * Invoke the socket's sk_error_report callback, then emit the
 * inet_sk_error_report tracepoint for AF_INET and AF_INET6 sockets.
 */
void sk_error_report(struct sock *sk)
{
        int family;

        sk->sk_error_report(sk);

        /* Read the family once, after the callback has run. */
        family = sk->sk_family;
        if (family == AF_INET || family == AF_INET6)
                trace_inet_sk_error_report(sk);
}
EXPORT_SYMBOL(sk_error_report);

/*
 * Convert the timeout @timeo (counted in units of 1/HZ seconds) into a
 * timeval and store it at @optval.  MAX_SCHEDULE_TIMEOUT is reported as
 * an all-zero timeval.
 *
 * The layout written depends on @old_timeval:
 *   - false: struct __kernel_sock_timeval
 *   - true, in a compat syscall without COMPAT_USE_64BIT_TIME:
 *     struct old_timeval32
 *   - true otherwise: struct __kernel_old_timeval
 *
 * Returns the size in bytes of the structure that was stored.
 */
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
        struct __kernel_sock_timeval tv = { .tv_sec = 0, .tv_usec = 0 };
        struct __kernel_old_timeval native_tv;

        if (timeo != MAX_SCHEDULE_TIMEOUT) {
                tv.tv_sec = timeo / HZ;
                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
        }

        if (!old_timeval) {
                *(struct __kernel_sock_timeval *)optval = tv;
                return sizeof(tv);
        }

        if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 compat_tv = { tv.tv_sec, tv.tv_usec };

                *(struct old_timeval32 *)optval = compat_tv;
                return sizeof(compat_tv);
        }

        native_tv.tv_sec = tv.tv_sec;
        native_tv.tv_usec = tv.tv_usec;
        *(struct __kernel_old_timeval *)optval = native_tv;
        return sizeof(native_tv);
}
EXPORT_SYMBOL(sock_get_timeout);

/*
 * Fetch a timeval from @optval into @tv.
 *
 * The layout expected at @optval depends on @old_timeval:
 *   - false: struct __kernel_sock_timeval
 *   - true, in a compat syscall without COMPAT_USE_64BIT_TIME:
 *     struct old_timeval32
 *   - true otherwise: struct __kernel_old_timeval
 *
 * Returns 0 on success, -EINVAL if @optlen is smaller than the expected
 * structure, or -EFAULT if copy_from_sockptr() fails.
 */
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
                           sockptr_t optval, int optlen, bool old_timeval)
{
        struct __kernel_old_timeval native_tv;
        struct old_timeval32 compat_tv;

        if (!old_timeval) {
                if (optlen < sizeof(*tv))
                        return -EINVAL;
                if (copy_from_sockptr(tv, optval, sizeof(*tv)))
                        return -EFAULT;
                return 0;
        }

        if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                if (optlen < sizeof(compat_tv))
                        return -EINVAL;
                if (copy_from_sockptr(&compat_tv, optval, sizeof(compat_tv)))
                        return -EFAULT;
                tv->tv_sec = compat_tv.tv_sec;
                tv->tv_usec = compat_tv.tv_usec;
                return 0;
        }

        if (optlen < sizeof(native_tv))
                return -EINVAL;
        if (copy_from_sockptr(&native_tv, optval, sizeof(native_tv)))
                return -EFAULT;
        tv->tv_sec = native_tv.tv_sec;
        tv->tv_usec = native_tv.tv_usec;
        return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);

/*
 * Parse a timeval from @optval and store the matching timeout, in jiffies,
 * into *@timeo_p.
 *
 * - A tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM.
 * - A negative tv_sec stores 0, logs an informational message (at most ten
 *   times, subject to net_ratelimit()) and still returns 0.
 * - An all-zero timeval, or a tv_sec at or above MAX_SCHEDULE_TIMEOUT / HZ - 1,
 *   stores MAX_SCHEDULE_TIMEOUT.
 *
 * Errors from sock_copy_user_timeval() are returned unchanged.
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
                            bool old_timeval)
{
        struct __kernel_sock_timeval tv;
        long timeo;
        int ret;

        ret = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
        if (ret)
                return ret;

        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                static int warned __read_mostly;

                WRITE_ONCE(*timeo_p, 0);
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
                                __func__, current->comm, task_pid_nr(current));
                }
                return 0;
        }

        if ((tv.tv_sec || tv.tv_usec) &&
            (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
                /* Microseconds are rounded up to whole jiffies. */
                timeo = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
                                                      USEC_PER_SEC / HZ);
        else
                timeo = MAX_SCHEDULE_TIMEOUT;

        WRITE_ONCE(*timeo_p, timeo);
        return 0;
}

static bool sk_set_prio_allowed(const struct sock *sk, int val)
{
        return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
                sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
                sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
}

/*
 * Return false for AF_UNSPEC and AF_UNIX sockets and true for every other
 * address family.
 */
static bool sock_needs_netstamp(const struct sock *sk)
{
        const int family = sk->sk_family;

        return !(family == AF_UNSPEC || family == AF_UNIX);
}

/*
 * Clear the bits in @flags from sk->sk_flags.
 *
 * Nothing happens when none of them is currently set. Otherwise, once the
 * bits are cleared, net_disable_timestamp() is called if the socket family
 * needs netstamp and no SK_FLAGS_TIMESTAMP bit remains set.
 */
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
        if (!(sk->sk_flags & flags))
                return;

        sk->sk_flags &= ~flags;

        if (!sock_needs_netstamp(sk))
                return;

        if (sk->sk_flags & SK_FLAGS_TIMESTAMP)
                return;

        net_disable_timestamp();
}


/*
 * Append @skb to the receive queue of @sk.
 *
 * Returns:
 *   0        - @skb was queued; sk->sk_data_ready() is invoked unless the
 *              socket is flagged SOCK_DEAD.
 *   -ENOMEM  - sk_rmem_alloc has already reached sk_rcvbuf.
 *   -ENOBUFS - sk_rmem_schedule() refused to account skb->truesize.
 *
 * Both error paths bump the socket drop counter. This function does not
 * free @skb itself on those paths.
 */
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        unsigned long flags;
        struct sk_buff_head *list = &sk->sk_receive_queue;

        /* Receive buffer already full: count the drop and emit the tracepoint. */
        if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
                sk_drops_inc(sk);
                trace_sock_rcvqueue_full(sk, skb);
                return -ENOMEM;
        }

        /* Memory accounting for this skb's truesize was refused. */
        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
                sk_drops_inc(sk);
                return -ENOBUFS;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* We escape from the RCU-protected region here, so make sure we
         * don't leak a non-refcounted dst.
         */
        skb_dst_force(skb);

        /* The drop count is recorded and the skb linked under the queue lock. */
        spin_lock_irqsave(&list->lock, flags);
        sock_skb_set_dropcount(sk, skb);
        __skb_queue_tail(list, skb);
        spin_unlock_irqrestore(&list->lock, flags);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk);
        return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

/*
 * Run the socket filter on @skb and, if it passes, queue it on @sk through
 * __sock_queue_rcv_skb().
 *
 * The return value is whatever the failing step returned, or 0 on success.
 * When @reason is not NULL it receives the drop reason:
 *   - the one filled in by sk_filter_reason() if the filter step failed;
 *   - SKB_DROP_REASON_SOCKET_RCVBUFF for -ENOMEM from the queueing step;
 *   - SKB_DROP_REASON_PROTO_MEM for -ENOBUFS from the queueing step;
 *   - SKB_NOT_DROPPED_YET for any other queueing result, including success.
 */
int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
                              enum skb_drop_reason *reason)
{
        enum skb_drop_reason drop_reason;
        int err;

        err = sk_filter_reason(sk, skb, &drop_reason);
        if (!err) {
                err = __sock_queue_rcv_skb(sk, skb);
                if (err == -ENOMEM)
                        drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
                else if (err == -ENOBUFS)
                        drop_reason = SKB_DROP_REASON_PROTO_MEM;
                else
                        drop_reason = SKB_NOT_DROPPED_YET;
        }

        if (reason)
                *reason = drop_reason;
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb_reason);

/*
 * Deliver @skb to @sk, either directly through sk_backlog_rcv() or by
 * placing it on the socket backlog when the socket is owned by a user
 * context.
 *
 * @nested:     when non-zero, the socket spinlock is taken with
 *              bh_lock_sock_nested() instead of bh_lock_sock().
 * @trim_cap:   passed through to sk_filter_trim_cap().
 * @refcounted: when true, sock_put() is called on @sk before returning,
 *              on every path.
 *
 * Returns the value of sk_backlog_rcv() when the skb was processed directly.
 * On all other paths, including the drop paths, rc keeps its initial value
 * NET_RX_SUCCESS. Dropped skbs are released via sk_skb_reason_drop() with
 * the recorded drop reason.
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                     const int nested, unsigned int trim_cap, bool refcounted)
{
        enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
        int rc = NET_RX_SUCCESS;
        int err;

        /* A non-zero filter result drops the skb; the filter fills in reason. */
        if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
                goto discard_and_relse;

        skb->dev = NULL;

        if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
                sk_drops_inc(sk);
                reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
                goto discard_and_relse;
        }
        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
        } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
                /* Backlog insertion failed: unlock, then translate the errno
                 * into a drop reason before discarding the skb.
                 */
                bh_unlock_sock(sk);
                if (err == -ENOMEM)
                        reason = SKB_DROP_REASON_PFMEMALLOC;
                if (err == -ENOBUFS)
                        reason = SKB_DROP_REASON_SOCKET_BACKLOG;
                sk_drops_inc(sk);
                goto discard_and_relse;
        }

        bh_unlock_sock(sk);
out:
        if (refcounted)
                sock_put(sk);
        return rc;
discard_and_relse:
        sk_skb_reason_drop(sk, skb, reason);
        goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
                                                          u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));

/*
 * Validate the dst obtained from __sk_dst_get() against @cookie.
 *
 * The dst is returned as-is (possibly NULL) unless it is marked obsolete and
 * its ->check() operation returns NULL. In that case the TX queue mapping
 * and the pending-confirm flag are cleared, sk->sk_dst_cache is set to NULL,
 * the dst is released, and NULL is returned.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        /* Nothing cached, or the entry is not marked obsolete. */
        if (!dst || !READ_ONCE(dst->obsolete))
                return dst;

        /* Obsolete, but the protocol's check still accepts it. */
        if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                               dst, cookie) != NULL)
                return dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
        dst_release(dst);
        return NULL;
}
EXPORT_SYMBOL(__sk_dst_check);

/*
 * Validate the dst obtained from sk_dst_get() against @cookie.
 *
 * The dst is returned as-is (possibly NULL) unless it is marked obsolete and
 * its ->check() operation returns NULL; then the socket's cached dst is
 * reset with sk_dst_reset(), the dst obtained here is released, and NULL is
 * returned.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (!dst || !READ_ONCE(dst->obsolete))
                return dst;

        if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                               dst, cookie) != NULL)
                return dst;

        sk_dst_reset(sk);
        dst_release(dst);
        return NULL;
}
EXPORT_SYMBOL(sk_dst_check);

/*
 * Bind @sk to the interface index @ifindex.
 *
 * Returns:
 *   -ENOPROTOOPT when CONFIG_NETDEVICES is not set;
 *   -EPERM       when the socket is already bound to a device and the
 *                caller lacks CAP_NET_RAW in the netns' user namespace;
 *   -EINVAL      when @ifindex is negative;
 *   0            on success, after rehashing the socket (if the protocol
 *                provides ->rehash) and resetting the cached dst.
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);

        /* Sorry... */
        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;

        if (ifindex < 0)
                return -EINVAL;

        /* Paired with all READ_ONCE() done locklessly. */
        WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

        if (sk->sk_prot->rehash)
                sk->sk_prot->rehash(sk);
        sk_dst_reset(sk);

        return 0;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * Bind @sk to interface index @ifindex via sock_bindtoindex_locked().
 *
 * When @lock_sk is true the socket lock is taken around the call; otherwise
 * the helper is called directly without touching the lock here.
 * Returns the helper's result.
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
        int err;

        if (!lock_sk)
                return sock_bindtoindex_locked(sk, ifindex);

        lock_sock(sk);
        err = sock_bindtoindex_locked(sk, ifindex);
        release_sock(sk);

        return err;
}
EXPORT_SYMBOL(sock_bindtoindex);

/*
 * SO_BINDTODEVICE setter: bind @sk to the device named in @optval.
 *
 * The name is truncated to IFNAMSIZ - 1 bytes. An empty name (or a zero
 * option length) selects index 0, i.e. the socket is not bound.
 *
 * Returns:
 *   -ENOPROTOOPT when CONFIG_NETDEVICES is not set;
 *   -EINVAL      when @optlen is negative;
 *   -EFAULT      when the name cannot be copied from @optval;
 *   -ENODEV      when no device with that name exists in the socket's netns;
 *   otherwise the result of sock_bindtoindex_locked().
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index = 0;
        int ret;

        if (optlen < 0)
                return -EINVAL;

        /* Bind this socket to a particular device like "eth0",
         * as specified in the passed interface name. If the
         * name is "" or the option length is zero the socket
         * is not bound.
         */
        if (optlen > IFNAMSIZ - 1)
                optlen = IFNAMSIZ - 1;
        memset(devname, 0, sizeof(devname));

        if (copy_from_sockptr(devname, optval, optlen))
                return -EFAULT;

        if (devname[0] != '\0') {
                struct net_device *dev;

                /* Only the ifindex is read while under RCU protection. */
                rcu_read_lock();
                dev = dev_get_by_name_rcu(net, devname);
                if (dev)
                        index = dev->ifindex;
                rcu_read_unlock();

                if (!dev)
                        return -ENODEV;
        }

        sockopt_lock_sock(sk);
        ret = sock_bindtoindex_locked(sk, index);
        sockopt_release_sock(sk);

        return ret;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * SO_BINDTODEVICE getter: report the name of the device @sk is bound to.
 *
 * For an unbound socket nothing is written to @optval and a length of 0 is
 * stored through @optlen. For a bound socket the NUL-terminated name is
 * copied to @optval and its length, including the terminator, is stored
 * through @optlen.
 *
 * Returns:
 *   -ENOPROTOOPT when CONFIG_NETDEVICES is not set;
 *   -EINVAL      when the socket is bound and @len is below IFNAMSIZ;
 *   -EFAULT      when either copy to the caller fails;
 *   the error from netdev_get_name() if the lookup fails;
 *   0            on success.
 */
static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
                                sockptr_t optlen, int len)
{
#ifdef CONFIG_NETDEVICES
        int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int ret;

        if (bound_dev_if == 0) {
                /* Not bound: only a zero length is reported. */
                len = 0;
        } else {
                if (len < IFNAMSIZ)
                        return -EINVAL;

                ret = netdev_get_name(net, devname, bound_dev_if);
                if (ret)
                        return ret;

                len = strlen(devname) + 1;

                if (copy_to_sockptr(optval, devname, len))
                        return -EFAULT;
        }

        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;

        return 0;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * Report the multicast-loop setting that applies to @sk.
 *
 * Returns false whenever dev_recursion_level() is non-zero, true when @sk
 * is NULL, and otherwise the MC_LOOP (AF_INET) or MC6_LOOP (AF_INET6, only
 * with CONFIG_IPV6) bit of the socket. Any other family triggers a one-time
 * warning and yields true.
 */
bool sk_mc_loop(const struct sock *sk)
{
        int family;

        if (dev_recursion_level())
                return false;
        if (!sk)
                return true;

        /* IPV6_ADDRFORM can change sk->sk_family under us. */
        family = READ_ONCE(sk->sk_family);

        if (family == AF_INET)
                return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                return inet6_test_bit(MC6_LOOP, sk);
#endif
        WARN_ON_ONCE(1);
        return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/* Set sk->sk_reuse to SK_CAN_REUSE while holding the socket lock. */
void sock_set_reuseaddr(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuse = SK_CAN_REUSE;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

/* Set sk->sk_reuseport to true while holding the socket lock. */
void sock_set_reuseport(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuseport = true;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

/*
 * Under the socket lock, set the linger time of @sk to 0 and set the
 * SOCK_LINGER flag.
 */
void sock_no_linger(struct sock *sk)
{
        lock_sock(sk);
        WRITE_ONCE(sk->sk_lingertime, 0);
        sock_set_flag(sk, SOCK_LINGER);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

/*
 * Store @priority in sk->sk_priority using WRITE_ONCE(). No socket lock is
 * taken and no permission check is done in this helper.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
        WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

/*
 * Set the send timeout of @sk from a value given in seconds.
 *
 * A @secs of 0, or one at or above MAX_SCHEDULE_TIMEOUT / HZ - 1, stores
 * MAX_SCHEDULE_TIMEOUT; any other value is stored as @secs * HZ.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
        long timeo = MAX_SCHEDULE_TIMEOUT;

        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
                timeo = secs * HZ;

        WRITE_ONCE(sk->sk_sndtimeo, timeo);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

/*
 * Update the receive-timestamp flags of @sk.
 *
 * SOCK_RCVTSTAMP follows @val and SOCK_RCVTSTAMPNS follows (@val && @ns).
 * Only when @val is true is SOCK_TSTAMP_NEW set to @new and SOCK_TIMESTAMP
 * enabled through sock_enable_timestamp(); disabling leaves both untouched.
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
        const bool want_ns = val && ns;

        sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
        sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, want_ns);

        if (!val)
                return;

        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
}

/*
 * Apply one of the SO_TIMESTAMP{,NS}_{OLD,NEW} options to @sk.
 *
 * @optname selects the format: the *_NEW variants request the new timestamp
 * layout and the SO_TIMESTAMPNS_* variants request nanosecond resolution.
 * Any other @optname is ignored.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
        bool is_new;
        bool is_ns;

        switch (optname) {
        case SO_TIMESTAMP_OLD:
                is_new = false;
                is_ns = false;
                break;
        case SO_TIMESTAMP_NEW:
                is_new = true;
                is_ns = false;
                break;
        case SO_TIMESTAMPNS_OLD:
                is_new = false;
                is_ns = true;
                break;
        case SO_TIMESTAMPNS_NEW:
                is_new = true;
                is_ns = true;
                break;
        default:
                return;
        }

        __sock_set_timestamps(sk, valbool, is_new, is_ns);
}

/*
 * Bind @sk to the PHC identified by @phc_index.
 *
 * The socket must be bound to a device; @phc_index is accepted only if it
 * appears in the vclock index list that ethtool_get_phc_vclocks() returns
 * for that device.
 *
 * Returns 0 after storing @phc_index in sk->sk_bind_phc, -EOPNOTSUPP when no
 * bound device can be found, or -EINVAL when the index is not in the list.
 */
static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
        struct net *net = sock_net(sk);
        struct net_device *dev = NULL;
        bool found = false;
        int *vclocks;
        int count, idx;

        if (sk->sk_bound_dev_if)
                dev = dev_get_by_index(net, sk->sk_bound_dev_if);

        if (!dev) {
                pr_err("%s: sock not bind to device\n", __func__);
                return -EOPNOTSUPP;
        }

        count = ethtool_get_phc_vclocks(dev, &vclocks);
        dev_put(dev);

        /* Stop scanning at the first matching vclock index. */
        for (idx = 0; idx < count && !found; idx++)
                found = (vclocks[idx] == phc_index);

        /* The list is only freed when a positive count was returned. */
        if (count > 0)
                kfree(vclocks);

        if (!found)
                return -EINVAL;

        WRITE_ONCE(sk->sk_bind_phc, phc_index);

        return 0;
}

/*
 * Apply an SO_TIMESTAMPING request to @sk.
 *
 * @optname is compared against SO_TIMESTAMPING_NEW to set or clear
 * SOCK_TSTAMP_NEW. @timestamping carries the requested flags and, when
 * SOF_TIMESTAMPING_BIND_PHC is set, the PHC index to bind to.
 *
 * Returns 0 on success; -EINVAL for unknown flags, for OPT_ID_TCP without
 * OPT_ID, for OPT_STATS without OPT_TSONLY, or when OPT_ID is newly enabled
 * on a TCP socket in the CLOSE or LISTEN state; or the error returned by
 * sock_timestamping_bind_phc().
 *
 * NOTE(review): sk->sk_tskey is reset before the OPT_STATS and BIND_PHC
 * checks run, so it may already have been modified when one of those later
 * checks makes the call fail. Confirm this is intended.
 */
int sock_set_timestamping(struct sock *sk, int optname,
                          struct so_timestamping timestamping)
{
        int val = timestamping.flags;
        int ret;

        if (val & ~SOF_TIMESTAMPING_MASK)
                return -EINVAL;

        if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
            !(val & SOF_TIMESTAMPING_OPT_ID))
                return -EINVAL;

        /* OPT_ID is being switched on (it was not set before): reseed the
         * timestamp key. TCP sockets seed it from write_seq or snd_una,
         * everything else starts from 0.
         */
        if (val & SOF_TIMESTAMPING_OPT_ID &&
            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
                if (sk_is_tcp(sk)) {
                        if ((1 << sk->sk_state) &
                            (TCPF_CLOSE | TCPF_LISTEN))
                                return -EINVAL;
                        if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
                                atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
                        else
                                atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
                } else {
                        atomic_set(&sk->sk_tskey, 0);
                }
        }

        if (val & SOF_TIMESTAMPING_OPT_STATS &&
            !(val & SOF_TIMESTAMPING_OPT_TSONLY))
                return -EINVAL;

        if (val & SOF_TIMESTAMPING_BIND_PHC) {
                ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
                if (ret)
                        return ret;
        }

        /* All checks passed: publish the new flags and derived socket flags. */
        WRITE_ONCE(sk->sk_tsflags, val);
        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
        sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));

        if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
                sock_enable_timestamp(sk,
                                      SOCK_TIMESTAMPING_RX_SOFTWARE);
        else
                sock_disable_timestamp(sk,
                                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
        return 0;
}

#if defined(CONFIG_CGROUP_BPF)
/*
 * Run the cgroup sock_ops BPF programs for @sk with operation @op.
 *
 * A bpf_sock_ops_kern context is built on the stack; only the fields that
 * precede ->temp are zeroed. The context is marked as a full socket, @skb is
 * attached with bpf_skops_init_skb(), and the CGROUP_SOCK_OPS filter is run.
 * The filter's return value is not used.
 */
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
{
        struct bpf_sock_ops_kern sock_ops;

        memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
        sock_ops.op = op;
        sock_ops.is_fullsock = 1;
        sock_ops.sk = sk;
        bpf_skops_init_skb(&sock_ops, skb, 0);
        __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
}
#endif

/*
 * Enable keepalive on @sk. Under the socket lock, the protocol's
 * ->keepalive() hook is called with true when the protocol provides one,
 * then the SOCK_KEEPOPEN flag is set.
 */
void sock_set_keepalive(struct sock *sk)
{
        lock_sock(sk);
        if (sk->sk_prot->keepalive)
                sk->sk_prot->keepalive(sk, true);
        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

/*
 * Set the receive buffer size of @sk from the user-visible value @val and
 * mark it as user-locked (SOCK_RCVBUF_LOCK).
 *
 * The stored size is twice the (clamped) request, but never less than
 * SOCK_MIN_RCVBUF.
 */
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
        /* Clamp first so that doubling still fits into an int; otherwise
         * max_t() below would see a negative value.
         */
        const int clamped = min_t(int, val, INT_MAX / 2);

        /* The request is doubled to account for "struct sk_buff" etc.
         * overhead. Applications assume that the SO_RCVBUF setting they make
         * will allow that much actual data to be received on that socket,
         * and are unaware that "struct sk_buff" and other overheads allocate
         * from the receive buffer during socket buffer allocation.
         *
         * After considering the possible alternatives, returning the value
         * we actually used in getsockopt is the most desirable behavior.
         */
        const int doubled = clamped * 2;

        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, doubled, SOCK_MIN_RCVBUF));
}

/* Locked wrapper: call __sock_set_rcvbuf() with the socket lock held. */
void sock_set_rcvbuf(struct sock *sk, int val)
{
        lock_sock(sk);
        __sock_set_rcvbuf(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

/*
 * Change the mark of @sk to @val. When the mark actually changes, the
 * cached dst is reset as well; setting the same mark again is a no-op.
 */
static void __sock_set_mark(struct sock *sk, u32 val)
{
        if (val == sk->sk_mark)
                return;

        WRITE_ONCE(sk->sk_mark, val);
        sk_dst_reset(sk);
}

/* Locked wrapper: call __sock_set_mark() with the socket lock held. */
void sock_set_mark(struct sock *sk, u32 val)
{
        lock_sock(sk);
        __sock_set_mark(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

/*
 * Give back up to @bytes of the memory reserved for @sk.
 *
 * The amount is rounded down to a multiple of PAGE_SIZE, subtracted from
 * sk->sk_reserved_mem, and sk_mem_reclaim() is then called. A warning is
 * raised if the rounded amount exceeds the current reservation.
 */
static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
        /* Only whole pages are released. */
        const int released = round_down(bytes, PAGE_SIZE);

        WARN_ON(released > sk->sk_reserved_mem);
        WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - released);
        sk_mem_reclaim(sk);
}

/*
 * Reserve memory for @sk by pre-charging @bytes, rounded to pages by
 * sk_mem_pages().
 *
 * Returns:
 *   -EOPNOTSUPP when memcg socket accounting is not enabled for @sk or the
 *               socket has no memory accounting;
 *   0           when @bytes is 0 (nothing to do) or the reservation
 *               succeeded;
 *   -ENOMEM     when the memcg charge fails, or when the protocol's
 *               allocated memory would exceed sk_prot_mem_limits(sk, 1).
 *
 * On success the reserved pages are added to the socket's forward
 * allocation and to sk->sk_reserved_mem (both in bytes).
 */
static int sock_reserve_memory(struct sock *sk, int bytes)
{
        long allocated;
        bool charged;
        int pages;

        if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
                return -EOPNOTSUPP;

        if (!bytes)
                return 0;

        pages = sk_mem_pages(bytes);

        /* pre-charge to memcg */
        charged = mem_cgroup_sk_charge(sk, pages,
                                       GFP_KERNEL | __GFP_RETRY_MAYFAIL);
        if (!charged)
                return -ENOMEM;

        /* pre-charge to forward_alloc */
        sk_memory_allocated_add(sk, pages);
        allocated = sk_memory_allocated(sk);
        /* If the system goes into memory pressure with this
         * precharge, give up and return error.
         */
        if (allocated > sk_prot_mem_limits(sk, 1)) {
                /* Undo both charges made above. */
                sk_memory_allocated_sub(sk, pages);
                mem_cgroup_sk_uncharge(sk, pages);
                return -ENOMEM;
        }
        sk_forward_alloc_add(sk, pages << PAGE_SHIFT);

        WRITE_ONCE(sk->sk_reserved_mem,
                   sk->sk_reserved_mem + (pages << PAGE_SHIFT));

        return 0;
}

#ifdef CONFIG_PAGE_POOL

/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
 * in 1 syscall. The limit exists to limit the amount of memory the kernel
 * allocates to copy these tokens, and to prevent looping over the frags for
 * too long.
 */
#define MAX_DONTNEED_TOKENS 128
#define MAX_DONTNEED_FRAGS 1024

/*
 * SO_DEVMEM_DONTNEED handler: release the frags named by the dmabuf tokens
 * passed in @optval.
 *
 * @optval holds an array of struct dmabuf_token; @optlen must be a multiple
 * of the token size and describe at most MAX_DONTNEED_TOKENS tokens. Each
 * token names token_count consecutive entries of sk->sk_user_frags starting
 * at token_start. At most MAX_DONTNEED_FRAGS entries are visited per call.
 *
 * Returns the number of frags that were found and released, or:
 *   -EBADF  when @sk is not a TCP socket;
 *   -EINVAL when @optlen is malformed or too large;
 *   -ENOMEM when the token array cannot be allocated;
 *   -EFAULT when the tokens cannot be copied from @optval.
 */
static noinline_for_stack int
sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
{
        unsigned int num_tokens, i, j, k, netmem_num = 0;
        struct dmabuf_token *tokens;
        int ret = 0, num_frags = 0;
        netmem_ref netmems[16];

        if (!sk_is_tcp(sk))
                return -EBADF;

        if (optlen % sizeof(*tokens) ||
            optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
                return -EINVAL;

        num_tokens = optlen / sizeof(*tokens);
        tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
        if (!tokens)
                return -ENOMEM;

        if (copy_from_sockptr(tokens, optval, optlen)) {
                kvfree(tokens);
                return -EFAULT;
        }

        xa_lock_bh(&sk->sk_user_frags);
        for (i = 0; i < num_tokens; i++) {
                for (j = 0; j < tokens[i].token_count; j++) {
                        /* Bound the total work done in one call. */
                        if (++num_frags > MAX_DONTNEED_FRAGS)
                                goto frag_limit_reached;

                        netmem_ref netmem = (__force netmem_ref)__xa_erase(
                                &sk->sk_user_frags, tokens[i].token_start + j);

                        /* Skip entries that were empty or are not net_iov. */
                        if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
                                continue;

                        /* Erased entries are collected in a small on-stack
                         * batch. When the batch is full, the xarray lock is
                         * dropped while the references are put, then retaken.
                         */
                        netmems[netmem_num++] = netmem;
                        if (netmem_num == ARRAY_SIZE(netmems)) {
                                xa_unlock_bh(&sk->sk_user_frags);
                                for (k = 0; k < netmem_num; k++)
                                        WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
                                netmem_num = 0;
                                xa_lock_bh(&sk->sk_user_frags);
                        }
                        ret++;
                }
        }

frag_limit_reached:
        xa_unlock_bh(&sk->sk_user_frags);
        /* Put whatever is left in the final, partially filled batch. */
        for (k = 0; k < netmem_num; k++)
                WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));

        kvfree(tokens);
        return ret;
}
#endif

/*
 * Take the socket lock for a sockopt operation, unless the call originates
 * from a bpf program.
 */
void sockopt_lock_sock(struct sock *sk)
{
        /* When current->bpf_ctx is set, the setsockopt is called from
         * a bpf prog.  bpf has ensured the sk lock has been
         * acquired before calling setsockopt().
         */
        if (!has_current_bpf_ctx())
                lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);

/*
 * Release the socket lock after a sockopt operation. Nothing is released
 * when a bpf context is current.
 */
void sockopt_release_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);

bool sockopt_ns_capable(struct user_namespace *ns, int cap)
{
        return has_current_bpf_ctx() || ns_capable(ns, cap);
}
EXPORT_SYMBOL(sockopt_ns_capable);

bool sockopt_capable(int cap)
{
        return has_current_bpf_ctx() || capable(cap);
}
EXPORT_SYMBOL(sockopt_capable);

/*
 * Accept only CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_TAI.
 * Returns 0 for an accepted clock id and -EINVAL for anything else.
 */
static int sockopt_validate_clockid(__kernel_clockid_t value)
{
        if (value == CLOCK_REALTIME ||
            value == CLOCK_MONOTONIC ||
            value == CLOCK_TAI)
                return 0;

        return -EINVAL;
}

/*
 *        This is meant for all protocols to use and covers goings on
 *        at the socket level. Everything here is generic.
 */

int sk_setsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, unsigned int optlen)
{
        struct so_timestamping timestamping;
        struct socket *sock = sk->sk_socket;
        struct sock_txtime sk_txtime;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *        Options without arguments
         */

        if (optname == SO_BINDTODEVICE)
                return sock_setbindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        valbool = val ? 1 : 0;

        /* handle options which do not require locking the socket. */
        switch (optname) {
        case SO_PRIORITY:
                if (sk_set_prio_allowed(sk, val)) {
                        sock_set_priority(sk, val);
                        return 0;
                }
                return -EPERM;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                return -ENOPROTOOPT;
#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                if (val < 0)
                        return -EINVAL;
                WRITE_ONCE(sk->sk_ll_usec, val);
                return 0;
        case SO_PREFER_BUSY_POLL:
                if (valbool && !sockopt_capable(CAP_NET_ADMIN))
                        return -EPERM;
                WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
                return 0;
        case SO_BUSY_POLL_BUDGET:
                if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
                    !sockopt_capable(CAP_NET_ADMIN))
                        return -EPERM;
                if (val < 0 || val > U16_MAX)
                        return -EINVAL;
                WRITE_ONCE(sk->sk_busy_poll_budget, val);
                return 0;
#endif
        case SO_MAX_PACING_RATE:
                {
                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
                unsigned long pacing_rate;

                if (sizeof(ulval) != sizeof(val) &&
                    optlen >= sizeof(ulval) &&
                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
                        return -EFAULT;
                }
                if (ulval != ~0UL)
                        cmpxchg(&sk->sk_pacing_status,
                                SK_PACING_NONE,
                                SK_PACING_NEEDED);
                /* Pairs with READ_ONCE() from sk_getsockopt() */
                WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
                pacing_rate = READ_ONCE(sk->sk_pacing_rate);
                if (ulval < pacing_rate)
                        WRITE_ONCE(sk->sk_pacing_rate, ulval);
                return 0;
                }
        case SO_TXREHASH:
                if (!sk_is_tcp(sk))
                        return -EOPNOTSUPP;
                if (val < -1 || val > 1)
                        return -EINVAL;
                if ((u8)val == SOCK_TXREHASH_DEFAULT)
                        val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
                /* Paired with READ_ONCE() in tcp_rtx_synack()
                 * and sk_getsockopt().
                 */
                WRITE_ONCE(sk->sk_txrehash, (u8)val);
                return 0;
        case SO_PEEK_OFF:
                {
                int (*set_peek_off)(struct sock *sk, int val);

                set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
                if (set_peek_off)
                        ret = set_peek_off(sk, val);
                else
                        ret = -EOPNOTSUPP;
                return ret;
                }
#ifdef CONFIG_PAGE_POOL
        case SO_DEVMEM_DONTNEED:
                return sock_devmem_dontneed(sk, optval, optlen);
#endif
        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                return sock_set_timeout(&sk->sk_sndtimeo, optval,
                                        optlen, optname == SO_SNDTIMEO_OLD);
        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                return sock_set_timeout(&sk->sk_rcvtimeo, optval,
                                        optlen, optname == SO_RCVTIMEO_OLD);
        }

        sockopt_lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !sockopt_capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_REUSEPORT:
                if (valbool && !sk_is_inet(sk))
                        ret = -EOPNOTSUPP;
                else
                        sk->sk_reuseport = valbool;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                sk_dst_reset(sk);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
                /* Ensure val * 2 fits into an int, to prevent max_t()
                 * from treating it as a negative value.
                 */
                val = min_t(int, val, INT_MAX / 2);
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                WRITE_ONCE(sk->sk_sndbuf,
                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
                /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                if (val < 0)
                        val = 0;
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
                break;

        case SO_RCVBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                __sock_set_rcvbuf(sk, max(val, 0));
                break;

        case SO_KEEPALIVE:
                if (sk->sk_prot->keepalive)
                        sk->sk_prot->keepalive(sk, valbool);
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check_tx = valbool;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;        /* 1003.1g */
                        break;
                }
                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff) {
                        sock_reset_flag(sk, SOCK_LINGER);
                } else {
                        unsigned long t_sec = ling.l_linger;

                        if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
                                WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
                        else
                                WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                break;

        case SO_TIMESTAMP_OLD:
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
                sock_set_timestamp(sk, optname, valbool);
                break;

        case SO_TIMESTAMPING_NEW:
        case SO_TIMESTAMPING_OLD:
                if (optlen == sizeof(timestamping)) {
                        if (copy_from_sockptr(&timestamping, optval,
                                              sizeof(timestamping))) {
                                ret = -EFAULT;
                                break;
                        }
                } else {
                        memset(&timestamping, 0, sizeof(timestamping));
                        timestamping.flags = val;
                }
                ret = sock_set_timestamping(sk, optname, timestamping);
                break;

        case SO_RCVLOWAT:
                {
                int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

                if (val < 0)
                        val = INT_MAX;
                if (sock)
                        set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
                if (set_rcvlowat)
                        ret = set_rcvlowat(sk, val);
                else
                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
                break;
                }
        case SO_ATTACH_FILTER: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_BPF:
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_attach_bpf(ufd, sk);
                }
                break;

        case SO_ATTACH_REUSEPORT_CBPF: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_reuseport_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_REUSEPORT_EBPF:
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_reuseport_attach_bpf(ufd, sk);
                }
                break;

        case SO_DETACH_REUSEPORT_BPF:
                ret = reuseport_detach_prog(sk);
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_LOCK_FILTER:
                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
                        ret = -EPERM;
                else
                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
                break;

        case SO_MARK:
                if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                __sock_set_mark(sk, val);
                break;
        case SO_RCVMARK:
                sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
                break;

        case SO_RCVPRIORITY:
                sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
                break;

        case SO_RXQ_OVFL:
                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
                break;

        case SO_WIFI_STATUS:
                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
                break;

        case SO_NOFCS:
                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
                break;

        case SO_SELECT_ERR_QUEUE:
                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
                break;

        case SO_PASSCRED:
                if (sk_may_scm_recv(sk))
                        sk->sk_scm_credentials = valbool;
                else
                        ret = -EOPNOTSUPP;
                break;

        case SO_PASSSEC:
                if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
                        sk->sk_scm_security = valbool;
                else
                        ret = -EOPNOTSUPP;
                break;

        case SO_PASSPIDFD:
                if (sk_is_unix(sk))
                        sk->sk_scm_pidfd = valbool;
                else
                        ret = -EOPNOTSUPP;
                break;

        case SO_PASSRIGHTS:
                if (sk_is_unix(sk))
                        sk->sk_scm_rights = valbool;
                else
                        ret = -EOPNOTSUPP;
                break;

        case SO_INCOMING_CPU:
                reuseport_update_incoming_cpu(sk, val);
                break;

        case SO_CNX_ADVICE:
                if (val == 1)
                        dst_negative_advice(sk);
                break;

        case SO_ZEROCOPY:
                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
                        if (!(sk_is_tcp(sk) ||
                              (sk->sk_type == SOCK_DGRAM &&
                               sk->sk_protocol == IPPROTO_UDP)))
                                ret = -EOPNOTSUPP;
                } else if (sk->sk_family != PF_RDS) {
                        ret = -EOPNOTSUPP;
                }
                if (!ret) {
                        if (val < 0 || val > 1)
                                ret = -EINVAL;
                        else
                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
                }
                break;

        case SO_TXTIME:
                if (optlen != sizeof(struct sock_txtime)) {
                        ret = -EINVAL;
                        break;
                } else if (copy_from_sockptr(&sk_txtime, optval,
                           sizeof(struct sock_txtime))) {
                        ret = -EFAULT;
                        break;
                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
                 * scheduler has enough safe guards.
                 */
                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                ret = sockopt_validate_clockid(sk_txtime.clockid);
                if (ret)
                        break;

                sock_valbool_flag(sk, SOCK_TXTIME, true);
                sk->sk_clockid = sk_txtime.clockid;
                sk->sk_txtime_deadline_mode =
                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
                sk->sk_txtime_report_errors =
                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
                break;

        case SO_BINDTOIFINDEX:
                ret = sock_bindtoindex_locked(sk, val);
                break;

        case SO_BUF_LOCK:
                if (val & ~SOCK_BUF_LOCK_MASK) {
                        ret = -EINVAL;
                        break;
                }
                sk->sk_userlocks = val | (sk->sk_userlocks &
                                          ~SOCK_BUF_LOCK_MASK);
                break;

        case SO_RESERVE_MEM:
        {
                int delta;

                if (val < 0) {
                        ret = -EINVAL;
                        break;
                }

                delta = val - sk->sk_reserved_mem;
                if (delta < 0)
                        sock_release_reserved_memory(sk, -delta);
                else
                        ret = sock_reserve_memory(sk, delta);
                break;
        }

        default:
                ret = -ENOPROTOOPT;
                break;
        }
        sockopt_release_sock(sk);
        return ret;
}

/*
 * Set a socket option on behalf of a struct socket: the request is handed
 * to sk_setsockopt() for the struct sock attached to @sock, and that
 * function's return value is passed back unchanged.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
                    sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        return sk_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_setsockopt);

/*
 * Fetch sk->sk_peer_cred through get_cred() while sk_peer_lock is held,
 * so the pointer cannot be swapped between the load and the get_cred()
 * call. The SO_PEERGROUPS user in this file treats a NULL result as
 * "no data" and releases a non-NULL one with put_cred().
 */
static const struct cred *sk_get_peer_cred(struct sock *sk)
{
        const struct cred *peer;

        spin_lock(&sk->sk_peer_lock);
        peer = get_cred(sk->sk_peer_cred);
        spin_unlock(&sk->sk_peer_lock);

        return peer;
}

/*
 * Fill @ucred from a struct pid and a set of credentials.
 *
 * The pid is translated with pid_vnr(). The uid/gid are the effective ids
 * of @cred mapped into the caller's user namespace; when @cred is NULL
 * both are reported as -1.
 */
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
                          struct ucred *ucred)
{
        struct user_namespace *ns;

        ucred->pid = pid_vnr(pid);

        if (!cred) {
                /* No credentials available: report "unknown" ids. */
                ucred->gid = -1;
                ucred->uid = -1;
                return;
        }

        ns = current_user_ns();
        ucred->uid = from_kuid_munged(ns, cred->euid);
        ucred->gid = from_kgid_munged(ns, cred->egid);
}

/*
 * Copy every group id in @src to @dst as a packed array of gid_t, each
 * one mapped into the caller's user namespace first.
 *
 * Returns 0 on success, or -EFAULT as soon as one element cannot be
 * copied (earlier elements may already have been written).
 */
static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
        struct user_namespace *ns = current_user_ns();
        size_t offset = 0;
        int idx;

        for (idx = 0; idx < src->ngroups; idx++) {
                gid_t mapped = from_kgid_munged(ns, src->gid[idx]);

                if (copy_to_sockptr_offset(dst, offset, &mapped,
                                           sizeof(mapped)))
                        return -EFAULT;

                offset += sizeof(mapped);
        }

        return 0;
}

/*
 * sk_getsockopt - read one socket option into a caller-supplied buffer
 * @sk: socket being queried
 * @level: not referenced anywhere in this function
 * @optname: SO_* selector of the option to read
 * @optval: destination buffer for the option value
 * @optlen: on entry, an int holding the size of @optval; on the common
 *          success path it is rewritten with the number of bytes stored
 *
 * Most options stage their value in the zeroed union 'v' and fall through
 * to the shared copy-out at the bottom, which truncates the result to the
 * smaller of the caller's length and the option's natural size 'lv'.
 * Options that need their own copy-out either jump to 'lenout' (only the
 * length is written back) or return directly.
 *
 * Returns 0 on success or a negative errno; a few options (SO_PEERSEC,
 * SO_BINDTODEVICE) return whatever their helper returns.
 */
int sk_getsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, sockptr_t optlen)
{
        struct socket *sock = sk->sk_socket;

        /* Staging area shared by every option that uses the common copy-out. */
        union {
                int val;
                u64 val64;
                unsigned long ulval;
                struct linger ling;
                struct old_timeval32 tm32;
                struct __kernel_old_timeval tm;
                struct  __kernel_sock_timeval stm;
                struct sock_txtime txtime;
                struct so_timestamping timestamping;
        } v;

        /* Natural size of the option value; int unless a case overrides it. */
        int lv = sizeof(int);
        int len;

        if (copy_from_sockptr(&len, optlen, sizeof(int)))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* Zero the whole union so bytes a case does not set are copied out as 0. */
        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = READ_ONCE(sk->sk_sndbuf);
                break;

        case SO_RCVBUF:
                v.val = READ_ONCE(sk->sk_rcvbuf);
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_REUSEPORT:
                v.val = sk->sk_reuseport;
                break;

        case SO_KEEPALIVE:
                v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                /* Negated sock_error() result; if that is 0, fetch and
                 * clear sk_err_soft instead.
                 */
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check_tx;
                break;

        case SO_PRIORITY:
                v.val = READ_ONCE(sk->sk_priority);
                break;

        case SO_LINGER:
                lv                = sizeof(v.ling);
                v.ling.l_onoff        = sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger        = READ_ONCE(sk->sk_lingertime) / HZ;
                break;

        case SO_BSDCOMPAT:
                /* Nothing to read: the zeroed int in 'v' is copied out. */
                break;

        case SO_TIMESTAMP_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMP_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPNS_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                lv = sizeof(v.timestamping);
                /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
                 * returning the flags when they were set through the same option.
                 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
                 */
                if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
                        v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
                        v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
                }
                break;

        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                /* sock_get_timeout() fills 'v' and returns the size to copy out. */
                lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
                                      SO_RCVTIMEO_OLD == optname);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
                                      SO_SNDTIMEO_OLD == optname);
                break;

        case SO_RCVLOWAT:
                v.val = READ_ONCE(sk->sk_rcvlowat);
                break;

        case SO_SNDLOWAT:
                v.val = 1;
                break;

        case SO_PASSCRED:
                if (!sk_may_scm_recv(sk))
                        return -EOPNOTSUPP;

                v.val = sk->sk_scm_credentials;
                break;

        case SO_PASSPIDFD:
                if (!sk_is_unix(sk))
                        return -EOPNOTSUPP;

                v.val = sk->sk_scm_pidfd;
                break;

        case SO_PASSRIGHTS:
                if (!sk_is_unix(sk))
                        return -EOPNOTSUPP;

                v.val = sk->sk_scm_rights;
                break;

        case SO_PEERCRED:
        {
                struct ucred peercred;
                if (len > sizeof(peercred))
                        len = sizeof(peercred);

                /* Read peer pid and cred together under sk_peer_lock. */
                spin_lock(&sk->sk_peer_lock);
                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
                spin_unlock(&sk->sk_peer_lock);

                if (copy_to_sockptr(optval, &peercred, len))
                        return -EFAULT;
                goto lenout;
        }

        case SO_PEERPIDFD:
        {
                struct pid *peer_pid;
                struct file *pidfd_file = NULL;
                unsigned int flags = 0;
                int pidfd;

                if (len > sizeof(pidfd))
                        len = sizeof(pidfd);

                spin_lock(&sk->sk_peer_lock);
                peer_pid = get_pid(sk->sk_peer_pid);
                spin_unlock(&sk->sk_peer_lock);

                if (!peer_pid)
                        return -ENODATA;

                /* The use of PIDFD_STALE requires stashing of struct pid
                 * on pidfs with pidfs_register_pid() and only AF_UNIX
                 * were prepared for this.
                 */
                if (sk->sk_family == AF_UNIX)
                        flags = PIDFD_STALE;

                pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
                put_pid(peer_pid);
                if (pidfd < 0)
                        return pidfd;

                /* Copy out before fd_install(): if either copy fails the
                 * reserved fd number and the file are released here and
                 * fd_install() is never reached.
                 */
                if (copy_to_sockptr(optval, &pidfd, len) ||
                    copy_to_sockptr(optlen, &len, sizeof(int))) {
                        put_unused_fd(pidfd);
                        fput(pidfd_file);

                        return -EFAULT;
                }

                fd_install(pidfd, pidfd_file);
                return 0;
        }

        case SO_PEERGROUPS:
        {
                const struct cred *cred;
                int ret, n;

                cred = sk_get_peer_cred(sk);
                if (!cred)
                        return -ENODATA;

                n = cred->group_info->ngroups;
                /* Buffer too small: report the required size through
                 * optlen and fail with -ERANGE.
                 */
                if (len < n * sizeof(gid_t)) {
                        len = n * sizeof(gid_t);
                        put_cred(cred);
                        return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
                }
                len = n * sizeof(gid_t);

                ret = groups_to_user(optval, cred->group_info);
                put_cred(cred);
                if (ret)
                        return ret;
                goto lenout;
        }

        case SO_PEERNAME:
        {
                struct sockaddr_storage address;

                /* On success lv is the address length returned by ->getname(). */
                lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
                if (lv < 0)
                        return -ENOTCONN;
                if (lv < len)
                        return -EINVAL;
                if (copy_to_sockptr(optval, &address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
                        return -EOPNOTSUPP;

                v.val = sk->sk_scm_security;
                break;

        case SO_PEERSEC:
                return security_socket_getpeersec_stream(sock,
                                                         optval, optlen, len);

        case SO_MARK:
                v.val = READ_ONCE(sk->sk_mark);
                break;

        case SO_RCVMARK:
                v.val = sock_flag(sk, SOCK_RCVMARK);
                break;

        case SO_RCVPRIORITY:
                v.val = sock_flag(sk, SOCK_RCVPRIORITY);
                break;

        case SO_RXQ_OVFL:
                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        case SO_WIFI_STATUS:
                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;

        case SO_PEEK_OFF:
                /* Only supported when the socket ops provide set_peek_off. */
                if (!READ_ONCE(sock->ops)->set_peek_off)
                        return -EOPNOTSUPP;

                v.val = READ_ONCE(sk->sk_peek_off);
                break;
        case SO_NOFCS:
                v.val = sock_flag(sk, SOCK_NOFCS);
                break;

        case SO_BINDTODEVICE:
                return sock_getbindtodevice(sk, optval, optlen, len);

        case SO_GET_FILTER:
                /* sk_get_filter() does its own copy-out; a non-negative
                 * result becomes the length written back via optlen.
                 */
                len = sk_get_filter(sk, optval, len);
                if (len < 0)
                        return len;

                goto lenout;

        case SO_LOCK_FILTER:
                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
                break;

        case SO_BPF_EXTENSIONS:
                v.val = bpf_tell_extensions();
                break;

        case SO_SELECT_ERR_QUEUE:
                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                v.val = READ_ONCE(sk->sk_ll_usec);
                break;
        case SO_PREFER_BUSY_POLL:
                v.val = READ_ONCE(sk->sk_prefer_busy_poll);
                break;
#endif

        case SO_MAX_PACING_RATE:
                /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
                        lv = sizeof(v.ulval);
                        v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
                } else {
                        /* 32bit version */
                        v.val = min_t(unsigned long, ~0U,
                                      READ_ONCE(sk->sk_max_pacing_rate));
                }
                break;

        case SO_INCOMING_CPU:
                v.val = READ_ONCE(sk->sk_incoming_cpu);
                break;

        case SO_MEMINFO:
        {
                u32 meminfo[SK_MEMINFO_VARS];

                sk_get_meminfo(sk, meminfo);

                len = min_t(unsigned int, len, sizeof(meminfo));
                if (copy_to_sockptr(optval, &meminfo, len))
                        return -EFAULT;

                goto lenout;
        }

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_INCOMING_NAPI_ID:
                v.val = READ_ONCE(sk->sk_napi_id);

                /* aggregate non-NAPI IDs down to 0 */
                if (!napi_id_valid(v.val))
                        v.val = 0;

                break;
#endif

        case SO_COOKIE:
                /* 64-bit value: the caller's buffer must hold at least a u64. */
                lv = sizeof(u64);
                if (len < lv)
                        return -EINVAL;
                v.val64 = sock_gen_cookie(sk);
                break;

        case SO_ZEROCOPY:
                v.val = sock_flag(sk, SOCK_ZEROCOPY);
                break;

        case SO_TXTIME:
                lv = sizeof(v.txtime);
                v.txtime.clockid = sk->sk_clockid;
                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
                                  SOF_TXTIME_DEADLINE_MODE : 0;
                v.txtime.flags |= sk->sk_txtime_report_errors ?
                                  SOF_TXTIME_REPORT_ERRORS : 0;
                break;

        case SO_BINDTOIFINDEX:
                v.val = READ_ONCE(sk->sk_bound_dev_if);
                break;

        case SO_NETNS_COOKIE:
                /* Unlike SO_COOKIE, the length must be exactly sizeof(u64). */
                lv = sizeof(u64);
                if (len != lv)
                        return -EINVAL;
                v.val64 = sock_net(sk)->net_cookie;
                break;

        case SO_BUF_LOCK:
                v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
                break;

        case SO_RESERVE_MEM:
                v.val = READ_ONCE(sk->sk_reserved_mem);
                break;

        case SO_TXREHASH:
                if (!sk_is_tcp(sk))
                        return -EOPNOTSUPP;

                /* Paired with WRITE_ONCE() in sk_setsockopt() */
                v.val = READ_ONCE(sk->sk_txrehash);
                break;

        default:
                /* We implement the SO_SNDLOWAT etc to not be settable
                 * (1003.1g 7).
                 */
                return -ENOPROTOOPT;
        }

        /* Common copy-out: never write more than the option's natural size. */
        if (len > lv)
                len = lv;
        if (copy_to_sockptr(optval, &v, len))
                return -EFAULT;
lenout:
        /* Tell the caller how many bytes were stored in optval. */
        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;
        return 0;
}

/*
 * Set up sk->sk_lock for a freshly created socket.
 *
 * The owner is cleared first, then the lock is initialised and registered
 * with the lock validator. The class keys and names are picked per address
 * family, with a separate set of tables for sockets flagged sk_kern_sock.
 */
static inline void sock_lock_init(struct sock *sk)
{
        sk_owner_clear(sk);

        if (!sk->sk_kern_sock) {
                sock_lock_init_class_and_name(
                        sk,
                        af_family_slock_key_strings[sk->sk_family],
                        &af_family_slock_keys[sk->sk_family],
                        af_family_key_strings[sk->sk_family],
                        &af_family_keys[sk->sk_family]);
                return;
        }

        sock_lock_init_class_and_name(
                sk,
                af_family_kern_slock_key_strings[sk->sk_family],
                &af_family_kern_slock_keys[sk->sk_family],
                af_family_kern_key_strings[sk->sk_family],
                &af_family_kern_keys[sk->sk_family]);
}

/*
 * Clone the contents of @osk into @nsk in two pieces, skipping the region
 * between sk_dontcopy_begin and sk_dontcopy_end. nsk->sk_refcnt must not
 * change here, not even transiently, because of RCU lookups; sk_node is
 * likewise left untouched.
 *
 * With CONFIG_SECURITY_NETWORK, nsk's own sk_security pointer is saved
 * before the copy and put back afterwards, then security_sk_clone() is
 * called for the pair.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
        const struct proto *proto = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
        void *saved_security = nsk->sk_security;
#endif

        /* sk_tx_queue_mapping has to stay inside the not-copied window.
         * Should it ever move out, verify that sk_clone_lock() still
         * calls sk_tx_queue_clear() after sock_copy().
         */
        BUILD_BUG_ON(!(offsetof(struct sock, sk_tx_queue_mapping) >=
                       offsetof(struct sock, sk_dontcopy_begin) &&
                       offsetof(struct sock, sk_tx_queue_mapping) <
                       offsetof(struct sock, sk_dontcopy_end)));

        /* Part one: everything ahead of the not-copied window. */
        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

        /* Part two: from the end of the window up to the protocol's
         * full object size.
         */
        unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
                      proto->obj_size - offsetof(struct sock, sk_dontcopy_end),
                      /* alloc is larger than struct, see sk_prot_alloc() */);

#ifdef CONFIG_SECURITY_NETWORK
        nsk->sk_security = saved_security;
        security_sk_clone(osk, nsk);
#endif
}

/*
 * Allocate the memory for one socket of protocol @prot.
 *
 * The object comes from the protocol's slab cache when it has one,
 * otherwise from kmalloc() with prot->obj_size. On top of the raw
 * allocation this sets up the security blob and takes a reference on the
 * protocol's owning module; if either step fails everything acquired so
 * far is released.
 *
 * Returns the new socket, or NULL on any failure.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct kmem_cache *cache = prot->slab;
        struct sock *sk;

        if (cache) {
                /* __GFP_ZERO is stripped for the slab path; zeroing, when
                 * wanted, is done by sk_prot_clear_nulls() instead.
                 */
                sk = kmem_cache_alloc(cache, priority & ~__GFP_ZERO);
                if (sk && want_init_on_alloc(priority))
                        sk_prot_clear_nulls(sk, prot->obj_size);
        } else {
                sk = kmalloc(prot->obj_size, priority);
        }

        if (!sk)
                return NULL;

        if (security_sk_alloc(sk, family, priority))
                goto err_free;

        if (!try_module_get(prot->owner))
                goto err_free_security;

        return sk;

err_free_security:
        security_sk_free(sk);
err_free:
        if (cache)
                kmem_cache_free(cache, sk);
        else
                kfree(sk);
        return NULL;
}

/*
 * Release a socket object obtained from sk_prot_alloc().
 *
 * Drops the cgroup, memcg and security state plus the sk owner reference,
 * returns the memory to the slab cache (or kfree() when the protocol has
 * none) and finally puts the reference on the protocol's owning module.
 * prot->owner and prot->slab are sampled up front, before anything is torn
 * down.
 */
static void sk_prot_free(struct proto *prot, struct sock *sk)
{
        struct module *owner = prot->owner;
        struct kmem_cache *cache = prot->slab;

        cgroup_sk_free(&sk->sk_cgrp_data);
        mem_cgroup_sk_free(sk);
        security_sk_free(sk);

        sk_owner_put(sk);

        if (cache)
                kmem_cache_free(cache, sk);
        else
                kfree(sk);

        module_put(owner);
}

/**
 *        sk_alloc - All socket objects are allocated here
 *        @net: the applicable net namespace
 *        @family: protocol family
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *        @prot: struct proto associated with this new sock instance
 *        @kern: is this to be a kernel socket?
 *
 *        Return: the newly allocated and initialised socket, or %NULL when
 *        the underlying allocation fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern)
{
        struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

        if (!sk)
                return NULL;

        sk->sk_family = family;
        /*
         * sk_prot_creator is kept alongside sk_prot; the comment in the
         * struct sock definition explains why it is needed -acme
         */
        sk->sk_prot_creator = prot;
        sk->sk_prot = prot;
        sk->sk_kern_sock = kern;
        sock_lock_init(sk);

        /* Non-kernel sockets take a tracked netns reference and are
         * counted as in use; kernel sockets take a passive one instead.
         */
        sk->sk_net_refcnt = kern ? 0 : 1;
        if (likely(sk->sk_net_refcnt)) {
                get_net_track(net, &sk->ns_tracker, priority);
                sock_inuse_add(net, 1);
        } else {
                net_passive_inc(net);
                __netns_tracker_alloc(net, &sk->ns_tracker,
                                      false, priority);
        }

        sock_net_set(sk, net);
        refcount_set(&sk->sk_wmem_alloc, 1);

        mem_cgroup_sk_alloc(sk);
        cgroup_sk_alloc(&sk->sk_cgrp_data);
        sock_update_classid(&sk->sk_cgrp_data);
        sock_update_netprioidx(&sk->sk_cgrp_data);
        sk_tx_queue_clear(sk);

        return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Final teardown of a sock, keyed by its embedded sk_rcu head.
 *
 * Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 * sk_destruct() either invokes it directly or queues it with call_rcu().
 */
static void __sk_destruct(struct rcu_head *head)
{
        struct sock *sk = container_of(head, struct sock, sk_rcu);
        struct net *net = sock_net(sk);
        struct sk_filter *filter;

        /* Protocol/family specific destructor, if one was installed. */
        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        /* Detach the attached filter, if any, and undo its charge. */
        filter = rcu_dereference_check(sk->sk_filter,
                                       refcount_read(&sk->sk_wmem_alloc) == 0);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                RCU_INIT_POINTER(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
        bpf_sk_storage_free(sk);
#endif

        /* Option memory still accounted at this point is only reported. */
        if (atomic_read(&sk->sk_omem_alloc))
                pr_debug("%s: optmem leakage (%d bytes) detected\n",
                         __func__, atomic_read(&sk->sk_omem_alloc));

        /* Drop the page held by the per-socket page fragment. */
        if (sk->sk_frag.page) {
                put_page(sk->sk_frag.page);
                sk->sk_frag.page = NULL;
        }

        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
        put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);

        /* Undo the netns accounting chosen by sk_net_refcnt: a tracked
         * reference, or a tracker plus the passive count.
         */
        if (likely(sk->sk_net_refcnt)) {
                put_net_track(net, &sk->ns_tracker);
        } else {
                __netns_tracker_free(net, &sk->ns_tracker, false);
                net_passive_dec(net);
        }
        /* Free with the creating proto (sk_prot_creator), not sk_prot. */
        sk_prot_free(sk->sk_prot_creator, sk);
}

/*
 * Switch a socket that does not hold a netns reference (sk_net_refcnt == 0)
 * over to holding one: the tracker and passive count are released, then
 * sk_net_refcnt is set, a tracked reference is taken with GFP_KERNEL and
 * the socket is added to the in-use count.
 *
 * Warns once if sk_net_refcnt is already set; the conversion still
 * proceeds in that case.
 */
void sk_net_refcnt_upgrade(struct sock *sk)
{
        struct net *net = sock_net(sk);

        WARN_ON_ONCE(sk->sk_net_refcnt);
        /* Drop the accounting used while sk_net_refcnt was clear. */
        __netns_tracker_free(net, &sk->ns_tracker, false);
        net_passive_dec(net);
        /* Take the accounting used when sk_net_refcnt is set. */
        sk->sk_net_refcnt = 1;
        get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
        sock_inuse_add(net, 1);
}
EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);

/*
 * Destroy @sk via __sk_destruct(). The call is deferred through call_rcu()
 * when the socket has SOCK_RCU_FREE set or was attached to a reuseport
 * group (it is detached here first); otherwise it happens immediately.
 */
void sk_destruct(struct sock *sk)
{
        bool defer = sock_flag(sk, SOCK_RCU_FREE);

        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
                reuseport_detach_sock(sk);
                defer = true;
        }

        if (!defer) {
                __sk_destruct(&sk->sk_rcu);
                return;
        }

        call_rcu(&sk->sk_rcu, __sk_destruct);
}

/*
 * Called once sk_wmem_alloc has dropped to zero. Sockets holding a netns
 * reference leave the in-use count; if such a socket also has destroy
 * listeners registered with sock_diag, destruction is handed to
 * sock_diag_broadcast_destroy(), otherwise sk_destruct() runs directly.
 */
static void __sk_free(struct sock *sk)
{
        if (likely(sk->sk_net_refcnt))
                sock_inuse_add(sock_net(sk), -1);

        if (likely(!sk->sk_net_refcnt ||
                   !sock_diag_has_destroy_listeners(sk))) {
                sk_destruct(sk);
                return;
        }

        sock_diag_broadcast_destroy(sk);
}

/*
 * Drop one unit of sk_wmem_alloc. If the counter does not reach zero,
 * packets are still sitting in some tx queue and sock_wfree() will call
 * __sk_free() later, once the last of them is released.
 */
void sk_free(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_wmem_alloc))
                return;

        __sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * Initialise the receive, write and error queues and the callback lock of
 * @sk, and assign each of their locks a lockdep class selected by the
 * socket's address family. The callback lock uses a separate class table
 * for kernel sockets.
 */
static void sk_init_common(struct sock *sk)
{
        const int fam = sk->sk_family;

        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);

        rwlock_init(&sk->sk_callback_lock);

        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
                                   af_rlock_keys + fam,
                                   af_family_rlock_key_strings[fam]);
        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
                                   af_wlock_keys + fam,
                                   af_family_wlock_key_strings[fam]);
        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
                                   af_elock_keys + fam,
                                   af_family_elock_key_strings[fam]);

        if (sk->sk_kern_sock) {
                lockdep_set_class_and_name(&sk->sk_callback_lock,
                                           af_kern_callback_keys + fam,
                                           af_family_kern_clock_key_strings[fam]);
        } else {
                lockdep_set_class_and_name(&sk->sk_callback_lock,
                                           af_callback_keys + fam,
                                           af_family_clock_key_strings[fam]);
        }
}

/**
 *        sk_clone_lock - clone a socket, and lock its clone
 *        @sk: the socket to clone
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *        The clone starts out as a copy of @sk made by sock_copy(); state
 *        that must not be shared with the parent (locks, queues, memory
 *        accounting, error state, ...) is then reinitialised below.
 *
 *        On success the clone is returned with bh_lock_sock() held.
 *        Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *        Return: the locked clone, or %NULL if the allocation, the filter
 *        charge, the xfrm policy clone or the BPF storage clone fails.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct proto *prot = READ_ONCE(sk->sk_prot);
        struct sk_filter *filter;
        bool is_charged = true;
        struct sock *newsk;

        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
        if (!newsk)
                goto out;

        sock_copy(newsk, sk);

        /* Record the proto the clone was allocated from. */
        newsk->sk_prot_creator = prot;

        /* SANITY */
        if (likely(newsk->sk_net_refcnt)) {
                get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
                sock_inuse_add(sock_net(newsk), 1);
        } else {
                /* Kernel sockets are not elevating the struct net refcount.
                 * Instead, use a tracker to more easily detect if a layer
                 * is not properly dismantling its kernel sockets at netns
                 * destroy time.
                 */
                net_passive_inc(sock_net(newsk));
                __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
                                      false, priority);
        }
        /* Fresh list node and lock; the clone is locked from here on. */
        sk_node_init(&newsk->sk_node);
        sock_lock_init(newsk);
        bh_lock_sock(newsk);
        newsk->sk_backlog.head        = newsk->sk_backlog.tail = NULL;
        newsk->sk_backlog.len = 0;

        atomic_set(&newsk->sk_rmem_alloc, 0);

        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
        refcount_set(&newsk->sk_wmem_alloc, 1);

        atomic_set(&newsk->sk_omem_alloc, 0);
        sk_init_common(newsk);

        /* Reset per-socket routing, accounting and send state. */
        newsk->sk_dst_cache        = NULL;
        newsk->sk_dst_pending_confirm = 0;
        newsk->sk_wmem_queued        = 0;
        newsk->sk_forward_alloc = 0;
        newsk->sk_reserved_mem  = 0;
        DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
        sk_drops_reset(newsk);
        newsk->sk_send_head        = NULL;
        /* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
        newsk->sk_userlocks        = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
        atomic_set(&newsk->sk_zckey, 0);

        sock_reset_flag(newsk, SOCK_DONE);

#ifdef CONFIG_MEMCG
        /* sk->sk_memcg will be populated at accept() time */
        newsk->sk_memcg = NULL;
#endif

        cgroup_sk_clone(&newsk->sk_cgrp_data);

        /* Share the parent's filter with the clone, charging it to newsk. */
        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                /* though it's an empty new sock, the charging may fail
                 * if sysctl_optmem_max was changed between creation of
                 * original socket and cloning
                 */
                is_charged = sk_filter_charge(newsk, filter);
        RCU_INIT_POINTER(newsk->sk_filter, filter);
        rcu_read_unlock();

        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
                /* We need to make sure that we don't uncharge the new
                 * socket if we couldn't charge it in the first place
                 * as otherwise we uncharge the parent's filter.
                 */
                if (!is_charged)
                        RCU_INIT_POINTER(newsk->sk_filter, NULL);

                goto free;
        }

        /* The clone does not inherit the parent's reuseport pointer. */
        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

        if (bpf_sk_storage_clone(sk, newsk))
                goto free;

        /* Clear sk_user_data if parent had the pointer tagged
         * as not suitable for copying when cloning.
         */
        if (sk_user_data_is_nocopy(newsk))
                newsk->sk_user_data = NULL;

        newsk->sk_err           = 0;
        newsk->sk_err_soft = 0;
        newsk->sk_priority = 0;
        newsk->sk_incoming_cpu = raw_smp_processor_id();

        /* Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&newsk->sk_refcnt, 2);

        /* The clone has no struct socket or wait queue attached yet. */
        sk_set_socket(newsk, NULL);
        sk_tx_queue_clear(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, NULL);

        if (newsk->sk_prot->sockets_allocated)
                sk_sockets_allocated_inc(newsk);

        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                net_enable_timestamp();
out:
        return newsk;
free:
        /* It is still raw copy of parent, so invalidate
         * destructor and make plain sk_free()
         */
        newsk->sk_destruct = NULL;
        bh_unlock_sock(newsk);
        sk_free(newsk);
        newsk = NULL;
        goto out;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

/*
 * Compute the GSO size limit for @sk on @dev. The device's gso_max_size is
 * used for AF_INET6 sockets whose receive address is not v4-mapped (only
 * when IPv6 is enabled), gso_ipv4_max_size otherwise. Non-TCP sockets are
 * capped at GSO_LEGACY_MAX_SIZE. MAX_TCP_HEADER + 1 is subtracted from the
 * result.
 */
static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
{
        bool native_v6 = false;
        u32 limit;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6 &&
            !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
                native_v6 = true;
#endif
        /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
        if (native_v6)
                limit = READ_ONCE(dev->gso_max_size);
        else
                limit = READ_ONCE(dev->gso_ipv4_max_size);

        if (limit > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
                limit = GSO_LEGACY_MAX_SIZE;

        return limit - (MAX_TCP_HEADER + 1);
}

/*
 * Derive the socket's route capabilities (sk_route_caps) and GSO limits
 * (sk_gso_max_size, sk_gso_max_segs) from the device behind @dst, then
 * install @dst on the socket with sk_dst_set().
 *
 * The device lookup and every use of it happen inside a single RCU
 * read-side section.
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
        const struct net_device *dev;
        u32 max_segs = 1;

        rcu_read_lock();
        dev = dst_dev_rcu(dst);
        /* Start from the device's feature set. */
        sk->sk_route_caps = dev->features;
        if (sk_is_tcp(sk)) {
                struct inet_connection_sock *icsk = inet_csk(sk);

                /* TCP sockets always get NETIF_F_GSO. */
                sk->sk_route_caps |= NETIF_F_GSO;
                icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
        }
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
        /* sk_gso_disabled strips every GSO capability again. */
        if (unlikely(sk->sk_gso_disabled))
                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
        if (sk_can_gso(sk)) {
                /* A dst with a header_len is only kept GSO-capable when
                 * xfrm_dst_offload_ok() accepts it.
                 */
                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                } else {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
                        max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
                }
        }
        /* Stays at 1 unless the GSO branch above raised it. */
        sk->sk_gso_max_segs = max_segs;
        sk_dst_set(sk, dst);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *        Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 *
 * Gives skb->truesize back to the owning socket's sk_wmem_alloc, notifies
 * the socket that write space is available (unless SOCK_USE_WRITE_QUEUE is
 * set) and frees the socket via __sk_free() if this was the last charge.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int len = skb->truesize;
        bool free;

        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
                /* SOCK_RCU_FREE sockets using the default write-space
                 * callback: the whole charge is dropped and the wakeup is
                 * issued inside one RCU read-side section.
                 */
                if (sock_flag(sk, SOCK_RCU_FREE) &&
                    sk->sk_write_space == sock_def_write_space) {
                        rcu_read_lock();
                        free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
                        sock_def_write_space_wfree(sk);
                        rcu_read_unlock();
                        if (unlikely(free))
                                __sk_free(sk);
                        return;
                }

                /*
                 * Keep a reference on sk_wmem_alloc, this will be released
                 * after sk_write_space() call
                 */
                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
                sk->sk_write_space(sk);
                /* Only the unit kept above is left to drop below. */
                len = 1;
        }
        /*
         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
         * could not do because of in-flight packets
         */
        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
                __sk_free(sk);
}

/*
 * Make @sk the owner of @skb on the write path: any previous owner is
 * dropped with skb_orphan(), sock_wfree() becomes the destructor and
 * skb->truesize is charged to sk_wmem_alloc.
 *
 * With CONFIG_INET, sockets that are not full sockets are handed to
 * skb_set_owner_edemux() instead.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
#ifdef CONFIG_INET
        if (unlikely(!sk_fullsock(sk))) {
                skb_set_owner_edemux(skb, sk);
                return;
        }
#endif
        skb->sk = sk;
        skb->destructor = sock_wfree;
        skb_set_hash_from_sk(skb, sk);

        /*
         * No reference on sk is taken here any more: charging
         * sk_wmem_alloc is sufficient to keep sk_free() from releasing
         * the sock while packets are still in flight.
         */
        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
        /* Drivers depend on in-order delivery for crypto offload,
         * partial orphan breaks out-of-order-OK logic.
         */
        if (skb_is_decrypted(skb))
                return false;

        return (skb->destructor == sock_wfree ||
                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* Helper for netem, which can hold packets in its delay queue.
 *
 * The owner socket should be allowed to send more packets, as if the held
 * ones had already been TX completed by a typical driver, while skb->sk
 * stays set because some packet schedulers rely on it (sch_fq for
 * example).
 *
 * Pure TCP ACKs are left untouched. Otherwise ownership is converted with
 * skb_set_owner_sk_safe() when the skb is eligible; if the skb is not
 * eligible or the conversion fails, the skb is fully orphaned.
 */
void skb_orphan_partial(struct sk_buff *skb)
{
        if (skb_is_tcp_pure_ack(skb))
                return;

        if (!can_skb_orphan_partial(skb) ||
            !skb_set_owner_sk_safe(skb, skb->sk))
                skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 *
 * Subtracts skb->truesize from the socket's sk_rmem_alloc and uncharges
 * the same amount through sk_mem_uncharge().
 */
void sock_rfree(struct sk_buff *skb)
{
        unsigned int truesize = skb->truesize;
        struct sock *sk = skb->sk;

        atomic_sub(truesize, &sk->sk_rmem_alloc);
        sk_mem_uncharge(sk, truesize);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Destructor for skbs that are not used directly in the read or write
 * path, e.g. error handler skbs. Automatically called from kfree_skb; it
 * calls sock_put() on the skb's socket.
 */
void sock_efree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        sock_put(sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        if (!sk_is_refcounted(sk))
                return;

        if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
                inet_reqsk(sk)->rsk_listener = NULL;
                reqsk_free(inet_reqsk(sk));
                return;
        }

        sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */

/*
 * Allocate a skb from the socket's send buffer.
 *
 * Unless @force is set, the allocation is refused once sk_wmem_alloc has
 * reached sk_sndbuf. On success the skb is owned by @sk through
 * skb_set_owner_w(). Returns NULL when refused or when alloc_skb() fails.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        struct sk_buff *skb;

        if (!force &&
            refcount_read(&sk->sk_wmem_alloc) >= READ_ONCE(sk->sk_sndbuf))
                return NULL;

        skb = alloc_skb(size, priority);
        if (!skb)
                return NULL;

        skb_set_owner_w(skb, sk);
        return skb;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Destructor installed by sock_omalloc(): gives skb->truesize back to the
 * owning socket's sk_omem_alloc.
 */
static void sock_ofree(struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &skb->sk->sk_omem_alloc);
}

/*
 * Allocate a skb charged to the socket's option memory (sk_omem_alloc).
 *
 * The request is refused when the current charge plus SKB_TRUESIZE(@size)
 * would exceed the netns sysctl_optmem_max. On success skb->truesize is
 * added to sk_omem_alloc and sock_ofree() is set as destructor.
 * Returns NULL when refused or when alloc_skb() fails.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority)
{
        struct sk_buff *skb;

        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
            READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
                return NULL;

        skb = alloc_skb(size, priority);
        if (skb) {
                atomic_add(skb->truesize, &sk->sk_omem_alloc);
                skb->sk = sk;
                skb->destructor = sock_ofree;
        }

        return skb;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 *
 * The request is refused when @size (compared as unsigned) exceeds the
 * netns sysctl_optmem_max, or when adding it to sk_omem_alloc would reach
 * that limit. Returns the block, or NULL when refused or when kmalloc()
 * fails; sk_omem_alloc is only left charged on success.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        int limit = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
        void *mem;

        if ((unsigned int)size > limit)
                return NULL;
        if (atomic_read(&sk->sk_omem_alloc) + size >= limit)
                return NULL;

        /* Charge first, to avoid the race if kmalloc might sleep. */
        atomic_add(size, &sk->sk_omem_alloc);
        mem = kmalloc(size, priority);
        if (!mem)
                atomic_sub(size, &sk->sk_omem_alloc);

        return mem;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Duplicate the input "src" memory block using the socket's
 * option memory buffer.
 *
 * Returns the copy of @size bytes obtained through sock_kmalloc(), or NULL
 * when that allocation fails.
 */
void *sock_kmemdup(struct sock *sk, const void *src,
                   int size, gfp_t priority)
{
        void *copy = sock_kmalloc(sk, size, priority);

        if (!copy)
                return NULL;

        memcpy(copy, src, size);
        return copy;
}
EXPORT_SYMBOL(sock_kmemdup);

/* Free an option memory block and give @size back to sk_omem_alloc.
 *
 * The inline is deliberate: it lets gcc see the constant @nullify and fold
 * the condition away entirely. With @nullify set the block is released via
 * kfree_sensitive(), otherwise via kfree(). A NULL @mem triggers a
 * one-time warning and is otherwise ignored.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
                                  const bool nullify)
{
        if (WARN_ON_ONCE(!mem))
                return;

        if (!nullify)
                kfree(mem);
        else
                kfree_sensitive(mem);

        atomic_sub(size, &sk->sk_omem_alloc);
}

/*
 * Free an option memory block of @size bytes with kfree() and uncharge it
 * from the socket's sk_omem_alloc.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

/*
 * Same as sock_kfree_s(), except that the block is released with
 * kfree_sensitive() instead of kfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);

/* Sleep until sk_wmem_alloc drops below sk_sndbuf, or until the timeout
 * runs out, a signal is pending, the send side is shut down or an error is
 * recorded on the socket. Returns the timeout value left at that point.
 *
 * It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                /* Flag the shortage before queueing on the wait queue, and
                 * re-check every wake-up condition after queueing.
                 */
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
                        break;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        break;
                if (READ_ONCE(sk->sk_err))
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk_sleep(sk), &wait);
        return timeo;
}


/*
 *        Generic send/receive buffer handlers
 */

/*
 * Allocate a skb for sending on @sk, waiting for send-buffer space first.
 *
 * @header_len, @data_len and @max_page_order are passed through to
 * alloc_skb_with_frags(); @noblock selects the timeout via sock_sndtimeo().
 *
 * The loop fails with the pending socket error, -EPIPE once the send side
 * is shut down, -EAGAIN when no timeout is left, or the value of
 * sock_intr_errno() when a signal is pending; these are stored in
 * *@errcode and NULL is returned. On success the skb is owned by @sk via
 * skb_set_owner_w(). If alloc_skb_with_frags() itself fails, NULL is
 * returned and *@errcode is whatever that function stored (it receives
 * @errcode; presumably it sets it on failure).
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order)
{
        struct sk_buff *skb;
        long timeo;
        int err;

        timeo = sock_sndtimeo(sk, noblock);
        for (;;) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        goto failure;

                /* Room in the send buffer: go allocate. */
                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
                        break;

                /* No room: flag the shortage, then wait or give up. */
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }
        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
                                   errcode, sk->sk_allocation);
        if (skb)
                skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc)
{
        u32 tsflags;

        BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));

        switch (cmsg->cmsg_type) {
        case SO_MARK:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
                break;
        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;

                tsflags = *(u32 *)CMSG_DATA(cmsg);
                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
                        return -EINVAL;

                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
                sockc->tsflags |= tsflags;
                break;
        case SCM_TXTIME:
                if (!sock_flag(sk, SOCK_TXTIME))
                        return -EINVAL;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
                        return -EINVAL;
                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
                break;
        case SCM_TS_OPT_ID:
                if (sk_is_tcp(sk))
                        return -EINVAL;
                tsflags = READ_ONCE(sk->sk_tsflags);
                if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
                        return -EINVAL;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
                sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
                break;
        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
        case SCM_RIGHTS:
        case SCM_CREDENTIALS:
                break;
        case SO_PRIORITY:
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
                        return -EPERM;
                sockc->priority = *(u32 *)CMSG_DATA(cmsg);
                break;
        case SCM_DEVMEM_DMABUF:
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
                break;
        default:
                return -EINVAL;
        }
        return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

/*
 * Walk every control message of @msg and feed the SOL_SOCKET ones to
 * __sock_cmsg_send(); messages of other levels are skipped.
 *
 * Returns 0 when all messages were accepted, -EINVAL as soon as a message
 * fails CMSG_OK(), or the first non-zero result of __sock_cmsg_send().
 */
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc)
{
        struct cmsghdr *cmsg;
        int err = 0;

        for_each_cmsghdr(cmsg, msg) {
                if (!CMSG_OK(msg, cmsg))
                        return -EINVAL;

                if (cmsg->cmsg_level == SOL_SOCKET) {
                        err = __sock_cmsg_send(sk, cmsg, sockc);
                        if (err)
                                break;
                }
        }

        return err;
}
EXPORT_SYMBOL(sock_cmsg_send);

/* Invoke the protocol's enter_memory_pressure() hook, when it has one. */
static void sk_enter_memory_pressure(struct sock *sk)
{
        if (sk->sk_prot->enter_memory_pressure)
                sk->sk_prot->enter_memory_pressure(sk);
}

/*
 * Leave memory pressure for the protocol of @sk: call its
 * leave_memory_pressure() hook when present; otherwise clear the
 * protocol's memory_pressure word if it exists and is currently set.
 */
static void sk_leave_memory_pressure(struct sock *sk)
{
        unsigned long *pressure;

        if (sk->sk_prot->leave_memory_pressure) {
                INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
                                     tcp_leave_memory_pressure, sk);
                return;
        }

        pressure = sk->sk_prot->memory_pressure;
        if (!pressure)
                return;

        /* Write only when the flag is actually set. */
        if (READ_ONCE(*pressure))
                WRITE_ONCE(*pressure, 0);
}

DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 *
 * Return: true if @pfrag has at least @sz bytes available, either in its
 * current page or in a newly allocated one; false if no page could be
 * allocated.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
        struct page *page = pfrag->page;

        if (page) {
                /* Sole holder of the page: start over from its beginning. */
                if (page_ref_count(page) == 1) {
                        pfrag->offset = 0;
                        return true;
                }
                if (pfrag->offset + sz <= pfrag->size)
                        return true;
                put_page(page);
        }

        pfrag->offset = 0;

        if (SKB_FRAG_PAGE_ORDER &&
            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
                /* Avoid direct reclaim but allow kswapd to wake */
                gfp_t high_order_gfp = (gfp & ~__GFP_DIRECT_RECLAIM) |
                                       __GFP_COMP | __GFP_NOWARN |
                                       __GFP_NORETRY;

                page = alloc_pages(high_order_gfp, SKB_FRAG_PAGE_ORDER);
                pfrag->page = page;
                if (likely(page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
                        return true;
                }
        }

        /* Fall back to a single page with the caller's gfp mask. */
        page = alloc_page(gfp);
        pfrag->page = page;
        if (unlikely(!page))
                return false;

        pfrag->size = PAGE_SIZE;
        return true;
}
EXPORT_SYMBOL(skb_page_frag_refill);

/*
 * Make sure @pfrag has at least 32 bytes available, allocating with the
 * socket's sk_allocation mask. On failure the socket enters memory
 * pressure and its send buffer is moderated before false is returned.
 */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
        if (unlikely(!skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) {
                sk_enter_memory_pressure(sk);
                sk_stream_moderate_sndbuf(sk);
                return false;
        }

        return true;
}
EXPORT_SYMBOL(sk_page_frag_refill);

/*
 * Sleep until the socket is no longer owned by a user context.
 *
 * Entered and left with sk->sk_lock.slock held (see the annotations); the
 * spinlock is dropped around each schedule() and retaken before the
 * ownership test.
 */
void __lock_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        DEFINE_WAIT(wait);

        for (;;) {
                /* Queue as an exclusive, uninterruptible waiter before
                 * releasing the spinlock.
                 */
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if (!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

/*
 * Process every skb queued on the socket backlog.
 *
 * Entered and left with sk->sk_lock.slock held (see the annotations). Each
 * pass detaches the whole backlog list, drops the spinlock and feeds the
 * skbs one by one to sk_backlog_rcv(); the outer loop repeats for as long
 * as new skbs were queued in the meantime.
 */
void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb, *next;
        int nb = 0;

        while ((skb = sk->sk_backlog.head) != NULL) {
                /* Take the current list private and leave an empty backlog. */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

                spin_unlock_bh(&sk->sk_lock.slock);

                while (1) {
                        /* Save the successor before the skb is unlinked. */
                        next = skb->next;
                        prefetch(next);
                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb_mark_not_on_list(skb);
                        sk_backlog_rcv(sk, skb);

                        skb = next;
                        if (!skb)
                                break;

                        /* Offer to reschedule once every 16 skbs. */
                        if (!(++nb & 15))
                                cond_resched();
                }

                spin_lock_bh(&sk->sk_lock.slock);
        }

        /*
         * Doing the zeroing here guarantees we cannot loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}

/*
 * Flush the socket backlog: take sk->sk_lock.slock, process the backlog
 * with __release_sock(), then run the protocol's release_cb() hook if it
 * has one, and drop the spinlock again.
 */
void __sk_flush_backlog(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        __release_sock(sk);

        /* The hook is called with the spinlock held. */
        if (sk->sk_prot->release_cb)
                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
                                     tcp_release_cb, sk);

        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * The wait condition is that the tail of sk_receive_queue differs from
 * @skb. SOCKWQ_ASYNC_WAITDATA is set on the socket for the duration of
 * the wait.
 *
 * Return: the value produced by sk_wait_event().
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int rc;

        add_wait_queue(sk_sleep(sk), &wait);
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
        return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *        __sk_mem_raise_allocated - increase memory_allocated
 *        @sk: socket
 *        @size: memory size to allocate
 *        @amt: pages to allocate
 *        @kind: allocation type
 *
 *        Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *        Unlike the globally shared limits among the sockets under same protocol,
 *        consuming the budget of a memcg won't have direct effect on other ones.
 *        So be optimistic about memcg's tolerance, and leave the callers to decide
 *        whether or not to raise allocated through sk_under_memory_pressure() or
 *        its variants.
 *
 *        Return: 1 if the allocation is accepted, 0 if it is refused. On
 *        refusal the memory_allocated increase, and any memcg charge that
 *        succeeded here, are undone before returning.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        bool memcg_enabled = false, charged = false;
        struct proto *prot = sk->sk_prot;
        long allocated;

        /* Account optimistically first; rolled back at the end on refusal. */
        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);

        if (mem_cgroup_sk_enabled(sk)) {
                memcg_enabled = true;
                charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
                if (!charged)
                        goto suppress_allocation;
        }

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* Guarantee minimum buffer size under pressure (either global
         * or memcg) to make sure features described in RFC 7323 (TCP
         * Extensions for High Performance) work properly.
         *
         * This rule does NOT stand when exceeds global or memcg's hard
         * limit, or else a DoS attack can be taken place by spawning
         * lots of sockets whose usage are under minimum buffer size.
         */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                /* Stream sockets are judged by sk_wmem_queued, all other
                 * socket types by sk_wmem_alloc.
                 */
                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                /* The following 'average' heuristic is within the
                 * scope of global accounting, so it only makes
                 * sense for global memory pressure.
                 */
                if (!sk_under_global_memory_pressure(sk))
                        return 1;

                /* Try to be fair among all the sockets under global
                 * pressure by allowing the ones that below average
                 * usage to raise.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg_enabled && !charged)
                                mem_cgroup_sk_charge(sk, amt,
                                                     gfp_memcg_charge() | __GFP_NOFAIL);
                        return 1;
                }
        }

        trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Refused: roll back the accounting done at the top. */
        sk_memory_allocated_sub(sk, amt);

        if (charged)
                mem_cgroup_sk_uncharge(sk, amt);

        return 0;
}

/**
 *        __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *        @sk: socket
 *        @size: memory size to allocate
 *        @kind: allocation type
 *
 *        If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *        rmem allocation. This function assumes that protocols which have
 *        memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 *        @size is converted to a page count with sk_mem_pages(); that count,
 *        expressed in bytes, is credited to sk_forward_alloc up front and
 *        taken back again if __sk_mem_raise_allocated() refuses the request.
 *
 *        Return: the value returned by __sk_mem_raise_allocated().
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
        int pages = sk_mem_pages(size);
        int bytes = pages << PAGE_SHIFT;
        int granted;

        sk_forward_alloc_add(sk, bytes);
        granted = __sk_mem_raise_allocated(sk, size, pages, kind);
        if (granted)
                return granted;

        /* Refused: undo the optimistic forward-alloc credit. */
        sk_forward_alloc_add(sk, -bytes);
        return granted;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *        __sk_mem_reduce_allocated - reclaim memory_allocated
 *        @sk: socket
 *        @amount: number of quanta
 *
 *        Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc.
 *        The memcg is uncharged by the same amount when memcg accounting is
 *        enabled for @sk, and memory pressure is left once the allocated
 *        total has dropped below limit 0 while under global pressure.
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
        sk_memory_allocated_sub(sk, amount);

        if (mem_cgroup_sk_enabled(sk))
                mem_cgroup_sk_uncharge(sk, amount);

        /* Nothing more to do unless we are under global pressure. */
        if (!sk_under_global_memory_pressure(sk))
                return;

        if (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))
                sk_leave_memory_pressure(sk);
}

/**
 *        __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *        @sk: socket
 *        @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
 *
 *        The whole-page part of @amount is removed from sk_forward_alloc (in
 *        bytes) and handed to __sk_mem_reduce_allocated() (in pages).
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
        int pages = amount >> PAGE_SHIFT;

        sk_forward_alloc_add(sk, -(pages << PAGE_SHIFT));
        __sk_mem_reduce_allocated(sk, pages);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

/*
 * Store @val in sk->sk_peek_off. WRITE_ONCE() is used for the store, so
 * presumably the field is read locklessly elsewhere. Always returns 0.
 */
int sk_set_peek_off(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_peek_off, val);
        return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

/* Default bind handler for protocols without bind support: ignores its
 * arguments and returns -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

/* Default connect handler for protocols without connect support: ignores
 * its arguments and returns -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

/* Default socketpair handler for protocols without socketpair support:
 * ignores both sockets and returns -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

/* Default accept handler for protocols without accept support: ignores
 * its arguments and returns -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock,
                   struct proto_accept_arg *arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

/* Default getname handler for protocols without getname support: ignores
 * its arguments and returns -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

/* Default ioctl handler for protocols without ioctl support: ignores
 * @cmd and @arg and returns -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

/* Default listen handler for protocols without listen support: ignores
 * @backlog and returns -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

/* Default shutdown handler for protocols without shutdown support:
 * ignores @how and returns -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

/* Default sendmsg handler for protocols without sendmsg support: nothing
 * is sent and -EOPNOTSUPP is returned.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

/* Variant of sock_no_sendmsg() that takes a struct sock instead of a
 * struct socket: nothing is sent and -EOPNOTSUPP is returned.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

/* Default recvmsg handler for protocols without recvmsg support: nothing
 * is received and -EOPNOTSUPP is returned.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

/* Default mmap handler for protocols without mmap support. Unlike the
 * other sock_no_*() stubs this one returns -ENODEV, not -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 */
/*
 * If @file is backed by a socket, refresh the netprio index and classid
 * kept in that socket's sk_cgrp_data. Files that are not sockets are
 * left alone.
 */
void __receive_sock(struct file *file)
{
        struct socket *sock = sock_from_file(file);

        if (!sock)
                return;

        sock_update_netprioidx(&sock->sk->sk_cgrp_data);
        sock_update_classid(&sock->sk->sk_cgrp_data);
}

/*
 *        Default Socket Callbacks
 */

/* Default sk_state_change callback: wake every interruptible sleeper on
 * the socket's wait queue, if there is any.
 */
static void sock_def_wakeup(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_all(&waitq->wait);

        rcu_read_unlock();
}

/* Default sk_error_report callback: wake pollers with EPOLLERR and send
 * the SOCK_WAKE_IO / POLL_ERR async notification.
 */
static void sock_def_error_report(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_poll(&waitq->wait, EPOLLERR);

        /* The async notification is sent whether or not anyone sleeps. */
        sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);

        rcu_read_unlock();
}

/* Default sk_data_ready callback: emit the sk_data_ready tracepoint, wake
 * pollers waiting for readable events and send the SOCK_WAKE_WAITD /
 * POLL_IN async notification.
 */
void sock_def_readable(struct sock *sk)
{
        struct socket_wq *waitq;

        trace_sk_data_ready(sk);

        rcu_read_lock();

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_sync_poll(&waitq->wait,
                                                EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);

        sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);

        rcu_read_unlock();
}

/* Default sk_write_space callback: once the socket is writeable again,
 * wake pollers waiting for writable events and send the SOCK_WAKE_SPACE /
 * POLL_OUT async notification.
 */
static void sock_def_write_space(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (!sock_writeable(sk))
                goto out_unlock;

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_sync_poll(&waitq->wait,
                                                EPOLLOUT | EPOLLWRNORM |
                                                EPOLLWRBAND);

        /* Should agree with poll, otherwise some programs break */
        sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);

out_unlock:
        rcu_read_unlock();
}

/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 *
 * Unlike sock_def_write_space(), this variant does not take
 * rcu_read_lock() itself, and it checks for waiters with
 * waitqueue_active() after an explicit smp_mb__after_atomic() instead of
 * calling skwq_has_sleeper().
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (sock_writeable(sk)) {
                struct socket_wq *wq = rcu_dereference(sk->sk_wq);

                /* rely on refcount_sub from sock_wfree() */
                smp_mb__after_atomic();
                /* wq may be NULL here, hence the explicit check. */
                if (wq && waitqueue_active(&wq->wait))
                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

                /* Should agree with poll, otherwise some programs break */
                sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
}

/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}

/*
 * Deliver SIGURG through the file attached to @sk's socket. Sockets with
 * no struct socket or no file are skipped. When send_sigurg() returns
 * non-zero, the SOCK_WAKE_URG / POLL_PRI async notification is sent too.
 */
void sk_send_sigurg(struct sock *sk)
{
        if (!sk->sk_socket || !sk->sk_socket->file)
                return;

        if (send_sigurg(sk->sk_socket->file))
                sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

/*
 * (Re)arm @timer to fire at @expires. A reference on @sk is taken only
 * when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

/*
 * Delete @timer; when timer_delete() returns non-zero, drop one reference
 * on @sk with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        if (!timer_delete(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

/*
 * Same as sk_stop_timer() but uses timer_delete_sync(); when that returns
 * non-zero, one reference on @sk is dropped with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        if (!timer_delete_sync(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);

/*
 * sock_init_data_uid - set the generic fields of a struct sock to defaults
 * @sock: socket to attach @sk to, or NULL
 * @sk:   sock being initialised
 * @uid:  value stored in sk->sk_uid
 *
 * Fills in buffer sizes, timeouts, the default callbacks and assorted
 * bookkeeping fields. sk_refcnt is set to 1 only after an smp_wmb(), so
 * that all earlier stores are committed before the refcount is updated.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
        sk_init_common(sk);
        sk->sk_send_head        =        NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation        =        GFP_KERNEL;
        sk->sk_rcvbuf                =        READ_ONCE(sysctl_rmem_default);
        sk->sk_sndbuf                =        READ_ONCE(sysctl_wmem_default);
        sk->sk_state                =        TCP_CLOSE;
        sk->sk_use_task_frag        =        true;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /* With a struct socket: inherit its type, share its wait queue and
         * link the two objects. Without one: no wait queue.
         */
        if (sock) {
                sk->sk_type        =        sock->type;
                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
                sock->sk        =        sk;
        } else {
                RCU_INIT_POINTER(sk->sk_wq, NULL);
        }
        sk->sk_uid        =        uid;

        /* Default callbacks, all defined in this file. */
        sk->sk_state_change        =        sock_def_wakeup;
        sk->sk_data_ready        =        sock_def_readable;
        sk->sk_write_space        =        sock_def_write_space;
        sk->sk_error_report        =        sock_def_error_report;
        sk->sk_destruct                =        sock_def_destruct;

        sk->sk_frag.page        =        NULL;
        sk->sk_frag.offset        =        0;
        sk->sk_peek_off                =        -1;

        sk->sk_peer_pid         =        NULL;
        sk->sk_peer_cred        =        NULL;
        spin_lock_init(&sk->sk_peer_lock);

        sk->sk_write_pending        =        0;
        sk->sk_rcvlowat                =        1;
        sk->sk_rcvtimeo                =        MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo                =        MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
        /* sk_stamp_seq is only initialised on 32-bit builds. */
#if BITS_PER_LONG==32
        seqlock_init(&sk->sk_stamp_seq);
#endif
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id                =        0;
        sk->sk_ll_usec                =        READ_ONCE(sysctl_net_busy_read);
#endif

        /* ~0UL: all bits set for both pacing rates. */
        sk->sk_max_pacing_rate = ~0UL;
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        sk_drops_reset(sk);
}
EXPORT_SYMBOL(sock_init_data_uid);

/*
 * Wrapper around sock_init_data_uid() that chooses the uid itself: the
 * i_uid of @sock's inode when @sock is given, otherwise uid 0 mapped
 * through the user namespace of @sk's network namespace.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        kuid_t uid;

        if (sock)
                uid = SOCK_INODE(sock)->i_uid;
        else
                uid = make_kuid(sock_net(sk)->user_ns, 0);

        sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);

/*
 * Take ownership of @sk (sk_lock.owned = 1). May sleep: if the socket is
 * already owned, __lock_sock() is called first. @subclass is passed to
 * lockdep only.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sock_owned_by_user_nocheck(sk))
                __lock_sock(sk);
        /* Ownership is recorded while slock is held. */
        sk->sk_lock.owned = 1;
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);

/*
 * Give up ownership of @sk. Under sk_lock.slock: run __release_sock() if
 * the backlog has a tail, call the protocol's release_cb hook if set,
 * release ownership and wake anyone waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        /* Only run __release_sock() when something is queued. */
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        if (sk->sk_prot->release_cb)
                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
                                     tcp_release_cb, sk);

        sock_release_ownership(sk);
        /* Skip the wake_up() call when nobody is waiting for the lock. */
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/*
 * Lock @sk, using the cheap path when the socket is not owned.
 *
 * Return: false on the fast path, where sk_lock.slock is still held and
 * bottom halves are still disabled on return; true on the slow path,
 * where the socket has been marked owned and slock has been released.
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sock_owned_by_user_nocheck(sk)) {
                /*
                 * Fast path return with bottom halves disabled and
                 * sock::sk_lock.slock held.
                 *
                 * The 'mutex' is not contended and holding
                 * sock::sk_lock.slock prevents all other lockers to
                 * proceed so the corresponding unlock_sock_fast() can
                 * avoid the slow path of release_sock() completely and
                 * just release slock.
                 *
                 * From a semantical POV this is equivalent to 'acquiring'
                 * the 'mutex', hence the corresponding lockdep
                 * mutex_release() has to happen in the fast path of
                 * unlock_sock_fast().
                 */
                return false;
        }

        /* Slow path: the socket is owned, go through __lock_sock() first. */
        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Pairs with the __acquires() annotation on the declaration. */
        __acquire(&sk->sk_lock.slock);
        spin_unlock_bh(&sk->sk_lock.slock);
        return true;
}
EXPORT_SYMBOL(__lock_sock_fast);

/*
 * sock_gettstamp - copy the socket's stored timestamp to user space
 * @sock:      socket whose sk timestamp is reported
 * @userstamp: user-space destination buffer
 * @timeval:   when true, the sub-second part is reported in microseconds
 *             instead of nanoseconds
 * @time32:    when true (and CONFIG_COMPAT_32BIT_TIME is set), the value
 *             is written with put_old_timespec32()
 *
 * Enables SOCK_TIMESTAMP on the socket as a side effect. If the stored
 * stamp has tv_sec == 0, the current real time is written back to the
 * socket and reported instead.
 *
 * Return: -ENOENT if the stored stamp has tv_sec == -1, -EFAULT if the
 * sparc64 copy_to_user() fails, otherwise the result of the put_*()
 * helper that was used.
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32)
{
        struct sock *sk = sock->sk;
        struct timespec64 ts;

        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        ts = ktime_to_timespec64(sock_read_timestamp(sk));
        if (ts.tv_sec == -1)
                return -ENOENT;
        if (ts.tv_sec == 0) {
                ktime_t kt = ktime_get_real();
                sock_write_timestamp(sk, kt);
                ts = ktime_to_timespec64(kt);
        }

        /* From here on tv_nsec carries microseconds for timeval callers. */
        if (timeval)
                ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
        if (time32)
                return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
        /* beware of padding in sparc64 timeval */
        if (timeval && !in_compat_syscall()) {
                /* tv lives on the kernel stack and is the *source* of the
                 * copy, so it must not carry the __user annotation.
                 */
                struct __kernel_old_timeval tv = {
                        .tv_sec = ts.tv_sec,
                        .tv_usec = ts.tv_nsec,
                };
                if (copy_to_user(userstamp, &tv, sizeof(tv)))
                        return -EFAULT;
                return 0;
        }
#endif
        return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);

/*
 * Set timestamping flag @flag on @sk. Net timestamping is switched on
 * with net_enable_timestamp() only on the transition from "no timestamp
 * flag set" to "needs netstamp". Does nothing if @flag is already set.
 */
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
        unsigned long flags_before;

        if (sock_flag(sk, flag))
                return;

        flags_before = sk->sk_flags;
        sock_set_flag(sk, flag);

        /*
         * we just set one of the two flags which require net
         * time stamping, but time stamping might have been on
         * already because of the other one
         */
        if (!sock_needs_netstamp(sk))
                return;

        if (!(flags_before & SK_FLAGS_TIMESTAMP))
                net_enable_timestamp();
}

/*
 * Dequeue one skb from @sk's error queue and copy up to @len bytes of it
 * into @msg, setting MSG_TRUNC when the skb is longer than @len. On a
 * successful copy the timestamp and the extended error (as a cmsg of
 * @level / @type) are attached and MSG_ERRQUEUE is set.
 *
 * Return: number of bytes copied, -EAGAIN when the error queue is empty,
 * or the error from skb_copy_datagram_msg(). The skb is always freed.
 */
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
                       int level, int type)
{
        struct sock_exterr_skb *serr;
        struct sk_buff *skb;
        int copied, err;

        skb = sock_dequeue_err_skb(sk);
        if (!skb)
                return -EAGAIN;

        copied = skb->len;
        if (copied > len) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }

        err = skb_copy_datagram_msg(skb, 0, msg, copied);
        if (!err) {
                sock_recv_timestamp(msg, sk, skb);

                serr = SKB_EXT_ERR(skb);
                put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

                msg->msg_flags |= MSG_ERRQUEUE;
                err = copied;
        }

        kfree_skb(skb);
        return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 *        Get a socket option on a socket.
 *
 *        FIX: POSIX 1003.1g is very ambiguous here. It states that
 *        asynchronous errors should be reported by getsockopt. We assume
 *        this means if you specify SO_ERROR (otherwise what is the point of it).
 */
/* Forward getsockopt to the ->getsockopt handler of the socket's current
 * protocol and return its result unchanged.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

/*
 * Forward recvmsg to the protocol's ->recvmsg handler. When the handler
 * does not fail, the address length it reported is stored in
 * msg->msg_namelen.
 *
 * Return: the handler's return value.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int ret;

        ret = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
        if (ret < 0)
                return ret;

        msg->msg_namelen = addr_len;
        return ret;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *        Set socket options on an inet socket.
 */
/* Forward setsockopt to the ->setsockopt handler of the socket's current
 * protocol and return its result unchanged.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

/*
 * Common release path: call the protocol's optional ->destroy hook,
 * unhash the socket, orphan it, free its xfrm policy and finally drop a
 * reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
        /* ->destroy is optional, ->unhash below is called unconditionally. */
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to socket. But net still has.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * In this point socket cannot receive new packets, but it is possible
         * that some packets are in flight because some CPU runs receiver and
         * did hash table lookup before we unhashed socket. They will achieve
         * receive queue and will be purged by socket destructor.
         *
         * Also we still have packets pending on receive queue and probably,
         * our own packets waiting in device queues. sock_destroy will drain
         * receive queue, but transmitted packets will delay socket destruction
         * until the last reference will be released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

/*
 * Fill @mem, an array of SK_MEMINFO_VARS u32 slots, with a snapshot of
 * @sk's memory accounting. The array is zeroed first, then each
 * SK_MEMINFO_* slot written below is filled from the matching field.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
        memset(mem, 0, SK_MEMINFO_VARS * sizeof(mem[0]));

        /* Receive side. */
        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);

        /* Send side. */
        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);

        /* Remaining counters. */
        mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
        mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
}

#ifdef CONFIG_PROC_FS
/* One bit per inuse_idx slot handed out by assign_proto_idx(); a set bit
 * means the slot is taken.
 */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

int sock_inuse_get(struct net *net)
{
        int cpu, res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;

        return res;
}

EXPORT_SYMBOL_GPL(sock_inuse_get);

/* Per-netns init: allocate the per-CPU prot_inuse counters.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int __net_init sock_inuse_init_net(struct net *net)
{
        net->core.prot_inuse = alloc_percpu(struct prot_inuse);

        return net->core.prot_inuse ? 0 : -ENOMEM;
}

/* Per-netns exit: free the per-CPU counters allocated by
 * sock_inuse_init_net().
 */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.prot_inuse);
}

/* Per-netns hooks that allocate and free net->core.prot_inuse. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};

/* Register net_inuse_ops at core_initcall time. A registration failure
 * is treated as fatal.
 */
static __init int net_inuse_init(void)
{
        int err = register_pernet_subsys(&net_inuse_ops);

        if (err)
                panic("Cannot initialize net inuse counters");

        return 0;
}

core_initcall(net_inuse_init);

/*
 * Give @prot the first free slot in proto_inuse_idx and mark it used.
 *
 * Return: 0 on success, -ENOSPC when every slot is taken. In that case
 * prot->inuse_idx is left at PROTO_INUSE_NR, which release_proto_idx()
 * treats as "nothing to release".
 */
static int assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        /* A result equal to the bitmap size means no zero bit was found. */
        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
                pr_err("PROTO_INUSE_NR exhausted\n");
                return -ENOSPC;
        }

        set_bit(prot->inuse_idx, proto_inuse_idx);
        return 0;
}

/* Return @prot's slot to proto_inuse_idx. An index of PROTO_INUSE_NR
 * means no slot was ever assigned, so there is nothing to clear.
 */
static void release_proto_idx(struct proto *prot)
{
        if (prot->inuse_idx == PROTO_INUSE_NR)
                return;

        clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
/* !CONFIG_PROC_FS stub: no index bookkeeping, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}

/* !CONFIG_PROC_FS stub: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}

#endif

/* Free the timewait slab name and destroy the timewait slab cache, then
 * clear both pointers. A NULL @twsk_prot is accepted and ignored.
 */
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
        if (twsk_prot) {
                kfree(twsk_prot->twsk_slab_name);
                twsk_prot->twsk_slab_name = NULL;

                kmem_cache_destroy(twsk_prot->twsk_slab);
                twsk_prot->twsk_slab = NULL;
        }
}

/*
 * Create the "tw_sock_<name>" slab cache for @prot's timewait sockets.
 * Protocols without twsk_prot need nothing and succeed immediately.
 *
 * Return: 0 on success, -ENOMEM if the name or the cache cannot be
 * allocated. Nothing is freed here on failure; the caller is expected
 * to run tw_prot_cleanup().
 */
static int tw_prot_init(const struct proto *prot)
{
        struct timewait_sock_ops *ops = prot->twsk_prot;

        if (!ops)
                return 0;

        ops->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
                                        prot->name);
        if (!ops->twsk_slab_name)
                return -ENOMEM;

        ops->twsk_slab = kmem_cache_create(ops->twsk_slab_name,
                                           ops->twsk_obj_size, 0,
                                           SLAB_ACCOUNT | prot->slab_flags,
                                           NULL);
        if (ops->twsk_slab)
                return 0;

        pr_crit("%s: Can't create timewait sock SLAB cache!\n",
                prot->name);
        return -ENOMEM;
}

/* Free the request-sock slab name and destroy the request-sock slab
 * cache, then clear both pointers. A NULL @rsk_prot is accepted and
 * ignored.
 */
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
        if (rsk_prot) {
                kfree(rsk_prot->slab_name);
                rsk_prot->slab_name = NULL;

                kmem_cache_destroy(rsk_prot->slab);
                rsk_prot->slab = NULL;
        }
}

/*
 * Create the "request_sock_<name>" slab cache for @prot's request
 * sockets. Protocols without rsk_prot need nothing and succeed
 * immediately.
 *
 * Return: 0 on success, -ENOMEM if the name or the cache cannot be
 * allocated. Nothing is freed here on failure; the caller is expected
 * to run req_prot_cleanup().
 */
static int req_prot_init(const struct proto *prot)
{
        struct request_sock_ops *ops = prot->rsk_prot;

        if (!ops)
                return 0;

        ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
                                   prot->name);
        if (!ops->slab_name)
                return -ENOMEM;

        ops->slab = kmem_cache_create(ops->slab_name,
                                      ops->obj_size, 0,
                                      SLAB_ACCOUNT | prot->slab_flags,
                                      NULL);
        if (ops->slab)
                return 0;

        pr_crit("%s: Can't create request sock SLAB cache!\n",
                prot->name);
        return -ENOMEM;
}

/*
 * proto_register - register a protocol on proto_list
 * @prot:       protocol to register
 * @alloc_slab: non-zero to also create the sock, request-sock and
 *              timewait-sock slab caches for @prot
 *
 * Return: 0 on success; -EINVAL if @prot has memory_allocated but lacks
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS if any of the slab caches
 * cannot be set up; otherwise the error from assign_proto_idx(). On
 * failure every cache created here is destroyed again.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
        /* Error code for all slab-setup failures below. */
        int ret = -ENOBUFS;

        if (prot->memory_allocated && !prot->sysctl_mem) {
                pr_err("%s: missing sysctl_mem\n", prot->name);
                return -EINVAL;
        }
        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
                return -EINVAL;
        }
        if (alloc_slab) {
                prot->slab = kmem_cache_create_usercopy(prot->name,
                                        prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                                        prot->slab_flags,
                                        prot->useroffset, prot->usersize,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (tw_prot_init(prot))
                        goto out_free_timewait_sock_slab;
        }

        /* The index bitmap and proto_list are updated under the mutex. */
        mutex_lock(&proto_list_mutex);
        ret = assign_proto_idx(prot);
        if (ret) {
                mutex_unlock(&proto_list_mutex);
                goto out_free_timewait_sock_slab;
        }
        list_add(&prot->node, &proto_list);
        mutex_unlock(&proto_list_mutex);
        return ret;

        /* Unwind in reverse order of setup; the labels fall through. */
out_free_timewait_sock_slab:
        if (alloc_slab)
                tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
        if (alloc_slab) {
                req_prot_cleanup(prot->rsk_prot);

                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
out:
        return ret;
}
EXPORT_SYMBOL(proto_register);

/*
 * proto_unregister - undo proto_register()
 * @prot: protocol to remove
 *
 * Releases the in-use index and unlinks @prot from proto_list under
 * proto_list_mutex, then destroys the sock, request-sock and
 * timewait-sock slab caches.
 */
void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        /* Cache teardown happens after the mutex has been dropped. */
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

/*
 * Request the sock_diag module for @family, or for the @family/@protocol
 * pair when @protocol is non-zero.
 *
 * Return: -ENOENT when the family is not registered (@protocol == 0) or,
 * with CONFIG_INET, when an AF_INET protocol other than IPPROTO_RAW and
 * below MAX_INET_PROTOS has no inet_protos[] entry; otherwise the result
 * of request_module().
 */
int sock_load_diag_module(int family, int protocol)
{
        if (protocol) {
#ifdef CONFIG_INET
                if (family == AF_INET &&
                    protocol != IPPROTO_RAW &&
                    protocol < MAX_INET_PROTOS &&
                    !rcu_access_pointer(inet_protos[protocol]))
                        return -ENOENT;
#endif

                return request_module("net-pf-%d-proto-%d-type-%d-%d",
                                      PF_NETLINK, NETLINK_SOCK_DIAG,
                                      family, protocol);
        }

        /* Family-level request. */
        if (!sock_is_registered(family))
                return -ENOENT;

        return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
                              NETLINK_SOCK_DIAG, family);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
/* seq_file ->start: take proto_list_mutex (released in proto_seq_stop())
 * and position the iterator at *pos, with the list head as first element.
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

/* seq_file ->next: advance to the element after @v on proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

/* seq_file ->stop: drop the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}

/* Map a method pointer to the flag character printed in /proc:
 * 'y' when the pointer is set, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
/* Memory currently allocated for @proto, or -1 when the protocol has no
 * memory_allocated counter.
 */
static long sock_prot_memory_allocated(struct proto *proto)
{
        if (proto->memory_allocated == NULL)
                return -1L;

        return proto_memory_allocated(proto);
}

/* Memory-pressure state of @proto as text: "NI" when the protocol has no
 * memory_pressure field set, otherwise "yes" or "no".
 */
static const char *sock_prot_memory_pressure(struct proto *proto)
{
        if (proto->memory_pressure == NULL)
                return "NI";

        return proto_memory_pressure(proto) ? "yes" : "no";
}

/*
 * Print one line describing @proto: name, object size, sockets in use in
 * the seq_file's netns, allocated memory, pressure state, max header,
 * slab yes/no and owning module, followed by one y/n flag per method
 * checked below. The order of the flags must match the column header
 * printed by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   /* 18 method flags, one per "%2c" above. */
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

/*
 * seq_file ->show callback for the "protocols" listing.
 *
 * When @v is the proto_list head itself, print the column header line;
 * any other @v is treated as the ->node member of a struct proto and is
 * printed as one row via proto_seq_printf(). Always returns 0.
 */
static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v != &proto_list) {
                proto_seq_printf(seq, list_entry(v, struct proto, node));
                return 0;
        }

        seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                   "protocol", "size", "sockets", "memory",
                   "press", "maxhdr", "slab", "module",
                   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
        return 0;
}

/*
 * seq_file iterator callbacks backing the "protocols" proc entry that
 * proto_init_net() creates in each network namespace.
 */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

/*
 * Per-netns init hook: create the read-only (0444) "protocols" entry under
 * @net's proc directory, served by proto_seq_ops.
 *
 * Returns 0 on success or -ENOMEM if the entry could not be created.
 */
static __net_init int proto_init_net(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
                              sizeof(struct seq_net_private));

        return pde ? 0 : -ENOMEM;
}

/*
 * Per-netns exit hook: remove the "protocols" proc entry that
 * proto_init_net() created under @net's proc directory.
 */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}


/*
 * Per-network-namespace operations that create and remove the "protocols"
 * proc entry; registered by proto_init().
 */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

/*
 * Boot-time hook (subsys_initcall): register proto_net_ops as a pernet
 * subsystem and return the result of register_pernet_subsys().
 */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll termination check.
 * @p: the socket being polled, passed as an opaque pointer
 * @start_time: value handed through to sk_busy_loop_timeout()
 *
 * Returns true as soon as the socket's receive queue is non-empty or, for
 * UDP sockets, the UDP reader_queue is non-empty. Otherwise the result is
 * whatever sk_busy_loop_timeout() reports for @start_time. Both queue
 * checks use the lockless emptiness test.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
        struct sock *sk = p;

        if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
            (sk_is_udp(sk) &&
             !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)))
                return true;

        return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * Forward a bind-add request to the socket's protocol.
 *
 * Returns the protocol's ->bind_add() result, or -EOPNOTSUPP when the
 * protocol does not provide that method.
 */
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
        if (sk->sk_prot->bind_add)
                return sk->sk_prot->bind_add(sk, addr, addr_len);

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_bind_add);

/* In/out ioctl helper: copy @size bytes from the user buffer @arg into the
 * kernel buffer @karg, run the protocol's ->ioctl() on @karg, and on
 * success copy the same @size bytes back out to @arg.
 *
 * Returns -EFAULT if either user copy fails, the protocol's non-zero
 * return value if its ioctl fails (nothing is copied back in that case),
 * and 0 otherwise.
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
                     void __user *arg, void *karg, size_t size)
{
        int err;

        if (copy_from_user(karg, arg, size))
                return -EFAULT;

        err = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
        if (err)
                return err;

        return copy_to_user(arg, karg, size) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* Output-only ioctl helper, the most common case: nothing is read from
 * userspace. The protocol's ->ioctl() fills a zero-initialised int, and if
 * it returns 0 that int is written to the user pointer @arg.
 *
 * Returns the protocol's non-zero return value on failure, otherwise the
 * result of put_user().
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
        int result = 0;
        int err;

        err = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &result);
        if (!err)
                err = put_user(result, (int __user *)arg);

        return err;
}

/* Socket ioctl wrapper whose purpose is to hand kernel memory, not
 * userspace memory, to the protocol ioctl callbacks.
 *
 * Raw IPv4, raw IPv6 and phonet sockets are first offered to their
 * dedicated handlers. A handler result <= 0 means the ioctl was processed
 * and is returned as is; a positive result (also the initial value when no
 * dedicated handler applies) falls through to the default output-only path
 * in sock_ioctl_out().
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        bool is_raw = sk->sk_type == SOCK_RAW;
        int handled = 1;

        if (is_raw && sk->sk_family == AF_INET)
                handled = ipmr_sk_ioctl(sk, cmd, arg);
        else if (is_raw && sk->sk_family == AF_INET6)
                handled = ip6mr_sk_ioctl(sk, cmd, arg);
        else if (sk_is_phonet(sk))
                handled = phonet_sk_ioctl(sk, cmd, arg);

        /* Not claimed by a dedicated handler: use the default one */
        if (handled > 0)
                return sock_ioctl_out(sk, cmd, arg);

        /* The dedicated handler processed the ioctl; report its value */
        return handled;
}
EXPORT_SYMBOL(sk_ioctl);

/*
 * Verify that the hot members of struct sock sit in the cacheline group
 * they are expected to belong to. Each CACHELINE_ASSERT_GROUP_MEMBER()
 * line names the struct, the group and one member.
 * NOTE(review): the macro is defined outside this file - presumably a
 * build-time assertion, so this function does nothing at run time beyond
 * returning 0; confirm against its definition.
 *
 * Each member is asserted exactly once; keep the list free of duplicates
 * so that it stays easy to compare against the struct definition.
 */
static int __init sock_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
#ifdef CONFIG_MEMCG
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
#endif

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
        return 0;
}

core_initcall(sock_struct_check);























































































































































































































































































































































   13 
   13 

   13 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
// SPDX-License-Identifier: GPL-2.0-only
/*
 * async.c: Asynchronous function calls for boot performance
 *
 * (C) Copyright 2009 Intel Corporation
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 */


/*

Goals and Theory of Operation

The primary goal of this feature is to reduce the kernel boot time,
by letting various independent hardware delays and discovery operations
run decoupled from each other instead of strictly serialized.

More specifically, the asynchronous function call concept allows
certain operations (primarily during system boot) to happen
asynchronously, out of order, while these operations still
have their externally visible parts happen sequentially and in-order.
(not unlike how out-of-order CPUs retire their instructions in order)

Key to the asynchronous function call implementation is the concept of
a "sequence cookie" (which, although it has an abstracted type, can be
thought of as a monotonically incrementing number).

The async core will assign each scheduled event such a sequence cookie and
pass this to the called functions.

The asynchronously called function should before doing a globally visible
operation, such as registering device numbers, call the
async_synchronize_cookie() function and pass in its own cookie. The
async_synchronize_cookie() function will make sure that all asynchronous
operations that were scheduled prior to the operation corresponding with the
cookie have completed.

Subsystem/driver initialization code that schedules asynchronous probe
functions, but which shares global resources with other drivers/subsystems
that do not use the asynchronous call feature, needs to do a full
synchronization with the async_synchronize_full() function before returning
from its init function. This is to maintain strict ordering between the
asynchronous and synchronous parts of the kernel.

*/

#include <linux/async.h>
#include <linux/atomic.h>
#include <linux/export.h>
#include <linux/ktime.h>
#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#include "workqueue_internal.h"

/* Next cookie to hand out; incremented under async_lock by the schedulers */
static async_cookie_t next_cookie = 1;

/* Schedulers stop queueing new entries once entry_count exceeds MAX_WORK */
#define MAX_WORK                32768
#define ASYNC_COOKIE_MAX        ULLONG_MAX        /* infinity cookie */

static LIST_HEAD(async_global_pending);        /* pending from all registered doms */
/* Domain used by the entry points that take no explicit domain argument */
static ASYNC_DOMAIN(async_dfl_domain);
/* Protects next_cookie and the pending lists */
static DEFINE_SPINLOCK(async_lock);
/* Workqueue on which entries are queued; allocated in async_init() */
static struct workqueue_struct *async_wq;

/*
 * One scheduled asynchronous call.
 *
 * domain_list links the entry into its domain's ->pending list;
 * global_list links it into async_global_pending, but only when the domain
 * is registered. @work is the item queued on async_wq, and @func is invoked
 * with @data and @cookie when it runs.
 */
struct async_entry {
        struct list_head        domain_list;
        struct list_head        global_list;
        struct work_struct        work;
        async_cookie_t                cookie;
        async_func_t                func;
        void                        *data;
        struct async_domain        *domain;
};

/* Woken each time an entry finishes; waited on by the synchronize helpers */
static DECLARE_WAIT_QUEUE_HEAD(async_done);

/* Number of entries queued and not yet freed */
static atomic_t entry_count;

/*
 * Approximate number of microseconds elapsed since @start.
 *
 * The nanosecond delta is shifted right by 10, i.e. divided by 1024 rather
 * than 1000, so the result slightly under-reports. It is only used for
 * debug messages in this file.
 */
static long long microseconds_since(ktime_t start)
{
        ktime_t elapsed = ktime_sub(ktime_get(), start);

        return ktime_to_ns(elapsed) >> 10;
}

/*
 * Return the cookie of the first pending entry, or ASYNC_COOKIE_MAX when
 * nothing is pending.
 *
 * With a non-NULL @domain only that domain's ->pending list is examined;
 * with a NULL @domain the global list of entries from all registered
 * domains is used instead. The list is inspected under async_lock.
 */
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
        async_cookie_t lowest = ASYNC_COOKIE_MAX;
        struct list_head *pending;
        struct async_entry *head;
        unsigned long flags;

        spin_lock_irqsave(&async_lock, flags);

        pending = domain ? &domain->pending : &async_global_pending;
        if (!list_empty(pending)) {
                /* The two lists are threaded through different members */
                if (domain)
                        head = list_first_entry(pending, struct async_entry,
                                                domain_list);
                else
                        head = list_first_entry(pending, struct async_entry,
                                                global_list);
                lowest = head->cookie;
        }

        spin_unlock_irqrestore(&async_lock, flags);
        return lowest;
}

/*
 * Workqueue callback: run the async entry that embeds @work.
 *
 * Calls the entry's function with its data and cookie, then, under
 * async_lock, unlinks the entry from the pending lists and frees it, and
 * finally wakes everybody waiting on async_done so that the synchronize
 * helpers can re-evaluate the lowest pending cookie.
 */
static void async_run_entry_fn(struct work_struct *work)
{
        struct async_entry *entry =
                container_of(work, struct async_entry, work);
        unsigned long flags;
        ktime_t calltime;

        /* 1) run (and print duration) */
        pr_debug("calling  %lli_%pS @ %i\n", (long long)entry->cookie,
                 entry->func, task_pid_nr(current));
        calltime = ktime_get();

        entry->func(entry->data, entry->cookie);

        pr_debug("initcall %lli_%pS returned after %lld usecs\n",
                 (long long)entry->cookie, entry->func,
                 microseconds_since(calltime));

        /* 2) remove self from the pending queues */
        spin_lock_irqsave(&async_lock, flags);
        list_del_init(&entry->domain_list);
        list_del_init(&entry->global_list);

        /* 3) free the entry; it must not be touched past this point */
        kfree(entry);
        atomic_dec(&entry_count);

        spin_unlock_irqrestore(&async_lock, flags);

        /* 4) wake up any waiters */
        wake_up(&async_done);
}

/*
 * Initialise the caller-allocated @entry for @func/@data, assign it the
 * next cookie, add it to the pending list(s) and queue it on async_wq for
 * @node.
 *
 * The entry always goes on @domain's ->pending list and additionally on
 * the global pending list when the domain is registered. Cookie allocation
 * and list insertion happen together under async_lock.
 *
 * Returns the cookie assigned to the entry.
 */
static async_cookie_t __async_schedule_node_domain(async_func_t func,
                                                   void *data, int node,
                                                   struct async_domain *domain,
                                                   struct async_entry *entry)
{
        async_cookie_t newcookie;
        unsigned long flags;

        INIT_LIST_HEAD(&entry->domain_list);
        INIT_LIST_HEAD(&entry->global_list);
        INIT_WORK(&entry->work, async_run_entry_fn);
        entry->func = func;
        entry->data = data;
        entry->domain = domain;

        spin_lock_irqsave(&async_lock, flags);

        /* allocate cookie and queue */
        newcookie = entry->cookie = next_cookie++;

        list_add_tail(&entry->domain_list, &domain->pending);
        if (domain->registered)
                list_add_tail(&entry->global_list, &async_global_pending);

        atomic_inc(&entry_count);
        spin_unlock_irqrestore(&async_lock, flags);

        /* schedule for execution */
        queue_work_node(node, async_wq, &entry->work);

        /* return the saved copy: the entry may already have run and been freed */
        return newcookie;
}

/**
 * async_schedule_node_domain - NUMA specific version of async_schedule_domain
 * @func: function to execute asynchronously
 * @data: data pointer to pass to the function
 * @node: NUMA node that we want to schedule this on or close to
 * @domain: the domain
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * @domain may be used in the async_synchronize_*_domain() functions to
 * wait within a certain synchronization domain rather than globally.
 *
 * If no entry can be allocated, or more than MAX_WORK entries are already
 * outstanding, @func is not queued: it is called synchronously in the
 * caller's context with a freshly allocated cookie before this function
 * returns.
 *
 * Note: This function may be called from atomic or non-atomic contexts.
 *
 * The node requested will be honored on a best effort basis. If the node
 * has no CPUs associated with it then the work is distributed among all
 * available CPUs.
 */
async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
                                          int node, struct async_domain *domain)
{
        struct async_entry *entry;
        async_cookie_t newcookie;
        unsigned long flags;

        /* GFP_ATOMIC so that irq-off callers are allowed */
        entry = kzalloc(sizeof(*entry), GFP_ATOMIC);

        /* Normal case: we have memory and the backlog is within bounds */
        if (entry && atomic_read(&entry_count) <= MAX_WORK)
                return __async_schedule_node_domain(func, data, node, domain,
                                                    entry);

        /*
         * Out of memory or too much work pending already: take a cookie
         * and execute the function synchronously instead.
         */
        kfree(entry);

        spin_lock_irqsave(&async_lock, flags);
        newcookie = next_cookie++;
        spin_unlock_irqrestore(&async_lock, flags);

        func(data, newcookie);
        return newcookie;
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);

/**
 * async_schedule_node - NUMA specific version of async_schedule
 * @func: function to execute asynchronously
 * @data: data pointer to pass to the function
 * @node: NUMA node that we want to schedule this on or close to
 *
 * Schedules @func in the default async domain by calling
 * async_schedule_node_domain().
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
 * Note: This function may be called from atomic or non-atomic contexts.
 *
 * The node requested will be honored on a best effort basis. If the node
 * has no CPUs associated with it then the work is distributed among all
 * available CPUs.
 */
async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
{
        return async_schedule_node_domain(func, data, node, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule_node);

/**
 * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
 * @func: function to execute asynchronously
 * @dev: device argument to be passed to function
 *
 * @dev is used as both the argument for the function and to provide NUMA
 * context for where to run the function. The work is scheduled in the
 * default async domain and the entry is allocated with GFP_KERNEL.
 *
 * Returns true if the asynchronous execution of @func was scheduled.
 * Returns false, without calling @func at all, when no entry could be
 * allocated or more than MAX_WORK entries are already outstanding; this
 * differs from async_schedule_dev(), which runs the function
 * synchronously in that case.
 */
bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
{
        struct async_entry *entry;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);

        if (entry && atomic_read(&entry_count) <= MAX_WORK) {
                __async_schedule_node_domain(func, dev, dev_to_node(dev),
                                             &async_dfl_domain, entry);
                return true;
        }

        /* Give up: there is no memory or too much work. */
        kfree(entry);
        return false;
}

/**
 * async_synchronize_full - synchronize all asynchronous function calls
 *
 * This function waits until all asynchronous function calls have been done.
 * It passes a %NULL domain to async_synchronize_full_domain(), which
 * selects the pending entries of all registered domains.
 */
void async_synchronize_full(void)
{
        async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);

/**
 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
 * @domain: the domain to synchronize
 *
 * This function waits until all asynchronous function calls for the
 * synchronization domain specified by @domain have been done. It does so
 * by using ASYNC_COOKIE_MAX as the checkpoint cookie.
 */
void async_synchronize_full_domain(struct async_domain *domain)
{
        async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);

/**
 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
 * @cookie: async_cookie_t to use as checkpoint
 * @domain: the domain to synchronize (%NULL for all registered domains)
 *
 * This function waits until all asynchronous function calls for the
 * synchronization domain specified by @domain submitted prior to @cookie
 * have been done.
 */
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
        ktime_t starttime;

        pr_debug("async_waiting @ %i\n", task_pid_nr(current));
        starttime = ktime_get();

        /*
         * Sleep on async_done, which async_run_entry_fn() wakes after each
         * entry completes, until the lowest pending cookie has reached
         * @cookie (it is ASYNC_COOKIE_MAX when nothing is pending).
         */
        wait_event(async_done, lowest_in_progress(domain) >= cookie);

        pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
                 microseconds_since(starttime));
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);

/**
 * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
 * @cookie: async_cookie_t to use as checkpoint
 *
 * This function waits until all asynchronous function calls in the default
 * async domain that were submitted prior to @cookie have been done.
 */
void async_synchronize_cookie(async_cookie_t cookie)
{
        async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);

/**
 * current_is_async - is %current an async worker task?
 *
 * Returns %true if %current is an async worker task.
 */
bool current_is_async(void)
{
        struct worker *worker = current_wq_worker();

        return worker && worker->current_func == async_run_entry_fn;
}
EXPORT_SYMBOL_GPL(current_is_async);

/*
 * Boot-time setup: allocate the dedicated unbound "async" workqueue and
 * raise its min_active. Failure to allocate the workqueue is fatal.
 */
void __init async_init(void)
{
        /*
         * Async can schedule a number of interdependent work items. However,
         * unbound workqueues can handle only up to min_active interdependent
         * work items. The default min_active of 8 isn't sufficient for async
         * and can lead to stalls. Let's use a dedicated workqueue with raised
         * min_active.
         */
        async_wq = alloc_workqueue("async", WQ_UNBOUND, 0);
        BUG_ON(!async_wq);
        workqueue_set_min_active(async_wq, WQ_DFL_ACTIVE);
}






























   57 

   59 

   60 

   59 




   59 


   60 
   57 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// SPDX-License-Identifier: GPL-2.0
#include <linux/fault-inject.h>
#include <linux/debugfs.h>
#include <linux/error-injection.h>
#include <linux/mm.h>

/*
 * Configuration for page-allocation fault injection, consulted by
 * should_fail_alloc_page().
 */
static struct {
        /* generic fault-injection attributes passed to should_fail_ex() */
        struct fault_attr attr;

        /* when set, never fail allocations that have __GFP_HIGHMEM */
        bool ignore_gfp_highmem;
        /* when set, never fail allocations that have __GFP_DIRECT_RECLAIM */
        bool ignore_gfp_reclaim;
        /* allocations of an order below this are never failed */
        u32 min_order;
} fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_reclaim = true,
        .ignore_gfp_highmem = true,
        .min_order = 1,
};

/*
 * Handler for the "fail_page_alloc=" boot parameter: hand the option
 * string to setup_fault_attr() to configure fail_page_alloc.attr and
 * return its result.
 */
static int __init setup_fail_page_alloc(char *str)
{
        return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

/*
 * Decide whether a page allocation should be failed by fault injection.
 * @gfp_mask: GFP flags of the allocation
 * @order: allocation order
 *
 * Never fails an allocation whose order is below the configured min_order,
 * one that has __GFP_NOFAIL, or one excluded by the ignore_gfp_highmem /
 * ignore_gfp_reclaim settings. For everything else the decision is left to
 * should_fail_ex(), called with a size of 1 << @order and with
 * FAULT_NOWARN set when the allocation has __GFP_NOWARN.
 */
bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
        int fault_flags;

        if (order < fail_page_alloc.min_order || (gfp_mask & __GFP_NOFAIL))
                return false;
        if ((gfp_mask & __GFP_HIGHMEM) && fail_page_alloc.ignore_gfp_highmem)
                return false;
        if ((gfp_mask & __GFP_DIRECT_RECLAIM) &&
            fail_page_alloc.ignore_gfp_reclaim)
                return false;

        /* See comment in __should_failslab() */
        fault_flags = (gfp_mask & __GFP_NOWARN) ? FAULT_NOWARN : 0;

        return should_fail_ex(&fail_page_alloc.attr, 1 << order, fault_flags);
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * late_initcall: create the "fail_page_alloc" fault-injection debugfs
 * directory and add the knobs specific to page allocation. All files are
 * created with mode 0600. Always returns 0.
 */
static int __init fail_page_alloc_debugfs(void)
{
        umode_t mode = S_IFREG | 0600;
        struct dentry *dir;

        dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
                                        &fail_page_alloc.attr);

        /* note: the file is named "ignore-gfp-wait" but controls ignore_gfp_reclaim */
        debugfs_create_bool("ignore-gfp-wait", mode, dir,
                            &fail_page_alloc.ignore_gfp_reclaim);
        debugfs_create_bool("ignore-gfp-highmem", mode, dir,
                            &fail_page_alloc.ignore_gfp_highmem);
        debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);

        return 0;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */


















































































































    9 
    9 













































































































































































































    7 

    7 




















    7 







    6 











    7 



    7 























    7 
    6 








    7 
    7 







    7 








    6 
    6 



    7 
    7 































































































































   57 



   15 























   15 


    1 


   15 
    2 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * Copyright (C) 2017-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved.
 *
 * This driver produces cryptographically secure pseudorandom data. It is divided
 * into roughly six sections, each with a section header:
 *
 *   - Initialization and readiness waiting.
 *   - Fast key erasure RNG, the "crng".
 *   - Entropy accumulation and extraction routines.
 *   - Entropy collection routines.
 *   - Userspace reader/writer interfaces.
 *   - Sysctl interface.
 *
 * The high level overview is that there is one input pool, into which
 * various pieces of data are hashed. Prior to initialization, some of that
 * data is then "credited" as having a certain number of bits of entropy.
 * When enough bits of entropy are available, the hash is finalized and
 * handed as a key to a stream cipher that expands it indefinitely for
 * various consumers. This key is periodically refreshed as the various
 * entropy collectors, described below, add data to the input pool.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/utsname.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/workqueue.h>
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include <linux/siphash.h>
#include <linux/sched/isolation.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#ifdef CONFIG_VDSO_GETRANDOM
#include <vdso/getrandom.h>
#include <vdso/datapage.h>
#include <vdso/vsyscall.h>
#endif
#include <asm/archrandom.h>
#include <asm/processor.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/io.h>

/*********************************************************************
 *
 * Initialization and readiness waiting.
 *
 * Much of the RNG infrastructure is devoted to various dependencies
 * being able to wait until the RNG has collected enough entropy and
 * is ready for safe consumption.
 *
 *********************************************************************/

/*
 * crng_init records how far initialization of the RNG has progressed.
 * It is protected by base_crng.lock, and only increases its value
 * (from empty->early->ready).
 */
static enum {
        CRNG_EMPTY = 0, /* Little to no entropy collected */
        CRNG_EARLY = 1, /* At least POOL_EARLY_BITS collected */
        CRNG_READY = 2  /* Fully initialized with POOL_READY_BITS collected */
} crng_init __read_mostly = CRNG_EMPTY;
/* Static key for the crng_ready() fast path; enabled by crng_set_ready(). */
static DEFINE_STATIC_KEY_FALSE(crng_is_ready);
/* True once the static key is enabled or crng_init has reached CRNG_READY. */
#define crng_ready() (static_branch_likely(&crng_is_ready) || crng_init >= CRNG_READY)
/* Various types of waiters for crng_init->CRNG_READY transition. */
static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
static struct fasync_struct *fasync;
static ATOMIC_NOTIFIER_HEAD(random_ready_notifier);

/*
 * Control how we warn userspace. urandom_warning is the ratelimit state
 * whose miss count is reported by _credit_init_bits() once the crng is
 * ready. ratelimit_disable is a module parameter (mode 0644) that defaults
 * to 1 when CONFIG_WARN_ALL_UNSEEDED_RANDOM is enabled.
 */
static struct ratelimit_state urandom_warning =
        RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE);
static int ratelimit_disable __read_mostly =
        IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM);
module_param_named(ratelimit_disable, ratelimit_disable, int, 0644);
MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression");

/*
 * Report whether the input pool has been seeded, i.e. whether the crng has
 * reached CRNG_READY. Once that is the case, /dev/urandom, get_random_bytes()
 * and the get_random_{u8,u16,u32,u64,long} family of functions are guaranteed
 * to supply cryptographically secure random numbers.
 *
 * Returns: true if the input pool has been seeded.
 *          false if the input pool has not been seeded.
 */
bool rng_is_initialized(void)
{
        const bool seeded = crng_ready();

        return seeded;
}
EXPORT_SYMBOL(rng_is_initialized);

/*
 * Work item callback (queued by _credit_init_bits() as the "set_ready" work)
 * that enables the crng_is_ready static key, so that crng_ready() can take
 * its static-branch fast path. The work argument is unused.
 */
static void __cold crng_set_ready(struct work_struct *work)
{
        static_branch_enable(&crng_is_ready);
}

/*
 * Forward declaration: used by wait_for_random_bytes(), and considered an
 * entropy collector, defined further below.
 */
static void try_to_generate_entropy(void);

/*
 * Block until the input pool has been seeded and the crng can therefore
 * supply cryptographically secure random numbers. This matters for the
 * /dev/urandom device, get_random_bytes(), and the get_random_{u8,u16,u32,
 * u64,long} family of functions: using any of them without having called
 * this function first forfeits the guarantee of security.
 *
 * While waiting, the function actively tries to generate entropy and then
 * sleeps on crng_init_wait for up to one second (HZ jiffies) per round.
 *
 * Returns: 0 if the input pool has been seeded.
 *          -ERESTARTSYS if the function was interrupted by a signal.
 */
int wait_for_random_bytes(void)
{
        int status;

        for (;;) {
                if (crng_ready())
                        break;

                try_to_generate_entropy();
                status = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ);
                /* Positive: the condition became true before the timeout. */
                if (status > 0)
                        break;
                /* Negative: interrupted by a signal; hand the error back. */
                if (status < 0)
                        return status;
                /* Zero: timed out while still not ready, so go around again. */
        }
        return 0;
}
EXPORT_SYMBOL(wait_for_random_bytes);

/*
 * Arrange for a notifier callback to run once the crng is initialised. If
 * it is already initialised, the callback is invoked immediately, with
 * random_ready_notifier.lock held and interrupts disabled; otherwise the
 * notifier block is registered on the random_ready_notifier chain.
 *
 * Only use this if you are absolutely sure it is required. Most users should
 * instead be able to test `rng_is_initialized()` on demand, or make use of
 * `get_random_bytes_wait()`.
 *
 * Returns: 0 if the callback ran immediately, otherwise the result of
 *          raw_notifier_chain_register().
 */
int __cold execute_with_initialized_rng(struct notifier_block *nb)
{
        struct raw_notifier_head *chain =
                (struct raw_notifier_head *)&random_ready_notifier.head;
        unsigned long irq_state;
        int err = 0;

        spin_lock_irqsave(&random_ready_notifier.lock, irq_state);
        if (!crng_ready())
                err = raw_notifier_chain_register(chain, nb);
        else
                nb->notifier_call(nb, 0, NULL);
        spin_unlock_irqrestore(&random_ready_notifier.lock, irq_state);

        return err;
}

/*
 * Emit a notice naming the calling function and its return address when
 * randomness is requested before the crng is ready. This only does anything
 * when CONFIG_WARN_ALL_UNSEEDED_RANDOM is enabled.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and cannot capture an `else` belonging to the caller.
 */
#define warn_unseeded_randomness() \
        do { \
                if (IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM) && !crng_ready()) \
                        printk_deferred(KERN_NOTICE "random: %s called from %pS with crng_init=%d\n", \
                                        __func__, (void *)_RET_IP_, crng_init); \
        } while (0)


/*********************************************************************
 *
 * Fast key erasure RNG, the "crng".
 *
 * These functions expand entropy from the entropy extractor into
 * long streams for external consumption using the "fast key erasure"
 * RNG described at <https://blog.cr.yp.to/20170723-random.html>.
 *
 * There are a few exported interfaces for use by other drivers:
 *
 *        void get_random_bytes(void *buf, size_t len)
 *        u8 get_random_u8()
 *        u16 get_random_u16()
 *        u32 get_random_u32()
 *        u32 get_random_u32_below(u32 ceil)
 *        u32 get_random_u32_above(u32 floor)
 *        u32 get_random_u32_inclusive(u32 floor, u32 ceil)
 *        u64 get_random_u64()
 *        unsigned long get_random_long()
 *
 * These interfaces will return the requested number of random bytes
 * into the given buffer or as a return value. This is equivalent to
 * a read from /dev/urandom. The u8, u16, u32, u64, long family of
 * functions may be higher performance for one-off random integers,
 * because they do a bit of buffering and do not invoke reseeding
 * until the buffer is emptied.
 *
 *********************************************************************/

/*
 * Reseed intervals in jiffies: crng_reseed_interval() never returns less
 * than CRNG_RESEED_START_INTERVAL during early boot, and returns
 * CRNG_RESEED_INTERVAL afterwards.
 */
enum {
        CRNG_RESEED_START_INTERVAL = HZ,
        CRNG_RESEED_INTERVAL = 60 * HZ
};

/*
 * The base crng. crng_reseed() installs a fresh key extracted from the
 * input pool and bumps the generation counter; the per-cpu crngs compare
 * their own generation against this one to notice a reseed. The lock
 * protects the key and generation, as well as crng_init.
 */
static struct {
        u8 key[CHACHA_KEY_SIZE] __aligned(__alignof__(long));
        unsigned long generation;
        spinlock_t lock;
} base_crng = {
        .lock = __SPIN_LOCK_UNLOCKED(base_crng.lock)
};

/*
 * Per-cpu crng state: a private ChaCha key, the base_crng generation that
 * key was derived from, and a local lock guarding both.
 */
struct crng {
        u8 key[CHACHA_KEY_SIZE];
        unsigned long generation;
        local_lock_t lock;
};

/*
 * The generation starts at ULONG_MAX, a value base_crng.generation is never
 * set to by crng_reseed(), so a CPU's first use always rekeys from base_crng.
 */
static DEFINE_PER_CPU(struct crng, crngs) = {
        .generation = ULONG_MAX,
        .lock = INIT_LOCAL_LOCK(crngs.lock),
};

/*
 * Compute the delay, in jiffies, until the next reseeding. In steady state
 * this is CRNG_RESEED_INTERVAL. During early boot -- until the uptime
 * reaches twice that interval in seconds -- the delay is instead
 * proportional to the uptime (half of it), but never shorter than
 * CRNG_RESEED_START_INTERVAL.
 */
static unsigned int crng_reseed_interval(void)
{
        static bool still_booting = true;
        time64_t uptime;

        if (likely(!READ_ONCE(still_booting)))
                return CRNG_RESEED_INTERVAL;

        uptime = ktime_get_seconds();
        if (uptime < CRNG_RESEED_INTERVAL / HZ * 2)
                return max_t(unsigned int, CRNG_RESEED_START_INTERVAL,
                             (unsigned int)uptime / 2 * HZ);

        /* Early boot is over; take the fast path from now on. */
        WRITE_ONCE(still_booting, false);
        return CRNG_RESEED_INTERVAL;
}

/*
 * Forward declaration: used by crng_reseed() and crng_make_state() to
 * extract a new seed from the input pool.
 */
static void extract_entropy(void *buf, size_t len);

/*
 * This extracts a new crng key from the input pool, installs it as the
 * base_crng key, advances the generation counter and marks the crng ready.
 *
 * It doubles as the handler of a self-rearming delayed work item, which is
 * why it takes a work_struct argument; direct callers in this file pass
 * NULL, and the argument is not otherwise used.
 */
static void crng_reseed(struct work_struct *work)
{
        static DECLARE_DELAYED_WORK(next_reseed, crng_reseed);
        unsigned long flags;
        unsigned long next_gen;
        u8 key[CHACHA_KEY_SIZE];

        /* Immediately schedule the next reseeding, so that it fires sooner rather than later. */
        if (likely(system_unbound_wq))
                queue_delayed_work(system_unbound_wq, &next_reseed, crng_reseed_interval());

        /* Extract into a stack buffer first, outside of base_crng.lock. */
        extract_entropy(key, sizeof(key));

        /*
         * We copy the new key into the base_crng, overwriting the old one,
         * and update the generation counter. We avoid hitting ULONG_MAX,
         * because the per-cpu crngs are initialized to ULONG_MAX, so this
         * forces new CPUs that come online to always initialize.
         */
        spin_lock_irqsave(&base_crng.lock, flags);
        memcpy(base_crng.key, key, sizeof(base_crng.key));
        next_gen = base_crng.generation + 1;
        if (next_gen == ULONG_MAX)
                ++next_gen;
        WRITE_ONCE(base_crng.generation, next_gen);
#ifdef CONFIG_VDSO_GETRANDOM
        /*
         * base_crng.generation's invalid value is ULONG_MAX, while
         * vdso_k_rng_data->generation's invalid value is 0, so add one to the
         * former to arrive at the latter. Use smp_store_release so that this
         * is ordered with the write above to base_crng.generation. Pairs with
         * the smp_rmb() before the syscall in the vDSO code.
         *
         * Cast to unsigned long for 32-bit architectures, since atomic 64-bit
         * operations are not supported on those architectures. This is safe
         * because base_crng.generation is an unsigned long, and thus a 32-bit
         * value there. On big-endian architectures it will be stored in the
         * upper 32 bits, but that's okay because the vDSO side only checks
         * whether the value changed, without actually using or interpreting
         * the value.
         */
        smp_store_release((unsigned long *)&vdso_k_rng_data->generation, next_gen + 1);
#endif
        /* Once the static key is enabled, crng_init no longer needs updating. */
        if (!static_branch_likely(&crng_is_ready))
                crng_init = CRNG_READY;
        spin_unlock_irqrestore(&base_crng.lock, flags);
        /* Do not leave a copy of the new key on the stack. */
        memzero_explicit(key, sizeof(key));
}

/*
 * This generates a ChaCha block using the provided key, and then
 * immediately overwrites that key with half the block. It returns
 * the resultant ChaCha state to the user, along with the second
 * half of the block containing 32 bytes of random data that may
 * be used; random_data_len may not be greater than 32.
 *
 * The returned ChaCha state contains within it a copy of the old
 * key value, at index 4, so the state should always be zeroed out
 * immediately after using in order to maintain forward secrecy.
 * If the state cannot be erased in a timely manner, then it is
 * safer to set the random_data parameter to &chacha_state->x[4]
 * so that this function overwrites it before returning.
 *
 * @key:             in/out; used to generate the block, then replaced by
 *                   the first CHACHA_KEY_SIZE bytes of that block.
 * @chacha_state:    out; initialized here and advanced by one block.
 * @random_data:     out; receives random_data_len bytes taken from the
 *                   second half of the block.
 * @random_data_len: number of bytes to copy out; more than 32 is a BUG.
 */
static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE],
                                  struct chacha_state *chacha_state,
                                  u8 *random_data, size_t random_data_len)
{
        u8 first_block[CHACHA_BLOCK_SIZE];

        BUG_ON(random_data_len > 32);

        /* Build the state: constants, then the key at x[4], then zeroed x[12..15]. */
        chacha_init_consts(chacha_state);
        memcpy(&chacha_state->x[4], key, CHACHA_KEY_SIZE);
        memset(&chacha_state->x[12], 0, sizeof(u32) * 4);
        chacha20_block(chacha_state, first_block);

        /* First half of the block becomes the new key; second half is output. */
        memcpy(key, first_block, CHACHA_KEY_SIZE);
        memcpy(random_data, first_block + CHACHA_KEY_SIZE, random_data_len);
        memzero_explicit(first_block, sizeof(first_block));
}

/*
 * This function returns a ChaCha state that you may use for generating
 * random data. It also returns up to 32 bytes on its own of random data
 * that may be used; random_data_len may not be greater than 32.
 *
 * Before the crng is ready, the state is derived directly from base_crng
 * under base_crng.lock. Afterwards it is derived from this CPU's crng,
 * which is first rekeyed from base_crng if its generation is stale.
 *
 * @chacha_state:    out; the ChaCha state handed to the caller.
 * @random_data:     out; receives random_data_len bytes of random data.
 * @random_data_len: number of bytes wanted; more than 32 is a BUG.
 */
static void crng_make_state(struct chacha_state *chacha_state,
                            u8 *random_data, size_t random_data_len)
{
        unsigned long flags;
        struct crng *crng;

        BUG_ON(random_data_len > 32);

        /*
         * For the fast path, we check whether we're ready, unlocked first, and
         * then re-check once locked later. In the case where we're really not
         * ready, we do fast key erasure with the base_crng directly, extracting
         * when crng_init is CRNG_EMPTY.
         */
        if (!crng_ready()) {
                bool ready;

                spin_lock_irqsave(&base_crng.lock, flags);
                ready = crng_ready();
                if (!ready) {
                        if (crng_init == CRNG_EMPTY)
                                extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_fast_key_erasure(base_crng.key, chacha_state,
                                              random_data, random_data_len);
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
                /* Served from base_crng above; the per-cpu path is skipped. */
                if (!ready)
                        return;
        }

        local_lock_irqsave(&crngs.lock, flags);
        crng = raw_cpu_ptr(&crngs);

        /*
         * If our per-cpu crng is older than the base_crng, then it means
         * somebody reseeded the base_crng. In that case, we do fast key
         * erasure on the base_crng, and use its output as the new key
         * for our per-cpu crng. This brings us up to date with base_crng.
         */
        if (unlikely(crng->generation != READ_ONCE(base_crng.generation))) {
                spin_lock(&base_crng.lock);
                crng_fast_key_erasure(base_crng.key, chacha_state,
                                      crng->key, sizeof(crng->key));
                crng->generation = base_crng.generation;
                spin_unlock(&base_crng.lock);
        }

        /*
         * Finally, when we've made it this far, our per-cpu crng has an up
         * to date key, and we can do fast key erasure with it to produce
         * some random data and a ChaCha state for the caller. All other
         * branches of this function are "unlikely", so most of the time we
         * should wind up here immediately.
         */
        crng_fast_key_erasure(crng->key, chacha_state, random_data, random_data_len);
        local_unlock_irqrestore(&crngs.lock, flags);
}

/*
 * Fill buf with len random bytes. Up to the first 32 bytes come straight
 * out of crng_make_state(); the remainder is produced by running the
 * returned ChaCha state block by block. A trailing partial block goes
 * through a stack bounce buffer that is wiped afterwards, and the ChaCha
 * state is always zeroized before returning.
 */
static void _get_random_bytes(void *buf, size_t len)
{
        struct chacha_state state;
        u8 scratch[CHACHA_BLOCK_SIZE];
        size_t head_len;

        if (!len)
                return;

        head_len = min_t(size_t, 32, len);
        crng_make_state(&state, buf, head_len);
        len -= head_len;
        buf += head_len;

        /* Whole blocks are generated directly into the destination. */
        for (; len >= CHACHA_BLOCK_SIZE;
             len -= CHACHA_BLOCK_SIZE, buf += CHACHA_BLOCK_SIZE) {
                chacha20_block(&state, buf);
                /* Carry a wrapped x[12] into x[13]. */
                if (unlikely(state.x[12] == 0))
                        ++state.x[13];
        }

        /* Whatever is left is shorter than a block. */
        if (len) {
                chacha20_block(&state, scratch);
                memcpy(buf, scratch, len);
                memzero_explicit(scratch, sizeof(scratch));
        }

        chacha_zeroize_state(&state);
}

/*
 * This returns random bytes in arbitrary quantities. The quality of the
 * random bytes is as good as /dev/urandom. In order to ensure that the
 * randomness provided by this function is okay, the function
 * wait_for_random_bytes() should be called and return 0 at least once
 * at any point prior.
 *
 * @buf: destination buffer.
 * @len: number of bytes to write into buf.
 */
void get_random_bytes(void *buf, size_t len)
{
        /* Optionally log callers that ask for randomness before the crng is ready. */
        warn_unseeded_randomness();
        _get_random_bytes(buf, len);
}
EXPORT_SYMBOL(get_random_bytes);

/*
 * Copy random bytes to the destination described by iter until it is full,
 * a copy comes up short, or a signal is found pending at a page boundary.
 *
 * Returns: the number of bytes copied, 0 if iter was empty to begin with,
 *          or -EFAULT if nothing could be copied.
 */
static ssize_t get_random_bytes_user(struct iov_iter *iter)
{
        struct chacha_state chacha_state;
        u8 block[CHACHA_BLOCK_SIZE];
        size_t ret = 0, copied;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /*
         * Immediately overwrite the ChaCha key at index 4 with random
         * bytes, in case userspace causes copy_to_iter() below to sleep
         * forever, so that we still retain forward secrecy in that case.
         */
        crng_make_state(&chacha_state, (u8 *)&chacha_state.x[4],
                        CHACHA_KEY_SIZE);
        /*
         * However, if we're doing a read of len <= 32, we don't need to
         * use chacha_state after, so we can simply return those bytes to
         * the user directly.
         */
        if (iov_iter_count(iter) <= CHACHA_KEY_SIZE) {
                ret = copy_to_iter(&chacha_state.x[4], CHACHA_KEY_SIZE, iter);
                goto out_zero_chacha;
        }

        for (;;) {
                chacha20_block(&chacha_state, block);
                /* Carry a wrapped x[12] into x[13]. */
                if (unlikely(chacha_state.x[12] == 0))
                        ++chacha_state.x[13];

                copied = copy_to_iter(block, sizeof(block), iter);
                ret += copied;
                /* Stop when the request is satisfied or the copy fell short. */
                if (!iov_iter_count(iter) || copied != sizeof(block))
                        break;

                /* Once per page of output, check for signals and offer to reschedule. */
                BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0);
                if (ret % PAGE_SIZE == 0) {
                        if (signal_pending(current))
                                break;
                        cond_resched();
                }
        }

        memzero_explicit(block, sizeof(block));
out_zero_chacha:
        chacha_zeroize_state(&chacha_state);
        return ret ? ret : -EFAULT;
}

/*
 * Batched entropy returns random integers. The quality of the random
 * number is as good as /dev/urandom. In order to ensure that the randomness
 * provided by this function is okay, the function wait_for_random_bytes()
 * should be called and return 0 at least once at any point prior.
 *
 * DEFINE_BATCHED_ENTROPY(type) defines, for the given integer type, a
 * per-cpu batch structure and the exported function get_random_<type>().
 * Before the crng is ready, that function bypasses the batch and calls
 * _get_random_bytes() directly. Afterwards it hands out values from the
 * per-cpu batch, refilling it when it is exhausted or when
 * base_crng.generation no longer matches the generation the batch was
 * filled under. Each value is zeroed in the batch as it is handed out.
 */

#define DEFINE_BATCHED_ENTROPY(type)                                                \
struct batch_ ##type {                                                                \
        /*                                                                        \
         * We make this 1.5x a ChaCha block, so that we get the                        \
         * remaining 32 bytes from fast key erasure, plus one full                \
         * block from the detached ChaCha state. We can increase                \
         * the size of this later if needed so long as we keep the                \
         * formula of (integer_blocks + 0.5) * CHACHA_BLOCK_SIZE.                \
         */                                                                        \
        type entropy[CHACHA_BLOCK_SIZE * 3 / (2 * sizeof(type))];                \
        local_lock_t lock;                                                        \
        unsigned long generation;                                                \
        unsigned int position;                                                        \
};                                                                                \
                                                                                \
static DEFINE_PER_CPU(struct batch_ ##type, batched_entropy_ ##type) = {        \
        .lock = INIT_LOCAL_LOCK(batched_entropy_ ##type.lock),                        \
        .position = UINT_MAX                                                        \
};                                                                                \
                                                                                \
type get_random_ ##type(void)                                                        \
{                                                                                \
        type ret;                                                                \
        unsigned long flags;                                                        \
        struct batch_ ##type *batch;                                                \
        unsigned long next_gen;                                                        \
                                                                                \
        warn_unseeded_randomness();                                                \
                                                                                \
        if  (!crng_ready()) {                                                        \
                _get_random_bytes(&ret, sizeof(ret));                                \
                return ret;                                                        \
        }                                                                        \
                                                                                \
        local_lock_irqsave(&batched_entropy_ ##type.lock, flags);                \
        batch = raw_cpu_ptr(&batched_entropy_##type);                                \
                                                                                \
        next_gen = READ_ONCE(base_crng.generation);                                \
        if (batch->position >= ARRAY_SIZE(batch->entropy) ||                        \
            next_gen != batch->generation) {                                        \
                _get_random_bytes(batch->entropy, sizeof(batch->entropy));        \
                batch->position = 0;                                                \
                batch->generation = next_gen;                                        \
        }                                                                        \
                                                                                \
        ret = batch->entropy[batch->position];                                        \
        batch->entropy[batch->position] = 0;                                        \
        ++batch->position;                                                        \
        local_unlock_irqrestore(&batched_entropy_ ##type.lock, flags);                \
        return ret;                                                                \
}                                                                                \
EXPORT_SYMBOL(get_random_ ##type);

/* Instantiate get_random_u8(), get_random_u16(), get_random_u32() and get_random_u64(). */
DEFINE_BATCHED_ENTROPY(u8)
DEFINE_BATCHED_ENTROPY(u16)
DEFINE_BATCHED_ENTROPY(u32)
DEFINE_BATCHED_ENTROPY(u64)

/*
 * Return a random integer in [0, ceil), or a full random u32 when ceil is 0.
 *
 * This is the slow path for variable ceil. It is still fast, most of the
 * time, by doing traditional reciprocal multiplication and opportunistically
 * comparing the lower half to ceil itself, before falling back to computing
 * a larger bound, and then rejecting samples whose lower half would indicate
 * a range indivisible by ceil. The use of `-ceil % ceil` is analogous to
 * `2^32 % ceil`, but is computable in 32-bits.
 */
u32 __get_random_u32_below(u32 ceil)
{
        u32 sample = get_random_u32();
        u64 product;
        u32 threshold;

        /*
         * This function is technically undefined for ceil == 0, and in fact
         * for the non-underscored constant version in the header, we build bug
         * on that. But for the non-constant case, it's convenient to have that
         * evaluate to being a straight call to get_random_u32(), so that
         * get_random_u32_inclusive() can work over its whole range without
         * undefined behavior.
         */
        if (unlikely(!ceil))
                return sample;

        product = (u64)ceil * sample;

        /* Common case: the low half cannot fall in the rejected range. */
        if (likely((u32)product >= ceil))
                return product >> 32;

        /* Rare case: compute the exact rejection bound and resample below it. */
        threshold = -ceil % ceil;
        while (unlikely((u32)product < threshold))
                product = (u64)ceil * get_random_u32();

        return product >> 32;
}
EXPORT_SYMBOL(__get_random_u32_below);

#ifdef CONFIG_SMP
/*
 * CPU hotplug callback, run while a CPU is coming up, at the
 * CPUHP_RANDOM_PREPARE step, which comes before CPUHP_WORKQUEUE_PREP.
 *
 * It invalidates that CPU's crng and every one of its batches, so that the
 * CPU serves fresh randomness once it is back online. Always returns 0.
 */
int __cold random_prepare_cpu(unsigned int cpu)
{
        struct crng *stale_crng = per_cpu_ptr(&crngs, cpu);
        struct batch_u8 *stale_u8 = per_cpu_ptr(&batched_entropy_u8, cpu);
        struct batch_u16 *stale_u16 = per_cpu_ptr(&batched_entropy_u16, cpu);
        struct batch_u32 *stale_u32 = per_cpu_ptr(&batched_entropy_u32, cpu);
        struct batch_u64 *stale_u64 = per_cpu_ptr(&batched_entropy_u64, cpu);

        /* ULONG_MAX never matches base_crng.generation, forcing a rekey. */
        stale_crng->generation = ULONG_MAX;

        /* UINT_MAX is past the end of every batch, forcing a refill. */
        stale_u8->position = UINT_MAX;
        stale_u16->position = UINT_MAX;
        stale_u32->position = UINT_MAX;
        stale_u64->position = UINT_MAX;

        return 0;
}
#endif


/**********************************************************************
 *
 * Entropy accumulation and extraction routines.
 *
 * Callers may add entropy via:
 *
 *     static void mix_pool_bytes(const void *buf, size_t len)
 *
 * After which, if added entropy should be credited:
 *
 *     static void credit_init_bits(size_t bits)
 *
 * Finally, extract entropy via:
 *
 *     static void extract_entropy(void *buf, size_t len)
 *
 **********************************************************************/

/*
 * Sizes, in bits, used for entropy accounting: the pool holds one BLAKE2s
 * hash worth of bits, and the two thresholds mark the crng_init transitions.
 */
enum {
        POOL_BITS = BLAKE2S_HASH_SIZE * 8,
        POOL_READY_BITS = POOL_BITS, /* When crng_init->CRNG_READY */
        POOL_EARLY_BITS = POOL_READY_BITS / 2 /* When crng_init->CRNG_EARLY */
};

/*
 * The input pool: a running BLAKE2s hash state into which collected data is
 * mixed, the lock serializing access to that hash, and init_bits, the number
 * of credited bits (updated with try_cmpxchg() in _credit_init_bits() and
 * capped at POOL_BITS). The hash state is statically initialized for an
 * output length of BLAKE2S_HASH_SIZE.
 */
static struct {
        struct blake2s_state hash;
        spinlock_t lock;
        unsigned int init_bits;
} input_pool = {
        .hash.h = { BLAKE2S_IV0 ^ (0x01010000 | BLAKE2S_HASH_SIZE),
                    BLAKE2S_IV1, BLAKE2S_IV2, BLAKE2S_IV3, BLAKE2S_IV4,
                    BLAKE2S_IV5, BLAKE2S_IV6, BLAKE2S_IV7 },
        .hash.outlen = BLAKE2S_HASH_SIZE,
        .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
};

/*
 * Hash len bytes from buf into the input pool without taking
 * input_pool.lock. mix_pool_bytes() is the variant that takes the lock;
 * callers such as random_pm_notification() take it themselves around
 * several calls.
 */
static void _mix_pool_bytes(const void *buf, size_t len)
{
        blake2s_update(&input_pool.hash, buf, len);
}

/*
 * Hash len bytes from buf into the input pool, holding input_pool.lock
 * with interrupts disabled for the duration. The initialization bit
 * counter is left untouched; callers that want the data credited should
 * follow up with credit_init_bits().
 */
static void mix_pool_bytes(const void *buf, size_t len)
{
        unsigned long irq_state;

        spin_lock_irqsave(&input_pool.lock, irq_state);
        blake2s_update(&input_pool.hash, buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irq_state);
}

/*
 * This is an HKDF-like construction for using the hashed collected entropy
 * as a PRF key, that's then expanded block-by-block.
 *
 * The input pool hash is finalized into a seed and the pool is immediately
 * re-keyed from a value derived from that seed. The seed is then expanded
 * into the caller's buffer, BLAKE2S_HASH_SIZE bytes at a time, using an
 * incrementing counter.
 *
 * @buf: destination for the extracted bytes.
 * @len: number of bytes to produce.
 */
static void extract_entropy(void *buf, size_t len)
{
        unsigned long flags;
        u8 seed[BLAKE2S_HASH_SIZE], next_key[BLAKE2S_HASH_SIZE];
        struct {
                unsigned long rdseed[32 / sizeof(long)];
                size_t counter;
        } block;
        size_t i, longs;

        /*
         * Fill block.rdseed, preferring arch seed longs, then arch random
         * longs, and falling back to random_get_entropy() one long at a time.
         */
        for (i = 0; i < ARRAY_SIZE(block.rdseed);) {
                longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
                if (longs) {
                        i += longs;
                        continue;
                }
                longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
                if (longs) {
                        i += longs;
                        continue;
                }
                block.rdseed[i++] = random_get_entropy();
        }

        spin_lock_irqsave(&input_pool.lock, flags);

        /* seed = HASHPRF(last_key, entropy_input) */
        blake2s_final(&input_pool.hash, seed);

        /* next_key = HASHPRF(seed, RDSEED || 0) */
        block.counter = 0;
        blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed));
        blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key));

        spin_unlock_irqrestore(&input_pool.lock, flags);
        memzero_explicit(next_key, sizeof(next_key));

        /* The expansion below works on local copies only, so it runs unlocked. */
        while (len) {
                i = min_t(size_t, len, BLAKE2S_HASH_SIZE);
                /* output = HASHPRF(seed, RDSEED || ++counter) */
                ++block.counter;
                blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed));
                len -= i;
                buf += i;
        }

        /* Wipe the secrets left on the stack. */
        memzero_explicit(seed, sizeof(seed));
        memzero_explicit(&block, sizeof(block));
}

/*
 * Credit bits of entropy towards initialization, but only while the crng
 * is not yet ready; afterwards this is a no-op.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and cannot capture an `else` belonging to the caller.
 */
#define credit_init_bits(bits)                                \
        do {                                                \
                if (!crng_ready())                        \
                        _credit_init_bits(bits);        \
        } while (0)

/*
 * Add bits to input_pool.init_bits, saturating at POOL_BITS, and act on
 * the thresholds crossed by this particular update:
 *
 *   - crossing POOL_READY_BITS reseeds the crng (which marks it ready),
 *     queues the static key update, and notifies every kind of waiter;
 *   - otherwise, crossing POOL_EARLY_BITS keys base_crng from the input
 *     pool and moves crng_init to CRNG_EARLY.
 *
 * Normally reached through the credit_init_bits() macro, which skips the
 * call once the crng is ready.
 */
static void __cold _credit_init_bits(size_t bits)
{
        static DECLARE_WORK(set_ready, crng_set_ready);
        unsigned int new, orig, add;
        unsigned long flags;
        int m;

        if (!bits)
                return;

        add = min_t(size_t, bits, POOL_BITS);

        /* Lock-free saturating add; orig/new are the values this call swapped. */
        orig = READ_ONCE(input_pool.init_bits);
        do {
                new = min_t(unsigned int, POOL_BITS, orig + add);
        } while (!try_cmpxchg(&input_pool.init_bits, &orig, new));

        if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) {
                crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */
                /* Enabling the static key is deferred to a workqueue, if one is usable yet. */
                if (static_key_initialized && system_unbound_wq)
                        queue_work(system_unbound_wq, &set_ready);
                atomic_notifier_call_chain(&random_ready_notifier, 0, NULL);
#ifdef CONFIG_VDSO_GETRANDOM
                WRITE_ONCE(vdso_k_rng_data->is_ready, true);
#endif
                wake_up_interruptible(&crng_init_wait);
                kill_fasync(&fasync, SIGIO, POLL_IN);
                pr_notice("crng init done\n");
                m = ratelimit_state_get_miss(&urandom_warning);
                if (m)
                        pr_notice("%d urandom warning(s) missed due to ratelimiting\n", m);
        } else if (orig < POOL_EARLY_BITS && new >= POOL_EARLY_BITS) {
                spin_lock_irqsave(&base_crng.lock, flags);
                /* Check if crng_init is CRNG_EMPTY, to avoid race with crng_reseed(). */
                if (crng_init == CRNG_EMPTY) {
                        extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_init = CRNG_EARLY;
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
        }
}


/**********************************************************************
 *
 * Entropy collection routines.
 *
 * The following exported functions are used for pushing entropy into
 * the above entropy accumulation routines:
 *
 *        void add_device_randomness(const void *buf, size_t len);
 *        void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after);
 *        void add_bootloader_randomness(const void *buf, size_t len);
 *        void add_vmfork_randomness(const void *unique_vm_id, size_t len);
 *        void add_interrupt_randomness(int irq);
 *        void add_input_randomness(unsigned int type, unsigned int code, unsigned int value);
 *        void add_disk_randomness(struct gendisk *disk);
 *
 * add_device_randomness() adds data to the input pool that
 * is likely to differ between two devices (or possibly even per boot).
 * This would be things like MAC addresses or serial numbers, or the
 * read-out of the RTC. This does *not* credit any actual entropy to
 * the pool, but it initializes the pool to different values for devices
 * that might otherwise be identical and have very little entropy
 * available to them (particularly common in the embedded world).
 *
 * add_hwgenerator_randomness() is for true hardware RNGs, and will credit
 * entropy as specified by the caller. If the entropy pool is full it will
 * block until more entropy is needed.
 *
 * add_bootloader_randomness() is called by bootloader drivers, such as EFI
 * and device tree, and credits its input depending on whether or not the
 * command line option 'random.trust_bootloader' is set.
 *
 * add_vmfork_randomness() adds a unique (but not necessarily secret) ID
 * representing the current instance of a VM to the pool, without crediting,
 * and then force-reseeds the crng so that it takes effect immediately.
 *
 * add_interrupt_randomness() uses the interrupt timing as random
 * inputs to the entropy pool. Using the cycle counters and the irq source
 * as inputs, it feeds the input pool roughly once a second or after 64
 * interrupts, crediting 1 bit of entropy for whichever comes first.
 *
 * add_input_randomness() uses the input layer interrupt timing, as well
 * as the event type information from the hardware.
 *
 * add_disk_randomness() uses what amounts to the seek time of block
 * layer request events, on a per-disk_devt basis, as input to the
 * entropy pool. Note that high-speed solid state drives with very low
 * seek times do not make for good sources of entropy, as their seek
 * times are usually fairly consistent.
 *
 * The last two routines try to estimate how many bits of entropy
 * to credit. They do this by keeping track of the first and second
 * order deltas of the event timings.
 *
 **********************************************************************/

/* Gates crediting of architecture-provided random longs in random_init_early(). */
static bool trust_cpu __initdata = true;
/* Gates crediting of the seed passed to add_bootloader_randomness(). */
static bool trust_bootloader __initdata = true;

/* Parse the boolean argument of "random.trust_cpu" into trust_cpu. */
static int __init parse_trust_cpu(char *arg)
{
        int err = kstrtobool(arg, &trust_cpu);

        return err;
}

/* Parse the boolean argument of "random.trust_bootloader" into trust_bootloader. */
static int __init parse_trust_bootloader(char *arg)
{
        int err = kstrtobool(arg, &trust_bootloader);

        return err;
}

early_param("random.trust_cpu", parse_trust_cpu);
early_param("random.trust_bootloader", parse_trust_bootloader);

static int random_pm_notification(struct notifier_block *nb, unsigned long action, void *data)
{
        unsigned long flags, entropy = random_get_entropy();

        /*
         * Encode a representation of how long the system has been suspended,
         * in a way that is distinct from prior system suspends.
         */
        ktime_t stamps[] = { ktime_get(), ktime_get_boottime(), ktime_get_real() };

        spin_lock_irqsave(&input_pool.lock, flags);
        _mix_pool_bytes(&action, sizeof(action));
        _mix_pool_bytes(stamps, sizeof(stamps));
        _mix_pool_bytes(&entropy, sizeof(entropy));
        spin_unlock_irqrestore(&input_pool.lock, flags);

        if (crng_ready() && (action == PM_RESTORE_PREPARE ||
            (action == PM_POST_SUSPEND && !IS_ENABLED(CONFIG_PM_AUTOSLEEP) &&
             !IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP)))) {
                crng_reseed(NULL);
                pr_notice("crng reseeded on system resumption\n");
        }
        return 0;
}

/* Notifier block that random_init() registers with register_pm_notifier(). */
static struct notifier_block pm_notifier = {
        .notifier_call = random_pm_notification,
};

/*
 * This is called extremely early, before time keeping functionality is
 * available, but arch randomness is. Interrupts are not yet enabled.
 */
void __init random_init_early(const char *command_line)
{
        unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)];
        size_t filled, got, arch_bits;

#if defined(LATENT_ENTROPY_PLUGIN)
        static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy;
        _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed));
#endif

        /*
         * Try to collect one block's worth of longs from the architecture.
         * arch_bits starts at the full block size and is reduced by one
         * long's worth of bits for every slot that neither source could fill.
         */
        arch_bits = sizeof(entropy) * 8;
        filled = 0;
        while (filled < ARRAY_SIZE(entropy)) {
                /* Ask for seed longs first; fall back to regular arch longs. */
                got = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - filled);
                if (!got)
                        got = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - filled);

                if (!got) {
                        /* Neither call delivered anything: skip one slot. */
                        arch_bits -= sizeof(*entropy) * 8;
                        ++filled;
                        continue;
                }

                _mix_pool_bytes(entropy, sizeof(*entropy) * got);
                filled += got;
        }

        /* Not credited, but makes the pool differ per kernel build and boot arguments. */
        _mix_pool_bytes(init_utsname(), sizeof(*(init_utsname())));
        _mix_pool_bytes(command_line, strlen(command_line));

        /* Already seeded by an earlier phase: just reseed with the new input. */
        if (crng_ready()) {
                crng_reseed(NULL);
                return;
        }

        /* Otherwise credit what the architecture supplied, if it is trusted. */
        if (trust_cpu)
                _credit_init_bits(arch_bits);
}

/*
 * This is called a little bit after the prior function, and now there is
 * access to timestamps counters. Interrupts are not yet enabled.
 */
void __init random_init(void)
{
        unsigned long cycles = random_get_entropy();
        ktime_t wall = ktime_get_real();
        int err;

        /* Mix the wall-clock time and a cycle-counter sample, uncredited. */
        _mix_pool_bytes(&wall, sizeof(wall));
        _mix_pool_bytes(&cycles, sizeof(cycles));
        add_latent_entropy();

        /*
         * crng_init may already have reached CRNG_READY (seeded by the cpu
         * or bootloader) before the static branch could be flipped. Flip it
         * now if that is the case.
         */
        if (!static_branch_likely(&crng_is_ready)) {
                if (crng_init >= CRNG_READY)
                        crng_set_ready(NULL);
        }

        /* If an earlier phase seeded us, fold the new input in immediately. */
        if (crng_ready())
                crng_reseed(NULL);

        err = register_pm_notifier(&pm_notifier);
        WARN_ON(err);

        /* A zero sample is treated as "no usable counter at all". */
        WARN(!cycles, "Missing cycle counter and fallback timer; RNG "
                      "entropy collection will consequently suffer.");
}

/*
 * Add device- or boot-specific data to the input pool to help
 * initialize it.
 *
 * None of this adds any entropy; it is meant to avoid the problem of
 * the entropy pool having similar initial state across largely
 * identical devices.
 */
void add_device_randomness(const void *buf, size_t len)
{
        unsigned long irq_state;
        unsigned long cycles = random_get_entropy();

        /*
         * Mix a cycle-counter sample followed by the caller's bytes under
         * the input pool lock. No entropy is credited for either.
         */
        spin_lock_irqsave(&input_pool.lock, irq_state);
        _mix_pool_bytes(&cycles, sizeof(cycles));
        _mix_pool_bytes(buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irq_state);
}
EXPORT_SYMBOL(add_device_randomness);

/*
 * Interface for in-kernel drivers of true hardware RNGs. Those devices
 * may produce endless random bits, so this function will sleep for
 * some amount of time after, if the sleep_after parameter is true.
 */
void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after)
{
        /* Mix the hardware bytes in and credit what the caller vouches for. */
        mix_pool_bytes(buf, len);
        credit_init_bits(entropy);

        /* The caller did not ask for throttling, or its kthread is stopping. */
        if (!sleep_after || kthread_should_stop())
                return;

        /*
         * While the crng is not ready and this call credited entropy, keep
         * the producer running at full speed.
         */
        if (!crng_ready() && entropy)
                return;

        /* Otherwise throttle the producer to one write per reseed interval. */
        schedule_timeout_interruptible(crng_reseed_interval());
}
EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);

/*
 * Handle random seed passed by bootloader, and credit it depending
 * on the command line option 'random.trust_bootloader'.
 */
void __init add_bootloader_randomness(const void *buf, size_t len)
{
        /* The seed is always mixed in; only the crediting is conditional. */
        mix_pool_bytes(buf, len);

        if (!trust_bootloader)
                return;

        /* Trusted: credit eight bits per seed byte. */
        credit_init_bits(len * 8);
}

#if IS_ENABLED(CONFIG_VMGENID)
/* Callbacks to run after a new VM ID has been mixed in. */
static BLOCKING_NOTIFIER_HEAD(vmfork_chain);

/*
 * Take in a new VM instance ID. The ID is unique but not secret, so it
 * is mixed in without credit; if the crng is already up it is reseeded
 * right away so the ID influences output immediately. Registered
 * vmfork notifiers are then called.
 */
void __cold add_vmfork_randomness(const void *unique_vm_id, size_t len)
{
        add_device_randomness(unique_vm_id, len);

        if (crng_ready()) {
                crng_reseed(NULL);
                pr_notice("crng reseeded due to virtual machine fork\n");
        }

        blocking_notifier_call_chain(&vmfork_chain, 0, NULL);
}
#if IS_MODULE(CONFIG_VMGENID)
EXPORT_SYMBOL_GPL(add_vmfork_randomness);
#endif

/* Add @nb to the chain called by add_vmfork_randomness(). */
int __cold register_random_vmfork_notifier(struct notifier_block *nb)
{
        int ret = blocking_notifier_chain_register(&vmfork_chain, nb);

        return ret;
}
EXPORT_SYMBOL_GPL(register_random_vmfork_notifier);

/* Remove @nb from the chain called by add_vmfork_randomness(). */
int __cold unregister_random_vmfork_notifier(struct notifier_block *nb)
{
        int ret = blocking_notifier_chain_unregister(&vmfork_chain, nb);

        return ret;
}
EXPORT_SYMBOL_GPL(unregister_random_vmfork_notifier);
#endif

/* Per-CPU accumulator for interrupt timing samples (see irq_randomness). */
struct fast_pool {
        /* Four-word mixing state that fast_mix() operates on. */
        unsigned long pool[4];
        /* jiffies value stored by mix_interrupt_randomness() when it drains the pool. */
        unsigned long last;
        /*
         * Event tally since the last drain. add_interrupt_randomness() also
         * keeps its MIX_INFLIGHT flag in bit 31 of this field.
         */
        unsigned int count;
        /* Timer whose callback is mix_interrupt_randomness(). */
        struct timer_list mix;
};

/* Timer callback; declared early because the per-CPU initializer below refers to it. */
static void mix_interrupt_randomness(struct timer_list *work);

/*
 * One fast pool per CPU. The pool words start out as the SipHash
 * constants on 64-bit builds and as the HalfSipHash constants otherwise;
 * FASTMIX_PERM is bound to the matching permutation for fast_mix().
 */
static DEFINE_PER_CPU(struct fast_pool, irq_randomness) = {
#ifdef CONFIG_64BIT
#define FASTMIX_PERM SIPHASH_PERMUTATION
        .pool = { SIPHASH_CONST_0, SIPHASH_CONST_1, SIPHASH_CONST_2, SIPHASH_CONST_3 },
#else
#define FASTMIX_PERM HSIPHASH_PERMUTATION
        .pool = { HSIPHASH_CONST_0, HSIPHASH_CONST_1, HSIPHASH_CONST_2, HSIPHASH_CONST_3 },
#endif
        /* The timer runs mix_interrupt_randomness() with flags 0. */
        .mix = __TIMER_INITIALIZER(mix_interrupt_randomness, 0)
};

/*
 * This is [Half]SipHash-1-x, starting from an empty key. Because
 * the key is fixed, it assumes that its inputs are non-malicious,
 * and therefore this has no security on its own. s represents the
 * four-word SipHash state, while v represents a two-word input.
 */
static void fast_mix(unsigned long s[4], unsigned long v1, unsigned long v2)
{
        s[3] ^= v1;
        FASTMIX_PERM(s[0], s[1], s[2], s[3]);
        s[0] ^= v1;
        s[3] ^= v2;
        FASTMIX_PERM(s[0], s[1], s[2], s[3]);
        s[0] ^= v2;
}

#ifdef CONFIG_SMP
/*
 * This function is called when the CPU has just come online, with
 * entry CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE.
 */
int __cold random_online_cpu(unsigned int cpu)
{
        struct fast_pool *fast_pool = per_cpu_ptr(&irq_randomness, cpu);

        /*
         * Zeroing count discards the interrupt tally and also clears the
         * MIX_INFLIGHT bit that add_interrupt_randomness() keeps in it.
         * If a mix armed around a hotplug transition runs on a different
         * CPU, mix_interrupt_randomness() returns early without resetting
         * count, which would leave the flag stuck. Resetting it here lets
         * the freshly onlined CPU start accumulating interrupts from a
         * clean state.
         */
        fast_pool->count = 0;
        return 0;
}
#endif

static void mix_interrupt_randomness(struct timer_list *work)
{
        struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
        /*
         * Only two of the four pool words are ever copied out. The half
         * that is not ingested stays behind in the fast pool and carries
         * over into subsequent mixing.
         */
        unsigned long snapshot[2];
        unsigned int irqs, credit;

        /* With irqs off, make sure this really is the local CPU's pool. */
        local_irq_disable();
        if (fast_pool != this_cpu_ptr(&irq_randomness)) {
                /* Running on the wrong CPU (hotplug): leave the pool alone. */
                local_irq_enable();
                return;
        }

        /*
         * Take a consistent snapshot and reset the accounting before
         * interrupts can touch the fast pool again. Resetting count also
         * clears the in-flight bit kept in it.
         */
        memcpy(snapshot, fast_pool->pool, sizeof(snapshot));
        irqs = fast_pool->count;
        fast_pool->count = 0;
        fast_pool->last = jiffies;
        local_irq_enable();

        /*
         * Credit one bit per 64 counted events (low 16 bits of the tally),
         * at least 1 bit and at most the bit width of the snapshot.
         */
        mix_pool_bytes(snapshot, sizeof(snapshot));
        credit = clamp_t(unsigned int, (irqs & U16_MAX) / 64, 1, sizeof(snapshot) * 8);
        credit_init_bits(credit);

        memzero_explicit(snapshot, sizeof(snapshot));
}

void add_interrupt_randomness(int irq)
{
        enum { MIX_INFLIGHT = 1U << 31 };
        unsigned long cycles = random_get_entropy();
        struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
        struct pt_regs *regs = get_irq_regs();
        /* Interrupted instruction pointer if regs are available, else our return address. */
        unsigned long where = regs ? instruction_pointer(regs) : _RET_IP_;
        unsigned int tally;

        /* Mix the cycle sample and the location/irq word into this CPU's pool. */
        fast_mix(fast_pool->pool, cycles, where ^ swab(irq));
        tally = ++fast_pool->count;

        /* A drain has already been requested and has not run yet. */
        if (tally & MIX_INFLIGHT)
                return;

        /*
         * Request a drain once 1024 events have been counted, or once
         * fast_pool->last + HZ lies in the past, whichever happens first.
         */
        if (tally >= 1024 || time_is_before_jiffies(fast_pool->last + HZ)) {
                fast_pool->count |= MIX_INFLIGHT;
                if (timer_pending(&fast_pool->mix))
                        return;

                /* Arm the mix timer on this CPU with an expiry of "now". */
                fast_pool->mix.expires = jiffies;
                add_timer_on(&fast_pool->mix, raw_smp_processor_id());
        }
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);

/* There is one of these per entropy source */
struct timer_rand_state {
        /* jiffies value recorded by add_timer_randomness() at the previous event. */
        unsigned long last_time;
        /* Previous first-order and second-order timing differences. */
        long last_delta, last_delta2;
};

/*
 * This function adds entropy to the entropy "pool" by using timing
 * delays. It uses the timer_rand_state structure to make an estimate
 * of how many bits of entropy this call has added to the pool. The
 * value "num" is also added to the pool; it should somehow describe
 * the type of event that just happened.
 */
static void add_timer_randomness(struct timer_rand_state *state, unsigned int num)
{
        /* Sample the cycle counter and jiffies once, up front. */
        unsigned long entropy = random_get_entropy(), now = jiffies, flags;
        long delta, delta2, delta3;
        unsigned int bits;

        /*
         * If we're in a hard IRQ, add_interrupt_randomness() will be called
         * sometime after, so mix into the fast pool.
         */
        if (in_hardirq()) {
                fast_mix(this_cpu_ptr(&irq_randomness)->pool, entropy, num);
        } else {
                /* Otherwise mix straight into the input pool, under its lock. */
                spin_lock_irqsave(&input_pool.lock, flags);
                _mix_pool_bytes(&entropy, sizeof(entropy));
                _mix_pool_bytes(&num, sizeof(num));
                spin_unlock_irqrestore(&input_pool.lock, flags);
        }

        /*
         * Once the crng is ready the sample has still been mixed in above,
         * but no estimate is computed and nothing is credited.
         */
        if (crng_ready())
                return;

        /*
         * Calculate number of bits of randomness we probably added.
         * We take into account the first, second and third-order deltas
         * in order to make our estimate.
         */
        /* Each step reads the stored previous value and replaces it with the new one. */
        delta = now - READ_ONCE(state->last_time);
        WRITE_ONCE(state->last_time, now);

        delta2 = delta - READ_ONCE(state->last_delta);
        WRITE_ONCE(state->last_delta, delta);

        delta3 = delta2 - READ_ONCE(state->last_delta2);
        WRITE_ONCE(state->last_delta2, delta2);

        /* Take absolute values, then keep the smallest of the three in delta. */
        if (delta < 0)
                delta = -delta;
        if (delta2 < 0)
                delta2 = -delta2;
        if (delta3 < 0)
                delta3 = -delta3;
        if (delta > delta2)
                delta = delta2;
        if (delta > delta3)
                delta = delta3;

        /*
         * delta is now minimum absolute delta. Round down by 1 bit
         * on general principles, and limit entropy estimate to 11 bits.
         */
        bits = min(fls(delta >> 1), 11);

        /*
         * As mentioned above, if we're in a hard IRQ, add_interrupt_randomness()
         * will run after this, which uses a different crediting scheme of 1 bit
         * per every 64 interrupts. In order to let that function do accounting
         * close to the one in this function, we credit a full 64/64 bit per bit,
         * and then subtract one to account for the extra one added.
         */
        if (in_hardirq())
                this_cpu_ptr(&irq_randomness)->count += max(1u, bits * 64) - 1;
        else
                _credit_init_bits(bits);
}

void add_input_randomness(unsigned int type, unsigned int code, unsigned int value)
{
        static unsigned char last_value;
        static struct timer_rand_state input_timer_state = { INITIAL_JIFFIES };
        unsigned int event;

        /*
         * A value equal to the remembered one (stored as a single byte) is
         * treated as autorepeat or similar and dropped.
         */
        if (value == last_value)
                return;
        last_value = value;

        /* Fold type, code and value into one word describing the event. */
        event = (type << 4) ^ code ^ (code >> 4) ^ value;
        add_timer_randomness(&input_timer_state, event);
}
EXPORT_SYMBOL_GPL(add_input_randomness);

#ifdef CONFIG_BLOCK
void add_disk_randomness(struct gendisk *disk)
{
        /* No disk, or no timer state attached by rand_initialize_disk(). */
        if (!disk)
                return;
        if (!disk->random)
                return;

        /* First major is 1, so we get >= 0x200 here. */
        add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
}
EXPORT_SYMBOL_GPL(add_disk_randomness);

void __cold rand_initialize_disk(struct gendisk *disk)
{
        struct timer_rand_state *state = kzalloc(sizeof(*state), GFP_KERNEL);

        /*
         * Allocation failure is tolerated: disk->random is left untouched
         * and this disk is simply not used as an entropy source.
         */
        if (!state)
                return;

        state->last_time = INITIAL_JIFFIES;
        disk->random = state;
}
#endif

/* Working state shared between try_to_generate_entropy() and entropy_timer(). */
struct entropy_timer_state {
        /* Most recent random_get_entropy() sample taken by try_to_generate_entropy(). */
        unsigned long entropy;
        /* Timer whose callback is entropy_timer(). */
        struct timer_list timer;
        /* Number of times entropy_timer() has run. */
        atomic_t samples;
        /* entropy_timer() credits one bit each time samples reaches a multiple of this. */
        unsigned int samples_per_bit;
};

/*
 * Each time the timer fires, we expect that we got an unpredictable jump in
 * the cycle counter. Even if the timer is running on another CPU, the timer
 * activity will be touching the stack of the CPU that is generating entropy.
 *
 * Note that we don't re-arm the timer in the timer itself - we are happy to be
 * scheduled away, since that just makes the load more complex, but we do not
 * want the timer to keep ticking unless the entropy loop is running.
 *
 * So the re-arming always happens in the entropy loop itself.
 */
static void __cold entropy_timer(struct timer_list *timer)
{
        struct entropy_timer_state *state = container_of(timer, struct entropy_timer_state, timer);
        unsigned long cycles = random_get_entropy();
        int seen;

        /* Every firing contributes a fresh cycle-counter sample. */
        mix_pool_bytes(&cycles, sizeof(cycles));

        /* Credit a single bit once per samples_per_bit firings. */
        seen = atomic_inc_return(&state->samples);
        if (seen % state->samples_per_bit)
                return;
        credit_init_bits(1);
}

/*
 * If we have an actual cycle counter, see if we can generate enough entropy
 * with timing noise.
 */
static void __cold try_to_generate_entropy(void)
{
        enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
        /* Over-sized byte buffer so the state can be aligned to SMP_CACHE_BYTES inside it. */
        u8 stack_bytes[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1];
        struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES);
        unsigned int i, num_different = 0;
        unsigned long last = random_get_entropy();
        /* Last CPU chosen for the timer; -1 makes the first cpumask_next() start from the beginning. */
        int cpu = -1;

        /*
         * Trial run: count how often back-to-back random_get_entropy()
         * readings differ from the previous one.
         */
        for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) {
                stack->entropy = random_get_entropy();
                if (stack->entropy != last)
                        ++num_different;
                last = stack->entropy;
        }
        /*
         * The fewer readings differed, the more timer samples are required
         * per credited bit. Give up entirely if that exceeds the cap.
         */
        stack->samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1);
        if (stack->samples_per_bit > MAX_SAMPLES_PER_BIT)
                return;

        atomic_set(&stack->samples, 0);
        timer_setup_on_stack(&stack->timer, entropy_timer, 0);
        /* Loop until the crng becomes ready or the caller has a signal pending. */
        while (!crng_ready() && !signal_pending(current)) {
                /*
                 * Check !timer_pending() and then ensure that any previous callback has finished
                 * executing by checking timer_delete_sync_try(), before queueing the next one.
                 */
                if (!timer_pending(&stack->timer) && timer_delete_sync_try(&stack->timer) >= 0) {
                        struct cpumask timer_cpus;
                        unsigned int num_cpus;

                        /*
                         * Preemption must be disabled here, both to read the current CPU number
                         * and to avoid scheduling a timer on a dead CPU.
                         */
                        preempt_disable();

                        /* Only schedule callbacks on timer CPUs that are online. */
                        cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask);
                        num_cpus = cpumask_weight(&timer_cpus);
                        /* In very bizarre case of misconfiguration, fallback to all online. */
                        if (unlikely(num_cpus == 0)) {
                                timer_cpus = *cpu_online_mask;
                                num_cpus = cpumask_weight(&timer_cpus);
                        }

                        /* Basic CPU round-robin, which avoids the current CPU. */
                        do {
                                cpu = cpumask_next(cpu, &timer_cpus);
                                /* Ran off the end of the mask: wrap around to its first CPU. */
                                if (cpu >= nr_cpu_ids)
                                        cpu = cpumask_first(&timer_cpus);
                        } while (cpu == smp_processor_id() && num_cpus > 1);

                        /* Expiring the timer at `jiffies` means it's the next tick. */
                        stack->timer.expires = jiffies;

                        add_timer_on(&stack->timer, cpu);

                        preempt_enable();
                }
                /* Mix the latest sample, yield the CPU, then take a new sample. */
                mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));
                schedule();
                stack->entropy = random_get_entropy();
        }
        /* Mix in the final sample taken before the loop exited. */
        mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));

        /* The timer lives on this stack frame, so stop and destroy it before returning. */
        timer_delete_sync(&stack->timer);
        timer_destroy_on_stack(&stack->timer);
}


/**********************************************************************
 *
 * Userspace reader/writer interfaces.
 *
 * getrandom(2) is the primary modern interface into the RNG and should
 * be used in preference to anything else.
 *
 * Reading from /dev/random has the same functionality as calling
 * getrandom(2) with flags=0. In earlier versions, however, it had
 * vastly different semantics and should therefore be avoided, to
 * prevent backwards compatibility issues.
 *
 * Reading from /dev/urandom has the same functionality as calling
 * getrandom(2) with flags=GRND_INSECURE. Because it does not block
 * waiting for the RNG to be ready, it should not be used.
 *
 * Writing to either /dev/random or /dev/urandom adds entropy to
 * the input pool but does not credit it.
 *
 * Polling on /dev/random indicates when the RNG is initialized, on
 * the read side, and when it wants new entropy, on the write side.
 *
 * Both /dev/random and /dev/urandom have the same set of ioctls for
 * adding entropy, getting the entropy count, zeroing the count, and
 * reseeding the crng.
 *
 **********************************************************************/

SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags)
{
        const unsigned int known = GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE;
        const unsigned int exclusive = GRND_INSECURE | GRND_RANDOM;
        struct iov_iter iter;
        int ret;

        /* Reject any flag bit this implementation does not know about. */
        if (flags & ~known)
                return -EINVAL;

        /* GRND_INSECURE and GRND_RANDOM contradict each other. */
        if ((flags & exclusive) == exclusive)
                return -EINVAL;

        /*
         * Unless the caller opted into insecure output, the crng must be
         * ready first: fail with -EAGAIN for GRND_NONBLOCK, otherwise wait.
         */
        if (!crng_ready() && !(flags & GRND_INSECURE)) {
                if (flags & GRND_NONBLOCK)
                        return -EAGAIN;

                ret = wait_for_random_bytes();
                if (unlikely(ret))
                        return ret;
        }

        /* Wrap the user buffer in an iterator and fill it. */
        ret = import_ubuf(ITER_DEST, ubuf, len, &iter);
        if (unlikely(ret))
                return ret;

        return get_random_bytes_user(&iter);
}

static __poll_t random_poll(struct file *file, poll_table *wait)
{
        __poll_t mask;

        poll_wait(file, &crng_init_wait, wait);

        /* Readable once the crng is ready; reported as writable until then. */
        if (crng_ready())
                mask = EPOLLIN | EPOLLRDNORM;
        else
                mask = EPOLLOUT | EPOLLWRNORM;

        return mask;
}

static ssize_t write_pool_user(struct iov_iter *iter)
{
        u8 block[BLAKE2S_BLOCK_SIZE];
        ssize_t ret = 0;
        size_t copied;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        for (;;) {
                copied = copy_from_iter(block, sizeof(block), iter);
                ret += copied;
                mix_pool_bytes(block, copied);
                if (!iov_iter_count(iter) || copied != sizeof(block))
                        break;

                BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0);
                if (ret % PAGE_SIZE == 0) {
                        if (signal_pending(current))
                                break;
                        cond_resched();
                }
        }

        memzero_explicit(block, sizeof(block));
        return ret ? ret : -EFAULT;
}

/* write_iter handler shared by random_fops and urandom_fops; @kiocb is unused. */
static ssize_t random_write_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        ssize_t written = write_pool_user(iter);

        return written;
}

static ssize_t urandom_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        static int maxwarn = 10;

        /*
         * Opportunistically attempt to initialize the RNG on platforms that
         * have fast cycle counters, but don't (for now) require it to succeed.
         */
        if (!crng_ready())
                try_to_generate_entropy();

        if (!crng_ready()) {
                if (!ratelimit_disable && maxwarn <= 0)
                        ratelimit_state_inc_miss(&urandom_warning);
                else if (ratelimit_disable || __ratelimit(&urandom_warning)) {
                        --maxwarn;
                        pr_notice("%s: uninitialized urandom read (%zu bytes read)\n",
                                  current->comm, iov_iter_count(iter));
                }
        }

        return get_random_bytes_user(iter);
}

static ssize_t random_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        int ret;

        if (!crng_ready() &&
            ((kiocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) ||
             (kiocb->ki_filp->f_flags & O_NONBLOCK)))
                return -EAGAIN;

        ret = wait_for_random_bytes();
        if (ret != 0)
                return ret;
        return get_random_bytes_user(iter);
}

static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;
        int ent_count;

        switch (cmd) {
        case RNDGETENTCNT:
                /* Inherently racy, no point locking. */
                if (put_user(input_pool.init_bits, p))
                        return -EFAULT;
                return 0;
        case RNDADDTOENTCNT:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                credit_init_bits(ent_count);
                return 0;
        case RNDADDENTROPY: {
                struct iov_iter iter;
                ssize_t ret;
                int len;

                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p++))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                if (get_user(len, p++))
                        return -EFAULT;
                ret = import_ubuf(ITER_SOURCE, p, len, &iter);
                if (unlikely(ret))
                        return ret;
                ret = write_pool_user(&iter);
                if (unlikely(ret < 0))
                        return ret;
                /* Since we're crediting, enforce that it was all written into the pool. */
                if (unlikely(ret != len))
                        return -EFAULT;
                credit_init_bits(ent_count);
                return 0;
        }
        case RNDZAPENTCNT:
        case RNDCLEARPOOL:
                /* No longer has any effect. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return 0;
        case RNDRESEEDCRNG:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (!crng_ready())
                        return -ENODATA;
                crng_reseed(NULL);
                return 0;
        default:
                return -EINVAL;
        }
}

/* fasync handler shared by both device nodes; delegates to fasync_helper(). */
static int random_fasync(int fd, struct file *filp, int on)
{
        int ret = fasync_helper(fd, filp, on, &fasync);

        return ret;
}

/* File operations using random_read_iter() for reads; provides a poll handler. */
const struct file_operations random_fops = {
        .llseek         = noop_llseek,
        .read_iter      = random_read_iter,
        .write_iter     = random_write_iter,
        .splice_read    = copy_splice_read,
        .splice_write   = iter_file_splice_write,
        .poll           = random_poll,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .fasync         = random_fasync,
};

/* File operations using urandom_read_iter() for reads; no poll handler is set. */
const struct file_operations urandom_fops = {
        .llseek         = noop_llseek,
        .read_iter      = urandom_read_iter,
        .write_iter     = random_write_iter,
        .splice_read    = copy_splice_read,
        .splice_write   = iter_file_splice_write,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .fasync         = random_fasync,
};


/********************************************************************
 *
 * Sysctl interface.
 *
 * These are partly unused legacy knobs with dummy values to not break
 * userspace and partly still useful things. They are usually accessible
 * in /proc/sys/kernel/random/ and are as follows:
 *
 * - boot_id - a UUID representing the current boot.
 *
 * - uuid - a random UUID, different each time the file is read.
 *
 * - poolsize - the number of bits of entropy that the input pool can
 *   hold, tied to the POOL_BITS constant.
 *
 * - entropy_avail - the number of bits of entropy currently in the
 *   input pool. Always <= poolsize.
 *
 * - write_wakeup_threshold - the amount of entropy in the input pool
 *   below which write polls to /dev/random will unblock, requesting
 *   more entropy, tied to the POOL_READY_BITS constant. It is writable
 *   to avoid breaking old userspaces, but writing to it does not
 *   change any behavior of the RNG.
 *
 * - urandom_min_reseed_secs - fixed to the value CRNG_RESEED_INTERVAL.
 *   It is writable to avoid breaking old userspaces, but writing
 *   to it does not change any behavior of the RNG.
 *
 ********************************************************************/

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Backs "urandom_min_reseed_secs": CRNG_RESEED_INTERVAL divided by HZ. */
static int sysctl_random_min_urandom_seed = CRNG_RESEED_INTERVAL / HZ;
/* Backs "write_wakeup_threshold". */
static int sysctl_random_write_wakeup_bits = POOL_READY_BITS;
/* Backs "poolsize". */
static int sysctl_poolsize = POOL_BITS;
/* Storage for the "boot_id" UUID; proc_do_uuid() fills it in on demand. */
static u8 sysctl_bootid[UUID_SIZE];

/*
 * This function is used to return both the bootid UUID, and random
 * UUID. The difference is in whether table->data is NULL; if it is,
 * then a new UUID is generated and returned to the user.
 */
static int proc_do_uuid(const struct ctl_table *table, int write, void *buf,
                        size_t *lenp, loff_t *ppos)
{
        char text[UUID_STRING_LEN + 1];
        u8 fresh[UUID_SIZE];
        u8 *uuid;
        /* Stand-in table so proc_dostring() can emit the formatted text. */
        struct ctl_table fake_table = {
                .data = text,
                .maxlen = UUID_STRING_LEN
        };

        /* Both UUID files are read-only. */
        if (write)
                return -EPERM;

        uuid = table->data;
        if (uuid) {
                /*
                 * Persistent UUID (boot_id): generate it once, under a lock.
                 * A zero byte at index 8 is taken to mean "not generated
                 * yet" (presumably a generated UUID never has a zero there;
                 * verify against generate_random_uuid()).
                 */
                static DEFINE_SPINLOCK(bootid_spinlock);

                spin_lock(&bootid_spinlock);
                if (!uuid[8])
                        generate_random_uuid(uuid);
                spin_unlock(&bootid_spinlock);
        } else {
                /* No backing storage: hand out a brand new UUID on each read. */
                uuid = fresh;
                generate_random_uuid(uuid);
        }

        snprintf(text, sizeof(text), "%pU", uuid);
        return proc_dostring(&fake_table, 0, buf, lenp, ppos);
}

/* The same as proc_dointvec, but writes don't change anything. */
static int proc_do_rointvec(const struct ctl_table *table, int write, void *buf,
                            size_t *lenp, loff_t *ppos)
{
        /* Writes are accepted and silently discarded. */
        if (write)
                return 0;

        /* Reads behave exactly like proc_dointvec(). */
        return proc_dointvec(table, 0, buf, lenp, ppos);
}

/* Sysctl entries that random_sysctls_init() registers under "kernel/random". */
static const struct ctl_table random_table[] = {
        {
                /* Read-only integer backed by sysctl_poolsize. */
                .procname        = "poolsize",
                .data                = &sysctl_poolsize,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        {
                /* Read-only view of input_pool.init_bits. */
                .procname        = "entropy_avail",
                .data                = &input_pool.init_bits,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        {
                /* Mode allows writes, but proc_do_rointvec() discards them. */
                .procname        = "write_wakeup_threshold",
                .data                = &sysctl_random_write_wakeup_bits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        {
                /* Mode allows writes, but proc_do_rointvec() discards them. */
                .procname        = "urandom_min_reseed_secs",
                .data                = &sysctl_random_min_urandom_seed,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        {
                /* Non-NULL .data: proc_do_uuid() generates the UUID once and reuses it. */
                .procname        = "boot_id",
                .data                = &sysctl_bootid,
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
        {
                /* No .data: proc_do_uuid() generates a new UUID on every read. */
                .procname        = "uuid",
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
};

/*
 * random_init() is called before sysctl_init(),
 * so we cannot call register_sysctl_init() in random_init()
 */
/* Register random_table under "kernel/random"; run as a device_initcall. Always returns 0. */
static int __init random_sysctls_init(void)
{
        register_sysctl_init("kernel/random", random_table);
        return 0;
}
device_initcall(random_sysctls_init);
#endif





































































































































































































































































































































































































































































































































































































































































































































  316 


  319 
  319 



























  314 


  314 
  313 
  314 


  315 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *        Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *        Jay Schulist <jschlst@samba.org>
 *        Alexei Starovoitov <ast@plumgrid.com>
 *        Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <uapi/linux/btf.h>
#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>
#include <crypto/sha2.h>

#include <asm/barrier.h>
#include <linux/unaligned.h>

/* Registers
 *
 * Shorthand for fixed slots of a 'regs' array. These expand to plain
 * array accesses, so they are only usable where a variable named 'regs'
 * is in scope.
 */
#define BPF_R0        regs[BPF_REG_0]
#define BPF_R1        regs[BPF_REG_1]
#define BPF_R2        regs[BPF_REG_2]
#define BPF_R3        regs[BPF_REG_3]
#define BPF_R4        regs[BPF_REG_4]
#define BPF_R5        regs[BPF_REG_5]
#define BPF_R6        regs[BPF_REG_6]
#define BPF_R7        regs[BPF_REG_7]
#define BPF_R8        regs[BPF_REG_8]
#define BPF_R9        regs[BPF_REG_9]
#define BPF_R10        regs[BPF_REG_10]

/* Named registers
 *
 * DST and SRC index 'regs' by the dst_reg/src_reg fields of a variable
 * named 'insn'; OFF and IMM are that instruction's off and imm fields.
 * The remaining names are fixed 'regs' slots like the ones above.
 */
#define DST        regs[insn->dst_reg]
#define SRC        regs[insn->src_reg]
#define FP        regs[BPF_REG_FP]
#define AX        regs[BPF_REG_AX]
#define ARG1        regs[BPF_REG_ARG1]
#define CTX        regs[BPF_REG_CTX]
#define OFF        insn->off
#define IMM        insn->imm

/* Global bpf_mem_alloc instance and an accompanying flag, both with
 * external linkage.
 * NOTE(review): presumably bpf_global_ma_set becomes true once
 * bpf_global_ma has been initialised -- confirm at the init site.
 */
struct bpf_mem_alloc bpf_global_ma;
bool bpf_global_ma_set;

/* Translate a special (SKF_NET_OFF / SKF_LL_OFF based) load offset @k
 * into a pointer to @size bytes inside @skb. No hurry in this branch.
 *
 * Offsets at or above SKF_NET_OFF are taken relative to the network
 * header, offsets at or above SKF_LL_OFF relative to the mac header
 * (which must have been set). The resulting range has to lie between
 * skb->head and the skb tail pointer, otherwise NULL is returned.
 *
 * Exported for the bpf jit load helper.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
        u8 *ptr = NULL;

        if (k >= SKF_NET_OFF) {
                ptr = skb_network_header(skb) + k - SKF_NET_OFF;
        } else if (k >= SKF_LL_OFF) {
                if (unlikely(!skb_mac_header_was_set(skb)))
                        return NULL;
                ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
        }

        /* Hand out the pointer only if [ptr, ptr + size) is inside the skb. */
        return (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) ?
                ptr : NULL;
}

/* Tell bpf programs that include vmlinux.h the kernel's PAGE_SIZE by
 * carrying its value in an enum constant.
 */
enum page_size_enum {
        __PAGE_SIZE = PAGE_SIZE
};

/* Allocate a zeroed bpf_prog together with its aux structure and its
 * per-CPU 'active' counter, and perform the basic initialisation of both.
 * No per-CPU stats are allocated here.
 *
 * @size:            requested size in bytes, rounded up to whole pages
 * @gfp_extra_flags: flags OR-ed into GFP_KERNEL for each allocation
 *
 * Returns the new program, or NULL when any of the allocations fails; in
 * that case whatever had been allocated is released again.
 */
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
        gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog_aux *aux;
        struct bpf_prog *fp;

        size = round_up(size, __PAGE_SIZE);
        fp = __vmalloc(size, gfp_flags);
        if (!fp)
                return NULL;

        aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
        if (!aux)
                goto err_free_prog;

        fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
        if (!fp->active)
                goto err_free_aux;

        /* Link prog and aux to each other before anything looks at fp->aux. */
        fp->pages = size / PAGE_SIZE;
        fp->aux = aux;
        aux->main_prog_aux = aux;
        aux->prog = fp;
        fp->jit_requested = ebpf_jit_enabled();
        fp->blinding_requested = bpf_jit_blinding_enabled(fp);
#ifdef CONFIG_CGROUP_BPF
        aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
#endif

        INIT_LIST_HEAD_RCU(&aux->ksym.lnode);
#ifdef CONFIG_FINEIBT
        INIT_LIST_HEAD_RCU(&aux->ksym_prefix.lnode);
#endif
        mutex_init(&aux->used_maps_mutex);
        mutex_init(&aux->ext_mutex);
        mutex_init(&aux->dst_mutex);

#ifdef CONFIG_BPF_SYSCALL
        bpf_prog_stream_init(fp);
#endif

        return fp;

err_free_aux:
        kfree(aux);
err_free_prog:
        vfree(fp);
        return NULL;
}

/* Allocate a bpf_prog via bpf_prog_alloc_no_stats() and additionally give
 * it per-CPU statistics, with the u64_stats sync object of every possible
 * CPU initialised.
 *
 * Returns the new program, or NULL on allocation failure (nothing is
 * leaked: the partially built program is torn down again).
 */
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
        gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog *prog;
        int cpu;

        prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
        if (prog == NULL)
                return NULL;

        prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
        if (prog->stats) {
                for_each_possible_cpu(cpu) {
                        struct bpf_prog_stats *cpu_stats = per_cpu_ptr(prog->stats, cpu);

                        u64_stats_init(&cpu_stats->syncp);
                }
                return prog;
        }

        /* No stats: undo what bpf_prog_alloc_no_stats() allocated. */
        free_percpu(prog->active);
        kfree(prog->aux);
        vfree(prog);
        return NULL;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);

int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
{
        if (!prog->aux->nr_linfo || !prog->jit_requested)
                return 0;

        prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
                                          sizeof(*prog->aux->jited_linfo),
                                          bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
        if (!prog->aux->jited_linfo)
                return -ENOMEM;

        return 0;
}

void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
{
        if (prog->aux->jited_linfo &&
            (!prog->jited || !prog->aux->jited_linfo[0])) {
                kvfree(prog->aux->jited_linfo);
                prog->aux->jited_linfo = NULL;
        }

        kfree(prog->aux->kfunc_tab);
        prog->aux->kfunc_tab = NULL;
}

/* The jit engine is responsible to provide an array
 * for insn_off to the jited_off mapping (insn_to_jit_off).
 *
 * The idx to this array is the insn_off.  Hence, the insn_off
 * here is relative to the prog itself instead of the main prog.
 * This array has one entry for each xlated bpf insn.
 *
 * jited_off is the byte off to the end of the jited insn.
 *
 * Hence, with
 * insn_start:
 *      The first bpf insn off of the prog.  The insn off
 *      here is relative to the main prog.
 *      e.g. if prog is a subprog, insn_start > 0
 * linfo_idx:
 *      The prog's idx to prog->aux->linfo and jited_linfo
 *
 * jited_linfo[linfo_idx] = prog->bpf_func
 *
 * For i > linfo_idx,
 *
 * jited_linfo[i] = prog->bpf_func +
 *        insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
 */
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off)
{
        u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
        const struct bpf_line_info *linfo;
        void **jited_linfo;

        if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
                /* Userspace did not provide linfo */
                return;

        linfo_idx = prog->aux->linfo_idx;
        linfo = &prog->aux->linfo[linfo_idx];
        insn_start = linfo[0].insn_off;
        insn_end = insn_start + prog->len;

        jited_linfo = &prog->aux->jited_linfo[linfo_idx];
        jited_linfo[0] = prog->bpf_func;

        nr_linfo = prog->aux->nr_linfo - linfo_idx;

        for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
                /* The verifier ensures that linfo[i].insn_off is
                 * strictly increasing
                 */
                jited_linfo[i] = prog->bpf_func +
                        insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
}

/* Grow a bpf_prog so that it can hold @size bytes (rounded up to pages).
 *
 * If @fp_old already spans enough pages it is returned unchanged.
 * Otherwise a larger zeroed area is allocated, the old contents are
 * copied over, aux/stats/active move to the new prog and the old shell
 * is freed.
 *
 * Returns the prog to use from now on, or NULL if the allocation failed,
 * in which case @fp_old has not been touched.
 */
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags)
{
        gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog *fp_new;
        u32 nr_pages;

        size = round_up(size, PAGE_SIZE);
        nr_pages = size / PAGE_SIZE;

        /* The current allocation is already big enough. */
        if (fp_old->pages >= nr_pages)
                return fp_old;

        fp_new = __vmalloc(size, gfp_flags);
        if (!fp_new)
                return NULL;

        memcpy(fp_new, fp_old, fp_old->pages * PAGE_SIZE);
        fp_new->pages = nr_pages;
        fp_new->aux->prog = fp_new;

        /* aux, stats and active now belong to the reallocated
         * structure; detach them from the old one so that freeing
         * it leaves them alone.
         */
        fp_old->aux = NULL;
        fp_old->stats = NULL;
        fp_old->active = NULL;
        __bpf_prog_free(fp_old);

        return fp_new;
}

void __bpf_prog_free(struct bpf_prog *fp)
{
        if (fp->aux) {
                mutex_destroy(&fp->aux->used_maps_mutex);
                mutex_destroy(&fp->aux->dst_mutex);
                kfree(fp->aux->poke_tab);
                kfree(fp->aux);
        }
        free_percpu(fp->stats);
        free_percpu(fp->active);
        vfree(fp);
}

/* Compute the SHA-256 digest over the prog's instructions and store it
 * in fp->digest. The digest is taken over a scratch copy in which the
 * immediates of map loads have been zeroed.
 *
 * Returns 0 on success, -ENOMEM if the scratch copy cannot be allocated.
 */
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
        size_t size = bpf_prog_insn_size(fp);
        bool was_ld_map = false;
        struct bpf_insn *dst;
        u32 i;

        dst = vmalloc(size);
        if (!dst)
                return -ENOMEM;

        /* We need to take out the map fd for the digest calculation
         * since they are unstable from user space side.
         */
        for (i = 0; i < fp->len; i++) {
                struct bpf_insn *insn = &dst[i];

                *insn = fp->insnsi[i];

                if (was_ld_map) {
                        /* Slot following a map load: when it is the
                         * all-zero second half of the ld_imm64, its imm
                         * is part of the map reference as well.
                         */
                        if (insn->code == 0 &&
                            insn->dst_reg == 0 &&
                            insn->src_reg == 0 &&
                            insn->off == 0)
                                insn->imm = 0;
                        was_ld_map = false;
                } else if (insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
                           (insn->src_reg == BPF_PSEUDO_MAP_FD ||
                            insn->src_reg == BPF_PSEUDO_MAP_VALUE)) {
                        was_ld_map = true;
                        insn->imm = 0;
                }
        }

        sha256((u8 *)dst, size, fp->digest);
        vfree(dst);
        return 0;
}

/* Adjust the pc-relative target stored in insn->imm of the instruction
 * at index @curr for a patch that moves the end of the patched region
 * from @end_old to @end_new.
 *
 * An insn before @pos whose target is at or beyond @end_old gets the
 * delta added; an insn at or beyond @end_new whose target is before
 * @end_new gets it subtracted.
 *
 * Returns -ERANGE if the adjusted value does not fit into s32, else 0.
 * With @probe_pass set only the check is done and @insn is not written.
 */
static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
                                s32 end_new, s32 curr, const bool probe_pass)
{
        const s32 delta = end_new - end_old;
        s64 imm = insn->imm;
        /* Index the insn currently points at. */
        const s64 target = curr + imm + 1;

        if (curr < pos && target >= end_old)
                imm += delta;
        else if (curr >= end_new && target < end_new)
                imm -= delta;

        if (imm < S32_MIN || imm > S32_MAX)
                return -ERANGE;

        if (probe_pass)
                return 0;

        insn->imm = imm;
        return 0;
}

/* Adjust the jump offset of the instruction at index @curr for a patch
 * that moves the end of the patched region from @end_old to @end_new.
 *
 * For BPF_JMP32 | BPF_JA the offset lives in insn->imm and must fit
 * into s32; for every other insn it lives in insn->off and must fit
 * into s16.
 *
 * Returns -ERANGE if the adjusted offset is out of range, else 0. With
 * @probe_pass set only the check is done and @insn is not written.
 */
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
                                s32 end_new, s32 curr, const bool probe_pass)
{
        const bool is_ja32 = insn->code == (BPF_JMP32 | BPF_JA);
        const s64 off_min = is_ja32 ? S32_MIN : S16_MIN;
        const s64 off_max = is_ja32 ? S32_MAX : S16_MAX;
        const s32 delta = end_new - end_old;
        s64 off = is_ja32 ? insn->imm : insn->off;
        /* Index the insn currently jumps to. */
        const s64 target = curr + off + 1;

        if (curr < pos && target >= end_old)
                off += delta;
        else if (curr >= end_new && target < end_new)
                off -= delta;

        if (off < off_min || off > off_max)
                return -ERANGE;

        if (probe_pass)
                return 0;

        if (is_ja32)
                insn->imm = off;
        else
                insn->off = off;
        return 0;
}

/* Walk the instructions of @prog and adjust every pc-relative reference
 * that crosses the patched region, whose end moves from @end_old to
 * @end_new. Pseudo-func loads and BPF_PSEUDO_CALL calls carry the target
 * in imm, all other jumps in off (or imm for BPF_JMP32 | BPF_JA); exits
 * and non-pseudo calls are left alone.
 *
 * With @probe_pass set nothing is modified: the walk runs over the
 * original, unpatched image, only to find out whether any adjustment
 * would overflow.
 *
 * Returns 0 on success or the error (-ERANGE) of the first adjustment
 * that does not fit.
 */
static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
                            s32 end_new, const bool probe_pass)
{
        u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
        struct bpf_insn *insn = prog->insnsi;
        int ret = 0;

        for (i = 0; i < insn_cnt; i++, insn++) {
                u8 code;

                /* In the probing pass we still operate on the original,
                 * unpatched image in order to check overflows before we
                 * do any other adjustments. Therefore skip the patchlet.
                 */
                if (probe_pass && i == pos) {
                        i = end_new;
                        insn = prog->insnsi + end_old;
                        /* If the patched insn was the last one, the skip
                         * lands right behind the image; do not inspect
                         * an instruction that is not there.
                         */
                        if (i >= insn_cnt)
                                break;
                }
                if (bpf_pseudo_func(insn)) {
                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                        if (ret)
                                return ret;
                        continue;
                }
                code = insn->code;
                if ((BPF_CLASS(code) != BPF_JMP &&
                     BPF_CLASS(code) != BPF_JMP32) ||
                    BPF_OP(code) == BPF_EXIT)
                        continue;
                /* Adjust offset of jmps if we cross patch boundaries. */
                if (BPF_OP(code) == BPF_CALL) {
                        if (insn->src_reg != BPF_PSEUDO_CALL)
                                continue;
                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                } else {
                        ret = bpf_adj_delta_to_off(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                }
                if (ret)
                        break;
        }

        return ret;
}

/* Shift the insn_off of every line-info record that follows the first
 * record with insn_off greater than @off (that record included) by
 * @delta. Nothing happens when there are no records or @delta is zero.
 */
static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
{
        const u32 nr_linfo = prog->aux->nr_linfo;
        struct bpf_line_info *linfo;
        u32 idx = 0;

        if (!nr_linfo || !delta)
                return;

        linfo = prog->aux->linfo;

        /* Skip the leading records located at or before @off. */
        while (idx < nr_linfo && linfo[idx].insn_off <= off)
                idx++;

        /* Everything from here on sits behind the patch point. */
        while (idx < nr_linfo) {
                linfo[idx].insn_off += delta;
                idx++;
        }
}

/* Replace the single instruction at index @off of @prog with the @len
 * instructions found in @patch.
 *
 * Returns @prog itself when @len == 1 (the instruction is overwritten in
 * place), otherwise the program returned by bpf_prog_realloc() after
 * patching. On failure an ERR_PTR() is returned: the error from the
 * probing bpf_adj_branches() pass, or -ENOMEM if reallocation fails.
 */
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len)
{
        /* insn_delta: net growth of the program in instructions. */
        u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
        const u32 cnt_max = S16_MAX;
        struct bpf_prog *prog_adj;
        int err;

        /* Since our patchlet doesn't expand the image, we're done. */
        if (insn_delta == 0) {
                memcpy(prog->insnsi + off, patch, sizeof(*patch));
                return prog;
        }

        insn_adj_cnt = prog->len + insn_delta;

        /* Reject anything that would potentially let the insn->off
         * target overflow when we have excessive program expansions.
         * We need to probe here before we do any reallocation where
         * we afterwards may not fail anymore.
         */
        if (insn_adj_cnt > cnt_max &&
            (err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
                return ERR_PTR(err);

        /* Several new instructions need to be inserted. Make room
         * for them. Likely, there's no need for a new allocation as
         * last page could have large enough tailroom.
         */
        prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
                                    GFP_USER);
        if (!prog_adj)
                return ERR_PTR(-ENOMEM);

        prog_adj->len = insn_adj_cnt;

        /* Patching happens in 3 steps:
         *
         * 1) Move over tail of insnsi from next instruction onwards,
         *    so we can patch the single target insn with one or more
         *    new ones (patching is always from 1 to n insns, n > 0).
         * 2) Inject new instructions at the target location.
         * 3) Adjust branch offsets if necessary.
         */
        /* Number of instructions located after the patched one. */
        insn_rest = insn_adj_cnt - off - len;

        memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
                sizeof(*patch) * insn_rest);
        memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);

        /* We are guaranteed to not fail at this point, otherwise
         * the ship has sailed to reverse to the original state. An
         * overflow cannot happen at this point.
         */
        BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));

        /* Keep line info in sync with the shifted instructions. */
        bpf_adj_linfo(prog_adj, off, insn_delta);

        return prog_adj;
}

/* Delete @cnt instructions starting at index @off from @prog and fix up
 * the branch targets afterwards. Returns the result of the branch
 * adjustment (a non-zero value also triggers a one-time warning).
 */
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
{
        struct bpf_insn *hole = prog->insnsi + off;
        const u32 tail = prog->len - off - cnt;
        int ret;

        /* A shrinking program cannot overflow branch offsets, hence no
         * probing bpf_adj_branches(..., true) pass is needed here.
         */
        memmove(hole, hole + cnt, sizeof(struct bpf_insn) * tail);
        prog->len -= cnt;

        ret = bpf_adj_branches(prog, off, off + cnt, off, false);
        WARN_ON_ONCE(ret);

        return ret;
}

/* Drop the kallsyms entry of each program in fp->aux->func[], for the
 * first fp->aux->real_func_cnt slots.
 */
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
        int idx = 0;

        while (idx < fp->aux->real_func_cnt) {
                bpf_prog_kallsyms_del(fp->aux->func[idx]);
                idx++;
        }
}

/* Remove the kallsyms entries of all subprograms of @fp first, then the
 * entry of @fp itself.
 */
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
{
        bpf_prog_kallsyms_del_subprogs(fp);
        bpf_prog_kallsyms_del(fp);
}

#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here.
 *
 * bpf_jit_enable and bpf_jit_kallsyms start out as 1 when
 * CONFIG_BPF_JIT_DEFAULT_ON is built in and 0 otherwise; bpf_jit_harden
 * starts out as 0. bpf_jit_limit and bpf_jit_limit_max are initialized
 * at boot by bpf_jit_charge_init().
 */
int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden   __read_mostly;
long bpf_jit_limit   __read_mostly;
long bpf_jit_limit_max __read_mostly;

static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
        WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));

        prog->aux->ksym.start = (unsigned long) prog->bpf_func;
        prog->aux->ksym.end   = prog->aux->ksym.start + prog->jited_len;
}

static void
bpf_prog_ksym_set_name(struct bpf_prog *prog)
{
        char *sym = prog->aux->ksym.name;
        const char *end = sym + KSYM_NAME_LEN;
        const struct btf_type *type;
        const char *func_name;

        BUILD_BUG_ON(sizeof("bpf_prog_") +
                     sizeof(prog->tag) * 2 +
                     /* name has been null terminated.
                      * We should need +1 for the '_' preceding
                      * the name.  However, the null character
                      * is double counted between the name and the
                      * sizeof("bpf_prog_") above, so we omit
                      * the +1 here.
                      */
                     sizeof(prog->aux->name) > KSYM_NAME_LEN);

        sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
        sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));

        /* prog->aux->name will be ignored if full btf name is available */
        if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
                type = btf_type_by_id(prog->aux->btf,
                                      prog->aux->func_info[prog->aux->func_idx].type_id);
                func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
                snprintf(sym, (size_t)(end - sym), "_%s", func_name);
                return;
        }

        if (prog->aux->name[0])
                snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
        else
                *sym = 0;
}

static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
{
        return container_of(n, struct bpf_ksym, tnode)->start;
}

/* Ordering callback: node @a sorts before @b when its ksym starts at a
 * lower address.
 */
static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
                                          struct latch_tree_node *b)
{
        const unsigned long start_a = bpf_get_ksym_start(a);
        const unsigned long start_b = bpf_get_ksym_start(b);

        return start_a < start_b;
}

/* Lookup callback: compare the address passed in @key against the range
 * of the ksym embedding @n. Returns -1 below the range, 1 above it and 0
 * when the address falls inside [start, end].
 */
static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
{
        const struct bpf_ksym *ksym = container_of(n, struct bpf_ksym, tnode);
        const unsigned long addr = (unsigned long)key;

        if (addr < ksym->start)
                return -1;

        /* The upper bound is inclusive on purpose: when the final
         * instruction of a program is a call, the return address seen in
         * a stack trace equals ksym->end and must still be attributed to
         * the program.
         */
        return addr > ksym->end ? 1 : 0;
}

/* Ordering and lookup callbacks used with bpf_tree. */
static const struct latch_tree_ops bpf_tree_ops = {
        .less        = bpf_tree_less,
        .comp        = bpf_tree_comp,
};

/* Taken by bpf_ksym_add()/bpf_ksym_del() while updating the two
 * containers below.
 */
static DEFINE_SPINLOCK(bpf_lock);
/* List of registered ksyms; walked under RCU by bpf_get_kallsym(). */
static LIST_HEAD(bpf_kallsyms);
/* Tree of the same ksyms, searched by address in bpf_ksym_find(). */
static struct latch_tree_root bpf_tree __cacheline_aligned;

/* Register @ksym: append it to the bpf_kallsyms list and insert it into
 * bpf_tree, both while holding bpf_lock. Warns once if @ksym->lnode is
 * not an empty list head on entry.
 */
void bpf_ksym_add(struct bpf_ksym *ksym)
{
        spin_lock_bh(&bpf_lock);
        WARN_ON_ONCE(!list_empty(&ksym->lnode));
        list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
        latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
        spin_unlock_bh(&bpf_lock);
}

/* Unlocked removal of @ksym from bpf_tree and the bpf_kallsyms list.
 * A ksym whose lnode is an empty list head is left alone.
 */
static void __bpf_ksym_del(struct bpf_ksym *ksym)
{
        if (!list_empty(&ksym->lnode)) {
                latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
                list_del_rcu(&ksym->lnode);
        }
}

/* Remove @ksym via __bpf_ksym_del() while holding bpf_lock. */
void bpf_ksym_del(struct bpf_ksym *ksym)
{
        spin_lock_bh(&bpf_lock);
        __bpf_ksym_del(ksym);
        spin_unlock_bh(&bpf_lock);
}

static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
{
        return fp->jited && !bpf_prog_was_classic(fp);
}

void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
        if (!bpf_prog_kallsyms_candidate(fp) ||
            !bpf_token_capable(fp->aux->token, CAP_BPF))
                return;

        bpf_prog_ksym_set_addr(fp);
        bpf_prog_ksym_set_name(fp);
        fp->aux->ksym.prog = true;

        bpf_ksym_add(&fp->aux->ksym);

#ifdef CONFIG_FINEIBT
        /*
         * When FineIBT, code in the __cfi_foo() symbols can get executed
         * and hence unwinder needs help.
         */
        if (cfi_mode != CFI_FINEIBT)
                return;

        snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
                 "__cfi_%s", fp->aux->ksym.name);

        fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
        fp->aux->ksym_prefix.end   = (unsigned long) fp->bpf_func;

        bpf_ksym_add(&fp->aux->ksym_prefix);
#endif
}

/* Withdraw the kallsyms entry of @fp, plus the FineIBT prefix symbol
 * when CONFIG_FINEIBT is set and cfi_mode == CFI_FINEIBT. Programs that
 * are not kallsyms candidates are ignored.
 */
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
        if (!bpf_prog_kallsyms_candidate(fp))
                return;

        bpf_ksym_del(&fp->aux->ksym);
#ifdef CONFIG_FINEIBT
        if (cfi_mode == CFI_FINEIBT)
                bpf_ksym_del(&fp->aux->ksym_prefix);
#endif
}

/* Look up the ksym whose range contains @addr in bpf_tree; NULL if there
 * is none.
 */
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
{
        struct latch_tree_node *node;

        node = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
        if (!node)
                return NULL;

        return container_of(node, struct bpf_ksym, tnode);
}

/* Resolve @addr to a BPF symbol.
 *
 * When a ksym covers @addr its name is copied into @sym (at most
 * KSYM_NAME_LEN bytes), and, if the pointers are non-NULL, @size
 * receives end - start and @off receives the offset of @addr from the
 * symbol start.
 *
 * Returns the strscpy() result on a hit, 0 when no ksym matches.
 */
int __bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym)
{
        unsigned long sym_start, sym_end;
        struct bpf_ksym *ksym;
        int ret = 0;

        rcu_read_lock();

        ksym = bpf_ksym_find(addr);
        if (!ksym)
                goto unlock;

        sym_start = ksym->start;
        sym_end = ksym->end;

        ret = strscpy(sym, ksym->name, KSYM_NAME_LEN);

        if (size)
                *size = sym_end - sym_start;
        if (off)
                *off  = addr - sym_start;

unlock:
        rcu_read_unlock();

        return ret;
}

/* Tell whether @addr falls inside any registered BPF ksym. The lookup
 * runs inside an RCU read-side section.
 */
bool is_bpf_text_address(unsigned long addr)
{
        struct bpf_ksym *ksym;
        bool found;

        rcu_read_lock();
        ksym = bpf_ksym_find(addr);
        found = ksym != NULL;
        rcu_read_unlock();

        return found;
}

/* Map @addr to the BPF program owning it. Returns NULL when no ksym
 * covers @addr or the matching ksym is not flagged as a program.
 * Warns once if called outside an RCU read-side section.
 */
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
        struct bpf_ksym *ksym;

        WARN_ON_ONCE(!rcu_read_lock_held());

        ksym = bpf_ksym_find(addr);
        if (!ksym || !ksym->prog)
                return NULL;

        return container_of(ksym, struct bpf_prog_aux, ksym)->prog;
}

/* Find the exception table entry for @addr in the extable of the BPF
 * program containing it. Returns NULL if no program covers @addr, the
 * program has no exception entries, or search_extable() finds nothing.
 */
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
{
        const struct exception_table_entry *entry = NULL;
        struct bpf_prog *prog;

        rcu_read_lock();
        prog = bpf_prog_ksym_find(addr);
        if (prog && prog->aux->num_exentries)
                entry = search_extable(prog->aux->extable,
                                       prog->aux->num_exentries, addr);
        rcu_read_unlock();

        return entry;
}

/* Fetch the @symnum-th entry (0-based) of the bpf_kallsyms list.
 *
 * On a hit the name is copied into @sym, the start address is stored in
 * @value, @type is set to BPF_SYM_ELF_TYPE and 0 is returned. Returns
 * -ERANGE when JIT kallsyms are disabled or the list is shorter than
 * @symnum + 1 entries.
 */
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym)
{
        unsigned int idx = 0;
        struct bpf_ksym *ksym;
        int ret = -ERANGE;

        if (!bpf_jit_kallsyms_enabled())
                return ret;

        rcu_read_lock();
        list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
                if (idx++ == symnum) {
                        strscpy(sym, ksym->name, KSYM_NAME_LEN);

                        *value = ksym->start;
                        *type  = BPF_SYM_ELF_TYPE;

                        ret = 0;
                        break;
                }
        }
        rcu_read_unlock();

        return ret;
}

/* Append a copy of @poke to the poke table of @prog.
 *
 * Returns the slot index of the new entry, or a negative error:
 * -ENOSPC when the table already holds 1024 entries, -EINVAL when @poke
 * has any of the address/offset fields pre-set, has a reason other than
 * BPF_POKE_REASON_TAIL_CALL or lacks a tail-call map, and -ENOMEM when
 * growing the table fails.
 */
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke)
{
        static const u32 poke_tab_max = 1024;
        struct bpf_jit_poke_descriptor *new_tab;
        const u32 slot = prog->aux->size_poke_tab;
        const u32 size = slot + 1;

        if (size > poke_tab_max)
                return -ENOSPC;

        /* These fields must still be zero at registration time. */
        if (poke->tailcall_target || poke->tailcall_target_stable ||
            poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
                return -EINVAL;

        /* Tail calls are the only accepted poke reason. */
        if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
                return -EINVAL;
        if (!poke->tail_call.map)
                return -EINVAL;

        new_tab = krealloc_array(prog->aux->poke_tab, size, sizeof(*poke),
                                 GFP_KERNEL);
        if (!new_tab)
                return -ENOMEM;

        memcpy(&new_tab[slot], poke, sizeof(*poke));
        prog->aux->size_poke_tab = size;
        prog->aux->poke_tab = new_tab;

        return slot;
}

/*
 * BPF program pack allocator.
 *
 * Most BPF programs are pretty small. Allocating a whole page for each
 * program is sometimes a waste. Many small BPF programs also add pressure
 * to the instruction TLB. To solve this issue, we introduce a BPF program
 * pack allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on
 * x86) to host BPF programs.
 */
/* Allocation granularity inside a pack: chunks of 1 << 6 = 64 bytes. */
#define BPF_PROG_CHUNK_SHIFT        6
#define BPF_PROG_CHUNK_SIZE        (1 << BPF_PROG_CHUNK_SHIFT)
/* AND-ing an address with this mask rounds it down to a chunk boundary. */
#define BPF_PROG_CHUNK_MASK        (~(BPF_PROG_CHUNK_SIZE - 1))

/* One pack: a BPF_PROG_PACK_SIZE region of executable memory that is
 * handed out in BPF_PROG_CHUNK_SIZE units.
 */
struct bpf_prog_pack {
        /* Link in pack_list. */
        struct list_head list;
        /* Start of the region obtained from bpf_jit_alloc_exec(). */
        void *ptr;
        /* Allocation bitmap; a set bit marks a chunk that is in use. */
        unsigned long bitmap[];
};

/* Hole-filling callback that simply zeroes @size bytes starting at @area. */
void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
{
        memset(area, 0, size);
}

/* Number of chunks (bitmap bits) needed to hold @size bytes, rounded up. */
#define BPF_PROG_SIZE_TO_NBITS(size)        (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)

/* pack_mutex is held by bpf_prog_pack_alloc() and bpf_prog_pack_free()
 * while pack_list and the per-pack bitmaps are accessed.
 */
static DEFINE_MUTEX(pack_mutex);
static LIST_HEAD(pack_list);

/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
 * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
 */
#ifdef PMD_SIZE
/* PMD_SIZE is really big for some archs. It doesn't make sense to
 * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
 * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
 * greater than or equal to 2MB.
 */
#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
#else
#define BPF_PROG_PACK_SIZE PAGE_SIZE
#endif

/* Number of chunks, and therefore bitmap bits, per pack. */
#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)

/* Create a fresh, empty pack and append it to pack_list.
 *
 * The backing region is filled through @bpf_fill_ill_insns before it is
 * switched to ROX. Returns the new pack, or NULL if any allocation or
 * the permission change fails (everything acquired so far is released).
 */
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_prog_pack *pack;

        pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
                       GFP_KERNEL);
        if (!pack)
                return NULL;

        pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
        if (!pack->ptr)
                goto err_free;

        bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
        bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);

        set_vm_flush_reset_perms(pack->ptr);
        if (set_memory_rox((unsigned long)pack->ptr,
                           BPF_PROG_PACK_SIZE / PAGE_SIZE))
                goto err_free;

        list_add_tail(&pack->list, &pack_list);
        return pack;

err_free:
        /* pack->ptr is NULL here when the exec allocation itself failed. */
        bpf_jit_free_exec(pack->ptr);
        kfree(pack);
        return NULL;
}

/* Allocate @size bytes of ROX memory for a JITed program.
 *
 * Requests larger than BPF_PROG_PACK_SIZE get a dedicated, page-rounded
 * allocation that is filled via @bpf_fill_ill_insns and switched to ROX.
 * Anything else is carved out of the first pack on pack_list that has
 * enough contiguous free chunks; a new pack is created when none fits.
 *
 * Runs under pack_mutex. Returns the allocated address or NULL on failure.
 */
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
        struct bpf_prog_pack *pack;
        unsigned long pos;
        void *ptr = NULL;

        mutex_lock(&pack_mutex);
        /* Too big for a pack: fall back to a standalone allocation. */
        if (size > BPF_PROG_PACK_SIZE) {
                size = round_up(size, PAGE_SIZE);
                ptr = bpf_jit_alloc_exec(size);
                if (ptr) {
                        int err;

                        bpf_fill_ill_insns(ptr, size);
                        set_vm_flush_reset_perms(ptr);
                        err = set_memory_rox((unsigned long)ptr,
                                             size / PAGE_SIZE);
                        if (err) {
                                bpf_jit_free_exec(ptr);
                                ptr = NULL;
                        }
                }
                goto out;
        }
        /* First fit over the existing packs. */
        list_for_each_entry(pack, &pack_list, list) {
                pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
                                                 nbits, 0);
                if (pos < BPF_PROG_CHUNK_COUNT)
                        goto found_free_area;
        }

        pack = alloc_new_pack(bpf_fill_ill_insns);
        if (!pack)
                goto out;

        /* A brand-new pack is empty, so start at its first chunk. */
        pos = 0;

found_free_area:
        bitmap_set(pack->bitmap, pos, nbits);
        ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);

out:
        mutex_unlock(&pack_mutex);
        return ptr;
}

/* Release @size bytes at @ptr previously returned by
 * bpf_prog_pack_alloc().
 *
 * Sizes above BPF_PROG_PACK_SIZE are treated as standalone allocations
 * and freed directly. Otherwise the owning pack is looked up by address,
 * the text is invalidated, the chunks are marked free, and the pack
 * itself is released once all of its chunks are free.
 *
 * Runs under pack_mutex.
 */
void bpf_prog_pack_free(void *ptr, u32 size)
{
        struct bpf_prog_pack *pack = NULL, *tmp;
        unsigned int nbits;
        unsigned long pos;

        mutex_lock(&pack_mutex);
        if (size > BPF_PROG_PACK_SIZE) {
                bpf_jit_free_exec(ptr);
                goto out;
        }

        /* Find the pack whose region contains @ptr. */
        list_for_each_entry(tmp, &pack_list, list) {
                if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
                        pack = tmp;
                        break;
                }
        }

        if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
                goto out;

        nbits = BPF_PROG_SIZE_TO_NBITS(size);
        pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;

        WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
                  "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");

        bitmap_clear(pack->bitmap, pos, nbits);
        /* A zero area spanning the whole bitmap means the pack is unused. */
        if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
                                       BPF_PROG_CHUNK_COUNT, 0) == 0) {
                list_del(&pack->list);
                bpf_jit_free_exec(pack->ptr);
                kfree(pack);
        }
out:
        mutex_unlock(&pack_mutex);
}

/* Running total of bytes charged through bpf_jit_charge_modmem() and not
 * yet returned through bpf_jit_uncharge_modmem().
 */
static atomic_long_t bpf_jit_current;

/* Can be overridden by an arch's JIT compiler if it has a custom,
 * dedicated BPF backend memory area, or if neither of the two
 * below apply.
 *
 * The default returns the size of the MODULES_VADDR..MODULES_END range
 * when MODULES_VADDR is defined, else the size of the vmalloc range.
 */
u64 __weak bpf_jit_alloc_exec_limit(void)
{
#if defined(MODULES_VADDR)
        return MODULES_END - MODULES_VADDR;
#else
        return VMALLOC_END - VMALLOC_START;
#endif
}

/* Boot-time setup of the JIT memory limits: bpf_jit_limit_max is the
 * value reported by bpf_jit_alloc_exec_limit(), bpf_jit_limit is half of
 * that rounded up to a page and capped at LONG_MAX.
 */
static int __init bpf_jit_charge_init(void)
{
        u64 half;

        /* Only used as heuristic here to derive limit. */
        bpf_jit_limit_max = bpf_jit_alloc_exec_limit();

        half = round_up(bpf_jit_limit_max >> 1, PAGE_SIZE);
        bpf_jit_limit = min_t(u64, half, LONG_MAX);

        return 0;
}
pure_initcall(bpf_jit_charge_init);

/* Account @size bytes of JIT memory against bpf_jit_limit.
 *
 * Returns 0 when the charge is accepted. When the new total exceeds the
 * limit, the charge is kept only for bpf_capable() callers; everyone
 * else has it rolled back and gets -EPERM.
 */
int bpf_jit_charge_modmem(u32 size)
{
        if (atomic_long_add_return(size, &bpf_jit_current) <= READ_ONCE(bpf_jit_limit))
                return 0;

        if (bpf_capable())
                return 0;

        atomic_long_sub(size, &bpf_jit_current);
        return -EPERM;
}

/* Give back @size bytes previously accounted in bpf_jit_current. */
void bpf_jit_uncharge_modmem(u32 size)
{
        atomic_long_sub(size, &bpf_jit_current);
}

/* Default (weak) allocator for JIT memory: @size bytes from
 * execmem_alloc() with the EXECMEM_BPF type.
 */
void *__weak bpf_jit_alloc_exec(unsigned long size)
{
        return execmem_alloc(EXECMEM_BPF, size);
}

/* Default (weak) counterpart of bpf_jit_alloc_exec(): hands @addr to
 * execmem_free().
 */
void __weak bpf_jit_free_exec(void *addr)
{
        execmem_free(addr);
}

/* Allocate a page-granular JIT image able to hold @proglen bytes of code
 * plus the binary header.
 *
 * @proglen:            size of the JITed program in bytes
 * @image_ptr:          receives the address inside the allocation where
 *                      the program should be placed
 * @alignment:          power of two, at most BPF_IMAGE_ALIGNMENT; the
 *                      random start offset is rounded down to a multiple
 *                      of it
 * @bpf_fill_ill_insns: callback used to fill the whole allocation
 *
 * The allocation is charged via bpf_jit_charge_modmem(). Returns the
 * header, or NULL when the charge is refused or the allocation fails.
 */
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_binary_header *hdr;
        u32 size, hole, start;

        WARN_ON_ONCE(!is_power_of_2(alignment) ||
                     alignment > BPF_IMAGE_ALIGNMENT);

        /* Most of BPF filters are really small, but if some of them
         * fill a page, allow at least 128 extra bytes to insert a
         * random section of illegal instructions.
         */
        size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);

        if (bpf_jit_charge_modmem(size))
                return NULL;
        hdr = bpf_jit_alloc_exec(size);
        if (!hdr) {
                bpf_jit_uncharge_modmem(size);
                return NULL;
        }

        /* Fill space with illegal/arch-dep instructions. */
        bpf_fill_ill_insns(hdr, size);

        hdr->size = size;
        /* Slack left after header and program, capped to what remains of
         * one page after the header.
         */
        hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
                     PAGE_SIZE - sizeof(*hdr));
        start = get_random_u32_below(hole) & ~(alignment - 1);

        /* Leave a random number of instructions before BPF code. */
        *image_ptr = &hdr->image[start];

        return hdr;
}

/* Free an image obtained from bpf_jit_binary_alloc() and return its
 * charge.
 */
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
        /* Read the size first: @hdr lives inside the memory being freed. */
        u32 size = hdr->size;

        bpf_jit_free_exec(hdr);
        bpf_jit_uncharge_modmem(size);
}

/* Allocate jit binary from bpf_prog_pack allocator.
 * Since the allocated memory is RO+X, the JIT engine cannot write directly
 * to the memory. To solve this problem, a RW buffer is also allocated at
 * the same time. The JIT engine should calculate offsets based on the
 * RO memory address, but write JITed program to the RW buffer. Once the
 * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
 * the JITed program to the RO memory.
 *
 * @proglen:            size of the JITed program in bytes
 * @image_ptr:          receives the program address inside the RO memory
 * @alignment:          power of two, at most BPF_IMAGE_ALIGNMENT
 * @rw_header:          receives the kvmalloc()-ed RW staging header
 * @rw_image:           receives the program address inside the RW buffer,
 *                      at the same offset as @image_ptr in the RO memory
 * @bpf_fill_ill_insns: callback used to fill the allocations
 *
 * Returns the RO header, or NULL when charging or either allocation
 * fails. Only the RW header has its size field set here.
 */
struct bpf_binary_header *
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
                          unsigned int alignment,
                          struct bpf_binary_header **rw_header,
                          u8 **rw_image,
                          bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_binary_header *ro_header;
        u32 size, hole, start;

        WARN_ON_ONCE(!is_power_of_2(alignment) ||
                     alignment > BPF_IMAGE_ALIGNMENT);

        /* add 16 bytes for a random section of illegal instructions */
        size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);

        if (bpf_jit_charge_modmem(size))
                return NULL;
        ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
        if (!ro_header) {
                bpf_jit_uncharge_modmem(size);
                return NULL;
        }

        *rw_header = kvmalloc(size, GFP_KERNEL);
        if (!*rw_header) {
                bpf_prog_pack_free(ro_header, size);
                bpf_jit_uncharge_modmem(size);
                return NULL;
        }

        /* Fill space with illegal/arch-dep instructions. */
        bpf_fill_ill_insns(*rw_header, size);
        (*rw_header)->size = size;

        /* Slack after header and program, capped to what remains of one
         * chunk after the header.
         */
        hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
                     BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
        start = get_random_u32_below(hole) & ~(alignment - 1);

        *image_ptr = &ro_header->image[start];
        *rw_image = &(*rw_header)->image[start];

        return ro_header;
}

/* Copy JITed text from rw_header to its final location, the ro_header. */
int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
                                 struct bpf_binary_header *rw_header)
{
        void *ptr;

        ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);

        kvfree(rw_header);

        if (IS_ERR(ptr)) {
                bpf_prog_pack_free(ro_header, ro_header->size);
                return PTR_ERR(ptr);
        }
        return 0;
}

/* bpf_jit_binary_pack_free is called in two different scenarios:
 *   1) when the program is freed afterwards;
 *   2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
 * For case 2), we need to free both the RO memory and the RW buffer.
 *
 * bpf_jit_binary_pack_free requires proper ro_header->size. However,
 * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
 * must be set with either bpf_jit_binary_pack_finalize (normal path) or
 * bpf_arch_text_copy (when jit fails).
 */
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
                              struct bpf_binary_header *rw_header)
{
        /* Read before the RO memory is handed back to the allocator. */
        u32 size = ro_header->size;

        bpf_prog_pack_free(ro_header, size);
        kvfree(rw_header);
        bpf_jit_uncharge_modmem(size);
}

/* Derive the binary header of a pack-allocated program by rounding
 * fp->bpf_func down to a BPF_PROG_CHUNK_SIZE boundary.
 */
struct bpf_binary_header *
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
{
        unsigned long func_addr = (unsigned long)fp->bpf_func;

        return (void *)(func_addr & BPF_PROG_CHUNK_MASK);
}

/* Derive the binary header of a page-allocated program by rounding
 * fp->bpf_func down to a page boundary.
 */
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
        unsigned long func_addr = (unsigned long)fp->bpf_func;

        return (void *)(func_addr & PAGE_MASK);
}

/* Default (weak) release of a program and, if it was JITed, of its
 * image. Architectures whose JIT has different needs than the usual
 * eBPF JITs (e.g. cBPF-only JITs, or images that are not set read-only)
 * provide their own version.
 */
void __weak bpf_jit_free(struct bpf_prog *fp)
{
        if (fp->jited) {
                bpf_jit_binary_free(bpf_jit_binary_hdr(fp));
                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
        }

        bpf_prog_unlock_free(fp);
}

/* Resolve the target address of call instruction @insn.
 *
 * @prog:            program containing @insn
 * @insn:            the call instruction
 * @extra_pass:      whether subprogram addresses can be read from prog->aux
 * @func_addr:       receives the resolved address
 * @func_addr_fixed: set to false for BPF_PSEUDO_CALL, true otherwise
 *
 * Returns 0 on success, -EINVAL for a BPF_PSEUDO_CALL in the extra pass
 * whose insn->off does not index prog->aux->func[], or the error from
 * bpf_get_kfunc_addr().
 */
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed)
{
        const bool subprog_call = insn->src_reg == BPF_PSEUDO_CALL;
        s16 off = insn->off;
        s32 imm = insn->imm;
        u8 *addr;
        int err;

        *func_addr_fixed = !subprog_call;

        if (subprog_call) {
                if (!extra_pass) {
                        /* Place-holder address until the last pass has
                         * collected the addresses of all JITed
                         * subprograms in prog->aux.
                         */
                        addr = NULL;
                } else {
                        if (!prog->aux->func ||
                            off < 0 || off >= prog->aux->real_func_cnt)
                                return -EINVAL;
                        addr = (u8 *)prog->aux->func[off]->bpf_func;
                }
        } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
                   bpf_jit_supports_far_kfunc_call()) {
                err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
                if (err)
                        return err;
        } else {
                /* BPF helper call: helpers are part of the core kernel
                 * and thus at a fixed location, expressed as imm relative
                 * to __bpf_call_base.
                 */
                addr = (u8 *)__bpf_call_base + imm;
        }

        *func_addr = (unsigned long)addr;
        return 0;
}

/* Return the ksym name of @prog when its ksym is flagged as a program,
 * else fall back to prog->aux->name.
 */
const char *bpf_jit_get_prog_name(struct bpf_prog *prog)
{
        return prog->aux->ksym.prog ? prog->aux->ksym.name
                                    : prog->aux->name;
}

/* Rewrite one instruction that carries an immediate into an equivalent
 * sequence in which the immediate is XOR-ed with a fresh random value
 * and recovered at run time through BPF_REG_AX.
 *
 * @from:      instruction to rewrite
 * @aux:       pair of instructions consulted for BPF_LD | BPF_IMM | BPF_DW:
 *             aux[1].imm is used for the first half, aux[0].imm for the
 *             second half (code == 0); aux[0].dst_reg is the destination
 * @to_buff:   output buffer receiving the replacement instructions
 * @emit_zext: when set, BPF_ZEXT_REG(BPF_REG_AX) is emitted in the second
 *             half of the ld_imm64 rewrite
 *
 * Returns the number of instructions written to @to_buff. 0 means @from
 * is to be left as is: it references BPF_REG_AX or its opcode is not one
 * of the handled cases.
 */
static int bpf_jit_blind_insn(const struct bpf_insn *from,
                              const struct bpf_insn *aux,
                              struct bpf_insn *to_buff,
                              bool emit_zext)
{
        struct bpf_insn *to = to_buff;
        u32 imm_rnd = get_random_u32();
        s16 off;

        BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
        BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);

        /* Constraints on AX register:
         *
         * AX register is inaccessible from user space. It is mapped in
         * all JITs, and used here for constant blinding rewrites. It is
         * typically "stateless" meaning its contents are only valid within
         * the executed instruction, but not across several instructions.
         * There are a few exceptions however which are further detailed
         * below.
         *
         * Constant blinding is only used by JITs, not in the interpreter.
         * The interpreter uses AX in some occasions as a local temporary
         * register e.g. in DIV or MOD instructions.
         *
         * In restricted circumstances, the verifier can also use the AX
         * register for rewrites as long as they do not interfere with
         * the above cases!
         */
        if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
                goto out;

        /* "mov dst, 0" needs no constant at all: emit "xor dst, dst". */
        if (from->imm == 0 &&
            (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
             from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
                *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
                goto out;
        }

        switch (from->code) {
        /* 32-bit ALU with immediate: load blinded imm into AX, unblind,
         * then perform the operation with AX as source register.
         */
        case BPF_ALU | BPF_ADD | BPF_K:
        case BPF_ALU | BPF_SUB | BPF_K:
        case BPF_ALU | BPF_AND | BPF_K:
        case BPF_ALU | BPF_OR  | BPF_K:
        case BPF_ALU | BPF_XOR | BPF_K:
        case BPF_ALU | BPF_MUL | BPF_K:
        case BPF_ALU | BPF_MOV | BPF_K:
        case BPF_ALU | BPF_DIV | BPF_K:
        case BPF_ALU | BPF_MOD | BPF_K:
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;

        /* Same scheme for 64-bit ALU. */
        case BPF_ALU64 | BPF_ADD | BPF_K:
        case BPF_ALU64 | BPF_SUB | BPF_K:
        case BPF_ALU64 | BPF_AND | BPF_K:
        case BPF_ALU64 | BPF_OR  | BPF_K:
        case BPF_ALU64 | BPF_XOR | BPF_K:
        case BPF_ALU64 | BPF_MUL | BPF_K:
        case BPF_ALU64 | BPF_MOV | BPF_K:
        case BPF_ALU64 | BPF_DIV | BPF_K:
        case BPF_ALU64 | BPF_MOD | BPF_K:
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;

        case BPF_JMP | BPF_JEQ  | BPF_K:
        case BPF_JMP | BPF_JNE  | BPF_K:
        case BPF_JMP | BPF_JGT  | BPF_K:
        case BPF_JMP | BPF_JLT  | BPF_K:
        case BPF_JMP | BPF_JGE  | BPF_K:
        case BPF_JMP | BPF_JLE  | BPF_K:
        case BPF_JMP | BPF_JSGT | BPF_K:
        case BPF_JMP | BPF_JSLT | BPF_K:
        case BPF_JMP | BPF_JSGE | BPF_K:
        case BPF_JMP | BPF_JSLE | BPF_K:
        case BPF_JMP | BPF_JSET | BPF_K:
                /* Accommodate for extra offset in case of a backjump. */
                off = from->off;
                if (off < 0)
                        off -= 2;
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
                break;

        case BPF_JMP32 | BPF_JEQ  | BPF_K:
        case BPF_JMP32 | BPF_JNE  | BPF_K:
        case BPF_JMP32 | BPF_JGT  | BPF_K:
        case BPF_JMP32 | BPF_JLT  | BPF_K:
        case BPF_JMP32 | BPF_JGE  | BPF_K:
        case BPF_JMP32 | BPF_JLE  | BPF_K:
        case BPF_JMP32 | BPF_JSGT | BPF_K:
        case BPF_JMP32 | BPF_JSLT | BPF_K:
        case BPF_JMP32 | BPF_JSGE | BPF_K:
        case BPF_JMP32 | BPF_JSLE | BPF_K:
        case BPF_JMP32 | BPF_JSET | BPF_K:
                /* Accommodate for extra offset in case of a backjump. */
                off = from->off;
                if (off < 0)
                        off -= 2;
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
                                      off);
                break;

        /* First half of ld_imm64: build the upper 32 bits (aux[1].imm) in
         * AX, shift them into place and move the result to the
         * destination register.
         */
        case BPF_LD | BPF_IMM | BPF_DW:
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
                *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
                break;
        case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                if (emit_zext)
                        *to++ = BPF_ZEXT_REG(BPF_REG_AX);
                *to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
                break;

        /* Store of an immediate: unblind into AX, then store AX instead. */
        case BPF_ST | BPF_MEM | BPF_DW:
        case BPF_ST | BPF_MEM | BPF_W:
        case BPF_ST | BPF_MEM | BPF_H:
        case BPF_ST | BPF_MEM | BPF_B:
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;
        }
out:
        return to - to_buff;
}

/* Allocate a byte-for-byte duplicate of @fp_other.
 *
 * The copy spans all fp_other->pages pages and is allocated with
 * GFP_KERNEL | __GFP_ZERO plus @gfp_extra_flags.
 *
 * Return: the duplicate, or NULL if the allocation failed.
 */
static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
                                              gfp_t gfp_extra_flags)
{
        unsigned long nbytes = fp_other->pages * PAGE_SIZE;
        struct bpf_prog *copy;

        copy = __vmalloc(nbytes, GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        if (!copy)
                return NULL;

        /* Being a plain copy, aux->prog keeps pointing at fp_other;
         * whoever promotes the clone to the real program still has
         * to adapt it.
         */
        memcpy(copy, fp_other, nbytes);

        return copy;
}

/* Free a program image whose aux data now belongs to its twin. */
static void bpf_prog_clone_free(struct bpf_prog *fp)
{
        /* The twin program took over aux, so it must not be freed
         * through this path; the twin releases it eventually when
         * it goes away itself. stats and active are detached too
         * before the image is handed to __bpf_prog_free() --
         * presumably they are shared with the twin in the same way.
         *
         * No deferred release is required at this point, since a
         * clone is guaranteed to not be locked.
         */
        fp->active = NULL;
        fp->stats = NULL;
        fp->aux = NULL;

        __bpf_prog_free(fp);
}

/* Keep @fp as the surviving program and dispose of @fp_other.
 *
 * @fp:       program that stays in use; its aux data is made to point
 *            back at it.
 * @fp_other: the other image of the original/clone pair; it is freed via
 *            bpf_prog_clone_free(), which leaves the aux data alone.
 */
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
{
        /* We have to repoint aux->prog to self, as we don't
         * know whether fp here is the clone or the original.
         */
        fp->aux->prog = fp;
        bpf_prog_clone_free(fp_other);
}

/* Build a copy of @prog in which instructions are rewritten by
 * bpf_jit_blind_insn().
 *
 * Return: @prog itself when blinding was not requested or has already
 * been applied; the blinded clone on success; ERR_PTR(-ENOMEM) if the
 * clone could not be allocated; or the error pointer handed back by
 * bpf_patch_insn_single(), in which case the clone has been released.
 */
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
{
        struct bpf_insn patch[16], aux[2];
        struct bpf_prog *clone, *patched;
        int insn_cnt, nr_new, grown, i;

        if (prog->blinded || !prog->blinding_requested)
                return prog;

        clone = bpf_prog_clone_create(prog, GFP_USER);
        if (!clone)
                return ERR_PTR(-ENOMEM);

        insn_cnt = clone->len;

        for (i = 0; i < insn_cnt; i++) {
                /* Look the instruction up by index on every round, as
                 * patching further down may replace the clone.
                 */
                struct bpf_insn *insn = &clone->insnsi[i];

                if (bpf_pseudo_func(insn)) {
                        /* An ld_imm64 carrying the address of a bpf
                         * subprog is not a user controlled constant.
                         * Leave both of its halves untouched, since
                         * randomizing it would conflict with the
                         * jit_subprogs() logic.
                         */
                        i++;
                        continue;
                }

                /* Stash the original ld64 pair for the time being, so
                 * the first half is still available when the second
                 * half gets blinded on the next round.
                 */
                if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
                    insn[1].code == 0)
                        memcpy(aux, insn, sizeof(aux));

                nr_new = bpf_jit_blind_insn(insn, aux, patch,
                                            clone->aux->verifier_zext);
                if (!nr_new)
                        continue;

                patched = bpf_patch_insn_single(clone, i, patch, nr_new);
                if (IS_ERR(patched)) {
                        /* During realloc, patching may have repointed
                         * aux->prog away from the original program,
                         * so fix that up on the error path.
                         */
                        bpf_jit_prog_release_other(prog, clone);
                        return patched;
                }
                clone = patched;

                /* One insn was replaced by nr_new of them: grow the
                 * count and step over the ones just inserted.
                 */
                grown = nr_new - 1;
                insn_cnt += grown;
                i += grown;
        }

        clone->blinded = 1;
        return clone;
}
#endif /* CONFIG_BPF_JIT */

/* Base function for offset calculation. Needs to go into .text section,
 * therefore keeping it non-static as well; will also be used by JITs
 * anyway later on, so do not let the compiler omit it. This also needs
 * to go into kallsyms for correlation from e.g. bpftool, so naming
 * must not change.
 *
 * The function does no work itself: it ignores r1-r5 and returns 0.
 * The interpreter's JMP_CALL handler adds insn->imm to this function's
 * address and calls the result with BPF_R1-BPF_R5, so this symbol is the
 * anchor that those immediates are relative to.
 */
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
        return 0;
}
EXPORT_SYMBOL_GPL(__bpf_call_base);

/* All UAPI available opcodes.
 *
 * X-macro list: expands to a comma-separated sequence of
 * INSN_2(class, op) and INSN_3(class, op, modifier) invocations, one per
 * opcode. Users pass in their own two- and three-argument macros to turn
 * each entry into something useful, e.g. a designated array initializer
 * indexed by the opcode. The arguments are the names of the BPF_*
 * constants with the "BPF_" prefix left off, and the last entry carries
 * no trailing comma.
 */
#define BPF_INSN_MAP(INSN_2, INSN_3)                \
        /* 32 bit ALU operations. */                \
        /*   Register based. */                        \
        INSN_3(ALU, ADD,  X),                        \
        INSN_3(ALU, SUB,  X),                        \
        INSN_3(ALU, AND,  X),                        \
        INSN_3(ALU, OR,   X),                        \
        INSN_3(ALU, LSH,  X),                        \
        INSN_3(ALU, RSH,  X),                        \
        INSN_3(ALU, XOR,  X),                        \
        INSN_3(ALU, MUL,  X),                        \
        INSN_3(ALU, MOV,  X),                        \
        INSN_3(ALU, ARSH, X),                        \
        INSN_3(ALU, DIV,  X),                        \
        INSN_3(ALU, MOD,  X),                        \
        INSN_2(ALU, NEG),                        \
        INSN_3(ALU, END, TO_BE),                \
        INSN_3(ALU, END, TO_LE),                \
        /*   Immediate based. */                \
        INSN_3(ALU, ADD,  K),                        \
        INSN_3(ALU, SUB,  K),                        \
        INSN_3(ALU, AND,  K),                        \
        INSN_3(ALU, OR,   K),                        \
        INSN_3(ALU, LSH,  K),                        \
        INSN_3(ALU, RSH,  K),                        \
        INSN_3(ALU, XOR,  K),                        \
        INSN_3(ALU, MUL,  K),                        \
        INSN_3(ALU, MOV,  K),                        \
        INSN_3(ALU, ARSH, K),                        \
        INSN_3(ALU, DIV,  K),                        \
        INSN_3(ALU, MOD,  K),                        \
        /* 64 bit ALU operations. */                \
        /*   Register based. */                        \
        INSN_3(ALU64, ADD,  X),                        \
        INSN_3(ALU64, SUB,  X),                        \
        INSN_3(ALU64, AND,  X),                        \
        INSN_3(ALU64, OR,   X),                        \
        INSN_3(ALU64, LSH,  X),                        \
        INSN_3(ALU64, RSH,  X),                        \
        INSN_3(ALU64, XOR,  X),                        \
        INSN_3(ALU64, MUL,  X),                        \
        INSN_3(ALU64, MOV,  X),                        \
        INSN_3(ALU64, ARSH, X),                        \
        INSN_3(ALU64, DIV,  X),                        \
        INSN_3(ALU64, MOD,  X),                        \
        INSN_2(ALU64, NEG),                        \
        INSN_3(ALU64, END, TO_LE),                \
        /*   Immediate based. */                \
        INSN_3(ALU64, ADD,  K),                        \
        INSN_3(ALU64, SUB,  K),                        \
        INSN_3(ALU64, AND,  K),                        \
        INSN_3(ALU64, OR,   K),                        \
        INSN_3(ALU64, LSH,  K),                        \
        INSN_3(ALU64, RSH,  K),                        \
        INSN_3(ALU64, XOR,  K),                        \
        INSN_3(ALU64, MUL,  K),                        \
        INSN_3(ALU64, MOV,  K),                        \
        INSN_3(ALU64, ARSH, K),                        \
        INSN_3(ALU64, DIV,  K),                        \
        INSN_3(ALU64, MOD,  K),                        \
        /* Call instruction. */                        \
        INSN_2(JMP, CALL),                        \
        /* Exit instruction. */                        \
        INSN_2(JMP, EXIT),                        \
        /* 32-bit Jump instructions. */                \
        /*   Register based. */                        \
        INSN_3(JMP32, JEQ,  X),                        \
        INSN_3(JMP32, JNE,  X),                        \
        INSN_3(JMP32, JGT,  X),                        \
        INSN_3(JMP32, JLT,  X),                        \
        INSN_3(JMP32, JGE,  X),                        \
        INSN_3(JMP32, JLE,  X),                        \
        INSN_3(JMP32, JSGT, X),                        \
        INSN_3(JMP32, JSLT, X),                        \
        INSN_3(JMP32, JSGE, X),                        \
        INSN_3(JMP32, JSLE, X),                        \
        INSN_3(JMP32, JSET, X),                        \
        /*   Immediate based. */                \
        INSN_3(JMP32, JEQ,  K),                        \
        INSN_3(JMP32, JNE,  K),                        \
        INSN_3(JMP32, JGT,  K),                        \
        INSN_3(JMP32, JLT,  K),                        \
        INSN_3(JMP32, JGE,  K),                        \
        INSN_3(JMP32, JLE,  K),                        \
        INSN_3(JMP32, JSGT, K),                        \
        INSN_3(JMP32, JSLT, K),                        \
        INSN_3(JMP32, JSGE, K),                        \
        INSN_3(JMP32, JSLE, K),                        \
        INSN_3(JMP32, JSET, K),                        \
        /* Jump instructions. */                \
        /*   Register based. */                        \
        INSN_3(JMP, JEQ,  X),                        \
        INSN_3(JMP, JNE,  X),                        \
        INSN_3(JMP, JGT,  X),                        \
        INSN_3(JMP, JLT,  X),                        \
        INSN_3(JMP, JGE,  X),                        \
        INSN_3(JMP, JLE,  X),                        \
        INSN_3(JMP, JSGT, X),                        \
        INSN_3(JMP, JSLT, X),                        \
        INSN_3(JMP, JSGE, X),                        \
        INSN_3(JMP, JSLE, X),                        \
        INSN_3(JMP, JSET, X),                        \
        /*   Immediate based. */                \
        INSN_3(JMP, JEQ,  K),                        \
        INSN_3(JMP, JNE,  K),                        \
        INSN_3(JMP, JGT,  K),                        \
        INSN_3(JMP, JLT,  K),                        \
        INSN_3(JMP, JGE,  K),                        \
        INSN_3(JMP, JLE,  K),                        \
        INSN_3(JMP, JSGT, K),                        \
        INSN_3(JMP, JSLT, K),                        \
        INSN_3(JMP, JSGE, K),                        \
        INSN_3(JMP, JSLE, K),                        \
        INSN_3(JMP, JSET, K),                        \
        INSN_2(JMP, JA),                        \
        INSN_2(JMP32, JA),                        \
        /* Atomic operations. */                \
        INSN_3(STX, ATOMIC, B),                        \
        INSN_3(STX, ATOMIC, H),                        \
        INSN_3(STX, ATOMIC, W),                        \
        INSN_3(STX, ATOMIC, DW),                \
        /* Store instructions. */                \
        /*   Register based. */                        \
        INSN_3(STX, MEM,  B),                        \
        INSN_3(STX, MEM,  H),                        \
        INSN_3(STX, MEM,  W),                        \
        INSN_3(STX, MEM,  DW),                        \
        /*   Immediate based. */                \
        INSN_3(ST, MEM, B),                        \
        INSN_3(ST, MEM, H),                        \
        INSN_3(ST, MEM, W),                        \
        INSN_3(ST, MEM, DW),                        \
        /* Load instructions. */                \
        /*   Register based. */                        \
        INSN_3(LDX, MEM, B),                        \
        INSN_3(LDX, MEM, H),                        \
        INSN_3(LDX, MEM, W),                        \
        INSN_3(LDX, MEM, DW),                        \
        INSN_3(LDX, MEMSX, B),                        \
        INSN_3(LDX, MEMSX, H),                        \
        INSN_3(LDX, MEMSX, W),                        \
        /*   Immediate based. */                \
        INSN_3(LD, IMM, DW)

/* Tell whether @code belongs to the public instruction table: every
 * opcode listed in BPF_INSN_MAP, plus a handful that are UAPI exposed
 * but get rewritten.
 *
 * Return: true if @code is in the table, false otherwise.
 */
bool bpf_opcode_in_insntable(u8 code)
{
#define UAPI_OP_2(cls, op)        [BPF_##cls | BPF_##op] = true
#define UAPI_OP_3(cls, op, mod)   [BPF_##cls | BPF_##op | BPF_##mod] = true
        static const bool uapi_opcode[256] = {
                /* Nothing is accepted by default ... */
                [0 ... 255] = false,
                /* ... except for what the entries below switch on. */
                BPF_INSN_MAP(UAPI_OP_2, UAPI_OP_3),
                /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
                [BPF_LD | BPF_ABS | BPF_B] = true,
                [BPF_LD | BPF_IND | BPF_B] = true,
                [BPF_LD | BPF_ABS | BPF_H] = true,
                [BPF_LD | BPF_IND | BPF_H] = true,
                [BPF_LD | BPF_ABS | BPF_W] = true,
                [BPF_LD | BPF_IND | BPF_W] = true,
                [BPF_JMP | BPF_JCOND] = true,
        };
#undef UAPI_OP_2
#undef UAPI_OP_3

        return uapi_opcode[code];
}

#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
 *        ___bpf_prog_run - run eBPF program on a given context
 *        @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
 *        @insn: is the array of eBPF instructions
 *
 * Decode and execute eBPF instructions.
 *
 * Return: whatever value is in %BPF_R0 at program exit
 */
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
#define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
        static const void * const jumptable[256] __annotate_jump_table = {
                [0 ... 255] = &&default_label,
                /* Now overwrite non-defaults ... */
                BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
                /* Non-UAPI available opcodes. */
                [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
                [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
                [BPF_ST  | BPF_NOSPEC] = &&ST_NOSPEC,
                [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
                [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
                [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
                [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
        };
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
        u32 tail_call_cnt = 0;

#define CONT         ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

select_insn:
        goto *jumptable[insn->code];

        /* Explicitly mask the register-based shift amounts with 63 or 31
         * to avoid undefined behavior. Normally this won't affect the
         * generated code, for example, in case of native 64 bit archs such
         * as x86-64 or arm64, the compiler is optimizing the AND away for
         * the interpreter. In case of JITs, each of the JIT backends compiles
         * the BPF shift operations to machine instructions which produce
         * implementation-defined results in such a case; the resulting
         * contents of the register may be arbitrary, but program behaviour
         * as a whole remains defined. In other words, in case of JIT backends,
         * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
         */
        /* ALU (shifts) */
#define SHT(OPCODE, OP)                                        \
        ALU64_##OPCODE##_X:                                \
                DST = DST OP (SRC & 63);                \
                CONT;                                        \
        ALU_##OPCODE##_X:                                \
                DST = (u32) DST OP ((u32) SRC & 31);        \
                CONT;                                        \
        ALU64_##OPCODE##_K:                                \
                DST = DST OP IMM;                        \
                CONT;                                        \
        ALU_##OPCODE##_K:                                \
                DST = (u32) DST OP (u32) IMM;                \
                CONT;
        /* ALU (rest) */
#define ALU(OPCODE, OP)                                        \
        ALU64_##OPCODE##_X:                                \
                DST = DST OP SRC;                        \
                CONT;                                        \
        ALU_##OPCODE##_X:                                \
                DST = (u32) DST OP (u32) SRC;                \
                CONT;                                        \
        ALU64_##OPCODE##_K:                                \
                DST = DST OP IMM;                        \
                CONT;                                        \
        ALU_##OPCODE##_K:                                \
                DST = (u32) DST OP (u32) IMM;                \
                CONT;
        ALU(ADD,  +)
        ALU(SUB,  -)
        ALU(AND,  &)
        ALU(OR,   |)
        ALU(XOR,  ^)
        ALU(MUL,  *)
        SHT(LSH, <<)
        SHT(RSH, >>)
#undef SHT
#undef ALU
        ALU_NEG:
                DST = (u32) -DST;
                CONT;
        ALU64_NEG:
                DST = -DST;
                CONT;
        ALU_MOV_X:
                switch (OFF) {
                case 0:
                        DST = (u32) SRC;
                        break;
                case 8:
                        DST = (u32)(s8) SRC;
                        break;
                case 16:
                        DST = (u32)(s16) SRC;
                        break;
                }
                CONT;
        ALU_MOV_K:
                DST = (u32) IMM;
                CONT;
        ALU64_MOV_X:
                switch (OFF) {
                case 0:
                        DST = SRC;
                        break;
                case 8:
                        DST = (s8) SRC;
                        break;
                case 16:
                        DST = (s16) SRC;
                        break;
                case 32:
                        DST = (s32) SRC;
                        break;
                }
                CONT;
        ALU64_MOV_K:
                DST = IMM;
                CONT;
        LD_IMM_DW:
                DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
                insn++;
                CONT;
        ALU_ARSH_X:
                DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
                CONT;
        ALU_ARSH_K:
                DST = (u64) (u32) (((s32) DST) >> IMM);
                CONT;
        ALU64_ARSH_X:
                (*(s64 *) &DST) >>= (SRC & 63);
                CONT;
        ALU64_ARSH_K:
                (*(s64 *) &DST) >>= IMM;
                CONT;
        ALU64_MOD_X:
                switch (OFF) {
                case 0:
                        div64_u64_rem(DST, SRC, &AX);
                        DST = AX;
                        break;
                case 1:
                        AX = div64_s64(DST, SRC);
                        DST = DST - AX * SRC;
                        break;
                }
                CONT;
        ALU_MOD_X:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        DST = do_div(AX, (u32) SRC);
                        break;
                case 1:
                        AX = abs((s32)DST);
                        AX = do_div(AX, abs((s32)SRC));
                        if ((s32)DST < 0)
                                DST = (u32)-AX;
                        else
                                DST = (u32)AX;
                        break;
                }
                CONT;
        ALU64_MOD_K:
                switch (OFF) {
                case 0:
                        div64_u64_rem(DST, IMM, &AX);
                        DST = AX;
                        break;
                case 1:
                        AX = div64_s64(DST, IMM);
                        DST = DST - AX * IMM;
                        break;
                }
                CONT;
        ALU_MOD_K:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        DST = do_div(AX, (u32) IMM);
                        break;
                case 1:
                        AX = abs((s32)DST);
                        AX = do_div(AX, abs((s32)IMM));
                        if ((s32)DST < 0)
                                DST = (u32)-AX;
                        else
                                DST = (u32)AX;
                        break;
                }
                CONT;
        ALU64_DIV_X:
                switch (OFF) {
                case 0:
                        DST = div64_u64(DST, SRC);
                        break;
                case 1:
                        DST = div64_s64(DST, SRC);
                        break;
                }
                CONT;
        ALU_DIV_X:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        do_div(AX, (u32) SRC);
                        DST = (u32) AX;
                        break;
                case 1:
                        AX = abs((s32)DST);
                        do_div(AX, abs((s32)SRC));
                        if (((s32)DST < 0) == ((s32)SRC < 0))
                                DST = (u32)AX;
                        else
                                DST = (u32)-AX;
                        break;
                }
                CONT;
        ALU64_DIV_K:
                switch (OFF) {
                case 0:
                        DST = div64_u64(DST, IMM);
                        break;
                case 1:
                        DST = div64_s64(DST, IMM);
                        break;
                }
                CONT;
        ALU_DIV_K:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        do_div(AX, (u32) IMM);
                        DST = (u32) AX;
                        break;
                case 1:
                        AX = abs((s32)DST);
                        do_div(AX, abs((s32)IMM));
                        if (((s32)DST < 0) == ((s32)IMM < 0))
                                DST = (u32)AX;
                        else
                                DST = (u32)-AX;
                        break;
                }
                CONT;
        ALU_END_TO_BE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) cpu_to_be16(DST);
                        break;
                case 32:
                        DST = (__force u32) cpu_to_be32(DST);
                        break;
                case 64:
                        DST = (__force u64) cpu_to_be64(DST);
                        break;
                }
                CONT;
        ALU_END_TO_LE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) cpu_to_le16(DST);
                        break;
                case 32:
                        DST = (__force u32) cpu_to_le32(DST);
                        break;
                case 64:
                        DST = (__force u64) cpu_to_le64(DST);
                        break;
                }
                CONT;
        ALU64_END_TO_LE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) __swab16(DST);
                        break;
                case 32:
                        DST = (__force u32) __swab32(DST);
                        break;
                case 64:
                        DST = (__force u64) __swab64(DST);
                        break;
                }
                CONT;

        /* CALL */
        JMP_CALL:
                /* Function call scratches BPF_R1-BPF_R5 registers,
                 * preserves BPF_R6-BPF_R9, and stores return value
                 * into BPF_R0.
                 */
                BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                                                       BPF_R4, BPF_R5);
                CONT;

        JMP_CALL_ARGS:
                BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
                                                            BPF_R3, BPF_R4,
                                                            BPF_R5,
                                                            insn + insn->off + 1);
                CONT;

        JMP_TAIL_CALL: {
                struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
                struct bpf_array *array = container_of(map, struct bpf_array, map);
                struct bpf_prog *prog;
                u32 index = BPF_R3;

                if (unlikely(index >= array->map.max_entries))
                        goto out;

                if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
                        goto out;

                tail_call_cnt++;

                prog = READ_ONCE(array->ptrs[index]);
                if (!prog)
                        goto out;

                /* ARG1 at this point is guaranteed to point to CTX from
                 * the verifier side due to the fact that the tail call is
                 * handled like a helper, that is, bpf_tail_call_proto,
                 * where arg1_type is ARG_PTR_TO_CTX.
                 */
                insn = prog->insnsi;
                goto select_insn;
out:
                CONT;
        }
        JMP_JA:
                insn += insn->off;
                CONT;
        JMP32_JA:
                insn += insn->imm;
                CONT;
        JMP_EXIT:
                return BPF_R0;
        /* JMP */
#define COND_JMP(SIGN, OPCODE, CMP_OP)                                \
        JMP_##OPCODE##_X:                                        \
                if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP32_##OPCODE##_X:                                        \
                if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP_##OPCODE##_K:                                        \
                if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP32_##OPCODE##_K:                                        \
                if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;
        COND_JMP(u, JEQ, ==)
        COND_JMP(u, JNE, !=)
        COND_JMP(u, JGT, >)
        COND_JMP(u, JLT, <)
        COND_JMP(u, JGE, >=)
        COND_JMP(u, JLE, <=)
        COND_JMP(u, JSET, &)
        COND_JMP(s, JSGT, >)
        COND_JMP(s, JSLT, <)
        COND_JMP(s, JSGE, >=)
        COND_JMP(s, JSLE, <=)
#undef COND_JMP
        /* ST, STX and LDX*/
        ST_NOSPEC:
                /* Speculation barrier for mitigating Speculative Store Bypass,
                 * Bounds-Check Bypass and Type Confusion. In case of arm64, we
                 * rely on the firmware mitigation as controlled via the ssbd
                 * kernel parameter. Whenever the mitigation is enabled, it
                 * works for all of the kernel code with no need to provide any
                 * additional instructions here. In case of x86, we use 'lfence'
                 * insn for mitigation. We reuse preexisting logic from Spectre
                 * v1 mitigation that happens to produce the required code on
                 * x86 for v4 as well.
                 */
                barrier_nospec();
                CONT;
#define LDST(SIZEOP, SIZE)                                                \
        STX_MEM_##SIZEOP:                                                \
                *(SIZE *)(unsigned long) (DST + insn->off) = SRC;        \
                CONT;                                                        \
        ST_MEM_##SIZEOP:                                                \
                *(SIZE *)(unsigned long) (DST + insn->off) = IMM;        \
                CONT;                                                        \
        LDX_MEM_##SIZEOP:                                                \
                DST = *(SIZE *)(unsigned long) (SRC + insn->off);        \
                CONT;                                                        \
        LDX_PROBE_MEM_##SIZEOP:                                                \
                bpf_probe_read_kernel_common(&DST, sizeof(SIZE),        \
                              (const void *)(long) (SRC + insn->off));        \
                DST = *((SIZE *)&DST);                                        \
                CONT;

        LDST(B,   u8)
        LDST(H,  u16)
        LDST(W,  u32)
        LDST(DW, u64)
#undef LDST

#define LDSX(SIZEOP, SIZE)                                                \
        LDX_MEMSX_##SIZEOP:                                                \
                DST = *(SIZE *)(unsigned long) (SRC + insn->off);        \
                CONT;                                                        \
        LDX_PROBE_MEMSX_##SIZEOP:                                        \
                bpf_probe_read_kernel_common(&DST, sizeof(SIZE),                \
                                      (const void *)(long) (SRC + insn->off));        \
                DST = *((SIZE *)&DST);                                        \
                CONT;

        LDSX(B,   s8)
        LDSX(H,  s16)
        LDSX(W,  s32)
#undef LDSX

#define ATOMIC_ALU_OP(BOP, KOP)                                                \
                case BOP:                                                \
                        if (BPF_SIZE(insn->code) == BPF_W)                \
                                atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
                                             (DST + insn->off));        \
                        else if (BPF_SIZE(insn->code) == BPF_DW)        \
                                atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
                                               (DST + insn->off));        \
                        else                                                \
                                goto default_label;                        \
                        break;                                                \
                case BOP | BPF_FETCH:                                        \
                        if (BPF_SIZE(insn->code) == BPF_W)                \
                                SRC = (u32) atomic_fetch_##KOP(                \
                                        (u32) SRC,                        \
                                        (atomic_t *)(unsigned long) (DST + insn->off)); \
                        else if (BPF_SIZE(insn->code) == BPF_DW)        \
                                SRC = (u64) atomic64_fetch_##KOP(        \
                                        (u64) SRC,                        \
                                        (atomic64_t *)(unsigned long) (DST + insn->off)); \
                        else                                                \
                                goto default_label;                        \
                        break;

        STX_ATOMIC_DW:
        STX_ATOMIC_W:
        STX_ATOMIC_H:
        STX_ATOMIC_B:
                switch (IMM) {
                /* Atomic read-modify-write instructions support only W and DW
                 * size modifiers.
                 */
                ATOMIC_ALU_OP(BPF_ADD, add)
                ATOMIC_ALU_OP(BPF_AND, and)
                ATOMIC_ALU_OP(BPF_OR, or)
                ATOMIC_ALU_OP(BPF_XOR, xor)
#undef ATOMIC_ALU_OP

                case BPF_XCHG:
                        if (BPF_SIZE(insn->code) == BPF_W)
                                SRC = (u32) atomic_xchg(
                                        (atomic_t *)(unsigned long) (DST + insn->off),
                                        (u32) SRC);
                        else if (BPF_SIZE(insn->code) == BPF_DW)
                                SRC = (u64) atomic64_xchg(
                                        (atomic64_t *)(unsigned long) (DST + insn->off),
                                        (u64) SRC);
                        else
                                goto default_label;
                        break;
                case BPF_CMPXCHG:
                        if (BPF_SIZE(insn->code) == BPF_W)
                                BPF_R0 = (u32) atomic_cmpxchg(
                                        (atomic_t *)(unsigned long) (DST + insn->off),
                                        (u32) BPF_R0, (u32) SRC);
                        else if (BPF_SIZE(insn->code) == BPF_DW)
                                BPF_R0 = (u64) atomic64_cmpxchg(
                                        (atomic64_t *)(unsigned long) (DST + insn->off),
                                        (u64) BPF_R0, (u64) SRC);
                        else
                                goto default_label;
                        break;
                /* Atomic load and store instructions support all size
                 * modifiers.
                 */
                case BPF_LOAD_ACQ:
                        switch (BPF_SIZE(insn->code)) {
#define LOAD_ACQUIRE(SIZEOP, SIZE)                                \
                        case BPF_##SIZEOP:                        \
                                DST = (SIZE)smp_load_acquire(        \
                                        (SIZE *)(unsigned long)(SRC + insn->off));        \
                                break;
                        LOAD_ACQUIRE(B,   u8)
                        LOAD_ACQUIRE(H,  u16)
                        LOAD_ACQUIRE(W,  u32)
#ifdef CONFIG_64BIT
                        LOAD_ACQUIRE(DW, u64)
#endif
#undef LOAD_ACQUIRE
                        default:
                                goto default_label;
                        }
                        break;
                case BPF_STORE_REL:
                        switch (BPF_SIZE(insn->code)) {
#define STORE_RELEASE(SIZEOP, SIZE)                        \
                        case BPF_##SIZEOP:                \
                                smp_store_release(        \
                                        (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC);        \
                                break;
                        STORE_RELEASE(B,   u8)
                        STORE_RELEASE(H,  u16)
                        STORE_RELEASE(W,  u32)
#ifdef CONFIG_64BIT
                        STORE_RELEASE(DW, u64)
#endif
#undef STORE_RELEASE
                        default:
                                goto default_label;
                        }
                        break;

                default:
                        goto default_label;
                }
                CONT;

        default_label:
                /* If we ever reach this, we have a bug somewhere. Die hard here
                 * instead of just returning 0; we could be somewhere in a subprog,
                 * so execution could continue otherwise which we do /not/ want.
                 *
                 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
                 */
                pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
                        insn->code, insn->imm);
                BUG_ON(1);
                return 0;
}

/* Name of the interpreter entry point generated for @stack_size bytes of
 * BPF stack.
 */
#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
/* Define one static interpreter entry point, PROG_NAME(stack_size), taking
 * (ctx, insn). It reserves stack_size bytes of BPF stack as a local u64
 * array, starts from an all-zero regs[] array, hands the stack to
 * kmsan_unpoison_memory(), points FP one element past the end of the stack
 * array, stores ctx in ARG1 and then runs ___bpf_prog_run(regs, insn).
 *
 * NOTE(review): FP and ARG1 are defined outside this chunk; presumably they
 * alias slots of the local regs[] array - confirm against their definitions.
 */
#define DEFINE_BPF_PROG_RUN(stack_size) \
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
        u64 stack[stack_size / sizeof(u64)]; \
        u64 regs[MAX_BPF_EXT_REG] = {}; \
\
        kmsan_unpoison_memory(stack, sizeof(stack)); \
        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
        ARG1 = (u64) (unsigned long) ctx; \
        return ___bpf_prog_run(regs, insn); \
}

/* Name of the five-argument interpreter entry point generated for
 * @stack_size bytes of BPF stack.
 */
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
/* Define one static interpreter entry point, PROG_NAME_ARGS(stack_size),
 * that receives the values for BPF_R1..BPF_R5 directly as u64 arguments and
 * returns the u64 result of ___bpf_prog_run().
 *
 * Unlike DEFINE_BPF_PROG_RUN(), regs[] is not zero-initialized here: only FP
 * and BPF_R1..BPF_R5 are assigned before the interpreter starts.
 */
#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
                                      const struct bpf_insn *insn) \
{ \
        u64 stack[stack_size / sizeof(u64)]; \
        u64 regs[MAX_BPF_EXT_REG]; \
\
        kmsan_unpoison_memory(stack, sizeof(stack)); \
        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
        BPF_R1 = r1; \
        BPF_R2 = r2; \
        BPF_R3 = r3; \
        BPF_R4 = r4; \
        BPF_R5 = r5; \
        return ___bpf_prog_run(regs, insn); \
}

/* EVALn(FN, a1, ..., an) expands to FN(a1) FN(a2) ... FN(an): each level
 * applies FN to its first argument and passes the rest (the named variadic
 * parameter Y) down to the next lower level.
 */
#define EVAL1(FN, X) FN(X)
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)

/* Generate the ctx-based interpreter entry points for stack sizes
 * 32, 64, ..., 512 (16 variants, in steps of 32 bytes).
 */
EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);

/* Same 16 stack sizes for the five-argument entry points. */
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);

/* PROG_NAME_LIST() turns a stack size into one array initializer element;
 * it is defined twice below, once per table.
 */
#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),

/* Table of ctx-based interpreter entry points. Slot i holds the variant
 * with (i + 1) * 32 bytes of stack, so users index it with
 * (round_up(stack_depth, 32) / 32) - 1.
 */
static unsigned int (*interpreters[])(const void *ctx,
                                      const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
/* Table of five-argument interpreter entry points, laid out like
 * interpreters[] above. Marked __maybe_unused because its only user in this
 * chunk, bpf_patch_call_args(), is built under CONFIG_BPF_SYSCALL.
 */
static __maybe_unused
u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
                           const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST

#ifdef CONFIG_BPF_SYSCALL
/* Rewrite @insn so that it dispatches through the five-argument interpreter
 * variant sized for @stack_depth.
 *
 * The original imm is preserved (truncated to s16) in insn->off, imm becomes
 * the distance of the chosen interpreters_args[] entry from
 * __bpf_call_base_args, and the opcode is set to BPF_JMP | BPF_CALL_ARGS.
 * A stack depth of 0 is treated as 1 so that the first slot is used.
 */
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
        u32 depth = max_t(u32, stack_depth, 1);
        u32 slot = (round_up(depth, 32) / 32) - 1;

        /* Must read imm before it is overwritten below. */
        insn->off = (s16) insn->imm;
        insn->imm = interpreters_args[slot] - __bpf_call_base_args;
        insn->code = BPF_JMP | BPF_CALL_ARGS;
}
#endif
#endif

/* Fallback bpf_func that warns once and returns 0. It is installed by
 * bpf_prog_select_interpreter() whenever no interpreter variant is selected,
 * so it should only ever run if a program that had to be JITed is executed
 * without a JIT image.
 */
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
                                         const struct bpf_insn *insn)
{
        /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
         * is not working properly, so warn about it!
         */
        WARN_ON_ONCE(1);
        return 0;
}

/* Check whether @fp may be used with the prog-holding @map, claiming
 * ownership of the map for @fp's attributes if nobody has done so yet.
 *
 * Programs with kprobe_override set are always rejected. Otherwise, under
 * map->owner_lock:
 *  - if the map has no owner, one is allocated and @fp's type, jited state,
 *    xdp_has_frags, expected_attach_type, attach_func_proto and cgroup
 *    storage cookies are recorded in it;
 *  - if it has one, @fp must agree with what was recorded (see the inline
 *    comments for the per-field rules).
 *
 * Return: true if @fp is compatible (or just became the owner), false
 * otherwise, including when the owner allocation fails.
 */
static bool __bpf_prog_map_compatible(struct bpf_map *map,
                                      const struct bpf_prog *fp)
{
        enum bpf_prog_type prog_type = resolve_prog_type(fp);
        struct bpf_prog_aux *aux = fp->aux;
        enum bpf_cgroup_storage_type i;
        bool ret = false;
        u64 cookie;

        /* Rejected before the lock is taken; the map is left untouched. */
        if (fp->kprobe_override)
                return ret;

        spin_lock(&map->owner_lock);
        /* There's no owner yet where we could check for compatibility. */
        if (!map->owner) {
                map->owner = bpf_map_owner_alloc(map);
                if (!map->owner)
                        goto err;
                map->owner->type  = prog_type;
                map->owner->jited = fp->jited;
                map->owner->xdp_has_frags = aux->xdp_has_frags;
                map->owner->expected_attach_type = fp->expected_attach_type;
                map->owner->attach_func_proto = aux->attach_func_proto;
                /* A program without storage of a given type records cookie 0. */
                for_each_cgroup_storage_type(i) {
                        map->owner->storage_cookie[i] =
                                aux->cgroup_storage[i] ?
                                aux->cgroup_storage[i]->cookie : 0;
                }
                ret = true;
        } else {
                /* Type, jited state and xdp_has_frags must always match. */
                ret = map->owner->type  == prog_type &&
                      map->owner->jited == fp->jited &&
                      map->owner->xdp_has_frags == aux->xdp_has_frags;
                /* expected_attach_type is only enforced for prog arrays. */
                if (ret &&
                    map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
                    map->owner->expected_attach_type != fp->expected_attach_type)
                        ret = false;
                /* Each storage cookie must equal the owner's, unless the
                 * program has no storage of that type (cookie 0).
                 */
                for_each_cgroup_storage_type(i) {
                        if (!ret)
                                break;
                        cookie = aux->cgroup_storage[i] ?
                                 aux->cgroup_storage[i]->cookie : 0;
                        ret = map->owner->storage_cookie[i] == cookie ||
                              !cookie;
                }
                /* A differing attach_func_proto is fatal only for the
                 * program types listed below; other types ignore it.
                 */
                if (ret &&
                    map->owner->attach_func_proto != aux->attach_func_proto) {
                        switch (prog_type) {
                        case BPF_PROG_TYPE_TRACING:
                        case BPF_PROG_TYPE_LSM:
                        case BPF_PROG_TYPE_EXT:
                        case BPF_PROG_TYPE_STRUCT_OPS:
                                ret = false;
                                break;
                        default:
                                break;
                        }
                }
        }
err:
        spin_unlock(&map->owner_lock);
        return ret;
}

/* Public compatibility check between @fp and the prog-holding @map.
 *
 * XDP programs inserted into maps are not guaranteed to run on a particular
 * netdev (and can run outside driver context entirely in the case of devmap
 * and cpumap). Until device checks are implemented, dev-bound programs are
 * refused outright; everything else is decided by
 * __bpf_prog_map_compatible(), which is not called for dev-bound programs.
 */
bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
{
        return !bpf_prog_is_dev_bound(fp->aux) &&
               __bpf_prog_map_compatible(map, fp);
}

/* Verify @fp against every used map for which map_type_contains_progs()
 * is true. The walk stops at the first incompatible map.
 *
 * aux->used_maps_mutex is held for the duration of the walk. Note that
 * __bpf_prog_map_compatible() records @fp as owner of a map that has none.
 *
 * Return: 0 if all such maps are compatible, -EINVAL otherwise.
 */
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
        struct bpf_prog_aux *aux = fp->aux;
        int err = 0;
        int idx;

        mutex_lock(&aux->used_maps_mutex);
        for (idx = 0; idx < aux->used_map_cnt; idx++) {
                struct bpf_map *cur = aux->used_maps[idx];

                if (map_type_contains_progs(cur) &&
                    !__bpf_prog_map_compatible(cur, fp)) {
                        err = -EINVAL;
                        break;
                }
        }
        mutex_unlock(&aux->used_maps_mutex);

        return err;
}

static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
        bool select_interpreter = false;
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
        u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
        u32 idx = (round_up(stack_depth, 32) / 32) - 1;

        /* may_goto may cause stack size > 512, leading to idx out-of-bounds.
         * But for non-JITed programs, we don't need bpf_func, so no bounds
         * check needed.
         */
        if (idx < ARRAY_SIZE(interpreters)) {
                fp->bpf_func = interpreters[idx];
                select_interpreter = true;
        } else {
                fp->bpf_func = __bpf_prog_ret0_warn;
        }
#else
        fp->bpf_func = __bpf_prog_ret0_warn;
#endif
        return select_interpreter;
}

/**
 * bpf_prog_select_runtime - select exec runtime for BPF program
 * @fp: bpf_prog populated with BPF program
 * @err: pointer to error variable
 *
 * Try to JIT eBPF program, if JIT is not available, use interpreter.
 * The BPF program will be executed via bpf_prog_run() function.
 *
 * If @fp->bpf_func is already set, runtime selection is skipped and only the
 * final steps (read-only locking and the tail call compatibility check) are
 * performed. A JIT image is mandatory when CONFIG_BPF_JIT_ALWAYS_ON is
 * enabled, when the program has kfunc calls, or when no interpreter variant
 * could be selected; if it is mandatory but missing, @err is -ENOTSUPP.
 *
 * Return: the program to use (the value handed back by
 * bpf_int_jit_compile() replaces @fp), with *@err set to 0 for success or
 * a negative errno code on failure
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
        /* In case of BPF to BPF calls, verifier did all the prep
         * work with regards to JITing, etc., and bpf_func is set.
         */
        if (!fp->bpf_func) {
                bool jit_needed = IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
                                  bpf_prog_has_kfunc_call(fp);

                if (!bpf_prog_select_interpreter(fp))
                        jit_needed = true;

                if (bpf_prog_is_offloaded(fp->aux)) {
                        *err = bpf_prog_offload_compile(fp);
                        if (*err)
                                return fp;
                } else {
                        /* eBPF JITs can rewrite the program in case constant
                         * blinding is active. However, in case of error
                         * during blinding, bpf_int_jit_compile() must always
                         * return a valid program, which in this case would
                         * simply not be JITed, but falls back to the
                         * interpreter.
                         */
                        *err = bpf_prog_alloc_jited_linfo(fp);
                        if (*err)
                                return fp;

                        fp = bpf_int_jit_compile(fp);
                        bpf_prog_jit_attempt_done(fp);
                        if (!fp->jited && jit_needed) {
                                *err = -ENOTSUPP;
                                return fp;
                        }
                }
        }

        *err = bpf_prog_lock_ro(fp);

        /* The tail call compatibility check can only be done at
         * this late stage as we need to determine, if we deal
         * with JITed or non JITed program concatenations and not
         * all eBPF JITs might immediately support all features.
         */
        if (!*err)
                *err = bpf_check_tail_call(fp);

        return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);

/* bpf_func of the dummy program below: ignores both arguments and always
 * returns 1.
 */
static unsigned int __bpf_prog_ret1(const void *ctx,
                                    const struct bpf_insn *insn)
{
        return 1;
}

/* Placeholder program whose bpf_func is __bpf_prog_ret1(). The prog array
 * helpers in this file store &dummy_bpf_prog.prog in a slot to mark it as
 * deleted without shrinking the array, and skip such slots when counting or
 * copying entries.
 */
static struct bpf_prog_dummy {
        struct bpf_prog prog;
} dummy_bpf_prog = {
        .prog = {
                .bpf_func = __bpf_prog_ret1,
        },
};

/* Shared, statically allocated empty program array. bpf_prog_array_alloc()
 * returns its header for a zero-sized request and the free helpers
 * recognize and never free it.
 */
struct bpf_empty_prog_array bpf_empty_prog_array = {
        .null_prog = NULL,
};
EXPORT_SYMBOL(bpf_empty_prog_array);

/* Allocate a zeroed program array with room for @prog_cnt items plus one
 * extra item, using allocation flags @flags.
 *
 * A request for zero programs does not allocate; it returns the shared
 * bpf_empty_prog_array header instead.
 *
 * Return: the array, or NULL if the allocation failed.
 */
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
        struct bpf_prog_array *p;

        if (!prog_cnt)
                return &bpf_empty_prog_array.hdr;

        p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
        return p;
}

/* Release @progs through kfree_rcu(). NULL and the shared
 * bpf_empty_prog_array are ignored.
 */
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
        if (progs && progs != &bpf_empty_prog_array.hdr)
                kfree_rcu(progs, rcu);
}

/* RCU Tasks Trace callback queued by bpf_prog_array_free_sleepable().
 *
 * If RCU Tasks Trace grace period implies RCU grace period, the array can
 * be freed with kfree() right away; otherwise it still has to go through
 * kfree_rcu().
 */
static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
        struct bpf_prog_array *progs =
                container_of(rcu, struct bpf_prog_array, rcu);

        if (!rcu_trace_implies_rcu_gp()) {
                kfree_rcu(progs, rcu);
                return;
        }

        kfree(progs);
}

/* Release @progs via call_rcu_tasks_trace(), with
 * __bpf_prog_array_free_sleepable_cb() doing the actual freeing. NULL and
 * the shared bpf_empty_prog_array are ignored.
 */
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
{
        if (progs && progs != &bpf_empty_prog_array.hdr)
                call_rcu_tasks_trace(&progs->rcu,
                                     __bpf_prog_array_free_sleepable_cb);
}

/* Count the programs in @array, walking up to the NULL terminator and not
 * counting slots that hold the dummy placeholder program.
 */
int bpf_prog_array_length(struct bpf_prog_array *array)
{
        struct bpf_prog_array_item *cur = array->items;
        u32 live = 0;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog)
                        live++;
                cur++;
        }

        return live;
}

/* Return true if @array holds nothing but dummy placeholder programs before
 * its NULL terminator (which includes having no entries at all).
 */
bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
{
        struct bpf_prog_array_item *cur = array->items;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog)
                        return false;
                cur++;
        }

        return true;
}

/* Copy the ids of the programs in @array into @prog_ids, skipping dummy
 * placeholder slots and stopping once @request_cnt ids have been written.
 *
 * The first id is stored before the count is compared, so callers must pass
 * a @request_cnt of at least 1 with a buffer of that many entries.
 *
 * Return: true if the walk stopped in front of a non-NULL slot, i.e. the
 * array still had entries left when the buffer filled up; false otherwise.
 */
static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
                                     u32 *prog_ids,
                                     u32 request_cnt)
{
        struct bpf_prog_array_item *cur = array->items;
        int filled = 0;

        while (cur->prog) {
                struct bpf_prog *prog = cur->prog;

                /* Always step past the slot just examined. */
                cur++;
                if (prog == &dummy_bpf_prog.prog)
                        continue;

                prog_ids[filled++] = prog->aux->id;
                if (filled == request_cnt)
                        break;
        }

        return cur->prog != NULL;
}

/* Copy up to @cnt program ids from @array to the user buffer @prog_ids.
 *
 * The ids are first gathered into a zeroed kernel buffer of @cnt entries and
 * the whole buffer is then copied out, so unused trailing entries read as 0.
 *
 * Return: 0 on success, -ENOMEM if the temporary buffer cannot be
 * allocated, -EFAULT if the copy to user space fails, -ENOSPC if the array
 * held more programs than fit into @cnt entries.
 */
int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
                                __u32 __user *prog_ids, u32 cnt)
{
        unsigned long uncopied;
        bool truncated;
        u32 *ids;

        /* users of this function are doing:
         * cnt = bpf_prog_array_length();
         * if (cnt > 0)
         *     bpf_prog_array_copy_to_user(..., cnt);
         * so below kcalloc doesn't need extra cnt > 0 check.
         */
        ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
        if (!ids)
                return -ENOMEM;

        truncated = bpf_prog_array_copy_core(array, ids, cnt);
        uncopied = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
        kfree(ids);

        /* A failed copy takes precedence over truncation. */
        if (uncopied)
                return -EFAULT;

        return truncated ? -ENOSPC : 0;
}

/* Replace the first slot of @array that holds @old_prog with the dummy
 * placeholder program. The array is not resized; nothing happens if
 * @old_prog is not found.
 */
void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
                                struct bpf_prog *old_prog)
{
        struct bpf_prog_array_item *cur;

        for (cur = array->items; cur->prog; cur++) {
                if (cur->prog != old_prog)
                        continue;

                WRITE_ONCE(cur->prog, &dummy_bpf_prog.prog);
                return;
        }
}

/**
 * bpf_prog_array_delete_safe_at() - Replaces the program at the given
 *                                   index into the program array with
 *                                   a dummy no-op program.
 * @array: a bpf_prog_array
 * @index: the index of the program to replace
 *
 * Skips over dummy programs, by not counting them, when calculating
 * the position of the program to replace.
 *
 * Implemented as bpf_prog_array_update_at() with the dummy program as the
 * replacement, so the return values are those of that function.
 *
 * Return:
 * * 0                - Success
 * * -EINVAL        - Invalid index value. Must be a non-negative integer.
 * * -ENOENT        - Index out of range
 */
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
{
        return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
}

/**
 * bpf_prog_array_update_at() - Updates the program at the given index
 *                              into the program array.
 * @array: a bpf_prog_array
 * @index: the index of the program to update
 * @prog: the program to insert into the array
 *
 * Skips over dummy programs, by not counting them, when calculating
 * the position of the program to update. The slot is written with
 * WRITE_ONCE().
 *
 * Return:
 * * 0                - Success
 * * -EINVAL        - Invalid index value. Must be a non-negative integer.
 * * -ENOENT        - Index out of range
 */
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
                             struct bpf_prog *prog)
{
        struct bpf_prog_array_item *cur;
        int remaining = index;

        if (unlikely(remaining < 0))
                return -EINVAL;

        for (cur = array->items; cur->prog; cur++) {
                if (cur->prog == &dummy_bpf_prog.prog)
                        continue;

                /* Count down over live entries until the target is hit. */
                if (remaining-- == 0) {
                        WRITE_ONCE(cur->prog, prog);
                        return 0;
                }
        }

        return -ENOENT;
}

/* Build a new program array from @old_array, optionally dropping
 * @exclude_prog and/or appending @include_prog with @bpf_cookie.
 *
 * Dummy placeholder entries of @old_array are not carried over. @old_array
 * may be NULL and is not modified. On success *@new_array is the new array,
 * or NULL if the result would contain no programs.
 *
 * Return: 0 on success, -EEXIST if @include_prog is already present in
 * @old_array, -ENOENT if @exclude_prog was given but not found, -ENOMEM if
 * the allocation fails.
 */
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
                        u64 bpf_cookie,
                        struct bpf_prog_array **new_array)
{
        struct bpf_prog_array_item *src, *dst;
        struct bpf_prog_array *fresh;
        bool excluded_seen = false;
        int kept = 0;
        int total;

        /* First pass: count the entries that survive into the new array
         * and detect the -EEXIST / -ENOENT conditions.
         */
        if (old_array) {
                for (src = old_array->items; src->prog; src++) {
                        if (src->prog == exclude_prog) {
                                excluded_seen = true;
                                continue;
                        }
                        if (src->prog != &dummy_bpf_prog.prog)
                                kept++;
                        if (src->prog == include_prog)
                                return -EEXIST;
                }
        }

        if (exclude_prog && !excluded_seen)
                return -ENOENT;

        /* Number of real (non-NULL) programs in the new array. */
        total = kept + (include_prog ? 1 : 0);
        if (!total) {
                *new_array = NULL;
                return 0;
        }

        /* +1 as the end of prog_array is marked with NULL.
         * bpf_prog_array_alloc() reserves one item beyond the count it is
         * given as well, so the array ends up with one spare slot.
         */
        fresh = bpf_prog_array_alloc(total + 1, GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;
        dst = fresh->items;

        /* Second pass: copy the surviving entries with their cookies. */
        if (kept) {
                for (src = old_array->items; src->prog; src++) {
                        if (src->prog == exclude_prog ||
                            src->prog == &dummy_bpf_prog.prog)
                                continue;

                        dst->prog = src->prog;
                        dst->bpf_cookie = src->bpf_cookie;
                        dst++;
                }
        }

        if (include_prog) {
                dst->prog = include_prog;
                dst->bpf_cookie = bpf_cookie;
                dst++;
        }

        dst->prog = NULL;
        *new_array = fresh;
        return 0;
}

/* Report the number of programs in @array through @prog_cnt and, if
 * requested, copy up to @request_cnt of their ids into @prog_ids.
 *
 * @array may be NULL, which counts as zero programs.
 *
 * Return: 0 on success (including when only the count was requested or
 * there is nothing to copy), -ENOSPC if not all ids fit into @request_cnt
 * entries.
 */
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
                             u32 *prog_ids, u32 request_cnt,
                             u32 *prog_cnt)
{
        u32 live = array ? bpf_prog_array_length(array) : 0;

        *prog_cnt = live;

        /* return early if user requested only program count or nothing to copy */
        if (!request_cnt || !live)
                return 0;

        /* this function is called under trace/bpf_trace.c: bpf_event_mutex */
        if (bpf_prog_array_copy_core(array, prog_ids, request_cnt))
                return -ENOSPC;

        return 0;
}

/* Drop the references @aux's program holds on the first @len maps of
 * @used_maps.
 *
 * For each map: call its map_poke_untrack() op if it has one, decrement
 * sleepable_refcnt if the program is sleepable, then bpf_map_put() it.
 * The @used_maps array itself is not freed here.
 */
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
                          struct bpf_map **used_maps, u32 len)
{
        const bool sleepable = aux->prog->sleepable;
        u32 idx;

        for (idx = 0; idx < len; idx++) {
                struct bpf_map *map = used_maps[idx];

                if (map->ops->map_poke_untrack)
                        map->ops->map_poke_untrack(map, aux);
                if (sleepable)
                        atomic64_dec(&map->sleepable_refcnt);
                bpf_map_put(map);
        }
}

/* Release every map recorded in @aux->used_maps and then free the
 * used_maps array itself.
 */
static void bpf_free_used_maps(struct bpf_prog_aux *aux)
{
        __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
        kfree(aux->used_maps);
}

/* Drop the references held by the first @len entries of @used_btfs: the
 * module reference, if the entry has a module, and the BTF reference.
 *
 * Compiles to an empty function without CONFIG_BPF_SYSCALL. The @used_btfs
 * array itself is not freed here.
 */
void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
        u32 idx;

        for (idx = 0; idx < len; idx++) {
                struct btf_mod_pair *pair = &used_btfs[idx];

                if (pair->module)
                        module_put(pair->module);
                btf_put(pair->btf);
        }
#endif
}

/* Release every entry recorded in @aux->used_btfs and then free the
 * used_btfs array itself.
 */
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
        __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
        kfree(aux->used_btfs);
}

/* Work handler that releases what is still attached to a program's aux data
 * and finally the program itself. @work is the work_struct embedded in
 * struct bpf_prog_aux; it is queued by bpf_prog_free().
 */
static void bpf_prog_free_deferred(struct work_struct *work)
{
        struct bpf_prog_aux *aux;
        int i;

        aux = container_of(work, struct bpf_prog_aux, work);
#ifdef CONFIG_BPF_SYSCALL
        bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
        bpf_prog_stream_free(aux->prog);
#endif
#ifdef CONFIG_CGROUP_BPF
        if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
                bpf_cgroup_atype_put(aux->cgroup_atype);
#endif
        /* Drop the map and BTF references taken on behalf of the program. */
        bpf_free_used_maps(aux);
        bpf_free_used_btfs(aux);
        if (bpf_prog_is_dev_bound(aux))
                bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
        if (aux->prog->has_callchain_buf)
                put_callchain_buffers();
#endif
        if (aux->dst_trampoline)
                bpf_trampoline_put(aux->dst_trampoline);
        for (i = 0; i < aux->real_func_cnt; i++) {
                /* We can just unlink the subprog poke descriptor table as
                 * it was originally linked to the main program and is also
                 * released along with it.
                 */
                aux->func[i]->aux->poke_tab = NULL;
                bpf_jit_free(aux->func[i]);
        }
        /* With subprograms, the func[] entries were freed above, so only the
         * func array and the main program remain; without them, the main
         * program goes through bpf_jit_free().
         */
        if (aux->real_func_cnt) {
                kfree(aux->func);
                bpf_prog_unlock_free(aux->prog);
        } else {
                bpf_jit_free(aux->prog);
        }
}

/* Free @fp. The reference on aux->dst_prog (if any) and the token are
 * dropped immediately; the rest of the teardown is deferred to
 * bpf_prog_free_deferred(), which is queued with schedule_work() on the
 * work_struct embedded in @fp->aux.
 */
void bpf_prog_free(struct bpf_prog *fp)
{
        struct bpf_prog_aux *aux = fp->aux;

        if (aux->dst_prog)
                bpf_prog_put(aux->dst_prog);
        bpf_token_put(aux->token);
        INIT_WORK(&aux->work, bpf_prog_free_deferred);
        schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);

/* RNG for unprivileged user space with separated state from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);

/* Initialize the per-CPU bpf_user_rnd_state through prandom_init_once(). */
void bpf_user_rnd_init_once(void)
{
        prandom_init_once(&bpf_user_rnd_state);
}

/* Helper returning the next 32-bit pseudo-random value from the current
 * CPU's bpf_user_rnd_state. The per-CPU state is accessed between
 * get_cpu_var() and put_cpu_var().
 */
BPF_CALL_0(bpf_user_rnd_u32)
{
        /* Should someone ever have the rather unwise idea to use some
         * of the registers passed into this function, then note that
         * this function is called from native eBPF and classic-to-eBPF
         * transformations. Register assignments from both sides are
         * different, f.e. classic always sets fn(ctx, A, X) here.
         */
        struct rnd_state *state;
        u32 res;

        state = &get_cpu_var(bpf_user_rnd_state);
        res = prandom_u32_state(state);
        put_cpu_var(bpf_user_rnd_state);

        return res;
}

/* Helper returning the value of raw_smp_processor_id(). */
BPF_CALL_0(bpf_get_raw_cpu_id)
{
        return raw_smp_processor_id();
}

/* Weak definitions of helper functions in case we don't have bpf syscall.
 *
 * Each of these is a weak, zero-initialized bpf_func_proto; being __weak,
 * it gives way to a non-weak definition of the same symbol when one is
 * linked in.
 */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;

const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;

const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
const struct bpf_func_proto bpf_set_retval_proto __weak;
const struct bpf_func_proto bpf_get_retval_proto __weak;

/* Weak default: no trace_printk helper proto is available, return NULL. */
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
        return NULL;
}

/* Weak default: no trace_vprintk helper proto is available, return NULL. */
const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
{
        return NULL;
}

/* Weak default: no perf_event_read_value helper proto is available,
 * return NULL.
 */
const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void)
{
        return NULL;
}

/* Weak default for bpf_event_output(): ignores all arguments and reports
 * the operation as unsupported. Note that the return type is u64, so
 * -ENOTSUPP is returned converted to an unsigned value.
 */
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
                 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
        return -ENOTSUPP;
}
EXPORT_SYMBOL_GPL(bpf_event_output);

/* Always built-in helper functions. */

/* Prototype of the tail call helper: not GPL-only, returns nothing, and
 * takes the program context, a constant map pointer and one unconstrained
 * argument.
 */
const struct bpf_func_proto bpf_tail_call_proto = {
        /* func is unused for tail_call, we set it to pass the
         * get_helper_proto check
         */
        .func                = BPF_PTR_POISON,
        .gpl_only        = false,
        .ret_type        = RET_VOID,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
 * It is encouraged to implement bpf_int_jit_compile() instead, so that
 * eBPF and implicitly also cBPF can get JITed!
 *
 * This weak default returns @prog unchanged.
 */
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
        return prog;
}

/* Stub for JITs that support eBPF. All cBPF code gets transformed into
 * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
 *
 * This weak default does nothing.
 */
void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}

/* Weak default: reports false for every @func_id. */
bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
        return false;
}

/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
 * analysis code and wants explicit zero extension inserted by verifier.
 * Otherwise, return FALSE.
 *
 * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
 * you don't override this. JITs that don't want these extra insns can detect
 * them using insn_is_zext.
 *
 * The weak default returns false.
 */
bool __weak bpf_jit_needs_zext(void)
{
        return false;
}

/* By default, enable the verifier's mitigations against Spectre v1 and v4 for
 * all archs. The value returned must not change at runtime as there is
 * currently no support for reloading programs that were loaded without
 * mitigations.
 *
 * The weak default returns false, i.e. the Spectre v1 mitigations are not
 * bypassed.
 */
bool __weak bpf_jit_bypass_spec_v1(void)
{
        return false;
}

/* Spectre v4 counterpart of bpf_jit_bypass_spec_v1(); the weak default
 * returns false, i.e. the mitigations are not bypassed.
 */
bool __weak bpf_jit_bypass_spec_v4(void)
{
        return false;
}

/* Return true if the JIT inlines the call to the helper corresponding to
 * the imm.
 *
 * The verifier will not patch the insn->imm for the call to the helper if
 * this returns true.
 *
 * The weak default returns false for every @imm.
 */
bool __weak bpf_jit_inlines_helper_call(s32 imm)
{
        return false;
}

/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls.
 * The weak default returns false.
 */
bool __weak bpf_jit_supports_subprog_tailcalls(void)
{
        return false;
}

/* Weak default: returns false (no per-CPU instruction support reported). */
bool __weak bpf_jit_supports_percpu_insn(void)
{
        return false;
}

/* Weak default: returns false (no kfunc call support reported). */
bool __weak bpf_jit_supports_kfunc_call(void)
{
        return false;
}

/* Weak default: returns false (no far kfunc call support reported). */
bool __weak bpf_jit_supports_far_kfunc_call(void)
{
        return false;
}

/* Weak default: returns false (no arena support reported). */
bool __weak bpf_jit_supports_arena(void)
{
        return false;
}

/* Weak default: returns false for every @insn, regardless of @in_arena. */
bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
{
        return false;
}

/* Weak default for the user address limit: TASK_SIZE when both CONFIG_64BIT
 * and CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE are defined, 0 otherwise.
 */
u64 __weak bpf_arch_uaddress_limit(void)
{
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
        return TASK_SIZE;
#else
        return 0;
#endif
}

/* Return TRUE if the JIT backend satisfies the following two conditions:
 * 1) JIT backend supports atomic_xchg() on pointer-sized words.
 * 2) Under the specific arch, the implementation of xchg() is the same
 *    as atomic_xchg() on pointer-sized words.
 *
 * The weak default returns false.
 */
bool __weak bpf_jit_supports_ptr_xchg(void)
{
        return false;
}

/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
 *
 * This fallback copies nothing and always returns -EFAULT.
 */
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
                         int len)
{
        return -EFAULT;
}

/* Weak default: no text poking is performed; always returns -ENOTSUPP. */
int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                              void *addr1, void *addr2)
{
        return -ENOTSUPP;
}

/* Weak default: nothing is copied; always returns ERR_PTR(-ENOTSUPP). */
void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
{
        return ERR_PTR(-ENOTSUPP);
}

/* Weak default: nothing is invalidated; always returns -ENOTSUPP. */
int __weak bpf_arch_text_invalidate(void *dst, size_t len)
{
        return -ENOTSUPP;
}

/* Weak default: returns false (no exception support reported). */
bool __weak bpf_jit_supports_exceptions(void)
{
        return false;
}

/* Weak default: returns false (no private stack support reported). */
bool __weak bpf_jit_supports_private_stack(void)
{
        return false;
}

/* Weak default: empty body, so @consume_fn is never invoked. */
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}

/* Weak default: returns false (no timed may_goto support reported). */
bool __weak bpf_jit_supports_timed_may_goto(void)
{
        return false;
}

/* Weak default: always returns 0. */
u64 __weak arch_bpf_timed_may_goto(void)
{
        return 0;
}

/* Report a may_goto timeout for the program found on the current stack.
 *
 * With CONFIG_BPF_SYSCALL, the program is looked up with
 * bpf_prog_find_from_stack(); if one is found, an error line and a stack dump
 * are staged to its BPF_STDERR stream. Without CONFIG_BPF_SYSCALL, or when no
 * program is found, nothing happens.
 */
static noinline void bpf_prog_report_may_goto_violation(void)
{
#ifdef CONFIG_BPF_SYSCALL
        struct bpf_stream_stage ss;
        struct bpf_prog *prog;

        prog = bpf_prog_find_from_stack();
        if (!prog)
                return;
        bpf_stream_stage(ss, prog, BPF_STDERR, ({
                bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n");
                bpf_stream_dump_stack(ss);
        }));
#endif
}

/* Decide how many more iterations a timed may_goto loop is granted.
 *
 * @p: per-stack-frame state; @p->timestamp is 0 until the first call.
 *
 * The first call for a frame records the current monotonic time. Later calls
 * compare the elapsed time against a quarter of a second; once that budget is
 * reached the violation is reported and the count is zeroed.
 *
 * Return: BPF_MAX_TIMED_LOOPS to refresh the count, or 0 on timeout.
 */
u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
{
        u64 now = ktime_get_mono_fast_ns();

        if (p->timestamp) {
                /* Time slice exhausted: report and zero the count. */
                if (unlikely(now - p->timestamp >= (NSEC_PER_SEC / 4))) {
                        bpf_prog_report_may_goto_violation();
                        return 0;
                }
        } else {
                /* First visit for this stack frame: stamp it. */
                p->timestamp = now;
        }

        /* Refresh the count for the stack frame. */
        return BPF_MAX_TIMED_LOOPS;
}

/* For configs without MMU or 32-bit: weak arena_map_ops definition with no
 * initializer (all members zero).
 */
__weak const struct bpf_map_ops arena_map_ops;
/* Weak fallback: ignores @arena and returns 0. */
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
        return 0;
}
/* Weak fallback: ignores @arena and returns 0. */
__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* Initialise the global BPF memory allocator, bpf_global_ma.
 *
 * Return: the result of bpf_mem_alloc_init(); bpf_global_ma_set records
 * whether that result was 0.
 */
static int __init bpf_global_ma_init(void)
{
        int err = bpf_mem_alloc_init(&bpf_global_ma, 0, false);

        bpf_global_ma_set = (err == 0);
        return err;
}
late_initcall(bpf_global_ma_init);
#endif

/* Static key bpf_stats_enabled_key, defined in the "false" state and
 * exported for use by modules.
 */
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);

/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>

/* Export the xdp_exception and xdp_bulk_tx tracepoints to GPL modules. */
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);

#ifdef CONFIG_BPF_SYSCALL

/* Resolve a JITed address to BTF source-line information.
 *
 * @prog:  program (or subprogram) whose line info is searched
 * @ip:    address compared against the program's jited_linfo entries
 * @filep: set to the base name of the source file
 * @linep: set to the source line text, leading whitespace skipped
 * @nump:  set to the source line number
 *
 * Selects the last line-info record belonging to this (sub)program whose
 * JITed address is strictly below @ip.
 *
 * Return: 0 on success, -EINVAL when BTF, linfo or jited_linfo is missing,
 * -ENOENT when no record precedes @ip.
 */
int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
                           const char **linep, int *nump)
{
        struct btf *btf = prog->aux->btf;
        int insn_start, insn_end, len, nr_linfo;
        struct bpf_line_info *linfo;
        void **jited_linfo;
        const char *line;
        int idx = -1;

        if (!btf || !prog->aux->linfo || !prog->aux->jited_linfo)
                return -EINVAL;

        if (prog->aux->func)
                len = prog->aux->func[prog->aux->func_idx]->len;
        else
                len = prog->len;

        /* Narrow both tables to the records starting at this (sub)program. */
        linfo = &prog->aux->linfo[prog->aux->linfo_idx];
        jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];
        nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx;

        insn_start = linfo[0].insn_off;
        insn_end = insn_start + len;

        for (int i = 0; i < nr_linfo; i++) {
                /* Stop once the record falls outside [insn_start, insn_end). */
                if (linfo[i].insn_off < insn_start || linfo[i].insn_off >= insn_end)
                        break;
                /* Stop at the first record at or beyond @ip. */
                if (jited_linfo[i] >= (void *)ip)
                        break;
                idx = i;
        }

        if (idx < 0)
                return -ENOENT;

        /* Keep only the last component of the file path. */
        *filep = kbasename(btf_name_by_offset(btf, linfo[idx].file_name_off));

        /* Fetch the source line and skip its leading whitespace. */
        line = btf_name_by_offset(btf, linfo[idx].line_off);
        while (isspace(*line))
                line++;
        *linep = line;

        *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
        return 0;
}

/* Cookie passed to find_from_stack_cb() through arch_bpf_stack_walk(). */
struct walk_stack_ctx {
        /* Program recorded by the callback; stays NULL if none was found. */
        struct bpf_prog *prog;
};

/* Stack-walk callback: look for a BPF program owning the frame at @ip.
 *
 * @cookie: struct walk_stack_ctx receiving the result
 * @ip:     instruction pointer of the frame being visited
 * @sp:     unused
 * @bp:     unused
 *
 * Return: true when @ip does not belong to a BPF program, false once a
 * program has been stored in the cookie.
 */
static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
{
        struct walk_stack_ctx *walk = cookie;
        struct bpf_prog *found;

        /*
         * Only the latch tree lookup needs the RCU read lock. The prog itself
         * has a live frame on the stack being walked, so it cannot go away
         * and may be dereferenced after the lock is dropped.
         */
        rcu_read_lock();
        found = bpf_prog_ksym_find(ip);
        rcu_read_unlock();

        if (found) {
                /* A subprog hit is translated to its main prog. */
                walk->prog = found->aux->main_prog_aux->prog;
                return false;
        }

        return true;
}

struct bpf_prog *bpf_prog_find_from_stack(void)
{
        struct walk_stack_ctx ctx = {};

        arch_bpf_stack_walk(find_from_stack_cb, &ctx);
        return ctx.prog;
}

#endif

















































































































































































































































































    4 
    4 























































































































































































































































































































































































































































    4 







    4 











    4 








    4 

    4 
















































































































































































































































































    4 




    4 










    4 





















    4 

































    4 












































    4 



    4 
















































































































































































































































































































































































































































































    4 


    4 
    4 
    4 


    4 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H

#include <linux/mem_encrypt.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>

/*
 * Macro to mark a page protection value as UC-
 *
 * When boot_cpu_data.x86 > 3 the protection bits for
 * _PAGE_CACHE_MODE_UC_MINUS are OR-ed into @prot; otherwise @prot is
 * returned unchanged.
 */
#define pgprot_noncached(prot)                                                \
        ((boot_cpu_data.x86 > 3)                                        \
         ? (__pgprot(pgprot_val(prot) |                                        \
                     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))        \
         : (prot))

#ifndef __ASSEMBLER__
#include <linux/spinlock.h>
#include <asm/x86_init.h>
#include <asm/pkru.h>
#include <asm/fpu/api.h>
#include <asm/coco.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);

struct seq_file;
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
                                   bool user);
bool ptdump_walk_pgd_level_checkwx(void);
#define ptdump_check_wx ptdump_walk_pgd_level_checkwx
void ptdump_walk_user_pgd_level_checkwx(void);

/*
 * Macros to add or remove the encryption attribute: the raw value of @prot
 * is passed through cc_mkenc() / cc_mkdec() and wrapped back into a pgprot.
 */
#define pgprot_encrypted(prot)        __pgprot(cc_mkenc(pgprot_val(prot)))
#define pgprot_decrypted(prot)        __pgprot(cc_mkdec(pgprot_val(prot)))

/*
 * debug_checkwx_user() runs ptdump_walk_user_pgd_level_checkwx() when
 * CONFIG_DEBUG_WX is set and expands to an empty statement otherwise.
 */
#ifdef CONFIG_DEBUG_WX
#define debug_checkwx_user()        ptdump_walk_user_pgd_level_checkwx()
#else
#define debug_checkwx_user()        do { } while (0)
#endif

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc.
 *
 * The macro evaluates @vaddr only for its side effects and always yields the
 * struct page of empty_zero_page.
 */
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
        __visible;
#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))

extern spinlock_t pgd_lock;
extern struct list_head pgd_list;

extern struct mm_struct *pgd_page_get_mm(struct page *page);

extern pmdval_t early_pmd_flags;

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else  /* !CONFIG_PARAVIRT_XXL */
/*
 * Without CONFIG_PARAVIRT_XXL the generic page-table accessors map straight
 * onto their native_*() implementations. Levels that are folded (see the
 * __PAGETABLE_*_FOLDED checks) or already provided (set_p4d, set_pud) are
 * skipped.
 */
#define set_pte(ptep, pte)                native_set_pte(ptep, pte)

#define set_pte_atomic(ptep, pte)                                        \
        native_set_pte_atomic(ptep, pte)

#define set_pmd(pmdp, pmd)                native_set_pmd(pmdp, pmd)

#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd)                native_set_pgd(pgdp, pgd)
/* Only clears the PGD entry when pgtable_l5_enabled() is true. */
#define pgd_clear(pgd)                        (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
#endif

#ifndef set_p4d
# define set_p4d(p4dp, p4d)                native_set_p4d(p4dp, p4d)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define p4d_clear(p4d)                        native_p4d_clear(p4d)
#endif

#ifndef set_pud
# define set_pud(pudp, pud)                native_set_pud(pudp, pud)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_clear(pud)                        native_pud_clear(pud)
#endif

#define pte_clear(mm, addr, ptep)        native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd)                        native_pmd_clear(pmd)

/* Raw value extraction and construction for each page-table level. */
#define pgd_val(x)        native_pgd_val(x)
#define __pgd(x)        native_make_pgd(x)

#ifndef __PAGETABLE_P4D_FOLDED
#define p4d_val(x)        native_p4d_val(x)
#define __p4d(x)        native_make_p4d(x)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x)        native_pud_val(x)
#define __pud(x)        native_make_pud(x)
#endif

#ifndef __PAGETABLE_PMD_FOLDED
#define pmd_val(x)        native_pmd_val(x)
#define __pmd(x)        native_make_pmd(x)
#endif

#define pte_val(x)        native_pte_val(x)
#define __pte(x)        native_make_pte(x)

/* Nothing to do at the end of a context switch in the native case. */
#define arch_end_context_switch(prev)        do {} while(0)
#endif        /* CONFIG_PARAVIRT_XXL */

/* Return a copy of @pmd with every bit of @set OR-ed into its raw value. */
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
{
        return native_make_pmd(native_pmd_val(pmd) | set);
}

/* Return a copy of @pmd with every bit of @clear masked out of its raw value. */
static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
{
        return native_make_pmd(native_pmd_val(pmd) & ~clear);
}

/* Return a copy of @pud with every bit of @set OR-ed into its raw value. */
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
        return native_make_pud(native_pud_val(pud) | set);
}

/* Return a copy of @pud with every bit of @clear masked out of its raw value. */
static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
{
        return native_make_pud(native_pud_val(pud) & ~clear);
}

/*
 * The following only work if pte_present() is true.
 * Undefined behaviour if not.
 */
/* True when any of the _PAGE_DIRTY_BITS is set in @pte's flags. */
static inline bool pte_dirty(pte_t pte)
{
        return (pte_flags(pte) & _PAGE_DIRTY_BITS) != 0;
}

/*
 * True when X86_FEATURE_SHSTK is enabled and @pte has Dirty set with RW
 * clear.
 */
static inline bool pte_shstk(pte_t pte)
{
        if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
                return false;

        return (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}

/* Non-zero when _PAGE_ACCESSED is set in @pte's flags. */
static inline int pte_young(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        return flags & _PAGE_ACCESSED;
}

/* True when applying cc_mkdec() to @pte's raw value leaves it unchanged. */
static inline bool pte_decrypted(pte_t pte)
{
        pteval_t raw = pte_val(pte);

        return cc_mkdec(raw) == raw;
}

#define pmd_dirty pmd_dirty
/* True when any of the _PAGE_DIRTY_BITS is set in @pmd's flags. */
static inline bool pmd_dirty(pmd_t pmd)
{
        return (pmd_flags(pmd) & _PAGE_DIRTY_BITS) != 0;
}

/*
 * True when X86_FEATURE_SHSTK is enabled and @pmd has Dirty and PSE set
 * with RW clear.
 */
static inline bool pmd_shstk(pmd_t pmd)
{
        if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
                return false;

        return (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
               (_PAGE_DIRTY | _PAGE_PSE);
}

#define pmd_young pmd_young
/* Non-zero when _PAGE_ACCESSED is set in @pmd's flags. */
static inline int pmd_young(pmd_t pmd)
{
        pmdval_t flags = pmd_flags(pmd);

        return flags & _PAGE_ACCESSED;
}

/* True when any of the _PAGE_DIRTY_BITS is set in @pud's flags. */
static inline bool pud_dirty(pud_t pud)
{
        return (pud_flags(pud) & _PAGE_DIRTY_BITS) != 0;
}

/* Non-zero when _PAGE_ACCESSED is set in @pud's flags. */
static inline int pud_young(pud_t pud)
{
        pudval_t flags = pud_flags(pud);

        return flags & _PAGE_ACCESSED;
}

/*
 * True when X86_FEATURE_SHSTK is enabled and @pud has Dirty and PSE set
 * with RW clear.
 */
static inline bool pud_shstk(pud_t pud)
{
        if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
                return false;

        return (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
               (_PAGE_DIRTY | _PAGE_PSE);
}

/* Return 1 when @pte is writable, 0 otherwise. */
static inline int pte_write(pte_t pte)
{
        if (pte_flags(pte) & _PAGE_RW)
                return 1;

        /*
         * A shadow stack page lacks _PAGE_RW yet is logically writable,
         * so it needs its own check.
         */
        return pte_shstk(pte);
}

#define pmd_write pmd_write
/* Return 1 when @pmd is writable, 0 otherwise. */
static inline int pmd_write(pmd_t pmd)
{
        if (pmd_flags(pmd) & _PAGE_RW)
                return 1;

        /*
         * A shadow stack page lacks _PAGE_RW yet is logically writable,
         * so it needs its own check.
         */
        return pmd_shstk(pmd);
}

#define pud_write pud_write
/* Non-zero when _PAGE_RW is set in @pud's flags. */
static inline int pud_write(pud_t pud)
{
        pudval_t flags = pud_flags(pud);

        return flags & _PAGE_RW;
}

/* Non-zero when _PAGE_PSE is set in @pte's flags. */
static inline int pte_huge(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        return flags & _PAGE_PSE;
}

/* Non-zero when _PAGE_GLOBAL is set in @pte's flags. */
static inline int pte_global(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        return flags & _PAGE_GLOBAL;
}

/* Return 1 when _PAGE_NX is clear in @pte's flags, 0 when it is set. */
static inline int pte_exec(pte_t pte)
{
        return (pte_flags(pte) & _PAGE_NX) == 0;
}

/* Non-zero when _PAGE_SPECIAL is set in @pte's flags. */
static inline int pte_special(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        return flags & _PAGE_SPECIAL;
}

/* Entries that were set to PROT_NONE are inverted */

static inline u64 protnone_mask(u64 val);

#define PFN_PTE_SHIFT        PAGE_SHIFT

/*
 * Extract the page frame number from @pte: XOR the raw value with its
 * protnone_mask(), keep the PTE_PFN_MASK bits and shift down by PAGE_SHIFT.
 */
static inline unsigned long pte_pfn(pte_t pte)
{
        phys_addr_t raw = pte_val(pte);
        phys_addr_t addr = raw ^ protnone_mask(raw);

        return (addr & PTE_PFN_MASK) >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from @pmd: XOR the raw value with its
 * protnone_mask(), keep the pmd_pfn_mask() bits and shift down by PAGE_SHIFT.
 */
static inline unsigned long pmd_pfn(pmd_t pmd)
{
        phys_addr_t raw = pmd_val(pmd);
        phys_addr_t addr = raw ^ protnone_mask(raw);

        return (addr & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}

#define pud_pfn pud_pfn
/*
 * Extract the page frame number from @pud: XOR the raw value with its
 * protnone_mask(), keep the pud_pfn_mask() bits and shift down by PAGE_SHIFT.
 */
static inline unsigned long pud_pfn(pud_t pud)
{
        phys_addr_t raw = pud_val(pud);
        phys_addr_t addr = raw ^ protnone_mask(raw);

        return (addr & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}

/* Page frame number of @p4d: raw value masked by p4d_pfn_mask(), shifted down. */
static inline unsigned long p4d_pfn(p4d_t p4d)
{
        unsigned long pfn = (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;

        return pfn;
}

/* Page frame number of @pgd: raw value masked by PTE_PFN_MASK, shifted down. */
static inline unsigned long pgd_pfn(pgd_t pgd)
{
        unsigned long pfn = (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;

        return pfn;
}

/* struct page for the page frame number encoded in @pte. */
#define pte_page(pte)        pfn_to_page(pte_pfn(pte))

#define pmd_leaf pmd_leaf
/* True when _PAGE_PSE is set in the flags of the given PMD entry. */
static inline bool pmd_leaf(pmd_t pte)
{
        return (pmd_flags(pte) & _PAGE_PSE) != 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Return 1 when _PAGE_PSE is set in @pmd's raw value, 0 otherwise. */
static inline int pmd_trans_huge(pmd_t pmd)
{
        int huge = (pmd_val(pmd) & _PAGE_PSE) == _PAGE_PSE;

        return huge;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* Return 1 when _PAGE_PSE is set in @pud's raw value, 0 otherwise. */
static inline int pud_trans_huge(pud_t pud)
{
        int huge = (pud_val(pud) & _PAGE_PSE) == _PAGE_PSE;

        return huge;
}
#endif

#define has_transparent_hugepage has_transparent_hugepage
/* Report whether the boot CPU advertises X86_FEATURE_PSE. */
static inline int has_transparent_hugepage(void)
{
        int have_pse = boot_cpu_has(X86_FEATURE_PSE);

        return have_pse;
}

#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
/* True when _PAGE_SPECIAL is set in @pmd's flags. */
static inline bool pmd_special(pmd_t pmd)
{
        return (pmd_flags(pmd) & _PAGE_SPECIAL) != 0;
}

/* Return @pmd with _PAGE_SPECIAL set. */
static inline pmd_t pmd_mkspecial(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SPECIAL);

        return pmd;
}
#endif        /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */

#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
/* True when _PAGE_SPECIAL is set in @pud's flags. */
static inline bool pud_special(pud_t pud)
{
        return (pud_flags(pud) & _PAGE_SPECIAL) != 0;
}

/* Return @pud with _PAGE_SPECIAL set. */
static inline pud_t pud_mkspecial(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_SPECIAL);

        return pud;
}
#endif        /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/* Return a copy of @pte with every bit of @set OR-ed into its raw value. */
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
        return native_make_pte(native_pte_val(pte) | set);
}

/* Return a copy of @pte with every bit of @clear masked out of its raw value. */
static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
{
        return native_make_pte(native_pte_val(pte) & ~clear);
}

/*
 * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the
 * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So
 * when creating dirty, write-protected memory, a software bit is used:
 * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the
 * Dirty bit to SavedDirty, and vice-versa.
 *
 * This shifting is only done if needed. In the case of shifting
 * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of
 * shifting SavedDirty->Dirty, the condition is Write=1.
 */
static inline pgprotval_t mksaveddirty_shift(pgprotval_t v)
{
        /* 1 when the RW bit of @v is clear, 0 when it is set. */
        pgprotval_t ro = (~v >> _PAGE_BIT_RW) & 1;
        /* Dirty bit of @v, forced to 0 for writable values. */
        pgprotval_t dirty = (v >> _PAGE_BIT_DIRTY) & ro;

        /* Branch-free: both steps are no-ops when @v is writable. */
        v |= dirty << _PAGE_BIT_SAVED_DIRTY;
        v &= ~(ro << _PAGE_BIT_DIRTY);

        return v;
}

/*
 * Inverse of mksaveddirty_shift(): when @v has RW set, OR its SavedDirty bit
 * into Dirty and clear SavedDirty; when RW is clear, @v is returned as is.
 */
static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v)
{
        /* 1 when the RW bit of @v is set, 0 when it is clear. */
        pgprotval_t rw = (v >> _PAGE_BIT_RW) & 1;
        /* SavedDirty bit of @v, forced to 0 for non-writable values. */
        pgprotval_t saved = (v >> _PAGE_BIT_SAVED_DIRTY) & rw;

        v |= saved << _PAGE_BIT_DIRTY;
        v &= ~(rw << _PAGE_BIT_SAVED_DIRTY);

        return v;
}

/* Apply mksaveddirty_shift() to the raw value of @pte. */
static inline pte_t pte_mksaveddirty(pte_t pte)
{
        return native_make_pte(mksaveddirty_shift(native_pte_val(pte)));
}

/* Apply clear_saveddirty_shift() to the raw value of @pte. */
static inline pte_t pte_clear_saveddirty(pte_t pte)
{
        return native_make_pte(clear_saveddirty_shift(native_pte_val(pte)));
}

/* Return @pte with write permission removed. */
static inline pte_t pte_wrprotect(pte_t pte)
{
        /*
         * Dropping _PAGE_RW on its own could leave Write=0,Dirty=1, which
         * is the shadow stack encoding. Running the result through
         * pte_mksaveddirty() parks a set hardware Dirty bit in the
         * software SavedDirty bit instead.
         */
        return pte_mksaveddirty(pte_clear_flags(pte, _PAGE_RW));
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Non-zero when _PAGE_UFFD_WP is set in @pte's flags. */
static inline int pte_uffd_wp(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        return flags & _PAGE_UFFD_WP;
}

/* Return @pte with _PAGE_UFFD_WP set and then write-protected. */
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_UFFD_WP);

        return pte_wrprotect(pte);
}

/* Return @pte with _PAGE_UFFD_WP cleared. */
static inline pte_t pte_clear_uffd_wp(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_UFFD_WP);

        return pte;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pte with all _PAGE_DIRTY_BITS cleared. */
static inline pte_t pte_mkclean(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_DIRTY_BITS);

        return pte;
}

/* Return @pte with _PAGE_ACCESSED cleared. */
static inline pte_t pte_mkold(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_ACCESSED);

        return pte;
}

/* Return @pte with _PAGE_NX cleared. */
static inline pte_t pte_mkexec(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_NX);

        return pte;
}

/*
 * Return @pte with _PAGE_DIRTY and _PAGE_SOFT_DIRTY set, then passed through
 * pte_mksaveddirty() so a non-writable entry keeps SavedDirty instead of
 * Dirty.
 */
static inline pte_t pte_mkdirty(pte_t pte)
{
        return pte_mksaveddirty(pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Return @pte with _PAGE_RW cleared and _PAGE_DIRTY set. */
static inline pte_t pte_mkwrite_shstk(pte_t pte)
{
        return pte_set_flags(pte_clear_flags(pte, _PAGE_RW), _PAGE_DIRTY);
}

/* Return @pte with _PAGE_ACCESSED set. */
static inline pte_t pte_mkyoung(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_ACCESSED);

        return pte;
}

/* Return @pte with _PAGE_RW set. */
static inline pte_t pte_mkwrite_novma(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_RW);

        return pte;
}

struct vm_area_struct;
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma);
#define pte_mkwrite pte_mkwrite

/* Return @pte with _PAGE_PSE set. */
static inline pte_t pte_mkhuge(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_PSE);

        return pte;
}

/* Return @pte with _PAGE_PSE cleared. */
static inline pte_t pte_clrhuge(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_PSE);

        return pte;
}

/* Return @pte with _PAGE_GLOBAL set. */
static inline pte_t pte_mkglobal(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_GLOBAL);

        return pte;
}

/* Return @pte with _PAGE_GLOBAL cleared. */
static inline pte_t pte_clrglobal(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_GLOBAL);

        return pte;
}

/* Return @pte with _PAGE_SPECIAL set. */
static inline pte_t pte_mkspecial(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SPECIAL);

        return pte;
}

/*
 * Apply mksaveddirty_shift() to the raw value of @pmd; see the comments
 * above mksaveddirty_shift().
 */
static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
{
        return native_make_pmd(mksaveddirty_shift(native_pmd_val(pmd)));
}

/*
 * Apply clear_saveddirty_shift() to the raw value of @pmd; see the comments
 * above mksaveddirty_shift().
 */
static inline pmd_t pmd_clear_saveddirty(pmd_t pmd)
{
        return native_make_pmd(clear_saveddirty_shift(native_pmd_val(pmd)));
}

/* Return @pmd with write permission removed. */
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
        /*
         * Dropping _PAGE_RW on its own could leave RW=0,Dirty=1, which is
         * the shadow stack encoding. pmd_mksaveddirty() parks the hardware
         * Dirty bit in the software SavedDirty bit instead.
         */
        return pmd_mksaveddirty(pmd_clear_flags(pmd, _PAGE_RW));
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Non-zero when _PAGE_UFFD_WP is set in @pmd's flags. */
static inline int pmd_uffd_wp(pmd_t pmd)
{
        pmdval_t flags = pmd_flags(pmd);

        return flags & _PAGE_UFFD_WP;
}

/* Return @pmd with _PAGE_UFFD_WP set and then write-protected. */
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_UFFD_WP);

        return pmd_wrprotect(pmd);
}

/* Return @pmd with _PAGE_UFFD_WP cleared. */
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_UFFD_WP);

        return pmd;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pmd with _PAGE_ACCESSED cleared. */
static inline pmd_t pmd_mkold(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_ACCESSED);

        return pmd;
}

/* Return @pmd with all _PAGE_DIRTY_BITS cleared. */
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);

        return pmd;
}

/*
 * Return @pmd with _PAGE_DIRTY and _PAGE_SOFT_DIRTY set, then passed through
 * pmd_mksaveddirty() so a non-writable entry keeps SavedDirty instead of
 * Dirty.
 */
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
        return pmd_mksaveddirty(pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Return @pmd with _PAGE_RW cleared and _PAGE_DIRTY set. */
static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
{
        return pmd_set_flags(pmd_clear_flags(pmd, _PAGE_RW), _PAGE_DIRTY);
}

/* Return @pmd with _PAGE_PSE set. */
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_PSE);

        return pmd;
}

/* Return @pmd with _PAGE_ACCESSED set. */
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_ACCESSED);

        return pmd;
}

/* Return @pmd with _PAGE_RW set; no VMA is consulted. */
static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
{
        pmd_t writable = pmd_set_flags(pmd, _PAGE_RW);

        return writable;
}

/*
 * VMA-aware variant of pmd_mkwrite_novma(); only the prototype is given
 * here, the body is not part of this header.
 * NOTE(review): the self-referential #define presumably tells generic
 * code that the architecture supplies its own pmd_mkwrite() - confirm.
 */
pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
#define pmd_mkwrite pmd_mkwrite

/*
 * Run the raw value of @pud through mksaveddirty_shift() and rebuild a
 * pud from the result. See comments above mksaveddirty_shift().
 */
static inline pud_t pud_mksaveddirty(pud_t pud)
{
        pudval_t shifted = mksaveddirty_shift(native_pud_val(pud));

        return native_make_pud(shifted);
}

/*
 * Run the raw value of @pud through clear_saveddirty_shift() and rebuild
 * a pud from the result. See comments above mksaveddirty_shift().
 */
static inline pud_t pud_clear_saveddirty(pud_t pud)
{
        pudval_t shifted = clear_saveddirty_shift(native_pud_val(pud));

        return native_make_pud(shifted);
}

/* Return @pud with _PAGE_ACCESSED removed. */
static inline pud_t pud_mkold(pud_t pud)
{
        pud_t old = pud_clear_flags(pud, _PAGE_ACCESSED);

        return old;
}

/* Return @pud with every bit in _PAGE_DIRTY_BITS removed. */
static inline pud_t pud_mkclean(pud_t pud)
{
        pud_t clean = pud_clear_flags(pud, _PAGE_DIRTY_BITS);

        return clean;
}

/*
 * Write-protect @pud by dropping _PAGE_RW.
 *
 * Dropping _PAGE_RW on its own could leave RW=0, Dirty=1 behind, which
 * is the shadow stack encoding. The result is therefore passed through
 * pud_mksaveddirty() so the hardware dirty value ends up in the
 * software bit instead.
 */
static inline pud_t pud_wrprotect(pud_t pud)
{
        pud_t readonly = pud_clear_flags(pud, _PAGE_RW);

        return pud_mksaveddirty(readonly);
}

/*
 * Set _PAGE_DIRTY and _PAGE_SOFT_DIRTY on @pud, then hand the result to
 * pud_mksaveddirty().
 */
static inline pud_t pud_mkdirty(pud_t pud)
{
        return pud_mksaveddirty(pud_set_flags(pud,
                                              _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Return @pud with _PAGE_PSE set. */
static inline pud_t pud_mkhuge(pud_t pud)
{
        pud_t huge = pud_set_flags(pud, _PAGE_PSE);

        return huge;
}

/* Return @pud with _PAGE_ACCESSED set. */
static inline pud_t pud_mkyoung(pud_t pud)
{
        pud_t young = pud_set_flags(pud, _PAGE_ACCESSED);

        return young;
}

/*
 * Make @pud writable: set _PAGE_RW, then pass the result through
 * pud_clear_saveddirty().
 */
static inline pud_t pud_mkwrite(pud_t pud)
{
        pud_t writable = pud_set_flags(pud, _PAGE_RW);

        return pud_clear_saveddirty(writable);
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Non-zero when the flags of @pte contain _PAGE_SOFT_DIRTY. */
static inline int pte_soft_dirty(pte_t pte)
{
        int soft_dirty = pte_flags(pte) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Non-zero when the flags of @pmd contain _PAGE_SOFT_DIRTY. */
static inline int pmd_soft_dirty(pmd_t pmd)
{
        int soft_dirty = pmd_flags(pmd) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Non-zero when the flags of @pud contain _PAGE_SOFT_DIRTY. */
static inline int pud_soft_dirty(pud_t pud)
{
        int soft_dirty = pud_flags(pud) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Return @pte with _PAGE_SOFT_DIRTY set. */
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SOFT_DIRTY);

        return pte;
}

/* Return @pmd with _PAGE_SOFT_DIRTY set. */
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);

        return pmd;
}

/* Return @pud with _PAGE_SOFT_DIRTY set. */
static inline pud_t pud_mksoft_dirty(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_SOFT_DIRTY);

        return pud;
}

/* Return @pte with _PAGE_SOFT_DIRTY removed. */
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SOFT_DIRTY);

        return pte;
}

/* Return @pmd with _PAGE_SOFT_DIRTY removed. */
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);

        return pmd;
}

/* Return @pud with _PAGE_SOFT_DIRTY removed. */
static inline pud_t pud_clear_soft_dirty(pud_t pud)
{
        pud = pud_clear_flags(pud, _PAGE_SOFT_DIRTY);

        return pud;
}

#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */

/*
 * Strip the bits that __supported_pte_mask does not cover from a present
 * pgprot. A pgprot without _PAGE_PRESENT is returned unchanged, because
 * non-present pgprots can use those bits for other purposes.
 */
static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
{
        pgprotval_t raw = pgprot_val(pgprot);

        /* Non-present: every bit is kept as-is. */
        if (!(raw & _PAGE_PRESENT))
                return raw;

        return raw & __supported_pte_mask;
}

/*
 * Return massage_pgprot(@pgprot). With CONFIG_DEBUG_VM a one-time
 * warning is emitted when massaging changed the value, i.e. when the
 * caller asked for bits outside of __supported_pte_mask.
 */
static inline pgprotval_t check_pgprot(pgprot_t pgprot)
{
        pgprotval_t supported = massage_pgprot(pgprot);

        /* mmdebug.h can not be included here because of dependencies */
#ifdef CONFIG_DEBUG_VM
        WARN_ONCE(pgprot_val(pgprot) != supported,
                  "attempted to set unsupported pgprot: %016llx "
                  "bits: %016llx supported: %016llx\n",
                  (u64)pgprot_val(pgprot),
                  (u64)pgprot_val(pgprot) ^ supported,
                  (u64)__supported_pte_mask);
#endif

        return supported;
}

/*
 * Build a pte from page frame number @page_nr and protection @pgprot.
 *
 * The frame number is shifted up by PAGE_SHIFT, XORed with
 * protnone_mask() of the protection value, limited to PTE_PFN_MASK and
 * finally combined with check_pgprot(@pgprot). A one-time warning fires
 * if @pgprot carries Dirty=1 with RW=0.
 */
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        /* This bit combination is used to mark shadow stacks */
        WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) ==
                        _PAGE_DIRTY);

        paddr = (paddr ^ protnone_mask(pgprot_val(pgprot))) & PTE_PFN_MASK;

        return __pte(paddr | check_pgprot(pgprot));
}

/*
 * Build a pmd from page frame number @page_nr and protection @pgprot:
 * shift by PAGE_SHIFT, XOR with protnone_mask(), limit to
 * PHYSICAL_PMD_PAGE_MASK and combine with check_pgprot(@pgprot).
 */
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr = (paddr ^ protnone_mask(pgprot_val(pgprot))) &
                PHYSICAL_PMD_PAGE_MASK;

        return __pmd(paddr | check_pgprot(pgprot));
}

/*
 * Build a pud from page frame number @page_nr and protection @pgprot:
 * shift by PAGE_SHIFT, XOR with protnone_mask(), limit to
 * PHYSICAL_PUD_PAGE_MASK and combine with check_pgprot(@pgprot).
 */
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr = (paddr ^ protnone_mask(pgprot_val(pgprot))) &
                PHYSICAL_PUD_PAGE_MASK;

        return __pud(paddr | check_pgprot(pgprot));
}

/*
 * Rebuild @pmd from its own pfn and flags, with _PAGE_PRESENT and
 * _PAGE_PROTNONE stripped from the flags.
 */
static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
        pgprot_t prot = __pgprot(pmd_flags(pmd) &
                                 ~(_PAGE_PRESENT|_PAGE_PROTNONE));

        return pfn_pmd(pmd_pfn(pmd), prot);
}

/*
 * Rebuild @pud from its own pfn and flags, with _PAGE_PRESENT and
 * _PAGE_PROTNONE stripped from the flags.
 */
static inline pud_t pud_mkinvalid(pud_t pud)
{
        pgprot_t prot = __pgprot(pud_flags(pud) &
                                 ~(_PAGE_PRESENT|_PAGE_PROTNONE));

        return pfn_pud(pud_pfn(pud), prot);
}

/*
 * Forward declaration only: the p*_modify() helpers below call this
 * before its body is available at this point of the header.
 */
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);

/*
 * Apply @newprot to @pte: bits covered by _PAGE_CHG_MASK are kept from
 * the old entry, everything else comes from check_pgprot(@newprot).
 */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
        const pteval_t oldval = pte_val(pte);
        pteval_t val;

        /*
         * Chop off the NX bit (if present), and add the NX portion of
         * the newprot (if present):
         */
        val = oldval & _PAGE_CHG_MASK;
        val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
        val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);

        /*
         * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid:
         *  1. Marking Write=0 PTEs Dirty=1
         *  2. Marking Dirty=1 PTEs Write=0
         *
         * The first case cannot happen because the _PAGE_CHG_MASK will filter
         * out any Dirty bit passed in newprot. Handle the second case by
         * going through the mksaveddirty exercise. Only do this if the old
         * value was Write=1 to avoid doing this on Shadow Stack PTEs.
         */
        if (oldval & _PAGE_RW)
                return pte_mksaveddirty(__pte(val));

        return pte_clear_saveddirty(__pte(val));
}

/*
 * Apply @newprot to @pmd: bits covered by _HPAGE_CHG_MASK, minus
 * _PAGE_DIRTY, are kept from the old entry; everything outside of
 * _HPAGE_CHG_MASK comes from check_pgprot(@newprot).
 */
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        const pmdval_t oldval = pmd_val(pmd);
        pmdval_t val;

        val = oldval & (_HPAGE_CHG_MASK & ~_PAGE_DIRTY);
        val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
        val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);

        /*
         * Avoid creating shadow stack PMD by accident.  See comment in
         * pte_modify().
         */
        if (oldval & _PAGE_RW)
                return pmd_mksaveddirty(__pmd(val));

        return pmd_clear_saveddirty(__pmd(val));
}

/*
 * Apply @newprot to @pud: bits covered by _HPAGE_CHG_MASK are kept from
 * the old entry, everything else comes from check_pgprot(@newprot).
 */
static inline pud_t pud_modify(pud_t pud, pgprot_t newprot)
{
        const pudval_t oldval = pud_val(pud);
        pudval_t val;

        val = oldval & _HPAGE_CHG_MASK;
        val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
        val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK);

        /*
         * Avoid creating shadow stack PUD by accident.  See comment in
         * pte_modify().
         */
        if (oldval & _PAGE_RW)
                return pud_mksaveddirty(__pud(val));

        return pud_clear_saveddirty(__pud(val));
}

/*
 * mprotect needs to preserve PAT and encryption bits when updating
 * vm_page_prot: the bits inside _PAGE_CHG_MASK are taken from @oldprot,
 * all remaining bits from @newprot.
 */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        pgprotval_t merged = pgprot_val(oldprot) & _PAGE_CHG_MASK;

        merged |= pgprot_val(newprot) & ~_PAGE_CHG_MASK;

        return __pgprot(merged);
}

/* Wrap the flag bits of an entry at each level into a pgprot_t. */
#define pte_pgprot(x) __pgprot(pte_flags(x))
#define pmd_pgprot(x) __pgprot(pmd_flags(x))
#define pud_pgprot(x) __pgprot(pud_flags(x))
#define p4d_pgprot(x) __pgprot(p4d_flags(x))

/* pgprot_t form of massage_pgprot(): the value after massaging @p. */
#define canon_pgprot(p) __pgprot(massage_pgprot(p))

/*
 * Decide whether memtype @new_pcm is acceptable when @pcm was requested
 * for the range starting at @paddr with length @size.
 *
 * Returns 1 when allowed, 0 when not.
 */
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
                                         enum page_cache_mode pcm,
                                         enum page_cache_mode new_pcm)
{
        /*
         * PAT type is always WB for untracked ranges, so no need to check.
         */
        if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
                return 1;

        /*
         * Certain new memtypes are not allowed with certain
         * requested memtype:
         * - request is uncached, return cannot be write-back
         * - request is write-combine, return cannot be write-back
         * - request is write-through, return cannot be write-back
         * - request is write-through, return cannot be write-combine
         */
        switch (pcm) {
        case _PAGE_CACHE_MODE_UC_MINUS:
        case _PAGE_CACHE_MODE_WC:
                return new_pcm != _PAGE_CACHE_MODE_WB;
        case _PAGE_CACHE_MODE_WT:
                return new_pcm != _PAGE_CACHE_MODE_WB &&
                       new_pcm != _PAGE_CACHE_MODE_WC;
        default:
                return 1;
        }
}

/*
 * Prototypes only; both take a virtual address and return a pointer to
 * a pmd / pte entry. The implementations are not part of this header.
 */
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);

/*
 * Take a PGD location (pgdp) and a pgd value that needs to be set there.
 * Populates the user and returns the resulting PGD that must be set in
 * the kernel copy of the page tables.
 *
 * Without X86_FEATURE_PTI the value is handed back untouched.
 */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        if (static_cpu_has(X86_FEATURE_PTI))
                return __pti_set_user_pgtbl(pgdp, pgd);

        return pgd;
}
#else   /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
/*
 * Variant for builds without CONFIG_MITIGATION_PAGE_TABLE_ISOLATION:
 * @pgdp is ignored and @pgd is returned as-is.
 */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        pgd_t unchanged = pgd;

        return unchanged;
}
#endif  /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

#endif        /* __ASSEMBLER__ */


#ifdef CONFIG_X86_32
# include <asm/pgtable_32.h>
#else
# include <asm/pgtable_64.h>
#endif

#ifndef __ASSEMBLER__
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
#include <asm/fixmap.h>

/*
 * A pte counts as "none" when no bit outside of _PAGE_KNL_ERRATUM_MASK
 * is set in it.
 */
static inline int pte_none(pte_t pte)
{
        return (pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}

#define __HAVE_ARCH_PTE_SAME
/* Two ptes are the same when their raw .pte values compare equal. */
static inline int pte_same(pte_t a, pte_t b)
{
        int equal = (a.pte == b.pte);

        return equal;
}

/*
 * Move the pfn stored in @pte forward by @nr pages. When
 * __pte_needs_invert() reports the value as inverted, the step is
 * subtracted instead of added.
 */
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
        unsigned long step = nr << PFN_PTE_SHIFT;

        return __pte_needs_invert(pte_val(pte)) ?
                __pte(pte_val(pte) - step) :
                __pte(pte_val(pte) + step);
}
#define pte_advance_pfn        pte_advance_pfn

/* Non-zero when @a has _PAGE_PRESENT or _PAGE_PROTNONE set. */
static inline int pte_present(pte_t a)
{
        int present = pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);

        return present;
}

#define pte_accessible pte_accessible
/*
 * A pte is accessible when _PAGE_PRESENT is set, or when _PAGE_PROTNONE
 * is set while @mm has a non-zero tlb_flush_pending count.
 */
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
        if (pte_flags(a) & _PAGE_PRESENT)
                return true;

        if (!(pte_flags(a) & _PAGE_PROTNONE))
                return false;

        return atomic_read(&mm->tlb_flush_pending) != 0;
}

/*
 * Non-zero when @pmd has any of _PAGE_PRESENT, _PAGE_PROTNONE or
 * _PAGE_PSE set.
 */
static inline int pmd_present(pmd_t pmd)
{
        /*
         * Checking for _PAGE_PSE is needed too because
         * split_huge_page will temporarily clear the present bit (but
         * the _PAGE_PSE flag will remain set at all times while the
         * _PAGE_PRESENT bit is clear).
         */
        int present = pmd_flags(pmd) &
                      (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);

        return present;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * These work without NUMA balancing but the kernel does not care. See the
 * comment in include/linux/pgtable.h
 *
 * pte_protnone(): 1 when, of the two bits _PAGE_PROTNONE and
 * _PAGE_PRESENT, exactly _PAGE_PROTNONE is set in @pte.
 */
static inline int pte_protnone(pte_t pte)
{
        if ((pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) ==
            _PAGE_PROTNONE)
                return 1;

        return 0;
}

/*
 * 1 when, of the two bits _PAGE_PROTNONE and _PAGE_PRESENT, exactly
 * _PAGE_PROTNONE is set in @pmd.
 */
static inline int pmd_protnone(pmd_t pmd)
{
        if ((pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) ==
            _PAGE_PROTNONE)
                return 1;

        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * A pmd counts as "none" when no bit outside of _PAGE_KNL_ERRATUM_MASK
 * is set in the part of its value that fits an unsigned long.
 */
static inline int pmd_none(pmd_t pmd)
{
        /*
         * Only check low word on 32-bit platforms, since it might be
         * out of sync with upper half.
         */
        unsigned long low = native_pmd_val(pmd);

        return !(low & ~_PAGE_KNL_ERRATUM_MASK);
}

/*
 * Kernel virtual address, via __va(), of the physical address held in
 * @pmd once masked with pmd_pfn_mask().
 */
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
        void *vaddr = __va(pmd_val(pmd) & pmd_pfn_mask(pmd));

        return (unsigned long)vaddr;
}

/*
 * struct page for the pfn stored in @pmd.
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pmd_page(pmd)        pfn_to_page(pmd_pfn(pmd))

/*
 * A pmd is "bad" unless its flags, ignoring _PAGE_USER and
 * _PAGE_ACCESSED, are exactly _KERNPG_TABLE without _PAGE_ACCESSED.
 */
static inline int pmd_bad(pmd_t pmd)
{
        if ((pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) ==
            (_KERNPG_TABLE & ~_PAGE_ACCESSED))
                return 0;

        return 1;
}

/*
 * Convert a page count into mebibytes (rounding down): 2^20 bytes per
 * MB divided by 2^PAGE_SHIFT bytes per page.
 */
static inline unsigned long pages_to_mb(unsigned long npg)
{
        const int pages_per_mb_shift = 20 - PAGE_SHIFT;

        return npg >> pages_per_mb_shift;
}

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * A pud counts as "none" when no bit outside of _PAGE_KNL_ERRATUM_MASK
 * is set in its native value.
 */
static inline int pud_none(pud_t pud)
{
        return !(native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Non-zero when @pud has _PAGE_PRESENT set. */
static inline int pud_present(pud_t pud)
{
        int present = pud_flags(pud) & _PAGE_PRESENT;

        return present;
}

/*
 * Kernel virtual address, as a pmd_t pointer, of the physical address
 * held in @pud once masked with pud_pfn_mask().
 */
static inline pmd_t *pud_pgtable(pud_t pud)
{
        void *vaddr = __va(pud_val(pud) & pud_pfn_mask(pud));

        return (pmd_t *)vaddr;
}

/*
 * struct page for the pfn stored in @pud.
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pud_page(pud)        pfn_to_page(pud_pfn(pud))

#define pud_leaf pud_leaf
/* True when @pud has _PAGE_PSE set. */
static inline bool pud_leaf(pud_t pud)
{
        return (pud_val(pud) & _PAGE_PSE) != 0;
}

/*
 * A pud is "bad" when it carries any flag outside of
 * _KERNPG_TABLE | _PAGE_USER.
 */
static inline int pud_bad(pud_t pud)
{
        return !!(pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER));
}
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * A p4d counts as "none" when no bit outside of _PAGE_KNL_ERRATUM_MASK
 * is set in its native value.
 */
static inline int p4d_none(p4d_t p4d)
{
        return !(native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Non-zero when @p4d has _PAGE_PRESENT set. */
static inline int p4d_present(p4d_t p4d)
{
        int present = p4d_flags(p4d) & _PAGE_PRESENT;

        return present;
}

/*
 * Kernel virtual address, as a pud_t pointer, of the physical address
 * held in @p4d once masked with p4d_pfn_mask().
 */
static inline pud_t *p4d_pgtable(p4d_t p4d)
{
        void *vaddr = __va(p4d_val(p4d) & p4d_pfn_mask(p4d));

        return (pud_t *)vaddr;
}

/*
 * struct page for the pfn stored in @p4d.
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define p4d_page(p4d)        pfn_to_page(p4d_pfn(p4d))

/*
 * A p4d is "bad" when it carries any flag outside of
 * _KERNPG_TABLE | _PAGE_USER. With
 * CONFIG_MITIGATION_PAGE_TABLE_ISOLATION, _PAGE_NX is tolerated too.
 */
static inline int p4d_bad(p4d_t p4d)
{
        unsigned long tolerated = _KERNPG_TABLE | _PAGE_USER;

        if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
                tolerated |= _PAGE_NX;

        return !!(p4d_flags(p4d) & ~tolerated);
}
#endif  /* CONFIG_PGTABLE_LEVELS > 3 */

/*
 * Index of @address within a p4d table: the address shifted down by
 * P4D_SHIFT, reduced with the mask PTRS_PER_P4D - 1.
 */
static inline unsigned long p4d_index(unsigned long address)
{
        unsigned long slot = address >> P4D_SHIFT;

        return slot & (PTRS_PER_P4D - 1);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Non-zero when @pgd has _PAGE_PRESENT set. When pgtable_l5_enabled()
 * is false, 1 is returned without looking at @pgd.
 */
static inline int pgd_present(pgd_t pgd)
{
        if (pgtable_l5_enabled())
                return pgd_flags(pgd) & _PAGE_PRESENT;

        return 1;
}

/*
 * Kernel virtual address, via __va(), of the physical address held in
 * @pgd once masked with PTE_PFN_MASK.
 */
static inline unsigned long pgd_page_vaddr(pgd_t pgd)
{
        void *vaddr = __va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);

        return (unsigned long)vaddr;
}

/*
 * struct page for the pfn stored in @pgd.
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pgd_page(pgd)        pfn_to_page(pgd_pfn(pgd))

/*
 * Find the p4d entry for @address below @pgd. When pgtable_l5_enabled()
 * is false, @pgd itself is returned, cast to a p4d pointer, and *pgd is
 * not read.
 */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
        p4d_t *table;

        if (!pgtable_l5_enabled())
                return (p4d_t *)pgd;

        table = (p4d_t *)pgd_page_vaddr(*pgd);

        return table + p4d_index(address);
}

/*
 * A pgd is "bad" unless its flags, ignoring _PAGE_USER (and _PAGE_NX
 * with CONFIG_MITIGATION_PAGE_TABLE_ISOLATION), equal _KERNPG_TABLE.
 * When pgtable_l5_enabled() is false, 0 is returned unconditionally.
 */
static inline int pgd_bad(pgd_t pgd)
{
        unsigned long tolerated = _PAGE_USER;

        if (!pgtable_l5_enabled())
                return 0;

        if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
                tolerated |= _PAGE_NX;

        if ((pgd_flags(pgd) & ~tolerated) == _KERNPG_TABLE)
                return 0;

        return 1;
}

/*
 * A pgd counts as "none" when its native value is zero. When
 * pgtable_l5_enabled() is false, 0 is returned unconditionally.
 */
static inline int pgd_none(pgd_t pgd)
{
        if (!pgtable_l5_enabled())
                return 0;
        /*
         * There is no need to do a workaround for the KNL stray
         * A/D bit erratum here.  PGDs only point to page tables
         * except on 32-bit non-PAE which is not supported on
         * KNL.
         */
        return native_pgd_val(pgd) == 0;
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */

#endif        /* __ASSEMBLER__ */

/*
 * KERNEL_PGD_BOUNDARY: pgd index of PAGE_OFFSET.
 * KERNEL_PGD_PTRS: number of pgd slots from that index up to the end of
 * the table (PTRS_PER_PGD).
 */
#define KERNEL_PGD_BOUNDARY        pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS                (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

#ifndef __ASSEMBLER__

/*
 * Declarations only; the variable and the functions below are defined
 * outside of this header.
 */
extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
void __init poking_init(void);
unsigned long init_memory_mapping(unsigned long start,
                                  unsigned long end, pgprot_t prot);

#ifdef CONFIG_X86_64
/* Only declared for CONFIG_X86_64 builds; defined outside this header. */
extern pgd_t trampoline_pgd_entry;
#endif

/*
 * Read *@ptep with a plain load, clear the entry with
 * native_pte_clear() and return the value that was read.
 *
 * local pte updates need not use xchg for locking
 */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
        pte_t old = *ptep;

        /* Pure native function needs no input for mm, addr */
        native_pte_clear(NULL, 0, ptep);

        return old;
}

/*
 * Read *@pmdp with a plain load, clear the entry with
 * native_pmd_clear() and return the value that was read.
 */
static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
{
        pmd_t old = *pmdp;

        native_pmd_clear(pmdp);

        return old;
}

/*
 * Read *@pudp with a plain load, clear the entry with
 * native_pud_clear() and return the value that was read.
 */
static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
{
        pud_t old = *pudp;

        native_pud_clear(pudp);

        return old;
}

/*
 * Install @pmd at @pmdp. page_table_check_pmd_set() is called with the
 * new value first, then the entry is written with set_pmd().
 * @addr is not used by this implementation.
 */
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t pmd)
{
        page_table_check_pmd_set(mm, pmdp, pmd);
        set_pmd(pmdp, pmd);
}

/*
 * Install @pud at @pudp. page_table_check_pud_set() is called with the
 * new value first, then the entry is written with native_set_pud().
 * @addr is not used by this implementation.
 */
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
                              pud_t *pudp, pud_t pud)
{
        page_table_check_pud_set(mm, pudp, pud);
        native_set_pud(pudp, pud);
}

/*
 * We only update the dirty/accessed state if we set
 * the dirty bit by hand in the kernel, since the hardware
 * will do the accessed bit for us, and we don't want to
 * race with other CPUs that might be updating the dirty
 * bit at the same time.
 */
struct vm_area_struct;

/* Prototype only; the body is not part of this header. */
#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);

/* Prototype only; the body is not part of this header. */
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep);

/* Prototype only; the body is not part of this header. */
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
extern int ptep_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pte_t *ptep);

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Fetch and clear the entry at @ptep via native_ptep_get_and_clear(),
 * report the removed value to page_table_check_pte_clear() and return
 * it. @addr is not used by this implementation.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pte_t *ptep)
{
        pte_t old = native_ptep_get_and_clear(ptep);

        page_table_check_pte_clear(mm, old);

        return old;
}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * Fetch and clear the entry at @ptep and return the old value.
 *
 * With @full == 0 this is plain ptep_get_and_clear(). With @full != 0
 * the local, non-xchg variant is used, followed by the page table check
 * hook.
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long addr, pte_t *ptep,
                                            int full)
{
        pte_t old;

        if (!full)
                return ptep_get_and_clear(mm, addr, ptep);

        /*
         * Full address destruction in progress; paravirt does not
         * care about updates and native needs no locking
         */
        old = native_local_ptep_get_and_clear(ptep);
        page_table_check_pte_clear(mm, old);

        return old;
}

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
/*
 * Write-protect the entry at @ptep in place. @mm and @addr are not used
 * by this implementation.
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pte_t *ptep)
{
        /*
         * Avoid accidentally creating shadow stack PTEs
         * (Write=0,Dirty=1).  Use cmpxchg() to prevent races with
         * the hardware setting Dirty=1.
         */
        pte_t old_pte, new_pte;

        old_pte = READ_ONCE(*ptep);
        /*
         * Recompute the write-protected value from old_pte and retry
         * until the compare-and-exchange succeeds.
         */
        do {
                new_pte = pte_wrprotect(old_pte);
        } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
}

/* Expands to an empty statement: all three arguments are discarded. */
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)

/*
 * pmd and pud counterparts of ptep_set_access_flags(). Prototypes only;
 * the bodies are not part of this header.
 */
#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);

/*
 * pmd and pud counterparts of ptep_test_and_clear_young(). Prototypes
 * only; the bodies are not part of this header.
 */
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pmd_t *pmdp);
extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pud_t *pudp);

/*
 * pmd counterpart of ptep_clear_flush_young(). Prototype only; the body
 * is not part of this header.
 */
#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);


#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Fetch and clear the entry at @pmdp via native_pmdp_get_and_clear(),
 * report the removed value to page_table_check_pmd_clear() and return
 * it. @addr is not used by this implementation.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pmd_t *pmdp)
{
        pmd_t old = native_pmdp_get_and_clear(pmdp);

        page_table_check_pmd_clear(mm, old);
        return old;
}

#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * Fetch and clear the entry at @pudp via native_pudp_get_and_clear(),
 * report the removed value to page_table_check_pud_clear() and return
 * it. @addr is not used by this implementation.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                        unsigned long addr, pud_t *pudp)
{
        pud_t old = native_pudp_get_and_clear(pudp);

        page_table_check_pud_clear(mm, old);
        return old;
}

#define __HAVE_ARCH_PMDP_SET_WRPROTECT
/*
 * Write-protect the entry at @pmdp in place. @mm and @addr are not used
 * by this implementation.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pmd_t *pmdp)
{
        /*
         * Avoid accidentally creating shadow stack PMDs
         * (Write=0,Dirty=1).  Use cmpxchg() to prevent races with
         * the hardware setting Dirty=1.
         */
        pmd_t old_pmd, new_pmd;

        old_pmd = READ_ONCE(*pmdp);
        /*
         * Recompute the write-protected value from old_pmd and retry
         * until the compare-and-exchange succeeds.
         */
        do {
                new_pmd = pmd_wrprotect(old_pmd);
        } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd));
}

#ifndef pmdp_establish
#define pmdp_establish pmdp_establish
/*
 * Store @pmd at @pmdp and return the entry that was there before.
 *
 * page_table_check_pmd_set() sees the new value first. With CONFIG_SMP
 * the swap is done with xchg(); otherwise a plain read followed by
 * WRITE_ONCE() is used. @address is not used by this implementation.
 */
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        pmd_t previous;

        page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);

        if (!IS_ENABLED(CONFIG_SMP)) {
                previous = *pmdp;
                WRITE_ONCE(*pmdp, pmd);
                return previous;
        }

        return xchg(pmdp, pmd);
}
#endif

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Store @pud at @pudp and return the entry that was there before.
 *
 * page_table_check_pud_set() sees the new value first. With CONFIG_SMP
 * the swap is done with xchg(); otherwise a plain read followed by
 * WRITE_ONCE() is used. @address is not used by this implementation.
 */
static inline pud_t pudp_establish(struct vm_area_struct *vma,
                unsigned long address, pud_t *pudp, pud_t pud)
{
        pud_t previous;

        page_table_check_pud_set(vma->vm_mm, pudp, pud);

        if (!IS_ENABLED(CONFIG_SMP)) {
                previous = *pudp;
                WRITE_ONCE(*pudp, pud);
                return previous;
        }

        return xchg(pudp, pud);
}
#endif

/*
 * Prototypes only; both return a pmd / pud value and are implemented
 * outside of this header.
 */
#define __HAVE_ARCH_PMDP_INVALIDATE_AD
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmdp);

pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pud_t *pudp);

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 *
 * Returns true for parts of the PGD that map userspace and
 * false for the parts that map the kernel.
 *
 * The decision is made from the position of @__ptr inside its page:
 * the byte offset is turned into a pgd index and compared against
 * PGD_KERNEL_START.
 */
static inline bool pgdp_maps_userspace(void *__ptr)
{
        unsigned long offset = (unsigned long)__ptr & ~PAGE_MASK;
        unsigned long index = offset / sizeof(pgd_t);

        return index < PGD_KERNEL_START;
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
 * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages
 * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
 * the user one is in the last 4k.  To switch between them, you
 * just need to flip bit PAGE_SHIFT (bit 12 for 4k pages) in their addresses.
 */
#define PTI_PGTABLE_SWITCH_BIT        PAGE_SHIFT

/*
 * Return @ptr with bit number @bit set in its address value.
 *
 * This generates better code than the inline assembly in
 * __set_bit().
 */
static inline void *ptr_set_bit(void *ptr, int bit)
{
        return (void *)((unsigned long)ptr | BIT(bit));
}
/* Return @ptr with bit number @bit cleared in its address value. */
static inline void *ptr_clear_bit(void *ptr, int bit)
{
        return (void *)((unsigned long)ptr & ~BIT(bit));
}

/* Address of @pgdp with PTI_PGTABLE_SWITCH_BIT set. */
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
        pgd_t *user_pgdp = ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return user_pgdp;
}

/* Address of @pgdp with PTI_PGTABLE_SWITCH_BIT cleared. */
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
        pgd_t *kernel_pgdp = ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return kernel_pgdp;
}

/* Address of @p4dp with PTI_PGTABLE_SWITCH_BIT set. */
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
        p4d_t *user_p4dp = ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return user_p4dp;
}

/* Address of @p4dp with PTI_PGTABLE_SWITCH_BIT cleared. */
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
        p4d_t *kernel_p4dp = ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return kernel_p4dp;
}
#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst - pointer to pgd range anywhere on a pgd page
 *  src - ""
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the range must not overlap,
 * and must not cross a page boundary.
 *
 * With CONFIG_MITIGATION_PAGE_TABLE_ISOLATION and X86_FEATURE_PTI the
 * same range is copied a second time between the addresses that
 * kernel_to_user_pgdp() yields for @src and @dst.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
        const size_t nbytes = count * sizeof(pgd_t);

        memcpy(dst, src, nbytes);
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
        /* Clone the user space pgd as well */
        if (static_cpu_has(X86_FEATURE_PTI))
                memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
                       nbytes);
#endif
}

#define PTE_SHIFT ilog2(PTRS_PER_PTE)
/*
 * Address shift for page-table level @level: each level adds PTE_SHIFT
 * bits on top of a base of PAGE_SHIFT - PTE_SHIFT.
 */
static inline int page_level_shift(enum pg_level level)
{
        const int base = PAGE_SHIFT - PTE_SHIFT;

        return base + level * PTE_SHIFT;
}
/* Size in bytes mapped at @level: 1UL << page_level_shift(@level). */
static inline unsigned long page_level_size(enum pg_level level)
{
        int shift = page_level_shift(level);

        return 1UL << shift;
}
/* Mask that clears the offset bits inside a mapping of size @level. */
static inline unsigned long page_level_mask(enum pg_level level)
{
        unsigned long size = page_level_size(level);

        return ~(size - 1);
}

/*
 * The x86 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 *
 * Consequently this hook has an empty body; all arguments are ignored.
 */
static inline void update_mmu_cache(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
}
/* Empty body: ranged variant of update_mmu_cache(), arguments ignored. */
static inline void update_mmu_cache_range(struct vm_fault *vmf,
                struct vm_area_struct *vma, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
}
/* Empty body: pmd variant of update_mmu_cache(), arguments ignored. */
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmd)
{
}
/* Empty body: pud variant of update_mmu_cache(), arguments ignored. */
static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
                unsigned long addr, pud_t *pud)
{
}
/* Return @pte with _PAGE_SWP_EXCLUSIVE set. */
static inline pte_t pte_swp_mkexclusive(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);

        return pte;
}

/* True when the flags of @pte contain _PAGE_SWP_EXCLUSIVE. */
static inline bool pte_swp_exclusive(pte_t pte)
{
        return (pte_flags(pte) & _PAGE_SWP_EXCLUSIVE) != 0;
}

/* Return @pte with _PAGE_SWP_EXCLUSIVE removed. */
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);

        return pte;
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Return @pte with _PAGE_SWP_SOFT_DIRTY set. */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);

        return pte;
}

/* Non-zero when the flags of @pte contain _PAGE_SWP_SOFT_DIRTY. */
static inline int pte_swp_soft_dirty(pte_t pte)
{
        int soft_dirty = pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;

        return soft_dirty;
}

/* Return @pte with _PAGE_SWP_SOFT_DIRTY removed. */
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);

        return pte;
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* Return @pmd with _PAGE_SWP_SOFT_DIRTY set. */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);

        return pmd;
}

/* Non-zero when the flags of @pmd contain _PAGE_SWP_SOFT_DIRTY. */
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        int soft_dirty = pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;

        return soft_dirty;
}

/* Return @pmd with _PAGE_SWP_SOFT_DIRTY removed. */
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);

        return pmd;
}
#endif
#endif

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return @pte with _PAGE_SWP_UFFD_WP set. */
static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SWP_UFFD_WP);

        return pte;
}

/* Non-zero when the flags of @pte contain _PAGE_SWP_UFFD_WP. */
static inline int pte_swp_uffd_wp(pte_t pte)
{
        int uffd_wp = pte_flags(pte) & _PAGE_SWP_UFFD_WP;

        return uffd_wp;
}

/* Return @pte with _PAGE_SWP_UFFD_WP removed. */
static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);

        return pte;
}

/* Return @pmd with _PAGE_SWP_UFFD_WP set. */
static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP);

        return pmd;
}

/* Non-zero when the flags of @pmd contain _PAGE_SWP_UFFD_WP. */
static inline int pmd_swp_uffd_wp(pmd_t pmd)
{
        int uffd_wp = pmd_flags(pmd) & _PAGE_SWP_UFFD_WP;

        return uffd_wp;
}

/* Return @pmd with _PAGE_SWP_UFFD_WP removed. */
static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);

        return pmd;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/*
 * Extract the protection key from @pte_flags: the bits under
 * _PAGE_PKEY_MASK, shifted down by _PAGE_BIT_PKEY_BIT0. Always 0 without
 * CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS.
 */
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
        u16 pkey = 0;

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        /* ifdef to avoid doing 59-bit shift on 32-bit values */
        pkey = (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
#endif

        return pkey;
}

/*
 * Check @pkey against the value returned by read_pkru(): reading must
 * be allowed, and when @write is set writing must be allowed as well.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
        u32 pkru = read_pkru();

        return __pkru_allows_read(pkru, pkey) &&
               (!write || __pkru_allows_write(pkru, pkey));
}

/*
 * 'pteval' can come from a PTE, PMD or PUD.  We only check
 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
 * same value on all 3 types.
 *
 * Access is permitted when all required bits are set in @pteval and
 * __pkru_allows_pkey() agrees for the entry's protection key.
 */
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
        unsigned long required = _PAGE_PRESENT|_PAGE_USER;

        /*
         * Write=0,Dirty=1 PTEs are shadow stack, which the kernel
         * shouldn't generally allow access to, but since they
         * are already Write=0, the below logic covers both cases.
         */
        if (write)
                required |= _PAGE_RW;

        if ((pteval & required) != required)
                return false;

        return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}

#define pte_access_permitted pte_access_permitted
/* __pte_access_permitted() applied to the value of @pte. */
static inline bool pte_access_permitted(pte_t pte, bool write)
{
        bool permitted = __pte_access_permitted(pte_val(pte), write);

        return permitted;
}

#define pmd_access_permitted pmd_access_permitted
/* __pte_access_permitted() applied to the value of @pmd. */
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
        bool permitted = __pte_access_permitted(pmd_val(pmd), write);

        return permitted;
}

#define pud_access_permitted pud_access_permitted
/* __pte_access_permitted() applied to the value of @pud. */
static inline bool pud_access_permitted(pud_t pud, bool write)
{
        bool permitted = __pte_access_permitted(pud_val(pud), write);

        return permitted;
}

/* Prototype only; the body is not part of this header. */
#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);

/* True when the boot CPU reports the X86_BUG_L1TF bug flag. */
static inline bool arch_has_pfn_modify_check(void)
{
        bool has_l1tf = boot_cpu_has_bug(X86_BUG_L1TF);

        return has_l1tf;
}

/*
 * Hooks taking the VMA and the entry value at pte, pmd and pud level.
 * Prototypes only; the bodies are not part of this header.
 */
#define arch_check_zapped_pte arch_check_zapped_pte
void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);

#define arch_check_zapped_pmd arch_check_zapped_pmd
void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd);

#define arch_check_zapped_pud arch_check_zapped_pud
void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud);

#ifdef CONFIG_XEN_PV
#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
static inline bool arch_has_hw_nonleaf_pmd_young(void)
{
        return !cpu_feature_enabled(X86_FEATURE_XENPV);
}
#endif

#ifdef CONFIG_PAGE_TABLE_CHECK
/* True when @pte has both _PAGE_PRESENT and _PAGE_USER set. */
static inline bool pte_user_accessible_page(pte_t pte)
{
        if (!(pte_val(pte) & _PAGE_PRESENT))
                return false;

        return (pte_val(pte) & _PAGE_USER) != 0;
}

/*
 * True when @pmd is a leaf (pmd_leaf()) and has both _PAGE_PRESENT and
 * _PAGE_USER set.
 */
static inline bool pmd_user_accessible_page(pmd_t pmd)
{
        if (!pmd_leaf(pmd))
                return false;

        if (!(pmd_val(pmd) & _PAGE_PRESENT))
                return false;

        return (pmd_val(pmd) & _PAGE_USER) != 0;
}

/*
 * True when @pud is a leaf (pud_leaf()) and has both _PAGE_PRESENT and
 * _PAGE_USER set.
 */
static inline bool pud_user_accessible_page(pud_t pud)
{
        if (!pud_leaf(pud))
                return false;

        if (!(pud_val(pud) & _PAGE_PRESENT))
                return false;

        return (pud_val(pud) & _PAGE_USER) != 0;
}
#endif

#ifdef CONFIG_X86_SGX
/*
 * Hooks that are only declared when CONFIG_X86_SGX is enabled; the
 * definitions live elsewhere. Each name is also defined as a macro expanding
 * to itself.
 * NOTE(review): presumably so that generic code can test for an arch
 * override with #ifndef -- confirm against the generic users.
 */
int arch_memory_failure(unsigned long pfn, int flags);
#define arch_memory_failure arch_memory_failure

bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif

/*
 * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
 * TLB flush will be required as a result of the "set". For example, use
 * in scenarios where it is known ahead of time that the routine is
 * setting non-present entries, or re-setting an existing entry to the
 * same value. Otherwise, use the typical "set" helpers and flush the
 * TLB.
 */
/*
 * set_pte_safe - set_pte() plus a sanity check that no TLB flush is needed.
 * @ptep: pointer to the entry to write
 * @pte:  new entry value
 *
 * Warns once if the current entry is present and differs from @pte, then
 * writes @pte with set_pte(). @ptep is parenthesized before it is
 * dereferenced so that pointer expressions such as "base + i" check the
 * entry that is actually written. Both arguments are evaluated more than
 * once, so they must not have side effects.
 */
#define set_pte_safe(ptep, pte) \
({ \
        WARN_ON_ONCE(pte_present(*(ptep)) && !pte_same(*(ptep), pte)); \
        set_pte(ptep, pte); \
})

/*
 * set_pmd_safe - set_pmd() plus a sanity check that no TLB flush is needed.
 * @pmdp: pointer to the entry to write
 * @pmd:  new entry value
 *
 * Warns once if the current entry is present and differs from @pmd, then
 * writes @pmd with set_pmd(). @pmdp is parenthesized before it is
 * dereferenced so that pointer expressions such as "base + i" check the
 * entry that is actually written. Both arguments are evaluated more than
 * once, so they must not have side effects.
 */
#define set_pmd_safe(pmdp, pmd) \
({ \
        WARN_ON_ONCE(pmd_present(*(pmdp)) && !pmd_same(*(pmdp), pmd)); \
        set_pmd(pmdp, pmd); \
})

/*
 * set_pud_safe - set_pud() plus a sanity check that no TLB flush is needed.
 * @pudp: pointer to the entry to write
 * @pud:  new entry value
 *
 * Warns once if the current entry is present and differs from @pud, then
 * writes @pud with set_pud(). @pudp is parenthesized before it is
 * dereferenced so that pointer expressions such as "base + i" check the
 * entry that is actually written. Both arguments are evaluated more than
 * once, so they must not have side effects.
 */
#define set_pud_safe(pudp, pud) \
({ \
        WARN_ON_ONCE(pud_present(*(pudp)) && !pud_same(*(pudp), pud)); \
        set_pud(pudp, pud); \
})

/*
 * set_p4d_safe - set_p4d() plus a sanity check that no TLB flush is needed.
 * @p4dp: pointer to the entry to write
 * @p4d:  new entry value
 *
 * Warns once if the current entry is present and differs from @p4d, then
 * writes @p4d with set_p4d(). @p4dp is parenthesized before it is
 * dereferenced so that pointer expressions such as "base + i" check the
 * entry that is actually written. Both arguments are evaluated more than
 * once, so they must not have side effects.
 */
#define set_p4d_safe(p4dp, p4d) \
({ \
        WARN_ON_ONCE(p4d_present(*(p4dp)) && !p4d_same(*(p4dp), p4d)); \
        set_p4d(p4dp, p4d); \
})

/*
 * set_pgd_safe - set_pgd() plus a sanity check that no TLB flush is needed.
 * @pgdp: pointer to the entry to write
 * @pgd:  new entry value
 *
 * Warns once if the current entry is present and differs from @pgd, then
 * writes @pgd with set_pgd(). @pgdp is parenthesized before it is
 * dereferenced so that pointer expressions such as "base + i" check the
 * entry that is actually written. Both arguments are evaluated more than
 * once, so they must not have side effects.
 */
#define set_pgd_safe(pgdp, pgd) \
({ \
        WARN_ON_ONCE(pgd_present(*(pgdp)) && !pgd_same(*(pgdp), pgd)); \
        set_pgd(pgdp, pgd); \
})
#endif        /* __ASSEMBLER__ */

#endif /* _ASM_X86_PGTABLE_H */
























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Device memory TCP support
 *
 * Authors:        Mina Almasry <almasrymina@google.com>
 *                Willem de Bruijn <willemb@google.com>
 *                Kaiyuan Zhang <kaiyuanz@google.com>
 *
 */
#ifndef _NET_DEVMEM_H
#define _NET_DEVMEM_H

#include <net/netmem.h>
#include <net/netdev_netlink.h>

struct netlink_ext_ack;

/* State of one dma-buf bound to a network device for device memory TCP. */
struct net_devmem_dmabuf_binding {
        /* NOTE(review): presumably the bound dma-buf, its attachment and the
         * scatter-gather table obtained from mapping it -- confirm against
         * the bind/unbind implementation.
         */
        struct dma_buf *dmabuf;
        struct dma_buf_attachment *attachment;
        struct sg_table *sgt;
        struct net_device *dev;
        /* Gen pool that the dma-buf's chunks are inserted into (see
         * struct dmabuf_genpool_chunk_owner).
         */
        struct gen_pool *chunk_pool;
        /* Protect dev */
        struct mutex lock;

        /* The user holds a ref (via the netlink API) for as long as they want
         * the binding to remain alive. Each page pool using this binding holds
         * a ref to keep the binding alive. The page_pool does not release the
         * ref until all the net_iovs allocated from this binding are released
         * back to the page_pool.
         *
         * The binding undoes itself and unmaps the underlying dmabuf once all
         * those refs are dropped and the binding is no longer desired or in
         * use.
         *
         * net_devmem_get_net_iov() on dmabuf net_iovs will increment this
         * reference, making sure that the binding remains alive until all the
         * net_iovs are no longer used. net_iovs allocated from this binding
         * that are stuck in the TX path for any reason (such as awaiting
         * retransmits) hold a reference to the binding until the skb holding
         * them is freed.
         */
        refcount_t ref;

        /* The list of bindings currently active. Used for netlink to notify us
         * of the user dropping the bind.
         */
        struct list_head list;

        /* rxq's this binding is active on. */
        struct xarray bound_rxqs;

        /* ID of this binding. Globally unique to all bindings currently
         * active.
         */
        u32 id;

        /* DMA direction, FROM_DEVICE for Rx binding, TO_DEVICE for Tx. */
        enum dma_data_direction direction;

        /* Array of net_iov pointers for this binding, sorted by virtual
         * address. This array is convenient to map the virtual addresses to
         * net_iovs in the TX path.
         */
        struct net_iov **tx_vec;

        /* Work item used by net_devmem_dmabuf_binding_put() to run
         * __net_devmem_dmabuf_binding_free() once the last ref is dropped.
         */
        struct work_struct unbind_w;
};

#if defined(CONFIG_NET_DEVMEM)
/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist
 * entry from the dmabuf is inserted into the genpool as a chunk, and needs
 * this owner struct to keep track of some metadata necessary to create
 * allocations from this chunk.
 */
struct dmabuf_genpool_chunk_owner {
        /* Embedded net_iov area; net_devmem_iov_to_chunk_owner() uses
         * container_of() on this member to get back to the chunk owner.
         */
        struct net_iov_area area;
        /* Binding this chunk belongs to; read by net_devmem_iov_binding(). */
        struct net_devmem_dmabuf_binding *binding;

        /* dma_addr of the start of the chunk.  */
        dma_addr_t base_dma_addr;
};

/* Work handler queued by net_devmem_dmabuf_binding_put() when the last
 * reference to a binding is dropped.
 */
void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
/* NOTE(review): presumably creates a binding between the dma-buf behind
 * @dmabuf_fd and @dev. The !CONFIG_NET_DEVMEM stub returns an ERR_PTR(), so
 * callers should expect ERR_PTR()-style failures -- confirm for the real
 * implementation.
 */
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
                       struct device *dma_dev,
                       enum dma_data_direction direction,
                       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
                       struct netlink_ext_ack *extack);
/* Look up a binding by @id; the !CONFIG_NET_DEVMEM stub returns NULL. */
struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
/* Returns an int status; the !CONFIG_NET_DEVMEM stub returns -EOPNOTSUPP. */
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
                                    struct net_devmem_dmabuf_binding *binding,
                                    struct netlink_ext_ack *extack);
void net_devmem_bind_tx_release(struct sock *sk);

/* Map @niov to the dmabuf_genpool_chunk_owner that embeds its owning
 * net_iov_area.
 */
static inline struct dmabuf_genpool_chunk_owner *
net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
{
        return container_of(net_iov_owner(niov),
                            struct dmabuf_genpool_chunk_owner, area);
}

static inline struct net_devmem_dmabuf_binding *
net_devmem_iov_binding(const struct net_iov *niov)
{
        return net_devmem_iov_to_chunk_owner(niov)->binding;
}

static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
{
        return net_devmem_iov_binding(niov)->id;
}

/* Virtual address of @niov: the owning area's base_virtual plus the
 * niov's index scaled by the page size (index << PAGE_SHIFT).
 */
static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
{
        struct net_iov_area *area = net_iov_owner(niov);
        unsigned long offset = (unsigned long)net_iov_idx(niov) << PAGE_SHIFT;

        return area->base_virtual + offset;
}

/* Try to take a reference on @binding. Returns the result of
 * refcount_inc_not_zero() on the binding's refcount.
 */
static inline bool
net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
{
        bool acquired = refcount_inc_not_zero(&binding->ref);

        return acquired;
}

/* Drop a reference on @binding. When the count reaches zero the teardown,
 * __net_devmem_dmabuf_binding_free(), is handed to a work item rather than
 * being called directly.
 */
static inline void
net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
{
        if (refcount_dec_and_test(&binding->ref)) {
                INIT_WORK(&binding->unbind_w,
                          __net_devmem_dmabuf_binding_free);
                schedule_work(&binding->unbind_w);
        }
}

/* Reference helpers for a net_iov; per the comment on
 * net_devmem_dmabuf_binding.ref, the get side pins the owning binding.
 */
void net_devmem_get_net_iov(struct net_iov *niov);
void net_devmem_put_net_iov(struct net_iov *niov);

/* Allocate a net_iov from @binding / give one back. The
 * !CONFIG_NET_DEVMEM stub of the allocator returns NULL.
 */
struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
void net_devmem_free_dmabuf(struct net_iov *ppiov);

bool net_is_devmem_iov(struct net_iov *niov);
/* The !CONFIG_NET_DEVMEM stub returns an ERR_PTR().
 * NOTE(review): presumably the real implementation also reports failure
 * through ERR_PTR() -- confirm.
 */
struct net_devmem_dmabuf_binding *
net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
/* NOTE(review): presumably resolves virtual address @addr inside @binding to
 * its net_iov (cf. the tx_vec comment) and reports offset/size through @off
 * and @size -- confirm against the implementation.
 */
struct net_iov *
net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
                       size_t *off, size_t *size);

#else
/*
 * CONFIG_NET_DEVMEM is disabled: the helpers below are stubs so that callers
 * compile unchanged. void helpers do nothing, pointer-returning helpers
 * return NULL or ERR_PTR(-EOPNOTSUPP), and the remaining ones return
 * 0, false or -EOPNOTSUPP.
 */
struct net_devmem_dmabuf_binding;

static inline void
net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
{
}

static inline void net_devmem_get_net_iov(struct net_iov *niov)
{
}

static inline void net_devmem_put_net_iov(struct net_iov *niov)
{
}

/* Binding is unsupported: always fails with ERR_PTR(-EOPNOTSUPP). */
static inline struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
                       struct device *dma_dev,
                       enum dma_data_direction direction,
                       unsigned int dmabuf_fd,
                       struct netdev_nl_sock *priv,
                       struct netlink_ext_ack *extack)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* No bindings can exist, so every lookup yields NULL. */
static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
{
        return NULL;
}

static inline void
net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
}

static inline int
net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
                                struct net_devmem_dmabuf_binding *binding,
                                struct netlink_ext_ack *extack)

{
        return -EOPNOTSUPP;
}

static inline struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
        return NULL;
}

static inline void net_devmem_free_dmabuf(struct net_iov *ppiov)
{
}

static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
{
        return 0;
}

static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
{
        return 0;
}

/* Without devmem support no net_iov is ever a devmem one. */
static inline bool net_is_devmem_iov(struct net_iov *niov)
{
        return false;
}

static inline struct net_devmem_dmabuf_binding *
net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
{
        return ERR_PTR(-EOPNOTSUPP);
}

static inline struct net_iov *
net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
                       size_t *off, size_t *size)
{
        return NULL;
}

static inline struct net_devmem_dmabuf_binding *
net_devmem_iov_binding(const struct net_iov *niov)
{
        return NULL;
}
#endif

#endif /* _NET_DEVMEM_H */











































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Mutexes: blocking mutual exclusion locks
 *
 * started by Ingo Molnar:
 *
 *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * This file contains the main data structure and API definitions.
 */
#ifndef __LINUX_MUTEX_H
#define __LINUX_MUTEX_H

#include <asm/current.h>
#include <linux/list.h>
#include <linux/spinlock_types.h>
#include <linux/lockdep.h>
#include <linux/atomic.h>
#include <asm/processor.h>
#include <linux/osq_lock.h>
#include <linux/debug_locks.h>
#include <linux/cleanup.h>
#include <linux/mutex_types.h>

struct device;

/*
 * __DEP_MAP_MUTEX_INITIALIZER - optional ".dep_map" part of a static mutex
 * initializer.
 *
 * With CONFIG_DEBUG_LOCK_ALLOC it expands to a leading comma followed by a
 * designated initializer for .dep_map (name is the stringified @lockname,
 * inner wait type is LD_WAIT_SLEEP); otherwise it expands to nothing. The
 * leading comma lets it be appended after the preceding initializers
 * without a separator at the use site.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __DEP_MAP_MUTEX_INITIALIZER(lockname)                        \
                , .dep_map = {                                        \
                        .name = #lockname,                        \
                        .wait_type_inner = LD_WAIT_SLEEP,        \
                }
#else
# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
#endif

/*
 * With CONFIG_DEBUG_MUTEXES, __DEBUG_MUTEX_INITIALIZER() adds a ".magic"
 * initializer pointing at the mutex itself (with a leading comma, so it can
 * follow other initializers) and mutex_destroy() is a real out-of-line
 * function. Without it the initializer piece is empty and mutex_destroy()
 * is an empty inline.
 */
#ifdef CONFIG_DEBUG_MUTEXES

# define __DEBUG_MUTEX_INITIALIZER(lockname)                                \
        , .magic = &lockname

extern void mutex_destroy(struct mutex *lock);

#else

# define __DEBUG_MUTEX_INITIALIZER(lockname)

static inline void mutex_destroy(struct mutex *lock) {}

#endif

/**
 * mutex_init - initialize the mutex
 * @mutex: the mutex to be initialized
 *
 * Initialize the mutex to unlocked state.
 *
 * It is not allowed to initialize an already locked mutex.
 *
 * Every expansion of this macro defines its own static lock_class_key and
 * passes the stringified @mutex argument as the name to __mutex_init().
 */
#define mutex_init(mutex)                                                \
do {                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __mutex_init((mutex), #mutex, &__key);                                \
} while (0)

/**
 * mutex_init_with_key - initialize a mutex with a given lockdep key
 * @mutex: the mutex to be initialized
 * @key: the lockdep key to be associated with the mutex
 *
 * Initialize the mutex to the unlocked state.
 *
 * It is not allowed to initialize an already locked mutex.
 *
 * Unlike mutex_init(), no static key is created here: @key is handed to
 * __mutex_init() as is, together with the stringified @mutex as the name.
 */
#define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key))

#ifndef CONFIG_PREEMPT_RT
/*
 * Static initializer for a mutex when CONFIG_PREEMPT_RT is not set: owner is
 * 0, wait_lock is an unlocked raw spinlock and wait_list is an empty list
 * head. The two trailing macros append the optional debug and lockdep
 * fields (each supplies its own leading comma).
 */
#define __MUTEX_INITIALIZER(lockname) \
                { .owner = ATOMIC_LONG_INIT(0) \
                , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
                , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \
                __DEBUG_MUTEX_INITIALIZER(lockname) \
                __DEP_MAP_MUTEX_INITIALIZER(lockname) }

/* Define a struct mutex named @mutexname, statically initialized. */
#define DEFINE_MUTEX(mutexname) \
        struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)

/* Runtime initializer used by mutex_init() and mutex_init_with_key(). */
extern void __mutex_init(struct mutex *lock, const char *name,
                         struct lock_class_key *key);

/**
 * mutex_is_locked - is the mutex locked
 * @lock: the mutex to be queried
 *
 * Returns true if the mutex is locked, false if unlocked.
 */
extern bool mutex_is_locked(struct mutex *lock);

#else /* !CONFIG_PREEMPT_RT */
/*
 * Preempt-RT variant based on rtmutexes.
 *
 * The mutex wraps an rtmutex base: the static initializer fills .rtmutex,
 * mutex_is_locked() queries it, and __mutex_init() is a macro that first
 * initializes the rtmutex base and then calls __mutex_rt_init() with the
 * name and key.
 */

#define __MUTEX_INITIALIZER(mutexname)                                        \
{                                                                        \
        .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex)        \
        __DEP_MAP_MUTEX_INITIALIZER(mutexname)                                \
}

#define DEFINE_MUTEX(mutexname)                                                \
        struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)

extern void __mutex_rt_init(struct mutex *lock, const char *name,
                            struct lock_class_key *key);

#define mutex_is_locked(l)        rt_mutex_base_is_locked(&(l)->rtmutex)

#define __mutex_init(mutex, name, key)                        \
do {                                                        \
        rt_mutex_base_init(&(mutex)->rtmutex);                \
        __mutex_rt_init((mutex), name, key);                \
} while (0)

#endif /* CONFIG_PREEMPT_RT */

/*
 * __devm_mutex_init - back end of devm_mutex_init().
 *
 * Out of line when CONFIG_DEBUG_MUTEXES is set; otherwise an inline that
 * always returns 0. The return value must be checked (__must_check).
 */
#ifdef CONFIG_DEBUG_MUTEXES

int __must_check __devm_mutex_init(struct device *dev, struct mutex *lock);

#else

static inline int __must_check __devm_mutex_init(struct device *dev, struct mutex *lock)
{
        /*
         * When CONFIG_DEBUG_MUTEXES is off, mutex_destroy() is just a nop,
         * so there is no real need to register it in the devm subsystem.
         */
        return 0;
}

#endif

/*
 * __mutex_init_ret - run mutex_init() on @mutex and evaluate to the pointer.
 *
 * @mutex is evaluated once into the local "mutex_". Because mutex_init()
 * stringifies its argument, the name passed down to __mutex_init() from here
 * is the literal "mutex_", not the caller's expression; do not rename the
 * local without taking that into account.
 */
#define __mutex_init_ret(mutex)                                \
({                                                        \
        typeof(mutex) mutex_ = (mutex);                        \
                                                        \
        mutex_init(mutex_);                                \
        mutex_;                                                \
})

/*
 * devm_mutex_init - initialize @mutex and pass it to __devm_mutex_init()
 * together with @dev. Evaluates to the int returned by __devm_mutex_init(),
 * which is marked __must_check.
 */
#define devm_mutex_init(dev, mutex) \
        __devm_mutex_init(dev, __mutex_init_ret(mutex))

/*
 * See kernel/locking/mutex.c for detailed documentation of these APIs.
 * Also see Documentation/locking/mutex-design.rst.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * CONFIG_DEBUG_LOCK_ALLOC: the out-of-line entry points take a lockdep
 * subclass and/or a nest_lock map, and the plain mutex_lock*() names are
 * macros that pass subclass 0 and no nest lock.
 */
extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
                                        unsigned int subclass);
extern int __must_check _mutex_lock_killable(struct mutex *lock,
                unsigned int subclass, struct lockdep_map *nest_lock);
extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass);

#define mutex_lock(lock) mutex_lock_nested(lock, 0)
#define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0)
#define mutex_lock_killable(lock) _mutex_lock_killable(lock, 0, NULL)
#define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0)

/*
 * @nest_lock is a pointer to any object with a "dep_map" member; typecheck()
 * verifies that the member is a struct lockdep_map before its address is
 * passed on.
 */
#define mutex_lock_nest_lock(lock, nest_lock)                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map);        \
        _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);                \
} while (0)

/* Expression form (comma operator) so the int result can be used. */
#define mutex_lock_killable_nest_lock(lock, nest_lock)                        \
(                                                                        \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map),                \
        _mutex_lock_killable(lock, 0, &(nest_lock)->dep_map)                \
)

#define mutex_lock_killable_nested(lock, subclass) \
        _mutex_lock_killable(lock, subclass, NULL)

#else
/*
 * !CONFIG_DEBUG_LOCK_ALLOC: the plain lock functions are the real entry
 * points, and the *_nested / *_nest_lock variants are macros that drop
 * their subclass or nest_lock argument.
 */
extern void mutex_lock(struct mutex *lock);
extern int __must_check mutex_lock_interruptible(struct mutex *lock);
extern int __must_check mutex_lock_killable(struct mutex *lock);
extern void mutex_lock_io(struct mutex *lock);

# define mutex_lock_nested(lock, subclass) mutex_lock(lock)
# define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
# define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
# define mutex_lock_killable_nest_lock(lock, nest_lock) mutex_lock_killable(lock)
# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock)
#endif

/*
 * NOTE: mutex_trylock() follows the spin_trylock() convention,
 *       not the down_trylock() convention!
 *
 * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
 */

/*
 * With CONFIG_DEBUG_LOCK_ALLOC, mutex_trylock() is a macro around
 * _mutex_trylock_nest_lock() with a NULL nest lock, and
 * mutex_trylock_nest_lock() type-checks the "dep_map" member of @nest_lock
 * before passing its address. Otherwise mutex_trylock() is the real function
 * and the nest_lock variant simply ignores @nest_lock.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern int _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);

#define mutex_trylock_nest_lock(lock, nest_lock)                \
(                                                                \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map),        \
        _mutex_trylock_nest_lock(lock, &(nest_lock)->dep_map)        \
)

#define mutex_trylock(lock) _mutex_trylock_nest_lock(lock, NULL)
#else
extern int mutex_trylock(struct mutex *lock);
#define mutex_trylock_nest_lock(lock, nest_lock) mutex_trylock(lock)
#endif

extern void mutex_unlock(struct mutex *lock);

/* NOTE(review): presumably decrements @cnt and returns with @lock held when
 * the count reaches zero -- confirm against the implementation.
 */
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);

/*
 * Guard definitions for struct mutex pointers: the base guard pairs
 * mutex_lock() with mutex_unlock(); the "_try" variant acquires with
 * mutex_trylock() and the "_intr" variant with mutex_lock_interruptible(),
 * the latter counting as acquired when the call returns 0.
 */
DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T), _RET == 0)

extern unsigned long mutex_get_owner(struct mutex *lock);

#endif /* __LINUX_MUTEX_H */







































































































































































































































































































































































    1 
























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
#ifndef __LINUX_OVERFLOW_H
#define __LINUX_OVERFLOW_H

#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/const.h>

/*
 * We need to compute the minimum and maximum values representable in a given
 * type. These macros may also be useful elsewhere. It would seem more obvious
 * to do something like:
 *
 * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
 * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
 *
 * Unfortunately, the middle expressions, strictly speaking, have
 * undefined behaviour, and at least some versions of gcc warn about
 * the type_max expression (but not if -fsanitize=undefined is in
 * effect; in that case, the warning is deferred to runtime...).
 *
 * The slightly excessive casting in type_min is to make sure the
 * macros also produce sensible values for the exotic type _Bool. [The
 * overflow checkers only almost work for _Bool, but that's
 * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
 * _Bools. Besides, the gcc builtins don't allow _Bool* as third
 * argument.]
 *
 * Idea stolen from
 * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
 * credit to Christian Biere.
 */
/*
 * __type_half_max(type): 1 shifted to the highest value bit that can be set
 *   without touching the sign bit (one bit lower for signed types).
 * __type_max(T): (half - 1) + half, i.e. every bit up to and including
 *   that one set, computed without any intermediate overflow.
 * __type_min(T): -max - 1, with each step cast back to T.
 * type_max(t) / type_min(t): the same, taking a variable or a type name.
 */
#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
#define __type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
#define type_max(t)        __type_max(typeof(t))
#define __type_min(T) ((T)((T)-type_max(T)-(T)1))
#define type_min(t)        __type_min(typeof(t))

/*
 * Avoids triggering -Wtype-limits compilation warning,
 * while using unsigned data types to check a < 0.
 */
#define is_non_negative(a) ((a) > 0 || (a) == 0)
#define is_negative(a) (!(is_non_negative(a)))

/*
 * Allows for effectively applying __must_check to a macro so we can have
 * both the type-agnostic benefits of the macros while also being able to
 * enforce that the return value is, in fact, checked.
 */
/*
 * Returns @overflow unchanged (wrapped in unlikely()). Being a __must_check
 * function, it makes the compiler complain when the result of a macro built
 * on top of it is ignored.
 */
static inline bool __must_check __must_check_overflow(bool overflow)
{
        return unlikely(overflow);
}

/**
 * check_add_overflow() - Calculate addition with overflow checking
 * @a: first addend
 * @b: second addend
 * @d: pointer to store sum
 *
 * Returns true on wrap-around, false otherwise.
 *
 * *@d holds the results of the attempted addition, regardless of whether
 * wrap-around occurred.
 *
 * Implemented with the compiler's __builtin_add_overflow(); the result is
 * passed through __must_check_overflow(), so it must not be ignored.
 */
#define check_add_overflow(a, b, d)        \
        __must_check_overflow(__builtin_add_overflow(a, b, d))

/**
 * wrapping_add() - Addition that is allowed to wrap around
 * @type: type the sum is computed in and returned as
 * @a: first addend
 * @b: second addend
 *
 * Evaluates to @a + @b reduced to @type. The overflow indication of
 * the underlying builtin is deliberately discarded, so wrap-around
 * sanitizers are not triggered.
 */
#define wrapping_add(type, a, b)                                        \
        ({                                                                \
                type __sum;                                                \
                (void)__builtin_add_overflow(a, b, &__sum);                \
                __sum;                                                        \
        })

/**
 * wrapping_assign_add() - Increment a variable, allowing wrap-around
 * @var: variable to be incremented
 * @offset: amount to add
 *
 * Stores wrapping_add(typeof(@var), @var, @offset) back into @var without
 * tripping any wrap-around sanitizers. @var is named only once at run
 * time: its address is taken up front and used for both the read and the
 * write.
 *
 * Returns the new value of @var.
 */
#define wrapping_assign_add(var, offset)                                \
        ({                                                                \
                typeof(var) *__target = &(var);                                \
                typeof(var) __next =                                        \
                        wrapping_add(typeof(var), *__target, offset);        \
                *__target = __next;                                        \
        })

/**
 * check_sub_overflow() - Calculate subtraction with overflow checking
 * @a: minuend; value to subtract from
 * @b: subtrahend; value to subtract from @a
 * @d: pointer to store difference
 *
 * Returns true on wrap-around, false otherwise.
 *
 * *@d holds the results of the attempted subtraction, regardless of whether
 * wrap-around occurred.
 *
 * Implemented with the compiler's __builtin_sub_overflow(); the result is
 * passed through __must_check_overflow(), so it must not be ignored.
 */
#define check_sub_overflow(a, b, d)        \
        __must_check_overflow(__builtin_sub_overflow(a, b, d))

/**
 * wrapping_sub() - Subtraction that is allowed to wrap around
 * @type: type the difference is computed in and returned as
 * @a: minuend; value to subtract from
 * @b: subtrahend; value to subtract from @a
 *
 * Evaluates to @a - @b reduced to @type. The overflow indication of
 * the underlying builtin is deliberately discarded, so wrap-around
 * sanitizers are not triggered.
 */
#define wrapping_sub(type, a, b)                                        \
        ({                                                                \
                type __diff;                                                \
                (void)__builtin_sub_overflow(a, b, &__diff);                \
                __diff;                                                        \
        })

/**
 * wrapping_assign_sub() - Decrement a variable, allowing wrap-around
 * @var: variable to be decremented
 * @offset: amount to subtract
 *
 * Stores wrapping_sub(typeof(@var), @var, @offset) back into @var without
 * tripping any wrap-around sanitizers. @var is named only once at run
 * time: its address is taken up front and used for both the read and the
 * write.
 *
 * Returns the new value of @var.
 */
#define wrapping_assign_sub(var, offset)                                \
        ({                                                                \
                typeof(var) *__target = &(var);                                \
                typeof(var) __next =                                        \
                        wrapping_sub(typeof(var), *__target, offset);        \
                *__target = __next;                                        \
        })

/**
 * check_mul_overflow() - Calculate multiplication with overflow checking
 * @a: first factor
 * @b: second factor
 * @d: pointer to store product
 *
 * Returns true on wrap-around, false otherwise.
 *
 * *@d holds the results of the attempted multiplication, regardless of whether
 * wrap-around occurred.
 *
 * Implemented with the compiler's __builtin_mul_overflow(); the result is
 * passed through __must_check_overflow(), so it must not be ignored.
 */
#define check_mul_overflow(a, b, d)        \
        __must_check_overflow(__builtin_mul_overflow(a, b, d))

/**
 * wrapping_mul() - Multiplication that is allowed to wrap around
 * @type: type the product is computed in and returned as
 * @a: first factor
 * @b: second factor
 *
 * Evaluates to @a * @b reduced to @type. The overflow indication of
 * the underlying builtin is deliberately discarded, so wrap-around
 * sanitizers are not triggered.
 */
#define wrapping_mul(type, a, b)                                        \
        ({                                                                \
                type __prod;                                                \
                (void)__builtin_mul_overflow(a, b, &__prod);                \
                __prod;                                                        \
        })

/**
 * check_shl_overflow() - Calculate a left-shifted value and check overflow
 * @a: Value to be shifted
 * @s: How many bits left to shift
 * @d: Pointer to where to store the result
 *
 * Computes *@d = (@a << @s)
 *
 * Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't
 * make sense. Example conditions:
 *
 * - '@a << @s' causes bits to be lost when stored in *@d.
 * - '@s' is garbage (e.g. negative) or so large that the result of
 *   '@a << @s' is guaranteed to be 0.
 * - '@a' is negative.
 * - '@a << @s' sets the sign bit, if any, in '*@d'.
 *
 * '*@d' will hold the results of the attempted shift, but is not
 * considered "safe for use" if true is returned.
 *
 * Implementation notes: each argument is copied into a local once. The
 * shift is carried out on an unsigned long long copy of @a. A shift count
 * that is negative or not smaller than the bit width of *@d is replaced by
 * 0, and the resulting mismatch with @s is what reports the overflow. Lost
 * bits are detected by shifting the stored result back and comparing it
 * with @a.
 */
#define check_shl_overflow(a, s, d) __must_check_overflow(({                \
        typeof(a) _a = a;                                                \
        typeof(s) _s = s;                                                \
        typeof(d) _d = d;                                                \
        unsigned long long _a_full = _a;                                \
        unsigned int _to_shift =                                        \
                is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0;        \
        *_d = (_a_full << _to_shift);                                        \
        (_to_shift != _s || is_negative(*_d) || is_negative(_a) ||        \
        (*_d >> _to_shift) != _a);                                        \
}))

/*
 * Range check written as plain comparisons, used by overflows_type() when @x
 * is a constant expression. Three cases, chosen by signedness:
 *  - @x unsigned:              only the upper bound of T is checked;
 *  - @x signed, T unsigned:    @x must not be negative nor above T's maximum;
 *  - both signed:              @x must lie within [type_min(T), type_max(T)].
 */
#define __overflows_type_constexpr(x, T) (                        \
        is_unsigned_type(typeof(x)) ?                                \
                (x) > type_max(T) :                                \
        is_unsigned_type(typeof(T)) ?                                \
                (x) < 0 || (x) > type_max(T) :                        \
        (x) < type_min(T) || (x) > type_max(T))

/*
 * __overflows_type - runtime check of whether @x fits in the type of @T.
 * @x: value to test
 * @T: variable or type naming the destination type
 *
 * Adds @x to a zero of type typeof(@T) using check_add_overflow(); that sum
 * can only wrap when @x itself is not representable in the destination
 * type, so the result is true exactly when @x does not fit.
 *
 * The scratch variable deliberately has an unusual name: with a plain name
 * such as "v", a caller writing __overflows_type(v, T) would have its own
 * "v" shadowed by the scratch variable and the check would silently test 0.
 */
#define __overflows_type(x, T)                ({                        \
        typeof(T) __ovf_type_tmp = 0;                                \
        check_add_overflow((x), __ovf_type_tmp, &__ovf_type_tmp);        \
})

/**
 * overflows_type - helper for checking the overflows between value, variables,
 *                    or data type
 *
 * @n: source constant value or variable to be checked
 * @T: destination variable or data type proposed to store @n
 *
 * Compares the @n expression for whether or not it can safely fit in
 * the storage of the type in @T. @n and @T can have different types.
 * If @n is a constant expression, this will also resolve to a constant
 * expression.
 *
 * Returns: true if overflow can occur, false otherwise.
 */
#define overflows_type(n, T)                                        \
        __builtin_choose_expr(__is_constexpr(n),                \
                              __overflows_type_constexpr(n, T),        \
                              __overflows_type(n, T))

/**
 * range_overflows() - Check if a range is out of bounds
 * @start: Start of the range.
 * @size:  Size of the range.
 * @max:   Exclusive upper boundary.
 *
 * Strict check of the range [@start, @start + @size) against the allowed
 * range [0, @max). The range is accepted only when @start lies below @max
 * and @size fits into the room left above @start; in particular a range
 * starting at or beyond @max overflows even if @size is 0.
 *
 * All three arguments must have the same type (enforced at compile time by
 * the pointer comparisons) and each is evaluated exactly once.
 *
 * Returns: true if the range is out of bounds.
 */
#define range_overflows(start, size, max) ({ \
        typeof(start) ro_start__ = (start); \
        typeof(size) ro_size__ = (size); \
        typeof(max) ro_max__ = (max); \
        (void)(&ro_start__ == &ro_size__); \
        (void)(&ro_start__ == &ro_max__); \
        !(ro_start__ < ro_max__ && ro_size__ <= ro_max__ - ro_start__); \
})

/**
 * range_overflows_t() - Check if a range is out of bounds
 * @type:  Data type to use.
 * @start: Start of the range.
 * @size:  Size of the range.
 * @max:   Exclusive upper boundary.
 *
 * Same as range_overflows() but forcing the parameters to @type.
 * All three values are cast to @type before the check is made, so the
 * comparison operates on the converted values.
 *
 * Returns: true if the range is out of bounds.
 */
#define range_overflows_t(type, start, size, max) \
        range_overflows((type)(start), (type)(size), (type)(max))

/**
 * range_end_overflows() - Check if a range's endpoint is out of bounds
 * @start: Start of the range.
 * @size:  Size of the range.
 * @max:   Exclusive upper boundary.
 *
 * Checks only if the endpoint of a range (@start + @size) exceeds @max.
 * Unlike range_overflows(), a zero-sized range at the boundary (@start == @max)
 * is not considered an overflow. Useful for iterator-style checks.
 *
 * Each argument is copied into a local once. The two discarded pointer
 * comparisons have no runtime effect; they exist so that the compiler
 * complains when @start, @size and @max do not share one type. The
 * subtraction @max - @start is only evaluated when @start <= @max, because
 * the first half of the condition short-circuits otherwise.
 *
 * Returns: true if the endpoint exceeds the boundary.
 */
#define range_end_overflows(start, size, max) ({ \
        typeof(start) start__ = (start); \
        typeof(size) size__ = (size); \
        typeof(max) max__ = (max); \
        (void)(&start__ == &size__); \
        (void)(&start__ == &max__); \
        start__ > max__ || size__ > max__ - start__; \
})

/**
 * range_end_overflows_t() - Check if a range's endpoint is out of bounds
 * @type:  Data type to use.
 * @start: Start of the range.
 * @size:  Size of the range.
 * @max:   Exclusive upper boundary.
 *
 * Same as range_end_overflows() but forcing the parameters to @type.
 * All three values are cast to @type before the check is made, so the
 * comparison operates on the converted values.
 *
 * Returns: true if the endpoint exceeds the boundary.
 */
#define range_end_overflows_t(type, start, size, max) \
        range_end_overflows((type)(start), (type)(size), (type)(max))

/**
 * castable_to_type - like __same_type(), but also allows for casted literals
 *
 * @n: variable or constant value
 * @T: variable or data type
 *
 * Unlike the __same_type() macro, this allows a constant value as the
 * first argument. If this value would not overflow into an assignment
 * of the second argument's type, it returns true. Otherwise, this falls
 * back to __same_type().
 *
 * In other words: when @n is a constant expression the result is the
 * negation of __overflows_type_constexpr(@n, @T); for anything else the
 * result is __same_type(@n, @T).
 */
#define castable_to_type(n, T)                                                \
        __builtin_choose_expr(__is_constexpr(n),                        \
                              !__overflows_type_constexpr(n, T),        \
                              __same_type(n, T))

/**
 * size_mul() - Multiply two size_t values, saturating at SIZE_MAX
 * @factor1: first factor
 * @factor2: second factor
 *
 * Both arguments are converted to size_t by the call. Store the result in
 * a size_t lvalue so that no implicit type conversion takes place.
 *
 * Returns: @factor1 * @factor2, or SIZE_MAX when check_mul_overflow()
 * reports that the product overflowed.
 */
static inline size_t __must_check size_mul(size_t factor1, size_t factor2)
{
        size_t product;

        return check_mul_overflow(factor1, factor2, &product) ?
                SIZE_MAX : product;
}

/**
 * size_add() - Add two size_t values, saturating at SIZE_MAX
 * @addend1: first addend
 * @addend2: second addend
 *
 * Both arguments are converted to size_t by the call. Store the result in
 * a size_t lvalue so that no implicit type conversion takes place.
 *
 * Returns: @addend1 + @addend2, or SIZE_MAX when check_add_overflow()
 * reports that the sum overflowed.
 */
static inline size_t __must_check size_add(size_t addend1, size_t addend2)
{
        size_t sum;

        return check_add_overflow(addend1, addend2, &sum) ?
                SIZE_MAX : sum;
}

/**
 * size_sub() - Subtract two size_t values, saturating at SIZE_MAX
 * @minuend: value to subtract from
 * @subtrahend: value to subtract from @minuend
 *
 * Both arguments are converted to size_t by the call. So that this helper
 * composes with size_add() and size_mul(), an argument that is already
 * SIZE_MAX (i.e. a saturated earlier result) forces the result to SIZE_MAX
 * as well. Store the result in a size_t lvalue so that no implicit type
 * conversion takes place.
 *
 * Returns: @minuend - @subtrahend, or SIZE_MAX when either argument is
 * SIZE_MAX or check_sub_overflow() reports an overflow.
 */
static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
{
        size_t difference;

        /* A saturated input stays saturated. */
        if (minuend == SIZE_MAX || subtrahend == SIZE_MAX)
                return SIZE_MAX;

        if (check_sub_overflow(minuend, subtrahend, &difference))
                return SIZE_MAX;

        return difference;
}

/**
 * array_size() - Calculate size of 2-dimensional array.
 * @a: dimension one
 * @b: dimension two
 *
 * Calculates size of 2-dimensional array: @a * @b.
 * This is a direct alias for size_mul(), so both arguments are converted
 * to size_t and the result saturates.
 *
 * Returns: number of bytes needed to represent the array or SIZE_MAX on
 * overflow.
 */
#define array_size(a, b)        size_mul(a, b)

/**
 * array3_size() - Calculate size of 3-dimensional array.
 * @a: dimension one
 * @b: dimension two
 * @c: dimension three
 *
 * Calculates size of 3-dimensional array: @a * @b * @c.
 * Implemented as two nested size_mul() calls: @a * @b is computed first and
 * the (possibly saturated) result is then multiplied by @c.
 *
 * Returns: number of bytes needed to represent the array or SIZE_MAX on
 * overflow.
 */
#define array3_size(a, b, c)        size_mul(size_mul(a, b), c)

/**
 * flex_array_size() - Calculate size of a flexible array member
 *                     within an enclosing structure.
 * @p: Pointer to the structure.
 * @member: Name of the flexible array member.
 * @count: Number of elements in the array.
 *
 * Calculates size of a flexible array of @count number of @member
 * elements, at the end of structure @p.
 *
 * When @count is a constant expression (per __is_constexpr()), the size is
 * computed with a plain multiplication; only the non-constant case goes
 * through size_mul() and therefore saturates. @p is only used to name the
 * member's type: it appears inside sizeof() and __must_be_array().
 * NOTE(review): __must_be_array() is added into the sum for its build-time
 * check and is presumably 0 for a real array - confirm against its
 * definition.
 *
 * Return: number of bytes needed or SIZE_MAX on overflow.
 */
#define flex_array_size(p, member, count)                                \
        __builtin_choose_expr(__is_constexpr(count),                        \
                (count) * sizeof(*(p)->member) + __must_be_array((p)->member),        \
                size_mul(count, sizeof(*(p)->member) + __must_be_array((p)->member)))

/**
 * struct_size() - Calculate size of structure with trailing flexible array.
 * @p: Pointer to the structure.
 * @member: Name of the array member.
 * @count: Number of elements in the array.
 *
 * Calculates size of memory needed for structure of @p followed by an
 * array of @count number of @member elements.
 *
 * When @count is a constant expression (per __is_constexpr()), sizeof(*@p)
 * and flex_array_size() are combined with a plain addition; only the
 * non-constant case goes through size_add() and therefore saturates.
 *
 * Return: number of bytes needed or SIZE_MAX on overflow.
 */
#define struct_size(p, member, count)                                        \
        __builtin_choose_expr(__is_constexpr(count),                        \
                sizeof(*(p)) + flex_array_size(p, member, count),        \
                size_add(sizeof(*(p)), flex_array_size(p, member, count)))

/**
 * struct_size_t() - Calculate size of structure with trailing flexible array
 * @type: structure type name.
 * @member: Name of the array member.
 * @count: Number of elements in the array.
 *
 * Calculates size of memory needed for structure @type followed by an
 * array of @count number of @member elements. Prefer using struct_size()
 * when possible instead, to keep calculations associated with a specific
 * instance variable of type @type.
 *
 * Implemented by handing struct_size() a NULL pointer cast to @type *; that
 * pointer only supplies the type (struct_size() and flex_array_size() use
 * it inside sizeof() and __must_be_array()).
 *
 * Return: number of bytes needed or SIZE_MAX on overflow.
 */
#define struct_size_t(type, member, count)                                        \
        struct_size((type *)NULL, member, count)

/**
 * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
 * Enables caller macro to pass arbitrary trailing expressions
 *
 * @type: structure type name, including "struct" keyword.
 * @name: Name for a variable to define.
 * @member: Name of the array member.
 * @count: Number of elements in the array; must be compile-time const.
 * @trailer: Trailing expressions for attributes and/or initializers.
 *
 * Expands to three declarations: a _Static_assert() that @count satisfies
 * __builtin_constant_p(), a union variable named @name followed by "_u"
 * that overlays a u8 array of struct_size_t(@type, @member, @count) bytes
 * with one @type object (@trailer is placed right after that declarator),
 * and a pointer @name of type "@type *" pointing at the union. The final
 * declaration has no terminating semicolon; the user supplies it.
 */
#define __DEFINE_FLEX(type, name, member, count, trailer...)                        \
        _Static_assert(__builtin_constant_p(count),                                \
                       "onstack flex array members require compile-time const count"); \
        union {                                                                        \
                u8 bytes[struct_size_t(type, member, count)];                        \
                type obj;                                                        \
        } name##_u trailer;                                                        \
        type *name = (type *)&name##_u

/**
 * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
 * Enables caller macro to pass (different) initializer.
 *
 * @type: structure type name, including "struct" keyword.
 * @name: Name for a variable to define.
 * @member: Name of the array member.
 * @count: Number of elements in the array; must be compile-time const.
 * @initializer: Initializer expression (e.g., pass `= { }` at minimum).
 *
 * Forwards to __DEFINE_FLEX() with the trailer `= { .obj initializer }`,
 * i.e. @initializer (including its leading `=`) is applied to the .obj
 * member of the backing union.
 */
#define _DEFINE_FLEX(type, name, member, count, initializer...)                        \
        __DEFINE_FLEX(type, name, member, count, = { .obj initializer })

/**
 * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing
 * flexible array member, when it does not have a __counted_by annotation.
 *
 * @type: structure type name, including "struct" keyword.
 * @name: Name for a variable to define.
 * @member: Name of the array member.
 * @count: Number of elements in the array; must be compile-time const.
 *
 * Define a zeroed, on-stack, instance of @type structure with a trailing
 * flexible array member. The zeroing comes from the empty `= { }`
 * initializer passed to __DEFINE_FLEX() as trailer.
 * Use __struct_size(@name) to get compile-time size of it afterwards.
 * Use __member_size(@name->member) to get compile-time size of @name members.
 * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of
 * elements in array @member.
 */
#define DEFINE_RAW_FLEX(type, name, member, count)        \
        __DEFINE_FLEX(type, name, member, count, = { })

/**
 * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing
 * flexible array member.
 *
 * @TYPE: structure type name, including "struct" keyword.
 * @NAME: Name for a variable to define.
 * @MEMBER: Name of the array member.
 * @COUNTER: Name of the __counted_by member.
 * @COUNT: Number of elements in the array; must be compile-time const.
 *
 * Define a zeroed, on-stack, instance of @TYPE structure with a trailing
 * flexible array member. The @COUNTER member is initialized to @COUNT.
 * Use __struct_size(@NAME) to get compile-time size of it afterwards.
 * Use __member_size(@NAME->member) to get compile-time size of @NAME members.
 * Use STACK_FLEX_ARRAY_SIZE(@NAME, @MEMBER) to get compile-time number of
 * elements in array @MEMBER.
 */
#define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT)        \
        _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .COUNTER = COUNT, })

/**
 * STACK_FLEX_ARRAY_SIZE() - helper macro for DEFINE_FLEX() family.
 * Returns the number of elements in @array.
 *
 * @name: Name for a variable defined in DEFINE_RAW_FLEX()/DEFINE_FLEX().
 * @array: Name of the array member.
 *
 * The count is __member_size(@name->@array) divided by the size of one
 * element of @array.
 * NOTE(review): __must_be_array() is added into the sum for its build-time
 * check and is presumably 0 for a real array - confirm against its
 * definition.
 */
#define STACK_FLEX_ARRAY_SIZE(name, array)                                                \
        (__member_size((name)->array) / sizeof(*(name)->array) +                        \
                                                __must_be_array((name)->array))

#endif /* __LINUX_OVERFLOW_H */




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TRACE_SYSCALL_H
#define _TRACE_SYSCALL_H

#include <linux/tracepoint.h>
#include <linux/unistd.h>
#include <linux/trace_events.h>
#include <linux/thread_info.h>

#include <asm/ptrace.h>


/*
 * A syscall entry in the ftrace syscalls array.
 *
 * Describes one system call for the syscall trace events: its name and
 * number, a textual description of its parameters, and the enter/exit
 * trace events tied to it.
 *
 * @name: name of the syscall
 * @syscall_nr: number of the syscall
 * @nb_args: number of parameters it takes
 * @types: list of types as strings
 * @args: list of args as strings (args[i] matches types[i])
 * @enter_fields: list of fields for syscall_enter trace event
 * @enter_event: associated syscall_enter trace event
 * @exit_event: associated syscall_exit trace event
 *
 * NOTE(review): @types and @args presumably each hold @nb_args entries -
 * confirm against the code that fills this structure in.
 */
struct syscall_metadata {
        const char        *name;
        int                syscall_nr;
        int                nb_args;
        const char        **types;
        const char        **args;
        struct list_head enter_fields;

        struct trace_event_call *enter_event;
        struct trace_event_call *exit_event;
};

#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS)
/*
 * Bring @p's SYSCALL_TRACEPOINT syscall-work flag in line with the result
 * of test_syscall_work(SYSCALL_TRACEPOINT): the flag is set on @p when that
 * test is true and cleared on @p otherwise.
 * NOTE(review): test_syscall_work() takes no task argument, so it presumably
 * inspects the calling task - confirm against its definition.
 */
static inline void syscall_tracepoint_update(struct task_struct *p)
{
        if (test_syscall_work(SYSCALL_TRACEPOINT))
                set_task_syscall_work(p, SYSCALL_TRACEPOINT);
        else
                clear_task_syscall_work(p, SYSCALL_TRACEPOINT);
}
#else
/* Syscall tracepoints are not configured in: there is nothing to update. */
static inline void syscall_tracepoint_update(struct task_struct *p)
{
}
#endif

#endif /* _TRACE_SYSCALL_H */














































































































































































































































































































































































   15 
    1 






































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_STRING_H_
#define _LINUX_STRING_H_

#include <linux/args.h>
#include <linux/array_size.h>
#include <linux/cleanup.h>        /* for DEFINE_FREE() */
#include <linux/compiler.h>        /* for inline */
#include <linux/types.h>        /* for size_t */
#include <linux/stddef.h>        /* for NULL */
#include <linux/err.h>                /* for ERR_PTR() */
#include <linux/errno.h>        /* for E2BIG */
#include <linux/overflow.h>        /* for check_mul_overflow() */
#include <linux/stdarg.h>
#include <uapi/linux/string.h>

extern char *strndup_user(const char __user *, long);
extern void *memdup_user(const void __user *, size_t) __realloc_size(2);
extern void *vmemdup_user(const void __user *, size_t) __realloc_size(2);
extern void *memdup_user_nul(const void __user *, size_t);

/**
 * memdup_array_user - duplicate array from user space
 * @src: source address in user space
 * @n: number of array members to copy
 * @size: size of one array member
 *
 * The byte count @n * @size is computed with check_mul_overflow(); the copy
 * itself is delegated to memdup_user().
 *
 * Return: ERR_PTR(-EOVERFLOW) if @n * @size overflows, otherwise the result
 * of memdup_user(), which is an ERR_PTR() on failure. Result is physically
 * contiguous, to be freed by kfree().
 */
static inline __realloc_size(2, 3)
void *memdup_array_user(const void __user *src, size_t n, size_t size)
{
        size_t total;

        if (!check_mul_overflow(n, size, &total))
                return memdup_user(src, total);

        return ERR_PTR(-EOVERFLOW);
}

/**
 * vmemdup_array_user - duplicate array from user space
 * @src: source address in user space
 * @n: number of array members to copy
 * @size: size of one array member
 *
 * The byte count @n * @size is computed with check_mul_overflow(); the copy
 * itself is delegated to vmemdup_user().
 *
 * Return: ERR_PTR(-EOVERFLOW) if @n * @size overflows, otherwise the result
 * of vmemdup_user(), which is an ERR_PTR() on failure. Result may be not
 * physically contiguous. Use kvfree() to free.
 */
static inline __realloc_size(2, 3)
void *vmemdup_array_user(const void __user *src, size_t n, size_t size)
{
        size_t total;

        if (!check_mul_overflow(n, size, &total))
                return vmemdup_user(src, total);

        return ERR_PTR(-EOVERFLOW);
}

/*
 * Include machine specific inline routines
 */
#include <asm/string.h>

#ifndef __HAVE_ARCH_STRCPY
extern char * strcpy(char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRNCPY
extern char * strncpy(char *,const char *, __kernel_size_t);
#endif
ssize_t sized_strscpy(char *, const char *, size_t);

/*
 * Back ends for strscpy() and strscpy_pad(), which pick the variant whose
 * numeric suffix equals the number of optional arguments they were given
 * (0 or 1).
 *
 * The 2 argument style can only be used when dst is an array with a
 * known size: the "0" variants take the size from sizeof(dst) and add
 * __must_be_array(dst) to enforce that. The "1" variants use the caller's
 * size. All variants add __must_be_cstr() on both dst and src.
 * NOTE(review): the __must_be_*() helpers are added into the size for their
 * build-time checks and are presumably 0 - confirm against their
 * definitions.
 */
#define __strscpy0(dst, src, ...)        \
        sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst) +        \
                                __must_be_cstr(dst) + __must_be_cstr(src))
#define __strscpy1(dst, src, size)        \
        sized_strscpy(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src))

#define __strscpy_pad0(dst, src, ...)        \
        sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst) +        \
                                    __must_be_cstr(dst) + __must_be_cstr(src))
#define __strscpy_pad1(dst, src, size)        \
        sized_strscpy_pad(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src))

/**
 * strscpy - Copy a C-string into a sized buffer
 * @dst: Where to copy the string to
 * @src: Where to copy the string from
 * @...: Size of destination buffer (optional)
 *
 * Copy the source string @src, or as much of it as fits, into the
 * destination @dst buffer. The behavior is undefined if the string
 * buffers overlap. The destination @dst buffer is always NUL terminated,
 * unless it's zero-sized.
 *
 * The size argument @... is only required when @dst is not an array, or
 * when the copy needs to be smaller than sizeof(@dst).
 *
 * Preferred to strncpy() since it always returns a valid string, and
 * doesn't unnecessarily force the tail of the destination buffer to be
 * zero padded. If padding is desired please use strscpy_pad().
 *
 * Expands to __strscpy0() when no size is given and to __strscpy1() when
 * one is, selected by COUNT_ARGS() on the optional arguments.
 *
 * Returns the number of characters copied in @dst (not including the
 * trailing %NUL) or -E2BIG if the destination size is 0 or the copy from
 * @src was truncated.
 */
#define strscpy(dst, src, ...)        \
        CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__)

/*
 * Back end of strscpy_pad(): copy @src into @dest with sized_strscpy() and,
 * when that call returned a non-negative length smaller than @count, fill
 * the bytes of @dest that follow the copied string's terminator, up to
 * @count, with zeroes. Nothing is padded when sized_strscpy() returned a
 * negative value.
 *
 * Each argument is evaluated once. The statement expression evaluates to
 * the value returned by sized_strscpy().
 */
#define sized_strscpy_pad(dest, src, count)        ({                        \
        char *__dst = (dest);                                                \
        const char *__src = (src);                                        \
        const size_t __count = (count);                                        \
        ssize_t __wrote;                                                \
                                                                        \
        __wrote = sized_strscpy(__dst, __src, __count);                        \
        if (__wrote >= 0 && __wrote < __count)                                \
                memset(__dst + __wrote + 1, 0, __count - __wrote - 1);        \
        __wrote;                                                        \
})

/**
 * strscpy_pad() - Copy a C-string into a sized buffer
 * @dst: Where to copy the string to
 * @src: Where to copy the string from
 * @...: Size of destination buffer (optional)
 *
 * Copy the string, or as much of it as fits, into the dest buffer. The
 * behavior is undefined if the string buffers overlap. The destination
 * buffer is always %NUL terminated, unless it's zero-sized.
 *
 * If the source string is shorter than the destination buffer, the
 * remaining bytes in the buffer will be filled with %NUL bytes.
 *
 * As with strscpy(), the size argument may be left out when @dst is an
 * array: the macro then expands to __strscpy_pad0(), which uses
 * sizeof(@dst); with a size it expands to __strscpy_pad1().
 *
 * For full explanation of why you may want to consider using the
 * 'strscpy' functions please see the function docstring for strscpy().
 *
 * Returns:
 * * The number of characters copied (not including the trailing %NULs)
 * * -E2BIG if count is 0 or @src was truncated.
 */
#define strscpy_pad(dst, src, ...)        \
        CONCATENATE(__strscpy_pad, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__)

#ifndef __HAVE_ARCH_STRCAT
extern char * strcat(char *, const char *);
#endif
#ifndef __HAVE_ARCH_STRNCAT
extern char * strncat(char *, const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRLCAT
extern size_t strlcat(char *, const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRCMP
extern int strcmp(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRNCMP
extern int strncmp(const char *,const char *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRCASECMP
extern int strcasecmp(const char *s1, const char *s2);
#endif
#ifndef __HAVE_ARCH_STRNCASECMP
extern int strncasecmp(const char *s1, const char *s2, size_t n);
#endif
#ifndef __HAVE_ARCH_STRCHR
extern char * strchr(const char *,int);
#endif
#ifndef __HAVE_ARCH_STRCHRNUL
extern char * strchrnul(const char *,int);
#endif
extern char * strnchrnul(const char *, size_t, int);
#ifndef __HAVE_ARCH_STRNCHR
extern char * strnchr(const char *, size_t, int);
#endif
#ifndef __HAVE_ARCH_STRRCHR
extern char * strrchr(const char *,int);
#endif
extern char * __must_check skip_spaces(const char *);

extern char *strim(char *);

/*
 * strstrip - wrapper around strim() whose result must not be ignored.
 * @str: string handed to strim()
 *
 * Returns whatever strim(@str) returns; the __must_check annotation makes
 * callers use that return value.
 */
static inline __must_check char *strstrip(char *str)
{
        return strim(str);
}

#ifndef __HAVE_ARCH_STRSTR
extern char * strstr(const char *, const char *);
#endif
#ifndef __HAVE_ARCH_STRNSTR
extern char * strnstr(const char *, const char *, size_t);
#endif
#ifndef __HAVE_ARCH_STRLEN
extern __kernel_size_t strlen(const char *);
#endif
#ifndef __HAVE_ARCH_STRNLEN
extern __kernel_size_t strnlen(const char *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRPBRK
extern char * strpbrk(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRSEP
extern char * strsep(char **,const char *);
#endif
#ifndef __HAVE_ARCH_STRSPN
extern __kernel_size_t strspn(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRCSPN
extern __kernel_size_t strcspn(const char *,const char *);
#endif

#ifndef __HAVE_ARCH_MEMSET
extern void * memset(void *,int,__kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET16
extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET32
extern void *memset32(uint32_t *, uint32_t, __kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET64
extern void *memset64(uint64_t *, uint64_t, __kernel_size_t);
#endif

/*
 * memset_l - fill an array of unsigned long with the value @v.
 *
 * Forwards to memset32() when BITS_PER_LONG is 32 and to memset64()
 * otherwise, passing @n through unchanged, and returns that helper's
 * result.
 */
static inline void *memset_l(unsigned long *p, unsigned long v,
                __kernel_size_t n)
{
        return BITS_PER_LONG == 32 ? memset32((uint32_t *)p, v, n)
                                   : memset64((uint64_t *)p, v, n);
}

/*
 * memset_p - fill an array of pointers with the pointer value @v.
 *
 * @v is converted to uintptr_t and stored with memset32() when
 * BITS_PER_LONG is 32, with memset64() otherwise; @n is passed through
 * unchanged and the helper's result is returned.
 */
static inline void *memset_p(void **p, void *v, __kernel_size_t n)
{
        return BITS_PER_LONG == 32 ? memset32((uint32_t *)p, (uintptr_t)v, n)
                                   : memset64((uint64_t *)p, (uintptr_t)v, n);
}

extern void **__memcat_p(void **a, void **b);
/*
 * Type-checked front end for __memcat_p(): the build fails with
 * "type mismatch in memcat_p()" unless *(a) and *(b) have the same type.
 * Both arguments are passed on as void ** and the result is cast back to a
 * pointer to typeof(*a).
 */
#define memcat_p(a, b) ({                                        \
        BUILD_BUG_ON_MSG(!__same_type(*(a), *(b)),                \
                         "type mismatch in memcat_p()");        \
        (typeof(*a) *)__memcat_p((void **)(a), (void **)(b));        \
})

#ifndef __HAVE_ARCH_MEMCPY
extern void * memcpy(void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMMOVE
extern void * memmove(void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMSCAN
extern void * memscan(void *,int,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCMP
extern int memcmp(const void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_BCMP
extern int bcmp(const void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCHR
extern void * memchr(const void *,int,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE
/*
 * Generic fallback, used when the architecture does not define
 * __HAVE_ARCH_MEMCPY_FLUSHCACHE: a plain memcpy() of @cnt bytes from @src
 * to @dst with no additional cache handling.
 */
static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
}
#endif

void *memchr_inv(const void *s, int c, size_t n);
char *strreplace(char *str, char old, char new);

/**
 * mem_is_zero - Check if an area of memory is all 0's.
 * @s: The memory area
 * @n: The size of the area
 *
 * Implemented on top of memchr_inv(): the area counts as all-zero when
 * memchr_inv(@s, 0, @n) returns NULL.
 *
 * Return: True if the area of memory is all 0's.
 */
static inline bool mem_is_zero(const void *s, size_t n)
{
        const void *nonzero = memchr_inv(s, 0, n);

        return nonzero == NULL;
}

extern void kfree_const(const void *x);

extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
#define kmemdup(...)        alloc_hooks(kmemdup_noprof(__VA_ARGS__))

extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
                __realloc_size(2, 3);

/* lib/argv_split.c */
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
extern void argv_free(char **argv);

/*
 * Cleanup helper named "argv_free" for char ** values: the cleanup action
 * calls argv_free() on the value unless it is NULL or an error pointer
 * (IS_ERR_OR_NULL()).
 * NOTE(review): presumably meant for use with __free(argv_free) from
 * <linux/cleanup.h> - confirm against DEFINE_FREE()'s documentation.
 */
DEFINE_FREE(argv_free, char **, if (!IS_ERR_OR_NULL(_T)) argv_free(_T))

/* lib/cmdline.c */
extern int get_option(char **str, int *pint);
extern char *get_options(const char *str, int nints, int *ints);
extern unsigned long long memparse(const char *ptr, char **retptr);
extern bool parse_option_str(const char *str, const char *option);
extern char *next_arg(char *args, char **param, char **val);

extern bool sysfs_streq(const char *s1, const char *s2);
int match_string(const char * const *array, size_t n, const char *string);
int __sysfs_match_string(const char * const *array, size_t n, const char *s);

/**
 * sysfs_match_string - matches given string in an array
 * @_a: array of strings
 * @_s: string to match with
 *
 * Helper for __sysfs_match_string(). Calculates the size of @_a automatically
 * with ARRAY_SIZE(), so @_a must be a real array rather than a pointer.
 */
#define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s)

#ifdef CONFIG_BINARY_PRINTF
__printf(3, 0) int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args);
__printf(3, 0) int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf);
#endif

extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                       const void *from, size_t available);

int ptr_to_hashval(const void *ptr, unsigned long *hashval_out);

size_t memweight(const void *ptr, size_t bytes);

/**
 * memzero_explicit - Fill a region of memory (e.g. sensitive
 *                      keying data) with 0s.
 * @s: Pointer to the start of the area.
 * @count: The size of the area.
 *
 * Note: usually using memset() is just fine (!), but in cases
 * where clearing out _local_ data at the end of a scope is
 * necessary, memzero_explicit() should be used instead in
 * order to prevent the compiler from optimising away zeroing.
 *
 * memzero_explicit() doesn't need an arch-specific version as
 * it just invokes the one of memset() implicitly.
 */
static inline void memzero_explicit(void *s, size_t count)
{
        memset(s, 0, count);
        /*
         * Must come after the memset(): barrier_data() on the zeroed
         * buffer is the only difference from a plain memset(), i.e. it
         * is what keeps the store from being optimised away.
         */
        barrier_data(s);
}

/**
 * kbasename - return the last part of a pathname.
 *
 * @path: path to extract the filename from.
 *
 * Return: pointer into @path just past its last '/', or @path itself when
 * it contains no '/'. A path ending in '/' yields an empty string.
 */
static inline const char *kbasename(const char *path)
{
        const char *last_slash = strrchr(path, '/');

        if (!last_slash)
                return path;

        return last_slash + 1;
}

#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
#include <linux/fortify-string.h>
#endif
/*
 * Fallback used when nothing included so far has defined unsafe_memcpy():
 * a plain memcpy() of @bytes bytes. The @justification argument is
 * dropped by the expansion; it only documents the call site.
 * NOTE(review): <linux/fortify-string.h>, conditionally included just
 * above, presumably provides the other definition - confirm.
 */
#ifndef unsafe_memcpy
#define unsafe_memcpy(dst, src, bytes, justification)                \
        memcpy(dst, src, bytes)
#endif

void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
                    int pad);

/**
 * strtomem_pad - Copy NUL-terminated string to non-NUL-terminated buffer
 *
 * @dest: Pointer of destination character array (marked as __nonstring)
 * @src: Pointer to NUL-terminated string
 * @pad: Padding character to fill any remaining bytes of @dest after copy
 *
 * This is a replacement for strncpy() uses where the destination is not
 * a NUL-terminated string, but with bounds checking on the source size, and
 * an explicit padding character. If padding is not required, use strtomem().
 *
 * Note that the size of @dest is not an argument, as the length of @dest
 * must be discoverable by the compiler.
 *
 * The destination length is ARRAY_SIZE(@dest) and the build fails unless it
 * is a compile-time constant other than (size_t)-1. The number of bytes
 * taken from @src is strnlen() of @src, bounded by the smaller of the
 * destination length and __builtin_object_size(@src, 1); the copy and the
 * padding are done by memcpy_and_pad(). @dest and @src are expanded more
 * than once.
 */
#define strtomem_pad(dest, src, pad)        do {                                \
        const size_t _dest_len = __must_be_byte_array(dest) +                \
                                 __must_be_noncstr(dest) +                \
                                 ARRAY_SIZE(dest);                        \
        const size_t _src_len = __must_be_cstr(src) +                        \
                                __builtin_object_size(src, 1);                \
                                                                        \
        BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||                \
                     _dest_len == (size_t)-1);                                \
        memcpy_and_pad(dest, _dest_len, src,                                \
                       strnlen(src, min(_src_len, _dest_len)), pad);        \
} while (0)

/**
 * strtomem - Copy NUL-terminated string to non-NUL-terminated buffer
 *
 * @dest: Pointer of destination character array (marked as __nonstring)
 * @src: Pointer to NUL-terminated string
 *
 * This is a replacement for strncpy() uses where the destination is not
 * a NUL-terminated string, but with bounds checking on the source size, and
 * without trailing padding. If padding is required, use strtomem_pad().
 *
 * Note that the size of @dest is not an argument, as the length of @dest
 * must be discoverable by the compiler.
 *
 * The destination length is ARRAY_SIZE(@dest) and the build fails unless it
 * is a compile-time constant other than (size_t)-1. The number of bytes
 * copied is strnlen() of @src, bounded by the smaller of the destination
 * length and __builtin_object_size(@src, 1); bytes of @dest beyond that are
 * left untouched. @dest and @src are expanded more than once.
 */
#define strtomem(dest, src)        do {                                        \
        const size_t _dest_len = __must_be_byte_array(dest) +                \
                                 __must_be_noncstr(dest) +                \
                                 ARRAY_SIZE(dest);                        \
        const size_t _src_len = __must_be_cstr(src) +                        \
                                __builtin_object_size(src, 1);                \
                                                                        \
        BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||                \
                     _dest_len == (size_t)-1);                                \
        memcpy(dest, src, strnlen(src, min(_src_len, _dest_len)));        \
} while (0)

/**
 * memtostr - Copy a possibly non-NUL-term string to a NUL-term string
 * @dest: Pointer to destination NUL-terminated string
 * @src: Pointer to character array (likely marked as __nonstring)
 *
 * This is a replacement for strncpy() uses where the source is not
 * a NUL-terminated string.
 *
 * Note that sizes of @dest and @src must be known at compile-time.
 *
 * The build fails unless ARRAY_SIZE(@dest) and
 * __builtin_object_size(@src, 1) are compile-time constants that are
 * neither 0 nor (size_t)-1. At most ARRAY_SIZE(@dest) - 1 characters are
 * copied, fewer if strnlen() finds a NUL in @src earlier, and a
 * terminating NUL is always written right after the copied characters.
 * Bytes of @dest beyond that NUL are left untouched. @dest and @src are
 * expanded more than once.
 */
#define memtostr(dest, src)        do {                                        \
        const size_t _dest_len = __must_be_byte_array(dest) +                \
                                 __must_be_cstr(dest) +                        \
                                 ARRAY_SIZE(dest);                        \
        const size_t _src_len = __must_be_noncstr(src) +                \
                                __builtin_object_size(src, 1);                \
        const size_t _src_chars = strnlen(src, _src_len);                \
        const size_t _copy_len = min(_dest_len - 1, _src_chars);        \
                                                                        \
        BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||                \
                     !__builtin_constant_p(_src_len) ||                        \
                     _dest_len == 0 || _dest_len == (size_t)-1 ||        \
                     _src_len == 0 || _src_len == (size_t)-1);                \
        memcpy(dest, src, _copy_len);                                        \
        dest[_copy_len] = '\0';                                                \
} while (0)

/**
 * memtostr_pad - Copy a possibly non-NUL-term string to a NUL-term string
 *                with NUL padding in the destination
 * @dest: Pointer to destination NUL-terminated string
 * @src: Pointer to character array (likely marked as __nonstring)
 *
 * This is a replacement for strncpy() uses where the source is not
 * a NUL-terminated string.
 *
 * At most ARRAY_SIZE(@dest) - 1 bytes are copied (fewer if @src contains a
 * NUL within its object size). Every remaining byte of @dest, from the end
 * of the copied data through the last element, is then set to NUL, so
 * @dest is both terminated and fully padded.
 *
 * Note that sizes of @dest and @src must be known at compile-time.
 */
#define memtostr_pad(dest, src)                do {                                \
        const size_t _dest_len = __must_be_byte_array(dest) +                \
                                 __must_be_cstr(dest) +                        \
                                 ARRAY_SIZE(dest);                        \
        const size_t _src_len = __must_be_noncstr(src) +                \
                                __builtin_object_size(src, 1);                \
        const size_t _src_chars = strnlen(src, _src_len);                \
        const size_t _copy_len = min(_dest_len - 1, _src_chars);        \
                                                                        \
        BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||                \
                     !__builtin_constant_p(_src_len) ||                        \
                     _dest_len == 0 || _dest_len == (size_t)-1 ||        \
                     _src_len == 0 || _src_len == (size_t)-1);                \
        memcpy(dest, src, _copy_len);                                        \
        memset(&dest[_copy_len], 0, _dest_len - _copy_len);                \
} while (0)

/**
 * memset_after - Set a value after a struct member to the end of a struct
 *
 * @obj: Address of target struct instance
 * @v: Byte value to repeatedly write
 * @member: after which struct member to start writing bytes
 *
 * This is good for clearing padding following the given member.
 */
#define memset_after(obj, v, member)                                        \
({                                                                        \
        u8 *__base = (u8 *)(obj);                                        \
        typeof(v) __fill = (v);                                                \
        const size_t __skip = offsetofend(typeof(*(obj)), member);        \
                                                                        \
        /* Fill from just past @member through the last byte of *obj. */\
        memset(__base + __skip, __fill, sizeof(*(obj)) - __skip);        \
})

/**
 * memset_startat - Set a value starting at a member to the end of a struct
 *
 * @obj: Address of target struct instance
 * @v: Byte value to repeatedly write
 * @member: struct member to start writing at
 *
 * Note that if there is padding between the prior member and the target
 * member, memset_after() should be used to clear the prior padding.
 */
#define memset_startat(obj, v, member)                                        \
({                                                                        \
        u8 *__base = (u8 *)(obj);                                        \
        typeof(v) __fill = (v);                                                \
        const size_t __first = offsetof(typeof(*(obj)), member);        \
                                                                        \
        /* Fill from the first byte of @member through the end of *obj. */\
        memset(__base + __first, __fill, sizeof(*(obj)) - __first);        \
})

/**
 * str_has_prefix - Test if a string has a given prefix
 * @str: The string to test
 * @prefix: The string to see if @str starts with
 *
 * A common way to test a prefix of a string is to do:
 *  strncmp(str, prefix, sizeof(prefix) - 1)
 *
 * But this can lead to bugs due to typos, or if prefix is a pointer
 * and not a constant. Instead use str_has_prefix().
 *
 * Returns:
 * * strlen(@prefix) if @str starts with @prefix
 * * 0 if @str does not start with @prefix
 */
static __always_inline size_t str_has_prefix(const char *str, const char *prefix)
{
        const size_t prefix_len = strlen(prefix);

        /* Any mismatch within the first prefix_len bytes means "no prefix". */
        if (strncmp(str, prefix, prefix_len) != 0)
                return 0;

        /* Note: an empty @prefix always matches and therefore yields 0. */
        return prefix_len;
}

/**
 * strstarts - does @str start with @prefix?
 * @str: string to examine
 * @prefix: prefix to look for.
 */
static inline bool strstarts(const char *str, const char *prefix)
{
        const size_t prefix_len = strlen(prefix);

        /* Only the first prefix_len bytes of @str take part in the compare. */
        return !strncmp(str, prefix, prefix_len);
}

#endif /* _LINUX_STRING_H_ */














































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the TCP protocol.
 *
 * Version:        @(#)tcp.h        1.0.2        04/28/93
 *
 * Author:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_TCP_H
#define _LINUX_TCP_H


#include <linux/skbuff.h>
#include <linux/win_minmax.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <uapi/linux/tcp.h>

static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
        /* View the skb's transport header as a TCP header. */
        struct tcphdr *th = (struct tcphdr *)skb_transport_header(skb);

        return th;
}

static inline unsigned int __tcp_hdrlen(const struct tcphdr *th)
{
        /* The header length is the doff field scaled by four. */
        const unsigned int hdr_bytes = th->doff * 4;

        return hdr_bytes;
}

static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);

        /* Length of the TCP header found at the skb's transport header. */
        return __tcp_hdrlen(th);
}

static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
{
        /* View the skb's inner transport header as a TCP header. */
        struct tcphdr *inner_th =
                (struct tcphdr *)skb_inner_transport_header(skb);

        return inner_th;
}

static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb)
{
        return inner_tcp_hdr(skb)->doff * 4;
}

/**
 * skb_tcp_all_headers - Returns size of all headers for a TCP packet
 * @skb: buffer
 *
 * Used in TX path, for a packet known to be a TCP one.
 *
 * if (skb_is_gso(skb)) {
 *         int hlen = skb_tcp_all_headers(skb);
 *         ...
 */
static inline int skb_tcp_all_headers(const struct sk_buff *skb)
{
        /* Everything in front of the TCP header ... */
        const int l4_offset = skb_transport_offset(skb);

        /* ... plus the TCP header itself. */
        return l4_offset + tcp_hdrlen(skb);
}

/**
 * skb_inner_tcp_all_headers - Returns size of all headers for an encap TCP packet
 * @skb: buffer
 *
 * Used in TX path, for a packet known to be a TCP one.
 *
 * if (skb_is_gso(skb) && skb->encapsulation) {
 *         int hlen = skb_inner_tcp_all_headers(skb);
 *         ...
 */
static inline int skb_inner_tcp_all_headers(const struct sk_buff *skb)
{
        /* Everything in front of the inner TCP header ... */
        const int inner_l4_offset = skb_inner_transport_offset(skb);

        /* ... plus the inner TCP header itself. */
        return inner_l4_offset + inner_tcp_hdrlen(skb);
}

static inline unsigned int tcp_optlen(const struct sk_buff *skb)
{
        return (tcp_hdr(skb)->doff - 5) * 4;
}

/* TCP Fast Open: cookie size limits, all expressed in bytes */
#define TCP_FASTOPEN_COOKIE_MIN        4        /* Min Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_MAX        16        /* Max Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_SIZE 8        /* Cookie size (bytes) employed by this implementation */

/* TCP Fast Open Cookie as stored in memory */
struct tcp_fastopen_cookie {
        /* Cookie bytes; the array is sized to hold TCP_FASTOPEN_COOKIE_MAX bytes */
        __le64        val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))];
        /* Cookie length; signed, so presumably negative values are meaningful
         * to users of this struct - TODO confirm against callers.
         */
        s8        len;
        bool        exp;        /* In RFC6994 experimental option format */
};

/* This defines a selective acknowledgement block with both sequence
 * numbers held as big-endian (__be32) values.
 */
struct tcp_sack_block_wire {
        __be32        start_seq;
        __be32        end_seq;
};

/* Selective acknowledgement block with plain u32 sequence numbers
 * (compare struct tcp_sack_block_wire, which uses __be32 fields).
 */
struct tcp_sack_block {
        u32        start_seq;
        u32        end_seq;
};

/* These are used to set the sack_ok field in struct tcp_options_received */
#define TCP_SACK_SEEN     (1 << 0)   /* 1 = peer is SACK capable */
#define TCP_DSACK_SEEN    (1 << 2)   /* 1 = DSACK was received from peer */

/*
 * TCP option state recorded from received segments. The first u16 packs
 * eight bitfields that together use all 16 bits.
 */
struct tcp_options_received {
/*        PAWS/RTTM data        */
        int        ts_recent_stamp;/* Time we stored ts_recent (for aging) */
        u32        ts_recent;        /* Time stamp to echo next                */
        u32        rcv_tsval;        /* Time stamp value                     */
        u32        rcv_tsecr;        /* Time stamp echo reply                */
        u16         saw_tstamp : 1,        /* Saw TIMESTAMP on last packet                */
                tstamp_ok : 1,        /* TIMESTAMP seen on SYN packet                */
                dsack : 1,        /* D-SACK is scheduled                        */
                wscale_ok : 1,        /* Wscale seen on SYN packet                */
                sack_ok : 3,        /* SACK seen on SYN packet; holds TCP_SACK_SEEN / TCP_DSACK_SEEN */
                smc_ok : 1,        /* SMC seen on SYN packet                */
                snd_wscale : 4,        /* Window scaling received from sender        */
                rcv_wscale : 4;        /* Window scaling to send to receiver        */
        u8        accecn:6,        /* AccECN index in header, 0=no options        */
                saw_unknown:1,        /* Received unknown option                */
                unused:1;        /* Spare bit                                */
        u8        num_sacks;        /* Number of SACK blocks                */
        u16        user_mss;        /* mss requested by user in ioctl        */
        u16        mss_clamp;        /* Maximal mss, negotiated at connection setup */
};

static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
        /* Reset the SACK, timestamp and window-scale option state. */
        rx_opt->sack_ok = 0;
        rx_opt->tstamp_ok = 0;
        rx_opt->snd_wscale = 0;
        rx_opt->wscale_ok = 0;
#if IS_ENABLED(CONFIG_SMC)
        /* The SMC flag is only cleared when SMC support is enabled. */
        rx_opt->smc_ok = 0;
#endif
}

/* This is the max number of SACK blocks that we'll generate and process.
 * It's safe to increase this, although since:
 *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
 * only four SACK blocks will fit in a standard TCP header.
 */
#define TCP_NUM_SACKS 4

struct tcp_request_sock_ops;

/*
 * TCP-specific request sock. tcp_rsk() obtains a pointer to this struct by
 * casting a struct request_sock pointer, so @req has to stay the first
 * member (presumably struct inet_request_sock itself begins with a
 * struct request_sock - TODO confirm in its definition).
 */
struct tcp_request_sock {
        struct inet_request_sock         req;
        const struct tcp_request_sock_ops *af_specific;
        u64                                snt_synack; /* first SYNACK sent time */
        bool                                tfo_listener;
        bool                                is_mptcp;
        bool                                req_usec_ts;
#if IS_ENABLED(CONFIG_MPTCP)
        bool                                drop_req;
#endif
        u32                                txhash;
        u32                                rcv_isn;
        u32                                snt_isn;
        u32                                ts_off;
        u32                                snt_tsval_first;
        u32                                snt_tsval_last;
        u32                                last_oow_ack_time; /* last SYNACK */
        u32                                rcv_nxt; /* the ack # by SYNACK. For
                                                  * FastOpen it's the seq#
                                                  * after data-in-SYN.
                                                  */
        u8                                syn_tos;
        bool                                accecn_ok;
        u8                                syn_ect_snt: 2,
                                        syn_ect_rcv: 2,
                                        accecn_fail_mode:4;
        u8                                saw_accecn_opt  :2;
#ifdef CONFIG_TCP_AO
        /* TCP-AO state; only present when CONFIG_TCP_AO is set */
        u8                                ao_keyid;
        u8                                ao_rcv_next;
        bool                                used_tcp_ao; /* read via tcp_rsk_used_ao() */
#endif
};

static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
{
        /* Plain pointer cast; note that it also drops the const qualifier. */
        struct tcp_request_sock *treq = (struct tcp_request_sock *)req;

        return treq;
}

static inline bool tcp_rsk_used_ao(const struct request_sock *req)
{
#ifdef CONFIG_TCP_AO
        /* Report the flag stored in the TCP request sock. */
        return tcp_rsk(req)->used_tcp_ao;
#else
        /* Without CONFIG_TCP_AO the flag does not exist, so never true. */
        return false;
#endif
}

/* NOTE(review): not used within this header; judging by the name this is
 * presumably a shift/scale applied when converting receive memory to a
 * window (see scaling_ratio in struct tcp_sock) - confirm against its users.
 */
#define TCP_RMEM_TO_WIN_SCALE 8

/*
 * TCP socket state.
 *
 * The struct starts with the embedded struct inet_connection_sock. The
 * hot-path fields that follow are bracketed by __cacheline_group_begin() /
 * __cacheline_group_end() markers into six groups (tcp_sock_read_tx,
 * tcp_sock_read_txrx, tcp_sock_read_rx, tcp_sock_write_tx,
 * tcp_sock_write_txrx and tcp_sock_write_rx); everything after the
 * "End of Hot Path" marker is outside those groups. Several members only
 * exist under their respective CONFIG_* options.
 */
struct tcp_sock {
        /* Cacheline organization can be found documented in
         * Documentation/networking/net_cachelines/tcp_sock.rst.
         * Please update the document when adding new fields.
         */

        /* inet_connection_sock has to be the first member of tcp_sock */
        struct inet_connection_sock        inet_conn;

        /* TX read-mostly hotpath cache lines */
        __cacheline_group_begin(tcp_sock_read_tx);
        u32        max_window;        /* Maximal window ever seen from peer        */
        u32        rcv_ssthresh;        /* Current window clamp                        */
        u32        reordering;        /* Packet reordering metric.                */
        u32        notsent_lowat;        /* TCP_NOTSENT_LOWAT */
        u16        gso_segs;        /* Max number of segs per GSO packet        */
        /* from STCP, retrans queue hinting */
        struct sk_buff *retransmit_skb_hint;
#if defined(CONFIG_TLS_DEVICE)
        void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq);
#endif
        __cacheline_group_end(tcp_sock_read_tx);

        /* TXRX read-mostly hotpath cache lines */
        __cacheline_group_begin(tcp_sock_read_txrx);
        u32        tsoffset;        /* timestamp offset */
        u32        snd_wnd;        /* The window we expect to receive        */
        u32        mss_cache;        /* Cached effective mss, not including SACKS */
        u32        snd_cwnd;        /* Sending congestion window                */
        u32        prr_out;        /* Total number of pkts sent during Recovery. */
        u32        lost_out;        /* Lost packets                        */
        u32        sacked_out;        /* SACK'd packets                        */
        u16        tcp_header_len;        /* Bytes of tcp header to send                */
        u8        scaling_ratio;        /* see tcp_win_from_space() */
        u8        chrono_type : 2,        /* current chronograph type */
                repair      : 1,
                tcp_usec_ts : 1, /* TSval values in usec */
                is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
                is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
                recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */
        __cacheline_group_end(tcp_sock_read_txrx);

        /* RX read-mostly hotpath cache lines */
        __cacheline_group_begin(tcp_sock_read_rx);
        u32        copied_seq;        /* Head of yet unread data */
        u32        snd_wl1;        /* Sequence for window update                */
        u32        tlp_high_seq;        /* snd_nxt at the time of TLP */
        u32        rttvar_us;        /* smoothed mdev_max                        */
        u32        retrans_out;        /* Retransmitted packets out                */
        u16        advmss;                /* Advertised MSS                        */
        u16        urg_data;        /* Saved octet of OOB data and control flags */
        u32        lost;                /* Total data packets lost incl. rexmits */
        u32        snd_ssthresh;        /* Slow start size threshold                */
        struct  minmax rtt_min;
        /* OOO segments go in this rbtree. Socket lock must be held. */
        struct rb_root        out_of_order_queue;
        __cacheline_group_end(tcp_sock_read_rx);

        /* TX read-write hotpath cache lines */
        __cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned;
        u32        segs_out;        /* RFC4898 tcpEStatsPerfSegsOut
                                 * The total number of segments sent.
                                 */
        u32        data_segs_out;        /* RFC4898 tcpEStatsPerfDataSegsOut
                                 * total number of data segments sent.
                                 */
        u64        bytes_sent;        /* RFC4898 tcpEStatsPerfHCDataOctetsOut
                                 * total number of data bytes sent.
                                 */
        u32        snd_sml;        /* Last byte of the most recently transmitted small packet */
        u32        chrono_start;        /* Start time in jiffies of a TCP chrono */
        u32        chrono_stat[3];        /* Time in jiffies for chrono_stat stats */
        u32        write_seq;        /* Tail(+1) of data held in tcp send buffer */
        u32        pushed_seq;        /* Last pushed seq, required to talk to windows */
        u32        lsndtime;        /* timestamp of last sent data packet (for restart window) */
        u32        mdev_us;        /* medium deviation                        */
        u32        rtt_seq;        /* sequence number to update rttvar        */
        u64        tcp_wstamp_ns;        /* departure time for next sent data packet */
        u64        accecn_opt_tstamp;        /* Last AccECN option sent timestamp */
        struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
        struct sk_buff *highest_sack;   /* skb just after the highest
                                         * skb with SACKed bit set
                                         * (validity guaranteed only if
                                         * sacked_out > 0)
                                         */
        u8        ecn_flags;        /* ECN status bits.                        */
        __cacheline_group_end(tcp_sock_write_tx);

        /* TXRX read-write hotpath cache lines */
        __cacheline_group_begin(tcp_sock_write_txrx);
/*
 *        Header prediction flags
 *        0x5?10 << 16 + snd_wnd in net byte order
 */
        u8        nonagle     : 4,/* Disable Nagle algorithm?             */
                rate_app_limited:1;  /* rate_{delivered,interval_us} limited? */
        u8        received_ce_pending:4, /* Count of received_ce not yet transmitted */
                unused2:4;
        u8        accecn_minlen:2,/* Minimum length of AccECN option sent */
                est_ecnfield:2,/* ECN field for AccECN delivered estimates */
                accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
                prev_ecnfield:2; /* ECN bits from the previous segment */
        __be32        pred_flags;
        u64        tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
        u64        tcp_mstamp;        /* most recent packet received/sent */
        u32        rcv_nxt;        /* What we want to receive next                */
        u32        snd_nxt;        /* Next sequence we send                */
        u32        snd_una;        /* First byte we want an ack for        */
        u32        window_clamp;        /* Maximal window to advertise                */
        u32        srtt_us;        /* smoothed round trip time << 3 in usecs */
        u32        packets_out;        /* Packets which are "in flight"        */
        u32        snd_up;                /* Urgent pointer                */
        u32        delivered;        /* Total data packets delivered incl. rexmits */
        u32        delivered_ce;        /* Like the above but only ECE marked packets */
        u32        received_ce;        /* Like the above but for rcvd CE marked pkts */
        u32        received_ecn_bytes[3]; /* received byte counters for three ECN
                                        * types: INET_ECN_ECT_1, INET_ECN_ECT_0,
                                        * and INET_ECN_CE
                                        */
        u32        app_limited;        /* limited until "delivered" reaches this val */
        u32        rcv_wnd;        /* Current receiver window                */
        u32        rcv_tstamp;        /* timestamp of last received ACK (for keepalives) */
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
        struct tcp_options_received rx_opt;
        __cacheline_group_end(tcp_sock_write_txrx);

        /* RX read-write hotpath cache lines */
        __cacheline_group_begin(tcp_sock_write_rx) __aligned(8);
        u64        bytes_received;
                                /* RFC4898 tcpEStatsAppHCThruOctetsReceived
                                 * sum(delta(rcv_nxt)), or how many bytes
                                 * were received (and thus acked by us).
                                 */
        u32        segs_in;        /* RFC4898 tcpEStatsPerfSegsIn
                                 * total number of segments in.
                                 */
        u32        data_segs_in;        /* RFC4898 tcpEStatsPerfDataSegsIn
                                 * total number of data segments in.
                                 */
        u32        rcv_wup;        /* rcv_nxt on last window update sent        */
        u32        max_packets_out;  /* max packets_out in last window */
        u32        cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */
        u32        rate_delivered;    /* saved rate sample: packets delivered */
        u32        rate_interval_us;  /* saved rate sample: time elapsed */
        u32        rcv_rtt_last_tsecr;
        u32        delivered_ecn_bytes[3];
        u64        first_tx_mstamp;  /* start of window send phase */
        u64        delivered_mstamp; /* time we reached "delivered" */
        u64        bytes_acked;        /* RFC4898 tcpEStatsAppHCThruOctetsAcked
                                 * sum(delta(snd_una)), or how many bytes
                                 * were acked.
                                 */
/* Receiver RTT estimate (going by the member name rcv_rtt_est) */
        struct {
                u32        rtt_us;
                u32        seq;
                u64        time;
        } rcv_rtt_est;
/* Receiver queue space */
        struct {
                int        space;
                u32        seq;
                u64        time;
        } rcvq_space;
        __cacheline_group_end(tcp_sock_write_rx);
        /* End of Hot Path */

/*
 *        RFC793 variables by their proper names. This means you can
 *        read the code and the spec side by side (and laugh ...)
 *        See RFC793 and RFC1122. The RFC writes these in capitals.
 */
        u32        dsack_dups;        /* RFC4898 tcpEStatsStackDSACKDups
                                 * total number of DSACK blocks received
                                 */
        u32        compressed_ack_rcv_nxt;
        struct list_head tsq_node; /* anchor in tsq_tasklet.head list */

        /* Information of the most recently (s)acked skb */
        struct tcp_rack {
                u64 mstamp; /* (Re)sent time of the skb */
                u32 rtt_us;  /* Associated RTT */
                u32 end_seq; /* Ending TCP sequence of the skb */
                u32 last_delivered; /* tp->delivered at last reo_wnd adj */
                u8 reo_wnd_steps;   /* Allowed reordering window */
#define TCP_RACK_RECOVERY_THRESH 16
                u8 reo_wnd_persist:5, /* No. of recovery since last adj */
                   dsack_seen:1, /* Whether DSACK seen after last adj */
                   advanced:1;         /* mstamp advanced since last lost marking */
        } rack;
        u8        compressed_ack;
        u8        dup_ack_counter:2,
                tlp_retrans:1,        /* TLP is a retransmission */
                syn_ect_snt:2,        /* AccECN ECT memory, only */
                syn_ect_rcv:2;        /* ... needed during 3WHS + first seqno */
        u8        thin_lto    : 1,/* Use linear timeouts for thin streams */
                fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
                fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
                fastopen_client_fail:2, /* reason why fastopen failed */
                frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
        u8        repair_queue;
        u8        save_syn:2,        /* Save headers of SYN packet */
                syn_data:1,        /* SYN includes data */
                syn_fastopen:1,        /* SYN includes Fast Open option */
                syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
                syn_fastopen_ch:1, /* Active TFO re-enabling probe */
                syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
                syn_fastopen_child:1; /* created TFO passive child socket */

        u8        keepalive_probes; /* num of allowed keep alive probes        */
        u8        accecn_fail_mode:4,        /* AccECN failure handling */
                saw_accecn_opt:2;        /* An AccECN option was seen */
        u32        tcp_tx_delay;        /* delay (in usec) added to TX packets */

/* RTT measurement */
        u32        mdev_max_us;        /* maximal mdev for the last rtt period        */

        u32        reord_seen;        /* number of data packet reordering events */

/*
 *        Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
        u32        snd_cwnd_cnt;        /* Linear increase counter                */
        u32        snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
        u32        snd_cwnd_used;
        u32        snd_cwnd_stamp;
        u32        prior_cwnd;        /* cwnd right before starting loss recovery */
        u32        prr_delivered;        /* Number of newly delivered packets to
                                 * receiver in Recovery. */
        u32        last_oow_ack_time;  /* timestamp of last out-of-window ACK */

        struct hrtimer        pacing_timer;
        struct hrtimer        compressed_ack_timer;

        struct sk_buff        *ooo_last_skb; /* cache rb_last(out_of_order_queue) */

        /* SACKs data, these 2 need to be together (see tcp_options_write) */
        struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
        struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

        struct tcp_sack_block recv_sack_cache[4];

        u32        prior_ssthresh; /* ssthresh saved at recovery start        */
        u32        high_seq;        /* snd_nxt at onset of congestion        */

        u32        retrans_stamp;        /* Timestamp of the last retransmit,
                                 * also used in SYN-SENT to remember stamp of
                                 * the first SYN. */
        u32        undo_marker;        /* snd_una upon a new recovery episode. */
        int        undo_retrans;        /* number of undoable retransmissions. */
        u32        mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
                           * while socket was owned by user.
                           */
        u64        bytes_retrans;        /* RFC4898 tcpEStatsPerfOctetsRetrans
                                 * Total data bytes retransmitted
                                 */
        u32        total_retrans;        /* Total retransmits for entire connection */
        u32        rto_stamp;        /* Start time (ms) of last CA_Loss recovery */
        u16        total_rto;        /* Total number of RTO timeouts, including
                                 * SYN/SYN-ACK and recurring timeouts.
                                 */
        u16        total_rto_recoveries;        /* Total number of RTO recoveries,
                                         * including any unfinished recovery.
                                         */
        u32        total_rto_time;        /* ms spent in (completed) RTO recoveries. */

        u32        urg_seq;        /* Seq of received urgent pointer */
        unsigned int                keepalive_time;          /* time before keep alive takes place */
        unsigned int                keepalive_intvl;  /* time interval between keep alive probes */

        int                        linger2;


/* Sock_ops bpf program related variables */
#ifdef CONFIG_BPF
        u8        bpf_sock_ops_cb_flags;  /* Control calling BPF programs
                                         * values defined in uapi/linux/tcp.h
                                         */
        u8        bpf_chg_cc_inprogress:1; /* In the middle of
                                          * bpf_setsockopt(TCP_CONGESTION),
                                          * it is to avoid the bpf_tcp_cc->init()
                                          * to recur itself by calling
                                          * bpf_setsockopt(TCP_CONGESTION, "itself").
                                          */
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
#else
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
#endif

        u16 timeout_rehash;        /* Timeout-triggered rehash attempts */

        u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */

/* TCP-specific MTU probe information. */
        struct {
                u32                  probe_seq_start;
                u32                  probe_seq_end;
        } mtu_probe;
        u32     plb_rehash;     /* PLB-triggered rehash attempts */
#if IS_ENABLED(CONFIG_MPTCP)
        bool        is_mptcp;
#endif
#if IS_ENABLED(CONFIG_SMC)
        bool        syn_smc;        /* SYN includes SMC */
        bool        (*smc_hs_congested)(const struct sock *sk);
#endif

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */
        const struct tcp_sock_af_ops        *af_specific;

#ifdef CONFIG_TCP_MD5SIG
/* TCP MD5 Signature Option information */
        struct tcp_md5sig_info        __rcu *md5sig_info;
#endif
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info        __rcu *ao_info;
#endif
#endif

/* TCP fastopen related information */
        struct tcp_fastopen_request *fastopen_req;
        /* fastopen_rsk points to request_sock that resulted in this big
         * socket. Used to retransmit SYNACKs etc.
         */
        struct request_sock __rcu *fastopen_rsk;
        struct saved_syn *saved_syn;
};

/* Bit numbers for per-socket deferred-work flags. Each value here has a
 * matching mask in enum tsq_flags, built with BIT().
 */
enum tsq_enum {
        TSQ_THROTTLED,
        TSQ_QUEUED,
        TCP_TSQ_DEFERRED,           /* tcp_tasklet_func() found socket was owned */
        TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
        TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
        TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
                                    * tcp_v{4|6}_mtu_reduced()
                                    */
        TCP_ACK_DEFERRED,           /* TX pure ack is deferred */
};

/* Mask form of enum tsq_enum: every constant is BIT() of the bit number
 * with the corresponding name.
 */
enum tsq_flags {
        TSQF_THROTTLED                        = BIT(TSQ_THROTTLED),
        TSQF_QUEUED                        = BIT(TSQ_QUEUED),
        TCPF_TSQ_DEFERRED                = BIT(TCP_TSQ_DEFERRED),
        TCPF_WRITE_TIMER_DEFERRED        = BIT(TCP_WRITE_TIMER_DEFERRED),
        TCPF_DELACK_TIMER_DEFERRED        = BIT(TCP_DELACK_TIMER_DEFERRED),
        TCPF_MTU_REDUCED_DEFERRED        = BIT(TCP_MTU_REDUCED_DEFERRED),
        TCPF_ACK_DEFERRED                = BIT(TCP_ACK_DEFERRED),
};

/* Resolve a pointer to the sock embedded at inet_conn.icsk_inet.sk into the
 * struct tcp_sock that contains it, using container_of_const().
 */
#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)

/* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket.
 * Used in context of (lockless) tcp listeners.
 * Expands to plain container_of() on the same member path.
 */
#define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)

/* TCP-specific timewait socket state wrapped around the generic
 * inet_timewait_sock. tw_sk is the first member; tcp_twsk() obtains a
 * pointer to this structure from a struct sock pointer with a plain cast.
 */
struct tcp_timewait_sock {
        struct inet_timewait_sock tw_sk;
/* Shorthand accessors for fields stored in tw_sk.__tw_common. */
#define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
#define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt
        u32                          tw_rcv_wnd;
        u32                          tw_ts_offset;
        u32                          tw_ts_recent;

        /* The time we sent the last out-of-window ACK: */
        u32                          tw_last_oow_ack_time;

        int                          tw_ts_recent_stamp;
        u32                          tw_tx_delay;
        /* Optional signing state, present only with the matching config. */
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key          *tw_md5_key;
#endif
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info        __rcu *ao_info;
#endif
};

/* Reinterpret a sock pointer as a TCP timewait socket. This is a plain
 * cast: no state check is performed and the const qualifier is dropped.
 */
static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
{
        struct tcp_timewait_sock *tcptw = (struct tcp_timewait_sock *)sk;

        return tcptw;
}

static inline bool tcp_passive_fastopen(const struct sock *sk)
{
        return sk->sk_state == TCP_SYN_RECV &&
               rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL;
}

/* Set the fastopen queue limit of @sk to the smaller of @backlog and the
 * namespace's somaxconn sysctl, compared as unsigned int.
 */
static inline void fastopen_queue_tune(struct sock *sk, int backlog)
{
        struct request_sock_queue *accept_q = &inet_csk(sk)->icsk_accept_queue;
        int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn);
        unsigned int limit = min_t(unsigned int, backlog, somaxconn);

        WRITE_ONCE(accept_q->fastopenq.max_qlen, limit);
}

/* Hand the saved SYN held by @req over to @tp; @req no longer references
 * it afterwards.
 */
static inline void tcp_move_syn(struct tcp_sock *tp,
                                struct request_sock *req)
{
        struct saved_syn *syn = req->saved_syn;

        tp->saved_syn = syn;
        req->saved_syn = NULL;
}

/* Release the saved SYN attached to @tp (if any) and clear the pointer. */
static inline void tcp_saved_syn_free(struct tcp_sock *tp)
{
        struct saved_syn *stale = tp->saved_syn;

        kfree(stale);
        tp->saved_syn = NULL;
}

/* Sum of the MAC, network and TCP header lengths recorded in @saved_syn. */
static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
{
        u32 total = saved_syn->mac_hdrlen;

        total += saved_syn->network_hdrlen;
        total += saved_syn->tcp_hdrlen;

        return total;
}

struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
                                               const struct sk_buff *orig_skb,
                                               const struct sk_buff *ack_skb);

/* Return @mss limited by the user-configured MSS of @tp: a non-zero
 * user_mss that is smaller than @mss wins, otherwise @mss is returned.
 */
static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
{
        /* READ_ONCE() because the socket might not be locked here;
         * this happens for listeners.
         */
        u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);

        if (user_mss == 0 || user_mss >= mss)
                return mss;

        return user_mss;
}

int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
                  int shiftlen);

void __tcp_sock_set_cork(struct sock *sk, bool on);
void tcp_sock_set_cork(struct sock *sk, bool on);
int tcp_sock_set_keepcnt(struct sock *sk, int val);
int tcp_sock_set_keepidle_locked(struct sock *sk, int val);
int tcp_sock_set_keepidle(struct sock *sk, int val);
int tcp_sock_set_keepintvl(struct sock *sk, int val);
void __tcp_sock_set_nodelay(struct sock *sk, bool on);
void tcp_sock_set_nodelay(struct sock *sk);
void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
int tcp_sock_set_user_timeout(struct sock *sk, int val);
int tcp_sock_set_maxseg(struct sock *sk, int val);

/* Report whether @dst carries the RTAX_FEATURE_TCP_USEC_TS feature flag. */
static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
{
        bool usec_ts = dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS);

        return usec_ts;
}

#endif        /* _LINUX_TCP_H */
















































































































































































































































































































































































































































































































































  316 
  316 
  314 






















  314 



  316 





  311 
  314 











































































































































































































































































































































































































































































































































































































  315 


  312 
  315 





  315 

  312 

  313 



  315 



















   13 






   13 



   12 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the kernel access vector cache (AVC).
 *
 * Authors:  Stephen Smalley, <stephen.smalley.work@gmail.com>
 *             James Morris <jmorris@redhat.com>
 *
 * Update:   KaiGai, Kohei <kaigai@ak.jp.nec.com>
 *        Replaced the avc_lock spinlock by RCU.
 *
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/percpu.h>
#include <linux/list.h>
#include <net/sock.h>
#include <linux/un.h>
#include <net/af_unix.h>
#include <linux/ip.h>
#include <linux/audit.h>
#include <linux/ipv6.h>
#include <net/ipv6.h>
#include "avc.h"
#include "avc_ss.h"
#include "classmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/avc.h>

/*
 * Number of hash buckets. avc_hash() masks with (AVC_CACHE_SLOTS - 1), so
 * this value has to stay a power of two.
 */
#define AVC_CACHE_SLOTS                        512
/* Initial avc_cache_threshold installed by selinux_avc_init(). */
#define AVC_DEF_CACHE_THRESHOLD                512
/* Upper bound on entries removed by one avc_reclaim_node() call. */
#define AVC_CACHE_RECLAIM                16

/*
 * Bump one field of the per-CPU avc_cache_stats; expands to an empty
 * statement when CONFIG_SECURITY_SELINUX_AVC_STATS is not set.
 */
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
#define avc_cache_stats_incr(field)        this_cpu_inc(avc_cache_stats.field)
#else
#define avc_cache_stats_incr(field)        do {} while (0)
#endif

/*
 * One cached decision. (ssid, tsid, tclass) is the lookup key compared by
 * avc_search_node(); avd is the copied access vector decision and xp_node
 * the optional extended permissions (NULL when none were populated).
 */
struct avc_entry {
        u32                        ssid;
        u32                        tsid;
        u16                        tclass;
        struct av_decision        avd;
        struct avc_xperms_node        *xp_node;
};

/*
 * Hash chain element holding one avc_entry. rhead is handed to call_rcu()
 * so that avc_node_free() releases the node after a grace period.
 */
struct avc_node {
        struct avc_entry        ae;
        struct hlist_node        list; /* anchored in avc_cache->slots[i] */
        struct rcu_head                rhead;
};

/*
 * List wrapper around one extended_perms_decision; linked into
 * avc_xperms_node->xpd_head through xpd_list.
 */
struct avc_xperms_decision_node {
        struct extended_perms_decision xpd;
        struct list_head xpd_list; /* list of extended_perms_decision */
};

/*
 * Extended permissions attached to a cache entry: the extended_perms
 * summary (xp) plus the list of per-driver decisions hanging off xpd_head.
 */
struct avc_xperms_node {
        struct extended_perms xp;
        struct list_head xpd_head; /* list head of extended_perms_decision */
};

/*
 * The cache proper: AVC_CACHE_SLOTS hash chains, each with its own spinlock
 * taken by writers, plus bookkeeping for reclaim and revocation tracking.
 */
struct avc_cache {
        struct hlist_head        slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
        spinlock_t                slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
        atomic_t                lru_hint;        /* LRU hint for reclaim scan */
        /* incremented by avc_alloc_node(), decremented on delete/kill/replace */
        atomic_t                active_nodes;
        u32                        latest_notif;        /* latest revocation notification */
};

/*
 * Entry in the singly linked list of registered AVC callbacks (chained
 * through next). NOTE(review): events looks like a mask of event values
 * the callback is interested in -- confirm against the registration code.
 */
struct avc_callback_node {
        int (*callback) (u32 event);
        u32 events;
        struct avc_callback_node *next;
};

#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
#endif

/*
 * Top-level AVC state: the cache itself and the node-count threshold above
 * which avc_alloc_node() triggers avc_reclaim_node().
 */
struct selinux_avc {
        unsigned int avc_cache_threshold;
        struct avc_cache avc_cache;
};

static struct selinux_avc selinux_avc;

/*
 * Reset the global AVC state: default threshold, empty hash chains,
 * initialized slot locks and zeroed counters.
 */
void selinux_avc_init(void)
{
        int i = 0;

        selinux_avc.avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD;

        while (i < AVC_CACHE_SLOTS) {
                INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i]);
                /* argument kept verbatim: spin_lock_init() may stringify it */
                spin_lock_init(&selinux_avc.avc_cache.slots_lock[i]);
                i++;
        }

        atomic_set(&selinux_avc.avc_cache.active_nodes, 0);
        atomic_set(&selinux_avc.avc_cache.lru_hint, 0);
}

/* Return the current node-count threshold of the AVC. */
unsigned int avc_get_cache_threshold(void)
{
        unsigned int threshold = selinux_avc.avc_cache_threshold;

        return threshold;
}

void avc_set_cache_threshold(unsigned int cache_threshold)
{
        selinux_avc.avc_cache_threshold = cache_threshold;
}

static struct avc_callback_node *avc_callbacks __ro_after_init;
static struct kmem_cache *avc_node_cachep __ro_after_init;
static struct kmem_cache *avc_xperms_data_cachep __ro_after_init;
static struct kmem_cache *avc_xperms_decision_cachep __ro_after_init;
static struct kmem_cache *avc_xperms_cachep __ro_after_init;

/*
 * Map a (ssid, tsid, tclass) key to a slot index in
 * [0, AVC_CACHE_SLOTS - 1] by XOR-ing the shifted components and masking.
 */
static inline u32 avc_hash(u32 ssid, u32 tsid, u16 tclass)
{
        u32 mixed = ssid;

        mixed ^= tsid << 2;
        mixed ^= tclass << 4;

        return mixed & (AVC_CACHE_SLOTS - 1);
}

/**
 * avc_init - Initialize the AVC.
 *
 * Initialize the access vector cache by creating the slab caches used for
 * cache nodes, extended permission nodes, extended permission decision
 * nodes and extended permission data.  All caches are created with
 * SLAB_PANIC.
 */
void __init avc_init(void)
{
        avc_node_cachep = KMEM_CACHE(avc_node, SLAB_PANIC);
        avc_xperms_cachep = KMEM_CACHE(avc_xperms_node, SLAB_PANIC);
        avc_xperms_decision_cachep = KMEM_CACHE(avc_xperms_decision_node, SLAB_PANIC);
        avc_xperms_data_cachep = KMEM_CACHE(extended_perms_data, SLAB_PANIC);
}

int avc_get_hash_stats(char *page)
{
        int i, chain_len, max_chain_len, slots_used;
        struct avc_node *node;
        struct hlist_head *head;

        rcu_read_lock();

        slots_used = 0;
        max_chain_len = 0;
        for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                head = &selinux_avc.avc_cache.slots[i];
                if (!hlist_empty(head)) {
                        slots_used++;
                        chain_len = 0;
                        hlist_for_each_entry_rcu(node, head, list)
                                chain_len++;
                        if (chain_len > max_chain_len)
                                max_chain_len = chain_len;
                }
        }

        rcu_read_unlock();

        return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
                         "longest chain: %d\n",
                         atomic_read(&selinux_avc.avc_cache.active_nodes),
                         slots_used, AVC_CACHE_SLOTS, max_chain_len);
}

/*
 * Find the decision for (@driver, @base_perm) attached to @xp_node, or NULL
 * when there is none.  A linear list walk is used because the list is
 * always small, i.e. less than 5 entries and typically 1.
 */
static struct extended_perms_decision *
avc_xperms_decision_lookup(u8 driver, u8 base_perm,
                           struct avc_xperms_node *xp_node)
{
        struct avc_xperms_decision_node *cur;

        list_for_each_entry(cur, &xp_node->xpd_head, xpd_list) {
                if (cur->xpd.driver != driver)
                        continue;
                if (cur->xpd.base_perm != base_perm)
                        continue;
                return &cur->xpd;
        }

        return NULL;
}

/*
 * Test bit @perm in the bitmap of @xpd selected by @which (one of
 * XPERMS_ALLOWED, XPERMS_AUDITALLOW, XPERMS_DONTAUDIT).  Yields 0 when
 * @which is none of those or when @xpd->used does not flag that bitmap.
 */
static inline unsigned int
avc_xperms_has_perm(struct extended_perms_decision *xpd,
                                        u8 perm, u8 which)
{
        switch (which) {
        case XPERMS_ALLOWED:
                if (xpd->used & XPERMS_ALLOWED)
                        return security_xperm_test(xpd->allowed->p, perm);
                break;
        case XPERMS_AUDITALLOW:
                if (xpd->used & XPERMS_AUDITALLOW)
                        return security_xperm_test(xpd->auditallow->p, perm);
                break;
        case XPERMS_DONTAUDIT:
                if (xpd->used & XPERMS_DONTAUDIT)
                        return security_xperm_test(xpd->dontaudit->p, perm);
                break;
        default:
                break;
        }

        return 0;
}

/*
 * Record @driver and @base_perm in the summary of @xp_node and, when a
 * matching decision with an allowed bitmap exists, set bit @perm in it.
 */
static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
                                  u8 driver, u8 base_perm, u8 perm)
{
        struct extended_perms_decision *match;

        security_xperm_set(xp_node->xp.drivers.p, driver);
        xp_node->xp.base_perms |= base_perm;

        match = avc_xperms_decision_lookup(driver, base_perm, xp_node);
        if (!match || !match->allowed)
                return;

        security_xperm_set(match->allowed->p, perm);
}

/*
 * Release a decision node: each bitmap that was allocated goes back to the
 * data cache, then the node itself is returned to the decision cache.
 */
static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node)
{
        struct extended_perms_data *bitmaps[] = {
                xpd_node->xpd.allowed,
                xpd_node->xpd.auditallow,
                xpd_node->xpd.dontaudit,
        };
        unsigned int idx;

        for (idx = 0; idx < ARRAY_SIZE(bitmaps); idx++) {
                if (bitmaps[idx])
                        kmem_cache_free(avc_xperms_data_cachep, bitmaps[idx]);
        }

        kmem_cache_free(avc_xperms_decision_cachep, xpd_node);
}

static void avc_xperms_free(struct avc_xperms_node *xp_node)
{
        struct avc_xperms_decision_node *xpd_node, *tmp;

        if (!xp_node)
                return;

        list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) {
                list_del(&xpd_node->xpd_list);
                avc_xperms_decision_free(xpd_node);
        }
        kmem_cache_free(avc_xperms_cachep, xp_node);
}

/*
 * Copy a full decision from @src into @dest: the scalar fields first, then
 * every bitmap flagged in the copied used mask.  The bitmaps of @dest that
 * are flagged must already be allocated.
 */
static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
                                        struct extended_perms_decision *src)
{
        dest->base_perm = src->base_perm;
        dest->driver = src->driver;
        dest->used = src->used;

        if (dest->used & XPERMS_ALLOWED) {
                memcpy(dest->allowed->p, src->allowed->p,
                       sizeof(src->allowed->p));
        }

        if (dest->used & XPERMS_AUDITALLOW) {
                memcpy(dest->auditallow->p, src->auditallow->p,
                       sizeof(src->auditallow->p));
        }

        if (dest->used & XPERMS_DONTAUDIT) {
                memcpy(dest->dontaudit->p, src->dontaudit->p,
                       sizeof(src->dontaudit->p));
        }
}

/*
 * Reduced form of avc_copy_xperms_decision(): only the decision data that
 * matters for @perm is copied, i.e. base_perm, the used mask and the single
 * bitmap word holding @perm.  The driver field is not copied.
 */
static inline void avc_quick_copy_xperms_decision(u8 perm,
                        struct extended_perms_decision *dest,
                        struct extended_perms_decision *src)
{
        /*
         * index of the u32, out of the 8 u32s making up the 256 bits,
         * that contains this command permission
         */
        u8 word = perm >> 5;

        dest->base_perm = src->base_perm;
        dest->used = src->used;

        if (dest->used & XPERMS_ALLOWED)
                dest->allowed->p[word] = src->allowed->p[word];

        if (dest->used & XPERMS_AUDITALLOW)
                dest->auditallow->p[word] = src->auditallow->p[word];

        if (dest->used & XPERMS_DONTAUDIT)
                dest->dontaudit->p[word] = src->dontaudit->p[word];
}

/*
 * Allocate a zeroed decision node plus one zeroed bitmap for every
 * XPERMS_* bit set in @which.  All allocations use GFP_NOWAIT.
 *
 * Returns the node, or NULL if any allocation failed (nothing is leaked).
 */
static struct avc_xperms_decision_node
                *avc_xperms_decision_alloc(u8 which)
{
        struct avc_xperms_decision_node *fresh;
        struct extended_perms_data *bitmap;

        fresh = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT);
        if (!fresh)
                return NULL;

        if (which & XPERMS_ALLOWED) {
                bitmap = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT);
                if (!bitmap)
                        goto free_node;
                fresh->xpd.allowed = bitmap;
        }

        if (which & XPERMS_AUDITALLOW) {
                bitmap = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT);
                if (!bitmap)
                        goto free_node;
                fresh->xpd.auditallow = bitmap;
        }

        if (which & XPERMS_DONTAUDIT) {
                bitmap = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT);
                if (!bitmap)
                        goto free_node;
                fresh->xpd.dontaudit = bitmap;
        }

        return fresh;

free_node:
        /* frees whichever bitmaps were attached so far, then the node */
        avc_xperms_decision_free(fresh);
        return NULL;
}

static int avc_add_xperms_decision(struct avc_node *node,
                        struct extended_perms_decision *src)
{
        struct avc_xperms_decision_node *dest_xpd;

        dest_xpd = avc_xperms_decision_alloc(src->used);
        if (!dest_xpd)
                return -ENOMEM;
        avc_copy_xperms_decision(&dest_xpd->xpd, src);
        list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head);
        node->ae.xp_node->xp.len++;
        return 0;
}

/*
 * Allocate a zeroed avc_xperms_node (GFP_NOWAIT) with an empty decision
 * list.  Returns NULL on allocation failure.
 */
static struct avc_xperms_node *avc_xperms_alloc(void)
{
        struct avc_xperms_node *fresh;

        fresh = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT);
        if (fresh)
                INIT_LIST_HEAD(&fresh->xpd_head);

        return fresh;
}

static int avc_xperms_populate(struct avc_node *node,
                                struct avc_xperms_node *src)
{
        struct avc_xperms_node *dest;
        struct avc_xperms_decision_node *dest_xpd;
        struct avc_xperms_decision_node *src_xpd;

        if (src->xp.len == 0)
                return 0;
        dest = avc_xperms_alloc();
        if (!dest)
                return -ENOMEM;

        memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p));
        dest->xp.len = src->xp.len;
        dest->xp.base_perms = src->xp.base_perms;

        /* for each source xpd allocate a destination xpd and copy */
        list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) {
                dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used);
                if (!dest_xpd)
                        goto error;
                avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd);
                list_add(&dest_xpd->xpd_list, &dest->xpd_head);
        }
        node->ae.xp_node = dest;
        return 0;
error:
        avc_xperms_free(dest);
        return -ENOMEM;

}

/*
 * Work out which of the @requested permissions have to be audited.
 *
 * - Some permission denied by @avd: audit the denied bits selected by
 *   auditdeny, unless @xpd marks @perm as dontaudit.
 * - Nothing denied but @result is non-zero: treat everything requested as
 *   denied and audited.
 * - Otherwise: audit the bits selected by auditallow, but only when @xpd
 *   (if given) marks @perm as auditallow.
 *
 * The denied set is stored through @deniedp; the audited set is returned.
 */
static inline u32 avc_xperms_audit_required(u32 requested,
                                        struct av_decision *avd,
                                        struct extended_perms_decision *xpd,
                                        u8 perm,
                                        int result,
                                        u32 *deniedp)
{
        u32 denied = requested & ~avd->allowed;
        u32 audited;

        if (unlikely(denied)) {
                audited = denied & avd->auditdeny;
                if (audited && xpd &&
                    avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT))
                        audited = 0;
        } else if (result) {
                denied = requested;
                audited = requested;
        } else {
                audited = requested & avd->auditallow;
                if (audited && xpd &&
                    !avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW))
                        audited = 0;
        }

        *deniedp = denied;
        return audited;
}

/*
 * Audit an extended permission check if avc_xperms_audit_required() says
 * so.  Returns 0 when nothing needs auditing, otherwise the result of
 * slow_avc_audit().
 */
static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass,
                                   u32 requested, struct av_decision *avd,
                                   struct extended_perms_decision *xpd,
                                   u8 perm, int result,
                                   struct common_audit_data *ad)
{
        u32 denied;
        u32 audited = avc_xperms_audit_required(requested, avd, xpd, perm,
                                                result, &denied);

        if (unlikely(audited))
                return slow_avc_audit(ssid, tsid, tclass, requested,
                                      audited, denied, result, ad);

        return 0;
}

/*
 * RCU callback: release the avc_node embedding @rhead together with its
 * extended permissions, and account the free in the statistics.
 */
static void avc_node_free(struct rcu_head *rhead)
{
        struct avc_node *victim = container_of(rhead, struct avc_node, rhead);
        struct avc_xperms_node *xperms = victim->ae.xp_node;

        avc_xperms_free(xperms);
        kmem_cache_free(avc_node_cachep, victim);
        avc_cache_stats_incr(frees);
}

/*
 * Remove @node from its hash chain and schedule it for freeing.  The node
 * is unlinked with hlist_del_rcu() and only released by avc_node_free()
 * through call_rcu(); active_nodes is decremented right away.
 */
static void avc_node_delete(struct avc_node *node)
{
        hlist_del_rcu(&node->list);
        call_rcu(&node->rhead, avc_node_free);
        atomic_dec(&selinux_avc.avc_cache.active_nodes);
}

/*
 * Free @node immediately, without going through call_rcu(), and drop it
 * from the active_nodes count.  The node is not unlinked from any list
 * here.
 */
static void avc_node_kill(struct avc_node *node)
{
        struct avc_xperms_node *xperms = node->ae.xp_node;

        avc_xperms_free(xperms);
        kmem_cache_free(avc_node_cachep, node);
        avc_cache_stats_incr(frees);
        atomic_dec(&selinux_avc.avc_cache.active_nodes);
}

/*
 * Put @new in the hash chain position of @old using hlist_replace_rcu()
 * and schedule @old for freeing via call_rcu().  active_nodes is
 * decremented once: @new was counted when avc_alloc_node() created it, so
 * the net count after replacing stays the same as before @new existed.
 */
static void avc_node_replace(struct avc_node *new, struct avc_node *old)
{
        hlist_replace_rcu(&old->list, &new->list);
        call_rcu(&old->rhead, avc_node_free);
        atomic_dec(&selinux_avc.avc_cache.active_nodes);
}

/*
 * Evict cached entries.  Up to AVC_CACHE_SLOTS slots are visited, each
 * chosen by advancing lru_hint; a slot whose lock cannot be taken
 * immediately is skipped.  Every node found in a visited slot is deleted
 * until AVC_CACHE_RECLAIM entries have been removed in total.
 *
 * Returns the number of entries removed.
 */
static inline int avc_reclaim_node(void)
{
        struct avc_node *node;
        int hvalue, try, ecx;
        unsigned long flags;
        struct hlist_head *head;
        spinlock_t *lock;

        for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) {
                /* next slot after the hint, wrapped to the table size */
                hvalue = atomic_inc_return(&selinux_avc.avc_cache.lru_hint) &
                        (AVC_CACHE_SLOTS - 1);
                head = &selinux_avc.avc_cache.slots[hvalue];
                lock = &selinux_avc.avc_cache.slots_lock[hvalue];

                /* never wait for a busy slot, just move on to the next */
                if (!spin_trylock_irqsave(lock, flags))
                        continue;

                rcu_read_lock();
                hlist_for_each_entry(node, head, list) {
                        avc_node_delete(node);
                        avc_cache_stats_incr(reclaims);
                        ecx++;
                        if (ecx >= AVC_CACHE_RECLAIM) {
                                /* quota reached: unwind both locks before leaving */
                                rcu_read_unlock();
                                spin_unlock_irqrestore(lock, flags);
                                goto out;
                        }
                }
                rcu_read_unlock();
                spin_unlock_irqrestore(lock, flags);
        }
out:
        return ecx;
}

static struct avc_node *avc_alloc_node(void)
{
        struct avc_node *node;

        node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT);
        if (!node)
                goto out;

        INIT_HLIST_NODE(&node->list);
        avc_cache_stats_incr(allocations);

        if (atomic_inc_return(&selinux_avc.avc_cache.active_nodes) >
            selinux_avc.avc_cache_threshold)
                avc_reclaim_node();

out:
        return node;
}

/* Fill the key (ssid, tsid, tclass) of @node and copy @avd into it. */
static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd)
{
        struct avc_entry *entry = &node->ae;

        entry->ssid = ssid;
        entry->tsid = tsid;
        entry->tclass = tclass;
        memcpy(&entry->avd, avd, sizeof(entry->avd));
}

/*
 * Walk the hash chain selected by avc_hash() with the RCU list iterator
 * and return the first node whose key equals (@ssid, @tsid, @tclass), or
 * NULL if the chain holds no such node.
 */
static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass)
{
        struct hlist_head *bucket;
        struct avc_node *cur;

        bucket = &selinux_avc.avc_cache.slots[avc_hash(ssid, tsid, tclass)];

        hlist_for_each_entry_rcu(cur, bucket, list) {
                if (cur->ae.ssid != ssid)
                        continue;
                if (cur->ae.tclass != tclass)
                        continue;
                if (cur->ae.tsid != tsid)
                        continue;
                return cur;
        }

        return NULL;
}

/**
 * avc_lookup - Look up an AVC entry.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 *
 * Search the cache for an entry matching the SID pair
 * (@ssid, @tsid) and class @tclass.  Every call is
 * counted as a lookup; a call that finds nothing is
 * additionally counted as a miss.  Returns the matching
 * avc_node, or NULL when there is none.
 */
static struct avc_node *avc_lookup(u32 ssid, u32 tsid, u16 tclass)
{
        struct avc_node *hit;

        avc_cache_stats_incr(lookups);

        hit = avc_search_node(ssid, tsid, tclass);
        if (!hit)
                avc_cache_stats_incr(misses);

        return hit;
}

/*
 * Check against or advance the latest revocation notification number.
 *
 * With @is_insert set the recorded value is only compared: -EAGAIN is
 * returned (and a warning logged) when @seqno is older than the latest
 * notification, 0 otherwise.  With @is_insert clear the recorded value is
 * raised to @seqno if @seqno is newer, and 0 is returned.
 *
 * Both paths run under a function-local spinlock with interrupts saved.
 */
static int avc_latest_notif_update(u32 seqno, int is_insert)
{
        int ret = 0;
        static DEFINE_SPINLOCK(notif_lock);
        unsigned long flag;

        spin_lock_irqsave(&notif_lock, flag);
        if (is_insert) {
                if (seqno < selinux_avc.avc_cache.latest_notif) {
                        /* both values are u32, so print them as unsigned */
                        pr_warn("SELinux: avc:  seqno %u < latest_notif %u\n",
                               seqno, selinux_avc.avc_cache.latest_notif);
                        ret = -EAGAIN;
                }
        } else {
                if (seqno > selinux_avc.avc_cache.latest_notif)
                        selinux_avc.avc_cache.latest_notif = seqno;
        }
        spin_unlock_irqrestore(&notif_lock, flag);

        return ret;
}

/**
 * avc_insert - Insert an AVC entry.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @avd: resulting av decision
 * @xp_node: resulting extended permissions
 *
 * Insert an AVC entry for the SID pair
 * (@ssid, @tsid) and class @tclass.
 * The access vectors and the sequence number are
 * normally provided by the security server in
 * response to a security_compute_av() call.  If the
 * sequence number @avd->seqno is not less than the latest
 * revocation notification, then the function copies
 * the access vectors into a cache entry.
 *
 * Insertion is best effort: nothing is returned, and the
 * entry is silently not cached when the decision is stale
 * or an allocation fails.  An existing entry with the same
 * key is replaced rather than duplicated.
 */
static void avc_insert(u32 ssid, u32 tsid, u16 tclass,
                       struct av_decision *avd, struct avc_xperms_node *xp_node)
{
        struct avc_node *pos, *node = NULL;
        u32 hvalue;
        unsigned long flag;
        spinlock_t *lock;
        struct hlist_head *head;

        /* decision is older than the latest revocation: do not cache it */
        if (avc_latest_notif_update(avd->seqno, 1))
                return;

        node = avc_alloc_node();
        if (!node)
                return;

        avc_node_populate(node, ssid, tsid, tclass, avd);
        if (avc_xperms_populate(node, xp_node)) {
                /* node was never linked into a chain, so free it directly */
                avc_node_kill(node);
                return;
        }

        hvalue = avc_hash(ssid, tsid, tclass);
        head = &selinux_avc.avc_cache.slots[hvalue];
        lock = &selinux_avc.avc_cache.slots_lock[hvalue];
        /* the per-slot lock serializes writers of this chain */
        spin_lock_irqsave(lock, flag);
        hlist_for_each_entry(pos, head, list) {
                if (pos->ae.ssid == ssid &&
                        pos->ae.tsid == tsid &&
                        pos->ae.tclass == tclass) {
                        /* same key already cached: swap the old node out */
                        avc_node_replace(node, pos);
                        goto found;
                }
        }
        hlist_add_head_rcu(&node->list, head);
found:
        spin_unlock_irqrestore(lock, flag);
}

/**
 * avc_audit_pre_callback - SELinux specific information
 * will be called by generic audit code
 * @ab: the audit buffer
 * @a: audit_data
 */
static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct selinux_audit_data *sad = ad->selinux_audit_data;
        u32 av = sad->audited, perm;
        const char *const *perms;
        u32 i;

        audit_log_format(ab, "avc:  %s ", sad->denied ? "denied" : "granted");

        if (av == 0) {
                audit_log_format(ab, " null");
                return;
        }

        perms = secclass_map[sad->tclass-1].perms;

        audit_log_format(ab, " {");
        i = 0;
        perm = 1;
        while (i < (sizeof(av) * 8)) {
                if ((perm & av) && perms[i]) {
                        audit_log_format(ab, " %s", perms[i]);
                        av &= ~perm;
                }
                i++;
                perm <<= 1;
        }

        if (av)
                audit_log_format(ab, " 0x%x", av);

        audit_log_format(ab, " } for ");
}

/**
 * avc_audit_post_callback - SELinux specific information
 * will be called by generic audit code
 * @ab: the audit buffer
 * @a: audit_data
 *
 * Appends the source and target security contexts (or the numeric SID when
 * a SID cannot be converted to a context), the class name and, for denials,
 * the permissive flag.  It then fires the selinux_audited tracepoint and,
 * if a SID maps to an invalid context, also logs the raw context string as
 * srawcon= / trawcon=.
 */
static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct selinux_audit_data *sad = ad->selinux_audit_data;
        char *scontext = NULL;
        char *tcontext = NULL;
        const char *tclass = NULL;
        u32 scontext_len;
        u32 tcontext_len;
        int rc;

        rc = security_sid_to_context(sad->ssid, &scontext,
                                     &scontext_len);
        if (rc)
                /* SIDs are u32: print them unsigned, not as %d. */
                audit_log_format(ab, " ssid=%u", sad->ssid);
        else
                audit_log_format(ab, " scontext=%s", scontext);

        rc = security_sid_to_context(sad->tsid, &tcontext,
                                     &tcontext_len);
        if (rc)
                audit_log_format(ab, " tsid=%u", sad->tsid);
        else
                audit_log_format(ab, " tcontext=%s", tcontext);

        /* Class values are 1-based, the map is 0-based. */
        tclass = secclass_map[sad->tclass-1].name;
        audit_log_format(ab, " tclass=%s", tclass);

        if (sad->denied)
                audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1);

        /* scontext/tcontext are still NULL here if the conversion failed. */
        trace_selinux_audited(sad, scontext, tcontext, tclass);
        kfree(tcontext);
        kfree(scontext);

        /* in case of invalid context report also the actual context string */
        rc = security_sid_to_context_inval(sad->ssid, &scontext,
                                           &scontext_len);
        if (!rc && scontext) {
                /* Do not log a trailing NUL as part of the string. */
                if (scontext_len && scontext[scontext_len - 1] == '\0')
                        scontext_len--;
                audit_log_format(ab, " srawcon=");
                audit_log_n_untrustedstring(ab, scontext, scontext_len);
                kfree(scontext);
        }

        /* Same for the target; the scontext buffer variables are reused. */
        rc = security_sid_to_context_inval(sad->tsid, &scontext,
                                           &scontext_len);
        if (!rc && scontext) {
                if (scontext_len && scontext[scontext_len - 1] == '\0')
                        scontext_len--;
                audit_log_format(ab, " trawcon=");
                audit_log_n_untrustedstring(ab, scontext, scontext_len);
                kfree(scontext);
        }
}

/*
 * Slow path of AVC auditing; kept out of line because of its large stack
 * footprint.  It is non-blocking and may be called under rcu_read_lock().
 *
 * Returns -EINVAL (after a WARN) when @tclass is zero or outside
 * secclass_map, otherwise emits the audit record and returns 0.
 */
noinline int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass,
                            u32 requested, u32 audited, u32 denied, int result,
                            struct common_audit_data *a)
{
        struct common_audit_data fallback;
        struct selinux_audit_data sad;

        if (WARN_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)))
                return -EINVAL;

        /* SELinux-specific part of the record, read by the callbacks. */
        sad.ssid = ssid;
        sad.tsid = tsid;
        sad.tclass = tclass;
        sad.requested = requested;
        sad.audited = audited;
        sad.denied = denied;
        sad.result = result;

        /* Callers without audit data get a type-NONE record on the stack. */
        if (a == NULL) {
                fallback.type = LSM_AUDIT_DATA_NONE;
                a = &fallback;
        }
        a->selinux_audit_data = &sad;

        common_lsm_audit(a, avc_audit_pre_callback, avc_audit_post_callback);
        return 0;
}

/**
 * avc_add_callback - Register a callback for security events.
 * @callback: callback function
 * @events: security events
 *
 * Allocate a callback node for @callback, subscribed to the event set
 * @events, and push it onto the head of the avc_callbacks list.
 * Returns %0 on success or -%ENOMEM if the node cannot be allocated.
 */
int __init avc_add_callback(int (*callback)(u32 event), u32 events)
{
        struct avc_callback_node *entry;

        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->callback = callback;
        entry->events = events;

        /* New registrations go to the front of the list. */
        entry->next = avc_callbacks;
        avc_callbacks = entry;

        return 0;
}

/**
 * avc_update_node - Update an AVC entry
 * @event: updating event (one of the AVC_CALLBACK_* values handled below)
 * @perms: permission mask bits
 * @driver: xperm driver information
 * @base_perm: the base permission associated with the extended permission
 * @xperm: xperm permissions
 * @ssid: AVC entry source sid
 * @tsid: AVC entry target sid
 * @tclass: AVC entry target object class
 * @seqno: sequence number when decision was made
 * @xpd: extended_perms_decision to be added to the node
 * @flags: the AVC_* flags, e.g. AVC_EXTENDED_PERMS, or 0.
 *
 * The entry is never modified in place: a new node is allocated, filled
 * with a copy of the existing entry, adjusted according to @event, and then
 * substituted for the original.  The original AVC entry object is released
 * later by RCU.
 *
 * Returns 0 on success, -ENOENT if no entry matching @ssid, @tsid, @tclass
 * and @seqno exists, -ENOMEM if the replacement node cannot be allocated,
 * or the error returned by avc_xperms_populate() or
 * avc_add_xperms_decision().
 */
static int avc_update_node(u32 event, u32 perms, u8 driver, u8 base_perm,
                           u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno,
                           struct extended_perms_decision *xpd, u32 flags)
{
        u32 hvalue;
        int rc = 0;
        unsigned long flag;
        struct avc_node *pos, *node, *orig = NULL;
        struct hlist_head *head;
        spinlock_t *lock;

        /* Allocate the replacement before the slot lock is taken. */
        node = avc_alloc_node();
        if (!node) {
                rc = -ENOMEM;
                goto out;
        }

        /* Lock the target slot */
        hvalue = avc_hash(ssid, tsid, tclass);

        head = &selinux_avc.avc_cache.slots[hvalue];
        lock = &selinux_avc.avc_cache.slots_lock[hvalue];

        spin_lock_irqsave(lock, flag);

        /* The entry must match the key and the decision's sequence number. */
        hlist_for_each_entry(pos, head, list) {
                if (ssid == pos->ae.ssid &&
                    tsid == pos->ae.tsid &&
                    tclass == pos->ae.tclass &&
                    seqno == pos->ae.avd.seqno){
                        orig = pos;
                        break;
                }
        }

        if (!orig) {
                rc = -ENOENT;
                avc_node_kill(node);
                goto out_unlock;
        }

        /*
         * Copy and replace original node.
         */

        avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd);

        if (orig->ae.xp_node) {
                rc = avc_xperms_populate(node, orig->ae.xp_node);
                if (rc) {
                        avc_node_kill(node);
                        goto out_unlock;
                }
        }

        /*
         * Apply the event to the copy.  An event value not listed here
         * changes nothing, and the unmodified copy still replaces the
         * original below.
         */
        switch (event) {
        case AVC_CALLBACK_GRANT:
                node->ae.avd.allowed |= perms;
                if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
                        avc_xperms_allow_perm(node->ae.xp_node, driver, base_perm, xperm);
                break;
        case AVC_CALLBACK_TRY_REVOKE:
        case AVC_CALLBACK_REVOKE:
                node->ae.avd.allowed &= ~perms;
                break;
        case AVC_CALLBACK_AUDITALLOW_ENABLE:
                node->ae.avd.auditallow |= perms;
                break;
        case AVC_CALLBACK_AUDITALLOW_DISABLE:
                node->ae.avd.auditallow &= ~perms;
                break;
        case AVC_CALLBACK_AUDITDENY_ENABLE:
                node->ae.avd.auditdeny |= perms;
                break;
        case AVC_CALLBACK_AUDITDENY_DISABLE:
                node->ae.avd.auditdeny &= ~perms;
                break;
        case AVC_CALLBACK_ADD_XPERMS:
                rc = avc_add_xperms_decision(node, xpd);
                if (rc) {
                        avc_node_kill(node);
                        goto out_unlock;
                }
                break;
        }
        avc_node_replace(node, orig);
out_unlock:
        spin_unlock_irqrestore(lock, flag);
out:
        return rc;
}

/**
 * avc_flush - Flush the cache
 *
 * Walks every hash slot and deletes each node found in it, holding that
 * slot's lock while doing so.
 */
static void avc_flush(void)
{
        int slot;

        for (slot = 0; slot < AVC_CACHE_SLOTS; slot++) {
                struct hlist_head *bucket = &selinux_avc.avc_cache.slots[slot];
                spinlock_t *bucket_lock = &selinux_avc.avc_cache.slots_lock[slot];
                struct avc_node *victim;
                unsigned long irqflags;

                spin_lock_irqsave(bucket_lock, irqflags);
                /*
                 * Holding the spinlock is not enough under preemptible
                 * RCU: it does not keep grace periods from ending, so an
                 * explicit read-side section is taken as well.
                 */
                rcu_read_lock();
                hlist_for_each_entry(victim, bucket, list)
                        avc_node_delete(victim);
                rcu_read_unlock();
                spin_unlock_irqrestore(bucket_lock, irqflags);
        }
}

/**
 * avc_ss_reset - Flush the cache and revalidate migrated permissions.
 * @seqno: policy sequence number
 *
 * Flushes the cache, invokes every registered callback subscribed to
 * AVC_CALLBACK_RESET, and finally records @seqno as the latest notification.
 * All callbacks are run even if one fails; the return value is the first
 * non-zero callback result, or 0 if none failed.
 */
int avc_ss_reset(u32 seqno)
{
        struct avc_callback_node *cb;
        int first_err = 0;

        avc_flush();

        for (cb = avc_callbacks; cb; cb = cb->next) {
                int err;

                if (!(cb->events & AVC_CALLBACK_RESET))
                        continue;

                err = cb->callback(AVC_CALLBACK_RESET);
                /* Remember only the first failure; keep calling the rest. */
                if (!first_err)
                        first_err = err;
        }

        avc_latest_notif_update(seqno, 0);
        return first_err;
}

/**
 * avc_compute_av - Add an entry to the AVC based on the security policy
 * @ssid: subject
 * @tsid: object/target
 * @tclass: object class
 * @avd: access vector decision
 * @xp_node: AVC extended permissions node
 *
 * Slow-path helper function for avc_has_perm_noaudit, when the avc_node lookup
 * fails.  Don't inline this, since it's the slow-path and just results in a
 * bigger stack frame.
 *
 * @avd and @xp_node->xp are filled in by security_compute_av(), and the
 * result is then passed to avc_insert().  avc_insert() returns nothing, so
 * the caller cannot tell whether the decision was actually cached.
 */
static noinline void avc_compute_av(u32 ssid, u32 tsid, u16 tclass,
                                    struct av_decision *avd,
                                    struct avc_xperms_node *xp_node)
{
        /* Initialise the caller-provided node's decision list head. */
        INIT_LIST_HEAD(&xp_node->xpd_head);
        security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp);
        avc_insert(ssid, tsid, tclass, avd, xp_node);
}

/*
 * avc_denied - decide the outcome once a requested permission was denied.
 *
 * Returns -EACCES when the caller passed AVC_STRICT, or when enforcing mode
 * is enabled and the decision is not flagged AVD_FLAGS_PERMISSIVE.
 * Otherwise the denial is not enforced: the requested permissions are
 * granted in the cached entry via avc_update_node() (its result is ignored)
 * and 0 is returned.
 */
static noinline int avc_denied(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                               u8 driver, u8 base_perm, u8 xperm,
                               unsigned int flags, struct av_decision *avd)
{
        /* AVC_STRICT is tested first; enforcing_enabled() only if needed. */
        if ((flags & AVC_STRICT) ||
            (enforcing_enabled() && !(avd->flags & AVD_FLAGS_PERMISSIVE)))
                return -EACCES;

        avc_update_node(AVC_CALLBACK_GRANT, requested, driver, base_perm,
                        xperm, ssid, tsid, tclass, avd->seqno, NULL, flags);
        return 0;
}

/*
 * The avc extended permissions logic adds an additional 256 bits of
 * permissions to an avc node when extended permissions for that node are
 * specified in the avtab. If the additional 256 permissions are not adequate,
 * as is the case with ioctls, then multiple sets may be chained together and
 * the driver field is used to specify which set contains the permission.
 *
 * Returns the result of avc_xperms_audit() if that is non-zero; otherwise 0
 * when @requested is allowed, or the value from avc_denied() when it is not.
 * A zero @requested triggers a WARN and -EACCES.
 */
int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                           u8 driver, u8 base_perm, u8 xperm,
                           struct common_audit_data *ad)
{
        struct avc_node *node;
        struct av_decision avd;
        u32 denied;
        struct extended_perms_decision local_xpd;
        struct extended_perms_decision *xpd = NULL;
        struct extended_perms_data allowed;
        struct extended_perms_data auditallow;
        struct extended_perms_data dontaudit;
        struct avc_xperms_node local_xp_node;
        struct avc_xperms_node *xp_node;
        int rc = 0, rc2;

        xp_node = &local_xp_node;
        if (WARN_ON(!requested))
                return -EACCES;

        rcu_read_lock();

        node = avc_lookup(ssid, tsid, tclass);
        if (unlikely(!node)) {
                /* Cache miss: compute the decision into the on-stack node. */
                avc_compute_av(ssid, tsid, tclass, &avd, xp_node);
        } else {
                /* Cache hit: work on a private copy of the decision. */
                memcpy(&avd, &node->ae.avd, sizeof(avd));
                xp_node = node->ae.xp_node;
        }
        /* if extended permissions are not defined, only consider av_decision */
        if (!xp_node || !xp_node->xp.len)
                goto decision;

        /* Back the local decision with on-stack storage. */
        local_xpd.allowed = &allowed;
        local_xpd.auditallow = &auditallow;
        local_xpd.dontaudit = &dontaudit;

        xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node);
        if (unlikely(!xpd)) {
                /*
                 * Compute the extended_perms_decision only if the driver
                 * is flagged and the base permission is known.
                 */
                if (!security_xperm_test(xp_node->xp.drivers.p, driver) ||
                    !(xp_node->xp.base_perms & base_perm)) {
                        avd.allowed &= ~requested;
                        goto decision;
                }
                /*
                 * The RCU read lock is dropped around the security server
                 * call and re-taken before the cache is updated.
                 * NOTE(review): presumably because that call may not be
                 * made inside an RCU read-side section - confirm.
                 */
                rcu_read_unlock();
                security_compute_xperms_decision(ssid, tsid, tclass, driver,
                                                 base_perm, &local_xpd);
                rcu_read_lock();
                /* Result ignored: the check proceeds with local_xpd anyway. */
                avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver,
                                base_perm, xperm, ssid, tsid, tclass, avd.seqno,
                                &local_xpd, 0);
        } else {
                avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
        }
        /* From here on only the local copy is consulted. */
        xpd = &local_xpd;

        if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
                avd.allowed &= ~requested;

decision:
        denied = requested & ~(avd.allowed);
        if (unlikely(denied))
                rc = avc_denied(ssid, tsid, tclass, requested, driver,
                                base_perm, xperm, AVC_EXTENDED_PERMS, &avd);

        rcu_read_unlock();

        /* Auditing runs outside the RCU section; its error takes priority. */
        rc2 = avc_xperms_audit(ssid, tsid, tclass, requested,
                        &avd, xpd, xperm, rc, ad);
        if (rc2)
                return rc2;
        return rc;
}

/**
 * avc_perm_nonode - Add an entry to the AVC
 * @ssid: subject
 * @tsid: object/target
 * @tclass: object class
 * @requested: requested permissions
 * @flags: AVC flags
 * @avd: access vector decision
 *
 * Cache-miss half of avc_has_perm_noaudit().  It is unlikely to run and
 * needs stack space for a temporary extended-permissions node, so it is
 * kept out of line.  Returns 0 if every bit of @requested is allowed by the
 * freshly computed decision, otherwise whatever avc_denied() returns.
 */
static noinline int avc_perm_nonode(u32 ssid, u32 tsid, u16 tclass,
                                    u32 requested, unsigned int flags,
                                    struct av_decision *avd)
{
        struct avc_xperms_node scratch_xp;

        avc_compute_av(ssid, tsid, tclass, avd, &scratch_xp);

        /* Any requested bit missing from avd->allowed is a denial. */
        if (unlikely(requested & ~(avd->allowed)))
                return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0,
                                  flags, avd);

        return 0;
}

/**
 * avc_has_perm_noaudit - Check permissions but perform no auditing.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions, interpreted based on @tclass
 * @flags:  AVC_STRICT or 0
 * @avd: access vector decisions
 *
 * Consult the AVC to decide whether @requested is granted for the SID pair
 * (@ssid, @tsid), with the permissions interpreted according to @tclass.
 * On a cache miss the security server is asked for a new decision, which is
 * then added to the cache.  A copy of the decision is returned in @avd.
 *
 * Return %0 if all @requested permissions are granted, -%EACCES if any
 * permissions are denied, or another -errno upon other errors.
 *
 * Usually reached through avc_has_perm(), but it may be called directly to
 * keep the permission check separate from auditing, e.g. when a lock must
 * be held for the check but should be released for the auditing.
 */
inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
                                u16 tclass, u32 requested,
                                unsigned int flags,
                                struct av_decision *avd)
{
        struct avc_node *cached;
        u32 missing;

        if (WARN_ON(!requested))
                return -EACCES;

        rcu_read_lock();
        cached = avc_lookup(ssid, tsid, tclass);
        if (likely(cached)) {
                /* Read the node only while inside the RCU section. */
                missing = requested & ~cached->ae.avd.allowed;
                memcpy(avd, &cached->ae.avd, sizeof(*avd));
                rcu_read_unlock();

                if (unlikely(missing))
                        return avc_denied(ssid, tsid, tclass, requested,
                                          0, 0, 0, flags, avd);
                return 0;
        }
        rcu_read_unlock();

        /* Cache miss: the out-of-line slow path computes the decision. */
        return avc_perm_nonode(ssid, tsid, tclass, requested, flags, avd);
}

/**
 * avc_has_perm - Check permissions and perform any appropriate auditing.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions, interpreted based on @tclass
 * @auditdata: auxiliary audit data
 *
 * Run the permission check through avc_has_perm_noaudit() with no flags,
 * then audit the granting or denial of the permissions in accordance with
 * the policy.  Return %0 if all @requested permissions are granted,
 * -%EACCES if any permissions are denied, or another -errno upon other
 * errors.  A non-zero result from auditing takes precedence over the result
 * of the permission check.
 */
int avc_has_perm(u32 ssid, u32 tsid, u16 tclass,
                 u32 requested, struct common_audit_data *auditdata)
{
        struct av_decision avd;
        int check_rc, audit_rc;

        check_rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0,
                                        &avd);
        audit_rc = avc_audit(ssid, tsid, tclass, requested, &avd, check_rc,
                             auditdata);

        return audit_rc ? audit_rc : check_rc;
}

/*
 * avc_policy_seqno - return the latest notification sequence number
 * recorded in the AVC.  The value is read without taking any lock here.
 */
u32 avc_policy_seqno(void)
{
        return selinux_avc.avc_cache.latest_notif;
}








































































































































































































































    1 




    1 

    1 






    1 
    1 


    1 





































    1 





    1 

























    1 

























    1 




    1 







































































    1 




    1 
    1 

    1 








    1 





    1 


    1 

    1 







    1 








    1 







































































    1 
















    1 


    1 



















    1 

    1 




    1 





























    1 














    1 





    1 
    1 

    1 

    1 
    1 














    1 

    1 

























    1 


    1 



    1 




























    1 



    1 
    1 

    1 













































    1 




    1 



    1 








    1 




























    1 


















    1 






    1 




    1 







    1 



























































    1 



    1 











    1 




    1 

















































































































































































    1 


    1 


    1 







    1 



    1 































    1 








    1 

    1 





    1 


    1 


























    1 





    1 

    1 






    1 





    1 













    1 




    1 



    1 
    1 































































































































































































































































































































































    1 










    1 






    1 







    1 

    1 
























































































    1 














    1 
    1 
    1 












    1 

    1 



    1 


    1 



















    1 


















    1 
    1 

    1 



    1 
































    1 




    1 
    1 







    1 



    1 













    1 



    1 


    1 


    1 

































































































































































































































































































































































































    3 

    1 



    2 
    2 


    3 










    3 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009                SUSE Linux Products GmbH
 * Copyright (C) 2009                Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017                Facebook Inc.
 * Copyright (C) 2017                Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
 * flag should be passed.  All memcg-aware allocations are sharing one set
 * of chunks and all unaccounted allocations and allocations performed
 * by processes belonging to the root memory cgroup are using the second set.
 *
 * The allocator tries to allocate from the fullest chunk first. Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's private field.  Lastly, units are lazily backed and grow in
 * unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/*
 * The slots are sorted by the size of the biggest continuous free area.
 * __pcpu_size_to_slot() computes fls(size) - PCPU_SLOT_BASE_SHIFT + 2 and
 * clamps the result to a minimum of 1, so the smallest sizes (1-15 bytes)
 * all share slot 1.
 */
#define PCPU_SLOT_BASE_SHIFT                5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD        3

/*
 * Low and high watermarks for the number of empty populated pages that the
 * balance work tries to keep available for atomic allocations.
 */
#define PCPU_EMPTY_POP_PAGES_LOW        2
#define PCPU_EMPTY_POP_PAGES_HIGH        4

#ifdef CONFIG_SMP
/*
 * default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary
 *
 * __addr_to_pcpu_ptr() rebases @addr from pcpu_base_addr onto
 * __per_cpu_start; __pcpu_ptr_to_addr() applies the opposite adjustment.
 * Each macro is only defined here if it has not been defined already.
 */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)                                        \
        (void __percpu *)((unsigned long)(addr) -                        \
                          (unsigned long)pcpu_base_addr        +                \
                          (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)                                                \
        (void __force *)((unsigned long)(ptr) +                                \
                         (unsigned long)pcpu_base_addr -                \
                         (unsigned long)__per_cpu_start)
#endif
#else        /* CONFIG_SMP */
/* on UP, it's always identity mapped: both macros are plain casts */
#define __addr_to_pcpu_ptr(addr)        (void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)                (void __force *)(ptr)
#endif        /* CONFIG_SMP */

/*
 * Allocator geometry and chunk slot bookkeeping.
 *
 * - pcpu_unit_pages: multiplied by a cpu's unit number in pcpu_page_idx(),
 *   i.e. the per-unit stride when indexing pages.
 * - pcpu_unit_size: a size equal to this value is mapped to pcpu_free_slot
 *   by pcpu_size_to_slot().
 * - pcpu_free_slot: slot returned by pcpu_size_to_slot() for a size of
 *   pcpu_unit_size.
 * - pcpu_to_depopulate_slot: slot whose list receives chunks isolated by
 *   pcpu_isolate_chunk().
 *
 * NOTE(review): pcpu_nr_units, pcpu_atom_size, pcpu_nr_slots,
 * pcpu_sidelined_slot and pcpu_chunk_struct_size are not used in this part
 * of the file; confirm their meaning against their users.
 */
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;

/* both tables are indexed by cpu number */
static const int *pcpu_unit_map __ro_after_init;                /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;        /* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);        /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);        /* chunk create/destroy, [de]pop, map ext */

/* array of list heads, indexed by slot number (see __pcpu_chunk_move()) */
struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count, and neither do
 * isolated chunks (see pcpu_update_empty_pages() and pcpu_isolate_chunk()).
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 *
 * pcpu_async_enabled gates pcpu_schedule_balance_work(): the work is only
 * queued once the flag is set.
 * NOTE(review): pcpu_atomic_alloc_failed is not used in this part of the
 * file; presumably it records a failed atomic allocation for the balance
 * work - confirm against its users.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

/*
 * Queue pcpu_balance_work through schedule_work().  Nothing is queued
 * while pcpu_async_enabled is still false.
 */
static void pcpu_schedule_balance_work(void)
{
        if (!pcpu_async_enabled)
                return;

        schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest, may be NULL
 * @addr: percpu address
 *
 * The range served by @chunk starts at base_addr + start_offset
 * (inclusive) and ends at base_addr + nr_pages * PAGE_SIZE - end_offset
 * (exclusive).
 *
 * RETURNS:
 * True if @addr lies inside that range, false otherwise or when @chunk
 * is NULL.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
        void *first, *limit;

        if (!chunk)
                return false;

        /* reject anything below the first served byte */
        first = chunk->base_addr + chunk->start_offset;
        if (addr < first)
                return false;

        /* limit is one past the last served byte */
        limit = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
                chunk->end_offset;

        return addr < limit;
}

/*
 * Map a free area @size (in bytes) to a slot number: the slot grows by one
 * each time fls(@size) grows by one and never drops below 1.
 */
static int __pcpu_size_to_slot(int size)
{
        int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;

        /* small sizes all collapse into the first slot */
        return slot < 1 ? 1 : slot;
}

/*
 * Like __pcpu_size_to_slot(), except that a @size equal to pcpu_unit_size
 * is mapped to the dedicated pcpu_free_slot.
 */
static int pcpu_size_to_slot(int size)
{
        return size == pcpu_unit_size ? pcpu_free_slot :
                                        __pcpu_size_to_slot(size);
}

/*
 * Compute the slot @chunk belongs on from its chunk-level contig hint.
 * Slot 0 is returned when the chunk has fewer than PCPU_MIN_ALLOC_SIZE
 * free bytes or no contiguous free area at all.
 */
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
        const struct pcpu_block_md *md = &chunk->chunk_md;

        if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE)
                return 0;

        if (!md->contig_hint)
                return 0;

        /* contig_hint is kept in bits; the slot helpers take bytes */
        return pcpu_size_to_slot(md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}

/* store the chunk pointer @pcpu in the private field of @page */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
        unsigned long cookie = (unsigned long)pcpu;

        page->private = cookie;
}

/* read back the chunk pointer kept in the private field of @page */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
        unsigned long cookie = page->private;

        return (struct pcpu_chunk *)cookie;
}

/*
 * Flatten (@cpu, @page_idx) into a single page index: the unit number of
 * @cpu times pcpu_unit_pages, plus @page_idx.
 */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
        int unit = pcpu_unit_map[cpu];

        return unit * pcpu_unit_pages + page_idx;
}

/*
 * Byte offset of page @page_idx of @cpu's unit: the unit offset of @cpu
 * plus @page_idx pages.
 */
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
        unsigned long unit_off = pcpu_unit_offsets[cpu];

        return unit_off + (page_idx << PAGE_SHIFT);
}

/*
 * Address of page @page_idx in @cpu's unit of @chunk: the chunk base
 * address plus pcpu_unit_page_offset(@cpu, @page_idx).
 */
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                                     unsigned int cpu, int page_idx)
{
        unsigned long base = (unsigned long)chunk->base_addr;

        return base + pcpu_unit_page_offset(cpu, page_idx);
}

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets to address offsets.
 */
/* return the word of @chunk's alloc_map at which metadata block @index starts */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
        return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS /
                                 BITS_PER_LONG];
}

/* convert chunk bit offset @off into the index of the metadata block holding it */
static unsigned long pcpu_off_to_block_index(int off)
{
        return off / PCPU_BITMAP_BLOCK_BITS;
}

/*
 * convert chunk bit offset @off into a bit offset inside its metadata block
 *
 * NOTE(review): masking with (PCPU_BITMAP_BLOCK_BITS - 1) only yields the
 * in-block offset if that constant is a power of two - confirm against its
 * definition.
 */
static unsigned long pcpu_off_to_block_off(int off)
{
        return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}

/* combine metadata block @index and in-block bit offset @off into a chunk bit offset */
static unsigned long pcpu_block_off_to_off(int index, int off)
{
        return index * PCPU_BITMAP_BLOCK_BITS + off;
}

/**
 * pcpu_check_block_hint - check against the contig hint
 * @block: block of interest
 * @bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Tests whether @bits, placed at the first @align-aligned offset at or
 * after the start of @block's contig hint, still fits inside that hint.
 * A chunk uses the same hints as a block, so the chunk's contig hint can
 * be checked the same way.
 *
 * RETURNS:
 * True if the aligned request fits in the contig hint.
 */
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
                                  size_t align)
{
        /* bits lost at the front of the hint to reach the alignment */
        int pad = ALIGN(block->contig_hint_start, align) -
                  block->contig_hint_start;

        if (pad + bits > block->contig_hint)
                return false;

        return true;
}

/**
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * Picks the offset a scan of @block should start from.  Normally this is
 * first_free, so that allocations are served first fit.  When the
 * scan_hint is known to be too small for the request, the scan can start
 * right behind it instead, with the contig_hint as the fallback.
 *
 * RETURNS:
 * The end of the scan_hint area if it can be skipped, first_free otherwise.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
        /* no scan_hint recorded: nothing to skip */
        if (!block->scan_hint)
                return block->first_free;

        /*
         * The contig_hint must lie after the scan_hint.  This may not
         * hold when contig_hint == scan_hint.
         */
        if (block->contig_hint_start <= block->scan_hint_start)
                return block->first_free;

        /* the request might still fit in the scan_hint area */
        if (alloc_bits <= block->scan_hint)
                return block->first_free;

        return block->scan_hint_start + block->scan_hint;
}

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 *
 * On entry *@bit_off is the chunk offset to resume from; *@bits is reset
 * to 0 before the search.  If the loop runs off the end of the chunk
 * without returning, the outputs keep whatever the last iterations stored
 * (when no block is visited, *@bit_off is untouched and *@bits is 0).
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
                                     int *bits)
{
        /* split the resume offset into a block index and an in-block offset */
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /*
                 * handles contig area across blocks: a candidate carried
                 * over from the previous block's right edge is extended by
                 * this block's left_free.  It keeps growing only while
                 * whole blocks are free; otherwise it is complete.
                 */
                if (*bits) {
                        *bits += block->left_free;
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                        return;
                }

                /*
                 * This checks three things.  First is there a contig_hint to
                 * check.  Second, have we checked this hint before by
                 * comparing the block_off.  Third, is this the same as the
                 * right contig hint.  In the last case, it spills over into
                 * the next block and should be handled by the contig area
                 * across blocks code.
                 */
                *bits = block->contig_hint;
                if (*bits && block->contig_hint_start >= block_off &&
                    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
                        *bit_off = pcpu_block_off_to_off(i,
                                        block->contig_hint_start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * No contig_hint to report from this block: seed a candidate
                 * with the free area at the block's right edge.  The next
                 * iteration extends it if it is non-empty.
                 */
                *bits = block->right_free;
                *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
        }
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 *
 * On entry *@bit_off is the chunk offset to resume from.  On success
 * *@bit_off and *@bits describe the region to scan.  When nothing fits,
 * *@bit_off is set to pcpu_chunk_map_bits(@chunk), which ends
 * pcpu_for_each_fit_region.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
                                 int align, int *bit_off, int *bits)
{
        /* split the resume offset into a block index and an in-block offset */
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /*
                 * handles contig area across blocks: a candidate carried
                 * over from the previous block's right edge is extended by
                 * this block's left_free.  Return once it is big enough,
                 * keep accumulating across fully free blocks, otherwise
                 * fall through and examine this block's own hints.
                 */
                if (*bits) {
                        *bits += block->left_free;
                        if (*bits >= alloc_bits)
                                return;
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                }

                /*
                 * check block->contig_hint; *bits temporarily holds the
                 * padding needed to align the start of the contig hint
                 */
                *bits = ALIGN(block->contig_hint_start, align) -
                        block->contig_hint_start;
                /*
                 * This uses the block offset to determine if this has been
                 * checked in the prior iteration.
                 */
                if (block->contig_hint &&
                    block->contig_hint_start >= block_off &&
                    block->contig_hint >= *bits + alloc_bits) {
                        int start = pcpu_next_hint(block, alloc_bits);

                        /*
                         * Report the span from the chosen scan start up to
                         * the end of an aligned allocation placed in the
                         * contig hint.
                         */
                        *bits += alloc_bits + block->contig_hint_start -
                                 start;
                        *bit_off = pcpu_block_off_to_off(i, start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Try the free area at the block's right edge, starting at
                 * its first aligned offset.  If it is too small on its own
                 * it becomes the candidate extended by the next block.
                 */
                *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
                                 align);
                *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
                *bit_off = pcpu_block_off_to_off(i, *bit_off);
                if (*bits >= alloc_bits)
                        return;
        }

        /* no valid offsets were found - fail condition */
        *bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 *
 * @bit_off and @bits must be lvalues: their addresses are handed to the
 * helper, which updates them in place.  Both loops end once @bit_off
 * reaches pcpu_chunk_map_bits(@chunk).
 *
 * pcpu_for_each_md_free_region resumes at @bit_off + @bits + 1, one bit
 * past the end of the area just reported.  pcpu_for_each_fit_region
 * resumes at @bit_off + @bits.
 * NOTE(review): the extra + 1 presumably skips the allocated bit that
 * terminates the free area - confirm.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)                \
        for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));        \
             (bit_off) < pcpu_chunk_map_bits((chunk));                        \
             (bit_off) += (bits) + 1,                                        \
             pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
        for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits));                                      \
             (bit_off) < pcpu_chunk_map_bits((chunk));                              \
             (bit_off) += (bits),                                              \
             pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits)))

/**
 * pcpu_mem_zalloc - allocate zeroed memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  Requests of up to PAGE_SIZE bytes are served by
 * kzalloc(); anything larger goes through __vmalloc() with __GFP_ZERO
 * added, the equivalent of vzalloc().  This is to facilitate passing
 * through whitelisted flags.  The returned memory is always zeroed.
 *
 * A one-time warning is emitted if this is called before the slab
 * allocator is available.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
        if (WARN_ON_ONCE(!slab_is_available()))
                return NULL;

        /* large requests go through the vmalloc path */
        if (size > PAGE_SIZE)
                return __vmalloc(size, gfp | __GFP_ZERO);

        return kzalloc(size, gfp);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 * kvfree() is used because pcpu_mem_zalloc() may have obtained the memory
 * from either kzalloc() or __vmalloc().
 */
static void pcpu_mem_free(void *ptr)
{
        kvfree(ptr);
}

/*
 * Move @chunk onto the list of chunk slot @slot, at the head when
 * @move_front is true and at the tail otherwise.  The reserved chunk is
 * never moved.
 */
static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
                              bool move_front)
{
        struct list_head *head;

        if (chunk == pcpu_reserved_chunk)
                return;

        head = &pcpu_chunk_lists[slot];
        if (move_front)
                list_move(&chunk->list, head);
        else
                list_move_tail(&chunk->list, head);
}

/* move @chunk to the head of chunk slot @slot; the reserved chunk is left alone */
static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
        const bool to_front = true;

        __pcpu_chunk_move(chunk, slot, to_front);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * The slot matching the new state is computed and, if it differs from
 * @oslot, @chunk is moved there: to the head of the list when the new
 * slot is higher than @oslot, to the tail otherwise.  Isolated chunks
 * are left where they are.  Note that the reserved chunk is never put
 * on chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
        int nslot;

        /* isolated chunks stay in-place */
        if (chunk->isolated)
                return;

        nslot = pcpu_chunk_slot(chunk);
        if (nslot == oslot)
                return;

        __pcpu_chunk_move(chunk, nslot, nslot > oslot);
}

/*
 * Mark @chunk isolated and put it on the pcpu_to_depopulate_slot list.
 * The first time a chunk is isolated, its empty populated pages are
 * removed from the global pcpu_nr_empty_pop_pages count.  The list move
 * happens on every call.
 *
 * CONTEXT:
 * pcpu_lock, checked with lockdep_assert_held().
 */
static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
{
        struct list_head *depop_list;

        lockdep_assert_held(&pcpu_lock);

        if (!chunk->isolated) {
                pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
                chunk->isolated = true;
        }

        depop_list = &pcpu_chunk_lists[pcpu_to_depopulate_slot];
        list_move(&chunk->list, depop_list);
}

/*
 * Undo pcpu_isolate_chunk(): clear the isolated flag, add the chunk's
 * empty populated pages back to pcpu_nr_empty_pop_pages and relocate the
 * chunk to the slot matching its state.  Does nothing for a chunk that is
 * not isolated.
 *
 * CONTEXT:
 * pcpu_lock, checked with lockdep_assert_held().
 */
static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        if (!chunk->isolated)
                return;

        /* the flag must be cleared first: relocation skips isolated chunks */
        chunk->isolated = false;
        pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
        /* -1 never equals a computed slot, so the chunk is always moved */
        pcpu_chunk_relocate(chunk, -1);
}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages to add, may be negative
 *
 * This is used to keep track of the empty pages now based on the premise
 * a md_block covers a page.  The hint update functions recognize if a block
 * is made full or broken to calculate deltas for keeping track of free pages.
 *
 * The per-chunk counter is always adjusted.  The global
 * pcpu_nr_empty_pop_pages is adjusted only for chunks that are neither the
 * reserved chunk nor isolated.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
        chunk->nr_empty_pop_pages += nr;

        if (chunk == pcpu_reserved_chunk || chunk->isolated)
                return;

        pcpu_nr_empty_pop_pages += nr;
}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * This is used to determine if the hint region [a, b) overlaps with the
 * allocated region [x, y).  Regions that merely touch (b == x or y == a)
 * do not overlap.
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
        /* the second region ends at or before the first one starts */
        if (y <= a)
                return false;

        /* otherwise they overlap iff the second starts before the first ends */
        return x < b;
}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 *
 * The fields touched are first_free, left_free, right_free, contig_hint(_start)
 * and scan_hint(_start).  The free area's size (end - start) is compared
 * against the current contig_hint and handled in three cases: larger, equal,
 * or smaller.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
        int contig = end - start;

        /* first_free only ever moves towards the start of the block here */
        block->first_free = min(block->first_free, start);
        /* a free area touching the left edge defines left_free */
        if (start == 0)
                block->left_free = contig;

        /* a free area touching the right edge defines right_free */
        if (end == block->nr_bits)
                block->right_free = contig;

        if (contig > block->contig_hint) {
                /* promote the old contig_hint to be the new scan_hint */
                if (start > block->contig_hint_start) {
                        if (block->contig_hint > block->scan_hint) {
                                block->scan_hint_start =
                                        block->contig_hint_start;
                                block->scan_hint = block->contig_hint;
                        } else if (start < block->scan_hint_start) {
                                /*
                                 * The old contig_hint == scan_hint.  But, the
                                 * new contig is larger so hold the invariant
                                 * scan_hint_start < contig_hint_start.
                                 */
                                block->scan_hint = 0;
                        }
                } else {
                        /*
                         * The new contig area starts at or before the old
                         * contig_hint, so the scan_hint is dropped.
                         */
                        block->scan_hint = 0;
                }
                block->contig_hint_start = start;
                block->contig_hint = contig;
        } else if (contig == block->contig_hint) {
                /*
                 * Same size as the current contig_hint: prefer offset 0 or
                 * the offset with more trailing zero bits (__ffs), unless the
                 * current contig_hint_start is already 0.
                 */
                if (block->contig_hint_start &&
                    (!start ||
                     __ffs(start) > __ffs(block->contig_hint_start))) {
                        /* start has a better alignment so use it */
                        block->contig_hint_start = start;
                        if (start < block->scan_hint_start &&
                            block->contig_hint > block->scan_hint)
                                block->scan_hint = 0;
                } else if (start > block->scan_hint_start ||
                           block->contig_hint > block->scan_hint) {
                        /*
                         * Knowing contig == contig_hint, update the scan_hint
                         * if it is farther than or larger than the current
                         * scan_hint.
                         */
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        } else {
                /*
                 * The region is smaller than the contig_hint.  So only update
                 * the scan_hint if it is larger than or equal and farther than
                 * the current scan_hint.  The region must also start before
                 * contig_hint_start to be recorded.
                 */
                if ((start < block->contig_hint_start &&
                     (contig > block->scan_hint ||
                      (contig == block->scan_hint &&
                       start > block->scan_hint_start)))) {
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        }
}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * An allocation is placed by first picking a block with
 * pcpu_find_block_fit() and then scanning in pcpu_alloc_area().  Alignment
 * requirements can leave holes behind that neither the alloc nor the free
 * path would otherwise notice.
 *
 * Given such a hole, feed it to pcpu_block_update() as it may change the
 * scan_hint.  The start is first extended backwards to the previous set bit
 * so free bits skipped for alignment are included.  Holes that run past the
 * end of their block are ignored.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
                                   int bits)
{
        struct pcpu_block_md *md;
        int blk_idx, prev_bit;
        int area_start = pcpu_off_to_block_off(bit_off);
        int area_end = area_start + bits;

        /* only areas contained in a single block are handled */
        if (area_end > PCPU_BITMAP_BLOCK_BITS)
                return;

        blk_idx = pcpu_off_to_block_index(bit_off);
        md = &chunk->md_blocks[blk_idx];

        /*
         * Walk back to the last allocated bit before the area; if none is
         * found (the search size is returned) the area begins at bit 0.
         */
        prev_bit = find_last_bit(pcpu_index_alloc_map(chunk, blk_idx),
                                 area_start);
        if (prev_bit == area_start)
                area_start = 0;
        else
                area_start = prev_bit + 1;

        pcpu_block_update(md, area_start, area_end);
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Walks the chunk's free regions to rebuild the largest contiguous area
 * hint.  On the allocation path a full scan can be skipped because the
 * refresh is only triggered when the contig_hint was broken; the scan_hint
 * then lies before the contig_hint, or after it when scan_hint ==
 * contig_hint, and can be promoted.  On the free path a full scan is needed
 * as the largest area may span blocks.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
        struct pcpu_block_md *md = &chunk->chunk_md;
        int off, len;

        if (full_scan || !md->scan_hint) {
                /* start over from the first free bit */
                off = md->first_free;
                md->contig_hint = 0;
        } else {
                /* promote scan_hint to contig_hint and resume after it */
                off = md->scan_hint_start + md->scan_hint;
                md->contig_hint_start = md->scan_hint_start;
                md->contig_hint = md->scan_hint;
                md->scan_hint = 0;
        }

        len = 0;
        pcpu_for_each_md_free_region(chunk, off, len) {
                pcpu_block_update(md, off, off + len);
        }
}

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
        struct pcpu_block_md *block = chunk->md_blocks + index;
        unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
        unsigned int start, end;        /* region start, region end */

        /* promote scan_hint to contig_hint */
        if (block->scan_hint) {
                start = block->scan_hint_start + block->scan_hint;
                block->contig_hint_start = block->scan_hint_start;
                block->contig_hint = block->scan_hint;
                block->scan_hint = 0;
        } else {
                start = block->first_free;
                block->contig_hint = 0;
        }

        block->right_free = 0;

        /* iterate over free areas and update the contig hints */
        for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
                pcpu_block_update(block, start, end);
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 *
 * The first block (s_block), the last block (e_block) and any blocks in
 * between are handled separately; the chunk-level hints are updated last.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
                                         int bits)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the allocation */
        int s_off, e_off;        /* block offsets of the allocation */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Update s_block.
         * A block whose contig_hint covers the whole block was empty before
         * this allocation, so it stops being an empty page.
         */
        if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;

        /*
         * block->first_free must be updated if the allocation takes its place.
         * If the allocation breaks the contig_hint, a scan is required to
         * restore this hint.
         */
        if (s_off == s_block->first_free)
                s_block->first_free = find_next_zero_bit(
                                        pcpu_index_alloc_map(chunk, s_index),
                                        PCPU_BITMAP_BLOCK_BITS,
                                        s_off + bits);

        /* drop the scan_hint if the allocation overlaps it */
        if (pcpu_region_overlap(s_block->scan_hint_start,
                                s_block->scan_hint_start + s_block->scan_hint,
                                s_off,
                                s_off + bits))
                s_block->scan_hint = 0;

        if (pcpu_region_overlap(s_block->contig_hint_start,
                                s_block->contig_hint_start +
                                s_block->contig_hint,
                                s_off,
                                s_off + bits)) {
                /* block contig hint is broken - scan to fix it */
                if (!s_off)
                        s_block->left_free = 0;
                pcpu_block_refresh_hint(chunk, s_index);
        } else {
                /* update left and right contig manually */
                s_block->left_free = min(s_block->left_free, s_off);
                if (s_index == e_index)
                        s_block->right_free = min_t(int, s_block->right_free,
                                        PCPU_BITMAP_BLOCK_BITS - e_off);
                else
                        s_block->right_free = 0;
        }

        /*
         * Update e_block.
         */
        if (s_index != e_index) {
                if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;

                /*
                 * When the allocation is across blocks, the end is along
                 * the left part of the e_block.
                 */
                e_block->first_free = find_next_zero_bit(
                                pcpu_index_alloc_map(chunk, e_index),
                                PCPU_BITMAP_BLOCK_BITS, e_off);

                if (e_off == PCPU_BITMAP_BLOCK_BITS) {
                        /*
                         * The allocation fills the last block entirely.
                         * Advance e_block so the in-between loop below also
                         * resets this block.
                         */
                        e_block++;
                } else {
                        if (e_off > e_block->scan_hint_start)
                                e_block->scan_hint = 0;

                        e_block->left_free = 0;
                        if (e_off > e_block->contig_hint_start) {
                                /* contig hint is broken - scan to fix it */
                                pcpu_block_refresh_hint(chunk, e_index);
                        } else {
                                e_block->right_free =
                                        min_t(int, e_block->right_free,
                                              PCPU_BITMAP_BLOCK_BITS - e_off);
                        }
                }

                /*
                 * update in-between md_blocks: each block strictly between
                 * s_index and e_index is counted as one formerly empty page
                 * and its hints are zeroed.
                 */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->scan_hint = 0;
                        block->contig_hint = 0;
                        block->left_free = 0;
                        block->right_free = 0;
                }
        }

        /*
         * If the allocation is not atomic, some blocks may not be
         * populated with pages, while we account it here.  The number
         * of pages will be added back with pcpu_chunk_populated()
         * when populating pages.
         */
        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, -nr_empty_pages);

        /* drop the chunk scan_hint if the allocation overlaps it */
        if (pcpu_region_overlap(chunk_md->scan_hint_start,
                                chunk_md->scan_hint_start +
                                chunk_md->scan_hint,
                                bit_off,
                                bit_off + bits))
                chunk_md->scan_hint = 0;

        /*
         * The only time a full chunk scan is required is if the chunk
         * contig hint is broken.  Otherwise, it means a smaller space
         * was used and therefore the chunk contig hint is still correct.
         */
        if (pcpu_region_overlap(chunk_md->contig_hint_start,
                                chunk_md->contig_hint_start +
                                chunk_md->contig_hint,
                                bit_off,
                                bit_off + bits))
                pcpu_chunk_refresh_hint(chunk, false);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
                                        int bits)
{
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the freed allocation */
        int s_off, e_off;        /* block offsets of the freed allocation */
        int start, end;                /* start and end of the whole free area */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Check if the freed area aligns with the block->contig_hint.
         * If it does, then the scan to find the beginning/end of the
         * larger free area can be avoided.
         *
         * start and end refer to beginning and end of the free area
         * within each their respective blocks.  This is not necessarily
         * the entire free area as it may span blocks past the beginning
         * or end of the block.
         */
        start = s_off;
        if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
                /* freed area directly follows the contig_hint: merge them */
                start = s_block->contig_hint_start;
        } else {
                /*
                 * Scan backwards to find the extent of the free area.
                 * find_last_bit returns the search size (here, start) when
                 * no set bit is found, which means everything before start
                 * in this block is free.
                 */
                int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
                                          start);
                start = (start == l_bit) ? 0 : l_bit + 1;
        }

        end = e_off;
        if (e_off == e_block->contig_hint_start)
                /* freed area directly precedes the contig_hint: merge them */
                end = e_block->contig_hint_start + e_block->contig_hint;
        else
                end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
                                    PCPU_BITMAP_BLOCK_BITS, end);

        /* update s_block */
        e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
        if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;
        pcpu_block_update(s_block, start, e_off);

        /* the free spans more than one block */
        if (s_index != e_index) {
                /* update e_block */
                if (end == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;
                pcpu_block_update(e_block, 0, end);

                /* reset md_blocks in the middle: they are now fully free */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->first_free = 0;
                        block->scan_hint = 0;
                        block->contig_hint_start = 0;
                        block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
                        block->left_free = PCPU_BITMAP_BLOCK_BITS;
                        block->right_free = PCPU_BITMAP_BLOCK_BITS;
                }
        }

        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, nr_empty_pages);

        /*
         * Refresh chunk metadata when the free makes a block free or spans
         * across blocks.  The contig_hint may be off by up to a page, but if
         * the contig_hint is contained in a block, it will be accurate with
         * the else condition below.
         *
         * In the else branch s_index == e_index, so start and end are both
         * offsets within block s_index.  The chunk metadata is indexed by
         * chunk offsets, so both ends must be converted; passing a
         * block-relative end with a chunk-relative start yields a negative
         * size for every block but the first and the update is lost.
         */
        if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
                pcpu_chunk_refresh_hint(chunk, true);
        else
                pcpu_block_update(&chunk->chunk_md,
                                  pcpu_block_off_to_off(s_index, start),
                                  pcpu_block_off_to_off(s_index, end));
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * true if every page backing the region is populated; @next_off is left
 * untouched in that case.  Otherwise false, with @next_off set to the
 * allocation-unit offset of the next populated page found after the first
 * unpopulated one (or of the end of the region if there is none), so
 * pcpu_find_block_fit() can skip over the unpopulated pages.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
                              int *next_off)
{
        unsigned int pg_first = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
        unsigned int pg_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
        unsigned int hole;

        /* first unpopulated page within [pg_first, pg_end), if any */
        hole = find_next_zero_bit(chunk->populated, pg_end, pg_first);
        if (hole >= pg_end)
                return true;

        /* next populated page after the hole, capped at the region end */
        pg_end = find_next_bit(chunk->populated, pg_end, hole + 1);

        *next_off = pg_end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
        return false;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset at which to begin
 * searching for a free region.  The bitmap metadata blocks are walked to
 * find an offset that is guaranteed to fit the requirements.  This is not
 * quite first fit: an allocation that does not fit in the contig hint of a
 * block or chunk is skipped, which errs on the side of caution to prevent
 * excess iteration.  Poor alignment can cause the allocator to skip over
 * blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
                               size_t align, bool pop_only)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off, bits, next_off;

        /*
         * Optimization: if the request cannot fit in the chunk-wide hint,
         * skip scanning on the assumption that there is memory pressure and
         * a new chunk will be created soon.
         */
        if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
                return -1;

        bit_off = pcpu_next_hint(chunk_md, alloc_bits);
        bits = 0;
        pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
                /* any fitting region will do when population is irrelevant */
                if (!pop_only)
                        break;

                if (pcpu_is_populated(chunk, bit_off, bits, &next_off))
                        break;

                /* skip ahead past the unpopulated pages and keep looking */
                bit_off = next_off;
                bits = 0;
        }

        /* walking off the end of the map means nothing fit */
        return (bit_off == pcpu_chunk_map_bits(chunk)) ? -1 : bit_off;
}

/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * A variant of bitmap_find_next_zero_area_off() that also remembers the
 * largest area it had to skip.  This is imperfect but generally good enough:
 * the remembered region is the largest failed candidate seen and does not
 * include bits skipped purely for alignment.  pcpu_block_update_scan() scans
 * backwards to try and recover what was lost to alignment.  While this can
 * cause scanning to miss earlier possible free areas, smaller allocations
 * will eventually fill those holes.
 *
 * RETURNS:
 * The aligned offset of a run of @nr clear bits, or a value greater than
 * @size if the candidate would run past the end of the bitmap.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
                                         unsigned long size,
                                         unsigned long start,
                                         unsigned long nr,
                                         unsigned long align_mask,
                                         unsigned long *largest_off,
                                         unsigned long *largest_bits)
{
        unsigned long cand, cand_end, blocker, skipped;

        for (;;) {
                /* next clear bit, rounded up to the requested alignment */
                cand = find_next_zero_bit(map, size, start);
                cand = __ALIGN_MASK(cand, align_mask);

                cand_end = cand + nr;
                if (cand_end > size)
                        return cand_end;

                /* a set bit inside the candidate window disqualifies it */
                blocker = find_next_bit(map, cand_end, cand);
                if (blocker >= cand_end)
                        return cand;

                skipped = blocker - cand;
                /* remember largest unused area with best alignment */
                if (skipped > *largest_bits ||
                    (skipped == *largest_bits && *largest_off &&
                     (!cand || __ffs(cand) > __ffs(*largest_off)))) {
                        *largest_off = cand;
                        *largest_bits = skipped;
                }

                /* resume the search just past the blocking bit */
                start = blocker + 1;
        }
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * Starting at @start, search for room for @alloc_bits bits with alignment
 * @align.  The allocation map has to be scanned because, when the request
 * fits within the block's contig hint, @start will be block->first_free;
 * this is an attempt to fill the allocation prior to breaking the contig
 * hint.  Once a valid free area is confirmed, the allocation and boundary
 * maps, free_bytes and the hints are updated.  pcpu_lock must be held.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
                           size_t align, int start)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        unsigned long hole_off = 0, hole_bits = 0;
        size_t align_mask = 0;
        int bit_off, scan_end, prev_slot;

        if (align)
                align_mask = align - 1;

        lockdep_assert_held(&pcpu_lock);

        /* remember the slot so the chunk can be relocated afterwards */
        prev_slot = pcpu_chunk_slot(chunk);

        /*
         * Search to find a fit.  The scan window is limited to the request
         * plus one block past @start, capped at the end of the map.
         */
        scan_end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
                         pcpu_chunk_map_bits(chunk));
        bit_off = pcpu_find_zero_area(chunk->alloc_map, scan_end, start,
                                      alloc_bits, align_mask, &hole_off,
                                      &hole_bits);
        if (bit_off >= scan_end)
                return -1;

        /* record the largest hole the scan skipped over, if any */
        if (hole_bits)
                pcpu_block_update_scan(chunk, hole_off, hole_bits);

        /* update alloc map */
        bitmap_set(chunk->alloc_map, bit_off, alloc_bits);

        /* update boundary map: mark both ends, clear everything between */
        set_bit(bit_off, chunk->bound_map);
        bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
        set_bit(bit_off + alloc_bits, chunk->bound_map);

        chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

        /* update first free bit */
        if (chunk_md->first_free == bit_off)
                chunk_md->first_free = find_next_zero_bit(
                                        chunk->alloc_map,
                                        pcpu_chunk_map_bits(chunk),
                                        bit_off + alloc_bits);

        pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);

        pcpu_chunk_relocate(chunk, prev_slot);

        return bit_off * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * The size of the allocation is derived from the boundary bitmap: it runs
 * from @off's bit up to the next set boundary bit.  Those bits are cleared
 * in the allocation map and the chunk's free_bytes, first_free and hints
 * are updated.  pcpu_lock must be held.
 *
 * RETURNS:
 * Number of freed bytes.
 */
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int first_bit, nr_bits, bound, prev_slot, freed;

        lockdep_assert_held(&pcpu_lock);
        pcpu_stats_area_dealloc(chunk);

        prev_slot = pcpu_chunk_slot(chunk);

        first_bit = off / PCPU_MIN_ALLOC_SIZE;

        /* the next boundary bit marks the end of this allocation */
        bound = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
                              first_bit + 1);
        nr_bits = bound - first_bit;
        bitmap_clear(chunk->alloc_map, first_bit, nr_bits);

        freed = nr_bits * PCPU_MIN_ALLOC_SIZE;

        /* update metadata */
        chunk->free_bytes += freed;

        /* update first free bit */
        if (first_bit < chunk_md->first_free)
                chunk_md->first_free = first_bit;

        pcpu_block_update_hint_free(chunk, first_bit, nr_bits);

        pcpu_chunk_relocate(chunk, prev_slot);

        return freed;
}

/*
 * pcpu_init_md_block - initialize a metadata block as entirely free
 * @block: block to initialize
 * @nr_bits: number of bits the block covers
 *
 * Sets the contig hint and the left/right free counts to @nr_bits and clears
 * the scan hint and first_free.  contig_hint_start and scan_hint_start are
 * not written here.
 */
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
        block->nr_bits = nr_bits;
        block->first_free = 0;

        /* the whole block is one contiguous free area */
        block->contig_hint = nr_bits;
        block->left_free = nr_bits;
        block->right_free = nr_bits;

        block->scan_hint = 0;
}

static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
        struct pcpu_block_md *md_block;

        /* init the chunk's block */
        pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

        for (md_block = chunk->md_blocks;
             md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
             md_block++)
                pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * The padding before @tmp_addr (start_offset) and after the served region
 * (end_offset) is marked as allocated in the bitmaps so it is never handed
 * out.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
                                                         int map_size)
{
        struct pcpu_chunk *chunk;
        unsigned long aligned_addr;
        int start_offset, offset_bits, region_size, region_bits;
        size_t alloc_size;

        /* region calculations */
        aligned_addr = tmp_addr & PAGE_MASK;

        start_offset = tmp_addr - aligned_addr;
        region_size = ALIGN(start_offset + map_size, PAGE_SIZE);

        /*
         * allocate chunk, including the trailing populated bitmap
         * NOTE(review): no NULL checks follow any memblock_alloc_or_panic()
         * call here; presumably it does not return on failure - confirm.
         */
        alloc_size = struct_size(chunk, populated,
                                 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
        chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        INIT_LIST_HEAD(&chunk->list);

        chunk->base_addr = (void *)aligned_addr;
        chunk->start_offset = start_offset;
        chunk->end_offset = region_size - chunk->start_offset - map_size;

        chunk->nr_pages = region_size >> PAGE_SHIFT;
        region_bits = pcpu_chunk_map_bits(chunk);

        alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        /* one extra bit: bound_map also marks the end at bit region_bits */
        alloc_size =
                BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
#ifdef NEED_PCPUOBJ_EXT
        /* first chunk is free to use */
        chunk->obj_exts = NULL;
#endif
        pcpu_init_md_blocks(chunk);

        /* manage populated page bitmap: every page is marked populated */
        chunk->immutable = true;
        bitmap_fill(chunk->populated, chunk->nr_pages);
        chunk->nr_populated = chunk->nr_pages;
        chunk->nr_empty_pop_pages = chunk->nr_pages;

        /* only the served region counts as free, not the padding */
        chunk->free_bytes = map_size;

        if (chunk->start_offset) {
                /* hide the beginning of the bitmap */
                offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map, 0, offset_bits);
                set_bit(0, chunk->bound_map);
                set_bit(offset_bits, chunk->bound_map);

                chunk->chunk_md.first_free = offset_bits;

                /* account the hidden head as an allocation in the hints */
                pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
        }

        if (chunk->end_offset) {
                /* hide the end of the bitmap */
                offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map,
                           pcpu_chunk_map_bits(chunk) - offset_bits,
                           offset_bits);
                set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
                        chunk->bound_map);
                set_bit(region_bits, chunk->bound_map);

                /* account the hidden tail as an allocation in the hints */
                pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
                                             - offset_bits, offset_bits);
        }

        return chunk;
}

/*
 * pcpu_alloc_chunk - allocate and initialize an empty chunk descriptor
 * @gfp: allocation flags passed to pcpu_mem_zalloc()
 *
 * Allocates the chunk structure together with its allocation map, boundary
 * map, metadata blocks and, when need_pcpuobj_ext() says so, the obj_exts
 * array.  The metadata is initialized to describe a fully free chunk of
 * pcpu_unit_pages pages.  On any allocation failure everything allocated so
 * far is released.
 *
 * RETURNS:
 * The new chunk, or NULL on allocation failure.
 */
static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
        struct pcpu_chunk *chunk;
        int region_bits;

        chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
        if (!chunk)
                return NULL;

        INIT_LIST_HEAD(&chunk->list);
        chunk->nr_pages = pcpu_unit_pages;
        region_bits = pcpu_chunk_map_bits(chunk);

        chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
                                           sizeof(chunk->alloc_map[0]), gfp);
        if (!chunk->alloc_map)
                goto err_free_chunk;

        /* the boundary map carries one extra bit for the end marker */
        chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
                                           sizeof(chunk->bound_map[0]), gfp);
        if (!chunk->bound_map)
                goto err_free_alloc_map;

        chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
                                           sizeof(chunk->md_blocks[0]), gfp);
        if (!chunk->md_blocks)
                goto err_free_bound_map;

#ifdef NEED_PCPUOBJ_EXT
        if (need_pcpuobj_ext()) {
                chunk->obj_exts =
                        pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
                                        sizeof(struct pcpuobj_ext), gfp);
                if (!chunk->obj_exts)
                        goto err_free_md_blocks;
        }
#endif

        pcpu_init_md_blocks(chunk);

        /* init metadata */
        chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

        return chunk;

        /* unwind in reverse order of allocation */
#ifdef NEED_PCPUOBJ_EXT
err_free_md_blocks:
        pcpu_mem_free(chunk->md_blocks);
#endif
err_free_bound_map:
        pcpu_mem_free(chunk->bound_map);
err_free_alloc_map:
        pcpu_mem_free(chunk->alloc_map);
err_free_chunk:
        pcpu_mem_free(chunk);

        return NULL;
}

/*
 * Release a chunk descriptor and the metadata arrays hanging off it.
 * A NULL @chunk is accepted and ignored.
 */
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
        if (chunk) {
#ifdef NEED_PCPUOBJ_EXT
                pcpu_mem_free(chunk->obj_exts);
#endif
                pcpu_mem_free(chunk->md_blocks);
                pcpu_mem_free(chunk->bound_map);
                pcpu_mem_free(chunk->alloc_map);
                /* the descriptor itself goes last */
                pcpu_mem_free(chunk);
        }
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: first page of the populated range
 * @page_end: page just past the populated range
 *
 * Record that pages [@page_start,@page_end) of @chunk are now populated:
 * mark them in the chunk's populated bitmap, bump the per-chunk and
 * global populated counters and update the empty page accounting.  Must
 * be called with pcpu_lock held after each successful population.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
                                 int page_end)
{
        const int nr_pages = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        bitmap_set(chunk->populated, page_start, nr_pages);

        pcpu_nr_populated += nr_pages;
        chunk->nr_populated += nr_pages;

        pcpu_update_empty_pages(chunk, nr_pages);
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: first page of the depopulated range
 * @page_end: page just past the depopulated range
 *
 * Record that pages [@page_start,@page_end) of @chunk are no longer
 * populated: clear them in the chunk's populated bitmap, lower the
 * per-chunk and global populated counters and update the empty page
 * accounting.  Must be called with pcpu_lock held after each successful
 * depopulation.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
                                   int page_start, int page_end)
{
        const int nr_pages = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        bitmap_clear(chunk->populated, page_start, nr_pages);

        pcpu_nr_populated -= nr_pages;
        chunk->nr_populated -= nr_pages;

        pcpu_update_empty_pages(chunk, -nr_pages);
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together (percpu-km.c or percpu-vm.c,
 * selected by CONFIG_NEED_PER_CPU_KM below).  The following functions
 * must be implemented by that file.
 *
 * pcpu_populate_chunk          - populate the specified range of a chunk
 * pcpu_depopulate_chunk        - depopulate the specified range of a chunk
 * pcpu_post_unmap_tlb_flush    - flush tlb for the specified range of a chunk
 * pcpu_create_chunk            - create a new chunk
 * pcpu_destroy_chunk           - destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page            - translate address to its struct page
 * pcpu_verify_alloc_info       - check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end);
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
                                      int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * Internal lookup used for everything except static allocations; static
 * percpu address values should never be passed into the allocator.  The
 * first chunk and the reserved chunk are checked directly, any other
 * address is resolved through the page backing it.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
        void *unit_addr;

        /* dynamic region of the first chunk? */
        if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
                return pcpu_first_chunk;

        /* reserved region? */
        if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
                return pcpu_reserved_chunk;

        /*
         * @addr is relative to unit0, which might be unused and thus
         * unmapped.  Shift it into the unit space of the current
         * processor before the vmalloc space lookup.  Any possible cpu
         * id works here, so preemption and cpu hotplug are not a
         * concern.
         */
        unit_addr = addr + pcpu_unit_offsets[raw_smp_processor_id()];

        return pcpu_get_page_chunk(pcpu_addr_to_page(unit_addr));
}

#ifdef CONFIG_MEMCG
/*
 * Charge a prospective percpu allocation of @size bytes to the current
 * task's object cgroup.  Returns false only if the charge was attempted
 * and refused.  On a successful charge the cgroup is handed back through
 * @objcgp; when no charge applies @objcgp is left untouched.
 */
static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
                                      struct obj_cgroup **objcgp)
{
        struct obj_cgroup *cg;

        /* only accounted allocations with kmem accounting online are charged */
        if (!(memcg_kmem_online() && (gfp & __GFP_ACCOUNT)))
                return true;

        cg = current_obj_cgroup();
        if (cg == NULL)
                return true;

        if (obj_cgroup_charge(cg, gfp, pcpu_obj_full_size(size)) != 0)
                return false;

        *objcgp = cg;
        return true;
}

/*
 * Finish the accounting started by pcpu_memcg_pre_alloc_hook().  With a
 * usable @chunk the cgroup is recorded in the chunk's per-object
 * extension slot for @off and the percpu memcg statistic is raised;
 * without one (a NULL chunk or no extension array) the earlier charge
 * is given back.
 */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
        if (!objcg)
                return;

        /* nowhere to remember the owner: undo the charge */
        if (unlikely(!chunk || !chunk->obj_exts)) {
                obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
                return;
        }

        /* the slot holds its own reference on the cgroup */
        obj_cgroup_get(objcg);
        chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg;

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
                        pcpu_obj_full_size(size));
        rcu_read_unlock();
}

/*
 * Undo the memcg accounting of the area at @off in @chunk when it is
 * freed: clear the recorded cgroup, uncharge it, lower the percpu memcg
 * statistic and drop the reference held by the slot.  Does nothing if
 * the chunk has no extension array or the slot has no cgroup.
 */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        const int idx = off >> PCPU_MIN_ALLOC_SHIFT;
        struct obj_cgroup *cg;

        if (unlikely(!chunk->obj_exts))
                return;

        cg = chunk->obj_exts[idx].cgroup;
        if (cg == NULL)
                return;

        chunk->obj_exts[idx].cgroup = NULL;

        obj_cgroup_uncharge(cg, pcpu_obj_full_size(size));

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(cg), MEMCG_PERCPU_B,
                        -pcpu_obj_full_size(size));
        rcu_read_unlock();

        obj_cgroup_put(cg);
}

#else /* CONFIG_MEMCG */
/*
 * !CONFIG_MEMCG stubs: there is no accounting to perform, so the
 * pre-allocation hook always allows the allocation and the other hooks
 * do nothing.
 */
static bool
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
        /* never refuse an allocation; @objcgp is left untouched */
        return true;
}

static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
}

static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MEM_ALLOC_PROFILING
/*
 * Attribute a new allocation of @size bytes at @off in @chunk to the
 * current task's allocation tag.  Skipped when memory allocation
 * profiling is disabled or the chunk has no extension array.
 */
static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
                                      size_t size)
{
        if (!mem_alloc_profiling_enabled() || unlikely(!chunk->obj_exts))
                return;

        alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
                      current->alloc_tag, size);
}

/*
 * Remove @size bytes from the allocation tag recorded for the area at
 * @off in @chunk.  Skipped when memory allocation profiling is disabled
 * or the chunk has no extension array.
 */
static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        if (!mem_alloc_profiling_enabled() || unlikely(!chunk->obj_exts))
                return;

        alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
}
#else
/*
 * !CONFIG_MEM_ALLOC_PROFILING stubs: no allocation tags are tracked, so
 * both hooks are empty.
 */
static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
                                      size_t size)
{
}

static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif

/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp does
 * not allow blocking (see gfpflags_allow_blocking()), the allocation is
 * atomic: no new chunk is created and no pages are populated.  If @gfp
 * has __GFP_NOWARN then no warning will be triggered on invalid or failed
 * allocation requests.
 *
 * The returned area is zeroed for every possible CPU.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
                                 gfp_t gfp)
{
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
        struct obj_cgroup *objcg = NULL;
        /* caps the number of failure warnings printed over the system's life */
        static atomic_t warn_limit = ATOMIC_INIT(10);
        struct pcpu_chunk *chunk, *next;
        const char *err;
        int slot, off, cpu, ret;
        unsigned long flags;
        void __percpu *ptr;
        size_t bits, bit_align;

        gfp = current_gfp_context(gfp);
        /* whitelisted flags that can be passed to the backing allocators */
        pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
        is_atomic = !gfpflags_allow_blocking(gfp);
        do_warn = !(gfp & __GFP_NOWARN);

        /*
         * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
         * therefore alignment must be a minimum of that many bytes.
         * An allocation may have internal fragmentation from rounding up
         * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
         */
        if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
                align = PCPU_MIN_ALLOC_SIZE;

        /* the bitmaps work in units of PCPU_MIN_ALLOC_SIZE bytes */
        size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
        bits = size >> PCPU_MIN_ALLOC_SHIFT;
        bit_align = align >> PCPU_MIN_ALLOC_SHIFT;

        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
                     !is_power_of_2(align))) {
                WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
                     size, align);
                return NULL;
        }

        /*
         * Charge up front.  From here on every failure path must call
         * pcpu_memcg_post_alloc_hook() with a NULL chunk so that the
         * charge is given back.
         */
        if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
                return NULL;

        /* atomic allocations never take pcpu_alloc_mutex */
        if (!is_atomic) {
                /*
                 * pcpu_balance_workfn() allocates memory under this mutex,
                 * and it may wait for memory reclaim. Allow current task
                 * to become OOM victim, in case of memory pressure.
                 */
                if (gfp & __GFP_NOFAIL) {
                        mutex_lock(&pcpu_alloc_mutex);
                } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
                        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
                        return NULL;
                }
        }

        spin_lock_irqsave(&pcpu_lock, flags);

        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;

                off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
                if (off < 0) {
                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }

                off = pcpu_alloc_area(chunk, bits, bit_align, off);
                if (off >= 0)
                        goto area_found;

                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }

restart:
        /* search through normal chunks */
        for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
                list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
                                         list) {
                        off = pcpu_find_block_fit(chunk, bits, bit_align,
                                                  is_atomic);
                        if (off < 0) {
                                if (slot < PCPU_SLOT_FAIL_THRESHOLD)
                                        pcpu_chunk_move(chunk, 0);
                                continue;
                        }

                        off = pcpu_alloc_area(chunk, bits, bit_align, off);
                        if (off >= 0) {
                                pcpu_reintegrate_chunk(chunk);
                                goto area_found;
                        }
                }
        }

        /* nothing fit; pcpu_lock must be dropped before creating a chunk */
        spin_unlock_irqrestore(&pcpu_lock, flags);

        if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        /* No space left.  Create a new chunk. */
        if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
                chunk = pcpu_create_chunk(pcpu_gfp);
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1);
        } else {
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        /* pcpu_lock is held again on both branches; rescan the slots */
        goto restart;

area_found:
        pcpu_stats_area_alloc(chunk, size);

        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();

        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* populate if not all pages are already there */
        if (!is_atomic) {
                unsigned int page_end, rs, re;

                rs = PFN_DOWN(off);
                page_end = PFN_UP(off + size);

                for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
                        WARN_ON(chunk->immutable);

                        /* called without pcpu_lock; only pcpu_alloc_mutex is held */
                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);

                        spin_lock_irqsave(&pcpu_lock, flags);
                        if (ret) {
                                /* give the area back; fail_unlock drops pcpu_lock */
                                pcpu_free_area(chunk, off);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re);
                        spin_unlock_irqrestore(&pcpu_lock, flags);
                }

                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* clear the areas and return address relative to base address */
        for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

        ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp);

        trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
                                  chunk->base_addr, off, ptr,
                                  pcpu_obj_full_size(size), gfp);

        pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);

        pcpu_alloc_tag_alloc_hook(chunk, off, size);

        return ptr;

fail_unlock:
        /* entered with pcpu_lock held */
        spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
        /* entered without pcpu_lock; non-atomic callers still hold the mutex */
        trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);

        if (do_warn) {
                int remaining = atomic_dec_if_positive(&warn_limit);

                if (remaining >= 0) {
                        pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
                                size, align, is_atomic, err);
                        if (!is_atomic)
                                dump_stack();
                        if (remaining == 0)
                                pr_info("limit reached, disable warning\n");
                }
        }

        if (is_atomic) {
                /* see the flag handling in pcpu_balance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* NULL chunk: return the charge taken by the pre-allocation hook */
        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);

        return NULL;
}
EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);

/**
 * pcpu_balance_free - manage the amount of free chunks
 * @empty_only: free chunks only if there are no populated pages
 *
 * If empty_only is %false, reclaim all fully free chunks regardless of the
 * number of populated pages.  Otherwise, only reclaim chunks that have no
 * populated pages.  The first chunk on the free list is always kept.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_free(bool empty_only)
{
        /* private list of chunks selected for destruction */
        LIST_HEAD(to_free);
        struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
        struct pcpu_chunk *chunk, *next;

        lockdep_assert_held(&pcpu_lock);

        /*
         * There's no reason to keep around multiple unused chunks and VM
         * areas can be scarce.  Destroy all free chunks except for one.
         */
        list_for_each_entry_safe(chunk, next, free_head, list) {
                WARN_ON(chunk->immutable);

                /* spare the first one */
                if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                        continue;

                if (!empty_only || chunk->nr_empty_pop_pages == 0)
                        list_move(&chunk->list, &to_free);
        }

        if (list_empty(&to_free))
                return;

        /*
         * The selected chunks are off the slot lists now.  Depopulate and
         * destroy them without pcpu_lock, retaking it only around the
         * bookkeeping update.
         */
        spin_unlock_irq(&pcpu_lock);
        list_for_each_entry_safe(chunk, next, &to_free, list) {
                unsigned int rs, re;

                for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
                        pcpu_depopulate_chunk(chunk, rs, re);
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_depopulated(chunk, rs, re);
                        spin_unlock_irq(&pcpu_lock);
                }
                pcpu_destroy_chunk(chunk);
                cond_resched();
        }
        /* return with pcpu_lock held, as on entry */
        spin_lock_irq(&pcpu_lock);
}

/**
 * pcpu_balance_populated - manage the amount of populated pages
 *
 * Maintain a certain amount of populated pages to satisfy atomic allocations.
 * It is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_populated(void)
{
        /* gfp flags passed to underlying allocators */
        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
        struct pcpu_chunk *chunk;
        int slot, nr_to_pop, ret;

        lockdep_assert_held(&pcpu_lock);

        /*
         * Ensure there are certain number of free populated pages for
         * atomic allocs.  Fill up from the most packed so that atomic
         * allocs don't increase fragmentation.  If atomic allocation
         * failed previously, always populate the maximum amount.  This
         * should prevent atomic allocs larger than PAGE_SIZE from keeping
         * failing indefinitely; however, large atomic allocs are not
         * something we support properly and can be highly unreliable and
         * inefficient.
         */
retry_pop:
        if (pcpu_atomic_alloc_failed) {
                nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
                /* best effort anyway, don't worry about synchronization */
                pcpu_atomic_alloc_failed = false;
        } else {
                /* top up to the high watermark, never a negative amount */
                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
                                  pcpu_nr_empty_pop_pages,
                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
        }

        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
                unsigned int nr_unpop = 0, rs, re;

                if (!nr_to_pop)
                        break;

                /* pick the first chunk in this slot with unpopulated pages */
                list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
                        nr_unpop = chunk->nr_pages - chunk->nr_populated;
                        if (nr_unpop)
                                break;
                }

                /* every chunk in this slot is fully populated (or slot empty) */
                if (!nr_unpop)
                        continue;

                /* @chunk can't go away while pcpu_alloc_mutex is held */
                for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
                        int nr = min_t(int, re - rs, nr_to_pop);

                        /* population runs without pcpu_lock */
                        spin_unlock_irq(&pcpu_lock);
                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);
                        if (!ret) {
                                nr_to_pop -= nr;
                                pcpu_chunk_populated(chunk, rs, rs + nr);
                        } else {
                                /* give up on the first failure */
                                nr_to_pop = 0;
                        }

                        if (!nr_to_pop)
                                break;
                }
        }

        if (nr_to_pop) {
                /* ran out of chunks to populate, create a new one and retry */
                spin_unlock_irq(&pcpu_lock);
                chunk = pcpu_create_chunk(gfp);
                cond_resched();
                spin_lock_irq(&pcpu_lock);
                if (chunk) {
                        pcpu_chunk_relocate(chunk, -1);
                        goto retry_pop;
                }
        }
}

/**
 * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
 *
 * Scan over chunks in the depopulate list and try to release unused populated
 * pages back to the system.  Depopulated chunks are sidelined to prevent
 * repopulating these pages unless required.  Fully free chunks are reintegrated
 * and freed accordingly (1 is kept around).  If we drop below the empty
 * populated pages threshold, reintegrate the chunk if it has empty free pages.
 * Each chunk is scanned in the reverse order to keep populated pages close to
 * the beginning of the chunk.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 *
 */
static void pcpu_reclaim_populated(void)
{
        struct pcpu_chunk *chunk;
        struct pcpu_block_md *block;
        /* union of all page ranges depopulated in the current chunk */
        int freed_page_start, freed_page_end;
        /* @end is the last page of the range being built, -1 when none */
        int i, end;
        bool reintegrate;

        lockdep_assert_held(&pcpu_lock);

        /*
         * Once a chunk is isolated to the to_depopulate list, the chunk is no
         * longer discoverable to allocations whom may populate pages.  The only
         * other accessor is the free path which only returns area back to the
         * allocator not touching the populated bitmap.
         */
        while ((chunk = list_first_entry_or_null(
                        &pcpu_chunk_lists[pcpu_to_depopulate_slot],
                        struct pcpu_chunk, list))) {
                WARN_ON(chunk->immutable);

                /*
                 * Scan chunk's pages in the reverse order to keep populated
                 * pages close to the beginning of the chunk.
                 */
                /* start with an empty (start > end) freed range */
                freed_page_start = chunk->nr_pages;
                freed_page_end = 0;
                reintegrate = false;
                for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
                        /* no more work to do */
                        if (chunk->nr_empty_pop_pages == 0)
                                break;

                        /* reintegrate chunk to prevent atomic alloc failures */
                        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
                                reintegrate = true;
                                break;
                        }

                        /*
                         * If the page is empty and populated, start or
                         * extend the (i, end) range.  If i == 0, decrease
                         * i and perform the depopulation to cover the last
                         * (first) page in the chunk.
                         */
                        block = chunk->md_blocks + i;
                        if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
                            test_bit(i, chunk->populated)) {
                                if (end == -1)
                                        end = i;
                                if (i > 0)
                                        continue;
                                i--;
                        }

                        /* depopulate if there is an active range */
                        if (end == -1)
                                continue;

                        /* pages (i, end] go away; done without pcpu_lock */
                        spin_unlock_irq(&pcpu_lock);
                        pcpu_depopulate_chunk(chunk, i + 1, end + 1);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);

                        pcpu_chunk_depopulated(chunk, i + 1, end + 1);
                        freed_page_start = min(freed_page_start, i + 1);
                        freed_page_end = max(freed_page_end, end + 1);

                        /* reset the range and continue */
                        end = -1;
                }

                /* batch tlb flush per chunk to amortize cost */
                if (freed_page_start < freed_page_end) {
                        spin_unlock_irq(&pcpu_lock);
                        pcpu_post_unmap_tlb_flush(chunk,
                                                  freed_page_start,
                                                  freed_page_end);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);
                }

                /*
                 * Either way the chunk leaves the to_depopulate list, which
                 * is what lets the enclosing while loop make progress.
                 */
                if (reintegrate || chunk->free_bytes == pcpu_unit_size)
                        pcpu_reintegrate_chunk(chunk);
                else
                        list_move_tail(&chunk->list,
                                       &pcpu_chunk_lists[pcpu_sidelined_slot]);
        }
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Work item body that rebalances the allocator: it trims surplus free
 * chunks, reclaims empty populated pages, repopulates pages for atomic
 * allocations and finally trims free chunks once more.  When pages are
 * freed and how they contribute to the global counts is what drives the
 * ordering of these steps.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
        unsigned int noio_flags;

        /*
         * Enforce GFP_NOIO allocations because we have pcpu_alloc users
         * constrained to GFP_NOIO/NOFS contexts and they could form lock
         * dependency through pcpu_alloc_mutex
         */
        noio_flags = memalloc_noio_save();

        mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irq(&pcpu_lock);

        /*
         * pcpu_balance_free() runs twice: the first pass may trim pages
         * counted in pcpu_nr_empty_pop_pages, which may cause other
         * chunks to be grown.  That gives pcpu_reclaim_populated() the
         * chance to move fully free chunks to the active list, where the
         * second pass can free them if appropriate.
         */
        pcpu_balance_free(false);
        pcpu_reclaim_populated();
        pcpu_balance_populated();
        pcpu_balance_free(true);

        spin_unlock_irq(&pcpu_lock);
        mutex_unlock(&pcpu_alloc_mutex);

        memalloc_noio_restore(noio_flags);
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.  A NULL @ptr is ignored.  Chunk reclaim is not
 * done here; when it looks worthwhile the balance work is scheduled
 * instead.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
        void *addr;
        struct pcpu_chunk *chunk;
        unsigned long flags;
        int size, off;
        bool need_balance = false;

        if (!ptr)
                return;

        kmemleak_free_percpu(ptr);

        /* locate the owning chunk and the area's offset within it */
        addr = __pcpu_ptr_to_addr(ptr);
        chunk = pcpu_chunk_addr_search(addr);
        off = addr - chunk->base_addr;

        spin_lock_irqsave(&pcpu_lock, flags);
        /* the freed size is only known once the area has been released */
        size = pcpu_free_area(chunk, off);

        pcpu_alloc_tag_free_hook(chunk, off, size);

        pcpu_memcg_free_hook(chunk, off, size);

        /*
         * If there are more than one fully free chunks, wake up grim reaper.
         * If the chunk is isolated, it may be in the process of being
         * reclaimed.  Let reclaim manage cleaning up of that chunk.
         */
        if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
                struct pcpu_chunk *pos;

                /* any free-slot chunk other than this one means a surplus */
                list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
                        if (pos != chunk) {
                                need_balance = true;
                                break;
                        }
        } else if (pcpu_should_reclaim_chunk(chunk)) {
                pcpu_isolate_chunk(chunk);
                need_balance = true;
        }

        trace_percpu_free_percpu(chunk->base_addr, off, ptr);

        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* scheduled only after pcpu_lock has been dropped */
        if (need_balance)
                pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);

/*
 * Test whether @addr lies inside the in-kernel static percpu area of any
 * possible CPU.  On a match, and if @can_addr is non-NULL, *@can_addr
 * receives the corresponding address in the boot CPU's area, i.e. the
 * offset of @addr within the matching CPU's area added to the boot
 * CPU's base.  Always returns false when CONFIG_SMP is not set.
 */
bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                void *unit_start = per_cpu_ptr(base, cpu);
                void *va = (void *)addr;

                /* outside this CPU's static area: try the next one */
                if (va < unit_start || va >= unit_start + static_size)
                        continue;

                if (can_addr)
                        *can_addr = (unsigned long) (va - unit_start) +
                                    (unsigned long)
                                        per_cpu_ptr(base, get_boot_cpu_id());

                return true;
        }
#endif
        /* on UP, can't distinguish from other static vars, always false */
        return false;
}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * This is __is_kernel_percpu_address() without a request for the
 * canonical address, so it shares that function's limitation: without
 * CONFIG_SMP the result is always %false.
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
        return __is_kernel_percpu_address(addr, NULL);
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * percpu allocator has special setup for the first chunk, which currently
 * supports either embedding in linear address space or vmalloc mapping,
 * and, from the second one, the backing allocator (currently either vm or
 * km) provides translation.
 *
 * The addr can be translated simply without checking if it falls into the
 * first chunk. But the current code reflects better how percpu allocator
 * actually works, and the verification can discover both bugs in percpu
 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
 * code.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        bool in_first_chunk = false;
        unsigned long first_low, first_high;
        unsigned int cpu;

        /*
         * The following test on unit_low/high isn't strictly
         * necessary but will speed up lookups of addresses which
         * aren't in the first chunk.
         *
         * The address check is against full chunk sizes.  pcpu_base_addr
         * points to the beginning of the first chunk including the
         * static region.  Assumes good intent as the first chunk may
         * not be full (ie. < pcpu_unit_pages in size).
         */
        first_low = (unsigned long)pcpu_base_addr +
                    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
        first_high = (unsigned long)pcpu_base_addr +
                     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
        if ((unsigned long)addr >= first_low &&
            (unsigned long)addr < first_high) {
                for_each_possible_cpu(cpu) {
                        void *start = per_cpu_ptr(base, cpu);

                        if (addr >= start && addr < start + pcpu_unit_size) {
                                in_first_chunk = true;
                                break;
                        }
                }
        }

        if (in_first_chunk) {
                if (!is_vmalloc_addr(addr))
                        return __pa(addr);
                else
                        return page_to_phys(vmalloc_to_page(addr)) +
                               offset_in_page(addr);
        } else
                return page_to_phys(pcpu_addr_to_page(addr)) +
                       offset_in_page(addr);
}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate ai which is large enough for @nr_groups groups containing
 * @nr_units units.  The returned ai's groups[0].cpu_map points to the
 * cpu_map array which is long enough for @nr_units and filled with
 * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
 * pointer of other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
                                                      int nr_units)
{
        struct pcpu_alloc_info *ai;
        size_t hdr_size, total_size, alloc_bytes;
        void *mem;
        int i;

        /*
         * Layout of the single allocation:
         *   [ ai header + groups[nr_groups] | pad | cpu_map[nr_units] ]
         * hdr_size is padded so that the cpu_map array is properly aligned.
         */
        hdr_size = ALIGN(struct_size(ai, groups, nr_groups),
                         __alignof__(ai->groups[0].cpu_map[0]));
        total_size = hdr_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

        /* whole pages are allocated; the same size is recorded for freeing */
        alloc_bytes = PFN_ALIGN(total_size);

        mem = memblock_alloc(alloc_bytes, PAGE_SIZE);
        if (!mem)
                return NULL;

        ai = mem;
        ai->nr_groups = nr_groups;
        ai->__ai_size = alloc_bytes;

        /* group 0 owns the start of cpu_map; every slot begins as NR_CPUS */
        ai->groups[0].cpu_map = mem + hdr_size;
        for (i = 0; i < nr_units; i++)
                ai->groups[0].cpu_map[i] = NR_CPUS;

        return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().  The size handed
 * back to memblock is the page-aligned size which pcpu_alloc_alloc_info()
 * recorded in @ai->__ai_size.
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
        memblock_free(ai, ai->__ai_size);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl: one summary line
 * with the region sizes followed by the unit -> cpu map of every group.
 *
 * This is also called from the PCPU_SETUP_BUG_ON() failure path, that is,
 * possibly on an @ai which did not pass validation.  If the size fields
 * are inconsistent (zero @ai->unit_size or @ai->atom_size, or
 * @ai->alloc_size smaller than @ai->unit_size) only the raw sizes are
 * printed and the unit map is skipped, because laying it out would
 * require a division by zero.
 */
static void pcpu_dump_alloc_info(const char *lvl,
                                 const struct pcpu_alloc_info *ai)
{
        int group_width = 1, cpu_width = 1, width;
        char empty_str[] = "--------";
        int alloc = 0, alloc_end = 0;
        int group, v;
        int upa, apl;        /* units per alloc, allocs per line */

        /*
         * Never fault inside the diagnostic itself: unit_size and atom_size
         * are used as divisors below and alloc_size < unit_size would make
         * upa zero, which is then used as a modulus.
         */
        if (!ai->unit_size || !ai->atom_size ||
            ai->alloc_size < ai->unit_size) {
                printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu atom=%zu (inconsistent sizes, unit map not dumped)\n",
                       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
                       ai->unit_size, ai->alloc_size, ai->atom_size);
                return;
        }

        /* number of decimal digits needed for group and cpu numbers */
        v = ai->nr_groups;
        while (v /= 10)
                group_width++;

        v = num_possible_cpus();
        while (v /= 10)
                cpu_width++;
        /* placeholder for unused units, cut down to the cpu column width */
        empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

        upa = ai->alloc_size / ai->unit_size;
        /* printed width of one alloc: "[group] " plus upa "cpu " columns */
        width = upa * (cpu_width + 1) + group_width + 3;
        /* fit roughly 60 columns per line, at least one alloc per line */
        apl = rounddown_pow_of_two(max(60 / width, 1));

        printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
               lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
               ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

        for (group = 0; group < ai->nr_groups; group++) {
                const struct pcpu_group_info *gi = &ai->groups[group];
                int unit = 0, unit_end = 0;

                BUG_ON(gi->nr_units % upa);
                /* alloc keeps counting across groups to drive line breaks */
                for (alloc_end += gi->nr_units / upa;
                     alloc < alloc_end; alloc++) {
                        if (!(alloc % apl)) {
                                pr_cont("\n");
                                printk("%spcpu-alloc: ", lvl);
                        }
                        pr_cont("[%0*d] ", group_width, group);

                        for (unit_end += upa; unit < unit_end; unit++)
                                if (gi->cpu_map[unit] != NR_CPUS)
                                        pr_cont("%0*d ",
                                                cpu_width, gi->cpu_map[unit]);
                                else
                                        pr_cont("%s ", empty_str);
                }
        }
        pr_cont("\n");
}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  The area between @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
 *
 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
 * and equal to or larger than @ai->static_size + @ai->reserved_size +
 * @ai->dyn_size.
 *
 * @ai->atom_size is the allocation atom size and used as alignment
 * for vm areas.
 *
 * @ai->alloc_size is the allocation size and always multiple of
 * @ai->atom_size.  This is larger than @ai->atom_size if
 * @ai->unit_size is larger than @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe virtual memory layout of
 * percpu areas.  Units which should be colocated are put into the
 * same group.  Dynamic VM areas will be allocated according to these
 * groupings.  If @ai->nr_groups is zero, a single group containing
 * all units is assumed.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * The first chunk will always contain a static and a dynamic region.
 * However, the static region is not managed by any chunk.  If the first
 * chunk also contains a reserved region, it is served by two chunks -
 * one for the reserved region and one for the dynamic region.  They
 * share the same vm, but use offset regions in the area allocation map.
 * The chunk serving the dynamic region is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 */
void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                                   void *base_addr)
{
        size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        size_t static_size, dyn_size;
        unsigned long *group_offsets;
        size_t *group_sizes;
        unsigned long *unit_off;
        unsigned int cpu;
        int *unit_map;
        int group, unit, i;
        unsigned long tmp_addr;
        size_t alloc_size;

/*
 * Local validation helper: if @cond holds, log the failed condition, the
 * possible-cpu mask and the whole @ai at emergency level, then BUG().
 * Undefined again once the input has been parsed.
 */
#define PCPU_SETUP_BUG_ON(cond)        do {                                        \
        if (unlikely(cond)) {                                                \
                pr_emerg("failed to initialize, %s\n", #cond);                \
                pr_emerg("cpu_possible_mask=%*pb\n",                        \
                         cpumask_pr_args(cpu_possible_mask));                \
                pcpu_dump_alloc_info(KERN_EMERG, ai);                        \
                BUG();                                                        \
        }                                                                \
} while (0)

        /* sanity checks */
        PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
        PCPU_SETUP_BUG_ON(!ai->static_size);
        PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
        PCPU_SETUP_BUG_ON(!base_addr);
        PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
        PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
        PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
        PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
                            IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
        PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

        /* process group information and build config tables accordingly */
        /* per-group tables: one entry for each of the ai->nr_groups groups */
        alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
        group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
        group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        /* per-cpu tables: indexed by cpu number, nr_cpu_ids entries */
        alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
        unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
        unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        /*
         * UINT_MAX marks "no unit assigned yet"; the checks below use it to
         * catch a cpu mapped twice and a possible cpu which was never mapped.
         */
        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                unit_map[cpu] = UINT_MAX;

        /* NR_CPUS means "not determined yet" for the low/high unit cpus */
        pcpu_low_unit_cpu = NR_CPUS;
        pcpu_high_unit_cpu = NR_CPUS;

        /*
         * @unit is the global index of the current group's first unit.  The
         * inner loop leaves @i equal to gi->nr_units, so "unit += i" moves on
         * to the first unit of the next group.
         */
        for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
                const struct pcpu_group_info *gi = &ai->groups[group];

                group_offsets[group] = gi->base_offset;
                group_sizes[group] = gi->nr_units * ai->unit_size;

                for (i = 0; i < gi->nr_units; i++) {
                        cpu = gi->cpu_map[i];
                        /* NR_CPUS in cpu_map: unit not assigned to any cpu */
                        if (cpu == NR_CPUS)
                                continue;

                        PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
                        PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
                        PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = gi->base_offset + i * ai->unit_size;

                        /* determine low/high unit_cpu */
                        if (pcpu_low_unit_cpu == NR_CPUS ||
                            unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
                                pcpu_low_unit_cpu = cpu;
                        if (pcpu_high_unit_cpu == NR_CPUS ||
                            unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
                                pcpu_high_unit_cpu = cpu;
                }
        }
        /* total number of units over all groups, unused ones included */
        pcpu_nr_units = unit;

        /* every possible cpu must have been assigned a unit */
        for_each_possible_cpu(cpu)
                PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

        /* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
        pcpu_dump_alloc_info(KERN_DEBUG, ai);

        /* publish the tables built above through the file-level variables */
        pcpu_nr_groups = ai->nr_groups;
        pcpu_group_offsets = group_offsets;
        pcpu_group_sizes = group_sizes;
        pcpu_unit_map = unit_map;
        pcpu_unit_offsets = unit_off;

        /* determine basic parameters */
        pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
        /* same as ai->unit_size, which was checked to be page aligned above */
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
        pcpu_atom_size = ai->atom_size;
        /* chunk struct plus a populated bitmap with one bit per unit page */
        pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
                                             BITS_TO_LONGS(pcpu_unit_pages));

        pcpu_stats_save_ai(ai);

        /*
         * Allocate chunk slots.  The slots after the active slots are:
         *   sidelined_slot - isolated, depopulated chunks
         *   free_slot - fully free chunks
         *   to_depopulate_slot - isolated, chunks to depopulate
         */
        pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
        pcpu_free_slot = pcpu_sidelined_slot + 1;
        pcpu_to_depopulate_slot = pcpu_free_slot + 1;
        pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
        pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots *
                                          sizeof(pcpu_chunk_lists[0]),
                                          SMP_CACHE_BYTES);

        for (i = 0; i < pcpu_nr_slots; i++)
                INIT_LIST_HEAD(&pcpu_chunk_lists[i]);

        /*
         * The end of the static region needs to be aligned with the
         * minimum allocation size as this offsets the reserved and
         * dynamic region.  The first chunk ends page aligned by
         * expanding the dynamic region, therefore the dynamic region
         * can be shrunk to compensate while still staying above the
         * configured sizes.
         */
        static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
        dyn_size = ai->dyn_size - (static_size - ai->static_size);

        /*
         * Initialize first chunk:
         * This chunk is broken up into 3 parts:
         *                < static | [reserved] | dynamic >
         * - static - there is no backing chunk because these allocations can
         *   never be freed.
         * - reserved (pcpu_reserved_chunk) - exists primarily to serve
         *   allocations from module load.
         * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
         *   chunk.
         */
        /* the reserved region, if any, starts right after the static one */
        tmp_addr = (unsigned long)base_addr + static_size;
        if (ai->reserved_size)
                pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
                                                ai->reserved_size);
        /* the dynamic region follows the (possibly empty) reserved region */
        tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
        pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);

        pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
        /*
         * NOTE(review): -1 is presumably "no previous slot", so that the
         * chunk is always put on a list - confirm in pcpu_chunk_relocate().
         */
        pcpu_chunk_relocate(pcpu_first_chunk, -1);

        /* include all regions of the first chunk */
        pcpu_nr_populated += PFN_DOWN(size_sum);

        pcpu_stats_chunk_alloc();
        trace_percpu_create_chunk(base_addr);

        /* we're done */
        pcpu_base_addr = base_addr;
}

#ifdef CONFIG_SMP

/* Printable names of the first chunk allocators, indexed by enum pcpu_fc. */
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
        [PCPU_FC_AUTO]        = "auto",
        [PCPU_FC_EMBED]        = "embed",
        [PCPU_FC_PAGE]        = "page",
};

/*
 * First chunk allocator to use.  Starts out as PCPU_FC_AUTO and can be
 * changed with the "percpu_alloc" early parameter, see percpu_alloc_setup().
 */
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

/*
 * Handler for the "percpu_alloc" early parameter.
 *
 * Sets pcpu_chosen_fc to PCPU_FC_EMBED for "embed" and to PCPU_FC_PAGE for
 * "page", each only if the matching allocator is configured in.  Any other
 * value is reported with a warning and pcpu_chosen_fc is left unchanged.
 *
 * Returns -EINVAL if no value was given, 0 otherwise (also for unknown
 * values).
 */
static int __init percpu_alloc_setup(char *str)
{
        if (!str)
                return -EINVAL;

#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
        if (!strcmp(str, "embed")) {
                pcpu_chosen_fc = PCPU_FC_EMBED;
                return 0;
        }
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
        if (!strcmp(str, "page")) {
                pcpu_chosen_fc = PCPU_FC_PAGE;
                return 0;
        }
#endif
        /* not an allocator this kernel was built with */
        pr_warn("unknown allocator %s specified\n", str);
        return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is going
 * to be used.
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
        !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() iff needed by the arch config */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
                                size_t reserved_size, size_t dyn_size,
                                size_t atom_size,
                                pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
        static int group_map[NR_CPUS] __initdata;
        static int group_cnt[NR_CPUS] __initdata;
        static struct cpumask mask __initdata;
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        struct pcpu_alloc_info *ai;
        unsigned int *next_map;
        size_t size_sum, min_unit_size, alloc_size;
        int nr_groups, nr_units = 0;
        int upa, max_upa, best_upa = 0;        /* units_per_alloc */
        int prev_allocs = INT_MAX;
        int group, unit;
        unsigned int cpu, peer;

        /* the scratch state is static and this may run more than once */
        memset(group_map, 0, sizeof(group_map));
        memset(group_cnt, 0, sizeof(group_cnt));
        cpumask_clear(&mask);

        /*
         * Page align the total and hand all of the rounding slack to the
         * dynamic area, which is at least PERCPU_DYNAMIC_EARLY_SIZE so that
         * early allocations fit.
         */
        size_sum = PFN_ALIGN(static_size + reserved_size +
                             max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
        dyn_size = size_sum - static_size - reserved_size;

        /*
         * alloc_size is the smallest multiple of atom_size which can hold
         * one unit of at least min_unit_size bytes.
         */
        min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
        alloc_size = roundup(min_unit_size, atom_size);

        /*
         * max_upa: the largest unit count which splits alloc_size into
         * equal, page aligned units.
         */
        max_upa = alloc_size / min_unit_size;
        while (alloc_size % max_upa || offset_in_page(alloc_size / max_upa))
                max_upa--;

        /*
         * Group cpus according to their proximity: repeatedly take the
         * first cpu still unassigned and pull in every remaining cpu which
         * is LOCAL_DISTANCE away from it in both directions (or every
         * remaining cpu if there is no distance callback).
         */
        cpumask_copy(&mask, cpu_possible_mask);

        group = 0;
        while (!cpumask_empty(&mask)) {
                cpu = cpumask_first(&mask);
                group_map[cpu] = group;
                group_cnt[group]++;
                cpumask_clear_cpu(cpu, &mask);

                for_each_cpu(peer, &mask) {
                        if (cpu_distance_fn &&
                            (cpu_distance_fn(cpu, peer) != LOCAL_DISTANCE ||
                             cpu_distance_fn(peer, cpu) != LOCAL_DISTANCE))
                                continue;

                        group_map[peer] = group;
                        group_cnt[group]++;
                        cpumask_clear_cpu(peer, &mask);
                }
                group++;
        }
        nr_groups = group;

        /*
         * Wasted space is caused by a ratio imbalance of upa to group_cnt.
         * Walk upa downwards, which expands the unit_size, until >= 75% of
         * the allocated units are used.  This is related to atom_size,
         * which could be much larger than the unit_size.
         */
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;

                /* must split alloc_size into equal, page aligned units */
                if (alloc_size % upa || offset_in_page(alloc_size / upa))
                        continue;

                for (group = 0; group < nr_groups; group++) {
                        int cnt = group_cnt[group];
                        int grp_allocs = DIV_ROUND_UP(cnt, upa);

                        allocs += grp_allocs;
                        wasted += grp_allocs * upa - cnt;
                }

                /*
                 * Reject wastage of more than 1/3.  Being a strict
                 * greater-than test, upa==1 (which wastes nothing)
                 * always gets through.
                 */
                if (wasted > num_possible_cpus() / 3)
                        continue;

                /* stop as soon as a candidate would need more allocs */
                if (allocs > prev_allocs)
                        break;

                prev_allocs = allocs;
                best_upa = upa;
        }
        BUG_ON(!best_upa);
        upa = best_upa;

        /* each group is padded up to a whole number of allocations */
        for (group = 0; group < nr_groups; group++)
                nr_units += roundup(group_cnt[group], upa);

        ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
        if (!ai)
                return ERR_PTR(-ENOMEM);

        ai->static_size = static_size;
        ai->reserved_size = reserved_size;
        ai->dyn_size = dyn_size;
        ai->unit_size = alloc_size / upa;
        ai->atom_size = atom_size;
        ai->alloc_size = alloc_size;

        /* carve the shared cpu_map array up between the groups and fill it */
        next_map = ai->groups[0].cpu_map;
        unit = 0;
        for (group = 0; group < nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];

                gi->cpu_map = next_map;
                next_map += roundup(group_cnt[group], upa);

                /*
                 * Initialize base_offset as if all groups are located
                 * back-to-back.  The caller should update this to
                 * reflect actual allocation.
                 */
                gi->base_offset = unit * ai->unit_size;

                for_each_possible_cpu(cpu) {
                        if (group_map[cpu] != group)
                                continue;
                        gi->cpu_map[gi->nr_units++] = cpu;
                }
                /* slots past the last cpu keep their initial NR_CPUS value */
                gi->nr_units = roundup(gi->nr_units, upa);
                unit += gi->nr_units;
        }
        BUG_ON(unit != nr_units);

        return ai;
}

/*
 * Allocate @size bytes aligned to @align from memblock for @cpu's part of
 * the first chunk, with __pa(MAX_DMA_ADDRESS) as the minimum address hint.
 *
 * With CONFIG_NUMA the memory is requested on the node reported by
 * @cpu_to_nd_fn.  If there is no callback, or the node is NUMA_NO_NODE, not
 * online or has no NODE_DATA, a node-agnostic allocation is made instead.
 *
 * Returns the allocator's result unchecked; callers test it for NULL.
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
                                   pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NUMA
        int node = cpu_to_nd_fn ? cpu_to_nd_fn(cpu) : NUMA_NO_NODE;
        void *ptr;

        if (node != NUMA_NO_NODE && node_online(node) && NODE_DATA(node)) {
                /* usable node: ask for memory on it */
                ptr = memblock_alloc_try_nid(size, align, goal,
                                             MEMBLOCK_ALLOC_ACCESSIBLE,
                                             node);

                pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
                         cpu, size, node, (u64)__pa(ptr));
                return ptr;
        }

        /* no usable node for this cpu: take memory from anywhere */
        ptr = memblock_alloc_from(size, align, goal);
        pr_info("cpu %d has no node %d or node-local memory\n",
                cpu, node);
        pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
                 cpu, size, (u64)__pa(ptr));
        return ptr;
#else
        return memblock_alloc_from(size, align, goal);
#endif
}

/*
 * Counterpart of pcpu_fc_alloc(): hand the @size bytes starting at @ptr back
 * to memblock.  Used e.g. by pcpu_embed_first_chunk() both for whole group
 * areas on its error path and for the unused parts of units.
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
        memblock_free(ptr, size);
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling pcpu_fc_alloc and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using pcpu_fc_free.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                                  size_t atom_size,
                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
                                  pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        /* lowest group address seen so far; starts at the maximum for min() */
        void *base = (void *)ULONG_MAX;
        /* areas[group]: start address of the memory allocated for the group */
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
        size_t size_sum, areas_size;
        unsigned long max_distance;
        int group, i, highest_group, rc = 0;

        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
                                   cpu_distance_fn);
        if (IS_ERR(ai))
                return PTR_ERR(ai);

        /* bytes of each unit which are actually used */
        size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

        /*
         * NOTE(review): the out_free_areas path tests entries for NULL, so
         * this relies on memblock_alloc() returning zeroed memory - confirm.
         */
        areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
        if (!areas) {
                rc = -ENOMEM;
                goto out_free;
        }

        /* allocate, copy and determine base address & max_distance */
        highest_group = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                unsigned int cpu = NR_CPUS;
                void *ptr;

                /*
                 * Pick the first unit of the group which is mapped to a cpu;
                 * that cpu is what pcpu_fc_alloc() uses to choose the node.
                 */
                for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
                        cpu = gi->cpu_map[i];
                BUG_ON(cpu == NR_CPUS);

                /* allocate space for the whole group */
                ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
                if (!ptr) {
                        rc = -ENOMEM;
                        goto out_free_areas;
                }
                /* kmemleak tracks the percpu allocations separately */
                kmemleak_ignore_phys(__pa(ptr));
                areas[group] = ptr;

                /* track the lowest and the highest group start address */
                base = min(ptr, base);
                if (ptr > areas[highest_group])
                        highest_group = group;
        }
        /* distance from the lowest group start to the highest group's end */
        max_distance = areas[highest_group] - base;
        max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

        /* warn if maximum distance is further than 75% of vmalloc space */
        if (max_distance > VMALLOC_TOTAL * 3 / 4) {
                pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
                                max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
                /* and fail if we have fallback */
                rc = -EINVAL;
                goto out_free_areas;
#endif
        }

        /*
         * Copy data and free unused parts.  This should happen after all
         * allocations are complete; otherwise, we may end up with
         * overlapping groups.
         */
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                void *ptr = areas[group];

                /* ptr walks the group's area one unit at a time */
                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        if (gi->cpu_map[i] == NR_CPUS) {
                                /* unused unit, free whole */
                                pcpu_fc_free(ptr, ai->unit_size);
                                continue;
                        }
                        /* copy and return the unused part */
                        memcpy(ptr, __per_cpu_start, ai->static_size);
                        pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
                }
        }

        /* base address is now known, determine group base offsets */
        for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
        }

        pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
                PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
                ai->dyn_size, ai->unit_size);

        pcpu_setup_first_chunk(ai, base);
        /*
         * Success, rc is still 0.  Only the temporary @ai and areas[] are
         * released below; the group areas themselves stay allocated.
         */
        goto out_free;

out_free_areas:
        /* failure: give back every group area allocated so far */
        for (group = 0; group < ai->nr_groups; group++)
                if (areas[group])
                        pcpu_fc_free(areas[group],
                                ai->groups[group].nr_units * ai->unit_size);
out_free:
        /* common exit for success and failure */
        pcpu_free_alloc_info(ai);
        if (areas)
                memblock_free(areas, areas_size);
        return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
#include <linux/pgalloc.h>

/*
 * Fallback page-table sizes: any level whose *_TABLE_SIZE is not already
 * defined is taken to be exactly one page.  pcpu_populate_pte() uses each
 * value as both the size and the alignment of the table it allocates.
 */
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PUD_TABLE_SIZE
#define PUD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PMD_TABLE_SIZE
#define PMD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PTE_TABLE_SIZE
#define PTE_TABLE_SIZE PAGE_SIZE
#endif
/*
 * pcpu_populate_pte - make sure page tables exist down to the PTE level
 * @addr: kernel virtual address that is about to be mapped
 *
 * Walks the kernel page table for @addr from the PGD downwards.  Whenever
 * an entry is empty (or, at the PMD level, not present) a new table is
 * taken from memblock and hooked into the parent entry.  There is no error
 * return; every allocation goes through memblock_alloc_or_panic().
 *
 * Declared __weak so that another definition can replace it at link time.
 */
void __init __weak pcpu_populate_pte(unsigned long addr)
{
        pgd_t *pgd = pgd_offset_k(addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        /* PGD entry empty: allocate and install a P4D table */
        if (pgd_none(*pgd)) {
                p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
                pgd_populate_kernel(addr, pgd, p4d);
        }

        /* P4D entry empty: allocate and install a PUD table */
        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d)) {
                pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
                p4d_populate_kernel(addr, p4d, pud);
        }

        /* PUD entry empty: allocate and install a PMD table */
        pud = pud_offset(p4d, addr);
        if (pud_none(*pud)) {
                pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
                pud_populate(&init_mm, pud, pmd);
        }

        /* PMD entry not present: allocate and install the PTE table */
        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd)) {
                pte_t *new;

                new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
                pmd_populate_kernel(&init_mm, pmd, new);
        }

        return;
}

/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        static struct vm_struct vm;
        struct pcpu_alloc_info *ai;
        char psize_str[16];
        int unit_pages;
        size_t pages_size;
        struct page **pages;
        int unit, i, j, rc = 0;
        int upa;
        int nr_g0_units;

        /* page size in KiB as text; only used in the log messages below */
        snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

        ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
        if (IS_ERR(ai))
                return PTR_ERR(ai);
        /* everything below only ever looks at groups[0] */
        BUG_ON(ai->nr_groups != 1);
        /* units per allocation; group 0 must hold the CPUs rounded up to it */
        upa = ai->alloc_size/ai->unit_size;
        nr_g0_units = roundup(num_possible_cpus(), upa);
        if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
                pcpu_free_alloc_info(ai);
                return -EINVAL;
        }

        unit_pages = ai->unit_size >> PAGE_SHIFT;

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
                               sizeof(pages[0]));
        pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES);

        /* allocate pages */
        /* j counts the pages obtained so far; the error path relies on it */
        j = 0;
        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned int cpu = ai->groups[0].cpu_map[unit];
                for (i = 0; i < unit_pages; i++) {
                        void *ptr;

                        ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
                        if (!ptr) {
                                pr_warn("failed to allocate %s page for cpu%u\n",
                                                psize_str, cpu);
                                goto enomem;
                        }
                        /* kmemleak tracks the percpu allocations separately */
                        kmemleak_ignore_phys(__pa(ptr));
                        pages[j++] = virt_to_page(ptr);
                }
        }

        /* allocate vm area, map the pages and copy static data */
        vm.flags = VM_ALLOC;
        vm.size = num_possible_cpus() * ai->unit_size;
        vm_area_register_early(&vm, PAGE_SIZE);

        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned long unit_addr =
                        (unsigned long)vm.addr + unit * ai->unit_size;

                /* make sure page tables exist for every page of this unit */
                for (i = 0; i < unit_pages; i++)
                        pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));

                /* pte already populated, the following shouldn't fail */
                rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
                                      unit_pages);
                if (rc < 0)
                        panic("failed to map percpu area, err=%d\n", rc);

                flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);

                /* copy static data */
                memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
        }

        /* we're ready, commit */
        pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
                unit_pages, psize_str, ai->static_size,
                ai->reserved_size, ai->dyn_size);

        pcpu_setup_first_chunk(ai, vm.addr);
        /* success: skip the page rollback but share the common cleanup */
        goto out_free_ar;

enomem:
        /* give back, newest first, every page obtained before the failure */
        while (--j >= 0)
                pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
        rc = -ENOMEM;
out_free_ar:
        /* the page pointer array and the alloc info are freed on both paths */
        memblock_free(pages, pages_size);
        pcpu_free_alloc_info(ai);
        return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef        CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * Generic SMP variant: build the first chunk with the embedding helper and
 * derive each possible CPU's entry in __per_cpu_offset from the resulting
 * base address.  Panics if the first chunk cannot be set up.
 */
void __init setup_per_cpu_areas(void)
{
        unsigned long base_delta;
        unsigned int cpu;

        /*
         * Always reserve area for module percpu variables.  That's
         * what the legacy allocator did.
         */
        if (pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                   PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
                                   NULL, NULL) < 0)
                panic("Failed to initialize percpu areas.");

        /* distance from the linked percpu section to the chunk base */
        base_delta = (unsigned long)pcpu_base_addr -
                     (unsigned long)__per_cpu_start;

        for_each_possible_cpu(cpu) {
                __per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
        }
}
#endif        /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else        /* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
/*
 * UP variant: describe a single group with a single unit for CPU 0, back it
 * with one memblock allocation and hand both to pcpu_setup_first_chunk().
 * Panics if either allocation fails.
 */
void __init setup_per_cpu_areas(void)
{
        const size_t unit_size =
                roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
                                         PERCPU_DYNAMIC_RESERVE));
        struct pcpu_alloc_info *info;
        struct pcpu_group_info *grp;
        void *first_chunk;

        info = pcpu_alloc_alloc_info(1, 1);
        first_chunk = memblock_alloc_from(unit_size, PAGE_SIZE,
                                          __pa(MAX_DMA_ADDRESS));
        if (!(info && first_chunk))
                panic("Failed to allocate memory for percpu areas.");

        /* kmemleak tracks the percpu allocations separately */
        kmemleak_ignore_phys(__pa(first_chunk));

        /* one unit that is entirely dynamic area */
        info->dyn_size = unit_size;
        info->unit_size = unit_size;
        info->atom_size = unit_size;
        info->alloc_size = unit_size;

        grp = &info->groups[0];
        grp->nr_units = 1;
        grp->cpu_map[0] = 0;

        pcpu_setup_first_chunk(info, first_chunk);
        pcpu_free_alloc_info(info);
}

#endif        /* CONFIG_SMP */

/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * This reflects the number of pages populated to back chunks.  Metadata is
 * excluded in the number exposed in meminfo as the number of backing pages
 * scales with the number of cpus and can quickly outweigh the memory used for
 * metadata.  It also keeps this calculation nice and simple.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
        /*
         * pcpu_nr_populated is sampled once without taking any lock here
         * (hence the READ_ONCE()/data_race() annotations) and scaled by the
         * number of units.
         */
        return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units;
}

/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 *
 * Registered as a subsys initcall; it only flips pcpu_async_enabled to true
 * and always returns 0.
 */
static int __init percpu_enable_async(void)
{
        pcpu_async_enabled = true;
        return 0;
}
subsys_initcall(percpu_enable_async);














































   20 
   15 




   24 





    3 



   22 
   25 




    9 


    1 






    9 




    9 

    9 





    8 


    9 







    9 










    9 







    9 




    1 








    1 
    1 
    1 

    8 






    1 
    1 




    1 



    1 




    1 
    1 

    1 


    1 


    1 
    9 



































































































































    2 









   20 

   20 

   19 
    5 

   20 


    5 
   20 































































    1 












































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Synchronous Cryptographic Hash operations.
 *
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/scatterwalk.h>
#include <linux/cryptouser.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <net/netlink.h>

#include "hash.h"

static inline bool crypto_shash_block_only(struct crypto_shash *tfm)
{
        return crypto_shash_alg(tfm)->base.cra_flags &
               CRYPTO_AHASH_ALG_BLOCK_ONLY;
}

static inline bool crypto_shash_final_nonzero(struct crypto_shash *tfm)
{
        return crypto_shash_alg(tfm)->base.cra_flags &
               CRYPTO_AHASH_ALG_FINAL_NONZERO;
}

static inline bool crypto_shash_finup_max(struct crypto_shash *tfm)
{
        return crypto_shash_alg(tfm)->base.cra_flags &
               CRYPTO_AHASH_ALG_FINUP_MAX;
}

/*
 * setkey stub that ignores its arguments and always fails with -ENOSYS.
 * shash_prepare_alg() installs it for algorithms that provide no ->setkey.
 */
int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
                    unsigned int keylen)
{
        return -ENOSYS;
}
EXPORT_SYMBOL_GPL(shash_no_setkey);

/* Flag @tfm with CRYPTO_TFM_NEED_KEY when @alg reports that it needs a key. */
static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg)
{
        if (!crypto_shash_alg_needs_key(alg))
                return;

        crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
}

/*
 * Pass the key to the algorithm's ->setkey().  On success the NEED_KEY flag
 * is cleared and 0 is returned; on failure the flag is re-evaluated through
 * shash_set_needkey() and the algorithm's error is returned.
 */
int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
                        unsigned int keylen)
{
        struct shash_alg *alg = crypto_shash_alg(tfm);
        int ret = alg->setkey(tfm, key, keylen);

        if (likely(!ret)) {
                crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);
                return 0;
        }

        shash_set_needkey(tfm, alg);
        return ret;
}
EXPORT_SYMBOL_GPL(crypto_shash_setkey);

/*
 * Start a new hash without checking the NEED_KEY flag.  For block-only
 * algorithms the last byte of the descriptor context (the buffered length)
 * is reset to zero before the algorithm's ->init() runs.
 */
static int __crypto_shash_init(struct shash_desc *desc)
{
        struct crypto_shash *tfm = desc->tfm;

        if (crypto_shash_block_only(tfm)) {
                u8 *ctx = shash_desc_ctx(desc);

                ctx[crypto_shash_descsize(tfm) - 1] = 0;
        }

        return crypto_shash_alg(tfm)->init(desc);
}

int crypto_shash_init(struct shash_desc *desc)
{
        if (crypto_shash_get_flags(desc->tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;
        return __crypto_shash_init(desc);
}
EXPORT_SYMBOL_GPL(crypto_shash_init);

/*
 * Default ->finup(): call ->update() on the data and, if that returned
 * zero, ->final().  A non-zero ->update() result is returned unchanged.
 */
static int shash_default_finup(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);
        int ret = alg->update(desc, data, len);

        if (ret)
                return ret;

        return alg->final(desc, out);
}

/*
 * Run a finalising operation and wipe the descriptor context afterwards.
 *
 * @op:   the algorithm callback to invoke (->finup or ->digest)
 * @desc: hash descriptor; its context is zeroed whether or not @op failed
 * @data: input passed through to @op
 * @len:  length of @data in bytes
 * @out:  digest buffer passed through to @op
 *
 * Returns whatever @op returned.
 */
static int crypto_shash_op_and_zero(
        int (*op)(struct shash_desc *desc, const u8 *data,
                  unsigned int len, u8 *out),
        struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out)
{
        int err;

        err = op(desc, data, len, out);
        /*
         * Callers such as crypto_shash_tfm_digest() keep the descriptor on
         * their stack.  Once inlined, a plain memset() of an object that is
         * never read again is a dead store the compiler may drop, so use the
         * variant that is guaranteed to be emitted.
         */
        memzero_explicit(shash_desc_ctx(desc),
                         crypto_shash_descsize(desc->tfm));
        return err;
}

/*
 * crypto_shash_finup - absorb data and, when @out is set, emit the digest
 * @desc: hash descriptor
 * @data: input bytes
 * @len:  number of input bytes
 * @out:  digest buffer, or NULL to only absorb @data
 *
 * Algorithms without the block-only flag are simply forwarded to: ->update()
 * when @out is NULL, ->finup() (followed by wiping the descriptor context)
 * otherwise.
 *
 * For block-only algorithms, input that cannot be handed to ->update() yet
 * is kept in a buffer at the end of the descriptor context, with its length
 * stored in the context's very last byte.
 *
 * Returns the algorithm's result; a negative value from ->update() aborts
 * processing and is returned as is.
 */
int crypto_shash_finup(struct shash_desc *restrict desc, const u8 *data,
                       unsigned int len, u8 *restrict out)
{
        struct crypto_shash *tfm = desc->tfm;
        u8 *blenp = shash_desc_ctx(desc);
        bool finup_max, nonzero;
        unsigned int bs;
        int err;
        u8 *buf;

        if (!crypto_shash_block_only(tfm)) {
                if (out)
                        goto finup;
                return crypto_shash_alg(tfm)->update(desc, data, len);
        }

        /* only relevant when finalising: may ->finup() take all of @data? */
        finup_max = out && crypto_shash_finup_max(tfm);

        /* Retain extra block for final nonzero algorithms. */
        nonzero = crypto_shash_final_nonzero(tfm);

        /*
         * The partial block buffer follows the algorithm desc context.
         * The byte following that contains the length.
         */
        blenp += crypto_shash_descsize(tfm) - 1;
        bs = crypto_shash_blocksize(tfm);
        buf = blenp - bs;

        /* nothing buffered and the algorithm accepts it all: skip the loop */
        if (likely(!*blenp && finup_max))
                goto finup;

        /*
         * Keep feeding ->update() while buffered + new data amounts to at
         * least one block (strictly more than one block when nonzero is set).
         */
        while ((*blenp + len) >= bs + nonzero) {
                unsigned int nbytes = len - nonzero;
                const u8 *src = data;

                if (*blenp) {
                        /* top the buffered bytes up to one full block first */
                        memcpy(buf + *blenp, data, bs - *blenp);
                        nbytes = bs;
                        src = buf;
                }

                err = crypto_shash_alg(tfm)->update(desc, src, nbytes);
                if (err < 0)
                        return err;

                /*
                 * A non-negative result is treated as the number of bytes
                 * left unprocessed.  Of the bytes that were processed, *blenp
                 * came from the buffer rather than from @data.
                 */
                data += nbytes - err - *blenp;
                len -= nbytes - err - *blenp;
                *blenp = 0;
        }

        if (*blenp || !out) {
                /* stash the remainder behind whatever is already buffered */
                memcpy(buf + *blenp, data, len);
                *blenp += len;
                if (!out)
                        return 0;
                /* finalising: the buffer now holds all remaining input */
                data = buf;
                len = *blenp;
        }

finup:
        return crypto_shash_op_and_zero(crypto_shash_alg(tfm)->finup, desc,
                                        data, len, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_finup);

/*
 * Default ->digest(): initialise the descriptor (without the NEED_KEY
 * check) and, if that returned zero, run crypto_shash_finup() on the data.
 */
static int shash_default_digest(struct shash_desc *desc, const u8 *data,
                                unsigned int len, u8 *out)
{
        int ret = __crypto_shash_init(desc);

        if (ret)
                return ret;

        return crypto_shash_finup(desc, data, len, out);
}

/*
 * One-shot digest of @data into @out.  Fails with -ENOKEY while the
 * transform still needs a key; otherwise the algorithm's ->digest() is run
 * through crypto_shash_op_and_zero().
 */
int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
                        unsigned int len, u8 *out)
{
        struct crypto_shash *tfm = desc->tfm;
        u32 flags = crypto_shash_get_flags(tfm);

        if (flags & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return crypto_shash_op_and_zero(crypto_shash_alg(tfm)->digest,
                                        desc, data, len, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_digest);

/*
 * One-shot digest for callers that only have a transform: a descriptor is
 * set up on the stack, bound to @tfm and handed to crypto_shash_digest(),
 * whose result is returned.
 */
int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data,
                            unsigned int len, u8 *out)
{
        SHASH_DESC_ON_STACK(desc, tfm);

        desc->tfm = tfm;
        return crypto_shash_digest(desc, data, len, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest);

/*
 * Export helper shared by crypto_shash_export() and
 * crypto_shash_export_core().  With a callback the work is delegated to it.
 * Without one the descriptor context is copied out directly; for block-only
 * algorithms the trailing block buffer and length byte are left out of that
 * copy.
 */
static int __crypto_shash_export(struct shash_desc *desc, void *out,
                                 int (*export)(struct shash_desc *desc,
                                               void *out))
{
        struct crypto_shash *tfm = desc->tfm;
        unsigned int copy_len;

        if (export)
                return export(desc, out);

        copy_len = crypto_shash_statesize(tfm);
        if (crypto_shash_block_only(tfm))
                copy_len -= crypto_shash_blocksize(tfm) + 1;

        memcpy(out, shash_desc_ctx(desc), copy_len);
        return 0;
}

int crypto_shash_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_shash_export(desc, out,
                                     crypto_shash_alg(desc->tfm)->export_core);
}
EXPORT_SYMBOL_GPL(crypto_shash_export_core);

int crypto_shash_export(struct shash_desc *desc, void *out)
{
        struct crypto_shash *tfm = desc->tfm;

        if (crypto_shash_block_only(tfm)) {
                unsigned int plen = crypto_shash_blocksize(tfm) + 1;
                unsigned int descsize = crypto_shash_descsize(tfm);
                unsigned int ss = crypto_shash_statesize(tfm);
                u8 *buf = shash_desc_ctx(desc);

                memcpy(out + ss - plen, buf + descsize - plen, plen);
        }
        return __crypto_shash_export(desc, out, crypto_shash_alg(tfm)->export);
}
EXPORT_SYMBOL_GPL(crypto_shash_export);

/*
 * Import helper shared by crypto_shash_import() and
 * crypto_shash_import_core().  Fails with -ENOKEY while the transform still
 * needs a key.  For block-only algorithms the buffered-length byte at the
 * end of the context is cleared first.  The state is then restored through
 * the callback if there is one, or copied straight into the descriptor
 * context otherwise.
 */
static int __crypto_shash_import(struct shash_desc *desc, const void *in,
                                 int (*import)(struct shash_desc *desc,
                                               const void *in))
{
        struct crypto_shash *tfm = desc->tfm;
        u8 *ctx = shash_desc_ctx(desc);
        unsigned int copy_len;

        if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        copy_len = crypto_shash_statesize(tfm);
        if (crypto_shash_block_only(tfm)) {
                copy_len -= crypto_shash_blocksize(tfm) + 1;
                ctx[crypto_shash_descsize(tfm) - 1] = 0;
        }

        if (import)
                return import(desc, in);

        memcpy(ctx, in, copy_len);
        return 0;
}

int crypto_shash_import_core(struct shash_desc *desc, const void *in)
{
        return __crypto_shash_import(desc, in,
                                     crypto_shash_alg(desc->tfm)->import_core);
}
EXPORT_SYMBOL_GPL(crypto_shash_import_core);

/*
 * Import a full state from @in into @desc.  After the algorithm part has
 * been restored, block-only algorithms also get the tail of @in (block
 * buffer plus length byte) copied to the tail of the descriptor context.
 * That copy happens even if the first step reported an error.  A length
 * byte that is not smaller than blocksize + 1 yields -EOVERFLOW.
 */
int crypto_shash_import(struct shash_desc *desc, const void *in)
{
        struct crypto_shash *tfm = desc->tfm;
        unsigned int tail_len, ctx_len, state_len;
        u8 *ctx;
        int ret;

        ret = __crypto_shash_import(desc, in, crypto_shash_alg(tfm)->import);
        if (!crypto_shash_block_only(tfm))
                return ret;

        tail_len = crypto_shash_blocksize(tfm) + 1;
        ctx_len = crypto_shash_descsize(tfm);
        state_len = crypto_shash_statesize(tfm);
        ctx = shash_desc_ctx(desc);

        memcpy(ctx + ctx_len - tail_len, in + state_len - tail_len, tail_len);
        if (ctx[ctx_len - 1] >= tail_len)
                ret = -EOVERFLOW;

        return ret;
}
EXPORT_SYMBOL_GPL(crypto_shash_import);

static void crypto_shash_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_shash *hash = __crypto_shash_cast(tfm);
        struct shash_alg *alg = crypto_shash_alg(hash);

        alg->exit_tfm(hash);
}

/*
 * Generic tfm init hook: sets the NEED_KEY flag if the algorithm needs a
 * key, installs the exit hook when the algorithm has ->exit_tfm, and runs
 * the algorithm's ->init_tfm() if present (returning 0 otherwise).
 */
static int crypto_shash_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_shash *shash = __crypto_shash_cast(tfm);
        struct shash_alg *alg = crypto_shash_alg(shash);

        shash_set_needkey(shash, alg);

        if (alg->exit_tfm)
                tfm->exit = crypto_shash_exit_tfm;

        return alg->init_tfm ? alg->init_tfm(shash) : 0;
}

/* Release a template instance through the shash instance's own ->free(). */
static void crypto_shash_free_instance(struct crypto_instance *inst)
{
        struct shash_instance *sinst = shash_instance(inst);

        sinst->free(sinst);
}

/*
 * Add a CRYPTOCFGA_REPORT_HASH attribute describing @alg to @skb: the type
 * string "shash", the block size and the digest size.  Returns the result
 * of nla_put().
 */
static int __maybe_unused crypto_shash_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct crypto_report_hash rhash;
        struct shash_alg *salg = __crypto_shash_alg(alg);

        /* clear every byte first: the whole struct is copied out below */
        memset(&rhash, 0, sizeof(rhash));

        strscpy(rhash.type, "shash", sizeof(rhash.type));

        rhash.blocksize = alg->cra_blocksize;
        rhash.digestsize = salg->digestsize;

        return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
}

static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
{
        struct shash_alg *salg = __crypto_shash_alg(alg);

        seq_printf(m, "type         : shash\n");
        seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
        seq_printf(m, "digestsize   : %u\n", salg->digestsize);
}

/*
 * Frontend type descriptor for synchronous hashes.  The show hook is only
 * wired up with CONFIG_PROC_FS and the report hook only with
 * CONFIG_CRYPTO_USER.
 */
const struct crypto_type crypto_shash_type = {
        .extsize = crypto_alg_extsize,
        .init_tfm = crypto_shash_init_tfm,
        .free = crypto_shash_free_instance,
#ifdef CONFIG_PROC_FS
        .show = crypto_shash_show,
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_shash_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_MASK,
        .type = CRYPTO_ALG_TYPE_SHASH,
        /* sizes of the shash-specific parts that precede the embedded base */
        .tfmsize = offsetof(struct crypto_shash, base),
        .algsize = offsetof(struct shash_alg, base),
};

/*
 * Mark @spawn as an shash spawn by setting its frontend and pass it on to
 * crypto_grab_spawn(), whose result is returned.
 */
int crypto_grab_shash(struct crypto_shash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask)
{
        spawn->base.frontend = &crypto_shash_type;
        return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_shash);

/* Allocate a transform for @alg_name using the shash frontend. */
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
                                        u32 mask)
{
        const struct crypto_type *frontend = &crypto_shash_type;

        return crypto_alloc_tfm(alg_name, frontend, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_alloc_shash);

/* Ask crypto_type_has_alg() about @alg_name using the shash frontend. */
int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
{
        const struct crypto_type *frontend = &crypto_shash_type;

        return crypto_type_has_alg(alg_name, frontend, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_shash);

/*
 * Clone an shash transform.
 *
 * Algorithms without a setkey only take another reference on the existing
 * transform and return @hash itself.  Otherwise a new transform is created
 * with crypto_clone_tfm() and, if the algorithm has ->clone_tfm, completed
 * by it.  Algorithms that have an init hook but no ->clone_tfm cannot be
 * cloned and yield -ENOSYS.
 *
 * Returns the transform to use or an ERR_PTR().
 */
struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash)
{
        struct crypto_tfm *tfm = crypto_shash_tfm(hash);
        struct shash_alg *alg = crypto_shash_alg(hash);
        struct crypto_shash *clone;

        if (!crypto_shash_alg_has_setkey(alg)) {
                tfm = crypto_tfm_get(tfm);
                if (IS_ERR(tfm))
                        return ERR_CAST(tfm);
                return hash;
        }

        if (!alg->clone_tfm) {
                if (alg->init_tfm || alg->base.cra_init)
                        return ERR_PTR(-ENOSYS);
        }

        clone = crypto_clone_tfm(&crypto_shash_type, tfm);
        if (IS_ERR(clone))
                return clone;

        if (alg->clone_tfm) {
                int ret = alg->clone_tfm(clone, hash);

                if (ret) {
                        /* the exit hook has not been installed yet */
                        crypto_free_shash(clone);
                        return ERR_PTR(ret);
                }
        }

        if (alg->exit_tfm)
                crypto_shash_tfm(clone)->exit = crypto_shash_exit_tfm;

        return clone;
}
EXPORT_SYMBOL_GPL(crypto_clone_shash);

/*
 * Common checks for a hash algorithm: the digest size must not exceed
 * HASH_MAX_DIGESTSIZE and no alignmask may be set.  On success the type
 * bits are cleared from cra_flags and 0 is returned, otherwise -EINVAL.
 */
int hash_prepare_alg(struct hash_alg_common *alg)
{
        struct crypto_alg *base = &alg->base;

        /*
         * Oversized digests are rejected.  alignmask is not useful for
         * hashes, so it is not supported either.
         */
        if (alg->digestsize > HASH_MAX_DIGESTSIZE || base->cra_alignmask)
                return -EINVAL;

        base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
        return 0;
}

/*
 * ->export_core stub that always fails with -ENOSYS; installed by
 * shash_prepare_alg() when an algorithm lacks export_core/import_core.
 */
static int shash_default_export_core(struct shash_desc *desc, void *out)
{
        return -ENOSYS;
}

/*
 * ->import_core stub that always fails with -ENOSYS; installed by
 * shash_prepare_alg() when an algorithm lacks export_core/import_core.
 */
static int shash_default_import_core(struct shash_desc *desc, const void *in)
{
        return -ENOSYS;
}

/*
 * shash_prepare_alg - validate an shash algorithm and fill in defaults
 *
 * Rejects algorithms that provide only one of ->export/->import, runs the
 * common hash checks, sets the type and flags, installs defaults for the
 * optional callbacks that are missing and finally checks the descriptor
 * and state sizes against their limits.
 *
 * Returns 0 on success or -EINVAL.
 */
static int shash_prepare_alg(struct shash_alg *alg)
{
        struct crypto_alg *base = &alg->halg.base;
        int err;

        /* export and import must be provided together or not at all */
        if ((alg->export && !alg->import) || (alg->import && !alg->export))
                return -EINVAL;

        err = hash_prepare_alg(&alg->halg);
        if (err)
                return err;

        base->cra_type = &crypto_shash_type;
        base->cra_flags |= CRYPTO_ALG_TYPE_SHASH;
        base->cra_flags |= CRYPTO_ALG_REQ_VIRT;

        /*
         * Handle missing optional functions.  For each one we can either
         * install a default here, or we can leave the pointer as NULL and check
         * the pointer for NULL in crypto_shash_*(), avoiding an indirect call
         * when the default behavior is desired.  For ->finup and ->digest we
         * install defaults, since for optimal performance algorithms should
         * implement these anyway.  On the other hand, for ->import and
         * ->export the common case and best performance comes from the simple
         * memcpy of the shash_desc_ctx, so when those pointers are NULL we
         * leave them NULL and provide the memcpy with no indirect call.
         */
        if (!alg->finup)
                alg->finup = shash_default_finup;
        if (!alg->digest)
                alg->digest = shash_default_digest;
        /* memcpy-style export: the state is the descriptor context itself */
        if (!alg->export && !alg->halg.statesize)
                alg->halg.statesize = alg->descsize;
        if (!alg->setkey)
                alg->setkey = shash_no_setkey;

        if (base->cra_flags & CRYPTO_AHASH_ALG_BLOCK_ONLY) {
                /* a buffered length has to fit in the single length byte */
                BUILD_BUG_ON(MAX_ALGAPI_BLOCKSIZE >= 256);
                /* append room for one block plus the length byte */
                alg->descsize += base->cra_blocksize + 1;
                alg->statesize += base->cra_blocksize + 1;
                alg->export_core = alg->export;
                alg->import_core = alg->import;
        } else if (!alg->export_core || !alg->import_core) {
                /* no usable core export: install stubs and flag it */
                alg->export_core = shash_default_export_core;
                alg->import_core = shash_default_import_core;
                base->cra_flags |= CRYPTO_AHASH_ALG_NO_EXPORT_CORE;
        }

        /* checked after the block-only adjustment above */
        if (alg->descsize > HASH_MAX_DESCSIZE)
                return -EINVAL;
        if (alg->statesize > HASH_MAX_STATESIZE)
                return -EINVAL;

        base->cra_reqsize = sizeof(struct shash_desc) + alg->descsize;

        return 0;
}

/*
 * Prepare @alg with shash_prepare_alg() and, if that succeeds, register it
 * with crypto_register_alg().  Returns 0 or a negative errno.
 */
int crypto_register_shash(struct shash_alg *alg)
{
        int ret = shash_prepare_alg(alg);

        if (ret)
                return ret;

        return crypto_register_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_register_shash);

/* Unregister @alg by handing its embedded base to crypto_unregister_alg(). */
void crypto_unregister_shash(struct shash_alg *alg)
{
        crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shash);

/*
 * Register @count algorithms from @algs in order.  If one of them fails,
 * those registered before it are unregistered again in reverse order and
 * the error is returned.  Returns 0 when all registrations succeed.
 */
int crypto_register_shashes(struct shash_alg *algs, int count)
{
        int i;

        for (i = 0; i < count; i++) {
                int ret = crypto_register_shash(&algs[i]);

                if (!ret)
                        continue;

                /* roll back entries i-1 .. 0 */
                while (--i >= 0)
                        crypto_unregister_shash(&algs[i]);

                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_shashes);

/* Unregister @count algorithms from @algs, last entry first. */
void crypto_unregister_shashes(struct shash_alg *algs, int count)
{
        int i = count;

        while (--i >= 0)
                crypto_unregister_shash(&algs[i]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shashes);

/*
 * Register a template instance as an shash.  An instance without a ->free
 * callback triggers a warning and -EINVAL.  Otherwise the embedded
 * algorithm is prepared and the instance is registered with @tmpl.
 */
int shash_register_instance(struct crypto_template *tmpl,
                            struct shash_instance *inst)
{
        int ret;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        ret = shash_prepare_alg(&inst->alg);
        if (!ret)
                ret = crypto_register_instance(tmpl,
                                               shash_crypto_instance(inst));

        return ret;
}
EXPORT_SYMBOL_GPL(shash_register_instance);

/*
 * Free helper for instances whose context is a single spawn: the spawn
 * found at shash_instance_ctx() is dropped, then the instance is kfree()d.
 */
void shash_free_singlespawn_instance(struct shash_instance *inst)
{
        crypto_drop_spawn(shash_instance_ctx(inst));
        kfree(inst);
}
EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Synchronous cryptographic hash type");



























   60 




















































  300 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H

#include <asm/rmwcc.h>
#include <asm/percpu.h>

#include <linux/static_call_types.h>

DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);

/* We use the MSB mostly because it's available */
#define PREEMPT_NEED_RESCHED        0x80000000

/*
 * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
 * that a decrement hitting 0 means we can and should reschedule.
 */
#define PREEMPT_ENABLED        (0 + PREEMPT_NEED_RESCHED)

/*
 * Return this CPU's preempt count with the PREEMPT_NEED_RESCHED bit masked
 * out, so as not to confuse all current users that think a non-zero value
 * indicates we cannot preempt.
 */
static __always_inline int preempt_count(void)
{
        int raw = raw_cpu_read_4(__preempt_count);

        return raw & ~PREEMPT_NEED_RESCHED;
}

/*
 * Replace the count part of this CPU's __preempt_count with @pc while
 * carrying over whatever PREEMPT_NEED_RESCHED bit is currently stored.
 */
static __always_inline void preempt_count_set(int pc)
{
        int old, new;

        old = raw_cpu_read_4(__preempt_count);
        do {
                /* keep the stored resched bit, take all other bits from pc */
                new = (old & PREEMPT_NEED_RESCHED) |
                        (pc & ~PREEMPT_NEED_RESCHED);
                /* repeat until the compare-and-exchange succeeds */
        } while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new));
}

/*
 * must be macros to avoid header recursion hell
 *
 * init_task_preempt_count() expands to nothing here;
 * init_idle_preempt_count() sets the given CPU's __preempt_count to
 * PREEMPT_DISABLED.
 */
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
        per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)

/*
 * We fold the NEED_RESCHED bit into the preempt count such that
 * preempt_enable() can decrement and test for needing to reschedule with a
 * single instruction.
 *
 * We invert the actual bit, so that when the decrement hits 0 we know we both
 * need to resched (the bit is cleared) and can resched (no preempt count).
 */

/*
 * Flag that a reschedule is needed.  The bit is stored inverted, so this
 * clears PREEMPT_NEED_RESCHED in the per-CPU count.
 */
static __always_inline void set_preempt_need_resched(void)
{
        raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}

/*
 * Withdraw the reschedule request.  The bit is stored inverted, so this
 * sets PREEMPT_NEED_RESCHED in the per-CPU count.
 */
static __always_inline void clear_preempt_need_resched(void)
{
        raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
}

/*
 * Report whether a reschedule is pending, i.e. whether the inverted
 * PREEMPT_NEED_RESCHED bit is currently clear in the per-CPU count.
 */
static __always_inline bool test_preempt_need_resched(void)
{
        return (raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED) == 0;
}

/*
 * The various preempt_count add/sub methods
 */

/* Add @val to this CPU's __preempt_count. */
static __always_inline void __preempt_count_add(int val)
{
        raw_cpu_add_4(__preempt_count, val);
}

/* Subtract @val from this CPU's __preempt_count (implemented as adding -@val). */
static __always_inline void __preempt_count_sub(int val)
{
        raw_cpu_add_4(__preempt_count, -val);
}

/*
 * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
 * a decrement which hits zero means we have no preempt_count and should
 * reschedule.
 */
static __always_inline bool __preempt_count_dec_and_test(void)
{
        /*
         * Generates a "decl" on the per-CPU count and returns the "e"
         * condition of that instruction, i.e. whether the result was zero.
         */
        return GEN_UNARY_RMWcc("decl", __my_cpu_var(__preempt_count), e,
                               __percpu_arg([var]));
}

/*
 * True when the raw per-CPU count equals @preempt_offset, i.e. when we
 * need to resched and can (barring IRQ state).  The comparison uses the
 * unmasked value, so the inverted PREEMPT_NEED_RESCHED bit takes part.
 */
static __always_inline bool should_resched(int preempt_offset)
{
        bool due = raw_cpu_read_4(__preempt_count) == preempt_offset;

        return unlikely(due);
}

#ifdef CONFIG_PREEMPTION

extern asmlinkage void preempt_schedule(void);
extern asmlinkage void preempt_schedule_thunk(void);

#define preempt_schedule_dynamic_enabled        preempt_schedule_thunk
#define preempt_schedule_dynamic_disabled        NULL

extern asmlinkage void preempt_schedule_notrace(void);
extern asmlinkage void preempt_schedule_notrace_thunk(void);

#define preempt_schedule_notrace_dynamic_enabled        preempt_schedule_notrace_thunk
#define preempt_schedule_notrace_dynamic_disabled        NULL

#ifdef CONFIG_PREEMPT_DYNAMIC

DECLARE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);

/*
 * CONFIG_PREEMPT_DYNAMIC flavour: emit, as inline asm, a call to the
 * static-call trampoline of preempt_schedule.
 * __STATIC_CALL_MOD_ADDRESSABLE() is invoked first (presumably to keep the
 * trampoline symbol referenced -- confirm in linux/static_call_types.h).
 */
#define __preempt_schedule() \
do { \
        __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
        asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
} while (0)

DECLARE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);

/*
 * CONFIG_PREEMPT_DYNAMIC flavour: emit, as inline asm, a call to the
 * static-call trampoline of preempt_schedule_notrace.
 * __STATIC_CALL_MOD_ADDRESSABLE() is invoked first (presumably to keep the
 * trampoline symbol referenced -- confirm in linux/static_call_types.h).
 */
#define __preempt_schedule_notrace() \
do { \
        __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
        asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
} while (0)

#else /* PREEMPT_DYNAMIC */

/*
 * Non-dynamic flavour: call preempt_schedule_thunk directly via inline asm.
 *
 * Wrapped in do { } while (0), like the CONFIG_PREEMPT_DYNAMIC variant, so
 * that the expansion is exactly one statement and the caller supplies the
 * terminating semicolon.  A trailing ';' inside the macro would leave a
 * stray empty statement behind and break unbraced if/else users.
 */
#define __preempt_schedule() \
do { \
        asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT); \
} while (0)

/*
 * Non-dynamic flavour: call preempt_schedule_notrace_thunk directly via
 * inline asm.
 *
 * Wrapped in do { } while (0), like the CONFIG_PREEMPT_DYNAMIC variant, so
 * that the expansion is exactly one statement and the caller supplies the
 * terminating semicolon.  A trailing ';' inside the macro would leave a
 * stray empty statement behind and break unbraced if/else users.
 */
#define __preempt_schedule_notrace() \
do { \
        asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT); \
} while (0)

#endif /* PREEMPT_DYNAMIC */

#endif /* PREEMPTION */

#endif /* __ASM_PREEMPT_H */





















































































































































































































































































































































  320 











































































































































































































































































































































































































































































































































  108 


  320 
























  319 

  319 

  320 

   70 
    2 















  316 

    3 


  319 








  317 

    2 

  317 









































  317 


  316 


  318 



































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2001
 *
 * Author: Dipankar Sarma <dipankar@in.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                http://lse.sourceforge.net/locking/rcupdate.html
 *
 */

#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/processor.h>
#include <linux/context_tracking_irq.h>

/*
 * Wrap-tolerant ordering tests: the difference (a) - (b) is compared with
 * ULONG_MAX / 2.  ULONG_CMP_GE() holds when the difference is at most half
 * of the unsigned long range, ULONG_CMP_LT() when it exceeds that.
 */
#define ULONG_CMP_GE(a, b)        ((a) - (b) <= ULONG_MAX / 2)
#define ULONG_CMP_LT(a, b)        ((a) - (b) > ULONG_MAX / 2)

#define RCU_SEQ_CTR_SHIFT    2
#define RCU_SEQ_STATE_MASK   ((1 << RCU_SEQ_CTR_SHIFT) - 1)

/* Exported common interfaces */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void synchronize_rcu(void);

struct rcu_gp_oldstate;
unsigned long get_completed_synchronize_rcu(void);
void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);

// Maximum number of unsigned long values corresponding to
// not-yet-completed RCU grace periods.
#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2

/**
 * same_state_synchronize_rcu - Compare two old-state values for equality
 * @oldstate1: First old-state value.
 * @oldstate2: Second old-state value.
 *
 * Both values must have come from get_state_synchronize_rcu(),
 * start_poll_synchronize_rcu(), or get_completed_synchronize_rcu().
 * Returns @true when they are identical and @false otherwise.  Structures
 * whose lifetimes are tracked by old-state values can therefore push those
 * values to a list header, which lets the structures themselves be
 * slightly smaller.
 */
static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2)
{
        const bool identical = (oldstate1 == oldstate2);

        return identical;
}

#ifdef CONFIG_PREEMPT_RCU

void __rcu_read_lock(void);
void __rcu_read_unlock(void);

/*
 * Defined as a macro as it is a very low level header included from
 * areas that don't even know about current.  This gives the rcu_read_lock()
 * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
 */
#define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting)

#else /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TINY_RCU
#define rcu_read_unlock_strict() do { } while (0)
#else
void rcu_read_unlock_strict(void);
#endif

/*
 * !CONFIG_PREEMPT_RCU flavour: entering a read-side critical section is
 * just a preempt_disable().
 */
static inline void __rcu_read_lock(void)
{
        preempt_disable();
}

/*
 * !CONFIG_PREEMPT_RCU flavour: leave a read-side critical section by
 * re-enabling preemption.  With CONFIG_RCU_STRICT_GRACE_PERIOD the
 * rcu_read_unlock_strict() hook runs first, while preemption is still
 * disabled.
 */
static inline void __rcu_read_unlock(void)
{
        if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
                rcu_read_unlock_strict();
        preempt_enable();
}

/* Without CONFIG_PREEMPT_RCU the nesting depth is unknowable; always report 0. */
static inline int rcu_preempt_depth(void) { return 0; }

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_RCU_LAZY
void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
#else
/* Without CONFIG_RCU_LAZY, call_rcu_hurry() simply forwards to call_rcu(). */
static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
        call_rcu(head, func);
}
#endif

/* Internal to kernel */
void rcu_init(void);
extern int rcu_scheduler_active;
void rcu_sched_clock_irq(int user);

#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
#else /* #ifdef CONFIG_RCU_STALL_COMMON */
static inline void rcu_sysrq_start(void) { }
static inline void rcu_sysrq_end(void) { }
#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */

#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK))
void rcu_irq_work_resched(void);
#else
static __always_inline void rcu_irq_work_resched(void) { }
#endif

#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
int rcu_nocb_cpu_offload(int cpu);
int rcu_nocb_cpu_deoffload(int cpu);
void rcu_nocb_flush_deferred_wakeup(void);

#define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s)

#else /* #ifdef CONFIG_RCU_NOCB_CPU */

static inline void rcu_init_nohz(void) { }
static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
static inline void rcu_nocb_flush_deferred_wakeup(void) { }

#define RCU_NOCB_LOCKDEP_WARN(c, s)

#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Note a quasi-voluntary context switch for RCU-tasks's benefit.
 * This is a macro rather than an inline function to avoid #include hell.
 */
#ifdef CONFIG_TASKS_RCU_GENERIC

# ifdef CONFIG_TASKS_RCU
/*
 * Clear @t's ->rcu_tasks_holdout flag if it is currently set, unless
 * @preempt is true, in which case nothing is done.
 */
# define rcu_tasks_classic_qs(t, preempt)                                \
        do {                                                                \
                if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout))        \
                        WRITE_ONCE((t)->rcu_tasks_holdout, false);        \
        } while (0)
void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
void rcu_tasks_torture_stats_print(char *tt, char *tf);
# else
# define rcu_tasks_classic_qs(t, preempt) do { } while (0)
# define call_rcu_tasks call_rcu
# define synchronize_rcu_tasks synchronize_rcu
# endif

# ifdef CONFIG_TASKS_TRACE_RCU
// Bits for ->trc_reader_special.b.need_qs field.
#define TRC_NEED_QS                0x1  // Task needs a quiescent state.
#define TRC_NEED_QS_CHECKED        0x2  // Task has been checked for needing quiescent state.

u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
void rcu_tasks_trace_qs_blkd(struct task_struct *t);

/*
 * Samples @t's ->trc_reader_nesting once, then:
 *  - if ->trc_reader_special.b.need_qs equals TRC_NEED_QS and the sampled
 *    nesting is zero, attempt to move need_qs from TRC_NEED_QS to
 *    TRC_NEED_QS_CHECKED via rcu_trc_cmpxchg_need_qs();
 *  - otherwise, if the sampled nesting is non-zero and not INT_MIN, and
 *    ->trc_reader_special.b.blocked is not yet set, call
 *    rcu_tasks_trace_qs_blkd().
 * @t is evaluated more than once.
 */
# define rcu_tasks_trace_qs(t)                                                        \
        do {                                                                        \
                int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting);        \
                                                                                \
                if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) &&        \
                    likely(!___rttq_nesting)) {                                        \
                        rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED);        \
                } else if (___rttq_nesting && ___rttq_nesting != INT_MIN &&        \
                           !READ_ONCE((t)->trc_reader_special.b.blocked)) {        \
                        rcu_tasks_trace_qs_blkd(t);                                \
                }                                                                \
        } while (0)
void rcu_tasks_trace_torture_stats_print(char *tt, char *tf);
# else
# define rcu_tasks_trace_qs(t) do { } while (0)
# endif

/*
 * Apply both per-flavour hooks to @t: rcu_tasks_classic_qs() (which
 * receives @preempt) followed by rcu_tasks_trace_qs().
 */
#define rcu_tasks_qs(t, preempt)                                        \
do {                                                                        \
        rcu_tasks_classic_qs((t), (preempt));                                \
        rcu_tasks_trace_qs(t);                                                \
} while (0)

# ifdef CONFIG_TASKS_RUDE_RCU
void synchronize_rcu_tasks_rude(void);
void rcu_tasks_rude_torture_stats_print(char *tt, char *tf);
# endif

#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
void exit_tasks_rcu_start(void);
void exit_tasks_rcu_finish(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
#define rcu_tasks_classic_qs(t, preempt) do { } while (0)
#define rcu_tasks_qs(t, preempt) do { } while (0)
#define rcu_note_voluntary_context_switch(t) do { } while (0)
#define call_rcu_tasks call_rcu
#define synchronize_rcu_tasks synchronize_rcu
static inline void exit_tasks_rcu_start(void) { }
static inline void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */

/**
 * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
 *
 * An RCU Tasks Trace grace period currently also acts as an RCU grace
 * period, but only as an accident of implementation that could change at
 * any time.  Code relying on this accident must call this function to
 * verify that the accident is still happening.
 *
 * You have been warned!
 */
static inline bool rcu_trace_implies_rcu_gp(void)
{
        return true;
}

/**
 * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
 *
 * This macro resembles cond_resched(), except that it is defined to
 * report potential quiescent states to RCU-tasks even if the cond_resched()
 * machinery were to be shut off, as some advocate for PREEMPTION kernels.
 *
 * It expands to rcu_tasks_qs(current, false) followed by cond_resched().
 */
#define cond_resched_tasks_rcu_qs() \
do { \
        rcu_tasks_qs(current, false); \
        cond_resched(); \
} while (0)

/**
 * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states
 * @old_ts: jiffies at start of processing.
 *
 * This helper is for long-running softirq handlers, such as NAPI threads in
 * networking. The caller should initialize the variable passed in as @old_ts
 * at the beginning of the softirq handler. When invoked frequently, this macro
 * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will
 * provide both RCU and RCU-Tasks quiescent states. Note that this macro
 * modifies its old_ts argument.
 *
 * @old_ts is both read and assigned to, so it must be a side-effect-free
 * lvalue.  rcu_softirq_qs() is invoked with preemption disabled by the
 * macro itself.
 *
 * Because regions of code that have disabled softirq act as RCU read-side
 * critical sections, this macro should be invoked with softirq (and
 * preemption) enabled.
 *
 * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would
 * have more chance to invoke schedule() calls and provide necessary quiescent
 * states. As a contrast, calling cond_resched() only won't achieve the same
 * effect because cond_resched() does not provide RCU-Tasks quiescent states.
 */
#define rcu_softirq_qs_periodic(old_ts) \
do { \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \
            time_after(jiffies, (old_ts) + HZ / 10)) { \
                preempt_disable(); \
                rcu_softirq_qs(); \
                preempt_enable(); \
                (old_ts) = jiffies; \
        } \
} while (0)

/*
 * Infrastructure to implement the synchronize_() primitives in
 * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
 */

#if defined(CONFIG_TREE_RCU)
#include <linux/rcutree.h>
#elif defined(CONFIG_TINY_RCU)
#include <linux/rcutiny.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif

/*
 * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls
 * are needed for dynamic initialization and destruction of rcu_head
 * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for
 * dynamic initialization and destruction of statically allocated rcu_head
 * structures.  However, rcu_head structures allocated dynamically in the
 * heap don't need any initialization.
 */
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head);
void destroy_rcu_head(struct rcu_head *head);
void init_rcu_head_on_stack(struct rcu_head *head);
void destroy_rcu_head_on_stack(struct rcu_head *head);
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
static inline void init_rcu_head(struct rcu_head *head) { }
static inline void destroy_rcu_head(struct rcu_head *head) { }
static inline void init_rcu_head_on_stack(struct rcu_head *head) { }
static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { }
#endif        /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */

#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU)
bool rcu_lockdep_current_cpu_online(void);
#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
static inline bool rcu_lockdep_current_cpu_online(void) { return true; }
#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */

extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Tell lockdep that the RCU pseudo-lock @map is being acquired at the
 * caller's location.  NOTE(review): the numeric arguments follow
 * lock_acquire()'s parameter order (presumably subclass, trylock, read,
 * check) -- confirm against linux/lockdep.h.
 */
static inline void rcu_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}

/*
 * Same as rcu_lock_acquire() except that the third lock_acquire()
 * argument is 1 instead of 0 (presumably the trylock flag -- confirm
 * against linux/lockdep.h).
 */
static inline void rcu_try_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_);
}

/* Tell lockdep that the RCU pseudo-lock @map is released at the caller's location. */
static inline void rcu_lock_release(struct lockdep_map *map)
{
        lock_release(map, _THIS_IP_);
}

int debug_lockdep_rcu_enabled(void);
int rcu_read_lock_held(void);
int rcu_read_lock_bh_held(void);
int rcu_read_lock_sched_held(void);
int rcu_read_lock_any_held(void);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

# define rcu_lock_acquire(a)                do { } while (0)
# define rcu_try_lock_acquire(a)        do { } while (0)
# define rcu_lock_release(a)                do { } while (0)

/* Without CONFIG_DEBUG_LOCK_ALLOC there is nothing to check; always reports 1. */
static inline int rcu_read_lock_held(void) { return 1; }

/* Without CONFIG_DEBUG_LOCK_ALLOC there is nothing to check; always reports 1. */
static inline int rcu_read_lock_bh_held(void) { return 1; }

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC, approximate "in an RCU-sched reader"
 * by "not preemptible".
 */
static inline int rcu_read_lock_sched_held(void)
{
        return !preemptible();
}

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC, approximate "in some kind of RCU
 * reader" by "not preemptible".
 */
static inline int rcu_read_lock_any_held(void)
{
        return !preemptible();
}

/* Without CONFIG_DEBUG_LOCK_ALLOC, lockdep-based RCU checking is never enabled. */
static inline int debug_lockdep_rcu_enabled(void) { return 0; }

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_PROVE_RCU

/**
 * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
 * @c: condition to check
 * @s: informative message
 *
 * This checks debug_lockdep_rcu_enabled() before checking (c) to
 * prevent early boot splats due to lockdep not yet being initialized,
 * and rechecks it after checking (c) to prevent false-positive splats
 * due to races with lockdep being disabled.  See commit 3066820034b5dd
 * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail.
 *
 * Each expansion carries its own static __warned flag, which is set just
 * before lockdep_rcu_suspicious() is called, so a given call site reports
 * at most once.
 */
#define RCU_LOCKDEP_WARN(c, s)                                                \
        do {                                                                \
                static bool __section(".data..unlikely") __warned;        \
                if (debug_lockdep_rcu_enabled() && (c) &&                \
                    debug_lockdep_rcu_enabled() && !__warned) {                \
                        __warned = true;                                \
                        lockdep_rcu_suspicious(__FILE__, __LINE__, s);        \
                }                                                        \
        } while (0)

#ifndef CONFIG_PREEMPT_RCU
/*
 * !CONFIG_PREEMPT_RCU flavour: complain via RCU_LOCKDEP_WARN() if
 * rcu_lock_map is held when this check runs.
 */
static inline void rcu_preempt_sleep_check(void)
{
        RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
                         "Illegal context switch in RCU read-side critical section");
}
#else // #ifndef CONFIG_PREEMPT_RCU
static inline void rcu_preempt_sleep_check(void) { }
#endif // #else // #ifndef CONFIG_PREEMPT_RCU

/*
 * Run the sleep-time lockdep checks in order: rcu_preempt_sleep_check(),
 * then the RCU-bh map (skipped when CONFIG_PREEMPT_RT is enabled), then
 * the RCU-sched map.  Each held map produces its own RCU_LOCKDEP_WARN().
 */
#define rcu_sleep_check()                                                \
        do {                                                                \
                rcu_preempt_sleep_check();                                \
                if (!IS_ENABLED(CONFIG_PREEMPT_RT))                        \
                    RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),        \
                                 "Illegal context switch in RCU-bh read-side critical section"); \
                RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),        \
                                 "Illegal context switch in RCU-sched read-side critical section"); \
        } while (0)

// Decide whether a lockdep_assert_in_rcu_*() check should fire.  @c is the
// caller's "not protected" condition; the check also fires when RCU is not
// watching or when rcu_lockdep_current_cpu_online() reports false.  As in
// RCU_LOCKDEP_WARN(), debug_lockdep_rcu_enabled() is consulted both before
// and after the condition is evaluated.
static inline bool lockdep_assert_rcu_helper(bool c)
{
        if (!debug_lockdep_rcu_enabled())
                return false;
        if (!c && rcu_is_watching() && rcu_lockdep_current_cpu_online())
                return false;
        return debug_lockdep_rcu_enabled();
}

/**
 * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock()
 *
 * Splats if lockdep is enabled and there is no rcu_read_lock() in effect.
 */
#define lockdep_assert_in_rcu_read_lock() \
        WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map)))

/**
 * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh()
 *
 * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect.
 * Note that local_bh_disable() and friends do not suffice here, instead an
 * actual rcu_read_lock_bh() is required.
 */
#define lockdep_assert_in_rcu_read_lock_bh() \
        WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map)))

/**
 * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched()
 *
 * Splats if lockdep is enabled and there is no rcu_read_lock_sched()
 * in effect.  Note that preempt_disable() and friends do not suffice here,
 * instead an actual rcu_read_lock_sched() is required.
 */
#define lockdep_assert_in_rcu_read_lock_sched() \
        WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map)))

/**
 * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader
 *
 * Splats if lockdep is enabled and there is no RCU reader of any
 * type in effect.  Note that regions of code protected by things like
 * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify
 * as RCU readers.
 *
 * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY
 * kernels that are not also built with PREEMPT_COUNT.  But if you have
 * lockdep enabled, you might as well also enable PREEMPT_COUNT.
 */
#define lockdep_assert_in_rcu_reader()                                                                \
        WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) &&                        \
                                               !lock_is_held(&rcu_bh_lock_map) &&                \
                                               !lock_is_held(&rcu_sched_lock_map) &&                \
                                               preemptible()))

#else /* #ifdef CONFIG_PROVE_RCU */

#define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c))
#define rcu_sleep_check() do { } while (0)

#define lockdep_assert_in_rcu_read_lock() do { } while (0)
#define lockdep_assert_in_rcu_read_lock_bh() do { } while (0)
#define lockdep_assert_in_rcu_read_lock_sched() do { } while (0)
#define lockdep_assert_in_rcu_reader() do { } while (0)

#endif /* #else #ifdef CONFIG_PROVE_RCU */

/*
 * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
 * and rcu_assign_pointer().  Some of these could be folded into their
 * callers, but they are left separate in order to ease introduction of
 * multiple pointers markings to match different RCU implementations
 * (e.g., __srcu), should this make sense in the future.
 */

#ifdef __CHECKER__
/*
 * Checker-only build: compare @p against itself cast to a pointer in
 * address space @space and discard the result.  Presumably this exists so
 * that sparse can flag address-space mismatches; it has no runtime effect.
 */
#define rcu_check_sparse(p, space) \
        ((void)(((typeof(*p) space *)p) == p))
#else /* #ifdef __CHECKER__ */
#define rcu_check_sparse(p, space)
#endif /* #else #ifdef __CHECKER__ */

/*
 * Helper behind unrcu_pointer(): copy @p into the temporary named @local
 * with a __force cast, run the sparse check against __rcu, and yield the
 * value as a __kernel pointer.  @local is the caller-chosen temporary name.
 */
#define __unrcu_pointer(p, local)                                        \
({                                                                        \
        typeof(*p) *local = (typeof(*p) *__force)(p);                        \
        rcu_check_sparse(p, __rcu);                                        \
        ((typeof(*p) __force __kernel *)(local));                         \
})
/**
 * unrcu_pointer - mark a pointer as not being RCU protected
 * @p: pointer needing to lose its __rcu property
 *
 * Converts @p from an __rcu pointer to a __kernel pointer.
 * This allows an __rcu pointer to be used with xchg() and friends.
 */
#define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu))

/*
 * Load @p once with READ_ONCE() into the temporary named @local, run the
 * sparse check for address space @space, and yield the value as a
 * __kernel pointer.  No lockdep condition is checked here.
 */
#define __rcu_access_pointer(p, local, space) \
({ \
        typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/*
 * Load @p once with READ_ONCE() into the temporary named @local, emit an
 * RCU_LOCKDEP_WARN() if @c is false, run the sparse check for address
 * space @space, and yield the loaded value as a __kernel pointer.
 */
#define __rcu_dereference_check(p, local, c, space) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/*
 * Emit an RCU_LOCKDEP_WARN() if @c is false, run the sparse check for
 * address space @space, and yield @p itself as a __kernel pointer.  Unlike
 * the other accessors, @p is read without READ_ONCE() and @local is not
 * used.
 */
#define __rcu_dereference_protected(p, local, c, space) \
({ \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(p)); \
})
/*
 * Load @p once with READ_ONCE() into the temporary named @local and yield
 * it as a __kernel pointer.  Neither a lockdep condition nor the sparse
 * address-space check is applied.
 */
#define __rcu_dereference_raw(p, local) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(p) local = READ_ONCE(p); \
        ((typeof(*p) __force __kernel *)(local)); \
})
#define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu))

/**
 * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
 * @v: The value to statically initialize with.
 */
#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)

/**
 * rcu_assign_pointer() - assign to RCU-protected pointer
 * @p: pointer to assign to
 * @v: value to assign (publish)
 *
 * Assigns the specified value to the specified RCU-protected
 * pointer, ensuring that any concurrent RCU readers will see
 * any prior initialization.
 *
 * Inserts memory barriers on architectures that require them
 * (which is most of them), and also prevents the compiler from
 * reordering the code that initializes the structure after the pointer
 * assignment.  More importantly, this call documents which pointers
 * will be dereferenced by RCU read-side code.
 *
 * In some special cases, you may use RCU_INIT_POINTER() instead
 * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
 * to the fact that it does not constrain either the CPU or the compiler.
 * That said, using RCU_INIT_POINTER() when you should have used
 * rcu_assign_pointer() is a very bad thing that results in
 * impossible-to-diagnose memory corruption.  So please be careful.
 * See the RCU_INIT_POINTER() comment header for details.
 *
 * Note that rcu_assign_pointer() evaluates each of its arguments only
 * once, appearances notwithstanding.  One of the "extra" evaluations
 * is in typeof() and the other visible only to sparse (__CHECKER__),
 * neither of which actually execute the argument.  As with most cpp
 * macros, this execute-arguments-only-once property is important, so
 * please be careful when making changes to rcu_assign_pointer() and the
 * other macros that it invokes.
 *
 * Implementation note: @v is first captured as a uintptr_t.  When @v is a
 * compile-time constant equal to NULL the store is a plain WRITE_ONCE();
 * every other value is stored with smp_store_release().
 */
#define rcu_assign_pointer(p, v)                                              \
do {                                                                              \
        uintptr_t _r_a_p__v = (uintptr_t)(v);                                      \
        rcu_check_sparse(p, __rcu);                                              \
                                                                              \
        if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)              \
                WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
        else                                                                      \
                smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
} while (0)

/**
 * rcu_replace_pointer() - replace an RCU pointer, returning its old value
 * @rcu_ptr: RCU pointer, whose old value is returned
 * @ptr: regular pointer
 * @c: the lockdep conditions under which the dereference will take place
 *
 * Perform a replacement, where @rcu_ptr is an RCU-annotated
 * pointer and @c is the lockdep argument that is passed to the
 * rcu_dereference_protected() call used to read that pointer.  The old
 * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr.
 */
#define rcu_replace_pointer(rcu_ptr, ptr, c)                                \
({                                                                        \
        typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c));        \
        rcu_assign_pointer((rcu_ptr), (ptr));                                \
        __tmp;                                                                \
})

/**
 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
 * @p: The pointer to read
 *
 * Return the value of the specified RCU-protected pointer, but omit the
 * lockdep checks for being in an RCU read-side critical section.  This is
 * useful when the value of this pointer is accessed, but the pointer is
 * not dereferenced, for example, when testing an RCU-protected pointer
 * against NULL.  Although rcu_access_pointer() may also be used in cases
 * where update-side locks prevent the value of the pointer from changing,
 * you should instead use rcu_dereference_protected() for this use case.
 * Within an RCU read-side critical section, there is little reason to
 * use rcu_access_pointer().
 *
 * It is usually best to test the rcu_access_pointer() return value
 * directly in order to avoid accidental dereferences being introduced
 * by later inattentive changes.  In other words, assigning the
 * rcu_access_pointer() return value to a local variable results in an
 * accident waiting to happen.
 *
 * It is also permissible to use rcu_access_pointer() when read-side
 * access to the pointer was removed at least one grace period ago, as is
 * the case in the context of the RCU callback that is freeing up the data,
 * or after a synchronize_rcu() returns.  This can be useful when tearing
 * down multi-linked structures after a grace period has elapsed.  However,
 * rcu_dereference_protected() is normally preferred for this use case.
 */
#define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu)

/**
 * rcu_dereference_check() - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Do an rcu_dereference(), but check that the conditions under which the
 * dereference will take place are correct.  Typically the conditions
 * indicate the various locking conditions that should be held at that
 * point.  The check should return true if the conditions are satisfied.
 * An implicit check for being in an RCU read-side critical section
 * (rcu_read_lock()) is included.
 *
 * For example:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
 *
 * could be used to indicate to lockdep that foo->bar may only be dereferenced
 * if either rcu_read_lock() is held, or that the lock required to replace
 * the bar struct at foo->bar is held.
 *
 * Note that the list of conditions may also include indications of when a lock
 * need not be held, for example during initialisation or destruction of the
 * target struct:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
 *                                              atomic_read(&foo->usage) == 0);
 *
 * Inserts memory barriers on architectures that require them
 * (currently only the Alpha), prevents the compiler from refetching
 * (and from merging fetches), and, more importantly, documents exactly
 * which pointers are protected by RCU and checks that the pointer is
 * annotated as __rcu.
 */
#define rcu_dereference_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_held(), __rcu)

/**
 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-bh counterpart to rcu_dereference_check().  However,
 * please note that starting in v5.0 kernels, vanilla RCU grace periods
 * wait for local_bh_disable() regions of code in addition to regions of
 * code demarked by rcu_read_lock() and rcu_read_unlock().  This means
 * that synchronize_rcu(), call_rcu, and friends all take not only
 * rcu_read_lock() but also rcu_read_lock_bh() into account.
 */
#define rcu_dereference_bh_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_bh_held(), __rcu)

/**
 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-sched counterpart to rcu_dereference_check().
 * However, please note that starting in v5.0 kernels, vanilla RCU grace
 * periods wait for preempt_disable() regions of code in addition to
 * regions of code demarked by rcu_read_lock() and rcu_read_unlock().
 * This means that synchronize_rcu(), call_rcu, and friends all take not
 * only rcu_read_lock() but also rcu_read_lock_sched() into account.
 */
#define rcu_dereference_sched_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_sched_held(), \
                                __rcu)

/**
 * rcu_dereference_all_check() - rcu_dereference_all with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is similar to rcu_dereference_check(), but allows protection
 * by all forms of vanilla RCU readers, including preemption disabled,
 * bh-disabled, and interrupt-disabled regions of code.  Note that "vanilla
 * RCU" excludes SRCU and the various Tasks RCU flavors.  Please note
 * that this macro should not be backported to any Linux-kernel version
 * preceding v5.0 due to changes in synchronize_rcu() semantics prior
 * to that version.
 */
#define rcu_dereference_all_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_any_held(), \
                                __rcu)

/*
 * The tracing infrastructure traces RCU (we want that), but unfortunately
 * some of the RCU checks causes tracing to lock up the system.
 *
 * The no-tracing version of rcu_dereference_raw() must not call
 * rcu_read_lock_held().
 */
#define rcu_dereference_raw_check(p) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu)

/**
 * rcu_dereference_protected() - fetch RCU pointer when updates prevented
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Return the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE().  This is useful in cases where update-side locks
 * prevent the value of the pointer from changing.  Please note that this
 * primitive does *not* prevent the compiler from repeating this reference
 * or combining it with other references, so it should not be used without
 * protection of appropriate locks.
 *
 * This function is only for update-side use.  Using this function
 * when protected only by rcu_read_lock() will result in infrequent
 * but very ugly failures.
 */
#define rcu_dereference_protected(p, c) \
        __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu)


/**
 * rcu_dereference() - fetch RCU-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * This is a simple wrapper around rcu_dereference_check() that passes 0 as
 * the extra condition, so only that macro's implicit rcu_read_lock_held()
 * check remains.
 */
#define rcu_dereference(p) rcu_dereference_check(p, 0)

/**
 * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_bh_check() do the dirty work, passing 0 as the
 * extra condition.
 */
#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)

/**
 * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_sched_check() do the dirty work, passing 0 as the
 * extra condition.
 */
#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)

/**
 * rcu_dereference_all() - fetch RCU-all-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_all_check() do the dirty work, passing 0 as the
 * extra condition.
 */
#define rcu_dereference_all(p) rcu_dereference_all_check(p, 0)

/**
 * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism
 * @p: The pointer to hand off
 *
 * This is simply an identity function, but it documents where a pointer
 * is handed off from RCU to some other synchronization mechanism, for
 * example, reference counting or locking.  In C11, it would map to
 * kill_dependency().  The macro expands to (@p) and nothing else, so it
 * generates no code of its own.  It could be used as follows::
 *
 *        rcu_read_lock();
 *        p = rcu_dereference(gp);
 *        long_lived = is_long_lived(p);
 *        if (long_lived) {
 *                if (!atomic_inc_not_zero(p->refcnt))
 *                        long_lived = false;
 *                else
 *                        p = rcu_pointer_handoff(p);
 *        }
 *        rcu_read_unlock();
 */
#define rcu_pointer_handoff(p) (p)

/**
 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
 *
 * When synchronize_rcu() is invoked on one CPU while other CPUs
 * are within RCU read-side critical sections, then the
 * synchronize_rcu() is guaranteed to block until after all the other
 * CPUs exit their critical sections.  Similarly, if call_rcu() is invoked
 * on one CPU while other CPUs are within RCU read-side critical
 * sections, invocation of the corresponding RCU callback is deferred
 * until after all the other CPUs exit their critical sections.
 *
 * Both synchronize_rcu() and call_rcu() also wait for regions of code
 * with preemption disabled, including regions of code with interrupts or
 * softirqs disabled.
 *
 * Note, however, that RCU callbacks are permitted to run concurrently
 * with new RCU read-side critical sections.  One way that this can happen
 * is via the following sequence of events: (1) CPU 0 enters an RCU
 * read-side critical section, (2) CPU 1 invokes call_rcu() to register
 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
 * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU
 * callback is invoked.  This is legal, because the RCU read-side critical
 * section that was running concurrently with the call_rcu() (and which
 * therefore might be referencing something that the corresponding RCU
 * callback would free up) has completed before the corresponding
 * RCU callback is invoked.
 *
 * RCU read-side critical sections may be nested.  Any deferred actions
 * will be deferred until the outermost RCU read-side critical section
 * completes.
 *
 * You can avoid reading and understanding the next paragraph by
 * following this rule: don't put anything in an rcu_read_lock() RCU
 * read-side critical section that would block in a !PREEMPTION kernel.
 * But if you want the full story, read on!
 *
 * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
 * it is illegal to block while in an RCU read-side critical section.
 * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
 * kernel builds, RCU read-side critical sections may be preempted,
 * but explicit blocking is illegal.  Finally, in preemptible RCU
 * implementations in real-time (with -rt patchset) kernel builds, RCU
 * read-side critical sections may be preempted and they may also block, but
 * only when acquiring spinlocks that are subject to priority inheritance.
 */
static __always_inline void rcu_read_lock(void)
{
        /* Enter the read-side critical section proper. */
        __rcu_read_lock();
        /* Context annotation for the sparse static checker. */
        __acquire(RCU);
        /* Record the acquisition against rcu_lock_map. */
        rcu_lock_acquire(&rcu_lock_map);
        /* Warn when rcu_is_watching() reports false (use while idle). */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock() used illegally while idle");
}

/*
 * So where is rcu_write_lock()?  It does not exist, as there is no
 * way for writers to lock out RCU readers.  This is a feature, not
 * a bug -- this property is what provides RCU's performance benefits.
 * Of course, writers must coordinate with each other.  The normal
 * spinlock primitives work well for this, but any other technique may be
 * used as well.  RCU does not care how the writers keep out of each
 * others' way, as long as they do so.
 */

/**
 * rcu_read_unlock() - marks the end of an RCU read-side critical section.
 *
 * In almost all situations, rcu_read_unlock() is immune from deadlock.
 * This deadlock immunity also extends to the scheduler's runqueue
 * and priority-inheritance spinlocks, courtesy of the quiescent-state
 * deferral that is carried out when rcu_read_unlock() is invoked with
 * interrupts disabled.
 *
 * See rcu_read_lock() for more information.
 */
static inline void rcu_read_unlock(void)
{
        /* Warn when rcu_is_watching() reports false (use while idle). */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock() used illegally while idle");
        rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
        /* Context annotation for the sparse static checker. */
        __release(RCU);
        /* Leave the read-side critical section; done last. */
        __rcu_read_unlock();
}

/**
 * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
 *
 * This is equivalent to rcu_read_lock(), but also disables softirqs.
 * Note that anything else that disables softirqs can also serve as an RCU
 * read-side critical section.  However, please note that this equivalence
 * applies only to v5.0 and later.  Before v5.0, rcu_read_lock() and
 * rcu_read_lock_bh() were unrelated.
 *
 * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
 * was invoked from some other task.
 */
static inline void rcu_read_lock_bh(void)
{
        /* Disabling softirqs is what forms the critical section here. */
        local_bh_disable();
        /* Context annotation for the sparse static checker. */
        __acquire(RCU_BH);
        /* Record the acquisition against rcu_bh_lock_map. */
        rcu_lock_acquire(&rcu_bh_lock_map);
        /* Warn when rcu_is_watching() reports false (use while idle). */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_bh() used illegally while idle");
}

/**
 * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section
 *
 * See rcu_read_lock_bh() for more information.
 */
static inline void rcu_read_unlock_bh(void)
{
        /* Warn when rcu_is_watching() reports false (use while idle). */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_bh() used illegally while idle");
        /* Drop the rcu_bh_lock_map record taken by rcu_read_lock_bh(). */
        rcu_lock_release(&rcu_bh_lock_map);
        /* Context annotation for the sparse static checker. */
        __release(RCU_BH);
        /* Re-enable softirqs last, mirroring rcu_read_lock_bh() in reverse. */
        local_bh_enable();
}

/**
 * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
 *
 * This is equivalent to rcu_read_lock(), but also disables preemption.
 * Read-side critical sections can also be introduced by anything else that
 * disables preemption, including local_irq_disable() and friends.  However,
 * please note that the equivalence to rcu_read_lock() applies only to
 * v5.0 and later.  Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
 * were unrelated.
 *
 * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_sched() from process context if the matching
 * rcu_read_lock_sched() was invoked from an NMI handler.
 */
static inline void rcu_read_lock_sched(void)
{
        /* Disabling preemption is what forms the critical section here. */
        preempt_disable();
        /* Context annotation for the sparse static checker. */
        __acquire(RCU_SCHED);
        /* Record the acquisition against rcu_sched_lock_map. */
        rcu_lock_acquire(&rcu_sched_lock_map);
        /* Warn when rcu_is_watching() reports false (use while idle). */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_sched() used illegally while idle");
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 *
 * Unlike rcu_read_lock_sched(), this variant uses preempt_disable_notrace()
 * and omits both the rcu_lock_acquire() call and the RCU_LOCKDEP_WARN()
 * check; only the sparse __acquire() annotation is kept.
 */
static inline notrace void rcu_read_lock_sched_notrace(void)
{
        preempt_disable_notrace();
        __acquire(RCU_SCHED);
}

/**
 * rcu_read_unlock_sched() - marks the end of an RCU-sched critical section
 *
 * Performs the steps of rcu_read_lock_sched() in reverse, ending with
 * preempt_enable().
 *
 * See rcu_read_lock_sched() for more information.
 */
static inline void rcu_read_unlock_sched(void)
{
        /* Warn when rcu_is_watching() reports false (use while idle). */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_sched() used illegally while idle");
        rcu_lock_release(&rcu_sched_lock_map);
        /* Context annotation for the sparse static checker. */
        __release(RCU_SCHED);
        preempt_enable();
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 *
 * Counterpart of rcu_read_lock_sched_notrace(): drops the sparse context
 * annotation and then calls preempt_enable_notrace().
 */
static inline notrace void rcu_read_unlock_sched_notrace(void)
{
        __release(RCU_SCHED);
        preempt_enable_notrace();
}

/*
 * Enter an RCU read-side critical section, calling migrate_disable() first
 * when CONFIG_PREEMPT_RCU is enabled.  rcu_read_unlock_migrate() performs
 * the reverse sequence.
 */
static __always_inline void rcu_read_lock_dont_migrate(void)
{
        /* Only the CONFIG_PREEMPT_RCU configuration takes this extra step. */
        if (IS_ENABLED(CONFIG_PREEMPT_RCU))
                migrate_disable();
        rcu_read_lock();
}

/*
 * Leave an RCU read-side critical section and then, when CONFIG_PREEMPT_RCU
 * is enabled, call migrate_enable().  This undoes
 * rcu_read_lock_dont_migrate() in reverse order.
 */
static inline void rcu_read_unlock_migrate(void)
{
        rcu_read_unlock();
        /* Only the CONFIG_PREEMPT_RCU configuration disabled migration. */
        if (IS_ENABLED(CONFIG_PREEMPT_RCU))
                migrate_enable();
}

/**
 * RCU_INIT_POINTER() - initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * Initialize an RCU-protected pointer in special cases where readers
 * do not need ordering constraints on the CPU or the compiler.  These
 * special cases are:
 *
 * 1.        This use of RCU_INIT_POINTER() is NULLing out the pointer *or*
 * 2.        The caller has taken whatever steps are required to prevent
 *        RCU readers from concurrently accessing this pointer *or*
 * 3.        The referenced data structure has already been exposed to
 *        readers either at compile time or via rcu_assign_pointer() *and*
 *
 *        a.        You have not made *any* reader-visible changes to
 *                this structure since then *or*
 *        b.        It is OK for readers accessing this structure from its
 *                new location to see the old state of the structure.  (For
 *                example, the changes were to statistical counters or to
 *                other state where exact synchronization is not required.)
 *
 * Failure to follow these rules governing use of RCU_INIT_POINTER() will
 * result in impossible-to-diagnose memory corruption.  As in the structures
 * will look OK in crash dumps, but any concurrent RCU readers might
 * see pre-initialized values of the referenced data structure.  So
 * please be very careful how you use RCU_INIT_POINTER()!!!
 *
 * If you are creating an RCU-protected linked structure that is accessed
 * by a single external-to-structure RCU-protected pointer, then you may
 * use RCU_INIT_POINTER() to initialize the internal RCU-protected
 * pointers, but you must use rcu_assign_pointer() to initialize the
 * external-to-structure pointer *after* you have completely initialized
 * the reader-accessible portions of the linked structure.
 *
 * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no
 * ordering guarantees for either the CPU or the compiler.
 */
#define RCU_INIT_POINTER(p, v) \
        do { \
                rcu_check_sparse(p, __rcu); \
                WRITE_ONCE(p, RCU_INITIALIZER(v)); \
        } while (0)

/**
 * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * GCC-style initialization for an RCU-protected pointer in a structure field.
 * Expands to the designated initializer ``.p = RCU_INITIALIZER(v)``, so @p
 * must be the bare field name.
 */
#define RCU_POINTER_INITIALIZER(p, v) \
                .p = RCU_INITIALIZER(v)

/**
 * kfree_rcu() - kfree an object after a grace period.
 * @ptr: pointer to kfree for double-argument invocations.
 * @rhf: the name of the struct rcu_head within the type of @ptr.
 *
 * Many rcu callbacks functions just call kfree() on the base structure.
 * These functions are trivial, but their size adds up, and furthermore
 * when they are used in a kernel module, that module must invoke the
 * high-latency rcu_barrier() function at module-unload time.
 *
 * The kfree_rcu() function handles this issue. In order to have a universal
 * callback function handling different offsets of rcu_head, the callback needs
 * to determine the starting address of the freed object, which can be a large
 * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to
 * page boundary for those, only offsets up to 4095 bytes can be accommodated.
 * If the offset is larger than 4095 bytes, a compile-time error will
 * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
 * either fall back to use of call_rcu() or rearrange the structure to
 * position the rcu_head structure into the first 4096 bytes.
 *
 * The object to be freed can be allocated either by kmalloc() or
 * kmem_cache_alloc().
 *
 * Note that the allowable offset might decrease in the future.
 *
 * The BUILD_BUG_ON check must not involve any function calls, hence the
 * checks are done in macros here.
 */
#define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)
/* kvfree_rcu() has the identical two-argument expansion. */
#define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)

/**
 * kfree_rcu_mightsleep() - kfree an object after a grace period.
 * @ptr: pointer to kfree for single-argument invocations.
 *
 * When it comes to head-less variant, only one argument
 * is passed and that is just a pointer which has to be
 * freed after a grace period. Therefore the semantic is
 *
 *     kfree_rcu_mightsleep(ptr);
 *
 * where @ptr is the pointer to be freed by kvfree().
 *
 * Please note, head-less way of freeing is permitted to
 * use from a context that has to follow might_sleep()
 * annotation. Otherwise, please switch and embed the
 * rcu_head structure within the type of @ptr.
 */
#define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
/* kvfree_rcu_mightsleep() has the identical single-argument expansion. */
#define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)

/*
 * In mm/slab_common.c, no suitable header to include here.
 *
 * kvfree_rcu_arg_2() calls this with the address of the rcu_head embedded in
 * the object, while kvfree_rcu_arg_1() passes a NULL @head.  In both cases
 * @ptr is the object pointer itself.
 */
void kvfree_call_rcu(struct rcu_head *head, void *ptr);

/*
 * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
 * comment of kfree_rcu() for details.
 */
#define kvfree_rcu_arg_2(ptr, rhf)                                        \
do {                                                                        \
        typeof (ptr) ___p = (ptr);                                        \
                                                                        \
        if (___p) {                                                        \
                BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096);        \
                kvfree_call_rcu(&((___p)->rhf), (void *) (___p));        \
        }                                                                \
} while (0)

#define kvfree_rcu_arg_1(ptr)                                        \
do {                                                                \
        typeof(ptr) ___p = (ptr);                                \
                                                                \
        if (___p)                                                \
                kvfree_call_rcu(NULL, (void *) (___p));                \
} while (0)

/*
 * Place this after a lock-acquisition primitive to guarantee that
 * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
 * if the UNLOCK and LOCK are executed by the same CPU or if the
 * UNLOCK and LOCK operate on the same lock variable.
 *
 * Expands to smp_mb() when CONFIG_ARCH_WEAK_RELEASE_ACQUIRE is defined and
 * to an empty statement otherwise.
 */
#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
#define smp_mb__after_unlock_lock()        smp_mb()  /* Full ordering for lock. */
#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
#define smp_mb__after_unlock_lock()        do { } while (0)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */


/* Has the specified rcu_head structure been handed to call_rcu()? */

/**
 * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
 * @rhp: The rcu_head structure to initialize.
 *
 * If you intend to invoke rcu_head_after_call_rcu() to test whether a
 * given rcu_head structure has already been passed to call_rcu(), then
 * you must also invoke this rcu_head_init() function on it just after
 * allocating that structure.  Calls to this function must not race with
 * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
 */
static inline void rcu_head_init(struct rcu_head *rhp)
{
        /* ~0L is the marker value rcu_head_after_call_rcu() tests ->func for. */
        rhp->func = (rcu_callback_t)~0L;
}

/**
 * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()?
 * @rhp: The rcu_head structure to test.
 * @f: The function passed to call_rcu() along with @rhp.
 *
 * Reads @rhp->func once and compares it against @f.
 *
 * Returns @true if the two match, that is, if @rhp has been passed to
 * call_rcu() with @f, and @false otherwise.  On a mismatch a one-time
 * warning is emitted unless ->func still holds the ~0L marker written by
 * rcu_head_init(); this includes the case where @rhp has already been
 * invoked after a grace period.
 *
 * Calls to this function must not race with callback invocation.  One way
 * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
 * in an RCU read-side critical section that includes a read-side fetch
 * of the pointer to the structure containing @rhp.
 */
static inline bool
rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
{
        rcu_callback_t seen = READ_ONCE(rhp->func);

        if (seen != f) {
                /* Anything other than the init marker is unexpected. */
                WARN_ON_ONCE(seen != (rcu_callback_t)~0L);
                return false;
        }

        return true;
}

/* kernel/ksysfs.c definitions */
extern int rcu_expedited;
extern int rcu_normal;

/*
 * Define the "rcu" guard.  The do/while block calls rcu_read_lock() and the
 * trailing expression calls rcu_read_unlock().
 * NOTE(review): presumably DEFINE_LOCK_GUARD_0() uses these as the guard's
 * acquire and release actions respectively - confirm against its definition.
 */
DEFINE_LOCK_GUARD_0(rcu,
        do {
                rcu_read_lock();
                /*
                 * sparse doesn't call the cleanup function,
                 * so just release immediately and don't track
                 * the context. We don't need to anyway, since
                 * the whole point of the guard is to not need
                 * the explicit unlock.
                 */
                __release(RCU);
        } while (0),
        rcu_read_unlock())

#endif /* __LINUX_RCUPDATE_H */






















































































































































  320 









  316 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_FILE_REF_H
#define _LINUX_FILE_REF_H

#include <linux/atomic.h>
#include <linux/preempt.h>
#include <linux/types.h>

/*
 * file_ref is a reference count implementation specifically for use by
 * files. It takes inspiration from rcuref but differs in key aspects
 * such as support for SLAB_TYPESAFE_BY_RCU type caches.
 *
 * FILE_REF_ONEREF                FILE_REF_MAXREF
 * 0x0000000000000000UL      0x7FFFFFFFFFFFFFFFUL
 * <-------------------valid ------------------->
 *
 *                       FILE_REF_SATURATED
 * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL
 * <-----------------------saturation zone---------------------->
 *
 * FILE_REF_RELEASED                   FILE_REF_DEAD
 * 0xC000000000000000UL         0xE000000000000000UL
 * <-------------------dead zone------------------->
 *
 * FILE_REF_NOREF
 * 0xFFFFFFFFFFFFFFFFUL
 */

#ifdef CONFIG_64BIT
/* 64-bit values; these are the ones shown in the zone diagram above. */
#define FILE_REF_ONEREF                0x0000000000000000UL
#define FILE_REF_MAXREF                0x7FFFFFFFFFFFFFFFUL
#define FILE_REF_SATURATED        0xA000000000000000UL
#define FILE_REF_RELEASED        0xC000000000000000UL
#define FILE_REF_DEAD                0xE000000000000000UL
#define FILE_REF_NOREF                0xFFFFFFFFFFFFFFFFUL
#else
/* 32-bit counterparts with the same leading bit patterns. */
#define FILE_REF_ONEREF                0x00000000U
#define FILE_REF_MAXREF                0x7FFFFFFFU
#define FILE_REF_SATURATED        0xA0000000U
#define FILE_REF_RELEASED        0xC0000000U
#define FILE_REF_DEAD                0xE0000000U
#define FILE_REF_NOREF                0xFFFFFFFFU
#endif

/*
 * Counter storage for a file reference: atomic64_t when CONFIG_64BIT is
 * defined, atomic_t otherwise.  The helpers in this header access it through
 * the atomic_long_*() operations.
 */
typedef struct {
#ifdef CONFIG_64BIT
        atomic64_t refcnt;
#else
        atomic_t refcnt;
#endif
} file_ref_t;

/**
 * file_ref_init - Initialize a file reference count
 * @ref: Pointer to the reference count
 * @cnt: The initial reference count typically '1'
 *
 * The counter is stored biased by one: @cnt - 1 is written, so a single
 * reference is represented as FILE_REF_ONEREF (0).  Passing a @cnt of 0
 * therefore stores the all-ones value, i.e. FILE_REF_NOREF.
 */
static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
{
        atomic_long_set(&ref->refcnt, cnt - 1);
}

/*
 * Out-of-line helper: file_ref_put() calls it with the post-decrement value
 * whenever that value is negative, and returns its result to the caller.
 */
bool __file_ref_put(file_ref_t *ref, unsigned long cnt);

/**
 * file_ref_get - Acquire one reference on a file
 * @ref: Pointer to the reference count
 *
 * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF.
 *
 * Provides full memory ordering.
 *
 * Return: False if the attempt to acquire a reference failed. This happens
 *         when the last reference has been put already. True if a reference
 *         was successfully acquired
 */
static __always_inline __must_check bool file_ref_get(file_ref_t *ref)
{
        /*
         * Unconditionally increase the reference count with full
         * ordering. The saturation and dead zones provide enough
         * tolerance for this.
         *
         * If this indicates negative, the file in question may already
         * have been freed and immediately reused due to
         * SLAB_TYPESAFE_BY_RCU. Hence, unconditionally altering the file
         * reference count to e.g., reset the file reference count back to
         * the middle of the deadzone risks marking someone else's file as
         * dead behind their back.
         *
         * It would be possible to do a careful:
         *
         * cnt = atomic_long_inc_return();
         * if (likely(cnt >= 0))
         *        return true;
         *
         * and then something like:
         *
         * if (cnt >= FILE_REF_RELEASED)
         *        atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD),
         *
         * to set the value back to the middle of the deadzone. But it's
         * practically impossible to go from FILE_REF_DEAD to
         * FILE_REF_ONEREF. It would need 2305843009213693952/2^61
         * file_ref_get()s to resurrect such a dead file.
         */
        /* True exactly when the incremented value is not negative. */
        return !atomic_long_add_negative(1, &ref->refcnt);
}

/**
 * file_ref_inc - Acquire one reference on a file
 * @ref: Pointer to the reference count
 *
 * Take one more reference with a relaxed atomic fetch-and-increment. The
 * caller is expected to hold a reference already: a one-time warning is
 * emitted when the value seen before the increment was negative.
 */
static __always_inline void file_ref_inc(file_ref_t *ref)
{
        const long before = atomic_long_fetch_inc_relaxed(&ref->refcnt);

        /* A negative pre-increment value means no reference was held. */
        WARN_ONCE(before < 0, "file_ref_inc() on a released file reference");
}

/**
 * file_ref_put -- Release a file reference
 * @ref:        Pointer to the reference count
 *
 * Drops one reference. Provides release memory ordering, such that prior
 * loads and stores are done before, and provides an acquire ordering on
 * success such that free() must come after.
 *
 * Return: True if this was the last reference with no future references
 *         possible. This signals the caller that it can safely release
 *         the object which is protected by the reference counter.
 *         False if there are still active references or the put() raced
 *         with a concurrent get()/put() pair. Caller is not allowed to
 *         release the protected object.
 */
static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
{
        long cnt;

        /*
         * Files are SLAB_TYPESAFE_BY_RCU, so a recycled file alone does
         * not turn file_ref_put() into a UAF. Freeing of the whole slab
         * page once it becomes unused still could, so keep preemption
         * disabled for the remainder of this function.
         */
        guard(preempt)();

        /*
         * The decrement is unconditional; the saturation and dead zones
         * give enough tolerance for that. A negative result covers the
         * last-reference drop as well as values inside the saturation
         * and dead zones, all of which are left to __file_ref_put().
         */
        cnt = atomic_long_dec_return(&ref->refcnt);
        if (cnt < 0)
                return __file_ref_put(ref, cnt);

        return false;
}

/**
 * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF
 * @ref:        Pointer to the reference count
 *
 * Semantically equivalent to file_ref_put(). It gives up some performance
 * when other CPUs modify the refcount concurrently in exchange for better
 * performance when this turns out to be the last reference.
 *
 * file_ref_put() needs two atomics for the last reference: one to drop it
 * and one to move the counter to FILE_REF_DEAD. This routine does both in
 * a single step, at the price of pre-reading the variable, which hurts
 * scalability.
 *
 * Use with close() et al, stick to file_ref_put() by default.
 */
static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref)
{
        long seen = atomic_long_read(&ref->refcnt);

        /* Sole reference: try to retire the counter with one cmpxchg. */
        if (likely(seen == FILE_REF_ONEREF) &&
            likely(atomic_long_try_cmpxchg(&ref->refcnt, &seen, FILE_REF_DEAD)))
                return true;

        /* Any other value, or a lost race, takes the regular put path. */
        return file_ref_put(ref);
}

/**
 * file_ref_read - Read the number of file references
 * @ref: Pointer to the reference count
 *
 * The stored value is one less than the number of references, so the
 * result is the raw counter plus one, or zero for a counter in the DEAD
 * zone.
 *
 * Return: The number of held references (0 ... N)
 */
static inline unsigned long file_ref_read(file_ref_t *ref)
{
        unsigned long raw = atomic_long_read(&ref->refcnt);

        /* Report zero for a counter within the DEAD zone. */
        if (raw >= FILE_REF_RELEASED)
                return 0;

        return raw + 1;
}

/*
 * __file_ref_read_raw - Return the value stored in ref->refcnt
 * @ref: Pointer to the reference count
 *
 * Hands back the counter exactly as stored, without the adjustment
 * file_ref_read() applies.
 *
 * Return: The raw value found in the counter
 *
 * A hack for file_needs_f_pos_lock(), you probably want to use
 * file_ref_read() instead.
 */
static inline unsigned long __file_ref_read_raw(file_ref_t *ref)
{
        unsigned long raw = atomic_long_read(&ref->refcnt);

        return raw;
}

#endif


























































    1 









    1 


















    1 












    1 



    1 






























    1 
    1 

    1 










    1 















    1 


    1 


    1 
    1 




















































    1 





















    1 



    1 











    1 





    1 













    1 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
/*
 * Non-physical true random number generator based on timing jitter --
 * Linux Kernel Crypto API specific code
 *
 * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, and the entire permission notice in its entirety,
 *    including the disclaimer of warranties.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * ALTERNATIVELY, this product may be distributed under the terms of
 * the GNU General Public License, in which case the provisions of the GPL2 are
 * required INSTEAD OF the above restrictions.  (This clause is
 * necessary due to a potential bad interaction between the GPL and
 * the restrictions contained in a BSD-style copyright.)
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
 * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

#include <crypto/hash.h>
#include <crypto/sha3.h>
#include <linux/fips.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <crypto/internal/rng.h>

#include "jitterentropy.h"

#define JENT_CONDITIONING_HASH        "sha3-256-generic"

/***************************************************************************
 * Helper function
 ***************************************************************************/

/*
 * Allocate @len bytes through kvzalloc() with GFP_KERNEL.
 *
 * Return: the pointer kvzalloc() returned.
 */
void *jent_kvzalloc(unsigned int len)
{
        return kvzalloc(len, GFP_KERNEL);
}

/*
 * Release @ptr through kvfree_sensitive(), passing @len as the size of the
 * buffer.
 */
void jent_kvzfree(void *ptr, unsigned int len)
{
        kvfree_sensitive(ptr, len);
}

/*
 * Allocate @len bytes through kzalloc() with GFP_KERNEL.
 *
 * Return: the pointer kzalloc() returned.
 */
void *jent_zalloc(unsigned int len)
{
        return kzalloc(len, GFP_KERNEL);
}

/* Release @ptr through kfree_sensitive(). */
void jent_zfree(void *ptr)
{
        kfree_sensitive(ptr);
}

/*
 * Fetch a high-resolution time stamp.
 *
 * The time stamp is used to measure how long a given code path takes to
 * execute and how much that duration varies, so it needs a sufficiently
 * high resolution.
 *
 * Note, if this function returns zero because an architecture does not
 * implement a high-resolution time stamp, the runtime test of the RNG code
 * detects that and no output is produced.
 */
void jent_get_nstime(__u64 *out)
{
        __u64 stamp = random_get_entropy();

        /*
         * If random_get_entropy() does not return a value, i.e. it is not
         * implemented for the given architecture, fall back to a clock
         * source, hoping that there are timers we can work with.
         */
        if (!stamp)
                stamp = ktime_get_ns();

        *out = stamp;
        jent_raw_hires_entropy_store(stamp);
}

/*
 * Run the timed hash operation and mix its result and @time into the pool.
 *
 * @hash_state:    struct shash_desc that represents the entropy pool
 * @time:          time stamp to insert; replaced by 0 when @stuck is set
 * @addtl:         additional data passed to crypto_shash_finup() in every
 *                 loop iteration
 * @addtl_len:     length of @addtl in bytes
 * @hash_loop_cnt: number of init/update/finup rounds performed on the
 *                 on-stack scratch descriptor
 * @stuck:         non-zero if the time stamp is considered stuck
 *
 * Return: 0 on success, -EINVAL if the digest size of the hash differs from
 * SHA3_256_DIGEST_SIZE, otherwise the error of the failing crypto_shash_*()
 * call.
 */
int jent_hash_time(void *hash_state, __u64 time, u8 *addtl,
                   unsigned int addtl_len, __u64 hash_loop_cnt,
                   unsigned int stuck)
{
        struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state;
        SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm);
        u8 intermediary[SHA3_256_DIGEST_SIZE];
        __u64 j = 0;
        int ret;

        /* The scratch descriptor uses the same transform as the pool. */
        desc->tfm = hash_state_desc->tfm;

        if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) {
                pr_warn_ratelimited("Unexpected digest size\n");
                return -EINVAL;
        }
        /*
         * intermediary is hashed below before anything has been written to
         * it; mark it as initialized for KMSAN.
         */
        kmsan_unpoison_memory(intermediary, sizeof(intermediary));

        /*
         * This loop fills a buffer which is injected into the entropy pool.
         * The main reason for this loop is to execute something over which we
         * can perform a timing measurement. The injection of the resulting
         * data into the pool is performed to ensure the result is used and
         * the compiler cannot optimize the loop away in case the result is not
         * used at all. Yet that data is considered "additional information"
         * considering the terminology from SP800-90A without any entropy.
         *
         * Note, it does not matter which or how much data you inject, we are
         * interested in one Keccak-1600 compression operation performed with
         * the crypto_shash_final.
         */
        for (j = 0; j < hash_loop_cnt; j++) {
                ret = crypto_shash_init(desc) ?:
                      crypto_shash_update(desc, intermediary,
                                          sizeof(intermediary)) ?:
                      crypto_shash_finup(desc, addtl, addtl_len, intermediary);
                if (ret)
                        goto err;
        }

        /*
         * Inject the data from the previous loop into the pool. This data is
         * not considered to contain any entropy, but it stirs the pool a bit.
         */
        ret = crypto_shash_update(hash_state_desc, intermediary, sizeof(intermediary));
        if (ret)
                goto err;

        /*
         * Insert the time stamp into the hash context representing the pool.
         *
         * If the time stamp is stuck, do not finally insert the value into the
         * entropy pool. Although this operation should not do any harm even
         * when the time stamp has no entropy, SP800-90B requires that any
         * conditioning operation to have an identical amount of input data
         * according to section 3.1.5.
         */
        if (stuck) {
                time = 0;
        }

        ret = crypto_shash_update(hash_state_desc, (u8 *)&time, sizeof(__u64));

err:
        /* Wipe the scratch state on success and on failure alike. */
        shash_desc_zero(desc);
        memzero_explicit(intermediary, sizeof(intermediary));

        return ret;
}

int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len)
{
        struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state;
        u8 jent_block[SHA3_256_DIGEST_SIZE];
        /* Obtain data from entropy pool and re-initialize it */
        int ret = crypto_shash_final(hash_state_desc, jent_block) ?:
                  crypto_shash_init(hash_state_desc) ?:
                  crypto_shash_update(hash_state_desc, jent_block,
                                      sizeof(jent_block));

        if (!ret && dst_len)
                memcpy(dst, jent_block, dst_len);

        memzero_explicit(jent_block, sizeof(jent_block));
        return ret;
}

/***************************************************************************
 * Kernel crypto API interface
 ***************************************************************************/

/*
 * Per-transform context of the Jitter RNG (jent_alg sets cra_ctxsize to the
 * size of this structure).
 */
struct jitterentropy {
        /* held around jent_read_entropy() and around the teardown */
        spinlock_t jent_lock;
        /* collector obtained from jent_entropy_collector_alloc() */
        struct rand_data *entropy_collector;
        /* conditioning hash transform (JENT_CONDITIONING_HASH) */
        struct crypto_shash *tfm;
        /* hash descriptor bound to @tfm and handed to the collector */
        struct shash_desc *sdesc;
};

/*
 * Release whatever is attached to the transform context.
 *
 * Runs with jent_lock held. Every member is released only if it is set and
 * ends up NULL, so a partially set-up context is handled as well.
 */
static void jent_kcapi_cleanup(struct crypto_tfm *tfm)
{
        struct jitterentropy *ctx = crypto_tfm_ctx(tfm);

        spin_lock(&ctx->jent_lock);

        /* Wipe the hash state before the descriptor memory is returned. */
        if (ctx->sdesc != NULL) {
                shash_desc_zero(ctx->sdesc);
                kfree(ctx->sdesc);
                ctx->sdesc = NULL;
        }

        if (ctx->tfm != NULL) {
                crypto_free_shash(ctx->tfm);
                ctx->tfm = NULL;
        }

        if (ctx->entropy_collector != NULL) {
                jent_entropy_collector_free(ctx->entropy_collector);
                ctx->entropy_collector = NULL;
        }

        spin_unlock(&ctx->jent_lock);
}

/*
 * Set up the per-transform state of the Jitter RNG.
 *
 * Allocates the conditioning hash, a descriptor for it and the entropy
 * collector. If anything fails after the hash has been allocated, all
 * resources acquired so far are released through jent_kcapi_cleanup().
 *
 * Return: 0 on success; the PTR_ERR() of the hash allocation, the error
 * returned by crypto_shash_init(), or -ENOMEM if the descriptor or the
 * entropy collector cannot be allocated.
 */
static int jent_kcapi_init(struct crypto_tfm *tfm)
{
        struct jitterentropy *rng = crypto_tfm_ctx(tfm);
        struct crypto_shash *hash;
        struct shash_desc *sdesc;
        int size, ret = 0;

        /*
         * Initialize the lock exactly once and before anything can fail:
         * the error path runs jent_kcapi_cleanup(), which takes it.
         */
        spin_lock_init(&rng->jent_lock);

        /*
         * Use SHA3-256 as conditioner. We allocate only the generic
         * implementation as we are not interested in high-performance. The
         * execution time of the SHA3 operation is measured and adds to the
         * Jitter RNG's unpredictable behavior. If we have a slower hash
         * implementation, the execution timing variations are larger. When
         * using a fast implementation, we would need to call it more often
         * as its variations are lower.
         */
        hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0);
        if (IS_ERR(hash)) {
                pr_err("Cannot allocate conditioning digest\n");
                return PTR_ERR(hash);
        }
        rng->tfm = hash;

        size = sizeof(struct shash_desc) + crypto_shash_descsize(hash);
        sdesc = kmalloc(size, GFP_KERNEL);
        if (!sdesc) {
                ret = -ENOMEM;
                goto err;
        }

        sdesc->tfm = hash;
        /* Attach the descriptor first so that the error path frees it. */
        rng->sdesc = sdesc;
        ret = crypto_shash_init(sdesc);
        if (ret)
                goto err;

        rng->entropy_collector =
                jent_entropy_collector_alloc(CONFIG_CRYPTO_JITTERENTROPY_OSR, 0,
                                             sdesc);
        if (!rng->entropy_collector) {
                ret = -ENOMEM;
                goto err;
        }

        return 0;

err:
        jent_kcapi_cleanup(tfm);
        return ret;
}

/*
 * Generate callback of the rng_alg: fill @rdata with @dlen bytes from the
 * entropy collector. @src and @slen are not used.
 *
 * The collector is read with jent_lock held and its -3/-2/-1 results are
 * translated into -EFAULT/-EAGAIN/-EINVAL. Any other result is passed on
 * unchanged.
 */
static int jent_kcapi_random(struct crypto_rng *tfm,
                             const u8 *src, unsigned int slen,
                             u8 *rdata, unsigned int dlen)
{
        struct jitterentropy *rng = crypto_rng_ctx(tfm);
        int ret;

        spin_lock(&rng->jent_lock);

        ret = jent_read_entropy(rng->entropy_collector, rdata, dlen);
        switch (ret) {
        case -3:
                /*
                 * Permanent health test error.
                 *
                 * A kernel booted with fips=1 acts as a FIPS 140 module as
                 * a whole. An SP800-90B permanent health test error is then
                 * treated as a FIPS module error.
                 */
                if (fips_enabled)
                        panic("Jitter RNG permanent health test failure\n");

                pr_err("Jitter RNG permanent health test failure\n");
                ret = -EFAULT;
                break;
        case -2:
                /* Intermittent health test error */
                pr_warn_ratelimited("Reset Jitter RNG due to intermittent health test failure\n");
                ret = -EAGAIN;
                break;
        case -1:
                /* Other errors */
                ret = -EINVAL;
                break;
        default:
                break;
        }

        spin_unlock(&rng->jent_lock);

        return ret;
}

/*
 * Seed callback of the rng_alg. The supplied seed is ignored and 0 is
 * always returned.
 */
static int jent_kcapi_reset(struct crypto_rng *tfm,
                            const u8 *seed, unsigned int slen)
{
        return 0;
}

/*
 * Algorithm description registered by jent_mod_init() and unregistered by
 * jent_mod_exit().
 */
static struct rng_alg jent_alg = {
        .generate                = jent_kcapi_random,
        /* jent_kcapi_reset() ignores its seed, hence a seedsize of 0 */
        .seed                        = jent_kcapi_reset,
        .seedsize                = 0,
        .base                        = {
                .cra_name               = "jitterentropy_rng",
                .cra_driver_name        = "jitterentropy_rng",
                .cra_priority           = 100,
                /* each transform carries one struct jitterentropy */
                .cra_ctxsize            = sizeof(struct jitterentropy),
                .cra_module             = THIS_MODULE,
                .cra_init               = jent_kcapi_init,
                .cra_exit               = jent_kcapi_cleanup,
        }
};

/*
 * Module init: run the Jitter RNG start-up test and register the RNG.
 *
 * The testing interface is brought up first. A temporary conditioning hash
 * is allocated for jent_entropy_init() and released again right after it.
 * Every failure path, including a failed registration, tears the testing
 * interface down again.
 *
 * Return: 0 on success; the PTR_ERR() of the hash allocation, the error of
 * crypto_shash_init(), -EFAULT if jent_entropy_init() fails (panic when
 * fips_enabled is set), or the error of crypto_register_rng().
 */
static int __init jent_mod_init(void)
{
        struct crypto_shash *tfm;
        SHASH_DESC_ON_STACK(desc, tfm);
        int ret = 0;

        jent_testing_init();

        tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0);
        if (IS_ERR(tfm)) {
                ret = PTR_ERR(tfm);
                goto err_testing;
        }

        desc->tfm = tfm;
        ret = crypto_shash_init(desc);
        if (ret) {
                shash_desc_zero(desc);
                crypto_free_shash(tfm);
                goto err_testing;
        }

        ret = jent_entropy_init(CONFIG_CRYPTO_JITTERENTROPY_OSR, 0, desc, NULL);
        /* The hash was only needed for the start-up test. */
        shash_desc_zero(desc);
        crypto_free_shash(tfm);
        if (ret) {
                /* Handle permanent health test error */
                if (fips_enabled)
                        panic("jitterentropy: Initialization failed with host not compliant with requirements: %d\n", ret);

                pr_info("jitterentropy: Initialization failed with host not compliant with requirements: %d\n", ret);
                ret = -EFAULT;
                goto err_testing;
        }

        ret = crypto_register_rng(&jent_alg);
        if (ret)
                goto err_testing;

        return 0;

err_testing:
        /* Undo jent_testing_init(); nothing else is left to release here. */
        jent_testing_exit();
        return ret;
}

/*
 * Module exit: shut down the testing interface and unregister jent_alg.
 */
static void __exit jent_mod_exit(void)
{
        jent_testing_exit();
        crypto_unregister_rng(&jent_alg);
}

module_init(jent_mod_init);
module_exit(jent_mod_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
MODULE_DESCRIPTION("Non-physical True Random Number Generator based on CPU Jitter");
MODULE_ALIAS_CRYPTO("jitterentropy_rng");

































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A security identifier table (sidtab) is a lookup table
 * of security context structures indexed by SID value.
 *
 * Original author: Stephen Smalley, <stephen.smalley.work@gmail.com>
 * Author: Ondrej Mosnacek, <omosnacek@gmail.com>
 *
 * Copyright (C) 2018 Red Hat, Inc.
 */

#ifndef _SS_SIDTAB_H_
#define _SS_SIDTAB_H_

#include <linux/spinlock_types.h>
#include <linux/log2.h>
#include <linux/hashtable.h>

#include "context.h"

/* One table entry: a SID together with its security context. */
struct sidtab_entry {
        u32 sid;
        /* NOTE(review): presumably the hash of @context - confirm in sidtab.c */
        u32 hash;
        struct context context;
#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        /* only present when the SID -> string cache is configured */
        struct sidtab_str_cache __rcu *cache;
#endif
        /* NOTE(review): presumably links into sidtab::context_to_sid - confirm */
        struct hlist_node list;
};

/*
 * One slot of the lookup tree: a pointer to either an inner node or a leaf
 * node. The union itself does not record which of the two is valid.
 */
union sidtab_entry_inner {
        struct sidtab_node_inner *ptr_inner;
        struct sidtab_node_leaf *ptr_leaf;
};

/* align node size to page boundary */
#define SIDTAB_NODE_ALLOC_SHIFT PAGE_SHIFT
#define SIDTAB_NODE_ALLOC_SIZE        PAGE_SIZE

/*
 * Yields 1 for size == 1 and const_ilog2(size - 1) + 1 otherwise. Assuming
 * const_ilog2() is a floor log2, this is the smallest shift for which
 * (1 << shift) >= size when size >= 2.
 */
#define size_to_shift(size) ((size) == 1 ? 1 : (const_ilog2((size)-1) + 1))

/*
 * Shift of one inner node: the node allocation shift minus the shift of a
 * single union sidtab_entry_inner slot.
 */
#define SIDTAB_INNER_SHIFT         \
        (SIDTAB_NODE_ALLOC_SHIFT - \
         size_to_shift(sizeof(union sidtab_entry_inner)))
/* number of slots in one inner node */
#define SIDTAB_INNER_ENTRIES ((size_t)1 << SIDTAB_INNER_SHIFT)
/* number of struct sidtab_entry that fit into one node allocation */
#define SIDTAB_LEAF_ENTRIES \
        (SIDTAB_NODE_ALLOC_SIZE / sizeof(struct sidtab_entry))

#define SIDTAB_MAX_BITS 32
#define SIDTAB_MAX        U32_MAX
/* ensure enough tree levels for SIDTAB_MAX entries */
#define SIDTAB_MAX_LEVEL                                                   \
        DIV_ROUND_UP(SIDTAB_MAX_BITS - size_to_shift(SIDTAB_LEAF_ENTRIES), \
                     SIDTAB_INNER_SHIFT)

/* Leaf node of the lookup tree: an array of SIDTAB_LEAF_ENTRIES entries. */
struct sidtab_node_leaf {
        struct sidtab_entry entries[SIDTAB_LEAF_ENTRIES];
};

/* Inner node of the lookup tree: SIDTAB_INNER_ENTRIES child slots. */
struct sidtab_node_inner {
        union sidtab_entry_inner entries[SIDTAB_INNER_ENTRIES];
};

/* Slot of the sidtab::isids array: an entry plus a 'set' marker. */
struct sidtab_isid_entry {
        /* NOTE(review): presumably non-zero once @entry is populated - confirm */
        int set;
        struct sidtab_entry entry;
};

/*
 * Parameters of a table conversion, passed to sidtab_convert() and kept in
 * sidtab::convert.
 */
struct sidtab_convert_params {
        struct convert_context_args *args;
        /* NOTE(review): presumably the table receiving converted entries */
        struct sidtab *target;
};

/* Size of the context_to_sid hash table, taken from Kconfig. */
#define SIDTAB_HASH_BITS    CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS
#define SIDTAB_HASH_BUCKETS (1 << SIDTAB_HASH_BITS)

/*
 * The SID table: a multi-level lookup tree (roots), a separate array for
 * the initial SIDs (isids) and a hash table for context-to-SID lookups.
 */
struct sidtab {
        /*
         * lock-free read access only for as many items as a prior read of
         * 'count'
         */
        union sidtab_entry_inner roots[SIDTAB_MAX_LEVEL + 1];
        /*
         * access atomically via {READ|WRITE}_ONCE(); only increment under
         * spinlock
         */
        u32 count;
        /* access only under spinlock */
        struct sidtab_convert_params *convert;
        bool frozen;
        spinlock_t lock;

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        /* SID -> context string cache */
        u32 cache_free_slots;
        struct list_head cache_lru_list;
        spinlock_t cache_lock;
#endif

        /* index == SID - 1 (no entry for SECSID_NULL) */
        struct sidtab_isid_entry isids[SECINITSID_NUM];

        /* Hash table for fast reverse context-to-sid lookups. */
        DECLARE_HASHTABLE(context_to_sid, SIDTAB_HASH_BITS);
};

int sidtab_init(struct sidtab *s);
int sidtab_set_initial(struct sidtab *s, u32 sid, struct context *context);
struct sidtab_entry *sidtab_search_entry(struct sidtab *s, u32 sid);
struct sidtab_entry *sidtab_search_entry_force(struct sidtab *s, u32 sid);

/*
 * Look up @sid through sidtab_search_entry() and hand back the context
 * embedded in the entry, or NULL if no entry was returned.
 */
static inline struct context *sidtab_search(struct sidtab *s, u32 sid)
{
        struct sidtab_entry *found;

        found = sidtab_search_entry(s, sid);
        if (!found)
                return NULL;

        return &found->context;
}

/*
 * Look up @sid through sidtab_search_entry_force() and hand back the
 * context embedded in the entry, or NULL if no entry was returned.
 */
static inline struct context *sidtab_search_force(struct sidtab *s, u32 sid)
{
        struct sidtab_entry *found;

        found = sidtab_search_entry_force(s, sid);
        if (!found)
                return NULL;

        return &found->context;
}

int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params);

void sidtab_cancel_convert(struct sidtab *s);

void sidtab_freeze_begin(struct sidtab *s, unsigned long *flags)
        __acquires(&s->lock);
void sidtab_freeze_end(struct sidtab *s, unsigned long *flags)
        __releases(&s->lock);

int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid);

void sidtab_destroy(struct sidtab *s);

int sidtab_hash_stats(struct sidtab *sidtab, char *page);

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
void sidtab_sid2str_put(struct sidtab *s, struct sidtab_entry *entry,
                        const char *str, u32 str_len);
int sidtab_sid2str_get(struct sidtab *s, struct sidtab_entry *entry, char **out,
                       u32 *out_len);
#else
/*
 * Stub used when CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE is not greater
 * than 0: does nothing.
 */
static inline void sidtab_sid2str_put(struct sidtab *s,
                                      struct sidtab_entry *entry,
                                      const char *str, u32 str_len)
{
}
/*
 * Stub used when CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE is not greater
 * than 0: always returns -ENOENT and leaves @out and @out_len untouched.
 */
static inline int sidtab_sid2str_get(struct sidtab *s,
                                     struct sidtab_entry *entry, char **out,
                                     u32 *out_len)
{
        return -ENOENT;
}
#endif /* CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 */

#endif /* _SS_SIDTAB_H_ */



















































































































































































































































































































































































   39 





    5 

























   39 














    4 


   39 














   39 
   39 







   39 




































   39 
   36 
   39 








































   39 






   39 

    5 









    5 




















    5 














    5 
    5 
























    5 














    5 



















   39 
   39 




























   38 




    5 

















































































   39 






















































    4 






















































    4 







   38 











   39 
































    4 












































   39 










    4 












   39 








   39 

































   39 







   39 







   39 










   39 
   39 

























   39 













   39 


   39 
   39 

































































































   39 
   38 

   39 










   39 


















   39 
















   39 


   39 















   39 




   39 





   39 
   39 









































































































































































































































   39 









   39 


   39 
   39 




   39 









































   38 






   39 

   39 



































   39 








   39 

   39 


    4 
































   36 
   36 

    4 







   39 


   39 

   39 





   39 








   39 
   36 




   38 



































   39 




















































   39 












































   39 


   39 










   39 



   39 



   39 


   39 


   38 
















   39 












   39 

   39 
   36 







   39 

    4 


   39 












   39 
















   39 




   39 


   39 










   39 







   39 







   39 

















































































































































   39 



   39 


   39 






   39 

   38 

























   39 




   39 

























   39 
















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
// SPDX-License-Identifier: GPL-2.0

#include <kunit/visibility.h>
#include <linux/kernel.h>
#include <linux/irqflags.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/bug.h>
#include "printk_ringbuffer.h"
#include "internal.h"

/**
 * DOC: printk_ringbuffer overview
 *
 * Data Structure
 * --------------
 * The printk_ringbuffer is made up of 2 internal ringbuffers:
 *
 *   desc_ring
 *     A ring of descriptors and their meta data (such as sequence number,
 *     timestamp, loglevel, etc.) as well as internal state information about
 *     the record and logical positions specifying where in the other
 *     ringbuffer the text strings are located.
 *
 *   text_data_ring
 *     A ring of data blocks. A data block consists of an unsigned long
 *     integer (ID) that maps to a desc_ring index followed by the text
 *     string of the record.
 *
 * The internal state information of a descriptor is the key element to allow
 * readers and writers to locklessly synchronize access to the data.
 *
 * Implementation
 * --------------
 *
 * Descriptor Ring
 * ~~~~~~~~~~~~~~~
 * The descriptor ring is an array of descriptors. A descriptor contains
 * essential meta data to track the data of a printk record using
 * blk_lpos structs pointing to associated text data blocks (see
 * "Data Ring" below). Each descriptor is assigned an ID that maps
 * directly to index values of the descriptor array and has a state. The ID
 * and the state are bitwise combined into a single descriptor field named
 * @state_var, allowing ID and state to be synchronously and atomically
 * updated.
 *
 * Descriptors have four states:
 *
 *   reserved
 *     A writer is modifying the record.
 *
 *   committed
 *     The record and all its data are written. A writer can reopen the
 *     descriptor (transitioning it back to reserved), but in the committed
 *     state the data is consistent.
 *
 *   finalized
 *     The record and all its data are complete and available for reading. A
 *     writer cannot reopen the descriptor.
 *
 *   reusable
 *     The record exists, but its text and/or meta data may no longer be
 *     available.
 *
 * Querying the @state_var of a record requires providing the ID of the
 * descriptor to query. This can yield a possible fifth (pseudo) state:
 *
 *   miss
 *     The descriptor being queried has an unexpected ID.
 *
 * The descriptor ring has a @tail_id that contains the ID of the oldest
 * descriptor and @head_id that contains the ID of the newest descriptor.
 *
 * When a new descriptor should be created (and the ring is full), the tail
 * descriptor is invalidated by first transitioning to the reusable state and
 * then invalidating all tail data blocks up to and including the data blocks
 * associated with the tail descriptor (for the text ring). Then
 * @tail_id is advanced, followed by advancing @head_id. And finally the
 * @state_var of the new descriptor is initialized to the new ID and reserved
 * state.
 *
 * The @tail_id can only be advanced if the new @tail_id would be in the
 * finalized or reusable queried state. This guarantees that a valid
 * sequence number for the tail is always available.
 *
 * Descriptor Finalization
 * ~~~~~~~~~~~~~~~~~~~~~~~
 * When a writer calls the commit function prb_commit(), record data is
 * fully stored and is consistent within the ringbuffer. However, a writer can
 * reopen that record, claiming exclusive access (as with prb_reserve()), and
 * modify that record. When finished, the writer must again commit the record.
 *
 * In order for a record to be made available to readers (and also become
 * recyclable for writers), it must be finalized. A finalized record cannot be
 * reopened and can never become "unfinalized". Record finalization can occur
 * in three different scenarios:
 *
 *   1) A writer can simultaneously commit and finalize its record by calling
 *      prb_final_commit() instead of prb_commit().
 *
 *   2) When a new record is reserved and the previous record has been
 *      committed via prb_commit(), that previous record is automatically
 *      finalized.
 *
 *   3) When a record is committed via prb_commit() and a newer record
 *      already exists, the record being committed is automatically finalized.
 *
 * Data Ring
 * ~~~~~~~~~
 * The text data ring is a byte array composed of data blocks. Data blocks are
 * referenced by blk_lpos structs that point to the logical position of the
 * beginning of a data block and the beginning of the next adjacent data
 * block. Logical positions are mapped directly to index values of the byte
 * array ringbuffer.
 *
 * Each data block consists of an ID followed by the writer data. The ID is
 * the identifier of a descriptor that is associated with the data block. A
 * given data block is considered valid if all of the following conditions
 * are met:
 *
 *   1) The descriptor associated with the data block is in the committed
 *      or finalized queried state.
 *
 *   2) The blk_lpos struct within the descriptor associated with the data
 *      block references back to the same data block.
 *
 *   3) The data block is within the head/tail logical position range.
 *
 * If the writer data of a data block would extend beyond the end of the
 * byte array, only the ID of the data block is stored at the logical
 * position and the full data block (ID and writer data) is stored at the
 * beginning of the byte array. The referencing blk_lpos will point to the
 * ID before the wrap and the next data block will be at the logical
 * position adjacent to the full data block after the wrap.
 *
 * Data rings have a @tail_lpos that points to the beginning of the oldest
 * data block and a @head_lpos that points to the logical position of the
 * next (not yet existing) data block.
 *
 * When a new data block should be created (and the ring is full), tail data
 * blocks will first be invalidated by putting their associated descriptors
 * into the reusable state and then pushing the @tail_lpos forward beyond
 * them. Then the @head_lpos is pushed forward and is associated with a new
 * descriptor. If a data block is not valid, the @tail_lpos cannot be
 * advanced beyond it.
 *
 * Info Array
 * ~~~~~~~~~~
 * The general meta data of printk records are stored in printk_info structs,
 * stored in an array with the same number of elements as the descriptor ring.
 * Each info corresponds to the descriptor of the same index in the
 * descriptor ring. Info validity is confirmed by evaluating the corresponding
 * descriptor before and after loading the info.
 *
 * Usage
 * -----
 * Here are some simple examples demonstrating writers and readers. For the
 * examples a global ringbuffer (test_rb) is available (which is not the
 * actual ringbuffer used by printk)::
 *
 *        DEFINE_PRINTKRB(test_rb, 15, 5);
 *
 * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
 * 1 MiB (2 ^ (15 + 5)) for text data.
 *
 * Sample writer code::
 *
 *        const char *textstr = "message text";
 *        struct prb_reserved_entry e;
 *        struct printk_record r;
 *
 *        // specify how much to allocate
 *        prb_rec_init_wr(&r, strlen(textstr) + 1);
 *
 *        if (prb_reserve(&e, &test_rb, &r)) {
 *                snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
 *
 *                r.info->text_len = strlen(textstr);
 *                r.info->ts_nsec = local_clock();
 *                r.info->caller_id = printk_caller_id();
 *
 *                // commit and finalize the record
 *                prb_final_commit(&e);
 *        }
 *
 * Note that additional writer functions are available to extend a record
 * after it has been committed but not yet finalized. This can be done as
 * long as no new records have been reserved and the caller is the same.
 *
 * Sample writer code (record extending)::
 *
 *                // alternate rest of previous example
 *
 *                r.info->text_len = strlen(textstr);
 *                r.info->ts_nsec = local_clock();
 *                r.info->caller_id = printk_caller_id();
 *
 *                // commit the record (but do not finalize yet)
 *                prb_commit(&e);
 *        }
 *
 *        ...
 *
 *        // specify additional 5 bytes text space to extend
 *        prb_rec_init_wr(&r, 5);
 *
 *        // try to extend, but only if it does not exceed 32 bytes
 *        if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
 *                snprintf(&r.text_buf[r.info->text_len],
 *                         r.text_buf_size - r.info->text_len, "hello");
 *
 *                r.info->text_len += 5;
 *
 *                // commit and finalize the record
 *                prb_final_commit(&e);
 *        }
 *
 * Sample reader code::
 *
 *        struct printk_info info;
 *        struct printk_record r;
 *        char text_buf[32];
 *        u64 seq;
 *
 *        prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
 *
 *        prb_for_each_record(0, &test_rb, &seq, &r) {
 *                if (info.seq != seq)
 *                        pr_warn("lost %llu records\n", info.seq - seq);
 *
 *                if (info.text_len > r.text_buf_size) {
 *                        pr_warn("record %llu text truncated\n", info.seq);
 *                        text_buf[r.text_buf_size - 1] = 0;
 *                }
 *
 *                pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
 *                        &text_buf[0]);
 *        }
 *
 * Note that additional less convenient reader functions are available to
 * allow complex record access.
 *
 * ABA Issues
 * ~~~~~~~~~~
 * To help avoid ABA issues, descriptors are referenced by IDs (array index
 * values combined with tagged bits counting array wraps) and data blocks are
 * referenced by logical positions (array index values combined with tagged
 * bits counting array wraps). However, on 32-bit systems the number of
 * tagged bits is relatively small such that an ABA incident is (at least
 * theoretically) possible. For example, if 4 million maximally sized (1KiB)
 * printk messages were to occur in NMI context on a 32-bit system, the
 * interrupted context would not be able to recognize that the 32-bit integer
 * completely wrapped and thus represents a different data block than the one
 * the interrupted context expects.
 *
 * To help combat this possibility, additional state checking is performed
 * (such as using cmpxchg() even though set() would suffice). These extra
 * checks are commented as such and will hopefully catch any ABA issue that
 * a 32-bit system might experience.
 *
 * Memory Barriers
 * ~~~~~~~~~~~~~~~
 * Multiple memory barriers are used. To simplify proving correctness and
 * generating litmus tests, lines of code related to memory barriers
 * (loads, stores, and the associated memory barriers) are labeled::
 *
 *        LMM(function:letter)
 *
 * Comments reference the labels using only the "function:letter" part.
 *
 * The memory barrier pairs and their ordering are:
 *
 *   desc_reserve:D / desc_reserve:B
 *     push descriptor tail (id), then push descriptor head (id)
 *
 *   desc_reserve:D / data_push_tail:B
 *     push data tail (lpos), then set new descriptor reserved (state)
 *
 *   desc_reserve:D / desc_push_tail:C
 *     push descriptor tail (id), then set new descriptor reserved (state)
 *
 *   desc_reserve:D / prb_first_seq:C
 *     push descriptor tail (id), then set new descriptor reserved (state)
 *
 *   desc_reserve:F / desc_read:D
 *     set new descriptor id and reserved (state), then allow writer changes
 *
 *   data_alloc:A (or data_realloc:A) / desc_read:D
 *     set old descriptor reusable (state), then modify new data block area
 *
 *   data_alloc:A (or data_realloc:A) / data_push_tail:B
 *     push data tail (lpos), then modify new data block area
 *
 *   _prb_commit:B / desc_read:B
 *     store writer changes, then set new descriptor committed (state)
 *
 *   desc_reopen_last:A / _prb_commit:B
 *     set descriptor reserved (state), then read descriptor data
 *
 *   _prb_commit:B / desc_reserve:D
 *     set new descriptor committed (state), then check descriptor head (id)
 *
 *   data_push_tail:D / data_push_tail:A
 *     set descriptor reusable (state), then push data tail (lpos)
 *
 *   desc_push_tail:B / desc_reserve:D
 *     set descriptor reusable (state), then push descriptor tail (id)
 *
 *   desc_update_last_finalized:A / desc_last_finalized_seq:A
 *     store finalized record, then set new highest finalized sequence number
 */

/*
 * Size of the data ring byte array, derived from its @size_bits, and the
 * mask (size - 1) used to reduce a logical position to an array index.
 * NOTE(review): the mask only works if _DATA_SIZE() yields a power of 2
 * (presumably 1 << size_bits) -- confirm in printk_ringbuffer.h.
 */
#define DATA_SIZE(data_ring)                _DATA_SIZE((data_ring)->size_bits)
#define DATA_SIZE_MASK(data_ring)        (DATA_SIZE(data_ring) - 1)

/*
 * Number of descriptors in the descriptor ring, derived from its
 * @count_bits, and the mask (count - 1) used to reduce an ID or sequence
 * number to an array index.
 * NOTE(review): same power-of-2 assumption for _DESCS_COUNT() -- confirm.
 */
#define DESCS_COUNT(desc_ring)                _DESCS_COUNT((desc_ring)->count_bits)
#define DESCS_COUNT_MASK(desc_ring)        (DESCS_COUNT(desc_ring) - 1)

/* Determine the data array index from a logical position. */
#define DATA_INDEX(data_ring, lpos)        ((lpos) & DATA_SIZE_MASK(data_ring))

/* Determine the desc array index from an ID or sequence number. */
#define DESC_INDEX(desc_ring, n)        ((n) & DESCS_COUNT_MASK(desc_ring))

/*
 * Determine how many times the data array has wrapped: the bits of a
 * logical position above @size_bits.
 */
#define DATA_WRAPS(data_ring, lpos)        ((lpos) >> (data_ring)->size_bits)

/*
 * Determine if a logical position refers to a data-less block. A data-less
 * logical position has bit 0 set. A blk_lpos is considered data-less only
 * if both its @begin and @next positions are data-less.
 */
#define LPOS_DATALESS(lpos)                ((lpos) & 1UL)
#define BLK_DATALESS(blk)                (LPOS_DATALESS((blk)->begin) && \
                                         LPOS_DATALESS((blk)->next))

/*
 * Get the logical position at index 0 of the current wrap, i.e. @lpos with
 * its array index bits cleared.
 */
#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
((lpos) & ~DATA_SIZE_MASK(data_ring))

/*
 * Get the ID for the same index of the previous wrap as the given ID, i.e.
 * the ID that is exactly one descriptor ring length older.
 */
#define DESC_ID_PREV_WRAP(desc_ring, id) \
DESC_ID((id) - DESCS_COUNT(desc_ring))

/*
 * A data block: mapped directly to the beginning of the data block area
 * specified as a logical position within the data ring (see to_block()).
 *
 * @id:   the ID of the associated descriptor
 * @data: the writer data (flexible array, it has no size of its own)
 *
 * Note that the size of a data block is only known by its associated
 * descriptor.
 */
struct prb_data_block {
        unsigned long        id;
        char                data[];
};

/*
 * Look up the descriptor slot that @n maps to. @n may be a descriptor ID
 * or a sequence number; only its index bits select the slot.
 */
static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
{
        u64 idx = DESC_INDEX(desc_ring, n);

        return desc_ring->descs + idx;
}

/*
 * Look up the printk_info slot that @n maps to. @n may be a descriptor ID
 * or a sequence number; only its index bits select the slot.
 */
static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
{
        u64 idx = DESC_INDEX(desc_ring, n);

        return desc_ring->infos + idx;
}

/*
 * Map the logical position @begin_lpos to the data block that starts at
 * the corresponding index of the data ring byte array.
 */
static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
                                       unsigned long begin_lpos)
{
        unsigned long idx = DATA_INDEX(data_ring, begin_lpos);

        return (void *)(data_ring->data + idx);
}

/*
 * Increase the data size to account for data block meta data plus any
 * padding so that the adjacent data block is aligned on the ID size.
 */
static unsigned int to_blk_size(unsigned int size)
{
        struct prb_data_block *db = NULL;

        size += sizeof(*db);
        size = ALIGN(size, sizeof(db->id));
        return size;
}

/*
 * Sanity checker for reserve size. The ringbuffer code assumes that a data
 * block does not exceed the maximum possible size that could fit within the
 * ringbuffer. This function provides that basic size check so that the
 * assumption is safe. In particular, it guarantees that data_push_tail() will
 * never attempt to push the tail beyond the head.
 *
 * @data_ring: the data ring the block would be stored in
 * @size:      the requested writer data size in bytes (0 means data-less)
 *
 * Returns true if a block for @size bytes of writer data is acceptable,
 * otherwise false.
 */
static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
{
        unsigned int blk_size;

        /* Data-less blocks take no space. */
        if (size == 0)
                return true;

        /*
         * to_blk_size() adds the block meta data and alignment padding
         * using unsigned int arithmetic, which silently wraps for sizes
         * close to UINT_MAX. Without a wrap the block size is always
         * larger than @size, so anything else means the computation
         * wrapped and the (tiny) result must not be trusted.
         */
        blk_size = to_blk_size(size);
        if (blk_size <= size)
                return false;

        /*
         * If data blocks were allowed to be larger than half the data ring
         * size, a wrapping data block could require more space than the full
         * ringbuffer.
         */
        return blk_size <= DATA_SIZE(data_ring) / 2;
}

/* Query the state of a descriptor. */
static enum desc_state get_desc_state(unsigned long id,
                                      unsigned long state_val)
{
        if (id != DESC_ID(state_val))
                return desc_miss;

        return DESC_STATE(state_val);
}

/*
 * Get a copy of a specified descriptor and return its queried state. If the
 * descriptor is in an inconsistent state (miss or reserved), the caller can
 * only expect the descriptor's @state_var field to be valid.
 *
 * The sequence number and caller_id can be optionally retrieved. Like all
 * non-state_var data, they are only valid if the descriptor is in a
 * consistent state.
 *
 * @desc_ring:     the descriptor ring containing the descriptor
 * @id:            the ID the descriptor is expected to have
 * @desc_out:      optional (may be NULL); receives @text_blk_lpos and the
 *                 most recently loaded @state_var value
 * @seq_out:       optional (may be NULL); receives the sequence number
 * @caller_id_out: optional (may be NULL); receives the caller ID
 *
 * The state is loaded twice: once before and once after copying the data.
 * The returned state comes from the second load, unless the first load
 * already yielded miss or reserved. In that case nothing besides
 * @state_var is copied and the state of the first load is returned.
 */
static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
                                 unsigned long id, struct prb_desc *desc_out,
                                 u64 *seq_out, u32 *caller_id_out)
{
        struct printk_info *info = to_info(desc_ring, id);
        struct prb_desc *desc = to_desc(desc_ring, id);
        atomic_long_t *state_var = &desc->state_var;
        enum desc_state d_state;
        unsigned long state_val;

        /* Check the descriptor state. */
        state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
        d_state = get_desc_state(id, state_val);
        if (d_state == desc_miss || d_state == desc_reserved) {
                /*
                 * The descriptor is in an inconsistent state. Set at least
                 * @state_var so that the caller can see the details of
                 * the inconsistent state.
                 */
                goto out;
        }

        /*
         * Guarantee the state is loaded before copying the descriptor
         * content. This avoids copying obsolete descriptor content that might
         * not apply to the descriptor state. This pairs with _prb_commit:B.
         *
         * Memory barrier involvement:
         *
         * If desc_read:A reads from _prb_commit:B, then desc_read:C reads
         * from _prb_commit:A.
         *
         * Relies on:
         *
         * WMB from _prb_commit:A to _prb_commit:B
         *    matching
         * RMB from desc_read:A to desc_read:C
         */
        smp_rmb(); /* LMM(desc_read:B) */

        /*
         * Copy the descriptor data. The data is not valid until the
         * state has been re-checked. A memcpy() for all of @desc
         * cannot be used because of the atomic_long_t @state_var field.
         */
        if (desc_out) {
                memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
                       sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
        }
        if (seq_out)
                *seq_out = info->seq; /* also part of desc_read:C */
        if (caller_id_out)
                *caller_id_out = info->caller_id; /* also part of desc_read:C */

        /*
         * 1. Guarantee the descriptor content is loaded before re-checking
         *    the state. This avoids reading an obsolete descriptor state
         *    that may not apply to the copied content. This pairs with
         *    desc_reserve:F.
         *
         *    Memory barrier involvement:
         *
         *    If desc_read:C reads from desc_reserve:G, then desc_read:E
         *    reads from desc_reserve:F.
         *
         *    Relies on:
         *
         *    WMB from desc_reserve:F to desc_reserve:G
         *       matching
         *    RMB from desc_read:C to desc_read:E
         *
         * 2. Guarantee the record data is loaded before re-checking the
         *    state. This avoids reading an obsolete descriptor state that may
         *    not apply to the copied data. This pairs with data_alloc:A and
         *    data_realloc:A.
         *
         *    Memory barrier involvement:
         *
         *    If copy_data:A reads from data_alloc:B, then desc_read:E
         *    reads from desc_make_reusable:A.
         *
         *    Relies on:
         *
         *    MB from desc_make_reusable:A to data_alloc:B
         *       matching
         *    RMB from desc_read:C to desc_read:E
         *
         *    Note: desc_make_reusable:A and data_alloc:B can be different
         *          CPUs. However, the data_alloc:B CPU (which performs the
         *          full memory barrier) must have previously seen
         *          desc_make_reusable:A.
         */
        smp_rmb(); /* LMM(desc_read:D) */

        /*
         * The data has been copied. Return the current descriptor state,
         * which may have changed since the load above.
         */
        state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
        d_state = get_desc_state(id, state_val);
out:
        /* Always hand back the last loaded @state_var, even on miss/reserved. */
        if (desc_out)
                atomic_long_set(&desc_out->state_var, state_val);
        return d_state;
}

/*
 * Take a specified descriptor out of the finalized state by attempting
 * the transition from finalized to reusable. Either this context or some
 * other context will have been successful.
 */
static void desc_make_reusable(struct prb_desc_ring *desc_ring,
                               unsigned long id)
{
        unsigned long val_finalized = DESC_SV(id, desc_finalized);
        unsigned long val_reusable = DESC_SV(id, desc_reusable);
        struct prb_desc *desc = to_desc(desc_ring, id);
        atomic_long_t *state_var = &desc->state_var;

        atomic_long_cmpxchg_relaxed(state_var, val_finalized,
                                    val_reusable); /* LMM(desc_make_reusable:A) */
}

/*
 * Given the text data ring, put the associated descriptor of each
 * data block from @lpos_begin until @lpos_end into the reusable state.
 *
 * If there is any problem making the associated descriptor reusable, either
 * the descriptor has not yet been finalized or another writer context has
 * already pushed the tail lpos past the problematic data block. Regardless,
 * on error the caller can re-load the tail lpos to determine the situation.
 */
static bool data_make_reusable(struct printk_ringbuffer *rb,
                               unsigned long lpos_begin,
                               unsigned long lpos_end,
                               unsigned long *lpos_out)
{

        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct prb_data_block *blk;
        enum desc_state d_state;
        struct prb_desc desc;
        struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
        unsigned long id;

        /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
        while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) {
                blk = to_block(data_ring, lpos_begin);

                /*
                 * Load the block ID from the data block. This is a data race
                 * against a writer that may have newly reserved this data
                 * area. If the loaded value matches a valid descriptor ID,
                 * the blk_lpos of that descriptor will be checked to make
                 * sure it points back to this data block. If the check fails,
                 * the data area has been recycled by another writer.
                 */
                id = blk->id; /* LMM(data_make_reusable:A) */

                d_state = desc_read(desc_ring, id, &desc,
                                    NULL, NULL); /* LMM(data_make_reusable:B) */

                switch (d_state) {
                case desc_miss:
                case desc_reserved:
                case desc_committed:
                        return false;
                case desc_finalized:
                        /*
                         * This data block is invalid if the descriptor
                         * does not point back to it.
                         */
                        if (blk_lpos->begin != lpos_begin)
                                return false;
                        desc_make_reusable(desc_ring, id);
                        break;
                case desc_reusable:
                        /*
                         * This data block is invalid if the descriptor
                         * does not point back to it.
                         */
                        if (blk_lpos->begin != lpos_begin)
                                return false;
                        break;
                }

                /* Advance @lpos_begin to the next data block. */
                lpos_begin = blk_lpos->next;
        }

        *lpos_out = lpos_begin;
        return true;
}

/*
 * Advance the data ring tail to at least @lpos. This function puts
 * descriptors into the reusable state if the tail is pushed beyond
 * their associated data block.
 *
 * @rb:   The ringbuffer whose text data ring tail is to be pushed.
 * @lpos: The logical position the tail must reach. A data-less value
 *        (as reported by LPOS_DATALESS()) requires no work.
 *
 * Return: false if data_make_reusable() failed and the tail lpos was not
 *         changed by anyone in the meantime. Otherwise true, which
 *         includes the cases where @lpos is data-less or the tail is
 *         already at or beyond @lpos.
 */
static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        unsigned long tail_lpos_new;
        unsigned long tail_lpos;
        unsigned long next_lpos;

        /* If @lpos is from a data-less block, there is nothing to do. */
        if (LPOS_DATALESS(lpos))
                return true;

        /*
         * Any descriptor states that have transitioned to reusable due to the
         * data tail being pushed to this loaded value will be visible to this
         * CPU. This pairs with data_push_tail:D.
         *
         * Memory barrier involvement:
         *
         * If data_push_tail:A reads from data_push_tail:D, then this CPU can
         * see desc_make_reusable:A.
         *
         * Relies on:
         *
         * MB from desc_make_reusable:A to data_push_tail:D
         *    matches
         * READFROM from data_push_tail:D to data_push_tail:A
         *    thus
         * READFROM from desc_make_reusable:A to this CPU
         */
        tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */

        /*
         * Loop until the tail lpos is at or beyond @lpos. This condition
         * may already be satisfied, resulting in no full memory barrier
         * from data_push_tail:D being performed. However, since this CPU
         * sees the new tail lpos, any descriptor states that transitioned to
         * the reusable state must already be visible.
         *
         * The condition relies on unsigned wraparound: it is true only
         * while (@lpos - @tail_lpos) lies in the range 1..DATA_SIZE. If the
         * tail already equals @lpos, subtracting 1 wraps to the maximum
         * value and the loop body is not entered.
         */
        while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
                /*
                 * Make all descriptors reusable that are associated with
                 * data blocks before @lpos.
                 */
                if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) {
                        /*
                         * 1. Guarantee the block ID loaded in
                         *    data_make_reusable() is performed before
                         *    reloading the tail lpos. The failed
                         *    data_make_reusable() may be due to a newly
                         *    recycled data area causing the tail lpos to
                         *    have been previously pushed. This pairs with
                         *    data_alloc:A and data_realloc:A.
                         *
                         *    Memory barrier involvement:
                         *
                         *    If data_make_reusable:A reads from data_alloc:B,
                         *    then data_push_tail:C reads from
                         *    data_push_tail:D.
                         *
                         *    Relies on:
                         *
                         *    MB from data_push_tail:D to data_alloc:B
                         *       matching
                         *    RMB from data_make_reusable:A to
                         *    data_push_tail:C
                         *
                         *    Note: data_push_tail:D and data_alloc:B can be
                         *          different CPUs. However, the data_alloc:B
                         *          CPU (which performs the full memory
                         *          barrier) must have previously seen
                         *          data_push_tail:D.
                         *
                         * 2. Guarantee the descriptor state loaded in
                         *    data_make_reusable() is performed before
                         *    reloading the tail lpos. The failed
                         *    data_make_reusable() may be due to a newly
                         *    recycled descriptor causing the tail lpos to
                         *    have been previously pushed. This pairs with
                         *    desc_reserve:D.
                         *
                         *    Memory barrier involvement:
                         *
                         *    If data_make_reusable:B reads from
                         *    desc_reserve:F, then data_push_tail:C reads
                         *    from data_push_tail:D.
                         *
                         *    Relies on:
                         *
                         *    MB from data_push_tail:D to desc_reserve:F
                         *       matching
                         *    RMB from data_make_reusable:B to
                         *    data_push_tail:C
                         *
                         *    Note: data_push_tail:D and desc_reserve:F can
                         *          be different CPUs. However, the
                         *          desc_reserve:F CPU (which performs the
                         *          full memory barrier) must have previously
                         *          seen data_push_tail:D.
                         */
                        smp_rmb(); /* LMM(data_push_tail:B) */

                        tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
                                                        ); /* LMM(data_push_tail:C) */
                        /*
                         * An unchanged tail means the failure was not caused
                         * by a concurrent push: give up.
                         */
                        if (tail_lpos_new == tail_lpos)
                                return false;

                        /* Another CPU pushed the tail. Try again. */
                        tail_lpos = tail_lpos_new;
                        continue;
                }

                /*
                 * Guarantee any descriptor states that have transitioned to
                 * reusable are stored before pushing the tail lpos. A full
                 * memory barrier is needed since other CPUs may have made
                 * the descriptor states reusable. This pairs with
                 * data_push_tail:A.
                 *
                 * If the cmpxchg fails, @tail_lpos is updated with the
                 * current tail value and the loop condition is re-evaluated
                 * against it.
                 */
                if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
                                            next_lpos)) { /* LMM(data_push_tail:D) */
                        break;
                }
        }

        return true;
}

/*
 * Advance the desc ring tail. This function advances the tail by one
 * descriptor, thus invalidating the oldest descriptor. Before advancing
 * the tail, the tail descriptor is made reusable and all data blocks up to
 * and including the descriptor's data block are invalidated (i.e. the data
 * ring tail is pushed past the data block of the descriptor being made
 * reusable).
 *
 * @rb:      The ringbuffer whose descriptor ring tail is to be advanced.
 * @tail_id: The descriptor ID the caller observed as the current tail.
 *
 * Return: true if the tail is no longer at @tail_id when this function
 *         returns control for the success paths below (either this CPU
 *         attempted the push or another CPU already moved it). false if
 *         the tail descriptor is reserved or committed (or is being
 *         reserved), if data_push_tail() fails, or if the next descriptor
 *         is not in an allowed tail state and the tail is still @tail_id.
 */
static bool desc_push_tail(struct printk_ringbuffer *rb,
                           unsigned long tail_id)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        enum desc_state d_state;
        struct prb_desc desc;

        /* Read the tail descriptor into the local copy @desc. */
        d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);

        switch (d_state) {
        case desc_miss:
                /*
                 * If the ID is exactly 1 wrap behind the expected, it is
                 * in the process of being reserved by another writer and
                 * must be considered reserved.
                 */
                if (DESC_ID(atomic_long_read(&desc.state_var)) ==
                    DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
                        return false;
                }

                /*
                 * The ID has changed. Another writer must have pushed the
                 * tail and recycled the descriptor already. Success is
                 * returned because the caller is only interested in the
                 * specified tail being pushed, which it was.
                 */
                return true;
        case desc_reserved:
        case desc_committed:
                /* A writer still owns this descriptor; it cannot be invalidated. */
                return false;
        case desc_finalized:
                desc_make_reusable(desc_ring, tail_id);
                break;
        case desc_reusable:
                break;
        }

        /*
         * Data blocks must be invalidated before their associated
         * descriptor can be made available for recycling. Invalidating
         * them later is not possible because there is no way to trust
         * data blocks once their associated descriptor is gone.
         */

        if (!data_push_tail(rb, desc.text_blk_lpos.next))
                return false;

        /*
         * Check the next descriptor after @tail_id before pushing the tail
         * to it because the tail must always be in a finalized or reusable
         * state. The implementation of prb_first_seq() relies on this.
         *
         * A successful read implies that the next descriptor is less than or
         * equal to @head_id so there is no risk of pushing the tail past the
         * head.
         */
        d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
                            NULL, NULL); /* LMM(desc_push_tail:A) */

        if (d_state == desc_finalized || d_state == desc_reusable) {
                /*
                 * Guarantee any descriptor states that have transitioned to
                 * reusable are stored before pushing the tail ID. This allows
                 * verifying the recycled descriptor state. A full memory
                 * barrier is needed since other CPUs may have made the
                 * descriptor states reusable. This pairs with desc_reserve:D.
                 *
                 * The cmpxchg result is not checked: it can only fail if the
                 * tail ID no longer equals @tail_id, in which case the
                 * specified tail has already been pushed.
                 */
                atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
                                    DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
        } else {
                /*
                 * Guarantee the last state load from desc_read() is before
                 * reloading @tail_id in order to see a new tail ID in the
                 * case that the descriptor has been recycled. This pairs
                 * with desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If desc_push_tail:A reads from desc_reserve:F, then
                 * desc_push_tail:D reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:F
                 *    matching
                 * RMB from desc_push_tail:A to desc_push_tail:D
                 *
                 * Note: desc_push_tail:B and desc_reserve:F can be different
                 *       CPUs. However, the desc_reserve:F CPU (which performs
                 *       the full memory barrier) must have previously seen
                 *       desc_push_tail:B.
                 */
                smp_rmb(); /* LMM(desc_push_tail:C) */

                /*
                 * Re-check the tail ID. The descriptor following @tail_id is
                 * not in an allowed tail state. But if the tail has since
                 * been moved by another CPU, then it does not matter.
                 */
                if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
                        return false;
        }

        return true;
}

/*
 * Reserve a new descriptor, invalidating the oldest if necessary.
 *
 * @rb:     The ringbuffer to reserve a descriptor in.
 * @id_out: On success, set to the ID of the newly reserved descriptor.
 *          Not written on failure.
 *
 * Return: true if a descriptor was transitioned to the reserved state.
 *         false if desc_push_tail() could not make room, or if the state
 *         of the descriptor being recycled failed verification (the
 *         latter also triggers WARN_ON_ONCE()).
 */
static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        unsigned long prev_state_val;
        unsigned long id_prev_wrap;
        struct prb_desc *desc;
        unsigned long head_id;
        unsigned long id;

        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */

        /*
         * Each iteration recomputes @id from @head_id. When the cmpxchg in
         * the loop condition fails, @head_id holds the current head ID, so
         * the next iteration targets the new head.
         */
        do {
                id = DESC_ID(head_id + 1);
                id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);

                /*
                 * Guarantee the head ID is read before reading the tail ID.
                 * Since the tail ID is updated before the head ID, this
                 * guarantees that @id_prev_wrap is never ahead of the tail
                 * ID. This pairs with desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If desc_reserve:A reads from desc_reserve:D, then
                 * desc_reserve:C reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:D
                 *    matching
                 * RMB from desc_reserve:A to desc_reserve:C
                 *
                 * Note: desc_push_tail:B and desc_reserve:D can be different
                 *       CPUs. However, the desc_reserve:D CPU (which performs
                 *       the full memory barrier) must have previously seen
                 *       desc_push_tail:B.
                 */
                smp_rmb(); /* LMM(desc_reserve:B) */

                if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
                                                    )) { /* LMM(desc_reserve:C) */
                        /*
                         * Make space for the new descriptor by
                         * advancing the tail.
                         */
                        if (!desc_push_tail(rb, id_prev_wrap))
                                return false;
                }

                /*
                 * 1. Guarantee the tail ID is read before validating the
                 *    recycled descriptor state. A read memory barrier is
                 *    sufficient for this. This pairs with desc_push_tail:B.
                 *
                 *    Memory barrier involvement:
                 *
                 *    If desc_reserve:C reads from desc_push_tail:B, then
                 *    desc_reserve:E reads from desc_make_reusable:A.
                 *
                 *    Relies on:
                 *
                 *    MB from desc_make_reusable:A to desc_push_tail:B
                 *       matching
                 *    RMB from desc_reserve:C to desc_reserve:E
                 *
                 *    Note: desc_make_reusable:A and desc_push_tail:B can be
                 *          different CPUs. However, the desc_push_tail:B CPU
                 *          (which performs the full memory barrier) must have
                 *          previously seen desc_make_reusable:A.
                 *
                 * 2. Guarantee the tail ID is stored before storing the head
                 *    ID. This pairs with desc_reserve:B.
                 *
                 * 3. Guarantee any data ring tail changes are stored before
                 *    recycling the descriptor. Data ring tail changes can
                 *    happen via desc_push_tail()->data_push_tail(). A full
                 *    memory barrier is needed since another CPU may have
                 *    pushed the data ring tails. This pairs with
                 *    data_push_tail:B.
                 *
                 * 4. Guarantee a new tail ID is stored before recycling the
                 *    descriptor. A full memory barrier is needed since
                 *    another CPU may have pushed the tail ID. This pairs
                 *    with desc_push_tail:C and this also pairs with
                 *    prb_first_seq:C.
                 *
                 * 5. Guarantee the head ID is stored before trying to
                 *    finalize the previous descriptor. This pairs with
                 *    _prb_commit:B.
                 */
        } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
                                          id)); /* LMM(desc_reserve:D) */

        desc = to_desc(desc_ring, id);

        /*
         * If the descriptor has been recycled, verify the old state val.
         * See "ABA Issues" about why this verification is performed.
         *
         * A state value of 0 skips the verification (presumably a
         * descriptor that has never been used -- confirm against the
         * ringbuffer initializer).
         */
        prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
        if (prev_state_val &&
            get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
                WARN_ON_ONCE(1);
                return false;
        }

        /*
         * Assign the descriptor a new ID and set its state to reserved.
         * See "ABA Issues" about why cmpxchg() instead of set() is used.
         *
         * Guarantee the new descriptor ID and state is stored before making
         * any other changes. A write memory barrier is sufficient for this.
         * This pairs with desc_read:D.
         */
        if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
                        DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
                WARN_ON_ONCE(1);
                return false;
        }

        /* Now data in @desc can be modified: LMM(desc_reserve:G) */

        *id_out = id;
        return true;
}

/*
 * Determine the end of a data block that begins at @lpos and occupies
 * @size bytes in @data_ring.
 *
 * Return: @lpos + @size if the block stays within a single wrap.
 *         Otherwise the block is placed at the beginning of the wrap
 *         that @lpos + @size falls into, and the end of that relocated
 *         block is returned.
 */
static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
                                   unsigned long lpos, unsigned int size)
{
        const unsigned long end_lpos = lpos + size;

        /*
         * A block that crosses a wrap boundary stores its data at the
         * beginning of the new wrap, so its end is measured from there.
         */
        if (DATA_WRAPS(data_ring, lpos) != DATA_WRAPS(data_ring, end_lpos))
                return (DATA_THIS_WRAP_START_LPOS(data_ring, end_lpos) + size);

        /* Begin and end share a wrap: the block is contiguous. */
        return end_lpos;
}

/*
 * Allocate a new data block, invalidating the oldest data block(s)
 * if necessary. This function also associates the data block with
 * a specified descriptor.
 *
 * @rb:       The ringbuffer to allocate text data in.
 * @size:     The requested data size. It is converted with to_blk_size()
 *            before the allocation is attempted.
 * @blk_lpos: Output. Receives the begin/next lpos of the new block. Both
 *            fields are set to EMPTY_LINE_LPOS if @size is 0 and to
 *            FAILED_LPOS if the allocation fails.
 * @id:       The descriptor ID stored in the new data block.
 *
 * Return: A pointer to the data area of the new block, or NULL if @size
 *         is 0 or the allocation failed (distinguishable via @blk_lpos).
 */
static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
                        struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_data_block *blk;
        unsigned long begin_lpos;
        unsigned long next_lpos;

        if (size == 0) {
                /*
                 * Data blocks are not created for empty lines. Instead, the
                 * reader will recognize these special lpos values and handle
                 * it appropriately.
                 */
                blk_lpos->begin = EMPTY_LINE_LPOS;
                blk_lpos->next = EMPTY_LINE_LPOS;
                return NULL;
        }

        size = to_blk_size(size);

        begin_lpos = atomic_long_read(&data_ring->head_lpos);

        /*
         * Retry until the head lpos is moved from @begin_lpos to @next_lpos.
         * A failed cmpxchg leaves the current head in @begin_lpos, so
         * @next_lpos is recomputed on every iteration.
         */
        do {
                next_lpos = get_next_lpos(data_ring, begin_lpos, size);

                /*
                 * data_check_size() prevents data block allocation that could
                 * cause illegal ringbuffer states. But double check that the
                 * used space will not be bigger than the ring buffer. Wrapped
                 * messages need to reserve more space, see get_next_lpos().
                 *
                 * Specify a data-less block when the check or the allocation
                 * fails.
                 */
                if (WARN_ON_ONCE(next_lpos - begin_lpos > DATA_SIZE(data_ring)) ||
                    !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
                        blk_lpos->begin = FAILED_LPOS;
                        blk_lpos->next = FAILED_LPOS;
                        return NULL;
                }

                /*
                 * 1. Guarantee any descriptor states that have transitioned
                 *    to reusable are stored before modifying the newly
                 *    allocated data area. A full memory barrier is needed
                 *    since other CPUs may have made the descriptor states
                 *    reusable. See data_push_tail:A about why the reusable
                 *    states are visible. This pairs with desc_read:D.
                 *
                 * 2. Guarantee any updated tail lpos is stored before
                 *    modifying the newly allocated data area. Another CPU may
                 *    be in data_make_reusable() and is reading a block ID
                 *    from this area. data_make_reusable() can handle reading
                 *    a garbage block ID value, but then it must be able to
                 *    load a new tail lpos. A full memory barrier is needed
                 *    since other CPUs may have updated the tail lpos. This
                 *    pairs with data_push_tail:B.
                 */
        } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
                                          next_lpos)); /* LMM(data_alloc:A) */

        /* The ID is always stored at the block's original begin position. */
        blk = to_block(data_ring, begin_lpos);
        blk->id = id; /* LMM(data_alloc:B) */

        if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
                /* Wrapping data blocks store their data at the beginning. */
                blk = to_block(data_ring, 0);

                /*
                 * Store the ID on the wrapped block for consistency.
                 * The printk_ringbuffer does not actually use it.
                 */
                blk->id = id;
        }

        blk_lpos->begin = begin_lpos;
        blk_lpos->next = next_lpos;

        return &blk->data[0];
}

/*
 * Try to resize an existing data block associated with the descriptor
 * specified by @id. If the resized data block should become wrapped, it
 * copies the old data to the new data block. If @size yields a data block
 * with the same or less size, the data block is left as is.
 *
 * Fail if this is not the last allocated data block or if there is not
 * enough space or it is not possible to make enough space.
 *
 * @rb:       The ringbuffer containing the data block.
 * @size:     The new requested data size. It is converted with
 *            to_blk_size() before use.
 * @blk_lpos: The begin/next lpos of the existing block. Only the next
 *            field is updated, and only when the block was grown. It is
 *            left untouched on failure and when the block does not grow.
 * @id:       The descriptor ID, stored in the block at the beginning of
 *            the data array if the resized block wraps.
 *
 * Return a pointer to the beginning of the entire data buffer or NULL on
 * failure.
 */
static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
                          struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_data_block *blk;
        unsigned long head_lpos;
        unsigned long next_lpos;
        bool wrapped;

        /* Reallocation only works if @blk_lpos is the newest data block. */
        head_lpos = atomic_long_read(&data_ring->head_lpos);
        if (head_lpos != blk_lpos->next)
                return NULL;

        /* Keep track if @blk_lpos was a wrapping data block. */
        wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));

        size = to_blk_size(size);

        next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);

        /*
         * If the data block does not increase, there is nothing to do.
         *
         * The unsigned difference is small when @next_lpos is at or behind
         * the current head; if @next_lpos is ahead of the head, the
         * subtraction wraps to a large value and the test fails.
         */
        if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
                if (wrapped)
                        blk = to_block(data_ring, 0);
                else
                        blk = to_block(data_ring, blk_lpos->begin);
                return &blk->data[0];
        }

        /*
         * data_check_size() prevents data block reallocation that could
         * cause illegal ringbuffer states. But double check that the
         * new used space will not be bigger than the ring buffer. Wrapped
         * messages need to reserve more space, see get_next_lpos().
         *
         * Specify failure when the check or the allocation fails.
         */
        if (WARN_ON_ONCE(next_lpos - blk_lpos->begin > DATA_SIZE(data_ring)) ||
            !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
                return NULL;
        }

        /*
         * The memory barrier involvement is the same as data_alloc:A.
         *
         * There is no retry: if the head moved, this block is no longer
         * the newest one and the reallocation fails.
         */
        if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
                                     next_lpos)) { /* LMM(data_realloc:A) */
                return NULL;
        }

        blk = to_block(data_ring, blk_lpos->begin);

        if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
                struct prb_data_block *old_blk = blk;

                /* Wrapping data blocks store their data at the beginning. */
                blk = to_block(data_ring, 0);

                /*
                 * Store the ID on the wrapped block for consistency.
                 * The printk_ringbuffer does not actually use it.
                 */
                blk->id = id;

                if (!wrapped) {
                        /*
                         * Since the allocated space is now in the newly
                         * created wrapping data block, copy the content
                         * from the old data block.
                         *
                         * @blk_lpos->next still holds the old end here, so
                         * the length is the old block size minus its ID.
                         */
                        memcpy(&blk->data[0], &old_blk->data[0],
                               (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
                }
        }

        blk_lpos->next = next_lpos;

        return &blk->data[0];
}

/* Return the number of bytes used by a data block. */
static unsigned int space_used(struct prb_data_ring *data_ring,
                               struct prb_data_blk_lpos *blk_lpos)
{
        /* Data-less blocks take no space. */
        if (BLK_DATALESS(blk_lpos))
                return 0;

        if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) {
                /* Data block does not wrap. */
                return (DATA_INDEX(data_ring, blk_lpos->next) -
                        DATA_INDEX(data_ring, blk_lpos->begin));
        }

        /*
         * For wrapping data blocks, the trailing (wasted) space is
         * also counted.
         */
        return (DATA_INDEX(data_ring, blk_lpos->next) +
                DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin));
}

/*
 * Translate @blk_lpos into a pointer to the writer data of the data block
 * and store the size of that data in @data_size.
 *
 * Readers use this function. The lpos values are validated strictly so
 * that writer bugs may be detected; WARN_ON_ONCE() fires when an internal
 * inconsistency is found.
 *
 * Return: A pointer to the data, an empty string (with @data_size set to
 *         0) for an empty-line record, or NULL if @blk_lpos is data-less
 *         for another reason or describes values that could never be
 *         legal. @data_size may have been written even when NULL is
 *         returned.
 */
static const char *get_data(struct prb_data_ring *data_ring,
                            struct prb_data_blk_lpos *blk_lpos,
                            unsigned int *data_size)
{
        struct prb_data_block *db;

        if (BLK_DATALESS(blk_lpos)) {
                /* Data lost, invalid, or otherwise unavailable. */
                if (blk_lpos->begin != EMPTY_LINE_LPOS ||
                    blk_lpos->next != EMPTY_LINE_LPOS) {
                        return NULL;
                }

                /*
                 * An empty line is a valid record without a data block.
                 * Report success by handing back empty string data.
                 */
                *data_size = 0;
                return "";
        }

        if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) &&
            blk_lpos->begin < blk_lpos->next) {
                /* Regular block: both ends in one wrap, @begin before @next. */
                db = to_block(data_ring, blk_lpos->begin);
                *data_size = blk_lpos->next - blk_lpos->begin;
        } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) ==
                   DATA_WRAPS(data_ring, blk_lpos->next)) {
                /* Wrapping block: @begin lies exactly one wrap behind @next. */
                db = to_block(data_ring, 0);
                *data_size = DATA_INDEX(data_ring, blk_lpos->next);
        } else {
                /* Neither layout fits: the description is illegal. */
                WARN_ON_ONCE(1);
                return NULL;
        }

        /* Both ends of a valid block are aligned to the size of the ID. */
        if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))))
                return NULL;
        if (WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id))))
                return NULL;

        /* Every valid block is at least large enough to hold its ID. */
        if (WARN_ON_ONCE(*data_size < sizeof(db->id)))
                return NULL;

        /* The ID is not writer data: exclude it from the reported size. */
        *data_size -= sizeof(db->id);

        return &db->data[0];
}

/*
 * Attempt to transition the newest descriptor from committed back to reserved
 * so that the record can be modified by a writer again. This is only possible
 * if the descriptor is not yet finalized and the provided @caller_id matches.
 *
 * @desc_ring: The descriptor ring whose head descriptor is to be reopened.
 * @caller_id: The caller ID that the head descriptor must carry.
 * @id_out:    On success, set to the ID of the reopened descriptor. Not
 *             written on failure.
 *
 * Return: A pointer to the reopened descriptor within @desc_ring, or NULL
 *         if the head descriptor is not in the committed state, the caller
 *         ID does not match, or the state changed before it could be
 *         switched back to reserved.
 */
static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
                                         u32 caller_id, unsigned long *id_out)
{
        unsigned long prev_state_val;
        enum desc_state d_state;
        struct prb_desc desc;
        struct prb_desc *d;
        unsigned long id;
        u32 cid;

        id = atomic_long_read(&desc_ring->head_id);

        /*
         * To reduce unnecessarily reopening, first check if the descriptor
         * state and caller ID are correct.
         */
        d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
        if (d_state != desc_committed || cid != caller_id)
                return NULL;

        d = to_desc(desc_ring, id);

        /* The only state value from which reopening is allowed. */
        prev_state_val = DESC_SV(id, desc_committed);

        /*
         * Guarantee the reserved state is stored before reading any
         * record data. A full memory barrier is needed because @state_var
         * modification is followed by reading. This pairs with _prb_commit:B.
         *
         * Memory barrier involvement:
         *
         * If desc_reopen_last:A reads from _prb_commit:B, then
         * prb_reserve_in_last:A reads from _prb_commit:A.
         *
         * Relies on:
         *
         * WMB from _prb_commit:A to _prb_commit:B
         *    matching
         * MB from desc_reopen_last:A to prb_reserve_in_last:A
         */
        if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
                        DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
                return NULL;
        }

        *id_out = id;
        return d;
}

/**
 * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
 *                         used by the newest record.
 *
 * @e:         The entry structure to setup.
 * @rb:        The ringbuffer to re-reserve and extend data in.
 * @r:         The record structure to allocate buffers for.
 * @caller_id: The caller ID of the caller (reserving writer).
 * @max_size:  Fail if the extended size would be greater than this.
 *
 * This is the public function available to writers to re-reserve and extend
 * data.
 *
 * The writer specifies the text size to extend (not the new total size) by
 * setting the @text_buf_size field of @r. To ensure proper initialization
 * of @r, prb_rec_init_wr() should be used.
 *
 * This function will fail if @caller_id does not match the caller ID of the
 * newest record. In that case the caller must reserve new data using
 * prb_reserve().
 *
 * Context: Any context. Disables local interrupts on success. On failure
 *          the saved interrupt state has been restored before returning.
 * Return: true if text data could be extended, otherwise false. On failure
 *         @r is zeroed.
 *
 * On success:
 *
 *   - @r->text_buf points to the beginning of the entire text buffer.
 *
 *   - @r->text_buf_size is set to the new total size of the buffer.
 *
 *   - @r->info points to the existing meta data of the record. Its contents
 *     are not cleared so that @r->info->text_len could be used to append
 *     the text.
 *
 *   - prb_record_text_space() can be used on @e to query the new
 *     actually used space.
 *
 * Important: All @r->info fields will already be set with the current values
 *            for the record. I.e. @r->info->text_len will be less than
 *            @text_buf_size. Writers can use @r->info->text_len to know
 *            where concatenation begins and writers should update
 *            @r->info->text_len after concatenating.
 */
bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                         struct printk_record *r, u32 caller_id, unsigned int max_size)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct printk_info *info;
        unsigned int data_size;
        struct prb_desc *d;
        unsigned long id;

        local_irq_save(e->irqflags);

        /* Transition the newest descriptor back to the reserved state. */
        d = desc_reopen_last(desc_ring, caller_id, &id);
        if (!d) {
                /*
                 * Nothing was reopened, so there is nothing to commit.
                 * Only the interrupt state needs to be undone here.
                 */
                local_irq_restore(e->irqflags);
                goto fail_reopen;
        }

        /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */

        info = to_info(desc_ring, id);

        /*
         * Set the @e fields here so that prb_commit() can be used if
         * anything fails from now on.
         */
        e->rb = rb;
        e->id = id;

        /*
         * desc_reopen_last() checked the caller_id, but there was no
         * exclusive access at that point. The descriptor may have
         * changed since then.
         */
        if (caller_id != info->caller_id)
                goto fail;

        if (BLK_DATALESS(&d->text_blk_lpos)) {
                /*
                 * The record currently has no text data block, so its
                 * recorded text length is expected to be 0. Repair the
                 * meta data if that is not the case.
                 */
                if (WARN_ON_ONCE(info->text_len != 0)) {
                        pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
                                     info->text_len);
                        info->text_len = 0;
                }

                if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                        goto fail;

                if (r->text_buf_size > max_size)
                        goto fail;

                /* No existing data to preserve: allocate a new block. */
                r->text_buf = data_alloc(rb, r->text_buf_size,
                                         &d->text_blk_lpos, id);
        } else {
                /* Look up the existing data block to learn its size. */
                if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
                        goto fail;

                /*
                 * Increase the buffer size to include the original size. If
                 * the meta data (@text_len) is not sane, use the full data
                 * block size.
                 */
                if (WARN_ON_ONCE(info->text_len > data_size)) {
                        pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
                                     info->text_len, data_size);
                        info->text_len = data_size;
                }
                r->text_buf_size += info->text_len;

                /* From here on @r->text_buf_size is the new total size. */
                if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                        goto fail;

                if (r->text_buf_size > max_size)
                        goto fail;

                r->text_buf = data_realloc(rb, r->text_buf_size,
                                           &d->text_blk_lpos, id);
        }
        /* A missing buffer is only an error if a non-zero size was requested. */
        if (r->text_buf_size && !r->text_buf)
                goto fail;

        r->info = info;

        e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);

        return true;
fail:
        /* The descriptor was reopened above; commit it again to release it. */
        prb_commit(e);
        /* prb_commit() re-enabled interrupts. */
fail_reopen:
        /* Make it clear to the caller that the re-reserve failed. */
        memset(r, 0, sizeof(*r));
        return false;
}

/*
 * @last_finalized_seq value guarantees that all records up to and including
 * this sequence number are finalized and can be read. The only exceptions are
 * records that are too old and have already been overwritten.
 *
 * It is also guaranteed that @last_finalized_seq only increases.
 *
 * Be aware that finalized records following non-finalized records are not
 * reported because they are not yet available to the reader. For example,
 * a new record stored via printk() will not be available to a printer if
 * it follows a record that has not been finalized yet. However, once that
 * non-finalized record becomes finalized, @last_finalized_seq will be
 * appropriately updated and the full set of finalized records will be
 * available to the printer. And since each printk() caller will either
 * directly print or trigger deferred printing of all available unprinted
 * records, all printk() messages will get printed.
 */
static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb)
{
        unsigned long raw_seq;

        /*
         * The acquire load orders the read of the sequence number before
         * any later load of the record it refers to, so that the record
         * can be seen by this CPU. This pairs with
         * desc_update_last_finalized:A.
         */
        raw_seq = atomic_long_read_acquire(&rb->desc_ring.last_finalized_seq); /* LMM(desc_last_finalized_seq:A) */

        /* Expand the stored unsigned long value to a full u64 sequence. */
        return __ulseq_to_u64seq(rb, raw_seq);
}

/*
 * Forward declaration: _prb_read_valid() is defined further below but is
 * already needed by desc_update_last_finalized().
 */
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
                            struct printk_record *r, unsigned int *line_count);

/*
 * Check if there are records directly following @last_finalized_seq that are
 * finalized. If so, update @last_finalized_seq to the latest of these
 * records. It is not allowed to skip over records that are not yet finalized.
 */
static void desc_update_last_finalized(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        u64 base_seq = desc_last_finalized_seq(rb);
        unsigned long expected;
        unsigned long desired;
        u64 newest_seq;
        u64 probe_seq;

        for (;;) {
                /*
                 * Walk forward from the record directly following the
                 * currently known value for as long as finalized records
                 * are found.
                 */
                newest_seq = base_seq;
                probe_seq = newest_seq + 1;

                while (_prb_read_valid(rb, &probe_seq, NULL, NULL)) {
                        newest_seq = probe_seq;
                        probe_seq++;
                }

                /* Nothing newer was found: there is nothing to publish. */
                if (newest_seq == base_seq)
                        return;

                expected = __u64seq_to_ulseq(base_seq);
                desired = __u64seq_to_ulseq(newest_seq);

                /*
                 * Publish the sequence number of the later finalized record
                 * that has been seen.
                 *
                 * Guarantee the record data is visible to other CPUs before
                 * storing its sequence number. This pairs with
                 * desc_last_finalized_seq:A.
                 *
                 * Memory barrier involvement:
                 *
                 * If desc_last_finalized_seq:A reads from
                 * desc_update_last_finalized:A, then desc_read:A reads from
                 * _prb_commit:B.
                 *
                 * Relies on:
                 *
                 * RELEASE from _prb_commit:B to desc_update_last_finalized:A
                 *    matching
                 * ACQUIRE from desc_last_finalized_seq:A to desc_read:A
                 *
                 * Note: _prb_commit:B and desc_update_last_finalized:A can
                 *       be different CPUs. However, the
                 *       desc_update_last_finalized:A CPU (which performs the
                 *       release) must have previously seen _prb_commit:B.
                 */
                if (atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq,
                                        &expected, desired)) { /* LMM(desc_update_last_finalized:A) */
                        return;
                }

                /*
                 * The stored value changed underneath us. @expected now
                 * holds the value that was found, so rescan from there.
                 */
                base_seq = __ulseq_to_u64seq(rb, expected);
        }
}

/*
 * Attempt to finalize a specified descriptor. If this fails, the descriptor
 * is either already final or it will finalize itself when the writer commits.
 */
static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id)
{
        struct prb_desc *desc = to_desc(&rb->desc_ring, id);
        unsigned long expected_sv = DESC_SV(id, desc_committed);
        unsigned long final_sv = DESC_SV(id, desc_finalized);

        /*
         * Only a descriptor that is currently committed (with this @id) is
         * moved to the finalized state. Any other state is left untouched.
         */
        if (!atomic_long_try_cmpxchg_relaxed(&desc->state_var, &expected_sv,
                                             final_sv)) { /* LMM(desc_make_final:A) */
                return;
        }

        /* A record was newly finalized: see if readers can advance. */
        desc_update_last_finalized(rb);
}

/**
 * prb_reserve() - Reserve space in the ringbuffer.
 *
 * @e:  The entry structure to setup.
 * @rb: The ringbuffer to reserve data in.
 * @r:  The record structure to allocate buffers for.
 *
 * This is the public function available to writers to reserve data.
 *
 * The writer specifies the text size to reserve by setting the
 * @text_buf_size field of @r. To ensure proper initialization of @r,
 * prb_rec_init_wr() should be used.
 *
 * Context: Any context. Disables local interrupts on success.
 * Return: true if at least text data could be allocated, otherwise false.
 *
 * On success, the fields @info and @text_buf of @r will be set by this
 * function and should be filled in by the writer before committing. Also
 * on success, prb_record_text_space() can be used on @e to query the actual
 * space used for the text data block.
 *
 * Important: @info->text_len needs to be set correctly by the writer in
 *            order for data to be readable and/or extended. Its value
 *            is initialized to 0.
 */
bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                 struct printk_record *r)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct printk_info *info;
        struct prb_desc *d;
        unsigned long id;
        u64 seq;

        /*
         * Reject an unacceptable size up front. Interrupts have not been
         * disabled yet, so this failure path needs no restore.
         */
        if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                goto fail;

        /*
         * Descriptors in the reserved state act as blockers to all further
         * reservations once the desc_ring has fully wrapped. Disable
         * interrupts during the reserve/commit window in order to minimize
         * the likelihood of this happening.
         */
        local_irq_save(e->irqflags);

        if (!desc_reserve(rb, &id)) {
                /* Descriptor reservation failures are tracked. */
                atomic_long_inc(&rb->fail);
                local_irq_restore(e->irqflags);
                goto fail;
        }

        d = to_desc(desc_ring, id);
        info = to_info(desc_ring, id);

        /*
         * All @info fields (except @seq) are cleared and must be filled in
         * by the writer. Save @seq before clearing because it is used to
         * determine the new sequence number.
         */
        seq = info->seq;
        memset(info, 0, sizeof(*info));

        /*
         * Set the @e fields here so that prb_commit() can be used if
         * text data allocation fails.
         */
        e->rb = rb;
        e->id = id;

        /*
         * Initialize the sequence number if it has "never been set".
         * Otherwise just increment it by a full wrap.
         *
         * @seq is considered "never been set" if it has a value of 0,
         * _except_ for @infos[0], which was specially setup by the ringbuffer
         * initializer and therefore is always considered as set.
         *
         * See the "Bootstrap" comment block in printk_ringbuffer.h for
         * details about how the initializer bootstraps the descriptors.
         */
        if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
                info->seq = DESC_INDEX(desc_ring, id);
        else
                info->seq = seq + DESCS_COUNT(desc_ring);

        /*
         * New data is about to be reserved. Once that happens, previous
         * descriptors are no longer able to be extended. Finalize the
         * previous descriptor now so that it can be made available to
         * readers. (For seq==0 there is no previous descriptor.)
         */
        if (info->seq > 0)
                desc_make_final(rb, DESC_ID(id - 1));

        r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
        /* If text data allocation fails, a data-less record is committed. */
        if (r->text_buf_size && !r->text_buf) {
                prb_commit(e);
                /* prb_commit() re-enabled interrupts. */
                goto fail;
        }

        /* Hand the (cleared, @seq-initialized) meta data to the writer. */
        r->info = info;

        /* Record full text space used by record. */
        e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);

        /* Success: interrupts stay disabled until the entry is committed. */
        return true;
fail:
        /* Make it clear to the caller that the reserve failed. */
        memset(r, 0, sizeof(*r));
        return false;
}
EXPORT_SYMBOL_IF_KUNIT(prb_reserve);

/*
 * Commit the data (possibly finalizing it) and restore interrupts.
 *
 * @e:         The reserved entry to commit. Its @rb, @id and @irqflags
 *             fields are used.
 * @state_val: The descriptor state to transition to. The callers in this
 *             file pass desc_committed or desc_finalized.
 */
static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
{
        struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
        struct prb_desc *d = to_desc(desc_ring, e->id);
        unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);

        /* Now the writer has finished all writing: LMM(_prb_commit:A) */

        /*
         * Set the descriptor as committed. See "ABA Issues" about why
         * cmpxchg() instead of set() is used.
         *
         * 1. Guarantee all record data is stored before the descriptor state
         *    is stored as committed. A write memory barrier is sufficient
         *    for this. This pairs with desc_read:B and desc_reopen_last:A.
         *
         * 2. Guarantee the descriptor state is stored as committed before
         *    re-checking the head ID in order to possibly finalize this
         *    descriptor. This pairs with desc_reserve:D.
         *
         *    Memory barrier involvement:
         *
         *    If prb_commit:A reads from desc_reserve:D, then
         *    desc_make_final:A reads from _prb_commit:B.
         *
         *    Relies on:
         *
         *    MB _prb_commit:B to prb_commit:A
         *       matching
         *    MB desc_reserve:D to desc_make_final:A
         */
        if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
                        DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
                /*
                 * The descriptor was not in the reserved state for this ID.
                 * That is unexpected, so warn. Interrupts are still
                 * restored below.
                 */
                WARN_ON_ONCE(1);
        }

        /* Restore interrupts, the reserve/commit window is finished. */
        local_irq_restore(e->irqflags);
}

/**
 * prb_commit() - Commit (previously reserved) data to the ringbuffer.
 *
 * @e: The entry containing the reserved data information.
 *
 * This is the public function available to writers to commit data.
 *
 * Note that the data is not yet available to readers until it is finalized.
 * Finalizing happens automatically when space for the next record is
 * reserved.
 *
 * See prb_final_commit() for a version of this function that finalizes
 * immediately.
 *
 * Context: Any context. Enables local interrupts.
 */
void prb_commit(struct prb_reserved_entry *e)
{
        struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
        unsigned long head_id;

        _prb_commit(e, desc_committed);

        /*
         * If this descriptor is no longer the head (i.e. a new record has
         * been allocated), extending the data for this record is no longer
         * allowed and therefore it must be finalized.
         */
        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
        if (head_id != e->id)
                desc_make_final(e->rb, e->id);
}
EXPORT_SYMBOL_IF_KUNIT(prb_commit);

/**
 * prb_final_commit() - Commit and finalize (previously reserved) data to
 *                      the ringbuffer.
 *
 * @e: The entry containing the reserved data information.
 *
 * This is the public function available to writers to commit+finalize data.
 *
 * By finalizing, the data is made immediately available to readers.
 *
 * This function should only be used if there are no intentions of extending
 * this data using prb_reserve_in_last().
 *
 * Context: Any context. Enables local interrupts.
 */
void prb_final_commit(struct prb_reserved_entry *e)
{
        /* Go straight from reserved to finalized; this also restores IRQs. */
        _prb_commit(e, desc_finalized);

        /* A record was just finalized: let @last_finalized_seq catch up. */
        desc_update_last_finalized(e->rb);
}

/*
 * Count the number of lines in provided text. All text has at least 1 line
 * (even if @text_size is 0). Each '\n' processed is counted as an additional
 * line.
 */
static unsigned int count_lines(const char *text, unsigned int text_size)
{
        unsigned int remaining = text_size;
        unsigned int lines = 1;
        const char *cursor = text;
        const char *newline;

        /* Hop from newline to newline until the text is exhausted. */
        while (remaining) {
                newline = memchr(cursor, '\n', remaining);
                if (!newline)
                        return lines;

                /* Every newline found starts one more line. */
                lines++;

                /* Continue right behind the newline that was just found. */
                cursor = newline + 1;
                remaining = text_size - (cursor - text);
        }

        return lines;
}

/*
 * Given @blk_lpos, copy an expected @len of data into the provided buffer.
 * If @line_count is provided, count the number of lines in the data.
 *
 * This function (used by readers) performs strict validation on the data
 * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
 * triggered if an internal error is detected.
 */
static bool copy_data(struct prb_data_ring *data_ring,
                      struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
                      unsigned int buf_size, unsigned int *line_count)
{
        bool want_text = buf && buf_size;
        unsigned int avail_size;
        unsigned int copy_size;
        const char *src;

        /* Neither text nor a line count requested: nothing to do. */
        if (!want_text && !line_count)
                return true;

        src = get_data(data_ring, blk_lpos, &avail_size);
        if (!src)
                return false;

        /*
         * The data block may be larger than @len because of trailing
         * alignment padding, but it can never be smaller.
         *
         * An invalid @len is possible here because the caller loads the
         * value during an allowed data race.
         */
        if (avail_size < (unsigned int)len)
                return false;

        /* Count lines over the expected length, if requested. */
        if (line_count)
                *line_count = count_lines(src, len);

        /* Copy as much text as fits into the buffer, if requested. */
        if (want_text) {
                copy_size = min_t(unsigned int, buf_size, len);
                memcpy(&buf[0], src, copy_size); /* LMM(copy_data:A) */
        }

        return true;
}

/*
 * This is an extended version of desc_read(). It gets a copy of a specified
 * descriptor. However, it also verifies that the record is finalized and has
 * the sequence number @seq. On success, 0 is returned.
 *
 * Error return values:
 * -EINVAL: A finalized record with sequence number @seq does not exist.
 * -ENOENT: A finalized record with sequence number @seq exists, but its data
 *          is not available. This is a valid record, so readers should
 *          continue with the next record.
 */
static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
                                   unsigned long id, u64 seq,
                                   struct prb_desc *desc_out)
{
        struct prb_data_blk_lpos *text_lpos = &desc_out->text_blk_lpos;
        enum desc_state state;
        u64 read_seq;

        state = desc_read(desc_ring, id, desc_out, &read_seq, NULL);

        /* An unexpected @id (desc_miss) means the record does not exist. */
        if (state == desc_miss)
                return -EINVAL;

        /*
         * A reserved or committed descriptor is a record that does not yet
         * exist for the reader.
         */
        if (state == desc_reserved || state == desc_committed)
                return -EINVAL;

        /* The descriptor holds some other record than the requested one. */
        if (read_seq != seq)
                return -EINVAL;

        /*
         * The data of a reusable descriptor may no longer be available.
         * The record is reported as existing but with lost data.
         */
        if (state == desc_reusable)
                return -ENOENT;

        /* The record may actually be a record with lost data. */
        if (text_lpos->begin == FAILED_LPOS && text_lpos->next == FAILED_LPOS)
                return -ENOENT;

        return 0;
}

/*
 * Copy the ringbuffer data from the record with @seq to the provided
 * @r buffer. On success, 0 is returned.
 *
 * See desc_read_finalized_seq() for error return values.
 */
static int prb_read(struct printk_ringbuffer *rb, u64 seq,
                    struct printk_record *r, unsigned int *line_count)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        /*
         * @info and @rdesc point into the ring itself (looked up via @seq),
         * not at local copies. Their contents may change while being read,
         * which is why the record is validated again at the end.
         */
        struct printk_info *info = to_info(desc_ring, seq);
        struct prb_desc *rdesc = to_desc(desc_ring, seq);
        atomic_long_t *state_var = &rdesc->state_var;
        struct prb_desc desc;
        unsigned long id;
        int err;

        /* Extract the ID, used to specify the descriptor to read. */
        id = DESC_ID(atomic_long_read(state_var));

        /* Get a local copy of the correct descriptor (if available). */
        err = desc_read_finalized_seq(desc_ring, id, seq, &desc);

        /*
         * If @r is NULL, the caller is only interested in the availability
         * of the record.
         */
        if (err || !r)
                return err;

        /* If requested, copy meta data. */
        if (r->info)
                memcpy(r->info, info, sizeof(*(r->info)));

        /* Copy text data. If it fails, this is a data-less record. */
        if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
                       r->text_buf, r->text_buf_size, line_count)) {
                return -ENOENT;
        }

        /* Ensure the record is still finalized and has the same @seq. */
        return desc_read_finalized_seq(desc_ring, id, seq, &desc);
}

/*
 * Get the sequence number of the tail descriptor.
 *
 * The tail ID is re-read and the descriptor re-checked until the descriptor
 * is observed in the finalized or reusable state. The sequence number read
 * from that descriptor is returned.
 */
u64 prb_first_seq(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        enum desc_state d_state;
        struct prb_desc desc;
        unsigned long id;
        u64 seq;

        for (;;) {
                id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */

                d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */

                /*
                 * This loop will not be infinite because the tail is
                 * _always_ in the finalized or reusable state.
                 */
                if (d_state == desc_finalized || d_state == desc_reusable)
                        break;

                /*
                 * Guarantee the last state load from desc_read() is before
                 * reloading @tail_id in order to see a new tail in the case
                 * that the descriptor has been recycled. This pairs with
                 * desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If prb_first_seq:B reads from desc_reserve:F, then
                 * prb_first_seq:A reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:F
                 *    matching
                 * RMB prb_first_seq:B to prb_first_seq:A
                 */
                smp_rmb(); /* LMM(prb_first_seq:C) */
        }

        return seq;
}

/**
 * prb_next_reserve_seq() - Get the sequence number after the most recently
 *                  reserved record.
 *
 * @rb:  The ringbuffer to get the sequence number from.
 *
 * This is the public function available to readers to see what sequence
 * number will be assigned to the next reserved record.
 *
 * Note that depending on the situation, this value can be equal to or
 * higher than the sequence number returned by prb_next_seq().
 *
 * Context: Any context.
 * Return: The sequence number that will be assigned to the next record
 *         reserved.
 */
u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        unsigned long last_finalized_id;
        atomic_long_t *state_var;
        u64 last_finalized_seq;
        unsigned long head_id;
        struct prb_desc desc;
        unsigned long diff;
        struct prb_desc *d;
        int err;

        /*
         * It may not be possible to read a sequence number for @head_id.
         * So the ID of @last_finalized_seq is used to calculate what the
         * sequence number of @head_id will be.
         */

try_again:
        last_finalized_seq = desc_last_finalized_seq(rb);

        /*
         * @head_id is loaded after @last_finalized_seq to ensure that
         * it points to the record with @last_finalized_seq or newer.
         *
         * Memory barrier involvement:
         *
         * If desc_last_finalized_seq:A reads from
         * desc_update_last_finalized:A, then
         * prb_next_reserve_seq:A reads from desc_reserve:D.
         *
         * Relies on:
         *
         * RELEASE from desc_reserve:D to desc_update_last_finalized:A
         *    matching
         * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A
         *
         * Note: desc_reserve:D and desc_update_last_finalized:A can be
         *       different CPUs. However, the desc_update_last_finalized:A CPU
         *       (which performs the release) must have previously seen
         *       desc_read:C, which implies desc_reserve:D can be seen.
         */
        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */

        d = to_desc(desc_ring, last_finalized_seq);
        state_var = &d->state_var;

        /* Extract the ID, used to specify the descriptor to read. */
        last_finalized_id = DESC_ID(atomic_long_read(state_var));

        /* Ensure @last_finalized_id is correct. */
        err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc);

        /*
         * Only -EINVAL needs handling. For 0 and -ENOENT a finalized record
         * with @last_finalized_seq exists, so @last_finalized_id can be
         * used as is.
         */
        if (err == -EINVAL) {
                if (last_finalized_seq == 0) {
                        /*
                         * No record has been finalized or even reserved yet.
                         *
                         * The @head_id is initialized such that the first
                         * increment will yield the first record (seq=0).
                         * Handle it separately to avoid a negative @diff
                         * below.
                         */
                        if (head_id == DESC0_ID(desc_ring->count_bits))
                                return 0;

                        /*
                         * One or more descriptors are already reserved. Use
                         * the descriptor ID of the first one (@seq=0) for
                         * the @diff below.
                         */
                        last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1;
                } else {
                        /* Record must have been overwritten. Try again. */
                        goto try_again;
                }
        }

        /* Diff of known descriptor IDs to compute related sequence numbers. */
        diff = head_id - last_finalized_id;

        /*
         * @head_id points to the most recently reserved record, but this
         * function returns the sequence number that will be assigned to the
         * next (not yet reserved) record. Thus +1 is needed.
         */
        return (last_finalized_seq + diff + 1);
}

/*
 * Non-blocking read of a record.
 *
 * On success @seq is updated to the record that was read and (if provided)
 * @r and @line_count will contain the read/calculated data.
 *
 * On failure @seq is updated to a record that is not yet available to the
 * reader, but it will be the next record available to the reader.
 *
 * Note: When the current CPU is in panic, this function will skip over any
 *       non-existent/non-finalized records in order to allow the panic CPU
 *       to print any and all records that have been finalized.
 */
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
                            struct printk_record *r, unsigned int *line_count)
{
        u64 oldest_seq;
        int rc;

        for (;;) {
                rc = prb_read(rb, *seq, r, line_count);
                if (!rc)
                        return true;

                oldest_seq = prb_first_seq(rb);

                /*
                 * Behind the tail. Catch up and try again. Both the
                 * -ENOENT and the -EINVAL cases can end up here.
                 */
                if (*seq < oldest_seq) {
                        *seq = oldest_seq;
                        continue;
                }

                /* The record exists, but its data was lost. Skip it. */
                if (rc == -ENOENT) {
                        (*seq)++;
                        continue;
                }

                /*
                 * Non-existent/non-finalized record. Normally the search
                 * must stop here.
                 *
                 * In a panic situation it cannot be expected that
                 * non-finalized records will ever become finalized, yet
                 * there may be finalized records beyond this one that need
                 * to be printed. The panic CPU therefore skips this
                 * non-existent/non-finalized record, unless non-panic CPUs
                 * are still running and their debugging is explicitly
                 * enabled.
                 *
                 * Note that new messages printed on the panic CPU are
                 * already finalized at this point. The only exception might
                 * be the last message without a trailing newline. But it
                 * would have the sequence number returned by
                 * "prb_next_reserve_seq() - 1".
                 */
                if (!panic_on_this_cpu())
                        return false;

                if (debug_non_panic_cpus && !legacy_allow_panic_sync)
                        return false;

                if ((*seq + 1) >= prb_next_reserve_seq(rb))
                        return false;

                (*seq)++;
        }
}

/**
 * prb_read_valid() - Non-blocking read of a requested record or (if gone)
 *                    the next available record.
 *
 * @rb:  The ringbuffer to read from.
 * @seq: The sequence number of the record to read.
 * @r:   A record data buffer to store the read record to.
 *
 * This is the public function available to readers to read a record.
 *
 * The reader provides the @info and @text_buf buffers of @r to be
 * filled in. Any of the buffer pointers can be set to NULL if the reader
 * is not interested in that data. To ensure proper initialization of @r,
 * prb_rec_init_rd() should be used.
 *
 * Context: Any context.
 * Return: true if a record was read, otherwise false.
 *
 * On success, the reader must check r->info->seq to see which record was
 * actually read. This allows the reader to detect dropped records. (@seq
 * is passed by value, so the sequence number that was actually read is not
 * reported back through it.)
 *
 * Failure means @seq refers to a record not yet available to the reader.
 */
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
                    struct printk_record *r)
{
        /* No line count is requested by this variant. */
        return _prb_read_valid(rb, &seq, r, NULL);
}
EXPORT_SYMBOL_IF_KUNIT(prb_read_valid);

/**
 * prb_read_valid_info() - Non-blocking read of only the meta data of a
 *                         requested record or (if gone) the next available
 *                         record.
 *
 * @rb:         The ringbuffer to read from.
 * @seq:        The sequence number of the record to read.
 * @info:       A buffer to store the read record meta data to.
 * @line_count: A buffer to store the number of lines in the record text.
 *
 * Public reader interface for callers that want the meta data of a record
 * but not its text.
 *
 * Both @info and @line_count are filled in by this function. A reader that
 * is not interested in one of them may pass NULL for it.
 *
 * Context: Any context.
 * Return: true if a record's meta data was read, otherwise false.
 *
 * On success, info->seq tells the reader which record's meta data was
 * actually read; checking it allows dropped records to be detected.
 *
 * Failure means @seq refers to a record not yet available to the reader.
 */
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
                         struct printk_info *info, unsigned int *line_count)
{
        struct printk_record meta_only;

        /* No text buffer is supplied (NULL, size 0): meta data only. */
        prb_rec_init_rd(&meta_only, info, NULL, 0);

        return _prb_read_valid(rb, &seq, &meta_only, line_count);
}

/**
 * prb_first_valid_seq() - Get the sequence number of the oldest available
 *                         record.
 *
 * @rb: The ringbuffer to get the sequence number from.
 *
 * Public reader interface for finding the first/oldest valid sequence
 * number, i.e. the point from which a reader can start iterating the
 * ringbuffer.
 *
 * Context: Any context.
 * Return: The sequence number of the first/oldest record or, if the
 *         ringbuffer is empty, 0 is returned.
 */
u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
{
        u64 oldest = 0;

        /*
         * Ask for sequence 0 without any record buffer. On success the
         * helper has stored the sequence number it found in @oldest.
         */
        return _prb_read_valid(rb, &oldest, NULL, NULL) ? oldest : 0;
}

/**
 * prb_next_seq() - Get the sequence number after the last available record.
 *
 * @rb:  The ringbuffer to get the sequence number from.
 *
 * Public reader interface for finding the sequence number that the next
 * newest record available to readers will have.
 *
 * A reader that wants to skip everything currently available can jump to
 * the returned value. All records previous to the returned value are
 * guaranteed to have been finalized and to be (or to have been) available
 * to the reader.
 *
 * Context: Any context.
 * Return: The sequence number of the next newest (not yet available) record
 *         for readers.
 */
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
        u64 next = desc_last_finalized_seq(rb);

        /*
         * Normally the search starts right after the last finalized record.
         *
         * A value of 0 is special: because of hack#2 of the bootstrapping
         * phase, it is not known whether a record with sequence number 0
         * exists, so the search has to start at 0 itself.
         */
        if (next)
                next++;

        /*
         * The last finalized sequence number may be out of date. Walk
         * forward until no further record can be read.
         */
        for (;;) {
                if (!_prb_read_valid(rb, &next, NULL, NULL))
                        break;
                next++;
        }

        return next;
}

/**
 * prb_init() - Set up a ringbuffer on top of caller-provided buffers.
 *
 * @rb:       The ringbuffer to initialize.
 * @text_buf: The data buffer for text data.
 * @textbits: The size of @text_buf as a power-of-2 value.
 * @descs:    The descriptor buffer for ringbuffer records.
 * @descbits: The count of @descs items as a power-of-2 value.
 * @infos:    The printk_info buffer for ringbuffer records.
 *
 * Public writer interface for creating a ringbuffer at runtime from
 * externally provided buffers.
 *
 * The resulting state must match the initialization done by
 * DEFINE_PRINTKRB().
 *
 * Context: Any context.
 */
void prb_init(struct printk_ringbuffer *rb,
              char *text_buf, unsigned int textbits,
              struct prb_desc *descs, unsigned int descbits,
              struct printk_info *infos)
{
        /* The last array slots receive special initial values below. */
        struct prb_desc *last_desc = &descs[_DESCS_COUNT(descbits) - 1];
        struct printk_info *last_info = &infos[_DESCS_COUNT(descbits) - 1];

        /* Start from fully zeroed descriptor and info arrays. */
        memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(*descs));
        memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(*infos));

        /* Descriptor ring: head and tail both start at DESC0_ID(). */
        rb->desc_ring.count_bits = descbits;
        rb->desc_ring.descs = descs;
        rb->desc_ring.infos = infos;
        atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
        atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
        atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);

        /* Text data ring: head and tail both start at BLK0_LPOS(). */
        rb->text_data_ring.size_bits = textbits;
        rb->text_data_ring.data = text_buf;
        atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
        atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));

        atomic_long_set(&rb->fail, 0);

        /*
         * The last descriptor gets the DESC0_SV() state value and
         * FAILED_LPOS for both text block positions.
         * NOTE(review): presumably this marks it as carrying no text data;
         * confirm against DEFINE_PRINTKRB().
         */
        atomic_long_set(&last_desc->state_var, DESC0_SV(descbits));
        last_desc->text_blk_lpos.begin = FAILED_LPOS;
        last_desc->text_blk_lpos.next = FAILED_LPOS;

        /*
         * Seed the sequence numbers: the first info slot gets minus the
         * descriptor count (as u64), the last info slot gets 0. The order
         * matters if both refer to the same slot.
         */
        infos[0].seq = -(u64)_DESCS_COUNT(descbits);
        last_info->seq = 0;
}
EXPORT_SYMBOL_IF_KUNIT(prb_init);

/**
 * prb_record_text_space() - Query the full actual used ringbuffer space for
 *                           the text data of a reserved entry.
 *
 * @e: The successfully reserved entry to query.
 *
 * Public writer interface reporting how much space the text data of the
 * given entry really occupies in the ringbuffer.
 *
 * The result is only meaningful if @e was successfully reserved with
 * prb_reserve().
 *
 * Context: Any context.
 * Return: The size in bytes used by the text data of the associated record.
 */
unsigned int prb_record_text_space(struct prb_reserved_entry *e)
{
        /* The size is simply read back from the reserved entry. */
        unsigned int used = e->text_space;

        return used;
}




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_IO_URING_H
#define _LINUX_IO_URING_H

#include <linux/sched.h>
#include <linux/xarray.h>
#include <uapi/linux/io_uring.h>

#if defined(CONFIG_IO_URING)
/* Implemented outside of this header; only declared here. */
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
bool io_is_uring_fops(struct file *file);

/*
 * Calls __io_uring_cancel() with cancel_all == false, but only when
 * current->io_uring is set.
 */
static inline void io_uring_files_cancel(void)
{
        if (!current->io_uring)
                return;

        __io_uring_cancel(false);
}

/*
 * Calls __io_uring_cancel() with cancel_all == true, but only when
 * current->io_uring is set.
 */
static inline void io_uring_task_cancel(void)
{
        if (!current->io_uring)
                return;

        __io_uring_cancel(true);
}

/* Calls __io_uring_free() for @tsk, but only when tsk->io_uring is set. */
static inline void io_uring_free(struct task_struct *tsk)
{
        if (!tsk->io_uring)
                return;

        __io_uring_free(tsk);
}
#else
/*
 * CONFIG_IO_URING is not set: provide stubs so that callers build
 * unchanged. The void helpers do nothing, the opcode name is the empty
 * string and no file is ever reported as an io_uring file.
 */
static inline void io_uring_files_cancel(void)
{
}
static inline void io_uring_task_cancel(void)
{
}
static inline void io_uring_free(struct task_struct *tsk)
{
}
static inline const char *io_uring_get_opcode(u8 opcode)
{
        return "";
}
static inline bool io_is_uring_fops(struct file *file)
{
        return false;
}
#endif

#endif
































































































































































































   39 






















































   44 



   39 

   40 






   39 

   39 














   44 
   44 

   30 






























   43 






   44 



















































































































































   44 








   43 





   44 



   44 



   44 







   43 








   44 

   43 








    7 





   44 




   44 
   39 

   38 



   44 





   44 












   44 


   44 
    9 

    9 



   44 





   44 
   44 

   44 


   43 





   43 










































   55 



   54 

















   56 

   56 



   55 
   56 

   56 

   56 


   55 














































   55 






   55 



   55 









   56 



   56 


   54 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   61 
























   60 















   60 










   60 
   61 




   51 
   61 


   61 


   61 




   60 


   39 
   61 








   61 
















   60 

   61 




































   38 

   39 



   39 


   61 
   44 

   60 







   39 

































   44 

































   60 







   60 











   61 


   60 

   55 

   59 


   61 
   39 
   58 

   61 





   43 
   44 

   39 
   43 








   39 


























   56 





























   61 
   61 
   59 





   60 

   39 





















   39 




   39 





















   16 




































































   38 


























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/lib/vsprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */
/*
 * Wirzenius wrote this portably, Torvalds fucked it up :-)
 */

/*
 * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com>
 * - changed to provide snprintf and vsnprintf functions
 * So Feb  1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de>
 * - scnprintf and vscnprintf
 */

#include <linux/stdarg.h>
#include <linux/build_bug.h>
#include <linux/clk.h>
#include <linux/clk-provider.h>
#include <linux/errname.h>
#include <linux/module.h>        /* for KSYM_SYMBOL_LEN */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
#include <linux/dcache.h>
#include <linux/cred.h>
#include <linux/rtc.h>
#include <linux/sprintf.h>
#include <linux/time.h>
#include <linux/uuid.h>
#include <linux/of.h>
#include <net/addrconf.h>
#include <linux/siphash.h>
#include <linux/compiler.h>
#include <linux/property.h>
#include <linux/notifier.h>
#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>
#endif

#include "../mm/internal.h"        /* For the trace_print_flags arrays */

#include <asm/page.h>                /* for PAGE_SIZE */
#include <asm/byteorder.h>        /* cpu_to_le16 */
#include <linux/unaligned.h>

#include <linux/string_helpers.h>
#include "kstrtox.h"

/*
 * Disable pointer hashing if requested.
 *
 * Marked __ro_after_init and exported (GPL-only) for use outside this file.
 */
bool no_hash_pointers __ro_after_init;
EXPORT_SYMBOL_GPL(no_hash_pointers);

/*
 * Hashed pointers policy selected by "hash_pointers=..." boot param
 *
 * `auto`   - Hashed pointers enabled unless disabled by slub_debug_enabled=true
 * `always` - Hashed pointers enabled unconditionally
 * `never`  - Hashed pointers disabled unconditionally
 */
enum hash_pointers_policy {
        HASH_PTR_AUTO = 0,        /* `auto`; also the zero value */
        HASH_PTR_ALWAYS,        /* `always` */
        HASH_PTR_NEVER                /* `never` */
};
/*
 * Selected policy. Init-only data (__initdata); as a static object without
 * an initializer it starts out zeroed, i.e. as HASH_PTR_AUTO.
 */
static enum hash_pointers_policy hash_pointers_mode __initdata;

/*
 * Convert at most @max_chars characters at @startp to an unsigned long long.
 *
 * _parse_integer_fixup_radix() is given the string and the address of
 * @base and returns where the digits start; the characters it skipped
 * count against @max_chars. If they already use up the whole field,
 * nothing is converted, 0 is returned and the end pointer is placed
 * @max_chars past @startp.
 *
 * If @endp is not NULL, the position where parsing stopped is stored there.
 */
noinline
static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars)
{
        const char *digits = _parse_integer_fixup_radix(startp, &base);
        size_t prefix_chars = digits - startp;
        unsigned long long result = 0ULL;
        const char *stop;

        if (prefix_chars >= max_chars) {
                /* Field too short for prefix + digit, skip over without converting */
                stop = startp + max_chars;
        } else {
                unsigned int rv;

                rv = _parse_integer_limit(digits, base, &result, max_chars - prefix_chars);
                /*
                 * FIXME: the KSTRTOX_OVERFLOW bit is masked off and the
                 * overflow is not reported to the caller; the remaining
                 * bits of rv are used as the number of characters consumed.
                 */
                stop = digits + (rv & ~KSTRTOX_OVERFLOW);
        }

        if (endp)
                *endp = (char *)stop;

        return result;
}

/**
 * simple_strtoull - convert a string to an unsigned long long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * This function has caveats. Please use kstrtoull instead.
 */
noinline
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
        unsigned long long value;

        /* INT_MAX is passed as the character limit. */
        value = simple_strntoull(cp, endp, base, INT_MAX);

        return value;
}
EXPORT_SYMBOL(simple_strtoull);

/**
 * simple_strtoul - convert a string to an unsigned long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * This function has caveats. Please use kstrtoul instead.
 */
unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
{
        /* Parse at full width; the result is converted to unsigned long. */
        unsigned long long wide = simple_strtoull(cp, endp, base);

        return wide;
}
EXPORT_SYMBOL(simple_strtoul);

/*
 * simple_strntoul - length-limited conversion of a string to an unsigned long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 * @max_chars: The maximum number of characters to parse
 *
 * Forwards to simple_strntoull(); the unsigned long long result is converted
 * to unsigned long by the return statement, so where unsigned long is the
 * narrower type the upper bits are silently dropped.
 */
unsigned long simple_strntoul(const char *cp, char **endp, unsigned int base,
                              size_t max_chars)
{
        return simple_strntoull(cp, endp, base, max_chars);
}
EXPORT_SYMBOL(simple_strntoul);

/**
 * simple_strtol - convert a string to a signed long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * Only a single leading '-' is recognised as a sign; everything after it is
 * handed to simple_strtoul() and the result is negated.
 *
 * This function has caveats. Please use kstrtol instead.
 */
long simple_strtol(const char *cp, char **endp, unsigned int base)
{
        int negative = (*cp == '-');
        unsigned long magnitude;

        /* Step over the sign so that only the magnitude gets parsed. */
        if (negative)
                cp++;
        magnitude = simple_strtoul(cp, endp, base);

        return negative ? -magnitude : magnitude;
}
EXPORT_SYMBOL(simple_strtol);

/*
 * Signed, length-limited counterpart of simple_strntoull().
 *
 * A leading '-' counts against @max_chars. When @max_chars is 0 the sign is
 * not consumed and the zero limit is passed straight on; when the '-' is the
 * only permitted character, simple_strntoull() receives a limit of 0, which
 * it handles safely.
 */
noinline
static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars)
{
        int negative = (*cp == '-' && max_chars > 0);
        unsigned long long magnitude;

        if (negative) {
                /* The sign uses up one character of the field. */
                cp++;
                max_chars--;
        }
        magnitude = simple_strntoull(cp, endp, base, max_chars);

        return negative ? -magnitude : magnitude;
}

/**
 * simple_strtoll - convert a string to a signed long long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * Thin wrapper around simple_strntoll() with a character limit of INT_MAX,
 * i.e. practically unbounded.
 *
 * This function has caveats. Please use kstrtoll instead.
 *
 * Return: the converted value.
 */
long long simple_strtoll(const char *cp, char **endp, unsigned int base)
{
        return simple_strntoll(cp, endp, base, INT_MAX);
}
EXPORT_SYMBOL(simple_strtoll);

/*
 * Convert the run of decimal digits at *s to an int and advance *s past it.
 *
 * The first character is consumed unconditionally, so the caller must have
 * checked that *s points at a digit. No overflow check is performed.
 */
static inline int skip_atoi(const char **s)
{
        const char *p = *s;
        int value = 0;

        do {
                value = value * 10 + *p - '0';
                p++;
        } while (isdigit(*p));

        *s = p;
        return value;
}

/*
 * Decimal conversion is by far the most typical, and is used for
 * /proc and /sys data. This directly impacts e.g. top performance
 * with many processes running. We optimize it for speed by emitting
 * two characters at a time, using a 200 byte lookup table. This
 * roughly halves the number of multiplications compared to computing
 * the digits one at a time. Implementation strongly inspired by the
 * previous version, which in turn used ideas described at
 * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
 * from the author, Douglas W. Jones).
 *
 * It turns out there is precisely one 26 bit fixed-point
 * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
 * range happens to be somewhat larger (x <= 1073741898), but that's
 * irrelevant for our purpose.
 *
 * For dividing a number in the range [10^4, 10^6-1] by 100, we still
 * need a 32x32->64 bit multiply, so we simply use the same constant.
 *
 * For dividing a number in the range [100, 10^4-1] by 100, there are
 * several options. The simplest is (x * 0x147b) >> 19, which is valid
 * for all x <= 43698.
 */

/*
 * decpair[x], for 0 <= x <= 99, holds the two ASCII digits of x packed into
 * one u16: the ones digit (x % 10) in the low byte and the tens digit
 * (x / 10) in the high byte, each offset by '0' (the + 0x3030).
 * cpu_to_le16() fixes the in-memory byte order, so a u16 store of an entry
 * is expected to put the ones digit at the lower address, i.e. the pair is
 * emitted least-significant digit first.
 */
static const u16 decpair[100] = {
#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
        _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
        _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
        _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
        _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
        _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
        _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
        _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
        _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
        _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
        _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
#undef _
};

/*
 * put_dec_trunc8 - emit the decimal digits of r (r < 10^8) without leading
 * zeros, least significant digit first.
 * @buf: output position; written through a u16 pointer, so it must be
 *       suitably aligned for 2-byte stores
 * @r:   value to convert
 *
 * Digits are produced two at a time via decpair[]. The final pair is always
 * stored in full, but buf only advances by one when the remaining value is
 * below 10, so the extra byte written there is not part of the result.
 *
 * This will print a single '0' even if r == 0, since we would
 * immediately jump to out_r where two 0s would be written but only
 * one of them accounted for in buf. This is needed by ip4_string
 * below. All other callers pass a non-zero value of r.
 *
 * Returns the position just past the last digit accounted for.
 */
static noinline_for_stack
char *put_dec_trunc8(char *buf, unsigned r)
{
        unsigned q;

        /* 1 <= r < 10^8 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^8 */
        /* q = r / 100 by fixed-point multiply; r - 100*q is then r % 100 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 1 <= q < 10^6 */
        if (q < 100)
                goto out_q;

        /*  100 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 1 <= r < 10^4 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^4 */
        /* the cheaper 32-bit approximation of r / 100 suffices in this range */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
out_q:
        /* 1 <= q < 100 */
        r = q;
out_r:
        /* 1 <= r < 100 */
        *((u16 *)buf) = decpair[r];
        /* a value below 10 contributes one digit only: drop the leading '0' */
        buf += r < 10 ? 1 : 2;
        return buf;
}

#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
/*
 * put_dec_full8 - emit exactly eight decimal digits of r (0 <= r < 10^8),
 * least significant digit first, zero-filled.
 * @buf: output position; written through a u16 pointer, so it must be
 *       suitably aligned for 2-byte stores
 * @r:   value to convert
 *
 * Four decpair[] lookups produce two digits each. Returns buf + 8.
 */
static noinline_for_stack
char *put_dec_full8(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^8 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 0 <= r < 10^4 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
        buf += 2;
        return buf;
}

/*
 * put_dec - emit the decimal digits of n, least significant digit first
 * (variant for 64-bit long).
 * @buf: output position; must satisfy the 2-byte alignment needed by the
 *       put_dec_*8() helpers
 * @n:   value to convert
 *
 * The value is peeled off in chunks of eight digits: up to two zero-filled
 * chunks via put_dec_full8(), then the leading digits via put_dec_trunc8().
 * do_div() is relied upon to leave the quotient in n and yield the remainder.
 *
 * Returns the position just past the last digit written.
 */
static noinline_for_stack
char *put_dec(char *buf, unsigned long long n)
{
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* if the branch above ran: 1 <= n <= ULLONG_MAX / 10^8 (about 1.85e11) */
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n < 1e8 */
        return put_dec_trunc8(buf, n);
}

#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64

/*
 * put_dec_full4 - store exactly four decimal digits of r (0 <= r < 10^4) at
 * buf, least significant digit first, zero-filled.
 *
 * buf is written through a u16 pointer and therefore needs 2-byte alignment.
 * Nothing is returned; the caller knows that four bytes were produced.
 */
static void
put_dec_full4(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^4 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
}

/*
 * Call put_dec_full4 on x % 10000, return x / 10000.
 * The approximation x/10000 == (x * 0x346DC5D7) >> 43
 * holds for all x < 1,128,869,999.  The largest value this
 * helper will ever be asked to convert is 1,125,520,955.
 * (second call in the put_dec code, assuming n is all-ones).
 *
 * @buf: where the four digits (least significant first) are stored
 * @x:   value to split
 *
 * The remainder is obtained as x - q * 10000, so no division instruction is
 * needed.
 */
static noinline_for_stack
unsigned put_dec_helper4(char *buf, unsigned x)
{
        uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;

        put_dec_full4(buf, x - q * 10000);
        return q;
}

/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 *
 * Emits the decimal digits of n at buf, least significant digit first, and
 * returns the position just past the last digit. Values below 10^8 go
 * straight to put_dec_trunc8(). Larger values are split into four 16-bit
 * limbs (d3..d0); each group of four decimal digits is then formed from the
 * limbs' decimal coefficients plus the carry (q) of the previous group.
 */
static
char *put_dec(char *buf, unsigned long long n)
{
        uint32_t d3, d2, d1, q, h;

        if (n < 100*1000*1000)
                return put_dec_trunc8(buf, n);

        d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
        h   = (n >> 32);
        d2  = (h      ) & 0xffff;
        d3  = (h >> 16); /* implicit "& 0xffff" */

        /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
             = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
        q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
        /* digits 0..3; the helper returns the carry into the next group */
        q = put_dec_helper4(buf, q);

        q += 7671 * d3 + 9496 * d2 + 6 * d1;
        /* digits 4..7 */
        q = put_dec_helper4(buf+4, q);

        q += 4749 * d3 + 42 * d2;
        /* digits 8..11 */
        q = put_dec_helper4(buf+8, q);

        q += 281 * d3;
        buf += 12;
        if (q)
                buf = put_dec_trunc8(buf, q);
        /* nothing above digit 11: strip the zero fill of the last full group */
        else while (buf[-1] == '0')
                --buf;

        return buf;
}

#endif

/*
 * Convert passed number to decimal string.
 * Returns the length of string.  On buffer overflow, returns 0.
 *
 * @buf:   destination; no terminating NUL is stored
 * @size:  capacity of @buf in bytes
 * @num:   value to convert
 * @width: minimum length; shorter results are padded with leading spaces
 *
 * If speed is not important, use snprintf(). It's easy to read the code.
 */
int num_to_str(char *buf, int size, unsigned long long num, unsigned int width)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[sizeof(num) * 3] __aligned(2);
        unsigned int pad = 0;
        int ndigits, i;
        char *out;

        /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
        if (num > 9) {
                ndigits = put_dec(tmp, num) - tmp;
        } else {
                tmp[0] = '0' + num;
                ndigits = 1;
        }

        if (ndigits > size || width > size)
                return 0;

        if (width > ndigits)
                pad = width - ndigits;

        for (i = 0; i < pad; i++)
                buf[i] = ' ';

        /* tmp[] holds the digits least significant first: copy it backwards */
        out = buf + pad;
        for (i = ndigits; i > 0; i--)
                *out++ = tmp[i - 1];

        return ndigits + pad;
}

/*
 * Bits of printf_spec.flags, interpreted by number().
 *
 * Two values are load-bearing: ZEROPAD is added to ' ' to obtain the padding
 * character ('0' when set), and SMALL is ORed into upper-case digits/letters
 * to lower-case them. The static_asserts below pin both relationships.
 */
#define SIGN        1                /* unsigned/signed */
#define LEFT        2                /* left justified */
#define PLUS        4                /* show plus */
#define SPACE        8                /* space if plus */
#define ZEROPAD        16                /* pad with zero, must be 16 == '0' - ' ' */
#define SMALL        32                /* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL        64                /* prefix hex with "0x", octal with "0" */

static_assert(ZEROPAD == ('0' - ' '));
static_assert(SMALL == ('a' ^ 'A'));

/*
 * Classification of one piece of a format string.
 * NOTE(review): the code producing and consuming these values is outside
 * this excerpt; apart from FORMAT_STATE_NONE the meanings below are read off
 * the names and should be confirmed against the format decoder.
 */
enum format_state {
        FORMAT_STATE_NONE, /* Just a string part */
        FORMAT_STATE_NUM,               /* presumably an integer conversion */
        FORMAT_STATE_WIDTH,             /* presumably a field width taken from the arguments */
        FORMAT_STATE_PRECISION,         /* presumably a precision taken from the arguments */
        FORMAT_STATE_CHAR,              /* presumably %c */
        FORMAT_STATE_STR,               /* presumably %s */
        FORMAT_STATE_PTR,               /* presumably %p and its extensions */
        FORMAT_STATE_PERCENT_CHAR,      /* presumably a literal %% */
        FORMAT_STATE_INVALID,           /* presumably an unrecognised conversion */
};

/*
 * Decoded conversion specification, passed by value to the formatting
 * helpers. The struct is packed and asserted to occupy exactly 8 bytes
 * (1 + 1 + 2 + 4).
 *
 * A precision of -1 stands for "not given" (see error_string() and the
 * default specs); a field_width of -1 makes pointer_string() pick its
 * default width.
 */
struct printf_spec {
        unsigned char        flags;                /* flags to number() */
        unsigned char        base;                /* number base, 8, 10 or 16 only */
        short                precision;        /* # of digits/chars */
        int                field_width;        /* width of output field */
} __packed;
static_assert(sizeof(struct printf_spec) == 8);

/*
 * Upper bounds for the two numeric fields. PRECISION_MAX is the largest
 * value a 16-bit short can hold.
 * NOTE(review): the code enforcing these limits is outside this excerpt.
 */
#define FIELD_WIDTH_MAX ((1 << 23) - 1)
#define PRECISION_MAX ((1 << 15) - 1)

/*
 * number - format an integer into the output buffer according to @spec
 * @buf:  current output position
 * @end:  one past the last writable byte; characters that would land at or
 *        beyond it are counted but never stored
 * @num:  value to print; reinterpreted as signed when SIGN is set
 * @spec: flags, base, precision and field width
 *
 * Output order: leading spaces, sign, "0x"/"0" prefix, zero or space padding,
 * precision zeros, the digits, trailing spaces (left-justified case).
 *
 * Returns the output position after the number, which may lie beyond @end
 * when the text did not fit.
 */
static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
             struct printf_spec spec)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[3 * sizeof(num)] __aligned(2);
        char sign;
        char locase;
        int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
        int i;
        bool is_zero = num == 0LL;
        int field_width = spec.field_width;
        int precision = spec.precision;

        /* locase = 0 or 0x20. ORing digits or letters with 'locase'
         * produces same digits or (maybe lowercased) letters */
        locase = (spec.flags & SMALL);
        /* left justification overrides zero padding */
        if (spec.flags & LEFT)
                spec.flags &= ~ZEROPAD;
        sign = 0;
        /* a sign character, if any, uses up one column of the field */
        if (spec.flags & SIGN) {
                if ((signed long long)num < 0) {
                        sign = '-';
                        num = -(signed long long)num;
                        field_width--;
                } else if (spec.flags & PLUS) {
                        sign = '+';
                        field_width--;
                } else if (spec.flags & SPACE) {
                        sign = ' ';
                        field_width--;
                }
        }
        /* reserve room for "0x" (always) or the octal "0" (non-zero values) */
        if (need_pfx) {
                if (spec.base == 16)
                        field_width -= 2;
                else if (!is_zero)
                        field_width--;
        }

        /* generate full string in tmp[], in reverse order */
        i = 0;
        if (num < spec.base)
                tmp[i++] = hex_asc_upper[num] | locase;
        else if (spec.base != 10) { /* 8 or 16 */
                int mask = spec.base - 1;
                int shift = 3;

                if (spec.base == 16)
                        shift = 4;
                do {
                        tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
                        num >>= shift;
                } while (num);
        } else { /* base 10 */
                i = put_dec(tmp, num) - tmp;
        }

        /* printing 100 using %2d gives "100", not "00" */
        if (i > precision)
                precision = i;
        /* leading space padding */
        field_width -= precision;
        if (!(spec.flags & (ZEROPAD | LEFT))) {
                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = ' ';
                        ++buf;
                }
        }
        /* sign */
        if (sign) {
                if (buf < end)
                        *buf = sign;
                ++buf;
        }
        /* "0x" / "0" prefix */
        if (need_pfx) {
                if (spec.base == 16 || !is_zero) {
                        if (buf < end)
                                *buf = '0';
                        ++buf;
                }
                if (spec.base == 16) {
                        if (buf < end)
                                *buf = ('X' | locase);
                        ++buf;
                }
        }
        /* zero or space padding */
        if (!(spec.flags & LEFT)) {
                /* ZEROPAD == '0' - ' ', so this yields '0' or ' ' */
                char c = ' ' + (spec.flags & ZEROPAD);

                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = c;
                        ++buf;
                }
        }
        /* hmm even more zero padding? */
        while (i <= --precision) {
                if (buf < end)
                        *buf = '0';
                ++buf;
        }
        /* actual digits of result */
        while (--i >= 0) {
                if (buf < end)
                        *buf = tmp[i];
                ++buf;
        }
        /* trailing space padding */
        while (--field_width >= 0) {
                if (buf < end)
                        *buf = ' ';
                ++buf;
        }

        return buf;
}

static noinline_for_stack
char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
{
        struct printf_spec spec;

        spec.field_width = 2 + 2 * size;        /* 0x + hex */
        spec.flags = SPECIAL | SMALL | ZEROPAD;
        spec.base = 16;
        spec.precision = -1;

        return number(buf, end, num, spec);
}

/*
 * Shift the @len bytes starting at @buf to the right by @spaces positions
 * and fill the vacated gap with blanks, never touching memory at or past
 * @end. Bytes that would be pushed beyond @end are dropped.
 */
static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
{
        size_t room;

        /* nowhere to put anything */
        if (buf >= end)
                return;

        room = end - buf;
        if (spaces >= room) {
                /* the padding alone fills everything that is left */
                memset(buf, ' ', room);
                return;
        }

        if (len) {
                size_t movable = room - spaces;

                if (len < movable)
                        movable = len;
                memmove(buf + spaces, buf, movable);
        }
        memset(buf, ' ', spaces);
}

/*
 * Pad an already emitted string out to the requested field width.
 * @buf: output position just past the string
 * @n: number of characters the string occupies
 * @end: end of output buffer
 * @spec: supplies field_width and the LEFT flag
 *
 * Left-justified output gets trailing blanks; otherwise the string is
 * shifted right and the blanks are placed in front of it.
 * Returns: the output position after padding.
 */
static noinline_for_stack
char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
{
        unsigned pad;

        if (likely(n >= spec.field_width))
                return buf;

        /* we want to pad the sucker */
        pad = spec.field_width - n;

        if (spec.flags & LEFT) {
                for (; pad; pad--, buf++) {
                        if (buf < end)
                                *buf = ' ';
                }
                return buf;
        }

        move_right(buf - n, end, n, pad);
        return buf + pad;
}

/*
 * Handle string from a well known address.
 *
 * Copies @s up to its terminating NUL, or until spec.precision characters
 * have been emitted when the precision is not negative, then applies field
 * width padding. The pointer itself is not validated.
 */
static char *string_nocheck(char *buf, char *end, const char *s,
                            struct printf_spec spec)
{
        int copied = 0;
        int remaining;

        for (remaining = spec.precision; remaining != 0; remaining--) {
                char c = *s++;

                if (c == '\0')
                        break;
                /* keep counting past the end, but store only what fits */
                if (buf < end)
                        *buf = c;
                buf++;
                copied++;
        }

        return widen_string(buf, copied, end, spec);
}

/*
 * Print an error pointer by its symbolic name when errname() knows one,
 * otherwise as a signed decimal number.
 */
static char *err_ptr(char *buf, char *end, void *ptr,
                     struct printf_spec spec)
{
        int err = PTR_ERR(ptr);
        const char *sym = errname(err);

        if (!sym) {
                /*
                 * Somebody passed ERR_PTR(-1234) or some other non-existing
                 * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to
                 * printing it as its decimal representation.
                 */
                spec.base = 10;
                spec.flags |= SIGN;
                return number(buf, end, err, spec);
        }

        return string_nocheck(buf, end, sym, spec);
}

/*
 * Be careful: error messages must fit into the given buffer.
 *
 * Emits the fixed message @s. When the caller gave no precision (-1), the
 * message is capped at 2 * sizeof(void *) characters.
 */
static char *error_string(char *buf, char *end, const char *s,
                          struct printf_spec spec)
{
        if (spec.precision != -1)
                return string_nocheck(buf, end, s, spec);

        /*
         * Hard limit to avoid completely insane messages. It actually
         * works pretty well because most error messages are in
         * the many pointer format modifiers.
         */
        spec.precision = 2 * sizeof(void *);

        return string_nocheck(buf, end, s, spec);
}

/*
 * Classify a pointer before it gets dereferenced for printing.
 *
 * Returns "(null)" for NULL, "(efault)" for an address inside the first page
 * or in the error-pointer range, and NULL when nothing looks wrong.
 *
 * Do not call any complex external code here. Nested printk()/vsprintf()
 * might cause infinite loops. Failures might break printk() and would
 * be hard to debug.
 */
static const char *check_pointer_msg(const void *ptr)
{
        const char *msg = NULL;

        if (!ptr)
                msg = "(null)";
        else if ((unsigned long)ptr < PAGE_SIZE || IS_ERR_VALUE(ptr))
                msg = "(efault)";

        return msg;
}

/*
 * Validate @ptr with check_pointer_msg(). For a suspicious pointer the
 * diagnostic text is emitted at *buf, *buf is advanced past it and -EFAULT
 * is returned; otherwise nothing is written and the result is 0.
 */
static int check_pointer(char **buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        const char *msg = check_pointer_msg(ptr);

        if (!msg)
                return 0;

        *buf = error_string(*buf, end, msg, spec);
        return -EFAULT;
}

/*
 * Print the string @s after validating the pointer. For a bad pointer the
 * diagnostic emitted by check_pointer() takes the place of the string.
 */
static noinline_for_stack
char *string(char *buf, char *end, const char *s,
             struct printf_spec spec)
{
        int bad = check_pointer(&buf, end, s, spec);

        return bad ? buf : string_nocheck(buf, end, s, spec);
}

/*
 * Print the raw value of @ptr as lower-case hex. When the caller left the
 * field width unset (-1), the value is zero-padded to two hex digits per
 * byte of a pointer.
 */
static char *pointer_string(char *buf, char *end,
                            const void *ptr,
                            struct printf_spec spec)
{
        if (spec.field_width == -1) {
                spec.flags |= ZEROPAD;
                spec.field_width = 2 * sizeof(ptr);
        }
        spec.flags |= SMALL;
        spec.base = 16;

        return number(buf, end, (unsigned long int)ptr, spec);
}

/*
 * Make pointers available for printing early in the boot sequence.
 *
 * When non-zero, ptr_to_id() derives the printed value with hash_long()
 * instead of the keyed siphash, so it does not depend on ptr_key having been
 * filled.
 */
static int debug_boot_weak_hash __ro_after_init;

/*
 * Handler for the "debug_boot_weak_hash" early parameter: sets the flag and
 * logs that it did so. The parameter value @str is ignored. Always returns 0.
 */
static int __init debug_boot_weak_hash_enable(char *str)
{
        debug_boot_weak_hash = 1;
        pr_info("debug_boot_weak_hash enabled\n");
        return 0;
}
early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable);

/* Set once ptr_key holds random data; read by __ptr_to_hashval(). */
static bool filled_random_ptr_key __read_mostly;
/* Key for the siphash used to hash pointers. */
static siphash_key_t ptr_key __read_mostly;

/*
 * Notifier callback that fills ptr_key with random bytes and then publishes
 * the fact through filled_random_ptr_key. The write barrier orders the key
 * contents before the flag, so a reader that sees the flag set (and issues
 * the matching read barrier) also sees the complete key.
 *
 * All three notifier arguments are unused. Always returns NOTIFY_DONE.
 */
static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data)
{
        get_random_bytes(&ptr_key, sizeof(ptr_key));

        /* Pairs with smp_rmb() before reading ptr_key. */
        smp_wmb();
        WRITE_ONCE(filled_random_ptr_key, true);
        return NOTIFY_DONE;
}

/*
 * Initcall that hands fill_ptr_key() to execute_with_initialized_rng().
 * NOTE(review): judging by its name, that helper runs the notifier once the
 * random number generator is initialized (possibly immediately) - confirm.
 * The notifier block is static because it must outlive this function.
 * Always returns 0.
 */
static int __init vsprintf_init_hashval(void)
{
        static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key };
        execute_with_initialized_rng(&fill_ptr_key_nb);
        return 0;
}
subsys_initcall(vsprintf_init_hashval)

/*
 * Maps a pointer to a 32 bit unique identifier.
 *
 * @ptr:         pointer to hash
 * @hashval_out: receives the hash on success; left untouched on failure
 *
 * Returns 0 on success, or -EBUSY while ptr_key has not been filled yet.
 */
static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        unsigned long hashval;

        if (!READ_ONCE(filled_random_ptr_key))
                return -EBUSY;

        /* Pairs with smp_wmb() after writing ptr_key. */
        smp_rmb();

#ifdef CONFIG_64BIT
        hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
        /*
         * Keep only the low 32 bits (the upper 32 are cleared), this makes
         * explicit that we have modified the address (and 32 bits is plenty
         * for a unique ID).
         */
        hashval = hashval & 0xffffffff;
#else
        hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
#endif
        *hashval_out = hashval;
        return 0;
}

/*
 * Externally visible wrapper around __ptr_to_hashval(): stores the hash of
 * @ptr in *@hashval_out and returns 0, or returns -EBUSY (leaving
 * *@hashval_out untouched) while the hashing key is not yet available.
 */
int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        return __ptr_to_hashval(ptr, hashval_out);
}

/*
 * Print a hashed identifier in place of the pointer value.
 *
 * NULL and error pointers are printed verbatim. While the hashing key is
 * unavailable a fixed placeholder is printed, unless debug_boot_weak_hash
 * selects the unkeyed hash_long() instead.
 */
static char *ptr_to_id(char *buf, char *end, const void *ptr,
                       struct printf_spec spec)
{
        unsigned long hashval;

        /*
         * Print the real pointer value for NULL and error pointers,
         * as they are not actual addresses.
         */
        if (IS_ERR_OR_NULL(ptr))
                return pointer_string(buf, end, ptr, spec);

        if (unlikely(debug_boot_weak_hash)) {
                /* When debugging early boot use non-cryptographically secure hash. */
                hashval = hash_long((unsigned long)ptr, 32);
        } else if (__ptr_to_hashval(ptr, &hashval)) {
                const char *placeholder = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)";

                spec.field_width = 2 * sizeof(ptr);
                /* string length must be less than default_width */
                return error_string(buf, end, placeholder, spec);
        }

        return pointer_string(buf, end, (const void *)hashval, spec);
}

/*
 * Plain %p handling: the default is to _not_ leak addresses, so the pointer
 * is hashed before printing, unless no_hash_pointers is set.
 */
static char *default_pointer(char *buf, char *end, const void *ptr,
                             struct printf_spec spec)
{
        char *out;

        if (unlikely(no_hash_pointers))
                out = pointer_string(buf, end, ptr, spec);
        else
                out = ptr_to_id(buf, end, ptr, spec);

        return out;
}

int kptr_restrict __read_mostly;

/*
 * %pK handling, governed by kptr_restrict:
 *   0     - behave like plain %p (hashed)
 *   1     - real value for sufficiently privileged readers, zeros otherwise
 *   other - always zeros
 */
static noinline_for_stack
char *restricted_pointer(char *buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        const int level = kptr_restrict;

        /* Handle as %p, hash and do _not_ leak addresses. */
        if (level == 0)
                return default_pointer(buf, end, ptr, spec);

        if (level == 1) {
                const struct cred *cred;

                /*
                 * kptr_restrict==1 cannot be used in IRQ context
                 * because its test for CAP_SYSLOG would be meaningless.
                 */
                if (in_hardirq() || in_serving_softirq() || in_nmi()) {
                        if (spec.field_width == -1)
                                spec.field_width = 2 * sizeof(ptr);
                        return error_string(buf, end, "pK-error", spec);
                }

                /*
                 * Only print the real pointer value if the current
                 * process has CAP_SYSLOG and is running with the
                 * same credentials it started with. This is because
                 * access to files is checked at open() time, but %pK
                 * checks permission at read() time. We don't want to
                 * leak pointer values if a binary opens a file using
                 * %pK and then elevates privileges before reading it.
                 */
                cred = current_cred();
                if (has_capability_noaudit(current, CAP_SYSLOG) &&
                    uid_eq(cred->euid, cred->uid) &&
                    gid_eq(cred->egid, cred->gid))
                        return pointer_string(buf, end, ptr, spec);
        }

        /* Unprivileged reader, or kptr_restrict >= 2: always print 0's for %pK */
        return pointer_string(buf, end, NULL, spec);
}

/*
 * dentry_name - print the last path component(s) of a dentry
 * @buf:  current output position
 * @end:  end of the output buffer
 * @d:    dentry to describe
 * @spec: precision limits the number of characters, field width pads
 * @fmt:  format string at the conversion character; fmt[1] of '2', '3' or
 *        '4' selects that many trailing components, anything else means one
 *
 * The names are collected innermost-first while walking d_parent under
 * rcu_read_lock(), then emitted outermost-first, joined by '/'.
 * Returns the output position after the (possibly padded) name.
 */
static noinline_for_stack
char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
                  const char *fmt)
{
        const char *array[4], *s;
        const struct dentry *p;
        int depth;
        int i, n;

        switch (fmt[1]) {
                case '2': case '3': case '4':
                        depth = fmt[1] - '0';
                        break;
                default:
                        depth = 1;
        }

        rcu_read_lock();
        for (i = 0; i < depth; i++, d = p) {
                /* every dentry on the way up is validated before being read */
                if (check_pointer(&buf, end, d, spec)) {
                        rcu_read_unlock();
                        return buf;
                }

                p = READ_ONCE(d->d_parent);
                array[i] = READ_ONCE(d->d_name.name);
                /* a dentry that is its own parent ends the walk */
                if (p == d) {
                        /* unless it is the only component, it contributes an empty name */
                        if (i)
                                array[i] = "";
                        i++;
                        break;
                }
        }
        /* array[0..i-1] is filled; start with the outermost component */
        s = array[--i];
        for (n = 0; n != spec.precision; n++, buf++) {
                char c = *s++;
                if (!c) {
                        /* end of a component: finished, or move one level inwards */
                        if (!i)
                                break;
                        c = '/';
                        s = array[--i];
                }
                if (buf < end)
                        *buf = c;
        }
        rcu_read_unlock();
        return widen_string(buf, n, end, spec);
}

/*
 * Print the dentry name of an open file: validates @f, then defers to
 * dentry_name() with the file's f_path.dentry.
 */
static noinline_for_stack
char *file_dentry_name(char *buf, char *end, const struct file *f,
                        struct printf_spec spec, const char *fmt)
{
        const struct dentry *d;

        if (check_pointer(&buf, end, f, spec))
                return buf;

        d = f->f_path.dentry;
        return dentry_name(buf, end, d, spec, fmt);
}
#ifdef CONFIG_BLOCK
/*
 * bdev_name - print the name of a block device
 * @buf:  current output position
 * @end:  end of the output buffer
 * @bdev: block device to describe
 * @spec: conversion specification, applied to the disk name and to the
 *        partition number alike
 * @fmt:  format string at the conversion character (unused here)
 *
 * Emits the disk name; for a partition the partition number follows,
 * separated by 'p' when the disk name itself ends in a digit.
 * Returns the output position after the name.
 */
static noinline_for_stack
char *bdev_name(char *buf, char *end, struct block_device *bdev,
                struct printf_spec spec, const char *fmt)
{
        struct gendisk *hd;

        if (check_pointer(&buf, end, bdev, spec))
                return buf;

        hd = bdev->bd_disk;
        buf = string(buf, end, hd->disk_name, spec);
        if (bdev_is_partition(bdev)) {
                size_t name_len = strlen(hd->disk_name);

                /*
                 * Look at the last character only if there is one: with an
                 * empty name, name_len - 1 would index before the array.
                 */
                if (name_len && isdigit(hd->disk_name[name_len - 1])) {
                        if (buf < end)
                                *buf = 'p';
                        buf++;
                }
                buf = number(buf, end, bdev_partno(bdev), spec);
        }
        return buf;
}
#endif

/*
 * symbol_string - print an address as a symbol name
 * @buf:  current output position
 * @end:  end of the output buffer
 * @ptr:  address to resolve
 * @spec: conversion specification for the resulting text
 * @fmt:  format string at the conversion character
 *
 * With CONFIG_KALLSYMS the conversion characters select the lookup:
 *   "Bb"          sprint_backtrace_build_id()
 *   "B"           sprint_backtrace()
 *   "Sb" / "SRb"  sprint_symbol_build_id()
 *   "s"           sprint_symbol_no_offset()
 *   anything else sprint_symbol()
 * An 'R' in fmt[1] first passes the address through
 * __builtin_extract_return_addr().
 *
 * Without CONFIG_KALLSYMS the address is printed as a "0x"-prefixed hex
 * number of pointer width.
 */
static noinline_for_stack
char *symbol_string(char *buf, char *end, void *ptr,
                    struct printf_spec spec, const char *fmt)
{
        unsigned long value;
#ifdef CONFIG_KALLSYMS
        char sym[KSYM_SYMBOL_LEN];
#endif

        if (fmt[1] == 'R')
                ptr = __builtin_extract_return_addr(ptr);
        value = (unsigned long)ptr;

#ifdef CONFIG_KALLSYMS
        if (*fmt == 'B' && fmt[1] == 'b')
                sprint_backtrace_build_id(sym, value);
        else if (*fmt == 'B')
                sprint_backtrace(sym, value);
        else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b')))
                sprint_symbol_build_id(sym, value);
        else if (*fmt != 's')
                sprint_symbol(sym, value);
        else
                sprint_symbol_no_offset(sym, value);

        /* sym is a local buffer, so no pointer check is needed */
        return string_nocheck(buf, end, sym, spec);
#else
        return special_hex_number(buf, end, value, sizeof(void *));
#endif
}

/*
 * Ready-made conversion specifications shared by the helpers in this file.
 * Members that are not named in an initializer are zero.
 */

/* String: no field width, no precision limit. */
static const struct printf_spec default_str_spec = {
        .field_width = -1,
        .precision = -1,
};

/* Flags word: lower-case hex with a "0x" prefix, no padding. */
static const struct printf_spec default_flag_spec = {
        .base = 16,
        .precision = -1,
        .flags = SPECIAL | SMALL,
};

/* Plain decimal, no padding; SIGN is not set, so the value is unsigned. */
static const struct printf_spec default_dec_spec = {
        .base = 10,
        .precision = -1,
};

/* Decimal, zero-padded to at least two digits. */
static const struct printf_spec default_dec02_spec = {
        .base = 10,
        .field_width = 2,
        .precision = -1,
        .flags = ZEROPAD,
};

/* Decimal, zero-padded to at least four digits. */
static const struct printf_spec default_dec04_spec = {
        .base = 10,
        .field_width = 4,
        .precision = -1,
        .flags = ZEROPAD,
};

/*
 * Print "start-end" with both numbers formatted by @spec, or the single
 * number when both values are equal. Despite the name, the base comes from
 * @spec and need not be 16.
 */
static noinline_for_stack
char *hex_range(char *buf, char *end, u64 start_val, u64 end_val,
                struct printf_spec spec)
{
        buf = number(buf, end, start_val, spec);

        if (start_val != end_val) {
                if (buf < end)
                        *buf = '-';
                buf = number(buf + 1, end, end_val, spec);
        }

        return buf;
}

/*
 * resource_string - print a struct resource as "[type range attributes]"
 * @buf:  current output position
 * @end:  end of the output buffer
 * @res:  resource to describe
 * @spec: conversion specification applied to the finished text
 * @fmt:  format string at the conversion character; 'R' requests decoded
 *        attribute names, anything else the raw flags word
 *
 * The text is assembled in the local buffer sym[] and then emitted as an
 * ordinary string, so field width and precision apply to it as a whole.
 * Returns the output position after the text.
 */
static noinline_for_stack
char *resource_string(char *buf, char *end, struct resource *res,
                      struct printf_spec spec, const char *fmt)
{
#ifndef IO_RSRC_PRINTK_SIZE
#define IO_RSRC_PRINTK_SIZE        6
#endif

#ifndef MEM_RSRC_PRINTK_SIZE
#define MEM_RSRC_PRINTK_SIZE        10
#endif
        /* I/O ports: "0x"-prefixed, zero-padded lower-case hex */
        static const struct printf_spec io_spec = {
                .base = 16,
                .field_width = IO_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        /* memory: same style with a wider field */
        static const struct printf_spec mem_spec = {
                .base = 16,
                .field_width = MEM_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        /* bus numbers: two hex digits without a prefix */
        static const struct printf_spec bus_spec = {
                .base = 16,
                .field_width = 2,
                .precision = -1,
                .flags = SMALL | ZEROPAD,
        };
        /* fixed text fragments: at most 10 characters, no padding */
        static const struct printf_spec str_spec = {
                .field_width = -1,
                .precision = 10,
                .flags = LEFT,
        };

        /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8)
         * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */
#define RSRC_BUF_SIZE                ((2 * sizeof(resource_size_t)) + 4)
#define FLAG_BUF_SIZE                (2 * sizeof(res->flags))
#define DECODED_BUF_SIZE        sizeof("[mem - 64bit pref window disabled]")
#define RAW_BUF_SIZE                sizeof("[mem - flags 0x]")
        /* sized for the longer of the decoded and the raw-flags layouts */
        char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
                     2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];

        char *p = sym, *pend = sym + sizeof(sym);
        int decode = (fmt[0] == 'R') ? 1 : 0;
        const struct printf_spec *specp;

        if (check_pointer(&buf, end, res, spec))
                return buf;

        *p++ = '[';
        /* the first matching type bit picks the label and the number format */
        if (res->flags & IORESOURCE_IO) {
                p = string_nocheck(p, pend, "io  ", str_spec);
                specp = &io_spec;
        } else if (res->flags & IORESOURCE_MEM) {
                p = string_nocheck(p, pend, "mem ", str_spec);
                specp = &mem_spec;
        } else if (res->flags & IORESOURCE_IRQ) {
                p = string_nocheck(p, pend, "irq ", str_spec);
                specp = &default_dec_spec;
        } else if (res->flags & IORESOURCE_DMA) {
                p = string_nocheck(p, pend, "dma ", str_spec);
                specp = &default_dec_spec;
        } else if (res->flags & IORESOURCE_BUS) {
                p = string_nocheck(p, pend, "bus ", str_spec);
                specp = &bus_spec;
        } else {
                /* unknown type: always show the raw flags */
                p = string_nocheck(p, pend, "??? ", str_spec);
                specp = &mem_spec;
                decode = 0;
        }
        /* a decoded resource flagged IORESOURCE_UNSET shows its size instead of a range */
        if (decode && res->flags & IORESOURCE_UNSET) {
                p = string_nocheck(p, pend, "size ", str_spec);
                p = number(p, pend, resource_size(res), *specp);
        } else {
                p = hex_range(p, pend, res->start, res->end, *specp);
        }
        if (decode) {
                if (res->flags & IORESOURCE_MEM_64)
                        p = string_nocheck(p, pend, " 64bit", str_spec);
                if (res->flags & IORESOURCE_PREFETCH)
                        p = string_nocheck(p, pend, " pref", str_spec);
                if (res->flags & IORESOURCE_WINDOW)
                        p = string_nocheck(p, pend, " window", str_spec);
                if (res->flags & IORESOURCE_DISABLED)
                        p = string_nocheck(p, pend, " disabled", str_spec);
        } else {
                p = string_nocheck(p, pend, " flags ", str_spec);
                p = number(p, pend, res->flags, default_flag_spec);
        }
        *p++ = ']';
        *p = '\0';

        return string_nocheck(buf, end, sym, spec);
}

static noinline_for_stack
char *range_string(char *buf, char *end, const struct range *range,
                   struct printf_spec spec, const char *fmt)
{
        char sym[sizeof("[range 0x0123456789abcdef-0x0123456789abcdef]")];
        char *p = sym, *pend = sym + sizeof(sym);

        struct printf_spec range_spec = {
                .field_width = 2 + 2 * sizeof(range->start), /* 0x + 2 * 8 */
                .flags = SPECIAL | SMALL | ZEROPAD,
                .base = 16,
                .precision = -1,
        };

        if (check_pointer(&buf, end, range, spec))
                return buf;

        p = string_nocheck(p, pend, "[range ", default_str_spec);
        p = hex_range(p, pend, range->start, range->end, range_spec);
        *p++ = ']';
        *p = '\0';

        return string_nocheck(buf, end, sym, spec);
}

/*
 * Dump bytes from @addr as pairs of hex digits.
 *
 * The byte count comes from spec.field_width: zero prints nothing, a positive
 * width is capped at 64, and a negative width (no width given with
 * '%ph[CDN]') falls back to a single byte. fmt[1] picks the separator placed
 * between bytes: 'C' -> ':', 'D' -> '-', 'N' -> none, anything else -> ' '.
 *
 * Characters are stored only while buf < end, but buf is always advanced so
 * the return value reflects the full length.
 */
static noinline_for_stack
char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                 const char *fmt)
{
        int nbytes = 1;
        int idx;
        char sep;

        /* nothing to print */
        if (spec.field_width == 0)
                return buf;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'C')
                sep = ':';
        else if (fmt[1] == 'D')
                sep = '-';
        else if (fmt[1] == 'N')
                sep = 0;
        else
                sep = ' ';

        if (spec.field_width > 0)
                nbytes = min_t(int, spec.field_width, 64);

        for (idx = 0; idx < nbytes; idx++) {
                if (buf < end)
                        *buf = hex_asc_hi(addr[idx]);
                buf++;
                if (buf < end)
                        *buf = hex_asc_lo(addr[idx]);
                buf++;

                /* No separator after the final byte. */
                if (sep && idx + 1 < nbytes) {
                        if (buf < end)
                                *buf = sep;
                        buf++;
                }
        }

        return buf;
}

/*
 * Print a bitmap as comma-separated hex chunks of up to 32 bits, starting
 * with the chunk that holds the highest bits. The number of bits is taken
 * from spec.field_width (negative widths count as 0). @fmt is not consulted.
 */
static noinline_for_stack
char *bitmap_string(char *buf, char *end, const unsigned long *bitmap,
                    struct printf_spec spec, const char *fmt)
{
        const int CHUNKSZ = 32;
        int nbits = max_t(int, spec.field_width, 0);
        int pos, width;
        bool need_comma = false;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        /* reused to print numbers */
        spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 };

        /* Only the first (topmost) chunk may be narrower than CHUNKSZ. */
        width = nbits & (CHUNKSZ - 1);
        if (!width)
                width = CHUNKSZ;

        for (pos = ALIGN(nbits, CHUNKSZ) - CHUNKSZ; pos >= 0; pos -= CHUNKSZ) {
                u32 mask = ((1ULL << width) - 1);
                u32 val = (bitmap[pos / BITS_PER_LONG] >>
                           (pos % BITS_PER_LONG)) & mask;

                if (need_comma) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }
                need_comma = true;

                /* One hex digit per four bits of the chunk. */
                spec.field_width = DIV_ROUND_UP(width, 4);
                buf = number(buf, end, val, spec);

                width = CHUNKSZ;
        }
        return buf;
}

static noinline_for_stack
char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap,
                         struct printf_spec spec, const char *fmt)
{
        int nr_bits = max_t(int, spec.field_width, 0);
        bool first = true;
        int rbot, rtop;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) {
                if (!first) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }
                first = false;

                buf = number(buf, end, rbot, default_dec_spec);
                if (rtop == rbot + 1)
                        continue;

                if (buf < end)
                        *buf = '-';
                buf = number(++buf, end, rtop - 1, default_dec_spec);
        }
        return buf;
}

/*
 * Format six bytes at @addr as a MAC address.
 *
 * fmt[0] == 'M' places a separator between bytes; any other fmt[0] prints the
 * twelve hex digits back to back. fmt[1] == 'F' selects '-' as separator
 * (':' otherwise) and fmt[1] == 'R' walks the bytes from last to first.
 */
static noinline_for_stack
char *mac_address_string(char *buf, char *end, u8 *addr,
                         struct printf_spec spec, const char *fmt)
{
        char text[sizeof("xx:xx:xx:xx:xx:xx")];
        char *out = text;
        char sep = ':';
        bool reversed = false;
        int i;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'F')
                sep = '-';
        else if (fmt[1] == 'R')
                reversed = true;

        for (i = 0; i < 6; i++) {
                out = hex_byte_pack(out, addr[reversed ? 5 - i : i]);

                if (fmt[0] == 'M' && i != 5)
                        *out++ = sep;
        }
        *out = '\0';

        return string_nocheck(buf, end, text, spec);
}

static noinline_for_stack
char *ip4_string(char *p, const u8 *addr, const char *fmt)
{
        int i;
        bool leading_zeros = (fmt[0] == 'i');
        int index;
        int step;

        switch (fmt[2]) {
        case 'h':
#ifdef __BIG_ENDIAN
                index = 0;
                step = 1;
#else
                index = 3;
                step = -1;
#endif
                break;
        case 'l':
                index = 3;
                step = -1;
                break;
        case 'n':
        case 'b':
        default:
                index = 0;
                step = 1;
                break;
        }
        for (i = 0; i < 4; i++) {
                char temp[4] __aligned(2);        /* hold each IP quad in reverse order */
                int digits = put_dec_trunc8(temp, addr[index]) - temp;
                if (leading_zeros) {
                        if (digits < 3)
                                *p++ = '0';
                        if (digits < 2)
                                *p++ = '0';
                }
                /* reverse the digits in the quad */
                while (digits--)
                        *p++ = temp[digits];
                if (i < 3)
                        *p++ = '.';
                index += step;
        }
        *p = '\0';

        return p;
}

/*
 * Write the 16-byte IPv6 address at @addr into @p in compressed text form:
 * 16-bit groups in hex without leading zeros, with the longest run of two or
 * more zero groups replaced by "::". When the address is v4-mapped or ISATAP
 * only the first six groups are treated this way and the last four bytes are
 * appended as a dotted quad. Writes are not bounds-checked; the caller
 * supplies the buffer. Returns a pointer to the terminating NUL.
 */
static noinline_for_stack
char *ip6_compressed_string(char *p, const char *addr)
{
        int i, j, range;
        unsigned char zerolength[8];
        int longest = 1;
        int colonpos = -1;
        u16 word;
        u8 hi, lo;
        bool needcolon = false;
        bool useIPv4;
        struct in6_addr in6;

        /* Work on a local copy of the address bytes. */
        memcpy(&in6, addr, sizeof(struct in6_addr));

        useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6);

        memset(zerolength, 0, sizeof(zerolength));

        /* Number of 16-bit groups printed in hex; the rest is the IPv4 tail. */
        if (useIPv4)
                range = 6;
        else
                range = 8;

        /* find position of longest 0 run */
        /* zerolength[i] becomes the count of consecutive zero groups from i. */
        for (i = 0; i < range; i++) {
                for (j = i; j < range; j++) {
                        if (in6.s6_addr16[j] != 0)
                                break;
                        zerolength[i]++;
                }
        }
        /* Strict '>' keeps the first run when several share the same length. */
        for (i = 0; i < range; i++) {
                if (zerolength[i] > longest) {
                        longest = zerolength[i];
                        colonpos = i;
                }
        }
        if (longest == 1)                /* don't compress a single 0 */
                colonpos = -1;

        /* emit address */
        for (i = 0; i < range; i++) {
                if (i == colonpos) {
                        /*
                         * Emit "::" in total: one ':' is still owed after the
                         * previous group (or the run starts the address), the
                         * second one is written unconditionally.
                         */
                        if (needcolon || i == 0)
                                *p++ = ':';
                        *p++ = ':';
                        needcolon = false;
                        /* Skip the rest of the run; the loop's i++ adds one. */
                        i += longest - 1;
                        continue;
                }
                if (needcolon) {
                        *p++ = ':';
                        needcolon = false;
                }
                /* hex u16 without leading 0s */
                word = ntohs(in6.s6_addr16[i]);
                hi = word >> 8;
                lo = word & 0xff;
                if (hi) {
                        if (hi > 0x0f)
                                p = hex_byte_pack(p, hi);
                        else
                                *p++ = hex_asc_lo(hi);
                        p = hex_byte_pack(p, lo);
                }
                else if (lo > 0x0f)
                        p = hex_byte_pack(p, lo);
                else
                        *p++ = hex_asc_lo(lo);
                needcolon = true;
        }

        /* Bytes 12..15 are printed as a dotted quad for v4-mapped/ISATAP. */
        if (useIPv4) {
                if (needcolon)
                        *p++ = ':';
                p = ip4_string(p, &in6.s6_addr[12], "I4");
        }
        *p = '\0';

        return p;
}

/*
 * Write the 16 bytes at @addr into @p as eight groups of four hex digits.
 * Groups are separated by ':' only when fmt[0] == 'I'. No bounds checking is
 * done; returns a pointer to the terminating NUL.
 */
static noinline_for_stack
char *ip6_string(char *p, const char *addr, const char *fmt)
{
        int group;

        for (group = 0; group < 8; group++) {
                p = hex_byte_pack(p, addr[2 * group]);
                p = hex_byte_pack(p, addr[2 * group + 1]);
                if (fmt[0] == 'I' && group != 7)
                        *p++ = ':';
        }
        *p = '\0';

        return p;
}

static noinline_for_stack
char *ip6_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")];

        if (fmt[0] == 'I' && fmt[2] == 'c')
                ip6_compressed_string(ip6_addr, addr);
        else
                ip6_string(ip6_addr, addr, fmt);

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Format a raw IPv4 address with ip4_string() into a scratch buffer, then
 * copy that text to the output under @spec.
 */
static noinline_for_stack
char *ip4_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char text[sizeof("255.255.255.255")];

        ip4_string(text, addr, fmt);
        return string_nocheck(buf, end, text, spec);
}

/*
 * Format a struct sockaddr_in6.
 *
 * The letters following the first two format characters select extras:
 * 'p' appends ":<port>", 'f' appends "/<flowinfo>", 's' appends
 * "%<scope id>" and 'c' requests the compressed address form (honoured only
 * when fmt[0] == 'I'). If any of p/f/s is present the address is wrapped in
 * square brackets.
 *
 * NOTE(review): the single-character stores below are not checked against
 * pend; they appear to rely on the scratch buffer being sized for the widest
 * output - confirm that @spec cannot widen the numbers beyond that.
 */
static noinline_for_stack
char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa,
                         struct printf_spec spec, const char *fmt)
{
        bool want_port = false, want_scope = false;
        bool want_flow = false, want_compressed = false;
        bool bracketed;
        char text[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") +
                  sizeof(":12345") + sizeof("/123456789") +
                  sizeof("%1234567890")];
        char *out = text, *pend = text + sizeof(text);
        const u8 *addr = (const u8 *) &sa->sin6_addr;
        char fmt6[2] = { fmt[0], '6' };

        /* Option letters start at the third format character. */
        for (fmt += 2; isalpha(*fmt); fmt++) {
                switch (*fmt) {
                case 'p':
                        want_port = true;
                        break;
                case 'f':
                        want_flow = true;
                        break;
                case 's':
                        want_scope = true;
                        break;
                case 'c':
                        want_compressed = true;
                        break;
                }
        }

        bracketed = want_port || want_scope || want_flow;

        if (bracketed)
                *out++ = '[';

        if (fmt6[0] == 'I' && want_compressed)
                out = ip6_compressed_string(out, addr);
        else
                out = ip6_string(out, addr, fmt6);

        if (bracketed)
                *out++ = ']';

        if (want_port) {
                *out++ = ':';
                out = number(out, pend, ntohs(sa->sin6_port), spec);
        }
        if (want_flow) {
                *out++ = '/';
                out = number(out, pend, ntohl(sa->sin6_flowinfo &
                                              IPV6_FLOWINFO_MASK), spec);
        }
        if (want_scope) {
                *out++ = '%';
                out = number(out, pend, sa->sin6_scope_id, spec);
        }
        *out = '\0';

        return string_nocheck(buf, end, text, spec);
}

/*
 * Format a struct sockaddr_in.
 *
 * The letters following the first two format characters are scanned: 'p'
 * appends ":<port>", and 'h', 'l', 'n' or 'b' is forwarded to ip4_string() as
 * its byte-order selector (the last one seen wins).
 */
static noinline_for_stack
char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa,
                         struct printf_spec spec, const char *fmt)
{
        char text[sizeof("255.255.255.255") + sizeof(":12345")];
        char *out;
        char *pend = text + sizeof(text);
        const u8 *addr = (const u8 *) &sa->sin_addr.s_addr;
        char fmt4[3] = { fmt[0], '4', 0 };
        bool want_port = false;

        /* Option letters start at the third format character. */
        for (fmt += 2; isalpha(*fmt); fmt++) {
                if (*fmt == 'p')
                        want_port = true;
                else if (*fmt == 'h' || *fmt == 'l' ||
                         *fmt == 'n' || *fmt == 'b')
                        fmt4[2] = *fmt;
        }

        out = ip4_string(text, addr, fmt4);
        if (want_port) {
                *out++ = ':';
                out = number(out, pend, ntohs(sa->sin_port), spec);
        }
        *out = '\0';

        return string_nocheck(buf, end, text, spec);
}

/*
 * Entry point for the IP address formats. After validating @ptr, fmt[1]
 * selects the handler: '6' and '4' take raw addresses, 'S' takes a sockaddr
 * whose sa_family picks the IPv4 or IPv6 variant ("(einval)" for any other
 * family). Unknown selectors print "(%pi?)" or "(%pI?)" depending on fmt[0].
 */
static noinline_for_stack
char *ip_addr_string(char *buf, char *end, const void *ptr,
                     struct printf_spec spec, const char *fmt)
{
        if (check_pointer(&buf, end, ptr, spec))
                return buf;

        if (fmt[1] == '6')
                return ip6_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == '4')
                return ip4_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == 'S') {
                const union {
                        struct sockaddr                raw;
                        struct sockaddr_in        v4;
                        struct sockaddr_in6        v6;
                } *sa = ptr;

                if (sa->raw.sa_family == AF_INET)
                        return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt);
                if (sa->raw.sa_family == AF_INET6)
                        return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt);
                return error_string(buf, end, "(einval)", spec);
        }

        return error_string(buf, end,
                            fmt[0] == 'i' ? "(%pi?)" : "(%pI?)", spec);
}

/*
 * Print bytes from @addr through string_escape_mem().
 *
 * spec.field_width gives the number of input bytes (zero prints nothing, a
 * negative width means one byte). The letters following fmt[0] accumulate
 * escape flags - a, c, h, n, o, p, s - and scanning stops at the first other
 * character. With no flag letters ESCAPE_ANY_NP is used.
 */
static noinline_for_stack
char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                     const char *fmt)
{
        unsigned int flags = 0;
        bool more = true;
        int pos;
        int len;

        if (spec.field_width == 0)
                return buf;                                /* nothing to print */

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        for (pos = 1; more; pos++) {
                switch (fmt[pos]) {
                case 'a':
                        flags |= ESCAPE_ANY;
                        break;
                case 'c':
                        flags |= ESCAPE_SPECIAL;
                        break;
                case 'h':
                        flags |= ESCAPE_HEX;
                        break;
                case 'n':
                        flags |= ESCAPE_NULL;
                        break;
                case 'o':
                        flags |= ESCAPE_OCTAL;
                        break;
                case 'p':
                        flags |= ESCAPE_NP;
                        break;
                case 's':
                        flags |= ESCAPE_SPACE;
                        break;
                default:
                        more = false;
                        break;
                }
        }

        if (!flags)
                flags = ESCAPE_ANY_NP;

        if (spec.field_width < 0)
                len = 1;
        else
                len = spec.field_width;

        /*
         * string_escape_mem() writes as many characters as it can to
         * the given buffer, and returns the total size of the output
         * had the buffer been big enough.
         */
        buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL);

        return buf;
}

__diag_push();
__diag_ignore(GCC, all, "-Wsuggest-attribute=format",
              "Not a valid __printf() conversion candidate.");
/*
 * Expand a nested struct va_format (a format string plus its argument list)
 * into the output. The argument list is copied first so the one stored in
 * @va_fmt is left untouched. buf is advanced by vsnprintf()'s return value.
 */
static char *va_format(char *buf, char *end, struct va_format *va_fmt,
                       struct printf_spec spec)
{
        va_list args;
        int written;

        if (check_pointer(&buf, end, va_fmt, spec))
                return buf;

        va_copy(args, *va_fmt->va);
        written = vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, args);
        va_end(args);

        return buf + written;
}
__diag_pop();

/*
 * Format 16 bytes at @addr as a UUID string with '-' after the 4th, 6th, 8th
 * and 10th printed byte.
 *
 * fmt[1] selects byte order table and case: 'l' uses guid_index, 'L' uses
 * guid_index with hex_byte_pack_upper(), 'B' keeps uuid_index with
 * hex_byte_pack_upper(); anything else keeps uuid_index with hex_byte_pack().
 */
static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
                  struct printf_spec spec, const char *fmt)
{
        char text[UUID_STRING_LEN + 1];
        char *out = text;
        const u8 *order = uuid_index;
        bool upper = false;
        int i;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        switch (fmt[1]) {
        case 'L':
                upper = true;
                order = guid_index;
                break;
        case 'l':
                order = guid_index;
                break;
        case 'B':
                upper = true;
                break;
        default:
                break;
        }

        for (i = 0; i < 16; i++) {
                const u8 byte = addr[order[i]];

                out = upper ? hex_byte_pack_upper(out, byte)
                            : hex_byte_pack(out, byte);

                if (i == 3 || i == 5 || i == 7 || i == 9)
                        *out++ = '-';
        }

        *out = 0;

        return string_nocheck(buf, end, text, spec);
}

/*
 * Print a netdev value in hex via special_hex_number(). Only fmt[1] == 'F'
 * (a netdev_features_t read through @addr) is supported; any other selector
 * yields the "(%pN?)" error string.
 */
static noinline_for_stack
char *netdev_bits(char *buf, char *end, const void *addr,
                  struct printf_spec spec,  const char *fmt)
{
        unsigned long long value;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] != 'F')
                return error_string(buf, end, "(%pN?)", spec);

        value = *(const netdev_features_t *)addr;

        return special_hex_number(buf, end, value, sizeof(netdev_features_t));
}

/*
 * Format a 32-bit four-character code as its four characters followed by the
 * value in hex inside parentheses.
 *
 * fmt[1] must be 'c'. fmt[2] selects how the value is interpreted:
 *   'h' - as read, byte-swapped if fmt[3] == 'R'
 *   'l' - passed through cpu_to_le32()
 *   'b' - passed through cpu_to_be32()
 *   'c' - pixel format: bit 31 is masked off and the remainder byte-swapped
 *         for the character part, and "big-endian" or "little-endian" is
 *         added according to bit 31
 * Any other selector yields the "(%p4?)" error string.
 */
static noinline_for_stack
char *fourcc_string(char *buf, char *end, const u32 *fourcc,
                    struct printf_spec spec, const char *fmt)
{
        char text[sizeof("0123 little-endian (0x01234567)")];
        char *out = text;
        bool is_pixel_fmt = false;
        u32 code, chars;
        int shift;

        if (fmt[1] != 'c')
                return error_string(buf, end, "(%p4?)", spec);

        if (check_pointer(&buf, end, fourcc, spec))
                return buf;

        code = get_unaligned(fourcc);
        switch (fmt[2]) {
        case 'h':
                if (fmt[3] == 'R')
                        code = swab32(code);
                break;
        case 'l':
                code = (__force u32)cpu_to_le32(code);
                break;
        case 'b':
                code = (__force u32)cpu_to_be32(code);
                break;
        case 'c':
                /* Pixel formats are printed LSB-first */
                is_pixel_fmt = true;
                break;
        default:
                return error_string(buf, end, "(%p4?)", spec);
        }

        chars = code;
        if (is_pixel_fmt)
                chars = swab32(code & ~BIT(31));

        /* Walk the four bytes from the most significant one down. */
        for (shift = 24; shift >= 0; shift -= 8) {
                unsigned char c = chars >> shift;

                /* Print non-control ASCII characters as-is, dot otherwise */
                *out++ = isascii(c) && isprint(c) ? c : '.';
        }

        if (is_pixel_fmt) {
                const char *endian = code & BIT(31) ? "big-endian"
                                                    : "little-endian";

                *out++ = ' ';
                strcpy(out, endian);
                out += strlen(endian);
        }

        *out++ = ' ';
        *out++ = '(';
        /* Leave room for the closing ')' and the NUL. */
        out = special_hex_number(out, text + sizeof(text) - 2, code, sizeof(u32));
        *out++ = ')';
        *out = '\0';

        return string(buf, end, text, spec);
}

/*
 * Print an address value read through @addr in hex via special_hex_number().
 * fmt[1] == 'd' reads a dma_addr_t; every other selector (including 'p')
 * reads a phys_addr_t.
 */
static noinline_for_stack
char *address_val(char *buf, char *end, const void *addr,
                  struct printf_spec spec, const char *fmt)
{
        unsigned long long value;
        int nbytes;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'd') {
                value = *(const dma_addr_t *)addr;
                nbytes = sizeof(dma_addr_t);
        } else {
                value = *(const phys_addr_t *)addr;
                nbytes = sizeof(phys_addr_t);
        }

        return special_hex_number(buf, end, value, nbytes);
}

/*
 * Print the date part of @tm as year-month-day using default_dec04_spec for
 * the year and default_dec02_spec for month and day. Unless @r (raw) is set,
 * 1900 is added to tm_year and 1 to tm_mon before printing.
 */
static noinline_for_stack
char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        const int year = r ? tm->tm_year : tm->tm_year + 1900;
        const int month = r ? tm->tm_mon : tm->tm_mon + 1;

        buf = number(buf, end, year, default_dec04_spec);

        if (buf < end)
                *buf = '-';
        buf++;
        buf = number(buf, end, month, default_dec02_spec);

        if (buf < end)
                *buf = '-';
        buf++;
        buf = number(buf, end, tm->tm_mday, default_dec02_spec);

        return buf;
}

/*
 * Print the time part of @tm as hour:minute:second, each field formatted
 * with default_dec02_spec. @r is accepted for symmetry with date_str() but
 * is not used here.
 */
static noinline_for_stack
char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        const int parts[3] = { tm->tm_hour, tm->tm_min, tm->tm_sec };
        int i;

        for (i = 0; i < 3; i++) {
                /* ':' goes between fields, not before the first one. */
                if (i) {
                        if (buf < end)
                                *buf = ':';
                        buf++;
                }
                buf = number(buf, end, parts[i], default_dec02_spec);
        }

        return buf;
}

/*
 * Format a struct rtc_time as date, time, or both.
 *
 * fmt[2] may be 'd' (date only) or 't' (time only). The characters after
 * that are scanned until one is unrecognised: 'r' requests raw values (passed
 * on to date_str()/time_str()) and 's' replaces the 'T' between date and time
 * with a space.
 */
static noinline_for_stack
char *rtc_str(char *buf, char *end, const struct rtc_time *tm,
              struct printf_spec spec, const char *fmt)
{
        bool show_date = true, show_time = true;
        bool raw = false, use_t_separator = true;
        bool more = true;
        int pos = 2;

        if (check_pointer(&buf, end, tm, spec))
                return buf;

        if (fmt[pos] == 'd') {
                show_time = false;
                pos++;
        } else if (fmt[pos] == 't') {
                show_date = false;
                pos++;
        }

        while (more) {
                switch (fmt[pos++]) {
                case 'r':
                        raw = true;
                        break;
                case 's':
                        use_t_separator = false;
                        break;
                default:
                        more = false;
                        break;
                }
        }

        if (show_date) {
                buf = date_str(buf, end, tm, raw);

                /* The separator is only needed when both halves are shown. */
                if (show_time) {
                        if (buf < end)
                                *buf = use_t_separator ? 'T' : ' ';
                        buf++;
                }
        }
        if (show_time)
                buf = time_str(buf, end, tm, raw);

        return buf;
}

static noinline_for_stack
char *time64_str(char *buf, char *end, const time64_t time,
                 struct printf_spec spec, const char *fmt)
{
        struct rtc_time rtc_time;
        struct tm tm;

        time64_to_tm(time, 0, &tm);

        rtc_time.tm_sec = tm.tm_sec;
        rtc_time.tm_min = tm.tm_min;
        rtc_time.tm_hour = tm.tm_hour;
        rtc_time.tm_mday = tm.tm_mday;
        rtc_time.tm_mon = tm.tm_mon;
        rtc_time.tm_year = tm.tm_year;
        rtc_time.tm_wday = tm.tm_wday;
        rtc_time.tm_yday = tm.tm_yday;

        rtc_time.tm_isdst = 0;

        return rtc_str(buf, end, &rtc_time, spec, fmt);
}

/*
 * Dispatch the time/date formats on fmt[1]: 'R' formats a struct rtc_time,
 * 'T' formats a time64_t read through @ptr, and any other selector yields
 * the "(%pt?)" error string.
 */
static noinline_for_stack
char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec,
                    const char *fmt)
{
        switch (fmt[1]) {
        case 'R':
                return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt);
        case 'T':
                /*
                 * ptr is dereferenced right here to fetch the value, so it
                 * must be validated before the load rather than in a helper.
                 */
                if (check_pointer(&buf, end, ptr, spec))
                        return buf;
                return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt);
        default:
                return error_string(buf, end, "(%pt?)", spec);
        }
}

/*
 * Format a struct clk pointer. Without CONFIG_HAVE_CLK the "(%pC?)" error
 * string is printed. Otherwise, after validating @clk, the result of
 * __clk_get_name() is printed when CONFIG_COMMON_CLK is set and
 * ptr_to_id() is used when it is not. @fmt is not consulted.
 */
static noinline_for_stack
char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
            const char *fmt)
{
        if (IS_ENABLED(CONFIG_HAVE_CLK)) {
                if (check_pointer(&buf, end, clk, spec))
                        return buf;

#ifdef CONFIG_COMMON_CLK
                return string(buf, end, __clk_get_name(clk), spec);
#else
                return ptr_to_id(buf, end, clk, spec);
#endif
        }

        return error_string(buf, end, "(%pC?)", spec);
}

/*
 * Decode @flags against a table of named masks terminated by an entry with a
 * NULL name. Each entry whose mask is fully contained in the remaining flags
 * is printed by name and cleared, with '|' written while bits remain. Bits
 * left over once the table is exhausted are printed as a number using
 * default_flag_spec.
 */
static
char *format_flags(char *buf, char *end, unsigned long flags,
                                        const struct trace_print_flags *names)
{
        while (flags && names->name) {
                const unsigned long mask = names->mask;

                if ((flags & mask) == mask) {
                        buf = string(buf, end, names->name, default_str_spec);

                        flags &= ~mask;
                        if (flags) {
                                if (buf < end)
                                        *buf = '|';
                                buf++;
                        }
                }
                names++;
        }

        if (flags)
                buf = number(buf, end, flags, default_flag_spec);

        return buf;
}

/*
 * Describes one numeric field packed into the page flags word, as consumed
 * by format_page_flags().
 */
struct page_flags_fields {
        int width;                        /* zero means the field is not defined and is skipped */
        int shift;                        /* right shift applied to the flags word */
        int mask;                        /* mask applied after shifting */
        const struct printf_spec *spec;        /* number format for the value */
        const char *name;                /* label printed before '=' */
};

/*
 * Fields printed by format_page_flags() after the named flags, in this
 * order. The first three are printed with default_dec_spec, the last two
 * with default_flag_spec.
 */
static const struct page_flags_fields pff[] = {
        {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK,
         &default_dec_spec, "section"},
        {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK,
         &default_dec_spec, "node"},
        {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK,
         &default_dec_spec, "zone"},
        {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK,
         &default_flag_spec, "lastcpupid"},
        {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK,
         &default_flag_spec, "kasantag"},
};

static
char *format_page_flags(char *buf, char *end, unsigned long flags)
{
        unsigned long main_flags = flags & PAGEFLAGS_MASK;
        bool append = false;
        int i;

        buf = number(buf, end, flags, default_flag_spec);
        if (buf < end)
                *buf = '(';
        buf++;

        /* Page flags from the main area. */
        if (main_flags) {
                buf = format_flags(buf, end, main_flags, pageflag_names);
                append = true;
        }

        /* Page flags from the fields area */
        for (i = 0; i < ARRAY_SIZE(pff); i++) {
                /* Skip undefined fields. */
                if (!pff[i].width)
                        continue;

                /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */
                if (append) {
                        if (buf < end)
                                *buf = '|';
                        buf++;
                }

                buf = string(buf, end, pff[i].name, default_str_spec);
                if (buf < end)
                        *buf = '=';
                buf++;
                buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask,
                             *pff[i].spec);

                append = true;
        }
        if (buf < end)
                *buf = ')';
        buf++;

        return buf;
}

/*
 * Decode a flags word read through @flags_ptr. fmt[1] selects the kind:
 * 'p' page flags (format_page_flags()), 'v' an unsigned long decoded with
 * vmaflag_names, 'g' a gfp_t decoded with gfpflag_names. Any other selector
 * yields the "(%pG?)" error string.
 */
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
                   struct printf_spec spec, const char *fmt)
{
        if (check_pointer(&buf, end, flags_ptr, spec))
                return buf;

        switch (fmt[1]) {
        case 'p':
                return format_page_flags(buf, end, *(unsigned long *)flags_ptr);
        case 'v':
                return format_flags(buf, end, *(unsigned long *)flags_ptr,
                                    vmaflag_names);
        case 'g':
                return format_flags(buf, end,
                                    (__force unsigned long)(*(gfp_t *)flags_ptr),
                                    gfpflag_names);
        default:
                return error_string(buf, end, "(%pG?)", spec);
        }
}

/*
 * Print the full name of @fwnode by emitting, for every ancestor from the
 * most distant one down to @fwnode itself, the node's name prefix followed
 * by its name.
 */
static noinline_for_stack
char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf,
                              char *end)
{
        int level = fwnode_count_parents(fwnode);

        /* Loop starting from the root node to the current node. */
        while (level >= 0) {
                struct fwnode_handle *node = fwnode;

                /*
                 * Only get a reference for other nodes (i.e. parent nodes).
                 * fwnode refcount may be 0 here.
                 */
                if (level)
                        node = fwnode_get_nth_parent(fwnode, level);

                buf = string(buf, end, fwnode_get_name_prefix(node),
                             default_str_spec);
                buf = string(buf, end, fwnode_get_name(node),
                             default_str_spec);

                /* Drop the reference taken on a parent above. */
                if (level)
                        fwnode_handle_put(node);

                level--;
        }

        return buf;
}

/*
 * Format a struct device_node. fmt[0] must be 'F' and CONFIG_OF must be
 * enabled, otherwise an error string is printed. The characters after 'F'
 * select what to print, one item per character, with ':' between items:
 *   f - full name          n - name up to (not including) '@'
 *   p - phandle            P - node name, or "/" when the name is at most
 *                              one character long
 *   F - four flag chars    c - first "compatible" string
 *   C - all "compatible" strings, quoted and comma-separated
 * If there is no selector, or the first character after 'F' is not one of
 * these, "f" is used. The combined output is finally padded to the caller's
 * field width by widen_string().
 */
static noinline_for_stack
char *device_node_string(char *buf, char *end, struct device_node *dn,
                         struct printf_spec spec, const char *fmt)
{
        char tbuf[sizeof("xxxx") + 1];
        const char *p;
        int ret;
        char *buf_start = buf;
        struct property *prop;
        bool has_mult, pass;

        /* Individual pieces are printed unpadded; padding is applied once at the end. */
        struct printf_spec str_spec = spec;
        str_spec.field_width = -1;

        if (fmt[0] != 'F')
                return error_string(buf, end, "(%pO?)", spec);

        if (!IS_ENABLED(CONFIG_OF))
                return error_string(buf, end, "(%pOF?)", spec);

        if (check_pointer(&buf, end, dn, spec))
                return buf;

        /* simple case without any more format specifiers */
        fmt++;
        if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0)
                fmt = "f";

        /* pass is false only for the first item, so ':' goes between items. */
        for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) {
                int precision;
                if (pass) {
                        if (buf < end)
                                *buf = ':';
                        buf++;
                }

                switch (*fmt) {
                case 'f':        /* full_name */
                        buf = fwnode_full_name_string(of_fwnode_handle(dn), buf,
                                                      end);
                        break;
                case 'n':        /* name */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /* Temporarily limit the precision so printing stops at '@'. */
                        precision = str_spec.precision;
                        str_spec.precision = strchrnul(p, '@') - p;
                        buf = string(buf, end, p, str_spec);
                        str_spec.precision = precision;
                        break;
                case 'p':        /* phandle */
                        buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec);
                        break;
                case 'P':        /* path-spec */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        if (!p[1])
                                p = "/";
                        buf = string(buf, end, p, str_spec);
                        break;
                case 'F':        /* flags */
                        /* One character per flag, '-' when the flag is clear. */
                        tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-';
                        tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-';
                        tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-';
                        tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-';
                        tbuf[4] = 0;
                        buf = string_nocheck(buf, end, tbuf, str_spec);
                        break;
                case 'c':        /* major compatible string */
                        /* Nothing is printed when the property cannot be read. */
                        ret = of_property_read_string(dn, "compatible", &p);
                        if (!ret)
                                buf = string(buf, end, p, str_spec);
                        break;
                case 'C':        /* full compatible string */
                        has_mult = false;
                        of_property_for_each_string(dn, "compatible", prop, p) {
                                if (has_mult)
                                        buf = string_nocheck(buf, end, ",", str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);
                                buf = string(buf, end, p, str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);

                                has_mult = true;
                        }
                        break;
                default:
                        break;
                }
        }

        return widen_string(buf, buf - buf_start, end, spec);
}

static noinline_for_stack
char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
                    struct printf_spec spec, const char *fmt)
{
        struct printf_spec str_spec = spec;
        char *buf_start = buf;

        str_spec.field_width = -1;

        if (*fmt != 'w')
                return error_string(buf, end, "(%pf?)", spec);

        if (check_pointer(&buf, end, fwnode, spec))
                return buf;

        fmt++;

        switch (*fmt) {
        case 'P':        /* name */
                buf = string(buf, end, fwnode_get_name(fwnode), str_spec);
                break;
        case 'f':        /* full_name */
        default:
                buf = fwnode_full_name_string(fwnode, buf, end);
                break;
        }

        return widen_string(buf, buf - buf_start, end, spec);
}

/*
 * Dispatch for the 'R'/'r' pointer extensions: a format that starts with
 * "ra" is passed to range_string(), everything else to resource_string().
 */
static noinline_for_stack
char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr,
                        struct printf_spec spec)
{
        const bool wants_range = fmt[0] == 'r' && fmt[1] == 'a';

        if (!wants_range)
                return resource_string(buf, end, ptr, spec, fmt);

        return range_string(buf, end, ptr, spec, fmt);
}

/*
 * Settle the final value of no_hash_pointers from hash_pointers_mode.
 *
 * HASH_PTR_ALWAYS clears it and HASH_PTR_NEVER sets it; HASH_PTR_AUTO, or
 * any other value, follows @slub_debug.  When hashing ends up disabled a
 * warning banner is printed.
 */
void __init hash_pointers_finalize(bool slub_debug)
{
        if (hash_pointers_mode == HASH_PTR_ALWAYS)
                no_hash_pointers = false;
        else if (hash_pointers_mode == HASH_PTR_NEVER)
                no_hash_pointers = true;
        else
                no_hash_pointers = slub_debug;

        if (no_hash_pointers) {
                pr_warn("**********************************************************\n");
                pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** This system shows unhashed kernel memory addresses   **\n");
                pr_warn("** via the console, logs, and other interfaces. This    **\n");
                pr_warn("** might reduce the security of your system.            **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** If you see this message and you are not debugging    **\n");
                pr_warn("** the kernel, report this immediately to your system   **\n");
                pr_warn("** administrator!                                       **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** Use hash_pointers=always to force this mode off      **\n");
                pr_warn("**                                                      **\n");
                pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
                pr_warn("**********************************************************\n");
        }
}

/*
 * Handler for the "hash_pointers" early parameter.
 *
 * Recognises values beginning with "auto", "never" or "always" (only that
 * many leading characters are compared).  A missing or unrecognised value
 * selects HASH_PTR_AUTO with a warning.  Always returns 0.
 */
static int __init hash_pointers_mode_parse(char *str)
{
        if (!str) {
                pr_warn("Hash pointers mode empty; falling back to auto.\n");
                hash_pointers_mode = HASH_PTR_AUTO;
                return 0;
        }

        if (!strncmp(str, "auto", 4)) {
                pr_info("Hash pointers mode set to auto.\n");
                hash_pointers_mode = HASH_PTR_AUTO;
                return 0;
        }

        if (!strncmp(str, "never", 5)) {
                pr_info("Hash pointers mode set to never.\n");
                hash_pointers_mode = HASH_PTR_NEVER;
                return 0;
        }

        if (!strncmp(str, "always", 6)) {
                pr_info("Hash pointers mode set to always.\n");
                hash_pointers_mode = HASH_PTR_ALWAYS;
                return 0;
        }

        pr_warn("Unknown hash_pointers mode '%s' specified; assuming auto.\n", str);
        hash_pointers_mode = HASH_PTR_AUTO;
        return 0;
}
early_param("hash_pointers", hash_pointers_mode_parse);

/*
 * Handler for the "no_hash_pointers" early parameter.  The parameter's own
 * value (@str) is ignored; the call is forwarded to the "hash_pointers"
 * parser with the fixed argument "never".
 */
static int __init no_hash_pointers_enable(char *str)
{
        return hash_pointers_mode_parse("never");
}
early_param("no_hash_pointers", no_hash_pointers_enable);

/*
 * Show a '%p' thing.  A kernel extension is that the '%p' is followed
 * by an extra set of alphanumeric characters that are extended format
 * specifiers.
 *
 * Please update scripts/checkpatch.pl when adding/removing conversion
 * characters.  (Search for "check for vsprintf extension").
 *
 * Right now we handle:
 *
 * - 'S' For symbolic direct pointers (or function descriptors) with offset
 * - 's' For symbolic direct pointers (or function descriptors) without offset
 * - '[Ss]R' as above with __builtin_extract_return_addr() translation
 * - 'S[R]b' as above with module build ID (for use in backtraces)
 * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of
 *            %ps and %pS. Be careful when re-using these specifiers.
 * - 'B' For backtraced symbolic direct pointers with offset
 * - 'Bb' as above with module build ID (for use in backtraces)
 * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref]
 * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201]
 * - 'ra' For struct ranges, e.g., [range 0x0000000000000000 - 0x00000000000000ff]
 * - 'b[l]' For a bitmap, the number of bits is determined by the field
 *       width which must be explicitly specified either as part of the
 *       format string '%32b[l]' or through '%*b[l]', [l] selects
 *       range-list format instead of hex format
 * - 'M' For a 6-byte MAC address, it prints the address in the
 *       usual colon-separated hex notation
 * - 'm' For a 6-byte MAC address, it prints the hex address without colons
 * - 'MF' For a 6-byte MAC FDDI address, it prints the address
 *       with a dash-separated hex notation
 * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth)
 * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way
 *       IPv4 uses dot-separated decimal without leading 0's (1.2.3.4)
 *       IPv6 uses colon separated network-order 16 bit hex with leading 0's
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - 'i' [46] for 'raw' IPv4/IPv6 addresses
 *       IPv6 omits the colons (01020304...0f)
 *       IPv4 uses dot-separated decimal with leading 0's (010.123.045.006)
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order
 * - 'I[6S]c' for IPv6 addresses printed as specified by
 *       https://tools.ietf.org/html/rfc5952
 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination
 *                of the following flags (see string_escape_mem() for the
 *                details):
 *                  a - ESCAPE_ANY
 *                  c - ESCAPE_SPECIAL
 *                  h - ESCAPE_HEX
 *                  n - ESCAPE_NULL
 *                  o - ESCAPE_OCTAL
 *                  p - ESCAPE_NP
 *                  s - ESCAPE_SPACE
 *                By default ESCAPE_ANY_NP is used.
 * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form
 *       "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 *       Options for %pU are:
 *         b big endian lower case hex (default)
 *         B big endian UPPER case hex
 *         l little endian lower case hex
 *         L little endian UPPER case hex
 *           big endian output byte order is:
 *             [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15]
 *           little endian output byte order is:
 *             [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15]
 * - 'V' For a struct va_format which contains a format string * and va_list *,
 *       call vsnprintf(->format, *->va_list).
 *       Implements a "recursive vsnprintf".
 *       Do not use this feature without some mechanism to verify the
 *       correctness of the format string and va_list arguments.
 * - 'K' For a kernel pointer that should be hidden from unprivileged users.
 *       Use only for procfs, sysfs and similar files, not printk(); please
 *       read the documentation (path below) first.
 * - 'NF' For a netdev_features_t
 * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value.
 * - '4c[h[R]lb]' For generic FourCC code with raw numerical value. Both are
 *         displayed in the big-endian format. This is the opposite of V4L2 or
 *         DRM FourCCs.
 *         The additional specifiers define what endianness is used to load
 *         the stored bytes. The data might be interpreted using the host,
 *         reversed host byte order, little-endian, or big-endian.
 * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with
 *            a certain separator (' ' by default):
 *              C colon
 *              D dash
 *              N no separator
 *            The maximum supported length is 64 bytes of the input. Consider
 *            to use print_hex_dump() for the larger input.
 * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives
 *           (default assumed to be phys_addr_t, passed by reference)
 * - 'd[234]' For a dentry name (optionally 2-4 last components)
 * - 'D[234]' Same as 'd' but for a struct file
 * - 'g' For block_device name (gendisk + partition number)
 * - 't[RT][dt][r][s]' For time and date as represented by:
 *      R    struct rtc_time
 *      T    time64_t
 * - 'C' For a clock, it prints the name (Common Clock Framework) or address
 *       (legacy clock framework) of the clock
 * - 'G' For flags to be printed as a collection of symbolic strings that would
 *       construct the specific value. Supported flags given by option:
 *       p page flags (see struct page) given as pointer to unsigned long
 *       g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
 *       v vma flags (VM_*) given as pointer to unsigned long
 * - 'OF[fnpPcCF]'  For a device tree object
 *                  Without any optional arguments prints the full_name
 *                  f device node full_name
 *                  n device node name
 *                  p device node phandle
 *                  P device node path spec (name + @unit)
 *                  F device node flags
 *                  c major compatible string
 *                  C full compatible string
 * - 'fw[fP]'        For a firmware node (struct fwnode_handle) pointer
 *                Without an option prints the full name of the node
 *                f full name
 *                P node name, including a possible unit address
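 * - 'x' For printing the address unmodified. Equivalent to "%lx".
 *       Please read the documentation (path below) before using!
 * - 'e' For an ERR_PTR() value, formatted by err_ptr(). A pointer that is
 *       not an error pointer is printed like plain %p.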
 * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of
 *           bpf_trace_printk() where [ku] prefix specifies either kernel (k)
 *           or user (u) memory to probe, and:
 *              s a string, equivalent to "%s" on direct vsnprintf() use
 *
 * ** When making changes please also update:
 *        Documentation/core-api/printk-formats.rst
 *
 * Note: The default behaviour (unadorned %p) is to hash the address,
 * rendering it useful as a unique identifier.
 *
 * There is also a '%pA' format specifier, but it is only intended to be used
 * from Rust code to format core::fmt::Arguments. Do *not* use it from C.
 * See rust/kernel/print.rs for details.
 */
static noinline_for_stack
char *pointer(const char *fmt, char *buf, char *end, void *ptr,
              struct printf_spec spec)
{
        /*
         * Dispatch on the first extension character.  Most helpers get
         * the unmodified fmt so they can inspect further option
         * characters themselves.
         */
        switch (*fmt) {
        case 'S':
        case 's':
                ptr = dereference_symbol_descriptor(ptr);
                fallthrough;
        case 'B':
                return symbol_string(buf, end, ptr, spec, fmt);
        case 'R':
        case 'r':
                return resource_or_range(fmt, buf, end, ptr, spec);
        case 'h':
                return hex_string(buf, end, ptr, spec, fmt);
        case 'b':
                /* 'bl' selects the list form, plain 'b' the default form */
                switch (fmt[1]) {
                case 'l':
                        return bitmap_list_string(buf, end, ptr, spec, fmt);
                default:
                        return bitmap_string(buf, end, ptr, spec, fmt);
                }
        case 'M':                        /* Colon separated: 00:01:02:03:04:05 */
        case 'm':                        /* Contiguous: 000102030405 */
                                        /* [mM]F (FDDI) */
                                        /* [mM]R (Reverse order; Bluetooth) */
                return mac_address_string(buf, end, ptr, spec, fmt);
        case 'I':                        /* Formatted IP supported
                                         * 4:        1.2.3.4
                                         * 6:        0001:0203:...:0708
                                         * 6c:        1::708 or 1::1.2.3.4
                                         */
        case 'i':                        /* Contiguous:
                                         * 4:        001.002.003.004
                                         * 6:   000102...0f
                                         */
                return ip_addr_string(buf, end, ptr, spec, fmt);
        case 'E':
                return escaped_string(buf, end, ptr, spec, fmt);
        case 'U':
                return uuid_string(buf, end, ptr, spec, fmt);
        case 'V':
                return va_format(buf, end, ptr, spec);
        case 'K':
                return restricted_pointer(buf, end, ptr, spec);
        case 'N':
                return netdev_bits(buf, end, ptr, spec, fmt);
        case '4':
                return fourcc_string(buf, end, ptr, spec, fmt);
        case 'a':
                return address_val(buf, end, ptr, spec, fmt);
        case 'd':
                return dentry_name(buf, end, ptr, spec, fmt);
        case 't':
                return time_and_date(buf, end, ptr, spec, fmt);
        case 'C':
                return clock(buf, end, ptr, spec, fmt);
        case 'D':
                return file_dentry_name(buf, end, ptr, spec, fmt);
#ifdef CONFIG_BLOCK
        case 'g':
                return bdev_name(buf, end, ptr, spec, fmt);
#endif

        case 'G':
                return flags_string(buf, end, ptr, spec, fmt);
        case 'O':
                /* helper is handed the format starting after the 'O' */
                return device_node_string(buf, end, ptr, spec, fmt + 1);
        case 'f':
                /* helper is handed the format starting after the 'f' */
                return fwnode_string(buf, end, ptr, spec, fmt + 1);
        case 'A':
                /* Only valid when built with CONFIG_RUST; warn once otherwise */
                if (!IS_ENABLED(CONFIG_RUST)) {
                        WARN_ONCE(1, "Please remove %%pA from non-Rust code\n");
                        return error_string(buf, end, "(%pA?)", spec);
                }
                return rust_fmt_argument(buf, end, ptr);
        case 'x':
                return pointer_string(buf, end, ptr, spec);
        case 'e':
                /* %pe with a non-ERR_PTR gets treated as plain %p */
                if (!IS_ERR(ptr))
                        return default_pointer(buf, end, ptr, spec);
                return err_ptr(buf, end, ptr, spec);
        case 'u':
        case 'k':
                /* only the 's' sub-specifier is accepted after [ku] */
                switch (fmt[1]) {
                case 's':
                        return string(buf, end, ptr, spec);
                default:
                        return error_string(buf, end, "(einval)", spec);
                }
        default:
                /* no recognised extension character: plain %p handling */
                return default_pointer(buf, end, ptr, spec);
        }
}

/*
 * Format-string cursor that format_decode() takes and returns by value.
 */
struct fmt {
        const char *str;        // current position in the format string
        unsigned char state;        // enum format_state
        unsigned char size;        // size in bytes of the number to fetch
};

#define SPEC_CHAR(x, flag) [(x)-32] = flag
/*
 * Translate a format flag character (' ', '#', '+', '-', '0') into its
 * flag value; every other character yields 0.
 */
static unsigned char spec_flag(unsigned char c)
{
        /* Indexed by (character - 32); sized by the highest entry, '0'. */
        static const unsigned char flag_by_char[] = {
                SPEC_CHAR(' ', SPACE),
                SPEC_CHAR('#', SPECIAL),
                SPEC_CHAR('+', PLUS),
                SPEC_CHAR('-', LEFT),
                SPEC_CHAR('0', ZEROPAD),
        };
        /* Characters below 32 wrap around to large values and miss the table. */
        const unsigned char slot = c - 32;

        if (slot >= sizeof(flag_by_char))
                return 0;

        return flag_by_char[slot];
}

/*
 * Helper function to decode printf style format.
 *
 * Each call decodes one token from the format string and returns the
 * updated cursor:
 *   - fmt.str is advanced past what was consumed,
 *   - fmt.state tells the caller what kind of token was found,
 *   - fmt.size is the size in bytes of the argument for numeric tokens.
 * Flags, field width, precision and base are stored in @spec.
 *
 * A '*' field width or precision ends the call early with
 * FORMAT_STATE_WIDTH or FORMAT_STATE_PRECISION.  When called again with
 * that state, decoding resumes right after the '*' using the value that
 * is then found in @spec.
 *
 * 'h', 'l', or 'L' for integer fields
 * 'z' support added 23/7/1999 S.H.
 * 'z' changed to 'Z' --davidm 1/25/99
 * 'Z' changed to 'z' --adobriyan 2017-01-25
 * 't' added for ptrdiff_t
 *
 * @fmt: cursor holding the format string position and the previous state
 * @spec: receives flags, field width, precision and base of the token
 *
 * Return: the updated cursor.  FORMAT_STATE_NONE means literal text (or the
 * end of the string) was skipped; FORMAT_STATE_INVALID means an unsupported
 * conversion character was found.
 */
static noinline_for_stack
struct fmt format_decode(struct fmt fmt, struct printf_spec *spec)
{
        const char *start = fmt.str;
        char flag;

        /* we finished early by reading the field width */
        if (unlikely(fmt.state == FORMAT_STATE_WIDTH)) {
                /* a negative '*' width means left-justify with |width| */
                if (spec->field_width < 0) {
                        spec->field_width = -spec->field_width;
                        spec->flags |= LEFT;
                }
                fmt.state = FORMAT_STATE_NONE;
                goto precision;
        }

        /* we finished early by reading the precision */
        if (unlikely(fmt.state == FORMAT_STATE_PRECISION)) {
                if (spec->precision < 0)
                        spec->precision = 0;

                fmt.state = FORMAT_STATE_NONE;
                goto qualifier;
        }

        /* By default */
        fmt.state = FORMAT_STATE_NONE;

        for (; *fmt.str ; fmt.str++) {
                if (*fmt.str == '%')
                        break;
        }

        /* Return the current non-format string */
        if (fmt.str != start || !*fmt.str)
                return fmt;

        /* Process flags. This also skips the first '%' */
        spec->flags = 0;
        do {
                /* this also skips first '%' */
                flag = spec_flag(*++fmt.str);
                spec->flags |= flag;
        } while (flag);

        /* get field width */
        spec->field_width = -1;

        if (isdigit(*fmt.str))
                spec->field_width = skip_atoi(&fmt.str);
        else if (unlikely(*fmt.str == '*')) {
                /* it's the next argument */
                fmt.state = FORMAT_STATE_WIDTH;
                fmt.str++;
                return fmt;
        }

precision:
        /* get the precision */
        spec->precision = -1;
        if (unlikely(*fmt.str == '.')) {
                fmt.str++;
                if (isdigit(*fmt.str)) {
                        spec->precision = skip_atoi(&fmt.str);
                        if (spec->precision < 0)
                                spec->precision = 0;
                } else if (*fmt.str == '*') {
                        /* it's the next argument */
                        fmt.state = FORMAT_STATE_PRECISION;
                        fmt.str++;
                        return fmt;
                }
        }

qualifier:
        /* Set up default numeric format */
        spec->base = 10;
        fmt.state = FORMAT_STATE_NUM;
        fmt.size = sizeof(int);
        /*
         * One table serves both length modifiers and conversion characters.
         * Length modifiers have a non-zero 'size', and the third field is the
         * size used when the modifier is doubled ("ll", "hh").  Conversions
         * have a non-zero 'state', and the third field holds extra flags.
         */
        static const struct format_state {
                unsigned char state;
                unsigned char size;
                unsigned char flags_or_double_size;
                unsigned char base;
        } lookup_state[256] = {
                // Length
                ['l'] = { 0, sizeof(long), sizeof(long long) },
                ['L'] = { 0, sizeof(long long) },
                ['h'] = { 0, sizeof(short), sizeof(char) },
                ['H'] = { 0, sizeof(char) },        // Questionable historical
                ['z'] = { 0, sizeof(size_t) },
                ['t'] = { 0, sizeof(ptrdiff_t) },

                // Non-numeric formats
                ['c'] = { FORMAT_STATE_CHAR },
                ['s'] = { FORMAT_STATE_STR },
                ['p'] = { FORMAT_STATE_PTR },
                ['%'] = { FORMAT_STATE_PERCENT_CHAR },

                // Numerics
                ['o'] = { FORMAT_STATE_NUM, 0, 0, 8 },
                ['x'] = { FORMAT_STATE_NUM, 0, SMALL, 16 },
                ['X'] = { FORMAT_STATE_NUM, 0, 0, 16 },
                ['d'] = { FORMAT_STATE_NUM, 0, SIGN, 10 },
                ['i'] = { FORMAT_STATE_NUM, 0, SIGN, 10 },
                ['u'] = { FORMAT_STATE_NUM, 0, 0, 10, },

                /*
                 * Since %n poses a greater security risk than
                 * utility, treat it as any other invalid or
                 * unsupported format specifier.
                 */
        };

        const struct format_state *p = lookup_state + (u8)*fmt.str;
        if (p->size) {
                fmt.size = p->size;
                if (p->flags_or_double_size && fmt.str[0] == fmt.str[1]) {
                        fmt.size = p->flags_or_double_size;
                        fmt.str++;
                }
                fmt.str++;
                /*
                 * Same (u8) cast as the first lookup: the index must stay
                 * within 0..255 whatever the signedness of plain char.
                 */
                p = lookup_state + (u8)*fmt.str;
        }
        if (p->state) {
                if (p->base)
                        spec->base = p->base;
                spec->flags |= p->flags_or_double_size;
                fmt.state = p->state;
                fmt.str++;
                return fmt;
        }

        WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt.str);
        fmt.state = FORMAT_STATE_INVALID;
        return fmt;
}

/*
 * Store @width in @spec.  If the stored value does not read back equal to
 * @width, warn once and store @width clamped to +/-FIELD_WIDTH_MAX instead.
 */
static void
set_field_width(struct printf_spec *spec, int width)
{
        spec->field_width = width;
        if (!WARN_ONCE(spec->field_width != width, "field width %d too large", width))
                return;

        spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
}

/*
 * Store @prec in @spec.  If the stored value does not read back equal to
 * @prec, warn once and store @prec clamped to 0..PRECISION_MAX instead.
 */
static void
set_precision(struct printf_spec *spec, int prec)
{
        spec->precision = prec;
        if (!WARN_ONCE(spec->precision != prec, "precision %d too large", prec))
                return;

        spec->precision = clamp(prec, 0, PRECISION_MAX);
}

/*
 * Widen a 1/2/4-byte argument to 64 bits for printing.
 *
 * The value is moved to the top of a 32-bit word and shifted back down,
 * which discards the bytes above 'size'.  With the SIGN flag the shift back
 * is arithmetic, so the result is sign-extended; otherwise it is
 * zero-extended.
 *
 * 'size' is the size of the value in bytes.
 */
static unsigned long long convert_num_spec(unsigned int val, int size, struct printf_spec spec)
{
        const unsigned int shift = 32 - size*8;
        const unsigned int top_aligned = val << shift;

        if (spec.flags & SIGN)
                return (int)top_aligned >> shift;

        return top_aligned >> shift;
}

/**
 * vsnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt_str: The format string to use
 * @args: Arguments for the format string
 *
 * This function generally follows C99 vsnprintf, but has some
 * extensions and a few limitations:
 *
 *  - ``%n`` is unsupported
 *  - ``%p*`` is handled by pointer()
 *
 * See pointer() or Documentation/core-api/printk-formats.rst for more
 * extensive description.
 *
 * **Please update the documentation in both places when making changes**
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 *
 * A @size larger than INT_MAX triggers a one-time warning and makes the
 * function return 0 without touching @buf.
 *
 * If you're not already dealing with a va_list consider using snprintf().
 */
int vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args)
{
        char *str, *end;
        struct printf_spec spec = {0};
        struct fmt fmt = {
                .str = fmt_str,
                .state = FORMAT_STATE_NONE,
        };

        /* Reject out-of-range values early.  Large positive sizes are
           used for unknown buffer sizes. */
        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        str = buf;
        end = buf + size;

        /* Make sure end is always >= buf */
        if (end < buf) {
                end = ((void *)-1);
                size = end - buf;
        }

        /*
         * str keeps advancing even once it has passed end; only the stores
         * are guarded by "str < end".  That way str - buf is the length the
         * full output would have had.
         */
        while (*fmt.str) {
                const char *old_fmt = fmt.str;

                fmt = format_decode(fmt, &spec);

                switch (fmt.state) {
                case FORMAT_STATE_NONE: {
                        /* literal text between conversions: copy what fits */
                        int read = fmt.str - old_fmt;
                        if (str < end) {
                                int copy = read;
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        continue;
                }

                case FORMAT_STATE_NUM: {
                        /*
                         * Arguments no wider than int are fetched as int and
                         * narrowed by convert_num_spec(); anything wider is
                         * fetched as long long.
                         */
                        unsigned long long num;
                        if (fmt.size <= sizeof(int))
                                num = convert_num_spec(va_arg(args, int), fmt.size, spec);
                        else
                                num = va_arg(args, long long);
                        str = number(str, end, num, spec);
                        continue;
                }

                /*
                 * '*' width/precision: take the value from the argument list;
                 * the next format_decode() call resumes from fmt.state.
                 */
                case FORMAT_STATE_WIDTH:
                        set_field_width(&spec, va_arg(args, int));
                        continue;

                case FORMAT_STATE_PRECISION:
                        set_precision(&spec, va_arg(args, int));
                        continue;

                case FORMAT_STATE_CHAR: {
                        char c;

                        /* right-justified: padding goes before the character */
                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;

                                }
                        }
                        c = (unsigned char) va_arg(args, int);
                        if (str < end)
                                *str = c;
                        ++str;
                        /* left-justified: remaining width is padded afterwards */
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        continue;
                }

                case FORMAT_STATE_STR:
                        str = string(str, end, va_arg(args, char *), spec);
                        continue;

                case FORMAT_STATE_PTR:
                        str = pointer(fmt.str, str, end, va_arg(args, void *),
                                      spec);
                        /* step over the alphanumeric extension characters */
                        while (isalnum(*fmt.str))
                                fmt.str++;
                        continue;

                case FORMAT_STATE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        continue;

                default:
                        /*
                         * Presumably the arguments passed gcc's type
                         * checking, but there is no safe or sane way
                         * for us to continue parsing the format and
                         * fetching from the va_list; the remaining
                         * specifiers and arguments would be out of
                         * sync.
                         */
                        goto out;
                }
        }

out:
        /*
         * Terminate inside the buffer: at str when the output fits,
         * otherwise in the buffer's last byte (truncating the output).
         */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

        /* the trailing null byte doesn't count towards the total */
        return str-buf;

}
EXPORT_SYMBOL(vsnprintf);

/**
 * vscnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * Unlike vsnprintf(), the return value is the number of characters that
 * were actually written into @buf, not including the trailing '\0'. When
 * the output was truncated this is @size - 1. If @size is == 0 the function
 * returns 0.
 *
 * If you're not already dealing with a va_list consider using scnprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        int len;

        if (unlikely(size == 0))
                return 0;

        len = vsnprintf(buf, size, fmt, args);

        /* output did not fit: report what the buffer can hold */
        if (unlikely(len >= size))
                return size - 1;

        return len;
}
EXPORT_SYMBOL(vscnprintf);

/**
 * snprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end to vsnprintf(). The return value is the number of
 * characters which would be generated for the given input, excluding the
 * trailing null, as per ISO C99.  If the return is greater than or equal
 * to @size, the resulting string is truncated.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int snprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        len = vsnprintf(buf, size, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(snprintf);

/**
 * scnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end to vscnprintf(). The return value is the number of
 * characters written into @buf not including the trailing '\0'. If @size
 * is == 0 the function returns 0.
 */
int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        len = vscnprintf(buf, size, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(scnprintf);

/**
 * vsprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * The function returns the number of characters written
 * into @buf. Use vsnprintf() or vscnprintf() in order to avoid
 * buffer overflows.
 *
 * This is vsnprintf() called with a size of INT_MAX, so the output is not
 * limited to the real size of @buf; the caller must make sure it fits.
 *
 * If you're not already dealing with a va_list consider using sprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vsprintf(char *buf, const char *fmt, va_list args)
{
        return vsnprintf(buf, INT_MAX, fmt, args);
}
EXPORT_SYMBOL(vsprintf);

/**
 * sprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * The function returns the number of characters written
 * into @buf. The output is formatted by vsnprintf() with a size of INT_MAX,
 * so it is not limited to the real size of @buf. Use snprintf() or
 * scnprintf() in order to avoid buffer overflows.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int sprintf(char *buf, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        len = vsnprintf(buf, INT_MAX, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(sprintf);

#ifdef CONFIG_BINARY_PRINTF
/*
 * bprintf service:
 * vbin_printf() - VA arguments to binary data
 * bstr_printf() - Binary data to text string
 */

/**
 * vbin_printf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer, counted in 32-bit words (not characters)
 * @fmt_str: The format string to use
 * @args: Arguments for the format string
 *
 * The format follows C99 vsnprintf, except %n is ignored, and its argument
 * is skipped.
 *
 * Integer arguments are stored in binary form, %s arguments are copied
 * together with their terminating NUL, and %p variants that are not in the
 * "save the raw pointer" list inside the function are rendered to text
 * immediately and stored as a NUL-terminated string.
 *
 * The return value is the number of 32-bit words which would be generated for
 * the given input.
 *
 * NOTE:
 * If the return value is greater than @size, the resulting bin_buf is NOT
 * valid for bstr_printf().
 */
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt_str, va_list args)
{
        struct fmt fmt = {
                .str = fmt_str,
                .state = FORMAT_STATE_NONE,
        };
        struct printf_spec spec = {0};
        char *str, *end;
        int width;

        /* Work in bytes internally; @size is given in u32 words. */
        str = (char *)bin_buf;
        end = (char *)(bin_buf + size);

        /*
         * save_arg(type) - fetch the next variadic argument and append it
         * to the binary buffer.
         *
         * 8-byte types are read as unsigned long long, aligned to a u32
         * boundary and stored as two separate u32 halves.  Every other type
         * is read as int, aligned to sizeof(type) and stored as @type.
         * The store is skipped when it would run past @end, but @str is
         * advanced either way so the final position reflects the space
         * that would be needed.  The macro evaluates to the fetched value.
         */
#define save_arg(type)                                                        \
({                                                                        \
        unsigned long long value;                                        \
        if (sizeof(type) == 8) {                                        \
                unsigned long long val8;                                \
                str = PTR_ALIGN(str, sizeof(u32));                        \
                val8 = va_arg(args, unsigned long long);                \
                if (str + sizeof(type) <= end) {                        \
                        *(u32 *)str = *(u32 *)&val8;                        \
                        *(u32 *)(str + 4) = *((u32 *)&val8 + 1);        \
                }                                                        \
                value = val8;                                                \
        } else {                                                        \
                unsigned int val4;                                        \
                str = PTR_ALIGN(str, sizeof(type));                        \
                val4 = va_arg(args, int);                                \
                if (str + sizeof(type) <= end)                                \
                        *(typeof(type) *)str = (type)(long)val4;        \
                value = (unsigned long long)val4;                        \
        }                                                                \
        str += sizeof(type);                                                \
        value;                                                                \
})

        while (*fmt.str) {
                fmt = format_decode(fmt, &spec);

                switch (fmt.state) {
                /* Literal text and "%%" consume no argument. */
                case FORMAT_STATE_NONE:
                case FORMAT_STATE_PERCENT_CHAR:
                        break;
                case FORMAT_STATE_INVALID:
                        goto out;

                /* '*' width or precision: the int argument is stored too. */
                case FORMAT_STATE_WIDTH:
                case FORMAT_STATE_PRECISION:
                        width = (int)save_arg(int);
                        /* Pointers may require the width */
                        if (*fmt.str == 'p')
                                set_field_width(&spec, width);
                        break;

                case FORMAT_STATE_CHAR:
                        save_arg(char);
                        break;

                case FORMAT_STATE_STR: {
                        const char *save_str = va_arg(args, char *);
                        const char *err_msg;
                        size_t len;

                        /* Substitute the message when the pointer check fails. */
                        err_msg = check_pointer_msg(save_str);
                        if (err_msg)
                                save_str = err_msg;

                        /*
                         * len includes the terminating NUL.  The copy only
                         * happens when it ends below @end, but the length is
                         * accounted for in either case.
                         */
                        len = strlen(save_str) + 1;
                        if (str + len < end)
                                memcpy(str, save_str, len);
                        str += len;
                        break;
                }

                case FORMAT_STATE_PTR:
                        /* Dereferenced pointers must be done now */
                        switch (*fmt.str) {
                        /* Dereference of functions is still OK */
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                save_arg(void *);
                                break;
                        default:
                                /* Plain %p (no alphanumeric suffix): raw pointer. */
                                if (!isalnum(*fmt.str)) {
                                        save_arg(void *);
                                        break;
                                }
                                /*
                                 * Any other suffix: render the text now and
                                 * store it as a NUL-terminated string.
                                 */
                                str = pointer(fmt.str, str, end, va_arg(args, void *),
                                              spec);
                                /*
                                 * NOTE(review): the else branch writes end[-1],
                                 * which looks like it assumes a non-empty
                                 * buffer - confirm callers never pass size 0.
                                 */
                                if (str + 1 < end)
                                        *str++ = '\0';
                                else
                                        end[-1] = '\0'; /* Must be nul terminated */
                        }
                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt.str))
                                fmt.str++;
                        break;

                /* Numbers wider than int take 8 bytes, all others sizeof(int). */
                case FORMAT_STATE_NUM:
                        if (fmt.size > sizeof(int)) {
                                save_arg(long long);
                        } else {
                                save_arg(int);
                        }
                }
        }

out:
        /* Round the byte position up to a whole u32 and report it in words. */
        return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
#undef save_arg
}
EXPORT_SYMBOL_GPL(vbin_printf);

/**
 * bstr_printf - Format a string from binary arguments and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt_str: The format string to use
 * @bin_buf: Binary arguments for the format string
 *
 * This function works like C99 vsnprintf(), except that vsnprintf() takes
 * its arguments from the stack while bstr_printf() takes them from @bin_buf,
 * a binary buffer previously generated by vbin_printf().
 *
 * The format follows C99 vsnprintf, but has some extensions:
 *  see vsnprintf comment for details.
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 *
 * If @size is larger than INT_MAX a one-time warning is emitted and 0 is
 * returned without touching @buf.
 */
int bstr_printf(char *buf, size_t size, const char *fmt_str, const u32 *bin_buf)
{
        struct fmt fmt = {
                .str = fmt_str,
                .state = FORMAT_STATE_NONE,
        };
        struct printf_spec spec = {0};
        char *str, *end;
        const char *args = (const char *)bin_buf;

        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        str = buf;
        end = buf + size;

        /*
         * get_arg(type) - read the next binary argument from @args.
         *
         * 8-byte values are aligned to a u32 boundary and read as two u32
         * halves; everything else is aligned to sizeof(type) and read
         * directly.  @args is advanced past the value, which is also what
         * the macro evaluates to.
         */
#define get_arg(type)                                                        \
({                                                                        \
        typeof(type) value;                                                \
        if (sizeof(type) == 8) {                                        \
                args = PTR_ALIGN(args, sizeof(u32));                        \
                *(u32 *)&value = *(u32 *)args;                                \
                *((u32 *)&value + 1) = *(u32 *)(args + 4);                \
        } else {                                                        \
                args = PTR_ALIGN(args, sizeof(type));                        \
                value = *(typeof(type) *)args;                                \
        }                                                                \
        args += sizeof(type);                                                \
        value;                                                                \
})

        /* Make sure end is always >= buf */
        if (end < buf) {
                end = ((void *)-1);
                size = end - buf;
        }

        while (*fmt.str) {
                const char *old_fmt = fmt.str;
                unsigned long long num;

                fmt = format_decode(fmt, &spec);
                switch (fmt.state) {
                case FORMAT_STATE_NONE: {
                        /* Literal text: copy what fits, count all of it. */
                        int read = fmt.str - old_fmt;
                        if (str < end) {
                                int copy = read;
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        continue;
                }

                case FORMAT_STATE_WIDTH:
                        set_field_width(&spec, get_arg(int));
                        continue;

                case FORMAT_STATE_PRECISION:
                        set_precision(&spec, get_arg(int));
                        continue;

                case FORMAT_STATE_CHAR: {
                        char c;

                        /* Right-justified: pad before the character. */
                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;
                                }
                        }
                        c = (unsigned char) get_arg(char);
                        if (str < end)
                                *str = c;
                        ++str;
                        /* Left-justified: whatever width remains pads after. */
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        continue;
                }

                case FORMAT_STATE_STR: {
                        /* The string is stored inline, NUL included. */
                        const char *str_arg = args;
                        args += strlen(str_arg) + 1;
                        str = string(str, end, (char *)str_arg, spec);
                        continue;
                }

                case FORMAT_STATE_PTR: {
                        bool process = false;
                        int copy, len;
                        /* Non function dereferences were already done */
                        switch (*fmt.str) {
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                process = true;
                                break;
                        default:
                                if (!isalnum(*fmt.str)) {
                                        process = true;
                                        break;
                                }
                                /*
                                 * Pointer dereference was already processed:
                                 * the argument is a pre-rendered, NUL-terminated
                                 * string stored inline in the binary buffer.
                                 *
                                 * Always consume it and always account for its
                                 * length, even once the output is full.
                                 * Otherwise the following arguments would be
                                 * read from the wrong offset and the return
                                 * value would no longer be the length that
                                 * would have been generated.
                                 */
                                len = strlen(args);
                                if (str < end) {
                                        copy = len;
                                        if (copy > end - str)
                                                copy = end - str;
                                        memcpy(str, args, copy);
                                }
                                str += len;
                                args += len + 1;
                        }
                        if (process)
                                str = pointer(fmt.str, str, end, get_arg(void *), spec);

                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt.str))
                                fmt.str++;
                        continue;
                }

                case FORMAT_STATE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        continue;

                case FORMAT_STATE_INVALID:
                        goto out;

                case FORMAT_STATE_NUM:
                        /* Mirrors vbin_printf(): wider than int means 8 bytes. */
                        if (fmt.size > sizeof(int)) {
                                num = get_arg(long long);
                        } else {
                                num = convert_num_spec(get_arg(int), fmt.size, spec);
                        }
                        str = number(str, end, num, spec);
                        continue;
                }
        } /* while(*fmt.str) */

out:
        /* Terminate inside the buffer, overwriting the last byte if full. */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

#undef get_arg

        /* the trailing null byte doesn't count towards the total */
        return str - buf;
}
EXPORT_SYMBOL_GPL(bstr_printf);

#endif /* CONFIG_BINARY_PRINTF */

/**
 * vsscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        format of buffer
 * @args:        arguments
 *
 * Walks @fmt and @buf in parallel. White space in @fmt matches any amount
 * of white space in @buf, other literal characters must match exactly, and
 * each '%' conversion stores its result through the next pointer taken
 * from @args.
 *
 * Return: the number of conversions that were stored. "%n", "%%" and
 * suppressed ("%*") conversions are not counted. Parsing stops at the
 * first mismatch, at the end of either string, or at an unsupported
 * format.
 */
int vsscanf(const char *buf, const char *fmt, va_list args)
{
        const char *str = buf;
        char *next;
        char digit;
        int num = 0;
        u8 qualifier;
        unsigned int base;
        union {
                long long s;
                unsigned long long u;
        } val;
        s16 field_width;
        bool is_sign;

        while (*fmt) {
                /* skip any white space in format */
                /* white space in format matches any amount of
                 * white space, including none, in the input.
                 */
                if (isspace(*fmt)) {
                        fmt = skip_spaces(++fmt);
                        str = skip_spaces(str);
                }

                /* anything that is not a conversion must match exactly */
                if (*fmt != '%' && *fmt) {
                        if (*fmt++ != *str++)
                                break;
                        continue;
                }

                /* the format may have ended while skipping white space */
                if (!*fmt)
                        break;
                /* step over the '%' */
                ++fmt;

                /* skip this conversion.
                 * advance both strings to next white space
                 */
                if (*fmt == '*') {
                        if (!*str)
                                break;
                        while (!isspace(*fmt) && *fmt != '%' && *fmt) {
                                /* '%*[' not yet supported, invalid format */
                                if (*fmt == '[')
                                        return num;
                                fmt++;
                        }
                        while (!isspace(*str) && *str)
                                str++;
                        continue;
                }

                /* get field width; -1 means "none given" */
                field_width = -1;
                if (isdigit(*fmt)) {
                        field_width = skip_atoi(&fmt);
                        /* a non-positive width cannot match anything */
                        if (field_width <= 0)
                                break;
                }

                /* get conversion qualifier */
                /*
                 * qualifier is a u8, so -1 is stored as 255. That value
                 * matches none of the case labels in the store switch
                 * below and therefore selects the default (int) case.
                 */
                qualifier = -1;
                if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
                    *fmt == 'z') {
                        qualifier = *fmt++;
                        /* a doubled 'hh' / 'll' is folded into 'H' / 'L' */
                        if (unlikely(qualifier == *fmt)) {
                                if (qualifier == 'h') {
                                        qualifier = 'H';
                                        fmt++;
                                } else if (qualifier == 'l') {
                                        qualifier = 'L';
                                        fmt++;
                                }
                        }
                }

                if (!*fmt)
                        break;

                /*
                 * '%n' is handled before the end-of-input check below, so
                 * it still stores its value when the input is exhausted.
                 */
                if (*fmt == 'n') {
                        /* return number of characters read so far */
                        *va_arg(args, int *) = str - buf;
                        ++fmt;
                        continue;
                }

                /* every remaining conversion needs at least one input char */
                if (!*str)
                        break;

                base = 10;
                is_sign = false;

                switch (*fmt++) {
                case 'c':
                {
                        /*
                         * Copies field_width characters (default 1), stopping
                         * early at the end of input. No NUL is appended.
                         */
                        char *s = (char *)va_arg(args, char*);
                        if (field_width == -1)
                                field_width = 1;
                        do {
                                *s++ = *str++;
                        } while (--field_width > 0 && *str);
                        num++;
                }
                continue;
                case 's':
                {
                        char *s = (char *)va_arg(args, char *);
                        /* without an explicit width, allow up to SHRT_MAX */
                        if (field_width == -1)
                                field_width = SHRT_MAX;
                        /* first, skip leading white space in buffer */
                        str = skip_spaces(str);

                        /* now copy until next white space */
                        while (*str && !isspace(*str) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        num++;
                }
                continue;
                /*
                 * Warning: This implementation of the '[' conversion specifier
                 * deviates from its glibc counterpart in the following ways:
                 * (1) It does NOT support ranges i.e. '-' is NOT a special
                 *     character
                 * (2) It cannot match the closing bracket ']' itself
                 * (3) A field width is required
                 * (4) '%*[' (discard matching input) is currently not supported
                 *
                 * Example usage:
                 * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]",
                 *                buf1, buf2, buf3);
                 * if (ret < 3)
                 *    // etc..
                 */
                case '[':
                {
                        char *s = (char *)va_arg(args, char *);
                        /* one bit per possible byte value */
                        DECLARE_BITMAP(set, 256) = {0};
                        unsigned int len = 0;
                        bool negate = (*fmt == '^');

                        /* field width is required */
                        if (field_width == -1)
                                return num;

                        if (negate)
                                ++fmt;

                        /* collect every character up to the closing ']' */
                        for ( ; *fmt && *fmt != ']'; ++fmt, ++len)
                                __set_bit((u8)*fmt, set);

                        /* no ']' or no character set found */
                        if (!*fmt || !len)
                                return num;
                        ++fmt;

                        if (negate) {
                                bitmap_complement(set, set, 256);
                                /* exclude null '\0' byte */
                                __clear_bit(0, set);
                        }

                        /* match must be non-empty */
                        if (!test_bit((u8)*str, set))
                                return num;

                        while (test_bit((u8)*str, set) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        ++num;
                }
                continue;
                case 'o':
                        base = 8;
                        break;
                case 'x':
                case 'X':
                        base = 16;
                        break;
                case 'i':
                        /* base 0 is passed on to the conversion helper as-is */
                        base = 0;
                        fallthrough;
                case 'd':
                        is_sign = true;
                        fallthrough;
                case 'u':
                        break;
                case '%':
                        /* looking for '%' in str */
                        if (*str++ != '%')
                                return num;
                        continue;
                default:
                        /* invalid format; stop here */
                        return num;
                }

                /* have some sort of integer conversion.
                 * first, skip white space in buffer.
                 */
                str = skip_spaces(str);

                /*
                 * Peek at the first digit, looking past a leading '-' for
                 * signed conversions. With a field width of 1 the '-' alone
                 * would use up the whole field, so no number can follow.
                 */
                digit = *str;
                if (is_sign && digit == '-') {
                        if (field_width == 1)
                                break;

                        digit = *(str + 1);
                }

                /* stop unless the first digit is valid for the base */
                if (!digit
                    || (base == 16 && !isxdigit(digit))
                    || (base == 10 && !isdigit(digit))
                    || (base == 8 && !isodigit(digit))
                    || (base == 0 && !isdigit(digit)))
                        break;

                /* no explicit width (-1) means "as many characters as needed" */
                if (is_sign)
                        val.s = simple_strntoll(str, &next, base,
                                                field_width >= 0 ? field_width : INT_MAX);
                else
                        val.u = simple_strntoull(str, &next, base,
                                                 field_width >= 0 ? field_width : INT_MAX);

                /* store through a pointer whose type matches the qualifier */
                switch (qualifier) {
                case 'H':        /* that's 'hh' in format */
                        if (is_sign)
                                *va_arg(args, signed char *) = val.s;
                        else
                                *va_arg(args, unsigned char *) = val.u;
                        break;
                case 'h':
                        if (is_sign)
                                *va_arg(args, short *) = val.s;
                        else
                                *va_arg(args, unsigned short *) = val.u;
                        break;
                case 'l':
                        if (is_sign)
                                *va_arg(args, long *) = val.s;
                        else
                                *va_arg(args, unsigned long *) = val.u;
                        break;
                case 'L':
                        if (is_sign)
                                *va_arg(args, long long *) = val.s;
                        else
                                *va_arg(args, unsigned long long *) = val.u;
                        break;
                case 'z':
                        *va_arg(args, size_t *) = val.u;
                        break;
                default:
                        if (is_sign)
                                *va_arg(args, int *) = val.s;
                        else
                                *va_arg(args, unsigned int *) = val.u;
                        break;
                }
                num++;

                /* continue scanning from where the number ended */
                if (!next)
                        break;
                str = next;
        }

        return num;
}
EXPORT_SYMBOL(vsscanf);

/**
 * sscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        formatting of buffer
 * @...:        resulting arguments
 *
 * Variadic front end for vsscanf(): the trailing arguments are gathered
 * into a va_list and passed on unchanged.
 *
 * Return: whatever vsscanf() returns for the same input.
 */
int sscanf(const char *buf, const char *fmt, ...)
{
        va_list ap;
        int converted;

        va_start(ap, fmt);
        converted = vsscanf(buf, fmt, ap);
        va_end(ap);

        return converted;
}
}
EXPORT_SYMBOL(sscanf);
















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _IPV6_H
#define _IPV6_H

#include <uapi/linux/ipv6.h>
#include <linux/cache.h>

#define ipv6_optlen(p)  (((p)->hdrlen+1) << 3)
#define ipv6_authlen(p) (((p)->hdrlen+2) << 2)
/*
 * This structure contains configuration options per IPv6 link.
 *
 * The members between __cacheline_group_begin() and __cacheline_group_end()
 * form the ipv6_devconf_read_txrx group (RX & TX fastpath fields).
 * Several members only exist when the matching CONFIG_IPV6_* option is
 * enabled, so the layout depends on the kernel configuration.
 */
struct ipv6_devconf {
        /* RX & TX fastpath fields. */
        __cacheline_group_begin(ipv6_devconf_read_txrx);
        __s32                disable_ipv6;
        __s32                hop_limit;
        __s32                mtu6;
        __s32                forwarding;
        __s32                force_forwarding;
        __s32                disable_policy;
        __s32                proxy_ndp;
        __cacheline_group_end(ipv6_devconf_read_txrx);

        /* Everything below is outside the ipv6_devconf_read_txrx group. */
        __s32                accept_ra;
        __s32                accept_redirects;
        __s32                autoconf;
        __s32                dad_transmits;
        __s32                rtr_solicits;
        __s32                rtr_solicit_interval;
        __s32                rtr_solicit_max_interval;
        __s32                rtr_solicit_delay;
        __s32                force_mld_version;
        __s32                mldv1_unsolicited_report_interval;
        __s32                mldv2_unsolicited_report_interval;
        __s32                use_tempaddr;
        __s32                temp_valid_lft;
        __s32                temp_prefered_lft;
        __s32                regen_min_advance;
        __s32                regen_max_retry;
        __s32                max_desync_factor;
        __s32                max_addresses;
        __s32                accept_ra_defrtr;
        __u32                ra_defrtr_metric;
        __s32                accept_ra_min_hop_limit;
        __s32                accept_ra_min_lft;
        __s32                accept_ra_pinfo;
        __s32                ignore_routes_with_linkdown;
#ifdef CONFIG_IPV6_ROUTER_PREF
        __s32                accept_ra_rtr_pref;
        __s32                rtr_probe_interval;
        /* Nested: these two need CONFIG_IPV6_ROUTER_PREF as well. */
#ifdef CONFIG_IPV6_ROUTE_INFO
        __s32                accept_ra_rt_info_min_plen;
        __s32                accept_ra_rt_info_max_plen;
#endif
#endif
        __s32                accept_source_route;
        __s32                accept_ra_from_local;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        __s32                optimistic_dad;
        __s32                use_optimistic;
#endif
#ifdef CONFIG_IPV6_MROUTE
        /* Unlike its neighbours this member is an atomic_t, not a plain __s32. */
        atomic_t        mc_forwarding;
#endif
        __s32                drop_unicast_in_l2_multicast;
        __s32                accept_dad;
        __s32                force_tllao;
        __s32           ndisc_notify;
        __s32                suppress_frag_ndisc;
        __s32                accept_ra_mtu;
        __s32                drop_unsolicited_na;
        __s32                accept_untracked_na;
        /* The secret is paired with a flag recording whether it was set. */
        struct ipv6_stable_secret {
                bool initialized;
                struct in6_addr secret;
        } stable_secret;
        __s32                use_oif_addrs_only;
        __s32                keep_addr_on_down;
        __s32                seg6_enabled;
#ifdef CONFIG_IPV6_SEG6_HMAC
        __s32                seg6_require_hmac;
#endif
        __u32                enhanced_dad;
        __u32                addr_gen_mode;
        __s32           ndisc_tclass;
        __s32                rpl_seg_enabled;
        __u32                ioam6_id;
        __u32                ioam6_id_wide;
        /* The remaining options are stored as single bytes. */
        __u8                ioam6_enabled;
        __u8                ndisc_evict_nocarrier;
        __u8                ra_honor_pio_life;
        __u8                ra_honor_pio_pflag;

        /* NOTE(review): presumably the sysctl table registered for this devconf - confirm. */
        struct ctl_table_header *sysctl_header;
};

/*
 * Pair of __s32 settings, disable_ipv6 and autoconf, that share their names
 * with members of struct ipv6_devconf. One instance, ipv6_defaults, is
 * declared below and defined elsewhere.
 * NOTE(review): presumably these seed the per-link defaults - confirm.
 */
struct ipv6_params {
        __s32 disable_ipv6;
        __s32 autoconf;
};
extern struct ipv6_params ipv6_defaults;
#include <linux/tcp.h>
#include <linux/udp.h>

#include <net/inet_sock.h>

/* View the network header of @skb as an IPv6 header. */
static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb)
{
        struct ipv6hdr *hdr = (struct ipv6hdr *)skb_network_header(skb);

        return hdr;
}

/* View the inner network header of @skb as an IPv6 header. */
static inline struct ipv6hdr *inner_ipv6_hdr(const struct sk_buff *skb)
{
        struct ipv6hdr *hdr = (struct ipv6hdr *)skb_inner_network_header(skb);

        return hdr;
}

/* View the transport header of @skb as an IPv6 header. */
static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb)
{
        struct ipv6hdr *hdr = (struct ipv6hdr *)skb_transport_header(skb);

        return hdr;
}

/*
 * Payload length taken from the IPv6 header of @skb, plus the size of
 * struct ipv6hdr, minus the length of the skb's network header.
 */
static inline unsigned int ipv6_transport_len(const struct sk_buff *skb)
{
        unsigned int nh_len = skb_network_header_len(skb);

        return ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr) -
               nh_len;
}

/*
 * This structure contains the results of extension header parsing,
 * stored as offsets from skb->nh.
 *
 * The IP6CB() macro in this header casts skb->cb to this structure.
 */

struct inet6_skb_parm {
        int                        iif;
        __be16                        ra;
        __u16                        dst0;
        __u16                        srcrt;
        __u16                        dst1;
        __u16                        lastopt;
        __u16                        nhoff;
        /* Bitmask of the IP6SKB_* values defined below. */
        __u16                        flags;
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
        __u16                        dsthao;
#endif
        __u16                        frag_max_size;
        __u16                        srhoff;

/* Bit values for the flags member; each one is a distinct power of two. */
#define IP6SKB_XFRM_TRANSFORMED        1
#define IP6SKB_FORWARDED        2
#define IP6SKB_REROUTED                4
#define IP6SKB_ROUTERALERT        8
#define IP6SKB_FRAGMENTED      16
#define IP6SKB_HOPBYHOP        32
#define IP6SKB_L3SLAVE         64
#define IP6SKB_JUMBOGRAM      128
#define IP6SKB_SEG6              256
#define IP6SKB_FAKEJUMBO      512
#define IP6SKB_MULTIPATH      1024
#define IP6SKB_MCROUTE        2048
};

/*
 * Report whether the IP6SKB_L3SLAVE bit is set in @flags. Without
 * CONFIG_NET_L3_MASTER_DEV the result is always false.
 */
static inline bool ipv6_l3mdev_skb(__u16 flags)
{
#if defined(CONFIG_NET_L3_MASTER_DEV)
        return flags & IP6SKB_L3SLAVE;
#else
        return false;
#endif
}

#define IP6CB(skb)        ((struct inet6_skb_parm*)((skb)->cb))
#define IP6CBMTU(skb)        ((struct ip6_mtuinfo *)((skb)->cb))

/*
 * Incoming interface index for @skb: skb->skb_iif when the control block
 * flags mark the packet as L3 slave, otherwise the iif stored in the
 * control block.
 */
static inline int inet6_iif(const struct sk_buff *skb)
{
        if (ipv6_l3mdev_skb(IP6CB(skb)->flags))
                return skb->skb_iif;

        return IP6CB(skb)->iif;
}

/* True when IP6SKB_JUMBOGRAM is set in the control block flags of @skb. */
static inline bool inet6_is_jumbogram(const struct sk_buff *skb)
{
        return (IP6CB(skb)->flags & IP6SKB_JUMBOGRAM) != 0;
}

/*
 * Returns the iif stored in the control block when @skb is non-NULL and
 * flagged as L3 slave, and 0 in every other case (including kernels built
 * without CONFIG_NET_L3_MASTER_DEV).
 *
 * can not be used in TCP layer after tcp_v6_fill_cb
 */
static inline int inet6_sdif(const struct sk_buff *skb)
{
        int sdif = 0;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
                sdif = IP6CB(skb)->iif;
#endif
        return sdif;
}

/*
 * TCP request sock for IPv6. It currently contains nothing beyond the
 * embedded struct tcp_request_sock.
 */
struct tcp6_request_sock {
        struct tcp_request_sock          tcp6rsk_tcp;
};

struct ipv6_mc_socklist;
struct ipv6_ac_socklist;
struct ipv6_fl_socklist;

/*
 * IPv6 cork data; struct ipv6_pinfo embeds one instance as its cork member.
 * NOTE(review): presumably filled in while a socket is corked - confirm.
 */
struct inet6_cork {
        struct ipv6_txoptions *opt;
        u8 hop_limit;
        u8 tclass;
        u8 dontfrag:1;        /* single-bit flag */
};

/*
 * struct ipv6_pinfo - ipv6 private area
 *
 * The leading members are the ones used in the tx path; the rest holds
 * packet-option and socket-option flags, pending option skbs, cork data
 * and the per-socket multicast, anycast and flow-label lists.
 */
struct ipv6_pinfo {
        /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */
        struct in6_addr         saddr;
        __be32                        flow_label;
        u32                        dst_cookie;
        struct ipv6_txoptions __rcu        *opt;
        s16                        hop_limit;
        u8                        pmtudisc;
        u8                        tclass;
#ifdef CONFIG_IPV6_SUBTREES
        bool                        saddr_cache;
#endif
        bool                        daddr_cache;

        u8                        mcast_hops;
        u32                        frag_size;

        int                        ucast_oif;
        int                        mcast_oif;

        /*
         * pktoption flags: bits names each flag individually, all gives
         * access to the same 16 bits as one value.
         */
        union {
                struct {
                        u16        srcrt:1,
                                osrcrt:1,
                                rxinfo:1,
                                rxoinfo:1,
                                rxhlim:1,
                                rxohlim:1,
                                hopopts:1,
                                ohopopts:1,
                                dstopts:1,
                                odstopts:1,
                                rxflow:1,
                                rxtclass:1,
                                rxpmtu:1,
                                rxorigdstaddr:1,
                                recvfragsize:1;
                                /* 1 bit hole: 15 of the 16 bits are named */
                } bits;
                u16                all;
        } rxopt;

        /* sockopt flags */
        u8                        srcprefs;        /* 001: prefer temporary address
                                                 * 010: prefer public address
                                                 * 100: prefer care-of address
                                                 */
        u8                        min_hopcount;
        __be32                        rcv_flowinfo;
        struct in6_pktinfo        sticky_pktinfo;

        struct sk_buff                *pktoptions;
        struct sk_buff                *rxpmtu;
        struct inet6_cork        cork;

        /* Per-socket lists; the multicast and flow-label ones are __rcu pointers. */
        struct ipv6_mc_socklist        __rcu *ipv6_mc_list;
        struct ipv6_ac_socklist        *ipv6_ac_list;
        struct ipv6_fl_socklist __rcu *ipv6_fl_list;
};

/* We currently use available bits from inet_sk(sk)->inet_flags,
 * this could change in the future.
 *
 * Each wrapper pastes the INET_FLAGS_ prefix onto @nr and applies the
 * matching bit operation (test/set/clear/assign) to
 * inet_sk(sk)->inet_flags.
 */
#define inet6_test_bit(nr, sk)                        \
        test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet6_set_bit(nr, sk)                        \
        set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet6_clear_bit(nr, sk)                        \
        clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet6_assign_bit(nr, sk, val)                \
        assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)

/* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */
/*
 * Raw IPv6 socket: a struct inet_sock followed by raw-socket state, with
 * the IPv6 private area (struct ipv6_pinfo) as the last member.
 */
struct raw6_sock {
        /* inet_sock has to be the first member of raw6_sock */
        struct inet_sock        inet;
        __u32                        checksum;        /* perform checksum */
        __u32                        offset;                /* checksum offset  */
        struct icmp6_filter        filter;
        __u32                        ip6mr_table;
        struct numa_drop_counters drop_counters;
        struct ipv6_pinfo        inet6;
};

/* UDP over IPv6 socket: a struct udp_sock followed by the IPv6 private area. */
struct udp6_sock {
        struct udp_sock          udp;

        struct ipv6_pinfo inet6;
};

/* TCP over IPv6 socket: a struct tcp_sock followed by the IPv6 private area. */
struct tcp6_sock {
        struct tcp_sock          tcp;

        struct ipv6_pinfo inet6;
};

extern int inet6_sk_rebuild_header(struct sock *sk);

/* IPv6 timewait sock: a struct tcp_timewait_sock with no additional members. */
struct tcp6_timewait_sock {
        struct tcp_timewait_sock   tcp6tw_tcp;
};

#if IS_ENABLED(CONFIG_IPV6)
bool ipv6_mod_enabled(void);

/*
 * Return the pinet6 pointer of @__sk's inet_sock, or NULL when
 * sk_fullsock() reports that @__sk is not a full socket.
 */
static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk)
{
        if (!sk_fullsock(__sk))
                return NULL;

        return inet_sk(__sk)->pinet6;
}

/* Map a pointer to the inet.sk member back to its enclosing struct raw6_sock. */
#define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk)

/*
 * ipv6_only_sock - read the sk_ipv6only flag of @sk.
 * ipv6_sk_rxinfo - true when @sk is a PF_INET6 socket whose
 *                  rxopt.bits.rxinfo option is set.
 *
 * @sk is parenthesized in every expansion so that any pointer expression
 * (for example "base + 1") can be passed without operator-precedence
 * surprises.
 */
#define ipv6_only_sock(sk)        ((sk)->sk_ipv6only)
#define ipv6_sk_rxinfo(sk)        ((sk)->sk_family == PF_INET6 && \
                                 inet6_sk(sk)->rxopt.bits.rxinfo)

/*
 * Return the address of @sk's sk_v6_rcv_saddr for AF_INET6 sockets and
 * NULL for every other family.
 */
static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
{
        return sk->sk_family == AF_INET6 ? &sk->sk_v6_rcv_saddr : NULL;
}

/* Function form of ipv6_only_sock(): returns the sk_ipv6only flag of @sk. */
static inline int inet_v6_ipv6only(const struct sock *sk)
{
        /* ipv6only field is at same position for timewait and other sockets */
        return ipv6_only_sock(sk);
}
#else
/*
 * CONFIG_IPV6 disabled: every helper collapses to a constant so callers
 * compile unchanged -- flags read as 0, pointer lookups return NULL and
 * ipv6_mod_enabled() is false.
 */
#define ipv6_only_sock(sk)        0
#define ipv6_sk_rxinfo(sk)        0

static inline bool ipv6_mod_enabled(void)
{
        return false;
}

static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
{
        return NULL;
}

static inline struct raw6_sock *raw6_sk(const struct sock *sk)
{
        return NULL;
}

#define inet6_rcv_saddr(__sk)        NULL
#define inet_v6_ipv6only(__sk)                0
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif /* _IPV6_H */
































































































  309 
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This header provides generic wrappers for memory access instrumentation that
 * the compiler cannot emit for: KASAN, KCSAN, KMSAN.
 */
#ifndef _LINUX_INSTRUMENTED_H
#define _LINUX_INSTRUMENTED_H

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>
#include <linux/kmsan-checks.h>
#include <linux/types.h>

/**
 * instrument_read - instrument regular read access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular read access. The instrumentation should be inserted
 * before the actual read happens.
 *
 * Reports the access to KASAN and then to KCSAN; no KMSAN hook is called.
 */
static __always_inline void instrument_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_read(v, size);
}

/**
 * instrument_write - instrument regular write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular write access. The instrumentation should be inserted
 * before the actual write happens.
 *
 * Reports the access to KASAN and then to KCSAN; no KMSAN hook is called.
 */
static __always_inline void instrument_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_write(v, size);
}

/**
 * instrument_read_write - instrument regular read-write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular read-write access. The instrumentation should be
 * inserted before the actual write happens.
 *
 * KASAN is told about a write; KCSAN is told about a combined read-write.
 */
static __always_inline void instrument_read_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_read_write(v, size);
}

/**
 * instrument_atomic_read - instrument atomic read access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic read access. The instrumentation should be inserted
 * before the actual read happens.
 *
 * Uses the ordinary KASAN read check together with the atomic-specific
 * KCSAN read check.
 */
static __always_inline void instrument_atomic_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_atomic_read(v, size);
}

/**
 * instrument_atomic_write - instrument atomic write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic write access. The instrumentation should be inserted
 * before the actual write happens.
 *
 * Uses the ordinary KASAN write check together with the atomic-specific
 * KCSAN write check.
 */
static __always_inline void instrument_atomic_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_write(v, size);
}

/**
 * instrument_atomic_read_write - instrument atomic read-write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic read-write access. The instrumentation should be
 * inserted before the actual write happens.
 *
 * KASAN is told about a write; KCSAN is told about an atomic read-write.
 */
static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_read_write(v, size);
}

/**
 * instrument_copy_to_user - instrument reads of copy_to_user
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 *
 * Instrument reads from kernel memory, that are due to copy_to_user (and
 * variants). The instrumentation must be inserted before the accesses.
 *
 * KASAN and KCSAN check the kernel source range @from; KMSAN is handed
 * both the user destination and the kernel source.
 */
static __always_inline void
instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        kasan_check_read(from, n);
        kcsan_check_read(from, n);
        /* NOTE(review): trailing 0 is presumably "bytes left uncopied" -- confirm in kmsan-checks.h */
        kmsan_copy_to_user(to, from, n, 0);
}

/**
 * instrument_copy_from_user_before - add instrumentation before copy_from_user
 * @to: destination address
 * @from: source address (not used by this helper)
 * @n: number of bytes to copy
 *
 * Instrument writes to kernel memory, that are due to copy_from_user (and
 * variants). The instrumentation should be inserted before the accesses.
 *
 * Only the kernel destination range is checked (KASAN, then KCSAN).
 */
static __always_inline void
instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n)
{
        kasan_check_write(to, n);
        kcsan_check_write(to, n);
}

/**
 * instrument_copy_from_user_after - add instrumentation after copy_from_user
 * @to: destination address
 * @from: source address (not used by this helper)
 * @n: number of bytes to copy
 * @left: number of bytes not copied (as returned by copy_from_user)
 *
 * Instrument writes to kernel memory, that are due to copy_from_user (and
 * variants). The instrumentation should be inserted after the accesses.
 *
 * Only KMSAN is involved here.
 */
static __always_inline void
instrument_copy_from_user_after(const void *to, const void __user *from,
                                unsigned long n, unsigned long left)
{
        /* Unpoison just the n - left bytes starting at @to. */
        kmsan_unpoison_memory(to, n - left);
}

/**
 * instrument_memcpy_before - add instrumentation before non-instrumented memcpy
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 *
 * Instrument memory accesses that happen in custom memcpy implementations. The
 * instrumentation should be inserted before the memcpy call.
 *
 * Checks @to as a write and @from as a read, first with KASAN and then
 * with KCSAN.
 */
static __always_inline void instrument_memcpy_before(void *to, const void *from,
                                                     unsigned long n)
{
        kasan_check_write(to, n);
        kasan_check_read(from, n);
        kcsan_check_write(to, n);
        kcsan_check_read(from, n);
}

/**
 * instrument_memcpy_after - add instrumentation after non-instrumented memcpy
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 * @left: number of bytes not copied (if known)
 *
 * Instrument memory accesses that happen in custom memcpy implementations. The
 * instrumentation should be inserted after the memcpy call.
 *
 * Only KMSAN is involved here.
 */
static __always_inline void instrument_memcpy_after(void *to, const void *from,
                                                    unsigned long n,
                                                    unsigned long left)
{
        /* Pass KMSAN the n - left bytes from @from to @to. */
        kmsan_memmove(to, from, n - left);
}

/**
 * instrument_get_user() - add instrumentation to get_user()-like macros
 * @to: destination variable, may not be address-taken
 *
 * get_user() and friends are fragile, so it may depend on the implementation
 * whether the instrumentation happens before or after the data is copied from
 * the userspace.
 *
 * The value of @to is copied into a local u64, that local is unpoisoned
 * for KMSAN, and the value is assigned back to @to. The macro therefore
 * never takes the address of @to itself.
 */
#define instrument_get_user(to)                                \
({                                                        \
        u64 __tmp = (u64)(to);                                \
        kmsan_unpoison_memory(&__tmp, sizeof(__tmp));        \
        to = __tmp;                                        \
})


/**
 * instrument_put_user() - add instrumentation to put_user()-like macros
 * @from: source address
 * @ptr: userspace pointer to copy to
 * @size: number of bytes to copy (not used by the expansion, which passes
 *        sizeof(@from) instead)
 *
 * put_user() and friends are fragile, so it may depend on the implementation
 * whether the instrumentation happens before or after the data is copied to
 * the userspace.
 *
 * The expansion takes the address of @from, so @from must be an lvalue.
 */
#define instrument_put_user(from, ptr, size)                        \
({                                                                \
        kmsan_copy_to_user(ptr, &from, sizeof(from), 0);        \
})

#endif /* _LINUX_INSTRUMENTED_H */

























































  318 


  315 

  320 




















  319 















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wrapper functions for accessing the file_struct fd array.
 */

#ifndef __LINUX_FILE_H
#define __LINUX_FILE_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>
#include <linux/err.h>

struct file;

extern void fput(struct file *);

struct file_operations;
struct task_struct;
struct vfsmount;
struct dentry;
struct inode;
struct path;
extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_clone(struct file *, int flags,
        const struct file_operations *);

/* either a reference to struct file + flags
 * (cloned vs. borrowed, pos locked), with
 * flags stored in lower bits of value,
 * or empty (represented by 0).
 *
 * The pointer and the FDPUT_* flag bits share the single @word; use
 * fd_file() to get the pointer and fd_empty() to test for the empty value.
 */
struct fd {
        unsigned long word;
};
/*
 * Flag bits kept in the low bits of struct fd::word:
 *   FDPUT_FPUT       - fdput() calls fput() on the file when this is set.
 *   FDPUT_POS_UNLOCK - fdput_pos() calls __f_unlock_pos() when this is set.
 */
#define FDPUT_FPUT       1
#define FDPUT_POS_UNLOCK 2

/* Extract the struct file pointer from @f by masking off both flag bits. */
#define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK)))
/* True when @f is the empty value, i.e. its word is 0 (hinted as unlikely). */
static inline bool fd_empty(struct fd f)
{
        return unlikely(f.word == 0);
}

#define EMPTY_FD (struct fd){0}
static inline struct fd BORROWED_FD(struct file *f)
{
        return (struct fd){(unsigned long)f};
}
static inline struct fd CLONED_FD(struct file *f)
{
        return (struct fd){(unsigned long)f | FDPUT_FPUT};
}

/*
 * Release @fd: call fput() on its file only when FDPUT_FPUT is set;
 * otherwise do nothing.
 */
static inline void fdput(struct fd fd)
{
        const bool drop_ref = fd.word & FDPUT_FPUT;

        if (unlikely(drop_ref))
                fput(fd_file(fd));
}

extern struct file *fget(unsigned int fd);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd);
extern void __f_unlock_pos(struct file *);

struct fd fdget(unsigned int fd);
struct fd fdget_raw(unsigned int fd);
struct fd fdget_pos(unsigned int fd);

/*
 * Release @f: call __f_unlock_pos() first when FDPUT_POS_UNLOCK is set,
 * then hand @f to fdput().
 */
static inline void fdput_pos(struct fd f)
{
        if (f.word & FDPUT_POS_UNLOCK)
                __f_unlock_pos(fd_file(f));
        fdput(f);
}

/*
 * Cleanup classes over struct fd. Each pairs a getter with its release
 * helper: fdget()/fdput(), fdget_raw()/fdput() and fdget_pos()/fdput_pos().
 * NOTE(review): argument roles (type, destructor, constructor, ctor args)
 * follow DEFINE_CLASS() in <linux/cleanup.h> -- confirm there.
 */
DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd)
DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd)

extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
extern void set_close_on_exec(unsigned int fd, int flag);
extern bool get_close_on_exec(unsigned int fd);
extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
extern int get_unused_fd_flags(unsigned flags);
extern void put_unused_fd(unsigned int fd);

/*
 * get_unused_fd: an int obtained from get_unused_fd_flags(flags); the
 * release expression calls put_unused_fd() only when the value is >= 0.
 * fput: release expression for struct file pointers that calls fput()
 * unless the pointer is NULL or an error pointer.
 */
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
             get_unused_fd_flags(flags), unsigned flags)
DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))

/*
 * take_fd() will take care to set @fd to -EBADF ensuring that
 * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it
 * easier to rely on CLASS(get_unused_fd):
 *
 * struct file *f;
 *
 * CLASS(get_unused_fd, fd)(O_CLOEXEC);
 * if (fd < 0)
 *         return fd;
 *
 * f = dentry_open(&path, O_RDONLY, current_cred());
 * if (IS_ERR(f))
 *         return PTR_ERR(f);
 *
 * fd_install(fd, f);
 * return take_fd(fd);
 *
 * In the example, the early error returns leave @fd non-negative, so the
 * class cleanup still releases it; only the final take_fd() disarms that.
 */
#define take_fd(fd) __get_and_null(fd, -EBADF)

extern void fd_install(unsigned int fd, struct file *file);

int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags);

int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);

extern void flush_delayed_fput(void);
extern void __fput_sync(struct file *);

extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;

#endif /* __LINUX_FILE_H */



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCUWAIT_H_
#define _LINUX_RCUWAIT_H_

#include <linux/rcupdate.h>
#include <linux/sched/signal.h>
#include <linux/types.h>

/* Static initializer for a struct rcuwait with no task; @name is unused. */
#define __RCUWAIT_INITIALIZER(name)                \
        { .task = NULL, }

/* Runtime counterpart of __RCUWAIT_INITIALIZER(): set @w->task to NULL. */
static inline void rcuwait_init(struct rcuwait *w)
{
        w->task = NULL;
}

/*
 * Returns 1 when @w currently has a task pointer set and 0 otherwise.
 *
 * Note: this provides no serialization and, just as with waitqueues,
 * requires care to estimate as to whether or not the wait is active.
 */
static inline int rcuwait_active(struct rcuwait *w)
{
        return rcu_access_pointer(w->task) != NULL;
}

extern int rcuwait_wake_up(struct rcuwait *w);

/*
 * The caller is responsible for locking around rcuwait_wait_event(),
 * and [prepare_to/finish]_rcuwait() such that writes to @task are
 * properly serialized.
 */

/* Publish the current task as @w's waiter via rcu_assign_pointer(). */
static inline void prepare_to_rcuwait(struct rcuwait *w)
{
        rcu_assign_pointer(w->task, current);
}

extern void finish_rcuwait(struct rcuwait *w);

/*
 * Core wait loop shared by the rcuwait_wait_event*() macros.
 *
 * @w:         rcuwait to register the current task on
 * @condition: expression re-evaluated on every iteration; the loop ends
 *             as soon as it is true
 * @state:     task state passed to set_current_state() each iteration
 * @ret:       initial value of the result variable __ret
 * @cmd:       statement run when the condition is false and no signal is
 *             pending for @state (it may assign to __ret)
 *
 * Evaluates to __ret, which is set to -EINTR when
 * signal_pending_state() ends the wait. finish_rcuwait() is called on
 * every exit path.
 */
#define ___rcuwait_wait_event(w, condition, state, ret, cmd)                \
({                                                                        \
        long __ret = ret;                                                \
        prepare_to_rcuwait(w);                                                \
        for (;;) {                                                        \
                /*                                                        \
                 * Implicit barrier (A) pairs with (B) in                \
                 * rcuwait_wake_up().                                        \
                 */                                                        \
                set_current_state(state);                                \
                if (condition)                                                \
                        break;                                                \
                                                                        \
                if (signal_pending_state(state, current)) {                \
                        __ret = -EINTR;                                        \
                        break;                                                \
                }                                                        \
                                                                        \
                cmd;                                                        \
        }                                                                \
        finish_rcuwait(w);                                                \
        __ret;                                                                \
})

/*
 * Wait on @w until @condition is true, calling schedule() between checks.
 * Evaluates to 0, or to -EINTR when a signal pending for @state ends the
 * wait.
 */
#define rcuwait_wait_event(w, condition, state)                                \
        ___rcuwait_wait_event(w, condition, state, 0, schedule())

/*
 * Timeout flavour of the wait loop: __ret starts at @timeout and is
 * replaced by the return value of schedule_timeout(__ret) on each sleep;
 * the condition is wrapped in ___wait_cond_timeout().
 */
#define __rcuwait_wait_event_timeout(w, condition, state, timeout)        \
        ___rcuwait_wait_event(w, ___wait_cond_timeout(condition),        \
                              state, timeout,                                \
                              __ret = schedule_timeout(__ret))

/*
 * Wait on @w for @condition with a timeout. The wrapped condition is
 * tested once up front; the wait loop is entered only if that test is
 * false. Evaluates to the resulting __ret.
 */
#define rcuwait_wait_event_timeout(w, condition, state, timeout)        \
({                                                                        \
        long __ret = timeout;                                                \
        if (!___wait_cond_timeout(condition))                                \
                __ret = __rcuwait_wait_event_timeout(w, condition,        \
                                                     state, timeout);        \
        __ret;                                                                \
})

#endif /* _LINUX_RCUWAIT_H_ */













































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGE_REF_H
#define _LINUX_PAGE_REF_H

#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/tracepoint-defs.h>

DECLARE_TRACEPOINT(page_ref_set);
DECLARE_TRACEPOINT(page_ref_mod);
DECLARE_TRACEPOINT(page_ref_mod_and_test);
DECLARE_TRACEPOINT(page_ref_mod_and_return);
DECLARE_TRACEPOINT(page_ref_mod_unless);
DECLARE_TRACEPOINT(page_ref_freeze);
DECLARE_TRACEPOINT(page_ref_unfreeze);

#ifdef CONFIG_DEBUG_PAGE_REF

/*
 * Ideally we would want to use the trace_<tracepoint>_enabled() helper
 * functions. But due to include header file issues, that is not
 * feasible. Instead we have to open code the static key functions.
 *
 * See trace_##name##_enabled(void) in include/linux/tracepoint.h
 */
#define page_ref_tracepoint_active(t) tracepoint_enabled(t)

/* Out-of-line trace hooks, one per tracepoint declared above. */
extern void __page_ref_set(struct page *page, int v);
extern void __page_ref_mod(struct page *page, int v);
extern void __page_ref_mod_and_test(struct page *page, int v, int ret);
extern void __page_ref_mod_and_return(struct page *page, int v, int ret);
extern void __page_ref_mod_unless(struct page *page, int v, int u);
extern void __page_ref_freeze(struct page *page, int v, int ret);
extern void __page_ref_unfreeze(struct page *page, int v);

#else

/*
 * CONFIG_DEBUG_PAGE_REF disabled: the activity check is the constant
 * false and every hook is an empty inline stub.
 */
#define page_ref_tracepoint_active(t) false

static inline void __page_ref_set(struct page *page, int v)
{
}
static inline void __page_ref_mod(struct page *page, int v)
{
}
static inline void __page_ref_mod_and_test(struct page *page, int v, int ret)
{
}
static inline void __page_ref_mod_and_return(struct page *page, int v, int ret)
{
}
static inline void __page_ref_mod_unless(struct page *page, int v, int u)
{
}
static inline void __page_ref_freeze(struct page *page, int v, int ret)
{
}
static inline void __page_ref_unfreeze(struct page *page, int v)
{
}

#endif

/* Atomically read and return @page->_refcount. */
static inline int page_ref_count(const struct page *page)
{
        return atomic_read(&page->_refcount);
}

/**
 * folio_ref_count - The reference count on this folio.
 * @folio: The folio.
 *
 * The refcount is usually incremented by calls to folio_get() and
 * decremented by calls to folio_put().  Some typical users of the
 * folio refcount:
 *
 * - Each reference from a page table
 * - The page cache
 * - Filesystem private data
 * - The LRU list
 * - Pipes
 * - Direct IO which references this page in the process address space
 *
 * Return: The number of references to this folio.
 */
static inline int folio_ref_count(const struct folio *folio)
{
        /* The count is read from the folio's page member. */
        return page_ref_count(&folio->page);
}

/* Return the reference count of the folio that page_folio() maps @page to. */
static inline int page_count(const struct page *page)
{
        return folio_ref_count(page_folio(page));
}

/*
 * Atomically set @page->_refcount to @v, then call the page_ref_set trace
 * hook if that tracepoint is active.
 */
static inline void set_page_count(struct page *page, int v)
{
        atomic_set(&page->_refcount, v);
        if (page_ref_tracepoint_active(page_ref_set))
                __page_ref_set(page, v);
}

/* Folio wrapper: applies set_page_count() to the folio's page member. */
static inline void folio_set_count(struct folio *folio, int v)
{
        set_page_count(&folio->page, v);
}

/*
 * Setup the page count before being freed into the page allocator for
 * the first time (boot or memory hotplug)
 *
 * Sets the count to 1 through set_page_count().
 */
static inline void init_page_count(struct page *page)
{
        set_page_count(page, 1);
}

/*
 * Atomically add @nr to @page->_refcount, then call the page_ref_mod trace
 * hook if that tracepoint is active.
 */
static inline void page_ref_add(struct page *page, int nr)
{
        atomic_add(nr, &page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod))
                __page_ref_mod(page, nr);
}

/* Folio wrapper: applies page_ref_add() to the folio's page member. */
static inline void folio_ref_add(struct folio *folio, int nr)
{
        page_ref_add(&folio->page, nr);
}

/*
 * Atomically subtract @nr from @page->_refcount, then call the
 * page_ref_mod trace hook (with -@nr) if that tracepoint is active.
 */
static inline void page_ref_sub(struct page *page, int nr)
{
        atomic_sub(nr, &page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod))
                __page_ref_mod(page, -nr);
}

/* Folio wrapper: applies page_ref_sub() to the folio's page member. */
static inline void folio_ref_sub(struct folio *folio, int nr)
{
        page_ref_sub(&folio->page, nr);
}

/*
 * Atomically subtract @nr from @folio->_refcount and return the value
 * reported by atomic_sub_return(). The page_ref_mod_and_return trace hook
 * is called with the folio's page member when that tracepoint is active.
 */
static inline int folio_ref_sub_return(struct folio *folio, int nr)
{
        int remaining;

        remaining = atomic_sub_return(nr, &folio->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(&folio->page, -nr, remaining);

        return remaining;
}

/*
 * Atomically increment @page->_refcount, then call the page_ref_mod trace
 * hook (with 1) if that tracepoint is active.
 */
static inline void page_ref_inc(struct page *page)
{
        atomic_inc(&page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod))
                __page_ref_mod(page, 1);
}

/* Folio wrapper: applies page_ref_inc() to the folio's page member. */
static inline void folio_ref_inc(struct folio *folio)
{
        page_ref_inc(&folio->page);
}

/*
 * Atomically decrement @page->_refcount, then call the page_ref_mod trace
 * hook (with -1) if that tracepoint is active.
 */
static inline void page_ref_dec(struct page *page)
{
        atomic_dec(&page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod))
                __page_ref_mod(page, -1);
}

/* Folio wrapper: applies page_ref_dec() to the folio's page member. */
static inline void folio_ref_dec(struct folio *folio)
{
        page_ref_dec(&folio->page);
}

/*
 * Atomically subtract @nr from @page->_refcount and return the result of
 * atomic_sub_and_test(). The page_ref_mod_and_test trace hook is called
 * with that result when the tracepoint is active.
 */
static inline int page_ref_sub_and_test(struct page *page, int nr)
{
        int tested;

        tested = atomic_sub_and_test(nr, &page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -nr, tested);

        return tested;
}

/* Folio wrapper: applies page_ref_sub_and_test() to the folio's page member. */
static inline int folio_ref_sub_and_test(struct folio *folio, int nr)
{
        struct page *head = &folio->page;

        return page_ref_sub_and_test(head, nr);
}

/*
 * Atomically increment @page->_refcount and return the value reported by
 * atomic_inc_return(). The page_ref_mod_and_return trace hook is called
 * with that value when the tracepoint is active.
 */
static inline int page_ref_inc_return(struct page *page)
{
        int updated;

        updated = atomic_inc_return(&page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, 1, updated);

        return updated;
}

/* Folio wrapper: applies page_ref_inc_return() to the folio's page member. */
static inline int folio_ref_inc_return(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_inc_return(head);
}

/*
 * Atomically decrement @page->_refcount and return the result of
 * atomic_dec_and_test(). The page_ref_mod_and_test trace hook is called
 * with that result when the tracepoint is active.
 */
static inline int page_ref_dec_and_test(struct page *page)
{
        int tested;

        tested = atomic_dec_and_test(&page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -1, tested);

        return tested;
}

/* Folio wrapper: applies page_ref_dec_and_test() to the folio's page member. */
static inline int folio_ref_dec_and_test(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_dec_and_test(head);
}

/*
 * Atomically decrement @page->_refcount and return the value reported by
 * atomic_dec_return(). The page_ref_mod_and_return trace hook is called
 * with that value when the tracepoint is active.
 */
static inline int page_ref_dec_return(struct page *page)
{
        int updated;

        updated = atomic_dec_return(&page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, -1, updated);

        return updated;
}

/* Folio wrapper: applies page_ref_dec_return() to the folio's page member. */
static inline int folio_ref_dec_return(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_dec_return(head);
}

/*
 * Conditionally add @nr to @page->_refcount.
 *
 * Inside an RCU read-side section, and only when
 * page_count_writable(page, u) permits it, the result of
 * atomic_add_unless(&page->_refcount, nr, u) is recorded. The function
 * returns that result, or false when the add was not attempted. The
 * page_ref_mod_unless trace hook receives the result as its third
 * argument.
 */
static inline bool page_ref_add_unless(struct page *page, int nr, int u)
{
        bool ret = false;

        rcu_read_lock();
        /* avoid writing to the vmemmap area being remapped */
        if (page_count_writable(page, u))
                ret = atomic_add_unless(&page->_refcount, nr, u);
        rcu_read_unlock();

        if (page_ref_tracepoint_active(page_ref_mod_unless))
                __page_ref_mod_unless(page, nr, ret);
        return ret;
}

/* Folio wrapper: applies page_ref_add_unless() to the folio's page member. */
static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
{
        return page_ref_add_unless(&folio->page, nr, u);
}

/**
 * folio_try_get - Attempt to increase the refcount on a folio.
 * @folio: The folio.
 *
 * If you do not already have a reference to a folio, you can attempt to
 * get one using this function.  It may fail if, for example, the folio
 * has been freed since you found a pointer to it, or it is frozen for
 * the purposes of splitting or migration.
 *
 * Return: True if the reference count was successfully incremented.
 */
static inline bool folio_try_get(struct folio *folio)
{
        /* Add one reference via folio_ref_add_unless() with "unless" value 0. */
        return folio_ref_add_unless(folio, 1, 0);
}

/*
 * Like folio_try_get(), but adds @count references instead of one: calls
 * folio_ref_add_unless() with "unless" value 0 and returns its result.
 */
static inline bool folio_ref_try_add(struct folio *folio, int count)
{
        return folio_ref_add_unless(folio, count, 0);
}

/*
 * Try to replace a refcount of exactly @count with 0 using
 * atomic_cmpxchg(). Returns nonzero when the old value was @count, 0
 * otherwise; success is hinted as the likely case. The page_ref_freeze
 * trace hook is called with the result when the tracepoint is active.
 */
static inline int page_ref_freeze(struct page *page, int count)
{
        int swapped;

        swapped = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
        if (page_ref_tracepoint_active(page_ref_freeze))
                __page_ref_freeze(page, count, swapped);

        return swapped;
}

/* Folio wrapper: applies page_ref_freeze() to the folio's page member. */
static inline int folio_ref_freeze(struct folio *folio, int count)
{
        struct page *head = &folio->page;

        return page_ref_freeze(head, count);
}

/*
 * Store @count into @page->_refcount with atomic_set_release(), then call
 * the page_ref_unfreeze trace hook if that tracepoint is active.
 *
 * The VM_BUG_ON checks assert that the current page_count() is 0 and that
 * @count is non-zero.
 */
static inline void page_ref_unfreeze(struct page *page, int count)
{
        VM_BUG_ON_PAGE(page_count(page) != 0, page);
        VM_BUG_ON(count == 0);

        atomic_set_release(&page->_refcount, count);
        if (page_ref_tracepoint_active(page_ref_unfreeze))
                __page_ref_unfreeze(page, count);
}

/* Folio wrapper: applies page_ref_unfreeze() to the folio's page member. */
static inline void folio_ref_unfreeze(struct folio *folio, int count)
{
        page_ref_unfreeze(&folio->page, count);
}
#endif







































    3 















    3 





















    3 




























































    5 










    5 

    5 


















    2 


























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * CMAC: Cipher Block Mode for Authentication
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Based on work by:
 *  Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com>
 * Based on crypto/xcbc.c:
 *  Copyright © 2006 USAGI/WIDE Project,
 *   Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org>
 */

#include <crypto/internal/cipher.h>
#include <crypto/internal/hash.h>
#include <crypto/utils.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>

/*
 * +------------------------
 * | <parent tfm>
 * +------------------------
 * | cmac_tfm_ctx
 * +------------------------
 * | consts (block size * 2)
 * +------------------------
 */
struct cmac_tfm_ctx {
        /* Underlying block cipher; set by init_tfm/clone_tfm, freed by exit_tfm. */
        struct crypto_cipher *child;
        /*
         * Trailing storage for the two constants derived in setkey;
         * cmac_create() reserves block size * 2 bytes for it via cra_ctxsize.
         */
        __be64 consts[];
};

/*
 * Set the key on the child cipher and derive the two per-key constants.
 *
 * The zero block is encrypted in place in ctx->consts and then doubled
 * twice in GF(2^128) (16-byte blocks) or GF(2^64) (8-byte blocks). The
 * result of the first doubling is stored in the first block-sized slot of
 * ctx->consts and the result of the second doubling in the slot after it.
 *
 * Returns 0 on success or the error from crypto_cipher_setkey().
 */
static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
                                     const u8 *inkey, unsigned int keylen)
{
        struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
        unsigned int bs = crypto_shash_blocksize(parent);
        __be64 *consts = ctx->consts;
        u64 _const[2];
        int i, err = 0;
        u8 msb_mask, gfmask;

        err = crypto_cipher_setkey(ctx->child, inkey, keylen);
        if (err)
                return err;

        /* encrypt the zero block */
        memset(consts, 0, bs);
        crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts);

        switch (bs) {
        case 16:
                gfmask = 0x87;
                /* _const[1] holds the high 64 bits, _const[0] the low 64 bits */
                _const[0] = be64_to_cpu(consts[1]);
                _const[1] = be64_to_cpu(consts[0]);

                /* gf(2^128) multiply zero-ciphertext with u and u^2 */
                for (i = 0; i < 4; i += 2) {
                        /* arithmetic shift: gfmask if the top bit is set, else 0 */
                        msb_mask = ((s64)_const[1] >> 63) & gfmask;
                        /* 128-bit left shift by one, carrying low into high */
                        _const[1] = (_const[1] << 1) | (_const[0] >> 63);
                        _const[0] = (_const[0] << 1) ^ msb_mask;

                        consts[i + 0] = cpu_to_be64(_const[1]);
                        consts[i + 1] = cpu_to_be64(_const[0]);
                }

                break;
        case 8:
                gfmask = 0x1B;
                _const[0] = be64_to_cpu(consts[0]);

                /* gf(2^64) multiply zero-ciphertext with u and u^2 */
                for (i = 0; i < 2; i++) {
                        /* arithmetic shift: gfmask if the top bit is set, else 0 */
                        msb_mask = ((s64)_const[0] >> 63) & gfmask;
                        _const[0] = (_const[0] << 1) ^ msb_mask;

                        consts[i] = cpu_to_be64(_const[0]);
                }

                break;
        }

        return 0;
}

/*
 * Start a new digest: clear the block-sized running state kept in the
 * descriptor context. Always returns 0.
 */
static int crypto_cmac_digest_init(struct shash_desc *pdesc)
{
        u8 *state = shash_desc_ctx(pdesc);
        int blocksize = crypto_shash_blocksize(pdesc->tfm);

        memset(state, 0, blocksize);

        return 0;
}

/*
 * Absorb whole blocks from @p into the running state.
 *
 * Each block is XORed into the state, which is then encrypted in place with
 * the child cipher. At least one block is always consumed; processing stops
 * once fewer than a block's worth of bytes remain.
 *
 * Returns the number of bytes left unprocessed.
 */
static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
                                     unsigned int len)
{
        struct crypto_shash *parent = pdesc->tfm;
        struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
        struct crypto_cipher *cipher = tctx->child;
        u8 *state = shash_desc_ctx(pdesc);
        int bs = crypto_shash_blocksize(parent);

        for (;;) {
                crypto_xor(state, p, bs);
                crypto_cipher_encrypt_one(cipher, state, state);

                p += bs;
                len -= bs;

                if (len < bs)
                        break;
        }

        return len;
}

/*
 * Absorb the final @len bytes of @src and write the MAC to @out.
 *
 * A final chunk that is exactly one block long is combined with the first
 * constant in tctx->consts. Any other length gets a 0x80 marker XORed in
 * right after the data and is combined with the second constant, which is
 * stored one block further into tctx->consts. Always returns 0.
 */
static int crypto_cmac_digest_finup(struct shash_desc *pdesc, const u8 *src,
                                    unsigned int len, u8 *out)
{
        struct crypto_shash *parent = pdesc->tfm;
        struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
        const u8 *subkey = (const u8 *)tctx->consts;
        u8 *state = shash_desc_ctx(pdesc);
        int bs = crypto_shash_blocksize(parent);

        crypto_xor(state, src, len);

        if (len != bs) {
                /* incomplete block: add the padding marker, use 2nd constant */
                state[len] ^= 0x80;
                subkey += bs;
        }

        crypto_xor(state, subkey, bs);
        crypto_cipher_encrypt_one(tctx->child, out, state);

        return 0;
}

/*
 * Transform constructor: instantiate the child cipher from the instance's
 * spawn and remember it in the transform context.
 *
 * Returns 0 on success or the error encoded by crypto_spawn_cipher().
 */
static int cmac_init_tfm(struct crypto_shash *tfm)
{
        struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
        struct crypto_cipher_spawn *spawn;
        struct crypto_cipher *child;

        spawn = shash_instance_ctx(shash_alg_instance(tfm));

        child = crypto_spawn_cipher(spawn);
        if (IS_ERR(child))
                return PTR_ERR(child);

        ctx->child = child;
        return 0;
}

/*
 * Clone hook: give @tfm its own clone of the child cipher used by @otfm.
 *
 * Returns 0 on success or the error encoded by crypto_clone_cipher().
 */
static int cmac_clone_tfm(struct crypto_shash *tfm, struct crypto_shash *otfm)
{
        struct cmac_tfm_ctx *src_ctx = crypto_shash_ctx(otfm);
        struct cmac_tfm_ctx *dst_ctx = crypto_shash_ctx(tfm);
        struct crypto_cipher *child = crypto_clone_cipher(src_ctx->child);

        if (IS_ERR(child))
                return PTR_ERR(child);

        dst_ctx->child = child;
        return 0;
}

/* Transform destructor: release the child cipher held in the context. */
static void cmac_exit_tfm(struct crypto_shash *tfm)
{
        struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
        crypto_free_cipher(ctx->child);
}

static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
{
        struct shash_instance *inst;
        struct crypto_cipher_spawn *spawn;
        struct crypto_alg *alg;
        u32 mask;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask);
        if (err)
                return err;

        inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;
        spawn = shash_instance_ctx(inst);

        err = crypto_grab_cipher(spawn, shash_crypto_instance(inst),
                                 crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;
        alg = crypto_spawn_cipher_alg(spawn);

        switch (alg->cra_blocksize) {
        case 16:
        case 8:
                break;
        default:
                err = -EINVAL;
                goto err_free_inst;
        }

        err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg);
        if (err)
                goto err_free_inst;

        inst->alg.base.cra_priority = alg->cra_priority;
        inst->alg.base.cra_blocksize = alg->cra_blocksize;
        inst->alg.base.cra_ctxsize = sizeof(struct cmac_tfm_ctx) +
                                     alg->cra_blocksize * 2;
        inst->alg.base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
                                   CRYPTO_AHASH_ALG_FINAL_NONZERO;

        inst->alg.digestsize = alg->cra_blocksize;
        inst->alg.descsize = alg->cra_blocksize;
        inst->alg.init = crypto_cmac_digest_init;
        inst->alg.update = crypto_cmac_digest_update;
        inst->alg.finup = crypto_cmac_digest_finup;
        inst->alg.setkey = crypto_cmac_digest_setkey;
        inst->alg.init_tfm = cmac_init_tfm;
        inst->alg.clone_tfm = cmac_clone_tfm;
        inst->alg.exit_tfm = cmac_exit_tfm;

        inst->free = shash_free_singlespawn_instance;

        err = shash_register_instance(tmpl, inst);
        if (err) {
err_free_inst:
                shash_free_singlespawn_instance(inst);
        }
        return err;
}

/* The "cmac" template; instances are built on demand by cmac_create(). */
static struct crypto_template crypto_cmac_tmpl = {
        .name = "cmac",
        .create = cmac_create,
        .module = THIS_MODULE,
};

/* Module entry point: register the "cmac" template and return the result. */
static int __init crypto_cmac_module_init(void)
{
        return crypto_register_template(&crypto_cmac_tmpl);
}

/* Module exit point: unregister the "cmac" template. */
static void __exit crypto_cmac_module_exit(void)
{
        crypto_unregister_template(&crypto_cmac_tmpl);
}

/* Hook the template registration into module load and unload. */
module_init(crypto_cmac_module_init);
module_exit(crypto_cmac_module_exit);

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CMAC keyed hash algorithm");
MODULE_ALIAS_CRYPTO("cmac");
MODULE_IMPORT_NS("CRYPTO_INTERNAL");

























  156 





  157 







































  157 









  156 


  157 









  157 






  155 




















  152 
  157 





































































































































































































































































































































































































































































































  153 






  156 


  151 












































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

#include <trace/events/cgroup.h>

/* Lock returned by ss_rstat_lock() when no subsystem is given (base stats). */
static DEFINE_SPINLOCK(rstat_base_lock);
/* Per-cpu lockless list returned by ss_lhead_cpu() when no subsystem is given. */
static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);

/* Defined further down; needed by css_rstat_flush(). */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/*
 * Determines whether a given css can participate in rstat.
 * css's that are cgroup::self use rstat for base stats.
 * Other css's associated with a subsystem use rstat only when
 * they define the ss->css_rstat_flush callback.
 */
static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
{
        return css_is_self(css) || css->ss->css_rstat_flush != NULL;
}

/* Return @css's per-cpu rstat bookkeeping structure for @cpu. */
static struct css_rstat_cpu *css_rstat_cpu(
                struct cgroup_subsys_state *css, int cpu)
{
        return per_cpu_ptr(css->rstat_cpu, cpu);
}

/* Return @cgrp's per-cpu base stat structure for @cpu. */
static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
                struct cgroup *cgrp, int cpu)
{
        return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
}

/*
 * Pick the rstat spinlock for @ss: the subsystem's own lock when @ss is
 * given, the shared base-stat lock when it is NULL.
 */
static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
{
        return ss ? &ss->rstat_ss_lock : &rstat_base_lock;
}

/*
 * Pick the lockless update list for @ss on @cpu: the subsystem's per-cpu
 * list when @ss is given, the base-stat backlog list when it is NULL.
 */
static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
{
        if (!ss)
                return per_cpu_ptr(&rstat_backlog_list, cpu);

        return per_cpu_ptr(ss->lhead, cpu);
}

/**
 * css_rstat_updated - keep track of updated rstat_cpu
 * @css: target cgroup subsystem state
 * @cpu: cpu on which rstat_cpu was updated
 *
 * Atomically inserts the css in the ss's llist for the given cpu. This is
 * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
 * will be processed at the flush time to create the update tree.
 *
 * NOTE: if the user needs the guarantee that the updater either adds itself
 * to the lockless list or the concurrent flusher flushes its updated stats, a
 * memory barrier is needed before the call to css_rstat_updated() i.e. a
 * barrier after updating the per-cpu stats and before calling
 * css_rstat_updated().
 */
__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
        struct llist_head *lhead;
        struct css_rstat_cpu *rstatc;
        struct css_rstat_cpu __percpu *rstatc_pcpu;
        struct llist_node *self;

        /*
         * Since bpf programs can call this function, prevent access to
         * uninitialized rstat pointers.
         */
        if (!css_uses_rstat(css))
                return;

        lockdep_assert_preemption_disabled();

        /*
         * For archs without nmi safe cmpxchg or percpu ops support, ignore
         * the requests from nmi context.
         */
        if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
             !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
                return;

        rstatc = css_rstat_cpu(css, cpu);
        /*
         * If already on list return. This check is racy and smp_mb() is needed
         * to pair it with the smp_mb() in css_process_update_tree() if the
         * guarantee that the updated stats are visible to concurrent flusher is
         * needed.
         */
        if (llist_on_list(&rstatc->lnode))
                return;

        /*
         * This function can be reentered by irqs and nmis for the same cgroup
         * and may try to insert the same per-cpu lnode into the llist. Note
         * that llist_add() does not protect against such scenarios.
         *
         * To protect against such stacked contexts of irqs/nmis, we use the
         * fact that lnode points to itself when not on a list and then use
         * this_cpu_cmpxchg() to atomically set to NULL to select the winner
         * which will call llist_add(). The losers can assume the insertion is
         * successful and the winner will eventually add the per-cpu lnode to
         * the llist.
         */
        self = &rstatc->lnode;
        rstatc_pcpu = css->rstat_cpu;
        if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
                return;

        /* We won the race: queue the node on the list for @css->ss and @cpu. */
        lhead = ss_lhead_cpu(css->ss, cpu);
        llist_add(&rstatc->lnode, lhead);
}

/*
 * Link @css, and each ancestor that is not linked yet, into the updated
 * tree for @cpu. The walk goes upwards and stops at the first css that is
 * already linked, or at the root.
 */
static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
{
        /* put @css and all ancestors on the corresponding updated lists */
        while (true) {
                struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
                struct cgroup_subsys_state *parent = css->parent;
                struct css_rstat_cpu *prstatc;

                /*
                 * Both additions and removals are bottom-up.  If a cgroup
                 * is already in the tree, all ancestors are.
                 */
                if (rstatc->updated_next)
                        break;

                /* Root has no parent to link it to, but mark it busy */
                if (!parent) {
                        rstatc->updated_next = css;
                        break;
                }

                /* Push @css onto the front of the parent's updated_children list. */
                prstatc = css_rstat_cpu(parent, cpu);
                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = css;

                css = parent;
        }
}

/*
 * Drain the lockless update list of @ss for @cpu and link the owner of
 * every removed node into the updated tree.
 */
static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
{
        struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
        struct llist_node *lnode;

        while ((lnode = llist_del_first_init(lhead))) {
                struct css_rstat_cpu *rstatc;

                /*
                 * smp_mb() is needed here (more specifically in between
                 * init_llist_node() and per-cpu stats flushing) if the
                 * guarantee is required by a rstat user where either the
                 * updater should add itself on the lockless list or the
                 * flusher flushes the stats updated by an updater who has
                 * observed that it is already on the list. The
                 * corresponding barrier pair for this one should be before
                 * css_rstat_updated() by the user.
                 *
                 * For now, there isn't any such user, so the barrier is not
                 * added here, but if such a use-case arises, please add
                 * smp_mb() here.
                 */

                rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
                __css_process_update_tree(rstatc->owner, cpu);
        }
}

/**
 * css_rstat_push_children - push children css's into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 * Return: A new singly linked list of css's to be flushed
 *
 * Iteratively traverse down the css_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list via the rstat_flush_next pointer built from the
 * tail backward like "pushing" css's into a stack. The root is pushed by
 * the caller.
 */
static struct cgroup_subsys_state *css_rstat_push_children(
                struct cgroup_subsys_state *head,
                struct cgroup_subsys_state *child, int cpu)
{
        struct cgroup_subsys_state *cnext = child;        /* Next head of child css level */
        struct cgroup_subsys_state *ghead = NULL;        /* Head of grandchild css level */
        struct cgroup_subsys_state *parent, *grandchild;
        struct css_rstat_cpu *crstatc;

        /* The first level's work list consists of @child only. */
        child->rstat_flush_next = NULL;

        /*
         * The subsystem rstat lock must be held for the whole duration from
         * here as the rstat_flush_next list is being constructed to when
         * it is consumed later in css_rstat_flush().
         */
        lockdep_assert_held(ss_rstat_lock(head->ss));

        /*
         * Notation: -> updated_next pointer
         *             => rstat_flush_next pointer
         *
         * Assuming the following sample updated_children lists:
         *  P: C1 -> C2 -> P
         *  C1: G11 -> G12 -> C1
         *  C2: G21 -> G22 -> C2
         *
         * After 1st iteration:
         *  head => C2 => C1 => NULL
         *  ghead => G21 => G11 => NULL
         *
         * After 2nd iteration:
         *  head => G12 => G11 => G22 => G21 => C2 => C1 => NULL
         */
next_level:
        while (cnext) {
                /* Pop one sibling chain off the current level's work list. */
                child = cnext;
                cnext = child->rstat_flush_next;
                parent = child->parent;

                /* updated_next is parent cgroup terminated if !NULL */
                while (child != parent) {
                        child->rstat_flush_next = head;
                        head = child;
                        crstatc = css_rstat_cpu(child, cpu);
                        grandchild = crstatc->updated_children;
                        if (grandchild != child) {
                                /* Push the grand child to the next level */
                                crstatc->updated_children = child;
                                grandchild->rstat_flush_next = ghead;
                                ghead = grandchild;
                        }
                        /* Advance to the next sibling and unlink this one. */
                        child = crstatc->updated_next;
                        crstatc->updated_next = NULL;
                }
        }

        /* Grandchildren collected on this pass form the next level to walk. */
        if (ghead) {
                cnext = ghead;
                ghead = NULL;
                goto next_level;
        }
        return head;
}

/**
 * css_rstat_updated_list - build a list of updated css's to be flushed
 * @root: root of the css subtree to traverse
 * @cpu: target cpu
 * Return: A singly linked list of css's to be flushed
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  During traversal,
 * each returned css is unlinked from the updated tree.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, the child is before its parent in
 * the list.
 *
 * Note that updated_children is self terminated and points to a list of
 * child css's if not empty. Whereas updated_next is like a sibling link
 * within the children list and terminated by the parent css. An exception
 * here is the css root whose updated_next can be self terminated.
 */
static struct cgroup_subsys_state *css_rstat_updated_list(
                struct cgroup_subsys_state *root, int cpu)
{
        struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
        struct cgroup_subsys_state *head = NULL, *parent, *child;

        /* Fold pending lockless updates into the updated tree first. */
        css_process_update_tree(root->ss, cpu);

        /* Return NULL if this subtree is not on-list */
        if (!rstatc->updated_next)
                return NULL;

        /*
         * Unlink @root from its parent. As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         */
        parent = root->parent;
        if (parent) {
                struct css_rstat_cpu *prstatc;
                struct cgroup_subsys_state **nextp;

                prstatc = css_rstat_cpu(parent, cpu);
                nextp = &prstatc->updated_children;
                while (*nextp != root) {
                        struct css_rstat_cpu *nrstatc;

                        nrstatc = css_rstat_cpu(*nextp, cpu);
                        /* Reaching the parent means @root was not on its list. */
                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }
                *nextp = rstatc->updated_next;
        }

        rstatc->updated_next = NULL;

        /* Push @root to the list first before pushing the children */
        head = root;
        root->rstat_flush_next = NULL;
        child = rstatc->updated_children;
        /* Reset @root's children list to its empty (self-terminated) state. */
        rstatc->updated_children = root;
        if (child != root)
                head = css_rstat_push_children(head, child, cpu);

        return head;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for css_rstat_updated() and
 * css_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
                                     struct cgroup *parent, int cpu)
{
        /* Intentionally empty: exists only as an attach point. */
}

__bpf_hook_end();

/*
 * Helper functions for locking.
 *
 * This makes it easier to diagnose locking issues and contention in
 * production environments.  The parameter @cpu_in_loop indicates that the
 * lock was released and re-taken when collecting data from the CPUs. The
 * value -1 is used when obtaining the main lock, else this is the CPU
 * number processed last.
 */
static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
                int cpu_in_loop)
        __acquires(ss_rstat_lock(css->ss))
{
        struct cgroup *cgrp = css->cgroup;
        spinlock_t *lock;
        bool contended;

        lock = ss_rstat_lock(css->ss);
        /* Try the fast path first so contention can be detected and traced. */
        contended = !spin_trylock_irq(lock);
        if (contended) {
                trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
                spin_lock_irq(lock);
        }
        trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}

/*
 * Release the rstat lock selected by @css->ss. The unlock tracepoint is
 * emitted before the lock is dropped; @cpu_in_loop is only passed to the
 * tracepoint.
 */
static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
                                      int cpu_in_loop)
        __releases(ss_rstat_lock(css->ss))
{
        struct cgroup *cgrp = css->cgroup;
        spinlock_t *lock;

        lock = ss_rstat_lock(css->ss);
        trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
        spin_unlock_irq(lock);
}

/**
 * css_rstat_flush - flush stats in @css's rstat subtree
 * @css: target cgroup subsystem state
 *
 * Walk every possible CPU, collect the css's in @css's subtree that were
 * updated on that CPU and flush each of them: base stats plus the bpf hook
 * for a cgroup::self css, the subsystem's css_rstat_flush callback
 * otherwise. After this function returns, all rstat nodes in the subtree
 * have up-to-date ->stat.
 *
 * This also gets all rstat nodes in the subtree including @css off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
{
        bool is_self = css_is_self(css);
        int cpu;

        /*
         * bpf programs can call this function, so refuse a css whose rstat
         * pointers were never initialized.
         */
        if (!css_uses_rstat(css))
                return;

        might_sleep();
        for_each_possible_cpu(cpu) {
                struct cgroup_subsys_state *pos;

                /* Reacquire for each CPU to avoid disabling IRQs too long */
                __css_rstat_lock(css, cpu);

                pos = css_rstat_updated_list(css, cpu);
                while (pos) {
                        if (is_self) {
                                struct cgroup *cgrp = pos->cgroup;

                                cgroup_base_stat_flush(cgrp, cpu);
                                bpf_rstat_flush(cgrp, cgroup_parent(cgrp), cpu);
                        } else {
                                pos->ss->css_rstat_flush(pos, cpu);
                        }
                        pos = pos->rstat_flush_next;
                }

                __css_rstat_unlock(css, cpu);

                if (!cond_resched())
                        cpu_relax();
        }
}

int css_rstat_init(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        int cpu;
        bool is_self = css_is_self(css);

        if (is_self) {
                /* the root cgrp has rstat_base_cpu preallocated */
                if (!cgrp->rstat_base_cpu) {
                        cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu);
                        if (!cgrp->rstat_base_cpu)
                                return -ENOMEM;
                }
        } else if (css->ss->css_rstat_flush == NULL)
                return 0;

        /* the root cgrp's self css has rstat_cpu preallocated */
        if (!css->rstat_cpu) {
                css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
                if (!css->rstat_cpu) {
                        if (is_self)
                                free_percpu(cgrp->rstat_base_cpu);

                        return -ENOMEM;
                }
        }

        /* ->updated_children list is self terminated */
        for_each_possible_cpu(cpu) {
                struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);

                rstatc->owner = rstatc->updated_children = css;
                init_llist_node(&rstatc->lnode);

                if (is_self) {
                        struct cgroup_rstat_base_cpu *rstatbc;

                        rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
                        u64_stats_init(&rstatbc->bsync);
                }
        }

        return 0;
}

/*
 * Tear down the rstat state of @css.
 *
 * Does nothing if @css does not use rstat or has no per-cpu area. Otherwise
 * the css is flushed one last time and its per-cpu areas are freed. If any
 * CPU still shows @css as linked in the updated tree after the flush, a
 * warning is issued and the memory is deliberately left allocated.
 */
void css_rstat_exit(struct cgroup_subsys_state *css)
{
        int cpu;

        if (!css_uses_rstat(css) || !css->rstat_cpu)
                return;

        css_rstat_flush(css);

        /* sanity check */
        for_each_possible_cpu(cpu) {
                struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);

                if (WARN_ON_ONCE(rstatc->updated_children != css))
                        return;
                if (WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        /* only a cgroup::self css owns the base stat area */
        if (css_is_self(css)) {
                free_percpu(css->cgroup->rstat_base_cpu);
                css->cgroup->rstat_base_cpu = NULL;
        }

        free_percpu(css->rstat_cpu);
        css->rstat_cpu = NULL;
}

/**
 * ss_rstat_init - subsystem-specific rstat initialization
 * @ss: target subsystem, or NULL for the base stats
 *
 * With a non-NULL @ss, the subsystem's per-cpu update list is allocated and
 * its lock and list heads are initialized. With a NULL @ss, the static lock
 * and per-cpu list heads used for the base stats are initialized instead.
 *
 * Return: 0 on success, -ENOMEM if the per-cpu list allocation fails.
 */
int __init ss_rstat_init(struct cgroup_subsys *ss)
{
        spinlock_t *lock;
        int cpu;

        if (ss) {
                ss->lhead = alloc_percpu(struct llist_head);
                if (!ss->lhead)
                        return -ENOMEM;
        }

        lock = ss_rstat_lock(ss);
        spin_lock_init(lock);

        for_each_possible_cpu(cpu) {
                init_llist_head(ss_lhead_cpu(ss, cpu));
        }

        return 0;
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
/* Add every counter of @src_bstat to the matching counter of @dst_bstat. */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
        dst_bstat->ntime += src_bstat->ntime;
}

/* Subtract every counter of @src_bstat from the matching counter of @dst_bstat. */
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
        dst_bstat->ntime -= src_bstat->ntime;
}

/*
 * Flush @cgrp's base stats recorded on @cpu.
 *
 * The change in the per-cpu counters since the previous flush is added to
 * @cgrp's own totals and to its per-cpu subtree totals. If @cgrp's parent is
 * not the root, the change in those totals since the previous flush is then
 * added to the parent. Each last_* snapshot is advanced by the same delta so
 * that the next flush only sees new activity.
 */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_base_cpu *prstatbc;
        struct cgroup_base_stat delta;
        unsigned seq;

        /* Root-level stats are sourced from system-wide CPU stats */
        if (!parent)
                return;

        /* fetch the current per-cpu values */
        do {
                seq = __u64_stats_fetch_begin(&rstatbc->bsync);
                delta = rstatbc->bstat;
        } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));

        /* propagate per-cpu delta to cgroup and per-cpu global statistics */
        cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
        cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);

        /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
        if (cgroup_parent(parent)) {
                /* cgroup-wide totals: what @cgrp gained since it last reported */
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);

                /* per-cpu subtree totals: same scheme, on the parent's @cpu entry */
                delta = rstatbc->subtree_bstat;
                prstatbc = cgroup_rstat_base_cpu(parent, cpu);
                cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
                cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
                cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
        }
}

/*
 * Open an update section on @cgrp's per-cpu base stats.
 *
 * Returns the per-cpu stat block obtained through get_cpu_ptr() and stores
 * the saved IRQ state in @flags. The caller must hand both back to
 * cgroup_base_stat_cputime_account_end().
 */
static struct cgroup_rstat_base_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
        struct cgroup_rstat_base_cpu *pcpu_stat = get_cpu_ptr(cgrp->rstat_base_cpu);

        *flags = u64_stats_update_begin_irqsave(&pcpu_stat->bsync);

        return pcpu_stat;
}

/*
 * Close an update section opened by cgroup_base_stat_cputime_account_begin().
 *
 * @cgrp:    cgroup whose per-cpu stats were modified
 * @rstatbc: per-cpu stat block returned by the matching _begin() call
 * @flags:   IRQ state saved by the matching _begin() call
 */
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_base_cpu *rstatbc,
                                                 unsigned long flags)
{
        /* finish the u64_stats write section and restore the saved IRQ state */
        u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
        /* report that @cgrp has an update on the current CPU */
        css_rstat_updated(&cgrp->self, smp_processor_id());
        /* pairs with get_cpu_ptr() in the _begin() helper */
        put_cpu_ptr(rstatbc);
}

/*
 * Add @delta_exec to the sum_exec_runtime counter in @cgrp's per-cpu base
 * stats for the current CPU.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        unsigned long irq_flags;
        struct cgroup_rstat_base_cpu *pcpu_stat =
                cgroup_base_stat_cputime_account_begin(cgrp, &irq_flags);

        pcpu_stat->bstat.cputime.sum_exec_runtime += delta_exec;

        cgroup_base_stat_cputime_account_end(cgrp, pcpu_stat, irq_flags);
}

/*
 * Add @delta_exec to the per-cpu base stat counter of @cgrp selected by
 * @index:
 *
 *   CPUTIME_USER                -> utime
 *   CPUTIME_NICE                -> ntime and utime
 *   CPUTIME_SYSTEM/IRQ/SOFTIRQ  -> stime
 *   CPUTIME_FORCEIDLE           -> forceidle_sum (CONFIG_SCHED_CORE only)
 *
 * Any other index leaves the counters untouched.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        unsigned long irq_flags;
        struct cgroup_rstat_base_cpu *pcpu_stat =
                cgroup_base_stat_cputime_account_begin(cgrp, &irq_flags);
        struct cgroup_base_stat *stat = &pcpu_stat->bstat;

        switch (index) {
        case CPUTIME_USER:
                stat->cputime.utime += delta_exec;
                break;
        case CPUTIME_NICE:
                /* nice time is tracked on its own and also counts as user time */
                stat->ntime += delta_exec;
                stat->cputime.utime += delta_exec;
                break;
        case CPUTIME_SOFTIRQ:
        case CPUTIME_IRQ:
        case CPUTIME_SYSTEM:
                stat->cputime.stime += delta_exec;
                break;
#ifdef CONFIG_SCHED_CORE
        case CPUTIME_FORCEIDLE:
                stat->forceidle_sum += delta_exec;
                break;
#endif
        default:
                break;
        }

        cgroup_base_stat_cputime_account_end(cgrp, pcpu_stat, irq_flags);
}

/*
 * Build the root cgroup's base stats from the global per-cpu kernel_cpustat
 * data. @bstat is zeroed first and then, for every possible CPU, the fields
 * are grouped the same way __cgroup_account_cputime_field() groups them for
 * non-root cgroups: USER + NICE count as user time, SYSTEM + IRQ + SOFTIRQ as
 * system time, and their total as sum_exec_runtime.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
        struct task_cputime *total = &bstat->cputime;
        int cpu;

        memset(bstat, 0, sizeof(*bstat));

        for_each_possible_cpu(cpu) {
                struct kernel_cpustat snapshot;
                u64 *stat = snapshot.cpustat;
                u64 user_sum, sys_sum;

                kcpustat_cpu_fetch(&snapshot, cpu);

                user_sum = stat[CPUTIME_USER] + stat[CPUTIME_NICE];
                sys_sum = stat[CPUTIME_SYSTEM] + stat[CPUTIME_IRQ] +
                          stat[CPUTIME_SOFTIRQ];

                total->utime += user_sum;
                total->stime += sys_sum;
                total->sum_exec_runtime += user_sum + sys_sum;

#ifdef CONFIG_SCHED_CORE
                bstat->forceidle_sum += stat[CPUTIME_FORCEIDLE];
#endif
                bstat->ntime += stat[CPUTIME_NICE];
        }
}


/*
 * Print the "core_sched.force_idle_usec" line for @bstat to @seq. The stored
 * sum is divided by NSEC_PER_USEC before printing. Emits nothing when
 * CONFIG_SCHED_CORE is not set.
 */
static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
{
#ifdef CONFIG_SCHED_CORE
        u64 idle_usec = bstat->forceidle_sum;

        /* do_div() divides its first argument in place */
        do_div(idle_usec, NSEC_PER_USEC);
        seq_printf(seq, "core_sched.force_idle_usec %llu\n", idle_usec);
#endif
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        struct cgroup_base_stat bstat;

        if (cgroup_parent(cgrp)) {
                css_rstat_flush(&cgrp->self);
                __css_rstat_lock(&cgrp->self, -1);
                bstat = cgrp->bstat;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &bstat.cputime.utime, &bstat.cputime.stime);
                __css_rstat_unlock(&cgrp->self, -1);
        } else {
                root_cgroup_cputime(&bstat);
        }

        do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
        do_div(bstat.cputime.utime, NSEC_PER_USEC);
        do_div(bstat.cputime.stime, NSEC_PER_USEC);
        do_div(bstat.ntime, NSEC_PER_USEC);

        seq_printf(seq, "usage_usec %llu\n"
                        "user_usec %llu\n"
                        "system_usec %llu\n"
                        "nice_usec %llu\n",
                        bstat.cputime.sum_exec_runtime,
                        bstat.cputime.utime,
                        bstat.cputime.stime,
                        bstat.ntime);

        cgroup_force_idle_show(seq, &bstat);
}

/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */
/* css_rstat_flush is additionally flagged KF_SLEEPABLE. */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, css_rstat_updated)
BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)

/* kfunc id set handed to register_btf_kfunc_id_set() below */
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
        .owner          = THIS_MODULE,
        .set            = &bpf_rstat_kfunc_ids,
};

/*
 * Register the rstat kfunc set for BPF_PROG_TYPE_TRACING programs.
 * Runs as a late_initcall and returns register_btf_kfunc_id_set()'s result.
 */
static int __init bpf_rstat_kfunc_init(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
                                         &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);




















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_RWSEM_H
#define _LINUX_PERCPU_RWSEM_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcuwait.h>
#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>

/*
 * Reader/writer semaphore whose reader count is kept per CPU.
 */
struct percpu_rw_semaphore {
        /* readers test this with rcu_sync_is_idle() to pick the fast path */
        struct rcu_sync                rss;
        /* per-CPU reader count, bumped by down_read and dropped by up_read */
        unsigned int __percpu        *read_count;
        /* woken by percpu_up_read() on its slow path */
        struct rcuwait                writer;
        /* wait queue head; its users are not visible in this header */
        wait_queue_head_t        waiters;
        /* non-zero is reported as "write locked" by percpu_is_write_locked() */
        atomic_t                block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* lockdep map handed to the lockdep annotations in this header */
        struct lockdep_map        dep_map;
#endif
};

/*
 * Initializer fragment for ->dep_map; expands to nothing when
 * CONFIG_DEBUG_LOCK_ALLOC is off because the field does not exist then.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)        .dep_map = { .name = #lockname },
#else
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
#endif

/*
 * Statically define a percpu_rw_semaphore called @name together with its
 * per-CPU reader counter (__percpu_rwsem_rc_<name>). @is_static is pasted in
 * front of the semaphore definition, so it is either "static" or empty.
 */
#define __DEFINE_PERCPU_RWSEM(name, is_static)                                \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);                \
is_static struct percpu_rw_semaphore name = {                                \
        .rss = __RCU_SYNC_INITIALIZER(name.rss),                        \
        .read_count = &__percpu_rwsem_rc_##name,                        \
        .writer = __RCUWAIT_INITIALIZER(name.writer),                        \
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),                \
        .block = ATOMIC_INIT(0),                                        \
        __PERCPU_RWSEM_DEP_MAP_INIT(name)                                \
}

/* Non-static and static flavours of the definition above. */
#define DEFINE_PERCPU_RWSEM(name)                \
        __DEFINE_PERCPU_RWSEM(name, /* not static */)
#define DEFINE_STATIC_PERCPU_RWSEM(name)        \
        __DEFINE_PERCPU_RWSEM(name, static)

extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool, bool);

/*
 * Take @sem for reading. May sleep.
 *
 * Fast path: while the rcu_sync state is idle, only this CPU's reader count
 * is incremented. Otherwise the out-of-line __percpu_down_read() is called
 * with try == false and @freezable passed through; its return value is
 * ignored here.
 */
static inline void percpu_down_read_internal(struct percpu_rw_semaphore *sem,
                                             bool freezable)
{
        might_sleep();

        rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

        preempt_disable();
        /*
         * We are in an RCU-sched read-side critical section, so the writer
         * cannot both change sem->state from readers_fast and start checking
         * counters while we are here. So if we see !sem->state, we know that
         * the writer won't be checking until we're past the preempt_enable()
         * and that once the synchronize_rcu() is done, the writer will see
         * anything we did within this RCU-sched read-side critical section.
         *
         * NOTE(review): "sem->state" / "readers_fast" do not match any field
         * of struct percpu_rw_semaphore; the check below uses
         * rcu_sync_is_idle(&sem->rss) - confirm and update the wording.
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                __percpu_down_read(sem, false, freezable); /* Unconditional memory barrier */
        /*
         * The preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */
        preempt_enable();
}

/* Take @sem for reading with the freezable flag set to false. */
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
        percpu_down_read_internal(sem, false);
}

/*
 * Take @sem for reading; @freeze is forwarded as the freezable flag of
 * percpu_down_read_internal().
 */
static inline void percpu_down_read_freezable(struct percpu_rw_semaphore *sem,
                                              bool freeze)
{
        percpu_down_read_internal(sem, freeze);
}

/*
 * Try to take @sem for reading.
 *
 * Returns true when the read lock was taken. The fast path always succeeds;
 * on the slow path the result of __percpu_down_read() (called with
 * try == true) is returned. Lockdep is only told about the acquisition on
 * success.
 */
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        bool ret = true;

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                ret = __percpu_down_read(sem, true, false); /* Unconditional memory barrier */
        preempt_enable();
        /*
         * The barrier() from preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */

        if (ret)
                rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);

        return ret;
}

/*
 * Release a read lock on @sem.
 *
 * Fast path (rcu_sync idle): only this CPU's reader count is decremented.
 * Slow path: a full barrier is issued before the decrement and the rcuwait
 * in ->writer is then woken.
 */
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
        rwsem_release(&sem->dep_map, _RET_IP_);

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss))) {
                this_cpu_dec(*sem->read_count);
        } else {
                /*
                 * slowpath; reader will only ever wake a single blocked
                 * writer.
                 */
                smp_mb(); /* B matches C */
                /*
                 * In other words, if they see our decrement (presumably to
                 * aggregate zero, as that is the only time it matters) they
                 * will also see our critical section.
                 */
                this_cpu_dec(*sem->read_count);
                rcuwait_wake_up(&sem->writer);
        }
        preempt_enable();
}

extern bool percpu_is_read_locked(struct percpu_rw_semaphore *);
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);

/*
 * Guard definitions built with the <linux/cleanup.h> macros:
 *   percpu_read      - percpu_down_read() paired with percpu_up_read()
 *   percpu_read_try  - conditional variant using percpu_down_read_trylock()
 *   percpu_write     - percpu_down_write() paired with percpu_up_write()
 */
DEFINE_GUARD(percpu_read, struct percpu_rw_semaphore *,
             percpu_down_read(_T), percpu_up_read(_T))
DEFINE_GUARD_COND(percpu_read, _try, percpu_down_read_trylock(_T))

DEFINE_GUARD(percpu_write, struct percpu_rw_semaphore *,
             percpu_down_write(_T), percpu_up_write(_T))

/* Report whether @sem's ->block counter is currently non-zero. */
static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem)
{
        return atomic_read(&sem->block) != 0;
}

/* Out-of-line init and teardown; their definitions are not in this header. */
extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
                                const char *, struct lock_class_key *);

extern void percpu_free_rwsem(struct percpu_rw_semaphore *);

/*
 * Initialize @sem at run time. Each expansion site gets its own static
 * lock_class_key, and the stringified argument is passed as the name.
 * Evaluates to the return value of __percpu_init_rwsem().
 */
#define percpu_init_rwsem(sem)                                        \
({                                                                \
        static struct lock_class_key rwsem_key;                        \
        __percpu_init_rwsem(sem, #sem, &rwsem_key);                \
})

#define percpu_rwsem_is_held(sem)        lockdep_is_held(sem)
#define percpu_rwsem_assert_held(sem)        lockdep_assert_held(sem)

/*
 * Tell lockdep that @sem is released, attributing the release to @ip.
 * Only the lockdep map is touched; the semaphore state is not changed.
 */
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
                                        unsigned long ip)
{
        lock_release(&sem->dep_map, ip);
}

/*
 * Tell lockdep that @sem is acquired, attributing it to @ip; @read is passed
 * through as lock_acquire()'s read argument. Only the lockdep map is touched;
 * the semaphore state is not changed.
 */
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
}

#endif
















































































































































































































  317 















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JUMP_LABEL_H
#define _LINUX_JUMP_LABEL_H

/*
 * Jump label support
 *
 * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com>
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 *
 * DEPRECATED API:
 *
 * The use of 'struct static_key' directly, is now DEPRECATED. In addition
 * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following:
 *
 * struct static_key false = STATIC_KEY_INIT_FALSE;
 * struct static_key true = STATIC_KEY_INIT_TRUE;
 * static_key_true()
 * static_key_false()
 *
 * The updated API replacements are:
 *
 * DEFINE_STATIC_KEY_TRUE(key);
 * DEFINE_STATIC_KEY_FALSE(key);
 * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
 * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count);
 * static_branch_likely()
 * static_branch_unlikely()
 *
 * Jump labels provide an interface to generate dynamic branches using
 * self-modifying code. Assuming toolchain and architecture support, if we
 * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)",
 * an "if (static_branch_unlikely(&key))" statement is an unconditional branch
 * (which defaults to false - and the true block is placed out of line).
 * Similarly, we can define an initially true key via
 * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same
 * "if (static_branch_unlikely(&key))", in which case we will generate an
 * unconditional branch to the out-of-line true branch. Keys that are
 * initially true or false can be used in both static_branch_unlikely()
 * and static_branch_likely() statements.
 *
 * At runtime we can change the branch target by setting the key
 * to true via a call to static_branch_enable(), or false using
 * static_branch_disable(). If the direction of the branch is switched by
 * these calls then we run-time modify the branch target via a
 * no-op -> jump or jump -> no-op conversion. For example, for an
 * initially false key that is used in an "if (static_branch_unlikely(&key))"
 * statement, setting the key to true requires us to patch in a jump
 * to the out-of-line true branch.
 *
 * In addition to static_branch_{enable,disable}, we can also reference count
 * the key or branch direction via static_branch_{inc,dec}. Thus,
 * static_branch_inc() can be thought of as a 'make more true' and
 * static_branch_dec() as a 'make more false'.
 *
 * Since this relies on modifying code, the branch modifying functions
 * must be considered absolute slow paths (machine wide synchronization etc.).
 * OTOH, since the affected branches are unconditional, their runtime overhead
 * will be absolutely minimal, esp. in the default (off) case where the total
 * effect is a single NOP of appropriate size. The on case will patch in a jump
 * to the out-of-line block.
 *
 * When the control is directly exposed to userspace, it is prudent to delay the
 * decrement to avoid high frequency code modifications which can (and do)
 * cause significant performance degradation. Struct static_key_deferred and
 * static_key_slow_dec_deferred() provide for this.
 *
 * Lacking toolchain and or architecture support, static keys fall back to a
 * simple conditional branch.
 *
 * Additional babbling in: Documentation/staging/static-keys.rst
 */

#ifndef __ASSEMBLY__

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/cleanup.h>

extern bool static_key_initialized;

/*
 * Emit a WARN naming the calling function and @key when a static key is
 * touched while static_key_initialized is still false.
 */
#define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized,                      \
                                    "%s(): static key '%pS' used before call to jump_label_init()", \
                                    __func__, (key))

/*
 * Core static key object: an enable count plus, with CONFIG_JUMP_LABEL, a
 * word whose low bits carry flags next to a pointer.
 */
struct static_key {
        /* values > 0 are reported as "enabled" by static_key_enabled() */
        atomic_t enabled;
#ifdef CONFIG_JUMP_LABEL
/*
 * Note:
 *   To make anonymous unions work with old compilers, the static
 *   initialization of them requires brackets. This creates a dependency
 *   on the order of the struct with the initializers. If any fields
 *   are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need
 *   to be modified.
 *
 * bit 0 => 1 if key is initially true
 *            0 if initially false
 * bit 1 => 1 if points to struct static_key_mod
 *            0 if points to struct jump_entry
 */
        union {
                unsigned long type;
                struct jump_entry *entries;
                struct static_key_mod *next;
        };
#endif        /* CONFIG_JUMP_LABEL */
};

#endif /* __ASSEMBLY__ */

#ifdef CONFIG_JUMP_LABEL
#include <asm/jump_label.h>

#ifndef __ASSEMBLY__
#ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE

/*
 * Jump table entry, CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE layout: each field
 * is an offset relative to its own address (see the jump_entry_*() accessors
 * below). The two low bits of ->key are used as flags.
 */
struct jump_entry {
        s32 code;
        s32 target;
        long key;        // key may be far away from the core kernel under KASLR
};

/* Resolve the self-relative ->code offset of @entry to an address. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->code;

        return field_addr + entry->code;
}

/* Resolve the self-relative ->target offset of @entry to an address. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->target;

        return field_addr + entry->target;
}

/*
 * Resolve the self-relative ->key offset of @entry to the static_key it
 * refers to. The two low bits of ->key are flags and are masked off first.
 */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->key;
        long rel = entry->key & ~3L;

        return (struct static_key *)(field_addr + rel);
}

#else

/* Non-relative layout: ->code is returned as-is. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        return entry->code;
}

/* Non-relative layout: ->target is returned as-is. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        return entry->target;
}

/*
 * Non-relative layout: ->key holds the static_key pointer with two flag bits
 * in its low bits; strip them and return the pointer.
 */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long raw = (unsigned long)entry->key;

        return (struct static_key *)(raw & ~3UL);
}

#endif

/* Test bit 0 of @entry's ->key word. */
static inline bool jump_entry_is_branch(const struct jump_entry *entry)
{
        unsigned long raw = (unsigned long)entry->key;

        return (raw & 1UL) != 0;
}

/* Test bit 1 of @entry's ->key word (see jump_entry_set_init()). */
static inline bool jump_entry_is_init(const struct jump_entry *entry)
{
        unsigned long raw = (unsigned long)entry->key;

        return (raw & 2UL) != 0;
}

/* Set (@set == true) or clear bit 1 of @entry's ->key word. */
static inline void jump_entry_set_init(struct jump_entry *entry, bool set)
{
        entry->key = set ? (entry->key | 2) : (entry->key & ~2);
}

/*
 * Size for @entry: the constant JUMP_LABEL_NOP_SIZE when that macro is
 * defined, otherwise whatever arch_jump_entry_size() reports for @entry.
 */
static inline int jump_entry_size(struct jump_entry *entry)
{
#ifdef JUMP_LABEL_NOP_SIZE
        return JUMP_LABEL_NOP_SIZE;
#else
        return arch_jump_entry_size(entry);
#endif
}

#endif
#endif

#ifndef __ASSEMBLY__

/* Instruction kind at a jump label site: a NOP or a jump. */
enum jump_label_type {
        JUMP_LABEL_NOP = 0,
        JUMP_LABEL_JMP,
};

struct module;

#ifdef CONFIG_JUMP_LABEL

/*
 * Flag values for the low bits of static_key::type: bit 0 is the initial
 * value, bit 1 marks a pointer to struct static_key_mod, and
 * JUMP_TYPE_MASK covers both bits.
 */
#define JUMP_TYPE_FALSE                0UL
#define JUMP_TYPE_TRUE                1UL
#define JUMP_TYPE_LINKED        2UL
#define JUMP_TYPE_MASK                3UL

/*
 * DEPRECATED (see the header comment): use static_branch_unlikely().
 * Returns the result of arch_static_branch() with branch == false.
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        return arch_static_branch(key, false);
}

/*
 * DEPRECATED (see the header comment): use static_branch_likely().
 * Returns the negated result of arch_static_branch() with branch == true.
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        return !arch_static_branch(key, true);
}

extern struct jump_entry __start___jump_table[];
extern struct jump_entry __stop___jump_table[];

extern void jump_label_init(void);
extern void jump_label_init_ro(void);
extern void jump_label_lock(void);
extern void jump_label_unlock(void);
extern void arch_jump_label_transform(struct jump_entry *entry,
                                      enum jump_label_type type);
extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
                                            enum jump_label_type type);
extern void arch_jump_label_transform_apply(void);
extern int jump_label_text_reserved(void *start, void *end);
extern bool static_key_slow_inc(struct static_key *key);
extern bool static_key_fast_inc_not_disabled(struct static_key *key);
extern void static_key_slow_dec(struct static_key *key);
extern bool static_key_slow_inc_cpuslocked(struct static_key *key);
extern void static_key_slow_dec_cpuslocked(struct static_key *key);
extern int static_key_count(struct static_key *key);
extern void static_key_enable(struct static_key *key);
extern void static_key_disable(struct static_key *key);
extern void static_key_enable_cpuslocked(struct static_key *key);
extern void static_key_disable_cpuslocked(struct static_key *key);
extern enum jump_label_type jump_label_init_type(struct jump_entry *entry);

/*
 * We should be using ATOMIC_INIT() for initializing .enabled, but
 * the inclusion of atomic.h is problematic for inclusion of jump_label.h
 * in 'low-level' headers. Thus, we are initializing .enabled with a
 * raw value, but have added a BUILD_BUG_ON() to catch any issues in
 * jump_label_init() see: kernel/jump_label.c.
 *
 * The second, unnamed brace pair initializes the anonymous union in
 * struct static_key, so these initializers depend on the field order.
 */
#define STATIC_KEY_INIT_TRUE                                        \
        { .enabled = { 1 },                                        \
          { .type = JUMP_TYPE_TRUE } }
#define STATIC_KEY_INIT_FALSE                                        \
        { .enabled = { 0 },                                        \
          { .type = JUMP_TYPE_FALSE } }

#else  /* !CONFIG_JUMP_LABEL */

#include <linux/atomic.h>
#include <linux/bug.h>

/* !CONFIG_JUMP_LABEL: return the current value of @key's enable count. */
static __always_inline int static_key_count(struct static_key *key)
{
        return raw_atomic_read(&key->enabled);
}

/*
 * !CONFIG_JUMP_LABEL: there is nothing to set up, so only record that
 * static keys may now be used (checked by STATIC_KEY_CHECK_USE()).
 */
static __always_inline void jump_label_init(void)
{
        static_key_initialized = true;
}

/* !CONFIG_JUMP_LABEL: no-op stub. */
static __always_inline void jump_label_init_ro(void) { }

/*
 * !CONFIG_JUMP_LABEL fallback: true when @key's count is positive, with the
 * test hinted as unlikely.
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        return unlikely_notrace(static_key_count(key) > 0) ? true : false;
}

/*
 * !CONFIG_JUMP_LABEL fallback: true when @key's count is positive, with the
 * test hinted as likely.
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        return likely_notrace(static_key_count(key) > 0) ? true : false;
}

/*
 * !CONFIG_JUMP_LABEL: increment @key's enable count with a cmpxchg loop.
 *
 * Returns false, leaving the count untouched, when the count is negative or
 * incrementing it would make it negative; returns true once the increment
 * has been applied.
 */
static inline bool static_key_fast_inc_not_disabled(struct static_key *key)
{
        int v;

        STATIC_KEY_CHECK_USE(key);
        /*
         * Prevent key->enabled getting negative to follow the same semantics
         * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment.
         */
        v = atomic_read(&key->enabled);
        do {
                if (v < 0 || (v + 1) < 0)
                        return false;
                /* the loop re-checks v, which is passed by address to the cmpxchg */
        } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
        return true;
}
/* Without jump labels the "slow" increment is the same operation. */
#define static_key_slow_inc(key)        static_key_fast_inc_not_disabled(key)

/* !CONFIG_JUMP_LABEL: decrement @key's enable count. */
static inline void static_key_slow_dec(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        atomic_dec(&key->enabled);
}

/* The _cpuslocked variants map straight onto the plain inc/dec here. */
#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key)
#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key)

/* !CONFIG_JUMP_LABEL stub: always returns 0 for any [@start, @end] range. */
static inline int jump_label_text_reserved(void *start, void *end)
{
        return 0;
}

/* !CONFIG_JUMP_LABEL stubs: locking is a no-op. */
static inline void jump_label_lock(void) {}
static inline void jump_label_unlock(void) {}

/*
 * !CONFIG_JUMP_LABEL: set @key's count to 1 if it is currently 0. If the
 * count is already non-zero it is left alone, and a one-time warning is
 * raised unless it is exactly 1.
 */
static inline void static_key_enable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 0) {
                atomic_set(&key->enabled, 1);
                return;
        }

        /* already non-zero: any value other than 1 is unexpected here */
        WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
}

/*
 * !CONFIG_JUMP_LABEL: set @key's count to 0 if it is currently 1. For any
 * other value the count is left alone, and a one-time warning is raised
 * unless it is already 0.
 */
static inline void static_key_disable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 1) {
                atomic_set(&key->enabled, 0);
                return;
        }

        /* not exactly 1: any value other than 0 is unexpected here */
        WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
}

/* !CONFIG_JUMP_LABEL: the _cpuslocked variants are plain aliases. */
#define static_key_enable_cpuslocked(k)                static_key_enable((k))
#define static_key_disable_cpuslocked(k)        static_key_disable((k))

/* !CONFIG_JUMP_LABEL initializers: only the enable count exists. */
#define STATIC_KEY_INIT_TRUE        { .enabled = ATOMIC_INIT(1) }
#define STATIC_KEY_INIT_FALSE        { .enabled = ATOMIC_INIT(0) }

#endif        /* CONFIG_JUMP_LABEL */

/* Lock guard pairing jump_label_lock() with jump_label_unlock(). */
DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock())

/* Default initializer is the "false" key; jump_label_enabled is an alias. */
#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
#define jump_label_enabled static_key_enabled

/* -------------------------------------------------------------------------- */

/*
 * Two type wrappers around static_key, such that we can use compile time
 * type differentiation to emit the right code.
 *
 * All the below code is macros in order to play type games.
 */

/* Wrapper type marking a key whose initial value is true. */
struct static_key_true {
        struct static_key key;
};

/* Wrapper type marking a key whose initial value is false. */
struct static_key_false {
        struct static_key key;
};

/* Compound-literal initializers for the two wrapper types. */
#define STATIC_KEY_TRUE_INIT  (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE,  }
#define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, }

/*
 * Definition and declaration helpers. DEFINE_* defines a key named @name
 * with the matching initial value, the _RO variants add __ro_after_init,
 * and DECLARE_* emits the corresponding extern declaration.
 */
#define DEFINE_STATIC_KEY_TRUE(name)        \
        struct static_key_true name = STATIC_KEY_TRUE_INIT

#define DEFINE_STATIC_KEY_TRUE_RO(name)        \
        struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT

#define DECLARE_STATIC_KEY_TRUE(name)        \
        extern struct static_key_true name

#define DEFINE_STATIC_KEY_FALSE(name)        \
        struct static_key_false name = STATIC_KEY_FALSE_INIT

#define DEFINE_STATIC_KEY_FALSE_RO(name)        \
        struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT

#define DECLARE_STATIC_KEY_FALSE(name)        \
        extern struct static_key_false name

/*
 * Define an array of @count keys named @name, every element initialized to
 * true (or false) through a range designator.
 */
#define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count)                \
        struct static_key_true name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT,        \
        }

#define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count)                \
        struct static_key_false name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT,        \
        }

/*
 * Config-dependent variants: IS_ENABLED(cfg) is pasted onto the helper name,
 * which selects the _1 (TRUE) or _0 (FALSE) flavour.
 */
#define _DEFINE_STATIC_KEY_1(name)        DEFINE_STATIC_KEY_TRUE(name)
#define _DEFINE_STATIC_KEY_0(name)        DEFINE_STATIC_KEY_FALSE(name)
#define DEFINE_STATIC_KEY_MAYBE(cfg, name)                        \
        __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name)

#define _DEFINE_STATIC_KEY_RO_1(name)        DEFINE_STATIC_KEY_TRUE_RO(name)
#define _DEFINE_STATIC_KEY_RO_0(name)        DEFINE_STATIC_KEY_FALSE_RO(name)
#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name)                        \
        __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name)

#define _DECLARE_STATIC_KEY_1(name)        DECLARE_STATIC_KEY_TRUE(name)
#define _DECLARE_STATIC_KEY_0(name)        DECLARE_STATIC_KEY_FALSE(name)
#define DECLARE_STATIC_KEY_MAYBE(cfg, name)                        \
        __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name)

/*
 * Referenced only from branches taken for an unsupported key type.
 * NOTE(review): presumably left undefined on purpose so that misuse fails at
 * link time - confirm; no definition is visible from this header.
 */
extern bool ____wrong_branch_error(void);

/*
 * Evaluates to true when the key @x points to has a count > 0. @x may point
 * to a struct static_key, static_key_true or static_key_false; any other
 * pointee type reaches ____wrong_branch_error().
 */
#define static_key_enabled(x)                                                        \
({                                                                                \
        if (!__builtin_types_compatible_p(typeof(*x), struct static_key) &&        \
            !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
            !__builtin_types_compatible_p(typeof(*x), struct static_key_false))        \
                ____wrong_branch_error();                                        \
        static_key_count((struct static_key *)x) > 0;                                \
})

#ifdef CONFIG_JUMP_LABEL

/*
 * Combine the right initial value (type) with the right branch order
 * to generate the desired result.
 *
 *
 * type\branch|        likely (1)              |        unlikely (0)
 * -----------+-----------------------+------------------
 *            |                       |
 *  true (1)  |           ...                      |           ...
 *            |    NOP                      |           JMP L
 *            |    <br-stmts>              |        1: ...
 *            |        L: ...                      |
 *            |                              |
 *            |                              |        L: <br-stmts>
 *            |                              |           jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *            |                       |
 *  false (0) |           ...                      |           ...
 *            |    JMP L              |           NOP
 *            |    <br-stmts>              |        1: ...
 *            |        L: ...                      |
 *            |                              |
 *            |                              |        L: <br-stmts>
 *            |                              |           jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *
 * The initial value is encoded in the LSB of static_key::entries,
 * type: 0 = false, 1 = true.
 *
 * The branch type is encoded in the LSB of jump_entry::key,
 * branch: 0 = unlikely, 1 = likely.
 *
 * This gives the following logic table:
 *
 *        enabled        type        branch          instruction
 * -----------------------------+-----------
 *        0        0        0        | NOP
 *        0        0        1        | JMP
 *        0        1        0        | NOP
 *        0        1        1        | JMP
 *
 *        1        0        0        | JMP
 *        1        0        1        | NOP
 *        1        1        0        | JMP
 *        1        1        1        | NOP
 *
 * Which gives the following functions:
 *
 *   dynamic: instruction = enabled ^ branch
 *   static:  instruction = type ^ branch
 *
 * See jump_label_type() / jump_label_init_type().
 */

/*
 * static_branch_likely(x) - test a static key, hinting that it is usually true.
 *
 * The pointee type of @x is checked with __builtin_types_compatible_p():
 *   - struct static_key_true:  uses arch_static_branch(&(x)->key, true)
 *   - struct static_key_false: uses arch_static_branch_jump(&(x)->key, true)
 *   - anything else:           ends up in ____wrong_branch_error()
 *
 * The arch helper's result is negated, and the value of the statement
 * expression is likely_notrace() of that.
 */
#define static_branch_likely(x)                                                        \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*x), struct static_key_true))        \
                branch = !arch_static_branch(&(x)->key, true);                        \
        else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
                branch = !arch_static_branch_jump(&(x)->key, true);                \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        likely_notrace(branch);                                                                \
})

/*
 * static_branch_unlikely(x) - test a static key, hinting that it is usually
 * false.
 *
 * The pointee type of @x is checked with __builtin_types_compatible_p():
 *   - struct static_key_true:  uses arch_static_branch_jump(&(x)->key, false)
 *   - struct static_key_false: uses arch_static_branch(&(x)->key, false)
 *   - anything else:           ends up in ____wrong_branch_error()
 *
 * Unlike static_branch_likely() the arch helper's result is used as is; the
 * value of the statement expression is unlikely_notrace() of it.
 */
#define static_branch_unlikely(x)                                                \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*x), struct static_key_true))        \
                branch = arch_static_branch_jump(&(x)->key, false);                \
        else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
                branch = arch_static_branch(&(x)->key, false);                        \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        unlikely_notrace(branch);                                                        \
})

#else /* !CONFIG_JUMP_LABEL */

/*
 * Fallback definitions: both helpers reduce to static_key_enabled() on the
 * embedded key, wrapped in the matching likely/unlikely hint.
 */
#define static_branch_likely(x)                likely_notrace(static_key_enabled(&(x)->key))
#define static_branch_unlikely(x)        unlikely_notrace(static_key_enabled(&(x)->key))

#endif /* CONFIG_JUMP_LABEL */

/*
 * static_branch_maybe(config, x) - choose the branch hint from a config option.
 *
 * Expands to static_branch_likely(x) when IS_ENABLED(config) is true and to
 * static_branch_unlikely(x) otherwise.
 */
#define static_branch_maybe(config, x)                                        \
        (IS_ENABLED(config) ? static_branch_likely(x)                        \
                            : static_branch_unlikely(x))

/*
 * Advanced usage; refcount, branch is enabled when: count != 0
 *
 * Each macro forwards the embedded key, &(x)->key, to the static_key_slow_*()
 * function of the matching name.
 */

#define static_branch_inc(x)                static_key_slow_inc(&(x)->key)
#define static_branch_dec(x)                static_key_slow_dec(&(x)->key)
#define static_branch_inc_cpuslocked(x)        static_key_slow_inc_cpuslocked(&(x)->key)
#define static_branch_dec_cpuslocked(x)        static_key_slow_dec_cpuslocked(&(x)->key)

/*
 * Normal usage; boolean enable/disable.
 *
 * Each macro forwards the embedded key, &(x)->key, to the static_key_*()
 * function of the matching name.
 */

#define static_branch_enable(x)                        static_key_enable(&(x)->key)
#define static_branch_disable(x)                static_key_disable(&(x)->key)
#define static_branch_enable_cpuslocked(x)        static_key_enable_cpuslocked(&(x)->key)
#define static_branch_disable_cpuslocked(x)        static_key_disable_cpuslocked(&(x)->key)

#endif /* __ASSEMBLY__ */

#endif        /* _LINUX_JUMP_LABEL_H */






















































































































  316 


























































































  316 
  316 
  315 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Latched RB-trees
 *
 * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org>
 *
 * Since RB-trees have non-atomic modifications they're not immediately suited
 * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for
 * lockless lookups; we cannot guarantee they return a correct result.
 *
 * The simplest solution is a seqlock + RB-tree, this will allow lockless
 * lookups; but has the constraint (inherent to the seqlock) that read sides
 * cannot nest in write sides.
 *
 * If we need to allow unconditional lookups (say as required for NMI context
 * usage) we need a more complex setup; this data structure provides this by
 * employing the latch technique -- see @write_seqcount_latch_begin -- to
 * implement a latched RB-tree which does allow for unconditional lookups by
 * virtue of always having (at least) one stable copy of the tree.
 *
 * However, while we have the guarantee that there is at all times one stable
 * copy, this does not guarantee an iteration will not observe modifications.
 * What might have been a stable copy at the start of the iteration, need not
 * remain so for the duration of the iteration.
 *
 * Therefore, this does require a lockless RB-tree iteration to be non-fatal;
 * see the comment in lib/rbtree.c. Note however that we only require the first
 * condition -- not seeing partial stores -- because the latch thing isolates
 * us from loops. If we were to interrupt a modification the lookup would be
 * pointed at the stable tree and complete while the modification was halted.
 */

#ifndef RB_TREE_LATCH_H
#define RB_TREE_LATCH_H

#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/rcupdate.h>

/*
 * Hook embedded in each element: one rb_node per tree copy.  node[idx] is the
 * linkage used in latch_tree_root::tree[idx] (see __lt_insert()/__lt_erase()).
 */
struct latch_tree_node {
        struct rb_node node[2];
};

/*
 * The two copies of the tree plus the latch sequence counter.
 * latch_tree_find() searches tree[seq & 1].
 */
struct latch_tree_root {
        seqcount_latch_t        seq;
        struct rb_root                tree[2];
};

/**
 * struct latch_tree_ops - operators to define the tree order
 * @less: used for insertion; provides the (partial) order between two elements.
 * @comp: used for lookups; provides the order between the search key and an element.
 *
 * The operators are related like:
 *
 *        comp(a->key,b) < 0  := less(a,b)
 *        comp(a->key,b) > 0  := less(b,a)
 *        comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
 *
 * If these operators define a partial order on the elements we make no
 * guarantee on which of the elements matching the key is found. See
 * latch_tree_find().
 */
struct latch_tree_ops {
        bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b);
        int  (*comp)(void *key,                 struct latch_tree_node *b);
};

/*
 * Convert a pointer to the rb_node stored in slot @idx back into a pointer to
 * the latch_tree_node that embeds it.
 */
static __always_inline struct latch_tree_node *
__lt_from_rb(struct rb_node *node, int idx)
{
        return container_of(node, struct latch_tree_node, node[idx]);
}

/*
 * Insert @ltn into tree copy @idx of @ltr.
 *
 * Walks down from the root, going left whenever less(@ltn, current) holds and
 * right otherwise, until an empty child slot is reached; the node is then
 * linked there with rb_link_node_rcu() and the tree rebalanced with
 * rb_insert_color().
 */
static __always_inline void
__lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx,
            bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b))
{
        struct rb_root *tree = &ltr->tree[idx];
        struct rb_node *fresh = &ltn->node[idx];
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *above = NULL;

        while (*slot) {
                above = *slot;
                slot = less(ltn, __lt_from_rb(above, idx)) ? &above->rb_left
                                                           : &above->rb_right;
        }

        rb_link_node_rcu(fresh, above, slot);
        rb_insert_color(fresh, tree);
}

/* Remove @ltn's rb_node for slot @idx from tree copy @idx of @ltr. */
static __always_inline void
__lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx)
{
        rb_erase(&ltn->node[idx], &ltr->tree[idx]);
}

/*
 * Search tree copy @idx of @ltr for an element that @comp reports as equal to
 * @key.
 *
 * A negative @comp result descends left, a positive one descends right; every
 * pointer followed is loaded with rcu_dereference_raw().  Returns the matching
 * element, or NULL when the walk runs off the tree.
 */
static __always_inline struct latch_tree_node *
__lt_find(void *key, struct latch_tree_root *ltr, int idx,
          int (*comp)(void *key, struct latch_tree_node *node))
{
        struct rb_node *cur = rcu_dereference_raw(ltr->tree[idx].rb_node);

        while (cur) {
                struct latch_tree_node *entry = __lt_from_rb(cur, idx);
                int order = comp(key, entry);

                if (!order)
                        return entry;

                if (order < 0)
                        cur = rcu_dereference_raw(cur->rb_left);
                else
                        cur = rcu_dereference_raw(cur->rb_right);
        }

        return NULL;
}

/**
 * latch_tree_insert() - insert @node into the trees @root
 * @node: node to insert
 * @root: trees to insert @node into
 * @ops: operators defining the node order; only @ops->less is used here
 *
 * It inserts @node into @root in an ordered fashion such that we can always
 * observe one complete tree. See the comment for write_seqcount_latch_begin().
 *
 * The inserts link the element with rb_link_node_rcu() such that the
 * tree structure is stored before we can observe the new @node.
 *
 * All modifications (latch_tree_insert, latch_tree_erase) are assumed to be
 * serialized.
 */
static __always_inline void
latch_tree_insert(struct latch_tree_node *node,
                  struct latch_tree_root *root,
                  const struct latch_tree_ops *ops)
{
        write_seqcount_latch_begin(&root->seq);
        /* First copy: tree[0]. */
        __lt_insert(node, root, 0, ops->less);
        write_seqcount_latch(&root->seq);
        /* Second copy: tree[1]. */
        __lt_insert(node, root, 1, ops->less);
        write_seqcount_latch_end(&root->seq);
}

/**
 * latch_tree_erase() - removes @node from the trees @root
 * @node: node to remove
 * @root: trees to remove @node from
 * @ops: operators defining the node order (not referenced by this function)
 *
 * Removes @node from the trees @root in an ordered fashion such that we can
 * always observe one complete tree. See the comment for
 * write_seqcount_latch_begin().
 *
 * It is assumed that @node will observe one RCU quiescent state before being
 * reused or freed.
 *
 * All modifications (latch_tree_insert, latch_tree_erase) are assumed to be
 * serialized.
 */
static __always_inline void
latch_tree_erase(struct latch_tree_node *node,
                 struct latch_tree_root *root,
                 const struct latch_tree_ops *ops)
{
        write_seqcount_latch_begin(&root->seq);
        /* First copy: tree[0]. */
        __lt_erase(node, root, 0);
        write_seqcount_latch(&root->seq);
        /* Second copy: tree[1]. */
        __lt_erase(node, root, 1);
        write_seqcount_latch_end(&root->seq);
}

/**
 * latch_tree_find() - lockless lookup of @key in the trees @root
 * @key: search key
 * @root: trees to search for @key
 * @ops: operators defining the node order; only @ops->comp is used here
 *
 * Reads the latch sequence, searches the tree copy selected by its low bit
 * and repeats the whole lookup for as long as read_seqcount_latch_retry()
 * asks for a retry.
 *
 * It is assumed that this is called while holding the appropriate RCU read
 * side lock.
 *
 * If the operators define a partial order on the elements (several elements
 * share the same key value) it is undefined which of them is found, and the
 * tree cannot be iterated to find the others.
 *
 * Returns: a pointer to the node matching @key or NULL.
 */
static __always_inline struct latch_tree_node *
latch_tree_find(void *key, struct latch_tree_root *root,
                const struct latch_tree_ops *ops)
{
        for (;;) {
                unsigned int seq = read_seqcount_latch(&root->seq);
                struct latch_tree_node *hit;

                hit = __lt_find(key, root, seq & 1, ops->comp);
                if (!read_seqcount_latch_retry(&root->seq, seq))
                        return hit;
        }
}

#endif /* RB_TREE_LATCH_H */






































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kref.h - library routines for handling generic reference counted objects
 *
 * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2004 IBM Corp.
 *
 * based on kobject.h which was:
 * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (C) 2002-2003 Open Source Development Labs
 */

#ifndef _KREF_H_
#define _KREF_H_

#include <linux/spinlock.h>
#include <linux/refcount.h>

/* Generic reference counter, a thin wrapper around a refcount_t. */
struct kref {
        refcount_t refcount;
};

/* Static initializer for a struct kref whose count starts at @n. */
#define KREF_INIT(n)        { .refcount = REFCOUNT_INIT(n), }

/**
 * kref_init - initialize object.
 * @kref: object in question.
 *
 * Sets the reference count to 1.
 */
static inline void kref_init(struct kref *kref)
{
        refcount_set(&kref->refcount, 1);
}

/**
 * kref_read - read the current reference count of an object.
 * @kref: object.
 *
 * Return: the value reported by refcount_read() for the embedded counter.
 */
static inline unsigned int kref_read(const struct kref *kref)
{
        return refcount_read(&kref->refcount);
}

/**
 * kref_get - increment refcount for object.
 * @kref: object.
 *
 * Unconditional increment via refcount_inc(); see kref_get_unless_zero() for
 * the variant that refuses to take a reference on a zero count.
 */
static inline void kref_get(struct kref *kref)
{
        refcount_inc(&kref->refcount);
}

/**
 * kref_put - drop one reference to an object
 * @kref: object whose reference is being dropped
 * @release: callback invoked with @kref once the last reference is gone; it
 *             is responsible for cleaning the object up
 *
 * Decrements the refcount and calls @release when it reaches zero.  The
 * caller may not pass NULL or kfree() as the release function.
 *
 * Return: 1 if this call removed the object, otherwise 0.  A return of 0
 * is not a lasting guarantee: another caller may already have removed the
 * object by the time this function returns.  Only a return of 1 is
 * conclusive.
 */
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
        if (!refcount_dec_and_test(&kref->refcount))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_put_mutex - drop one reference, taking a mutex for the final put
 * @kref: object whose reference is being dropped
 * @release: callback invoked with @kref once the last reference is gone; it
 *             is responsible for cleaning the object up
 * @mutex: mutex which protects the release function
 *
 * This variant of kref_put() calls the @release function with the @mutex
 * held.  The @release function will release the mutex.
 *
 * Return: 1 if @release was called, otherwise 0.
 */
static inline int kref_put_mutex(struct kref *kref,
                                 void (*release)(struct kref *kref),
                                 struct mutex *mutex)
{
        if (!refcount_dec_and_mutex_lock(&kref->refcount, mutex))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_put_lock - drop one reference, taking a spinlock for the final put
 * @kref: object whose reference is being dropped
 * @release: callback invoked with @kref once the last reference is gone; it
 *             is responsible for cleaning the object up
 * @lock: spinlock which protects the release function
 *
 * This variant of kref_put() calls the @release function with the @lock
 * held.  The @release function will release the lock.
 *
 * Return: 1 if @release was called, otherwise 0.
 */
static inline int kref_put_lock(struct kref *kref,
                                void (*release)(struct kref *kref),
                                spinlock_t *lock)
{
        if (!refcount_dec_and_lock(&kref->refcount, lock))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_get_unless_zero - Increment refcount for object unless it is zero.
 * @kref: object.
 *
 * This function is intended to simplify locking around refcounting for
 * objects that can be looked up from a lookup structure, and which are
 * removed from that lookup structure in the object destructor.
 * Operations on such objects require at least a read lock around
 * lookup + kref_get, and a write lock around kref_put + remove from lookup
 * structure. Furthermore, RCU implementations become extremely tricky.
 * With a lookup followed by a kref_get_unless_zero *with return value check*
 * locking in the kref_put path can be deferred to the actual removal from
 * the lookup structure and RCU lookups become trivial.
 *
 * The result is marked __must_check: a caller that ignores it cannot know
 * whether it actually holds a reference.
 *
 * Return: non-zero if the increment succeeded. Otherwise return 0.
 */
static inline int __must_check kref_get_unless_zero(struct kref *kref)
{
        return refcount_inc_not_zero(&kref->refcount);
}
#endif /* _KREF_H_ */














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_DST_CACHE_H
#define _NET_DST_CACHE_H

#include <linux/jiffies.h>
#include <net/dst.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_fib.h>
#endif

/*
 * A dst cache: a per-CPU array of cache slots plus a reset timestamp.
 * dst_cache_reset() stores the current jiffies value in @reset_ts.
 */
struct dst_cache {
        struct dst_cache_pcpu __percpu *cache;
        unsigned long reset_ts;
};

/**
 *        dst_cache_get - perform cache lookup
 *        @dst_cache: the cache
 *
 *        The caller should use dst_cache_get_ip4() if it needs to retrieve the
 *        source address to be used when xmitting to the cached dst.
 *        Local BH must be disabled.
 */
struct dst_entry *dst_cache_get(struct dst_cache *dst_cache);

/**
 *        dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address
 *        @dst_cache: the cache
 *        @saddr: return value for the retrieved source address
 *
 *        local BH must be disabled.
 */
struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr);

/**
 *        dst_cache_set_ip4 - store the ipv4 dst into the cache
 *        @dst_cache: the cache
 *        @dst: the entry to be cached
 *        @saddr: the source address to be stored inside the cache
 *
 *        local BH must be disabled.
 */
void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
                       __be32 saddr);

#if IS_ENABLED(CONFIG_IPV6)

/**
 *        dst_cache_set_ip6 - store the ipv6 dst into the cache
 *        @dst_cache: the cache
 *        @dst: the entry to be cached
 *        @saddr: the source address to be stored inside the cache
 *
 *        local BH must be disabled.
 */
void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
                       const struct in6_addr *saddr);

/**
 *        dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address
 *        @dst_cache: the cache
 *        @saddr: return value for the retrieved source address
 *
 *        local BH must be disabled.
 */
struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
                                    struct in6_addr *saddr);
#endif

/**
 *        dst_cache_reset - invalidate the cache contents
 *        @dst_cache: the cache
 *
 *        This does not free the cached dst to avoid races and contentions.
 *        The dst will be freed on a later cache lookup.
 *
 *        Only the reset timestamp is updated here: it is set to the current
 *        jiffies value with WRITE_ONCE().
 */
static inline void dst_cache_reset(struct dst_cache *dst_cache)
{
        WRITE_ONCE(dst_cache->reset_ts, jiffies);
}

/**
 *        dst_cache_reset_now - invalidate the cache contents immediately
 *        @dst_cache: the cache
 *
 *        The caller must be sure there are no concurrent users, as this frees
 *        all dst_cache users immediately, rather than waiting for the next
 *        per-cpu usage like dst_cache_reset does. Most callers should use the
 *        higher speed lazily-freed dst_cache_reset function instead.
 */
void dst_cache_reset_now(struct dst_cache *dst_cache);

/**
 *        dst_cache_init - initialize the cache, allocating the required storage
 *        @dst_cache: the cache
 *        @gfp: allocation flags
 */
int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp);

/**
 *        dst_cache_destroy - empty the cache and free the allocated storage
 *        @dst_cache: the cache
 *
 *        No synchronization is enforced: it must be called only when the cache
 *        is unused.
 */
void dst_cache_destroy(struct dst_cache *dst_cache);

#endif









































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

extern struct list_head notify_list;

/*
 * A mount namespace.  Embeds the generic struct ns_common as ->ns (see
 * to_mnt_ns()) and keeps its mounts in the ->mounts rbtree, with the leftmost
 * and rightmost nodes cached alongside it.
 */
struct mnt_namespace {
        struct ns_common        ns;
        struct mount *        root;
        struct {
                struct rb_root        mounts;                 /* Protected by namespace_sem */
                struct rb_node        *mnt_last_node;         /* last (rightmost) mount in the rbtree */
                struct rb_node        *mnt_first_node; /* first (leftmost) mount in the rbtree */
        };
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        wait_queue_head_t        poll;
        u64                        seq_origin; /* Sequence number of origin mount namespace */
        u64 event;
#ifdef CONFIG_FSNOTIFY
        __u32                        n_fsnotify_mask;
        struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
#endif
        unsigned int                nr_mounts; /* # of mounts in the namespace */
        unsigned int                pending_mounts;
        refcount_t                passive; /* number references not pinning @mounts */
} __randomize_layout;

/*
 * Per-CPU mount counters.  With CONFIG_SMP struct mount holds a __percpu
 * pointer to this; without it, struct mount carries plain int fields of the
 * same names instead.
 */
struct mnt_pcp {
        int mnt_count;
        int mnt_writers;
};

/*
 * A dentry that serves as a mountpoint; struct mount refers to one via ->mnt_mp.
 * NOTE(review): m_list presumably heads the list linked through
 * mount::mnt_mp_list ("list mounts with the same mountpoint") - confirm.
 */
struct mountpoint {
        struct hlist_node m_hash;
        struct dentry *m_dentry;
        struct hlist_head m_list;
};

/*
 * In-kernel representation of a mount.  The struct vfsmount is embedded as
 * ->mnt; real_mount() converts a vfsmount pointer back to its struct mount.
 *
 * WRITE_HOLD is defined inside the struct body, next to the field whose low
 * bit it occupies (->mnt_pprev_for_sb); see set_write_hold() and friends.
 */
struct mount {
        struct hlist_node mnt_hash;
        struct mount *mnt_parent;
        struct dentry *mnt_mountpoint;
        struct vfsmount mnt;
        union {
                struct rb_node mnt_node; /* node in the ns->mounts rbtree */
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
        };
#ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
#else
        int mnt_count;
        int mnt_writers;
#endif
        struct list_head mnt_mounts;        /* list of children, anchored here */
        struct list_head mnt_child;        /* and going through their mnt_child */
        struct mount *mnt_next_for_sb;        /* the next two fields are hlist_node, */
        struct mount * __aligned(1) *mnt_pprev_for_sb;
                                        /* except that LSB of pprev is stolen */
#define WRITE_HOLD 1                        /* ... for use by mnt_hold_writers() */
        const char *mnt_devname;        /* Name of device e.g. /dev/dsk/hda1 */
        struct list_head mnt_list;
        struct list_head mnt_expire;        /* link in fs-specific expiry list */
        struct list_head mnt_share;        /* circular list of shared mounts */
        struct hlist_head mnt_slave_list;/* list of slave mounts */
        struct hlist_node mnt_slave;        /* slave list entry */
        struct mount *mnt_master;        /* slave is on master->mnt_slave_list */
        struct mnt_namespace *mnt_ns;        /* containing namespace */
        struct mountpoint *mnt_mp;        /* where is it mounted */
        union {
                struct hlist_node mnt_mp_list;        /* list mounts with the same mountpoint */
                struct hlist_node mnt_umount;
        };
#ifdef CONFIG_FSNOTIFY
        struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
        __u32 mnt_fsnotify_mask;
        struct list_head to_notify;        /* need to queue notification */
        struct mnt_namespace *prev_ns;        /* previous namespace (NULL if none) */
#endif
        int mnt_t_flags;                /* namespace_sem-protected flags */
        int mnt_id;                        /* mount identifier, reused */
        u64 mnt_id_unique;                /* mount ID unique until reboot */
        int mnt_group_id;                /* peer group identifier */
        int mnt_expiry_mark;                /* true if marked for expiry */
        struct hlist_head mnt_pins;
        struct hlist_head mnt_stuck_children;
        struct mount *overmount;        /* mounted on ->mnt_root */
} __randomize_layout;

/*
 * Single-bit T_* mount flags.
 * NOTE(review): presumably the bits kept in mount::mnt_t_flags - confirm.
 */
enum {
        T_SHARED                = 1, /* mount is shared */
        T_UNBINDABLE                = 2, /* mount is unbindable */
        T_MARKED                = 4, /* internal mark for propagate_... */
        T_UMOUNT_CANDIDATE        = 8, /* for propagate_umount */

        /*
         * T_SHARED_MASK is the set of flags that should be cleared when a
         * mount becomes shared.  Currently, this is only the flag that says a
         * mount cannot be bind mounted, since this is how we create a mount
         * that shares events with another mount.  If you add a new T_*
         * flag, consider how it interacts with shared mounts.
         */
        T_SHARED_MASK        = T_UNBINDABLE,
};

/*
 * Error-pointer marker for a namespace pointer.  Being an ERR_PTR() it is
 * rejected by the IS_ERR_OR_NULL() checks in is_mounted() and anon_ns_root().
 * NOTE(review): presumably stored in mount::mnt_ns for internal mounts - confirm.
 */
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */

/* Convert a pointer to the embedded struct vfsmount back to its struct mount. */
static inline struct mount *real_mount(struct vfsmount *mnt)
{
        return container_of(mnt, struct mount, mnt);
}

/* Non-zero unless @mnt is its own ->mnt_parent. */
static inline int mnt_has_parent(const struct mount *mnt)
{
        const struct mount *parent = mnt->mnt_parent;

        return parent != mnt;
}

/*
 * Returns 1 when the mount behind @mnt has a usable namespace pointer, i.e.
 * ->mnt_ns is neither NULL (detached) nor an error pointer (internal);
 * returns 0 otherwise.
 */
static inline int is_mounted(struct vfsmount *mnt)
{
        if (IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns))
                return 0;

        return 1;
}

extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);

extern int __legitimize_mnt(struct vfsmount *, unsigned);

/*
 * True when __lookup_mnt() finds a mount for @path's (mnt, dentry) pair and
 * that mount does not have MNT_SYNC_UMOUNT set.
 */
static inline bool __path_is_mountpoint(const struct path *path)
{
        struct mount *found = __lookup_mnt(path->mnt, path->dentry);

        if (!found)
                return false;

        return likely(!(found->mnt.mnt_flags & MNT_SYNC_UMOUNT));
}

extern void __detach_mounts(struct dentry *dentry);

/* Call __detach_mounts() on @dentry, but only if d_mountpoint() says it is a mountpoint. */
static inline void detach_mounts(struct dentry *dentry)
{
        if (d_mountpoint(dentry))
                __detach_mounts(dentry);
}

/* Take a reference on @ns via ns_ref_inc(). */
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
        ns_ref_inc(ns);
}

extern seqlock_t mount_lock;

/*
 * Lock guards over mount_lock:
 *   mount_writer        - write_seqlock() / write_sequnlock()
 *   mount_locked_reader - read_seqlock_excl() / read_sequnlock_excl()
 * NOTE(review): presumably used through the guard()/scoped_guard() helpers
 * from <linux/cleanup.h> - confirm.
 */
DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock),
                    write_sequnlock(&mount_lock))
DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock),
                    read_sequnlock_excl(&mount_lock))

/*
 * Bundles a mount namespace, a root path and a per-mount show callback.
 * NOTE(review): presumably the private state behind the mounts_op seq_file
 * iterator - confirm against the users of this struct.
 */
struct proc_mounts {
        struct mnt_namespace *ns;
        struct path root;
        int (*show)(struct seq_file *, struct vfsmount *);
};

extern const struct seq_operations mounts_op;

extern bool __is_local_mountpoint(const struct dentry *dentry);
/*
 * Cheap d_mountpoint() check first; __is_local_mountpoint() is only called
 * for dentries that pass it.
 */
static inline bool is_local_mountpoint(const struct dentry *dentry)
{
        return d_mountpoint(dentry) && __is_local_mountpoint(dentry);
}

/* A namespace counts as anonymous when its ns_common id (ns.ns_id) is 0. */
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
        return ns->ns.ns_id == 0;
}

/*
 * True when @m belongs to a valid (non-NULL, non-error) anonymous namespace
 * and is that namespace's ->root.  ->mnt_ns is sampled once with READ_ONCE().
 */
static inline bool anon_ns_root(const struct mount *m)
{
        struct mnt_namespace *owner = READ_ONCE(m->mnt_ns);

        if (IS_ERR_OR_NULL(owner))
                return false;
        if (!is_anon_ns(owner))
                return false;

        return m == owner->root;
}

/*
 * True when @mnt's rbtree node is not marked empty.  move_from_ns() clears
 * the node with RB_CLEAR_NODE() after erasing it.
 */
static inline bool mnt_ns_attached(const struct mount *mnt)
{
        return !RB_EMPTY_NODE(&mnt->mnt_node);
}

/* True when the namespace's ->mounts rbtree has no nodes. */
static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
{
        return RB_EMPTY_ROOT(&ns->mounts);
}

/*
 * Remove @mnt from the ->mounts rbtree of its namespace (mnt->mnt_ns).
 * Warns if @mnt is not currently attached.
 */
static inline void move_from_ns(struct mount *mnt)
{
        struct mnt_namespace *ns = mnt->mnt_ns;
        WARN_ON(!mnt_ns_attached(mnt));
        /*
         * Fix up the cached rightmost/leftmost pointers while the node is
         * still linked, so rb_prev()/rb_next() can find its neighbours.
         */
        if (ns->mnt_last_node == &mnt->mnt_node)
                ns->mnt_last_node = rb_prev(&mnt->mnt_node);
        if (ns->mnt_first_node == &mnt->mnt_node)
                ns->mnt_first_node = rb_next(&mnt->mnt_node);
        rb_erase(&mnt->mnt_node, &ns->mounts);
        /* Mark the node empty so mnt_ns_attached() reports false from now on. */
        RB_CLEAR_NODE(&mnt->mnt_node);
}

bool has_locked_children(struct mount *mnt, struct dentry *dentry);
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
                                            bool previous);

/* Convert a pointer to the embedded struct ns_common back to its mnt_namespace. */
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
        return container_of(ns, struct mnt_namespace, ns);
}

#ifdef CONFIG_FSNOTIFY
/*
 * Queue @m on notify_list if its current or previous namespace has fsnotify
 * marks.  When neither does, skip the queueing and just record the current
 * namespace as ->prev_ns.
 */
static inline void mnt_notify_add(struct mount *m)
{
        /* Fast path: nobody is watching either namespace. */
        if (!(m->mnt_ns && m->mnt_ns->n_fsnotify_marks) &&
            !(m->prev_ns && m->prev_ns->n_fsnotify_marks)) {
                m->prev_ns = m->mnt_ns;
                return;
        }

        list_add_tail(&m->to_notify, &notify_list);
}
#else
/* Without CONFIG_FSNOTIFY there is nothing to queue. */
static inline void mnt_notify_add(struct mount *m)
{
}
#endif

/* Follow the ->overmount chain from @m and return the last mount on it. */
static inline struct mount *topmost_overmount(struct mount *m)
{
        struct mount *top;

        for (top = m; top->overmount; top = top->overmount)
                ;

        return top;
}

/* True when the WRITE_HOLD bit is set in the pointer value @val. */
static inline bool __test_write_hold(struct mount * __aligned(1) *val)
{
        return ((unsigned long)val & WRITE_HOLD) != 0;
}

/* True when @m has WRITE_HOLD set in the low bit of ->mnt_pprev_for_sb. */
static inline bool test_write_hold(const struct mount *m)
{
        return __test_write_hold(m->mnt_pprev_for_sb);
}

/* Set the WRITE_HOLD bit in @m->mnt_pprev_for_sb, leaving the other bits alone. */
static inline void set_write_hold(struct mount *m)
{
        unsigned long bits = (unsigned long)m->mnt_pprev_for_sb;

        bits |= WRITE_HOLD;
        m->mnt_pprev_for_sb = (void *)bits;
}

/* Clear the WRITE_HOLD bit in @m->mnt_pprev_for_sb, leaving the other bits alone. */
static inline void clear_write_hold(struct mount *m)
{
        unsigned long bits = (unsigned long)m->mnt_pprev_for_sb;

        bits &= ~WRITE_HOLD;
        m->mnt_pprev_for_sb = (void *)bits;
}

struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);


























































































































































































































   17 














































    8 














   10 




   15 














   15 



















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AEAD: Authenticated Encryption with Associated Data
 * 
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_AEAD_H
#define _CRYPTO_AEAD_H

#include <linux/atomic.h>
#include <linux/container_of.h>
#include <linux/crypto.h>
#include <linux/slab.h>
#include <linux/types.h>

/**
 * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API
 *
 * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD
 * (listed as type "aead" in /proc/crypto)
 *
 * The most prominent examples for this type of encryption are GCM and CCM.
 * However, the kernel supports other types of AEAD ciphers which are defined
 * with the following cipher string:
 *
 *        authenc(keyed message digest, block cipher)
 *
 * For example: authenc(hmac(sha256), cbc(aes))
 *
 * The example code provided for the symmetric key cipher operation applies
 * here as well. Naturally, all *skcipher* symbols must be replaced by their
 * *aead* counterparts discussed below. In addition, for the AEAD operation,
 * the aead_request_set_ad function must be used to set the pointer to the
 * associated data memory location before performing the encryption or
 * decryption operation. Another deviation from the asynchronous block cipher
 * operation is that the caller should explicitly check for -EBADMSG of the
 * crypto_aead_decrypt. That error indicates an authentication error, i.e.
 * a breach in the integrity of the message. In essence, that -EBADMSG error
 * code is the key bonus an AEAD cipher has over "standard" block chaining
 * modes.
 *
 * Memory Structure:
 *
 * The source scatterlist must contain the concatenation of
 * associated data || plaintext or ciphertext.
 *
 * The destination scatterlist has the same layout, except that the plaintext
 * (resp. ciphertext) will grow (resp. shrink) by the authentication tag size
 * during encryption (resp. decryption). The authentication tag is generated
 * during the encryption operation and appended to the ciphertext. During
 * decryption, the authentication tag is consumed along with the ciphertext and
 * used to verify the integrity of the plaintext and the associated data.
 *
 * In-place encryption/decryption is enabled by using the same scatterlist
 * pointer for both the source and destination.
 *
 * Even in the out-of-place case, space must be reserved in the destination for
 * the associated data, even though it won't be written to.  This makes the
 * in-place and out-of-place cases more consistent.  It is permissible for the
 * "destination" associated data to alias the "source" associated data.
 *
 * As with the other scatterlist crypto APIs, zero-length scatterlist elements
 * are not allowed in the used part of the scatterlist.  Thus, if there is no
 * associated data, the first element must point to the plaintext/ciphertext.
 *
 * To meet the needs of IPsec, a special quirk applies to rfc4106, rfc4309,
 * rfc4543, and rfc7539esp ciphers.  For these ciphers, the final 'ivsize' bytes
 * of the associated data buffer must contain a second copy of the IV.  This is
 * in addition to the copy passed to aead_request_set_crypt().  These two IV
 * copies must not differ; different implementations of the same algorithm may
 * behave differently in that case.  Note that the algorithm might not actually
 * treat the IV as associated data; nevertheless the length passed to
 * aead_request_set_ad() must include it.
 */

struct crypto_aead;
struct scatterlist;

/**
 *        struct aead_request - AEAD request
 *        @base: Common attributes for async crypto requests
 *        @assoclen: Length in bytes of associated data for authentication
 *        @cryptlen: Length in bytes of data to be encrypted or decrypted
 *        @iv: Initialisation vector
 *        @src: Source data
 *        @dst: Destination data
 *        @__ctx: Start of private context data
 */
struct aead_request {
        struct crypto_async_request base;

        unsigned int assoclen;
        unsigned int cryptlen;

        u8 *iv;

        struct scatterlist *src;
        struct scatterlist *dst;

        /*
         * Flexible array member: aead_request_alloc() reserves
         * crypto_aead_reqsize() extra bytes behind the fixed fields.
         */
        void *__ctx[] CRYPTO_MINALIGN_ATTR;
};

/**
 * struct aead_alg - AEAD cipher definition
 * @maxauthsize: Set the maximum authentication tag size supported by the
 *                 transformation. A transformation may support smaller tag sizes.
 *                 As the authentication tag is a message digest to ensure the
 *                 integrity of the encrypted data, a consumer typically wants the
 *                 largest authentication tag possible as defined by this
 *                 variable.
 * @setauthsize: Set authentication size for the AEAD transformation. This
 *                 function is used to specify the consumer requested size of the
 *                 authentication tag to be either generated by the transformation
 *                 during encryption or the size of the authentication tag to be
 *                 supplied during the decryption operation. This function is also
 *                 responsible for checking the authentication tag size for
 *                 validity.
 * @setkey: see struct skcipher_alg
 * @encrypt: see struct skcipher_alg
 * @decrypt: see struct skcipher_alg
 * @ivsize: see struct skcipher_alg
 * @chunksize: see struct skcipher_alg
 * @init: Initialize the cryptographic transformation object. This function
 *          is used to initialize the cryptographic transformation object.
 *          This function is called only once at the instantiation time, right
 *          after the transformation context was allocated. In case the
 *          cryptographic hardware has some special requirements which need to
 *          be handled by software, this function shall check for the precise
 *          requirement of the transformation and put any software fallbacks
 *          in place.
 * @exit: Deinitialize the cryptographic transformation object. This is a
 *          counterpart to @init, used to remove various changes set in
 *          @init.
 * @base: Definition of a generic crypto cipher algorithm.
 *
 * All fields except @ivsize are mandatory and must be filled.
 */
struct aead_alg {
        int (*setkey)(struct crypto_aead *tfm, const u8 *key,
                      unsigned int keylen);
        int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize);
        int (*encrypt)(struct aead_request *req);
        int (*decrypt)(struct aead_request *req);
        int (*init)(struct crypto_aead *tfm);
        void (*exit)(struct crypto_aead *tfm);

        unsigned int ivsize;
        unsigned int maxauthsize;
        unsigned int chunksize;

        struct crypto_alg base;
};

/**
 * struct crypto_aead - AEAD cipher handle
 * @authsize: Authentication tag size, as returned by crypto_aead_authsize()
 * @reqsize: Number of bytes aead_request_alloc() allocates in addition to
 *           struct aead_request, as returned by crypto_aead_reqsize()
 * @base: Embedded generic transformation object; crypto_aead_tfm() and
 *        __crypto_aead_cast() convert between the two views
 */
struct crypto_aead {
        unsigned int authsize;
        unsigned int reqsize;

        struct crypto_tfm base;
};

/* Map an embedded generic tfm back to the AEAD handle that contains it. */
static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm)
{
        struct crypto_aead *aead = container_of(tfm, struct crypto_aead, base);

        return aead;
}

/**
 * crypto_alloc_aead() - allocate AEAD cipher handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *             AEAD cipher
 * @type: specifies the type of the cipher
 * @mask: specifies the mask for the cipher
 *
 * Allocate a cipher handle for an AEAD. The returned struct
 * crypto_aead is the cipher handle that is required for any subsequent
 * API invocation for that AEAD.
 *
 * Return: allocated cipher handle in case of success; IS_ERR() is true in case
 *           of an error, PTR_ERR() returns the error code.
 */
struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask);

/* Return the generic transformation object embedded in the AEAD handle. */
static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/**
 * crypto_free_aead() - zeroize and free aead handle
 * @tfm: cipher handle to be freed
 *
 * If @tfm is a NULL or error pointer, this function does nothing.
 */
static inline void crypto_free_aead(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        /* Hand over both the outer handle and its embedded generic tfm. */
        crypto_destroy_tfm(tfm, base);
}

/**
 * crypto_has_aead() - Search for the availability of an aead.
 * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
 *              aead
 * @type: specifies the type of the aead
 * @mask: specifies the mask for the aead
 *
 * Return: true when the aead is known to the kernel crypto API; false
 *           otherwise
 */
int crypto_has_aead(const char *alg_name, u32 type, u32 mask);

/* Return the driver name of the algorithm behind the AEAD handle. */
static inline const char *crypto_aead_driver_name(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        return crypto_tfm_alg_driver_name(base);
}

/* Return the AEAD algorithm definition that the handle was created from. */
static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm)
{
        struct crypto_alg *generic = crypto_aead_tfm(tfm)->__crt_alg;

        /* The generic algorithm is embedded as aead_alg::base. */
        return container_of(generic, struct aead_alg, base);
}

/* Return the IV size declared by the AEAD algorithm definition. */
static inline unsigned int crypto_aead_alg_ivsize(struct aead_alg *alg)
{
        unsigned int ivsize = alg->ivsize;

        return ivsize;
}

/**
 * crypto_aead_ivsize() - obtain IV size
 * @tfm: cipher handle
 *
 * The size of the IV for the aead referenced by the cipher handle is
 * returned. This IV size may be zero if the cipher does not need an IV.
 *
 * Return: IV size in bytes
 */
static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm)
{
        struct aead_alg *alg = crypto_aead_alg(tfm);

        /* The IV size is a property of the algorithm, not of the handle. */
        return crypto_aead_alg_ivsize(alg);
}

/**
 * crypto_aead_authsize() - obtain the configured authentication data size
 * @tfm: cipher handle
 *
 * The authentication data size currently stored in the AEAD cipher handle is
 * returned. This is not the maximum supported by the algorithm; use
 * crypto_aead_maxauthsize() to query that limit. The authentication data size
 * may be zero if the cipher implements a hard-coded maximum.
 *
 * The authentication data may also be known as "tag value".
 *
 * Return: authentication data size / tag size in bytes
 */
static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm)
{
        unsigned int authsize = tfm->authsize;

        return authsize;
}

/* Return the maximum tag size declared by the AEAD algorithm definition. */
static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg)
{
        unsigned int maxauthsize = alg->maxauthsize;

        return maxauthsize;
}

/* Return the maximum tag size of the algorithm behind the AEAD handle. */
static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead)
{
        struct aead_alg *alg = crypto_aead_alg(aead);

        return crypto_aead_alg_maxauthsize(alg);
}

/**
 * crypto_aead_blocksize() - obtain block size of cipher
 * @tfm: cipher handle
 *
 * The block size for the AEAD referenced with the cipher handle is returned.
 * The caller may use that information to allocate appropriate memory for the
 * data returned by the encryption or decryption operation
 *
 * Return: block size of cipher
 */
static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        /* The block size is looked up through the generic tfm helper. */
        return crypto_tfm_alg_blocksize(base);
}

/* Return the alignment mask of the algorithm behind the AEAD handle. */
static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        return crypto_tfm_alg_alignmask(base);
}

/* Read the flags of the generic tfm embedded in the AEAD handle. */
static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        return crypto_tfm_get_flags(base);
}

/* Set @flags on the generic tfm embedded in the AEAD handle. */
static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        crypto_tfm_set_flags(base, flags);
}

/* Clear @flags on the generic tfm embedded in the AEAD handle. */
static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        crypto_tfm_clear_flags(base, flags);
}

/**
 * crypto_aead_setkey() - set key for cipher
 * @tfm: cipher handle
 * @key: buffer holding the key
 * @keylen: length of the key in bytes
 *
 * The caller provided key is set for the AEAD referenced by the cipher
 * handle.
 *
 * Note, the key length determines the cipher type. Many block ciphers implement
 * different cipher modes depending on the key size, such as AES-128 vs AES-192
 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
 * is performed.
 *
 * Return: 0 if the setting of the key was successful; < 0 if an error occurred
 */
int crypto_aead_setkey(struct crypto_aead *tfm,
                       const u8 *key, unsigned int keylen);

/**
 * crypto_aead_setauthsize() - set authentication data size
 * @tfm: cipher handle
 * @authsize: size of the authentication data / tag in bytes
 *
 * Set the authentication data size / tag size. AEAD requires an authentication
 * tag (or MAC) in addition to the associated data.
 *
 * Return: 0 if the setting of the authentication data size was successful;
 *           < 0 if an error occurred
 */
int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize);

/* Return the AEAD handle that @req was registered with. */
static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req)
{
        struct crypto_tfm *base = req->base.tfm;

        return __crypto_aead_cast(base);
}

/**
 * crypto_aead_encrypt() - encrypt plaintext
 * @req: reference to the aead_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * Encrypt plaintext data using the aead_request handle. That data structure
 * and how it is filled with data is discussed with the aead_request_*
 * functions.
 *
 * IMPORTANT NOTE The encryption operation creates the authentication data /
 *                  tag. That data is concatenated with the created ciphertext.
 *                  The ciphertext memory size is therefore the given number of
 *                  block cipher blocks + the size defined by the
 *                  crypto_aead_setauthsize invocation. The caller must ensure
 *                  that sufficient memory is available for the ciphertext and
 *                  the authentication tag.
 *
 * Return: 0 if the cipher operation was successful; < 0 if an error occurred
 */
int crypto_aead_encrypt(struct aead_request *req);

/**
 * crypto_aead_decrypt() - decrypt ciphertext
 * @req: reference to the aead_request handle that holds all information
 *         needed to perform the cipher operation
 *
 * Decrypt ciphertext data using the aead_request handle. That data structure
 * and how it is filled with data is discussed with the aead_request_*
 * functions.
 *
 * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the
 *                  authentication data / tag. That authentication data / tag
 *                  must have the size defined by the crypto_aead_setauthsize
 *                  invocation.
 *
 *
 * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD
 *           cipher operation performs the authentication of the data during the
 *           decryption operation. Therefore, the function returns this error if
 *           the authentication of the ciphertext was unsuccessful (i.e. the
 *           integrity of the ciphertext or the associated data was violated);
 *           < 0 if an error occurred.
 */
int crypto_aead_decrypt(struct aead_request *req);

/**
 * DOC: Asynchronous AEAD Request Handle
 *
 * The aead_request data structure contains all pointers to data required for
 * the AEAD cipher operation. This includes the cipher handle (which can be
 * used by multiple aead_request instances), pointer to plaintext and
 * ciphertext, asynchronous callback function, etc. It acts as a handle to the
 * aead_request_* API calls in a similar way as AEAD handle to the
 * crypto_aead_* API calls.
 */

/**
 * crypto_aead_reqsize() - obtain size of the request data structure
 * @tfm: cipher handle
 *
 * Return: number of bytes
 */
static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm)
{
        unsigned int ctx_bytes = tfm->reqsize;

        return ctx_bytes;
}

/**
 * aead_request_set_tfm() - update cipher handle reference in request
 * @req: request handle to be modified
 * @tfm: cipher handle that shall be added to the request handle
 *
 * Allow the caller to replace the existing aead handle in the request
 * data structure with a different one.
 */
static inline void aead_request_set_tfm(struct aead_request *req,
                                        struct crypto_aead *tfm)
{
        struct crypto_tfm *base = crypto_aead_tfm(tfm);

        /* The request stores the generic tfm, not the AEAD wrapper. */
        req->base.tfm = base;
}

/**
 * aead_request_alloc() - allocate request data structure
 * @tfm: cipher handle to be registered with the request
 * @gfp: memory allocation flag that is handed to kmalloc by the API call.
 *
 * Allocate the request data structure that must be used with the AEAD
 * encrypt and decrypt API calls. During the allocation, the provided aead
 * handle is registered in the request data structure.
 *
 * Return: allocated request handle in case of success, or NULL if out of memory
 */
static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm,
                                                      gfp_t gfp)
{
        struct aead_request *req;

        /* Fixed request fields plus crypto_aead_reqsize() additional bytes. */
        req = kmalloc(sizeof(*req) + crypto_aead_reqsize(tfm), gfp);
        if (unlikely(!req))
                return NULL;

        aead_request_set_tfm(req, tfm);
        return req;
}

/**
 * aead_request_free() - zeroize and free request data structure
 * @req: request data structure cipher handle to be freed
 */
static inline void aead_request_free(struct aead_request *req)
{
        /* kfree_sensitive() rather than kfree(): the request is zeroized. */
        kfree_sensitive(req);
}

/**
 * aead_request_set_callback() - set asynchronous callback function
 * @req: request handle
 * @flags: specify zero or an ORing of the flags
 *           CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
 *           increase the wait queue beyond the initial maximum size;
 *           CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
 * @compl: callback function pointer to be registered with the request handle
 * @data: The data pointer refers to memory that is not used by the kernel
 *          crypto API, but provided to the callback function for it to use. Here,
 *          the caller can provide a reference to memory the callback function can
 *          operate on. As the callback function is invoked asynchronously to the
 *          related functionality, it may need to access data structures of the
 *          related functionality which can be referenced using this pointer. The
 *          callback function can access the memory via the "data" field in the
 *          crypto_async_request data structure provided to the callback function.
 *
 * Setting the callback function that is triggered once the cipher operation
 * completes
 *
 * The callback function is registered with the aead_request handle and
 * must comply with the following template::
 *
 *        void callback_function(struct crypto_async_request *req, int error)
 */
static inline void aead_request_set_callback(struct aead_request *req,
                                             u32 flags,
                                             crypto_completion_t compl,
                                             void *data)
{
        /* All three values are stored in the common async request part. */
        struct crypto_async_request *async = &req->base;

        async->flags = flags;
        async->complete = compl;
        async->data = data;
}

/**
 * aead_request_set_crypt - set data buffers
 * @req: request handle
 * @src: source scatter / gather list
 * @dst: destination scatter / gather list
 * @cryptlen: number of bytes to process from @src
 * @iv: IV for the cipher operation which must comply with the IV size defined
 *      by crypto_aead_ivsize()
 *
 * Setting the source data and destination data scatter / gather lists which
 * hold the associated data concatenated with the plaintext or ciphertext. See
 * below for the authentication tag.
 *
 * For encryption, the source is treated as the plaintext and the
 * destination is the ciphertext. For a decryption operation, the use is
 * reversed - the source is the ciphertext and the destination is the plaintext.
 *
 * The memory structure for cipher operation has the following structure:
 *
 * - AEAD encryption input:  assoc data || plaintext
 * - AEAD encryption output: assoc data || ciphertext || auth tag
 * - AEAD decryption input:  assoc data || ciphertext || auth tag
 * - AEAD decryption output: assoc data || plaintext
 *
 * Although the kernel requires the presence of the AAD buffer, it does not
 * fill the AAD buffer in the output case. If the caller wants to have that
 * data buffer filled, the caller must use an in-place cipher operation
 * (i.e. the same memory location for the input and the output).
 */
static inline void aead_request_set_crypt(struct aead_request *req,
                                          struct scatterlist *src,
                                          struct scatterlist *dst,
                                          unsigned int cryptlen, u8 *iv)
{
        /* Plain field stores; nothing is validated or copied here. */
        req->iv = iv;
        req->cryptlen = cryptlen;
        req->dst = dst;
        req->src = src;
}

/**
 * aead_request_set_ad - set associated data information
 * @req: request handle
 * @assoclen: number of bytes in associated data
 *
 * Setting the AD information.  This function sets the length of
 * the associated data.
 */
static inline void aead_request_set_ad(struct aead_request *req,
                                       unsigned int assoclen)
{
        /*
         * Only the length is recorded; the associated data itself sits at
         * the start of the scatterlists set by aead_request_set_crypt().
         */
        req->assoclen = assoclen;
}

#endif        /* _CRYPTO_AEAD_H */






























































  319 


   14 









   15 













   14 











   14 








   15 












   14 

















   14 
   13 
   14 
   14 

















  144 














  148 















  148 



















  148 








































































































































































































































   14 

   15 

   15 


   15 
   14 

   13 


   14 

   15 


   14 


   14 

   15 



   14 


   14 



   15 





















































































































































































































































































































































































  165 



  167 
   28 




























  320 

  319 


  263 
  316 






























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1994 Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 */
#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/sched.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/msr.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>

#include <uapi/asm/kvm.h>

#include <linux/hardirq.h>
#include <linux/pkeys.h>
#include <linux/vmalloc.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>

#ifdef CONFIG_X86_64
DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
DEFINE_PER_CPU(u64, xfd_state);
#endif

/* The FPU state configuration data for kernel and user space */
struct fpu_state_config        fpu_kernel_cfg __ro_after_init;
struct fpu_state_config fpu_user_cfg __ro_after_init;
struct vcpu_fpu_config guest_default_cfg __ro_after_init;

/*
 * Represents the initial FPU state. It's mostly (but not completely) zeroes,
 * depending on the FPU hardware format:
 */
struct fpstate init_fpstate __ro_after_init;

/*
 * Track FPU initialization and kernel-mode usage. 'true' means the FPU is
 * initialized and is not currently being used by the kernel:
 */
DEFINE_PER_CPU(bool, kernel_fpu_allowed);

/*
 * Track which context is using the FPU on the CPU:
 */
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * CONFIG_X86_DEBUG_FPU variant: warn once and return NULL when asked for the
 * FPU context of a kernel thread, otherwise return the address directly
 * behind the task_struct.
 */
struct fpu *x86_task_fpu(struct task_struct *task)
{
        if (WARN_ON_ONCE(task->flags & PF_KTHREAD))
                return NULL;

        /* One task_struct past @task, i.e. the first byte after it. */
        return (void *)(task + 1);
}
#endif

/*
 * Can we use the FPU in kernel mode with the
 * whole "kernel_fpu_begin/end()" sequence?
 */
bool irq_fpu_usable(void)
{
        /* Being called from NMI context is a bug: warn once and refuse. */
        if (WARN_ON_ONCE(in_nmi()))
                return false;

        /*
         * A single per-CPU boolean covers both reasons to refuse:
         *
         * - FPU is not yet initialized. This can happen only when the call is
         *   coming from CPU onlining, for example for microcode checksumming.
         * - The kernel is already using the FPU, either because of explicit
         *   nesting (which should never be done), or because of implicit
         *   nesting when a hardirq interrupted a kernel-mode FPU section.
         */
        if (!this_cpu_read(kernel_fpu_allowed))
                return false;

        /*
         * Outside of NMI and hard interrupt context the FPU can be used in:
         *
         * - Task context except from within fpregs_lock()'ed critical
         *   regions.
         *
         * - Soft interrupt processing context which cannot happen
         *   while in a fpregs_lock()'ed critical region.
         *
         * In hard interrupt context it's safe when soft interrupts
         * are enabled, which means the interrupt did not hit in
         * a fpregs_lock()'ed critical region.
         */
        return !in_hardirq() || !softirq_count();
}
EXPORT_SYMBOL(irq_fpu_usable);

/*
 * Track AVX512 state use because it is known to slow the max clock
 * speed of the core.
 */
static void update_avx_timestamp(struct fpu *fpu)
{

#define AVX512_TRACKING_MASK        (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)

        /* Nothing to record unless one of the tracked components is in use. */
        if (!(fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK))
                return;

        fpu->avx512_timestamp = jiffies;
}

/*
 * Save the FPU register state in fpu->fpstate->regs. The register state is
 * preserved.
 *
 * Must be called with fpregs_lock() held.
 *
 * The legacy FNSAVE instruction clears all FPU state unconditionally, so
 * register state has to be reloaded. That might be a pointless exercise
 * when the FPU is going to be used by another task right after that. But
 * this only affects 20+ years old 32bit systems and avoids conditionals all
 * over the place.
 *
 * FXSAVE and all XSAVE variants preserve the FPU register state.
 */
void save_fpregs_to_fpstate(struct fpu *fpu)
{
        if (likely(use_xsave())) {
                os_xsave(fpu->fpstate);
                update_avx_timestamp(fpu);
                return;
        }

        if (likely(use_fxsr())) {
                fxsave(&fpu->fpstate->regs.fxsave);
                return;
        }

        /*
         * Legacy FPU register saving, FNSAVE always clears FPU registers,
         * so we have to reload them from the memory state.
         */
        asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave));
        frstor(&fpu->fpstate->regs.fsave);
}

void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
{
        /*
         * AMD K7/K8 and later CPUs up to Zen don't save/restore
         * FDP/FIP/FOP unless an exception is pending. Clear the x87 state
         * here by setting it to fixed values.  "m" is a random variable
         * that should be in L1.
         */
        if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
                asm volatile(
                        "fnclex\n\t"
                        "emms\n\t"
                        "fildl %[addr]"        /* set F?P to defined value */
                        : : [addr] "m" (*fpstate));
        }

        /* Without XSAVE there is no feature mask: restore the legacy image. */
        if (!use_xsave()) {
                if (use_fxsr())
                        fxrstor(&fpstate->regs.fxsave);
                else
                        frstor(&fpstate->regs.fsave);
                return;
        }

        /*
         * Dynamically enabled features are enabled in XCR0, but
         * usage requires also that the corresponding bits in XFD
         * are cleared.  If the bits are set then using a related
         * instruction will raise #NM. This allows to do the
         * allocation of the larger FPU buffer lazy from #NM or if
         * the task has no permission to kill it which would happen
         * via #UD if the feature is disabled in XCR0.
         *
         * XFD state is following the same life time rules as
         * XSTATE and to restore state correctly XFD has to be
         * updated before XRSTORS otherwise the component would
         * stay in or go into init state even if the bits are set
         * in fpstate::regs::xsave::xfeatures.
         */
        xfd_update_state(fpstate);

        /*
         * Restoring state always needs to modify all features
         * which are in @mask even if the current task cannot use
         * extended features.
         *
         * So fpstate->xfeatures cannot be used here, because then
         * a feature for which the task has no permission but was
         * used by the previous task would not go into init state.
         */
        os_xrstor(fpstate, fpu_kernel_cfg.max_features & mask);
}

/*
 * Reload the FPU registers from init_fpstate for every feature contained
 * in XFEATURE_MASK_FPSTATE.
 */
void fpu_reset_from_exception_fixup(void)
{
        struct fpstate *init = &init_fpstate;

        restore_fpregs_from_fpstate(init, XFEATURE_MASK_FPSTATE);
}

#if IS_ENABLED(CONFIG_KVM)
static void __fpstate_reset(struct fpstate *fpstate);

/*
 * Set FPU_GUEST_PERM_LOCKED in the guest permission word of current's
 * thread group leader. The update is done under current's siglock.
 * Nothing happens unless CONFIG_X86_64 is enabled.
 */
static void fpu_lock_guest_permissions(void)
{
        struct fpu_state_perm *guest_perm;

        if (IS_ENABLED(CONFIG_X86_64)) {
                spin_lock_irq(&current->sighand->siglock);

                guest_perm = &x86_task_fpu(current->group_leader)->guest_perm;

                /* First fpstate allocation locks down permissions. */
                WRITE_ONCE(guest_perm->__state_perm,
                           guest_perm->__state_perm | FPU_GUEST_PERM_LOCKED);

                spin_unlock_irq(&current->sighand->siglock);
        }
}

/*
 * fpu_alloc_guest_fpstate - Allocate and initialize a guest fpstate
 * @gfpu:        Guest FPU container which receives the new fpstate
 *
 * The buffer is zero-allocated with vzalloc(). Its size is
 * guest_default_cfg.size plus the offset of the register area inside
 * struct fpstate, rounded up to a multiple of 64.
 *
 * On success @gfpu->fpstate, @gfpu->xfeatures and @gfpu->uabi_size are
 * filled in and the guest permissions are locked via
 * fpu_lock_guest_permissions().
 *
 * Return: true on success, false if the allocation failed.
 */
bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
{
        struct fpstate *fpstate;
        unsigned int size;

        size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64);

        fpstate = vzalloc(size);
        if (!fpstate)
                return false;

        /*
         * Initialize indicators to reflect properties of the fpstate.
         * is_guest has to be set before __fpstate_reset() is called,
         * because that function selects guest or host defaults based on it.
         */
        fpstate->is_valloc        = true;
        fpstate->is_guest        = true;

        __fpstate_reset(fpstate);
        fpstate_init_user(fpstate);

        gfpu->fpstate                = fpstate;
        gfpu->xfeatures                = guest_default_cfg.features;

        /*
         * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
         * to userspace, even when XSAVE is unsupported, so that restoring FPU
         * state on a different CPU that does support XSAVE can cleanly load
         * the incoming state using its natural XSAVE.  In other words, KVM's
         * uABI size may be larger than this host's default size.  Conversely,
         * the default size should never be larger than KVM's base uABI size;
         * all features that can expand the uABI size must be opt-in.
         */
        gfpu->uabi_size                = sizeof(struct kvm_xsave);
        /* Warn once and fall back to the larger size if that ever happens. */
        if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
                gfpu->uabi_size = fpu_user_cfg.default_size;

        fpu_lock_guest_permissions();

        return true;
}
EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);

/*
 * fpu_free_guest_fpstate - Release the fpstate attached to a guest FPU
 * @gfpu:        Guest FPU container whose fpstate is freed
 *
 * A NULL @gfpu->fpstate is silently ignored. The buffer is only freed when
 * it is marked as vmalloc'ed guest state and is not in use; otherwise a
 * one-time warning is emitted and the buffer is left alone.
 */
void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
{
        struct fpstate *fpstate = gfpu->fpstate;

        if (!fpstate)
                return;

        if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use))
                return;

        /* Clear the container's pointer before handing the buffer to vfree() */
        gfpu->fpstate = NULL;
        vfree(fpstate);
}
EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);

/**
 * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
 * @guest_fpu:        Pointer to the guest FPU container
 * @xfeatures:        Features requested by guest CPUID
 *
 * Enable all dynamic xfeatures according to guest perm and requested CPUID.
 * Features which are already set in @guest_fpu->xfeatures are filtered out
 * before the request is handed to __xfd_enable_feature().
 *
 * Return: 0 on success, error code otherwise
 */
int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
{
        u64 missing;

        lockdep_assert_preemption_enabled();

        /* Only the features which are not enabled yet need any work. */
        missing = xfeatures & ~guest_fpu->xfeatures;

        return missing ? __xfd_enable_feature(missing, guest_fpu) : 0;
}
EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);

#ifdef CONFIG_X86_64
/*
 * Store @xfd in the guest fpstate. If that fpstate is currently in use,
 * xfd_update_state() is invoked for it as well. Both steps run with
 * fpregs_lock() held.
 */
void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
{
        struct fpstate *gfps;

        fpregs_lock();

        gfps = guest_fpu->fpstate;
        gfps->xfd = xfd;
        if (gfps->in_use)
                xfd_update_state(gfps);

        fpregs_unlock();
}
EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);

/**
 * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
 *
 * Must be invoked from KVM after a VMEXIT before enabling interrupts when
 * XFD write emulation is disabled. The guest can freely modify XFD, so the
 * value found at VMEXIT is not guaranteed to match the one at VMENTER.
 * Software state therefore has to be refreshed before any operation which
 * depends on it can take place.
 *
 * Note: It can be invoked unconditionally even when write emulation is
 * enabled for the price of a then pointless MSR read.
 */
void fpu_sync_guest_vmexit_xfd_state(void)
{
        struct fpstate *cur_fps = x86_task_fpu(current)->fpstate;

        lockdep_assert_irqs_disabled();

        if (!fpu_state_size_dynamic())
                return;

        /* Pull the MSR value into the fpstate and mirror it per CPU */
        rdmsrq(MSR_IA32_XFD, cur_fps->xfd);
        __this_cpu_write(xfd_state, cur_fps->xfd);
}
EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
#endif /* CONFIG_X86_64 */

/*
 * fpu_swap_kvm_fpstate - Switch current between its task and guest fpstate
 * @guest_fpu:        Guest FPU container holding the guest fpstate
 * @enter_guest:      true to install the guest fpstate, false to switch back
 *                    to the fpstate remembered in fpu->__task_fpstate
 *
 * The whole operation runs with fpregs_lock() held. The registers of the
 * outgoing fpstate are saved first, unless that fpstate is confidential or
 * TIF_NEED_FPU_LOAD is set. After the swap the registers are reloaded from
 * the incoming fpstate, or only the XFD state is updated if the incoming
 * fpstate is confidential.
 *
 * Return: always 0
 */
int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
{
        struct fpstate *guest_fps = guest_fpu->fpstate;
        struct fpu *fpu = x86_task_fpu(current);
        struct fpstate *cur_fps = fpu->fpstate;

        fpregs_lock();
        if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
                save_fpregs_to_fpstate(fpu);

        /* Swap fpstate */
        if (enter_guest) {
                fpu->__task_fpstate = cur_fps;
                fpu->fpstate = guest_fps;
                guest_fps->in_use = true;
        } else {
                guest_fps->in_use = false;
                fpu->fpstate = fpu->__task_fpstate;
                fpu->__task_fpstate = NULL;
        }

        /* From here on cur_fps refers to the newly installed fpstate */
        cur_fps = fpu->fpstate;

        if (!cur_fps->is_confidential) {
                /* Includes XFD update */
                restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
        } else {
                /*
                 * XSTATE is restored by firmware from encrypted
                 * memory. Make sure XFD state is correct while
                 * running with guest fpstate
                 */
                xfd_update_state(cur_fps);
        }

        fpregs_mark_activate();
        fpregs_unlock();
        return 0;
}
EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);

/*
 * Copy the guest fpstate into the UABI buffer @buf of @size bytes.
 *
 * With XSAVE enabled the copy is done by __copy_xstate_to_uabi_buf() in
 * XSTATE_COPY_XSAVE mode, using @xfeatures and @pkru. Without XSAVE only
 * the FXSAVE image is copied and the header's xfeatures field is set to
 * FP+SSE.
 */
void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
                                    unsigned int size, u64 xfeatures, u32 pkru)
{
        struct membuf mb = { .p = buf, .left = size };
        struct fpstate *guest_fps = gfpu->fpstate;
        union fpregs_state *uabi = buf;

        if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
                memcpy(&uabi->fxsave, &guest_fps->regs.fxsave,
                       sizeof(uabi->fxsave));
                /* Make it restorable on a XSAVE enabled host */
                uabi->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
                return;
        }

        __copy_xstate_to_uabi_buf(mb, guest_fps, xfeatures, pkru,
                                  XSTATE_COPY_XSAVE);
}
EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);

/*
 * Copy the UABI formatted buffer @buf into the guest fpstate.
 *
 * Without XSAVE the header may only carry FP/SSE bits and MXCSR must fit
 * mxcsr_feature_mask; the FXSAVE image is then copied verbatim. With
 * XSAVE the header may only carry bits which are set in @xcr0 and the copy
 * is done by copy_uabi_from_kernel_to_xstate().
 *
 * Return: 0 or the result of copy_uabi_from_kernel_to_xstate() on the
 * XSAVE path, -EINVAL if one of the checks above fails.
 */
int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
                                   u64 xcr0, u32 *vpkru)
{
        const union fpregs_state *uabi = buf;
        struct fpstate *guest_fps = gfpu->fpstate;
        u64 hdr_features = uabi->xsave.header.xfeatures;

        if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
                if (hdr_features & ~xcr0)
                        return -EINVAL;

                /*
                 * Nullify @vpkru to preserve its current value if PKRU's
                 * bit isn't set in the header.  KVM's odd ABI is to leave
                 * PKRU untouched in this case (all other components are
                 * eventually re-initialized).
                 */
                if (!(hdr_features & XFEATURE_MASK_PKRU))
                        vpkru = NULL;

                return copy_uabi_from_kernel_to_xstate(guest_fps, uabi, vpkru);
        }

        /* Legacy FXSAVE only path */
        if (hdr_features & ~XFEATURE_MASK_FPSSE)
                return -EINVAL;
        if (uabi->fxsave.mxcsr & ~mxcsr_feature_mask)
                return -EINVAL;

        memcpy(&guest_fps->regs.fxsave, &uabi->fxsave, sizeof(uabi->fxsave));
        return 0;
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */

/*
 * kernel_fpu_begin_mask - Start a section in which the kernel uses the FPU
 * @kfpu_mask:        KFPU_MXCSR and/or KFPU_387, selecting which control
 *                    registers are set to initial values
 *
 * fpregs_lock() is only taken when interrupts are enabled on entry. The
 * per CPU kernel_fpu_allowed flag is set to false for the duration of the
 * section. For tasks which are neither PF_KTHREAD nor PF_USER_WORKER and
 * which do not have TIF_NEED_FPU_LOAD set, the flag is set and the
 * registers are saved to current's fpstate before they get clobbered.
 */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
        if (!irqs_disabled())
                fpregs_lock();

        WARN_ON_FPU(!irq_fpu_usable());

        /* Toggle kernel_fpu_allowed to false: */
        WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed));
        this_cpu_write(kernel_fpu_allowed, false);

        if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
            !test_thread_flag(TIF_NEED_FPU_LOAD)) {
                set_thread_flag(TIF_NEED_FPU_LOAD);
                save_fpregs_to_fpstate(x86_task_fpu(current));
        }
        __cpu_invalidate_fpregs_state();

        /* Put sane initial values into the control registers. */
        if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
                ldmxcsr(MXCSR_DEFAULT);

        if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
                asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);

/*
 * kernel_fpu_end - End a section in which the kernel uses the FPU
 *
 * Sets the per CPU kernel_fpu_allowed flag back to true and drops
 * fpregs_lock() unless interrupts are disabled, which mirrors the
 * conditional locking in kernel_fpu_begin_mask().
 */
void kernel_fpu_end(void)
{
        /* Toggle kernel_fpu_allowed back to true: */
        WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed));
        this_cpu_write(kernel_fpu_allowed, true);

        if (!irqs_disabled())
                fpregs_unlock();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);

/*
 * Sync the FPU register state to current's memory register state when the
 * current task owns the FPU. The hardware register state is preserved.
 *
 * @fpu must be the FPU of current; anything else triggers WARN_ON_FPU().
 */
void fpu_sync_fpstate(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != x86_task_fpu(current));

        fpregs_lock();
        trace_x86_fpu_before_save(fpu);

        /* With TIF_NEED_FPU_LOAD set no register save is performed */
        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                save_fpregs_to_fpstate(fpu);

        trace_x86_fpu_after_save(fpu);
        fpregs_unlock();
}

/*
 * Number of bytes to copy out of init_fpstate.regs when resetting a
 * task's register image to the init state.
 */
static inline unsigned int init_fpstate_copy_size(void)
{
        /* XSAVE(S) just needs the legacy and the xstate header part */
        if (use_xsave())
                return sizeof(init_fpstate.regs.xsave);

        return fpu_kernel_cfg.default_size;
}

/*
 * FXSAVE format fpstate init: set MXCSR to MXCSR_DEFAULT and the control
 * word to 0x37f.
 */
static inline void fpstate_init_fxstate(struct fpstate *fpstate)
{
        fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT;
        fpstate->regs.fxsave.cwd = 0x37f;
}

/*
 * Legacy x87 (FSAVE format) fpstate init: fill in the tag word, the
 * operand selector, the control word and the status word.
 */
static inline void fpstate_init_fstate(struct fpstate *fpstate)
{
        fpstate->regs.fsave.twd = 0xffffffffu;
        fpstate->regs.fsave.fos = 0xffff0000u;
        fpstate->regs.fsave.cwd = 0xffff037fu;
        fpstate->regs.fsave.swd = 0xffff0000u;
}

/*
 * Initialize the register image of @fpstate.
 *
 * Used in two places:
 * 1) Early boot to setup init_fpstate for non XSAVE systems
 * 2) fpu_alloc_guest_fpstate() which is invoked from KVM
 */
void fpstate_init_user(struct fpstate *fpstate)
{
        if (cpu_feature_enabled(X86_FEATURE_FPU)) {
                xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures);

                /* Pick the legacy layout which matches the CPU */
                if (!cpu_feature_enabled(X86_FEATURE_FXSR))
                        fpstate_init_fstate(fpstate);
                else
                        fpstate_init_fxstate(fpstate);
        } else {
                /* No FPU: only the soft state gets initialized */
                fpstate_init_soft(&fpstate->regs.soft);
        }
}

/*
 * Reset the size, feature and XFD bookkeeping of @fpstate to the defaults.
 * fpstate->is_guest selects between guest and host defaults.
 */
static void __fpstate_reset(struct fpstate *fpstate)
{
        /*
         * User features and sizes are always identical for guest and
         * host FPUs, which allows for common guest and userspace ABI.
         */
        fpstate->user_size        = fpu_user_cfg.default_size;
        fpstate->user_xfeatures        = fpu_user_cfg.default_features;

        /*
         * Supervisor features (and thus sizes) may diverge between guest
         * FPUs and host FPUs, as some supervisor features are supported
         * for guests despite not being utilized by the host.
         *
         * For the host, set XFD to the kernel's desired initialization
         * value. For guests, set XFD to its architectural RESET value.
         */
        if (!fpstate->is_guest) {
                fpstate->size                = fpu_kernel_cfg.default_size;
                fpstate->xfeatures        = fpu_kernel_cfg.default_features;
                fpstate->xfd                = init_fpstate.xfd;
        } else {
                fpstate->size                = guest_default_cfg.size;
                fpstate->xfeatures        = guest_default_cfg.features;
                fpstate->xfd                = 0;
        }
}

void fpstate_reset(struct fpu *fpu)
{
        /* Set the fpstate pointer to the default fpstate */
        fpu->fpstate = &fpu->__fpstate;
        __fpstate_reset(fpu->fpstate);

        /* Initialize the permission related info in fpu */
        fpu->perm.__state_perm                = fpu_kernel_cfg.default_features;
        fpu->perm.__state_size                = fpu_kernel_cfg.default_size;
        fpu->perm.__user_state_size        = fpu_user_cfg.default_size;

        fpu->guest_perm.__state_perm        = guest_default_cfg.features;
        fpu->guest_perm.__state_size        = guest_default_cfg.size;
        /*
         * User features and sizes are always identical between host and
         * guest FPUs, which allows for common guest and userspace ABI.
         */
        fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size;
}

/*
 * Copy the host and guest permissions of current's thread group leader
 * into @dst_fpu. Only does something when fpu_state_size_dynamic() is
 * true. The copy is done under current's siglock.
 */
static inline void fpu_inherit_perms(struct fpu *dst_fpu)
{
        struct fpu *leader_fpu;

        if (!fpu_state_size_dynamic())
                return;

        leader_fpu = x86_task_fpu(current->group_leader);

        spin_lock_irq(&current->sighand->siglock);
        /* Fork also inherits the permissions of the parent */
        dst_fpu->perm = leader_fpu->perm;
        dst_fpu->guest_perm = leader_fpu->guest_perm;
        spin_unlock_irq(&current->sighand->siglock);
}

/*
 * Write @ssp into the CET user state of @dst's fpstate.
 *
 * A passed ssp of zero will not cause any update. Without
 * CONFIG_X86_USER_SHADOW_STACK the function does nothing.
 *
 * Return: 0 on success or when nothing had to be done, 1 if the CET user
 * state could not be located in the xsave buffer.
 */
static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
{
#ifdef CONFIG_X86_USER_SHADOW_STACK
        struct cet_user_state *xstate;

        /* If ssp update is not needed. */
        if (!ssp)
                return 0;

        xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave,
                                XFEATURE_CET_USER);

        /*
         * If there is a non-zero ssp, then 'dst' must be configured with a shadow
         * stack and the fpu state should be up to date since it was just copied
         * from the parent in fpu_clone(). So there must be a valid non-init CET
         * state location in the buffer.
         */
        if (WARN_ON_ONCE(!xstate))
                return 1;

        xstate->user_ssp = (u64)ssp;
#endif
        return 0;
}

/*
 * fpu_clone - Clone current's FPU state on fork
 * @dst:          The new task
 * @clone_flags:  Clone flags; only CLONE_THREAD is evaluated here, to
 *                decide whether permissions are inherited
 * @minimal:      If true, @dst gets a copy of init_fpstate instead of
 *                current's register state
 * @ssp:          Shadow stack pointer for @dst; zero means no update
 *
 * Return: 0 on success, 1 if the shadow stack pointer update failed.
 */
int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal,
              unsigned long ssp)
{
        /*
         * We allocate the new FPU structure right after the end of the task struct.
         * task allocation size already took this into account.
         *
         * This is safe because task_struct size is a multiple of cacheline size,
         * thus x86_task_fpu() will always be cacheline aligned as well.
         */
        struct fpu *dst_fpu = (void *)dst + sizeof(*dst);

        BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);

        /* The new task's FPU state cannot be valid in the hardware. */
        dst_fpu->last_cpu = -1;

        fpstate_reset(dst_fpu);

        /* Without an FPU there is no register state to set up */
        if (!cpu_feature_enabled(X86_FEATURE_FPU))
                return 0;

        /*
         * Enforce reload for user space tasks and prevent kernel threads
         * from trying to save the FPU registers on context switch.
         */
        set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);

        /*
         * No FPU state inheritance for kernel threads and IO
         * worker threads.
         */
        if (minimal) {
                /* Clear out the minimal state */
                memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
                       init_fpstate_copy_size());
                return 0;
        }

        /*
         * If a new feature is added, ensure all dynamic features are
         * caller-saved from here!
         */
        BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);

        /*
         * Save the default portion of the current FPU state into the
         * clone. Assume all dynamic features to be defined as caller-
         * saved, which enables skipping both the expansion of fpstate
         * and the copying of any dynamic state.
         *
         * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because
         * copying is not valid when current uses non-default states.
         */
        fpregs_lock();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();
        save_fpregs_to_fpstate(dst_fpu);
        fpregs_unlock();
        /* Permissions are only copied when CLONE_THREAD is not set */
        if (!(clone_flags & CLONE_THREAD))
                fpu_inherit_perms(dst_fpu);

        /*
         * Children never inherit PASID state.
         * Force it to have its init value:
         */
        if (use_xsave())
                dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID;

        /*
         * Update shadow stack pointer, in case it changed during clone.
         */
        if (update_fpu_shstk(dst, ssp))
                return 1;

        trace_x86_fpu_copy_dst(dst_fpu);

        return 0;
}

/*
 * Report the location and size of the FPU register area.
 *
 * struct fpu is no longer a member of struct thread_struct, but it is
 * still allocated right behind struct task_struct in the "task_struct"
 * kmem cache. As the FPU is expected to be part of struct thread_struct,
 * the offset is adjusted here to be relative to the 'thread' member.
 */
void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        unsigned long regs_off;

        /* The allocation follows struct task_struct. */
        regs_off = sizeof(struct task_struct) - offsetof(struct task_struct, thread);
        /* Skip ahead to the register area inside struct fpu */
        regs_off += offsetof(struct fpu, __fpstate.regs);

        *offset = regs_off;
        *size = fpu_kernel_cfg.default_size;
}

/*
 * Drops current FPU state: deactivates the fpregs and
 * the fpstate. NOTE: it still leaves previous contents
 * in the fpregs in the eager-FPU case.
 *
 * This function can be used in cases where we know that
 * a state-restore is coming: either an explicit one,
 * or a reschedule.
 *
 * Nothing is done when @tsk has TIF_NEED_FPU_LOAD set. The registers are
 * only deactivated when @tsk's FPU is the FPU of current.
 */
void fpu__drop(struct task_struct *tsk)
{
        struct fpu *fpu;

        if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD))
                return;

        fpu = x86_task_fpu(tsk);

        preempt_disable();

        if (fpu == x86_task_fpu(current)) {
                /*
                 * Ignore delayed exceptions from user space: a fault on
                 * the fwait at label 1 is fixed up to continue at label 2.
                 */
                asm volatile("1: fwait\n"
                             "2:\n"
                             _ASM_EXTABLE(1b, 2b));
                fpregs_deactivate(fpu);
        }

        trace_x86_fpu_dropped(fpu);

        preempt_enable();
}

/*
 * Load the FPU registers from init_fpstate, using the restore instruction
 * which matches the CPU, and write the default PKRU value afterwards.
 * @features_mask is only used on the XSAVE path.
 *
 * Caller must do fpregs_[un]lock() around it.
 */
static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
{
        if (!use_xsave()) {
                if (use_fxsr())
                        fxrstor(&init_fpstate.regs.fxsave);
                else
                        frstor(&init_fpstate.regs.fsave);
        } else {
                os_xrstor(&init_fpstate, features_mask);
        }

        pkru_write_default();
}

/*
 * Reset current->fpu memory state to the init values.
 *
 * Runs with fpregs_lock() held: invalidates the register state tracking
 * for current's FPU, copies init_fpstate.regs over the task's register
 * image and sets TIF_NEED_FPU_LOAD.
 */
static void fpu_reset_fpstate_regs(void)
{
        struct fpu *fpu = x86_task_fpu(current);

        fpregs_lock();
        __fpu_invalidate_fpregs_state(fpu);
        /*
         * This does not change the actual hardware registers. It just
         * resets the memory image and sets TIF_NEED_FPU_LOAD so a
         * subsequent return to usermode will reload the registers from the
         * task's memory image.
         *
         * Do not use fpstate_init() here. Just copy init_fpstate which has
         * the correct content already except for PKRU.
         *
         * PKRU handling does not rely on the xstate when restoring for
         * user space as PKRU is eagerly written in switch_to() and
         * flush_thread().
         */
        memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size());
        set_thread_flag(TIF_NEED_FPU_LOAD);
        fpregs_unlock();
}

/*
 * Reset current's user FPU states to the init states.  current's
 * supervisor states, if any, are not modified by this function.  The
 * caller guarantees that the XSTATE header in memory is intact.
 *
 * @fpu must be the FPU of current; anything else triggers WARN_ON_FPU().
 */
void fpu__clear_user_states(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != x86_task_fpu(current));

        fpregs_lock();
        /* Without an FPU only the memory image is reset */
        if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
                fpu_reset_fpstate_regs();
                fpregs_unlock();
                return;
        }

        /*
         * Ensure that current's supervisor states are loaded into their
         * corresponding registers.
         */
        if (xfeatures_mask_supervisor() &&
            !fpregs_state_valid(fpu, smp_processor_id()))
                os_xrstor_supervisor(fpu->fpstate);

        /* Ensure XFD state is in sync before reloading XSTATE */
        xfd_update_state(fpu->fpstate);

        /* Reset user states in registers. */
        restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);

        /*
         * Now all FPU registers have their desired values.  Inform the FPU
         * state machine that current's FPU registers are in the hardware
         * registers. The memory image does not need to be updated because
         * any operation relying on it has to save the registers first when
         * current's FPU is marked active.
         */
        fpregs_mark_activate();
        fpregs_unlock();
}

/*
 * Reset current's fpstate bookkeeping to the defaults and then reset its
 * register memory image to the init values.
 */
void fpu_flush_thread(void)
{
        struct fpu *fpu = x86_task_fpu(current);

        fpstate_reset(fpu);
        fpu_reset_fpstate_regs();
}
/*
 * Load FPU context before returning to userspace.
 * Does nothing on CPUs without X86_FEATURE_FPU.
 */
void switch_fpu_return(void)
{
        if (static_cpu_has(X86_FEATURE_FPU))
                fpregs_restore_userregs();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);

/*
 * Take fpregs_lock() and make sure current's user register state is
 * loaded: if TIF_NEED_FPU_LOAD is set the registers are restored via
 * fpregs_restore_userregs().
 *
 * The function returns with fpregs_lock() still held; no unlock is done
 * here.
 */
void fpregs_lock_and_load(void)
{
        /*
         * fpregs_lock() only disables preemption (mostly). So modifying state
         * in an interrupt could screw up some in progress fpregs operation.
         * Warn about it.
         */
        WARN_ON_ONCE(!irq_fpu_usable());
        /* Kernel threads are not expected here */
        WARN_ON_ONCE(current->flags & PF_KTHREAD);

        fpregs_lock();

        fpregs_assert_state_consistent();

        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();
}

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Consistency check for current's FPU state tracking: when the FPU context
 * is not valid on this CPU according to the tracking, TIF_NEED_FPU_LOAD
 * has to be set so that the context gets loaded on return to userland.
 * Warn if neither holds.
 */
void fpregs_assert_state_consistent(void)
{
        struct fpu *fpu = x86_task_fpu(current);

        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
}
EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
#endif

/*
 * Mark current's FPU registers as active: call fpregs_activate() for
 * current's FPU, record the executing CPU in fpu->last_cpu and clear
 * TIF_NEED_FPU_LOAD.
 */
void fpregs_mark_activate(void)
{
        struct fpu *fpu = x86_task_fpu(current);

        fpregs_activate(fpu);
        fpu->last_cpu = smp_processor_id();
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

/*
 * x87 math exception handling:
 */

/*
 * Translate the pending unmasked FPU exception of @fpu into a FPE_* code.
 *
 * For X86_TRAP_MF the x87 control and status words are evaluated, for any
 * other @trap_nr MXCSR is evaluated.
 *
 * Return: FPE_FLTINV, FPE_FLTDIV, FPE_FLTOVF, FPE_FLTUND or FPE_FLTRES,
 * checked in that order, or 0 if no unmasked exception bit is pending.
 */
int fpu__exception_code(struct fpu *fpu, int trap_nr)
{
        int pending;

        if (trap_nr != X86_TRAP_MF) {
                /*
                 * The SIMD FPU exceptions are handled a little differently, as there
                 * is only a single status/control register.  Thus, to determine which
                 * unmasked exception was caught we must mask the exception mask bits
                 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
                 */
                unsigned short mxcsr = MXCSR_DEFAULT;

                if (boot_cpu_has(X86_FEATURE_XMM))
                        mxcsr = fpu->fpstate->regs.fxsave.mxcsr;

                pending = ~(mxcsr >> 7) & mxcsr;
        } else {
                unsigned short cwd, swd;

                /*
                 * (~cwd & swd) will mask out exceptions that are not set to unmasked
                 * status.  0x3f is the exception bits in these regs, 0x200 is the
                 * C1 reg you need in case of a stack fault, 0x040 is the stack
                 * fault bit.  We should only be taking one exception at a time,
                 * so if this combination doesn't produce any single exception,
                 * then we have a bad program that isn't synchronizing its FPU usage
                 * and it will suffer the consequences since we won't be able to
                 * fully reproduce the context of the exception.
                 */
                if (boot_cpu_has(X86_FEATURE_FXSR)) {
                        cwd = fpu->fpstate->regs.fxsave.cwd;
                        swd = fpu->fpstate->regs.fxsave.swd;
                } else {
                        cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd;
                        swd = (unsigned short)fpu->fpstate->regs.fsave.swd;
                }

                pending = swd & ~cwd;
        }

        /*
         * Invalid op:
         * swd & 0x240 == 0x040: Stack Underflow
         * swd & 0x240 == 0x240: Stack Overflow
         * User must clear the SF bit (0x40) if set
         */
        if (pending & 0x001)
                return FPE_FLTINV;

        /* Divide by Zero */
        if (pending & 0x004)
                return FPE_FLTDIV;

        /* Overflow */
        if (pending & 0x008)
                return FPE_FLTOVF;

        /* Denormal, Underflow */
        if (pending & 0x012)
                return FPE_FLTUND;

        /* Precision */
        if (pending & 0x020)
                return FPE_FLTRES;

        /*
         * If we're using IRQ 13, or supposedly even some trap
         * X86_TRAP_MF implementations, it's possible
         * we get a spurious trap, which is not an error.
         */
        return 0;
}

/*
 * Initialize register state that may prevent from entering low-power idle.
 * This function will be invoked from the cpuidle driver only when needed.
 *
 * If AMX tile state is in use, it is released and the per CPU FPU register
 * owner is cleared.
 */
noinstr void fpu_idle_fpregs(void)
{
        /* Note: AMX_TILE being enabled implies XGETBV1 support */
        if (!cpu_feature_enabled(X86_FEATURE_AMX_TILE))
                return;

        if (!(xfeatures_in_use() & XFEATURE_MASK_XTILE))
                return;

        tile_release();
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}





































































   13 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * kernel/workqueue_internal.h
 *
 * Workqueue internal header file.  Only to be included by workqueue and
 * core kernel subsystems.
 */
#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
#define _KERNEL_WORKQUEUE_INTERNAL_H

#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/preempt.h>

struct worker_pool;

/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers are
 * either serving the manager role, on idle list or on busy hash.  For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * The letter in front of each field comment below is that locking
 * annotation.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
        /*
         * on idle list while idle, on busy hash table while busy.
         * Both linkage members share the same storage.
         */
        union {
                struct list_head        entry;        /* L: while idle */
                struct hlist_node        hentry;        /* L: while busy */
        };

        /* description of the work item which is currently being processed */
        struct work_struct        *current_work;        /* K: work being processed and its */
        work_func_t                current_func;        /* K: function */
        struct pool_workqueue        *current_pwq;        /* K: pwq */
        u64                        current_at;        /* K: runtime at start or last wakeup */
        unsigned int                current_color;        /* K: color */

        int                        sleeping;        /* S: is worker sleeping? */

        /* used by the scheduler to determine a worker's last known identity */
        work_func_t                last_func;        /* K: last work's fn */

        struct list_head        scheduled;        /* L: scheduled works */

        struct task_struct        *task;                /* I: worker task */
        struct worker_pool        *pool;                /* A: the associated pool */
                                                /* L: for rescuers */
        struct list_head        node;                /* A: anchored at pool->workers */
                                                /* A: runs through worker->node */

        unsigned long                last_active;        /* K: last active timestamp */
        unsigned int                flags;                /* L: flags */
        int                        id;                /* I: worker id */

        /*
         * Opaque string set with work_set_desc().  Printed out with task
         * dump for debugging - WARN, BUG, panic or sysrq.
         */
        char                        desc[WORKER_DESC_LEN];

        /* used only by rescuers to point to the target workqueue */
        struct workqueue_struct        *rescue_wq;        /* I: the workqueue to rescue */
};

/**
 * current_wq_worker - return struct worker if %current is a workqueue worker
 *
 * Return: the kthread_data() of %current when called in task context with
 * PF_WQ_WORKER set in %current->flags, %NULL otherwise.
 */
static inline struct worker *current_wq_worker(void)
{
        if (!in_task())
                return NULL;

        if (!(current->flags & PF_WQ_WORKER))
                return NULL;

        return kthread_data(current);
}

/*
 * Scheduler hooks for concurrency managed workqueue.  Only to be used from
 * sched/ and workqueue.c.
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  (C) 2002  David Woodhouse <dwmw2@infradead.org>
  (C) 2012  Michel Lespinasse <walken@google.com>


  linux/include/linux/rbtree_augmented.h
*/

#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H

#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>

/*
 * Please note - only struct rb_augment_callbacks and the prototypes for
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
 * See Documentation/core-api/rbtree.rst for documentation and samples.
 */

/*
 * Set of user-supplied hooks used to keep per-node augmented data in sync
 * while the tree is modified. Each member is a plain function pointer.
 */
struct rb_augment_callbacks {
        /* Invoked with a starting node and the node at which to stop. */
        void (*propagate)(struct rb_node *node, struct rb_node *stop);
        /* Invoked with the node being replaced (old) and its replacement (new). */
        void (*copy)(struct rb_node *old, struct rb_node *new);
        /* Handed to the rebalancing code as its augment_rotate callback. */
        void (*rotate)(struct rb_node *old, struct rb_node *new);
};

extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Rebalance the rbtree after an insertion while keeping augmented data valid.
 *
 * Steps expected from the caller on insertion: update the augmented
 * information along the path leading to the new node, link it with
 * rb_link_node() as usual, then call rb_insert_augmented() in place of
 * rb_insert_color(). Whenever rebalancing takes place, the user-provided
 * rotate callback is invoked so the augmented information of the affected
 * subtrees can be refreshed.
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
                    const struct rb_augment_callbacks *augment)
{
        void (*on_rotate)(struct rb_node *old, struct rb_node *new);

        on_rotate = augment->rotate;
        __rb_insert_augmented(node, root, on_rotate);
}

/*
 * Cached-root flavour of rb_insert_augmented(): when @newleft is set, @node
 * is recorded as the cached leftmost node before the insertion is performed
 * on the embedded rb_root.
 */
static inline void
rb_insert_augmented_cached(struct rb_node *node,
                           struct rb_root_cached *root, bool newleft,
                           const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (newleft)
                root->rb_leftmost = node;

        rb_insert_augmented(node, plain_root, augment);
}

/*
 * Find the insertion slot for @node using @less, link it there, and insert it
 * into the cached augmented tree.
 *
 * Returns @node when it became the leftmost node of the tree, NULL otherwise.
 */
static __always_inline struct rb_node *
rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
                        bool (*less)(struct rb_node *, const struct rb_node *),
                        const struct rb_augment_callbacks *augment)
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *above = NULL;
        bool is_leftmost = true;

        /* Descend until an empty child slot is reached. */
        for (; *slot; ) {
                above = *slot;
                if (!less(node, above)) {
                        /* Going right at least once: cannot be leftmost. */
                        is_leftmost = false;
                        slot = &above->rb_right;
                        continue;
                }
                slot = &above->rb_left;
        }

        rb_link_node(node, above, slot);
        augment->propagate(above, NULL); /* suboptimal */
        rb_insert_augmented_cached(node, tree, is_leftmost, augment);

        if (!is_leftmost)
                return NULL;
        return node;
}

/*
 * Template for declaring augmented rbtree callbacks (generic case)
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
 *
 * The expansion defines three functions and one structure:
 *
 * RBNAME_propagate(rb, stop): starting at @rb, calls RBCOMPUTE(node, true) on
 *   each node and moves to its parent via rb_parent(); the walk ends when @stop
 *   is reached or as soon as RBCOMPUTE() returns a true value.
 * RBNAME_copy(rb_old, rb_new): assigns the old node's RBAUGMENTED value to the
 *   new node.
 * RBNAME_rotate(rb_old, rb_new): performs the same assignment, then calls
 *   RBCOMPUTE(old, false) on the old node.
 * RBNAME: a const struct rb_augment_callbacks (with RBSTATIC storage) whose
 *   members point to the three functions above.
 *
 * RBCOMPUTE is therefore invoked with two arguments: the RBSTRUCT pointer and
 * a boolean, and its result is used as the early-exit condition in propagate.
 */

#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                \
                             RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)        \
static inline void                                                        \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)                \
{                                                                        \
        while (rb != stop) {                                                \
                RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);        \
                if (RBCOMPUTE(node, true))                                \
                        break;                                                \
                rb = rb_parent(&node->RBFIELD);                                \
        }                                                                \
}                                                                        \
static inline void                                                        \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)                \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
}                                                                        \
static void                                                                \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)        \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
        RBCOMPUTE(old, false);                                                \
}                                                                        \
RBSTATIC const struct rb_augment_callbacks RBNAME = {                        \
        .propagate = RBNAME ## _propagate,                                \
        .copy = RBNAME ## _copy,                                        \
        .rotate = RBNAME ## _rotate                                        \
};

/*
 * Template for declaring augmented rbtree callbacks,
 * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBTYPE:      type of the RBAUGMENTED field
 * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
 *
 * The expansion defines RBNAME_compute_max(node, exit), which takes the
 * maximum of RBCOMPUTE(node) and the RBAUGMENTED value of each existing child.
 * When @exit is true and the node already holds that maximum, it returns true
 * without writing; otherwise it stores the maximum and returns false.
 * That helper is then passed as RBCOMPUTE to RB_DECLARE_CALLBACKS().
 */

#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,              \
                                 RBTYPE, RBAUGMENTED, RBCOMPUTE)              \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)              \
{                                                                              \
        RBSTRUCT *child;                                                      \
        RBTYPE max = RBCOMPUTE(node);                                              \
        if (node->RBFIELD.rb_left) {                                              \
                child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (node->RBFIELD.rb_right) {                                              \
                child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (exit && node->RBAUGMENTED == max)                                      \
                return true;                                                      \
        node->RBAUGMENTED = max;                                              \
        return false;                                                              \
}                                                                              \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                              \
                     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)


/* Node colors, as stored in bit 0 of __rb_parent_color. */
#define        RB_RED                0
#define        RB_BLACK        1

/*
 * Helpers taking a raw __rb_parent_color word (pc): masking off the two low
 * bits yields the parent pointer, bit 0 yields the color.
 *
 * The argument is parenthesized in every expansion so that a compound
 * expression such as "a | b" is evaluated as a whole before the mask is
 * applied, rather than being split by operator precedence.
 */
#define __rb_parent(pc)    ((struct rb_node *)((pc) & ~3))

#define __rb_color(pc)     ((pc) & 1)
#define __rb_is_black(pc)  __rb_color(pc)
#define __rb_is_red(pc)    (!__rb_color(pc))

/* Same queries, taking a struct rb_node pointer instead of the raw word. */
#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)

/* Point @rb at parent @p while keeping the color bit @rb currently has. */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
        unsigned long kept_color = rb_color(rb);

        rb->__rb_parent_color = kept_color + (unsigned long)p;
}

/* Store parent @p and @color together in @rb's __rb_parent_color word. */
static inline void rb_set_parent_color(struct rb_node *rb,
                                       struct rb_node *p, int color)
{
        unsigned long packed = (unsigned long)p;

        packed += color;
        rb->__rb_parent_color = packed;
}

/*
 * Make @new occupy the link that currently leads to @old: the matching child
 * pointer of @parent, or the root pointer when @parent is NULL. The store is
 * done with WRITE_ONCE().
 */
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
                  struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                WRITE_ONCE(root->rb_node, new);
                return;
        }

        if (parent->rb_left != old)
                WRITE_ONCE(parent->rb_right, new);
        else
                WRITE_ONCE(parent->rb_left, new);
}

/*
 * Same replacement as __rb_change_child(), but the link is published with
 * rcu_assign_pointer() instead of WRITE_ONCE().
 */
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
                      struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                rcu_assign_pointer(root->rb_node, new);
                return;
        }

        if (parent->rb_left != old)
                rcu_assign_pointer(parent->rb_right, new);
        else
                rcu_assign_pointer(parent->rb_left, new);
}

extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Unlink @node from the tree at @root, using the copy and propagate callbacks
 * of @augment to keep augmented data updated.
 *
 * Returns the node from which rebalancing must still be done (the caller
 * passes it to __rb_erase_color()), or NULL when no rebalancing is needed.
 */
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                     const struct rb_augment_callbacks *augment)
{
        struct rb_node *child = node->rb_right;
        struct rb_node *tmp = node->rb_left;
        struct rb_node *parent, *rebalance;
        unsigned long pc;

        if (!tmp) {
                /*
                 * Case 1: node to erase has no more than 1 child (easy!)
                 *
                 * Note that if there is one child it must be red due to 5)
                 * and node must be black due to 4). We adjust colors locally
                 * so as to bypass __rb_erase_color() later on.
                 */
                pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, child, parent, root);
                if (child) {
                        /* The child inherits node's parent and color. */
                        child->__rb_parent_color = pc;
                        rebalance = NULL;
                } else
                        rebalance = __rb_is_black(pc) ? parent : NULL;
                /* tmp is where the final propagate below starts. */
                tmp = parent;
        } else if (!child) {
                /* Still case 1, but this time the child is node->rb_left */
                tmp->__rb_parent_color = pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, tmp, parent, root);
                rebalance = NULL;
                tmp = parent;
        } else {
                struct rb_node *successor = child, *child2;

                tmp = child->rb_left;
                if (!tmp) {
                        /*
                         * Case 2: node's successor is its right child
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (s)  ->  (x) (c)
                         *        \
                         *        (c)
                         */
                        parent = successor;
                        child2 = successor->rb_right;

                        augment->copy(node, successor);
                } else {
                        /*
                         * Case 3: node's successor is leftmost under
                         * node's right child subtree
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (y)  ->  (x) (y)
                         *      /            /
                         *    (p)          (p)
                         *    /            /
                         *  (s)          (c)
                         *    \
                         *    (c)
                         */
                        do {
                                parent = successor;
                                successor = tmp;
                                tmp = tmp->rb_left;
                        } while (tmp);
                        child2 = successor->rb_right;
                        WRITE_ONCE(parent->rb_left, child2);
                        WRITE_ONCE(successor->rb_right, child);
                        rb_set_parent(child, successor);

                        augment->copy(node, successor);
                        augment->propagate(parent, successor);
                }

                /* Move node's left subtree under the successor. */
                tmp = node->rb_left;
                WRITE_ONCE(successor->rb_left, tmp);
                rb_set_parent(tmp, successor);

                /* Put the successor in node's place under node's parent. */
                pc = node->__rb_parent_color;
                tmp = __rb_parent(pc);
                __rb_change_child(node, successor, tmp, root);

                if (child2) {
                        rb_set_parent_color(child2, parent, RB_BLACK);
                        rebalance = NULL;
                } else {
                        /* Read the successor's color before it is overwritten below. */
                        rebalance = rb_is_black(successor) ? parent : NULL;
                }
                successor->__rb_parent_color = pc;
                tmp = successor;
        }

        augment->propagate(tmp, NULL);
        return rebalance;
}

/*
 * Remove @node from the augmented tree at @root. When the unlink step reports
 * a node to rebalance from, the tree is fixed up with the rotate callback.
 */
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                   const struct rb_augment_callbacks *augment)
{
        struct rb_node *fixup_from;

        fixup_from = __rb_erase_augmented(node, root, augment);
        if (!fixup_from)
                return;

        __rb_erase_color(fixup_from, root, augment->rotate);
}

static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
                          const struct rb_augment_callbacks *augment)
{
        if (root->rb_leftmost == node)
                root->rb_leftmost = rb_next(node);
        rb_erase_augmented(node, &root->rb_root, augment);
}

#endif        /* _LINUX_RBTREE_AUGMENTED_H */






























































































































   22 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BITOPS_H
#define _LINUX_BITOPS_H

#include <asm/types.h>
#include <linux/bits.h>
#include <linux/typecheck.h>

#include <uapi/linux/kernel.h>

/*
 * Convert a bit count @nr into a count of longs / u64s / u32s / bytes by
 * dividing by the bit width of the target type with __KERNEL_DIV_ROUND_UP()
 * (defined outside this file; presumably rounds up, as its name says).
 */
#define BITS_TO_LONGS(nr)        __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
#define BITS_TO_U64(nr)                __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64))
#define BITS_TO_U32(nr)                __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32))
#define BITS_TO_BYTES(nr)        __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char))

/* Convert a byte count @nb into a bit count. */
#define BYTES_TO_BITS(nb)        ((nb) * BITS_PER_BYTE)

extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned long __sw_hweight64(__u64 w);

/*
 * Defined here because those may be needed by architecture-specific static
 * inlines.
 */

#include <asm-generic/bitops/generic-non-atomic.h>

/*
 * Many architecture-specific non-atomic bitops contain inline asm code and due
 * to that the compiler can't optimize them to compile-time expressions or
 * constants. On the contrary, generic_*() helpers are defined in pure C and
 * compilers optimize them just well.
 * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively
 * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when
 * the arguments can be resolved at compile time. That expression itself is a
 * constant and doesn't bring any functional changes to the rest of cases.
 * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when
 * passing a bitmap from .bss or .data (-> `!!addr` is always true).
 *
 * Expands to const##op(nr, addr) when @nr is a compile-time constant, @addr is
 * known at compile time to be non-NULL, and the word at @addr is a compile-time
 * constant; otherwise expands to op(nr, addr).
 */
#define bitop(op, nr, addr)                                                \
        ((__builtin_constant_p(nr) &&                                        \
          __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) &&        \
          (uintptr_t)(addr) != (uintptr_t)NULL &&                        \
          __builtin_constant_p(*(const unsigned long *)(addr))) ?        \
         const##op(nr, addr) : op(nr, addr))

/*
 * The following macros are non-atomic versions of their non-underscored
 * counterparts. Each one dispatches through bitop(), so the const_ variant of
 * the named operation is selected when the arguments are compile-time
 * constants.
 */
#define __set_bit(nr, addr)                bitop(___set_bit, nr, addr)
#define __clear_bit(nr, addr)                bitop(___clear_bit, nr, addr)
#define __change_bit(nr, addr)                bitop(___change_bit, nr, addr)
#define __test_and_set_bit(nr, addr)        bitop(___test_and_set_bit, nr, addr)
#define __test_and_clear_bit(nr, addr)        bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr)        bitop(___test_and_change_bit, nr, addr)

/* Bit tests routed through the same bitop() dispatch. */
#define test_bit(nr, addr)                bitop(_test_bit, nr, addr)
#define test_bit_acquire(nr, addr)        bitop(_test_bit_acquire, nr, addr)

/*
 * Include this here because some architectures need generic_ffs/fls in
 * scope
 */
#include <asm/bitops.h>

/*
 * Check that the bitops prototypes are sane: for each operation, statically
 * assert that the arch_, const_ and _ prefixed variants all have the same
 * type as the generic_ one. The helper macro is undefined again afterwards.
 */
#define __check_bitop_pr(name)                                                \
        static_assert(__same_type(arch_##name, generic_##name) &&        \
                      __same_type(const_##name, generic_##name) &&        \
                      __same_type(_##name, generic_##name))

__check_bitop_pr(__set_bit);
__check_bitop_pr(__clear_bit);
__check_bitop_pr(__change_bit);
__check_bitop_pr(__test_and_set_bit);
__check_bitop_pr(__test_and_clear_bit);
__check_bitop_pr(__test_and_change_bit);
__check_bitop_pr(test_bit);
__check_bitop_pr(test_bit_acquire);

#undef __check_bitop_pr

/* Return fls(@count) as the bitmask order for @count. */
static inline int get_bitmask_order(unsigned int count)
{
        /* We could be slightly more clever with -1 here... */
        return fls(count);
}

/* Hamming weight of an unsigned long, dispatching on the size of the type. */
static __always_inline unsigned long hweight_long(unsigned long w)
{
        if (sizeof(w) == 4)
                return hweight32(w);

        return hweight64((__u64)w);
}

/**
 * rol64 - rotate a 64-bit value left
 * @word: value to rotate
 * @shift: number of bit positions to rotate by (only the low 6 bits are used)
 */
static inline __u64 rol64(__u64 word, unsigned int shift)
{
        const unsigned int up = shift & 63;
        const unsigned int down = (-shift) & 63;

        return (word << up) | (word >> down);
}

/**
 * ror64 - rotate a 64-bit value right
 * @word: value to rotate
 * @shift: number of bit positions to rotate by (only the low 6 bits are used)
 */
static inline __u64 ror64(__u64 word, unsigned int shift)
{
        const unsigned int down = shift & 63;
        const unsigned int up = (-shift) & 63;

        return (word >> down) | (word << up);
}

/**
 * rol32 - rotate a 32-bit value left
 * @word: value to rotate
 * @shift: number of bit positions to rotate by (only the low 5 bits are used)
 */
static inline __u32 rol32(__u32 word, unsigned int shift)
{
        const unsigned int up = shift & 31;
        const unsigned int down = (-shift) & 31;

        return (word << up) | (word >> down);
}

/**
 * ror32 - rotate a 32-bit value right
 * @word: value to rotate
 * @shift: number of bit positions to rotate by (only the low 5 bits are used)
 */
static inline __u32 ror32(__u32 word, unsigned int shift)
{
        const unsigned int down = shift & 31;
        const unsigned int up = (-shift) & 31;

        return (word >> down) | (word << up);
}

/**
 * rol16 - rotate a 16-bit value left
 * @word: value to rotate
 * @shift: number of bit positions to rotate by (only the low 4 bits are used)
 */
static inline __u16 rol16(__u16 word, unsigned int shift)
{
        const unsigned int up = shift & 15;
        const unsigned int down = (-shift) & 15;

        return (word << up) | (word >> down);
}

/**
 * ror16 - rotate a 16-bit value right
 * @word: value to rotate
 * @shift: number of bit positions to rotate by (only the low 4 bits are used)
 */
static inline __u16 ror16(__u16 word, unsigned int shift)
{
        const unsigned int down = shift & 15;
        const unsigned int up = (-shift) & 15;

        return (word >> down) | (word << up);
}

/**
 * rol8 - rotate an 8-bit value left
 * @word: value to rotate
 * @shift: number of bit positions to rotate by (only the low 3 bits are used)
 */
static inline __u8 rol8(__u8 word, unsigned int shift)
{
        const unsigned int up = shift & 7;
        const unsigned int down = (-shift) & 7;

        return (word << up) | (word >> down);
}

/**
 * ror8 - rotate an 8-bit value right
 * @word: value to rotate
 * @shift: number of bit positions to rotate by (only the low 3 bits are used)
 */
static inline __u8 ror8(__u8 word, unsigned int shift)
{
        const unsigned int down = shift & 7;
        const unsigned int up = (-shift) & 7;

        return (word >> down) | (word << up);
}

/**
 * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit
 * @value: value to sign extend
 * @index: 0 based bit index (0<=index<32) to sign bit
 *
 * The value is shifted left so that bit @index lands in bit 31, reinterpreted
 * as signed, and shifted back right by the same amount.
 *
 * This is safe to use for 16- and 8-bit types as well.
 */
static __always_inline __s32 sign_extend32(__u32 value, int index)
{
        const __u8 distance = 31 - index;
        __s32 aligned = (__s32)(value << distance);

        return aligned >> distance;
}

/**
 * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit
 * @value: value to sign extend
 * @index: 0 based bit index (0<=index<64) to sign bit
 *
 * The value is shifted left so that bit @index lands in bit 63, reinterpreted
 * as signed, and shifted back right by the same amount.
 */
static __always_inline __s64 sign_extend64(__u64 value, int index)
{
        const __u8 distance = 63 - index;
        __s64 aligned = (__s64)(value << distance);

        return aligned >> distance;
}

/* fls() for an unsigned long: uses fls() for 4-byte longs, fls64() otherwise. */
static inline unsigned int fls_long(unsigned long l)
{
        if (sizeof(l) != 4)
                return fls64(l);

        return fls(l);
}

/* Return -1 for a zero @count, otherwise fls(@count - 1). */
static inline int get_count_order(unsigned int count)
{
        if (!count)
                return -1;

        count--;
        return fls(count);
}

/**
 * get_count_order_long - get order after rounding @l up to power of 2
 * @l: parameter
 *
 * Same as get_count_order(), but taking an unsigned long parameter.
 * Returns -1 when @l is zero.
 */
static inline int get_count_order_long(unsigned long l)
{
        if (!l)
                return -1;

        l--;
        return (int)fls_long(l);
}

/**
 * parity8 - get the parity of an u8 value
 * @value: the value to be examined
 *
 * Determine the parity of the u8 argument.
 *
 * Returns:
 * 0 for even parity, 1 for odd parity
 *
 * Note: This function informs you about the current parity. Example to bail
 * out when parity is odd:
 *
 *        if (parity8(val) == 1)
 *                return -EBADMSG;
 *
 * If you need to calculate a parity bit, you need to draw the conclusion from
 * this result yourself. Example to enforce odd parity, parity bit is bit 7:
 *
 *        if (parity8(val) == 0)
 *                val ^= BIT(7);
 */
static inline int parity8(u8 val)
{
        u8 nibble;

        /*
         * XOR the high nibble into the low one, then use the resulting
         * 4-bit value to pick one bit out of the constant 0x6996.
         * One explanation of this algorithm:
         * https://funloop.org/codex/problem/parity/README.html
         */
        nibble = (val ^ (val >> 4)) & 0xf;

        return (0x6996 >> nibble) & 1;
}

/**
 * __ffs64 - find first set bit in a 64 bit word
 * @word: The 64 bit word
 *
 * On 64 bit arches this is a synonym for __ffs. On 32 bit arches the low
 * half is examined first and the high half only when the low half is zero.
 * The result is not defined if no bits are set, so check that @word
 * is non-zero before calling this.
 */
static inline __attribute_const__ unsigned int __ffs64(u64 word)
{
#if BITS_PER_LONG == 32
        const u32 low_half = (u32)word;

        if (low_half == 0UL)
                return 32 + __ffs((u32)(word >> 32));
#elif BITS_PER_LONG != 64
#error BITS_PER_LONG not 32 or 64
#endif
        return __ffs((unsigned long)word);
}

/**
 * fns - find N'th set bit in a word
 * @word: The word to search
 * @n: Bit to find
 *
 * Returns BITS_PER_LONG when @word has no such set bit.
 */
static inline unsigned int fns(unsigned long word, unsigned int n)
{
        /* Clear the lowest set bit once for each set bit to skip. */
        for (; word && n--; )
                word &= word - 1;

        if (!word)
                return BITS_PER_LONG;

        return __ffs(word);
}

/**
 * assign_bit - Assign value to a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 * @value: the value to assign
 *
 * Expands to set_bit() when @value is true and to clear_bit() otherwise.
 */
#define assign_bit(nr, addr, value)                                        \
        ((value) ? set_bit((nr), (addr)) : clear_bit((nr), (addr)))

/* Same selection as assign_bit(), using __set_bit() / __clear_bit(). */
#define __assign_bit(nr, addr, value)                                        \
        ((value) ? __set_bit((nr), (addr)) : __clear_bit((nr), (addr)))

/**
 * __ptr_set_bit - Set bit in a pointer's value
 * @nr: the bit to set
 * @addr: the address of the pointer variable
 *
 * Runs typecheck_pointer() on *@addr, then applies __set_bit() to @addr cast
 * to unsigned long *.
 *
 * Example:
 *        void *p = foo();
 *        __ptr_set_bit(bit, &p);
 */
#define __ptr_set_bit(nr, addr)                         \
        ({                                              \
                typecheck_pointer(*(addr));             \
                __set_bit(nr, (unsigned long *)(addr)); \
        })

/**
 * __ptr_clear_bit - Clear bit in a pointer's value
 * @nr: the bit to clear
 * @addr: the address of the pointer variable
 *
 * Runs typecheck_pointer() on *@addr, then applies __clear_bit() to @addr cast
 * to unsigned long *.
 *
 * Example:
 *        void *p = foo();
 *        __ptr_clear_bit(bit, &p);
 */
#define __ptr_clear_bit(nr, addr)                         \
        ({                                                \
                typecheck_pointer(*(addr));               \
                __clear_bit(nr, (unsigned long *)(addr)); \
        })

/**
 * __ptr_test_bit - Test bit in a pointer's value
 * @nr: the bit to test
 * @addr: the address of the pointer variable
 *
 * Runs typecheck_pointer() on *@addr, then evaluates to the result of
 * test_bit() on @addr cast to unsigned long *.
 *
 * Example:
 *        void *p = foo();
 *        if (__ptr_test_bit(bit, &p)) {
 *                ...
 *        } else {
 *                ...
 *        }
 */
#define __ptr_test_bit(nr, addr)                       \
        ({                                             \
                typecheck_pointer(*(addr));            \
                test_bit(nr, (unsigned long *)(addr)); \
        })

#ifdef __KERNEL__

#ifndef set_mask_bits
/*
 * set_mask_bits - replace the bits selected by @mask in *@ptr with @bits
 *
 * Reads *@ptr once, then repeatedly computes (old & ~mask) | bits and tries to
 * install it with try_cmpxchg() until that succeeds. @mask and @bits are each
 * evaluated once. The statement expression evaluates to old__, the value the
 * update was computed from (NOTE(review): this relies on try_cmpxchg()
 * refreshing old__ on failure - confirm against its definition).
 * Only defined when no earlier definition of set_mask_bits exists.
 */
#define set_mask_bits(ptr, mask, bits)        \
({                                                                \
        const typeof(*(ptr)) mask__ = (mask), bits__ = (bits);        \
        typeof(*(ptr)) old__, new__;                                \
                                                                \
        old__ = READ_ONCE(*(ptr));                                \
        do {                                                        \
                new__ = (old__ & ~mask__) | bits__;                \
        } while (!try_cmpxchg(ptr, &old__, new__));                \
                                                                \
        old__;                                                        \
})
#endif

#ifndef bit_clear_unless
/*
 * bit_clear_unless - clear the @clear bits in *@ptr unless a @test bit is set
 *
 * Reads *@ptr once, then loops: if any bit of @test is set in the observed
 * value the loop is left without writing; otherwise old & ~clear is installed
 * with try_cmpxchg(), retrying until it succeeds. @clear and @test are each
 * evaluated once. The statement expression evaluates to true when no @test bit
 * was set in the last observed value, false otherwise.
 * Only defined when no earlier definition of bit_clear_unless exists.
 */
#define bit_clear_unless(ptr, clear, test)        \
({                                                                \
        const typeof(*(ptr)) clear__ = (clear), test__ = (test);\
        typeof(*(ptr)) old__, new__;                                \
                                                                \
        old__ = READ_ONCE(*(ptr));                                \
        do {                                                        \
                if (old__ & test__)                                \
                        break;                                        \
                new__ = old__ & ~clear__;                        \
        } while (!try_cmpxchg(ptr, &old__, new__));                \
                                                                \
        !(old__ & test__);                                        \
})
#endif

#endif /* __KERNEL__ */
#endif






































































































































































































































































































































































































































































































































































































































































































































































































   32 


    5 

    1 



    4 
    3 

    1 


   28 




    2 


   32 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xfrm_replay.c - xfrm replay detection, derived from xfrm_state.c.
 *
 * Copyright (C) 2010 secunet Security Networks AG
 * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com>
 */

#include <linux/export.h>
#include <net/xfrm.h>

/*
 * Work out the high-order 32 bits of the sequence number that belongs to
 * the low-order half carried in @net_seq (network byte order).
 *
 * Returns 0 when XFRM_STATE_ESN is not set on the state.  Otherwise the
 * result is replay_esn->seq_hi, moved up or down by one when the received
 * low half lies on the other side of a 32-bit wrap relative to the lower
 * edge of the replay window.
 */
u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq)
{
        struct xfrm_replay_state_esn *esn = x->replay_esn;
        u32 lo, hi, win_bottom;

        if (!(x->props.flags & XFRM_STATE_ESN))
                return 0;

        lo = ntohl(net_seq);
        hi = esn->seq_hi;
        win_bottom = esn->seq - esn->replay_window + 1;

        if (likely(esn->seq >= esn->replay_window - 1)) {
                /* Window does not straddle a wrap: below it means next epoch. */
                if (unlikely(lo < win_bottom))
                        return hi + 1;
                return hi;
        }

        /* Window straddles a wrap: its lower part is in the previous epoch. */
        if (unlikely(lo >= win_bottom))
                return hi - 1;

        return hi;
}
EXPORT_SYMBOL(xfrm_replay_seqhi);

static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event);
static void xfrm_replay_notify_esn(struct xfrm_state *x, int event);

/*
 * Send an XFRM_MSG_NEWAE notification about the replay state of @x.
 *
 * States in bitmap or ESN mode are handed to their dedicated routines.
 * For legacy states a message is sent when
 *  1. one of the sequence numbers was updated and the difference to the
 *     last reported value is at least x->replay_maxdiff; in that case the
 *     timer is re-armed as well, or
 *  2. x->replay_maxage has elapsed since the last update and the replay
 *     state changed in the meantime.
 *
 * The state structure must be locked!
 */
void xfrm_replay_notify(struct xfrm_state *x, int event)
{
        struct km_event c;

        if (x->repl_mode == XFRM_REPLAY_MODE_BMP) {
                xfrm_replay_notify_bmp(x, event);
                return;
        }
        if (x->repl_mode == XFRM_REPLAY_MODE_ESN) {
                xfrm_replay_notify_esn(x, event);
                return;
        }

        if (event == XFRM_REPLAY_UPDATE) {
                if (!x->replay_maxdiff ||
                    (x->replay.seq - x->preplay.seq < x->replay_maxdiff &&
                     x->replay.oseq - x->preplay.oseq < x->replay_maxdiff)) {
                        /*
                         * Below the threshold: only report if an earlier
                         * timeout was deferred, and report it as a timeout.
                         */
                        if (!(x->xflags & XFRM_TIME_DEFER))
                                return;
                        event = XFRM_REPLAY_TIMEOUT;
                }
        } else if (event == XFRM_REPLAY_TIMEOUT) {
                /* Nothing changed since the last report: defer. */
                if (!memcmp(&x->replay, &x->preplay,
                            sizeof(struct xfrm_replay_state))) {
                        x->xflags |= XFRM_TIME_DEFER;
                        return;
                }
        }

        /* Remember what is being reported, then tell the key managers. */
        memcpy(&x->preplay, &x->replay, sizeof(struct xfrm_replay_state));
        c.event = XFRM_MSG_NEWAE;
        c.data.aevent = event;
        km_state_notify(x, &c);

        if (x->replay_maxage &&
            !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
                x->xflags &= ~XFRM_TIME_DEFER;
}

/*
 * Assign the next outbound sequence number of a legacy replay state to @skb.
 *
 * Does nothing unless the state's type has XFRM_TYPE_REPLAY_PROT set.
 * Returns 0 on success.  If the counter wraps to 0 and
 * XFRM_SA_XFLAG_OSEQ_MAY_WRAP is not set, the increment is undone, the
 * overflow is audited and -EOVERFLOW is returned.
 */
static int __xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb)
{
        struct net *net = xs_net(x);

        if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
                return 0;

        XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq;
        XFRM_SKB_CB(skb)->seq.output.hi = 0;

        if (unlikely(x->replay.oseq == 0) &&
            !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) {
                x->replay.oseq--;
                xfrm_audit_state_replay_overflow(x, skb);
                return -EOVERFLOW;
        }

        if (xfrm_aevent_is_on(net))
                xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);

        return 0;
}

/*
 * Replay check for legacy (single-word bitmap) states.
 *
 * Returns 0 if the packet with sequence number @net_seq (network byte
 * order) may be accepted: replay protection is off (window of 0), the
 * number is ahead of the last one seen, or it is inside the window and its
 * bit is not yet set.  Otherwise the event is audited and -EINVAL returned;
 * x->stats.replay_window counts numbers that fell out of the window and
 * x->stats.replay counts duplicates.
 */
static int xfrm_replay_check_legacy(struct xfrm_state *x,
                                    struct sk_buff *skb, __be32 net_seq)
{
        u32 seq = ntohl(net_seq);
        u32 behind;

        if (!x->props.replay_window)
                return 0;

        /* Sequence number 0 is never accepted. */
        if (unlikely(seq == 0))
                goto reject;

        if (likely(seq > x->replay.seq))
                return 0;

        behind = x->replay.seq - seq;
        if (behind >= x->props.replay_window)
                x->stats.replay_window++;
        else if (x->replay.bitmap & (1U << behind))
                x->stats.replay++;
        else
                return 0;

reject:
        xfrm_audit_state_replay(x, skb, net_seq);
        return -EINVAL;
}

static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq);
static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq);

/*
 * Record that the packet with sequence number @net_seq (network byte
 * order) has been received on state @x.
 *
 * Bitmap and ESN states are handed to their dedicated routines.  For
 * legacy states the single-word bitmap is shifted when the window moves
 * forward, or the bit of an older in-window packet is set.  Nothing is
 * done when the replay window is 0.  An XFRM_REPLAY_UPDATE notification is
 * issued when async events are enabled for the state's netns.
 */
void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq)
{
        u32 diff, seq;

        switch (x->repl_mode) {
        case XFRM_REPLAY_MODE_LEGACY:
                break;
        case XFRM_REPLAY_MODE_BMP:
                /*
                 * Call, then return: a void function must not use
                 * "return <expression>;".
                 */
                xfrm_replay_advance_bmp(x, net_seq);
                return;
        case XFRM_REPLAY_MODE_ESN:
                xfrm_replay_advance_esn(x, net_seq);
                return;
        }

        if (!x->props.replay_window)
                return;

        seq = ntohl(net_seq);
        if (seq > x->replay.seq) {
                /* New highest number: slide the window forward. */
                diff = seq - x->replay.seq;
                if (diff < x->props.replay_window)
                        x->replay.bitmap = ((x->replay.bitmap) << diff) | 1;
                else
                        x->replay.bitmap = 1;
                x->replay.seq = seq;
        } else {
                /* Older packet: just mark its slot as seen. */
                diff = x->replay.seq - seq;
                x->replay.bitmap |= (1U << diff);
        }

        if (xfrm_aevent_is_on(xs_net(x)))
                xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
}

/*
 * Assign the next outbound sequence number of a bitmap-mode state to @skb.
 *
 * Does nothing unless the state's type has XFRM_TYPE_REPLAY_PROT set.
 * Returns 0 on success.  If the 32-bit counter wraps to 0 and
 * XFRM_SA_XFLAG_OSEQ_MAY_WRAP is not set, the increment is undone, the
 * overflow is audited and -EOVERFLOW is returned.
 */
static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb)
{
        struct xfrm_replay_state_esn *esn = x->replay_esn;
        struct net *net = xs_net(x);

        if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
                return 0;

        XFRM_SKB_CB(skb)->seq.output.low = ++esn->oseq;
        XFRM_SKB_CB(skb)->seq.output.hi = 0;

        if (unlikely(esn->oseq == 0) &&
            !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) {
                esn->oseq--;
                xfrm_audit_state_replay_overflow(x, skb);
                return -EOVERFLOW;
        }

        if (xfrm_aevent_is_on(net))
                xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);

        return 0;
}

/*
 * Replay check for bitmap-mode states (multi-word bitmap, no ESN).
 *
 * Returns 0 if the packet with sequence number @net_seq (network byte
 * order) may be accepted, -EINVAL (after auditing) otherwise.
 * x->stats.replay_window counts numbers that fell out of the window and
 * x->stats.replay counts duplicates.
 */
static int xfrm_replay_check_bmp(struct xfrm_state *x,
                                 struct sk_buff *skb, __be32 net_seq)
{
        struct xfrm_replay_state_esn *esn = x->replay_esn;
        u32 seq = ntohl(net_seq);
        u32 behind = esn->seq - seq;
        u32 head;
        unsigned int slot;

        if (!esn->replay_window)
                return 0;

        /* Sequence number 0 is never accepted. */
        if (unlikely(seq == 0))
                goto reject;

        if (likely(seq > esn->seq))
                return 0;

        if (behind >= esn->replay_window) {
                x->stats.replay_window++;
                goto reject;
        }

        /* The bitmap is a ring; 'head' is the slot of the newest number. */
        head = (esn->seq - 1) % esn->replay_window;
        if (head >= behind)
                slot = (head - behind) % esn->replay_window;
        else
                slot = esn->replay_window - (behind - head);

        /* 32 slots per bitmap word. */
        if (!(esn->bmp[slot >> 5] & (1U << (slot & 0x1F))))
                return 0;

        x->stats.replay++;
reject:
        xfrm_audit_state_replay(x, skb, net_seq);
        return -EINVAL;
}

/*
 * Record reception of sequence number @net_seq (network byte order) in the
 * ring bitmap of a bitmap-mode state.
 *
 * A number ahead of the newest one moves the window: the slots that were
 * skipped are cleared (or the whole bitmap, if the jump is at least one
 * window wide) and replay_esn->seq is updated.  An older number only gets
 * its slot marked.  Nothing is done when the replay window is 0.
 */
static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq)
{
        struct xfrm_replay_state_esn *esn = x->replay_esn;
        u32 seq = ntohl(net_seq);
        u32 head, delta;
        unsigned int slot, last_word, k;

        if (!esn->replay_window)
                return;

        /* Ring slot of the newest number seen so far. */
        head = (esn->seq - 1) % esn->replay_window;

        if (seq <= esn->seq) {
                delta = esn->seq - seq;

                if (head >= delta)
                        slot = (head - delta) % esn->replay_window;
                else
                        slot = esn->replay_window - (delta - head);
        } else {
                delta = seq - esn->seq;

                if (delta >= esn->replay_window) {
                        last_word = (esn->replay_window - 1) >> 5;
                        for (k = 0; k <= last_word; k++)
                                esn->bmp[k] = 0;
                } else {
                        for (k = 1; k < delta; k++) {
                                slot = (head + k) % esn->replay_window;
                                esn->bmp[slot >> 5] &= ~(1U << (slot & 0x1F));
                        }
                }

                slot = (head + delta) % esn->replay_window;
                esn->seq = seq;
        }

        /* 32 slots per bitmap word. */
        esn->bmp[slot >> 5] |= (1U << (slot & 0x1F));

        if (xfrm_aevent_is_on(xs_net(x)))
                xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
}

/*
 * Bitmap-mode variant of the replay notification.
 *
 * An XFRM_MSG_NEWAE message is sent when
 *  1. one of the sequence numbers was updated and the difference to the
 *     last reported value is at least x->replay_maxdiff; in that case the
 *     timer is re-armed as well, or
 *  2. x->replay_maxage has elapsed since the last update and the replay
 *     state changed in the meantime.
 *
 * The state structure must be locked!
 */
static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event)
{
        struct xfrm_replay_state_esn *cur = x->replay_esn;
        struct xfrm_replay_state_esn *prev = x->preplay_esn;
        struct km_event c;

        if (event == XFRM_REPLAY_UPDATE) {
                if (!x->replay_maxdiff ||
                    (cur->seq - prev->seq < x->replay_maxdiff &&
                     cur->oseq - prev->oseq < x->replay_maxdiff)) {
                        /*
                         * Below the threshold: only report if an earlier
                         * timeout was deferred, and report it as a timeout.
                         */
                        if (!(x->xflags & XFRM_TIME_DEFER))
                                return;
                        event = XFRM_REPLAY_TIMEOUT;
                }
        } else if (event == XFRM_REPLAY_TIMEOUT) {
                /* Nothing changed since the last report: defer. */
                if (!memcmp(cur, prev, xfrm_replay_state_esn_len(cur))) {
                        x->xflags |= XFRM_TIME_DEFER;
                        return;
                }
        }

        /* Remember what is being reported, then tell the key managers. */
        memcpy(prev, cur, xfrm_replay_state_esn_len(cur));
        c.event = XFRM_MSG_NEWAE;
        c.data.aevent = event;
        km_state_notify(x, &c);

        if (x->replay_maxage &&
            !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
                x->xflags &= ~XFRM_TIME_DEFER;
}

static void xfrm_replay_notify_esn(struct xfrm_state *x, int event)
{
        u32 seq_diff, oseq_diff;
        struct km_event c;
        struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
        struct xfrm_replay_state_esn *preplay_esn = x->preplay_esn;

        /* we send notify messages in case
         *  1. we updated on of the sequence numbers, and the seqno difference
         *     is at least x->replay_maxdiff, in this case we also update the
         *     timeout of our timer function
         *  2. if x->replay_maxage has elapsed since last update,
         *     and there were changes
         *
         *  The state structure must be locked!
         */

        switch (event) {
        case XFRM_REPLAY_UPDATE:
                if (x->replay_maxdiff) {
                        if (replay_esn->seq_hi == preplay_esn->seq_hi)
                                seq_diff = replay_esn->seq - preplay_esn->seq;
                        else
                                seq_diff = ~preplay_esn->seq + replay_esn->seq
                                           + 1;

                        if (replay_esn->oseq_hi == preplay_esn->oseq_hi)
                                oseq_diff = replay_esn->oseq
                                            - preplay_esn->oseq;
                        else
                                oseq_diff = ~preplay_esn->oseq
                                            + replay_esn->oseq + 1;

                        if (seq_diff >= x->replay_maxdiff ||
                            oseq_diff >= x->replay_maxdiff)
                                break;
                }

                if (x->xflags & XFRM_TIME_DEFER)
                        event = XFRM_REPLAY_TIMEOUT;
                else
                        return;

                break;

        case XFRM_REPLAY_TIMEOUT:
                if (memcmp(x->replay_esn, x->preplay_esn,
                           xfrm_replay_state_esn_len(replay_esn)) == 0) {
                        x->xflags |= XFRM_TIME_DEFER;
                        return;
                }

                break;
        }

        memcpy(x->preplay_esn, x->replay_esn,
               xfrm_replay_state_esn_len(replay_esn));
        c.event = XFRM_MSG_NEWAE;
        c.data.aevent = event;
        km_state_notify(x, &c);

        if (x->replay_maxage &&
            !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
                x->xflags &= ~XFRM_TIME_DEFER;
}

/*
 * Assign the next 64-bit (ESN) outbound sequence number to @skb.
 *
 * Does nothing unless the state's type has XFRM_TYPE_REPLAY_PROT set.
 * When the low half wraps to 0 the high half is incremented; if that wraps
 * as well, both increments are undone, the overflow is audited and
 * -EOVERFLOW is returned.  Returns 0 otherwise.
 */
static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb)
{
        struct xfrm_replay_state_esn *esn = x->replay_esn;
        struct net *net = xs_net(x);

        if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
                return 0;

        XFRM_SKB_CB(skb)->seq.output.low = ++esn->oseq;
        XFRM_SKB_CB(skb)->seq.output.hi = esn->oseq_hi;

        if (unlikely(esn->oseq == 0)) {
                /* Low half wrapped: carry into the high half. */
                XFRM_SKB_CB(skb)->seq.output.hi = ++esn->oseq_hi;

                if (esn->oseq_hi == 0) {
                        esn->oseq--;
                        esn->oseq_hi--;
                        xfrm_audit_state_replay_overflow(x, skb);
                        return -EOVERFLOW;
                }
        }

        if (xfrm_aevent_is_on(net))
                xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);

        return 0;
}

/*
 * Replay check for ESN states.
 *
 * @net_seq holds the low-order 32 bits of the received sequence number in
 * network byte order.  Returns 0 if the packet may be accepted, -EINVAL
 * (after auditing) otherwise.  x->stats.replay_window counts numbers that
 * fell out of the window and x->stats.replay counts duplicates.
 */
static int xfrm_replay_check_esn(struct xfrm_state *x,
                                 struct sk_buff *skb, __be32 net_seq)
{
        unsigned int bitnr, nr;
        u32 diff;
        struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
        u32 pos;
        u32 seq = ntohl(net_seq);
        u32 wsize = replay_esn->replay_window;
        /* top/bottom: low halves of the newest and oldest number in the window */
        u32 top = replay_esn->seq;
        u32 bottom = top - wsize + 1;

        if (!wsize)
                return 0;

        /*
         * Low half 0 is rejected while the high half is still 0 and fewer
         * than a full window of numbers has been seen.
         */
        if (unlikely(seq == 0 && replay_esn->seq_hi == 0 &&
                     (replay_esn->seq < replay_esn->replay_window - 1)))
                goto err;

        diff = top - seq;

        if (likely(top >= wsize - 1)) {
                /* A. same subspace */
                /* Ahead of the window, or below it (taken as the next epoch). */
                if (likely(seq > top) || seq < bottom)
                        return 0;
        } else {
                /* B. window spans two subspaces */
                if (likely(seq > top && seq < bottom))
                        return 0;
                /*
                 * NOTE(review): with u32 being a 32-bit unsigned type this
                 * evaluates to the same value as top - seq above.
                 */
                if (seq >= bottom)
                        diff = ~seq + top + 1;
        }

        if (diff >= replay_esn->replay_window) {
                x->stats.replay_window++;
                goto err;
        }

        /* The bitmap is a ring; pos is the slot of the newest number. */
        pos = (replay_esn->seq - 1) % replay_esn->replay_window;

        if (pos >= diff)
                bitnr = (pos - diff) % replay_esn->replay_window;
        else
                bitnr = replay_esn->replay_window - (diff - pos);

        /* 32 slots per bitmap word: word index, then bit within the word. */
        nr = bitnr >> 5;
        bitnr = bitnr & 0x1F;
        if (replay_esn->bmp[nr] & (1U << bitnr))
                goto err_replay;

        return 0;

err_replay:
        x->stats.replay++;
err:
        xfrm_audit_state_replay(x, skb, net_seq);
        return -EINVAL;
}

/*
 * Check whether the packet with sequence number @net_seq (network byte
 * order) may be accepted on state @x, using the check routine that matches
 * the state's replay mode.  Returns whatever that routine returns.
 */
int xfrm_replay_check(struct xfrm_state *x,
                      struct sk_buff *skb, __be32 net_seq)
{
        if (x->repl_mode == XFRM_REPLAY_MODE_BMP)
                return xfrm_replay_check_bmp(x, skb, net_seq);

        if (x->repl_mode == XFRM_REPLAY_MODE_ESN)
                return xfrm_replay_check_esn(x, skb, net_seq);

        /* XFRM_REPLAY_MODE_LEGACY and anything else. */
        return xfrm_replay_check_legacy(x, skb, net_seq);
}

/*
 * Repeat the ESN replay check for a packet.
 *
 * The high sequence half stored in the skb control block is compared with
 * the one xfrm_replay_seqhi() yields now; on a mismatch
 * x->stats.replay_window is bumped and -EINVAL returned.  Otherwise the
 * regular ESN check decides.
 */
static int xfrm_replay_recheck_esn(struct xfrm_state *x,
                                   struct sk_buff *skb, __be32 net_seq)
{
        __be32 seqhi_now = htonl(xfrm_replay_seqhi(x, net_seq));

        if (unlikely(XFRM_SKB_CB(skb)->seq.input.hi != seqhi_now)) {
                x->stats.replay_window++;
                return -EINVAL;
        }

        return xfrm_replay_check_esn(x, skb, net_seq);
}

/*
 * Repeat the replay check for a packet on state @x.  Only ESN mode has a
 * dedicated recheck routine; bitmap and legacy modes simply run their
 * normal check again.
 */
int xfrm_replay_recheck(struct xfrm_state *x,
                        struct sk_buff *skb, __be32 net_seq)
{
        if (x->repl_mode == XFRM_REPLAY_MODE_ESN)
                return xfrm_replay_recheck_esn(x, skb, net_seq);

        /* no special recheck treatment */
        if (x->repl_mode == XFRM_REPLAY_MODE_BMP)
                return xfrm_replay_check_bmp(x, skb, net_seq);

        /* XFRM_REPLAY_MODE_LEGACY and anything else. */
        return xfrm_replay_check_legacy(x, skb, net_seq);
}

/*
 * Record reception of the packet whose low sequence half is @net_seq
 * (network byte order) in the ring bitmap of an ESN state.
 *
 * A number ahead of the newest one moves the window forward, clearing the
 * slots that were skipped and bumping seq_hi when the low half wrapped.
 * An older number only gets its slot marked.  Nothing is done when the
 * replay window is 0.
 */
static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
{
        unsigned int bitnr, nr, i;
        int wrap;
        u32 diff, pos, seq, seq_hi;
        struct xfrm_replay_state_esn *replay_esn = x->replay_esn;

        if (!replay_esn->replay_window)
                return;

        seq = ntohl(net_seq);
        /* Ring slot of the newest number seen so far. */
        pos = (replay_esn->seq - 1) % replay_esn->replay_window;
        seq_hi = xfrm_replay_seqhi(x, net_seq);
        /* Difference between the packet's high half and the stored one. */
        wrap = seq_hi - replay_esn->seq_hi;

        if ((!wrap && seq > replay_esn->seq) || wrap > 0) {
                /* Packet is ahead of the window top: move the window. */
                if (likely(!wrap))
                        diff = seq - replay_esn->seq;
                else
                        diff = ~replay_esn->seq + seq + 1;

                if (diff < replay_esn->replay_window) {
                        /* Clear the slots of the numbers that were skipped. */
                        for (i = 1; i < diff; i++) {
                                bitnr = (pos + i) % replay_esn->replay_window;
                                nr = bitnr >> 5;
                                bitnr = bitnr & 0x1F;
                                replay_esn->bmp[nr] &=  ~(1U << bitnr);
                        }
                } else {
                        /* Jumped by a full window or more: wipe the bitmap. */
                        nr = (replay_esn->replay_window - 1) >> 5;
                        for (i = 0; i <= nr; i++)
                                replay_esn->bmp[i] = 0;
                }

                bitnr = (pos + diff) % replay_esn->replay_window;
                replay_esn->seq = seq;

                if (unlikely(wrap > 0))
                        replay_esn->seq_hi++;
        } else {
                /* Older packet: locate its slot behind the window top. */
                diff = replay_esn->seq - seq;

                if (pos >= diff)
                        bitnr = (pos - diff) % replay_esn->replay_window;
                else
                        bitnr = replay_esn->replay_window - (diff - pos);
        }

        /*
         * Called after seq/seq_hi were updated but before the packet's bit
         * is set.  NOTE(review): presumably propagates the ESN state to an
         * offloading device; the helper is defined elsewhere - confirm.
         */
        xfrm_dev_state_advance_esn(x);

        /* 32 slots per bitmap word: word index, then bit within the word. */
        nr = bitnr >> 5;
        bitnr = bitnr & 0x1F;
        replay_esn->bmp[nr] |= (1U << bitnr);

        if (xfrm_aevent_is_on(xs_net(x)))
                xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
}

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Offload-aware sequence number assignment for legacy states.
 *
 * Without offload data on @skb this falls back to
 * __xfrm_replay_overflow().  Otherwise the first sequence number is written
 * to both the skb control block and the offload data, and the counter is
 * advanced by one, or by gso_segs for a GSO skb.  If the counter wrapped
 * and XFRM_SA_XFLAG_OSEQ_MAY_WRAP is not set, the overflow is audited and
 * -EOVERFLOW returned without touching the stored counter.
 */
static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *skb)
{
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct net *net = xs_net(x);
        __u32 next = x->replay.oseq;
        __u32 first;

        if (!xo)
                return __xfrm_replay_overflow(x, skb);

        if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
                return 0;

        first = next + 1;
        XFRM_SKB_CB(skb)->seq.output.low = first;
        xo->seq.low = first;

        /* A GSO skb consumes one sequence number per segment. */
        if (skb_is_gso(skb))
                next += skb_shinfo(skb)->gso_segs;
        else
                next = first;

        XFRM_SKB_CB(skb)->seq.output.hi = 0;
        xo->seq.hi = 0;

        if (unlikely(next < x->replay.oseq) &&
            !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) {
                xfrm_audit_state_replay_overflow(x, skb);
                return -EOVERFLOW;
        }

        x->replay.oseq = next;

        if (xfrm_aevent_is_on(net))
                xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);

        return 0;
}

/*
 * Offload-aware sequence number assignment for bitmap-mode states.
 *
 * Without offload data on @skb this falls back to
 * xfrm_replay_overflow_bmp().  Otherwise the first sequence number is
 * written to both the skb control block and the offload data, and the
 * counter is advanced by one, or by gso_segs for a GSO skb.  If the counter
 * wrapped and XFRM_SA_XFLAG_OSEQ_MAY_WRAP is not set, the overflow is
 * audited and -EOVERFLOW returned without touching the stored counter.
 */
static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff *skb)
{
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct xfrm_replay_state_esn *esn = x->replay_esn;
        struct net *net = xs_net(x);
        __u32 next = esn->oseq;
        __u32 first;

        if (!xo)
                return xfrm_replay_overflow_bmp(x, skb);

        if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
                return 0;

        first = next + 1;
        XFRM_SKB_CB(skb)->seq.output.low = first;
        xo->seq.low = first;

        /* A GSO skb consumes one sequence number per segment. */
        if (skb_is_gso(skb))
                next += skb_shinfo(skb)->gso_segs;
        else
                next = first;

        XFRM_SKB_CB(skb)->seq.output.hi = 0;
        xo->seq.hi = 0;

        if (unlikely(next < esn->oseq) &&
            !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) {
                xfrm_audit_state_replay_overflow(x, skb);
                return -EOVERFLOW;
        }

        esn->oseq = next;

        if (xfrm_aevent_is_on(net))
                xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);

        return 0;
}

/*
 * Offload-aware sequence number assignment for ESN states.
 *
 * Without offload data on @skb this falls back to
 * xfrm_replay_overflow_esn().  Otherwise the first 64-bit sequence number
 * is written to both the skb control block and the offload data, and the
 * low counter is advanced by one, or by gso_segs for a GSO skb.  A wrap of
 * the low half carries into the high half.
 *
 * Returns 0 on success, or -EOVERFLOW (after auditing) when the high half
 * would wrap as well; in that case the stored counters are left at the
 * values they had on entry.
 */
static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff *skb)
{
        int err = 0;
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
        struct net *net = xs_net(x);
        /* Work on local copies; the state is only updated once all is well. */
        __u32 oseq = replay_esn->oseq;
        __u32 oseq_hi = replay_esn->oseq_hi;

        if (!xo)
                return xfrm_replay_overflow_esn(x, skb);

        if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
                if (!skb_is_gso(skb)) {
                        XFRM_SKB_CB(skb)->seq.output.low = ++oseq;
                        XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi;
                        xo->seq.low = oseq;
                        xo->seq.hi = oseq_hi;
                } else {
                        /* A GSO skb consumes one number per segment. */
                        XFRM_SKB_CB(skb)->seq.output.low = oseq + 1;
                        XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi;
                        xo->seq.low = oseq + 1;
                        xo->seq.hi = oseq_hi;
                        oseq += skb_shinfo(skb)->gso_segs;
                }

                if (unlikely(oseq < replay_esn->oseq)) {
                        /* Low half wrapped: carry into the high half. */
                        replay_esn->oseq_hi = ++oseq_hi;
                        /*
                         * The first number of this skb only belongs to the
                         * new epoch if it is itself past the wrap.
                         */
                        if (xo->seq.low < replay_esn->oseq) {
                                XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi;
                                xo->seq.hi = oseq_hi;
                        }
                        if (replay_esn->oseq_hi == 0) {
                                /*
                                 * 64-bit overflow.  Only oseq_hi has been
                                 * written so far; replay_esn->oseq still
                                 * holds its entry value and must not be
                                 * decremented, otherwise a later packet
                                 * could slip past the wrap test and reuse a
                                 * sequence number.
                                 */
                                replay_esn->oseq_hi--;
                                xfrm_audit_state_replay_overflow(x, skb);
                                err = -EOVERFLOW;

                                return err;
                        }
                }

                replay_esn->oseq = oseq;
                xfrm_dev_state_advance_esn(x);

                if (xfrm_aevent_is_on(net))
                        xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
        }

        return err;
}

/*
 * Assign the next outbound sequence number to @skb, using the
 * offload-aware routine that matches the replay mode of @x.  Returns
 * whatever that routine returns.
 */
int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb)
{
        if (x->repl_mode == XFRM_REPLAY_MODE_BMP)
                return xfrm_replay_overflow_offload_bmp(x, skb);

        if (x->repl_mode == XFRM_REPLAY_MODE_ESN)
                return xfrm_replay_overflow_offload_esn(x, skb);

        /* XFRM_REPLAY_MODE_LEGACY and anything else. */
        return xfrm_replay_overflow_offload(x, skb);
}
#else
/*
 * Assign the next outbound sequence number to @skb, using the routine that
 * matches the replay mode of @x (build without CONFIG_XFRM_OFFLOAD).
 * Returns whatever that routine returns.
 */
int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb)
{
        if (x->repl_mode == XFRM_REPLAY_MODE_BMP)
                return xfrm_replay_overflow_bmp(x, skb);

        if (x->repl_mode == XFRM_REPLAY_MODE_ESN)
                return xfrm_replay_overflow_esn(x, skb);

        /* XFRM_REPLAY_MODE_LEGACY and anything else. */
        return __xfrm_replay_overflow(x, skb);
}
#endif

/*
 * Select the replay mode of state @x.
 *
 * Without x->replay_esn the state uses XFRM_REPLAY_MODE_LEGACY.  With it,
 * the mode is XFRM_REPLAY_MODE_ESN if XFRM_STATE_ESN is set and
 * XFRM_REPLAY_MODE_BMP otherwise.
 *
 * Returns 0 on success.  Returns -EINVAL with a message in @extack when
 * the replay window does not fit into the bitmap, or when an ESN state
 * whose direction is unset or XFRM_SA_DIR_IN has a replay window of 0.
 */
int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        struct xfrm_replay_state_esn *esn = x->replay_esn;

        if (!esn) {
                x->repl_mode = XFRM_REPLAY_MODE_LEGACY;
                return 0;
        }

        /* bmp_len counts 32-bit words; the window is counted in bits. */
        if (esn->replay_window > esn->bmp_len * sizeof(__u32) * 8) {
                NL_SET_ERR_MSG(extack, "ESN replay window is too large for the chosen bitmap size");
                return -EINVAL;
        }

        if (!(x->props.flags & XFRM_STATE_ESN)) {
                x->repl_mode = XFRM_REPLAY_MODE_BMP;
                return 0;
        }

        if (esn->replay_window == 0 &&
            (!x->dir || x->dir == XFRM_SA_DIR_IN)) {
                NL_SET_ERR_MSG(extack, "ESN replay window must be > 0");
                return -EINVAL;
        }

        x->repl_mode = XFRM_REPLAY_MODE_ESN;
        return 0;
}
EXPORT_SYMBOL(xfrm_init_replay);































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_NF_TABLES_H
#define _NET_NF_TABLES_H

#include <linux/unaligned.h>
#include <linux/list.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/u64_stats_sync.h>
#include <linux/rhashtable.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netlink.h>
#include <net/flow_offload.h>
#include <net/netns/generic.h>

/* One more than NF_INET_INGRESS, i.e. enough slots to index every hook up to it. */
#define NFT_MAX_HOOKS        (NF_INET_INGRESS + 1)

struct module;

/* NOTE(review): presumably the maximum depth of the chain jump stack; the
 * user of this constant is outside this header chunk -- confirm.
 */
#define NFT_JUMP_STACK_SIZE        16

/* Single-bit flag values.
 * NOTE(review): presumably stored in nft_pktinfo::flags -- confirm against
 * the code that sets them.
 */
enum {
        NFT_PKTINFO_L4PROTO        = (1 << 0),
        NFT_PKTINFO_INNER        = (1 << 1),
        NFT_PKTINFO_INNER_FULL        = (1 << 2),
};

/*
 * Per-packet evaluation context.
 *
 * @skb and @state are filled in by nft_set_pktinfo(); the nft_sk(), nft_net(),
 * nft_hook(), nft_pf(), nft_in() and nft_out() accessors read through @state.
 * @flags, @tprot, @thoff and @fragoff are cleared by nft_set_pktinfo_unspec().
 *
 * NOTE(review): presumably @flags holds NFT_PKTINFO_* bits, @tprot the
 * transport protocol, and @fragoff/@thoff/@inneroff the fragment, transport
 * header and inner header offsets -- confirm against the code that fills them.
 */
struct nft_pktinfo {
        struct sk_buff                        *skb;
        const struct nf_hook_state        *state;
        u8                                flags;
        u8                                tprot;
        u16                                fragoff;
        u16                                thoff;
        u16                                inneroff;
};

static inline struct sock *nft_sk(const struct nft_pktinfo *pkt)
{
        return pkt->state->sk;
}

/* Return the thoff member of @pkt, widened to unsigned int. */
static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt)
{
        unsigned int offset = pkt->thoff;

        return offset;
}

static inline struct net *nft_net(const struct nft_pktinfo *pkt)
{
        return pkt->state->net;
}

static inline unsigned int nft_hook(const struct nft_pktinfo *pkt)
{
        return pkt->state->hook;
}

static inline u8 nft_pf(const struct nft_pktinfo *pkt)
{
        return pkt->state->pf;
}

static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt)
{
        return pkt->state->in;
}

static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt)
{
        return pkt->state->out;
}

/*
 * Attach @skb and @state to @pkt. No other member of @pkt is written here.
 */
static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
                                   struct sk_buff *skb,
                                   const struct nf_hook_state *state)
{
        /* The two stores are independent of each other. */
        pkt->state = state;
        pkt->skb = skb;
}

/*
 * Zero the flags, tprot, fragoff and thoff members of @pkt.
 * The skb, state and inneroff members are left untouched.
 */
static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt)
{
        pkt->flags   = 0;
        pkt->tprot   = 0;
        pkt->fragoff = 0;
        pkt->thoff   = 0;
}

/**
 *        struct nft_verdict - nf_tables verdict
 *
 *        @code: nf_tables/netfilter verdict code
 *        @chain: destination chain for NFT_JUMP/NFT_GOTO
 */
struct nft_verdict {
        u32                                code;
        struct nft_chain                *chain;
};

/*
 * Generic data container: either four 32-bit words of raw data or a verdict,
 * overlaid in an anonymous union. The type is aligned to at least the
 * alignment of u64.
 */
struct nft_data {
        union {
                u32                        data[4];
                struct nft_verdict        verdict;
        };
} __attribute__((aligned(__alignof__(u64))));

/* Number of 32-bit data registers in struct nft_regs. */
#define NFT_REG32_NUM                20

/**
 *        struct nft_regs - nf_tables register set
 *
 *        @data: data registers (NFT_REG32_NUM 32-bit words)
 *        @verdict: verdict register
 *
 *        @data and @verdict share storage through an anonymous union.
 *        The first four data registers alias to the verdict register.
 */
struct nft_regs {
        union {
                u32                        data[NFT_REG32_NUM];
                struct nft_verdict        verdict;
        };
};

/*
 * Register tracking state with one entry per 32-bit register
 * (NFT_REG32_NUM entries). A pointer to it is taken by
 * nft_expr_reduce_bitwise(), declared later in this header.
 *
 * NOTE(review): presumably @selector/@bitwise record the expressions that
 * last wrote a register and @cur/@last bound the expressions being walked --
 * confirm against the users, which are outside this header chunk.
 */
struct nft_regs_track {
        struct {
                const struct nft_expr                *selector;
                const struct nft_expr                *bitwise;
                u8                                num_reg;
        } regs[NFT_REG32_NUM];

        const struct nft_expr                        *cur;
        const struct nft_expr                        *last;
};

/* Store/load a u8, u16 or u64 integer to/from the u32 data registers.
 *
 * Note: when using concatenations, register allocation happens at 32-bit
 * granularity. The u8 and u16 store helpers therefore zero the whole 32-bit
 * register before writing the value, so that the unused bytes never contain
 * garbage.
 */

/*
 * Store the 8-bit @val in the first byte of the 32-bit register @dreg.
 * The whole register is zeroed first so the remaining bytes are zero.
 */
static inline void nft_reg_store8(u32 *dreg, u8 val)
{
        u8 *first_byte = (u8 *)dreg;

        *dreg = 0;
        *first_byte = val;
}

/* Return the first byte of the 32-bit register @sreg. */
static inline u8 nft_reg_load8(const u32 *sreg)
{
        const u8 *first_byte = (const u8 *)sreg;

        return *first_byte;
}

/*
 * Store the 16-bit @val in the first two bytes of the 32-bit register @dreg.
 * The whole register is zeroed first so the remaining bytes are zero.
 */
static inline void nft_reg_store16(u32 *dreg, u16 val)
{
        u16 *first_half = (u16 *)dreg;

        *dreg = 0;
        *first_half = val;
}

/*
 * Big-endian-typed variant of nft_reg_store16(): the value is stored as-is,
 * only its sparse type annotation is dropped with a __force cast.
 */
static inline void nft_reg_store_be16(u32 *dreg, __be16 val)
{
        const __u16 raw = (__force __u16)val;

        nft_reg_store16(dreg, raw);
}

/* Return the first two bytes of the 32-bit register @sreg as a u16. */
static inline u16 nft_reg_load16(const u32 *sreg)
{
        const u16 *first_half = (const u16 *)sreg;

        return *first_half;
}

static inline __be16 nft_reg_load_be16(const u32 *sreg)
{
        return (__force __be16)nft_reg_load16(sreg);
}

static inline __be32 nft_reg_load_be32(const u32 *sreg)
{
        return *(__force __be32 *)sreg;
}

/*
 * Store the 64-bit @val at @dreg using put_unaligned(), so @dreg does not
 * need to be 64-bit aligned.
 */
static inline void nft_reg_store64(u64 *dreg, u64 val)
{
        u64 *dst = dreg;

        put_unaligned(val, dst);
}

/*
 * Load a 64-bit value starting at the 32-bit register @sreg using
 * get_unaligned(), so @sreg does not need to be 64-bit aligned.
 */
static inline u64 nft_reg_load64(const u32 *sreg)
{
        u64 val = get_unaligned((u64 *)sreg);

        return val;
}

/*
 * Copy @len bytes from @src into the register area @dst.
 *
 * When @len is not a multiple of NFT_REG32_SIZE, the last, partially
 * covered 32-bit word of @dst is zeroed before the copy so that its
 * trailing bytes end up as zero.
 */
static inline void nft_data_copy(u32 *dst, const struct nft_data *src,
                                 unsigned int len)
{
        const unsigned int partial = len % NFT_REG32_SIZE;

        if (partial != 0) {
                const unsigned int last_word = len / NFT_REG32_SIZE;

                dst[last_word] = 0;
        }

        memcpy(dst, src, len);
}

/**
 *        struct nft_ctx - nf_tables rule/set context
 *
 *        @net: net namespace
 *        @table: the table the chain is contained in
 *        @chain: the chain the rule is contained in
 *        @nla: netlink attributes
 *        @portid: netlink portID of the original message
 *        @seq: netlink sequence number
 *        @flags: modifiers to new request
 *        @family: protocol family
 *        @level: depth of the chains
 *        @report: notify via unicast netlink message
 *        @reg_inited: bitmap of initialised registers, one bit per 32-bit
 *                     register (NFT_REG32_NUM bits)
 */
struct nft_ctx {
        struct net                        *net;
        struct nft_table                *table;
        struct nft_chain                *chain;
        const struct nlattr * const         *nla;
        u32                                portid;
        u32                                seq;
        u16                                flags;
        u8                                family;
        u8                                level;
        bool                                report;
        DECLARE_BITMAP(reg_inited, NFT_REG32_NUM);
};

/* Bit flags.
 * NOTE(review): presumably for nft_data_desc::flags, with
 * NFT_DATA_DESC_SETELEM marking data that belongs to a set element --
 * confirm against nft_data_init().
 */
enum nft_data_desc_flags {
        NFT_DATA_DESC_SETELEM        = (1 << 0),
};

/*
 * Description of a piece of nft_data, passed by pointer to nft_data_init().
 *
 * NOTE(review): presumably @type is the data type, @size the capacity of the
 * destination buffer, @len the length actually used and @flags a mask of
 * enum nft_data_desc_flags -- confirm against nft_data_init().
 */
struct nft_data_desc {
        enum nft_data_types                type;
        unsigned int                        size;
        unsigned int                        len;
        unsigned int                        flags;
};

/*
 * Helpers operating on struct nft_data. Only the prototypes are visible in
 * this header; the definitions live elsewhere.
 * NOTE(review): going by the names, these initialise data from a netlink
 * attribute, take/drop references held by the data, and dump it into an
 * skb -- confirm against the definitions.
 */
int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
                  struct nft_data_desc *desc, const struct nlattr *nla);
void nft_data_hold(const struct nft_data *data, enum nft_data_types type);
void nft_data_release(const struct nft_data *data, enum nft_data_types type);
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
                  enum nft_data_types type, unsigned int len);

/*
 * Map a destination register to the type of data it holds: the verdict
 * register holds NFT_DATA_VERDICT, every other register NFT_DATA_VALUE.
 */
static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
{
        if (reg == NFT_REG_VERDICT)
                return NFT_DATA_VERDICT;

        return NFT_DATA_VALUE;
}

/*
 * Map a data type to a register: NFT_DATA_VERDICT maps to NFT_REG_VERDICT,
 * anything else to NFT_REG_1 scaled by NFT_REG_SIZE / NFT_REG32_SIZE.
 */
static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
{
        if (type == NFT_DATA_VERDICT)
                return NFT_REG_VERDICT;

        return NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE;
}

/*
 * Attribute/register parsing and dumping helpers. Only the prototypes are
 * visible in this header; the definitions live elsewhere.
 * NOTE(review): presumably nft_parse_u32_check() bounds the parsed value by
 * @max, and the register load/store parsers validate the register named by
 * @attr before writing it to @sreg/@dreg -- confirm against the definitions.
 */
int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest);
int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg);

int nft_parse_register_load(const struct nft_ctx *ctx,
                            const struct nlattr *attr, u8 *sreg, u32 len);
int nft_parse_register_store(const struct nft_ctx *ctx,
                             const struct nlattr *attr, u8 *dreg,
                             const struct nft_data *data,
                             enum nft_data_types type, unsigned int len);

/**
 *        struct nft_userdata - user defined data associated with an object
 *
 *        @len: length of the data, stored minus one (see below)
 *        @data: content (flexible array member)
 *
 *        The presence of user data is indicated in an object specific fashion,
 *        so a length of zero can't occur and the value "len" indicates data
 *        of length len + 1.
 */
struct nft_userdata {
        u8                        len;
        unsigned char                data[];
};

/* Placeholder structure for the opaque set element backend representation.
 * It deliberately has no members; this header only passes pointers to it
 * around (see nft_elem_priv_cast()).
 */
struct nft_elem_priv { };

/**
 *        struct nft_set_elem - generic representation of set elements
 *
 *        @key: element key
 *        @key_end: closing element key
 *        @data: element data
 *        @priv: element private data and extensions
 *
 *        @key, @key_end and @data are each a union of a raw u32 buffer of
 *        NFT_DATA_VALUE_MAXLEN bytes and a struct nft_data view of the same
 *        storage.
 */
struct nft_set_elem {
        union {
                u32                buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
                struct nft_data        val;
        } key;
        union {
                u32                buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
                struct nft_data        val;
        } key_end;
        union {
                u32                buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
                struct nft_data val;
        } data;
        struct nft_elem_priv        *priv;
};

/*
 * Convert an opaque element handle into an untyped pointer, dropping the
 * const qualifier, so callers can assign it to their own element type.
 */
static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
{
        void *untyped = (void *)priv;

        return untyped;
}


/**
 * enum nft_iter_type - nftables set iterator type
 *
 * @NFT_ITER_UNSPEC: unspecified, to catch errors (this is the zero value, so
 *                   a zero-initialised iterator has this type)
 * @NFT_ITER_READ: read-only iteration over set elements
 * @NFT_ITER_UPDATE: iteration under mutex to update set element state
 */
enum nft_iter_type {
        NFT_ITER_UNSPEC,
        NFT_ITER_READ,
        NFT_ITER_UPDATE,
};

struct nft_set;
/*
 * Set iteration state, handed to the ->walk() set operation.
 *
 * @type is an enum nft_iter_type packed into 8 bits. @fn is a per-element
 * callback taking the context, the set, this iterator and the element.
 *
 * NOTE(review): presumably @genmask selects the generation to iterate,
 * @skip/@count implement resumable iteration and @err carries the first
 * error returned by @fn -- confirm against the ->walk() implementations.
 */
struct nft_set_iter {
        u8                genmask;
        enum nft_iter_type type:8;
        unsigned int        count;
        unsigned int        skip;
        int                err;
        int                (*fn)(const struct nft_ctx *ctx,
                              struct nft_set *set,
                              const struct nft_set_iter *iter,
                              struct nft_elem_priv *elem_priv);
};

/**
 *        struct nft_set_desc - description of set elements
 *
 *        @ktype: key type
 *        @klen: key length
 *        @dtype: data type
 *        @dlen: data length
 *        @objtype: object type
 *        @size: number of set elements
 *        @policy: set policy
 *        @gc_int: garbage collector interval
 *        @timeout: element timeout
 *        @field_len: length of each field in concatenation, bytes
 *        @field_count: number of concatenated fields in element
 *        @expr: set must support expressions
 */
struct nft_set_desc {
        u32                        ktype;
        unsigned int                klen;
        u32                        dtype;
        unsigned int                dlen;
        u32                        objtype;
        unsigned int                size;
        u32                        policy;
        u32                        gc_int;
        u64                        timeout;
        u8                        field_len[NFT_REG32_COUNT];
        u8                        field_count;
        bool                        expr;
};

/**
 *        enum nft_set_class - performance class
 *
 *        @NFT_SET_CLASS_O_1: constant, O(1)
 *        @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N)
 *        @NFT_SET_CLASS_O_N: linear, O(N)
 *
 *        Used for both the lookup and the space members of
 *        struct nft_set_estimate.
 */
enum nft_set_class {
        NFT_SET_CLASS_O_1,
        NFT_SET_CLASS_O_LOG_N,
        NFT_SET_CLASS_O_N,
};

/**
 *        struct nft_set_estimate - estimation of memory and performance
 *                                  characteristics
 *
 *        @size: required memory
 *        @lookup: lookup performance class
 *        @space: memory class
 *
 *        Filled in through the @est argument of the ->estimate() set
 *        operation.
 */
struct nft_set_estimate {
        u64                        size;
        enum nft_set_class        lookup;
        enum nft_set_class        space;
};

/* NOTE(review): presumably the maximum number of netlink attributes per
 * expression; the user is outside this header chunk -- confirm.
 */
#define NFT_EXPR_MAXATTR                16
/* Total size of an expression with @size bytes of private data: the
 * struct nft_expr header plus @size rounded up to the alignment of
 * struct nft_expr.
 */
#define NFT_EXPR_SIZE(size)                (sizeof(struct nft_expr) + \
                                         ALIGN(size, __alignof__(struct nft_expr)))

/**
 *        struct nft_expr - nf_tables expression
 *
 *        @ops: expression ops
 *        @data: expression private data (flexible array member, aligned to
 *               the alignment of u64; see nft_expr_priv())
 */
struct nft_expr {
        const struct nft_expr_ops        *ops;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(u64))));
};

/*
 * Return an untyped, non-const pointer to the private data area that
 * trails @expr.
 */
static inline void *nft_expr_priv(const struct nft_expr *expr)
{
        void *priv = (void *)expr->data;

        return priv;
}

struct nft_expr_info;

/*
 * Expression helpers. Only the prototypes are visible in this header; the
 * definitions live elsewhere.
 * NOTE(review): going by the names, these parse an inner expression from a
 * netlink attribute, clone/destroy an expression, dump it into an skb, and
 * reduce redundant bitwise expressions using the register tracking state --
 * confirm against the definitions.
 */
int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
                         struct nft_expr_info *info);
int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp);
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
                  const struct nft_expr *expr, bool reset);
bool nft_expr_reduce_bitwise(struct nft_regs_track *track,
                             const struct nft_expr *expr);

struct nft_set_ext;

/**
 *        struct nft_set_ops - nf_tables set operations
 *
 *        @lookup: look up an element within the set
 *        @update: update an element if exists, add it if doesn't exist
 *        @delete: delete an element
 *        @insert: insert new element into set
 *        @activate: activate new element in the next generation
 *        @deactivate: lookup for element and deactivate it in the next generation
 *        @flush: deactivate element in the next generation
 *        @remove: remove element from set
 *        @walk: iterate over all set elements
 *        @get: get set elements
 *        @ksize: kernel set size
 *        @usize: userspace set size
 *        @adjust_maxsize: delta to adjust maximum set size
 *        @commit: commit set elements
 *        @abort: abort set elements
 *        @privsize: function to return size of set private data
 *        @estimate: estimate the required memory size and the lookup complexity class
 *        @init: initialize private data of new set instance
 *        @destroy: destroy private data of set instance
 *        @gc_init: initialize garbage collection
 *        @elemsize: element private size
 *
 *        Operations lookup, update and delete have simpler interfaces, are faster
 *        and currently only used in the packet path. All the rest are slower,
 *        control plane functions.
 */
struct nft_set_ops {
        /* Packet path operations: these take the raw key as a u32 array. */
        const struct nft_set_ext *        (*lookup)(const struct net *net,
                                                  const struct nft_set *set,
                                                  const u32 *key);
        const struct nft_set_ext *        (*update)(struct nft_set *set,
                                                  const u32 *key,
                                                  const struct nft_expr *expr,
                                                  struct nft_regs *regs);
        bool                                (*delete)(const struct nft_set *set,
                                                  const u32 *key);

        /* Control plane operations. */
        int                                (*insert)(const struct net *net,
                                                  const struct nft_set *set,
                                                  const struct nft_set_elem *elem,
                                                  struct nft_elem_priv **priv);
        void                                (*activate)(const struct net *net,
                                                    const struct nft_set *set,
                                                    struct nft_elem_priv *elem_priv);
        struct nft_elem_priv *                (*deactivate)(const struct net *net,
                                                      const struct nft_set *set,
                                                      const struct nft_set_elem *elem);
        void                                (*flush)(const struct net *net,
                                                 const struct nft_set *set,
                                                 struct nft_elem_priv *priv);
        void                                (*remove)(const struct net *net,
                                                  const struct nft_set *set,
                                                  struct nft_elem_priv *elem_priv);
        void                                (*walk)(const struct nft_ctx *ctx,
                                                struct nft_set *set,
                                                struct nft_set_iter *iter);
        struct nft_elem_priv *                (*get)(const struct net *net,
                                               const struct nft_set *set,
                                               const struct nft_set_elem *elem,
                                               unsigned int flags);
        u32                                (*ksize)(u32 size);
        u32                                (*usize)(u32 size);
        u32                                (*adjust_maxsize)(const struct nft_set *set);
        void                                (*commit)(struct nft_set *set);
        void                                (*abort)(const struct nft_set *set);
        u64                                (*privsize)(const struct nlattr * const nla[],
                                                    const struct nft_set_desc *desc);
        bool                                (*estimate)(const struct nft_set_desc *desc,
                                                    u32 features,
                                                    struct nft_set_estimate *est);
        int                                (*init)(const struct nft_set *set,
                                                const struct nft_set_desc *desc,
                                                const struct nlattr * const nla[]);
        void                                (*destroy)(const struct nft_ctx *ctx,
                                                   const struct nft_set *set);
        void                                (*gc_init)(const struct nft_set *set);

        unsigned int                        elemsize;
};

/**
 *      struct nft_set_type - nf_tables set type
 *
 *      @ops: set ops for this type
 *      @features: features supported by the implementation
 *
 *      @ops is embedded by value, which is what allows to_set_type() to
 *      recover the enclosing set type from a pointer to the ops.
 */
struct nft_set_type {
        const struct nft_set_ops        ops;
        u32                                features;
};
/* Map a pointer to an embedded ops member back to its struct nft_set_type. */
#define to_set_type(o) container_of(o, struct nft_set_type, ops)

/*
 * Container for the expressions stored with a set element.
 *
 * @size bounds the walk done by nft_setelem_expr_foreach(); @data holds the
 * expressions back to back (the iterator steps by each expression's
 * ops->size) and is aligned for struct nft_expr.
 */
struct nft_set_elem_expr {
        u8                                size;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(struct nft_expr))));
};

/*
 * nft_setelem_expr_at - expression located @__offset bytes into the data
 * area of @__elem_expr (a pointer to struct nft_set_elem_expr).
 *
 * @__elem_expr is parenthesised so that non-trivial pointer expressions
 * such as "&elem" or "base + 1" expand correctly.
 */
#define nft_setelem_expr_at(__elem_expr, __offset)                        \
        ((struct nft_expr *)&(__elem_expr)->data[__offset])

/*
 * nft_setelem_expr_foreach - iterate over the expressions of @__elem_expr.
 *
 * @__expr:      struct nft_expr * cursor (lvalue)
 * @__elem_expr: pointer to the struct nft_set_elem_expr being walked
 * @__size:      integer lvalue tracking the byte offset of @__expr
 *
 * The walk starts at offset 0 and advances by the current expression's
 * ops->size until @__size reaches (@__elem_expr)->size. @__size is bumped
 * before @__expr is advanced, and both steps read the size of the
 * expression that was just visited.
 */
#define nft_setelem_expr_foreach(__expr, __elem_expr, __size)                \
        for (__expr = nft_setelem_expr_at((__elem_expr), 0), __size = 0;        \
             __size < (__elem_expr)->size;                                \
             __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size)

/* Number of slots in the exprs[] array of struct nft_set. */
#define NFT_SET_EXPR_MAX        2

/**
 *        struct nft_set - nf_tables set instance
 *
 *        @list: table set list node
 *        @bindings: list of set bindings
 *        @refs: internal refcounting for async set destruction
 *        @table: table this set belongs to
 *        @net: netnamespace this set belongs to
 *        @name: name of the set
 *        @handle: unique handle of the set
 *        @ktype: key type (numeric type defined by userspace, not used in the kernel)
 *        @dtype: data type (verdict or numeric type defined by userspace)
 *        @objtype: object type (see NFT_OBJECT_* definitions)
 *        @size: maximum set size
 *        @field_len: length of each field in concatenation, bytes
 *        @field_count: number of concatenated fields in element
 *        @in_update_walk: true during ->walk() in transaction phase
 *        @use: number of rules references to this set
 *        @nelems: number of elements
 *        @ndeact: number of deactivated elements queued for removal
 *        @timeout: default timeout value in jiffies
 *        @gc_int: garbage collection interval in msecs
 *        @policy: set parameterization (see enum nft_set_policies)
 *        @udlen: user data length
 *        @udata: user data
 *        @pending_update: list of pending update set element
 *        @ops: set ops
 *        @flags: set flags
 *        @dead: set will be freed, never cleared
 *        @genmask: generation mask
 *        @klen: key length
 *        @dlen: data length
 *        @num_exprs: numbers of exprs
 *        @exprs: stateful expression
 *        @catchall_list: list of catch-all set element
 *        @data: private set data
 */
struct nft_set {
        struct list_head                list;
        struct list_head                bindings;
        refcount_t                        refs;
        struct nft_table                *table;
        possible_net_t                        net;
        char                                *name;
        u64                                handle;
        u32                                ktype;
        u32                                dtype;
        u32                                objtype;
        u32                                size;
        u8                                field_len[NFT_REG32_COUNT];
        u8                                field_count;
        bool                                in_update_walk;
        u32                                use;
        atomic_t                        nelems;
        u32                                ndeact;
        u64                                timeout;
        u32                                gc_int;
        u16                                policy;
        u16                                udlen;
        unsigned char                        *udata;
        struct list_head                pending_update;
        /* runtime data below here */
        const struct nft_set_ops        *ops ____cacheline_aligned;
        /* flags, dead and genmask share a single 16-bit bitfield unit */
        u16                                flags:13,
                                        dead:1,
                                        genmask:2;
        u8                                klen;
        u8                                dlen;
        u8                                num_exprs;
        struct nft_expr                        *exprs[NFT_SET_EXPR_MAX];
        struct list_head                catchall_list;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(u64))));
};

/* Return true when the NFT_SET_ANONYMOUS bit is set in the flags of @set. */
static inline bool nft_set_is_anonymous(const struct nft_set *set)
{
        return (set->flags & NFT_SET_ANONYMOUS) != 0;
}

/*
 * Return an untyped, non-const pointer to the private data area that
 * trails @set.
 */
static inline void *nft_set_priv(const struct nft_set *set)
{
        void *priv = (void *)set->data;

        return priv;
}

/*
 * Classify the data type of @set: NFT_DATA_VERDICT when the dtype member is
 * NFT_DATA_VERDICT, NFT_DATA_VALUE for any other dtype.
 */
static inline enum nft_data_types nft_set_datatype(const struct nft_set *set)
{
        if (set->dtype == NFT_DATA_VERDICT)
                return NFT_DATA_VERDICT;

        return NFT_DATA_VALUE;
}

/*
 * Return true when the reference count of @s is anything other than 1.
 */
static inline bool nft_set_gc_is_pending(const struct nft_set *s)
{
        const unsigned int refs = refcount_read(&s->refs);

        return refs != 1;
}

/*
 * Recover the struct nft_set whose trailing data area starts at @priv, by
 * stepping back over offsetof(struct nft_set, data) bytes.
 */
static inline struct nft_set *nft_set_container_of(const void *priv)
{
        void *data_start = (void *)priv;

        return data_start - offsetof(struct nft_set, data);
}

/*
 * Set lookup helpers. Only the prototypes are visible in this header; the
 * definitions live elsewhere.
 * NOTE(review): presumably nft_set_lookup_global() resolves a set by name or
 * by transaction id within @table, and nft_set_catchall_lookup() returns the
 * extensions of the catch-all element of @set -- confirm against the
 * definitions.
 */
struct nft_set *nft_set_lookup_global(const struct net *net,
                                      const struct nft_table *table,
                                      const struct nlattr *nla_set_name,
                                      const struct nlattr *nla_set_id,
                                      u8 genmask);

struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
                                            const struct nft_set *set);

/*
 * Return the garbage collection interval of @set in jiffies.
 *
 * The gc_int member is read once with READ_ONCE() and converted with
 * msecs_to_jiffies(); a value of zero falls back to HZ.
 */
static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
{
        const u32 interval_ms = READ_ONCE(set->gc_int);

        if (!interval_ms)
                return HZ;

        return msecs_to_jiffies(interval_ms);
}

/**
 *        struct nft_set_binding - nf_tables set binding
 *
 *        @list: set bindings list node
 *        @chain: chain containing the rule bound to the set (not modified
 *                through this pointer)
 *        @flags: set action flags
 *
 *        A set binding contains all information necessary for validation
 *        of new elements added to a bound set.
 */
struct nft_set_binding {
        struct list_head                list;
        const struct nft_chain                *chain;
        u32                                flags;
};

enum nft_trans_phase;
/*
 * Set lifecycle helpers. Only the prototypes are visible in this header; the
 * definitions live elsewhere.
 * NOTE(review): going by the names, these activate/deactivate a set for a
 * transaction phase, bind a set to a rule through @binding, and destroy a
 * set -- confirm against the definitions.
 */
void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set);
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
                              struct nft_set_binding *binding,
                              enum nft_trans_phase phase);
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
                       struct nft_set_binding *binding);
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set);

/**
 *        enum nft_set_extensions - set extension type IDs
 *
 *        @NFT_SET_EXT_KEY: element key
 *        @NFT_SET_EXT_KEY_END: upper bound element key, for ranges
 *        @NFT_SET_EXT_DATA: mapping data
 *        @NFT_SET_EXT_FLAGS: element flags
 *        @NFT_SET_EXT_TIMEOUT: element timeout
 *        @NFT_SET_EXT_USERDATA: user data associated with the element
 *        @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element
 *        @NFT_SET_EXT_OBJREF: stateful object reference associated with element
 *        @NFT_SET_EXT_NUM: number of extension types; sizes the per-type
 *                          arrays in struct nft_set_ext_tmpl and
 *                          struct nft_set_ext
 */
enum nft_set_extensions {
        NFT_SET_EXT_KEY,
        NFT_SET_EXT_KEY_END,
        NFT_SET_EXT_DATA,
        NFT_SET_EXT_FLAGS,
        NFT_SET_EXT_TIMEOUT,
        NFT_SET_EXT_USERDATA,
        NFT_SET_EXT_EXPRESSIONS,
        NFT_SET_EXT_OBJREF,
        NFT_SET_EXT_NUM
};

/**
 *        struct nft_set_ext_type - set extension type
 *
 *        @len: fixed part length of the extension
 *        @align: alignment requirements of the extension
 */
struct nft_set_ext_type {
        u8        len;
        u8        align;
};

/* Per-type table, indexed by extension id (see nft_set_ext_add_length()). */
extern const struct nft_set_ext_type nft_set_ext_types[];

/**
 *        struct nft_set_ext_tmpl - set extension template
 *
 *        @len: length of extension area; starts at
 *              sizeof(struct nft_set_ext) (see nft_set_ext_prepare()) and
 *              grows with every extension added
 *        @offset: offsets of individual extension types
 *        @ext_len: length of the expected extension (used to sanity check)
 */
struct nft_set_ext_tmpl {
        u16        len;
        u8        offset[NFT_SET_EXT_NUM];
        u8        ext_len[NFT_SET_EXT_NUM];
};

/**
 *        struct nft_set_ext - set extensions
 *
 *        @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT)
 *        @offset: offsets of individual extension types, in bytes from the
 *                 start of this structure; zero means the extension is not
 *                 present (see __nft_set_ext_exists() and nft_set_ext())
 *        @data: beginning of extension data
 *
 *        This structure must be aligned to word size, otherwise atomic bitops
 *        on genmask field can cause alignment failure on some archs.
 */
struct nft_set_ext {
        u8        genmask;
        u8        offset[NFT_SET_EXT_NUM];
        char        data[];
} __aligned(BITS_PER_LONG / 8);

/*
 * Start a fresh extension template: no extension is registered yet and the
 * running length only accounts for the struct nft_set_ext header.
 */
static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl)
{
        /* Clear every offset and expected length. */
        memset(tmpl, 0, sizeof(struct nft_set_ext_tmpl));

        /* Extensions are laid out after the fixed header. */
        tmpl->len = sizeof(struct nft_set_ext);
}

/**
 *        nft_set_ext_add_length - reserve room for an extension in a template
 *
 *        @tmpl: template being built
 *        @id: extension type ID, used as index into nft_set_ext_types[]
 *        @len: variable-length part, on top of the type's fixed length
 *
 *        Aligns the running template length to the alignment of the extension
 *        type, records the aligned value as the extension offset and grows the
 *        template by the total length of the extension.
 *
 *        The offset and the total extension length are both stored in u8
 *        fields. A value that does not fit is rejected instead of being
 *        silently truncated: a truncated length would make the template
 *        smaller than the data it is meant to describe.
 *
 *        Returns 0 on success or -EINVAL if the offset or the extension length
 *        exceeds U8_MAX. As before, @tmpl->len may already have been aligned
 *        when an error is returned.
 */
static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id,
                                         unsigned int len)
{
        const struct nft_set_ext_type *type = &nft_set_ext_types[id];

        tmpl->len = ALIGN(tmpl->len, type->align);
        if (tmpl->len > U8_MAX)
                return -EINVAL;

        /* Compare this way round so that type->len + len cannot wrap. */
        if (len > U8_MAX - type->len)
                return -EINVAL;

        tmpl->offset[id] = tmpl->len;
        tmpl->ext_len[id] = type->len + len;
        tmpl->len        += tmpl->ext_len[id];

        return 0;
}

/* Reserve room for an extension that has no variable-length part. */
static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id)
{
        const unsigned int no_extra_len = 0;

        return nft_set_ext_add_length(tmpl, id, no_extra_len);
}

/* Copy the per-extension offsets computed in @tmpl into the element's @ext. */
static inline void nft_set_ext_init(struct nft_set_ext *ext,
                                    const struct nft_set_ext_tmpl *tmpl)
{
        /* Only the offset table is taken over; the size is the destination's. */
        memcpy(&ext->offset[0], &tmpl->offset[0], sizeof(ext->offset));
}

/* An extension is present iff its offset is non-zero. @ext must not be NULL. */
static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id)
{
        return ext->offset[id] != 0;
}

/* NULL-tolerant variant of __nft_set_ext_exists(): a NULL @ext has no extensions. */
static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id)
{
        if (!ext)
                return false;

        return __nft_set_ext_exists(ext, id);
}

/*
 * Return the address of extension @id: the start of @ext plus the recorded
 * offset. No presence check is done here.
 */
static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id)
{
        const u8 off = ext->offset[id];

        return (void *)ext + off;
}

/* Typed accessor for the NFT_SET_EXT_KEY extension. */
static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext)
{
        struct nft_data *key = nft_set_ext(ext, NFT_SET_EXT_KEY);

        return key;
}

/* Typed accessor for the NFT_SET_EXT_KEY_END extension. */
static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext)
{
        struct nft_data *key_end = nft_set_ext(ext, NFT_SET_EXT_KEY_END);

        return key_end;
}

/* Typed accessor for the NFT_SET_EXT_DATA extension. */
static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext)
{
        struct nft_data *data = nft_set_ext(ext, NFT_SET_EXT_DATA);

        return data;
}

/* Typed accessor for the NFT_SET_EXT_FLAGS extension. */
static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext)
{
        u8 *flags = nft_set_ext(ext, NFT_SET_EXT_FLAGS);

        return flags;
}

/**
 *        struct nft_timeout - timeout extension of a set element
 *
 *        @timeout: element timeout; 0 means the element is never reported as
 *                  expired (see __nft_set_elem_expired())
 *        @expiration: expiration time, compared against a 64-bit jiffies
 *                     timestamp by nft_set_elem_expired()
 */
struct nft_timeout {
        u64        timeout;
        u64        expiration;
};

/* Typed accessor for the NFT_SET_EXT_TIMEOUT extension. */
static inline struct nft_timeout *nft_set_ext_timeout(const struct nft_set_ext *ext)
{
        struct nft_timeout *timeout = nft_set_ext(ext, NFT_SET_EXT_TIMEOUT);

        return timeout;
}

/* Typed accessor for the NFT_SET_EXT_USERDATA extension. */
static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext)
{
        struct nft_userdata *udata = nft_set_ext(ext, NFT_SET_EXT_USERDATA);

        return udata;
}

/* Typed accessor for the NFT_SET_EXT_EXPRESSIONS extension. */
static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext)
{
        struct nft_set_elem_expr *elem_expr;

        elem_expr = nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS);

        return elem_expr;
}

/*
 * Tell whether the element behind @ext has expired at time @tstamp.
 *
 * Elements without a timeout extension, or whose timeout is 0, never expire.
 * Otherwise the element is expired once @tstamp has reached its expiration.
 */
static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext,
                                          u64 tstamp)
{
        struct nft_timeout *to;

        if (!nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
                return false;

        to = nft_set_ext_timeout(ext);
        if (READ_ONCE(to->timeout) == 0)
                return false;

        return time_after_eq64(tstamp, READ_ONCE(to->expiration));
}

static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
{
        return __nft_set_elem_expired(ext, get_jiffies_64());
}

/*
 * Locate the extension area of an element: it sits set->ops->elemsize bytes
 * past the start of the element's private data.
 */
static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
                                                   const struct nft_elem_priv *elem_priv)
{
        void *base = (void *)elem_priv;

        return base + set->ops->elemsize;
}

/* Typed accessor for the NFT_SET_EXT_OBJREF extension (a pointer slot). */
static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
{
        struct nft_object **objp = nft_set_ext(ext, NFT_SET_EXT_OBJREF);

        return objp;
}

/*
 * Set element allocation, cloning and destruction helpers. Only the
 * prototypes are visible here.
 * NOTE(review): per-function semantics (ownership, error returns) must be
 * confirmed against the definitions.
 */
struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
                                         const struct nft_set *set,
                                         const struct nlattr *attr);

/*
 * The arguments match the element extensions declared above (key, key end,
 * data, timeout/expiration) laid out according to @tmpl; @gfp is an
 * allocation mask.
 */
struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
                                        const struct nft_set_ext_tmpl *tmpl,
                                        const u32 *key, const u32 *key_end,
                                        const u32 *data,
                                        u64 timeout, u64 expiration, gfp_t gfp);
int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
                            struct nft_expr *expr_array[]);
void nft_set_elem_destroy(const struct nft_set *set,
                          const struct nft_elem_priv *elem_priv,
                          bool destroy_expr);
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
                                const struct nft_set *set,
                                const struct nft_elem_priv *elem_priv);

struct nft_expr_ops;
/**
 *        struct nft_expr_type - nf_tables expression type
 *
 *        @select_ops: function to select nft_expr_ops
 *        @release_ops: release nft_expr_ops
 *        @ops: default ops, used when no select_ops function is present
 *        @inner_ops: inner ops, used for inner packet operation
 *        @list: used internally
 *        @name: Identifier
 *        @owner: module reference
 *        @policy: netlink attribute policy
 *        @maxattr: highest netlink attribute number
 *        @family: address family for AF-specific types
 *        @flags: expression type flags
 */
struct nft_expr_type {
        const struct nft_expr_ops        *(*select_ops)(const struct nft_ctx *,
                                                       const struct nlattr * const tb[]);
        void                                (*release_ops)(const struct nft_expr_ops *ops);
        const struct nft_expr_ops        *ops;
        const struct nft_expr_ops        *inner_ops;
        struct list_head                list;
        const char                        *name;
        struct module                        *owner;
        const struct nla_policy                *policy;
        unsigned int                        maxattr;
        u8                                family;
        u8                                flags;
};

/*
 * Expression flag bits (distinct single-bit values).
 * NOTE(review): presumably the values stored in nft_expr_type.flags - confirm.
 */
#define NFT_EXPR_STATEFUL                0x1
#define NFT_EXPR_GC                        0x2

/*
 * Transaction phase identifiers. A value of this type is passed as the
 * "phase" argument to the ->deactivate() expression callback and to the
 * rule/flowtable deactivation helpers declared in this header.
 */
enum nft_trans_phase {
        NFT_TRANS_PREPARE,
        NFT_TRANS_PREPARE_ERROR,
        NFT_TRANS_ABORT,
        NFT_TRANS_COMMIT,
        NFT_TRANS_RELEASE
};

struct nft_flow_rule;
struct nft_offload_ctx;

/**
 *        struct nft_expr_ops - nf_tables expression operations
 *
 *        @eval: Expression evaluation function
 *        @clone: Expression clone function
 *        @size: full expression size, including private data size
 *        @init: initialization function
 *        @activate: activate expression in the next generation
 *        @deactivate: deactivate expression in the next generation; also
 *                     receives the transaction phase
 *        @destroy: destruction function, called after synchronize_rcu
 *        @destroy_clone: destruction clone function
 *        @dump: function to dump parameters; takes a reset flag
 *        @validate: validate expression, called during loop detection
 *        @reduce: reduce expression
 *        @gc: garbage collection expression
 *        @offload: hardware offload expression
 *        @offload_action: function to report true/false to allocate one slot or not in the flow
 *                         offload array
 *        @offload_stats: function to synchronize hardware stats via updating the counter expression
 *        @type: expression type
 *        @data: extra data to attach to this expression operation
 */
struct nft_expr_ops {
        void                                (*eval)(const struct nft_expr *expr,
                                                struct nft_regs *regs,
                                                const struct nft_pktinfo *pkt);
        int                                (*clone)(struct nft_expr *dst,
                                                 const struct nft_expr *src, gfp_t gfp);
        unsigned int                        size;

        int                                (*init)(const struct nft_ctx *ctx,
                                                const struct nft_expr *expr,
                                                const struct nlattr * const tb[]);
        void                                (*activate)(const struct nft_ctx *ctx,
                                                    const struct nft_expr *expr);
        void                                (*deactivate)(const struct nft_ctx *ctx,
                                                      const struct nft_expr *expr,
                                                      enum nft_trans_phase phase);
        void                                (*destroy)(const struct nft_ctx *ctx,
                                                   const struct nft_expr *expr);
        void                                (*destroy_clone)(const struct nft_ctx *ctx,
                                                         const struct nft_expr *expr);
        int                                (*dump)(struct sk_buff *skb,
                                                const struct nft_expr *expr,
                                                bool reset);
        int                                (*validate)(const struct nft_ctx *ctx,
                                                    const struct nft_expr *expr);
        bool                                (*reduce)(struct nft_regs_track *track,
                                                  const struct nft_expr *expr);
        bool                                (*gc)(struct net *net,
                                              const struct nft_expr *expr);
        int                                (*offload)(struct nft_offload_ctx *ctx,
                                                   struct nft_flow_rule *flow,
                                                   const struct nft_expr *expr);
        bool                                (*offload_action)(const struct nft_expr *expr);
        void                                (*offload_stats)(struct nft_expr *expr,
                                                         const struct flow_stats *stats);
        const struct nft_expr_type        *type;
        void                                *data;
};

/**
 *        struct nft_rule - nf_tables rule
 *
 *        @list: used internally
 *        @handle: rule handle (42 bits)
 *        @genmask: generation mask (2 bits)
 *        @dlen: length of expression data (12 bits, so at most 4095 bytes)
 *        @udata: user data is appended to the rule (1-bit flag); the user data
 *                is read from right behind the expression data, see
 *                nft_userdata()
 *        @data: expression data, aligned like struct nft_expr
 *
 *        @handle, @genmask, @dlen and @udata are bitfields sharing one u64.
 */
struct nft_rule {
        struct list_head                list;
        u64                                handle:42,
                                        genmask:2,
                                        dlen:12,
                                        udata:1;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(struct nft_expr))));
};

/* First expression of @rule: the very start of its expression data. */
static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule)
{
        return (struct nft_expr *)rule->data;
}

/* Step over @expr using the size recorded in its ops. */
static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr)
{
        const unsigned int step = expr->ops->size;

        return (void *)expr + step;
}

/* End marker: the address right behind the @rule->dlen bytes of expression data. */
static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
{
        return (struct nft_expr *)(rule->data + rule->dlen);
}

static inline bool nft_expr_more(const struct nft_rule *rule,
                                 const struct nft_expr *expr)
{
        return expr != nft_expr_last(rule) && expr->ops;
}

/* User data location: directly behind the expression data of @rule. */
static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule)
{
        return (void *)(rule->data + rule->dlen);
}

/*
 * Rule-level activate/deactivate/destroy helpers (prototypes only).
 * NOTE(review): presumably these walk the rule's expressions and invoke the
 * matching nft_expr_ops callbacks - confirm against the definitions.
 */
void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule);
void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule,
                              enum nft_trans_phase phase);
void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule);

/*
 * Evaluate every expression attached to a set element, in order, stopping as
 * soon as one of them leaves an NFT_BREAK verdict in @regs. Elements without
 * the expressions extension are left untouched.
 */
static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext,
                                            struct nft_regs *regs,
                                            const struct nft_pktinfo *pkt)
{
        struct nft_set_elem_expr *exprs;
        struct nft_expr *cur;
        u32 size;

        if (!__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
                return;

        exprs = nft_set_ext_expr(ext);
        nft_setelem_expr_foreach(cur, exprs, size) {
                cur->ops->eval(cur, regs, pkt);
                if (regs->verdict.code == NFT_BREAK)
                        return;
        }
}

/*
 * Iterate over all expressions of a rule.
 *
 * @expr: loop cursor (struct nft_expr *)
 * @last: scratch variable that receives nft_expr_last(rule) once, up front
 * @rule: rule whose expressions are walked
 *
 * The last pointer isn't really necessary, but the compiler isn't able to
 * determine that the result of nft_expr_last() is always the same since it
 * can't assume that the dlen value wasn't changed within calls in the loop.
 */
#define nft_rule_for_each_expr(expr, last, rule) \
        for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \
             (expr) != (last); \
             (expr) = nft_expr_next(expr))

/*
 * Sentinel equal to U8_MAX.
 * NOTE(review): presumably marks a chain policy that was not specified
 * (nft_base_chain.policy is a u8) - confirm against the users.
 */
#define NFT_CHAIN_POLICY_UNSET                U8_MAX

/**
 *        struct nft_rule_dp - rule as stored in a rule blob
 *
 *        @is_last: 1-bit flag; NOTE(review): presumably set only on the
 *                  end-of-blob marker (see struct nft_rule_dp_last) - confirm
 *        @dlen: length of @data in bytes (12 bits); used by nft_rule_next() to
 *               find the following rule
 *        @handle: rule handle (42 bits), for tracing
 *        @data: expression data, aligned like struct nft_expr
 */
struct nft_rule_dp {
        u64                                is_last:1,
                                        dlen:12,
                                        handle:42;        /* for tracing */
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(struct nft_expr))));
};

/*
 * Trailer that terminates a rule blob. It starts with a regular
 * struct nft_rule_dp header, followed by the bookkeeping needed to free the
 * blob via call_rcu and to trace the owning chain.
 */
struct nft_rule_dp_last {
        struct nft_rule_dp end;                /* end of nft_rule_blob marker */
        struct rcu_head h;                /* call_rcu head */
        struct nft_rule_blob *blob;        /* ptr to free via call_rcu */
        const struct nft_chain *chain;        /* for nftables tracing */
};

/* Next rule in a blob: skip this rule's header plus its @dlen bytes of data. */
static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule)
{
        const void *next = (const void *)rule + sizeof(*rule) + rule->dlen;

        return next;
}

/**
 *        struct nft_rule_blob - container for struct nft_rule_dp entries
 *
 *        @size: blob size; NOTE(review): presumably the number of bytes in
 *               @data - confirm against the allocator
 *        @data: rule storage, aligned like struct nft_rule_dp
 */
struct nft_rule_blob {
        unsigned long                        size;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(struct nft_rule_dp))));
};

/**
 *        struct nft_chain - nf_tables chain
 *
 *        @blob_gen_0: rule blob pointer to the current generation
 *        @blob_gen_1: rule blob pointer to the future generation
 *        @rules: list of rules in the chain
 *        @list: used internally
 *        @rhlhead: used internally
 *        @table: table that this chain belongs to
 *        @handle: chain handle
 *        @use: number of jump references to this chain
 *        @flags: bitmask of enum NFTA_CHAIN_FLAGS (5 bits)
 *        @bound: bind or not (1 bit)
 *        @genmask: generation mask (2 bits)
 *        @name: name of the chain
 *        @udlen: user data length
 *        @udata: user data in the chain
 *        @blob_next: rule blob pointer to the next in the chain; only used
 *                    during the control plane commit phase
 *
 *        @flags, @bound and @genmask are bitfields packed into a single u8.
 */
struct nft_chain {
        struct nft_rule_blob                __rcu *blob_gen_0;
        struct nft_rule_blob                __rcu *blob_gen_1;
        struct list_head                rules;
        struct list_head                list;
        struct rhlist_head                rhlhead;
        struct nft_table                *table;
        u64                                handle;
        u32                                use;
        u8                                flags:5,
                                        bound:1,
                                        genmask:2;
        char                                *name;
        u16                                udlen;
        u8                                *udata;

        /* Only used during control plane commit phase: */
        struct nft_rule_blob                *blob_next;
};

/*
 * Chain / set element validation and chain binding entry points (prototypes
 * only). The int-returning functions presumably follow the 0 / negative errno
 * convention - NOTE(review): confirm against the definitions.
 */
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
                         const struct nft_set_iter *iter,
                         struct nft_elem_priv *elem_priv);
int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);

/*
 * Numeric chain type identifiers, stored in nft_chain_type.type.
 * NFT_CHAIN_T_MAX is the number of chain types, not a type itself.
 */
enum nft_chain_types {
        NFT_CHAIN_T_DEFAULT = 0,
        NFT_CHAIN_T_ROUTE,
        NFT_CHAIN_T_NAT,
        NFT_CHAIN_T_MAX
};

/**
 *        struct nft_chain_type - nf_tables chain type info
 *
 *        @name: name of the type
 *        @type: numeric identifier
 *        @family: address family
 *        @owner: module owner
 *        @hook_mask: mask of valid hooks
 *        @hooks: array of hook functions, NFT_MAX_HOOKS entries
 *        @ops_register: base chain register function
 *        @ops_unregister: base chain unregister function
 */
struct nft_chain_type {
        const char                        *name;
        enum nft_chain_types                type;
        int                                family;
        struct module                        *owner;
        unsigned int                        hook_mask;
        nf_hookfn                        *hooks[NFT_MAX_HOOKS];
        int                                (*ops_register)(struct net *net, const struct nf_hook_ops *ops);
        void                                (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops);
};

/*
 * Checks of a chain against a required chain type or a set of hooks
 * (prototypes only). NOTE(review): return convention presumably 0 on success,
 * negative errno otherwise - confirm against the definitions.
 */
int nft_chain_validate_dependency(const struct nft_chain *chain,
                                  enum nft_chain_types type);
int nft_chain_validate_hooks(const struct nft_chain *chain,
                             unsigned int hook_flags);

/* True if @chain carries the NFT_CHAIN_BINDING flag. */
static inline bool nft_chain_binding(const struct nft_chain *chain)
{
        return !!(chain->flags & NFT_CHAIN_BINDING);
}

static inline bool nft_chain_is_bound(struct nft_chain *chain)
{
        return (chain->flags & NFT_CHAIN_BINDING) && chain->bound;
}

/*
 * Chain add/delete/destroy (prototypes only).
 * NOTE(review): nft_chain_add() returns int, presumably 0 or a negative
 * errno - confirm against the definition.
 */
int nft_chain_add(struct nft_table *table, struct nft_chain *chain);
void nft_chain_del(struct nft_chain *chain);
void nf_tables_chain_destroy(struct nft_chain *chain);

/**
 *        struct nft_stats - byte and packet counters
 *
 *        @bytes: byte counter
 *        @pkts: packet counter
 *        @syncp: u64_stats synchronization point for the two counters
 *
 *        Used as the per-cpu statistics of a base chain, see
 *        struct nft_base_chain.
 */
struct nft_stats {
        u64                        bytes;
        u64                        pkts;
        struct u64_stats_sync        syncp;
};

/**
 *        struct nft_hook - hook bound to a network interface name
 *
 *        @list: list node
 *        @ops_list: list head; NOTE(review): presumably the nf_hook_ops
 *                   searched by nft_hook_find_ops() - confirm
 *        @rcu: RCU head
 *        @ifname: interface name buffer (IFNAMSIZ bytes)
 *        @ifnamelen: NOTE(review): presumably the number of significant bytes
 *                    in @ifname - confirm
 */
struct nft_hook {
        struct list_head        list;
        struct list_head        ops_list;
        struct rcu_head                rcu;
        char                        ifname[IFNAMSIZ];
        u8                        ifnamelen;
};

/*
 * Look up the nf_hook_ops of @hook that belongs to @dev (prototypes only).
 * NOTE(review): the _rcu variant is presumably meant for RCU read-side
 * callers - confirm against the definitions.
 */
struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
                                      const struct net_device *dev);
struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
                                          const struct net_device *dev);

/**
 *        struct nft_base_chain - nf_tables base chain
 *
 *        @ops: netfilter hook ops
 *        @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family)
 *        @type: chain type
 *        @policy: default policy
 *        @flags: indicate the base chain disabled or not
 *        @stats: per-cpu chain stats
 *        @chain: the chain; embedded, so nft_base_chain() can map a
 *                struct nft_chain pointer back to its base chain
 *        @flow_block: flow block (for hardware offload)
 */
struct nft_base_chain {
        struct nf_hook_ops                ops;
        struct list_head                hook_list;
        const struct nft_chain_type        *type;
        u8                                policy;
        u8                                flags;
        struct nft_stats __percpu        *stats;
        struct nft_chain                chain;
        struct flow_block                flow_block;
};

/*
 * Map an embedded struct nft_chain back to its enclosing base chain. No check
 * is made here that @chain really is embedded in a base chain.
 */
static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain)
{
        struct nft_base_chain *basechain;

        basechain = container_of(chain, struct nft_base_chain, chain);

        return basechain;
}

/* True if @chain carries the NFT_CHAIN_BASE flag. */
static inline bool nft_is_base_chain(const struct nft_chain *chain)
{
        return !!(chain->flags & NFT_CHAIN_BASE);
}

/*
 * Chain evaluation entry point (prototype only).
 * NOTE(review): the unsigned int result is presumably a netfilter verdict and
 * @priv the chain to run - confirm against the definition.
 */
unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);

/*
 * Take one reference on a use counter.
 *
 * Returns false, leaving the counter untouched, when it already sits at
 * UINT_MAX; otherwise increments it and returns true.
 */
static inline bool nft_use_inc(u32 *use)
{
        const bool saturated = (*use == UINT_MAX);

        if (!saturated)
                (*use)++;

        return !saturated;
}

/*
 * Drop one reference from a use counter. The decrement always happens; a
 * one-time warning is emitted if the counter was already 0 beforehand.
 */
static inline void nft_use_dec(u32 *use)
{
        const u32 before = (*use)--;

        WARN_ON_ONCE(before == 0);
}

/*
 * For error and abort path: restore use counter to previous state.
 * Re-taking the reference is not expected to fail, hence the one-time warning.
 */
static inline void nft_use_inc_restore(u32 *use)
{
        const bool taken = nft_use_inc(use);

        WARN_ON_ONCE(!taken);
}

/* Plain alias: restoring by decrement is the same operation as nft_use_dec(). */
#define nft_use_dec_restore        nft_use_dec

/**
 *        struct nft_table - nf_tables table
 *
 *        @list: used internally
 *        @chains_ht: chains in the table
 *        @chains: same, for stable walks
 *        @sets: sets in the table
 *        @objects: stateful objects in the table
 *        @flowtables: flow tables in the table
 *        @hgenerator: handle generator state
 *        @handle: table handle
 *        @use: number of chain references to this table
 *        @family: address family (6 bits)
 *        @flags: table flag (see enum nft_table_flags) (8 bits)
 *        @genmask: generation mask (2 bits)
 *        @nlpid: netlink port ID
 *        @name: name of the table
 *        @udlen: length of the user data
 *        @udata: user data
 *        @validate_state: internal, set when transaction adds jumps
 *
 *        @family, @flags and @genmask are bitfields packed into a single u16.
 */
struct nft_table {
        struct list_head                list;
        struct rhltable                        chains_ht;
        struct list_head                chains;
        struct list_head                sets;
        struct list_head                objects;
        struct list_head                flowtables;
        u64                                hgenerator;
        u64                                handle;
        u32                                use;
        u16                                family:6,
                                        flags:8,
                                        genmask:2;
        u32                                nlpid;
        char                                *name;
        u16                                udlen;
        u8                                *udata;
        u8                                validate_state;
};

/* True if @table carries the NFT_TABLE_F_OWNER flag. */
static inline bool nft_table_has_owner(const struct nft_table *table)
{
        return !!(table->flags & NFT_TABLE_F_OWNER);
}

/* True if @table has NFT_TABLE_F_PERSIST set while NFT_TABLE_F_OWNER is clear. */
static inline bool nft_table_is_orphan(const struct nft_table *table)
{
        const unsigned int ownership =
                table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST);

        return ownership == NFT_TABLE_F_PERSIST;
}

/*
 * True for the netdev family, and for the inet family when the hook is
 * NF_INET_INGRESS.
 */
static inline bool nft_base_chain_netdev(int family, u32 hooknum)
{
        if (family == NFPROTO_NETDEV)
                return true;

        return family == NFPROTO_INET && hooknum == NF_INET_INGRESS;
}

/* Chain type (un)registration (prototypes only). */
void nft_register_chain_type(const struct nft_chain_type *);
void nft_unregister_chain_type(const struct nft_chain_type *);

/*
 * Expression type (un)registration (prototypes only).
 * NOTE(review): nft_register_expr() presumably returns 0 or a negative errno
 * - confirm against the definition.
 */
int nft_register_expr(struct nft_expr_type *);
void nft_unregister_expr(struct nft_expr_type *);

/* Dump verdict @v into @skb under netlink attribute @type (prototype only). */
int nft_verdict_dump(struct sk_buff *skb, int type,
                     const struct nft_verdict *v);

/**
 *        struct nft_object_hash_key - key to lookup nft_object
 *
 *        @name: name of the stateful object to look up
 *        @table: table the object belongs to
 *
 *        Embedded in struct nft_object as its identifying key.
 */
struct nft_object_hash_key {
        const char                      *name;
        const struct nft_table          *table;
};

/**
 *        struct nft_object - nf_tables stateful object
 *
 *        @list: table stateful object list node
 *        @rhlhead: nft_objname_ht node
 *        @key: keys that identify this object
 *        @genmask: generation mask (2 bits)
 *        @use: number of references to this stateful object
 *        @handle: unique object handle
 *        @udlen: length of user data
 *        @udata: user data
 *        @ops: object operations; cache line aligned, first of the runtime
 *              members
 *        @data: object data, layout depends on type; aligned like u64
 */
struct nft_object {
        struct list_head                list;
        struct rhlist_head                rhlhead;
        struct nft_object_hash_key        key;
        u32                                genmask:2;
        u32                                use;
        u64                                handle;
        u16                                udlen;
        u8                                *udata;
        /* runtime data below here */
        const struct nft_object_ops        *ops ____cacheline_aligned;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(u64))));
};

/* Untyped, non-const pointer to the type-specific data area of @obj. */
static inline void *nft_obj_data(const struct nft_object *obj)
{
        void *payload = (void *)obj->data;

        return payload;
}

/*
 * Access the struct nft_object pointer stored at the address returned by
 * nft_expr_priv(expr). Expands to an lvalue, so it can be read and assigned.
 *
 * The expansion is fully parenthesised so that postfix operators bind to the
 * dereferenced pointer: nft_expr_obj(expr)->field works as expected instead
 * of being parsed as *(... ->field).
 */
#define nft_expr_obj(expr)        (*((struct nft_object **)nft_expr_priv(expr)))

/*
 * Look up a stateful object of type @objtype in @table, named by netlink
 * attribute @nla and filtered by @genmask (prototype only).
 * NOTE(review): failure is presumably reported via ERR_PTR() - confirm
 * against the definition.
 */
struct nft_object *nft_obj_lookup(const struct net *net,
                                  const struct nft_table *table,
                                  const struct nlattr *nla, u32 objtype,
                                  u8 genmask);

/* Send a netlink notification about @obj for @event (prototype only). */
void nft_obj_notify(struct net *net, const struct nft_table *table,
                    struct nft_object *obj, u32 portid, u32 seq,
                    int event, u16 flags, int family, int report, gfp_t gfp);

/**
 *        struct nft_object_type - stateful object type
 *
 *        @select_ops: function to select nft_object_ops
 *        @ops: default ops, used when no select_ops function is present
 *        @list: list node in list of object types
 *        @type: stateful object numeric type
 *        @maxattr: maximum netlink attribute
 *        @family: address family for AF-specific object types
 *        @owner: module owner
 *        @policy: netlink attribute policy
 */
struct nft_object_type {
        const struct nft_object_ops        *(*select_ops)(const struct nft_ctx *,
                                                       const struct nlattr * const tb[]);
        const struct nft_object_ops        *ops;
        struct list_head                list;
        u32                                type;
        unsigned int                    maxattr;
        u8                                family;
        struct module                        *owner;
        const struct nla_policy                *policy;
};

/**
 *        struct nft_object_ops - stateful object operations
 *
 *        @eval: stateful object evaluation function
 *        @size: stateful object size
 *        @init: initialize object from netlink attributes
 *        @destroy: release existing stateful object
 *        @dump: netlink dump stateful object; takes a reset flag
 *        @update: update stateful object from a new object
 *        @type: pointer to object type
 */
struct nft_object_ops {
        void                                (*eval)(struct nft_object *obj,
                                                struct nft_regs *regs,
                                                const struct nft_pktinfo *pkt);
        unsigned int                        size;
        int                                (*init)(const struct nft_ctx *ctx,
                                                const struct nlattr *const tb[],
                                                struct nft_object *obj);
        void                                (*destroy)(const struct nft_ctx *ctx,
                                                   struct nft_object *obj);
        int                                (*dump)(struct sk_buff *skb,
                                                struct nft_object *obj,
                                                bool reset);
        void                                (*update)(struct nft_object *obj,
                                                  struct nft_object *newobj);
        const struct nft_object_type        *type;
};

/*
 * Stateful object type (un)registration (prototypes only).
 * NOTE(review): nft_register_obj() presumably returns 0 or a negative errno -
 * confirm against the definition.
 */
int nft_register_obj(struct nft_object_type *obj_type);
void nft_unregister_obj(struct nft_object_type *obj_type);

/*
 * NOTE(review): presumably the upper bound on net devices per chain or
 * flowtable hook list - confirm against the users.
 */
#define NFT_NETDEVICE_MAX        256

/**
 *        struct nft_flowtable - nf_tables flow table
 *
 *        @list: flow table list node in table list
 *        @table: the table the flow table is contained in
 *        @name: name of this flow table
 *        @hooknum: hook number
 *        @ops_len: number of hooks in array
 *        @genmask: generation mask (2 bits)
 *        @use: number of references to this flow table
 *        @handle: unique object handle
 *        @hook_list: hook list for hooks per net_device in flowtables; cache
 *                    line aligned, first of the runtime members
 *        @data: rhashtable and garbage collector
 */
struct nft_flowtable {
        struct list_head                list;
        struct nft_table                *table;
        char                                *name;
        int                                hooknum;
        int                                ops_len;
        u32                                genmask:2;
        u32                                use;
        u64                                handle;
        /* runtime data below here */
        struct list_head                hook_list ____cacheline_aligned;
        struct nf_flowtable                data;
};

/*
 * Look up a flow table in @table, named by netlink attribute @nla and
 * filtered by @genmask (prototype only).
 * NOTE(review): failure is presumably reported via ERR_PTR() - confirm
 * against the definition.
 */
struct nft_flowtable *nft_flowtable_lookup(const struct net *net,
                                           const struct nft_table *table,
                                           const struct nlattr *nla,
                                           u8 genmask);

/* Deactivate @flowtable for the given transaction @phase (prototype only). */
void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
                                    struct nft_flowtable *flowtable,
                                    enum nft_trans_phase phase);

/* Flowtable type (un)registration (prototypes only). */
void nft_register_flowtable_type(struct nf_flowtable_type *type);
void nft_unregister_flowtable_type(struct nf_flowtable_type *type);

/**
 *        struct nft_traceinfo - nft tracing information and state
 *
 *        @trace: other struct members are initialised
 *        @nf_trace: copy of skb->nf_trace before rule evaluation
 *        @packet_dumped: packet headers sent in a previous traceinfo message
 *        @type: event type (enum nft_trace_types), stored in 8 bits
 *        @skbid: hash of skb to be used as trace id
 *        @basechain: base chain currently processed
 */
struct nft_traceinfo {
        bool                                trace;
        bool                                nf_trace;
        bool                                packet_dumped;
        enum nft_trace_types                type:8;
        u32                                skbid;
        const struct nft_base_chain        *basechain;
};

/* Initialise @info for tracing @pkt through @basechain (prototype only). */
void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
                    const struct nft_chain *basechain);

/*
 * Emit a trace notification for @pkt with the given @verdict and @rule
 * (prototype only).
 */
void nft_trace_notify(const struct nft_pktinfo *pkt,
                      const struct nft_verdict *verdict,
                      const struct nft_rule_dp *rule,
                      struct nft_traceinfo *info);

/*
 * Module alias helpers. Each one builds an alias string by literal
 * concatenation:
 *
 *   MODULE_ALIAS_NFT_CHAIN(family, name)    "nft-chain-<family>-<name>"
 *   MODULE_ALIAS_NFT_AF_EXPR(family, name)  "nft-expr-<family>-<name>"
 *   MODULE_ALIAS_NFT_EXPR(name)             "nft-expr-<name>"
 *   MODULE_ALIAS_NFT_OBJ(type)              "nft-obj-<type>"
 *
 * @family and @type are stringified; @name must be a string literal.
 */
#define MODULE_ALIAS_NFT_CHAIN(family, name) \
        MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)

#define MODULE_ALIAS_NFT_AF_EXPR(family, name) \
        MODULE_ALIAS("nft-expr-" __stringify(family) "-" name)

#define MODULE_ALIAS_NFT_EXPR(name) \
        MODULE_ALIAS("nft-expr-" name)

#define MODULE_ALIAS_NFT_OBJ(type) \
        MODULE_ALIAS("nft-obj-" __stringify(type))

#if IS_ENABLED(CONFIG_NF_TABLES)

/*
 * The gencursor selects one of two generations: the one that is active right
 * now and the one that comes next. Every object carries a 2-bit mask with one
 * bit per generation; a set bit marks the object as inactive in that
 * generation.
 *
 * A new object starts out inactive in the current generation and active in
 * the next one. Committing the ruleset clears the mask, which makes the
 * object active in all generations. Removing an object marks it inactive in
 * the next generation; once the ruleset has been committed the object is
 * actually removed.
 */
static inline unsigned int nft_gencursor_next(const struct net *net)
{
        /* Flip between the two cursor values: 0 yields 1, anything else 0. */
        if (net->nft.gencursor + 1 == 1)
                return 1;

        return 0;
}

/* Mask bit that stands for the next generation. */
static inline u8 nft_genmask_next(const struct net *net)
{
        const unsigned int next = nft_gencursor_next(net);

        return 1 << next;
}

/* Mask bit that stands for the current generation. */
static inline u8 nft_genmask_cur(const struct net *net)
{
        /* READ_ONCE() keeps the cursor from being fetched more than once. */
        const unsigned int cur = READ_ONCE(net->nft.gencursor);

        return 1 << cur;
}

/* Mask with both generation bits (bit 0 and bit 1) set. */
#define NFT_GENMASK_ANY                ((1 << 0) | (1 << 1))

/*
 * Generic transaction helpers
 *
 * These work on any object with a "genmask" member. Every expansion is fully
 * parenthesised, and so is every macro argument used as an operand, so the
 * helpers can safely be combined with other operators and be given compound
 * expressions.
 */

/* Check if this object is currently active. */
#define nft_is_active(__net, __obj)                                \
        (((__obj)->genmask & nft_genmask_cur(__net)) == 0)

/* Check if this object is active in the next generation. */
#define nft_is_active_next(__net, __obj)                        \
        (((__obj)->genmask & nft_genmask_next(__net)) == 0)

/* This object becomes active in the next generation. */
#define nft_activate_next(__net, __obj)                                \
        ((__obj)->genmask = nft_genmask_cur(__net))

/* This object becomes inactive in the next generation. */
#define nft_deactivate_next(__net, __obj)                        \
        ((__obj)->genmask = nft_genmask_next(__net))

/* After committing the ruleset, clear the stale generation bit. */
#define nft_clear(__net, __obj)                                        \
        ((__obj)->genmask &= ~nft_genmask_next(__net))

/* Check if this object is active in all generations selected by __genmask. */
#define nft_active_genmask(__obj, __genmask)                        \
        (!((__obj)->genmask & (__genmask)))

/*
 * Set element transaction helpers
 */

static inline bool nft_set_elem_active(const struct nft_set_ext *ext,
                                       u8 genmask)
{
        /* Active means: no bit of @genmask is set in the element's mask. */
        return (ext->genmask & genmask) == 0;
}

static inline void nft_set_elem_change_active(const struct net *net,
                                              const struct nft_set *set,
                                              struct nft_set_ext *ext)
{
        ext->genmask ^= nft_genmask_next(net);
}

#endif /* IS_ENABLED(CONFIG_NF_TABLES) */

/*
 * Bit 2 expressed as a mask.
 * NOTE(review): presumably the "dead" flag within the genmask byte -- confirm
 * against users of this macro.
 */
#define NFT_SET_ELEM_DEAD_MASK        (1 << 2)

/*
 * Bit index handed to set_bit()/test_bit() by nft_set_elem_dead() and
 * nft_set_elem_is_dead(), which view the extension as an unsigned long.
 * With a little-endian bitfield layout the index is 2; with a big-endian
 * layout it is shifted up by BITS_PER_LONG - BITS_PER_BYTE.
 * NOTE(review): presumably because the first byte is then the most
 * significant byte of the long -- confirm.
 * Any other layout is rejected at build time.
 */
#if defined(__LITTLE_ENDIAN_BITFIELD)
#define NFT_SET_ELEM_DEAD_BIT        2
#elif defined(__BIG_ENDIAN_BITFIELD)
#define NFT_SET_ELEM_DEAD_BIT        (BITS_PER_LONG - BITS_PER_BYTE + 2)
#else
#error
#endif

static inline void nft_set_elem_dead(struct nft_set_ext *ext)
{
        /* The bit index is only meaningful if genmask sits at offset 0. */
        BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);

        /* Set the dead bit through a long-sized view of the extension. */
        set_bit(NFT_SET_ELEM_DEAD_BIT, (unsigned long *)ext);
}

/*
 * Test whether the dead bit is set on @ext.
 *
 * The extension is only read here, so the long-sized view keeps the const
 * qualifier of @ext instead of casting it away.
 *
 * Return: the result of test_bit() for NFT_SET_ELEM_DEAD_BIT.
 */
static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
{
        const unsigned long *word = (const unsigned long *)ext;

        /* The bit index is only meaningful if genmask sits at offset 0. */
        BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
        return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
}

/**
 * struct nft_trans - nf_tables object update in transaction
 *
 * @list: used internally
 * @net: struct net
 * @table: struct nft_table the object resides in
 * @msg_type: message type
 * @seq: netlink sequence number
 * @flags: modifiers to new request
 * @report: notify via unicast netlink message
 * @put_net: net needs to be put
 *
 * This is the information common to all objects in the transaction,
 * this must always be the first member of derived sub-types.
 */
struct nft_trans {
        struct list_head                list;
        struct net                        *net;
        struct nft_table                *table;
        /* Compared against NFT_MSG_* values in nft_ctx_update(). */
        int                                msg_type;
        u32                                seq;
        u16                                flags;
        /* report and put_net are single-bit fields. */
        u8                                report:1;
        u8                                put_net:1;
};

/**
 * struct nft_trans_binding - nf_tables object with binding support in transaction
 * @nft_trans:    base structure, MUST be first member
 * @binding_list: list of objects with possible bindings
 *
 * This is the base type used by objects that can be bound to a chain.
 */
struct nft_trans_binding {
        /* Embedded as the leading member of nft_trans_set and nft_trans_chain. */
        struct nft_trans nft_trans;
        struct list_head binding_list;
};

/*
 * Transaction record for a rule. The common struct nft_trans header is the
 * first member; nft_trans_container_rule() recovers the enclosing record
 * from a struct nft_trans pointer with container_of().
 *
 * The nft_trans_rule*() accessors expand to plain member expressions on that
 * container, so they can be both read and assigned to.
 */
struct nft_trans_rule {
        struct nft_trans                nft_trans;
        struct nft_rule                        *rule;
        struct nft_chain                *chain;
        struct nft_flow_rule                *flow;
        u32                                rule_id;
        bool                                bound;
};

#define nft_trans_container_rule(trans)                        \
        container_of(trans, struct nft_trans_rule, nft_trans)
#define nft_trans_rule(trans)                                \
        nft_trans_container_rule(trans)->rule
#define nft_trans_flow_rule(trans)                        \
        nft_trans_container_rule(trans)->flow
#define nft_trans_rule_id(trans)                        \
        nft_trans_container_rule(trans)->rule_id
#define nft_trans_rule_bound(trans)                        \
        nft_trans_container_rule(trans)->bound
#define nft_trans_rule_chain(trans)        \
        nft_trans_container_rule(trans)->chain

/*
 * Transaction record for a set. The common struct nft_trans header lives
 * inside the leading nft_trans_binding member, which is why
 * nft_trans_container_set() names nft_trans_binding.nft_trans in its
 * container_of().
 *
 * The nft_trans_set*() accessors expand to plain member expressions on that
 * container, so they can be both read and assigned to.
 */
struct nft_trans_set {
        struct nft_trans_binding        nft_trans_binding;
        struct list_head                list_trans_newset;
        struct nft_set                        *set;
        u32                                set_id;
        u32                                gc_int;
        u64                                timeout;
        bool                                update;
        bool                                bound;
        u32                                size;
};

#define nft_trans_container_set(t)        \
        container_of(t, struct nft_trans_set, nft_trans_binding.nft_trans)
#define nft_trans_set(trans)                                \
        nft_trans_container_set(trans)->set
#define nft_trans_set_id(trans)                                \
        nft_trans_container_set(trans)->set_id
#define nft_trans_set_bound(trans)                        \
        nft_trans_container_set(trans)->bound
#define nft_trans_set_update(trans)                        \
        nft_trans_container_set(trans)->update
#define nft_trans_set_timeout(trans)                        \
        nft_trans_container_set(trans)->timeout
#define nft_trans_set_gc_int(trans)                        \
        nft_trans_container_set(trans)->gc_int
#define nft_trans_set_size(trans)                        \
        nft_trans_container_set(trans)->size

/*
 * Transaction record for a chain. The common struct nft_trans header lives
 * inside the leading nft_trans_binding member, which is why
 * nft_trans_container_chain() names nft_trans_binding.nft_trans in its
 * container_of().
 *
 * The accessors below expand to plain member expressions on that container,
 * so they can be both read and assigned to. Note that nft_trans_basechain()
 * and nft_trans_chain_hooks() map to the basechain and hook_list members.
 */
struct nft_trans_chain {
        struct nft_trans_binding        nft_trans_binding;
        struct nft_chain                *chain;
        char                                *name;
        struct nft_stats __percpu        *stats;
        u8                                policy;
        bool                                update;
        bool                                bound;
        u32                                chain_id;
        struct nft_base_chain                *basechain;
        struct list_head                hook_list;
};

#define nft_trans_container_chain(t)        \
        container_of(t, struct nft_trans_chain, nft_trans_binding.nft_trans)
#define nft_trans_chain(trans)                                \
        nft_trans_container_chain(trans)->chain
#define nft_trans_chain_update(trans)                        \
        nft_trans_container_chain(trans)->update
#define nft_trans_chain_name(trans)                        \
        nft_trans_container_chain(trans)->name
#define nft_trans_chain_stats(trans)                        \
        nft_trans_container_chain(trans)->stats
#define nft_trans_chain_policy(trans)                        \
        nft_trans_container_chain(trans)->policy
#define nft_trans_chain_bound(trans)                        \
        nft_trans_container_chain(trans)->bound
#define nft_trans_chain_id(trans)                        \
        nft_trans_container_chain(trans)->chain_id
#define nft_trans_basechain(trans)                        \
        nft_trans_container_chain(trans)->basechain
#define nft_trans_chain_hooks(trans)                        \
        nft_trans_container_chain(trans)->hook_list

/*
 * Transaction record for a table: the common struct nft_trans header plus an
 * update flag. nft_trans_container_table() recovers the record from a
 * struct nft_trans pointer; nft_trans_table_update() expands to its ->update
 * member.
 */
struct nft_trans_table {
        struct nft_trans                nft_trans;
        bool                                update;
};

#define nft_trans_container_table(trans)                \
        container_of(trans, struct nft_trans_table, nft_trans)
#define nft_trans_table_update(trans)                        \
        nft_trans_container_table(trans)->update

/*
 * Bit flags (bit 0 and bit 1).
 * NOTE(review): presumably stored in nft_elem_update.flags to say which of
 * its timeout/expiration fields are to be applied -- confirm against users.
 */
enum nft_trans_elem_flags {
        NFT_TRANS_UPD_TIMEOUT                = (1 << 0),
        NFT_TRANS_UPD_EXPIRATION        = (1 << 1),
};

/*
 * Pending update values for one element; referenced through
 * nft_trans_one_elem.update.
 */
struct nft_elem_update {
        u64                                timeout;
        u64                                expiration;
        /* NOTE(review): presumably NFT_TRANS_UPD_* bits -- confirm. */
        u8                                flags;
};

/* One entry of the nft_trans_elem.elems[] array. */
struct nft_trans_one_elem {
        struct nft_elem_priv                *priv;
        struct nft_elem_update                *update;
};

/*
 * Transaction record for set elements. elems[] is a flexible array member
 * annotated as holding nelems entries, so the record's allocation size
 * depends on nelems.
 *
 * nft_trans_container_elem() recovers the record from a struct nft_trans
 * pointer; the two accessors expand to its ->set and ->bound members.
 */
struct nft_trans_elem {
        struct nft_trans                nft_trans;
        struct nft_set                        *set;
        bool                                bound;
        unsigned int                        nelems;
        struct nft_trans_one_elem        elems[] __counted_by(nelems);
};

#define nft_trans_container_elem(t)                        \
        container_of(t, struct nft_trans_elem, nft_trans)
#define nft_trans_elem_set(trans)                        \
        nft_trans_container_elem(trans)->set
#define nft_trans_elem_set_bound(trans)                        \
        nft_trans_container_elem(trans)->bound

/*
 * Transaction record for a stateful object. It carries two object pointers
 * (obj and newobj) and an update flag.
 *
 * nft_trans_container_obj() recovers the record from a struct nft_trans
 * pointer; the accessors expand to plain member expressions on it.
 */
struct nft_trans_obj {
        struct nft_trans                nft_trans;
        struct nft_object                *obj;
        struct nft_object                *newobj;
        bool                                update;
};

#define nft_trans_container_obj(t)                        \
        container_of(t, struct nft_trans_obj, nft_trans)
#define nft_trans_obj(trans)                                \
        nft_trans_container_obj(trans)->obj
#define nft_trans_obj_newobj(trans)                        \
        nft_trans_container_obj(trans)->newobj
#define nft_trans_obj_update(trans)                        \
        nft_trans_container_obj(trans)->update

/*
 * Transaction record for a flowtable.
 *
 * nft_trans_container_flowtable() recovers the record from a struct
 * nft_trans pointer; the accessors expand to plain member expressions on it
 * (nft_trans_flowtable_hooks() maps to the hook_list member).
 */
struct nft_trans_flowtable {
        struct nft_trans                nft_trans;
        struct nft_flowtable                *flowtable;
        struct list_head                hook_list;
        u32                                flags;
        bool                                update;
};

#define nft_trans_container_flowtable(t)                \
        container_of(t, struct nft_trans_flowtable, nft_trans)
#define nft_trans_flowtable(trans)                        \
        nft_trans_container_flowtable(trans)->flowtable
#define nft_trans_flowtable_update(trans)                \
        nft_trans_container_flowtable(trans)->update
#define nft_trans_flowtable_hooks(trans)                \
        nft_trans_container_flowtable(trans)->hook_list
#define nft_trans_flowtable_flags(trans)                \
        nft_trans_container_flowtable(trans)->flags

/* Capacity of the priv[] array in struct nft_trans_gc. */
#define NFT_TRANS_GC_BATCHCOUNT        256

/*
 * Garbage-collection batch: a fixed array of up to NFT_TRANS_GC_BATCHCOUNT
 * element pointers together with the net and set they refer to.
 */
struct nft_trans_gc {
        struct list_head        list;
        struct net                *net;
        struct nft_set                *set;
        u32                        seq;
        /* NOTE(review): presumably the number of priv[] slots in use -- confirm. */
        u16                        count;
        struct nft_elem_priv        *priv[NFT_TRANS_GC_BATCHCOUNT];
        struct rcu_head                rcu;
};

/*
 * Populate @ctx from the transaction @trans: the common header fields are
 * copied over, and ctx->chain is taken from the rule or chain record that
 * embeds @trans (NULL for every other message type).
 */
static inline void nft_ctx_update(struct nft_ctx *ctx,
                                  const struct nft_trans *trans)
{
        /* Fields taken straight from the common transaction header. */
        ctx->net = trans->net;
        ctx->seq = trans->seq;
        ctx->flags = trans->flags;
        ctx->report = trans->report;
        ctx->table = trans->table;
        ctx->family = trans->table->family;

        /* Only rule and chain messages carry a chain. */
        switch (trans->msg_type) {
        case NFT_MSG_NEWCHAIN:
        case NFT_MSG_DELCHAIN:
        case NFT_MSG_DESTROYCHAIN:
                ctx->chain = nft_trans_chain(trans);
                break;
        case NFT_MSG_NEWRULE:
        case NFT_MSG_DELRULE:
        case NFT_MSG_DESTROYRULE:
                ctx->chain = nft_trans_rule_chain(trans);
                break;
        default:
                ctx->chain = NULL;
                break;
        }
}

struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
                                        unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_destroy(struct nft_trans_gc *trans);

struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
                                              unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);

struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);

void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);

struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
                                                 unsigned int gc_seq);
struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc);

void nft_setelem_data_deactivate(const struct net *net,
                                 const struct nft_set *set,
                                 struct nft_elem_priv *elem_priv);

int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void);

void __init nft_chain_route_init(void);
void nft_chain_route_fini(void);

void nf_tables_trans_destroy_flush_work(struct net *net);

int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result);
__be64 nf_jiffies64_to_msecs(u64 input);

#ifdef CONFIG_MODULES
__printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...);
#else
static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; }
#endif

/*
 * nf_tables state kept per struct net; nft_pernet() looks it up through
 * net_generic() with nf_tables_net_id.
 */
struct nftables_pernet {
        struct list_head        tables;
        struct list_head        commit_list;
        struct list_head        destroy_list;
        struct list_head        commit_set_list;
        struct list_head        binding_list;
        struct list_head        module_list;
        struct list_head        notify_list;
        struct mutex                commit_mutex;
        u64                        table_handle;
        /* Returned by nft_net_tstamp(). */
        u64                        tstamp;
        unsigned int                gc_seq;
        u8                        validate_state;
        struct work_struct        destroy_work;
};

extern unsigned int nf_tables_net_id;

static inline struct nftables_pernet *nft_pernet(const struct net *net)
{
        /* Look up the nf_tables slot of @net's generic per-net storage. */
        struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id);

        return nft_net;
}

static inline u64 nft_net_tstamp(const struct net *net)
{
        return nft_pernet(net)->tstamp;
}

/*
 * Sentinel pointer with the integer value 1. The cast is wrapped in
 * parentheses so the macro behaves as a single primary expression and
 * composes safely with neighbouring operators (sizeof, postfix operators).
 */
#define __NFT_REDUCE_READONLY        1UL
#define NFT_REDUCE_READONLY        ((void *)__NFT_REDUCE_READONLY)

void nft_reg_track_update(struct nft_regs_track *track,
                          const struct nft_expr *expr, u8 dreg, u8 len);
void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len);
void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg);

/*
 * Report whether the tracked state of register @dreg has a selector whose
 * ops equal @expr's ops and a num_reg of zero.
 */
static inline bool nft_reg_track_cmp(struct nft_regs_track *track,
                                     const struct nft_expr *expr, u8 dreg)
{
        /* No selector recorded for this register. */
        if (!track->regs[dreg].selector)
                return false;

        /* Recorded selector uses different ops than @expr. */
        if (track->regs[dreg].selector->ops != expr->ops)
                return false;

        return track->regs[dreg].num_reg == 0;
}

#endif /* _NET_NF_TABLES_H */

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BSEARCH_H
#define _LINUX_BSEARCH_H

#include <linux/types.h>

/*
 * Binary search over @num elements of @size bytes starting at @base.
 * @cmp is called as cmp(key, element): zero means a match, a positive value
 * continues in the elements after the probe, anything else in those before.
 *
 * Return: pointer to a matching element, or NULL when none matches.
 */
static __always_inline
void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        const char *lo = base;
        size_t remaining = num;

        while (remaining > 0) {
                const size_t half = remaining >> 1;
                const char *mid = lo + half * size;
                const int order = cmp(key, mid);

                if (order == 0)
                        return (void *)mid;

                if (order > 0) {
                        /* Drop the probe itself, then keep the upper part. */
                        lo = mid + size;
                        remaining = (remaining - 1) >> 1;
                } else {
                        remaining = half;
                }
        }

        return NULL;
}

extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);

#endif /* _LINUX_BSEARCH_H */





















































































































































































   28 

   28 


   28 


   30 





























































































  318 





  303 
  319 
    5 

























   30 

















   30 






























  319 
















  319 
  313 




  316 




  319 




  318 
   64 































  319 

















  319 









   11 
   50 














   51 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Variant of atomic_t specialized for reference counts.
 *
 * The interface matches the atomic_t interface (to aid in porting) but only
 * provides the few functions one should use for reference counting.
 *
 * Saturation semantics
 * ====================
 *
 * refcount_t differs from atomic_t in that the counter saturates at
 * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
 * counter and causing 'spurious' use-after-free issues. In order to avoid the
 * cost associated with introducing cmpxchg() loops into all of the saturating
 * operations, we temporarily allow the counter to take on an unchecked value
 * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
 * or overflow has occurred. Although this is racy when multiple threads
 * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
 * equidistant from 0 and INT_MAX we minimise the scope for error:
 *
 *                                    INT_MAX     REFCOUNT_SATURATED   UINT_MAX
 *   0                          (0x7fff_ffff)    (0xc000_0000)    (0xffff_ffff)
 *   +--------------------------------+----------------+----------------+
 *                                     <---------- bad value! ---------->
 *
 * (in a signed view of the world, the "bad value" range corresponds to
 * a negative counter value).
 *
 * As an example, consider a refcount_inc() operation that causes the counter
 * to overflow:
 *
 *         int old = atomic_fetch_add_relaxed(r);
 *        // old is INT_MAX, refcount now INT_MIN (0x8000_0000)
 *        if (old < 0)
 *                atomic_set(r, REFCOUNT_SATURATED);
 *
 * If another thread also performs a refcount_inc() operation between the two
 * atomic operations, then the count will continue to edge closer to 0. If it
 * reaches a value of 1 before /any/ of the threads reset it to the saturated
 * value, then a concurrent refcount_dec_and_test() may erroneously free the
 * underlying object.
 * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
 * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
 * With the current PID limit, if no batched refcounting operations are used and
 * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
 * operations, this makes it impossible for a saturated refcount to leave the
 * saturation range, even if it is possible for multiple uses of the same
 * refcount to nest in the context of a single task:
 *
 *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
 *     0x40000000 / 0x400000 = 0x100 = 256
 *
 * If hundreds of references are added/removed with a single refcounting
 * operation, it may potentially be possible to leave the saturation range; but
 * given the precise timing details involved with the round-robin scheduling of
 * each thread manipulating the refcount and the need to hit the race multiple
 * times in succession, there doesn't appear to be a practical avenue of attack
 * even if using refcount_add() operations with larger increments.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object we're increasing the
 * reference count on will provide the ordering. For locked data structures,
 * it's the lock acquire; for RCU/lockless data structures it's the dependent
 * load.
 *
 * Do note that inc_not_zero() provides a control dependency which will order
 * future stores against the inc, this ensures we'll never modify the object
 * if we did not in fact acquire a reference.
 *
 * The decrements will provide release order, such that all the prior loads and
 * stores will be issued before, it also provides a control dependency, which
 * will order us against the subsequent free().
 *
 * The control dependency is against the load of the cmpxchg (ll/sc) that
 * succeeded. This means the stores aren't fully ordered, but this is fine
 * because the 1->0 transition indicates no concurrency.
 *
 * Note that the allocator is responsible for ordering things between free()
 * and alloc().
 *
 * The decrements dec_and_test() and sub_and_test() also provide acquire
 * ordering on success.
 *
 * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide
 * acquire and release ordering for cases when the memory occupied by the
 * object might be reused to store another object. This is important for the
 * cases where secondary validation is required to detect such reuse, e.g.
 * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after
 * the refcount is taken, hence acquire order is necessary. Similarly, when the
 * object is initialized, all stores to its attributes should be visible before
 * the refcount is set, otherwise a stale attribute value might be used by
 * another task which succeeds in taking a refcount to the new object.
 */

#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/refcount_types.h>
#include <linux/spinlock_types.h>

struct mutex;

#define REFCOUNT_INIT(n)        { .refs = ATOMIC_INIT(n), }
#define REFCOUNT_MAX                INT_MAX
#define REFCOUNT_SATURATED        (INT_MIN / 2)

/* Reason codes handed to refcount_warn_saturate(). */
enum refcount_saturation_type {
        /* add-not-zero variants: old value or old + i is negative */
        REFCOUNT_ADD_NOT_ZERO_OVF,
        /* __refcount_add(): old value or old + i is negative */
        REFCOUNT_ADD_OVF,
        /* __refcount_add(): the old value was 0 */
        REFCOUNT_ADD_UAF,
        /* __refcount_sub_and_test(): old value <= 0 or old - i is negative */
        REFCOUNT_SUB_UAF,
        /* NOTE(review): not used in the visible part of this file -- confirm meaning */
        REFCOUNT_DEC_LEAK,
};

void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);

/**
 * refcount_set - set a refcount's value
 * @r: the refcount
 * @n: value to which the refcount will be set
 */
static inline void refcount_set(refcount_t *r, int n)
{
        atomic_t *counter = &r->refs;

        /* Store @n into the underlying atomic counter. */
        atomic_set(counter, n);
}

/**
 * refcount_set_release - set a refcount's value with release ordering
 * @r: the refcount
 * @n: value to which the refcount will be set
 *
 * This function should be used when memory occupied by the object might be
 * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU.
 *
 * Provides release memory ordering which will order previous memory operations
 * against this store. This ensures all updates to this object are visible
 * once the refcount is set and stale values from the object previously
 * occupying this memory are overwritten with new ones.
 *
 * This function should be called only after new object is fully initialized.
 * After this call the object should be considered visible to other tasks even
 * if it was not yet added into an object collection normally used to discover
 * it. This is because other tasks might have discovered the object previously
 * occupying the same memory and after memory reuse they can succeed in taking
 * refcount to the new object and start using it.
 */
static inline void refcount_set_release(refcount_t *r, int n)
{
        atomic_t *counter = &r->refs;

        /* Same store as refcount_set(), issued through the release variant. */
        atomic_set_release(counter, n);
}

/**
 * refcount_read - get a refcount's value
 * @r: the refcount
 *
 * Return: the refcount's value
 */
static inline unsigned int refcount_read(const refcount_t *r)
{
        const int raw = atomic_read(&r->refs);

        /* The signed counter is handed back converted to unsigned int. */
        return raw;
}

/*
 * Add @i to @r unless the counter is 0. When @oldp is non-NULL it receives
 * the counter value observed before the add. Returns false if that value
 * was 0, true otherwise.
 */
static inline __must_check __signed_wrap
bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
        int old = refcount_read(r);

        /*
         * Keep trying until the add lands on a non-zero counter. The
         * try_cmpxchg call is handed &old, so every retry tests the value
         * it left there.
         */
        while (old) {
                if (atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i))
                        break;
        }

        if (oldp)
                *oldp = old;

        /* A negative old value or a negative sum gets reported. */
        if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

        return old != 0;
}

/**
 * refcount_add_not_zero - add a value to a refcount unless it is 0
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
{
        /* NULL: the previous counter value is not requested. */
        int *const no_old = NULL;

        return __refcount_add_not_zero(i, r, no_old);
}

/*
 * Add @i to @r unless the counter is 0 or i > limit - old, using the acquire
 * variant of try_cmpxchg. When @oldp is non-NULL it receives the counter
 * value observed before the add (also on the limit failure path). Returns
 * false if the counter was 0 or the limit check failed, true otherwise.
 */
static inline __must_check __signed_wrap
bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp,
                                             int limit)
{
        int old = refcount_read(r);

        while (old) {
                /* Refuse the add instead of going past @limit. */
                if (i > limit - old) {
                        if (oldp)
                                *oldp = old;
                        return false;
                }

                if (atomic_try_cmpxchg_acquire(&r->refs, &old, old + i))
                        break;
        }

        if (oldp)
                *oldp = old;

        /* A negative old value or a negative sum gets reported. */
        if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

        return old != 0;
}

static inline __must_check bool
__refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit)
{
        /* An increment is a limited add-not-zero of exactly one. */
        const int one = 1;

        return __refcount_add_not_zero_limited_acquire(one, r, oldp, limit);
}

static inline __must_check __signed_wrap
bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp)
{
        /* Same as the limited variant with the limit set to INT_MAX. */
        const int limit = INT_MAX;

        return __refcount_add_not_zero_limited_acquire(i, r, oldp, limit);
}

/**
 * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0
 *
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Will saturate at REFCOUNT_SATURATED and WARN.
 *
 * This function should be used when memory occupied by the object might be
 * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU.
 *
 * Provides acquire memory ordering on success, it is assumed the caller has
 * guaranteed the object memory to be stable (RCU, etc.). It does provide a
 * control dependency and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc_not_zero_acquire() should instead be used to increment a
 * reference count.
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r)
{
        /* NULL: the previous counter value is not requested. */
        int *const no_old = NULL;

        return __refcount_add_not_zero_acquire(i, r, no_old);
}

/*
 * Unconditionally add @i to @r with a relaxed fetch-add. When @oldp is
 * non-NULL it receives the value the counter held before the add.
 */
static inline __signed_wrap
void __refcount_add(int i, refcount_t *r, int *oldp)
{
        const int old = atomic_fetch_add_relaxed(i, &r->refs);

        if (oldp)
                *oldp = old;

        /* Adding to a counter that was 0 is reported as REFCOUNT_ADD_UAF. */
        if (unlikely(old == 0)) {
                refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
                return;
        }

        /* A negative old value or a negative sum is reported as overflow. */
        if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}

/**
 * refcount_add - add a value to a refcount
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 */
static inline void refcount_add(int i, refcount_t *r)
{
        /* NULL: the previous counter value is not requested. */
        int *const no_old = NULL;

        __refcount_add(i, r, no_old);
}

static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
{
        /* An increment is an add-not-zero of exactly one. */
        const int one = 1;

        return __refcount_add_not_zero(one, r, oldp);
}

/**
 * refcount_inc_not_zero - increment a refcount unless it is 0
 * @r: the refcount to increment
 *
 * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED
 * and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
        /* NULL: the previous counter value is not requested. */
        const bool taken = __refcount_inc_not_zero(r, NULL);

        return taken;
}

static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp)
{
        /* An increment is an acquire add-not-zero of exactly one. */
        const int one = 1;

        return __refcount_add_not_zero_acquire(one, r, oldp);
}

/**
 * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0
 * @r: the refcount to increment
 *
 * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on
 * success.
 *
 * This function should be used when memory occupied by the object might be
 * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU.
 *
 * Provides acquire memory ordering on success, it is assumed the caller has
 * guaranteed the object memory to be stable (RCU, etc.). It does provide a
 * control dependency and thereby orders future stores. See the comment on top.
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r)
{
        /* NULL: the previous counter value is not requested. */
        const bool taken = __refcount_inc_not_zero_acquire(r, NULL);

        return taken;
}

/* Add exactly one reference via __refcount_add(), forwarding @oldp as-is. */
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
        const int one_ref = 1;

        __refcount_add(one_ref, r, oldp);
}

/**
 * refcount_inc - increment a refcount
 * @r: the refcount to increment
 *
 * Like atomic_inc(), but saturates at REFCOUNT_SATURATED and WARNs.
 *
 * No memory ordering is provided; the caller is assumed to hold a reference
 * on the object already.
 *
 * A WARN is issued if the refcount is 0, since that indicates a possible
 * use-after-free condition.
 */
static inline void refcount_inc(refcount_t *r)
{
        int *const no_oldp = NULL;        /* previous value not requested */

        __refcount_inc(r, no_oldp);
}

/*
 * Subtract @i from @r with release ordering and report whether that dropped
 * the count to zero.  If @oldp is non-NULL, the value observed before the
 * subtraction is stored through it.
 */
static inline __must_check __signed_wrap
bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
        int old = atomic_fetch_sub_release(i, &r->refs);

        if (oldp)
                *oldp = old;

        /* old == i: this subtraction consumed every remaining reference. */
        if (old > 0 && old == i) {
                /* Success path: add acquire ordering before reporting zero. */
                smp_acquire__after_ctrl_dep();
                return true;
        }

        /* Count was already non-positive, or @i was larger than the count. */
        if (unlikely(old <= 0 || old - i < 0))
                refcount_warn_saturate(r, REFCOUNT_SUB_UAF);

        return false;
}

/**
 * refcount_sub_and_test - subtract from a refcount and test if it is 0
 * @i: amount to subtract from the refcount
 * @r: the refcount
 *
 * Like atomic_dec_and_test(), except that on underflow it WARNs, returns
 * false and ultimately leaks, and it will fail to decrement when saturated
 * at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before the subtraction; on success acquire ordering is provided as well,
 * so that free() must come after.
 *
 * This function is not recommended for the normal reference counting use
 * case in which references are taken and released one at a time.  There,
 * refcount_dec(), or one of its variants, should be used to decrement a
 * reference count instead.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
{
        const bool hit_zero = __refcount_sub_and_test(i, r, NULL);

        return hit_zero;
}

/*
 * Single-reference form of __refcount_sub_and_test(): subtracts exactly 1 and
 * forwards @oldp unchanged.
 */
static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
{
        const bool hit_zero = __refcount_sub_and_test(1, r, oldp);

        return hit_zero;
}

/**
 * refcount_dec_and_test - decrement a refcount and test if it is 0
 * @r: the refcount
 *
 * Like atomic_dec_and_test(); it will WARN on underflow and fail to decrement
 * when saturated at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before the decrement; on success acquire ordering is provided as well, so
 * that free() must come after.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
        const bool hit_zero = __refcount_dec_and_test(r, NULL);

        return hit_zero;
}

/*
 * Drop one reference with release ordering, without reporting whether zero
 * was reached.  If @oldp is non-NULL it receives the value seen before the
 * decrement.
 */
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
        int old = atomic_fetch_sub_release(1, &r->refs);

        if (oldp)
                *oldp = old;

        /* old <= 1: the decrement took the count to zero or below. */
        if (unlikely(old <= 1))
                refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}

/**
 * refcount_dec - decrement a refcount
 * @r: the refcount
 *
 * Like atomic_dec(); it will WARN on underflow and fail to decrement when
 * saturated at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before the decrement.
 */
static inline void refcount_dec(refcount_t *r)
{
        int *const no_oldp = NULL;        /* previous value not requested */

        __refcount_dec(r, no_oldp);
}

/*
 * Out-of-line refcount helpers; only declared here.
 * NOTE(review): going by the names and the __cond_acquires(lock) annotations,
 * the *_lock variants return with @lock held only in the case they report as
 * true - confirm against the implementation.
 */
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
                                                       spinlock_t *lock,
                                                       unsigned long *flags) __cond_acquires(lock);
#endif /* _LINUX_REFCOUNT_H */





















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_CRED_H
#define _LINUX_CRED_H

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/key.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/uidgid.h>
#include <linux/sched.h>
#include <linux/sched/user.h>

struct cred;
struct inode;

/*
 * COW Supplementary groups list
 */
struct group_info {
        refcount_t        usage;        /* reference count; see get_group_info()/put_group_info() */
        int                ngroups;        /* presumably the number of entries in gid[] - TODO confirm */
        kgid_t                gid[];        /* flexible array of group IDs */
} __randomize_layout;

/**
 * get_group_info - Get a reference to a group info structure
 * @group_info: The group info to reference
 *
 * This gets a reference to a set of supplementary groups.
 *
 * If the caller is accessing a task's credentials, they must hold the RCU read
 * lock when reading.
 */
static inline struct group_info *get_group_info(struct group_info *gi)
{
        refcount_inc(&gi->usage);
        return gi;
}

/**
 * put_group_info - Release a reference to a group info structure
 * @gi: The group info to release
 *
 * Drops one reference and calls groups_free() when that was the last one.
 * @gi is evaluated exactly once, so an argument expression with side
 * effects (e.g. a post-increment) is safe to pass.
 */
#define put_group_info(gi)                                \
do {                                                        \
        /* Capture the argument once; it is used twice below. */        \
        struct group_info *__pgi = (gi);                \
        if (refcount_dec_and_test(&__pgi->usage))        \
                groups_free(__pgi);                        \
} while (0)

/* Supplementary-group API: declared here, implemented out of line. */
#ifdef CONFIG_MULTIUSER
extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);

extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
extern int groups_search(const struct group_info *, kgid_t);

extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern bool may_setgroups(void);
extern void groups_sort(struct group_info *);
#else
/*
 * !CONFIG_MULTIUSER stubs: freeing is a no-op and every membership/search
 * query unconditionally answers 1.
 */
static inline void groups_free(struct group_info *group_info)
{
}

static inline int in_group_p(kgid_t grp)
{
        return 1;
}
static inline int in_egroup_p(kgid_t grp)
{
        return 1;
}
static inline int groups_search(const struct group_info *group_info, kgid_t grp)
{
        return 1;
}
#endif

/*
 * The security context of a task
 *
 * The parts of the context break down into two categories:
 *
 *  (1) The objective context of a task.  These parts are used when some other
 *        task is attempting to affect this one.
 *
 *  (2) The subjective context.  These details are used when the task is acting
 *        upon another object, be that a file, a task, a key or whatever.
 *
 * Note that some members of this structure belong to both categories - the
 * LSM security pointer for instance.
 *
 * A task has two security pointers.  task->real_cred points to the objective
 * context that defines that task's actual details.  The objective part of this
 * context is used whenever that task is acted upon.
 *
 * task->cred points to the subjective context that defines the details of how
 * that task is going to act upon another object.  This may be overridden
 * temporarily to point to another security context, but normally points to the
 * same context as task->real_cred.
 */
struct cred {
        atomic_long_t        usage;                /* reference count; see get_cred_many()/put_cred_many() */
        kuid_t                uid;                /* real UID of the task */
        kgid_t                gid;                /* real GID of the task */
        kuid_t                suid;                /* saved UID of the task */
        kgid_t                sgid;                /* saved GID of the task */
        kuid_t                euid;                /* effective UID of the task */
        kgid_t                egid;                /* effective GID of the task */
        kuid_t                fsuid;                /* UID for VFS ops */
        kgid_t                fsgid;                /* GID for VFS ops */
        unsigned        securebits;        /* SUID-less security management */
        kernel_cap_t        cap_inheritable; /* caps our children can inherit */
        kernel_cap_t        cap_permitted;        /* caps we're permitted */
        kernel_cap_t        cap_effective;        /* caps we can actually use */
        kernel_cap_t        cap_bset;        /* capability bounding set */
        kernel_cap_t        cap_ambient;        /* Ambient capability set */
#ifdef CONFIG_KEYS
        unsigned char        jit_keyring;        /* default keyring to attach requested
                                         * keys to */
        struct key        *session_keyring; /* keyring inherited over fork */
        struct key        *process_keyring; /* keyring private to this process */
        struct key        *thread_keyring; /* keyring private to this thread */
        struct key        *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
        void                *security;        /* LSM security */
#endif
        struct user_struct *user;        /* real user ID subscription */
        struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
        struct ucounts *ucounts;        /* NOTE(review): undocumented here; see set_cred_ucounts() - confirm role */
        struct group_info *group_info;        /* supplementary groups for euid/fsgid */
        /* RCU deletion */
        union {
                int non_rcu;                        /* Can we skip RCU deletion? */
                struct rcu_head        rcu;                /* RCU deletion hook */
        };
} __randomize_layout;

/*
 * Credential management entry points implemented out of line.
 * __put_cred() is what put_cred_many() calls once the usage count reaches
 * zero; the remaining functions are only declared in this header.
 */
extern void __put_cred(struct cred *);
extern void exit_creds(struct task_struct *);
extern int copy_creds(struct task_struct *, u64);
extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
extern int set_cred_ucounts(struct cred *);

static inline bool cap_ambient_invariant_ok(const struct cred *cred)
{
        return cap_issubset(cred->cap_ambient,
                            cap_intersect(cred->cap_permitted,
                                          cred->cap_inheritable));
}

/*
 * Install @override_cred as current->cred and return the pointer that was
 * installed before.  Only the pointer is swapped here; no usage count is
 * touched by this function.
 */
static inline const struct cred *override_creds(const struct cred *override_cred)
{
        const struct cred *previous;

        previous = rcu_replace_pointer(current->cred, override_cred, 1);
        return previous;
}

/*
 * Put @revert_cred back as current->cred and return the pointer it replaces.
 * Only the pointer is swapped here; no usage count is touched by this
 * function.
 */
static inline const struct cred *revert_creds(const struct cred *revert_cred)
{
        const struct cred *replaced;

        replaced = rcu_replace_pointer(current->cred, revert_cred, 1);
        return replaced;
}

/**
 * get_cred_many - Get references on a set of credentials
 * @cred: The credentials to reference
 * @nr: Number of references to acquire
 *
 * Get references on the specified set of credentials.  The caller must release
 * all acquired reference.  If %NULL is passed, it is returned with no action.
 *
 * This is used to deal with a committed set of credentials.  Although the
 * pointer is const, this will temporarily discard the const and increment the
 * usage count.  The purpose of this is to attempt to catch at compile time the
 * accidental alteration of a set of credentials that should be considered
 * immutable.
 */
static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
{
        struct cred *nonconst_cred = (struct cred *) cred;
        if (!cred)
                return cred;
        nonconst_cred->non_rcu = 0;
        atomic_long_add(nr, &nonconst_cred->usage);
        return cred;
}

/*
 * get_cred - Get a reference on a set of credentials
 * @cred: The credentials to reference
 *
 * Acquire a single reference on the given set of credentials by way of
 * get_cred_many().  The caller must release the reference.  If %NULL is
 * passed, it is returned with no action.
 *
 * This is used to deal with a committed set of credentials.
 */
static inline const struct cred *get_cred(const struct cred *cred)
{
        const int one_ref = 1;

        return get_cred_many(cred, one_ref);
}

/*
 * Try to take one reference on @cred without assuming its usage count is
 * still non-zero.  Returns NULL when @cred is NULL or the count is already
 * zero; otherwise the count is incremented, non_rcu is cleared and @cred is
 * returned.
 */
static inline const struct cred *get_cred_rcu(const struct cred *cred)
{
        struct cred *writable = (struct cred *) cred;

        if (cred == NULL || !atomic_long_inc_not_zero(&writable->usage))
                return NULL;

        writable->non_rcu = 0;
        return cred;
}

/**
 * put_cred_many - Release references to a set of credentials
 * @_cred: The credentials to release
 * @nr: Number of references to release
 *
 * Drop @nr references to a set of credentials; __put_cred() is called when
 * that releases the last one.  If %NULL is passed, nothing is done.
 *
 * A const pointer is accepted because the credentials on task_struct are
 * attached by const pointers to prevent accidental alteration of otherwise
 * immutable credential sets.
 */
static inline void put_cred_many(const struct cred *_cred, int nr)
{
        struct cred *cred = (struct cred *) _cred;

        if (!cred)
                return;

        if (atomic_long_sub_and_test(nr, &cred->usage))
                __put_cred(cred);
}

/*
 * put_cred - Release a reference to a set of credentials
 * @cred: The credentials to release
 *
 * Drop a single reference via put_cred_many(); the credentials are deleted
 * when the last reference is released.  If %NULL is passed, nothing is done.
 */
static inline void put_cred(const struct cred *cred)
{
        const int one_ref = 1;

        put_cred_many(cred, one_ref);
}

/*
 * Cleanup action named put_cred for 'struct cred *' variables: calls
 * put_cred() unless the pointer is NULL or an error-encoded value.
 * NOTE(review): DEFINE_FREE() itself is defined outside this file - confirm
 * how the action is attached to a variable.
 */
DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T))

/**
 * current_cred - Access the current task's subjective credentials
 *
 * Access the subjective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Expands to rcu_dereference_protected() on current->cred with a constant 1
 * as its second argument; no reference is taken.
 */
#define current_cred() \
        rcu_dereference_protected(current->cred, 1)

/**
 * current_real_cred - Access the current task's objective credentials
 *
 * Access the objective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Expands to rcu_dereference_protected() on current->real_cred with a
 * constant 1 as its second argument; no reference is taken.
 */
#define current_real_cred() \
        rcu_dereference_protected(current->real_cred, 1)

/**
 * __task_cred - Access a task's objective credentials
 * @task: The task to query
 *
 * Access the objective credentials of a task.  The caller must hold the RCU
 * readlock.
 *
 * The result of this function should not be passed directly to get_cred();
 * rather get_task_cred() should be used instead.
 *
 * Expands to a plain rcu_dereference() of @task->real_cred; no reference is
 * taken.
 */
#define __task_cred(task)        \
        rcu_dereference((task)->real_cred)

/**
 * get_current_cred - Get the current task's subjective credentials
 *
 * Get the subjective credentials of the current task, pinning them so that
 * they can't go away.  Accessing the current task's credentials directly is
 * not permitted.
 *
 * Implemented as get_cred() applied to current_cred(), so the reference it
 * takes must be released by the caller.
 */
#define get_current_cred()                                \
        (get_cred(current_cred()))

/**
 * get_current_user - Get the current task's user_struct
 *
 * Look up the user record in the current task's subjective credentials and
 * pin it with get_uid() so that it can't go away.  Evaluates to the pinned
 * user_struct pointer.
 */
#define get_current_user()                                        \
({                                                                \
        const struct cred *__cur = current_cred();                \
        struct user_struct *__pinned = get_uid(__cur->user);        \
        __pinned;                                                \
})

/**
 * get_current_groups - Get the current task's supplementary group list
 *
 * Look up the supplementary group list in the current task's subjective
 * credentials and pin it with get_group_info() so that it can't go away.
 * Evaluates to the pinned group_info pointer.
 */
#define get_current_groups()                                                \
({                                                                        \
        const struct cred *__cur = current_cred();                        \
        struct group_info *__pinned = get_group_info(__cur->group_info);        \
        __pinned;                                                        \
})

/*
 * task_cred_xxx - read one field of a task's objective credentials
 * @task: The task to query
 * @xxx: Name of the struct cred member to read
 *
 * Copies the member by value while inside an rcu_read_lock() /
 * rcu_read_unlock() pair and evaluates to that copy.  No reference is taken
 * on the credentials or on anything the copied value may point to.
 */
#define task_cred_xxx(task, xxx)                        \
({                                                        \
        __typeof__(((struct cred *)NULL)->xxx) ___val;        \
        rcu_read_lock();                                \
        ___val = __task_cred((task))->xxx;                \
        rcu_read_unlock();                                \
        ___val;                                                \
})

/* Field-specific shorthands built on task_cred_xxx(). */
#define task_uid(task)                (task_cred_xxx((task), uid))
#define task_euid(task)                (task_cred_xxx((task), euid))
#define task_ucounts(task)        (task_cred_xxx((task), ucounts))

/*
 * current_cred_xxx - read one field of the current task's subjective
 * credentials; @xxx is the struct cred member name.  Goes through
 * current_cred(), so no reference is taken.
 */
#define current_cred_xxx(xxx)                        \
({                                                \
        current_cred()->xxx;                        \
})

/* Field-specific shorthands built on current_cred_xxx(). */
#define current_uid()                (current_cred_xxx(uid))
#define current_gid()                (current_cred_xxx(gid))
#define current_euid()                (current_cred_xxx(euid))
#define current_egid()                (current_cred_xxx(egid))
#define current_suid()                (current_cred_xxx(suid))
#define current_sgid()                (current_cred_xxx(sgid))
#define current_fsuid()         (current_cred_xxx(fsuid))
#define current_fsgid()         (current_cred_xxx(fsgid))
#define current_cap()                (current_cred_xxx(cap_effective))
#define current_user()                (current_cred_xxx(user))
#define current_ucounts()        (current_cred_xxx(ucounts))

/*
 * current_user_ns - user namespace of the current task's credentials.
 * With CONFIG_USER_NS it is read from the subjective credentials; without
 * it the answer is always &init_user_ns.
 */
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
#define current_user_ns()        (current_cred_xxx(user_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
        return &init_user_ns;
}
#endif


/*
 * Store the current task's real UID and GID through the pointers @_uid and
 * @_gid, reading both from a single current_cred() lookup.
 */
#define current_uid_gid(_uid, _gid)                        \
do {                                                        \
        const struct cred *__cred = current_cred();        \
        *(_uid) = __cred->uid;                                \
        *(_gid) = __cred->gid;                                \
} while (0)

/*
 * Store the current task's effective UID and GID through the pointers @_euid
 * and @_egid, reading both from a single current_cred() lookup.
 */
#define current_euid_egid(_euid, _egid)                        \
do {                                                        \
        const struct cred *__cred = current_cred();        \
        *(_euid) = __cred->euid;                        \
        *(_egid) = __cred->egid;                        \
} while (0)

/*
 * Store the current task's filesystem UID and GID through the pointers
 * @_fsuid and @_fsgid, reading both from a single current_cred() lookup.
 */
#define current_fsuid_fsgid(_fsuid, _fsgid)                \
do {                                                        \
        const struct cred *__cred = current_cred();        \
        *(_fsuid) = __cred->fsuid;                        \
        *(_fsgid) = __cred->fsgid;                        \
} while (0)

#endif /* _LINUX_CRED_H */


















































































































































   75 


   75 

   74 




   62 











    1 
    7 






    1 
   55 


    3 








    4 










    1 








   15 















    3 













   44 













    1 

    2 

    2 








   53 


























    1 








   74 






   75 
   70 
    3 

   72 


   67 


   67 




   67 
   66 

   65 



   71 






   74 



   75 
    1 




   75 


   75 


   75 
    4 
    4 

   72 










































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
// SPDX-License-Identifier: GPL-2.0
/*
 * XFRM compat layer
 * Author: Dmitry Safonov <dima@arista.com>
 * Based on code and translator idea by: Florian Westphal <fw@strlen.de>
 */
#include <linux/compat.h>
#include <linux/nospec.h>
#include <linux/xfrm.h>
#include <net/xfrm.h>

/*
 * Lifetime limits as laid out for compat userspace: eight compat_u64 values,
 * a soft and a hard limit each for bytes, packets, add-expiry and use-expiry.
 * NOTE(review): presumably mirrors the native struct xfrm_lifetime_cfg, whose
 * definition is outside this file - confirm.
 */
struct compat_xfrm_lifetime_cfg {
        compat_u64 soft_byte_limit, hard_byte_limit;
        compat_u64 soft_packet_limit, hard_packet_limit;
        compat_u64 soft_add_expires_seconds, hard_add_expires_seconds;
        compat_u64 soft_use_expires_seconds, hard_use_expires_seconds;
}; /* same size on 32bit, but only 4 byte alignment required */

/*
 * Current lifetime counters as laid out for compat userspace: four
 * compat_u64 values (bytes, packets, add_time, use_time).
 */
struct compat_xfrm_lifetime_cur {
        compat_u64 bytes, packets, add_time, use_time;
}; /* same size on 32bit, but only 4 byte alignment required */

/*
 * Compat layout of the policy description; its size is used for
 * XFRM_MSG_NEWPOLICY/UPDPOLICY and the XFRMA_POLICY attribute.  Embeds the
 * compat lifetime structs, so it carries no 64-bit tail padding.
 */
struct compat_xfrm_userpolicy_info {
        struct xfrm_selector sel;
        struct compat_xfrm_lifetime_cfg lft;
        struct compat_xfrm_lifetime_cur curlft;
        __u32 priority, index;
        u8 dir, action, flags, share;
        /* 4 bytes additional padding on 64bit */
};

/*
 * Compat layout of the SA description; its size is used for
 * XFRM_MSG_NEWSA/UPDSA and the XFRMA_SA attribute.  Embeds the compat
 * lifetime structs, so it carries no 64-bit tail padding.
 */
struct compat_xfrm_usersa_info {
        struct xfrm_selector sel;
        struct xfrm_id id;
        xfrm_address_t saddr;
        struct compat_xfrm_lifetime_cfg lft;
        struct compat_xfrm_lifetime_cur curlft;
        struct xfrm_stats stats;
        __u32 seq, reqid;
        u16 family;
        u8 mode, replay_window, flags;
        /* 4 bytes additional padding on 64bit */
};

/*
 * Compat layout of the XFRM_MSG_ACQUIRE payload; embeds the compat policy
 * description, which is followed directly by the algorithm masks and seq.
 */
struct compat_xfrm_user_acquire {
        struct xfrm_id id;
        xfrm_address_t saddr;
        struct xfrm_selector sel;
        struct compat_xfrm_userpolicy_info policy;
        /* 4 bytes additional padding on 64bit */
        __u32 aalgos, ealgos, calgos, seq;
};

/*
 * Compat layout of the XFRM_MSG_ALLOCSPI payload: the compat SA description
 * followed by the min/max pair.
 */
struct compat_xfrm_userspi_info {
        struct compat_xfrm_usersa_info info;
        /* 4 bytes additional padding on 64bit */
        __u32 min, max;
};

/*
 * Compat layout of the XFRM_MSG_EXPIRE payload: the compat SA description
 * followed by the one-byte 'hard' field.
 */
struct compat_xfrm_user_expire {
        struct compat_xfrm_usersa_info state;
        /* 8 bytes additional padding on 64bit */
        u8 hard;
};

/*
 * Compat layout of the XFRM_MSG_POLEXPIRE payload: the compat policy
 * description followed by the one-byte 'hard' field.
 */
struct compat_xfrm_user_polexpire {
        struct compat_xfrm_userpolicy_info pol;
        /* 8 bytes additional padding on 64bit */
        u8 hard;
};

/* Size in bytes of 'struct <type>'; @type is the struct tag without 'struct'. */
#define XMSGSIZE(type) sizeof(struct type)

/*
 * Per-message payload size for compat userspace, indexed by
 * (message type - XFRM_MSG_BASE).  Entries whose layout differs between
 * compat and native use the compat_* structs defined in this file; the
 * others use the native struct size directly.
 */
static const int compat_msg_min[XFRM_NR_MSGTYPES] = {
        [XFRM_MSG_NEWSA       - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_usersa_info),
        [XFRM_MSG_DELSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
        [XFRM_MSG_GETSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
        [XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userpolicy_info),
        [XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userspi_info),
        [XFRM_MSG_ACQUIRE     - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_acquire),
        [XFRM_MSG_EXPIRE      - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_expire),
        [XFRM_MSG_UPDPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_userpolicy_info),
        [XFRM_MSG_UPDSA       - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_usersa_info),
        [XFRM_MSG_POLEXPIRE   - XFRM_MSG_BASE] = XMSGSIZE(compat_xfrm_user_polexpire),
        [XFRM_MSG_FLUSHSA     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush),
        [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0,
        [XFRM_MSG_NEWAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
        [XFRM_MSG_GETAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
        [XFRM_MSG_REPORT      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
        [XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
        [XFRM_MSG_NEWSADINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_NEWSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
        [XFRM_MSG_MAPPING     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping)
};

/*
 * Netlink attribute policy for compat messages, indexed by XFRMA_* attribute
 * type.  XFRMA_SA and XFRMA_POLICY are sized from the compat structs defined
 * in this file; every other entry uses a native struct size or a scalar
 * NLA_* type.
 */
static const struct nla_policy compat_policy[XFRMA_MAX+1] = {
        [XFRMA_UNSPEC]          = { .strict_start_type = XFRMA_SA_DIR },
        [XFRMA_SA]                = { .len = XMSGSIZE(compat_xfrm_usersa_info)},
        [XFRMA_POLICY]                = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)},
        [XFRMA_LASTUSED]        = { .type = NLA_U64},
        [XFRMA_ALG_AUTH_TRUNC]        = { .len = sizeof(struct xfrm_algo_auth)},
        [XFRMA_ALG_AEAD]        = { .len = sizeof(struct xfrm_algo_aead) },
        [XFRMA_ALG_AUTH]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ALG_CRYPT]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ALG_COMP]        = { .len = sizeof(struct xfrm_algo) },
        [XFRMA_ENCAP]                = { .len = sizeof(struct xfrm_encap_tmpl) },
        [XFRMA_TMPL]                = { .len = sizeof(struct xfrm_user_tmpl) },
        [XFRMA_SEC_CTX]                = { .len = sizeof(struct xfrm_user_sec_ctx) },
        [XFRMA_LTIME_VAL]        = { .len = sizeof(struct xfrm_lifetime_cur) },
        [XFRMA_REPLAY_VAL]        = { .len = sizeof(struct xfrm_replay_state) },
        [XFRMA_REPLAY_THRESH]        = { .type = NLA_U32 },
        [XFRMA_ETIMER_THRESH]        = { .type = NLA_U32 },
        [XFRMA_SRCADDR]                = { .len = sizeof(xfrm_address_t) },
        [XFRMA_COADDR]                = { .len = sizeof(xfrm_address_t) },
        [XFRMA_POLICY_TYPE]        = { .len = sizeof(struct xfrm_userpolicy_type)},
        [XFRMA_MIGRATE]                = { .len = sizeof(struct xfrm_user_migrate) },
        [XFRMA_KMADDRESS]        = { .len = sizeof(struct xfrm_user_kmaddress) },
        [XFRMA_MARK]                = { .len = sizeof(struct xfrm_mark) },
        [XFRMA_TFCPAD]                = { .type = NLA_U32 },
        [XFRMA_REPLAY_ESN_VAL]        = { .len = sizeof(struct xfrm_replay_state_esn) },
        [XFRMA_SA_EXTRA_FLAGS]        = { .type = NLA_U32 },
        [XFRMA_PROTO]                = { .type = NLA_U8 },
        [XFRMA_ADDRESS_FILTER]        = { .len = sizeof(struct xfrm_address_filter) },
        [XFRMA_OFFLOAD_DEV]        = { .len = sizeof(struct xfrm_user_offload) },
        [XFRMA_SET_MARK]        = { .type = NLA_U32 },
        [XFRMA_SET_MARK_MASK]        = { .type = NLA_U32 },
        [XFRMA_IF_ID]                = { .type = NLA_U32 },
        [XFRMA_MTIMER_THRESH]        = { .type = NLA_U32 },
        [XFRMA_SA_DIR]          = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
        [XFRMA_NAT_KEEPALIVE_INTERVAL]        = { .type = NLA_U32 },
        [XFRMA_SA_PCPU]                = { .type = NLA_U32 },
};

/*
 * Start a compat (32-bit layout) netlink message in @skb that mirrors the
 * native message @nlh_src, and fill in its fixed-size payload.
 *
 * @skb:     buffer the compat message is appended to
 * @nlh_src: native message being translated
 * @type:    nlh_src->nlmsg_type - XFRM_MSG_BASE; used unchecked as an index
 *           into xfrm_msg_min[] and compat_msg_min[], so the caller has to
 *           range-check it first
 *
 * Returns the header of the new message. On failure returns
 * ERR_PTR(-EMSGSIZE) when the message does not fit (or the size tables
 * disagree) and ERR_PTR(-EOPNOTSUPP) for message types that cannot be
 * translated; in both cases nothing is left behind in @skb.
 */
static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb,
                        const struct nlmsghdr *nlh_src, u16 type)
{
        int payload = compat_msg_min[type];
        int src_len = xfrm_msg_min[type];
        struct nlmsghdr *nlh_dst;

        /* Compat messages are shorter or equal to native (+padding) */
        if (WARN_ON_ONCE(src_len < payload))
                return ERR_PTR(-EMSGSIZE);

        nlh_dst = nlmsg_put(skb, nlh_src->nlmsg_pid, nlh_src->nlmsg_seq,
                            nlh_src->nlmsg_type, payload, nlh_src->nlmsg_flags);
        if (!nlh_dst)
                return ERR_PTR(-EMSGSIZE);

        /* Start from zeroes so bytes not copied below never leak stale data */
        memset(nlmsg_data(nlh_dst), 0, payload);

        switch (nlh_src->nlmsg_type) {
        /* Compat message has the same layout as native */
        case XFRM_MSG_DELSA:
        case XFRM_MSG_DELPOLICY:
        case XFRM_MSG_FLUSHSA:
        case XFRM_MSG_FLUSHPOLICY:
        case XFRM_MSG_NEWAE:
        case XFRM_MSG_REPORT:
        case XFRM_MSG_MIGRATE:
        case XFRM_MSG_NEWSADINFO:
        case XFRM_MSG_NEWSPDINFO:
        case XFRM_MSG_MAPPING:
                WARN_ON_ONCE(src_len != payload);
                memcpy(nlmsg_data(nlh_dst), nlmsg_data(nlh_src), src_len);
                break;
        /* 4 byte alignment for trailing u64 on native, but not on compat */
        case XFRM_MSG_NEWSA:
        case XFRM_MSG_NEWPOLICY:
        case XFRM_MSG_UPDSA:
        case XFRM_MSG_UPDPOLICY:
                WARN_ON_ONCE(src_len != payload + 4);
                memcpy(nlmsg_data(nlh_dst), nlmsg_data(nlh_src), payload);
                break;
        case XFRM_MSG_EXPIRE: {
                const struct xfrm_user_expire *src_ue  = nlmsg_data(nlh_src);
                struct compat_xfrm_user_expire *dst_ue = nlmsg_data(nlh_dst);

                /* compat_xfrm_user_expire has 4-byte smaller state */
                memcpy(dst_ue, src_ue, sizeof(dst_ue->state));
                dst_ue->hard = src_ue->hard;
                break;
        }
        case XFRM_MSG_ACQUIRE: {
                const struct xfrm_user_acquire *src_ua  = nlmsg_data(nlh_src);
                struct compat_xfrm_user_acquire *dst_ua = nlmsg_data(nlh_dst);

                /* Leading members share a layout; the tail is copied by field */
                memcpy(dst_ua, src_ua, offsetof(struct compat_xfrm_user_acquire, aalgos));
                dst_ua->aalgos = src_ua->aalgos;
                dst_ua->ealgos = src_ua->ealgos;
                dst_ua->calgos = src_ua->calgos;
                dst_ua->seq    = src_ua->seq;
                break;
        }
        case XFRM_MSG_POLEXPIRE: {
                const struct xfrm_user_polexpire *src_upe  = nlmsg_data(nlh_src);
                struct compat_xfrm_user_polexpire *dst_upe = nlmsg_data(nlh_dst);

                /* compat_xfrm_user_polexpire has 4-byte smaller pol */
                memcpy(dst_upe, src_upe, sizeof(dst_upe->pol));
                dst_upe->hard = src_upe->hard;
                break;
        }
        case XFRM_MSG_ALLOCSPI: {
                const struct xfrm_userspi_info *src_usi = nlmsg_data(nlh_src);
                struct compat_xfrm_userspi_info *dst_usi = nlmsg_data(nlh_dst);

                /*
                 * Size the copy by the destination member, as the EXPIRE and
                 * POLEXPIRE cases do, so that it never spills into ->min.
                 */
                memcpy(dst_usi, src_usi, sizeof(dst_usi->info));
                dst_usi->min = src_usi->min;
                dst_usi->max = src_usi->max;
                break;
        }
        /* Not being sent by kernel */
        case XFRM_MSG_GETSA:
        case XFRM_MSG_GETPOLICY:
        case XFRM_MSG_GETAE:
        case XFRM_MSG_GETSADINFO:
        case XFRM_MSG_GETSPDINFO:
        default:
                pr_warn_once("unsupported nlmsg_type %d\n", nlh_src->nlmsg_type);
                /* Drop the zeroed header we just put so it is never delivered */
                nlmsg_cancel(skb, nlh_dst);
                return ERR_PTR(-EOPNOTSUPP);
        }

        return nlh_dst;
}

/*
 * Append to @dst an attribute with the same type as @src and the first
 * @len bytes of its payload. Returns whatever nla_put() returns.
 */
static int xfrm_nla_cpy(struct sk_buff *dst, const struct nlattr *src, int len)
{
        const void *payload = nla_data(src);
        int attrtype = src->nla_type;

        return nla_put(dst, attrtype, len, payload);
}

static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src)
{
        switch (src->nla_type) {
        case XFRMA_PAD:
                /* Ignore */
                return 0;
        case XFRMA_UNSPEC:
        case XFRMA_ALG_AUTH:
        case XFRMA_ALG_CRYPT:
        case XFRMA_ALG_COMP:
        case XFRMA_ENCAP:
        case XFRMA_TMPL:
                return xfrm_nla_cpy(dst, src, nla_len(src));
        case XFRMA_SA:
                return xfrm_nla_cpy(dst, src, XMSGSIZE(compat_xfrm_usersa_info));
        case XFRMA_POLICY:
                return xfrm_nla_cpy(dst, src, XMSGSIZE(compat_xfrm_userpolicy_info));
        case XFRMA_SEC_CTX:
                return xfrm_nla_cpy(dst, src, nla_len(src));
        case XFRMA_LTIME_VAL:
                return nla_put_64bit(dst, src->nla_type, nla_len(src),
                        nla_data(src), XFRMA_PAD);
        case XFRMA_REPLAY_VAL:
        case XFRMA_REPLAY_THRESH:
        case XFRMA_ETIMER_THRESH:
        case XFRMA_SRCADDR:
        case XFRMA_COADDR:
                return xfrm_nla_cpy(dst, src, nla_len(src));
        case XFRMA_LASTUSED:
                return nla_put_64bit(dst, src->nla_type, nla_len(src),
                        nla_data(src), XFRMA_PAD);
        case XFRMA_POLICY_TYPE:
        case XFRMA_MIGRATE:
        case XFRMA_ALG_AEAD:
        case XFRMA_KMADDRESS:
        case XFRMA_ALG_AUTH_TRUNC:
        case XFRMA_MARK:
        case XFRMA_TFCPAD:
        case XFRMA_REPLAY_ESN_VAL:
        case XFRMA_SA_EXTRA_FLAGS:
        case XFRMA_PROTO:
        case XFRMA_ADDRESS_FILTER:
        case XFRMA_OFFLOAD_DEV:
        case XFRMA_SET_MARK:
        case XFRMA_SET_MARK_MASK:
        case XFRMA_IF_ID:
        case XFRMA_MTIMER_THRESH:
        case XFRMA_SA_DIR:
        case XFRMA_NAT_KEEPALIVE_INTERVAL:
        case XFRMA_SA_PCPU:
        case XFRMA_IPTFS_DROP_TIME:
        case XFRMA_IPTFS_REORDER_WINDOW:
        case XFRMA_IPTFS_DONT_FRAG:
        case XFRMA_IPTFS_INIT_DELAY:
        case XFRMA_IPTFS_MAX_QSIZE:
        case XFRMA_IPTFS_PKT_SIZE:
                return xfrm_nla_cpy(dst, src, nla_len(src));
        default:
                BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE);
                pr_warn_once("unsupported nla_type %d\n", src->nla_type);
                return -EOPNOTSUPP;
        }
}

/*
 * Take kernel-built (64bit layout) and create 32bit layout for userspace.
 *
 * @dst:     buffer receiving the compat message
 * @nlh_src: native message to translate
 *
 * The fixed-size payload is translated by xfrm_nlmsg_put_compat(), then
 * every attribute that follows it is translated one by one.
 *
 * Returns 0 on success or a negative errno. On failure the partially built
 * compat message is removed from @dst again.
 */
static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src)
{
        u16 type = nlh_src->nlmsg_type - XFRM_MSG_BASE;
        const struct nlattr *nla, *attrs;
        struct nlmsghdr *nlh_dst;
        int len, remaining;

        nlh_dst = xfrm_nlmsg_put_compat(dst, nlh_src, type);
        if (IS_ERR(nlh_dst))
                return PTR_ERR(nlh_dst);

        /* Attributes start right after the native fixed-size payload */
        attrs = nlmsg_attrdata(nlh_src, xfrm_msg_min[type]);
        len = nlmsg_attrlen(nlh_src, xfrm_msg_min[type]);

        nla_for_each_attr(nla, attrs, len, remaining) {
                int err;

                switch (nlh_src->nlmsg_type) {
                case XFRM_MSG_NEWSPDINFO:
                        /* Copied verbatim, not run through the XFRMA_* table */
                        err = xfrm_nla_cpy(dst, nla, nla_len(nla));
                        break;
                default:
                        err = xfrm_xlate64_attr(dst, nla);
                        break;
                }
                if (err) {
                        /* Do not leave a truncated message behind in @dst */
                        nlmsg_cancel(dst, nlh_dst);
                        return err;
                }
        }

        nlmsg_end(dst, nlh_dst);

        return 0;
}

/*
 * Build the compat twin of the native message @nlh_src and store it in the
 * frag_list skb of @skb, allocating that skb when @skb has none yet.
 *
 * Returns 0 on success, -EOPNOTSUPP for an out-of-range message type,
 * -ENOMEM when the skb cannot be allocated, or the translation error. An
 * skb allocated by this call is freed and unhooked again on failure.
 */
static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src)
{
        u16 type = nlh_src->nlmsg_type - XFRM_MSG_BASE;
        struct sk_buff *fresh = NULL;
        int err;

        if (type >= ARRAY_SIZE(xfrm_msg_min)) {
                pr_warn_once("unsupported nlmsg_type %d\n", nlh_src->nlmsg_type);
                return -EOPNOTSUPP;
        }

        if (!skb_shinfo(skb)->frag_list) {
                /* Same capacity as the native skb: data plus spare tailroom */
                fresh = alloc_skb(skb->len + skb_tailroom(skb), GFP_ATOMIC);
                if (!fresh)
                        return -ENOMEM;
                skb_shinfo(skb)->frag_list = fresh;
        }

        err = xfrm_xlate64(skb_shinfo(skb)->frag_list, nlh_src);
        if (!err)
                return 0;

        /* Only undo what this call set up; a pre-existing frag_list stays */
        if (fresh) {
                kfree_skb(fresh);
                skb_shinfo(skb)->frag_list = NULL;
        }

        return err;
}

/*
 * Work out the payload length @src will have once translated to the
 * 64-bit layout.
 *
 * @src:     compat message received from userspace
 * @attrs:   attributes parsed from @src, indexed by attribute type
 * @maxtype: non-zero when the caller supplied its own attribute space
 *
 * Returns nlmsg_len(@src) plus the growth of the fixed-size payload and of
 * the XFRMA_SA / XFRMA_POLICY attributes, if present.
 */
static size_t xfrm_user_rcv_calculate_len64(const struct nlmsghdr *src,
                                            struct nlattr *attrs[XFRMA_MAX + 1],
                                            int maxtype)
{
        size_t len = nlmsg_len(src);

        /* attributes are xfrm_spdattr_type_t, not xfrm_attr_type_t */
        if (src->nlmsg_type == XFRM_MSG_NEWSPDINFO)
                return len;

        /* Growth of the fixed-size payload */
        switch (src->nlmsg_type) {
        case XFRM_MSG_EXPIRE:
        case XFRM_MSG_POLEXPIRE:
                len += 8;
                break;
        case XFRM_MSG_NEWSA:
        case XFRM_MSG_UPDSA:
        case XFRM_MSG_NEWPOLICY:
        case XFRM_MSG_UPDPOLICY:
        case XFRM_MSG_ALLOCSPI:
        case XFRM_MSG_ACQUIRE:
                len += 4;
                break;
        default:
                break;
        }

        /* Unexpected for anything, but XFRM_MSG_NEWSPDINFO, please
         * correct both 64=>32-bit and 32=>64-bit translators to copy
         * new attributes.
         */
        if (WARN_ON_ONCE(maxtype))
                return len;

        /* Each of these attributes grows by 4 bytes when translated */
        len += attrs[XFRMA_SA] ? 4 : 0;
        len += attrs[XFRMA_POLICY] ? 4 : 0;

        /* XXX: some attrs may need to be realigned
         * if !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
         */

        return len;
}

/*
 * Copy one attribute into the translated message being assembled in @dst.
 *
 * @dst:      start of the output buffer, which begins with the nlmsghdr
 * @pos:      in/out write offset into @dst
 * @src:      attribute to copy (header included)
 * @size:     total size of the @dst buffer
 * @copy_len: payload bytes taken from @src
 * @payload:  payload length of the emitted attribute; the bytes beyond
 *            @copy_len are zero-filled
 *
 * Returns 0 on success or -ENOBUFS when fewer than nla_attr_size(@payload)
 * bytes are left in @dst. On success *@pos and the nlmsg_len of @dst both
 * grow by nla_attr_size(@payload).
 */
static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src,
                           size_t size, int copy_len, int payload)
{
        struct nlmsghdr *nlmsg = dst;
        struct nlattr *nla;

        /* xfrm_user_rcv_msg_compat() relies on the fact that 32-bit messages
         * have the same len or are shorter than 64-bit ones.
         * 32-bit translation that is bigger than 64-bit original is unexpected.
         */
        if (WARN_ON_ONCE(copy_len > payload))
                copy_len = payload;

        if (size - *pos < nla_attr_size(payload))
                return -ENOBUFS;

        nla = dst + *pos;

        /* Header plus the source payload, then patch the length for padding */
        memcpy(nla, src, nla_attr_size(copy_len));
        nla->nla_len = nla_attr_size(payload);
        *pos += nla_attr_size(copy_len);
        nlmsg->nlmsg_len += nla->nla_len;

        /* Zero the bytes the 64-bit layout has on top of the 32-bit one */
        memset(dst + *pos, 0, payload - copy_len);
        *pos += payload - copy_len;

        return 0;
}

/*
 * Translate one compat attribute @nla into the 64-bit message in @dst.
 *
 * @dst:    start of the output buffer
 * @nla:    attribute taken from the 32-bit message
 * @pos:    in/out write offset into @dst
 * @size:   total size of the @dst buffer
 * @extack: extended ack used to report why an attribute was rejected
 *
 * Attributes whose policy length differs between compat_policy[] and
 * xfrma_policy[] are emitted once, zero-padded to the native length. All
 * other attributes are copied unchanged.
 *
 * Returns 0 on success, -EOPNOTSUPP for an unknown type or a bad length,
 * or -ENOBUFS when @dst is too small.
 */
static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
                             size_t *pos, size_t size,
                             struct netlink_ext_ack *extack)
{
        int type = nla_type(nla);
        u16 pol_len32, pol_len64;

        if (type > XFRMA_MAX) {
                BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE);
                NL_SET_ERR_MSG(extack, "Bad attribute");
                return -EOPNOTSUPP;
        }
        /* Clamp the index under speculation before using it on the tables */
        type = array_index_nospec(type, XFRMA_MAX + 1);
        if (nla_len(nla) < compat_policy[type].len) {
                NL_SET_ERR_MSG(extack, "Attribute bad length");
                return -EOPNOTSUPP;
        }

        pol_len32 = compat_policy[type].len;
        pol_len64 = xfrma_policy[type].len;

        /* XFRMA_SA and XFRMA_POLICY - need to know how-to translate */
        if (pol_len32 != pol_len64) {
                if (nla_len(nla) != compat_policy[type].len) {
                        NL_SET_ERR_MSG(extack, "Attribute bad length");
                        return -EOPNOTSUPP;
                }
                /*
                 * The padded copy is the complete translation. Copying the
                 * attribute a second time would duplicate it and run past
                 * the space xfrm_user_rcv_calculate_len64() accounted for.
                 */
                return xfrm_attr_cpy32(dst, pos, nla, size, pol_len32, pol_len64);
        }

        return xfrm_attr_cpy32(dst, pos, nla, size, nla_len(nla), nla_len(nla));
}

/*
 * Build the 64-bit message @dst from the compat message @src.
 *
 * @dst:     output buffer of @size bytes
 * @src:     compat message received from userspace
 * @attrs:   attributes already parsed from @src, indexed by type
 * @size:    total size of @dst
 * @type:    src->nlmsg_type - XFRM_MSG_BASE, index into the *_msg_min[] tables
 * @maxtype: non-zero when the attributes use a caller-provided type space;
 *           they are then copied without translation
 * @extack:  extended ack for error reporting
 *
 * Returns 0 on success, -EOPNOTSUPP for an unsupported message type or
 * attribute, or -ENOBUFS when @dst is too small.
 */
static int xfrm_xlate32(struct nlmsghdr *dst, const struct nlmsghdr *src,
                        struct nlattr *attrs[XFRMA_MAX+1],
                        size_t size, u8 type, int maxtype,
                        struct netlink_ext_ack *extack)
{
        size_t pos;
        int i;

        /* Reuse the source header, then size it for the native payload */
        memcpy(dst, src, NLMSG_HDRLEN);
        dst->nlmsg_len = NLMSG_HDRLEN + xfrm_msg_min[type];
        memset(nlmsg_data(dst), 0, xfrm_msg_min[type]);

        switch (src->nlmsg_type) {
        /* Compat message has the same layout as native */
        case XFRM_MSG_DELSA:
        case XFRM_MSG_GETSA:
        case XFRM_MSG_DELPOLICY:
        case XFRM_MSG_GETPOLICY:
        case XFRM_MSG_FLUSHSA:
        case XFRM_MSG_FLUSHPOLICY:
        case XFRM_MSG_NEWAE:
        case XFRM_MSG_GETAE:
        case XFRM_MSG_REPORT:
        case XFRM_MSG_MIGRATE:
        case XFRM_MSG_NEWSADINFO:
        case XFRM_MSG_GETSADINFO:
        case XFRM_MSG_NEWSPDINFO:
        case XFRM_MSG_GETSPDINFO:
        case XFRM_MSG_MAPPING:
                memcpy(nlmsg_data(dst), nlmsg_data(src), compat_msg_min[type]);
                break;
        /* 4 byte alignment for trailing u64 on native, but not on compat */
        case XFRM_MSG_NEWSA:
        case XFRM_MSG_NEWPOLICY:
        case XFRM_MSG_UPDSA:
        case XFRM_MSG_UPDPOLICY:
                memcpy(nlmsg_data(dst), nlmsg_data(src), compat_msg_min[type]);
                break;
        case XFRM_MSG_EXPIRE: {
                const struct compat_xfrm_user_expire *src_ue = nlmsg_data(src);
                struct xfrm_user_expire *dst_ue = nlmsg_data(dst);

                /* compat_xfrm_user_expire has 4-byte smaller state */
                memcpy(dst_ue, src_ue, sizeof(src_ue->state));
                dst_ue->hard = src_ue->hard;
                break;
        }
        case XFRM_MSG_ACQUIRE: {
                const struct compat_xfrm_user_acquire *src_ua = nlmsg_data(src);
                struct xfrm_user_acquire *dst_ua = nlmsg_data(dst);

                /* Bulk-copy up to aalgos, the remaining members go one by one */
                memcpy(dst_ua, src_ua, offsetof(struct compat_xfrm_user_acquire, aalgos));
                dst_ua->aalgos = src_ua->aalgos;
                dst_ua->ealgos = src_ua->ealgos;
                dst_ua->calgos = src_ua->calgos;
                dst_ua->seq    = src_ua->seq;
                break;
        }
        case XFRM_MSG_POLEXPIRE: {
                const struct compat_xfrm_user_polexpire *src_upe = nlmsg_data(src);
                struct xfrm_user_polexpire *dst_upe = nlmsg_data(dst);

                /* compat_xfrm_user_polexpire has 4-byte smaller pol */
                memcpy(dst_upe, src_upe, sizeof(src_upe->pol));
                dst_upe->hard = src_upe->hard;
                break;
        }
        case XFRM_MSG_ALLOCSPI: {
                const struct compat_xfrm_userspi_info *src_usi = nlmsg_data(src);
                struct xfrm_userspi_info *dst_usi = nlmsg_data(dst);

                /* Copy the compat-sized info, then min/max field by field */
                memcpy(dst_usi, src_usi, sizeof(src_usi->info));
                dst_usi->min = src_usi->min;
                dst_usi->max = src_usi->max;
                break;
        }
        default:
                NL_SET_ERR_MSG(extack, "Unsupported message type");
                return -EOPNOTSUPP;
        }
        /* Attributes are appended right after the fixed-size payload */
        pos = dst->nlmsg_len;

        if (maxtype) {
                /* attributes are xfrm_spdattr_type_t, not xfrm_attr_type_t */
                WARN_ON_ONCE(src->nlmsg_type != XFRM_MSG_NEWSPDINFO);

                for (i = 1; i <= maxtype; i++) {
                        int err;

                        if (!attrs[i])
                                continue;

                        /* just copy - no need for translation */
                        err = xfrm_attr_cpy32(dst, &pos, attrs[i], size,
                                        nla_len(attrs[i]), nla_len(attrs[i]));
                        if (err)
                                return err;
                }
                return 0;
        }

        for (i = 1; i < XFRMA_MAX + 1; i++) {
                int err;

                /* Padding attributes are never carried over */
                if (i == XFRMA_PAD)
                        continue;

                if (!attrs[i])
                        continue;

                err = xfrm_xlate32_attr(dst, attrs[i], &pos, size, extack);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Translate a message received from a 32-bit sender into the 64-bit layout.
 *
 * @h32:     compat message
 * @maxtype: highest attribute type to parse, 0 selects XFRMA_MAX
 * @policy:  attribute policy to validate with, NULL selects compat_policy
 * @extack:  extended ack for error reporting
 *
 * Returns NULL when @h32 can be used as is, a kvmalloc()ed translated
 * message on success, or an ERR_PTR() on failure.
 */
static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32,
                        int maxtype, const struct nla_policy *policy,
                        struct netlink_ext_ack *extack)
{
        /* netlink_rcv_skb() checks if a message has full (struct nlmsghdr) */
        u16 type = h32->nlmsg_type - XFRM_MSG_BASE;
        struct nlattr *tb[XFRMA_MAX+1];
        struct nlmsghdr *xlated;
        size_t len64;
        int rc;

        BUILD_BUG_ON(ARRAY_SIZE(xfrm_msg_min) != ARRAY_SIZE(compat_msg_min));

        if (type >= ARRAY_SIZE(xfrm_msg_min))
                return ERR_PTR(-EINVAL);

        /* Don't call parse: the message might have only nlmsg header */
        if (h32->nlmsg_flags & NLM_F_DUMP) {
                if (h32->nlmsg_type == XFRM_MSG_GETSA ||
                    h32->nlmsg_type == XFRM_MSG_GETPOLICY)
                        return NULL;
        }

        rc = nlmsg_parse_deprecated(h32, compat_msg_min[type], tb,
                                    maxtype ? maxtype : XFRMA_MAX,
                                    policy ? policy : compat_policy,
                                    extack);
        if (rc < 0)
                return ERR_PTR(rc);

        len64 = xfrm_user_rcv_calculate_len64(h32, tb, maxtype);
        /* The message doesn't need translation */
        if (len64 == nlmsg_len(h32))
                return NULL;

        /* The computed length covers the payload only; add the header */
        len64 += NLMSG_HDRLEN;
        xlated = kvmalloc(len64, GFP_KERNEL);
        if (!xlated)
                return ERR_PTR(-ENOMEM);

        rc = xfrm_xlate32(xlated, h32, tb, len64, type, maxtype, extack);
        if (rc >= 0)
                return xlated;

        kvfree(xlated);
        return ERR_PTR(rc);
}

/*
 * Convert a compat policy blob into the native layout.
 *
 * @pdata32: in/out pointer to the blob; on success the old buffer is freed
 *           and replaced with a new one that is 4 bytes longer
 * @optlen:  length of the compat blob
 *
 * The leading compat_xfrm_userpolicy_info is followed by 4 zero bytes in
 * the new buffer, and whatever trails it is moved up accordingly.
 *
 * Returns 0 on success, -EINVAL when @optlen is too short for the policy
 * info, or -ENOMEM.
 */
static int xfrm_user_policy_compat(u8 **pdata32, int optlen)
{
        const size_t info_len = sizeof(struct compat_xfrm_userpolicy_info);
        u8 *data32 = *pdata32;
        u8 *data64;

        if (optlen < info_len)
                return -EINVAL;

        data64 = kmalloc_track_caller(optlen + 4, GFP_USER | __GFP_NOWARN);
        if (!data64)
                return -ENOMEM;

        /* Policy info first, then the 4 bytes the native layout adds */
        memcpy(data64, data32, info_len);
        memset(data64 + info_len, 0, 4);

        /* Everything after the policy info moves 4 bytes further out */
        memcpy(data64 + info_len + 4, data32 + info_len, optlen - info_len);

        kfree(data32);
        *pdata32 = data64;
        return 0;
}

/*
 * Translator callbacks provided by this module. Registered at module init
 * and unregistered at module exit.
 */
static struct xfrm_translator xfrm_translator = {
        .owner                                = THIS_MODULE,
        /* native message => compat copy */
        .alloc_compat                        = xfrm_alloc_compat,
        /* compat message => native layout */
        .rcv_msg_compat                        = xfrm_user_rcv_msg_compat,
        /* compat policy blob => native layout */
        .xlate_user_policy_sockptr        = xfrm_user_policy_compat,
};

/* Module entry point: register the compat translator callbacks. */
static int __init xfrm_compat_init(void)
{
        int err = xfrm_register_translator(&xfrm_translator);

        return err;
}

/* Module exit point: unregister the compat translator callbacks. */
static void __exit xfrm_compat_exit(void)
{
        xfrm_unregister_translator(&xfrm_translator);
}

/* Hook the init/exit functions into module load/unload and describe it */
module_init(xfrm_compat_init);
module_exit(xfrm_compat_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dmitry Safonov");
MODULE_DESCRIPTION("XFRM 32-bit compatibility layer");





































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * linux/ipc/util.h
 * Copyright (C) 1999 Christoph Rohland
 *
 * ipc helper functions (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * namespaces support.      2006 OpenVZ, SWsoft Inc.
 *                               Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _IPC_UTIL_H
#define _IPC_UTIL_H

#include <linux/unistd.h>
#include <linux/err.h>
#include <linux/ipc_namespace.h>
#include <linux/pid.h>

/*
 * The IPC ID contains 2 separate numbers - index and sequence number.
 * By default,
 *   bits  0-14: index (32k, 15 bits)
 *   bits 15-30: sequence number (64k, 16 bits)
 *
 * When IPCMNI extension mode is turned on, the composition changes:
 *   bits  0-23: index (16M, 24 bits)
 *   bits 24-30: sequence number (128, 7 bits)
 */
/* Number of index bits in default and in extension mode */
#define IPCMNI_SHIFT                15
#define IPCMNI_EXTEND_SHIFT        24
#define IPCMNI_EXTEND_MIN_CYCLE        (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
/* Number of distinct index values in default and in extension mode */
#define IPCMNI                        (1 << IPCMNI_SHIFT)
#define IPCMNI_EXTEND                (1 << IPCMNI_EXTEND_SHIFT)

#ifdef CONFIG_SYSVIPC_SYSCTL
/* With the sysctl interface these are variables defined elsewhere */
extern int ipc_mni;
extern int ipc_mni_shift;
extern int ipc_min_cycle;

#define ipcmni_seq_shift()        ipc_mni_shift
#define IPCMNI_IDX_MASK                ((1 << ipc_mni_shift) - 1)

#else /* CONFIG_SYSVIPC_SYSCTL */

/* Without the sysctl interface the default-mode constants are used */
#define ipc_mni                        IPCMNI
#define ipc_min_cycle                ((int)RADIX_TREE_MAP_SIZE)
#define ipcmni_seq_shift()        IPCMNI_SHIFT
#define IPCMNI_IDX_MASK                ((1 << IPCMNI_SHIFT) - 1)
#endif /* CONFIG_SYSVIPC_SYSCTL */

void sem_init(void);
void msg_init(void);
void shm_init(void);

struct ipc_namespace;
struct pid_namespace;

/* Implemented elsewhere with CONFIG_POSIX_MQUEUE, otherwise a no-op stub */
#ifdef CONFIG_POSIX_MQUEUE
extern void mq_clear_sbinfo(struct ipc_namespace *ns);
#else
static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
#endif

/*
 * Per-namespace init/exit hooks for semaphores, message queues and shared
 * memory. Without CONFIG_SYSVIPC they are empty stubs, and msg_init_ns()
 * reports success.
 */
#ifdef CONFIG_SYSVIPC
void sem_init_ns(struct ipc_namespace *ns);
int msg_init_ns(struct ipc_namespace *ns);
void shm_init_ns(struct ipc_namespace *ns);

void sem_exit_ns(struct ipc_namespace *ns);
void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
static inline void sem_init_ns(struct ipc_namespace *ns) { }
static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; }
static inline void shm_init_ns(struct ipc_namespace *ns) { }

static inline void sem_exit_ns(struct ipc_namespace *ns) { }
static inline void msg_exit_ns(struct ipc_namespace *ns) { }
static inline void shm_exit_ns(struct ipc_namespace *ns) { }
#endif

/*
 * Structure that holds the parameters needed by the ipc operations
 * (see after)
 */
struct ipc_params {
        key_t key;        /* presumably the user-supplied IPC key - confirm */
        int flg;        /* presumably the *get() flags argument - confirm */
        union {
                size_t size;        /* for shared memories */
                int nsems;        /* for semaphores */
        } u;                        /* holds the getnew() specific param */
};

/*
 * Structure that holds some ipc operations. This structure is used to unify
 * the calls to sys_msgget(), sys_semget(), sys_shmget()
 *      . routine to call to create a new ipc object. Can be one of newque,
 *        newary, newseg
 *      . routine to call to check permissions for a new ipc object.
 *        Can be one of security_msg_associate, security_sem_associate,
 *        security_shm_associate
 *      . routine to call for an extra check if needed
 */
struct ipc_ops {
        /* creates a new ipc object */
        int (*getnew)(struct ipc_namespace *, struct ipc_params *);
        /* permission check */
        int (*associate)(struct kern_ipc_perm *, int);
        /* optional extra check */
        int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *);
};

struct seq_file;
struct ipc_ids;

void ipc_init_ids(struct ipc_ids *ids);
#ifdef CONFIG_PROC_FS
void __init ipc_init_proc_interface(const char *path, const char *header,
                int ids, int (*show)(struct seq_file *, void *));
struct pid_namespace *ipc_seq_pid_ns(struct seq_file *);
#else
#define ipc_init_proc_interface(path, header, ids, show) do {} while (0)
#endif

/* Numeric selectors for the semaphore, message queue and shm id sets */
#define IPC_SEM_IDS        0
#define IPC_MSG_IDS        1
#define IPC_SHM_IDS        2

/* Index part of an id: the low bits covered by IPCMNI_IDX_MASK */
#define ipcid_to_idx(id)  ((id) & IPCMNI_IDX_MASK)
/* Sequence number part of an id: the bits above the index */
#define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift())
/* Largest sequence number: INT_MAX with the index bits shifted out */
#define ipcid_seq_max()          (INT_MAX >> ipcmni_seq_shift())

/* must be called with ids->rwsem acquired for writing */
int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);

/* must be called with both locks acquired. */
void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *);

/* must be called with both locks acquired. */
void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *);

/* must be called with ipcp locked */
int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);

/**
 * ipc_get_maxidx - get the highest assigned index
 * @ids: ipc identifier set
 *
 * Returns -1 when no id is in use, ipc_mni - 1 when every id is in use,
 * and the cached @ids->max_idx otherwise. The idr tree is not scanned.
 *
 * Called with ipc_ids.rwsem held for reading.
 */
static inline int ipc_get_maxidx(struct ipc_ids *ids)
{
        /* Empty set: there is no highest index */
        if (!ids->in_use)
                return -1;

        /* A full set ends at the last index, otherwise trust the cache */
        return (ids->in_use == ipc_mni) ? ipc_mni - 1 : ids->max_idx;
}

/*
 * For allocations that need to be freed by RCU.
 * Objects are reference counted, they start with reference count 1.
 * getref increases the refcount, the putref call that reduces the refcount
 * to 0 schedules the rcu destruction. Caller must guarantee locking.
 *
 * refcount is initialized by ipc_addid(), before that point call_rcu()
 * must be used.
 */
bool ipc_rcu_getref(struct kern_ipc_perm *ptr);
void ipc_rcu_putref(struct kern_ipc_perm *ptr,
                        void (*func)(struct rcu_head *head));

struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id);

void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns,
                                             struct ipc_ids *ids, int id, int cmd,
                                             struct ipc64_perm *perm, int extra_perm);

/*
 * Make *@pos refer to @pid. A reference on @pid is taken before the one on
 * the previous value is dropped; nothing happens when both are the same.
 */
static inline void ipc_update_pid(struct pid **pos, struct pid *pid)
{
        struct pid *prev = *pos;

        if (prev == pid)
                return;

        *pos = get_pid(pid);
        put_pid(prev);
}

#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
int ipc_parse_version(int *cmd);
#endif

extern void free_msg(struct msg_msg *msg);
extern struct msg_msg *load_msg(const void __user *src, size_t len);
extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst);
extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len);

/*
 * Compare the sequence number encoded in @id with the one stored in @ipcp.
 * Returns 0 when they match and 1 when they differ.
 */
static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int id)
{
        if (ipcid_to_seqx(id) == ipcp->seq)
                return 0;

        return 1;
}

/* Take the spinlock embedded in @perm. */
static inline void ipc_lock_object(struct kern_ipc_perm *perm)
{
        spin_lock(&perm->lock);
}

/* Release the spinlock embedded in @perm. */
static inline void ipc_unlock_object(struct kern_ipc_perm *perm)
{
        spin_unlock(&perm->lock);
}

/* Assert that the spinlock embedded in @perm is currently held. */
static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm)
{
        assert_spin_locked(&perm->lock);
}

/*
 * Release the object lock of @perm and then leave the RCU read-side
 * critical section; the caller must have entered both.
 */
static inline void ipc_unlock(struct kern_ipc_perm *perm)
{
        ipc_unlock_object(perm);
        rcu_read_unlock();
}

/*
 * ipc_valid_object() - helper to sort out IPC_RMID races for codepaths
 * where the respective ipc_ids.rwsem is not being held down.
 * Checks whether the ipc object is still around or if it's gone already, as
 * ipc_rmid() may have already freed the ID while the ipc lock was spinning.
 * Needs to be called with kern_ipc_perm.lock held -- exception made for one
 * checkpoint case at sys_semtimedop() as noted in code commentary.
 */
static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
{
        return !perm->deleted;
}

struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
                        const struct ipc_ops *ops, struct ipc_params *params);
void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
                void (*free)(struct ipc_namespace *, struct kern_ipc_perm *));

/*
 * Validate the semmni limit of @ns, which is the last element of the
 * sem_ctls[4] array. Returns 0 when it lies in [0, ipc_mni] and -ERANGE
 * otherwise.
 */
static inline int sem_check_semmni(struct ipc_namespace *ns) {
        if (ns->sem_ctls[3] < 0)
                return -ERANGE;

        if (ns->sem_ctls[3] > ipc_mni)
                return -ERANGE;

        return 0;
}

#ifdef CONFIG_COMPAT
#include <linux/compat.h>
/*
 * Permission structure in the layout used by compat tasks; converted
 * to and from struct ipc64_perm by the helpers declared below.
 */
struct compat_ipc_perm {
        key_t key;
        __compat_uid_t uid;
        __compat_gid_t gid;
        __compat_uid_t cuid;
        __compat_gid_t cgid;
        compat_mode_t mode;
        unsigned short seq;
};

void to_compat_ipc_perm(struct compat_ipc_perm *, struct ipc64_perm *);
void to_compat_ipc64_perm(struct compat_ipc64_perm *, struct ipc64_perm *);
int get_compat_ipc_perm(struct ipc64_perm *, struct compat_ipc_perm __user *);
int get_compat_ipc64_perm(struct ipc64_perm *,
                          struct compat_ipc64_perm __user *);

/*
 * Strip the IPC_64 flag from *@cmd.
 * Returns the flag as it was set in *@cmd before stripping, 0 if it was
 * not set.
 */
static inline int compat_ipc_parse_version(int *cmd)
{
        const int version = *cmd & IPC_64;

        *cmd = *cmd & ~IPC_64;

        return version;
}

long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg);
long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr);
long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz,
                        compat_long_t msgtyp, int msgflg);
long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp,
                       compat_ssize_t msgsz, int msgflg);
long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr);

#endif

#endif

























































  303 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This file provides wrappers with sanitizer instrumentation for bit
 * locking operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H

#include <linux/instrumented.h>

/**
 * clear_bit_unlock - Clear a bit in memory, for unlock
 * @nr: the bit to clear
 * @addr: the address to start counting from
 *
 * This operation is atomic and provides release barrier semantics.
 */
static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{
        /* Sanitizer hooks run before the arch implementation */
        kcsan_release();
        /* Report an atomic write to the word that holds bit @nr */
        instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
        arch_clear_bit_unlock(nr, addr);
}

/**
 * __clear_bit_unlock - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * This is a non-atomic operation but implies a release barrier before the
 * memory operation. It can be used for an unlock if no other CPUs can
 * concurrently modify other bits in the word.
 */
static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
{
        /* Sanitizer hooks run before the arch implementation */
        kcsan_release();
        /* Plain (non-atomic) write to the word that holds bit @nr */
        instrument_write(addr + BIT_WORD(nr), sizeof(long));
        arch___clear_bit_unlock(nr, addr);
}

/**
 * test_and_set_bit_lock - Set a bit and return its old value, for lock
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * This operation is atomic and provides acquire barrier semantics if
 * the returned value is 0.
 * It can be used to implement bit locks.
 *
 * Return: whatever arch_test_and_set_bit_lock() reports, i.e. the old value
 * of the bit.
 */
static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
        /*
         * Reported as an atomic read-modify-write. Unlike the unlock helpers
         * in this file, no kcsan_release() is issued here.
         */
        instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
        return arch_test_and_set_bit_lock(nr, addr);
}

/**
 * xor_unlock_is_negative_byte - XOR a single byte in memory and test if
 * it is negative, for unlock.
 * @mask: Change the bits which are set in this mask.
 * @addr: The address of the word containing the byte to change.
 *
 * Changes some of bits 0-6 in the word pointed to by @addr.
 * This operation is atomic and provides release barrier semantics.
 * Used to optimise some folio operations which are commonly paired
 * with an unlock or end of writeback.  Bit 7 is used as PG_waiters to
 * indicate whether anybody is waiting for the unlock.
 *
 * Return: Whether the top bit of the byte is set.
 */
static inline bool xor_unlock_is_negative_byte(unsigned long mask,
                volatile unsigned long *addr)
{
        kcsan_release();
        /*
         * There is no bit number here, so the instrumented range is the
         * long at @addr itself rather than addr + BIT_WORD(nr).
         */
        instrument_atomic_write(addr, sizeof(long));
        return arch_xor_unlock_is_negative_byte(mask, addr);
}
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */






















































































































































































































































































































































































































































































  320 
  319 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PREEMPT_H
#define __LINUX_PREEMPT_H

/*
 * include/linux/preempt.h - macros for accessing and manipulating
 * preempt_count (used for kernel preemption, interrupt count, etc.)
 */

#include <linux/linkage.h>
#include <linux/cleanup.h>
#include <linux/types.h>

/*
 * We put the hardirq and softirq counter into the preemption
 * counter. The bitmask has the following meaning:
 *
 * - bits 0-7 are the preemption count (max preemption depth: 256)
 * - bits 8-15 are the softirq count (max # of softirqs: 256)
 * - bits 16-19 are the hardirq count
 * - bits 20-23 are the NMI count
 *
 * The hardirq count could in theory be the same as the number of
 * interrupts in the system, but we run all interrupt handlers with
 * interrupts disabled, so we cannot have nesting interrupts. Though
 * there are a few palaeontologic drivers which reenable interrupts in
 * the handler, so we need more than one bit here.
 *
 *         PREEMPT_MASK:        0x000000ff
 *         SOFTIRQ_MASK:        0x0000ff00
 *         HARDIRQ_MASK:        0x000f0000
 *             NMI_MASK:        0x00f00000
 * PREEMPT_NEED_RESCHED:        0x80000000
 */
/* Width, in bits, of each counter field. */
#define PREEMPT_BITS        8
#define SOFTIRQ_BITS        8
#define HARDIRQ_BITS        4
#define NMI_BITS        4

/* Position of each field's least significant bit; fields are packed back to back. */
#define PREEMPT_SHIFT        0
#define SOFTIRQ_SHIFT        (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT        (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT        (HARDIRQ_SHIFT + HARDIRQ_BITS)

/* Value with the low @x bits set. */
#define __IRQ_MASK(x)        ((1UL << (x))-1)

/* Mask selecting each field inside the counter. */
#define PREEMPT_MASK        (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK        (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK        (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK        (__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)

/* Value that represents a count of one in each field. */
#define PREEMPT_OFFSET        (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET        (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET        (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET        (1UL << NMI_SHIFT)

/*
 * Twice SOFTIRQ_OFFSET: adding or subtracting this value changes the softirq
 * field without altering the SOFTIRQ_OFFSET bit itself, which is the bit
 * in_serving_softirq() tests.
 */
#define SOFTIRQ_DISABLE_OFFSET        (2 * SOFTIRQ_OFFSET)

/*
 * PREEMPT_DISABLE_OFFSET is defined further down in this header.
 * PREEMPT_ENABLED is not defined here; presumably it comes from
 * <asm/preempt.h> -- TODO confirm.
 */
#define PREEMPT_DISABLED        (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

/*
 * Disable preemption until the scheduler is running -- use an unconditional
 * value so that it also works on !PREEMPT_COUNT kernels.
 *
 * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 */
#define INIT_PREEMPT_COUNT        PREEMPT_OFFSET

/*
 * Initial preempt_count value; reflects the preempt_count schedule invariant
 * which states that during context switches:
 *
 *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 *
 * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 * Note: See finish_task_switch().
 */
#define FORK_PREEMPT_COUNT        (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
#include <asm/preempt.h>

/**
 * interrupt_context_level - return interrupt context level
 *
 * The level is the number of the following tests that succeed on the
 * current preempt_count(): NMI bits set; NMI or hardirq bits set; NMI,
 * hardirq or the SOFTIRQ_OFFSET bit set.
 *
 * Return: the current interrupt context level.
 *  0 - normal context
 *  1 - softirq context
 *  2 - hardirq context
 *  3 - NMI context
 */
static __always_inline unsigned char interrupt_context_level(void)
{
        const unsigned long count = preempt_count();
        const unsigned long nmi_bits = NMI_MASK;
        const unsigned long hardirq_bits = nmi_bits | HARDIRQ_MASK;
        const unsigned long softirq_bits = hardirq_bits | SOFTIRQ_OFFSET;

        /* Each test contributes exactly 0 or 1, so the sum is in 0..3. */
        return !!(count & nmi_bits) +
               !!(count & hardirq_bits) +
               !!(count & softirq_bits);
}

/*
 * These macro definitions avoid redundant invocations of preempt_count()
 * because such invocations would result in redundant loads given that
 * preempt_count() is commonly implemented with READ_ONCE().
 *
 * Each macro yields the masked, unshifted field(s), so a non-zero result
 * means the corresponding count is non-zero. With CONFIG_PREEMPT_RT the
 * softirq field is read from current->softirq_disable_cnt instead of
 * preempt_count().
 */

#define nmi_count()        (preempt_count() & NMI_MASK)
#define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
#ifdef CONFIG_PREEMPT_RT
# define softirq_count()        (current->softirq_disable_cnt & SOFTIRQ_MASK)
# define irq_count()                ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
#else
# define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
# define irq_count()                (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
#endif

/*
 * Macros to retrieve the current execution context:
 *
 * in_nmi()                - We're in NMI context
 * in_hardirq()                - We're in hard IRQ context
 * in_serving_softirq()        - We're in softirq context
 * in_task()                - We're in task context
 *
 * in_serving_softirq() looks only at the SOFTIRQ_OFFSET bit of the softirq
 * count, not at the whole SOFTIRQ_MASK field. in_task() is true when none
 * of the NMI bits, hardirq bits or that SOFTIRQ_OFFSET bit is set.
 */
#define in_nmi()                (nmi_count())
#define in_hardirq()                (hardirq_count())
#define in_serving_softirq()        (softirq_count() & SOFTIRQ_OFFSET)
#ifdef CONFIG_PREEMPT_RT
# define in_task()                (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
#else
# define in_task()                (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
#endif

/*
 * The following macros are deprecated and should not be used in new code:
 * in_irq()       - Obsolete version of in_hardirq()
 * in_softirq()   - We have BH disabled, or are processing softirqs
 * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled
 *
 * They expand to the raw count macros, so their value is the masked field
 * rather than a strict 0/1.
 */
#define in_irq()                (hardirq_count())
#define in_softirq()                (softirq_count())
#define in_interrupt()                (irq_count())

/*
 * The preempt_count offset after preempt_disable(): one PREEMPT_OFFSET when
 * CONFIG_PREEMPT_COUNT is set, otherwise 0 (preempt_disable() is then only
 * a barrier() and the count is not touched).
 */
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_DISABLE_OFFSET        PREEMPT_OFFSET
#else
# define PREEMPT_DISABLE_OFFSET        0
#endif

/*
 * The preempt_count offset after spin_lock()
 */
#if !defined(CONFIG_PREEMPT_RT)
#define PREEMPT_LOCK_OFFSET                PREEMPT_DISABLE_OFFSET
#else
/* Locks on RT do not disable preemption */
#define PREEMPT_LOCK_OFFSET                0
#endif

/*
 * The preempt_count offset needed for things like:
 *
 *  spin_lock_bh()
 *
 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
 * softirqs, such that unlock sequences of:
 *
 *  spin_unlock();
 *  local_bh_enable();
 *
 * Work as expected.
 *
 * It is the sum of the softirq-disable offset and the spin_lock() offset
 * defined above.
 */
#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)

/*
 * Are we running in atomic context?  WARNING: this macro cannot
 * always detect atomic context; in particular, it cannot know about
 * held spinlocks in non-preemptible kernels.  Thus it should not be
 * used in the general case to determine whether sleeping is possible.
 * Do not use in_atomic() in driver code.
 */
#define in_atomic()        (preempt_count() != 0)

/*
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)

/*
 * With CONFIG_DEBUG_PREEMPT or CONFIG_TRACE_PREEMPT_TOGGLE the add/sub
 * operations are out-of-line functions declared here, and dec-and-test is
 * built from preempt_count_sub(1) followed by should_resched(0). Otherwise
 * all three map straight onto the __preempt_count_*() primitives, which are
 * not defined in this header (presumably <asm/preempt.h> -- TODO confirm).
 */
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
extern void preempt_count_add(int val);
extern void preempt_count_sub(int val);
#define preempt_count_dec_and_test() \
        ({ preempt_count_sub(1); should_resched(0); })
#else
#define preempt_count_add(val)        __preempt_count_add(val)
#define preempt_count_sub(val)        __preempt_count_sub(val)
#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
#endif

/* Step-by-one shorthands that always use the __preempt_count_*() primitives. */
#define __preempt_count_inc() __preempt_count_add(1)
#define __preempt_count_dec() __preempt_count_sub(1)

/* Step-by-one shorthands that go through preempt_count_add()/sub() above. */
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)

#ifdef CONFIG_PREEMPT_COUNT

/* Increment the count first, then a compiler barrier. */
#define preempt_disable() \
do { \
        preempt_count_inc(); \
        barrier(); \
} while (0)

/* Compiler barrier, then decrement; never calls into the scheduler. */
#define sched_preempt_enable_no_resched() \
do { \
        barrier(); \
        preempt_count_dec(); \
} while (0)

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()

/* True only when the whole count is zero and interrupts are enabled. */
#define preemptible()        (preempt_count() == 0 && !irqs_disabled())

#ifdef CONFIG_PREEMPTION
/*
 * Decrement the count and call __preempt_schedule() when
 * preempt_count_dec_and_test() reports true.
 */
#define preempt_enable() \
do { \
        barrier(); \
        if (unlikely(preempt_count_dec_and_test())) \
                __preempt_schedule(); \
} while (0)

/*
 * Same shape as preempt_enable(), but built on the __preempt_count_*()
 * primitive and __preempt_schedule_notrace().
 */
#define preempt_enable_notrace() \
do { \
        barrier(); \
        if (unlikely(__preempt_count_dec_and_test())) \
                __preempt_schedule_notrace(); \
} while (0)

/* Call __preempt_schedule() if should_resched(0) is true; the count is untouched. */
#define preempt_check_resched() \
do { \
        if (should_resched(0)) \
                __preempt_schedule(); \
} while (0)

#else /* !CONFIG_PREEMPTION */
/* Without CONFIG_PREEMPTION enabling only decrements; no scheduler call. */
#define preempt_enable() \
do { \
        barrier(); \
        preempt_count_dec(); \
} while (0)

#define preempt_enable_notrace() \
do { \
        barrier(); \
        __preempt_count_dec(); \
} while (0)

#define preempt_check_resched() do { } while (0)
#endif /* CONFIG_PREEMPTION */

/*
 * The _notrace variants use __preempt_count_inc()/__preempt_count_dec()
 * directly instead of the preempt_count_inc()/dec() wrappers, which may be
 * out-of-line functions under CONFIG_DEBUG_PREEMPT or
 * CONFIG_TRACE_PREEMPT_TOGGLE.
 */
#define preempt_disable_notrace() \
do { \
        __preempt_count_inc(); \
        barrier(); \
} while (0)

#define preempt_enable_no_resched_notrace() \
do { \
        barrier(); \
        __preempt_count_dec(); \
} while (0)

#else /* !CONFIG_PREEMPT_COUNT */

/*
 * Even if we don't have any preemption, we need preempt disable/enable
 * to be barriers, so that we don't have things like get_user/put_user
 * that can cause faults and scheduling migrate into our preempt-protected
 * region.
 *
 * Every disable/enable variant is therefore a bare barrier() in this
 * configuration, preempt_check_resched() is empty, and preemptible() is
 * hard-wired to 0.
 */
#define preempt_disable()                        barrier()
#define sched_preempt_enable_no_resched()        barrier()
#define preempt_enable_no_resched()                barrier()
#define preempt_enable()                        barrier()
#define preempt_check_resched()                        do { } while (0)

#define preempt_disable_notrace()                barrier()
#define preempt_enable_no_resched_notrace()        barrier()
#define preempt_enable_notrace()                barrier()
#define preemptible()                                0

#endif /* CONFIG_PREEMPT_COUNT */

#ifdef MODULE
/*
 * Modules have no business playing preemption tricks.
 *
 * The no-resched enable variants and preempt_check_resched() defined above
 * are removed again, so they are not available to code built with MODULE
 * defined.
 */
#undef sched_preempt_enable_no_resched
#undef preempt_enable_no_resched
#undef preempt_enable_no_resched_notrace
#undef preempt_check_resched
#endif /* MODULE */

/* Statement wrapper around set_preempt_need_resched(). */
#define preempt_set_need_resched() \
do { \
        set_preempt_need_resched(); \
} while (0)
/* Call set_preempt_need_resched() only when tif_need_resched() is true. */
#define preempt_fold_need_resched() \
do { \
        if (tif_need_resched()) \
                set_preempt_need_resched(); \
} while (0)

#ifdef CONFIG_PREEMPT_NOTIFIERS

struct preempt_notifier;
struct task_struct;

/**
 * struct preempt_ops - notifiers called when a task is preempted and rescheduled
 * @sched_in: we're about to be rescheduled:
 *    notifier: struct preempt_notifier for the task being scheduled
 *    cpu:  cpu we're scheduled on
 * @sched_out: we've just been preempted
 *    notifier: struct preempt_notifier for the task being preempted
 *    next: the task that's kicking us out
 *
 * Please note that sched_in and out are called under different
 * contexts.  sched_out is called with rq lock held and irq disabled
 * while sched_in is called without rq lock and irq enabled.  This
 * difference is intentional and depended upon by its users.
 */
struct preempt_ops {
        void (*sched_in)(struct preempt_notifier *notifier, int cpu);
        void (*sched_out)(struct preempt_notifier *notifier,
                          struct task_struct *next);
};

/**
 * struct preempt_notifier - key for installing preemption notifiers
 * @link: internal use
 * @ops: defines the notifier functions to be called
 *
 * Usually used in conjunction with container_of().
 */
struct preempt_notifier {
        struct hlist_node link;
        struct preempt_ops *ops;
};

/*
 * Preempt notifier API; only declared here, the definitions are not part of
 * this header.
 * NOTE(review): judging by the names, _inc/_dec adjust a usage count for the
 * notifier facility and _register/_unregister attach or detach one notifier
 * -- confirm against the implementation.
 */
void preempt_notifier_inc(void);
void preempt_notifier_dec(void);
void preempt_notifier_register(struct preempt_notifier *notifier);
void preempt_notifier_unregister(struct preempt_notifier *notifier);

static inline void preempt_notifier_init(struct preempt_notifier *notifier,
                                     struct preempt_ops *ops)
{
        /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */
        notifier->link.next = NULL;
        notifier->link.pprev = NULL;
        notifier->ops = ops;
}

#endif

/*
 * Migrate-Disable and why it is undesired.
 *
 * When a preempted task becomes eligible to run under the ideal model (IOW it
 * becomes one of the M highest priority tasks), it might still have to wait
 * for the preemptee's migrate_disable() section to complete. Thereby suffering
 * a reduction in bandwidth in the exact duration of the migrate_disable()
 * section.
 *
 * Per this argument, the change from preempt_disable() to migrate_disable()
 * gets us:
 *
 * - a higher priority task gains reduced wake-up latency; with preempt_disable()
 *   it would have had to wait for the lower priority task.
 *
 * - a lower priority task, which under preempt_disable() could've instantly
 *   migrated away when another CPU becomes available, is now constrained
 *   by the ability to push the higher priority task away, which might itself be
 *   in a migrate_disable() section, reducing its available bandwidth.
 *
 * IOW it trades latency / moves the interference term, but it stays in the
 * system, and as long as it remains unbounded, the system is not fully
 * deterministic.
 *
 *
 * The reason we have it anyway.
 *
 * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
 * number of primitives into becoming preemptible, they would also allow
 * migration. This turns out to break a bunch of per-cpu usage. To this end,
 * all these primitives employ migrate_disable() to restore this implicit
 * assumption.
 *
 * This is a 'temporary' work-around at best. The correct solution is getting
 * rid of the above assumptions and reworking the code to employ explicit
 * per-cpu locking or short preempt-disable regions.
 *
 * The end goal must be to get rid of migrate_disable(), alternatively we need
 * a schedulability theory that does not depend on arbitrary migration.
 *
 *
 * Notes on the implementation.
 *
 * The implementation is particularly tricky since existing code patterns
 * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
 * This means that it cannot use cpus_read_lock() to serialize against hotplug,
 * nor can it easily migrate itself into a pending affinity mask change on
 * migrate_enable().
 *
 *
 * Note: even non-work-conserving schedulers like semi-partitioned depend on
 *       migration, so migrate_disable() is not only a problem for
 *       work-conserving schedulers.
 *
 */

/**
 * preempt_disable_nested - Disable preemption inside a normally preempt disabled section
 *
 * Use for code which requires preemption protection inside a critical
 * section which has preemption disabled implicitly on non-PREEMPT_RT
 * enabled kernels, by e.g.:
 *  - holding a spinlock/rwlock
 *  - soft interrupt context
 *  - regular interrupt handlers
 *
 * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft
 * interrupt context and regular interrupt handlers are preemptible and
 * only prevent migration. preempt_disable_nested() ensures that preemption
 * is disabled for cases which require CPU local serialization even on
 * PREEMPT_RT. For non-PREEMPT_RT kernels it does not change the preemption
 * state; it only checks, via lockdep_assert_preemption_disabled(), that
 * preemption is already disabled.
 *
 * The use cases are code sequences which are not serialized by a
 * particular lock instance, e.g.:
 *  - seqcount write side critical sections where the seqcount is not
 *    associated to a particular lock and therefore the automatic
 *    protection mechanism does not work. This prevents a live lock
 *    against a preempting high priority reader.
 *  - RMW per CPU variable updates like vmstat.
 */
/* Macro to avoid header recursion hell vs. lockdep */
#define preempt_disable_nested()                                \
do {                                                                \
        if (IS_ENABLED(CONFIG_PREEMPT_RT))                        \
                preempt_disable();                                \
        else                                                        \
                lockdep_assert_preemption_disabled();                \
} while (0)

/**
 * preempt_enable_nested - Undo the effect of preempt_disable_nested()
 *
 * Only PREEMPT_RT kernels re-enable preemption here; on every other
 * configuration the function does nothing.
 */
static __always_inline void preempt_enable_nested(void)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                return;

        preempt_enable();
}

/*
 * Guard definitions named "preempt" and "preempt_notrace", each pairing the
 * matching disable call with its enable call.
 * NOTE(review): DEFINE_LOCK_GUARD_0 is not defined in this header; presumably
 * it comes from <linux/cleanup.h>, which is included above -- confirm.
 */
DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())

/*
 * Preemption model queries. With CONFIG_PREEMPT_DYNAMIC they are out-of-line
 * functions defined elsewhere; otherwise each one reduces to whether the
 * matching Kconfig option is enabled.
 */
#ifdef CONFIG_PREEMPT_DYNAMIC

extern bool preempt_model_none(void);
extern bool preempt_model_voluntary(void);
extern bool preempt_model_full(void);
extern bool preempt_model_lazy(void);

#else

/* True when CONFIG_PREEMPT_NONE is enabled. */
static inline bool preempt_model_none(void)
{
        return IS_ENABLED(CONFIG_PREEMPT_NONE);
}
/* True when CONFIG_PREEMPT_VOLUNTARY is enabled. */
static inline bool preempt_model_voluntary(void)
{
        return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
}
/* True when CONFIG_PREEMPT is enabled. */
static inline bool preempt_model_full(void)
{
        return IS_ENABLED(CONFIG_PREEMPT);
}

/* True when CONFIG_PREEMPT_LAZY is enabled. */
static inline bool preempt_model_lazy(void)
{
        return IS_ENABLED(CONFIG_PREEMPT_LAZY);
}

#endif

/*
 * True when CONFIG_PREEMPT_RT is enabled. Unlike the other model queries
 * this one is an inline IS_ENABLED() test in every configuration, including
 * CONFIG_PREEMPT_DYNAMIC.
 */
static inline bool preempt_model_rt(void)
{
        return IS_ENABLED(CONFIG_PREEMPT_RT);
}

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably returns a human-readable name of the active
 * preemption model -- confirm against the definition.
 */
extern const char *preempt_model_str(void);

/*
 * Does the preemption model allow non-cooperative preemption?
 *
 * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
 * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
 * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
 * PREEMPT_NONE model.
 *
 * The models are queried in the order full, lazy, RT and the first match
 * settles the answer.
 */
static inline bool preempt_model_preemptible(void)
{
        if (preempt_model_full())
                return true;

        if (preempt_model_lazy())
                return true;

        return preempt_model_rt();
}

#endif /* __LINUX_PREEMPT_H */




































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kernfs.h - pseudo filesystem decoupled from vfs locking
 */

#ifndef __LINUX_KERNFS_H
#define __LINUX_KERNFS_H

#include <linux/err.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/lockdep.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/uidgid.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
#include <linux/cache.h>

struct file;
struct dentry;
struct iattr;
struct seq_file;
struct vm_area_struct;
struct vm_operations_struct;
struct super_block;
struct file_system_type;
struct poll_table_struct;
struct fs_context;

struct kernfs_fs_context;
struct kernfs_open_node;
struct kernfs_iattrs;

/*
 * NR_KERNFS_LOCK_BITS determines the size (NR_KERNFS_LOCKS) of the hash
 * table of locks.
 * Having a small hash table would impact scalability, since
 * more and more kernfs_node objects will end up using the same lock,
 * and having a very large hash table would waste memory.
 *
 * At the moment size of hash table of locks is being set based on
 * the number of CPUs as follows:
 *
 * NR_CPU      NR_KERNFS_LOCK_BITS      NR_KERNFS_LOCKS
 *   1                  1                       2
 *  2-3                 2                       4
 *  4-7                 4                       16
 *  8-15                6                       64
 *  16-31               8                       256
 *  32 and more         10                      1024
 *
 * The above relation between NR_CPU and number of locks is based
 * on some internal experimentation which involved booting qemu
 * with different values of smp, performing some sysfs operations
 * on all CPUs and observing how increase in number of locks impacts
 * completion time of these sysfs operations on each CPU.
 *
 * In code: with CONFIG_SMP the bit count is twice ilog2() of NR_CPUS capped
 * at 32; without CONFIG_SMP it is fixed at 1.
 */
#ifdef CONFIG_SMP
#define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32)))
#else /* !CONFIG_SMP */
#define NR_KERNFS_LOCK_BITS     1
#endif /* CONFIG_SMP */

/* Number of entries in the lock hash table: 2 to the power NR_KERNFS_LOCK_BITS. */
#define NR_KERNFS_LOCKS     (1 << NR_KERNFS_LOCK_BITS)

/*
 * There's one kernfs_open_file for each open file and one kernfs_open_node
 * for each kernfs_node with one or more open files.
 *
 * filp->private_data points to seq_file whose ->private points to
 * kernfs_open_file.
 *
 * kernfs_open_files are chained at kernfs_open_node->files, which is
 * protected by kernfs_global_locks.open_file_mutex[i].
 *
 * To reduce possible contention in sysfs access, arising due to single
 * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node
 * object address as hash keys to get the index of these locks.
 *
 * Hashed mutexes are safe to use here because operations using these don't
 * rely on global exclusion.
 *
 * In future we intend to replace other global locks with hashed ones as well.
 * kernfs_global_locks acts as a holder for all such hash tables.
 */
struct kernfs_global_locks {
        /* Hash table of NR_KERNFS_LOCKS mutexes, indexed by a hash of the node address. */
        struct mutex open_file_mutex[NR_KERNFS_LOCKS];
};

/*
 * Node types. Each value is a distinct single bit that lies inside
 * KERNFS_TYPE_MASK (0x000f).
 */
enum kernfs_node_type {
        KERNFS_DIR                = 0x0001,
        KERNFS_FILE                = 0x0002,
        KERNFS_LINK                = 0x0004,
};

/*
 * KERNFS_TYPE_MASK covers the low four bits, where the kernfs_node_type
 * values live; KERNFS_FLAG_MASK is its complement. The complement is
 * parenthesized so the macro expands as a single operand in any expression.
 *
 * The user xattr limits are 128 entries and 128 << 10 = 131072 bytes.
 * NOTE(review): how and where these limits are enforced is not visible in
 * this header -- confirm before relying on exact semantics.
 */
#define KERNFS_TYPE_MASK                0x000f
#define KERNFS_FLAG_MASK                (~KERNFS_TYPE_MASK)
#define KERNFS_MAX_USER_XATTRS                128
#define KERNFS_USER_XATTR_SIZE_LIMIT        (128 << 10)

/*
 * Node flag bits. Every value is a single bit above KERNFS_TYPE_MASK
 * (0x000f), so none of them overlaps the kernfs_node_type values, and all
 * of them fit in the unsigned short kernfs_node.flags field.
 */
enum kernfs_node_flag {
        KERNFS_ACTIVATED        = 0x0010,
        KERNFS_NS                = 0x0020,
        KERNFS_HAS_SEQ_SHOW        = 0x0040,
        KERNFS_HAS_MMAP                = 0x0080,
        KERNFS_LOCKDEP                = 0x0100,
        KERNFS_HIDDEN                = 0x0200,
        KERNFS_SUICIDAL                = 0x0400,
        KERNFS_SUICIDED                = 0x0800,
        KERNFS_EMPTY_DIR        = 0x1000,
        KERNFS_HAS_RELEASE        = 0x2000,
        KERNFS_REMOVING                = 0x4000,
};

/*
 * @flags for kernfs_create_root().  Each value is a distinct single bit.
 */
enum kernfs_root_flag {
        /*
         * kernfs_nodes are created in the deactivated state and invisible.
         * They require explicit kernfs_activate() to become visible.  This
         * can be used to make related nodes become visible atomically
         * after all nodes are created successfully.
         */
        KERNFS_ROOT_CREATE_DEACTIVATED                = 0x0001,

        /*
         * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2)
         * succeeds regardless of the RW permissions.  sysfs had an extra
         * layer of enforcement where open(2) fails with -EACCES regardless
         * of CAP_DAC_OVERRIDE if the permission doesn't have the
         * respective read or write access at all (none of S_IRUGO or
         * S_IWUGO) or the respective operation isn't implemented.  The
         * following flag enables that behavior.
         */
        KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK        = 0x0002,

        /*
         * The filesystem supports the exportfs operation, so userspace can
         * use fhandle to access nodes of the fs.
         */
        KERNFS_ROOT_SUPPORT_EXPORTOP                = 0x0004,

        /*
         * Support user xattrs to be written to nodes rooted at this root.
         */
        KERNFS_ROOT_SUPPORT_USER_XATTR                = 0x0008,

        /*
         * Renames must not change the parent node.
         */
        KERNFS_ROOT_INVARIANT_PARENT                = 0x0010,
};

/* type-specific structures for kernfs_node union members */

/* Directory-specific part of a kernfs_node (the "dir" union member). */
struct kernfs_elem_dir {
        /* NOTE(review): presumably the number of child directories -- confirm */
        unsigned long                subdirs;
        /* children rbtree starts here and goes through kn->rb */
        struct rb_root                children;

        /*
         * The kernfs hierarchy this directory belongs to.  This fits
         * better directly in kernfs_node but is here to save space.
         */
        struct kernfs_root        *root;
        /*
         * Monotonic revision counter, used to identify if a directory
         * node has changed during negative dentry revalidation.
         */
        unsigned long                rev;
};

struct kernfs_elem_symlink {
        struct kernfs_node        *target_kn;
};

/* Attribute-specific part of a kernfs_node (the "attr" union member). */
struct kernfs_elem_attr {
        /* operations table for this attribute */
        const struct kernfs_ops        *ops;
        /*
         * RCU-annotated pointer to the kernfs_open_node; one exists for
         * each kernfs_node with one or more open files.
         */
        struct kernfs_open_node __rcu        *open;
        /* NOTE(review): presumably the size reported for the file -- confirm */
        loff_t                        size;
        struct kernfs_node        *notify_next;        /* for kernfs_notify() */
};

/*
 * kernfs_node - the building block of kernfs hierarchy.  Each and every
 * kernfs node is represented by single kernfs_node.  Most fields are
 * private to kernfs and shouldn't be accessed directly by kernfs users.
 *
 * As long as count reference is held, the kernfs_node itself is
 * accessible.  Dereferencing elem or any other outer entity requires
 * active reference.
 */
struct kernfs_node {
        /* reference that keeps the kernfs_node itself accessible */
        atomic_t                count;
        /* active reference, needed to dereference the type-specific data */
        atomic_t                active;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
#endif
        /*
         * Use kernfs_get_parent() and kernfs_name/path() instead of
         * accessing the following two fields directly.  If the node is
         * never moved to a different parent, it is safe to access the
         * parent directly.
         */
        struct kernfs_node        __rcu *__parent;
        const char                __rcu *name;

        /* linkage in the parent directory's kernfs_elem_dir.children rbtree */
        struct rb_node                rb;

        const void                *ns;        /* namespace tag */
        unsigned int                hash;        /* ns + name hash */
        /*
         * NOTE(review): presumably a kernfs_node_type value combined with
         * kernfs_node_flag bits, separable with KERNFS_TYPE_MASK and
         * KERNFS_FLAG_MASK -- confirm.
         */
        unsigned short                flags;
        umode_t                        mode;

        /*
         * Type-specific data.
         * NOTE(review): presumably the member in use follows the node type
         * (dir / symlink / attr) -- confirm.
         */
        union {
                struct kernfs_elem_dir                dir;
                struct kernfs_elem_symlink        symlink;
                struct kernfs_elem_attr                attr;
        };

        /*
         * 64bit unique ID.  On 64bit ino setups, id is the ino.  On 32bit,
         * the low 32bits are ino and upper generation.
         */
        u64                        id;

        /* NOTE(review): presumably opaque data owned by the kernfs user -- confirm */
        void                        *priv;
        struct kernfs_iattrs        *iattr;

        /* NOTE(review): presumably used to free the node after an RCU grace period -- confirm */
        struct rcu_head                rcu;
};

/*
 * kernfs_syscall_ops may be specified on kernfs_create_root() to support
 * syscalls.  These optional callbacks are invoked on the matching syscalls
 * and can perform any kernfs operations which don't necessarily have to be
 * the exact operation requested.  An active reference is held for each
 * kernfs_node parameter.
 */
struct kernfs_syscall_ops {
        /* emit mount options for @root into @sf */
        int (*show_options)(struct seq_file *sf, struct kernfs_root *root);

        /* invoked for mkdir of @name with @mode under @parent */
        int (*mkdir)(struct kernfs_node *parent, const char *name,
                     umode_t mode);
        /* invoked for rmdir of @kn */
        int (*rmdir)(struct kernfs_node *kn);
        /* invoked for rename of @kn to @new_name under @new_parent */
        int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
                      const char *new_name);
        /* NOTE(review): presumably prints the path of @kn within @root - confirm */
        int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
                         struct kernfs_root *root);
};

/*
 * Map a kernfs_root to a kernfs_node.  Declared outside the CONFIG_KERNFS
 * conditional below, so no inline stub exists for it in this header.
 * NOTE(review): presumably returns the root directory node - confirm.
 */
struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root);

/*
 * Per-open state handed to the kernfs_ops callbacks.  Only the "published"
 * fields are meant for kernfs users; the rest is private to kernfs.
 */
struct kernfs_open_file {
        /* published fields */
        struct kernfs_node        *kn;
        struct file                *file;
        struct seq_file                *seq_file;
        void                        *priv;

        /* private fields, do not use outside kernfs proper */
        struct mutex                mutex;
        struct mutex                prealloc_mutex;
        int                        event;
        struct list_head        list;
        /* NOTE(review): presumably the buffer used when kernfs_ops->prealloc is set - confirm */
        char                        *prealloc_buf;

        /* NOTE(review): presumably mirrors kernfs_ops->atomic_write_len - confirm */
        size_t                        atomic_write_len;
        bool                        mmapped:1;
        bool                        released:1;
        const struct vm_operations_struct *vm_ops;
};

/*
 * Operation table attached to an attribute node (see kernfs_elem_attr.ops).
 * Every callback receives the kernfs_open_file of the open instance, either
 * directly or, for the seq_file methods, through @sf->private.
 */
struct kernfs_ops {
        /*
         * Optional open/release methods.  Both are called with
         * @of->seq_file populated.
         */
        int (*open)(struct kernfs_open_file *of);
        void (*release)(struct kernfs_open_file *of);

        /*
         * Read is handled by either seq_file or raw_read().
         *
         * If seq_show() is present, seq_file path is active.  Other seq
         * operations are optional and if not implemented, the behavior is
         * equivalent to single_open().  @sf->private points to the
         * associated kernfs_open_file.
         *
         * read() is bounced through kernel buffer and a read larger than
         * PAGE_SIZE results in partial operation of PAGE_SIZE.
         */
        int (*seq_show)(struct seq_file *sf, void *v);

        void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
        void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
        void (*seq_stop)(struct seq_file *sf, void *v);

        ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes,
                        loff_t off);

        /*
         * write() is bounced through kernel buffer.  If atomic_write_len
         * is not set, a write larger than PAGE_SIZE results in partial
         * operations of PAGE_SIZE chunks.  If atomic_write_len is set,
         * writes up to the specified size are executed atomically but
         * larger ones are rejected with -E2BIG.
         */
        size_t atomic_write_len;
        /*
         * "prealloc" causes a buffer to be allocated at open for
         * all read/write requests.  As ->seq_show uses seq_read()
         * which does its own allocation, it is incompatible with
         * ->prealloc.  Provide ->read and ->write with ->prealloc.
         */
        bool prealloc;
        ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
                         loff_t off);

        /* poll method; kernfs_generic_poll() has the same signature */
        __poll_t (*poll)(struct kernfs_open_file *of,
                         struct poll_table_struct *pt);

        /* mmap and llseek methods; NOTE(review): presumably optional - confirm */
        int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
        loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence);
};

/*
 * The kernfs superblock creation/mount parameter context.
 *
 * The first three fields describe the mount being set up; new_sb_created
 * is an output of kernfs_mount().
 */
struct kernfs_fs_context {
        struct kernfs_root        *root;                /* Root of the hierarchy being mounted */
        void                        *ns_tag;        /* Namespace tag of the mount (or NULL) */
        unsigned long                magic;                /* File system specific magic number */

        /* The following are set/used by kernfs_mount() */
        bool                        new_sb_created;        /* Set to T if we allocated a new sb */
};

#ifdef CONFIG_KERNFS

static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
{
        return kn->flags & KERNFS_TYPE_MASK;
}

/*
 * Extract the inode number from a 64bit node @id: the whole id when ino_t
 * is (at least) 64bit wide, the low 32bits otherwise.
 */
static inline ino_t kernfs_id_ino(u64 id)
{
        return sizeof(ino_t) >= sizeof(u64) ? id : (u32)id;
}

/*
 * Extract the generation from a 64bit node @id: fixed at 1 when ino_t is
 * (at least) 64bit wide, the high 32bits of @id otherwise.
 */
static inline u32 kernfs_id_gen(u64 id)
{
        return sizeof(ino_t) >= sizeof(u64) ? 1 : id >> 32;
}

/* Inode number of @kn, derived from its 64bit id via kernfs_id_ino(). */
static inline ino_t kernfs_ino(struct kernfs_node *kn)
{
        ino_t ino = kernfs_id_ino(kn->id);

        return ino;
}

static inline ino_t kernfs_gen(struct kernfs_node *kn)
{
        return kernfs_id_gen(kn->id);
}

/**
 * kernfs_enable_ns - enable namespace under a directory
 * @kn: directory of interest, should be empty
 *
 * Call this right after @kn has been created.  Once namespace is enabled,
 * all children of @kn must carry a non-NULL namespace tag, and only those
 * whose tag matches the super_block's tag will be visible.
 *
 * Warns (once) if @kn is not a directory or already has children; the
 * KERNFS_NS flag is set regardless.
 */
static inline void kernfs_enable_ns(struct kernfs_node *kn)
{
        /* sanity checks only: neither of them prevents the flag update */
        WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
        WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));

        kn->flags = kn->flags | KERNFS_NS;
}

/**
 * kernfs_ns_enabled - test whether namespace is enabled
 * @kn: the node to test
 *
 * Return: %true if namespace filtering is enabled for the children of @kn,
 * i.e. if KERNFS_NS is set in its flags.
 */
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
{
        return (kn->flags & KERNFS_NS) != 0;
}

/*
 * Out-of-line kernfs interfaces, declared only when CONFIG_KERNFS is
 * enabled.  The #else branch below provides inline stubs for most (not
 * all) of them.
 */

/* name/path formatting and lookup; the *_get* lookups pair with kernfs_put() - TODO confirm */
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
int kernfs_path_from_node(struct kernfs_node *kn_to, struct kernfs_node *kn_from,
                          char *buf, size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
void pr_cont_kernfs_path(struct kernfs_node *kn);
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn);
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
                                           const char *name, const void *ns);
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
                                           const char *path, const void *ns);
void kernfs_get(struct kernfs_node *kn);
void kernfs_put(struct kernfs_node *kn);

/* conversions between VFS objects (dentry, super_block, inode) and kernfs objects */
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);

struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
                                  struct super_block *sb);
/* root (hierarchy) lifetime; @scops is the optional kernfs_syscall_ops table */
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
                                       unsigned int flags, void *priv);
void kernfs_destroy_root(struct kernfs_root *root);
unsigned int kernfs_root_flags(struct kernfs_node *kn);

/* node creation: directories, files (with a kernfs_ops table) and links */
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                                         const char *name, umode_t mode,
                                         kuid_t uid, kgid_t gid,
                                         void *priv, const void *ns);
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
                                            const char *name);
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                                         const char *name, umode_t mode,
                                         kuid_t uid, kgid_t gid,
                                         loff_t size,
                                         const struct kernfs_ops *ops,
                                         void *priv, const void *ns,
                                         struct lock_class_key *key);
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
                                       const char *name,
                                       struct kernfs_node *target);
/* node state changes, removal, rename and attribute updates */
void kernfs_activate(struct kernfs_node *kn);
void kernfs_show(struct kernfs_node *kn, bool show);
void kernfs_remove(struct kernfs_node *kn);
void kernfs_break_active_protection(struct kernfs_node *kn);
void kernfs_unbreak_active_protection(struct kernfs_node *kn);
bool kernfs_remove_self(struct kernfs_node *kn);
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
                             const void *ns);
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
                     const char *new_name, const void *new_ns);
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
/* has the signature of kernfs_ops->poll */
__poll_t kernfs_generic_poll(struct kernfs_open_file *of,
                             struct poll_table_struct *pt);
void kernfs_notify(struct kernfs_node *kn);

/* extended attribute access by @name */
int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
                     void *value, size_t size);
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
                     const void *value, size_t size, int flags);

/* superblock / fs_context handling */
const void *kernfs_super_ns(struct super_block *sb);
int kernfs_get_tree(struct fs_context *fc);
void kernfs_free_fs_context(struct fs_context *fc);
void kernfs_kill_sb(struct super_block *sb);

void kernfs_init(void);

/* look a node up by the 64bit id stored in kernfs_node.id */
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
                                                   u64 id);
#else        /* CONFIG_KERNFS */

/* Stub for builds without CONFIG_KERNFS: the value returned is arbitrary. */
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
{
        return 0;        /* whatever */
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_enable_ns(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: namespace is never reported enabled. */
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
{
        return false;
}

/* Stub for builds without CONFIG_KERNFS: always fails with -ENOSYS. */
static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
        return -ENOSYS;
}

/*
 * Stub for builds without CONFIG_KERNFS: always fails with -ENOSYS and
 * leaves @buf untouched.
 */
static inline int kernfs_path_from_node(struct kernfs_node *root_kn,
                                        struct kernfs_node *kn,
                                        char *buf, size_t buflen)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: prints nothing. */
static inline void pr_cont_kernfs_name(struct kernfs_node *kn)
{
}
/* Stub for builds without CONFIG_KERNFS: prints nothing. */
static inline void pr_cont_kernfs_path(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: there is never a parent, returns NULL. */
static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: lookups never find anything. */
static inline struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
                                                         const char *name,
                                                         const void *ns)
{
        return NULL;
}
/* Stub for builds without CONFIG_KERNFS: path walks never find anything. */
static inline struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
                                                         const char *path,
                                                         const void *ns)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_get(struct kernfs_node *kn)
{
}
/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_put(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: no dentry maps to a node, returns NULL. */
static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: no superblock has a root, returns NULL. */
static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: never provides an inode, returns NULL. */
static inline struct inode *kernfs_get_inode(struct super_block *sb,
                                             struct kernfs_node *kn)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: always fails with ERR_PTR(-ENOSYS). */
static inline struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
                                                     unsigned int flags,
                                                     void *priv)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_destroy_root(struct kernfs_root *root)
{
}
/* Stub for builds without CONFIG_KERNFS: reports no root flags (0). */
static inline unsigned int kernfs_root_flags(struct kernfs_node *kn)
{
        return 0;
}

/* Stub for builds without CONFIG_KERNFS: always fails with ERR_PTR(-ENOSYS). */
static inline struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                                                       const char *name,
                                                       umode_t mode,
                                                       kuid_t uid, kgid_t gid,
                                                       void *priv,
                                                       const void *ns)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for builds without CONFIG_KERNFS: always fails with ERR_PTR(-ENOSYS). */
static inline struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                                                       const char *name,
                                                       umode_t mode,
                                                       kuid_t uid, kgid_t gid,
                                                       loff_t size,
                                                       const struct kernfs_ops *ops,
                                                       void *priv,
                                                       const void *ns,
                                                       struct lock_class_key *key)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for builds without CONFIG_KERNFS: always fails with ERR_PTR(-ENOSYS). */
static inline struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
                                                     const char *name,
                                                     struct kernfs_node *target)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_activate(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_remove(struct kernfs_node *kn)
{
}

/* Stub for builds without CONFIG_KERNFS: nothing is removed, returns false. */
static inline bool kernfs_remove_self(struct kernfs_node *kn)
{
        return false;
}

/* Stub for builds without CONFIG_KERNFS: always fails with -ENOSYS. */
static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn,
                                           const char *name, const void *ns)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: always fails with -ENOSYS. */
static inline int kernfs_rename_ns(struct kernfs_node *kn,
                                   struct kernfs_node *new_parent,
                                   const char *new_name, const void *new_ns)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: always fails with -ENOSYS. */
static inline int kernfs_setattr(struct kernfs_node *kn,
                                 const struct iattr *iattr)
{
        return -ENOSYS;
}

/*
 * Stub for builds without CONFIG_KERNFS: returns -ENOSYS converted to
 * the __poll_t return type.
 */
static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of,
                                           struct poll_table_struct *pt)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_notify(struct kernfs_node *kn)
{
}

/*
 * Stub for builds without CONFIG_KERNFS: always fails with -ENOSYS and
 * leaves @value untouched.
 */
static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
                                   void *value, size_t size)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: always fails with -ENOSYS. */
static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
                                   const void *value, size_t size, int flags)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: there is no namespace tag, returns NULL. */
static inline const void *kernfs_super_ns(struct super_block *sb)
{
        return NULL;
}

/* Stub for builds without CONFIG_KERNFS: always fails with -ENOSYS. */
static inline int kernfs_get_tree(struct fs_context *fc)
{
        return -ENOSYS;
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_free_fs_context(struct fs_context *fc)
{
}

/* Stub for builds without CONFIG_KERNFS: does nothing. */
static inline void kernfs_kill_sb(struct super_block *sb)
{
}

/* Stub for builds without CONFIG_KERNFS: there is nothing to initialize. */
static inline void kernfs_init(void)
{
}

#endif        /* CONFIG_KERNFS */

/**
 * kernfs_path - build full path of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's name into
 * @buflen: size of @buf
 *
 * Thin wrapper around kernfs_path_from_node() that passes NULL as the
 * second (starting) node.
 *
 * If @kn is NULL result will be "(null)".
 *
 * Returns the length of the full path.  If the full length is equal to or
 * greater than @buflen, @buf contains the truncated path with the trailing
 * '\0'.  On error, -errno is returned.
 */
static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
        struct kernfs_node *kn_from = NULL;

        return kernfs_path_from_node(kn, kn_from, buf, buflen);
}

/* kernfs_find_and_get_ns() with a NULL namespace tag. */
static inline struct kernfs_node *
kernfs_find_and_get(struct kernfs_node *kn, const char *name)
{
        const void *no_ns = NULL;

        return kernfs_find_and_get_ns(kn, name, no_ns);
}

/* kernfs_walk_and_get_ns() with a NULL namespace tag. */
static inline struct kernfs_node *
kernfs_walk_and_get(struct kernfs_node *kn, const char *path)
{
        const void *no_ns = NULL;

        return kernfs_walk_and_get_ns(kn, path, no_ns);
}

/*
 * kernfs_create_dir_ns() with GLOBAL_ROOT_UID/GLOBAL_ROOT_GID as owner and
 * a NULL namespace tag.
 */
static inline struct kernfs_node *
kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode,
                  void *priv)
{
        const void *no_ns = NULL;

        return kernfs_create_dir_ns(parent, name, mode, GLOBAL_ROOT_UID,
                                    GLOBAL_ROOT_GID, priv, no_ns);
}

/* kernfs_remove_by_name_ns() with a NULL namespace tag. */
static inline int kernfs_remove_by_name(struct kernfs_node *parent,
                                        const char *name)
{
        const void *no_ns = NULL;

        return kernfs_remove_by_name_ns(parent, name, no_ns);
}

/* kernfs_rename_ns() with a NULL new namespace tag. */
static inline int kernfs_rename(struct kernfs_node *kn,
                                struct kernfs_node *new_parent,
                                const char *new_name)
{
        const void *no_ns = NULL;

        return kernfs_rename_ns(kn, new_parent, new_name, no_ns);
}

#endif        /* __LINUX_KERNFS_H */































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/*
 * include/net/tipc.h: Include file for TIPC message header routines
 *
 * Copyright (c) 2017 Ericsson AB
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _TIPC_HDR_H
#define _TIPC_HDR_H

#include <linux/random.h>

/*
 * Bits of header word 0 (in host byte order) that must all be set for
 * tipc_hdr_rps_key() to treat a message as a keepalive.
 */
#define KEEPALIVE_MSG_MASK 0x0e080000  /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */

/*
 * The first four 32-bit words of a TIPC message header, kept in
 * big-endian (__be32) form.
 */
struct tipc_basic_hdr {
        __be32 w[4];
};

/*
 * Pick the RPS key for a TIPC message.
 *
 * Normally this is header word 3, the source node identity.  Keepalive
 * messages (all KEEPALIVE_MSG_MASK bits set in word 0) get a fresh random
 * key instead, so that PROBE/PROBE_REPLY messages are spread across the
 * cores.
 */
static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
{
        __be32 random_key;

        if (unlikely((ntohl(hdr->w[0]) & KEEPALIVE_MSG_MASK) ==
                     KEEPALIVE_MSG_MASK)) {
                get_random_bytes(&random_key, sizeof(random_key));
                return random_key;
        }

        return hdr->w[3];
}

#endif

















































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H

#include <linux/bio-integrity.h>
#include <linux/blk-crypto.h>
#include <linux/lockdep.h>
#include <linux/memblock.h>        /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
#include <xen/xen.h>
#include "blk-crypto-internal.h"

/* Forward declarations of types defined outside this header. */
struct elevator_type;
struct elevator_tags;

/*
 * Default upper limit for the software max_sectors limit used for regular I/Os.
 * This can be increased through sysfs.
 *
 * This should not be confused with the max_hw_sector limit that is entirely
 * controlled by the block device driver, usually based on hardware limits.
 */
#define BLK_DEF_MAX_SECTORS_CAP        (SZ_4M >> SECTOR_SHIFT)

/* LLONG_MAX shifted right by 9, i.e. divided by 512 */
#define        BLK_DEV_MAX_SECTORS        (LLONG_MAX >> 9)
/* NOTE(review): presumably in bytes - confirm */
#define        BLK_MIN_SEGMENT_SIZE        4096

/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT                (5 * HZ)

/* Objects defined outside this header. */
extern const struct kobj_type blk_queue_ktype;
extern struct dentry *blk_debugfs_root;

/*
 * State of a flush queue; instances are obtained from
 * blk_alloc_flush_queue() and released with blk_free_flush_queue().
 */
struct blk_flush_queue {
        /* NOTE(review): presumably protects the fields below - confirm */
        spinlock_t                mq_flush_lock;
        /* one-bit indices; NOTE(review): presumably select an entry of flush_queue[] - confirm */
        unsigned int                flush_pending_idx:1;
        unsigned int                flush_running_idx:1;
        blk_status_t                 rq_status;
        unsigned long                flush_pending_since;
        struct list_head        flush_queue[2];
        unsigned long                flush_data_in_flight;
        /* NOTE(review): presumably the request used to issue the flush - confirm */
        struct request                *flush_rq;
        struct rcu_head                rcu_head;
};

/* Flush request/queue helpers; defined outside this header. */
bool is_flush_rq(struct request *req);

struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
                                              gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);

/* Queue freeze/drain helpers; defined outside this header. */
bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
bool blk_queue_start_drain(struct request_queue *q);
bool __blk_freeze_queue_start(struct request_queue *q,
                              struct task_struct *owner);
/* Out-of-line path used by bio_queue_enter() when blk_try_enter_queue() fails. */
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio, bool split);
void bio_await_chain(struct bio *bio);

/*
 * Try to enter @q without sleeping.
 *
 * Takes a q_usage_counter reference under rcu_read_lock().  If the queue
 * is pm_only and either @pm is false or the queue is RPM_SUSPENDED, the
 * reference is dropped again through blk_queue_exit().
 *
 * Returns true if the queue was entered (reference held), false otherwise.
 */
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
        bool entered = false;

        rcu_read_lock();
        if (percpu_ref_tryget_live_rcu(&q->q_usage_counter)) {
                /*
                 * The code that increments the pm_only counter must ensure
                 * that the counter is globally visible before the queue is
                 * unfrozen.
                 */
                if (blk_queue_pm_only(q) &&
                    (!pm || queue_rpm_status(q) == RPM_SUSPENDED))
                        blk_queue_exit(q);
                else
                        entered = true;
        }
        rcu_read_unlock();

        return entered;
}

/*
 * Enter the request queue of the block device @bio targets.
 *
 * The inline fast path uses blk_try_enter_queue() with pm == false; when
 * that fails, the result of the out-of-line __bio_queue_enter() is
 * returned instead.  Returns 0 when the fast path succeeds.
 */
static inline int bio_queue_enter(struct bio *bio)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);

        if (!blk_try_enter_queue(q, false))
                return __bio_queue_enter(q, bio);

        /* annotate io_lockdep_map with an acquire immediately followed by a release */
        rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
        rwsem_release(&q->io_lockdep_map, _RET_IP_);
        return 0;
}

/*
 * Wait for @done to complete.
 *
 * The wait is done in slices of half the hung task timeout; with a zero
 * timeout a single untimed wait is used instead.
 */
static inline void blk_wait_io(struct completion *done)
{
        /* Prevent hang_check timer from firing at us during very long I/O */
        unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

        if (!timeout) {
                wait_for_completion_io(done);
                return;
        }

        /* keep re-arming the timed wait until it reports completion */
        for (;;) {
                if (wait_for_completion_io_timeout(done, timeout))
                        break;
        }
}

/* Block device get/put pair; defined outside this header. */
struct block_device *blkdev_get_no_open(dev_t dev, bool autoload);
void blkdev_put_no_open(struct block_device *bdev);

/* NOTE(review): presumably the number of bio_vecs embedded inline in a bio - confirm */
#define BIO_INLINE_VECS 4
/* bio_vec array allocation from / release to a mempool */
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
                gfp_t gfp_mask);
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);

bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
                struct page *page, unsigned len, unsigned offset);

/*
 * Check whether @vec1 and @vec2 can be merged into one physical segment
 * on @q.
 *
 * They are mergeable only if @vec2 starts exactly where @vec1 ends, Xen
 * (when running as a Xen domain) does not object, and the first byte of
 * @vec1 and the last byte of @vec2 fall within the same segment boundary
 * window of the queue.  Merging is disabled altogether under KMSAN.
 */
static inline bool biovec_phys_mergeable(struct request_queue *q,
                struct bio_vec *vec1, struct bio_vec *vec2)
{
        unsigned long boundary = queue_segment_boundary(q);
        phys_addr_t first = bvec_phys(vec1);
        phys_addr_t second = bvec_phys(vec2);

        /*
         * Merging adjacent physical pages may not work correctly under KMSAN
         * if their metadata pages aren't adjacent. Just disable merging.
         */
        if (IS_ENABLED(CONFIG_KMSAN))
                return false;

        return first + vec1->bv_len == second &&
               (!xen_domain() ||
                xen_biovec_phys_mergeable(vec1, vec2->bv_page)) &&
               (first | boundary) ==
                        ((second + vec2->bv_len - 1) | boundary);
}

/*
 * Report a gap if either @offset or the end of @bprv (bv_offset + bv_len)
 * has bits set under the virt_boundary_mask of @lim.
 */
static inline bool __bvec_gap_to_prev(const struct queue_limits *lim,
                struct bio_vec *bprv, unsigned int offset)
{
        if (offset & lim->virt_boundary_mask)
                return true;

        return ((bprv->bv_offset + bprv->bv_len) &
                lim->virt_boundary_mask) != 0;
}

/*
 * Would appending a bio_vec starting at @offset after @bprv leave a gap in
 * the SG list?  Most drivers don't care about this, but some do.  Without
 * a virt_boundary_mask in @lim there can never be a gap.
 */
static inline bool bvec_gap_to_prev(const struct queue_limits *lim,
                struct bio_vec *bprv, unsigned int offset)
{
        return lim->virt_boundary_mask &&
               __bvec_gap_to_prev(lim, bprv, offset);
}

/*
 * Can @rq take part in merging at all?
 *
 * Passthrough requests, flush, write-zeroes and zone-append operations,
 * and requests carrying any of the REQ_NOMERGE_FLAGS or RQF_NOMERGE_FLAGS
 * bits are never mergeable.
 */
static inline bool rq_mergeable(struct request *rq)
{
        if (blk_rq_is_passthrough(rq))
                return false;

        switch (req_op(rq)) {
        case REQ_OP_FLUSH:
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_ZONE_APPEND:
                return false;
        default:
                break;
        }

        return !(rq->cmd_flags & REQ_NOMERGE_FLAGS) &&
               !(rq->rq_flags & RQF_NOMERGE_FLAGS);
}

/*
 * There are two different ways to handle DISCARD merges:
 *  1) If max_discard_segments > 1, the driver treats every bio as a range
 *     and sends the bios to the controller together. The ranges don't need
 *     to be contiguous.
 *  2) Otherwise, the request is handled like a normal read/write request.
 *     The ranges need to be contiguous.
 *
 * Returns true only for case 1), i.e. a discard request on a queue that
 * supports more than one discard segment.
 */
static inline bool blk_discard_mergable(struct request *req)
{
        return req_op(req) == REQ_OP_DISCARD &&
               queue_max_discard_segments(req->q) > 1;
}

/*
 * Segment limit that applies to @rq: the queue's discard segment limit for
 * discard requests, its regular segment limit for everything else.
 */
static inline unsigned int blk_rq_get_max_segments(struct request *rq)
{
        if (req_op(rq) != REQ_OP_DISCARD)
                return queue_max_segments(rq->q);

        return queue_max_discard_segments(rq->q);
}

static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
{
        struct request_queue *q = rq->q;
        enum req_op op = req_op(rq);

        if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
                return min(q->limits.max_discard_sectors,
                           UINT_MAX >> SECTOR_SHIFT);

        if (unlikely(op == REQ_OP_WRITE_ZEROES))
                return q->limits.max_write_zeroes_sectors;

        if (rq->cmd_flags & REQ_ATOMIC)
                return q->limits.atomic_write_max_sectors;

        return q->limits.max_sectors;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
/* Integrity helpers built with CONFIG_BLK_DEV_INTEGRITY; defined outside this header. */
void blk_flush_integrity(void);
void bio_integrity_free(struct bio *bio);

/*
 * Integrity payloads can either be owned by the submitter, in which case
 * bio_uninit will free them, or owned and generated by the block layer,
 * in which case we'll verify them here (for reads) and free them before
 * the bio is handed back to the submitter.
 */
bool __bio_integrity_endio(struct bio *bio);
/*
 * Completion hook for the integrity payload of @bio.
 *
 * Only payloads flagged BIP_BLOCK_INTEGRITY are passed on to
 * __bio_integrity_endio(), whose result is returned; for every other bio
 * (including those without a payload) true is returned.
 */
static inline bool bio_integrity_endio(struct bio *bio)
{
        struct bio_integrity_payload *bip = bio_integrity(bio);

        if (!bip || !(bip->bip_flags & BIP_BLOCK_INTEGRITY))
                return true;

        return __bio_integrity_endio(bio);
}

/*
 * Integrity merge checks; defined outside this header.  The stubs used
 * without CONFIG_BLK_DEV_INTEGRITY always return true.
 */
bool blk_integrity_merge_rq(struct request_queue *, struct request *,
                struct request *);
bool blk_integrity_merge_bio(struct request_queue *, struct request *,
                struct bio *);

/*
 * Check for a gap between the last integrity vector of @req's first bio
 * and the first integrity vector of @next, using the limits of @req's
 * queue.
 */
static inline bool integrity_req_gap_back_merge(struct request *req,
                struct bio *next)
{
        struct bio_integrity_payload *cur = bio_integrity(req->bio);
        struct bio_integrity_payload *nxt = bio_integrity(next);
        struct bio_vec *tail = &cur->bip_vec[cur->bip_vcnt - 1];
        unsigned int next_offset = nxt->bip_vec[0].bv_offset;

        return bvec_gap_to_prev(&req->q->limits, tail, next_offset);
}

/*
 * Front-merge gap check for integrity data: compare the last integrity vector
 * of @bio against the offset of the first integrity vector of @req's first
 * bio, using the queue limits of @req.  Both bios are dereferenced without a
 * NULL check on their integrity payload.
 */
static inline bool integrity_req_gap_front_merge(struct request *req,
                struct bio *bio)
{
        struct bio_integrity_payload *new_bip = bio_integrity(bio);
        struct bio_integrity_payload *req_bip = bio_integrity(req->bio);

        return bvec_gap_to_prev(&req->q->limits,
                                &new_bip->bip_vec[new_bip->bip_vcnt - 1],
                                req_bip->bip_vec[0].bv_offset);
}

extern const struct attribute_group blk_integrity_attr_group;
#else /* CONFIG_BLK_DEV_INTEGRITY */
/*
 * Stand-ins used when CONFIG_BLK_DEV_INTEGRITY is not set, so callers need no
 * #ifdefs.  The merge helpers and bio_integrity_endio() return true
 * unconditionally, the gap helpers return false unconditionally, and the
 * flush/free helpers do nothing.
 */
static inline bool blk_integrity_merge_rq(struct request_queue *rq,
                struct request *r1, struct request *r2)
{
        return true;
}
static inline bool blk_integrity_merge_bio(struct request_queue *rq,
                struct request *r, struct bio *b)
{
        return true;
}
static inline bool integrity_req_gap_back_merge(struct request *req,
                struct bio *next)
{
        return false;
}
static inline bool integrity_req_gap_front_merge(struct request *req,
                struct bio *bio)
{
        return false;
}

static inline void blk_flush_integrity(void)
{
}
static inline bool bio_integrity_endio(struct bio *bio)
{
        return true;
}
static inline void bio_integrity_free(struct bio *bio)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);

/*
 * Result type of bio_attempt_back_merge().
 * NOTE(review): the per-value meanings below are inferred from the names only;
 * confirm against the merge code.
 */
enum bio_merge_status {
        BIO_MERGE_OK,           /* presumably: the bio was merged */
        BIO_MERGE_NONE,         /* presumably: no merge was attempted/possible */
        BIO_MERGE_FAILED,       /* presumably: a merge was attempted and failed */
};

enum bio_merge_status bio_attempt_back_merge(struct request *req,
                struct bio *bio, unsigned int nr_segs);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
                        struct bio *bio, unsigned int nr_segs);

/*
 * Plug flush limits
 */
#define BLK_MAX_REQUEST_COUNT        32
#define BLK_PLUG_FLUSH_SIZE        (128 * 1024)

/*
 * Internal elevator interface
 */
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)

bool blk_insert_flush(struct request *rq);

void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
                struct elevator_tags *t);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);

ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_fail_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_fail_store(struct device *dev, struct device_attribute *attr,
                const char *buf, size_t count);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
                                const char *, size_t);

struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
                unsigned *nsegs);
struct bio *bio_split_write_zeroes(struct bio *bio,
                const struct queue_limits *lim, unsigned *nsegs);
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
                unsigned *nr_segs);
struct bio *bio_split_zone_append(struct bio *bio,
                const struct queue_limits *lim, unsigned *nr_segs);

/*
 * All drivers must accept single-segment bios that are smaller than PAGE_SIZE.
 *
 * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is
 * always valid if a bio has data.  The check might lead to occasional false
 * positives when bios are cloned, but compared to the performance impact of
 * cloned bios themselves the loop below doesn't matter anyway.
 */
static inline bool bio_may_need_split(struct bio *bio,
                const struct queue_limits *lim)
{
        /*
         * Any chunk_sectors limit, or anything other than exactly one vector,
         * sends the bio down the full split path.
         */
        if (lim->chunk_sectors || bio->bi_vcnt != 1)
                return true;

        /* Single vector: it may need splitting only if it ends past min_segment_size. */
        return bio->bi_io_vec[0].bv_len + bio->bi_io_vec[0].bv_offset >
                lim->min_segment_size;
}

/**
 * __bio_split_to_limits - split a bio to fit the queue limits
 * @bio:     bio to be split
 * @lim:     queue limits to split based on
 * @nr_segs: returns the number of segments in the returned bio
 *
 * Dispatch on the operation of @bio to the matching split helper
 * (bio_split_rw(), bio_split_zone_append(), bio_split_discard() or
 * bio_split_write_zeroes()) and return whatever that helper returns.
 *
 * Reads and writes for which bio_may_need_split() says no are returned
 * unchanged with *@nr_segs set to 1.  Operations not listed above cannot be
 * split; they are returned unchanged with *@nr_segs set to 0.
 */
static inline struct bio *__bio_split_to_limits(struct bio *bio,
                const struct queue_limits *lim, unsigned int *nr_segs)
{
        switch (bio_op(bio)) {
        case REQ_OP_READ:
        case REQ_OP_WRITE:
                if (!bio_may_need_split(bio, lim)) {
                        /* fast path: nothing to split */
                        *nr_segs = 1;
                        return bio;
                }
                return bio_split_rw(bio, lim, nr_segs);
        case REQ_OP_ZONE_APPEND:
                return bio_split_zone_append(bio, lim, nr_segs);
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
                return bio_split_discard(bio, lim, nr_segs);
        case REQ_OP_WRITE_ZEROES:
                return bio_split_write_zeroes(bio, lim, nr_segs);
        default:
                break;
        }

        /* other operations can't be split */
        *nr_segs = 0;
        return bio;
}

/**
 * get_max_segment_size() - maximum number of bytes to add as a single segment
 * @lim: Request queue limits.
 * @paddr: address of the range to add
 * @len: maximum length available to add at @paddr
 *
 * Returns the maximum number of bytes of the range starting at @paddr that can
 * be added to a single segment.  The result is the smallest of @len,
 * @lim->max_segment_size and one more than
 * (seg_boundary_mask - (seg_boundary_mask & @paddr)).
 */
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
                phys_addr_t paddr, unsigned int len)
{
        /*
         * Both operands of the inner min() are biased by -1:
         * mask - (mask & paddr) and max_segment_size - 1.  Assuming the mask
         * has the usual 2^n - 1 form (confirm where the limit is set), the
         * former is the number of bytes left up to the boundary, less one.
         *
         * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
         * after having calculated the minimum.
         */
        return min_t(unsigned long, len,
                min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
                    (unsigned long)lim->max_segment_size - 1) + 1);
}

int ll_back_merge_fn(struct request *req, struct bio *bio,
                unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
                                struct request *next);
unsigned int blk_recalc_rq_segments(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);

int blk_set_default_limits(struct queue_limits *lim);
void blk_apply_bdi_limits(struct backing_dev_info *bdi,
                struct queue_limits *lim);
int blk_dev_init(void);

void update_io_ticks(struct block_device *part, unsigned long now, bool end);

/*
 * Flag @req with REQ_NOMERGE and, if @q currently records it as its
 * last_merge request, clear that record.
 */
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
        req->cmd_flags |= REQ_NOMERGE;

        if (q->last_merge != req)
                return;
        q->last_merge = NULL;
}

/*
 * Internal io_context interface
 */
struct io_cq *ioc_find_get_icq(struct request_queue *q);
struct io_cq *ioc_lookup_icq(struct request_queue *q);
#ifdef CONFIG_BLK_ICQ
void ioc_clear_queue(struct request_queue *q);
#else
/* No-op stand-in used when CONFIG_BLK_ICQ is not set. */
static inline void ioc_clear_queue(struct request_queue *q)
{
}
#endif /* CONFIG_BLK_ICQ */

#ifdef CONFIG_BLK_DEV_ZONED
void disk_init_zone_resources(struct gendisk *disk);
void disk_free_zone_resources(struct gendisk *disk);
/* Report whether @bio carries the BIO_ZONE_WRITE_PLUGGING flag. */
static inline bool bio_zone_write_plugging(struct bio *bio)
{
        return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
static inline bool blk_req_bio_is_zone_append(struct request *rq,
                                              struct bio *bio)
{
        return req_op(rq) == REQ_OP_ZONE_APPEND ||
               bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
}
void blk_zone_write_plug_bio_merged(struct bio *bio);
void blk_zone_write_plug_init_request(struct request *rq);
void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio);
void blk_zone_write_plug_bio_endio(struct bio *bio);
static inline void blk_zone_bio_endio(struct bio *bio)
{
        /* Only bios that went through zone write plugging need the hook. */
        if (!bio_zone_write_plugging(bio))
                return;

        /*
         * Signal completion of this write BIO so that zone write plugging
         * can submit the next write BIO.
         */
        blk_zone_write_plug_bio_endio(bio);
}

void blk_zone_write_plug_finish_request(struct request *rq);
/*
 * Call the zone write plugging finish hook for @rq, but only if the request
 * is flagged RQF_ZONE_WRITE_PLUGGING.
 */
static inline void blk_zone_finish_request(struct request *rq)
{
        if (!(rq->rq_flags & RQF_ZONE_WRITE_PLUGGING))
                return;
        blk_zone_write_plug_finish_request(rq);
}
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
                unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
                unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
/*
 * Stand-ins used when CONFIG_BLK_DEV_ZONED is not set, so callers need no
 * #ifdefs.  The void helpers do nothing, the predicates return false
 * unconditionally, and the two ioctl entry points return -ENOTTY.
 */
static inline void disk_init_zone_resources(struct gendisk *disk)
{
}
static inline void disk_free_zone_resources(struct gendisk *disk)
{
}
static inline bool bio_zone_write_plugging(struct bio *bio)
{
        return false;
}
static inline bool blk_req_bio_is_zone_append(struct request *req,
                                              struct bio *bio)
{
        return false;
}
static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
{
}
static inline void blk_zone_write_plug_init_request(struct request *rq)
{
}
static inline void blk_zone_append_update_request_bio(struct request *rq,
                                                      struct bio *bio)
{
}
static inline void blk_zone_bio_endio(struct bio *bio)
{
}
static inline void blk_zone_finish_request(struct request *rq)
{
}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
                unsigned int cmd, unsigned long arg)
{
        return -ENOTTY;
}
static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
                blk_mode_t mode, unsigned int cmd, unsigned long arg)
{
        return -ENOTTY;
}
#endif /* CONFIG_BLK_DEV_ZONED */

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno);
void bdev_add(struct block_device *bdev, dev_t dev);
void bdev_unhash(struct block_device *bdev);
void bdev_drop(struct block_device *bdev);

int blk_alloc_ext_minor(void);
void blk_free_ext_minor(unsigned int minor);
#define ADDPART_FLAG_NONE        0
#define ADDPART_FLAG_RAID        1
#define ADDPART_FLAG_WHOLEDISK        2
#define ADDPART_FLAG_READONLY        4
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
                sector_t length);
int bdev_del_partition(struct gendisk *disk, int partno);
int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
                sector_t length);
void drop_partition(struct block_device *part);

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);

struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
                struct lock_class_key *lkclass);

/*
 * Release @page on behalf of @bio: if the bio is flagged BIO_PAGE_PINNED the
 * page is unpinned with unpin_user_page(); otherwise nothing is done.
 */
static inline void bio_release_page(struct bio *bio, struct page *page)
{
        if (!bio_flagged(bio, BIO_PAGE_PINNED))
                return;
        unpin_user_page(page);
}

struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);

int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);

int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
void disk_del_events(struct gendisk *disk);
void disk_release_events(struct gendisk *disk);
void disk_block_events(struct gendisk *disk);
void disk_unblock_events(struct gendisk *disk);
void disk_flush_events(struct gendisk *disk, unsigned int mask);
extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;

extern struct attribute_group blk_trace_attr_group;

blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
                loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);

extern const struct address_space_operations def_blk_aops;

int disk_register_independent_access_ranges(struct gendisk *disk);
void disk_unregister_independent_access_ranges(struct gendisk *disk);

int should_fail_bio(struct bio *bio);
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool should_fail_request(struct block_device *part, unsigned int bytes);
#else /* CONFIG_FAIL_MAKE_REQUEST */
/* Stand-in used when CONFIG_FAIL_MAKE_REQUEST is not set: always false. */
static inline bool should_fail_request(struct block_device *part,
                                        unsigned int bytes)
{
        return false;
}
#endif /* CONFIG_FAIL_MAKE_REQUEST */

/*
 * Optimized request reference counting. Ideally we'd make timeouts be more
 * clever, as that's the only reason we need references at all... But until
 * this happens, this is faster than using refcount_t. Also see:
 *
 * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
 */
/*
 * True when the reference count, viewed as an unsigned int, is 0 or within
 * 127 of wrapping around (i.e. in the range -127..0 as a signed value).
 * The argument is parenthesized so that any pointer expression may be passed.
 */
#define req_ref_zero_or_close_to_overflow(req)        \
        ((unsigned int) atomic_read(&(req)->ref) + 127u <= 127u)

/*
 * Take a reference on @req unless its count is already zero; returns the
 * result of atomic_inc_not_zero() on req->ref.
 */
static inline bool req_ref_inc_not_zero(struct request *req)
{
        return atomic_inc_not_zero(&req->ref);
}

/*
 * Drop one reference on @req and return the result of atomic_dec_and_test()
 * on req->ref.  Warns once if the count was already zero or close to
 * overflow before the decrement.
 */
static inline bool req_ref_put_and_test(struct request *req)
{
        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
        return atomic_dec_and_test(&req->ref);
}

/* Set the reference count of @req to @value. */
static inline void req_ref_set(struct request *req, int value)
{
        atomic_set(&req->ref, value);
}

/* Return the current reference count of @req. */
static inline int req_ref_read(struct request *req)
{
        return atomic_read(&req->ref);
}

/*
 * Return a nanosecond timestamp, using the value cached in the current task's
 * plug when there is one.  Without a plug, or when not in task context, this
 * is simply ktime_get_ns().  The first cached read fills plug->cur_ktime and
 * sets PF_BLOCK_TS on the current task.
 */
static inline u64 blk_time_get_ns(void)
{
        struct blk_plug *cache = current->plug;

        if (cache && in_task()) {
                /*
                 * 0 could very well be a valid time, but rather than flag
                 * "this is a valid timestamp" separately, just accept that
                 * we'll do an extra ktime_get_ns() if we just happen to get
                 * 0 as the current time.
                 */
                if (cache->cur_ktime == 0) {
                        cache->cur_ktime = ktime_get_ns();
                        current->flags |= PF_BLOCK_TS;
                }
                return cache->cur_ktime;
        }

        return ktime_get_ns();
}

/* Same timestamp as blk_time_get_ns(), converted to ktime_t. */
static inline ktime_t blk_time_get(void)
{
        return ns_to_ktime(blk_time_get_ns());
}

void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
              const struct blk_holder_ops *hops, struct file *bdev_file);
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);

void blk_integrity_generate(struct bio *bio);
void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter);
void blk_integrity_prepare(struct request *rq);
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);

#ifdef CONFIG_LOCKDEP
/*
 * Record an acquire on the queue's two lockdep maps: io_lockdep_map first,
 * then q_lockdep_map.  Each annotation is skipped when the matching
 * mq_freeze_disk_dead / mq_freeze_queue_dying flag is set.
 */
static inline void blk_freeze_acquire_lock(struct request_queue *q)
{
        if (!q->mq_freeze_disk_dead)
                rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
        if (!q->mq_freeze_queue_dying)
                rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
}

/*
 * Record a release on the queue's two lockdep maps, in the reverse order of
 * blk_freeze_acquire_lock(): q_lockdep_map first, then io_lockdep_map.  The
 * same mq_freeze_queue_dying / mq_freeze_disk_dead flags gate each release.
 */
static inline void blk_unfreeze_release_lock(struct request_queue *q)
{
        if (!q->mq_freeze_queue_dying)
                rwsem_release(&q->q_lockdep_map, _RET_IP_);
        if (!q->mq_freeze_disk_dead)
                rwsem_release(&q->io_lockdep_map, _RET_IP_);
}
#else
/* No-op stand-ins used when CONFIG_LOCKDEP is not set. */
static inline void blk_freeze_acquire_lock(struct request_queue *q)
{
}
static inline void blk_unfreeze_release_lock(struct request_queue *q)
{
}
#endif

#endif /* BLK_INTERNAL_H */































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 





    1 







    1 


    1 














































    1 











    1 









    1 

















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
// SPDX-License-Identifier: GPL-2.0-only
/*
 * GCM: Galois/Counter Mode.
 *
 * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
 */

#include <crypto/gf128mul.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/skcipher.h>
#include <crypto/internal/hash.h>
#include <crypto/scatterwalk.h>
#include <crypto/gcm.h>
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

/* Instance context: spawns for the CTR skcipher and the GHASH ahash. */
struct gcm_instance_ctx {
        struct crypto_skcipher_spawn ctr;
        struct crypto_ahash_spawn ghash;
};

/*
 * Transform context: the skcipher used for encryption and the ahash used for
 * authentication.  Both are keyed in crypto_gcm_setkey().
 */
struct crypto_gcm_ctx {
        struct crypto_skcipher *ctr;
        struct crypto_ahash *ghash;
};

/*
 * Contexts for the rfc4106 and rfc4543 wrappers.
 * NOTE(review): their users are not visible in this part of the file; the
 * field notes below are inferred from names and types only.
 */

/* rfc4106 transform context: wrapped AEAD plus a 4-byte nonce. */
struct crypto_rfc4106_ctx {
        struct crypto_aead *child;
        u8 nonce[4];
};

/* rfc4106 per-request context: scratch scatterlists and the sub-request. */
struct crypto_rfc4106_req_ctx {
        struct scatterlist src[3];
        struct scatterlist dst[3];
        struct aead_request subreq;
};

/* rfc4543 instance context: spawn for the wrapped AEAD. */
struct crypto_rfc4543_instance_ctx {
        struct crypto_aead_spawn aead;
};

/* rfc4543 transform context: wrapped AEAD plus a 4-byte nonce. */
struct crypto_rfc4543_ctx {
        struct crypto_aead *child;
        u8 nonce[4];
};

/* rfc4543 per-request context: the sub-request. */
struct crypto_rfc4543_req_ctx {
        struct aead_request subreq;
};

/* Parameters of one GHASH pass over the ciphertext. */
struct crypto_gcm_ghash_ctx {
        /* number of ciphertext bytes to hash; also encoded in the length block */
        unsigned int cryptlen;
        /* scatterlist the ciphertext is hashed from */
        struct scatterlist *src;
        /* invoked by gcm_hash_len_continue() once the length block is hashed */
        int (*complete)(struct aead_request *req, u32 flags);
};

/* Per-request private state, located via crypto_gcm_reqctx(). */
struct crypto_gcm_req_priv_ctx {
        /* request IV followed by a big-endian 32-bit counter set to 1 */
        u8 iv[16];
        /* zeroed in crypto_gcm_init_common(); first entry of src[] and dst[] */
        u8 auth_tag[16];
        /* holds the length block fed to GHASH and receives the hash result */
        u8 iauth_tag[16];
        /* auth_tag chained in front of req->src / req->dst past the assoc data */
        struct scatterlist src[3];
        struct scatterlist dst[3];
        /* single-entry list pointing at iauth_tag, used by gcm_hash_len() */
        struct scatterlist sg;
        struct crypto_gcm_ghash_ctx ghash_ctx;
        /* the hash and cipher sub-requests share storage */
        union {
                struct ahash_request ahreq;
                struct skcipher_request skreq;
        } u;
};

/*
 * 16-byte buffer plus a scatterlist for it; gcm_hash_remain() hashes from
 * this list to pad data up to the GHASH block size.
 * NOTE(review): allocation and zero-filling happen outside this part of the
 * file; presumably buf is all zeroes, as the name suggests.
 */
static struct {
        u8 buf[16];
        struct scatterlist sg;
} *gcm_zeroes;

static inline struct crypto_gcm_req_priv_ctx *crypto_gcm_reqctx(
        struct aead_request *req)
{
        unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req));

        return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1);
}

/*
 * Key both sub-transforms of a GCM instance.
 *
 * @key is installed on the CTR skcipher first.  The hash key is then derived
 * by encrypting a zero-filled 16-byte block in place with that cipher and a
 * zero-filled IV (both come from the kzalloc'ed scratch area), and the result
 * is installed as the GHASH key.  The request flags of @aead are propagated
 * to each sub-transform before its setkey call.
 *
 * Returns 0 on success, -ENOMEM if the scratch area cannot be allocated, or
 * the error reported by a sub-transform.  The scratch area, which holds the
 * derived hash key, is released with kfree_sensitive().
 */
static int crypto_gcm_setkey(struct crypto_aead *aead, const u8 *key,
                             unsigned int keylen)
{
        struct crypto_gcm_ctx *ctx = crypto_aead_ctx(aead);
        struct crypto_ahash *ghash = ctx->ghash;
        struct crypto_skcipher *ctr = ctx->ctr;
        /* req must stay last: the cipher's request context follows it. */
        struct {
                be128 hash;
                u8 iv[16];

                struct crypto_wait wait;

                struct scatterlist sg[1];
                struct skcipher_request req;
        } *scratch;
        int ret;

        crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK);
        crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) &
                                       CRYPTO_TFM_REQ_MASK);
        ret = crypto_skcipher_setkey(ctr, key, keylen);
        if (ret)
                return ret;

        scratch = kzalloc(sizeof(*scratch) + crypto_skcipher_reqsize(ctr),
                          GFP_KERNEL);
        if (!scratch)
                return -ENOMEM;

        /* Encrypt the all-zero block in place and wait for completion. */
        crypto_init_wait(&scratch->wait);
        sg_init_one(scratch->sg, &scratch->hash, sizeof(scratch->hash));
        skcipher_request_set_tfm(&scratch->req, ctr);
        skcipher_request_set_callback(&scratch->req,
                                      CRYPTO_TFM_REQ_MAY_SLEEP |
                                      CRYPTO_TFM_REQ_MAY_BACKLOG,
                                      crypto_req_done, &scratch->wait);
        skcipher_request_set_crypt(&scratch->req, scratch->sg, scratch->sg,
                                   sizeof(scratch->hash), scratch->iv);

        ret = crypto_wait_req(crypto_skcipher_encrypt(&scratch->req),
                              &scratch->wait);
        if (!ret) {
                crypto_ahash_clear_flags(ghash, CRYPTO_TFM_REQ_MASK);
                crypto_ahash_set_flags(ghash, crypto_aead_get_flags(aead) &
                                       CRYPTO_TFM_REQ_MASK);
                ret = crypto_ahash_setkey(ghash, (u8 *)&scratch->hash,
                                          sizeof(be128));
        }

        kfree_sensitive(scratch);
        return ret;
}

/*
 * Validate @authsize via crypto_gcm_check_authsize() and return its result;
 * @tfm is not consulted.
 */
static int crypto_gcm_setauthsize(struct crypto_aead *tfm,
                                  unsigned int authsize)
{
        return crypto_gcm_check_authsize(authsize);
}

/*
 * Prepare the per-request state shared by encryption and decryption:
 *  - zero the auth_tag buffer;
 *  - build the 16-byte IV from the request IV (GCM_AES_IV_SIZE bytes)
 *    followed by a big-endian 32-bit counter of 1;
 *  - build the src scatterlist as auth_tag followed by req->src skipped
 *    past the associated data, and likewise dst when it differs from src.
 */
static void crypto_gcm_init_common(struct aead_request *req)
{
        struct crypto_gcm_req_priv_ctx *rctx = crypto_gcm_reqctx(req);
        __be32 one = cpu_to_be32(1);
        struct scatterlist *rest;

        memset(rctx->auth_tag, 0, sizeof(rctx->auth_tag));
        memcpy(rctx->iv, req->iv, GCM_AES_IV_SIZE);
        memcpy(rctx->iv + GCM_AES_IV_SIZE, &one, 4);

        sg_init_table(rctx->src, 3);
        sg_set_buf(rctx->src, rctx->auth_tag, sizeof(rctx->auth_tag));
        rest = scatterwalk_ffwd(rctx->src + 1, req->src, req->assoclen);
        /* Chain only when the helper did not fill in our own second entry. */
        if (rest != rctx->src + 1)
                sg_chain(rctx->src, 2, rest);

        /* In-place operation: the dst list is not used, nothing to build. */
        if (req->src == req->dst)
                return;

        sg_init_table(rctx->dst, 3);
        sg_set_buf(rctx->dst, rctx->auth_tag, sizeof(rctx->auth_tag));
        rest = scatterwalk_ffwd(rctx->dst + 1, req->dst, req->assoclen);
        if (rest != rctx->dst + 1)
                sg_chain(rctx->dst, 2, rest);
}

/*
 * Set up the skcipher sub-request of @req: bind it to the CTR transform and
 * have it process @cryptlen bytes plus the 16-byte auth_tag buffer that
 * crypto_gcm_init_common() placed at the front of the scatterlists.  For
 * in-place requests the src list doubles as the destination.
 */
static void crypto_gcm_init_crypt(struct aead_request *req,
                                  unsigned int cryptlen)
{
        struct crypto_gcm_ctx *tctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
        struct crypto_gcm_req_priv_ctx *rctx = crypto_gcm_reqctx(req);
        struct scatterlist *out;

        if (req->src == req->dst)
                out = rctx->src;
        else
                out = rctx->dst;

        skcipher_request_set_tfm(&rctx->u.skreq, tctx->ctr);
        skcipher_request_set_crypt(&rctx->u.skreq, rctx->src, out,
                                   cryptlen + sizeof(rctx->auth_tag),
                                   rctx->iv);
}

/*
 * Number of padding bytes needed to round @len up to a multiple of 16;
 * 0 when @len is already a multiple of 16.
 */
static inline unsigned int gcm_remain(unsigned int len)
{
        unsigned int tail = len % 16U;

        if (tail == 0)
                return 0;
        return 16 - tail;
}

static void gcm_hash_len_done(void *data, int err);

/*
 * Feed @len bytes from @src into the request's GHASH sub-request.  @compl is
 * installed as the completion callback with @req as its data; the return
 * value is that of crypto_ahash_update().
 */
static int gcm_hash_update(struct aead_request *req,
                           crypto_completion_t compl,
                           struct scatterlist *src,
                           unsigned int len, u32 flags)
{
        struct ahash_request *hreq = &crypto_gcm_reqctx(req)->u.ahreq;

        ahash_request_set_callback(hreq, flags, compl, req);
        ahash_request_set_crypt(hreq, src, NULL, len);
        return crypto_ahash_update(hreq);
}

/*
 * Hash @remain bytes taken from the shared gcm_zeroes buffer, i.e. the
 * padding that brings the data hashed so far up to a 16-byte boundary.
 */
static int gcm_hash_remain(struct aead_request *req,
                           unsigned int remain,
                           crypto_completion_t compl, u32 flags)
{
        return gcm_hash_update(req, compl, &gcm_zeroes->sg, remain, flags);
}

/*
 * Hash the final GCM length block and finish the GHASH computation.
 *
 * The block holds the bit lengths of the associated data and of the
 * ciphertext as two big-endian 64-bit values.  The byte counts are widened to
 * 64 bits before being multiplied by 8, so lengths of 512 MiB and above do
 * not wrap in 32-bit arithmetic.
 *
 * iauth_tag serves both as the input (the length block) and as the buffer
 * that receives the hash result.  gcm_hash_len_done() is installed as the
 * completion callback; the return value is that of crypto_ahash_finup().
 */
static int gcm_hash_len(struct aead_request *req, u32 flags)
{
        struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
        struct ahash_request *ahreq = &pctx->u.ahreq;
        struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx;
        be128 lengths;

        lengths.a = cpu_to_be64((u64)req->assoclen * 8);
        lengths.b = cpu_to_be64((u64)gctx->cryptlen * 8);
        memcpy(pctx->iauth_tag, &lengths, 16);
        sg_init_one(&pctx->sg, pctx->iauth_tag, 16);
        ahash_request_set_callback(ahreq, flags, gcm_hash_len_done, req);
        ahash_request_set_crypt(ahreq, &pctx->sg,
                                pctx->iauth_tag, sizeof(lengths));

        return crypto_ahash_finup(ahreq);
}

/*
 * Last step of the hash chain: hand over to the completion hook stored in
 * the request's ghash context and return its result.
 */
static int gcm_hash_len_continue(struct aead_request *req, u32 flags)
{
        return crypto_gcm_reqctx(req)->ghash_ctx.complete(req, flags);
}

/*
 * Completion callback for the length-block hash.  On success the chain is
 * resumed with flags 0; the AEAD request is completed unless the next step
 * returned -EINPROGRESS.
 */
static void gcm_hash_len_done(void *data, int err)
{
        struct aead_request *areq = data;

        if (!err) {
                err = gcm_hash_len_continue(areq, 0);
                if (err == -EINPROGRESS)
                        return;
        }

        aead_request_complete(areq, err);
}

/*
 * Hash the length block, then run the completion step.  Any non-zero result
 * of gcm_hash_len() is returned as-is and the completion step is not run
 * here.
 */
static int gcm_hash_crypt_remain_continue(struct aead_request *req, u32 flags)
{
        int err = gcm_hash_len(req, flags);

        if (err)
                return err;
        return gcm_hash_len_continue(req, flags);
}

/*
 * Completion callback for the ciphertext padding hash.  On success the chain
 * is resumed with flags 0; the AEAD request is completed unless the next
 * step returned -EINPROGRESS.
 */
static void gcm_hash_crypt_remain_done(void *data, int err)
{
        struct aead_request *areq = data;

        if (!err) {
                err = gcm_hash_crypt_remain_continue(areq, 0);
                if (err == -EINPROGRESS)
                        return;
        }

        aead_request_complete(areq, err);
}

/*
 * After the ciphertext has been hashed: if its length is not a multiple of
 * 16, hash the padding first; then move on to the length block.  A non-zero
 * result from the padding step is returned immediately.
 */
static int gcm_hash_crypt_continue(struct aead_request *req, u32 flags)
{
        struct crypto_gcm_req_priv_ctx *rctx = crypto_gcm_reqctx(req);
        unsigned int pad = gcm_remain(rctx->ghash_ctx.cryptlen);
        int err;

        if (pad) {
                err = gcm_hash_remain(req, pad, gcm_hash_crypt_remain_done,
                                      flags);
                if (err)
                        return err;
        }

        return gcm_hash_crypt_remain_continue(req, flags);
}

/*
 * Completion callback for the ciphertext hash.  On success the chain is
 * resumed with flags 0; the AEAD request is completed unless the next step
 * returned -EINPROGRESS.
 */
static void gcm_hash_crypt_done(void *data, int err)
{
        struct aead_request *areq = data;

        if (!err) {
                err = gcm_hash_crypt_continue(areq, 0);
                if (err == -EINPROGRESS)
                        return;
        }

        aead_request_complete(areq, err);
}

/*
 * After the associated data (and its padding) has been hashed: hash the
 * ciphertext if there is any, otherwise skip straight to the length block.
 * A non-zero result from the ciphertext hash is returned immediately.
 */
static int gcm_hash_assoc_remain_continue(struct aead_request *req, u32 flags)
{
        struct crypto_gcm_ghash_ctx *hctx = &crypto_gcm_reqctx(req)->ghash_ctx;
        int err;

        if (!hctx->cryptlen)
                return gcm_hash_crypt_remain_continue(req, flags);

        err = gcm_hash_update(req, gcm_hash_crypt_done,
                              hctx->src, hctx->cryptlen, flags);
        if (err)
                return err;
        return gcm_hash_crypt_continue(req, flags);
}

/*
 * Completion callback for the associated-data padding hash.  On success the
 * chain is resumed with flags 0; the AEAD request is completed unless the
 * next step returned -EINPROGRESS.
 */
static void gcm_hash_assoc_remain_done(void *data, int err)
{
        struct aead_request *areq = data;

        if (!err) {
                err = gcm_hash_assoc_remain_continue(areq, 0);
                if (err == -EINPROGRESS)
                        return;
        }

        aead_request_complete(areq, err);
}

/*
 * Continue the hash chain once the associated data has been fed to the hash.
 * If gcm_remain() reports a non-zero remainder for req->assoclen, an extra
 * gcm_hash_remain() step is issued first; any non-zero status from it is
 * returned unchanged.
 */
static int gcm_hash_assoc_continue(struct aead_request *req, u32 flags)
{
        unsigned int left = gcm_remain(req->assoclen);
        int err;

        if (left) {
                err = gcm_hash_remain(req, left,
                                      gcm_hash_assoc_remain_done, flags);
                if (err)
                        return err;
        }

        return gcm_hash_assoc_remain_continue(req, flags);
}

/*
 * Completion callback for the associated-data hash update started in
 * gcm_hash_init_continue().  Resumes with gcm_hash_assoc_continue() on
 * success and completes the AEAD request unless work is still in flight.
 */
static void gcm_hash_assoc_done(void *data, int err)
{
        struct aead_request *areq = data;

        if (!err) {
                err = gcm_hash_assoc_continue(areq, 0);
                if (err == -EINPROGRESS)
                        return;
        }

        aead_request_complete(areq, err);
}

/*
 * First step after the hash has been initialised: feed req->assoclen bytes
 * of associated data from req->src to the hash.  With no associated data
 * the chain jumps straight to gcm_hash_assoc_remain_continue().
 */
static int gcm_hash_init_continue(struct aead_request *req, u32 flags)
{
        int err;

        if (!req->assoclen)
                return gcm_hash_assoc_remain_continue(req, flags);

        err = gcm_hash_update(req, gcm_hash_assoc_done,
                              req->src, req->assoclen, flags);
        if (err)
                return err;

        return gcm_hash_assoc_continue(req, flags);
}

/*
 * Completion callback for the crypto_ahash_init() call made in gcm_hash().
 * Resumes with gcm_hash_init_continue() on success and completes the AEAD
 * request unless that call reports -EINPROGRESS.
 */
static void gcm_hash_init_done(void *data, int err)
{
        struct aead_request *areq = data;

        if (!err) {
                err = gcm_hash_init_continue(areq, 0);
                if (err == -EINPROGRESS)
                        return;
        }

        aead_request_complete(areq, err);
}

/*
 * Entry point of the hash chain.  Binds the request's ahash sub-request to
 * the tfm's ghash transform, initialises the hash and, when that finishes
 * with status 0, carries on with gcm_hash_init_continue().  Any non-zero
 * status from crypto_ahash_init() is returned as is.
 */
static int gcm_hash(struct aead_request *req, u32 flags)
{
        struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
        struct ahash_request *ahreq = &pctx->u.ahreq;
        struct crypto_gcm_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
        int err;

        ahash_request_set_tfm(ahreq, ctx->ghash);
        ahash_request_set_callback(ahreq, flags, gcm_hash_init_done, req);

        err = crypto_ahash_init(ahreq);
        if (err)
                return err;

        return gcm_hash_init_continue(req, flags);
}

/*
 * Finish encryption: XOR the 16 bytes of pctx->iauth_tag into
 * pctx->auth_tag and write the first authsize bytes of the result into
 * req->dst at offset assoclen + cryptlen.  Always returns 0.
 */
static int gcm_enc_copy_hash(struct aead_request *req, u32 flags)
{
        struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        unsigned int tag_off = req->assoclen + req->cryptlen;
        unsigned int tag_len = crypto_aead_authsize(aead);
        u8 *tag = pctx->auth_tag;

        crypto_xor(tag, pctx->iauth_tag, 16);
        scatterwalk_map_and_copy(tag, req->dst, tag_off, tag_len, 1);

        return 0;
}

/*
 * Second half of encryption, run once the skcipher pass has finished: point
 * the hash context at the output data (the entry after the head of
 * pctx->src for in-place requests, of pctx->dst otherwise), record the
 * crypt length and the final step, then start the hash chain.
 */
static int gcm_encrypt_continue(struct aead_request *req, u32 flags)
{
        struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
        struct crypto_gcm_ghash_ctx *hctx = &pctx->ghash_ctx;

        hctx->src = sg_next(req->src == req->dst ? pctx->src : pctx->dst);
        hctx->cryptlen = req->cryptlen;
        hctx->complete = gcm_enc_copy_hash;

        return gcm_hash(req, flags);
}

/*
 * Completion callback for the skcipher encryption started in
 * crypto_gcm_encrypt().  Resumes with gcm_encrypt_continue() on success and
 * completes the AEAD request unless that call reports -EINPROGRESS.
 */
static void gcm_encrypt_done(void *data, int err)
{
        struct aead_request *areq = data;

        if (!err) {
                err = gcm_encrypt_continue(areq, 0);
                if (err == -EINPROGRESS)
                        return;
        }

        aead_request_complete(areq, err);
}

/*
 * AEAD ->encrypt handler: set up the request, run the skcipher over
 * req->cryptlen bytes and, if that finishes with status 0, continue with
 * the hash chain via gcm_encrypt_continue().  A non-zero skcipher status is
 * returned unchanged; gcm_encrypt_done() is the registered callback.
 */
static int crypto_gcm_encrypt(struct aead_request *req)
{
        struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
        struct skcipher_request *skreq = &pctx->u.skreq;
        u32 flags = aead_request_flags(req);
        int err;

        crypto_gcm_init_common(req);
        crypto_gcm_init_crypt(req, req->cryptlen);
        skcipher_request_set_callback(skreq, flags, gcm_encrypt_done, req);

        err = crypto_skcipher_encrypt(skreq);
        if (err)
                return err;

        return gcm_encrypt_continue(req, flags);
}

/*
 * Check the authentication tag of a decrypt request.
 *
 * pctx->iauth_tag is XORed into pctx->auth_tag (16 bytes) to form the
 * computed tag; pctx->iauth_tag is then reused as scratch space for the
 * authsize tag bytes read from req->src at offset
 * assoclen + (cryptlen - authsize).
 *
 * Returns 0 when both tags agree over authsize bytes, -EBADMSG otherwise.
 */
static int crypto_gcm_verify(struct aead_request *req)
{
        struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        u8 *computed = pctx->auth_tag;
        u8 *received = pctx->iauth_tag;
        unsigned int authsize = crypto_aead_authsize(aead);
        unsigned int tag_off = req->assoclen + (req->cryptlen - authsize);

        crypto_xor(computed, received, 16);
        scatterwalk_map_and_copy(received, req->src, tag_off, authsize, 0);

        if (crypto_memneq(received, computed, authsize))
                return -EBADMSG;

        return 0;
}

/*
 * Completion callback for the skcipher decryption started in
 * gcm_dec_hash_continue().  A successful decryption is followed by tag
 * verification; the AEAD request is then completed with the final status.
 */
static void gcm_decrypt_done(void *data, int err)
{
        struct aead_request *areq = data;
        int status = err;

        if (status == 0)
                status = crypto_gcm_verify(areq);

        aead_request_complete(areq, status);
}

/*
 * Final step of the decrypt hash chain (installed as gctx->complete by
 * crypto_gcm_decrypt()): run the skcipher over gctx->cryptlen bytes and,
 * when it finishes with status 0, verify the tag.  A non-zero skcipher
 * status is returned unchanged; gcm_decrypt_done() is the registered
 * callback.
 */
static int gcm_dec_hash_continue(struct aead_request *req, u32 flags)
{
        struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
        struct skcipher_request *skreq = &pctx->u.skreq;
        struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx;
        int err;

        crypto_gcm_init_crypt(req, gctx->cryptlen);
        skcipher_request_set_callback(skreq, flags, gcm_decrypt_done, req);

        err = crypto_skcipher_decrypt(skreq);
        if (err)
                return err;

        return crypto_gcm_verify(req);
}

/*
 * AEAD ->decrypt handler.  The hash chain runs first, over the data that
 * follows the head of pctx->src and with the tag length subtracted from
 * req->cryptlen; gcm_dec_hash_continue() then performs the actual
 * decryption and tag check.
 */
static int crypto_gcm_decrypt(struct aead_request *req)
{
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
        struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx;
        /* Length of the data without the trailing authsize bytes. */
        unsigned int datalen = req->cryptlen - crypto_aead_authsize(aead);
        u32 flags = aead_request_flags(req);

        crypto_gcm_init_common(req);

        gctx->src = sg_next(pctx->src);
        gctx->cryptlen = datalen;
        gctx->complete = gcm_dec_hash_continue;

        return gcm_hash(req, flags);
}

/*
 * AEAD ->init handler: instantiate the ghash and ctr transforms from the
 * instance's spawns and size the per-request context so that its `u`
 * member can hold the larger of an skcipher or an ahash sub-request,
 * including the respective child's own request size.
 *
 * Returns 0 on success or the PTR_ERR() of the spawn that failed; the
 * ghash transform is released again if the ctr transform cannot be created.
 */
static int crypto_gcm_init_tfm(struct crypto_aead *tfm)
{
        struct aead_instance *inst = aead_alg_instance(tfm);
        struct gcm_instance_ctx *ictx = aead_instance_ctx(inst);
        struct crypto_gcm_ctx *ctx = crypto_aead_ctx(tfm);
        struct crypto_skcipher *ctr;
        struct crypto_ahash *ghash;
        unsigned long align;

        ghash = crypto_spawn_ahash(&ictx->ghash);
        if (IS_ERR(ghash))
                return PTR_ERR(ghash);

        ctr = crypto_spawn_skcipher(&ictx->ctr);
        if (IS_ERR(ctr)) {
                crypto_free_ahash(ghash);
                return PTR_ERR(ctr);
        }

        ctx->ghash = ghash;
        ctx->ctr = ctr;

        /* Keep only the alignment bits beyond the tfm context alignment. */
        align = crypto_aead_alignmask(tfm);
        align &= ~(crypto_tfm_ctx_alignment() - 1);

        crypto_aead_set_reqsize(tfm,
                align + offsetof(struct crypto_gcm_req_priv_ctx, u) +
                max(sizeof(struct skcipher_request) +
                    crypto_skcipher_reqsize(ctr),
                    sizeof(struct ahash_request) +
                    crypto_ahash_reqsize(ghash)));

        return 0;
}

/* AEAD ->exit handler: release the ghash and ctr transforms of this tfm. */
static void crypto_gcm_exit_tfm(struct crypto_aead *tfm)
{
        struct crypto_gcm_ctx *gcm = crypto_aead_ctx(tfm);

        crypto_free_ahash(gcm->ghash);
        crypto_free_skcipher(gcm->ctr);
}

/*
 * Instance destructor: drop the ctr and ghash spawns held in the instance
 * context, then free the instance itself.
 */
static void crypto_gcm_free(struct aead_instance *inst)
{
        struct gcm_instance_ctx *ictx = aead_instance_ctx(inst);

        crypto_drop_skcipher(&ictx->ctr);
        crypto_drop_ahash(&ictx->ghash);

        kfree(inst);
}

/*
 * Build and register a GCM AEAD instance on top of the named ctr and ghash
 * algorithms.  Shared by the "gcm" and "gcm_base" template constructors.
 *
 * @tmpl:       template the instance is registered under
 * @tb:         template attributes, checked for CRYPTO_ALG_TYPE_AEAD
 * @ctr_name:   name of the skcipher to grab
 * @ghash_name: name of the ahash to grab
 *
 * Returns 0 on success, -ENOMEM if the instance cannot be allocated,
 * -EINVAL if either algorithm does not have the required shape,
 * -ENAMETOOLONG if a generated name does not fit, or the error reported by
 * one of the crypto helpers.  Every failure after allocation releases the
 * instance through crypto_gcm_free().
 */
static int crypto_gcm_create_common(struct crypto_template *tmpl,
                                    struct rtattr **tb,
                                    const char *ctr_name,
                                    const char *ghash_name)
{
        struct skcipher_alg_common *ctr;
        u32 mask;
        struct aead_instance *inst;
        struct gcm_instance_ctx *ctx;
        struct hash_alg_common *ghash;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
        if (err)
                return err;

        /* The instance context (both spawns) lives right behind the instance. */
        inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;
        ctx = aead_instance_ctx(inst);

        err = crypto_grab_ahash(&ctx->ghash, aead_crypto_instance(inst),
                                ghash_name, 0, mask);
        if (err)
                goto err_free_inst;
        ghash = crypto_spawn_ahash_alg(&ctx->ghash);

        /* The hash must be named "ghash" and produce a 16-byte digest. */
        err = -EINVAL;
        if (strcmp(ghash->base.cra_name, "ghash") != 0 ||
            ghash->digestsize != 16)
                goto err_free_inst;

        err = crypto_grab_skcipher(&ctx->ctr, aead_crypto_instance(inst),
                                   ctr_name, 0, mask);
        if (err)
                goto err_free_inst;
        ctr = crypto_spawn_skcipher_alg_common(&ctx->ctr);

        /* The skcipher algorithm must be CTR mode, using 16-byte blocks. */
        err = -EINVAL;
        if (strncmp(ctr->base.cra_name, "ctr(", 4) != 0 ||
            ctr->ivsize != 16 || ctr->base.cra_blocksize != 1)
                goto err_free_inst;

        /*
         * cra_name + 4 skips the "ctr(" prefix verified above but keeps the
         * cipher's closing parenthesis, which is why the format string has
         * no ")" of its own.
         */
        err = -ENAMETOOLONG;
        if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
                     "gcm(%s", ctr->base.cra_name + 4) >= CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                     "gcm_base(%s,%s)", ctr->base.cra_driver_name,
                     ghash->base.cra_driver_name) >=
            CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        /* Priority is the average of the two underlying algorithms. */
        inst->alg.base.cra_priority = (ghash->base.cra_priority +
                                       ctr->base.cra_priority) / 2;
        inst->alg.base.cra_blocksize = 1;
        inst->alg.base.cra_alignmask = ctr->base.cra_alignmask;
        inst->alg.base.cra_ctxsize = sizeof(struct crypto_gcm_ctx);
        inst->alg.ivsize = GCM_AES_IV_SIZE;
        inst->alg.chunksize = ctr->chunksize;
        inst->alg.maxauthsize = 16;
        inst->alg.init = crypto_gcm_init_tfm;
        inst->alg.exit = crypto_gcm_exit_tfm;
        inst->alg.setkey = crypto_gcm_setkey;
        inst->alg.setauthsize = crypto_gcm_setauthsize;
        inst->alg.encrypt = crypto_gcm_encrypt;
        inst->alg.decrypt = crypto_gcm_decrypt;

        inst->free = crypto_gcm_free;

        /*
         * The error label sits inside the failure branch so that a failed
         * registration and every earlier "goto err_free_inst" share the
         * same cleanup, while success falls through to return 0.
         */
        err = aead_register_instance(tmpl, inst);
        if (err) {
err_free_inst:
                crypto_gcm_free(inst);
        }
        return err;
}

/*
 * Constructor for the "gcm" template: wrap the cipher name taken from
 * tb[1] as "ctr(<cipher>)" and pair it with the "ghash" hash.
 *
 * Returns the error encoded in the attribute name, -ENAMETOOLONG if the
 * ctr name does not fit in CRYPTO_MAX_ALG_NAME bytes, or the result of
 * crypto_gcm_create_common().
 */
static int crypto_gcm_create(struct crypto_template *tmpl, struct rtattr **tb)
{
        const char *cipher_name = crypto_attr_alg_name(tb[1]);
        char ctr_name[CRYPTO_MAX_ALG_NAME];
        int len;

        if (IS_ERR(cipher_name))
                return PTR_ERR(cipher_name);

        len = snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)", cipher_name);
        if (len >= CRYPTO_MAX_ALG_NAME)
                return -ENAMETOOLONG;

        return crypto_gcm_create_common(tmpl, tb, ctr_name, "ghash");
}

/*
 * Constructor for the "gcm_base" template: both the ctr algorithm (tb[1])
 * and the ghash algorithm (tb[2]) are named explicitly by the caller.
 */
static int crypto_gcm_base_create(struct crypto_template *tmpl,
                                  struct rtattr **tb)
{
        const char *ctr_name = crypto_attr_alg_name(tb[1]);
        const char *ghash_name;

        if (IS_ERR(ctr_name))
                return PTR_ERR(ctr_name);

        ghash_name = crypto_attr_alg_name(tb[2]);
        if (IS_ERR(ghash_name))
                return PTR_ERR(ghash_name);

        return crypto_gcm_create_common(tmpl, tb, ctr_name, ghash_name);
}

/*
 * AEAD ->setkey handler for rfc4106.  The last four bytes of the supplied
 * key are stored as the nonce; the remaining bytes are passed on as the
 * child's key, after mirroring the parent's request flags onto the child.
 *
 * Returns -EINVAL for keys shorter than four bytes, otherwise the result
 * of the child's setkey.
 */
static int crypto_rfc4106_setkey(struct crypto_aead *parent, const u8 *key,
                                 unsigned int keylen)
{
        struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(parent);
        struct crypto_aead *child = ctx->child;
        unsigned int child_keylen;

        if (keylen < 4)
                return -EINVAL;

        child_keylen = keylen - 4;
        memcpy(ctx->nonce, key + child_keylen, 4);

        crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK);
        crypto_aead_set_flags(child, crypto_aead_get_flags(parent) &
                                     CRYPTO_TFM_REQ_MASK);

        return crypto_aead_setkey(child, key, child_keylen);
}

/*
 * AEAD ->setauthsize handler for rfc4106: validate the tag length with
 * crypto_rfc4106_check_authsize() and, if accepted, apply it to the child.
 */
static int crypto_rfc4106_setauthsize(struct crypto_aead *parent,
                                      unsigned int authsize)
{
        struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(parent);

        return crypto_rfc4106_check_authsize(authsize) ?:
               crypto_aead_setauthsize(ctx->child, authsize);
}

/*
 * Translate an rfc4106 request into a request for the child AEAD.
 *
 * A scratch buffer located behind the child's request context (aligned to
 * the child's alignmask) holds, in order:
 *   - the IV handed to the child: the 4-byte nonce from the tfm context
 *     followed by the 8 bytes of req->iv;
 *   - at offset GCM_AES_IV_SIZE, a copy of the first (assoclen - 8) bytes
 *     of req->src, which becomes the child's associated data.
 *
 * The child's source (and, for out-of-place requests, destination)
 * scatterlist starts with that AAD copy and continues with the caller's
 * list fast-forwarded past all req->assoclen bytes.
 *
 * NOTE(review): the last 8 bytes of the caller's AAD are dropped from what
 * the child authenticates - presumably they carry the IV; confirm against
 * the callers.
 *
 * Returns the prepared sub-request; the caller submits it.
 */
static struct aead_request *crypto_rfc4106_crypt(struct aead_request *req)
{
        struct crypto_rfc4106_req_ctx *rctx = aead_request_ctx(req);
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(aead);
        struct aead_request *subreq = &rctx->subreq;
        struct crypto_aead *child = ctx->child;
        struct scatterlist *sg;
        u8 *iv = PTR_ALIGN((u8 *)(subreq + 1) + crypto_aead_reqsize(child),
                           crypto_aead_alignmask(child) + 1);

        /* Linearise the AAD (minus its last 8 bytes) right after the IV. */
        scatterwalk_map_and_copy(iv + GCM_AES_IV_SIZE, req->src, 0, req->assoclen - 8, 0);

        memcpy(iv, ctx->nonce, 4);
        memcpy(iv + 4, req->iv, 8);

        sg_init_table(rctx->src, 3);
        sg_set_buf(rctx->src, iv + GCM_AES_IV_SIZE, req->assoclen - 8);
        sg = scatterwalk_ffwd(rctx->src + 1, req->src, req->assoclen);
        /* Chain only when the helper returned a list other than our own entry. */
        if (sg != rctx->src + 1)
                sg_chain(rctx->src, 2, sg);

        if (req->src != req->dst) {
                sg_init_table(rctx->dst, 3);
                sg_set_buf(rctx->dst, iv + GCM_AES_IV_SIZE, req->assoclen - 8);
                sg = scatterwalk_ffwd(rctx->dst + 1, req->dst, req->assoclen);
                if (sg != rctx->dst + 1)
                        sg_chain(rctx->dst, 2, sg);
        }

        /* The child completes straight into the caller's callback. */
        aead_request_set_tfm(subreq, child);
        aead_request_set_callback(subreq, req->base.flags, req->base.complete,
                                  req->base.data);
        aead_request_set_crypt(subreq, rctx->src,
                               req->src == req->dst ? rctx->src : rctx->dst,
                               req->cryptlen, iv);
        aead_request_set_ad(subreq, req->assoclen - 8);

        return subreq;
}

/*
 * AEAD ->encrypt handler for rfc4106: reject associated-data lengths that
 * crypto_ipsec_check_assoclen() refuses, then encrypt through the child
 * using the request prepared by crypto_rfc4106_crypt().
 */
static int crypto_rfc4106_encrypt(struct aead_request *req)
{
        int err = crypto_ipsec_check_assoclen(req->assoclen);

        if (err)
                return err;

        return crypto_aead_encrypt(crypto_rfc4106_crypt(req));
}

/*
 * AEAD ->decrypt handler for rfc4106: reject associated-data lengths that
 * crypto_ipsec_check_assoclen() refuses, then decrypt through the child
 * using the request prepared by crypto_rfc4106_crypt().
 */
static int crypto_rfc4106_decrypt(struct aead_request *req)
{
        int err = crypto_ipsec_check_assoclen(req->assoclen);

        if (err)
                return err;

        return crypto_aead_decrypt(crypto_rfc4106_crypt(req));
}

/*
 * AEAD ->init handler for rfc4106: instantiate the child AEAD and reserve
 * request space for our own request context, the child's request context
 * (rounded up to the tfm context alignment), alignment slack and 24 bytes
 * of scratch space.
 *
 * NOTE(review): the 24 scratch bytes are presumably the child's IV plus
 * the linearised AAD that crypto_rfc4106_crypt() stores after it - confirm.
 *
 * Returns 0 on success or the PTR_ERR() from crypto_spawn_aead().
 */
static int crypto_rfc4106_init_tfm(struct crypto_aead *tfm)
{
        struct aead_instance *inst = aead_alg_instance(tfm);
        struct crypto_aead_spawn *spawn = aead_instance_ctx(inst);
        struct crypto_rfc4106_ctx *ctx = crypto_aead_ctx(tfm);
        struct crypto_aead *child = crypto_spawn_aead(spawn);
        unsigned long slack;

        if (IS_ERR(child))
                return PTR_ERR(child);

        ctx->child = child;

        /* Keep only the alignment bits beyond the tfm context alignment. */
        slack = crypto_aead_alignmask(child);
        slack &= ~(crypto_tfm_ctx_alignment() - 1);

        crypto_aead_set_reqsize(
                tfm,
                sizeof(struct crypto_rfc4106_req_ctx) +
                ALIGN(crypto_aead_reqsize(child), crypto_tfm_ctx_alignment()) +
                slack + 24);

        return 0;
}

/* AEAD ->exit handler for rfc4106: release the child AEAD transform. */
static void crypto_rfc4106_exit_tfm(struct crypto_aead *tfm)
{
        struct crypto_rfc4106_ctx *rfc = crypto_aead_ctx(tfm);

        crypto_free_aead(rfc->child);
}

/*
 * Instance destructor for rfc4106: the instance context is the AEAD spawn
 * itself, so drop it and free the instance.
 */
static void crypto_rfc4106_free(struct aead_instance *inst)
{
        struct crypto_aead_spawn *spawn = aead_instance_ctx(inst);

        crypto_drop_aead(spawn);
        kfree(inst);
}

/*
 * Constructor for the "rfc4106" template: wrap the AEAD named by tb[1].
 *
 * The wrapped algorithm must use a GCM_AES_IV_SIZE IV and a block size
 * of 1.  Returns 0 on success, -ENOMEM if the instance cannot be
 * allocated, -EINVAL if the wrapped algorithm has the wrong shape,
 * -ENAMETOOLONG if a generated name does not fit, or the error reported by
 * one of the crypto helpers.  Every failure after allocation releases the
 * instance through crypto_rfc4106_free().
 */
static int crypto_rfc4106_create(struct crypto_template *tmpl,
                                 struct rtattr **tb)
{
        u32 mask;
        struct aead_instance *inst;
        struct crypto_aead_spawn *spawn;
        struct aead_alg *alg;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
        if (err)
                return err;

        /* The spawn is the whole instance context, stored behind the instance. */
        inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;

        spawn = aead_instance_ctx(inst);
        /*
         * NOTE(review): the attribute name is passed on without an IS_ERR()
         * check here; presumably crypto_grab_aead() rejects error pointers -
         * confirm.
         */
        err = crypto_grab_aead(spawn, aead_crypto_instance(inst),
                               crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;

        alg = crypto_spawn_aead_alg(spawn);

        err = -EINVAL;

        /* Underlying IV size must be 12. */
        if (crypto_aead_alg_ivsize(alg) != GCM_AES_IV_SIZE)
                goto err_free_inst;

        /* Not a stream cipher? */
        if (alg->base.cra_blocksize != 1)
                goto err_free_inst;

        err = -ENAMETOOLONG;
        if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
                     "rfc4106(%s)", alg->base.cra_name) >=
            CRYPTO_MAX_ALG_NAME ||
            snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                     "rfc4106(%s)", alg->base.cra_driver_name) >=
            CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        /* Priority, alignment, chunk size and max tag size come from the child. */
        inst->alg.base.cra_priority = alg->base.cra_priority;
        inst->alg.base.cra_blocksize = 1;
        inst->alg.base.cra_alignmask = alg->base.cra_alignmask;

        inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4106_ctx);

        inst->alg.ivsize = GCM_RFC4106_IV_SIZE;
        inst->alg.chunksize = crypto_aead_alg_chunksize(alg);
        inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg);

        inst->alg.init = crypto_rfc4106_init_tfm;
        inst->alg.exit = crypto_rfc4106_exit_tfm;

        inst->alg.setkey = crypto_rfc4106_setkey;
        inst->alg.setauthsize = crypto_rfc4106_setauthsize;
        inst->alg.encrypt = crypto_rfc4106_encrypt;
        inst->alg.decrypt = crypto_rfc4106_decrypt;

        inst->free = crypto_rfc4106_free;

        /*
         * The error label sits inside the failure branch so that a failed
         * registration and every earlier "goto err_free_inst" share the
         * same cleanup, while success falls through to return 0.
         */
        err = aead_register_instance(tmpl, inst);
        if (err) {
err_free_inst:
                crypto_rfc4106_free(inst);
        }
        return err;
}

/*
 * AEAD ->setkey handler for rfc4543.  The last four bytes of the supplied
 * key are stored as the nonce; the remaining bytes are passed on as the
 * child's key, after mirroring the parent's request flags onto the child.
 *
 * Returns -EINVAL for keys shorter than four bytes, otherwise the result
 * of the child's setkey.
 */
static int crypto_rfc4543_setkey(struct crypto_aead *parent, const u8 *key,
                                 unsigned int keylen)
{
        struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent);
        struct crypto_aead *child = ctx->child;
        unsigned int child_keylen;

        if (keylen < 4)
                return -EINVAL;

        child_keylen = keylen - 4;
        memcpy(ctx->nonce, key + child_keylen, 4);

        crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK);
        crypto_aead_set_flags(child, crypto_aead_get_flags(parent) &
                                     CRYPTO_TFM_REQ_MASK);

        return crypto_aead_setkey(child, key, child_keylen);
}

/*
 * AEAD ->setauthsize handler for rfc4543: only a 16-byte tag is accepted
 * (-EINVAL otherwise); the size is then applied to the child.
 */
static int crypto_rfc4543_setauthsize(struct crypto_aead *parent,
                                      unsigned int authsize)
{
        struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent);

        if (authsize == 16)
                return crypto_aead_setauthsize(ctx->child, authsize);

        return -EINVAL;
}

/*
 * Common encrypt/decrypt path for rfc4543.
 *
 * For out-of-place requests the data (everything except the tag on
 * decrypt) is first copied from req->src to req->dst.  The child is then
 * given a crypt length of 0 on encrypt or authsize on decrypt, with all
 * remaining bytes of the request declared as associated data.  The child's
 * IV is the 4-byte nonce from the tfm context followed by the 8 bytes of
 * req->iv, kept in a scratch area behind the child's request context.
 *
 * @enc: true to encrypt, false to decrypt.
 *
 * Returns the status of the child operation.
 */
static int crypto_rfc4543_crypt(struct aead_request *req, bool enc)
{
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
        struct crypto_rfc4543_req_ctx *rctx = aead_request_ctx(req);
        struct aead_request *subreq = &rctx->subreq;
        unsigned int authsize = crypto_aead_authsize(aead);
        /* What the child sees as its crypt part: the tag, on decrypt only. */
        unsigned int sub_cryptlen = enc ? 0 : authsize;
        u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child),
                           crypto_aead_alignmask(ctx->child) + 1);

        if (req->src != req->dst)
                memcpy_sglist(req->dst, req->src,
                              req->assoclen + req->cryptlen - sub_cryptlen);

        memcpy(iv, ctx->nonce, 4);
        memcpy(iv + 4, req->iv, 8);

        /* The child completes straight into the caller's callback. */
        aead_request_set_tfm(subreq, ctx->child);
        aead_request_set_callback(subreq, req->base.flags,
                                  req->base.complete, req->base.data);
        aead_request_set_crypt(subreq, req->src, req->dst, sub_cryptlen, iv);
        aead_request_set_ad(subreq, req->assoclen + req->cryptlen -
                                    subreq->cryptlen);

        if (enc)
                return crypto_aead_encrypt(subreq);

        return crypto_aead_decrypt(subreq);
}

/*
 * AEAD ->encrypt handler for rfc4543: reject associated-data lengths that
 * crypto_ipsec_check_assoclen() refuses, then run the common path.
 */
static int crypto_rfc4543_encrypt(struct aead_request *req)
{
        int err = crypto_ipsec_check_assoclen(req->assoclen);

        if (err)
                return err;

        return crypto_rfc4543_crypt(req, true);
}

/*
 * AEAD ->decrypt handler for rfc4543: reject associated-data lengths that
 * crypto_ipsec_check_assoclen() refuses, then run the common path.
 */
static int crypto_rfc4543_decrypt(struct aead_request *req)
{
        int err = crypto_ipsec_check_assoclen(req->assoclen);

        if (err)
                return err;

        return crypto_rfc4543_crypt(req, false);
}

/*
 * AEAD ->init handler for rfc4543: instantiate the child AEAD and reserve
 * request space for our own request context, the child's request context
 * (rounded up to the tfm context alignment), alignment slack and a
 * GCM_AES_IV_SIZE-byte IV buffer.
 *
 * Returns 0 on success or the PTR_ERR() from crypto_spawn_aead().
 */
static int crypto_rfc4543_init_tfm(struct crypto_aead *tfm)
{
        struct aead_instance *inst = aead_alg_instance(tfm);
        struct crypto_rfc4543_instance_ctx *ictx = aead_instance_ctx(inst);
        struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(tfm);
        struct crypto_aead *child = crypto_spawn_aead(&ictx->aead);
        unsigned long slack;

        if (IS_ERR(child))
                return PTR_ERR(child);

        ctx->child = child;

        /* Keep only the alignment bits beyond the tfm context alignment. */
        slack = crypto_aead_alignmask(child);
        slack &= ~(crypto_tfm_ctx_alignment() - 1);

        crypto_aead_set_reqsize(
                tfm,
                sizeof(struct crypto_rfc4543_req_ctx) +
                ALIGN(crypto_aead_reqsize(child), crypto_tfm_ctx_alignment()) +
                slack + GCM_AES_IV_SIZE);

        return 0;
}

/* AEAD ->exit handler for rfc4543: release the child AEAD transform. */
static void crypto_rfc4543_exit_tfm(struct crypto_aead *tfm)
{
        struct crypto_rfc4543_ctx *rfc = crypto_aead_ctx(tfm);

        crypto_free_aead(rfc->child);
}

/*
 * Instance destructor for rfc4543: drop the AEAD spawn held in the
 * instance context, then free the instance itself.
 */
static void crypto_rfc4543_free(struct aead_instance *inst)
{
        struct crypto_rfc4543_instance_ctx *ictx = aead_instance_ctx(inst);

        crypto_drop_aead(&ictx->aead);
        kfree(inst);
}

/*
 * Constructor for the "rfc4543" template: wrap the AEAD named by tb[1].
 *
 * The wrapped algorithm must use a GCM_AES_IV_SIZE IV and a block size
 * of 1.  Returns 0 on success, -ENOMEM if the instance cannot be
 * allocated, -EINVAL if the wrapped algorithm has the wrong shape,
 * -ENAMETOOLONG if a generated name does not fit, or the error reported by
 * one of the crypto helpers.  Every failure after allocation releases the
 * instance through crypto_rfc4543_free().
 */
static int crypto_rfc4543_create(struct crypto_template *tmpl,
                                struct rtattr **tb)
{
        u32 mask;
        struct aead_instance *inst;
        struct aead_alg *alg;
        struct crypto_rfc4543_instance_ctx *ctx;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
        if (err)
                return err;

        /* The instance context lives right behind the instance. */
        inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;

        ctx = aead_instance_ctx(inst);
        /*
         * NOTE(review): the attribute name is passed on without an IS_ERR()
         * check here; presumably crypto_grab_aead() rejects error pointers -
         * confirm.
         */
        err = crypto_grab_aead(&ctx->aead, aead_crypto_instance(inst),
                               crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;

        alg = crypto_spawn_aead_alg(&ctx->aead);

        err = -EINVAL;

        /* Underlying IV size must be 12. */
        if (crypto_aead_alg_ivsize(alg) != GCM_AES_IV_SIZE)
                goto err_free_inst;

        /* Not a stream cipher? */
        if (alg->base.cra_blocksize != 1)
                goto err_free_inst;

        err = -ENAMETOOLONG;
        if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
                     "rfc4543(%s)", alg->base.cra_name) >=
            CRYPTO_MAX_ALG_NAME ||
            snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                     "rfc4543(%s)", alg->base.cra_driver_name) >=
            CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        /* Priority, alignment, chunk size and max tag size come from the child. */
        inst->alg.base.cra_priority = alg->base.cra_priority;
        inst->alg.base.cra_blocksize = 1;
        inst->alg.base.cra_alignmask = alg->base.cra_alignmask;

        inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4543_ctx);

        inst->alg.ivsize = GCM_RFC4543_IV_SIZE;
        inst->alg.chunksize = crypto_aead_alg_chunksize(alg);
        inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg);

        inst->alg.init = crypto_rfc4543_init_tfm;
        inst->alg.exit = crypto_rfc4543_exit_tfm;

        inst->alg.setkey = crypto_rfc4543_setkey;
        inst->alg.setauthsize = crypto_rfc4543_setauthsize;
        inst->alg.encrypt = crypto_rfc4543_encrypt;
        inst->alg.decrypt = crypto_rfc4543_decrypt;

        inst->free = crypto_rfc4543_free;

        /*
         * The error label sits inside the failure branch so that a failed
         * registration and every earlier "goto err_free_inst" share the
         * same cleanup, while success falls through to return 0.
         */
        err = aead_register_instance(tmpl, inst);
        if (err) {
err_free_inst:
                crypto_rfc4543_free(inst);
        }
        return err;
}

/*
 * Templates provided by this module, registered and unregistered as a
 * group by the module init/exit functions.  Each entry maps a template
 * name to the constructor that builds instances of it.
 */
static struct crypto_template crypto_gcm_tmpls[] = {
        {
                /* GCM from an explicitly named ctr skcipher and ghash. */
                .name = "gcm_base",
                .create = crypto_gcm_base_create,
                .module = THIS_MODULE,
        }, {
                /* GCM from a cipher name, using "ctr(<cipher>)" and "ghash". */
                .name = "gcm",
                .create = crypto_gcm_create,
                .module = THIS_MODULE,
        }, {
                /* rfc4106 wrapper around an underlying AEAD. */
                .name = "rfc4106",
                .create = crypto_rfc4106_create,
                .module = THIS_MODULE,
        }, {
                /* rfc4543 wrapper around an underlying AEAD. */
                .name = "rfc4543",
                .create = crypto_rfc4543_create,
                .module = THIS_MODULE,
        },
};

/*
 * Module init: allocate the zero-filled gcm_zeroes buffer, wrap its buf
 * member in a single-entry scatterlist and register all GCM templates.
 *
 * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
 * registration error (the buffer is freed again in that case).
 */
static int __init crypto_gcm_module_init(void)
{
        int err;

        gcm_zeroes = kzalloc(sizeof(*gcm_zeroes), GFP_KERNEL);
        if (!gcm_zeroes)
                return -ENOMEM;

        sg_init_one(&gcm_zeroes->sg, gcm_zeroes->buf, sizeof(gcm_zeroes->buf));

        err = crypto_register_templates(crypto_gcm_tmpls,
                                        ARRAY_SIZE(crypto_gcm_tmpls));
        if (!err)
                return 0;

        kfree(gcm_zeroes);
        return err;
}

/*
 * Module exit: undo crypto_gcm_module_init() in reverse order.
 *
 * The templates are unregistered first so that nothing created from them
 * can still be around when the shared gcm_zeroes buffer, which was set up
 * before the templates were registered, is freed.
 */
static void __exit crypto_gcm_module_exit(void)
{
        crypto_unregister_templates(crypto_gcm_tmpls,
                                    ARRAY_SIZE(crypto_gcm_tmpls));
        kfree(gcm_zeroes);
}

/*
 * Module entry points and metadata.  One crypto alias is declared per
 * template name in crypto_gcm_tmpls.
 */
module_init(crypto_gcm_module_init);
module_exit(crypto_gcm_module_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Galois/Counter Mode");
MODULE_AUTHOR("Mikko Herranen <mh1@iki.fi>");
MODULE_ALIAS_CRYPTO("gcm_base");
MODULE_ALIAS_CRYPTO("rfc4106");
MODULE_ALIAS_CRYPTO("rfc4543");
MODULE_ALIAS_CRYPTO("gcm");






















































































































































































































































   31 


   31 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xfrm_nat_keepalive.c
 *
 * (c) 2024 Eyal Birger <eyal.birger@gmail.com>
 */

#include <net/inet_common.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>

/*
 * Per-CPU sockets used to transmit keepalives, one set per address family.
 * Each comes with a local lock (bh_lock) that the send paths hold while
 * they use the socket.
 */
static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
#if IS_ENABLED(CONFIG_IPV6)
static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
#endif

/*
 * Snapshot of the xfrm state fields needed to send one keepalive packet.
 * Filled by nat_keepalive_init() so that the packet can be built without
 * touching the state again.
 */
struct nat_keepalive {
        struct net *net;        /* network namespace of the state */
        u16 family;             /* address family, from x->props.family */
        xfrm_address_t saddr;   /* source address, from x->props.saddr */
        xfrm_address_t daddr;   /* destination address, from x->id.daddr */
        __be16 encap_sport;     /* encapsulation source port */
        __be16 encap_dport;     /* encapsulation destination port */
        __u32 smark;            /* mark applied to the outgoing skb */
};

static void nat_keepalive_init(struct nat_keepalive *ka, struct xfrm_state *x)
{
        ka->net = xs_net(x);
        ka->family = x->props.family;
        ka->saddr = x->props.saddr;
        ka->daddr = x->id.daddr;
        ka->encap_sport = x->encap->encap_sport;
        ka->encap_dport = x->encap->encap_dport;
        ka->smark = xfrm_smark_get(0, x);
}

/*
 * Route and transmit an already built UDP keepalive over IPv4.
 *
 * @skb: packet starting at the UDP header; skb->mark is used for routing
 * @ka:  addresses, ports and namespace of the keepalive
 *
 * Returns the PTR_ERR() of the route lookup if no route is found, otherwise
 * the result of ip_build_and_send_pkt().
 */
static int nat_keepalive_send_ipv4(struct sk_buff *skb,
                                   struct nat_keepalive *ka)
{
        struct net *net = ka->net;
        struct flowi4 fl4;
        struct rtable *rt;
        struct sock *sk;
        __u8 tos = 0;
        int err;

        flowi4_init_output(&fl4, 0 /* oif */, skb->mark, tos,
                           RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0,
                           ka->daddr.a4, ka->saddr.a4, ka->encap_dport,
                           ka->encap_sport, sock_net_uid(net, NULL));

        rt = ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        skb_dst_set(skb, &rt->dst);

        /*
         * The per-CPU socket is pointed at the keepalive's namespace only
         * for the duration of the send and reset to init_net afterwards,
         * all under the per-CPU lock.
         */
        local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
        sk = this_cpu_read(nat_keepalive_sk_ipv4.sock);
        sock_net_set(sk, net);
        err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos);
        sock_net_set(sk, &init_net);
        local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
        return err;
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Checksum, route and transmit an already built UDP keepalive over IPv6.
 *
 * @skb: packet starting at the UDP header; skb->mark is used for routing
 * @ka:  addresses, ports and namespace of the keepalive
 * @uh:  UDP header inside @skb whose checksum field is filled in here
 *
 * Returns the PTR_ERR() of the destination lookup if it fails, otherwise
 * the result of ip6_xmit().
 */
static int nat_keepalive_send_ipv6(struct sk_buff *skb,
                                   struct nat_keepalive *ka,
                                   struct udphdr *uh)
{
        struct net *net = ka->net;
        struct dst_entry *dst;
        struct flowi6 fl6;
        struct sock *sk;
        __wsum csum;
        int err;

        /* A computed checksum of 0 is sent as CSUM_MANGLED_0 instead. */
        csum = skb_checksum(skb, 0, skb->len, 0);
        uh->check = csum_ipv6_magic(&ka->saddr.in6, &ka->daddr.in6,
                                    skb->len, IPPROTO_UDP, csum);
        if (uh->check == 0)
                uh->check = CSUM_MANGLED_0;

        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_mark = skb->mark;
        fl6.saddr = ka->saddr.in6;
        fl6.daddr = ka->daddr.in6;
        fl6.flowi6_proto = IPPROTO_UDP;
        fl6.fl6_sport = ka->encap_sport;
        fl6.fl6_dport = ka->encap_dport;

        /*
         * The per-CPU socket is pointed at the keepalive's namespace only
         * while the lock is held; every exit path below resets it to
         * init_net before unlocking, including a failed route lookup.
         */
        local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
        sk = this_cpu_read(nat_keepalive_sk_ipv6.sock);
        sock_net_set(sk, net);
        dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL);
        if (IS_ERR(dst)) {
                err = PTR_ERR(dst);
                goto out_unlock;
        }

        skb_dst_set(skb, dst);
        err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0);

out_unlock:
        sock_net_set(sk, &init_net);
        local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
        return err;
}
#endif

/*
 * Build and send one keepalive: a UDP datagram from encap_sport to
 * encap_dport carrying the single payload byte 0xFF.
 *
 * Headroom is reserved for the larger of the IPv4/IPv6 headers plus the
 * UDP header.  The skb is freed here when the family-specific send
 * function reports an error or the family is not supported; allocation
 * failure is silently ignored.
 */
static void nat_keepalive_send(struct nat_keepalive *ka)
{
        const int nat_ka_hdrs_len = max(sizeof(struct iphdr),
                                        sizeof(struct ipv6hdr)) +
                                    sizeof(struct udphdr);
        const u8 nat_ka_payload = 0xFF;
        struct sk_buff *skb;
        struct udphdr *uh;
        int err;

        skb = alloc_skb(nat_ka_hdrs_len + sizeof(nat_ka_payload), GFP_ATOMIC);
        if (unlikely(!skb))
                return;

        /* Payload first, then the UDP header pushed in front of it. */
        skb_reserve(skb, nat_ka_hdrs_len);
        skb_put_u8(skb, nat_ka_payload);

        uh = skb_push(skb, sizeof(*uh));
        uh->source = ka->encap_sport;
        uh->dest = ka->encap_dport;
        uh->len = htons(skb->len);
        uh->check = 0;

        skb->mark = ka->smark;

        switch (ka->family) {
        case AF_INET:
                err = nat_keepalive_send_ipv4(skb, ka);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                err = nat_keepalive_send_ipv6(skb, ka, uh);
                break;
#endif
        default:
                err = -EAFNOSUPPORT;
                break;
        }

        if (err)
                kfree_skb(skb);
}

/*
 * State shared across one walk over the xfrm states in
 * nat_keepalive_work().
 */
struct nat_keepalive_work_ctx {
        /* Earliest time any visited state needs attention; 0 means none yet. */
        time64_t next_run;
        /* Time of the current run, from ktime_get_real_seconds(). */
        time64_t now;
};

/*
 * Per-state callback for the keepalive walk: decide whether @x is due for a
 * NAT keepalive, send one if so, and fold this state's next deadline into
 * the walk context.
 *
 * @x:     state being visited
 * @count: not used by this callback
 * @ptr:   struct nat_keepalive_work_ctx of the current run
 *
 * Always returns 0.
 */
static int nat_keepalive_work_single(struct xfrm_state *x, int count, void *ptr)
{
        struct nat_keepalive_work_ctx *ctx = ptr;
        bool send_keepalive = false;
        struct nat_keepalive ka;
        time64_t next_run;
        u32 interval;
        int delta;

        /* States with a zero interval are skipped (read without x->lock). */
        interval = x->nat_keepalive_interval;
        if (!interval)
                return 0;

        spin_lock(&x->lock);

        /*
         * Seconds since the state was last used, truncated to int.
         *
         * NOTE(review): delta is int and interval is u32, so the comparison
         * below is done in unsigned arithmetic; a negative delta (lastused
         * ahead of now) compares as a very large value and falls through to
         * the else branches. Confirm this is intended.
         */
        delta = (int)(ctx->now - x->lastused);
        if (delta < interval) {
                /* Used recently: next deadline is one interval after last use. */
                x->nat_keepalive_expiration = ctx->now + interval - delta;
                next_run = x->nat_keepalive_expiration;
        } else if (x->nat_keepalive_expiration > ctx->now) {
                /* Idle, but the stored deadline has not been reached yet. */
                next_run = x->nat_keepalive_expiration;
        } else {
                /*
                 * Due now. ka is filled from x while the lock is held; the
                 * send itself happens after the unlock below. The stored
                 * expiration is not updated in this branch.
                 */
                next_run = ctx->now + interval;
                nat_keepalive_init(&ka, x);
                send_keepalive = true;
        }

        spin_unlock(&x->lock);

        if (send_keepalive)
                nat_keepalive_send(&ka);

        /* Keep the earliest deadline seen across all visited states. */
        if (!ctx->next_run || next_run < ctx->next_run)
                ctx->next_run = next_run;
        return 0;
}

static void nat_keepalive_work(struct work_struct *work)
{
        struct nat_keepalive_work_ctx ctx;
        struct xfrm_state_walk walk;
        struct net *net;

        ctx.next_run = 0;
        ctx.now = ktime_get_real_seconds();

        net = container_of(work, struct net, xfrm.nat_keepalive_work.work);
        xfrm_state_walk_init(&walk, IPPROTO_ESP, NULL);
        xfrm_state_walk(net, &walk, nat_keepalive_work_single, &ctx);
        xfrm_state_walk_done(&walk, net);
        if (ctx.next_run)
                schedule_delayed_work(&net->xfrm.nat_keepalive_work,
                                      (ctx.next_run - ctx.now) * HZ);
}

/*
 * Create one control socket per possible CPU and store it in @socks.
 *
 * @socks:  per-CPU socket slots to populate
 * @family: protocol family passed to inet_ctl_sock_create()
 *
 * Returns 0 on success or the negative error from inet_ctl_sock_create().
 * On failure every slot is released and reset to NULL, so no stale socket
 * pointer is left behind for a later teardown or retry to release again.
 */
static int nat_keepalive_sk_init(struct sock_bh_locked __percpu *socks,
                                 unsigned short family)
{
        struct sock *sk;
        int err, i;

        for_each_possible_cpu(i) {
                err = inet_ctl_sock_create(&sk, family, SOCK_RAW, IPPROTO_UDP,
                                           &init_net);
                if (err < 0)
                        goto err_destroy;

                per_cpu_ptr(socks, i)->sock = sk;
        }

        return 0;

err_destroy:
        /*
         * Walks all slots, not just the ones filled above.
         * NOTE(review): relies on untouched slots being NULL and on
         * inet_ctl_sock_destroy() accepting NULL - confirm.
         */
        for_each_possible_cpu(i) {
                struct sock_bh_locked *slot = per_cpu_ptr(socks, i);

                inet_ctl_sock_destroy(slot->sock);
                slot->sock = NULL;
        }
        return err;
}

/*
 * Release the control socket of every possible CPU in @socks.
 *
 * Each slot is reset to NULL after its socket is destroyed so that the
 * per-CPU area never keeps a pointer to a released socket.
 */
static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks)
{
        int i;

        for_each_possible_cpu(i) {
                struct sock_bh_locked *slot = per_cpu_ptr(socks, i);

                inet_ctl_sock_destroy(slot->sock);
                slot->sock = NULL;
        }
}

void xfrm_nat_keepalive_state_updated(struct xfrm_state *x)
{
        struct net *net;

        if (!x->nat_keepalive_interval)
                return;

        net = xs_net(x);
        schedule_delayed_work(&net->xfrm.nat_keepalive_work, 0);
}

/*
 * Per-namespace setup: bind @net's delayed work item to
 * nat_keepalive_work(). Nothing is scheduled here. Always returns 0.
 */
int __net_init xfrm_nat_keepalive_net_init(struct net *net)
{
        INIT_DELAYED_WORK(&net->xfrm.nat_keepalive_work, nat_keepalive_work);
        return 0;
}

/*
 * Per-namespace teardown: synchronously cancel @net's keepalive work.
 * Always returns 0.
 */
int xfrm_nat_keepalive_net_fini(struct net *net)
{
        cancel_delayed_work_sync(&net->xfrm.nat_keepalive_work);
        return 0;
}

/*
 * Set up the per-CPU keepalive sockets for one address family.
 *
 * @family: AF_INET, or AF_INET6 when IPv6 support is enabled
 *
 * Returns 0 on success. Any other family yields -EAFNOSUPPORT; a socket
 * setup failure yields the error from nat_keepalive_sk_init(). Every
 * failure is logged.
 */
int xfrm_nat_keepalive_init(unsigned short family)
{
        int ret;

        if (family == AF_INET)
                ret = nat_keepalive_sk_init(&nat_keepalive_sk_ipv4, PF_INET);
#if IS_ENABLED(CONFIG_IPV6)
        else if (family == AF_INET6)
                ret = nat_keepalive_sk_init(&nat_keepalive_sk_ipv6, PF_INET6);
#endif
        else
                ret = -EAFNOSUPPORT;

        if (!ret)
                return 0;

        pr_err("xfrm nat keepalive init: failed to init err:%d\n", ret);
        return ret;
}
EXPORT_SYMBOL_GPL(xfrm_nat_keepalive_init);

/*
 * Release the per-CPU keepalive sockets of one address family.
 *
 * @family: AF_INET, or AF_INET6 when IPv6 support is enabled; any other
 *          value is silently ignored.
 */
void xfrm_nat_keepalive_fini(unsigned short family)
{
        if (family == AF_INET) {
                nat_keepalive_sk_fini(&nat_keepalive_sk_ipv4);
                return;
        }
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                nat_keepalive_sk_fini(&nat_keepalive_sk_ipv6);
#endif
}
EXPORT_SYMBOL_GPL(xfrm_nat_keepalive_fini);




























   15 









  264 

















  166 
  165 




  164 











  148 




  166 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_CONTEXT_H
#define __X86_KERNEL_FPU_CONTEXT_H

#include <asm/fpu/xstate.h>
#include <asm/trace/fpu.h>

/* Functions related to FPU context tracking */

/*
 * The in-register FPU state for an FPU context on a CPU is assumed to be
 * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
 * matches the FPU.
 *
 * If the FPU register state is valid, the kernel can skip restoring the
 * FPU state from memory.
 *
 * Any code that clobbers the FPU registers or updates the in-memory
 * FPU state for a task MUST let the rest of the kernel know that the
 * FPU registers are no longer valid for this task.
 *
 * Invalidate a resource you control: CPU if using the CPU for something else
 * (with preemption disabled), FPU for the current task, or a task that
 * is prevented from running by the current task.
 */
static inline void __cpu_invalidate_fpregs_state(void)
{
        /*
         * Clear this CPU's owner pointer, so fpregs_state_valid() no longer
         * matches any FPU context on this CPU.
         */
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}

static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
        /*
         * -1 is used as a last_cpu value that the CPU number comparison in
         * fpregs_state_valid() is not expected to match.
         */
        fpu->last_cpu = -1;
}

/*
 * Report whether the registers of @cpu still hold @fpu's state.
 *
 * Both conditions must hold: this CPU's owner pointer is @fpu, and @fpu
 * last ran on @cpu. Returns 1 when valid, 0 otherwise.
 */
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
        if (this_cpu_read(fpu_fpregs_owner_ctx) != fpu)
                return 0;

        return fpu->last_cpu == cpu;
}

/*
 * Clear this CPU's FPU register owner pointer and emit the
 * x86_fpu_regs_deactivated tracepoint for @fpu.
 */
static inline void fpregs_deactivate(struct fpu *fpu)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
        trace_x86_fpu_regs_deactivated(fpu);
}

/*
 * Record @fpu as the owner of this CPU's FPU registers and emit the
 * x86_fpu_regs_activated tracepoint. fpu->last_cpu is not touched here;
 * callers that need fpregs_state_valid() to succeed set it themselves.
 */
static inline void fpregs_activate(struct fpu *fpu)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, fpu);
        trace_x86_fpu_regs_activated(fpu);
}

/*
 * Internal helper for switch_fpu_return() and signal frame setup.
 *
 * Loads current's FPU state into the registers of this CPU unless they
 * already hold it, then clears TIF_NEED_FPU_LOAD. Kernel threads and user
 * workers trigger a one-time warning and are left untouched.
 */
static inline void fpregs_restore_userregs(void)
{
        struct fpu *task_fpu = x86_task_fpu(current);
        int this_cpu = smp_processor_id();

        if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
                return;

        /* Registers already carry this context: skip the reload. */
        if (fpregs_state_valid(task_fpu, this_cpu))
                goto clear_flag;

        /*
         * Restore _all_ xstate that has not been established yet.
         *
         * PKRU is excluded: when enabled, its value is already correct
         * because it was set either in switch_to() or in flush_thread(),
         * and the copy in current->thread.fpu->xsave might not be up to
         * date.
         *
         * XFD state is handled in restore_fpregs_from_fpstate().
         */
        restore_fpregs_from_fpstate(task_fpu->fpstate, XFEATURE_MASK_FPSTATE);

        /* Record ownership so the next validity check on this CPU passes. */
        fpregs_activate(task_fpu);
        task_fpu->last_cpu = this_cpu;

clear_flag:
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

#endif







































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/math.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>

struct path;
struct file;
struct vfsmount;

/*
 * linux/include/linux/dcache.h
 *
 * Dirent cache data structures
 *
 * (C) Copyright 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * True when @x is its own parent (d_parent points back at @x).
 * Note: the argument is evaluated twice.
 */
#define IS_ROOT(x) ((x) == (x)->d_parent)

/*
 * The hash is always the low bits of hash_len: HASH_LEN_DECLARE orders the
 * two u32 members so that this holds for either byte order.
 *
 * bytemask_from_count(cnt) builds an unsigned long with the first @cnt bytes
 * (in memory order) set. @cnt must be smaller than sizeof(unsigned long),
 * since the shift count would otherwise reach the full type width.
 */
#ifdef __LITTLE_ENDIAN
 #define HASH_LEN_DECLARE u32 hash; u32 len
 #define bytemask_from_count(cnt)        (~(~0ul << (cnt)*8))
#else
 #define HASH_LEN_DECLARE u32 len; u32 hash
 #define bytemask_from_count(cnt)        (~(~0ul >> (cnt)*8))
#endif

/*
 * "quick string" -- eases parameter passing, but more importantly
 * saves "metadata" about the string (ie length and the hash).
 *
 * hash comes first so it snuggles against d_parent in the
 * dentry.
 *
 * hash and len can also be accessed together as the single 64-bit
 * hash_len value through the anonymous union.
 */
struct qstr {
        union {
                struct {
                        /* u32 hash and u32 len; order depends on endianness */
                        HASH_LEN_DECLARE;
                };
                /* hash and len viewed as one 64-bit value */
                u64 hash_len;
        };
        /* the name bytes themselves */
        const unsigned char *name;
};

/*
 * QSTR_INIT(n, l): brace initializer setting .name and .len; the hash is not
 *                  named, so it is zero-initialized.
 * QSTR_LEN(n, l):  the same as a struct qstr compound literal.
 * QSTR(n):         QSTR_LEN() with the length taken from strlen(n); @n is
 *                  evaluated twice.
 */
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
#define QSTR_LEN(n,l) (struct qstr)QSTR_INIT(n,l)
#define QSTR(n) QSTR_LEN(n, strlen(n))

/*
 * Shared constant names, defined outside this header.
 * NOTE(review): judging by the identifiers these are presumably the empty
 * string, "/" and ".." - confirm against the definitions.
 */
extern const struct qstr empty_name;
extern const struct qstr slash_name;
extern const struct qstr dotdot_name;

/*
 * Try to keep struct dentry aligned on 64 byte cachelines (this will
 * give reasonable cacheline footprint with larger lines without the
 * large memory footprint increase).
 *
 * DNAME_INLINE_WORDS is the inline name capacity in unsigned longs;
 * DNAME_INLINE_LEN is the same capacity in bytes. The byte counts noted
 * beside each value are not that capacity; presumably they are the
 * resulting size of struct dentry - TODO confirm.
 */
#ifdef CONFIG_64BIT
# define DNAME_INLINE_WORDS 5 /* 192 bytes */
#else
# ifdef CONFIG_SMP
#  define DNAME_INLINE_WORDS 9 /* 128 bytes */
# else
#  define DNAME_INLINE_WORDS 11 /* 128 bytes */
# endif
#endif

#define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long))

/*
 * Inline name storage: the same DNAME_INLINE_LEN bytes viewed either as a
 * byte string or as an array of unsigned long words.
 */
union shortname_store {
        unsigned char string[DNAME_INLINE_LEN];
        unsigned long words[DNAME_INLINE_WORDS];
};

/*
 * Member aliases for struct dentry: d_lock is the lock inside d_lockref,
 * d_iname is the byte-string view of the inline name storage.
 */
#define d_lock        d_lockref.lock
#define d_iname d_shortname.string

/*
 * One directory-cache entry.
 *
 * Members are grouped by access pattern, as the inline comments note: the
 * fields touched by RCU lookup come first, followed by those also touched by
 * ref lookup, then the lock/refcount and the list linkage.
 *
 * d_lock and d_iname are macro aliases for d_lockref.lock and
 * d_shortname.string respectively.
 */
struct dentry {
        /* RCU lookup touched fields */
        unsigned int d_flags;                /* protected by d_lock */
        seqcount_spinlock_t d_seq;        /* per dentry seqlock */
        struct hlist_bl_node d_hash;        /* lookup hash list */
        struct dentry *d_parent;        /* parent directory */
        /* same name storage; only the __d_name view is writable */
        union {
        struct qstr __d_name;                /* for use ONLY in fs/dcache.c */
        const struct qstr d_name;
        };
        struct inode *d_inode;                /* Where the name belongs to - NULL is
                                         * negative */
        union shortname_store d_shortname;
        /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */

        /* Ref lookup also touches following */
        const struct dentry_operations *d_op;
        struct super_block *d_sb;        /* The root of the dentry tree */
        unsigned long d_time;                /* used by d_revalidate */
        void *d_fsdata;                        /* fs-specific data */
        /* --- cacheline 2 boundary (128 bytes) --- */
        struct lockref d_lockref;        /* per-dentry lock and refcount
                                         * keep separate from RCU lookup area if
                                         * possible!
                                         */

        union {
                struct list_head d_lru;                /* LRU list */
                wait_queue_head_t *d_wait;        /* in-lookup ones only */
        };
        struct hlist_node d_sib;        /* child of parent list */
        struct hlist_head d_children;        /* our children */
        /*
         * d_alias and d_rcu can share memory
         */
        union {
                struct hlist_node d_alias;        /* inode alias list */
                struct hlist_bl_node d_in_lookup_hash;        /* only for in-lookup ones */
                 struct rcu_head d_rcu;
        } d_u;
};

/*
 * dentry->d_lock spinlock nesting subclasses:
 *
 * 0: normal
 * 1: nested
 */
enum dentry_d_lock_class
{
        DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
        DENTRY_D_LOCK_NESTED  /* subclass 1, per the table above */
};

/* Selects which real dentry d_real() is asked for: data or metadata. */
enum d_real_type {
        D_REAL_DATA,
        D_REAL_METADATA,
};

/*
 * Method table referenced by dentry->d_op. The structure is cacheline
 * aligned. Within this header only ->d_real is invoked directly, by
 * d_real() when DCACHE_OP_REAL is set in d_flags.
 */
struct dentry_operations {
        int (*d_revalidate)(struct inode *, const struct qstr *,
                            struct dentry *, unsigned int);
        int (*d_weak_revalidate)(struct dentry *, unsigned int);
        int (*d_hash)(const struct dentry *, struct qstr *);
        int (*d_compare)(const struct dentry *,
                        unsigned int, const char *, const struct qstr *);
        int (*d_delete)(const struct dentry *);
        int (*d_init)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_prune)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *, char *, int);
        struct vfsmount *(*d_automount)(struct path *);
        int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, enum d_real_type type);
        bool (*d_unalias_trylock)(const struct dentry *);
        void (*d_unalias_unlock)(const struct dentry *);
} ____cacheline_aligned;

/*
 * Locking rules for dentry_operations callbacks are to be found in
 * Documentation/filesystems/locking.rst. Keep it updated!
 *
 * Further descriptions are found in Documentation/filesystems/vfs.rst.
 * Keep it updated too!
 */

/*
 * d_flags entries
 *
 * Every value is a single bit except the *_TYPE group: bits 19..21 hold one
 * entry-type value, extracted with the DCACHE_ENTRY_TYPE mask and compared
 * against the individual DCACHE_*_TYPE constants.
 */
enum dentry_flags {
        DCACHE_OP_HASH                        = BIT(0),
        DCACHE_OP_COMPARE                = BIT(1),
        DCACHE_OP_REVALIDATE                = BIT(2),
        DCACHE_OP_DELETE                = BIT(3),
        DCACHE_OP_PRUNE                        = BIT(4),
        /*
         * This dentry is possibly not currently connected to the dcache tree,
         * in which case its parent will either be itself, or will have this
         * flag as well.  nfsd will not use a dentry with this bit set, but will
         * first endeavour to clear the bit either by discovering that it is
         * connected, or by performing lookup operations.  Any filesystem which
         * supports nfsd_operations MUST have a lookup function which, if it
         * finds a directory inode with a DCACHE_DISCONNECTED dentry, will
         * d_move that dentry into place and return that dentry rather than the
         * passed one, typically using d_splice_alias.
         */
        DCACHE_DISCONNECTED                = BIT(5),
        DCACHE_REFERENCED                = BIT(6),        /* Recently used, don't discard. */
        DCACHE_DONTCACHE                = BIT(7),        /* Purge from memory on final dput() */
        DCACHE_CANT_MOUNT                = BIT(8),
        DCACHE_GENOCIDE                        = BIT(9),
        DCACHE_SHRINK_LIST                = BIT(10),
        DCACHE_OP_WEAK_REVALIDATE        = BIT(11),
        /*
         * this dentry has been "silly renamed" and has to be deleted on the
         * last dput()
         */
        DCACHE_NFSFS_RENAMED                = BIT(12),
        DCACHE_FSNOTIFY_PARENT_WATCHED        = BIT(13),        /* Parent inode is watched by some fsnotify listener */
        DCACHE_DENTRY_KILLED                = BIT(14),
        DCACHE_MOUNTED                        = BIT(15),        /* is a mountpoint */
        DCACHE_NEED_AUTOMOUNT                = BIT(16),        /* handle automount on this dir */
        DCACHE_MANAGE_TRANSIT                = BIT(17),        /* manage transit from this dirent */
        DCACHE_LRU_LIST                        = BIT(18),
        DCACHE_ENTRY_TYPE                = (7 << 19),        /* bits 19..21 are for storing type: */
        DCACHE_MISS_TYPE                = (0 << 19),        /* Negative dentry */
        DCACHE_WHITEOUT_TYPE                = (1 << 19),        /* Whiteout dentry (stop pathwalk) */
        DCACHE_DIRECTORY_TYPE                = (2 << 19),        /* Normal directory */
        DCACHE_AUTODIR_TYPE                = (3 << 19),        /* Lookupless directory (presumed automount) */
        DCACHE_REGULAR_TYPE                = (4 << 19),        /* Regular file type */
        DCACHE_SPECIAL_TYPE                = (5 << 19),        /* Other file type */
        DCACHE_SYMLINK_TYPE                = (6 << 19),        /* Symlink */
        DCACHE_NOKEY_NAME                = BIT(22),        /* Encrypted name encoded without key */
        DCACHE_OP_REAL                        = BIT(23),
        DCACHE_PAR_LOOKUP                = BIT(24),        /* being looked up (with parent locked shared) */
        DCACHE_DENTRY_CURSOR                = BIT(25),
        DCACHE_NORCU                        = BIT(26),        /* No RCU delay for freeing */
};

/*
 * Mask of the three flags tested by d_managed(): mountpoint, needs
 * automount, and manage-transit.
 */
#define DCACHE_MANAGED_DENTRY \
        (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)

/* Global seqlock; declared here, defined outside this header. */
extern seqlock_t rename_lock;

/*
 * These are the low-level FS interfaces to the dcache..
 *
 * Prototypes only; the implementations are not part of this header.
 */
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);

/*
 * allocate/de-allocate
 *
 * Prototypes only; the implementations are not part of this header.
 */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
                                        wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
/* weird procfs mess; *NOT* exported */
extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *,
                                          const struct dentry_operations *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
                        const struct qstr *name);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void d_invalidate(struct dentry *);

/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);

/* tmpfile helpers taking the open file and its inode (prototypes only) */
extern void d_mark_tmpfile(struct file *, struct inode *);
extern void d_tmpfile(struct file *, struct inode *);

/* inode alias helpers (prototypes only) */
extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);

extern struct dentry *d_find_alias_rcu(struct inode *);

/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);

/*
 * This adds the entry to the hash queues.
 */
extern void d_rehash(struct dentry *);

/* NOTE(review): unnamed parameters; see the definition for their roles. */
extern void d_add(struct dentry *, struct inode *);

/* used for rename() and baskets */
extern void d_move(struct dentry *, struct dentry *);
extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);

/* lookup by name; takes a dentry and a qstr (prototype only) */
extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);

/*
 * Return the current value of @dentry's reference count. The count is read
 * directly from d_lockref; no lock is taken here.
 */
static inline unsigned d_count(const struct dentry *dentry)
{
        return dentry->d_lockref.count;
}

/*
 * Prototype only.
 * NOTE(review): presumably returns the inode number of @dentry's parent -
 * confirm against the definition.
 */
ino_t d_parent_ino(struct dentry *dentry);

/*
 * helper function for dentry_operations.d_dname() members
 *
 * printf-style: the third argument is the format string and the variadic
 * arguments start at position four.
 */
extern __printf(3, 4)
char *dynamic_dname(char *, int, const char *, ...);

/*
 * Path helpers (prototypes only). Each takes a char buffer and an int and
 * returns a char pointer.
 */
extern char *__d_path(const struct path *, const struct path *, char *, int);
extern char *d_absolute_path(const struct path *, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path_raw(const struct dentry *, char *, int);
extern char *dentry_path(const struct dentry *, char *, int);

/* Allocation counts.. */

/**
 * dget_dlock -        get a reference to a dentry
 * @dentry: dentry to get a reference to
 *
 * Given a live dentry, increment the reference count and return the dentry.
 * Caller must hold @dentry->d_lock.  Making sure that dentry is alive is
 * caller's responsibility.  There are many conditions sufficient to guarantee
 * that; e.g. anything with non-negative refcount is alive, so's anything
 * hashed, anything positive, anyone's parent, etc.
 */
static inline struct dentry *dget_dlock(struct dentry *dentry)
{
        /* plain increment: serialised by the d_lock the caller holds */
        dentry->d_lockref.count++;
        return dentry;
}


/**
 * dget - get a reference to a dentry
 * @dentry: dentry to get a reference to, or %NULL
 *
 * Clones an existing reference: for a non-%NULL @dentry the reference count
 * is incremented; %NULL is passed through untouched.  Returns @dentry.
 *
 * A dentry is not destroyed while it has references, but one without
 * references can disappear for any number of reasons, starting with memory
 * pressure.  Using this primitive on something with a zero refcount is
 * therefore a bug.
 *
 * NOTE: it will spin if @dentry->d_lock is held.  For deadlock avoidance it
 * is equivalent to spin_lock()/increment refcount/spin_unlock(), so calling
 * it under @dentry->d_lock is always a bug; so is calling it under ->d_lock
 * on any of its descendants.
 */
static inline struct dentry *dget(struct dentry *dentry)
{
        if (!dentry)
                return dentry;

        lockref_get(&dentry->d_lockref);
        return dentry;
}

/*
 * Prototype only.
 * NOTE(review): presumably returns a referenced parent of @dentry - confirm
 * against the definition.
 */
extern struct dentry *dget_parent(struct dentry *dentry);

/**
 * d_unhashed - is dentry unhashed
 * @dentry: entry to check
 *
 * Returns true if the dentry passed is not currently hashed, i.e. its
 * d_hash node is not on a hash list.
 */
static inline int d_unhashed(const struct dentry *dentry)
{
        return hlist_bl_unhashed(&dentry->d_hash);
}

/*
 * Non-zero when @dentry is unhashed and is not a root (its own parent);
 * zero otherwise.
 */
static inline int d_unlinked(const struct dentry *dentry)
{
        if (!d_unhashed(dentry))
                return 0;

        return !IS_ROOT(dentry);
}

/*
 * Test DCACHE_CANT_MOUNT in d_flags. Returns the masked flag value (non-zero
 * when set), not a normalised 0/1. d_lock is not taken.
 */
static inline int cant_mount(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_CANT_MOUNT);
}

/* Set DCACHE_CANT_MOUNT on @dentry, updating d_flags under d_lock. */
static inline void dont_mount(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        dentry->d_flags |= DCACHE_CANT_MOUNT;
        spin_unlock(&dentry->d_lock);
}

/* Slow path of d_lookup_done(); prototype only. */
extern void __d_lookup_unhash_wake(struct dentry *dentry);

/*
 * Test DCACHE_PAR_LOOKUP ("being looked up") in d_flags. Returns the masked
 * flag value, non-zero when set.
 */
static inline int d_in_lookup(const struct dentry *dentry)
{
        return dentry->d_flags & DCACHE_PAR_LOOKUP;
}

/*
 * Finish an in-progress lookup on @dentry. Dentries that are not marked
 * in-lookup (the expected case) are left alone; otherwise the work is
 * delegated to __d_lookup_unhash_wake().
 */
static inline void d_lookup_done(struct dentry *dentry)
{
        if (likely(!d_in_lookup(dentry)))
                return;

        __d_lookup_unhash_wake(dentry);
}

/*
 * Prototype only.
 * NOTE(review): presumably drops a dentry reference, the counterpart of
 * dget() - confirm against the definition.
 */
extern void dput(struct dentry *);

/* True when any of the DCACHE_MANAGED_DENTRY flags is set in d_flags. */
static inline bool d_managed(const struct dentry *dentry)
{
        return dentry->d_flags & DCACHE_MANAGED_DENTRY;
}

/* True when DCACHE_MOUNTED ("is a mountpoint") is set in d_flags. */
static inline bool d_mountpoint(const struct dentry *dentry)
{
        return dentry->d_flags & DCACHE_MOUNTED;
}

/*
 * Directory cache entry type accessor functions.
 *
 * __d_entry_type() returns the type bits of d_flags still in place (masked,
 * not shifted), so the result compares directly against the DCACHE_*_TYPE
 * constants.
 */
static inline unsigned __d_entry_type(const struct dentry *dentry)
{
        return dentry->d_flags & DCACHE_ENTRY_TYPE;
}

/* True when the entry type is DCACHE_MISS_TYPE (negative dentry). */
static inline bool d_is_miss(const struct dentry *dentry)
{
        return __d_entry_type(dentry) == DCACHE_MISS_TYPE;
}

/* True when the entry type is DCACHE_WHITEOUT_TYPE. */
static inline bool d_is_whiteout(const struct dentry *dentry)
{
        return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE;
}

/*
 * True when the entry type is DCACHE_DIRECTORY_TYPE (a normal directory, as
 * opposed to the lookupless DCACHE_AUTODIR_TYPE).
 */
static inline bool d_can_lookup(const struct dentry *dentry)
{
        return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE;
}

/*
 * True when the entry type is DCACHE_AUTODIR_TYPE (lookupless directory,
 * presumed automount).
 */
static inline bool d_is_autodir(const struct dentry *dentry)
{
        return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE;
}

/* True for either directory flavour: normal directory or autodir. */
static inline bool d_is_dir(const struct dentry *dentry)
{
        if (d_can_lookup(dentry))
                return true;

        return d_is_autodir(dentry);
}

/* True when the entry type is DCACHE_SYMLINK_TYPE. */
static inline bool d_is_symlink(const struct dentry *dentry)
{
        return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE;
}

/* True when the entry type is DCACHE_REGULAR_TYPE (regular file). */
static inline bool d_is_reg(const struct dentry *dentry)
{
        return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE;
}

/* True when the entry type is DCACHE_SPECIAL_TYPE ("other file type"). */
static inline bool d_is_special(const struct dentry *dentry)
{
        return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE;
}

/* True for a regular file or a special ("other") file type. */
static inline bool d_is_file(const struct dentry *dentry)
{
        if (d_is_reg(dentry))
                return true;

        return d_is_special(dentry);
}

/*
 * True when the entry type says "miss". Whiteout entries are currently not
 * treated as negative here (see the TODO below).
 */
static inline bool d_is_negative(const struct dentry *dentry)
{
        // TODO: check d_is_whiteout(dentry) also.
        return d_is_miss(dentry);
}

/*
 * Same test as d_is_miss(), applied to a d_flags value the caller already
 * holds rather than to a dentry.
 */
static inline bool d_flags_negative(unsigned flags)
{
        return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE;
}

/* Logical inverse of d_is_negative(). */
static inline bool d_is_positive(const struct dentry *dentry)
{
        return !d_is_negative(dentry);
}

/**
 * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if the dentry represents either an absent name or a name that
 * doesn't map to an inode (ie. ->d_inode is NULL).  The dentry could represent
 * a true miss, a whiteout that isn't represented by a 0,0 chardev or a
 * fallthrough marker in an opaque directory.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.  (3) The dentry may have something attached to ->d_lower and the
 * type field of the flags may be set to something other than miss or whiteout.
 *
 * NOTE(review): struct dentry as declared in this header has no ->d_lower
 * member, so point (3) looks stale - confirm.
 *
 * Only ->d_inode is examined; the type bits in d_flags are not consulted.
 */
static inline bool d_really_is_negative(const struct dentry *dentry)
{
        return dentry->d_inode == NULL;
}

/**
 * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if the dentry represents a name that maps to an inode
 * (ie. ->d_inode is not NULL).  The dentry might still represent a whiteout if
 * that is represented on medium as a 0,0 chardev.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.
 *
 * Only ->d_inode is examined; the type bits in d_flags are not consulted.
 */
static inline bool d_really_is_positive(const struct dentry *dentry)
{
        return dentry->d_inode != NULL;
}

/*
 * Non-zero when @dentry has an inode attached and is currently hashed;
 * zero otherwise.
 */
static inline int simple_positive(const struct dentry *dentry)
{
        if (!d_really_is_positive(dentry))
                return 0;

        return !d_unhashed(dentry);
}

/*
 * Prototype only.
 * NOTE(review): presumably scales @val by a VFS cache pressure setting -
 * confirm against the definition.
 */
unsigned long vfs_pressure_ratio(unsigned long val);

/**
 * d_inode - Get the actual inode of this dentry
 * @dentry: The dentry to query
 *
 * This is the helper normal filesystems should use to get at their own inodes
 * in their own dentries and ignore the layering superimposed upon them.
 *
 * Returns ->d_inode as a plain load, which may be NULL.  d_inode_rcu() is
 * the variant that loads it with READ_ONCE().
 */
static inline struct inode *d_inode(const struct dentry *dentry)
{
        return dentry->d_inode;
}

/**
 * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE()
 * @dentry: The dentry to query
 *
 * This is the helper normal filesystems should use to get at their own inodes
 * in their own dentries and ignore the layering superimposed upon them.
 *
 * Identical to d_inode() except that ->d_inode is loaded with READ_ONCE().
 */
static inline struct inode *d_inode_rcu(const struct dentry *dentry)
{
        return READ_ONCE(dentry->d_inode);
}

/**
 * d_backing_inode - Get upper or lower inode we should be using
 * @upper: The upper layer
 *
 * Helper for obtaining the inode that would be used if this dentry were
 * opened as a file.  That inode may sit on the upper dentry or on a lower
 * dentry pinned by the upper.  As implemented here it is @upper's own
 * ->d_inode.
 *
 * Normal filesystems should not use this to access their own inodes.
 */
static inline struct inode *d_backing_inode(const struct dentry *upper)
{
        return upper->d_inode;
}

/**
 * d_real - Return the real dentry
 * @dentry: the dentry to query
 * @type: the type of real dentry (data or metadata)
 *
 * For a dentry on a union/overlay (flagged with DCACHE_OP_REAL) the
 * filesystem's ->d_real() method supplies the underlying, real dentry.
 * Any other dentry is returned as is.
 *
 * See also: Documentation/filesystems/vfs.rst
 */
static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type)
{
        /* Common case: no ->d_real() method in play. */
        if (likely(!(dentry->d_flags & DCACHE_OP_REAL)))
                return dentry;

        return dentry->d_op->d_real(dentry, type);
}

/**
 * d_real_inode - Return the real inode hosting the data
 * @dentry: The dentry to query
 *
 * Resolves @dentry through d_real() with D_REAL_DATA and returns the inode
 * of the result: the underlying, real inode for a union/overlay dentry,
 * d_inode() of @dentry otherwise.
 */
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
        /* d_real() takes a non-const dentry, hence the cast. */
        struct dentry *real = d_real((struct dentry *) dentry, D_REAL_DATA);

        return d_inode(real);
}

/*
 * A captured dentry name: the qstr plus inline storage of the same size as a
 * dentry's own inline name.
 * NOTE(review): presumably name.name points at inline_name for short names -
 * confirm against take_dentry_name_snapshot().
 */
struct name_snapshot {
        struct qstr name;
        union shortname_store inline_name;
};
/* Take and release a snapshot (prototypes only). */
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);

/*
 * Return the first entry on @dentry's d_children list (children are linked
 * through d_sib), or NULL when the list is empty. No locking is done here.
 */
static inline struct dentry *d_first_child(const struct dentry *dentry)
{
        return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib);
}

/*
 * Return the dentry following @dentry on its d_sib list, or NULL when
 * @dentry is the last one. No locking is done here.
 */
static inline struct dentry *d_next_sibling(const struct dentry *dentry)
{
        return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);
}

/*
 * Prototype only.
 * NOTE(review): presumably installs the default dentry_operations for
 * dentries of the given super_block - confirm against the definition.
 */
void set_default_d_op(struct super_block *, const struct dentry_operations *);

#endif        /* __LINUX_DCACHE_H */





































































































































































































































































































































































































    4 































































































































































































    4 





    4 

    4 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_KPROBES_H
#define _LINUX_KPROBES_H
/*
 *  Kernel Probes (KProbes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct        Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *                Probes initial implementation ( includes suggestions from
 *                Rusty Russell).
 * 2004-July        Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *                interface to access function arguments.
 * 2005-May        Hien Nguyen <hien@us.ibm.com> and Jim Keniston
 *                <jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
 *                <prasanna@in.ibm.com> added function-return probes.
 */
#include <linux/compiler.h>
#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
#include <linux/objpool.h>
#include <linux/rethook.h>
#include <asm/kprobes.h>

#ifdef CONFIG_KPROBES

/*
 * kprobe_status settings.
 * Each value is a distinct single bit. Only defined when CONFIG_KPROBES is set.
 */
#define KPROBE_HIT_ACTIVE        0x00000001
#define KPROBE_HIT_SS                0x00000002
#define KPROBE_REENTER                0x00000004
#define KPROBE_HIT_SSDONE        0x00000008

#else /* !CONFIG_KPROBES */
#include <asm-generic/kprobes.h>
/*
 * Placeholder definitions for !CONFIG_KPROBES so that struct kprobe below,
 * which uses kprobe_opcode_t and struct arch_specific_insn, still compiles.
 */
typedef int kprobe_opcode_t;
struct arch_specific_insn {
        int dummy;
};
#endif /* CONFIG_KPROBES */

/* Forward declarations used by the handler typedefs below. */
struct kprobe;
struct pt_regs;
struct kretprobe;
struct kretprobe_instance;
/* Handler type for kprobe.pre_handler. */
typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *);
/* Handler type for kprobe.post_handler. */
typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *,
                                       unsigned long flags);
/* Handler type for kretprobe.handler and kretprobe.entry_handler. */
typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
                                    struct pt_regs *);

/*
 * Descriptor of a single probe point: where it is (addr, or symbol_name plus
 * offset), which handlers to call, the saved original opcode/instruction and
 * the KPROBE_FLAG_* status bits.
 */
struct kprobe {
        /* hash-list linkage; NOTE(review): presumably the addr lookup table - confirm */
        struct hlist_node hlist;

        /* list of kprobes for multi-handler support */
        struct list_head list;

        /* count the number of times this probe was temporarily disarmed */
        unsigned long nmissed;

        /* location of the probe point */
        kprobe_opcode_t *addr;

        /* Allow user to indicate symbol name of the probe point */
        const char *symbol_name;

        /* Offset into the symbol */
        unsigned int offset;

        /* Called before addr is executed. */
        kprobe_pre_handler_t pre_handler;

        /* Called after addr is executed, unless... */
        kprobe_post_handler_t post_handler;

        /* Saved opcode (which has been replaced with breakpoint) */
        kprobe_opcode_t opcode;

        /* copy of the original instruction */
        struct arch_specific_insn ainsn;

        /*
         * Indicates various status flags (KPROBE_FLAG_* bits).
         * Protected by kprobe_mutex after this kprobe is registered.
         */
        u32 flags;
};

/*
 * Kprobe status flags.
 * Single-bit values stored in kprobe.flags and tested by the kprobe_gone(),
 * kprobe_disabled(), kprobe_optimized() and kprobe_ftrace() helpers.
 */
#define KPROBE_FLAG_GONE        1 /* breakpoint has already gone */
#define KPROBE_FLAG_DISABLED        2 /* probe is temporarily disabled */
#define KPROBE_FLAG_OPTIMIZED        4 /*
                                   * probe is really optimized.
                                   * NOTE:
                                   * this flag is only for optimized_kprobe.
                                   */
#define KPROBE_FLAG_FTRACE        8 /* probe is using ftrace */
#define KPROBE_FLAG_ON_FUNC_ENTRY        16 /* probe is on the function entry */

/* Is KPROBE_FLAG_GONE set on this kprobe? */
static inline bool kprobe_gone(struct kprobe *p)
{
        return (p->flags & KPROBE_FLAG_GONE) != 0;
}

/* Is this kprobe disabled? A gone probe counts as disabled as well. */
static inline bool kprobe_disabled(struct kprobe *p)
{
        return (p->flags & (KPROBE_FLAG_GONE | KPROBE_FLAG_DISABLED)) != 0;
}

/* Is this kprobe really running the optimized path (KPROBE_FLAG_OPTIMIZED)? */
static inline bool kprobe_optimized(struct kprobe *p)
{
        return (p->flags & KPROBE_FLAG_OPTIMIZED) != 0;
}

/* Does this kprobe use ftrace (KPROBE_FLAG_FTRACE)? */
static inline bool kprobe_ftrace(struct kprobe *p)
{
        return (p->flags & KPROBE_FLAG_FTRACE) != 0;
}

/*
 * Function-return probe -
 * Note:
 * User needs to provide a handler function, and initialize maxactive.
 * maxactive - The maximum number of instances of the probed function that
 * can be active concurrently.
 * nmissed - tracks the number of times the probed function's return was
 * ignored, due to maxactive being too low.
 *
 */
/*
 * Indirection between kretprobe instances and their kretprobe, used when
 * CONFIG_KRETPROBE_ON_RETHOOK is not set: instances keep a pointer to the
 * holder and read @rp under RCU (see get_kretprobe()).
 */
struct kretprobe_holder {
        struct kretprobe __rcu *rp;        /* RCU-protected back pointer to the kretprobe */
        struct objpool_head        pool;        /* object pool; presumably of kretprobe_instance - confirm */
};

/* Function-return probe descriptor; embeds the kprobe it is built on. */
struct kretprobe {
        struct kprobe kp;                /* underlying kprobe */
        kretprobe_handler_t handler;        /* user-provided handler */
        kretprobe_handler_t entry_handler;
        int maxactive;                        /* max concurrently active instances of the probed function */
        int nmissed;                        /* returns ignored because maxactive was too low */
        size_t data_size;                /* presumably the size of kretprobe_instance.data - confirm */
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
        struct rethook *rh;
#else
        struct kretprobe_holder *rph;
#endif
};

/* NOTE(review): presumably the upper bound accepted for kretprobe.data_size, in bytes - confirm. */
#define KRETPROBE_MAX_DATA_SIZE        4096

/*
 * Per-invocation state of a kretprobe. With CONFIG_KRETPROBE_ON_RETHOOK only
 * a rethook_node is embedded; otherwise the instance carries its own
 * bookkeeping, including the saved return address. @data is a flexible
 * trailing array.
 */
struct kretprobe_instance {
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
        struct rethook_node node;
#else
        struct rcu_head rcu;
        struct llist_node llist;
        struct kretprobe_holder *rph;        /* used by get_kretprobe() to reach the kretprobe */
        kprobe_opcode_t *ret_addr;        /* returned by get_kretprobe_retaddr() */
        void *fp;
#endif
        char data[];
};

/*
 * Entry type of the kretprobe_blacklist[] array: a name and an address.
 * NOTE(review): presumably @addr is resolved from @name at init - confirm.
 */
struct kretprobe_blackpoint {
        const char *name;
        void *addr;
};

/*
 * One blacklisted address range, linked on a list.
 * NOTE(review): whether @end_addr is inclusive is not visible here - confirm.
 */
struct kprobe_blacklist_entry {
        struct list_head list;
        unsigned long start_addr;
        unsigned long end_addr;
};

#ifdef CONFIG_KPROBES
/* Per-CPU state: the kprobe read by kprobe_running() and the arch control block. */
DECLARE_PER_CPU(struct kprobe *, current_kprobe);
DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);

/* Bracket a region in which a dummy kprobe is set (see kretprobe_trampoline_handler()). */
extern void kprobe_busy_begin(void);
extern void kprobe_busy_end(void);

#ifdef CONFIG_KRETPROBES
/*
 * Check whether @p is used for implementing a trampoline.
 * Architecture-provided; with !CONFIG_KRETPROBES an inline stub returns 0.
 */
extern int arch_trampoline_kprobe(struct kprobe *p);

#ifdef CONFIG_KRETPROBE_ON_RETHOOK
/* Return the kretprobe stored in the data field of the instance's rethook. */
static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
{
        /* rethook::data is a non-changed field, so it can be accessed freely. */
        void *owner = ri->node.rethook->data;

        return (struct kretprobe *)owner;
}
/* Return the return address recorded in the instance's rethook node. */
static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri)
{
        unsigned long ret_addr = ri->node.ret_addr;

        return ret_addr;
}
#else
/* Architecture hooks for the non-rethook kretprobe implementation; defined elsewhere. */
extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
                                   struct pt_regs *regs);
void arch_kretprobe_fixup_return(struct pt_regs *regs,
                                 kprobe_opcode_t *correct_ret_addr);

/* Trampoline symbol; its address is obtained via kretprobe_trampoline_addr(). */
void __kretprobe_trampoline(void);
/*
 * Some architectures use structured function pointers (descriptors), so
 * dereference_kernel_function_descriptor() is used to obtain the real
 * address of __kretprobe_trampoline.
 */
static nokprobe_inline void *kretprobe_trampoline_addr(void)
{
        void *trampoline = dereference_kernel_function_descriptor(__kretprobe_trampoline);

        return trampoline;
}

/*
 * If the trampoline handler is called from a kprobe, use this version.
 * kretprobe_trampoline_handler() wraps it in kprobe_busy_begin()/end().
 */
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
                                             void *frame_pointer);

/*
 * Run __kretprobe_trampoline_handler() between kprobe_busy_begin() and
 * kprobe_busy_end() and hand back its return value.
 */
static nokprobe_inline
unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
                                           void *frame_pointer)
{
        unsigned long result;

        /*
         * A dummy kprobe is set to avoid kretprobe recursion. Since a
         * kretprobe never runs in a kprobe handler, no kprobe must be
         * running at this point.
         */
        kprobe_busy_begin();
        result = __kretprobe_trampoline_handler(regs, frame_pointer);
        kprobe_busy_end();

        return result;
}

static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
{
        return rcu_dereference_check(ri->rph->rp, rcu_read_lock_any_held());
}

/* Return the instance's saved return address as an integer. */
static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri)
{
        unsigned long ret_addr = (unsigned long)ri->ret_addr;

        return ret_addr;
}
#endif /* CONFIG_KRETPROBE_ON_RETHOOK */

#else /* !CONFIG_KRETPROBES */
/* !CONFIG_KRETPROBES: nothing to prepare, and no kprobe implements a trampoline. */
static inline void arch_prepare_kretprobe(struct kretprobe *rp,
                                        struct pt_regs *regs) { }
static inline int arch_trampoline_kprobe(struct kprobe *p) { return 0; }
#endif /* CONFIG_KRETPROBES */

/* Markers of '_kprobe_blacklist' section */
extern unsigned long __start_kprobe_blacklist[];
extern unsigned long __stop_kprobe_blacklist[];

/* Array of kretprobe_blackpoint entries, defined elsewhere. */
extern struct kretprobe_blackpoint kretprobe_blacklist[];

/* Functions declared here and defined elsewhere; the arch_* ones are architecture hooks. */
extern int arch_prepare_kprobe(struct kprobe *p);
extern void arch_arm_kprobe(struct kprobe *p);
extern void arch_disarm_kprobe(struct kprobe *p);
extern int arch_init_kprobes(void);
extern void kprobes_inc_nmissed_count(struct kprobe *p);
extern bool arch_within_kprobe_blacklist(unsigned long addr);
extern int arch_populate_kprobe_blacklist(void);
extern int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);

/* Blacklist query and population helpers. */
extern bool within_kprobe_blacklist(unsigned long addr);
extern int kprobe_add_ksym_blacklist(unsigned long entry);
extern int kprobe_add_area_blacklist(unsigned long start, unsigned long end);

/*
 * Cache of instruction slots carved out of pages obtained through @alloc and
 * returned through @free. Passed to __get_insn_slot(), __free_insn_slot() and
 * __is_insn_slot_addr().
 */
struct kprobe_insn_cache {
        struct mutex mutex;
        void *(*alloc)(void);        /* allocate insn page */
        void (*free)(void *);        /* free insn page */
        const char *sym;        /* symbol for insn pages */
        struct list_head pages; /* list of kprobe_insn_page */
        size_t insn_size;        /* size of instruction slot */
        int nr_garbage;                /* NOTE(review): presumably the count of dirty slots awaiting collection - confirm */
};

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/* Slot get/free primitives operating on a kprobe_insn_cache; defined elsewhere. */
extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c);
extern void __free_insn_slot(struct kprobe_insn_cache *c,
                             kprobe_opcode_t *slot, int dirty);
/* sleep-less address checking routine  */
extern bool __is_insn_slot_addr(struct kprobe_insn_cache *c,
                                unsigned long addr);

/*
 * DEFINE_INSN_CACHE_OPS(name) declares the cache object kprobe_<name>_slots
 * and generates three inline wrappers around it:
 *   get_<name>_slot()          -> __get_insn_slot()
 *   free_<name>_slot()         -> __free_insn_slot()
 *   is_kprobe_<name>_slot()    -> __is_insn_slot_addr()
 */
#define DEFINE_INSN_CACHE_OPS(__name)                                        \
extern struct kprobe_insn_cache kprobe_##__name##_slots;                \
                                                                        \
static inline kprobe_opcode_t *get_##__name##_slot(void)                \
{                                                                        \
        return __get_insn_slot(&kprobe_##__name##_slots);                \
}                                                                        \
                                                                        \
static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\
{                                                                        \
        __free_insn_slot(&kprobe_##__name##_slots, slot, dirty);        \
}                                                                        \
                                                                        \
static inline bool is_kprobe_##__name##_slot(unsigned long addr)        \
{                                                                        \
        return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);        \
}
/* Symbol-name strings; presumably the values used for kprobe_insn_cache.sym - confirm. */
#define KPROBE_INSN_PAGE_SYM                "kprobe_insn_page"
#define KPROBE_OPTINSN_PAGE_SYM                "kprobe_optinsn_page"
/* Defined elsewhere; only declared when __ARCH_WANT_KPROBES_INSN_SLOT is set. */
int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
                             unsigned long *value, char *type, char *sym);
#else /* !__ARCH_WANT_KPROBES_INSN_SLOT */
/*
 * Without __ARCH_WANT_KPROBES_INSN_SLOT only the is_kprobe_<name>_slot()
 * predicate is generated, and it reports false for every address.
 */
#define DEFINE_INSN_CACHE_OPS(__name)                                        \
static inline bool is_kprobe_##__name##_slot(unsigned long addr)        \
{                                                                        \
        return false;                                                        \
}
#endif

DEFINE_INSN_CACHE_OPS(insn);

#ifdef CONFIG_OPTPROBES
/*
 * Internal structure for direct jump optimized probe.
 * Embeds the kprobe itself plus the queue linkage and the arch-specific
 * optimized instruction data. Only defined with CONFIG_OPTPROBES.
 */
struct optimized_kprobe {
        struct kprobe kp;
        struct list_head list;        /* list for optimizing queue */
        struct arch_optimized_insn optinsn;
};

/* Architecture dependent functions for direct jump optimization */
extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
                                         struct kprobe *orig);
extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
extern void arch_optimize_kprobes(struct list_head *oplist);
extern void arch_unoptimize_kprobes(struct list_head *oplist,
                                    struct list_head *done_list);
extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
extern int arch_within_optimized_kprobe(struct optimized_kprobe *op,
                                        kprobe_opcode_t *addr);

/* Defined elsewhere; NOTE(review): presumably invoked from the optimized path - confirm. */
extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);

/* Instantiate the slot helpers for the "optinsn" cache (is_kprobe_optinsn_slot() etc.). */
DEFINE_INSN_CACHE_OPS(optinsn);

/* With !CONFIG_OPTPROBES, wait_for_kprobe_optimizer() is an empty inline stub. */
extern void wait_for_kprobe_optimizer(void);
bool optprobe_queued_unopt(struct optimized_kprobe *op);
bool kprobe_disarmed(struct kprobe *p);
#else /* !CONFIG_OPTPROBES */
/* !CONFIG_OPTPROBES: there is no optimizer, so there is nothing to wait for. */
static inline void wait_for_kprobe_optimizer(void)
{
}
#endif /* CONFIG_OPTPROBES */

#ifdef CONFIG_KPROBES_ON_FTRACE
/* ftrace-based kprobe support, declared only with CONFIG_KPROBES_ON_FTRACE. */
extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
                                  struct ftrace_ops *ops, struct ftrace_regs *fregs);
extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
/* Set when ftrace has been killed: kprobes on ftrace must be disabled for safety */
extern bool kprobe_ftrace_disabled __read_mostly;
/* NOTE(review): presumably the function that sets kprobe_ftrace_disabled - confirm. */
extern void kprobe_ftrace_kill(void);
#else
/* !CONFIG_KPROBES_ON_FTRACE: preparing an ftrace-based kprobe always fails with -EINVAL. */
static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) { return -EINVAL; }
/* !CONFIG_KPROBES_ON_FTRACE: nothing to kill. */
static inline void kprobe_ftrace_kill(void)
{
}
#endif /* CONFIG_KPROBES_ON_FTRACE */

/*
 * Get the kprobe at this addr (if any) - called with preemption disabled.
 * With !CONFIG_KPROBES an inline stub returning NULL is used instead.
 */
struct kprobe *get_kprobe(void *addr);

/* kprobe_running() simply reads this CPU's current_kprobe and returns it. */
static inline struct kprobe *kprobe_running(void)
{
        struct kprobe *cur = __this_cpu_read(current_kprobe);

        return cur;
}

/* Clear this CPU's current_kprobe, so kprobe_running() then reads NULL. */
static inline void reset_current_kprobe(void)
{
        __this_cpu_write(current_kprobe,
                         NULL);
}

/* Return a pointer to this CPU's kprobe_ctlblk. */
static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
{
        struct kprobe_ctlblk *kcb = this_cpu_ptr(&kprobe_ctlblk);

        return kcb;
}

/* Address lookup and adjustment helpers; defined elsewhere. */
kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset);
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry);

/*
 * kprobe registration API, single and array forms. With !CONFIG_KPROBES these
 * are inline stubs: register_* return -EOPNOTSUPP, unregister_* do nothing.
 */
int register_kprobe(struct kprobe *p);
void unregister_kprobe(struct kprobe *p);
int register_kprobes(struct kprobe **kps, int num);
void unregister_kprobes(struct kprobe **kps, int num);

/* kretprobe registration API, single and array forms. */
int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
int register_kretprobes(struct kretprobe **rps, int num);
void unregister_kretprobes(struct kretprobe **rps, int num);

/*
 * kprobe_flush_task() is a no-op macro when kretprobes are built on rethook
 * or are disabled altogether; otherwise it is a real function defined
 * elsewhere.
 */
#if defined(CONFIG_KRETPROBE_ON_RETHOOK) || !defined(CONFIG_KRETPROBES)
#define kprobe_flush_task(tk)        do {} while (0)
#else
void kprobe_flush_task(struct task_struct *tk);
#endif

/* Defined elsewhere; an empty stub with !CONFIG_KPROBES. */
void kprobe_free_init_mem(void);

/* Disable / enable a kprobe; the !CONFIG_KPROBES stubs return -EOPNOTSUPP. */
int disable_kprobe(struct kprobe *kp);
int enable_kprobe(struct kprobe *kp);

void dump_kprobe(struct kprobe *kp);

/* Page allocators; alloc_insn_page matches the kprobe_insn_cache.alloc signature. */
void *alloc_insn_page(void);

void *alloc_optinsn_page(void);
void free_optinsn_page(void *page);

/* kallsyms lookups; the !CONFIG_KPROBES stub of kprobe_get_kallsym() returns -ERANGE. */
int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                       char *sym);

int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
                            char *type, char *sym);

/* Notifier callback (notifier_block signature); defined elsewhere. */
int kprobe_exceptions_notify(struct notifier_block *self,
                             unsigned long val, void *data);

#else /* !CONFIG_KPROBES: */

/* !CONFIG_KPROBES: no fault is ever handled and no kprobe is ever found or running. */
static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { return 0; }
static inline struct kprobe *get_kprobe(void *addr) { return NULL; }
static inline struct kprobe *kprobe_running(void) { return NULL; }

/* The busy markers compile away to empty statements. */
#define kprobe_busy_begin()        do {} while (0)
#define kprobe_busy_end()        do {} while (0)

/* !CONFIG_KPROBES: every registration attempt fails with -EOPNOTSUPP ... */
static inline int register_kprobe(struct kprobe *p) { return -EOPNOTSUPP; }
static inline int register_kprobes(struct kprobe **kps, int num) { return -EOPNOTSUPP; }
/* ... and unregistering is a no-op. */
static inline void unregister_kprobe(struct kprobe *p) { }
static inline void unregister_kprobes(struct kprobe **kps, int num) { }

/* Same contract for the kretprobe variants. */
static inline int register_kretprobe(struct kretprobe *rp) { return -EOPNOTSUPP; }
static inline int register_kretprobes(struct kretprobe **rps, int num) { return -EOPNOTSUPP; }
static inline void unregister_kretprobe(struct kretprobe *rp) { }
static inline void unregister_kretprobes(struct kretprobe **rps, int num) { }
/* !CONFIG_KPROBES: housekeeping hooks do nothing. */
static inline void kprobe_flush_task(struct task_struct *tk) { }
static inline void kprobe_free_init_mem(void) { }
static inline void kprobe_ftrace_kill(void) { }

/* !CONFIG_KPROBES: toggling a probe is not supported. */
static inline int disable_kprobe(struct kprobe *kp) { return -EOPNOTSUPP; }
static inline int enable_kprobe(struct kprobe *kp) { return -EOPNOTSUPP; }

/* !CONFIG_KPROBES: every address is reported as blacklisted. */
static inline bool within_kprobe_blacklist(unsigned long addr) { return true; }

/* !CONFIG_KPROBES: there are no kprobe symbols, so every index is out of range. */
static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
                                     char *type, char *sym) { return -ERANGE; }
#endif /* CONFIG_KPROBES */

/* Disable a kretprobe by disabling the kprobe embedded in it. */
static inline int disable_kretprobe(struct kretprobe *rp)
{
        struct kprobe *kp = &rp->kp;

        return disable_kprobe(kp);
}
/* Enable a kretprobe by enabling the kprobe embedded in it. */
static inline int enable_kretprobe(struct kretprobe *rp)
{
        struct kprobe *kp = &rp->kp;

        return enable_kprobe(kp);
}

#ifndef CONFIG_KPROBES
/* !CONFIG_KPROBES: no address is a kprobe insn slot. */
static inline bool is_kprobe_insn_slot(unsigned long addr) { return false; }
#endif /* !CONFIG_KPROBES */

#ifndef CONFIG_OPTPROBES
/* !CONFIG_OPTPROBES: no address is an optimized-insn slot. */
static inline bool is_kprobe_optinsn_slot(unsigned long addr) { return false; }
#endif /* !CONFIG_OPTPROBES */

#ifdef CONFIG_KRETPROBES
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
/* With rethook, the kretprobe trampoline check is is_rethook_trampoline(). */
static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr)
{
        bool on_trampoline = is_rethook_trampoline(addr);

        return on_trampoline;
}

/* With rethook, forward to rethook_find_ret_addr(), passing @fp as an integer. */
static nokprobe_inline
unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
                                      struct llist_node **cur)
{
        unsigned long frame = (unsigned long)fp;

        return rethook_find_ret_addr(tsk, frame, cur);
}
#else
/* Does @addr equal the address returned by kretprobe_trampoline_addr()? */
static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr)
{
        void *trampoline = kretprobe_trampoline_addr();

        return trampoline == (void *)addr;
}

/*
 * Non-rethook kretprobes: defined elsewhere.
 * NOTE(review): presumably @cur is an iteration cursor over the task's
 * kretprobe instances - confirm against the definition.
 */
unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
                                      struct llist_node **cur);
#endif
#else
/* !CONFIG_KRETPROBES: no address is a kretprobe trampoline. */
static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) { return false; }

/* !CONFIG_KRETPROBES: always returns 0. */
static nokprobe_inline
unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
                                      struct llist_node **cur) { return 0; }
#endif

/*
 * Returns true if kprobes handled the fault.
 *
 * The fault is passed to kprobe_fault_handler() only when CONFIG_KPROBES is
 * enabled, @regs is not user mode, the context is non-preemptible and a
 * kprobe is currently running; otherwise false is returned.
 */
static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
                                              unsigned int trap)
{
        if (!IS_ENABLED(CONFIG_KPROBES) || user_mode(regs))
                return false;

        /*
         * To be potentially processing a kprobe fault and to be allowed
         * to call kprobe_running(), we have to be non-preemptible, so
         * preemptible() is checked before kprobe_running().
         */
        if (preemptible() || !kprobe_running())
                return false;

        return kprobe_fault_handler(regs, trap);
}

#endif /* _LINUX_KPROBES_H */































































    1 





    2 
    2 















































































































    1 


























    1 

   57 






































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_FIND_H_
#define __LINUX_FIND_H_

#ifndef __LINUX_BITMAP_H
#error only <linux/bitmap.h> can be included directly
#endif

#include <linux/bitops.h>

/*
 * Out-of-line search helpers, defined elsewhere. The inline find_*() wrappers
 * in this header call them when the single-word inline path does not apply.
 */
unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits,
                                unsigned long start);
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
                                         unsigned long start);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n);
unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n);
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long size, unsigned long n);
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        const unsigned long *addr3, unsigned long size,
                                        unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
                                         const unsigned long *addr2, unsigned long size);
unsigned long _find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                 unsigned long size);
unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                      const unsigned long *addr3, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);

#ifdef __BIG_ENDIAN
/* Little-endian (_le) search helpers; only declared when __BIG_ENDIAN is defined. */
unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size);
unsigned long _find_next_zero_bit_le(const  unsigned long *addr, unsigned
                                        long size, unsigned long offset);
unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
                                long size, unsigned long offset);
#endif

/*
 * Defined elsewhere.
 * NOTE(review): presumably returns the index of a randomly chosen set bit
 * within @size bits - confirm against the definition.
 */
unsigned long find_random_bit(const unsigned long *addr, unsigned long size);

#ifndef find_next_bit
/**
 * find_next_bit - find the next set bit in a memory region
 * @addr: The address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Returns the bit number of the first set bit at or after @offset.
 * If there is no such bit, or @offset >= @size on the inline path,
 * returns @size.
 */
static __always_inline
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
                            unsigned long offset)
{
        unsigned long word;

        /* Anything but the small-constant-size case goes out of line. */
        if (!small_const_nbits(size))
                return _find_next_bit(addr, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Single-word path: only *addr is read, masked by GENMASK(size - 1, offset). */
        word = *addr & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_and_bit
/**
 * find_next_and_bit - find the next set bit in both memory regions
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Returns the bit number of the first bit at or after @offset that is set in
 * both bitmaps. If there is no such bit, or @offset >= @size on the inline
 * path, returns @size.
 */
static __always_inline
unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* Anything but the small-constant-size case goes out of line. */
        if (!small_const_nbits(size))
                return _find_next_and_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Single-word path: AND the two words, then apply GENMASK(size - 1, offset). */
        word = *addr1 & *addr2 & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_andnot_bit
/**
 * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits
 *                        in *addr2
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Returns the bit number of the first bit at or after @offset that is set in
 * *addr1 and clear in *addr2. If there is no such bit, or @offset >= @size on
 * the inline path, returns @size.
 */
static __always_inline
unsigned long find_next_andnot_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* Anything but the small-constant-size case goes out of line. */
        if (!small_const_nbits(size))
                return _find_next_andnot_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Single-word path: drop the bits of *addr2, then apply GENMASK(size - 1, offset). */
        word = *addr1 & ~*addr2 & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_or_bit
/**
 * find_next_or_bit - find the next set bit in either memory regions
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Returns the bit number of the first bit at or after @offset that is set in
 * at least one of the bitmaps. If there is no such bit, or @offset >= @size
 * on the inline path, returns @size.
 */
static __always_inline
unsigned long find_next_or_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* Anything but the small-constant-size case goes out of line. */
        if (!small_const_nbits(size))
                return _find_next_or_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Single-word path: OR the two words, then apply GENMASK(size - 1, offset). */
        word = (*addr1 | *addr2) & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_zero_bit
/**
 * find_next_zero_bit - find the next cleared bit in a memory region
 * @addr: The address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Returns the bit number of the first zero bit at or after @offset.
 * If there is no such bit, or @offset >= @size on the inline path,
 * returns @size.
 */
static __always_inline
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
                                 unsigned long offset)
{
        unsigned long word;

        /* Anything but the small-constant-size case goes out of line. */
        if (!small_const_nbits(size))
                return _find_next_zero_bit(addr, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Force every bit outside GENMASK(size - 1, offset) to 1 so it cannot match. */
        word = *addr | ~GENMASK(size - 1, offset);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_first_bit
/**
 * find_first_bit - find the first set bit in a bitmap
 * @addr: The bitmap to search
 * @size: The maximum number of bits to search
 *
 * Returns the bit number of the first set bit.
 * If no bits are set, returns @size.
 */
static __always_inline
unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_first_bit(addr, size);

        /* Fast path: only the first word is examined, limited to @size bits. */
        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

/**
 * find_nth_bit - find the N'th set bit in a bitmap
 * @addr: The bitmap to search
 * @size: The maximum number of bits to search
 * @n: Ordinal of the wanted set bit, counting from 0
 *
 * With @n == 0 this is semantically equivalent to find_first_bit():
 *         idx = find_nth_bit(addr, size, 0);
 *         idx = find_first_bit(addr, size);
 *
 * Returns the bit number of the N'th set bit.
 * If there is no such bit, returns a value >= @size.
 */
static __always_inline
unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
{
        unsigned long word;

        /* An ordinal of @size or more is rejected before any bitmap access. */
        if (n >= size)
                return size;

        if (!small_const_nbits(size))
                return __find_nth_bit(addr, size, n);

        /* Fast path: only the first word is examined, limited to @size bits. */
        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

/**
 * find_nth_and_bit - find the N'th bit that is set in both bitmaps
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @size: The maximum number of bits to search
 * @n: Ordinal of the wanted set bit, counting from 0
 *
 * Returns the bit number of the N'th bit set in both @addr1 and @addr2.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n)
{
        unsigned long word;

        /* An ordinal of @size or more is rejected before any bitmap access. */
        if (n >= size)
                return size;

        if (!small_const_nbits(size))
                return __find_nth_and_bit(addr1, addr2, size, n);

        /* Fast path: intersect the first word of each bitmap. */
        word = *addr1 & *addr2 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

/**
 * find_nth_and_andnot_bit - find the N'th bit set in two bitmaps and
 *                           clear in a third one
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @addr3: The bitmap whose set bits are excluded from the search
 * @size: The maximum number of bits to search
 * @n: Ordinal of the wanted set bit, counting from 0
 *
 * Returns the bit number of the N'th bit that is set in both @addr1 and
 * @addr2 and not set in @addr3.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        const unsigned long *addr3,
                                        unsigned long size, unsigned long n)
{
        unsigned long word;

        /* An ordinal of @size or more is rejected before any bitmap access. */
        if (n >= size)
                return size;

        if (!small_const_nbits(size))
                return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);

        /* Fast path: combine the first word of each of the three bitmaps. */
        word = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

#ifndef find_first_and_bit
/**
 * find_first_and_bit - find the first bit that is set in both bitmaps
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @size: The bitmap size in bits
 *
 * Returns the bit number of the first bit set in both @addr1 and @addr2.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_first_and_bit(const unsigned long *addr1,
                                 const unsigned long *addr2,
                                 unsigned long size)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_first_and_bit(addr1, addr2, size);

        /* Fast path: intersect the first word of each bitmap. */
        word = *addr1 & *addr2 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

/**
 * find_first_andnot_bit - find the first bit set in the 1st bitmap and
 *                         unset in the 2nd
 * @addr1: The bitmap in which the bit must be set
 * @addr2: The bitmap in which the bit must be clear
 * @size: The bitmap size in bits
 *
 * Returns the bit number of the first matching bit.
 * If there is no such bit, returns a value >= @size.
 */
static __always_inline
unsigned long find_first_andnot_bit(const unsigned long *addr1,
                                 const unsigned long *addr2,
                                 unsigned long size)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_first_andnot_bit(addr1, addr2, size);

        /* Fast path: first word of @addr1 with the bits of @addr2 removed. */
        word = *addr1 & (~*addr2) & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}

/**
 * find_first_and_and_bit - find the first bit that is set in all 3 bitmaps
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @addr3: The third bitmap to search
 * @size: The bitmap size in bits
 *
 * Returns the bit number of the first bit set in @addr1, @addr2 and @addr3.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_first_and_and_bit(const unsigned long *addr1,
                                     const unsigned long *addr2,
                                     const unsigned long *addr3,
                                     unsigned long size)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_first_and_and_bit(addr1, addr2, addr3, size);

        /* Fast path: intersect the first word of each of the three bitmaps. */
        word = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}

#ifndef find_first_zero_bit
/**
 * find_first_zero_bit - find the first cleared bit in a bitmap
 * @addr: The bitmap to search
 * @size: The maximum number of bits to search
 *
 * Returns the bit number of the first cleared bit.
 * If no bits are zero, returns @size.
 */
static __always_inline
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_first_zero_bit(addr, size);

        /* Force every bit outside the searched range to 1 so it is never reported. */
        word = *addr | ~GENMASK(size - 1, 0);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_last_bit
/**
 * find_last_bit - find the last set bit in a bitmap
 * @addr: The bitmap to search
 * @size: The number of bits to search
 *
 * Returns the bit number of the last set bit, or @size if no bit is set.
 */
static __always_inline
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_last_bit(addr, size);

        /* Fast path: only the first word is examined, limited to @size bits. */
        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __fls(word);
}
#endif

/**
 * find_next_and_bit_wrap - find the next bit set in both bitmaps, wrapping
 *                          around the end of the bitmap
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bit number to start searching at
 *
 * Searches [@offset, @size) first and, if nothing is found there, [0, @offset).
 * Returns the bit number found, or @size if no bit is set in both bitmaps.
 */
static __always_inline
unsigned long find_next_and_bit_wrap(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        unsigned long size, unsigned long offset)
{
        unsigned long found = find_next_and_bit(addr1, addr2, size, offset);

        /* A hit in the upper part needs no wrap-around. */
        if (found < size)
                return found;

        /* Starting at 0 already covered the whole bitmap. */
        if (offset == 0)
                return found;

        /* Wrap around: search the bits below @offset. */
        found = find_first_and_bit(addr1, addr2, offset);
        if (found >= offset)
                return size;

        return found;
}

/**
 * find_next_bit_wrap - find the next set bit in a bitmap, wrapping around
 *                      the end of the bitmap
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bit number to start searching at
 *
 * Searches [@offset, @size) first and, if nothing is found there, [0, @offset).
 * Returns the bit number found, or @size if no bits are set.
 */
static __always_inline
unsigned long find_next_bit_wrap(const unsigned long *addr,
                                        unsigned long size, unsigned long offset)
{
        unsigned long found = find_next_bit(addr, size, offset);

        /* A hit in the upper part needs no wrap-around. */
        if (found < size)
                return found;

        /* Starting at 0 already covered the whole bitmap. */
        if (offset == 0)
                return found;

        /* Wrap around: search the bits below @offset. */
        found = find_first_bit(addr, offset);
        if (found >= offset)
                return size;

        return found;
}

/*
 * Iteration step for for_each_set_bit_wrap(): given the traversal origin
 * @start and the next candidate position @n, return the next set bit, or
 * @size once the traversal is complete. Make sure you're doing the right
 * thing before using it alone.
 */
static __always_inline
unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size,
                                 unsigned long start, unsigned long n)
{
        unsigned long pos;

        /* @n above @start means the traversal has not wrapped around yet. */
        if (n > start) {
                pos = find_next_bit(bitmap, size, n);
                if (pos < size)
                        return pos;

                /* Upper part exhausted: continue from the bitmap start. */
                n = 0;
        }

        /* Search the part below @start; reaching @start ends the traversal. */
        pos = find_next_bit(bitmap, start, n);
        if (pos >= start)
                return size;

        return pos;
}

/**
 * find_next_clump8 - find next 8-bit clump with set bits in a memory region
 * @clump: location to store copy of found clump
 * @addr: address to base the search on
 * @size: bitmap size in number of bits
 * @offset: bit offset at which to start searching
 *
 * Returns the bit offset for the next set clump; the found clump value is
 * copied to the location pointed to by @clump. If no bits are set, returns
 * @size.
 */
extern unsigned long find_next_clump8(unsigned long *clump,
                                      const unsigned long *addr,
                                      unsigned long size, unsigned long offset);

/*
 * find_first_clump8 - find_next_clump8() with the search starting at bit
 * offset 0; arguments and return value are those of find_next_clump8().
 */
#define find_first_clump8(clump, bits, size) \
        find_next_clump8((clump), (bits), (size), 0)

#if defined(__LITTLE_ENDIAN)

/*
 * Little-endian build (__LITTLE_ENDIAN is defined): forwards to
 * find_next_zero_bit() with unchanged arguments.
 */
static __always_inline
unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset)
{
        return find_next_zero_bit(addr, size, offset);
}

/*
 * Little-endian build (__LITTLE_ENDIAN is defined): forwards to
 * find_next_bit() with unchanged arguments.
 */
static __always_inline
unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset)
{
        return find_next_bit(addr, size, offset);
}

/*
 * Little-endian build (__LITTLE_ENDIAN is defined): forwards to
 * find_first_zero_bit() with unchanged arguments.
 */
static __always_inline
unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
{
        return find_first_zero_bit(addr, size);
}

#elif defined(__BIG_ENDIAN)

#ifndef find_next_zero_bit_le
/*
 * Big-endian build: find the next cleared bit, starting at @offset, in a
 * bitmap whose words are byte-swapped with swab() before being examined.
 * Returns @size if no cleared bit is found.
 */
static __always_inline
unsigned long find_next_zero_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_next_zero_bit_le(addr, size, offset);

        word = *(const unsigned long *)addr;

        if (unlikely(offset >= size))
                return size;

        /* Swap to CPU order, then force bits outside the range to 1. */
        word = swab(word) | ~GENMASK(size - 1, offset);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_first_zero_bit_le
/*
 * Big-endian build: find the first cleared bit in a bitmap whose words are
 * byte-swapped with swab() before being examined.
 * Returns @size if no cleared bit is found.
 */
static __always_inline
unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_first_zero_bit_le(addr, size);

        /* Swap the first word, then force bits outside the range to 1. */
        word = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_next_bit_le
/*
 * Big-endian build: find the next set bit, starting at @offset, in a bitmap
 * whose words are byte-swapped with swab() before being examined.
 * Returns @size if no set bit is found.
 */
static __always_inline
unsigned long find_next_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        unsigned long word;

        /* When small_const_nbits() does not hold, defer to the out-of-line helper. */
        if (!small_const_nbits(size))
                return _find_next_bit_le(addr, size, offset);

        word = *(const unsigned long *)addr;

        if (unlikely(offset >= size))
                return size;

        /* Swap to CPU order, then keep only bits [offset, size - 1]. */
        word = swab(word) & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#else
#error "Please fix <asm/byteorder.h>"
#endif

/**
 * for_each_set_bit - iterate over all set bits of a bitmap
 * @bit: iterator variable; assigned the number of each set bit in turn
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * When the loop terminates normally, @bit is >= @size.
 */
#define for_each_set_bit(bit, addr, size) \
        for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/**
 * for_each_and_bit - iterate over all bits that are set in both bitmaps
 * @bit: iterator variable; assigned the number of each matching bit in turn
 * @addr1: first bitmap address to base the search on
 * @addr2: second bitmap address to base the search on
 * @size: bitmap size in number of bits
 */
#define for_each_and_bit(bit, addr1, addr2, size) \
        for ((bit) = 0; \
             (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), \
             (bit) < (size); \
             (bit)++)

/**
 * for_each_andnot_bit - iterate over all bits set in the first bitmap and
 * clear in the second
 * @bit: iterator variable; assigned the number of each matching bit in turn
 * @addr1: bitmap in which the bit must be set
 * @addr2: bitmap in which the bit must be clear
 * @size: bitmap size in number of bits
 */
#define for_each_andnot_bit(bit, addr1, addr2, size) \
        for ((bit) = 0; \
             (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), \
             (bit) < (size); \
             (bit)++)

/**
 * for_each_or_bit - iterate over all bits set in at least one of two bitmaps
 * @bit: iterator variable; assigned the number of each matching bit in turn
 * @addr1: first bitmap address to base the search on
 * @addr2: second bitmap address to base the search on
 * @size: bitmap size in number of bits
 */
#define for_each_or_bit(bit, addr1, addr2, size) \
        for ((bit) = 0; \
             (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), \
             (bit) < (size); \
             (bit)++)

/*
 * Same as for_each_set_bit(), but the search starts at the current value of
 * @bit instead of 0, so @bit must be initialized by the caller.
 */
#define for_each_set_bit_from(bit, addr, size) \
        for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/**
 * for_each_clear_bit - iterate over all cleared bits of a bitmap
 * @bit: iterator variable; assigned the number of each cleared bit in turn
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 */
#define for_each_clear_bit(bit, addr, size) \
        for ((bit) = 0; \
             (bit) = find_next_zero_bit((addr), (size), (bit)), \
             (bit) < (size); \
             (bit)++)

/*
 * Same as for_each_clear_bit(), but the search starts at the current value of
 * @bit instead of 0, so @bit must be initialized by the caller.
 */
#define for_each_clear_bit_from(bit, addr, size) \
        for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/**
 * for_each_set_bitrange - iterate over all set bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first set bit)
 * @e: bit offset of end of current bitrange (first unset bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Both @b and @e are assigned by the macro and must be lvalues. Every macro
 * argument is parenthesized at each use.
 */
#define for_each_set_bitrange(b, e, addr, size) \
        for ((b) = 0; \
             (b) = find_next_bit((addr), (size), (b)), \
             (e) = find_next_zero_bit((addr), (size), (b) + 1), \
             (b) < (size); \
             (b) = (e) + 1)

/**
 * for_each_set_bitrange_from - iterate over all set bit ranges [b; e),
 * starting the search at the current value of @b
 * @b: bit offset of start of current bitrange (first set bit); must be initialized
 * @e: bit offset of end of current bitrange (first unset bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 */
#define for_each_set_bitrange_from(b, e, addr, size) \
        for (; \
             (b) = find_next_bit((addr), (size), (b)), \
             (e) = find_next_zero_bit((addr), (size), (b) + 1), \
             (b) < (size); \
             (b) = (e) + 1)

/**
 * for_each_clear_bitrange - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit)
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 */
#define for_each_clear_bitrange(b, e, addr, size) \
        for ((b) = 0; \
             (b) = find_next_zero_bit((addr), (size), (b)), \
             (e) = find_next_bit((addr), (size), (b) + 1), \
             (b) < (size); \
             (b) = (e) + 1)

/**
 * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e),
 * starting the search at the current value of @b
 * @b: bit offset of start of current bitrange (first unset bit); must be initialized
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 */
#define for_each_clear_bitrange_from(b, e, addr, size) \
        for (; \
             (b) = find_next_zero_bit((addr), (size), (b)), \
             (e) = find_next_bit((addr), (size), (b) + 1), \
             (b) < (size); \
             (b) = (e) + 1)

/**
 * for_each_set_bit_wrap - iterate over all set bits starting from @start,
 * wrapping around the end of the bitmap
 * @bit: iterator variable; assigned the number of each set bit in turn
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 * @start: bit at which the traversal begins before wrapping around the
 *         bitmap end
 */
#define for_each_set_bit_wrap(bit, addr, size, start) \
        for ((bit) = find_next_bit_wrap((addr), (size), (start)); \
             (bit) < (size); \
             (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))

/**
 * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
 * @start: receives the bit offset of the current clump; it is assigned by
 *         find_first_clump8() before the first iteration, so the search
 *         begins at bit 0 regardless of its initial value
 * @clump: location to store copy of current 8-bit clump
 * @bits: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Each step resumes the search 8 bits past the current offset.
 */
#define for_each_set_clump8(start, clump, bits, size) \
        for ((start) = find_first_clump8(&(clump), (bits), (size)); \
             (start) < (size); \
             (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))

#endif /*__LINUX_FIND_H_ */















































































































































































































































































































































    4 







    4 



    4 
    4 



    4 

    4 















    4 































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions
 *
 * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2014 Red Hat Inc.
 * Copyright 2025 Google LLC
 */

#include <crypto/hmac.h>
#include <crypto/sha2.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>

/* Initial block state loaded by sha224_init(): the eight SHA224_H* words. */
static const struct sha256_block_state sha224_iv = {
        .h = {
                [0] = SHA224_H0,
                [1] = SHA224_H1,
                [2] = SHA224_H2,
                [3] = SHA224_H3,
                [4] = SHA224_H4,
                [5] = SHA224_H5,
                [6] = SHA224_H6,
                [7] = SHA224_H7,
        },
};

/*
 * A freshly initialized SHA-256 context: the eight SHA256_H* words and a
 * byte count of zero. sha256_finup_2x() falls back to it when given a NULL
 * context.
 */
static const struct sha256_ctx initial_sha256_ctx = {
        .ctx.state.h = {
                [0] = SHA256_H0,
                [1] = SHA256_H1,
                [2] = SHA256_H2,
                [3] = SHA256_H3,
                [4] = SHA256_H4,
                [5] = SHA256_H5,
                [6] = SHA256_H6,
                [7] = SHA256_H7,
        },
        .ctx.bytecount = 0,
};

/* The SHA-256 initial block state is the state member of the context above. */
#define sha256_iv (initial_sha256_ctx.ctx.state)

/*
 * Per-round additive constants: SHA256_ROUND() adds sha256_K[i] in round i,
 * for i in 0..63.
 * NOTE(review): these look like the K constants of FIPS 180-4 for
 * SHA-224/SHA-256 -- confirm against the specification before editing.
 */
static const u32 sha256_K[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
        0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
        0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
        0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
        0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
        0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

/* Ch: per bit, yields the bit of y where x is 1 and the bit of z where x is 0. */
#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
/* Maj: per bit, yields 1 when at least two of x, y and z are 1. */
#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
/* e0, e1: XOR of three ror32() results of x; used by SHA256_ROUND(). */
#define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
#define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
/* s0, s1: XOR of two ror32() results and one right shift of x; used by BLEND_OP(). */
#define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
#define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))

/*
 * Load message word @I of the block at @input into W[@I].
 *
 * The word is read as a big-endian 32-bit value with get_unaligned_be32(),
 * so @input needs no particular alignment. The address is computed with byte
 * arithmetic on the const u8 pointer: this keeps the const qualifier and
 * avoids forming a u32 pointer from a possibly misaligned address.
 */
static inline void LOAD_OP(int I, u32 *W, const u8 *input)
{
        W[I] = get_unaligned_be32(input + I * (int)sizeof(u32));
}

/*
 * Compute schedule word W[@I] from four earlier words: W[@I - 2] through
 * s1(), W[@I - 15] through s0(), plus W[@I - 7] and W[@I - 16] unchanged.
 * The caller must ensure @I is at least 16.
 */
static inline void BLEND_OP(int I, u32 *W)
{
        const u32 sig1 = s1(W[I - 2]);
        const u32 sig0 = s0(W[I - 15]);

        W[I] = sig1 + W[I - 7] + sig0 + W[I - 16];
}

/*
 * One compression round, number @i. Reads all eight working variables plus
 * sha256_K[i] and W[i] (W must be in scope at the expansion site), and
 * updates only @d (d += t1) and @h (h = t1 + t2). The caller rotates the
 * argument order between rounds rather than moving values between variables.
 */
#define SHA256_ROUND(i, a, b, c, d, e, f, g, h)                    \
        do {                                                       \
                u32 t1, t2;                                        \
                t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \
                t2 = e0(a) + Maj(a, b, c);                         \
                d += t1;                                           \
                h = t1 + t2;                                       \
        } while (0)

/*
 * Process one message block with the portable C implementation.
 *
 * @state: the eight-word hash state; updated in place
 * @input: the block to process; 16 big-endian 32-bit words are read from it
 *         with unaligned loads
 * @W:     caller-provided 64-word scratch array; on return it holds the
 *         expanded message schedule, which the caller is responsible for
 *         wiping
 *
 * All three loops are manually unrolled by a factor of 8.
 */
static void sha256_block_generic(struct sha256_block_state *state,
                                 const u8 *input, u32 W[64])
{
        u32 a, b, c, d, e, f, g, h;
        int i;

        /* load the input */
        for (i = 0; i < 16; i += 8) {
                LOAD_OP(i + 0, W, input);
                LOAD_OP(i + 1, W, input);
                LOAD_OP(i + 2, W, input);
                LOAD_OP(i + 3, W, input);
                LOAD_OP(i + 4, W, input);
                LOAD_OP(i + 5, W, input);
                LOAD_OP(i + 6, W, input);
                LOAD_OP(i + 7, W, input);
        }

        /* now blend: derive W[16..63] from the earlier schedule words */
        for (i = 16; i < 64; i += 8) {
                BLEND_OP(i + 0, W);
                BLEND_OP(i + 1, W);
                BLEND_OP(i + 2, W);
                BLEND_OP(i + 3, W);
                BLEND_OP(i + 4, W);
                BLEND_OP(i + 5, W);
                BLEND_OP(i + 6, W);
                BLEND_OP(i + 7, W);
        }

        /* load the state into our registers */
        a = state->h[0];
        b = state->h[1];
        c = state->h[2];
        d = state->h[3];
        e = state->h[4];
        f = state->h[5];
        g = state->h[6];
        h = state->h[7];

        /*
         * now iterate: the argument list is rotated by one position each
         * round, so after 8 rounds every variable is back in its original
         * role
         */
        for (i = 0; i < 64; i += 8) {
                SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h);
                SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g);
                SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f);
                SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e);
                SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d);
                SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c);
                SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b);
                SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
        }

        /* add the working variables back into the hash state */
        state->h[0] += a;
        state->h[1] += b;
        state->h[2] += c;
        state->h[3] += d;
        state->h[4] += e;
        state->h[5] += f;
        state->h[6] += g;
        state->h[7] += h;
}

/*
 * Process @nblocks consecutive SHA256_BLOCK_SIZE-byte blocks starting at
 * @data, updating @state in place. The first block is processed before the
 * count is examined, so @nblocks must be at least 1. The on-stack message
 * schedule is wiped before returning.
 */
static void __maybe_unused
sha256_blocks_generic(struct sha256_block_state *state,
                      const u8 *data, size_t nblocks)
{
        u32 schedule[64];

        for (;;) {
                sha256_block_generic(state, data, schedule);
                data += SHA256_BLOCK_SIZE;
                if (--nblocks == 0)
                        break;
        }

        memzero_explicit(schedule, sizeof(schedule));
}

/*
 * Select the block function. With CONFIG_CRYPTO_LIB_SHA256_ARCH set and
 * __DISABLE_EXPORTS not defined, "sha256.h" is included and is expected to
 * provide sha256_blocks(); otherwise sha256_blocks is an alias for the
 * generic implementation above.
 */
#if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS)
#include "sha256.h" /* $(SRCARCH)/sha256.h */
#else
#define sha256_blocks sha256_blocks_generic
#endif

/*
 * Common initializer for SHA-224 and SHA-256 contexts.
 *
 * @ctx:               context to set up
 * @iv:                initial block state, copied into the context
 * @initial_bytecount: starting value of the processed-byte counter
 */
static void __sha256_init(struct __sha256_ctx *ctx,
                          const struct sha256_block_state *iv,
                          u64 initial_bytecount)
{
        ctx->bytecount = initial_bytecount;
        ctx->state = *iv;
}

/*
 * Prepare @ctx for hashing a new message with SHA-224: the state is set to
 * sha224_iv and the byte count to 0.
 */
void sha224_init(struct sha224_ctx *ctx)
{
        __sha256_init(&ctx->ctx, &sha224_iv, 0);
}
EXPORT_SYMBOL_GPL(sha224_init);

/*
 * Prepare @ctx for hashing a new message with SHA-256: the state is set to
 * sha256_iv and the byte count to 0.
 */
void sha256_init(struct sha256_ctx *ctx)
{
        __sha256_init(&ctx->ctx, &sha256_iv, 0);
}
EXPORT_SYMBOL_GPL(sha256_init);

/*
 * Feed @len bytes at @data into the hash.
 *
 * Bytes that do not complete a SHA256_BLOCK_SIZE block are kept in ctx->buf.
 * Once at least one block is available, a partially filled buffer is topped
 * up and processed first, then all further whole blocks are processed
 * straight from @data, and any remainder is stored at the start of ctx->buf.
 */
void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len)
{
        size_t buffered = ctx->bytecount % SHA256_BLOCK_SIZE;
        size_t nblocks;

        ctx->bytecount += len;

        /* Not enough data for a whole block: just append to the buffer. */
        if (buffered + len < SHA256_BLOCK_SIZE) {
                if (len)
                        memcpy(&ctx->buf[buffered], data, len);
                return;
        }

        /* Complete and process the pending partial block, if any. */
        if (buffered) {
                size_t fill = SHA256_BLOCK_SIZE - buffered;

                memcpy(&ctx->buf[buffered], data, fill);
                data += fill;
                len -= fill;

                sha256_blocks(&ctx->state, ctx->buf, 1);
        }

        /* Process all remaining whole blocks directly from the input. */
        nblocks = len / SHA256_BLOCK_SIZE;
        len %= SHA256_BLOCK_SIZE;
        if (nblocks) {
                sha256_blocks(&ctx->state, data, nblocks);
                data += nblocks * SHA256_BLOCK_SIZE;
        }

        /* Keep the tail for the next call; the buffer is empty at this point. */
        if (len)
                memcpy(&ctx->buf[0], data, len);
}
EXPORT_SYMBOL(__sha256_update);

/*
 * Finish the hash: append the padding and the message length, process the
 * final block(s), and write the first @digest_size bytes of the state to
 * @out as big-endian 32-bit words.
 *
 * The context is not wiped here; callers that need that do it themselves.
 */
static void __sha256_final(struct __sha256_ctx *ctx,
                           u8 *out, size_t digest_size)
{
        const u64 bitcount = ctx->bytecount << 3;
        size_t used = ctx->bytecount % SHA256_BLOCK_SIZE;

        /* The padding always starts with a single 0x80 byte. */
        ctx->buf[used] = 0x80;
        used++;

        /* No room left for the 8-byte length: pad out and flush this block. */
        if (used > SHA256_BLOCK_SIZE - 8) {
                memset(&ctx->buf[used], 0, SHA256_BLOCK_SIZE - used);
                sha256_blocks(&ctx->state, ctx->buf, 1);
                used = 0;
        }

        /* Zero-fill up to the length field, then store the bit count big-endian. */
        memset(&ctx->buf[used], 0, SHA256_BLOCK_SIZE - 8 - used);
        *(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
        sha256_blocks(&ctx->state, ctx->buf, 1);

        /* Emit one state word per 4 output bytes. */
        for (size_t w = 0; w * 4 < digest_size; w++)
                put_unaligned_be32(ctx->state.h[w], out + w * 4);
}

/*
 * Finish a SHA-224 computation: write the SHA224_DIGEST_SIZE-byte digest to
 * @out, then wipe the whole context with memzero_explicit().
 */
void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE])
{
        __sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
        memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL(sha224_final);

/*
 * Finish a SHA-256 computation: write the SHA256_DIGEST_SIZE-byte digest to
 * @out, then wipe the whole context with memzero_explicit().
 */
void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE])
{
        __sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
        memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL(sha256_final);

/*
 * One-shot SHA-224: hash the @len bytes at @data and write the digest to
 * @out, using a context on the stack (sha224_final() wipes it).
 */
void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE])
{
        struct sha224_ctx ctx;

        sha224_init(&ctx);
        sha224_update(&ctx, data, len);
        sha224_final(&ctx, out);
}
EXPORT_SYMBOL(sha224);

/*
 * One-shot SHA-256: hash the @len bytes at @data and write the digest to
 * @out, using a context on the stack (sha256_final() wipes it).
 */
void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
{
        struct sha256_ctx ctx;

        sha256_init(&ctx);
        sha256_update(&ctx, data, len);
        sha256_final(&ctx, out);
}
EXPORT_SYMBOL(sha256);

/*
 * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
 * doesn't need either HMAC support or interleaved hashing support
 */
#ifndef __DISABLE_EXPORTS

#ifndef sha256_finup_2x_arch
/*
 * Fallback used when sha256_finup_2x_arch is not defined as a macro: always
 * returns false and touches none of its arguments, so sha256_finup_2x()
 * takes its sequential path.
 */
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
                                 const u8 *data1, const u8 *data2, size_t len,
                                 u8 out1[SHA256_DIGEST_SIZE],
                                 u8 out2[SHA256_DIGEST_SIZE])
{
        return false;
}
/* Fallback companion of the stub above: reports that no optimized path exists. */
static bool sha256_finup_2x_is_optimized_arch(void)
{
        return false;
}
#endif

/*
 * Sequential fallback implementation of sha256_finup_2x(): finish the hash
 * of @data1 and then of @data2, each starting from its own copy of @ctx, so
 * that @ctx itself is never modified.
 */
static noinline_for_stack void sha256_finup_2x_sequential(
        const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
        size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
{
        struct __sha256_ctx scratch = *ctx;

        __sha256_update(&scratch, data1, len);
        __sha256_final(&scratch, out1, SHA256_DIGEST_SIZE);

        /* Restart from the caller's context for the second message. */
        scratch = *ctx;
        __sha256_update(&scratch, data2, len);
        __sha256_final(&scratch, out2, SHA256_DIGEST_SIZE);
}

/*
 * Finish two SHA-256 computations that share the starting context @ctx and
 * the data length @len, writing the digests to @out1 and @out2.
 *
 * A NULL @ctx selects initial_sha256_ctx. The arch hook is tried first; if
 * it returns false, the sequential fallback does the work.
 */
void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
                     const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
                     u8 out2[SHA256_DIGEST_SIZE])
{
        const struct __sha256_ctx *base;

        if (ctx)
                base = &ctx->ctx;
        else
                base = &initial_sha256_ctx.ctx;

        if (likely(sha256_finup_2x_arch(base, data1, data2, len, out1, out2)))
                return;

        sha256_finup_2x_sequential(base, data1, data2, len, out1, out2);
}
EXPORT_SYMBOL_GPL(sha256_finup_2x);

/*
 * Exported wrapper that returns whatever
 * sha256_finup_2x_is_optimized_arch() reports.
 */
bool sha256_finup_2x_is_optimized(void)
{
        return sha256_finup_2x_is_optimized_arch();
}
EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);

/*
 * Derive the inner and outer HMAC starting states from a raw key.
 *
 * @istate:      receives @iv after absorbing one block of (key XOR ipad)
 * @ostate:      receives @iv after absorbing one block of (key XOR opad)
 * @raw_key:     the key bytes
 * @raw_key_len: number of bytes at @raw_key
 * @iv:          initial hash state; when it is &sha224_iv an over-long key is
 *               shortened with sha224(), otherwise with sha256()
 *
 * The temporary block holding key-derived bytes is wiped before returning.
 */
static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
                                     struct sha256_block_state *ostate,
                                     const u8 *raw_key, size_t raw_key_len,
                                     const struct sha256_block_state *iv)
{
        /*
         * One hash block of key material.  'b' gives byte access for copying
         * and hashing; 'w' overlays the same bytes so the pad XORs below can
         * work one unsigned long at a time.  Zero-initialized, so a key
         * shorter than a block ends up zero-padded.
         */
        union {
                u8 b[SHA256_BLOCK_SIZE];
                unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)];
        } derived_key = { 0 };

        /* A key longer than one block is replaced by its digest. */
        if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) {
                if (iv == &sha224_iv)
                        sha224(raw_key, raw_key_len, derived_key.b);
                else
                        sha256(raw_key, raw_key_len, derived_key.b);
        } else {
                memcpy(derived_key.b, raw_key, raw_key_len);
        }

        /* Inner state: iv advanced over the block (key XOR ipad). */
        for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
                derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
        *istate = *iv;
        sha256_blocks(istate, derived_key.b, 1);

        /*
         * The block currently holds (key XOR ipad).  XORing it with
         * (opad XOR ipad) cancels the ipad and leaves (key XOR opad), so no
         * second copy of the key is needed for the outer state.
         */
        for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
                derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
                                                HMAC_IPAD_VALUE);
        *ostate = *iv;
        sha256_blocks(ostate, derived_key.b, 1);

        /* Do not leave key-derived bytes behind on the stack. */
        memzero_explicit(&derived_key, sizeof(derived_key));
}

/*
 * Fill @key with the HMAC-SHA224 inner and outer states derived from the
 * @raw_key_len bytes at @raw_key.  This is __hmac_sha256_preparekey() run
 * with sha224_iv as the initial hash state.
 */
void hmac_sha224_preparekey(struct hmac_sha224_key *key,
                            const u8 *raw_key, size_t raw_key_len)
{
        __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
                                 raw_key, raw_key_len, &sha224_iv);
}
EXPORT_SYMBOL_GPL(hmac_sha224_preparekey);

/*
 * Fill @key with the HMAC-SHA256 inner and outer states derived from the
 * @raw_key_len bytes at @raw_key.  This is __hmac_sha256_preparekey() run
 * with sha256_iv as the initial hash state.
 */
void hmac_sha256_preparekey(struct hmac_sha256_key *key,
                            const u8 *raw_key, size_t raw_key_len)
{
        __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
                                 raw_key, raw_key_len, &sha256_iv);
}
EXPORT_SYMBOL_GPL(hmac_sha256_preparekey);

/*
 * Begin an HMAC computation from a prepared key: the inner hash context is
 * initialized from key->istate, and key->ostate is copied into @ctx for use
 * by the final step.  @key itself is not modified.
 */
void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx,
                        const struct __hmac_sha256_key *key)
{
        /*
         * SHA256_BLOCK_SIZE is the same value the *_init_usingrawkey()
         * functions store in sha_ctx.bytecount; presumably it accounts for
         * the one key block already absorbed into istate.
         */
        __sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE);
        ctx->ostate = key->ostate;
}
EXPORT_SYMBOL_GPL(__hmac_sha256_init);

/*
 * Initialize an HMAC-SHA224 context directly from a raw key, without going
 * through a separate key struct: the derived inner state is written straight
 * into the context's hash state and the derived outer state into its ostate.
 */
void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx,
                                  const u8 *raw_key, size_t raw_key_len)
{
        __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
                                 raw_key, raw_key_len, &sha224_iv);
        /* One block (the ipad-masked key) has already been hashed. */
        ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey);

/*
 * Initialize an HMAC-SHA256 context directly from a raw key, without going
 * through a separate key struct: the derived inner state is written straight
 * into the context's hash state and the derived outer state into its ostate.
 */
void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx,
                                  const u8 *raw_key, size_t raw_key_len)
{
        __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
                                 raw_key, raw_key_len, &sha256_iv);
        /* One block (the ipad-masked key) has already been hashed. */
        ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey);

/*
 * Finish an HMAC computation and store @digest_size bytes of MAC at @out.
 *
 * The inner digest is written back into the context's own block buffer and
 * padded there into one complete block, which is then run through the saved
 * outer state.  The entire context is wiped before returning, so it cannot be
 * reused without being initialized again.
 */
static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx,
                                u8 *out, size_t digest_size)
{
        u8 *block = ctx->sha_ctx.buf;

        /*
         * Build the padded input for the outer hash in place: inner digest,
         * then zero fill with a 0x80 marker right after the digest, then the
         * big-endian bit count of (one key block + digest) in the last four
         * bytes.
         */
        __sha256_final(&ctx->sha_ctx, block, digest_size);
        memset(block + digest_size, 0, SHA256_BLOCK_SIZE - digest_size);
        block[digest_size] = 0x80;
        put_unaligned_be32(8 * (SHA256_BLOCK_SIZE + digest_size),
                           block + SHA256_BLOCK_SIZE - 4);

        /* The outer hash over that block is the HMAC value. */
        sha256_blocks(&ctx->ostate, block, 1);
        for (size_t off = 0; off < digest_size; off += 4)
                put_unaligned_be32(ctx->ostate.h[off / 4], out + off);

        memzero_explicit(ctx, sizeof(*ctx));
}

/*
 * Finish an HMAC-SHA224 computation: writes SHA224_DIGEST_SIZE bytes to @out
 * via __hmac_sha256_final(), which also zeroizes ctx->ctx.
 */
void hmac_sha224_final(struct hmac_sha224_ctx *ctx,
                       u8 out[SHA224_DIGEST_SIZE])
{
        __hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
}
EXPORT_SYMBOL_GPL(hmac_sha224_final);

/*
 * Finish an HMAC-SHA256 computation: writes SHA256_DIGEST_SIZE bytes to @out
 * via __hmac_sha256_final(), which also zeroizes ctx->ctx.
 */
void hmac_sha256_final(struct hmac_sha256_ctx *ctx,
                       u8 out[SHA256_DIGEST_SIZE])
{
        __hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
}
EXPORT_SYMBOL_GPL(hmac_sha256_final);

/*
 * One-shot HMAC-SHA224 with an already prepared key.
 *
 * @key:      key previously set up (see hmac_sha224_preparekey())
 * @data:     message bytes
 * @data_len: number of bytes at @data
 * @out:      receives the SHA224_DIGEST_SIZE-byte MAC
 */
void hmac_sha224(const struct hmac_sha224_key *key,
                 const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE])
{
        struct hmac_sha224_ctx mac;

        hmac_sha224_init(&mac, key);
        hmac_sha224_update(&mac, data, data_len);
        /* The final step also wipes the on-stack context's state. */
        hmac_sha224_final(&mac, out);
}
EXPORT_SYMBOL_GPL(hmac_sha224);

/*
 * One-shot HMAC-SHA256 with an already prepared key.
 *
 * @key:      key previously set up (see hmac_sha256_preparekey())
 * @data:     message bytes
 * @data_len: number of bytes at @data
 * @out:      receives the SHA256_DIGEST_SIZE-byte MAC
 */
void hmac_sha256(const struct hmac_sha256_key *key,
                 const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE])
{
        struct hmac_sha256_ctx mac;

        hmac_sha256_init(&mac, key);
        hmac_sha256_update(&mac, data, data_len);
        /* The final step also wipes the on-stack context's state. */
        hmac_sha256_final(&mac, out);
}
EXPORT_SYMBOL_GPL(hmac_sha256);

/*
 * One-shot HMAC-SHA224 straight from a raw key, with no separate key
 * preparation step.
 *
 * @raw_key:     key bytes
 * @raw_key_len: number of bytes at @raw_key
 * @data:        message bytes
 * @data_len:    number of bytes at @data
 * @out:         receives the SHA224_DIGEST_SIZE-byte MAC
 */
void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
                             const u8 *data, size_t data_len,
                             u8 out[SHA224_DIGEST_SIZE])
{
        struct hmac_sha224_ctx mac;

        hmac_sha224_init_usingrawkey(&mac, raw_key, raw_key_len);
        hmac_sha224_update(&mac, data, data_len);
        /* The final step also wipes the on-stack context's state. */
        hmac_sha224_final(&mac, out);
}
EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey);

/*
 * One-shot HMAC-SHA256 straight from a raw key, with no separate key
 * preparation step.
 *
 * @raw_key:     key bytes
 * @raw_key_len: number of bytes at @raw_key
 * @data:        message bytes
 * @data_len:    number of bytes at @data
 * @out:         receives the SHA256_DIGEST_SIZE-byte MAC
 */
void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
                             const u8 *data, size_t data_len,
                             u8 out[SHA256_DIGEST_SIZE])
{
        struct hmac_sha256_ctx mac;

        hmac_sha256_init_usingrawkey(&mac, raw_key, raw_key_len);
        hmac_sha256_update(&mac, data, data_len);
        /* The final step also wipes the on-stack context's state. */
        hmac_sha256_final(&mac, out);
}
EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
#endif /* !__DISABLE_EXPORTS */

#ifdef sha256_mod_init_arch
/*
 * Initcall, registered with subsys_initcall(), that runs
 * sha256_mod_init_arch().  It is only compiled when sha256_mod_init_arch is
 * defined (see the enclosing #ifdef) and always returns 0.
 */
static int __init sha256_mod_init(void)
{
        sha256_mod_init_arch();
        return 0;
}
subsys_initcall(sha256_mod_init);

/*
 * Intentionally empty exit handler registered with module_exit().
 * NOTE(review): presumably present so that the code can be unloaded when it
 * is built as a module; confirm.
 */
static void __exit sha256_mod_exit(void)
{
}
module_exit(sha256_mod_exit);
#endif

MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions");
MODULE_LICENSE("GPL");








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 

    4 




















































































































































































































































































































































































































































































































































































































































































































































































































    4 



    4 

    4 



    4 

    4 











    4 































































    4 
    4 




















































































































































































































    4 











































































































































































































































































    4 






























































    4 

































































    4 



    4 






    4 





































































































































































































































































































    4 




























    4 
    4 



    4 
    4 



    4 
    4 


    4 








    4 








































    4 














    4 
    4 
    4 

    4 
    4 




























    4 
    4 





































    4 
    4 























    4 
















    4 







    4 














    4 


    4 


























    4 






















    4 

















    4 





    4 

    4 
    4 


    4 

    4 




    4 










    4 













    4 










    4 



    4 


















































































































































































































































































































































































































































































































































































  317 



  315 
  315 
  314 


































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *                Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *                (Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/shmem_fs.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include "swap.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);

/*
 * Tell whether the PTE recorded in vmf->orig_pte was a uffd-wp pte marker,
 * i.e. the faulting entry was wr-protected.
 *
 * Evaluates to false when userfaultfd_wp() reports false for the VMA, or
 * when FAULT_FLAG_ORIG_PTE_VALID is clear in vmf->flags.
 */
static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
{
        /* Short-circuit order matters: orig_pte is only read once valid. */
        return userfaultfd_wp(vmf->vma) &&
               (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) &&
               pte_marker_uffd_wp(vmf->orig_pte);
}

/*
 * Address space randomization level (stacks, mmaps, brk, etc.).
 *
 * The default depends on CONFIG_COMPAT_BRK: when it is set the default is 1
 * and brk is excluded from randomization, as ancient (libc5 based) binaries
 * can segfault; when it is not set the default is 2.
 */
#ifdef CONFIG_COMPAT_BRK
int randomize_va_space __read_mostly = 1;
#else
int randomize_va_space __read_mostly = 2;
#endif

/*
 * Sysctl entries owned by this file. The single entry exposes
 * randomize_va_space as an int with mode 0644, handled by proc_dointvec.
 */
static const struct ctl_table mmu_sysctl_table[] = {
        {
                .procname        = "randomize_va_space",
                .data                = &randomize_va_space,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
};

/*
 * Register mmu_sysctl_table under the "kernel" sysctl directory.
 * Always returns 0.
 */
static int __init init_mm_sysctl(void)
{
        register_sysctl_init("kernel", mmu_sysctl_table);
        return 0;
}

/* Run init_mm_sysctl() as a subsys-level initcall. */
subsys_initcall(init_mm_sysctl);

#ifndef arch_wants_old_prefaulted_pte
/*
 * Fallback used only when arch_wants_old_prefaulted_pte is not already
 * defined (presumably by an architecture header). Always returns false.
 */
static inline bool arch_wants_old_prefaulted_pte(void)
{
        /*
         * Transitioning a PTE from 'old' to 'young' can be expensive on
         * some architectures, even if it's performed in hardware. By
         * default, "false" means prefaulted entries will be 'young'.
         */
        return false;
}
#endif

/*
 * Handler for the "norandmaps" boot parameter: forces randomize_va_space
 * to 0. The option string @s is not used.
 *
 * Returns 1 unconditionally.
 * NOTE(review): presumably this tells the __setup() machinery that the
 * option was consumed - confirm against the __setup() documentation.
 */
static int __init disable_randmaps(char *s)
{
        randomize_va_space = 0;
        return 1;
}
__setup("norandmaps", disable_randmaps);

/* Page frame number of ZERO_PAGE(0); assigned in init_zero_pfn(). */
unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

/*
 * NOTE(review): never assigned in this part of the file; presumably the
 * highest pfn covered by the memmap, set up during memory init - confirm.
 */
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 *
 * Cache the page frame number of ZERO_PAGE(0) in zero_pfn. Registered as
 * an early initcall; always returns 0.
 */
static int __init init_zero_pfn(void)
{
        zero_pfn = page_to_pfn(ZERO_PAGE(0));
        return 0;
}
early_initcall(init_zero_pfn);

/*
 * Emit the rss_stat trace event for counter @member of @mm.
 *
 * NOTE(review): presumably kept out of line so that callers do not need the
 * tracepoint header themselves - confirm.
 */
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
        trace_rss_stat(mm, member);
}

/*
 * Detach the PTE page table referenced by @pmd and pass it to the
 * mmu_gather @tlb for freeing, then drop the mm's PTE table count.
 *
 * Note: only the page table is released here, not the actual pages
 * themselves. Those have been handled earlier, when all the memory
 * regions were unmapped.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                           unsigned long addr)
{
        /* Fetch the table before the entry that references it is wiped. */
        pgtable_t pte_table = pmd_pgtable(*pmd);

        pmd_clear(pmd);
        pte_free_tlb(tlb, pte_table, addr);
        mm_dec_nr_ptes(tlb->mm);
}

/*
 * Free the PTE tables below every populated PMD entry in [@addr, @end),
 * then free the PMD table itself unless it may still be needed outside
 * the [@floor, @ceiling) window.
 *
 * The PMD table is kept when the PUD-aligned start of the range lies below
 * @floor, or when @end reaches past @ceiling rounded down to a PUD
 * boundary. A @ceiling of 0 stands for the top of the address space, which
 * is why the comparison is done on "end - 1" and "ceiling - 1".
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long start = addr;
        pmd_t *pmd = pmd_offset(pud, addr);
        unsigned long next;

        do {
                next = pmd_addr_end(addr, end);
                if (!pmd_none_or_clear_bad(pmd))
                        free_pte_range(tlb, pmd, addr);
        } while (pmd++, addr = next, addr != end);

        /* Does the PMD table extend below the window? */
        start &= PUD_MASK;
        if (start < floor)
                return;

        /* A non-zero ceiling that rounds down to 0 leaves no room at all. */
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }

        /* Does the PMD table extend above the window? */
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
        mm_dec_nr_pmds(tlb->mm);
}

/*
 * Run free_pmd_range() on every populated PUD entry in [@addr, @end), then
 * free the PUD table itself unless it may still be needed outside the
 * [@floor, @ceiling) window.
 *
 * The PUD table is kept when the P4D-aligned start of the range lies below
 * @floor, or when @end reaches past @ceiling rounded down to a P4D
 * boundary. A @ceiling of 0 stands for the top of the address space, which
 * is why the comparison is done on "end - 1" and "ceiling - 1".
 */
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long start = addr;
        pud_t *pud = pud_offset(p4d, addr);
        unsigned long next;

        do {
                next = pud_addr_end(addr, end);
                if (!pud_none_or_clear_bad(pud))
                        free_pmd_range(tlb, pud, addr, next, floor, ceiling);
        } while (pud++, addr = next, addr != end);

        /* Does the PUD table extend below the window? */
        start &= P4D_MASK;
        if (start < floor)
                return;

        /* A non-zero ceiling that rounds down to 0 leaves no room at all. */
        if (ceiling) {
                ceiling &= P4D_MASK;
                if (!ceiling)
                        return;
        }

        /* Does the PUD table extend above the window? */
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(p4d, start);
        p4d_clear(p4d);
        pud_free_tlb(tlb, pud, start);
        mm_dec_nr_puds(tlb->mm);
}

/*
 * Run free_pud_range() on every populated P4D entry in [@addr, @end), then
 * free the P4D table itself unless it may still be needed outside the
 * [@floor, @ceiling) window.
 *
 * The P4D table is kept when the PGDIR-aligned start of the range lies
 * below @floor, or when @end reaches past @ceiling rounded down to a PGDIR
 * boundary. A @ceiling of 0 stands for the top of the address space, which
 * is why the comparison is done on "end - 1" and "ceiling - 1".
 */
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long start = addr;
        p4d_t *p4d = p4d_offset(pgd, addr);
        unsigned long next;

        do {
                next = p4d_addr_end(addr, end);
                if (!p4d_none_or_clear_bad(p4d))
                        free_pud_range(tlb, p4d, addr, next, floor, ceiling);
        } while (p4d++, addr = next, addr != end);

        /* Does the P4D table extend below the window? */
        start &= PGDIR_MASK;
        if (start < floor)
                return;

        /* A non-zero ceiling that rounds down to 0 leaves no room at all. */
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }

        /* Does the P4D table extend above the window? */
        if (end - 1 > ceiling - 1)
                return;

        p4d = p4d_offset(pgd, start);
        pgd_clear(pgd);
        p4d_free_tlb(tlb, p4d, start);
}

/**
 * free_pgd_range - Unmap and free page tables in the range
 * @tlb: the mmu_gather containing pending TLB flush info
 * @addr: virtual address start
 * @end: virtual address end
 * @floor: lowest address boundary; 0 refers to the bottom of the address
 *         space
 * @ceiling: highest address boundary; 0 refers to the top of the address
 *           space
 *
 * This function tears down all user-level page tables in the
 * specified virtual address range [@addr..@end). It is part of
 * the memory unmap flow.
 *
 * The range is first trimmed to PMD granularity so that it stays inside
 * [@floor, @ceiling); if nothing is left the function returns without
 * touching any page table.
 */
void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * The next few lines have given us lots of grief...
         *
         * Why are we testing PMD* at this top level?  Because often
         * there will be no work to do at all, and we'd prefer not to
         * go all the way down to the bottom just to discover that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we must
         * be careful to reject "the opposite 0" before it confuses the
         * subsequent tests.  But what about where end is brought down
         * by PMD_SIZE below? no, end can't go down to 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        /* Round addr down to a PMD boundary; step up if that undershoots floor. */
        addr &= PMD_MASK;
        if (addr < floor) {
                addr += PMD_SIZE;
                /* Wrapped round to 0: nothing left to free. */
                if (!addr)
                        return;
        }
        /* Round a non-zero ceiling down to a PMD boundary. */
        if (ceiling) {
                ceiling &= PMD_MASK;
                if (!ceiling)
                        return;
        }
        /* Pull end back by one PMD if it reaches past the ceiling. */
        if (end - 1 > ceiling - 1)
                end -= PMD_SIZE;
        /* The trimmed range is empty. */
        if (addr > end - 1)
                return;
        /*
         * We add page table cache pages with PAGE_SIZE,
         * (see pte_free_tlb()), flush the tlb if we need
         */
        tlb_change_page_size(tlb, PAGE_SIZE);
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

/*
 * free_pgtables - unlink a run of VMAs and free the page tables under them
 * @tlb: mmu_gather handed down to free_pgd_range()
 * @mas: maple state used to fetch the VMAs that follow @vma
 * @vma: first VMA to process
 * @floor: lowest address boundary, handed to free_pgd_range()
 * @ceiling: highest address boundary; also bounds the VMA lookup
 * @mm_wr_locked: when true, vma_start_write() is called on every VMA
 *                before it is unlinked
 *
 * Each VMA is unlinked via unlink_anon_vmas() and the unlink_file_vma
 * batch helpers before its page tables are freed. VMAs that start within
 * PMD_SIZE of the end of the previous one are gathered so that a single
 * free_pgd_range() call covers the whole group.
 */
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked)
{
        struct unlink_vma_file_batch vb;

        /* NOTE(review): tlb_free_vmas() is opaque from here - confirm its effect. */
        tlb_free_vmas(tlb);

        do {
                /* Start of the group; vma itself advances as VMAs are gathered. */
                unsigned long addr = vma->vm_start;
                struct vm_area_struct *next;

                /*
                 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
                 * be 0.  This will underflow and is okay.
                 */
                next = mas_find(mas, ceiling - 1);
                /* A zero entry in the tree is treated as "no further VMA". */
                if (unlikely(xa_is_zero(next)))
                        next = NULL;

                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
                if (mm_wr_locked)
                        vma_start_write(vma);
                unlink_anon_vmas(vma);

                unlink_file_vma_batch_init(&vb);
                unlink_file_vma_batch_add(&vb, vma);

                /*
                 * Optimization: gather nearby vmas into one call down
                 */
                while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
                        vma = next;
                        next = mas_find(mas, ceiling - 1);
                        if (unlikely(xa_is_zero(next)))
                                next = NULL;
                        if (mm_wr_locked)
                                vma_start_write(vma);
                        unlink_anon_vmas(vma);
                        unlink_file_vma_batch_add(&vb, vma);
                }
                unlink_file_vma_batch_final(&vb);

                /*
                 * Free from the start of the group to the end of its last
                 * VMA; the next VMA's start (or the caller's ceiling when
                 * there is none) caps what may be freed.
                 */
                free_pgd_range(tlb, addr, vma->vm_end,
                        floor, next ? next->vm_start : ceiling);
                vma = next;
        } while (vma);
}

/*
 * pmd_install - install a preallocated PTE page table into an empty pmd
 * @mm: mm the page table belongs to
 * @pmd: pmd entry to populate
 * @pte: in/out pointer to the preallocated PTE page table
 *
 * Under the lock returned by pmd_lock(), populates @pmd with *@pte if @pmd
 * is still none. On success the mm's PTE table count is bumped and *@pte
 * is set to NULL. If @pmd was already populated, *@pte is left untouched,
 * so the caller can tell the table was not consumed.
 */
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
        spinlock_t *ptl = pmd_lock(mm, pmd);

        if (likely(pmd_none(*pmd))) {        /* Has another populated it ? */
                mm_inc_nr_ptes(mm);
                /*
                 * Ensure all pte setup (eg. pte page lock and page clearing) are
                 * visible before the pte is made visible to other CPUs by being
                 * put into page tables.
                 *
                 * The other side of the story is the pointer chasing in the page
                 * table walking code (when walking the page table without locking;
                 * ie. most of the time). Fortunately, these data accesses consist
                 * of a chain of data-dependent loads, meaning most CPUs (alpha
                 * being the notable exception) will already guarantee loads are
                 * seen in-order. See the alpha page table accessors for the
                 * smp_rmb() barriers in page table walking code.
                 */
                smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
                pmd_populate(mm, pmd, *pte);
                /* Signal to the caller that the table has been consumed. */
                *pte = NULL;
        }
        spin_unlock(ptl);
}

/*
 * Allocate a PTE table for @mm and hand it to pmd_install() for @pmd.
 *
 * If the table is still held here after the install attempt, it is freed
 * again. Returns 0 in both of those cases, or -ENOMEM when no table could be
 * allocated.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t table;

        table = pte_alloc_one(mm);
        if (!table)
                return -ENOMEM;

        pmd_install(mm, pmd, &table);
        /* A table that is still non-NULL was not installed: give it back. */
        if (table)
                pte_free(mm, table);

        return 0;
}

/*
 * Allocate a kernel PTE table and install it at @pmd on behalf of init_mm.
 *
 * The install happens under init_mm.page_table_lock and only if @pmd is still
 * none; otherwise the freshly allocated table is freed again.
 *
 * Returns 0 on success, or -ENOMEM if the table could not be allocated.
 */
int __pte_alloc_kernel(pmd_t *pmd)
{
        pte_t *table;

        table = pte_alloc_one_kernel(&init_mm);
        if (!table)
                return -ENOMEM;

        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_none(*pmd))) {
                /* Nobody beat us to it; see comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, table);
                table = NULL;
        }
        spin_unlock(&init_mm.page_table_lock);

        /* Still holding the table means it was not installed. */
        if (table)
                pte_free_kernel(&init_mm, table);

        return 0;
}

/* Zero all NR_MM_COUNTERS entries of the rss delta array @rss. */
static inline void init_rss_vec(int *rss)
{
        memset(rss, 0, NR_MM_COUNTERS * sizeof(*rss));
}

/* Apply every non-zero delta in @rss to the matching counter of @mm. */
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
        int member;

        for (member = 0; member < NR_MM_COUNTERS; member++) {
                /* Skip counters that did not change. */
                if (!rss[member])
                        continue;
                add_mm_counter(mm, member, rss[member]);
        }
}

/*
 * Rate limiter for "bad page map" reports.
 *
 * Returns true when the caller should stay silent, false when it may print.
 * The bookkeeping lives in function-local statics; this function itself takes
 * no lock around them.
 */
static bool is_bad_page_map_ratelimited(void)
{
        static unsigned long window_end;
        static unsigned long shown;
        static unsigned long suppressed;

        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
         */
        if (shown == 60) {
                /* Burst used up and the window is still open: suppress. */
                if (time_before(jiffies, window_end)) {
                        suppressed++;
                        return true;
                }
                /* Window expired: say how much was dropped and start over. */
                if (suppressed) {
                        pr_alert("BUG: Bad page map: %lu messages suppressed\n",
                                 suppressed);
                        suppressed = 0;
                }
                shown = 0;
        }

        /* The first report of a window arms the one-minute deadline. */
        if (shown == 0)
                window_end = jiffies + 60 * HZ;
        shown++;

        return false;
}

/*
 * Print the raw page table entry values found on the way down to @addr in
 * @mm's page tables, one pr_alert() line in total.
 *
 * The walk visits pgd, p4d, pud and pmd in that order. It stops at the first
 * level whose entry is not present or is a leaf, printing only the levels
 * read up to and including that one. The PTE level is never read here.
 */
static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
{
        unsigned long long pgdv, p4dv, pudv, pmdv;
        p4d_t p4d, *p4dp;
        pud_t pud, *pudp;
        pmd_t pmd, *pmdp;
        pgd_t *pgdp;

        /*
         * Although this looks like a fully lockless pgtable walk, it is not:
         * see locking requirements for print_bad_page_map().
         */
        pgdp = pgd_offset(mm, addr);
        pgdv = pgd_val(*pgdp);

        /* Cannot descend below a non-present or leaf pgd. */
        if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
                pr_alert("pgd:%08llx\n", pgdv);
                return;
        }

        p4dp = p4d_offset(pgdp, addr);
        p4d = p4dp_get(p4dp);
        p4dv = p4d_val(p4d);

        /* Cannot descend below a non-present or leaf p4d. */
        if (!p4d_present(p4d) || p4d_leaf(p4d)) {
                pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
                return;
        }

        pudp = pud_offset(p4dp, addr);
        pud = pudp_get(pudp);
        pudv = pud_val(pud);

        /* Cannot descend below a non-present or leaf pud. */
        if (!pud_present(pud) || pud_leaf(pud)) {
                pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
                return;
        }

        pmdp = pmd_offset(pudp, addr);
        pmd = pmdp_get(pmdp);
        pmdv = pmd_val(pmd);

        /*
         * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
         * because the table should already be mapped by the caller and
         * doing another map would be bad. print_bad_page_map() should
         * already take care of printing the PTE.
         */
        pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
                 p4dv, pudv, pmdv);
}

/*
 * This function is called to print an error when a bad page table entry (e.g.,
 * corrupted page table entry) is found. For example, we might have a
 * PFN-mapped pte in a region that doesn't allow it.
 *
 * @vma:   VMA in which the bad entry was found.
 * @addr:  virtual address the entry maps.
 * @entry: raw value of the bad entry, printed as-is.
 * @page:  page to dump with dump_page(), or NULL to skip the page dump.
 * @level: page table level of @entry, used only to label the output.
 *
 * Output is rate limited via is_bad_page_map_ratelimited(). When a report is
 * printed, the stack is dumped and the kernel is tainted with TAINT_BAD_PAGE.
 *
 * The calling function must still handle the error.
 *
 * This function must be called during a proper page table walk, as it will
 * re-walk the page table to dump information: the caller MUST prevent page
 * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
 * page table lock.
 */
static void print_bad_page_map(struct vm_area_struct *vma,
                unsigned long addr, unsigned long long entry, struct page *page,
                enum pgtable_level level)
{
        struct address_space *mapping;
        pgoff_t index;

        if (is_bad_page_map_ratelimited())
                return;

        /* Anonymous VMAs have no file and therefore no mapping to report. */
        mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
        index = linear_page_index(vma, addr);

        /*
         * NOTE(review): unlike the other pr_alert() calls in this function,
         * this format string has no trailing newline - confirm that this is
         * intended.
         */
        pr_alert("BUG: Bad page map in process %s  %s:%08llx", current->comm,
                 pgtable_level_to_str(level), entry);
        __print_bad_page_map_pgtable(vma->vm_mm, addr);
        if (page)
                dump_page(page, "bad page map");
        pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
                 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
        /* Name the handlers involved; each is NULL if its owner is absent. */
        pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
                 vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->fault : NULL,
                 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
                 vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
                 mapping ? mapping->a_ops->read_folio : NULL);
        dump_stack();
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/* Report a bad PTE: print_bad_page_map() with the PTE value and PTE level. */
#define print_bad_pte(vma, addr, pte, page) \
        print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)

/**
 * __vm_normal_page() - Get the "struct page" associated with a page table entry.
 * @vma: The VMA mapping the page table entry.
 * @addr: The address where the page table entry is mapped.
 * @pfn: The PFN stored in the page table entry.
 * @special: Whether the page table entry is marked "special".
 * @entry: The page table entry value for error reporting purposes only.
 * @level: The page table level for error reporting purposes only.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page and
 * are ordinarily refcounted.
 *
 * Page mappings of the shared zero folios are always considered "special", as
 * they are not ordinarily refcounted: neither the refcount nor the mapcount
 * of these folios is adjusted when mapping them into user page tables.
 * Selected page table walkers (such as GUP) can still identify mappings of the
 * shared zero folios and work with the underlying "struct page".
 *
 * There are 2 broad cases. Firstly, an architecture may define a "special"
 * page table entry bit, such as pte_special(), in which case this function is
 * trivial. Secondly, an architecture may not have a spare page table
 * entry bit, which requires a more complicated scheme, described below.
 *
 * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
 * page table entries that actually map "normal" pages: however, that page
 * cannot be looked up through the PFN stored in the page table entry, but
 * instead will be looked up through vm_ops->find_normal_page(). So far, this
 * only applies to PTEs.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *        pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true, except the shared zero
 * folios) are refcounted and considered normal pages by the VM.
 *
 * The disadvantage is that pages are refcounted (which can be slower and
 * simply not an option for some PFNMAP users). The advantage is that we
 * don't have to follow the strict linearity rule of PFNMAP mappings in
 * order to support COWable mappings.
 *
 * A PFN above highest_memmap_pfn, or a "special" entry that none of the rules
 * above explain, is reported through print_bad_page_map() and NULL is
 * returned.
 *
 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 *           NULL if this is a "special" mapping.
 */
static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
                unsigned long addr, unsigned long pfn, bool special,
                unsigned long long entry, enum pgtable_level level)
{
        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
                if (unlikely(special)) {
#ifdef CONFIG_FIND_NORMAL_PAGE
                        /* Let the VMA resolve the page instead of the PFN. */
                        if (vma->vm_ops && vma->vm_ops->find_normal_page)
                                return vma->vm_ops->find_normal_page(vma, addr);
#endif /* CONFIG_FIND_NORMAL_PAGE */
                        if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                                return NULL;
                        if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
                                return NULL;

                        /* "Special" without any known reason: report it. */
                        print_bad_page_map(vma, addr, entry, NULL, level);
                        return NULL;
                }
                /*
                 * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
                 * mappings (incl. shared zero folios) are marked accordingly.
                 */
        } else {
                /* No "special" bit available: infer it from the VMA. */
                if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
                        if (vma->vm_flags & VM_MIXEDMAP) {
                                /* If it has a "struct page", it's "normal". */
                                if (!pfn_valid(pfn))
                                        return NULL;
                        } else {
                                unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

                                /* Only CoW'ed anon folios are "normal". */
                                if (pfn == vma->vm_pgoff + off)
                                        return NULL;
                                if (!is_cow_mapping(vma->vm_flags))
                                        return NULL;
                        }
                }

                if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
                        return NULL;
        }

        if (unlikely(pfn > highest_memmap_pfn)) {
                /* Corrupted page table entry. */
                print_bad_page_map(vma, addr, entry, NULL, level);
                return NULL;
        }
        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * For example, VDSO mappings can cause them to exist.
         */
        VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
        return pfn_to_page(pfn);
}

/**
 * vm_normal_page() - Look up the "struct page" mapped by a PTE
 * @vma: The VMA mapping the @pte.
 * @addr: The address where the @pte is mapped.
 * @pte: The PTE.
 *
 * PTE-level front end for __vm_normal_page(); see that function for what
 * makes a mapping "normal" or "special".
 *
 * Return: The "struct page" for a "normal" mapping, NULL for a "special"
 *           mapping.
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        const unsigned long pfn = pte_pfn(pte);

        return __vm_normal_page(vma, addr, pfn, pte_special(pte), pte_val(pte),
                                PGTABLE_LEVEL_PTE);
}

/**
 * vm_normal_folio() - Look up the "struct folio" mapped by a PTE
 * @vma: The VMA mapping the @pte.
 * @addr: The address where the @pte is mapped.
 * @pte: The PTE.
 *
 * Folio flavour of vm_normal_page(): the page found there, if any, is
 * converted with page_folio(). See __vm_normal_page() for what makes a
 * mapping "normal" or "special".
 *
 * Return: The "struct folio" for a "normal" mapping, NULL for a "special"
 *           mapping.
 */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        struct page *page = vm_normal_page(vma, addr, pte);

        return page ? page_folio(page) : NULL;
}

#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
/**
 * vm_normal_page_pmd() - Look up the "struct page" mapped by a PMD
 * @vma: The VMA mapping the @pmd.
 * @addr: The address where the @pmd is mapped.
 * @pmd: The PMD.
 *
 * PMD-level front end for __vm_normal_page(); see that function for what
 * makes a mapping "normal" or "special".
 *
 * Return: The "struct page" for a "normal" mapping, NULL for a "special"
 *           mapping.
 */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd)
{
        const unsigned long pfn = pmd_pfn(pmd);

        return __vm_normal_page(vma, addr, pfn, pmd_special(pmd), pmd_val(pmd),
                                PGTABLE_LEVEL_PMD);
}

/**
 * vm_normal_folio_pmd() - Look up the "struct folio" mapped by a PMD
 * @vma: The VMA mapping the @pmd.
 * @addr: The address where the @pmd is mapped.
 * @pmd: The PMD.
 *
 * Folio flavour of vm_normal_page_pmd(): the page found there, if any, is
 * converted with page_folio(). See __vm_normal_page() for what makes a
 * mapping "normal" or "special".
 *
 * Return: The "struct folio" for a "normal" mapping, NULL for a "special"
 *           mapping.
 */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t pmd)
{
        struct page *page = vm_normal_page_pmd(vma, addr, pmd);

        return page ? page_folio(page) : NULL;
}

/**
 * vm_normal_page_pud() - Look up the "struct page" mapped by a PUD
 * @vma: The VMA mapping the @pud.
 * @addr: The address where the @pud is mapped.
 * @pud: The PUD.
 *
 * PUD-level front end for __vm_normal_page(); see that function for what
 * makes a mapping "normal" or "special".
 *
 * Return: The "struct page" for a "normal" mapping, NULL for a "special"
 *           mapping.
 */
struct page *vm_normal_page_pud(struct vm_area_struct *vma,
                unsigned long addr, pud_t pud)
{
        const unsigned long pfn = pud_pfn(pud);

        return __vm_normal_page(vma, addr, pfn, pud_special(pud), pud_val(pud),
                                PGTABLE_LEVEL_PUD);
}
#endif

/**
 * restore_exclusive_pte - Restore a device-exclusive entry
 * @vma: VMA covering @address
 * @folio: the mapped folio
 * @page: the mapped folio page
 * @address: the virtual address
 * @ptep: pte pointer into the locked page table mapping the folio page
 * @orig_pte: pte value at @ptep
 *
 * Restore a device-exclusive non-swap entry to an ordinary present pte.
 *
 * The folio and the page table must be locked, and MMU notifiers must have
 * been called to invalidate any (exclusive) device mappings.
 *
 * Locking the folio makes sure that anybody who just converted the pte to
 * a device-exclusive entry can map it into the device to make forward
 * progress without others converting it back until the folio was unlocked.
 *
 * If the folio lock ever becomes an issue, we can stop relying on the folio
 * lock; it might make some scenarios with heavy thrashing less likely to
 * make forward progress, but these scenarios might not be valid use cases.
 *
 * Note that the folio lock does not protect against all cases of concurrent
 * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
 * must use MMU notifiers to sync against any concurrent changes.
 */
static void restore_exclusive_pte(struct vm_area_struct *vma,
                struct folio *folio, struct page *page, unsigned long address,
                pte_t *ptep, pte_t orig_pte)
{
        pte_t pte;

        VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);

        /* Build the new pte from the VMA's current protection, marked old. */
        pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
        /* Carry the soft-dirty and uffd-wp bits over from the old entry. */
        if (pte_swp_soft_dirty(orig_pte))
                pte = pte_mksoft_dirty(pte);

        if (pte_swp_uffd_wp(orig_pte))
                pte = pte_mkuffd_wp(pte);

        /*
         * Map writable only if the VMA is writable and
         * can_change_pte_writable() agrees for the pte built so far; a dirty
         * folio then also gets a dirty pte.
         */
        if ((vma->vm_flags & VM_WRITE) &&
            can_change_pte_writable(vma, address, pte)) {
                if (folio_test_dirty(folio))
                        pte = pte_mkdirty(pte);
                pte = pte_mkwrite(pte, vma);
        }
        set_pte_at(vma->vm_mm, address, ptep, pte);

        /*
         * No need to invalidate - it was non-present before. However
         * secondary CPUs may have mappings that need invalidating.
         */
        update_mmu_cache(vma, address, ptep);
}

/*
 * Attempt to restore a device-exclusive pte without sleeping.
 *
 * The folio behind @orig_pte is only trylocked. If the lock is obtained, the
 * entry is restored via restore_exclusive_pte(), the folio is unlocked again
 * and 0 is returned. Otherwise nothing is changed and -EBUSY is returned.
 */
static int try_restore_exclusive_pte(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
        struct folio *folio;
        struct page *page;

        page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte));
        folio = page_folio(page);

        if (!folio_trylock(folio))
                return -EBUSY;

        restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
        folio_unlock(folio);

        return 0;
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

/*
 * Copy one non-present pte (swap, migration, device-private, device-exclusive
 * or pte-marker entry) from @src_pte to @dst_pte at @addr, adjusting the
 * source entry where sharing requires it and accounting in @rss.
 *
 * Returns:
 *   0        the entry was handled (for pte markers, possibly without
 *            writing anything to @dst_pte);
 *   -EIO     swap_duplicate() failed for a swap entry;
 *   -EBUSY   a device-exclusive entry could not be restored right now;
 *   -ENOENT  a device-exclusive entry was restored in the source; nothing was
 *            written to @dst_pte (the comment in that branch says it is then
 *            to be copied as for a present pte).
 *
 * NOTE(review): the return type is unsigned long although negative errno
 * values are returned - confirm that every caller stores the result in a
 * signed type.
 */
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
        vm_flags_t vm_flags = dst_vma->vm_flags;
        pte_t orig_pte = ptep_get(src_pte);
        pte_t pte = orig_pte;
        struct folio *folio;
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(orig_pte);

        if (likely(!non_swap_entry(entry))) {
                /* Real swap entry: take another reference on the swap slot. */
                if (swap_duplicate(entry) < 0)
                        return -EIO;

                /* make sure dst_mm is on swapoff's mmlist. */
                if (unlikely(list_empty(&dst_mm->mmlist))) {
                        spin_lock(&mmlist_lock);
                        /* Recheck under the lock before linking. */
                        if (list_empty(&dst_mm->mmlist))
                                list_add(&dst_mm->mmlist,
                                                &src_mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
                /* Mark the swap entry as shared. */
                if (pte_swp_exclusive(orig_pte)) {
                        pte = pte_swp_clear_exclusive(orig_pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
                folio = pfn_swap_entry_folio(entry);

                rss[mm_counter(folio)]++;

                if (!is_readable_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
                        /*
                         * COW mappings require pages in both parent and child
                         * to be set to read. A previously exclusive entry is
                         * now shared.
                         */
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        /* Re-apply the bits lost by rebuilding the pte. */
                        if (pte_swp_soft_dirty(orig_pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_private_entry(entry)) {
                page = pfn_swap_entry_to_page(entry);
                folio = page_folio(page);

                /*
                 * Update rss count even for unaddressable pages, as
                 * they should be treated just like normal pages in this
                 * respect.
                 *
                 * We will likely want to have some new rss counters
                 * for unaddressable pages, at some point. But for now
                 * keep things as they are.
                 */
                folio_get(folio);
                rss[mm_counter(folio)]++;
                /* Cannot fail as these pages cannot get pinned. */
                folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);

                /*
                 * We do not preserve soft-dirty information, because so
                 * far, checkpoint/restore is the only feature that
                 * requires that. And checkpoint/restore does not work
                 * when a device driver is involved (you cannot easily
                 * save and restore device driver state).
                 */
                if (is_writable_device_private_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
                        entry = make_readable_device_private_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_exclusive_entry(entry)) {
                /*
                 * Make device exclusive entries present by restoring the
                 * original entry then copying as for a present pte. Device
                 * exclusive entries currently only support private writable
                 * (ie. COW) mappings.
                 */
                VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
                if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
                        return -EBUSY;
                /* Restored in the source; nothing was written to dst_pte. */
                return -ENOENT;
        } else if (is_pte_marker_entry(entry)) {
                pte_marker marker = copy_pte_marker(entry, dst_vma);

                /* A zero marker means there is nothing to install. */
                if (marker)
                        set_pte_at(dst_mm, addr, dst_pte,
                                   make_pte_marker(marker));
                return 0;
        }
        /* Drop the uffd-wp bit unless dst_vma has userfaultfd-wp enabled. */
        if (!userfaultfd_wp(dst_vma))
                pte = pte_swp_clear_uffd_wp(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Copy a present and normal page.
 *
 * NOTE! The usual case is that this isn't required;
 * instead, the caller can just increase the page refcount
 * and re-use the pte the traditional way.
 *
 * And if we need a pre-allocated page but don't yet have
 * one, return a negative error to let the preallocation
 * code know so that it can do so outside the page table
 * lock.
 *
 * On success, *@prealloc is consumed (set to NULL), the new folio is mapped
 * at @dst_pte and rss[MM_ANONPAGES] is incremented.
 *
 * Returns 0 on success, -EAGAIN if *@prealloc is NULL, or -EHWPOISON if
 * copying the page contents failed (*@prealloc is left untouched then).
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                  struct folio **prealloc, struct page *page)
{
        struct folio *new_folio;
        pte_t pte;

        new_folio = *prealloc;
        if (!new_folio)
                return -EAGAIN;

        /*
         * We have a prealloc page, all good!  Take it
         * over and copy the page & arm it.
         */

        if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma))
                return -EHWPOISON;

        /* From here on the folio belongs to the child's mapping. */
        *prealloc = NULL;
        __folio_mark_uptodate(new_folio);
        folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE);
        folio_add_lru_vma(new_folio, dst_vma);
        rss[MM_ANONPAGES]++;

        /* All done, just insert the new page copy in the child */
        pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot);
        pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
        if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
                /* Uffd-wp needs to be delivered to dest pte as well */
                pte = pte_mkuffd_wp(pte);
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Install @nr consecutive present ptes, starting with @pte, at @dst_pte.
 *
 * For a COW mapping with a writable @pte, the @nr source ptes are
 * write-protected as well. The ptes written to the destination are marked
 * old, are marked clean if the source VMA is shared, and lose the uffd-wp bit
 * unless the destination VMA has userfaultfd-wp enabled.
 */
static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
                pte_t pte, unsigned long addr, int nr)
{
        /* COW mapping: write-protect the pte in both processes. */
        if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
                wrprotect_ptes(src_vma->vm_mm, addr, src_pte, nr);
                pte = pte_wrprotect(pte);
        }

        /* Shared mapping: the child starts out with a clean pte. */
        if (src_vma->vm_flags & VM_SHARED)
                pte = pte_mkclean(pte);

        pte = pte_mkold(pte);
        if (!userfaultfd_wp(dst_vma))
                pte = pte_clear_uffd_wp(pte);

        set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
}

/*
 * Copy one present PTE, trying to batch-process subsequent PTEs that map
 * consecutive pages of the same folio by copying them as well.
 *
 * PTEs for which vm_normal_page() finds no page are copied as-is, without
 * touching any refcount, rmap or rss counter.
 *
 * Returns -EAGAIN if one preallocated page is required to copy the next PTE,
 * or any other error returned by copy_present_page() (such as -EHWPOISON).
 * Otherwise, returns the number of copied PTEs (at least 1).
 */
static inline int
copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
                 int max_nr, int *rss, struct folio **prealloc)
{
        fpb_t flags = FPB_MERGE_WRITE;
        struct page *page;
        struct folio *folio;
        int err, nr;

        page = vm_normal_page(src_vma, addr, pte);
        if (unlikely(!page))
                goto copy_pte;

        folio = page_folio(page);

        /*
         * If we likely have to copy, just don't bother with batching. Make
         * sure that the common "small folio" case is as fast as possible
         * by keeping the batching logic separate.
         */
        if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
                if (!(src_vma->vm_flags & VM_SHARED))
                        flags |= FPB_RESPECT_DIRTY;
                if (vma_soft_dirty_enabled(src_vma))
                        flags |= FPB_RESPECT_SOFT_DIRTY;

                nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags);
                /* One folio reference per PTE that is about to be copied. */
                folio_ref_add(folio, nr);
                if (folio_test_anon(folio)) {
                        if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
                                                                  nr, dst_vma, src_vma))) {
                                /* Undo the references taken above. */
                                folio_ref_sub(folio, nr);
                                return -EAGAIN;
                        }
                        rss[MM_ANONPAGES] += nr;
                        VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
                } else {
                        folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
                        rss[mm_counter_file(folio)] += nr;
                }
                __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
                                    addr, nr);
                return nr;
        }

        /* Single-PTE path. */
        folio_get(folio);
        if (folio_test_anon(folio)) {
                /*
                 * If this page may have been pinned by the parent process,
                 * copy the page immediately for the child so that we'll always
                 * guarantee the pinned page won't be randomly replaced in the
                 * future.
                 */
                if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
                        /* Page may be pinned, we have to copy. */
                        folio_put(folio);
                        err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                                                addr, rss, prealloc, page);
                        /* A successful copy counts as one PTE handled. */
                        return err ? err : 1;
                }
                rss[MM_ANONPAGES]++;
                VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
        } else {
                folio_dup_file_rmap_pte(folio, page, dst_vma);
                rss[mm_counter_file(folio)]++;
        }

copy_pte:
        __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
        return 1;
}

/*
 * Allocate an order-0 movable folio for @addr in @vma and charge it to the
 * memory cgroup of @src_mm.
 *
 * @need_zero selects the zeroed allocation variant. Returns the folio, or
 * NULL if either the allocation or the charge failed; on a failed charge the
 * folio reference is dropped again before returning.
 */
static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
                struct vm_area_struct *vma, unsigned long addr, bool need_zero)
{
        struct folio *folio;

        if (need_zero)
                folio = vma_alloc_zeroed_movable_folio(vma, addr);
        else
                folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
        if (!folio)
                return NULL;

        if (!mem_cgroup_charge(folio, src_mm, GFP_KERNEL)) {
                folio_throttle_swaprate(folio, GFP_KERNEL);
                return folio;
        }

        /* Charging failed: release the folio we just allocated. */
        folio_put(folio);
        return NULL;
}

/*
 * Copy the PTEs covering [@addr, @end) from the PTE table under @src_pmd
 * into the PTE table under @dst_pmd.
 *
 * The destination PTE table is allocated (if needed), mapped and locked
 * first; the source PTE table is then mapped and its lock taken nested
 * inside the destination lock.  Each source entry is handled as follows:
 *   - none entries are skipped;
 *   - non-present entries go through copy_nonpresent_pte();
 *   - present entries go through copy_present_ptes(), which may process
 *     several entries at once and returns how many it handled.
 *
 * The loop is left early when rescheduling or lock contention is detected,
 * or when a helper returns an error that must be resolved without the locks
 * held.  After the locks are dropped:
 *   - -EIO:       a swap count continuation is allocated for the recorded
 *                 entry, then the copy is retried;
 *   - -EAGAIN:    a folio is preallocated with folio_prealloc(), then the
 *                 copy is retried;
 *   - -EBUSY or -EHWPOISON: returned to the caller as is.
 *
 * Returns 0 once @addr has reached @end, -ENOMEM if an allocation failed,
 * or the -EBUSY / -EHWPOISON value described above.
 */
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
        pmd_t dummy_pmdval;
        pte_t ptent;
        spinlock_t *src_ptl, *dst_ptl;
        int progress, max_nr, ret = 0;
        int rss[NR_MM_COUNTERS];
        swp_entry_t entry = (swp_entry_t){0};
        struct folio *prealloc = NULL;
        int nr;

again:
        /* Each pass starts with fresh progress and rss accounting. */
        progress = 0;
        init_rss_vec(rss);

        /*
         * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
         * error handling here, assume that exclusive mmap_lock on dst and src
         * protects anon from unexpected THP transitions; with shmem and file
         * protected by mmap_lock-less collapse skipping areas with anon_vma
         * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
         * can remove such assumptions later, but this is good enough for now.
         */
        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * We already hold the exclusive mmap_lock, the copy_pte_range() and
         * retract_page_tables() are using vma->anon_vma to be exclusive, so
         * the PTE page is stable, and there is no need to get pmdval and do
         * pmd_same() check.
         */
        src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval,
                                           &src_ptl);
        if (!src_pte) {
                pte_unmap_unlock(dst_pte, dst_ptl);
                /* ret == 0 */
                goto out;
        }
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        /* Remember the mapped bases: the cursors below are advanced. */
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();

        do {
                /* Default step for the `continue' paths below. */
                nr = 1;

                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
                if (progress >= 32) {
                        progress = 0;
                        if (need_resched() ||
                            spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
                ptent = ptep_get(src_pte);
                if (pte_none(ptent)) {
                        progress++;
                        continue;
                }
                if (unlikely(!pte_present(ptent))) {
                        ret = copy_nonpresent_pte(dst_mm, src_mm,
                                                  dst_pte, src_pte,
                                                  dst_vma, src_vma,
                                                  addr, rss);
                        if (ret == -EIO) {
                                /* Saved for add_swap_count_continuation(). */
                                entry = pte_to_swp_entry(ptep_get(src_pte));
                                break;
                        } else if (ret == -EBUSY) {
                                break;
                        } else if (!ret) {
                                progress += 8;
                                continue;
                        }
                        ptent = ptep_get(src_pte);
                        VM_WARN_ON_ONCE(!pte_present(ptent));

                        /*
                         * Device exclusive entry restored, continue by copying
                         * the now present pte.
                         */
                        WARN_ON_ONCE(ret != -ENOENT);
                }
                /* copy_present_ptes() will clear `*prealloc' if consumed */
                max_nr = (end - addr) / PAGE_SIZE;
                ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
                                        ptent, addr, max_nr, rss, &prealloc);
                /*
                 * If we need a pre-allocated page for this pte, drop the
                 * locks, allocate, and try again.
                 * If copy failed due to hwpoison in source page, break out.
                 */
                if (unlikely(ret == -EAGAIN || ret == -EHWPOISON))
                        break;
                if (unlikely(prealloc)) {
                        /*
                         * pre-alloc page cannot be reused by next time so as
                         * to strictly follow mempolicy (e.g., alloc_page_vma()
                         * will allocate page according to address).  This
                         * could only happen if one pinned pte changed.
                         */
                        folio_put(prealloc);
                        prealloc = NULL;
                }
                /* A non-error return is the number of PTEs just copied. */
                nr = ret;
                progress += 8 * nr;
        } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
                 addr != end);

        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_src_pte, src_ptl);
        add_mm_rss_vec(dst_mm, rss);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();

        /* Both PTE locks are dropped here, so GFP_KERNEL allocations are ok. */
        if (ret == -EIO) {
                VM_WARN_ON_ONCE(!entry.val);
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
                        ret = -ENOMEM;
                        goto out;
                }
                entry.val = 0;
        } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) {
                goto out;
        } else if (ret ==  -EAGAIN) {
                prealloc = folio_prealloc(src_mm, src_vma, addr, false);
                /* prealloc is NULL here, so there is nothing to put. */
                if (!prealloc)
                        return -ENOMEM;
        } else if (ret < 0) {
                VM_WARN_ON_ONCE(1);
        }

        /* We've captured and resolved the error. Reset, try again. */
        ret = 0;

        if (addr != end)
                goto again;
out:
        /* Drop a preallocated folio that was never consumed. */
        if (unlikely(prealloc))
                folio_put(prealloc);
        return ret;
}

/*
 * Copy the PMD entries covering [@addr, @end) from @src_pud to @dst_pud.
 *
 * Swap and transparent-huge PMDs are handed to copy_huge_pmd(); when that
 * returns a non-zero value other than -ENOMEM the entry is processed as a
 * regular PTE table instead.  Any non-zero result from copy_pte_range() is
 * reported to the caller as -ENOMEM.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pmd_t *dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
        pmd_t *src_pmd;
        unsigned long next;

        if (!dst_pmd)
                return -ENOMEM;

        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);

                if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
                        int err;

                        VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
                        err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
                                            addr, dst_vma, src_vma);
                        if (!err)
                                continue;
                        if (err == -ENOMEM)
                                return err;
                        /* Otherwise handle it as a PTE table below. */
                }

                if (!pmd_none_or_clear_bad(src_pmd) &&
                    copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pmd++, src_pmd++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the PUD entries covering [@addr, @end) from @src_p4d to @dst_p4d.
 *
 * Transparent-huge PUDs are handed to copy_huge_pud(); when that returns a
 * non-zero value other than -ENOMEM the entry is processed as a regular PMD
 * table instead.  Any non-zero result from copy_pmd_range() is reported to
 * the caller as -ENOMEM.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pud_t *dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
        pud_t *src_pud;
        unsigned long next;

        if (!dst_pud)
                return -ENOMEM;

        src_pud = pud_offset(src_p4d, addr);
        do {
                next = pud_addr_end(addr, end);

                if (pud_trans_huge(*src_pud)) {
                        int err;

                        VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
                        err = copy_huge_pud(dst_mm, src_mm,
                                            dst_pud, src_pud, addr, src_vma);
                        if (!err)
                                continue;
                        if (err == -ENOMEM)
                                return err;
                        /* Otherwise handle it as a PMD table below. */
                }

                if (!pud_none_or_clear_bad(src_pud) &&
                    copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pud++, src_pud++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the P4D entries covering [@addr, @end) from @src_pgd to @dst_pgd,
 * descending into copy_pud_range() for every populated source entry.
 *
 * Returns 0 on success, -ENOMEM if the destination P4D table could not be
 * allocated or copy_pud_range() reported a failure.
 */
static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
               unsigned long end)
{
        p4d_t *dst_p4d = p4d_alloc(dst_vma->vm_mm, dst_pgd, addr);
        p4d_t *src_p4d;
        unsigned long next;

        if (!dst_p4d)
                return -ENOMEM;

        src_p4d = p4d_offset(src_pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                /* Empty (or bad, now cleared) entries have nothing to copy. */
                if (!p4d_none_or_clear_bad(src_p4d) &&
                    copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_p4d++, src_p4d++, addr = next, addr != end);

        return 0;
}

/*
 * Decide whether fork() has to copy the page tables of @src_vma into
 * @dst_vma.  Returning false lets fork() skip the copy and leaves the child
 * to populate the range lazily through page faults.
 *
 * The page tables are copied when any of the following holds:
 *
 *  - @dst_vma has uffd-wp enabled, even if it is file-backed (e.g. shmem):
 *    the page tables then carry uffd-wp protection information that cannot
 *    be retrieved from the page cache and would be lost by skipping;
 *  - @src_vma is a VM_PFNMAP or VM_MIXEDMAP mapping;
 *  - @src_vma has an anon_vma.
 *
 * Otherwise a page fault will fill the ptes correctly.  Fork becomes much
 * lighter when there are big shared or private readonly mappings; the
 * tradeoff is that copy_page_range is more efficient than faulting.
 */
static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        return userfaultfd_wp(dst_vma) ||
               (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) ||
               src_vma->anon_vma;
}

/*
 * Copy the page tables of @src_vma into @dst_vma.
 *
 * Nothing is copied when vma_needs_copy() says the range can be faulted in
 * lazily; hugetlb VMAs are delegated to copy_hugetlb_page_range().
 *
 * Returns 0 on success, -ENOMEM if copying any part of the range failed, or
 * whatever copy_hugetlb_page_range() returns for hugetlb VMAs.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        unsigned long addr = src_vma->vm_start;
        unsigned long end = src_vma->vm_end;
        struct mmu_notifier_range range;
        pgd_t *src_pgd, *dst_pgd;
        unsigned long next;
        int ret = 0;
        bool is_cow;

        if (!vma_needs_copy(dst_vma, src_vma))
                return 0;

        if (is_vm_hugetlb_page(src_vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);

        /*
         * Secondary MMU mappings only have to be invalidated when the ptes
         * of the parent mm may see a permission downgrade, and that only
         * happens for mappings where is_cow_mapping() is true.
         */
        is_cow = is_cow_mapping(src_vma->vm_flags);
        if (is_cow) {
                mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                        0, src_mm, addr, end);
                mmu_notifier_invalidate_range_start(&range);
                /*
                 * The write side does not need preemption disabled: the
                 * read side doesn't spin, it falls back to the mmap_lock.
                 *
                 * The raw seqcount_t write API is used so that lockdep does
                 * not complain about preemptibility.
                 */
                vma_assert_write_locked(src_vma);
                raw_write_seqcount_begin(&src_mm->write_protect_seq);
        }

        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (!pgd_none_or_clear_bad(src_pgd) &&
                    unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
                        ret = -ENOMEM;
                        break;
                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);

        /* Undo the COW bracketing in the reverse order it was set up. */
        if (is_cow) {
                raw_write_seqcount_end(&src_mm->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        }
        return ret;
}

/*
 * Whether COWed (private) pages should be zapped too.
 *
 * Everything is zapped by default, i.e. when no @details were supplied or
 * when @details requests page-table reclaim; otherwise the caller's
 * even_cows setting decides.
 */
static inline bool should_zap_cows(struct zap_details *details)
{
        if (details && !details->reclaim_pt)
                return details->even_cows;

        return true;
}

/*
 * Decide whether @folio should be zapped.
 *
 * If should_zap_cows() allows it, the answer is yes without looking at the
 * folio at all; otherwise only non-anonymous folios are zapped.
 */
static inline bool should_zap_folio(struct zap_details *details,
                                    struct folio *folio)
{
        return should_zap_cows(details) || !folio_test_anon(folio);
}

/*
 * True when @details is supplied and has ZAP_FLAG_DROP_MARKER set in its
 * zap_flags; false otherwise (including when @details is NULL).
 */
static inline bool zap_drop_markers(struct zap_details *details)
{
        return details && (details->zap_flags & ZAP_FLAG_DROP_MARKER);
}

/*
 * Replace the none ptes in a just-zapped range with uffd-wp swap special pte
 * markers when necessary.  Must be called with the pgtable lock held.
 *
 * @nr ptes are visited, starting at @pte / @addr; @pteval is the value that
 * was zapped.  Nothing is installed for anonymous VMAs or when @details
 * asks for markers to be dropped.
 *
 * Returns true if a uffd-wp pte was installed for at least one entry, false
 * otherwise (always false without CONFIG_PTE_MARKER_UFFD_WP).
 */
static inline bool
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte, int nr,
                              struct zap_details *details, pte_t pteval)
{
        bool installed = false;

#ifdef CONFIG_PTE_MARKER_UFFD_WP
        /* Zapping an anonymous mapping always drops everything. */
        if (vma_is_anonymous(vma) || zap_drop_markers(details))
                return false;

        for (;; pte++, addr += PAGE_SIZE) {
                /* The PFN stored in the PTE does not matter here. */
                installed |= pte_install_uffd_wp_if_needed(vma, addr, pte,
                                                           pteval);
                if (--nr == 0)
                        break;
        }
#endif
        return installed;
}

/*
 * Zap @nr consecutive present PTEs, starting at @pte / @addr, that map
 * pages of @folio beginning with @page.
 *
 * For non-anonymous folios the PTEs are cleared while collecting their
 * bits: a dirty result marks the folio dirty and a young result marks it
 * accessed (when the VMA tracks recency).  For anonymous folios the PTEs
 * are simply cleared.  @rss is adjusted by @nr in the matching counter.
 *
 * Outputs:
 *   *@any_skipped  set from zap_install_uffd_wp_if_needed() when the zapped
 *                  pte was uffd write-protected;
 *   *@force_flush  set when rmap removal is delayed for a dirty pte, or when
 *                  __tlb_remove_folio_pages() returns non-zero;
 *   *@force_break  set when __tlb_remove_folio_pages() returns non-zero.
 */
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, struct folio *folio,
                struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
                unsigned long addr, struct zap_details *details, int *rss,
                bool *force_flush, bool *force_break, bool *any_skipped)
{
        struct mm_struct *mm = tlb->mm;
        bool delay_rmap = false;

        if (!folio_test_anon(folio)) {
                /* Re-read while clearing so the dirty/young bits are current. */
                ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                if (pte_dirty(ptent)) {
                        folio_mark_dirty(folio);
                        if (tlb_delay_rmap(tlb)) {
                                delay_rmap = true;
                                *force_flush = true;
                        }
                }
                if (pte_young(ptent) && likely(vma_has_recency(vma)))
                        folio_mark_accessed(folio);
                rss[mm_counter(folio)] -= nr;
        } else {
                /* We don't need up-to-date accessed/dirty bits. */
                clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                rss[MM_ANONPAGES] -= nr;
        }
        /* Checking a single PTE in a batch is sufficient. */
        arch_check_zapped_pte(vma, ptent);
        tlb_remove_tlb_entries(tlb, pte, nr, addr);
        if (unlikely(userfaultfd_pte_wp(vma, ptent)))
                *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
                                                             nr, details, ptent);

        /* With delay_rmap set, rmap removal is not done here. */
        if (!delay_rmap) {
                folio_remove_rmap_ptes(folio, page, nr, vma);

                if (unlikely(folio_mapcount(folio) < 0))
                        print_bad_pte(vma, addr, ptent, page);
        }
        /* A non-zero return asks the caller to flush and stop this pass. */
        if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
                *force_flush = true;
                *force_break = true;
        }
}

/*
 * Zap or skip at least one present PTE.  For large folios, subsequent PTEs
 * mapping consecutive pages of the same folio are batched, up to @max_nr.
 *
 * A PTE without a normal page is cleared directly.  A folio rejected by
 * should_zap_folio() is left in place and *@any_skipped is set.
 *
 * Returns the number of processed (skipped or zapped) PTEs (at least 1).
 */
static inline int zap_present_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
                unsigned int max_nr, unsigned long addr,
                struct zap_details *details, int *rss, bool *force_flush,
                bool *force_break, bool *any_skipped)
{
        struct mm_struct *mm = tlb->mm;
        struct page *page = vm_normal_page(vma, addr, ptent);
        struct folio *folio;
        int batch;

        if (!page) {
                /* Up-to-date accessed/dirty bits are not needed here. */
                ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
                arch_check_zapped_pte(vma, ptent);
                tlb_remove_tlb_entry(tlb, pte, addr);
                if (userfaultfd_pte_wp(vma, ptent))
                        *any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
                                                pte, 1, details, ptent);
                ksm_might_unmap_zero_page(mm, ptent);
                return 1;
        }

        folio = page_folio(page);
        if (unlikely(!should_zap_folio(details, folio))) {
                *any_skipped = true;
                return 1;
        }

        /*
         * The common "small folio" case gets its own call with a constant
         * count so that it stays as fast as possible; the batching logic is
         * kept separate below.
         */
        if (likely(!folio_test_large(folio) || max_nr == 1)) {
                zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1,
                                       addr, details, rss, force_flush,
                                       force_break, any_skipped);
                return 1;
        }

        batch = folio_pte_batch(folio, pte, ptent, max_nr);
        zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, batch,
                               addr, details, rss, force_flush,
                               force_break, any_skipped);
        return batch;
}

/*
 * Zap or skip the non-present PTE at @pte / @addr, whose value is @ptent.
 *
 * The swap entry encoded in @ptent is classified and handled per type.
 * Only genuine swap entries are batched (up to @max_nr consecutive PTEs);
 * every other type is processed one PTE at a time.
 *
 * *@any_skipped is set to true on entry, so each early "return 1" below
 * reports the PTE as skipped.  When the PTE(s) are actually cleared it is
 * overwritten with the result of zap_install_uffd_wp_if_needed().
 *
 * Returns the number of PTEs processed (skipped or zapped), at least 1.
 */
static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
                unsigned int max_nr, unsigned long addr,
                struct zap_details *details, int *rss, bool *any_skipped)
{
        swp_entry_t entry;
        int nr = 1;

        *any_skipped = true;
        entry = pte_to_swp_entry(ptent);
        if (is_device_private_entry(entry) ||
                is_device_exclusive_entry(entry)) {
                struct page *page = pfn_swap_entry_to_page(entry);
                struct folio *folio = page_folio(page);

                if (unlikely(!should_zap_folio(details, folio)))
                        return 1;
                /*
                 * Both device private/exclusive mappings should only
                 * work with anonymous page so far, so we don't need to
                 * consider uffd-wp bit when zap. For more information,
                 * see zap_install_uffd_wp_if_needed().
                 */
                WARN_ON_ONCE(!vma_is_anonymous(vma));
                rss[mm_counter(folio)]--;
                folio_remove_rmap_pte(folio, page, vma);
                folio_put(folio);
        } else if (!non_swap_entry(entry)) {
                /* Genuine swap entries, hence a private anon pages */
                if (!should_zap_cows(details))
                        return 1;

                /* The only case that handles more than one PTE per call. */
                nr = swap_pte_batch(pte, max_nr, ptent);
                rss[MM_SWAPENTS] -= nr;
                free_swap_and_cache_nr(entry, nr);
        } else if (is_migration_entry(entry)) {
                struct folio *folio = pfn_swap_entry_folio(entry);

                if (!should_zap_folio(details, folio))
                        return 1;
                rss[mm_counter(folio)]--;
        } else if (pte_marker_entry_uffd_wp(entry)) {
                /*
                 * For anon: always drop the marker; for file: only
                 * drop the marker if explicitly requested.
                 */
                if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
                        return 1;
        } else if (is_guard_swp_entry(entry)) {
                /*
                 * Ordinary zapping should not remove guard PTE
                 * markers. Only do so if we should remove PTE markers
                 * in general.
                 */
                if (!zap_drop_markers(details))
                        return 1;
        } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
                if (!should_zap_cows(details))
                        return 1;
        } else {
                /* We should have covered all the swap entry types */
                pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
                WARN_ON_ONCE(1);
        }
        /* Reached only when the entry is to be zapped: clear it for real. */
        clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm);
        *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);

        return nr;
}

/*
 * Process one step of a PTE-range zap starting at @pte / @addr.
 *
 * Any run of consecutive none PTEs at the start is skipped first.  If the
 * run reaches @end, only the skipped count is returned.  Otherwise the first
 * populated PTE is handed to zap_present_ptes() or zap_nonpresent_ptes().
 *
 * Returns the total number of PTEs consumed: skipped none entries plus
 * whatever the zap helper processed.
 */
static inline int do_zap_pte_range(struct mmu_gather *tlb,
                                   struct vm_area_struct *vma, pte_t *pte,
                                   unsigned long addr, unsigned long end,
                                   struct zap_details *details, int *rss,
                                   bool *force_flush, bool *force_break,
                                   bool *any_skipped)
{
        pte_t ptent = ptep_get(pte);
        int remaining = (end - addr) / PAGE_SIZE;
        int done = 0;

        if (pte_none(ptent)) {
                /* Count the run of none ptes, stopping at the first other. */
                done = 1;
                while (done < remaining) {
                        ptent = ptep_get(pte + done);
                        if (!pte_none(ptent))
                                break;
                        done++;
                }

                remaining -= done;
                if (!remaining)
                        return done;

                pte += done;
                addr += done * PAGE_SIZE;
        }

        if (!pte_present(ptent))
                return done + zap_nonpresent_ptes(tlb, vma, pte, ptent,
                                                  remaining, addr, details,
                                                  rss, any_skipped);

        return done + zap_present_ptes(tlb, vma, pte, ptent, remaining, addr,
                                       details, rss, force_flush, force_break,
                                       any_skipped);
}

/*
 * Zap the PTEs covering [@addr, @end) in the PTE table under @pmd.
 *
 * The PTE table is mapped and locked, then do_zap_pte_range() is called
 * repeatedly.  The loop is left early when a reschedule is needed or when a
 * helper requests a break; in that case the lock is dropped, pending
 * flushes are performed, and the whole sequence is retried from the address
 * reached so far.
 *
 * If reclaim_pt_is_enabled() allowed it and no entry was skipped, the PTE
 * table itself is freed at the end, either directly (when the pmd entry was
 * cleared under the PTE lock in a single pass) or via try_to_free_pte().
 *
 * Returns the address reached: @end normally, or the current @addr if the
 * PTE table could not be mapped and locked.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        bool force_flush = false, force_break = false;
        struct mm_struct *mm = tlb->mm;
        int rss[NR_MM_COUNTERS];
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
        pmd_t pmdval;
        unsigned long start = addr;
        bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
        bool direct_reclaim = true;
        int nr;

retry:
        tlb_change_page_size(tlb, PAGE_SIZE);
        init_rss_vec(rss);
        start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return addr;

        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
                bool any_skipped = false;

                /* Leaving the loop early rules out the direct reclaim path. */
                if (need_resched()) {
                        direct_reclaim = false;
                        break;
                }

                nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
                                      &force_flush, &force_break, &any_skipped);
                /* A PTE left in place means the table cannot be freed. */
                if (any_skipped)
                        can_reclaim_pt = false;
                if (unlikely(force_break)) {
                        /* The loop's own advance is bypassed by `break'. */
                        addr += nr * PAGE_SIZE;
                        direct_reclaim = false;
                        break;
                }
        } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);

        /*
         * Fast path: try to hold the pmd lock and unmap the PTE page.
         *
         * If the pte lock was released midway (retry case), or if the attempt
         * to hold the pmd lock failed, then we need to recheck all pte entries
         * to ensure they are still none, thereby preventing the pte entries
         * from being repopulated by another thread.
         */
        if (can_reclaim_pt && direct_reclaim && addr == end)
                direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);

        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();

        /* Do the actual TLB flush before dropping ptl */
        if (force_flush) {
                tlb_flush_mmu_tlbonly(tlb);
                tlb_flush_rmaps(tlb, vma);
        }
        pte_unmap_unlock(start_pte, ptl);

        /*
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
         * memory too. Come back again if we didn't do everything.
         */
        if (force_flush)
                tlb_flush_mmu(tlb);

        if (addr != end) {
                cond_resched();
                force_flush = false;
                force_break = false;
                goto retry;
        }

        if (can_reclaim_pt) {
                /* pmdval was filled in by try_get_and_clear_pmd() above. */
                if (direct_reclaim)
                        free_pte(mm, start, tlb, pmdval);
                else
                        try_to_free_pte(mm, pmd, start, tlb);
        }

        return addr;
}

/*
 * Zap the PMD entries covering [@addr, @end) under @pud.
 *
 * Swap and transparent-huge PMDs are either split (when the range covers
 * only part of the huge page) or zapped whole with zap_huge_pmd(); in the
 * split case, or when zap_huge_pmd() returns zero, the entry is processed
 * as a regular PTE table via zap_pte_range().
 *
 * Returns the address reached, i.e. @end.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false);
                        else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
                                addr = next;
                                continue;
                        }
                        /* fall through */
                } else if (details && details->single_folio &&
                           folio_test_pmd_mappable(details->single_folio) &&
                           next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
                        spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
                        /*
                         * Take and drop THP pmd lock so that we cannot return
                         * prematurely, while zap_huge_pmd() has cleared *pmd,
                         * but not yet decremented compound_mapcount().
                         */
                        spin_unlock(ptl);
                }
                if (pmd_none(*pmd)) {
                        addr = next;
                        continue;
                }
                addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
                /*
                 * zap_pte_range() stopped short: cancel out the pmd++ in the
                 * loop condition so the same entry is visited again.
                 */
                if (addr != next)
                        pmd--;
        } while (pmd++, cond_resched(), addr != end);

        return addr;
}

/*
 * Zap the PUD entries covering [@addr, @end) under @p4d.
 *
 * A transparent-huge PUD is zapped whole with zap_huge_pud() when the range
 * covers it entirely, and split otherwise.  Entries that were split, or
 * that zap_huge_pud() did not handle, are processed through
 * zap_pmd_range() unless they turn out to be none/bad.
 *
 * Returns the address reached, i.e. @end.
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pud_t *pud = pud_offset(p4d, addr);
        unsigned long next;

        do {
                bool huge_zapped = false;

                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*pud)) {
                        if (next - addr == HPAGE_PUD_SIZE) {
                                huge_zapped = zap_huge_pud(tlb, vma, pud, addr);
                        } else {
                                mmap_assert_locked(tlb->mm);
                                split_huge_pud(vma, pud, addr);
                        }
                }

                if (!huge_zapped) {
                        /* Empty entries skip the reschedule point below. */
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        next = zap_pmd_range(tlb, vma, pud, addr, next, details);
                }

                cond_resched();
        } while (pud++, addr = next, addr != end);

        return addr;
}

/*
 * Zap the P4D entries covering [@addr, @end) under @pgd, descending into
 * zap_pud_range() for every populated entry.
 *
 * Returns the address reached, i.e. @end.
 */
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        p4d_t *p4d = p4d_offset(pgd, addr);
        unsigned long next;

        do {
                next = p4d_addr_end(addr, end);
                /* The walk resumes from wherever the lower level stopped. */
                if (!p4d_none_or_clear_bad(p4d))
                        next = zap_pud_range(tlb, vma, p4d, addr, next, details);
        } while (p4d++, addr = next, addr != end);

        return addr;
}

/*
 * Zap the page-table entries of @vma covering [@addr, @end).
 *
 * The walk is bracketed by tlb_start_vma() / tlb_end_vma() and descends
 * into zap_p4d_range() for every populated PGD entry.  @addr must be
 * strictly below @end.
 */
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details)
{
        unsigned long next;
        pgd_t *pgd;

        BUG_ON(addr >= end);

        tlb_start_vma(tlb, vma);
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (!pgd_none_or_clear_bad(pgd))
                        next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
        } while (pgd++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
}


/*
 * Unmap the part of [start_addr, end_addr) that intersects @vma.
 *
 * The requested range is clamped to the VMA; nothing is done when the two do
 * not overlap.  File-backed VMAs are reported to uprobe_munmap() first.
 * hugetlb VMAs go through __unmap_hugepage_range(), everything else through
 * unmap_page_range().  @mm_wr_locked is accepted but not used here.
 */
static void unmap_single_vma(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr,
                struct zap_details *details, bool mm_wr_locked)
{
        unsigned long start = max(vma->vm_start, start_addr);
        unsigned long end;
        zap_flags_t zap_flags;

        if (start >= vma->vm_end)
                return;
        end = min(vma->vm_end, end_addr);
        if (end <= vma->vm_start)
                return;

        if (vma->vm_file)
                uprobe_munmap(vma, start, end);

        if (start == end)
                return;

        if (likely(!is_vm_hugetlb_page(vma))) {
                unmap_page_range(tlb, vma, start, end, details);
                return;
        }

        /*
         * It is undesirable to test vma->vm_file as it should be non-null
         * for a valid hugetlb area.  However, vm_file will be NULL in the
         * error cleanup path of mmap_region(): when the hugetlbfs ->mmap
         * method fails, mmap_region() nullifies vma->vm_file before calling
         * this function to clean up.  Since no pte has actually been set
         * up, it is safe to do nothing in this case.
         */
        if (!vma->vm_file)
                return;

        zap_flags = details ? details->zap_flags : 0;
        __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags);
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @mas: the maple state used to find the following vmas
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @tree_end: upper bound of the walk; vmas are looked up to @tree_end - 1
 * @mm_wr_locked: lock flag, handed on to unmap_single_vma()
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between @start_addr and @end_addr will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long tree_end,
                bool mm_wr_locked)
{
        struct zap_details details = {
                .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
        struct mmu_notifier_range range;
        unsigned long zap_start, zap_end;

        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);

        for (;;) {
                /* hugetlb_zap_begin() may adjust the per-vma range. */
                zap_start = start_addr;
                zap_end = end_addr;
                hugetlb_zap_begin(vma, &zap_start, &zap_end);
                unmap_single_vma(tlb, vma, zap_start, zap_end, &details,
                                 mm_wr_locked);
                hugetlb_zap_end(vma, &details);

                vma = mas_find(mas, tree_end - 1);
                if (!vma || unlikely(xa_is_zero(vma)))
                        break;
        }

        mmu_notifier_invalidate_range_end(&range);
}

/**
 * zap_page_range_single_batched - remove user pages in a given range
 * @tlb: pointer to the caller's struct mmu_gather
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to remove
 * @size: number of bytes to remove
 * @details: details of shared cache invalidation
 *
 * @tlb must not be NULL and must belong to the same mm as @vma.  The range
 * must fit into one VMA.  For a hugetlb @vma, @tlb is flushed and
 * re-initialized by this function.
 */
void zap_page_range_single_batched(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
{
        struct mmu_notifier_range range;
        const unsigned long zap_end = address + size;

        VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, zap_end);
        hugetlb_zap_begin(vma, &range.start, &range.end);
        update_hiwater_rss(vma->vm_mm);

        mmu_notifier_invalidate_range_start(&range);
        /*
         * unmap 'address-zap_end' not 'range.start-range.end' as range
         * could have been expanded for hugetlb pmd sharing.
         */
        unmap_single_vma(tlb, vma, address, zap_end, details, false);
        mmu_notifier_invalidate_range_end(&range);

        if (!is_vm_hugetlb_page(vma))
                return;

        /*
         * flush tlb and free resources before hugetlb_zap_end(), to
         * avoid concurrent page faults' allocation failure.
         */
        tlb_finish_mmu(tlb);
        hugetlb_zap_end(vma, details);
        tlb_gather_mmu(tlb, vma->vm_mm);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
{
        struct mmu_gather tlb;

        tlb_gather_mmu(&tlb, vma->vm_mm);
        zap_page_range_single_batched(&tlb, vma, address, size, details);
        tlb_finish_mmu(&tlb);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas; it silently
 * does nothing for any other vma.
 *
 * The entire address range must be fully contained within the vma,
 * otherwise nothing is unmapped either.
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                unsigned long size)
{
        if (!range_in_vma(vma, address, address + size))
                return;
        if (!(vma->vm_flags & VM_PFNMAP))
                return;

        zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/*
 * Descend from the pgd of @mm to the pmd entry covering @addr, using the
 * p4d/pud/pmd *_alloc() helpers on the way down.  Returns NULL as soon as
 * one of those helpers returns NULL.
 */
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        p4d = p4d_alloc(mm, pgd, addr);
        pud = p4d ? pud_alloc(mm, p4d, addr) : NULL;
        pmd = pud ? pmd_alloc(mm, pud, addr) : NULL;
        if (!pmd)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd));
        return pmd;
}

/*
 * Look up (via walk_to_pmd()) the pmd for @addr in @mm and return the result
 * of pte_alloc_map_lock() on it, with the lock stored in @ptl.  Returns NULL
 * when the walk to the pmd fails.
 */
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
{
        pmd_t *pmd;

        pmd = walk_to_pmd(mm, addr);
        return pmd ? pte_alloc_map_lock(mm, pmd, addr, ptl) : NULL;
}

/*
 * Decide whether the shared zeropage may be mapped into @vma.  Must not be
 * called for VM_PFNMAP vmas (warns once if it is).
 */
static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma)
{
        VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP);

        /*
         * Whoever wants to forbid the zeropage after some zeropages
         * might already have been mapped has to scan the page tables and
         * bail out on any zeropages. Zeropages in COW mappings can
         * be unshared using FAULT_FLAG_UNSHARE faults.
         */
        if (mm_forbids_zeropage(vma->vm_mm))
                return false;

        /*
         * Zeropages in COW mappings are common and unproblematic, and so
         * are mappings that do not allow for writable PTEs.
         */
        if (is_cow_mapping(vma->vm_flags) ||
            !(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
                return true;

        /*
         * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could
         * find the shared zeropage and longterm-pin it, which would
         * be problematic as soon as the zeropage gets replaced by a different
         * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would
         * now differ to what GUP looked up. FSDAX is incompatible to
         * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see
         * check_vma_flags).
         */
        if (!vma->vm_ops || !vma->vm_ops->pfn_mkwrite)
                return false;

        return vma_is_fsdax(vma) || (vma->vm_flags & VM_IO);
}

/*
 * Check that @page may be inserted into @vma.
 *
 * Rejects (-EINVAL) folios with a zero reference count, the zero folio when
 * vm_mixed_zeropage_allowed() says no, anonymous folios and pages that have
 * a page type.  For an accepted non-zero folio the dcache is flushed.
 *
 * Return: 0 if the page is acceptable, -EINVAL otherwise.
 */
static int validate_page_before_insert(struct vm_area_struct *vma,
                                       struct page *page)
{
        struct folio *folio = page_folio(page);

        if (!folio_ref_count(folio))
                return -EINVAL;

        /* The zero folio is judged purely by the VMA; no dcache flush. */
        if (unlikely(is_zero_folio(folio)))
                return vm_mixed_zeropage_allowed(vma) ? 0 : -EINVAL;

        if (folio_test_anon(folio) || page_has_type(page))
                return -EINVAL;

        flush_dcache_folio(folio);
        return 0;
}

/*
 * Install @page at @pte, which maps @addr in @vma.  The callers in this file
 * invoke this with the PTE already mapped and locked.
 *
 * If the PTE is already populated:
 *   - without @mkwrite the slot is left alone and -EBUSY is returned;
 *   - with @mkwrite the existing entry must reference the same pfn as @page
 *     (otherwise -EFAULT is returned, with a one-time warning unless the
 *     existing pfn is a zero pfn); the entry is then marked young and passed
 *     through maybe_mkwrite().
 *
 * If the PTE is empty, a new entry is built from @page and @prot.  The zero
 * folio is mapped as a special PTE without taking a reference or touching
 * the mm counter/rmap; any other folio gets a reference, a file counter
 * increment and a file rmap entry.
 *
 * Return: 0 on success, -EBUSY or -EFAULT as described above.
 */
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                                unsigned long addr, struct page *page,
                                pgprot_t prot, bool mkwrite)
{
        struct folio *folio = page_folio(page);
        pte_t pteval = ptep_get(pte);

        if (!pte_none(pteval)) {
                if (!mkwrite)
                        return -EBUSY;

                /* see insert_pfn(). */
                if (pte_pfn(pteval) != page_to_pfn(page)) {
                        WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
                        return -EFAULT;
                }
                pteval = maybe_mkwrite(pteval, vma);
                pteval = pte_mkyoung(pteval);
                if (ptep_set_access_flags(vma, addr, pte, pteval, 1))
                        update_mmu_cache(vma, addr, pte);
                return 0;
        }

        /* Ok, finally just insert the thing.. */
        pteval = mk_pte(page, prot);
        if (unlikely(is_zero_folio(folio))) {
                pteval = pte_mkspecial(pteval);
        } else {
                /* pteval already holds mk_pte(page, prot); just refine it. */
                folio_get(folio);
                if (mkwrite) {
                        pteval = pte_mkyoung(pteval);
                        pteval = maybe_mkwrite(pte_mkdirty(pteval), vma);
                }
                inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
                folio_add_file_rmap_pte(folio, page, vma);
        }
        set_pte_at(vma->vm_mm, addr, pte, pteval);
        return 0;
}

/*
 * Validate @page, then map it at @addr in @vma under the PTE lock.
 *
 * Return: 0 on success, the error from validate_page_before_insert(),
 * -ENOMEM if no locked PTE could be obtained, or the result of
 * insert_page_into_pte_locked().
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page, pgprot_t prot, bool mkwrite)
{
        spinlock_t *ptl;
        pte_t *pte;
        int retval;

        retval = validate_page_before_insert(vma, page);
        if (retval)
                return retval;

        pte = get_locked_pte(vma->vm_mm, addr, &ptl);
        if (!pte)
                return -ENOMEM;

        retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
                                             mkwrite);
        pte_unmap_unlock(pte, ptl);
        return retval;
}

/*
 * Per-page step of a batched insert: validate @page and, if it is
 * acceptable, install it at @pte without mkwrite semantics.
 */
static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        int err = validate_page_before_insert(vma, page);

        if (!err)
                err = insert_page_into_pte_locked(vma, pte, addr, page, prot,
                                                  false);
        return err;
}

/*
 * insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop.
 *
 * Maps pages[0..*num) at consecutive page-sized steps starting at @addr.
 * Work is split per pmd (the "more" loop) and, inside a pmd, into batches of
 * at most 8 PTEs that are handled under a single pte_offset_map_lock().
 *
 * On return *num holds the number of pages that were NOT inserted.
 *
 * Return: 0 on success, -EFAULT if walk_to_pmd() or pte_offset_map_lock()
 * fails, -ENOMEM if pte_alloc() fails, or the error reported for an
 * individual page by insert_page_in_batch_locked().
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num, pgprot_t prot)
{
        pmd_t *pmd = NULL;
        pte_t *start_pte, *pte;
        spinlock_t *pte_lock;
        struct mm_struct *const mm = vma->vm_mm;
        unsigned long curr_page_idx = 0;
        unsigned long remaining_pages_total = *num;
        unsigned long pages_to_write_in_pmd;
        int ret;
more:
        ret = -EFAULT;
        pmd = walk_to_pmd(mm, addr);
        if (!pmd)
                goto out;

        /* Never cross the end of the PTE table that @addr falls into. */
        pages_to_write_in_pmd = min_t(unsigned long,
                remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

        /* Allocate the PTE if necessary; takes PMD lock once only. */
        ret = -ENOMEM;
        if (pte_alloc(mm, pmd))
                goto out;

        while (pages_to_write_in_pmd) {
                int pte_idx = 0;
                const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
                if (!start_pte) {
                        ret = -EFAULT;
                        goto out;
                }
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
                        int err = insert_page_in_batch_locked(vma, pte,
                                addr, pages[curr_page_idx], prot);
                        if (unlikely(err)) {
                                pte_unmap_unlock(start_pte, pte_lock);
                                ret = err;
                                /* Account for the pages this batch did map. */
                                remaining_pages_total -= pte_idx;
                                goto out;
                        }
                        addr += PAGE_SIZE;
                        ++curr_page_idx;
                }
                pte_unmap_unlock(start_pte, pte_lock);
                pages_to_write_in_pmd -= batch_size;
                remaining_pages_total -= batch_size;
        }
        /* Pages left over belong to the next pmd: redo the walk for it. */
        if (remaining_pages_total)
                goto more;
        ret = 0;
out:
        /* Report back how many pages were left unmapped. */
        *num = remaining_pages_total;
        return ret;
}

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 *
 * Return: -EFAULT if the requested range does not lie within @vma,
 * otherwise the result of insert_pages().
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num)
{
        /* Address of the last byte that would be mapped. */
        const unsigned long last_addr = addr + (*num * PAGE_SIZE) - 1;

        if (addr < vma->vm_start)
                return -EFAULT;
        if (last_addr >= vma->vm_end)
                return -EFAULT;

        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                /* Changing vm_flags: the read-trylock must not succeed. */
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vm_flags_set(vma, VM_MIXEDMAP);
        }

        /* Defer page refcount checking till we're about to map that page. */
        return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_pages);

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma. The zeropage is supported in some VMAs,
 * see vm_mixed_zeropage_allowed().
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page)
{
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vm_flags_set(vma, VM_MIXEDMAP);
        }
        return insert_page(vma, addr, page, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 * The zeropage is supported in some VMAs, see
 * vm_mixed_zeropage_allowed().
 *
 * Pages are inserted one by one with vm_insert_page(), starting at
 * vma->vm_start; the first failure is returned immediately and pages
 * inserted before it are left in place.
 *
 * Return: 0 on success and error code otherwise (-ENXIO if @offset or the
 * vma size does not fit the @num pages of the object).
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num, unsigned long offset)
{
        unsigned long count = vma_pages(vma);
        unsigned long uaddr = vma->vm_start;
        /* Same width as @count/@offset: no signed/unsigned mix, no overflow. */
        unsigned long i;
        int ret;

        /* Fail if the user requested offset is beyond the end of the object */
        if (offset >= num)
                return -ENXIO;

        /* Fail if the user requested size exceeds available object size */
        if (count > num - offset)
                return -ENXIO;

        for (i = 0; i < count; i++) {
                ret = vm_insert_page(vma, uaddr, pages[offset + i]);
                if (ret < 0)
                        return ret;
                uaddr += PAGE_SIZE;
        }

        return 0;
}

/**
 * vm_map_pages - maps range of kernel pages starts with non zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        /* Start at the page offset recorded in the vma. */
        const unsigned long first_page = vma->vm_pgoff;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map range of kernel pages starts with zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for the drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        /* vma->vm_pgoff is deliberately ignored: always start at page 0. */
        const unsigned long first_page = 0;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages_zero);

/*
 * Map @pfn as a special PTE at @addr in @vma.
 *
 * An empty PTE is filled with a new special entry (made young/dirty and
 * passed through maybe_mkwrite() when @mkwrite is set).  A populated PTE is
 * only touched for @mkwrite, and only if it already maps @pfn.
 *
 * Return: VM_FAULT_OOM if no locked PTE could be obtained, VM_FAULT_NOPAGE
 * in every other case.
 */
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t prot, bool mkwrite)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pte_t *pte, entry;

        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                return VM_FAULT_OOM;

        entry = ptep_get(pte);
        if (pte_none(entry)) {
                /* Ok, finally just insert the thing.. */
                entry = pte_mkspecial(pfn_pte(pfn, prot));

                if (mkwrite) {
                        entry = pte_mkyoung(entry);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                }

                set_pte_at(mm, addr, pte, entry);
                update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
        } else if (mkwrite) {
                /*
                 * For read faults on private mappings the PFN passed
                 * in may not match the PFN we have mapped if the
                 * mapped PFN is a writeable COW page.  In the mkwrite
                 * case we are creating a writable PTE for a shared
                 * mapping and we expect the PFNs to match. If they
                 * don't match, we are likely racing with block
                 * allocation and mapping invalidation so just skip the
                 * update.
                 */
                if (pte_pfn(entry) != pfn) {
                        WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
                } else {
                        entry = pte_mkyoung(entry);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (ptep_set_access_flags(vma, addr, pte, entry, 1)) {
                                update_mmu_cache(vma, addr, pte);
                        }
                }
        }

        pte_unmap_unlock(pte, ptl);
        return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * pgprot typically only differs from @vma->vm_page_prot when drivers set
 * caching- and encryption bits different than those of @vma->vm_page_prot,
 * because the caching- or encryption mode may not be known at mmap() time.
 *
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 *
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot)
{
        /*
         * Technically, architectures with pte_special can avoid all these
         * restrictions (same for remap_pfn_range).  However we would like
         * consistency in testing and feature parity among all, so we should
         * try to keep these invariants in place for everybody.
         */

        /* Exactly one of VM_PFNMAP / VM_MIXEDMAP has to be set. */
        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        /* No COW for VM_PFNMAP, no valid pfns for VM_MIXEDMAP. */
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
        BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

        /* Out-of-vma addresses and disallowed pfn/prot pairs both SIGBUS. */
        if (addr < vma->vm_start || addr >= vma->vm_end ||
            !pfn_modify_allowed(pfn, pgprot))
                return VM_FAULT_SIGBUS;

        pfnmap_setup_cachemode_pfn(pfn, &pgprot);

        return insert_pfn(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
{
        return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);

/*
 * May @pfn be inserted into @vma by the mixed-map insertion path?
 *
 * A zero pfn is accepted only for non-mkwrite inserts into a vma for which
 * vm_mixed_zeropage_allowed() holds; any other pfn requires VM_MIXEDMAP.
 */
static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn,
                        bool mkwrite)
{
        if (unlikely(is_zero_pfn(pfn)))
                return !mkwrite && vm_mixed_zeropage_allowed(vma);

        /* these checks mirror the abort conditions in vm_normal_page */
        return (vma->vm_flags & VM_MIXEDMAP) != 0;
}

/*
 * Common implementation behind vmf_insert_mixed() and
 * vmf_insert_mixed_mkwrite(): map @pfn at @addr in @vma, going through
 * insert_page() or insert_pfn() depending on pte-special support and
 * pfn_valid().
 *
 * Return: VM_FAULT_SIGBUS if the request is rejected up front or
 * insert_page() fails with anything but -ENOMEM/-EBUSY, VM_FAULT_OOM on
 * -ENOMEM, otherwise VM_FAULT_NOPAGE or whatever insert_pfn() returns.
 */
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
                unsigned long addr, unsigned long pfn, bool mkwrite)
{
        pgprot_t pgprot = vma->vm_page_prot;
        int err;

        if (!vm_mixed_ok(vma, pfn, mkwrite))
                return VM_FAULT_SIGBUS;

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        pfnmap_setup_cachemode_pfn(pfn, &pgprot);

        if (!pfn_modify_allowed(pfn, pgprot))
                return VM_FAULT_SIGBUS;

        /*
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
         * refcount the page if pfn_valid is true (hence insert_page rather
         * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
         * without pte special, it would there be refcounted as a normal page.
         */
        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) || !pfn_valid(pfn))
                return insert_pfn(vma, addr, pfn, pgprot, mkwrite);

        /* From here on we are committed to insert_page(). */
        err = insert_page(vma, addr, pfn_to_page(pfn), pgprot, mkwrite);

        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        /* -EBUSY (PTE already populated) is reported like success. */
        if (err == -EBUSY || err >= 0)
                return VM_FAULT_NOPAGE;

        return VM_FAULT_SIGBUS;
}

/*
 * Insert @page at vmf->address in vmf->vma using the vma's page protection,
 * passing @write on as the mkwrite argument of insert_page().
 *
 * Return: VM_FAULT_SIGBUS if the address lies outside the vma or
 * insert_page() fails with anything but -ENOMEM/-EBUSY, VM_FAULT_OOM on
 * -ENOMEM, VM_FAULT_NOPAGE otherwise.
 */
vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
                        bool write)
{
        struct vm_area_struct *vma = vmf->vma;
        const unsigned long addr = vmf->address;
        int err;

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        err = insert_page(vma, addr, page, vma->vm_page_prot, write);

        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        /* -EBUSY (PTE already populated) is reported like success. */
        if (err == -EBUSY || err >= 0)
                return VM_FAULT_NOPAGE;

        return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn)
{
        return __vm_insert_mixed(vma, addr, pfn, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 * If the insertion of the PTE failed because someone else already added a
 * different entry in the meantime, we treat that as success, as we assume
 * the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, unsigned long pfn)
{
        return __vm_insert_mixed(vma, addr, pfn, true);
}

/*
 * Maps a range of physical memory into the requested pages. The old
 * mappings are removed. Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
/*
 * Fill the PTEs under @pmd for [addr, end) with special entries for
 * consecutive pfns starting at @pfn.  Every PTE in the range must be empty.
 *
 * Return: 0 on success, -ENOMEM if the PTE table could not be allocated and
 * locked, -EACCES as soon as pfn_modify_allowed() rejects a pfn (entries
 * written before that point are left in place).
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        pte_t *first_pte, *pte;
        spinlock_t *ptl;
        int err = 0;

        first_pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!first_pte)
                return -ENOMEM;
        pte = first_pte;

        arch_enter_lazy_mmu_mode();
        for (;;) {
                BUG_ON(!pte_none(ptep_get(pte)));
                if (!pfn_modify_allowed(pfn, prot)) {
                        err = -EACCES;
                        break;
                }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));

                pfn++;
                pte++;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
        }
        arch_leave_lazy_mmu_mode();

        /* Unmap/unlock through the pointer we originally mapped. */
        pte_unmap_unlock(first_pte, ptl);
        return err;
}

/*
 * Remap [addr, end) under @pud one pmd entry at a time via
 * remap_pte_range(), where @pfn is the pfn for the initial @addr.
 *
 * Return: 0 on success, -ENOMEM if the pmd could not be allocated, or the
 * first error from remap_pte_range().
 */
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /* Bias so that pfn_base + (a >> PAGE_SHIFT) is the pfn for a. */
        const unsigned long pfn_base = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        pmd_t *pmd;
        int err;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
        VM_BUG_ON(pmd_trans_huge(*pmd));

        do {
                next = pmd_addr_end(addr, end);
                err = remap_pte_range(mm, pmd, addr, next,
                                      pfn_base + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                pmd++;
                addr = next;
        } while (addr != end);

        return 0;
}

/*
 * Remap [addr, end) under @p4d one pud entry at a time via
 * remap_pmd_range(), where @pfn is the pfn for the initial @addr.
 *
 * Return: 0 on success, -ENOMEM if the pud could not be allocated, or the
 * first error from remap_pmd_range().
 */
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /* Bias so that pfn_base + (a >> PAGE_SHIFT) is the pfn for a. */
        const unsigned long pfn_base = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        pud_t *pud;
        int err;

        pud = pud_alloc(mm, p4d, addr);
        if (!pud)
                return -ENOMEM;

        do {
                next = pud_addr_end(addr, end);
                err = remap_pmd_range(mm, pud, addr, next,
                                      pfn_base + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                pud++;
                addr = next;
        } while (addr != end);

        return 0;
}

/*
 * Remap [addr, end) under @pgd one p4d entry at a time via
 * remap_pud_range(), where @pfn is the pfn for the initial @addr.
 *
 * Return: 0 on success, -ENOMEM if the p4d could not be allocated, or the
 * first error from remap_pud_range().
 */
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /* Bias so that pfn_base + (a >> PAGE_SHIFT) is the pfn for a. */
        const unsigned long pfn_base = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        p4d_t *p4d;
        int err;

        p4d = p4d_alloc(mm, pgd, addr);
        if (!p4d)
                return -ENOMEM;

        do {
                next = p4d_addr_end(addr, end);
                err = remap_pud_range(mm, p4d, addr, next,
                                      pfn_base + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                p4d++;
                addr = next;
        } while (addr != end);

        return 0;
}

/*
 * Core of the pfn remapping: mark @vma as a raw pfn mapping and populate
 * [addr, addr + PAGE_ALIGN(size)) one pgd entry at a time through
 * remap_p4d_range().
 *
 * Returns 0 on success, -EINVAL for an unaligned @addr or for a COW mapping
 * that is not remapped in full, or the first error from remap_p4d_range().
 * Nothing is undone here when a lower level fails part-way.
 */
static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        struct mm_struct *mm = vma->vm_mm;
        const unsigned long end = addr + PAGE_ALIGN(size);
        unsigned long pfn_bias;
        pgd_t *pgd;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
                return -EINVAL;

        /*
         * Copy-on-write is the awkward special case that some programs
         * rely on: the "original", not yet COW'ed pages are recognised by
         * matching them against vma->vm_pgoff, so the remap must span the
         * entire vma and vm_pgoff is repurposed to hold the first pfn.
         * See vm_normal_page() for details.
         */
        if (is_cow_mapping(vma->vm_flags)) {
                if (vma->vm_start != addr || vma->vm_end != end)
                        return -EINVAL;
                vma->vm_pgoff = pfn;
        }

        /*
         * Physically remapped pages are special, so advertise that:
         *   VM_IO         - do not look at these pages, accesses can have
         *                   side effects.
         *   VM_PFNMAP     - the base pages are raw PFN mappings without an
         *                   associated "struct page".
         *   VM_DONTEXPAND - no vma merging and no expanding with mremap().
         *   VM_DONTDUMP   - leave the vma out of core dumps, even when
         *                   VM_IO is turned off.
         */
        vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);

        BUG_ON(addr >= end);

        /* Chosen so that pfn_bias + (va >> PAGE_SHIFT) is the pfn for va. */
        pfn_bias = pfn - (addr >> PAGE_SHIFT);
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);

        for (;;) {
                const unsigned long next = pgd_addr_end(addr, end);
                const int err = remap_p4d_range(mm, pgd, addr, next,
                                pfn_bias + (addr >> PAGE_SHIFT), prot);

                if (err)
                        return err;
                if (next == end)
                        break;

                pgd++;
                addr = next;
        }

        return 0;
}

/*
 * remap_pfn_range() without any pfnmap tracking of its own: the caller must
 * already have validated the caching bits in @prot.
 *
 * Returns 0 on success.  On failure the error from
 * remap_pfn_range_internal() is returned after the range has been zapped.
 */
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        const int ret = remap_pfn_range_internal(vma, addr, pfn, size, prot);

        if (ret) {
                /*
                 * Leaving a partly populated pfn range behind is dangerous:
                 * no page reference counts are held for it, and the caller
                 * may free the pages because of the error.  Tear the
                 * mapping down right away.
                 */
                zap_page_range_single(vma, addr, size, NULL);
        }

        return ret;
}

#ifdef __HAVE_PFNMAP_TRACKING
/*
 * Register the pfn range with pfnmap_track() and allocate a reference
 * counted context that remembers it.
 *
 * Returns the new context, ERR_PTR(-EINVAL) when pfnmap_track() fails, or
 * ERR_PTR(-ENOMEM) when the context cannot be allocated (the range is
 * untracked again in that case).
 */
static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
                unsigned long size, pgprot_t *prot)
{
        struct pfnmap_track_ctx *track;

        if (pfnmap_track(pfn, size, prot))
                return ERR_PTR(-EINVAL);

        track = kmalloc(sizeof(*track), GFP_KERNEL);
        if (unlikely(!track)) {
                /* Nothing will own the tracking, so drop it again. */
                pfnmap_untrack(pfn, size);
                return ERR_PTR(-ENOMEM);
        }

        kref_init(&track->kref);
        track->size = size;
        track->pfn = pfn;

        return track;
}

/*
 * kref release callback for a pfnmap tracking context: untrack the pfn
 * range recorded in the context and free the context itself.
 */
void pfnmap_track_ctx_release(struct kref *ref)
{
        struct pfnmap_track_ctx *track;

        track = container_of(ref, struct pfnmap_track_ctx, kref);
        pfnmap_untrack(track->pfn, track->size);
        kfree(track);
}
#endif /* __HAVE_PFNMAP_TRACKING */

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
#ifdef __HAVE_PFNMAP_TRACKING
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot)
{
        struct pfnmap_track_ctx *track = NULL;
        int ret;

        size = PAGE_ALIGN(size);

        if (addr != vma->vm_start || addr + size != vma->vm_end) {
                /*
                 * Only part of the VMA is covered: no tracking context is
                 * created, just set up the cachemode in the pgprot for the
                 * pfn range.
                 */
                if (pfnmap_setup_cachemode(pfn, size, &prot))
                        return -EINVAL;
        } else {
                /*
                 * The whole VMA is covered: do real tracking and remember
                 * to untrack once the last VMA reference to the tracking
                 * context goes away.  The whole pfn range stays tracked
                 * even across VMA splits and partial unmapping.
                 */
                if (vma->pfnmap_track_ctx)
                        return -EINVAL;
                track = pfnmap_track_ctx_alloc(pfn, size, &prot);
                if (IS_ERR(track))
                        return PTR_ERR(track);
        }

        ret = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
        if (!track)
                return ret;

        /* Hand the context to the VMA on success, drop it on failure. */
        if (ret)
                kref_put(&track->kref, pfnmap_track_ctx_release);
        else
                vma->pfnmap_track_ctx = track;

        return ret;
}

#else
/*
 * Variant built when __HAVE_PFNMAP_TRACKING is not defined: there is no
 * tracking context to manage, so the request is passed unchanged to
 * remap_pfn_range_notrack() and its result is returned.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot)
{
        return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
}
#endif
EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
        unsigned long first_pfn, nr_pages, vma_bytes;

        /* Reject a physical range whose end wraps around. */
        if (start + len < start)
                return -EINVAL;

        /*
         * Mapping something that is not page-aligned is a bad idea, but it
         * has historically been allowed because IO memory may simply have
         * a smaller alignment.  Grow the length by the offset into the
         * first page and round the page count up.
         */
        len += start & ~PAGE_MASK;
        first_pfn = start >> PAGE_SHIFT;
        nr_pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
        if (first_pfn + nr_pages < first_pfn)
                return -EINVAL;

        /* The mapping begins 'vm_pgoff' pages into the area. */
        if (vma->vm_pgoff > nr_pages)
                return -EINVAL;
        first_pfn += vma->vm_pgoff;
        nr_pages -= vma->vm_pgoff;

        /* The remaining pages must be enough to back the whole vma. */
        vma_bytes = vma->vm_end - vma->vm_start;
        if ((vma_bytes >> PAGE_SHIFT) > nr_pages)
                return -EINVAL;

        /* All checks passed, map it. */
        return io_remap_pfn_range(vma, vma->vm_start, first_pfn, vma_bytes,
                                  vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

/*
 * Call @fn on the ptes that cover [addr, end) beneath @pmd.
 *
 * With @create set the pte table is allocated on demand and @fn sees every
 * pte; otherwise the existing table is looked up and @fn only sees ptes
 * that are not pte_none().  For init_mm the kernel lookup/allocation
 * helpers are used and no pte lock is involved; for any other mm the table
 * is mapped and locked for the duration of the walk.
 *
 * Returns -ENOMEM (@create) or -EINVAL (!@create) when no pte pointer could
 * be obtained, the first non-zero value returned by @fn, or 0.
 * PGTBL_PTE_MODIFIED is set in @mask once the pte table has been obtained.
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pte_t *first_pte, *cur;
        spinlock_t *ptl;
        int err = 0;

        if (mm == &init_mm) {
                first_pte = create ?
                        pte_alloc_kernel_track(pmd, addr, mask) :
                        pte_offset_kernel(pmd, addr);
        } else {
                first_pte = create ?
                        pte_alloc_map_lock(mm, pmd, addr, &ptl) :
                        pte_offset_map_lock(mm, pmd, addr, &ptl);
        }
        if (!first_pte)
                return create ? -ENOMEM : -EINVAL;

        arch_enter_lazy_mmu_mode();

        if (fn) {
                for (cur = first_pte; ; cur++) {
                        if (create || !pte_none(ptep_get(cur))) {
                                err = fn(cur, addr, data);
                                if (err)
                                        break;
                        }

                        addr += PAGE_SIZE;
                        if (addr == end)
                                break;
                }
        }
        *mask |= PGTBL_PTE_MODIFIED;

        arch_leave_lazy_mmu_mode();

        /* Only the non-init_mm path mapped and locked the pte table. */
        if (mm != &init_mm)
                pte_unmap_unlock(first_pte, ptl);
        return err;
}

/*
 * Walk the pmd entries that cover [addr, end) beneath @pud and run
 * apply_to_pte_range() on each of them.
 *
 * With @create set the pmd table is allocated on demand; otherwise empty
 * pmd entries are skipped.  A leaf pmd entry is unexpected here: it
 * triggers a one-time warning and -EINVAL.
 *
 * Returns 0, -ENOMEM, -EINVAL or the first error from apply_to_pte_range().
 */
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pmd_t *pmd;
        unsigned long next;
        int err = 0;

        /* The caller must not descend into a leaf pud entry. */
        BUG_ON(pud_leaf(*pud));

        if (create) {
                pmd = pmd_alloc_track(mm, pud, addr, mask);
                if (!pmd)
                        return -ENOMEM;
        } else {
                pmd = pmd_offset(pud, addr);
        }
        do {
                next = pmd_addr_end(addr, end);
                /*
                 * Note: "continue" in a do/while jumps to the loop
                 * condition, so pmd and addr still advance.
                 */
                if (pmd_none(*pmd) && !create)
                        continue;
                if (WARN_ON_ONCE(pmd_leaf(*pmd)))
                        return -EINVAL;
                /* A bad entry is skipped, or cleared first when creating. */
                if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
                        if (!create)
                                continue;
                        pmd_clear_bad(pmd);
                }
                err = apply_to_pte_range(mm, pmd, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}

/*
 * Walk the pud entries that cover [addr, end) beneath @p4d and run
 * apply_to_pmd_range() on each of them.
 *
 * With @create set the pud table is allocated on demand; otherwise empty
 * pud entries are skipped.  A leaf pud entry is unexpected here: it
 * triggers a one-time warning and -EINVAL.
 *
 * Returns 0, -ENOMEM, -EINVAL or the first error from apply_to_pmd_range().
 */
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pud_t *pud;
        unsigned long next;
        int err = 0;

        if (create) {
                pud = pud_alloc_track(mm, p4d, addr, mask);
                if (!pud)
                        return -ENOMEM;
        } else {
                pud = pud_offset(p4d, addr);
        }
        do {
                next = pud_addr_end(addr, end);
                /*
                 * Note: "continue" in a do/while jumps to the loop
                 * condition, so pud and addr still advance.
                 */
                if (pud_none(*pud) && !create)
                        continue;
                if (WARN_ON_ONCE(pud_leaf(*pud)))
                        return -EINVAL;
                /* A bad entry is skipped, or cleared first when creating. */
                if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
                        if (!create)
                                continue;
                        pud_clear_bad(pud);
                }
                err = apply_to_pmd_range(mm, pud, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

/*
 * Walk the p4d entries that cover [addr, end) beneath @pgd and run
 * apply_to_pud_range() on each of them.
 *
 * With @create set the p4d table is allocated on demand; otherwise empty
 * p4d entries are skipped.  A leaf p4d entry is unexpected here: it
 * triggers a one-time warning and -EINVAL.
 *
 * Returns 0, -ENOMEM, -EINVAL or the first error from apply_to_pud_range().
 */
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        p4d_t *p4d;
        unsigned long next;
        int err = 0;

        if (create) {
                p4d = p4d_alloc_track(mm, pgd, addr, mask);
                if (!p4d)
                        return -ENOMEM;
        } else {
                p4d = p4d_offset(pgd, addr);
        }
        do {
                next = p4d_addr_end(addr, end);
                /*
                 * Note: "continue" in a do/while jumps to the loop
                 * condition, so p4d and addr still advance.
                 */
                if (p4d_none(*p4d) && !create)
                        continue;
                if (WARN_ON_ONCE(p4d_leaf(*p4d)))
                        return -EINVAL;
                /* A bad entry is skipped, or cleared first when creating. */
                if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
                        if (!create)
                                continue;
                        p4d_clear_bad(p4d);
                }
                err = apply_to_pud_range(mm, p4d, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

/*
 * Common implementation behind apply_to_page_range() and
 * apply_to_existing_page_range(): walk the pgd entries that cover
 * [addr, addr + size) in @mm and descend through apply_to_p4d_range().
 *
 * @create selects whether missing page-table levels are filled in (true)
 * or skipped (false).
 *
 * Returns 0 on success, -EINVAL for an empty or wrapping range or a leaf
 * pgd entry, or the first error reported by a lower level.
 */
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn,
                                 void *data, bool create)
{
        pgd_t *pgd;
        unsigned long start = addr, next;
        unsigned long end = addr + size;
        /* Accumulates which page-table levels the walk reported as modified. */
        pgtbl_mod_mask mask = 0;
        int err = 0;

        /* Catches size == 0 as well as addr + size wrapping around. */
        if (WARN_ON(addr >= end))
                return -EINVAL;

        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                /*
                 * Note: "continue" in a do/while jumps to the loop
                 * condition, so pgd and addr still advance.
                 */
                if (pgd_none(*pgd) && !create)
                        continue;
                /* Break rather than return so the sync below still runs. */
                if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
                        err = -EINVAL;
                        break;
                }
                /* A bad entry is skipped, or cleared first when creating. */
                if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
                        if (!create)
                                continue;
                        pgd_clear_bad(pgd);
                }
                err = apply_to_p4d_range(mm, pgd, addr, next,
                                         fn, data, create, &mask);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        /* Runs on the error path too, for whatever was modified so far. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, start + size);

        return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 *
 * This is __apply_to_page_range() with create == true; its return value
 * is passed through unchanged.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                        unsigned long size, pte_fn_t fn, void *data)
{
        return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table where it exists.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent.
 *
 * This is __apply_to_page_range() with create == false; its return value
 * is passed through unchanged.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn, void *data)
{
        return __apply_to_page_range(mm, addr, size, fn, data, false);
}

/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct vm_fault *vmf)
{
        int unchanged = 1;

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
        /*
         * A pte wider than a long is the case where the earlier unlocked
         * read may have seen mismatched halves, so compare again under
         * the pte lock.
         */
        if (sizeof(pte_t) > sizeof(unsigned long)) {
                spin_lock(vmf->ptl);
                unchanged = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
                spin_unlock(vmf->ptl);
        }
#endif
        /* The pte is unmapped and forgotten whatever the outcome. */
        pte_unmap(vmf->pte);
        vmf->pte = NULL;

        return unchanged;
}

/*
 * Copy the contents of the faulting page into @dst.  When @src is NULL
 * (the source was a PFN mapping without a "struct page") the data is read
 * back through the user address instead.
 *
 * Return:
 *        0:                copy succeeded (or @dst was zero-filled as a
 *                        last resort)
 *        -EHWPOISON:        copy failed due to hwpoison in source page
 *        -EAGAIN:        copy failed because the PTE could not be mapped or
 *                        no longer matches vmf->orig_pte
 */
static inline int __wp_page_copy_user(struct page *dst, struct page *src,
                                      struct vm_fault *vmf)
{
        int ret;
        void *kaddr;
        void __user *uaddr;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = vmf->address;

        /* Common case: there is a struct page to copy from. */
        if (likely(src)) {
                if (copy_mc_user_highpage(dst, src, addr, vma))
                        return -EHWPOISON;
                return 0;
        }

        /*
         * If the source page was a PFN mapping, we don't have
         * a "struct page" for it. We do a best-effort copy by
         * just copying from the original user address. If that
         * fails, we just zero-fill it. Live with it.
         */
        kaddr = kmap_local_page(dst);
        pagefault_disable();
        uaddr = (void __user *)(addr & PAGE_MASK);

        /*
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
        /* From here on, a non-NULL vmf->pte means the PTL is held. */
        vmf->pte = NULL;
        if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
                pte_t entry;

                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /*
                         * Another thread has already handled the fault,
                         * so only update the local TLB.
                         */
                        if (vmf->pte)
                                update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }

                entry = pte_mkyoung(vmf->orig_pte);
                if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
                        update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
        }

        /*
         * This really shouldn't fail, because the page is there
         * in the page tables. But it might just be unreadable,
         * in which case we just give up and fill the result with
         * zeroes.
         */
        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                /* The copy already ran under the PTL: nothing to retry. */
                if (vmf->pte)
                        goto warn;

                /* Re-validate under PTL if the page is still mapped */
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
                        if (vmf->pte)
                                update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }

                /*
                 * The same page can be mapped back since last copy attempt.
                 * Try to copy again under PTL.
                 */
                if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                        /*
                         * Warn once in case there is some obscure
                         * use-case that gets here, then fall back to a
                         * zero-filled page.
                         */
warn:
                        WARN_ON_ONCE(1);
                        clear_page(kaddr);
                }
        }

        ret = 0;

pte_unlock:
        /* Shared exit: drop the PTL if it was taken, then undo the setup. */
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        pagefault_enable();
        kunmap_local(kaddr);
        flush_dcache_page(dst);

        return ret;
}

/*
 * Pick the gfp mask to use while handling a fault in @vma: the mapping's
 * own mask with __GFP_FS and __GFP_IO added for a file-backed vma,
 * GFP_KERNEL otherwise.
 */
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
        struct file *backing = vma->vm_file;
        /*
         * Special mappings (e.g. VDSO) have no file at all, so a default
         * GFP_KERNEL is faked for them.
         */
        gfp_t gfp = GFP_KERNEL;

        if (backing)
                gfp = mapping_gfp_mask(backing->f_mapping) | __GFP_FS | __GFP_IO;

        return gfp;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 *
 * vmf->flags is replaced by FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE only for the
 * duration of the ->page_mkwrite() call; on every return path the caller
 * sees the flags it passed in.
 *
 * Return:
 *        VM_FAULT_SIGBUS if the vma maps a swapfile;
 *        the ->page_mkwrite() result if it contains VM_FAULT_ERROR or
 *        VM_FAULT_NOPAGE bits;
 *        0 if the folio had to be locked here and has lost its mapping
 *        (the caller should retry);
 *        otherwise the ->page_mkwrite() result with VM_FAULT_LOCKED set and
 *        the folio locked.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
{
        vm_fault_t ret;
        unsigned int old_flags = vmf->flags;

        /*
         * Reject swapfile mappings before touching vmf->flags, so that
         * this early return cannot leave the caller with modified flags.
         */
        if (vmf->vma->vm_file &&
            IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
                return VM_FAULT_SIGBUS;

        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

        ret = vmf->vma->vm_ops->page_mkwrite(vmf);
        /* Restore original flags so that caller is not surprised */
        vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
                /* The handler left the folio unlocked: lock it here. */
                folio_lock(folio);
                if (!folio->mapping) {
                        folio_unlock(folio);
                        return 0; /* retry */
                }
                ret |= VM_FAULT_LOCKED;
        } else {
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        }
        return ret;
}

/*
 * Finish a write fault on a page of a shared file mapping: mark the folio
 * dirty, unlock it and throttle the dirtier.
 *
 * The page must be locked on entry; it is unlocked here.
 *
 * Returns VM_FAULT_COMPLETED when maybe_unlock_mmap_for_io() handed back a
 * file reference (which is dropped here), 0 otherwise.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(vmf->page);
        bool has_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
        struct address_space *mapping;
        struct file *pinned;
        bool newly_dirty;

        newly_dirty = folio_mark_dirty(folio);
        VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);

        /*
         * Sample the address_space before unlocking: truncate may zero
         * folio.mapping once folio_unlock() has run.  The address_space
         * itself stays pinned through vma->vm_file's reference, and the
         * release semantics of folio_unlock() keep the compiler from
         * undoing this copy.
         */
        mapping = folio_raw_mapping(folio);
        folio_unlock(folio);

        if (!has_mkwrite)
                file_update_time(vma->vm_file);

        /*
         * Throttle page dirtying down to writeback speed.
         *
         * mapping can be NULL here, since some device drivers dirty their
         * pages without ever setting page.mapping; there is nothing to
         * throttle against in that case.
         */
        if (!mapping || !(newly_dirty || has_mkwrite))
                return 0;

        /*
         * Drop the mmap_lock before waiting on IO, if we can.  The file
         * keeps the mapping pinned, as explained above.
         */
        pinned = maybe_unlock_mmap_for_io(vmf, NULL);
        balance_dirty_pages_ratelimited(mapping);
        if (!pinned)
                return 0;

        fput(pinned);
        return VM_FAULT_COMPLETED;
}

/*
 * Resolve a write fault on a page that can simply be reused in this vma.
 *
 * That is the case either because the mapping is VM_SHARED or because we
 * hold the last reference to the page.  Either way the existing pte only
 * needs to be made young, dirty and (if the vma permits) writable, plus
 * some book-keeping.  The pte lock is released before returning.
 */
static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        pte_t new_pte;

        VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
        VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));

        if (folio) {
                VM_BUG_ON(folio_test_anon(folio) &&
                          !PageAnonExclusive(vmf->page));
                /*
                 * Reset the folio's cpupid: whatever is recorded there may
                 * belong to a process that is by now completely unrelated.
                 */
                folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
        }

        flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));

        new_pte = pte_mkdirty(pte_mkyoung(vmf->orig_pte));
        new_pte = maybe_mkwrite(new_pte, vma);
        if (ptep_set_access_flags(vma, vmf->address, vmf->pte, new_pte, 1))
                update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
}

/*
 * Decide whether a vm_ops fault-type callback may be invoked for @vmf.
 *
 * A bitflag could be added somewhere, but for now it is known that every
 * vm_ops providing ->map_pages has been audited and does not need the
 * mmap_lock to be held.
 *
 * Returns 0 when the call may go ahead.  Otherwise the per-VMA read lock
 * is dropped and VM_FAULT_RETRY is returned.
 */
static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma->vm_ops->map_pages)
                return 0;
        if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
                return 0;

        vma_end_read(vma);
        return VM_FAULT_RETRY;
}

/**
 * __vmf_anon_prepare - Prepare to handle an anonymous fault.
 * @vmf: The vm_fault descriptor passed from the fault handler.
 *
 * When preparing to insert an anonymous page into a VMA from a
 * fault handler, call this function rather than anon_vma_prepare().
 * If this vma does not already have an associated anon_vma and we are
 * only protected by the per-VMA lock, the caller must retry with the
 * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
 * determine if this VMA can share its anon_vma, and that's not safe to
 * do with only the per-VMA lock held for this VMA.
 *
 * Return: 0 if fault handling can proceed.  Any other value should be
 * returned to the caller.
 */
vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        const unsigned int vma_locked = vmf->flags & FAULT_FLAG_VMA_LOCK;
        vm_fault_t ret;

        /* Nothing to do once the vma has its anon_vma. */
        if (likely(vma->anon_vma))
                return 0;

        /*
         * Under the per-VMA lock alone the mmap_lock must be taken around
         * __anon_vma_prepare(); if it cannot be had without blocking, let
         * the caller retry.
         */
        if (vma_locked && !mmap_read_trylock(vma->vm_mm))
                return VM_FAULT_RETRY;

        ret = __anon_vma_prepare(vma) ? VM_FAULT_OOM : 0;

        if (vma_locked)
                mmap_read_unlock(vma->vm_mm);

        return ret;
}

/*
 * Handle the case of a page which we actually need to copy to a new page,
 * either due to COW or unsharing.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct folio *old_folio = NULL;
        struct folio *new_folio = NULL;
        pte_t entry;
        int page_copied = 0;
        struct mmu_notifier_range range;
        vm_fault_t ret;
        bool pfn_is_zero;

        delayacct_wpcopy_start();

        /* old_folio stays NULL when there is no struct page behind the pte. */
        if (vmf->page)
                old_folio = page_folio(vmf->page);
        ret = vmf_anon_prepare(vmf);
        if (unlikely(ret))
                goto out;

        pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
        new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
        if (!new_folio)
                goto oom;

        /* Nothing is copied when the old pte pointed at the zero pfn. */
        if (!pfn_is_zero) {
                int err;

                err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
                if (err) {
                        /*
                         * COW failed, if the fault was solved by other,
                         * it's fine. If not, userspace would re-fault on
                         * the same address and we will handle the fault
                         * from the second attempt.
                         * The -EHWPOISON case will not be retried.
                         */
                        folio_put(new_folio);
                        if (old_folio)
                                folio_put(old_folio);

                        delayacct_wpcopy_end();
                        return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
                }
                kmsan_copy_page_meta(&new_folio->page, vmf->page);
        }

        __folio_mark_uptodate(new_folio);

        /* The notifier range covers exactly the one faulting page. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Re-check the pte - we dropped the lock
         */
        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                /* The new page is counted as anonymous in either case. */
                if (old_folio) {
                        if (!folio_test_anon(old_folio)) {
                                dec_mm_counter(mm, mm_counter_file(old_folio));
                                inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
                        ksm_might_unmap_zero_page(mm, vmf->orig_pte);
                        inc_mm_counter(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = folio_mk_pte(new_folio, vma->vm_page_prot);
                entry = pte_sw_mkyoung(entry);
                if (unlikely(unshare)) {
                        /* Unsharing: carry over soft-dirty and uffd-wp. */
                        if (pte_soft_dirty(vmf->orig_pte))
                                entry = pte_mksoft_dirty(entry);
                        if (pte_uffd_wp(vmf->orig_pte))
                                entry = pte_mkuffd_wp(entry);
                } else {
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                }

                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry, to keep TLBs on different CPUs in
                 * sync. This code used to set the new PTE then flush TLBs, but
                 * that left a window where the new PTE could be loaded into
                 * some TLBs while the old PTE remains in others.
                 */
                ptep_clear_flush(vma, vmf->address, vmf->pte);
                folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE);
                folio_add_lru_vma(new_folio, vma);
                BUG_ON(unshare && pte_write(entry));
                set_pte_at(mm, vmf->address, vmf->pte, entry);
                update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
                if (old_folio) {
                        /*
                         * Only after switching the pte to the new page may
                         * we remove the mapcount here. Otherwise another
                         * process may come and find the rmap count decremented
                         * before the pte is switched to the new page, and
                         * "reuse" the old page writing into it while our pte
                         * here still points into it and can be read by other
                         * threads.
                         *
                         * The critical issue is to order this
                         * folio_remove_rmap_pte() with the ptep_clear_flush
                         * above. Those stores are ordered by (if nothing else,)
                         * the barrier present in the atomic_add_negative
                         * in folio_remove_rmap_pte();
                         *
                         * Then the TLB flush in ptep_clear_flush ensures that
                         * no process can access the old page before the
                         * decremented mapcount is visible. And the old page
                         * cannot be reused until after the decremented
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
                        folio_remove_rmap_pte(old_folio, vmf->page, vma);
                }

                /*
                 * Free the old page: new_folio is repointed at old_folio so
                 * that the folio_put(new_folio) below drops a reference on
                 * the old folio instead of the freshly installed one.
                 */
                new_folio = old_folio;
                page_copied = 1;
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        } else if (vmf->pte) {
                /* The pte changed under us: the unused copy is put below. */
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }

        mmu_notifier_invalidate_range_end(&range);

        if (new_folio)
                folio_put(new_folio);
        if (old_folio) {
                if (page_copied)
                        free_swap_cache(old_folio);
                folio_put(old_folio);
        }

        delayacct_wpcopy_end();
        return 0;
oom:
        ret = VM_FAULT_OOM;
out:
        /* Early failure: only the old folio reference is dropped here. */
        if (old_folio)
                folio_put(old_folio);

        delayacct_wpcopy_end();
        return ret;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *                          writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 * @folio: the folio of vmf->page
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
 * we acquired PTE lock.
 */
static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
{
        WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
        if (!vmf->pte)
                return VM_FAULT_NOPAGE;
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
         */
        if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return VM_FAULT_NOPAGE;
        }
        wp_page_reuse(vmf, folio);
        return 0;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t status;

        /* No ->pfn_mkwrite() hook to consult: reuse the mapping directly. */
        if (!vma->vm_ops || !vma->vm_ops->pfn_mkwrite) {
                wp_page_reuse(vmf, NULL);
                return 0;
        }

        /* The hook is invoked with the PTE unmapped and unlocked. */
        pte_unmap_unlock(vmf->pte, vmf->ptl);

        status = vmf_can_call_fault(vmf);
        if (status)
                return status;

        vmf->flags |= FAULT_FLAG_MKWRITE;
        status = vma->vm_ops->pfn_mkwrite(vmf);
        if (status & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                return status;

        /* Retake the PTE lock and finish unless the PTE changed meanwhile. */
        return finish_mkwrite_fault(vmf, NULL);
}

/*
 * Handle a write fault on a page-backed PTE of a shared mapping.
 *
 * A folio reference is held for the duration of the call. If the VMA has a
 * ->page_mkwrite() hook, the PTE lock is dropped, the hook is run through
 * do_page_mkwrite(), and finish_mkwrite_fault() retakes the lock to complete
 * the fault. Without a hook the page is reused right away and the folio is
 * locked afterwards. Both paths end in fault_dirty_shared_page().
 */
static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        folio_get(folio);

        if (!vma->vm_ops || !vma->vm_ops->page_mkwrite) {
                wp_page_reuse(vmf, folio);
                folio_lock(folio);
                goto dirty;
        }

        pte_unmap_unlock(vmf->pte, vmf->ptl);

        ret = vmf_can_call_fault(vmf);
        if (ret)
                goto put;

        /* A zero return from do_page_mkwrite() is passed back as-is. */
        ret = do_page_mkwrite(vmf, folio);
        if (unlikely(!ret || (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))))
                goto put;

        ret = finish_mkwrite_fault(vmf, folio);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                folio_unlock(folio);
                goto put;
        }

dirty:
        ret = fault_dirty_shared_page(vmf);
put:
        folio_put(folio);
        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Decide whether a large anonymous folio may be reused on a write-protect
 * fault instead of being copied.
 *
 * Returns true only if the folio is not flagged as shared between MMs
 * (FOLIO_MM_IDS_SHARED_BITNUM clear) and its large mapcount equals its
 * refcount, with both conditions rechecked while holding the large mapcount
 * lock. Returns false in every other case, including when at most one page
 * of the folio is mapped.
 *
 * Side effect: if the folio is in the swapcache and folio_trylock()
 * succeeds, folio_free_swap() is attempted before the checks.
 */
static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
                struct vm_area_struct *vma)
{
        bool exclusive = false;

        /* Let's just free up a large folio if only a single page is mapped. */
        if (folio_large_mapcount(folio) <= 1)
                return false;

        /*
         * The assumption for anonymous folios is that each page can only get
         * mapped once into each MM. The only exception are KSM folios, which
         * are always small.
         *
         * Each taken mapcount must be paired with exactly one taken reference,
         * whereby the refcount must be incremented before the mapcount when
         * mapping a page, and the refcount must be decremented after the
         * mapcount when unmapping a page.
         *
         * If all folio references are from mappings, and all mappings are in
         * the page tables of this MM, then this folio is exclusive to this MM.
         */
        if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
                return false;

        VM_WARN_ON_ONCE(folio_test_ksm(folio));

        if (unlikely(folio_test_swapcache(folio))) {
                /*
                 * Note: freeing up the swapcache will fail if some PTEs are
                 * still swap entries.
                 */
                if (!folio_trylock(folio))
                        return false;
                folio_free_swap(folio);
                folio_unlock(folio);
        }

        /* Lockless pre-check; the same test is repeated under the lock below. */
        if (folio_large_mapcount(folio) != folio_ref_count(folio))
                return false;

        /* Stabilize the mapcount vs. refcount and recheck. */
        folio_lock_large_mapcount(folio);
        VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);

        if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
                goto unlock;
        if (folio_large_mapcount(folio) != folio_ref_count(folio))
                goto unlock;

        /* Sanity checks on the state we are about to declare exclusive. */
        VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
                        folio_mm_id(folio, 1) != vma->vm_mm->mm_id);

        /*
         * Do we need the folio lock? Likely not. If there would have been
         * references from page migration/swapout, we would have detected
         * an additional folio reference and never ended up here.
         */
        exclusive = true;
unlock:
        folio_unlock_large_mapcount(folio);
        return exclusive;
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub. wp_can_reuse_anon_folio() only calls
 * this function behind IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE), so the call
 * is expected to be compiled out in this configuration.
 * NOTE(review): BUILD_BUG() presumably turns any call that survives into a
 * build failure -- confirm against its definition.
 */
static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
                struct vm_area_struct *vma)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Check whether an anonymous folio can be reused for a write fault rather
 * than copied. Large folios (with THP enabled) are delegated to
 * __wp_can_reuse_large_anon_folio(). For small folios, reuse requires that
 * we hold the only reference, verified under the folio lock; on success the
 * anon rmap is moved to @vma.
 */
static bool wp_can_reuse_anon_folio(struct folio *folio,
                                    struct vm_area_struct *vma)
{
        bool reusable = false;

        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
                return __wp_can_reuse_large_anon_folio(folio, vma);

        /*
         * Cheap checks first. The authoritative check happens under the
         * folio lock below; these merely avoid taking the lock and freeing
         * the swapcache when reuse looks hopeless.
         *
         * Note that KSM does not necessarily raise the folio refcount.
         */
        if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
                return false;

        if (!folio_test_lru(folio)) {
                /*
                 * References from remote LRU caches or references to LRU
                 * folios are not easy to detect and handle, so drain.
                 */
                lru_add_drain();
        }

        if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
                return false;
        if (!folio_trylock(folio))
                return false;

        if (folio_test_swapcache(folio))
                folio_free_swap(folio);

        if (!folio_test_ksm(folio) && folio_ref_count(folio) == 1) {
                /*
                 * The folio is locked and the sole reference is the one
                 * from our mapping: claim it for this VMA.
                 */
                folio_move_anon_rmap(folio, vma);
                reusable = true;
        }

        folio_unlock(folio);
        return reusable;
}

/*
 * This routine handles present pages, when
 * * users try to write to a shared page (FAULT_FLAG_WRITE)
 * * GUP wants to take a R/O pin on a possibly shared anonymous page
 *   (FAULT_FLAG_UNSHARE)
 *
 * It is done by copying the page to a new address and decrementing the
 * shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
 * done any necessary COW.
 *
 * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
 * though the page will change only once the write actually happens. This
 * avoids a few races, and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        /* Set for FAULT_FLAG_UNSHARE faults, as opposed to write faults. */
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = NULL;
        pte_t pte;

        /* The userfaultfd write-protect handling is skipped for unshare. */
        if (likely(!unshare)) {
                if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
                        /* Not in async mode: hand the fault to userfaultfd. */
                        if (!userfaultfd_wp_async(vma)) {
                                pte_unmap_unlock(vmf->pte, vmf->ptl);
                                return handle_userfault(vmf, VM_UFFD_WP);
                        }

                        /*
                         * Nothing needed (cache flush, TLB invalidations,
                         * etc.) because we're only removing the uffd-wp bit,
                         * which is completely invisible to the user.
                         */
                        pte = pte_clear_uffd_wp(ptep_get(vmf->pte));

                        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
                        /*
                         * Update this to be prepared for following up CoW
                         * handling
                         */
                        vmf->orig_pte = pte;
                }

                /*
                 * Userfaultfd write-protect can defer flushes. Ensure the TLB
                 * is flushed in this case before copying.
                 */
                if (unlikely(userfaultfd_wp(vmf->vma) &&
                             mm_tlb_flush_pending(vmf->vma->vm_mm)))
                        flush_tlb_page(vmf->vma, vmf->address);
        }

        /* vmf->page may end up NULL, in which case folio stays NULL too. */
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

        if (vmf->page)
                folio = page_folio(vmf->page);

        /*
         * Shared mapping: we are guaranteed to have VM_WRITE and
         * FAULT_FLAG_WRITE set at this point.
         */
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
                 *
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
                if (!vmf->page || is_fsdax_page(vmf->page)) {
                        vmf->page = NULL;
                        return wp_pfn_shared(vmf);
                }
                return wp_page_shared(vmf, folio);
        }

        /*
         * Private mapping: create an exclusive anonymous page copy if reuse
         * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
         *
         * If we encounter a page that is marked exclusive, we must reuse
         * the page without further checks.
         */
        if (folio && folio_test_anon(folio) &&
            (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
                if (!PageAnonExclusive(vmf->page))
                        SetPageAnonExclusive(vmf->page);
                /*
                 * For unshare the PTE itself is left untouched: the page is
                 * exclusive now, so only the PTE lock has to be dropped.
                 */
                if (unlikely(unshare)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return 0;
                }
                wp_page_reuse(vmf, folio);
                return 0;
        }
        /*
         * Ok, we need to copy. Oh, well..
         */
        /* Take a folio reference before the PTE lock is released below. */
        if (folio)
                folio_get(folio);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
        /* Account copy-on-write of KSM folios. */
        if (folio && folio_test_ksm(folio))
                count_vm_event(COW_KSM);
#endif
        return wp_page_copy(vmf);
}

/* Zap the user address range [start_addr, end_addr) of a single VMA. */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
{
        /* Turn the [start, end) pair into the start + size form used below. */
        const unsigned long size = end_addr - start_addr;

        zap_page_range_single(vma, start_addr, size, details);
}

/*
 * Walk every VMA in @root that overlaps the file page range
 * [first_index, last_index] and zap the part of the VMA that maps it.
 */
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
                                            pgoff_t first_index,
                                            pgoff_t last_index,
                                            struct zap_details *details)
{
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root, first_index, last_index) {
                /* File page offsets covered by this VMA (inclusive). */
                pgoff_t vma_first = vma->vm_pgoff;
                pgoff_t vma_last = vma_first + vma_pages(vma) - 1;
                /* Clamp the requested range to what this VMA maps. */
                pgoff_t zap_first = max(first_index, vma_first);
                pgoff_t zap_last = min(last_index, vma_last);
                /* Translate the page offsets back to user addresses. */
                unsigned long start = ((zap_first - vma_first) << PAGE_SHIFT) +
                                      vma->vm_start;
                unsigned long end = ((zap_last - vma_first + 1) << PAGE_SHIFT) +
                                    vma->vm_start;

                unmap_mapping_range_vma(vma, start, end, details);
        }
}

/**
 * unmap_mapping_folio() - Unmap single folio from processes.
 * @folio: The locked folio to be unmapped.
 *
 * Unmap this folio from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a folio, it may find that
 * the page has been remapped again: and then uses unmap_mapping_folio()
 * to unmap it finally.
 */
void unmap_mapping_folio(struct folio *folio)
{
        struct address_space *mapping = folio->mapping;
        struct zap_details details = { };
        pgoff_t        first_index;
        pgoff_t        last_index;

        VM_BUG_ON(!folio_test_locked(folio));

        first_index = folio->index;
        last_index = folio_next_index(folio) - 1;

        details.even_cows = false;
        details.single_folio = folio;
        details.zap_flags = ZAP_FLAG_DROP_MARKER;

        i_mmap_lock_read(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
        i_mmap_unlock_read(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
                pgoff_t nr, bool even_cows)
{
        struct zap_details details = { .even_cows = even_cows };
        pgoff_t first = start;
        pgoff_t last = start + nr - 1;

        /*
         * nr == 0 (unmap to end of file) and any range that wraps both
         * show up as last < first; extend the range to the maximum index.
         */
        if (last < first)
                last = ULONG_MAX;

        i_mmap_lock_read(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first, last,
                                         &details);
        i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows)
{
        /* First page of the hole (rounded down), length in pages (rounded up). */
        pgoff_t first_page = (pgoff_t)(holebegin) >> PAGE_SHIFT;
        pgoff_t nr_pages = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;

        /*
         * When the byte offset type is wider than pgoff_t, the end of the
         * hole may not fit in a page index. If so, unmap everything from
         * first_page up to the largest representable index.
         */
        if (sizeof(holelen) > sizeof(nr_pages)) {
                long long end_page =
                        (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

                if (end_page & ~(long long)ULONG_MAX)
                        nr_pages = ULONG_MAX - first_page + 1;
        }

        unmap_mapping_pages(mapping, first_page, nr_pages, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * Restore a potential device exclusive pte to a working pte entry
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(vmf->page);
        struct mmu_notifier_range range;
        unsigned long start;
        vm_fault_t ret;

        /*
         * The PTL is not held here, so a racing thread may remove the
         * device-exclusive entry and unmap the folio. A reference is
         * therefore required before the folio can be locked. If the folio
         * is already free, the entry must be gone as well. If it was freed
         * and re-allocated in the meantime, the only effect is that we
         * lock and unlock it.
         */
        if (!folio_try_get(folio))
                return 0;

        ret = folio_lock_or_retry(folio, vmf);
        if (ret) {
                folio_put(folio);
                return ret;
        }

        /* Notify for the single page that contains the faulting address. */
        start = vmf->address & PAGE_MASK;
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0,
                                      vma->vm_mm, start, start + PAGE_SIZE,
                                      NULL);
        mmu_notifier_invalidate_range_start(&range);

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
        if (vmf->pte) {
                /* Only restore if the entry is still the one we faulted on. */
                if (likely(pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                        restore_exclusive_pte(vma, folio, vmf->page,
                                              vmf->address, vmf->pte,
                                              vmf->orig_pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }

        folio_unlock(folio);
        folio_put(folio);

        mmu_notifier_invalidate_range_end(&range);
        return 0;
}

/*
 * Decide whether the swapcache entry of @folio should be freed while
 * handling a fault with @fault_flags in @vma.
 */
static inline bool should_try_to_free_swap(struct folio *folio,
                                           struct vm_area_struct *vma,
                                           unsigned int fault_flags)
{
        /* Nothing to free if the folio is not in the swapcache. */
        if (!folio_test_swapcache(folio))
                return false;

        /* Always try under swap pressure or for mlocked memory. */
        if (mem_cgroup_swap_full(folio))
                return true;
        if (vma->vm_flags & VM_LOCKED)
                return true;
        if (folio_test_mlocked(folio))
                return true;

        /*
         * If we want to map a page that's in the swapcache writable, we
         * have to detect via the refcount if we're really the exclusive
         * user. Try freeing the swapcache to get rid of the swapcache
         * reference only in case it's likely that we'll be the exclusive
         * user.
         */
        if (!(fault_flags & FAULT_FLAG_WRITE))
                return false;
        if (folio_test_ksm(folio))
                return false;
        return folio_ref_count(folio) == (1 + folio_nr_pages(folio));
}

/*
 * Clear the PTE marker at the faulting address, provided the PTE still
 * equals vmf->orig_pte once the PTE lock is held. Always returns 0.
 */
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (vmf->pte) {
                /*
                 * Only turn the exact PTE we faulted on into a none PTE.
                 * If it differs, the PTE may have changed under us (e.g.
                 * quickly from PTE_MARKER_UFFD_WP to PTE_MARKER_POISONED),
                 * so checking is_pte_marker() alone would not be enough to
                 * drop it safely; leave it alone and let the fault retry.
                 */
                if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
                        pte_clear(mm, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }

        return 0;
}

/* Dispatch a missing-PTE fault: anonymous VMAs go one way, the rest another. */
static vm_fault_t do_pte_missing(struct vm_fault *vmf)
{
        return vma_is_anonymous(vmf->vma) ? do_anonymous_page(vmf)
                                          : do_fault(vmf);
}

/*
 * This is actually a page-missing access, but with uffd-wp special pte
 * installed.  It means this pte was wr-protected before being unmapped.
 */
static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
{
        vm_fault_t ret;

        if (unlikely(!userfaultfd_wp(vmf->vma))) {
                /*
                 * The region is no longer registered for uffd-wp but a
                 * special PTE was left behind: simply clear it.
                 */
                ret = pte_marker_clear(vmf);
        } else {
                /* Otherwise treat it like any other missing page. */
                ret = do_pte_missing(vmf);
        }

        return ret;
}

/*
 * Handle a fault on a PTE marker entry. The marker bits are checked in
 * priority order: poisoned, guard, then uffd-wp. Anything else is SIGBUS.
 */
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
{
        swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
        unsigned long marker = pte_marker_get(entry);
        /* Result for an empty or unknown marker. */
        vm_fault_t ret = VM_FAULT_SIGBUS;

        /*
         * A PTE marker must never be empty. Should that happen anyway,
         * killing the process along with its mm is the best we can do.
         */
        if (WARN_ON_ONCE(!marker))
                return ret;

        if (marker & PTE_MARKER_POISONED) {
                /* Data corruption takes priority over uffd-wp. */
                ret = VM_FAULT_HWPOISON;
        } else if (marker & PTE_MARKER_GUARD) {
                /* Touching a guard page is always fatal. */
                ret = VM_FAULT_SIGSEGV;
        } else if (pte_marker_entry_uffd_wp(entry)) {
                ret = pte_marker_handle_uffd_wp(vmf);
        }

        return ret;
}

/*
 * Allocate an order-0 folio for the faulting address and charge it for
 * swap-in of the entry in vmf->orig_pte. Returns NULL if either the
 * allocation or the charge fails.
 */
static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *new_folio;
        swp_entry_t entry;

        new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
        if (!new_folio)
                return NULL;

        entry = pte_to_swp_entry(vmf->orig_pte);
        if (!mem_cgroup_swapin_charge_folio(new_folio, vma->vm_mm,
                                            GFP_KERNEL, entry))
                return new_folio;

        /* Charging failed: give the folio back. */
        folio_put(new_folio);
        return NULL;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Check if the PTEs within a range are contiguous swap entries
 * and have consistent swapcache, zeromap.
 */
static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
{
        unsigned long base;
        swp_entry_t first_entry;
        pte_t first_pte;
        int idx;

        /* Index of the faulting page within the naturally aligned range. */
        base = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
        idx = (vmf->address - base) / PAGE_SIZE;

        /*
         * The first PTE of the range must be the faulting swap PTE moved
         * back by idx swap offsets.
         */
        first_pte = ptep_get(ptep);
        if (!pte_same(first_pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
                return false;

        first_entry = pte_to_swp_entry(first_pte);

        /* All nr_pages PTEs must form one batch of swap entries. */
        if (swap_pte_batch(ptep, nr_pages, first_pte) != nr_pages)
                return false;

        /*
         * swap_read_folio() cannot handle a large folio whose pages come
         * from a mix of different backends. Such cases are likely corner
         * cases. Similar checks might be added once zswap supports large
         * folios.
         */
        if (unlikely(swap_zeromap_batch(first_entry, nr_pages, NULL) != nr_pages))
                return false;
        if (unlikely(non_swapcache_batch(first_entry, nr_pages) != nr_pages))
                return false;

        return true;
}

/*
 * Filter @orders down to those usable for swapping in at @addr.
 *
 * Swapping in a THP of nr pages requires its first swap offset to be
 * aligned to nr, as it was when the THP was swapped out. Orders are tried
 * from the highest downwards and dropped until the page index of @addr and
 * @swp_offset agree modulo nr. This filters out most invalid entries.
 *
 * Returns the remaining order bitmask, 0 if no order is suitable.
 */
static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
                                                     unsigned long addr,
                                                     unsigned long orders)
{
        int order;

        for (order = highest_order(orders); orders;
             order = next_order(&orders, order)) {
                int nr = 1 << order;

                if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
                        break;
        }

        return orders;
}

/*
 * Allocate and charge a folio for swapping in the entry in vmf->orig_pte.
 *
 * Large orders are tried first, from the highest suitable order downwards;
 * an order is kept only if can_swapin_thp() accepts the aligned PTE range.
 * If no large order is usable, or every large allocation or charge fails,
 * this falls back to __alloc_swap_folio().
 *
 * Returns the folio, or whatever __alloc_swap_folio() returns on fallback.
 */
static struct folio *alloc_swap_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long orders;
        struct folio *folio;
        unsigned long addr;
        swp_entry_t entry;
        spinlock_t *ptl;
        pte_t *pte;
        gfp_t gfp;
        int order;

        /*
         * If uffd is active for the vma we need per-page fault fidelity to
         * maintain the uffd semantics.
         */
        if (unlikely(userfaultfd_armed(vma)))
                goto fallback;

        /*
         * A large swapped out folio could be partially or fully in zswap. We
         * lack handling for such cases, so fallback to swapping in order-0
         * folio.
         */
        if (!zswap_never_enabled())
                goto fallback;

        entry = pte_to_swp_entry(vmf->orig_pte);
        /*
         * Get a list of all the (large) orders below PMD_ORDER that are enabled
         * and suitable for swapping THP.
         */
        orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
                                          BIT(PMD_ORDER) - 1);
        orders = thp_vma_suitable_orders(vma, vmf->address, orders);
        orders = thp_swap_suitable_orders(swp_offset(entry),
                                          vmf->address, orders);

        if (!orders)
                goto fallback;

        /*
         * Map and lock the PTEs starting at the PMD-aligned address, so that
         * pte + pte_index(addr) below can reach the start of each candidate
         * range.
         */
        pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                  vmf->address & PMD_MASK, &ptl);
        if (unlikely(!pte))
                goto fallback;

        /*
         * For do_swap_page, find the highest order where the aligned range is
         * completely swap entries with contiguous swap offsets.
         */
        order = highest_order(orders);
        while (orders) {
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
                        break;
                order = next_order(&orders, order);
        }

        pte_unmap_unlock(pte, ptl);

        /* Try allocating the highest of the remaining orders. */
        gfp = vma_thp_gfp_mask(vma);
        while (orders) {
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                folio = vma_alloc_folio(gfp, order, vma, addr);
                if (folio) {
                        if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
                                                            gfp, entry))
                                return folio;
                        /* Allocated, but the charge failed: count and release. */
                        count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
                        folio_put(folio);
                }
                /* Counted for allocation failure and charge failure alike. */
                count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
                order = next_order(&orders, order);
        }

fallback:
        return __alloc_swap_folio(vmf);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE variant: no large orders are considered, the
 * swap-in folio always comes from __alloc_swap_folio().
 */
static struct folio *alloc_swap_folio(struct vm_fault *vmf)
{
        return __alloc_swap_folio(vmf);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Wait queue used by do_swap_page(): a task whose swapcache_prepare() call
 * fails on the skip-swapcache path queues itself here around a short
 * uninterruptible timeout, and the queue is woken whenever do_swap_page()
 * calls swapcache_clear().
 */
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 *
 * Overview of the flow below:
 *  1. Non-swap entries (migration, device exclusive/private, hwpoison,
 *     pte marker) are dispatched to their handlers and we leave via "out".
 *  2. Otherwise the swap device is pinned with get_swap_device() and the
 *     folio is looked up in the swap cache; on a miss it is either read
 *     directly into a freshly allocated folio (SWP_SYNCHRONOUS_IO device
 *     and __swap_count() == 1) or brought in via swapin_readahead().
 *  3. The folio is locked, validated, and the PTE is re-checked under the
 *     page table lock before the swap entries are freed and the new
 *     present PTE(s) are installed with set_ptes().
 *
 * Returns a vm_fault_t: 0 or VM_FAULT_MAJOR on success, possibly combined
 * with the result of do_wp_page() for write faults, or one of
 * VM_FAULT_RETRY / VM_FAULT_OOM / VM_FAULT_SIGBUS / VM_FAULT_HWPOISON.
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *swapcache, *folio = NULL;
        DECLARE_WAITQUEUE(wait, current);
        struct page *page;
        struct swap_info_struct *si = NULL;
        rmap_t rmap_flags = RMAP_NONE;
        bool need_clear_cache = false;
        bool exclusive = false;
        swp_entry_t entry;
        pte_t pte;
        vm_fault_t ret = 0;
        void *shadow = NULL;
        int nr_pages;
        unsigned long page_idx;
        unsigned long address;
        pte_t *ptep;

        if (!pte_unmap_same(vmf))
                goto out;

        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
                } else if (is_device_exclusive_entry(entry)) {
                        vmf->page = pfn_swap_entry_to_page(entry);
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
                        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                                /*
                                 * migrate_to_ram is not yet ready to operate
                                 * under VMA lock.
                                 */
                                vma_end_read(vma);
                                ret = VM_FAULT_RETRY;
                                goto out;
                        }

                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (unlikely(!vmf->pte ||
                                     !pte_same(ptep_get(vmf->pte),
                                                        vmf->orig_pte)))
                                goto unlock;

                        /*
                         * Get a page reference while we know the page can't be
                         * freed.
                         */
                        if (trylock_page(vmf->page)) {
                                struct dev_pagemap *pgmap;

                                get_page(vmf->page);
                                pte_unmap_unlock(vmf->pte, vmf->ptl);
                                pgmap = page_pgmap(vmf->page);
                                ret = pgmap->ops->migrate_to_ram(vmf);
                                unlock_page(vmf->page);
                                put_page(vmf->page);
                        } else {
                                /* Page lock is contended: return with ret unchanged. */
                                pte_unmap_unlock(vmf->pte, vmf->ptl);
                        }
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else if (is_pte_marker_entry(entry)) {
                        ret = handle_pte_marker(vmf);
                } else {
                        print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
                        ret = VM_FAULT_SIGBUS;
                }
                goto out;
        }

        /* Prevent swapoff from happening to us. */
        si = get_swap_device(entry);
        if (unlikely(!si))
                goto out;

        folio = swap_cache_get_folio(entry);
        if (folio)
                swap_update_readahead(folio, vma, vmf->address);
        /*
         * swapcache records the folio obtained from the swap cache (or from
         * swapin_readahead() below); it stays NULL on the skip-swapcache path.
         */
        swapcache = folio;

        if (!folio) {
                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /* skip swapcache */
                        folio = alloc_swap_folio(vmf);
                        if (folio) {
                                __folio_set_locked(folio);
                                __folio_set_swapbacked(folio);

                                nr_pages = folio_nr_pages(folio);
                                if (folio_test_large(folio))
                                        entry.val = ALIGN_DOWN(entry.val, nr_pages);
                                /*
                                 * Prevent parallel swapin from proceeding with
                                 * the cache flag. Otherwise, another thread
                                 * may finish swapin first, free the entry, and
                                 * swapout reusing the same entry. It's
                                 * undetectable as pte_same() returns true due
                                 * to entry reuse.
                                 */
                                if (swapcache_prepare(entry, nr_pages)) {
                                        /*
                                         * Relax a bit to prevent rapid
                                         * repeated page faults.
                                         */
                                        add_wait_queue(&swapcache_wq, &wait);
                                        schedule_timeout_uninterruptible(1);
                                        remove_wait_queue(&swapcache_wq, &wait);
                                        /*
                                         * need_clear_cache is still false here,
                                         * so out_page only unlocks and drops
                                         * the folio allocated above.
                                         */
                                        goto out_page;
                                }
                                need_clear_cache = true;

                                memcg1_swapin(entry, nr_pages);

                                shadow = swap_cache_get_shadow(entry);
                                if (shadow)
                                        workingset_refault(folio, shadow);

                                folio_add_lru(folio);

                                /* To provide entry to swap_read_folio() */
                                folio->swap = entry;
                                swap_read_folio(folio, NULL);
                                folio->private = NULL;
                        }
                } else {
                        folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                vmf);
                        swapcache = folio;
                }

                if (!folio) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
                         */
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (likely(vmf->pte &&
                                   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        goto unlock;
                }

                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
        }

        ret |= folio_lock_or_retry(folio, vmf);
        if (ret & VM_FAULT_RETRY)
                goto out_release;

        page = folio_file_page(folio, swp_offset(entry));
        if (swapcache) {
                /*
                 * Make sure folio_free_swap() or swapoff did not release the
                 * swapcache from under us.  The page pin, and pte_same test
                 * below, are not enough to exclude that.  Even if it is still
                 * swapcache, we need to check that the page's swap has not
                 * changed.
                 */
                if (unlikely(!folio_matches_swap_entry(folio, entry)))
                        goto out_page;

                if (unlikely(PageHWPoison(page))) {
                        /*
                         * hwpoisoned dirty swapcache pages are kept for killing
                         * owner processes (which may be unknown at hwpoison time)
                         */
                        ret = VM_FAULT_HWPOISON;
                        goto out_page;
                }

                /*
                 * KSM sometimes has to copy on read faults, for example, if
                 * folio->index of non-ksm folios would be nonlinear inside the
                 * anon VMA -- the ksm flag is lost on actual swapout.
                 */
                folio = ksm_might_need_to_copy(folio, vma, vmf->address);
                if (unlikely(!folio)) {
                        ret = VM_FAULT_OOM;
                        /* Restore folio so the error path releases the swapcache folio. */
                        folio = swapcache;
                        goto out_page;
                } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
                        ret = VM_FAULT_HWPOISON;
                        folio = swapcache;
                        goto out_page;
                }
                /* A KSM copy was made: map the first page of the new folio. */
                if (folio != swapcache)
                        page = folio_page(folio, 0);

                /*
                 * If we want to map a page that's in the swapcache writable, we
                 * have to detect via the refcount if we're really the exclusive
                 * owner. Try removing the extra reference from the local LRU
                 * caches if required.
                 */
                if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
                    !folio_test_ksm(folio) && !folio_test_lru(folio))
                        lru_add_drain();
        }

        folio_throttle_swaprate(folio, GFP_KERNEL);

        /*
         * Back out if somebody else already faulted in this pte.
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                goto out_nomap;

        if (unlikely(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }

        /* allocated large folios for SWP_SYNCHRONOUS_IO */
        if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
                /*
                 * NOTE(review): nr_pages is not assigned in this branch; it is
                 * presumably still the value set on the skip-swapcache path
                 * above -- confirm no other path can reach here.
                 */
                unsigned long nr = folio_nr_pages(folio);
                unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
                unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
                pte_t *folio_ptep = vmf->pte - idx;
                pte_t folio_pte = ptep_get(folio_ptep);

                /*
                 * The whole folio-sized PTE range must still hold the expected
                 * batch of swap entries, otherwise give up on this fault.
                 */
                if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
                    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
                        goto out_nomap;

                page_idx = idx;
                address = folio_start;
                ptep = folio_ptep;
                goto check_folio;
        }

        /* Default: map only the single faulting page. */
        nr_pages = 1;
        page_idx = 0;
        address = vmf->address;
        ptep = vmf->pte;
        if (folio_test_large(folio) && folio_test_swapcache(folio)) {
                int nr = folio_nr_pages(folio);
                unsigned long idx = folio_page_idx(folio, page);
                unsigned long folio_start = address - idx * PAGE_SIZE;
                unsigned long folio_end = folio_start + nr * PAGE_SIZE;
                pte_t *folio_ptep;
                pte_t folio_pte;

                /*
                 * Only map the whole folio when its range lies inside both the
                 * VMA and the current PMD, and every PTE in the range is the
                 * matching swap entry; otherwise keep the single-page mapping.
                 */
                if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
                        goto check_folio;
                if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
                        goto check_folio;

                folio_ptep = vmf->pte - idx;
                folio_pte = ptep_get(folio_ptep);
                if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
                    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
                        goto check_folio;

                page_idx = idx;
                address = folio_start;
                ptep = folio_ptep;
                nr_pages = nr;
                entry = folio->swap;
                page = &folio->page;
        }

check_folio:
        /*
         * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
         * must never point at an anonymous page in the swapcache that is
         * PG_anon_exclusive. Sanity check that this holds and especially, that
         * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
         * check after taking the PT lock and making sure that nobody
         * concurrently faulted in this page and set PG_anon_exclusive.
         */
        BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
        BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));

        /*
         * Check under PT lock (to protect against concurrent fork() sharing
         * the swap entry concurrently) for certainly exclusive pages.
         */
        if (!folio_test_ksm(folio)) {
                exclusive = pte_swp_exclusive(vmf->orig_pte);
                if (folio != swapcache) {
                        /*
                         * We have a fresh page that is not exposed to the
                         * swapcache -> certainly exclusive.
                         */
                        exclusive = true;
                } else if (exclusive && folio_test_writeback(folio) &&
                          data_race(si->flags & SWP_STABLE_WRITES)) {
                        /*
                         * This is tricky: not all swap backends support
                         * concurrent page modifications while under writeback.
                         *
                         * So if we stumble over such a page in the swapcache
                         * we must not set the page exclusive, otherwise we can
                         * map it writable without further checks and modify it
                         * while still under writeback.
                         *
                         * For these problematic swap backends, simply drop the
                         * exclusive marker: this is perfectly fine as we start
                         * writeback only if we fully unmapped the page and
                         * there are no unexpected references on the page after
                         * unmapping succeeded. After fully unmapped, no
                         * further GUP references (FOLL_GET and FOLL_PIN) can
                         * appear, so dropping the exclusive marker and mapping
                         * it only R/O is fine.
                         */
                        exclusive = false;
                }
        }

        /*
         * Some architectures may have to restore extra metadata to the page
         * when reading from swap. This metadata may be indexed by swap entry
         * so this must be called before swap_free().
         */
        arch_swap_restore(folio_swap(entry, folio), folio);

        /*
         * Remove the swap entry and conditionally try to free up the swapcache.
         * We're already holding a reference on the page but haven't mapped it
         * yet.
         */
        swap_free_nr(entry, nr_pages);
        if (should_try_to_free_swap(folio, vma, vmf->flags))
                folio_free_swap(folio);

        /* Account the pages as anonymous memory instead of swap entries. */
        add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
        add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
        /* Build the present PTE, carrying over soft-dirty and uffd-wp bits. */
        pte = mk_pte(page, vma->vm_page_prot);
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
        if (pte_swp_uffd_wp(vmf->orig_pte))
                pte = pte_mkuffd_wp(pte);

        /*
         * Same logic as in do_wp_page(); however, optimize for pages that are
         * certainly not shared either because we just allocated them without
         * exposing them to the swapcache or because the swap entry indicates
         * exclusivity.
         */
        if (!folio_test_ksm(folio) &&
            (exclusive || folio_ref_count(folio) == 1)) {
                if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) &&
                    !pte_needs_soft_dirty_wp(vma, pte)) {
                        pte = pte_mkwrite(pte, vma);
                        if (vmf->flags & FAULT_FLAG_WRITE) {
                                pte = pte_mkdirty(pte);
                                /*
                                 * The write is satisfied by this mapping, so
                                 * the do_wp_page() call below is skipped.
                                 */
                                vmf->flags &= ~FAULT_FLAG_WRITE;
                        }
                }
                rmap_flags |= RMAP_EXCLUSIVE;
        }
        /* One folio reference per mapped page; we already hold one. */
        folio_ref_add(folio, nr_pages - 1);
        flush_icache_pages(vma, page, nr_pages);
        /* orig_pte now describes the PTE installed for the faulting address. */
        vmf->orig_pte = pte_advance_pfn(pte, page_idx);

        /* ksm created a completely new copy */
        if (unlikely(folio != swapcache && swapcache)) {
                folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
                folio_add_lru_vma(folio, vma);
        } else if (!folio_test_anon(folio)) {
                /*
                 * We currently only expect small !anon folios which are either
                 * fully exclusive or fully shared, or new allocated large
                 * folios which are fully exclusive. If we ever get large
                 * folios within swapcache here, we have to be careful.
                 */
                VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
                VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
                folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
        } else {
                folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
                                        rmap_flags);
        }

        VM_BUG_ON(!folio_test_anon(folio) ||
                        (pte_write(pte) && !PageAnonExclusive(page)));
        set_ptes(vma->vm_mm, address, ptep, pte, nr_pages);
        arch_do_swap_page_nr(vma->vm_mm, vma, address,
                        pte, pte, nr_pages);

        folio_unlock(folio);
        if (folio != swapcache && swapcache) {
                /*
                 * Hold the lock to avoid the swap entry to be reused
                 * until we take the PT lock for the pte_same() check
                 * (to avoid false positives from pte_same). For
                 * further safety release the lock after the swap_free
                 * so that the swap count won't change under a
                 * parallel locked swapcache.
                 */
                folio_unlock(swapcache);
                folio_put(swapcache);
        }

        if (vmf->flags & FAULT_FLAG_WRITE) {
                ret |= do_wp_page(vmf);
                /* On error report only the error bits (drops VM_FAULT_MAJOR etc.). */
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
        }

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_range(vmf, vma, address, ptep, nr_pages);
unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        /* Clear the swap cache pin for direct swapin after PTL unlock */
        if (need_clear_cache) {
                swapcache_clear(si, entry, nr_pages);
                if (waitqueue_active(&swapcache_wq))
                        wake_up(&swapcache_wq);
        }
        if (si)
                put_swap_device(si);
        return ret;
        /*
         * Error exits for paths that still hold a folio reference:
         *   out_nomap   - also drops the PT lock if it was taken,
         *   out_page    - unlocks the folio,
         *   out_release - skips the unlock and only drops the reference(s),
         * followed by the same swap-pin and swap-device cleanup as "out".
         */
out_nomap:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
        folio_unlock(folio);
out_release:
        folio_put(folio);
        if (folio != swapcache && swapcache) {
                folio_unlock(swapcache);
                folio_put(swapcache);
        }
        if (need_clear_cache) {
                swapcache_clear(si, entry, nr_pages);
                if (waitqueue_active(&swapcache_wq))
                        wake_up(&swapcache_wq);
        }
        if (si)
                put_swap_device(si);
        return ret;
}

/*
 * Report whether every one of the @nr_pages page table entries starting at
 * @pte is pte_none(). Entries are sampled with ptep_get_lockless() in
 * ascending order and the scan stops at the first populated entry.
 * A count of zero (or less) trivially yields true.
 */
static bool pte_range_none(pte_t *pte, int nr_pages)
{
        int idx = 0;

        while (idx < nr_pages) {
                pte_t cur = ptep_get_lockless(pte + idx);

                if (!pte_none(cur))
                        return false;
                idx++;
        }

        return true;
}

/*
 * Allocate the folio that will back an anonymous fault at vmf->address.
 *
 * With CONFIG_TRANSPARENT_HUGEPAGE the largest enabled order whose naturally
 * aligned PTE range around the fault is entirely pte_none() is tried first,
 * then progressively smaller orders; a successfully charged large folio is
 * returned (zeroed via folio_zero_user() when user_alloc_needs_zeroing()
 * says so). In every other case the result of folio_prealloc() is returned.
 *
 * Returns ERR_PTR(-EAGAIN) when the page table could not be mapped.
 */
static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        unsigned long candidates;
        unsigned long base;
        struct folio *folio;
        pte_t *pte_table;
        gfp_t gfp;
        int order;

        /*
         * An armed userfaultfd requires faults to be resolved one page at a
         * time, so large folios are off the table for such a vma.
         */
        if (unlikely(userfaultfd_armed(vma)))
                goto fallback;

        /*
         * Start from every large order below PMD_ORDER that is enabled for
         * this vma, then keep only those whose aligned range around the
         * faulting address still fits inside the vma.
         */
        candidates = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
                                              BIT(PMD_ORDER) - 1);
        candidates = thp_vma_suitable_orders(vma, vmf->address, candidates);
        if (!candidates)
                goto fallback;

        pte_table = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
        if (!pte_table)
                return ERR_PTR(-EAGAIN);

        /*
         * Walk down from the highest candidate order until the aligned range
         * is fully pte_none(); every smaller remaining order is then fully
         * pte_none() as well.
         */
        for (order = highest_order(candidates); candidates;
             order = next_order(&candidates, order)) {
                base = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                if (pte_range_none(pte_table + pte_index(base), 1 << order))
                        break;
        }

        pte_unmap(pte_table);

        if (!candidates)
                goto fallback;

        /* Attempt the allocation, largest surviving order first. */
        gfp = vma_thp_gfp_mask(vma);
        for (; candidates; order = next_order(&candidates, order)) {
                base = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                folio = vma_alloc_folio(gfp, order, vma, base);
                if (folio) {
                        if (!mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                                folio_throttle_swaprate(folio, gfp);
                                /*
                                 * If the folio did not come back zeroed
                                 * (no __GFP_ZERO) or user folios need special
                                 * treatment, zero it with folio_zero_user()
                                 * so the page at the faulting address ends up
                                 * cache-hot afterwards.
                                 */
                                if (user_alloc_needs_zeroing())
                                        folio_zero_user(folio, vmf->address);
                                return folio;
                        }
                        /* Charging failed: account it and drop this folio. */
                        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                        folio_put(folio);
                }
                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
        }

fallback:
#endif
        return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 *
 * Handles a fault on a not-yet-populated anonymous address. Read faults are
 * served by a special PTE for the zero page unless the mm forbids it; all
 * other faults get a freshly allocated folio (possibly spanning several
 * pages) from alloc_anon_folio(), which is mapped with set_ptes().
 *
 * Returns 0 on success or when the fault should simply be retried,
 * VM_FAULT_SIGBUS for VM_SHARED vmas, VM_FAULT_OOM on allocation failure,
 * or whatever vmf_anon_prepare(), check_stable_address_space() or
 * handle_userfault() report.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long addr = vmf->address;
        struct folio *folio;
        vm_fault_t ret = 0;
        int nr_pages = 1;
        pte_t entry;

        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;

        /*
         * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
         * be distinguished from a transient failure of pte_offset_map().
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;

        /* Use the zero-page for reads */
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
                /* Page table could not be mapped: return 0 without mapping anything. */
                if (!vmf->pte)
                        goto unlock;
                /* The PTE changed since the fault was taken: nothing left to do. */
                if (vmf_pte_changed(vmf)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
                }
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock;
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return handle_userfault(vmf, VM_UFFD_MISSING);
                }
                /* nr_pages is still 1 and addr is still vmf->address here. */
                goto setpte;
        }

        /* Allocate our own private page. */
        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;
        /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
        folio = alloc_anon_folio(vmf);
        if (IS_ERR(folio))
                return 0;
        if (!folio)
                goto oom;

        /* Map the whole folio: start at its naturally aligned base address. */
        nr_pages = folio_nr_pages(folio);
        addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);

        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __folio_mark_uptodate(folio);

        entry = folio_mk_pte(folio, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry), vma);

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
        if (!vmf->pte)
                goto release;
        /*
         * Re-check under the PT lock: a single page is compared against the
         * PTE seen at fault time, a multi-page range must still be entirely
         * pte_none(). Otherwise drop the folio and return without mapping.
         */
        if (nr_pages == 1 && vmf_pte_changed(vmf)) {
                update_mmu_tlb(vma, addr, vmf->pte);
                goto release;
        } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
                update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
                goto release;
        }

        ret = check_stable_address_space(vma->vm_mm);
        if (ret)
                goto release;

        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                folio_put(folio);
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }

        /* One folio reference per mapped page; the allocation provided one. */
        folio_ref_add(folio, nr_pages - 1);
        add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
        count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
        folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
        folio_add_lru_vma(folio, vma);
setpte:
        /* Shared by the zero-page path and the new-folio path. */
        if (vmf_orig_pte_uffd_wp(vmf))
                entry = pte_mkuffd_wp(entry);
        set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
release:
        /* Drop the unused folio, then share the unlock/return path. */
        folio_put(folio);
        goto unlock;
oom:
        return VM_FAULT_OOM;
}

/*
 * Invoke the vma's ->fault() handler and normalise its result.
 *
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 *
 * On a plain successful return the folio behind vmf->page is locked (it is
 * locked here if the handler did not report VM_FAULT_LOCKED). Error, NOPAGE,
 * RETRY and DONE_COW results from the handler are passed through untouched.
 * A hwpoisoned page is released, vmf->page is cleared, and either
 * VM_FAULT_HWPOISON or VM_FAULT_NOPAGE is returned.
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
        const vm_fault_t early_exit = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                      VM_FAULT_RETRY | VM_FAULT_DONE_COW;
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t poison_ret;
        struct folio *folio;
        vm_fault_t ret;

        /*
         * Preallocate pte before we take page_lock because this might lead to
         * deadlocks for memcg reclaim which waits for pages under writeback:
         *                                lock_page(A)
         *                                SetPageWriteback(A)
         *                                unlock_page(A)
         * lock_page(B)
         *                                lock_page(B)
         * pte_alloc_one
         *   shrink_folio_list
         *     wait_on_page_writeback(A)
         *                                SetPageWriteback(B)
         *                                unlock_page(B)
         *                                # flush A, B to clear the writeback
         */
        if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        ret = vma->vm_ops->fault(vmf);
        if (unlikely(ret & early_exit))
                return ret;

        folio = page_folio(vmf->page);

        /* Common case: healthy page, just make sure the folio is locked. */
        if (likely(!PageHWPoison(vmf->page))) {
                if (likely(ret & VM_FAULT_LOCKED)) {
                        VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);
                } else {
                        folio_lock(folio);
                }
                return ret;
        }

        /* The handler returned a hwpoisoned page: never hand it to the caller. */
        poison_ret = VM_FAULT_HWPOISON;
        if (ret & VM_FAULT_LOCKED) {
                if (page_mapped(vmf->page))
                        unmap_mapping_folio(folio);
                /* Retry if a clean folio was removed from the cache. */
                if (mapping_evict_folio(folio->mapping, folio))
                        poison_ret = VM_FAULT_NOPAGE;
                folio_unlock(folio);
        }
        folio_put(folio);
        vmf->page = NULL;
        return poison_ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Hand the page table held in vmf->prealloc_pte to
 * pgtable_trans_huge_deposit() for vmf->pmd, account it in the mm's
 * nr_ptes counter and clear vmf->prealloc_pte.
 */
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        pgtable_trans_huge_deposit(mm, vmf->pmd, vmf->prealloc_pte);

        /* The preallocated table is consumed here: count it as nr_ptes. */
        mm_inc_nr_ptes(mm);
        vmf->prealloc_pte = NULL;
}

/*
 * Try to map @folio at vmf->address with a single PMD entry.
 *
 * @page is overwritten with the folio's first page before it is used, so the
 * value passed in by the caller is ignored.
 *
 * Return: 0 if the PMD was installed, VM_FAULT_OOM if the page table needed
 * for the arch deposit could not be allocated, and VM_FAULT_FALLBACK in every
 * other case (THP disabled, unsuitable VMA range, folio not of PMD order,
 * folio has a hwpoisoned subpage, or the PMD is no longer empty once the PMD
 * lock is held).
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t entry;
        vm_fault_t ret = VM_FAULT_FALLBACK;

        /*
         * It is too late to allocate a small folio, we already have a large
         * folio in the pagecache: especially s390 KVM cannot tolerate any
         * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
         * PMD mappings if THPs are disabled. As we already have a THP,
         * behave as if we are forcing a collapse.
         */
        if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
                                                     /* forced_collapse=*/ true))
                return ret;

        if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
                return ret;

        if (folio_order(folio) != HPAGE_PMD_ORDER)
                return ret;
        /* From here on work with the first page of the folio. */
        page = &folio->page;

        /*
         * Just back off if any subpage of a THP is corrupted, otherwise
         * the corrupted page may be mapped by PMD silently to escape the
         * check.  This kind of THP can only be PTE mapped.  Access to
         * the corrupted subpage should trigger SIGBUS as expected.
         */
        if (unlikely(folio_test_has_hwpoisoned(folio)))
                return ret;

        /*
         * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        /* Re-check under the PMD lock; ret is still VM_FAULT_FALLBACK here. */
        if (unlikely(!pmd_none(*vmf->pmd)))
                goto out;

        flush_icache_pages(vma, page, HPAGE_PMD_NR);

        entry = folio_mk_pmd(folio, vma->vm_page_prot);
        if (write)
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

        add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
        folio_add_file_rmap_pmd(folio, page, vma);

        /*
         * deposit and withdraw with pmd lock held
         */
        if (arch_needs_pgtable_deposit())
                deposit_prealloc_pte(vmf);

        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

        update_mmu_cache_pmd(vma, haddr, vmf->pmd);

        /* fault is handled */
        ret = 0;
        count_vm_event(THP_FILE_MAPPED);
out:
        spin_unlock(vmf->ptl);
        return ret;
}
#else
/*
 * Variant built when CONFIG_TRANSPARENT_HUGEPAGE is not set: never installs
 * a PMD mapping and always reports VM_FAULT_FALLBACK.
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
        return VM_FAULT_FALLBACK;
}
#endif

/**
 * set_pte_range - Set a range of PTEs to point to pages in a folio.
 * @vmf: Fault description.
 * @folio: The folio that contains @page.
 * @page: The first page to create a PTE for.
 * @nr: The number of PTEs to create.
 * @addr: The first address to create a PTE for.
 *
 * The entries are written starting at vmf->pte.
 * NOTE(review): the caller appears to be expected to hold the page table
 * lock covering vmf->pte (finish_fault() does) - confirm for other callers.
 */
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                struct page *page, unsigned int nr, unsigned long addr)
{
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        /* A range that does not contain the faulting address is a prefault. */
        bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
        pte_t entry;

        flush_icache_pages(vma, page, nr);
        entry = mk_pte(page, vma->vm_page_prot);

        if (prefault && arch_wants_old_prefaulted_pte())
                entry = pte_mkold(entry);
        else
                entry = pte_sw_mkyoung(entry);

        if (write)
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        else if (pte_write(entry) && folio_test_dirty(folio))
                entry = pte_mkdirty(entry);
        /* Carry over the uffd-wp marker if the original PTE had it. */
        if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
                entry = pte_mkuffd_wp(entry);
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
                /* Only a single PTE may be set on this path. */
                VM_BUG_ON_FOLIO(nr != 1, folio);
                folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
                folio_add_lru_vma(folio, vma);
        } else {
                folio_add_file_rmap_ptes(folio, page, nr, vma);
        }
        set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);

        /* no need to invalidate: a not-present page won't be cached */
        update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
}

static bool vmf_pte_changed(struct vm_fault *vmf)
{
        if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
                return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);

        return !pte_none(ptep_get(vmf->pte));
}

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The page that gets mapped is vmf->cow_page for a write fault on a
 * non-VM_SHARED VMA and vmf->page otherwise. If the PMD is empty and the
 * folio is PMD-mappable, a PMD mapping is attempted first via do_set_pmd().
 * For a large folio all of its pages are mapped in one go when the folio
 * fits inside both the VMA and the page table; otherwise only the faulting
 * page is mapped.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;
        struct folio *folio;
        vm_fault_t ret;
        bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
                      !(vma->vm_flags & VM_SHARED);
        int type, nr_pages;
        unsigned long addr;
        bool needs_fallback = false;

        /*
         * Restart point: re-entered with needs_fallback set when the batched
         * PTE range turned out not to be empty under the page table lock.
         */
fallback:
        addr = vmf->address;

        /* Did we COW the page? */
        if (is_cow)
                page = vmf->cow_page;
        else
                page = vmf->page;

        folio = page_folio(page);
        /*
         * check even for read faults because we might have lost our CoWed
         * page
         */
        if (!(vma->vm_flags & VM_SHARED)) {
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        return ret;
        }

        if (!needs_fallback && vma->vm_file) {
                struct address_space *mapping = vma->vm_file->f_mapping;
                pgoff_t file_end;

                /* Index of the first page that lies entirely beyond i_size. */
                file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);

                /*
                 * Do not allow to map with PTEs beyond i_size and with PMD
                 * across i_size to preserve SIGBUS semantics.
                 *
                 * Make an exception for shmem/tmpfs that for long time
                 * intentionally mapped with PMDs across i_size.
                 */
                needs_fallback = !shmem_mapping(mapping) &&
                        file_end < folio_next_index(folio);
        }

        /* No page table yet: try a PMD mapping, else install a PTE table. */
        if (pmd_none(*vmf->pmd)) {
                if (!needs_fallback && folio_test_pmd_mappable(folio)) {
                        ret = do_set_pmd(vmf, folio, page);
                        if (ret != VM_FAULT_FALLBACK)
                                return ret;
                }

                if (vmf->prealloc_pte)
                        pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
                else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
                        return VM_FAULT_OOM;
        }

        nr_pages = folio_nr_pages(folio);

        /* Using per-page fault to maintain the uffd semantics */
        if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) {
                nr_pages = 1;
        } else if (nr_pages > 1) {
                /* The index of the fault page within the folio. */
                pgoff_t idx = folio_page_idx(folio, page);
                /* The page offset of vmf->address within the VMA. */
                pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
                /* The index of the entry in the pagetable for fault page. */
                pgoff_t pte_off = pte_index(vmf->address);

                /*
                 * Fall back to per-page fault in case the folio in the page
                 * cache extends beyond the VMA limits or the PMD pagetable
                 * limits.
                 */
                if (unlikely(vma_off < idx ||
                            vma_off + (nr_pages - idx) > vma_pages(vma) ||
                            pte_off < idx ||
                            pte_off + (nr_pages - idx)  > PTRS_PER_PTE)) {
                        nr_pages = 1;
                } else {
                        /* Now we can set mappings for the whole large folio. */
                        addr = vmf->address - idx * PAGE_SIZE;
                        page = &folio->page;
                }
        }

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                       addr, &vmf->ptl);
        /* The PTE table could not be mapped and locked: nothing was set. */
        if (!vmf->pte)
                return VM_FAULT_NOPAGE;

        /* Re-check under ptl */
        if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
                update_mmu_tlb(vma, addr, vmf->pte);
                ret = VM_FAULT_NOPAGE;
                goto unlock;
        } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
                /* Part of the range is already populated: map one page only. */
                needs_fallback = true;
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto fallback;
        }

        /* One extra folio reference for each PTE beyond the first. */
        folio_ref_add(folio, nr_pages - 1);
        set_pte_range(vmf, folio, page, nr_pages, addr);
        type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
        add_mm_counter(vma->vm_mm, type, nr_pages);
        ret = 0;

unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
}

/*
 * Number of pages do_fault_around() tries to map around a read fault.
 * The initial value corresponds to 64KiB (65536 bytes) worth of pages.
 */
static unsigned long fault_around_pages __read_mostly =
        65536 >> PAGE_SHIFT;

#ifdef CONFIG_DEBUG_FS
/*
 * Read handler for the fault_around_bytes attribute: report the current
 * fault-around window, converted from pages to bytes, through @val.
 * @data is unused. Always returns 0.
 */
static int fault_around_bytes_get(void *data, u64 *val)
{
        unsigned long pages = fault_around_pages;

        *val = pages << PAGE_SHIFT;
        return 0;
}

/*
 * Write handler for the fault_around_bytes attribute.
 *
 * @data: unused.
 * @val:  requested fault-around window in bytes.
 *
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 *
 * Return: 0 on success, -EINVAL if @val covers more than PTRS_PER_PTE pages.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
        if (val / PAGE_SIZE > PTRS_PER_PTE)
                return -EINVAL;

        /*
         * The minimum value is 1 page, however this results in no fault-around
         * at all. See should_fault_around().
         */
        val = max(val, PAGE_SIZE);

        /*
         * Fault handlers read this value without taking any lock
         * (do_fault_around() uses READ_ONCE()), so publish the new value
         * with a marked store.
         */
        WRITE_ONCE(fault_around_pages, rounddown_pow_of_two(val) >> PAGE_SHIFT);

        return 0;
}
/*
 * File operations for the "fault_around_bytes" debugfs file, built from the
 * get/set handlers above; the value is formatted as an unsigned decimal.
 */
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
                fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

/*
 * Register the "fault_around_bytes" debugfs file (mode 0644, NULL parent)
 * backed by fault_around_bytes_fops. Runs as a late initcall and always
 * returns 0; the result of the debugfs call is not checked.
 */
static int __init fault_around_debugfs(void)
{
        debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
                                   &fault_around_bytes_fops);
        return 0;
}
late_initcall(fault_around_debugfs);
#endif

/*
 * do_fault_around() tries to map few pages around the fault address. The hope
 * is that the pages will be needed soon and this will lower the number of
 * faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function doesn't cross VMA or page table boundaries, in order to call
 * map_pages() and acquire a PTE lock only once.
 *
 * fault_around_pages defines how many pages we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_pages * PAGE_SIZE rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 *
 * Returns VM_FAULT_OOM if a page table could not be preallocated, otherwise
 * whatever ->map_pages() returned.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
        /* Read the knob once so the whole computation uses a single value. */
        pgoff_t nr_pages = READ_ONCE(fault_around_pages);
        /* The index of the faulting address within its page table. */
        pgoff_t pte_off = pte_index(vmf->address);
        /* The page offset of vmf->address within the VMA. */
        pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
        pgoff_t from_pte, to_pte;
        vm_fault_t ret;

        /* The PTE offset of the start address, clamped to the VMA. */
        from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
                       pte_off - min(pte_off, vma_off));

        /* The PTE offset of the end address, clamped to the VMA and PTE. */
        to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
                      pte_off + vma_pages(vmf->vma) - vma_off) - 1;

        /*
         * No page table here yet: preallocate one in vmf->prealloc_pte.
         * do_fault() frees it if it is still set when the fault returns.
         */
        if (pmd_none(*vmf->pmd)) {
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        /* Convert the PTE offsets back to file page offsets for ->map_pages(). */
        rcu_read_lock();
        ret = vmf->vma->vm_ops->map_pages(vmf,
                        vmf->pgoff + from_pte - pte_off,
                        vmf->pgoff + to_pte - pte_off);
        rcu_read_unlock();

        return ret;
}

/*
 * Return true if we should do read fault-around, false otherwise.
 *
 * Fault-around is skipped when the VMA has no ->map_pages() handler, when
 * uffd_disable_fault_around() says so for this VMA, or when the configured
 * window is a single page.
 */
static inline bool should_fault_around(struct vm_fault *vmf)
{
        /* No ->map_pages?  No way to fault around... */
        if (!vmf->vma->vm_ops->map_pages)
                return false;

        if (uffd_disable_fault_around(vmf->vma))
                return false;

        /*
         * A single page implies no faulting 'around' at all.
         *
         * The value can be rewritten through debugfs while faults are in
         * flight, so use a marked load like do_fault_around() does.
         */
        return READ_ONCE(fault_around_pages) > 1;
}

/*
 * Handle a read fault on a VMA that has a ->fault handler: optionally try
 * fault-around first, then fetch the page with __do_fault() and map it with
 * finish_fault(). The folio is unlocked afterwards, and its reference is
 * dropped when the combined result reports an error, NOPAGE or RETRY.
 */
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
        const vm_fault_t bail_mask = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                     VM_FAULT_RETRY;
        struct folio *folio;
        vm_fault_t ret;

        /*
         * Give ->map_pages() the first shot; ->fault() is the fallback when
         * the page at this offset is not ready to be mapped (cold cache or
         * similar). Any non-zero result from fault-around ends the fault.
         */
        if (should_fault_around(vmf)) {
                ret = do_fault_around(vmf);
                if (ret)
                        return ret;
        }

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        ret = __do_fault(vmf);
        if (unlikely(ret & bail_mask))
                return ret;

        ret |= finish_fault(vmf);

        folio = page_folio(vmf->page);
        folio_unlock(folio);

        /* The page was not mapped: give back the reference we hold. */
        if (unlikely(ret & bail_mask))
                folio_put(folio);

        return ret;
}

/*
 * Handle a write fault on a non-VM_SHARED VMA that has a ->fault handler
 * (see do_fault()): allocate a new folio, fetch the source page with
 * __do_fault(), copy it into the new folio and map the copy with
 * finish_fault().
 *
 * On return the source page (vmf->page) has been unlocked and released. The
 * new folio is released again if the fault did not complete.
 */
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        vm_fault_t ret;

        ret = vmf_can_call_fault(vmf);
        if (!ret)
                ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;

        /* The destination of the copy; finish_fault() maps vmf->cow_page. */
        folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false);
        if (!folio)
                return VM_FAULT_OOM;

        vmf->cow_page = &folio->page;

        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto uncharge_out;
        /*
         * NOTE(review): VM_FAULT_DONE_COW presumably means the ->fault handler
         * already completed the COW itself, so nothing is left to do here and
         * the new folio is not released by this function - confirm.
         */
        if (ret & VM_FAULT_DONE_COW)
                return ret;

        /* A non-zero return from the copy is reported as VM_FAULT_HWPOISON. */
        if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) {
                ret = VM_FAULT_HWPOISON;
                goto unlock;
        }
        __folio_mark_uptodate(folio);

        ret |= finish_fault(vmf);
unlock:
        /* Done with the source page whether or not the mapping succeeded. */
        unlock_page(vmf->page);
        put_page(vmf->page);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto uncharge_out;
        return ret;
uncharge_out:
        /* The new folio was not mapped: drop it. */
        folio_put(folio);
        return ret;
}

/*
 * Handle a write fault on a VM_SHARED VMA that has a ->fault handler (see
 * do_fault()): fetch the page with __do_fault(), give ->page_mkwrite() a
 * chance to prepare it for writing, map it with finish_fault() and finally
 * call fault_dirty_shared_page().
 */
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret, tmp;
        struct folio *folio;

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;

        folio = page_folio(vmf->page);

        /*
         * Check if the backing address space wants to know that the page is
         * about to become writable
         */
        if (vma->vm_ops->page_mkwrite) {
                /* do_page_mkwrite() is called with the folio unlocked. */
                folio_unlock(folio);
                tmp = do_page_mkwrite(vmf, folio);
                /*
                 * A zero, error or NOPAGE result ends the fault here.
                 * NOTE(review): on the remaining results the folio is
                 * presumably locked again by do_page_mkwrite() - confirm.
                 */
                if (unlikely(!tmp ||
                                (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        folio_put(folio);
                        return tmp;
                }
        }

        ret |= finish_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                        VM_FAULT_RETRY))) {
                folio_unlock(folio);
                folio_put(folio);
                return ret;
        }

        /*
         * NOTE(review): the folio is neither unlocked nor released here on
         * success; presumably fault_dirty_shared_page() takes care of both -
         * confirm.
         */
        ret |= fault_dirty_shared_page(vmf);
        return ret;
}

/*
 * Dispatch a fault on a missing PTE to the matching handler: read faults go
 * to do_read_fault(), write faults on non-VM_SHARED VMAs to do_cow_fault()
 * and write faults on VM_SHARED VMAs to do_shared_fault(). VMAs without a
 * ->fault handler get VM_FAULT_SIGBUS unless the PTE turns out to be
 * populated under the page table lock.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        /*
         * Cache the mm up front: as noted above, vma may no longer be valid
         * once a handler has run, and the mm is still needed for pte_free().
         */
        struct mm_struct *vm_mm = vma->vm_mm;
        vm_fault_t ret;

        /*
         * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
         */
        if (!vma->vm_ops->fault) {
                vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                               vmf->address, &vmf->ptl);
                if (unlikely(!vmf->pte))
                        ret = VM_FAULT_SIGBUS;
                else {
                        /*
                         * Make sure this is not a temporary clearing of pte
                         * by holding ptl and checking again. A R/M/W update
                         * of pte involves: take ptl, clearing the pte so that
                         * we don't have concurrent modification by hardware
                         * followed by an update.
                         */
                        if (unlikely(pte_none(ptep_get(vmf->pte))))
                                ret = VM_FAULT_SIGBUS;
                        else
                                ret = VM_FAULT_NOPAGE;

                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                }
        } else if (!(vmf->flags & FAULT_FLAG_WRITE))
                ret = do_read_fault(vmf);
        else if (!(vma->vm_flags & VM_SHARED))
                ret = do_cow_fault(vmf);
        else
                ret = do_shared_fault(vmf);

        /* preallocated pagetable is unused: free it */
        if (vmf->prealloc_pte) {
                pte_free(vm_mm, vmf->prealloc_pte);
                vmf->prealloc_pte = NULL;
        }
        return ret;
}

/*
 * Collect the NUMA fault information for @folio and ask the memory policy
 * whether it is misplaced.
 *
 * @folio:       folio the fault hit.
 * @vmf:         fault description; only vmf->vma is used directly here.
 * @addr:        address passed on to mpol_misplaced().
 * @flags:       in/out TNF_* flags; TNF_NO_GROUP, TNF_SHARED and
 *               TNF_FAULT_LOCAL may be added.
 * @writable:    whether the mapping could be made writable.
 * @last_cpupid: out; the folio's last cpupid, or the default value when the
 *               folio uses that field to record access time.
 *
 * Also records the access in the VMA and bumps the NUMA hint fault counters.
 *
 * Return: the result of mpol_misplaced(). Callers in this file treat
 * NUMA_NO_NODE as "do not migrate" and any other value as the target node.
 */
int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
                      unsigned long addr, int *flags,
                      bool writable, int *last_cpupid)
{
        struct vm_area_struct *vma = vmf->vma;

        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
         * the case where a mapping is writable but the process never writes
         * to it but pte_write gets cleared during protection updates and
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
        if (!writable)
                *flags |= TNF_NO_GROUP;

        /*
         * Flag if the folio is shared between multiple address spaces. This
         * is later used when determining whether to group tasks together
         */
        if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
                *flags |= TNF_SHARED;
        /*
         * For memory tiering mode, cpupid of slow memory page is used
         * to record page access time.  So use default value.
         */
        if (folio_use_access_time(folio))
                *last_cpupid = (-1 & LAST_CPUPID_MASK);
        else
                *last_cpupid = folio_last_cpupid(folio);

        /* Record the current PID accessing the VMA */
        vma_set_access_pid_bit(vma);

        count_vm_numa_event(NUMA_HINT_FAULTS);
#ifdef CONFIG_NUMA_BALANCING
        count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
#endif
        /* The folio already sits on the node this CPU belongs to. */
        if (folio_nid(folio) == numa_node_id()) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
                *flags |= TNF_FAULT_LOCAL;
        }

        return mpol_misplaced(folio, vmf, addr);
}

/*
 * Rewrite the PTE at @fault_pte (mapping @fault_addr) with the protections
 * of @vma, mark it young and, if @writable, writable as well. The update is
 * done through the ptep_modify_prot_start()/commit() pair and followed by an
 * MMU cache update for that single entry.
 */
static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
                                        unsigned long fault_addr, pte_t *fault_pte,
                                        bool writable)
{
        pte_t prev, fresh;

        prev = ptep_modify_prot_start(vma, fault_addr, fault_pte);

        fresh = pte_mkyoung(pte_modify(prev, vma->vm_page_prot));
        if (writable)
                fresh = pte_mkwrite(fresh, vma);

        ptep_modify_prot_commit(vma, fault_addr, fault_pte, prev, fresh);
        update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
}

/*
 * Restore the mapping of every PTE around the faulting one that is present,
 * protnone and maps a page of the large @folio.
 *
 * @fault_pte:         value of the faulting PTE; used to locate the faulting
 *                     page inside the folio.
 * @ignore_writable:   if set, no entry is restored as writable.
 * @pte_write_upgrade: if set, an entry that is not writable after applying
 *                     the VMA protections may still be made writable when
 *                     can_change_pte_writable() allows it.
 *
 * The walk is limited to the part of the folio that lies inside both the VMA
 * and the page table containing vmf->pte.
 */
static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
                                       struct folio *folio, pte_t fault_pte,
                                       bool ignore_writable, bool pte_write_upgrade)
{
        /* Index of the faulting page within the folio. */
        int nr = pte_pfn(fault_pte) - folio_pfn(folio);
        unsigned long start, end, addr = vmf->address;
        /* Address at which the first page of the folio would be mapped. */
        unsigned long addr_start = addr - (nr << PAGE_SHIFT);
        /* Start of the PMD-sized range covered by this page table. */
        unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
        pte_t *start_ptep;

        /* Stay within the VMA and within the page table. */
        start = max3(addr_start, pt_start, vma->vm_start);
        end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE,
                   vma->vm_end);
        /* Step back from the faulting PTE to the one that maps 'start'. */
        start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);

        /* Restore all PTEs' mapping of the large folio */
        for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
                pte_t ptent = ptep_get(start_ptep);
                bool writable = false;

                /* Only present protnone entries need to be restored. */
                if (!pte_present(ptent) || !pte_protnone(ptent))
                        continue;

                /* Skip entries that map a different folio. */
                if (pfn_folio(pte_pfn(ptent)) != folio)
                        continue;

                if (!ignore_writable) {
                        ptent = pte_modify(ptent, vma->vm_page_prot);
                        writable = pte_write(ptent);
                        if (!writable && pte_write_upgrade &&
                            can_change_pte_writable(vma, addr, ptent))
                                writable = true;
                }

                numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
        }
}

/*
 * Handle a NUMA hinting fault on the PTE at vmf->address.
 *
 * With the page table lock held and the PTE verified against vmf->orig_pte,
 * numa_migrate_check() decides whether the folio should move to another
 * node. If so, the folio is isolated and migrated with the lock dropped.
 * When no migration is wanted, or it fails and the PTE is still unchanged,
 * the mapping is restored with the VMA's protections. The fault is reported
 * through task_numa_fault() whenever a folio was found.
 *
 * Always returns 0.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = NULL;
        int nid = NUMA_NO_NODE;
        bool writable = false, ignore_writable = false;
        bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
        /* last_cpupid and nr_pages are only set once a folio was found. */
        int last_cpupid;
        int target_nid;
        pte_t pte, old_pte;
        int flags = 0, nr_pages;

        /*
         * The pte cannot be used safely until we verify, while holding the page
         * table lock, that its contents have not changed during fault handling.
         */
        spin_lock(vmf->ptl);
        /* Read the live PTE from the page tables: */
        old_pte = ptep_get(vmf->pte);

        if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }

        pte = pte_modify(old_pte, vma->vm_page_prot);

        /*
         * Detect now whether the PTE could be writable; this information
         * is only valid while holding the PT lock.
         */
        writable = pte_write(pte);
        if (!writable && pte_write_upgrade &&
            can_change_pte_writable(vma, vmf->address, pte))
                writable = true;

        folio = vm_normal_folio(vma, vmf->address, pte);
        /* No folio or a zone-device folio: just restore the mapping. */
        if (!folio || folio_is_zone_device(folio))
                goto out_map;

        nid = folio_nid(folio);
        nr_pages = folio_nr_pages(folio);

        target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
                                        writable, &last_cpupid);
        if (target_nid == NUMA_NO_NODE)
                goto out_map;
        if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
                flags |= TNF_MIGRATE_FAIL;
                goto out_map;
        }
        /* The folio is isolated and isolation code holds a folio reference. */
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        /*
         * The lock was dropped, so the writability computed above is no
         * longer valid: restore without write permission if we get to out_map.
         */
        writable = false;
        ignore_writable = true;

        /* Migrate to the requested node */
        if (!migrate_misplaced_folio(folio, target_nid)) {
                nid = target_nid;
                flags |= TNF_MIGRATED;
                task_numa_fault(last_cpupid, nid, nr_pages, flags);
                return 0;
        }

        /* Migration failed: re-take the lock and re-validate the PTE. */
        flags |= TNF_MIGRATE_FAIL;
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                       vmf->address, &vmf->ptl);
        if (unlikely(!vmf->pte))
                return 0;
        if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }
out_map:
        /*
         * Make it present again, depending on how arch implements
         * non-accessible ptes, some can allow access by kernel mode.
         */
        if (folio && folio_test_large(folio))
                numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
                                           pte_write_upgrade);
        else
                numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
                                            writable);
        pte_unmap_unlock(vmf->pte, vmf->ptl);

        /* nid is only set once a usable folio was found above. */
        if (nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, nid, nr_pages, flags);
        return 0;
}

/*
 * Try to handle the fault with a PMD-sized mapping: anonymous VMAs go to
 * do_huge_pmd_anonymous_page(), other VMAs to their ->huge_fault() handler
 * with PMD_ORDER. Returns VM_FAULT_FALLBACK when the VMA has no such handler.
 */
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(vmf);

        if (!vma->vm_ops->huge_fault)
                return VM_FAULT_FALLBACK;

        return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
}

/*
 * Handle a write or unshare fault (FAULT_FLAG_UNSHARE) on an existing huge
 * PMD mapping.
 *
 * Anonymous VMAs are handled by do_huge_pmd_wp_page(), or by userfaultfd
 * when the PMD is uffd write-protected and this is not an unshare fault.
 * For VM_SHARED/VM_MAYSHARE VMAs the ->huge_fault() handler, if any, is
 * tried. In all remaining cases the PMD is split and VM_FAULT_FALLBACK is
 * returned.
 *
 * `inline' is required to avoid gcc 4.1.2 build error
 */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        vm_fault_t ret;

        if (vma_is_anonymous(vma)) {
                if (likely(!unshare) &&
                    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
                        /* Async uffd-wp: split the PMD instead of notifying. */
                        if (userfaultfd_wp_async(vmf->vma))
                                goto split;
                        return handle_userfault(vmf, VM_UFFD_WP);
                }
                return do_huge_pmd_wp_page(vmf);
        }

        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                if (vma->vm_ops->huge_fault) {
                        ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
                        /* Anything but a fallback request is final. */
                        if (!(ret & VM_FAULT_FALLBACK))
                                return ret;
                }
        }

split:
        /* COW or write-notify handled on pte level: split pmd. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false);

        return VM_FAULT_FALLBACK;
}

/*
 * Try to handle the fault with a PUD-sized mapping by calling the VMA's
 * ->huge_fault() handler with PUD_ORDER. Returns VM_FAULT_FALLBACK for
 * anonymous VMAs, for VMAs without such a handler, and always when
 * transparent PUD hugepages are not configured.
 */
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;

        /*
         * No support for anonymous transparent PUD pages yet, so only
         * non-anonymous VMAs providing ->huge_fault() are dispatched.
         */
        if (!vma_is_anonymous(vma) && vma->vm_ops->huge_fault)
                return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * PUD-level counterpart of wp_huge_pmd(): for VM_SHARED/VM_MAYSHARE VMAs the
 * ->huge_fault() handler, if any, is tried with PUD_ORDER; anonymous VMAs
 * and every other case end with the PUD being split and VM_FAULT_FALLBACK
 * being returned. Without transparent PUD hugepage support this only
 * returns VM_FAULT_FALLBACK.
 *
 * @orig_pud is not used in the body.
 */
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        /* No support for anonymous transparent PUD pages yet */
        if (vma_is_anonymous(vma))
                goto split;
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                if (vma->vm_ops->huge_fault) {
                        ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
                        /* Anything but a fallback request is final. */
                        if (!(ret & VM_FAULT_FALLBACK))
                                return ret;
                }
        }
split:
        /* COW or write-notify not handled on PUD level: split pud.*/
        __split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * Handle a fault at PTE granularity: dispatch to the missing-pte, swap,
 * NUMA-hinting or write-protect handlers, or simply update the
 * accessed/dirty bits of a present entry.
 *
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __folio_lock_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
        pte_t entry;

        if (unlikely(pmd_none(*vmf->pmd))) {
                /*
                 * Leave __pte_alloc() until later: because vm_ops->fault may
                 * want to allocate huge page, and if we expose page table
                 * for an instant, it will be difficult to retract from
                 * concurrent faults and from rmap lookups.
                 */
                vmf->pte = NULL;
                vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
        } else {
                pmd_t dummy_pmdval;

                /*
                 * A regular pmd is established and it can't morph into a huge
                 * pmd by anon khugepaged, since that takes mmap_lock in write
                 * mode; but shmem or file collapse to THP could still morph
                 * it into a huge pmd: just retry later if so.
                 *
                 * Use the maywrite version to indicate that vmf->pte may be
                 * modified, but since we will use pte_same() to detect the
                 * change of the !pte_none() entry, there is no need to recheck
                 * the pmdval. Here we choose to pass a dummy variable instead
                 * of NULL, which helps new users think about why this place is
                 * special.
                 */
                vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd,
                                                    vmf->address, &dummy_pmdval,
                                                    &vmf->ptl);
                /* No PTE table could be mapped: return 0 without handling. */
                if (unlikely(!vmf->pte))
                        return 0;
                /* Snapshot the entry without holding vmf->ptl. */
                vmf->orig_pte = ptep_get_lockless(vmf->pte);
                vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

                /* An empty entry is treated like a missing page table. */
                if (pte_none(vmf->orig_pte)) {
                        pte_unmap(vmf->pte);
                        vmf->pte = NULL;
                }
        }

        /* No page table or no entry: hand over to do_pte_missing(). */
        if (!vmf->pte)
                return do_pte_missing(vmf);

        /* A non-none entry that is not present goes to do_swap_page(). */
        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);

        if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
                return do_numa_page(vmf);

        /*
         * Present entry: take the PTE lock and verify that the entry still
         * matches the lockless snapshot taken above before touching it.
         */
        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
        if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                goto unlock;
        }
        if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                /*
                 * Note: this path returns with vmf->ptl still held and
                 * vmf->pte still mapped; do_wp_page() presumably releases
                 * both -- confirm against its definition.
                 */
                if (!pte_write(entry))
                        return do_wp_page(vmf);
                else if (likely(vmf->flags & FAULT_FLAG_WRITE))
                        entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
                update_mmu_cache_range(vmf, vmf->vma, vmf->address,
                                vmf->pte, 1);
        } else {
                /* Skip spurious TLB flush for retried page fault */
                if (vmf->flags & FAULT_FLAG_TRIED)
                        goto unlock;
                /*
                 * This is needed only for protection faults but the arch code
                 * is not yet telling us if this is a protection fault or not.
                 * This still avoids useless tlb flushes for .text page faults
                 * with threads.
                 */
                if (vmf->flags & FAULT_FLAG_WRITE)
                        flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
                                                     vmf->pte);
        }
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
}

/*
 * Walk (and allocate where needed) the page-table levels for @address in
 * @vma, giving the huge-PUD and huge-PMD handlers a chance before falling
 * through to handle_pte_fault().
 *
 * On entry, we hold either the VMA lock or the mmap_lock
 * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
 * the result, the mmap_lock is not held on exit.  See filemap_fault()
 * and __folio_lock_or_retry().
 *
 * Returns VM_FAULT_OOM when a p4d, pud or pmd table cannot be allocated;
 * otherwise the result of whichever handler resolved the fault.
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
{
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
                .real_address = address,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
        };
        struct mm_struct *mm = vma->vm_mm;
        vm_flags_t vm_flags = vma->vm_flags;
        pgd_t *pgd;
        p4d_t *p4d;
        vm_fault_t ret;

        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return VM_FAULT_OOM;

        vmf.pud = pud_alloc(mm, p4d, address);
        if (!vmf.pud)
                return VM_FAULT_OOM;
retry_pud:
        /* Empty PUD in a VMA that allows PUD-order THP: try a huge PUD. */
        if (pud_none(*vmf.pud) &&
            thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
                ret = create_huge_pud(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                pud_t orig_pud = *vmf.pud;

                /* Compiler barrier so the checks below use the value read above. */
                barrier();
                if (pud_trans_huge(orig_pud)) {

                        /*
                         * TODO once we support anonymous PUDs: NUMA case and
                         * FAULT_FLAG_UNSHARE handling.
                         */
                        if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
                                ret = wp_huge_pud(&vmf, orig_pud);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                huge_pud_set_accessed(&vmf, orig_pud);
                                return 0;
                        }
                }
        }

        vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;

        /* Huge pud page fault raced with pmd_alloc? */
        if (pud_trans_unstable(vmf.pud))
                goto retry_pud;

        /* Empty PMD in a VMA that allows PMD-order THP: try a huge PMD. */
        if (pmd_none(*vmf.pmd) &&
            thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
                ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);

                /*
                 * A swap-type PMD: wait if it is a migration entry, then
                 * return 0 without resolving the fault here.
                 */
                if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                          !is_pmd_migration_entry(vmf.orig_pmd));
                        if (is_pmd_migration_entry(vmf.orig_pmd))
                                pmd_migration_entry_wait(mm, vmf.pmd);
                        return 0;
                }
                if (pmd_trans_huge(vmf.orig_pmd)) {
                        if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf);

                        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
                            !pmd_write(vmf.orig_pmd)) {
                                ret = wp_huge_pmd(&vmf);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                huge_pmd_set_accessed(&vmf);
                                return 0;
                        }
                }
        }

        /* Nothing handled at a huge level (or fallback requested). */
        return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accounting
 * @mm: mm from which memcg should be extracted. It can be NULL.
 * @regs: the pt_regs struct pointer.  When NULL, the perf event counters are
 *        skipped, but the per-task accounting for the task that triggered the
 *        fault is still done.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * Takes care of most of the page fault accounting, including the
 * PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter updates.  Note that
 * PERF_COUNT_SW_PAGE_FAULTS itself should still be handled by the per-arch
 * page fault handlers at the entry of the page fault.
 */
static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
                                    unsigned long address, unsigned int flags,
                                    vm_fault_t ret)
{
        bool is_major;

        /* A fault that will be retried gets accounted once it completes. */
        if (ret & VM_FAULT_RETRY)
                return;

        /*
         * PGFAULT counters record both successful and failed faults (this
         * preserves the behavior of older kernels); the perf counters below
         * ignore failed cases.
         */
        count_vm_event(PGFAULT);
        count_memcg_event_mm(mm, PGFAULT);

        /*
         * Unsuccessful faults (e.g. an invalid address, which includes
         * arch_vma_access_permitted() failing before we got here) are not
         * accounted any further.  So this is not a "this many hardware page
         * faults" counter; hw profiling should be used for that.
         */
        if (ret & VM_FAULT_ERROR)
                return;

        /*
         * A fault is major when the final successful attempt reports
         * VM_FAULT_MAJOR, or when it had to be retried (meaning it could not
         * be handled immediately the first time around).
         */
        is_major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);

        if (is_major)
                current->maj_flt++;
        else
                current->min_flt++;

        /*
         * GUP-triggered faults pass regs == NULL: only the per-thread
         * counters above are updated and the perf event is skipped.
         */
        if (regs)
                perf_sw_event(is_major ? PERF_COUNT_SW_PAGE_FAULTS_MAJ :
                                         PERF_COUNT_SW_PAGE_FAULTS_MIN,
                              1, regs, address);
}

/*
 * Bracket the fault handler: with CONFIG_LRU_GEN, current->in_lru_fault
 * records whether the faulting VMA has recency for the duration of the
 * fault.  Without CONFIG_LRU_GEN both helpers do nothing.
 */
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
#ifdef CONFIG_LRU_GEN
        /* the LRU algorithm only applies to accesses with recency */
        current->in_lru_fault = vma_has_recency(vma);
#endif /* CONFIG_LRU_GEN */
}

static void lru_gen_exit_fault(void)
{
#ifdef CONFIG_LRU_GEN
        current->in_lru_fault = false;
#endif /* CONFIG_LRU_GEN */
}

/*
 * Validate the fault flags against the VMA before the fault is handled.
 *
 * May clear FAULT_FLAG_UNSHARE in *@flags when the mapping is not a COW
 * mapping.  Returns 0 when the flag combination is acceptable, or
 * VM_FAULT_SIGSEGV (after a one-time warning) for a combination that is
 * rejected here.
 */
static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
                                       unsigned int *flags)
{
        if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
                /* UNSHARE and WRITE must not be requested together. */
                if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
                        return VM_FAULT_SIGSEGV;
                /*
                 * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
                 * just treat it like an ordinary read-fault otherwise.
                 */
                if (!is_cow_mapping(vma->vm_flags))
                        *flags &= ~FAULT_FLAG_UNSHARE;
        } else if (*flags & FAULT_FLAG_WRITE) {
                /* Write faults on read-only mappings are impossible ... */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
                        return VM_FAULT_SIGSEGV;
                /* ... and FOLL_FORCE only applies to COW mappings. */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
                                 !is_cow_mapping(vma->vm_flags)))
                        return VM_FAULT_SIGSEGV;
        }
#ifdef CONFIG_PER_VMA_LOCK
        /*
         * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
         * the assumption that lock is dropped on VM_FAULT_RETRY.
         */
        if (WARN_ON_ONCE((*flags &
                        (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
                        (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
                return VM_FAULT_SIGSEGV;
#endif

        return 0;
}

/*
 * Top-level page fault entry point for a given VMA.
 *
 * The caller already holds either the VMA lock or the mmap_lock
 * (FAULT_FLAG_VMA_LOCK tells you which).
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
{
        /* Cache the mm up front: if the mmap_lock is dropped, vma may be freed */
        struct mm_struct *mm = vma->vm_mm;
        bool droppable, user_fault;
        vm_fault_t fault;

        __set_current_state(TASK_RUNNING);

        fault = sanitize_fault_flags(vma, &flags);
        if (fault)
                goto account;

        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                       flags & FAULT_FLAG_INSTRUCTION,
                                       flags & FAULT_FLAG_REMOTE)) {
                fault = VM_FAULT_SIGSEGV;
                goto account;
        }

        /* Both are sampled now because vma must not be touched later on. */
        droppable = !!(vma->vm_flags & VM_DROPPABLE);
        user_fault = !!(flags & FAULT_FLAG_USER);

        /*
         * Faults triggered in user space get memcg OOM handling enabled.
         * Kernel faults are handled more gracefully.
         */
        if (user_fault)
                mem_cgroup_enter_user_fault();

        lru_gen_enter_fault(vma);

        if (unlikely(is_vm_hugetlb_page(vma)))
                fault = hugetlb_fault(mm, vma, address, flags);
        else
                fault = __handle_mm_fault(vma, address, flags);

        /*
         * Warning: vma must not be dereferenced from here on.  The
         * mmap_lock might have been dropped by __handle_mm_fault(), in
         * which case vma might be destroyed from underneath us.
         */

        lru_gen_exit_fault();

        /* OOM errors are not fatal for a droppable mapping. */
        if (droppable)
                fault &= ~VM_FAULT_OOM;

        if (user_fault) {
                mem_cgroup_exit_user_fault();
                /*
                 * The task may have entered a memcg OOM situation, but
                 * when the allocation error was handled gracefully (no
                 * VM_FAULT_OOM) nothing needs to be killed: just clean
                 * up the OOM state peacefully.
                 */
                if (task_in_memcg_oom(current) && !(fault & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }
account:
        mm_account_fault(mm, regs, address, flags, fault);

        return fault;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Slow path of p4d table allocation; the fast path is handled in-line by
 * the caller.
 *
 * Returns 0 when the pgd entry is populated on exit (by us or by a racing
 * allocator), -ENOMEM when the new table cannot be allocated.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
        p4d_t *table;

        table = p4d_alloc_one(mm, address);
        if (!table)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!pgd_present(*pgd)) {
                smp_wmb(); /* See comment in pmd_install() */
                pgd_populate(mm, pgd, table);
        } else {
                /* Somebody else populated it first: drop our copy */
                p4d_free(mm, table);
        }
        spin_unlock(&mm->page_table_lock);

        return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Slow path of page upper directory allocation; the fast path is handled
 * in-line by the caller.
 *
 * Returns 0 when the p4d entry is populated on exit (by us or by a racing
 * allocator), -ENOMEM when the new table cannot be allocated.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
        pud_t *table;

        table = pud_alloc_one(mm, address);
        if (!table)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (p4d_present(*p4d)) {
                /* Somebody else populated it first: drop our copy */
                pud_free(mm, table);
        } else {
                mm_inc_nr_puds(mm);
                smp_wmb(); /* See comment in pmd_install() */
                p4d_populate(mm, p4d, table);
        }
        spin_unlock(&mm->page_table_lock);

        return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Slow path of page middle directory allocation; the fast path is handled
 * in-line by the caller.
 *
 * Returns 0 when the pud entry is populated on exit (by us or by a racing
 * allocator), -ENOMEM when the new table cannot be allocated.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        pmd_t *table = pmd_alloc_one(mm, address);
        spinlock_t *lock;

        if (!table)
                return -ENOMEM;

        lock = pud_lock(mm, pud);
        if (pud_present(*pud)) {
                /* Somebody else populated it first: drop our copy */
                pmd_free(mm, table);
        } else {
                mm_inc_nr_pmds(mm);
                smp_wmb(); /* See comment in pmd_install() */
                pud_populate(mm, pud, table);
        }
        spin_unlock(lock);

        return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

/*
 * Fill in the output fields of @args for a mapping found by
 * follow_pfnmap_start().
 *
 * @pfn_base is the pfn taken from the page-table entry and @addr_mask the
 * mask of the level it was found at; the reported pfn is @pfn_base plus
 * the page offset of args->address within that entry.
 */
static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
                                     spinlock_t *lock, pte_t *ptep,
                                     pgprot_t pgprot, unsigned long pfn_base,
                                     unsigned long addr_mask, bool writable,
                                     bool special)
{
        /* Number of whole pages between the start of the entry and the address */
        unsigned long page_idx = (args->address & ~addr_mask) >> PAGE_SHIFT;

        args->pfn = pfn_base + page_idx;
        args->pgprot = pgprot;
        args->addr_mask = addr_mask;
        args->writable = writable;
        args->special = special;

        /* What follow_pfnmap_end() has to release */
        args->lock = lock;
        args->ptep = ptep;
}

/*
 * Lockdep check for the pfnmap walkers: the mm's mmap_lock must be held,
 * or, for a VMA with a file mapping, alternatively that mapping's
 * i_mmap_rwsem.  Compiles to nothing without CONFIG_LOCKDEP.
 */
static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
{
#ifdef CONFIG_LOCKDEP
        struct address_space *mapping = NULL;

        if (vma->vm_file)
                mapping = vma->vm_file->f_mapping;

        if (!mapping) {
                lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
                return;
        }

        lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
                       lockdep_is_held(&vma->vm_mm->mmap_lock));
#endif
}

/**
 * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
 * @args: Pointer to struct @follow_pfnmap_args
 *
 * The caller needs to setup args->vma and args->address to point to the
 * virtual address as the target of such lookup.  On a successful return,
 * the results will be put into other output fields.
 *
 * After the caller finished using the fields, the caller must invoke
 * follow_pfnmap_end() to properly release the locks and resources of the
 * lookup request.
 *
 * Between the start() and end() calls, the results in @args will be valid
 * as proper locks will be held.  After the end() is called, all the fields
 * in @follow_pfnmap_args will be invalid to be further accessed.  Further
 * use of such information after end() may require proper synchronizations
 * by the caller with page table updates, otherwise it can create a
 * security bug.
 *
 * If the PTE maps a refcounted page, callers are responsible to protect
 * against invalidation with MMU notifiers; otherwise access to the PFN at
 * a later point in time can trigger use-after-free.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read, and the mmap semaphore cannot be released
 * before the end() is invoked.
 *
 * This function must not be used to modify PTE content.
 *
 * Return: zero on success, negative otherwise.  On failure no lock is
 * held when this function returns.
 */
int follow_pfnmap_start(struct follow_pfnmap_args *args)
{
        struct vm_area_struct *vma = args->vma;
        unsigned long address = args->address;
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *lock;
        pgd_t *pgdp;
        p4d_t *p4dp, p4d;
        pud_t *pudp, pud;
        pmd_t *pmdp, pmd;
        pte_t *ptep, pte;

        pfnmap_lockdep_assert(vma);

        if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                goto out;

        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                goto out;
retry:
        pgdp = pgd_offset(mm, address);
        if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
                goto out;

        p4dp = p4d_offset(pgdp, address);
        p4d = READ_ONCE(*p4dp);
        if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
                goto out;

        pudp = pud_offset(p4dp, address);
        pud = READ_ONCE(*pudp);
        if (pud_none(pud))
                goto out;
        if (pud_leaf(pud)) {
                lock = pud_lock(mm, pudp);
                /*
                 * The value tested above was read without the lock.  Read
                 * the entry again now that the lock is held, so that both
                 * the recheck and the results reported to the caller are
                 * based on the entry as it is under the lock.
                 */
                pud = READ_ONCE(*pudp);
                if (unlikely(!pud_leaf(pud))) {
                        spin_unlock(lock);
                        goto retry;
                }
                pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
                                  pud_pfn(pud), PUD_MASK, pud_write(pud),
                                  pud_special(pud));
                return 0;
        }

        pmdp = pmd_offset(pudp, address);
        pmd = pmdp_get_lockless(pmdp);
        if (pmd_leaf(pmd)) {
                lock = pmd_lock(mm, pmdp);
                /* Same as for the pud: revalidate under the lock. */
                pmd = pmdp_get_lockless(pmdp);
                if (unlikely(!pmd_leaf(pmd))) {
                        spin_unlock(lock);
                        goto retry;
                }
                pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
                                  pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
                                  pmd_special(pmd));
                return 0;
        }

        /* PTE level: the entry is read with the PTE lock already held. */
        ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
        if (!ptep)
                goto out;
        pte = ptep_get(ptep);
        if (!pte_present(pte))
                goto unlock;
        pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
                          pte_pfn(pte), PAGE_MASK, pte_write(pte),
                          pte_special(pte));
        return 0;
unlock:
        pte_unmap_unlock(ptep, lock);
out:
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(follow_pfnmap_start);

/**
 * follow_pfnmap_end(): End a follow_pfnmap_start() process
 * @args: Pointer to struct @follow_pfnmap_args
 *
 * Must be used in pair of follow_pfnmap_start().  See the start() function
 * above for more information.
 */
void follow_pfnmap_end(struct follow_pfnmap_args *args)
{
        if (args->lock)
                spin_unlock(args->lock);
        if (args->ptep)
                pte_unmap(args->ptep);
}
EXPORT_SYMBOL_GPL(follow_pfnmap_end);

#ifdef CONFIG_HAVE_IOREMAP_PROT
/**
 * generic_access_phys - generic implementation for iomem mmap access
 * @vma: the vma to access
 * @addr: userspace address, not relative offset within @vma
 * @buf: buffer to read/write
 * @len: length of transfer
 * @write: set to FOLL_WRITE when writing, otherwise reading
 *
 * This is a generic implementation for &vm_operations_struct.access for an
 * iomem mapping. This callback is used by access_process_vm() when the @vma is
 * not page based.
 */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write)
{
        resource_size_t phys_addr;
        pgprot_t prot = __pgprot(0);
        void __iomem *maddr;
        int offset = offset_in_page(addr);
        int ret = -EINVAL;
        bool writable;
        struct follow_pfnmap_args args = { .vma = vma, .address = addr };

retry:
        if (follow_pfnmap_start(&args))
                return -EINVAL;
        prot = args.pgprot;
        phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
        writable = args.writable;
        follow_pfnmap_end(&args);

        if ((write & FOLL_WRITE) && !writable)
                return -EINVAL;

        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
        if (!maddr)
                return -ENOMEM;

        if (follow_pfnmap_start(&args))
                goto out_unmap;

        if ((pgprot_val(prot) != pgprot_val(args.pgprot)) ||
            (phys_addr != (args.pfn << PAGE_SHIFT)) ||
            (writable != args.writable)) {
                follow_pfnmap_end(&args);
                iounmap(maddr);
                goto retry;
        }

        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
                memcpy_fromio(buf, maddr + offset, len);
        ret = len;
        follow_pfnmap_end(&args);
out_unmap:
        iounmap(maddr);

        return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm.
 *
 * Copies up to @len bytes between @buf and the address range starting at
 * @addr in @mm; the direction is selected by FOLL_WRITE in @gup_flags.
 * Takes mm's mmap_lock for reading for the duration of the transfer.
 *
 * Returns the number of bytes actually transferred, which may be short
 * (including 0) when the lock cannot be taken or part of the range cannot
 * be accessed.
 */
static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
                              void *buf, int len, unsigned int gup_flags)
{
        void *old_buf = buf;
        int write = gup_flags & FOLL_WRITE;

        if (mmap_read_lock_killable(mm))
                return 0;

        /* Untag the address before looking up the VMA */
        addr = untagged_addr_remote(mm, addr);

        /*
         * Avoid triggering the temporary warning in __get_user_pages.
         * No unlock on this return: per the note in the loop below,
         * expand_stack() drops the mmap_lock when it fails.
         */
        if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
                return 0;

        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, offset;
                void *maddr;
                struct folio *folio;
                struct vm_area_struct *vma = NULL;
                struct page *page = get_user_page_vma_remote(mm, addr,
                                                             gup_flags, &vma);

                if (IS_ERR(page)) {
                        /* We might need to expand the stack to access it */
                        vma = vma_lookup(mm, addr);
                        if (!vma) {
                                vma = expand_stack(mm, addr);

                                /* mmap_lock was dropped on failure */
                                if (!vma)
                                        return buf - old_buf;

                                /* Try again if stack expansion worked */
                                continue;
                        }

                        /*
                         * Check if this is a VM_IO | VM_PFNMAP VMA, which
                         * we can access using slightly different code.
                         */
                        bytes = 0;
#ifdef CONFIG_HAVE_IOREMAP_PROT
                        if (vma->vm_ops && vma->vm_ops->access)
                                bytes = vma->vm_ops->access(vma, addr, buf,
                                                            len, write);
#endif
                        /* No handler, or the handler failed: stop here. */
                        if (bytes <= 0)
                                break;
                } else {
                        /* Page-backed: copy at most up to the end of this page. */
                        folio = page_folio(page);
                        bytes = len;
                        offset = addr & (PAGE_SIZE-1);
                        if (bytes > PAGE_SIZE-offset)
                                bytes = PAGE_SIZE-offset;

                        maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
                        if (write) {
                                copy_to_user_page(vma, page, addr,
                                                  maddr + offset, buf, bytes);
                                folio_mark_dirty_lock(folio);
                        } else {
                                copy_from_user_page(vma, page, addr,
                                                    buf, maddr + offset, bytes);
                        }
                        folio_release_kmap(folio, maddr);
                }
                /* Advance past the chunk that was just transferred. */
                len -= bytes;
                buf += bytes;
                addr += bytes;
        }
        mmap_read_unlock(mm);

        return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:                the mm_struct of the target address space
 * @addr:        start address to access
 * @buf:        source or destination buffer
 * @len:        number of bytes to transfer
 * @gup_flags:        flags modifying lookup behaviour
 *
 * Thin wrapper around __access_remote_vm().  The caller must hold a
 * reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        int copied = __access_remote_vm(mm, addr, buf, len, gup_flags);

        return copied;
}

/*
 * Access the address space of the task @tsk.
 * The source/target buffer must be in kernel space.
 * Do not walk the page table directly, use get_user_pages.
 *
 * A reference on the task's mm is taken for the duration of the access.
 * Returns the number of bytes transferred, 0 if the task has no mm.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm = get_task_mm(tsk);
        int copied = 0;

        if (mm) {
                copied = __access_remote_vm(mm, addr, buf, len, gup_flags);
                mmput(mm);
        }

        return copied;
}
EXPORT_SYMBOL_GPL(access_process_vm);

#ifdef CONFIG_BPF_SYSCALL
/*
 * Copy a string from another process's address space as given in mm.
 * If there is any error return -EFAULT.
 *
 * @buf is NUL-terminated up front, so it holds an empty string if the
 * function bails out early.  The copy proceeds one page at a time under
 * mm's mmap_lock held for reading.
 *
 * On success returns the distance @buf was advanced (buf - old_buf).
 */
static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
                                void *buf, int len, unsigned int gup_flags)
{
        void *old_buf = buf;
        int err = 0;

        *(char *)buf = '\0';

        if (mmap_read_lock_killable(mm))
                return -EFAULT;

        addr = untagged_addr_remote(mm, addr);

        /* Avoid triggering the temporary warning in __get_user_pages */
        if (!vma_lookup(mm, addr)) {
                err = -EFAULT;
                goto out;
        }

        while (len) {
                int bytes, offset, retval;
                void *maddr;
                struct folio *folio;
                struct page *page;
                struct vm_area_struct *vma = NULL;

                page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
                if (IS_ERR(page)) {
                        /*
                         * Treat as a total failure for now until we decide how
                         * to handle the CONFIG_HAVE_IOREMAP_PROT case and
                         * stack expansion.
                         */
                        *(char *)buf = '\0';
                        err = -EFAULT;
                        goto out;
                }

                /* Limit this round to what is left of the current page. */
                folio = page_folio(page);
                bytes = len;
                offset = addr & (PAGE_SIZE - 1);
                if (bytes > PAGE_SIZE - offset)
                        bytes = PAGE_SIZE - offset;

                maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
                retval = strscpy(buf, maddr + offset, bytes);
                if (retval >= 0) {
                        /* Found the end of the string */
                        buf += retval;
                        folio_release_kmap(folio, maddr);
                        break;
                }

                /*
                 * Negative retval: presumably strscpy() truncated, leaving
                 * bytes - 1 characters plus a terminating NUL in buf --
                 * confirm against the strscpy() documentation.
                 */
                buf += bytes - 1;
                /*
                 * Because strscpy always NUL terminates we need to
                 * copy the last byte in the page if we are going to
                 * load more pages
                 */
                if (bytes != len) {
                        addr += bytes - 1;
                        copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1);
                        buf += 1;
                        addr += 1;
                }
                len -= bytes;

                folio_release_kmap(folio, maddr);
        }

out:
        mmap_read_unlock(mm);
        if (err)
                return err;
        return buf - old_buf;
}

/**
 * copy_remote_vm_str - copy a string from another process's address space.
 * @tsk:        the task of the target address space
 * @addr:        start address to read from
 * @buf:        destination buffer
 * @len:        number of bytes to copy
 * @gup_flags:        flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from @addr (source) to @buf (destination);
 * not including the trailing NUL. Always guaranteed to leave NUL-terminated
 * buffer. On any error, return -EFAULT.
 */
int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
                       void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *target_mm;
        int copied;

        /* Zero-length request: @buf is left untouched. */
        if (unlikely(!len))
                return 0;

        /* Pin the task's address space for the duration of the copy. */
        target_mm = get_task_mm(tsk);
        if (target_mm) {
                copied = __copy_remote_vm_str(target_mm, addr, buf, len,
                                              gup_flags);
                mmput(target_mm);
                return copied;
        }

        /* No address space to read from: hand back an empty string. */
        *(char *)buf = '\0';
        return -EFAULT;
}
EXPORT_SYMBOL_GPL(copy_remote_vm_str);
#endif /* CONFIG_BPF_SYSCALL */

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct file *f;

        /*
         * The caller may be in atomic context, so only try the lock and
         * silently give up when it is contended instead of sleeping.
         */
        if (!mmap_read_trylock(mm))
                return;

        vma = vma_lookup(mm, ip);
        if (!vma || !vma->vm_file)
                goto out_unlock;

        f = vma->vm_file;
        /* Turn the virtual address into an offset within the backing file. */
        ip = ip - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT);
        printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip,
                        vma->vm_start,
                        vma->vm_end - vma->vm_start);

out_unlock:
        mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
        /* With page faults disabled there is nothing to annotate. */
        if (!pagefault_disabled()) {
                __might_sleep(file, line);
                /* Only tasks that have an mm can take its mmap_lock. */
                if (current->mm)
                        might_lock_read(&current->mm->mmap_lock);
        }
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 *
 * @addr_hint:       address inside the huge page; it selects the target
 *                   subpage.
 * @nr_pages:        number of subpages in the huge page.
 * @process_subpage: callback invoked once per subpage with the subpage's
 *                   address, its index and @arg.
 * @arg:             opaque pointer handed through to @process_subpage.
 *
 * Subpages far from the target are handled first, then the remaining window
 * around the target is walked from both ends inwards so that the target is
 * the final subpage processed.
 *
 * Return: 0 when every callback returned 0, otherwise the first non-zero
 * value returned by @process_subpage (processing stops at that point).
 */
static inline int process_huge_page(
        unsigned long addr_hint, unsigned int nr_pages,
        int (*process_subpage)(unsigned long addr, int idx, void *arg),
        void *arg)
{
        int i, n, base, l, ret;
        /*
         * Start address of the huge page: @addr_hint with the low bits of
         * (nr_pages << PAGE_SHIFT) - 1 masked off.
         * NOTE(review): assumes nr_pages is a power of two - confirm.
         */
        unsigned long addr = addr_hint &
                ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1);

        /* Process target subpage last to keep its cache lines hot */
        might_sleep();
        /* n is the index of the target subpage within the huge page */
        n = (addr_hint - addr) / PAGE_SIZE;
        if (2 * n <= nr_pages) {
                /* If target subpage in first half of huge page */
                base = 0;
                l = n;
                /* Process subpages at the end of huge page */
                for (i = nr_pages - 1; i >= 2 * n; i--) {
                        cond_resched();
                        ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
                        if (ret)
                                return ret;
                }
        } else {
                /* If target subpage in second half of huge page */
                base = nr_pages - 2 * (nr_pages - n);
                l = nr_pages - n;
                /* Process subpages at the begin of huge page */
                for (i = 0; i < base; i++) {
                        cond_resched();
                        ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
                        if (ret)
                                return ret;
                }
        }
        /*
         * Process remaining subpages in left-right-left-right pattern
         * towards the target subpage
         *
         * The window still to do is [base, base + 2 * l).  In the last
         * iteration (i == l - 1) left_idx is n - 1 and right_idx is n, so
         * the target subpage is the very last one processed.
         */
        for (i = 0; i < l; i++) {
                int left_idx = base + i;
                int right_idx = base + 2 * l - 1 - i;

                cond_resched();
                ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
                if (ret)
                        return ret;
                cond_resched();
                ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
                if (ret)
                        return ret;
        }
        return 0;
}

/*
 * Zero every subpage of @folio in ascending order, yielding the CPU between
 * subpages.  @addr_hint is rounded down to the folio size to obtain the user
 * address that corresponds to subpage 0.
 */
static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
                                unsigned int nr_pages)
{
        unsigned long vaddr = ALIGN_DOWN(addr_hint, folio_size(folio));
        int idx = 0;

        might_sleep();
        while (idx < nr_pages) {
                cond_resched();
                clear_user_highpage(folio_page(folio, idx), vaddr);
                vaddr += PAGE_SIZE;
                idx++;
        }
}

/*
 * process_huge_page() callback: zero subpage @idx of the folio passed in
 * @arg.  Always returns 0.
 */
static int clear_subpage(unsigned long addr, int idx, void *arg)
{
        clear_user_highpage(folio_page((struct folio *)arg, idx), addr);
        return 0;
}

/**
 * folio_zero_user - Zero a folio which will be mapped to userspace.
 * @folio: The folio to zero.
 * @addr_hint: The address that will be accessed, or the base address if
 *             unclear.
 */
void folio_zero_user(struct folio *folio, unsigned long addr_hint)
{
        unsigned int nr_pages = folio_nr_pages(folio);

        /* Common case: clear in the order that leaves the hinted subpage last. */
        if (likely(nr_pages <= MAX_ORDER_NR_PAGES)) {
                process_huge_page(addr_hint, nr_pages, clear_subpage, folio);
                return;
        }

        /* Folios above MAX_ORDER_NR_PAGES are cleared front to back. */
        clear_gigantic_page(folio, addr_hint, nr_pages);
}

/*
 * Copy @nr_pages subpages from @src to @dst in ascending order.
 *
 * Return: 0 on success, -EHWPOISON as soon as copy_mc_user_highpage()
 * reports a failure for a subpage.
 */
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
                                   unsigned long addr_hint,
                                   struct vm_area_struct *vma,
                                   unsigned int nr_pages)
{
        unsigned long vaddr = ALIGN_DOWN(addr_hint, folio_size(dst));
        int idx;

        for (idx = 0; idx < nr_pages; idx++, vaddr += PAGE_SIZE) {
                struct page *to = folio_page(dst, idx);
                struct page *from = folio_page(src, idx);

                cond_resched();
                if (copy_mc_user_highpage(to, from, vaddr, vma))
                        return -EHWPOISON;
        }
        return 0;
}

/*
 * Argument bundle that copy_user_large_folio() hands to copy_subpage()
 * through the void *arg of process_huge_page().
 */
struct copy_subpage_arg {
        struct folio *dst;
        struct folio *src;
        struct vm_area_struct *vma;
};

/*
 * process_huge_page() callback: copy subpage @idx from the source folio to
 * the destination folio described by @arg (a struct copy_subpage_arg).
 *
 * Return: 0 on success, -EHWPOISON when copy_mc_user_highpage() fails.
 */
static int copy_subpage(unsigned long addr, int idx, void *arg)
{
        struct copy_subpage_arg *ctx = arg;

        return copy_mc_user_highpage(folio_page(ctx->dst, idx),
                                     folio_page(ctx->src, idx),
                                     addr, ctx->vma) ? -EHWPOISON : 0;
}

/*
 * Copy the contents of large folio @src into @dst.  The number of subpages
 * is taken from @dst.
 *
 * Return: 0 on success, or the error reported by the per-subpage copy
 * (-EHWPOISON).
 */
int copy_user_large_folio(struct folio *dst, struct folio *src,
                          unsigned long addr_hint, struct vm_area_struct *vma)
{
        unsigned int nr_pages = folio_nr_pages(dst);
        struct copy_subpage_arg arg;

        /* Folios above MAX_ORDER_NR_PAGES are copied front to back. */
        if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
                return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages);

        arg.dst = dst;
        arg.src = src;
        arg.vma = vma;

        return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg);
}

/*
 * Fill @dst_folio one subpage at a time from the user buffer @usr_src.
 * When @allow_pagefault is false, page faults are disabled around each
 * copy_from_user() call.
 *
 * Return: the number of bytes of the folio that were not copied; 0 when
 * the whole folio was filled.  Copying stops at the first subpage that
 * could not be copied completely.
 */
long copy_folio_from_user(struct folio *dst_folio,
                           const void __user *usr_src,
                           bool allow_pagefault)
{
        unsigned int nr_pages = folio_nr_pages(dst_folio);
        unsigned long remaining = nr_pages * PAGE_SIZE;
        unsigned long idx;

        for (idx = 0; idx < nr_pages; idx++) {
                struct page *subpage = folio_page(dst_folio, idx);
                void *kaddr = kmap_local_page(subpage);
                unsigned long uncopied;

                if (!allow_pagefault)
                        pagefault_disable();
                uncopied = copy_from_user(kaddr, usr_src + idx * PAGE_SIZE,
                                          PAGE_SIZE);
                if (!allow_pagefault)
                        pagefault_enable();
                kunmap_local(kaddr);

                /* Account for whatever part of this subpage did get copied. */
                remaining -= PAGE_SIZE - uncopied;
                if (uncopied)
                        break;

                flush_dcache_page(subpage);

                cond_resched();
        }
        return remaining;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS

/* Slab cache from which ptlock_alloc() takes the page-table spinlocks. */
static struct kmem_cache *page_ptl_cachep;

/*
 * Create the "page->ptl" slab cache, sized for one spinlock_t per object.
 * SLAB_PANIC is passed and the result is not checked here.
 */
void __init ptlock_cache_init(void)
{
        page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
                        SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct ptdesc *ptdesc)
{
        spinlock_t *ptl;

        ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
        if (!ptl)
                return false;
        ptdesc->ptl = ptl;
        return true;
}

/*
 * Give the lock attached to @ptdesc, if any, back to page_ptl_cachep.
 * @ptdesc->ptl itself is not cleared here.
 */
void ptlock_free(struct ptdesc *ptdesc)
{
        if (ptdesc->ptl)
                kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif

/*
 * Take the hugetlb VMA lock for reading when @vma is a hugetlb mapping;
 * does nothing for any other VMA.  Paired with vma_pgtable_walk_end().
 */
void vma_pgtable_walk_begin(struct vm_area_struct *vma)
{
        if (is_vm_hugetlb_page(vma))
                hugetlb_vma_lock_read(vma);
}

/*
 * Drop the hugetlb VMA read lock when @vma is a hugetlb mapping; does
 * nothing for any other VMA.  Counterpart of vma_pgtable_walk_begin().
 */
void vma_pgtable_walk_end(struct vm_area_struct *vma)
{
        if (is_vm_hugetlb_page(vma))
                hugetlb_vma_unlock_read(vma);
}





























































































































  148 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_X86_XSAVE_H
#define __ASM_X86_XSAVE_H

#include <linux/uaccess.h>
#include <linux/types.h>

#include <asm/processor.h>
#include <asm/fpu/api.h>
#include <asm/user.h>

/*
 * Bit 63 of XCR0 is reserved for future expansion.
 * The mask therefore covers every feature bit except those in
 * XFEATURE_MASK_FPSSE and bit 63.
 */
#define XFEATURE_MASK_EXTEND        (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))

/* Size in bytes of the FXSAVE area */
#define FXSAVE_SIZE        512

/* The XSAVE header is 64 bytes and starts right after the FXSAVE area */
#define XSAVE_HDR_SIZE            64
#define XSAVE_HDR_OFFSET    FXSAVE_SIZE

/* The YMM area is 256 bytes and starts right after the XSAVE header (512 + 64) */
#define XSAVE_YMM_SIZE            256
#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)

/* NOTE(review): presumably the required byte alignment of an XSAVE buffer - confirm */
#define XSAVE_ALIGNMENT     64

/* All currently supported user features */
#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
                                      XFEATURE_MASK_SSE | \
                                      XFEATURE_MASK_YMM | \
                                      XFEATURE_MASK_OPMASK | \
                                      XFEATURE_MASK_ZMM_Hi256 | \
                                      XFEATURE_MASK_Hi16_ZMM         | \
                                      XFEATURE_MASK_PKRU | \
                                      XFEATURE_MASK_BNDREGS | \
                                      XFEATURE_MASK_BNDCSR | \
                                      XFEATURE_MASK_XTILE | \
                                      XFEATURE_MASK_APX)

/*
 * Features which are restored when returning to user space.
 * PKRU is not restored on return to user space because PKRU
 * is switched eagerly in switch_to() and flush_thread()
 */
#define XFEATURE_MASK_USER_RESTORE        \
        (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU)

/* Features which are dynamically enabled for a process on request */
#define XFEATURE_MASK_USER_DYNAMIC        XFEATURE_MASK_XTILE_DATA

/* Supervisor features which are enabled only in guest FPUs */
#define XFEATURE_MASK_GUEST_SUPERVISOR        XFEATURE_MASK_CET_KERNEL

/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
                                            XFEATURE_MASK_CET_USER | \
                                            XFEATURE_MASK_GUEST_SUPERVISOR)

/*
 * A supervisor state component may not always contain valuable information,
 * and its size may be huge. Saving/restoring such supervisor state components
 * at each context switch can cause high CPU and space overhead, which should
 * be avoided. Such supervisor state components should only be saved/restored
 * on demand. The on-demand supervisor features are set in this mask.
 *
 * Unlike the existing supported supervisor features, an independent supervisor
 * feature does not allocate a buffer in task->fpu, and the corresponding
 * supervisor state component cannot be saved/restored at each context switch.
 *
 * To support an independent supervisor feature, a developer should follow the
 * dos and don'ts as below:
 * - Do dynamically allocate a buffer for the supervisor state component.
 * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
 *   state component to/from the buffer.
 * - Don't set the bit corresponding to the independent supervisor feature in
 *   IA32_XSS at run time, since it has been set at boot time.
 */
#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR)

/*
 * Unsupported supervisor features. When a supervisor feature in this mask is
 * supported in the future, move it to the supported supervisor feature mask.
 */
#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)

/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
                                      XFEATURE_MASK_INDEPENDENT | \
                                      XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)

/*
 * The feature mask required to restore FPU state:
 * - All user states which are not eagerly switched in switch_to()/exec()
 * - The supported supervisor states
 */
#define XFEATURE_MASK_FPSTATE        (XFEATURE_MASK_USER_RESTORE | \
                                 XFEATURE_MASK_SUPERVISOR_SUPPORTED)

/*
 * Features in this mask have space allocated in the signal frame, but may not
 * have that space initialized when the feature is in its init state.
 */
#define XFEATURE_MASK_SIGFRAME_INITOPT        (XFEATURE_MASK_XTILE | \
                                         XFEATURE_MASK_USER_DYNAMIC)

extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];

extern void __init update_regset_xstate_info(unsigned int size,
                                             u64 xstate_mask);

int xfeature_size(int xfeature_nr);

void xsaves(struct xregs_state *xsave, u64 mask);
void xrstors(struct xregs_state *xsave, u64 mask);

int xfd_enable_feature(u64 xfd_err);

#ifdef CONFIG_X86_64
/* Static key tested by fpu_state_size_dynamic(); declared once for 64-bit builds. */
DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);

/*
 * Return: true when the __fpu_state_size_dynamic static key is enabled.
 * The branch is laid out as unlikely.
 */
static __always_inline __pure bool fpu_state_size_dynamic(void)
{
        return static_branch_unlikely(&__fpu_state_size_dynamic);
}
#else
/* Without CONFIG_X86_64 there is no such key: always false. */
static __always_inline __pure bool fpu_state_size_dynamic(void)
{
        return false;
}
#endif

#endif
















































































































































































































































  268 

























































  312 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sock

#if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SOCK_H

#include <net/sock.h>
#include <net/ipv6.h>
#include <linux/tracepoint.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <trace/events/net_probe_common.h>

/*
 * Symbol lists used by the events below.  Every entry is wrapped in EM(),
 * except the last one of each list which uses EMe(); the two macros are
 * (re)defined further down so that each list can be expanded twice.
 */
#define family_names                        \
                EM(AF_INET)                                \
                EMe(AF_INET6)

/* The protocol traced by inet_sock_set_state */
#define inet_protocol_names                \
                EM(IPPROTO_TCP)                        \
                EM(IPPROTO_SCTP)                \
                EMe(IPPROTO_MPTCP)

/* States shown for the oldstate/newstate fields of inet_sock_set_state */
#define tcp_state_names                        \
                EM(TCP_ESTABLISHED)                \
                EM(TCP_SYN_SENT)                \
                EM(TCP_SYN_RECV)                \
                EM(TCP_FIN_WAIT1)                \
                EM(TCP_FIN_WAIT2)                \
                EM(TCP_TIME_WAIT)                \
                EM(TCP_CLOSE)                        \
                EM(TCP_CLOSE_WAIT)                \
                EM(TCP_LAST_ACK)                \
                EM(TCP_LISTEN)                        \
                EM(TCP_CLOSING)                        \
                EMe(TCP_NEW_SYN_RECV)

/* Values shown for the kind field of sock_exceed_buf_limit */
#define skmem_kind_names                        \
                EM(SK_MEM_SEND)                        \
                EMe(SK_MEM_RECV)

/*
 * enums need to be exported to user space
 *
 * First expansion: EM()/EMe() both emit TRACE_DEFINE_ENUM() so that every
 * value in the lists is exported.
 */
#undef EM
#undef EMe
#define EM(a)       TRACE_DEFINE_ENUM(a);
#define EMe(a)      TRACE_DEFINE_ENUM(a);

family_names
inet_protocol_names
tcp_state_names
skmem_kind_names

/*
 * Second expansion: EM()/EMe() now produce { value, "name" } pairs for
 * __print_symbolic().  EMe() omits the trailing comma, which is why it
 * marks the last entry of each list.
 */
#undef EM
#undef EMe
#define EM(a)       { a, #a },
#define EMe(a)      { a, #a }

/* Helpers translating a recorded value into its symbolic name */
#define show_family_name(val)                        \
        __print_symbolic(val, family_names)

#define show_inet_protocol_name(val)    \
        __print_symbolic(val, inet_protocol_names)

#define show_tcp_state_name(val)        \
        __print_symbolic(val, tcp_state_names)

#define show_skmem_kind_names(val)        \
        __print_symbolic(val, skmem_kind_names)

/*
 * sock_rcvqueue_full - snapshot of receive-buffer accounting for @sk and @skb.
 *
 * Records the socket's current sk_rmem_alloc, the skb's truesize and the
 * socket's sk_rcvbuf.
 * NOTE(review): going by the name, emitted when the receive queue is full;
 * the call site is not part of this header.
 */
TRACE_EVENT(sock_rcvqueue_full,

        TP_PROTO(struct sock *sk, struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                __field(int, rmem_alloc)
                __field(unsigned int, truesize)
                __field(int, sk_rcvbuf)
        ),

        TP_fast_assign(
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->truesize   = skb->truesize;
                __entry->sk_rcvbuf  = READ_ONCE(sk->sk_rcvbuf);
        ),

        TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d",
                __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf)
);

/*
 * sock_exceed_buf_limit - memory accounting state of @sk and its protocol.
 *
 * Records the protocol name (copied with strscpy() into a 32-byte array),
 * the three prot->sysctl_mem values, @allocated, the rmem/wmem values
 * returned by sk_get_rmem0()/sk_get_wmem0(), the socket's current
 * rmem_alloc, wmem_alloc and wmem_queued, and @kind.  @kind is printed
 * symbolically using skmem_kind_names.
 */
TRACE_EVENT(sock_exceed_buf_limit,

        TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind),

        TP_ARGS(sk, prot, allocated, kind),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __array(long, sysctl_mem, 3)
                __field(long, allocated)
                __field(int, sysctl_rmem)
                __field(int, rmem_alloc)
                __field(int, sysctl_wmem)
                __field(int, wmem_alloc)
                __field(int, wmem_queued)
                __field(int, kind)
        ),

        TP_fast_assign(
                strscpy(__entry->name, prot->name, 32);
                __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]);
                __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]);
                __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]);
                __entry->allocated = allocated;
                __entry->sysctl_rmem = sk_get_rmem0(sk, prot);
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->sysctl_wmem = sk_get_wmem0(sk, prot);
                __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
                __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued);
                __entry->kind = kind;
        ),

        TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
                __entry->name,
                __entry->sysctl_mem[0],
                __entry->sysctl_mem[1],
                __entry->sysctl_mem[2],
                __entry->allocated,
                __entry->sysctl_rmem,
                __entry->rmem_alloc,
                __entry->sysctl_wmem,
                __entry->wmem_alloc,
                __entry->wmem_queued,
                show_skmem_kind_names(__entry->kind)
        )
);

/*
 * inet_sock_set_state - state transition of an inet socket.
 *
 * Records the socket pointer, @oldstate and @newstate, the family and
 * protocol, the source/destination ports converted to host byte order, and
 * the IPv4 source/destination addresses copied verbatim into 4-byte arrays.
 * The 16-byte saddr_v6/daddr_v6 arrays are filled by TP_STORE_ADDRS(),
 * which is defined outside this header (presumably in
 * trace/events/net_probe_common.h).
 *
 * Family, protocol and both states are printed symbolically.  The recorded
 * skaddr is not part of the printed line.
 */
TRACE_EVENT(inet_sock_set_state,

        TP_PROTO(const struct sock *sk, const int oldstate, const int newstate),

        TP_ARGS(sk, oldstate, newstate),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(int, oldstate)
                __field(int, newstate)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __field(__u16, protocol)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->skaddr = sk;
                __entry->oldstate = oldstate;
                __entry->newstate = newstate;

                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                               sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
        ),

        TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
                        show_family_name(__entry->family),
                        show_inet_protocol_name(__entry->protocol),
                        __entry->sport, __entry->dport,
                        __entry->saddr, __entry->daddr,
                        __entry->saddr_v6, __entry->daddr_v6,
                        show_tcp_state_name(__entry->oldstate),
                        show_tcp_state_name(__entry->newstate))
);

/*
 * inet_sk_error_report - error state of an inet socket.
 *
 * Records sk->sk_err together with the family, protocol, host-order
 * source/destination ports and the IPv4 addresses of @sk.  The 16-byte
 * saddr_v6/daddr_v6 arrays are filled by TP_STORE_ADDRS(), which is defined
 * outside this header.
 */
TRACE_EVENT(inet_sk_error_report,

        TP_PROTO(const struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(int, error)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __field(__u16, protocol)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->error = sk->sk_err;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                               sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
        ),

        TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d",
                  show_family_name(__entry->family),
                  show_inet_protocol_name(__entry->protocol),
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
                  __entry->error)
);

/*
 * sk_data_ready - records which code path emitted the event for @sk.
 *
 * Stores the socket pointer, family, protocol and _RET_IP_.  The stored
 * address is printed as a symbol (%ps); family and protocol are printed as
 * raw numbers, and the recorded skaddr is not part of the printed line.
 */
TRACE_EVENT(sk_data_ready,

        TP_PROTO(const struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(__u16, family)
                __field(__u16, protocol)
                __field(unsigned long, ip)
        ),

        TP_fast_assign(
                __entry->skaddr = sk;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->ip = _RET_IP_;
        ),

        TP_printk("family=%u protocol=%u func=%ps",
                  __entry->family, __entry->protocol, (void *)__entry->ip)
);

/*
 * sock send/recv msg length
 *
 * Event class shared by sock_send_length and sock_recv_length.  @ret is
 * split into two printed values:
 *   length - @ret when it is positive and MSG_PEEK is not set in @flags,
 *            otherwise 0
 *   error  - @ret when it is negative, otherwise 0
 */
DECLARE_EVENT_CLASS(sock_msg_length,

        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags),

        TP_STRUCT__entry(
                __field(void *, sk)
                __field(__u16, family)
                __field(__u16, protocol)
                __field(int, ret)
                __field(int, flags)
        ),

        TP_fast_assign(
                __entry->sk = sk;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->ret = ret;
                __entry->flags = flags;
        ),

        TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
                  __entry->sk, show_family_name(__entry->family),
                  show_inet_protocol_name(__entry->protocol),
                  !(__entry->flags & MSG_PEEK) ?
                  (__entry->ret > 0 ? __entry->ret : 0) : 0,
                  __entry->ret < 0 ? __entry->ret : 0,
                  __entry->flags)
);

/* sock_send_length - instance of the sock_msg_length class (send side, going by the name) */
DEFINE_EVENT(sock_msg_length, sock_send_length,
        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags)
);

/* sock_recv_length - instance of the sock_msg_length class (receive side, going by the name) */
DEFINE_EVENT(sock_msg_length, sock_recv_length,
        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags)
);
#endif /* _TRACE_SOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_SCHED_GENERIC_H
#define __NET_SCHED_GENERIC_H

#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/pkt_sched.h>
#include <linux/pkt_cls.h>
#include <linux/percpu.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <net/gen_stats.h>
#include <net/rtnetlink.h>
#include <net/flow_offload.h>
#include <linux/xarray.h>

struct Qdisc_ops;
struct qdisc_walker;
struct tcf_walker;
struct module;
struct bpf_flow_keys;

/* Rate table entry: a tc_ratespec together with a 256-slot u32 array.
 * Entries also carry a 'next' link and a plain integer reference count;
 * neither is manipulated in this part of the header.
 */
struct qdisc_rate_table {
        struct tc_ratespec rate;
        u32                data[256];
        struct qdisc_rate_table *next;
        int                refcnt;
};

/* Bit numbers within Qdisc->state. They are used with the bit helpers
 * (e.g. test_bit()/test_and_set_bit() on &qdisc->state) and via the
 * QDISC_STATE_* masks defined below.
 */
enum qdisc_state_t {
        __QDISC_STATE_SCHED,
        __QDISC_STATE_DEACTIVATED,
        __QDISC_STATE_MISSED,
        __QDISC_STATE_DRAINING,
};

/* Bit numbers within Qdisc->state2. */
enum qdisc_state2_t {
        /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
         * Use qdisc_run_begin/end() or qdisc_is_running() instead.
         */
        __QDISC_STATE2_RUNNING,
};

#define QDISC_STATE_MISSED        BIT(__QDISC_STATE_MISSED)
#define QDISC_STATE_DRAINING        BIT(__QDISC_STATE_DRAINING)

#define QDISC_STATE_NON_EMPTY        (QDISC_STATE_MISSED | \
                                        QDISC_STATE_DRAINING)

/* Size table: a tc_sizespec plus a trailing flexible array of u16 entries.
 * It embeds an rcu_head and a list_head and carries a plain integer
 * reference count. Struct Qdisc points at one through its __rcu 'stab'
 * member.
 */
struct qdisc_size_table {
        struct rcu_head                rcu;
        struct list_head        list;
        struct tc_sizespec        szopts;
        int                        refcnt;
        u16                        data[];
};

/* Similar to sk_buff_head, but the skb->prev pointer is undefined.
 * The embedded 'lock' is the spinlock that qdisc_lock() returns.
 */
struct qdisc_skb_head {
        struct sk_buff        *head;
        struct sk_buff        *tail;
        __u32                qlen;
        spinlock_t        lock;
};

/* A queueing discipline instance.
 *
 * The enqueue/dequeue hooks and the TCQ_F_* flag word come first. Fields
 * that are modified frequently are grouped after the cacheline-aligned
 * gso_skb member (see the comment inside the struct). privdata[] is
 * trailing storage for private data.
 */
struct Qdisc {
        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *sch);
        unsigned int                flags;
#define TCQ_F_BUILTIN                1
#define TCQ_F_INGRESS                2
#define TCQ_F_CAN_BYPASS        4
#define TCQ_F_MQROOT                8
#define TCQ_F_ONETXQUEUE        0x10 /* dequeue_skb() can assume all skbs are for
                                      * q->dev_queue : It can test
                                      * netif_xmit_frozen_or_stopped() before
                                      * dequeueing next packet.
                                      * It's true for MQ/MQPRIO slaves, or non
                                      * multiqueue device.
                                      */
#define TCQ_F_WARN_NONWC        (1 << 16)
#define TCQ_F_CPUSTATS                0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT                0x40 /* root of its hierarchy :
                                      * qdisc_tree_decrease_qlen() should stop.
                                      */
#define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK                0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED                0x200 /* qdisc is offloaded to HW */
        u32                        limit;
        const struct Qdisc_ops        *ops;
        struct qdisc_size_table        __rcu *stab;
        struct hlist_node       hash;
        u32                        handle;
        u32                        parent;

        struct netdev_queue        *dev_queue;

        struct net_rate_estimator __rcu *rate_est;
        struct gnet_stats_basic_sync __percpu *cpu_bstats;
        struct gnet_stats_queue        __percpu *cpu_qstats;
        int                        pad;
        refcount_t                refcnt;

        /*
         * For performance sake on SMP, we put highly modified fields at the end
         */
        struct sk_buff_head        gso_skb ____cacheline_aligned_in_smp;
        struct qdisc_skb_head        q;
        struct gnet_stats_basic_sync bstats;
        struct gnet_stats_queue        qstats;
        int                     owner;
        /* bit numbers from enum qdisc_state_t */
        unsigned long                state;
        unsigned long                state2; /* must be written under qdisc spinlock */
        struct Qdisc            *next_sched;
        struct sk_buff_head        skb_bad_txq;

        spinlock_t                busylock ____cacheline_aligned_in_smp;
        /* taken with spin_trylock() by qdisc_run_begin() for TCQ_F_NOLOCK */
        spinlock_t                seqlock;

        struct rcu_head                rcu;
        netdevice_tracker        dev_tracker;
        struct lock_class_key        root_lock_key;
        /* private data */
        long privdata[] ____cacheline_aligned;
};

/* Take a reference on @qdisc. Qdiscs flagged TCQ_F_BUILTIN are left
 * untouched: their refcount is never modified here.
 */
static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_BUILTIN))
                refcount_inc(&qdisc->refcnt);
}

/* Returns true for TCQ_F_BUILTIN qdiscs without touching the refcount;
 * otherwise returns the result of refcount_dec_if_one() on @qdisc->refcnt.
 */
static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
{
        return (qdisc->flags & TCQ_F_BUILTIN) ||
               refcount_dec_if_one(&qdisc->refcnt);
}

/* Meant for unlocked users, where the qdisc may be released concurrently.
 *
 * Returns @qdisc when it is TCQ_F_BUILTIN (no refcount change) or when the
 * refcount could be raised from a non-zero value; returns NULL otherwise.
 */
static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_BUILTIN) &&
            !refcount_inc_not_zero(&qdisc->refcnt))
                return NULL;

        return qdisc;
}

/* Report whether @qdisc is currently running.
 *
 * TCQ_F_NOLOCK qdiscs are running while their seqlock is held; all others
 * are running while __QDISC_STATE2_RUNNING is set in state2.
 *
 * For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc
 * root_lock section, or provide their own memory barriers -- ordering
 * against qdisc_run_begin/end() atomic bit operations.
 */
static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
        return (qdisc->flags & TCQ_F_NOLOCK) ?
                spin_is_locked(&qdisc->seqlock) :
                test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
}

/* True when neither the MISSED nor the DRAINING bit is set in a single
 * READ_ONCE() snapshot of @qdisc->state.
 */
static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
{
        return (READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) == 0;
}

/* True when @q has TCQ_F_CPUSTATS set, i.e. it runs using percpu statistics. */
static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
{
        return (q->flags & TCQ_F_CPUSTATS) != 0;
}

/* Emptiness test for @qdisc.
 *
 * Qdiscs with percpu statistics are judged by their state bits (see
 * nolock_qdisc_is_empty()); every other qdisc is empty when a READ_ONCE()
 * of q.qlen yields zero.
 */
static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
{
        if (!qdisc_is_percpu_stats(qdisc))
                return READ_ONCE(qdisc->q.qlen) == 0;

        return nolock_qdisc_is_empty(qdisc);
}

/* Try to become the runner of @qdisc; returns true on success.
 *
 * TCQ_F_NOLOCK: success means the seqlock was acquired with spin_trylock().
 * On contention the MISSED bit is set so the current holder can notice it
 * in qdisc_run_end(), and the trylock is attempted once more.
 *
 * Otherwise: success means __QDISC_STATE2_RUNNING was clear and has now
 * been set (non-atomic bit op).
 *
 * For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with
 * the qdisc root lock acquired.
 */
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_NOLOCK) {
                if (spin_trylock(&qdisc->seqlock))
                        return true;

                /* No need to insist if the MISSED flag was already set.
                 * Note that test_and_set_bit() also gives us memory ordering
                 * guarantees wrt potential earlier enqueue() and below
                 * spin_trylock(), both of which are necessary to prevent races
                 */
                if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state))
                        return false;

                /* Try to take the lock again to make sure that we will either
                 * grab it or the CPU that still has it will see MISSED set
                 * when testing it in qdisc_run_end()
                 */
                return spin_trylock(&qdisc->seqlock);
        }
        return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
}

/* Counterpart of qdisc_run_begin().
 *
 * TCQ_F_NOLOCK: drops the seqlock, then reschedules the qdisc through
 * __netif_schedule() if the MISSED bit is seen set afterwards.
 * Otherwise: clears __QDISC_STATE2_RUNNING (non-atomic bit op).
 */
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_NOLOCK) {
                spin_unlock(&qdisc->seqlock);

                /* spin_unlock() only has store-release semantic. The unlock
                 * and test_bit() ordering is a store-load ordering, so a full
                 * memory barrier is needed here.
                 */
                smp_mb();

                if (unlikely(test_bit(__QDISC_STATE_MISSED,
                                      &qdisc->state)))
                        __netif_schedule(qdisc);
        } else {
                __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
        }
}

/* True when @qdisc has TCQ_F_ONETXQUEUE set. */
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
{
        return (qdisc->flags & TCQ_F_ONETXQUEUE) != 0;
}

/* Thin wrapper: forwards @txq to netdev_queue_dql_avail() and returns its
 * result unchanged.
 */
static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq)
{
        int avail = netdev_queue_dql_avail(txq);

        return avail;
}

/* Table of class-related callbacks for a qdisc type; struct Qdisc_ops
 * refers to one through its cl_ops pointer. 'flags' takes values from
 * enum qdisc_class_ops_flags. In the callbacks, classes are identified
 * either by a u32 classid or by an opaque unsigned long value.
 */
struct Qdisc_class_ops {
        unsigned int                flags;
        /* Child qdisc manipulation */
        struct netdev_queue *        (*select_queue)(struct Qdisc *, struct tcmsg *);
        int                        (*graft)(struct Qdisc *, unsigned long cl,
                                        struct Qdisc *, struct Qdisc **,
                                        struct netlink_ext_ack *extack);
        struct Qdisc *                (*leaf)(struct Qdisc *, unsigned long cl);
        void                        (*qlen_notify)(struct Qdisc *, unsigned long);

        /* Class manipulation routines */
        unsigned long                (*find)(struct Qdisc *, u32 classid);
        int                        (*change)(struct Qdisc *, u32, u32,
                                        struct nlattr **, unsigned long *,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct Qdisc *, unsigned long,
                                          struct netlink_ext_ack *);
        void                        (*walk)(struct Qdisc *, struct qdisc_walker * arg);

        /* Filter manipulation */
        struct tcf_block *        (*tcf_block)(struct Qdisc *sch,
                                             unsigned long arg,
                                             struct netlink_ext_ack *extack);
        unsigned long                (*bind_tcf)(struct Qdisc *, unsigned long,
                                        u32 classid);
        void                        (*unbind_tcf)(struct Qdisc *, unsigned long);

        /* rtnetlink specific */
        int                        (*dump)(struct Qdisc *, unsigned long,
                                        struct sk_buff *skb, struct tcmsg*);
        int                        (*dump_stats)(struct Qdisc *, unsigned long,
                                        struct gnet_dump *);
};

/* Flag values for Qdisc_class_ops->flags. */

/* QDISC_CLASS_OPS_DOIT_UNLOCKED: the class ops implement an API that
 * doesn't require the rtnl lock.
 */
enum qdisc_class_ops_flags {
        QDISC_CLASS_OPS_DOIT_UNLOCKED = 1,
};

/* Description of a qdisc type: an IFNAMSIZ-sized id string, the size of
 * its private area, static flags, the optional class ops table and the
 * callback set. enqueue/dequeue use the same signatures as the hooks
 * stored in struct Qdisc.
 */
struct Qdisc_ops {
        struct Qdisc_ops        *next;
        const struct Qdisc_class_ops        *cl_ops;
        char                        id[IFNAMSIZ];
        int                        priv_size;
        unsigned int                static_flags;

        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *);
        struct sk_buff *        (*peek)(struct Qdisc *);

        int                        (*init)(struct Qdisc *sch, struct nlattr *arg,
                                        struct netlink_ext_ack *extack);
        void                        (*reset)(struct Qdisc *);
        void                        (*destroy)(struct Qdisc *);
        int                        (*change)(struct Qdisc *sch,
                                          struct nlattr *arg,
                                          struct netlink_ext_ack *extack);
        void                        (*attach)(struct Qdisc *sch);
        int                        (*change_tx_queue_len)(struct Qdisc *, unsigned int);
        void                        (*change_real_num_tx)(struct Qdisc *sch,
                                                      unsigned int new_real_tx);

        /* dump callbacks */
        int                        (*dump)(struct Qdisc *, struct sk_buff *);
        int                        (*dump_stats)(struct Qdisc *, struct gnet_dump *);

        /* accessors for ingress/egress block indexes */
        void                        (*ingress_block_set)(struct Qdisc *sch,
                                                     u32 block_index);
        void                        (*egress_block_set)(struct Qdisc *sch,
                                                    u32 block_index);
        u32                        (*ingress_block_get)(struct Qdisc *sch);
        u32                        (*egress_block_get)(struct Qdisc *sch);

        struct module                *owner;
};

/* Result slot passed to classify() callbacks. The (class, classid) pair and
 * the goto_tp pointer share storage through the union, so only one of the
 * two views is meaningful at a time.
 */
struct tcf_result {
        union {
                struct {
                        unsigned long        class;
                        u32                classid;
                };
                const struct tcf_proto *goto_tp;
        };
};

struct tcf_chain;

/* Callback table describing a classifier kind. 'kind' is an IFNAMSIZ-sized
 * name and 'flags' takes values from enum tcf_proto_ops_flags. The
 * classify() member has the same signature as tcf_proto->classify.
 */
struct tcf_proto_ops {
        struct list_head        head;
        char                        kind[IFNAMSIZ];

        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        int                        (*init)(struct tcf_proto*);
        void                        (*destroy)(struct tcf_proto *tp, bool rtnl_held,
                                           struct netlink_ext_ack *extack);

        void*                        (*get)(struct tcf_proto*, u32 handle);
        void                        (*put)(struct tcf_proto *tp, void *f);
        int                        (*change)(struct net *net, struct sk_buff *,
                                        struct tcf_proto*, unsigned long,
                                        u32 handle, struct nlattr **,
                                        void **, u32,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct tcf_proto *tp, void *arg,
                                          bool *last, bool rtnl_held,
                                          struct netlink_ext_ack *);
        /* expected from classifiers that set TCF_PROTO_OPS_DOIT_UNLOCKED */
        bool                        (*delete_empty)(struct tcf_proto *tp);
        void                        (*walk)(struct tcf_proto *tp,
                                        struct tcf_walker *arg, bool rtnl_held);
        int                        (*reoffload)(struct tcf_proto *tp, bool add,
                                             flow_setup_cb_t *cb, void *cb_priv,
                                             struct netlink_ext_ack *extack);
        void                        (*hw_add)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*hw_del)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*bind_class)(void *, u32, unsigned long,
                                              void *, unsigned long);
        void *                        (*tmplt_create)(struct net *net,
                                                struct tcf_chain *chain,
                                                struct nlattr **tca,
                                                struct netlink_ext_ack *extack);
        void                        (*tmplt_destroy)(void *tmplt_priv);
        void                        (*tmplt_reoffload)(struct tcf_chain *chain,
                                                   bool add,
                                                   flow_setup_cb_t *cb,
                                                   void *cb_priv);
        struct tcf_exts *        (*get_exts)(const struct tcf_proto *tp,
                                            u32 handle);

        /* rtnetlink specific */
        int                        (*dump)(struct net*, struct tcf_proto*, void *,
                                        struct sk_buff *skb, struct tcmsg*,
                                        bool);
        int                        (*terse_dump)(struct net *net,
                                              struct tcf_proto *tp, void *fh,
                                              struct sk_buff *skb,
                                              struct tcmsg *t, bool rtnl_held);
        int                        (*tmplt_dump)(struct sk_buff *skb,
                                              struct net *net,
                                              void *tmplt_priv);

        struct module                *owner;
        int                        flags;
};

/* Flag values for tcf_proto_ops->flags.
 *
 * Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags
 * are expected to implement tcf_proto_ops->delete_empty(), otherwise race
 * conditions can occur when filters are inserted/deleted simultaneously.
 */
enum tcf_proto_ops_flags {
        TCF_PROTO_OPS_DOIT_UNLOCKED = 1,
};

/* A classifier instance attached to a tcf_chain. The members used on the
 * fast path (next, root, classify, protocol) are placed first; the rest
 * is bookkeeping.
 */
struct tcf_proto {
        /* Fast access part */
        struct tcf_proto __rcu        *next;
        void __rcu                *root;

        /* called under RCU BH lock */
        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        __be16                        protocol;

        /* All the rest */
        u32                        prio;
        void                        *data;
        const struct tcf_proto_ops        *ops;
        struct tcf_chain        *chain;
        /* Lock protects tcf_proto shared state and can be used by unlocked
         * classifiers to protect their private data.
         */
        spinlock_t                lock;
        bool                        deleting;
        bool                        counted;
        bool                        usesw;
        refcount_t                refcnt;
        struct rcu_head                rcu;
        struct hlist_node        destroy_ht_node;
};

/* Layout that qdisc_skb_cb() overlays on skb->cb: a small fixed header
 * (pkt_len is what qdisc_pkt_len() reads) followed by QDISC_CB_PRIV_LEN
 * bytes of private data. qdisc_cb_private_validate() checks at build time
 * that this fits in skb->cb.
 */
struct qdisc_skb_cb {
        struct {
                unsigned int                pkt_len;
                u16                        slave_dev_queue_mapping;
                u16                        tc_classid;
        };
#define QDISC_CB_PRIV_LEN 20
        unsigned char                data[QDISC_CB_PRIV_LEN];
};

/* Callback type taking a tcf_proto (the chain head, per the parameter name)
 * and an opaque private pointer. NOTE(review): presumably invoked when a
 * chain's head filter changes -- confirm at the call sites.
 */
typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);

/* A chain of classifiers (filter_chain) belonging to a tcf_block.
 * Per the comment in struct tcf_block, the block's lock covers the
 * lifetime-management fields refcnt, action_refcnt and explicitly_created.
 */
struct tcf_chain {
        /* Protects filter_chain. */
        struct mutex filter_chain_lock;
        struct tcf_proto __rcu *filter_chain;
        struct list_head list;
        struct tcf_block *block;
        u32 index; /* chain index */
        unsigned int refcnt;
        unsigned int action_refcnt;
        bool explicitly_created;
        bool flushing;
        const struct tcf_proto_ops *tmplt_ops;
        void *tmplt_priv;
        struct rcu_head rcu;
};

/* A block of filter chains. It holds the list of chains, offload
 * bookkeeping counters and a hashtable (proto_destroy_ht) guarded by
 * proto_destroy_lock.
 */
struct tcf_block {
        struct xarray ports; /* datapath accessible */
        /* Lock protects tcf_block and lifetime-management data of chains
         * attached to the block (refcnt, action_refcnt, explicitly_created).
         */
        struct mutex lock;
        struct list_head chain_list;
        u32 index; /* block index for shared blocks */
        u32 classid; /* which class this block belongs to */
        refcount_t refcnt;
        struct net *net;
        struct Qdisc *q;
        struct rw_semaphore cb_lock; /* protects cb_list and offload counters */
        struct flow_block flow_block;
        struct list_head owner_list;
        bool keep_dst;
        atomic_t useswcnt;
        atomic_t offloadcnt; /* Number of offloaded filters */
        unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
        unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
        struct {
                struct tcf_chain *chain;
                struct list_head filter_chain_list;
        } chain0;
        struct rcu_head rcu;
        DECLARE_HASHTABLE(proto_destroy_ht, 7);
        struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */
};

/* Declaration only; the definition lives outside this header. Takes a
 * net namespace and a block index and returns a tcf_block pointer.
 */
struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index);

/* Lockdep predicate: is @chain->filter_chain_lock held? Used as the
 * condition of tcf_chain_dereference().
 */
static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain)
{
        bool held = lockdep_is_held(&chain->filter_chain_lock);

        return held;
}

/* Lockdep predicate: is @tp->lock held? Used as the condition of
 * tcf_proto_dereference().
 */
static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
{
        bool held = lockdep_is_held(&tp->lock);

        return held;
}

/* Dereference RCU pointer @p under the protection of @chain's
 * filter_chain_lock (checked through lockdep).
 */
#define tcf_chain_dereference(p, chain)                                        \
        rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain))

/* Dereference RCU pointer @p under the protection of @tp's lock
 * (checked through lockdep).
 */
#define tcf_proto_dereference(p, tp)                                        \
        rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp))

/* Build-time sanity checks only; nothing is read or written at run time.
 *
 * Fails the build if struct qdisc_skb_cb does not fit in skb->cb, or if
 * @sz bytes do not fit in the private data[] area of struct qdisc_skb_cb.
 * @skb is used solely for sizeof(skb->cb).
 */
static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
{
        struct qdisc_skb_cb *qcb;

        BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb));
        BUILD_BUG_ON(sizeof(qcb->data) < sz);
}

/* Returns the qlen counter of @q's embedded qdisc_skb_head, as an int. */
static inline int qdisc_qlen(const struct Qdisc *q)
{
        int len = q->q.qlen;

        return len;
}

static inline int qdisc_qlen_sum(const struct Qdisc *q)
{
        __u32 qlen = q->qstats.qlen;
        int i;

        if (qdisc_is_percpu_stats(q)) {
                for_each_possible_cpu(i)
                        qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
        } else {
                qlen += q->q.qlen;
        }

        return qlen;
}

/* Reinterpret @skb's cb area as a struct qdisc_skb_cb. The const
 * qualifier of @skb is dropped on the returned pointer.
 */
static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
{
        void *cb = (void *)skb->cb;

        return cb;
}

/* Returns the spinlock embedded in @qdisc's skb head (q.lock). */
static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
{
        spinlock_t *lock = &qdisc->q.lock;

        return lock;
}

/* Returns the qdisc pointer of @qdisc's dev_queue, fetched with
 * rcu_dereference_rtnl().
 */
static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc)
{
        return rcu_dereference_rtnl(qdisc->dev_queue->qdisc);
}

/* Same lookup as qdisc_root(), but fetched with rcu_dereference_bh(). */
static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc)
{
        struct Qdisc *root = rcu_dereference_bh(qdisc->dev_queue->qdisc);

        return root;
}

/* Returns the qdisc_sleeping pointer of @qdisc's dev_queue, fetched with
 * rcu_dereference_rtnl().
 */
static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
{
        struct Qdisc *sleeping;

        sleeping = rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping);
        return sleeping;
}

static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
{
        struct Qdisc *root = qdisc_root_sleeping(qdisc);

        ASSERT_RTNL();
        return qdisc_lock(root);
}

/* Returns the net_device that @qdisc's dev_queue belongs to. */
static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
{
        struct net_device *dev = qdisc->dev_queue->dev;

        return dev;
}

/* Take the tree lock for @q with bottom halves disabled.
 *
 * TCQ_F_MQROOT qdiscs lock their own queue lock; every other qdisc locks
 * the queue lock of its sleeping root. Pair with sch_tree_unlock().
 */
static inline void sch_tree_lock(struct Qdisc *q)
{
        if (!(q->flags & TCQ_F_MQROOT)) {
                spin_lock_bh(qdisc_root_sleeping_lock(q));
                return;
        }

        spin_lock_bh(qdisc_lock(q));
}

/* Release the lock taken by sch_tree_lock(), using the same selection:
 * the qdisc's own queue lock for TCQ_F_MQROOT, otherwise the queue lock
 * of its sleeping root.
 */
static inline void sch_tree_unlock(struct Qdisc *q)
{
        if (!(q->flags & TCQ_F_MQROOT)) {
                spin_unlock_bh(qdisc_root_sleeping_lock(q));
                return;
        }

        spin_unlock_bh(qdisc_lock(q));
}

/* Qdisc objects and ops tables that are only declared here; their
 * definitions live outside this header.
 */
extern struct Qdisc noop_qdisc;
extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1];
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
extern const struct Qdisc_ops *default_qdisc_ops;
/* Pick the ops for TX queue number @ntx of @dev: default_qdisc_ops while
 * @ntx is below dev->real_num_tx_queues, pfifo_fast_ops otherwise.
 */
static inline const struct Qdisc_ops *
get_default_qdisc_ops(const struct net_device *dev, int ntx)
{
        if (ntx >= dev->real_num_tx_queues)
                return &pfifo_fast_ops;

        return default_qdisc_ops;
}

/* Common part of a qdisc class: its classid, a filter counter (adjusted by
 * qdisc_class_get()/qdisc_class_put()) and the node that links it into a
 * Qdisc_class_hash bucket.
 */
struct Qdisc_class_common {
        u32                        classid;
        unsigned int                filter_cnt;
        struct hlist_node        hnode;
};

/* Hash table of Qdisc_class_common entries. 'hash' is the bucket array
 * and 'hashmask' is the mask qdisc_class_find() uses to pick a bucket.
 */
struct Qdisc_class_hash {
        struct hlist_head        *hash;
        unsigned int                hashsize;
        unsigned int                hashmask;
        unsigned int                hashelems;
};

/* Bucket index for class @id: fold the id onto itself (shift by 8, then
 * by 4, xor each time) and apply @mask.
 */
static inline unsigned int qdisc_class_hash(u32 id, u32 mask)
{
        u32 folded = id ^ (id >> 8);

        folded ^= folded >> 4;
        return folded & mask;
}

/* Look up the class with classid @id in @hash.
 *
 * Returns the matching entry, or NULL when @id is 0 or no entry in the
 * selected bucket carries that classid.
 */
static inline struct Qdisc_class_common *
qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id)
{
        struct Qdisc_class_common *entry;
        unsigned int bucket;

        if (id == 0)
                return NULL;

        bucket = qdisc_class_hash(id, hash->hashmask);
        hlist_for_each_entry(entry, &hash->hash[bucket], hnode) {
                if (entry->classid == id)
                        break;
        }

        /* the iterator leaves 'entry' NULL once the bucket is exhausted */
        return entry;
}

/* True while @cl's filter counter is non-zero. */
static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl)
{
        return cl->filter_cnt != 0;
}

/* Increment @cl's filter counter. An overflow triggers a WARN; the
 * counter is then still updated with the wrapped value.
 */
static inline void qdisc_class_get(struct Qdisc_class_common *cl)
{
        unsigned int next;

        WARN(check_add_overflow(cl->filter_cnt, 1, &next),
             "Qdisc class overflow");

        cl->filter_cnt = next;
}

/* Decrement @cl's filter counter. An underflow triggers a WARN; the
 * counter is then still updated with the wrapped value.
 */
static inline void qdisc_class_put(struct Qdisc_class_common *cl)
{
        unsigned int next;

        WARN(check_sub_overflow(cl->filter_cnt, 1, &next),
             "Qdisc class underflow");

        cl->filter_cnt = next;
}

/* Map @classid to a traffic-class index of @dev.
 *
 * The index is the classid's minor part minus TC_H_MIN_PRIORITY. Returns
 * that index when it is below netdev_get_num_tc(@dev), -EINVAL otherwise.
 */
static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
{
        u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY;

        if (hwtc >= netdev_get_num_tc(dev))
                return -EINVAL;

        return hwtc;
}

/* Class hash table helpers (declarations only; defined outside this header). */
int qdisc_class_hash_init(struct Qdisc_class_hash *);
void qdisc_class_hash_insert(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_remove(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *);
void qdisc_class_hash_destroy(struct Qdisc_class_hash *);

/* Device-level scheduler entry points (declarations only). */
int dev_qdisc_change_tx_queue_len(struct net_device *dev);
void dev_qdisc_change_real_num_tx(struct net_device *dev,
                                  unsigned int new_real_tx);
void dev_init_scheduler(struct net_device *dev);
void dev_shutdown(struct net_device *dev);
void dev_activate(struct net_device *dev);
void dev_deactivate(struct net_device *dev);
void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
                              struct Qdisc *qdisc);
/* Qdisc reset/teardown entry points (declarations only). */
void qdisc_reset(struct Qdisc *qdisc);
void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
/* Offload helpers: real implementations are declared when CONFIG_NET_SCHED
 * is set; otherwise the inline stubs below are used.
 */
#ifdef CONFIG_NET_SCHED
int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                              void *type_data);
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                                struct Qdisc *new, struct Qdisc *old,
                                enum tc_setup_type type, void *type_data,
                                struct netlink_ext_ack *extack);
#else
/* !CONFIG_NET_SCHED stub: clears TCQ_F_OFFLOADED on @q and returns 0. */
static inline int
qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                          void *type_data)
{
        q->flags &= ~TCQ_F_OFFLOADED;
        return 0;
}

/* !CONFIG_NET_SCHED stub: does nothing. */
static inline void
qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                           struct Qdisc *new, struct Qdisc *old,
                           enum tc_setup_type type, void *type_data,
                           struct netlink_ext_ack *extack)
{
}
#endif
/* Declarations only; the definitions live outside this header. */
void qdisc_offload_query_caps(struct net_device *dev,
                              enum tc_setup_type type,
                              void *caps, size_t caps_len);
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
                          const struct Qdisc_ops *ops,
                          struct netlink_ext_ack *extack);
void qdisc_free(struct Qdisc *qdisc);
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
                                const struct Qdisc_ops *ops, u32 parentid,
                                struct netlink_ext_ack *extack);
/* Called by qdisc_calculate_pkt_len() when the qdisc has a size table. */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
                               const struct qdisc_size_table *stab);
int skb_do_redirect(struct sk_buff *);

/* Returns @skb's tc_at_ingress flag when CONFIG_NET_XGRESS is enabled;
 * always false otherwise.
 */
static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
{
#ifndef CONFIG_NET_XGRESS
        return false;
#else
        return skb->tc_at_ingress;
#endif
}

/* Test-and-clear of @skb's tc_skip_classify flag.
 *
 * With CONFIG_NET_CLS_ACT: returns true and clears the flag if it was set,
 * returns false otherwise. Without it: always false.
 */
static inline bool skb_skip_tc_classify(struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
        if (!skb->tc_skip_classify)
                return false;

        skb->tc_skip_classify = 0;
        return true;
#else
        return false;
#endif
}

/* Reset the qdisc of every TX queue of @dev from index @i (inclusive) up
 * to dev->num_tx_queues. Queues without a qdisc are skipped; each reset
 * runs under that qdisc's queue lock with bottom halves disabled.
 */
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
        for (; i < dev->num_tx_queues; i++) {
                struct Qdisc *txq_qdisc;

                txq_qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
                if (!txq_qdisc)
                        continue;

                spin_lock_bh(qdisc_lock(txq_qdisc));
                qdisc_reset(txq_qdisc);
                spin_unlock_bh(qdisc_lock(txq_qdisc));
        }
}

/* Returns true when qdisc_is_empty() holds for the qdisc of every TX queue
 * of @dev. The scan runs inside an RCU read-side section and stops at the
 * first non-empty qdisc.
 */
static inline bool qdisc_all_tx_empty(const struct net_device *dev)
{
        bool all_empty = true;
        unsigned int idx = 0;

        rcu_read_lock();
        while (idx < dev->num_tx_queues) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);
                const struct Qdisc *q = rcu_dereference(txq->qdisc);

                if (!qdisc_is_empty(q)) {
                        all_empty = false;
                        break;
                }
                idx++;
        }
        rcu_read_unlock();

        return all_empty;
}

/* Returns true if, for any TX queue of @dev, the qdisc pointer differs
 * from the qdisc_sleeping pointer. Only the pointer values are compared;
 * nothing is dereferenced.
 */
static inline bool qdisc_tx_changing(const struct net_device *dev)
{
        unsigned int idx;

        for (idx = 0; idx < dev->num_tx_queues; idx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);

                if (rcu_access_pointer(txq->qdisc) ==
                    rcu_access_pointer(txq->qdisc_sleeping))
                        continue;

                return true;
        }

        return false;
}

/* A "noqueue" qdisc is recognised by its missing enqueue hook, see
 * noqueue_init(). Returns true when @txq's qdisc has enqueue == NULL.
 */
static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq)
{
        struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc);

        return !qdisc->enqueue;
}

/* Returns true when every TX queue of @dev points at noop_qdisc; false as
 * soon as one queue points elsewhere.
 */
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{
        unsigned int idx = 0;

        while (idx < dev->num_tx_queues) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);

                if (rcu_access_pointer(txq->qdisc) != &noop_qdisc)
                        return false;
                idx++;
        }

        return true;
}

/* Returns the pkt_len stored in @skb's qdisc control block. */
static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
{
        unsigned int len = qdisc_skb_cb(skb)->pkt_len;

        return len;
}

/* Additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h).
 * __NET_XMIT_STOLEN is the bit tested by net_xmit_drop_count().
 */
enum net_xmit_qdisc_t {
        __NET_XMIT_STOLEN = 0x00010000,
        __NET_XMIT_BYPASS = 0x00020000,
};

/* Number of drops to account for xmit result @e: with CONFIG_NET_CLS_ACT
 * it is 0 when __NET_XMIT_STOLEN is set in @e and 1 otherwise; without
 * that option it is always 1.
 */
#ifdef CONFIG_NET_CLS_ACT
#define net_xmit_drop_count(e)        ((e) & __NET_XMIT_STOLEN ? 0 : 1)
#else
#define net_xmit_drop_count(e)        (1)
#endif

/* With CONFIG_NET_SCHED: if @sch has a size table, hand @skb and the table
 * to __qdisc_calculate_pkt_len(). Without a table, or without
 * CONFIG_NET_SCHED, nothing happens.
 */
static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
                                           const struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
        struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab);

        if (!stab)
                return;

        __qdisc_calculate_pkt_len(skb, stab);
#endif
}

/* Invoke @sch's enqueue hook with @skb and @to_free and return its result. */
static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff **to_free)
{
        int ret = sch->enqueue(skb, sch, to_free);

        return ret;
}

/* Add @bytes and @packets to @bstats, with both additions bracketed by
 * u64_stats_update_begin()/u64_stats_update_end() on bstats->syncp.
 */
static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
                                  __u64 bytes, __u64 packets)
{
        struct u64_stats_sync *sync = &bstats->syncp;

        u64_stats_update_begin(sync);
        u64_stats_add(&bstats->bytes, bytes);
        u64_stats_add(&bstats->packets, packets);
        u64_stats_update_end(sync);
}

/* Account @skb in @bstats: its qdisc packet length as bytes, and as
 * packets either its gso_segs count (GSO skbs) or 1.
 */
static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
                                 const struct sk_buff *skb)
{
        unsigned int packets = 1;

        if (skb_is_gso(skb))
                packets = skb_shinfo(skb)->gso_segs;

        _bstats_update(bstats, qdisc_pkt_len(skb), packets);
}

/* Account @skb in the this_cpu_ptr() instance of sch->cpu_bstats. */
static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
                                           const struct sk_buff *skb)
{
        bstats_update(this_cpu_ptr(sch->cpu_bstats), skb);
}

/* Account @skb in sch->bstats (the non-per-CPU counters). */
static inline void qdisc_bstats_update(struct Qdisc *sch,
                                       const struct sk_buff *skb)
{
        bstats_update(&sch->bstats, skb);
}

/* Subtract @skb's qdisc packet length from sch->qstats.backlog. */
static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        const unsigned int len = qdisc_pkt_len(skb);

        sch->qstats.backlog -= len;
}

/* Subtract @skb's qdisc packet length from the per-CPU backlog counter. */
static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        const unsigned int len = qdisc_pkt_len(skb);

        this_cpu_sub(sch->cpu_qstats->backlog, len);
}

/* Add @skb's qdisc packet length to sch->qstats.backlog. */
static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        const unsigned int len = qdisc_pkt_len(skb);

        sch->qstats.backlog += len;
}

/* Add @skb's qdisc packet length to the per-CPU backlog counter. */
static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        const unsigned int len = qdisc_pkt_len(skb);

        this_cpu_add(sch->cpu_qstats->backlog, len);
}

/* Increment the qlen counter of sch->cpu_qstats via this_cpu_inc(). */
static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->qlen);
}

/* Decrement the qlen counter of sch->cpu_qstats via this_cpu_dec(). */
static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
{
        this_cpu_dec(sch->cpu_qstats->qlen);
}

/* Increment the requeues counter of sch->cpu_qstats via this_cpu_inc(). */
static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->requeues);
}

/* Add @count to sch->qstats.drops. */
static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
{
        sch->qstats.drops = sch->qstats.drops + count;
}

/* Count one more drop in @qstats. */
static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
{
        qstats->drops += 1;
}

/* Count one more overlimit event in @qstats. */
static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats)
{
        qstats->overlimits += 1;
}

/* Count one drop in sch->qstats. */
static inline void qdisc_qstats_drop(struct Qdisc *sch)
{
        qstats_drop_inc(&sch->qstats);
}

/* Increment the drops counter of sch->cpu_qstats via this_cpu_inc(). */
static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->drops);
}

/* Count one overlimit event in sch->qstats. */
static inline void qdisc_qstats_overlimit(struct Qdisc *sch)
{
        qstats_overlimit_inc(&sch->qstats);
}

/*
 * Pass @sch's queue statistics (per-CPU and plain) to gnet_stats_copy_queue()
 * for dump @d, with the queue length taken from qdisc_qlen_sum().
 * Returns whatever gnet_stats_copy_queue() returns.
 */
static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch)
{
        __u32 total_qlen;

        total_qlen = qdisc_qlen_sum(sch);
        return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats,
                                     total_qlen);
}

/*
 * Report @sch's current queue length and backlog.
 *
 * The statistics are accumulated into a zeroed scratch structure with
 * gnet_stats_add_queue(); *@qlen additionally includes qdisc_qlen(sch).
 */
static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch,  __u32 *qlen,
                                             __u32 *backlog)
{
        struct gnet_stats_queue totals = { 0 };

        gnet_stats_add_queue(&totals, sch->cpu_qstats, &sch->qstats);

        *qlen = qdisc_qlen(sch) + totals.qlen;
        *backlog = totals.backlog;
}

/*
 * Reset @sch and report the removed packets/bytes to
 * qdisc_tree_reduce_backlog().  The counts are sampled before the reset.
 */
static inline void qdisc_purge_queue(struct Qdisc *sch)
{
        __u32 nr_pkts;
        __u32 nr_bytes;

        qdisc_qstats_qlen_backlog(sch, &nr_pkts, &nr_bytes);
        qdisc_reset(sch);
        qdisc_tree_reduce_backlog(sch, nr_pkts, nr_bytes);
}

/*
 * Append @skb to the singly linked list @qh and bump its length.
 * Note that skb->next is only cleared when the list was non-empty.
 */
static inline void __qdisc_enqueue_tail(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        struct sk_buff *prev_tail = qh->tail;

        if (!prev_tail) {
                /* Empty list: the new buffer becomes the head as well. */
                qh->head = skb;
        } else {
                skb->next = NULL;
                prev_tail->next = skb;
        }
        qh->tail = skb;
        qh->qlen++;
}

/*
 * Append @skb to sch->q and add its length to the backlog.
 * Always returns NET_XMIT_SUCCESS.
 */
static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
{
        __qdisc_enqueue_tail(skb, &sch->q);
        qdisc_qstats_backlog_inc(sch, skb);
        return NET_XMIT_SUCCESS;
}

static inline void __qdisc_enqueue_head(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        skb->next = qh->head;

        if (!qh->head)
                qh->tail = skb;
        qh->head = skb;
        qh->qlen++;
}

/*
 * Detach and return the first buffer of @qh, or NULL if the list is empty.
 * The returned buffer has its ->next cleared.
 */
static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
{
        struct sk_buff *first = qh->head;

        if (unlikely(first == NULL))
                return NULL;

        qh->head = first->next;
        qh->qlen--;
        /* Removing the last element also invalidates the tail pointer. */
        if (!qh->head)
                qh->tail = NULL;
        first->next = NULL;

        return first;
}

/*
 * Dequeue one buffer from @sch.
 *
 * Buffers parked on sch->gso_skb are served first.  Otherwise, with @direct
 * set the buffer is taken straight from sch->q; with @direct clear the
 * qdisc's ->dequeue() callback is used.  Returns NULL when nothing is
 * available.
 */
static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct)
{
        struct sk_buff *skb = __skb_dequeue(&sch->gso_skb);

        if (skb) {
                sch->q.qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
                return skb;
        }

        if (!direct)
                return sch->dequeue(sch);

        skb = __qdisc_dequeue_head(&sch->q);
        if (skb)
                qdisc_qstats_backlog_dec(sch, skb);
        return skb;
}

/*
 * Take the first buffer off sch->q, updating backlog and byte/packet
 * statistics.  Returns NULL if the queue is empty.
 */
static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
{
        struct sk_buff *first = __qdisc_dequeue_head(&sch->q);

        if (unlikely(first == NULL))
                return NULL;

        qdisc_qstats_backlog_dec(sch, first);
        qdisc_bstats_update(sch, first);
        return first;
}

/*
 * Control-block layout that tc_skb_cb() overlays on sk_buff::cb.  It begins
 * with the generic qdisc control block; tc_skb_cb() checks at build time
 * that the whole structure fits into sk_buff::cb.
 */
struct tc_skb_cb {
        struct qdisc_skb_cb qdisc_cb;
        u32 drop_reason; /* see tcf_get_drop_reason()/tcf_set_drop_reason() */

        u16 zone; /* Only valid if post_ct = true */
        u16 mru;
        u8 post_ct:1;
        u8 post_ct_snat:1;
        u8 post_ct_dnat:1;
};

/*
 * View @skb's control buffer as a struct tc_skb_cb.  The build fails if
 * that structure would not fit into sk_buff::cb.
 */
static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(struct tc_skb_cb) > sizeof_field(struct sk_buff, cb));

        return (struct tc_skb_cb *)skb->cb;
}

static inline enum skb_drop_reason
tcf_get_drop_reason(const struct sk_buff *skb)
{
        return tc_skb_cb(skb)->drop_reason;
}

static inline void tcf_set_drop_reason(const struct sk_buff *skb,
                                       enum skb_drop_reason reason)
{
        tc_skb_cb(skb)->drop_reason = reason;
}

/* Rather than calling kfree_skb() with the root qdisc lock held, push
 * @skb onto the front of the *@to_free list so it can be freed later,
 * at the end of __dev_xmit_skb().
 */
static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
{
        struct sk_buff *pending = *to_free;

        skb->next = pending;
        *to_free = skb;
}

/*
 * Push @skb onto the front of the *@to_free list.  The existing list is
 * linked behind skb->prev when that pointer is set (presumably the last
 * buffer of a chain starting at @skb - confirm with callers), otherwise
 * directly behind @skb.
 */
static inline void __qdisc_drop_all(struct sk_buff *skb,
                                    struct sk_buff **to_free)
{
        struct sk_buff *link_from = skb->prev ? skb->prev : skb;

        link_from->next = *to_free;
        *to_free = skb;
}

/*
 * Drop the first buffer of @qh: remove it, reduce @sch's backlog and queue
 * it on *@to_free.  Returns the dropped buffer's qdisc packet length, or 0
 * if @qh was empty.
 */
static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
                                                   struct qdisc_skb_head *qh,
                                                   struct sk_buff **to_free)
{
        struct sk_buff *victim = __qdisc_dequeue_head(qh);
        unsigned int dropped_len;

        if (unlikely(victim == NULL))
                return 0;

        /* Sample the length before the buffer is handed over. */
        dropped_len = qdisc_pkt_len(victim);

        qdisc_qstats_backlog_dec(sch, victim);
        __qdisc_drop(victim, to_free);
        return dropped_len;
}

/* Return the first buffer of sch->q without removing it (may be NULL). */
static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
{
        return sch->q.head;
}

/* generic pseudo peek method for non-work-conserving qdisc */
static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
{
        struct sk_buff *skb = skb_peek(&sch->gso_skb);

        /* we can reuse ->gso_skb because peek isn't called for root qdiscs */
        if (!skb) {
                skb = sch->dequeue(sch);

                if (skb) {
                        __skb_queue_head(&sch->gso_skb, skb);
                        /* it's still part of the queue */
                        qdisc_qstats_backlog_inc(sch, skb);
                        sch->q.qlen++;
                }
        }

        return skb;
}

/*
 * Update backlog, byte/packet and queue-length accounting for @skb leaving
 * @sch, using the per-CPU counters when qdisc_is_percpu_stats() says so.
 */
static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
                                                 struct sk_buff *skb)
{
        if (!qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_bstats_update(sch, skb);
                sch->q.qlen--;
                return;
        }

        qdisc_qstats_cpu_backlog_dec(sch, skb);
        qdisc_bstats_cpu_update(sch, skb);
        qdisc_qstats_cpu_qlen_dec(sch);
}

/*
 * Account one more queued packet of @pkt_len bytes in @sch, using the
 * per-CPU counters when qdisc_is_percpu_stats() says so.
 */
static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
                                                 unsigned int pkt_len)
{
        if (!qdisc_is_percpu_stats(sch)) {
                sch->qstats.backlog += pkt_len;
                sch->q.qlen++;
                return;
        }

        qdisc_qstats_cpu_qlen_inc(sch);
        this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
}

/* Use this in place of qdisc->dequeue() for every qdisc that is queried
 * with ->peek(): a buffer parked on ->gso_skb is returned first.
 */
static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
{
        struct sk_buff *skb = skb_peek(&sch->gso_skb);

        /* Nothing parked: fall back to the regular dequeue callback. */
        if (!skb)
                return sch->dequeue(sch);

        skb = __skb_dequeue(&sch->gso_skb);
        if (qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_cpu_backlog_dec(sch, skb);
                qdisc_qstats_cpu_qlen_dec(sch);
        } else {
                qdisc_qstats_backlog_dec(sch, skb);
                sch->q.qlen--;
        }

        return skb;
}

/*
 * Release every buffer on @qh via rtnl_kfree_skbs() and mark the list
 * empty.  Asserts that RTNL is held.
 */
static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh)
{
        /*
         * The backlog in bytes of this list is unknown here; correcting
         * it is the caller's job.
         */
        ASSERT_RTNL();

        if (!qh->qlen)
                return;

        rtnl_kfree_skbs(qh->head, qh->tail);

        qh->head = NULL;
        qh->tail = NULL;
        qh->qlen = 0;
}

/* Empty sch->q; see __qdisc_reset_queue() (backlog is left to the caller). */
static inline void qdisc_reset_queue(struct Qdisc *sch)
{
        __qdisc_reset_queue(&sch->q);
}

/*
 * Store @new in *@pold and return the qdisc that was there before.
 * The swap happens under sch_tree_lock(@sch); a non-NULL previous qdisc
 * is purged before the lock is dropped.
 */
static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
                                          struct Qdisc **pold)
{
        struct Qdisc *prev;

        sch_tree_lock(sch);

        prev = *pold;
        *pold = new;
        if (prev)
                qdisc_purge_queue(prev);

        sch_tree_unlock(sch);

        return prev;
}

/*
 * Drop a single @skb through rtnl_kfree_skbs() (head and tail are both
 * @skb) and count the drop in sch->qstats.
 */
static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
{
        rtnl_kfree_skbs(skb, skb);
        qdisc_qstats_drop(sch);
}

/*
 * Queue @skb on *@to_free for deferred freeing and count the drop in the
 * per-CPU statistics.  Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_cpu_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Queue @skb on *@to_free for deferred freeing and count the drop in
 * sch->qstats.  Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
                             struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Like qdisc_drop(), but first records @reason in @skb's tc control block.
 * Returns qdisc_drop()'s result.
 */
static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch,
                                    struct sk_buff **to_free,
                                    enum skb_drop_reason reason)
{
        tcf_set_drop_reason(skb, reason);
        return qdisc_drop(skb, sch, to_free);
}

/*
 * Queue @skb on *@to_free through __qdisc_drop_all() and count a single
 * drop in sch->qstats.  Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop_all(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Rate parameters consumed by psched_l2t_ns(), which turns an adjusted
 * length into (len * mult) >> shift.
 */
struct psched_ratecfg {
        u64        rate_bytes_ps; /* bytes per second */
        u32        mult;          /* multiplier applied to the adjusted length */
        u16        overhead;      /* added to every length before conversion */
        u16        mpu;           /* lower bound for the adjusted length */
        u8        linklayer;     /* compared against TC_LINKLAYER_ATM */
        u8        shift;         /* right shift applied after multiplying */
};

/*
 * Convert a length of @len bytes into a time value using rate config @r.
 *
 * The length is first increased by r->overhead and raised to at least
 * r->mpu.  For TC_LINKLAYER_ATM it is then rounded up to whole 48-byte
 * units and charged at 53 bytes per unit.  The result is
 * (len * r->mult) >> r->shift, computed in 64 bits.
 */
static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
                                unsigned int len)
{
        len += r->overhead;

        if (len < r->mpu)
                len = r->mpu;

        /* The cell rounding is done in unsigned int, before widening. */
        if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
                len = DIV_ROUND_UP(len, 48) * 53;

        return ((u64)len * r->mult) >> r->shift;
}

/*
 * Implemented outside this header.
 * NOTE(review): presumably fills @r from @conf and @rate64 - confirm.
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
                               const struct tc_ratespec *conf,
                               u64 rate64);

/*
 * Fill the legacy @res from rate config @r.  @res is zeroed first; the
 * rate is clamped to what fits in 32 bits and only the
 * TC_LINKLAYER_MASK bits of the link layer are reported.
 */
static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
                                          const struct psched_ratecfg *r)
{
        memset(res, 0, sizeof(struct tc_ratespec));

        res->overhead = r->overhead;
        res->mpu = r->mpu;
        res->linklayer = (r->linklayer & TC_LINKLAYER_MASK);

        /* The @rate field of legacy struct tc_ratespec is 32 bits wide.
         * A Qdisc using a 64bit rate should add new attributes to stay
         * compatible.
         */
        res->rate = min_t(u64, r->rate_bytes_ps, ~0U);
}

/*
 * Packet-rate parameters consumed by psched_pkt2t_ns(), which computes
 * (pkt_num * mult) >> shift.
 */
struct psched_pktrate {
        u64        rate_pkts_ps; /* packets per second */
        u32        mult;         /* multiplier applied to the packet count */
        u8        shift;        /* right shift applied after multiplying */
};

static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r,
                                  unsigned int pkt_num)
{
        return ((u64)pkt_num * r->mult) >> r->shift;
}

/* Implemented outside this header. NOTE(review): presumably fills @r from @pktrate64 - confirm. */
void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64);

/* Mini Qdisc serves the specific needs of the ingress/clsact Qdisc.
 * The fast path only needs to access the filter list and to update stats.
 */
struct mini_Qdisc {
        struct tcf_proto *filter_list;
        struct tcf_block *block;
        /* per-CPU byte/packet counters, see mini_qdisc_bstats_cpu_update() */
        struct gnet_stats_basic_sync __percpu *cpu_bstats;
        /* per-CPU queue stats, drops bumped by mini_qdisc_qstats_cpu_drop() */
        struct gnet_stats_queue        __percpu *cpu_qstats;
        /* NOTE(review): usage not visible in this header - confirm in sch_generic.c */
        unsigned long rcu_state;
};

/* Account @skb in the this_cpu_ptr() instance of miniq->cpu_bstats. */
static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq,
                                                const struct sk_buff *skb)
{
        bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb);
}

/* Increment the drops counter of miniq->cpu_qstats via this_cpu_inc(). */
static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq)
{
        this_cpu_inc(miniq->cpu_qstats->drops);
}

/*
 * Two mini_Qdisc instances plus a pointer to an RCU-annotated mini_Qdisc
 * pointer.  NOTE(review): presumably mini_qdisc_pair_swap() alternates
 * between miniq1 and miniq2 and publishes through *p_miniq - confirm.
 */
struct mini_Qdisc_pair {
        struct mini_Qdisc miniq1;
        struct mini_Qdisc miniq2;
        struct mini_Qdisc __rcu **p_miniq;
};

/* mini_Qdisc_pair helpers; only declared here, implemented elsewhere. */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
                          struct tcf_proto *tp_head);
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
                          struct mini_Qdisc __rcu **p_miniq);
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
                                struct tcf_block *block);

/* NOTE(review): presumably informs an mq-style qdisc of a new real TX queue count - confirm. */
void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx);

/* Takes @skb and a transmit callback @xmit; semantics are defined by the implementation. */
int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb));

/* Wait, sleeping 1 ms at a time, until @q has left the SCHED state. */
static inline void qdisc_synchronize(const struct Qdisc *q)
{
        for (;;) {
                if (!test_bit(__QDISC_STATE_SCHED, &q->state))
                        break;
                msleep(1);
        }
}

#endif










  148 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_INTERNAL_H
#define __X86_KERNEL_FPU_INTERNAL_H

extern struct fpstate init_fpstate;

/* CPU feature check wrappers */
/* True when cpu_feature_enabled() reports X86_FEATURE_XSAVE. */
static __always_inline __pure bool use_xsave(void)
{
        return cpu_feature_enabled(X86_FEATURE_XSAVE);
}

/* True when cpu_feature_enabled() reports X86_FEATURE_FXSR. */
static __always_inline __pure bool use_fxsr(void)
{
        return cpu_feature_enabled(X86_FEATURE_FXSR);
}

/*
 * With CONFIG_X86_DEBUG_FPU, WARN_ON_FPU(x) is WARN_ON_ONCE(x).  Otherwise
 * x is only handed to BUILD_BUG_ON_INVALID() and the macro evaluates to 0.
 */
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; })
#endif

/* Used in init.c */
extern void fpstate_init_user(struct fpstate *fpstate);
extern void fpstate_reset(struct fpu *fpu);

#endif


























































    8 








   52 

   52 
   32 






   54 

   54 






   33 

   30 




   23 

   23 
   21 











































































































































































































































   52 



   54 


    2 





    1 





    2 





   20 




   54 


   53 

    3 




























































   27 




   27 

   27 

   27 










   27 

   27 







   26 
    8 






















































   55 


   56 


   29 
   28 



   43 
   43 






   57 


































   32 



   35 

   32 
   32 


   32 











   14 













































































































































   94 




   93 























   94 







































   54 


   53 
   54 


   54 



    6 



    6 




    6 
    6 

    6 
    6 

    6 


    6 
    6 
    6 











    6 
    6 


    6 







    6 













    4 


    4 
    4 















































    4 



    4 




    4 


    4 
    4 
    4 
    4 





    4 

    4 

    4 
    2 






    4 

    4 
    4 


































































    1 




























































































   44 



   44 

   44 
   44 
   44 

   43 
   43 













































   41 





   41 


   19 
   19 
   17 
   11 
   11 


    9 
    4 
    8 
   21 



   41 
   15 
















































    3 





    3 


    2 
    2 
    1 







    1 



    3 











   27 
    8 









   42 
   28 



   42 






















































































































































































































































































































































































































   15 
   15 
   15 
   15 

   13 





   15 




   25 










   25 


   24 
   25 


   25 
   18 


   18 



   25 


   25 



   25 

   15 

   25 







   25 


    1 


   17 




   25 
   17 
    7 
    7 


    6 
    3 
   17 

   25 


   12 

















   34 


   20 

   28 
   20 
   11 
    6 
    5 
    3 
    1 
    8 
    1 
    1 







   33 




   17 







   31 








   15 


   17 









   17 




   17 

   17 
   17 


   17 

   33 



   20 




   18 




    9 
   18 

   18 





   18 




    2 


   17 


   17 





    6 

   17 









    2 




   18 
   10 

   18 
   12 
















































































































































   20 





   10 


    2 
    2 



    4 




   10 








   13 
    4 
    2 













   20 


   12 
















































    9 
    9 




    9 


    2 
    7 







    2 
















    2 




    7 


    2 

    1 






    2 





    2 


    2 

    2 





    1 

    2 
    2 

    1 
    1 

    2 


    2 




    2 
    2 



    2 

    8 

    9 
    1 















































   25 



   25 
   25 


   25 

   24 






    3 



    3 

    3 


    3 

    3 






   17 














































































































































   23 



   22 
    3 
    2 



    3 



   21 
    3 





   14 







    4 




    4 
    4 







   21 







    1 





    3 



   19 
    2 




    1 




   17 



    1 
   17 



   17 






   15 




   16 
   16 




   15 


   15 
   15 
   15 




   15 


    1 




    1 




    2 


   17 









    5 

    2 


    4 
    4 
    6 













    8 


    9 

    9 


    9 
    8 
    1 

    8 

    7 
    4 


    2 


    5 

    9 

    9 



    9 
    9 
    8 




    9 









    7 








    7 





















   19 


   18 
   19 
   19 

   17 
   19 


   26 

   26 
   26 
   26 

   26 
   26 





    3 














    4 



    4 
    4 
    4 

    1 

    4 

    4 






































    1 














    2 




    2 
    2 
    2 


    2 



    2 

    2 












































   74 


   75 
   75 
   74 

   75 


   75 





   73 





































































































































































   54 

   54 
   53 
   54 


   53 









    6 



    3 
    3 


    6 









































   57 



   51 


    3 

   57 
   41 

    1 



   41 

    1 



   40 




   15 

    2 



   14 

    6 


   14 
   12 
   10 
   26 



   54 

    3 





   51 

   31 

    1 




   32 

    2 
    2 




    1 
    1 






   35 
   32 





   57 





   12 










   12 
   11 
   12 

































































































































   27 





   27 


   27 




    6 





    6 


    6 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xfrm_state.c
 *
 * Changes:
 *        Mitsuru KANDA @USAGI
 *         Kazunori MIYAZAWA @USAGI
 *         Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *                 IPv6 support
 *         YOSHIFUJI Hideaki @USAGI
 *                 Split up af-specific functions
 *        Derek Atkins <derek@ihtfp.com>
 *                Add UDP Encapsulation
 *
 */

#include <linux/compat.h>
#include <linux/workqueue.h>
#include <net/xfrm.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/audit.h>
#include <linux/uaccess.h>
#include <linux/ktime.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>

#include <crypto/aead.h>

#include "xfrm_hash.h"

/*
 * Accessors for the RCU-managed per-netns state hash table pointers.
 *
 * xfrm_state_deref_prot() is rcu_dereference_protected() with the lockdep
 * condition that (net)->xfrm.xfrm_state_lock is held.
 * xfrm_state_deref_check() is rcu_dereference_check() with that same
 * lockdep condition.
 */
#define xfrm_state_deref_prot(table, net) \
        rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock))
#define xfrm_state_deref_check(table, net) \
        rcu_dereference_check((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock))

/* Work handler behind xfrm_state_gc_work; defined further down. */
static void xfrm_state_gc_task(struct work_struct *work);

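/* Each xfrm_state may be linked into several hash tables, among them:

   1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
   2. Hash table by (daddr,family,reqid) to find what SAs exist for given
      destination/tunnel endpoint. (output)

   The resize code below additionally rehashes a by-source table and a
   by-sequence-number table.
 */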

/* NOTE(review): presumably an upper bound for the hash tables; its
 * consumer is not visible in this part of the file - confirm.
 */
static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
/* Slab cache used by xfrm_state_alloc() and xfrm_state_free(). */
static struct kmem_cache *xfrm_state_cache __ro_after_init;

/* Deferred destruction: __xfrm_state_destroy() queues states on
 * xfrm_state_gc_list and schedules xfrm_state_gc_work, which runs
 * xfrm_state_gc_task().
 */
static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
static HLIST_HEAD(xfrm_state_gc_list);
/* States added by xfrm_dev_state_delete() and removed again by
 * xfrm_dev_state_free().
 */
static HLIST_HEAD(xfrm_state_dev_gc_list);

/*
 * Try to take a reference on @x.
 *
 * Returns the result of refcount_inc_not_zero() on x->refcnt, i.e. false
 * when the count had already reached zero and no reference was taken.
 */
static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x)
{
        bool got_ref = refcount_inc_not_zero(&x->refcnt);

        return got_ref;
}

/*
 * Bucket index in the by-destination table for (daddr, saddr, reqid, family),
 * computed with the namespace's current hash mask.
 *
 * Asserts (lockdep) that net->xfrm.xfrm_state_lock is held.
 */
static inline unsigned int xfrm_dst_hash(struct net *net,
                                         const xfrm_address_t *daddr,
                                         const xfrm_address_t *saddr,
                                         u32 reqid,
                                         unsigned short family)
{
        unsigned int hmask;

        lockdep_assert_held(&net->xfrm.xfrm_state_lock);

        hmask = net->xfrm.state_hmask;
        return __xfrm_dst_hash(daddr, saddr, reqid, family, hmask);
}

/*
 * Bucket index in the by-source table for (daddr, saddr, family), computed
 * with the namespace's current hash mask.
 *
 * Asserts (lockdep) that net->xfrm.xfrm_state_lock is held.
 */
static inline unsigned int xfrm_src_hash(struct net *net,
                                         const xfrm_address_t *daddr,
                                         const xfrm_address_t *saddr,
                                         unsigned short family)
{
        unsigned int hmask;

        lockdep_assert_held(&net->xfrm.xfrm_state_lock);

        hmask = net->xfrm.state_hmask;
        return __xfrm_src_hash(daddr, saddr, family, hmask);
}

/*
 * Bucket index in the by-SPI table for (daddr, spi, proto, family), computed
 * with the namespace's current hash mask.
 *
 * Asserts (lockdep) that net->xfrm.xfrm_state_lock is held.
 */
static inline unsigned int
xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr,
              __be32 spi, u8 proto, unsigned short family)
{
        unsigned int hmask;

        lockdep_assert_held(&net->xfrm.xfrm_state_lock);

        hmask = net->xfrm.state_hmask;
        return __xfrm_spi_hash(daddr, spi, proto, family, hmask);
}

/*
 * Bucket index in the by-sequence table for @seq, computed with the
 * namespace's current hash mask.
 *
 * Asserts (lockdep) that net->xfrm.xfrm_state_lock is held.
 */
static unsigned int xfrm_seq_hash(struct net *net, u32 seq)
{
        unsigned int hmask;

        lockdep_assert_held(&net->xfrm.xfrm_state_lock);

        hmask = net->xfrm.state_hmask;
        return __xfrm_seq_hash(seq, hmask);
}

/*
 * XFRM_STATE_INSERT - link hlist node @_n into bucket @_h.
 * @by:    name of the hlist_node member of struct xfrm_state used for this
 *         table (e.g. bydst); used both to walk the bucket and to locate
 *         the insertion point.
 * @_n:    node to insert.
 * @_h:    bucket head.
 * @_type: offload type of the state being inserted.
 *
 * When @_type is XFRM_DEV_OFFLOAD_PACKET the bucket is not scanned and the
 * node is added at the head.  Otherwise the bucket is scanned past entries
 * whose xso.type is XFRM_DEV_OFFLOAD_PACKET and the node is inserted in
 * front of the first entry of any other type; if the scan finds no such
 * entry the node is added at the head.
 *
 * Expands to a bare { } block, and @_type is evaluated as written.
 */
#define XFRM_STATE_INSERT(by, _n, _h, _type)                               \
        {                                                                  \
                struct xfrm_state *_x = NULL;                              \
                                                                           \
                if (_type != XFRM_DEV_OFFLOAD_PACKET) {                    \
                        hlist_for_each_entry_rcu(_x, _h, by) {             \
                                if (_x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \
                                        continue;                          \
                                break;                                     \
                        }                                                  \
                }                                                          \
                                                                           \
                if (!_x || _x->xso.type == XFRM_DEV_OFFLOAD_PACKET)        \
                        /* SAD is empty or consists of HW SAs only */      \
                        hlist_add_head_rcu(_n, _h);                        \
                else                                                       \
                        hlist_add_before_rcu(_n, &_x->by);                 \
        }

/*
 * Rehash every state chained on one old by-destination bucket into the
 * new tables.
 *
 * @list:      old bydst bucket to walk.
 * @ndsttable: new by-destination table.
 * @nsrctable: new by-source table.
 * @nspitable: new by-SPI table (only states with a non-zero id.spi).
 * @nseqtable: new by-sequence table (only states with a non-zero km.seq).
 * @nhashmask: hash mask of the new tables.
 *
 * The nodes are not unlinked from the old chains first; the _safe iterator
 * remembers the successor before the bydst node is re-linked.
 */
static void xfrm_hash_transfer(struct hlist_head *list,
                               struct hlist_head *ndsttable,
                               struct hlist_head *nsrctable,
                               struct hlist_head *nspitable,
                               struct hlist_head *nseqtable,
                               unsigned int nhashmask)
{
        struct xfrm_state *state;
        struct hlist_node *save;

        hlist_for_each_entry_safe(state, save, list, bydst) {
                const xfrm_address_t *daddr = &state->id.daddr;
                const xfrm_address_t *saddr = &state->props.saddr;
                unsigned int slot;

                /* by destination */
                slot = __xfrm_dst_hash(daddr, saddr, state->props.reqid,
                                       state->props.family, nhashmask);
                XFRM_STATE_INSERT(bydst, &state->bydst, ndsttable + slot,
                                  state->xso.type);

                /* by source */
                slot = __xfrm_src_hash(daddr, saddr, state->props.family,
                                       nhashmask);
                XFRM_STATE_INSERT(bysrc, &state->bysrc, nsrctable + slot,
                                  state->xso.type);

                /* by SPI, only when one is assigned */
                if (state->id.spi) {
                        slot = __xfrm_spi_hash(daddr, state->id.spi,
                                               state->id.proto,
                                               state->props.family,
                                               nhashmask);
                        XFRM_STATE_INSERT(byspi, &state->byspi,
                                          nspitable + slot, state->xso.type);
                }

                /* by sequence number, only when one is set */
                if (state->km.seq) {
                        slot = __xfrm_seq_hash(state->km.seq, nhashmask);
                        XFRM_STATE_INSERT(byseq, &state->byseq,
                                          nseqtable + slot, state->xso.type);
                }
        }
}

/*
 * Size in bytes of a hash table with twice as many buckets as the table
 * described by @state_hmask (bucket count is mask + 1).
 */
static unsigned long xfrm_hash_new_size(unsigned int state_hmask)
{
        unsigned int nbuckets = (state_hmask + 1) << 1;

        return nbuckets * sizeof(struct hlist_head);
}

/*
 * Work handler (net->xfrm.state_hash_work) that replaces the four per-netns
 * state hash tables with tables of twice the bucket count.
 *
 * The new tables are allocated before taking any lock; if any allocation
 * fails, whatever was allocated is freed and the old tables stay in place.
 * The rehash and the pointer switch happen under xfrm_state_lock inside a
 * xfrm_state_hash_generation write section.  The old tables are freed only
 * after synchronize_rcu().
 */
static void xfrm_hash_resize(struct work_struct *work)
{
        struct net *net = container_of(work, struct net, xfrm.state_hash_work);
        struct hlist_head *ndst, *nsrc, *nspi, *nseq, *odst, *osrc, *ospi, *oseq;
        unsigned long nsize, osize;
        unsigned int nhashmask, ohashmask;
        int i;

        /* Allocate all four replacement tables up front, unwinding on failure. */
        nsize = xfrm_hash_new_size(net->xfrm.state_hmask);
        ndst = xfrm_hash_alloc(nsize);
        if (!ndst)
                return;
        nsrc = xfrm_hash_alloc(nsize);
        if (!nsrc) {
                xfrm_hash_free(ndst, nsize);
                return;
        }
        nspi = xfrm_hash_alloc(nsize);
        if (!nspi) {
                xfrm_hash_free(ndst, nsize);
                xfrm_hash_free(nsrc, nsize);
                return;
        }
        nseq = xfrm_hash_alloc(nsize);
        if (!nseq) {
                xfrm_hash_free(ndst, nsize);
                xfrm_hash_free(nsrc, nsize);
                xfrm_hash_free(nspi, nsize);
                return;
        }

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        write_seqcount_begin(&net->xfrm.xfrm_state_hash_generation);

        /* Walk every old bydst bucket (last to first) and rehash its states
         * into all four new tables.
         */
        nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
        odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net);
        for (i = net->xfrm.state_hmask; i >= 0; i--)
                xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nseq, nhashmask);

        /* Remember the old tables and mask so they can be freed later. */
        osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net);
        ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net);
        oseq = xfrm_state_deref_prot(net->xfrm.state_byseq, net);
        ohashmask = net->xfrm.state_hmask;

        /* Publish the new tables and the new mask. */
        rcu_assign_pointer(net->xfrm.state_bydst, ndst);
        rcu_assign_pointer(net->xfrm.state_bysrc, nsrc);
        rcu_assign_pointer(net->xfrm.state_byspi, nspi);
        rcu_assign_pointer(net->xfrm.state_byseq, nseq);
        net->xfrm.state_hmask = nhashmask;

        write_seqcount_end(&net->xfrm.xfrm_state_hash_generation);
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        osize = (ohashmask + 1) * sizeof(struct hlist_head);

        /* Wait for RCU readers that may still be walking the old tables. */
        synchronize_rcu();

        xfrm_hash_free(odst, osize);
        xfrm_hash_free(osrc, osize);
        xfrm_hash_free(ospi, osize);
        xfrm_hash_free(oseq, osize);
}

/* NOTE(review): the users of xfrm_state_afinfo_lock are outside this part
 * of the file; presumably it serializes updates of xfrm_state_afinfo[].
 */
static DEFINE_SPINLOCK(xfrm_state_afinfo_lock);
/* Per-address-family descriptors, RCU-annotated; presumably what
 * xfrm_state_get_afinfo() looks up - confirm against its definition.
 */
static struct xfrm_state_afinfo __rcu *xfrm_state_afinfo[NPROTO];

/* Protects xfrm_state_gc_list. */
static DEFINE_SPINLOCK(xfrm_state_gc_lock);
/* Protects xfrm_state_dev_gc_list. */
static DEFINE_SPINLOCK(xfrm_state_dev_gc_lock);

/* Declarations for functions that are used before their definitions
 * (__xfrm_state_delete() is defined later in this file).
 */
int __xfrm_state_delete(struct xfrm_state *x);

int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol);
static bool km_is_alive(const struct km_event *c);
void km_state_expired(struct xfrm_state *x, int hard, u32 portid);

int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
{
        struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
        int err = 0;

        if (!afinfo)
                return -EAFNOSUPPORT;

#define X(afi, T, name) do {                        \
                WARN_ON((afi)->type_ ## name);        \
                (afi)->type_ ## name = (T);        \
        } while (0)

        switch (type->proto) {
        case IPPROTO_COMP:
                X(afinfo, type, comp);
                break;
        case IPPROTO_AH:
                X(afinfo, type, ah);
                break;
        case IPPROTO_ESP:
                X(afinfo, type, esp);
                break;
        case IPPROTO_IPIP:
                X(afinfo, type, ipip);
                break;
        case IPPROTO_DSTOPTS:
                X(afinfo, type, dstopts);
                break;
        case IPPROTO_ROUTING:
                X(afinfo, type, routing);
                break;
        case IPPROTO_IPV6:
                X(afinfo, type, ipip6);
                break;
        default:
                WARN_ON(1);
                err = -EPROTONOSUPPORT;
                break;
        }
#undef X
        rcu_read_unlock();
        return err;
}
EXPORT_SYMBOL(xfrm_register_type);

/*
 * Remove @type as the handler for type->proto in address family @family.
 *
 * Warns if the installed handler is not @type, and clears the slot either
 * way.  Unknown protocols only trigger a warning.  Does nothing when no
 * afinfo exists for @family.
 *
 * NOTE(review): the rcu_read_unlock() has no matching lock in this
 * function; presumably xfrm_state_get_afinfo() returns with the RCU read
 * lock held on success - confirm against its definition.
 */
void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
{
        struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
        const struct xfrm_type **slot;

        if (unlikely(!afinfo))
                return;

        /* Pick the afinfo member that belongs to this protocol. */
        switch (type->proto) {
        case IPPROTO_COMP:
                slot = &afinfo->type_comp;
                break;
        case IPPROTO_AH:
                slot = &afinfo->type_ah;
                break;
        case IPPROTO_ESP:
                slot = &afinfo->type_esp;
                break;
        case IPPROTO_IPIP:
                slot = &afinfo->type_ipip;
                break;
        case IPPROTO_DSTOPTS:
                slot = &afinfo->type_dstopts;
                break;
        case IPPROTO_ROUTING:
                slot = &afinfo->type_routing;
                break;
        case IPPROTO_IPV6:
                slot = &afinfo->type_ipip6;
                break;
        default:
                slot = NULL;
                break;
        }

        if (slot) {
                WARN_ON(*slot != type);
                *slot = NULL;
        } else {
                WARN_ON(1);
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL(xfrm_unregister_type);

/*
 * Look up the xfrm_type registered for (@proto, @family) and take a
 * reference on its owning module.
 *
 * If nothing usable is found on the first pass, request_module() is asked
 * for "xfrm-type-<family>-<proto>" and the lookup is repeated once.
 *
 * Returns the type (release it with xfrm_put_type()), or NULL when there
 * is no afinfo for @family, no handler for @proto, or the module reference
 * could not be taken.
 */
static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
{
        const struct xfrm_type *type = NULL;
        struct xfrm_state_afinfo *afinfo;
        bool modload_tried = false;

        for (;;) {
                afinfo = xfrm_state_get_afinfo(family);
                if (unlikely(!afinfo))
                        return NULL;

                switch (proto) {
                case IPPROTO_COMP:
                        type = afinfo->type_comp;
                        break;
                case IPPROTO_AH:
                        type = afinfo->type_ah;
                        break;
                case IPPROTO_ESP:
                        type = afinfo->type_esp;
                        break;
                case IPPROTO_IPIP:
                        type = afinfo->type_ipip;
                        break;
                case IPPROTO_DSTOPTS:
                        type = afinfo->type_dstopts;
                        break;
                case IPPROTO_ROUTING:
                        type = afinfo->type_routing;
                        break;
                case IPPROTO_IPV6:
                        type = afinfo->type_ipip6;
                        break;
                default:
                        break;
                }

                /* A handler whose module cannot be pinned counts as absent. */
                if (unlikely(type && !try_module_get(type->owner)))
                        type = NULL;

                rcu_read_unlock();

                if (type || modload_tried)
                        break;

                /* One attempt at loading the module, then look again. */
                request_module("xfrm-type-%d-%d", family, proto);
                modload_tried = true;
        }

        return type;
}

/* Drop the module reference that xfrm_get_type() took on @type's owner. */
static void xfrm_put_type(const struct xfrm_type *type)
{
        struct module *owner = type->owner;

        module_put(owner);
}

/*
 * Register @type as the offload handler for type->proto in @family.
 * Only IPPROTO_ESP has a slot; an already occupied slot triggers a warning
 * and is overwritten.
 *
 * Returns 0 on success, -EAFNOSUPPORT when no afinfo exists for @family,
 * or -EPROTONOSUPPORT (after a warning) for any other protocol.
 *
 * NOTE(review): the rcu_read_unlock() has no matching lock in this
 * function; presumably xfrm_state_get_afinfo() returns with the RCU read
 * lock held on success - confirm against its definition.
 */
int xfrm_register_type_offload(const struct xfrm_type_offload *type,
                               unsigned short family)
{
        struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
        int err = 0;

        if (unlikely(!afinfo))
                return -EAFNOSUPPORT;

        if (type->proto == IPPROTO_ESP) {
                WARN_ON(afinfo->type_offload_esp);
                afinfo->type_offload_esp = type;
        } else {
                WARN_ON(1);
                err = -EPROTONOSUPPORT;
        }

        rcu_read_unlock();
        return err;
}
EXPORT_SYMBOL(xfrm_register_type_offload);

/*
 * Remove @type as the offload handler for type->proto in @family.
 * Only IPPROTO_ESP has a slot; a warning is raised if the installed
 * handler is not @type, and the slot is cleared either way.  Any other
 * protocol only triggers a warning.  Does nothing when no afinfo exists
 * for @family.
 *
 * NOTE(review): the rcu_read_unlock() has no matching lock in this
 * function; presumably xfrm_state_get_afinfo() returns with the RCU read
 * lock held on success - confirm against its definition.
 */
void xfrm_unregister_type_offload(const struct xfrm_type_offload *type,
                                  unsigned short family)
{
        struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);

        if (unlikely(!afinfo))
                return;

        if (type->proto == IPPROTO_ESP) {
                WARN_ON(afinfo->type_offload_esp != type);
                afinfo->type_offload_esp = NULL;
        } else {
                WARN_ON(1);
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL(xfrm_unregister_type_offload);

/*
 * Resolve the offload handler for @x (by x->props.family and x->id.proto)
 * and store it in x->type_offload, taking a reference on its owning module.
 *
 * @try_load: when true and no usable handler is found, request_module() is
 *            asked for "xfrm-offload-<family>-<proto>" and the lookup is
 *            repeated once.
 *
 * x->type_offload is set to NULL when there is no afinfo for the family,
 * no handler for the protocol (only IPPROTO_ESP has one), or the module
 * reference could not be taken.
 */
void xfrm_set_type_offload(struct xfrm_state *x, bool try_load)
{
        const struct xfrm_type_offload *type = NULL;
        struct xfrm_state_afinfo *afinfo;

        for (;;) {
                afinfo = xfrm_state_get_afinfo(x->props.family);
                if (unlikely(!afinfo))
                        break;

                if (x->id.proto == IPPROTO_ESP)
                        type = afinfo->type_offload_esp;

                /* A handler whose module cannot be pinned counts as absent. */
                if (type && !try_module_get(type->owner))
                        type = NULL;

                rcu_read_unlock();

                if (type || !try_load)
                        break;

                /* One attempt at loading the module, then look again. */
                request_module("xfrm-offload-%d-%d", x->props.family,
                               x->id.proto);
                try_load = false;
        }

        x->type_offload = type;
}
EXPORT_SYMBOL(xfrm_set_type_offload);

/*
 * IPv4 mode descriptors, indexed by XFRM_MODE_*.  Slots without an
 * initializer stay zero-filled, so their .family never equals AF_INET.
 */
static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = {
        [XFRM_MODE_TRANSPORT] = {
                .family = AF_INET,
                .encap = XFRM_MODE_TRANSPORT,
        },
        [XFRM_MODE_TUNNEL] = {
                .family = AF_INET,
                .encap = XFRM_MODE_TUNNEL,
                .flags = XFRM_MODE_FLAG_TUNNEL,
        },
        [XFRM_MODE_BEET] = {
                .family = AF_INET,
                .encap = XFRM_MODE_BEET,
                .flags = XFRM_MODE_FLAG_TUNNEL,
        },
        [XFRM_MODE_IPTFS] = {
                .family = AF_INET,
                .encap = XFRM_MODE_IPTFS,
                .flags = XFRM_MODE_FLAG_TUNNEL,
        },
};

/*
 * IPv6 mode descriptors, indexed by XFRM_MODE_*.  Slots without an
 * initializer stay zero-filled, so their .family never equals AF_INET6.
 */
static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = {
        [XFRM_MODE_TRANSPORT] = {
                .family = AF_INET6,
                .encap = XFRM_MODE_TRANSPORT,
        },
        [XFRM_MODE_TUNNEL] = {
                .family = AF_INET6,
                .encap = XFRM_MODE_TUNNEL,
                .flags = XFRM_MODE_FLAG_TUNNEL,
        },
        [XFRM_MODE_ROUTEOPTIMIZATION] = {
                .family = AF_INET6,
                .encap = XFRM_MODE_ROUTEOPTIMIZATION,
        },
        [XFRM_MODE_BEET] = {
                .family = AF_INET6,
                .encap = XFRM_MODE_BEET,
                .flags = XFRM_MODE_FLAG_TUNNEL,
        },
        [XFRM_MODE_IPTFS] = {
                .family = AF_INET6,
                .encap = XFRM_MODE_IPTFS,
                .flags = XFRM_MODE_FLAG_TUNNEL,
        },
};

static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
{
        const struct xfrm_mode *mode;

        if (unlikely(encap >= XFRM_MODE_MAX))
                return NULL;

        switch (family) {
        case AF_INET:
                mode = &xfrm4_mode_map[encap];
                if (mode->family == family)
                        return mode;
                break;
        case AF_INET6:
                mode = &xfrm6_mode_map[encap];
                if (mode->family == family)
                        return mode;
                break;
        default:
                break;
        }

        return NULL;
}

/* Per-mode callback tables, indexed by mode number; published with
 * rcu_assign_pointer() and read under rcu_read_lock().
 */
static const struct xfrm_mode_cbs  __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX];
/* Taken by the register/unregister functions around updates of
 * xfrm_mode_cbs_map[].
 */
static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock);

/*
 * Install @mode_cbs as the callback table for @mode.
 *
 * The pointer is published with rcu_assign_pointer() under
 * xfrm_mode_cbs_map_lock; any previous entry is overwritten.
 *
 * Returns 0 on success or -EINVAL when @mode is out of range.
 */
int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs)
{
        int err = -EINVAL;

        if (mode < XFRM_MODE_MAX) {
                spin_lock_bh(&xfrm_mode_cbs_map_lock);
                rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs);
                spin_unlock_bh(&xfrm_mode_cbs_map_lock);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL(xfrm_register_mode_cbs);

/*
 * Clear the callback table for @mode and wait for an RCU grace period
 * before returning.  Out-of-range modes are ignored.
 */
void xfrm_unregister_mode_cbs(u8 mode)
{
        if (mode < XFRM_MODE_MAX) {
                spin_lock_bh(&xfrm_mode_cbs_map_lock);
                RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL);
                spin_unlock_bh(&xfrm_mode_cbs_map_lock);

                synchronize_rcu();
        }
}
EXPORT_SYMBOL(xfrm_unregister_mode_cbs);

/*
 * Look up the callback table registered for @mode and take a reference on
 * its owning module.
 *
 * For XFRM_MODE_IPTFS only, a failed first lookup triggers
 * request_module("xfrm-iptfs") followed by one more lookup.
 *
 * Returns the table, or NULL when @mode is out of range, nothing is
 * registered, or the module reference could not be taken.
 */
static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode)
{
        const struct xfrm_mode_cbs *cbs;
        bool may_load = true;

        if (mode >= XFRM_MODE_MAX)
                return NULL;

        for (;;) {
                rcu_read_lock();

                cbs = rcu_dereference(xfrm_mode_cbs_map[mode]);
                /* An entry whose module cannot be pinned counts as absent. */
                if (cbs && !try_module_get(cbs->owner))
                        cbs = NULL;

                rcu_read_unlock();

                if (cbs || !may_load || mode != XFRM_MODE_IPTFS)
                        break;

                request_module("xfrm-iptfs");
                may_load = false;
        }

        return cbs;
}

/*
 * Return the memory of @x to xfrm_state_cache.  Only the structure itself
 * is released here; nothing it points to is touched.
 */
void xfrm_state_free(struct xfrm_state *x)
{
        kmem_cache_free(xfrm_state_cache, x);
}
EXPORT_SYMBOL(xfrm_state_free);

static void xfrm_state_delete_tunnel(struct xfrm_state *x);
/*
 * Final teardown of one state, called from xfrm_state_gc_task() for every
 * entry it took off the GC list.
 *
 * Runs the mode's destroy_state hook (if any), stops both timers, frees
 * the attached algorithm/encap/replay buffers, releases the type and
 * offload type, drops the xfrag page and the device offload state, frees
 * the security context and finally the state itself.
 */
static void xfrm_state_gc_destroy(struct xfrm_state *x)
{
        if (x->mode_cbs && x->mode_cbs->destroy_state)
                x->mode_cbs->destroy_state(x);
        /* Make sure neither timer handler can still run on this state. */
        hrtimer_cancel(&x->mtimer);
        timer_delete_sync(&x->rtimer);
        /* Freed with the zeroing variant, presumably because these carry
         * key material.
         */
        kfree_sensitive(x->aead);
        kfree_sensitive(x->aalg);
        kfree_sensitive(x->ealg);
        kfree(x->calg);
        kfree(x->encap);
        kfree(x->coaddr);
        kfree(x->replay_esn);
        kfree(x->preplay_esn);
        xfrm_unset_type_offload(x);
        xfrm_state_delete_tunnel(x);
        if (x->type) {
                /* Let the type clean up, then drop its module reference. */
                x->type->destructor(x);
                xfrm_put_type(x->type);
        }
        if (x->xfrag.page)
                put_page(x->xfrag.page);
        xfrm_dev_state_free(x);
        security_xfrm_state_free(x);
        xfrm_state_free(x);
}

static void xfrm_state_gc_task(struct work_struct *work)
{
        struct xfrm_state *x;
        struct hlist_node *tmp;
        struct hlist_head gc_list;

        spin_lock_bh(&xfrm_state_gc_lock);
        hlist_move_list(&xfrm_state_gc_list, &gc_list);
        spin_unlock_bh(&xfrm_state_gc_lock);

        synchronize_rcu();

        hlist_for_each_entry_safe(x, tmp, &gc_list, gclist)
                xfrm_state_gc_destroy(x);
}

/*
 * Lifetime timer callback for x->mtimer.
 *
 * Under x->lock, evaluates the hard and soft add/use time limits of the
 * state against the current wall-clock second:
 *  - a passed hard limit deletes the state and reports a hard expiry via
 *    km_state_expired(x, 1, 0);
 *  - a passed soft limit marks the state as dying and reports a soft
 *    expiry via km_state_expired(x, 0, 0);
 *  - otherwise the timer is re-armed for the nearest remaining deadline.
 *
 * Returns HRTIMER_RESTART when the timer was forwarded, HRTIMER_NORESTART
 * otherwise (dead or expired state, or no time limit configured).
 */
static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
{
        struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer);
        enum hrtimer_restart ret = HRTIMER_NORESTART;
        time64_t now = ktime_get_real_seconds();
        /* Seconds until the nearest deadline; TIME64_MAX means "none". */
        time64_t next = TIME64_MAX;
        int warn = 0;
        int err = 0;

        spin_lock(&x->lock);
        xfrm_dev_state_update_stats(x);

        if (x->km.state == XFRM_STATE_DEAD)
                goto out;
        if (x->km.state == XFRM_STATE_EXPIRED)
                goto expired;
        /* Hard limit counted from the time the state was added. */
        if (x->lft.hard_add_expires_seconds) {
                time64_t tmo = x->lft.hard_add_expires_seconds +
                        x->curlft.add_time - now;
                if (tmo <= 0) {
                        if (x->xflags & XFRM_SOFT_EXPIRE) {
                                /* enter hard expire without soft expire first?!
                                 * setting a new date could trigger this.
                                 * workaround: fix x->curlft.add_time by below:
                                 */
                                x->curlft.add_time = now - x->saved_tmo - 1;
                                tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
                        } else
                                goto expired;
                }
                if (tmo < next)
                        next = tmo;
        }
        /* Hard limit counted from first use; a zero use_time is replaced
         * by "now", so the full interval is still ahead.
         */
        if (x->lft.hard_use_expires_seconds) {
                time64_t tmo = x->lft.hard_use_expires_seconds +
                        (READ_ONCE(x->curlft.use_time) ? : now) - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        /* Already flagged as dying: skip the soft-limit checks. */
        if (x->km.dying)
                goto resched;
        /* Soft limit counted from add time; saved_tmo and XFRM_SOFT_EXPIRE
         * record a still-pending soft deadline for the workaround above.
         */
        if (x->lft.soft_add_expires_seconds) {
                time64_t tmo = x->lft.soft_add_expires_seconds +
                        x->curlft.add_time - now;
                if (tmo <= 0) {
                        warn = 1;
                        x->xflags &= ~XFRM_SOFT_EXPIRE;
                } else if (tmo < next) {
                        next = tmo;
                        x->xflags |= XFRM_SOFT_EXPIRE;
                        x->saved_tmo = tmo;
                }
        }
        /* Soft limit counted from first use. */
        if (x->lft.soft_use_expires_seconds) {
                time64_t tmo = x->lft.soft_use_expires_seconds +
                        (READ_ONCE(x->curlft.use_time) ? : now) - now;
                if (tmo <= 0)
                        warn = 1;
                else if (tmo < next)
                        next = tmo;
        }

        x->km.dying = warn;
        if (warn)
                km_state_expired(x, 0, 0);
resched:
        if (next != TIME64_MAX) {
                hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
                ret = HRTIMER_RESTART;
        }

        goto out;

expired:
        /* A larval (ACQ) state without an SPI is marked EXPIRED before the
         * delete attempt.
         */
        if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0)
                x->km.state = XFRM_STATE_EXPIRED;

        err = __xfrm_state_delete(x);
        if (!err)
                km_state_expired(x, 1, 0);

        xfrm_audit_state_delete(x, err ? 0 : 1, true);

out:
        spin_unlock(&x->lock);
        return ret;
}

static void xfrm_replay_timer_handler(struct timer_list *t);

struct xfrm_state *xfrm_state_alloc(struct net *net)
{
        struct xfrm_state *x;

        x = kmem_cache_zalloc(xfrm_state_cache, GFP_ATOMIC);

        if (x) {
                write_pnet(&x->xs_net, net);
                refcount_set(&x->refcnt, 1);
                atomic_set(&x->tunnel_users, 0);
                INIT_LIST_HEAD(&x->km.all);
                INIT_HLIST_NODE(&x->state_cache);
                INIT_HLIST_NODE(&x->bydst);
                INIT_HLIST_NODE(&x->bysrc);
                INIT_HLIST_NODE(&x->byspi);
                INIT_HLIST_NODE(&x->byseq);
                hrtimer_setup(&x->mtimer, xfrm_timer_handler, CLOCK_BOOTTIME,
                              HRTIMER_MODE_ABS_SOFT);
                timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0);
                x->curlft.add_time = ktime_get_real_seconds();
                x->lft.soft_byte_limit = XFRM_INF;
                x->lft.soft_packet_limit = XFRM_INF;
                x->lft.hard_byte_limit = XFRM_INF;
                x->lft.hard_packet_limit = XFRM_INF;
                x->replay_maxage = 0;
                x->replay_maxdiff = 0;
                x->pcpu_num = UINT_MAX;
                spin_lock_init(&x->lock);
                x->mode_data = NULL;
        }
        return x;
}
EXPORT_SYMBOL(xfrm_state_alloc);

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Tell the offload device (if any) that @x is being deleted and queue the
 * state on xfrm_state_dev_gc_list.  Does nothing when no device is
 * attached to the state.
 */
void xfrm_dev_state_delete(struct xfrm_state *x)
{
        struct net_device *dev = READ_ONCE(x->xso.dev);

        if (!dev)
                return;

        dev->xfrmdev_ops->xdo_dev_state_delete(dev, x);

        spin_lock_bh(&xfrm_state_dev_gc_lock);
        hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list);
        spin_unlock_bh(&xfrm_state_dev_gc_lock);
}
EXPORT_SYMBOL_GPL(xfrm_dev_state_delete);

/*
 * Release the device offload resources of @x.
 *
 * Unlinks the state from xfrm_state_dev_gc_list if it is on it, calls the
 * driver's optional xdo_dev_state_free hook, clears xso->dev, resets the
 * offload type to XFRM_DEV_OFFLOAD_UNSPECIFIED and drops the device
 * reference.  Does nothing when no device (or no xfrmdev_ops) is attached.
 */
void xfrm_dev_state_free(struct xfrm_state *x)
{
        struct xfrm_dev_offload *xso = &x->xso;
        struct net_device *dev = READ_ONCE(xso->dev);

        if (!dev || !dev->xfrmdev_ops)
                return;

        spin_lock_bh(&xfrm_state_dev_gc_lock);
        if (!hlist_unhashed(&x->dev_gclist))
                hlist_del(&x->dev_gclist);
        spin_unlock_bh(&xfrm_state_dev_gc_lock);

        if (dev->xfrmdev_ops->xdo_dev_state_free)
                dev->xfrmdev_ops->xdo_dev_state_free(dev, x);

        WRITE_ONCE(xso->dev, NULL);
        xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
        netdev_put(dev, &xso->dev_tracker);
}
#endif

/*
 * Queue @x for deferred destruction.
 *
 * Warns if the state is not XFRM_STATE_DEAD, then adds it to
 * xfrm_state_gc_list under xfrm_state_gc_lock and schedules
 * xfrm_state_gc_work; the actual teardown happens in xfrm_state_gc_task().
 */
void __xfrm_state_destroy(struct xfrm_state *x)
{
        WARN_ON(x->km.state != XFRM_STATE_DEAD);

        spin_lock_bh(&xfrm_state_gc_lock);
        hlist_add_head(&x->gclist, &xfrm_state_gc_list);
        spin_unlock_bh(&xfrm_state_gc_lock);
        schedule_work(&xfrm_state_gc_work);
}
EXPORT_SYMBOL(__xfrm_state_destroy);

/*
 * Mark @x dead and unlink it from every per-netns list it is on.
 *
 * xfrm_state_delete() calls this with x->lock held.
 *
 * Returns 0 when the state was deleted by this call, or -ESRCH when it
 * was already in XFRM_STATE_DEAD.
 */
int __xfrm_state_delete(struct xfrm_state *x)
{
        struct net *net = xs_net(x);

        if (x->km.state == XFRM_STATE_DEAD)
                return -ESRCH;

        x->km.state = XFRM_STATE_DEAD;

        spin_lock(&net->xfrm.xfrm_state_lock);
        list_del(&x->km.all);
        hlist_del_rcu(&x->bydst);
        hlist_del_rcu(&x->bysrc);
        /* byseq and byspi are only unlinked when the key they hash on is set. */
        if (x->km.seq)
                hlist_del_rcu(&x->byseq);
        if (!hlist_unhashed(&x->state_cache))
                hlist_del_rcu(&x->state_cache);
        if (!hlist_unhashed(&x->state_cache_input))
                hlist_del_rcu(&x->state_cache_input);
        if (x->id.spi)
                hlist_del_rcu(&x->byspi);
        net->xfrm.state_num--;
        xfrm_nat_keepalive_state_updated(x);
        spin_unlock(&net->xfrm.xfrm_state_lock);

        xfrm_dev_state_delete(x);
        xfrm_state_delete_tunnel(x);

        /* Every xfrm_state comes from xfrm_state_alloc(), which hands out
         * one reference; that is the reference released here.
         */
        xfrm_state_put(x);

        return 0;
}
EXPORT_SYMBOL(__xfrm_state_delete);

/*
 * Locked wrapper around __xfrm_state_delete(): takes x->lock (BH-safe)
 * for the duration of the call and passes its return value through.
 */
int xfrm_state_delete(struct xfrm_state *x)
{
        int ret;

        spin_lock_bh(&x->lock);
        ret = __xfrm_state_delete(x);
        spin_unlock_bh(&x->lock);

        return ret;
}
EXPORT_SYMBOL(xfrm_state_delete);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/*
 * Ask the security layer whether every state matching @proto may be
 * deleted.  Walks all bydst hash buckets of @net.
 *
 * On the first refusal an audit record for the failed delete is emitted
 * and the error from security_xfrm_state_delete() is returned; otherwise
 * returns 0.
 */
static inline int
xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid)
{
        int bucket, err = 0;

        for (bucket = 0; bucket <= net->xfrm.state_hmask; bucket++) {
                struct xfrm_state *x;

                hlist_for_each_entry(x, net->xfrm.state_bydst + bucket, bydst) {
                        if (!xfrm_id_proto_match(x->id.proto, proto))
                                continue;

                        err = security_xfrm_state_delete(x);
                        if (err != 0) {
                                xfrm_audit_state_delete(x, 0, task_valid);
                                return err;
                        }
                }
        }

        return err;
}

/*
 * Ask the security layer whether every state offloaded to @dev may be
 * deleted.  Walks all bydst hash buckets of @net.
 *
 * On the first refusal an audit record for the failed delete is emitted
 * and the error from security_xfrm_state_delete() is returned; otherwise
 * returns 0.
 */
static inline int
xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid)
{
        int bucket, err = 0;

        for (bucket = 0; bucket <= net->xfrm.state_hmask; bucket++) {
                struct xfrm_state *x;

                hlist_for_each_entry(x, net->xfrm.state_bydst + bucket, bydst) {
                        if (x->xso.dev != dev)
                                continue;

                        err = security_xfrm_state_delete(x);
                        if (err != 0) {
                                xfrm_audit_state_delete(x, 0, task_valid);
                                return err;
                        }
                }
        }

        return err;
}
#else
/*
 * Variant built when CONFIG_SECURITY_NETWORK_XFRM is not defined:
 * performs no check and always returns 0.
 */
static inline int
xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid)
{
        return 0;
}

/*
 * Variant built when CONFIG_SECURITY_NETWORK_XFRM is not defined:
 * performs no check and always returns 0.
 */
static inline int
xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid)
{
        return 0;
}
#endif

/*
 * Delete every state of @net whose protocol matches @proto, skipping
 * states for which xfrm_state_kern() is true.
 *
 * xfrm_state_flush_secctx_check() is run over the table first; if it
 * fails, nothing is deleted and its error is returned.
 *
 * Returns 0 when at least one state was deleted.  Otherwise returns the
 * check's error, -ESRCH when no candidate was found, or the result of the
 * last failed xfrm_state_delete().
 */
int xfrm_state_flush(struct net *net, u8 proto, bool task_valid)
{
        int i, err = 0, cnt = 0;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        err = xfrm_state_flush_secctx_check(net, proto, task_valid);
        if (err)
                goto out;

        err = -ESRCH;
        for (i = 0; i <= net->xfrm.state_hmask; i++) {
                struct xfrm_state *x;
restart:
                hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) {
                        if (!xfrm_state_kern(x) &&
                            xfrm_id_proto_match(x->id.proto, proto)) {
                                /* Take a reference before dropping the table
                                 * lock; __xfrm_state_delete() acquires
                                 * xfrm_state_lock itself.
                                 */
                                xfrm_state_hold(x);
                                spin_unlock_bh(&net->xfrm.xfrm_state_lock);

                                err = xfrm_state_delete(x);
                                xfrm_audit_state_delete(x, err ? 0 : 1,
                                                        task_valid);
                                xfrm_state_put(x);
                                if (!err)
                                        cnt++;

                                /* The chain was walked without the lock held
                                 * in between, so rescan bucket i from its
                                 * head.
                                 */
                                spin_lock_bh(&net->xfrm.xfrm_state_lock);
                                goto restart;
                        }
                }
        }
out:
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        /* Any successful deletion makes the whole flush a success. */
        if (cnt)
                err = 0;

        return err;
}
EXPORT_SYMBOL(xfrm_state_flush);

/*
 * Delete every state of @net that is offloaded to @dev (skipping states
 * for which xfrm_state_kern() is true) and release the device resources
 * of those states.
 *
 * After the hash-table pass, xfrm_state_dev_gc_list is swept for entries
 * still pointing at @dev and xfrm_dev_state_free() is called on each;
 * finally xfrm_flush_gc() is invoked.  Both of these run even when the
 * security check fails.
 *
 * Returns 0 when at least one state was deleted.  Otherwise returns the
 * error from xfrm_dev_state_flush_secctx_check(), -ESRCH when no
 * candidate was found, or the result of the last failed
 * xfrm_state_delete().
 */
int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid)
{
        struct xfrm_state *x;
        struct hlist_node *tmp;
        struct xfrm_dev_offload *xso;
        int i, err = 0, cnt = 0;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        err = xfrm_dev_state_flush_secctx_check(net, dev, task_valid);
        if (err)
                goto out;

        err = -ESRCH;
        for (i = 0; i <= net->xfrm.state_hmask; i++) {
restart:
                hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) {
                        xso = &x->xso;

                        if (!xfrm_state_kern(x) && xso->dev == dev) {
                                /* Take a reference before dropping the table
                                 * lock; __xfrm_state_delete() acquires
                                 * xfrm_state_lock itself.
                                 */
                                xfrm_state_hold(x);
                                spin_unlock_bh(&net->xfrm.xfrm_state_lock);

                                err = xfrm_state_delete(x);
                                xfrm_dev_state_free(x);

                                xfrm_audit_state_delete(x, err ? 0 : 1,
                                                        task_valid);
                                xfrm_state_put(x);
                                if (!err)
                                        cnt++;

                                /* Rescan bucket i from its head after
                                 * re-taking the lock.
                                 */
                                spin_lock_bh(&net->xfrm.xfrm_state_lock);
                                goto restart;
                        }
                }
        }
        if (cnt)
                err = 0;

out:
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        /* Second pass: states queued on the device GC list that still
         * reference @dev.  xfrm_dev_state_free() takes
         * xfrm_state_dev_gc_lock itself, so the lock is dropped around the
         * call and the walk restarts from the list head.
         */
        spin_lock_bh(&xfrm_state_dev_gc_lock);
restart_gc:
        hlist_for_each_entry_safe(x, tmp, &xfrm_state_dev_gc_list, dev_gclist) {
                xso = &x->xso;

                if (xso->dev == dev) {
                        spin_unlock_bh(&xfrm_state_dev_gc_lock);
                        xfrm_dev_state_free(x);
                        spin_lock_bh(&xfrm_state_dev_gc_lock);
                        goto restart_gc;
                }

        }
        spin_unlock_bh(&xfrm_state_dev_gc_lock);

        xfrm_flush_gc();

        return err;
}
EXPORT_SYMBOL(xfrm_dev_state_flush);

/*
 * Fill @si with SAD statistics for @net, sampled together under
 * xfrm_state_lock:
 *   sadcnt   - number of states (net->xfrm.state_num)
 *   sadhcnt  - current number of hash buckets (state_hmask + 1)
 *   sadhmcnt - value of xfrm_state_hashmax
 */
void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
{
        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        si->sadhmcnt = xfrm_state_hashmax;
        si->sadhcnt = net->xfrm.state_hmask + 1;
        si->sadcnt = net->xfrm.state_num;
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
}
EXPORT_SYMBOL(xfrm_sad_getinfo);

/*
 * Build a temporary IPv4 selector in @sel from flow @fl: /32 source and
 * destination prefixes, all-ones port masks, and the flow's protocol and
 * output interface.
 */
static void
__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
{
        const struct flowi4 *ip4 = &fl->u.ip4;

        sel->family = AF_INET;
        sel->proto = ip4->flowi4_proto;
        sel->ifindex = ip4->flowi4_oif;

        /* Host-exact addresses. */
        sel->daddr.a4 = ip4->daddr;
        sel->prefixlen_d = 32;
        sel->saddr.a4 = ip4->saddr;
        sel->prefixlen_s = 32;

        /* dport/sport as derived by the xfrm_flowi_*port() helpers. */
        sel->dport = xfrm_flowi_dport(fl, &ip4->uli);
        sel->dport_mask = htons(0xffff);
        sel->sport = xfrm_flowi_sport(fl, &ip4->uli);
        sel->sport_mask = htons(0xffff);
}

/*
 * Build a temporary IPv6 selector in @sel from flow @fl: /128 source and
 * destination prefixes, all-ones port masks, and the flow's protocol and
 * output interface.  The selector matches only the current session.
 */
static void
__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
{
        const struct flowi6 *ip6 = &fl->u.ip6;

        sel->family = AF_INET6;
        sel->proto = ip6->flowi6_proto;
        sel->ifindex = ip6->flowi6_oif;

        /* Host-exact addresses. */
        *(struct in6_addr *)&sel->daddr = ip6->daddr;
        sel->prefixlen_d = 128;
        *(struct in6_addr *)&sel->saddr = ip6->saddr;
        sel->prefixlen_s = 128;

        /* dport/sport as derived by the xfrm_flowi_*port() helpers. */
        sel->dport = xfrm_flowi_dport(fl, &ip6->uli);
        sel->dport_mask = htons(0xffff);
        sel->sport = xfrm_flowi_sport(fl, &ip6->uli);
        sel->sport_mask = htons(0xffff);
}

/*
 * Initialise a freshly allocated state @x as a temporary state for flow
 * @fl and template @tmpl.
 *
 * The selector is derived from @fl according to @family.  The id is
 * copied from the template; the outer addresses are taken from the
 * template, falling back to @daddr / @saddr where the template leaves
 * them as the any-address.  Mode, reqid and family come from the
 * template (family being the template's encap_family).
 *
 * Families other than AF_INET / AF_INET6 leave the corresponding part
 * untouched.
 */
static void
xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
                    const struct xfrm_tmpl *tmpl,
                    const xfrm_address_t *daddr, const xfrm_address_t *saddr,
                    unsigned short family)
{
        if (family == AF_INET)
                __xfrm4_init_tempsel(&x->sel, fl);
        else if (family == AF_INET6)
                __xfrm6_init_tempsel(&x->sel, fl);

        x->id = tmpl->id;

        if (tmpl->encap_family == AF_INET) {
                if (x->id.daddr.a4 == 0)
                        x->id.daddr.a4 = daddr->a4;
                x->props.saddr = tmpl->saddr;
                if (x->props.saddr.a4 == 0)
                        x->props.saddr.a4 = saddr->a4;
        } else if (tmpl->encap_family == AF_INET6) {
                if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
                        memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
                memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
                if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
                        memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
        }

        x->props.mode = tmpl->mode;
        x->props.reqid = tmpl->reqid;
        x->props.family = tmpl->encap_family;
}

/*
 * Snapshot of a netns' state hash tables.  Filled by xfrm_hash_ptrs_get()
 * so that a lookup works on table pointers and a mask that were read
 * together.
 */
struct xfrm_hash_state_ptrs {
        /* Copy of net->xfrm.state_bydst. */
        const struct hlist_head *bydst;
        /* Copy of net->xfrm.state_bysrc. */
        const struct hlist_head *bysrc;
        /* Copy of net->xfrm.state_byspi. */
        const struct hlist_head *byspi;
        /* Copy of net->xfrm.state_hmask (highest valid bucket index). */
        unsigned int hmask;
};

/*
 * Read the three state hash table pointers and the hash mask of @net
 * into @ptrs.
 *
 * The reads are wrapped in a seqcount read section on
 * net->xfrm.xfrm_state_hash_generation and repeated until the section
 * completes without the sequence having changed, so the four values in
 * @ptrs belong to the same generation.
 */
static void xfrm_hash_ptrs_get(const struct net *net, struct xfrm_hash_state_ptrs *ptrs)
{
        unsigned int sequence;

        do {
                sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation);

                ptrs->bydst = xfrm_state_deref_check(net->xfrm.state_bydst, net);
                ptrs->bysrc = xfrm_state_deref_check(net->xfrm.state_bysrc, net);
                ptrs->byspi = xfrm_state_deref_check(net->xfrm.state_byspi, net);
                ptrs->hmask = net->xfrm.state_hmask;
        } while (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence));
}

/*
 * Look up a state by (daddr, spi, proto, family) and mark in the byspi
 * table of @state_ptrs, taking the offload type of @xdo into account.
 *
 * With CONFIG_XFRM_OFFLOAD: a packet-offload lookup only considers
 * packet-offload states bound to the same device, and a non-packet
 * lookup skips packet-offload states.
 *
 * Walks the chain with the _rcu list iterator.  Returns the first match
 * on which xfrm_state_hold_rcu() succeeds, or NULL.
 */
static struct xfrm_state *__xfrm_state_lookup_all(const struct xfrm_hash_state_ptrs *state_ptrs,
                                                  u32 mark,
                                                  const xfrm_address_t *daddr,
                                                  __be32 spi, u8 proto,
                                                  unsigned short family,
                                                  struct xfrm_dev_offload *xdo)
{
        unsigned int bucket = __xfrm_spi_hash(daddr, spi, proto, family,
                                              state_ptrs->hmask);
        struct xfrm_state *x;

        hlist_for_each_entry_rcu(x, state_ptrs->byspi + bucket, byspi) {
#ifdef CONFIG_XFRM_OFFLOAD
                if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) {
                        /* HW states sit at the head of the list, so the
                         * first non-HW entry ends the search.
                         */
                        if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
                                break;

                        /* Packet offload: policy and SA must share the
                         * same device.
                         */
                        if (xdo->dev != x->xso.dev)
                                continue;
                } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) {
                        /* Skip HW policy for SW lookups */
                        continue;
                }
#endif
                if (x->props.family == family &&
                    x->id.spi == spi &&
                    x->id.proto == proto &&
                    xfrm_addr_equal(&x->id.daddr, daddr, family) &&
                    (mark & x->mark.m) == x->mark.v &&
                    xfrm_state_hold_rcu(x))
                        return x;
        }

        return NULL;
}

/*
 * Look up a state by (daddr, spi, proto, family) and mark in the byspi
 * table of @state_ptrs.
 *
 * Walks the chain with the _rcu list iterator.  Returns the first match
 * on which xfrm_state_hold_rcu() succeeds, or NULL.
 */
static struct xfrm_state *__xfrm_state_lookup(const struct xfrm_hash_state_ptrs *state_ptrs,
                                              u32 mark,
                                              const xfrm_address_t *daddr,
                                              __be32 spi, u8 proto,
                                              unsigned short family)
{
        unsigned int bucket = __xfrm_spi_hash(daddr, spi, proto, family,
                                              state_ptrs->hmask);
        struct xfrm_state *x;

        hlist_for_each_entry_rcu(x, state_ptrs->byspi + bucket, byspi) {
                if (x->props.family == family &&
                    x->id.spi == spi &&
                    x->id.proto == proto &&
                    xfrm_addr_equal(&x->id.daddr, daddr, family) &&
                    (mark & x->mark.m) == x->mark.v &&
                    xfrm_state_hold_rcu(x))
                        return x;
        }

        return NULL;
}

/*
 * Input-path state lookup by (daddr, spi, proto, family) and mark.
 *
 * The per-CPU list obtained from net->xfrm.state_cache_input is searched
 * first.  On a miss the byspi hash table is consulted via
 * __xfrm_state_lookup(); a hit there that is in XFRM_STATE_VALID is then
 * placed at the head of that per-CPU list (under xfrm_state_lock).
 *
 * Returns a state on which xfrm_state_hold_rcu() succeeded, or NULL.
 */
struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark,
                                           const xfrm_address_t *daddr,
                                           __be32 spi, u8 proto,
                                           unsigned short family)
{
        struct xfrm_hash_state_ptrs state_ptrs;
        struct hlist_head *state_cache_input;
        struct xfrm_state *x = NULL;

        state_cache_input = raw_cpu_ptr(net->xfrm.state_cache_input);

        rcu_read_lock();
        /* Fast path: per-CPU cache list. */
        hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) {
                if (x->props.family != family ||
                    x->id.spi       != spi ||
                    x->id.proto     != proto ||
                    !xfrm_addr_equal(&x->id.daddr, daddr, family))
                        continue;

                if ((mark & x->mark.m) != x->mark.v)
                        continue;
                if (!xfrm_state_hold_rcu(x))
                        continue;
                goto out;
        }

        /* Slow path: full byspi hash lookup. */
        xfrm_hash_ptrs_get(net, &state_ptrs);

        x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family);

        if (x && x->km.state == XFRM_STATE_VALID) {
                spin_lock_bh(&net->xfrm.xfrm_state_lock);
                /* If x is already linked on a cache list, unlink it first so
                 * that it ends up at the head of this one.
                 */
                if (hlist_unhashed(&x->state_cache_input)) {
                        hlist_add_head_rcu(&x->state_cache_input, state_cache_input);
                } else {
                        hlist_del_rcu(&x->state_cache_input);
                        hlist_add_head_rcu(&x->state_cache_input, state_cache_input);
                }
                spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        }

out:
        rcu_read_unlock();
        return x;
}
EXPORT_SYMBOL(xfrm_input_state_lookup);

/*
 * Look up a state by (daddr, saddr, proto, family) and mark in the bysrc
 * table of @state_ptrs.
 *
 * Walks the chain with the _rcu list iterator.  Returns the first match
 * on which xfrm_state_hold_rcu() succeeds, or NULL.
 */
static struct xfrm_state *__xfrm_state_lookup_byaddr(const struct xfrm_hash_state_ptrs *state_ptrs,
                                                     u32 mark,
                                                     const xfrm_address_t *daddr,
                                                     const xfrm_address_t *saddr,
                                                     u8 proto, unsigned short family)
{
        unsigned int bucket = __xfrm_src_hash(daddr, saddr, family,
                                              state_ptrs->hmask);
        struct xfrm_state *x;

        hlist_for_each_entry_rcu(x, state_ptrs->bysrc + bucket, bysrc) {
                if (x->props.family == family &&
                    x->id.proto == proto &&
                    xfrm_addr_equal(&x->id.daddr, daddr, family) &&
                    xfrm_addr_equal(&x->props.saddr, saddr, family) &&
                    (mark & x->mark.m) == x->mark.v &&
                    xfrm_state_hold_rcu(x))
                        return x;
        }

        return NULL;
}

static inline struct xfrm_state *
__xfrm_state_locate(struct xfrm_state *x, int use_spi, int family)
{
        struct xfrm_hash_state_ptrs state_ptrs;
        struct net *net = xs_net(x);
        u32 mark = x->mark.v & x->mark.m;

        xfrm_hash_ptrs_get(net, &state_ptrs);

        if (use_spi)
                return __xfrm_state_lookup(&state_ptrs, mark, &x->id.daddr,
                                           x->id.spi, x->id.proto, family);
        else
                return __xfrm_state_lookup_byaddr(&state_ptrs, mark,
                                                  &x->id.daddr,
                                                  &x->props.saddr,
                                                  x->id.proto, family);
}

/*
 * Schedule net->xfrm.state_hash_work when all of the following hold:
 * the caller saw a hash collision, the table has fewer buckets than
 * xfrm_state_hashmax, and there are more states than state_hmask.
 */
static void xfrm_hash_grow_check(struct net *net, int have_hash_collision)
{
        if (!have_hash_collision)
                return;

        /* Already at the maximum table size. */
        if ((net->xfrm.state_hmask + 1) >= xfrm_state_hashmax)
                return;

        if (net->xfrm.state_num <= net->xfrm.state_hmask)
                return;

        schedule_work(&net->xfrm.state_hash_work);
}

/*
 * Evaluate candidate state @x for flow @fl under policy @pol and update
 * the caller's search results.
 *
 * Resolution logic:
 * 1. A valid state whose selector matches becomes (or may replace)
 *    *best.
 * 2. A valid state with an inappropriate selector is skipped.
 *
 * Entering area of "sysdeps".
 *
 * 3. If the state is not valid, its selector is temporary and selects
 *    only the session that triggered the previous resolution.  The key
 *    manager will do something to install a state with a proper
 *    selector.
 *
 * Outputs:
 *   *best            - replaced by @x when @x is preferable (see below)
 *   *acq_in_progress - set to 1 for an ACQ state when there is no best
 *                      yet or the state belongs to @pcpu_id
 *   *error           - set to -ESRCH for a matching ERROR/EXPIRED state
 */
static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
                               const struct flowi *fl, unsigned short family,
                               struct xfrm_state **best, int *acq_in_progress,
                               int *error, unsigned int pcpu_id)
{
        switch (x->km.state) {
        case XFRM_STATE_VALID:
                if (x->sel.family &&
                    (x->sel.family != family ||
                     !xfrm_selector_match(&x->sel, fl, family)))
                        return;

                if (!security_xfrm_state_pol_flow_match(x, pol,
                                                        &fl->u.__fl_common))
                        return;

                /* A state pinned to another CPU is not usable here. */
                if (x->pcpu_num != UINT_MAX && x->pcpu_num != pcpu_id)
                        return;

                /* Prefer, in order: anything over nothing, a state pinned
                 * to this CPU over an unpinned best, a lower km.dying
                 * value, and on a tie the later add_time.
                 */
                if (!*best ||
                    ((*best)->pcpu_num == UINT_MAX && x->pcpu_num == pcpu_id) ||
                    (*best)->km.dying > x->km.dying ||
                    ((*best)->km.dying == x->km.dying &&
                     (*best)->curlft.add_time < x->curlft.add_time))
                        *best = x;
                break;

        case XFRM_STATE_ACQ:
                if (!*best || x->pcpu_num == pcpu_id)
                        *acq_in_progress = 1;
                break;

        case XFRM_STATE_ERROR:
        case XFRM_STATE_EXPIRED:
                if ((!x->sel.family ||
                     (x->sel.family == family &&
                      xfrm_selector_match(&x->sel, fl, family))) &&
                    security_xfrm_state_pol_flow_match(x, pol,
                                                       &fl->u.__fl_common))
                        *error = -ESRCH;
                break;

        default:
                break;
        }
}

/*
 * Find (or start acquiring) a state for flow @fl that satisfies template
 * @tmpl of policy @pol.
 *
 * Lookup order, all inside one rcu_read_lock() section:
 *   1. pol->state_cache_list, filtered with xfrm_state_addr_check();
 *   2. pol->state_cache_list again, filtered with xfrm_addr_equal() on
 *      the destination address only;
 *   3. the bydst hash chain for (daddr, saddr, reqid);
 *   4. the bydst hash chain for (daddr, all-zero saddr, reqid).
 * Candidates are ranked by xfrm_state_look_at().
 *
 * If no state is selected and no error or acquire is pending, a
 * temporary state is allocated, handed to the key managers through
 * km_query() and, on success, inserted into the hash tables in
 * XFRM_STATE_ACQ with a timer of net->xfrm.sysctl_acq_expires seconds.
 *
 * Returns a state with a reference taken through xfrm_state_hold_rcu(),
 * or NULL with *err set:
 *   -EAGAIN  an acquire is in progress, the reference could not be
 *            taken, or the hash generation changed during the lookup;
 *   -EEXIST  a state with the template's SPI already exists;
 *   -ESRCH   km_is_alive() failed, km_query() failed, or
 *            xfrm_state_look_at() reported a matching ERROR/EXPIRED
 *            state;
 *   -ENOMEM  xfrm_state_alloc() failed;
 *   or the error from security_xfrm_state_alloc_acquire() or the
 *   device's xdo_dev_state_add().
 * *err is not written when a state is returned.
 */
struct xfrm_state *
xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
                const struct flowi *fl, struct xfrm_tmpl *tmpl,
                struct xfrm_policy *pol, int *err,
                unsigned short family, u32 if_id)
{
        static xfrm_address_t saddr_wildcard = { };
        struct xfrm_hash_state_ptrs state_ptrs;
        struct net *net = xp_net(pol);
        unsigned int h, h_wildcard;
        struct xfrm_state *x, *x0, *to_put;
        int acquire_in_progress = 0;
        int error = 0;
        struct xfrm_state *best = NULL;
        u32 mark = pol->mark.v & pol->mark.m;
        unsigned short encap_family = tmpl->encap_family;
        unsigned int sequence;
        struct km_event c;
        unsigned int pcpu_id;
        bool cached = false;

        /* We need the cpu id just as a lookup key,
         * we don't require it to be stable.
         */
        pcpu_id = raw_smp_processor_id();

        to_put = NULL;

        /* Sampled here and re-checked just before returning. */
        sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation);

        rcu_read_lock();
        xfrm_hash_ptrs_get(net, &state_ptrs);

        /* Pass 1: policy state cache, xfrm_state_addr_check() filter.
         * NOTE(review): this call passes encap_family to
         * xfrm_state_look_at() while the other three passes pass family -
         * confirm this is intended.
         */
        hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) {
                if (x->props.family == encap_family &&
                    x->props.reqid == tmpl->reqid &&
                    (mark & x->mark.m) == x->mark.v &&
                    x->if_id == if_id &&
                    !(x->props.flags & XFRM_STATE_WILDRECV) &&
                    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
                    tmpl->mode == x->props.mode &&
                    tmpl->id.proto == x->id.proto &&
                    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
                        xfrm_state_look_at(pol, x, fl, encap_family,
                                           &best, &acquire_in_progress, &error, pcpu_id);
        }

        if (best)
                goto cached;

        /* Pass 2: policy state cache, destination address match only. */
        hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) {
                if (x->props.family == encap_family &&
                    x->props.reqid == tmpl->reqid &&
                    (mark & x->mark.m) == x->mark.v &&
                    x->if_id == if_id &&
                    !(x->props.flags & XFRM_STATE_WILDRECV) &&
                    xfrm_addr_equal(&x->id.daddr, daddr, encap_family) &&
                    tmpl->mode == x->props.mode &&
                    tmpl->id.proto == x->id.proto &&
                    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
                        xfrm_state_look_at(pol, x, fl, family,
                                           &best, &acquire_in_progress, &error, pcpu_id);
        }

cached:
        /* NOTE(review): this assignment is reached on every path through
         * the function (both via the goto above and by falling through),
         * so the "!cached" test near the end can never be true as written.
         * Confirm whether it was meant to be set only when best came from
         * the cache.
         */
        cached = true;
        if (best)
                goto found;
        else if (error)
                best = NULL;
        else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */
                WARN_ON(1);

        /* Pass 3: bydst chain for the exact (daddr, saddr, reqid) hash. */
        h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask);
        hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) {
#ifdef CONFIG_XFRM_OFFLOAD
                if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
                        if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
                                /* HW states are in the head of list, there is
                                 * no need to iterate further.
                                 */
                                break;

                        /* Packet offload: both policy and SA should
                         * have same device.
                         */
                        if (pol->xdo.dev != x->xso.dev)
                                continue;
                } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
                        /* Skip HW policy for SW lookups */
                        continue;
#endif
                if (x->props.family == encap_family &&
                    x->props.reqid == tmpl->reqid &&
                    (mark & x->mark.m) == x->mark.v &&
                    x->if_id == if_id &&
                    !(x->props.flags & XFRM_STATE_WILDRECV) &&
                    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
                    tmpl->mode == x->props.mode &&
                    tmpl->id.proto == x->id.proto &&
                    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
                        xfrm_state_look_at(pol, x, fl, family,
                                           &best, &acquire_in_progress, &error, pcpu_id);
        }
        if (best || acquire_in_progress)
                goto found;

        /* Pass 4: bydst chain hashed with an all-zero source address. */
        h_wildcard = __xfrm_dst_hash(daddr, &saddr_wildcard, tmpl->reqid,
                                     encap_family, state_ptrs.hmask);
        hlist_for_each_entry_rcu(x, state_ptrs.bydst + h_wildcard, bydst) {
#ifdef CONFIG_XFRM_OFFLOAD
                if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
                        if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
                                /* HW states are in the head of list, there is
                                 * no need to iterate further.
                                 */
                                break;

                        /* Packet offload: both policy and SA should
                         * have same device.
                         */
                        if (pol->xdo.dev != x->xso.dev)
                                continue;
                } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
                        /* Skip HW policy for SW lookups */
                        continue;
#endif
                if (x->props.family == encap_family &&
                    x->props.reqid == tmpl->reqid &&
                    (mark & x->mark.m) == x->mark.v &&
                    x->if_id == if_id &&
                    !(x->props.flags & XFRM_STATE_WILDRECV) &&
                    xfrm_addr_equal(&x->id.daddr, daddr, encap_family) &&
                    tmpl->mode == x->props.mode &&
                    tmpl->id.proto == x->id.proto &&
                    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
                        xfrm_state_look_at(pol, x, fl, family,
                                           &best, &acquire_in_progress, &error, pcpu_id);
        }

found:
        /* NOTE(review): x is assigned here only when the condition holds;
         * otherwise it keeps the cursor value left by the last list walk
         * (NULL after a walk that ran to the end, possibly non-NULL after
         * one of the "break" exits above) - confirm that is intended.
         */
        if (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) ||
            (best && (best->pcpu_num == pcpu_id)))
                x = best;

        if (!x && !error && !acquire_in_progress) {
                /* The template names an SPI: if a state with that SPI
                 * already exists, report -EEXIST instead of acquiring.
                 */
                if (tmpl->id.spi &&
                    (x0 = __xfrm_state_lookup_all(&state_ptrs, mark, daddr,
                                                  tmpl->id.spi, tmpl->id.proto,
                                                  encap_family,
                                                  &pol->xdo)) != NULL) {
                        to_put = x0;
                        error = -EEXIST;
                        goto out;
                }

                c.net = net;
                /* If the KMs have no listeners (yet...), avoid allocating an SA
                 * for each and every packet - garbage collection might not
                 * handle the flood.
                 */
                if (!km_is_alive(&c)) {
                        error = -ESRCH;
                        goto out;
                }

                x = xfrm_state_alloc(net);
                if (x == NULL) {
                        error = -ENOMEM;
                        goto out;
                }
                /* Initialize temporary state matching only
                 * to current session. */
                xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
                memcpy(&x->mark, &pol->mark, sizeof(x->mark));
                x->if_id = if_id;
                if ((pol->flags & XFRM_POLICY_CPU_ACQUIRE) && best)
                        x->pcpu_num = pcpu_id;

                error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
                if (error) {
                        /* Mark dead and drop the reference after leaving the
                         * RCU section (see to_put handling below).
                         */
                        x->km.state = XFRM_STATE_DEAD;
                        to_put = x;
                        x = NULL;
                        goto out;
                }
#ifdef CONFIG_XFRM_OFFLOAD
                if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
                        struct xfrm_dev_offload *xdo = &pol->xdo;
                        struct xfrm_dev_offload *xso = &x->xso;
                        struct net_device *dev = xdo->dev;

                        /* Mirror the policy's packet-offload settings onto
                         * the temporary state and register it with the
                         * device.
                         */
                        xso->type = XFRM_DEV_OFFLOAD_PACKET;
                        xso->dir = xdo->dir;
                        xso->dev = dev;
                        xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ;
                        netdev_hold(dev, &xso->dev_tracker, GFP_ATOMIC);
                        error = dev->xfrmdev_ops->xdo_dev_state_add(dev, x,
                                                                    NULL);
                        if (error) {
                                /* Undo the offload setup done just above. */
                                xso->dir = 0;
                                netdev_put(dev, &xso->dev_tracker);
                                xso->dev = NULL;
                                xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
                                x->km.state = XFRM_STATE_DEAD;
                                to_put = x;
                                x = NULL;
                                goto out;
                        }
                }
#endif
                if (km_query(x, tmpl, pol) == 0) {
                        /* Key manager accepted the request: publish the
                         * state as ACQ in the global list and hash tables.
                         */
                        spin_lock_bh(&net->xfrm.xfrm_state_lock);
                        x->km.state = XFRM_STATE_ACQ;
                        x->dir = XFRM_SA_DIR_OUT;
                        list_add(&x->km.all, &net->xfrm.state_all);
                        h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
                        XFRM_STATE_INSERT(bydst, &x->bydst,
                                          net->xfrm.state_bydst + h,
                                          x->xso.type);
                        h = xfrm_src_hash(net, daddr, saddr, encap_family);
                        XFRM_STATE_INSERT(bysrc, &x->bysrc,
                                          net->xfrm.state_bysrc + h,
                                          x->xso.type);
                        INIT_HLIST_NODE(&x->state_cache);
                        if (x->id.spi) {
                                h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
                                XFRM_STATE_INSERT(byspi, &x->byspi,
                                                  net->xfrm.state_byspi + h,
                                                  x->xso.type);
                        }
                        if (x->km.seq) {
                                h = xfrm_seq_hash(net, x->km.seq);
                                XFRM_STATE_INSERT(byseq, &x->byseq,
                                                  net->xfrm.state_byseq + h,
                                                  x->xso.type);
                        }
                        x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
                        hrtimer_start(&x->mtimer,
                                      ktime_set(net->xfrm.sysctl_acq_expires, 0),
                                      HRTIMER_MODE_REL_SOFT);
                        net->xfrm.state_num++;
                        /* A non-NULL next pointer means the bucket already
                         * held another entry.
                         */
                        xfrm_hash_grow_check(net, x->bydst.next != NULL);
                        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
                } else {
#ifdef CONFIG_XFRM_OFFLOAD
                        struct xfrm_dev_offload *xso = &x->xso;

                        if (xso->type == XFRM_DEV_OFFLOAD_PACKET) {
                                xfrm_dev_state_delete(x);
                                xfrm_dev_state_free(x);
                        }
#endif
                        x->km.state = XFRM_STATE_DEAD;
                        to_put = x;
                        x = NULL;
                        error = -ESRCH;
                }

                /* Use the already installed 'fallback' while the CPU-specific
                 * SA acquire is handled*/
                if (best)
                        x = best;
        }
out:
        if (x) {
                /* Take the caller's reference; failure means the state
                 * could not be held.
                 */
                if (!xfrm_state_hold_rcu(x)) {
                        *err = -EAGAIN;
                        x = NULL;
                }
        } else {
                *err = acquire_in_progress ? -EAGAIN : error;
        }

        /* Add a valid result to the policy's state cache if it is not
         * linked on a cache list yet (but see the NOTE at the "cached"
         * label regarding !cached).
         */
        if (x && x->km.state == XFRM_STATE_VALID && !cached &&
            (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || x->pcpu_num == pcpu_id)) {
                spin_lock_bh(&net->xfrm.xfrm_state_lock);
                if (hlist_unhashed(&x->state_cache))
                        hlist_add_head_rcu(&x->state_cache, &pol->state_cache_list);
                spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        }

        rcu_read_unlock();
        /* Deferred put of a conflicting or abandoned state. */
        if (to_put)
                xfrm_state_put(to_put);

        /* The hash generation changed since it was sampled at entry:
         * discard the result and ask the caller to retry.
         */
        if (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)) {
                *err = -EAGAIN;
                if (x) {
                        xfrm_state_put(x);
                        x = NULL;
                }
        }

        return x;
}

/*
 * Look up a VALID state in the bydst hash that matches the given family,
 * reqid, mark, if_id, address pair, mode and protocol.  States flagged
 * XFRM_STATE_WILDRECV are never returned.
 *
 * The search runs under net->xfrm.xfrm_state_lock.  On success the state is
 * returned with a reference taken via xfrm_state_hold(); otherwise NULL.
 */
struct xfrm_state *
xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
                    xfrm_address_t *daddr, xfrm_address_t *saddr,
                    unsigned short family, u8 mode, u8 proto, u32 reqid)
{
        struct xfrm_state *found = NULL;
        struct xfrm_state *pos;
        unsigned int bucket;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        bucket = xfrm_dst_hash(net, daddr, saddr, reqid, family);
        hlist_for_each_entry(pos, net->xfrm.state_bydst + bucket, bydst) {
                if (pos->props.family != family ||
                    pos->props.reqid != reqid)
                        continue;
                if ((mark & pos->mark.m) != pos->mark.v ||
                    pos->if_id != if_id)
                        continue;
                if (pos->props.flags & XFRM_STATE_WILDRECV)
                        continue;
                if (!xfrm_state_addr_check(pos, daddr, saddr, family))
                        continue;
                if (mode != pos->props.mode ||
                    proto != pos->id.proto)
                        continue;
                if (pos->km.state != XFRM_STATE_VALID)
                        continue;

                /* First full match wins; pin it before dropping the lock. */
                xfrm_state_hold(pos);
                found = pos;
                break;
        }
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        return found;
}
EXPORT_SYMBOL(xfrm_stateonly_find);

/*
 * Find a state by SPI and address family by walking the per-netns list of
 * all states under net->xfrm.xfrm_state_lock.
 *
 * Returns the first match with a reference taken via xfrm_state_hold(), or
 * NULL when nothing matches.
 */
struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
                                              unsigned short family)
{
        struct xfrm_state *x;
        struct xfrm_state_walk *w;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        list_for_each_entry(w, &net->xfrm.state_all, all) {
                /*
                 * xfrm_state_walk() links its walker cursor into state_all
                 * and skips XFRM_STATE_DEAD entries before container_of().
                 * Apply the same filter here so a cursor, which need not be
                 * embedded in a struct xfrm_state, is never dereferenced as
                 * one.
                 */
                if (w->state == XFRM_STATE_DEAD)
                        continue;

                x = container_of(w, struct xfrm_state, km);
                if (x->props.family != family ||
                        x->id.spi != spi)
                        continue;

                xfrm_state_hold(x);
                spin_unlock_bh(&net->xfrm.xfrm_state_lock);
                return x;
        }
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        return NULL;
}
EXPORT_SYMBOL(xfrm_state_lookup_byspi);

/*
 * Scan every byspi bucket for a state with the given SPI and protocol,
 * regardless of address.
 *
 * The scan runs inside an RCU read-side section.  A candidate whose
 * reference cannot be taken (xfrm_state_hold_rcu() fails) is skipped and the
 * scan continues.  Returns a referenced state, or NULL if none was found.
 */
static struct xfrm_state *xfrm_state_lookup_spi_proto(struct net *net, __be32 spi, u8 proto)
{
        struct xfrm_state *cand;
        unsigned int bucket;

        rcu_read_lock();
        for (bucket = 0; bucket <= net->xfrm.state_hmask; bucket++) {
                hlist_for_each_entry_rcu(cand, &net->xfrm.state_byspi[bucket], byspi) {
                        if (cand->id.spi != spi || cand->id.proto != proto)
                                continue;
                        if (!xfrm_state_hold_rcu(cand))
                                continue;

                        rcu_read_unlock();
                        return cand;
                }
        }
        rcu_read_unlock();

        return NULL;
}

/*
 * Link @x into the per-netns list of all states and into the lookup hash
 * tables, then start its timers and account for it in state_num.
 *
 * No locking is done here; the callers visible in this file take
 * net->xfrm.xfrm_state_lock around the call.
 */
static void __xfrm_state_insert(struct xfrm_state *x)
{
        struct net *net = xs_net(x);
        unsigned int h;

        list_add(&x->km.all, &net->xfrm.state_all);

        /* Sanitize mark before store */
        x->mark.v &= x->mark.m;

        /* Always hashed by (daddr, saddr, reqid, family) ... */
        h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
                          x->props.reqid, x->props.family);
        XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h,
                          x->xso.type);

        /* ... and by (daddr, saddr, family). */
        h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family);
        XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h,
                          x->xso.type);

        /* Only states that already have an SPI go into the byspi table. */
        if (x->id.spi) {
                h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
                                  x->props.family);

                XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h,
                                  x->xso.type);
        }

        /* Only states carrying a non-zero km.seq go into the byseq table. */
        if (x->km.seq) {
                h = xfrm_seq_hash(net, x->km.seq);

                XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h,
                                  x->xso.type);
        }

        /* Arm mtimer one second from now; rtimer only if replay_maxage is set. */
        hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
        if (x->replay_maxage)
                mod_timer(&x->rtimer, jiffies + x->replay_maxage);

        net->xfrm.state_num++;

        /*
         * The second argument is true when x's bydst node has a successor,
         * i.e. it shares its bucket with at least one other entry.
         */
        xfrm_hash_grow_check(net, x->bydst.next != NULL);
        xfrm_nat_keepalive_state_updated(x);
}

/*
 * Increment the genid of every state in @xnew's bydst bucket that shares its
 * family, reqid, if_id, pcpu_num, (masked) mark and address pair.
 *
 * Caller must hold net->xfrm.xfrm_state_lock.
 */
static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
{
        struct net *net = xs_net(xnew);
        const unsigned short family = xnew->props.family;
        const u32 reqid = xnew->props.reqid;
        const u32 mark = xnew->mark.v & xnew->mark.m;
        const u32 if_id = xnew->if_id;
        const u32 cpu_id = xnew->pcpu_num;
        struct xfrm_state *pos;
        unsigned int bucket;

        bucket = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr,
                               reqid, family);
        hlist_for_each_entry(pos, net->xfrm.state_bydst + bucket, bydst) {
                if (pos->props.family != family ||
                    pos->props.reqid != reqid)
                        continue;
                if (pos->if_id != if_id ||
                    pos->pcpu_num != cpu_id)
                        continue;
                if ((mark & pos->mark.m) != pos->mark.v)
                        continue;
                if (!xfrm_addr_equal(&pos->id.daddr, &xnew->id.daddr, family))
                        continue;
                if (!xfrm_addr_equal(&pos->props.saddr, &xnew->props.saddr, family))
                        continue;

                pos->genid++;
        }
}

/*
 * Insert @x into its namespace's state tables.
 *
 * Takes net->xfrm.xfrm_state_lock, bumps the genid of the states that
 * __xfrm_state_bump_genids() considers equivalent to @x, then links @x in.
 * Unlike xfrm_state_add(), no check for an already existing state is made.
 */
void xfrm_state_insert(struct xfrm_state *x)
{
        struct net *net = xs_net(x);

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        __xfrm_state_bump_genids(x);
        __xfrm_state_insert(x);
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
}
EXPORT_SYMBOL(xfrm_state_insert);

/*
 * Find, and optionally create, an ACQ (larval) state for the given flow.
 *
 * Caller must hold net->xfrm.xfrm_state_lock.
 *
 * The lookup walks the bydst bucket for (daddr, saddr, reqid, family) and
 * returns the first state in XFRM_STATE_ACQ with no SPI that matches reqid,
 * mode, family, proto, mark, pcpu_num and both addresses, with a reference
 * taken.
 *
 * NOTE(review): @if_id is stored in a newly created state but is not part of
 * the lookup match above - confirm this is intended.
 *
 * If nothing matches and @create is zero, NULL is returned.  Otherwise a new
 * ACQ state is allocated, filled in, linked into state_all and the bydst and
 * bysrc hashes, and returned; NULL is returned if the allocation fails.
 */
static struct xfrm_state *__find_acq_core(struct net *net,
                                          const struct xfrm_mark *m,
                                          unsigned short family, u8 mode,
                                          u32 reqid, u32 if_id, u32 pcpu_num, u8 proto,
                                          const xfrm_address_t *daddr,
                                          const xfrm_address_t *saddr,
                                          int create)
{
        unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family);
        struct xfrm_state *x;
        u32 mark = m->v & m->m;

        hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
                if (x->props.reqid  != reqid ||
                    x->props.mode   != mode ||
                    x->props.family != family ||
                    x->km.state     != XFRM_STATE_ACQ ||
                    x->id.spi       != 0 ||
                    x->id.proto            != proto ||
                    (mark & x->mark.m) != x->mark.v ||
                    x->pcpu_num != pcpu_num ||
                    !xfrm_addr_equal(&x->id.daddr, daddr, family) ||
                    !xfrm_addr_equal(&x->props.saddr, saddr, family))
                        continue;

                xfrm_state_hold(x);
                return x;
        }

        if (!create)
                return NULL;

        x = xfrm_state_alloc(net);
        if (likely(x)) {
                /* Selector and identity are host-to-host (full prefix length). */
                switch (family) {
                case AF_INET:
                        x->sel.daddr.a4 = daddr->a4;
                        x->sel.saddr.a4 = saddr->a4;
                        x->sel.prefixlen_d = 32;
                        x->sel.prefixlen_s = 32;
                        x->props.saddr.a4 = saddr->a4;
                        x->id.daddr.a4 = daddr->a4;
                        break;

                case AF_INET6:
                        x->sel.daddr.in6 = daddr->in6;
                        x->sel.saddr.in6 = saddr->in6;
                        x->sel.prefixlen_d = 128;
                        x->sel.prefixlen_s = 128;
                        x->props.saddr.in6 = saddr->in6;
                        x->id.daddr.in6 = daddr->in6;
                        break;
                }

                x->pcpu_num = pcpu_num;
                x->km.state = XFRM_STATE_ACQ;
                x->id.proto = proto;
                x->props.family = family;
                x->props.mode = mode;
                x->props.reqid = reqid;
                x->if_id = if_id;
                x->mark.v = m->v;
                x->mark.m = m->m;
                /* The larval state lives for sysctl_acq_expires seconds. */
                x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
                /* Extra reference taken before the state is published below. */
                xfrm_state_hold(x);
                hrtimer_start(&x->mtimer,
                              ktime_set(net->xfrm.sysctl_acq_expires, 0),
                              HRTIMER_MODE_REL_SOFT);
                list_add(&x->km.all, &net->xfrm.state_all);
                /* h still holds the bydst hash computed at function entry. */
                XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h,
                                  x->xso.type);
                h = xfrm_src_hash(net, daddr, saddr, family);
                XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h,
                                  x->xso.type);

                net->xfrm.state_num++;

                xfrm_hash_grow_check(net, x->bydst.next != NULL);
        }

        return x;
}

static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num);

/*
 * Add a new state @x to its namespace.
 *
 * Returns 0 on success, or -EEXIST if __xfrm_state_locate() already finds a
 * state for @x (in which case @x is not inserted).
 *
 * For protocols matching IPSEC_PROTO_ANY (use_spi), an ACQ placeholder that
 * @x resolves is looked up - first by km.seq, then by flow via
 * __find_acq_core() - and deleted once @x has been inserted.
 */
int xfrm_state_add(struct xfrm_state *x)
{
        struct net *net = xs_net(x);
        struct xfrm_state *x1, *to_put;
        int family;
        int err;
        u32 mark = x->mark.v & x->mark.m;
        int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);

        family = x->props.family;

        to_put = NULL;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);

        x1 = __xfrm_state_locate(x, use_spi, family);
        if (x1) {
                /* Duplicate: just drop the reference once unlocked. */
                to_put = x1;
                x1 = NULL;
                err = -EEXIST;
                goto out;
        }

        if (use_spi && x->km.seq) {
                x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq, x->pcpu_num);
                /*
                 * An ACQ with the same seq but a different proto or daddr is
                 * not the one @x resolves: release it and keep looking.
                 */
                if (x1 && ((x1->id.proto != x->id.proto) ||
                    !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) {
                        to_put = x1;
                        x1 = NULL;
                }
        }

        /* Fall back to a lookup by flow; create == 0, so nothing is allocated. */
        if (use_spi && !x1)
                x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
                                     x->props.reqid, x->if_id, x->pcpu_num, x->id.proto,
                                     &x->id.daddr, &x->props.saddr, 0);

        __xfrm_state_bump_genids(x);
        __xfrm_state_insert(x);
        err = 0;

out:
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        /* x1, if set, is the ACQ placeholder superseded by @x. */
        if (x1) {
                xfrm_state_delete(x1);
                xfrm_state_put(x1);
        }

        if (to_put)
                xfrm_state_put(to_put);

        return err;
}
EXPORT_SYMBOL(xfrm_state_add);

#ifdef CONFIG_XFRM_MIGRATE
/*
 * Give @x a security context equivalent to @security.
 *
 * A temporary struct xfrm_user_sec_ctx is built from @security (header plus
 * the context string placed right behind it), handed to
 * security_xfrm_state_alloc() and freed again.
 *
 * Returns 0 on success, -ENOMEM if the temporary buffer cannot be allocated,
 * or the error reported by security_xfrm_state_alloc().
 */
static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *security)
{
        struct xfrm_user_sec_ctx *tmp;
        int total = sizeof(*tmp) + security->ctx_len;
        int ret;

        tmp = kmalloc(total, GFP_KERNEL);
        if (!tmp)
                return -ENOMEM;

        tmp->exttype = XFRMA_SEC_CTX;
        tmp->len = total;
        tmp->ctx_doi = security->ctx_doi;
        tmp->ctx_alg = security->ctx_alg;
        tmp->ctx_len = security->ctx_len;
        memcpy(tmp + 1, security->ctx_str, security->ctx_len);

        ret = security_xfrm_state_alloc(x, tmp);
        kfree(tmp);

        return ret;
}

/*
 * Allocate a new state that duplicates @orig and retarget it to the new
 * endpoints described by @m.
 *
 * @encap, when non-NULL, replaces @orig's encapsulation template in the
 * clone; otherwise @orig->encap (if any) is duplicated.
 *
 * Returns the new state, or NULL if the allocation or any of the sub-object
 * copies fails.  On failure the partially built clone is marked
 * XFRM_STATE_DEAD and released with xfrm_state_put().
 */
static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig,
                                           struct xfrm_encap_tmpl *encap,
                                           struct xfrm_migrate *m)
{
        struct net *net = xs_net(orig);
        struct xfrm_state *x = xfrm_state_alloc(net);
        if (!x)
                goto out;

        memcpy(&x->id, &orig->id, sizeof(x->id));
        memcpy(&x->sel, &orig->sel, sizeof(x->sel));
        memcpy(&x->lft, &orig->lft, sizeof(x->lft));
        x->props.mode = orig->props.mode;
        x->props.replay_window = orig->props.replay_window;
        x->props.reqid = orig->props.reqid;
        x->props.family = orig->props.family;
        x->props.saddr = orig->props.saddr;

        /* Algorithm descriptors are deep-copied; each clone can fail. */
        if (orig->aalg) {
                x->aalg = xfrm_algo_auth_clone(orig->aalg);
                if (!x->aalg)
                        goto error;
        }
        x->props.aalgo = orig->props.aalgo;

        if (orig->aead) {
                x->aead = xfrm_algo_aead_clone(orig->aead);
                x->geniv = orig->geniv;
                if (!x->aead)
                        goto error;
        }
        if (orig->ealg) {
                x->ealg = xfrm_algo_clone(orig->ealg);
                if (!x->ealg)
                        goto error;
        }
        x->props.ealgo = orig->props.ealgo;

        if (orig->calg) {
                x->calg = xfrm_algo_clone(orig->calg);
                if (!x->calg)
                        goto error;
        }
        x->props.calgo = orig->props.calgo;

        /* A caller-supplied encap template takes precedence over orig's. */
        if (encap || orig->encap) {
                if (encap)
                        x->encap = kmemdup(encap, sizeof(*x->encap),
                                        GFP_KERNEL);
                else
                        x->encap = kmemdup(orig->encap, sizeof(*x->encap),
                                        GFP_KERNEL);

                if (!x->encap)
                        goto error;
        }

        if (orig->security)
                if (clone_security(x, orig->security))
                        goto error;

        if (orig->coaddr) {
                x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr),
                                    GFP_KERNEL);
                if (!x->coaddr)
                        goto error;
        }

        if (orig->replay_esn) {
                if (xfrm_replay_clone(x, orig))
                        goto error;
        }

        memcpy(&x->mark, &orig->mark, sizeof(x->mark));
        memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark));

        x->props.flags = orig->props.flags;
        x->props.extra_flags = orig->props.extra_flags;

        x->pcpu_num = orig->pcpu_num;
        x->if_id = orig->if_id;
        x->tfcpad = orig->tfcpad;
        x->replay_maxdiff = orig->replay_maxdiff;
        x->replay_maxage = orig->replay_maxage;
        /* Current lifetime counters and replay state carry over as-is. */
        memcpy(&x->curlft, &orig->curlft, sizeof(x->curlft));
        x->km.state = orig->km.state;
        x->km.seq = orig->km.seq;
        x->replay = orig->replay;
        x->preplay = orig->preplay;
        x->mapping_maxage = orig->mapping_maxage;
        x->lastused = orig->lastused;
        /* Pending mapping fields are reset rather than copied. */
        x->new_mapping = 0;
        x->new_mapping_sport = 0;
        x->dir = orig->dir;

        /* Let the mode implementation copy its private state, if it has any. */
        x->mode_cbs = orig->mode_cbs;
        if (x->mode_cbs && x->mode_cbs->clone_state) {
                if (x->mode_cbs->clone_state(x, orig))
                        goto error;
        }


        /* Finally overwrite family and endpoint addresses with the new ones. */
        x->props.family = m->new_family;
        memcpy(&x->id.daddr, &m->new_daddr, sizeof(x->id.daddr));
        memcpy(&x->props.saddr, &m->new_saddr, sizeof(x->props.saddr));

        return x;

 error:
        /* Mark the half-built clone DEAD before dropping its reference. */
        x->km.state = XFRM_STATE_DEAD;
        xfrm_state_put(x);
out:
        return NULL;
}

/*
 * Locate the state that a migrate request @m refers to.
 *
 * When @m carries a reqid the bydst hash is searched and the reqid must
 * match; otherwise the bysrc hash is searched.  In both cases mode, proto
 * and the old address pair must match, and a non-zero @if_id must equal the
 * state's if_id.
 *
 * Runs under net->xfrm.xfrm_state_lock.  Returns the first match with a
 * reference taken, or NULL.
 */
struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net,
                                                u32 if_id)
{
        struct xfrm_state *match = NULL;
        struct xfrm_state *pos;
        unsigned int bucket;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);

        if (m->reqid) {
                bucket = xfrm_dst_hash(net, &m->old_daddr, &m->old_saddr,
                                       m->reqid, m->old_family);
                hlist_for_each_entry(pos, net->xfrm.state_bydst + bucket, bydst) {
                        if (pos->props.mode != m->mode ||
                            pos->id.proto != m->proto)
                                continue;
                        /* m->reqid is known to be non-zero in this branch. */
                        if (pos->props.reqid != m->reqid)
                                continue;
                        if (if_id != 0 && pos->if_id != if_id)
                                continue;
                        if (!xfrm_addr_equal(&pos->id.daddr, &m->old_daddr,
                                             m->old_family))
                                continue;
                        if (!xfrm_addr_equal(&pos->props.saddr, &m->old_saddr,
                                             m->old_family))
                                continue;

                        xfrm_state_hold(pos);
                        match = pos;
                        break;
                }
        } else {
                bucket = xfrm_src_hash(net, &m->old_daddr, &m->old_saddr,
                                       m->old_family);
                hlist_for_each_entry(pos, net->xfrm.state_bysrc + bucket, bysrc) {
                        if (pos->props.mode != m->mode ||
                            pos->id.proto != m->proto)
                                continue;
                        if (if_id != 0 && pos->if_id != if_id)
                                continue;
                        if (!xfrm_addr_equal(&pos->id.daddr, &m->old_daddr,
                                             m->old_family))
                                continue;
                        if (!xfrm_addr_equal(&pos->props.saddr, &m->old_saddr,
                                             m->old_family))
                                continue;

                        xfrm_state_hold(pos);
                        match = pos;
                        break;
                }
        }

        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        return match;
}
EXPORT_SYMBOL(xfrm_migrate_state_find);

/*
 * Create a migrated copy of @x for the new endpoints in @m and install it.
 *
 * @encap is passed through to xfrm_state_clone_and_setup(); @xuo, when
 * non-NULL, requests hardware offload for the new state via
 * xfrm_dev_state_add(), with errors reported through @extack.
 *
 * Returns the newly installed state, or NULL on any failure.  On failure
 * the clone is marked XFRM_STATE_DEAD and its reference dropped.
 */
struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
                                      struct xfrm_migrate *m,
                                      struct xfrm_encap_tmpl *encap,
                                      struct net *net,
                                      struct xfrm_user_offload *xuo,
                                      struct netlink_ext_ack *extack)
{
        struct xfrm_state *xc;

        xc = xfrm_state_clone_and_setup(x, encap, m);
        if (!xc)
                return NULL;

        if (xfrm_init_state(xc) < 0)
                goto error;

        /* configure the hardware if offload is requested */
        if (xuo && xfrm_dev_state_add(net, xc, xuo, extack))
                goto error;

        /* add state */
        if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) {
                /* a care is needed when the destination address of the
                   state is to be updated as it is a part of triplet */
                /* Destination unchanged: insert without a duplicate check. */
                xfrm_state_insert(xc);
        } else {
                /* Destination changed: xfrm_state_add() may refuse a duplicate. */
                if (xfrm_state_add(xc) < 0)
                        goto error_add;
        }

        return xc;
error_add:
        /* Undo the offload set up above before tearing the clone down. */
        if (xuo)
                xfrm_dev_state_delete(xc);
error:
        xc->km.state = XFRM_STATE_DEAD;
        xfrm_state_put(xc);
        return NULL;
}
EXPORT_SYMBOL(xfrm_state_migrate);
#endif

/*
 * Update the installed state that corresponds to @x.
 *
 * Phase 1 (under net->xfrm.xfrm_state_lock) locates the existing state x1:
 *   - none found, or a direction mismatch:  -ESRCH
 *   - x1 is a kernel-owned state (xfrm_state_kern()):  -EEXIST
 *   - x1 is an ACQ placeholder:  @x is inserted as the real state and x1 is
 *     deleted afterwards; returns 0.
 *
 * Phase 2 (under x1->lock) applies to an already established x1: encap,
 * coaddr, selector (non-SPI protocols only), lifetimes, smark and if_id are
 * copied from @x into x1.  Returns 0 on success, in which case @x is marked
 * XFRM_STATE_DEAD and one reference on it is dropped; returns -EINVAL if x1
 * is not VALID or the encapsulation settings are incompatible.
 */
int xfrm_state_update(struct xfrm_state *x)
{
        struct xfrm_state *x1, *to_put;
        int err;
        int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
        struct net *net = xs_net(x);

        to_put = NULL;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        x1 = __xfrm_state_locate(x, use_spi, x->props.family);

        err = -ESRCH;
        if (!x1)
                goto out;

        if (xfrm_state_kern(x1)) {
                to_put = x1;
                err = -EEXIST;
                goto out;
        }

        if (x1->km.state == XFRM_STATE_ACQ) {
                /* A zero x->dir is accepted against any x1->dir here. */
                if (x->dir && x1->dir != x->dir) {
                        to_put = x1;
                        goto out;
                }

                __xfrm_state_insert(x);
                /* x == NULL below means "x replaced the ACQ placeholder". */
                x = NULL;
        } else {
                /* err is still -ESRCH on this path. */
                if (x1->dir != x->dir) {
                        to_put = x1;
                        goto out;
                }
        }
        err = 0;

out:
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        if (to_put)
                xfrm_state_put(to_put);

        if (err)
                return err;

        if (!x) {
                /* The placeholder has been superseded; remove it. */
                xfrm_state_delete(x1);
                xfrm_state_put(x1);
                return 0;
        }

        err = -EINVAL;
        spin_lock_bh(&x1->lock);
        if (likely(x1->km.state == XFRM_STATE_VALID)) {
                /*
                 * Encap may only be refreshed when both sides have it with the
                 * same encap_type; having it on one side only is an error.
                 */
                if (x->encap && x1->encap &&
                    x->encap->encap_type == x1->encap->encap_type)
                        memcpy(x1->encap, x->encap, sizeof(*x1->encap));
                else if (x->encap || x1->encap)
                        goto fail;

                if (x->coaddr && x1->coaddr) {
                        memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr));
                }
                if (!use_spi && memcmp(&x1->sel, &x->sel, sizeof(x1->sel)))
                        memcpy(&x1->sel, &x->sel, sizeof(x1->sel));
                memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
                x1->km.dying = 0;

                /* Re-arm the lifetime timer and re-evaluate the new limits. */
                hrtimer_start(&x1->mtimer, ktime_set(1, 0),
                              HRTIMER_MODE_REL_SOFT);
                if (READ_ONCE(x1->curlft.use_time))
                        xfrm_state_check_expire(x1);

                /*
                 * smark and if_id are changed, and genids bumped, under the
                 * state table lock, which is taken inside x1->lock here.
                 */
                if (x->props.smark.m || x->props.smark.v || x->if_id) {
                        spin_lock_bh(&net->xfrm.xfrm_state_lock);

                        if (x->props.smark.m || x->props.smark.v)
                                x1->props.smark = x->props.smark;

                        if (x->if_id)
                                x1->if_id = x->if_id;

                        __xfrm_state_bump_genids(x1);
                        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
                }

                err = 0;
                /* @x was only a carrier for the new values; retire it. */
                x->km.state = XFRM_STATE_DEAD;
                __xfrm_state_put(x);
        }

fail:
        spin_unlock_bh(&x1->lock);

        xfrm_state_put(x1);

        return err;
}
EXPORT_SYMBOL(xfrm_state_update);

/*
 * Check @x against its byte and packet lifetime limits.
 *
 * Returns -EINVAL after marking the state XFRM_STATE_EXPIRED when a hard
 * limit has been reached, otherwise 0.  Reaching a soft limit for the first
 * time sets km.dying and calls km_state_expired(x, 0, 0).
 *
 * As a side effect, curlft.use_time is initialised to the current wall-clock
 * seconds if it was still zero.
 */
int xfrm_state_check_expire(struct xfrm_state *x)
{
        /* All counters which are needed to decide if state is expired
         * are handled by SW for non-packet offload modes. Simply skip
         * the following update and save extra boilerplate in drivers.
         */
        if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
                xfrm_dev_state_update_stats(x);

        if (!READ_ONCE(x->curlft.use_time))
                WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds());

        if (x->curlft.bytes >= x->lft.hard_byte_limit ||
            x->curlft.packets >= x->lft.hard_packet_limit) {
                x->km.state = XFRM_STATE_EXPIRED;
                /* Zero relative timeout: run the mtimer handler right away. */
                hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT);
                return -EINVAL;
        }

        /* km.dying ensures the soft-limit event is raised only once. */
        if (!x->km.dying &&
            (x->curlft.bytes >= x->lft.soft_byte_limit ||
             x->curlft.packets >= x->lft.soft_packet_limit)) {
                x->km.dying = 1;
                km_state_expired(x, 0, 0);
        }
        return 0;
}
EXPORT_SYMBOL(xfrm_state_check_expire);

void xfrm_state_update_stats(struct net *net)
{
        struct xfrm_state *x;
        int i;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        for (i = 0; i <= net->xfrm.state_hmask; i++) {
                hlist_for_each_entry(x, net->xfrm.state_bydst + i, bydst)
                        xfrm_dev_state_update_stats(x);
        }
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
}

/*
 * Look up a state by (mark, daddr, spi, proto, family).
 *
 * The current hash table pointers are snapshotted and searched with
 * __xfrm_state_lookup() inside an RCU read-side section; whatever that
 * helper returns is passed straight back to the caller.
 */
struct xfrm_state *
xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi,
                  u8 proto, unsigned short family)
{
        struct xfrm_hash_state_ptrs tables;
        struct xfrm_state *result;

        rcu_read_lock();

        xfrm_hash_ptrs_get(net, &tables);
        result = __xfrm_state_lookup(&tables, mark, daddr, spi, proto, family);

        rcu_read_unlock();

        return result;
}
EXPORT_SYMBOL(xfrm_state_lookup);

/*
 * Look up a state by (mark, daddr, saddr, proto, family).
 *
 * The current hash table pointers are snapshotted and searched with
 * __xfrm_state_lookup_byaddr() inside an RCU read-side section; whatever
 * that helper returns is passed straight back to the caller.
 */
struct xfrm_state *
xfrm_state_lookup_byaddr(struct net *net, u32 mark,
                         const xfrm_address_t *daddr, const xfrm_address_t *saddr,
                         u8 proto, unsigned short family)
{
        struct xfrm_hash_state_ptrs tables;
        struct xfrm_state *result;

        rcu_read_lock();

        xfrm_hash_ptrs_get(net, &tables);
        result = __xfrm_state_lookup_byaddr(&tables, mark, daddr, saddr,
                                            proto, family);

        rcu_read_unlock();

        return result;
}
EXPORT_SYMBOL(xfrm_state_lookup_byaddr);

/*
 * Locked front end for __find_acq_core(): find, and if @create is non-zero
 * create, the ACQ state for the given flow while holding
 * net->xfrm.xfrm_state_lock.
 *
 * Returns whatever __find_acq_core() returns (a state or NULL).
 */
struct xfrm_state *
xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
              u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr,
              const xfrm_address_t *saddr, int create, unsigned short family)
{
        struct xfrm_state *acq;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        acq = __find_acq_core(net, mark, family, mode, reqid, if_id,
                              pcpu_num, proto, daddr, saddr, create);
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        return acq;
}
EXPORT_SYMBOL(xfrm_find_acq);

#ifdef CONFIG_XFRM_SUB_POLICY
#if IS_ENABLED(CONFIG_IPV6)
/*
 * Distribution counting sort function for xfrm_state and xfrm_tmpl.
 *
 * Moves the @n pointers in @src into @dst ordered by the class that @cmp
 * assigns to each element; elements of the same class keep their relative
 * order.  Every consumed @src slot is set to NULL.
 *
 * No bounds are checked here: the code relies on @n not exceeding
 * XFRM_MAX_DEPTH and on @cmp returning values in 1 .. @maxclass - 1, with
 * @maxclass itself not exceeding XFRM_MAX_DEPTH.
 */
static void
__xfrm6_sort(void **dst, void **src, int n,
             int (*cmp)(const void *p), int maxclass)
{
        int count[XFRM_MAX_DEPTH] = { };
        int class[XFRM_MAX_DEPTH];
        int i;

        /* Pass 1: remember each element's class and count class sizes. */
        for (i = 0; i < n; i++) {
                int c = cmp(src[i]);

                class[i] = c;
                count[c]++;
        }

        /*
         * Pass 2: prefix sums.  Afterwards count[k] is the number of elements
         * with class <= k, so count[c - 1] is the first output slot for
         * class c (count[0] stays 0 as long as no element has class 0).
         */
        for (i = 2; i < maxclass; i++)
                count[i] += count[i - 1];

        /* Pass 3: count[c - 1] doubles as the running cursor for class c. */
        for (i = 0; i < n; i++) {
                dst[count[class[i] - 1]++] = src[i];
                src[i] = NULL;
        }
}

/* Sort class of an xfrm_state (lower classes are placed first):
 *
 * class 1: IPsec transport mode, any protocol except AH
 * class 2: MIPv6 route optimization or inbound trigger
 *          (only when CONFIG_IPV6_MIP6 is enabled)
 * class 3: IPsec transport mode, AH
 * class 4: tunnel, BEET or IPTFS mode
 * class 5: everything else
 */
static int __xfrm6_state_sort_cmp(const void *p)
{
        const struct xfrm_state *state = p;
        int rank;

        switch (state->props.mode) {
        case XFRM_MODE_TRANSPORT:
                rank = (state->id.proto == IPPROTO_AH) ? 3 : 1;
                break;
#if IS_ENABLED(CONFIG_IPV6_MIP6)
        case XFRM_MODE_ROUTEOPTIMIZATION:
        case XFRM_MODE_IN_TRIGGER:
                rank = 2;
                break;
#endif
        case XFRM_MODE_TUNNEL:
        case XFRM_MODE_BEET:
        case XFRM_MODE_IPTFS:
                rank = 4;
                break;
        default:
                rank = 5;
                break;
        }

        return rank;
}

/* Sort class of an xfrm_tmpl (lower classes are placed first):
 *
 * class 1: IPsec transport mode
 * class 2: MIPv6 route optimization or inbound trigger
 *          (only when CONFIG_IPV6_MIP6 is enabled)
 * class 3: tunnel, BEET or IPTFS mode
 * class 4: everything else
 */
static int __xfrm6_tmpl_sort_cmp(const void *p)
{
        const struct xfrm_tmpl *tmpl = p;
        int rank;

        switch (tmpl->mode) {
        case XFRM_MODE_TRANSPORT:
                rank = 1;
                break;
#if IS_ENABLED(CONFIG_IPV6_MIP6)
        case XFRM_MODE_ROUTEOPTIMIZATION:
        case XFRM_MODE_IN_TRIGGER:
                rank = 2;
                break;
#endif
        case XFRM_MODE_TUNNEL:
        case XFRM_MODE_BEET:
        case XFRM_MODE_IPTFS:
                rank = 3;
                break;
        default:
                rank = 4;
                break;
        }

        return rank;
}
#else
/*
 * Stubs for builds without CONFIG_IPV6: every state is reported as class 5
 * and every template as class 4, i.e. the "others" class of the real
 * comparators.
 */
static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; }
static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; }

/*
 * Stub for builds without CONFIG_IPV6: no reordering is done, the first @n
 * pointers of @src are copied to @dst in order.  @cmp and @maxclass are
 * ignored and @src is left untouched.
 */
static inline void
__xfrm6_sort(void **dst, void **src, int n,
             int (*cmp)(const void *p), int maxclass)
{
        int idx = 0;

        while (idx < n) {
                dst[idx] = src[idx];
                idx++;
        }
}
#endif /* CONFIG_IPV6 */

/*
 * Order @n templates from @src into @dst.
 *
 * For AF_INET6 the templates are sorted by __xfrm6_tmpl_sort_cmp() class via
 * __xfrm6_sort(); for every other family they are copied in their existing
 * order.
 */
void
xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
               unsigned short family)
{
        int i;

        if (family == AF_INET6) {
                __xfrm6_sort((void **)dst, (void **)src, n,
                             __xfrm6_tmpl_sort_cmp, 5);
                return;
        }

        for (i = 0; i < n; i++)
                dst[i] = src[i];
}

/*
 * Order @n states from @src into @dst.
 *
 * For AF_INET6 the states are sorted by __xfrm6_state_sort_cmp() class via
 * __xfrm6_sort(); for every other family they are copied in their existing
 * order.
 */
void
xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
                unsigned short family)
{
        int i;

        if (family == AF_INET6) {
                __xfrm6_sort((void **)dst, (void **)src, n,
                             __xfrm6_state_sort_cmp, 6);
                return;
        }

        for (i = 0; i < n; i++)
                dst[i] = src[i];
}
#endif

/* Silly enough, but I'm too lazy to build a resolution list */

/*
 * Find the ACQ state with sequence number @seq in the byseq hash whose mark
 * matches @mark and whose pcpu_num equals @pcpu_num.
 *
 * No locking is done here; the callers visible in this file hold
 * net->xfrm.xfrm_state_lock.  Returns the state with a reference taken, or
 * NULL.
 */
static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num)
{
        unsigned int bucket = xfrm_seq_hash(net, seq);
        struct xfrm_state *pos;

        hlist_for_each_entry_rcu(pos, net->xfrm.state_byseq + bucket, byseq) {
                if (pos->km.seq != seq)
                        continue;
                if ((mark & pos->mark.m) != pos->mark.v)
                        continue;
                if (pos->pcpu_num != pcpu_num)
                        continue;
                if (pos->km.state != XFRM_STATE_ACQ)
                        continue;

                xfrm_state_hold(pos);
                return pos;
        }

        return NULL;
}

/*
 * Locked front end for __xfrm_find_acq_byseq(): performs the lookup while
 * holding net->xfrm.xfrm_state_lock and returns its result (a referenced
 * ACQ state or NULL).
 */
struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num)
{
        struct xfrm_state *acq;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        acq = __xfrm_find_acq_byseq(net, mark, seq, pcpu_num);
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);

        return acq;
}
EXPORT_SYMBOL(xfrm_find_acq_byseq);

/*
 * Hand out the next acquire sequence number from a function-local atomic
 * counter.  Zero is never returned: if the counter yields 0 it is simply
 * incremented again.
 */
u32 xfrm_get_acqseq(void)
{
        static atomic_t acqseq;
        u32 seq;

        seq = atomic_inc_return(&acqseq);
        while (!seq)
                seq = atomic_inc_return(&acqseq);

        return seq;
}
EXPORT_SYMBOL(xfrm_get_acqseq);

/*
 * Validate an SPI allocation request.
 *
 * @proto must be AH, ESP or IPCOMP; for IPCOMP @max must fit in 16 bits; and
 * @min must not exceed @max.  Returns 0 when the request is acceptable,
 * otherwise -EINVAL with an explanatory message stored in @extack.
 */
int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack)
{
        if (proto == IPPROTO_COMP) {
                /* IPCOMP spi is 16-bits. */
                if (max >= 0x10000) {
                        NL_SET_ERR_MSG(extack, "IPCOMP SPI must be <= 65535");
                        return -EINVAL;
                }
        } else if (proto != IPPROTO_AH && proto != IPPROTO_ESP) {
                NL_SET_ERR_MSG(extack, "Invalid protocol, must be one of AH, ESP, IPCOMP");
                return -EINVAL;
        }

        if (min > max) {
                NL_SET_ERR_MSG(extack, "Invalid SPI range: min > max");
                return -EINVAL;
        }

        return 0;
}
EXPORT_SYMBOL(verify_spi_info);

/*
 * Assign an unused SPI from [@low, @high] to @x and hash @x into the byspi
 * table.
 *
 * Runs under x->lock; net->xfrm.xfrm_state_lock is additionally taken around
 * each uniqueness check and the insertion.
 *
 * Returns:
 *   0             @x already had an SPI, or a free one was assigned
 *   -ENOENT       @x is DEAD, or no free SPI was found
 *   -ERESTARTSYS  a signal became pending while searching
 *
 * Candidates are drawn at random (or are simply @low when @low == @high);
 * at most high - low + 1 draws are made, so a free value in the range may
 * be missed.
 *
 * NOTE(review): range is computed in u32, so low == 0 with high == 0xffffffff
 * wraps to 0 and the loop body never runs - confirm callers cannot pass that.
 */
int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high,
                   struct netlink_ext_ack *extack)
{
        struct net *net = xs_net(x);
        unsigned int h;
        struct xfrm_state *x0;
        int err = -ENOENT;
        u32 range = high - low + 1;
        __be32 newspi = 0;

        spin_lock_bh(&x->lock);
        if (x->km.state == XFRM_STATE_DEAD) {
                NL_SET_ERR_MSG(extack, "Target ACQUIRE is in DEAD state");
                goto unlock;
        }

        /* Nothing to do if an SPI has already been assigned. */
        err = 0;
        if (x->id.spi)
                goto unlock;

        err = -ENOENT;

        for (h = 0; h < range; h++) {
                u32 spi = (low == high) ? low : get_random_u32_inclusive(low, high);
                /* SPI 0 is never handed out. */
                if (spi == 0)
                        goto next;
                newspi = htonl(spi);

                spin_lock_bh(&net->xfrm.xfrm_state_lock);
                /* The SPI must be unused for this protocol across all addresses. */
                x0 = xfrm_state_lookup_spi_proto(net, newspi, x->id.proto);
                if (!x0) {
                        x->id.spi = newspi;
                        /*
                         * h is reused as the hash bucket here; that is safe
                         * because the loop is left immediately afterwards.
                         */
                        h = xfrm_spi_hash(net, &x->id.daddr, newspi, x->id.proto, x->props.family);
                        XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type);
                        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
                        err = 0;
                        goto unlock;
                }
                /* Taken: drop the reference the lookup gave us and retry. */
                xfrm_state_put(x0);
                spin_unlock_bh(&net->xfrm.xfrm_state_lock);

next:
                if (signal_pending(current)) {
                        err = -ERESTARTSYS;
                        goto unlock;
                }

                /* A single-value range gets exactly one attempt. */
                if (low == high)
                        break;
        }

        if (err)
                NL_SET_ERR_MSG(extack, "No SPI available in the requested range");

unlock:
        spin_unlock_bh(&x->lock);

        return err;
}
EXPORT_SYMBOL(xfrm_alloc_spi);

/*
 * Check state @x against the optional address @filter.
 *
 * A NULL filter accepts everything.  Otherwise the family must agree when
 * the filter names AF_INET or AF_INET6, and both the source and destination
 * addresses must match under the filter's prefix lengths.
 */
static bool __xfrm_state_filter_match(struct xfrm_state *x,
                                      struct xfrm_address_filter *filter)
{
        bool family_filtered;

        if (!filter)
                return true;

        family_filtered = filter->family == AF_INET ||
                          filter->family == AF_INET6;
        if (family_filtered && x->props.family != filter->family)
                return false;

        if (!addr_match(&x->props.saddr, &filter->saddr, filter->splen))
                return false;

        return addr_match(&x->id.daddr, &filter->daddr, filter->dplen);
}

/*
 * xfrm_state_walk - resumable iteration over the states of @net
 * @net:  namespace whose state_all list is walked
 * @walk: cursor initialised by xfrm_state_walk_init()
 * @func: callback invoked for each matching state with the running count
 * @data: opaque pointer handed to @func
 *
 * Entries that are dead, whose protocol does not match walk->proto, or that
 * fail walk->filter are skipped.  If @func returns non-zero the cursor is
 * linked into the list just before the current entry so that a later call
 * resumes there, and that value is returned.
 *
 * Return: 0 when the walk finished, -ENOENT when it finished without ever
 * delivering a state, otherwise the non-zero value returned by @func.
 */
int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
                    int (*func)(struct xfrm_state *, int, void*),
                    void *data)
{
        struct xfrm_state *state;
        struct xfrm_state_walk *x;
        int err = 0;

        /* Entries were delivered and the cursor is unlinked: nothing left to do. */
        if (walk->seq != 0 && list_empty(&walk->all))
                return 0;

        spin_lock_bh(&net->xfrm.xfrm_state_lock);
        /* Start at the list head, or right after a cursor parked by an earlier call. */
        if (list_empty(&walk->all))
                x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all);
        else
                x = list_first_entry(&walk->all, struct xfrm_state_walk, all);
        list_for_each_entry_from(x, &net->xfrm.state_all, all) {
                /* Skips dead states; cursors are initialised as DEAD by xfrm_state_walk_init(). */
                if (x->state == XFRM_STATE_DEAD)
                        continue;
                /* The list entry is the 'km' member embedded in struct xfrm_state. */
                state = container_of(x, struct xfrm_state, km);
                if (!xfrm_id_proto_match(state->id.proto, walk->proto))
                        continue;
                if (!__xfrm_state_filter_match(state, walk->filter))
                        continue;
                err = func(state, walk->seq, data);
                if (err) {
                        /* Park the cursor just before this entry so it is retried next time. */
                        list_move_tail(&walk->all, &x->all);
                        goto out;
                }
                walk->seq++;
        }
        /* Reached the end without delivering anything, in this or earlier calls. */
        if (walk->seq == 0) {
                err = -ENOENT;
                goto out;
        }
        /* Walk complete: unlink the cursor so the early-return check above fires. */
        list_del_init(&walk->all);
out:
        spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        return err;
}
EXPORT_SYMBOL(xfrm_state_walk);

/*
 * Prepare @walk for use with xfrm_state_walk().
 *
 * The cursor starts unlinked with a zero sequence count.  Its state is set
 * to XFRM_STATE_DEAD, which is the value xfrm_state_walk() skips over when
 * it meets an entry on the list.
 */
void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
                          struct xfrm_address_filter *filter)
{
        /* Selection criteria supplied by the caller. */
        walk->proto = proto;
        walk->filter = filter;

        /* Cursor bookkeeping. */
        walk->seq = 0;
        walk->state = XFRM_STATE_DEAD;
        INIT_LIST_HEAD(&walk->all);
}
EXPORT_SYMBOL(xfrm_state_walk_init);

/*
 * Tear down a walk cursor: free its filter and, if the cursor is still
 * linked into a list, unlink it under the per-net state lock.
 */
void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net)
{
        kfree(walk->filter);

        if (!list_empty(&walk->all)) {
                spin_lock_bh(&net->xfrm.xfrm_state_lock);
                list_del(&walk->all);
                spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        }
}
EXPORT_SYMBOL(xfrm_state_walk_done);

/*
 * Replay timer callback.  For a VALID state, either send a replay-timeout
 * notification (when async events are enabled for the state's netns) or
 * record the missed timeout by setting XFRM_TIME_DEFER in x->xflags.
 */
static void xfrm_replay_timer_handler(struct timer_list *t)
{
        struct xfrm_state *x = timer_container_of(x, t, rtimer);

        spin_lock(&x->lock);

        if (x->km.state != XFRM_STATE_VALID)
                goto out;

        if (!xfrm_aevent_is_on(xs_net(x)))
                x->xflags |= XFRM_TIME_DEFER;
        else
                xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT);

out:
        spin_unlock(&x->lock);
}

static LIST_HEAD(xfrm_km_list);

void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
        struct xfrm_mgr *km;

        rcu_read_lock();
        list_for_each_entry_rcu(km, &xfrm_km_list, list)
                if (km->notify_policy)
                        km->notify_policy(xp, dir, c);
        rcu_read_unlock();
}

void km_state_notify(struct xfrm_state *x, const struct km_event *c)
{
        struct xfrm_mgr *km;
        rcu_read_lock();
        list_for_each_entry_rcu(km, &xfrm_km_list, list)
                if (km->notify)
                        km->notify(x, c);
        rcu_read_unlock();
}

EXPORT_SYMBOL(km_policy_notify);
EXPORT_SYMBOL(km_state_notify);

/*
 * km_state_expired - broadcast an XFRM_MSG_EXPIRE event for state @x
 * @x:      state the event refers to
 * @hard:   stored in the event's data.hard field (presumably non-zero for a
 *          hard expiry - confirm against the key manager implementations)
 * @portid: stored in the event's portid field
 *
 * The event is delivered through km_state_notify().
 */
void km_state_expired(struct xfrm_state *x, int hard, u32 portid)
{
        /*
         * Use a designated initializer so that every member not named here
         * is zeroed; the managers then never see uninitialised stack bytes.
         */
        struct km_event c = {
                .data.hard = hard,
                .portid = portid,
                .event = XFRM_MSG_EXPIRE,
        };

        km_state_notify(x, &c);
}

EXPORT_SYMBOL(km_state_expired);
/*
 * Ask every registered key manager to acquire a state for @x / @t / @pol.
 * All managers are called even if some fail; the result is 0 as soon as at
 * least one of them succeeded, otherwise -EINVAL.
 */
int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
{
        struct xfrm_mgr *km;
        int err = -EINVAL;

        rcu_read_lock();
        list_for_each_entry_rcu(km, &xfrm_km_list, list) {
                if (km->acquire(x, t, pol) == 0)
                        err = 0;
        }
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL(km_query);

/*
 * Offer the new mapping (@ipaddr, @sport) for @x to the key managers in
 * registration order and stop at the first one that accepts it.
 *
 * Returns 0 on the first success, otherwise the error from the last manager
 * that was asked (-EINVAL if none implements new_mapping).
 */
static int __km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport)
{
        struct xfrm_mgr *km;
        int err = -EINVAL;

        rcu_read_lock();
        list_for_each_entry_rcu(km, &xfrm_km_list, list) {
                /* err is still non-zero here, so skipping cannot end the loop. */
                if (!km->new_mapping)
                        continue;

                err = km->new_mapping(x, ipaddr, sport);
                if (err == 0)
                        break;
        }
        rcu_read_unlock();

        return err;
}

/*
 * Report a new mapping for @x to the key managers.
 *
 * With x->mapping_maxage unset, every call is forwarded.  Otherwise a call
 * is forwarded only if more than mapping_maxage seconds have passed since
 * the recorded x->new_mapping time or if @sport differs from the recorded
 * port; suppressed calls return 0.
 */
int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport)
{
        if (!x->mapping_maxage)
                return __km_new_mapping(x, ipaddr, sport);

        /* Same port and still within the max age: nothing to report. */
        if ((jiffies / HZ - x->new_mapping) <= x->mapping_maxage &&
            x->new_mapping_sport == sport)
                return 0;

        x->new_mapping_sport = sport;
        x->new_mapping = jiffies / HZ;

        return __km_new_mapping(x, ipaddr, sport);
}
EXPORT_SYMBOL(km_new_mapping);

/*
 * km_policy_expired - broadcast an XFRM_MSG_POLEXPIRE event for policy @pol
 * @pol:    policy the event refers to
 * @dir:    direction passed through to km_policy_notify()
 * @hard:   stored in the event's data.hard field (presumably non-zero for a
 *          hard expiry - confirm against the key manager implementations)
 * @portid: stored in the event's portid field
 */
void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid)
{
        /*
         * Use a designated initializer so that every member not named here
         * is zeroed; the managers then never see uninitialised stack bytes.
         */
        struct km_event c = {
                .data.hard = hard,
                .portid = portid,
                .event = XFRM_MSG_POLEXPIRE,
        };

        km_policy_notify(pol, dir, &c);
}
EXPORT_SYMBOL(km_policy_expired);

#ifdef CONFIG_XFRM_MIGRATE
/*
 * Forward a migrate request to every key manager that implements migrate.
 * All such managers are called; the result is 0 if at least one of them
 * returned 0, otherwise -EINVAL.
 */
int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
               const struct xfrm_migrate *m, int num_migrate,
               const struct xfrm_kmaddress *k,
               const struct xfrm_encap_tmpl *encap)
{
        struct xfrm_mgr *km;
        int err = -EINVAL;

        rcu_read_lock();
        list_for_each_entry_rcu(km, &xfrm_km_list, list) {
                if (!km->migrate)
                        continue;
                if (km->migrate(sel, dir, type, m, num_migrate, k, encap) == 0)
                        err = 0;
        }
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL(km_migrate);
#endif

/*
 * Forward a report to every key manager that implements report.  All such
 * managers are called; the result is 0 if at least one of them returned 0,
 * otherwise -EINVAL.
 */
int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr)
{
        struct xfrm_mgr *km;
        int err = -EINVAL;

        rcu_read_lock();
        list_for_each_entry_rcu(km, &xfrm_km_list, list) {
                if (!km->report)
                        continue;
                if (km->report(net, proto, sel, addr) == 0)
                        err = 0;
        }
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL(km_report);

/*
 * Return true if any registered key manager that implements is_alive
 * reports true for event @c; the search stops at the first such manager.
 */
static bool km_is_alive(const struct km_event *c)
{
        struct xfrm_mgr *km;
        bool found = false;

        rcu_read_lock();
        list_for_each_entry_rcu(km, &xfrm_km_list, list) {
                if (!km->is_alive)
                        continue;
                if (km->is_alive(c)) {
                        found = true;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT)
static DEFINE_SPINLOCK(xfrm_translator_lock);
static struct xfrm_translator __rcu *xfrm_translator;

/*
 * Look up the registered translator and pin its owning module.
 *
 * Returns the translator with a module reference held, or NULL if none is
 * registered or the module reference could not be taken.  A successful call
 * is paired with xfrm_put_translator().
 */
struct xfrm_translator *xfrm_get_translator(void)
{
        struct xfrm_translator *xtr;

        rcu_read_lock();
        xtr = rcu_dereference(xfrm_translator);
        if (likely(xtr) && !try_module_get(xtr->owner))
                xtr = NULL;
        rcu_read_unlock();

        return xtr;
}
EXPORT_SYMBOL_GPL(xfrm_get_translator);

/*
 * Drop the module reference on @xtr->owner; counterpart of the
 * try_module_get() done in xfrm_get_translator().  @xtr must not be NULL.
 */
void xfrm_put_translator(struct xfrm_translator *xtr)
{
        module_put(xtr->owner);
}
EXPORT_SYMBOL_GPL(xfrm_put_translator);

/*
 * Install @xtr as the single global translator.
 *
 * Returns 0 on success or -EEXIST if a translator is already registered.
 */
int xfrm_register_translator(struct xfrm_translator *xtr)
{
        int err = -EEXIST;

        spin_lock_bh(&xfrm_translator_lock);
        if (likely(xfrm_translator == NULL)) {
                rcu_assign_pointer(xfrm_translator, xtr);
                err = 0;
        }
        spin_unlock_bh(&xfrm_translator_lock);

        return err;
}
EXPORT_SYMBOL_GPL(xfrm_register_translator);

/*
 * Remove @xtr as the global translator.
 *
 * Returns -EINVAL if a different translator is registered; returns 0 if
 * @xtr was removed or if nothing was registered.  synchronize_rcu() is
 * called in every case before returning.
 */
int xfrm_unregister_translator(struct xfrm_translator *xtr)
{
        int err = 0;

        spin_lock_bh(&xfrm_translator_lock);
        if (unlikely(xfrm_translator == NULL))
                goto unlock;

        if (rcu_access_pointer(xfrm_translator) == xtr)
                RCU_INIT_POINTER(xfrm_translator, NULL);
        else
                err = -EINVAL;
unlock:
        spin_unlock_bh(&xfrm_translator_lock);
        synchronize_rcu();

        return err;
}
EXPORT_SYMBOL_GPL(xfrm_unregister_translator);
#endif

/*
 * xfrm_user_policy - install or clear per-socket policies from a sockopt
 * @sk:      socket the policy applies to
 * @optname: option name, passed through to the key managers
 * @optval:  user buffer holding the policy description
 * @optlen:  length of @optval in bytes
 *
 * A NULL @optval with zero @optlen removes the socket's IN and OUT policies.
 * Otherwise the buffer is copied in, translated first when called from a
 * compat syscall, and offered to each key manager's compile_policy until
 * one accepts it.
 *
 * Return: 0 on success; -EMSGSIZE for a length outside 1..PAGE_SIZE;
 * -EOPNOTSUPP when a compat caller has no translator; the error from
 * memdup_sockptr() or the translator; otherwise the error left by the key
 * managers (-EINVAL if none is registered).
 */
int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen)
{
        int err;
        u8 *data;
        struct xfrm_mgr *km;
        struct xfrm_policy *pol = NULL;

        /* Empty request: drop both socket policies and the cached route. */
        if (sockptr_is_null(optval) && !optlen) {
                xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
                xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL);
                __sk_dst_reset(sk);
                return 0;
        }

        if (optlen <= 0 || optlen > PAGE_SIZE)
                return -EMSGSIZE;

        /* Kernel copy of the user buffer; freed on every exit path below. */
        data = memdup_sockptr(optval, optlen);
        if (IS_ERR(data))
                return PTR_ERR(data);

        if (in_compat_syscall()) {
                struct xfrm_translator *xtr = xfrm_get_translator();

                if (!xtr) {
                        kfree(data);
                        return -EOPNOTSUPP;
                }

                /* Takes &data, so the translator may replace the buffer. */
                err = xtr->xlate_user_policy_sockptr(&data, optlen);
                xfrm_put_translator(xtr);
                if (err) {
                        kfree(data);
                        return err;
                }
        }

        err = -EINVAL;
        rcu_read_lock();
        /* Stop at the first manager that reports err >= 0. */
        list_for_each_entry_rcu(km, &xfrm_km_list, list) {
                pol = km->compile_policy(sk, optname, data,
                                         optlen, &err);
                if (err >= 0)
                        break;
        }
        rcu_read_unlock();

        if (err >= 0) {
                /* A non-negative err is used as the direction argument here. */
                xfrm_sk_policy_insert(sk, err, pol);
                /* NOTE(review): presumably drops the reference returned by compile_policy - confirm. */
                xfrm_pol_put(pol);
                __sk_dst_reset(sk);
                err = 0;
        }

        kfree(data);
        return err;
}
EXPORT_SYMBOL(xfrm_user_policy);

static DEFINE_SPINLOCK(xfrm_km_lock);

/*
 * Append key manager @km to the global xfrm_km_list.  Writers are
 * serialised by xfrm_km_lock; the RCU list primitive is used because the
 * list is traversed under rcu_read_lock() elsewhere in this file.
 */
void xfrm_register_km(struct xfrm_mgr *km)
{
        spin_lock_bh(&xfrm_km_lock);
        list_add_tail_rcu(&km->list, &xfrm_km_list);
        spin_unlock_bh(&xfrm_km_lock);
}
EXPORT_SYMBOL(xfrm_register_km);

/*
 * Remove key manager @km from the global xfrm_km_list, then wait for an RCU
 * grace period so that readers which started before the removal have
 * finished by the time this function returns.
 */
void xfrm_unregister_km(struct xfrm_mgr *km)
{
        spin_lock_bh(&xfrm_km_lock);
        list_del_rcu(&km->list);
        spin_unlock_bh(&xfrm_km_lock);
        synchronize_rcu();
}
EXPORT_SYMBOL(xfrm_unregister_km);

/*
 * Register the per-family state operations @afinfo.
 *
 * Returns 0 on success, -EAFNOSUPPORT (with a warning) if the family is out
 * of range, or -EEXIST if the family's slot is already occupied.
 */
int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
{
        int err = -EEXIST;

        if (WARN_ON(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;

        spin_lock_bh(&xfrm_state_afinfo_lock);
        if (likely(xfrm_state_afinfo[afinfo->family] == NULL)) {
                rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo);
                err = 0;
        }
        spin_unlock_bh(&xfrm_state_afinfo_lock);

        return err;
}
EXPORT_SYMBOL(xfrm_state_register_afinfo);

/*
 * Unregister the per-family state operations @afinfo.
 *
 * Returns -EAFNOSUPPORT (with a warning) if the family is out of range,
 * -EINVAL if a different afinfo occupies the slot, and 0 if @afinfo was
 * removed or the slot was already empty.  Apart from the out-of-range case,
 * synchronize_rcu() is called before returning.
 */
int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
{
        int family = afinfo->family;
        int err = 0;

        if (WARN_ON(family >= NPROTO))
                return -EAFNOSUPPORT;

        spin_lock_bh(&xfrm_state_afinfo_lock);
        if (unlikely(xfrm_state_afinfo[afinfo->family] == NULL))
                goto unlock;

        if (rcu_access_pointer(xfrm_state_afinfo[family]) == afinfo)
                RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL);
        else
                err = -EINVAL;
unlock:
        spin_unlock_bh(&xfrm_state_afinfo_lock);
        synchronize_rcu();

        return err;
}
EXPORT_SYMBOL(xfrm_state_unregister_afinfo);

/*
 * Return the afinfo registered for @family, or NULL if @family is out of
 * range or nothing is registered.  Uses rcu_dereference() and takes no lock
 * of its own.
 */
struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family)
{
        if (likely(family < NPROTO))
                return rcu_dereference(xfrm_state_afinfo[family]);

        return NULL;
}
EXPORT_SYMBOL_GPL(xfrm_state_afinfo_get_rcu);

/*
 * Look up the afinfo for @family.
 *
 * On a non-NULL return the RCU read lock taken here is still held.  On a
 * NULL return (family out of range or nothing registered) no lock is held.
 */
struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
{
        struct xfrm_state_afinfo *afinfo;

        if (unlikely(family >= NPROTO))
                return NULL;

        rcu_read_lock();
        afinfo = rcu_dereference(xfrm_state_afinfo[family]);
        if (likely(afinfo))
                return afinfo;  /* read lock intentionally left held */

        rcu_read_unlock();
        return NULL;
}

/* Wait until the state garbage-collection work item has finished running. */
void xfrm_flush_gc(void)
{
        flush_work(&xfrm_state_gc_work);
}
EXPORT_SYMBOL(xfrm_flush_gc);

/*
 * Detach @x from its tunnel state, if it has one: decrement the tunnel's
 * user count, delete the tunnel state when the count drops to 1, release
 * the reference and clear x->tunnel.
 */
static void xfrm_state_delete_tunnel(struct xfrm_state *x)
{
        struct xfrm_state *t = x->tunnel;

        if (!t)
                return;

        if (atomic_dec_return(&t->tunnel_users) == 1)
                xfrm_state_delete(t);
        xfrm_state_put(t);
        x->tunnel = NULL;
}

/*
 * xfrm_state_mtu - payload size available through state @x for a given @mtu
 * @x:   transform state
 * @mtu: MTU of the outer path
 *
 * For states that are not VALID or not ESP, only the fixed header length is
 * subtracted.  For ESP the header, the AEAD authentication tag and the
 * cipher block alignment are taken into account.
 */
u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
{
        const struct xfrm_type *type = READ_ONCE(x->type);
        struct crypto_aead *aead;
        u32 blksize, net_adj = 0;

        if (x->km.state != XFRM_STATE_VALID ||
            !type || type->proto != IPPROTO_ESP)
                return mtu - x->props.header_len;

        /* For ESP, x->data is used as the AEAD transform. */
        aead = x->data;
        /* Cipher block size rounded up to a multiple of 4. */
        blksize = ALIGN(crypto_aead_blocksize(aead), 4);

        switch (x->props.mode) {
        case XFRM_MODE_TRANSPORT:
        case XFRM_MODE_BEET:
                /* net_adj bytes are excluded from the block rounding below. */
                if (x->props.family == AF_INET)
                        net_adj = sizeof(struct iphdr);
                else if (x->props.family == AF_INET6)
                        net_adj = sizeof(struct ipv6hdr);
                break;
        case XFRM_MODE_TUNNEL:
                break;
        default:
                /* Other modes may supply their own calculation. */
                if (x->mode_cbs && x->mode_cbs->get_inner_mtu)
                        return x->mode_cbs->get_inner_mtu(x, mtu);

                WARN_ON_ONCE(1);
                break;
        }

        /*
         * Round the space left after header and auth tag down to a whole
         * number of cipher blocks, add net_adj back and subtract 2.
         * NOTE(review): the 2 presumably covers the ESP pad-length and
         * next-header trailer bytes - confirm.
         */
        return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
                 net_adj) & ~(blksize - 1)) + net_adj - 2;
}
EXPORT_SYMBOL_GPL(xfrm_state_mtu);

/*
 * __xfrm_init_state - resolve modes and type for @x and run their init hooks
 * @x:      state to initialise
 * @extack: netlink extended ack used to report the failure reason
 *
 * Fills in x->inner_mode (and x->inner_mode_iaf where applicable),
 * x->type, x->outer_mode and x->mode_cbs, calls the type's init_state and,
 * if present, the mode callbacks' init_state.
 *
 * Return: 0 on success, -EPROTONOSUPPORT when a mode or type cannot be
 * found or a family change is requested for a non-tunnel mode, -EINVAL for
 * an unsupported NAT keepalive configuration, or the error returned by one
 * of the init_state hooks.
 */
int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        const struct xfrm_mode *inner_mode;
        const struct xfrm_mode *outer_mode;
        int family = x->props.family;
        int err;

        /* Apply the per-netns ip_no_pmtu_disc sysctl to IPv4 states. */
        if (family == AF_INET &&
            READ_ONCE(xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc))
                x->props.flags |= XFRM_STATE_NOPMTUDISC;

        /* Default error for the lookups below. */
        err = -EPROTONOSUPPORT;

        if (x->sel.family != AF_UNSPEC) {
                /* Selector names a family: the inner mode is looked up for it. */
                inner_mode = xfrm_get_mode(x->props.mode, x->sel.family);
                if (inner_mode == NULL) {
                        NL_SET_ERR_MSG(extack, "Requested mode not found");
                        goto error;
                }

                if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) &&
                    family != x->sel.family) {
                        NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate a change of family");
                        goto error;
                }

                x->inner_mode = *inner_mode;
        } else {
                /* No selector family: use the state's family and probe the other one too. */
                const struct xfrm_mode *inner_mode_iaf;
                int iafamily = AF_INET;

                inner_mode = xfrm_get_mode(x->props.mode, x->props.family);
                if (inner_mode == NULL) {
                        NL_SET_ERR_MSG(extack, "Requested mode not found");
                        goto error;
                }

                x->inner_mode = *inner_mode;

                /* iafamily is the "other" address family. */
                if (x->props.family == AF_INET)
                        iafamily = AF_INET6;

                /* Recorded only when the mode for the other family is a tunnel mode. */
                inner_mode_iaf = xfrm_get_mode(x->props.mode, iafamily);
                if (inner_mode_iaf) {
                        if (inner_mode_iaf->flags & XFRM_MODE_FLAG_TUNNEL)
                                x->inner_mode_iaf = *inner_mode_iaf;
                }
        }

        x->type = xfrm_get_type(x->id.proto, family);
        if (x->type == NULL) {
                NL_SET_ERR_MSG(extack, "Requested type not found");
                goto error;
        }

        /* From here on err is 0 unless a later step sets it explicitly. */
        err = x->type->init_state(x, extack);
        if (err)
                goto error;

        outer_mode = xfrm_get_mode(x->props.mode, family);
        if (!outer_mode) {
                NL_SET_ERR_MSG(extack, "Requested mode not found");
                err = -EPROTONOSUPPORT;
                goto error;
        }

        x->outer_mode = *outer_mode;
        if (x->nat_keepalive_interval) {
                if (x->dir != XFRM_SA_DIR_OUT) {
                        NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for outbound SAs");
                        err = -EINVAL;
                        goto error;
                }

                if (!x->encap || x->encap->encap_type != UDP_ENCAP_ESPINUDP) {
                        NL_SET_ERR_MSG(extack,
                                       "NAT keepalive is only supported for UDP encapsulation");
                        err = -EINVAL;
                        goto error;
                }
        }

        x->mode_cbs = xfrm_get_mode_cbs(x->props.mode);
        if (x->mode_cbs) {
                if (x->mode_cbs->init_state)
                        err = x->mode_cbs->init_state(x);
                /* NOTE(review): presumably balances a module reference taken by xfrm_get_mode_cbs() - confirm. */
                module_put(x->mode_cbs->owner);
        }
error:
        return err;
}

EXPORT_SYMBOL(__xfrm_init_state);

/*
 * Fully initialise @x: run __xfrm_init_state(), then xfrm_init_replay(),
 * and mark the state VALID only if both succeed.
 *
 * Returns 0 on success or the first error encountered.
 */
int xfrm_init_state(struct xfrm_state *x)
{
        int err = __xfrm_init_state(x, NULL);

        if (!err)
                err = xfrm_init_replay(x, NULL);
        if (!err)
                x->km.state = XFRM_STATE_VALID;

        return err;
}

EXPORT_SYMBOL(xfrm_init_state);

/*
 * xfrm_state_init - set up the per-netns state tables
 * @net: network namespace being initialised
 *
 * Allocates the bydst/bysrc/byspi/byseq hash tables with 8 buckets each
 * plus the per-CPU input cache, and initialises the list, work item, lock
 * and seqcount.  The global state slab cache is created only for init_net.
 *
 * Return: 0 on success, -ENOMEM if any allocation fails (earlier hash
 * tables are freed again).
 */
int __net_init xfrm_state_init(struct net *net)
{
        unsigned int sz;

        /* The slab cache is global; create it once, for the initial namespace. */
        if (net_eq(net, &init_net))
                xfrm_state_cache = KMEM_CACHE(xfrm_state,
                                              SLAB_HWCACHE_ALIGN | SLAB_PANIC);

        INIT_LIST_HEAD(&net->xfrm.state_all);

        /* Initial table size in bytes: 8 hash buckets. */
        sz = sizeof(struct hlist_head) * 8;

        net->xfrm.state_bydst = xfrm_hash_alloc(sz);
        if (!net->xfrm.state_bydst)
                goto out_bydst;
        net->xfrm.state_bysrc = xfrm_hash_alloc(sz);
        if (!net->xfrm.state_bysrc)
                goto out_bysrc;
        net->xfrm.state_byspi = xfrm_hash_alloc(sz);
        if (!net->xfrm.state_byspi)
                goto out_byspi;
        net->xfrm.state_byseq = xfrm_hash_alloc(sz);
        if (!net->xfrm.state_byseq)
                goto out_byseq;

        net->xfrm.state_cache_input = alloc_percpu(struct hlist_head);
        if (!net->xfrm.state_cache_input)
                goto out_state_cache_input;

        /* Bucket count minus one, used as the hash mask. */
        net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1);

        net->xfrm.state_num = 0;
        INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize);
        spin_lock_init(&net->xfrm.xfrm_state_lock);
        seqcount_spinlock_init(&net->xfrm.xfrm_state_hash_generation,
                               &net->xfrm.xfrm_state_lock);
        return 0;

        /* Unwind in reverse allocation order; each label frees what was allocated before it failed. */
out_state_cache_input:
        xfrm_hash_free(net->xfrm.state_byseq, sz);
out_byseq:
        xfrm_hash_free(net->xfrm.state_byspi, sz);
out_byspi:
        xfrm_hash_free(net->xfrm.state_bysrc, sz);
out_bysrc:
        xfrm_hash_free(net->xfrm.state_bydst, sz);
out_bydst:
        return -ENOMEM;
}

/*
 * xfrm_state_fini - tear down the per-netns state tables
 * @net: network namespace being destroyed
 *
 * Waits for pending hash-resize work, flushes all states, waits for the
 * garbage-collection work, warns if anything is still linked, and then
 * frees the four hash tables and the per-CPU input cache.
 */
void xfrm_state_fini(struct net *net)
{
        unsigned int sz;
        int i;

        /* Waits for any queued hash-resize work before the tables are used below. */
        flush_work(&net->xfrm.state_hash_work);
        xfrm_state_flush(net, 0, false);
        /* Wait for the GC work item to finish. */
        flush_work(&xfrm_state_gc_work);

        WARN_ON(!list_empty(&net->xfrm.state_all));

        /* Every bucket of every table is expected to be empty by now. */
        for (i = 0; i <= net->xfrm.state_hmask; i++) {
                WARN_ON(!hlist_empty(net->xfrm.state_byseq + i));
                WARN_ON(!hlist_empty(net->xfrm.state_byspi + i));
                WARN_ON(!hlist_empty(net->xfrm.state_bysrc + i));
                WARN_ON(!hlist_empty(net->xfrm.state_bydst + i));
        }

        /* Table size in bytes, derived from the current mask. */
        sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head);
        xfrm_hash_free(net->xfrm.state_byseq, sz);
        xfrm_hash_free(net->xfrm.state_byspi, sz);
        xfrm_hash_free(net->xfrm.state_bysrc, sz);
        xfrm_hash_free(net->xfrm.state_bydst, sz);
        free_percpu(net->xfrm.state_cache_input);
}

#ifdef CONFIG_AUDITSYSCALL
/*
 * Append the identifying fields of state @x to @audit_buf: the security
 * context (if any), the source/destination addresses for IPv4 or IPv6
 * states, and the SPI in decimal and hex.
 */
static void xfrm_audit_helper_sainfo(struct xfrm_state *x,
                                     struct audit_buffer *audit_buf)
{
        struct xfrm_sec_ctx *ctx = x->security;
        u32 spi;

        if (ctx)
                audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
                                 ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);

        if (x->props.family == AF_INET)
                audit_log_format(audit_buf, " src=%pI4 dst=%pI4",
                                 &x->props.saddr.a4, &x->id.daddr.a4);
        else if (x->props.family == AF_INET6)
                audit_log_format(audit_buf, " src=%pI6 dst=%pI6",
                                 x->props.saddr.a6, x->id.daddr.a6);

        /* The SPI is stored in network byte order. */
        spi = ntohl(x->id.spi);
        audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi);
}

/*
 * Append the addresses of packet @skb to @audit_buf, read from its IPv4 or
 * IPv6 header according to @family; IPv6 packets also get their flow label.
 * Other families add nothing.
 */
static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family,
                                      struct audit_buffer *audit_buf)
{
        if (family == AF_INET) {
                const struct iphdr *iph4 = ip_hdr(skb);

                audit_log_format(audit_buf, " src=%pI4 dst=%pI4",
                                 &iph4->saddr, &iph4->daddr);
        } else if (family == AF_INET6) {
                const struct ipv6hdr *iph6 = ipv6_hdr(skb);

                audit_log_format(audit_buf,
                                 " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x",
                                 &iph6->saddr, &iph6->daddr,
                                 iph6->flow_lbl[0] & 0x0f,
                                 iph6->flow_lbl[1],
                                 iph6->flow_lbl[2]);
        }
}

/*
 * Emit a "SAD-add" audit record for state @x with the outcome @result.
 * Nothing is logged when no audit buffer could be started.
 */
void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid)
{
        struct audit_buffer *audit_buf = xfrm_audit_start("SAD-add");

        if (!audit_buf)
                return;

        xfrm_audit_helper_usrinfo(task_valid, audit_buf);
        xfrm_audit_helper_sainfo(x, audit_buf);
        audit_log_format(audit_buf, " res=%u", result);
        audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_state_add);

/*
 * Emit a "SAD-delete" audit record for state @x with the outcome @result.
 * Nothing is logged when no audit buffer could be started.
 */
void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid)
{
        struct audit_buffer *audit_buf = xfrm_audit_start("SAD-delete");

        if (!audit_buf)
                return;

        xfrm_audit_helper_usrinfo(task_valid, audit_buf);
        xfrm_audit_helper_sainfo(x, audit_buf);
        audit_log_format(audit_buf, " res=%u", result);
        audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_state_delete);

/*
 * Emit an "SA-replay-overflow" audit record carrying the packet addresses
 * and the SPI of state @x.
 */
void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
                                      struct sk_buff *skb)
{
        struct audit_buffer *audit_buf = xfrm_audit_start("SA-replay-overflow");
        u32 spi;

        if (!audit_buf)
                return;

        xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf);

        /*
         * The sequence number is deliberately not recorded: it is inherent
         * in this kind of audit message.
         */
        spi = ntohl(x->id.spi);
        audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi);
        audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow);

/*
 * Emit an "SA-replayed-pkt" audit record carrying the packet addresses, the
 * SPI of state @x and the sequence number @net_seq (network byte order).
 */
void xfrm_audit_state_replay(struct xfrm_state *x,
                             struct sk_buff *skb, __be32 net_seq)
{
        struct audit_buffer *audit_buf = xfrm_audit_start("SA-replayed-pkt");
        u32 spi;

        if (!audit_buf)
                return;

        xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf);
        spi = ntohl(x->id.spi);
        audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u",
                         spi, spi, ntohl(net_seq));
        audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_state_replay);

/*
 * Emit an "SA-notfound" audit record that carries only the packet
 * addresses (no SPI or sequence number).
 */
void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family)
{
        struct audit_buffer *audit_buf = xfrm_audit_start("SA-notfound");

        if (!audit_buf)
                return;

        xfrm_audit_helper_pktinfo(skb, family, audit_buf);
        audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound_simple);

/*
 * Emit an "SA-notfound" audit record carrying the packet addresses plus
 * the SPI and sequence number, both given in network byte order.
 */
void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
                               __be32 net_spi, __be32 net_seq)
{
        struct audit_buffer *audit_buf = xfrm_audit_start("SA-notfound");
        u32 spi;

        if (!audit_buf)
                return;

        xfrm_audit_helper_pktinfo(skb, family, audit_buf);
        spi = ntohl(net_spi);
        audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u",
                         spi, spi, ntohl(net_seq));
        audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound);

/*
 * Emit an "SA-icv-failure" audit record carrying the packet addresses and,
 * when xfrm_parse_spi() succeeds on @skb, the SPI and sequence number.
 */
void xfrm_audit_state_icvfail(struct xfrm_state *x,
                              struct sk_buff *skb, u8 proto)
{
        struct audit_buffer *audit_buf = xfrm_audit_start("SA-icv-failure");
        __be32 net_spi;
        __be32 net_seq;

        if (!audit_buf)
                return;

        xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf);
        if (!xfrm_parse_spi(skb, proto, &net_spi, &net_seq)) {
                u32 spi = ntohl(net_spi);

                audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u",
                                 spi, spi, ntohl(net_seq));
        }
        audit_log_end(audit_buf);
}
EXPORT_SYMBOL_GPL(xfrm_audit_state_icvfail);
#endif /* CONFIG_AUDITSYSCALL */





















































































































































































































































    2 




    2 


    2 
    2 



    2 

    2 







    2 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
// SPDX-License-Identifier: GPL-2.0
/*
 * SHA-1 and HMAC-SHA1 library functions
 */

#include <crypto/hmac.h>
#include <crypto/sha1.h>
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>

/* Initial SHA-1 chaining state (H0..H4); copied into a context by sha1_init(). */
static const struct sha1_block_state sha1_iv = {
        .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
};

/*
 * If you have 32 registers or more, the compiler can (and should)
 * try to change the array[] accesses into registers. However, on
 * machines with less than ~25 registers, that won't really work,
 * and at least gcc will make an unholy mess of it.
 *
 * So to avoid that mess which just slows things down, we force
 * the stores to memory to actually happen (we might be better off
 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
 * suggested by Artur Skawina - that will also make gcc unable to
 * try to do the silly "optimize away loads" part because it won't
 * see what the value will be).
 *
 * Ben Herrenschmidt reports that on PPC, the C version comes close
 * to the optimized asm with this (ie on PPC you don't want that
 * 'volatile', since there are lots of registers).
 *
 * On ARM we get the best code generation by forcing a full memory barrier
 * between each SHA_ROUND; otherwise gcc happily goes wild with spilling, the
 * stack frame size simply explodes, and performance goes down the drain.
 */

#ifdef CONFIG_X86
  #define setW(x, val) (*(volatile __u32 *)&W(x) = (val))
#elif defined(CONFIG_ARM)
  #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0)
#else
  #define setW(x, val) (W(x) = (val))
#endif

/* This "rolls" over the 512-bit array */
#define W(x) (array[(x)&15])

/*
 * Where do we get the source from? The first 16 iterations get it from
 * the input data, the next mix it from the 512-bit array.
 */
#define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t)
#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)

#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
        __u32 TEMP = input(t); setW(t, TEMP); \
        E += TEMP + rol32(A,5) + (fn) + (constant); \
        B = ror32(B, 2); \
        TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0)

#define T_0_15(t, A, B, C, D, E)  SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) ,  0xca62c1d6, A, B, C, D, E )

/**
 * sha1_transform - single block SHA1 transform (deprecated)
 *
 * @digest: 160 bit digest to update
 * @data:   512 bits of data to hash
 * @array:  16 words of workspace (see note)
 *
 * This function executes SHA-1's internal compression function.  It updates the
 * 160-bit internal state (@digest) with a single 512-bit data block (@data).
 *
 * Don't use this function.  SHA-1 is no longer considered secure.  And even if
 * you do have to use SHA-1, this isn't the correct way to hash something with
 * SHA-1 as this doesn't handle padding and finalization.
 *
 * Note: If the hash is security sensitive, the caller should be sure
 * to clear the workspace. This is left to the caller to avoid
 * unnecessary clears between chained hashing operations.
 */
void sha1_transform(__u32 *digest, const char *data, __u32 *array)
{
        __u32 A, B, C, D, E;
        unsigned int i = 0;

        /* Load the current chaining state into the working variables. */
        A = digest[0];
        B = digest[1];
        C = digest[2];
        D = digest[3];
        E = digest[4];

        /* Round 1 - iterations 0-15 take their input from 'data' */
        for (; i < 16; ++i)
                T_0_15(i, A, B, C, D, E);

        /* Round 1 - tail. Input from 512-bit mixing array */
        for (; i < 20; ++i)
                T_16_19(i, A, B, C, D, E);

        /* Round 2 */
        for (; i < 40; ++i)
                T_20_39(i, A, B, C, D, E);

        /* Round 3 */
        for (; i < 60; ++i)
                T_40_59(i, A, B, C, D, E);

        /* Round 4 */
        for (; i < 80; ++i)
                T_60_79(i, A, B, C, D, E);

        /* Add the working variables back into the chaining state. */
        digest[0] += A;
        digest[1] += B;
        digest[2] += C;
        digest[3] += D;
        digest[4] += E;
}
EXPORT_SYMBOL(sha1_transform);

/**
 * sha1_init_raw - initialize the vectors for a SHA1 digest
 * @buf: vector to initialize; the first five words are overwritten
 */
void sha1_init_raw(__u32 *buf)
{
        /* Initial values of the five SHA-1 state words, in order. */
        static const __u32 initial_words[] = {
                0x67452301,
                0xefcdab89,
                0x98badcfe,
                0x10325476,
                0xc3d2e1f0,
        };
        unsigned int idx;

        for (idx = 0; idx < sizeof(initial_words) / sizeof(initial_words[0]); idx++)
                buf[idx] = initial_words[idx];
}
EXPORT_SYMBOL(sha1_init_raw);

/*
 * Generic block function: feed @nblocks consecutive SHA1_BLOCK_SIZE-byte
 * blocks starting at @data through sha1_transform(), updating @state in place.
 *
 * The first block is processed before the count is checked, so callers must
 * pass a nonzero @nblocks.  The on-stack workspace is wiped before returning.
 */
static void __maybe_unused sha1_blocks_generic(struct sha1_block_state *state,
                                               const u8 *data, size_t nblocks)
{
        u32 workspace[SHA1_WORKSPACE_WORDS];
        size_t remaining = nblocks;

        do {
                sha1_transform(state->h, data, workspace);
                data += SHA1_BLOCK_SIZE;
                remaining--;
        } while (remaining != 0);

        memzero_explicit(workspace, sizeof(workspace));
}

/*
 * Pick the block function used by the rest of this file.  With
 * CONFIG_CRYPTO_LIB_SHA1_ARCH the architecture's sha1.h is included
 * (presumably providing sha1_blocks() -- confirm in the arch header);
 * otherwise sha1_blocks is an alias for the generic C implementation.
 */
#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH
#include "sha1.h" /* $(SRCARCH)/sha1.h */
#else
#define sha1_blocks sha1_blocks_generic
#endif

/*
 * Start a new SHA-1 computation in @ctx: load the initial state from sha1_iv
 * and reset the count of bytes hashed so far.  ctx->buf is not touched here.
 */
void sha1_init(struct sha1_ctx *ctx)
{
        ctx->state = sha1_iv;
        ctx->bytecount = 0;
}
EXPORT_SYMBOL_GPL(sha1_init);

/*
 * Absorb @len bytes from @data into the running hash in @ctx.
 *
 * Bytes that do not complete a block are kept in ctx->buf; the number of
 * buffered bytes is always ctx->bytecount modulo SHA1_BLOCK_SIZE.  Whole
 * blocks are hashed directly from @data without being copied.
 */
void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len)
{
        size_t buffered = ctx->bytecount % SHA1_BLOCK_SIZE;
        size_t whole_blocks;

        ctx->bytecount += len;

        /* Not enough for a full block yet: just append to the buffer. */
        if (buffered + len < SHA1_BLOCK_SIZE) {
                if (len)
                        memcpy(&ctx->buf[buffered], data, len);
                return;
        }

        /* Top up a partially filled buffer and hash it as one block. */
        if (buffered) {
                size_t fill = SHA1_BLOCK_SIZE - buffered;

                memcpy(&ctx->buf[buffered], data, fill);
                data += fill;
                len -= fill;

                sha1_blocks(&ctx->state, ctx->buf, 1);
        }

        /* Hash as many complete blocks as remain, straight from the input. */
        whole_blocks = len / SHA1_BLOCK_SIZE;
        len %= SHA1_BLOCK_SIZE;
        if (whole_blocks) {
                sha1_blocks(&ctx->state, data, whole_blocks);
                data += whole_blocks * SHA1_BLOCK_SIZE;
        }

        /* Stash the tail at the start of the (now empty) buffer. */
        if (len)
                memcpy(&ctx->buf[0], data, len);
}
EXPORT_SYMBOL_GPL(sha1_update);

/*
 * Apply the SHA-1 padding to the data buffered in @ctx, hash the final
 * block(s) and write the digest to @out as big-endian 32-bit words.
 *
 * Padding is a 0x80 byte, zeroes, and the message length in bits as a
 * big-endian 64-bit value in the last 8 bytes of the final block.  If the
 * length field no longer fits after the 0x80 byte, an extra block is hashed.
 *
 * @out is only written after the last block has been hashed.  Unlike
 * sha1_final(), this does not wipe @ctx.
 */
static void __sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
{
        const size_t len_field = SHA1_BLOCK_SIZE - 8;
        u64 total_bits = ctx->bytecount << 3;
        size_t pos = ctx->bytecount % SHA1_BLOCK_SIZE;

        ctx->buf[pos] = 0x80;
        pos++;

        if (pos > len_field) {
                /* No room left for the length: flush a zero-padded block. */
                memset(&ctx->buf[pos], 0, SHA1_BLOCK_SIZE - pos);
                sha1_blocks(&ctx->state, ctx->buf, 1);
                pos = 0;
        }

        memset(&ctx->buf[pos], 0, len_field - pos);
        *(__be64 *)&ctx->buf[len_field] = cpu_to_be64(total_bits);
        sha1_blocks(&ctx->state, ctx->buf, 1);

        for (size_t off = 0; off < SHA1_DIGEST_SIZE; off += 4)
                put_unaligned_be32(ctx->state.h[off / 4], &out[off]);
}

/*
 * Finish the SHA-1 computation in @ctx and store the digest in @out.
 * The whole context is wiped afterwards, so it must be re-initialized
 * before it can be used again.
 */
void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
{
        __sha1_final(ctx, out);
        memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(sha1_final);

/*
 * One-shot SHA-1: hash @len bytes at @data and store the digest in @out,
 * using a temporary context on the stack (wiped by sha1_final()).
 */
void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE])
{
        struct sha1_ctx ctx;

        sha1_init(&ctx);
        sha1_update(&ctx, data, len);
        sha1_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(sha1);

/*
 * Derive the two HMAC-SHA1 starting states from a raw key.
 *
 * The raw key is first turned into a SHA1_BLOCK_SIZE-byte block: keys longer
 * than a block are replaced by their SHA-1 digest, shorter keys are copied,
 * and the remainder of the block stays zero.  @istate receives the SHA-1
 * state after hashing that block XORed with the inner pad byte, @ostate the
 * state after hashing it XORed with the outer pad byte.
 *
 * The padded key is XORed one unsigned long at a time and is wiped from the
 * stack before returning.
 */
static void __hmac_sha1_preparekey(struct sha1_block_state *istate,
                                   struct sha1_block_state *ostate,
                                   const u8 *raw_key, size_t raw_key_len)
{
        union {
                u8 b[SHA1_BLOCK_SIZE];
                unsigned long w[SHA1_BLOCK_SIZE / sizeof(unsigned long)];
        } padded_key = { 0 };
        size_t word;

        if (unlikely(raw_key_len > SHA1_BLOCK_SIZE))
                sha1(raw_key, raw_key_len, padded_key.b);
        else
                memcpy(padded_key.b, raw_key, raw_key_len);

        /* Inner state: hash (key ^ ipad) starting from the SHA-1 IV. */
        for (word = 0; word < ARRAY_SIZE(padded_key.w); word++)
                padded_key.w[word] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
        *istate = sha1_iv;
        sha1_blocks(istate, padded_key.b, 1);

        /*
         * Outer state: XORing with (opad ^ ipad) cancels the ipad applied
         * above and leaves (key ^ opad) in the block.
         */
        for (word = 0; word < ARRAY_SIZE(padded_key.w); word++)
                padded_key.w[word] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
                                                  HMAC_IPAD_VALUE);
        *ostate = sha1_iv;
        sha1_blocks(ostate, padded_key.b, 1);

        memzero_explicit(&padded_key, sizeof(padded_key));
}

/*
 * Precompute the inner and outer HMAC-SHA1 states for @raw_key and store
 * them in @key, for later use with hmac_sha1_init() or hmac_sha1().
 */
void hmac_sha1_preparekey(struct hmac_sha1_key *key,
                          const u8 *raw_key, size_t raw_key_len)
{
        __hmac_sha1_preparekey(&key->istate, &key->ostate,
                               raw_key, raw_key_len);
}
EXPORT_SYMBOL_GPL(hmac_sha1_preparekey);

/*
 * Start an HMAC-SHA1 computation in @ctx from a prepared @key.
 *
 * The inner hash resumes from key->istate, which already accounts for one
 * hashed block (the padded key), hence bytecount starts at SHA1_BLOCK_SIZE
 * rather than 0.  The outer state is copied for use at finalization.
 */
void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key)
{
        ctx->sha_ctx.state = key->istate;
        ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
        ctx->ostate = key->ostate;
}
EXPORT_SYMBOL_GPL(hmac_sha1_init);

/*
 * Start an HMAC-SHA1 computation in @ctx directly from a raw key, without an
 * intermediate struct hmac_sha1_key.  The inner and outer states are derived
 * straight into @ctx; bytecount starts at SHA1_BLOCK_SIZE because the padded
 * key block has already been hashed into the inner state.
 */
void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx,
                                const u8 *raw_key, size_t raw_key_len)
{
        __hmac_sha1_preparekey(&ctx->sha_ctx.state, &ctx->ostate,
                               raw_key, raw_key_len);
        ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(hmac_sha1_init_usingrawkey);

/*
 * Finish an HMAC-SHA1 computation and store the MAC in @out.
 *
 * The inner digest is written to the start of the context's own block buffer,
 * which is then padded by hand into the single block the outer hash needs:
 * digest, 0x80, zeroes, and a 32-bit big-endian bit length covering the key
 * block plus the digest.  @ctx is wiped before returning.
 */
void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
{
        u8 *block = ctx->sha_ctx.buf;

        /* Inner hash result lands in block[0..SHA1_DIGEST_SIZE). */
        __sha1_final(&ctx->sha_ctx, block);

        /* Pad: marker byte, then zeroes up to the end of the block. */
        block[SHA1_DIGEST_SIZE] = 0x80;
        memset(&block[SHA1_DIGEST_SIZE + 1], 0,
               SHA1_BLOCK_SIZE - SHA1_DIGEST_SIZE - 1);
        *(__be32 *)&block[SHA1_BLOCK_SIZE - 4] =
                cpu_to_be32(8 * (SHA1_BLOCK_SIZE + SHA1_DIGEST_SIZE));

        /* Outer hash over the padded block yields the HMAC value. */
        sha1_blocks(&ctx->ostate, block, 1);
        for (size_t off = 0; off < SHA1_DIGEST_SIZE; off += 4)
                put_unaligned_be32(ctx->ostate.h[off / 4], &out[off]);

        memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(hmac_sha1_final);

/*
 * One-shot HMAC-SHA1 with a prepared key: compute the MAC of @data_len bytes
 * at @data under @key and store it in @out.  Uses a temporary context on the
 * stack, which hmac_sha1_final() wipes.
 */
void hmac_sha1(const struct hmac_sha1_key *key,
               const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE])
{
        struct hmac_sha1_ctx ctx;

        hmac_sha1_init(&ctx, key);
        hmac_sha1_update(&ctx, data, data_len);
        hmac_sha1_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(hmac_sha1);

/*
 * One-shot HMAC-SHA1 with a raw key: derive the HMAC states from @raw_key,
 * compute the MAC of @data_len bytes at @data and store it in @out.  Uses a
 * temporary context on the stack, which hmac_sha1_final() wipes.
 */
void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
                           const u8 *data, size_t data_len,
                           u8 out[SHA1_DIGEST_SIZE])
{
        struct hmac_sha1_ctx ctx;

        hmac_sha1_init_usingrawkey(&ctx, raw_key, raw_key_len);
        hmac_sha1_update(&ctx, data, data_len);
        hmac_sha1_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey);

/*
 * Init/exit hooks are only built when sha1_mod_init_arch is defined as a
 * macro (presumably by the arch sha1.h included above -- confirm there).
 */
#ifdef sha1_mod_init_arch
/* Run the architecture-specific setup once at subsys_initcall time. */
static int __init sha1_mod_init(void)
{
        sha1_mod_init_arch();
        return 0;
}
subsys_initcall(sha1_mod_init);

/* Nothing to undo; the empty handler is registered via module_exit(). */
static void __exit sha1_mod_exit(void)
{
}
module_exit(sha1_mod_exit);
#endif

MODULE_DESCRIPTION("SHA-1 and HMAC-SHA1 library functions");
MODULE_LICENSE("GPL");








































    1 













































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H

#include <linux/gfp_types.h>

#include <linux/mmzone.h>
#include <linux/topology.h>
#include <linux/alloc_tag.h>
#include <linux/cleanup.h>
#include <linux/sched.h>

struct vm_area_struct;
struct mempolicy;

/*
 * Convert GFP flags to their corresponding migrate type.
 *
 * The two mobility bits are extracted with GFP_MOVABLE_MASK and shifted down
 * by GFP_MOVABLE_SHIFT; both helper macros are private to this function and
 * are undefined again right after it.
 */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3

static inline int gfp_migratetype(const gfp_t gfp_flags)
{
        /* Both mobility bits set at once is not a valid request. */
        VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
        /*
         * Build-time checks that the shifted flag bits line up with the
         * MIGRATE_* values, which is what makes the plain shift below valid.
         */
        BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
        BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
        BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE);
        BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >>
                      GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC);

        /* With mobility grouping disabled everything is MIGRATE_UNMOVABLE. */
        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;

        /* Group based on mobility */
        return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}
#undef GFP_MOVABLE_MASK
#undef GFP_MOVABLE_SHIFT

static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
        return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}

static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
{
        /*
         * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed.
         * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
         * All GFP_* flags including GFP_NOWAIT use one or both flags.
         * alloc_pages_nolock() is the only API that doesn't specify either flag.
         *
         * This is stronger than GFP_NOWAIT or GFP_ATOMIC because
         * those are guaranteed to never block on a sleeping lock.
         * Here we are enforcing that the allocation doesn't ever spin
         * on any locks (i.e. only trylocks). There is no high level
         * GFP_$FOO flag for this use in alloc_pages_nolock() as the
         * regular page allocator doesn't fully support this
         * allocation mode.
         */
        return !!(gfp_flags & __GFP_RECLAIM);
}

/*
 * Optional zones: each OPT_ZONE_* names the real zone when the matching
 * CONFIG_* option is enabled and falls back to ZONE_NORMAL otherwise.
 */
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA
#define OPT_ZONE_DMA ZONE_DMA
#else
#define OPT_ZONE_DMA ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA32
#define OPT_ZONE_DMA32 ZONE_DMA32
#else
#define OPT_ZONE_DMA32 ZONE_NORMAL
#endif

/*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
 * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
 * bits long and there are 16 of them to cover all possible combinations of
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
 * But GFP_MOVABLE is not only a zone specifier but also an allocation
 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
 * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
 *
 *       bit       result
 *       =================
 *       0x0    => NORMAL
 *       0x1    => DMA or NORMAL
 *       0x2    => HIGHMEM or NORMAL
 *       0x3    => BAD (DMA+HIGHMEM)
 *       0x4    => DMA32 or NORMAL
 *       0x5    => BAD (DMA+DMA32)
 *       0x6    => BAD (HIGHMEM+DMA32)
 *       0x7    => BAD (HIGHMEM+DMA32+DMA)
 *       0x8    => NORMAL (MOVABLE+0)
 *       0x9    => DMA or NORMAL (MOVABLE+DMA)
 *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
 *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
 *       0xc    => DMA32 or NORMAL (MOVABLE+DMA32)
 *       0xd    => BAD (MOVABLE+DMA32+DMA)
 *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
 *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
 *
 * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
 */

/*
 * GFP_ZONES_SHIFT is the width in bits of one GFP_ZONE_TABLE entry.  It is
 * normally ZONES_SHIFT, but is pinned to 2 when ZONE_DEVICE is configured and
 * the remaining zones still fit in 2 bits.
 */
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif

/* All 16 table entries must fit in a single long. */
#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif

/*
 * Packed lookup table: the zone for flag combination 'c' is stored in the
 * GFP_ZONES_SHIFT bits starting at bit (c * GFP_ZONES_SHIFT).  Only the eight
 * permitted combinations get an entry here.
 */
#define GFP_ZONE_TABLE ( \
        (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)                                       \
        | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)                       \
        | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)               \
        | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)                       \
        | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)                       \
        | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)    \
        | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
        | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)

/*
 * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32,
 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
 * entry starting with bit 0. Bit is set if the combination is not
 * allowed.
 */
#define GFP_ZONE_BAD ( \
        1 << (___GFP_DMA | ___GFP_HIGHMEM)                                      \
        | 1 << (___GFP_DMA | ___GFP_DMA32)                                      \
        | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM)                                      \
        | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)  \
)

/*
 * Map the zone-selecting bits of @flags to a zone by looking up the matching
 * entry in GFP_ZONE_TABLE.  Combinations marked in GFP_ZONE_BAD trip
 * VM_BUG_ON().
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
        const int combo = (__force int) (flags & GFP_ZONEMASK);
        const int entry_mask = (1 << GFP_ZONES_SHIFT) - 1;
        enum zone_type zone;

        zone = (GFP_ZONE_TABLE >> (combo * GFP_ZONES_SHIFT)) & entry_mask;
        VM_BUG_ON((GFP_ZONE_BAD >> combo) & 1);

        return zone;
}

/*
 * There is only one page-allocator function, and two main namespaces to
 * it. The alloc_page*() variants return 'struct page *' and as such
 * can allocate highmem pages, the *get*page*() variants return
 * virtual kernel addresses to the allocated page(s).
 */

/*
 * Pick the zonelist index for @flags: ZONELIST_NOFALLBACK when CONFIG_NUMA is
 * enabled and __GFP_THISNODE is set, ZONELIST_FALLBACK in every other case.
 */
static inline int gfp_zonelist(gfp_t flags)
{
#ifdef CONFIG_NUMA
        if (unlikely(flags & __GFP_THISNODE))
                return ZONELIST_NOFALLBACK;
#endif
        return ZONELIST_FALLBACK;
}

/*
 * gfp flag masking for nested internal allocations.
 *
 * For code that needs to do allocations inside the public allocation API (e.g.
 * memory allocation tracking code) the allocations need to obey the caller
 * allocation context constraints to prevent allocation context mismatches
 * (e.g. GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock
 * situations.
 *
 * It is also assumed that these nested allocations are for internal kernel
 * object storage purposes only and are not going to be used for DMA, etc. Hence
 * we strip out all the zone information and leave just the context information
 * intact.
 *
 * Further, internal allocations must fail before the higher level allocation
 * can fail, so we must make them fail faster and fail silently. We also don't
 * want them to deplete emergency reserves.  Hence nested allocations must be
 * prepared for these allocations to fail.
 */
static inline gfp_t gfp_nested_mask(gfp_t flags)
{
        /* Context bits inherited from the caller; everything else is dropped. */
        const gfp_t inherited = flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP);
        /* Always added: fail fast, stay out of the reserves, stay quiet. */
        const gfp_t forced = __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN;

        return inherited | forced;
}

/*
 * Return the zonelist of node @nid that matches @flags.
 *
 * We get the zone list from the given node and the gfp mask.
 * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
 * There are two zonelists per node, one for all zones with memory and
 * one containing just zones from the node the zonelist belongs to.
 *
 * For the case of non-NUMA systems the NODE_DATA() gets optimized to
 * &contig_page_data at compile-time.
 */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        const int which = gfp_zonelist(flags);

        return &NODE_DATA(nid)->node_zonelists[which];
}

/*
 * Empty default hooks, used unless the architecture announces its own
 * implementation by defining HAVE_ARCH_FREE_PAGE / HAVE_ARCH_ALLOC_PAGE.
 */
#ifndef HAVE_ARCH_FREE_PAGE
static inline void arch_free_page(struct page *page, int order) { }
#endif
#ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order) { }
#endif

/*
 * Allocator entry points.  Each *_noprof() function is paired with a macro
 * that passes the same call through alloc_hooks().
 */
struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
#define __alloc_pages(...)                        alloc_hooks(__alloc_pages_noprof(__VA_ARGS__))

struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
#define __folio_alloc(...)                        alloc_hooks(__folio_alloc_noprof(__VA_ARGS__))

/* Note: nr_pages is an int here but unsigned long in the variants below. */
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
                                nodemask_t *nodemask, int nr_pages,
                                struct page **page_array);
#define __alloc_pages_bulk(...)                        alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))

unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
                                unsigned long nr_pages,
                                struct page **page_array);
#define  alloc_pages_bulk_mempolicy(...)                                \
        alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__))

/* Bulk allocate order-0 pages */
#define alloc_pages_bulk(_gfp, _nr_pages, _page_array)                \
        __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array)

/*
 * Bulk allocation preferring node @nid.  NUMA_NO_NODE is replaced by
 * numa_mem_id(); no nodemask is applied.
 */
static inline unsigned long
alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
                                   struct page **page_array)
{
        const int preferred = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return alloc_pages_bulk_noprof(gfp, preferred, NULL, nr_pages, page_array);
}

#define alloc_pages_bulk_node(...)                                \
        alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__))

/*
 * Print a warning and a stack dump when @gfp_mask has both __GFP_THISNODE and
 * __GFP_NOWARN set and @this_node is not online.  Does nothing otherwise.
 */
static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
{
        const gfp_t required = __GFP_THISNODE|__GFP_NOWARN;

        /* node_online() is only consulted when both flags are present. */
        if ((gfp_mask & required) == required && !node_online(this_node)) {
                pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node);
                dump_stack();
        }
}

/*
 * Allocate pages, preferring the node given as nid. The node must be valid and
 * online. For more general interface, see alloc_pages_node().
 *
 * An out-of-range @nid trips VM_BUG_ON(); NUMA_NO_NODE is not translated here.
 * No nodemask is passed to the underlying allocator.
 */
static inline struct page *
__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order)
{
        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
        warn_if_node_offline(nid, gfp_mask);

        return __alloc_pages_noprof(gfp_mask, order, nid, NULL);
}

#define  __alloc_pages_node(...)                alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__))

/*
 * Folio counterpart of __alloc_pages_node_noprof(): @nid must be in range
 * (checked with VM_BUG_ON()), and no nodemask is passed on.
 */
static inline
struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
{
        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
        warn_if_node_offline(nid, gfp);

        return __folio_alloc_noprof(gfp, order, nid, NULL);
}

#define  __folio_alloc_node(...)                alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__))

/*
 * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
 * prefer the current CPU's closest node (numa_mem_id()). Otherwise node must
 * be valid and online.
 */
static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
                                                   unsigned int order)
{
        const int preferred = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return __alloc_pages_node_noprof(preferred, gfp_mask, order);
}

#define  alloc_pages_node(...)                        alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__))

/*
 * Node-agnostic entry points.  With CONFIG_NUMA they are implemented out of
 * line; without it they are inline wrappers that allocate on numa_node_id()
 * and ignore the mempolicy / vma arguments.
 */
#ifdef CONFIG_NUMA
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *mpol, pgoff_t ilx, int nid);
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr);
#else
static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
{
        return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order);
}
static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
        return __folio_alloc_node_noprof(gfp, order, numa_node_id());
}
/* @mpol, @ilx and @nid are unused in the non-NUMA build. */
static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *mpol, pgoff_t ilx, int nid)
{
        return folio_alloc_noprof(gfp, order);
}
/* @vma and @addr are dropped in the non-NUMA build. */
#define vma_alloc_folio_noprof(gfp, order, vma, addr)                \
        folio_alloc_noprof(gfp, order)
#endif

/* Hooked front ends for the functions above. */
#define alloc_pages(...)                        alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
#define folio_alloc(...)                        alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
#define folio_alloc_mpol(...)                        alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__))
#define vma_alloc_folio(...)                        alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))

/* Single-page (order 0) shorthand. */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

/*
 * Allocate a single (order 0) page for @vma at @addr by allocating an order-0
 * folio and returning the address of its 'page' member.
 *
 * NOTE(review): the result is not checked for NULL before taking
 * &folio->page; this presumably relies on 'page' being the first member of
 * struct folio so that a failed allocation still yields NULL -- confirm.
 */
static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr);

        return &folio->page;
}
#define alloc_page_vma(...)                        alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))

/*
 * Further allocation entry points; as above, each *_noprof() function has a
 * macro front end that goes through alloc_hooks().
 */
struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
#define alloc_pages_nolock(...)                        alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__))

/* These two return an unsigned long rather than a struct page pointer. */
extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...)                        alloc_hooks(get_free_pages_noprof(__VA_ARGS__))

extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask);
#define get_zeroed_page(...)                        alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__))

/* Sized in bytes (see __alloc_size); released with free_pages_exact(). */
void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1);
#define alloc_pages_exact(...)                        alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__))

void free_pages_exact(void *virt, size_t size);

__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
#define alloc_pages_exact_nid(...)                                        \
        alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__))

/* Order-0 shorthand for __get_free_pages(). */
#define __get_free_page(gfp_mask)                                        \
        __get_free_pages((gfp_mask), 0)

/* __get_free_pages() with GFP_DMA added to the mask. */
#define __get_dma_pages(gfp_mask, order)                                \
        __get_free_pages((gfp_mask) | GFP_DMA, (order))

/* Freeing: by struct page (__free_pages*) or by address (free_pages). */
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages_nolock(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);

/* Order-0 shorthands for the two free functions. */
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)

/* Page allocator setup and per-cpu page (pcp) maintenance hooks. */
void page_alloc_init_cpuhp(void);
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);

void page_alloc_init_late(void);
void setup_pcp_cacheinfo(unsigned int cpu);

/*
 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
 * GFP flags are used before interrupts are enabled. Once interrupts are
 * enabled, it is set to __GFP_BITS_MASK while the system is running. During
 * hibernation, it is used by PM to avoid I/O during memory allocation while
 * devices are suspended.
 */
extern gfp_t gfp_allowed_mask;

/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);

/* True only when @gfp has both __GFP_IO and __GFP_FS set. */
static inline bool gfp_has_io_fs(gfp_t gfp)
{
        const gfp_t io_and_fs = __GFP_IO | __GFP_FS;

        return (gfp & io_and_fs) == io_and_fs;
}

/*
 * Check if the gfp flags allow compaction - GFP_NOIO is a really
 * tricky context because the migration might require IO.
 *
 * Always false when CONFIG_COMPACTION is disabled; otherwise true exactly
 * when __GFP_IO is set in @gfp_mask.
 */
static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
{
        if (!IS_ENABLED(CONFIG_COMPACTION))
                return false;

        return !!(gfp_mask & __GFP_IO);
}

extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);

/*
 * Contiguous range allocation; the allocation side is only declared when
 * CONFIG_CONTIG_ALLOC is enabled, free_contig_range() is declared always.
 */
#ifdef CONFIG_CONTIG_ALLOC

/* Flags for alloc_contig_range(); a __bitwise type of its own. */
typedef unsigned int __bitwise acr_flags_t;
#define ACR_FLAGS_NONE ((__force acr_flags_t)0) // ordinary allocation request
#define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA

/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
                                     acr_flags_t alloc_flags, gfp_t gfp_mask);
#define alloc_contig_range(...)                        alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))

extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
                                              int nid, nodemask_t *nodemask);
#define alloc_contig_pages(...)                        alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))

#endif
void free_contig_range(unsigned long pfn, unsigned long nr_pages);

/*
 * Allocate a folio of 2^@order physically contiguous pages.
 *
 * With CONFIG_CONTIG_ALLOC the pages come from alloc_contig_pages_noprof().
 * The request is refused with a warning (returning NULL) when @order is 0 or
 * @gfp lacks __GFP_COMP.  Without CONFIG_CONTIG_ALLOC the function is a stub
 * that always returns NULL.
 */
#ifdef CONFIG_CONTIG_ALLOC
static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
                                                        int nid, nodemask_t *node)
{
        struct page *first_page;

        if (WARN_ON(!order || !(gfp & __GFP_COMP)))
                return NULL;

        first_page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
        if (!first_page)
                return NULL;

        return page_folio(first_page);
}
#else
static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
                                                        int nid, nodemask_t *node)
{
        return NULL;
}
#endif
/* This should be paired with folio_put() rather than free_contig_range(). */
#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))

/*
 * Cleanup helper named "free_page" for void * variables: the release
 * expression calls free_page() on the pointer cast to unsigned long.
 * NOTE(review): no NULL check in the expression -- confirm free_page(0) is
 * acceptable for variables that end up unset.
 */
DEFINE_FREE(free_page, void *, free_page((unsigned long)_T))

#endif /* __LINUX_GFP_H */
















































































































































































































































































































































































































































































































































































































   73 

















   61 








  227 



























   72 








































































































































  309 













































  309 







































































































































































  310 

















  227 


  226 
    1 

  226 


















   11 









































  305 























    1 
    1 













    2 









  235 
















































   50 




















    4 

































   11 

   11 





















   70 

















  247 








   73 








  246 









   67 
  313 













  283 




























































































































































































































































































































































































































   23 













    1 































































































































   22 













   22 
















































































































   25 
































































































   25 














































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NETLINK_H
#define __NET_NETLINK_H

#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/jiffies.h>
#include <linux/in6.h>

/* ========================================================================
 *         Netlink Messages and Attributes Interface (As Seen On TV)
 * ------------------------------------------------------------------------
 *                          Messages Interface
 * ------------------------------------------------------------------------
 *
 * Message Format:
 *    <--- nlmsg_total_size(payload)  --->
 *    <-- nlmsg_msg_size(payload) ->
 *   +----------+- - -+-------------+- - -+-------- - -
 *   | nlmsghdr | Pad |   Payload   | Pad | nlmsghdr
 *   +----------+- - -+-------------+- - -+-------- - -
 *   nlmsg_data(nlh)---^                   ^
 *   nlmsg_next(nlh)-----------------------+
 *
 * Payload Format:
 *    <---------------------- nlmsg_len(nlh) --------------------->
 *    <------ hdrlen ------>       <- nlmsg_attrlen(nlh, hdrlen) ->
 *   +----------------------+- - -+--------------------------------+
 *   |     Family Header    | Pad |           Attributes           |
 *   +----------------------+- - -+--------------------------------+
 *   nlmsg_attrdata(nlh, hdrlen)---^
 *
 * Data Structures:
 *   struct nlmsghdr                        netlink message header
 *
 * Message Construction:
 *   nlmsg_new()                        create a new netlink message
 *   nlmsg_put()                        add a netlink message to an skb
 *   nlmsg_put_answer()                        callback based nlmsg_put()
 *   nlmsg_end()                        finalize netlink message
 *   nlmsg_get_pos()                        return current position in message
 *   nlmsg_trim()                        trim part of message
 *   nlmsg_cancel()                        cancel message construction
 *   nlmsg_consume()                        free a netlink message (expected)
 *   nlmsg_free()                        free a netlink message (drop)
 *
 * Message Sending:
 *   nlmsg_multicast()                        multicast message to several groups
 *   nlmsg_unicast()                        unicast a message to a single socket
 *   nlmsg_notify()                        send notification message
 *
 * Message Length Calculations:
 *   nlmsg_msg_size(payload)                length of message w/o padding
 *   nlmsg_total_size(payload)                length of message w/ padding
 *   nlmsg_padlen(payload)                length of padding at tail
 *
 * Message Payload Access:
 *   nlmsg_data(nlh)                        head of message payload
 *   nlmsg_len(nlh)                        length of message payload
 *   nlmsg_attrdata(nlh, hdrlen)        head of attributes data
 *   nlmsg_attrlen(nlh, hdrlen)                length of attributes data
 *
 * Message Parsing:
 *   nlmsg_ok(nlh, remaining)                does nlh fit into remaining bytes?
 *   nlmsg_next(nlh, remaining)                get next netlink message
 *   nlmsg_parse()                        parse attributes of a message
 *   nlmsg_find_attr()                        find an attribute in a message
 *   nlmsg_for_each_msg()                loop over all messages
 *   nlmsg_validate()                        validate netlink message incl. attrs
 *   nlmsg_for_each_attr()                loop over all attributes
 *   nlmsg_for_each_attr_type()                loop over all attributes with the
 *                                        given type
 *
 * Misc:
 *   nlmsg_report()                        report back to application?
 *
 * ------------------------------------------------------------------------
 *                          Attributes Interface
 * ------------------------------------------------------------------------
 *
 * Attribute Format:
 *    <------- nla_total_size(payload) ------->
 *    <---- nla_attr_size(payload) ----->
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *   |  Header  | Pad |     Payload      | Pad |  Header
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *                     <- nla_len(nla) ->      ^
 *   nla_data(nla)----^                        |
 *   nla_next(nla)-----------------------------'
 *
 * Data Structures:
 *   struct nlattr                        netlink attribute header
 *
 * Attribute Construction:
 *   nla_reserve(skb, type, len)        reserve room for an attribute
 *   nla_reserve_nohdr(skb, len)        reserve room for an attribute w/o hdr
 *   nla_put(skb, type, len, data)        add attribute to skb
 *   nla_put_nohdr(skb, len, data)        add attribute w/o hdr
 *   nla_append(skb, len, data)                append data to skb
 *
 * Attribute Construction for Basic Types:
 *   nla_put_u8(skb, type, value)        add u8 attribute to skb
 *   nla_put_u16(skb, type, value)        add u16 attribute to skb
 *   nla_put_u32(skb, type, value)        add u32 attribute to skb
 *   nla_put_u64_64bit(skb, type,
 *                     value, padattr)        add u64 attribute to skb
 *   nla_put_s8(skb, type, value)        add s8 attribute to skb
 *   nla_put_s16(skb, type, value)        add s16 attribute to skb
 *   nla_put_s32(skb, type, value)        add s32 attribute to skb
 *   nla_put_s64(skb, type, value,
 *               padattr)                add s64 attribute to skb
 *   nla_put_string(skb, type, str)        add string attribute to skb
 *   nla_put_flag(skb, type)                add flag attribute to skb
 *   nla_put_msecs(skb, type, jiffies,
 *                 padattr)                add msecs attribute to skb
 *   nla_put_in_addr(skb, type, addr)        add IPv4 address attribute to skb
 *   nla_put_in6_addr(skb, type, addr)        add IPv6 address attribute to skb
 *
 * Nested Attributes Construction:
 *   nla_nest_start(skb, type)                start a nested attribute
 *   nla_nest_end(skb, nla)                finalize a nested attribute
 *   nla_nest_cancel(skb, nla)                cancel nested attribute construction
 *   nla_put_empty_nest(skb, type)        create an empty nest
 *
 * Attribute Length Calculations:
 *   nla_attr_size(payload)                length of attribute w/o padding
 *   nla_total_size(payload)                length of attribute w/ padding
 *   nla_padlen(payload)                length of padding
 *
 * Attribute Payload Access:
 *   nla_data(nla)                        head of attribute payload
 *   nla_len(nla)                        length of attribute payload
 *
 * Attribute Payload Access for Basic Types:
 *   nla_get_uint(nla)                        get payload for a uint attribute
 *   nla_get_sint(nla)                        get payload for a sint attribute
 *   nla_get_u8(nla)                        get payload for a u8 attribute
 *   nla_get_u16(nla)                        get payload for a u16 attribute
 *   nla_get_u32(nla)                        get payload for a u32 attribute
 *   nla_get_u64(nla)                        get payload for a u64 attribute
 *   nla_get_s8(nla)                        get payload for a s8 attribute
 *   nla_get_s16(nla)                        get payload for a s16 attribute
 *   nla_get_s32(nla)                        get payload for a s32 attribute
 *   nla_get_s64(nla)                        get payload for a s64 attribute
 *   nla_get_flag(nla)                        return 1 if flag is true
 *   nla_get_msecs(nla)                        get payload for a msecs attribute
 *
 *   The same functions also exist with _default().
 *
 * Attribute Misc:
 *   nla_memcpy(dest, nla, count)        copy attribute into memory
 *   nla_memcmp(nla, data, size)        compare attribute with memory area
 *   nla_strscpy(dst, nla, size)        copy attribute to a sized string
 *   nla_strcmp(nla, str)                compare attribute with string
 *
 * Attribute Parsing:
 *   nla_ok(nla, remaining)                does nla fit into remaining bytes?
 *   nla_next(nla, remaining)                get next netlink attribute
 *   nla_validate()                        validate a stream of attributes
 *   nla_validate_nested()                validate a stream of nested attributes
 *   nla_find()                                find attribute in stream of attributes
 *   nla_find_nested()                        find attribute in nested attributes
 *   nla_parse()                        parse and validate stream of attrs
 *   nla_parse_nested()                        parse nested attributes
 *   nla_for_each_attr()                loop over all attributes
 *   nla_for_each_attr_type()                loop over all attributes with the
 *                                        given type
 *   nla_for_each_nested()                loop over the nested attributes
 *   nla_for_each_nested_type()                loop over the nested attributes with
 *                                        the given type
 *=========================================================================
 */

 /*
  * Standard attribute types to specify validation policy.
  *
  * These values are stored in the `type' member of struct nla_policy.
  * Enumerators are numbered implicitly from 0 in declaration order;
  * __NLA_TYPE_MAX has to stay last because NLA_TYPE_MAX is derived from it.
  */
enum {
        NLA_UNSPEC,             /* no specific type given */
        NLA_U8,                 /* u8 attribute */
        NLA_U16,                /* u16 attribute */
        NLA_U32,                /* u32 attribute */
        NLA_U64,                /* u64 attribute */
        NLA_STRING,             /* string; policy `len' is the maximum length */
        NLA_FLAG,               /* flag attribute; policy `len' is unused */
        NLA_MSECS,              /* msecs attribute */
        NLA_NESTED,             /* nested attributes directly inside */
        NLA_NESTED_ARRAY,       /* nested attributes one further level down */
        NLA_NUL_STRING,         /* string; policy `len' excludes the NUL */
        NLA_BINARY,             /* binary payload; policy `len' is the maximum */
        NLA_S8,                 /* s8 attribute */
        NLA_S16,                /* s16 attribute */
        NLA_S32,                /* s32 attribute */
        NLA_S64,                /* s64 attribute */
        NLA_BITFIELD32,         /* 32-bit bitmap/bitselector attribute */
        NLA_REJECT,             /* attribute is always rejected */
        NLA_BE16,               /* presumably big-endian 16-bit - TODO confirm */
        NLA_BE32,               /* presumably big-endian 32-bit - TODO confirm */
        NLA_SINT,               /* sint attribute, see nla_get_sint() */
        NLA_UINT,               /* uint attribute, see nla_get_uint() */
        __NLA_TYPE_MAX,         /* number of types above; not a type itself */
};

/* Highest valid attribute type value (the last enumerator before the sentinel). */
#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)

/**
 * struct netlink_range_validation - unsigned min/max limits for a policy
 * @min: minimum value
 * @max: maximum value
 *
 * A policy entry points to one of these through its `range' member when
 * its validation_type is NLA_VALIDATE_RANGE_PTR, see NLA_POLICY_FULL_RANGE().
 * Unlike the s16 min/max stored inline in struct nla_policy, the limits
 * here are full 64-bit values.
 */
struct netlink_range_validation {
        u64 min, max;
};

/**
 * struct netlink_range_validation_signed - signed min/max limits for a policy
 * @min: minimum value
 * @max: maximum value
 *
 * Signed counterpart of struct netlink_range_validation; a policy entry
 * points to it through its `range_signed' member, see
 * NLA_POLICY_FULL_RANGE_SIGNED().
 */
struct netlink_range_validation_signed {
        s64 min, max;
};

/*
 * Kind of validation performed in addition to the type-specific one.
 * Stored in the `validation_type' member of struct nla_policy and
 * normally selected through the NLA_POLICY_*() macros rather than by hand.
 */
enum nla_policy_validation {
        /* Value 0, i.e. what an entry gets when validation_type is not set. */
        NLA_VALIDATE_NONE,
        /* Both `min' and `max' are used; set by NLA_POLICY_RANGE(). */
        NLA_VALIDATE_RANGE,
        /*
         * Set by NLA_POLICY_EXACT_LEN_WARN() together with min == max.
         * NOTE(review): presumably a too long payload only warns instead of
         * being rejected - confirm against the validation code.
         */
        NLA_VALIDATE_RANGE_WARN_TOO_LONG,
        /* Only `min' is used; set by NLA_POLICY_MIN(). */
        NLA_VALIDATE_MIN,
        /* Only `max' is used; set by NLA_POLICY_MAX(). */
        NLA_VALIDATE_MAX,
        /* `mask' is used; set by NLA_POLICY_MASK(). */
        NLA_VALIDATE_MASK,
        /*
         * `range' or `range_signed' points to the limits; set by
         * NLA_POLICY_FULL_RANGE() and NLA_POLICY_FULL_RANGE_SIGNED().
         */
        NLA_VALIDATE_RANGE_PTR,
        /* `validate' is called; set by NLA_POLICY_VALIDATE_FN(). */
        NLA_VALIDATE_FUNCTION,
};

/**
 * struct nla_policy - attribute validation policy
 * @type: Type of attribute or NLA_UNSPEC
 * @validation_type: type of attribute validation done in addition to
 *        type-specific validation (e.g. range, function call), see
 *        &enum nla_policy_validation
 * @len: Type specific length of payload
 *
 * Policies are defined as arrays of this struct, the array must be
 * accessible by attribute type up to the highest identifier to be expected.
 *
 * Meaning of `len' field:
 *    NLA_STRING           Maximum length of string
 *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
 *    NLA_FLAG             Unused
 *    NLA_BINARY           Maximum length of attribute payload
 *                         (but see also below with the validation type)
 *    NLA_NESTED,
 *    NLA_NESTED_ARRAY     Length verification is done by checking len of
 *                         nested header (or empty); len field is used if
 *                         nested_policy is also used, for the max attr
 *                         number in the nested policy.
 *    NLA_SINT, NLA_UINT,
 *    NLA_U8, NLA_U16,
 *    NLA_U32, NLA_U64,
 *    NLA_S8, NLA_S16,
 *    NLA_S32, NLA_S64,
 *    NLA_BE16, NLA_BE32,
 *    NLA_MSECS            Leaving the length field zero will verify the
 *                         given type fits, using it verifies minimum length
 *                         just like "All other"
 *    NLA_BITFIELD32       Unused
 *    NLA_REJECT           Unused
 *    All other            Minimum length of attribute payload
 *
 * Meaning of validation union:
 *    NLA_BITFIELD32       This is a 32-bit bitmap/bitselector attribute and
 *                         `bitfield32_valid' is the u32 value of valid flags
 *    NLA_REJECT           This attribute is always rejected and `reject_message'
 *                         may point to a string to report as the error instead
 *                         of the generic one in extended ACK.
 *    NLA_NESTED           `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED() macro.
 *                         Note that nla_parse() will validate, but of course not
 *                         parse, the nested sub-policies.
 *    NLA_NESTED_ARRAY     `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED_ARRAY() macro.
 *                         The difference to NLA_NESTED is the structure:
 *                         NLA_NESTED has the nested attributes directly inside
 *                         while an array has the nested attributes at another
 *                         level down and the attribute types directly in the
 *                         nesting don't matter.
 *    NLA_UINT,
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64,
 *    NLA_BE16,
 *    NLA_BE32,
 *    NLA_SINT,
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              The `min' and `max' fields are used depending on the
 *                         validation_type field, if that is min/max/range then
 *                         the min, max or both are used (respectively) to check
 *                         the value of the integer attribute.
 *                         Note that in the interest of code simplicity and
 *                         struct size both limits are s16, so you cannot
 *                         enforce a range that doesn't fall within the range
 *                         of s16 - do that using the NLA_POLICY_FULL_RANGE()
 *                         or NLA_POLICY_FULL_RANGE_SIGNED() macros instead.
 *                         Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and
 *                         NLA_POLICY_RANGE() macros.
 *    NLA_UINT,
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range' must be a pointer
 *                         to a struct netlink_range_validation that indicates
 *                         the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE().
 *    NLA_SINT,
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range_signed' must be a
 *                         pointer to a struct netlink_range_validation_signed
 *                         that indicates the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE_SIGNED().
 *
 *    NLA_BINARY           If the validation type is like the ones for integers
 *                         above, then the min/max length (not value like for
 *                         integers) of the attribute is enforced.
 *
 *    All other            Unused - but note that it's a union
 *
 * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
 *    NLA_U8, NLA_U16,
 *    NLA_U32, NLA_U64,
 *    NLA_S8, NLA_S16,
 *    NLA_S32, NLA_S64,
 *    NLA_MSECS,
 *    NLA_BINARY           Validation function called for the attribute.
 *
 *    All other            Unused - but note that it's a union
 *
 * Example:
 *
 * static const u32 myvalidflags = 0xff231023;
 *
 * static const struct nla_policy my_policy[ATTR_MAX+1] = {
 *        [ATTR_FOO] = { .type = NLA_U16 },
 *        [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
 *        [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)),
 *        [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags),
 * };
 */
struct nla_policy {
        u8                type;
        u8                validation_type;
        u16                len;
        /*
         * Exactly one member of this union is meaningful for a given entry;
         * which one depends on `type' and `validation_type'.
         */
        union {
                /**
                 * @strict_start_type: first attribute to validate strictly
                 *
                 * This entry is special, and used for the attribute at index 0
                 * only, and specifies special data about the policy, namely it
                 * specifies the "boundary type" where strict length validation
                 * starts for any attribute types >= this value, also, strict
                 * nesting validation starts here.
                 *
                 * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT
                 * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to
                 * get the previous pure { .len = xyz } behaviour. The advantage
                 * of this is that types not specified in the policy will be
                 * rejected.
                 *
                 * For completely new families it should be set to 1 so that the
                 * validation is enforced for all attributes. For existing ones
                 * it should be set at least when new attributes are added to
                 * the enum used by the policy, and be set to the new value that
                 * was added to enforce strict validation from thereon.
                 */
                u16 strict_start_type;

                /* private: use NLA_POLICY_*() to set */
                /* NLA_BITFIELD32: u32 value of valid flags */
                const u32 bitfield32_valid;
                /* NLA_VALIDATE_MASK: set by NLA_POLICY_MASK() */
                const u32 mask;
                /* NLA_REJECT: optional string reported in the extended ACK */
                const char *reject_message;
                /* NLA_NESTED, NLA_NESTED_ARRAY: policy for the nested attributes */
                const struct nla_policy *nested_policy;
                /* NLA_VALIDATE_RANGE_PTR on unsigned types: min/max limits */
                const struct netlink_range_validation *range;
                /* NLA_VALIDATE_RANGE_PTR on signed types: min/max limits */
                const struct netlink_range_validation_signed *range_signed;
                /* NLA_VALIDATE_MIN/MAX/RANGE: inline limits, s16 only */
                struct {
                        s16 min, max;
                };
                /* NLA_VALIDATE_FUNCTION: validation function for the attribute */
                int (*validate)(const struct nlattr *attr,
                                struct netlink_ext_ack *extack);
        };
};

/*
 * Policies for an attribute whose payload is exactly ETH_ALEN bytes long.
 * The _COMPAT variant expands to NLA_POLICY_EXACT_LEN_WARN() instead of
 * NLA_POLICY_EXACT_LEN().
 * NOTE(review): ETH_ALEN is not defined by the includes visible at the top
 * of this header, users presumably need to provide it - confirm.
 */
#define NLA_POLICY_ETH_ADDR                NLA_POLICY_EXACT_LEN(ETH_ALEN)
#define NLA_POLICY_ETH_ADDR_COMPAT        NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)

/*
 * Initializers for NLA_NESTED / NLA_NESTED_ARRAY policy entries.
 *
 * The _NLA_POLICY_*() forms take the max attribute number explicitly. The
 * NLA_POLICY_*() forms derive it as ARRAY_SIZE(policy) - 1, so `policy'
 * has to be an actual array there, not a pointer.
 */
#define _NLA_POLICY_NESTED(maxattr, policy) \
        { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr }
#define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
        { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr }
#define NLA_POLICY_NESTED(policy) \
        _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy)
#define NLA_POLICY_NESTED_ARRAY(policy) \
        _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy)
/* Initializer for an NLA_BITFIELD32 entry; `valid' is the u32 of valid flags. */
#define NLA_POLICY_BITFIELD32(valid) \
        { .type = NLA_BITFIELD32, .bitfield32_valid = valid }

/*
 * Compile-time type checks used by the NLA_POLICY_*() initializers.
 *
 * __NLA_IS_UINT_TYPE(tp) / __NLA_IS_SINT_TYPE(tp) are non-zero when tp is
 * one of the unsigned (including the BE16/BE32) respectively signed
 * integer attribute types.
 *
 * Every use of the argument is parenthesized so that an argument containing
 * a low-precedence operator (e.g. "cond ? NLA_U8 : NLA_U16") is compared
 * and added as a whole. The argument is still expanded several times and
 * must therefore be a constant expression without side effects.
 */
#define __NLA_IS_UINT_TYPE(tp)                                        \
        ((tp) == NLA_U8 || (tp) == NLA_U16 || (tp) == NLA_U32 ||        \
         (tp) == NLA_U64 || (tp) == NLA_UINT ||                        \
         (tp) == NLA_BE16 || (tp) == NLA_BE32)
#define __NLA_IS_SINT_TYPE(tp)                                                \
        ((tp) == NLA_S8 || (tp) == NLA_S16 || (tp) == NLA_S32 ||        \
         (tp) == NLA_S64 || (tp) == NLA_SINT)

/*
 * The NLA_ENSURE_*() macros evaluate to tp itself and break the build when
 * tp is not of the stated kind. This relies on BUILD_BUG_ON_ZERO() (defined
 * outside this header) evaluating to 0 when the check passes, so that
 * "0 + (tp)" is what remains.
 */
#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
/* tp must be an unsigned integer type. */
#define NLA_ENSURE_UINT_TYPE(tp)                        \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + (tp))
/* tp must be an unsigned integer type, NLA_MSECS or NLA_BINARY. */
#define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp)                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||        \
                      (tp) == NLA_MSECS ||                \
                      (tp) == NLA_BINARY) + (tp))
/* tp must be a signed integer type. */
#define NLA_ENSURE_SINT_TYPE(tp)                        \
        (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + (tp))
/* tp must be any integer type, NLA_MSECS or NLA_BINARY. */
#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp)                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||                \
                      __NLA_IS_SINT_TYPE(tp) ||                \
                      (tp) == NLA_MSECS ||                \
                      (tp) == NLA_BINARY) + (tp))
/*
 * tp must not be a type whose policy entry already uses the validation
 * union for its own data (bitfield32_valid, reject_message, nested_policy).
 */
#define NLA_ENSURE_NO_VALIDATION_PTR(tp)                \
        (__NLA_ENSURE((tp) != NLA_BITFIELD32 &&                \
                      (tp) != NLA_REJECT &&                \
                      (tp) != NLA_NESTED &&                \
                      (tp) != NLA_NESTED_ARRAY) + (tp))

#define NLA_POLICY_RANGE(tp, _min, _max) {                \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE,                \
        .min = _min,                                        \
        .max = _max                                        \
}

#define NLA_POLICY_FULL_RANGE(tp, _range) {                \
        .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range = _range,                                \
}

#define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) {        \
        .type = NLA_ENSURE_SINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range_signed = _range,                                \
}

#define NLA_POLICY_MIN(tp, _min) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MIN,                \
        .min = _min,                                        \
}

#define NLA_POLICY_MAX(tp, _max) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MAX,                \
        .max = _max,                                        \
}

#define NLA_POLICY_MASK(tp, _mask) {                        \
        .type = NLA_ENSURE_UINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_MASK,                \
        .mask = _mask,                                        \
}

/*
 * Policy initializer: .type is @tp passed through
 * NLA_ENSURE_NO_VALIDATION_PTR(), .validation_type is NLA_VALIDATE_FUNCTION
 * and @fn is stored in .validate.
 * The optional trailing argument supplies .len; when it is omitted the
 * initializer expression is just "+ 0", i.e. .len is 0.
 */
#define NLA_POLICY_VALIDATE_FN(tp, fn, ...) {                \
        .type = NLA_ENSURE_NO_VALIDATION_PTR(tp),        \
        .validation_type = NLA_VALIDATE_FUNCTION,        \
        .validate = fn,                                        \
        .len = __VA_ARGS__ + 0,                                \
}

/* NLA_BINARY policy whose range has @_len as both minimum and maximum. */
#define NLA_POLICY_EXACT_LEN(_len)        NLA_POLICY_RANGE(NLA_BINARY, _len, _len)
/*
 * Same bounds as NLA_POLICY_EXACT_LEN(), but the validation type is
 * NLA_VALIDATE_RANGE_WARN_TOO_LONG instead of NLA_VALIDATE_RANGE.
 */
#define NLA_POLICY_EXACT_LEN_WARN(_len) {                        \
        .type = NLA_BINARY,                                        \
        .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG,        \
        .min = _len,                                                \
        .max = _len                                                \
}
/* NLA_BINARY policies built from NLA_POLICY_MIN() / NLA_POLICY_MAX(). */
#define NLA_POLICY_MIN_LEN(_len)        NLA_POLICY_MIN(NLA_BINARY, _len)
#define NLA_POLICY_MAX_LEN(_len)        NLA_POLICY_MAX(NLA_BINARY, _len)

/**
 * struct nl_info - netlink source information
 * @nlh: header of the netlink message that carried the original request
 * @nl_net: network namespace
 * @portid: netlink PORTID of the application that issued the request
 * @skip_notify: when set, skip netlink notifications to user space
 * @skip_notify_kernel: when set, skip selected in-kernel notifications
 */
struct nl_info {
        struct nlmsghdr                *nlh;
        struct net                *nl_net;
        u32                        portid;
        u8                        skip_notify:1;
        u8                        skip_notify_kernel:1;
};

/**
 * enum netlink_validation - netlink message/attribute validation levels
 * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about
 *        extra data at the end of the message, attributes being longer than
 *        they should be, or unknown attributes being present.
 * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing.
 * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING
 *        this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict().
 * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy.
 *        This can safely be set by the kernel when the given policy has no
 *        NLA_UNSPEC anymore, and can thus be used to ensure policy entries
 *        are enforced going forward.
 * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
 *        U8, U16, U32 must have exact size, etc.)
 * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY)
 *        and unset for other policies.
 *
 * Apart from NL_VALIDATE_LIBERAL, which is 0, every level is a distinct bit,
 * so levels can be combined with bitwise OR.
 */
enum netlink_validation {
        NL_VALIDATE_LIBERAL = 0,
        NL_VALIDATE_TRAILING = BIT(0),
        NL_VALIDATE_MAXTYPE = BIT(1),
        NL_VALIDATE_UNSPEC = BIT(2),
        NL_VALIDATE_STRICT_ATTRS = BIT(3),
        NL_VALIDATE_NESTED = BIT(4),
};

/*
 * Trailing-data and max-type rejection only; per the enum documentation this
 * pair matches the old nla_parse_strict()/nlmsg_parse_strict() behaviour.
 */
#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
                                       NL_VALIDATE_MAXTYPE)
/* Every non-zero validation level of enum netlink_validation combined. */
#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
                            NL_VALIDATE_MAXTYPE |\
                            NL_VALIDATE_UNSPEC |\
                            NL_VALIDATE_STRICT_ATTRS |\
                            NL_VALIDATE_NESTED)

/*
 * Out-of-line netlink helpers. Only the prototypes are visible in this
 * header; see the implementations for the exact semantics.
 */
int netlink_rcv_skb(struct sk_buff *skb,
                    int (*cb)(struct sk_buff *, struct nlmsghdr *,
                              struct netlink_ext_ack *));
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags);

/*
 * Core attribute validation/parsing entry points. @validate receives a mask
 * of enum netlink_validation levels; the inline nla_*()/nlmsg_*() wrappers
 * further down in this header call these with a fixed mask.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack);
int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
                int len, const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack);
/* Attribute lookup, comparison and copy helpers. */
int nla_policy_len(const struct nla_policy *, int);
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize);
char *nla_strdup(const struct nlattr *nla, gfp_t flags);
int nla_memcpy(void *dest, const struct nlattr *src, int count);
int nla_memcmp(const struct nlattr *nla, const void *data, size_t size);
int nla_strcmp(const struct nlattr *nla, const char *str);
/*
 * Helpers that reserve room for, or write, attribute data in an skb.
 * NOTE(review): presumably the "__" variants skip the tailroom check that the
 * unprefixed variants perform - confirm against the implementation.
 */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr);
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                 int attrlen, int padattr);
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
               const void *data);
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr);
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr);
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_append(struct sk_buff *skb, int attrlen, const void *data);

/**************************************************************************
 * Netlink Messages
 **************************************************************************/

/**
 * nlmsg_msg_size - size of a netlink message without tail padding
 * @payload: length of message payload
 *
 * Returns: NLMSG_HDRLEN plus @payload.
 */
static inline int nlmsg_msg_size(int payload)
{
        const int unpadded = NLMSG_HDRLEN + payload;

        return unpadded;
}

/**
 * nlmsg_total_size - size of a netlink message including tail padding
 * @payload: length of message payload
 *
 * Returns: nlmsg_msg_size() of @payload passed through NLMSG_ALIGN().
 */
static inline int nlmsg_total_size(int payload)
{
        const int unpadded = nlmsg_msg_size(payload);

        return NLMSG_ALIGN(unpadded);
}

/**
 * nlmsg_padlen - number of padding bytes at the tail of a message
 * @payload: length of message payload
 *
 * Returns: the difference between nlmsg_total_size() and nlmsg_msg_size()
 * for @payload.
 */
static inline int nlmsg_padlen(int payload)
{
        const int padded = nlmsg_total_size(payload);
        const int unpadded = nlmsg_msg_size(payload);

        return padded - unpadded;
}

/**
 * nlmsg_data - start of the message payload
 * @nlh: netlink message header
 *
 * Returns: the address NLMSG_HDRLEN bytes past @nlh.
 */
static inline void *nlmsg_data(const struct nlmsghdr *nlh)
{
        unsigned char *start = (unsigned char *) nlh;

        return start + NLMSG_HDRLEN;
}

/**
 * nlmsg_len - length of the message payload
 * @nlh: netlink message header
 *
 * Returns: the header's nlmsg_len field minus NLMSG_HDRLEN.
 */
static inline int nlmsg_len(const struct nlmsghdr *nlh)
{
        const int payload_len = nlh->nlmsg_len - NLMSG_HDRLEN;

        return payload_len;
}

/**
 * nlmsg_payload - message payload, provided the message is long enough
 * @nlh: netlink message header
 * @len: length of the structure expected in the payload
 *
 * Returns: the payload (see nlmsg_data()) when the header's nlmsg_len is at
 * least nlmsg_msg_size(@len), otherwise NULL.
 */
static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len)
{
        if (nlh->nlmsg_len >= nlmsg_msg_size(len))
                return nlmsg_data(nlh);

        return NULL;
}

/**
 * nlmsg_attrdata - start of the attribute area of a message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 *
 * Returns: the address NLMSG_ALIGN(@hdrlen) bytes past nlmsg_data().
 */
static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh,
                                            int hdrlen)
{
        return (struct nlattr *) ((unsigned char *) nlmsg_data(nlh) +
                                  NLMSG_ALIGN(hdrlen));
}

/**
 * nlmsg_attrlen - length of the attribute area of a message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 *
 * Returns: nlmsg_len() of @nlh minus NLMSG_ALIGN(@hdrlen).
 */
static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen)
{
        const int payload_len = nlmsg_len(nlh);

        return payload_len - NLMSG_ALIGN(hdrlen);
}

/**
 * nlmsg_ok - check whether a netlink message fits into the remaining bytes
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Returns: nonzero when @remaining can hold a struct nlmsghdr and the
 * header's nlmsg_len is at least the header size and at most @remaining,
 * 0 otherwise.
 */
static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining)
{
        /* Not even room for a header: do not look at @nlh at all. */
        if (remaining < (int) sizeof(struct nlmsghdr))
                return 0;

        if (nlh->nlmsg_len < sizeof(struct nlmsghdr))
                return 0;

        return nlh->nlmsg_len <= remaining;
}

/**
 * nlmsg_next - advance to the next netlink message in a message stream
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Returns: the message that follows @nlh in the stream; @remaining is
 * decremented by NLMSG_ALIGN() of the current message's nlmsg_len.
 */
static inline struct nlmsghdr *
nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
{
        int step = NLMSG_ALIGN(nlh->nlmsg_len);
        unsigned char *cur = (unsigned char *) nlh;

        *remaining = *remaining - step;

        return (struct nlmsghdr *) (cur + step);
}

/**
 * nla_parse - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Parsing uses NL_VALIDATE_STRICT: attributes with a type above
 * @maxtype are rejected, a policy must be specified and attributes are
 * validated in the strictest way possible.
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_parse(struct nlattr **tb, int maxtype,
                            const struct nlattr *head, int len,
                            const struct nla_policy *policy,
                            struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * nla_parse_deprecated - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Parsing uses NL_VALIDATE_LIBERAL: attributes with a type above
 * @maxtype are ignored and attributes from the policy are not always
 * strictly validated (only for new attributes).
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype,
                                       const struct nlattr *head, int len,
                                       const struct nla_policy *policy,
                                       struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Parsing uses NL_VALIDATE_DEPRECATED_STRICT: attributes with a type
 * above @maxtype and trailing data are rejected, but the policy is not
 * completely strictly validated (only for new attributes).
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype,
                                              const struct nlattr *head,
                                              int len,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_DEPRECATED_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * __nlmsg_parse - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Fails with -EINVAL, and an extack message, when the header's nlmsg_len is
 * smaller than nlmsg_msg_size(@hdrlen). Otherwise the attribute area of the
 * message is handed to __nla_parse(). See nla_parse().
 */
static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                                struct nlattr *tb[], int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack)
{
        const struct nlattr *attrs;
        int attrs_len;

        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
                NL_SET_ERR_MSG(extack, "Invalid header length");
                return -EINVAL;
        }

        attrs = nlmsg_attrdata(nlh, hdrlen);
        attrs_len = nlmsg_attrlen(nlh, hdrlen);

        return __nla_parse(tb, maxtype, attrs, attrs_len, policy, validate,
                           extack);
}

/**
 * nlmsg_parse - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_STRICT. See nla_parse()
 */
static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_parse_deprecated - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_LIBERAL.
 * See nla_parse_deprecated()
 */
static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen,
                                         struct nlattr *tb[], int maxtype,
                                         const struct nla_policy *policy,
                                         struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_parse_deprecated_strict - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_DEPRECATED_STRICT.
 * See nla_parse_deprecated_strict()
 */
static inline int
nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_DEPRECATED_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_find_attr - look up a specific attribute in a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @attrtype: type of attribute to look for
 *
 * Searches the attribute area of the message with nla_find().
 *
 * Returns: the first attribute which matches the specified type.
 */
static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh,
                                             int hdrlen, int attrtype)
{
        const struct nlattr *attrs = nlmsg_attrdata(nlh, hdrlen);
        const int attrs_len = nlmsg_attrlen(nlh, hdrlen);

        return nla_find(attrs, attrs_len, attrtype);
}

/**
 * nla_validate_deprecated - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the given attribute stream against @policy,
 * using liberal validation (NL_VALIDATE_LIBERAL).
 * See documentation of struct nla_policy for more details.
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_validate_deprecated(const struct nlattr *head, int len,
                                          int maxtype,
                                          const struct nla_policy *policy,
                                          struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nla_validate - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the given attribute stream against @policy,
 * using strict validation (NL_VALIDATE_STRICT).
 * See documentation of struct nla_policy for more details.
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_validate(const struct nlattr *head, int len, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nlmsg_validate_deprecated - validate a netlink message including attributes
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Validates the attribute area of the message against @policy in liberal
 * mode (NL_VALIDATE_LIBERAL).
 *
 * Returns: -EINVAL, with an extack message, when the header's nlmsg_len is
 * smaller than nlmsg_msg_size(@hdrlen); otherwise the result of
 * __nla_validate().
 */
static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
                                            int hdrlen, int maxtype,
                                            const struct nla_policy *policy,
                                            struct netlink_ext_ack *extack)
{
        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
                /* Report the short header the same way __nlmsg_parse() does. */
                NL_SET_ERR_MSG(extack, "Invalid header length");
                return -EINVAL;
        }

        return __nla_validate(nlmsg_attrdata(nlh, hdrlen),
                              nlmsg_attrlen(nlh, hdrlen), maxtype,
                              policy, NL_VALIDATE_LIBERAL, extack);
}



/**
 * nlmsg_report - does the application want a report back?
 * @nlh: netlink message header, may be NULL
 *
 * Returns: 1 if @nlh is non-NULL and has NLM_F_ECHO set in nlmsg_flags,
 * 0 otherwise.
 */
static inline int nlmsg_report(const struct nlmsghdr *nlh)
{
        if (!nlh)
                return 0;

        return (nlh->nlmsg_flags & NLM_F_ECHO) != 0;
}

/**
 * nlmsg_seq - sequence number of a netlink message
 * @nlh: netlink message header, may be NULL
 *
 * Returns: the header's nlmsg_seq, or 0 if @nlh is NULL
 */
static inline u32 nlmsg_seq(const struct nlmsghdr *nlh)
{
        if (!nlh)
                return 0;

        return nlh->nlmsg_seq;
}

/**
 * nlmsg_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_attr() over the attribute area located with
 * nlmsg_attrdata() and sized with nlmsg_attrlen(). @nlh and @hdrlen each
 * appear twice in the expansion, so pass expressions without side effects.
 */
#define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
        nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \
                          nlmsg_attrlen(nlh, hdrlen), rem)

/**
 * nlmsg_for_each_attr_type - iterate over a stream of attributes
 * @pos: loop counter, set to the current attribute
 * @type: required attribute type for @pos
 * @nlh: netlink message header
 * @hdrlen: length of the family specific header
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Like nlmsg_for_each_attr(), but the loop body only runs for attributes
 * whose nla_type() equals @type. The expansion ends in an unbraced "if", so
 * an "else" written directly after the loop body would pair with that "if".
 */
#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \
        nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
                if (nla_type(pos) == (type))

/**
 * nlmsg_put - Add a new netlink message to an skb
 * @skb: socket buffer to store message in
 * @portid: netlink PORTID of requesting application
 * @seq: sequence number of message
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Returns: NULL if the tailroom of the skb is smaller than
 * nlmsg_total_size(@payload), i.e. insufficient to store the message header
 * and payload; otherwise the result of __nlmsg_put().
 */
static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                                         int type, int payload, int flags)
{
        const int needed = nlmsg_total_size(payload);

        if (unlikely(skb_tailroom(skb) < needed))
                return NULL;

        return __nlmsg_put(skb, portid, seq, type, payload, flags);
}

/**
 * nlmsg_append - Add more data to a nlmsg in a skb
 * @skb: socket buffer to store message in
 * @size: length of message payload
 *
 * Append data to an existing nlmsg, used when constructing a message
 * with multiple fixed-format headers (which is rare). The skb is extended
 * by NLMSG_ALIGN(@size) bytes; the bytes between @size and the aligned
 * length are zeroed.
 *
 * Returns: NULL if the tailroom of the skb is insufficient to store
 * the extra payload, or if @size is so large that its aligned length
 * wraps around; otherwise the result of __skb_put().
 */
static inline void *nlmsg_append(struct sk_buff *skb, u32 size)
{
        const u32 aligned = NLMSG_ALIGN(size);
        const u32 pad = aligned - size;

        /*
         * An aligned length below @size means the alignment arithmetic
         * wrapped; going on would pass the tailroom check with a bogus
         * small length and zero memory far beyond the tail.
         */
        if (unlikely(aligned < size))
                return NULL;

        if (unlikely(skb_tailroom(skb) < aligned))
                return NULL;

        if (pad)
                memset(skb_tail_pointer(skb) + size, 0, pad);
        return __skb_put(skb, aligned);
}

/**
 * nlmsg_put_answer - Add a new callback based netlink message to an skb
 * @skb: socket buffer to store message in
 * @cb: netlink callback
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Calls nlmsg_put() with the portid taken from NETLINK_CB() of @cb's skb and
 * the sequence number taken from @cb's netlink header.
 *
 * Returns: NULL if the tailroom of the skb is insufficient to store
 * the message header and payload.
 */
static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb,
                                                struct netlink_callback *cb,
                                                int type, int payload,
                                                int flags)
{
        struct nlmsghdr *answer;

        answer = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                           cb->nlh->nlmsg_seq, type, payload, flags);

        return answer;
}

/**
 * nlmsg_new - Allocate a new netlink message
 * @payload: size of the message payload
 * @flags: the type of memory to allocate.
 *
 * Allocates an skb of nlmsg_total_size(@payload) bytes with alloc_skb().
 * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known
 * and a good default is needed.
 */
static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags)
{
        const int total = nlmsg_total_size(payload);

        return alloc_skb(total, flags);
}

/**
 * nlmsg_new_large - Allocate a new netlink message with non-contiguous
 * physical memory
 * @payload: size of the message payload
 *
 * Allocates nlmsg_total_size(@payload) bytes with netlink_alloc_large_skb().
 *
 * The allocated skb is unable to have frag page for shinfo->frags*,
 * as the NULL setting for skb->head in netlink_skb_destructor() will
 * bypass most of the handling in skb_release_data()
 */
static inline struct sk_buff *nlmsg_new_large(size_t payload)
{
        const int total = nlmsg_total_size(payload);

        return netlink_alloc_large_skb(total, 0);
}

/**
 * nlmsg_end - Finalize a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Sets the header's nlmsg_len to the distance between @nlh and the current
 * tail of @skb, so that it covers the appended attributes. Only necessary
 * if attributes have been added to the message.
 */
static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        unsigned char *start = (unsigned char *)nlh;

        nlh->nlmsg_len = skb_tail_pointer(skb) - start;
}

/**
 * nlmsg_get_pos - current position in a netlink message
 * @skb: socket buffer the message is stored in
 *
 * Returns: a pointer to the current tail of the message, as reported by
 * skb_tail_pointer().
 */
static inline void *nlmsg_get_pos(struct sk_buff *skb)
{
        void *tail = skb_tail_pointer(skb);

        return tail;
}

/**
 * nlmsg_trim - Trim message to a mark
 * @skb: socket buffer the message is stored in
 * @mark: mark to trim to, may be NULL
 *
 * Trims the message to the provided mark. Nothing happens when @mark is
 * NULL; a mark located before skb->data triggers a warning.
 */
static inline void nlmsg_trim(struct sk_buff *skb, const void *mark)
{
        if (!mark)
                return;

        WARN_ON((unsigned char *) mark < skb->data);
        skb_trim(skb, (unsigned char *) mark - skb->data);
}

/**
 * nlmsg_cancel - Cancel construction of a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Removes the complete netlink message including all
 * attributes from the socket buffer again, by trimming @skb back to @nlh
 * with nlmsg_trim().
 */
static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        nlmsg_trim(skb, nlh);
}

/**
 * nlmsg_free - drop a netlink message
 * @skb: socket buffer of netlink message
 *
 * Releases @skb with kfree_skb().
 */
static inline void nlmsg_free(struct sk_buff *skb)
{
        kfree_skb(skb);
}

/**
 * nlmsg_consume - free a netlink message
 * @skb: socket buffer of netlink message
 *
 * Releases @skb with consume_skb().
 */
static inline void nlmsg_consume(struct sk_buff *skb)
{
        consume_skb(skb);
}

/**
 * nlmsg_multicast_filtered - multicast a netlink message with filter function
 * @sk: netlink socket to spread messages to
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: multicast group id
 * @flags: allocation flags
 * @filter: filter function
 * @filter_data: filter function private data
 *
 * Records @group as the destination group in NETLINK_CB() of @skb and then
 * sends it with netlink_broadcast_filtered(). A positive result from that
 * call is reported as 0.
 *
 * Return: 0 on success, negative error code for failure.
 */
static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb,
                                           u32 portid, unsigned int group,
                                           gfp_t flags,
                                           netlink_filter_fn filter,
                                           void *filter_data)
{
        int rc;

        NETLINK_CB(skb).dst_group = group;

        rc = netlink_broadcast_filtered(sk, skb, portid, group, flags,
                                        filter, filter_data);

        return rc > 0 ? 0 : rc;
}

/**
 * nlmsg_multicast - multicast a netlink message
 * @sk: netlink socket to spread messages to
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: multicast group id
 * @flags: allocation flags
 *
 * Returns: the result of nlmsg_multicast_filtered() called without a filter
 * function or filter data.
 */
static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
                                  u32 portid, unsigned int group, gfp_t flags)
{
        int err;

        err = nlmsg_multicast_filtered(sk, skb, portid, group, flags,
                                       NULL, NULL);

        return err;
}

/**
 * nlmsg_unicast - unicast a netlink message
 * @sk: netlink socket to spread message to
 * @skb: netlink message as socket buffer
 * @portid: netlink portid of the destination socket
 *
 * Sends @skb with netlink_unicast() using MSG_DONTWAIT.
 *
 * Returns: 0 when netlink_unicast() returns a positive value, otherwise the
 * value netlink_unicast() returned.
 */
static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid)
{
        int rc;

        rc = netlink_unicast(sk, skb, portid, MSG_DONTWAIT);

        return rc > 0 ? 0 : rc;
}

/**
 * nlmsg_for_each_msg - iterate over a stream of messages
 * @pos: loop counter, set to current message
 * @head: head of message stream
 * @len: length of message stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to a for loop that starts at @head, continues while
 * nlmsg_ok(@pos, @rem) holds and advances with nlmsg_next(). @pos and @rem
 * are expanded several times and are assigned to, so they must be plain
 * lvalues.
 */
#define nlmsg_for_each_msg(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nlmsg_ok(pos, rem); \
             pos = nlmsg_next(pos, &(rem)))

/**
 * nl_dump_check_consistent - check if sequence is consistent and advertise if not
 * @cb: netlink callback structure that stores the sequence number
 * @nlh: netlink message header to write the flag to
 *
 * Compares cb->seq with the value remembered in cb->prev_seq. If a previous
 * value was recorded and the sequence (generation) number differs from it,
 * NLM_F_DUMP_INTR is set in the flags of @nlh. cb->prev_seq is then updated
 * to cb->seq.
 *
 * The correct way to use it is to set cb->seq to the generation counter when
 * all locks for dumping have been acquired, and then call this function for
 * each message that is generated.
 *
 * Note that due to initialisation concerns, 0 is an invalid sequence number
 * and must not be used by code that uses this functionality.
 */
static inline void
nl_dump_check_consistent(struct netlink_callback *cb,
                         struct nlmsghdr *nlh)
{
        /* prev_seq == 0 means nothing has been recorded yet. */
        if (cb->prev_seq) {
                if (cb->prev_seq != cb->seq)
                        nlh->nlmsg_flags = nlh->nlmsg_flags | NLM_F_DUMP_INTR;
        }

        cb->prev_seq = cb->seq;
}

/**************************************************************************
 * Netlink Attributes
 **************************************************************************/

/**
 * nla_attr_size - size of an attribute without tail padding
 * @payload: length of payload
 *
 * Returns: NLA_HDRLEN plus @payload.
 */
static inline int nla_attr_size(int payload)
{
        const int unpadded = NLA_HDRLEN + payload;

        return unpadded;
}

/**
 * nla_total_size - size of an attribute including tail padding
 * @payload: length of payload
 *
 * Returns: nla_attr_size() of @payload passed through NLA_ALIGN().
 */
static inline int nla_total_size(int payload)
{
        const int unpadded = nla_attr_size(payload);

        return NLA_ALIGN(unpadded);
}

/**
 * nla_padlen - number of padding bytes at the tail of an attribute
 * @payload: length of payload
 *
 * Returns: the difference between nla_total_size() and nla_attr_size()
 * for @payload.
 */
static inline int nla_padlen(int payload)
{
        const int padded = nla_total_size(payload);
        const int unpadded = nla_attr_size(payload);

        return padded - unpadded;
}

/**
 * nla_type - attribute type
 * @nla: netlink attribute
 *
 * Returns: the attribute's nla_type field masked with NLA_TYPE_MASK.
 */
static inline int nla_type(const struct nlattr *nla)
{
        const int type = nla->nla_type & NLA_TYPE_MASK;

        return type;
}

/**
 * nla_data - start of the attribute payload
 * @nla: netlink attribute
 *
 * Returns: the address NLA_HDRLEN bytes past @nla.
 */
static inline void *nla_data(const struct nlattr *nla)
{
        char *start = (char *) nla;

        return start + NLA_HDRLEN;
}

/**
 * nla_len - length of the attribute payload
 * @nla: netlink attribute
 *
 * Returns: the attribute's nla_len field minus NLA_HDRLEN.
 */
static inline u16 nla_len(const struct nlattr *nla)
{
        const u16 payload_len = nla->nla_len - NLA_HDRLEN;

        return payload_len;
}

/**
 * nla_ok - check whether a netlink attribute fits into the remaining bytes
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Returns: nonzero when @remaining can hold an attribute header and the
 * attribute's nla_len is at least the header size and at most @remaining,
 * 0 otherwise.
 */
static inline int nla_ok(const struct nlattr *nla, int remaining)
{
        /* Not even room for a header: do not look at @nla at all. */
        if (remaining < (int) sizeof(*nla))
                return 0;

        if (nla->nla_len < sizeof(*nla))
                return 0;

        return nla->nla_len <= remaining;
}

/**
 * nla_next - advance to the next netlink attribute in an attribute stream
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Returns: the attribute that follows @nla in the stream; @remaining is
 * decremented by NLA_ALIGN() of the current attribute's nla_len.
 */
static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{
        unsigned int step = NLA_ALIGN(nla->nla_len);
        char *cur = (char *) nla;

        *remaining = *remaining - step;
        return (struct nlattr *) (cur + step);
}

/**
 * nla_find_nested - look up an attribute in a set of nested attributes
 * @nla: attribute containing the nested attributes
 * @attrtype: type of attribute to look for
 *
 * Searches the payload of @nla with nla_find().
 *
 * Returns: the first attribute which matches the specified type.
 */
static inline struct nlattr *
nla_find_nested(const struct nlattr *nla, int attrtype)
{
        const struct nlattr *nested = nla_data(nla);
        const int nested_len = nla_len(nla);

        return nla_find(nested, nested_len, attrtype);
}

/**
 * nla_parse_nested - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Fails with -EINVAL, and an extack message, when @nla does not carry
 * NLA_F_NESTED. Otherwise the payload of @nla is parsed with
 * NL_VALIDATE_STRICT. See nla_parse()
 */
static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
                                   const struct nlattr *nla,
                                   const struct nla_policy *policy,
                                   struct netlink_ext_ack *extack)
{
        if (nla->nla_type & NLA_F_NESTED)
                return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla),
                                   policy, NL_VALIDATE_STRICT, extack);

        NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing");
        return -EINVAL;
}

/**
 * nla_parse_nested_deprecated - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Parses the payload of @nla with NL_VALIDATE_LIBERAL.
 * See nla_parse_deprecated()
 */
static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype,
                                              const struct nlattr *nla,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const struct nlattr *nested = nla_data(nla);
        const int nested_len = nla_len(nla);

        return __nla_parse(tb, maxtype, nested, nested_len, policy,
                           NL_VALIDATE_LIBERAL, extack);
}

/**
 * nla_put_u8 - Add a u8 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value)
{
        /* temporary variables to work around GCC PR81715 with asan-stack=1 */
        u8 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_u16 - Add a u16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        u16 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        __be16 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be16(), but with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 *
 * Return: the value returned by nla_put_be16().
 */
static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be16 payload = value;

        return nla_put_be16(skb, flagged_type, payload);
}

/**
 * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        __le16 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_u32 - Add a u32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        u32 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_uint - Add a variable-size unsigned int to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Values that survive truncation to u32 unchanged are emitted as a u32
 * attribute; everything else is emitted as a u64 payload via nla_put().
 *
 * Return: the value returned by nla_put_u32() or nla_put().
 */
static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value)
{
        u64 wide = value;
        u32 narrow = value;

        if (wide != narrow)
                return nla_put(skb, attrtype, sizeof(wide), &wide);
        return nla_put_u32(skb, attrtype, narrow);
}

/**
 * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        __be32 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be32(), but with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 *
 * Return: the value returned by nla_put_be32().
 */
static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be32 payload = value;

        return nla_put_be32(skb, flagged_type, payload);
}

/**
 * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        __le32 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Return: the value returned by nla_put_64bit().
 */
static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
                                    u64 value, int padattr)
{
        /* Local copy so that a stack address can be handed on. */
        u64 payload = value;

        return nla_put_64bit(skb, attrtype, sizeof(payload), &payload,
                             padattr);
}

/**
 * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Return: the value returned by nla_put_64bit().
 */
static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
                               int padattr)
{
        /* Local copy so that a stack address can be handed on. */
        __be64 payload = value;

        return nla_put_64bit(skb, attrtype, sizeof(payload), &payload,
                             padattr);
}

/**
 * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Same as nla_put_be64(), but with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 *
 * Return: the value returned by nla_put_be64().
 */
static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
                                int padattr)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be64 payload = value;

        return nla_put_be64(skb, flagged_type, payload, padattr);
}

/**
 * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Return: the value returned by nla_put_64bit().
 */
static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
                               int padattr)
{
        /* Local copy so that a stack address can be handed on. */
        __le64 payload = value;

        return nla_put_64bit(skb, attrtype, sizeof(payload), &payload,
                             padattr);
}

/**
 * nla_put_s8 - Add a s8 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        s8 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_s16 - Add a s16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        s16 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_s32 - Add a s32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value)
{
        /* Local copy so that a stack address can be handed to nla_put(). */
        s32 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Return: the value returned by nla_put_64bit().
 */
static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
                              int padattr)
{
        /* Local copy so that a stack address can be handed on. */
        s64 payload = value;

        return nla_put_64bit(skb, attrtype, sizeof(payload), &payload,
                             padattr);
}

/**
 * nla_put_sint - Add a variable-size signed int to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Values that survive truncation to s32 unchanged are emitted as an s32
 * attribute; everything else is emitted as an s64 payload via nla_put().
 *
 * Return: the value returned by nla_put_s32() or nla_put().
 */
static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value)
{
        s64 wide = value;
        s32 narrow = value;

        if (wide != narrow)
                return nla_put(skb, attrtype, sizeof(wide), &wide);
        return nla_put_s32(skb, attrtype, narrow);
}

/**
 * nla_put_string - Add a string netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @str: NUL terminated string
 *
 * The payload length is strlen(@str) + 1, i.e. the terminating NUL is part
 * of the attribute.
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_string(struct sk_buff *skb, int attrtype,
                                 const char *str)
{
        size_t len_with_nul = strlen(str) + 1;

        return nla_put(skb, attrtype, len_with_nul, str);
}

/**
 * nla_put_flag - Add a flag netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 *
 * A flag attribute carries no payload: it is emitted with a length of zero
 * and a NULL data pointer.
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_flag(struct sk_buff *skb, int attrtype)
{
        return nla_put(skb, attrtype, 0, NULL);
}

/**
 * nla_put_msecs - Add a msecs netlink attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @njiffies: number of jiffies to convert to msecs
 * @padattr: attribute type for the padding
 *
 * The jiffies count is converted with jiffies_to_msecs() and emitted as a
 * u64 payload.
 *
 * Return: the value returned by nla_put_64bit().
 */
static inline int nla_put_msecs(struct sk_buff *skb, int attrtype,
                                unsigned long njiffies, int padattr)
{
        u64 msecs = jiffies_to_msecs(njiffies);

        return nla_put_64bit(skb, attrtype, sizeof(msecs), &msecs, padattr);
}

/**
 * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv4 address
 *
 * Return: the value returned by nla_put_be32().
 */
static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype,
                                  __be32 addr)
{
        __be32 ipv4 = addr;

        return nla_put_be32(skb, attrtype, ipv4);
}

/**
 * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv6 address
 */
static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype,
                                   const struct in6_addr *addr)
{
        return nla_put(skb, attrtype, sizeof(*addr), addr);
}

/**
 * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: value carrying bits
 * @selector: selector of valid bits
 *
 * Return: the value returned by nla_put().
 */
static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype,
                                     __u32 value, __u32 selector)
{
        struct nla_bitfield32 bf = {
                .value = value,
                .selector = selector,
        };

        return nla_put(skb, attrtype, sizeof(bf), &bf);
}

/**
 * nla_get_u32 - return payload of u32 attribute
 * @nla: u32 netlink attribute
 *
 * Return: the u32 read from nla_data(@nla).
 */
static inline u32 nla_get_u32(const struct nlattr *nla)
{
        const u32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u32_default - return payload of u32 attribute or default
 * @nla: u32 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue)
{
        if (nla)
                return nla_get_u32(nla);
        return defvalue;
}

/**
 * nla_get_be32 - return payload of __be32 attribute
 * @nla: __be32 netlink attribute
 *
 * Return: the __be32 read from nla_data(@nla).
 */
static inline __be32 nla_get_be32(const struct nlattr *nla)
{
        const __be32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be32_default - return payload of __be32 attribute or default
 * @nla: __be32 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __be32 nla_get_be32_default(const struct nlattr *nla,
                                          __be32 defvalue)
{
        if (nla)
                return nla_get_be32(nla);
        return defvalue;
}

/**
 * nla_get_le32 - return payload of __le32 attribute
 * @nla: __le32 netlink attribute
 *
 * Return: the __le32 read from nla_data(@nla).
 */
static inline __le32 nla_get_le32(const struct nlattr *nla)
{
        const __le32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le32_default - return payload of __le32 attribute or default
 * @nla: __le32 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __le32 nla_get_le32_default(const struct nlattr *nla,
                                          __le32 defvalue)
{
        if (nla)
                return nla_get_le32(nla);
        return defvalue;
}

/**
 * nla_get_u16 - return payload of u16 attribute
 * @nla: u16 netlink attribute
 *
 * Return: the u16 read from nla_data(@nla).
 */
static inline u16 nla_get_u16(const struct nlattr *nla)
{
        const u16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u16_default - return payload of u16 attribute or default
 * @nla: u16 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue)
{
        if (nla)
                return nla_get_u16(nla);
        return defvalue;
}

/**
 * nla_get_be16 - return payload of __be16 attribute
 * @nla: __be16 netlink attribute
 *
 * Return: the __be16 read from nla_data(@nla).
 */
static inline __be16 nla_get_be16(const struct nlattr *nla)
{
        const __be16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be16_default - return payload of __be16 attribute or default
 * @nla: __be16 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __be16 nla_get_be16_default(const struct nlattr *nla,
                                          __be16 defvalue)
{
        if (nla)
                return nla_get_be16(nla);
        return defvalue;
}

/**
 * nla_get_le16 - return payload of __le16 attribute
 * @nla: __le16 netlink attribute
 *
 * Return: the __le16 read from nla_data(@nla).
 */
static inline __le16 nla_get_le16(const struct nlattr *nla)
{
        const __le16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le16_default - return payload of __le16 attribute or default
 * @nla: __le16 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __le16 nla_get_le16_default(const struct nlattr *nla,
                                          __le16 defvalue)
{
        if (nla)
                return nla_get_le16(nla);
        return defvalue;
}

/**
 * nla_get_u8 - return payload of u8 attribute
 * @nla: u8 netlink attribute
 *
 * Return: the u8 read from nla_data(@nla).
 */
static inline u8 nla_get_u8(const struct nlattr *nla)
{
        const u8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u8_default - return payload of u8 attribute or default
 * @nla: u8 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue)
{
        if (nla)
                return nla_get_u8(nla);
        return defvalue;
}

/**
 * nla_get_u64 - return payload of u64 attribute
 * @nla: u64 netlink attribute
 *
 * The payload is copied out with nla_memcpy() rather than dereferenced
 * in place.
 *
 * Return: the copied u64 value.
 */
static inline u64 nla_get_u64(const struct nlattr *nla)
{
        u64 payload;

        nla_memcpy(&payload, nla, sizeof(payload));
        return payload;
}

/**
 * nla_get_u64_default - return payload of u64 attribute or default
 * @nla: u64 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue)
{
        if (nla)
                return nla_get_u64(nla);
        return defvalue;
}

/**
 * nla_get_uint - return payload of uint attribute
 * @nla: uint netlink attribute
 *
 * A payload whose length equals sizeof(u32) is read as a u32; any other
 * length is handed to nla_get_u64().
 *
 * Return: the attribute value widened to u64.
 */
static inline u64 nla_get_uint(const struct nlattr *nla)
{
        if (nla_len(nla) != sizeof(u32))
                return nla_get_u64(nla);
        return nla_get_u32(nla);
}

/**
 * nla_get_uint_default - return payload of uint attribute or default
 * @nla: uint netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue)
{
        if (nla)
                return nla_get_uint(nla);
        return defvalue;
}

/**
 * nla_get_be64 - return payload of __be64 attribute
 * @nla: __be64 netlink attribute
 *
 * The payload is copied out with nla_memcpy() rather than dereferenced
 * in place.
 *
 * Return: the copied __be64 value.
 */
static inline __be64 nla_get_be64(const struct nlattr *nla)
{
        __be64 payload;

        nla_memcpy(&payload, nla, sizeof(payload));
        return payload;
}

/**
 * nla_get_be64_default - return payload of __be64 attribute or default
 * @nla: __be64 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __be64 nla_get_be64_default(const struct nlattr *nla,
                                          __be64 defvalue)
{
        if (nla)
                return nla_get_be64(nla);
        return defvalue;
}

/**
 * nla_get_le64 - return payload of __le64 attribute
 * @nla: __le64 netlink attribute
 *
 * The payload is copied out with nla_memcpy(), the same way nla_get_u64()
 * and nla_get_be64() do it, instead of being dereferenced through a cast
 * pointer. That way the 64-bit read does not depend on the payload being
 * 8-byte aligned.
 *
 * Return: the copied __le64 value.
 */
static inline __le64 nla_get_le64(const struct nlattr *nla)
{
        __le64 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));

        return tmp;
}

/**
 * nla_get_le64_default - return payload of __le64 attribute or default
 * @nla: __le64 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __le64 nla_get_le64_default(const struct nlattr *nla,
                                          __le64 defvalue)
{
        if (nla)
                return nla_get_le64(nla);
        return defvalue;
}

/**
 * nla_get_s32 - return payload of s32 attribute
 * @nla: s32 netlink attribute
 *
 * Return: the s32 read from nla_data(@nla).
 */
static inline s32 nla_get_s32(const struct nlattr *nla)
{
        const s32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s32_default - return payload of s32 attribute or default
 * @nla: s32 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue)
{
        if (nla)
                return nla_get_s32(nla);
        return defvalue;
}

/**
 * nla_get_s16 - return payload of s16 attribute
 * @nla: s16 netlink attribute
 *
 * Return: the s16 read from nla_data(@nla).
 */
static inline s16 nla_get_s16(const struct nlattr *nla)
{
        const s16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s16_default - return payload of s16 attribute or default
 * @nla: s16 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue)
{
        if (nla)
                return nla_get_s16(nla);
        return defvalue;
}

/**
 * nla_get_s8 - return payload of s8 attribute
 * @nla: s8 netlink attribute
 *
 * Return: the s8 read from nla_data(@nla).
 */
static inline s8 nla_get_s8(const struct nlattr *nla)
{
        const s8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s8_default - return payload of s8 attribute or default
 * @nla: s8 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue)
{
        if (nla)
                return nla_get_s8(nla);
        return defvalue;
}

/**
 * nla_get_s64 - return payload of s64 attribute
 * @nla: s64 netlink attribute
 *
 * The payload is copied out with nla_memcpy() rather than dereferenced
 * in place.
 *
 * Return: the copied s64 value.
 */
static inline s64 nla_get_s64(const struct nlattr *nla)
{
        s64 payload;

        nla_memcpy(&payload, nla, sizeof(payload));
        return payload;
}

/**
 * nla_get_s64_default - return payload of s64 attribute or default
 * @nla: s64 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue)
{
        if (nla)
                return nla_get_s64(nla);
        return defvalue;
}

/**
 * nla_get_sint - return payload of sint attribute
 * @nla: sint netlink attribute
 *
 * A payload whose length equals sizeof(s32) is read as an s32; any other
 * length is handed to nla_get_s64().
 *
 * Return: the attribute value widened to s64.
 */
static inline s64 nla_get_sint(const struct nlattr *nla)
{
        if (nla_len(nla) != sizeof(s32))
                return nla_get_s64(nla);
        return nla_get_s32(nla);
}

/**
 * nla_get_sint_default - return payload of sint attribute or default
 * @nla: sint netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue)
{
        if (nla)
                return nla_get_sint(nla);
        return defvalue;
}

/**
 * nla_get_flag - return payload of flag attribute
 * @nla: flag netlink attribute, may be %NULL
 *
 * A flag has no payload of its own; only the presence of the attribute is
 * reported.
 *
 * Return: 1 if @nla is non-NULL, 0 otherwise.
 */
static inline int nla_get_flag(const struct nlattr *nla)
{
        return nla != NULL;
}

/**
 * nla_get_msecs - return payload of msecs attribute
 * @nla: msecs netlink attribute
 *
 * Reads the attribute as a u64 millisecond count and converts it with
 * msecs_to_jiffies().
 *
 * Returns: the number of milliseconds in jiffies.
 */
static inline unsigned long nla_get_msecs(const struct nlattr *nla)
{
        return msecs_to_jiffies((unsigned long) nla_get_u64(nla));
}

/**
 * nla_get_msecs_default - return payload of msecs attribute or default
 * @nla: msecs netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * @defvalue is returned as is; only a present attribute goes through
 * nla_get_msecs().
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline unsigned long nla_get_msecs_default(const struct nlattr *nla,
                                                  unsigned long defvalue)
{
        if (nla)
                return nla_get_msecs(nla);
        return defvalue;
}

/**
 * nla_get_in_addr - return payload of IPv4 address attribute
 * @nla: IPv4 address netlink attribute
 *
 * Return: the __be32 read from nla_data(@nla).
 */
static inline __be32 nla_get_in_addr(const struct nlattr *nla)
{
        const __be32 *ipv4 = nla_data(nla);

        return *ipv4;
}

/**
 * nla_get_in_addr_default - return payload of IPv4 address attribute or default
 * @nla: IPv4 address netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __be32 nla_get_in_addr_default(const struct nlattr *nla,
                                             __be32 defvalue)
{
        if (nla)
                return nla_get_in_addr(nla);
        return defvalue;
}

/**
 * nla_get_in6_addr - return payload of IPv6 address attribute
 * @nla: IPv6 address netlink attribute
 */
static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
{
        struct in6_addr tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_get_bitfield32 - return payload of 32 bitfield attribute
 * @nla: nla_bitfield32 attribute
 */
static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
{
        struct nla_bitfield32 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_memdup - duplicate attribute memory (kmemdup)
 * @src: netlink attribute to duplicate from
 * @gfp: GFP mask
 *
 * Duplicates nla_len(@src) bytes starting at nla_data(@src). Callers use the
 * nla_memdup() macro, which wraps the _noprof variant in alloc_hooks().
 *
 * Return: the pointer returned by kmemdup_noprof().
 */
static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp)
{
        const void *payload = nla_data(src);

        return kmemdup_noprof(payload, nla_len(src), gfp);
}
#define nla_memdup(...)        alloc_hooks(nla_memdup_noprof(__VA_ARGS__))

/**
 * nla_nest_start_noflag - Start a new level of nested attributes
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * This function exists for backward compatibility to use in APIs which never
 * marked their nest attributes with NLA_F_NESTED flag. New APIs should use
 * nla_nest_start() which sets the flag.
 *
 * The container is emitted as an empty attribute at the current tail of
 * @skb; that tail position is what gets returned.
 *
 * Returns: the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
                                                   int attrtype)
{
        /* Must be sampled before nla_put() appends the container header. */
        struct nlattr *nest = (struct nlattr *)skb_tail_pointer(skb);

        return nla_put(skb, attrtype, 0, NULL) < 0 ? NULL : nest;
}

/**
 * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED
 * flag. This is the preferred function to use in new code.
 *
 * The flag is or'ed into @attrtype before the call is forwarded to
 * nla_nest_start_noflag().
 *
 * Returns: the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
{
        return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED);
}

/**
 * nla_nest_end - Finalize nesting of attributes
 * @skb: socket buffer the attributes are stored in
 * @start: container attribute
 *
 * Corrects the container attribute header to include all appended
 * attributes: its length becomes the distance from @start to the current
 * tail of @skb.
 *
 * Returns: the total data length of the skb.
 */
static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
{
        unsigned char *nest_head = (unsigned char *)start;

        start->nla_len = skb_tail_pointer(skb) - nest_head;
        return skb->len;
}

/**
 * nla_nest_cancel - Cancel nesting of attributes
 * @skb: socket buffer the message is stored in
 * @start: container attribute
 *
 * Removes the container attribute, including all nested attributes, by
 * trimming the message back to @start with nlmsg_trim().
 *
 * This function does not return a value.
 */
static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
{
        nlmsg_trim(skb, start);
}

/**
 * nla_put_empty_nest - Create an empty nest
 * @skb: socket buffer the message is stored in
 * @attrtype: attribute type of the container
 *
 * Starts a nest with nla_nest_start() and adds nothing to it.
 *
 * Returns: 0 when successful or -EMSGSIZE on failure.
 */
static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype)
{
        if (!nla_nest_start(skb, attrtype))
                return -EMSGSIZE;

        return 0;
}

/**
 * __nla_validate_nested - Validate a stream of nested attributes
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Validates all attributes in the nested attribute stream against the
 * specified policy. Attributes with a type exceeding maxtype will be
 * ignored. See documentation of struct nla_policy for more details.
 *
 * The stream that is checked is the payload of @start, as given by
 * nla_data() and nla_len().
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
                                        const struct nla_policy *policy,
                                        unsigned int validate,
                                        struct netlink_ext_ack *extack)
{
        const struct nlattr *head = nla_data(start);
        int len = nla_len(start);

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/* Validate a nested attribute stream with NL_VALIDATE_STRICT. */
static inline int
nla_validate_nested(const struct nlattr *start, int maxtype,
                    const struct nla_policy *policy,
                    struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_STRICT;

        return __nla_validate_nested(start, maxtype, policy, strictness,
                                     extack);
}

/* Validate a nested attribute stream with NL_VALIDATE_LIBERAL. */
static inline int
nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_LIBERAL;

        return __nla_validate_nested(start, maxtype, policy, strictness,
                                     extack);
}

/**
 * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute
 * @skb: socket buffer the message is stored in
 *
 * Always false when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined.
 *
 * Return: true if padding is needed to align the next attribute (nla_data()) to
 * a 64-bit aligned area.
 */
static inline bool nla_need_padding_for_64bit(struct sk_buff *skb)
{
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        unsigned long tail = (unsigned long)skb_tail_pointer(skb);

        /* The nlattr header is 4 bytes in size, that's why we test
         * if the tail of the skb _is_ aligned.  A NOP attribute, plus
         * nlattr header for next attribute, will make nla_data()
         * 8-byte aligned.
         */
        return IS_ALIGNED(tail, 8);
#else
        return false;
#endif
}

/**
 * nla_align_64bit - 64-bit align the nla_data() of next attribute
 * @skb: socket buffer the message is stored in
 * @padattr: attribute type for the padding
 *
 * Conditionally emit a padding netlink attribute in order to make
 * the next attribute we emit have a 64-bit aligned nla_data() area.
 * This will only be done in architectures which do not have
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined.
 *
 * The padding is an empty attribute of type @padattr obtained from
 * nla_reserve(); -EMSGSIZE is reported when that reservation fails.
 *
 * Returns: zero on success or a negative error code.
 */
static inline int nla_align_64bit(struct sk_buff *skb, int padattr)
{
        if (!nla_need_padding_for_64bit(skb))
                return 0;

        return nla_reserve(skb, padattr, 0) ? 0 : -EMSGSIZE;
}

/**
 * nla_total_size_64bit - total length of attribute including padding
 * @payload: length of payload
 *
 * Without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the size of one additional
 * empty attribute is added on top of the aligned attribute size.
 *
 * Return: the computed length.
 */
static inline int nla_total_size_64bit(int payload)
{
        int total = NLA_ALIGN(nla_attr_size(payload));

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        total += NLA_ALIGN(nla_attr_size(0));
#endif
        return total;
}

/**
 * nla_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to a for loop header: @pos and @rem are assigned once from @head
 * and @len, the loop runs while nla_ok(@pos, @rem) holds, and each step
 * advances with nla_next(). @pos and @rem must therefore be assignable
 * lvalues.
 */
#define nla_for_each_attr(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nla_ok(pos, rem); \
             pos = nla_next(pos, &(rem)))

/**
 * nla_for_each_attr_type - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @type: required attribute type for @pos
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_attr() followed by an unbraced if that filters on
 * nla_type(@pos), so the statement following the macro is the body of that
 * if. An else placed directly after that body would bind to the macro's if.
 * @type is parenthesized so that an expression argument is compared as a
 * whole.
 */
#define nla_for_each_attr_type(pos, type, head, len, rem) \
        nla_for_each_attr(pos, head, len, rem) \
                if (nla_type(pos) == (type))

/**
 * nla_for_each_nested - iterate over nested attributes
 * @pos: loop counter, set to current attribute
 * @nla: attribute containing the nested attributes
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Shorthand for nla_for_each_attr() over the payload of @nla, i.e. with
 * nla_data(@nla) as head and nla_len(@nla) as length.
 */
#define nla_for_each_nested(pos, nla, rem) \
        nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)

/**
 * nla_for_each_nested_type - iterate over nested attributes
 * @pos: loop counter, set to current attribute
 * @type: required attribute type for @pos
 * @nla: attribute containing the nested attributes
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_nested() followed by an unbraced if that filters
 * on nla_type(@pos), so the statement following the macro is the body of
 * that if. An else placed directly after that body would bind to the macro's
 * if. @type is parenthesized so that an expression argument is compared as a
 * whole.
 */
#define nla_for_each_nested_type(pos, type, nla, rem) \
        nla_for_each_nested(pos, nla, rem) \
                if (nla_type(pos) == (type))

/**
 * nla_is_last - Test if attribute is last in stream
 * @nla: attribute to test
 * @rem: bytes remaining in stream
 *
 * Return: true when the length recorded in @nla equals @rem.
 */
static inline bool nla_is_last(const struct nlattr *nla, int rem)
{
        const int attr_len = nla->nla_len;

        return attr_len == rem;
}

/*
 * Declarations only: the functions below are not defined in this part of
 * the header.
 */

/* Fill @range from policy entry @pt (unsigned and signed variants). */
void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range);
void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range);

/* Opaque here: this header only passes pointers to it around. */
struct netlink_policy_dump_state;

/* Policy dump interface operating on a netlink_policy_dump_state. */
int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate,
                                   const struct nla_policy *policy,
                                   unsigned int maxtype);
int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state,
                                       const struct nla_policy *policy,
                                       unsigned int maxtype);
bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state);
int netlink_policy_dump_write(struct sk_buff *skb,
                              struct netlink_policy_dump_state *state);
int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt);
int netlink_policy_dump_write_attr(struct sk_buff *skb,
                                   const struct nla_policy *pt,
                                   int nestattr);
void netlink_policy_dump_free(struct netlink_policy_dump_state *state);

#endif












































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct ptr_ring' datastructure.
 *
 *        Author:
 *                Michael S. Tsirkin <mst@redhat.com>
 *
 *        Copyright (C) 2016 Red Hat, Inc.
 *
 *        This is a limited-size FIFO maintaining pointers in FIFO order, with
 *        one CPU producing entries and another consuming entries from a FIFO.
 *
 *        This implementation tries to minimize cache-contention when there is a
 *        single producer and a single consumer CPU.
 */

#ifndef _LINUX_PTR_RING_H
#define _LINUX_PTR_RING_H 1

#ifdef __KERNEL__
#include <linux/spinlock.h>
#include <linux/cache.h>
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/errno.h>
#endif

/*
 * Ring of pointers. A NULL slot in @queue means "free"; a non-NULL slot holds
 * a queued entry. The producer index, the consumer indices and the shared
 * size/batch/queue group each start with a ____cacheline_aligned_in_smp
 * member.
 */
struct ptr_ring {
        int producer ____cacheline_aligned_in_smp;
        spinlock_t producer_lock;
        int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */
        int consumer_tail; /* next entry to invalidate */
        spinlock_t consumer_lock;
        /* Shared consumer/producer data */
        /* Read-only by both the producer and the consumer */
        int size ____cacheline_aligned_in_smp; /* max entries in queue */
        int batch; /* number of entries to consume in a batch */
        void **queue;
};

/* Test whether the slot the producer would fill next is still occupied.
 *
 * A zero-sized ring is reported as full: it has no slot that could be
 * inspected, and __ptr_ring_produce rejects every insertion into such a ring
 * with -ENOSPC.
 *
 * Note: callers invoking this in a loop must use a compiler barrier,
 * for example cpu_relax().
 *
 * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock:
 * see e.g. ptr_ring_full.
 */
static inline bool __ptr_ring_full(struct ptr_ring *r)
{
        /* Never index the queue of a zero-sized ring. */
        if (unlikely(!r->size))
                return true;

        return r->queue[r->producer];
}

/* Sample __ptr_ring_full() while holding producer_lock via spin_lock(). */
static inline bool ptr_ring_full(struct ptr_ring *r)
{
        bool full;

        spin_lock(&r->producer_lock);
        full = __ptr_ring_full(r);
        spin_unlock(&r->producer_lock);
        return full;
}

/* Sample __ptr_ring_full() while holding producer_lock via spin_lock_irq(). */
static inline bool ptr_ring_full_irq(struct ptr_ring *r)
{
        bool full;

        spin_lock_irq(&r->producer_lock);
        full = __ptr_ring_full(r);
        spin_unlock_irq(&r->producer_lock);
        return full;
}

/* Sample __ptr_ring_full() while holding producer_lock via
 * spin_lock_irqsave(); the saved flags are restored on unlock.
 */
static inline bool ptr_ring_full_any(struct ptr_ring *r)
{
        unsigned long irq_flags;
        bool full;

        spin_lock_irqsave(&r->producer_lock, irq_flags);
        full = __ptr_ring_full(r);
        spin_unlock_irqrestore(&r->producer_lock, irq_flags);
        return full;
}

/* Sample __ptr_ring_full() while holding producer_lock via spin_lock_bh(). */
static inline bool ptr_ring_full_bh(struct ptr_ring *r)
{
        bool full;

        spin_lock_bh(&r->producer_lock);
        full = __ptr_ring_full(r);
        spin_unlock_bh(&r->producer_lock);
        return full;
}

/* Store @ptr in the slot at the producer index and advance that index,
 * wrapping it to 0 once it reaches r->size.
 *
 * Returns 0 on success, or -ENOSPC when the ring has size 0 or the slot at
 * the producer index is still occupied (non-NULL).
 *
 * Note: callers invoking this in a loop must use a compiler barrier,
 * for example cpu_relax(). Callers must hold producer_lock.
 * Callers are responsible for making sure pointer that is being queued
 * points to a valid data.
 */
static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
        /* The size test comes first so a zero-sized queue is never indexed. */
        if (unlikely(!r->size) || r->queue[r->producer])
                return -ENOSPC;

        /* Make sure the pointer we are storing points to a valid data. */
        /* Pairs with the dependency ordering in __ptr_ring_consume. */
        smp_wmb();

        WRITE_ONCE(r->queue[r->producer++], ptr);
        if (unlikely(r->producer >= r->size))
                r->producer = 0;
        return 0;
}

/*
 * Queue @ptr with producer_lock taken via spin_lock(); the result is whatever
 * __ptr_ring_produce() returned.
 *
 * Note: resize (below) nests producer lock within consumer lock, so if you
 * consume in interrupt or BH context, you must disable interrupts/BH when
 * calling this.
 */
static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock(&r->producer_lock);
        return err;
}

/* Queue @ptr with producer_lock taken via spin_lock_irq(). */
static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock_irq(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_irq(&r->producer_lock);
        return err;
}

/* Queue @ptr with producer_lock taken via spin_lock_irqsave(); the saved
 * flags are restored on unlock.
 */
static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr)
{
        unsigned long irq_flags;
        int err;

        spin_lock_irqsave(&r->producer_lock, irq_flags);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_irqrestore(&r->producer_lock, irq_flags);
        return err;
}

/* Queue @ptr with producer_lock taken via spin_lock_bh(). */
static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock_bh(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_bh(&r->producer_lock);
        return err;
}

/* Return the entry at consumer_head without removing it; NULL when that slot
 * is empty or when the ring has size 0.
 */
static inline void *__ptr_ring_peek(struct ptr_ring *r)
{
        if (unlikely(!r->size))
                return NULL;

        return READ_ONCE(r->queue[r->consumer_head]);
}

/*
 * Test ring empty status without taking any locks.
 *
 * Returns true when the ring has size 0 or when the slot at consumer_head is
 * NULL.
 *
 * NB: This is only safe to call if ring is never resized.
 *
 * However, if some other CPU consumes ring entries at the same time, the value
 * returned is not guaranteed to be correct.
 *
 * In this case - to avoid incorrectly detecting the ring
 * as empty - the CPU consuming the ring entries is responsible
 * for either consuming all ring entries until the ring is empty,
 * or synchronizing with some other CPU and causing it to
 * re-test __ptr_ring_empty and/or consume the ring entries
 * after the synchronization point.
 *
 * Note: callers invoking this in a loop must use a compiler barrier,
 * for example cpu_relax().
 */
static inline bool __ptr_ring_empty(struct ptr_ring *r)
{
        if (likely(r->size))
                /* READ_ONCE matches the WRITE_ONCE updates of consumer_head. */
                return !r->queue[READ_ONCE(r->consumer_head)];
        return true;
}

/* Sample __ptr_ring_empty() while holding consumer_lock via spin_lock(). */
static inline bool ptr_ring_empty(struct ptr_ring *r)
{
        bool empty;

        spin_lock(&r->consumer_lock);
        empty = __ptr_ring_empty(r);
        spin_unlock(&r->consumer_lock);
        return empty;
}

/* Sample __ptr_ring_empty() while holding consumer_lock via spin_lock_irq(). */
static inline bool ptr_ring_empty_irq(struct ptr_ring *r)
{
        bool empty;

        spin_lock_irq(&r->consumer_lock);
        empty = __ptr_ring_empty(r);
        spin_unlock_irq(&r->consumer_lock);
        return empty;
}

/* Sample __ptr_ring_empty() while holding consumer_lock via
 * spin_lock_irqsave(); the saved flags are restored on unlock.
 */
static inline bool ptr_ring_empty_any(struct ptr_ring *r)
{
        unsigned long irq_flags;
        bool empty;

        spin_lock_irqsave(&r->consumer_lock, irq_flags);
        empty = __ptr_ring_empty(r);
        spin_unlock_irqrestore(&r->consumer_lock, irq_flags);
        return empty;
}

/* Sample __ptr_ring_empty() while holding consumer_lock via spin_lock_bh(). */
static inline bool ptr_ring_empty_bh(struct ptr_ring *r)
{
        bool empty;

        spin_lock_bh(&r->consumer_lock);
        empty = __ptr_ring_empty(r);
        spin_unlock_bh(&r->consumer_lock);
        return empty;
}

/* Zero entries from tail to specified head.
 *
 * Slots in [r->consumer_tail, consumer_head) are set to NULL, walking from
 * the highest index down, and r->consumer_tail is then set to @consumer_head.
 * Nothing is zeroed when @consumer_head is not above r->consumer_tail.
 *
 * NB: if consumer_head can be >= r->size need to fixup tail later.
 */
static inline void __ptr_ring_zero_tail(struct ptr_ring *r, int consumer_head)
{
        int head = consumer_head;

        /* Zero out entries in the reverse order: this way we touch the
         * cache line that producer might currently be reading the last;
         * producer won't make progress and touch other cache lines
         * besides the first one until we write out all entries.
         */
        while (likely(head > r->consumer_tail))
                r->queue[--head] = NULL;

        r->consumer_tail = consumer_head;
}

/* Advance consumer_head past the entry that was just peeked. The slot itself
 * is only cleared once a batch of entries has been consumed or the end of the
 * ring is reached.
 *
 * Must only be called after __ptr_ring_peek returned !NULL
 */
static inline void __ptr_ring_discard_one(struct ptr_ring *r)
{
        /* Fundamentally, what we want to do is update consumer
         * index and zero out the entry so producer can reuse it.
         * Doing it naively at each consume would be as simple as:
         *       consumer = r->consumer;
         *       r->queue[consumer++] = NULL;
         *       if (unlikely(consumer >= r->size))
         *               consumer = 0;
         *       r->consumer = consumer;
         * but that is suboptimal when the ring is full as producer is writing
         * out new entries in the same cache line.  Defer these updates until a
         * batch of entries has been consumed.
         */
        /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty
         * to work correctly.
         */
        int consumer_head = r->consumer_head + 1;

        /* Once we have processed enough entries invalidate them in
         * the ring all at once so producer can reuse their space in the ring.
         * We also do this when we reach end of the ring - not mandatory
         * but helps keep the implementation simple.
         */
        if (unlikely(consumer_head - r->consumer_tail >= r->batch ||
                     consumer_head >= r->size))
                __ptr_ring_zero_tail(r, consumer_head);

        /* Wrap around: both consumer indices restart at slot 0. */
        if (unlikely(consumer_head >= r->size)) {
                consumer_head = 0;
                r->consumer_tail = 0;
        }
        /* matching READ_ONCE in __ptr_ring_empty for lockless tests */
        WRITE_ONCE(r->consumer_head, consumer_head);
}

static inline void *__ptr_ring_consume(struct ptr_ring *r)
{
        void *ptr;

        /* The READ_ONCE in __ptr_ring_peek guarantees that anyone
         * accessing data through the pointer is up to date. Pairs
         * with smp_wmb in __ptr_ring_produce.
         */
        ptr = __ptr_ring_peek(r);
        if (ptr)
                __ptr_ring_discard_one(r);

        return ptr;
}

/* Consume up to @n entries into @array, stopping early at the first empty
 * slot. Returns the number of entries stored.
 */
static inline int __ptr_ring_consume_batched(struct ptr_ring *r,
                                             void **array, int n)
{
        int taken = 0;

        while (taken < n) {
                void *entry = __ptr_ring_consume(r);

                if (!entry)
                        break;
                array[taken++] = entry;
        }

        return taken;
}

/*
 * Consume one entry with consumer_lock taken via spin_lock(); returns what
 * __ptr_ring_consume() returned.
 *
 * Note: resize (below) nests producer lock within consumer lock, so if you
 * call this in interrupt or BH context, you must disable interrupts/BH when
 * producing.
 */
static inline void *ptr_ring_consume(struct ptr_ring *r)
{
        void *entry;

        spin_lock(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock(&r->consumer_lock);
        return entry;
}

/* Consume one entry with consumer_lock taken via spin_lock_irq(). */
static inline void *ptr_ring_consume_irq(struct ptr_ring *r)
{
        void *entry;

        spin_lock_irq(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock_irq(&r->consumer_lock);
        return entry;
}

/* Consume one entry with consumer_lock taken via spin_lock_irqsave(); the
 * saved flags are restored on unlock.
 */
static inline void *ptr_ring_consume_any(struct ptr_ring *r)
{
        unsigned long irq_flags;
        void *entry;

        spin_lock_irqsave(&r->consumer_lock, irq_flags);
        entry = __ptr_ring_consume(r);
        spin_unlock_irqrestore(&r->consumer_lock, irq_flags);
        return entry;
}

/* Consume one entry with consumer_lock taken via spin_lock_bh(). */
static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
{
        void *entry;

        spin_lock_bh(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock_bh(&r->consumer_lock);
        return entry;
}

/* Batched consume with consumer_lock taken via spin_lock(); returns the count
 * reported by __ptr_ring_consume_batched().
 */
static inline int ptr_ring_consume_batched(struct ptr_ring *r,
                                           void **array, int n)
{
        int taken;

        spin_lock(&r->consumer_lock);
        taken = __ptr_ring_consume_batched(r, array, n);
        spin_unlock(&r->consumer_lock);
        return taken;
}

/* Batched consume with consumer_lock taken via spin_lock_irq(). */
static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r,
                                               void **array, int n)
{
        int taken;

        spin_lock_irq(&r->consumer_lock);
        taken = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_irq(&r->consumer_lock);
        return taken;
}

/* Batched consume with consumer_lock taken via spin_lock_irqsave(); the saved
 * flags are restored on unlock.
 */
static inline int ptr_ring_consume_batched_any(struct ptr_ring *r,
                                               void **array, int n)
{
        unsigned long irq_flags;
        int taken;

        spin_lock_irqsave(&r->consumer_lock, irq_flags);
        taken = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_irqrestore(&r->consumer_lock, irq_flags);
        return taken;
}

/* Batched consume with consumer_lock taken via spin_lock_bh(). */
static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
                                              void **array, int n)
{
        int taken;

        spin_lock_bh(&r->consumer_lock);
        taken = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_bh(&r->consumer_lock);
        return taken;
}

/* Cast to structure type and call a function without discarding from FIFO.
 * Function must return a value.
 * Callers must take consumer_lock.
 *
 * @f receives the result of __ptr_ring_peek(r), which is NULL when there is
 * no entry at the consumer head, so @f must accept a NULL argument.
 */
#define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r)))

/* __PTR_RING_PEEK_CALL() run with consumer_lock held via spin_lock().
 * Evaluates to the value returned by @f; the result type is taken from
 * typeof((f)(NULL)).
 */
#define PTR_RING_PEEK_CALL(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* __PTR_RING_PEEK_CALL() run with consumer_lock held via spin_lock_irq().
 * Evaluates to the value returned by @f.
 */
#define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock_irq(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_irq(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* __PTR_RING_PEEK_CALL() run with consumer_lock held via spin_lock_bh().
 * Evaluates to the value returned by @f.
 */
#define PTR_RING_PEEK_CALL_BH(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock_bh(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_bh(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* __PTR_RING_PEEK_CALL() run with consumer_lock held via spin_lock_irqsave();
 * the flags saved in __PTR_RING_PEEK_CALL_f are restored on unlock.
 * Evaluates to the value returned by @f.
 */
#define PTR_RING_PEEK_CALL_ANY(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        unsigned long __PTR_RING_PEEK_CALL_f;\
        \
        spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \
        __PTR_RING_PEEK_CALL_v; \
})

/* Allocate a zeroed array of @size pointers for use as a ring queue.
 * Returns NULL when the array would exceed KMALLOC_MAX_SIZE bytes or when the
 * allocation fails.
 *
 * Not all gfp_t flags (besides GFP_KERNEL) are allowed. See
 * documentation for vmalloc for which of them are legal.
 */
static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp)
{
        if (size <= KMALLOC_MAX_SIZE / sizeof(void *))
                return kvmalloc_array_noprof(size, sizeof(void *),
                                             gfp | __GFP_ZERO);

        return NULL;
}

/* Record the ring size and derive the consume batch from it: two cache lines
 * worth of pointers, or 1 when that is zero or more than half of the ring.
 */
static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
{
        int batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue));

        /* We need to set batch at least to 1 to make logic
         * in __ptr_ring_discard_one work correctly.
         * Batching too much (because ring is small) would cause a lot of
         * burstiness. Needs tuning, for now disable batching.
         */
        if (!batch || batch > size / 2)
                batch = 1;

        r->size = size;
        r->batch = batch;
}

/* Set up @r as an empty ring of @size slots: allocate the queue, reset all
 * indices and initialize both locks. Returns 0, or -ENOMEM when the queue
 * could not be allocated.
 */
static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp)
{
        void **slots = __ptr_ring_init_queue_alloc_noprof(size, gfp);

        r->queue = slots;
        if (!slots)
                return -ENOMEM;

        __ptr_ring_set_size(r, size);
        r->consumer_tail = 0;
        r->consumer_head = 0;
        r->producer = 0;
        spin_lock_init(&r->producer_lock);
        spin_lock_init(&r->consumer_lock);

        return 0;
}
#define ptr_ring_init(...)        alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__))

/*
 * Return entries into ring. Destroy entries that don't fit.
 *
 * @batch holds @n entries. They are put back starting from batch[n - 1],
 * each one into the slot just before the current consumer_head, which is
 * moved back accordingly. Entries that are not put back (ring of size 0, or
 * the slot before the head is already occupied) are passed to @destroy.
 *
 * NOTE(review): @destroy is called unconditionally and with both locks held;
 * unlike __ptr_ring_swap_queue, a NULL @destroy is not tolerated here.
 *
 * Note: this is expected to be a rare slow path operation.
 *
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in interrupt or BH context, you must
 * disable interrupts/BH when doing so.
 */
static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
                                      void (*destroy)(void *))
{
        unsigned long flags;

        spin_lock_irqsave(&r->consumer_lock, flags);
        spin_lock(&r->producer_lock);

        /* Nothing can be stored in a zero-sized ring: destroy everything. */
        if (!r->size)
                goto done;

        /*
         * Clean out buffered entries (for simplicity). This way following code
         * can test entries for NULL and if not assume they are valid.
         */
        __ptr_ring_zero_tail(r, r->consumer_head);

        /*
         * Go over entries in batch, start moving head back and copy entries.
         * Stop when we run into previously unconsumed entries.
         */
        while (n) {
                int head = r->consumer_head - 1;
                /* Step back across the start of the ring. */
                if (head < 0)
                        head = r->size - 1;
                if (r->queue[head]) {
                        /* This batch entry will have to be destroyed. */
                        goto done;
                }
                r->queue[head] = batch[--n];
                r->consumer_tail = head;
                /* matching READ_ONCE in __ptr_ring_empty for lockless tests */
                WRITE_ONCE(r->consumer_head, head);
        }

done:
        /* Destroy all entries left in the batch. */
        while (n)
                destroy(batch[--n]);
        spin_unlock(&r->producer_lock);
        spin_unlock_irqrestore(&r->consumer_lock, flags);
}

/* Move every queued entry from the current queue into @queue (which has room
 * for @size entries), install @queue in @r and return the previous queue.
 * Entries that do not fit are passed to @destroy when it is non-NULL and are
 * otherwise dropped. @gfp is not used here.
 */
static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
                                           int size, gfp_t gfp,
                                           void (*destroy)(void *))
{
        void **previous;
        void *entry;
        int filled = 0;

        for (entry = __ptr_ring_consume(r); entry;
             entry = __ptr_ring_consume(r)) {
                if (filled < size) {
                        queue[filled++] = entry;
                        continue;
                }
                if (destroy)
                        destroy(entry);
        }

        __ptr_ring_set_size(r, size);
        /* A completely filled queue wraps the producer back to slot 0. */
        r->producer = (filled >= size) ? 0 : filled;
        r->consumer_head = 0;
        r->consumer_tail = 0;

        previous = r->queue;
        r->queue = queue;

        return previous;
}

/*
 * Replace the queue of @r with a newly allocated one of @size slots, carrying
 * over queued entries via __ptr_ring_swap_queue() and freeing the old queue.
 * Returns 0, or -ENOMEM when the new queue cannot be allocated (in which case
 * @r is left untouched).
 *
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in interrupt or BH context, you must
 * disable interrupts/BH when doing so.
 */
static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp,
                                  void (*destroy)(void *))
{
        unsigned long irq_flags;
        void **fresh;
        void **stale;

        fresh = __ptr_ring_init_queue_alloc_noprof(size, gfp);
        if (!fresh)
                return -ENOMEM;

        spin_lock_irqsave(&r->consumer_lock, irq_flags);
        spin_lock(&r->producer_lock);
        stale = __ptr_ring_swap_queue(r, fresh, size, gfp, destroy);
        spin_unlock(&r->producer_lock);
        spin_unlock_irqrestore(&r->consumer_lock, irq_flags);

        /* The old queue is released only after both locks are dropped. */
        kvfree(stale);

        return 0;
}
#define ptr_ring_resize(...)        alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__))

/*
 * Resize @nrings rings to @size slots each. All new queues are allocated
 * before any ring is modified, so on allocation failure every ring is left
 * untouched and -ENOMEM is returned; on success 0 is returned.
 *
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in BH context, you must
 * disable BH when doing so.
 */
static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings,
                                                     unsigned int nrings,
                                                     int size, gfp_t gfp,
                                                     void (*destroy)(void *))
{
        void ***queues;
        /* NOTE(review): signed index compared against the unsigned nrings. */
        int i;

        /* Temporary table: first the new queues, later the old ones. */
        queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp);
        if (!queues)
                goto noqueues;

        for (i = 0; i < nrings; ++i) {
                queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp);
                if (!queues[i])
                        goto nomem;
        }

        /* Swap each ring under its own locks; queues[i] becomes the old queue. */
        for (i = 0; i < nrings; ++i) {
                spin_lock_bh(&(rings[i])->consumer_lock);
                spin_lock(&(rings[i])->producer_lock);
                queues[i] = __ptr_ring_swap_queue(rings[i], queues[i],
                                                  size, gfp, destroy);
                spin_unlock(&(rings[i])->producer_lock);
                spin_unlock_bh(&(rings[i])->consumer_lock);
        }

        for (i = 0; i < nrings; ++i)
                kvfree(queues[i]);

        kfree(queues);

        return 0;

nomem:
        /* Free only the queues allocated before the failing index. */
        while (--i >= 0)
                kvfree(queues[i]);

        kfree(queues);

noqueues:
        return -ENOMEM;
}
#define ptr_ring_resize_multiple_bh(...) \
                alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__))

/* Free the queue of @r. When @destroy is non-NULL, every remaining entry is
 * first consumed with ptr_ring_consume() and handed to it; with a NULL
 * @destroy the remaining entries are not touched.
 */
static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
{
        if (destroy) {
                void *entry = ptr_ring_consume(r);

                while (entry) {
                        destroy(entry);
                        entry = ptr_ring_consume(r);
                }
        }

        kvfree(r->queue);
}

#endif /* _LINUX_PTR_RING_H  */





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* SPDX-License-Identifier: GPL-2.0 */
/* Freezer declarations */

#ifndef FREEZER_H_INCLUDED
#define FREEZER_H_INCLUDED

#include <linux/debug_locks.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>

#ifdef CONFIG_FREEZER
DECLARE_STATIC_KEY_FALSE(freezer_active);

extern bool pm_freezing;                /* PM freezing in effect */
extern bool pm_nosig_freezing;                /* PM nosig freezing in effect */

/*
 * Timeout for stopping processes
 */
extern unsigned int freeze_timeout_msecs;

/*
 * Check if a process has been frozen
 */
extern bool frozen(struct task_struct *p);

extern bool freezing_slow_path(struct task_struct *p);

/*
 * Check if there is a request to freeze a process.
 * freezing_slow_path() is only consulted while the freezer_active static key
 * is enabled; otherwise the answer is false.
 */
static inline bool freezing(struct task_struct *p)
{
        if (!static_branch_unlikely(&freezer_active))
                return false;

        return freezing_slow_path(p);
}

/* Takes and releases task alloc lock using task_lock() */
extern void __thaw_task(struct task_struct *t);

extern bool __refrigerator(bool check_kthr_stop);
extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);
extern void thaw_process(struct task_struct *p);

/*
 * Enter the refrigerator if a freeze of the current task is requested.
 * Returns false when no freeze is requested, otherwise the result of
 * __refrigerator(false). May sleep.
 */
static inline bool try_to_freeze(void)
{
        might_sleep();

        if (unlikely(freezing(current))) {
                /* Tasks flagged PF_NOFREEZE skip the held-locks debug check. */
                if (!(current->flags & PF_NOFREEZE))
                        debug_check_no_locks_held();
                return __refrigerator(false);
        }

        return false;
}

extern bool freeze_task(struct task_struct *p);
extern bool set_freezable(void);

#ifdef CONFIG_CGROUP_FREEZER
extern bool cgroup_freezing(struct task_struct *task);
#else /* !CONFIG_CGROUP_FREEZER */
/* Stub used when CONFIG_CGROUP_FREEZER is not set: always reports false. */
static inline bool cgroup_freezing(struct task_struct *task)
{
        return false;
}
#endif /* !CONFIG_CGROUP_FREEZER */

#else /* !CONFIG_FREEZER */
/*
 * Stubs for kernels built without CONFIG_FREEZER: the query helpers report
 * false, the thaw helpers do nothing and the freeze requests fail with
 * -ENOSYS.
 */
static inline bool frozen(struct task_struct *p)
{
        return false;
}

static inline bool freezing(struct task_struct *p)
{
        return false;
}

static inline void __thaw_task(struct task_struct *t)
{
}

static inline bool __refrigerator(bool check_kthr_stop)
{
        return false;
}

static inline int freeze_processes(void)
{
        return -ENOSYS;
}

static inline int freeze_kernel_threads(void)
{
        return -ENOSYS;
}

static inline void thaw_processes(void)
{
}

static inline void thaw_kernel_threads(void)
{
}

static inline void thaw_process(struct task_struct *p)
{
}

static inline bool try_to_freeze(void)
{
        return false;
}

/*
 * Stub for kernels built without CONFIG_FREEZER. Returns bool, like the
 * CONFIG_FREEZER declaration of set_freezable(), so callers that use the
 * result build in both configurations; the value is always false, matching
 * the try_to_freeze() stub.
 */
static inline bool set_freezable(void) { return false; }

#endif /* !CONFIG_FREEZER */

#endif        /* FREEZER_H_INCLUDED */












































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_SECTIONS_H_
#define _ASM_GENERIC_SECTIONS_H_

/* References to section boundaries */

#include <linux/compiler.h>
#include <linux/types.h>

/*
 * Usage guidelines:
 * _text, _data: architecture specific, don't use them in arch-independent code
 * [_stext, _etext]: contains .text.* sections, may also contain .rodata.*
 *                   and/or .init.* sections
 * [_sdata, _edata]: contains .data.* sections, may also contain .rodata.*
 *                   and/or .init.* sections.
 * [__start_rodata, __end_rodata]: contains .rodata.* sections
 * [__start_ro_after_init, __end_ro_after_init]:
 *                     contains .data..ro_after_init section
 * [__init_begin, __init_end]: contains .init.* sections, but .init.text.*
 *                   may be out of this range on some architectures.
 * [_sinittext, _einittext]: contains .init.text.* sections
 * [__bss_start, __bss_stop]: contains BSS sections
 *
 * Following global variables are optional and may be unavailable on some
 * architectures and/or kernel configurations.
 *        _text, _data
 *        __kprobes_text_start, __kprobes_text_end
 *        __entry_text_start, __entry_text_end
 *        __ctors_start, __ctors_end
 *        __irqentry_text_start, __irqentry_text_end
 *        __softirqentry_text_start, __softirqentry_text_end
 *        __start_opd, __end_opd
 */
extern char _text[], _stext[], _etext[];
extern char _data[], _sdata[], _edata[];
extern char __bss_start[], __bss_stop[];
extern char __init_begin[], __init_end[];
extern char _sinittext[], _einittext[];
extern char __start_ro_after_init[], __end_ro_after_init[];
extern char _end[];
extern char __per_cpu_start[], __per_cpu_end[];
extern char __kprobes_text_start[], __kprobes_text_end[];
extern char __entry_text_start[], __entry_text_end[];
extern char __start_rodata[], __end_rodata[];
extern char __irqentry_text_start[], __irqentry_text_end[];
extern char __softirqentry_text_start[], __softirqentry_text_end[];
extern char __start_once[], __end_once[];

/* Start and end of .ctors section - used for constructor calls. */
extern char __ctors_start[], __ctors_end[];

/* Start and end of .opd section - used for function descriptors. */
extern char __start_opd[], __end_opd[];

/* Start and end of instrumentation protected text section */
extern char __noinstr_text_start[], __noinstr_text_end[];

extern __visible const void __nosave_begin, __nosave_end;

/* Function descriptor handling (if any).  Override in asm/sections.h */
#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
/* Only declared here in this configuration; defined outside this header. */
void *dereference_function_descriptor(void *ptr);
void *dereference_kernel_function_descriptor(void *ptr);
#else
/* Without function descriptors both helpers just cast their argument. */
#define dereference_function_descriptor(p) ((void *)(p))
#define dereference_kernel_function_descriptor(p) ((void *)(p))

/* An address is simply the address of the function. */
typedef struct {
        unsigned long addr;
} func_desc_t;
#endif

/* Reports whether CONFIG_HAVE_FUNCTION_DESCRIPTORS is enabled for this build. */
static inline bool have_function_descriptors(void)
{
        return IS_ENABLED(CONFIG_HAVE_FUNCTION_DESCRIPTORS);
}

/**
 * memory_contains - checks if an object is contained within a memory region
 * @begin: virtual address of the beginning of the memory region
 * @end: virtual address of the end of the memory region
 * @virt: virtual address of the memory object
 * @size: size of the memory object
 *
 * Returns: true if the object starts at or after @begin and its last byte
 * lies before @end (that is, @virt + @size <= @end), false otherwise.
 */
static inline bool memory_contains(void *begin, void *end, void *virt,
                                   size_t size)
{
        /* Objects starting below the region can never be contained. */
        if (virt < begin)
                return false;

        return virt + size <= end;
}

/**
 * memory_intersects - checks if the region occupied by an object intersects
 *                     with another memory region
 * @begin: virtual address of the beginning of the memory region
 * @end: virtual address of the end of the memory region
 * @virt: virtual address of the memory object
 * @size: size of the memory object
 *
 * Returns: true if the object starts before @end and ends (at @virt + @size)
 * after @begin, false otherwise.
 */
static inline bool memory_intersects(void *begin, void *end, void *virt,
                                     size_t size)
{
        void *object_end = virt + size;

        return virt < end && object_end > begin;
}

/**
 * init_section_contains - test whether an object lies entirely inside the
 *                         init section
 * @virt: virtual address of the memory object
 * @size: size of the memory object
 *
 * Returns: true if the object described by @virt and @size is fully inside
 * the range __init_begin..__init_end, false otherwise.
 */
static inline bool init_section_contains(void *virt, size_t size)
{
        void *init_start = __init_begin;
        void *init_stop = __init_end;

        return memory_contains(init_start, init_stop, virt, size);
}

/**
 * init_section_intersects - test whether an object overlaps the init section
 * @virt: virtual address of the memory object
 * @size: size of the memory object
 *
 * Returns: true if the object described by @virt and @size overlaps the
 * range __init_begin..__init_end, false otherwise.
 */
static inline bool init_section_intersects(void *virt, size_t size)
{
        void *init_start = __init_begin;
        void *init_stop = __init_end;

        return memory_intersects(init_start, init_stop, virt, size);
}

/**
 * is_kernel_core_data - test whether an address lies in the core .data or
 *                       .bss section
 *
 * @addr: address to check
 *
 * Returns: true if @addr is inside [_sdata, _edata) or
 * [__bss_start, __bss_stop), false otherwise.
 * Note: On some archs it may return true for core RODATA, and false
 *       for others. But will always be true for core RW data.
 */
static inline bool is_kernel_core_data(unsigned long addr)
{
        const bool in_data = addr >= (unsigned long)_sdata &&
                             addr < (unsigned long)_edata;
        const bool in_bss = addr >= (unsigned long)__bss_start &&
                            addr < (unsigned long)__bss_stop;

        return in_data || in_bss;
}

/**
 * is_kernel_rodata - test whether an address lies in the .rodata section
 *
 * @addr: address to check
 *
 * Returns: true if @addr is inside [__start_rodata, __end_rodata), false
 * otherwise.
 */
static inline bool is_kernel_rodata(unsigned long addr)
{
        if (addr < (unsigned long)__start_rodata)
                return false;

        return addr < (unsigned long)__end_rodata;
}

/*
 * is_kernel_ro_after_init - test whether an address lies in the range
 * [__start_ro_after_init, __end_ro_after_init).
 */
static inline bool is_kernel_ro_after_init(unsigned long addr)
{
        const unsigned long start = (unsigned long)__start_ro_after_init;
        const unsigned long end = (unsigned long)__end_ro_after_init;

        return start <= addr && addr < end;
}
/**
 * is_kernel_inittext - test whether an address lies in the .init.text
 *                      section
 *
 * @addr: address to check
 *
 * Returns: true if @addr is inside [_sinittext, _einittext), false
 * otherwise.
 */
static inline bool is_kernel_inittext(unsigned long addr)
{
        const unsigned long start = (unsigned long)_sinittext;
        const unsigned long end = (unsigned long)_einittext;

        return start <= addr && addr < end;
}

/**
 * __is_kernel_text - test whether an address lies in the .text section
 *
 * @addr: address to check
 *
 * Returns: true if @addr is inside [_stext, _etext), false otherwise.
 * Note: an internal helper, only check the range of _stext to _etext.
 */
static inline bool __is_kernel_text(unsigned long addr)
{
        /* Outside the range when below the start or at/after the end. */
        return !(addr < (unsigned long)_stext ||
                 addr >= (unsigned long)_etext);
}

/**
 * __is_kernel - test whether an address lies in the kernel range
 *
 * @addr: address to check
 *
 * Returns: true if the address is located in the kernel range, false otherwise.
 * Note: an internal helper, check the range of _stext to _end,
 *       and range from __init_begin to __init_end, which can be outside
 *       of the _stext to _end range.
 */
static inline bool __is_kernel(unsigned long addr)
{
        /* Main kernel image: [_stext, _end). */
        if (addr >= (unsigned long)_stext && addr < (unsigned long)_end)
                return true;

        /* Init region, checked separately: [__init_begin, __init_end). */
        return addr >= (unsigned long)__init_begin &&
               addr < (unsigned long)__init_end;
}

#endif /* _ASM_GENERIC_SECTIONS_H_ */






















   42 













   42 



   42 













    2 


    2 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Generic Timer-queue
 *
 *  Manages a simple queue of timers, ordered by expiration time.
 *  Uses rbtrees for quick list adds and expiration.
 *
 *  NOTE: All of the following functions need to be serialized
 *  to avoid races. No locking is done by this library code.
 */

#include <linux/bug.h>
#include <linux/timerqueue.h>
#include <linux/rbtree.h>
#include <linux/export.h>

/*
 * Convert a pointer to the rb_node embedded in a struct timerqueue_node
 * (its 'node' member) back into a pointer to the containing
 * struct timerqueue_node.
 */
#define __node_2_tq(_n) \
        rb_entry((_n), struct timerqueue_node, node)

static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b)
{
        return __node_2_tq(a)->expires < __node_2_tq(b)->expires;
}

/**
 * timerqueue_add - Adds timer to timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be added
 *
 * Inserts @node into the queue at the position given by its expires
 * value. Returns true if the newly added timer is the first expiring
 * timer in the queue.
 */
bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
{
        struct rb_node *rb = &node->node;

        /* Warn once if the node is not in the cleared (unqueued) state. */
        WARN_ON_ONCE(!RB_EMPTY_NODE(rb));

        return rb_add_cached(rb, &head->rb_root, __timerqueue_less);
}
EXPORT_SYMBOL_GPL(timerqueue_add);

/**
 * timerqueue_del - Removes a timer from the timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be removed
 *
 * Erases @node from the queue and marks it as cleared again. Returns true
 * if the queue is not empty after the remove.
 */
bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
{
        struct rb_node *rb = &node->node;

        /* Warn once if the node is already in the cleared (unqueued) state. */
        WARN_ON_ONCE(RB_EMPTY_NODE(rb));

        rb_erase_cached(rb, &head->rb_root);
        /* Reset the node so a later timerqueue_add() sees it as unqueued. */
        RB_CLEAR_NODE(rb);

        return !RB_EMPTY_ROOT(&head->rb_root.rb_root);
}
EXPORT_SYMBOL_GPL(timerqueue_del);

/**
 * timerqueue_iterate_next - Returns the timer after the provided timer
 *
 * @node: Pointer to a timer.
 *
 * Returns the timer that follows @node in the queue, or NULL when @node is
 * NULL or has no successor. This is used, when necessary, to iterate
 * through the list of timers in a timer list without modifying the list.
 */
struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
{
        struct rb_node *next = node ? rb_next(&node->node) : NULL;

        if (next)
                return container_of(next, struct timerqueue_node, node);

        return NULL;
}
EXPORT_SYMBOL_GPL(timerqueue_iterate_next);










































































































































































































   17 






















    2 












































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Asynchronous Compression operations
 *
 * Copyright (c) 2016, Intel Corporation
 * Authors: Weigang Li <weigang.li@intel.com>
 *          Giovanni Cabiddu <giovanni.cabiddu@intel.com>
 */
#ifndef _CRYPTO_ACOMP_H
#define _CRYPTO_ACOMP_H

#include <linux/atomic.h>
#include <linux/args.h>
#include <linux/compiler_types.h>
#include <linux/container_of.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/spinlock_types.h>
#include <linux/types.h>

/* Set this bit if source is virtual address instead of SG list. */
#define CRYPTO_ACOMP_REQ_SRC_VIRT        0x00000002

/* Set this bit if the virtual address source cannot be used for DMA. */
#define CRYPTO_ACOMP_REQ_SRC_NONDMA        0x00000004

/* Set this bit if destination is virtual address instead of SG list. */
#define CRYPTO_ACOMP_REQ_DST_VIRT        0x00000008

/* Set this bit if the virtual address destination cannot be used for DMA. */
#define CRYPTO_ACOMP_REQ_DST_NONDMA        0x00000010

/* Private flags that should not be touched by the user. */
#define CRYPTO_ACOMP_REQ_PRIVATE \
        (CRYPTO_ACOMP_REQ_SRC_VIRT | CRYPTO_ACOMP_REQ_SRC_NONDMA | \
         CRYPTO_ACOMP_REQ_DST_VIRT | CRYPTO_ACOMP_REQ_DST_NONDMA)

/*
 * NOTE(review): presumably the maximum destination buffer size in bytes
 * (131072 = 128 KiB) - confirm against the users of this constant.
 */
#define CRYPTO_ACOMP_DST_MAX                131072

/* Extra bytes reserved after struct acomp_req in an on-stack request buffer. */
#define        MAX_SYNC_COMP_REQSIZE                0

/*
 * Declare an on-stack request: a CRYPTO_MINALIGN_ATTR-aligned char buffer
 * named __<name>_req, plus a struct acomp_req pointer @name initialised to
 * that buffer by acomp_request_on_stack_init() for @tfm.
 */
#define ACOMP_REQUEST_ON_STACK(name, tfm) \
        char __##name##_req[sizeof(struct acomp_req) + \
                            MAX_SYNC_COMP_REQSIZE] CRYPTO_MINALIGN_ATTR; \
        struct acomp_req *name = acomp_request_on_stack_init( \
                __##name##_req, (tfm))

/*
 * Call acomp_request_clone() on @name, passing the size of the __<name>_req
 * buffer; this only works for a request declared with
 * ACOMP_REQUEST_ON_STACK() in the same scope.
 */
#define ACOMP_REQUEST_CLONE(name, gfp) \
        acomp_request_clone(name, sizeof(__##name##_req), gfp)

struct acomp_req;
struct folio;

/*
 * Private per-request state embedded in struct acomp_req as its 'chain'
 * member; not for use by API users.
 */
struct acomp_req_chain {
        /* NOTE(review): presumably the saved completion callback - confirm */
        crypto_completion_t compl;
        /* NOTE(review): presumably the saved callback argument - confirm */
        void *data;
        /* Single-entry scatterlist set up by acomp_request_set_src_folio() */
        struct scatterlist ssg;
        /* Single-entry scatterlist set up by acomp_request_set_dst_folio() */
        struct scatterlist dsg;
        /* Source, as either a byte pointer or a folio (shared storage) */
        union {
                const u8 *src;
                struct folio *sfolio;
        };
        /* Destination, as either a byte pointer or a folio (shared storage) */
        union {
                u8 *dst;
                struct folio *dfolio;
        };
        u32 flags;
};

/**
 * struct acomp_req - asynchronous (de)compression request
 *
 * @base:        Common attributes for asynchronous crypto requests
 * @src:        Source scatterlist
 * @dst:        Destination scatterlist
 * @svirt:        Source virtual address
 * @dvirt:        Destination virtual address
 * @slen:        Size of the input buffer
 * @dlen:        Size of the output buffer and number of bytes produced
 * @chain:        Private API code data, do not use
 * @__ctx:        Start of private context data
 *
 * @src and @svirt share storage, as do @dst and @dvirt. The request setters
 * record which form is in use through CRYPTO_ACOMP_REQ_SRC_VIRT and
 * CRYPTO_ACOMP_REQ_DST_VIRT in @base.flags.
 */
struct acomp_req {
        struct crypto_async_request base;
        /* Source: scatterlist, or virtual address when SRC_VIRT is set */
        union {
                struct scatterlist *src;
                const u8 *svirt;
        };
        /* Destination: scatterlist, or virtual address when DST_VIRT is set */
        union {
                struct scatterlist *dst;
                u8 *dvirt;
        };
        unsigned int slen;
        unsigned int dlen;

        struct acomp_req_chain chain;

        void *__ctx[] CRYPTO_MINALIGN_ATTR;
};

/**
 * struct crypto_acomp - user-instantiated objects which encapsulate
 * algorithms and core processing logic
 *
 * @compress:                Function performs a compress operation
 * @decompress:                Function performs a de-compress operation
 * @reqsize:                Context size for (de)compression requests
 * @base:                Common crypto API algorithm data structure
 */
struct crypto_acomp {
        int (*compress)(struct acomp_req *req);
        int (*decompress)(struct acomp_req *req);
        unsigned int reqsize;
        struct crypto_tfm base;
};

/*
 * Body of struct comp_alg_common, kept in a macro: a single
 * struct crypto_alg member named 'base'.
 * NOTE(review): presumably a macro so other algorithm structs can embed the
 * same layout - confirm against the users of COMP_ALG_COMMON.
 */
#define COMP_ALG_COMMON {                        \
        struct crypto_alg base;                        \
}
struct comp_alg_common COMP_ALG_COMMON;

/**
 * DOC: Asynchronous Compression API
 *
 * The Asynchronous Compression API is used with the algorithms of type
 * CRYPTO_ALG_TYPE_ACOMPRESS (listed as type "acomp" in /proc/crypto)
 */

/**
 * crypto_alloc_acomp() -- allocate ACOMPRESS tfm handle
 * @alg_name:        is the cra_name / name or cra_driver_name / driver name of the
 *                compression algorithm e.g. "deflate"
 * @type:        specifies the type of the algorithm
 * @mask:        specifies the mask for the algorithm
 *
 * Allocate a handle for a compression algorithm. The returned struct
 * crypto_acomp is the handle that is required for any subsequent
 * API invocation for the compression operations.
 *
 * Return:        allocated handle in case of success; IS_ERR() is true in case
 *                of an error, PTR_ERR() returns the error code.
 */
struct crypto_acomp *crypto_alloc_acomp(const char *alg_name, u32 type,
                                        u32 mask);
/**
 * crypto_alloc_acomp_node() -- allocate ACOMPRESS tfm handle with desired NUMA node
 * @alg_name:        is the cra_name / name or cra_driver_name / driver name of the
 *                compression algorithm e.g. "deflate"
 * @type:        specifies the type of the algorithm
 * @mask:        specifies the mask for the algorithm
 * @node:        specifies the NUMA node the ZIP hardware belongs to
 *
 * Allocate a handle for a compression algorithm. Drivers should try to use
 * (de)compressors on the specified NUMA node.
 * The returned struct crypto_acomp is the handle that is required for any
 * subsequent API invocation for the compression operations.
 *
 * Return:        allocated handle in case of success; IS_ERR() is true in case
 *                of an error, PTR_ERR() returns the error code.
 */
struct crypto_acomp *crypto_alloc_acomp_node(const char *alg_name, u32 type,
                                        u32 mask, int node);

/* Return the generic crypto_tfm embedded in an acomp handle. */
static inline struct crypto_tfm *crypto_acomp_tfm(struct crypto_acomp *tfm)
{
        struct crypto_tfm *base = &tfm->base;

        return base;
}

/* Map a crypto_alg back to the comp_alg_common that embeds it as 'base'. */
static inline struct comp_alg_common *__crypto_comp_alg_common(
        struct crypto_alg *alg)
{
        struct comp_alg_common *common;

        common = container_of(alg, struct comp_alg_common, base);
        return common;
}

/* Map a generic crypto_tfm back to the crypto_acomp that embeds it as 'base'. */
static inline struct crypto_acomp *__crypto_acomp_tfm(struct crypto_tfm *tfm)
{
        struct crypto_acomp *acomp;

        acomp = container_of(tfm, struct crypto_acomp, base);
        return acomp;
}

static inline struct comp_alg_common *crypto_comp_alg_common(
        struct crypto_acomp *tfm)
{
        return __crypto_comp_alg_common(crypto_acomp_tfm(tfm)->__crt_alg);
}

/* Return the request context size recorded in the handle (tfm->reqsize). */
static inline unsigned int crypto_acomp_reqsize(struct crypto_acomp *tfm)
{
        unsigned int ctx_size = tfm->reqsize;

        return ctx_size;
}

/* Bind @req to @tfm via the generic crypto_request_set_tfm() helper. */
static inline void acomp_request_set_tfm(struct acomp_req *req,
                                         struct crypto_acomp *tfm)
{
        struct crypto_tfm *base = crypto_acomp_tfm(tfm);

        crypto_request_set_tfm(&req->base, base);
}

static inline bool acomp_is_async(struct crypto_acomp *tfm)
{
        return crypto_comp_alg_common(tfm)->base.cra_flags &
               CRYPTO_ALG_ASYNC;
}

/* Return the acomp handle that @req is currently bound to. */
static inline struct crypto_acomp *crypto_acomp_reqtfm(struct acomp_req *req)
{
        struct crypto_tfm *base = req->base.tfm;

        return __crypto_acomp_tfm(base);
}

/**
 * crypto_free_acomp() -- free ACOMPRESS tfm handle
 *
 * @tfm:        ACOMPRESS tfm handle allocated with crypto_alloc_acomp()
 *
 * Hands the handle and its embedded crypto_tfm to crypto_destroy_tfm().
 * If @tfm is a NULL or error pointer, this function does nothing.
 */
static inline void crypto_free_acomp(struct crypto_acomp *tfm)
{
        struct crypto_tfm *base = crypto_acomp_tfm(tfm);

        crypto_destroy_tfm(tfm, base);
}

/*
 * Query crypto_has_alg() for @alg_name, with the type bits of @type forced
 * to CRYPTO_ALG_TYPE_ACOMPRESS and CRYPTO_ALG_TYPE_ACOMPRESS_MASK added to
 * @mask. Returns whatever crypto_has_alg() returns.
 */
static inline int crypto_has_acomp(const char *alg_name, u32 type, u32 mask)
{
        u32 alg_type = (type & ~CRYPTO_ALG_TYPE_MASK) |
                       CRYPTO_ALG_TYPE_ACOMPRESS;
        u32 alg_mask = mask | CRYPTO_ALG_TYPE_ACOMPRESS_MASK;

        return crypto_has_alg(alg_name, alg_type, alg_mask);
}

/* Return the algorithm name reported by crypto_tfm_alg_name() for @tfm. */
static inline const char *crypto_acomp_alg_name(struct crypto_acomp *tfm)
{
        struct crypto_tfm *base = crypto_acomp_tfm(tfm);

        return crypto_tfm_alg_name(base);
}

/* Return the driver name reported by crypto_tfm_alg_driver_name() for @tfm. */
static inline const char *crypto_acomp_driver_name(struct crypto_acomp *tfm)
{
        struct crypto_tfm *base = crypto_acomp_tfm(tfm);

        return crypto_tfm_alg_driver_name(base);
}

/**
 * acomp_request_alloc() -- allocates asynchronous (de)compression request
 *
 * @tfm:        ACOMPRESS tfm handle allocated with crypto_alloc_acomp()
 * @gfp:        gfp to pass to kzalloc (defaults to GFP_KERNEL)
 *
 * Return:        allocated handle in case of success or NULL in case of an error
 */
static inline struct acomp_req *acomp_request_alloc_extra_noprof(
        struct crypto_acomp *tfm, size_t extra, gfp_t gfp)
{
        /* Request header plus tfm context, rounded up to CRYPTO_MINALIGN. */
        size_t base_len = ALIGN(sizeof(struct acomp_req) +
                                crypto_acomp_reqsize(tfm), CRYPTO_MINALIGN);
        struct acomp_req *req;
        size_t total;

        /* Refuse sizes where adding the caller's extra bytes would wrap. */
        if (check_add_overflow(base_len, extra, &total))
                return NULL;

        /* Zeroed allocation; bind it to @tfm only when it succeeded. */
        req = kzalloc_noprof(total, gfp);
        if (likely(req))
                acomp_request_set_tfm(req, tfm);

        return req;
}
/*
 * acomp_request_alloc_noprof() takes the tfm plus an optional gfp argument.
 * COUNT_ARGS() on the optional arguments selects the _0 variant (no gfp
 * given, GFP_KERNEL is used) or the _1 variant (caller-supplied gfp); both
 * end up in acomp_request_alloc_extra_noprof() with zero extra bytes.
 */
#define acomp_request_alloc_noprof(tfm, ...) \
        CONCATENATE(acomp_request_alloc_noprof_, COUNT_ARGS(__VA_ARGS__))( \
                tfm, ##__VA_ARGS__)
#define acomp_request_alloc_noprof_0(tfm) \
        acomp_request_alloc_noprof_1(tfm, GFP_KERNEL)
#define acomp_request_alloc_noprof_1(tfm, gfp) \
        acomp_request_alloc_extra_noprof(tfm, 0, gfp)
/* Public entry point: the _noprof variant wrapped in alloc_hooks(). */
#define acomp_request_alloc(...)        alloc_hooks(acomp_request_alloc_noprof(__VA_ARGS__))

/**
 * acomp_request_alloc_extra() -- allocate acomp request with extra memory
 *
 * @tfm:        ACOMPRESS tfm handle allocated with crypto_alloc_acomp()
 * @extra:        amount of extra memory
 * @gfp:        gfp to pass to kzalloc
 *
 * Return:        allocated handle in case of success or NULL in case of an error
 */
#define acomp_request_alloc_extra(...)        alloc_hooks(acomp_request_alloc_extra_noprof(__VA_ARGS__))

/*
 * Return the address just past the request header and tfm context, rounded
 * up to CRYPTO_MINALIGN. This is the same offset at which
 * acomp_request_alloc_extra_noprof() places the caller's extra bytes.
 */
static inline void *acomp_request_extra(struct acomp_req *req)
{
        size_t hdr = sizeof(*req) +
                     crypto_acomp_reqsize(crypto_acomp_reqtfm(req));

        return (char *)req + ALIGN(hdr, CRYPTO_MINALIGN);
}

/* Report whether @req is an on-stack request, per crypto_req_on_stack(). */
static inline bool acomp_req_on_stack(struct acomp_req *req)
{
        struct crypto_async_request *base = &req->base;

        return crypto_req_on_stack(base);
}

/**
 * acomp_request_free() -- zeroize and free asynchronous (de)compression
 *                           request
 *
 * @req:        request to free
 *
 * Does nothing when @req is NULL or is an on-stack request; otherwise the
 * request is released with kfree_sensitive().
 */
static inline void acomp_request_free(struct acomp_req *req)
{
        if (req && !acomp_req_on_stack(req))
                kfree_sensitive(req);
}

/**
 * acomp_request_set_callback() -- Sets an asynchronous callback
 *
 * Callback will be called when an asynchronous operation on a given
 * request is finished.
 *
 * The CRYPTO_ACOMP_REQ_PRIVATE bits of @flgs are ignored; the private bits
 * already stored in the request are carried over unchanged.
 *
 * @req:        request that the callback will be set for
 * @flgs:        specify for instance if the operation may backlog
 * @cmpl:        callback which will be called
 * @data:        private data used by the caller
 */
static inline void acomp_request_set_callback(struct acomp_req *req,
                                              u32 flgs,
                                              crypto_completion_t cmpl,
                                              void *data)
{
        u32 kept_private = req->base.flags & CRYPTO_ACOMP_REQ_PRIVATE;
        u32 user_flags = flgs & ~CRYPTO_ACOMP_REQ_PRIVATE;

        crypto_request_set_callback(&req->base, user_flags | kept_private,
                                    cmpl, data);
}

/**
 * acomp_request_set_params() -- Sets request parameters
 *
 * Sets parameters required by an acomp operation
 *
 * @req:        asynchronous compress request
 * @src:        pointer to input buffer scatterlist
 * @dst:        pointer to output buffer scatterlist. If this is NULL, the
 *                acomp layer will allocate the output memory
 * @slen:        size of the input buffer
 * @dlen:        size of the output buffer. If dst is NULL, this can be used by
 *                the user to specify the maximum amount of memory to allocate
 */
static inline void acomp_request_set_params(struct acomp_req *req,
                                            struct scatterlist *src,
                                            struct scatterlist *dst,
                                            unsigned int slen,
                                            unsigned int dlen)
{
        req->src = src;
        req->dst = dst;
        req->slen = slen;
        req->dlen = dlen;

        req->base.flags &= ~(CRYPTO_ACOMP_REQ_SRC_VIRT |
                             CRYPTO_ACOMP_REQ_SRC_NONDMA |
                             CRYPTO_ACOMP_REQ_DST_VIRT |
                             CRYPTO_ACOMP_REQ_DST_NONDMA);
}

/**
 * acomp_request_set_src_sg() -- Sets source scatterlist
 *
 * Sets source scatterlist required by an acomp operation and clears the
 * source virtual-address and non-DMA flags.
 *
 * @req:        asynchronous compress request
 * @src:        pointer to input buffer scatterlist
 * @slen:        size of the input buffer
 */
static inline void acomp_request_set_src_sg(struct acomp_req *req,
                                            struct scatterlist *src,
                                            unsigned int slen)
{
        req->base.flags &= ~(CRYPTO_ACOMP_REQ_SRC_NONDMA |
                             CRYPTO_ACOMP_REQ_SRC_VIRT);

        req->slen = slen;
        req->src = src;
}

/**
 * acomp_request_set_src_dma() -- Sets DMA source virtual address
 *
 * Sets source virtual address required by an acomp operation.
 * The address must be usable for DMA. Marks the source as a virtual
 * address and clears the source non-DMA flag.
 *
 * @req:        asynchronous compress request
 * @src:        virtual address pointer to input buffer
 * @slen:        size of the input buffer
 */
static inline void acomp_request_set_src_dma(struct acomp_req *req,
                                             const u8 *src, unsigned int slen)
{
        req->base.flags = (req->base.flags & ~CRYPTO_ACOMP_REQ_SRC_NONDMA) |
                          CRYPTO_ACOMP_REQ_SRC_VIRT;

        req->slen = slen;
        req->svirt = src;
}

/**
 * acomp_request_set_src_nondma() -- Sets non-DMA source virtual address
 *
 * Sets source virtual address required by an acomp operation.
 * The address can not be used for DMA. Sets both the source
 * virtual-address and source non-DMA flags.
 *
 * @req:        asynchronous compress request
 * @src:        virtual address pointer to input buffer
 * @slen:        size of the input buffer
 */
static inline void acomp_request_set_src_nondma(struct acomp_req *req,
                                                const u8 *src,
                                                unsigned int slen)
{
        req->base.flags |= CRYPTO_ACOMP_REQ_SRC_NONDMA |
                           CRYPTO_ACOMP_REQ_SRC_VIRT;

        req->slen = slen;
        req->svirt = src;
}

/**
 * acomp_request_set_src_folio() -- Sets source folio
 *
 * Sets source folio required by an acomp operation. The folio is wrapped
 * in the request's private single-entry source scatterlist, which then
 * becomes the request source.
 *
 * @req:        asynchronous compress request
 * @folio:        pointer to input folio
 * @off:        input folio offset
 * @len:        size of the input buffer
 */
static inline void acomp_request_set_src_folio(struct acomp_req *req,
                                               struct folio *folio, size_t off,
                                               unsigned int len)
{
        struct scatterlist *sg = &req->chain.ssg;

        sg_init_table(sg, 1);
        sg_set_folio(sg, folio, len, off);
        acomp_request_set_src_sg(req, sg, len);
}

/**
 * acomp_request_set_dst_sg() -- Sets destination scatterlist
 *
 * Sets destination scatterlist required by an acomp operation and clears
 * the destination virtual-address and non-DMA flags.
 *
 * @req:        asynchronous compress request
 * @dst:        pointer to output buffer scatterlist
 * @dlen:        size of the output buffer
 */
static inline void acomp_request_set_dst_sg(struct acomp_req *req,
                                            struct scatterlist *dst,
                                            unsigned int dlen)
{
        req->base.flags &= ~(CRYPTO_ACOMP_REQ_DST_NONDMA |
                             CRYPTO_ACOMP_REQ_DST_VIRT);

        req->dlen = dlen;
        req->dst = dst;
}

/**
 * acomp_request_set_dst_dma() -- Sets DMA destination virtual address
 *
 * Sets destination virtual address required by an acomp operation.
 * The address must be usable for DMA. Marks the destination as a virtual
 * address and clears the destination non-DMA flag.
 *
 * @req:        asynchronous compress request
 * @dst:        virtual address pointer to output buffer
 * @dlen:        size of the output buffer
 */
static inline void acomp_request_set_dst_dma(struct acomp_req *req,
                                             u8 *dst, unsigned int dlen)
{
        req->base.flags = (req->base.flags & ~CRYPTO_ACOMP_REQ_DST_NONDMA) |
                          CRYPTO_ACOMP_REQ_DST_VIRT;

        req->dlen = dlen;
        req->dvirt = dst;
}

/**
 * acomp_request_set_dst_nondma() -- Sets non-DMA destination virtual address
 *
 * Sets destination virtual address required by an acomp operation.
 * The address can not be used for DMA. Sets both the destination
 * virtual-address and destination non-DMA flags.
 *
 * @req:        asynchronous compress request
 * @dst:        virtual address pointer to output buffer
 * @dlen:        size of the output buffer
 */
static inline void acomp_request_set_dst_nondma(struct acomp_req *req,
                                                u8 *dst, unsigned int dlen)
{
        req->base.flags |= CRYPTO_ACOMP_REQ_DST_NONDMA |
                           CRYPTO_ACOMP_REQ_DST_VIRT;

        req->dlen = dlen;
        req->dvirt = dst;
}

/**
 * acomp_request_set_dst_folio() -- Sets destination folio
 *
 * Sets destination folio required by an acomp operation. The folio is
 * wrapped in the request's private single-entry destination scatterlist,
 * which then becomes the request destination.
 *
 * @req:        asynchronous compress request
 * @folio:        pointer to output folio
 * @off:        output folio offset
 * @len:        size of the output buffer
 */
static inline void acomp_request_set_dst_folio(struct acomp_req *req,
                                               struct folio *folio, size_t off,
                                               unsigned int len)
{
        struct scatterlist *sg = &req->chain.dsg;

        sg_init_table(sg, 1);
        sg_set_folio(sg, folio, len, off);
        acomp_request_set_dst_sg(req, sg, len);
}

/**
 * crypto_acomp_compress() -- Invoke asynchronous compress operation
 *
 * Function invokes the asynchronous compress operation
 *
 * @req:        asynchronous compress request
 *
 * Return:        zero on success; error code in case of error
 */
int crypto_acomp_compress(struct acomp_req *req);

/**
 * crypto_acomp_decompress() -- Invoke asynchronous decompress operation
 *
 * Function invokes the asynchronous decompress operation
 *
 * @req:        asynchronous decompress request
 *
 * Return:        zero on success; error code in case of error
 */
int crypto_acomp_decompress(struct acomp_req *req);

/*
 * Treat @buf as a struct acomp_req, initialise its base for @tfm with
 * crypto_stack_request_init() and return it. Used by
 * ACOMP_REQUEST_ON_STACK(), which supplies the buffer.
 */
static inline struct acomp_req *acomp_request_on_stack_init(
        char *buf, struct crypto_acomp *tfm)
{
        struct crypto_tfm *base = crypto_acomp_tfm(tfm);
        struct acomp_req *req = (void *)buf;

        crypto_stack_request_init(&req->base, base);

        return req;
}

struct acomp_req *acomp_request_clone(struct acomp_req *req,
                                      size_t total, gfp_t gfp);

#endif






















































































    1 









    1 






































































































































































































































































































    1 








    1 







    1 









    1 




















































































































































    1 




    1 





    1 




















































































    1 










    1 









    1 













































































































































    1 









    1 

    1 







































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * CCM: Counter with CBC-MAC
 *
 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
 */

#include <crypto/internal/aead.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <crypto/utils.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>

/*
 * Instance context of a CCM template instance: the two spawns grabbed in
 * crypto_ccm_create_common() and dropped again in crypto_ccm_free().
 */
struct ccm_instance_ctx {
        struct crypto_skcipher_spawn ctr;       /* spawn of the "ctr(...)" skcipher */
        struct crypto_ahash_spawn mac;          /* spawn of the "cbcmac(...)" hash */
};

/*
 * Transform context of a CCM AEAD: the two child transforms instantiated in
 * crypto_ccm_init_tfm() and released in crypto_ccm_exit_tfm().
 */
struct crypto_ccm_ctx {
        struct crypto_ahash *mac;       /* hash used to compute the MAC */
        struct crypto_skcipher *ctr;    /* skcipher used to en/decrypt tag and payload */
};

/* Transform context of the rfc4309 wrapper. */
struct crypto_rfc4309_ctx {
        struct crypto_aead *child;      /* wrapped AEAD all requests are forwarded to */
        u8 nonce[3];                    /* last three key bytes, saved by crypto_rfc4309_setkey() */
};

/*
 * Per-request context of the rfc4309 wrapper.  crypto_rfc4309_crypt() also
 * uses the memory following @subreq (the child's request context, then the
 * IV and a copy of the associated data).
 */
struct crypto_rfc4309_req_ctx {
        struct scatterlist src[3];      /* rebuilt source list handed to the child */
        struct scatterlist dst[3];      /* rebuilt destination list, used when src != dst */
        struct aead_request subreq;     /* request submitted to the child AEAD */
};

/*
 * Per-request private state of a CCM request, located through
 * crypto_ccm_reqctx().
 */
struct crypto_ccm_req_priv_ctx {
        u8 odata[16];           /* formatted first MAC block, then the MAC result */
        u8 idata[16];           /* encoded assoc length / zero padding; IV copy on decrypt */
        u8 auth_tag[16];        /* tag buffer used on the decrypt path */
        u32 flags;              /* request flags passed on to the sub-requests */
        struct scatterlist src[3];      /* tag buffer followed by req->src past the assoc data */
        struct scatterlist dst[3];      /* same for req->dst when it differs from req->src */
        /* The hash and skcipher sub-requests share the same storage. */
        union {
                struct ahash_request ahreq;
                struct skcipher_request skreq;
        };
};

/* Transform context of the cbcmac hash: the underlying single-block cipher. */
struct cbcmac_tfm_ctx {
        struct crypto_cipher *child;
};

static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx(
        struct aead_request *req)
{
        unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req));

        return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1);
}

/*
 * Encode @msglen as a big-endian integer into the @csize-byte field at @block.
 *
 * The field is cleared first; at most the four least significant bytes of the
 * field are then written, so for @csize > 4 the leading bytes stay zero.
 *
 * Returns 0 on success, or -EOVERFLOW when @csize < 4 and @msglen does not
 * fit into @csize bytes (i.e. @msglen >= 2^(8 * @csize)).
 */
static int set_msg_len(u8 *block, unsigned int msglen, int csize)
{
        __be32 data;

        memset(block, 0, csize);
        block += csize;

        if (csize >= 4)
                csize = 4;
        /*
         * The largest value a csize-byte field can hold is 2^(8*csize) - 1,
         * so a length equal to 2^(8*csize) must be rejected as well;
         * otherwise it would be silently encoded as zero.
         */
        else if (msglen >= (1U << (8 * csize)))
                return -EOVERFLOW;

        data = cpu_to_be32(msglen);
        /* Copy only the csize low-order bytes of the big-endian value. */
        memcpy(block - csize, (u8 *)&data + 4 - csize, csize);

        return 0;
}

/*
 * Set @key on both child transforms.  The parent's request flags are
 * mirrored onto each child before its key is set.  The skcipher is keyed
 * first; if that fails the hash is left untouched and the error returned.
 */
static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key,
                             unsigned int keylen)
{
        struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
        int ret;

        crypto_skcipher_clear_flags(ctx->ctr, CRYPTO_TFM_REQ_MASK);
        crypto_skcipher_set_flags(ctx->ctr,
                                  crypto_aead_get_flags(aead) &
                                  CRYPTO_TFM_REQ_MASK);
        ret = crypto_skcipher_setkey(ctx->ctr, key, keylen);
        if (ret != 0)
                return ret;

        crypto_ahash_clear_flags(ctx->mac, CRYPTO_TFM_REQ_MASK);
        crypto_ahash_set_flags(ctx->mac,
                               crypto_aead_get_flags(aead) &
                               CRYPTO_TFM_REQ_MASK);

        return crypto_ahash_setkey(ctx->mac, key, keylen);
}

/*
 * Validate a requested tag length: only the even sizes from 4 to 16 bytes
 * are accepted.  Returns 0 if @authsize is valid, -EINVAL otherwise.
 */
static int crypto_ccm_setauthsize(struct crypto_aead *tfm,
                                  unsigned int authsize)
{
        if (authsize < 4 || authsize > 16)
                return -EINVAL;

        /* Odd tag lengths are not allowed. */
        if (authsize & 1)
                return -EINVAL;

        return 0;
}

/*
 * Build the first MAC input block in @info from the request IV.
 *
 * The 16-byte IV is copied, the flags byte gets the encoded tag length and
 * the "associated data present" bit, and @cryptlen is written into the
 * trailing length field (iv[0] + 1 bytes wide).
 *
 * Returns the result of set_msg_len().
 */
static int format_input(u8 *info, struct aead_request *req,
                        unsigned int cryptlen)
{
        const unsigned int len_field = req->iv[0] + 1u;
        const unsigned int tag_len =
                crypto_aead_authsize(crypto_aead_reqtfm(req));
        u8 flag_bits;

        memcpy(info, req->iv, 16);

        /* Flags byte layout per RFC 3610 and NIST SP 800-38C. */
        flag_bits = 8 * ((tag_len - 2) / 2);
        if (req->assoclen)
                flag_bits |= 64;
        info[0] |= flag_bits;

        return set_msg_len(info + 16 - len_field, cryptlen, len_field);
}

/*
 * Write the encoded associated-data length @a to @adata
 * (RFC 3610 and NIST Special Publication 800-38C).
 *
 * Lengths below 0xff00 (65280) are stored as a 2-byte big-endian value;
 * larger ones as the marker 0xfffe followed by a 4-byte big-endian value.
 *
 * The bytes are written with memcpy() rather than through casted pointers:
 * @adata is a plain byte buffer and the 32-bit field at offset 2 is not
 * naturally aligned.
 *
 * Returns the number of bytes written (2 or 6).
 */
static int format_adata(u8 *adata, unsigned int a)
{
        __be16 prefix;
        __be32 wide;

        if (a < 65280) {
                prefix = cpu_to_be16(a);
                memcpy(adata, &prefix, sizeof(prefix));
                return 2;
        }

        prefix = cpu_to_be16(0xfffe);
        wide = cpu_to_be32(a);
        memcpy(adata, &prefix, sizeof(prefix));
        memcpy(adata + 2, &wide, sizeof(wide));

        return 6;
}

/*
 * Compute the MAC of a CCM request; the result is written to pctx->odata.
 *
 * @req:      AEAD request (supplies IV, associated data and tag size)
 * @plain:    scatterlist with the data to authenticate after the associated
 *            data; may be NULL
 * @cryptlen: number of bytes to authenticate from @plain
 *
 * The hash is fed in two steps: first the formatted header block, the
 * encoded associated-data length and the associated data itself (taken from
 * req->src), then optional zero padding followed by @plain.
 *
 * Returns 0 on success, or the error from format_input() or the hash calls.
 */
static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain,
                           unsigned int cryptlen)
{
        struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
        struct ahash_request *ahreq = &pctx->ahreq;
        unsigned int assoclen = req->assoclen;
        struct scatterlist sg[3];
        u8 *odata = pctx->odata;
        u8 *idata = pctx->idata;
        int ilen, err;

        /* format control data for input */
        err = format_input(odata, req, cryptlen);
        if (err)
                goto out;

        sg_init_table(sg, 3);
        sg_set_buf(&sg[0], odata, 16);

        /* format associated data and compute into mac */
        if (assoclen) {
                ilen = format_adata(idata, assoclen);
                sg_set_buf(&sg[1], idata, ilen);
                sg_chain(sg, 3, req->src);
        } else {
                /* No associated data: only the header block is hashed here. */
                ilen = 0;
                sg_chain(sg, 2, req->src);
        }

        ahash_request_set_tfm(ahreq, ctx->mac);
        ahash_request_set_callback(ahreq, pctx->flags, NULL, NULL);
        /* First pass covers header (16) + length encoding + associated data. */
        ahash_request_set_crypt(ahreq, sg, NULL, assoclen + ilen + 16);
        err = crypto_ahash_init(ahreq);
        if (err)
                goto out;
        err = crypto_ahash_update(ahreq);
        if (err)
                goto out;

        /* we need to pad the MAC input to a round multiple of the block size */
        ilen = 16 - (assoclen + ilen) % 16;
        /* ilen == 16 means the data was already block aligned: no padding. */
        if (ilen < 16) {
                /* idata is reused as the zero padding prepended to @plain. */
                memset(idata, 0, ilen);
                sg_init_table(sg, 2);
                sg_set_buf(&sg[0], idata, ilen);
                if (plain)
                        sg_chain(sg, 2, plain);
                plain = sg;
                cryptlen += ilen;
        }

        /* Second pass: padding plus payload; the digest lands in odata. */
        ahash_request_set_crypt(ahreq, plain, odata, cryptlen);
        err = crypto_ahash_finup(ahreq);
out:
        return err;
}

/*
 * Completion callback of the encryption skcipher request.  On success the
 * tag held in pctx->odata is copied to req->dst right behind the associated
 * data and payload; the AEAD request is then completed with @err.
 */
static void crypto_ccm_encrypt_done(void *data, int err)
{
        struct aead_request *req = data;
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);

        if (err == 0) {
                unsigned int tag_offset = req->assoclen + req->cryptlen;

                scatterwalk_map_and_copy(pctx->odata, req->dst, tag_offset,
                                         crypto_aead_authsize(aead), 1);
        }

        aead_request_complete(req, err);
}

/*
 * Validate the first IV byte L' (length-field size minus one).
 * 2 <= L <= 8, so only 1 <= L' <= 7 is accepted.
 * Returns 0 when valid, -EINVAL otherwise.
 */
static inline int crypto_ccm_check_iv(const u8 *iv)
{
        const u8 lprime = iv[0];

        return (lprime >= 1 && lprime <= 7) ? 0 : -EINVAL;
}

/*
 * Prepare a CCM request for the CTR pass.
 *
 * Validates the IV, records the request flags, zeroes the counter field of
 * req->iv and builds pctx->src (and pctx->dst when the request is not
 * in-place) as: the 16-byte @tag buffer followed by the request data past
 * the associated data.
 *
 * Returns 0 on success or -EINVAL for an invalid IV.
 */
static int crypto_ccm_init_crypt(struct aead_request *req, u8 *tag)
{
        struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
        struct scatterlist *payload;
        u8 *ctr_iv = req->iv;
        int ret;

        ret = crypto_ccm_check_iv(ctr_iv);
        if (ret)
                return ret;

        pctx->flags = aead_request_flags(req);

        /*
         * RFC 3610 and NIST 800-38C require a counter of zero to encrypt
         * the auth tag: clear the trailing iv[0] + 1 counter bytes.
         */
        memset(&ctr_iv[15 - ctr_iv[0]], 0, ctr_iv[0] + 1);

        sg_init_table(pctx->src, 3);
        sg_set_buf(pctx->src, tag, 16);
        payload = scatterwalk_ffwd(pctx->src + 1, req->src, req->assoclen);
        if (payload != pctx->src + 1)
                sg_chain(pctx->src, 2, payload);

        /* In-place operation: no separate destination list is needed. */
        if (req->src == req->dst)
                return 0;

        sg_init_table(pctx->dst, 3);
        sg_set_buf(pctx->dst, tag, 16);
        payload = scatterwalk_ffwd(pctx->dst + 1, req->dst, req->assoclen);
        if (payload != pctx->dst + 1)
                sg_chain(pctx->dst, 2, payload);

        return 0;
}

/*
 * Encrypt a CCM request: compute the MAC over the plaintext into
 * pctx->odata, then run the CTR skcipher over the tag block plus payload.
 *
 * Any non-zero return of the skcipher is passed straight back to the caller;
 * in that case the tag is written by crypto_ccm_encrypt_done() instead.
 * When the skcipher returns 0 the tag is copied behind the ciphertext here.
 */
static int crypto_ccm_encrypt(struct aead_request *req)
{
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
        struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
        struct skcipher_request *skreq = &pctx->skreq;
        const unsigned int text_len = req->cryptlen;
        struct scatterlist *out;
        u8 *tag = pctx->odata;
        int ret;

        ret = crypto_ccm_init_crypt(req, tag);
        if (ret)
                return ret;

        ret = crypto_ccm_auth(req, sg_next(pctx->src), text_len);
        if (ret)
                return ret;

        out = (req->src == req->dst) ? pctx->src : pctx->dst;

        skcipher_request_set_tfm(skreq, ctx->ctr);
        skcipher_request_set_callback(skreq, pctx->flags,
                                      crypto_ccm_encrypt_done, req);
        /* The leading 16 bytes are the tag block, hence text_len + 16. */
        skcipher_request_set_crypt(skreq, pctx->src, out, text_len + 16,
                                   req->iv);
        ret = crypto_skcipher_encrypt(skreq);
        if (ret)
                return ret;

        /* copy authtag to end of dst */
        scatterwalk_map_and_copy(tag, sg_next(out), text_len,
                                 crypto_aead_authsize(aead), 1);
        return 0;
}

/*
 * Completion callback of the decryption skcipher request.  If decryption
 * succeeded, the MAC of the decrypted data is computed and compared with the
 * tag in pctx->auth_tag; a mismatch yields -EBADMSG.  The AEAD request is
 * completed with the resulting status.
 */
static void crypto_ccm_decrypt_done(void *data, int err)
{
        struct aead_request *req = data;
        struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        const unsigned int tag_len = crypto_aead_authsize(aead);
        const unsigned int text_len = req->cryptlen - tag_len;
        struct scatterlist *plain;

        pctx->flags = 0;

        /* The decrypted payload follows the tag entry of the output list. */
        if (req->src == req->dst)
                plain = sg_next(pctx->src);
        else
                plain = sg_next(pctx->dst);

        if (err)
                goto complete;

        err = crypto_ccm_auth(req, plain, text_len);
        if (err)
                goto complete;

        if (crypto_memneq(pctx->auth_tag, pctx->odata, tag_len))
                err = -EBADMSG;

complete:
        aead_request_complete(req, err);
}

/*
 * Decrypt and verify a CCM request.
 *
 * The tag is read from the end of the source data into pctx->auth_tag, the
 * CTR skcipher is run over the tag block plus ciphertext, then the MAC of
 * the result is computed into pctx->odata and compared with the tag.
 *
 * Returns 0 on success, -EBADMSG on tag mismatch, or the error from the
 * helper/sub-request calls.  A non-zero return from the skcipher is passed
 * straight back; verification is then done by crypto_ccm_decrypt_done().
 */
static int crypto_ccm_decrypt(struct aead_request *req)
{
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
        struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
        struct skcipher_request *skreq = &pctx->skreq;
        struct scatterlist *dst;
        unsigned int authsize = crypto_aead_authsize(aead);
        unsigned int cryptlen = req->cryptlen;
        u8 *authtag = pctx->auth_tag;
        u8 *odata = pctx->odata;
        u8 *iv = pctx->idata;
        int err;

        /*
         * req->cryptlen includes the tag; work with the payload length.
         * NOTE(review): no check here that req->cryptlen >= authsize --
         * presumably guaranteed by the caller; confirm.
         */
        cryptlen -= authsize;

        err = crypto_ccm_init_crypt(req, authtag);
        if (err)
                return err;

        /* Fetch the received tag that follows the ciphertext in the source. */
        scatterwalk_map_and_copy(authtag, sg_next(pctx->src), cryptlen,
                                 authsize, 0);

        dst = pctx->src;
        if (req->src != req->dst)
                dst = pctx->dst;

        /*
         * The skcipher gets a copy of the IV (taken after
         * crypto_ccm_init_crypt() zeroed the counter field); req->iv itself
         * is read again later by crypto_ccm_auth() via format_input().
         */
        memcpy(iv, req->iv, 16);

        skcipher_request_set_tfm(skreq, ctx->ctr);
        skcipher_request_set_callback(skreq, pctx->flags,
                                      crypto_ccm_decrypt_done, req);
        /* The leading 16 bytes are the tag block, hence cryptlen + 16. */
        skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv);
        err = crypto_skcipher_decrypt(skreq);
        if (err)
                return err;

        /* MAC over the decrypted payload, result in odata. */
        err = crypto_ccm_auth(req, sg_next(dst), cryptlen);
        if (err)
                return err;

        /* verify */
        if (crypto_memneq(authtag, odata, authsize))
                return -EBADMSG;

        return err;
}

/*
 * Instantiate the hash and skcipher children of a CCM transform and size
 * its request context: alignment slack, the private context, and room for
 * the larger of the two child request contexts.
 *
 * Returns 0 on success or the error from spawning either child.
 */
static int crypto_ccm_init_tfm(struct crypto_aead *tfm)
{
        struct aead_instance *inst = aead_alg_instance(tfm);
        struct ccm_instance_ctx *ictx = aead_instance_ctx(inst);
        struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm);
        struct crypto_skcipher *ctr_tfm;
        struct crypto_ahash *mac_tfm;
        unsigned long slack;

        mac_tfm = crypto_spawn_ahash(&ictx->mac);
        if (IS_ERR(mac_tfm))
                return PTR_ERR(mac_tfm);

        ctr_tfm = crypto_spawn_skcipher(&ictx->ctr);
        if (IS_ERR(ctr_tfm)) {
                /* Undo the hash allocation before reporting the error. */
                crypto_free_ahash(mac_tfm);
                return PTR_ERR(ctr_tfm);
        }

        ctx->mac = mac_tfm;
        ctx->ctr = ctr_tfm;

        slack = crypto_aead_alignmask(tfm);
        slack &= ~(crypto_tfm_ctx_alignment() - 1);
        crypto_aead_set_reqsize(
                tfm,
                slack + sizeof(struct crypto_ccm_req_priv_ctx) +
                max(crypto_ahash_reqsize(mac_tfm),
                    crypto_skcipher_reqsize(ctr_tfm)));

        return 0;
}

/* Release the child transforms created by crypto_ccm_init_tfm(). */
static void crypto_ccm_exit_tfm(struct crypto_aead *tfm)
{
        struct crypto_ccm_ctx *children = crypto_aead_ctx(tfm);

        crypto_free_ahash(children->mac);
        crypto_free_skcipher(children->ctr);
}

/* Drop both spawns of a CCM instance and free the instance itself. */
static void crypto_ccm_free(struct aead_instance *inst)
{
        struct ccm_instance_ctx *spawns = aead_instance_ctx(inst);

        crypto_drop_ahash(&spawns->mac);
        crypto_drop_skcipher(&spawns->ctr);

        kfree(inst);
}

/*
 * Create and register a CCM AEAD instance from the skcipher named @ctr_name
 * and the hash named @mac_name.
 *
 * The hash must be a "cbcmac(...)" with a 16-byte digest, the skcipher a
 * "ctr(...)" with a 16-byte IV and block size 1, and both must name the
 * same inner algorithm.
 *
 * Returns 0 on success or a negative errno; on any failure the partially
 * set up instance is released through crypto_ccm_free().
 */
static int crypto_ccm_create_common(struct crypto_template *tmpl,
                                    struct rtattr **tb,
                                    const char *ctr_name,
                                    const char *mac_name)
{
        struct skcipher_alg_common *ctr;
        u32 mask;
        struct aead_instance *inst;
        struct ccm_instance_ctx *ictx;
        struct hash_alg_common *mac;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
        if (err)
                return err;

        /* The instance context (the two spawns) trails the instance. */
        inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;
        ictx = aead_instance_ctx(inst);

        err = crypto_grab_ahash(&ictx->mac, aead_crypto_instance(inst),
                                mac_name, 0, mask | CRYPTO_ALG_ASYNC);
        if (err)
                goto err_free_inst;
        mac = crypto_spawn_ahash_alg(&ictx->mac);

        err = -EINVAL;
        if (strncmp(mac->base.cra_name, "cbcmac(", 7) != 0 ||
            mac->digestsize != 16)
                goto err_free_inst;

        err = crypto_grab_skcipher(&ictx->ctr, aead_crypto_instance(inst),
                                   ctr_name, 0, mask);
        if (err)
                goto err_free_inst;
        ctr = crypto_spawn_skcipher_alg_common(&ictx->ctr);

        /* The skcipher algorithm must be CTR mode, using 16-byte blocks. */
        err = -EINVAL;
        if (strncmp(ctr->base.cra_name, "ctr(", 4) != 0 ||
            ctr->ivsize != 16 || ctr->base.cra_blocksize != 1)
                goto err_free_inst;

        /* ctr and cbcmac must use the same underlying block cipher. */
        if (strcmp(ctr->base.cra_name + 4, mac->base.cra_name + 7) != 0)
                goto err_free_inst;

        err = -ENAMETOOLONG;
        /*
         * The format has no closing parenthesis on purpose: the tail of the
         * "ctr(...)" name copied after the prefix still carries its own ")".
         */
        if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
                     "ccm(%s", ctr->base.cra_name + 4) >= CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                     "ccm_base(%s,%s)", ctr->base.cra_driver_name,
                     mac->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        /* Priority is the mean of the two underlying algorithms. */
        inst->alg.base.cra_priority = (mac->base.cra_priority +
                                       ctr->base.cra_priority) / 2;
        inst->alg.base.cra_blocksize = 1;
        inst->alg.base.cra_alignmask = ctr->base.cra_alignmask;
        inst->alg.ivsize = 16;
        inst->alg.chunksize = ctr->chunksize;
        inst->alg.maxauthsize = 16;
        inst->alg.base.cra_ctxsize = sizeof(struct crypto_ccm_ctx);
        inst->alg.init = crypto_ccm_init_tfm;
        inst->alg.exit = crypto_ccm_exit_tfm;
        inst->alg.setkey = crypto_ccm_setkey;
        inst->alg.setauthsize = crypto_ccm_setauthsize;
        inst->alg.encrypt = crypto_ccm_encrypt;
        inst->alg.decrypt = crypto_ccm_decrypt;

        inst->free = crypto_ccm_free;

        err = aead_register_instance(tmpl, inst);
        if (err) {
                /* Shared failure exit; also the target of the gotos above. */
err_free_inst:
                crypto_ccm_free(inst);
        }
        return err;
}

/*
 * "ccm" template constructor: derive "ctr(<cipher>)" and "cbcmac(<cipher>)"
 * from the cipher name in tb[1] and hand both to crypto_ccm_create_common().
 *
 * Returns -ENAMETOOLONG if either derived name does not fit, the error from
 * the attribute lookup, or the result of crypto_ccm_create_common().
 */
static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb)
{
        char ctr_name[CRYPTO_MAX_ALG_NAME];
        char mac_name[CRYPTO_MAX_ALG_NAME];
        const char *cipher_name = crypto_attr_alg_name(tb[1]);
        int written;

        if (IS_ERR(cipher_name))
                return PTR_ERR(cipher_name);

        written = snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)",
                           cipher_name);
        if (written >= CRYPTO_MAX_ALG_NAME)
                return -ENAMETOOLONG;

        written = snprintf(mac_name, CRYPTO_MAX_ALG_NAME, "cbcmac(%s)",
                           cipher_name);
        if (written >= CRYPTO_MAX_ALG_NAME)
                return -ENAMETOOLONG;

        return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name);
}

/*
 * "ccm_base" template constructor: the skcipher name comes from tb[1] and
 * the hash name from tb[2]; both are passed on unchanged to
 * crypto_ccm_create_common().
 */
static int crypto_ccm_base_create(struct crypto_template *tmpl,
                                  struct rtattr **tb)
{
        const char *skcipher_name;
        const char *hash_name;

        skcipher_name = crypto_attr_alg_name(tb[1]);
        if (IS_ERR(skcipher_name))
                return PTR_ERR(skcipher_name);

        hash_name = crypto_attr_alg_name(tb[2]);
        if (IS_ERR(hash_name))
                return PTR_ERR(hash_name);

        return crypto_ccm_create_common(tmpl, tb, skcipher_name, hash_name);
}

/*
 * Split the key material: the trailing three bytes are stored as the nonce,
 * the remainder is set as the key of the child AEAD (with the parent's
 * request flags mirrored onto it).
 *
 * Returns -EINVAL if @keylen is shorter than three bytes, otherwise the
 * result of the child's setkey.
 */
static int crypto_rfc4309_setkey(struct crypto_aead *parent, const u8 *key,
                                 unsigned int keylen)
{
        struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent);
        struct crypto_aead *child = ctx->child;
        unsigned int cipher_keylen;

        if (keylen < 3)
                return -EINVAL;

        cipher_keylen = keylen - 3;
        memcpy(ctx->nonce, &key[cipher_keylen], 3);

        crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK);
        crypto_aead_set_flags(child,
                              crypto_aead_get_flags(parent) &
                              CRYPTO_TFM_REQ_MASK);

        return crypto_aead_setkey(child, key, cipher_keylen);
}

/*
 * Accept only tag sizes of 8, 12 or 16 bytes and forward the setting to the
 * child AEAD.  Returns -EINVAL for any other size.
 */
static int crypto_rfc4309_setauthsize(struct crypto_aead *parent,
                                      unsigned int authsize)
{
        struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent);

        if (authsize != 8 && authsize != 12 && authsize != 16)
                return -EINVAL;

        return crypto_aead_setauthsize(ctx->child, authsize);
}

/*
 * Build the child request for an rfc4309 request and return it.
 *
 * A 16-byte IV is assembled behind the child's request context from the
 * fixed L' byte, the stored 3-byte nonce and the 8 bytes of req->iv.  The
 * first req->assoclen - 8 bytes of associated data are copied right after
 * that IV and become the child's associated data.
 *
 * The caller must have validated req->assoclen (both callers in this file
 * only allow 16 or 20), since assoclen - 8 is used unchecked here.
 */
static struct aead_request *crypto_rfc4309_crypt(struct aead_request *req)
{
        struct crypto_rfc4309_req_ctx *rctx = aead_request_ctx(req);
        struct aead_request *subreq = &rctx->subreq;
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(aead);
        struct crypto_aead *child = ctx->child;
        struct scatterlist *sg;
        /* IV storage: past subreq and the child's request ctx, aligned. */
        u8 *iv = PTR_ALIGN((u8 *)(subreq + 1) + crypto_aead_reqsize(child),
                           crypto_aead_alignmask(child) + 1);

        /* L' */
        iv[0] = 3;

        /*
         * Bytes 1-3: nonce, bytes 4-11: per-request IV.  Bytes 12-15 are not
         * written here (crypto_ccm_init_crypt() zeroes them when the child
         * is the CCM implementation in this file).
         */
        memcpy(iv + 1, ctx->nonce, 3);
        memcpy(iv + 4, req->iv, 8);

        /* Linearise the leading part of the associated data behind the IV. */
        scatterwalk_map_and_copy(iv + 16, req->src, 0, req->assoclen - 8, 0);

        /* Source: copied associated data, then req->src past all of assoclen. */
        sg_init_table(rctx->src, 3);
        sg_set_buf(rctx->src, iv + 16, req->assoclen - 8);
        sg = scatterwalk_ffwd(rctx->src + 1, req->src, req->assoclen);
        if (sg != rctx->src + 1)
                sg_chain(rctx->src, 2, sg);

        if (req->src != req->dst) {
                /* Same layout for a separate destination. */
                sg_init_table(rctx->dst, 3);
                sg_set_buf(rctx->dst, iv + 16, req->assoclen - 8);
                sg = scatterwalk_ffwd(rctx->dst + 1, req->dst, req->assoclen);
                if (sg != rctx->dst + 1)
                        sg_chain(rctx->dst, 2, sg);
        }

        /* The child completes straight into the original request's callback. */
        aead_request_set_tfm(subreq, child);
        aead_request_set_callback(subreq, req->base.flags, req->base.complete,
                                  req->base.data);
        aead_request_set_crypt(subreq, rctx->src,
                               req->src == req->dst ? rctx->src : rctx->dst,
                               req->cryptlen, iv);
        aead_request_set_ad(subreq, req->assoclen - 8);

        return subreq;
}

/*
 * Encrypt through the child AEAD.  Only associated-data lengths of 16 or 20
 * bytes are accepted; anything else returns -EINVAL.
 */
static int crypto_rfc4309_encrypt(struct aead_request *req)
{
        switch (req->assoclen) {
        case 16:
        case 20:
                break;
        default:
                return -EINVAL;
        }

        return crypto_aead_encrypt(crypto_rfc4309_crypt(req));
}

/*
 * Decrypt through the child AEAD.  Only associated-data lengths of 16 or 20
 * bytes are accepted; anything else returns -EINVAL.
 */
static int crypto_rfc4309_decrypt(struct aead_request *req)
{
        switch (req->assoclen) {
        case 16:
        case 20:
                break;
        default:
                return -EINVAL;
        }

        return crypto_aead_decrypt(crypto_rfc4309_crypt(req));
}

/*
 * Instantiate the child AEAD and size the request context: the wrapper's
 * own context, the (aligned) child request context, alignment slack and
 * 32 extra bytes used by crypto_rfc4309_crypt() for the IV and the copied
 * associated data.
 *
 * Returns 0 on success or the error from spawning the child.
 */
static int crypto_rfc4309_init_tfm(struct crypto_aead *tfm)
{
        struct aead_instance *inst = aead_alg_instance(tfm);
        struct crypto_aead_spawn *spawn = aead_instance_ctx(inst);
        struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm);
        struct crypto_aead *child;
        unsigned long slack;

        child = crypto_spawn_aead(spawn);
        if (IS_ERR(child))
                return PTR_ERR(child);

        ctx->child = child;

        slack = crypto_aead_alignmask(child);
        slack &= ~(crypto_tfm_ctx_alignment() - 1);

        crypto_aead_set_reqsize(
                tfm,
                sizeof(struct crypto_rfc4309_req_ctx) +
                ALIGN(crypto_aead_reqsize(child), crypto_tfm_ctx_alignment()) +
                slack + 32);

        return 0;
}

/* Release the child AEAD created by crypto_rfc4309_init_tfm(). */
static void crypto_rfc4309_exit_tfm(struct crypto_aead *tfm)
{
        struct crypto_rfc4309_ctx *wrapper = crypto_aead_ctx(tfm);

        crypto_free_aead(wrapper->child);
}

/* Drop the AEAD spawn of an rfc4309 instance and free the instance. */
static void crypto_rfc4309_free(struct aead_instance *inst)
{
        struct crypto_aead_spawn *spawn = aead_instance_ctx(inst);

        crypto_drop_aead(spawn);
        kfree(inst);
}

/*
 * "rfc4309" template constructor: wrap the AEAD named in tb[1].
 *
 * The wrapped algorithm must have a 16-byte IV and a block size of 1.  The
 * resulting instance exposes an 8-byte IV and a maximum tag size of 16.
 *
 * Returns 0 on success or a negative errno; on failure the instance is
 * released through crypto_rfc4309_free().
 */
static int crypto_rfc4309_create(struct crypto_template *tmpl,
                                 struct rtattr **tb)
{
        u32 mask;
        struct aead_instance *inst;
        struct crypto_aead_spawn *spawn;
        struct aead_alg *alg;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
        if (err)
                return err;

        /* The spawn lives in the instance context trailing the instance. */
        inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;

        spawn = aead_instance_ctx(inst);
        /*
         * NOTE(review): the result of crypto_attr_alg_name() is passed on
         * without an IS_ERR() check here, unlike in crypto_ccm_create();
         * presumably crypto_grab_aead() rejects an error pointer -- confirm.
         */
        err = crypto_grab_aead(spawn, aead_crypto_instance(inst),
                               crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;

        alg = crypto_spawn_aead_alg(spawn);

        err = -EINVAL;

        /* We only support 16-byte blocks. */
        if (crypto_aead_alg_ivsize(alg) != 16)
                goto err_free_inst;

        /* Not a stream cipher? */
        if (alg->base.cra_blocksize != 1)
                goto err_free_inst;

        err = -ENAMETOOLONG;
        if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
                     "rfc4309(%s)", alg->base.cra_name) >=
            CRYPTO_MAX_ALG_NAME ||
            snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                     "rfc4309(%s)", alg->base.cra_driver_name) >=
            CRYPTO_MAX_ALG_NAME)
                goto err_free_inst;

        inst->alg.base.cra_priority = alg->base.cra_priority;
        inst->alg.base.cra_blocksize = 1;
        inst->alg.base.cra_alignmask = alg->base.cra_alignmask;

        /* Callers supply 8 IV bytes; the rest is built in crypto_rfc4309_crypt(). */
        inst->alg.ivsize = 8;
        inst->alg.chunksize = crypto_aead_alg_chunksize(alg);
        inst->alg.maxauthsize = 16;

        inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4309_ctx);

        inst->alg.init = crypto_rfc4309_init_tfm;
        inst->alg.exit = crypto_rfc4309_exit_tfm;

        inst->alg.setkey = crypto_rfc4309_setkey;
        inst->alg.setauthsize = crypto_rfc4309_setauthsize;
        inst->alg.encrypt = crypto_rfc4309_encrypt;
        inst->alg.decrypt = crypto_rfc4309_decrypt;

        inst->free = crypto_rfc4309_free;

        err = aead_register_instance(tmpl, inst);
        if (err) {
                /* Shared failure exit; also the target of the gotos above. */
err_free_inst:
                crypto_rfc4309_free(inst);
        }
        return err;
}

/* Set the MAC key by keying the underlying cipher. */
static int crypto_cbcmac_digest_setkey(struct crypto_shash *parent,
                                     const u8 *inkey, unsigned int keylen)
{
        struct cbcmac_tfm_ctx *mac_ctx = crypto_shash_ctx(parent);
        struct crypto_cipher *cipher = mac_ctx->child;

        return crypto_cipher_setkey(cipher, inkey, keylen);
}

/* Start a new MAC computation: clear the running digest.  Always returns 0. */
static int crypto_cbcmac_digest_init(struct shash_desc *pdesc)
{
        u8 *state = shash_desc_ctx(pdesc);
        int state_len = crypto_shash_digestsize(pdesc->tfm);

        memset(state, 0, state_len);

        return 0;
}

/*
 * Absorb data into the running digest one block (digest size) at a time:
 * XOR the block into the state, then encrypt the state in place.
 *
 * At least one block is always processed, so @len is expected to be at
 * least one block long.  Returns the number of bytes left unprocessed.
 */
static int crypto_cbcmac_digest_update(struct shash_desc *pdesc, const u8 *p,
                                       unsigned int len)
{
        struct crypto_shash *parent = pdesc->tfm;
        struct cbcmac_tfm_ctx *mac_ctx = crypto_shash_ctx(parent);
        struct crypto_cipher *cipher = mac_ctx->child;
        int blk = crypto_shash_digestsize(parent);
        u8 *state = shash_desc_ctx(pdesc);

        for (;;) {
                crypto_xor(state, p, blk);
                crypto_cipher_encrypt_one(cipher, state, state);
                p += blk;
                len -= blk;
                if (len < blk)
                        break;
        }

        return len;
}

/*
 * Finish the MAC.  With no trailing data the running digest is the result;
 * otherwise the remaining @len bytes are XORed into the state and the
 * encrypted state is written to @out.  Always returns 0.
 */
static int crypto_cbcmac_digest_finup(struct shash_desc *pdesc, const u8 *src,
                                      unsigned int len, u8 *out)
{
        struct crypto_shash *parent = pdesc->tfm;
        struct cbcmac_tfm_ctx *mac_ctx = crypto_shash_ctx(parent);
        struct crypto_cipher *cipher = mac_ctx->child;
        int state_len = crypto_shash_digestsize(parent);
        u8 *state = shash_desc_ctx(pdesc);

        if (!len) {
                memcpy(out, state, state_len);
                return 0;
        }

        crypto_xor(state, src, len);
        crypto_cipher_encrypt_one(cipher, out, state);

        return 0;
}

/*
 * Instantiate the underlying cipher of a cbcmac transform and store it in
 * the transform context.
 *
 * Returns 0 on success or the error from crypto_spawn_cipher().
 */
static int cbcmac_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_cipher *cipher;
        struct crypto_instance *inst = (void *)tfm->__crt_alg;
        struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst);
        struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);

        cipher = crypto_spawn_cipher(spawn);
        if (IS_ERR(cipher))
                return PTR_ERR(cipher);

        ctx->child = cipher;

        return 0;
}

/* Release the cipher allocated by cbcmac_init_tfm(). */
static void cbcmac_exit_tfm(struct crypto_tfm *tfm)
{
        struct cbcmac_tfm_ctx *mac_ctx = crypto_tfm_ctx(tfm);
        struct crypto_cipher *cipher = mac_ctx->child;

        crypto_free_cipher(cipher);
}

/*
 * "cbcmac" template constructor: build a shash instance on top of the
 * cipher named in tb[1].  Block size, digest size and descriptor size are
 * all taken from the cipher's block size.
 *
 * Returns 0 on success or a negative errno; on failure the instance is
 * released through shash_free_singlespawn_instance().
 */
static int cbcmac_create(struct crypto_template *tmpl, struct rtattr **tb)
{
        struct shash_instance *inst;
        struct crypto_cipher_spawn *spawn;
        struct crypto_alg *alg;
        u32 mask;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask);
        if (err)
                return err;

        /* The cipher spawn lives in the instance context trailing the instance. */
        inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
        if (!inst)
                return -ENOMEM;
        spawn = shash_instance_ctx(inst);

        err = crypto_grab_cipher(spawn, shash_crypto_instance(inst),
                                 crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;
        alg = crypto_spawn_cipher_alg(spawn);

        err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg);
        if (err)
                goto err_free_inst;

        inst->alg.base.cra_priority = alg->cra_priority;
        inst->alg.base.cra_blocksize = alg->cra_blocksize;

        /* The running digest is the only per-descriptor state. */
        inst->alg.digestsize = alg->cra_blocksize;
        inst->alg.descsize = alg->cra_blocksize;

        inst->alg.base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY;
        inst->alg.base.cra_ctxsize = sizeof(struct cbcmac_tfm_ctx);
        inst->alg.base.cra_init = cbcmac_init_tfm;
        inst->alg.base.cra_exit = cbcmac_exit_tfm;

        inst->alg.init = crypto_cbcmac_digest_init;
        inst->alg.update = crypto_cbcmac_digest_update;
        inst->alg.finup = crypto_cbcmac_digest_finup;
        inst->alg.setkey = crypto_cbcmac_digest_setkey;

        inst->free = shash_free_singlespawn_instance;

        err = shash_register_instance(tmpl, inst);
        if (err) {
                /* Shared failure exit; also the target of the gotos above. */
err_free_inst:
                shash_free_singlespawn_instance(inst);
        }
        return err;
}

/*
 * The four templates provided by this module, registered together in
 * crypto_ccm_module_init() and unregistered in crypto_ccm_module_exit().
 */
static struct crypto_template crypto_ccm_tmpls[] = {
        {
                /* MAC hash over a single-block cipher */
                .name = "cbcmac",
                .create = cbcmac_create,
                .module = THIS_MODULE,
        }, {
                /* CCM from explicitly named skcipher and hash */
                .name = "ccm_base",
                .create = crypto_ccm_base_create,
                .module = THIS_MODULE,
        }, {
                /* CCM from a cipher name */
                .name = "ccm",
                .create = crypto_ccm_create,
                .module = THIS_MODULE,
        }, {
                /* rfc4309 wrapper around an AEAD */
                .name = "rfc4309",
                .create = crypto_rfc4309_create,
                .module = THIS_MODULE,
        },
};

/* Module entry point: register all templates in crypto_ccm_tmpls. */
static int __init crypto_ccm_module_init(void)
{
        int ret;

        ret = crypto_register_templates(crypto_ccm_tmpls,
                                        ARRAY_SIZE(crypto_ccm_tmpls));
        return ret;
}

/* Module exit point: unregister all templates in crypto_ccm_tmpls. */
static void __exit crypto_ccm_module_exit(void)
{
        const int count = ARRAY_SIZE(crypto_ccm_tmpls);

        crypto_unregister_templates(crypto_ccm_tmpls, count);
}

/* Module entry/exit hooks. */
module_init(crypto_ccm_module_init);
module_exit(crypto_ccm_module_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Counter with CBC MAC");
/* One crypto alias per template name in crypto_ccm_tmpls. */
MODULE_ALIAS_CRYPTO("ccm_base");
MODULE_ALIAS_CRYPTO("rfc4309");
MODULE_ALIAS_CRYPTO("ccm");
MODULE_ALIAS_CRYPTO("cbcmac");
MODULE_IMPORT_NS("CRYPTO_INTERNAL");






























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_USER_NAMESPACE_H
#define _LINUX_USER_NAMESPACE_H

#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/rculist_nulls.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/rcuref.h>
#include <linux/rwsem.h>
#include <linux/sysctl.h>
#include <linux/err.h>

/* Number of extents stored inline in struct uid_gid_map. */
#define UID_GID_MAP_MAX_BASE_EXTENTS 5
/* NOTE(review): presumably the overall limit on extents per map -- confirm against users. */
#define UID_GID_MAP_MAX_EXTENTS 340

/*
 * One extent of an ID map.
 * NOTE(review): presumably maps @count consecutive IDs starting at @first to
 * IDs starting at @lower_first -- confirm against the mapping code.
 */
struct uid_gid_extent {
        u32 first;
        u32 lower_first;
        u32 count;
};

/*
 * An ID map with two overlaid representations: an inline array of up to
 * UID_GID_MAP_MAX_BASE_EXTENTS extents, or a pair of pointers to external
 * extent arrays.  The pointers overlay the start of the inline array;
 * nr_extents sits after that array.
 */
struct uid_gid_map { /* 64 bytes -- 1 cache line */
        union {
                struct {
                        struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
                        u32 nr_extents;
                };
                struct {
                        struct uid_gid_extent *forward;
                        struct uid_gid_extent *reverse;
                };
        };
};

/* NOTE(review): presumably a bit for user_namespace.flags -- confirm. */
#define USERNS_SETGROUPS_ALLOWED 1UL

/* Initial flag set: only USERNS_SETGROUPS_ALLOWED. */
#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED

struct ucounts;

/*
 * Index of each per-user-namespace count.  The inotify and fanotify entries
 * exist only when the corresponding config option is enabled, so the
 * numeric values after UCOUNT_TIME_NAMESPACES depend on the configuration.
 */
enum ucount_type {
        UCOUNT_USER_NAMESPACES,
        UCOUNT_PID_NAMESPACES,
        UCOUNT_UTS_NAMESPACES,
        UCOUNT_IPC_NAMESPACES,
        UCOUNT_NET_NAMESPACES,
        UCOUNT_MNT_NAMESPACES,
        UCOUNT_CGROUP_NAMESPACES,
        UCOUNT_TIME_NAMESPACES,
#ifdef CONFIG_INOTIFY_USER
        UCOUNT_INOTIFY_INSTANCES,
        UCOUNT_INOTIFY_WATCHES,
#endif
#ifdef CONFIG_FANOTIFY
        UCOUNT_FANOTIFY_GROUPS,
        UCOUNT_FANOTIFY_MARKS,
#endif
        /* Number of entries above; sizes user_namespace.ucount_max[]. */
        UCOUNT_COUNTS,
};

/*
 * Index of a per-user rlimit counter.
 *
 * The values index user_namespace::rlimit_max[] and ucounts::rlimit[].
 * UCOUNT_RLIMIT_COUNTS is not a counter: it must stay last because it sizes
 * those arrays.
 */
enum rlimit_type {
        UCOUNT_RLIMIT_NPROC,
        UCOUNT_RLIMIT_MSGQUEUE,
        UCOUNT_RLIMIT_SIGPENDING,
        UCOUNT_RLIMIT_MEMLOCK,
        UCOUNT_RLIMIT_COUNTS,   /* number of entries above; keep last */
};

#if IS_ENABLED(CONFIG_BINFMT_MISC)
struct binfmt_misc;
#endif

/*
 * A user namespace.
 *
 * Holds the uid, gid and projid ID maps, a link to the parent namespace,
 * the embedded ns_common (to_user_ns() converts back from it with
 * container_of()), and the per-namespace maxima for the ucount and rlimit
 * counters.  Several members exist only under the config options shown.
 * The layout is randomized (__randomize_layout), so do not rely on member
 * order.
 */
struct user_namespace {
        struct uid_gid_map        uid_map;
        struct uid_gid_map        gid_map;
        struct uid_gid_map        projid_map;
        struct user_namespace        *parent;
        /* NOTE(review): presumably the nesting depth below init_user_ns — confirm. */
        int                        level;
        kuid_t                        owner;
        kgid_t                        group;
        struct ns_common        ns;
        /* USERNS_* flag bits; USERNS_INIT_FLAGS is the defined initial value. */
        unsigned long                flags;
        /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
         * in its effective capability set at the child ns creation time. */
        bool                        parent_could_setfcap;

#ifdef CONFIG_KEYS
        /* List of joinable keyrings in this namespace.  Modification access of
         * these pointers is controlled by keyring_sem.  Once
         * user_keyring_register is set, it won't be changed, so it can be
         * accessed directly with READ_ONCE().
         */
        struct list_head        keyring_name_list;
        struct key                *user_keyring_register;
        struct rw_semaphore        keyring_sem;
#endif

        /* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
        struct key                *persistent_keyring_register;
#endif
        struct work_struct        work;
#ifdef CONFIG_SYSCTL
        struct ctl_table_set        set;
        struct ctl_table_header *sysctls;
#endif
        struct ucounts                *ucounts;
        /* Per-namespace maxima, indexed by enum ucount_type. */
        long ucount_max[UCOUNT_COUNTS];
        /*
         * Per-namespace maxima, indexed by enum rlimit_type.  Accessed via
         * get_userns_rlimit_max() / set_userns_rlimit_max().
         */
        long rlimit_max[UCOUNT_RLIMIT_COUNTS];

#if IS_ENABLED(CONFIG_BINFMT_MISC)
        struct binfmt_misc *binfmt_misc;
#endif
} __randomize_layout;

/*
 * Counters kept for one (user namespace, uid) pair.
 *
 * NOTE(review): node is presumably the linkage into a hash table of ucounts
 * and rcu presumably defers freeing — confirm in the code that manages
 * ucounts.
 */
struct ucounts {
        struct hlist_nulls_node node;
        struct user_namespace *ns;              /* namespace these counters belong to */
        kuid_t uid;                             /* user these counters belong to */
        struct rcu_head rcu;
        rcuref_t count;                         /* reference count; see get_ucounts() */
        atomic_long_t ucount[UCOUNT_COUNTS];    /* indexed by enum ucount_type */
        atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; /* indexed by enum rlimit_type */
};

extern struct user_namespace init_user_ns;
extern struct ucounts init_ucounts;

bool setup_userns_sysctls(struct user_namespace *ns);
void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
void put_ucounts(struct ucounts *ucounts);

/*
 * Try to take an additional reference on @ucounts.
 *
 * Returns @ucounts when rcuref_get() on its reference count succeeds and
 * NULL when it does not.  The result is __must_check: callers have to
 * handle the NULL case.
 */
static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts)
{
        return rcuref_get(&ucounts->count) ? ucounts : NULL;
}

static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type)
{
        return atomic_long_read(&ucounts->rlimit[type]);
}

long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
                            bool override_rlimit);
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);

/*
 * Return the maximum configured in @ns for the @type rlimit counter.
 * The slot is read with READ_ONCE().
 */
static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type)
{
        long *limit = &ns->rlimit_max[type];

        return READ_ONCE(*limit);
}

/*
 * Set the maximum for the @type rlimit counter in @ns.
 *
 * The slot is a signed long, so a @max larger than LONG_MAX is stored as
 * LONG_MAX.
 */
static inline void set_userns_rlimit_max(struct user_namespace *ns,
                enum rlimit_type type, unsigned long max)
{
        long clamped = LONG_MAX;

        if (max <= LONG_MAX)
                clamped = max;

        ns->rlimit_max[type] = clamped;
}

#ifdef CONFIG_USER_NS

/*
 * Convert a pointer to the ns_common embedded in a user_namespace (its
 * 'ns' member) back into a pointer to the containing user_namespace.
 */
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
        return container_of(ns, struct user_namespace, ns);
}

/*
 * Take a reference on @ns and return it.
 *
 * A NULL @ns is accepted: no reference is taken and NULL is returned.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return NULL;

        ns_ref_inc(ns);
        return ns;
}

extern int create_user_ns(struct cred *new);
extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
extern void __put_user_ns(struct user_namespace *ns);

/*
 * Drop a reference on @ns.
 *
 * A NULL @ns is ignored.  When ns_ref_put() returns true, the namespace is
 * handed to __put_user_ns().
 */
static inline void put_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return;

        if (ns_ref_put(ns))
                __put_user_ns(ns);
}

struct seq_operations;
extern const struct seq_operations proc_uid_seq_operations;
extern const struct seq_operations proc_gid_seq_operations;
extern const struct seq_operations proc_projid_seq_operations;
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
extern bool in_userns(const struct user_namespace *ancestor,
                       const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
struct ns_common *ns_get_owner(struct ns_common *ns);
#else

/*
 * !CONFIG_USER_NS stub: @ns is ignored and no reference is taken; the
 * function always returns &init_user_ns.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        return &init_user_ns;
}

/* !CONFIG_USER_NS stub: always fails with -EINVAL; @new is not touched. */
static inline int create_user_ns(struct cred *new)
{
        return -EINVAL;
}

/*
 * !CONFIG_USER_NS stub.
 *
 * Returns -EINVAL when @unshare_flags contains CLONE_NEWUSER and 0
 * otherwise.  @new_cred is never written.
 */
static inline int unshare_userns(unsigned long unshare_flags,
                                 struct cred **new_cred)
{
        return (unshare_flags & CLONE_NEWUSER) ? -EINVAL : 0;
}

/* !CONFIG_USER_NS stub: does nothing. */
static inline void put_user_ns(struct user_namespace *ns)
{
}

/* !CONFIG_USER_NS stub: always returns true; @ns is ignored. */
static inline bool userns_may_setgroups(const struct user_namespace *ns)
{
        return true;
}

/* !CONFIG_USER_NS stub: always returns true; both arguments are ignored. */
static inline bool in_userns(const struct user_namespace *ancestor,
                             const struct user_namespace *child)
{
        return true;
}

/* !CONFIG_USER_NS stub: always returns true; @target_ns is ignored. */
static inline bool current_in_userns(const struct user_namespace *target_ns)
{
        return true;
}

/*
 * !CONFIG_USER_NS stub: always returns ERR_PTR(-EPERM); @ns is ignored.
 * Callers must check the result with IS_ERR() rather than against NULL.
 */
static inline struct ns_common *ns_get_owner(struct ns_common *ns)
{
        return ERR_PTR(-EPERM);
}
#endif

#endif /* _LINUX_USER_NAMESPACE_H */































    4 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SHA-256 optimized for x86_64
 *
 * Copyright 2025 Google LLC
 */
#include <asm/fpu/api.h>
#include <linux/static_call.h>

/*
 * Static key gating the SHA-NI-only two-message finup path.  It is defined
 * false and is enabled by sha256_mod_init_arch() when the boot CPU reports
 * X86_FEATURE_SHA_NI.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);

/*
 * Static call through which sha256_blocks() dispatches.  Its default target
 * is sha256_blocks_generic; sha256_mod_init_arch() may retarget it to one of
 * the x86 wrappers.
 */
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);

/*
 * DEFINE_X86_SHA256_FN(c_fn, asm_fn) - declare the assembly routine @asm_fn
 * and define a static C wrapper @c_fn around it.
 *
 * The wrapper calls @asm_fn between kernel_fpu_begin() and kernel_fpu_end().
 * When irq_fpu_usable() reports that the FPU cannot be used in the current
 * context, the wrapper calls sha256_blocks_generic() instead.
 */
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn)                                 \
        asmlinkage void asm_fn(struct sha256_block_state *state,           \
                               const u8 *data, size_t nblocks);            \
        static void c_fn(struct sha256_block_state *state, const u8 *data, \
                         size_t nblocks)                                   \
        {                                                                  \
                if (unlikely(!irq_fpu_usable())) {                         \
                        sha256_blocks_generic(state, data, nblocks);       \
                        return;                                            \
                }                                                          \
                kernel_fpu_begin();                                        \
                asm_fn(state, data, nblocks);                              \
                kernel_fpu_end();                                          \
        }

/*
 * Instantiate one C wrapper per assembly implementation.  Each line declares
 * the assembly routine (second argument) and defines the wrapper (first
 * argument); sha256_mod_init_arch() selects among the wrappers.
 */
DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3);
DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx);
DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx);
DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform);

/*
 * Process @nblocks blocks of @data into @state by calling the current target
 * of the sha256_blocks_x86 static call.
 */
static void sha256_blocks(struct sha256_block_state *state,
                          const u8 *data, size_t nblocks)
{
        static_call(sha256_blocks_x86)(state, data, nblocks);
}

/*
 * Pin the layout of struct __sha256_ctx at build time: state at offset 0,
 * bytecount at offset 32, buf at offset 40.
 * NOTE(review): presumably sha256_ni_finup2x() hard-codes these offsets —
 * confirm against the assembly source.
 */
static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
/*
 * Assembly routine behind sha256_finup_2x_arch().  Note that @len is an int
 * here; sha256_finup_2x_arch() bounds its size_t length before calling.
 */
asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
                                  const u8 *data1, const u8 *data2, int len,
                                  u8 out1[SHA256_DIGEST_SIZE],
                                  u8 out2[SHA256_DIGEST_SIZE]);

#define sha256_finup_2x_arch sha256_finup_2x_arch
/*
 * Try the SHA-NI two-message finup routine for @data1 and @data2, both of
 * length @len, starting from @ctx.
 *
 * Returns true when sha256_ni_finup2x() was called and wrote @out1 and
 * @out2.  Returns false, leaving the outputs untouched, when SHA-NI is not
 * enabled, when @len is out of range, or when the FPU is not usable in the
 * current context.
 */
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
                                 const u8 *data1, const u8 *data2, size_t len,
                                 u8 out1[SHA256_DIGEST_SIZE],
                                 u8 out2[SHA256_DIGEST_SIZE])
{
        if (!static_branch_likely(&have_sha_ni))
                return false;

        /*
         * The assembly needs SHA256_BLOCK_SIZE <= len <= INT_MAX.  The upper
         * bound is tightened to 65536 so that preemption is not disabled for
         * too long.  (In practice len is nearly always 4096 anyway.)
         */
        if (len < SHA256_BLOCK_SIZE || len > 65536)
                return false;

        if (unlikely(!irq_fpu_usable()))
                return false;

        kernel_fpu_begin();
        sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
        kernel_fpu_end();

        /* Mark both digests as initialized for KMSAN. */
        kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
        kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
        return true;
}

/*
 * Report whether the have_sha_ni static key is enabled.  That key is the
 * first condition sha256_finup_2x_arch() checks before using the
 * two-message assembly routine.
 */
static bool sha256_finup_2x_is_optimized_arch(void)
{
        return static_key_enabled(&have_sha_ni);
}

#define sha256_mod_init_arch sha256_mod_init_arch
/*
 * Select the SHA-256 block function from the boot CPU's features.
 *
 * Order of preference:
 *   1. SHA-NI (also enables the have_sha_ni static key),
 *   2. AVX2, when BMI2 is also present; otherwise AVX.  Both require the
 *      SSE and YMM xfeatures,
 *   3. SSSE3.
 * When none applies, the static call keeps its default target.
 */
static void sha256_mod_init_arch(void)
{
        if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
                static_call_update(sha256_blocks_x86, sha256_blocks_ni);
                static_branch_enable(&have_sha_ni);
                return;
        }

        if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
            boot_cpu_has(X86_FEATURE_AVX)) {
                if (boot_cpu_has(X86_FEATURE_AVX2) &&
                    boot_cpu_has(X86_FEATURE_BMI2))
                        static_call_update(sha256_blocks_x86,
                                           sha256_blocks_avx2);
                else
                        static_call_update(sha256_blocks_x86,
                                           sha256_blocks_avx);
                return;
        }

        if (boot_cpu_has(X86_FEATURE_SSSE3))
                static_call_update(sha256_blocks_x86, sha256_blocks_ssse3);
}



























































































































































































































































































































































































































































































































































































































































































































































































































   62 





   64 




















































































   62 
   65 


   17 

   65 



   60 
   17 

   60 
   64 







































   65 

















































































   64 


   64 























































































































































































































   65 


   65 

   65 
   64 

   65 

   65 

































   64 




















































































































































































































































































   63 

   13 
   65 

   13 































































   65 
   65 
   65 
   64 


























































   13 






   13 














   64 

   65 


   65 

   65 





   13 
   63 










   13 


   13 































   12 
   64 

   13 


























































































































































































































































































































































































































   64 





























   13 

   62 










   13 
















   65 






   65 




   65 


   64 
   65 
   63 

   64 


   65 
   65 











   60 
















   65 









   64 











   65 

   65 










   65 



   61 

   65 






   64 
   65 



   64 





   64 





   64 

















   66 



   66 

   66 





   67 
   66 
   66 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   39 



   38 


    6 


   39 

   39 









































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002                Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010                SUSE Linux Products GmbH
 * Copyright (C) 2010                Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include <linux/delay.h>
#include <linux/irq_work.h>

#include "workqueue_internal.h"

/*
 * worker_pool flags
 *
 * A bound pool is, at any given moment, either associated with its CPU or
 * disassociated from it:
 *
 * - Associated (POOL_DISASSOCIATED clear): all workers are bound to the
 *   CPU, none of them has %WORKER_UNBOUND set and concurrency management
 *   is in effect.
 *
 * - Disassociated (POOL_DISASSOCIATED set): the cpu may be offline, all
 *   workers have %WORKER_UNBOUND set, concurrency management is disabled
 *   and workers may be executing on any CPU.  The pool behaves as an
 *   unbound one.
 *
 * POOL_DISASSOCIATED should be flipped only while holding
 * wq_pool_attach_mutex, so that the binding state cannot change while
 * worker_attach_to_pool() is in progress.
 *
 * There can only be one concurrent BH execution context per CPU, hence a
 * BH pool is per-CPU and always DISASSOCIATED.
 */
enum worker_pool_flags {
        POOL_BH                 = 0x1,  /* bit 0: is a BH pool */
        POOL_MANAGER_ACTIVE     = 0x2,  /* bit 1: being managed */
        POOL_DISASSOCIATED      = 0x4,  /* bit 2: cpu can't serve workers */
        POOL_BH_DRAINING        = 0x8,  /* bit 3: draining after CPU offline */
};

/*
 * worker flags
 *
 * Each flag is a single bit; the bit positions in use are 1, 2, 3, 6, 7
 * and 8 (they are not contiguous).
 */
enum worker_flags {
        WORKER_DIE              = 0x002,        /* bit 1: die die die */
        WORKER_IDLE             = 0x004,        /* bit 2: is idle */
        WORKER_PREP             = 0x008,        /* bit 3: preparing to run works */
        WORKER_CPU_INTENSIVE    = 0x040,        /* bit 6: cpu intensive */
        WORKER_UNBOUND          = 0x080,        /* bit 7: worker is unbound */
        WORKER_REBOUND          = 0x100,        /* bit 8: worker was rebound */

        /* mask combining PREP, CPU_INTENSIVE, UNBOUND and REBOUND */
        WORKER_NOT_RUNNING      = WORKER_REBOUND | WORKER_UNBOUND |
                                  WORKER_CPU_INTENSIVE | WORKER_PREP,
};

/* Single-bit flags describing a work item cancel operation. */
enum work_cancel_flags {
        WORK_CANCEL_DELAYED     = 0x1,  /* bit 0: canceling a delayed_work */
        WORK_CANCEL_DISABLE     = 0x2,  /* bit 1: canceling to disable */
};

/* Internal tuning constants; time values are expressed in jiffies via HZ. */
enum wq_internal_consts {
        /* # standard pools per cpu */
        NR_STD_WORKER_POOLS     = 2,

        /* hash table orders */
        UNBOUND_POOL_HASH_ORDER = 6,            /* hashed by pool->attrs */
        BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */

        /* idle worker limits */
        MAX_IDLE_WORKERS_RATIO  = 4,            /* 1/4 of busy can be idle */
        IDLE_WORKER_TIMEOUT     = 5 * 60 * HZ,  /* keep idle ones for 5 mins */

        /* call for help after 10ms, but never sooner than two ticks */
        MAYDAY_INITIAL_TIMEOUT  = HZ / 100 < 2 ? 2 : HZ / 100,
        MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
        CREATE_COOLDOWN         = HZ,           /* time to breathe after fail */

        /*
         * Rescue workers are used only on emergencies and shared by
         * all cpus.  Give MIN_NICE.
         */
        RESCUER_NICE_LEVEL      = MIN_NICE,
        HIGHPRI_NICE_LEVEL      = MIN_NICE,

        WQ_NAME_LEN             = 32,
        /* length of "kworker/R-" (10 characters) + WQ_NAME_LEN */
        WORKER_ID_LEN           = WQ_NAME_LEN + 10,
};

/*
 * Limits for BH workers.  We don't want to trap softirq for too long. See
 * MAX_SOFTIRQ_TIME and MAX_SOFTIRQ_RESTART in kernel/softirq.c.
 *
 * BH_WORKER_JIFFIES is 2 milliseconds converted to jiffies and
 * BH_WORKER_RESTARTS is a plain count of 10.  NOTE(review): presumably these
 * are the time and restart budgets of the BH worker loop, mirroring the two
 * softirq limits named above - confirm against the code that uses them.
 *
 * These are macros because msecs_to_jiffies() can't be an initializer; as a
 * consequence BH_WORKER_JIFFIES is expanded at each point of use.
 */
#define BH_WORKER_JIFFIES        msecs_to_jiffies(2)
#define BH_WORKER_RESTARTS        10

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
 *     reads.
 *
 * K: Only modified by worker while holding pool->lock. Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  RCU protected for reads.
 *
 * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
 *     with READ_ONCE() without locking.
 *
 * MD: wq_mayday_lock protected.
 *
 * WD: Used internally by the watchdog.
 */

/* struct worker is defined in workqueue_internal.h */

/*
 * A pool of workers together with the list of work items pending on it.
 * The letter prefix in each field comment refers to the exclusion rules
 * listed above.
 */
struct worker_pool {
        raw_spinlock_t                lock;                /* the pool lock */
        int                        cpu;                /* I: the associated cpu */
        int                        node;                /* I: the associated node ID */
        int                        id;                /* I: pool ID */
        unsigned int                flags;                /* L: flags */

        unsigned long                watchdog_ts;        /* L: watchdog timestamp */
        bool                        cpu_stall;        /* WD: stalled cpu bound pool */

        /*
         * The counter is incremented in a process context on the associated CPU
         * w/ preemption disabled, and decremented or reset in the same context
         * but w/ pool->lock held. The readers grab pool->lock and are
         * guaranteed to see if the counter reached zero.
         */
        int                        nr_running;

        struct list_head        worklist;        /* L: list of pending works */

        int                        nr_workers;        /* L: total number of workers */
        int                        nr_idle;        /* L: currently idle workers */

        struct list_head        idle_list;        /* L: list of idle workers */
        struct timer_list        idle_timer;        /* L: worker idle timeout */
        struct work_struct      idle_cull_work; /* L: worker idle cleanup */

        struct timer_list        mayday_timer;          /* L: SOS timer for workers */

        /* a worker is either on busy_hash or idle_list, or is the manager */
        DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                                                /* L: hash of busy workers */

        struct worker                *manager;        /* L: purely informational */
        struct list_head        workers;        /* A: attached workers */

        struct ida                worker_ida;        /* worker IDs for task name */

        struct workqueue_attrs        *attrs;                /* I: worker attributes */
        struct hlist_node        hash_node;        /* PL: unbound_pool_hash node */
        int                        refcnt;                /* PL: refcnt for unbound pools */
#ifdef CONFIG_PREEMPT_RT
        spinlock_t                cb_lock;        /* BH worker cancel lock */
#endif
        /*
         * Destruction of pool is RCU protected to allow dereferences
         * from get_work_pool().
         */
        struct rcu_head                rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
        PWQ_STAT_STARTED,        /* work items started execution */
        PWQ_STAT_COMPLETED,        /* work items completed execution */
        PWQ_STAT_CPU_TIME,        /* total CPU time consumed */
        PWQ_STAT_CPU_INTENSIVE,        /* wq_cpu_intensive_thresh_us violations */
        PWQ_STAT_CM_WAKEUP,        /* concurrency-management worker wakeups */
        PWQ_STAT_REPATRIATED,        /* unbound workers brought back into scope */
        PWQ_STAT_MAYDAY,        /* maydays to rescuer */
        PWQ_STAT_RESCUED,        /* linked work items executed by rescuer */

        /* number of stats above; sizes pool_workqueue->stats[], keep last */
        PWQ_NR_STATS,
};

/*
 * The per-pool workqueue.  While queued, the bits of work_struct->data
 * below WORK_STRUCT_PWQ_SHIFT are used for flags and the remaining high
 * bits point to the pwq; thus, pwqs need to be aligned to
 * 1 << WORK_STRUCT_PWQ_SHIFT bytes (see the __aligned() attribute on the
 * struct).
 */
struct pool_workqueue {
        struct worker_pool        *pool;                /* I: the associated pool */
        struct workqueue_struct *wq;                /* I: the owning workqueue */
        int                        work_color;        /* L: current color */
        int                        flush_color;        /* L: flushing color */
        int                        refcnt;                /* L: reference count */
        int                        nr_in_flight[WORK_NR_COLORS];
                                                /* L: nr of in_flight works */
        bool                        plugged;        /* L: execution suspended */

        /*
         * nr_active management and WORK_STRUCT_INACTIVE:
         *
         * When pwq->nr_active >= max_active, new work item is queued to
         * pwq->inactive_works instead of pool->worklist and marked with
         * WORK_STRUCT_INACTIVE.
         *
         * All work items marked with WORK_STRUCT_INACTIVE do not participate in
         * nr_active and all work items in pwq->inactive_works are marked with
         * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
         * in pwq->inactive_works. Some of them are ready to run in
         * pool->worklist or worker->scheduled. Those work items are only struct
         * wq_barrier which is used for flush_work() and should not participate
         * in nr_active. For non-barrier work item, it is marked with
         * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
         */
        int                        nr_active;        /* L: nr of active works */
        struct list_head        inactive_works;        /* L: inactive works */
        struct list_head        pending_node;        /* LN: node on wq_node_nr_active->pending_pwqs */
        struct list_head        pwqs_node;        /* WR: node on wq->pwqs */
        struct list_head        mayday_node;        /* MD: node on wq->maydays */

        /* one counter per enum pool_workqueue_stats entry */
        u64                        stats[PWQ_NR_STATS];

        /*
         * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
         * and pwq_release_workfn() for details. pool_workqueue itself is also
         * RCU protected so that the first pwq can be determined without
         * grabbing wq->mutex.
         */
        struct kthread_work        release_work;
        struct rcu_head                rcu;
} __aligned(1 << WORK_STRUCT_PWQ_SHIFT);

/*
 * Structure used to wait for workqueue flush.  A flusher records the flush
 * color it is waiting for and is signalled through @done.
 * NOTE(review): presumably linked via @list onto wq->flusher_queue or
 * wq->flusher_overflow - confirm at the flush call sites.
 */
struct wq_flusher {
        struct list_head        list;                /* WQ: list of flushers */
        int                        flush_color;        /* WQ: flush color waiting for */
        struct completion        done;                /* flush completion */
};

struct wq_device;

/*
 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
 * As sharing a single nr_active across multiple sockets can be very expensive,
 * the counting and enforcement is per NUMA node.
 *
 * The following struct is used to enforce per-node max_active. When a pwq wants
 * to start executing a work item, it should increment ->nr using
 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
 * round-robin order.
 */
struct wq_node_nr_active {
        int                        max;                /* per-node max_active */
        atomic_t                nr;                /* per-node nr_active */
        /* the "wq_node_nr_active->lock" of the LN rule; nests inside pool locks */
        raw_spinlock_t                lock;                /* nests inside pool locks */
        struct list_head        pending_pwqs;        /* LN: pwqs with inactive works */
};

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
        struct list_head        pwqs;                /* WR: all pwqs of this wq */
        struct list_head        list;                /* PR: list of all workqueues */

        struct mutex                mutex;                /* protects this wq */
        int                        work_color;        /* WQ: current work color */
        int                        flush_color;        /* WQ: current flush color */
        atomic_t                nr_pwqs_to_flush; /* flush in progress */
        struct wq_flusher        *first_flusher;        /* WQ: first flusher */
        struct list_head        flusher_queue;        /* WQ: flush waiters */
        struct list_head        flusher_overflow; /* WQ: flush overflow list */

        struct list_head        maydays;        /* MD: pwqs requesting rescue */
        struct worker                *rescuer;        /* MD: rescue worker */

        int                        nr_drainers;        /* WQ: drain in progress */

        /* See alloc_workqueue() function comment for info on min/max_active */
        int                        max_active;        /* WO: max active works */
        int                        min_active;        /* WO: min active works */
        int                        saved_max_active; /* WQ: saved max_active */
        int                        saved_min_active; /* WQ: saved min_active */

        struct workqueue_attrs        *unbound_attrs;        /* PW: only for unbound wqs */
        struct pool_workqueue __rcu *dfl_pwq;   /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
        struct wq_device        *wq_dev;        /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
        /* lockdep bookkeeping; only present with CONFIG_LOCKDEP */
        char                        *lock_name;
        struct lock_class_key        key;
        struct lockdep_map        __lockdep_map;
        struct lockdep_map        *lockdep_map;
#endif
        char                        name[WQ_NAME_LEN]; /* I: workqueue name */

        /*
         * Destruction of workqueue_struct is RCU protected to allow walking
         * the workqueues list without grabbing wq_pool_mutex.
         * This is used to dump all workqueues from sysrq.
         */
        struct rcu_head                rcu;

        /* hot fields used during command issue, aligned to cacheline */
        unsigned int                flags ____cacheline_aligned; /* WQ: WQ_* flags */
        struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */
        /* flexible array member: must remain the last field of the struct */
        struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};

/*
 * Each pod type describes how CPUs should be grouped for unbound workqueues.
 * See the comment above workqueue_attrs->affn_scope.
 */
/*
 * Going by the field comments, @pod_cpus and @pod_node map a pod to its
 * CPUs / node while @cpu_pod maps a CPU to its pod.
 * NOTE(review): array sizes are not visible here - presumably @nr_pods and
 * the number of possible CPUs respectively; confirm at the allocation site.
 */
struct wq_pod_type {
        int                        nr_pods;        /* number of pods */
        cpumask_var_t                *pod_cpus;        /* pod -> cpus */
        int                        *pod_node;        /* pod -> node */
        int                        *cpu_pod;        /* cpu -> pod */
};

/*
 * Unpacked form of the off-queue fields of work->data.  Filled in by
 * work_offqd_unpack(); @disable and @flags are re-packed by
 * work_offqd_pack_flags().
 */
struct work_offq_data {
        u32                        pool_id;        /* WORK_OFFQ_POOL_BITS wide field at WORK_OFFQ_POOL_SHIFT */
        u32                        disable;        /* WORK_OFFQ_DISABLE_BITS wide field at WORK_OFFQ_DISABLE_SHIFT */
        u32                        flags;                /* data & WORK_OFFQ_FLAG_MASK */
};

/* Name string for each WQ_AFFN_* affinity scope, indexed by the enum value. */
static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
        [WQ_AFFN_DFL]                = "default",
        [WQ_AFFN_CPU]                = "cpu",
        [WQ_AFFN_SMT]                = "smt",
        [WQ_AFFN_CACHE]                = "cache",
        [WQ_AFFN_NUMA]                = "numa",
        [WQ_AFFN_SYSTEM]        = "system",
};

/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
 * The actual value is initialized in wq_cpu_intensive_thresh_init().
 */
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
/* exposed as parameter "cpu_intensive_thresh_us", mode 0644 */
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
/* exposed as parameter "cpu_intensive_warning_thresh", mode 0644; defaults to 4 */
static unsigned int wq_cpu_intensive_warning_thresh = 4;
module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
#endif

/* see the comment above the definition of WQ_POWER_EFFICIENT */
/* default follows CONFIG_WQ_POWER_EFFICIENT_DEFAULT; parameter mode is 0444 */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;                        /* can kworkers be created yet? */
/* NOTE(review): presumably set once topology-dependent setup has run - confirm */
static bool wq_topo_initialized __read_mostly = false;

/* NOTE(review): presumably the slab cache pool_workqueues come from - confirm */
static struct kmem_cache *pwq_cache;

/* sized WQ_AFFN_NR_TYPES: one wq_pod_type slot per affinity scope type */
static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
/* default affinity scope; initialized to WQ_AFFN_CACHE */
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;

/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);        /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);        /* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);                /* PR: list of all workqueues */
static bool workqueue_freezing;                /* PL: have wqs started freezing? */

/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
static cpumask_var_t wq_online_cpumask;

/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* PL: user requested unbound cpumask via sysfs */
static cpumask_var_t wq_requested_unbound_cpumask;

/* PL: isolated cpumask to be excluded from unbound cpumask */
static cpumask_var_t wq_isolated_cpumask;

/* to further constrain wq_unbound_cpumask by cmdline parameter */
static struct cpumask wq_cmdline_cpumask __initdata;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

/*
 * The three per-cpu arrays below each hold NR_STD_WORKER_POOLS entries, i.e.
 * one per standard pool of that CPU.
 */

/* to raise softirq for the BH worker pools on other CPUs */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works);

/* the BH worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools);

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

/* maps pool->id to the pool; see worker_pool_assign_id() and get_work_pool() */
static DEFINE_IDR(worker_pool_idr);        /* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
 * process context while holding a pool lock. Bounce to a dedicated kthread
 * worker to avoid A-A deadlocks.
 */
static struct kthread_worker *pwq_release_worker __ro_after_init;

/*
 * Globally visible workqueue pointers.  Only system_wq and system_percpu_wq
 * use EXPORT_SYMBOL(); the rest are exported with EXPORT_SYMBOL_GPL().
 *
 * NOTE(review): unlike the pointers above them, system_bh_wq and
 * system_bh_highpri_wq are not marked __ro_after_init - confirm whether that
 * is intentional.
 */
struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_percpu_wq __ro_after_init;
EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_dfl_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_dfl_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
struct workqueue_struct *system_bh_wq;
EXPORT_SYMBOL_GPL(system_bh_wq);
struct workqueue_struct *system_bh_highpri_wq;
EXPORT_SYMBOL_GPL(system_bh_highpri_wq);

static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
static void show_one_worker_pool(struct worker_pool *pool);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

/*
 * Lockdep check: warn unless the caller is inside some RCU read-side
 * critical section or holds wq_pool_mutex.
 */
#define assert_rcu_or_pool_mutex()                                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&                        \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU or wq_pool_mutex should be held")

/*
 * Lockdep check: warn unless the caller is inside some RCU read-side
 * critical section or holds either @wq->mutex or wq_pool_mutex.
 *
 * @wq is parenthesized so that any pointer-valued expression can be passed
 * without operator-precedence surprises.
 */
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&                        \
                         !lockdep_is_held(&(wq)->mutex) &&                \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU, wq->mutex or wq_pool_mutex should be held")

/*
 * Iterate @pool over the NR_STD_WORKER_POOLS entries of @cpu's
 * bh_worker_pools array, in index order.
 */
#define for_each_bh_worker_pool(pool, cpu)                                \
        for ((pool) = &per_cpu(bh_worker_pools, cpu)[0];                \
             (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
             (pool)++)

/*
 * Iterate @pool over the NR_STD_WORKER_POOLS entries of @cpu's
 * cpu_worker_pools array, in index order.
 */
#define for_each_cpu_worker_pool(pool, cpu)                                \
        for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];                \
             (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
             (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * Walks every entry of worker_pool_idr.
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored: its condition always evaluates to false, so the loop body
 * supplied by the caller becomes the else branch.
 */
#define for_each_pool(pool, pi)                                                \
        idr_for_each_entry(&worker_pool_idr, pool, pi)                        \
                if (({ assert_rcu_or_pool_mutex(); false; })) { }        \
                else

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * Walks @pool->workers through each worker's ->node list entry.
 *
 * This must be called with wq_pool_attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored: its condition always evaluates to false, so the loop body
 * supplied by the caller becomes the else branch.
 */
#define for_each_pool_worker(worker, pool)                                \
        list_for_each_entry((worker), &(pool)->workers, node)                \
                if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
                else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * Walks @wq->pwqs through each pwq's ->pwqs_node list entry.
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The locking requirement is passed to list_for_each_entry_rcu() as its
 * lockdep condition.  @wq is parenthesized at every use so that any
 * pointer-valued expression can be passed safely.
 */
#define for_each_pwq(pwq, wq)                                                \
        list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,                \
                                 lockdep_is_held(&(wq)->mutex))

#ifdef CONFIG_DEBUG_OBJECTS_WORK

static const struct debug_obj_descr work_debug_descr;

/* debug_hint callback: report the work item's function pointer. */
static void *work_debug_hint(void *addr)
{
        struct work_struct *work = addr;

        return work->func;
}

/* is_static_object callback: true iff WORK_STRUCT_STATIC_BIT is set in the work's data. */
static bool work_is_static_object(void *addr)
{
        struct work_struct *w = addr;
        unsigned long *bits = work_data_bits(w);

        return test_bit(WORK_STRUCT_STATIC_BIT, bits);
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 *
 * For an active object the work is cancelled synchronously and then
 * re-initialized; any other state is left untouched.
 *
 * Returns true if a fixup was performed, false otherwise.
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
        struct work_struct *work = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        cancel_work_sync(work);
        debug_object_init(work, &work_debug_descr);
        return true;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 *
 * For an active object the work is cancelled synchronously and then
 * released from debug-object tracking; any other state is left untouched.
 *
 * Returns true if a fixup was performed, false otherwise.
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
        struct work_struct *work = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        cancel_work_sync(work);
        debug_object_free(work, &work_debug_descr);
        return true;
}

/*
 * Debug-object descriptor for work_struct; passed to every debug_object_*()
 * call in this file and wires up the callbacks defined above.
 */
static const struct debug_obj_descr work_debug_descr = {
        .name                = "work_struct",
        .debug_hint        = work_debug_hint,
        .is_static_object = work_is_static_object,
        .fixup_init        = work_fixup_init,
        .fixup_free        = work_fixup_free,
};

/* Forward @work to debug_object_activate() using work_debug_descr. */
static inline void debug_work_activate(struct work_struct *work)
{
        debug_object_activate(work, &work_debug_descr);
}

/* Forward @work to debug_object_deactivate() using work_debug_descr. */
static inline void debug_work_deactivate(struct work_struct *work)
{
        debug_object_deactivate(work, &work_debug_descr);
}

/*
 * Register @work with the debug-objects core.  A non-zero @onstack selects
 * the on-stack variant of the init call.
 */
void __init_work(struct work_struct *work, int onstack)
{
        if (!onstack) {
                debug_object_init(work, &work_debug_descr);
                return;
        }

        debug_object_init_on_stack(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

/* Drop @work from debug-object tracking via debug_object_free(). */
void destroy_work_on_stack(struct work_struct *work)
{
        debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

/*
 * Tear down debug tracking for a delayed_work: first its embedded timer
 * (timer_destroy_on_stack()), then the embedded work_struct.
 */
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
        timer_destroy_on_stack(&work->timer);
        debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
/* !CONFIG_DEBUG_OBJECTS_WORK: the activation hooks compile to nothing */
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Must be called with wq_pool_mutex held.  The ID is allocated from
 * worker_pool_idr with GFP_KERNEL; @pool->id is written only on success.
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
        int id;

        lockdep_assert_held(&wq_pool_mutex);

        id = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
                       GFP_KERNEL);
        if (id < 0)
                return id;

        pool->id = id;
        return 0;
}

/*
 * Return the address of the pwq pointer slot for @cpu in @wq: the per-cpu
 * cpu_pwq entry for @cpu >= 0, or &wq->dfl_pwq for a negative @cpu.
 */
static struct pool_workqueue __rcu **
unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
{
        if (cpu < 0)
                return &wq->dfl_pwq;

        return per_cpu_ptr(wq->cpu_pwq, cpu);
}

/* @cpu < 0 for dfl_pwq */
static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
{
        return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
                                     lockdep_is_held(&wq_pool_mutex) ||
                                     lockdep_is_held(&wq->mutex));
}

/**
 * unbound_effective_cpumask - effective cpumask of an unbound workqueue
 * @wq: workqueue of interest
 *
 * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
 * is masked with wq_unbound_cpumask to determine the effective cpumask. The
 * default pwq is always mapped to the pool with the current effective cpumask.
 */
static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
{
        return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
}

/*
 * Place @color at bit position WORK_STRUCT_COLOR_SHIFT; get_work_color()
 * performs the reverse extraction.
 */
static unsigned int work_color_to_flags(int color)
{
        return color << WORK_STRUCT_COLOR_SHIFT;
}

/* Extract the WORK_STRUCT_COLOR_BITS wide color field from @work_data. */
static int get_work_color(unsigned long work_data)
{
        const unsigned long color_mask = (1 << WORK_STRUCT_COLOR_BITS) - 1;

        return (work_data >> WORK_STRUCT_COLOR_SHIFT) & color_mask;
}

/* Advance @color by one, wrapping modulo WORK_NR_COLORS. */
static int work_next_color(int color)
{
        int next = color + 1;

        return next % WORK_NR_COLORS;
}

/* OFFQ flag bits implied by @pool: WORK_OFFQ_BH for a POOL_BH pool, else none. */
static unsigned long pool_offq_flags(struct worker_pool *pool)
{
        if (pool->flags & POOL_BH)
                return WORK_OFFQ_BH;

        return 0;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
 * contain the pointer to the queued pwq.  Once execution starts, the flag
 * is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling()
 * can be used to set the pwq, pool or clear work->data. These functions should
 * only be called while the work is owned - ie. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 */
/*
 * Overwrite work->data with @data, OR-ing in work_static(@work).  Warns once
 * if @work is not pending at the time of the call.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data)
{
        WARN_ON_ONCE(!work_pending(work));

        data |= work_static(work);
        atomic_long_set(&work->data, data);
}

/*
 * Point @work at @pwq: the pwq address is combined with
 * WORK_STRUCT_PENDING, WORK_STRUCT_PWQ and the caller's @flags.
 */
static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
                         unsigned long flags)
{
        unsigned long data = (unsigned long)pwq;

        data |= WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | flags;
        set_work_data(work, data);
}

/*
 * Record @pool_id (shifted to WORK_OFFQ_POOL_SHIFT) and @flags in @work's
 * data while keeping WORK_STRUCT_PENDING set.
 */
static void set_work_pool_and_keep_pending(struct work_struct *work,
                                           int pool_id, unsigned long flags)
{
        unsigned long data = (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT;

        data |= WORK_STRUCT_PENDING | flags;
        set_work_data(work, data);
}

/*
 * Record @pool_id (shifted to WORK_OFFQ_POOL_SHIFT) and @flags in @work's
 * data.  WORK_STRUCT_PENDING is not OR'd in here, so the store clears it
 * unless @flags carries it.  The store is bracketed by smp_wmb() before and
 * smp_mb() after; see the comments on each barrier.
 */
static void set_work_pool_and_clear_pending(struct work_struct *work,
                                            int pool_id, unsigned long flags)
{
        /*
         * The following wmb is paired with the implied mb in
         * test_and_set_bit(PENDING) and ensures all updates to @work made
         * here are visible to and precede any updates by the next PENDING
         * owner.
         */
        smp_wmb();
        set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
                      flags);
        /*
         * The following mb guarantees that previous clear of a PENDING bit
         * will not be reordered with any speculative LOADS or STORES from
         * work->current_func, which is executed afterwards.  This possible
         * reordering can lead to a missed execution on attempt to queue
         * the same @work.  E.g. consider this case:
         *
         *   CPU#0                         CPU#1
         *   ----------------------------  --------------------------------
         *
         * 1  STORE event_indicated
         * 2  queue_work_on() {
         * 3    test_and_set_bit(PENDING)
         * 4 }                             set_..._and_clear_pending() {
         * 5                                 set_work_data() # clear bit
         * 6                                 smp_mb()
         * 7                               work->current_func() {
         * 8                                      LOAD event_indicated
         *                                   }
         *
         * Without an explicit full barrier speculative LOAD on line 8 can
         * be executed before CPU#0 does STORE on line 1.  If that happens,
         * CPU#0 observes the PENDING bit is still set and new execution of
         * a @work is not queued in a hope, that CPU#1 will eventually
         * finish the queued @work.  Meanwhile CPU#1 does not see
         * event_indicated is set, because speculative LOAD was executed
         * before actual STORE.
         */
        smp_mb();
}

/*
 * Mask @data with WORK_STRUCT_PWQ_MASK and return it as a pwq pointer.
 * WORK_STRUCT_PWQ is not checked here; get_work_pwq() and get_work_pool()
 * test that bit before calling.
 */
static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
{
        return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK);
}

/*
 * Return the pwq encoded in @work's data, or NULL when WORK_STRUCT_PWQ is
 * not set.
 */
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);

        if (!(data & WORK_STRUCT_PWQ))
                return NULL;

        return work_struct_pwq(data);
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and allows read
 * access under RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or inside of a rcu_read_lock() region.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);
        int pool_id;

        assert_rcu_or_pool_mutex();

        if (data & WORK_STRUCT_PWQ)
                return work_struct_pwq(data)->pool;

        pool_id = data >> WORK_OFFQ_POOL_SHIFT;
        if (pool_id == WORK_OFFQ_POOL_NONE)
                return NULL;

        return idr_find(&worker_pool_idr, pool_id);
}

/*
 * Return the @bits wide field of @v that starts at bit @shift.  The mask is
 * built with unsigned int arithmetic, so @bits must be below 32.
 */
static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
{
        unsigned long mask = (1U << bits) - 1;

        return (v >> shift) & mask;
}

static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
{
        WARN_ON_ONCE(data & WORK_STRUCT_PWQ);

        offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT,
                                        WORK_OFFQ_POOL_BITS);
        offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT,
                                        WORK_OFFQ_DISABLE_BITS);
        offqd->flags = data & WORK_OFFQ_FLAG_MASK;
}

/*
 * Re-pack @offqd->disable (at WORK_OFFQ_DISABLE_SHIFT) and @offqd->flags into
 * a single word.  @offqd->pool_id is not included.
 */
static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
{
        unsigned long packed = offqd->flags;

        packed |= (unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT;
        return packed;
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
        return !list_empty(&pool->worklist) && !pool->nr_running;
}

/*
 * May the caller start working?  Called from busy but !running workers.
 * %true as long as the pool has at least one idle worker.
 */
static bool may_start_working(struct worker_pool *pool)
{
        return pool->nr_idle != 0;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
        return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
        return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
        bool managing = pool->flags & POOL_MANAGER_ACTIVE;
        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
        int nr_busy = pool->nr_workers - nr_idle;

        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);

        /* If transitioning into NOT_RUNNING, adjust nr_running. */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
                pool->nr_running--;
        }

        worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags from @worker->flags. If that takes @worker out of
 * NOT_RUNNING, the pool's nr_running is incremented. Must be called with
 * the pool lock held.
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;
        unsigned int before = worker->flags;

        lockdep_assert_held(&pool->lock);

        worker->flags &= ~flags;

        /* only clearing NOT_RUNNING bits can change the running state */
        if (!(flags & WORKER_NOT_RUNNING))
                return;

        /*
         * WORKER_NOT_RUNNING is a mask of multiple flags, not a single
         * flag. @worker starts counting as running only if it was
         * NOT_RUNNING before and none of those flags remains set now.
         */
        if ((before & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING))
                pool->nr_running++;
}

/*
 * Return the worker at the head of @pool->idle_list, %NULL if the list is
 * empty.  Called with pool->lock held.
 */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
        if (likely(!list_empty(&pool->idle_list)))
                return list_first_entry(&pool->idle_list, struct worker,
                                        entry);

        return NULL;
}

/**
 * worker_enter_idle - enter idle state
 * @worker: worker which is entering idle state
 *
 * Mark @worker idle, put it at the head of its pool's idle_list and update
 * the idle statistics. The pool's idle timer is armed if the pool now has
 * too many workers and the timer isn't pending already.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_enter_idle(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* entering idle while already idle is a bug */
        if (WARN_ON_ONCE(worker->flags & WORKER_IDLE))
                return;

        /* so is being on a list while still linked through ->hentry */
        if (WARN_ON_ONCE(!list_empty(&worker->entry) &&
                         (worker->hentry.next || worker->hentry.pprev)))
                return;

        /*
         * WORKER_IDLE is set by hand. worker_set_flags() can't be used as
         * this function is also called from create_worker().
         */
        worker->flags |= WORKER_IDLE;
        worker->last_active = jiffies;
        pool->nr_idle++;

        /* idle_list is LIFO, the newest idle worker goes to the front */
        list_add(&worker->entry, &pool->idle_list);

        if (too_many_workers(pool)) {
                if (!timer_pending(&pool->idle_timer))
                        mod_timer(&pool->idle_timer,
                                  jiffies + IDLE_WORKER_TIMEOUT);
        }

        /* Sanity check nr_running. */
        WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
}

/**
 * worker_leave_idle - leave idle state
 * @worker: worker which is leaving idle state
 *
 * Clear WORKER_IDLE on @worker, take it off the idle list and update the
 * pool's idle count. Warns once and does nothing if @worker isn't idle.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_leave_idle(struct worker *worker)
{
        if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
                return;

        /* worker_clr_flags() also takes care of nr_running */
        worker_clr_flags(worker, WORKER_IDLE);
        list_del_init(&worker->entry);
        worker->pool->nr_idle--;
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @pool: pool of interest
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @pool by searching
 * @pool->busy_hash which is keyed by the address of @work.  For a worker
 * to match, its current execution should match the address of @work and
 * its work function.  This is to avoid unwanted dependency between
 * unrelated work executions through a work item being recycled while still
 * being executed.
 *
 * This is a bit tricky.  A work item may be freed once its execution
 * starts and nothing prevents the freed area from being recycled for
 * another work item.  If the same work item address ends up being reused
 * before the original execution finishes, workqueue will identify the
 * recycled work item as currently executing and make it wait until the
 * current execution finishes, introducing an unwanted dependency.
 *
 * This function checks the work item address and work function to avoid
 * false positives.  Note that this isn't complete as one may construct a
 * work function which can introduce dependency onto itself through a
 * recycled work item.  Well, if somebody wants to shoot oneself in the
 * foot that badly, there's only so much we can do, and if such deadlock
 * actually occurs, it should be easy to locate the culprit work function.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 *
 * Return:
 * Pointer to worker which is executing @work if found, %NULL
 * otherwise.
 */
static struct worker *find_worker_executing_work(struct worker_pool *pool,
                                                 struct work_struct *work)
{
        struct worker *worker;

        hash_for_each_possible(pool->busy_hash, worker, hentry,
                               (unsigned long)work)
                if (worker->current_work == work &&
                    worker->current_func == work->func)
                        return worker;

        return NULL;
}

/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head. Work series to be
 * scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
 * @nextp.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
                              struct work_struct **nextp)
{
        /* lookahead cursor of the safe iteration below, handed out via @nextp */
        struct work_struct *n;

        /*
         * Linked worklist will always end before the end of the list,
         * use NULL for list head.
         */
        list_for_each_entry_safe_from(work, n, NULL, entry) {
                /* append the current work to the tail of @head */
                list_move_tail(&work->entry, head);
                /* a work without WORK_STRUCT_LINKED is the last of the series */
                if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
                        break;
        }

        /*
         * If we're already inside safe list traversal and have moved
         * multiple works to the scheduled queue, the next position
         * needs to be updated.
         */
        if (nextp)
                *nextp = n;
}

/**
 * assign_work - assign a work item and its linked work items to a worker
 * @work: work to assign
 * @worker: worker to assign to
 * @nextp: out parameter for nested worklist walking
 *
 * Assign @work and its linked work items to @worker. If @work is already being
 * executed by another worker in the same pool, it'll be punted there.
 *
 * If @nextp is not NULL, it's updated to point to the next work of the last
 * scheduled work. This allows assign_work() to be nested inside
 * list_for_each_entry_safe().
 *
 * Returns %true if @work was successfully assigned to @worker. %false if @work
 * was punted to another worker already executing it.
 */
static bool assign_work(struct work_struct *work, struct worker *worker,
                        struct work_struct **nextp)
{
        struct worker_pool *pool = worker->pool;
        struct worker *target;
        bool punted;

        lockdep_assert_held(&pool->lock);

        /*
         * A work item must not run concurrently on several workers.
         * __queue_work() keeps @work from moving to another pool while it's
         * still running in the previous one; within a pool it's up to us.
         * If some worker of @pool is already executing @work, the work and
         * its linked items are handed to that worker instead of @worker.
         */
        target = find_worker_executing_work(pool, work);
        punted = unlikely(target);
        if (!punted)
                target = worker;

        move_linked_works(work, &target->scheduled, nextp);

        return !punted;
}

/*
 * Return the irq_work of @pool: slot 1 of the bh_pool_irq_works array of
 * @pool->cpu if @pool's nice level is HIGHPRI_NICE_LEVEL, slot 0 otherwise.
 */
static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
{
        int idx = 0;

        if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                idx = 1;

        return &per_cpu(bh_pool_irq_works, pool->cpu)[idx];
}

static void kick_bh_pool(struct worker_pool *pool)
{
#ifdef CONFIG_SMP
        /* see drain_dead_softirq_workfn() for BH_DRAINING */
        if (unlikely(pool->cpu != smp_processor_id() &&
                     !(pool->flags & POOL_BH_DRAINING))) {
                irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
                return;
        }
#endif
        if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                raise_softirq_irqoff(HI_SOFTIRQ);
        else
                raise_softirq_irqoff(TASKLET_SOFTIRQ);
}

/**
 * kick_pool - wake up an idle worker if necessary
 * @pool: pool to kick
 *
 * @pool may have pending work items. Wake up worker if necessary. Returns
 * whether a worker was woken up.
 */
static bool kick_pool(struct worker_pool *pool)
{
        struct worker *idle = first_idle_worker(pool);
        struct task_struct *task;

        lockdep_assert_held(&pool->lock);

        /* an idle worker is woken only if there is one and help is needed */
        if (!idle || !need_more_worker(pool))
                return false;

        /* BH pools are kicked through kick_bh_pool(), no task is woken */
        if (pool->flags & POOL_BH) {
                kick_bh_pool(pool);
                return true;
        }

        task = idle->task;

#ifdef CONFIG_SMP
        /*
         * The idle worker is about to execute work and waking it up is a
         * cheap opportunity to migrate it by setting the task's wake_cpu
         * field. See whether moving it would improve execution locality.
         *
         * The worker picked is the one which went idle most recently and it
         * may be marked idle without having gone off CPU yet, in which case
         * setting wake_cpu does nothing. This is a best-effort optimization
         * and the race window is narrow, so it's left as-is for now. Should
         * it become pronounced, workers still on CPU could be skipped when
         * picking an idle worker.
         *
         * With non-strict affinity, the worker may have ended up outside
         * the pool's affinity scope. Repatriate it.
         */
        if (!pool->attrs->affn_strict &&
            !cpumask_test_cpu(task->wake_cpu, pool->attrs->__pod_cpumask)) {
                struct work_struct *first = list_first_entry(&pool->worklist,
                                                struct work_struct, entry);
                int cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
                                                     cpu_online_mask);

                /* repatriate only if the pod has an online CPU to offer */
                if (cpu < nr_cpu_ids) {
                        task->wake_cpu = cpu;
                        get_work_pwq(first)->stats[PWQ_STAT_REPATRIATED]++;
                }
        }
#endif
        wake_up_process(task);
        return true;
}

#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

/*
 * Concurrency-managed per-cpu work items that hog CPU for longer than
 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
 * which prevents them from stalling other concurrency-managed work items. If a
 * work function keeps triggering this mechanism, it's likely that the work item
 * should be using an unbound workqueue instead.
 *
 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 * and reports them so that they can be examined and converted to use unbound
 * workqueues as appropriate. To avoid flooding the console, each violating work
 * function is tracked and reported with exponential backoff.
 */
/* Capacity of wci_ents[]; wci_hash is sized from it as well. */
#define WCI_MAX_ENTS 128

/* Record for one work function reported to wq_cpu_intensive_report(). */
struct wci_ent {
        /* the reported work function, also the hash key */
        work_func_t                func;
        /* incremented each time @func is reported */
        atomic64_t                cnt;
        /* link in wci_hash */
        struct hlist_node        hash_node;
};

/* Backing storage; entries are handed out in index order. */
static struct wci_ent wci_ents[WCI_MAX_ENTS];
/* Number of wci_ents[] entries handed out so far. */
static int wci_nr_ents;
/* Held while a new entry is allocated and added to wci_hash. */
static DEFINE_RAW_SPINLOCK(wci_lock);
/* Work function -> wci_ent lookup table. */
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

/* Look up the wci_ent tracking @func in wci_hash, %NULL if there is none. */
static struct wci_ent *wci_find_ent(work_func_t func)
{
        struct wci_ent *cur;

        hash_for_each_possible_rcu(wci_hash, cur, hash_node,
                                   (unsigned long)func) {
                /* other functions may share the bucket */
                if (cur->func != func)
                        continue;

                return cur;
        }

        return NULL;
}

/*
 * Record one more CPU-hogging event for @func and, subject to
 * wq_cpu_intensive_warning_thresh and exponential backoff, print a warning.
 *
 * The lookup of an existing entry is done without wci_lock. Only the
 * allocation of an entry for a previously unseen @func takes the lock, after
 * which the lookup is restarted so that the new entry is accounted like any
 * other. If wci_ents[] is full, an unseen @func is silently ignored.
 */
static void wq_cpu_intensive_report(work_func_t func)
{
        struct wci_ent *ent;

restart:
        ent = wci_find_ent(func);
        if (ent) {
                u64 cnt;

                /*
                 * Start reporting from the warning_thresh and back off
                 * exponentially.
                 */
                cnt = atomic64_inc_return_relaxed(&ent->cnt);
                if (wq_cpu_intensive_warning_thresh &&
                    cnt >= wq_cpu_intensive_warning_thresh &&
                    is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh)) {
                        /*
                         * Print @cnt, the value the checks above were made
                         * on, rather than re-reading the counter, which may
                         * have been incremented again in the meantime.
                         */
                        printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
                                        ent->func, wq_cpu_intensive_thresh_us,
                                        cnt);
                }
                return;
        }

        /*
         * @func is a new violation. Allocate a new entry for it. If wci_ents[]
         * is exhausted, something went really wrong and we probably made enough
         * noise already.
         */
        if (wci_nr_ents >= WCI_MAX_ENTS)
                return;

        raw_spin_lock(&wci_lock);

        /* the check above was made without the lock, repeat it under it */
        if (wci_nr_ents >= WCI_MAX_ENTS) {
                raw_spin_unlock(&wci_lock);
                return;
        }

        /* an entry for @func may have been added while we weren't locked */
        if (wci_find_ent(func)) {
                raw_spin_unlock(&wci_lock);
                goto restart;
        }

        ent = &wci_ents[wci_nr_ents++];
        ent->func = func;
        atomic64_set(&ent->cnt, 0);
        hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);

        raw_spin_unlock(&wci_lock);

        /* account this event through the regular path above */
        goto restart;
}

#else        /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
/* No-op stand-in used when CONFIG_WQ_CPU_INTENSIVE_REPORT is not set. */
static void wq_cpu_intensive_report(work_func_t func) {}
#endif        /* CONFIG_WQ_CPU_INTENSIVE_REPORT */

/**
 * wq_worker_running - a worker is running again
 * @task: task waking up
 *
 * This function is called when a worker returns from schedule()
 */
void wq_worker_running(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        /*
         * ->sleeping is set by wq_worker_sleeping(). If it isn't set there
         * is nothing to undo here.
         */
        if (!READ_ONCE(worker->sleeping))
                return;

        /*
         * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
         * and the nr_running increment below, we may ruin the nr_running reset
         * and leave with an unexpected pool->nr_running == 1 on the newly unbound
         * pool. Protect against such race.
         */
        preempt_disable();
        if (!(worker->flags & WORKER_NOT_RUNNING))
                worker->pool->nr_running++;
        preempt_enable();

        /*
         * CPU intensive auto-detection cares about how long a work item hogged
         * CPU without sleeping. Reset the starting timestamp on wakeup.
         */
        worker->current_at = worker->task->se.sum_exec_runtime;

        /* cleared only after nr_running and current_at have been updated */
        WRITE_ONCE(worker->sleeping, 0);
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called from schedule() when a busy worker is
 * going to sleep.
 */
void wq_worker_sleeping(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);
        struct worker_pool *pool;

        /*
         * Rescuers, which may not have all the fields set up like normal
         * workers, also reach here, let's not access anything before
         * checking NOT_RUNNING.
         */
        if (worker->flags & WORKER_NOT_RUNNING)
                return;

        pool = worker->pool;

        /* Return if preempted before wq_worker_running() was reached */
        if (READ_ONCE(worker->sleeping))
                return;

        /*
         * Set before the lock is taken. wq_worker_running() and
         * wq_worker_tick() both test this flag.
         */
        WRITE_ONCE(worker->sleeping, 1);
        raw_spin_lock_irq(&pool->lock);

        /*
         * Recheck in case unbind_workers() preempted us. We don't
         * want to decrement nr_running after the worker is unbound
         * and nr_running has been reset.
         */
        if (worker->flags & WORKER_NOT_RUNNING) {
                raw_spin_unlock_irq(&pool->lock);
                return;
        }

        /* @worker no longer counts as running */
        pool->nr_running--;
        /* if that got an idle worker woken up, count it on the current pwq */
        if (kick_pool(pool))
                worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from sched_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 */
void wq_worker_tick(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);
        struct pool_workqueue *pwq = worker->current_pwq;
        struct worker_pool *pool = worker->pool;

        /* no current pwq, nothing to charge the tick to */
        if (!pwq)
                return;

        /* account one tick of CPU time to the current pwq */
        pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

        /* a zero threshold turns the CPU_INTENSIVE auto-detection off */
        if (!wq_cpu_intensive_thresh_us)
                return;

        /*
         * If the current worker is concurrency managed and hogged the CPU for
         * longer than wq_cpu_intensive_thresh_us, it's automatically marked
         * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
         *
         * Set @worker->sleeping means that @worker is in the process of
         * switching out voluntarily and won't be contributing to
         * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
         * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
         * double decrements. The task is releasing the CPU anyway. Let's skip.
         * We probably want to make this prettier in the future.
         */
        if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
            worker->task->se.sum_exec_runtime - worker->current_at <
            wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
                return;

        raw_spin_lock(&pool->lock);

        /* worker_set_flags() adjusts nr_running for the new flag */
        worker_set_flags(worker, WORKER_CPU_INTENSIVE);
        wq_cpu_intensive_report(worker->current_func);
        pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

        /* let another worker take over the pending work, if there is any */
        if (kick_pool(pool))
                pwq->stats[PWQ_STAT_CM_WAKEUP]++;

        raw_spin_unlock(&pool->lock);
}

/**
 * wq_worker_last_func - retrieve worker's last work function
 * @task: Task to retrieve last work function of.
 *
 * Determine the last function a worker executed. This is called from
 * the scheduler to get a worker's last known identity.
 *
 * CONTEXT:
 * raw_spin_lock_irq(rq->lock)
 *
 * This function is called during schedule() when a kworker is going
 * to sleep. It's used by psi to identify aggregation workers during
 * dequeuing, to allow periodic aggregation to shut-off when that
 * worker is the last task in the system or cgroup to go to sleep.
 *
 * As this function doesn't involve any workqueue-related locking, it
 * only returns stable values when called from inside the scheduler's
 * queuing and dequeuing paths, when @task, which must be a kworker,
 * is guaranteed to not be processing any works.
 *
 * Return:
 * The last work function %current executed as a worker, NULL if it
 * hasn't executed any work yet.
 */
work_func_t wq_worker_last_func(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        return worker->last_func;
}

/**
 * wq_node_nr_active - Determine wq_node_nr_active to use
 * @wq: workqueue of interest
 * @node: NUMA node, can be %NUMA_NO_NODE
 *
 * Determine wq_node_nr_active to use for @wq on @node. Returns:
 *
 * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
 *
 * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
 *
 * - Otherwise, node_nr_active[@node].
 */
static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
                                                   int node)
{
        int idx = node;

        /* only unbound workqueues use the shared per-node nr_active */
        if (!(wq->flags & WQ_UNBOUND))
                return NULL;

        /* %NUMA_NO_NODE is served by the extra slot at index nr_node_ids */
        if (idx == NUMA_NO_NODE)
                idx = nr_node_ids;

        return wq->node_nr_active[idx];
}

/**
 * wq_update_node_max_active - Update per-node max_actives to use
 * @wq: workqueue to update
 * @off_cpu: CPU that's going down, -1 if a CPU is not going down
 *
 * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
 * distributed among nodes according to the proportions of numbers of online
 * cpus. The result is always between @wq->min_active and max_active.
 */
static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
{
        struct cpumask *effective = unbound_effective_cpumask(wq);
        int min_active = READ_ONCE(wq->min_active);
        int max_active = READ_ONCE(wq->max_active);
        int total_cpus, node;

        lockdep_assert_held(&wq->mutex);

        if (!wq_topo_initialized)
                return;

        /* an outgoing CPU outside of the effective cpumask is irrelevant */
        if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
                off_cpu = -1;

        /* online CPUs of the wq, not counting the one which is going down */
        total_cpus = cpumask_weight_and(effective, cpu_online_mask);
        if (off_cpu >= 0)
                total_cpus--;

        for_each_node(node) {
                /* default used when all CPUs of the wq are offline */
                int node_max = min_active;

                if (likely(total_cpus)) {
                        int node_cpus;

                        node_cpus = cpumask_weight_and(effective,
                                                       cpumask_of_node(node));
                        if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
                                node_cpus--;

                        /* proportional share, kept within [min, max] */
                        node_max = clamp(DIV_ROUND_UP(max_active * node_cpus,
                                                      total_cpus),
                                         min_active, max_active);
                }

                wq_node_nr_active(wq, node)->max = node_max;
        }

        /* the %NUMA_NO_NODE slot always gets the full max_active */
        wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
}

/**
 * get_pwq - get an extra reference on the specified pool_workqueue
 * @pwq: pool_workqueue to get
 *
 * Bump @pwq's refcnt by one. The caller must hold the matching pool->lock
 * and must guarantee that the refcnt is positive already; a one-time warning
 * is emitted if it isn't.
 */
static void get_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        /* references may only be added to a pwq which is still referenced */
        WARN_ON_ONCE(pwq->refcnt <= 0);

        pwq->refcnt += 1;
}

/**
 * put_pwq - put a pool_workqueue reference
 * @pwq: pool_workqueue to put
 *
 * Drop one reference of @pwq. When the refcnt hits zero, the release of
 * @pwq is scheduled. The caller should be holding the matching pool->lock.
 */
static void put_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        pwq->refcnt--;

        /*
         * The last reference is gone. @pwq can't be released under
         * pool->lock, bounce to a dedicated kthread_worker to avoid A-A
         * deadlocks.
         */
        if (unlikely(!pwq->refcnt))
                kthread_queue_work(pwq_release_worker, &pwq->release_work);
}

/**
 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
 * @pwq: pool_workqueue to put (can be %NULL)
 *
 * put_pwq() with locking.  This function also allows %NULL @pwq.
 */
static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
        if (pwq) {
                /*
                 * As both pwqs and pools are RCU protected, the
                 * following lock operations are safe.
                 */
                raw_spin_lock_irq(&pwq->pool->lock);
                put_pwq(pwq);
                raw_spin_unlock_irq(&pwq->pool->lock);
        }
}

static bool pwq_is_empty(struct pool_workqueue *pwq)
{
        return !pwq->nr_active && list_empty(&pwq->inactive_works);
}

static void __pwq_activate_work(struct pool_workqueue *pwq,
                                struct work_struct *work)
{
        unsigned long *wdb = work_data_bits(work);

        WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
        trace_workqueue_activate_work(work);
        if (list_empty(&pwq->pool->worklist))
                pwq->pool->watchdog_ts = jiffies;
        move_linked_works(work, &pwq->pool->worklist, NULL);
        __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}

/*
 * Try to increment @nna->nr without letting it exceed @nna->max. Returns
 * %true if the count was incremented, %false if it is at the limit already.
 */
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
        int limit = READ_ONCE(nna->max);
        int cur = atomic_read(&nna->nr);

        while (cur < limit) {
                /* a failed cmpxchg refreshes @cur and the limit is rechecked */
                if (atomic_try_cmpxchg_relaxed(&nna->nr, &cur, cur + 1))
                        return true;
        }

        return false;
}

/**
 * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
 * successfully obtained. %false otherwise.
 */
static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
{
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        /* %NULL unless @wq is unbound, see wq_node_nr_active() */
        struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
        bool obtained = false;

        lockdep_assert_held(&pool->lock);

        if (!nna) {
                /* BH or per-cpu workqueue, pwq->nr_active is sufficient */
                obtained = pwq->nr_active < READ_ONCE(wq->max_active);
                goto out;
        }

        /* a plugged pwq never obtains a count */
        if (unlikely(pwq->plugged))
                return false;

        /*
         * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
         * already waiting on $nna, pwq_dec_nr_active() will maintain the
         * concurrency level. Don't jump the line.
         *
         * We need to ignore the pending test after max_active has increased as
         * pwq_dec_nr_active() can only maintain the concurrency level but not
         * increase it. This is indicated by @fill.
         */
        if (!list_empty(&pwq->pending_node) && likely(!fill))
                goto out;

        /* fast path: lockless attempt on the shared per-node count */
        obtained = tryinc_node_nr_active(nna);
        if (obtained)
                goto out;

        /*
         * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
         * and try again. The smp_mb() is paired with the implied memory barrier
         * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
         * we see the decremented $nna->nr or they see non-empty
         * $nna->pending_pwqs.
         */
        raw_spin_lock(&nna->lock);

        if (list_empty(&pwq->pending_node))
                list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
        else if (likely(!fill))
                goto out_unlock;

        smp_mb();

        obtained = tryinc_node_nr_active(nna);

        /*
         * If @fill, @pwq might have already been pending. Being spuriously
         * pending in cold paths doesn't affect anything. Let's leave it be.
         */
        if (obtained && likely(!fill))
                list_del_init(&pwq->pending_node);

out_unlock:
        raw_spin_unlock(&nna->lock);
out:
        /* an obtained count is reflected in @pwq's own nr_active as well */
        if (obtained)
                pwq->nr_active++;
        return obtained;
}

/**
 * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * Activate the first inactive work item of @pwq if available and allowed by
 * max_active limit.
 *
 * Returns %true if an inactive work item has been activated. %false if no
 * inactive work item is found or max_active limit is reached.
 */
static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
{
        struct work_struct *first;

        first = list_first_entry_or_null(&pwq->inactive_works,
                                         struct work_struct, entry);

        /* nothing is waiting to be activated */
        if (!first)
                return false;

        /* no nr_active count could be obtained */
        if (!pwq_tryinc_nr_active(pwq, fill))
                return false;

        __pwq_activate_work(pwq, first);
        return true;
}

/**
 * unplug_oldest_pwq - unplug the oldest pool_workqueue
 * @wq: workqueue_struct where its oldest pwq is to be unplugged
 *
 * This function should only be called for ordered workqueues where only the
 * oldest pwq is unplugged, the others are plugged to suspend execution to
 * ensure proper work item ordering::
 *
 *    dfl_pwq --------------+     [P] - plugged
 *                          |
 *                          v
 *    pwqs -> A -> B [P] -> C [P] (newest)
 *            |    |        |
 *            1    3        5
 *            |    |        |
 *            2    4        6
 *
 * When the oldest pwq is drained and removed, this function should be called
 * to unplug the next oldest one to start its work item execution. Note that
 * pwq's are linked into wq->pwqs with the oldest first, so the first one in
 * the list is the oldest.
 */
static void unplug_oldest_pwq(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;

        lockdep_assert_held(&wq->mutex);

        /*
         * Caller should make sure that pwqs isn't empty before calling.
         * list_first_entry_or_null() returns %NULL for an empty list
         * though, so warn and bail out rather than dereference it below.
         */
        pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue,
                                       pwqs_node);
        if (WARN_ON_ONCE(!pwq))
                return;

        raw_spin_lock_irq(&pwq->pool->lock);
        if (pwq->plugged) {
                pwq->plugged = false;
                /* kick the pool only if an inactive work got activated */
                if (pwq_activate_first_inactive(pwq, true))
                        kick_pool(pwq->pool);
        }
        raw_spin_unlock_irq(&pwq->pool->lock);
}

/**
 * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
 * @nna: wq_node_nr_active to activate a pending pwq for
 * @caller_pool: worker_pool the caller is locking
 *
 * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
 * @caller_pool may be unlocked and relocked to lock other worker_pools.
 *
 * At most one inactive work item is activated per call. On return,
 * @nna->lock has been released and @caller_pool->lock is held again.
 */
static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
                                      struct worker_pool *caller_pool)
{
        /* the pool whose lock is currently held; starts out as the caller's */
        struct worker_pool *locked_pool = caller_pool;
        struct pool_workqueue *pwq;
        struct work_struct *work;

        lockdep_assert_held(&caller_pool->lock);

        raw_spin_lock(&nna->lock);
retry:
        /* always (re)start from the head of the pending list */
        pwq = list_first_entry_or_null(&nna->pending_pwqs,
                                       struct pool_workqueue, pending_node);
        if (!pwq)
                goto out_unlock;

        /*
         * If @pwq is for a different pool than @locked_pool, we need to lock
         * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
         * / lock dance. For that, we also need to release @nna->lock as it's
         * nested inside pool locks.
         */
        if (pwq->pool != locked_pool) {
                raw_spin_unlock(&locked_pool->lock);
                locked_pool = pwq->pool;
                if (!raw_spin_trylock(&locked_pool->lock)) {
                        raw_spin_unlock(&nna->lock);
                        raw_spin_lock(&locked_pool->lock);
                        raw_spin_lock(&nna->lock);
                        /* @nna->lock was dropped, so re-read the list head */
                        goto retry;
                }
        }

        /*
         * @pwq may not have any inactive work items due to e.g. cancellations.
         * Drop it from pending_pwqs and see if there's another one.
         */
        work = list_first_entry_or_null(&pwq->inactive_works,
                                        struct work_struct, entry);
        if (!work) {
                list_del_init(&pwq->pending_node);
                goto retry;
        }

        /*
         * Acquire an nr_active count and activate the inactive work item. If
         * @pwq still has inactive work items, rotate it to the end of the
         * pending_pwqs so that we round-robin through them. This means that
         * inactive work items are not activated in queueing order which is fine
         * given that there has never been any ordering across different pwqs.
         */
        if (likely(tryinc_node_nr_active(nna))) {
                pwq->nr_active++;
                __pwq_activate_work(pwq, work);

                if (list_empty(&pwq->inactive_works))
                        list_del_init(&pwq->pending_node);
                else
                        list_move_tail(&pwq->pending_node, &nna->pending_pwqs);

                /* if activating a foreign pool, make sure it's running */
                if (pwq->pool != caller_pool)
                        kick_pool(pwq->pool);
        }

out_unlock:
        raw_spin_unlock(&nna->lock);
        /* restore the caller's locking state if a foreign pool got locked */
        if (locked_pool != caller_pool) {
                raw_spin_unlock(&locked_pool->lock);
                raw_spin_lock(&caller_pool->lock);
        }
}

/**
 * pwq_dec_nr_active - Retire an active count
 * @pwq: pool_workqueue of interest
 *
 * Drop one active count from @pwq and give a waiting inactive work item the
 * chance to become active. For unbound workqueues, this function may
 * temporarily drop @pwq->pool->lock.
 *
 * Must be called with @pwq->pool->lock held.
 */
static void pwq_dec_nr_active(struct pool_workqueue *pwq)
{
        struct worker_pool *pool = pwq->pool;
        struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);

        lockdep_assert_held(&pool->lock);

        /* the per-pwq count is maintained for percpu and unbound wqs alike */
        pwq->nr_active--;

        if (nna) {
                /*
                 * Unbound workqueue: the nr_active count may be shared by
                 * multiple pwqs and pools. A pwq which has to wait for a
                 * count puts itself on @nna->pending_pwqs. The memory barrier
                 * implied by atomic_dec_return() pairs with smp_mb() in
                 * pwq_tryinc_nr_active() so that either we observe the
                 * non-empty pending_pwqs or the other side observes the
                 * decremented @nna->nr.
                 *
                 * @nna->max may change as CPUs come online/offline and as
                 * @pwq->wq's max_active gets updated. It is however
                 * guaranteed to be equal to or larger than
                 * @pwq->wq->min_active, which is above zero unless freezing,
                 * so forward progress is maintained.
                 */
                if (atomic_dec_return(&nna->nr) < READ_ONCE(nna->max) &&
                    !list_empty(&nna->pending_pwqs))
                        node_activate_pending_pwq(nna, pool);
        } else {
                /*
                 * Percpu workqueue: simply try to activate the first
                 * inactive work item on @pwq itself.
                 */
                pwq_activate_first_inactive(pwq, false);
        }
}

/**
 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
 * @pwq: pwq of interest
 * @work_data: work_data of work which left the queue
 *
 * Called when a work item has either completed or been removed from the
 * pending queue. Retires its active count (unless it was inactive), drops
 * the in-flight count of its color, handles workqueue flushing and finally
 * puts the pwq reference.
 *
 * NOTE:
 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock
 * and thus should be called after all other state updates for the in-flight
 * work item is complete.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
{
        int color = get_work_color(work_data);

        /* inactive items never held an nr_active count */
        if (!(work_data & WORK_STRUCT_INACTIVE))
                pwq_dec_nr_active(pwq);

        pwq->nr_in_flight[color]--;

        /*
         * Flush handling is only needed when @color is the color being
         * flushed and no work item of that color is in flight anymore.
         */
        if (likely(pwq->flush_color != color) || pwq->nr_in_flight[color])
                goto out_put;

        /* this pwq is done, clear flush_color */
        pwq->flush_color = -1;

        /* the last pwq to finish wakes up the first flusher */
        if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
                complete(&pwq->wq->first_flusher->done);

out_put:
        put_pwq(pwq);
}

/**
 * try_to_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @cflags: %WORK_CANCEL_ flags
 * @irq_flags: place to store irq state
 *
 * Try to grab PENDING bit of @work.  This function can handle @work in any
 * stable state - idle, on timer or on worklist.
 *
 * Return:
 *
 *  ========        ================================================================
 *  1                if @work was pending and we successfully stole PENDING
 *  0                if @work was idle and we claimed PENDING
 *  -EAGAIN        if PENDING couldn't be grabbed at the moment, safe to busy-retry
 *  ========        ================================================================
 *
 * Note:
 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
 * interrupted while holding PENDING and @work off queue, this function
 * disables irq with local_irq_save() before trying.  This, combined with
 * delayed_work->timer being irqsafe, ensures that we return -EAGAIN for
 * finite short period of time.
 *
 * On successful return, >= 0, irq is disabled and the caller is
 * responsible for releasing it using local_irq_restore(*@irq_flags).
 * On -EAGAIN, irq state has already been restored.
 *
 * This function is safe to call from any context including IRQ handler.
 */
static int try_to_grab_pending(struct work_struct *work, u32 cflags,
                               unsigned long *irq_flags)
{
        struct worker_pool *pool;
        struct pool_workqueue *pwq;

        local_irq_save(*irq_flags);

        /* try to steal the timer if it exists */
        if (cflags & WORK_CANCEL_DELAYED) {
                struct delayed_work *dwork = to_delayed_work(work);

                /*
                 * dwork->timer is irqsafe.  If timer_delete() fails, it's
                 * guaranteed that the timer is not queued anywhere and not
                 * running on the local CPU.
                 */
                if (likely(timer_delete(&dwork->timer)))
                        return 1;
        }

        /* try to claim PENDING the normal way */
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return 0;

        rcu_read_lock();
        /*
         * The queueing is in progress, or it is already queued. Try to
         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
         */
        pool = get_work_pool(work);
        if (!pool)
                goto fail;

        raw_spin_lock(&pool->lock);
        /*
         * work->data is guaranteed to point to pwq only while the work
         * item is queued on pwq->wq, and both updating work->data to point
         * to pwq on queueing and to pool on dequeueing are done under
         * pwq->pool->lock.  This in turn guarantees that, if work->data
         * points to pwq which is associated with a locked pool, the work
         * item is currently queued on that pool.
         */
        pwq = get_work_pwq(work);
        if (pwq && pwq->pool == pool) {
                /* snapshot work->data before it gets overwritten below */
                unsigned long work_data = *work_data_bits(work);

                debug_work_deactivate(work);

                /*
                 * A cancelable inactive work item must be in the
                 * pwq->inactive_works since a queued barrier can't be
                 * canceled (see the comments in insert_wq_barrier()).
                 *
                 * An inactive work item cannot be deleted directly because
                 * it might have linked barrier work items which, if left
                 * on the inactive_works list, will confuse pwq->nr_active
                 * management later on and cause stall.  Move the linked
                 * barrier work items to the worklist when deleting the grabbed
                 * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that
                 * it doesn't participate in nr_active management in later
                 * pwq_dec_nr_in_flight().
                 */
                if (work_data & WORK_STRUCT_INACTIVE)
                        move_linked_works(work, &pwq->pool->worklist, NULL);

                list_del_init(&work->entry);

                /*
                 * work->data points to pwq iff queued. Let's point to pool.
                 * This destroys work->data which the next step needs; that
                 * is why it was stashed in work_data above.
                 */
                set_work_pool_and_keep_pending(work, pool->id,
                                               pool_offq_flags(pool));

                /* must be the last step, see the function comment */
                pwq_dec_nr_in_flight(pwq, work_data);

                /* success: only the pool lock is dropped, irq stays off */
                raw_spin_unlock(&pool->lock);
                rcu_read_unlock();
                return 1;
        }
        raw_spin_unlock(&pool->lock);
fail:
        /* couldn't grab PENDING right now: undo irq disabling and report */
        rcu_read_unlock();
        local_irq_restore(*irq_flags);
        return -EAGAIN;
}

/**
 * work_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @cflags: %WORK_CANCEL_ flags
 * @irq_flags: place to store IRQ state
 *
 * Grab PENDING bit of @work, busy-retrying try_to_grab_pending() for as long
 * as it reports a negative result. @work can be in any stable state - idle,
 * on timer or on worklist.
 *
 * Can be called from any context. IRQ is disabled on return with IRQ state
 * stored in *@irq_flags. The caller is responsible for re-enabling it using
 * local_irq_restore().
 *
 * Returns %true if @work was pending. %false if idle.
 */
static bool work_grab_pending(struct work_struct *work, u32 cflags,
                              unsigned long *irq_flags)
{
        int grabbed;

        /* a negative result means "not now"; relax and try again */
        while ((grabbed = try_to_grab_pending(work, cflags, irq_flags)) < 0)
                cpu_relax();

        return grabbed;
}

/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq at the tail of the list headed by
 * @head (list_add_tail()).  @extra_flags is or'd to work_struct flags.
 * A reference on @pwq is taken for the inserted work item.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
                        struct list_head *head, unsigned int extra_flags)
{
        debug_work_activate(work);

        /* record the work call stack in order to print it in KASAN reports */
        kasan_record_aux_stack(work);

        /* we own @work, set data and link */
        set_work_pwq(work, pwq, extra_flags);
        list_add_tail(&work->entry, head);
        /* the queued work item pins @pwq */
        get_pwq(pwq);
}

/*
 * Test whether the caller is a work item executing on @wq, i.e. whether a
 * work item being queued right now is chained from another work item of the
 * same workqueue.
 */
static bool is_chained_work(struct workqueue_struct *wq)
{
        /* non-NULL only when the current task is a workqueue worker */
        struct worker *self = current_wq_worker();

        /*
         * %true iff I'm a worker executing a work item on @wq.  As the
         * worker is the current task itself, dereferencing it needs no
         * locking.
         */
        return self && self->current_pwq->wq == wq;
}

/*
 * Pick the CPU to queue an unbound work item on.  The local CPU is preferred
 * when wq_unbound_cpumask allows it.  Otherwise, round robin among the
 * allowed online CPUs to avoid perturbing sensitive tasks.  If round robin
 * finds no candidate, @cpu is returned unchanged.
 */
static int wq_select_unbound_cpu(int cpu)
{
        int next;

        if (likely(!wq_debug_force_rr_cpu)) {
                /* fast path: stay on the local CPU if it's allowed */
                if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
                        return cpu;
        } else {
                pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
        }

        /* continue after the CPU this CPU picked last time */
        next = __this_cpu_read(wq_rr_cpu_last);
        next = cpumask_next_and_wrap(next, wq_unbound_cpumask, cpu_online_mask);
        if (unlikely(next >= nr_cpu_ids))
                return cpu;

        /* remember the pick for the next round */
        __this_cpu_write(wq_rr_cpu_last, next);
        return next;
}

/*
 * __queue_work - queue @work on @wq, selecting the pwq to use
 * @cpu: requested CPU, or WORK_CPU_UNBOUND to let this function pick one
 * @wq: workqueue to queue on
 * @work: work item to queue
 *
 * Must be called with IRQs disabled (asserted below). Takes the RCU read
 * lock and the selected pool's lock internally and releases both before
 * returning. If @wq is being destroyed or drained and the caller is not a
 * work item of @wq, a warning is emitted and @work is not queued.
 */
static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
        struct pool_workqueue *pwq;
        struct worker_pool *last_pool, *pool;
        unsigned int work_flags;
        /* remember the caller's request; @cpu may be rewritten on each retry */
        unsigned int req_cpu = cpu;

        /*
         * While a work item is PENDING && off queue, a task trying to
         * steal the PENDING will busy-loop waiting for it to either get
         * queued or lose PENDING.  Grabbing PENDING and queueing should
         * happen with IRQ disabled.
         */
        lockdep_assert_irqs_disabled();

        /*
         * For a draining wq, only works from the same workqueue are
         * allowed. The __WQ_DESTROYING helps to spot the issue that
         * queues a new work item to a wq after destroy_workqueue(wq).
         */
        if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
                     WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n",
                               work->func, wq->name))) {
                return;
        }
        rcu_read_lock();
retry:
        /* pwq which will be used unless @work is executing elsewhere */
        if (req_cpu == WORK_CPU_UNBOUND) {
                if (wq->flags & WQ_UNBOUND)
                        cpu = wq_select_unbound_cpu(raw_smp_processor_id());
                else
                        cpu = raw_smp_processor_id();
        }

        pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
        pool = pwq->pool;

        /*
         * If @work was previously on a different pool, it might still be
         * running there, in which case the work needs to be queued on that
         * pool to guarantee non-reentrancy.
         *
         * For ordered workqueue, work items must be queued on the newest pwq
         * for accurate order management.  Guaranteed order also guarantees
         * non-reentrancy.  See the comments above unplug_oldest_pwq().
         */
        last_pool = get_work_pool(work);
        if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) {
                struct worker *worker;

                raw_spin_lock(&last_pool->lock);

                worker = find_worker_executing_work(last_pool, work);

                if (worker && worker->current_pwq->wq == wq) {
                        /* still running on @last_pool: queue there, lock stays held */
                        pwq = worker->current_pwq;
                        pool = pwq->pool;
                        WARN_ON_ONCE(pool != last_pool);
                } else {
                        /* meh... not running there, queue here */
                        raw_spin_unlock(&last_pool->lock);
                        raw_spin_lock(&pool->lock);
                }
        } else {
                raw_spin_lock(&pool->lock);
        }

        /*
         * pwq is determined and locked. For unbound pools, we could have raced
         * with pwq release and it could already be dead. If its refcnt is zero,
         * repeat pwq selection. Note that unbound pwqs never die without
         * another pwq replacing it in cpu_pwq or while work items are executing
         * on it, so the retrying is guaranteed to make forward-progress.
         */
        if (unlikely(!pwq->refcnt)) {
                if (wq->flags & WQ_UNBOUND) {
                        raw_spin_unlock(&pool->lock);
                        cpu_relax();
                        goto retry;
                }
                /* oops: per-cpu pwq with zero refcnt; warn and queue anyway */
                WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
                          wq->name, cpu);
        }

        /* pwq determined, queue */
        trace_workqueue_queue_work(req_cpu, pwq, work);

        /* @work must not already be linked on a list */
        if (WARN_ON(!list_empty(&work->entry)))
                goto out;

        /* account @work under the pwq's current work color */
        pwq->nr_in_flight[pwq->work_color]++;
        work_flags = work_color_to_flags(pwq->work_color);

        /*
         * Limit the number of concurrently active work items to max_active.
         * @work must also queue behind existing inactive work items to maintain
         * ordering when max_active changes. See wq_adjust_max_active().
         */
        if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
                /* worklist goes from empty to non-empty: refresh the timestamp */
                if (list_empty(&pool->worklist))
                        pool->watchdog_ts = jiffies;

                trace_workqueue_activate_work(work);
                insert_work(pwq, work, &pool->worklist, work_flags);
                kick_pool(pool);
        } else {
                /* no active slot available: park @work on the inactive list */
                work_flags |= WORK_STRUCT_INACTIVE;
                insert_work(pwq, work, &pwq->inactive_works, work_flags);
        }

out:
        raw_spin_unlock(&pool->lock);
        rcu_read_unlock();
}

static bool clear_pending_if_disabled(struct work_struct *work)
{
        unsigned long data = *work_data_bits(work);
        struct work_offq_data offqd;

        if (likely((data & WORK_STRUCT_PWQ) ||
                   !(data & WORK_OFFQ_DISABLE_MASK)))
                return false;

        work_offqd_unpack(&offqd, data);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));
        return true;
}

/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.  Callers that fail to ensure that the specified
 * CPU cannot go away will execute on a randomly chosen CPU.
 * But note well that callers specifying a CPU that never has been
 * online will get a splat.
 *
 * Return: %true if @work was queued by this call; %false if its PENDING bit
 * was already set or clear_pending_if_disabled() rejected it.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
                   struct work_struct *work)
{
        unsigned long irq_flags;
        bool queued = false;

        /* PENDING must be grabbed and @work queued with irqs off */
        local_irq_save(irq_flags);

        /* somebody else already owns PENDING */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                goto out;

        /* PENDING was released again, nothing to queue */
        if (clear_pending_if_disabled(work))
                goto out;

        __queue_work(cpu, wq, work);
        queued = true;
out:
        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL(queue_work_on);

/**
 * select_numa_node_cpu - Select a CPU based on NUMA node
 * @node: NUMA node ID that we want to select a CPU from
 *
 * This function will attempt to find a "random" cpu available on a given
 * node. If there are no CPUs available on the given node it will return
 * WORK_CPU_UNBOUND indicating that we should just schedule to any
 * available CPU if we need to schedule this work.
 */
static int select_numa_node_cpu(int node)
{
        int local_cpu, picked;

        /* Delay binding to CPU if node is not valid or online */
        if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
                return WORK_CPU_UNBOUND;

        /* Already running on the requested node: stay on this CPU */
        local_cpu = raw_smp_processor_id();
        if (node == cpu_to_node(local_cpu))
                return local_cpu;

        /* Use "random", otherwise known as "first", online CPU of node */
        picked = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);

        /* No usable CPU on the node: defer the decision */
        if (picked >= nr_cpu_ids)
                return WORK_CPU_UNBOUND;

        return picked;
}

/**
 * queue_work_node - queue work on a "random" cpu for a given NUMA node
 * @node: NUMA node that we are targeting the work for
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a "random" CPU within a given NUMA node. The basic
 * idea here is to provide a way to somehow associate work with a given
 * NUMA node.
 *
 * This function will only make a best effort attempt at getting this onto
 * the right NUMA node. If no node is requested or the requested node is
 * offline then we just fall back to standard queue_work behavior.
 *
 * Currently the "random" CPU ends up being the first available CPU in the
 * intersection of cpu_online_mask and the cpumask of the node, unless we
 * are running on the node. In that case we just use the current CPU.
 *
 * Return: %true if @work was queued by this call; %false if its PENDING bit
 * was already set or clear_pending_if_disabled() rejected it.
 */
bool queue_work_node(int node, struct workqueue_struct *wq,
                     struct work_struct *work)
{
        bool queued = false;
        unsigned long irq_flags;

        /*
         * This current implementation is specific to unbound workqueues.
         * Specifically we only return the first available CPU for a given
         * node instead of cycling through individual CPUs within the node.
         *
         * If this is used with a per-cpu workqueue then the logic in
         * select_numa_node_cpu() would need to be updated to allow for
         * some round robin type logic.
         */
        WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));

        local_irq_save(irq_flags);

        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
                /* we own PENDING now; it is given back if @work is rejected */
                if (!clear_pending_if_disabled(work)) {
                        __queue_work(select_numa_node_cpu(node), wq, work);
                        queued = true;
                }
        }

        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL_GPL(queue_work_node);

/*
 * Timer callback of a delayed_work: queue the embedded work item on the wq
 * and CPU recorded in the delayed_work.
 */
void delayed_work_timer_fn(struct timer_list *t)
{
        struct delayed_work *dwork;

        /* map the timer back to the delayed_work which embeds it */
        dwork = timer_container_of(dwork, t, timer);

        /* should have been called from irqsafe timer with irq already off */
        __queue_work(dwork->cpu, dwork->wq, &dwork->work);
}
EXPORT_SYMBOL(delayed_work_timer_fn);

/*
 * Arm @dwork's timer so that its work item gets queued on @wq after @delay
 * jiffies, or queue the work item right away when @delay is zero. @cpu and
 * @wq are recorded in @dwork for the timer callback.
 */
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
                                struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        struct timer_list *timer = &dwork->timer;
        int timer_cpu;

        /* sanity checks on the state @dwork is expected to be in */
        WARN_ON_ONCE(!wq);
        WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
        WARN_ON_ONCE(timer_pending(timer));
        WARN_ON_ONCE(!list_empty(&work->entry));

        /*
         * A zero @delay means "queue now", both as an optimization and for
         * correctness: the earliest @timer can expire is on the closest next
         * tick, and delayed_work users depend on there being no such delay
         * when @delay is 0.
         */
        if (!delay) {
                __queue_work(cpu, wq, work);
                return;
        }

        WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));
        dwork->wq = wq;
        dwork->cpu = cpu;
        timer->expires = jiffies + delay;

        if (!housekeeping_enabled(HK_TYPE_TIMER)) {
                if (likely(cpu == WORK_CPU_UNBOUND))
                        add_timer_global(timer);
                else
                        add_timer_on(timer, cpu);
                return;
        }

        /* If the current cpu is a housekeeping cpu, use it. */
        timer_cpu = smp_processor_id();
        if (!housekeeping_test_cpu(timer_cpu, HK_TYPE_TIMER))
                timer_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
        add_timer_on(timer, timer_cpu);
}

/**
 * queue_delayed_work_on - queue work on specific CPU after delay
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * We queue the delayed_work to a specific CPU, for non-zero delays the
 * caller must ensure it is online and can't go away. Callers that fail
 * to ensure this, may get @dwork->timer queued to an offlined CPU and
 * this will prevent queueing of @dwork->work unless the offlined CPU
 * becomes online again.
 *
 * Return: %false if @dwork was already on a queue, %true otherwise.  If
 * @delay is zero and @dwork is idle, it will be scheduled for immediate
 * execution.
 */
bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                           struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        unsigned long irq_flags;
        bool queued = false;

        /* read the comment in __queue_work() */
        local_irq_save(irq_flags);

        /* PENDING is already owned by somebody else */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                goto out;

        /* PENDING was released again, nothing to arm or queue */
        if (clear_pending_if_disabled(work))
                goto out;

        __queue_delayed_work(cpu, wq, dwork, delay);
        queued = true;
out:
        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL(queue_delayed_work_on);

/**
 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
 * modify @dwork's timer so that it expires after @delay.  If @delay is
 * zero, @dwork->work is guaranteed to be scheduled immediately regardless
 * of its current state.
 *
 * Return: %false if @dwork was idle and queued, %true if @dwork was
 * pending and its timer was modified.
 *
 * This function is safe to call from any context including IRQ handler.
 * See try_to_grab_pending() for details.
 */
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                         struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        unsigned long irq_flags;
        bool was_pending;

        /* returns with PENDING owned and irq disabled */
        was_pending = work_grab_pending(work, WORK_CANCEL_DELAYED, &irq_flags);

        /* requeue with the new delay unless PENDING had to be dropped */
        if (!clear_pending_if_disabled(work))
                __queue_delayed_work(cpu, wq, dwork, delay);

        local_irq_restore(irq_flags);
        return was_pending;
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);

/*
 * RCU callback of an rcu_work: queue the embedded work item on the
 * workqueue recorded in the rcu_work, letting __queue_work() pick the CPU.
 */
static void rcu_work_rcufn(struct rcu_head *rcu)
{
        struct rcu_work *rwork;

        rwork = container_of(rcu, struct rcu_work, rcu);

        /* read the comment in __queue_work() */
        local_irq_disable();
        __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
        local_irq_enable();
}

/**
 * queue_rcu_work - queue work after a RCU grace period
 * @wq: workqueue to use
 * @rwork: work to queue
 *
 * Return: %false if @rwork was already pending, %true otherwise.  Note
 * that a full RCU grace period is guaranteed only after a %true return.
 * While @rwork is guaranteed to be executed after a %false return, the
 * execution may happen before a full RCU grace period has passed.
 */
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
{
        struct work_struct *work = &rwork->work;

        /*
         * rcu_work can't be canceled or disabled. Warn if the user reached
         * inside @rwork and disabled the inner work.
         */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) ||
            WARN_ON_ONCE(clear_pending_if_disabled(work)))
                return false;

        /* we own PENDING: record the target wq and wait for a grace period */
        rwork->wq = wq;
        call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
        return true;
}
EXPORT_SYMBOL(queue_rcu_work);

/*
 * Allocate a zeroed worker on @node with its list heads initialized and
 * flags set to WORKER_PREP. Returns NULL if the allocation fails.
 */
static struct worker *alloc_worker(int node)
{
        struct worker *w = kzalloc_node(sizeof(*w), GFP_KERNEL, node);

        if (!w)
                return NULL;

        INIT_LIST_HEAD(&w->entry);
        INIT_LIST_HEAD(&w->scheduled);
        INIT_LIST_HEAD(&w->node);
        /* on creation a worker is in !idle && prep state */
        w->flags = WORKER_PREP;

        return w;
}

/*
 * Return the cpumask workers of @pool are allowed to run on:
 * attrs->__pod_cpumask when @pool->cpu is negative and attrs->affn_strict
 * is set, attrs->cpumask otherwise.
 */
static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
{
        if (pool->cpu >= 0 || !pool->attrs->affn_strict)
                return pool->attrs->cpumask;

        return pool->attrs->__pod_cpumask;
}

/**
 * worker_attach_to_pool() - attach a worker to a pool
 * @worker: worker to be attached
 * @pool: the target pool
 *
 * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and
 * cpu-binding of @worker are kept coordinated with the pool across
 * cpu-[un]hotplugs.
 *
 * Takes and releases wq_pool_attach_mutex.
 */
static void worker_attach_to_pool(struct worker *worker,
                                  struct worker_pool *pool)
{
        mutex_lock(&wq_pool_attach_mutex);

        /*
         * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable
         * across this function. See the comments above the flag definition for
         * details. BH workers are, while per-CPU, always DISASSOCIATED.
         */
        if (!(pool->flags & POOL_DISASSOCIATED)) {
                WARN_ON_ONCE(pool->flags & POOL_BH);
                kthread_set_per_cpu(worker->task, pool->cpu);
        } else {
                worker->flags |= WORKER_UNBOUND;
        }

        /* rescuers get their allowed cpus set to those of the pool here */
        if (worker->rescue_wq)
                set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));

        /* link the worker into the pool and point it back at the pool */
        list_add_tail(&worker->node, &pool->workers);
        worker->pool = pool;

        mutex_unlock(&wq_pool_attach_mutex);
}

/*
 * Drop @worker's per-cpu kthread binding and widen its allowed cpus to
 * wq_unbound_cpumask if that intersects cpu_active_mask, to
 * cpu_possible_mask otherwise. Warns once if setting the mask fails.
 * Caller must hold wq_pool_attach_mutex.
 */
static void unbind_worker(struct worker *worker)
{
        lockdep_assert_held(&wq_pool_attach_mutex);

        kthread_set_per_cpu(worker->task, -1);

        if (!cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
        else
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
}


/*
 * Unbind @worker via unbind_worker() and unlink it from the list it is on
 * through @worker->node. Caller must hold wq_pool_attach_mutex. Note that
 * @worker->pool is not touched here.
 */
static void detach_worker(struct worker *worker)
{
        lockdep_assert_held(&wq_pool_attach_mutex);

        unbind_worker(worker);
        list_del(&worker->node);
}

/**
 * worker_detach_from_pool() - detach a worker from its pool
 * @worker: worker which is attached to its pool
 *
 * Undo the attaching which had been done in worker_attach_to_pool().  The
 * caller worker shouldn't access to the pool after detached except it has
 * other reference to the pool.
 *
 * Takes and releases wq_pool_attach_mutex.
 */
static void worker_detach_from_pool(struct worker *worker)
{
        /* there is one permanent BH worker per CPU which should never detach */
        WARN_ON_ONCE(worker->pool->flags & POOL_BH);

        mutex_lock(&wq_pool_attach_mutex);
        detach_worker(worker);
        worker->pool = NULL;
        mutex_unlock(&wq_pool_attach_mutex);

        /* clear leftover flags without pool->lock after it is detached */
        worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
}

/*
 * Format the name of @worker into @buf, writing at most @size bytes.
 *
 * Naming scheme, checked in this order:
 *   - @worker is a rescuer:  "kworker/R-<workqueue name>"
 *   - @pool is NULL:         "kworker/dying"
 *   - pool->cpu < 0:         "kworker/u<pool id>:<worker id>"
 *   - otherwise:             "kworker/<cpu>:<worker id>", with an "H" suffix
 *                            when the pool's nice value is negative
 *
 * Returns the value returned by scnprintf() for the chosen format.
 */
static int format_worker_id(char *buf, size_t size, struct worker *worker,
                            struct worker_pool *pool)
{
        const char *highpri;

        if (worker->rescue_wq)
                return scnprintf(buf, size, "kworker/R-%s",
                                 worker->rescue_wq->name);

        if (!pool)
                return scnprintf(buf, size, "kworker/dying");

        if (pool->cpu < 0)
                return scnprintf(buf, size, "kworker/u%d:%d",
                                 pool->id, worker->id);

        highpri = pool->attrs->nice < 0 ? "H" : "";
        return scnprintf(buf, size, "kworker/%d:%d%s",
                         pool->cpu, worker->id, highpri);
}

/**
 * create_worker - create a new workqueue worker
 * @pool: pool the new worker will belong to
 *
 * Create and start a new worker which is attached to @pool.  For pools
 * without %POOL_BH a kthread is created, niced and bound to the pool's
 * allowed CPUs; workers of %POOL_BH pools get no task of their own.
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * Return:
 * Pointer to the newly created worker, or %NULL if the worker ID, the worker
 * structure or the kthread could not be allocated.
 */
static struct worker *create_worker(struct worker_pool *pool)
{
        struct worker *worker;
        int id;

        /* ID is needed to determine kthread name */
        id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
        if (id < 0) {
                pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
                            ERR_PTR(id));
                return NULL;
        }

        worker = alloc_worker(pool->node);
        if (!worker) {
                pr_err_once("workqueue: Failed to allocate a worker\n");
                goto fail;
        }

        worker->id = id;

        if (!(pool->flags & POOL_BH)) {
                char id_buf[WORKER_ID_LEN];

                format_worker_id(id_buf, sizeof(id_buf), worker, pool);
                worker->task = kthread_create_on_node(worker_thread, worker,
                                                      pool->node, "%s", id_buf);
                if (IS_ERR(worker->task)) {
                        if (PTR_ERR(worker->task) == -EINTR) {
                                pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n",
                                       id_buf);
                        } else {
                                /* terminate the line like every other message here */
                                pr_err_once("workqueue: Failed to create a worker thread: %pe\n",
                                            worker->task);
                        }
                        goto fail;
                }

                set_user_nice(worker->task, pool->attrs->nice);
                kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
        }

        /* successful, attach the worker to the pool */
        worker_attach_to_pool(worker, pool);

        /* start the newly created worker */
        raw_spin_lock_irq(&pool->lock);

        worker->pool->nr_workers++;
        worker_enter_idle(worker);

        /*
         * @worker is waiting on a completion in kthread() and will trigger hung
         * check if not woken up soon. As kick_pool() is noop if @pool is empty,
         * wake it up explicitly.
         */
        if (worker->task)
                wake_up_process(worker->task);

        raw_spin_unlock_irq(&pool->lock);

        return worker;

fail:
        /* @worker may be NULL here; kfree(NULL) is a no-op */
        ida_free(&pool->worker_ida, id);
        kfree(worker);
        return NULL;
}

static void detach_dying_workers(struct list_head *cull_list)
{
        struct worker *worker;

        list_for_each_entry(worker, cull_list, entry)
                detach_worker(worker);
}

/*
 * Destroy every worker on @cull_list: unlink it, stop its kthread and drop
 * the extra task reference via kthread_stop_put(), then free the worker.
 * The safe iterator is required because each entry is freed in the loop.
 */
static void reap_dying_workers(struct list_head *cull_list)
{
        struct worker *victim, *next;

        list_for_each_entry_safe(victim, next, cull_list, entry) {
                list_del_init(&victim->entry);
                kthread_stop_put(victim->task);
                kfree(victim);
        }
}

/**
 * set_worker_dying - Tag a worker for destruction
 * @worker: worker to be destroyed
 * @list: list the worker is moved onto, away from the list it is on via
 *        worker->entry (its pool's idle_list)
 *
 * Mark @worker with %WORKER_DIE, take it out of @pool's worker and idle
 * counts and grab a task reference for the eventual kthread_stop_put().
 * The worker should be idle; if a sanity check fails, a warning is issued
 * and nothing is changed.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) and wq_pool_attach_mutex.
 */
static void set_worker_dying(struct worker *worker, struct list_head *list)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);
        lockdep_assert_held(&wq_pool_attach_mutex);

        /* sanity check frenzy */
        if (WARN_ON(worker->current_work))
                return;
        if (WARN_ON(!list_empty(&worker->scheduled)))
                return;
        if (WARN_ON(!(worker->flags & WORKER_IDLE)))
                return;

        /* the worker no longer counts as a worker, nor as an idle one */
        pool->nr_idle--;
        pool->nr_workers--;

        worker->flags |= WORKER_DIE;
        list_move(&worker->entry, list);

        /* get an extra task struct reference for later kthread_stop_put() */
        get_task_struct(worker->task);
}

/**
 * idle_worker_timeout - check if some idle workers can now be deleted.
 * @t: The pool's idle_timer that just expired
 *
 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
 * worker_leave_idle(), as a worker flicking between idle and active while its
 * pool is at the too_many_workers() tipping point would cause too much timer
 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
 * it expire and re-evaluate things from there.
 */
static void idle_worker_timeout(struct timer_list *t)
{
        struct worker_pool *pool = timer_container_of(pool, t, idle_timer);
        bool do_cull = false;

        if (work_pending(&pool->idle_cull_work))
                return;

        raw_spin_lock_irq(&pool->lock);

        if (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;

                /* idle_list is kept in LIFO order, check the last one */
                worker = list_last_entry(&pool->idle_list, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                do_cull = !time_before(jiffies, expires);

                if (!do_cull)
                        mod_timer(&pool->idle_timer, expires);
        }
        raw_spin_unlock_irq(&pool->lock);

        if (do_cull)
                queue_work(system_dfl_wq, &pool->idle_cull_work);
}

/**
 * idle_cull_fn - cull workers that have been idle for too long.
 * @work: the pool's work for handling these idle workers
 *
 * Walk the pool's idle workers, oldest first, and get rid of those that have
 * been idle for at least IDLE_WORKER_TIMEOUT.  Culling stops at the first
 * worker that has not timed out yet, for which the idle timer is re-armed.
 *
 * We don't want to disturb isolated CPUs because of a pcpu kworker being
 * culled, so this also resets worker affinity. This requires a sleepable
 * context, hence the split between timer callback and work item.
 */
static void idle_cull_fn(struct work_struct *work)
{
        struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
        LIST_HEAD(cull_list);

        /*
         * Grabbing wq_pool_attach_mutex here ensures an already-running worker
         * cannot proceed beyond set_pf_worker() in its self-destruct path.
         * This is required as a previously-preempted worker could run after
         * set_worker_dying() has happened but before detach_dying_workers() did.
         */
        mutex_lock(&wq_pool_attach_mutex);
        raw_spin_lock_irq(&pool->lock);

        while (too_many_workers(pool)) {
                /* the longest-idle worker sits at the tail of idle_list */
                struct worker *victim =
                        list_last_entry(&pool->idle_list, struct worker, entry);
                unsigned long deadline =
                        victim->last_active + IDLE_WORKER_TIMEOUT;

                if (time_before(jiffies, deadline)) {
                        /* not due yet, come back when it is */
                        mod_timer(&pool->idle_timer, deadline);
                        break;
                }

                set_worker_dying(victim, &cull_list);
        }

        raw_spin_unlock_irq(&pool->lock);

        /* detaching needs the mutex but must not run under pool->lock */
        detach_dying_workers(&cull_list);
        mutex_unlock(&wq_pool_attach_mutex);

        reap_dying_workers(&cull_list);
}

/*
 * Ask the rescuer of @work's workqueue for help.
 *
 * Does nothing if the workqueue has no rescuer or if @work's pwq is already
 * queued on wq->maydays.  Otherwise the pwq is pinned, queued on the mayday
 * list, the rescuer task is woken and the mayday statistic is bumped.
 *
 * Must be called with wq_mayday_lock held.
 */
static void send_mayday(struct work_struct *work)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq_mayday_lock);

        /* nobody to call */
        if (!wq->rescuer)
                return;

        /* @pwq is already waiting for the rescuer */
        if (!list_empty(&pwq->mayday_node))
                return;

        /*
         * mayday mayday mayday
         *
         * If @pwq is for an unbound wq, its base ref may be put at
         * any time due to an attribute change.  Pin @pwq until the
         * rescuer is done with it.
         */
        get_pwq(pwq);
        list_add_tail(&pwq->mayday_node, &wq->maydays);
        wake_up_process(wq->rescuer->task);
        pwq->stats[PWQ_STAT_MAYDAY]++;
}

/*
 * Callback of a pool's mayday_timer.
 *
 * If the pool still needs a new worker, send a mayday for every work item
 * on its worklist.  The timer is then re-armed MAYDAY_INTERVAL ahead
 * regardless of whether any mayday was sent.
 */
static void pool_mayday_timeout(struct timer_list *t)
{
        struct worker_pool *pool = timer_container_of(pool, t, mayday_timer);
        struct work_struct *pending;

        raw_spin_lock_irq(&pool->lock);
        raw_spin_lock(&wq_mayday_lock);                /* for wq->maydays */

        if (!need_to_create_worker(pool))
                goto out_unlock;

        /*
         * We've been trying to create a new worker but haven't been
         * successful.  We might be hitting an allocation deadlock.  Send
         * distress signals to rescuers.
         */
        list_for_each_entry(pending, &pool->worklist, entry)
                send_mayday(pending);

out_unlock:
        raw_spin_unlock(&wq_mayday_lock);
        raw_spin_unlock_irq(&pool->lock);

        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}

/**
 * maybe_create_worker - create a new worker if necessary
 * @pool: pool to create a new worker for
 *
 * Create a new worker for @pool if necessary.  @pool is guaranteed to
 * have at least one idle worker on return from this function.  If
 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
 * sent to all rescuers with works scheduled on @pool to resolve
 * possible allocation deadlock.
 *
 * On return, need_to_create_worker() is guaranteed to be %false and
 * may_start_working() %true.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
 */
static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        do {
                raw_spin_unlock_irq(&pool->lock);

                /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
                mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);

                /*
                 * Keep trying until a worker was created or the pool stopped
                 * needing one, cooling down between failed attempts.
                 */
                while (!create_worker(pool) && need_to_create_worker(pool)) {
                        schedule_timeout_interruptible(CREATE_COOLDOWN);

                        if (!need_to_create_worker(pool))
                                break;
                }

                timer_delete_sync(&pool->mayday_timer);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * Rechecking is necessary even after a new worker was just
                 * successfully created as @pool->lock was dropped and the new
                 * worker might have already become busy.
                 */
        } while (need_to_create_worker(pool));
}

/*
 * Callback serialization helpers around pool->cb_lock.
 *
 * bh_worker() brackets its whole run with worker_lock_callback() and
 * worker_unlock_callback().  With CONFIG_PREEMPT_RT these take and release
 * pool->cb_lock; without it all three helpers compile to nothing.
 */
#ifdef CONFIG_PREEMPT_RT
/* Take @pool's callback lock for the duration of a callback run. */
static void worker_lock_callback(struct worker_pool *pool)
{
        spin_lock(&pool->cb_lock);
}

/* Release the callback lock taken by worker_lock_callback(). */
static void worker_unlock_callback(struct worker_pool *pool)
{
        spin_unlock(&pool->cb_lock);
}

/*
 * Acquire and immediately release pool->cb_lock.  This returns only once
 * the lock could be taken, i.e. after any current holder has dropped it.
 * NOTE(review): presumably used by a cancel path to wait for a running
 * callback; the caller is outside this view.
 */
static void workqueue_callback_cancel_wait_running(struct worker_pool *pool)
{
        spin_lock(&pool->cb_lock);
        spin_unlock(&pool->cb_lock);
}

#else

/* !CONFIG_PREEMPT_RT: no callback lock is used, all helpers are no-ops. */
static void worker_lock_callback(struct worker_pool *pool) { }
static void worker_unlock_callback(struct worker_pool *pool) { }
static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { }

#endif

/**
 * manage_workers - manage worker pool
 * @worker: self
 *
 * Assume the manager role and manage the worker pool @worker belongs
 * to.  At any given time, there can be only zero or one manager per
 * pool.  The exclusion is handled automatically by this function.
 *
 * The caller can safely start processing works on false return.  On
 * true return, it's guaranteed that need_to_create_worker() is false
 * and may_start_working() is true.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * Return:
 * %false if the pool doesn't need management and the caller can safely
 * start processing works, %true if management function was performed and
 * the conditions that the caller verified before calling the function may
 * no longer be true.
 */
static bool manage_workers(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* somebody else already holds the manager role */
        if (pool->flags & POOL_MANAGER_ACTIVE)
                return false;

        /* claim the manager role for @worker */
        pool->flags |= POOL_MANAGER_ACTIVE;
        pool->manager = worker;

        /* may drop and retake pool->lock, see maybe_create_worker() */
        maybe_create_worker(pool);

        /* give up the role and notify any waiter on manager_wait */
        pool->manager = NULL;
        pool->flags &= ~POOL_MANAGER_ACTIVE;
        rcuwait_wake_up(&manager_wait);
        return true;
}

/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logics necessary to
 * process a single work including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as context requirement is met, any worker can
 * call this function to process a work.
 *
 * The work function itself is called with @pool->lock released.  @work
 * may be freed by its own function, so it must not be dereferenced once
 * the function has been invoked.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
        unsigned long work_data;
        int lockdep_start_depth, rcu_start_depth;
        bool bh_draining = pool->flags & POOL_BH_DRAINING;
#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the struct work_struct from
         * inside the function that is called from it, this we need to
         * take into account for lockdep too.  To avoid bogus "held
         * lock freed" warnings as well as problems when looking into
         * work->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
        /* ensure we're on the correct CPU */
        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     raw_smp_processor_id() != pool->cpu);

        /* claim and dequeue */
        debug_work_deactivate(work);
        hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
        worker->current_work = work;
        worker->current_func = work->func;
        worker->current_pwq = pwq;
        /* workers of BH pools have no task, see create_worker() */
        if (worker->task)
                worker->current_at = worker->task->se.sum_exec_runtime;
        /*
         * Snapshot the data word now: it is still needed for
         * pwq_dec_nr_in_flight() after @work may already have been freed.
         */
        work_data = *work_data_bits(work);
        worker->current_color = get_work_color(work_data);

        /*
         * Record wq name for cmdline and debug reporting, may get
         * overridden through set_worker_desc().
         */
        strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

        list_del_init(&work->entry);

        /*
         * CPU intensive works don't participate in concurrency management.
         * They're the scheduler's responsibility.  This takes @worker out
         * of concurrency management and the next code block will chain
         * execution of the pending work items.
         */
        if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE);

        /*
         * Kick @pool if necessary. It's always noop for per-cpu worker pools
         * since nr_running would always be >= 1 at this point. This is used to
         * chain execution of the pending work items for WORKER_NOT_RUNNING
         * workers such as the UNBOUND and CPU_INTENSIVE ones.
         */
        kick_pool(pool);

        /*
         * Record the last pool and clear PENDING which should be the last
         * update to @work.  Also, do this inside @pool->lock so that
         * PENDING and queued state changes happen together while IRQ is
         * disabled.
         */
        set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool));

        pwq->stats[PWQ_STAT_STARTED]++;
        raw_spin_unlock_irq(&pool->lock);

        /* remember the nesting depths so a leak by the work function shows up */
        rcu_start_depth = rcu_preempt_depth();
        lockdep_start_depth = lockdep_depth(current);
        /* see drain_dead_softirq_workfn() */
        if (!bh_draining)
                lock_map_acquire(pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        /*
         * Strictly speaking we should mark the invariant state without holding
         * any locks, that is, before these two lock_map_acquire()'s.
         *
         * However, that would result in:
         *
         *   A(W1)
         *   WFC(C)
         *                A(W1)
         *                C(C)
         *
         * Which would create W1->C->W1 dependencies, even though there is no
         * actual deadlock possible. There are two solutions, using a
         * read-recursive acquire on the work(queue) 'locks', but this will then
         * hit the lockdep limitation on recursive locks, or simply discard
         * these locks.
         *
         * AFAICT there is no possible deadlock scenario between the
         * flush_work() and complete() primitives (except for single-threaded
         * workqueues), so hiding them isn't a problem.
         */
        lockdep_invariant_state(true);
        trace_workqueue_execute_start(work);
        worker->current_func(work);
        /*
         * While we must be careful to not use "work" after this, the trace
         * point will only record its address.
         */
        trace_workqueue_execute_end(work, worker->current_func);

        lock_map_release(&lockdep_map);
        if (!bh_draining)
                lock_map_release(pwq->wq->lockdep_map);

        /*
         * The work function must return with the same atomic, lockdep and
         * RCU nesting it was called with.  The atomic check is only applied
         * to workers that have a task.
         */
        if (unlikely((worker->task && in_atomic()) ||
                     lockdep_depth(current) != lockdep_start_depth ||
                     rcu_preempt_depth() != rcu_start_depth)) {
                pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n"
                       "     preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n",
                       current->comm, task_pid_nr(current), preempt_count(),
                       lockdep_start_depth, lockdep_depth(current),
                       rcu_start_depth, rcu_preempt_depth(),
                       worker->current_func);
                debug_show_held_locks(current);
                dump_stack();
        }

        /*
         * The following prevents a kworker from hogging CPU on !PREEMPTION
         * kernels, where a requeueing work item waiting for something to
         * happen could deadlock with stop_machine as such work item could
         * indefinitely requeue itself while all other CPUs are trapped in
         * stop_machine. At the same time, report a quiescent RCU state so
         * the same condition doesn't freeze RCU.
         */
        if (worker->task)
                cond_resched();

        raw_spin_lock_irq(&pool->lock);

        pwq->stats[PWQ_STAT_COMPLETED]++;

        /*
         * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
         * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
         * wq_cpu_intensive_thresh_us. Clear it.
         */
        worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

        /* tag the worker for identification in schedule() */
        worker->last_func = worker->current_func;

        /* we're done with it, release */
        hash_del(&worker->hentry);
        worker->current_work = NULL;
        worker->current_func = NULL;
        worker->current_pwq = NULL;
        worker->current_color = INT_MAX;

        /* must be the last step, see the function comment */
        pwq_dec_nr_in_flight(pwq, work_data);
}

/**
 * process_scheduled_works - process scheduled works
 * @worker: self
 *
 * Process all scheduled works.  The scheduled list may change while a work
 * is being processed, so the head of the list is re-fetched before every
 * execution rather than iterating over it.  The pool's watchdog timestamp
 * is refreshed once, right before the first work is executed.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.
 */
static void process_scheduled_works(struct worker *worker)
{
        bool stamped = false;

        for (;;) {
                struct work_struct *work =
                        list_first_entry_or_null(&worker->scheduled,
                                                 struct work_struct, entry);

                if (!work)
                        break;

                if (!stamped) {
                        worker->pool->watchdog_ts = jiffies;
                        stamped = true;
                }

                process_one_work(worker, work);
        }
}

/*
 * Set (@val true) or clear (@val false) PF_WQ_WORKER on the current task.
 *
 * The flag is updated under wq_pool_attach_mutex, so a holder of that mutex
 * (see idle_cull_fn()) keeps a worker from getting past this point.
 */
static void set_pf_worker(bool val)
{
        mutex_lock(&wq_pool_attach_mutex);

        if (!val)
                current->flags &= ~PF_WQ_WORKER;
        else
                current->flags |= PF_WQ_WORKER;

        mutex_unlock(&wq_pool_attach_mutex);
}

/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  All workers belong to a worker_pool -
 * either a per-cpu one or dynamic unbound one.  These workers process all
 * work items regardless of their specific target workqueue.  The only
 * exception is work items which belong to workqueues with a rescuer which
 * will be explained in rescuer_thread().
 *
 * The thread loops between processing the pool's worklist and sleeping in
 * %TASK_IDLE, and only returns once it finds %WORKER_DIE set on itself.
 *
 * Return: 0
 */
static int worker_thread(void *__worker)
{
        struct worker *worker = __worker;
        /* cached: still needed for ida_free() after worker->pool is cleared */
        struct worker_pool *pool = worker->pool;

        /* tell the scheduler that this is a workqueue worker */
        set_pf_worker(true);
woke_up:
        raw_spin_lock_irq(&pool->lock);

        /* am I supposed to die? */
        if (unlikely(worker->flags & WORKER_DIE)) {
                raw_spin_unlock_irq(&pool->lock);
                set_pf_worker(false);
                /*
                 * The worker is dead and PF_WQ_WORKER is cleared, worker->pool
                 * shouldn't be accessed, reset it to NULL in case otherwise.
                 */
                worker->pool = NULL;
                ida_free(&pool->worker_ida, worker->id);
                return 0;
        }

        worker_leave_idle(worker);
recheck:
        /* no more worker necessary? */
        if (!need_more_worker(pool))
                goto sleep;

        /* do we need to manage? */
        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;

        /*
         * ->scheduled list can only be filled while a worker is
         * preparing to process a work or actually processing it.
         * Make sure nobody diddled with it while I was sleeping.
         */
        WARN_ON_ONCE(!list_empty(&worker->scheduled));

        /*
         * Finish PREP stage.  We're guaranteed to have at least one idle
         * worker or that someone else has already assumed the manager
         * role.  This is where @worker starts participating in concurrency
         * management if applicable and concurrency management is restored
         * after being rebound.  See rebind_workers() for details.
         */
        worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

        /* take work from the head of the worklist for as long as keep_working() */
        do {
                struct work_struct *work =
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);

                if (assign_work(work, worker, NULL))
                        process_scheduled_works(worker);
        } while (keep_working(pool));

        /* back to PREP until the next round of work */
        worker_set_flags(worker, WORKER_PREP);
sleep:
        /*
         * pool->lock is held and there's no work to process and no need to
         * manage, sleep.  Workers are woken up only while holding
         * pool->lock or from local cpu, so setting the current state
         * before releasing pool->lock is enough to prevent losing any
         * event.
         */
        worker_enter_idle(worker);
        __set_current_state(TASK_IDLE);
        raw_spin_unlock_irq(&pool->lock);
        schedule();
        goto woke_up;
}

/**
 * rescuer_thread - the rescuer thread function
 * @__rescuer: self
 *
 * Workqueue rescuer thread function.  There's one rescuer for each
 * workqueue which has WQ_MEM_RECLAIM set.
 *
 * Regular work processing on a pool may block trying to create a new
 * worker which uses GFP_KERNEL allocation which has slight chance of
 * developing into deadlock if some works currently on the same queue
 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
 * the problem rescuer solves.
 *
 * When such condition is possible, the pool summons rescuers of all
 * workqueues which have works queued on the pool and let them process
 * those works so that forward progress can be guaranteed.
 *
 * This should happen rarely.
 *
 * Return: 0
 */
static int rescuer_thread(void *__rescuer)
{
        struct worker *rescuer = __rescuer;
        struct workqueue_struct *wq = rescuer->rescue_wq;
        bool should_stop;

        set_user_nice(current, RESCUER_NICE_LEVEL);

        /*
         * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
         * doesn't participate in concurrency management.
         */
        set_pf_worker(true);
repeat:
        /* set the sleep state before checking for work or a stop request */
        set_current_state(TASK_IDLE);

        /*
         * By the time the rescuer is requested to stop, the workqueue
         * shouldn't have any work pending, but @wq->maydays may still have
         * pwq(s) queued.  This can happen by non-rescuer workers consuming
         * all the work items before the rescuer got to them.  Go through
         * @wq->maydays processing before acting on should_stop so that the
         * list is always empty on exit.
         */
        should_stop = kthread_should_stop();

        /* see whether any pwq is asking for help */
        raw_spin_lock_irq(&wq_mayday_lock);

        while (!list_empty(&wq->maydays)) {
                struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
                                        struct pool_workqueue, mayday_node);
                struct worker_pool *pool = pwq->pool;
                struct work_struct *work, *n;

                /* there is work to do, undo the TASK_IDLE set at the loop top */
                __set_current_state(TASK_RUNNING);
                list_del_init(&pwq->mayday_node);

                raw_spin_unlock_irq(&wq_mayday_lock);

                /* temporarily join @pool to process its works */
                worker_attach_to_pool(rescuer, pool);

                raw_spin_lock_irq(&pool->lock);

                /*
                 * Slurp in all works issued via this workqueue and
                 * process'em.
                 */
                WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
                list_for_each_entry_safe(work, n, &pool->worklist, entry) {
                        if (get_work_pwq(work) == pwq &&
                            assign_work(work, rescuer, &n))
                                pwq->stats[PWQ_STAT_RESCUED]++;
                }

                if (!list_empty(&rescuer->scheduled)) {
                        process_scheduled_works(rescuer);

                        /*
                         * The above execution of rescued work items could
                         * have created more to rescue through
                         * pwq_activate_first_inactive() or chained
                         * queueing.  Let's put @pwq back on mayday list so
                         * that such back-to-back work items, which may be
                         * being used to relieve memory pressure, don't
                         * incur MAYDAY_INTERVAL delay inbetween.
                         */
                        if (pwq->nr_active && need_to_create_worker(pool)) {
                                raw_spin_lock(&wq_mayday_lock);
                                /*
                                 * Queue iff we aren't racing destruction
                                 * and somebody else hasn't queued it already.
                                 */
                                if (wq->rescuer && list_empty(&pwq->mayday_node)) {
                                        /*
                                         * Pin @pwq for the requeue; the iteration
                                         * that dequeues it again drops this ref.
                                         */
                                        get_pwq(pwq);
                                        list_add_tail(&pwq->mayday_node, &wq->maydays);
                                }
                                raw_spin_unlock(&wq_mayday_lock);
                        }
                }

                /*
                 * Leave this pool. Notify regular workers; otherwise, we end up
                 * with 0 concurrency and stalling the execution.
                 */
                kick_pool(pool);

                raw_spin_unlock_irq(&pool->lock);

                worker_detach_from_pool(rescuer);

                /*
                 * Put the reference grabbed by send_mayday().  @pool might
                 * go away any time after it.
                 */
                put_pwq_unlocked(pwq);

                raw_spin_lock_irq(&wq_mayday_lock);
        }

        raw_spin_unlock_irq(&wq_mayday_lock);

        /* the mayday list has been drained, now honor a stop request */
        if (should_stop) {
                __set_current_state(TASK_RUNNING);
                set_pf_worker(false);
                return 0;
        }

        /* rescuers should never participate in concurrency management */
        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
        schedule();
        goto repeat;
}

/*
 * Run pending work items of @worker's pool from the caller's context.
 *
 * The structure mirrors worker_thread(), minus the manager and sleep
 * handling.  Execution is bounded: the loop stops once the pool no longer
 * wants work processed, after BH_WORKER_RESTARTS rounds, or when
 * BH_WORKER_JIFFIES have passed, whichever comes first.  The pool is kicked
 * before returning.
 */
static void bh_worker(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;
        unsigned long deadline = jiffies + BH_WORKER_JIFFIES;
        int budget = BH_WORKER_RESTARTS;

        worker_lock_callback(pool);
        raw_spin_lock_irq(&pool->lock);
        worker_leave_idle(worker);

        /*
         * This function follows the structure of worker_thread(). See there for
         * explanations on each step.
         */
        if (!need_more_worker(pool))
                goto done;

        WARN_ON_ONCE(!list_empty(&worker->scheduled));
        worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

        do {
                struct work_struct *head =
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);

                if (assign_work(head, worker, NULL))
                        process_scheduled_works(worker);
        } while (keep_working(pool) &&
                 --budget && time_before(jiffies, deadline));

        worker_set_flags(worker, WORKER_PREP);
done:
        worker_enter_idle(worker);
        kick_pool(pool);
        raw_spin_unlock_irq(&pool->lock);
        worker_unlock_callback(pool);
}

/*
 * Run the BH worker of the current CPU's normal (@highpri false) or
 * high-priority (@highpri true) BH worker pool, if that pool has work.
 *
 * TODO: Convert all tasklet users to workqueue and use softirq directly.
 *
 * This is currently called from tasklet[_hi]action() and thus is also called
 * whenever there are tasklets to run. Let's do an early exit if there's nothing
 * queued. Once conversion from tasklet is complete, the need_more_worker() test
 * can be dropped.
 *
 * After full conversion, we'll add worker->softirq_action, directly use the
 * softirq action and obtain the worker pointer from the softirq_action pointer.
 */
void workqueue_softirq_action(bool highpri)
{
        struct worker_pool *pool =
                &per_cpu(bh_worker_pools, smp_processor_id())[highpri];
        struct worker *worker;

        if (!need_more_worker(pool))
                return;

        worker = list_first_entry(&pool->workers, struct worker, node);
        bh_worker(worker);
}

/*
 * Context for draining the BH work items of a pool whose CPU is dead.
 * Handed to drain_dead_softirq_workfn() through the embedded work item.
 */
struct wq_drain_dead_softirq_work {
        /* work item that runs drain_dead_softirq_workfn() */
        struct work_struct        work;
        /* the pool whose pending work items are to be executed */
        struct worker_pool        *pool;
        /* completed once the pool has no more work left to drain */
        struct completion        done;
};

/*
 * Work function which drains a dead CPU's BH pool from another CPU. @work is
 * embedded in a struct wq_drain_dead_softirq_work which names the pool to
 * drain and carries the completion to signal once the pool is empty.
 */
static void drain_dead_softirq_workfn(struct work_struct *work)
{
        struct wq_drain_dead_softirq_work *dead_work =
                container_of(work, struct wq_drain_dead_softirq_work, work);
        struct worker_pool *dead_pool = dead_work->pool;
        struct workqueue_struct *bh_wq;
        bool more_pending;

        /*
         * The CPU of @dead_pool is gone and its still pending work items are
         * executed from this BH work item on a different CPU. The pool can't
         * be kicked and, because the work execution path ends up nested, a
         * lockdep annotation has to be suppressed. %POOL_BH_DRAINING marks
         * the pool for these special treatments.
         */
        raw_spin_lock_irq(&dead_pool->lock);
        dead_pool->flags |= POOL_BH_DRAINING;
        raw_spin_unlock_irq(&dead_pool->lock);

        bh_worker(list_first_entry(&dead_pool->workers, struct worker, node));

        raw_spin_lock_irq(&dead_pool->lock);
        dead_pool->flags &= ~POOL_BH_DRAINING;
        more_pending = need_more_worker(dead_pool);
        raw_spin_unlock_irq(&dead_pool->lock);

        /* fully drained, let workqueue_softirq_dead() proceed */
        if (!more_pending) {
                complete(&dead_work->done);
                return;
        }

        /*
         * bh_worker() may have hit its consecutive execution limit and
         * bailed. Requeue ourselves on the matching BH workqueue and return
         * instead of hogging this CPU's BH.
         */
        if (dead_pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                bh_wq = system_bh_highpri_wq;
        else
                bh_wq = system_bh_wq;

        queue_work(bh_wq, work);
}

/*
 * @cpu is dead. Drain whatever BH work items remain on its pools from the
 * current CPU. Allocating a dead_work per CPU would avoid the flushing, but
 * then draining could overlap with the CPU coming back online or nest (one
 * CPU's dead_work queued on another CPU which is dead too, and so on). Keep it
 * simple and drain synchronously. These are BH work items which shouldn't be
 * requeued on the same pool, so this shouldn't take long.
 */
void workqueue_softirq_dead(unsigned int cpu)
{
        int idx;

        for (idx = 0; idx < NR_STD_WORKER_POOLS; idx++) {
                struct worker_pool *dead_pool =
                        &per_cpu(bh_worker_pools, cpu)[idx];
                struct wq_drain_dead_softirq_work dead_work;
                struct workqueue_struct *bh_wq;

                /* nothing pending on this pool */
                if (!need_more_worker(dead_pool))
                        continue;

                /* drain through the BH workqueue of the same priority */
                bh_wq = dead_pool->attrs->nice == HIGHPRI_NICE_LEVEL ?
                        system_bh_highpri_wq : system_bh_wq;

                INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn);
                dead_work.pool = dead_pool;
                init_completion(&dead_work.done);

                queue_work(bh_wq, &dead_work.work);

                /* the on-stack request must stay alive until it is done */
                wait_for_completion(&dead_work.done);
                destroy_work_on_stack(&dead_work.work);
        }
}

/**
 * check_flush_dependency - check for flush dependency sanity
 * @target_wq: workqueue being flushed
 * @target_work: work item being flushed (NULL for workqueue flushes)
 * @from_cancel: are we called from the work cancel path
 *
 * %current is trying to flush the whole @target_wq or @target_work on it.
 * Nothing is checked on the cancel path (which implies the work being flushed
 * is either already running, or will not be at all) or when @target_wq has
 * %WQ_MEM_RECLAIM. Otherwise, warn once if %current is reclaiming memory or
 * is running on a %WQ_MEM_RECLAIM workqueue, as waiting on a workqueue without
 * %WQ_MEM_RECLAIM from there can break the forward-progress guarantee and lead
 * to a deadlock.
 */
static void check_flush_dependency(struct workqueue_struct *target_wq,
                                   struct work_struct *target_work,
                                   bool from_cancel)
{
        struct workqueue_struct *flusher_wq;
        work_func_t target_func = NULL;
        struct worker *worker;

        if (from_cancel)
                return;
        if (target_wq->flags & WQ_MEM_RECLAIM)
                return;

        worker = current_wq_worker();
        if (target_work)
                target_func = target_work->func;

        WARN_ONCE(current->flags & PF_MEMALLOC,
                  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
                  current->pid, current->comm, target_wq->name, target_func);

        /* the remaining check only applies to workqueue workers */
        if (!worker)
                return;

        /* %WQ_MEM_RECLAIM workqueues flagged %__WQ_LEGACY are exempt */
        flusher_wq = worker->current_pwq->wq;
        WARN_ONCE((flusher_wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) ==
                  WQ_MEM_RECLAIM,
                  "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
                  flusher_wq->name, worker->current_func,
                  target_wq->name, target_func);
}

/*
 * Barrier work item used to wait for a target work item: insert_wq_barrier()
 * queues @work and wq_barrier_func() completes @done when it runs.
 */
struct wq_barrier {
        /* the barrier work item itself; its function is wq_barrier_func() */
        struct work_struct        work;
        /* signalled by wq_barrier_func() */
        struct completion        done;
        /* set to %current by insert_wq_barrier() */
        struct task_struct        *task;        /* purely informational */
};

/* Work function of a barrier: signal the completion of the enclosing wq_barrier. */
static void wq_barrier_func(struct work_struct *work)
{
        complete(&container_of(work, struct wq_barrier, work)->done);
}

/**
 * insert_wq_barrier - insert a barrier work
 * @pwq: pwq to insert barrier into
 * @barr: wq_barrier to insert
 * @target: target work to attach @barr to
 * @worker: worker currently executing @target, NULL if @target is not executing
 *
 * Link @barr to @target so that @barr is completed only after @target has
 * finished execution.  Note that this ordering is guaranteed only with
 * respect to @target and on the local cpu.
 *
 * A queued barrier currently can't be canceled.  try_to_grab_pending()
 * can't tell whether the work to be grabbed is at the head of the queue
 * and thus can't clear the LINKED flag of the previous work, while a work
 * with LINKED set must always be followed by a valid next work.
 *
 * When @worker is non-NULL, @target may be modified underneath us, so the
 * pwq can't be reliably determined from @target.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_wq_barrier(struct pool_workqueue *pwq,
                              struct wq_barrier *barr,
                              struct work_struct *target, struct worker *worker)
{
        static __maybe_unused struct lock_class_key bh_key, thr_key;
        /* The barrier work item does not participate in nr_active. */
        unsigned int work_flags = WORK_STRUCT_INACTIVE;
        struct list_head *insert_pos;
        unsigned int color;

        /*
         * Calling into debugobject here is safe even though pool->lock is
         * held: none of the checks can trigger, so the fixup functions,
         * where we might deadlock, are never called back.
         *
         * BH and threaded workqueues use separate lockdep keys so that
         * "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage" isn't
         * triggered spuriously.
         */
        INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func,
                              (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key);
        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));

        init_completion_map(&barr->done, &target->lockdep_map);
        barr->task = current;

        if (!worker) {
                /* @target is still queued, chain the barrier right after it */
                unsigned long *target_bits = work_data_bits(target);

                insert_pos = target->entry.next;
                color = get_work_color(*target_bits);
                /* there can already be other linked works, inherit and set */
                work_flags |= *target_bits & WORK_STRUCT_LINKED;
                __set_bit(WORK_STRUCT_LINKED_BIT, target_bits);
        } else {
                /* @target is being executed, schedule the barrier to @worker */
                insert_pos = worker->scheduled.next;
                color = worker->current_color;
        }

        pwq->nr_in_flight[color]++;

        insert_work(pwq, &barr->work, insert_pos,
                    work_flags | work_color_to_flags(color));
}

/**
 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
 * Prepare pwqs for workqueue flushing.
 *
 * With a non-negative @flush_color, flush_color on all pwqs should be -1.
 * Every pwq which has in-flight commands of that color gets its
 * pwq->flush_color set to @flush_color and is accounted in
 * @wq->nr_pwqs_to_flush, the pwq wakeup logic is armed and %true is
 * returned.  If no pwq has in-flight commands at the specified color, all
 * pwq->flush_color's stay at -1 and %false is returned.
 *
 * The caller should have initialized @wq->first_flusher prior to calling
 * this function with non-negative @flush_color.  With a negative
 * @flush_color, no flush color update is done and %false is returned.
 *
 * With a non-negative @work_color, all pwqs should have the same
 * work_color which is previous to @work_color and all are advanced to
 * @work_color.
 *
 * CONTEXT:
 * mutex_lock(wq->mutex).
 *
 * Return:
 * %true if @flush_color >= 0 and there's something to flush.  %false
 * otherwise.
 */
static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
                                      int flush_color, int work_color)
{
        const bool arm_flush = flush_color >= 0;
        const bool advance_color = work_color >= 0;
        struct worker_pool *locked_pool = NULL;
        struct pool_workqueue *pwq;
        bool wait = false;

        /*
         * Start the count at one so that it can't drop to zero before all
         * pwqs have been visited; this bias is dropped at the end.
         */
        if (arm_flush) {
                WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
                atomic_set(&wq->nr_pwqs_to_flush, 1);
        }

        /*
         * The pwqs of an unbound workqueue map to only a few pools and,
         * most of the time, pwqs sharing a pool are linked sequentially on
         * wq->pwqs by cpu index. The pool therefore stays the same across
         * the majority of iterations; only unlock/lock when it changes,
         * which largely cuts down on expensive lock operations.
         */
        for_each_pwq(pwq, wq) {
                struct worker_pool *pool = pwq->pool;

                if (pool != locked_pool) {
                        if (likely(locked_pool))
                                raw_spin_unlock_irq(&locked_pool->lock);
                        raw_spin_lock_irq(&pool->lock);
                        locked_pool = pool;
                }

                if (arm_flush) {
                        WARN_ON_ONCE(pwq->flush_color != -1);

                        if (pwq->nr_in_flight[flush_color]) {
                                pwq->flush_color = flush_color;
                                atomic_inc(&wq->nr_pwqs_to_flush);
                                wait = true;
                        }
                }

                if (advance_color) {
                        WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
                        pwq->work_color = work_color;
                }
        }

        if (locked_pool)
                raw_spin_unlock_irq(&locked_pool->lock);

        /* drop the initial bias; complete right away if nothing is left */
        if (arm_flush && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
                complete(&wq->first_flusher->done);

        return wait;
}

/*
 * Acquire and immediately release @wq's lockdep map. For %WQ_BH workqueues
 * this is done with BH disabled. Does nothing when @wq has no lockdep map or
 * when CONFIG_LOCKDEP is not enabled.
 */
static void touch_wq_lockdep_map(struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
        bool is_bh;

        if (unlikely(!wq->lockdep_map))
                return;

        is_bh = wq->flags & WQ_BH;
        if (is_bh)
                local_bh_disable();

        lock_map_acquire(wq->lockdep_map);
        lock_map_release(wq->lockdep_map);

        if (is_bh)
                local_bh_enable();
#endif
}

/*
 * Acquire and immediately release @work's lockdep map. If @wq is a %WQ_BH
 * workqueue this is done with BH disabled. Does nothing when CONFIG_LOCKDEP
 * is not enabled.
 */
static void touch_work_lockdep_map(struct work_struct *work,
                                   struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
        const bool is_bh = wq->flags & WQ_BH;

        if (is_bh)
                local_bh_disable();

        lock_map_acquire(&work->lockdep_map);
        lock_map_release(&work->lockdep_map);

        if (is_bh)
                local_bh_enable();
#endif
}

/**
 * __flush_workqueue - ensure that any scheduled work has run to completion.
 * @wq: workqueue to flush
 *
 * This function sleeps until all work items which were queued on entry
 * have finished execution, but it is not livelocked by new incoming ones.
 *
 * Warns and returns without flushing if called before workqueues are online
 * (!wq_online).
 */
void __flush_workqueue(struct workqueue_struct *wq)
{
        /*
         * This flusher lives on our stack. It is either installed as
         * @wq->first_flusher or linked on @wq->flusher_queue or
         * @wq->flusher_overflow below, and we sleep on its completion.
         */
        struct wq_flusher this_flusher = {
                .list = LIST_HEAD_INIT(this_flusher.list),
                .flush_color = -1,
                .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)),
        };
        int next_color;

        if (WARN_ON(!wq_online))
                return;

        touch_wq_lockdep_map(wq);

        mutex_lock(&wq->mutex);

        /*
         * Start-to-wait phase
         */
        next_color = work_next_color(wq->work_color);

        if (next_color != wq->flush_color) {
                /*
                 * Color space is not full.  The current work_color
                 * becomes our flush_color and work_color is advanced
                 * by one.
                 */
                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
                this_flusher.flush_color = wq->work_color;
                wq->work_color = next_color;

                if (!wq->first_flusher) {
                        /* no flush in progress, become the first flusher */
                        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

                        wq->first_flusher = &this_flusher;

                        if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
                                                       wq->work_color)) {
                                /* nothing to flush, done */
                                wq->flush_color = next_color;
                                wq->first_flusher = NULL;
                                goto out_unlock;
                        }
                } else {
                        /* wait in queue */
                        WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
                        list_add_tail(&this_flusher.list, &wq->flusher_queue);
                        /* only advance the pwqs' work_color, no flush color update */
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }
        } else {
                /*
                 * Oops, color space is full, wait on overflow queue.
                 * The next flush completion will assign us
                 * flush_color and transfer to flusher_queue.
                 */
                list_add_tail(&this_flusher.list, &wq->flusher_overflow);
        }

        check_flush_dependency(wq, NULL, false);

        mutex_unlock(&wq->mutex);

        /* sleep until our flusher is completed */
        wait_for_completion(&this_flusher.done);

        /*
         * Wake-up-and-cascade phase
         *
         * First flushers are responsible for cascading flushes and
         * handling overflow.  Non-first flushers can simply return.
         */
        if (READ_ONCE(wq->first_flusher) != &this_flusher)
                return;

        mutex_lock(&wq->mutex);

        /* we might have raced, check again with mutex held */
        if (wq->first_flusher != &this_flusher)
                goto out_unlock;

        WRITE_ONCE(wq->first_flusher, NULL);

        WARN_ON_ONCE(!list_empty(&this_flusher.list));
        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

        while (true) {
                struct wq_flusher *next, *tmp;

                /* complete all the flushers sharing the current flush color */
                list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
                        if (next->flush_color != wq->flush_color)
                                break;
                        list_del_init(&next->list);
                        complete(&next->done);
                }

                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
                             wq->flush_color != work_next_color(wq->work_color));

                /* this flush_color is finished, advance by one */
                wq->flush_color = work_next_color(wq->flush_color);

                /* one color has been freed, handle overflow queue */
                if (!list_empty(&wq->flusher_overflow)) {
                        /*
                         * Assign the same color to all overflowed
                         * flushers, advance work_color and append to
                         * flusher_queue.  This is the start-to-wait
                         * phase for these overflowed flushers.
                         */
                        list_for_each_entry(tmp, &wq->flusher_overflow, list)
                                tmp->flush_color = wq->work_color;

                        wq->work_color = work_next_color(wq->work_color);

                        list_splice_tail_init(&wq->flusher_overflow,
                                              &wq->flusher_queue);
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }

                if (list_empty(&wq->flusher_queue)) {
                        WARN_ON_ONCE(wq->flush_color != wq->work_color);
                        break;
                }

                /*
                 * Need to flush more colors.  Make the next flusher
                 * the new first flusher and arm pwqs.
                 */
                WARN_ON_ONCE(wq->flush_color == wq->work_color);
                WARN_ON_ONCE(wq->flush_color != next->flush_color);

                list_del_init(&next->list);
                wq->first_flusher = next;

                /* something is in flight for this color, the new first flusher takes over */
                if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
                        break;

                /*
                 * Meh... this color is already done, clear first
                 * flusher and repeat cascading.
                 */
                wq->first_flusher = NULL;
        }

out_unlock:
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL(__flush_workqueue);

/**
 * drain_workqueue - drain a workqueue
 * @wq: workqueue to drain
 *
 * Wait until the workqueue becomes empty.  While draining is in progress,
 * only chain queueing is allowed.  IOW, only currently pending or running
 * work items on @wq can queue further work items on it.  @wq is flushed
 * repeatedly until it becomes empty.  The number of flushing is determined
 * by the depth of chaining and should be relatively short.  Whine if it
 * takes too long.
 */
void drain_workqueue(struct workqueue_struct *wq)
{
        unsigned int flush_cnt = 0;
        struct pool_workqueue *pwq;

        /*
         * __queue_work() needs to test whether there are drainers, is much
         * hotter than drain_workqueue() and already looks at @wq->flags.
         * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
         */
        mutex_lock(&wq->mutex);
        if (!wq->nr_drainers++)
                wq->flags |= __WQ_DRAINING;
        mutex_unlock(&wq->mutex);
reflush:
        __flush_workqueue(wq);

        mutex_lock(&wq->mutex);

        for_each_pwq(pwq, wq) {
                bool drained;

                raw_spin_lock_irq(&pwq->pool->lock);
                drained = pwq_is_empty(pwq);
                raw_spin_unlock_irq(&pwq->pool->lock);

                if (drained)
                        continue;

                if (++flush_cnt == 10 ||
                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
                        pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
                                wq->name, __func__, flush_cnt);

                mutex_unlock(&wq->mutex);
                goto reflush;
        }

        if (!--wq->nr_drainers)
                wq->flags &= ~__WQ_DRAINING;
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(drain_workqueue);

/*
 * Queue @barr behind @work so that the caller can wait for @work to finish.
 *
 * Returns %true if the barrier was inserted and the caller must wait on
 * @barr->done. Returns %false if @work has no pool, is queued on a pwq of a
 * different pool, or is neither queued nor being executed by a worker of its
 * pool; @barr is left untouched in those cases.
 */
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                             bool from_cancel)
{
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;
        struct worker_pool *pool;
        struct worker *worker = NULL;
        bool started = false;

        rcu_read_lock();

        pool = get_work_pool(work);
        if (!pool)
                goto out_rcu;

        raw_spin_lock_irq(&pool->lock);
        /* see the comment in try_to_grab_pending() with the same code */
        pwq = get_work_pwq(work);
        if (!pwq) {
                /* not queued, but it may be running on one of @pool's workers */
                worker = find_worker_executing_work(pool, work);
                if (!worker)
                        goto out_unlock;
                pwq = worker->current_pwq;
        } else if (unlikely(pwq->pool != pool)) {
                goto out_unlock;
        }

        wq = pwq->wq;
        check_flush_dependency(wq, work, from_cancel);

        insert_wq_barrier(pwq, barr, work, worker);
        raw_spin_unlock_irq(&pool->lock);

        touch_work_lockdep_map(work, wq);

        /*
         * Force a lock recursion deadlock when using flush_work() inside a
         * single-threaded or rescuer equipped workqueue.
         *
         * With single threaded workqueues, the deadlock happens when the
         * work is after the work issuing the flush_work(). With rescuer
         * equipped workqueues, it happens when the rescuer stalls, blocking
         * forward progress.
         */
        if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer))
                touch_wq_lockdep_map(wq);

        started = true;
        goto out_rcu;

out_unlock:
        /* @work is already gone */
        raw_spin_unlock_irq(&pool->lock);
out_rcu:
        rcu_read_unlock();
        return started;
}

/*
 * Wait for @work to finish executing. @from_cancel tells whether we are
 * called from the work cancel path.
 *
 * Returns %true if a barrier was queued and waited for, %false if there was
 * nothing to wait for (or if called before workqueues are online or on a work
 * item without a function, both of which trigger a warning).
 */
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
        struct wq_barrier barr;
        bool bh_busy_wait = false;

        if (WARN_ON(!wq_online))
                return false;

        if (WARN_ON(!work->func))
                return false;

        if (!start_flush_work(work, &barr, from_cancel))
                return false;

        /*
         * start_flush_work() returned %true. With @from_cancel set, @work
         * must have been executing during start_flush_work() and can't
         * currently be queued, so its data must contain OFFQ bits. If @work
         * was queued on a BH workqueue, it was also running in the BH
         * context and thus can be busy-waited.
         */
        if (from_cancel) {
                unsigned long data = *work_data_bits(work);

                bh_busy_wait = !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) &&
                               (data & WORK_OFFQ_BH);
        }

        if (bh_busy_wait) {
                /*
                 * On RT, prevent a live lock when %current preempted
                 * soft interrupt processing by blocking on lock which
                 * is owned by the thread invoking the callback.
                 */
                while (!try_wait_for_completion(&barr.done)) {
                        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                                struct worker_pool *pool;

                                guard(rcu)();
                                pool = get_work_pool(work);
                                if (pool)
                                        workqueue_callback_cancel_wait_running(pool);
                        } else {
                                cpu_relax();
                        }
                }
        } else {
                wait_for_completion(&barr.done);
        }

        destroy_work_on_stack(&barr.work);
        return true;
}

/**
 * flush_work - wait for a work to finish executing the last queueing instance
 * @work: the work to flush
 *
 * Wait until @work has finished execution.  On return, @work is guaranteed
 * to be idle provided that it hasn't been requeued since the flush started.
 * May sleep.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_work(struct work_struct *work)
{
        bool waited;

        might_sleep();
        waited = __flush_work(work, false);

        return waited;
}
EXPORT_SYMBOL_GPL(flush_work);

/**
 * flush_delayed_work - wait for a dwork to finish executing the last queueing
 * @dwork: the delayed work to flush
 *
 * The delayed timer is cancelled and, if it was pending, the work is queued
 * for immediate execution.  Like flush_work(), this function only considers
 * the last queueing instance of @dwork.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        /* the timer is cancelled and the work requeued with IRQs disabled */
        local_irq_disable();
        if (timer_delete_sync(&dwork->timer))
                __queue_work(dwork->cpu, dwork->wq, work);
        local_irq_enable();

        return flush_work(work);
}
EXPORT_SYMBOL(flush_delayed_work);

/**
 * flush_rcu_work - wait for a rwork to finish executing the last queueing
 * @rwork: the rcu work to flush
 *
 * If @rwork is marked pending, rcu_barrier() is called first and the work is
 * then flushed.
 *
 * Return:
 * %true if flush_rcu_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_rcu_work(struct rcu_work *rwork)
{
        struct work_struct *work = &rwork->work;

        /* not pending, a plain flush is all that's needed */
        if (!test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return flush_work(work);

        rcu_barrier();
        flush_work(work);
        return true;
}
EXPORT_SYMBOL(flush_rcu_work);

/*
 * Bump the disable count in @offqd. The count is capped at
 * (1 << WORK_OFFQ_DISABLE_BITS) - 1; an attempt to go past that leaves the
 * count unchanged and warns once.
 */
static void work_offqd_disable(struct work_offq_data *offqd)
{
        const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1;

        if (likely(offqd->disable < max)) {
                offqd->disable++;
                return;
        }

        WARN_ONCE(true, "workqueue: work disable count overflowed\n");
}

/*
 * Drop the disable count in @offqd by one. If the count isn't positive, it
 * is left unchanged and a one-time warning is issued.
 */
static void work_offqd_enable(struct work_offq_data *offqd)
{
        if (likely(offqd->disable > 0)) {
                offqd->disable--;
                return;
        }

        WARN_ONCE(true, "workqueue: work disable count underflowed\n");
}

/*
 * Common implementation of the non-waiting cancel and disable operations.
 *
 * Grabs @work's pending state with work_grab_pending(), bumps the disable
 * count if %WORK_CANCEL_DISABLE is set in @cflags, then writes the pool id
 * and the repacked off-queue flags back while clearing the pending state.
 * The IRQ state saved by work_grab_pending() is restored before returning.
 *
 * Returns the result of work_grab_pending().
 */
static bool __cancel_work(struct work_struct *work, u32 cflags)
{
        struct work_offq_data offqd;
        unsigned long irq_flags;
        bool was_pending;

        was_pending = work_grab_pending(work, cflags, &irq_flags);

        work_offqd_unpack(&offqd, *work_data_bits(work));
        if (cflags & WORK_CANCEL_DISABLE)
                work_offqd_disable(&offqd);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));

        local_irq_restore(irq_flags);

        return was_pending;
}

/*
 * Common implementation of the waiting cancel and disable operations.
 *
 * @work is always cancelled with %WORK_CANCEL_DISABLE set and then flushed.
 * Unless the caller asked for %WORK_CANCEL_DISABLE itself, it is enabled
 * again before returning.
 *
 * Returns what __cancel_work() returned.
 */
static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
{
        const bool keep_disabled = cflags & WORK_CANCEL_DISABLE;
        bool was_pending;

        was_pending = __cancel_work(work, cflags | WORK_CANCEL_DISABLE);

        /* BH work items may be waited for from non-hardirq atomic contexts */
        if (!(*work_data_bits(work) & WORK_OFFQ_BH))
                might_sleep();
        else
                WARN_ON_ONCE(in_hardirq());

        /*
         * During early boot, @work is known not to be executing, so
         * __flush_work() is skipped. This allows canceling during early boot.
         */
        if (wq_online)
                __flush_work(work, true);

        if (!keep_disabled)
                enable_work(work);

        return was_pending;
}

/*
 * Counterpart of cancel_delayed_work() for plain work items. See
 * cancel_delayed_work().
 */
bool cancel_work(struct work_struct *work)
{
        bool was_pending = __cancel_work(work, 0);

        return was_pending;
}
EXPORT_SYMBOL(cancel_work);

/**
 * cancel_work_sync - cancel a work and wait for it to finish
 * @work: the work to cancel
 *
 * Cancel @work and wait for its execution to finish. This works even if the
 * work re-queues itself or migrates to another workqueue. As long as there
 * aren't racing enqueues, @work is guaranteed to be neither pending nor
 * executing on any CPU when this function returns.
 *
 * Do not use cancel_work_sync(&delayed_work->work) on delayed_work's; use
 * cancel_delayed_work_sync() instead.
 *
 * If @work was last queued on a non-BH workqueue, this must be called from a
 * sleepable context. If it was last queued on a BH workqueue, this can also
 * be called from non-hardirq atomic contexts including BH.
 *
 * Returns %true if @work was pending, %false otherwise.
 */
bool cancel_work_sync(struct work_struct *work)
{
        bool was_pending = __cancel_work_sync(work, 0);

        return was_pending;
}
EXPORT_SYMBOL_GPL(cancel_work_sync);

/**
 * cancel_delayed_work - cancel a delayed work
 * @dwork: delayed_work to cancel
 *
 * Kill off a pending delayed_work.
 *
 * Return: %true if @dwork was pending and canceled; %false if it wasn't
 * pending.
 *
 * Note:
 * The work callback function may still be running on return, unless
 * it returns %true and the work doesn't re-arm itself.  To wait on it,
 * flush explicitly or use cancel_delayed_work_sync().
 *
 * This function is safe to call from any context including IRQ handler.
 */
bool cancel_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        return __cancel_work(work, WORK_CANCEL_DELAYED);
}
EXPORT_SYMBOL(cancel_delayed_work);

/**
 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 * @dwork: the delayed work cancel
 *
 * The delayed work version of cancel_work_sync().
 *
 * Return:
 * %true if @dwork was pending, %false otherwise.
 */
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        return __cancel_work_sync(work, WORK_CANCEL_DELAYED);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);

/**
 * disable_work - Disable and cancel a work item
 * @work: work item to disable
 *
 * Disable @work by incrementing its disable count and cancel it if currently
 * pending. As long as the disable count is non-zero, any attempt to queue @work
 * will fail and return %false. The disable count is capped at 2 to the power
 * of %WORK_OFFQ_DISABLE_BITS minus one (see work_offqd_disable()); disabling
 * beyond that is not counted and triggers a one-time warning.
 *
 * Can be called from any context. Returns %true if @work was pending, %false
 * otherwise.
 */
bool disable_work(struct work_struct *work)
{
        bool was_pending = __cancel_work(work, WORK_CANCEL_DISABLE);

        return was_pending;
}
EXPORT_SYMBOL_GPL(disable_work);

/**
 * disable_work_sync - Disable, cancel and drain a work item
 * @work: work item to disable
 *
 * Like disable_work(), but additionally waits for @work to finish if it is
 * currently executing.
 *
 * If @work was last queued on a non-BH workqueue, this must be called from a
 * sleepable context. If it was last queued on a BH workqueue, this can also
 * be called from non-hardirq atomic contexts including BH.
 *
 * Returns %true if @work was pending, %false otherwise.
 */
bool disable_work_sync(struct work_struct *work)
{
        bool was_pending = __cancel_work_sync(work, WORK_CANCEL_DISABLE);

        return was_pending;
}
EXPORT_SYMBOL_GPL(disable_work_sync);

/**
 * enable_work - Enable a work item
 * @work: work item to enable
 *
 * Undo disable_work[_sync]() by decrementing @work's disable count. @work can
 * only be queued if its disable count is 0.
 *
 * Can be called from any context. Returns %true if the disable count reached 0.
 * Otherwise, %false.
 */
bool enable_work(struct work_struct *work)
{
        struct work_offq_data offqd;
        unsigned long irq_flags;
        bool enabled;

        work_grab_pending(work, 0, &irq_flags);

        /* decrement the count and write the off-queue data back */
        work_offqd_unpack(&offqd, *work_data_bits(work));
        work_offqd_enable(&offqd);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));

        local_irq_restore(irq_flags);

        enabled = !offqd.disable;
        return enabled;
}
EXPORT_SYMBOL_GPL(enable_work);

/**
 * disable_delayed_work - Disable and cancel a delayed work item
 * @dwork: delayed work item to disable
 *
 * disable_work() for delayed work items.
 */
bool disable_delayed_work(struct delayed_work *dwork)
{
        return __cancel_work(&dwork->work,
                             WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_delayed_work);

/**
 * disable_delayed_work_sync - Disable, cancel and drain a delayed work item
 * @dwork: delayed work item to disable
 *
 * disable_work_sync() for delayed work items.
 */
bool disable_delayed_work_sync(struct delayed_work *dwork)
{
        return __cancel_work_sync(&dwork->work,
                                  WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_delayed_work_sync);

/**
 * enable_delayed_work - Enable a delayed work item
 * @dwork: delayed work item to enable
 *
 * The delayed work version of enable_work().
 */
bool enable_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        return enable_work(work);
}
EXPORT_SYMBOL_GPL(enable_delayed_work);

/**
 * schedule_on_each_cpu - execute a function synchronously on each online CPU
 * @func: the function to call
 *
 * Queues one work item running @func on every online CPU with
 * schedule_work_on() and then flushes each of them, so the call only returns
 * once all CPUs have completed. schedule_on_each_cpu() is very slow.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int schedule_on_each_cpu(work_func_t func)
{
        struct work_struct __percpu *pcpu_works;
        int cpu;

        pcpu_works = alloc_percpu(struct work_struct);
        if (!pcpu_works)
                return -ENOMEM;

        /* The hotplug read lock is held across both passes below. */
        cpus_read_lock();

        /* Pass 1: set up and queue one work item per online CPU. */
        for_each_online_cpu(cpu) {
                struct work_struct *work = per_cpu_ptr(pcpu_works, cpu);

                INIT_WORK(work, func);
                schedule_work_on(cpu, work);
        }

        /* Pass 2: wait for every one of them to finish. */
        for_each_online_cpu(cpu) {
                flush_work(per_cpu_ptr(pcpu_works, cpu));
        }

        cpus_read_unlock();

        free_percpu(pcpu_works);
        return 0;
}

/**
 * execute_in_process_context - reliably execute the routine with user context
 * @fn:                the function to execute
 * @ew:                guaranteed storage for the execute work structure (must
 *                be available when the work executes)
 *
 * Calls @fn directly when in_interrupt() is false; otherwise initializes
 * @ew->work with @fn and schedules it for later execution.
 *
 * Return:        0 - function was executed
 *                1 - function was scheduled for execution
 */
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
{
        /* Cannot run @fn here: defer it through the work item in @ew. */
        if (in_interrupt()) {
                INIT_WORK(&ew->work, fn);
                schedule_work(&ew->work);
                return 1;
        }

        /* Safe to run right away. */
        fn(&ew->work);
        return 0;
}
EXPORT_SYMBOL_GPL(execute_in_process_context);

/**
 * free_workqueue_attrs - free a workqueue_attrs
 * @attrs: workqueue_attrs to free, may be %NULL
 *
 * Undo alloc_workqueue_attrs(): releases both cpumasks and then @attrs
 * itself. A %NULL @attrs is ignored.
 */
void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
        if (!attrs)
                return;

        free_cpumask_var(attrs->cpumask);
        free_cpumask_var(attrs->__pod_cpumask);
        kfree(attrs);
}

/**
 * alloc_workqueue_attrs - allocate a workqueue_attrs
 *
 * Allocate a zeroed workqueue_attrs together with its two cpumasks, set
 * ->cpumask to cpu_possible_mask and ->affn_scope to %WQ_AFFN_DFL, and
 * return it.
 *
 * Return: The newly allocated workqueue_attrs on success. %NULL on failure.
 */
struct workqueue_attrs *alloc_workqueue_attrs_noprof(void)
{
        struct workqueue_attrs *attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);

        if (!attrs)
                return NULL;

        /*
         * free_workqueue_attrs() copes with a partially set up @attrs, so
         * both cpumask failures share one unwind.
         */
        if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL) ||
            !alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) {
                free_workqueue_attrs(attrs);
                return NULL;
        }

        cpumask_copy(attrs->cpumask, cpu_possible_mask);
        attrs->affn_scope = WQ_AFFN_DFL;

        return attrs;
}

/*
 * Copy all attribute fields of @from into @to. The cpumasks are copied by
 * content into the masks @to already owns.
 */
static void copy_workqueue_attrs(struct workqueue_attrs *to,
                                 const struct workqueue_attrs *from)
{
        /* cpumasks */
        cpumask_copy(to->cpumask, from->cpumask);
        cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);

        /* scalar attributes */
        to->nice = from->nice;
        to->affn_strict = from->affn_strict;

        /*
         * The wq-only fields are copied as well: unlike the hash and equality
         * helpers, copying serves both pool and wq attrs and must not drop
         * them. get_unbound_pool() explicitly clears them afterwards.
         */
        to->ordered = from->ordered;
        to->affn_scope = from->affn_scope;
}

/*
 * Reset the workqueue-only fields of @attrs so that it can describe a
 * worker_pool. See the comments in 'struct workqueue_attrs' definition.
 */
static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
{
        attrs->ordered = false;
        attrs->affn_scope = WQ_AFFN_NR_TYPES;

        /* With strict affinity, ->cpumask is reset to all possible CPUs. */
        if (attrs->affn_strict)
                cpumask_copy(attrs->cpumask, cpu_possible_mask);
}

/*
 * Hash value of the content of @attrs. Covers ->nice, ->affn_strict and
 * ->__pod_cpumask; ->cpumask is mixed in only when affinity is not strict.
 */
static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
{
        /* number of bytes hashed out of each cpumask */
        const size_t mask_bytes = BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long);
        u32 h;

        h = jhash_1word(attrs->nice, 0);
        h = jhash_1word(attrs->affn_strict, h);
        h = jhash(cpumask_bits(attrs->__pod_cpumask), mask_bytes, h);

        if (attrs->affn_strict)
                return h;

        return jhash(cpumask_bits(attrs->cpumask), mask_bytes, h);
}

/*
 * Content equality test. ->nice, ->affn_strict and ->__pod_cpumask must
 * always match; ->cpumask is compared only when affinity is not strict.
 */
static bool wqattrs_equal(const struct workqueue_attrs *a,
                          const struct workqueue_attrs *b)
{
        return a->nice == b->nice &&
               a->affn_strict == b->affn_strict &&
               cpumask_equal(a->__pod_cpumask, b->__pod_cpumask) &&
               (a->affn_strict || cpumask_equal(a->cpumask, b->cpumask));
}

/*
 * Update @attrs with actually available CPUs: restrict @attrs->cpumask to
 * @unbound_cpumask, falling back to @unbound_cpumask itself when the two do
 * not overlap.
 */
static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
                                      const cpumask_t *unbound_cpumask)
{
        cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask);

        /* common case: the intersection is non-empty and is kept as is */
        if (!unlikely(cpumask_empty(attrs->cpumask)))
                return;

        cpumask_copy(attrs->cpumask, unbound_cpumask);
}

/*
 * Find the wq_pod_type to use for @attrs. %WQ_AFFN_DFL resolves to the
 * current wq_affn_dfl. If the selected type has no pods, or @attrs carries
 * the invalid %WQ_AFFN_NR_TYPES scope, the %WQ_AFFN_SYSTEM type is used.
 */
static const struct wq_pod_type *
wqattrs_pod_type(const struct workqueue_attrs *attrs)
{
        enum wq_affn_scope scope;
        struct wq_pod_type *pt;

        /* to synchronize access to wq_affn_dfl */
        lockdep_assert_held(&wq_pool_mutex);

        scope = attrs->affn_scope == WQ_AFFN_DFL ? wq_affn_dfl
                                                  : attrs->affn_scope;
        pt = &wq_pod_types[scope];

        if (WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) ||
            !likely(pt->nr_pods)) {
                /*
                 * Before workqueue_init_topology(), only SYSTEM is available
                 * which is initialized in workqueue_init_early().
                 */
                pt = &wq_pod_types[WQ_AFFN_SYSTEM];
                BUG_ON(!pt->nr_pods);
        }

        return pt;
}

/**
 * init_worker_pool - initialize a newly zalloc'd worker_pool
 * @pool: worker_pool to initialize
 *
 * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
 *
 * Return: 0 on success, -errno on failure.  Even on failure, all fields
 * inside @pool proper are initialized and put_unbound_pool() can be called
 * on @pool safely to release it.
 */
static int init_worker_pool(struct worker_pool *pool)
{
        raw_spin_lock_init(&pool->lock);
        /* no ID yet; put_unbound_pool() only releases IDs that are >= 0 */
        pool->id = -1;
        /* negative cpu; put_unbound_pool() refuses pools with cpu >= 0 */
        pool->cpu = -1;
        pool->node = NUMA_NO_NODE;
        pool->flags |= POOL_DISASSOCIATED;
        pool->watchdog_ts = jiffies;
        INIT_LIST_HEAD(&pool->worklist);
        INIT_LIST_HEAD(&pool->idle_list);
        hash_init(pool->busy_hash);

        /* idle handling: deferrable timer plus the cull work item */
        timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
        INIT_WORK(&pool->idle_cull_work, idle_cull_fn);

        timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);

        INIT_LIST_HEAD(&pool->workers);

        ida_init(&pool->worker_ida);
        INIT_HLIST_NODE(&pool->hash_node);
        /* initial reference, dropped through put_unbound_pool() */
        pool->refcnt = 1;
#ifdef CONFIG_PREEMPT_RT
        spin_lock_init(&pool->cb_lock);
#endif

        /* shouldn't fail above this point */
        pool->attrs = alloc_workqueue_attrs();
        if (!pool->attrs)
                return -ENOMEM;

        /* pool attrs never carry the workqueue-only fields */
        wqattrs_clear_for_pool(pool->attrs);

        return 0;
}

#ifdef CONFIG_LOCKDEP
/*
 * Register @wq's lockdep key and initialize its lockdep map. The map is named
 * "(wq_completion)" followed by the workqueue name; if that string cannot be
 * allocated, @wq->name itself is used as the lock name.
 */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
        lockdep_register_key(&wq->key);

        wq->lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
        if (!wq->lock_name)
                wq->lock_name = wq->name;

        wq->lockdep_map = &wq->__lockdep_map;
        lockdep_init_map(wq->lockdep_map, wq->lock_name, &wq->key, 0);
}

/*
 * Unregister @wq's lockdep key, but only when @wq uses its own embedded
 * lockdep map (the one wq_init_lockdep() installs).
 */
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
        if (wq->lockdep_map == &wq->__lockdep_map)
                lockdep_unregister_key(&wq->key);
}

/*
 * Free the lock name set up by wq_init_lockdep(). Nothing is freed when @wq
 * does not use its embedded lockdep map, or when the lock name is just
 * @wq->name (the fallback taken when the name allocation failed).
 */
static void wq_free_lockdep(struct workqueue_struct *wq)
{
        if (wq->lockdep_map == &wq->__lockdep_map &&
            wq->lock_name != wq->name)
                kfree(wq->lock_name);
}
#else
/* !CONFIG_LOCKDEP stub: there is no lockdep state to set up. */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
}

/* !CONFIG_LOCKDEP stub: there is no lockdep key to unregister. */
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
}

/* !CONFIG_LOCKDEP stub: there is no lock name to free. */
static void wq_free_lockdep(struct workqueue_struct *wq)
{
}
#endif

/*
 * Free every entry of @nna_ar - one per node plus the fallback entry at index
 * nr_node_ids - and reset each slot to NULL.
 */
static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
{
        int nid;

        /* per-node entries */
        for_each_node(nid) {
                kfree(nna_ar[nid]);
                nna_ar[nid] = NULL;
        }

        /* fallback entry */
        kfree(nna_ar[nr_node_ids]);
        nna_ar[nr_node_ids] = NULL;
}

/*
 * Put @nna into its initial state: zero active count, ->max set to
 * WQ_DFL_MIN_ACTIVE, lock initialized and an empty pending_pwqs list.
 */
static void init_node_nr_active(struct wq_node_nr_active *nna)
{
        raw_spin_lock_init(&nna->lock);
        INIT_LIST_HEAD(&nna->pending_pwqs);

        atomic_set(&nna->nr, 0);
        nna->max = WQ_DFL_MIN_ACTIVE;
}

/*
 * Allocate and initialize one wq_node_nr_active per node plus a fallback
 * entry stored at index nr_node_ids.
 *
 * Each node's nr_active counter will be accessed mostly from its own node and
 * should be allocated in the node.
 *
 * Returns 0 on success. On allocation failure everything in @nna_ar is freed
 * again and -ENOMEM is returned.
 */
static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
{
        struct wq_node_nr_active *cnt;
        int nid;

        for_each_node(nid) {
                cnt = kzalloc_node(sizeof(*cnt), GFP_KERNEL, nid);
                if (!cnt)
                        goto err_free;

                init_node_nr_active(cnt);
                nna_ar[nid] = cnt;
        }

        /* [nr_node_ids] is used as the fallback */
        cnt = kzalloc_node(sizeof(*cnt), GFP_KERNEL, NUMA_NO_NODE);
        if (!cnt)
                goto err_free;

        init_node_nr_active(cnt);
        nna_ar[nr_node_ids] = cnt;

        return 0;

err_free:
        free_node_nr_active(nna_ar);
        return -ENOMEM;
}

/*
 * RCU callback which frees the workqueue_struct embedding @rcu along with
 * the resources hanging off it.
 */
static void rcu_free_wq(struct rcu_head *rcu)
{
        struct workqueue_struct *wq;

        wq = container_of(rcu, struct workqueue_struct, rcu);

        /* only unbound workqueues have their node_nr_active entries freed */
        if (wq->flags & WQ_UNBOUND)
                free_node_nr_active(wq->node_nr_active);

        wq_free_lockdep(wq);
        free_percpu(wq->cpu_pwq);
        free_workqueue_attrs(wq->unbound_attrs);

        kfree(wq);
}

/*
 * RCU callback which frees the worker_pool embedding @rcu: its worker IDA,
 * its attrs and finally the pool itself.
 */
static void rcu_free_pool(struct rcu_head *rcu)
{
        struct worker_pool *pool;

        pool = container_of(rcu, struct worker_pool, rcu);

        ida_destroy(&pool->worker_ida);
        free_workqueue_attrs(pool->attrs);

        kfree(pool);
}

/**
 * put_unbound_pool - put a worker_pool
 * @pool: worker_pool to put
 *
 * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
 * safe manner.  get_unbound_pool() calls this function on its failure path
 * and this function should be able to release pools which went through,
 * successfully or not, init_worker_pool().
 *
 * Should be called with wq_pool_mutex held.
 */
static void put_unbound_pool(struct worker_pool *pool)
{
        struct worker *worker;
        /* workers marked dying below are gathered here for detach + reap */
        LIST_HEAD(cull_list);

        lockdep_assert_held(&wq_pool_mutex);

        /* drop one reference; only the final put tears the pool down */
        if (--pool->refcnt)
                return;

        /* sanity checks */
        if (WARN_ON(!(pool->cpu < 0)) ||
            WARN_ON(!list_empty(&pool->worklist)))
                return;

        /* release id and unhash */
        if (pool->id >= 0)
                idr_remove(&worker_pool_idr, pool->id);
        hash_del(&pool->hash_node);

        /*
         * Become the manager and destroy all workers.  This prevents
         * @pool's workers from blocking on attach_mutex.  We're the last
         * manager and @pool gets freed with the flag set.
         *
         * Having a concurrent manager is quite unlikely to happen as we can
         * only get here with
         *   pwq->refcnt == pool->refcnt == 0
         * which implies no work queued to the pool, which implies no worker can
         * become the manager. However a worker could have taken the role of
         * manager before the refcnts dropped to 0, since maybe_create_worker()
         * drops pool->lock
         */
        while (true) {
                rcuwait_wait_event(&manager_wait,
                                   !(pool->flags & POOL_MANAGER_ACTIVE),
                                   TASK_UNINTERRUPTIBLE);

                /*
                 * Recheck the flag with both locks held; on success the loop
                 * is left with wq_pool_attach_mutex and pool->lock still held.
                 */
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);
                if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
                        pool->flags |= POOL_MANAGER_ACTIVE;
                        break;
                }
                raw_spin_unlock_irq(&pool->lock);
                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* mark every idle worker dying; no worker is expected to remain */
        while ((worker = first_idle_worker(pool)))
                set_worker_dying(worker, &cull_list);
        WARN_ON(pool->nr_workers || pool->nr_idle);
        raw_spin_unlock_irq(&pool->lock);

        /* detach happens under wq_pool_attach_mutex, reaping after dropping it */
        detach_dying_workers(&cull_list);

        mutex_unlock(&wq_pool_attach_mutex);

        reap_dying_workers(&cull_list);

        /* shut down the timers */
        timer_delete_sync(&pool->idle_timer);
        cancel_work_sync(&pool->idle_cull_work);
        timer_delete_sync(&pool->mayday_timer);

        /* RCU protected to allow dereferences from get_work_pool() */
        call_rcu(&pool->rcu, rcu_free_pool);
}

/**
 * get_unbound_pool - get a worker_pool with the specified attributes
 * @attrs: the attributes of the worker_pool to get
 *
 * Obtain a worker_pool which has the same attributes as @attrs, bump the
 * reference count and return it.  If there already is a matching
 * worker_pool, it will be used; otherwise, this function attempts to
 * create a new one.
 *
 * Should be called with wq_pool_mutex held.
 *
 * Return: On success, a worker_pool with the same attributes as @attrs.
 * On failure, %NULL.
 */
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
        struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
        u32 hash = wqattrs_hash(attrs);
        struct worker_pool *pool;
        /* node stays NUMA_NO_NODE unless a NUMA pod contains __pod_cpumask */
        int pod, node = NUMA_NO_NODE;

        lockdep_assert_held(&wq_pool_mutex);

        /* do we already have a matching pool? */
        hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
                if (wqattrs_equal(pool->attrs, attrs)) {
                        pool->refcnt++;
                        return pool;
                }
        }

        /* If __pod_cpumask is contained inside a NUMA pod, that's our node */
        for (pod = 0; pod < pt->nr_pods; pod++) {
                if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
                        node = pt->pod_node[pod];
                        break;
                }
        }

        /* nope, create a new one */
        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
        if (!pool || init_worker_pool(pool) < 0)
                goto fail;

        /* take over @attrs, then drop the workqueue-only fields again */
        pool->node = node;
        copy_workqueue_attrs(pool->attrs, attrs);
        wqattrs_clear_for_pool(pool->attrs);

        if (worker_pool_assign_id(pool) < 0)
                goto fail;

        /* create and start the initial worker */
        if (wq_online && !create_worker(pool))
                goto fail;

        /* install */
        hash_add(unbound_pool_hash, &pool->hash_node, hash);

        /* the reference set up by init_worker_pool() goes to the caller */
        return pool;
fail:
        /*
         * init_worker_pool() leaves @pool releasable even when it fails, so
         * dropping the initial reference is enough to undo everything.
         */
        if (pool)
                put_unbound_pool(pool);
        return NULL;
}

/*
 * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
 * refcnt and needs to be destroyed.
 *
 * Unlinks the pwq from its workqueue, drops the pool reference for unbound
 * workqueues, removes the pwq from its node's pending list and frees it after
 * an RCU grace period. When the last pwq of a workqueue goes away, the
 * workqueue itself is queued for RCU freeing.
 */
static void pwq_release_workfn(struct kthread_work *work)
{
        struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
                                                  release_work);
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        /* set when unlinking @pwq leaves @wq->pwqs empty */
        bool is_last = false;

        /*
         * When @pwq is not linked, it doesn't hold any reference to the
         * @wq, and @wq is invalid to access.
         */
        if (!list_empty(&pwq->pwqs_node)) {
                mutex_lock(&wq->mutex);
                list_del_rcu(&pwq->pwqs_node);
                is_last = list_empty(&wq->pwqs);

                /*
                 * For ordered workqueue with a plugged dfl_pwq, restart it now.
                 */
                if (!is_last && (wq->flags & __WQ_ORDERED))
                        unplug_oldest_pwq(wq);

                mutex_unlock(&wq->mutex);
        }

        /* put_unbound_pool() must be called with wq_pool_mutex held */
        if (wq->flags & WQ_UNBOUND) {
                mutex_lock(&wq_pool_mutex);
                put_unbound_pool(pool);
                mutex_unlock(&wq_pool_mutex);
        }

        /*
         * NOTE(review): pwq->pool is read here after put_unbound_pool() may
         * have dropped the last pool reference; the pool is only freed via
         * call_rcu() - confirm this ordering is intended.
         */
        if (!list_empty(&pwq->pending_node)) {
                struct wq_node_nr_active *nna =
                        wq_node_nr_active(pwq->wq, pwq->pool->node);

                raw_spin_lock_irq(&nna->lock);
                list_del_init(&pwq->pending_node);
                raw_spin_unlock_irq(&nna->lock);
        }

        kfree_rcu(pwq, rcu);

        /*
         * If we're the last pwq going away, @wq is already dead and no one
         * is gonna access it anymore.  Schedule RCU free.
         */
        if (is_last) {
                wq_unregister_lockdep(wq);
                call_rcu(&wq->rcu, rcu_free_wq);
        }
}

/*
 * Initialize the newly allocated @pwq and associate it with @wq and @pool.
 * @pwq is zeroed first and starts out with a single reference and all of its
 * list heads empty.
 */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
                     struct worker_pool *pool)
{
        /* @pwq's address must not have any bit set outside the pwq mask */
        BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK);

        memset(pwq, 0, sizeof(*pwq));

        /* back-pointers */
        pwq->wq = wq;
        pwq->pool = pool;

        pwq->refcnt = 1;
        pwq->flush_color = -1;

        INIT_LIST_HEAD(&pwq->pwqs_node);
        INIT_LIST_HEAD(&pwq->mayday_node);
        INIT_LIST_HEAD(&pwq->pending_node);
        INIT_LIST_HEAD(&pwq->inactive_works);

        kthread_init_work(&pwq->release_work, pwq_release_workfn);
}

/*
 * Sync @pwq with the current state of its associated wq and link it onto
 * wq->pwqs. Calling this on an already linked @pwq is a no-op. The caller
 * must hold wq->mutex.
 */
static void link_pwq(struct pool_workqueue *pwq)
{
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq->mutex);

        /* may be called multiple times, only link when not linked yet */
        if (list_empty(&pwq->pwqs_node)) {
                /* set the matching work_color */
                pwq->work_color = wq->work_color;

                /* link in @pwq */
                list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs);
        }
}

/*
 * Obtain a pool matching @attrs and create a pwq associating the pool and
 * @wq. Returns the initialized pwq, or NULL when either the pool or the pwq
 * cannot be obtained. The caller must hold wq_pool_mutex.
 */
static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct pool_workqueue *pwq;
        struct worker_pool *pool;

        lockdep_assert_held(&wq_pool_mutex);

        pool = get_unbound_pool(attrs);
        if (!pool)
                return NULL;

        pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
        if (pwq) {
                init_pwq(pwq, wq, pool);
                return pwq;
        }

        /* no pwq to hold it: give the pool reference back */
        put_unbound_pool(pool);
        return NULL;
}

/* Take wq_pool_mutex; paired with apply_wqattrs_unlock(). */
static void apply_wqattrs_lock(void)
{
        mutex_lock(&wq_pool_mutex);
}

/* Release wq_pool_mutex taken by apply_wqattrs_lock(). */
static void apply_wqattrs_unlock(void)
{
        mutex_unlock(&wq_pool_mutex);
}

/**
 * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
 * @attrs: the wq_attrs of the default pwq of the target workqueue
 * @cpu: the target CPU
 *
 * Calculate the cpumask a workqueue with @attrs should use on the pod that
 * @cpu belongs to. The result is stored in @attrs->__pod_cpumask.
 *
 * The result is the intersection of the pod's CPUs and @attrs->cpumask as
 * long as that intersection contains at least one online CPU. Otherwise
 * @attrs->cpumask is used as is.
 *
 * The caller is responsible for ensuring that the cpumask of the pod stays
 * stable.
 */
static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu)
{
        const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
        const int pod = pt->cpu_pod[cpu];

        /* the pod's CPUs that @attrs also wants */
        cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);

        /* keep the pod-restricted mask only if one of those CPUs is online */
        if (cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask))
                return;

        cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
}

/*
 * Install @pwq into @wq's slot for @cpu and return the pwq which occupied
 * that slot before; @cpu < 0 selects the dfl_pwq slot. Both wq_pool_mutex and
 * wq->mutex must be held.
 */
static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
                                        int cpu, struct pool_workqueue *pwq)
{
        struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
        struct pool_workqueue *prev;

        lockdep_assert_held(&wq_pool_mutex);
        lockdep_assert_held(&wq->mutex);

        /* link_pwq() can handle duplicate calls */
        link_pwq(pwq);

        /* swap: remember the current occupant, then publish @pwq */
        prev = rcu_access_pointer(*slot);
        rcu_assign_pointer(*slot, pwq);

        return prev;
}

/*
 * context to store the prepared attrs & pwqs before applying
 *
 * apply_wqattrs_prepare() fills it in, apply_wqattrs_commit() swaps the
 * prepared pwqs with the ones currently installed in @wq, and
 * apply_wqattrs_cleanup() puts whatever pwqs the context holds at that point.
 */
struct apply_wqattrs_ctx {
        struct workqueue_struct        *wq;                /* target workqueue */
        struct workqueue_attrs        *attrs;                /* attrs to apply */
        struct list_head        list;                /* queued for batching commit */
        struct pool_workqueue        *dfl_pwq;        /* default pwq (slot selected by cpu < 0) */
        struct pool_workqueue        *pwq_tbl[];        /* indexed by CPU, nr_cpu_ids entries */
};

/*
 * Free the resources after success or abort: put every pwq held in @ctx,
 * free the attrs and then @ctx itself. A NULL @ctx is ignored.
 */
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
        int cpu;

        if (!ctx)
                return;

        for_each_possible_cpu(cpu)
                put_pwq_unlocked(ctx->pwq_tbl[cpu]);
        put_pwq_unlocked(ctx->dfl_pwq);

        free_workqueue_attrs(ctx->attrs);
        kfree(ctx);
}

/*
 * allocate the attrs and pwqs for later installation
 *
 * Builds a context holding a default pwq, one pwq per possible CPU and a
 * sanitized copy of @attrs, without touching @wq's installed pwqs. Must be
 * called with wq_pool_mutex held.
 *
 * Returns the prepared context, ERR_PTR(-EINVAL) when @attrs->affn_scope is
 * out of range, or ERR_PTR(-ENOMEM) when any allocation fails.
 */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
                      const struct workqueue_attrs *attrs,
                      const cpumask_var_t unbound_cpumask)
{
        struct apply_wqattrs_ctx *ctx;
        struct workqueue_attrs *new_attrs;
        int cpu;

        lockdep_assert_held(&wq_pool_mutex);

        if (WARN_ON(attrs->affn_scope < 0 ||
                    attrs->affn_scope >= WQ_AFFN_NR_TYPES))
                return ERR_PTR(-EINVAL);

        /* one pwq_tbl slot per CPU id */
        ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);

        /* both results are checked together; out_free copes with NULLs */
        new_attrs = alloc_workqueue_attrs();
        if (!ctx || !new_attrs)
                goto out_free;

        /*
         * If something goes wrong during CPU up/down, we'll fall back to
         * the default pwq covering whole @attrs->cpumask.  Always create
         * it even if we don't use it immediately.
         */
        copy_workqueue_attrs(new_attrs, attrs);
        wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
        cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
        ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
        if (!ctx->dfl_pwq)
                goto out_free;

        for_each_possible_cpu(cpu) {
                if (new_attrs->ordered) {
                        /* ordered: every CPU shares dfl_pwq, one ref per slot */
                        ctx->dfl_pwq->refcnt++;
                        ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
                } else {
                        /* new_attrs->__pod_cpumask is recomputed for each CPU */
                        wq_calc_pod_cpumask(new_attrs, cpu);
                        ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
                        if (!ctx->pwq_tbl[cpu])
                                goto out_free;
                }
        }

        /* save the user configured attrs and sanitize it. */
        copy_workqueue_attrs(new_attrs, attrs);
        cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
        cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
        ctx->attrs = new_attrs;

        /*
         * For initialized ordered workqueues, there should only be one pwq
         * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution
         * of newly queued work items until execution of older work items in
         * the old pwq's have completed.
         */
        if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))
                ctx->dfl_pwq->plugged = true;

        ctx->wq = wq;
        return ctx;

out_free:
        /*
         * ctx->attrs has not been set on any path reaching here, so
         * @new_attrs is freed separately from the context.
         */
        free_workqueue_attrs(new_attrs);
        apply_wqattrs_cleanup(ctx);
        return ERR_PTR(-ENOMEM);
}

/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
        int cpu;

        /* all pwqs have been created successfully, let's install'em */
        mutex_lock(&ctx->wq->mutex);

        copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);

        /* save the previous pwqs and install the new ones */
        for_each_possible_cpu(cpu)
                ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
                                                        ctx->pwq_tbl[cpu]);
        ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);

        /* update node_nr_active->max */
        wq_update_node_max_active(ctx->wq, -1);

        /* rescuer needs to respect wq cpumask changes */
        if (ctx->wq->rescuer)
                set_cpus_allowed_ptr(ctx->wq->rescuer->task,
                                     unbound_effective_cpumask(ctx->wq));

        mutex_unlock(&ctx->wq->mutex);
}

/*
 * Prepare, commit and clean up an attrs change for @wq in one go, using
 * wq_unbound_cpumask as the unbound cpumask. Returns 0 on success, -EINVAL
 * when @wq is not an unbound workqueue, or the error from
 * apply_wqattrs_prepare().
 */
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct apply_wqattrs_ctx *ctx;

        /* only unbound workqueues can change attributes */
        if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
                return -EINVAL;

        ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
        if (!IS_ERR(ctx)) {
                /* the ctx has been prepared successfully, let's commit it */
                apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
                return 0;
        }

        return PTR_ERR(ctx);
}

/**
 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
 * @wq: the target workqueue
 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
 *
 * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
 * a separate pwq to each CPU pod with possible CPUs in @attrs->cpumask so that
 * work items are affine to the pod they were issued on. Older pwqs are released
 * as in-flight work items finish. Note that a work item which repeatedly
 * requeues itself back-to-back will stay on its current pwq.
 *
 * Takes wq_pool_mutex around the update and performs GFP_KERNEL allocations.
 *
 * Return: 0 on success and -errno on failure.
 */
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs)
{
        int err;

        mutex_lock(&wq_pool_mutex);
        err = apply_workqueue_attrs_locked(wq, attrs);
        mutex_unlock(&wq_pool_mutex);

        return err;
}

/**
 * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug
 * @wq: the target workqueue
 * @cpu: the CPU to update the pwq slot for
 *
 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
 * %CPU_DOWN_FAILED.  @cpu is in the same pod of the CPU being hot[un]plugged.
 *
 * If pod affinity can't be adjusted due to memory allocation failure, it falls
 * back to @wq->dfl_pwq which may not be optimal but is always correct.
 *
 * Note that when the last allowed CPU of a pod goes offline for a workqueue
 * with a cpumask spanning multiple pods, the workers which were already
 * executing the work items for the workqueue will lose their CPU affinity and
 * may execute on any CPU. This is similar to how per-cpu workqueues behave on
 * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
 * responsibility to flush the work item from CPU_DOWN_PREPARE.
 *
 * Does nothing for workqueues that are not unbound or are ordered. Must be
 * called with wq_pool_mutex held.
 */
static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
{
        struct pool_workqueue *old_pwq = NULL, *pwq;
        struct workqueue_attrs *target_attrs;

        lockdep_assert_held(&wq_pool_mutex);

        if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)
                return;

        /*
         * We don't wanna alloc/free wq_attrs for each wq for each CPU.
         * Let's use a preallocated one.  The following buf is protected by
         * CPU hotplug exclusion.
         */
        target_attrs = unbound_wq_update_pwq_attrs_buf;

        copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
        wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);

        /* nothing to do if the target cpumask matches the current pwq */
        wq_calc_pod_cpumask(target_attrs, cpu);
        if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
                return;

        /* create a new pwq */
        pwq = alloc_unbound_pwq(wq, target_attrs);
        if (!pwq) {
                pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
                        wq->name);
                goto use_dfl_pwq;
        }

        /* Install the new pwq. */
        mutex_lock(&wq->mutex);
        old_pwq = install_unbound_pwq(wq, cpu, pwq);
        goto out_unlock;

use_dfl_pwq:
        /*
         * Fallback: point @cpu's slot at the default pwq. The extra reference
         * taken here is the one this slot will hold.
         */
        mutex_lock(&wq->mutex);
        pwq = unbound_pwq(wq, -1);
        raw_spin_lock_irq(&pwq->pool->lock);
        get_pwq(pwq);
        raw_spin_unlock_irq(&pwq->pool->lock);
        old_pwq = install_unbound_pwq(wq, cpu, pwq);
out_unlock:
        /* both paths arrive with wq->mutex held; drop the replaced pwq after */
        mutex_unlock(&wq->mutex);
        put_pwq_unlocked(old_pwq);
}

/*
 * Allocate @wq->cpu_pwq and set up @wq's pool_workqueues.
 *
 * For workqueues without WQ_UNBOUND, one pwq per possible CPU is allocated,
 * tied to that CPU's BH or regular worker pool (picked by WQ_BH, indexed by
 * WQ_HIGHPRI) and linked. Unbound workqueues instead get the ordered or
 * standard unbound attrs applied through apply_workqueue_attrs_locked().
 *
 * Must be called with wq_pool_mutex held. Returns 0 on success, -ENOMEM on
 * allocation failure, or the error from apply_workqueue_attrs_locked().
 */
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
        /* doubles as the index into the per-CPU pool and std attrs arrays */
        bool highpri = wq->flags & WQ_HIGHPRI;
        int cpu, ret;

        lockdep_assert_held(&wq_pool_mutex);

        wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
        if (!wq->cpu_pwq)
                goto enomem;

        if (!(wq->flags & WQ_UNBOUND)) {
                struct worker_pool __percpu *pools;

                if (wq->flags & WQ_BH)
                        pools = bh_worker_pools;
                else
                        pools = cpu_worker_pools;

                for_each_possible_cpu(cpu) {
                        struct pool_workqueue **pwq_p;
                        struct worker_pool *pool;

                        pool = &(per_cpu_ptr(pools, cpu)[highpri]);
                        pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);

                        *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
                                                       pool->node);
                        if (!*pwq_p)
                                goto enomem;

                        init_pwq(*pwq_p, wq, pool);

                        mutex_lock(&wq->mutex);
                        link_pwq(*pwq_p);
                        mutex_unlock(&wq->mutex);
                }
                return 0;
        }

        if (wq->flags & __WQ_ORDERED) {
                struct pool_workqueue *dfl_pwq;

                ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
                /* there should only be single pwq for ordering guarantee */
                dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
                WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
                              wq->pwqs.prev != &dfl_pwq->pwqs_node),
                     "ordering guarantee broken for workqueue %s\n", wq->name);
        } else {
                ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
        }

        return ret;

enomem:
        /* free whatever per-CPU pwqs were allocated, then the table itself */
        if (wq->cpu_pwq) {
                for_each_possible_cpu(cpu) {
                        struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);

                        if (pwq)
                                kmem_cache_free(pwq_cache, pwq);
                }
                free_percpu(wq->cpu_pwq);
                wq->cpu_pwq = NULL;
        }
        return -ENOMEM;
}

static int wq_clamp_max_active(int max_active, unsigned int flags,
                               const char *name)
{
        if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
                pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
                        max_active, name, 1, WQ_MAX_ACTIVE);

        return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
}

/*
 * Workqueues which may be used during memory reclaim should have a rescuer
 * to guarantee forward progress.
 *
 * Allocate the rescuer worker for @wq, create its kthread, bind the kthread
 * to the CPUs it may run on and wake it up. Workqueues without
 * WQ_MEM_RECLAIM get no rescuer and the call succeeds trivially.
 *
 * Must be called with wq_pool_mutex held.
 *
 * Return: 0 on success or when no rescuer is needed, -ENOMEM if the worker
 * cannot be allocated, otherwise the error reported by kthread_create().
 */
static int init_rescuer(struct workqueue_struct *wq)
{
        struct worker *rescuer;
        char id_buf[WORKER_ID_LEN];
        int ret;

        lockdep_assert_held(&wq_pool_mutex);

        if (!(wq->flags & WQ_MEM_RECLAIM))
                return 0;

        rescuer = alloc_worker(NUMA_NO_NODE);
        if (!rescuer) {
                pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
                       wq->name);
                return -ENOMEM;
        }

        /* rescue_wq must be set before the id is formatted from the worker */
        rescuer->rescue_wq = wq;
        format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL);

        rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf);
        if (IS_ERR(rescuer->task)) {
                ret = PTR_ERR(rescuer->task);
                /* terminate the line like the allocation failure message above */
                pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe\n",
                       wq->name, ERR_PTR(ret));
                kfree(rescuer);
                return ret;
        }

        /* publish the rescuer, then bind and start its thread */
        wq->rescuer = rescuer;
        if (wq->flags & WQ_UNBOUND)
                kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
        else
                kthread_bind_mask(rescuer->task, cpu_possible_mask);
        wake_up_process(rescuer->task);

        return 0;
}

/**
 * wq_adjust_max_active - update a wq's max_active to the current setting
 * @wq: target workqueue
 *
 * Bring @wq->max_active and @wq->min_active in line with the saved values,
 * or force both to zero while a freezable @wq is being frozen. If the limits
 * changed and the new max is non-zero, inactive work items are activated to
 * use up the newly available room.
 *
 * Must be called with @wq->mutex held.
 */
static void wq_adjust_max_active(struct workqueue_struct *wq)
{
        bool freezing, progressed;
        int new_max, new_min;

        lockdep_assert_held(&wq->mutex);

        freezing = (wq->flags & WQ_FREEZABLE) && workqueue_freezing;
        new_max = freezing ? 0 : wq->saved_max_active;
        new_min = freezing ? 0 : wq->saved_min_active;

        /* both limits already match, nothing to update */
        if (new_max == wq->max_active && new_min == wq->min_active)
                return;

        /*
         * Publish the new limits first and only then kick inactive work
         * items. Ordering of work items is unaffected: new items are always
         * queued behind already inactive ones, if any exist.
         */
        WRITE_ONCE(wq->max_active, new_max);
        WRITE_ONCE(wq->min_active, new_min);

        if (wq->flags & WQ_UNBOUND)
                wq_update_node_max_active(wq, -1);

        /* a zero limit leaves nothing to activate */
        if (!new_max)
                return;

        /*
         * Sweep over the pwqs repeatedly, activating the first inactive
         * work item of each, until a full sweep activates nothing.
         */
        do {
                struct pool_workqueue *pwq;

                progressed = false;
                for_each_pwq(pwq, wq) {
                        struct worker_pool *pool = pwq->pool;
                        unsigned long irq_flags;

                        /* can be called during early boot w/ irq disabled */
                        raw_spin_lock_irqsave(&pool->lock, irq_flags);
                        if (pwq_activate_first_inactive(pwq, true)) {
                                progressed = true;
                                kick_pool(pool);
                        }
                        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
                }
        } while (progressed);
}

/*
 * Common backend for the workqueue allocation entry points.
 *
 * Allocates and initializes a workqueue whose name is formatted from @fmt and
 * @args, sets up its pool_workqueues, adds it to the global workqueues list
 * and, when requested by @flags, creates the rescuer and the sysfs interface.
 *
 * BH workqueues must be requested with @max_active == 0 and may only carry
 * flags contained in __WQ_BH_ALLOWS. For other workqueues a zero @max_active
 * selects WQ_DFL_ACTIVE and the value is clamped by wq_clamp_max_active().
 *
 * Return: the new workqueue, or NULL on any failure.
 */
__printf(1, 0)
static struct workqueue_struct *__alloc_workqueue(const char *fmt,
                                                  unsigned int flags,
                                                  int max_active, va_list args)
{
        struct workqueue_struct *wq;
        size_t wq_size;
        int name_len;

        /* reject invalid BH requests before anything is allocated */
        if (flags & WQ_BH) {
                if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS))
                        return NULL;
                if (WARN_ON_ONCE(max_active))
                        return NULL;
        }

        /* see the comment above the definition of WQ_POWER_EFFICIENT */
        if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
                flags |= WQ_UNBOUND;

        /* allocate wq and format name */
        /* unbound wqs carry a trailing node_nr_active[] of nr_node_ids + 1 slots */
        if (flags & WQ_UNBOUND)
                wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
        else
                wq_size = sizeof(*wq);

        wq = kzalloc_noprof(wq_size, GFP_KERNEL);
        if (!wq)
                return NULL;

        if (flags & WQ_UNBOUND) {
                wq->unbound_attrs = alloc_workqueue_attrs_noprof();
                if (!wq->unbound_attrs)
                        goto err_free_wq;
        }

        name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);

        /* over-long names are truncated, not rejected; warn only once */
        if (name_len >= WQ_NAME_LEN)
                pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
                             wq->name);

        if (flags & WQ_BH) {
                /*
                 * BH workqueues always share a single execution context per CPU
                 * and don't impose any max_active limit.
                 */
                max_active = INT_MAX;
        } else {
                max_active = max_active ?: WQ_DFL_ACTIVE;
                max_active = wq_clamp_max_active(max_active, flags, wq->name);
        }

        /* init wq */
        wq->flags = flags;
        wq->max_active = max_active;
        wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
        wq->saved_max_active = wq->max_active;
        wq->saved_min_active = wq->min_active;
        mutex_init(&wq->mutex);
        atomic_set(&wq->nr_pwqs_to_flush, 0);
        INIT_LIST_HEAD(&wq->pwqs);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
        INIT_LIST_HEAD(&wq->maydays);

        INIT_LIST_HEAD(&wq->list);

        if (flags & WQ_UNBOUND) {
                if (alloc_node_nr_active(wq->node_nr_active) < 0)
                        goto err_free_wq;
        }

        /*
         * wq_pool_mutex protects the workqueues list, allocations of PWQs,
         * and the global freeze state.
         */
        apply_wqattrs_lock();

        if (alloc_and_link_pwqs(wq) < 0)
                goto err_unlock_free_node_nr_active;

        /* apply the saved limits, or zero if a freeze is in progress */
        mutex_lock(&wq->mutex);
        wq_adjust_max_active(wq);
        mutex_unlock(&wq->mutex);

        list_add_tail_rcu(&wq->list, &workqueues);

        /* the rescuer is only created here once wq_online is set */
        if (wq_online && init_rescuer(wq) < 0)
                goto err_unlock_destroy;

        apply_wqattrs_unlock();

        if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
                goto err_destroy;

        return wq;

        /*
         * Error unwinding. The first two labels undo a workqueue that never
         * made it onto the workqueues list and free it directly; the last
         * two hand an already listed workqueue to destroy_workqueue().
         */
err_unlock_free_node_nr_active:
        apply_wqattrs_unlock();
        /*
         * Failed alloc_and_link_pwqs() may leave pending pwq->release_work,
         * flushing the pwq_release_worker ensures that the pwq_release_workfn()
         * completes before calling kfree(wq).
         */
        if (wq->flags & WQ_UNBOUND) {
                kthread_flush_worker(pwq_release_worker);
                free_node_nr_active(wq->node_nr_active);
        }
err_free_wq:
        free_workqueue_attrs(wq->unbound_attrs);
        kfree(wq);
        return NULL;
err_unlock_destroy:
        apply_wqattrs_unlock();
err_destroy:
        destroy_workqueue(wq);
        return NULL;
}

/*
 * Variadic front end of __alloc_workqueue(): forwards the printf-style name
 * arguments and, on success, sets up lockdep state via wq_init_lockdep().
 * Returns the new workqueue or NULL on failure.
 */
__printf(1, 4)
struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
                                                unsigned int flags,
                                                int max_active, ...)
{
        va_list args;
        struct workqueue_struct *wq;

        va_start(args, max_active);
        wq = __alloc_workqueue(fmt, flags, max_active, args);
        va_end(args);

        if (wq)
                wq_init_lockdep(wq);

        return wq;
}
EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);

#ifdef CONFIG_LOCKDEP
/*
 * Variadic front end of __alloc_workqueue() for callers providing their own
 * lockdep map: on success @lockdep_map is stored in the new workqueue.
 * Returns the new workqueue or NULL on failure.
 */
__printf(1, 5)
struct workqueue_struct *
alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags,
                            int max_active, struct lockdep_map *lockdep_map, ...)
{
        va_list args;
        struct workqueue_struct *wq;

        va_start(args, lockdep_map);
        wq = __alloc_workqueue(fmt, flags, max_active, args);
        va_end(args);

        if (wq)
                wq->lockdep_map = lockdep_map;

        return wq;
}
EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map);
#endif

/*
 * Report whether @pwq still has anything going on: work in flight in any
 * color, references beyond the base one (not checked for the workqueue's
 * default pwq), or a non-empty pwq.
 */
static bool pwq_busy(struct pool_workqueue *pwq)
{
        int color;

        for (color = 0; color < WORK_NR_COLORS; color++) {
                if (pwq->nr_in_flight[color])
                        return true;
        }

        if (pwq->refcnt > 1 && pwq != rcu_access_pointer(pwq->wq->dfl_pwq))
                return true;

        return !pwq_is_empty(pwq);
}

/**
 * destroy_workqueue - safely terminate a workqueue
 * @wq: target workqueue
 *
 * Safely destroy a workqueue. All work currently pending will be done first.
 *
 * This function does NOT guarantee that non-pending work that has been
 * submitted with queue_delayed_work() and similar functions will be done
 * before destroying the workqueue. The fundamental problem is that, currently,
 * the workqueue has no way of accessing non-pending delayed_work. delayed_work
 * is only linked on the timer-side. All delayed_work must, therefore, be
 * canceled before calling this function.
 *
 * If any pool_workqueue of @wq is still busy after draining, a warning and
 * the workqueue state are printed and the function returns early: @wq is
 * then left on the workqueues list, without its rescuer, and is not freed.
 *
 * TODO: It would be better if the problem described above wouldn't exist and
 * destroy_workqueue() would cleanly cancel all pending and non-pending
 * delayed_work.
 */
void destroy_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        int cpu;

        /*
         * Remove it from sysfs first so that sanity check failure doesn't
         * lead to sysfs name conflicts.
         */
        workqueue_sysfs_unregister(wq);

        /* mark the workqueue destruction is in progress */
        mutex_lock(&wq->mutex);
        wq->flags |= __WQ_DESTROYING;
        mutex_unlock(&wq->mutex);

        /* drain it before proceeding with destruction */
        drain_workqueue(wq);

        /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
        if (wq->rescuer) {
                struct worker *rescuer = wq->rescuer;

                /* this prevents new queueing */
                raw_spin_lock_irq(&wq_mayday_lock);
                wq->rescuer = NULL;
                raw_spin_unlock_irq(&wq_mayday_lock);

                /* rescuer will empty maydays list before exiting */
                kthread_stop(rescuer->task);
                kfree(rescuer);
        }

        /*
         * Sanity checks - grab all the locks so that we wait for all
         * in-flight operations which may do put_pwq().
         */
        mutex_lock(&wq_pool_mutex);
        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq) {
                raw_spin_lock_irq(&pwq->pool->lock);
                if (WARN_ON(pwq_busy(pwq))) {
                        pr_warn("%s: %s has the following busy pwq\n",
                                __func__, wq->name);
                        show_pwq(pwq);
                        /* drop every lock, innermost first, before bailing out */
                        raw_spin_unlock_irq(&pwq->pool->lock);
                        mutex_unlock(&wq->mutex);
                        mutex_unlock(&wq_pool_mutex);
                        show_one_workqueue(wq);
                        return;
                }
                raw_spin_unlock_irq(&pwq->pool->lock);
        }
        mutex_unlock(&wq->mutex);

        /*
         * wq list is used to freeze wq, remove from list after
         * flushing is complete in case freeze races us.
         */
        list_del_rcu(&wq->list);
        mutex_unlock(&wq_pool_mutex);

        /*
         * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
         * to put the base refs. @wq will be auto-destroyed from the last
         * pwq_put. RCU read lock prevents @wq from going away from under us.
         */
        rcu_read_lock();

        /* per-CPU slots first ... */
        for_each_possible_cpu(cpu) {
                put_pwq_unlocked(unbound_pwq(wq, cpu));
                RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
        }

        /* ... then the slot selected by cpu == -1 */
        put_pwq_unlocked(unbound_pwq(wq, -1));
        RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(destroy_workqueue);

/**
 * workqueue_set_max_active - adjust max_active of a workqueue
 * @wq: target workqueue
 * @max_active: new max_active value.
 *
 * Store @max_active, clamped to the valid range, as the saved max_active of
 * @wq and apply it. For unbound workqueues the saved min_active is lowered
 * when it would otherwise exceed the new maximum. BH and ordered workqueues
 * are refused with a warning. See the alloc_workqueue() function comment.
 *
 * CONTEXT:
 * Don't call from IRQ context.
 */
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
        int clamped;

        /* max_active doesn't mean anything for BH workqueues */
        if (WARN_ON(wq->flags & WQ_BH))
                return;
        /* disallow meddling with max_active for ordered workqueues */
        if (WARN_ON(wq->flags & __WQ_ORDERED))
                return;

        clamped = wq_clamp_max_active(max_active, wq->flags, wq->name);

        mutex_lock(&wq->mutex);

        wq->saved_max_active = clamped;
        /* keep an unbound wq's min_active from exceeding the new max */
        if ((wq->flags & WQ_UNBOUND) && wq->saved_min_active > clamped)
                wq->saved_min_active = clamped;

        wq_adjust_max_active(wq);

        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(workqueue_set_max_active);

/**
 * workqueue_set_min_active - adjust min_active of an unbound workqueue
 * @wq: target unbound workqueue
 * @min_active: new min_active value
 *
 * An unbound workqueue is not guaranteed to be able to process max_active
 * interdependent work items; what it does guarantee is min_active of them,
 * %WQ_DFL_MIN_ACTIVE by default.
 *
 * This function changes that guarantee. @min_active is clamped between 0 and
 * the currently saved max_active. Workqueues that are BH, ordered or not
 * unbound are refused with a warning.
 */
void workqueue_set_min_active(struct workqueue_struct *wq, int min_active)
{
        const unsigned int kind_mask = WQ_BH | WQ_UNBOUND | __WQ_ORDERED;

        /* min_active is only meaningful for non-ordered unbound workqueues */
        if (WARN_ON((wq->flags & kind_mask) != WQ_UNBOUND))
                return;

        mutex_lock(&wq->mutex);

        wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active);
        wq_adjust_max_active(wq);

        mutex_unlock(&wq->mutex);
}

/**
 * current_work - retrieve %current task's work struct
 *
 * Tell whether %current is a workqueue worker and, if so, which work item it
 * is executing. Handy for code that needs to know its execution context.
 *
 * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
 */
struct work_struct *current_work(void)
{
        struct worker *self = current_wq_worker();

        if (!self)
                return NULL;

        return self->current_work;
}
EXPORT_SYMBOL(current_work);

/**
 * current_is_workqueue_rescuer - is %current workqueue rescuer?
 *
 * Determine whether %current is a workqueue rescuer.  Can be used from
 * work functions to determine whether it's being run off the rescuer task.
 *
 * Return: %true if %current is a workqueue rescuer. %false otherwise.
 */
bool current_is_workqueue_rescuer(void)
{
        struct worker *worker = current_wq_worker();

        return worker && worker->rescue_wq;
}

/**
 * workqueue_congested - test whether a workqueue is congested
 * @cpu: CPU in question
 * @wq: target workqueue
 *
 * Test whether @wq's pool_workqueue for @cpu has inactive work items queued.
 * Nothing synchronizes this test, so the result is unreliable and only good
 * as an advisory hint or for debugging.
 *
 * Passing WORK_CPU_UNBOUND as @cpu tests the local CPU.
 *
 * With the exception of ordered workqueues, all workqueues have per-cpu
 * pool_workqueues, each with its own congested state. Congestion on one CPU
 * says nothing about the state of the workqueue on any other CPU.
 *
 * Return:
 * %true if congested, %false otherwise.
 */
bool workqueue_congested(int cpu, struct workqueue_struct *wq)
{
        struct pool_workqueue *target;
        bool has_inactive;

        preempt_disable();

        /* resolve the "local CPU" request while preemption is off */
        if (cpu == WORK_CPU_UNBOUND)
                cpu = smp_processor_id();

        target = *per_cpu_ptr(wq->cpu_pwq, cpu);
        has_inactive = !list_empty(&target->inactive_works);

        preempt_enable();

        return has_inactive;
}
EXPORT_SYMBOL_GPL(workqueue_congested);

/**
 * work_busy - test whether a work is currently pending or running
 * @work: the work to be tested
 *
 * Report whether @work is pending and/or being executed by a worker of the
 * pool associated with it. Nothing synchronizes this test, so the result is
 * unreliable and only good as an advisory hint or for debugging.
 *
 * Return:
 * OR'd bitmask of WORK_BUSY_* bits.
 */
unsigned int work_busy(struct work_struct *work)
{
        unsigned int state = 0;
        unsigned long irq_flags;
        struct worker_pool *pool;

        if (work_pending(work))
                state = WORK_BUSY_PENDING;

        rcu_read_lock();

        pool = get_work_pool(work);
        if (!pool)
                goto out_unlock;

        /* the search for an executing worker is done under the pool lock */
        raw_spin_lock_irqsave(&pool->lock, irq_flags);
        if (find_worker_executing_work(pool, work))
                state |= WORK_BUSY_RUNNING;
        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);

out_unlock:
        rcu_read_unlock();

        return state;
}
EXPORT_SYMBOL_GPL(work_busy);

/**
 * set_worker_desc - set description for the current work item
 * @fmt: printf-style format string
 * @...: arguments for the format string
 *
 * A running work function may call this to describe what the work item is
 * about.  Should the worker task get dumped, the description is printed
 * along with it to help debugging.  The description can be at most
 * WORKER_DESC_LEN including the trailing '\0'.  Calls made from a task that
 * is not a workqueue worker are ignored.
 */
void set_worker_desc(const char *fmt, ...)
{
        struct worker *self = current_wq_worker();
        va_list args;

        if (!self)
                return;

        va_start(args, fmt);
        vsnprintf(self->desc, sizeof(self->desc), fmt, args);
        va_end(args);
}
EXPORT_SYMBOL_GPL(set_worker_desc);

/**
 * print_worker_info - print out worker information and description
 * @log_lvl: the log level to use when printing
 * @task: target task
 *
 * For a @task that is a worker, print the name of the workqueue it is
 * servicing, the work function, and the description set through
 * set_worker_desc() by the executing work item. Nothing is printed when all
 * of them turn out to be empty.
 *
 * This function can be safely called on any task as long as the
 * task_struct itself is accessible.  While safe, this function isn't
 * synchronized and may print out mixups or garbages of limited length.
 */
void print_worker_info(const char *log_lvl, struct task_struct *task)
{
        struct workqueue_struct *wq = NULL;
        struct pool_workqueue *pwq = NULL;
        char desc[WORKER_DESC_LEN] = { };
        char name[WQ_NAME_LEN] = { };
        work_func_t *fn = NULL;
        struct worker *worker;

        if (!(task->flags & PF_WQ_WORKER))
                return;

        /*
         * Nothing synchronizes us with @task, which could be in any state.
         * Every dereference below therefore goes through a nofault copy.
         */
        worker = kthread_probe_data(task);

        /*
         * Fetch workfn, pwq, wq, name and desc one careful step at a time.
         * The final byte of name/desc is never overwritten so that both stay
         * terminated even if the source is garbage.
         */
        copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
        copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
        copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
        copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
        copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);

        /* nothing was recovered, stay silent */
        if (!fn && !name[0] && !desc[0])
                return;

        printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
        /* the description is only worth printing when it differs from the name */
        if (strcmp(name, desc) != 0)
                pr_cont(" (%s)", desc);
        pr_cont("\n");
}

/*
 * Continue the current log line with a summary of @pool: its cpumask, its
 * node unless that is NUMA_NO_NODE, its flags, and either a "bh"/"bh-hi"
 * tag for BH pools or the nice level for all others.
 */
static void pr_cont_pool_info(struct worker_pool *pool)
{
        pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);

        if (pool->node != NUMA_NO_NODE)
                pr_cont(" node=%d", pool->node);

        pr_cont(" flags=0x%x", pool->flags);

        if (!(pool->flags & POOL_BH)) {
                pr_cont(" nice=%d", pool->attrs->nice);
                return;
        }

        pr_cont(" bh%s",
                pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
}

static void pr_cont_worker_id(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        if (pool->flags & WQ_BH)
                pr_cont("bh%s",
                        pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
        else
                pr_cont("%d%s", task_pid_nr(worker->task),
                        worker->rescue_wq ? "(RESCUER)" : "");
}

/*
 * State used by pr_cont_work_flush() to collapse a run of consecutive work
 * items sharing one work function into a single "N*func" entry.
 */
struct pr_cont_work_struct {
        bool comma;             /* print "," in front of the recorded entry */
        work_func_t func;       /* work function of the recorded run */
        long ctr;               /* length of the recorded run, 0 if none */
};

/*
 * Feed one work function into the run-length state in @pcwsp.
 *
 * A @func equal to the recorded one only bumps the counter. Anything else
 * prints the recorded run (as "func" or "N*func") and then starts a new run
 * for @func. The special value (work_func_t)-1 prints the recorded run
 * without starting a new one.
 */
static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
{
        const char *sep;

        if (pcwsp->ctr) {
                /* same function again, just extend the current run */
                if (func == pcwsp->func) {
                        pcwsp->ctr++;
                        return;
                }

                sep = pcwsp->comma ? "," : "";
                if (pcwsp->ctr == 1)
                        pr_cont("%s %ps", sep, pcwsp->func);
                else
                        pr_cont("%s %ld*%ps", sep, pcwsp->ctr, pcwsp->func);
                pcwsp->ctr = 0;
        }

        /* pure flush request, there is nothing new to record */
        if ((long)func == -1L)
                return;

        pcwsp->comma = comma;
        pcwsp->func = func;
        pcwsp->ctr = 1;
}

/*
 * Print one work item as a continuation of the current line. Ordinary items
 * go through the run-length state in @pcwsp; flush barriers are printed
 * right away as "BAR(pid)" after any recorded run has been flushed.
 */
static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
{
        struct wq_barrier *barr;

        if (work->func != wq_barrier_func) {
                /* an item printed without a comma must not join the recorded run */
                if (!comma)
                        pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
                pr_cont_work_flush(comma, work->func, pcwsp);
                return;
        }

        barr = container_of(work, struct wq_barrier, work);

        pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
        pr_cont("%s BAR(%d)", comma ? "," : "",
                task_pid_nr(barr->task));
}

/*
 * Dump the state of @pwq to the log: a header line with pool information,
 * active count and refcount, followed by up to three optional lines listing
 * the in-flight, pending and inactive work items that belong to @pwq.
 *
 * destroy_workqueue() and show_one_workqueue() call this with
 * pwq->pool->lock held.
 */
static void show_pwq(struct pool_workqueue *pwq)
{
        /* run-length state shared by all sections; starts with no recorded run */
        struct pr_cont_work_struct pcws = { .ctr = 0, };
        struct worker_pool *pool = pwq->pool;
        struct work_struct *work;
        struct worker *worker;
        bool has_in_flight = false, has_pending = false;
        int bkt;

        pr_info("  pwq %d:", pool->id);
        pr_cont_pool_info(pool);

        pr_cont(" active=%d refcnt=%d%s\n",
                pwq->nr_active, pwq->refcnt,
                !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");

        /* first pass: is any busy worker of the pool working for this pwq? */
        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (worker->current_pwq == pwq) {
                        has_in_flight = true;
                        break;
                }
        }
        if (has_in_flight) {
                bool comma = false;

                pr_info("    in-flight:");
                hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                        if (worker->current_pwq != pwq)
                                continue;

                        pr_cont(" %s", comma ? "," : "");
                        pr_cont_worker_id(worker);
                        pr_cont(":%ps", worker->current_func);
                        /* items scheduled on this worker follow its current function */
                        list_for_each_entry(work, &worker->scheduled, entry)
                                pr_cont_work(false, work, &pcws);
                        /* print whatever run is still recorded for this worker */
                        pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                        comma = true;
                }
                pr_cont("\n");
        }

        /* the pool worklist is shared; only items of this pwq are of interest */
        list_for_each_entry(work, &pool->worklist, entry) {
                if (get_work_pwq(work) == pwq) {
                        has_pending = true;
                        break;
                }
        }
        if (has_pending) {
                bool comma = false;

                pr_info("    pending:");
                list_for_each_entry(work, &pool->worklist, entry) {
                        if (get_work_pwq(work) != pwq)
                                continue;

                        pr_cont_work(comma, work, &pcws);
                        /* no comma in front of the item following a LINKED one */
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                pr_cont("\n");
        }

        if (!list_empty(&pwq->inactive_works)) {
                bool comma = false;

                pr_info("    inactive:");
                list_for_each_entry(work, &pwq->inactive_works, entry) {
                        pr_cont_work(comma, work, &pcws);
                        /* no comma in front of the item following a LINKED one */
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                pr_cont("\n");
        }
}

/**
 * show_one_workqueue - dump state of specified workqueue
 * @wq: workqueue whose state will be printed
 */
void show_one_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        bool idle = true;
        unsigned long irq_flags;

        for_each_pwq(pwq, wq) {
                if (!pwq_is_empty(pwq)) {
                        idle = false;
                        break;
                }
        }
        if (idle) /* Nothing to print for idle workqueue */
                return;

        pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);

        for_each_pwq(pwq, wq) {
                raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
                if (!pwq_is_empty(pwq)) {
                        /*
                         * Defer printing to avoid deadlocks in console
                         * drivers that queue work while holding locks
                         * also taken in their write paths.
                         */
                        printk_deferred_enter();
                        show_pwq(pwq);
                        printk_deferred_exit();
                }
                raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
                /*
                 * We could be printing a lot from atomic context, e.g.
                 * sysrq-t -> show_all_workqueues(). Avoid triggering
                 * hard lockup.
                 */
                touch_nmi_watchdog();
        }

}

/**
 * show_one_worker_pool - dump state of specified worker pool
 * @pool: worker pool whose state will be printed
 *
 * Prints a single line describing @pool: pool information, how long the
 * first pending work has been waiting, the number of workers, the manager
 * if there is one, and the idle workers. A pool whose workers are all idle
 * is skipped silently.
 */
static void show_one_worker_pool(struct worker_pool *pool)
{
        const char *idle_label = "idle: ";
        unsigned long hung_secs = 0;
        unsigned long irq_flags;
        struct worker *worker;

        raw_spin_lock_irqsave(&pool->lock, irq_flags);

        if (pool->nr_workers != pool->nr_idle) {
                /* How long the first pending work is waiting for a worker. */
                if (!list_empty(&pool->worklist))
                        hung_secs = jiffies_to_msecs(jiffies -
                                                     pool->watchdog_ts) / 1000;

                /*
                 * Printing is deferred: console drivers may queue work while
                 * holding locks that their write paths also take, which
                 * could deadlock here.
                 */
                printk_deferred_enter();
                pr_info("pool %d:", pool->id);
                pr_cont_pool_info(pool);
                pr_cont(" hung=%lus workers=%d", hung_secs, pool->nr_workers);
                if (pool->manager)
                        pr_cont(" manager: %d",
                                task_pid_nr(pool->manager->task));
                /* only the first idle worker is prefixed with the label */
                list_for_each_entry(worker, &pool->idle_list, entry) {
                        pr_cont(" %s", idle_label);
                        pr_cont_worker_id(worker);
                        idle_label = "";
                }
                pr_cont("\n");
                printk_deferred_exit();
        }

        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);

        /*
         * This may run for a long time in atomic context, e.g.
         * sysrq-t -> show_all_workqueues(). Keep the hard lockup detector
         * quiet.
         */
        touch_nmi_watchdog();
}

/**
 * show_all_workqueues - dump workqueue state
 *
 * Called from a sysrq handler and prints out all busy workqueues and pools.
 */
void show_all_workqueues(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pi;

        rcu_read_lock();

        pr_info("Showing busy workqueues and worker pools:\n");

        list_for_each_entry_rcu(wq, &workqueues, list)
                show_one_workqueue(wq);

        for_each_pool(pool, pi)
                show_one_worker_pool(pool);

        rcu_read_unlock();
}

/**
 * show_freezable_workqueues - dump freezable workqueue state
 *
 * Called from try_to_freeze_tasks(). Walks the workqueue list under the RCU
 * read lock and prints every WQ_FREEZABLE workqueue that is still busy.
 */
void show_freezable_workqueues(void)
{
        struct workqueue_struct *wq;

        rcu_read_lock();

        pr_info("Showing freezable workqueues that are still busy:\n");

        list_for_each_entry_rcu(wq, &workqueues, list) {
                if (wq->flags & WQ_FREEZABLE)
                        show_one_workqueue(wq);
        }

        rcu_read_unlock();
}

/*
 * Used to show worker information through /proc/PID/{comm,stat,status}.
 *
 * Fills @buf (of @size bytes) with the worker id of @task, followed by the
 * worker's description when it is attached to a pool and has one. A @task
 * that is not a workqueue worker simply gets its comm copied.
 */
void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
{
        struct worker_pool *pool;
        struct worker *worker;
        int off;

        /* stabilize PF_WQ_WORKER and worker pool association */
        mutex_lock(&wq_pool_attach_mutex);

        if (!(task->flags & PF_WQ_WORKER)) {
                strscpy(buf, task->comm, size);
                goto out_unlock;
        }

        worker = kthread_data(task);
        pool = worker->pool;

        off = format_worker_id(buf, size, worker, pool);

        /* a worker without a pool gets the bare id only */
        if (!pool)
                goto out_unlock;

        raw_spin_lock_irq(&pool->lock);
        /*
         * ->desc holds information (wq name or set_worker_desc()) about the
         * latest execution.  It is prefixed with '+' while a work item is
         * being executed and with '-' otherwise.
         */
        if (worker->desc[0] != '\0') {
                if (worker->current_work)
                        scnprintf(buf + off, size - off, "+%s",
                                  worker->desc);
                else
                        scnprintf(buf + off, size - off, "-%s",
                                  worker->desc);
        }
        raw_spin_unlock_irq(&pool->lock);

out_unlock:
        mutex_unlock(&wq_pool_attach_mutex);
}

#ifdef CONFIG_SMP

/*
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug.  Firstly, there
 * are a lot of assumptions on strong associations among work, pwq and
 * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths.  Secondly,
 * worker pools serve mix of short, long and very long running works making
 * blocked draining impractical.
 *
 * This is solved by allowing the pools to be disassociated from the CPU
 * running as an unbound one and allowing it to be reattached later if the
 * cpu comes back online.
 */

/*
 * Disassociate every worker pool of @cpu from that CPU.
 *
 * For each pool of @cpu, with wq_pool_attach_mutex held: mark all workers
 * WORKER_UNBOUND and the pool POOL_DISASSOCIATED, zero nr_running and kick
 * the pool under pool->lock, then call unbind_worker() on every worker
 * once pool->lock has been dropped.
 */
static void unbind_workers(int cpu)
{
        struct worker_pool *pool;
        struct worker *worker;

        for_each_cpu_worker_pool(pool, cpu) {
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * We've blocked all attach/detach operations. Make all workers
                 * unbound and set DISASSOCIATED.  Before this, all workers
                 * must be on the cpu.  After this, they may become diasporas.
                 * And the preemption disabled sections in their sched callbacks
                 * are guaranteed to see WORKER_UNBOUND since the code here
                 * is on the same cpu.
                 */
                for_each_pool_worker(worker, pool)
                        worker->flags |= WORKER_UNBOUND;

                pool->flags |= POOL_DISASSOCIATED;

                /*
                 * The handling of nr_running in sched callbacks is disabled
                 * now.  Zap nr_running.  After this, nr_running stays zero and
                 * need_more_worker() and keep_working() are always true as
                 * long as the worklist is not empty.  This pool now behaves as
                 * an unbound (in terms of concurrency management) pool which
                 * is served by workers tied to the pool.
                 */
                pool->nr_running = 0;

                /*
                 * With concurrency management just turned off, a busy
                 * worker blocking could lead to lengthy stalls.  Kick off
                 * unbound chain execution of currently pending work items.
                 */
                kick_pool(pool);

                raw_spin_unlock_irq(&pool->lock);

                /* done outside pool->lock but still under the attach mutex */
                for_each_pool_worker(worker, pool)
                        unbind_worker(worker);

                mutex_unlock(&wq_pool_attach_mutex);
        }
}

/**
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
 *
 * @pool->cpu is coming online.  Restore the CPU affinity of every worker
 * of @pool and then swap WORKER_UNBOUND for WORKER_REBOUND on each of them.
 * The caller must hold wq_pool_attach_mutex.
 */
static void rebind_workers(struct worker_pool *pool)
{
        struct worker *w;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /*
         * Affinity first, flags second.  All idle workers should be on the
         * run-queue of the associated CPU before any local wake-ups for
         * concurrency management happen, so UNBOUND is only cleared once
         * every worker has its affinity back.  As we're called from
         * CPU_ONLINE, setting the affinity shouldn't fail.
         */
        for_each_pool_worker(w, pool) {
                int err;

                kthread_set_per_cpu(w->task, pool->cpu);
                err = set_cpus_allowed_ptr(w->task, pool_allowed_cpus(pool));
                WARN_ON_ONCE(err < 0);
        }

        raw_spin_lock_irq(&pool->lock);

        pool->flags &= ~POOL_DISASSOCIATED;

        for_each_pool_worker(w, pool) {
                unsigned int old_flags = w->flags;
                unsigned int new_flags;

                WARN_ON_ONCE(!(old_flags & WORKER_UNBOUND));

                /*
                 * We want to clear UNBOUND but can't directly call
                 * worker_clr_flags() or adjust nr_running.  Replace
                 * UNBOUND with another NOT_RUNNING flag, REBOUND, in a
                 * single store.  @w will clear REBOUND using
                 * worker_clr_flags() when it initiates the next execution
                 * cycle, thus restoring concurrency management.  When or
                 * whether @w clears REBOUND doesn't affect correctness.
                 *
                 * WRITE_ONCE() is necessary because @w->flags may be
                 * tested without holding any lock in wq_worker_running().
                 * Without it, the NOT_RUNNING test may fail incorrectly,
                 * leading to premature concurrency management operations.
                 */
                new_flags = (old_flags | WORKER_REBOUND) & ~WORKER_UNBOUND;
                WRITE_ONCE(w->flags, new_flags);
        }

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
 * @pool: unbound pool of interest
 * @cpu: the CPU which is coming up
 *
 * An unbound pool may end up with a cpumask which doesn't have any online
 * CPUs.  When a worker of such a pool gets scheduled, the scheduler resets
 * its cpus_allowed.  If @cpu is in @pool's cpumask, the workers of @pool
 * get their allowed CPUs set to the online part of that cpumask again.
 * The caller must hold wq_pool_attach_mutex.
 */
static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
        /* scratch mask shared across calls; set up under the attach mutex */
        static cpumask_t online_allowed;
        struct worker *w;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /* nothing to restore unless @cpu is allowed for @pool */
        if (cpumask_test_cpu(cpu, pool->attrs->cpumask)) {
                cpumask_and(&online_allowed, pool->attrs->cpumask,
                            cpu_online_mask);

                /* as we're called from CPU_ONLINE, this shouldn't fail */
                for_each_pool_worker(w, pool)
                        WARN_ON_ONCE(set_cpus_allowed_ptr(w->task,
                                                          &online_allowed) < 0);
        }
}

/*
 * Make sure each worker pool of @cpu has at least one worker.
 *
 * Return: 0 on success, -ENOMEM if a worker could not be created.
 */
int workqueue_prepare_cpu(unsigned int cpu)
{
        struct worker_pool *pool;

        for_each_cpu_worker_pool(pool, cpu) {
                /* only pools that currently have no worker need one */
                if (!pool->nr_workers && !create_worker(pool))
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Bring workqueue state up to date for @cpu coming online: record it in
 * wq_online_cpumask, rebind or restore the affinity of the affected pools'
 * workers, and refresh the pwqs and node max_active of every workqueue that
 * has unbound attrs.  Always returns 0.
 */
int workqueue_online_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pi;

        mutex_lock(&wq_pool_mutex);

        cpumask_set_cpu(cpu, wq_online_cpumask);

        for_each_pool(pool, pi) {
                /* BH pools aren't affected by hotplug */
                if (!(pool->flags & POOL_BH)) {
                        mutex_lock(&wq_pool_attach_mutex);
                        if (pool->cpu == cpu)
                                rebind_workers(pool);
                        else if (pool->cpu < 0)
                                restore_unbound_workers_cpumask(pool, cpu);
                        mutex_unlock(&wq_pool_attach_mutex);
                }
        }

        /* update pod affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                struct workqueue_attrs *attrs = wq->unbound_attrs;
                const struct wq_pod_type *pt;
                int tcpu;

                if (!attrs)
                        continue;

                /* every CPU sharing a pod with @cpu may need a new pwq */
                pt = wqattrs_pod_type(attrs);
                for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
                        unbound_wq_update_pwq(wq, tcpu);

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, -1);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
        return 0;
}

/*
 * Handle @cpu going offline: unbind its per-cpu workers, drop it from
 * wq_online_cpumask and refresh the pwqs and node max_active of every
 * workqueue that has unbound attrs.
 *
 * Return: 0 on success, -1 (with a warning) when not called on @cpu itself.
 */
int workqueue_offline_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;

        /* unbinding per-cpu workers should happen on the local CPU */
        if (WARN_ON(cpu != smp_processor_id()))
                return -1;

        unbind_workers(cpu);

        mutex_lock(&wq_pool_mutex);

        cpumask_clear_cpu(cpu, wq_online_cpumask);

        /* update pod affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                struct workqueue_attrs *attrs = wq->unbound_attrs;
                const struct wq_pod_type *pt;
                int tcpu;

                if (!attrs)
                        continue;

                /* every CPU sharing a pod with @cpu may need a new pwq */
                pt = wqattrs_pod_type(attrs);
                for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
                        unbound_wq_update_pwq(wq, tcpu);

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, cpu);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);

        return 0;
}

/* Request block used by work_on_cpu_key() to run a function via a work item. */
struct work_for_cpu {
        /* work item; work_for_cpu_fn() recovers the request from it */
        struct work_struct work;
        /* function to call */
        long (*fn)(void *);
        /* argument passed to @fn */
        void *arg;
        /* value returned by @fn, stored by work_for_cpu_fn() */
        long ret;
};

/* Work function: call the requested function and record its return value. */
static void work_for_cpu_fn(struct work_struct *work)
{
        struct work_for_cpu *req;

        req = container_of(work, struct work_for_cpu, work);
        req->ret = req->fn(req->arg);
}

/**
 * work_on_cpu_key - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 * @key: The lock class key for lock debugging purposes
 *
 * Queues an on-stack work item on @cpu and waits for it to finish.
 *
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 *
 * Return: The value @fn returns.
 */
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key)
{
        struct work_for_cpu wfc = { .fn = fn, .arg = arg };
        struct work_struct *work = &wfc.work;

        INIT_WORK_ONSTACK_KEY(work, work_for_cpu_fn, key);
        schedule_work_on(cpu, work);
        /* @wfc lives on this stack frame: wait before tearing it down */
        flush_work(work);
        destroy_work_on_stack(work);

        return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER

/**
 * freeze_workqueues_begin - begin freezing workqueues
 *
 * Set workqueue_freezing and re-evaluate max_active of every workqueue.
 * After this function returns, all freezable workqueues will queue new
 * works to their inactive_works list instead of pool->worklist.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void freeze_workqueues_begin(void)
{
        struct workqueue_struct *cur;

        mutex_lock(&wq_pool_mutex);

        /* freezing is not expected to be in progress already */
        WARN_ON_ONCE(workqueue_freezing);
        workqueue_freezing = true;

        list_for_each_entry(cur, &workqueues, list) {
                mutex_lock(&cur->mutex);
                wq_adjust_max_active(cur);
                mutex_unlock(&cur->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
}

/**
 * freeze_workqueues_busy - are freezable workqueues still busy?
 *
 * Check whether freezing is complete.  This function must be called
 * between freeze_workqueues_begin() and thaw_workqueues().
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex.
 *
 * Return:
 * %true if some freezable workqueues are still busy.  %false if freezing
 * is complete.
 */
bool freeze_workqueues_busy(void)
{
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;
        bool busy = false;

        mutex_lock(&wq_pool_mutex);

        WARN_ON_ONCE(!workqueue_freezing);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_FREEZABLE))
                        continue;

                /*
                 * nr_active is monotonically decreasing.  It's safe
                 * to peek without lock.
                 */
                rcu_read_lock();
                for_each_pwq(pwq, wq) {
                        WARN_ON_ONCE(pwq->nr_active < 0);
                        if (pwq->nr_active) {
                                busy = true;
                                break;
                        }
                }
                rcu_read_unlock();

                /* one busy pwq is enough to answer the question */
                if (busy)
                        break;
        }

        mutex_unlock(&wq_pool_mutex);
        return busy;
}

/**
 * thaw_workqueues - thaw workqueues
 *
 * Thaw workqueues.  Normal queueing is restored and all collected
 * frozen works are transferred to their respective pool worklists.
 * Does nothing if workqueues are not currently freezing.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void thaw_workqueues(void)
{
        struct workqueue_struct *cur;

        mutex_lock(&wq_pool_mutex);

        if (workqueue_freezing) {
                workqueue_freezing = false;

                /* restore max_active and repopulate worklist */
                list_for_each_entry(cur, &workqueues, list) {
                        mutex_lock(&cur->mutex);
                        wq_adjust_max_active(cur);
                        mutex_unlock(&cur->mutex);
                }
        }

        mutex_unlock(&wq_pool_mutex);
}
#endif /* CONFIG_FREEZER */

static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
        LIST_HEAD(ctxs);
        int ret = 0;
        struct workqueue_struct *wq;
        struct apply_wqattrs_ctx *ctx, *n;

        lockdep_assert_held(&wq_pool_mutex);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING))
                        continue;

                ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
                if (IS_ERR(ctx)) {
                        ret = PTR_ERR(ctx);
                        break;
                }

                list_add_tail(&ctx->list, &ctxs);
        }

        list_for_each_entry_safe(ctx, n, &ctxs, list) {
                if (!ret)
                        apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
        }

        if (!ret) {
                mutex_lock(&wq_pool_attach_mutex);
                cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
                mutex_unlock(&wq_pool_attach_mutex);
        }
        return ret;
}

/**
 * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
 * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
 *
 * This function can be called from cpuset code to provide a set of isolated
 * CPUs that should be excluded from wq_unbound_cpumask.
 *
 * Return: 0 on success, -ENOMEM if the temporary cpumask cannot be
 * allocated, or the error from applying the new unbound cpumask.
 */
int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
        cpumask_var_t effective;
        int ret = 0;

        if (!zalloc_cpumask_var(&effective, GFP_KERNEL))
                return -ENOMEM;

        mutex_lock(&wq_pool_mutex);

        /*
         * If excluding @exclude_cpumask leaves no CPU, fall back to
         * wq_requested_unbound_cpumask which is initially set to
         * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
         * by any subsequent write to workqueue/cpumask sysfs file.
         */
        if (!cpumask_andnot(effective, wq_requested_unbound_cpumask,
                            exclude_cpumask))
                cpumask_copy(effective, wq_requested_unbound_cpumask);

        /* only re-apply when the effective mask actually changes */
        if (!cpumask_equal(effective, wq_unbound_cpumask))
                ret = workqueue_apply_unbound_cpumask(effective);

        /* Save the current isolated cpumask & export it via sysfs */
        if (!ret)
                cpumask_copy(wq_isolated_cpumask, exclude_cpumask);

        mutex_unlock(&wq_pool_mutex);
        free_cpumask_var(effective);

        return ret;
}

/*
 * Map @val to an index into wq_affn_names[].  The comparison ignores case
 * and only looks at as many characters as the candidate name has, so
 * anything following the name in @val is ignored.
 *
 * Return: the index of the first matching name, -EINVAL if none matches.
 */
static int parse_affn_scope(const char *val)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {
                const char *name = wq_affn_names[i];

                if (strncasecmp(val, name, strlen(name)) == 0)
                        return i;
        }

        return -EINVAL;
}

static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
{
        struct workqueue_struct *wq;
        int affn, cpu;

        affn = parse_affn_scope(val);
        if (affn < 0)
                return affn;
        if (affn == WQ_AFFN_DFL)
                return -EINVAL;

        cpus_read_lock();
        mutex_lock(&wq_pool_mutex);

        wq_affn_dfl = affn;

        list_for_each_entry(wq, &workqueues, list) {
                for_each_online_cpu(cpu)
                        unbound_wq_update_pwq(wq, cpu);
        }

        mutex_unlock(&wq_pool_mutex);
        cpus_read_unlock();

        return 0;
}

/* Getter for the default affinity scope parameter: print its name. */
static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
{
        const char *name = wq_affn_names[wq_affn_dfl];

        return scnprintf(buffer, PAGE_SIZE, "%s\n", name);
}

/* Accessors backing the "default_affinity_scope" parameter below. */
static const struct kernel_param_ops wq_affn_dfl_ops = {
        .set        = wq_affn_dfl_set,
        .get        = wq_affn_dfl_get,
};

/* No backing variable is passed (NULL); the callbacks use wq_affn_dfl. */
module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);

#ifdef CONFIG_SYSFS
/*
 * Workqueues with the WQ_SYSFS flag set are visible to userland via
 * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
 * following attributes.
 *
 *  per_cpu                RO bool        : whether the workqueue is per-cpu or unbound
 *  max_active                RW int        : maximum number of in-flight work items
 *
 * Unbound workqueues have the following extra attributes.
 *
 *  nice                RW int        : nice value of the workers
 *  cpumask                RW mask        : bitmask of allowed CPUs for the workers
 *  affinity_scope        RW str  : worker CPU affinity scope (cache, numa, none)
 *  affinity_strict        RW bool : worker CPU affinity is strict
 */
/* Ties a workqueue to the struct device that represents it in sysfs. */
struct wq_device {
        /* the workqueue this device represents */
        struct workqueue_struct                *wq;
        /* embedded device; container_of() on it recovers the wq_device */
        struct device                        dev;
};

/* Return the workqueue behind @dev, which must be embedded in a wq_device. */
static struct workqueue_struct *dev_to_wq(struct device *dev)
{
        return container_of(dev, struct wq_device, dev)->wq;
}

static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
}
static DEVICE_ATTR_RO(per_cpu);

static ssize_t max_active_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
}

/*
 * sysfs "max_active" write: parse a positive decimal integer and hand it to
 * workqueue_set_max_active().
 *
 * Return: @count on success, -EINVAL if no integer could be parsed or the
 * value is not positive.
 */
static ssize_t max_active_store(struct device *dev,
                                struct device_attribute *attr, const char *buf,
                                size_t count)
{
        int val;

        if (sscanf(buf, "%d", &val) != 1)
                return -EINVAL;
        if (val <= 0)
                return -EINVAL;

        workqueue_set_max_active(dev_to_wq(dev), val);
        return count;
}
static DEVICE_ATTR_RW(max_active);

/*
 * Attributes listed for workqueue devices through wq_subsys.dev_groups;
 * ATTRIBUTE_GROUPS() derives wq_sysfs_groups from this NULL-terminated list.
 */
static struct attribute *wq_sysfs_attrs[] = {
        &dev_attr_per_cpu.attr,
        &dev_attr_max_active.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs);

/* sysfs "nice" read: report the nice value stored in the unbound attrs. */
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int nice;

        /* sample the value under wq->mutex, format it afterwards */
        mutex_lock(&wq->mutex);
        nice = wq->unbound_attrs->nice;
        mutex_unlock(&wq->mutex);

        return scnprintf(buf, PAGE_SIZE, "%d\n", nice);
}

/*
 * Prepare workqueue_attrs for sysfs store operations: allocate a fresh
 * attrs and fill it with a copy of @wq's current unbound attrs.  The caller
 * must hold wq_pool_mutex.
 *
 * Return: the new attrs, or NULL if the allocation failed.
 */
static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
{
        struct workqueue_attrs *copy;

        lockdep_assert_held(&wq_pool_mutex);

        copy = alloc_workqueue_attrs();
        if (copy)
                copy_workqueue_attrs(copy, wq->unbound_attrs);

        return copy;
}

/*
 * sysfs "nice" write: parse a nice value within [MIN_NICE, MAX_NICE] and
 * apply it to the workqueue's attrs.
 *
 * Return: @count on success, -ENOMEM if the attrs copy cannot be allocated,
 * -EINVAL on unparsable or out-of-range input, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret = -ENOMEM;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (attrs) {
                if (sscanf(buf, "%d", &attrs->nice) != 1 ||
                    attrs->nice < MIN_NICE || attrs->nice > MAX_NICE)
                        ret = -EINVAL;
                else
                        ret = apply_workqueue_attrs_locked(wq, attrs);
        }

        apply_wqattrs_unlock();
        /* reached with attrs == NULL when the allocation failed */
        free_workqueue_attrs(attrs);

        return ret ? ret : count;
}

/* sysfs per-workqueue "cpumask" read: print the unbound attrs' cpumask. */
static ssize_t wq_cpumask_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        /* the mask is formatted while wq->mutex is held */
        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n",
                        cpumask_pr_args(wq->unbound_attrs->cpumask));
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs per-workqueue "cpumask" write: parse @buf into a copy of the
 * current attrs and apply the result.
 *
 * Return: @count on success, -ENOMEM if the attrs copy cannot be allocated,
 * or the error from cpumask_parse() / apply_workqueue_attrs_locked().
 */
static ssize_t wq_cpumask_store(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret = -ENOMEM;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (attrs) {
                ret = cpumask_parse(buf, attrs->cpumask);
                if (ret == 0)
                        ret = apply_workqueue_attrs_locked(wq, attrs);
        }

        apply_wqattrs_unlock();
        /* reached with attrs == NULL when the allocation failed */
        free_workqueue_attrs(attrs);

        return ret ? ret : count;
}

/*
 * sysfs "affinity_scope" read: print the name of the workqueue's affinity
 * scope.  When the scope is WQ_AFFN_DFL, the name of the current default
 * scope is appended in parentheses.
 */
static ssize_t wq_affn_scope_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int scope, len;

        mutex_lock(&wq->mutex);
        scope = wq->unbound_attrs->affn_scope;
        if (scope != WQ_AFFN_DFL)
                len = scnprintf(buf, PAGE_SIZE, "%s\n", wq_affn_names[scope]);
        else
                len = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
                                wq_affn_names[WQ_AFFN_DFL],
                                wq_affn_names[wq_affn_dfl]);
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs "affinity_scope" write: parse the scope name in @buf and apply it
 * to the workqueue's attrs.
 *
 * Return: @count on success, the error from parse_affn_scope(), -ENOMEM if
 * the attrs copy cannot be allocated, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_affn_scope_store(struct device *dev,
                                   struct device_attribute *attr,
                                   const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *new_attrs;
        int ret = -ENOMEM;
        int scope;

        /* reject bad input before taking any lock */
        scope = parse_affn_scope(buf);
        if (scope < 0)
                return scope;

        apply_wqattrs_lock();

        new_attrs = wq_sysfs_prep_attrs(wq);
        if (new_attrs != NULL) {
                new_attrs->affn_scope = scope;
                ret = apply_workqueue_attrs_locked(wq, new_attrs);
        }

        apply_wqattrs_unlock();
        free_workqueue_attrs(new_attrs);

        return ret ? ret : count;
}

/*
 * sysfs "affinity_strict" read: print the affn_strict setting of the
 * workqueue's unbound attrs as 0 or 1.
 *
 * The value is read under wq->mutex, matching the other unbound attribute
 * show callbacks (nice, cpumask, affinity_scope).
 */
static ssize_t wq_affinity_strict_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int written;

        mutex_lock(&wq->mutex);
        written = scnprintf(buf, PAGE_SIZE, "%d\n",
                            wq->unbound_attrs->affn_strict);
        mutex_unlock(&wq->mutex);

        return written;
}

/*
 * sysfs "affinity_strict" write: parse an integer, treat any non-zero value
 * as true, and apply the setting to the workqueue's attrs.
 *
 * Return: @count on success, -EINVAL if no integer could be parsed, -ENOMEM
 * if the attrs copy cannot be allocated, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_affinity_strict_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *new_attrs;
        int ret = -ENOMEM;
        int val;

        if (sscanf(buf, "%d", &val) != 1)
                return -EINVAL;

        apply_wqattrs_lock();

        new_attrs = wq_sysfs_prep_attrs(wq);
        if (new_attrs != NULL) {
                new_attrs->affn_strict = (bool)val;
                ret = apply_workqueue_attrs_locked(wq, new_attrs);
        }

        apply_wqattrs_unlock();
        free_workqueue_attrs(new_attrs);

        return ret ? ret : count;
}

/*
 * Extra attributes for WQ_UNBOUND workqueues.  workqueue_sysfs_register()
 * creates them one by one and stops at the __ATTR_NULL terminator.
 */
static struct device_attribute wq_sysfs_unbound_attrs[] = {
        __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
        __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
        __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
        __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
        __ATTR_NULL,
};

/*
 * The "workqueue" bus that workqueue devices are attached to; dev_groups
 * points at the attribute groups generated from wq_sysfs_attrs.
 */
static const struct bus_type wq_subsys = {
        .name                                = "workqueue",
        .dev_groups                        = wq_sysfs_groups,
};

/**
 *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
 *  @cpumask: the cpumask to set
 *
 *  The low-level workqueues cpumask is a global cpumask that limits
 *  the affinity of all unbound workqueues.  This function checks @cpumask,
 *  applies it to all unbound workqueues and updates all of their pwqs.
 *  @cpumask is modified in place: it is first restricted to possible CPUs.
 *
 *  Return:        0        - Success
 *                -EINVAL        - Invalid @cpumask
 *                -ENOMEM        - Failed to allocate memory for attrs or pwqs.
 */
static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
        int ret = 0;

        /*
         * Not excluding isolated cpus on purpose.
         * If the user wishes to include them, we allow that.
         */
        cpumask_and(cpumask, cpumask, cpu_possible_mask);
        if (cpumask_empty(cpumask))
                return -EINVAL;

        apply_wqattrs_lock();

        /* skip the re-apply when nothing would change */
        if (!cpumask_equal(cpumask, wq_unbound_cpumask))
                ret = workqueue_apply_unbound_cpumask(cpumask);

        /* remember what was asked for only once it is in effect */
        if (!ret)
                cpumask_copy(wq_requested_unbound_cpumask, cpumask);

        apply_wqattrs_unlock();

        return ret;
}

/*
 * Common helper for the global cpumask attributes: print @mask into @buf
 * while holding wq_pool_mutex.  @dev and @attr are unused.
 */
static ssize_t __wq_cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
        int len;

        mutex_lock(&wq_pool_mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
        mutex_unlock(&wq_pool_mutex);

        return len;
}

/* sysfs "cpumask_requested" (read-only): print wq_requested_unbound_cpumask. */
static ssize_t cpumask_requested_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
}
static DEVICE_ATTR_RO(cpumask_requested);

/* sysfs "cpumask_isolated" (read-only): print wq_isolated_cpumask. */
static ssize_t cpumask_isolated_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
}
static DEVICE_ATTR_RO(cpumask_isolated);

/* sysfs global "cpumask" read: print wq_unbound_cpumask. */
static ssize_t cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
}

/*
 * sysfs global "cpumask" write: parse @buf into a temporary cpumask and
 * install it via workqueue_set_unbound_cpumask().
 *
 * Return: @count on success, -ENOMEM if the temporary cpumask cannot be
 * allocated, or the error from parsing / applying the mask.
 */
static ssize_t cpumask_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        cpumask_var_t requested;
        int err;

        if (!zalloc_cpumask_var(&requested, GFP_KERNEL))
                return -ENOMEM;

        err = cpumask_parse(buf, requested);
        if (err)
                goto out_free;

        err = workqueue_set_unbound_cpumask(requested);

out_free:
        free_cpumask_var(requested);
        if (err)
                return err;
        return count;
}
static DEVICE_ATTR_RW(cpumask);

/*
 * Global cpumask attributes; ATTRIBUTE_GROUPS() derives
 * wq_sysfs_cpumask_groups, which wq_sysfs_init() passes to
 * subsys_virtual_register().
 */
static struct attribute *wq_sysfs_cpumask_attrs[] = {
        &dev_attr_cpumask.attr,
        &dev_attr_cpumask_requested.attr,
        &dev_attr_cpumask_isolated.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs_cpumask);

/*
 * Register the workqueue subsystem together with the global cpumask
 * attribute groups.  Runs as a core_initcall; returns whatever
 * subsys_virtual_register() returns.
 */
static int __init wq_sysfs_init(void)
{
        return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups);
}
core_initcall(wq_sysfs_init);

/* Device release callback: free the wq_device that embeds @dev. */
static void wq_device_release(struct device *dev)
{
        kfree(container_of(dev, struct wq_device, dev));
}

/**
 * workqueue_sysfs_register - make a workqueue visible in sysfs
 * @wq: the workqueue to register
 *
 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
 * which is the preferred method.
 *
 * Workqueue user should use this function directly iff it wants to apply
 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
 * apply_workqueue_attrs() may race against userland updating the
 * attributes.
 *
 * On any failure @wq->wq_dev is left NULL.
 *
 * Return: 0 on success, -errno on failure.
 */
int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev;
        int ret;

        /*
         * Adjusting max_active breaks ordering guarantee.  Disallow exposing
         * ordered workqueues.
         */
        if (WARN_ON(wq->flags & __WQ_ORDERED))
                return -EINVAL;

        wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
        if (!wq_dev)
                return -ENOMEM;

        wq_dev->wq = wq;
        wq_dev->dev.bus = &wq_subsys;
        wq_dev->dev.release = wq_device_release;

        /*
         * Naming can fail.  The device has not been handed to
         * device_register() yet, so it is undone with a plain kfree()
         * rather than put_device().
         */
        ret = dev_set_name(&wq_dev->dev, "%s", wq->name);
        if (ret) {
                kfree(wq_dev);
                wq->wq_dev = NULL;
                return ret;
        }

        /*
         * unbound_attrs are created separately.  Suppress uevent until
         * everything is ready.
         */
        dev_set_uevent_suppress(&wq_dev->dev, true);

        ret = device_register(&wq_dev->dev);
        if (ret) {
                /* after device_register() only put_device() may free it */
                put_device(&wq_dev->dev);
                wq->wq_dev = NULL;
                return ret;
        }

        if (wq->flags & WQ_UNBOUND) {
                struct device_attribute *attr;

                for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
                        ret = device_create_file(&wq_dev->dev, attr);
                        if (ret) {
                                device_unregister(&wq_dev->dev);
                                wq->wq_dev = NULL;
                                return ret;
                        }
                }
        }

        /* everything is in place: announce the device to userland */
        dev_set_uevent_suppress(&wq_dev->dev, false);
        kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
        return 0;
}

/**
 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
 * @wq: the workqueue to unregister
 *
 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
 * @wq->wq_dev is cleared before the device is unregistered.
 */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev = wq->wq_dev;

        if (wq_dev) {
                wq->wq_dev = NULL;
                device_unregister(&wq_dev->dev);
        }
}
#else        /* CONFIG_SYSFS */
/* Without CONFIG_SYSFS there is nothing to unregister. */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)        { }
#endif        /* CONFIG_SYSFS */

/*
 * Workqueue watchdog.
 *
 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
 * flush dependency, a concurrency managed work item which stays RUNNING
 * indefinitely.  Workqueue stalls can be very difficult to debug as the
 * usual warning mechanisms don't trigger and internal workqueue state is
 * largely opaque.
 *
 * Workqueue watchdog monitors all worker pools periodically and dumps
 * state if some pools failed to make forward progress for a while where
 * forward progress is defined as the first item on ->worklist changing.
 *
 * This mechanism is controlled through the kernel parameter
 * "workqueue.watchdog_thresh" which can be updated at runtime through the
 * corresponding sysfs parameter file.
 */
#ifdef CONFIG_WQ_WATCHDOG

/*
 * Stall threshold.  The timer callback multiplies it by HZ, so it is
 * expressed in seconds; with a value of 0 the callback returns without
 * checking any pool.
 */
static unsigned long wq_watchdog_thresh = 30;
static struct timer_list wq_watchdog_timer;

/*
 * "Touched" timestamps in jiffies.  The timer callback compares a pool's
 * own timestamp against the per-CPU value when the pool has a CPU
 * (pool->cpu >= 0) and against the global value otherwise.
 */
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;

/*
 * Number of calls to panic_on_wq_watchdog() after which it hits BUG_ON();
 * 0 disables that.  Exposed as the "panic_on_stall" parameter.
 */
static unsigned int wq_panic_on_stall;
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);

/*
 * Show workers that might prevent the processing of pending work items.
 * The only candidates are CPU-bound workers in the running state.
 * Pending work items should be handled by another idle worker
 * in all other situations.
 *
 * Walks @pool's busy_hash under pool->lock and dumps every worker whose
 * task is running.
 */
static void show_cpu_pool_hog(struct worker_pool *pool)
{
        unsigned long irq_flags;
        struct worker *worker;
        int bkt;

        raw_spin_lock_irqsave(&pool->lock, irq_flags);

        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (!task_is_running(worker->task))
                        continue;

                /*
                 * Defer printing to avoid deadlocks in console
                 * drivers that queue work while holding locks
                 * also taken in their write paths.
                 */
                printk_deferred_enter();

                pr_info("pool %d:\n", pool->id);
                sched_show_task(worker->task);

                printk_deferred_exit();
        }

        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}

/* Dump the running workers of every pool that is flagged as cpu_stall. */
static void show_cpu_pools_hogs(void)
{
        struct worker_pool *pool;
        int pi;

        pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");

        rcu_read_lock();
        for_each_pool(pool, pi) {
                if (!pool->cpu_stall)
                        continue;
                show_cpu_pool_hog(pool);
        }
        rcu_read_unlock();
}

/*
 * Account for one detected stall and BUG once the running total reaches the
 * "panic_on_stall" parameter. A parameter value of zero turns off both the
 * accounting and the BUG.
 */
static void panic_on_wq_watchdog(void)
{
        static unsigned int nr_stalls;

        if (!wq_panic_on_stall)
                return;

        nr_stalls++;
        BUG_ON(nr_stalls >= wq_panic_on_stall);
}

static void wq_watchdog_reset_touched(void)
{
        int cpu;

        wq_watchdog_touched = jiffies;
        for_each_possible_cpu(cpu)
                per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}

/*
 * Watchdog timer callback. Scans every worker pool that has pending work and
 * reports those whose most recent progress/touch timestamp is more than the
 * threshold in the past, then resets the touch timestamps and re-arms itself.
 */
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
        unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
        unsigned long now = jiffies;
        bool lockup_detected = false;
        bool cpu_pool_stall = false;
        struct worker_pool *pool;
        int id;

        /* zero threshold: nothing to check and the timer is not re-armed */
        if (!thresh)
                return;

        for_each_pool(pool, id) {
                unsigned long pool_ts, touched, ts;

                pool->cpu_stall = false;
                if (list_empty(&pool->worklist))
                        continue;

                /*
                 * A virtual machine that was stopped by the host can look
                 * like a stall to the watchdog.
                 */
                kvm_check_and_clear_guest_paused();

                /* per-CPU pools use their CPU's touch stamp, others the global one */
                if (pool->cpu < 0)
                        touched = READ_ONCE(wq_watchdog_touched);
                else
                        touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
                pool_ts = READ_ONCE(pool->watchdog_ts);

                /* whichever of the two timestamps is the more recent */
                ts = time_after(pool_ts, touched) ? pool_ts : touched;

                /* still within the threshold, no stall on this pool */
                if (!time_after(now, ts + thresh))
                        continue;

                lockup_detected = true;
                if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
                        pool->cpu_stall = true;
                        cpu_pool_stall = true;
                }
                pr_emerg("BUG: workqueue lockup - pool");
                pr_cont_pool_info(pool);
                pr_cont(" stuck for %us!\n",
                        jiffies_to_msecs(now - pool_ts) / 1000);
        }

        /* cpu_pool_stall is only ever set together with lockup_detected */
        if (lockup_detected) {
                show_all_workqueues();
                if (cpu_pool_stall)
                        show_cpu_pools_hogs();
                panic_on_wq_watchdog();
        }

        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh);
}

/*
 * Refresh the workqueue watchdog "touched" timestamps.
 *
 * @cpu: CPU whose per-CPU timestamp is set to the current jiffies. A negative
 *       value triggers a one-time warning and leaves the per-CPU stamps alone.
 *
 * The global timestamp is refreshed as well, but only once it is older than
 * a quarter of the watchdog threshold.
 */
notrace void wq_watchdog_touch(int cpu)
{
        unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
        unsigned long touch_ts = READ_ONCE(wq_watchdog_touched);
        unsigned long now = jiffies;

        /*
         * The watchdog timer reads this stamp locklessly with READ_ONCE(),
         * so mark the store to match.
         */
        if (cpu >= 0)
                WRITE_ONCE(per_cpu(wq_watchdog_touched_cpu, cpu), now);
        else
                WARN_ONCE(1, "%s should be called with valid CPU", __func__);

        /* Don't unnecessarily store to global cacheline */
        if (time_after(now, touch_ts + thresh / 4))
                WRITE_ONCE(wq_watchdog_touched, now);
}

/*
 * Install a new watchdog threshold. The threshold is zeroed and the timer
 * deleted synchronously first; a non-zero @thresh then stores the new value,
 * resets the touch timestamps and arms the timer @thresh * HZ jiffies ahead.
 * A zero @thresh leaves the watchdog stopped.
 */
static void wq_watchdog_set_thresh(unsigned long thresh)
{
        wq_watchdog_thresh = 0;
        timer_delete_sync(&wq_watchdog_timer);

        if (!thresh)
                return;

        wq_watchdog_thresh = thresh;
        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
}

/*
 * Setter for the "watchdog_thresh" parameter. Parses @val as an unsigned
 * long and returns the kstrtoul() error on failure, 0 otherwise.
 *
 * While system_percpu_wq has not been allocated yet (workqueue_init_early()
 * does that) the value is only recorded; wq_watchdog_init() applies whatever
 * is stored in wq_watchdog_thresh later on.
 */
static int wq_watchdog_param_set_thresh(const char *val,
                                        const struct kernel_param *kp)
{
        unsigned long thresh;
        int ret = kstrtoul(val, 0, &thresh);

        if (ret)
                return ret;

        if (!system_percpu_wq) {
                wq_watchdog_thresh = thresh;
                return 0;
        }

        wq_watchdog_set_thresh(thresh);
        return 0;
}

/* "watchdog_thresh" accessors: stock unsigned long getter, custom setter */
static const struct kernel_param_ops wq_watchdog_thresh_ops = {
        .get        = param_get_ulong,
        .set        = wq_watchdog_param_set_thresh,
};

module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops,
                &wq_watchdog_thresh, 0644);

/*
 * Set up the deferrable watchdog timer and apply the threshold currently
 * stored in wq_watchdog_thresh, which arms the timer if it is non-zero.
 */
static void wq_watchdog_init(void)
{
        timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
        wq_watchdog_set_thresh(wq_watchdog_thresh);
}

#else        /* CONFIG_WQ_WATCHDOG */

/* No-op stand-in used when CONFIG_WQ_WATCHDOG is not enabled. */
static inline void wq_watchdog_init(void) { }

#endif        /* CONFIG_WQ_WATCHDOG */

/*
 * irq_work callback that raises TASKLET_SOFTIRQ. @irq_work is unused.
 * workqueue_init_early() installs it on the first BH pool of each CPU.
 */
static void bh_pool_kick_normal(struct irq_work *irq_work)
{
        raise_softirq_irqoff(TASKLET_SOFTIRQ);
}

/*
 * irq_work callback that raises HI_SOFTIRQ. @irq_work is unused.
 * workqueue_init_early() installs it on the second BH pool of each CPU.
 */
static void bh_pool_kick_highpri(struct irq_work *irq_work)
{
        raise_softirq_irqoff(HI_SOFTIRQ);
}

/*
 * Narrow wq_unbound_cpumask down to its intersection with @mask. If the two
 * masks share no CPU the restriction is skipped with a warning that names
 * the source of the mask via @name.
 */
static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
{
        if (cpumask_intersects(wq_unbound_cpumask, mask)) {
                cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
                return;
        }

        pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
                cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
}

/*
 * Initialize @pool as a pool bound to @cpu running at nice level @nice and
 * assign it a pool ID. Any failure along the way is fatal (BUG_ON).
 */
static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice)
{
        BUG_ON(init_worker_pool(pool));

        /* bind the pool to its CPU and that CPU's node */
        pool->cpu = cpu;
        pool->node = cpu_to_node(cpu);

        /* attrs: single-CPU masks, requested nice level, strict affinity */
        cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
        cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
        pool->attrs->nice = nice;
        pool->attrs->affn_strict = true;

        /* pool ID allocation is done under wq_pool_mutex */
        mutex_lock(&wq_pool_mutex);
        BUG_ON(worker_pool_assign_id(pool));
        mutex_unlock(&wq_pool_mutex);
}

/**
 * workqueue_init_early - early init for workqueue subsystem
 *
 * This is the first step of three-staged workqueue subsystem initialization and
 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
 * up. It sets up all the data structures and system workqueues and allows early
 * boot code to create workqueues and queue/cancel work items. Actual work item
 * execution starts only after kthreads can be created and scheduled right
 * before early initcalls.
 *
 * Allocation failures in here are treated as fatal (BUG_ON() / SLAB_PANIC).
 */
void __init workqueue_init_early(void)
{
        struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
        /* index 0 is the normal pool/attrs, index 1 the high priority one */
        int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
        void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
                                                       bh_pool_kick_highpri };
        int i, cpu;

        BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

        BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));

        /*
         * Start the unbound mask from all possible CPUs and narrow it with
         * the housekeeping masks and, if given, the command line mask.
         */
        cpumask_copy(wq_online_cpumask, cpu_online_mask);
        cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
        restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
        restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
        if (!cpumask_empty(&wq_cmdline_cpumask))
                restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);

        cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
        /* isolated = possible CPUs outside the HK_TYPE_DOMAIN housekeeping mask */
        cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask,
                                                housekeeping_cpumask(HK_TYPE_DOMAIN));
        pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

        unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
        BUG_ON(!unbound_wq_update_pwq_attrs_buf);

        /*
         * If nohz_full is enabled, set power efficient workqueue as unbound.
         * This allows workqueue items to be moved to HK CPUs.
         */
        if (housekeeping_enabled(HK_TYPE_TICK))
                wq_power_efficient = true;

        /* initialize WQ_AFFN_SYSTEM pods */
        pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
        pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
        pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
        BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);

        BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));

        /* a single pod that spans every possible CPU and has no node */
        pt->nr_pods = 1;
        cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
        pt->pod_node[0] = NUMA_NO_NODE;
        pt->cpu_pod[0] = 0;

        /* initialize BH and CPU pools */
        for_each_possible_cpu(cpu) {
                struct worker_pool *pool;

                /* i walks std_nice[] / irq_work_fns[] in step with the pools */
                i = 0;
                for_each_bh_worker_pool(pool, cpu) {
                        init_cpu_worker_pool(pool, cpu, std_nice[i]);
                        pool->flags |= POOL_BH;
                        init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
                        i++;
                }

                i = 0;
                for_each_cpu_worker_pool(pool, cpu)
                        init_cpu_worker_pool(pool, cpu, std_nice[i++]);
        }

        /* create default unbound and ordered wq attrs */
        for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
                struct workqueue_attrs *attrs;

                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                unbound_std_wq_attrs[i] = attrs;

                /*
                 * An ordered wq should have only one pwq as ordering is
                 * guaranteed by max_active which is enforced by pwqs.
                 */
                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                attrs->ordered = true;
                ordered_wq_attrs[i] = attrs;
        }

        /*
         * Create the system-wide workqueues. Note that system_wq and
         * system_percpu_wq are both named "events", and system_unbound_wq
         * and system_dfl_wq are both named "events_unbound".
         */
        system_wq = alloc_workqueue("events", WQ_PERCPU, 0);
        system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0);
        system_highpri_wq = alloc_workqueue("events_highpri",
                                            WQ_HIGHPRI | WQ_PERCPU, 0);
        system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0);
        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
        system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE | WQ_PERCPU, 0);
        system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                              WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
        system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
                                              WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
        system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0);
        system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
                                               WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0);
        BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
               !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
               !system_power_efficient_wq ||
               !system_freezable_power_efficient_wq ||
               !system_bh_wq || !system_bh_highpri_wq);
}

/*
 * Start the "pool_workqueue_release" kthread worker (failure is fatal) and,
 * unless the user already picked a value, choose the default for
 * wq_cpu_intensive_thresh_us based on the calibrated CPU speed.
 */
static void __init wq_cpu_intensive_thresh_init(void)
{
        unsigned long thresh;
        unsigned long bogo;

        pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release");
        BUG_ON(IS_ERR(pwq_release_worker));

        /* if the user set it to a specific value, keep it */
        if (wq_cpu_intensive_thresh_us != ULONG_MAX)
                return;

        /*
         * The default of 10ms is derived from the fact that most modern (as of
         * 2023) processors can do a lot in 10ms and that it's just below what
         * most consider human-perceivable. However, the kernel also runs on a
         * lot slower CPUs including microcontrollers where the threshold is way
         * too low.
         *
         * Let's scale up the threshold up to 1 second if BogoMips is below 4000.
         * This is by no means accurate but it doesn't have to be. The mechanism
         * is still useful even when the threshold is fully scaled up. Also, as
         * the reports would usually be applicable to everyone, some machines
         * operating on longer thresholds won't significantly diminish their
         * usefulness.
         */
        thresh = 10 * USEC_PER_MSEC;

        /*
         * See init/calibrate.c for the lpj -> BogoMIPS calculation. Divide by
         * (500000 / HZ) rather than dividing by 500000 and then multiplying
         * by HZ: the latter truncates first, which quantizes the result to
         * multiples of HZ and collapses every lpj below 500000 to the
         * minimum, pushing such machines straight to the 1 second cap.
         */
        bogo = max_t(unsigned long, loops_per_jiffy / (500000 / HZ), 1);
        if (bogo < 4000)
                thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);

        pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
                 loops_per_jiffy, bogo, thresh);

        wq_cpu_intensive_thresh_us = thresh;
}

/**
 * workqueue_init - bring workqueue subsystem fully online
 *
 * Second of the three workqueue initialization stages, called once kthreads
 * can be created and scheduled. At this point workqueues exist and may have
 * work items queued, but no kworker is executing them yet. This populates
 * the worker pools with their initial workers, marks the subsystem online
 * and starts the watchdog.
 */
void __init workqueue_init(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int cpu, bucket;

        wq_cpu_intensive_thresh_init();

        mutex_lock(&wq_pool_mutex);

        /*
         * Per-cpu pools that were set up earlier could be missing their
         * node hint, so refresh it for BH and regular pools alike.
         */
        for_each_possible_cpu(cpu) {
                for_each_bh_worker_pool(pool, cpu) {
                        pool->node = cpu_to_node(cpu);
                }
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->node = cpu_to_node(cpu);
                }
        }

        /* create a rescuer for the workqueues that requested one */
        list_for_each_entry(wq, &workqueues, list)
                WARN(init_rescuer(wq),
                     "workqueue: failed to create early rescuer for %s",
                     wq->name);

        mutex_unlock(&wq_pool_mutex);

        /*
         * Initial workers. A BH pool has a single pseudo worker standing for
         * the shared BH execution context; hotplug events don't affect it,
         * so BH pseudo workers are created for all possible CPUs.
         */
        for_each_possible_cpu(cpu) {
                for_each_bh_worker_pool(pool, cpu) {
                        BUG_ON(!create_worker(pool));
                }
        }

        /* regular per-cpu pools only get workers on CPUs that are online */
        for_each_online_cpu(cpu) {
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->flags &= ~POOL_DISASSOCIATED;
                        BUG_ON(!create_worker(pool));
                }
        }

        /* and one worker for each unbound pool that already exists */
        hash_for_each(unbound_pool_hash, bucket, pool, hash_node) {
                BUG_ON(!create_worker(pool));
        }

        wq_online = true;
        wq_watchdog_init();
}

/*
 * Initialize @pt. First @pt->cpu_pod[] is filled with pod IDs according to
 * @cpus_share_pod(): every set of CPUs sharing a pod gets a unique pod ID,
 * handed out consecutively. The remaining fields of @pt are then derived
 * from @pt->cpu_pod[]. Allocation failures are fatal (BUG_ON).
 */
static void __init init_pod_type(struct wq_pod_type *pt,
                                 bool (*cpus_share_pod)(int, int))
{
        int cur, pre, cpu, pod;

        pt->nr_pods = 0;

        /* init @pt->cpu_pod[] according to @cpus_share_pod() */
        pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
        BUG_ON(!pt->cpu_pod);

        for_each_possible_cpu(cur) {
                for_each_possible_cpu(pre) {
                        /* no earlier CPU shares a pod with @cur: open a new pod */
                        bool new_pod = pre >= cur;

                        if (!new_pod && !cpus_share_pod(cur, pre))
                                continue;

                        pt->cpu_pod[cur] = new_pod ? pt->nr_pods++ :
                                                     pt->cpu_pod[pre];
                        break;
                }
        }

        /* init the rest to match @pt->cpu_pod[] */
        pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
        pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
        BUG_ON(!pt->pod_cpus || !pt->pod_node);

        for (pod = 0; pod < pt->nr_pods; pod++) {
                BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
        }

        /* add each CPU to its pod's mask and record the CPU's node for the pod */
        for_each_possible_cpu(cpu) {
                pod = pt->cpu_pod[cpu];

                cpumask_set_cpu(cpu, pt->pod_cpus[pod]);
                pt->pod_node[pod] = cpu_to_node(cpu);
        }
}

/*
 * Pod-sharing predicate that never reports sharing; used for WQ_AFFN_CPU so
 * that init_pod_type() puts every CPU into a pod of its own.
 */
static bool __init cpus_dont_share(int cpu0, int cpu1)
{
        return false;
}

/*
 * Pod-sharing predicate for WQ_AFFN_SMT: true if @cpu0 is set in @cpu1's SMT
 * mask. Always false when CONFIG_SCHED_SMT is not enabled.
 */
static bool __init cpus_share_smt(int cpu0, int cpu1)
{
#ifdef CONFIG_SCHED_SMT
        return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
#else
        return false;
#endif
}

/*
 * Pod-sharing predicate for WQ_AFFN_NUMA: true if both CPUs map to the same
 * node according to cpu_to_node().
 */
static bool __init cpus_share_numa(int cpu0, int cpu1)
{
        return cpu_to_node(cpu0) == cpu_to_node(cpu1);
}

/**
 * workqueue_init_topology - initialize CPU pods for unbound workqueues
 *
 * Third and last of the three workqueue initialization stages, called after
 * SMP and topology information are fully initialized. Builds the CPU, SMT,
 * cache and NUMA pod types and re-applies pod sharing to the workqueues
 * that exist already.
 */
void __init workqueue_init_topology(void)
{
        struct workqueue_struct *wq;
        int cpu;

        init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
        init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
        init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
        init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);

        wq_topo_initialized = true;

        mutex_lock(&wq_pool_mutex);

        /*
         * Workqueues that were allocated before this point have all CPUs
         * sharing the default worker pool. Call unbound_wq_update_pwq() for
         * each workqueue and online CPU pair so that per-pod sharing takes
         * effect.
         */
        list_for_each_entry(wq, &workqueues, list) {
                for_each_online_cpu(cpu) {
                        unbound_wq_update_pwq(wq, cpu);
                }

                if (!(wq->flags & WQ_UNBOUND))
                        continue;

                /* unbound workqueues also get their node max_active refreshed */
                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, -1);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
}

/*
 * Exported helper that prints a warning about flushing system-wide
 * workqueues followed by a stack dump of the caller.
 */
void __warn_flushing_systemwide_wq(void)
{
        pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
        dump_stack();
}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);

/*
 * Handler for the "workqueue.unbound_cpus=" boot parameter. Parses @str as a
 * CPU list into wq_cmdline_cpumask; on a parse error the mask is cleared and
 * a warning is printed. Always returns 1.
 */
static int __init workqueue_unbound_cpus_setup(char *str)
{
        if (cpulist_parse(str, &wq_cmdline_cpumask) >= 0)
                return 1;

        cpumask_clear(&wq_cmdline_cpumask);
        pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");

        return 1;
}
__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);














































































































































































































































































































































































































































































































































































































































































































































  317 









  317 






































































































































































































































































  265 












































































































   12 










































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Linux Socket Filter Data Structures
 */
#ifndef __LINUX_FILTER_H__
#define __LINUX_FILTER_H__

#include <linux/atomic.h>
#include <linux/bpf.h>
#include <linux/refcount.h>
#include <linux/compat.h>
#include <linux/skbuff.h>
#include <linux/linkage.h>
#include <linux/printk.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/capability.h>
#include <linux/set_memory.h>
#include <linux/kallsyms.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <linux/sockptr.h>
#include <crypto/sha1.h>
#include <linux/u64_stats_sync.h>

#include <net/sch_generic.h>

#include <asm/byteorder.h>
#include <uapi/linux/filter.h>

struct sk_buff;
struct sock;
struct seccomp_data;
struct bpf_prog_aux;
struct xdp_rxq_info;
struct xdp_buff;
struct sock_reuseport;
struct ctl_table;
struct ctl_table_header;

/* ArgX, context and stack frame pointer register positions. Note,
 * Arg1, Arg2, Arg3, etc are used as argument mappings of function
 * calls in BPF_CALL instruction.
 */
#define BPF_REG_ARG1        BPF_REG_1
#define BPF_REG_ARG2        BPF_REG_2
#define BPF_REG_ARG3        BPF_REG_3
#define BPF_REG_ARG4        BPF_REG_4
#define BPF_REG_ARG5        BPF_REG_5
#define BPF_REG_CTX        BPF_REG_6
#define BPF_REG_FP        BPF_REG_10

/* Additional register mappings for converted user programs.
 * Note that BPF_REG_TMP maps to the same register as BPF_REG_ARG2.
 */
#define BPF_REG_A        BPF_REG_0
#define BPF_REG_X        BPF_REG_7
#define BPF_REG_TMP        BPF_REG_2        /* scratch reg */
#define BPF_REG_D        BPF_REG_8        /* data, callee-saved */
#define BPF_REG_H        BPF_REG_9        /* hlen, callee-saved */

/* Kernel hidden auxiliary/helper register. It takes index MAX_BPF_REG,
 * so the extended register count is one larger to include it.
 */
#define BPF_REG_AX                MAX_BPF_REG
#define MAX_BPF_EXT_REG                (MAX_BPF_REG + 1)
#define MAX_BPF_JIT_REG                MAX_BPF_EXT_REG

/* unused opcode to mark special call to bpf_tail_call() helper */
#define BPF_TAIL_CALL        0xf0

/* unused opcode to mark special load instruction. Same as BPF_ABS */
#define BPF_PROBE_MEM        0x20

/* unused opcode to mark special ldsx instruction. Same as BPF_IND */
#define BPF_PROBE_MEMSX        0x40

/* unused opcode to mark special load instruction. Same as BPF_MSH */
#define BPF_PROBE_MEM32        0xa0

/* unused opcode to mark special atomic instruction. Same value as
 * BPF_CALL_ARGS below.
 */
#define BPF_PROBE_ATOMIC 0xe0

/* unused opcode to mark special ldsx instruction. Same as BPF_NOSPEC */
#define BPF_PROBE_MEM32SX 0xc0

/* unused opcode to mark call to interpreter with arguments */
#define BPF_CALL_ARGS        0xe0

/* unused opcode to mark speculation barrier for mitigating
 * Spectre v1 and v4
 */
#define BPF_NOSPEC        0xc0

/* As per nm, we expose JITed images as text (code) section for
 * kallsyms. That way, tools like perf can find it to match
 * addresses.
 */
#define BPF_SYM_ELF_TYPE        't'

/* BPF program can access up to 512 bytes of stack space. */
#define MAX_BPF_STACK        512

/* Initializer helpers for building arrays of struct bpf_insn. */

/* Register-source ALU ops (bpf_add|sub|...): dst_reg += src_reg.
 * The _OFF variants also let the caller fill insn->off; the plain
 * variants leave it at 0.
 */

#define BPF_ALU64_REG_OFF(OP, DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })

#define BPF_ALU64_REG(OP, DST, SRC)  BPF_ALU64_REG_OFF(OP, DST, SRC, 0)

#define BPF_ALU32_REG_OFF(OP, DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })

#define BPF_ALU32_REG(OP, DST, SRC)  BPF_ALU32_REG_OFF(OP, DST, SRC, 0)

/* Immediate-source ALU ops (bpf_add|sub|...): dst_reg += imm32.
 * src_reg is always 0; the _OFF variants also fill insn->off.
 */

#define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
                .dst_reg = (DST), .src_reg = 0, \
                .off = (OFF), .imm = (IMM), \
        })

#define BPF_ALU64_IMM(OP, DST, IMM)  BPF_ALU64_IMM_OFF(OP, DST, IMM, 0)

#define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
                .dst_reg = (DST), .src_reg = 0, \
                .off = (OFF), .imm = (IMM), \
        })

#define BPF_ALU32_IMM(OP, DST, IMM)  BPF_ALU32_IMM_OFF(OP, DST, IMM, 0)

/* Endianness conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu().
 * TYPE selects the source bit of the opcode, LEN goes into imm.
 */

#define BPF_ENDIAN(TYPE, DST, LEN) \
        ((struct bpf_insn) { \
                .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \
                .dst_reg = (DST), .src_reg = 0, \
                .off = 0, .imm = (LEN), \
        })

/* Byte swap, bswap16/32/64: 64-bit ALU class, LEN goes into imm. */

#define BPF_BSWAP(DST, LEN) \
        ((struct bpf_insn) { \
                .code = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE), \
                .dst_reg = (DST), .src_reg = 0, \
                .off = 0, .imm = (LEN), \
        })

/* Register-to-register mov, dst_reg = src_reg (64-bit and 32-bit forms). */

#define BPF_MOV64_REG(DST, SRC) \
        ((struct bpf_insn) { \
                .code = BPF_ALU64 | BPF_MOV | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = 0, .imm = 0, \
        })

#define BPF_MOV32_REG(DST, SRC) \
        ((struct bpf_insn) { \
                .code = BPF_ALU | BPF_MOV | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = 0, .imm = 0, \
        })

/* Internal-only mov used to resolve per-CPU addresses:
 *   dst_reg = src_reg + <percpu_base_off>
 * It is an ordinary 64-bit register mov whose insn->off carries the
 * special marker value BPF_ADDR_PERCPU.
 */
#define BPF_ADDR_PERCPU        (-1)

#define BPF_MOV64_PERCPU_REG(DST, SRC) \
        ((struct bpf_insn) { \
                .code = BPF_ALU64 | BPF_MOV | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = BPF_ADDR_PERCPU, .imm = 0, \
        })

/* Return true when @insn is a 64-bit register-to-register mov whose off
 * field holds the BPF_ADDR_PERCPU marker.
 */
static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU64 | BPF_MOV | BPF_X))
                return false;

        return insn->off == BPF_ADDR_PERCPU;
}

/* Immediate mov, dst_reg = imm32 (64-bit and 32-bit forms). */

#define BPF_MOV64_IMM(DST, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_ALU64 | BPF_MOV | BPF_K, \
                .dst_reg = (DST), .src_reg = 0, \
                .off = 0, .imm = (IMM), \
        })

#define BPF_MOV32_IMM(DST, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_ALU | BPF_MOV | BPF_K, \
                .dst_reg = (DST), .src_reg = 0, \
                .off = 0, .imm = (IMM), \
        })

/* Sign-extending mov, dst_reg = (s8,s16,s32)src_reg.
 * OFF is stored in insn->off.
 */

#define BPF_MOVSX64_REG(DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_ALU64 | BPF_MOV | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })

#define BPF_MOVSX32_REG(DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_ALU | BPF_MOV | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })

/* Special mov32 used for explicit zero extension of DST: source and
 * destination are the same register and imm is set to 1.
 */
#define BPF_ZEXT_REG(DST) \
        ((struct bpf_insn) { \
                .code = BPF_ALU | BPF_MOV | BPF_X, \
                .dst_reg = (DST), .src_reg = (DST), \
                .off = 0, .imm = 1, \
        })

/* Return true when @insn is a 32-bit register mov with imm == 1, i.e. the
 * encoding BPF_ZEXT_REG() produces.
 */
static inline bool insn_is_zext(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU | BPF_MOV | BPF_X))
                return false;

        return insn->imm == 1;
}

/* An addr_space_cast from as(0) to as(1) converts bpf arena pointers to
 * pointers in the user vma. Such an insn is a 64-bit register mov with
 * off == BPF_ADDR_SPACE_CAST and imm == 1U << 16.
 */
static inline bool insn_is_cast_user(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU64 | BPF_MOV | BPF_X))
                return false;
        if (insn->off != BPF_ADDR_SPACE_CAST)
                return false;

        return insn->imm == (1U << 16);
}

/* Raw form of the 'load 64-bit immediate' insn. It expands to TWO
 * comma-separated struct bpf_insn initializers: the first carries the
 * low 32 bits of IMM, the second (opcode 0) carries the high 32 bits.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_LD | BPF_DW | BPF_IMM, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = 0, .imm = (__u32) (IMM), \
        }), \
        ((struct bpf_insn) { \
                .code = 0, /* zero is reserved opcode */ \
                .dst_reg = 0, .src_reg = 0, \
                .off = 0, .imm = ((__u64) (IMM)) >> 32, \
        })

/* BPF_LD_IMM64 encodes a single 'load 64-bit immediate' insn (src_reg 0). */
#define BPF_LD_IMM64(DST, IMM)  BPF_LD_IMM64_RAW(DST, 0, IMM)

/* Pseudo BPF_LD_IMM64 insn used to refer to a process-local map_fd. */
#define BPF_LD_MAP_FD(DST, MAP_FD) \
        BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)

/* Mov whose source kind is chosen by TYPE:
 * BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32.
 */

#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = 0, .imm = (IMM), \
        })

#define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = 0, .imm = (IMM), \
        })

/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */

#define BPF_LD_ABS(SIZE, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
                .dst_reg = 0, .src_reg = 0, \
                .off = 0, .imm = (IMM), \
        })

/* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */

#define BPF_LD_IND(SIZE, SRC, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \
                .dst_reg = 0, .src_reg = (SRC), \
                .off = 0, .imm = (IMM), \
        })

/* Memory load, dst_reg = *(uint *) (src_reg + off16) */

#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })

/* Sign-extending memory load, dst_reg = *(signed size *) (src_reg + off16) */

#define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })

/* Memory store from register, *(uint *) (dst_reg + off16) = src_reg */

#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })


/*
 * Atomic operations. OP is stored in insn->imm and selects one of:
 *
 *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
 *   BPF_AND                  *(uint *) (dst_reg + off16) &= src_reg
 *   BPF_OR                   *(uint *) (dst_reg + off16) |= src_reg
 *   BPF_XOR                  *(uint *) (dst_reg + off16) ^= src_reg
 *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
 *   BPF_AND | BPF_FETCH      src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
 *   BPF_OR | BPF_FETCH       src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
 *   BPF_XOR | BPF_FETCH      src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
 *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
 *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
 *   BPF_LOAD_ACQ             dst_reg = smp_load_acquire(src_reg + off16)
 *   BPF_STORE_REL            smp_store_release(dst_reg + off16, src_reg)
 */

#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = (OP), \
        })

/* Legacy alias: atomic add without fetch. */
#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \
        BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)

/* Memory store of an immediate, *(uint *) (dst_reg + off16) = imm32 */

#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
                .dst_reg = (DST), .src_reg = 0, \
                .off = (OFF), .imm = (IMM), \
        })

/* Conditional jump against a register:
 * if (dst_reg 'op' src_reg) goto pc + off16
 */

#define BPF_JMP_REG(OP, DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })

/* Conditional jump against an immediate:
 * if (dst_reg 'op' imm32) goto pc + off16
 */

#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
                .dst_reg = (DST), .src_reg = 0, \
                .off = (OFF), .imm = (IMM), \
        })

/* Same as BPF_JMP_REG, but comparing 32-bit wide operands. */

#define BPF_JMP32_REG(OP, DST, SRC, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = 0, \
        })

/* Same as BPF_JMP_IMM, but comparing 32-bit wide operands. */

#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \
        ((struct bpf_insn) { \
                .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \
                .dst_reg = (DST), .src_reg = 0, \
                .off = (OFF), .imm = (IMM), \
        })

/* Unconditional jump with a 16-bit offset, goto pc + off16 */

#define BPF_JMP_A(OFF) \
        ((struct bpf_insn) { \
                .code = BPF_JMP | BPF_JA, \
                .dst_reg = 0, .src_reg = 0, \
                .off = (OFF), .imm = 0, \
        })

/* Unconditional jump with a 32-bit offset, gotol pc + imm32 */

#define BPF_JMP32_A(IMM) \
        ((struct bpf_insn) { \
                .code = BPF_JMP32 | BPF_JA, \
                .dst_reg = 0, .src_reg = 0, \
                .off = 0, .imm = (IMM), \
        })

/* Relative call: src_reg is BPF_PSEUDO_CALL and TGT goes into imm. */

#define BPF_CALL_REL(TGT) \
        ((struct bpf_insn) { \
                .code = BPF_JMP | BPF_CALL, \
                .dst_reg = 0, .src_reg = BPF_PSEUDO_CALL, \
                .off = 0, .imm = (TGT), \
        })

/* Convert a function address to a BPF immediate: its distance from
 * __bpf_call_base.
 */

#define BPF_CALL_IMM(x)        ((void *)(x) - (void *)__bpf_call_base)

/* Call to FUNC, encoded through BPF_CALL_IMM(). */

#define BPF_EMIT_CALL(FUNC) \
        ((struct bpf_insn) { \
                .code = BPF_JMP | BPF_CALL, \
                .dst_reg = 0, .src_reg = 0, \
                .off = 0, .imm = BPF_CALL_IMM(FUNC), \
        })

/* Kfunc call: src_reg is BPF_PSEUDO_KFUNC_CALL, OFF and IMM are stored
 * as given.
 */

#define BPF_CALL_KFUNC(OFF, IMM) \
        ((struct bpf_insn) { \
                .code = BPF_JMP | BPF_CALL, \
                .dst_reg = 0, .src_reg = BPF_PSEUDO_KFUNC_CALL, \
                .off = (OFF), .imm = (IMM), \
        })

/* Raw instruction: every field is supplied by the caller. */

#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
        ((struct bpf_insn) { \
                .code = (CODE), \
                .dst_reg = (DST), .src_reg = (SRC), \
                .off = (OFF), .imm = (IMM), \
        })

/* Program exit: all operand fields are zero. */

#define BPF_EXIT_INSN() \
        ((struct bpf_insn) { \
                .code = BPF_JMP | BPF_EXIT, \
                .dst_reg = 0, .src_reg = 0, \
                .off = 0, .imm = 0, \
        })

/* Speculation barrier: all operand fields are zero. */

#define BPF_ST_NOSPEC() \
        ((struct bpf_insn) { \
                .code = BPF_ST | BPF_NOSPEC, \
                .dst_reg = 0, .src_reg = 0, \
                .off = 0, .imm = 0, \
        })

/* Internal classic blocks for direct assignment: wrap the BPF_STMT() and
 * BPF_JUMP() initializers in a (struct sock_filter) cast, turning them
 * into compound literals that can be assigned directly.
 */

#define __BPF_STMT(CODE, K)                                        \
        ((struct sock_filter) BPF_STMT(CODE, K))

#define __BPF_JUMP(CODE, K, JT, JF)                                \
        ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF))

/*
 * bytes_to_bpf_size - map an access width in bytes to a BPF size code
 * @bytes: width in bytes
 *
 * Evaluates to BPF_B, BPF_H, BPF_W or BPF_DW when @bytes equals
 * sizeof(u8), sizeof(u16), sizeof(u32) or sizeof(u64) respectively, and
 * to -EINVAL for any other width.
 *
 * @bytes is parenthesized at every use so that a low-precedence argument
 * (e.g. a conditional expression) is compared as a whole. The temporary
 * has a double-underscore name so that an argument mentioning a caller
 * variable called "bpf_size" is not captured by it.
 *
 * @bytes may be evaluated up to four times; do not pass an expression
 * with side effects.
 */
#define bytes_to_bpf_size(bytes) \
({ \
        int __bpf_sz = -EINVAL; \
 \
        if ((bytes) == sizeof(u8)) \
                __bpf_sz = BPF_B; \
        else if ((bytes) == sizeof(u16)) \
                __bpf_sz = BPF_H; \
        else if ((bytes) == sizeof(u32)) \
                __bpf_sz = BPF_W; \
        else if ((bytes) == sizeof(u64)) \
                __bpf_sz = BPF_DW; \
 \
        __bpf_sz; \
})

/*
 * bpf_size_to_bytes - map a BPF size code to an access width in bytes
 * @bpf_size: one of BPF_B, BPF_H, BPF_W or BPF_DW
 *
 * Evaluates to sizeof(u8), sizeof(u16), sizeof(u32) or sizeof(u64) for
 * BPF_B, BPF_H, BPF_W or BPF_DW respectively, and to -EINVAL for any
 * other value.
 *
 * @bpf_size is parenthesized at every use so that a low-precedence
 * argument (e.g. a conditional expression) is compared as a whole. The
 * temporary has a double-underscore name so that an argument mentioning
 * a caller variable called "bytes" is not captured by it.
 *
 * @bpf_size may be evaluated up to four times; do not pass an expression
 * with side effects.
 */
#define bpf_size_to_bytes(bpf_size) \
({ \
        int __bpf_bytes = -EINVAL; \
 \
        if ((bpf_size) == BPF_B) \
                __bpf_bytes = sizeof(u8); \
        else if ((bpf_size) == BPF_H) \
                __bpf_bytes = sizeof(u16); \
        else if ((bpf_size) == BPF_W) \
                __bpf_bytes = sizeof(u32); \
        else if ((bpf_size) == BPF_DW) \
                __bpf_bytes = sizeof(u64); \
 \
        __bpf_bytes; \
})

/* BPF size code for sizeof(type), as computed by bytes_to_bpf_size().
 * BUILD_BUG_ON() rejects a negative result, i.e. a type whose size
 * bytes_to_bpf_size() does not map to a size code.
 */
#define BPF_SIZEOF(type)                                        \
        ({                                                        \
                const int __size = bytes_to_bpf_size(sizeof(type)); \
                BUILD_BUG_ON(__size < 0);                        \
                __size;                                                \
        })

/* Same as BPF_SIZEOF(), but for the size of member 'field' of 'type'
 * (taken with sizeof_field()).
 */
#define BPF_FIELD_SIZEOF(type, field)                                \
        ({                                                        \
                const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \
                BUILD_BUG_ON(__size < 0);                        \
                __size;                                                \
        })

/* Access width in bytes encoded in the size bits of insn->code, via
 * bpf_size_to_bytes(). A negative result triggers WARN_ON() and is still
 * the value of the expression.
 */
#define BPF_LDST_BYTES(insn)                                        \
        ({                                                        \
                const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \
                WARN_ON(__size < 0);                                \
                __size;                                                \
        })

/* __BPF_MAP_<n>(m, v, t1, a1, t2, a2, ...): expand to m(t, a) for each of
 * the first n (type, name) pairs, separated by commas. __BPF_MAP_0 has no
 * pair to map and expands to the placeholder v instead.
 */
#define __BPF_MAP_0(m, v, ...) v
#define __BPF_MAP_1(m, v, t, a, ...) m(t, a)
#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__)
#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__)
#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__)
#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__)

/* __BPF_REG_<n>(...): map the first n pairs like __BPF_MAP_<n>, then append
 * __BPF_PAD(5 - n), so the expansion always has five entries.
 */
#define __BPF_REG_0(...) __BPF_PAD(5)
#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4)
#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3)
#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2)
#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1)
#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__)

/* Select the <n> variant by pasting the literal count n onto the name. */
#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__)
#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__)

#define __BPF_CAST(t, a)                                                       \
        (__force t)                                                               \
        (__force                                                               \
         typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long),      \
                                      (unsigned long)0, (t)0))) a
#define __BPF_V void
#define __BPF_N

#define __BPF_DECL_ARGS(t, a) t   a
#define __BPF_DECL_REGS(t, a) u64 a

#define __BPF_PAD(n)                                                               \
        __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2,       \
                  u64, __ur_3, u64, __ur_4, u64, __ur_5)

/* BPF_CALL_x(x, attr, name, t1, a1, ...) emits, for a helper taking x
 * typed arguments:
 *
 *  - a forward declaration of the always-inline worker ____name() with
 *    the typed argument list;
 *  - a typedef btf_name for a pointer to a function with that typed
 *    signature returning u64;
 *  - a declaration and definition of "attr u64 name(...)", whose five
 *    parameters are all u64 (the x named ones plus __BPF_PAD() fillers)
 *    and which calls ____name() through a btf_name cast, converting each
 *    named argument with __BPF_CAST();
 *  - the header of the ____name() definition; the function body is the
 *    block that follows the macro invocation.
 */
#define BPF_CALL_x(x, attr, name, ...)                                               \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__));   \
        typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));    \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))     \
        {                                                                       \
                return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
        }                                                                       \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__))

/* BPF_CALL_<n>(): BPF_CALL_x() for n arguments with an empty attribute. */
#define __NOATTR
#define BPF_CALL_0(name, ...)        BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_1(name, ...)        BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_2(name, ...)        BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_3(name, ...)        BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_4(name, ...)        BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_5(name, ...)        BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__)

/* One-argument variant whose outer function carries the notrace attribute. */
#define NOTRACE_BPF_CALL_1(name, ...)        BPF_CALL_x(1, notrace, name, __VA_ARGS__)

/* Case-range helpers ("first ... last") over byte offsets inside TYPE.
 *
 * bpf_ctx_range_till() spans from the first byte of MEMBER1 to the last
 * byte of MEMBER2; bpf_ctx_range() is the single-member case.
 * bpf_ctx_range_ptr() spans MEMBER itself when BITS_PER_LONG is 64 and a
 * fixed 8 bytes starting at MEMBER otherwise.
 */
#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \
        offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1
#define bpf_ctx_range(TYPE, MEMBER) \
        bpf_ctx_range_till(TYPE, MEMBER, MEMBER)
#if BITS_PER_LONG != 64
# define bpf_ctx_range_ptr(TYPE, MEMBER) \
        offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1
#else
# define bpf_ctx_range_ptr(TYPE, MEMBER) \
        bpf_ctx_range(TYPE, MEMBER)
#endif /* BITS_PER_LONG != 64 */

/* Evaluate to offsetof(TYPE, MEMBER). As a side effect, SIZE is stored
 * through PTR_SIZE; the build fails if SIZE differs from the actual size
 * of the member.
 */
#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE)                                \
        ({                                                                        \
                BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE));                \
                *(PTR_SIZE) = (SIZE);                                                \
                offsetof(TYPE, MEMBER);                                                \
        })

/* Compat layout of a {len, filter} program descriptor: the filter pointer
 * is carried as a compat_uptr_t. A struct sock_filter is architecture
 * independent.
 */
struct compat_sock_fprog {
        u16                len;        /* presumably the number of filter entries - confirm */
        compat_uptr_t        filter;        /* struct sock_filter * */
};

/* A {len, filter} program descriptor whose filter is a plain pointer to
 * struct sock_filter.
 */
struct sock_fprog_kern {
        u16                        len;        /* presumably the number of filter entries - confirm */
        struct sock_filter        *filter;
};

/* Some arches need doubleword alignment for their instructions and/or data */
#define BPF_IMAGE_ALIGNMENT 8

/* Header followed by a variable-length image. The image[] flexible array
 * is aligned to BPF_IMAGE_ALIGNMENT bytes.
 */
struct bpf_binary_header {
        u32 size;        /* NOTE(review): unit and what it covers are not visible here */
        u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};

/* Run-time statistics of a BPF program. __bpf_prog_run() updates the
 * current CPU's instance (this_cpu_ptr(prog->stats)) inside a
 * u64_stats update section on syncp, and only while
 * bpf_stats_enabled_key is enabled.
 */
struct bpf_prog_stats {
        u64_stats_t cnt;        /* incremented once per run by __bpf_prog_run() */
        u64_stats_t nsecs;        /* accumulated sched_clock() delta of the runs */
        u64_stats_t misses;        /* not updated by __bpf_prog_run() */
        struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/* A counter paired with a timestamp.
 * NOTE(review): no user of this struct is visible in this part of the
 * file; confirm the meaning and units of both fields against the user.
 */
struct bpf_timed_may_goto {
        u64 count;
        u64 timestamp;
};

/* A reference-counted wrapper around a BPF program pointer. */
struct sk_filter {
        refcount_t        refcnt;
        struct rcu_head        rcu;        /* presumably used for RCU-deferred freeing - confirm */
        struct bpf_prog        *prog;
};

/* Static key tested by __bpf_prog_run(): when enabled, every run is timed
 * and accounted in the program's per-CPU bpf_prog_stats.
 */
DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

/* Function-pointer hook for struct access checks, declared next to a mutex.
 * NOTE(review): presumably nf_conn_btf_access_lock protects updates of
 * nfct_btf_struct_access - confirm against the definition.
 */
extern struct mutex nf_conn_btf_access_lock;
extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
                                     const struct bpf_reg_state *reg,
                                     int off, int size);

/* Dispatcher signature: receives the context, the instruction array and
 * the program's entry function, and returns the program's result.
 * __bpf_prog_run() calls it with prog->insnsi and prog->bpf_func.
 */
typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
                                          const struct bpf_insn *insnsi,
                                          unsigned int (*bpf_func)(const void *,
                                                                   const struct bpf_insn *));

/* Run @prog on @ctx through the dispatcher @dfunc and return its result.
 *
 * When bpf_stats_enabled_key is on, the run is timed with sched_clock()
 * and the current CPU's bpf_prog_stats gets its run count incremented and
 * the elapsed time added, inside an irqsave u64_stats update section.
 * The caller must be in a context where migration is not possible
 * (checked by cant_migrate()).
 */
static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
                                          const void *ctx,
                                          bpf_dispatcher_fn dfunc)
{
        u32 retval;

        cant_migrate();
        if (static_branch_unlikely(&bpf_stats_enabled_key)) {
                struct bpf_prog_stats *pcpu_stats;
                unsigned long irq_flags;
                u64 t_start, elapsed;

                t_start = sched_clock();
                retval = dfunc(ctx, prog->insnsi, prog->bpf_func);
                elapsed = sched_clock() - t_start;

                pcpu_stats = this_cpu_ptr(prog->stats);
                irq_flags = u64_stats_update_begin_irqsave(&pcpu_stats->syncp);
                u64_stats_inc(&pcpu_stats->cnt);
                u64_stats_add(&pcpu_stats->nsecs, elapsed);
                u64_stats_update_end_irqrestore(&pcpu_stats->syncp, irq_flags);
        } else {
                retval = dfunc(ctx, prog->insnsi, prog->bpf_func);
        }
        return retval;
}

/* Run @prog on @ctx via __bpf_prog_run(), passing bpf_dispatcher_nop_func
 * as the dispatcher, and return the program's u32 result.
 */
static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
{
        return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
}

/*
 * For callers in preemptible, and therefore migratable, context: keeps
 * the whole execution of the BPF program on one CPU.
 *
 * migrate_disable/enable() are used explicitly to document that invoking
 * a BPF program needs no reentrancy protection against a BPF program
 * invoked from a preempting task.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
                                          const void *ctx)
{
        u32 retval;

        migrate_disable();
        retval = bpf_prog_run(prog, ctx);
        migrate_enable();

        return retval;
}

/*
 * Size in bytes of the cb scratch area that __bpf_prog_run_save_cb() and
 * bpf_prog_run_clear_cb() save/clear; bpf_skb_cb() checks at build time that
 * it equals sizeof_field(struct __sk_buff, cb).
 */
#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

/*
 * Layout that the helpers in this header overlay on skb->cb (they cast
 * skb->cb to this type). data_meta and data_end are filled in by
 * bpf_compute_data_pointers().
 */
struct bpf_skb_data_end {
        struct qdisc_skb_cb qdisc_cb;
        void *data_meta;
        void *data_end;
};

/*
 * Next-hop parameters embedded in struct bpf_redirect_info.
 * NOTE(review): presumably nh_family selects which member of the anonymous
 * union is valid - confirm against the users of this struct.
 */
struct bpf_nh_params {
        u32 nh_family;
        union {
                u32 ipv4_nh;
                struct in6_addr ipv6_nh;
        };
};

/*
 * flags for bpf_redirect_info kern_flags
 *
 * The *_INIT bits record lazy initialisation done by the bpf_net_ctx_get_*()
 * accessors: RI_INIT is set once bpf_net_ctx_get_ri() has zeroed the
 * redirect info, and each *_MAP_INIT bit is set once the matching flush
 * list head has been initialised.
 */
#define BPF_RI_F_RF_NO_DIRECT        BIT(0)        /* no napi_direct on return_frame */
#define BPF_RI_F_RI_INIT        BIT(1)
#define BPF_RI_F_CPU_MAP_INIT        BIT(2)
#define BPF_RI_F_DEV_MAP_INIT        BIT(3)
#define BPF_RI_F_XSK_MAP_INIT        BIT(4)

/*
 * Redirect state kept inside struct bpf_net_context.
 *
 * Field order matters: bpf_net_ctx_get_ri() zeroes everything that precedes
 * @nh on first use, so @nh and @kern_flags are not cleared by that memset.
 * @kern_flags holds the BPF_RI_F_* bits.
 */
struct bpf_redirect_info {
        u64 tgt_index;
        void *tgt_value;
        struct bpf_map *map;
        u32 flags;
        u32 map_id;
        enum bpf_map_type map_type;
        struct bpf_nh_params nh;
        u32 kern_flags;
};

/*
 * Per-task BPF networking context, reachable through
 * current->bpf_net_context. The three list heads are initialised lazily by
 * their bpf_net_ctx_get_*_flush_list() accessors; ri.kern_flags tracks which
 * ones are initialised.
 */
struct bpf_net_context {
        struct bpf_redirect_info ri;
        struct list_head cpu_map_flush_list;
        struct list_head dev_map_flush_list;
        struct list_head xskmap_map_flush_list;
};

/*
 * Install @bpf_net_ctx as the current task's BPF net context.
 *
 * Returns @bpf_net_ctx on success, after resetting its ri.kern_flags to 0.
 * Returns NULL, leaving everything untouched, if the task already has a
 * context installed.
 */
static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
{
        struct task_struct *task = current;

        if (task->bpf_net_context)
                return NULL;

        /* Fresh context: no lazy-init bit is valid yet. */
        bpf_net_ctx->ri.kern_flags = 0;
        task->bpf_net_context = bpf_net_ctx;

        return bpf_net_ctx;
}

/*
 * Detach the current task's BPF net context. A NULL @bpf_net_ctx (what
 * bpf_net_ctx_set() returns when a context was already installed) makes
 * this a no-op.
 */
static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx)
{
        if (!bpf_net_ctx)
                return;

        current->bpf_net_context = NULL;
}

/* Return the current task's BPF net context pointer as stored (may be NULL). */
static inline struct bpf_net_context *bpf_net_ctx_get(void)
{
        return current->bpf_net_context;
}

static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
{
        struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();

        if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) {
                memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));
                bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT;
        }

        return &bpf_net_ctx->ri;
}

/*
 * Return the cpu_map flush list of the current task's BPF net context,
 * initialising the list head on first use (tracked by
 * BPF_RI_F_CPU_MAP_INIT).
 */
static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *head = &ctx->cpu_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)
                return head;

        INIT_LIST_HEAD(head);
        ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT;

        return head;
}

/*
 * Return the dev_map flush list of the current task's BPF net context,
 * initialising the list head on first use (tracked by
 * BPF_RI_F_DEV_MAP_INIT).
 */
static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *head = &ctx->dev_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)
                return head;

        INIT_LIST_HEAD(head);
        ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT;

        return head;
}

/*
 * Return the xskmap flush list of the current task's BPF net context,
 * initialising the list head on first use (tracked by
 * BPF_RI_F_XSK_MAP_INIT).
 */
static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *head = &ctx->xskmap_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)
                return head;

        INIT_LIST_HEAD(head);
        ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT;

        return head;
}

/*
 * Report which flush lists of the current task's BPF net context are in use.
 *
 * Each output pointer is set to the matching list head only if that list
 * has been initialised (its *_MAP_INIT flag is set) and is non-empty;
 * otherwise it is set to NULL. With CONFIG_BPF_SYSCALL disabled all three
 * outputs are NULL, and the xsk list is only reported when
 * CONFIG_XDP_SOCKETS is enabled.
 */
static inline void bpf_net_ctx_get_all_used_flush_lists(struct list_head **lh_map,
                                                        struct list_head **lh_dev,
                                                        struct list_head **lh_xsk)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        const u32 inited = ctx->ri.kern_flags;

        *lh_xsk = NULL;
        *lh_dev = NULL;
        *lh_map = NULL;

        if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
                return;

        if ((inited & BPF_RI_F_DEV_MAP_INIT) &&
            !list_empty(&ctx->dev_map_flush_list))
                *lh_dev = &ctx->dev_map_flush_list;

        if ((inited & BPF_RI_F_CPU_MAP_INIT) &&
            !list_empty(&ctx->cpu_map_flush_list))
                *lh_map = &ctx->cpu_map_flush_list;

        if (IS_ENABLED(CONFIG_XDP_SOCKETS) &&
            (inited & BPF_RI_F_XSK_MAP_INIT) &&
            !list_empty(&ctx->xskmap_map_flush_list))
                *lh_xsk = &ctx->xskmap_map_flush_list;
}

/* Record the linear packet data range [data, data_end) - plus the start
 * of the metadata area - in the skb's control block, where the various
 * program types (cls_bpf, act_bpf, lwt, ...) pick it up. Subsystems
 * allowing direct data access must (!) ensure that the cb[] area can be
 * written to when the BPF program is invoked (otherwise a cb[]
 * save/restore is necessary).
 */
static inline void bpf_compute_data_pointers(struct sk_buff *skb)
{
        struct bpf_skb_data_end *ptrs = (struct bpf_skb_data_end *)skb->cb;

        /* The overlay must fit into skb->cb. */
        BUILD_BUG_ON(sizeof(*ptrs) > sizeof_field(struct sk_buff, cb));

        ptrs->data_meta = skb->data - skb_metadata_len(skb);
        ptrs->data_end = skb->data + skb_headlen(skb);
}

/*
 * Run @prog on @skb with freshly computed data pointers in skb->cb, then
 * put back the data_meta/data_end values that were in the control block
 * before the call. Returns the program's return value.
 */
static inline int bpf_prog_run_data_pointers(
        const struct bpf_prog *prog,
        struct sk_buff *skb)
{
        struct bpf_skb_data_end *ptrs = (struct bpf_skb_data_end *)skb->cb;
        void *orig_meta = ptrs->data_meta;
        void *orig_end = ptrs->data_end;
        int verdict;

        bpf_compute_data_pointers(skb);
        verdict = bpf_prog_run(prog, skb);

        /* Undo the cb changes made for the duration of the run. */
        ptrs->data_meta = orig_meta;
        ptrs->data_end = orig_end;

        return verdict;
}

/* Similar to bpf_compute_data_pointers(), except that only cb->data_end
 * is recomputed, and its original value is handed back through
 * @saved_data_end so that bpf_restore_data_end() can restore it later.
 */
static inline void bpf_compute_and_save_data_end(
        struct sk_buff *skb, void **saved_data_end)
{
        struct bpf_skb_data_end *ptrs = (struct bpf_skb_data_end *)skb->cb;

        *saved_data_end = ptrs->data_end;
        ptrs->data_end = skb->data + skb_headlen(skb);
}

/* Write @saved_data_end, as obtained from bpf_compute_and_save_data_end(),
 * back into the data_end slot of the skb's control block.
 */
static inline void bpf_restore_data_end(
        struct sk_buff *skb, void *saved_data_end)
{
        ((struct bpf_skb_data_end *)skb->cb)->data_end = saved_data_end;
}

/* Return the cb scratch area exposed to BPF programs for @skb, i.e. the
 * data[] array of the skb's qdisc_skb_cb.
 */
static inline u8 *bpf_skb_cb(const struct sk_buff *skb)
{
        /* eBPF programs may read/write skb->cb[] area to transfer meta
         * data between tail calls. Since this also needs to work with
         * tc, that scratch memory is mapped to qdisc_skb_cb's data area.
         *
         * In some socket filter cases, the cb unfortunately needs to be
         * saved/restored so that protocol specific skb->cb[] data won't
         * be lost. In any case, due to unprivileged eBPF programs
         * attached to sockets, we need to clear the bpf_skb_cb() area
         * to not leak previous contents to user space.
         */
        /* Build-time guarantee that all three views of the area agree in size. */
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN);
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) !=
                     sizeof_field(struct qdisc_skb_cb, data));

        return qdisc_skb_cb(skb)->data;
}

/*
 * Run @prog on the skb passed as @ctx. If the program accesses cb
 * (prog->cb_access), the BPF cb scratch area is stashed and zeroed before
 * the run and put back afterwards, so the program neither sees nor
 * clobbers the previous contents.
 *
 * Must be invoked with migration disabled.
 */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                         const void *ctx)
{
        const struct sk_buff *skb = ctx;
        u8 stash[BPF_SKB_CB_LEN];
        u8 *scratch = bpf_skb_cb(skb);
        u32 verdict;

        if (unlikely(prog->cb_access)) {
                memcpy(stash, scratch, BPF_SKB_CB_LEN);
                memset(scratch, 0, BPF_SKB_CB_LEN);
        }

        verdict = bpf_prog_run(prog, skb);

        if (unlikely(prog->cb_access))
                memcpy(scratch, stash, BPF_SKB_CB_LEN);

        return verdict;
}

/*
 * Migration-safe wrapper around __bpf_prog_run_save_cb(): disables
 * migration for the duration of the run and returns the program's result.
 */
static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                       struct sk_buff *skb)
{
        u32 verdict;

        migrate_disable();
        verdict = __bpf_prog_run_save_cb(prog, skb);
        migrate_enable();

        return verdict;
}

/*
 * Run @prog on @skb pinned to one CPU. If the program accesses cb, the BPF
 * cb scratch area is zeroed first; unlike bpf_prog_run_save_cb() the
 * previous contents are not restored afterwards.
 */
static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
                                        struct sk_buff *skb)
{
        u8 *scratch = bpf_skb_cb(skb);

        if (unlikely(prog->cb_access))
                memset(scratch, 0, BPF_SKB_CB_LEN);

        return bpf_prog_run_pin_on_cpu(prog, skb);
}

/*
 * XDP-related declarations; the macros and the functions are defined
 * elsewhere. NOTE(review): DECLARE_BPF_DISPATCHER(xdp) presumably declares
 * the "xdp" dispatcher symbols - confirm against the macro definition.
 */
DECLARE_BPF_DISPATCHER(xdp)

DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);

u32 xdp_master_redirect(struct xdp_buff *xdp);

void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);

/* Size in bytes of @prog's instructions: prog->len * sizeof(struct bpf_insn). */
static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
{
        return prog->len * sizeof(struct bpf_insn);
}

/*
 * Bytes needed for a struct bpf_prog carrying @proglen entries in its
 * trailing insns[] array: the offset just past the last entry, but never
 * less than sizeof(struct bpf_prog).
 */
static inline unsigned int bpf_prog_size(unsigned int proglen)
{
        return max(sizeof(struct bpf_prog),
                   offsetof(struct bpf_prog, insns[proglen]));
}

/* Return true when @prog's type is BPF_PROG_TYPE_UNSPEC (see below for why
 * that identifies a converted classic BPF program).
 */
static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
{
        /* When classic BPF programs have been loaded and the arch
         * does not have a classic BPF JIT (anymore), they have been
         * converted via bpf_migrate_filter() to eBPF and thus always
         * have an unspec program type.
         */
        return prog->type == BPF_PROG_TYPE_UNSPEC;
}

/*
 * Clamp @size to the machine word size (sizeof(unsigned long)) when it is
 * a larger, exact multiple of the word size; any other @size is returned
 * unchanged.
 */
static inline u32 bpf_ctx_off_adjust_machine(u32 size)
{
        const u32 word = sizeof(unsigned long);
        const bool multi_word = size > word && !(size % word);

        return multi_word ? word : size;
}

/*
 * A narrow context access is acceptable when @size does not exceed
 * @size_default and has at most one bit set (so 0 passes as well).
 * @off is not consulted.
 */
static inline bool
bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
{
        if (size > size_default)
                return false;

        return !(size & (size - 1));
}

/*
 * Byte offset of a narrow access inside its @size_default wide field.
 * On little-endian builds this is simply @off modulo @size_default (for a
 * power-of-two @size_default); otherwise the offset is mirrored from the
 * other end of the field.
 */
static inline u8
bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
{
        const u8 within = off & (size_default - 1);

#ifndef __LITTLE_ENDIAN
        return size_default - (within + size);
#else
        return within;
#endif
}

/*
 * True when an access of @size bytes at offset @off is a naturally aligned
 * __u64-sized access lying entirely inside @type.@field.
 *
 * @off and @size are parenthesized in the expansion so that expression
 * arguments (e.g. "base + 4") are evaluated as a whole; note that @off is
 * still evaluated more than once.
 */
#define bpf_ctx_wide_access_ok(off, size, type, field)                        \
        ((size) == sizeof(__u64) &&                                        \
        (off) >= offsetof(type, field) &&                                \
        (off) + sizeof(__u64) <= offsetofend(type, field) &&                \
        (off) % sizeof(__u64) == 0)

/*
 * Size in bytes of the filter array of a classic BPF program descriptor:
 * number of instructions times the size of one filter entry. @fprog is
 * parenthesized so that pointer expressions such as "p + 1" or "&prog"
 * expand correctly.
 */
#define bpf_classic_proglen(fprog) ((fprog)->len * sizeof((fprog)->filter[0]))

/*
 * Make the pages holding @fp read-only, unless the program is JITed.
 *
 * With CONFIG_BPF_JIT_ALWAYS_ON this is a no-op returning 0. Otherwise a
 * non-JITed program gets set_vm_flush_reset_perms() followed by
 * set_memory_ro() over fp->pages pages, and the result of set_memory_ro()
 * is returned. JITed programs return 0.
 */
static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp)
{
#ifdef CONFIG_BPF_JIT_ALWAYS_ON
        return 0;
#else
        if (fp->jited)
                return 0;

        set_vm_flush_reset_perms(fp);
        return set_memory_ro((unsigned long)fp, fp->pages);
#endif
}

/*
 * Switch the JIT image described by @hdr to read-only + executable via
 * set_memory_rox(), covering hdr->size >> PAGE_SHIFT pages, after calling
 * set_vm_flush_reset_perms() on it. Returns the set_memory_rox() result.
 */
static inline int __must_check
bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
        const unsigned long start = (unsigned long)hdr;

        set_vm_flush_reset_perms(hdr);

        return set_memory_rox(start, hdr->size >> PAGE_SHIFT);
}

/*
 * Prototype only; implemented outside this header. sk_filter() and
 * sk_filter_reason() below both call it with @cap == 1.
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap,
                       enum skb_drop_reason *reason);

static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
        enum skb_drop_reason ignore_reason;

        return sk_filter_trim_cap(sk, skb, 1, &ignore_reason);
}

/*
 * Same as sk_filter(), but the drop reason reported by
 * sk_filter_trim_cap() is passed back to the caller through @reason.
 */
static inline int sk_filter_reason(struct sock *sk, struct sk_buff *skb,
                                   enum skb_drop_reason *reason)
{
        return sk_filter_trim_cap(sk, skb, 1, reason);
}

/*
 * Program lifecycle prototypes (runtime selection, allocation, line-info
 * bookkeeping, freeing). Declarations only; the implementations are not
 * part of this header.
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
void bpf_prog_free(struct bpf_prog *fp);

bool bpf_opcode_in_insntable(u8 code);

void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
void bpf_prog_jit_attempt_done(struct bpf_prog *prog);

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags);
void __bpf_prog_free(struct bpf_prog *fp);

/*
 * Free @fp. Despite the name, the visible body performs no unlock step; it
 * only forwards to __bpf_prog_free().
 */
static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
{
        __bpf_prog_free(fp);
}

/*
 * Callback type taking a classic filter array and its length; it is the
 * type of the @trans argument of bpf_prog_create_from_user().
 * NOTE(review): presumably an extra validation hook - confirm at the
 * implementation.
 */
typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
                                       unsigned int flen);

/* Classic-filter program creation/destruction (prototypes only). */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig);
void bpf_prog_destroy(struct bpf_prog *fp);

/* Socket filter attach/detach/query (prototypes only). */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
void sk_reuseport_prog_free(struct bpf_prog *prog);
int sk_detach_filter(struct sock *sk);
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);

/* Socket filter memory accounting (prototypes only). */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);

u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/*
 * __bpf_call_base reinterpreted (through a void * cast) as a function that
 * takes one additional trailing "const struct bpf_insn *" argument.
 */
#define __bpf_call_base_args \
        ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
         (void *)__bpf_call_base)

/*
 * JIT entry points and capability queries. Prototypes only.
 * NOTE(review): the bpf_jit_supports_*() predicates presumably report
 * optional features of the architecture's JIT backend - confirm at the
 * definitions.
 */
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
bool bpf_jit_inlines_helper_call(s32 imm);
bool bpf_jit_supports_subprog_tailcalls(void);
bool bpf_jit_supports_percpu_insn(void);
bool bpf_jit_supports_kfunc_call(void);
bool bpf_jit_supports_far_kfunc_call(void);
bool bpf_jit_supports_exceptions(void);
bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
bool bpf_jit_supports_private_stack(void);
bool bpf_jit_supports_timed_may_goto(void);
u64 bpf_arch_uaddress_limit(void);
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
u64 arch_bpf_timed_may_goto(void);
u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *);
bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id);

/* Whether raw dumps are permitted for @cred; this is exactly the answer of
 * kallsyms_show_value(@cred).
 */
static inline bool bpf_dump_raw_ok(const struct cred *cred)
{
        /* Reconstruction of call-sites is dependent on kallsyms,
         * thus apply the same restriction to dumps.
         */
        return kallsyms_show_value(cred);
}

/* Instruction patching/removal helpers (prototypes only). */
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);

static inline bool xdp_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
}

static inline void xdp_set_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
}

static inline void xdp_clear_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
}

/*
 * Check whether a packet of @pktlen bytes may be forwarded to @fwd.
 *
 * Returns -ENETDOWN if the device is not IFF_UP, -EMSGSIZE if @pktlen
 * exceeds mtu + hard_header_len + VLAN_HLEN of the device, 0 otherwise.
 */
static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
                                 unsigned int pktlen)
{
        unsigned int max_len;

        if (unlikely(!(fwd->flags & IFF_UP)))
                return -ENETDOWN;

        max_len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;

        return pktlen > max_len ? -EMSGSIZE : 0;
}

/* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the
 * same cpu context. Further for best results no more than a single map
 * for the do_redirect/do_flush pair should be used. This limitation is
 * because we only track one map and force a flush when the map changes.
 * This does not appear to be a real limitation for existing software.
 *
 * Prototypes only; the implementations are outside this header.
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp, const struct bpf_prog *prog);
int xdp_do_redirect(struct net_device *dev,
                    struct xdp_buff *xdp,
                    const struct bpf_prog *prog);
int xdp_do_redirect_frame(struct net_device *dev,
                          struct xdp_buff *xdp,
                          struct xdp_frame *xdpf,
                          const struct bpf_prog *prog);
void xdp_do_flush(void);

/* NOTE(review): presumably emits a warning for an unexpected XDP action
 * value @act - confirm at the implementation.
 */
void bpf_warn_invalid_xdp_action(const struct net_device *dev,
                                 const struct bpf_prog *prog, u32 act);

/*
 * bpf_run_sk_reuseport(): real implementation (defined elsewhere) when
 * CONFIG_INET is enabled; otherwise an inline stub that always returns
 * NULL.
 */
#ifdef CONFIG_INET
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  struct sock *migrating_sk,
                                  u32 hash);
#else
static inline struct sock *
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                     struct bpf_prog *prog, struct sk_buff *skb,
                     struct sock *migrating_sk,
                     u32 hash)
{
        return NULL;
}
#endif

#ifdef CONFIG_BPF_JIT
/*
 * JIT tunables, defined elsewhere. The inline helpers in this header read
 * bpf_jit_enable, bpf_jit_harden and bpf_jit_kallsyms;
 * NOTE(review): presumably all of these are sysctl-backed - confirm.
 */
extern int bpf_jit_enable;
extern int bpf_jit_harden;
extern int bpf_jit_kallsyms;
extern long bpf_jit_limit;
extern long bpf_jit_limit_max;

/* Callback that fills @size bytes at @area; passed to the allocators below. */
typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);

void bpf_jit_fill_hole_with_zero(void *area, unsigned int size);

/* JIT image allocation/free helpers (prototypes only). */
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_jit_binary_free(struct bpf_binary_header *hdr);
u64 bpf_jit_alloc_exec_limit(void);
void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);
struct bpf_binary_header *
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);

void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_prog_pack_free(void *ptr, u32 size);

static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
        return list_empty(&fp->aux->ksym.lnode) ||
               fp->aux->ksym.lnode.prev == LIST_POISON2;
}

/*
 * "pack" variants of the JIT image helpers; they take separate ro/rw
 * header arguments. Prototypes only.
 */
struct bpf_binary_header *
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image,
                          unsigned int alignment,
                          struct bpf_binary_header **rw_hdr,
                          u8 **rw_image,
                          bpf_jit_fill_hole_t bpf_fill_ill_insns);
int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
                                 struct bpf_binary_header *rw_header);
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
                              struct bpf_binary_header *rw_header);

/* A stub returning -ENOTSUPP is provided when CONFIG_BPF_JIT is off. */
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke);

int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed);

const char *bpf_jit_get_prog_name(struct bpf_prog *prog);

struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);

/*
 * Log a one-line summary of a JIT pass (instruction count, image length,
 * pass number, image pointer, current task name and pid) at error level
 * and, if @image is non-NULL, hex-dump @proglen bytes of it.
 */
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
                                u32 pass, void *image)
{
        pr_err("flen=%u proglen=%u pass=%u image=%p from=%s pid=%d\n", flen,
               proglen, pass, image, current->comm, task_pid_nr(current));

        if (!image)
                return;

        print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
                       16, 1, image, proglen, false);
}

/* Compile-time answer: true iff the build has CONFIG_HAVE_EBPF_JIT. */
static inline bool bpf_jit_is_ebpf(void)
{
# ifndef CONFIG_HAVE_EBPF_JIT
        return false;
# else
        return true;
# endif
}

static inline bool ebpf_jit_enabled(void)
{
        return bpf_jit_enable && bpf_jit_is_ebpf();
}

static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return fp->jited && bpf_jit_is_ebpf();
}

/*
 * Decide whether constant blinding applies to @prog.
 *
 * Blinding requires an eBPF JIT build, a program that requested JIT and a
 * non-zero bpf_jit_harden. With bpf_jit_harden == 1, programs whose token
 * is capable of CAP_BPF are exempt.
 */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        /* Prerequisites first, so that a call from outside the expected
         * paths bails out instead of blinding.
         */
        if (!bpf_jit_is_ebpf() || !prog->jit_requested || !bpf_jit_harden)
                return false;

        return !(bpf_jit_harden == 1 &&
                 bpf_token_capable(prog->aux->token, CAP_BPF));
}

/*
 * kallsyms exposure of JITed programs is on only when hardening is off
 * (bpf_jit_harden == 0) and bpf_jit_kallsyms is exactly 1; every other
 * combination, e.g. any hardening level, disables it.
 */
static inline bool bpf_jit_kallsyms_enabled(void)
{
        return !bpf_jit_harden && bpf_jit_kallsyms == 1;
}

/*
 * Symbol/address lookup for BPF programs (prototypes only). Inline stubs
 * with the same names are provided when CONFIG_BPF_JIT is off.
 */
int __bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym);
bool is_bpf_text_address(unsigned long addr);
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym);
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);

/*
 * Wrapper around __bpf_address_lookup() that additionally reports "no
 * module": when the lookup returns non-zero and @modname is non-NULL,
 * *@modname is set to NULL. Returns the __bpf_address_lookup() result.
 */
static inline int
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        int rc = __bpf_address_lookup(addr, size, off, sym);

        if (!rc)
                return rc;

        if (modname)
                *modname = NULL;

        return rc;
}

/* Prototypes only; empty inline stubs exist for the !CONFIG_BPF_JIT case. */
void bpf_prog_kallsyms_add(struct bpf_prog *fp);
void bpf_prog_kallsyms_del(struct bpf_prog *fp);

#else /* CONFIG_BPF_JIT */

/*
 * Stubs used when CONFIG_BPF_JIT is disabled. Each one mirrors the name and
 * signature of its CONFIG_BPF_JIT counterpart and returns a fixed value.
 */

/* No JIT built in: never enabled. */
static inline bool ebpf_jit_enabled(void)
{
        return false;
}

/* No JIT built in: blinding never applies. */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        return false;
}

/* No JIT built in: no program is eBPF-jited. */
static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return false;
}

/* Poke descriptors are unavailable without a JIT: always -ENOTSUPP. */
static inline int
bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                            struct bpf_jit_poke_descriptor *poke)
{
        return -ENOTSUPP;
}

/* Without a JIT, freeing a program is just bpf_prog_unlock_free(). */
static inline void bpf_jit_free(struct bpf_prog *fp)
{
        bpf_prog_unlock_free(fp);
}

/* No JIT built in: kallsyms exposure is never enabled. */
static inline bool bpf_jit_kallsyms_enabled(void)
{
        return false;
}

/* Always returns 0; the output arguments are left untouched. */
static inline int
__bpf_address_lookup(unsigned long addr, unsigned long *size,
                     unsigned long *off, char *sym)
{
        return 0;
}

/* No address is BPF text without a JIT. */
static inline bool is_bpf_text_address(unsigned long addr)
{
        return false;
}

/* There are no BPF kallsyms to enumerate: always -ERANGE. */
static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
                                  char *type, char *sym)
{
        return -ERANGE;
}

/* No program can be found by address: always NULL. */
static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
        return NULL;
}

/* Always returns 0; unlike the JIT variant, *modname is never written. */
static inline int
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        return 0;
}

/* Nothing to register without a JIT. */
static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
}

/* Nothing to unregister without a JIT. */
static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
}

#endif /* CONFIG_BPF_JIT */

/* Declared regardless of CONFIG_BPF_JIT; implemented elsewhere. */
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp);

/*
 * Marker bit that bpf_anc_helper() ORs into the code it returns for
 * ancillary loads; incoming filter codes must not have it set already.
 */
#define BPF_ANC                BIT(15)

/*
 * Inspect the first instruction of a classic filter and report whether
 * the caller has to clear A beforehand.
 *
 * Returns false for "ret #k", "ld #len" and for absolute loads other than
 * the ALU_XOR_X ancillary load; returns true for everything else.
 */
static inline bool bpf_needs_clear_a(const struct sock_filter *first)
{
        switch (first->code) {
        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
                /* Only the XOR-with-X ancillary load needs a cleared A. */
                return first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X;

        case BPF_RET | BPF_K:
        case BPF_LD | BPF_W | BPF_LEN:
                return false;

        default:
                return true;
        }
}

/*
 * Map a classic BPF instruction to the code used for it internally.
 *
 * For absolute loads (BPF_LD | {W,H,B} | BPF_ABS) whose k equals
 * SKF_AD_OFF + SKF_AD_<X> for one of the extensions listed below, the
 * result is BPF_ANC | SKF_AD_<X>. For every other instruction, including
 * absolute loads with any other k, ftest->code is returned unchanged.
 *
 * BUG()s if the incoming code already has the BPF_ANC bit set.
 */
static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
{
        BUG_ON(ftest->code & BPF_ANC);

        switch (ftest->code) {
        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
/* Expands to one case label of the inner switch plus its return statement.
 * The macro is not #undef'ed within this function.
 */
#define BPF_ANCILLARY(CODE)        case SKF_AD_OFF + SKF_AD_##CODE:        \
                                return BPF_ANC | SKF_AD_##CODE
                switch (ftest->k) {
                BPF_ANCILLARY(PROTOCOL);
                BPF_ANCILLARY(PKTTYPE);
                BPF_ANCILLARY(IFINDEX);
                BPF_ANCILLARY(NLATTR);
                BPF_ANCILLARY(NLATTR_NEST);
                BPF_ANCILLARY(MARK);
                BPF_ANCILLARY(QUEUE);
                BPF_ANCILLARY(HATYPE);
                BPF_ANCILLARY(RXHASH);
                BPF_ANCILLARY(CPU);
                BPF_ANCILLARY(ALU_XOR_X);
                BPF_ANCILLARY(VLAN_TAG);
                BPF_ANCILLARY(VLAN_TAG_PRESENT);
                BPF_ANCILLARY(PAY_OFFSET);
                BPF_ANCILLARY(RANDOM);
                BPF_ANCILLARY(VLAN_TPID);
                }
                /* k matched no ancillary offset: treat as an ordinary load. */
                fallthrough;
        default:
                return ftest->code;
        }
}

/* Prototype only; implemented outside this header.
 * NOTE(review): presumably resolves negative load offsets @k to a pointer
 * into @skb data - confirm at the implementation.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
                                           int k, unsigned int size);

/* Report the ancillary-extension limit known to this kernel: SKF_AD_MAX. */
static inline int bpf_tell_extensions(void)
{
        return SKF_AD_MAX;
}

/*
 * NOTE(review): presumably the kernel-side context handed to sock_addr
 * programs - confirm against the users of this struct.
 */
struct bpf_sock_addr_kern {
        struct sock *sk;
        struct sockaddr *uaddr;
        /* Temporary "register" to make indirect stores to nested structures
         * defined above. We need three registers to make such a store, but
         * only two (src and dst) are available at convert_ctx_access time
         */
        u64 tmp_reg;
        void *t_ctx;        /* Attach type specific context. */
        u32 uaddrlen;
};

/*
 * NOTE(review): presumably the kernel-side context handed to sock_ops
 * programs - confirm against the users of this struct.
 *
 * Layout rule (see the comment on @temp): only the fields placed before
 * @temp are zero-initialised before the BPF program is called. @args,
 * @reply and @replylong share storage through the anonymous union.
 */
struct bpf_sock_ops_kern {
        struct        sock *sk;
        union {
                u32 args[4];
                u32 reply;
                u32 replylong[4];
        };
        struct sk_buff        *syn_skb;
        struct sk_buff        *skb;
        void        *skb_data_end;
        u8        op;
        u8        is_fullsock;
        u8        is_locked_tcp_sock;
        u8        remaining_opt_len;
        u64        temp;                        /* temp and everything after is not
                                         * initialized to 0 before calling
                                         * the BPF program. New fields that
                                         * should be initialized to 0 should
                                         * be inserted before temp.
                                         * temp is scratch storage used by
                                         * sock_ops_convert_ctx_access
                                         * as temporary storage of a register.
                                         */
};

/*
 * NOTE(review): presumably the kernel-side context handed to sysctl
 * programs, with cur_val/cur_len and new_val/new_len describing the
 * current and the to-be-written value - confirm against the users.
 */
struct bpf_sysctl_kern {
        struct ctl_table_header *head;
        const struct ctl_table *table;
        void *cur_val;
        size_t cur_len;
        void *new_val;
        size_t new_len;
        int new_updated;
        int write;
        loff_t *ppos;
        /* Temporary "register" for indirect stores to ppos. */
        u64 tmp_reg;
};

/* Size in bytes of the fixed buffer embedded in struct bpf_sockopt_buf. */
#define BPF_SOCKOPT_KERN_BUF_SIZE        32
/* Fixed-size byte buffer of BPF_SOCKOPT_KERN_BUF_SIZE bytes.
 * NOTE(review): presumably used to avoid an allocation for short option
 * values - confirm at the users.
 */
struct bpf_sockopt_buf {
        u8                data[BPF_SOCKOPT_KERN_BUF_SIZE];
};

/*
 * NOTE(review): presumably the kernel-side context handed to sockopt
 * programs; optval/optval_end look like the bounds of the option buffer -
 * confirm against the users of this struct.
 */
struct bpf_sockopt_kern {
        struct sock        *sk;
        u8                *optval;
        u8                *optval_end;
        s32                level;
        s32                optname;
        s32                optlen;
        /* for retval in struct bpf_cg_run_ctx */
        struct task_struct *current_task;
        /* Temporary "register" for indirect stores to ppos.
         * NOTE(review): this struct has no ppos member; the wording matches
         * the comment in struct bpf_sysctl_kern and looks copied from there.
         * Confirm what tmp_reg is actually used for here.
         */
        u64                tmp_reg;
};

/* Prototype only; implemented outside this header. */
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);

/*
 * Context passed to BPF sk_lookup programs by bpf_sk_lookup_run_v4().
 *
 * The caller fills in the address/port/ifindex fields; @selected_sk and
 * @no_reuseport are outputs that BPF_PROG_SK_LOOKUP_RUN_ARRAY() carries
 * from one program to the next and that the caller reads back on SK_PASS.
 * Note the declared types: @sport is __be16 while @dport is a plain u16.
 */
struct bpf_sk_lookup_kern {
        u16                family;
        u16                protocol;
        __be16                sport;
        u16                dport;
        struct {
                __be32 saddr;
                __be32 daddr;
        } v4;
        struct {
                const struct in6_addr *saddr;
                const struct in6_addr *daddr;
        } v6;
        struct sock        *selected_sk;
        u32                ingress_ifindex;
        bool                no_reuseport;
};

/* Static key defined elsewhere.
 * NOTE(review): presumably enabled while any sk_lookup program is attached -
 * confirm at the definition.
 */
extern struct static_key_false bpf_sk_lookup_enabled;

/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup.
 *
 * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and
 * SK_DROP. Their meaning is as follows:
 *
 *  SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result
 *  SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup
 *  SK_DROP                           : terminate lookup with -ECONNREFUSED
 *
 * This macro aggregates return values and selected sockets from
 * multiple BPF programs according to following rules in order:
 *
 *  1. If any program returned SK_PASS and a non-NULL ctx.selected_sk,
 *     macro result is SK_PASS and last ctx.selected_sk is used.
 *  2. If any program returned SK_DROP return value,
 *     macro result is SK_DROP.
 *  3. Otherwise result is SK_PASS and ctx.selected_sk is NULL.
 *
 * Caller must ensure that the prog array is non-NULL, and that the
 * array as well as the programs it contains remain valid.
 */
#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func)                        \
        ({                                                                \
                struct bpf_sk_lookup_kern *_ctx = &(ctx);                \
                struct bpf_prog_array_item *_item;                        \
                struct sock *_selected_sk = NULL;                        \
                bool _no_reuseport = false;                                \
                struct bpf_prog *_prog;                                        \
                bool _all_pass = true;                                        \
                u32 _ret;                                                \
                                                                        \
                migrate_disable();                                        \
                _item = &(array)->items[0];                                \
                while ((_prog = READ_ONCE(_item->prog))) {                \
                        /* restore most recent selection */                \
                        _ctx->selected_sk = _selected_sk;                \
                        _ctx->no_reuseport = _no_reuseport;                \
                                                                        \
                        _ret = func(_prog, _ctx);                        \
                        if (_ret == SK_PASS && _ctx->selected_sk) {        \
                                /* remember last non-NULL socket */        \
                                _selected_sk = _ctx->selected_sk;        \
                                _no_reuseport = _ctx->no_reuseport;        \
                        } else if (_ret == SK_DROP && _all_pass) {        \
                                _all_pass = false;                        \
                        }                                                \
                        _item++;                                        \
                }                                                        \
                _ctx->selected_sk = _selected_sk;                        \
                _ctx->no_reuseport = _no_reuseport;                        \
                migrate_enable();                                        \
                _all_pass || _selected_sk ? SK_PASS : SK_DROP;                \
         })

/*
 * Run the sk_lookup BPF programs attached to @net for an IPv4 lookup.
 *
 * The program array is fetched and executed under rcu_read_lock(). On return
 * *@psk holds ctx.selected_sk when the combined verdict is SK_PASS,
 * ERR_PTR(-ECONNREFUSED) for any other verdict, or NULL when no program
 * array is attached.
 *
 * Returns ctx.no_reuseport on SK_PASS, false otherwise.
 */
static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol,
                                        const __be32 saddr, const __be16 sport,
                                        const __be32 daddr, const u16 dport,
                                        const int ifindex, struct sock **psk)
{
        struct sock *selected_sk = NULL;
        bool no_reuseport = false;
        struct bpf_prog_array *progs;

        rcu_read_lock();
        progs = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (progs) {
                struct bpf_sk_lookup_kern ctx = {
                        .family                = AF_INET,
                        .protocol        = protocol,
                        .v4.saddr        = saddr,
                        .v4.daddr        = daddr,
                        .sport                = sport,
                        .dport                = dport,
                        .ingress_ifindex        = ifindex,
                };
                u32 verdict;

                verdict = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run);
                if (verdict != SK_PASS) {
                        /* Anything but SK_PASS refuses the connection. */
                        selected_sk = ERR_PTR(-ECONNREFUSED);
                } else {
                        selected_sk = ctx.selected_sk;
                        no_reuseport = ctx.no_reuseport;
                }
        }
        rcu_read_unlock();

        *psk = selected_sk;
        return no_reuseport;
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Run the sk_lookup BPF programs attached to @net for an IPv6 lookup.
 *
 * Mirrors the IPv4 variant, except that the context carries the @saddr and
 * @daddr pointers rather than address values. On return *@psk holds
 * ctx.selected_sk when the combined verdict is SK_PASS,
 * ERR_PTR(-ECONNREFUSED) for any other verdict, or NULL when no program
 * array is attached.
 *
 * Returns ctx.no_reuseport on SK_PASS, false otherwise.
 */
static inline bool bpf_sk_lookup_run_v6(const struct net *net, int protocol,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 dport,
                                        const int ifindex, struct sock **psk)
{
        struct sock *selected_sk = NULL;
        bool no_reuseport = false;
        struct bpf_prog_array *progs;

        rcu_read_lock();
        progs = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (progs) {
                struct bpf_sk_lookup_kern ctx = {
                        .family                = AF_INET6,
                        .protocol        = protocol,
                        .v6.saddr        = saddr,
                        .v6.daddr        = daddr,
                        .sport                = sport,
                        .dport                = dport,
                        .ingress_ifindex        = ifindex,
                };
                u32 verdict;

                verdict = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run);
                if (verdict != SK_PASS) {
                        /* Anything but SK_PASS refuses the connection. */
                        selected_sk = ERR_PTR(-ECONNREFUSED);
                } else {
                        selected_sk = ctx.selected_sk;
                        no_reuseport = ctx.no_reuseport;
                }
        }
        rcu_read_unlock();

        *psk = selected_sk;
        return no_reuseport;
}
#endif /* IS_ENABLED(CONFIG_IPV6) */

/*
 * Common body of the XDP redirect-map helpers.
 *
 * @map:         map to redirect through
 * @index:       key handed to @lookup_elem
 * @flags:       caller flags; the low bits covered by the XDP action mask
 *               double as the return code when the lookup fails
 * @flag_mask:   additional flag bits accepted besides the action bits
 * @lookup_elem: map-type specific lookup callback
 *
 * Returns XDP_ABORTED for unknown flag bits, the action encoded in @flags
 * when the lookup fails without BPF_F_BROADCAST, and XDP_REDIRECT otherwise
 * after recording the target in the per-context bpf_redirect_info.
 */
static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
                                                   u64 flags, const u64 flag_mask,
                                                   void *lookup_elem(struct bpf_map *map, u32 key))
{
        const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        const bool broadcast = !!(flags & BPF_F_BROADCAST);

        /* Lower bits of the flags are used as return code on lookup failure */
        if (unlikely(flags & ~(action_mask | flag_mask)))
                return XDP_ABORTED;

        ri->tgt_value = lookup_elem(map, index);
        if (unlikely(!ri->tgt_value) && !broadcast) {
                /* A failed lookup wipes the redirect state so that, when an
                 * eBPF program performs several lookups, the last one always
                 * takes precedence.
                 */
                ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
                ri->map_type = BPF_MAP_TYPE_UNSPEC;
                return flags & action_mask;
        }

        ri->tgt_index = index;
        ri->map_id = map->id;
        ri->map_type = map->map_type;

        /* Only a broadcast redirect keeps the map pointer and the flags. */
        WRITE_ONCE(ri->map, broadcast ? map : NULL);
        ri->flags = broadcast ? flags : 0;

        return XDP_REDIRECT;
}

#ifdef CONFIG_NET
int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
                          u32 len, u64 flags);
int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
                      void *buf, unsigned long len, bool flush);
void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset);
#else /* CONFIG_NET */
/* !CONFIG_NET stub: nothing is copied to @to, always fails with -EOPNOTSUPP. */
static inline int
__bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NET stub: @skb is left untouched, always fails with -EOPNOTSUPP. */
static inline int
__bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
                      u32 len, u64 flags)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NET stub: nothing is copied to @buf, always fails with -EOPNOTSUPP. */
static inline int
__bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NET stub: @xdp is left untouched, always fails with -EOPNOTSUPP. */
static inline int
__bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NET stub: there is no packet data to point into, so return NULL. */
static inline void *
bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
{
        return NULL;
}

/* !CONFIG_NET stub: intentionally does nothing. */
static inline void
bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf,
                 unsigned long len, bool flush)
{
}

/*
 * !CONFIG_NET stub: reports the missing support as an error pointer
 * (ERR_PTR(-EOPNOTSUPP)) rather than NULL.
 */
static inline void *
bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
{
        return ERR_PTR(-EOPNOTSUPP);
}
#endif /* CONFIG_NET */

#endif /* __LINUX_FILTER_H__ */







































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the UDP module.
 *
 * Version:        @(#)udp.h        1.0.2        05/07/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *                Alan Cox        : Turned on udp checksums. I don't want to
 *                                  chase 'memory corruption' bugs that aren't!
 */
#ifndef _UDP_H
#define _UDP_H

#include <linux/list.h>
#include <linux/bug.h>
#include <net/inet_sock.h>
#include <net/gso.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/indirect_call_wrapper.h>

/**
 *        struct udp_skb_cb  -  UDP(-Lite) private variables
 *
 *        @header:      private variables used by IPv4/IPv6; a union of the
 *                      IPv4 block (h4) and, when CONFIG_IPV6 is enabled,
 *                      the IPv6 block (h6)
 *        @cscov:       checksum coverage length (UDP-Lite only)
 *        @partial_cov: if set indicates partial csum coverage
 */
struct udp_skb_cb {
        union {
                struct inet_skb_parm        h4;
#if IS_ENABLED(CONFIG_IPV6)
                struct inet6_skb_parm        h6;
#endif
        } header;
        __u16                cscov;
        __u8                partial_cov;
};
/* View the control buffer (->cb) of @__skb as a struct udp_skb_cb. */
#define UDP_SKB_CB(__skb)        ((struct udp_skb_cb *)((__skb)->cb))

/**
 *        struct udp_hslot - UDP hash slot used by udp_table.hash/hash4
 *
 *        @head:        head of list of sockets
 *        @nulls_head:        head of list of sockets, only used by hash4
 *        @count:        number of sockets in 'head' list
 *        @lock:        spinlock protecting changes to head/count
 *
 *        @head and @nulls_head share storage (anonymous union). The whole
 *        slot is aligned to 2 * sizeof(long).
 */
struct udp_hslot {
        union {
                struct hlist_head        head;
                /* hash4 uses hlist_nulls to avoid moving wrongly onto another
                 * hlist, because rehash() can happen with lookup().
                 */
                struct hlist_nulls_head        nulls_head;
        };
        int                        count;
        spinlock_t                lock;
} __aligned(2 * sizeof(long));

/**
 *        struct udp_hslot_main - UDP hash slot used by udp_table.hash2
 *
 *        @hslot:        basic hash slot
 *        @hash4_cnt: number of sockets in hslot4 of the same
 *                    (local port, local address); only present when
 *                    CONFIG_BASE_SMALL is disabled
 */
struct udp_hslot_main {
        struct udp_hslot        hslot; /* must be the first member */
#if !IS_ENABLED(CONFIG_BASE_SMALL)
        u32                        hash4_cnt;
#endif
} __aligned(2 * sizeof(long));
/*
 * Cast a struct udp_hslot pointer to its enclosing struct udp_hslot_main.
 * This relies on hslot being the first member.
 */
#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot))

/**
 *        struct udp_table - UDP table
 *
 *        @hash:        hash table, sockets are hashed on (local port)
 *        @hash2:        hash table, sockets are hashed on (local port, local address)
 *        @hash4:        hash table, connected sockets are hashed on
 *                (local port, local address, remote port, remote address);
 *                only present when CONFIG_BASE_SMALL is disabled
 *        @mask:        number of slots in hash tables, minus 1
 *        @log:        log2(number of slots in hash table)
 */
struct udp_table {
        struct udp_hslot        *hash;
        struct udp_hslot_main        *hash2;
#if !IS_ENABLED(CONFIG_BASE_SMALL)
        struct udp_hslot        *hash4;
#endif
        unsigned int                mask;
        unsigned int                log;
};
extern struct udp_table udp_table;
void udp_table_init(struct udp_table *, const char *);
/*
 * Return the slot of the primary (local port) hash table for port @num in
 * @net. The slot index is computed by udp_hashfn() from the table mask.
 */
static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
                                             const struct net *net,
                                             unsigned int num)
{
        struct udp_hslot *slots = table->hash;

        return slots + udp_hashfn(net, num, table->mask);
}

/*
 * For secondary hash, net_hash_mix() is performed before calling
 * udp_hashslot2(), this explains difference with udp_hashslot()
 */
static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
                                              unsigned int hash)
{
        return &table->hash2[hash & table->mask].hslot;
}

#if IS_ENABLED(CONFIG_BASE_SMALL)
/* CONFIG_BASE_SMALL: there is no hash4 table, so there is nothing to set up. */
static inline void udp_table_hash4_init(struct udp_table *table)
{
        /* intentionally empty */
}

/*
 * CONFIG_BASE_SMALL: there is no hash4 table. The body is guarded by
 * BUILD_BUG(), so this helper is not meant to be reachable in such builds;
 * the NULL return only satisfies the signature.
 */
static inline struct udp_hslot *
udp_hashslot4(struct udp_table *table, unsigned int hash)
{
        BUILD_BUG();
        return NULL;
}

/* CONFIG_BASE_SMALL: without a hash4 table no socket is ever hashed in it. */
static inline bool udp_hashed4(const struct sock *sk)
{
        return false;
}

/* CONFIG_BASE_SMALL: hash4 slots do not exist, so they take no space. */
static inline unsigned int udp_hash4_slot_size(void)
{
        return 0;
}

/* CONFIG_BASE_SMALL: no hash4 accounting exists, always reports false. */
static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
{
        return false;
}

/* CONFIG_BASE_SMALL: no hash4 counter to increment. */
static inline void udp_hash4_inc(struct udp_hslot *hslot2)
{
        /* intentionally empty */
}

/* CONFIG_BASE_SMALL: no hash4 counter to decrement. */
static inline void udp_hash4_dec(struct udp_hslot *hslot2)
{
        /* intentionally empty */
}
#else /* !CONFIG_BASE_SMALL */

/*
 * Set up the hash4 table of @table.
 *
 * Must be called with table->hash2 initialized: hash4 is placed directly
 * behind the (mask + 1) hash2 slots. Every hash4 slot gets an empty nulls
 * list whose nulls value is the slot index, a zero count and an initialized
 * lock; the per-slot hash4 counters in hash2 are reset to zero.
 */
static inline void udp_table_hash4_init(struct udp_table *table)
{
        int i;

        table->hash4 = (void *)(table->hash2 + (table->mask + 1));

        for (i = 0; i <= table->mask; i++) {
                table->hash2[i].hash4_cnt = 0;

                table->hash4[i].count = 0;
                INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i);
                spin_lock_init(&table->hash4[i].lock);
        }
}

/* Return the hash4 slot selected by @hash (masked with the table mask). */
static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
                                              unsigned int hash)
{
        return table->hash4 + (hash & table->mask);
}

/* True when the socket's udp_lrpa_node is currently linked into a nulls list. */
static inline bool udp_hashed4(const struct sock *sk)
{
        if (hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node))
                return false;
        return true;
}

/* Size in bytes of one hash4 slot, i.e. of a struct udp_hslot. */
static inline unsigned int udp_hash4_slot_size(void)
{
        return sizeof(struct udp_hslot);
}

/* True when the hash4 counter of the hash2 slot @hslot2 is non-zero. */
static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
{
        return UDP_HSLOT_MAIN(hslot2)->hash4_cnt != 0;
}

/* Bump the hash4 counter kept in the hash2 slot @hslot2. */
static inline void udp_hash4_inc(struct udp_hslot *hslot2)
{
        UDP_HSLOT_MAIN(hslot2)->hash4_cnt += 1;
}

/* Drop the hash4 counter kept in the hash2 slot @hslot2 by one. */
static inline void udp_hash4_dec(struct udp_hslot *hslot2)
{
        UDP_HSLOT_MAIN(hslot2)->hash4_cnt -= 1;
}
#endif /* CONFIG_BASE_SMALL */

extern struct proto udp_prot;

DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);

/* sysctl variables for udp */
extern long sysctl_udp_mem[3];
extern int sysctl_udp_rmem_min;
extern int sysctl_udp_wmem_min;

struct sk_buff;

/*
 *        Generic checksumming routines for UDP(-Lite) v4 and v6
 */
/*
 * Complete the checksum of @skb. When the recorded coverage (cscov) equals
 * the whole skb length the full-packet helper is used, otherwise only the
 * first cscov bytes are checksummed.
 */
static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb)
{
        if (UDP_SKB_CB(skb)->cscov != skb->len)
                return __skb_checksum_complete_head(skb,
                                                    UDP_SKB_CB(skb)->cscov);

        return __skb_checksum_complete(skb);
}

/*
 * Returns 0 when skb_csum_unnecessary() says no check is needed or when
 * __udp_lib_checksum_complete() returns zero, and 1 when it returns a
 * non-zero value.
 */
static inline int udp_lib_checksum_complete(struct sk_buff *skb)
{
        if (skb_csum_unnecessary(skb))
                return 0;

        return __udp_lib_checksum_complete(skb) != 0;
}

/**
 *         udp_csum_outgoing  -  compute UDPv4/v6 checksum over fragments
 *         @sk:         socket we are writing to
 *         @skb:         sk_buff containing the filled-in UDP header
 *                 (checksum field must be zeroed out)
 */
static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb)
{
        __wsum csum = csum_partial(skb_transport_header(skb),
                                   sizeof(struct udphdr), 0);
        skb_queue_walk(&sk->sk_write_queue, skb) {
                csum = csum_add(csum, skb->csum);
        }
        return csum;
}

/*
 * Checksum of @skb: the UDP header folded into skb->csum, plus the csum of
 * every skb chained on its frag_list.
 */
static inline __wsum udp_csum(struct sk_buff *skb)
{
        __wsum sum = csum_partial(skb_transport_header(skb),
                                  sizeof(struct udphdr), skb->csum);
        struct sk_buff *frag = skb_shinfo(skb)->frag_list;

        while (frag) {
                sum = csum_add(sum, frag->csum);
                frag = frag->next;
        }

        return sum;
}

/*
 * Final UDPv4 checksum: csum_tcpudp_magic() over @saddr, @daddr, @len and
 * IPPROTO_UDP, seeded with the partial sum @base.
 */
static inline __sum16 udp_v4_check(int len, __be32 saddr,
                                   __be32 daddr, __wsum base)
{
        __sum16 check = csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base);

        return check;
}

void udp_set_csum(bool nocheck, struct sk_buff *skb,
                  __be32 saddr, __be32 daddr, int len);

/*
 * Pull the UDP header off @skb while keeping the checksum state coherent.
 *
 * For CHECKSUM_NONE packets whose checksum is not yet marked valid, the
 * header bytes are first folded into skb->csum. The header is then pulled
 * with skb_pull_rcsum() and the recorded coverage is shrunk by the header
 * size.
 */
static inline void udp_csum_pull_header(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid) {
                skb->csum = csum_partial(skb->data, sizeof(struct udphdr),
                                         skb->csum);
        }

        skb_pull_rcsum(skb, sizeof(struct udphdr));
        UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
}

typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport,
                                     __be16 dport);

void udp_v6_early_demux(struct sk_buff *skb);
INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));

struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
                                  netdev_features_t features, bool is_ipv6);

/*
 * Initialise the UDP specific part of @sk.
 *
 * Hooks up the drop counters, the reader queue and the tunnel list node,
 * sets the forward threshold to a quarter of sk_rcvbuf and flags the socket
 * as SOCK_CUSTOM_SOCKOPT. One producer queue entry per node id
 * (nr_node_ids) is then allocated and its llist head initialised.
 *
 * Returns 0 on success or -ENOMEM when the producer queues cannot be
 * allocated.
 */
static inline int udp_lib_init_sock(struct sock *sk)
{
        struct udp_sock *up = udp_sk(sk);
        int node;

        sk->sk_drop_counters = &up->drop_counters;
        skb_queue_head_init(&up->reader_queue);
        INIT_HLIST_NODE(&up->tunnel_list);
        up->forward_threshold = sk->sk_rcvbuf >> 2;
        set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);

        up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue),
                                     GFP_KERNEL);
        if (!up->udp_prod_queue)
                return -ENOMEM;

        for (node = 0; node < nr_node_ids; node++)
                init_llist_head(&up->udp_prod_queue[node].ll_root);

        return 0;
}

static inline void udp_drops_inc(struct sock *sk)
{
        numa_drop_add(&udp_sk(sk)->drop_counters, 1);
}

/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
/*
 * Placeholder hash routine: it must never be called, any call hits BUG().
 * The return statement only satisfies the int return type.
 */
static inline int udp_lib_hash(struct sock *sk)
{
        BUG();

        return 0;
}

void udp_lib_unhash(struct sock *sk);
void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4);
u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
                const __be32 faddr, const __be16 fport);

/* Close @sk via sk_common_release(); @timeout is not used. */
static inline void udp_lib_close(struct sock *sk, long timeout)
{
        sk_common_release(sk);
}

/* hash4 routines shared between UDPv4/6 */
#if IS_ENABLED(CONFIG_BASE_SMALL)
/* CONFIG_BASE_SMALL: no hash4 table, hashing is a no-op. */
static inline void udp_lib_hash4(struct sock *sk, u16 hash)
{
        /* intentionally empty */
}

/* CONFIG_BASE_SMALL: no hash4 table, hashing is a no-op. */
static inline void udp4_hash4(struct sock *sk)
{
        /* intentionally empty */
}
#else /* !CONFIG_BASE_SMALL */
void udp_lib_hash4(struct sock *sk, u16 hash);
void udp4_hash4(struct sock *sk);
#endif /* CONFIG_BASE_SMALL */

int udp_lib_get_port(struct sock *sk, unsigned short snum,
                     unsigned int hash2_nulladdr);

u32 udp_flow_hashrnd(void);

static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
                                       int min, int max, bool use_eth)
{
        u32 hash;

        if (min >= max) {
                /* Use default range */
                inet_get_local_port_range(net, &min, &max);
        }

        hash = skb_get_hash(skb);
        if (unlikely(!hash)) {
                if (use_eth) {
                        /* Can't find a normal hash, caller has indicated an
                         * Ethernet packet so use that to compute a hash.
                         */
                        hash = jhash(skb->data, 2 * ETH_ALEN,
                                     (__force u32) skb->protocol);
                } else {
                        /* Can't derive any sort of hash for the packet, set
                         * to some consistent random value.
                         */
                        hash = udp_flow_hashrnd();
                }
        }

        /* Since this is being sent on the wire obfuscate hash a bit
         * to minimize possibility that any useful information to an
         * attacker is leaked. Only upper 16 bits are relevant in the
         * computation for 16 bit port value.
         */
        hash ^= hash << 16;

        return htons((((u64) hash * (max - min)) >> 32) + min);
}

/*
 * Receive queue usage of @sk: the allocated receive memory minus the
 * socket's current forward_deficit (read with READ_ONCE()).
 */
static inline int udp_rqueue_get(struct sock *sk)
{
        int rmem = sk_rmem_alloc_get(sk);

        return rmem - READ_ONCE(udp_sk(sk)->forward_deficit);
}

/*
 * Check a socket's bound device against @dif/@sdif via inet_bound_dev_eq().
 *
 * With CONFIG_NET_L3_MASTER_DEV the l3mdev-accept argument follows the
 * per-netns udp_l3mdev_accept sysctl; without it the argument is always
 * true.
 */
static inline bool udp_sk_bound_dev_eq(const struct net *net, int bound_dev_if,
                                       int dif, int sdif)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        bool l3mdev_accept = !!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept);
#else
        bool l3mdev_accept = true;
#endif

        return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif);
}

/* net/ipv4/udp.c */
void udp_destruct_common(struct sock *sk);
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off,
                               int *err);
/*
 * Convenience wrapper around __skb_recv_udp() for callers that do not care
 * about the offset: a local offset starting at 0 is passed in and discarded.
 */
static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
                                           int *err)
{
        int offset = 0;

        return __skb_recv_udp(sk, flags, &offset, err);
}

enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb);
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
void udp_splice_eof(struct socket *sock);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, int *karg);
int udp_init_sock(struct sock *sk);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags);
int udp_disconnect(struct sock *sk, int flags);
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
                                       netdev_features_t features,
                                       bool is_ipv6);
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
                       char __user *optval, int __user *optlen);
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
                       sockptr_t optval, unsigned int optlen,
                       int (*push_pending_frames)(struct sock *));
struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
                             __be32 daddr, __be16 dport, int dif);
struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
                               __be16 sport,
                               __be32 daddr, __be16 dport, int dif, int sdif,
                               struct udp_table *tbl, struct sk_buff *skb);
struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
                                 __be16 sport, __be16 dport);
struct sock *udp6_lib_lookup(const struct net *net,
                             const struct in6_addr *saddr, __be16 sport,
                             const struct in6_addr *daddr, __be16 dport,
                             int dif);
struct sock *__udp6_lib_lookup(const struct net *net,
                               const struct in6_addr *saddr, __be16 sport,
                               const struct in6_addr *daddr, __be16 dport,
                               int dif, int sdif, struct udp_table *tbl,
                               struct sk_buff *skb);
struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
                                 __be16 sport, __be16 dport);
int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);

/* UDP uses skb->dev_scratch to cache as much information as possible and
 * avoid possibly multiple cache misses on dequeue().
 *
 * The len/is_linear/csum_unnecessary members only exist when
 * BITS_PER_LONG == 64; on other builds the udp_skb_*() accessors read the
 * skb itself instead.
 */
struct udp_dev_scratch {
        /* skb->truesize and the stateless bit are embedded in a single field;
         * do not use a bitfield since the compiler emits better/smaller code
         * this way
         */
        u32 _tsize_state;

#if BITS_PER_LONG == 64
        /* len and the bit needed to compute skb_csum_unnecessary
         * will be on cold cache lines at recvmsg time.
         * skb->len can be stored on 16 bits since the udp header has been
         * already validated and pulled.
         */
        u16 len;
        bool is_linear;
        bool csum_unnecessary;
#endif
};

/* View the dev_scratch storage of @skb as a struct udp_dev_scratch. */
static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb)
{
        void *scratch = &skb->dev_scratch;

        return scratch;
}

#if BITS_PER_LONG == 64
static inline unsigned int udp_skb_len(struct sk_buff *skb)
{
        return udp_skb_scratch(skb)->len;
}

static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
        return udp_skb_scratch(skb)->csum_unnecessary;
}

static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
        return udp_skb_scratch(skb)->is_linear;
}

#else
/* Non-64-bit: no cached copy exists, read the length from the skb itself. */
static inline unsigned int udp_skb_len(struct sk_buff *skb)
{
        unsigned int len = skb->len;

        return len;
}

/* Non-64-bit: no cached flag exists, ask skb_csum_unnecessary() directly. */
static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
        bool unnecessary = skb_csum_unnecessary(skb);

        return unnecessary;
}

/* Non-64-bit: no cached flag exists, derive it from skb_is_nonlinear(). */
static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb))
                return false;
        return true;
}
#endif

/*
 * Copy @len bytes starting at skb->data + @off into the iterator @to.
 * Returns 0 when copy_to_iter_full() succeeds, -EFAULT otherwise.
 */
static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
                                  struct iov_iter *to)
{
        if (!copy_to_iter_full(skb->data + off, len, to))
                return -EFAULT;

        return 0;
}

/*
 *         SNMP statistics for UDP and UDP-Lite
 */
/*
 * Increment @field in the UDP-Lite MIB of @net when @is_udplite is set,
 * in the UDP MIB otherwise (SNMP_INC_STATS flavour).
 */
#define UDP_INC_STATS(net, field, is_udplite)                                \
do {                                                                        \
        if (is_udplite)                                                        \
                SNMP_INC_STATS((net)->mib.udplite_statistics, field);        \
        else                                                                \
                SNMP_INC_STATS((net)->mib.udp_statistics, field);        \
} while (0)
/*
 * Increment @field in the UDP-Lite MIB of @net when @is_udplite is set,
 * in the UDP MIB otherwise (__SNMP_INC_STATS flavour).
 */
#define __UDP_INC_STATS(net, field, is_udplite)                                \
do {                                                                        \
        if (is_udplite)                                                        \
                __SNMP_INC_STATS((net)->mib.udplite_statistics, field);        \
        else                                                                \
                __SNMP_INC_STATS((net)->mib.udp_statistics, field);        \
} while (0)

/*
 * IPv6 counterpart of __UDP_INC_STATS: increment @field in the UDP-Lite
 * IPv6 MIB of @net when @is_udplite is set, in the UDP IPv6 MIB otherwise.
 */
#define __UDP6_INC_STATS(net, field, is_udplite)                        \
do {                                                                        \
        if (is_udplite)                                                        \
                __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);        \
        else                                                                \
                __SNMP_INC_STATS((net)->mib.udp_stats_in6, field);        \
} while (0)
/*
 * IPv6 counterpart of UDP_INC_STATS: increment @field in the UDP-Lite IPv6
 * MIB of @net when @__lite is set, in the UDP IPv6 MIB otherwise.
 */
#define UDP6_INC_STATS(net, field, __lite)                                \
do {                                                                        \
        if (__lite)                                                        \
                SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);        \
        else                                                                \
                SNMP_INC_STATS((net)->mib.udp_stats_in6, field);        \
} while (0)

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Select the MIB matching @sk: IPv4 vs IPv6 is chosen by @ipv4, UDP vs
 * UDP-Lite by IS_UDPLITE(@sk). The MIB is looked up in the socket's netns.
 *
 * @ipv4 is parenthesized so that an argument which is itself a conditional
 * or assignment expression is evaluated as a whole before the selection.
 */
#define __UDPX_MIB(sk, ipv4)                                                \
({                                                                        \
        (ipv4) ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :        \
                                   sock_net(sk)->mib.udp_statistics) :        \
                 (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 :        \
                                   sock_net(sk)->mib.udp_stats_in6);        \
})
#else
/*
 * Without CONFIG_IPV6 only the IPv4 MIBs exist: pick the UDP-Lite or the
 * UDP one for @sk from its netns. @ipv4 is accepted but not used.
 */
#define __UDPX_MIB(sk, ipv4)                                                \
({                                                                        \
        IS_UDPLITE(sk)                                                        \
                ? sock_net(sk)->mib.udplite_statistics                        \
                : sock_net(sk)->mib.udp_statistics;                        \
})
#endif

/*
 * Increment @field in the MIB that __UDPX_MIB() selects for @sk; the IPv4
 * MIBs are used when the socket family is AF_INET.
 */
#define __UDPX_INC_STATS(sk, field)                                        \
        __SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET),        \
                         field)

#ifdef CONFIG_PROC_FS
/*
 * Pairs an address family with the UDP table to use for it.
 * NOTE(review): presumably consumed by the /proc seq_file code declared
 * below (udp_seq_start() and friends) - confirm against the users.
 */
struct udp_seq_afinfo {
        sa_family_t                        family;
        struct udp_table                *udp_table;
};

/*
 * Iterator state: embeds the per-netns seq_file private data and a
 * bucket field.
 * NOTE(review): bucket presumably tracks the hash bucket currently being
 * walked - confirm against the udp_seq_*() implementation.
 */
struct udp_iter_state {
        struct seq_net_private  p;
        int                        bucket;
};

void *udp_seq_start(struct seq_file *seq, loff_t *pos);
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
void udp_seq_stop(struct seq_file *seq, void *v);

extern const struct seq_operations udp_seq_ops;
extern const struct seq_operations udp6_seq_ops;

int udp4_proc_init(void);
void udp4_proc_exit(void);
#endif /* CONFIG_PROC_FS */

int udpv4_offload_init(void);

void udp_init(void);

DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void);
void udp_encap_disable(void);
#if IS_ENABLED(CONFIG_IPV6)
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
void udpv6_encap_enable(void);
#endif

/*
 * Segment a GSO packet in the UDP receive path.
 *
 * @sk:   receiving socket, used for csum conversion policy and accounting
 * @skb:  aggregated packet; always consumed or freed by this function
 * @ipv4: selects the IPv4 or IPv6 MIB for error accounting
 *
 * Returns the list of segments on success. On failure the packet is freed,
 * the drops are added to the socket and to UDP_MIB_INERRORS, and NULL is
 * returned.
 */
static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
                                              struct sk_buff *skb, bool ipv4)
{
        netdev_features_t features = NETIF_F_SG;
        struct sk_buff *segs;
        int drop_count;

        if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
                /*
                 * Segmentation in UDP receive path is only for UDP GRO, drop
                 * udp fragmentation offload (UFO) packets.
                 */
                drop_count = 1;
        } else {
                /* Unless userspace explicitly asks for the final checksum
                 * values, let skb_segment skip the csum recalculation.
                 */
                if (!inet_get_convert_csum(sk))
                        features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;

                /* __udp_gso_segment expects CHECKSUM_PARTIAL or CHECKSUM_NONE
                 * packets. UDP GRO builds partial packets in
                 * udp_gro_complete_segment, and so does UDP GSO, as verified
                 * by udp_send_skb. When such packets are looped in
                 * dev_loopback_xmit, however, their ip_summed CHECKSUM_NONE
                 * is changed to CHECKSUM_UNNECESSARY. Reset it in this
                 * specific case, where PARTIAL is both correct and required.
                 */
                if (skb->pkt_type == PACKET_LOOPBACK)
                        skb->ip_summed = CHECKSUM_PARTIAL;

                /* the GSO CB lays after the UDP one, no need to save and
                 * restore any CB fragment
                 */
                segs = __skb_gso_segment(skb, features, false);
                if (!IS_ERR_OR_NULL(segs)) {
                        consume_skb(skb);
                        return segs;
                }

                /* Segmentation failed: every segment counts as a drop. */
                drop_count = skb_shinfo(skb)->gso_segs;
        }

        sk_drops_add(sk, drop_count);
        SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count);
        kfree_skb(skb);
        return NULL;
}

/*
 * Fix up the UDP control block and checksum state of a segment produced by
 * udp_rcv_segment()-style segmentation.
 */
static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
{
        /* No GRO for UDP-lite, so partial coverage must never show up here. */
        WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov);

        /* The checksum coverage of the segment is its whole length. */
        UDP_SKB_CB(skb)->cscov = skb->len;

        /*
         * A packet sent with UDP_SEGMENT that goes through
         *
         *   UDP tunnel (xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx)
         *
         * may arrive at the socket as CHECKSUM_NONE, since
         * __iptunnel_pull_header() turns CHECKSUM_PARTIAL into NONE.
         * Without a UDP tunnel, SKB_GSO_UDP_L4 and SKB_GSO_FRAGLIST packets
         * already carry a valid checksum: GRO verified it before aggregating
         * and nothing removes that information afterwards.  Rather than add
         * one more test to the tunnel fast path, mark the checksum as valid
         * here, once segmentation is done.
         */
        if (skb->ip_summed != CHECKSUM_NONE || skb->csum_valid)
                return;

        skb->csum_valid = 1;
}

#ifdef CONFIG_BPF_SYSCALL
struct sk_psock;
int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
#endif

#endif        /* _UDP_H */




















































  319 


















































































































    7 

















































































  319 





  317 


    7 
  315 
  319 
    7 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  319 
  319 

  320 
  315 





  320 
  319 






  315 







  317 















    7 

    7 

    7 





    7 



    7 


    6 












  315 




  310 
    1 


  312 


  311 


  312 










    7 
  320 















    7 
    7 

    7 



  314 


    7 

  311 




    7 
    6 































  317 
  319 





  309 


  309 


  313 
  311 










































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
#include <linux/iov_iter.h>

/*
 * Step function: copy @len bytes from the kernel buffer @from (offset by
 * @progress) to the user address @iter_to.  Returns the residue reported by
 * raw_copy_to_user(), or @len if the copy was never attempted.
 */
static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        /* Injected failure or an inaccessible range: nothing gets copied. */
        if (should_fail_usercopy() || !access_ok(iter_to, len))
                return len;

        from += progress;
        instrument_copy_to_user(iter_to, from, len);
        return raw_copy_to_user(iter_to, from, len);
}

/*
 * Step function: like copy_to_user_iter() but built on
 * copy_to_user_nofault().  A negative result from that helper is reported
 * as "all @len bytes left uncopied".
 */
static __always_inline
size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
                                 size_t len, void *from, void *priv2)
{
        ssize_t rc;

        if (should_fail_usercopy())
                return len;

        rc = copy_to_user_nofault(iter_to, from + progress, len);
        if (rc < 0)
                return len;
        return rc;
}

/*
 * Step function: copy @len bytes from the user address @iter_from into the
 * kernel buffer @to (offset by @progress).  Returns the residue reported by
 * raw_copy_from_user(), or @len if the copy was never attempted.
 */
static __always_inline
size_t copy_from_user_iter(void __user *iter_from, size_t progress,
                           size_t len, void *to, void *priv2)
{
        size_t left;

        /* Injected failure or an inaccessible range: nothing gets copied. */
        if (should_fail_usercopy() || !access_ok(iter_from, len))
                return len;

        to += progress;
        instrument_copy_from_user_before(to, iter_from, len);
        left = raw_copy_from_user(to, iter_from, len);
        instrument_copy_from_user_after(to, iter_from, len, left);
        return left;
}

/*
 * Step function for kernel-memory segments: copy @len bytes from
 * @from + @progress to @iter_to.  Always reports zero bytes left over.
 */
static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
                      size_t len, void *from, void *priv2)
{
        const void *src = from + progress;

        memcpy(iter_to, src, len);
        return 0;
}

/*
 * Step function for kernel-memory segments: copy @len bytes from
 * @iter_from to @to + @progress.  Always reports zero bytes left over.
 */
static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
                        size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        memcpy(dst, iter_from, len);
        return 0;
}

/*
 * fault_in_iov_iter_readable - make the memory behind an iterator readable
 * @i: iterator
 * @size: upper bound on the number of bytes to fault in
 *
 * Walks the user memory described by @i, up to @size bytes, and calls
 * fault_in_readable() on each piece.  The walk stops at the first piece
 * that could not be faulted in completely.
 *
 * Returns the number of bytes that were NOT faulted in, following the
 * copy_to_user() / copy_from_user() convention.
 *
 * Iterators that are not backed by user memory always yield 0.
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
        if (iter_is_ubuf(i)) {
                size_t want = min(size, iov_iter_count(i));
                size_t missed = fault_in_readable(i->ubuf + i->iov_offset, want);

                return size - (want - missed);
        }

        if (iter_is_iovec(i)) {
                size_t remaining = min(size, iov_iter_count(i));
                /* Part of @size that lies beyond the end of the iterator. */
                size_t beyond = size - remaining;
                const struct iovec *seg = iter_iov(i);
                size_t skip = i->iov_offset;

                while (remaining) {
                        size_t len = min(remaining, seg->iov_len - skip);

                        /* Empty segments are simply stepped over. */
                        if (likely(len)) {
                                size_t missed = fault_in_readable(seg->iov_base + skip, len);

                                remaining -= len - missed;
                                if (missed)
                                        break;
                        }
                        seg++;
                        skip = 0;
                }
                return remaining + beyond;
        }

        return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);

/*
 * fault_in_iov_iter_writeable - make the memory behind an iterator writeable
 * @i: iterator
 * @size: upper bound on the number of bytes to fault in
 *
 * The pages are faulted in through get_user_pages(), so no hardware page
 * fault is taken.  That is mostly of interest when it is already known that
 * some or all of the pages behind @i are not resident.
 *
 * Returns the number of bytes that were NOT faulted in, following the
 * copy_to_user() / copy_from_user() convention.
 *
 * Iterators that are not backed by user memory always yield 0.
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
        if (iter_is_ubuf(i)) {
                size_t want = min(size, iov_iter_count(i));
                size_t missed = fault_in_safe_writeable(i->ubuf + i->iov_offset, want);

                return size - (want - missed);
        }

        if (iter_is_iovec(i)) {
                size_t remaining = min(size, iov_iter_count(i));
                /* Part of @size that lies beyond the end of the iterator. */
                size_t beyond = size - remaining;
                const struct iovec *seg = iter_iov(i);
                size_t skip = i->iov_offset;

                while (remaining) {
                        size_t len = min(remaining, seg->iov_len - skip);

                        /* Empty segments are simply stepped over. */
                        if (likely(len)) {
                                size_t missed = fault_in_safe_writeable(seg->iov_base + skip, len);

                                remaining -= len - missed;
                                if (missed)
                                        break;
                        }
                        seg++;
                        skip = 0;
                }
                return remaining + beyond;
        }

        return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

/*
 * Set up @i as an ITER_IOVEC iterator over the @nr_segs segments of @iov,
 * spanning @count bytes and positioned at offset zero.  A @direction with
 * bits outside READ | WRITE triggers a warning.
 */
void iov_iter_init(struct iov_iter *i, unsigned int direction,
                        const struct iovec *iov, unsigned long nr_segs,
                        size_t count)
{
        const unsigned int known_dirs = READ | WRITE;

        WARN_ON(direction & ~known_dirs);

        *i = (struct iov_iter) {
                .iter_type   = ITER_IOVEC,
                .nofault     = false,
                .data_source = direction,
                .__iov       = iov,
                .nr_segs     = nr_segs,
                .iov_offset  = 0,
                .count       = count
        };
}
EXPORT_SYMBOL(iov_iter_init);

/*
 * Copy @bytes from the kernel buffer @addr into the iterator @i, advancing
 * it.  Warns once and copies nothing if @i is marked as a data source.
 * Returns the value produced by iterate_and_advance().
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        size_t copied;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* User-backed destinations may take a page fault while copying. */
        if (user_backed_iter(i))
                might_fault();

        copied = iterate_and_advance(i, bytes, (void *)addr,
                                     copy_to_user_iter, memcpy_to_iter);
        return copied;
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
/*
 * Step function: copy to user memory through copy_mc_to_user().  Returns
 * that helper's result, or @len if the destination fails access_ok().
 */
static __always_inline
size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
                            size_t len, void *from, void *priv2)
{
        if (!access_ok(iter_to, len))
                return len;

        from += progress;
        instrument_copy_to_user(iter_to, from, len);
        return copy_mc_to_user(iter_to, from, len);
}

/*
 * Step function for kernel-memory segments: copy through
 * copy_mc_to_kernel() and pass its result straight back.
 */
static __always_inline
size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        size_t left = copy_mc_to_kernel(iter_to, from + progress, len);

        return left;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * Used by the pmem driver for the dax operation (dax_copy_to_iter()) on
 * dax reads, which bypass the page cache and the block layer. When a #MC
 * is hit, read(2) aborts and returns either EIO or the number of bytes
 * copied successfully.
 *
 * How this differs from the regular _copy_to_iter():
 *
 * * The usual tail/residue handling after a fault retries the copy one
 *   byte at a time until it faults again. Provoking a machine check a
 *   second time may be fatal, so this implementation relies on source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * Short copies are possible for ITER_KVEC and ITER_BVEC, whereas with
 *   copy_to_iter() only ITER_IOVEC attempts might come up short.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        size_t copied;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* User-backed destinations may take a page fault while copying. */
        if (user_backed_iter(i))
                might_fault();

        copied = iterate_and_advance(i, bytes, (void *)addr,
                                     copy_to_user_iter_mc, memcpy_to_iter_mc);
        return copied;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

/*
 * Unchecked core of the copy-from-iterator path: performs no direction
 * check and no might_fault() annotation; callers handle those themselves.
 */
static __always_inline
size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t copied = iterate_and_advance(i, bytes, addr,
                                            copy_from_user_iter,
                                            memcpy_from_iter);

        return copied;
}

/*
 * Copy @bytes out of the iterator @i into the kernel buffer @addr,
 * advancing it.  Warns once and copies nothing unless @i is marked as a
 * data source.
 */
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        const bool is_source = i->data_source;

        if (WARN_ON_ONCE(!is_source))
                return 0;

        /* User-backed sources may take a page fault while copying. */
        if (user_backed_iter(i))
                might_fault();

        return __copy_from_iter(addr, bytes, i);
}
EXPORT_SYMBOL(_copy_from_iter);

/*
 * Step function: copy from user memory into @to + @progress through
 * __copy_from_user_inatomic_nocache() and pass its result straight back.
 */
static __always_inline
size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
                                   size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        return __copy_from_user_inatomic_nocache(dst, iter_from, len);
}

/*
 * Variant of _copy_from_iter() that uses the nocache step function for
 * user-backed segments and a plain memcpy for everything else.  Warns once
 * and copies nothing unless @i is marked as a data source.
 */
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t copied;

        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        copied = iterate_and_advance(i, bytes, addr,
                                     copy_from_user_iter_nocache,
                                     memcpy_from_iter);
        return copied;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * Step function: copy from user memory into @to + @progress through
 * __copy_from_user_flushcache() and pass its result straight back.
 */
static __always_inline
size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
                                      size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        return __copy_from_user_flushcache(dst, iter_from, len);
}

/*
 * Step function for kernel-memory segments: copy into @to + @progress with
 * memcpy_flushcache().  Always reports zero bytes left over.
 */
static __always_inline
size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
                                   size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        memcpy_flushcache(dst, iter_from, len);
        return 0;
}

/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver makes filesystem-dax use this, through
 * dax_copy_from_iter(), so that writes to persistent memory are flushed
 * through the CPU cache. Unlike _copy_from_iter_nocache(), it guarantees
 * that all data is flushed for every iterator type;
 * _copy_from_iter_nocache() only tries to bypass the cache in the
 * ITER_IOVEC case, and on some archs may use instructions that strand
 * dirty-data in the cache.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t copied;

        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        copied = iterate_and_advance(i, bytes, addr,
                                     copy_from_user_iter_flushcache,
                                     memcpy_from_iter_flushcache);
        return copied;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

/*
 * Check that the range [@offset, @offset + @n) stays inside the (possibly
 * compound) page that @page belongs to.  Warns and returns false otherwise.
 */
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
        size_t end = n + offset;
        struct page *head;

        /*
         * Working out the real page size means reading the page order,
         * which may cost a cache line miss.  Most pages are order-0, so a
         * request that fits within PAGE_SIZE (and did not wrap around) is
         * fine for every page order and can be accepted right away.
         */
        if (end >= n && end <= PAGE_SIZE)
                return true;

        /* Make the end offset relative to the head of the compound page. */
        head = compound_head(page);
        end += (page - head) << PAGE_SHIFT;

        return !WARN_ON(n > end || end > page_size(head));
}

/*
 * Copy @bytes starting at @offset within @page into the iterator @i, one
 * subpage-sized piece at a time.  Returns the number of bytes copied, which
 * is short if _copy_to_iter() stops making progress.
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Step to the subpage that holds @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                void *kaddr = kmap_local_page(page);
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);

                chunk = _copy_to_iter(kaddr + offset, chunk, i);
                kunmap_local(kaddr);

                total += chunk;
                bytes -= chunk;
                /* Finished, or the iterator took nothing this round. */
                if (!chunk || !bytes)
                        break;

                offset += chunk;
                if (offset == PAGE_SIZE) {
                        offset = 0;
                        page++;
                }
        }
        return total;
}
EXPORT_SYMBOL(copy_page_to_iter);

/*
 * Same walk as copy_page_to_iter(), but user-backed segments are written
 * with copy_to_user_iter_nofault().  Returns the number of bytes copied.
 */
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
                                 struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Step to the subpage that holds @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                void *kaddr = kmap_local_page(page);
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);

                chunk = iterate_and_advance(i, chunk, kaddr + offset,
                                            copy_to_user_iter_nofault,
                                            memcpy_to_iter);
                kunmap_local(kaddr);

                total += chunk;
                bytes -= chunk;
                /* Finished, or the iterator took nothing this round. */
                if (!chunk || !bytes)
                        break;

                offset += chunk;
                if (offset == PAGE_SIZE) {
                        offset = 0;
                        page++;
                }
        }
        return total;
}
EXPORT_SYMBOL(copy_page_to_iter_nofault);

/*
 * Fill @bytes starting at @offset within @page from the iterator @i, one
 * subpage-sized piece at a time.  Returns the number of bytes copied, which
 * is short if _copy_from_iter() stops making progress.
 */
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;

        /* Step to the subpage that holds @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                void *kaddr = kmap_local_page(page);
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);

                chunk = _copy_from_iter(kaddr + offset, chunk, i);
                kunmap_local(kaddr);

                total += chunk;
                bytes -= chunk;
                /* Finished, or the iterator gave nothing this round. */
                if (!chunk || !bytes)
                        break;

                offset += chunk;
                if (offset == PAGE_SIZE) {
                        offset = 0;
                        page++;
                }
        }
        return total;
}
EXPORT_SYMBOL(copy_page_from_iter);

/*
 * Step function: zero @len bytes of user memory at @iter_to with
 * clear_user() and pass its result straight back.
 */
static __always_inline
size_t zero_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *priv, void *priv2)
{
        size_t left = clear_user(iter_to, len);

        return left;
}

/*
 * Step function for kernel-memory segments: zero @len bytes at @iter_to.
 * Always reports zero bytes left over.
 */
static __always_inline
size_t zero_to_iter(void *iter_to, size_t progress,
                    size_t len, void *priv, void *priv2)
{
        const int fill = 0;

        memset(iter_to, fill, len);
        return 0;
}

/*
 * Write @bytes of zeroes into the iterator @i, advancing it.  Returns the
 * value produced by iterate_and_advance().
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
        size_t zeroed = iterate_and_advance(i, bytes, NULL,
                                            zero_to_user_iter, zero_to_iter);

        return zeroed;
}
EXPORT_SYMBOL(iov_iter_zero);

/*
 * Copy up to @bytes from the iterator @i into @folio starting at @offset,
 * with page faults disabled around each copy step.
 *
 * Returns the number of bytes actually copied; this is short when a step
 * copies nothing.  Returns 0 if the range fails page_copy_sane() or if @i
 * is not marked as a data source (the latter also warns once).
 */
size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset,
                size_t bytes, struct iov_iter *i)
{
        size_t n, copied = 0;

        if (!page_copy_sane(&folio->page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        do {
                char *to = kmap_local_folio(folio, offset);

                n = bytes - copied;
                /*
                 * When folio_test_partial_kmap() is true, cap this step at
                 * the end of the page that contains @offset.
                 */
                if (folio_test_partial_kmap(folio) &&
                    n > PAGE_SIZE - offset_in_page(offset))
                        n = PAGE_SIZE - offset_in_page(offset);

                /* The copy itself runs with page faults disabled. */
                pagefault_disable();
                n = __copy_from_iter(to, n, i);
                pagefault_enable();
                kunmap_local(to);
                copied += n;
                offset += n;
        } while (copied != bytes && n > 0); /* stop when done or when no progress was made */

        return copied;
}
EXPORT_SYMBOL(copy_folio_from_iter_atomic);

/*
 * Advance a bvec iterator by @size bytes.  Does nothing when the iterator
 * is already empty.
 */
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
        const struct bio_vec *cur, *limit;

        if (!i->count)
                return;
        i->count -= size;

        /* Measure the distance from the start of the current segment. */
        size += i->iov_offset;

        /* Skip every segment that is consumed completely. */
        cur = i->bvec;
        limit = cur + i->nr_segs;
        while (cur < limit && unlikely(size >= cur->bv_len)) {
                size -= cur->bv_len;
                cur++;
        }

        i->nr_segs -= cur - i->bvec;
        i->bvec = cur;
        i->iov_offset = size;
}

/*
 * Advance an iovec iterator by @size bytes.  Does nothing when the iterator
 * is already empty.
 */
static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
        const struct iovec *cur, *limit;

        if (!i->count)
                return;
        i->count -= size;

        /* Measure the distance from the start of the current segment. */
        size += i->iov_offset;

        /* Skip every segment that is consumed completely. */
        cur = iter_iov(i);
        limit = cur + i->nr_segs;
        while (cur < limit && unlikely(size >= cur->iov_len)) {
                size -= cur->iov_len;
                cur++;
        }

        i->nr_segs -= cur - iter_iov(i);
        i->__iov = cur;
        i->iov_offset = size;
}

/*
 * Advance a folio-queue iterator by @size bytes.
 *
 * Does nothing when the iterator is already empty.  Otherwise @size is taken
 * off i->count and the saved (folioq, slot, offset) position is walked
 * forward one folio at a time.
 */
static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
{
        const struct folio_queue *folioq = i->folioq;
        unsigned int slot = i->folioq_slot;

        if (!i->count)
                return;
        i->count -= size;

        /* The saved slot may be past the last slot of this queue node. */
        if (slot >= folioq_nr_slots(folioq)) {
                folioq = folioq->next;
                slot = 0;
        }

        size += i->iov_offset; /* From beginning of current segment. */
        do {
                size_t fsize = folioq_folio_size(folioq, slot);

                /* The target lies inside this folio: stop here. */
                if (likely(size < fsize))
                        break;
                size -= fsize;
                slot++;
                /* Move to the next node only if one exists. */
                if (slot >= folioq_nr_slots(folioq) && folioq->next) {
                        folioq = folioq->next;
                        slot = 0;
                }
        } while (size);

        /* Store the new position back into the iterator. */
        i->iov_offset = size;
        i->folioq_slot = slot;
        i->folioq = folioq;
}

/*
 * Advance the iterator @i by @size bytes, clamped to the bytes it still
 * holds, dispatching on the iterator type.
 */
void iov_iter_advance(struct iov_iter *i, size_t size)
{
        /* Never step past the end of the iterator. */
        if (unlikely(size > i->count))
                size = i->count;

        /* ubuf and xarray iterators only track an offset and a count. */
        if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
                i->iov_offset += size;
                i->count -= size;
                return;
        }

        /* iovec and kvec have identical layouts */
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
                iov_iter_iovec_advance(i, size);
                return;
        }

        if (iov_iter_is_bvec(i)) {
                iov_iter_bvec_advance(i, size);
                return;
        }

        if (iov_iter_is_folioq(i)) {
                iov_iter_folioq_advance(i, size);
                return;
        }

        if (iov_iter_is_discard(i))
                i->count -= size;
}
EXPORT_SYMBOL(iov_iter_advance);

/*
 * Walk a folio-queue iterator backwards by @unroll bytes, starting from the
 * beginning of the current slot, and store the resulting position in @i.
 * i->count is not touched here.
 */
static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
{
        const struct folio_queue *folioq = i->folioq;
        unsigned int slot = i->folioq_slot;

        for (;;) {
                size_t fsize;

                /* At the first slot of a node: continue from the end of the previous node. */
                if (slot == 0) {
                        folioq = folioq->prev;
                        slot = folioq_nr_slots(folioq);
                }
                slot--;

                fsize = folioq_folio_size(folioq, slot);
                /* The target lies within this folio: compute the offset and stop. */
                if (unroll <= fsize) {
                        i->iov_offset = fsize - unroll;
                        break;
                }
                unroll -= fsize;
        }

        i->folioq_slot = slot;
        i->folioq = folioq;
}

/*
 * Wind the iterator back by @unroll bytes.  A zero @unroll is a no-op and an
 * @unroll above MAX_RW_COUNT triggers a warning and is ignored.  The byte
 * count is restored first; the position is then moved back, crossing into
 * earlier segments where the iterator type has them.
 */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
        if (!unroll)
                return;
        if (WARN_ON(unroll > MAX_RW_COUNT))
                return;

        i->count += unroll;
        if (unlikely(iov_iter_is_discard(i)))
                return;

        /* Cheap case: the rewind stays inside the current segment. */
        if (i->iov_offset >= unroll) {
                i->iov_offset -= unroll;
                return;
        }
        unroll -= i->iov_offset;

        if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
                /*
                 * Going back past the start of the specified range must
                 * never happen: we might then be straying into pages that
                 * aren't pinned.
                 */
                BUG();
        } else if (iov_iter_is_bvec(i)) {
                const struct bio_vec *seg = i->bvec;

                for (;;) {
                        size_t seg_len;

                        seg--;
                        seg_len = seg->bv_len;
                        i->nr_segs++;
                        if (seg_len >= unroll) {
                                i->bvec = seg;
                                i->iov_offset = seg_len - unroll;
                                return;
                        }
                        unroll -= seg_len;
                }
        } else if (iov_iter_is_folioq(i)) {
                i->iov_offset = 0;
                iov_iter_folioq_revert(i, unroll);
        } else {
                /* iovec and kvec are walked the same way */
                const struct iovec *seg = iter_iov(i);

                for (;;) {
                        size_t seg_len;

                        seg--;
                        seg_len = seg->iov_len;
                        i->nr_segs++;
                        if (seg_len >= unroll) {
                                i->__iov = seg;
                                i->iov_offset = seg_len - unroll;
                                return;
                        }
                        unroll -= seg_len;
                }
        }
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Report how many bytes of the iterator lie in its current segment.
 *
 * For multi-segment iovec/kvec/bvec iterators this is what is left of the
 * current segment, capped at the iterator's byte count.  For a folio queue
 * it is the size of the current folio capped at the byte count (zero when
 * the iterator is empty).  Everything else reports the full byte count.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
        if (i->nr_segs > 1) {
                if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
                        size_t seg_left = iter_iov(i)->iov_len - i->iov_offset;

                        return min(i->count, seg_left);
                }
                if (iov_iter_is_bvec(i)) {
                        size_t seg_left = i->bvec->bv_len - i->iov_offset;

                        return min(i->count, seg_left);
                }
        }

        if (likely(!iov_iter_is_folioq(i)))
                return i->count;
        if (!i->count)
                return 0;
        return umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count);
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

/*
 * Set up @i as an ITER_KVEC iterator over the @nr_segs kernel vectors at
 * @kvec, covering @count bytes and starting at offset zero.  Warns if
 * @direction carries bits other than READ/WRITE.
 */
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
                        const struct kvec *kvec, unsigned long nr_segs,
                        size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_KVEC,
                .data_source = direction,
                .iov_offset = 0,
                .kvec = kvec,
                .count = count,
                .nr_segs = nr_segs,
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_kvec);

/*
 * Set up @i as an ITER_BVEC iterator over the @nr_segs bio_vecs at @bvec,
 * covering @count bytes and starting at offset zero.  Warns if @direction
 * carries bits other than READ/WRITE.
 */
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
                        const struct bio_vec *bvec, unsigned long nr_segs,
                        size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_BVEC,
                .data_source = direction,
                .iov_offset = 0,
                .bvec = bvec,
                .count = count,
                .nr_segs = nr_segs,
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_bvec);

/**
 * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @folioq: The starting point in the folio queue.
 * @first_slot: The first slot in the folio queue to use
 * @offset: The offset into the folio in the first slot to start at
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that either draws data out of the folios held in
 * the queue or injects data into them.  The pages *must* be prevented from
 * evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
                          const struct folio_queue *folioq, unsigned int first_slot,
                          unsigned int offset, size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_FOLIOQ,
                .data_source = direction,
                .iov_offset = offset,
                .folioq = folioq,
                .count = count,
                .folioq_slot = first_slot,
        };

        /* Only bit 0 of the direction may be set. */
        BUG_ON(direction & ~1);
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_folio_queue);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
                     struct xarray *xarray, loff_t start, size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_XARRAY,
                .data_source = direction,
                .iov_offset = 0,
                .xarray = xarray,
                .count = count,
                .xarray_start = start,
        };

        /* Only bit 0 of the direction may be set. */
        BUG_ON(direction & ~1);
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that simply throws away whatever is written to it.
 * Only READ is accepted as @direction; anything else is a BUG.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_DISCARD,
                .data_source = false,
                .iov_offset = 0,
                .count = count,
        };

        BUG_ON(direction != READ);
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_discard);

/*
 * OR together the start address and the length of every non-empty segment
 * that contributes to the iterator's remaining bytes.  The low bits of the
 * result show the coarsest alignment all of them share.
 */
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
        const struct iovec *seg = iter_iov(i);
        size_t left = i->count;
        size_t off = i->iov_offset;
        unsigned long mask = 0;

        do {
                size_t chunk = seg->iov_len - off;

                if (chunk) {
                        if (chunk > left)
                                chunk = left;
                        mask |= ((unsigned long)seg->iov_base + off) | chunk;
                        left -= chunk;
                }
                /* Only the first segment starts part-way in. */
                off = 0;
                seg++;
        } while (left);

        return mask;
}

/*
 * OR together the in-page offset and the length of every bio_vec segment
 * that contributes to the iterator's remaining bytes.
 */
static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
        const struct bio_vec *seg = i->bvec;
        size_t left = i->count;
        unsigned int off = i->iov_offset;
        unsigned int mask = 0;

        do {
                size_t chunk = seg->bv_len - off;

                if (chunk > left)
                        chunk = left;
                mask |= ((unsigned long)seg->bv_offset + off) | chunk;
                left -= chunk;
                /* Only the first segment starts part-way in. */
                off = 0;
                seg++;
        } while (left);

        return mask;
}

/*
 * Return a mask whose low bits describe the alignment of the iterator's
 * remaining data: the OR of the relevant start position(s) and length(s).
 * Iterator types not handled here yield 0.
 */
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
        unsigned long mask = 0;

        if (likely(iter_is_ubuf(i))) {
                /* An empty user buffer imposes no alignment at all. */
                if (i->count)
                        mask = ((unsigned long)i->ubuf + i->iov_offset) | i->count;
        } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
                /* kvec shares the iovec layout */
                mask = iov_iter_alignment_iovec(i);
        } else if (iov_iter_is_bvec(i)) {
                mask = iov_iter_alignment_bvec(i);
        } else if (iov_iter_is_folioq(i)) {
                /* folioq and xarray both deal in whole folios */
                mask = i->iov_offset | i->count;
        } else if (iov_iter_is_xarray(i)) {
                mask = (i->xarray_start + i->iov_offset) | i->count;
        }

        return mask;
}
EXPORT_SYMBOL(iov_iter_alignment);

unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
        unsigned long res = 0;
        unsigned long v = 0;
        size_t size = i->count;
        unsigned k;

        if (iter_is_ubuf(i))
                return 0;

        if (WARN_ON(!iter_is_iovec(i)))
                return ~0U;

        for (k = 0; k < i->nr_segs; k++) {
                const struct iovec *iov = iter_iov(i) + k;
                if (iov->iov_len) {
                        unsigned long base = (unsigned long)iov->iov_base;
                        if (v) // if not the first one
                                res |= base | v; // this start | previous end
                        v = base + iov->iov_len;
                        if (size <= iov->iov_len)
                                break;
                        size -= iov->iov_len;
                }
        }
        return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

/*
 * Work out how many page slots are needed to cover @size bytes starting
 * @start bytes into the first page, capped at @maxpages, and make sure *@res
 * points at an array to hold them.  If *@res is NULL an array of that many
 * entries is allocated with kvmalloc_array().
 *
 * Returns the slot count, or 0 if the allocation failed.
 */
static int want_pages_array(struct page ***res, size_t size,
                            size_t start, unsigned int maxpages)
{
        unsigned int nr = DIV_ROUND_UP(size + start, PAGE_SIZE);
        struct page **array;

        if (nr > maxpages)
                nr = maxpages;
        WARN_ON(!nr);        // caller should've prevented that

        if (*res)
                return nr;

        array = kvmalloc_array(nr, sizeof(struct page *), GFP_KERNEL);
        if (!array)
                return 0;
        *res = array;
        return nr;
}

/*
 * Collect pages from an ITER_FOLIOQ iterator, taking a reference on each with
 * get_page(), and advance the iterator past the bytes covered.
 *
 * @iter:          iterator to draw from; count, iov_offset, folioq and
 *                 folioq_slot are updated before returning successfully
 * @ppages:        in/out page array pointer, handed to want_pages_array()
 * @maxsize:       upper bound on the number of bytes to cover
 * @maxpages:      upper bound on the number of page pointers to store
 * @_start_offset: receives the offset into the first page
 *
 * Returns the number of bytes covered, -EIO if the iterator sits past the end
 * of a queue segment with a non-zero offset, or -ENOMEM if
 * want_pages_array() returns 0.
 */
static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
                                     struct page ***ppages, size_t maxsize,
                                     unsigned maxpages, size_t *_start_offset)
{
        const struct folio_queue *folioq = iter->folioq;
        struct page **pages;
        unsigned int slot = iter->folioq_slot;
        size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset;

        /* Iterator parked past the last slot: begin at the next segment. */
        if (slot >= folioq_nr_slots(folioq)) {
                folioq = folioq->next;
                slot = 0;
                if (WARN_ON(iov_offset != 0))
                        return -EIO;
        }

        maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages);
        if (!maxpages)
                return -ENOMEM;
        *_start_offset = iov_offset & ~PAGE_MASK;
        pages = *ppages;

        /* Each pass stores at most one page pointer. */
        for (;;) {
                struct folio *folio = folioq_folio(folioq, slot);
                size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot);
                /* bytes from @offset to the end of the page containing it */
                size_t part = PAGE_SIZE - offset % PAGE_SIZE;

                if (offset < fsize) {
                        /* Clamp to what is still wanted and to the folio's end. */
                        part = umin(part, umin(maxsize - extracted, fsize - offset));
                        count -= part;
                        iov_offset += part;
                        extracted += part;

                        *pages = folio_page(folio, offset / PAGE_SIZE);
                        get_page(*pages);
                        pages++;
                        maxpages--;
                }

                if (maxpages == 0 || extracted >= maxsize)
                        break;

                /* Folio used up: move to the next slot, or the next segment. */
                if (iov_offset >= fsize) {
                        iov_offset = 0;
                        slot++;
                        if (slot == folioq_nr_slots(folioq) && folioq->next) {
                                folioq = folioq->next;
                                slot = 0;
                        }
                }
        }

        /* Publish the new position only now that the walk is complete. */
        iter->count = count;
        iter->iov_offset = iov_offset;
        iter->folioq = folioq;
        iter->folioq_slot = slot;
        return extracted;
}

/*
 * Fill @pages with up to @nr_pages page pointers taken from consecutive
 * entries of @xa starting at @index, taking a folio reference for each one.
 * The walk runs under rcu_read_lock() and stops at the first NULL entry.
 *
 * Returns the number of entries stored in @pages.
 */
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
                                          pgoff_t index, unsigned int nr_pages)
{
        XA_STATE(xas, xa, index);
        struct folio *folio;
        unsigned int ret = 0;

        rcu_read_lock();
        for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
                /* Skip entries for which xas_retry() asks for a retry. */
                if (xas_retry(&xas, folio))
                        continue;

                /* Has the folio moved or been split? */
                if (unlikely(folio != xas_reload(&xas))) {
                        xas_reset(&xas);
                        continue;
                }

                /* Store the page of the folio that backs the current index. */
                pages[ret] = folio_file_page(folio, xas.xa_index);
                folio_get(folio);
                if (++ret == nr_pages)
                        break;
        }
        rcu_read_unlock();
        return ret;
}

/*
 * Collect referenced pages from an ITER_XARRAY iterator and advance it past
 * the bytes they cover.
 *
 * *@_start_offset receives the offset into the first page.  Returns the
 * number of bytes covered, 0 if no pages were found, or -ENOMEM if
 * want_pages_array() returns 0.
 */
static ssize_t iter_xarray_get_pages(struct iov_iter *i,
                                     struct page ***pages, size_t maxsize,
                                     unsigned maxpages, size_t *_start_offset)
{
        loff_t pos = i->xarray_start + i->iov_offset;
        pgoff_t first = pos >> PAGE_SHIFT;
        unsigned int in_page = pos & ~PAGE_MASK;
        unsigned int want, got;
        size_t bytes;

        *_start_offset = in_page;

        want = want_pages_array(pages, maxsize, in_page, maxpages);
        if (!want)
                return -ENOMEM;

        got = iter_xarray_populate_pages(*pages, i->xarray, first, want);
        if (!got)
                return 0;

        /* The pages found may cover less than was asked for. */
        bytes = min_t(size_t, got * PAGE_SIZE - in_page, maxsize);
        i->count -= bytes;
        i->iov_offset += bytes;
        return bytes;
}

/*
 * Return the user address at which the iterator's data currently starts.
 * For ITER_IOVEC, *@size is also trimmed so it does not run past the end of
 * the first non-empty segment.
 *
 * Must only be called on a non-empty ITER_UBUF or ITER_IOVEC iterator.
 */
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
{
        size_t skip = i->iov_offset;
        unsigned long k;

        if (iter_is_ubuf(i))
                return (unsigned long)i->ubuf + i->iov_offset;

        for (k = 0; k < i->nr_segs; k++) {
                const struct iovec *seg = &iter_iov(i)[k];
                size_t avail = seg->iov_len - skip;

                if (likely(avail)) {
                        if (avail < *size)
                                *size = avail;
                        return (unsigned long)seg->iov_base + skip;
                }
                /* Only the first segment is entered part-way in. */
                skip = 0;
        }
        BUG(); // if it had been empty, we wouldn't get called
}

/*
 * Locate the page at which the current bio_vec segment's remaining data
 * starts.  *@size is trimmed to what is left of that segment and *@start
 * receives the offset within the returned page.
 *
 * Must only be called on a non-empty ITER_BVEC iterator.
 */
static struct page *first_bvec_segment(const struct iov_iter *i,
                                       size_t *size, size_t *start)
{
        const struct bio_vec *seg = i->bvec;
        size_t avail = seg->bv_len - i->iov_offset;
        size_t pos = i->iov_offset + seg->bv_offset;

        if (avail < *size)
                *size = avail;
        *start = pos % PAGE_SIZE;
        return seg->bv_page + pos / PAGE_SIZE;
}

/*
 * Common worker behind iov_iter_get_pages2() and iov_iter_get_pages_alloc2().
 *
 * Obtains references on the pages backing the start of the iterator, stores
 * them in *@pages (allocated by want_pages_array() if *@pages is NULL) and
 * advances the iterator past the bytes covered.
 *
 * @i:        iterator to draw from
 * @pages:    in/out pointer to the page array
 * @maxsize:  upper bound on bytes; clamped to i->count and MAX_RW_COUNT
 * @maxpages: upper bound on the number of pages
 * @start:    receives the offset into the first page
 *
 * Returns the number of bytes covered, 0 if there is nothing to do, or a
 * negative errno: -ENOMEM if want_pages_array() returns 0, -EFAULT for an
 * iterator type not handled here, or whatever the per-type helper reports.
 */
static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
                   struct page ***pages, size_t maxsize,
                   unsigned int maxpages, size_t *start)
{
        unsigned int n, gup_flags = 0;

        if (maxsize > i->count)
                maxsize = i->count;
        if (!maxsize)
                return 0;
        if (maxsize > MAX_RW_COUNT)
                maxsize = MAX_RW_COUNT;

        if (likely(user_backed_iter(i))) {
                unsigned long addr;
                int res;

                /* Ask for write access unless the iterator's direction is WRITE. */
                if (iov_iter_rw(i) != WRITE)
                        gup_flags |= FOLL_WRITE;
                if (i->nofault)
                        gup_flags |= FOLL_NOFAULT;

                /* first_iovec_segment() may shrink maxsize to the first segment. */
                addr = first_iovec_segment(i, &maxsize);
                *start = addr % PAGE_SIZE;
                addr &= PAGE_MASK;
                n = want_pages_array(pages, maxsize, *start, maxpages);
                if (!n)
                        return -ENOMEM;
                res = get_user_pages_fast(addr, n, gup_flags, *pages);
                if (unlikely(res <= 0))
                        return res;
                /* Fewer pages than requested may have been obtained. */
                maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
                iov_iter_advance(i, maxsize);
                return maxsize;
        }
        if (iov_iter_is_bvec(i)) {
                struct page **p;
                struct page *page;

                /* Only the current bio_vec segment is considered. */
                page = first_bvec_segment(i, &maxsize, start);
                n = want_pages_array(pages, maxsize, *start, maxpages);
                if (!n)
                        return -ENOMEM;
                p = *pages;
                for (int k = 0; k < n; k++) {
                        struct folio *folio = page_folio(page + k);
                        p[k] = page + k;
                        /* No reference is taken on slab folios. */
                        if (!folio_test_slab(folio))
                                folio_get(folio);
                }
                maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
                i->count -= maxsize;
                i->iov_offset += maxsize;
                /* Segment fully consumed: step on to the next bio_vec. */
                if (i->iov_offset == i->bvec->bv_len) {
                        i->iov_offset = 0;
                        i->bvec++;
                        i->nr_segs--;
                }
                return maxsize;
        }
        if (iov_iter_is_folioq(i))
                return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);
        if (iov_iter_is_xarray(i))
                return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
        return -EFAULT;
}

/*
 * Fill the caller-supplied @pages array (at most @maxpages entries) with
 * pages backing the start of @i and advance the iterator past the bytes
 * covered.  *@start receives the offset into the first page.
 *
 * Returns the number of bytes covered, 0 when @maxpages is 0, or a negative
 * errno.  Passing a NULL @pages with a non-zero @maxpages is a BUG.
 */
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
                size_t maxsize, unsigned maxpages, size_t *start)
{
        struct page **array = pages;

        if (maxpages == 0)
                return 0;
        BUG_ON(array == NULL);

        return __iov_iter_get_pages_alloc(i, &array, maxsize, maxpages, start);
}
EXPORT_SYMBOL(iov_iter_get_pages2);

/*
 * Like iov_iter_get_pages2(), except that the page array is allocated here
 * and handed back through *@pages.  If the result is zero or negative, any
 * array is freed with kvfree() and *@pages is left NULL.
 */
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
                struct page ***pages, size_t maxsize, size_t *start)
{
        ssize_t got;

        /* A NULL array makes the worker allocate one. */
        *pages = NULL;

        got = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
        if (got > 0)
                return got;

        kvfree(*pages);
        *pages = NULL;
        return got;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);

/*
 * Count the pages touched by the remaining bytes of an iovec/kvec iterator,
 * segment by segment.  Returns @maxpages as soon as the running total
 * exceeds it.
 */
static int iov_npages(const struct iov_iter *i, int maxpages)
{
        const struct iovec *seg = iter_iov(i);
        size_t off = i->iov_offset, left = i->count;
        int total = 0;

        while (left) {
                size_t chunk = min(seg->iov_len - off, left);

                if (chunk) {
                        unsigned int in_page = offset_in_page(seg->iov_base + off);

                        left -= chunk;
                        total += DIV_ROUND_UP(in_page + chunk, PAGE_SIZE);
                        if (unlikely(total > maxpages))
                                return maxpages;
                }
                /* Only the first segment is entered part-way in. */
                off = 0;
                seg++;
        }
        return total;
}

/*
 * Count the pages touched by the remaining bytes of a bvec iterator, segment
 * by segment.  Returns @maxpages as soon as the running total exceeds it.
 */
static int bvec_npages(const struct iov_iter *i, int maxpages)
{
        const struct bio_vec *seg = i->bvec;
        size_t off = i->iov_offset, left = i->count;
        int total = 0;

        while (left) {
                unsigned int in_page = (seg->bv_offset + off) % PAGE_SIZE;
                size_t chunk = min(seg->bv_len - off, left);

                left -= chunk;
                total += DIV_ROUND_UP(in_page + chunk, PAGE_SIZE);
                if (unlikely(total > maxpages))
                        return maxpages;

                /* Only the first segment is entered part-way in. */
                off = 0;
                seg++;
        }
        return total;
}

/*
 * Return how many pages the iterator's remaining data spans, capped at
 * @maxpages.  An empty iterator, or a type not handled here, yields 0.
 */
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
        unsigned int offs;
        int npages;

        if (unlikely(!i->count))
                return 0;

        /*
         * Segmented types have their own walkers.  The single-range types
         * only need the offset into their first page.
         */
        if (likely(iter_is_ubuf(i)))
                offs = offset_in_page(i->ubuf + i->iov_offset);
        else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_npages(i, maxpages); /* kvec shares the iovec layout */
        else if (iov_iter_is_bvec(i))
                return bvec_npages(i, maxpages);
        else if (iov_iter_is_folioq(i))
                offs = i->iov_offset % PAGE_SIZE;
        else if (iov_iter_is_xarray(i))
                offs = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
        else
                return 0;

        npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
        return min(npages, maxpages);
}
EXPORT_SYMBOL(iov_iter_npages);

/*
 * Copy @old into @new.  For bvec, kvec and iovec iterators the segment array
 * is duplicated with kmemdup() using @flags, and @new is pointed at the copy.
 *
 * Returns the duplicated array.  The result is NULL when kmemdup() fails and
 * also for iterator types that have no array to copy.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
        const void *copy = NULL;

        *new = *old;

        if (iov_iter_is_bvec(new)) {
                new->bvec = kmemdup(new->bvec,
                                    new->nr_segs * sizeof(struct bio_vec),
                                    flags);
                copy = new->bvec;
        } else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) {
                /* kvec shares the iovec layout */
                new->__iov = kmemdup(new->__iov,
                                     new->nr_segs * sizeof(struct iovec),
                                     flags);
                copy = new->__iov;
        }
        return copy;
}
EXPORT_SYMBOL(dup_iter);

/*
 * Read @nr_segs compat-layout iovec entries from userspace at @uvec and store
 * them in the native array @iov, converting each base pointer with
 * compat_ptr().
 *
 * Returns 0 on success, -EFAULT if user_access_begin() refuses the range or
 * an unsafe_get_user() bails out, or -EINVAL if a length is negative when
 * held in a compat_ssize_t.
 */
static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uvec, u32 nr_segs)
{
        const struct compat_iovec __user *uiov =
                (const struct compat_iovec __user *)uvec;
        /* Stays -EFAULT if an unsafe_get_user() jumps to uaccess_end. */
        int ret = -EFAULT;
        u32 i;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        for (i = 0; i < nr_segs; i++) {
                compat_uptr_t buf;
                compat_ssize_t len;

                /* uaccess_end is the bail-out label for both reads. */
                unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

                /* check for compat_size_t not fitting in compat_ssize_t .. */
                if (len < 0) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                iov[i].iov_base = compat_ptr(buf);
                iov[i].iov_len = len;
        }

        ret = 0;
uaccess_end:
        /* Every path that passed user_access_begin() leaves through here. */
        user_access_end();
        return ret;
}

/*
 * Read @nr_segs native iovec entries from userspace at @uiov into @iov.
 *
 * The copy loop is a do/while, so @nr_segs must be non-zero; the callers
 * visible in this file either return early for zero segments or pass 1.
 *
 * Returns 0 on success, -EFAULT if user_access_begin() refuses the range or
 * an unsafe_get_user() bails out, or -EINVAL if a length is negative when
 * held in an ssize_t.
 */
static __noclone int copy_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uiov, unsigned long nr_segs)
{
        /* Stays -EFAULT if an unsafe_get_user() jumps to uaccess_end. */
        int ret = -EFAULT;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        do {
                void __user *buf;
                ssize_t len;

                /* uaccess_end is the bail-out label for both reads. */
                unsafe_get_user(len, &uiov->iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov->iov_base, uaccess_end);

                /* check for size_t not fitting in ssize_t .. */
                if (unlikely(len < 0)) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                iov->iov_base = buf;
                iov->iov_len = len;

                uiov++; iov++;
        } while (--nr_segs);

        ret = 0;
uaccess_end:
        /* Every path that passed user_access_begin() leaves through here. */
        user_access_end();
        return ret;
}

/*
 * Copy an array of @nr_segs iovecs from userspace at @uvec.
 *
 * @fast_iov is used if it has room (@fast_segs entries).  Otherwise an array
 * is allocated with kmalloc_array().  @compat selects the compat iovec
 * layout.
 *
 * Returns the array holding the copy, or an ERR_PTR(): -EINVAL when @nr_segs
 * exceeds UIO_MAXIOV, -ENOMEM on allocation failure, or the error from the
 * copy helper.  An array allocated here is freed again on error.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat)
{
        struct iovec *kiov = fast_iov;
        int err;

        /*
         * SuS says "The readv() function *may* fail if the iovcnt argument
         * was less than or equal to 0, or greater than {IOV_MAX}."  Linux
         * has traditionally returned zero for zero segments, so...
         */
        if (!nr_segs)
                return kiov;
        if (nr_segs > UIO_MAXIOV)
                return ERR_PTR(-EINVAL);

        if (nr_segs > fast_segs) {
                kiov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
                if (!kiov)
                        return ERR_PTR(-ENOMEM);
        }

        err = unlikely(compat) ?
                copy_compat_iovec_from_user(kiov, uvec, nr_segs) :
                copy_iovec_from_user(kiov, uvec, nr_segs);
        if (!err)
                return kiov;

        /* Only free what was allocated here, never the caller's array. */
        if (kiov != fast_iov)
                kfree(kiov);
        return ERR_PTR(err);
}

/*
 * A single-segment iovec from userspace is imported as ITER_UBUF.
 *
 * The one entry is copied into the array *@iovp points at on entry, and
 * *@iovp is then set to NULL.  Returns the iterator's byte count on success
 * or a negative errno from the copy or from import_ubuf().
 */
static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
                                   struct iovec **iovp, struct iov_iter *i,
                                   bool compat)
{
        struct iovec *kiov = *iovp;
        ssize_t err;

        /* Nothing is allocated on this path, so hand back NULL. */
        *iovp = NULL;

        err = compat ? copy_compat_iovec_from_user(kiov, uvec, 1)
                     : copy_iovec_from_user(kiov, uvec, 1);
        if (unlikely(err))
                return err;

        err = import_ubuf(type, kiov->iov_base, kiov->iov_len, i);
        if (unlikely(err))
                return err;

        return i->count;
}

/*
 * Worker for import_iovec(): copy @nr_segs iovecs from userspace, validate
 * them and initialise @i over the result.  @compat selects the compat iovec
 * layout.  A single segment is imported as ITER_UBUF instead.
 *
 * On return *@iovp is either NULL or an array allocated here.  Returns the
 * total number of bytes imported or a negative errno.
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat)
{
        struct iovec *iov;
        ssize_t total_len = 0;
        unsigned long seg;

        if (nr_segs == 1)
                return __import_iovec_ubuf(type, uvec, iovp, i, compat);

        iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
        if (IS_ERR(iov)) {
                *iovp = NULL;
                return PTR_ERR(iov);
        }

        /*
         * According to the Single Unix Specification we should return EINVAL
         * if an element length is < 0 when cast to ssize_t or if the total
         * length would overflow the ssize_t return value of the system call.
         *
         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
         * overflow case.
         */
        for (seg = 0; seg < nr_segs; seg++) {
                struct iovec *cur = &iov[seg];
                ssize_t len = (ssize_t)cur->iov_len;

                if (!access_ok(cur->iov_base, len))
                        goto efault;

                /* Trim the segment that would push the total past the cap. */
                if (len > MAX_RW_COUNT - total_len) {
                        len = MAX_RW_COUNT - total_len;
                        cur->iov_len = len;
                }
                total_len += len;
        }

        iov_iter_init(i, type, iov, nr_segs, total_len);
        /* Hand back only an array allocated here, never the caller's. */
        *iovp = (iov == *iovp) ? NULL : iov;
        return total_len;

efault:
        if (iov != *iovp)
                kfree(iov);
        *iovp = NULL;
        return -EFAULT;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in the array *@iovp points to.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs,
                 struct iovec **iovp, struct iov_iter *i)
{
        /* The compat iovec layout is chosen from the current syscall context. */
        bool compat = in_compat_syscall();

        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, compat);
}
EXPORT_SYMBOL(import_iovec);

/*
 * Initialise @i as an ITER_UBUF iterator over the user buffer @buf.  @len is
 * capped at MAX_RW_COUNT before use.
 *
 * Returns 0 on success or -EFAULT if access_ok() rejects the (capped) range.
 */
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
        size_t capped = len > MAX_RW_COUNT ? MAX_RW_COUNT : len;

        if (unlikely(!access_ok(buf, capped)))
                return -EFAULT;

        iov_iter_ubuf(i, rw, buf, capped);
        return 0;
}
EXPORT_SYMBOL_GPL(import_ubuf);

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may
 * have advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, ITER_KVEC and ITER_UBUF.  Any
 * other iterator type triggers a one-time warning and is left untouched.
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
        /*
         * Every supported type has to be tested inside the WARN condition.
         * Testing ITER_KVEC outside it made a kvec iterator trip the warning
         * even though it is then restored normally.
         */
        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
                         !iter_is_ubuf(i) && !iov_iter_is_kvec(i)))
                return;
        i->iov_offset = state->iov_offset;
        i->count = state->count;
        /* ITER_UBUF has no segment array to rewind. */
        if (iter_is_ubuf(i))
                return;
        /*
         * For the *vec iters, nr_segs + iov is constant - if we increment
         * the vec, then we also decrement the nr_segs count. Hence we don't
         * need to track both of these, just one is enough and we can deduce
         * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
         * size, so we can just adjust the iov pointer as they are unionized.
         * ITER_BVEC _may_ be the same size on some archs, but on others it is
         * not. Be safe and handle it separately.
         */
        BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
        if (iov_iter_is_bvec(i))
                i->bvec -= state->nr_segs - i->nr_segs;
        else
                i->__iov -= state->nr_segs - i->nr_segs;
        i->nr_segs = state->nr_segs;
}

/*
 * Extract a list of contiguous pages from an ITER_FOLIOQ iterator.  This does
 * not get references on the pages, nor does it get a pin on them.
 *
 * @i:        iterator to draw from; count and iov_offset are updated as the
 *            walk proceeds, folioq and folioq_slot once it has finished
 * @pages:    in/out page array pointer, handed to want_pages_array()
 * @maxsize:  upper bound on the number of bytes to cover
 * @maxpages: upper bound on the number of page pointers to store
 * @extraction_flags: not referenced in this function
 * @offset0:  receives the offset into the first page
 *
 * Returns the number of bytes covered, -EIO if the iterator sits past the end
 * of a queue segment with a non-zero offset, or -ENOMEM if
 * want_pages_array() returns 0.
 */
static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i,
                                             struct page ***pages, size_t maxsize,
                                             unsigned int maxpages,
                                             iov_iter_extraction_t extraction_flags,
                                             size_t *offset0)
{
        const struct folio_queue *folioq = i->folioq;
        struct page **p;
        unsigned int nr = 0;
        size_t extracted = 0, offset, slot = i->folioq_slot;

        /* Iterator parked past the last slot: begin at the next segment. */
        if (slot >= folioq_nr_slots(folioq)) {
                folioq = folioq->next;
                slot = 0;
                if (WARN_ON(i->iov_offset != 0))
                        return -EIO;
        }

        offset = i->iov_offset & ~PAGE_MASK;
        *offset0 = offset;

        maxpages = want_pages_array(pages, maxsize, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        /* Each pass stores at most one page pointer. */
        for (;;) {
                struct folio *folio = folioq_folio(folioq, slot);
                /* This offset shadows the outer one: position within the folio. */
                size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot);
                /* bytes from @offset to the end of the page containing it */
                size_t part = PAGE_SIZE - offset % PAGE_SIZE;

                if (offset < fsize) {
                        /* Clamp to what is still wanted and to the folio's end. */
                        part = umin(part, umin(maxsize - extracted, fsize - offset));
                        i->count -= part;
                        i->iov_offset += part;
                        extracted += part;

                        p[nr++] = folio_page(folio, offset / PAGE_SIZE);
                }

                if (nr >= maxpages || extracted >= maxsize)
                        break;

                /* Folio used up: move to the next slot, or the next segment. */
                if (i->iov_offset >= fsize) {
                        i->iov_offset = 0;
                        slot++;
                        if (slot == folioq_nr_slots(folioq) && folioq->next) {
                                folioq = folioq->next;
                                slot = 0;
                        }
                }
        }

        i->folioq = folioq;
        i->folioq_slot = slot;
        return extracted;
}

/*
 * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does not
 * get references on the pages, nor does it get a pin on them.
 *
 * @i:        iterator to draw from; advanced by the number of bytes returned
 * @pages:    in/out page array pointer, handed to want_pages_array()
 * @maxsize:  upper bound on the number of bytes to cover
 * @maxpages: upper bound on the number of page pointers to store
 * @extraction_flags: not referenced in this function
 * @offset0:  receives the offset into the first page
 *
 * Returns the number of bytes covered, or -ENOMEM if want_pages_array()
 * returns 0.
 */
static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
                                             struct page ***pages, size_t maxsize,
                                             unsigned int maxpages,
                                             iov_iter_extraction_t extraction_flags,
                                             size_t *offset0)
{
        struct page **p;
        struct folio *folio;
        unsigned int nr = 0, offset;
        loff_t pos = i->xarray_start + i->iov_offset;
        XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT);

        offset = pos & ~PAGE_MASK;
        *offset0 = offset;

        maxpages = want_pages_array(pages, maxsize, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        /* Walk consecutive entries under RCU until a NULL entry or maxpages. */
        rcu_read_lock();
        for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
                if (xas_retry(&xas, folio))
                        continue;

                /* Has the folio moved or been split? */
                if (unlikely(folio != xas_reload(&xas))) {
                        xas_reset(&xas);
                        continue;
                }

                p[nr++] = folio_file_page(folio, xas.xa_index);
                if (nr == maxpages)
                        break;
        }
        rcu_read_unlock();

        /*
         * NOTE(review): if no folio was found (nr == 0) and offset is
         * non-zero, nr * PAGE_SIZE - offset wraps and maxsize is left
         * unchanged - confirm callers guarantee the range is populated.
         */
        maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
        iov_iter_advance(i, maxsize);
        return maxsize;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 *
 * Returns the number of bytes the iterator was advanced by, 0 if the
 * iterator has no segments left, or -ENOMEM if want_pages_array() returned 0
 * (the same convention the other extractors in this file use).
 */
static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
                                           struct page ***pages, size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        size_t skip = i->iov_offset, size = 0;
        struct bvec_iter bi;
        int k = 0;

        if (i->nr_segs == 0)
                return 0;

        /* The current bvec is fully consumed: step to the next one. */
        if (i->iov_offset == i->bvec->bv_len) {
                i->iov_offset = 0;
                i->nr_segs--;
                i->bvec++;
                skip = 0;
        }
        bi.bi_idx = 0;
        bi.bi_size = maxsize;
        bi.bi_bvec_done = skip;

        maxpages = want_pages_array(pages, maxsize, skip, maxpages);
        /* Without a page array the stores through *pages below are invalid. */
        if (!maxpages)
                return -ENOMEM;

        while (bi.bi_size && bi.bi_idx < i->nr_segs) {
                struct bio_vec bv = bvec_iter_bvec(i->bvec, bi);

                /*
                 * The iov_iter_extract_pages interface only allows an offset
                 * into the first page.  Break out of the loop if we see an
                 * offset into subsequent pages, the caller will have to call
                 * iov_iter_extract_pages again for the remainder.
                 */
                if (k) {
                        if (bv.bv_offset)
                                break;
                } else {
                        *offset0 = bv.bv_offset;
                }

                (*pages)[k++] = bv.bv_page;
                size += bv.bv_len;

                if (k >= maxpages)
                        break;

                /*
                 * We are done when the end of the bvec doesn't align to a page
                 * boundary as that would create a hole in the returned space.
                 * The caller will handle this with another call to
                 * iov_iter_extract_pages.
                 */
                if (bv.bv_offset + bv.bv_len != PAGE_SIZE)
                        break;

                bvec_iter_advance_single(i->bvec, &bi, bv.bv_len);
        }

        iov_iter_advance(i, size);
        return size;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 *
 * Only the first non-empty kvec segment is looked at.  *@offset0 is set to
 * the offset of the data within the first page.  Returns the number of bytes
 * the iterator was advanced by, 0 if no segments remain, or -ENOMEM if
 * want_pages_array() returned 0.
 */
static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
                                           struct page ***pages, size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        struct page **p, *page;
        const void *kaddr;
        size_t skip = i->iov_offset, offset, size;
        int k;

        /* Step over segments that have nothing left to extract. */
        for (;;) {
                if (i->nr_segs == 0)
                        return 0;
                size = min(maxsize, i->kvec->iov_len - skip);
                if (size)
                        break;
                i->iov_offset = 0;
                i->nr_segs--;
                i->kvec++;
                skip = 0;
        }

        kaddr = i->kvec->iov_base + skip;
        offset = (unsigned long)kaddr & ~PAGE_MASK;
        *offset0 = offset;

        maxpages = want_pages_array(pages, size, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        /* Walk page by page from the start of the page containing kaddr. */
        kaddr -= offset;
        for (k = 0; k < maxpages; k++) {
                if (is_vmalloc_or_module_addr(kaddr))
                        page = vmalloc_to_page(kaddr);
                else
                        page = virt_to_page(kaddr);

                p[k] = page;
                kaddr += PAGE_SIZE;
        }

        /* Never report more than the listed pages can hold. */
        size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
        iov_iter_advance(i, size);
        return size;
}

/*
 * Extract a list of contiguous pages from a user iterator and get a pin on
 * each of them.  This should only be used if the iterator is user-backed
 * (IOVEC/UBUF).
 *
 * It does not get refs on the pages, but the pages must be unpinned by the
 * caller once the transfer is complete.
 *
 * This is safe to be used where background IO/DMA *is* going to be modifying
 * the buffer; using a pin rather than a ref forces fork() to give the child a
 * copy of the page.
 *
 * Returns the number of bytes the iterator was advanced by, -ENOMEM if
 * want_pages_array() returned 0, or the non-positive result of
 * pin_user_pages_fast().
 */
static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
                                           struct page ***pages,
                                           size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        unsigned int gup_flags = 0;
        unsigned long start;
        size_t in_page, taken;
        int pinned;

        /* Translate iterator state and extraction flags into GUP flags. */
        if (i->nofault)
                gup_flags |= FOLL_NOFAULT;
        if (extraction_flags & ITER_ALLOW_P2PDMA)
                gup_flags |= FOLL_PCI_P2PDMA;
        if (i->data_source == ITER_DEST)
                gup_flags |= FOLL_WRITE;

        /* Split the user address into a page base and an in-page offset. */
        start = first_iovec_segment(i, &maxsize);
        in_page = start % PAGE_SIZE;
        *offset0 = in_page;
        start &= PAGE_MASK;

        maxpages = want_pages_array(pages, maxsize, in_page, maxpages);
        if (!maxpages)
                return -ENOMEM;

        pinned = pin_user_pages_fast(start, maxpages, gup_flags, *pages);
        if (unlikely(pinned <= 0))
                return pinned;

        /* Fewer pages than requested may have been pinned. */
        taken = min_t(size_t, maxsize, pinned * PAGE_SIZE - in_page);
        iov_iter_advance(i, taken);
        return taken;
}

/**
 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
 * @i: The iterator to extract from
 * @pages: Where to return the list of pages
 * @maxsize: The maximum amount of iterator to extract
 * @maxpages: The maximum size of the list of pages
 * @extraction_flags: Flags to qualify request
 * @offset0: Where to return the starting offset into (*@pages)[0]
 *
 * Extract a list of contiguous pages from the current point of the iterator,
 * advancing the iterator.  The maximum number of pages and the maximum amount
 * of page contents can be set.
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * The iov_iter_extract_will_pin() function can be used to query how cleanup
 * should be performed.
 *
 * Extra refs or pins on the pages may be obtained as follows:
 *
 *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
 *      added to the pages, but refs will not be taken.
 *      iov_iter_extract_will_pin() will return true.
 *
 *  (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
 *      pages are merely listed; no extra refs or pins are obtained.
 *      iov_iter_extract_will_pin() will return 0.
 *
 * Note also:
 *
 *  (*) Use with ITER_DISCARD is not supported as that has no content.
 *
 * On success, the function sets *@pages to the new pagelist, if allocated, and
 * sets *offset0 to the offset into the first page.
 *
 * It may also return -ENOMEM and -EFAULT.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i,
                               struct page ***pages,
                               size_t maxsize,
                               unsigned int maxpages,
                               iov_iter_extraction_t extraction_flags,
                               size_t *offset0)
{
        /* Clamp the request to the iterator's content and to MAX_RW_COUNT. */
        maxsize = min_t(size_t, maxsize, i->count);
        maxsize = min_t(size_t, maxsize, MAX_RW_COUNT);
        if (maxsize == 0)
                return 0;

        /* Hand over to the extractor matching the iterator type. */
        if (likely(user_backed_iter(i))) {
                return iov_iter_extract_user_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);
        } else if (iov_iter_is_kvec(i)) {
                return iov_iter_extract_kvec_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);
        } else if (iov_iter_is_bvec(i)) {
                return iov_iter_extract_bvec_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);
        } else if (iov_iter_is_folioq(i)) {
                return iov_iter_extract_folioq_pages(i, pages, maxsize,
                                                     maxpages, extraction_flags,
                                                     offset0);
        } else if (iov_iter_is_xarray(i)) {
                return iov_iter_extract_xarray_pages(i, pages, maxsize,
                                                     maxpages, extraction_flags,
                                                     offset0);
        }

        /* No extractor exists for any other iterator type. */
        return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);















































































































   40 
   42 




























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
// SPDX-License-Identifier: GPL-2.0
/*
 *  hrtimers - High-resolution kernel timers
 *
 *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 *  data type definitions, declarations, prototypes
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 */
#ifndef _LINUX_HRTIMER_H
#define _LINUX_HRTIMER_H

#include <linux/hrtimer_defs.h>
#include <linux/hrtimer_types.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/percpu-defs.h>
#include <linux/rbtree.h>
#include <linux/timer.h>

/*
 * Mode arguments of xxx_hrtimer functions:
 *
 * HRTIMER_MODE_ABS                - Time value is absolute
 * HRTIMER_MODE_REL                - Time value is relative to now
 * HRTIMER_MODE_PINNED                - Timer is bound to CPU (is only considered
 *                                  when starting the timer)
 * HRTIMER_MODE_SOFT                - Timer callback function will be executed in
 *                                  soft irq context
 * HRTIMER_MODE_HARD                - Timer callback function will be executed in
 *                                  hard irq context even on PREEMPT_RT.
 *
 * The remaining enumerators are named OR-combinations of these base values.
 */
enum hrtimer_mode {
        /* Base values; HRTIMER_MODE_ABS is 0, the others are single bits */
        HRTIMER_MODE_ABS        = 0x00,
        HRTIMER_MODE_REL        = 0x01,
        HRTIMER_MODE_PINNED        = 0x02,
        HRTIMER_MODE_SOFT        = 0x04,
        HRTIMER_MODE_HARD        = 0x08,

        /* ABS/REL combined with PINNED */
        HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
        HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,

        /* ABS/REL combined with SOFT */
        HRTIMER_MODE_ABS_SOFT        = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
        HRTIMER_MODE_REL_SOFT        = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,

        /* ABS/REL combined with PINNED and SOFT */
        HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
        HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,

        /* ABS/REL combined with HARD */
        HRTIMER_MODE_ABS_HARD        = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
        HRTIMER_MODE_REL_HARD        = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,

        /* ABS/REL combined with PINNED and HARD */
        HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
        HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
};

/*
 * Values to track state of the timer
 *
 * Possible states:
 *
 * 0x00                inactive
 * 0x01                enqueued into rbtree
 *
 * The callback state is not part of the timer->state because clearing it would
 * mean touching the timer after the callback, which would make it impossible
 * to free the timer from the callback function.
 *
 * Therefore we track the callback state in:
 *
 *        timer->base->running == timer
 *
 * (this is the expression hrtimer_callback_running() evaluates).
 *
 * On SMP it is possible to have a "callback function running and enqueued"
 * status. It happens for example when a posix timer expired and the callback
 * queued a signal. Between dropping the lock which protects the posix timer
 * and reacquiring the base lock of the hrtimer, another CPU can deliver the
 * signal and rearm the timer.
 *
 * All state transitions are protected by cpu_base->lock.
 */
#define HRTIMER_STATE_INACTIVE        0x00
#define HRTIMER_STATE_ENQUEUED        0x01

/**
 * struct hrtimer_sleeper - simple sleeper structure
 * @timer:        embedded timer structure
 * @task:        task to wake up
 *
 * @task is set to NULL when the timer expires.
 */
struct hrtimer_sleeper {
        struct hrtimer timer;
        struct task_struct *task;
};

/* Set both the soft and the hard expiry of @timer to @time. */
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
        timer->_softexpires = time;
        timer->node.expires = time;
}

/*
 * Set the soft expiry of @timer to @time and the hard expiry to
 * ktime_add_safe(@time, @delta).
 */
static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
{
        const ktime_t hard = ktime_add_safe(time, delta);

        timer->node.expires = hard;
        timer->_softexpires = time;
}

/*
 * Like hrtimer_set_expires_range(), but @delta is given in nanoseconds and
 * converted with ns_to_ktime() first.
 */
static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta)
{
        const ktime_t range = ns_to_ktime(delta);

        timer->node.expires = ktime_add_safe(time, range);
        timer->_softexpires = time;
}

/* Set both the soft and the hard expiry of @timer from the raw s64 @tv64. */
static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
{
        timer->_softexpires = tv64;
        timer->node.expires = tv64;
}

/* Move both expiry values of @timer forward by @time using ktime_add_safe(). */
static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
{
        const ktime_t hard = ktime_add_safe(timer->node.expires, time);
        const ktime_t soft = ktime_add_safe(timer->_softexpires, time);

        timer->node.expires = hard;
        timer->_softexpires = soft;
}

/* Move both expiry values of @timer forward by @ns nanoseconds (ktime_add_ns()). */
static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
{
        const ktime_t hard = ktime_add_ns(timer->node.expires, ns);
        const ktime_t soft = ktime_add_ns(timer->_softexpires, ns);

        timer->node.expires = hard;
        timer->_softexpires = soft;
}

/* Return the hard expiry (node.expires) of @timer. */
static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
{
        const ktime_t hard = timer->node.expires;

        return hard;
}

/* Return the soft expiry (_softexpires) of @timer. */
static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
{
        const ktime_t soft = timer->_softexpires;

        return soft;
}

/* Return the hard expiry of @timer as a raw s64 value. */
static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
{
        const s64 hard = timer->node.expires;

        return hard;
}
/* Return the soft expiry of @timer as a raw s64 value. */
static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
{
        const s64 soft = timer->_softexpires;

        return soft;
}

static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
{
        return ktime_to_ns(timer->node.expires);
}

ktime_t hrtimer_cb_get_time(const struct hrtimer *timer);

/*
 * Return the hard expiry of @timer minus the time reported by
 * hrtimer_cb_get_time() for that timer.
 */
static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
{
        const ktime_t now = hrtimer_cb_get_time(timer);

        return ktime_sub(timer->node.expires, now);
}

/*
 * Return the hres_active value of the CPU base @timer belongs to, or 0 when
 * CONFIG_HIGH_RES_TIMERS is not enabled.
 */
static inline int hrtimer_is_hres_active(struct hrtimer *timer)
{
        if (!IS_ENABLED(CONFIG_HIGH_RES_TIMERS))
                return 0;

        return timer->base->cpu_base->hres_active;
}

/*
 * hrtimer_resolution is a real variable when CONFIG_HIGH_RES_TIMERS is set;
 * otherwise it is the constant LOW_RES_NSEC cast to unsigned int.
 */
#ifdef CONFIG_HIGH_RES_TIMERS
struct clock_event_device;

extern void hrtimer_interrupt(struct clock_event_device *dev);

extern unsigned int hrtimer_resolution;

#else

/* Fully parenthesized so the expansion is a single operand in any expression */
#define hrtimer_resolution        ((unsigned int)LOW_RES_NSEC)

#endif

/*
 * Return the hard expiry of @timer minus @now.  With CONFIG_TIME_LOW_RES and
 * a relative timer, hrtimer_resolution is subtracted as well.
 */
static inline ktime_t
__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now)
{
        const ktime_t remaining = ktime_sub(timer->node.expires, now);

        if (!IS_ENABLED(CONFIG_TIME_LOW_RES) || !timer->is_rel)
                return remaining;

        /*
         * Adjust relative timers for the extra we added in
         * hrtimer_start_range_ns() to prevent short timeouts.
         */
        return remaining - hrtimer_resolution;
}

/*
 * __hrtimer_expires_remaining_adjusted() with "now" taken from
 * hrtimer_cb_get_time() for @timer.
 */
static inline ktime_t
hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)
{
        const ktime_t now = hrtimer_cb_get_time(timer);

        return __hrtimer_expires_remaining_adjusted(timer, now);
}

/*
 * timerfd hooks.  With CONFIG_TIMERFD they are external functions; without it
 * they are empty inline stubs, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_TIMERFD
extern void timerfd_clock_was_set(void);
extern void timerfd_resume(void);
#else
static inline void timerfd_clock_was_set(void) { }
static inline void timerfd_resume(void) { }
#endif

/* Declares the per-CPU variable tick_cpu_device of type struct tick_device */
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);

/*
 * hrtimer_cancel_wait_running() - wait step used while a timer is being
 * cancelled.  With CONFIG_PREEMPT_RT it is an out-of-line function; otherwise
 * it just issues cpu_relax().  Both variants take a const pointer so callers
 * compile identically under either configuration.
 */
#ifdef CONFIG_PREEMPT_RT
void hrtimer_cancel_wait_running(const struct hrtimer *timer);
#else
static inline void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
        cpu_relax();
}
#endif

static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)
{
        return HRTIMER_NORESTART;
}

/* Exported timer functions: */

/*
 * Initialize timers: each variant takes the clock id and the hrtimer_mode;
 * the first two also take the callback function to install.
 */
extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
                          clockid_t clock_id, enum hrtimer_mode mode);
extern void hrtimer_setup_on_stack(struct hrtimer *timer,
                                   enum hrtimer_restart (*function)(struct hrtimer *),
                                   clockid_t clock_id, enum hrtimer_mode mode);
extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
                                           enum hrtimer_mode mode);

/*
 * destroy_hrtimer_on_stack() is a real function only with
 * CONFIG_DEBUG_OBJECTS_TIMERS; otherwise it is an empty inline stub.
 */
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
#else
static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
#endif

/* Basic timer operations: */
extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                   u64 range_ns, const enum hrtimer_mode mode);

/**
 * hrtimer_start - (re)start an hrtimer
 * @timer:        the timer to be added
 * @tim:        expiry time
 * @mode:        timer mode: absolute (HRTIMER_MODE_ABS) or
 *                relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *                softirq based mode is considered for debug purpose only!
 *
 * Equivalent to hrtimer_start_range_ns() with a range of 0 ns.
 */
static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
                                 const enum hrtimer_mode mode)
{
        const u64 no_range_ns = 0;

        hrtimer_start_range_ns(timer, tim, no_range_ns, mode);
}

extern int hrtimer_cancel(struct hrtimer *timer);
extern int hrtimer_try_to_cancel(struct hrtimer *timer);

/*
 * Start @timer using the expiry values already stored in it: the soft expiry
 * is the start time and (hard - soft), in nanoseconds, is the range.
 */
static inline void hrtimer_start_expires(struct hrtimer *timer,
                                         enum hrtimer_mode mode)
{
        const ktime_t soft = hrtimer_get_softexpires(timer);
        const ktime_t hard = hrtimer_get_expires(timer);
        const u64 range_ns = ktime_to_ns(ktime_sub(hard, soft));

        hrtimer_start_range_ns(timer, soft, range_ns, mode);
}

void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
                                   enum hrtimer_mode mode);

static inline void hrtimer_restart(struct hrtimer *timer)
{
        hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}

/* Query timers: */
extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);

/**
 * hrtimer_get_remaining - get remaining time for the timer
 * @timer:        the timer to read
 *
 * Calls __hrtimer_get_remaining() with @adjust set to false.
 */
static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
{
        const bool adjust = false;

        return __hrtimer_get_remaining(timer, adjust);
}

extern u64 hrtimer_get_next_event(void);
extern u64 hrtimer_next_event_without(const struct hrtimer *exclude);

extern bool hrtimer_active(const struct hrtimer *timer);

/**
 * hrtimer_is_queued - check whether the timer is on one of the queues
 * @timer:        Timer to check
 *
 * Returns: True if HRTIMER_STATE_ENQUEUED is set in the timer state, false
 * otherwise
 *
 * The function can be used lockless, but it gives only a current snapshot.
 */
static inline bool hrtimer_is_queued(struct hrtimer *timer)
{
        /* The READ_ONCE pairs with the update functions of timer->state */
        return (READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED) != 0;
}

/*
 * Helper function to check whether the timer is running the callback
 * function: returns 1 if the timer's clock base records @timer as the
 * running timer, 0 otherwise.
 */
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
        if (timer->base->running == timer)
                return 1;

        return 0;
}

/**
 * hrtimer_update_function - Update the timer's callback function
 * @timer:        Timer to update
 * @function:        New callback function
 *
 * Only safe to call if the timer is not enqueued. Can be called in the callback function if the
 * timer is not enqueued at the same time (see the comments above HRTIMER_STATE_ENQUEUED).
 *
 * With CONFIG_PROVE_LOCKING the preconditions are checked under the CPU base
 * lock: a queued timer or a NULL @function triggers a one-time warning and
 * leaves the callback unchanged.  Without it the callback is stored
 * unconditionally.
 */
static inline void hrtimer_update_function(struct hrtimer *timer,
                                           enum hrtimer_restart (*function)(struct hrtimer *))
{
#ifdef CONFIG_PROVE_LOCKING
        /* Scope-based lock: held for the remainder of this function */
        guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock);

        /* Refuse to change the callback of a timer that is still queued */
        if (WARN_ON_ONCE(hrtimer_is_queued(timer)))
                return;

        /* Refuse to install a NULL callback */
        if (WARN_ON_ONCE(!function))
                return;
#endif
        /* The function member is private; it is reached via ACCESS_PRIVATE() */
        ACCESS_PRIVATE(timer, function) = function;
}

/* Forward a hrtimer so it expires after now: */
extern u64
hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);

/**
 * hrtimer_forward_now() - forward the timer expiry so it expires after now
 * @timer:        hrtimer to forward
 * @interval:        the interval to forward
 *
 * It is a variant of hrtimer_forward(). The timer will expire after the current
 * time of the hrtimer clock base. See hrtimer_forward() for details.
 *
 * Returns: the value returned by hrtimer_forward()
 */
static inline u64 hrtimer_forward_now(struct hrtimer *timer,
                                      ktime_t interval)
{
        const ktime_t now = hrtimer_cb_get_time(timer);

        return hrtimer_forward(timer, now, interval);
}

/* Precise sleep: */

extern int nanosleep_copyout(struct restart_block *, struct timespec64 *);
extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
                              const clockid_t clockid);

/* schedule_hrtimeout*() variants; they differ in the range and clock id arguments */
extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
                                    const enum hrtimer_mode mode);
extern int schedule_hrtimeout_range_clock(ktime_t *expires,
                                          u64 delta,
                                          const enum hrtimer_mode mode,
                                          clockid_t clock_id);
extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);

/* Soft interrupt function to run the hrtimer queues: */
extern void hrtimer_run_queues(void);

/* Bootup initialization: */
extern void __init hrtimers_init(void);

/* Show pending timers: */
extern void sysrq_timer_list_show(void);

/*
 * Per-CPU callbacks taking the CPU number.  Without CONFIG_HOTPLUG_CPU,
 * hrtimers_cpu_dying is not a function: the name expands to NULL.
 */
int hrtimers_prepare_cpu(unsigned int cpu);
int hrtimers_cpu_starting(unsigned int cpu);
#ifdef CONFIG_HOTPLUG_CPU
int hrtimers_cpu_dying(unsigned int cpu);
#else
#define hrtimers_cpu_dying        NULL
#endif

#endif
























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BLK_CGROUP_PRIVATE_H
#define _BLK_CGROUP_PRIVATE_H
/*
 * block cgroup private header
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                       Nauman Rafique <nauman@google.com>
 */

#include <linux/blk-cgroup.h>
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
#include <linux/llist.h>
#include "blk.h"

struct blkcg_gq;
struct blkg_policy_data;


/*
 * percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter.
 * The value is half of INT_MAX.
 */
#define BLKG_STAT_CPU_BATCH        (INT_MAX / 2)

#ifdef CONFIG_BLK_CGROUP

/*
 * I/O categories tracked per blkg.  The values are used as array indices
 * (see struct blkg_iostat); BLKG_IOSTAT_NR is the number of categories.
 */
enum blkg_iostat_type {
        BLKG_IOSTAT_READ,
        BLKG_IOSTAT_WRITE,
        BLKG_IOSTAT_DISCARD,

        /* Count of the entries above; must stay last */
        BLKG_IOSTAT_NR,
};

/* One set of I/O counters, each array holding BLKG_IOSTAT_NR entries. */
struct blkg_iostat {
        /* byte counts, one slot per enum blkg_iostat_type value */
        u64                                bytes[BLKG_IOSTAT_NR];
        /* I/O counts, one slot per enum blkg_iostat_type value */
        u64                                ios[BLKG_IOSTAT_NR];
};

/*
 * A pair of blkg_iostat snapshots (@cur and @last) together with the
 * u64_stats_sync used with them, a back pointer to the owning blkg, and the
 * llist linkage used to queue the set.
 */
struct blkg_iostat_set {
        struct u64_stats_sync                sync;
        /* blkg this set belongs to */
        struct blkcg_gq                       *blkg;
        /* llist linkage; @lqueued records whether the node is queued */
        struct llist_node                lnode;
        int                                lqueued;        /* queued in llist */
        /* NOTE(review): presumably current values vs. values at the last flush - confirm */
        struct blkg_iostat                cur;
        struct blkg_iostat                last;
};

/* association between a blk cgroup and a request queue */
struct blkcg_gq {
        /* Pointer to the associated request_queue */
        struct request_queue                *q;
        /* list / hash-list linkage; presumably into q's and blkcg's blkg lists */
        struct list_head                q_node;
        struct hlist_node                blkcg_node;
        /* the blk cgroup side of the association */
        struct blkcg                        *blkcg;

        /* all non-root blkcg_gq's are guaranteed to have access to parent */
        struct blkcg_gq                        *parent;

        /* reference count */
        struct percpu_ref                refcnt;

        /* is this blkg online? protected by both blkcg and q locks */
        bool                                online;

        /* per-cpu stat sets and one stat set embedded in the blkg itself */
        struct blkg_iostat_set __percpu        *iostat_cpu;
        struct blkg_iostat_set                iostat;

        /* per-policy data, one slot per policy (BLKCG_MAX_POLS slots) */
        struct blkg_policy_data                *pd[BLKCG_MAX_POLS];
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        /* bio list and its lock; only present with CONFIG_BLK_CGROUP_PUNT_BIO */
        spinlock_t                        async_bio_lock;
        struct bio_list                        async_bios;
#endif
        /* the two work items share storage */
        union {
                struct work_struct        async_bio_work;
                struct work_struct        free_work;
        };

        /* delay bookkeeping */
        atomic_t                        use_delay;
        atomic64_t                        delay_nsec;
        atomic64_t                        delay_start;
        u64                                last_delay;
        int                                last_use;

        struct rcu_head                        rcu_head;
};

/*
 * Block cgroup state.  The cgroup_subsys_state is embedded as @css, which is
 * what css_to_blkcg() relies on to convert a css pointer back to its blkcg.
 */
struct blkcg {
        struct cgroup_subsys_state        css;
        spinlock_t                        lock;
        refcount_t                        online_pin;
        /* If there is block congestion on this cgroup. */
        atomic_t                        congestion_count;

        /* blkgs of this blkcg: radix tree, an RCU-managed hint pointer, and a hash list */
        struct radix_tree_root                blkg_tree;
        struct blkcg_gq        __rcu                *blkg_hint;
        struct hlist_head                blkg_list;

        /* per-policy data, one slot per policy (BLKCG_MAX_POLS slots) */
        struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];

        struct list_head                all_blkcgs_node;

        /*
         * List of updated percpu blkg_iostat_set's since the last flush.
         */
        struct llist_head __percpu        *lhead;

#ifdef CONFIG_BLK_CGROUP_FC_APPID
        /* only present with CONFIG_BLK_CGROUP_FC_APPID */
        char                            fc_app_id[FC_APPID_LEN];
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
        /* only present with CONFIG_CGROUP_WRITEBACK */
        struct list_head                cgwb_list;
#endif
};

/*
 * Map a cgroup_subsys_state back to the blkcg that embeds it.
 * A NULL @css yields NULL.
 */
static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct blkcg, css);
}

/*
 * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
 * request_queue (q).  This is used by blkcg policies which need to track
 * information per blkcg - q pair.
 *
 * There can be multiple active blkcg policies and each blkg:policy pair is
 * represented by a blkg_policy_data which is allocated and freed by each
 * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
 * area by allocating larger data structure which embeds blkg_policy_data
 * at the beginning.
 */
struct blkg_policy_data {
        /* the blkg and policy id this per-policy data belongs to */
        struct blkcg_gq                        *blkg;        /* returned by pd_to_blkg() */
        int                                plid;
        /* NOTE(review): presumably tracks pd_online_fn/pd_offline_fn state - confirm */
        bool                                online;
};

/*
 * Policies that need to keep per-blkcg data which is independent from any
 * request_queue associated to it should implement cpd_alloc/free_fn()
 * methods.  A policy can allocate private data area by allocating larger
 * data structure which embeds blkcg_policy_data at the beginning.
 * cpd_init() is invoked to let each policy handle per-blkcg data.
 */
struct blkcg_policy_data {
        /* the blkcg and policy id this per-policy data belongs to */
        struct blkcg                        *blkcg;        /* returned by cpd_to_blkcg() */
        int                                plid;
};

/*
 * Function types for the callbacks a blkcg policy may provide.  The cpd
 * variants operate on per-blkcg policy data, the pd variants on per-blkg
 * policy data.
 *
 * NOTE(review): blkcg_pol_init_cpd_fn and blkcg_pol_bind_cpd_fn have no
 * matching member in struct blkcg_policy as declared in this header -
 * confirm whether they still have users.
 */
typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk,
                struct blkcg *blkcg, gfp_t gfp);
typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
                                struct seq_file *s);

/*
 * Description of one blkcg policy: its id, its cgroup interface files and
 * the callbacks used to manage its per-blkcg (cpd) and per-blkg (pd) data.
 */
struct blkcg_policy {
        /*
         * Policy id.  Used as the index into blkg->pd[] and blkcg->cpd[]
         * and as the bit number tested in q->blkcg_pols by
         * blkcg_policy_enabled().
         */
        int                                plid;
        /* cgroup files for the policy */
        struct cftype                        *dfl_cftypes;
        struct cftype                        *legacy_cftypes;

        /* operations */
        /* per-blkcg policy data */
        blkcg_pol_alloc_cpd_fn                *cpd_alloc_fn;
        blkcg_pol_free_cpd_fn                *cpd_free_fn;

        /* per-blkg policy data */
        blkcg_pol_alloc_pd_fn                *pd_alloc_fn;
        blkcg_pol_init_pd_fn                *pd_init_fn;
        blkcg_pol_online_pd_fn                *pd_online_fn;
        blkcg_pol_offline_pd_fn                *pd_offline_fn;
        blkcg_pol_free_pd_fn                *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn        *pd_reset_stats_fn;
        blkcg_pol_stat_pd_fn                *pd_stat_fn;
};

/*
 * The root blkcg.  blkg_lookup() compares against its address to return
 * q->root_blkg directly.
 */
extern struct blkcg blkcg_root;
extern bool blkcg_debug_stats;

/* Per-queue and per-disk setup/teardown; defined outside this header. */
void blkg_init_queue(struct request_queue *q);
int blkcg_init_disk(struct gendisk *disk);
void blkcg_exit_disk(struct gendisk *disk);

/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
void blkcg_policy_unregister(struct blkcg_policy *pol);
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol);
void blkcg_deactivate_policy(struct gendisk *disk,
                             const struct blkcg_policy *pol);

/* Helpers for printing blkg information to a seq_file. */
const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);

/* Context object passed to the blkg_conf_*() helpers. */
struct blkg_conf_ctx {
        /* NOTE(review): presumably the string handed to blkg_conf_init() - confirm */
        char                                *input;
        /* NOTE(review): presumably the part of @input after the device spec - confirm */
        char                                *body;
        struct block_device                *bdev;
        struct blkcg_gq                        *blkg;
};

/*
 * Configuration helpers operating on a struct blkg_conf_ctx; defined
 * outside this header.  The _frozen open variant returns an unsigned long
 * that has the same type as the memflags argument of
 * blkg_conf_exit_frozen().
 * NOTE(review): presumably that return value is what must be passed back as
 * memflags - confirm against the implementation.
 */
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);
void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);

/**
 * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
 * @bio: the target &bio
 *
 * Return: true if this bio needs to be submitted with the root blkg context.
 *
 * In order to avoid priority inversions we sometimes need to issue a bio as if
 * it were attached to the root blkg, and then backcharge to the actual owning
 * blkg.  The idea is we do bio_blkcg_css() to look up the actual context for
 * the bio and attach the appropriate blkg to the bio.  Then we call this helper
 * and if it is true run with the root blkg for that queue and then do any
 * backcharging to the originating cgroup once the io is complete.
 */
static inline bool bio_issue_as_root_blkg(struct bio *bio)
{
        return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Find the blkg that associates @blkcg with @q.  The root blkcg is served
 * from @q->root_blkg, everything else from the lookup hint or, failing
 * that, the radix tree keyed by @q->id.  Returns NULL when no blkg for the
 * pair is found.
 *
 * Must be called in a RCU critical section.
 */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
                                           struct request_queue *q)
{
        struct blkcg_gq *hint, *found;

        /* The root blkcg's blkg hangs directly off the queue. */
        if (blkcg == &blkcg_root)
                return q->root_blkg;

        /* Fast path: the hint may already point at the blkg for @q. */
        hint = rcu_dereference_check(blkcg->blkg_hint,
                        lockdep_is_held(&q->queue_lock));
        if (hint && hint->q == q)
                return hint;

        /* Slow path: an entry is only a match if it belongs to @q. */
        found = radix_tree_lookup(&blkcg->blkg_tree, q->id);
        if (!found || found->q != q)
                return NULL;

        return found;
}

/**
 * blkg_to_pd - get policy private data
 * @blkg: blkg of interest
 * @pol: policy of interest
 *
 * Return the private data stored for the @blkg-@pol pair, or NULL when
 * @blkg is NULL.
 */
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol)
{
        if (!blkg)
                return NULL;

        return blkg->pd[pol->plid];
}

/*
 * Return the per-blkcg data stored for the @blkcg-@pol pair, or NULL when
 * @blkcg is NULL.
 */
static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
                                                     struct blkcg_policy *pol)
{
        if (!blkcg)
                return NULL;

        return blkcg->cpd[pol->plid];
}

/**
 * pd_to_blkg - get blkg associated with policy private data
 * @pd: policy private data of interest
 *
 * Return the blkg that @pd belongs to, or NULL when @pd is NULL.
 */
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
{
        if (!pd)
                return NULL;

        return pd->blkg;
}

/* Return the blkcg that @cpd belongs to, or NULL when @cpd is NULL. */
static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
{
        if (!cpd)
                return NULL;

        return cpd->blkcg;
}

/**
 * blkg_get - get a blkg reference
 * @blkg: blkg to get
 *
 * Takes one more reference on @blkg's percpu refcount.  The caller should
 * be holding an existing reference.
 */
static inline void blkg_get(struct blkcg_gq *blkg)
{
        struct percpu_ref *ref = &blkg->refcnt;

        percpu_ref_get(ref);
}

/**
 * blkg_tryget - try and get a blkg reference
 * @blkg: blkg to get
 *
 * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
 * of freeing this blkg, so we can only use it if the refcnt is not zero.
 */
static inline bool blkg_tryget(struct blkcg_gq *blkg)
{
        return blkg && percpu_ref_tryget(&blkg->refcnt);
}

/**
 * blkg_put - put a blkg reference
 * @blkg: blkg to put
 *
 * Drops one reference on @blkg's percpu refcount.
 */
static inline void blkg_put(struct blkcg_gq *blkg)
{
        struct percpu_ref *ref = &blkg->refcnt;

        percpu_ref_put(ref);
}

/**
 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
 * read locked.  If called under either blkcg or queue lock, the iteration
 * is guaranteed to include all and only online blkgs.  The caller may
 * update @pos_css by calling css_rightmost_descendant() to skip subtree.
 * @p_blkg is included in the iteration and the first node to be visited.
 *
 * The expansion ends in an if statement, so the loop body only runs for
 * descendant csses for which blkg_lookup() finds a blkg on @p_blkg's queue.
 */
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)                \
        css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)        \
                if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),        \
                                            (p_blkg)->q)))

/**
 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Similar to blkg_for_each_descendant_pre() but performs post-order
 * traversal instead.  Synchronization rules are the same.  @p_blkg is
 * included in the iteration and the last node to be visited.
 *
 * The expansion ends in an if statement, so the loop body only runs for
 * descendant csses for which blkg_lookup() finds a blkg on @p_blkg's queue.
 */
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)                \
        css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)        \
                if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),        \
                                            (p_blkg)->q)))

static inline void blkcg_use_delay(struct blkcg_gq *blkg)
{
        if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
                return;
        if (atomic_add_return(1, &blkg->use_delay) == 1)
                atomic_inc(&blkg->blkcg->congestion_count);
}

/*
 * Drop one user of the delay mechanism on @blkg.
 *
 * Returns 1 when a use was dropped and 0 when use_delay was already zero,
 * or negative, which additionally triggers a one-time warning.  Dropping
 * the last use also decrements the congestion count of the owning blkcg.
 */
static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
{
        int cur = atomic_read(&blkg->use_delay);

        if (WARN_ON_ONCE(cur < 0))
                return 0;
        if (!cur)
                return 0;

        /*
         * Somebody else may be adding or removing delay concurrently, so a
         * plain atomic_dec could push the counter negative.  Retry the
         * compare-and-exchange until either the decrement lands or the
         * counter is observed at zero; on success @cur still holds the
         * value seen before the decrement.
         */
        while (cur) {
                if (atomic_try_cmpxchg(&blkg->use_delay, &cur, cur - 1))
                        break;
        }

        if (!cur)
                return 0;

        /* We removed the last delay: drop the cgroup's congestion count. */
        if (cur == 1)
                atomic_dec(&blkg->blkcg->congestion_count);

        return 1;
}

/**
 * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
 * @blkg: target blkg
 * @delay: delay duration in nsecs
 *
 * A delay enabled through this function is not decayed and stays in effect
 * until blkcg_clear_delay() is called.  Must not be mixed with
 * blkcg_[un]use_delay() and blkcg_add_delay() usages.
 */
static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
{
        int cur = atomic_read(&blkg->use_delay);

        /*
         * Only whoever flips use_delay from 0 to -1 accounts the congestion
         * count for this blkg.
         */
        if (cur == 0) {
                if (atomic_try_cmpxchg(&blkg->use_delay, &cur, -1))
                        atomic_inc(&blkg->blkcg->congestion_count);
        }

        atomic64_set(&blkg->delay_nsec, delay);
}

/**
 * blkcg_clear_delay - Disable allocator delay mechanism
 * @blkg: target blkg
 *
 * Resets use_delay to zero, undoing blkcg_set_delay().
 */
static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
{
        int cur = atomic_read(&blkg->use_delay);

        if (cur == 0)
                return;

        /*
         * Only whoever flips use_delay back to 0 drops the congestion count
         * for this blkg.
         */
        if (atomic_try_cmpxchg(&blkg->use_delay, &cur, 0))
                atomic_dec(&blkg->blkcg->congestion_count);
}

/**
 * blk_cgroup_mergeable - Determine whether to allow or disallow merges
 * @rq: request to merge into
 * @bio: bio to merge
 *
 * @bio and @rq should belong to the same cgroup and their issue_as_root should
 * match. The latter is necessary as we don't want to throttle e.g. a metadata
 * update because it happens to be next to a regular IO.
 */
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
{
        return rq->bio->bi_blkg == bio->bi_blkg &&
                bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
}

static inline bool blkcg_policy_enabled(struct request_queue *q,
                                const struct blkcg_policy *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

/* Defined outside this header. */
void blk_cgroup_bio_start(struct bio *bio);
/* NOTE(review): @now and @delta are presumably in nsecs like delay_nsec - confirm */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
#else        /* CONFIG_BLK_CGROUP */

/*
 * Empty placeholder types for builds without CONFIG_BLK_CGROUP, so that
 * code referring to them by name still compiles.
 */
struct blkg_policy_data {
};

struct blkcg_policy_data {
};

struct blkcg_policy {
};

struct blkcg {
};

/*
 * No-op stubs for builds without CONFIG_BLK_CGROUP: lookups return NULL,
 * setup and registration report success (0), reference counting does
 * nothing and every merge is allowed.
 */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline void blkg_init_queue(struct request_queue *q) { }
static inline int blkcg_init_disk(struct gendisk *disk) { return 0; }
static inline void blkcg_exit_disk(struct gendisk *disk) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
static inline int blkcg_activate_policy(struct gendisk *disk,
                                        const struct blkcg_policy *pol) { return 0; }
static inline void blkcg_deactivate_policy(struct gendisk *disk,
                                           const struct blkcg_policy *pol) { }

static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline void blk_cgroup_bio_start(struct bio *bio) { }
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }

/*
 * Iterate @rl exactly once over the address of @q's root_rl member.
 * NOTE(review): nothing else in this header refers to root_rl - confirm the
 * macro still has users.
 */
#define blk_queue_for_each_rl(rl, q)        \
        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)

#endif        /* CONFIG_BLK_CGROUP */

#endif /* _BLK_CGROUP_PRIVATE_H */










































  164 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This file provides wrappers with sanitizer instrumentation for atomic bit
 * operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H

#include <linux/instrumented.h>

/**
 * set_bit - Atomically set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * Instrumented wrapper around arch_set_bit(): the word containing bit @nr
 * is reported as an atomic write before the arch operation runs.
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 *
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static __always_inline void set_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(*word));
        arch_set_bit(nr, addr);
}

/**
 * clear_bit - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * Instrumented wrapper around arch_clear_bit(): the word containing bit
 * @nr is reported as an atomic write before the arch operation runs.
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 */
static __always_inline void clear_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(*word));
        arch_clear_bit(nr, addr);
}

/**
 * change_bit - Toggle a bit in memory
 * @nr: Bit to change
 * @addr: Address to start counting from
 *
 * Instrumented wrapper around arch_change_bit(): the word containing bit
 * @nr is reported as an atomic write before the arch operation runs.
 *
 * This is a relaxed atomic operation (no implied memory barriers).
 *
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static __always_inline void change_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(*word));
        arch_change_bit(nr, addr);
}

/**
 * test_and_set_bit - Set a bit and return its old value
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * Instrumented wrapper around arch_test_and_set_bit(): after kcsan_mb(),
 * the word containing bit @nr is reported as an atomic read-write before
 * the arch operation runs.
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        kcsan_mb();
        instrument_atomic_read_write(word, sizeof(*word));

        return arch_test_and_set_bit(nr, addr);
}

/**
 * test_and_clear_bit - Clear a bit and return its old value
 * @nr: Bit to clear
 * @addr: Address to count from
 *
 * Instrumented wrapper around arch_test_and_clear_bit(): after kcsan_mb(),
 * the word containing bit @nr is reported as an atomic read-write before
 * the arch operation runs.
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        kcsan_mb();
        instrument_atomic_read_write(word, sizeof(*word));

        return arch_test_and_clear_bit(nr, addr);
}

/**
 * test_and_change_bit - Change a bit and return its old value
 * @nr: Bit to change
 * @addr: Address to count from
 *
 * Instrumented wrapper around arch_test_and_change_bit(): after kcsan_mb(),
 * the word containing bit @nr is reported as an atomic read-write before
 * the arch operation runs.
 *
 * This is an atomic fully-ordered operation (implied full memory barrier).
 */
static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        kcsan_mb();
        instrument_atomic_read_write(word, sizeof(*word));

        return arch_test_and_change_bit(nr, addr);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

#include <linux/vfsdebug.h>
#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <linux/mount.h>
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/maple_tree.h>
#include <linux/rw_hint.h>
#include <linux/file_ref.h>
#include <linux/unicode.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>

/*
 * Forward declarations of structure types whose definitions live in other
 * headers.  Declaring only the tags lets this header name pointers to these
 * types without including the headers that define them.
 */
struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct io_comp_batch;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kobject;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
struct fscrypt_operations;
struct fsverity_operations;
struct fsnotify_mark_connector;
struct fsnotify_sb_info;
struct fs_context;
struct fs_parameter_spec;
struct file_kattr;
struct iomap_ops;

/* Boot-time (__init) setup routines; their definitions are outside this header. */
extern void __init inode_init(void);
extern void __init inode_init_early(void);
extern void __init files_init(void);
extern void __init files_maxfiles_init(void);

/*
 * NOTE(review): presumably the system-wide open-file limit and the tunable
 * ceiling on per-process descriptors -- confirm at the definitions.
 */
extern unsigned long get_max_files(void);
extern unsigned int sysctl_nr_open;

/* Short in-kernel alias for __kernel_rwf_t. */
typedef __kernel_rwf_t rwf_t;

struct buffer_head;
/*
 * get_block_t and dio_iodone_t are function types (not pointer-to-function
 * types), so users declare callbacks as "get_block_t *fn".
 *
 * get_block_t: takes an inode, a block index within it (iblock), a
 * buffer_head to fill in (bh_result) and a "create" flag; returns int.
 * dio_iodone_t: takes the kiocb, a file offset, a byte count and an opaque
 * private pointer; returns int.
 */
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);

/*
 * MAY_* access-request bits.  Each value is a distinct single bit
 * (0x01 .. 0x80), so they can be OR-ed together into one mask.
 */
#define MAY_EXEC                0x00000001
#define MAY_WRITE                0x00000002
#define MAY_READ                0x00000004
#define MAY_APPEND                0x00000008
#define MAY_ACCESS                0x00000010
#define MAY_OPEN                0x00000020
#define MAY_CHDIR                0x00000040
/* called from RCU mode, don't block */
#define MAY_NOT_BLOCK                0x00000080

/*
 * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
 * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open()
 */

/* file is open for reading */
#define FMODE_READ                ((__force fmode_t)(1 << 0))
/* file is open for writing */
#define FMODE_WRITE                ((__force fmode_t)(1 << 1))
/* file is seekable */
#define FMODE_LSEEK                ((__force fmode_t)(1 << 2))
/* file can be accessed using pread */
#define FMODE_PREAD                ((__force fmode_t)(1 << 3))
/* file can be accessed using pwrite */
#define FMODE_PWRITE                ((__force fmode_t)(1 << 4))
/* File is opened for execution with sys_execve / sys_uselib */
#define FMODE_EXEC                ((__force fmode_t)(1 << 5))
/* File writes are restricted (block device specific) */
#define FMODE_WRITE_RESTRICTED        ((__force fmode_t)(1 << 6))
/* File supports atomic writes */
#define FMODE_CAN_ATOMIC_WRITE        ((__force fmode_t)(1 << 7))

/* FMODE_* bit 8 */

/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH         ((__force fmode_t)(1 << 9))
/* 64bit hashes as llseek() offset (for directories) */
#define FMODE_64BITHASH         ((__force fmode_t)(1 << 10))

/*
 * Don't update ctime and mtime.
 *
 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
 */
#define FMODE_NOCMTIME                ((__force fmode_t)(1 << 11))

/* Expect random access pattern */
#define FMODE_RANDOM                ((__force fmode_t)(1 << 12))

/* Supports IOCB_HAS_METADATA */
#define FMODE_HAS_METADATA        ((__force fmode_t)(1 << 13))

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH                ((__force fmode_t)(1 << 14))

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS        ((__force fmode_t)(1 << 15))
/* Write access to underlying fs */
#define FMODE_WRITER                ((__force fmode_t)(1 << 16))
/* Has read method(s) */
#define FMODE_CAN_READ          ((__force fmode_t)(1 << 17))
/* Has write method(s) */
#define FMODE_CAN_WRITE         ((__force fmode_t)(1 << 18))

#define FMODE_OPENED                ((__force fmode_t)(1 << 19))
#define FMODE_CREATED                ((__force fmode_t)(1 << 20))

/* File is stream-like */
#define FMODE_STREAM                ((__force fmode_t)(1 << 21))

/* File supports DIRECT IO */
#define        FMODE_CAN_ODIRECT        ((__force fmode_t)(1 << 22))

#define        FMODE_NOREUSE                ((__force fmode_t)(1 << 23))

/* File is embedded in backing_file object */
#define FMODE_BACKING                ((__force fmode_t)(1 << 24))

/*
 * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be
 * generated (see below)
 */
#define FMODE_NONOTIFY                ((__force fmode_t)(1 << 25))

/*
 * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be
 * generated (see below)
 */
#define FMODE_NONOTIFY_PERM        ((__force fmode_t)(1 << 26))

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT                ((__force fmode_t)(1 << 27))

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT        ((__force fmode_t)(1 << 28))

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT                ((__force fmode_t)(1 << 29))

/*
 * The two FMODE_NONOTIFY* define which fsnotify events should not be generated
 * for an open file. These are the possible values of
 * (f->f_mode & FMODE_FSNOTIFY_MASK) and their meaning:
 *
 * FMODE_NONOTIFY - suppress all (incl. non-permission) events.
 * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events.
 * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only FAN_ACCESS_PERM.
 *
 * The predicates below take an f_mode value and evaluate to non-zero when:
 *
 * FMODE_FSNOTIFY_NONE(mode)        - the masked bits equal FMODE_NONOTIFY.
 * FMODE_FSNOTIFY_HSM(mode)         - neither bit is set, or both are set.
 * FMODE_FSNOTIFY_ACCESS_PERM(mode) - neither bit is set.
 *
 * The last two are constant 0 unless CONFIG_FANOTIFY_ACCESS_PERMISSIONS is
 * defined.  The argument is parenthesized in every expansion so that an
 * expression such as "a | b" is masked as a whole; note that
 * FMODE_FSNOTIFY_HSM() evaluates its argument twice.
 */
#define FMODE_FSNOTIFY_MASK \
        (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)

#define FMODE_FSNOTIFY_NONE(mode) \
        (((mode) & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
#define FMODE_FSNOTIFY_HSM(mode) \
        (((mode) & FMODE_FSNOTIFY_MASK) == 0 || \
         ((mode) & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM))
#define FMODE_FSNOTIFY_ACCESS_PERM(mode) \
        (((mode) & FMODE_FSNOTIFY_MASK) == 0)
#else
#define FMODE_FSNOTIFY_ACCESS_PERM(mode) 0
#define FMODE_FSNOTIFY_HSM(mode)        0
#endif

/*
 * Attribute flags.  These should be or-ed together to figure out what
 * has been changed!
 *
 * Each flag is a distinct single bit (bits 0-18); the combined mask is what
 * struct iattr carries in ia_valid.
 */
#define ATTR_MODE        (1 << 0)
#define ATTR_UID        (1 << 1)
#define ATTR_GID        (1 << 2)
#define ATTR_SIZE        (1 << 3)
#define ATTR_ATIME        (1 << 4)
#define ATTR_MTIME        (1 << 5)
#define ATTR_CTIME        (1 << 6)
#define ATTR_ATIME_SET        (1 << 7)
#define ATTR_MTIME_SET        (1 << 8)
#define ATTR_FORCE        (1 << 9) /* Not a change, but a change it */
#define ATTR_CTIME_SET        (1 << 10)
#define ATTR_KILL_SUID        (1 << 11)
#define ATTR_KILL_SGID        (1 << 12)
#define ATTR_FILE        (1 << 13)
#define ATTR_KILL_PRIV        (1 << 14)
#define ATTR_OPEN        (1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET        (1 << 16)
#define ATTR_TOUCH        (1 << 17)
#define ATTR_DELEG        (1 << 18) /* Delegated attrs. Don't break write delegations */

/*
 * Whiteout is represented by a char device.  The following constants define the
 * mode and device number to use.  Both are zero.
 */
#define WHITEOUT_MODE 0
#define WHITEOUT_DEV 0

/*
 * This is the Inode Attributes structure, used for notify_change().  It
 * uses the above definitions as flags, to know which values have changed.
 * Also, in this manner, a Filesystem can look at only the values it cares
 * about.  Basically, these are the attributes that the VFS layer can
 * request to change from the FS layer.
 *
 * Derek Atkins <warlord@MIT.EDU> 94-10-20
 */
struct iattr {
        /* Mask of ATTR_* flags saying which of the fields below are meaningful. */
        unsigned int        ia_valid;
        umode_t                ia_mode;
        /*
         * The two anonymous unions wrap structures with the same member.
         *
         * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which
         * are a dedicated type requiring the filesystem to use the dedicated
         * helpers. Other filesystem can continue to use ia_{g,u}id until they
         * have been ported.
         *
         * They always contain the same value. In other words FS_ALLOW_IDMAP
         * pass down the same value on idmapped mounts as they would on regular
         * mounts.
         */
        union {
                kuid_t                ia_uid;
                vfsuid_t        ia_vfsuid;
        };
        union {
                kgid_t                ia_gid;
                vfsgid_t        ia_vfsgid;
        };
        loff_t                ia_size;
        /* Requested access, modification and change timestamps. */
        struct timespec64 ia_atime;
        struct timespec64 ia_mtime;
        struct timespec64 ia_ctime;

        /*
         * Not an attribute, but an auxiliary info for filesystems wanting to
         * implement an ftruncate() like method.  NOTE: filesystem should
         * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
         */
        struct file        *ia_file;
};

/*
 * Includes for diskquotas.
 */
#include <linux/quota.h>

/*
 * Maximum number of layers of fs stack.  Needs to be limited to
 * prevent kernel stack overflow.  The limit defined here is two layers.
 */
#define FILESYSTEM_MAX_STACK_DEPTH 2

/**
 * enum positive_aop_returns - aop return codes with specific semantics
 *
 * @AOP_WRITEPAGE_ACTIVATE: Page writeback has completed, the page is still
 *                          locked, and the page should be treated as active.
 *                          The VM takes this as a hint to put the page back
 *                          on the active list, so it will not be picked for
 *                          writeback again in the near future.  Any other
 *                          caller that sees this value must take care to
 *                          unlock the page.  Returned by writepage().
 *
 * @AOP_TRUNCATED_PAGE: The AOP method was given a locked page, has unlocked
 *                      it, and the page may have been truncated meanwhile.
 *                      The caller should go back to acquiring a new page and
 *                      try again; the aop takes reasonable precautions not
 *                      to livelock.  A caller holding a page reference
 *                      should drop it before retrying.  Returned by
 *                      read_folio().
 *
 * address_space_operation functions return these large constants to signal
 * special semantics to the caller.  The values are much larger than the
 * number of bytes in a page, so they cannot be confused with the result of a
 * function that returns how many bytes of a given page it operated on.
 */

enum positive_aop_returns {
        AOP_WRITEPAGE_ACTIVATE        = 0x80000,
        AOP_TRUNCATED_PAGE        = AOP_WRITEPAGE_ACTIVATE + 1,        /* 0x80001 */
};

/*
 * oh the beauties of C type declarations.
 *
 * Forward declarations so that later declarations in this header can refer
 * to these structure types by pointer before they are defined.
 */
struct page;
struct address_space;
struct writeback_control;
struct readahead_control;

/* Match RWF_* bits to IOCB bits */
#define IOCB_HIPRI                (__force int) RWF_HIPRI
#define IOCB_DSYNC                (__force int) RWF_DSYNC
#define IOCB_SYNC                (__force int) RWF_SYNC
#define IOCB_NOWAIT                (__force int) RWF_NOWAIT
#define IOCB_APPEND                (__force int) RWF_APPEND
#define IOCB_ATOMIC                (__force int) RWF_ATOMIC
#define IOCB_DONTCACHE                (__force int) RWF_DONTCACHE
#define IOCB_NOSIGNAL                (__force int) RWF_NOSIGNAL

/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD                (1 << 16)
#define IOCB_DIRECT                (1 << 17)
#define IOCB_WRITE                (1 << 18)
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ                (1 << 19)
#define IOCB_NOIO                (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE        (1 << 21)
/*
 * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
 * iocb completion can be passed back to the owner for execution from a safe
 * context rather than needing to be punted through a workqueue. If this
 * flag is set, the bio completion handling may set iocb->dio_complete to a
 * handler function and iocb->private to context information for that handler.
 * The issuer should call the handler with that context information from task
 * context to complete the processing of the iocb. Note that while this
 * provides a task context for the dio_complete() callback, it should only be
 * used on the completion side for non-IO generating completions. It's fine to
 * call blocking functions from this callback, but they should not wait for
 * unrelated IO (like cache flushing, new IO generation, etc).
 */
#define IOCB_DIO_CALLER_COMP        (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW                (1 << 23)
#define IOCB_HAS_METADATA        (1 << 24)

/*
 * for use in trace events
 *
 * Expands to a comma-separated list of { flag, "name" } initializers, one
 * per IOCB_* flag defined in this header, with no trailing comma.
 * NOTE(review): presumably consumed by __print_flags() in trace event
 * definitions -- confirm at the users.
 *
 * Keep this list in step with the IOCB_* definitions: a flag without an
 * entry here cannot be shown by name.  IOCB_NOSIGNAL was previously missing.
 */
#define TRACE_IOCB_STRINGS \
        { IOCB_HIPRI,                "HIPRI" }, \
        { IOCB_DSYNC,                "DSYNC" }, \
        { IOCB_SYNC,                "SYNC" }, \
        { IOCB_NOWAIT,                "NOWAIT" }, \
        { IOCB_APPEND,                "APPEND" }, \
        { IOCB_ATOMIC,                "ATOMIC" }, \
        { IOCB_DONTCACHE,        "DONTCACHE" }, \
        { IOCB_NOSIGNAL,        "NOSIGNAL" }, \
        { IOCB_EVENTFD,                "EVENTFD"}, \
        { IOCB_DIRECT,                "DIRECT" }, \
        { IOCB_WRITE,                "WRITE" }, \
        { IOCB_WAITQ,                "WAITQ" }, \
        { IOCB_NOIO,                "NOIO" }, \
        { IOCB_ALLOC_CACHE,        "ALLOC_CACHE" }, \
        { IOCB_DIO_CALLER_COMP,        "CALLER_COMP" }, \
        { IOCB_AIO_RW,                "AIO_RW" }, \
        { IOCB_HAS_METADATA,        "AIO_HAS_METADATA" }

/*
 * struct kiocb - per-operation I/O control block.
 *
 * ki_filp:         file the operation is issued against.
 * ki_pos:          file position of the operation.
 * ki_complete:     completion callback; NULL marks the kiocb as synchronous
 *                  (see is_sync_kiocb()).
 * private:         opaque pointer; when dio_complete is used it carries the
 *                  data handed to that handler (see the union comment).
 * ki_flags:        IOCB_* flags.
 * ki_ioprio:       I/O priority, see linux/ioprio.h.
 * ki_write_stream: NOTE(review): not documented here; presumably a write
 *                  stream identifier -- confirm at the users.
 *
 * ki_waitq and dio_complete share storage in an anonymous union, so only one
 * of them is meaningful for a given kiocb.
 */
struct kiocb {
        struct file                *ki_filp;
        loff_t                        ki_pos;
        void (*ki_complete)(struct kiocb *iocb, long ret);
        void                        *private;
        int                        ki_flags;
        u16                        ki_ioprio; /* See linux/ioprio.h */
        u8                        ki_write_stream;
        union {
                /*
                 * Only used for async buffered reads, where it denotes the
                 * page waitqueue associated with completing the read. Valid
                 * IFF IOCB_WAITQ is set.
                 */
                struct wait_page_queue        *ki_waitq;
                /*
                 * Can be used for O_DIRECT IO, where the completion handling
                 * is punted back to the issuer of the IO. May only be set
                 * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
                 * must then check for presence of this handler when ki_complete
                 * is invoked. The data passed in to this handler must be
                 * assigned to ->private when dio_complete is assigned.
                 */
                ssize_t (*dio_complete)(void *data);
        };
};

/*
 * A kiocb counts as synchronous when it carries no completion callback,
 * i.e. when ->ki_complete is a null pointer.
 */
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
        return !kiocb->ki_complete;
}

/*
 * Method table for an address_space (see the a_ops member of
 * struct address_space).  Every member is a function pointer; the members
 * are grouped below by purpose.
 */
struct address_space_operations {
        /* Read one folio; the file argument comes first. */
        int (*read_folio)(struct file *, struct folio *);

        /* Write back some dirty pages from this mapping. */
        int (*writepages)(struct address_space *, struct writeback_control *);

        /* Mark a folio dirty.  Return true if this dirtied it */
        bool (*dirty_folio)(struct address_space *, struct folio *);

        void (*readahead)(struct readahead_control *);

        /*
         * write_begin()/write_end() bracket a write of @len bytes at @pos.
         * write_begin hands back a folio through @foliop and an opaque
         * cookie through @fsdata; write_end receives both again together
         * with the number of bytes actually copied.
         */
        int (*write_begin)(const struct kiocb *, struct address_space *mapping,
                                loff_t pos, unsigned len,
                                struct folio **foliop, void **fsdata);
        int (*write_end)(const struct kiocb *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct folio *folio, void *fsdata);

        /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
        sector_t (*bmap)(struct address_space *, sector_t);
        void (*invalidate_folio) (struct folio *, size_t offset, size_t len);
        bool (*release_folio)(struct folio *, gfp_t);
        void (*free_folio)(struct folio *folio);
        ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
        /*
         * migrate the contents of a folio to the specified target. If
         * migrate_mode is MIGRATE_ASYNC, it must not block.
         */
        int (*migrate_folio)(struct address_space *, struct folio *dst,
                        struct folio *src, enum migrate_mode);
        int (*launder_folio)(struct folio *);
        bool (*is_partially_uptodate) (struct folio *, size_t from,
                        size_t count);
        /* Reports state through the two bool out-parameters. */
        void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
        int (*error_remove_folio)(struct address_space *, struct folio *);

        /* swapfile support */
        int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                                sector_t *span);
        void (*swap_deactivate)(struct file *file);
        int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};

/*
 * A constant address_space_operations instance defined elsewhere.
 * NOTE(review): presumably a table with no methods set -- confirm at the
 * definition.
 */
extern const struct address_space_operations empty_aops;

/**
 * struct address_space - Contents of a cacheable, mappable object.
 * @host: Owner, either the inode or the block_device.
 * @i_pages: Cached pages.
 * @invalidate_lock: Guards coherency between page cache contents and
 *   file offset->disk block mappings in the filesystem during invalidates.
 *   It is also used to block modification of page cache contents through
 *   memory mappings.
 * @gfp_mask: Memory allocation flags to use for allocating pages.
 * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
 * @nr_thps: Number of THPs in the pagecache (non-shmem only).
 * @i_mmap: Tree of private and shared mappings.
 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 * @nrpages: Number of page entries, protected by the i_pages lock.
 * @writeback_index: Writeback starts here.
 * @a_ops: Methods.
 * @flags: Error bits and flags (AS_*).
 * @wb_err: The most recent error which has occurred.
 * @i_private_lock: For use by the owner of the address_space.
 * @i_private_list: For use by the owner of the address_space.
 * @i_private_data: For use by the owner of the address_space.
 *
 * @nr_thps only exists when CONFIG_READ_ONLY_THP_FOR_FS is enabled.  The
 * structure is explicitly aligned to sizeof(long); the reason is given in
 * the comment that follows the definition.
 */
struct address_space {
        struct inode                *host;
        struct xarray                i_pages;
        struct rw_semaphore        invalidate_lock;
        gfp_t                        gfp_mask;
        atomic_t                i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        /* number of thp, only for non-shmem files */
        atomic_t                nr_thps;
#endif
        struct rb_root_cached        i_mmap;
        unsigned long                nrpages;
        pgoff_t                        writeback_index;
        const struct address_space_operations *a_ops;
        unsigned long                flags;
        errseq_t                wb_err;
        spinlock_t                i_private_lock;
        struct list_head        i_private_list;
        struct rw_semaphore        i_mmap_rwsem;
        void *                        i_private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
        /*
         * On most architectures that alignment is already the case; but
         * must be enforced here for CRIS, to let the least significant bit
         * of struct folio's "mapping" pointer be used for FOLIO_MAPPING_ANON.
         */

/*
 * XArray tags, for tagging dirty and writeback pages in the pagecache.
 * Each tag is an alias for one of the XArray marks XA_MARK_0..XA_MARK_2.
 */
#define PAGECACHE_TAG_DIRTY        XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK        XA_MARK_1
#define PAGECACHE_TAG_TOWRITE        XA_MARK_2

/*
 * Report whether at least one page in @mapping's page cache carries @tag,
 * as answered by xa_marked() on the mapping's i_pages array.
 */
static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag)
{
        const struct xarray *pages = &mapping->i_pages;

        return xa_marked(pages, tag);
}

/* Acquire @mapping's i_mmap_rwsem in write mode via down_write(). */
static inline void i_mmap_lock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_write(sem);
}

/*
 * Try to acquire @mapping's i_mmap_rwsem in write mode.  The result of
 * down_write_trylock() is passed straight back to the caller.
 */
static inline int i_mmap_trylock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        return down_write_trylock(sem);
}

/* Release @mapping's i_mmap_rwsem from write mode via up_write(). */
static inline void i_mmap_unlock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_write(sem);
}

/*
 * Try to acquire @mapping's i_mmap_rwsem in read mode.  The result of
 * down_read_trylock() is passed straight back to the caller.
 */
static inline int i_mmap_trylock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        return down_read_trylock(sem);
}

/* Acquire @mapping's i_mmap_rwsem in read mode via down_read(). */
static inline void i_mmap_lock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_read(sem);
}

/* Release @mapping's i_mmap_rwsem from read mode via up_read(). */
static inline void i_mmap_unlock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_read(sem);
}

/*
 * Debug check that @mapping's i_mmap_rwsem is held, expressed through
 * lockdep_assert_held().  What happens on failure is decided by that
 * lockdep helper, which is defined outside this header.
 */
static inline void i_mmap_assert_locked(struct address_space *mapping)
{
        lockdep_assert_held(&mapping->i_mmap_rwsem);
}

/*
 * Debug check that @mapping's i_mmap_rwsem is held in write mode, expressed
 * through lockdep_assert_held_write().  What happens on failure is decided
 * by that lockdep helper, which is defined outside this header.
 */
static inline void i_mmap_assert_write_locked(struct address_space *mapping)
{
        lockdep_assert_held_write(&mapping->i_mmap_rwsem);
}

/*
 * Could pages of this file currently be mapped into userspace?
 *
 * Yields 1 when the mapping's i_mmap tree has at least one entry and 0
 * when the tree is empty.
 */
static inline int mapping_mapped(const struct address_space *mapping)
{
        const struct rb_root *tree = &mapping->i_mmap.rb_root;

        return RB_EMPTY_ROOT(tree) ? 0 : 1;
}

/*
 * Could pages of this file have been modified from userspace?
 *
 * i_mmap_writable counts every VM_SHARED, VM_MAYWRITE vma: do_mmap marks a
 * vma VM_SHARED when it is shared and the file was opened for writing, so
 * such a vma may later be mprotect()ed writable even if it is read-only now.
 *
 * A negative i_mmap_writable means that no new writable mappings are
 * allowed.  Writable mappings can only be denied while none exist.
 *
 * Yields 1 when the counter is strictly positive, otherwise 0.
 */
static inline int mapping_writably_mapped(const struct address_space *mapping)
{
        const int writable = atomic_read(&mapping->i_mmap_writable);

        return writable > 0;
}

/*
 * Account one more writable mapping.  The counter is only incremented
 * while it is not negative; if the increment is refused, -EPERM is
 * returned, otherwise 0.
 */
static inline int mapping_map_writable(struct address_space *mapping)
{
        if (!atomic_inc_unless_negative(&mapping->i_mmap_writable))
                return -EPERM;

        return 0;
}

static inline void mapping_unmap_writable(struct address_space *mapping)
{
        atomic_dec(&mapping->i_mmap_writable);
}

/*
 * Deny writable mappings by decrementing i_mmap_writable, which is only
 * done while the counter is not positive.  Returns 0 when the decrement
 * happened and -EBUSY when it was refused.
 */
static inline int mapping_deny_writable(struct address_space *mapping)
{
        if (!atomic_dec_unless_positive(&mapping->i_mmap_writable))
                return -EBUSY;

        return 0;
}

static inline void mapping_allow_writable(struct address_space *mapping)
{
        atomic_inc(&mapping->i_mmap_writable);
}

/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 *
 * On 32-bit SMP builds __NEED_I_SIZE_ORDERED is defined and
 * i_size_ordered_init() initialises the inode's i_size_seqcount; on every
 * other configuration it expands to an empty statement.  The macro
 * argument is parenthesized so that any pointer expression can be passed.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&(inode)->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif

struct posix_acl;
/*
 * Sentinel pointer value, not a real ACL.
 * NOTE(review): presumably stored in i_acl/i_default_acl to mean "nothing
 * cached yet" -- confirm against the users of this macro.
 */
#define ACL_NOT_CACHED ((void *)(-1))
/*
 * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
 * cache the ACL.  This also means that ->get_inode_acl() can be called in RCU
 * mode with the LOOKUP_RCU flag.
 */
#define ACL_DONT_CACHE ((void *)(-3))

/*
 * Build a sentinel "ACL" pointer from a task pointer: the result is the
 * task's address advanced by one byte.
 */
static inline struct posix_acl *
uncached_acl_sentinel(struct task_struct *task)
{
        char *base = (char *)task;
        void *tagged = base + 1;

        return tagged;
}

/*
 * Returns true when the lowest bit of the pointer value is set, i.e. when
 * @acl is an odd address rather than an even one.
 */
static inline bool
is_uncached_acl(struct posix_acl *acl)
{
        unsigned long bits = (unsigned long)acl;

        return (bits & 1UL) != 0;
}

/*
 * IOP_* bit flags.  IOP_CACHED_LINK is tested and set in inode->i_opflags
 * by inode_set_cached_link(); the other bits presumably live in the same
 * field (NOTE(review): confirm against their users).
 */
#define IOP_FASTPERM        0x0001
#define IOP_LOOKUP        0x0002
#define IOP_NOFOLLOW        0x0004
#define IOP_XATTR        0x0008
#define IOP_DEFAULT_READLINK        0x0010
#define IOP_MGTIME        0x0020
#define IOP_CACHED_LINK        0x0040

/*
 * Inode state bits.  Protected by inode->i_lock
 *
 * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
 *
 * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
 * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
 * various stages of removing an inode.
 *
 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
 *
 * I_DIRTY_SYNC                Inode is dirty, but doesn't have to be written on
 *                        fdatasync() (unless I_DIRTY_DATASYNC is also set).
 *                        Timestamp updates are the usual cause.
 * I_DIRTY_DATASYNC        Data-related inode changes pending.  We keep track of
 *                        these changes separately from I_DIRTY_SYNC so that we
 *                        don't have to write inode on fdatasync() when only
 *                        e.g. the timestamps have changed.
 * I_DIRTY_PAGES        Inode has dirty pages.  Inode itself may be clean.
 * I_DIRTY_TIME                The inode itself has dirty timestamps, and the
 *                        lazytime mount option is enabled.  We keep track of this
 *                        separately from I_DIRTY_SYNC in order to implement
 *                        lazytime.  This gets cleared if I_DIRTY_INODE
 *                        (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
 *                        I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
 *                        in place because writeback might already be in progress
 *                        and we don't want to lose the time update
 * I_NEW                Serves as both a mutex and completion notification.
 *                        New inodes set I_NEW.  If two processes both create
 *                        the same inode, one of them will release its inode and
 *                        wait for I_NEW to be released before returning.
 *                        Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
 *                        also cause waiting on I_NEW, without I_NEW actually
 *                        being set.  find_inode() uses this to prevent returning
 *                        nearly-dead inodes.
 * I_WILL_FREE                Must be set when calling write_inode_now() if i_count
 *                        is zero.  I_FREEING must be set when I_WILL_FREE is
 *                        cleared.
 * I_FREEING                Set when inode is about to be freed but still has dirty
 *                        pages or buffers attached or the inode itself is still
 *                        dirty.
 * I_CLEAR                Added by clear_inode().  In this state the inode is
 *                        clean and can be destroyed.  Inode keeps I_FREEING.
 *
 *                        Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
 *                        prohibited for many purposes.  iget() must wait for
 *                        the inode to be completely released, then create it
 *                        anew.  Other functions will just ignore such inodes,
 *                        if appropriate.  I_NEW is used for waiting.
 *
 * I_SYNC                Writeback of inode is running. The bit is set during
 *                        data writeback, and cleared with a wakeup on the bit
 *                        address once it is done. The bit is also used to pin
 *                        the inode in memory for flusher thread.
 *
 * I_REFERENCED                Marks the inode as recently referenced on the LRU list.
 *
 * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
 *                        synchronize competing switching instances and to tell
 *                        wb stat updates to grab the i_pages lock.  See
 *                        inode_switch_wbs_work_fn() for details.
 *
 * I_OVL_INUSE                Used by overlayfs to get exclusive ownership on upper
 *                        and work dirs among overlayfs mounts.
 *
 * I_CREATING                New object's inode in the middle of setting up.
 *
 * I_DONTCACHE                Evict inode as soon as it is not used anymore.
 *
 * I_SYNC_QUEUED        Inode is queued in b_io or b_more_io writeback lists.
 *                        Used to detect that mark_inode_dirty() should not move
 *                        inode between dirty lists.
 *
 * I_PINNING_NETFS_WB        Inode is pinning an fscache object for writeback.
 *
 * I_LRU_ISOLATING        Inode is pinned being isolated from LRU without holding
 *                        i_count.
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 *
 * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
 * upon. There's one free address left.
 */

/*
 * Bit numbers (not masks) of the i_state flags that are also used to
 * derive wait addresses.  enum inode_state_flags_t builds the matching
 * masks as 1U << bit.
 */
enum inode_state_bits {
        __I_NEW                        = 0U,
        __I_SYNC                = 1U,
        __I_LRU_ISOLATING        = 2U
        /* reserved wait address bit 3 */
};

/*
 * Mask values stored in inode->i_state.  The first three are derived from
 * enum inode_state_bits; bit 3 is left unassigned and the remaining flags
 * occupy bits 4..18.
 *
 * NOTE(review): I_LINKABLE has no entry in the descriptive comment that
 * precedes these enums.
 */
enum inode_state_flags_t {
        I_NEW                        = (1U << __I_NEW),
        I_SYNC                        = (1U << __I_SYNC),
        I_LRU_ISOLATING         = (1U << __I_LRU_ISOLATING),
        /* reserved flag bit 3 */
        I_DIRTY_SYNC                = (1U << 4),
        I_DIRTY_DATASYNC        = (1U << 5),
        I_DIRTY_PAGES                = (1U << 6),
        I_WILL_FREE                = (1U << 7),
        I_FREEING                = (1U << 8),
        I_CLEAR                        = (1U << 9),
        I_REFERENCED                = (1U << 10),
        I_LINKABLE                = (1U << 11),
        I_DIRTY_TIME                = (1U << 12),
        I_WB_SWITCH                = (1U << 13),
        I_OVL_INUSE                = (1U << 14),
        I_CREATING                = (1U << 15),
        I_DONTCACHE                = (1U << 16),
        I_SYNC_QUEUED                = (1U << 17),
        I_PINNING_NETFS_WB        = (1U << 18)
};

/* The inode itself is dirty: I_DIRTY_SYNC and/or I_DIRTY_DATASYNC. */
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
/* The inode itself or its pages are dirty. */
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
/* Every dirty bit, including the lazytime I_DIRTY_TIME. */
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)

/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 *
 * Several members are only present under a config option
 * (CONFIG_FS_POSIX_ACL, CONFIG_SECURITY, CONFIG_CGROUP_WRITEBACK,
 * CONFIG_FSNOTIFY, ...), and the struct is marked __randomize_layout,
 * so code must not depend on the declared member order.
 */
struct inode {
        umode_t                        i_mode;
        unsigned short                i_opflags;        /* IOP_* flags */
        kuid_t                        i_uid;
        kgid_t                        i_gid;
        unsigned int                i_flags;

#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl        *i_acl;
        struct posix_acl        *i_default_acl;
#endif

        const struct inode_operations        *i_op;
        struct super_block        *i_sb;
        struct address_space        *i_mapping;

#ifdef CONFIG_SECURITY
        void                        *i_security;
#endif

        /* Stat data, not accessed from path walking */
        unsigned long                i_ino;
        /*
         * Filesystems may only read i_nlink directly.  They shall use the
         * following functions for modification:
         *
         *    (set|clear|inc|drop)_nlink
         *    inode_(inc|dec)_link_count
         */
        union {
                const unsigned int i_nlink;
                unsigned int __i_nlink;
        };
        dev_t                        i_rdev;
        loff_t                        i_size;
        time64_t                i_atime_sec;
        time64_t                i_mtime_sec;
        time64_t                i_ctime_sec;
        u32                        i_atime_nsec;
        u32                        i_mtime_nsec;
        u32                        i_ctime_nsec;
        u32                        i_generation;
        spinlock_t                i_lock;        /* i_blocks, i_bytes, maybe i_size */
        unsigned short          i_bytes;
        u8                        i_blkbits;
        enum rw_hint                i_write_hint;
        blkcnt_t                i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
        /* only present when i_size needs a seqcount (32-bit SMP) */
        seqcount_t                i_size_seqcount;
#endif

        /* Misc */
        /* wait addresses are derived from this via inode_state_wait_address() */
        enum inode_state_flags_t        i_state;
        /* 32-bit hole */
        struct rw_semaphore        i_rwsem;

        unsigned long                dirtied_when;        /* jiffies of first dirtying */
        unsigned long                dirtied_time_when;

        struct hlist_node        i_hash;
        struct list_head        i_io_list;        /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback        *i_wb;                /* the associated cgroup wb */

        /* foreign inode detection, see wbc_detach_inode() */
        int                        i_wb_frn_winner;
        u16                        i_wb_frn_avg_time;
        u16                        i_wb_frn_history;
#endif
        struct list_head        i_lru;                /* inode LRU list */
        struct list_head        i_sb_list;
        struct list_head        i_wb_list;        /* backing dev writeback list */
        /* i_dentry and i_rcu share storage */
        union {
                struct hlist_head        i_dentry;
                struct rcu_head                i_rcu;
        };
        atomic64_t                i_version;
        atomic64_t                i_sequence; /* see futex */
        atomic_t                i_count;
        atomic_t                i_dio_count;
        atomic_t                i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
        atomic_t                i_readcount; /* struct files open RO */
#endif
        union {
                const struct file_operations        *i_fop;        /* former ->i_op->default_file_ops */
                void (*free_inode)(struct inode *);
        };
        struct file_lock_context        *i_flctx;
        struct address_space        i_data;
        union {
                struct list_head        i_devices;
                int                        i_linklen;        /* set by inode_set_cached_link() */
        };
        union {
                struct pipe_inode_info        *i_pipe;
                struct cdev                *i_cdev;
                char                        *i_link;        /* set by inode_set_cached_link() */
                unsigned                i_dir_seq;
        };


#ifdef CONFIG_FSNOTIFY
        __u32                        i_fsnotify_mask; /* all events this inode cares about */
        /* 32-bit hole reserved for expanding i_fsnotify_mask */
        struct fsnotify_mark_connector __rcu        *i_fsnotify_marks;
#endif

        void                        *i_private; /* fs or device private pointer */
} __randomize_layout;

/*
 * inode_set_cached_link - record a link string and its length on an inode
 * @inode: inode to update
 * @link: NUL-terminated string stored in inode->i_link
 * @linklen: length of @link, stored in inode->i_linklen
 *
 * Stores @link and @linklen on the inode and sets IOP_CACHED_LINK in
 * i_opflags.  Two VFS_WARN_ON_INODE() checks flag misuse: a @linklen that
 * does not match strlen(@link), and an inode that already has
 * IOP_CACHED_LINK set.
 */
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
        VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
        VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode);
        inode->i_link = link;
        inode->i_linklen = linklen;
        inode->i_opflags |= IOP_CACHED_LINK;
}

/*
 * Get bit address from inode->i_state to use with wait_var_event()
 * infrastructure.
 *
 * The result is the address of i_state viewed as a char pointer and
 * advanced by @bit bytes, giving a distinct address per bit number.
 */
#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit))

struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
                                            struct inode *inode, u32 bit);

/*
 * Wake up waiters on the i_state wait address that corresponds to @bit.
 * Memory barriers are the caller's job, not this helper's.
 */
static inline void inode_wake_up_bit(struct inode *inode, u32 bit)
{
        void *wait_addr = inode_state_wait_address(inode, bit);

        wake_up_var(wait_addr);
}

struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);

/*
 * i_blocksize - block size of an inode in bytes
 * @node: inode to query
 *
 * Returns 2^i_blkbits.  The shift is performed on an unsigned constant so
 * that an i_blkbits of 31 does not shift into the sign bit of a signed int.
 */
static inline unsigned int i_blocksize(const struct inode *node)
{
        return 1U << node->i_blkbits;
}

/* Report whether the inode's i_hash node is unhashed, per hlist_unhashed(). */
static inline int inode_unhashed(struct inode *inode)
{
        const struct hlist_node *node = &inode->i_hash;

        return hlist_unhashed(node);
}

/*
 * __mark_inode_dirty expects inodes to be hashed.  Special inodes are not
 * wanted in the fileset inode space, so they are made to appear hashed
 * without being put on any list.  hlist_del() will work fine on them and
 * requires no locking.
 */
static inline void inode_fake_hash(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        hlist_add_fake(node);
}

/*
 * inode->i_rwsem nesting subclasses for the lock validator:
 *
 * 0: the object of the current VFS operation
 * 1: parent
 * 2: child/target
 * 3: xattr
 * 4: second non-directory
 * 5: second parent (when locking independent directories in rename)
 *
 * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two
 * non-directories at once.
 *
 * The locking order between these classes is
 * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory
 *
 * The enumerators rely on implicit numbering, so their order must match
 * the 0..5 list above.
 */
enum inode_i_mutex_lock_class
{
        I_MUTEX_NORMAL,                /* 0 */
        I_MUTEX_PARENT,                /* 1 */
        I_MUTEX_CHILD,                /* 2 */
        I_MUTEX_XATTR,                /* 3 */
        I_MUTEX_NONDIR2,        /* 4 */
        I_MUTEX_PARENT2,        /* 5 */
};

/* Take inode->i_rwsem for writing. */
static inline void inode_lock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write(sem);
}

/*
 * Take inode->i_rwsem for writing via down_write_killable() and pass its
 * return value through; callers must check it.
 */
static inline __must_check int inode_lock_killable(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_write_killable(sem);
}

/* Release a write hold on inode->i_rwsem. */
static inline void inode_unlock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_write(sem);
}

/* Take inode->i_rwsem for reading. */
static inline void inode_lock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read(sem);
}

/*
 * Take inode->i_rwsem for reading via down_read_killable() and pass its
 * return value through; callers must check it.
 */
static inline __must_check int inode_lock_shared_killable(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_read_killable(sem);
}

/* Release a read hold on inode->i_rwsem. */
static inline void inode_unlock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_read(sem);
}

/*
 * Attempt to take inode->i_rwsem for writing; the result of
 * down_write_trylock() is returned unchanged.
 */
static inline int inode_trylock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_write_trylock(sem);
}

/*
 * Attempt to take inode->i_rwsem for reading; the result of
 * down_read_trylock() is returned unchanged.
 */
static inline int inode_trylock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_read_trylock(sem);
}

/* Report whether inode->i_rwsem is currently locked, per rwsem_is_locked(). */
static inline int inode_is_locked(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return rwsem_is_locked(sem);
}

/*
 * Take inode->i_rwsem for writing, passing @subclass (one of the
 * inode_i_mutex_lock_class values) on to the lock validator.
 */
static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write_nested(sem, subclass);
}

/*
 * Take inode->i_rwsem for reading, passing @subclass (one of the
 * inode_i_mutex_lock_class values) on to the lock validator.
 */
static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read_nested(sem, subclass);
}

/* Take the mapping's invalidate_lock for writing. */
static inline void filemap_invalidate_lock(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        down_write(sem);
}

/* Release a write hold on the mapping's invalidate_lock. */
static inline void filemap_invalidate_unlock(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        up_write(sem);
}

/* Take the mapping's invalidate_lock for reading. */
static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        down_read(sem);
}

/*
 * Attempt to take the mapping's invalidate_lock for reading; the result
 * of down_read_trylock() is returned unchanged.
 */
static inline int filemap_invalidate_trylock_shared(
                                        struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        return down_read_trylock(sem);
}

/* Release a read hold on the mapping's invalidate_lock. */
static inline void filemap_invalidate_unlock_shared(
                                        struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        up_read(sem);
}

void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);

void filemap_invalidate_lock_two(struct address_space *mapping1,
                                 struct address_space *mapping2);
void filemap_invalidate_unlock_two(struct address_space *mapping1,
                                   struct address_space *mapping2);


/*
 * i_size_read - read inode->i_size consistently
 *
 * NOTE: in a 32bit arch with a preemptable kernel and
 * an UP compile the i_size_read/write must be atomic
 * with respect to the local cpu (unlike with preempt disabled),
 * but they don't need to be atomic with respect to other cpus like in
 * true SMP (so they need to either locally disable irq around
 * the read or for example on x86 they can be still implemented as a
 * cmpxchg8b without the need of the lock prefix). For SMP compiles
 * and 64bit archs it makes no difference if preempt is enabled or not.
 *
 * Three build-time variants: a seqcount retry loop (32-bit SMP), a
 * preempt-disabled plain load (32-bit with CONFIG_PREEMPTION), and an
 * acquire load everywhere else.
 */
static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        loff_t i_size;
        unsigned int seq;

        /* Re-read until the seqcount shows no writer ran during the load. */
        do {
                seq = read_seqcount_begin(&inode->i_size_seqcount);
                i_size = inode->i_size;
        } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
        return i_size;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        loff_t i_size;

        /* Keep the load atomic with respect to the local CPU (see NOTE). */
        preempt_disable();
        i_size = inode->i_size;
        preempt_enable();
        return i_size;
#else
        /* Pairs with smp_store_release() in i_size_write() */
        return smp_load_acquire(&inode->i_size);
#endif
}

/*
 * i_size_write - update inode->i_size
 *
 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
 * (normally i_rwsem), otherwise on 32bit/SMP an update of i_size_seqcount
 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
 *
 * The three build-time variants mirror those of i_size_read().
 */
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        /* Preemption stays off for the whole seqcount write section. */
        preempt_disable();
        write_seqcount_begin(&inode->i_size_seqcount);
        inode->i_size = i_size;
        write_seqcount_end(&inode->i_size_seqcount);
        preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        /* Plain store with preemption disabled around it. */
        preempt_disable();
        inode->i_size = i_size;
        preempt_enable();
#else
        /*
         * Pairs with smp_load_acquire() in i_size_read() to ensure
         * changes related to inode size (such as page contents) are
         * visible before we see the changed inode size.
         */
        smp_store_release(&inode->i_size, i_size);
#endif
}

static inline unsigned iminor(const struct inode *inode)
{
        return MINOR(inode->i_rdev);
}

static inline unsigned imajor(const struct inode *inode)
{
        return MAJOR(inode->i_rdev);
}

/*
 * Owner information for a file: who should receive SIGIO (a pid, or a
 * process group selected by @pid_type), the credentials of the task that
 * set the owner, and the signal to deliver.  @lock protects the pid, uid
 * and euid fields.
 */
struct fown_struct {
        struct file *file;        /* backpointer for security modules */
        rwlock_t lock;          /* protects pid, uid, euid fields */
        struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
        enum pid_type pid_type;        /* Kind of process group SIGIO should be sent to */
        kuid_t uid, euid;        /* uid/euid of process setting the owner */
        int signum;                /* posix.1b rt signal to be delivered on IO */
};

/**
 * struct file_ra_state - Track a file's readahead state.
 * @start: Where the most recent readahead started.
 * @size: Number of pages read in the most recent readahead.
 * @async_size: Number of pages that were/are not needed immediately
 *      and so were/are genuinely "ahead".  Start next readahead when
 *      the first of these pages is accessed.
 * @ra_pages: Maximum size of a readahead request, copied from the bdi.
 * @order: Preferred folio order used for most recent readahead.
 * @mmap_miss: How many mmap accesses missed in the page cache.
 * @prev_pos: The last byte in the most recent read request.
 *
 * When this structure is passed to ->readahead(), the "most recent"
 * readahead means the current readahead.
 */
struct file_ra_state {
        pgoff_t start;
        unsigned int size;
        unsigned int async_size;
        unsigned int ra_pages;
        unsigned short order;
        unsigned short mmap_miss;
        loff_t prev_pos;
};

/*
 * Check if @index falls in the readahead window [start, start + size).
 *
 * The upper bound is tested as an offset from ra->start rather than by
 * computing ra->start + ra->size, so a window that reaches the top of the
 * pgoff_t range cannot wrap around and wrongly exclude its own pages.
 *
 * Returns non-zero when @index is inside the window, zero otherwise.
 */
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
{
        return (index >= ra->start &&
                index - ra->start < ra->size);
}

/**
 * struct file - Represents a file
 * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
 * @f_mode: FMODE_* flags often used in hotpaths
 * @f_op: file operations
 * @f_mapping: Contents of a cacheable, mappable object.
 * @private_data: filesystem or driver specific data
 * @f_inode: cached inode
 * @f_flags: file flags
 * @f_iocb_flags: iocb flags
 * @f_cred: stashed credentials of creator/opener
 * @f_owner: file owner
 * @f_path: path of the file
 * @__f_path: writable alias for @f_path; *ONLY* for core VFS and only before
 *   the file gets open
 * @f_pos_lock: lock protecting file position
 * @f_pipe: specific to pipes
 * @f_pos: file position
 * @f_security: LSM security context of this file
 * @f_wb_err: writeback error
 * @f_sb_err: per sb writeback errors
 * @f_ep: link of all epoll hooks for this file
 * @f_task_work: task work entry point
 * @f_llist: work queue entrypoint
 * @f_ra: file's readahead state
 * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
 * @f_ref: reference count
 *
 * @f_pos_lock and @f_pipe share storage, as do @f_task_work, @f_llist,
 * @f_ra and @f_freeptr.  The "cacheline N boundary" comments inside the
 * struct are layout annotations only.
 */
struct file {
        spinlock_t                        f_lock;
        fmode_t                                f_mode;
        const struct file_operations        *f_op;
        struct address_space                *f_mapping;
        void                                *private_data;
        struct inode                        *f_inode;
        unsigned int                        f_flags;
        unsigned int                        f_iocb_flags;
        const struct cred                *f_cred;
        struct fown_struct                *f_owner;
        /* --- cacheline 1 boundary (64 bytes) --- */
        union {
                const struct path        f_path;
                struct path                __f_path;
        };
        union {
                /* regular files (with FMODE_ATOMIC_POS) and directories */
                struct mutex                f_pos_lock;
                /* pipes */
                u64                        f_pipe;
        };
        loff_t                                f_pos;
#ifdef CONFIG_SECURITY
        void                                *f_security;
#endif
        /* --- cacheline 2 boundary (128 bytes) --- */
        errseq_t                        f_wb_err;
        errseq_t                        f_sb_err;
#ifdef CONFIG_EPOLL
        struct hlist_head                *f_ep;
#endif
        union {
                struct callback_head        f_task_work;
                struct llist_node        f_llist;
                struct file_ra_state        f_ra;
                freeptr_t                f_freeptr;
        };
        file_ref_t                        f_ref;
        /* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
  __attribute__((aligned(4)));        /* lest something weird decides that 2 is OK */

/*
 * Variable-length file handle: a fixed header followed by an opaque
 * identifier.  @handle_bytes is the element count of the trailing
 * @f_handle array (see the __counted_by annotation).
 */
struct file_handle {
        __u32 handle_bytes;
        int handle_type;
        /* file identifier */
        unsigned char f_handle[] __counted_by(handle_bytes);
};

static inline struct file *get_file(struct file *f)
{
        file_ref_inc(&f->f_ref);
        return f;
}

struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f);

/* Read the current value of a file's reference count (f_ref). */
#define file_count(f)        file_ref_read(&(f)->f_ref)

/* 2^31 - 1 */
#define        MAX_NON_LFS        ((1UL<<31) - 1)

/*
 * Page cache limit. The filesystems should put that into their s_maxbytes
 * limits, otherwise bad things can happen in VM.
 */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE        ((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE         ((loff_t)LLONG_MAX)
#endif

/* legacy typedef, should eventually be removed */
typedef void *fl_owner_t;

struct file_lock;
struct file_lease;

/*
 * The following constants reflect the upper bound of the file/locking space:
 * OFFSET_MAX is the largest loff_t value and OFFT_OFFSET_MAX the largest
 * off_t value.  Both are only defined here if OFFSET_MAX is not already
 * defined.
 */
#ifndef OFFSET_MAX
#define OFFSET_MAX        type_max(loff_t)
#define OFFT_OFFSET_MAX        type_max(off_t)
#endif

int file_f_owner_allocate(struct file *file);
/*
 * Fetch file->f_owner with a single READ_ONCE() load and return it.
 */
static inline struct fown_struct *file_f_owner(const struct file *file)
{
        struct fown_struct *owner = READ_ONCE(file->f_owner);

        return owner;
}

extern void send_sigio(struct fown_struct *fown, int fd, int band);

/* Return the inode cached in the file's f_inode field. */
static inline struct inode *file_inode(const struct file *f)
{
        struct inode *inode = f->f_inode;

        return inode;
}

/*
 * file_dentry() dates from the time when overlayfs used files with a
 * "fake" path, i.e. f_path on overlayfs and f_inode on the underlying fs.
 * Back then it was needed to obtain the underlying fs dentry matching
 * f_inode.
 * Files with a "fake" path should no longer exist, so the helper now
 * returns f_path.dentry and warns once if that dentry's inode is not the
 * file's inode, to make sure it is not papering over filesystem bugs.
 */
static inline struct dentry *file_dentry(const struct file *file)
{
        struct dentry *de = file->f_path.dentry;

        WARN_ON_ONCE(file_inode(file) != d_inode(de));
        return de;
}

/*
 * One entry of a singly linked fasync list (chained through @fa_next).
 * NOTE(review): @magic is presumably set to FASYNC_MAGIC and @fa_rcu used
 * to free entries after an RCU grace period -- confirm in the fasync code.
 */
struct fasync_struct {
        rwlock_t                fa_lock;
        int                        magic;
        int                        fa_fd;
        struct fasync_struct        *fa_next; /* singly linked list */
        struct file                *fa_file;
        struct rcu_head                fa_rcu;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers: */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);

extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, int who, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct file *file);

/*
 * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
 * represented in both.
 *
 * Bits 5, 8, 9, 12-14, 18-20 and 26 are not assigned in the lists below.
 */
#define SB_RDONLY       BIT(0)        /* Mount read-only */
#define SB_NOSUID       BIT(1)        /* Ignore suid and sgid bits */
#define SB_NODEV        BIT(2)        /* Disallow access to device special files */
#define SB_NOEXEC       BIT(3)        /* Disallow program execution */
#define SB_SYNCHRONOUS  BIT(4)        /* Writes are synced at once */
#define SB_MANDLOCK     BIT(6)        /* Allow mandatory locks on an FS */
#define SB_DIRSYNC      BIT(7)        /* Directory modifications are synchronous */
#define SB_NOATIME      BIT(10)        /* Do not update access times. */
#define SB_NODIRATIME   BIT(11)        /* Do not update directory access times */
#define SB_SILENT       BIT(15)
#define SB_POSIXACL     BIT(16)        /* Supports POSIX ACLs */
#define SB_INLINECRYPT  BIT(17)        /* Use blk-crypto for encrypted files */
#define SB_KERNMOUNT    BIT(22)        /* this is a kern_mount call */
#define SB_I_VERSION    BIT(23)        /* Update inode I_version field */
#define SB_LAZYTIME     BIT(25)        /* Update the on-disk [acm]times lazily */

/* These sb flags are internal to the kernel */
#define SB_DEAD         BIT(21)
#define SB_DYING        BIT(24)
#define SB_FORCE        BIT(27)
#define SB_NOSEC        BIT(28)
#define SB_BORN         BIT(29)
#define SB_ACTIVE       BIT(30)
#define SB_NOUSER       BIT(31)

/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL                (1 << 0)
#define SB_ENC_NO_COMPAT_FALLBACK_FL        (1 << 1)

/*
 * Test SB_ENC_STRICT_MODE_FL in sb->s_encoding_flags.  The argument is
 * parenthesized so any pointer expression can be passed.
 * NOTE(review): s_encoding_flags is declared only under CONFIG_UNICODE,
 * while this macro is defined unconditionally.
 */
#define sb_has_strict_encoding(sb) \
        ((sb)->s_encoding_flags & SB_ENC_STRICT_MODE_FL)

/*
 * Test SB_ENC_NO_COMPAT_FALLBACK_FL in sb->s_encoding_flags; evaluates to
 * the constant 1 when CONFIG_UNICODE is not enabled.
 */
#if IS_ENABLED(CONFIG_UNICODE)
#define sb_no_casefold_compat_fallback(sb) \
        ((sb)->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
#else
#define sb_no_casefold_compat_fallback(sb) (1)
#endif

/*
 *        Umount options
 */

#define MNT_FORCE        0x00000001        /* Attempt to forcibly umount */
#define MNT_DETACH        0x00000002        /* Just detach from the tree */
#define MNT_EXPIRE        0x00000004        /* Mark for expiry */
#define UMOUNT_NOFOLLOW        0x00000008        /* Don't follow symlink on umount */
#define UMOUNT_UNUSED        0x80000000        /* Flag guaranteed to be unused */

/* sb->s_iflags: internal SB_I_* flags, one bit each */
#define SB_I_CGROUPWB        0x00000001        /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC        0x00000002        /* Ignore executables on this fs */
#define SB_I_NODEV        0x00000004        /* Ignore devices on this fs */
#define SB_I_STABLE_WRITES 0x00000008        /* don't modify blks until WB is done */

/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE                0x00000010 /* fstype already mounted */
#define SB_I_IMA_UNVERIFIABLE_SIGNATURE        0x00000020
#define SB_I_UNTRUSTED_MOUNTER                0x00000040
#define SB_I_EVM_HMAC_UNSUPPORTED        0x00000080

#define SB_I_SKIP_SYNC        0x00000100        /* Skip superblock at global sync */
#define SB_I_PERSB_BDI        0x00000200        /* has a per-sb bdi */
#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
#define SB_I_RETIRED        0x00000800        /* superblock shouldn't be reused */
#define SB_I_NOUMASK        0x00001000        /* VFS does not apply umask */
#define SB_I_NOIDMAP        0x00002000        /* No idmapped mounts on this superblock */
#define SB_I_ALLOW_HSM        0x00004000        /* Allow HSM events on this superblock */

/*
 * Possible states of 'frozen' field.  The values are ordered from
 * unfrozen (0) to fully frozen (SB_FREEZE_COMPLETE).
 */
enum {
        SB_UNFROZEN = 0,                /* FS is unfrozen */
        SB_FREEZE_WRITE        = 1,                /* Writes, dir ops, ioctls frozen */
        SB_FREEZE_PAGEFAULT = 2,        /* Page faults stopped as well */
        SB_FREEZE_FS = 3,                /* For internal FS use (e.g. to stop
                                         * internal threads if needed) */
        SB_FREEZE_COMPLETE = 4,                /* ->freeze_fs finished successfully */
};

/* Evaluates to 3; used as the length of sb_writers.rw_sem[]. */
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

/*
 * Freeze bookkeeping: the current freeze state, separate counts of kernel
 * and userspace freeze requests, the freeze owner, and one percpu
 * rw-semaphore per freeze level (SB_FREEZE_LEVELS entries).
 */
struct sb_writers {
        unsigned short                        frozen;                /* Is sb frozen? */
        int                                freeze_kcount;        /* How many kernel freeze requests? */
        int                                freeze_ucount;        /* How many userspace freeze requests? */
        const void                        *freeze_owner;        /* Owner of the freeze */
        struct percpu_rw_semaphore        rw_sem[SB_FREEZE_LEVELS];
};

struct mount;

/*
 * In-memory superblock: the VFS-level state kept for one filesystem
 * instance (type and operation tables, root dentry, flags, freeze state,
 * per-sb inode lists, ...).
 */
struct super_block {
        struct list_head        s_list;                /* Keep this first */
        dev_t                        s_dev;                /* search index; _not_ kdev_t */
        unsigned char                s_blocksize_bits;
        unsigned long                s_blocksize;
        loff_t                        s_maxbytes;        /* Max file size */
        struct file_system_type        *s_type;
        const struct super_operations        *s_op;
        const struct dquot_operations        *dq_op;
        const struct quotactl_ops        *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long                s_flags;
        unsigned long                s_iflags;        /* internal SB_I_* flags */
        unsigned long                s_magic;
        struct dentry                *s_root;
        struct rw_semaphore        s_umount;
        int                        s_count;
        atomic_t                s_active;
#ifdef CONFIG_SECURITY
        void                    *s_security;
#endif
        const struct xattr_handler * const *s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
        const struct fscrypt_operations        *s_cop;
        struct fscrypt_keyring        *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
        const struct fsverity_operations *s_vop;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
        struct unicode_map *s_encoding;
        __u16 s_encoding_flags;
#endif
        struct hlist_bl_head        s_roots;        /* alternate root dentries for NFS */
        struct mount                *s_mounts;        /* list of mounts; _not_ for fs use */
        struct block_device        *s_bdev;        /* can go away once we use an accessor for @s_bdev_file */
        struct file                *s_bdev_file;
        struct backing_dev_info *s_bdi;
        struct mtd_info                *s_mtd;
        struct hlist_node        s_instances;
        unsigned int                s_quota_types;        /* Bitmask of supported quota types */
        struct quota_info        s_dquot;        /* Diskquota specific options */

        /* Freeze state and the per-level freeze-protection semaphores */
        struct sb_writers        s_writers;

        /*
         * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
         * s_fsnotify_info together for cache efficiency. They are frequently
         * accessed and rarely modified.
         */
        void                        *s_fs_info;        /* Filesystem private info */

        /* Granularity of c/m/atime in ns (cannot be worse than a second) */
        u32                        s_time_gran;
        /* Time limits for c/m/atime in seconds */
        time64_t                   s_time_min;
        time64_t                   s_time_max;
#ifdef CONFIG_FSNOTIFY
        u32                        s_fsnotify_mask;
        struct fsnotify_sb_info        *s_fsnotify_info;
#endif

        /*
         * Q: Why are s_id and s_sysfs_name not the same? Both are
         * human-readable strings that identify the filesystem.
         * A: s_id is allowed to change at runtime. It is used in log
         * messages, and it needs to change when a filesystem starts out on a
         * single device (s_id is the device name) but another device is then
         * hot-added, so that we have to switch to identifying it by UUID.
         * s_sysfs_name, by contrast, is a handle for programmatic access and
         * cannot change at runtime.
         */
        char                        s_id[32];        /* Informational name */
        uuid_t                        s_uuid;                /* UUID */
        u8                        s_uuid_len;        /* Default 16, possibly smaller for weird filesystems */

        /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
        char                        s_sysfs_name[UUID_STRING_LEN + 1];

        unsigned int                s_max_links;
        unsigned int                s_d_flags;        /* default d_flags for dentries */

        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You have been warned.
         */
        struct mutex s_vfs_rename_mutex;        /* Kludge */

        /*
         * Filesystem subtype.  If non-empty the filesystem type field
         * in /proc/mounts will be "type.subtype"
         */
        const char *s_subtype;

        const struct dentry_operations *__s_d_op; /* default d_op for dentries */

        struct shrinker *s_shrink;        /* per-sb shrinker handle */

        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;

        /* Read-only state of the superblock is being changed */
        int s_readonly_remount;

        /* per-sb errseq_t for reporting writeback errors via syncfs */
        errseq_t s_wb_err;

        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
        struct hlist_head s_pins;

        /*
         * Owning user namespace and default context in which to
         * interpret filesystem uids, gids, quotas, device nodes,
         * xattrs and security labels.
         */
        struct user_namespace *s_user_ns;

        /*
         * The list_lru structure is essentially just a pointer to a table
         * of per-node lru lists, each of which has its own spinlock.
         * There is no need to put them into separate cachelines.
         */
        struct list_lru                s_dentry_lru;
        struct list_lru                s_inode_lru;
        struct rcu_head                rcu;
        struct work_struct        destroy_work;

        struct mutex                s_sync_lock;        /* sync serialisation lock */

        /*
         * Indicates how deep in a filesystem stack this SB is
         */
        int s_stack_depth;

        /* s_inode_list_lock protects s_inodes */
        spinlock_t                s_inode_list_lock ____cacheline_aligned_in_smp;
        struct list_head        s_inodes;        /* all inodes */

        spinlock_t                s_inode_wblist_lock;
        struct list_head        s_inodes_wb;        /* writeback inodes */
} __randomize_layout;

/* Return the s_user_ns of the superblock that @inode belongs to. */
static inline struct user_namespace *i_user_ns(const struct inode *inode)
{
        struct user_namespace *sb_userns = inode->i_sb->s_user_ns;

        return sb_userns;
}

/* Helper functions so that in most cases filesystems will
 * not need to deal directly with kuid_t and kgid_t and can
 * instead deal with the raw numeric values that are stored
 * in the filesystem.
 */
/* Convert @inode's i_uid to a raw uid_t relative to i_user_ns(@inode). */
static inline uid_t i_uid_read(const struct inode *inode)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        return from_kuid(fs_userns, inode->i_uid);
}

/* Convert @inode's i_gid to a raw gid_t relative to i_user_ns(@inode). */
static inline gid_t i_gid_read(const struct inode *inode)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        return from_kgid(fs_userns, inode->i_gid);
}

/* Store raw @uid into @inode's i_uid as a kuid made in i_user_ns(@inode). */
static inline void i_uid_write(struct inode *inode, uid_t uid)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_uid = make_kuid(fs_userns, uid);
}

/* Store raw @gid into @inode's i_gid as a kgid made in i_user_ns(@inode). */
static inline void i_gid_write(struct inode *inode, gid_t gid)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_gid = make_kgid(fs_userns, gid);
}

/**
 * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode whose i_uid is mapped
 *
 * Return: the inode's i_uid mapped down according to @idmap.
 * If the inode's i_uid has no mapping, INVALID_VFSUID is returned.
 */
static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap,
                                         const struct inode *inode)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        return make_vfsuid(idmap, fs_userns, inode->i_uid);
}

/**
 * i_uid_needs_update - check whether inode's i_uid needs to be updated
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Check whether the $inode's i_uid field needs to be updated taking idmapped
 * mounts into account if the filesystem supports it.
 *
 * Return: true if @inode's i_uid field needs to be updated, false if not.
 */
static inline bool i_uid_needs_update(struct mnt_idmap *idmap,
                                      const struct iattr *attr,
                                      const struct inode *inode)
{
        return ((attr->ia_valid & ATTR_UID) &&
                !vfsuid_eq(attr->ia_vfsuid,
                           i_uid_into_vfsuid(idmap, inode)));
}

/**
 * i_uid_update - update @inode's i_uid field
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * If @attr carries ATTR_UID, translate its vfsuid (from a possibly idmapped
 * mount) into the filesystem kuid and store it in @inode's i_uid. Otherwise
 * @inode is left untouched.
 */
static inline void i_uid_update(struct mnt_idmap *idmap,
                                const struct iattr *attr,
                                struct inode *inode)
{
        if (!(attr->ia_valid & ATTR_UID))
                return;

        inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
}

/**
 * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode whose i_gid is mapped
 *
 * Return: the inode's i_gid mapped down according to @idmap.
 * If the inode's i_gid has no mapping, INVALID_VFSGID is returned.
 */
static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap,
                                         const struct inode *inode)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        return make_vfsgid(idmap, fs_userns, inode->i_gid);
}

/**
 * i_gid_needs_update - check whether inode's i_gid needs to be updated
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Check whether the $inode's i_gid field needs to be updated taking idmapped
 * mounts into account if the filesystem supports it.
 *
 * Return: true if @inode's i_gid field needs to be updated, false if not.
 */
static inline bool i_gid_needs_update(struct mnt_idmap *idmap,
                                      const struct iattr *attr,
                                      const struct inode *inode)
{
        return ((attr->ia_valid & ATTR_GID) &&
                !vfsgid_eq(attr->ia_vfsgid,
                           i_gid_into_vfsgid(idmap, inode)));
}

/**
 * i_gid_update - update @inode's i_gid field
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * If @attr carries ATTR_GID, translate its vfsgid (from a possibly idmapped
 * mount) into the filesystem kgid and store it in @inode's i_gid. Otherwise
 * @inode is left untouched.
 */
static inline void i_gid_update(struct mnt_idmap *idmap,
                                const struct iattr *attr,
                                struct inode *inode)
{
        if (!(attr->ia_valid & ATTR_GID))
                return;

        inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
}

/**
 * inode_fsuid_set - initialize inode's i_uid field with the caller's fsuid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Set @inode's i_uid to the result of mapped_fsuid() for @idmap and the
 * inode's superblock user namespace. If the inode was found/created via an
 * idmapped mount, the caller's fsuid is mapped according to @idmap.
 */
static inline void inode_fsuid_set(struct inode *inode,
                                   struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_uid = mapped_fsuid(idmap, fs_userns);
}

/**
 * inode_fsgid_set - initialize inode's i_gid field with the caller's fsgid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Set @inode's i_gid to the result of mapped_fsgid() for @idmap and the
 * inode's superblock user namespace. If the inode was found/created via an
 * idmapped mount, the caller's fsgid is mapped according to @idmap.
 */
static inline void inode_fsgid_set(struct inode *inode,
                                   struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_gid = mapped_fsgid(idmap, fs_userns);
}

/**
 * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped
 * @sb: the superblock we want a mapping in
 * @idmap: idmap of the relevant mount
 *
 * Check whether the caller's fsuid and fsgid have a valid mapping in the
 * s_user_ns of the superblock @sb. If the caller is on an idmapped mount,
 * the fsuid and fsgid are first mapped according to @idmap.
 *
 * Return: true if both fsuid and fsgid are mapped, false if not.
 */
static inline bool fsuidgid_has_mapping(struct super_block *sb,
                                        struct mnt_idmap *idmap)
{
        struct user_namespace *sb_userns = sb->s_user_ns;
        kuid_t fsuid = mapped_fsuid(idmap, sb_userns);
        kgid_t fsgid;

        if (!uid_valid(fsuid))
                return false;

        /* Only look at the gid once the uid is known to be valid. */
        fsgid = mapped_fsgid(idmap, sb_userns);
        if (!gid_valid(fsgid))
                return false;

        if (!kuid_has_mapping(sb_userns, fsuid))
                return false;

        return kgid_has_mapping(sb_userns, fsgid);
}

/*
 * Out-of-line timestamp helpers; only the prototypes live here.
 * NOTE(review): exact semantics are not visible from this header —
 * presumably each returns the timestamp it computed/stored; confirm against
 * the definitions.
 */
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);
struct timespec64 inode_set_ctime_deleg(struct inode *inode,
                                        struct timespec64 update);

/* Return the seconds part of @inode's access time. */
static inline time64_t inode_get_atime_sec(const struct inode *inode)
{
        time64_t sec = inode->i_atime_sec;

        return sec;
}

/* Return the nanoseconds part of @inode's access time. */
static inline long inode_get_atime_nsec(const struct inode *inode)
{
        long nsec = inode->i_atime_nsec;

        return nsec;
}

static inline struct timespec64 inode_get_atime(const struct inode *inode)
{
        struct timespec64 ts = { .tv_sec  = inode_get_atime_sec(inode),
                                 .tv_nsec = inode_get_atime_nsec(inode) };

        return ts;
}

static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->i_atime_sec = ts.tv_sec;
        inode->i_atime_nsec = ts.tv_nsec;
        return ts;
}

/* Set @inode's access time to { @sec, @nsec }; returns that timestamp. */
static inline struct timespec64 inode_set_atime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_atime_to_ts(inode, (struct timespec64) {
                .tv_sec  = sec,
                .tv_nsec = nsec,
        });
}

/* Return the seconds part of @inode's modification time. */
static inline time64_t inode_get_mtime_sec(const struct inode *inode)
{
        time64_t sec = inode->i_mtime_sec;

        return sec;
}

/* Return the nanoseconds part of @inode's modification time. */
static inline long inode_get_mtime_nsec(const struct inode *inode)
{
        long nsec = inode->i_mtime_nsec;

        return nsec;
}

static inline struct timespec64 inode_get_mtime(const struct inode *inode)
{
        struct timespec64 ts = { .tv_sec  = inode_get_mtime_sec(inode),
                                 .tv_nsec = inode_get_mtime_nsec(inode) };
        return ts;
}

static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->i_mtime_sec = ts.tv_sec;
        inode->i_mtime_nsec = ts.tv_nsec;
        return ts;
}

/* Set @inode's modification time to { @sec, @nsec }; returns that timestamp. */
static inline struct timespec64 inode_set_mtime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_mtime_to_ts(inode, (struct timespec64) {
                .tv_sec  = sec,
                .tv_nsec = nsec,
        });
}

/*
 * Multigrain timestamps
 *
 * Conditionally use fine-grained ctime and mtime timestamps when there
 * are users actively observing them via getattr. The primary use-case
 * for this is NFS clients that use the ctime to distinguish between
 * different states of the file, and that are often fooled by multiple
 * operations that occur in the same coarse-grained timer tick.
 *
 * I_CTIME_QUERIED is bit 31 of the inode's i_ctime_nsec field;
 * inode_get_ctime_nsec() masks it out before returning the nanoseconds.
 */
#define I_CTIME_QUERIED                ((u32)BIT(31))

/* Return the seconds part of @inode's change time. */
static inline time64_t inode_get_ctime_sec(const struct inode *inode)
{
        time64_t sec = inode->i_ctime_sec;

        return sec;
}

/*
 * Return the nanoseconds part of @inode's change time, with the
 * I_CTIME_QUERIED flag bit cleared from the stored value.
 */
static inline long inode_get_ctime_nsec(const struct inode *inode)
{
        long nsec = inode->i_ctime_nsec & ~I_CTIME_QUERIED;

        return nsec;
}

static inline struct timespec64 inode_get_ctime(const struct inode *inode)
{
        struct timespec64 ts = { .tv_sec  = inode_get_ctime_sec(inode),
                                 .tv_nsec = inode_get_ctime_nsec(inode) };

        return ts;
}

/*
 * Unlike the atime/mtime setters, the ctime setter is not an inline here;
 * only its prototype is visible. inode_set_ctime() is a thin wrapper
 * around it.
 */
struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts);

/**
 * inode_set_ctime - set the ctime in the inode
 * @inode: inode in which to set the ctime
 * @sec: tv_sec value to set
 * @nsec: tv_nsec value to set
 *
 * Set the ctime in @inode to { @sec, @nsec } by passing that timestamp to
 * inode_set_ctime_to_ts().
 *
 * Return: whatever inode_set_ctime_to_ts() returns.
 */
static inline struct timespec64 inode_set_ctime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_ctime_to_ts(inode, (struct timespec64) {
                .tv_sec  = sec,
                .tv_nsec = nsec,
        });
}

/*
 * NOTE(review): defined elsewhere; presumably initializes the inode's
 * timestamps and returns the value used — confirm against the definition.
 */
struct timespec64 simple_inode_init_ts(struct inode *inode);

/*
 * Snapshotting support.
 */

/*
 * These are internal functions, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */
/* Drop the read side of the freeze semaphore for 1-based freeze @level. */
static inline void __sb_end_write(struct super_block *sb, int level)
{
        percpu_up_read(&sb->s_writers.rw_sem[level - 1]);
}

/* Take the read side of the freeze semaphore for 1-based freeze @level. */
static inline void __sb_start_write(struct super_block *sb, int level)
{
        percpu_down_read_freezable(&sb->s_writers.rw_sem[level - 1], true);
}

/*
 * Try to take the read side of the freeze semaphore for 1-based freeze
 * @level; returns the result of percpu_down_read_trylock().
 */
static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
{
        return percpu_down_read_trylock(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Wrappers around percpu_rwsem_acquire()/percpu_rwsem_release() for the
 * semaphore belonging to 1-based freeze level @lev of @sb.
 */
#define __sb_writers_acquired(sb, lev)        \
        percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_writers_release(sb, lev)        \
        percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)

/**
 * __sb_write_started - check if sb freeze level is held
 * @sb: the super we write to
 * @level: the freeze level (1-based index into s_writers.rw_sem[])
 *
 * Return:
 * * > 0 - sb freeze level is held
 * *   0 - sb freeze level is not held
 * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
 */
static inline int __sb_write_started(const struct super_block *sb, int level)
{
        return lockdep_is_held_type(&sb->s_writers.rw_sem[level - 1], 1);
}

/**
 * sb_write_started - check if SB_FREEZE_WRITE is held
 * @sb: the super we write to
 *
 * Any non-zero result of __sb_write_started() counts as "held", so this
 * may be a false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 */
static inline bool sb_write_started(const struct super_block *sb)
{
        return __sb_write_started(sb, SB_FREEZE_WRITE) != 0;
}

/**
 * sb_write_not_started - check if SB_FREEZE_WRITE is not held
 * @sb: the super we write to
 *
 * Any result of __sb_write_started() that is not strictly positive counts
 * as "not held", so this may be a false positive with
 * !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 */
static inline bool sb_write_not_started(const struct super_block *sb)
{
        return !(__sb_write_started(sb, SB_FREEZE_WRITE) > 0);
}

/**
 * file_write_started - check if SB_FREEZE_WRITE is held
 * @file: the file we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 * May be false positive with !S_ISREG, because file_start_write() has
 * no effect on !S_ISREG.
 */
static inline bool file_write_started(const struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_write_started(file_inode(file)->i_sb);
}

/**
 * file_write_not_started - check if SB_FREEZE_WRITE is not held
 * @file: the file we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 * May be false positive with !S_ISREG, because file_start_write() has
 * no effect on !S_ISREG.
 */
static inline bool file_write_not_started(const struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_write_not_started(file_inode(file)->i_sb);
}

/**
 * sb_end_write - drop write access to a superblock
 * @sb: the super we wrote to
 *
 * Decrement the number of writers to the filesystem and wake up possible
 * waiters wanting to freeze the filesystem. Counterpart of sb_start_write().
 */
static inline void sb_end_write(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        __sb_end_write(sb, level);
}

/**
 * sb_end_pagefault - drop write access to a superblock from a page fault
 * @sb: the super we wrote to
 *
 * Decrement the number of processes handling a write page fault to the
 * filesystem and wake up possible waiters wanting to freeze the filesystem.
 * Counterpart of sb_start_pagefault().
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
        const int level = SB_FREEZE_PAGEFAULT;

        __sb_end_write(sb, level);
}

/**
 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
 * @sb: the super we wrote to
 *
 * Decrement the fs-internal number of writers to the filesystem and wake up
 * possible waiters wanting to freeze the filesystem. Counterpart of
 * sb_start_intwrite().
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
        const int level = SB_FREEZE_FS;

        __sb_end_write(sb, level);
}

/**
 * sb_start_write - get write access to a superblock
 * @sb: the super we write to
 *
 * A process that wants to write data or metadata to a file system (i.e.
 * dirty a page or an inode) should wrap the operation in a
 * sb_start_write() - sb_end_write() pair to get exclusion against file
 * system freezing. This function increments the number of writers, which
 * prevents freezing. If the file system is already frozen, the function
 * waits until the file system is thawed.
 *
 * Since freeze protection behaves as a lock, users have to preserve the
 * ordering of freeze protection and other filesystem locks. Generally,
 * freeze protection should be the outermost lock. In particular, we have:
 *
 * sb_start_write
 *   -> i_rwsem         (write path, truncate, directory ops, ...)
 *   -> s_umount        (freeze_super, thaw_super)
 */
static inline void sb_start_write(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        __sb_start_write(sb, level);
}

/*
 * Non-blocking variant of sb_start_write(): returns the result of the
 * trylock on the SB_FREEZE_WRITE level.
 */
static inline bool sb_start_write_trylock(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        return __sb_start_write_trylock(sb, level);
}

/**
 * sb_start_pagefault - get write access to a superblock from a page fault
 * @sb: the super we write to
 *
 * A process that starts handling a write page fault should wrap the
 * operation in a sb_start_pagefault() - sb_end_pagefault() pair to get
 * exclusion against file system freezing. This is needed since the page
 * fault is going to dirty a page. This function increments the number of
 * running page faults, which prevents freezing. If the file system is
 * already frozen, the function waits until the file system is thawed.
 *
 * Since page fault freeze protection behaves as a lock, users have to
 * preserve the ordering of freeze protection and other filesystem locks.
 * It is advised to put sb_start_pagefault() close to mmap_lock in lock
 * ordering. Page fault handling code implies the lock dependency:
 *
 * mmap_lock
 *   -> sb_start_pagefault
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
        const int level = SB_FREEZE_PAGEFAULT;

        __sb_start_write(sb, level);
}

/**
 * sb_start_intwrite - get write access to a superblock for internal fs purposes
 * @sb: the super we write to
 *
 * This is the third level of protection against filesystem freezing. It is
 * free for use by a filesystem; the only requirement is that it must rank
 * below sb_start_pagefault.
 *
 * For example, a filesystem can call sb_start_intwrite() when starting a
 * transaction, which somewhat eases handling of freezing for internal
 * sources of filesystem changes (internal fs threads, discarding
 * preallocation on file close, etc.).
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
        const int level = SB_FREEZE_FS;

        __sb_start_write(sb, level);
}

/*
 * Non-blocking variant of sb_start_intwrite(): returns the result of the
 * trylock on the SB_FREEZE_FS level.
 */
static inline bool sb_start_intwrite_trylock(struct super_block *sb)
{
        const int level = SB_FREEZE_FS;

        return __sb_start_write_trylock(sb, level);
}

/*
 * NOTE(review): defined elsewhere; judging by the name this checks whether
 * the caller owns @inode (as seen through @idmap) or has an overriding
 * capability — confirm against the definition.
 */
bool inode_owner_or_capable(struct mnt_idmap *idmap,
                            const struct inode *inode);

/*
 * VFS helper functions.
 *
 * The prototypes omit parameter names. vfs_whiteout() shows the order for
 * vfs_mknod(): idmap, directory inode, dentry, mode, device number.
 * NOTE(review): the other helpers presumably follow the same
 * (idmap, directory inode, dentry, ...) pattern — confirm against the
 * definitions.
 */
int vfs_create(struct mnt_idmap *, struct inode *,
               struct dentry *, umode_t, bool);
struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
                         struct dentry *, umode_t);
int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
              umode_t, dev_t);
int vfs_symlink(struct mnt_idmap *, struct inode *,
                struct dentry *, const char *);
int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
             struct dentry *, struct inode **);
int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
               struct inode **);

/**
 * struct renamedata - contains all information required for renaming
 * @mnt_idmap:         idmap of the mount in which the rename is happening.
 * @old_parent:        parent of source
 * @old_dentry:        source
 * @new_parent:        parent of destination
 * @new_dentry:        destination
 * @delegated_inode:   returns an inode needing a delegation break
 * @flags:             rename flags
 */
struct renamedata {
        struct mnt_idmap *mnt_idmap;
        struct dentry *old_parent;
        struct dentry *old_dentry;
        struct dentry *new_parent;
        struct dentry *new_dentry;
        struct inode **delegated_inode;
        unsigned int flags;
} __randomize_layout;

/* Takes all of its arguments bundled in a struct renamedata. */
int vfs_rename(struct renamedata *);

static inline int vfs_whiteout(struct mnt_idmap *idmap,
                               struct inode *dir, struct dentry *dentry)
{
        return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
                         WHITEOUT_DEV);
}

/*
 * Helpers for opening files from kernel code with explicit credentials.
 * NOTE(review): return convention (presumably a struct file * or an
 * ERR_PTR-encoded error) is not visible here — confirm.
 */
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
                                 const struct path *parentpath,
                                 umode_t mode, int open_flag,
                                 const struct cred *cred);
struct file *kernel_file_open(const struct path *path, int flags,
                              const struct cred *cred);

/*
 * Takes a dentry, a mode, a callback @f with the signature
 * int (*)(struct dentry *, umode_t, void *), and an opaque pointer.
 * NOTE(review): presumably the opaque pointer is handed to @f — confirm.
 */
int vfs_mkobj(struct dentry *, umode_t,
                int (*f)(struct dentry *, umode_t, void *),
                void *);

/* File/path based attribute helpers taking raw uid/gid/mode/time values. */
int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);

/*
 * compat_ptr_ioctl has the same signature as the ->compat_ioctl member of
 * struct file_operations. Without CONFIG_COMPAT the name expands to NULL,
 * so it can still be used as a function-pointer initializer without an
 * #ifdef at the use site.
 */
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
                                        unsigned long arg);
#else
#define compat_ptr_ioctl NULL
#endif

/*
 * VFS file helper functions.
 *
 * NOTE(review): only prototypes are visible here; the behaviour suggested
 * by the names (ownership initialisation, setgid stripping, group
 * membership/capability checks) should be confirmed against the
 * definitions.
 */
void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
                      const struct inode *dir, umode_t mode);
extern bool may_open_dev(const struct path *path);
umode_t mode_strip_sgid(struct mnt_idmap *idmap,
                        const struct inode *dir, umode_t mode);
bool in_group_or_capable(struct mnt_idmap *idmap,
                         const struct inode *inode, vfsgid_t vfsgid);

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 * Return 'true' to keep going and 'false' if there are no more entries.
 */
struct dir_context;
typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
                         unsigned);

/*
 * Context passed to the ->iterate_shared() file operation: the filldir
 * callback (@actor), a position (@pos) and a buffer-space hint (@count).
 */
struct dir_context {
        filldir_t actor;
        loff_t pos;
        /*
         * Filesystems MUST NOT MODIFY count, but may use as a hint:
         * 0        unknown
         * > 0      space in buffer (assume at least one entry)
         * INT_MAX  unlimited
         */
        int count;
};

/*
 * If OR-ed with d_type, pending signals are not checked.
 * NOTE(review): d_type is presumably the last (unsigned) argument of
 * filldir_t — confirm.
 */
#define FILLDIR_FLAG_NOINTR        0x1000

/*
 * These flags let !MMU mmap() govern direct device mapping vs immediate
 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
 *
 * NOMMU_MAP_COPY:        Copy can be mapped (MAP_PRIVATE)
 * NOMMU_MAP_DIRECT:        Can be mapped directly (MAP_SHARED)
 * NOMMU_MAP_READ:        Can be mapped for reading
 * NOMMU_MAP_WRITE:        Can be mapped for writing
 * NOMMU_MAP_EXEC:        Can be mapped for execution
 *
 * The READ/WRITE/EXEC capabilities are aliases of the VM_MAY* flags;
 * NOMMU_VMFLAGS is the mask of exactly those three.
 */
#define NOMMU_MAP_COPY                0x00000001
#define NOMMU_MAP_DIRECT        0x00000008
#define NOMMU_MAP_READ                VM_MAYREAD
#define NOMMU_MAP_WRITE                VM_MAYWRITE
#define NOMMU_MAP_EXEC                VM_MAYEXEC

#define NOMMU_VMFLAGS \
        (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)

/*
 * These flags control the behavior of the remap_file_range function pointer
 * (the ->remap_file_range member of struct file_operations, via its
 * remap_flags argument).
 * If it is called with len == 0 that means "remap to end of source file".
 * See Documentation/filesystems/vfs.rst for more details about this call.
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 */
#define REMAP_FILE_DEDUP                (1 << 0)
#define REMAP_FILE_CAN_SHORTEN                (1 << 1)

/*
 * These flags signal that the caller is ok with altering various aspects of
 * the behavior of the remap operation.  The changes must be made by the
 * implementation; the vfs remap helper functions can take advantage of them.
 * Flags in this category exist to preserve the quirky behavior of the hoisted
 * btrfs clone/dedupe ioctls.
 *
 * Currently REMAP_FILE_CAN_SHORTEN is the only advisory flag.
 */
#define REMAP_FILE_ADVISORY                (REMAP_FILE_CAN_SHORTEN)

/*
 * These flags control the behavior of vfs_copy_file_range().
 * They are kernel-internal and not available to the user via syscall.
 *
 * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops
 */
#define COPY_FILE_SPLICE                (1 << 0)

/*
 * Forward declarations. iov_iter and io_uring_cmd are only used through
 * pointers in struct file_operations below; offset_ctx is not referenced
 * in this part of the file.
 */
struct iov_iter;
struct io_uring_cmd;
struct offset_ctx;

/*
 * Type of file_operations.fop_flags. Because it is annotated __bitwise,
 * the FOP_* constants are defined with __force casts.
 */
typedef unsigned int __bitwise fop_flags_t;

/*
 * Table of operations for an open file: a module owner, a set of FOP_*
 * capability bits in @fop_flags, and one function pointer per operation.
 * The layout is randomized (__randomize_layout), so instances must be
 * initialized with designated initializers.
 */
struct file_operations {
        struct module *owner;
        fop_flags_t fop_flags;
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
                        unsigned int flags);
        /* Directory iteration; receives the struct dir_context to fill */
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
        int (*fsync) (struct file *, loff_t, loff_t, int datasync);
        int (*fasync) (int, struct file *, int);
        int (*lock) (struct file *, int, struct file_lock *);
        unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
        int (*check_flags)(int);
        int (*flock) (struct file *, int, struct file_lock *);
        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
        void (*splice_eof)(struct file *file);
        int (*setlease)(struct file *, int, struct file_lease **, void **);
        long (*fallocate)(struct file *file, int mode, loff_t offset,
                          loff_t len);
        void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
        /* Only present on !MMU builds */
        unsigned (*mmap_capabilities)(struct file *);
#endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
        /* @remap_flags takes the REMAP_FILE_* bits */
        loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
        int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
        int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
                                unsigned int poll_flags);
        int (*mmap_prepare)(struct vm_area_desc *);
} __randomize_layout;

/*
 * Bits for file_operations.fop_flags. Each is a distinct single bit cast
 * to fop_flags_t.
 */
/* Supports async buffered reads */
#define FOP_BUFFER_RASYNC        ((__force fop_flags_t)(1 << 0))
/* Supports async buffered writes */
#define FOP_BUFFER_WASYNC        ((__force fop_flags_t)(1 << 1))
/* Supports synchronous page faults for mappings */
#define FOP_MMAP_SYNC                ((__force fop_flags_t)(1 << 2))
/* Supports non-exclusive O_DIRECT writes from multiple threads */
#define FOP_DIO_PARALLEL_WRITE        ((__force fop_flags_t)(1 << 3))
/* Contains huge pages */
#define FOP_HUGE_PAGES                ((__force fop_flags_t)(1 << 4))
/* Treat loff_t as unsigned (e.g., /dev/mem) */
#define FOP_UNSIGNED_OFFSET        ((__force fop_flags_t)(1 << 5))
/* Supports asynchronous lock callbacks */
#define FOP_ASYNC_LOCK                ((__force fop_flags_t)(1 << 6))
/* File system supports uncached read/write buffered IO */
#define FOP_DONTCACHE                ((__force fop_flags_t)(1 << 7))

/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
                            int (*) (struct file *, struct dir_context *));
/*
 * Emit a static function named shared_<x> that forwards its own (file, ctx)
 * arguments, together with @x, to wrap_directory_iterator().
 */
#define WRAP_DIR_ITER(x)                                                        \
        static int shared_##x(struct file *file, struct dir_context *ctx)        \
        {                                                                        \
                return wrap_directory_iterator(file, ctx, x);                        \
        }

/*
 * Table of inode-level operation callbacks.  Every member is a function
 * pointer, and the structure is declared ____cacheline_aligned.  Which hooks
 * a filesystem must provide is not visible from this definition.
 */
struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
        int (*permission) (struct mnt_idmap *, struct inode *, int);
        struct posix_acl * (*get_inode_acl)(struct inode *, int, bool);

        int (*readlink) (struct dentry *, char __user *,int);

        int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,
                       umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,
                        const char *);
        /* Unlike its neighbours, mkdir returns a dentry pointer rather than int. */
        struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *,
                                 struct dentry *, umode_t);
        int (*rmdir) (struct inode *,struct dentry *);
        int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,
                      umode_t,dev_t);
        int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *);
        int (*getattr) (struct mnt_idmap *, const struct path *,
                        struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
        int (*update_time)(struct inode *, int);
        int (*atomic_open)(struct inode *, struct dentry *,
                           struct file *, unsigned open_flag,
                           umode_t create_mode);
        int (*tmpfile) (struct mnt_idmap *, struct inode *,
                        struct file *, umode_t);
        struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *,
                                     int);
        int (*set_acl)(struct mnt_idmap *, struct dentry *,
                       struct posix_acl *, int);
        int (*fileattr_set)(struct mnt_idmap *idmap,
                            struct dentry *dentry, struct file_kattr *fa);
        int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa);
        struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
} ____cacheline_aligned;

/* Did the driver provide valid mmap hook configuration? */
static inline bool can_mmap_file(struct file *file)
{
        bool has_mmap = file->f_op->mmap;
        bool has_mmap_prepare = file->f_op->mmap_prepare;

        /* Hooks are mutually exclusive. */
        if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
                return false;
        if (!has_mmap && !has_mmap_prepare)
                return false;

        return true;
}

int __compat_vma_mmap_prepare(const struct file_operations *f_op,
                struct file *file, struct vm_area_struct *vma);
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma);

static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (file->f_op->mmap_prepare)
                return compat_vma_mmap_prepare(file, vma);

        return file->f_op->mmap(file, vma);
}

/*
 * Invoke the ->mmap_prepare hook of @file on @desc and return its result.
 * The hook pointer is called without a NULL check here.
 */
static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
{
        return file->f_op->mmap_prepare(desc);
}

extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write);
int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    loff_t *len, unsigned int remap_flags,
                                    const struct iomap_ops *dax_read_ops);
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                  struct file *file_out, loff_t pos_out,
                                  loff_t *count, unsigned int remap_flags);
extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
                                        struct file *dst_file, loff_t dst_pos,
                                        loff_t len, unsigned int remap_flags);

/**
 * enum freeze_holder - holder of the freeze
 * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
 * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
 * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
 * @FREEZE_EXCL: a freeze that can only be undone by the owner
 *
 * Indicate who the owner of the freeze or thaw request is and whether
 * the freeze needs to be exclusive or can nest.
 * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
 * same holder aren't allowed. It is however allowed to hold a single
 * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
 * the same time. This is relied upon by some filesystems during online
 * repair or similar.
 */
enum freeze_holder {
        FREEZE_HOLDER_KERNEL    = 0x1U,
        FREEZE_HOLDER_USERSPACE = 0x2U,
        FREEZE_MAY_NEST         = 0x4U,
        FREEZE_EXCL             = 0x8U,
};

/*
 * Table of superblock-level operation callbacks.  Every member is a function
 * pointer; the quota hooks exist only when CONFIG_QUOTA is enabled.
 */
struct super_operations {
           struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
        void (*free_inode)(struct inode *);

           void (*dirty_inode) (struct inode *, int flags);
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        /* The *_super hooks additionally receive the freeze holder and owner. */
        int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner);
        int (*freeze_fs) (struct super_block *);
        int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner);
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*umount_begin) (struct super_block *);

        int (*show_options)(struct seq_file *, struct dentry *);
        int (*show_devname)(struct seq_file *, struct dentry *);
        int (*show_path)(struct seq_file *, struct dentry *);
        int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
        struct dquot __rcu **(*get_dquots)(struct inode *);
#endif
        long (*nr_cached_objects)(struct super_block *,
                                  struct shrink_control *);
        long (*free_cached_objects)(struct super_block *,
                                    struct shrink_control *);
        /*
         * If a filesystem can support graceful removal of a device and
         * continue read-write operations, implement this callback.
         *
         * Return 0 if the filesystem can continue read-write.
         * Non-zero return value or no such callback means the fs will be shutdown
         * as usual.
         */
        int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
        void (*shutdown)(struct super_block *sb);
};

/*
 * Inode flags - they have no relation to superblock flags now
 *
 * Each flag is a single bit.  Bit 18 is not assigned in this list, and S_DAX
 * collapses to 0 when CONFIG_FS_DAX is disabled.
 */
#define S_SYNC                (1 << 0)  /* Writes are synced at once */
#define S_NOATIME        (1 << 1)  /* Do not update access times */
#define S_APPEND        (1 << 2)  /* Append-only file */
#define S_IMMUTABLE        (1 << 3)  /* Immutable file */
#define S_DEAD                (1 << 4)  /* removed, but still open directory */
#define S_NOQUOTA        (1 << 5)  /* Inode is not counted to quota */
#define S_DIRSYNC        (1 << 6)  /* Directory modifications are synchronous */
#define S_NOCMTIME        (1 << 7)  /* Do not update file c/mtime */
#define S_SWAPFILE        (1 << 8)  /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE        (1 << 9)  /* Inode is fs-internal */
#define S_IMA                (1 << 10) /* Inode has an associated IMA struct */
#define S_AUTOMOUNT        (1 << 11) /* Automount/referral quasi-directory */
#define S_NOSEC                (1 << 12) /* no suid or xattr security attributes */
#ifdef CONFIG_FS_DAX
#define S_DAX                (1 << 13) /* Direct Access, avoiding the page cache */
#else
#define S_DAX                0          /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED        (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD        (1 << 15) /* Casefolded file */
#define S_VERITY        (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE        (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
#define S_ANON_INODE        (1 << 19) /* Inode is an anonymous inode */

/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: SB_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystem's flags with it mounted
 * with files in use.  This means that all of the inodes will not have their
 * i_flags updated.  Hence, i_flags no longer inherit the superblock mount
 * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
 */
/*
 * Mask the s_flags of the inode's superblock with @flg.  The result is the raw
 * masked value (non-zero if any requested bit is set), not normalised to 0/1.
 */
#define __IS_FLG(inode, flg)        ((inode)->i_sb->s_flags & (flg))

/* Report whether SB_RDONLY is set in the superblock's s_flags. */
static inline bool sb_rdonly(const struct super_block *sb)
{
        return (sb->s_flags & SB_RDONLY) != 0;
}
/* Read-only test on the inode's superblock, via sb_rdonly(). */
#define IS_RDONLY(inode)        sb_rdonly((inode)->i_sb)
/* Set if the superblock has SB_SYNCHRONOUS or the inode itself has S_SYNC. */
#define IS_SYNC(inode)                (__IS_FLG(inode, SB_SYNCHRONOUS) || \
                                        ((inode)->i_flags & S_SYNC))
/*
 * Set if the superblock has SB_SYNCHRONOUS or SB_DIRSYNC, or the inode has
 * S_SYNC or S_DIRSYNC.
 */
#define IS_DIRSYNC(inode)        (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \
                                        ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)        __IS_FLG(inode, SB_MANDLOCK)
/*
 * Superblock-level test only (SB_RDONLY or SB_NOATIME); the per-inode
 * S_NOATIME bit is not examined by this macro.
 */
#define IS_NOATIME(inode)        __IS_FLG(inode, SB_RDONLY|SB_NOATIME)
#define IS_I_VERSION(inode)        __IS_FLG(inode, SB_I_VERSION)

/*
 * The tests below mask inode->i_flags with one S_* bit and yield the raw
 * masked value, except where a config option replaces them with a constant.
 */
#define IS_NOQUOTA(inode)        ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)        ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)        ((inode)->i_flags & S_IMMUTABLE)

/* POSIX ACL support is a superblock flag; constant 0 without CONFIG_FS_POSIX_ACL. */
#ifdef CONFIG_FS_POSIX_ACL
#define IS_POSIXACL(inode)        __IS_FLG(inode, SB_POSIXACL)
#else
#define IS_POSIXACL(inode)        0
#endif

#define IS_DEADDIR(inode)        ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)        ((inode)->i_flags & S_NOCMTIME)

/* Without CONFIG_SWAP the argument is still evaluated, then discarded, giving 0U. */
#ifdef CONFIG_SWAP
#define IS_SWAPFILE(inode)        ((inode)->i_flags & S_SWAPFILE)
#else
#define IS_SWAPFILE(inode)        ((void)(inode), 0U)
#endif

#define IS_PRIVATE(inode)        ((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode)                ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode)        ((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
/* S_DAX is defined as 0 when CONFIG_FS_DAX is off, so this is then always 0. */
#define IS_DAX(inode)                ((inode)->i_flags & S_DAX)
#define IS_ENCRYPTED(inode)        ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode)        ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode)        ((inode)->i_flags & S_VERITY)

/*
 * True for a character-device inode whose device number equals WHITEOUT_DEV.
 * Every use of the argument is parenthesised so that an expression such as
 * "base + i" can be passed safely.
 */
#define IS_WHITEOUT(inode)        (S_ISCHR((inode)->i_mode) && \
                                 (inode)->i_rdev == WHITEOUT_DEV)
#define IS_ANON_FILE(inode)        ((inode)->i_flags & S_ANON_INODE)

/*
 * Report whether the inode's owner or group fails to translate through @idmap:
 * true when either the vfsuid or the vfsgid derived from @inode is invalid.
 */
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
                                   struct inode *inode)
{
        if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)))
                return true;

        return !vfsgid_valid(i_gid_into_vfsgid(idmap, inode));
}

/*
 * Initialise @kiocb for @filp.  The whole structure is overwritten by a
 * compound literal, so members not listed below end up zero-initialised.
 * ki_flags is copied from filp->f_iocb_flags and ki_ioprio comes from
 * get_current_ioprio().
 */
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = filp->f_iocb_flags,
                .ki_ioprio = get_current_ioprio(),
        };
}

/*
 * Initialise @kiocb as a copy of @kiocb_src bound to a different file.
 * Only ki_flags, ki_ioprio and ki_pos are taken from @kiocb_src; ki_filp is
 * set to @filp and every other member is zero-initialised by the compound
 * literal.
 */
static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
                               struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = kiocb_src->ki_flags,
                .ki_ioprio = kiocb_src->ki_ioprio,
                .ki_pos = kiocb_src->ki_pos,
        };
}

extern void __mark_inode_dirty(struct inode *, int);
/* Mark @inode dirty by passing I_DIRTY to __mark_inode_dirty(). */
static inline void mark_inode_dirty(struct inode *inode)
{
        __mark_inode_dirty(inode, I_DIRTY);
}

/* Mark @inode dirty by passing I_DIRTY_SYNC to __mark_inode_dirty(). */
static inline void mark_inode_dirty_sync(struct inode *inode)
{
        __mark_inode_dirty(inode, I_DIRTY_SYNC);
}

/* Return the current value of inode->i_count, read with atomic_read(). */
static inline int icount_read(const struct inode *inode)
{
        return atomic_read(&inode->i_count);
}

/*
 * Returns true if the given inode itself only has dirty timestamps (its pages
 * may still be dirty) and isn't currently being allocated or freed.
 * Filesystems should call this if, when writing an inode with lazytime
 * enabled, they want to opportunistically write the timestamps of other inodes
 * located very nearby on-disk, e.g. in the same inode block.  This returns true
 * if the given inode is in need of such an opportunistic update.  Requires
 * i_lock, or at least later re-checking under i_lock.
 */
static inline bool inode_is_dirtytime_only(struct inode *inode)
{
        /*
         * i_state is read once: I_DIRTY_TIME must be set while I_NEW,
         * I_FREEING and I_WILL_FREE are all clear.
         */
        return (inode->i_state & (I_DIRTY_TIME | I_NEW |
                                  I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
}

extern void inc_nlink(struct inode *inode);
extern void drop_nlink(struct inode *inode);
extern void clear_nlink(struct inode *inode);
extern void set_nlink(struct inode *inode, unsigned int nlink);

/* Raise the link count with inc_nlink(), then mark @inode dirty. */
static inline void inode_inc_link_count(struct inode *inode)
{
        inc_nlink(inode);
        mark_inode_dirty(inode);
}

/* Lower the link count with drop_nlink(), then mark @inode dirty. */
static inline void inode_dec_link_count(struct inode *inode)
{
        drop_nlink(inode);
        mark_inode_dirty(inode);
}

/* Each value is a distinct single bit, so the flags can be OR-ed together. */
enum file_time_flags {
        S_ATIME   = 1 << 0,
        S_MTIME   = 1 << 1,
        S_CTIME   = 1 << 2,
        S_VERSION = 1 << 3,
};

extern bool atime_needs_update(const struct path *, struct inode *);
extern void touch_atime(const struct path *);
int inode_update_time(struct inode *inode, int flags);

/*
 * Call touch_atime() on the file's path unless the file was opened with
 * O_NOATIME in f_flags.
 */
static inline void file_accessed(struct file *file)
{
        if (file->f_flags & O_NOATIME)
                return;

        touch_atime(&file->f_path);
}

extern int file_modified(struct file *file);
int kiocb_modified(struct kiocb *iocb);

int sync_inode_metadata(struct inode *inode, int wait);

/*
 * Descriptor for one filesystem type.  Instances are passed to
 * register_filesystem() and unregister_filesystem().
 */
struct file_system_type {
        const char *name;
        int fs_flags;
/*
 * The FS_* constants are defined inline here, directly after fs_flags, and are
 * distinct powers of two (presumably the bit values for fs_flags -- confirm
 * against the users of that field).
 */
#define FS_REQUIRES_DEV                1 
#define FS_BINARY_MOUNTDATA        2
#define FS_HAS_SUBTYPE                4
#define FS_USERNS_MOUNT                8        /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM        16        /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
#define FS_MGTIME                64        /* FS uses multigrain timestamps */
#define FS_LBS                        128        /* FS supports LBS */
#define FS_POWER_FREEZE                256        /* Always freeze on suspend/hibernate */
#define FS_RENAME_DOES_D_MOVE        32768        /* FS will handle d_move() during rename() internally. */
        int (*init_fs_context)(struct fs_context *);
        const struct fs_parameter_spec *parameters;
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        void (*kill_sb) (struct super_block *);
        struct module *owner;
        struct file_system_type * next;
        struct hlist_head fs_supers;

        /* Lock class keys; s_writers_key has one entry per SB_FREEZE_LEVELS. */
        struct lock_class_key s_lock_key;
        struct lock_class_key s_umount_key;
        struct lock_class_key s_vfs_rename_key;
        struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
        struct lock_class_key invalidate_lock_key;
        struct lock_class_key i_mutex_dir_key;
};

/*
 * Emit a MODULE_ALIAS whose string is "fs-" followed by NAME.  NAME must be a
 * string literal because it is joined by literal concatenation.
 */
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)

/**
 * is_mgtime: is this inode using multigrain timestamps
 * @inode: inode to test for multigrain timestamps
 *
 * Return true if the inode uses multigrain timestamps, false otherwise.
 */
static inline bool is_mgtime(const struct inode *inode)
{
        /* IOP_MGTIME in i_opflags marks an inode as using multigrain timestamps. */
        return (inode->i_opflags & IOP_MGTIME) != 0;
}

extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
void retire_super(struct super_block *sb);
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
void kill_anon_super(struct super_block *sb);
void kill_litter_super(struct super_block *sb);
void deactivate_super(struct super_block *sb);
void deactivate_locked_super(struct super_block *sb);
int set_anon_super(struct super_block *s, void *data);
int set_anon_super_fc(struct super_block *s, struct fs_context *fc);
int get_anon_bdev(dev_t *);
void free_anon_bdev(dev_t);
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *));
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags, void *data);
struct super_block *sget_dev(struct fs_context *fc, dev_t dev);

/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
/*
 * fops_get() evaluates to @fops when it is non-NULL and try_module_get() on
 * its ->owner succeeds, and to NULL otherwise.  @fops is evaluated only once.
 */
#define fops_get(fops) ({                                                \
        const struct file_operations *_fops = (fops);                        \
        (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL));        \
})

/*
 * fops_put() calls module_put() on fops->owner unless @fops is NULL.
 * @fops is evaluated only once.
 */
#define fops_put(fops) ({                                                \
        const struct file_operations *_fops = (fops);                        \
        if (_fops)                                                        \
                module_put((_fops)->owner);                                \
})

/*
 * This one is to be used *ONLY* from ->open() instances.
 * fops must be non-NULL, pinned down *and* module dependencies
 * should be sufficient to pin the caller down as well.
 *
 * The old f_op is released with fops_put() first.  The new value is stored by
 * the assignment inside the BUG_ON() argument, which fires if @fops is NULL.
 */
#define replace_fops(f, fops) \
        do {        \
                struct file *__file = (f); \
                fops_put(__file->f_op); \
                BUG_ON(!(__file->f_op = (fops))); \
        } while(0)

extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
int freeze_super(struct super_block *super, enum freeze_holder who,
                 const void *freeze_owner);
int thaw_super(struct super_block *super, enum freeze_holder who,
               const void *freeze_owner);
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);

/*
 * Copy @len bytes of @uuid into sb->s_uuid and record the length in
 * sb->s_uuid_len.  A @len larger than the s_uuid field is reported through
 * WARN_ON() and clamped to the field size.
 */
static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len)
{
        const size_t capacity = sizeof(sb->s_uuid);

        if (WARN_ON(len > capacity))
                len = capacity;

        memcpy(&sb->s_uuid, uuid, len);
        sb->s_uuid_len = len;
}

/*
 * Set sb->s_sysfs_name from sb->s_bdev, formatted with "%pg".  snprintf()
 * bounds the write to sizeof(sb->s_sysfs_name).
 */
static inline void super_set_sysfs_name_bdev(struct super_block *sb)
{
        snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev);
}

/*
 * Set sb->s_sysfs_name from sb->s_uuid, formatted with "%pU".  A WARN_ON()
 * fires when s_uuid_len is not the full size of s_uuid, but the name is
 * formatted regardless.
 */
static inline void super_set_sysfs_name_uuid(struct super_block *sb)
{
        WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid));
        snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b);
}

/*
 * Set sb->s_sysfs_name by copying sb->s_id with strscpy(), bounded by
 * sizeof(sb->s_sysfs_name).
 */
static inline void super_set_sysfs_name_id(struct super_block *sb)
{
        strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name));
}

/*
 * Set sb->s_sysfs_name from a printf-style @fmt and its arguments;
 * vsnprintf() bounds the write to sizeof(sb->s_sysfs_name).
 *
 * try to use something standard before you use this
 */
__printf(2, 3)
static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);
        vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args);
        va_end(args);
}

extern int current_umask(void);

extern void ihold(struct inode * inode);
extern void iput(struct inode *);
void iput_not_last(struct inode *);
int inode_update_timestamps(struct inode *inode, int flags);
int generic_update_time(struct inode *, int);

/* /sys/fs */
extern struct kobject *fs_kobj;

/*
 * INT_MAX masked with PAGE_MASK.
 * NOTE(review): presumably the largest page-aligned value that fits in an
 * int -- confirm against PAGE_MASK's definition.
 */
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)

/* fs/open.c */
struct audit_names;
/* A pathname string together with its userland origin and a reference count. */
struct filename {
        const char                *name;        /* pointer to actual string */
        const __user char        *uptr;        /* original userland pointer */
        atomic_t                refcnt;
        struct audit_names        *aname;
        const char                iname[];        /* flexible array member; must stay last */
};
/* iname must start at an offset that is a multiple of sizeof(long). */
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);

/* Return mnt_idmap() of the file's mount, file->f_path.mnt. */
static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
{
        return mnt_idmap(file->f_path.mnt);
}

/**
 * is_idmapped_mnt - check whether a mount is mapped
 * @mnt: the mount to check
 *
 * If @mnt has an idmap other than @nop_mnt_idmap attached to it, then @mnt is
 * mapped.
 *
 * Return: true if mount is mapped, false if not.
 */
static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
{
        return mnt_idmap(mnt) != &nop_mnt_idmap;
}

int vfs_truncate(const struct path *, loff_t);
int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start,
                unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                        loff_t len);
int do_sys_open(int dfd, const char __user *filename, int flags,
                umode_t mode);
extern struct file *file_open_name(struct filename *, int, umode_t);
extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(const struct path *,
                                   const char *, int, umode_t);
/*
 * Open @name relative to the root of @mnt: builds a path from @mnt and its
 * mnt_root dentry and hands it to file_open_root().
 */
static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
                                   const char *name, int flags, umode_t mode)
{
        const struct path root = {
                .mnt = mnt,
                .dentry = mnt->mnt_root,
        };

        return file_open_root(&root, name, flags, mode);
}
struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *creds);
struct file *dentry_open_nonotify(const struct path *path, int flags,
                                  const struct cred *cred);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
                           const struct cred *cred);
const struct path *backing_file_user_path(const struct file *f);

/*
 * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
 * stored in ->vm_file is a backing file whose f_inode is on the underlying
 * filesystem.  When the mapped file path and inode number are displayed to
 * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the
 * path and inode number to display to the user, which is the path of the fd
 * that user has requested to map and the inode number that would be returned
 * by fstat() on that same fd.
 */
/*
 * Path to show for @f in /proc/<pid>/maps: for an FMODE_BACKING file this is
 * what backing_file_user_path() returns, otherwise the file's own f_path.
 */
static inline const struct path *file_user_path(const struct file *f)
{
        return unlikely(f->f_mode & FMODE_BACKING) ?
                backing_file_user_path(f) : &f->f_path;
}
/* Get the inode whose inode number to display in /proc/<pid>/maps */
static inline const struct inode *file_user_inode(const struct file *f)
{
        if (unlikely(f->f_mode & FMODE_BACKING))
                return d_inode(backing_file_user_path(f)->dentry);
        return file_inode(f);
}

/* Open a new struct file via dentry_open() using @file's f_path, f_flags and f_cred. */
static inline struct file *file_clone_open(struct file *file)
{
        return dentry_open(&file->f_path, file->f_flags, file->f_cred);
}
extern int filp_close(struct file *, fl_owner_t id);

extern struct filename *getname_flags(const char __user *, int);
extern struct filename *getname_uflags(const char __user *, int);
/* Shorthand for getname_flags(name, 0). */
static inline struct filename *getname(const char __user *name)
{
        return getname_flags(name, 0);
}
extern struct filename *getname_kernel(const char *);
extern struct filename *__getname_maybe_null(const char __user *);
/*
 * Variant of getname() that tolerates a missing name.  Without AT_EMPTY_PATH
 * in @flags this is plain getname().  With it, a NULL @name yields NULL and
 * any other pointer is handed to __getname_maybe_null().
 */
static inline struct filename *getname_maybe_null(const char __user *name, int flags)
{
        if (flags & AT_EMPTY_PATH)
                return name ? __getname_maybe_null(name) : NULL;

        return getname(name);
}
extern void putname(struct filename *name);
/*
 * Declare a DEFINE_FREE() cleanup named "putname" for struct filename
 * pointers: it calls putname() unless the pointer satisfies IS_ERR_OR_NULL().
 */
DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T))

/* Take an extra reference: atomically increment name->refcnt and return @name. */
static inline struct filename *refname(struct filename *name)
{
        atomic_inc(&name->refcnt);
        return name;
}

extern int finish_open(struct file *file, struct dentry *dentry,
                        int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);

/* Helper for the simple case when original dentry is used */
static inline int finish_open_simple(struct file *file, int error)
{
        if (error)
                return error;

        return finish_open(file, file->f_path.dentry, NULL);
}

/* fs/dcache.c */
extern void __init vfs_caches_init_early(void);
extern void __init vfs_caches_init(void);

extern struct kmem_cache *names_cachep;

/* Allocate (GFP_KERNEL) and free an object from the names_cachep cache. */
#define __getname()                kmem_cache_alloc(names_cachep, GFP_KERNEL)
#define __putname(name)                kmem_cache_free(names_cachep, (void *)(name))

extern struct super_block *blockdev_superblock;
/*
 * Report whether @sb is blockdev_superblock.  Always false when CONFIG_BLOCK
 * is not enabled.
 */
static inline bool sb_is_blkdev_sb(struct super_block *sb)
{
        if (!IS_ENABLED(CONFIG_BLOCK))
                return false;

        return sb == blockdev_superblock;
}

void emergency_thaw_all(void);
extern int sync_filesystem(struct super_block *);
extern const struct file_operations def_blk_fops;
extern const struct file_operations def_chr_fops;

/* fs/char_dev.c */
/* NOTE(review): presumably the exclusive upper bound for char majors -- confirm in fs/char_dev.c */
#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
/*
 * Marks the top and bottom of the second segment of free char majors.
 * START (511) is the higher number and END (384) the lower one.
 */
#define CHRDEV_MAJOR_DYN_EXT_START 511
#define CHRDEV_MAJOR_DYN_EXT_END 384

extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
extern int register_chrdev_region(dev_t, unsigned, const char *);
extern int __register_chrdev(unsigned int major, unsigned int baseminor,
                             unsigned int count, const char *name,
                             const struct file_operations *fops);
extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
                                unsigned int count, const char *name);
extern void unregister_chrdev_region(dev_t, unsigned);
extern void chrdev_show(struct seq_file *,off_t);

/* Convenience wrapper: __register_chrdev() with baseminor 0 and count 256. */
static inline int register_chrdev(unsigned int major, const char *name,
                                  const struct file_operations *fops)
{
        return __register_chrdev(major, 0, 256, name, fops);
}

/* Convenience wrapper: __unregister_chrdev() with baseminor 0 and count 256. */
static inline void unregister_chrdev(unsigned int major, const char *name)
{
        __unregister_chrdev(major, 0, 256, name);
}

extern void init_special_inode(struct inode *, umode_t, dev_t);

/* Invalid inode operations -- fs/bad_inode.c */
extern void make_bad_inode(struct inode *);
extern bool is_bad_inode(struct inode *);

extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
                                                loff_t lend);
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);
int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
                loff_t end);

/* file_write_and_wait_range() applied to the range 0 .. LLONG_MAX. */
static inline int file_write_and_wait(struct file *file)
{
        return file_write_and_wait_range(file, 0, LLONG_MAX);
}

extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
                           int datasync);
extern int vfs_fsync(struct file *file, int datasync);

extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
                                unsigned int flags);

/*
 * Report whether writes through @iocb must be synced: true when IOCB_DSYNC is
 * set in ki_flags, or when IS_SYNC() holds for the host inode of the file's
 * mapping.
 */
static inline bool iocb_is_dsync(const struct kiocb *iocb)
{
        if (iocb->ki_flags & IOCB_DSYNC)
                return true;

        return IS_SYNC(iocb->ki_filp->f_mapping->host);
}

/*
 * Follow-up work after a write of @count bytes whose end position is already
 * stored in ki_pos.  For a synchronous write the written range is synced with
 * vfs_fsync_range() and any error from that is returned.  Otherwise, if
 * IOCB_DONTCACHE is set, writeback of the range is kicked off and its return
 * value ignored.  In every non-error case @count is returned.
 */
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
        if (iocb_is_dsync(iocb)) {
                /* datasync is 0 only when IOCB_SYNC is set. */
                int datasync = !(iocb->ki_flags & IOCB_SYNC);
                int err;

                err = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count,
                                      iocb->ki_pos - 1, datasync);
                return err ? err : count;
        }

        if (iocb->ki_flags & IOCB_DONTCACHE)
                filemap_fdatawrite_range_kick(iocb->ki_filp->f_mapping,
                                              iocb->ki_pos - count,
                                              iocb->ki_pos - 1);

        return count;
}

extern void emergency_sync(void);
extern void emergency_remount(void);

#ifdef CONFIG_BLOCK
extern int bmap(struct inode *inode, sector_t *block);
#else
/* Without CONFIG_BLOCK, bmap() is a stub that always returns -EINVAL. */
static inline int bmap(struct inode *inode,  sector_t *block)
{
        return -EINVAL;
}
#endif

int notify_change(struct mnt_idmap *, struct dentry *,
                  struct iattr *, struct inode **);
int inode_permission(struct mnt_idmap *, struct inode *, int);
int generic_permission(struct mnt_idmap *, struct inode *, int);
/*
 * inode_permission() for an open file: the idmap comes from
 * file_mnt_idmap(@file) and the inode from file_inode(@file); @mask is
 * passed through and the result is returned unchanged.
 */
static inline int file_permission(struct file *file, int mask)
{
        return inode_permission(file_mnt_idmap(file),
                                file_inode(file), mask);
}
/*
 * inode_permission() for a path: the idmap comes from mnt_idmap(@path->mnt)
 * and the inode from d_inode(@path->dentry); @mask is passed through and the
 * result is returned unchanged.
 */
static inline int path_permission(const struct path *path, int mask)
{
        return inode_permission(mnt_idmap(path->mnt),
                                d_inode(path->dentry), mask);
}
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
                   struct inode *inode);

/*
 * True if @inode is a directory, or if any of the S_IXUGO execute bits is
 * set in its mode.
 */
static inline bool execute_ok(struct inode *inode)
{
        if (S_ISDIR(inode->i_mode))
                return true;
        return (inode->i_mode & S_IXUGO) != 0;
}

/*
 * True if the file-type bits (S_IFMT) of @inode's mode differ from the
 * file-type bits of @mode.  Bits outside S_IFMT are ignored.
 */
static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
{
        return (inode->i_mode & S_IFMT) != (mode & S_IFMT);
}

/**
 * file_start_write - get write access to a superblock for regular file io
 * @file: the file we want to write to
 *
 * This is a variant of sb_start_write() which is a noop on a non-regular
 * file: sb_start_write() is only called when the file's inode is S_ISREG.
 * Should be matched with a call to file_end_write().
 */
static inline void file_start_write(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                sb_start_write(file_inode(file)->i_sb);
}

/*
 * Non-blocking counterpart of file_start_write(): for a regular file the
 * result of sb_start_write_trylock() on the inode's superblock is returned;
 * for any other file type nothing is taken and true is returned.
 */
static inline bool file_start_write_trylock(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                return sb_start_write_trylock(file_inode(file)->i_sb);
        return true;
}

/**
 * file_end_write - drop write access to a superblock of a regular file
 * @file: the file we wrote to
 *
 * Calls sb_end_write() on the superblock of the file's inode, but only when
 * that inode is a regular file; otherwise this is a noop.
 * Should be matched with a call to file_start_write().
 */
static inline void file_end_write(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                sb_end_write(file_inode(file)->i_sb);
}

/**
 * kiocb_start_write - get write access to a superblock for async file io
 * @iocb: the io context we want to submit the write with
 *
 * This is a variant of sb_start_write() for async io submission.
 * Unlike file_start_write(), no S_ISREG() check is made here: the
 * superblock of the file's inode is always used.
 * Should be matched with a call to kiocb_end_write().
 */
static inline void kiocb_start_write(struct kiocb *iocb)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        sb_start_write(inode->i_sb);
        /*
         * Fool lockdep by telling it the lock got released so that it
         * doesn't complain about the held lock when we return to userspace.
         * kiocb_end_write() issues the matching __sb_writers_acquired().
         */
        __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
}

/**
 * kiocb_end_write - drop write access to a superblock after async file io
 * @iocb: the io context we submitted the write with
 *
 * Re-announces the SB_FREEZE_WRITE level to lockdep and then calls
 * sb_end_write() on the superblock of the file's inode.
 * Should be matched with a call to kiocb_start_write().
 */
static inline void kiocb_end_write(struct kiocb *iocb)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        /*
         * Tell lockdep we inherited freeze protection from submission thread.
         */
        __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
        sb_end_write(inode->i_sb);
}

/*
 * This is used for regular files where some users -- especially the
 * currently executed binary in a process, previously handled via
 * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap
 * read-write shared) accesses.
 *
 * get_write_access() gets write permission for a file.
 * put_write_access() releases this write permission.
 * deny_write_access() denies write access to a file.
 * allow_write_access() re-enables write access to a file.
 *
 * The i_writecount field of an inode can have the following values:
 * 0: no write access, no denied write access
 * < 0: (-i_writecount) users that denied write access to the file.
 * > 0: (i_writecount) users that have write access to the file.
 *
 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 * except for the cases where we don't hold i_writecount yet. Then we need to
 * use {get,deny}_write_access() - these functions check the sign and refuse
 * to do the change if sign is wrong.
 */
/*
 * Try to increment @inode's i_writecount with
 * atomic_inc_unless_negative().  Returns 0 on success, -ETXTBSY if the
 * increment was refused.
 */
static inline int get_write_access(struct inode *inode)
{
        if (!atomic_inc_unless_negative(&inode->i_writecount))
                return -ETXTBSY;
        return 0;
}
/*
 * Try to decrement i_writecount of @file's inode with
 * atomic_dec_unless_positive().  Returns 0 on success, -ETXTBSY if the
 * decrement was refused.
 */
static inline int deny_write_access(struct file *file)
{
        struct inode *inode = file_inode(file);

        if (!atomic_dec_unless_positive(&inode->i_writecount))
                return -ETXTBSY;
        return 0;
}
/* Unconditionally decrement @inode's i_writecount (no sign check is made). */
static inline void put_write_access(struct inode * inode)
{
        atomic_dec(&inode->i_writecount);
}
/*
 * Increment i_writecount of @file's inode.  A NULL @file is accepted and
 * ignored.
 */
static inline void allow_write_access(struct file *file)
{
        if (!file)
                return;
        atomic_inc(&file_inode(file)->i_writecount);
}

/*
 * Do not prevent write to executable file when watched by pre-content events.
 *
 * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at
 * the time of file open and remains constant for entire lifetime of the file,
 * so if pre-content watches are added post execution or removed before the end
 * of the execution, it will not cause i_writecount reference leak.
 */
/*
 * deny_write_access() for an executable's file, skipped (returning 0) when
 * FMODE_FSNOTIFY_HSM() is true for the file's f_mode.
 */
static inline int exe_file_deny_write_access(struct file *exe_file)
{
        if (!unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
                return deny_write_access(exe_file);
        return 0;
}
/*
 * allow_write_access() for an executable's file.  Does nothing when
 * @exe_file is NULL or when FMODE_FSNOTIFY_HSM() is true for its f_mode.
 */
static inline void exe_file_allow_write_access(struct file *exe_file)
{
        if (unlikely(!exe_file))
                return;
        if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
                return;
        allow_write_access(exe_file);
}

/*
 * Replace the FMODE_FSNOTIFY_MASK bits of @file->f_mode: those bits are
 * cleared and @mode is then OR-ed in.  All other f_mode bits are kept.
 */
static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode)
{
        file->f_mode = (file->f_mode & ~FMODE_FSNOTIFY_MASK) | mode;
}

/* True if @inode's i_writecount is currently greater than zero. */
static inline bool inode_is_open_for_write(const struct inode *inode)
{
        int writers = atomic_read(&inode->i_writecount);

        return writers > 0;
}

#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
/* Decrement @inode's i_readcount; a negative result trips BUG_ON(). */
static inline void i_readcount_dec(struct inode *inode)
{
        int remaining = atomic_dec_return(&inode->i_readcount);

        BUG_ON(remaining < 0);
}

/* Increment @inode's i_readcount. */
static inline void i_readcount_inc(struct inode *inode)
{
        atomic_inc(&inode->i_readcount);
}
#else
/* Neither CONFIG_IMA nor CONFIG_FILE_LOCKING: both helpers are no-ops. */
static inline void i_readcount_dec(struct inode *inode)
{
}

static inline void i_readcount_inc(struct inode *inode)
{
}
#endif
extern int do_pipe_flags(int *, int);

extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);
 
/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);

extern char *file_path(struct file *, char *, int);

/**
 * is_dot_dotdot - returns true only if @name is "." or ".."
 * @name: file name to check (not required to be NUL-terminated; only the
 *        first @len bytes are looked at)
 * @len: length of file name, in bytes
 */
static inline bool is_dot_dotdot(const char *name, size_t len)
{
        if (!len)
                return false;
        /* Names rarely start with a dot, hence the unlikely() hint. */
        if (!unlikely(name[0] == '.'))
                return false;
        if (len == 1)
                return true;
        return len == 2 && name[1] == '.';
}

/**
 * name_contains_dotdot - check if a file name contains ".." path components
 * @name: NUL-terminated file path string to check
 *
 * A ".." counts only when it is a complete component, i.e. it is delimited
 * on each side by a '/' or by the start/end of the string.
 */
static inline bool name_contains_dotdot(const char *name)
{
        size_t len = strlen(name);

        /* The whole string is "..", or it starts with a "../" component. */
        if (!strcmp(name, "..") || !strncmp(name, "../", 3))
                return true;

        /* A "/../" component somewhere inside the string. */
        if (strstr(name, "/../"))
                return true;

        /* A trailing "/.." component. */
        return len >= 3 && !strcmp(name + len - 3, "/..");
}

#include <linux/err.h>

/* needed for stackable file system support */
extern loff_t default_llseek(struct file *file, loff_t offset, int whence);

extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);

extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
/*
 * inode_init_always_gfp() with the allocation mask fixed to GFP_NOFS;
 * the result is returned unchanged.
 */
static inline int inode_init_always(struct super_block *sb, struct inode *inode)
{
        return inode_init_always_gfp(sb, inode, GFP_NOFS);
}

extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int inode_needs_sync(struct inode *inode);
extern int inode_just_drop(struct inode *inode);
/*
 * Returns 1 if @inode has a zero link count (i_nlink) or if
 * inode_unhashed() is true for it, 0 otherwise.
 */
static inline int inode_generic_drop(struct inode *inode)
{
        if (!inode->i_nlink)
                return 1;
        return inode_unhashed(inode) ? 1 : 0;
}
extern void d_mark_dontcache(struct inode *inode);

extern struct inode *ilookup5_nowait(struct super_block *sb,
                unsigned long hashval, int (*test)(struct inode *, void *),
                void *data);
extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data);
extern struct inode *ilookup(struct super_block *sb, unsigned long ino);

extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *),
                void *data);
struct inode *iget5_locked(struct super_block *, unsigned long,
                           int (*test)(struct inode *, void *),
                           int (*set)(struct inode *, void *), void *);
struct inode *iget5_locked_rcu(struct super_block *, unsigned long,
                               int (*test)(struct inode *, void *),
                               int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
extern struct inode *find_inode_nowait(struct super_block *,
                                       unsigned long,
                                       int (*match)(struct inode *,
                                                    unsigned long, void *),
                                       void *data);
extern struct inode *find_inode_rcu(struct super_block *, unsigned long,
                                    int (*)(struct inode *, void *), void *);
extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
#else
/* Without CONFIG_DEBUG_LOCK_ALLOC the annotation is a no-op. */
static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }
#endif
extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
void dump_mapping(const struct address_space *);

/*
 * Userspace may rely on the inode number being non-zero. For example, glibc
 * simply ignores files with zero i_ino in unlink() and other places.
 *
 * To complicate matters, userspace built with _FILE_OFFSET_BITS=32 on a
 * 64-bit kernel only reads out the lower 32 bits, so it is those bits that
 * are tested here.  With _FILE_OFFSET_BITS=64 this can flag a number whose
 * upper bits are non-zero (a harmless false result), but better safe than
 * sorry.
 *
 * Returns true when the low 32 bits of @ino are all zero.
 */
static inline bool is_zero_ino(ino_t ino)
{
        return !(u32)ino;
}

/*
 * Increment @inode's i_count.
 *
 * inode->i_lock must be held
 */
static inline void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
struct inode *alloc_inode(struct super_block *sb);
/* Thin wrapper: returns whatever alloc_inode(@sb) returns. */
static inline struct inode *new_inode_pseudo(struct super_block *sb)
{
        return alloc_inode(sb);
}
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
extern int file_remove_privs(struct file *);
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
                             const struct inode *inode);

/*
 * This must be used for allocating filesystems specific inodes to set
 * up the inode reclaim context correctly.
 *
 * Expands to kmem_cache_alloc_lru() on @_cache with the superblock's
 * s_inode_lru list and the @_gfp mask.  @_sb is parenthesized so that any
 * pointer-valued expression (not just a plain identifier) can be passed.
 */
#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &(_sb)->s_inode_lru, _gfp)

extern void __insert_inode_hash(struct inode *, unsigned long hashval);
/* __insert_inode_hash() using the inode number (i_ino) as the hash value. */
static inline void insert_inode_hash(struct inode *inode)
{
        __insert_inode_hash(inode, inode->i_ino);
}

extern void __remove_inode_hash(struct inode *);
/*
 * Call __remove_inode_hash() for @inode unless inode_unhashed() is already
 * true for it or hlist_fake() is true for its i_hash node.
 */
static inline void remove_inode_hash(struct inode *inode)
{
        if (inode_unhashed(inode))
                return;
        if (hlist_fake(&inode->i_hash))
                return;
        __remove_inode_hash(inode);
}

extern void inode_sb_list_add(struct inode *inode);
extern void inode_add_lru(struct inode *inode);

int sb_set_blocksize(struct super_block *sb, int size);
int __must_check sb_min_blocksize(struct super_block *sb, int size);

int generic_file_mmap(struct file *, struct vm_area_struct *);
int generic_file_mmap_prepare(struct vm_area_desc *desc);
int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
int generic_write_checks_count(struct kiocb *iocb, loff_t *count);
extern int generic_write_check_limits(struct file *file, loff_t pos,
                loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
                ssize_t already_read);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
ssize_t generic_perform_write(struct kiocb *, struct iov_iter *);
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t direct_written, ssize_t buffered_written);

ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter);
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter);

/* fs/splice.c */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
                            struct pipe_inode_info *pipe,
                            size_t len, unsigned int flags);
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe,
                         size_t len, unsigned int flags);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
                struct file *, loff_t *, size_t, unsigned int);


extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
                int whence, loff_t maxsize, loff_t eof);
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
                             u64 *cookie);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
                int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
int rw_verify_area(int, struct file *, const loff_t *, size_t);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
extern int stream_open(struct inode * inode, struct file * filp);

#ifdef CONFIG_BLOCK
/*
 * Function type (not a pointer type) for a callback that receives a bio,
 * the inode and a file offset, and returns nothing.
 */
typedef void (dio_submit_t)(struct bio *bio, struct inode *inode,
                            loff_t file_offset);

/* Flag bits for the @flags argument of __blockdev_direct_IO(). */
enum {
        /* need locking between buffered and direct access */
        DIO_LOCKING        = 1 << 0,

        /* filesystem does not support filling holes */
        DIO_SKIP_HOLES     = 1 << 1,
};

ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                             struct block_device *bdev, struct iov_iter *iter,
                             get_block_t get_block,
                             dio_iodone_t end_io,
                             int flags);

/*
 * Convenience wrapper around __blockdev_direct_IO(): the block device is
 * taken from the inode's superblock (i_sb->s_bdev), no end_io callback is
 * passed (NULL), and the flags are fixed to DIO_LOCKING | DIO_SKIP_HOLES.
 */
static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
                                         struct inode *inode,
                                         struct iov_iter *iter,
                                         get_block_t get_block)
{
        return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
                        get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES);
}
#endif

bool inode_dio_finished(const struct inode *inode);
void inode_dio_wait(struct inode *inode);
void inode_dio_wait_interruptible(struct inode *inode);

/**
 * inode_dio_begin - signal start of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * This is called when a direct I/O request starts.  It increments
 * i_dio_count; the matching inode_dio_end() decrements it again.
 */
static inline void inode_dio_begin(struct inode *inode)
{
        atomic_inc(&inode->i_dio_count);
}

/**
 * inode_dio_end - signal finish of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * This is called once we've finished processing a direct I/O request.
 * It decrements i_dio_count and, when atomic_dec_and_test() reports true,
 * wakes up callers waiting for direct I/O to be quiesced.
 */
static inline void inode_dio_end(struct inode *inode)
{
        if (!atomic_dec_and_test(&inode->i_dio_count))
                return;
        wake_up_var(&inode->i_dio_count);
}

extern void inode_set_flags(struct inode *inode, unsigned int flags,
                            unsigned int mask);

extern const struct file_operations generic_ro_fops;

/*
 * True if mode @m denotes a character device, block device, FIFO or socket.
 * @m appears once per S_IS*() test in the expansion, so pass an expression
 * without side effects.
 */
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))

extern int readlink_copy(char __user *, int, const char *, int);
extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link_raw(struct dentry *, struct inode *,
                                     struct delayed_call *);
extern const char *page_get_link(struct dentry *, struct inode *,
                                 struct delayed_call *);
extern void page_put_link(void *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
void generic_fill_statx_atomic_writes(struct kstat *stat,
                                      unsigned int unit_min,
                                      unsigned int unit_max,
                                      unsigned int unit_max_opt);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);
void inode_add_bytes(struct inode *inode, loff_t bytes);
void __inode_sub_bytes(struct inode *inode, loff_t bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
/*
 * Byte count recorded in @inode: i_blocks scaled by 512 (a shift by 9)
 * plus the i_bytes remainder.  No locking is done here.
 */
static inline loff_t __inode_get_bytes(struct inode *inode)
{
        return inode->i_bytes + ((loff_t)inode->i_blocks << 9);
}
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
const char *simple_get_link(struct dentry *, struct inode *,
                            struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;

extern int iterate_dir(struct file *, struct dir_context *);

int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
                int flags);
int vfs_fstat(int fd, struct kstat *stat);

/* vfs_fstatat() on @filename relative to AT_FDCWD, with no flags set. */
static inline int vfs_stat(const char __user *filename, struct kstat *stat)
{
        return vfs_fstatat(AT_FDCWD, filename, stat, 0);
}
/* vfs_fstatat() on @name relative to AT_FDCWD, with AT_SYMLINK_NOFOLLOW. */
static inline int vfs_lstat(const char __user *name, struct kstat *stat)
{
        return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
}

extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
extern int vfs_readlink(struct dentry *, char __user *, int);

extern struct file_system_type *get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg);
extern void iterate_supers_type(struct file_system_type *,
                                void (*)(struct super_block *, void *), void *);
void filesystems_freeze(bool freeze_all);
void filesystems_thaw(void);

extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
extern int dcache_readdir(struct file *, struct dir_context *);
extern int simple_setattr(struct mnt_idmap *, struct dentry *,
                          struct iattr *);
extern int simple_getattr(struct mnt_idmap *, const struct path *,
                          struct kstat *, u32, unsigned int);
extern int simple_statfs(struct dentry *, struct kstatfs *);
extern int simple_open(struct inode *inode, struct file *file);
extern int simple_link(struct dentry *, struct inode *, struct dentry *);
extern int simple_unlink(struct inode *, struct dentry *);
extern int simple_rmdir(struct inode *, struct dentry *);
void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
                                  struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename(struct mnt_idmap *, struct inode *,
                         struct dentry *, struct inode *, struct dentry *,
                         unsigned int);
extern void simple_recursive_removal(struct dentry *,
                              void (*callback)(struct dentry *));
extern void locked_recursive_removal(struct dentry *,
                              void (*callback)(struct dentry *));
extern int noop_fsync(struct file *, loff_t, loff_t, int);
extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int simple_empty(struct dentry *);
extern int simple_write_begin(const struct kiocb *iocb,
                              struct address_space *mapping,
                              loff_t pos, unsigned len,
                              struct folio **foliop, void **fsdata);
extern const struct address_space_operations ram_aops;
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
                                           const struct inode *context_inode);
extern int simple_nosetlease(struct file *, int, struct file_lease **, void **);

extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
extern const struct file_operations simple_dir_operations;
extern const struct inode_operations simple_dir_inode_operations;
extern void make_empty_dir_inode(struct inode *inode);
extern bool is_empty_dir_inode(struct inode *inode);
/* File descriptor record: a name, its file operations and a mode. */
struct tree_descr {
        const char *name;
        const struct file_operations *ops;
        int mode;
};
struct dentry *d_alloc_name(struct dentry *, const char *);
extern int simple_fill_super(struct super_block *, unsigned long,
                             const struct tree_descr *);
extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
extern void simple_release_fs(struct vfsmount **mount, int *count);
struct dentry *simple_start_creating(struct dentry *, const char *);

extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
                        loff_t *ppos, const void *from, size_t available);
extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count);

/*
 * Context object taken by the simple_offset_*() helpers.
 * NOTE(review): field meanings below are inferred from their names only.
 */
struct offset_ctx {
        struct maple_tree        mt;                /* embedded maple tree */
        unsigned long                next_offset;        /* presumably the next offset to hand out */
};

void simple_offset_init(struct offset_ctx *octx);
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry);
int simple_offset_rename_exchange(struct inode *old_dir,
                                  struct dentry *old_dentry,
                                  struct inode *new_dir,
                                  struct dentry *new_dentry);
void simple_offset_destroy(struct offset_ctx *octx);

extern const struct file_operations simple_offset_dir_operations;

extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_file_fsync(struct file *, loff_t, loff_t, int);

extern int generic_check_addressable(unsigned, u64);

extern void generic_set_sb_d_ops(struct super_block *sb);
extern int generic_ci_match(const struct inode *parent,
                            const struct qstr *name,
                            const struct qstr *folded_name,
                            const u8 *de_name, u32 de_name_len);

#if IS_ENABLED(CONFIG_UNICODE)
int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                         const char *str, const struct qstr *name);

/**
 * generic_ci_validate_strict_name - Check if a given name is suitable
 * for a directory
 *
 * This functions checks if the proposed filename is valid for the
 * parent directory. That means that only valid UTF-8 filenames will be
 * accepted for casefold directories from filesystems created with the
 * strict encoding flag.  That also means that any name will be
 * accepted for directories that doesn't have casefold enabled, or
 * aren't being strict with the encoding.
 *
 * @dir: inode of the directory where the new file will be created
 * @name: name of the new file
 *
 * Return:
 * * True: if the filename is suitable for this directory. It can be
 *   true if a given name is not suitable for a strict encoding
 *   directory, but the directory being used isn't strict
 * * False if the filename isn't suitable for this directory. This only
 *   happens when a directory is casefolded and the filesystem is strict
 *   about its encoding.
 */
static inline bool generic_ci_validate_strict_name(struct inode *dir,
                                                   const struct qstr *name)
{
        if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb))
                return true;

        /*
         * A casefold dir must have a encoding set, unless the filesystem
         * is corrupted
         */
        if (WARN_ON_ONCE(!dir->i_sb->s_encoding))
                return true;

        return !utf8_validate(dir->i_sb->s_encoding, name);
}
#else
/* CONFIG_UNICODE disabled: no name validation is done, every name is accepted. */
static inline bool generic_ci_validate_strict_name(struct inode *dir,
                                                   const struct qstr *name)
{
        return true;
}
#endif

/*
 * Return @sb's s_encoding when CONFIG_UNICODE is enabled; otherwise
 * always return NULL.
 */
static inline struct unicode_map *sb_encoding(const struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
        return sb->s_encoding;
#else
        return NULL;
#endif
}

/* True if sb_encoding() returns a non-NULL encoding for @sb. */
static inline bool sb_has_encoding(const struct super_block *sb)
{
        return sb_encoding(sb) ? true : false;
}

/*
 * Compare if two super blocks have the same encoding and flags.
 *
 * With CONFIG_UNICODE enabled: identical s_encoding pointers (including
 * both NULL) compare equal immediately, without looking at the flags.
 * Otherwise both encodings must be set, have the same version, and the two
 * s_encoding_flags values must match.  Without CONFIG_UNICODE the result is
 * always true.
 */
static inline bool sb_same_encoding(const struct super_block *sb1,
                                    const struct super_block *sb2)
{
#if IS_ENABLED(CONFIG_UNICODE)
        if (sb1->s_encoding == sb2->s_encoding)
                return true;

        if (!sb1->s_encoding || !sb2->s_encoding)
                return false;
        if (sb1->s_encoding->version != sb2->s_encoding->version)
                return false;

        return sb1->s_encoding_flags == sb2->s_encoding_flags;
#else
        return true;
#endif
}

int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
                unsigned int ia_valid);
int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *);
extern int inode_newsize_ok(const struct inode *, loff_t offset);
void setattr_copy(struct mnt_idmap *, struct inode *inode,
                  const struct iattr *attr);

extern int file_update_time(struct file *file);

/*
 * True if @file is non-NULL and IS_DAX() holds for the inode backing its
 * mapping.  A NULL @file yields false.
 */
static inline bool file_is_dax(const struct file *file)
{
        if (!file)
                return false;
        return IS_DAX(file->f_mapping->host);
}

/* file_is_dax() applied to @vma's vm_file; the result is returned unchanged. */
static inline bool vma_is_dax(const struct vm_area_struct *vma)
{
        return file_is_dax(vma->vm_file);
}

/*
 * True if @vma is a DAX mapping of something other than a character
 * device.  Always false when CONFIG_FS_DAX is disabled, when the vma has no
 * vm_file, or when vma_is_dax() is false.
 */
static inline bool vma_is_fsdax(struct vm_area_struct *vma)
{
        if (!IS_ENABLED(CONFIG_FS_DAX))
                return false;
        if (!vma->vm_file || !vma_is_dax(vma))
                return false;

        /* A character-device inode means device-dax, not fs-dax. */
        return !S_ISCHR(file_inode(vma->vm_file)->i_mode);
}

/*
 * Translate the open flags in @file->f_flags into IOCB_* flags:
 * O_APPEND -> IOCB_APPEND, O_DIRECT -> IOCB_DIRECT, O_DSYNC -> IOCB_DSYNC
 * and __O_SYNC -> IOCB_SYNC.  Returns the OR of the matching IOCB_* bits,
 * or 0 if none of those open flags is set.
 */
static inline int iocb_flags(struct file *file)
{
        int res = 0;

        res |= (file->f_flags & O_APPEND) ? IOCB_APPEND : 0;
        res |= (file->f_flags & O_DIRECT) ? IOCB_DIRECT : 0;
        res |= (file->f_flags & O_DSYNC) ? IOCB_DSYNC : 0;
        res |= (file->f_flags & __O_SYNC) ? IOCB_SYNC : 0;

        return res;
}

/*
 * Validate the per-I/O RWF_* @flags and fold them into @ki->ki_flags.
 * @rw_type is only consulted for RWF_ATOMIC, which requires WRITE.
 *
 * Returns 0 on success; with @flags == 0 this is an immediate no-op.
 * Error returns, in the order they are checked:
 *   -EOPNOTSUPP  a bit outside RWF_SUPPORTED is set
 *   -EINVAL      RWF_APPEND and RWF_NOAPPEND are both set
 *   -EOPNOTSUPP  RWF_NOWAIT without FMODE_NOWAIT on the file
 *   -EOPNOTSUPP  RWF_ATOMIC on a non-WRITE, or without FMODE_CAN_ATOMIC_WRITE
 *   -EOPNOTSUPP  RWF_DONTCACHE without FOP_DONTCACHE, or on a DAX inode
 *   -EPERM       RWF_NOAPPEND would clear IOCB_APPEND on an IS_APPEND() inode
 * @ki->ki_flags is not modified on any error return.
 */
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
                                     int rw_type)
{
        int kiocb_flags = 0;

        /* make sure there's no overlap between RWF and private IOCB flags */
        BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);

        if (!flags)
                return 0;
        if (unlikely(flags & ~RWF_SUPPORTED))
                return -EOPNOTSUPP;
        if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND)))
                return -EINVAL;

        if (flags & RWF_NOWAIT) {
                if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
                        return -EOPNOTSUPP;
        }
        if (flags & RWF_ATOMIC) {
                if (rw_type != WRITE)
                        return -EOPNOTSUPP;
                if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
                        return -EOPNOTSUPP;
        }
        if (flags & RWF_DONTCACHE) {
                /* file system must support it */
                if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE))
                        return -EOPNOTSUPP;
                /* DAX mappings not supported */
                if (IS_DAX(ki->ki_filp->f_mapping->host))
                        return -EOPNOTSUPP;
        }
        /* The supported RWF_* bits are carried over numerically unchanged. */
        kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
        /* RWF_SYNC additionally sets IOCB_DSYNC. */
        if (flags & RWF_SYNC)
                kiocb_flags |= IOCB_DSYNC;

        /* RWF_NOAPPEND drops an already-set IOCB_APPEND unless the inode is append-only. */
        if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) {
                if (IS_APPEND(file_inode(ki->ki_filp)))
                        return -EPERM;
                ki->ki_flags &= ~IOCB_APPEND;
        }

        ki->ki_flags |= kiocb_flags;
        return 0;
}

/* Transaction based IO helpers */

/*
 * An argresp is stored in an allocated page and holds the
 * size of the argument or response, along with its content
 */
struct simple_transaction_argresp {
        ssize_t size;   /* size of the argument or response */
        char data[];    /* the content itself (flexible array member) */
};

/* One page minus the size of the argresp header that precedes the data. */
#define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))

char *simple_transaction_get(struct file *file, const char __user *buf,
                                size_t size);
ssize_t simple_transaction_read(struct file *file, char __user *buf,
                                size_t size, loff_t *pos);
int simple_transaction_release(struct inode *inode, struct file *file);

void simple_transaction_set(struct file *file, size_t n);

/*
 * simple attribute files
 *
 * These attributes behave similar to those in sysfs:
 *
 * Writing to an attribute immediately sets a value, an open file can be
 * written to multiple times.
 *
 * Reading from an attribute creates a buffer from the value that might get
 * read with multiple read calls. When the attribute has been read
 * completely, no further read calls are possible until the file is opened
 * again.
 *
 * All attributes contain a text representation of a numeric value
 * that are accessed with the get() and set() functions.
 */
/*
 * DEFINE_SIMPLE_ATTRIBUTE_XSIGNED - emit the file_operations of an attribute
 * @__fops:      name of the generated struct file_operations; also the prefix
 *               of the generated <__fops>_open() function.
 * @__get:       getter handed to simple_attr_open().
 * @__set:       setter handed to simple_attr_open().
 * @__fmt:       printf-style format handed to simple_attr_open(); it is
 *               checked at compile time against an unsigned long long
 *               argument through __simple_attr_check_format().
 * @__is_signed: selects simple_attr_write_signed() instead of
 *               simple_attr_write() as the .write handler.
 */
#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed)        \
static int __fops ## _open(struct inode *inode, struct file *file)        \
{                                                                        \
        __simple_attr_check_format(__fmt, 0ull);                        \
        return simple_attr_open(inode, file, __get, __set, __fmt);        \
}                                                                        \
static const struct file_operations __fops = {                                \
        .owner         = THIS_MODULE,                                                \
        .open         = __fops ## _open,                                        \
        .release = simple_attr_release,                                        \
        .read         = simple_attr_read,                                        \
        .write         = (__is_signed) ? simple_attr_write_signed : simple_attr_write,        \
        .llseek         = generic_file_llseek,                                        \
}

/* Unsigned flavour: the .write handler is simple_attr_write(). */
#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt)                \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false)

/* Signed flavour: the .write handler is simple_attr_write_signed(). */
#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt)        \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true)

/*
 * Compile-time format checker: the function is annotated __printf(1, 2) and
 * has an empty body, so a call only makes the compiler validate @fmt against
 * the variadic arguments.
 */
static inline __printf(1, 2)
void __simple_attr_check_format(const char *fmt, ...)
{
        /* don't do anything, just let the compiler check the arguments; */
}

int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt);
int simple_attr_release(struct inode *inode, struct file *file);
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos);
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos);
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                                 size_t len, loff_t *ppos);

struct ctl_table;
int __init list_bdev_fs_names(char *buf, size_t size);

/* FMODE_EXEC cast to a plain int. */
#define __FMODE_EXEC                ((__force int) FMODE_EXEC)

/*
 * Table lookup on the access-mode bits of @x: yields 004, 002, 006 and 006
 * (octal) for (x & O_ACCMODE) values 0, 1, 2 and 3 respectively.
 * NOTE(review): the values look like MAY_READ/MAY_WRITE masks - confirm.
 */
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
/*
 * Convert open(2) access-mode bits into an fmode_t: the access mode is
 * incremented by one and masked with O_ACCMODE.  @flag is parenthesised so
 * that an expression argument such as "a | b" is evaluated as a whole before
 * the increment instead of binding the "+ 1" to its last operand.
 */
#define OPEN_FMODE(flag) ((__force fmode_t)(((flag) + 1) & O_ACCMODE))

/* True when @mode has the set-user-ID or the set-group-ID bit set. */
static inline bool is_sxid(umode_t mode)
{
        return (mode & (S_ISUID | S_ISGID)) != 0;
}

/*
 * Sticky-bit check for @inode inside @dir.  Directories without S_ISVTX
 * always yield 0; otherwise the decision is delegated to __check_sticky().
 */
static inline int check_sticky(struct mnt_idmap *idmap,
                               struct inode *dir, struct inode *inode)
{
        if (dir->i_mode & S_ISVTX)
                return __check_sticky(idmap, dir, inode);

        return 0;
}

/*
 * Set S_NOSEC on @inode when its mode carries neither setuid nor setgid and
 * its superblock has SB_NOSEC set; otherwise leave the inode flags alone.
 */
static inline void inode_has_no_xattr(struct inode *inode)
{
        if (is_sxid(inode->i_mode))
                return;
        if (!(inode->i_sb->s_flags & SB_NOSEC))
                return;

        inode->i_flags |= S_NOSEC;
}

/* True when @inode is the inode of its superblock's root dentry. */
static inline bool is_root_inode(struct inode *inode)
{
        return inode->i_sb->s_root->d_inode == inode;
}

/*
 * Hand one directory entry to the context's actor callback.
 *
 * @ctx:     directory iteration context; supplies the actor and the
 *           position (ctx->pos) reported for the entry.
 * @name:    entry name, @namelen bytes long.
 * @ino:     inode number reported for the entry.
 * @type:    entry type passed through to the actor (callers in this file
 *           pass DT_DIR).
 *
 * Returns whatever the actor returns.
 */
static inline bool dir_emit(struct dir_context *ctx,
                            const char *name, int namelen,
                            u64 ino, unsigned type)
{
        return ctx->actor(ctx, name, namelen, ctx->pos, ino, type);
}
/* Emit the "." entry, using the inode number of @file's own dentry. */
static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
{
        return dir_emit(ctx, ".", 1,
                        file->f_path.dentry->d_inode->i_ino, DT_DIR);
}
/* Emit the ".." entry, using d_parent_ino() of @file's dentry. */
static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
{
        return dir_emit(ctx, "..", 2,
                        d_parent_ino(file->f_path.dentry), DT_DIR);
}
/*
 * Emit "." and ".." as needed for the current position: position 0 emits
 * "." and advances to 1, position 1 emits ".." and advances to 2.  Returns
 * false as soon as an emit fails (the position is then left where it was),
 * true otherwise.
 */
static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
{
        bool ok = true;

        if (ctx->pos == 0) {
                ok = dir_emit_dot(file, ctx);
                if (ok)
                        ctx->pos = 1;
        }
        if (ok && ctx->pos == 1) {
                ok = dir_emit_dotdot(file, ctx);
                if (ok)
                        ctx->pos = 2;
        }
        return ok;
}
/*
 * Drop and immediately re-take the inode lock, then report whether the
 * directory is still usable (i.e. not IS_DEADDIR()).
 */
static inline bool dir_relax(struct inode *inode)
{
        bool alive;

        inode_unlock(inode);
        inode_lock(inode);
        alive = !IS_DEADDIR(inode);

        return alive;
}

/*
 * Shared-lock counterpart of dir_relax(): drop and re-take the shared inode
 * lock, then report whether the directory is still not IS_DEADDIR().
 */
static inline bool dir_relax_shared(struct inode *inode)
{
        bool alive;

        inode_unlock_shared(inode);
        inode_lock_shared(inode);
        alive = !IS_DEADDIR(inode);

        return alive;
}

extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode);

/* mm/fadvise.c */
extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
                       int advice);
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
                           int advice);

/*
 * Decide whether (@dfd, @path) denotes an empty path.
 *
 * A negative @dfd is never empty.  A NULL @path counts as empty.  Otherwise
 * the first byte is fetched from user space: a fault yields false, and the
 * path is empty exactly when that byte is the NUL terminator.
 */
static inline bool vfs_empty_path(int dfd, const char __user *path)
{
        char first;

        if (dfd < 0)
                return false;

        /* NULL is accepted as a spelling of the empty path. */
        if (!path)
                return true;

        if (unlikely(get_user(first, path)))
                return false;

        return first == '\0';
}

int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);

/*
 * Check that @cmd_a is an acceptable variant of the extensible ioctl @cmd_b.
 *
 * Direction, type and number of both commands must match, and the size
 * encoded in @cmd_a must be at least @min_size.  The size encoded in @cmd_b
 * is not examined.
 */
static inline bool extensible_ioctl_valid(unsigned int cmd_a,
                                          unsigned int cmd_b, size_t min_size)
{
        const bool same_identity = _IOC_DIR(cmd_a) == _IOC_DIR(cmd_b) &&
                                   _IOC_TYPE(cmd_a) == _IOC_TYPE(cmd_b) &&
                                   _IOC_NR(cmd_a) == _IOC_NR(cmd_b);

        return same_identity && !(_IOC_SIZE(cmd_a) < min_size);
}

#endif /* _LINUX_FS_H */






























































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_IP_TUNNELS_H
#define __NET_IP_TUNNELS_H 1

#include <linux/if_tunnel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/types.h>
#include <linux/u64_stats_sync.h>
#include <linux/bitops.h>

#include <net/dsfield.h>
#include <net/flow.h>
#include <net/gro_cells.h>
#include <net/inet_dscp.h>
#include <net/inet_ecn.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/lwtunnel.h>
#include <net/dst_cache.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/* Keep error state on tunnel for 30 sec */
#define IPTUNNEL_ERR_TIMEO        (30*HZ)

/* Used to memset ip_tunnel padding. */
#define IP_TUNNEL_KEY_SIZE        offsetofend(struct ip_tunnel_key, tp_dst)

/* Used to memset ipv4 address padding. */
#define IP_TUNNEL_KEY_IPV4_PAD        offsetofend(struct ip_tunnel_key, u.ipv4.dst)
#define IP_TUNNEL_KEY_IPV4_PAD_LEN                                \
        (sizeof_field(struct ip_tunnel_key, u) -                \
         sizeof_field(struct ip_tunnel_key, u.ipv4))

/*
 * Invoke @op with the given arguments followed by __IP_TUNNEL_FLAG_NUM as
 * the trailing argument, so callers of the wrappers below never spell out
 * the size of the tunnel flags bitmap themselves.
 */
#define __ipt_flag_op(op, ...)                                        \
        op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM)

/* Declare a tunnel flags bitmap (DECLARE_BITMAP with the flag count). */
#define IP_TUNNEL_DECLARE_FLAGS(...)                                \
        __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__)

/* Thin wrappers around the bitmap_*() helpers of the same name. */
#define ip_tunnel_flags_zero(...)        __ipt_flag_op(bitmap_zero, __VA_ARGS__)
#define ip_tunnel_flags_copy(...)        __ipt_flag_op(bitmap_copy, __VA_ARGS__)
#define ip_tunnel_flags_and(...)        __ipt_flag_op(bitmap_and, __VA_ARGS__)
#define ip_tunnel_flags_or(...)                __ipt_flag_op(bitmap_or, __VA_ARGS__)

#define ip_tunnel_flags_empty(...)                                \
        __ipt_flag_op(bitmap_empty, __VA_ARGS__)
#define ip_tunnel_flags_intersect(...)                                \
        __ipt_flag_op(bitmap_intersects, __VA_ARGS__)
#define ip_tunnel_flags_subset(...)                                \
        __ipt_flag_op(bitmap_subset, __VA_ARGS__)

/*
 * Tunnel key.  IP_TUNNEL_KEY_SIZE ends at tp_dst; ip_tunnel_key_init() zeroes
 * everything that follows it (flow_flags and any trailing padding) as well as
 * the part of the address union not covered by the IPv4 addresses.
 */
struct ip_tunnel_key {
        __be64                        tun_id;                /* tunnel id, big endian */
        union {
                struct {
                        __be32        src;
                        __be32        dst;
                } ipv4;
                struct {
                        struct in6_addr src;
                        struct in6_addr dst;
                } ipv6;
        } u;                                        /* outer addresses, v4 or v6 */
        IP_TUNNEL_DECLARE_FLAGS(tun_flags);        /* IP_TUNNEL_*_BIT bitmap */
        __be32                        label;                /* Flow Label for IPv6 */
        u32                        nhid;
        u8                        tos;                /* TOS for IPv4, TC for IPv6 */
        u8                        ttl;                /* TTL for IPv4, HL for IPv6 */
        __be16                        tp_src;
        __be16                        tp_dst;                /* last member inside IP_TUNNEL_KEY_SIZE */
        __u8                        flow_flags;
};

/*
 * Encapsulation parameters of a tunnel.  @type is compared against
 * TUNNEL_ENCAP_NONE and otherwise used as the index into iptun_encaps[] by
 * ip_encap_hlen() and ip_tunnel_encap().
 */
struct ip_tunnel_encap {
        u16                        type;
        u16                        flags;
        __be16                        sport;        /* big endian; presumably source port - confirm */
        __be16                        dport;        /* big endian; presumably destination port - confirm */
};

/* Flags for ip_tunnel_info mode. */
#define IP_TUNNEL_INFO_TX        0x01        /* represents tx tunnel parameters */
#define IP_TUNNEL_INFO_IPV6        0x02        /* key contains IPv6 addresses */
#define IP_TUNNEL_INFO_BRIDGE        0x04        /* represents a bridged tunnel id */

/*
 * Maximum tunnel options length: the largest value representable in
 * ip_tunnel_info::options_len (all bits of that field set).
 */
#define IP_TUNNEL_OPTS_MAX                                        \
        GENMASK((sizeof_field(struct ip_tunnel_info,                \
                              options_len) * BITS_PER_BYTE) - 1, 0)

/*
 * Yield a pointer to the options area of @info as void *.  _Generic keeps
 * the result const-qualified when @info is a pointer to const.
 */
#define ip_tunnel_info_opts(info)                                \
        _Generic(info,                                                \
                 const struct ip_tunnel_info * : ((const void *)(info)->options),\
                 struct ip_tunnel_info * : ((void *)(info)->options)\
        )

/*
 * Tunnel metadata: key, encapsulation parameters and a variable-length
 * options area stored directly behind the structure.
 */
struct ip_tunnel_info {
        struct ip_tunnel_key        key;
        struct ip_tunnel_encap        encap;
#ifdef CONFIG_DST_CACHE
        struct dst_cache        dst_cache;
#endif
        u8                        options_len;        /* number of bytes in options[] */
        u8                        mode;                /* IP_TUNNEL_INFO_* flags */
        u8                        options[] __aligned_largest __counted_by(options_len);
};

/*
 * 6rd prefix/relay information.
 * Only defined with CONFIG_IPV6_SIT_6RD; embedded in struct ip_tunnel as
 * the ip6rd member.
 */
#ifdef CONFIG_IPV6_SIT_6RD
struct ip_tunnel_6rd_parm {
        struct in6_addr                prefix;
        __be32                        relay_prefix;
        u16                        prefixlen;        /* presumably length of prefix in bits - confirm */
        u16                        relay_prefixlen;        /* presumably length of relay_prefix in bits - confirm */
};
#endif

/*
 * One entry of the list hanging off ip_tunnel::prl ("potential router
 * list"); entries are chained through the __rcu @next pointer.
 */
struct ip_tunnel_prl_entry {
        struct ip_tunnel_prl_entry __rcu *next;
        __be32                                addr;
        u16                                flags;
        struct rcu_head                        rcu_head;        /* presumably for RCU-deferred freeing - confirm */
};

struct metadata_dst;

/*
 * Kernel-side variant of ip_tunnel_parm.  The input/output flags are kept
 * as tunnel flag bitmaps (IP_TUNNEL_DECLARE_FLAGS).  Embedded in
 * struct ip_tunnel as the parms member.
 */
struct ip_tunnel_parm_kern {
        char                        name[IFNAMSIZ];
        IP_TUNNEL_DECLARE_FLAGS(i_flags);
        IP_TUNNEL_DECLARE_FLAGS(o_flags);
        __be32                        i_key;
        __be32                        o_key;
        int                        link;
        struct iphdr                iph;
};

/* Per-device state of an IP tunnel. */
struct ip_tunnel {
        struct ip_tunnel __rcu        *next;
        struct hlist_node hash_node;

        struct net_device        *dev;
        netdevice_tracker        dev_tracker;

        struct net                *net;        /* netns for packet i/o */

        unsigned long        err_time;        /* Time when the last ICMP error
                                         * arrived */
        int                err_count;        /* Number of arrived ICMP errors */

        /* These three fields used only by GRE */
        u32                i_seqno;        /* The last seen seqno        */
        atomic_t        o_seqno;        /* The last output seqno */
        int                tun_hlen;        /* Precalculated header length */

        /* These four fields used only by ERSPAN */
        u32                index;                /* ERSPAN type II index */
        u8                erspan_ver;        /* ERSPAN version */
        u8                dir;                /* ERSPAN direction */
        u16                hwid;                /* ERSPAN hardware ID */

        struct dst_cache dst_cache;

        struct ip_tunnel_parm_kern parms;

        int                mlink;
        int                encap_hlen;        /* Encap header length (FOU,GUE) */
        int                hlen;                /* tun_hlen + encap_hlen */
        struct ip_tunnel_encap encap;

        /* for SIT */
#ifdef CONFIG_IPV6_SIT_6RD
        struct ip_tunnel_6rd_parm ip6rd;
#endif
        struct ip_tunnel_prl_entry __rcu *prl;        /* potential router list */
        unsigned int                prl_count;        /* # of entries in PRL */
        unsigned int                ip_tnl_net_id;
        struct gro_cells        gro_cells;
        __u32                        fwmark;
        bool                        collect_md;
        bool                        ignore_df;
};

/*
 * Per-packet tunnel information; passed (const) to ip_tunnel_rcv().
 * NOTE(review): field meanings below are inferred from names - confirm.
 */
struct tnl_ptk_info {
        IP_TUNNEL_DECLARE_FLAGS(flags);        /* tunnel flags bitmap */
        __be16 proto;
        __be32 key;
        __be32 seq;
        int hdr_len;
};

#define PACKET_RCVD        0
#define PACKET_REJECT        1
#define PACKET_NEXT        2

/* The tunnel hash table has 2^IP_TNL_HASH_BITS (128) buckets. */
#define IP_TNL_HASH_BITS   7
#define IP_TNL_HASH_SIZE   (1 << IP_TNL_HASH_BITS)

/* Tunnel state kept per instance of struct ip_tunnel_net. */
struct ip_tunnel_net {
        struct net_device *fb_tunnel_dev;        /* presumably the fallback tunnel device - confirm */
        struct rtnl_link_ops *rtnl_link_ops;
        struct hlist_head tunnels[IP_TNL_HASH_SIZE];        /* hash buckets of tunnels */
        struct ip_tunnel __rcu *collect_md_tun;
        int type;
};

/*
 * Set, in @mask, the bit of every tunnel type that carries options (GENEVE,
 * VXLAN, ERSPAN, GTP and PFCP).  @mask is expected to be zero-initialised by
 * the caller.  Keeping the list in one place means a new option type only
 * has to be added here instead of in each of the three users below.
 */
static inline void __ip_tunnel_fill_options_mask(unsigned long *mask)
{
        __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, mask);
        __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, mask);
        __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, mask);
        __set_bit(IP_TUNNEL_GTP_OPT_BIT, mask);
        __set_bit(IP_TUNNEL_PFCP_OPT_BIT, mask);
}

/* Set all "options present" bits in @flags; other bits are untouched. */
static inline void ip_tunnel_set_options_present(unsigned long *flags)
{
        IP_TUNNEL_DECLARE_FLAGS(present) = { };

        __ip_tunnel_fill_options_mask(present);

        ip_tunnel_flags_or(flags, flags, present);
}

/* Clear all "options present" bits in @flags; other bits are untouched. */
static inline void ip_tunnel_clear_options_present(unsigned long *flags)
{
        IP_TUNNEL_DECLARE_FLAGS(present) = { };

        __ip_tunnel_fill_options_mask(present);

        __ipt_flag_op(bitmap_andnot, flags, flags, present);
}

/* Return true when at least one "options present" bit is set in @flags. */
static inline bool ip_tunnel_is_options_present(const unsigned long *flags)
{
        IP_TUNNEL_DECLARE_FLAGS(present) = { };

        __ip_tunnel_fill_options_mask(present);

        return ip_tunnel_flags_intersect(flags, present);
}

static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags)
{
        IP_TUNNEL_DECLARE_FLAGS(supp) = { };

        bitmap_set(supp, 0, BITS_PER_TYPE(__be16));
        __set_bit(IP_TUNNEL_VTI_BIT, supp);

        return ip_tunnel_flags_subset(flags, supp);
}

/*
 * Build the tunnel flags bitmap @dst from the __be16 value @flags.
 *
 * @dst is cleared first, the CPU-order value of @flags is written into its
 * low BITS_PER_TYPE(__be16) bits, and IP_TUNNEL_VTI_BIT is then set or
 * cleared according to whether VTI_ISVTI is present in @flags.
 */
static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags)
{
        ip_tunnel_flags_zero(dst);

        bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16));
        __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI);
}

/*
 * Convert the tunnel flags bitmap @flags to its __be16 form: the low
 * BITS_PER_TYPE(__be16) bits converted to big endian, with VTI_ISVTI or-ed
 * in when IP_TUNNEL_VTI_BIT is set.
 */
static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags)
{
        __be16 ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16)));

        if (!test_bit(IP_TUNNEL_VTI_BIT, flags))
                return ret;

        return ret | VTI_ISVTI;
}

/*
 * Initialise @key for a tunnel with IPv4 outer addresses.
 *
 * Stores the given values into the matching members, zeroes the part of the
 * address union not covered by the IPv4 addresses and zeroes everything
 * located after tp_dst (see IP_TUNNEL_KEY_SIZE).  key->nhid is not written
 * by this helper.
 */
static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
                                      __be32 saddr, __be32 daddr,
                                      u8 tos, u8 ttl, __be32 label,
                                      __be16 tp_src, __be16 tp_dst,
                                      __be64 tun_id,
                                      const unsigned long *tun_flags)
{
        key->tun_id = tun_id;
        key->u.ipv4.src = saddr;
        key->u.ipv4.dst = daddr;
        /* Zero the remainder of the union behind the IPv4 addresses. */
        memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD,
               0, IP_TUNNEL_KEY_IPV4_PAD_LEN);
        key->tos = tos;
        key->ttl = ttl;
        key->label = label;
        ip_tunnel_flags_copy(key->tun_flags, tun_flags);

        /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
         * the upper tunnel are used.
         * E.g: GRE over IPSEC, the tp_src and tp_dst are zero.
         */
        key->tp_src = tp_src;
        key->tp_dst = tp_dst;

        /* Clear struct padding. */
        if (sizeof(*key) != IP_TUNNEL_KEY_SIZE)
                memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE,
                       0, sizeof(*key) - IP_TUNNEL_KEY_SIZE);
}

/*
 * Tell whether the dst cache may be used for @skb.
 *
 * A non-zero skb->mark rules it out.  Otherwise the cache is usable when
 * there is no @info, or when @info does not have IP_TUNNEL_NOCACHE_BIT set.
 */
static inline bool
ip_tunnel_dst_cache_usable(const struct sk_buff *skb,
                           const struct ip_tunnel_info *info)
{
        if (skb->mark)
                return false;
        if (!info)
                return true;

        return !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
}

/* Address family of @tun_info: AF_INET6 if IP_TUNNEL_INFO_IPV6 is set in mode, else AF_INET. */
static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
                                               *tun_info)
{
        if (tun_info->mode & IP_TUNNEL_INFO_IPV6)
                return AF_INET6;

        return AF_INET;
}

/*
 * Widen a big-endian 32-bit key into a big-endian 64-bit tunnel id whose
 * least-significant 32 bits are the key and whose upper 32 bits are zero.
 * On little-endian hosts this needs a shift of the raw value by 32; on
 * big-endian hosts a plain widening cast is enough.
 */
static inline __be64 key32_to_tunnel_id(__be32 key)
{
#ifdef __BIG_ENDIAN
        return (__force __be64)key;
#else
        return (__force __be64)((__force u64)key << 32);
#endif
}

/*
 * Returns the least-significant 32 bits of a __be64.
 * Inverse of key32_to_tunnel_id(): a truncating cast on big-endian hosts, a
 * right shift of the raw value by 32 on little-endian hosts.
 */
static inline __be32 tunnel_id_to_key32(__be64 tun_id)
{
#ifdef __BIG_ENDIAN
        return (__force __be32)tun_id;
#else
        return (__force __be32)((__force u64)tun_id >> 32);
#endif
}

#ifdef CONFIG_INET

/*
 * Initialise @fl4 for a tunnel route lookup.
 *
 * @fl4 is zeroed first.  With a non-zero @oif the l3mdev master of that
 * interface is looked up in @net; the output interface itself is only kept
 * in the flow when no such master exists.  The remaining arguments are
 * stored into the matching flow members, with @tos converted to a DSCP
 * value through inet_dsfield_to_dscp().
 */
static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
                                       int proto,
                                       __be32 daddr, __be32 saddr,
                                       __be32 key, __u8 tos,
                                       struct net *net, int oif,
                                       __u32 mark, __u32 tun_inner_hash,
                                       __u8 flow_flags)
{
        memset(fl4, 0, sizeof(*fl4));

        if (oif) {
                fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index(net, oif);
                /* Legacy VRF/l3mdev use case */
                if (fl4->flowi4_l3mdev)
                        fl4->flowi4_oif = 0;
                else
                        fl4->flowi4_oif = oif;
        }

        /* addressing */
        fl4->saddr = saddr;
        fl4->daddr = daddr;

        /* protocol selectors */
        fl4->flowi4_proto = proto;
        fl4->fl4_gre_key = key;
        fl4->flowi4_dscp = inet_dsfield_to_dscp(tos);

        /* policy inputs */
        fl4->flowi4_mark = mark;
        fl4->flowi4_flags = flow_flags;
        fl4->flowi4_multipath_hash = tun_inner_hash;
}

int ip_tunnel_init(struct net_device *dev);
void ip_tunnel_uninit(struct net_device *dev);
void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
struct net *ip_tunnel_get_link_net(const struct net_device *dev);
int ip_tunnel_get_iflink(const struct net_device *dev);
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname);
void ip_tunnel_delete_net(struct net *net, unsigned int id,
                          struct rtnl_link_ops *ops,
                          struct list_head *dev_to_kill);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, const u8 protocol);
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       const u8 proto, int tunnel_hlen);
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
                  int cmd);
bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
                              const void __user *data);
bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp);
int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
                             void __user *data, int cmd);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);

struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, const unsigned long *flags,
                                   __be32 remote, __be32 local,
                                   __be32 key);

void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info);
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm_kern *p, __u32 fwmark);
int ip_tunnel_newlink(struct net *net, struct net_device *dev,
                      struct nlattr *tb[], struct ip_tunnel_parm_kern *p,
                      __u32 fwmark);
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);

bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
                                   struct ip_tunnel_encap *encap);

void ip_tunnel_netlink_parms(struct nlattr *data[],
                             struct ip_tunnel_parm_kern *parms);

extern const struct header_ops ip_tunnel_header_ops;
__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb);

/*
 * Callbacks of one encapsulation type; instances are looked up through
 * iptun_encaps[], indexed by ip_tunnel_encap::type.
 */
struct ip_tunnel_encap_ops {
        /* header length for @e; called by ip_encap_hlen() */
        size_t (*encap_hlen)(struct ip_tunnel_encap *e);
        /* build the encapsulation header; called by ip_tunnel_encap() */
        int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
                            u8 *protocol, struct flowi4 *fl4);
        /* NOTE(review): presumably handles ICMP errors for this encap - confirm */
        int (*err_handler)(struct sk_buff *skb, u32 info);
};

#define MAX_IPTUN_ENCAP_OPS 8

extern const struct ip_tunnel_encap_ops __rcu *
                iptun_encaps[MAX_IPTUN_ENCAP_OPS];

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op,
                            unsigned int num);
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
                            unsigned int num);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap);

/*
 * Check that the base network header matching skb->protocol can be pulled.
 *
 * The length requested is sizeof(struct iphdr) for ETH_P_IP,
 * sizeof(struct ipv6hdr) for ETH_P_IPV6 (only when IPv6 is enabled) and 0
 * for every other protocol.  Returns the result of
 * pskb_network_may_pull_reason() for that length.
 */
static inline enum skb_drop_reason
pskb_inet_may_pull_reason(struct sk_buff *skb)
{
        int nhlen = 0;

        if (skb->protocol == htons(ETH_P_IP))
                nhlen = sizeof(struct iphdr);
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6))
                nhlen = sizeof(struct ipv6hdr);
#endif

        return pskb_network_may_pull_reason(skb, nhlen);
}

static inline bool pskb_inet_may_pull(struct sk_buff *skb)
{
        return pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET;
}

/* Variant of pskb_inet_may_pull() that also accounts for the MAC header
 * (and VLAN tags) in front of the network header.
 *
 * @skb: packet to prepare.
 * @inner_proto_inherit: when true the MAC length starts at 0, otherwise it
 *        starts at ETH_HLEN.
 *
 * Returns SKB_NOT_DROPPED_YET after setting the network header offset to
 * the computed MAC length, or the reason reported by pskb_may_pull_reason()
 * when MAC plus base network header cannot be pulled.
 */
static inline enum skb_drop_reason
skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit)
{
        int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN;
        __be16 type = skb->protocol;
        enum skb_drop_reason reason;

        /* Essentially this is skb_protocol(skb, true)
         * And we get MAC len.
         */
        if (eth_type_vlan(type))
                type = __vlan_get_protocol(skb, type, &maclen);

        /* Any protocol other than IPv4/IPv6 leaves nhlen at 0. */
        switch (type) {
#if IS_ENABLED(CONFIG_IPV6)
        case htons(ETH_P_IPV6):
                nhlen = sizeof(struct ipv6hdr);
                break;
#endif
        case htons(ETH_P_IP):
                nhlen = sizeof(struct iphdr);
                break;
        }
        /* For ETH_P_IPV6/ETH_P_IP we make sure to pull
         * a base network header in skb->head.
         */
        reason = pskb_may_pull_reason(skb, maclen + nhlen);
        if (reason)
                return reason;

        skb_set_network_header(skb, maclen);

        return SKB_NOT_DROPPED_YET;
}

/*
 * Return the encapsulation header length for @e.
 *
 * TUNNEL_ENCAP_NONE yields 0.  A type outside iptun_encaps[], a missing ops
 * entry or a missing encap_hlen() callback yields -EINVAL.  The ops table
 * is read under rcu_read_lock().
 */
static inline int ip_encap_hlen(struct ip_tunnel_encap *e)
{
        const struct ip_tunnel_encap_ops *ops;
        int hlen;

        if (e->type == TUNNEL_ENCAP_NONE)
                return 0;
        if (e->type >= MAX_IPTUN_ENCAP_OPS)
                return -EINVAL;

        rcu_read_lock();
        ops = rcu_dereference(iptun_encaps[e->type]);
        if (likely(ops && ops->encap_hlen))
                hlen = ops->encap_hlen(e);
        else
                hlen = -EINVAL;
        rcu_read_unlock();

        return hlen;
}

/*
 * Build the encapsulation header described by @e on @skb.
 *
 * TUNNEL_ENCAP_NONE is a no-op returning 0.  A type outside iptun_encaps[],
 * a missing ops entry or a missing build_header() callback yields -EINVAL;
 * otherwise the callback's return value is passed on.  The ops table is
 * read under rcu_read_lock().
 */
static inline int ip_tunnel_encap(struct sk_buff *skb,
                                  struct ip_tunnel_encap *e,
                                  u8 *protocol, struct flowi4 *fl4)
{
        const struct ip_tunnel_encap_ops *ops;
        int ret;

        if (e->type == TUNNEL_ENCAP_NONE)
                return 0;
        if (e->type >= MAX_IPTUN_ENCAP_OPS)
                return -EINVAL;

        rcu_read_lock();
        ops = rcu_dereference(iptun_encaps[e->type]);
        if (likely(ops && ops->build_header))
                ret = ops->build_header(skb, e, protocol, fl4);
        else
                ret = -EINVAL;
        rcu_read_unlock();

        return ret;
}

/*
 * Extract dsfield from inner protocol: the IPv4 tos, the IPv6 dsfield, or 0
 * when skb_protocol(skb, true) is neither IPv4 nor IPv6.
 */
static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
                                       const struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case htons(ETH_P_IP):
                return iph->tos;
        case htons(ETH_P_IPV6):
                return ipv6_get_dsfield((const struct ipv6hdr *)iph);
        default:
                return 0;
        }
}

/* Flow label of the inner header when it is IPv6, 0 for anything else. */
static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph,
                                             const struct sk_buff *skb)
{
        if (skb_protocol(skb, true) != htons(ETH_P_IPV6))
                return 0;

        return ip6_flowlabel((const struct ipv6hdr *)iph);
}

/*
 * TTL of the inner header: the IPv4 ttl, the IPv6 hop_limit, or 0 when
 * skb_protocol(skb, true) is neither IPv4 nor IPv6.
 */
static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
                                       const struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case htons(ETH_P_IP):
                return iph->ttl;
        case htons(ETH_P_IPV6):
                return ((const struct ipv6hdr *)iph)->hop_limit;
        default:
                return 0;
        }
}

/*
 * Propagate ECN bits out: combine the outer @tos with the dsfield of the
 * inner header through INET_ECN_encapsulate().
 */
static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
                                     const struct sk_buff *skb)
{
        return INET_ECN_encapsulate(tos, ip_tunnel_get_dsfield(iph, skb));
}

int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
                           __be16 inner_proto, bool raw_proto, bool xnet);

/* __iptunnel_pull_header() with its raw_proto argument fixed to false. */
static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
                                       __be16 inner_proto, bool xnet)
{
        int ret = __iptunnel_pull_header(skb, hdr_len, inner_proto,
                                         false, xnet);

        return ret;
}

void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
                   __be32 src, __be32 dst, u8 proto,
                   u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
                                             gfp_t flags);
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
                          int headroom, bool reply);

/* Grow dev->needed_headroom to @headroom (clamped to an upper limit) when
 * that exceeds the value currently stored; the stored value is never lowered.
 */
static inline void ip_tunnel_adj_headroom(struct net_device *dev,
                                          unsigned int headroom)
{
        /* we must cap headroom to some upperlimit, else pskb_expand_head
         * will overflow header offsets in skb_headers_offset_update().
         */
        const unsigned int max_allowed = 512;
        unsigned int wanted = headroom < max_allowed ? headroom : max_allowed;

        if (READ_ONCE(dev->needed_headroom) < wanted)
                WRITE_ONCE(dev->needed_headroom, wanted);
}

int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask);

/* Clear the encapsulation state of @skb.  For GSO packets the skb is first
 * unshared with skb_unclone() and every NETIF_F_GSO_ENCAP_ALL bit is removed
 * from gso_type.  Returns 0 on success or the skb_unclone() error, in which
 * case skb->encapsulation is left untouched.
 */
static inline int iptunnel_pull_offloads(struct sk_buff *skb)
{
        if (skb_is_gso(skb)) {
                int rc = skb_unclone(skb, GFP_ATOMIC);

                if (unlikely(rc))
                        return rc;

                skb_shinfo(skb)->gso_type &=
                        ~(NETIF_F_GSO_ENCAP_ALL >> NETIF_F_GSO_SHIFT);
        }

        skb->encapsulation = 0;
        return 0;
}

/* Account the outcome of a tunnel transmit on @dev:
 *   @pkt_len  < 0: bump tx_errors and tx_aborted_errors,
 *   @pkt_len == 0: bump tx_dropped,
 *   @pkt_len  > 0: add @pkt_len bytes and one packet to the per-CPU tstats.
 */
static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
{
        struct pcpu_sw_netstats *tstats;

        if (pkt_len < 0) {
                DEV_STATS_INC(dev, tx_errors);
                DEV_STATS_INC(dev, tx_aborted_errors);
                return;
        }

        if (pkt_len == 0) {
                DEV_STATS_INC(dev, tx_dropped);
                return;
        }

        /* Successful transmit: update the counters inside a syncp section. */
        tstats = get_cpu_ptr(dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        u64_stats_add(&tstats->tx_bytes, pkt_len);
        u64_stats_inc(&tstats->tx_packets);
        u64_stats_update_end(&tstats->syncp);
        put_cpu_ptr(tstats);
}

/* Copy the info->options_len bytes of tunnel options attached to @info into
 * @to.  The caller must provide a buffer of at least that size.
 */
static inline void ip_tunnel_info_opts_get(void *to,
                                           const struct ip_tunnel_info *info)
{
        const void *opts = ip_tunnel_info_opts(info);

        memcpy(to, opts, info->options_len);
}

/* Record @len as the option length of @info.  When @len is positive, also
 * copy @len bytes from @from into the option area and OR @flags into
 * info->key.tun_flags.  For @len <= 0 only options_len is written.
 */
static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
                                           const void *from, int len,
                                           const unsigned long *flags)
{
        info->options_len = len;
        if (len <= 0)
                return;

        memcpy(ip_tunnel_info_opts(info), from, len);
        ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, flags);
}

/* Interpret the data area of @lwtstate as a struct ip_tunnel_info. */
static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate)
{
        struct ip_tunnel_info *info = (struct ip_tunnel_info *)lwtstate->data;

        return info;
}

DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);

/* Returns > 0 if metadata should be collected.
 *
 * The answer is the current state of the ip_tunnel_metadata_cnt static key,
 * tested with the "unlikely" branch variant.
 */
static inline int ip_tunnel_collect_metadata(void)
{
        return static_branch_unlikely(&ip_tunnel_metadata_cnt);
}

void __init ip_tunnel_core_init(void);

void ip_tunnel_need_metadata(void);
void ip_tunnel_unneed_metadata(void);

#else /* CONFIG_INET */

/* !CONFIG_INET stub: there is no tunnel info, so always return NULL. */
static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate)
{
        return NULL;
}

/* !CONFIG_INET stub: nothing to do. */
static inline void ip_tunnel_need_metadata(void)
{
}

/* !CONFIG_INET stub: nothing to do. */
static inline void ip_tunnel_unneed_metadata(void)
{
}

/* !CONFIG_INET stub: copies nothing; @to is left untouched. */
static inline void ip_tunnel_info_opts_get(void *to,
                                           const struct ip_tunnel_info *info)
{
}

/* !CONFIG_INET stub: ignores @from, @len and @flags and just records that
 * @info carries no options.
 */
static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
                                           const void *from, int len,
                                           const unsigned long *flags)
{
        info->options_len = 0;
}

#endif /* CONFIG_INET */

#endif /* __NET_IP_TUNNELS_H */




































    1 









    1 

    1 











































































































































































   17 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API.
 *
 * Deflate algorithm (RFC 1951), implemented here primarily for use
 * by IPCOMP (RFC 3173 & RFC 2394).
 *
 * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2023 Google, LLC. <ardb@kernel.org>
 * Copyright (c) 2025 Herbert Xu <herbert@gondor.apana.org.au>
 */
#include <crypto/internal/acompress.h>
#include <crypto/scatterwalk.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zlib.h>

/*
 * zlib parameters used for every stream in this file.  The window bits value
 * is passed negated to zlib_deflateInit2(), zlib_inflateInit2() and
 * zlib_deflate_workspacesize() below.
 */
#define DEFLATE_DEF_LEVEL                Z_DEFAULT_COMPRESSION
#define DEFLATE_DEF_WINBITS                11
#define DEFLATE_DEF_MEMLEVEL                MAX_MEM_LEVEL

/*
 * Per-stream context: the zlib stream state followed by the workspace memory
 * that stream.workspace is pointed at by deflate_alloc_stream().
 */
struct deflate_stream {
        struct z_stream_s stream;
        u8 workspace[];        /* sized at allocation time */
};

/* Held around the crypto_acomp_alloc_streams() call in deflate_init(). */
static DEFINE_MUTEX(deflate_stream_lock);

/*
 * Allocate one struct deflate_stream whose trailing workspace is large enough
 * for either inflate or deflate (the larger of the two zlib workspace sizes),
 * and point the zlib stream at that workspace.
 *
 * Returns the new context, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
static void *deflate_alloc_stream(void)
{
        struct deflate_stream *ds;
        size_t wsize;

        wsize = max(zlib_inflate_workspacesize(),
                    zlib_deflate_workspacesize(-DEFLATE_DEF_WINBITS,
                                               DEFLATE_DEF_MEMLEVEL));

        ds = kvmalloc(sizeof(*ds) + wsize, GFP_KERNEL);
        if (!ds)
                return ERR_PTR(-ENOMEM);

        ds->stream.workspace = ds->workspace;
        return ds;
}

/* Release a context obtained from deflate_alloc_stream() (kvmalloc memory). */
static void deflate_free_stream(void *ctx)
{
        kvfree(ctx);
}

/*
 * Stream set shared by all deflate transforms in this file; supplies the
 * context constructor/destructor used by the crypto_acomp stream helpers.
 */
static struct crypto_acomp_streams deflate_streams = {
        .alloc_ctx = deflate_alloc_stream,
        .free_ctx = deflate_free_stream,
};

/*
 * Compress the source of @req into its destination using the already
 * initialised zlib deflate stream in @ds.
 *
 * The outer loop hands zlib one destination chunk at a time; the inner loop
 * feeds it source chunks until that destination chunk is full or zlib stops
 * returning Z_OK.
 *
 * Returns 0 and sets req->dlen to the total output size on success,
 * the acomp_walk_virt() error if the walk cannot be started, -ENOSPC when no
 * destination chunk is available, and -EINVAL when zlib finishes with anything
 * other than Z_STREAM_END.
 */
static int deflate_compress_one(struct acomp_req *req,
                                struct deflate_stream *ds)
{
        struct z_stream_s *stream = &ds->stream;
        struct acomp_walk walk;
        int ret;

        ret = acomp_walk_virt(&walk, req, true);
        if (ret)
                return ret;

        do {
                unsigned int dcur;

                /* Next piece of the destination; none left means overflow. */
                dcur = acomp_walk_next_dst(&walk);
                if (!dcur)
                        return -ENOSPC;

                stream->avail_out = dcur;
                stream->next_out = walk.dst.virt.addr;

                do {
                        /* Z_FINISH unless the walk reports more source. */
                        int flush = Z_FINISH;
                        unsigned int scur;

                        stream->avail_in = 0;
                        stream->next_in = NULL;

                        scur = acomp_walk_next_src(&walk);
                        if (scur) {
                                if (acomp_walk_more_src(&walk, scur))
                                        flush = Z_NO_FLUSH;
                                stream->avail_in = scur;
                                stream->next_in = walk.src.virt.addr;
                        }

                        ret = zlib_deflate(stream, flush);

                        if (scur) {
                                /* Report only the bytes zlib consumed. */
                                scur -= stream->avail_in;
                                acomp_walk_done_src(&walk, scur);
                        }
                } while (ret == Z_OK && stream->avail_out);

                acomp_walk_done_dst(&walk, dcur);
        } while (ret == Z_OK);

        if (ret != Z_STREAM_END)
                return -EINVAL;

        req->dlen = stream->total_out;
        return 0;
}

/*
 * acomp .compress handler: lock a stream from deflate_streams, reset its
 * zlib deflate state with the default parameters of this file and run
 * deflate_compress_one() on it.
 *
 * Returns -EINVAL if zlib_deflateInit2() fails, otherwise whatever
 * deflate_compress_one() returns.  The stream is unlocked on every path.
 */
static int deflate_compress(struct acomp_req *req)
{
        struct crypto_acomp_stream *s;
        struct deflate_stream *ds;
        int err;

        s = crypto_acomp_lock_stream_bh(&deflate_streams);
        ds = s->ctx;

        if (zlib_deflateInit2(&ds->stream, DEFLATE_DEF_LEVEL, Z_DEFLATED,
                              -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL,
                              Z_DEFAULT_STRATEGY) == Z_OK)
                err = deflate_compress_one(req, ds);
        else
                err = -EINVAL;

        crypto_acomp_unlock_stream_bh(s);

        return err;
}

/*
 * Decompress the source of @req into its destination using the already
 * initialised zlib inflate stream in @ds.
 *
 * The outer loop takes one source chunk at a time; the inner loop inflates it
 * into successive destination chunks until the input is used up or zlib stops
 * returning Z_OK.
 *
 * Returns 0 and sets req->dlen to the total output size on success,
 * the acomp_walk_virt() error if the walk cannot be started, -ENOSPC when the
 * destination runs out, and -EINVAL when zlib finishes with anything other
 * than Z_STREAM_END.
 */
static int deflate_decompress_one(struct acomp_req *req,
                                  struct deflate_stream *ds)
{
        struct z_stream_s *stream = &ds->stream;
        bool out_of_space = false;
        struct acomp_walk walk;
        int ret;

        ret = acomp_walk_virt(&walk, req, true);
        if (ret)
                return ret;

        do {
                unsigned int scur;

                stream->avail_in = 0;
                stream->next_in = NULL;

                scur = acomp_walk_next_src(&walk);
                if (scur) {
                        stream->avail_in = scur;
                        stream->next_in = walk.src.virt.addr;
                }

                do {
                        unsigned int dcur;

                        dcur = acomp_walk_next_dst(&walk);
                        if (!dcur) {
                                /*
                                 * Defer the error so the source chunk is
                                 * still released below.
                                 */
                                out_of_space = true;
                                break;
                        }

                        stream->avail_out = dcur;
                        stream->next_out = walk.dst.virt.addr;

                        ret = zlib_inflate(stream, Z_NO_FLUSH);

                        /* Report only the bytes zlib actually produced. */
                        dcur -= stream->avail_out;
                        acomp_walk_done_dst(&walk, dcur);
                } while (ret == Z_OK && stream->avail_in);

                if (scur)
                        acomp_walk_done_src(&walk, scur);

                if (out_of_space)
                        return -ENOSPC;
        } while (ret == Z_OK);

        if (ret != Z_STREAM_END)
                return -EINVAL;

        req->dlen = stream->total_out;
        return 0;
}

/*
 * acomp .decompress handler: lock a stream from deflate_streams, reset its
 * zlib inflate state with the default window size of this file and run
 * deflate_decompress_one() on it.
 *
 * Returns -EINVAL if zlib_inflateInit2() fails, otherwise whatever
 * deflate_decompress_one() returns.  The stream is unlocked on every path.
 */
static int deflate_decompress(struct acomp_req *req)
{
        struct crypto_acomp_stream *s;
        struct deflate_stream *ds;
        int err;

        s = crypto_acomp_lock_stream_bh(&deflate_streams);
        ds = s->ctx;

        if (zlib_inflateInit2(&ds->stream, -DEFLATE_DEF_WINBITS) == Z_OK)
                err = deflate_decompress_one(req, ds);
        else
                err = -EINVAL;

        crypto_acomp_unlock_stream_bh(s);

        return err;
}

/*
 * acomp .init handler: allocate the shared deflate_streams under
 * deflate_stream_lock and return the crypto_acomp_alloc_streams() result.
 * @tfm itself is not used.
 */
static int deflate_init(struct crypto_acomp *tfm)
{
        int ret;

        mutex_lock(&deflate_stream_lock);
        ret = crypto_acomp_alloc_streams(&deflate_streams);
        mutex_unlock(&deflate_stream_lock);

        return ret;
}

/* Algorithm descriptor registered by deflate_mod_init(). */
static struct acomp_alg acomp = {
        .compress                = deflate_compress,
        .decompress                = deflate_decompress,
        .init                        = deflate_init,
        .base.cra_name                = "deflate",
        .base.cra_driver_name        = "deflate-generic",
        .base.cra_flags                = CRYPTO_ALG_REQ_VIRT,
        .base.cra_module        = THIS_MODULE,
};

/* Module entry point: register the deflate acomp algorithm. */
static int __init deflate_mod_init(void)
{
        return crypto_register_acomp(&acomp);
}

/*
 * Module exit: unregister the algorithm first, then free the shared streams
 * it was using.
 */
static void __exit deflate_mod_fini(void)
{
        crypto_unregister_acomp(&acomp);
        crypto_acomp_free_streams(&deflate_streams);
}

module_init(deflate_mod_init);
module_exit(deflate_mod_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Deflate Compression Algorithm for IPCOMP");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>");
MODULE_AUTHOR("Herbert Xu <herbert@gondor.apana.org.au>");
MODULE_ALIAS_CRYPTO("deflate");
MODULE_ALIAS_CRYPTO("deflate-generic");




























































































































    2 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Crypto API support for SHA-1 and HMAC-SHA1
 *
 * Copyright (c) Alan Smithee.
 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
 * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
 * Copyright 2025 Google LLC
 */
#include <crypto/internal/hash.h>
#include <crypto/sha1.h>
#include <linux/kernel.h>
#include <linux/module.h>

/*
 * Export and import functions.  crypto_shash wants a particular format that
 * matches that used by some legacy drivers.  It currently is the same as the
 * library SHA context, except the value in bytecount must be block-aligned and
 * the remainder must be stored in an extra u8 appended to the struct.
 */

/* Exported state size: an image of struct sha1_ctx plus one trailing byte. */
#define SHA1_SHASH_STATE_SIZE (sizeof(struct sha1_ctx) + 1)
/*
 * Compile-time checks that struct sha1_ctx and struct sha1_state have the
 * same size and keep their state, byte count and buffer at equal offsets.
 */
static_assert(sizeof(struct sha1_ctx) == sizeof(struct sha1_state));
static_assert(offsetof(struct sha1_ctx, state) == offsetof(struct sha1_state, state));
static_assert(offsetof(struct sha1_ctx, bytecount) == offsetof(struct sha1_state, count));
static_assert(offsetof(struct sha1_ctx, buf) == offsetof(struct sha1_state, buffer));

static int __crypto_sha1_export(const struct sha1_ctx *ctx0, void *out)
{
        struct sha1_ctx ctx = *ctx0;
        unsigned int partial;
        u8 *p = out;

        partial = ctx.bytecount % SHA1_BLOCK_SIZE;
        ctx.bytecount -= partial;
        memcpy(p, &ctx, sizeof(ctx));
        p += sizeof(ctx);
        *p = partial;
        return 0;
}

/*
 * Inverse of __crypto_sha1_export(): load the context image from @in and add
 * the trailing remainder byte back onto bytecount.  Always returns 0.
 */
static int __crypto_sha1_import(struct sha1_ctx *ctx, const void *in)
{
        const u8 *src = in;

        memcpy(ctx, src, sizeof(*ctx));
        ctx->bytecount += src[sizeof(*ctx)];
        return 0;
}

/*
 * Copy the leading part of @ctx, i.e. every field located before buf, into
 * @out.  Always returns 0.
 */
static int __crypto_sha1_export_core(const struct sha1_ctx *ctx, void *out)
{
        const size_t core_len = offsetof(struct sha1_ctx, buf);

        memcpy(out, ctx, core_len);
        return 0;
}

/*
 * Overwrite the leading part of @ctx, i.e. every field located before buf,
 * with the bytes at @in; buf itself is not touched.  Always returns 0.
 */
static int __crypto_sha1_import_core(struct sha1_ctx *ctx, const void *in)
{
        const size_t core_len = offsetof(struct sha1_ctx, buf);

        memcpy(ctx, in, core_len);
        return 0;
}

/* SHA-1 digest of the zero-length message, exported for other GPL modules. */
const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = {
        0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d,
        0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90,
        0xaf, 0xd8, 0x07, 0x09
};
EXPORT_SYMBOL_GPL(sha1_zero_message_hash);

/* View the per-request descriptor context of @desc as a struct sha1_ctx. */
#define SHA1_CTX(desc) ((struct sha1_ctx *)shash_desc_ctx(desc))

/* shash .init: reset the SHA-1 context stored in @desc.  Always returns 0. */
static int crypto_sha1_init(struct shash_desc *desc)
{
        struct sha1_ctx *ctx = SHA1_CTX(desc);

        sha1_init(ctx);
        return 0;
}

/* shash .update: feed @len bytes at @data into the SHA-1 context of @desc.
 * Always returns 0.
 */
static int crypto_sha1_update(struct shash_desc *desc,
                              const u8 *data, unsigned int len)
{
        struct sha1_ctx *ctx = SHA1_CTX(desc);

        sha1_update(ctx, data, len);
        return 0;
}

/* shash .final: finish the hash held in @desc and write the digest to @out.
 * Always returns 0.
 */
static int crypto_sha1_final(struct shash_desc *desc, u8 *out)
{
        struct sha1_ctx *ctx = SHA1_CTX(desc);

        sha1_final(ctx, out);
        return 0;
}

/*
 * shash .digest: one-shot hash of @len bytes at @data into @out via the
 * library sha1() helper.  The descriptor context in @desc is not used.
 * Always returns 0.
 */
static int crypto_sha1_digest(struct shash_desc *desc,
                              const u8 *data, unsigned int len, u8 *out)
{
        sha1(data, len, out);
        return 0;
}

/* shash .export: serialise the SHA-1 context of @desc into @out. */
static int crypto_sha1_export(struct shash_desc *desc, void *out)
{
        const struct sha1_ctx *ctx = SHA1_CTX(desc);

        return __crypto_sha1_export(ctx, out);
}

/* shash .import: restore the SHA-1 context of @desc from @in. */
static int crypto_sha1_import(struct shash_desc *desc, const void *in)
{
        struct sha1_ctx *ctx = SHA1_CTX(desc);

        return __crypto_sha1_import(ctx, in);
}

/* shash .export_core: copy the part of the context before buf into @out. */
static int crypto_sha1_export_core(struct shash_desc *desc, void *out)
{
        const struct sha1_ctx *ctx = SHA1_CTX(desc);

        return __crypto_sha1_export_core(ctx, out);
}

/* shash .import_core: load the part of the context before buf from @in. */
static int crypto_sha1_import_core(struct shash_desc *desc, const void *in)
{
        struct sha1_ctx *ctx = SHA1_CTX(desc);

        return __crypto_sha1_import_core(ctx, in);
}

/* View the transform context of @tfm as the prepared HMAC-SHA1 key. */
#define HMAC_SHA1_KEY(tfm) ((struct hmac_sha1_key *)crypto_shash_ctx(tfm))
/* View the per-request descriptor context of @desc as an HMAC-SHA1 context. */
#define HMAC_SHA1_CTX(desc) ((struct hmac_sha1_ctx *)shash_desc_ctx(desc))

/* shash .setkey: prepare the HMAC key stored in @tfm from the @keylen bytes
 * at @raw_key.  Always returns 0.
 */
static int crypto_hmac_sha1_setkey(struct crypto_shash *tfm,
                                   const u8 *raw_key, unsigned int keylen)
{
        struct hmac_sha1_key *key = HMAC_SHA1_KEY(tfm);

        hmac_sha1_preparekey(key, raw_key, keylen);
        return 0;
}

/* shash .init: start a new HMAC computation in @desc using the key prepared
 * on its transform.  Always returns 0.
 */
static int crypto_hmac_sha1_init(struct shash_desc *desc)
{
        struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc);
        const struct hmac_sha1_key *key = HMAC_SHA1_KEY(desc->tfm);

        hmac_sha1_init(ctx, key);
        return 0;
}

/* shash .update: feed @len bytes at @data into the HMAC context of @desc.
 * Always returns 0.
 */
static int crypto_hmac_sha1_update(struct shash_desc *desc,
                                   const u8 *data, unsigned int len)
{
        struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc);

        hmac_sha1_update(ctx, data, len);
        return 0;
}

/* shash .final: finish the HMAC held in @desc and write the MAC to @out.
 * Always returns 0.
 */
static int crypto_hmac_sha1_final(struct shash_desc *desc, u8 *out)
{
        struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc);

        hmac_sha1_final(ctx, out);
        return 0;
}

/* shash .digest: one-shot HMAC of @len bytes at @data into @out using the key
 * prepared on the transform of @desc.  Always returns 0.
 */
static int crypto_hmac_sha1_digest(struct shash_desc *desc,
                                   const u8 *data, unsigned int len, u8 *out)
{
        const struct hmac_sha1_key *key = HMAC_SHA1_KEY(desc->tfm);

        hmac_sha1(key, data, len, out);
        return 0;
}

static int crypto_hmac_sha1_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha1_export(&HMAC_SHA1_CTX(desc)->sha_ctx, out);
}

static int crypto_hmac_sha1_import(struct shash_desc *desc, const void *in)
{
        struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc);

        ctx->ostate = HMAC_SHA1_KEY(desc->tfm)->ostate;
        return __crypto_sha1_import(&ctx->sha_ctx, in);
}

static int crypto_hmac_sha1_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha1_export_core(&HMAC_SHA1_CTX(desc)->sha_ctx, out);
}

static int crypto_hmac_sha1_import_core(struct shash_desc *desc, const void *in)
{
        struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc);

        ctx->ostate = HMAC_SHA1_KEY(desc->tfm)->ostate;
        return __crypto_sha1_import_core(&ctx->sha_ctx, in);
}

/*
 * The two shash algorithms this module registers: plain SHA-1 and HMAC-SHA1.
 * Both use the same export format (SHA1_SHASH_STATE_SIZE); only the HMAC
 * entry has a setkey handler and a transform context (the prepared key).
 */
static struct shash_alg algs[] = {
        {
                .base.cra_name                = "sha1",
                .base.cra_driver_name        = "sha1-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA1_BLOCK_SIZE,
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA1_DIGEST_SIZE,
                .init                        = crypto_sha1_init,
                .update                        = crypto_sha1_update,
                .final                        = crypto_sha1_final,
                .digest                        = crypto_sha1_digest,
                .export                        = crypto_sha1_export,
                .import                        = crypto_sha1_import,
                .export_core                = crypto_sha1_export_core,
                .import_core                = crypto_sha1_import_core,
                .descsize                = sizeof(struct sha1_ctx),
                .statesize                = SHA1_SHASH_STATE_SIZE,
        },
        {
                .base.cra_name                = "hmac(sha1)",
                .base.cra_driver_name        = "hmac-sha1-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA1_BLOCK_SIZE,
                .base.cra_ctxsize        = sizeof(struct hmac_sha1_key),
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA1_DIGEST_SIZE,
                .setkey                        = crypto_hmac_sha1_setkey,
                .init                        = crypto_hmac_sha1_init,
                .update                        = crypto_hmac_sha1_update,
                .final                        = crypto_hmac_sha1_final,
                .digest                        = crypto_hmac_sha1_digest,
                .export                        = crypto_hmac_sha1_export,
                .import                        = crypto_hmac_sha1_import,
                .export_core                = crypto_hmac_sha1_export_core,
                .import_core                = crypto_hmac_sha1_import_core,
                .descsize                = sizeof(struct hmac_sha1_ctx),
                .statesize                = SHA1_SHASH_STATE_SIZE,
        },
};

/* Module entry point: register every algorithm in algs[]. */
static int __init crypto_sha1_mod_init(void)
{
        return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}
module_init(crypto_sha1_mod_init);

/* Module exit: unregister every algorithm in algs[]. */
static void __exit crypto_sha1_mod_exit(void)
{
        crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
module_exit(crypto_sha1_mod_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Crypto API support for SHA-1 and HMAC-SHA1");

MODULE_ALIAS_CRYPTO("sha1");
MODULE_ALIAS_CRYPTO("sha1-lib");
MODULE_ALIAS_CRYPTO("hmac(sha1)");
MODULE_ALIAS_CRYPTO("hmac-sha1-lib");






























  312 

  307 

  317 














    7 


    7 




    7 


    7 



    7 







































































































































































































































  317 

  318 


  319 
  319 
    7 
    7 

























  319 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* I/O iterator iteration building functions.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_IOV_ITER_H
#define _LINUX_IOV_ITER_H

#include <linux/uio.h>
#include <linux/bvec.h>
#include <linux/folio_queue.h>

/*
 * Step callbacks used by the iterate_*() helpers below.  Each is handed the
 * address of the current segment, the number of bytes already processed by
 * earlier steps, the segment length and the two private pointers, and returns
 * the number of bytes of the segment it did NOT process (0 == all done).
 *
 * iov_step_f receives a kernel address, iov_ustep_f a __user address.
 */
typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
                             void *priv, void *priv2);
typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
                              void *priv, void *priv2);

/*
 * Handle ITER_UBUF.
 *
 * The whole request is a single user buffer, so @step is invoked exactly once
 * on the address at the current offset.  The iterator is then advanced by the
 * number of bytes the step reported as processed, which is also returned.
 */
static __always_inline
size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_ustep_f step)
{
        void __user *addr = iter->ubuf + iter->iov_offset;
        size_t left = step(addr, 0, len, priv, priv2);
        size_t done = len - left;

        iter->iov_offset += done;
        iter->count -= done;
        return done;
}

/*
 * Handle ITER_IOVEC.
 *
 * Walk the user iovec array starting at the iterator's current segment and
 * offset, calling @step on each non-empty piece until @len bytes have been
 * handled or a step leaves part of its segment unprocessed.  The iterator's
 * segment pointer, segment count, offset and byte count are updated to match.
 *
 * Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                     iov_ustep_f step)
{
        const struct iovec *p = iter->__iov;
        size_t progress = 0, skip = iter->iov_offset;

        do {
                size_t remain, consumed;
                /* Bytes to take from this segment; 0 for an empty segment. */
                size_t part = min(len, p->iov_len - skip);

                if (likely(part)) {
                        remain = step(p->iov_base + skip, progress, part, priv, priv2);
                        consumed = part - remain;
                        progress += consumed;
                        skip += consumed;
                        len -= consumed;
                        /*
                         * Segment not exhausted: either @len ran out or the
                         * step stopped short.  Stay on this segment.
                         */
                        if (skip < p->iov_len)
                                break;
                }
                p++;
                skip = 0;
        } while (len);

        iter->nr_segs -= p - iter->__iov;
        iter->__iov = p;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_KVEC.
 *
 * Same walk as iterate_iovec(), but over an array of kernel-address kvec
 * segments and with a kernel-address step function.
 *
 * Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_step_f step)
{
        const struct kvec *p = iter->kvec;
        size_t progress = 0, skip = iter->iov_offset;

        do {
                size_t remain, consumed;
                /* Bytes to take from this segment; 0 for an empty segment. */
                size_t part = min(len, p->iov_len - skip);

                if (likely(part)) {
                        remain = step(p->iov_base + skip, progress, part, priv, priv2);
                        consumed = part - remain;
                        progress += consumed;
                        skip += consumed;
                        len -= consumed;
                        /*
                         * Segment not exhausted: either @len ran out or the
                         * step stopped short.  Stay on this segment.
                         */
                        if (skip < p->iov_len)
                                break;
                }
                p++;
                skip = 0;
        } while (len);

        iter->nr_segs -= p - iter->kvec;
        iter->kvec = p;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_BVEC.
 *
 * Walk the bio_vec array one page-sized piece at a time: each piece is
 * mapped with kmap_local_page(), passed to @step and unmapped again.  The
 * walk ends when @len bytes have been handled or a step leaves part of its
 * piece unprocessed.
 *
 * Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_step_f step)
{
        const struct bio_vec *p = iter->bvec;
        size_t progress = 0, skip = iter->iov_offset;

        do {
                size_t remain, consumed;
                size_t offset = p->bv_offset + skip, part;
                /* Map the page of this bvec that contains @offset. */
                void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE);

                /* Limit to: request, rest of the bvec, rest of the page. */
                part = min3(len,
                           (size_t)(p->bv_len - skip),
                           (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
                remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
                kunmap_local(kaddr);
                consumed = part - remain;
                len -= consumed;
                progress += consumed;
                skip += consumed;
                if (skip >= p->bv_len) {
                        /* This bvec is used up; continue with the next. */
                        skip = 0;
                        p++;
                }
                if (remain)
                        break;
        } while (len);

        iter->nr_segs -= p - iter->bvec;
        iter->bvec = p;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_FOLIOQ.
 *
 * Walk the folios held in a chain of folio_queue segments.  Each folio is
 * presented to @step one page-sized piece at a time, mapped with
 * kmap_local_folio().  The walk ends when @len bytes have been handled, an
 * empty slot is reached, or a step leaves part of its piece unprocessed.
 *
 * Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        const struct folio_queue *folioq = iter->folioq;
        unsigned int slot = iter->folioq_slot;
        size_t progress = 0, skip = iter->iov_offset;

        if (slot == folioq_nr_slots(folioq)) {
                /* The iterator may have been extended. */
                folioq = folioq->next;
                slot = 0;
        }

        do {
                struct folio *folio = folioq_folio(folioq, slot);
                size_t part, remain = 0, consumed;
                size_t fsize;
                void *base;

                if (!folio)
                        break;

                fsize = folioq_folio_size(folioq, slot);
                if (skip < fsize) {
                        base = kmap_local_folio(folio, skip);
                        /* Never step across a page boundary in one go. */
                        part = umin(len, PAGE_SIZE - skip % PAGE_SIZE);
                        remain = step(base, progress, part, priv, priv2);
                        kunmap_local(base);
                        consumed = part - remain;
                        len -= consumed;
                        progress += consumed;
                        skip += consumed;
                }
                if (skip >= fsize) {
                        /*
                         * Folio used up: move to the next slot, and on to the
                         * next queue segment if this one is finished and a
                         * successor exists.
                         */
                        skip = 0;
                        slot++;
                        if (slot == folioq_nr_slots(folioq) && folioq->next) {
                                folioq = folioq->next;
                                slot = 0;
                        }
                }
                if (remain)
                        break;
        } while (len);

        iter->folioq_slot = slot;
        iter->folioq = folioq;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_XARRAY.
 *
 * Walk the folios stored in the iterator's xarray under rcu_read_lock(),
 * starting at the page index of xarray_start + iov_offset.  Each folio is
 * presented to @step one page-sized piece at a time, mapped with
 * kmap_local_folio().  The walk ends when @len bytes have been handled, a
 * step leaves part of its piece unprocessed, the xarray runs out of entries,
 * or an unexpected entry (value entry or hugetlb folio) triggers a warning.
 *
 * Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        struct folio *folio;
        size_t progress = 0;
        loff_t start = iter->xarray_start + iter->iov_offset;
        pgoff_t index = start / PAGE_SIZE;
        XA_STATE(xas, iter->xarray, index);

        rcu_read_lock();
        xas_for_each(&xas, folio, ULONG_MAX) {
                size_t remain, consumed, offset, part, flen;

                if (xas_retry(&xas, folio))
                        continue;
                if (WARN_ON(xa_is_value(folio)))
                        break;
                if (WARN_ON(folio_test_hugetlb(folio)))
                        break;

                /* Position within this folio and bytes to take from it. */
                offset = offset_in_folio(folio, start + progress);
                flen = min(folio_size(folio) - offset, len);

                while (flen) {
                        void *base = kmap_local_folio(folio, offset);

                        /* Never step across a page boundary in one go. */
                        part = min_t(size_t, flen,
                                     PAGE_SIZE - offset_in_page(offset));
                        remain = step(base, progress, part, priv, priv2);
                        kunmap_local(base);

                        consumed = part - remain;
                        progress += consumed;
                        len -= consumed;

                        if (remain || len == 0)
                                goto out;
                        flen -= consumed;
                        offset += consumed;
                }
        }

out:
        rcu_read_unlock();
        iter->iov_offset += progress;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_DISCARD.
 *
 * Nothing is presented to @step: the iterator's count is simply reduced by
 * @len and @len is reported as processed.
 */
static __always_inline
size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        iter->count -= len;
        return len;
}

/**
 * iterate_and_advance2 - Iterate over an iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step functions.
 * @priv2: More data for the step functions.
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * Iterate over the next part of an iterator, up to the specified length.  The
 * buffer is presented in segments, which for kernel iteration are broken up by
 * physical pages and mapped, with the mapped address being presented.
 *
 * Two step functions, @step and @ustep, must be provided, one for handling
 * mapped kernel addresses and the other is given user addresses which have the
 * potential to fault since no pinning is performed.
 *
 * The step functions are passed the address and length of the segment, @priv,
 * @priv2 and the amount of data so far iterated over (which can, for example,
 * be added to @priv to point to the right part of a second buffer).  The step
 * functions should return the amount of the segment they didn't process (ie. 0
 * indicates complete processing).
 *
 * @len is clamped to the amount of data left in @iter before iterating.
 *
 * This function returns the amount of data processed (ie. 0 means nothing was
 * processed and the value of @len means processed to completion).
 */
static __always_inline
size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
                            void *priv2, iov_ustep_f ustep, iov_step_f step)
{
        if (unlikely(iter->count < len))
                len = iter->count;
        if (unlikely(!len))
                return 0;

        /* Dispatch on the iterator type; anything unmatched is ITER_DISCARD. */
        if (likely(iter_is_ubuf(iter)))
                return iterate_ubuf(iter, len, priv, priv2, ustep);
        if (likely(iter_is_iovec(iter)))
                return iterate_iovec(iter, len, priv, priv2, ustep);
        if (iov_iter_is_bvec(iter))
                return iterate_bvec(iter, len, priv, priv2, step);
        if (iov_iter_is_kvec(iter))
                return iterate_kvec(iter, len, priv, priv2, step);
        if (iov_iter_is_folioq(iter))
                return iterate_folioq(iter, len, priv, priv2, step);
        if (iov_iter_is_xarray(iter))
                return iterate_xarray(iter, len, priv, priv2, step);
        return iterate_discard(iter, len, priv, priv2, step);
}

/**
 * iterate_and_advance - Iterate over an iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step functions.
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * Convenience form of iterate_and_advance2() for callers that need only one
 * private pointer: @priv2 is always passed as NULL.
 *
 * Return: whatever iterate_and_advance2() returns for the same arguments.
 */
static __always_inline
size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
                           iov_ustep_f ustep, iov_step_f step)
{
        size_t processed;

        processed = iterate_and_advance2(iter, len, priv, NULL, ustep, step);
        return processed;
}

/**
 * iterate_and_advance_kernel - Iterate over a kernel-internal iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step function.
 * @priv2: More data for the step function.
 * @step: Step function; given kernel addresses.
 *
 * Walk the next part of @iter, up to @len (which is first clamped to the
 * amount the iterator still holds).  The buffer is presented in segments,
 * which for kernel iteration are broken up by physical pages and mapped, with
 * the mapped address being presented.
 *
 * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type
 * iterators; it will not handle UBUF or IOVEC-type iterators.  There is no
 * explicit check for those types: any iterator that is not BVEC, KVEC,
 * FOLIOQ or XARRAY takes the discard path.
 *
 * A single step function, @step, must be provided; it handles mapped kernel
 * addresses.  Unlike iterate_and_advance2() there is no user-address step
 * function.
 *
 * The step function is passed the address and length of the segment, @priv,
 * @priv2 and the amount of data iterated over so far (which can, for example,
 * be added to @priv to point to the right part of a second buffer).  It
 * should return the amount of the segment it didn't process (ie. 0 indicates
 * complete processing).
 *
 * Return: the amount of data processed (ie. 0 means nothing was processed
 * and the value of @len means processed to completion).
 */
static __always_inline
size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv,
                                  void *priv2, iov_step_f step)
{
        size_t avail = iter->count;

        /* Never iterate past what the iterator still holds. */
        if (unlikely(len > avail))
                len = avail;
        if (unlikely(len == 0))
                return 0;

        if (iov_iter_is_bvec(iter))
                return iterate_bvec(iter, len, priv, priv2, step);
        if (iov_iter_is_kvec(iter))
                return iterate_kvec(iter, len, priv, priv2, step);
        if (iov_iter_is_folioq(iter))
                return iterate_folioq(iter, len, priv, priv2, step);
        if (iov_iter_is_xarray(iter))
                return iterate_xarray(iter, len, priv, priv2, step);

        /* Everything else is handled as a discard iterator. */
        return iterate_discard(iter, len, priv, priv2, step);
}

#endif /* _LINUX_IOV_ITER_H */




























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wireless configuration interface internals.
 *
 * Copyright 2006-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright (C) 2018-2025 Intel Corporation
 */
#ifndef __NET_WIRELESS_CORE_H
#define __NET_WIRELESS_CORE_H
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rbtree.h>
#include <linux/debugfs.h>
#include <linux/rfkill.h>
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <net/genetlink.h>
#include <net/cfg80211.h>
#include "reg.h"


/*
 * Value used to mark a wiphy index as invalid (reading based on the name;
 * the users of this constant are outside this header - confirm there).
 */
#define WIPHY_IDX_INVALID        -1

/*
 * Internal wrapper that places cfg80211 bookkeeping in front of a public
 * struct cfg80211_scan_request.
 */
struct cfg80211_scan_request_int {
        struct cfg80211_scan_info info;
        /* NOTE(review): presumably records that completion was already reported - confirm in the scan code */
        bool notified;
        /* must be last - variable members */
        struct cfg80211_scan_request req;
};

/*
 * cfg80211-internal state kept for one registered device.
 *
 * The public struct wiphy is embedded as the final member, which lets
 * wiphy_to_rdev() map a wiphy pointer back to this structure using
 * container_of().
 */
struct cfg80211_registered_device {
        const struct cfg80211_ops *ops;
        struct list_head list;

        /* rfkill support */
        struct rfkill_ops rfkill_ops;
        struct work_struct rfkill_block;

        /* ISO / IEC 3166 alpha2 for which this device is receiving
         * country IEs on, this can help disregard country IEs from APs
         * on the same alpha2 quickly. The alpha2 may differ from
         * cfg80211_regdomain's alpha2 when an intersection has occurred.
         * If the AP is reconfigured this can also be used to tell us if
         * the country on the country IE changed. */
        char country_ie_alpha2[2];

        /*
         * the driver requests the regulatory core to set this regulatory
         * domain as the wiphy's. Only used for %REGULATORY_WIPHY_SELF_MANAGED
         * devices using the regulatory_set_wiphy_regd() API
         */
        const struct ieee80211_regdomain *requested_regd;

        /* If a Country IE has been received this tells us the environment
         * which it's telling us it's in. This defaults to ENVIRON_ANY */
        enum environment_cap env;

        /* wiphy index, internal only */
        int wiphy_idx;

        /* protected by RTNL */
        int devlist_generation, wdev_id;
        int opencount;
        wait_queue_head_t dev_wait;

        struct list_head beacon_registrations;
        spinlock_t beacon_registrations_lock;

        /* protected by RTNL only */
        int num_running_ifaces;
        int num_running_monitor_ifaces;
        /* last value handed out by cfg80211_assign_cookie() */
        u64 cookie_counter;

        /* BSSes/scanning */
        spinlock_t bss_lock;
        struct list_head bss_list;
        struct rb_root bss_tree;
        u32 bss_generation;
        u32 bss_entries;
        struct cfg80211_scan_request_int *scan_req; /* protected by RTNL */
        struct cfg80211_scan_request_int *int_scan_req;
        struct sk_buff *scan_msg;
        struct list_head sched_scan_req_list;
        time64_t suspend_at;
        struct wiphy_work scan_done_wk;

        struct genl_info *cur_cmd_info;

        struct work_struct conn_work;
        struct work_struct event_work;

        struct delayed_work dfs_update_channels_wk;

        struct wireless_dev *background_radar_wdev;
        struct cfg80211_chan_def background_radar_chandef;
        struct delayed_work background_cac_done_wk;
        struct work_struct background_cac_abort_wk;

        /* netlink port which started critical protocol (0 means not started) */
        u32 crit_proto_nlportid;

        struct cfg80211_coalesce *coalesce;

        struct work_struct destroy_work;
        struct wiphy_work sched_scan_stop_wk;
        struct work_struct sched_scan_res_wk;

        struct cfg80211_chan_def radar_chandef;
        struct work_struct propagate_radar_detect_wk;

        struct cfg80211_chan_def cac_done_chandef;
        struct work_struct propagate_cac_done_wk;

        struct work_struct mgmt_registrations_update_wk;
        /* lock for all wdev lists */
        spinlock_t mgmt_registrations_lock;

        struct work_struct wiphy_work;
        struct list_head wiphy_work_list;
        /* protects the list above */
        spinlock_t wiphy_work_lock;
        bool suspended;

        /* must be last because of the way we do wiphy_priv(),
         * and it should at least be aligned to NETDEV_ALIGN */
        struct wiphy wiphy __aligned(NETDEV_ALIGN);
};

/*
 * Map an embedded struct wiphy back to the cfg80211_registered_device that
 * contains it.  Passing a NULL @wiphy trips BUG_ON().
 */
static inline
struct cfg80211_registered_device *wiphy_to_rdev(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev;

        BUG_ON(!wiphy);
        rdev = container_of(wiphy, struct cfg80211_registered_device, wiphy);

        return rdev;
}

/*
 * Free everything attached to rdev->wiphy.wowlan_config and then the config
 * itself.  Does nothing if no config is set; without CONFIG_PM the function
 * body is empty.  The wowlan_config pointer itself is not reset here.
 */
static inline void
cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev)
{
#ifdef CONFIG_PM
        struct cfg80211_wowlan *wowlan = rdev->wiphy.wowlan_config;
        int idx;

        if (!wowlan)
                return;

        /* Each pattern owns its mask; free those before the array. */
        for (idx = 0; idx < wowlan->n_patterns; idx++)
                kfree(wowlan->patterns[idx].mask);
        kfree(wowlan->patterns);

        /* Release the socket, if any, before freeing the tcp config. */
        if (wowlan->tcp && wowlan->tcp->sock)
                sock_release(wowlan->tcp->sock);
        kfree(wowlan->tcp);

        kfree(wowlan->nd_config);
        kfree(wowlan);
#endif
}

/*
 * Hand out the next cookie value for @rdev by advancing
 * rdev->cookie_counter.  If the counter lands on 0, a warning is emitted and
 * the counter is advanced once more, so 0 is never returned.
 */
static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev)
{
        u64 r;

        rdev->cookie_counter++;
        r = rdev->cookie_counter;

        if (WARN_ON(r == 0)) {
                rdev->cookie_counter++;
                r = rdev->cookie_counter;
        }

        return r;
}

extern struct workqueue_struct *cfg80211_wq;
extern struct list_head cfg80211_rdev_list;
extern int cfg80211_rdev_list_generation;

/*
 * Helper for for_each_rdev(): runs ASSERT_RTNL() and always returns 0, so the
 * macro below always takes its "else" branch (the list walk).
 *
 * This is constructed like this so it can be used in if/else
 */
static inline int for_each_rdev_check_rtnl(void)
{
        ASSERT_RTNL();
        return 0;
}
/*
 * Iterate @rdev over every entry of cfg80211_rdev_list (linked through the
 * "list" member), running the RTNL assertion first.  The expansion is a
 * single if/else statement whose else-branch is the list_for_each_entry()
 * loop, so the loop body supplied by the caller attaches to that loop.
 */
#define for_each_rdev(rdev)                                                \
        if (for_each_rdev_check_rtnl()) {} else                                \
                list_for_each_entry(rdev, &cfg80211_rdev_list, list)

/*
 * Value stored in cfg80211_internal_bss.bss_source.
 *
 * NOTE(review): going by the names, this records how the BSS entry was
 * learned (directly, via a multiple-BSSID element, or via a per-STA
 * profile) - confirm against the scan code, which is not in this header.
 */
enum bss_source_type {
        BSS_SOURCE_DIRECT = 0,
        BSS_SOURCE_MBSSID,
        BSS_SOURCE_STA_PROFILE,
};

/*
 * cfg80211-internal wrapper around the public struct cfg80211_bss.
 *
 * The public part is embedded as the last member; bss_from_pub() converts a
 * public pointer back to this structure with container_of().
 */
struct cfg80211_internal_bss {
        struct list_head list;
        struct list_head hidden_list;
        struct rb_node rbn;
        unsigned long ts;
        unsigned long refcount;
        /* adjusted by cfg80211_hold_bss() / cfg80211_unhold_bss() */
        atomic_t hold;

        /* time at the start of the reception of the first octet of the
         * timestamp field of the last beacon/probe received for this BSS.
         * The time is the TSF of the BSS specified by %parent_bssid.
         */
        u64 parent_tsf;

        /* the BSS according to which %parent_tsf is set. This is set to
         * the BSS that the interface that requested the scan was connected to
         * when the beacon/probe was received.
         */
        u8 parent_bssid[ETH_ALEN] __aligned(2);

        enum bss_source_type bss_source;

        /* must be last because of priv member */
        struct cfg80211_bss pub;
};

/*
 * Convert a pointer to the public struct cfg80211_bss into the
 * cfg80211_internal_bss that embeds it as its "pub" member.
 */
static inline struct cfg80211_internal_bss *bss_from_pub(struct cfg80211_bss *pub)
{
        struct cfg80211_internal_bss *ibss;

        ibss = container_of(pub, struct cfg80211_internal_bss, pub);
        return ibss;
}

/*
 * Take a hold on @bss by incrementing its hold counter.  If the BSS has a
 * transmitted_bss set, the internal entry wrapping that BSS gets a hold too.
 */
static inline void cfg80211_hold_bss(struct cfg80211_internal_bss *bss)
{
        struct cfg80211_internal_bss *tx_bss;

        atomic_inc(&bss->hold);

        if (!bss->pub.transmitted_bss)
                return;

        tx_bss = container_of(bss->pub.transmitted_bss,
                              struct cfg80211_internal_bss, pub);
        atomic_inc(&tx_bss->hold);
}

/*
 * Drop a hold on @bss by decrementing its hold counter, and likewise on the
 * internal entry wrapping its transmitted_bss when one is set.  A counter
 * that goes negative triggers a warning.
 */
static inline void cfg80211_unhold_bss(struct cfg80211_internal_bss *bss)
{
        struct cfg80211_internal_bss *tx_bss;
        int r;

        r = atomic_dec_return(&bss->hold);
        WARN_ON(r < 0);

        if (!bss->pub.transmitted_bss)
                return;

        tx_bss = container_of(bss->pub.transmitted_bss,
                              struct cfg80211_internal_bss, pub);
        r = atomic_dec_return(&tx_bss->hold);
        WARN_ON(r < 0);
}


struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx);
int get_wiphy_idx(struct wiphy *wiphy);

struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);

int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
                          struct net *net);

void cfg80211_init_wdev(struct wireless_dev *wdev);
void cfg80211_register_wdev(struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev);

static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev)
{
        lockdep_assert_held(&rdev->wiphy.mtx);

        return rdev->num_running_ifaces == rdev->num_running_monitor_ifaces &&
               rdev->num_running_ifaces > 0;
}

/*
 * Discriminator stored in cfg80211_event.type.
 *
 * NOTE(review): the code that queues and processes these events is outside
 * this header; the per-value meaning is taken from the names only.
 */
enum cfg80211_event_type {
        EVENT_CONNECT_RESULT,
        EVENT_ROAMED,
        EVENT_DISCONNECTED,
        EVENT_IBSS_JOINED,
        EVENT_STOPPED,
        EVENT_PORT_AUTHORIZED,
};

/*
 * One event record, linkable on a list, carrying a type tag and a union of
 * per-type payloads.
 *
 * NOTE(review): presumably @type selects which union member is valid
 * (cr: connect result, rm: roamed, dc: disconnected, ij: IBSS joined,
 * pa: port authorized) - confirm against the event processing code.
 */
struct cfg80211_event {
        struct list_head list;
        enum cfg80211_event_type type;

        union {
                struct cfg80211_connect_resp_params cr;
                struct cfg80211_roam_info rm;
                struct {
                        const u8 *ie;
                        size_t ie_len;
                        u16 reason;
                        bool locally_generated;
                } dc;
                struct {
                        u8 bssid[ETH_ALEN];
                        struct ieee80211_channel *channel;
                } ij;
                struct {
                        u8 peer_addr[ETH_ALEN];
                        const u8 *td_bitmap;
                        u8 td_bitmap_len;
                } pa;
        };
};

/*
 * Storage for four key_params entries plus four byte buffers of
 * WLAN_KEY_LEN_WEP104 bytes each.
 *
 * NOTE(review): presumably data[i] backs the key material of params[i] and
 * @def is the default key index - confirm against the users of this struct.
 */
struct cfg80211_cached_keys {
        struct key_params params[4];
        u8 data[4][WLAN_KEY_LEN_WEP104];
        int def;
};

/*
 * List node holding one netlink port id.
 *
 * NOTE(review): presumably linked on
 * cfg80211_registered_device.beacon_registrations - confirm in nl80211.
 */
struct cfg80211_beacon_registration {
        struct list_head list;
        u32 nlportid;
};

/*
 * Connection quality monitor (CQM) RSSI configuration.
 *
 * Ends in a flexible array of thresholds whose element count is
 * n_rssi_thresholds (declared via __counted_by).
 */
struct cfg80211_cqm_config {
        /* NOTE(review): presumably used to free this object after an RCU grace period - confirm */
        struct rcu_head rcu_head;
        u32 rssi_hyst;
        s32 last_rssi_event_value;
        enum nl80211_cqm_rssi_threshold_event last_rssi_event_type;
        bool use_range_api;
        int n_rssi_thresholds;
        s32 rssi_thresholds[] __counted_by(n_rssi_thresholds);
};

void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy,
                                   struct wiphy_work *work);

void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev);

/* free object */
void cfg80211_dev_free(struct cfg80211_registered_device *rdev);

int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
                        char *newname);

void ieee80211_set_bitrate_flags(struct wiphy *wiphy);

void cfg80211_bss_expire(struct cfg80211_registered_device *rdev);
void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
                      unsigned long age_secs);
void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
                                     unsigned int link,
                                     struct ieee80211_channel *channel);

/* IBSS */
int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
                         struct net_device *dev,
                         struct cfg80211_ibss_params *params,
                         struct cfg80211_cached_keys *connkeys);
void cfg80211_clear_ibss(struct net_device *dev, bool nowext);
int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, bool nowext);
void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
                            struct ieee80211_channel *channel);
int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev);

/* mesh */
extern const struct mesh_config default_mesh_config;
extern const struct mesh_setup default_mesh_setup;
int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
                         struct net_device *dev,
                         struct mesh_setup *setup,
                         const struct mesh_config *conf);
int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
                        struct net_device *dev);
int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev,
                              struct cfg80211_chan_def *chandef);

/* OCB */
int cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct ocb_setup *setup);
int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
                       struct net_device *dev);

/* AP */
int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, int link,
                     bool notify);

/* MLME */
int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
                       struct net_device *dev,
                       struct cfg80211_auth_request *req);
int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
                        struct net_device *dev,
                        struct cfg80211_assoc_request *req,
                        struct netlink_ext_ack *extack);
int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, const u8 *bssid,
                         const u8 *ie, int ie_len, u16 reason,
                         bool local_state_change);
int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
                           struct net_device *dev, const u8 *ap_addr,
                           const u8 *ie, int ie_len, u16 reason,
                           bool local_state_change);
void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
                        struct net_device *dev);
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
                                u16 frame_type, const u8 *match_data,
                                int match_len, bool multicast_rx,
                                struct netlink_ext_ack *extack);
void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk);
void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
                          struct wireless_dev *wdev,
                          struct cfg80211_mgmt_tx_params *params,
                          u64 *cookie);
void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa,
                               const struct ieee80211_ht_cap *ht_capa_mask);
void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
                                const struct ieee80211_vht_cap *vht_capa_mask);

/* SME events */
int cfg80211_connect(struct cfg80211_registered_device *rdev,
                     struct net_device *dev,
                     struct cfg80211_connect_params *connect,
                     struct cfg80211_cached_keys *connkeys,
                     const u8 *prev_bssid);
void __cfg80211_connect_result(struct net_device *dev,
                               struct cfg80211_connect_resp_params *params,
                               bool wextev);
void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
                             size_t ie_len, u16 reason, bool from_ap);
int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u16 reason,
                        bool wextev);
void __cfg80211_roamed(struct wireless_dev *wdev,
                       struct cfg80211_roam_info *info);
void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr,
                                const u8 *td_bitmap, u8 td_bitmap_len);
int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev);
void cfg80211_autodisconnect_wk(struct work_struct *work);

/* SME implementation */
void cfg80211_conn_work(struct work_struct *work);
void cfg80211_sme_scan_done(struct net_device *dev);
bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status);
void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len);
void cfg80211_sme_disassoc(struct wireless_dev *wdev);
void cfg80211_sme_deauth(struct wireless_dev *wdev);
void cfg80211_sme_auth_timeout(struct wireless_dev *wdev);
void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev);
void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev);

/* internal helpers */
bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher);
bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev,
                            int key_idx, bool pairwise);
int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
                                   struct key_params *params, int key_idx,
                                   bool pairwise, const u8 *mac_addr);
void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk);
void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
                           bool send_message);
void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_sched_scan_request *req);
int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev,
                                     bool want_multi);
void cfg80211_sched_scan_results_wk(struct work_struct *work);
int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_sched_scan_request *req,
                                 bool driver_initiated);
int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
                               u64 reqid, bool driver_initiated);
void cfg80211_upload_connect_keys(struct wireless_dev *wdev);
int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
                          struct net_device *dev, enum nl80211_iftype ntype,
                          struct vif_params *params);
void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev,
                                  struct wiphy_work *end);
void cfg80211_process_wdev_events(struct wireless_dev *wdev);

bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
                                u32 center_freq_khz, u32 bw_khz);

int cfg80211_scan(struct cfg80211_registered_device *rdev);

extern struct work_struct cfg80211_disconnect_work;

#define NL80211_BSS_USE_FOR_ALL        (NL80211_BSS_USE_FOR_NORMAL | \
                                 NL80211_BSS_USE_FOR_MLD_LINK)

void cfg80211_set_dfs_state(struct wiphy *wiphy,
                            const struct cfg80211_chan_def *chandef,
                            enum nl80211_dfs_state dfs_state);

void cfg80211_dfs_channels_update_work(struct work_struct *work);

void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev);

int
cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev,
                                          struct wireless_dev *wdev,
                                          struct cfg80211_chan_def *chandef);

void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev);

void cfg80211_background_cac_done_wk(struct work_struct *work);

void cfg80211_background_cac_abort_wk(struct work_struct *work);

bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
                                  struct ieee80211_channel *chan);

bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev);

bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef,
                          struct ieee80211_channel *chan,
                          bool primary_only);
bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev,
                               struct ieee80211_channel *chan,
                               bool primary_only);
bool _cfg80211_chandef_usable(struct wiphy *wiphy,
                              const struct cfg80211_chan_def *chandef,
                              u32 prohibited_flags,
                              u32 permitting_flags);

/*
 * Return the time elapsed since @start (a jiffies value), converted to
 * milliseconds with jiffies_to_msecs().
 *
 * Unsigned long subtraction is performed modulo ULONG_MAX + 1, so a single
 * subtraction yields the right distance both when jiffies is at or past
 * @start and when the counter has wrapped around since @start was sampled.
 */
static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
{
        unsigned long delta = jiffies - start;

        return jiffies_to_msecs(delta);
}

int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev,
                                 struct cfg80211_chan_def *chandef);

int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
                           const u8 *rates, unsigned int n_rates,
                           u32 *mask);

int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
                                 enum nl80211_iftype iftype, u32 beacon_int);

void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
                               enum nl80211_iftype iftype, int num);

void cfg80211_leave(struct cfg80211_registered_device *rdev,
                    struct wireless_dev *wdev);

void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev);

void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
                       struct wireless_dev *wdev);

struct cfg80211_internal_bss *
cfg80211_bss_update(struct cfg80211_registered_device *rdev,
                    struct cfg80211_internal_bss *tmp,
                    bool signal_valid, unsigned long ts);

enum ieee80211_ap_reg_power
cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len);

/*
 * CFG80211_DEV_WARN_ON(cond) - developer-only warning.
 *
 * With CONFIG_CFG80211_DEVELOPER_WARNINGS it is exactly WARN_ON(cond).
 * Without it, @cond is still evaluated once and its boolean value is the
 * value of the expression, but no warning is issued.
 */
#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
#define CFG80211_DEV_WARN_ON(cond)        WARN_ON(cond)
#else
/*
 * Trick to enable using it as a condition,
 * and also not give a warning when it's
 * not used that way.
 */
#define CFG80211_DEV_WARN_ON(cond)        ({bool __r = (cond); __r; })
#endif

void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid);
void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev);
void cfg80211_pmsr_free_wk(struct work_struct *work);

void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id);
void cfg80211_remove_links(struct wireless_dev *wdev);
int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev,
                                 struct wireless_dev *wdev);
void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask);

int cfg80211_assoc_ml_reconf(struct cfg80211_registered_device *rdev,
                             struct net_device *dev,
                             struct cfg80211_ml_reconf_req *req);

/**
 * struct cfg80211_colocated_ap - colocated AP information
 *
 * @list: linked list to all colocated APs
 * @bssid: BSSID of the reported AP
 * @ssid: SSID of the reported AP
 * @ssid_len: length of the ssid
 * @center_freq: frequency the reported AP is on
 * @unsolicited_probe: the reported AP is part of an ESS, where all the APs
 *        that operate in the same channel as the reported AP and that might be
 *        detected by a STA receiving this frame, are transmitting unsolicited
 *        Probe Response frames every 20 TUs
 * @oct_recommended: OCT is recommended to exchange MMPDUs with the reported AP
 * @same_ssid: the reported AP has the same SSID as the reporting AP
 * @multi_bss: the reported AP is part of a multiple BSSID set
 * @transmitted_bssid: the reported AP is the transmitting BSSID
 * @colocated_ess: all the APs that share the same ESS as the reported AP are
 *        colocated and can be discovered via legacy bands.
 * @short_ssid_valid: short_ssid is valid and can be used
 * @short_ssid: the short SSID for this SSID
 * @psd_20: The 20MHz PSD EIRP of the primary 20MHz channel for the reported AP
 */
struct cfg80211_colocated_ap {
        struct list_head list;
        u8 bssid[ETH_ALEN];
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        size_t ssid_len;
        u32 short_ssid;
        u32 center_freq;
        /* seven single-bit flags declared as bitfields of type u8 */
        u8 unsolicited_probe:1,
           oct_recommended:1,
           same_ssid:1,
           multi_bss:1,
           transmitted_bssid:1,
           colocated_ess:1,
           short_ssid_valid:1;
        s8 psd_20;
};

/*
 * KUnit test visibility helpers.
 *
 * With CONFIG_CFG80211_KUNIT_TEST enabled:
 *   - EXPORT_SYMBOL_IF_CFG80211_KUNIT(sym) becomes EXPORT_SYMBOL_IF_KUNIT(sym),
 *   - VISIBLE_IF_CFG80211_KUNIT expands to nothing, and
 *   - the functions below are declared here so other files can see them.
 * Otherwise the export macro expands to nothing and
 * VISIBLE_IF_CFG80211_KUNIT expands to "static".
 */
#if IS_ENABLED(CONFIG_CFG80211_KUNIT_TEST)
#define EXPORT_SYMBOL_IF_CFG80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym)
#define VISIBLE_IF_CFG80211_KUNIT
void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list);

int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies,
                                struct list_head *list);

size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen,
                           const u8 *subie, size_t subie_len,
                           u8 *new_ie, size_t new_ie_len);
#else
#define EXPORT_SYMBOL_IF_CFG80211_KUNIT(sym)
#define VISIBLE_IF_CFG80211_KUNIT static
#endif /* IS_ENABLED(CONFIG_CFG80211_KUNIT_TEST) */

#endif /* __NET_WIRELESS_CORE_H */




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




























    1 






























    1 
















    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the security services.
 *
 * Authors : Stephen Smalley, <stephen.smalley.work@gmail.com>
 *             James Morris <jmorris@redhat.com>
 *
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *
 *        Support for enhanced MLS infrastructure.
 *        Support for context based audit filters.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
 *
 *        Added conditional policy language extensions
 *
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *
 *      Added support for NetLabel
 *      Added support for the policy capability bitmap
 *
 * Updated: Chad Sellers <csellers@tresys.com>
 *
 *  Added validation of kernel classes and permissions
 *
 * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 *  Added support for bounds domain and audit messages on masked permissions
 *
 * Updated: Guido Trentalancia <guido@trentalancia.com>
 *
 *  Added support for runtime switching of the policy type
 *
 * Copyright (C) 2008, 2009 NEC Corporation
 * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
 * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/sched.h>
#include <linux/audit.h>
#include <linux/parser.h>
#include <linux/vmalloc.h>
#include <linux/lsm_hooks.h>
#include <net/netlabel.h>

#include "flask.h"
#include "avc.h"
#include "avc_ss.h"
#include "security.h"
#include "context.h"
#include "policydb.h"
#include "sidtab.h"
#include "services.h"
#include "conditional.h"
#include "mls.h"
#include "objsec.h"
#include "netlabel.h"
#include "xfrm.h"
#include "ebitmap.h"
#include "audit.h"
#include "policycap_names.h"
#include "ima.h"

/*
 * Groups the two pieces of state kept together for a policy conversion:
 * the context conversion arguments and the sidtab conversion parameters.
 * NOTE(review): how the two members reference each other is set up outside
 * this definition - confirm against the code that fills it in.
 */
struct selinux_policy_convert_data {
        struct convert_context_args args;
        struct sidtab_convert_params sidtab_params;
};

/*
 * Forward declarations of file-local (static) helpers so that they can be
 * referenced before the point where they are defined.
 */
static int context_struct_to_string(struct policydb *policydb,
                                    struct context *context,
                                    char **scontext,
                                    u32 *scontext_len);

static int sidtab_entry_to_string(struct policydb *policydb,
                                  struct sidtab *sidtab,
                                  struct sidtab_entry *entry,
                                  char **scontext,
                                  u32 *scontext_len);

static void context_struct_compute_av(struct policydb *policydb,
                                      struct context *scontext,
                                      struct context *tcontext,
                                      u16 tclass,
                                      struct av_decision *avd,
                                      struct extended_perms *xperms);

/*
 * Build the table that translates kernel class/permission numbers into the
 * values used by the loaded policy.
 *
 * @pol:     policy database used to look up class and permission names
 * @map:     array of kernel class descriptions, terminated by a NULL name
 * @out_map: receives the newly allocated table and its entry count
 *
 * Entry N + 1 of the output describes entry N of @map; entry 0 is left
 * zero-filled.  Returns 0 on success, -EINVAL if @map is NULL or if an
 * unknown class/permission is met while the policy has reject_unknown set
 * (the table is freed in that case), and -ENOMEM if allocation fails.
 */
static int selinux_set_mapping(struct policydb *pol,
                               const struct security_class_mapping *map,
                               struct selinux_map *out_map)
{
        u16 nclasses = 0;
        u16 idx;
        bool saw_unknown = false;

        if (!map)
                return -EINVAL;

        /* Count the kernel classes, then add one slot for class zero. */
        while (map[nclasses].name)
                nclasses++;
        nclasses++;

        out_map->mapping = kcalloc(nclasses, sizeof(*out_map->mapping),
                                   GFP_ATOMIC);
        if (!out_map->mapping)
                return -ENOMEM;

        /* Resolve every class and each of its permissions against the policy. */
        for (idx = 0; map[idx].name; idx++) {
                const struct security_class_mapping *src = &map[idx];
                struct selinux_mapping *dst = &out_map->mapping[idx + 1];
                u16 bit;

                /* A class with an empty name is a placeholder. */
                if (strcmp(src->name, "") == 0) {
                        dst->num_perms = 0;
                        continue;
                }

                dst->value = string_to_security_class(pol, src->name);
                if (dst->value == 0) {
                        pr_info("SELinux:  Class %s not defined in policy.\n",
                               src->name);
                        if (pol->reject_unknown)
                                goto fail;
                        dst->num_perms = 0;
                        saw_unknown = true;
                        continue;
                }

                for (bit = 0; src->perms[bit]; bit++) {
                        const char *perm_name = src->perms[bit];

                        /* An empty permission name leaves this bit unmapped. */
                        if (perm_name[0] == '\0')
                                continue;

                        dst->perms[bit] = string_to_av_perm(pol, dst->value,
                                                            perm_name);
                        if (dst->perms[bit])
                                continue;

                        pr_info("SELinux:  Permission %s in class %s not defined in policy.\n",
                               perm_name, src->name);
                        if (pol->reject_unknown)
                                goto fail;
                        saw_unknown = true;
                }
                dst->num_perms = bit;
        }

        if (saw_unknown)
                pr_info("SELinux: the above unknown classes and permissions will be %s\n",
                       pol->allow_unknown ? "allowed" : "denied");

        out_map->size = nclasses;
        return 0;

fail:
        kfree(out_map->mapping);
        out_map->mapping = NULL;
        return -EINVAL;
}

/*
 * Translate a kernel class number into the value the policy uses for it.
 * Numbers that lie outside the mapping table are returned unchanged.
 */
static u16 unmap_class(struct selinux_map *map, u16 tclass)
{
        if (tclass >= map->size)
                return tclass;

        return map->mapping[tclass].value;
}

/*
 * Translate a policy class value into the kernel class number.
 * Slot zero is never searched; SECCLASS_NULL is returned when no table
 * entry carries @pol_value.
 */
static u16 map_class(struct selinux_map *map, u16 pol_value)
{
        u16 kclass = 1;

        while (kclass < map->size) {
                if (map->mapping[kclass].value == pol_value)
                        return kclass;
                kclass++;
        }

        return SECCLASS_NULL;
}

/*
 * Rewrite an access vector decision from policy permission bits into kernel
 * permission bits for class @tclass.
 *
 * For each kernel permission bit of the class, the matching policy bit is
 * tested in avd->allowed, avd->auditallow and avd->auditdeny.  A permission
 * with no policy value (perms[] entry of zero) is reported as allowed when
 * @allow_unknown is set, and is added to auditdeny otherwise.  Nothing is
 * changed when @tclass lies outside the mapping table.
 */
static void map_decision(struct selinux_map *map,
                         u16 tclass, struct av_decision *avd,
                         int allow_unknown)
{
        struct selinux_mapping *entry;
        unsigned int bit, nperms;
        u32 allowed = 0;
        u32 auditallow = 0;
        u32 auditdeny = 0;

        if (tclass >= map->size)
                return;

        entry = &map->mapping[tclass];
        nperms = entry->num_perms;

        for (bit = 0; bit < nperms; bit++) {
                u32 pol_perm = entry->perms[bit];
                u32 kern_bit = (u32)1 << bit;

                /* Permission unknown to the policy. */
                if (!pol_perm) {
                        if (allow_unknown)
                                allowed |= kern_bit;
                        else
                                auditdeny |= kern_bit;
                        continue;
                }

                if (avd->allowed & pol_perm)
                        allowed |= kern_bit;
                if (avd->auditallow & pol_perm)
                        auditallow |= kern_bit;
                if (avd->auditdeny & pol_perm)
                        auditdeny |= kern_bit;
        }

        /*
         * Bits from num_perms up to the width of the vector are not valid
         * permissions for this class; should the kernel ever request one
         * by mistake, make sure the resulting denial is audited.
         */
        for (; bit < (sizeof(u32) * 8); bit++)
                auditdeny |= (u32)1 << bit;

        avd->allowed = allowed;
        avd->auditallow = auditallow;
        avd->auditdeny = auditdeny;
}

/*
 * Report the mls_enabled setting of the currently loaded policy.
 * Returns 0 when SELinux has not been initialized yet; otherwise the value
 * is read from the policy under the RCU read lock.
 */
int security_mls_enabled(void)
{
        struct selinux_policy *cur;
        int enabled = 0;

        if (selinux_initialized()) {
                rcu_read_lock();
                cur = rcu_dereference(selinux_state.policy);
                enabled = cur->policydb.mls_enabled;
                rcu_read_unlock();
        }

        return enabled;
}

/*
 * Return the boolean value of a constraint expression
 * when it is applied to the specified source and target
 * security contexts.
 *
 * The expression is a linked list of nodes (chained via ->next) that is
 * evaluated with a small value stack: CEXPR_ATTR and CEXPR_NAMES nodes push
 * one result, CEXPR_NOT rewrites the top of the stack, and CEXPR_AND /
 * CEXPR_OR combine the two topmost entries into one.
 *
 * Returns the single value left on the stack once the list is exhausted,
 * or 0 if a push would exceed CEXPR_MAXDEPTH entries.  Malformed
 * expressions (stack underflow, unknown node/attribute/operator, leftover
 * stack entries) hit BUG()/BUG_ON().
 *
 * xcontext is a special beast...  It is used by the validatetrans rules
 * only.  For these rules, scontext is the context before the transition,
 * tcontext is the context after the transition, and xcontext is the context
 * of the process performing the transition.  All other callers of
 * constraint_expr_eval should pass in NULL for xcontext.
 */
static int constraint_expr_eval(struct policydb *policydb,
                                struct context *scontext,
                                struct context *tcontext,
                                struct context *xcontext,
                                struct constraint_expr *cexpr)
{
        u32 val1, val2;
        struct context *c;
        struct role_datum *r1, *r2;
        struct mls_level *l1, *l2;
        struct constraint_expr *e;
        /* Evaluation stack; sp indexes the top entry, -1 means empty. */
        int s[CEXPR_MAXDEPTH];
        int sp = -1;

        for (e = cexpr; e; e = e->next) {
                switch (e->expr_type) {
                case CEXPR_NOT:
                        /* Unary: negate the top of the stack in place. */
                        BUG_ON(sp < 0);
                        s[sp] = !s[sp];
                        break;
                case CEXPR_AND:
                        /* Binary: pop one entry, fold it into the new top. */
                        BUG_ON(sp < 1);
                        sp--;
                        s[sp] &= s[sp + 1];
                        break;
                case CEXPR_OR:
                        /* Binary: pop one entry, fold it into the new top. */
                        BUG_ON(sp < 1);
                        sp--;
                        s[sp] |= s[sp + 1];
                        break;
                case CEXPR_ATTR:
                        /* Stack is full: give up and evaluate to false. */
                        if (sp == (CEXPR_MAXDEPTH - 1))
                                return 0;
                        /*
                         * Pick the two operands.  User/type (and role with
                         * a plain op) leave val1/val2 for the EQ/NEQ switch
                         * below; role dominance and MLS level comparisons
                         * push their result directly and 'continue' to the
                         * next expression node.
                         */
                        switch (e->attr) {
                        case CEXPR_USER:
                                val1 = scontext->user;
                                val2 = tcontext->user;
                                break;
                        case CEXPR_TYPE:
                                val1 = scontext->type;
                                val2 = tcontext->type;
                                break;
                        case CEXPR_ROLE:
                                val1 = scontext->role;
                                val2 = tcontext->role;
                                /* Role values are 1-based indices. */
                                r1 = policydb->role_val_to_struct[val1 - 1];
                                r2 = policydb->role_val_to_struct[val2 - 1];
                                switch (e->op) {
                                case CEXPR_DOM:
                                        s[++sp] = ebitmap_get_bit(&r1->dominates,
                                                                  val2 - 1);
                                        continue;
                                case CEXPR_DOMBY:
                                        s[++sp] = ebitmap_get_bit(&r2->dominates,
                                                                  val1 - 1);
                                        continue;
                                case CEXPR_INCOMP:
                                        /* Neither role dominates the other. */
                                        s[++sp] = (!ebitmap_get_bit(&r1->dominates,
                                                                    val2 - 1) &&
                                                   !ebitmap_get_bit(&r2->dominates,
                                                                    val1 - 1));
                                        continue;
                                default:
                                        /* Other ops go to the EQ/NEQ switch. */
                                        break;
                                }
                                break;
                        /*
                         * MLS level pairs: L/H select range.level[0] or
                         * range.level[1], 1/2 select scontext or tcontext.
                         */
                        case CEXPR_L1L2:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(tcontext->range.level[0]);
                                goto mls_ops;
                        case CEXPR_L1H2:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_H1L2:
                                l1 = &(scontext->range.level[1]);
                                l2 = &(tcontext->range.level[0]);
                                goto mls_ops;
                        case CEXPR_H1H2:
                                l1 = &(scontext->range.level[1]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_L1H1:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(scontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_L2H2:
                                l1 = &(tcontext->range.level[0]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
mls_ops:
                                /* Shared comparison of the selected levels. */
                                switch (e->op) {
                                case CEXPR_EQ:
                                        s[++sp] = mls_level_eq(l1, l2);
                                        continue;
                                case CEXPR_NEQ:
                                        s[++sp] = !mls_level_eq(l1, l2);
                                        continue;
                                case CEXPR_DOM:
                                        s[++sp] = mls_level_dom(l1, l2);
                                        continue;
                                case CEXPR_DOMBY:
                                        s[++sp] = mls_level_dom(l2, l1);
                                        continue;
                                case CEXPR_INCOMP:
                                        s[++sp] = mls_level_incomp(l2, l1);
                                        continue;
                                default:
                                        BUG();
                                        return 0;
                                }
                                break;
                        default:
                                BUG();
                                return 0;
                        }

                        /* Plain value comparison of val1 against val2. */
                        switch (e->op) {
                        case CEXPR_EQ:
                                s[++sp] = (val1 == val2);
                                break;
                        case CEXPR_NEQ:
                                s[++sp] = (val1 != val2);
                                break;
                        default:
                                BUG();
                                return 0;
                        }
                        break;
                case CEXPR_NAMES:
                        /* Stack is full: give up and evaluate to false. */
                        if (sp == (CEXPR_MAXDEPTH-1))
                                return 0;
                        /*
                         * Select which context is tested: the source by
                         * default, the target or xcontext when flagged.
                         */
                        c = scontext;
                        if (e->attr & CEXPR_TARGET)
                                c = tcontext;
                        else if (e->attr & CEXPR_XTARGET) {
                                c = xcontext;
                                /* XTARGET needs a caller-supplied xcontext. */
                                if (!c) {
                                        BUG();
                                        return 0;
                                }
                        }
                        if (e->attr & CEXPR_USER)
                                val1 = c->user;
                        else if (e->attr & CEXPR_ROLE)
                                val1 = c->role;
                        else if (e->attr & CEXPR_TYPE)
                                val1 = c->type;
                        else {
                                BUG();
                                return 0;
                        }

                        /* Membership test of val1 in the node's name set. */
                        switch (e->op) {
                        case CEXPR_EQ:
                                s[++sp] = ebitmap_get_bit(&e->names, val1 - 1);
                                break;
                        case CEXPR_NEQ:
                                s[++sp] = !ebitmap_get_bit(&e->names, val1 - 1);
                                break;
                        default:
                                BUG();
                                return 0;
                        }
                        break;
                default:
                        BUG();
                        return 0;
                }
        }

        /* Exactly one value must remain on the stack. */
        BUG_ON(sp != 0);
        return s[0];
}

/*
 * dump_masked_av_helper - hashtab_map() callback used by
 * security_dump_masked_av() to collect permission names.
 *
 * @k:    hash table key, stored as the permission's name string
 * @d:    the struct perm_datum for that permission
 * @args: array of name pointers, indexed by permission value - 1
 *
 * Always returns 0.  A permission value outside 1..32 is a BUG.
 */
static int dump_masked_av_helper(void *k, void *d, void *args)
{
        char **names = args;
        struct perm_datum *perm = d;

        BUG_ON(perm->value < 1 || perm->value > 32);

        /* Permission values are 1-based; the name table is 0-based. */
        names[perm->value - 1] = (char *)k;
        return 0;
}

/*
 * security_dump_masked_av - emit an AUDIT_SELINUX_ERR record listing the
 * permissions that were masked out of an access vector.
 *
 * @policydb:    policy database used to resolve class/permission names
 * @scontext:    source context, logged in text form
 * @tcontext:    target context, logged in text form
 * @tclass:      policy class value (1-based) the permissions belong to
 * @permissions: bitmask of masked permissions; nothing is logged if zero
 * @reason:      short tag written to the record's "reason=" field
 *
 * Best effort: if a name lookup, context conversion or audit buffer
 * allocation fails, the function returns without logging.
 */
static void security_dump_masked_av(struct policydb *policydb,
                                    struct context *scontext,
                                    struct context *tcontext,
                                    u16 tclass,
                                    u32 permissions,
                                    const char *reason)
{
        struct common_datum *common_dat;
        struct class_datum *tclass_dat;
        struct audit_buffer *ab;
        char *tclass_name;
        char *scontext_name = NULL;
        char *tcontext_name = NULL;
        /*
         * Must start out all-NULL: only slots for permissions that exist in
         * the class/common tables are filled in below, and the print loop
         * relies on NULL to fall back to "????".
         */
        char *permission_names[32] = { NULL };
        int index;
        u32 length;
        bool need_comma = false;

        if (!permissions)
                return;

        tclass_name = sym_name(policydb, SYM_CLASSES, tclass - 1);
        tclass_dat = policydb->class_val_to_struct[tclass - 1];
        common_dat = tclass_dat->comdatum;

        /* init permission_names: inherited (common) permissions first */
        if (common_dat &&
            hashtab_map(&common_dat->permissions.table,
                        dump_masked_av_helper, permission_names) < 0)
                goto out;

        /* ... then the permissions defined by the class itself */
        if (hashtab_map(&tclass_dat->permissions.table,
                        dump_masked_av_helper, permission_names) < 0)
                goto out;

        /* get scontext/tcontext in text form */
        if (context_struct_to_string(policydb, scontext,
                                     &scontext_name, &length) < 0)
                goto out;

        if (context_struct_to_string(policydb, tcontext,
                                     &tcontext_name, &length) < 0)
                goto out;

        /* audit a message */
        ab = audit_log_start(audit_context(),
                             GFP_ATOMIC, AUDIT_SELINUX_ERR);
        if (!ab)
                goto out;

        audit_log_format(ab, "op=security_compute_av reason=%s "
                         "scontext=%s tcontext=%s tclass=%s perms=",
                         reason, scontext_name, tcontext_name, tclass_name);

        for (index = 0; index < 32; index++) {
                /* unsigned shift: 1 << 31 on a signed int is undefined */
                u32 mask = (u32)1 << index;

                if ((mask & permissions) == 0)
                        continue;

                audit_log_format(ab, "%s%s",
                                 need_comma ? "," : "",
                                 permission_names[index]
                                 ? permission_names[index] : "????");
                need_comma = true;
        }
        audit_log_end(ab);
out:
        /* release scontext/tcontext */
        kfree(tcontext_name);
        kfree(scontext_name);
}

/*
 * type_attribute_bounds_av - drop permissions that violate a type bounds
 * constraint.
 *
 * If the source type has a bounds type, the access vector is recomputed
 * with the source type replaced by its bounds (and the target type
 * replaced by its bounds too, when it has one).  Every permission in
 * @avd->allowed that the recomputed vector does not allow is cleared and
 * reported through security_dump_masked_av() with reason "bounds".
 */
static void type_attribute_bounds_av(struct policydb *policydb,
                                     struct context *scontext,
                                     struct context *tcontext,
                                     u16 tclass,
                                     struct av_decision *avd)
{
        struct type_datum *src_type;
        struct type_datum *tgt_type;
        struct context bounds_src;
        struct context bounds_tgt;
        struct context *eval_tgt;
        struct av_decision bounds_avd;
        u32 violated;

        src_type = policydb->type_val_to_struct[scontext->type - 1];
        BUG_ON(!src_type);

        /* An unbounded source type has nothing to be checked against. */
        if (!src_type->bounds)
                return;

        tgt_type = policydb->type_val_to_struct[tcontext->type - 1];
        BUG_ON(!tgt_type);

        memset(&bounds_avd, 0, sizeof(bounds_avd));

        /* Same source context, but with the bounds type substituted. */
        memcpy(&bounds_src, scontext, sizeof(bounds_src));
        bounds_src.type = src_type->bounds;

        /* The target is only substituted when it is bounded as well. */
        eval_tgt = tcontext;
        if (tgt_type->bounds) {
                memcpy(&bounds_tgt, tcontext, sizeof(bounds_tgt));
                bounds_tgt.type = tgt_type->bounds;
                eval_tgt = &bounds_tgt;
        }

        context_struct_compute_av(policydb, &bounds_src, eval_tgt,
                                  tclass, &bounds_avd, NULL);

        /* Permissions granted here but not to the bounds. */
        violated = avd->allowed & ~bounds_avd.allowed;
        if (likely(!violated))
                return;

        /* Strip them, then tell userspace about it via audit. */
        avd->allowed &= ~violated;
        security_dump_masked_av(policydb, scontext, tcontext,
                                tclass, violated, "bounds");
}

/*
 * services_compute_xperms_drivers - fold one extended-permission avtab node
 * into the per-driver summary in @xperms.
 *
 * Records which base permission (ioctl or nlmsg) the node covers and which
 * driver bits it touches, then marks @xperms as populated (len = 1).
 * Nodes with an unrecognised xperms kind only set len.
 */
void services_compute_xperms_drivers(
                struct extended_perms *xperms,
                struct avtab_node *node)
{
        unsigned int word;

        switch (node->datum.u.xperms->specified) {
        case AVTAB_XPERMS_IOCTLFUNCTION:
                /* rule grants functions inside a single driver */
                xperms->base_perms |= AVC_EXT_IOCTL;
                security_xperm_set(xperms->drivers.p,
                                   node->datum.u.xperms->driver);
                break;
        case AVTAB_XPERMS_NLMSG:
                /* rule grants message types inside a single driver */
                xperms->base_perms |= AVC_EXT_NLMSG;
                security_xperm_set(xperms->drivers.p,
                                   node->datum.u.xperms->driver);
                break;
        case AVTAB_XPERMS_IOCTLDRIVER:
                /* rule grants whole drivers: merge its driver bitmap */
                xperms->base_perms |= AVC_EXT_IOCTL;
                word = 0;
                while (word < ARRAY_SIZE(xperms->drivers.p)) {
                        xperms->drivers.p[word] |=
                                node->datum.u.xperms->perms.p[word];
                        word++;
                }
                break;
        }

        xperms->len = 1;
}

/*
 * context_struct_compute_av - compute the access vector decision (and,
 * optionally, the extended-permission driver summary) for a source/target
 * context pair in class @tclass.
 *
 * @avd is always reset first (nothing allowed, nothing auditallow'ed,
 * every denial audited).  @xperms may be NULL; when given it is zeroed and
 * then filled from matching extended-permission rules.  An out-of-range
 * @tclass is logged and leaves the reset decision in place.
 */
static void context_struct_compute_av(struct policydb *policydb,
                                      struct context *scontext,
                                      struct context *tcontext,
                                      u16 tclass,
                                      struct av_decision *avd,
                                      struct extended_perms *xperms)
{
        struct class_datum *class_info;
        struct constraint_node *cons;
        struct role_allow *rule;
        struct avtab_node *entry;
        struct avtab_key key;
        struct ebitmap *src_attrs, *tgt_attrs;
        struct ebitmap_node *src_pos, *tgt_pos;
        unsigned int s, t;

        avd->allowed = 0;
        avd->auditallow = 0;
        avd->auditdeny = 0xffffffff;
        if (xperms)
                memset(xperms, 0, sizeof(*xperms));

        if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) {
                pr_warn_ratelimited("SELinux:  Invalid class %u\n", tclass);
                return;
        }

        class_info = policydb->class_val_to_struct[tclass - 1];

        /*
         * Collect every type enforcement rule that applies to any
         * (source attribute, target attribute) pair of the two types.
         */
        key.target_class = tclass;
        key.specified = AVTAB_AV | AVTAB_XPERMS;
        src_attrs = &policydb->type_attr_map_array[scontext->type - 1];
        tgt_attrs = &policydb->type_attr_map_array[tcontext->type - 1];
        ebitmap_for_each_positive_bit(src_attrs, src_pos, s) {
                ebitmap_for_each_positive_bit(tgt_attrs, tgt_pos, t) {
                        key.source_type = s + 1;
                        key.target_type = t + 1;

                        entry = avtab_search_node(&policydb->te_avtab, &key);
                        while (entry) {
                                switch (entry->key.specified) {
                                case AVTAB_ALLOWED:
                                        avd->allowed |= entry->datum.u.data;
                                        break;
                                case AVTAB_AUDITALLOW:
                                        avd->auditallow |= entry->datum.u.data;
                                        break;
                                case AVTAB_AUDITDENY:
                                        avd->auditdeny &= entry->datum.u.data;
                                        break;
                                default:
                                        if (xperms &&
                                            (entry->key.specified & AVTAB_XPERMS))
                                                services_compute_xperms_drivers(xperms,
                                                                                entry);
                                        break;
                                }
                                entry = avtab_search_node_next(entry,
                                                               key.specified);
                        }

                        /* Conditional rules may contribute as well. */
                        cond_compute_av(&policydb->te_cond_avtab, &key,
                                        avd, xperms);
                }
        }

        /*
         * Constraints (including the MLS policy) can only take
         * permissions away: drop the ones whose expression fails.
         */
        for (cons = class_info->constraints; cons; cons = cons->next) {
                if (!(cons->permissions & avd->allowed))
                        continue;
                if (constraint_expr_eval(policydb, scontext, tcontext, NULL,
                                         cons->expr))
                        continue;
                avd->allowed &= ~(cons->permissions);
        }

        /*
         * A process transition that changes role additionally needs a
         * role allow rule for the (current_role, new_role) pair.
         */
        if (tclass == policydb->process_class &&
            (avd->allowed & policydb->process_trans_perms) &&
            scontext->role != tcontext->role) {
                rule = policydb->role_allow;
                while (rule && !(scontext->role == rule->role &&
                                 tcontext->role == rule->new_role))
                        rule = rule->next;
                if (!rule)
                        avd->allowed &= ~policydb->process_trans_perms;
        }

        /*
         * Finally apply type bounds: permissions exceeding the bounds
         * are masked and reported to userspace via audit.
         */
        type_attribute_bounds_av(policydb, scontext, tcontext,
                                 tclass, avd);
}

/*
 * security_validtrans_handle_fail - audit a failed validatetrans check.
 *
 * Converts the old, new and task SID entries to strings and, if all three
 * conversions succeed, emits an AUDIT_SELINUX_ERR record.  Returns 0 when
 * enforcing mode is off, -EPERM otherwise (regardless of whether the
 * audit record could be written).
 */
static int security_validtrans_handle_fail(struct selinux_policy *policy,
                                        struct sidtab_entry *oentry,
                                        struct sidtab_entry *nentry,
                                        struct sidtab_entry *tentry,
                                        u16 tclass)
{
        struct policydb *p = &policy->policydb;
        struct sidtab *sidtab = policy->sidtab;
        char *old_str = NULL;
        char *new_str = NULL;
        char *task_str = NULL;
        u32 old_len, new_len, task_len;

        /* Stop converting at the first failure; nothing is logged then. */
        if (!sidtab_entry_to_string(p, sidtab, oentry, &old_str, &old_len) &&
            !sidtab_entry_to_string(p, sidtab, nentry, &new_str, &new_len) &&
            !sidtab_entry_to_string(p, sidtab, tentry, &task_str, &task_len))
                audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR,
                          "op=security_validate_transition seresult=denied"
                          " oldcontext=%s newcontext=%s taskcontext=%s tclass=%s",
                          old_str, new_str, task_str,
                          sym_name(p, SYM_CLASSES, tclass-1));

        kfree(old_str);
        kfree(new_str);
        kfree(task_str);

        return enforcing_enabled() ? -EPERM : 0;
}

/*
 * security_compute_validatetrans - evaluate the validatetrans constraints
 * of a class for an old/new/task SID triple.
 *
 * @oldsid:      SID before the transition
 * @newsid:      SID after the transition
 * @tasksid:     SID of the process performing the transition
 * @orig_tclass: object class; a kernel class value that is translated with
 *               unmap_class() when @user is false, used as-is when true
 * @user:        true when the request comes from userspace
 *
 * Returns 0 if SELinux is not initialized or every constraint holds,
 * -EINVAL for an invalid class or an unrecognized SID.  On the first
 * failing constraint returns -EPERM for userspace requests, or the result
 * of security_validtrans_handle_fail() for kernel requests.
 */
static int security_compute_validatetrans(u32 oldsid, u32 newsid, u32 tasksid,
                                          u16 orig_tclass, bool user)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct sidtab_entry *oentry;
        struct sidtab_entry *nentry;
        struct sidtab_entry *tentry;
        struct class_datum *tclass_datum;
        struct constraint_node *constraint;
        u16 tclass;
        int rc = 0;


        if (!selinux_initialized())
                return 0;

        rcu_read_lock();

        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (!user)
                tclass = unmap_class(&policy->map, orig_tclass);
        else
                tclass = orig_tclass;

        if (!tclass || tclass > policydb->p_classes.nprim) {
                rc = -EINVAL;
                goto out;
        }
        tclass_datum = policydb->class_val_to_struct[tclass - 1];

        /* SIDs are u32: print them unsigned, not with %d. */
        oentry = sidtab_search_entry(sidtab, oldsid);
        if (!oentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, oldsid);
                rc = -EINVAL;
                goto out;
        }

        nentry = sidtab_search_entry(sidtab, newsid);
        if (!nentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, newsid);
                rc = -EINVAL;
                goto out;
        }

        tentry = sidtab_search_entry(sidtab, tasksid);
        if (!tentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, tasksid);
                rc = -EINVAL;
                goto out;
        }

        /* Stop at the first validatetrans constraint that fails. */
        constraint = tclass_datum->validatetrans;
        while (constraint) {
                if (!constraint_expr_eval(policydb, &oentry->context,
                                          &nentry->context, &tentry->context,
                                          constraint->expr)) {
                        if (user)
                                rc = -EPERM;
                        else
                                rc = security_validtrans_handle_fail(policy,
                                                                oentry,
                                                                nentry,
                                                                tentry,
                                                                tclass);
                        goto out;
                }
                constraint = constraint->next;
        }

out:
        rcu_read_unlock();
        return rc;
}

/*
 * security_validate_transition_user - validatetrans check requested by
 * userspace; @tclass is handed to security_compute_validatetrans() with
 * the "user" flag set.
 */
int security_validate_transition_user(u32 oldsid, u32 newsid, u32 tasksid,
                                      u16 tclass)
{
        const bool from_user = true;

        return security_compute_validatetrans(oldsid, newsid, tasksid,
                                              tclass, from_user);
}

/*
 * security_validate_transition - validatetrans check requested by the
 * kernel; @orig_tclass is handed to security_compute_validatetrans() with
 * the "user" flag cleared.
 */
int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
                                 u16 orig_tclass)
{
        const bool from_user = false;

        return security_compute_validatetrans(oldsid, newsid, tasksid,
                                              orig_tclass, from_user);
}

/*
 * security_bounded_transition - check whether the given
 * transition stays within type bounds.
 *
 * @old_sid : current security identifier
 * @new_sid : destination security identifier
 *
 * Returns 0 if SELinux is not initialized, if both SIDs have the same type,
 * or if the type of @old_sid appears in the bounds chain of the type of
 * @new_sid.  Returns -EINVAL for an unrecognized SID and -EPERM (after
 * attempting to audit the denial) when the chain ends without a match.
 */
int security_bounded_transition(u32 old_sid, u32 new_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct sidtab_entry *old_entry, *new_entry;
        struct type_datum *type;
        u32 index;
        int rc;

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        old_entry = sidtab_search_entry(sidtab, old_sid);
        if (!old_entry) {
                pr_err("SELinux: %s: unrecognized SID %u\n",
                       __func__, old_sid);
                rc = -EINVAL;
                goto out;
        }

        new_entry = sidtab_search_entry(sidtab, new_sid);
        if (!new_entry) {
                pr_err("SELinux: %s: unrecognized SID %u\n",
                       __func__, new_sid);
                rc = -EINVAL;
                goto out;
        }

        /* type/domain unchanged */
        if (old_entry->context.type == new_entry->context.type) {
                rc = 0;
                goto out;
        }

        /* Walk up the bounds chain starting at the new type. */
        for (index = new_entry->context.type; ; index = type->bounds) {
                type = policydb->type_val_to_struct[index - 1];
                BUG_ON(!type);

                if (!type->bounds) {
                        /* chain ended without reaching the old type */
                        rc = -EPERM;
                        break;
                }

                if (type->bounds == old_entry->context.type) {
                        /* the new type is bounded by the old type */
                        rc = 0;
                        break;
                }
        }

        if (rc) {
                char *old_name = NULL;
                char *new_name = NULL;
                u32 length;

                /* Only log when both contexts could be rendered. */
                if (!sidtab_entry_to_string(policydb, sidtab, old_entry,
                                            &old_name, &length) &&
                    !sidtab_entry_to_string(policydb, sidtab, new_entry,
                                            &new_name, &length)) {
                        audit_log(audit_context(),
                                  GFP_ATOMIC, AUDIT_SELINUX_ERR,
                                  "op=security_bounded_transition "
                                  "seresult=denied "
                                  "oldcontext=%s newcontext=%s",
                                  old_name, new_name);
                }
                kfree(new_name);
                kfree(old_name);
        }
out:
        rcu_read_unlock();

        return rc;
}

/*
 * avd_init - reset an access vector decision to its default state:
 * nothing allowed, nothing auditallow'ed, every denial audited, no flags.
 * seqno is taken from @policy->latest_granting, or 0 if @policy is NULL.
 */
static void avd_init(struct selinux_policy *policy, struct av_decision *avd)
{
        avd->seqno = policy ? policy->latest_granting : 0;
        avd->flags = 0;
        avd->allowed = 0;
        avd->auditallow = 0;
        avd->auditdeny = 0xffffffff;
}

/*
 * update_xperms_extended_data - merge one rule's extended permissions
 * into @xp_data.
 *
 * A whole-driver rule (AVTAB_XPERMS_IOCTLDRIVER) sets every bit of
 * @xp_data; ioctl-function and nlmsg rules OR the bits of @from into it.
 * Any other @specified value leaves @xp_data untouched.
 */
static void update_xperms_extended_data(u8 specified,
                                        const struct extended_perms_data *from,
                                        struct extended_perms_data *xp_data)
{
        unsigned int word;

        if (specified == AVTAB_XPERMS_IOCTLDRIVER) {
                memset(xp_data->p, 0xff, sizeof(xp_data->p));
                return;
        }

        if (specified != AVTAB_XPERMS_IOCTLFUNCTION &&
            specified != AVTAB_XPERMS_NLMSG)
                return;

        for (word = 0; word < ARRAY_SIZE(xp_data->p); word++)
                xp_data->p[word] |= from->p[word];
}

void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
                                        struct avtab_node *node)
{
        u16 specified;

        switch (node->datum.u.xperms->specified) {
        case AVTAB_XPERMS_IOCTLFUNCTION:
                if (xpermd->base_perm != AVC_EXT_IOCTL ||
                    xpermd->driver != node->datum.u.xperms->driver)
                        return;
                break;
        case AVTAB_XPERMS_IOCTLDRIVER:
                if (xpermd->base_perm != AVC_EXT_IOCTL ||
                    !security_xperm_test(node->datum.u.xperms->perms.p,
                                         xpermd->driver))
                        return;
                break;
        case AVTAB_XPERMS_NLMSG:
                if (xpermd->base_perm != AVC_EXT_NLMSG ||
                    xpermd->driver != node->datum.u.xperms->driver)
                        return;
                break;
        default:
                pr_warn_once(
                        "SELinux: unknown extended permission (%u) will be ignored\n",
                        node->datum.u.xperms->specified);
                return;
        }

        specified = node->key.specified & ~(AVTAB_ENABLED | AVTAB_ENABLED_OLD);

        if (specified == AVTAB_XPERMS_ALLOWED) {
                xpermd->used |= XPERMS_ALLOWED;
                update_xperms_extended_data(node->datum.u.xperms->specified,
                                            &node->datum.u.xperms->perms,
                                            xpermd->allowed);
        } else if (specified == AVTAB_XPERMS_AUDITALLOW) {
                xpermd->used |= XPERMS_AUDITALLOW;
                update_xperms_extended_data(node->datum.u.xperms->specified,
                                            &node->datum.u.xperms->perms,
                                            xpermd->auditallow);
        } else if (specified == AVTAB_XPERMS_DONTAUDIT) {
                xpermd->used |= XPERMS_DONTAUDIT;
                update_xperms_extended_data(node->datum.u.xperms->specified,
                                            &node->datum.u.xperms->perms,
                                            xpermd->dontaudit);
        } else {
                pr_warn_once("SELinux: unknown specified key (%u)\n",
                             node->key.specified);
        }
}

/*
 * security_compute_xperms_decision - compute the extended permission
 * decision for one driver of a (source SID, target SID, class) triple.
 *
 * @ssid:        source security identifier
 * @tsid:        target security identifier
 * @orig_tclass: kernel object class value, translated with unmap_class()
 * @driver:      driver number the decision is requested for
 * @base_perm:   base permission (AVC_EXT_*) the decision is requested for
 * @xpermd:      decision to fill in; its allowed/auditallow/dontaudit
 *               buffers must be provided by the caller
 *
 * @xpermd is always reset first.  If SELinux is not initialized, or the
 * class is unknown to the policy and the policy allows unknown classes,
 * every extended permission is allowed.  An unrecognized SID, an unknown
 * class without allow_unknown, or an out-of-range class leaves the
 * decision empty.
 */
void security_compute_xperms_decision(u32 ssid,
                                      u32 tsid,
                                      u16 orig_tclass,
                                      u8 driver,
                                      u8 base_perm,
                                      struct extended_perms_decision *xpermd)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        u16 tclass;
        struct context *scontext, *tcontext;
        struct avtab_key avkey;
        struct avtab_node *node;
        struct ebitmap *sattr, *tattr;
        struct ebitmap_node *snode, *tnode;
        unsigned int i, j;

        xpermd->base_perm = base_perm;
        xpermd->driver = driver;
        xpermd->used = 0;
        memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p));
        memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p));
        memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p));

        rcu_read_lock();
        if (!selinux_initialized())
                goto allow;

        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* SIDs are u32: print them unsigned, not with %d. */
        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, ssid);
                goto out;
        }

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, tsid);
                goto out;
        }

        /* A kernel class the policy does not define maps to 0. */
        tclass = unmap_class(&policy->map, orig_tclass);
        if (unlikely(orig_tclass && !tclass)) {
                if (policydb->allow_unknown)
                        goto allow;
                goto out;
        }


        if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) {
                pr_warn_ratelimited("SELinux:  Invalid class %hu\n", tclass);
                goto out;
        }

        /*
         * Visit every extended-permission rule that applies to any
         * (source attribute, target attribute) pair of the two types.
         */
        avkey.target_class = tclass;
        avkey.specified = AVTAB_XPERMS;
        sattr = &policydb->type_attr_map_array[scontext->type - 1];
        tattr = &policydb->type_attr_map_array[tcontext->type - 1];
        ebitmap_for_each_positive_bit(sattr, snode, i) {
                ebitmap_for_each_positive_bit(tattr, tnode, j) {
                        avkey.source_type = i + 1;
                        avkey.target_type = j + 1;
                        for (node = avtab_search_node(&policydb->te_avtab,
                                                      &avkey);
                             node;
                             node = avtab_search_node_next(node, avkey.specified))
                                services_compute_xperms_decision(xpermd, node);

                        /* Conditional rules may contribute as well. */
                        cond_compute_xperms(&policydb->te_cond_avtab,
                                                &avkey, xpermd);
                }
        }
out:
        rcu_read_unlock();
        return;
allow:
        /* Grant everything, then leave through the common unlock path. */
        memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p));
        goto out;
}

/**
 * security_compute_av - Compute access vector decisions.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @orig_tclass: target security class; it is translated with unmap_class()
 *               before the policy is consulted
 * @avd: access vector decisions
 * @xperms: extended permissions
 *
 * Compute a set of access vector decisions based on the
 * SID pair (@ssid, @tsid) for the permissions in @orig_tclass.
 *
 * @avd is always initialised with avd_init() and @xperms->len is reset to 0
 * before any lookup happens.  @avd->allowed is set to 0xffffffff when
 * SELinux is not initialised yet, when @avd->flags ends up being exactly
 * AVD_FLAGS_PERMISSIVE | AVD_FLAGS_NEVERAUDIT, or when the class cannot be
 * unmapped and the policy has allow_unknown set.  An unrecognized SID is
 * logged and no decision is computed for it.  If AVD_FLAGS_NEVERAUDIT is
 * set on exit, @avd->auditallow and @avd->auditdeny are cleared.
 */
void security_compute_av(u32 ssid,
                         u32 tsid,
                         u16 orig_tclass,
                         struct av_decision *avd,
                         struct extended_perms *xperms)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        u16 tclass;
        struct context *scontext = NULL, *tcontext = NULL;

        /* The policy pointer is only dereferenced inside this RCU section. */
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        avd_init(policy, avd);
        xperms->len = 0;
        if (!selinux_initialized())
                goto allow;

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, ssid);
                goto out;
        }

        /* permissive domain? */
        if (ebitmap_get_bit(&policydb->permissive_map, scontext->type))
                avd->flags |= AVD_FLAGS_PERMISSIVE;

        /* neveraudit domain? */
        if (ebitmap_get_bit(&policydb->neveraudit_map, scontext->type))
                avd->flags |= AVD_FLAGS_NEVERAUDIT;

        /* both permissive and neveraudit => allow */
        if (avd->flags == (AVD_FLAGS_PERMISSIVE|AVD_FLAGS_NEVERAUDIT))
                goto allow;

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, tsid);
                goto out;
        }

        /*
         * A non-zero class that unmaps to 0 is not known to the policy:
         * allow everything if the policy says so, otherwise keep the
         * decision produced by avd_init().
         */
        tclass = unmap_class(&policy->map, orig_tclass);
        if (unlikely(orig_tclass && !tclass)) {
                if (policydb->allow_unknown)
                        goto allow;
                goto out;
        }
        context_struct_compute_av(policydb, scontext, tcontext, tclass, avd,
                                  xperms);
        /* Translate the decision back in terms of the caller's class value. */
        map_decision(&policy->map, orig_tclass, avd,
                     policydb->allow_unknown);
out:
        rcu_read_unlock();
        /* Applies to every exit path, including the "allow" one below. */
        if (avd->flags & AVD_FLAGS_NEVERAUDIT)
                avd->auditallow = avd->auditdeny = 0;
        return;
allow:
        avd->allowed = 0xffffffff;
        goto out;
}

/**
 * security_compute_av_user - Compute access vector decisions for a class
 *                            value that is used as given.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class; unlike security_compute_av() it is passed
 *          to the policy without going through unmap_class()
 * @avd: access vector decisions
 *
 * Follows the same steps as security_compute_av(), except that no extended
 * permissions are requested (NULL is passed to context_struct_compute_av())
 * and the result is not post-processed by map_decision().
 *
 * @avd is always initialised with avd_init().  @avd->allowed is set to
 * 0xffffffff when SELinux is not initialised yet, when @avd->flags ends up
 * being exactly AVD_FLAGS_PERMISSIVE | AVD_FLAGS_NEVERAUDIT, or when @tclass
 * is 0 and the policy has allow_unknown set.  If AVD_FLAGS_NEVERAUDIT is set
 * on exit, @avd->auditallow and @avd->auditdeny are cleared.
 */
void security_compute_av_user(u32 ssid,
                              u32 tsid,
                              u16 tclass,
                              struct av_decision *avd)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *scontext = NULL, *tcontext = NULL;

        /* The policy pointer is only dereferenced inside this RCU section. */
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        avd_init(policy, avd);
        if (!selinux_initialized())
                goto allow;

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, ssid);
                goto out;
        }

        /* permissive domain? */
        if (ebitmap_get_bit(&policydb->permissive_map, scontext->type))
                avd->flags |= AVD_FLAGS_PERMISSIVE;

        /* neveraudit domain? */
        if (ebitmap_get_bit(&policydb->neveraudit_map, scontext->type))
                avd->flags |= AVD_FLAGS_NEVERAUDIT;

        /* both permissive and neveraudit => allow */
        if (avd->flags == (AVD_FLAGS_PERMISSIVE|AVD_FLAGS_NEVERAUDIT))
                goto allow;

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, tsid);
                goto out;
        }

        /* Class 0: allow everything only if the policy has allow_unknown. */
        if (unlikely(!tclass)) {
                if (policydb->allow_unknown)
                        goto allow;
                goto out;
        }

        context_struct_compute_av(policydb, scontext, tcontext, tclass, avd,
                                  NULL);
 out:
        rcu_read_unlock();
        /* Applies to every exit path, including the "allow" one below. */
        if (avd->flags & AVD_FLAGS_NEVERAUDIT)
                avd->auditallow = avd->auditdeny = 0;
        return;
allow:
        avd->allowed = 0xffffffff;
        goto out;
}

/*
 * Produce the string form of the context structure `context'.
 *
 * `*scontext_len' always receives the length of the string.  When
 * `scontext' is non-NULL, `*scontext' receives a freshly allocated
 * (GFP_ATOMIC) copy of the string that the caller must free; when
 * `scontext' is NULL only the length is computed.
 *
 * A context that carries its own string (context->len != 0) is
 * duplicated as is; otherwise the string is assembled as
 * "user:role:type" followed by whatever mls_sid_to_context() appends.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
static int context_struct_to_string(struct policydb *p,
                                    struct context *context,
                                    char **scontext, u32 *scontext_len)
{
        const char *user_name, *role_name, *type_name;
        char *buf, *cursor;
        u32 total = 0;

        if (scontext)
                *scontext = NULL;
        *scontext_len = 0;

        /* The context already has a string attached: hand out a copy. */
        if (context->len) {
                *scontext_len = context->len;
                if (!scontext)
                        return 0;
                *scontext = kstrdup(context->str, GFP_ATOMIC);
                return *scontext ? 0 : -ENOMEM;
        }

        user_name = sym_name(p, SYM_USERS, context->user - 1);
        role_name = sym_name(p, SYM_ROLES, context->role - 1);
        type_name = sym_name(p, SYM_TYPES, context->type - 1);

        /* Each name is followed by one extra byte in the size budget. */
        total += strlen(user_name) + 1;
        total += strlen(role_name) + 1;
        total += strlen(type_name) + 1;
        total += mls_compute_context_len(p, context);
        *scontext_len = total;

        /* Length-only query. */
        if (!scontext)
                return 0;

        /* The caller owns (and must free) this buffer. */
        buf = kmalloc(*scontext_len, GFP_ATOMIC);
        if (!buf)
                return -ENOMEM;
        *scontext = buf;

        /* user:role:type first, then let the MLS code append its part. */
        cursor = buf + sprintf(buf, "%s:%s:%s",
                               user_name, role_name, type_name);
        mls_sid_to_context(p, context, &cursor);
        *cursor = 0;

        return 0;
}

/*
 * Get the string form of the context held by a sidtab entry.
 *
 * sidtab_sid2str_get() is tried first; only when it reports -ENOENT is
 * the string built with context_struct_to_string().  A freshly built
 * string is offered to sidtab_sid2str_put() (presumably a string cache
 * kept by the sidtab - confirm against sidtab.c).
 */
static int sidtab_entry_to_string(struct policydb *p,
                                  struct sidtab *sidtab,
                                  struct sidtab_entry *entry,
                                  char **scontext, u32 *scontext_len)
{
        int rc;

        rc = sidtab_sid2str_get(sidtab, entry, scontext, scontext_len);
        if (rc == -ENOENT) {
                rc = context_struct_to_string(p, &entry->context, scontext,
                                              scontext_len);
                if (rc == 0 && scontext != NULL)
                        sidtab_sid2str_put(sidtab, entry, *scontext,
                                           *scontext_len);
        }

        return rc;
}

#include "initial_sid_to_string.h"

/*
 * Have sidtab_hash_stats() report on the active policy's sidtab into @page.
 *
 * Returns -EINVAL (after logging) if no policy has been loaded yet,
 * otherwise whatever sidtab_hash_stats() returns.
 */
int security_sidtab_hash_stats(char *page)
{
        struct selinux_policy *active;
        int ret;

        if (selinux_initialized()) {
                rcu_read_lock();
                active = rcu_dereference(selinux_state.policy);
                ret = sidtab_hash_stats(active->sidtab, page);
                rcu_read_unlock();
                return ret;
        }

        pr_err("SELinux: %s:  called before initial load_policy\n",
               __func__);
        return -EINVAL;
}

/*
 * Look up the initial_sid_to_string[] entry for @sid.
 * Returns NULL for any @sid above SECINITSID_NUM.
 */
const char *security_get_initial_sid_context(u32 sid)
{
        return unlikely(sid > SECINITSID_NUM) ? NULL
                                              : initial_sid_to_string[sid];
}

/*
 * Common implementation behind the security_sid_to_context*() helpers.
 *
 * @sid:          SID to translate
 * @scontext:     receives an allocated string the caller must free, or may
 *                be NULL when only the length is wanted
 * @scontext_len: receives the string length
 * @force:        non-zero selects sidtab_search_entry_force() instead of
 *                sidtab_search_entry() for the lookup
 * @only_invalid: non-zero makes the function return 0 without producing a
 *                string when the entry's context has no attached string
 *                (context.len == 0)
 *
 * Returns 0 on success, -EINVAL for an unknown SID, -ENOMEM on allocation
 * failure, or the error reported by sidtab_entry_to_string().
 */
static int security_sid_to_context_core(u32 sid, char **scontext,
                                        u32 *scontext_len, int force,
                                        int only_invalid)
{
        struct selinux_policy *policy;
        struct sidtab_entry *entry;
        int rc;

        if (scontext)
                *scontext = NULL;
        *scontext_len = 0;

        if (!selinux_initialized()) {
                const char *name;
                char *copy;

                /* Without a policy only the initial SIDs can be named. */
                if (sid > SECINITSID_NUM) {
                        pr_err("SELinux: %s:  called before initial "
                               "load_policy on unknown SID %d\n", __func__, sid);
                        return -EINVAL;
                }

                /*
                 * Before the policy is loaded, translate
                 * SECINITSID_INIT to "kernel", because systemd and
                 * libselinux < 2.6 take a getcon_raw() result that is
                 * both non-null and not "kernel" to mean that a policy
                 * is already loaded.
                 */
                name = initial_sid_to_string[sid == SECINITSID_INIT ?
                                             SECINITSID_KERNEL : sid];
                if (!name)
                        return -EINVAL;

                *scontext_len = strlen(name) + 1;
                if (!scontext)
                        return 0;

                copy = kmemdup(name, *scontext_len, GFP_ATOMIC);
                if (!copy)
                        return -ENOMEM;
                *scontext = copy;
                return 0;
        }

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);

        entry = force ? sidtab_search_entry_force(policy->sidtab, sid)
                      : sidtab_search_entry(policy->sidtab, sid);

        if (!entry) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                        __func__, sid);
                rc = -EINVAL;
        } else if (only_invalid && !entry->context.len) {
                /* Caller only wants string-backed contexts. */
                rc = 0;
        } else {
                rc = sidtab_entry_to_string(&policy->policydb, policy->sidtab,
                                            entry, scontext, scontext_len);
        }

        rcu_read_unlock();
        return rc;
}

/**
 * security_sid_to_context - Obtain a context for a given SID.
 * @sid: security identifier, SID
 * @scontext: security context
 * @scontext_len: length in bytes
 *
 * Produce the string representation of the context associated with @sid
 * in a dynamically allocated buffer of the correct size.  @scontext is set
 * to point to that buffer and @scontext_len to the length of the string.
 * Uses the regular (non-forced) sidtab lookup.
 */
int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len)
{
        const int force = 0;
        const int only_invalid = 0;

        return security_sid_to_context_core(sid, scontext, scontext_len,
                                            force, only_invalid);
}

/*
 * Same as security_sid_to_context(), but asks
 * security_sid_to_context_core() for the forced sidtab lookup.
 */
int security_sid_to_context_force(u32 sid,
                                  char **scontext, u32 *scontext_len)
{
        const int force = 1;
        const int only_invalid = 0;

        return security_sid_to_context_core(sid, scontext, scontext_len,
                                            force, only_invalid);
}

/**
 * security_sid_to_context_inval - Obtain a context for a given SID if it
 *                                 is invalid.
 * @sid: security identifier, SID
 * @scontext: security context
 * @scontext_len: length in bytes
 *
 * Produce the string representation of the context associated with @sid
 * in a dynamically allocated buffer of the correct size, but only if the
 * context is invalid in the current policy.  For a valid context
 * @scontext is left NULL and @scontext_len is left 0; otherwise they
 * receive the buffer and the length of the string.
 * Uses the forced sidtab lookup.
 */
int security_sid_to_context_inval(u32 sid,
                                  char **scontext, u32 *scontext_len)
{
        const int force = 1;
        const int only_invalid = 1;

        return security_sid_to_context_core(sid, scontext, scontext_len,
                                            force, only_invalid);
}

/*
 * Parse the security context string `scontext' ("user:role:type" plus an
 * optional part handed to the MLS code) into `ctx' using policy `pol'.
 *
 * Caveat:  Mutates scontext.  Every ':' separator that is consumed (and
 * the terminator after the type) is overwritten with NUL, so the buffer
 * cannot be parsed a second time without restoring it first.
 *
 * `sidtabp' and `def_sid' are only passed on to mls_context_to_sid().
 *
 * Returns 0 on success.  Returns -EINVAL when the user or role is not
 * followed by ':', when the user, role or type is not found in `pol',
 * when the type is an attribute, or when the resulting context fails
 * policydb_context_isvalid().  Errors from mls_context_to_sid() are
 * returned unchanged.  On any error `ctx' is released with
 * context_destroy().
 */
static int string_to_context_struct(struct policydb *pol,
                                    struct sidtab *sidtabp,
                                    char *scontext,
                                    struct context *ctx,
                                    u32 def_sid)
{
        struct role_datum *role;
        struct type_datum *typdatum;
        struct user_datum *usrdatum;
        char *scontextp, *p, oldc;
        int rc = 0;

        context_init(ctx);

        /* Parse the security context. */

        /* Default result for every early "goto out" below. */
        rc = -EINVAL;
        scontextp = scontext;

        /* Extract the user. */
        p = scontextp;
        while (*p && *p != ':')
                p++;

        /* A user with nothing after it is not a complete context. */
        if (*p == 0)
                goto out;

        /* Terminate the user name in place and step past the separator. */
        *p++ = 0;

        usrdatum = symtab_search(&pol->p_users, scontextp);
        if (!usrdatum)
                goto out;

        ctx->user = usrdatum->value;

        /* Extract role. */
        scontextp = p;
        while (*p && *p != ':')
                p++;

        /* A role with nothing after it is not a complete context. */
        if (*p == 0)
                goto out;

        *p++ = 0;

        role = symtab_search(&pol->p_roles, scontextp);
        if (!role)
                goto out;
        ctx->role = role->value;

        /* Extract type. */
        scontextp = p;
        while (*p && *p != ':')
                p++;
        /*
         * The type may be the last field, so remember which character
         * ended it (':' or NUL) before overwriting it.  When it was NUL,
         * p ends up one past the end of the string; oldc is handed to
         * mls_context_to_sid() along with p - presumably so that it can
         * tell the two cases apart (verify in mls.c).
         */
        oldc = *p;
        *p++ = 0;

        typdatum = symtab_search(&pol->p_types, scontextp);
        /* Attributes live in the same table but are not usable as a type. */
        if (!typdatum || typdatum->attribute)
                goto out;

        ctx->type = typdatum->value;

        rc = mls_context_to_sid(pol, oldc, p, ctx, sidtabp, def_sid);
        if (rc)
                goto out;

        /* Check the validity of the new context. */
        rc = -EINVAL;
        if (!policydb_context_isvalid(pol, ctx))
                goto out;
        rc = 0;
out:
        if (rc)
                context_destroy(ctx);
        return rc;
}

/*
 * Common implementation behind the security_context_to_sid*() helpers.
 *
 * @scontext:     context string; need not be NUL-terminated
 * @scontext_len: number of bytes of @scontext to use
 * @sid:          receives the resulting SID
 * @def_sid:      passed on to string_to_context_struct()
 * @gfp_flags:    allocation flags for the temporary string copies
 * @force:        non-zero means a context that fails to parse with -EINVAL
 *                is still entered into the sidtab, carried as its raw
 *                string
 *
 * Before a policy is loaded the string is only compared against the
 * initial SID names; anything else maps to SECINITSID_KERNEL and 0 is
 * returned.
 *
 * Returns 0 on success, -EINVAL for an empty or (without @force) invalid
 * context, -ENOMEM on allocation failure, or the error reported by
 * sidtab_context_to_sid().  A -ESTALE result from the latter is never
 * returned; the whole lookup is redone instead.
 */
static int security_context_to_sid_core(const char *scontext, u32 scontext_len,
                                        u32 *sid, u32 def_sid, gfp_t gfp_flags,
                                        int force)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        char *scontext2, *str = NULL;
        struct context context;
        int rc = 0;

        /* An empty security context is never valid. */
        if (!scontext_len)
                return -EINVAL;

        /* Copy the string to allow changes and ensure a NUL terminator */
        scontext2 = kmemdup_nul(scontext, scontext_len, gfp_flags);
        if (!scontext2)
                return -ENOMEM;

        if (!selinux_initialized()) {
                u32 i;

                /*
                 * NOTE(review): the loop stops before SECINITSID_NUM,
                 * while security_get_initial_sid_context() accepts
                 * sid == SECINITSID_NUM - confirm this is intended.
                 */
                for (i = 1; i < SECINITSID_NUM; i++) {
                        const char *s = initial_sid_to_string[i];

                        if (s && !strcmp(s, scontext2)) {
                                *sid = i;
                                goto out;
                        }
                }
                *sid = SECINITSID_KERNEL;
                goto out;
        }
        *sid = SECSID_NULL;

        if (force) {
                /* Save another copy for storing in uninterpreted form */
                rc = -ENOMEM;
                str = kstrdup(scontext2, gfp_flags);
                if (!str)
                        goto out;
        }
retry:
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;
        rc = string_to_context_struct(policydb, sidtab, scontext2,
                                      &context, def_sid);
        if (rc == -EINVAL && force) {
                /* Ownership of str moves into the context. */
                context.str = str;
                context.len = strlen(str) + 1;
                str = NULL;
        } else if (rc)
                goto out_unlock;
        rc = sidtab_context_to_sid(sidtab, &context, sid);
        if (rc == -ESTALE) {
                rcu_read_unlock();
                /* Take the raw string back so it survives context_destroy(). */
                if (context.str) {
                        str = context.str;
                        context.str = NULL;
                }
                context_destroy(&context);
                /*
                 * string_to_context_struct() overwrote the separators in
                 * scontext2 with NULs.  Restore the caller's text before
                 * parsing again, otherwise the retry would see a truncated
                 * string and fail with -EINVAL (or, with force, record a
                 * valid context as an uninterpreted string).  The buffer
                 * holds scontext_len bytes plus the terminator added by
                 * kmemdup_nul().
                 */
                memcpy(scontext2, scontext, scontext_len);
                scontext2[scontext_len] = '\0';
                goto retry;
        }
        context_destroy(&context);
out_unlock:
        rcu_read_unlock();
out:
        kfree(scontext2);
        kfree(str);
        return rc;
}

/**
 * security_context_to_sid - Obtain a SID for a given security context.
 * @scontext: security context
 * @scontext_len: length in bytes
 * @sid: security identifier, SID
 * @gfp: context for the allocation
 *
 * Look up the SID that belongs to the security context whose string
 * representation is given by @scontext.  No default SID is supplied and
 * the lookup is not forced.
 * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient
 * memory is available, or 0 on success.
 */
int security_context_to_sid(const char *scontext, u32 scontext_len, u32 *sid,
                            gfp_t gfp)
{
        const int force = 0;

        return security_context_to_sid_core(scontext, scontext_len, sid,
                                            SECSID_NULL, gfp, force);
}

/*
 * Variant of security_context_to_sid() for a NUL-terminated @scontext:
 * the length is taken with strlen().
 */
int security_context_str_to_sid(const char *scontext, u32 *sid, gfp_t gfp)
{
        u32 scontext_len = strlen(scontext);

        return security_context_to_sid(scontext, scontext_len, sid, gfp);
}

/**
 * security_context_to_sid_default - Obtain a SID for a given security context,
 * falling back to specified default if needed.
 *
 * @scontext: security context
 * @scontext_len: length in bytes
 * @sid: security identifier, SID
 * @def_sid: default SID to assign on error
 * @gfp_flags: the allocator get-free-page (GFP) flags
 *
 * Look up the SID that belongs to the security context whose string
 * representation is given by @scontext.
 * @def_sid is handed down to the MLS layer so the kernel can label the
 * MLS field when the string does not carry one (for upgrading to MLS
 * without a full relabel).
 * The lookup is forced: the context is added even if it cannot be mapped
 * yet.
 * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient
 * memory is available, or 0 on success.
 */
int security_context_to_sid_default(const char *scontext, u32 scontext_len,
                                    u32 *sid, u32 def_sid, gfp_t gfp_flags)
{
        const int force = 1;

        return security_context_to_sid_core(scontext, scontext_len, sid,
                                            def_sid, gfp_flags, force);
}

/*
 * Forced variant of security_context_to_sid(): no default SID is given
 * and allocations are made with GFP_KERNEL.
 */
int security_context_to_sid_force(const char *scontext, u32 scontext_len,
                                  u32 *sid)
{
        const int force = 1;

        return security_context_to_sid_core(scontext, scontext_len, sid,
                                            SECSID_NULL, GFP_KERNEL, force);
}

/*
 * Emit an AUDIT_SELINUX_ERR record for a computed context that is not
 * valid, naming the source context, the target context, the class and the
 * offending new context.  The record is skipped if any of the strings
 * cannot be produced or no audit buffer is available.
 *
 * Returns -EACCES when enforcing is enabled, 0 otherwise, regardless of
 * whether the record could be written.
 */
static int compute_sid_handle_invalid_context(
        struct selinux_policy *policy,
        struct sidtab_entry *sentry,
        struct sidtab_entry *tentry,
        u16 tclass,
        struct context *newcontext)
{
        struct policydb *pdb = &policy->policydb;
        struct sidtab *stab = policy->sidtab;
        char *src_str = NULL, *tgt_str = NULL, *new_str = NULL;
        u32 src_len, tgt_len, new_len;
        struct audit_buffer *ab;

        if (sidtab_entry_to_string(pdb, stab, sentry, &src_str, &src_len) ||
            sidtab_entry_to_string(pdb, stab, tentry, &tgt_str, &tgt_len) ||
            context_struct_to_string(pdb, newcontext, &new_str, &new_len))
                goto free_strings;

        ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR);
        if (ab) {
                audit_log_format(ab,
                                 "op=security_compute_sid invalid_context=");
                /* the trailing NUL is not part of the untrusted string */
                audit_log_n_untrustedstring(ab, new_str, new_len - 1);
                audit_log_format(ab, " scontext=%s tcontext=%s tclass=%s",
                                 src_str, tgt_str,
                                 sym_name(pdb, SYM_CLASSES, tclass-1));
                audit_log_end(ab);
        }

free_strings:
        kfree(src_str);
        kfree(tgt_str);
        kfree(new_str);
        return enforcing_enabled() ? -EACCES : 0;
}

/*
 * Apply a filename type-transition rule, if one matches.
 *
 * Searches the policy for rules keyed on (@ttype, @tclass, @objname) and
 * walks the returned chain; the first entry whose source-type bitmap
 * contains @stype supplies the new type for @newcontext.  @newcontext is
 * left untouched when nothing matches.
 */
static void filename_compute_type(struct policydb *policydb,
                                  struct context *newcontext,
                                  u32 stype, u32 ttype, u16 tclass,
                                  const char *objname)
{
        struct filename_trans_key key;
        struct filename_trans_datum *rule;

        /*
         * Filename transition rules tend to be concentrated on a few
         * target types (directories such as /dev or /var/run), so test
         * the per-ttype bitmap first and skip the search when this
         * ttype has no rules at all.
         */
        if (!ebitmap_get_bit(&policydb->filename_trans_ttypes, ttype))
                return;

        key.name = objname;
        key.tclass = tclass;
        key.ttype = ttype;

        for (rule = policydb_filenametr_search(policydb, &key);
             rule;
             rule = rule->next) {
                if (!ebitmap_get_bit(&rule->stypes, stype - 1))
                        continue;
                newcontext->type = rule->otype;
                break;
        }
}

/*
 * Compute the SID for a new or relabeled subject/object.  This is the
 * common implementation behind the security_transition_sid*(),
 * security_member_sid() and security_change_sid() helpers.
 *
 * @ssid:        source security identifier
 * @tsid:        target security identifier
 * @orig_tclass: target security class as supplied by the caller
 * @specified:   kind of rule to look for (AVTAB_TRANSITION, AVTAB_MEMBER
 *               or AVTAB_CHANGE)
 * @objname:     object name for filename transition rules, or NULL
 * @out_sid:     receives the computed SID
 * @kern:        true if @orig_tclass has to go through unmap_class()
 *               before use; false if it is used as given
 *
 * Before a policy is loaded, *@out_sid is @ssid for SECCLASS_PROCESS and
 * @tsid for every other class, and 0 is returned.
 *
 * Returns 0 on success, -EINVAL for an unrecognized SID, or the error
 * reported by mls_compute_sid(), compute_sid_handle_invalid_context() or
 * sidtab_context_to_sid().  A -ESTALE result from the latter is never
 * returned; the whole computation is redone instead.
 */
static int security_compute_sid(u32 ssid,
                                u32 tsid,
                                u16 orig_tclass,
                                u16 specified,
                                const char *objname,
                                u32 *out_sid,
                                bool kern)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct class_datum *cladatum;
        struct context *scontext, *tcontext, newcontext;
        struct sidtab_entry *sentry, *tentry;
        struct avtab_key avkey;
        struct avtab_node *avnode, *node;
        u16 tclass;
        int rc = 0;
        bool sock;

        if (!selinux_initialized()) {
                switch (orig_tclass) {
                case SECCLASS_PROCESS: /* kernel value */
                        *out_sid = ssid;
                        break;
                default:
                        *out_sid = tsid;
                        break;
                }
                goto out;
        }

retry:
        /* Everything below is recomputed from scratch on each attempt. */
        cladatum = NULL;
        context_init(&newcontext);

        rcu_read_lock();

        policy = rcu_dereference(selinux_state.policy);

        /*
         * security_is_socket_class() is always given the kernel-side class
         * value: orig_tclass itself for kernel callers, the map_class()
         * translation of it otherwise.
         */
        if (kern) {
                tclass = unmap_class(&policy->map, orig_tclass);
                sock = security_is_socket_class(orig_tclass);
        } else {
                tclass = orig_tclass;
                sock = security_is_socket_class(map_class(&policy->map,
                                                          tclass));
        }

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        sentry = sidtab_search_entry(sidtab, ssid);
        if (!sentry) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, ssid);
                rc = -EINVAL;
                goto out_unlock;
        }
        tentry = sidtab_search_entry(sidtab, tsid);
        if (!tentry) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, tsid);
                rc = -EINVAL;
                goto out_unlock;
        }

        scontext = &sentry->context;
        tcontext = &tentry->context;

        /* cladatum stays NULL for a class outside 1..nprim. */
        if (tclass && tclass <= policydb->p_classes.nprim)
                cladatum = policydb->class_val_to_struct[tclass - 1];

        /* Set the user identity. */
        switch (specified) {
        case AVTAB_TRANSITION:
        case AVTAB_CHANGE:
                if (cladatum && cladatum->default_user == DEFAULT_TARGET) {
                        newcontext.user = tcontext->user;
                } else {
                        /* notice this gets both DEFAULT_SOURCE and unset */
                        /* Use the process user identity. */
                        newcontext.user = scontext->user;
                }
                break;
        case AVTAB_MEMBER:
                /* Use the related object owner. */
                newcontext.user = tcontext->user;
                break;
        }

        /* Set the role to default values. */
        if (cladatum && cladatum->default_role == DEFAULT_SOURCE) {
                newcontext.role = scontext->role;
        } else if (cladatum && cladatum->default_role == DEFAULT_TARGET) {
                newcontext.role = tcontext->role;
        } else {
                /* No class default: processes and sockets keep the source role. */
                if ((tclass == policydb->process_class) || sock)
                        newcontext.role = scontext->role;
                else
                        newcontext.role = OBJECT_R_VAL;
        }

        /* Set the type.
         * Look for a type transition/member/change rule.
         */
        avkey.source_type = scontext->type;
        avkey.target_type = tcontext->type;
        avkey.target_class = tclass;
        avkey.specified = specified;
        avnode = avtab_search_node(&policydb->te_avtab, &avkey);

        /* If no permanent rule, also check for enabled conditional rules */
        if (!avnode) {
                node = avtab_search_node(&policydb->te_cond_avtab, &avkey);
                for (; node; node = avtab_search_node_next(node, specified)) {
                        if (node->key.specified & AVTAB_ENABLED) {
                                avnode = node;
                                break;
                        }
                }
        }

        /* If a rule was found (permanent, or an enabled conditional one),
         * use the type from the type transition/member/change rule.
         * Otherwise, set the type to its default values.
         */
        if (avnode) {
                newcontext.type = avnode->datum.u.data;
        } else if (cladatum && cladatum->default_type == DEFAULT_SOURCE) {
                newcontext.type = scontext->type;
        } else if (cladatum && cladatum->default_type == DEFAULT_TARGET) {
                newcontext.type = tcontext->type;
        } else {
                if ((tclass == policydb->process_class) || sock) {
                        /* Use the type of process. */
                        newcontext.type = scontext->type;
                } else {
                        /* Use the type of the related object. */
                        newcontext.type = tcontext->type;
                }
        }

        /*
         * If we have an objname this is a file trans check, so check those
         * rules; a matching rule overrides the type chosen above.
         */
        if (objname)
                filename_compute_type(policydb, &newcontext, scontext->type,
                                      tcontext->type, tclass, objname);

        /* Check for class-specific changes. */
        if (specified & AVTAB_TRANSITION) {
                /* Look for a role transition rule. */
                struct role_trans_datum *rtd;
                struct role_trans_key rtk = {
                        .role = scontext->role,
                        .type = tcontext->type,
                        .tclass = tclass,
                };

                rtd = policydb_roletr_search(policydb, &rtk);
                if (rtd)
                        newcontext.role = rtd->new_role;
        }

        /*
         * Set the MLS attributes.
         * This is done last because it may allocate memory.
         */
        rc = mls_compute_sid(policydb, scontext, tcontext, tclass, specified,
                             &newcontext, sock);
        if (rc)
                goto out_unlock;

        /*
         * Check the validity of the context.  The handler returns 0 when
         * not enforcing, in which case the invalid context is still used.
         */
        if (!policydb_context_isvalid(policydb, &newcontext)) {
                rc = compute_sid_handle_invalid_context(policy, sentry,
                                                        tentry, tclass,
                                                        &newcontext);
                if (rc)
                        goto out_unlock;
        }
        /*
         * Obtain the sid for the context.  If the result equals the source
         * or target context, reuse that SID and skip the sidtab lookup.
         */
        if (context_equal(scontext, &newcontext))
                *out_sid = ssid;
        else if (context_equal(tcontext, &newcontext))
                *out_sid = tsid;
        else {
                rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid);
                if (rc == -ESTALE) {
                        /* Drop the lock and the partial result, then redo. */
                        rcu_read_unlock();
                        context_destroy(&newcontext);
                        goto retry;
                }
        }
out_unlock:
        rcu_read_unlock();
        context_destroy(&newcontext);
out:
        return rc;
}

/**
 * security_transition_sid - Compute the SID for a new subject/object.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @qstr: object name
 * @out_sid: security identifier for new subject/object
 *
 * Work out which SID should label a new subject or object of class
 * @tclass, given the SID pair (@ssid, @tsid).  @qstr may be NULL, in
 * which case no object name is used.  @tclass is passed on as a kernel
 * class value.
 * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the new SID was
 * computed successfully.
 */
int security_transition_sid(u32 ssid, u32 tsid, u16 tclass,
                            const struct qstr *qstr, u32 *out_sid)
{
        const bool kern = true;

        return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION,
                                    qstr ? qstr->name : NULL, out_sid, kern);
}

/*
 * Like security_transition_sid(), but the object name is a plain string
 * and @tclass is passed on as a non-kernel class value (kern == false).
 */
int security_transition_sid_user(u32 ssid, u32 tsid, u16 tclass,
                                 const char *objname, u32 *out_sid)
{
        const bool kern = false;

        return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION,
                                    objname, out_sid, kern);
}

/**
 * security_member_sid - Compute the SID for member selection.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @out_sid: security identifier for selected member
 *
 * Work out which SID to use when picking a member of a polyinstantiated
 * object of class @tclass, given the SID pair (@ssid, @tsid).
 * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the SID was
 * computed successfully.
 */
int security_member_sid(u32 ssid,
                        u32 tsid,
                        u16 tclass,
                        u32 *out_sid)
{
        const char *no_objname = NULL;
        const bool kern = false;

        return security_compute_sid(ssid, tsid, tclass, AVTAB_MEMBER,
                                    no_objname, out_sid, kern);
}

/**
 * security_change_sid - Compute the SID for object relabeling.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @out_sid: security identifier for selected member
 *
 * Work out which SID to use when relabeling an object of class @tclass,
 * given the SID pair (@ssid, @tsid).
 * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the SID was
 * computed successfully.
 */
int security_change_sid(u32 ssid,
                        u32 tsid,
                        u16 tclass,
                        u32 *out_sid)
{
        const char *no_objname = NULL;
        const bool kern = false;

        return security_compute_sid(ssid, tsid, tclass, AVTAB_CHANGE,
                                    no_objname, out_sid, kern);
}

/*
 * Decide what to do with a context that is invalid.
 *
 * When enforcing is enabled the context is rejected with -EINVAL.
 * Otherwise a warning naming the context is printed (if its string form
 * can be produced) and 0 is returned.
 */
static inline int convert_context_handle_invalid_context(
        struct policydb *policydb,
        struct context *context)
{
        char *str;
        u32 str_len;
        int rc;

        if (enforcing_enabled())
                return -EINVAL;

        rc = context_struct_to_string(policydb, context, &str, &str_len);
        if (rc)
                return 0;

        pr_warn("SELinux:  Context %s would be invalid if enforcing\n",
                str);
        kfree(str);
        return 0;
}

/**
 * services_convert_context - Convert a security context across policies.
 * @args: populated convert_context_args struct
 * @oldc: original context
 * @newc: converted context
 * @gfp_flags: allocation flags
 *
 * Convert the values in the security context structure @oldc from the values
 * specified in the policy @args->oldp to the values specified in the policy
 * @args->newp, storing the new context in @newc, and verifying that the
 * context is valid under the new policy.
 *
 * A context that cannot be expressed in the new policy is not reported as an
 * error: its string form is stored in @newc->str / @newc->len instead and 0
 * is returned.
 *
 * Return: 0 on success (including the string-only outcome described above),
 * -ENOMEM if duplicating @oldc->str fails, or the error returned by
 * string_to_context_struct() or context_struct_to_string().
 */
int services_convert_context(struct convert_context_args *args,
                             struct context *oldc, struct context *newc,
                             gfp_t gfp_flags)
{
        struct ocontext *oc;
        struct role_datum *role;
        struct type_datum *typdatum;
        struct user_datum *usrdatum;
        char *s;
        u32 len;
        int rc;

        /*
         * The old context is held only in string form: try to parse that
         * string against the new policy.
         */
        if (oldc->str) {
                /* Work on a private copy of the string. */
                s = kstrdup(oldc->str, gfp_flags);
                if (!s)
                        return -ENOMEM;

                rc = string_to_context_struct(args->newp, NULL, s, newc, SECSID_NULL);
                if (rc == -EINVAL) {
                        /*
                         * Retain string representation for later mapping.
                         *
                         * IMPORTANT: We need to copy the contents of oldc->str
                         * back into s again because string_to_context_struct()
                         * may have garbled it.
                         */
                        memcpy(s, oldc->str, oldc->len);
                        context_init(newc);
                        /* Ownership of s moves to newc; it is not freed here. */
                        newc->str = s;
                        newc->len = oldc->len;
                        return 0;
                }
                /* The copy is not needed on any of the remaining paths. */
                kfree(s);
                if (rc) {
                        /* Other error condition, e.g. ENOMEM. */
                        pr_err("SELinux:   Unable to map context %s, rc = %d.\n",
                               oldc->str, -rc);
                        return rc;
                }
                pr_info("SELinux:  Context %s became valid (mapped).\n",
                        oldc->str);
                return 0;
        }

        context_init(newc);

        /*
         * User, role and type are translated by name: the old value is
         * turned into its symbol name under the old policy, and that name
         * is looked up in the new policy's symbol table.
         */

        /* Convert the user. */
        usrdatum = symtab_search(&args->newp->p_users,
                                 sym_name(args->oldp, SYM_USERS, oldc->user - 1));
        if (!usrdatum)
                goto bad;
        newc->user = usrdatum->value;

        /* Convert the role. */
        role = symtab_search(&args->newp->p_roles,
                             sym_name(args->oldp, SYM_ROLES, oldc->role - 1));
        if (!role)
                goto bad;
        newc->role = role->value;

        /* Convert the type. */
        typdatum = symtab_search(&args->newp->p_types,
                                 sym_name(args->oldp, SYM_TYPES, oldc->type - 1));
        if (!typdatum)
                goto bad;
        newc->type = typdatum->value;

        /*
         * Convert the MLS fields if dealing with MLS policies.
         * When the new policy has MLS disabled, neither branch below runs
         * and no MLS fields are copied into newc.
         */
        if (args->oldp->mls_enabled && args->newp->mls_enabled) {
                rc = mls_convert_context(args->oldp, args->newp, oldc, newc);
                if (rc)
                        goto bad;
        } else if (!args->oldp->mls_enabled && args->newp->mls_enabled) {
                /*
                 * Switching between non-MLS and MLS policy:
                 * ensure that the MLS fields of the context for all
                 * existing entries in the sidtab are filled in with a
                 * suitable default value, likely taken from one of the
                 * initial SIDs.
                 */
                /* Find the SECINITSID_UNLABELED entry of the new policy. */
                oc = args->newp->ocontexts[OCON_ISID];
                while (oc && oc->sid[0] != SECINITSID_UNLABELED)
                        oc = oc->next;
                if (!oc) {
                        pr_err("SELinux:  unable to look up"
                                " the initial SIDs list\n");
                        goto bad;
                }
                /* Use that entry's MLS range as the default. */
                rc = mls_range_set(newc, &oc->context[0].range);
                if (rc)
                        goto bad;
        }

        /* Check the validity of the new context. */
        if (!policydb_context_isvalid(args->newp, newc)) {
                /*
                 * The handler returns 0 when the invalid context is to be
                 * tolerated; note it is given the OLD policy and context.
                 */
                rc = convert_context_handle_invalid_context(args->oldp, oldc);
                if (rc)
                        goto bad;
        }

        return 0;
bad:
        /* Map old representation to string and save it. */
        rc = context_struct_to_string(args->oldp, oldc, &s, &len);
        if (rc)
                return rc;
        /* Drop whatever was converted so far; keep only the string form. */
        context_destroy(newc);
        newc->str = s;
        newc->len = len;
        pr_info("SELinux:  Context %s became invalid (unmapped).\n",
                newc->str);
        return 0;
}

/*
 * Copy the policy capability bits of @policy into selinux_state.policycap[]
 * and log them: one line for every name in selinux_policycap_names[], plus
 * one line for every bit set in the policy that has no name in that table.
 */
static void security_load_policycaps(struct selinux_policy *policy)
{
        struct policydb *db = &policy->policydb;
        struct ebitmap_node *pos;
        unsigned int cap;

        /* Publish each capability bit to the global state. */
        for (cap = 0; cap < ARRAY_SIZE(selinux_state.policycap); cap++) {
                WRITE_ONCE(selinux_state.policycap[cap],
                           ebitmap_get_bit(&db->policycaps, cap));
        }

        /* Report the value of every capability this kernel has a name for. */
        for (cap = 0; cap < ARRAY_SIZE(selinux_policycap_names); cap++) {
                pr_info("SELinux:  policy capability %s=%d\n",
                        selinux_policycap_names[cap],
                        ebitmap_get_bit(&db->policycaps, cap));
        }

        /* Report bits set in the policy that lie beyond the name table. */
        ebitmap_for_each_positive_bit(&db->policycaps, pos, cap) {
                if (cap < ARRAY_SIZE(selinux_policycap_names))
                        continue;
                pr_info("SELinux:  unknown policy capability %u\n",
                        cap);
        }
}

static int security_preserve_bools(struct selinux_policy *oldpolicy,
                                struct selinux_policy *newpolicy);

/*
 * Tear down a complete policy: the sidtab contents, the class mapping, the
 * policydb, the sidtab allocation and finally the policy structure itself.
 * A NULL @policy is accepted and ignored.
 */
static void selinux_policy_free(struct selinux_policy *policy)
{
        struct sidtab *sidtab;

        if (policy) {
                sidtab = policy->sidtab;

                sidtab_destroy(sidtab);
                kfree(policy->map.mapping);
                policydb_destroy(&policy->policydb);
                kfree(sidtab);
                kfree(policy);
        }
}

/*
 * Release a policy structure whose policydb is torn down with
 * cond_policydb_destroy_dup().  Unlike selinux_policy_free(), the sidtab
 * and the class mapping are left untouched; only the policy structure
 * itself is freed afterwards.
 * NOTE(review): presumably meant for copies made with cond_policydb_dup() -
 * confirm against the callers.
 */
static void selinux_policy_cond_free(struct selinux_policy *policy)
{
        struct policydb *dup_db = &policy->policydb;

        cond_policydb_destroy_dup(dup_db);
        kfree(policy);
}

void selinux_policy_cancel(struct selinux_load_state *load_state)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *oldpolicy;

        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        sidtab_cancel_convert(oldpolicy->sidtab);
        selinux_policy_free(load_state->policy);
        kfree(load_state->convert_data);
}

/*
 * Propagate a policy change, identified by the sequence number @seqno, to
 * the subsystems called below.  The calls are issued in the order listed;
 * only the first three receive @seqno.
 */
static void selinux_notify_policy_change(u32 seqno)
{
        /* Flush external caches and notify userspace of policy load */
        avc_ss_reset(seqno);
        selnl_notify_policyload(seqno);
        selinux_status_update_policyload(seqno);
        selinux_netlbl_cache_invalidate();
        selinux_xfrm_notify_policyload();
        /*
         * NOTE(review): the "_locked" suffix suggests the caller is expected
         * to hold a lock (presumably policy_mutex) - confirm.
         */
        selinux_ima_measure_state_locked();
}

/*
 * selinux_policy_commit - Install the policy prepared in @load_state.
 * @load_state: load state whose ->policy becomes the active policy and
 *              whose ->convert_data is freed here
 *
 * Publishes @load_state->policy through selinux_state.policy, refreshes the
 * cached policy capabilities, marks SELinux initialized on the first load,
 * waits for an RCU grace period, frees the previous policy (if any) and
 * finally notifies interested subsystems of the change.
 *
 * The current policy pointer is read with rcu_dereference_protected() under
 * the policy_mutex lockdep condition.
 */
void selinux_policy_commit(struct selinux_load_state *load_state)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *oldpolicy, *newpolicy = load_state->policy;
        unsigned long flags;
        u32 seqno;

        /* NULL here means no policy has been installed yet. */
        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* If switching between different policy types, log MLS status */
        if (oldpolicy) {
                if (oldpolicy->policydb.mls_enabled && !newpolicy->policydb.mls_enabled)
                        pr_info("SELinux: Disabling MLS support...\n");
                else if (!oldpolicy->policydb.mls_enabled && newpolicy->policydb.mls_enabled)
                        pr_info("SELinux: Enabling MLS support...\n");
        }

        /* Set latest granting seqno for new policy. */
        if (oldpolicy)
                newpolicy->latest_granting = oldpolicy->latest_granting + 1;
        else
                newpolicy->latest_granting = 1;
        /* Remember it locally for the notification at the end. */
        seqno = newpolicy->latest_granting;

        /* Install the new policy. */
        if (oldpolicy) {
                /*
                 * The pointer switch is bracketed by a freeze of the old
                 * sidtab.
                 */
                sidtab_freeze_begin(oldpolicy->sidtab, &flags);
                rcu_assign_pointer(state->policy, newpolicy);
                sidtab_freeze_end(oldpolicy->sidtab, &flags);
        } else {
                rcu_assign_pointer(state->policy, newpolicy);
        }

        /* Load the policycaps from the new policy */
        security_load_policycaps(newpolicy);

        if (!selinux_initialized()) {
                /*
                 * After first policy load, the security server is
                 * marked as initialized and ready to handle requests and
                 * any objects created prior to policy load are then labeled.
                 */
                selinux_mark_initialized();
                selinux_complete_init();
        }

        /* Free the old policy */
        /* Wait for RCU readers that may still be using the old policy. */
        synchronize_rcu();
        /* selinux_policy_free() accepts NULL, covering the first load. */
        selinux_policy_free(oldpolicy);
        kfree(load_state->convert_data);

        /* Notify others of the policy change */
        selinux_notify_policy_change(seqno);
}

/**
 * security_load_policy - Load a security policy configuration.
 * @data: binary policy data
 * @len: length of data in bytes
 * @load_state: policy load state
 *
 * Load a new set of security policy configuration data,
 * validate it and convert the SID table as necessary.
 *
 * On success @load_state->policy points to the newly built policy and
 * @load_state->convert_data to the SID table conversion data (NULL on the
 * first policy load).  The new policy is NOT installed by this function.
 *
 * NOTE(review): earlier wording said this function flushes the access
 * vector cache; no such call is made in this body.  avc_ss_reset() is
 * called from selinux_notify_policy_change() - confirm the intended
 * caller sequence.
 *
 * Return: 0 on success, -ENOMEM on allocation failure, or the error
 * returned by policydb_read(), selinux_set_mapping(), policydb_load_isids(),
 * security_preserve_bools() or sidtab_convert().  On failure everything
 * allocated here is released and @load_state is left unmodified.
 */
int security_load_policy(void *data, size_t len,
                         struct selinux_load_state *load_state)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *newpolicy, *oldpolicy;
        struct selinux_policy_convert_data *convert_data;
        int rc = 0;
        struct policy_file file = { data, len }, *fp = &file;

        newpolicy = kzalloc(sizeof(*newpolicy), GFP_KERNEL);
        if (!newpolicy)
                return -ENOMEM;

        newpolicy->sidtab = kzalloc(sizeof(*newpolicy->sidtab), GFP_KERNEL);
        if (!newpolicy->sidtab) {
                rc = -ENOMEM;
                goto err_policy;
        }

        /* Parse the binary policy image into the new policydb. */
        rc = policydb_read(&newpolicy->policydb, fp);
        if (rc)
                goto err_sidtab;

        newpolicy->policydb.len = len;
        /* Build the class mapping for the new policy from secclass_map. */
        rc = selinux_set_mapping(&newpolicy->policydb, secclass_map,
                                &newpolicy->map);
        if (rc)
                goto err_policydb;

        rc = policydb_load_isids(&newpolicy->policydb, newpolicy->sidtab);
        if (rc) {
                pr_err("SELinux:  unable to load the initial SIDs\n");
                goto err_mapping;
        }

        if (!selinux_initialized()) {
                /* First policy load, so no need to preserve state from old policy */
                load_state->policy = newpolicy;
                load_state->convert_data = NULL;
                return 0;
        }

        /* Read the installed policy under the policy_mutex lockdep condition. */
        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* Preserve active boolean values from the old policy */
        rc = security_preserve_bools(oldpolicy, newpolicy);
        if (rc) {
                pr_err("SELinux:  unable to preserve booleans\n");
                goto err_free_isids;
        }

        /*
         * Convert the internal representations of contexts
         * in the new SID table.
         */

        convert_data = kmalloc(sizeof(*convert_data), GFP_KERNEL);
        if (!convert_data) {
                rc = -ENOMEM;
                goto err_free_isids;
        }

        /* Conversion goes from the installed policydb to the new one. */
        convert_data->args.oldp = &oldpolicy->policydb;
        convert_data->args.newp = &newpolicy->policydb;

        /* The new policy's sidtab is the conversion target. */
        convert_data->sidtab_params.args = &convert_data->args;
        convert_data->sidtab_params.target = newpolicy->sidtab;

        rc = sidtab_convert(oldpolicy->sidtab, &convert_data->sidtab_params);
        if (rc) {
                pr_err("SELinux:  unable to convert the internal"
                        " representation of contexts in the new SID"
                        " table\n");
                goto err_free_convert_data;
        }

        /* Hand both objects over to the caller via load_state. */
        load_state->policy = newpolicy;
        load_state->convert_data = convert_data;
        return 0;

        /*
         * Error unwinding: each label releases one resource and falls
         * through to the labels for everything acquired before it.
         */
err_free_convert_data:
        kfree(convert_data);
err_free_isids:
        sidtab_destroy(newpolicy->sidtab);
err_mapping:
        kfree(newpolicy->map.mapping);
err_policydb:
        policydb_destroy(&newpolicy->policydb);
err_sidtab:
        kfree(newpolicy->sidtab);
err_policy:
        kfree(newpolicy);

        return rc;
}

/**
 * ocontext_to_sid - Helper to safely get sid for an ocontext
 * @sidtab: SID table
 * @c: ocontext structure
 * @index: index of the context entry (0 or 1)
 * @out_sid: pointer to the resulting SID value
 *
 * For all ocontexts except OCON_ISID the SID fields are populated
 * on-demand when needed. Since updating the SID value is an SMP-sensitive
 * operation, this helper must be used to do that safely.
 *
 * A non-zero value already stored in @c->sid[@index] is returned as is.
 * Otherwise the SID is obtained from sidtab_context_to_sid() and stored in
 * @c->sid[@index] before being returned.  On error @out_sid is not written.
 *
 * WARNING: This function may return -ESTALE, indicating that the caller
 * must retry the operation after re-acquiring the policy pointer!
 */
static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
                           size_t index, u32 *out_sid)
{
        u32 value;
        int rc;

        /*
         * Acquire-load pairs with the release-store below, so the sidtab
         * entry behind a published SID is visible to this thread.
         */
        value = smp_load_acquire(&c->sid[index]);
        if (value) {
                *out_sid = value;
                return 0;
        }

        /* Not populated yet: look the context up in the SID table. */
        rc = sidtab_context_to_sid(sidtab, &c->context[index], &value);
        if (rc)
                return rc;

        /*
         * Release-store so that other threads observing the SID also
         * observe the new sidtab entry.
         */
        smp_store_release(&c->sid[index], value);
        *out_sid = value;

        return 0;
}

/**
 * security_port_sid - Obtain the SID for a port.
 * @protocol: protocol number
 * @port: port number
 * @out_sid: security identifier
 *
 * Walks the OCON_PORT entries of the active policy and uses the first one
 * whose protocol equals @protocol and whose [low_port, high_port] range
 * contains @port.  *@out_sid is set to SECINITSID_PORT when no entry
 * matches or when SELinux is not initialized.
 *
 * Return: 0 on success, otherwise the error from ocontext_to_sid().
 */
int security_port_sid(u8 protocol, u16 port, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                *out_sid = SECINITSID_PORT;
                return 0;
        }

        do {
                rc = 0;
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                policydb = &policy->policydb;
                sidtab = policy->sidtab;

                for (entry = policydb->ocontexts[OCON_PORT]; entry;
                     entry = entry->next) {
                        if (entry->u.port.protocol != protocol)
                                continue;
                        if (port < entry->u.port.low_port)
                                continue;
                        if (port > entry->u.port.high_port)
                                continue;
                        break;
                }

                if (entry)
                        rc = ocontext_to_sid(sidtab, entry, 0, out_sid);
                else
                        *out_sid = SECINITSID_PORT;

                rcu_read_unlock();
                /* -ESTALE: redo the lookup with a fresh policy pointer. */
        } while (rc == -ESTALE);

        return rc;
}

/**
 * security_ib_pkey_sid - Obtain the SID for a pkey.
 * @subnet_prefix: Subnet Prefix
 * @pkey_num: pkey number
 * @out_sid: security identifier
 *
 * Walks the OCON_IBPKEY entries of the active policy and uses the first one
 * whose subnet prefix equals @subnet_prefix and whose [low_pkey, high_pkey]
 * range contains @pkey_num.  *@out_sid is set to SECINITSID_UNLABELED when
 * no entry matches or when SELinux is not initialized.
 *
 * Return: 0 on success, otherwise the error from ocontext_to_sid().
 */
int security_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                *out_sid = SECINITSID_UNLABELED;
                return 0;
        }

        do {
                rc = 0;
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                policydb = &policy->policydb;
                sidtab = policy->sidtab;

                for (entry = policydb->ocontexts[OCON_IBPKEY]; entry;
                     entry = entry->next) {
                        if (pkey_num < entry->u.ibpkey.low_pkey)
                                continue;
                        if (pkey_num > entry->u.ibpkey.high_pkey)
                                continue;
                        if (entry->u.ibpkey.subnet_prefix == subnet_prefix)
                                break;
                }

                if (entry)
                        rc = ocontext_to_sid(sidtab, entry, 0, out_sid);
                else
                        *out_sid = SECINITSID_UNLABELED;

                rcu_read_unlock();
                /* -ESTALE: redo the lookup with a fresh policy pointer. */
        } while (rc == -ESTALE);

        return rc;
}

/**
 * security_ib_endport_sid - Obtain the SID for a subnet management interface.
 * @dev_name: device name
 * @port_num: port number
 * @out_sid: security identifier
 *
 * Walks the OCON_IBENDPORT entries of the active policy and uses the first
 * one whose port equals @port_num and whose device name matches @dev_name
 * (compared over at most IB_DEVICE_NAME_MAX characters).  *@out_sid is set
 * to SECINITSID_UNLABELED when no entry matches or when SELinux is not
 * initialized.
 *
 * Return: 0 on success, otherwise the error from ocontext_to_sid().
 */
int security_ib_endport_sid(const char *dev_name, u8 port_num, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                *out_sid = SECINITSID_UNLABELED;
                return 0;
        }

        do {
                rc = 0;
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                policydb = &policy->policydb;
                sidtab = policy->sidtab;

                for (entry = policydb->ocontexts[OCON_IBENDPORT]; entry;
                     entry = entry->next) {
                        if (entry->u.ibendport.port != port_num)
                                continue;
                        if (strncmp(entry->u.ibendport.dev_name, dev_name,
                                    IB_DEVICE_NAME_MAX) == 0)
                                break;
                }

                if (entry)
                        rc = ocontext_to_sid(sidtab, entry, 0, out_sid);
                else
                        *out_sid = SECINITSID_UNLABELED;

                rcu_read_unlock();
                /* -ESTALE: redo the lookup with a fresh policy pointer. */
        } while (rc == -ESTALE);

        return rc;
}

/**
 * security_netif_sid - Obtain the SID for a network interface.
 * @name: interface name
 * @if_sid: interface SID
 *
 * Walks the OCON_NETIF entries of the active policy and uses the first one
 * matching @name.  When the policy sets POLICYDB_CAP_NETIF_WILDCARD the
 * entry name is matched with match_wildcard(); otherwise an exact strcmp()
 * is required.  *@if_sid is set to SECINITSID_NETIF when no entry matches
 * or when SELinux is not initialized.
 *
 * Return: 0 on success, otherwise the error from ocontext_to_sid().
 */
int security_netif_sid(const char *name, u32 *if_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *entry;
        bool wildcard_support;
        int rc;

        if (!selinux_initialized()) {
                *if_sid = SECINITSID_NETIF;
                return 0;
        }

        do {
                rc = 0;
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                policydb = &policy->policydb;
                sidtab = policy->sidtab;
                wildcard_support = ebitmap_get_bit(&policydb->policycaps,
                                                   POLICYDB_CAP_NETIF_WILDCARD);

                for (entry = policydb->ocontexts[OCON_NETIF]; entry;
                     entry = entry->next) {
                        if (wildcard_support) {
                                if (match_wildcard(entry->u.name, name))
                                        break;
                                continue;
                        }
                        if (!strcmp(entry->u.name, name))
                                break;
                }

                if (entry)
                        rc = ocontext_to_sid(sidtab, entry, 0, if_sid);
                else
                        *if_sid = SECINITSID_NETIF;

                rcu_read_unlock();
                /* -ESTALE: redo the lookup with a fresh policy pointer. */
        } while (rc == -ESTALE);

        return rc;
}

/*
 * Return true when @input, masked word by word with @mask, equals @addr in
 * all four 32-bit words; false as soon as one word differs.
 */
static bool match_ipv6_addrmask(const u32 input[4], const u32 addr[4], const u32 mask[4])
{
        int word = 0;

        while (word < 4) {
                if ((input[word] & mask[word]) != addr[word])
                        return false;
                word++;
        }

        return true;
}

/**
 * security_node_sid - Obtain the SID for a node (host).
 * @domain: communication domain aka address family
 * @addrp: address
 * @addrlen: address length in bytes
 * @out_sid: security identifier
 *
 * For AF_INET the OCON_NODE entries are searched for the first one whose
 * address equals the masked input address; for AF_INET6 the OCON_NODE6
 * entries are searched with match_ipv6_addrmask().  Any other @domain, a
 * lookup without a match, and an uninitialized SELinux all yield
 * SECINITSID_NODE.
 *
 * Return: 0 on success, -EINVAL if @addrlen does not fit @domain, or the
 * error from ocontext_to_sid().
 */
int security_node_sid(u16 domain,
                      const void *addrp,
                      u32 addrlen,
                      u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                *out_sid = SECINITSID_NODE;
                return 0;
        }

retry:
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (domain == AF_INET) {
                u32 addr;

                if (addrlen != sizeof(u32)) {
                        rc = -EINVAL;
                        goto out;
                }

                addr = *((const u32 *)addrp);

                for (entry = policydb->ocontexts[OCON_NODE]; entry;
                     entry = entry->next) {
                        if ((addr & entry->u.node.mask) == entry->u.node.addr)
                                break;
                }
        } else if (domain == AF_INET6) {
                if (addrlen != sizeof(u64) * 2) {
                        rc = -EINVAL;
                        goto out;
                }

                for (entry = policydb->ocontexts[OCON_NODE6]; entry;
                     entry = entry->next) {
                        if (match_ipv6_addrmask(addrp, entry->u.node6.addr,
                                                entry->u.node6.mask))
                                break;
                }
        } else {
                /* Unknown address family: handled like "no match". */
                entry = NULL;
        }

        if (!entry) {
                *out_sid = SECINITSID_NODE;
                rc = 0;
                goto out;
        }

        rc = ocontext_to_sid(sidtab, entry, 0, out_sid);
        if (rc == -ESTALE) {
                /* Redo the lookup with a fresh policy pointer. */
                rcu_read_unlock();
                goto retry;
        }

out:
        rcu_read_unlock();
        return rc;
}

#define SIDS_NEL 25

/**
 * security_get_user_sids - Obtain reachable SIDs for a user.
 * @fromsid: starting SID
 * @username: username
 * @sids: array of reachable SIDs for user
 * @nel: number of elements in @sids
 *
 * Generate the set of SIDs for legal security contexts
 * for a given user that can be reached by @fromsid.
 * Set *@sids to point to a dynamically allocated
 * array containing the set of SIDs.  Set *@nel to the
 * number of elements in the array.
 *
 * The work is done in two phases: candidate SIDs are first generated from
 * every (role, type) pair of the user, then only those for which
 * avc_has_perm_noaudit() allows PROCESS__TRANSITION from @fromsid are kept.
 *
 * Return: 0 on success, -EINVAL if @fromsid or @username is not found,
 * -ENOMEM on allocation failure, or the error from sidtab_context_to_sid().
 * When SELinux is not initialized, or no candidate SID was generated, 0 is
 * returned with *@sids == NULL and *@nel == 0.
 */

int security_get_user_sids(u32 fromsid,
                           const char *username,
                           u32 **sids,
                           u32 *nel)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *fromcon, usercon;
        u32 *mysids = NULL, *mysids2, sid;
        u32 i, j, mynel, maxnel = SIDS_NEL;
        struct user_datum *user;
        struct role_datum *role;
        struct ebitmap_node *rnode, *tnode;
        int rc;

        /* Outputs are well-defined on every return path. */
        *sids = NULL;
        *nel = 0;

        if (!selinux_initialized())
                return 0;

        /* Candidate buffer; starts at SIDS_NEL entries and grows on demand. */
        mysids = kcalloc(maxnel, sizeof(*mysids), GFP_KERNEL);
        if (!mysids)
                return -ENOMEM;

retry:
        /* A retry discards previous candidates but reuses the buffer. */
        mynel = 0;
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        context_init(&usercon);

        rc = -EINVAL;
        fromcon = sidtab_search(sidtab, fromsid);
        if (!fromcon)
                goto out_unlock;

        rc = -EINVAL;
        user = symtab_search(&policydb->p_users, username);
        if (!user)
                goto out_unlock;

        usercon.user = user->value;

        /* Enumerate every role of the user and every type of each role. */
        ebitmap_for_each_positive_bit(&user->roles, rnode, i) {
                role = policydb->role_val_to_struct[i];
                /* Bit positions are 0-based; stored values are bit + 1. */
                usercon.role = i + 1;
                ebitmap_for_each_positive_bit(&role->types, tnode, j) {
                        usercon.type = j + 1;

                        /* Skip combinations for which no range can be set up. */
                        if (mls_setup_user_range(policydb, fromcon, user,
                                                 &usercon))
                                continue;

                        rc = sidtab_context_to_sid(sidtab, &usercon, &sid);
                        if (rc == -ESTALE) {
                                /* Start over with a fresh policy pointer. */
                                rcu_read_unlock();
                                goto retry;
                        }
                        if (rc)
                                goto out_unlock;
                        if (mynel < maxnel) {
                                mysids[mynel++] = sid;
                        } else {
                                /*
                                 * Buffer full: grow it by SIDS_NEL entries.
                                 * GFP_ATOMIC is used here, inside the RCU
                                 * read-side section.
                                 */
                                rc = -ENOMEM;
                                maxnel += SIDS_NEL;
                                mysids2 = kcalloc(maxnel, sizeof(*mysids2), GFP_ATOMIC);
                                if (!mysids2)
                                        goto out_unlock;
                                memcpy(mysids2, mysids, mynel * sizeof(*mysids2));
                                kfree(mysids);
                                mysids = mysids2;
                                mysids[mynel++] = sid;
                        }
                }
        }
        rc = 0;
out_unlock:
        rcu_read_unlock();
        /* Error, or nothing generated: outputs stay NULL / 0. */
        if (rc || !mynel) {
                kfree(mysids);
                return rc;
        }

        /* Second phase: keep only candidates @fromsid may transition to. */
        rc = -ENOMEM;
        mysids2 = kcalloc(mynel, sizeof(*mysids2), GFP_KERNEL);
        if (!mysids2) {
                kfree(mysids);
                return rc;
        }
        for (i = 0, j = 0; i < mynel; i++) {
                struct av_decision dummy_avd;
                rc = avc_has_perm_noaudit(fromsid, mysids[i],
                                          SECCLASS_PROCESS, /* kernel value */
                                          PROCESS__TRANSITION, AVC_STRICT,
                                          &dummy_avd);
                /* A non-zero result only excludes the SID; it is not returned. */
                if (!rc)
                        mysids2[j++] = mysids[i];
                cond_resched();
        }
        kfree(mysids);
        /* *nel may be 0 here even though *sids is a non-NULL allocation. */
        *sids = mysids2;
        *nel = j;
        return 0;
}

/**
 * __security_genfs_sid - Helper to obtain a SID for a file in a filesystem
 * @policy: policy
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Obtain a SID to use for a file in a filesystem that
 * cannot support xattr or use a fixed labeling behavior like
 * transition SIDs or task SIDs.
 *
 * *@sid is preset to SECINITSID_UNLABELED before any lookup.  The genfs
 * list is searched for @fstype, then that filesystem's entries are searched
 * for the first one whose class is unset or equal to the unmapped
 * @orig_sclass and whose name matches @path: via match_wildcard() when the
 * policy sets POLICYDB_CAP_GENFS_SECLABEL_WILDCARD, otherwise as a prefix
 * of @path.
 *
 * Return: 0 on success, -ENOENT if no filesystem or path entry matches, or
 * the error from ocontext_to_sid().
 *
 * WARNING: This function may return -ESTALE, indicating that the caller
 * must retry the operation after re-acquiring the policy pointer!
 */
static inline int __security_genfs_sid(struct selinux_policy *policy,
                                       const char *fstype,
                                       const char *path,
                                       u16 orig_sclass,
                                       u32 *sid)
{
        struct policydb *policydb = &policy->policydb;
        struct sidtab *sidtab = policy->sidtab;
        struct genfs *fs;
        struct ocontext *entry;
        bool wildcard;
        u16 sclass;

        /* Collapse a run of leading slashes down to a single one. */
        while (path[0] == '/' && path[1] == '/')
                path++;

        sclass = unmap_class(&policy->map, orig_sclass);
        *sid = SECINITSID_UNLABELED;

        /*
         * The walk gives up at the first entry that compares greater than
         * @fstype (presumably the list is kept sorted by fstype).
         */
        for (fs = policydb->genfs; fs; fs = fs->next) {
                int order = strcmp(fstype, fs->fstype);

                if (order < 0)
                        return -ENOENT;
                if (order == 0)
                        break;
        }
        if (!fs)
                return -ENOENT;

        wildcard = ebitmap_get_bit(&policydb->policycaps,
                                   POLICYDB_CAP_GENFS_SECLABEL_WILDCARD);

        for (entry = fs->head; entry; entry = entry->next) {
                /* An entry with a class set applies to that class only. */
                if (entry->v.sclass && entry->v.sclass != sclass)
                        continue;

                if (wildcard) {
                        if (match_wildcard(entry->u.name, path))
                                break;
                } else {
                        size_t prefix_len = strlen(entry->u.name);

                        if (strncmp(entry->u.name, path, prefix_len) == 0)
                                break;
                }
        }
        if (!entry)
                return -ENOENT;

        return ocontext_to_sid(sidtab, entry, 0, sid);
}

/**
 * security_genfs_sid - Obtain a SID for a file in a filesystem
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Dereference the active policy inside an RCU read-side critical section
 * and call __security_genfs_sid() on it, repeating the whole sequence for
 * as long as the helper reports -ESTALE.  When SELinux is not initialized,
 * *@sid is set to SECINITSID_UNLABELED and 0 is returned.
 *
 * Return: the final result of __security_genfs_sid().
 */
int security_genfs_sid(const char *fstype,
                       const char *path,
                       u16 orig_sclass,
                       u32 *sid)
{
        struct selinux_policy *policy;
        int rc;

        if (!selinux_initialized()) {
                *sid = SECINITSID_UNLABELED;
                return 0;
        }

        for (;;) {
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                rc = __security_genfs_sid(policy, fstype, path,
                                          orig_sclass, sid);
                rcu_read_unlock();

                if (rc != -ESTALE)
                        return rc;
        }
}

/*
 * Look up the genfs SID for @path on @fstype in an explicitly supplied
 * @policy by calling __security_genfs_sid() directly, without taking any
 * lock.  Returns whatever __security_genfs_sid() returns.
 */
int selinux_policy_genfs_sid(struct selinux_policy *policy,
                        const char *fstype,
                        const char *path,
                        u16 orig_sclass,
                        u32 *sid)
{
        int rc;

        /* no lock required, policy is not yet accessible by other threads */
        rc = __security_genfs_sid(policy, fstype, path, orig_sclass, sid);

        return rc;
}

/**
 * security_fs_use - Determine how to handle labeling for a filesystem.
 * @sb: superblock in question
 *
 * Fills in the behavior and sid fields of the superblock security struct.
 * If the active policy has an OCON_FSUSE entry named after the filesystem
 * type, its behavior and SID are used.  Otherwise a genfs lookup of "/"
 * with class SECCLASS_DIR decides: success selects SECURITY_FS_USE_GENFS,
 * any failure other than -ESTALE selects SECURITY_FS_USE_NONE and is not
 * reported to the caller.  When SELinux is not initialized the result is
 * SECURITY_FS_USE_NONE with SECINITSID_UNLABELED.
 *
 * Return: 0 on success, otherwise the error from ocontext_to_sid().
 */
int security_fs_use(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        const char *fstype = sb->s_type->name;
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                sbsec->behavior = SECURITY_FS_USE_NONE;
                sbsec->sid = SECINITSID_UNLABELED;
                return 0;
        }

        do {
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                policydb = &policy->policydb;
                sidtab = policy->sidtab;

                for (entry = policydb->ocontexts[OCON_FSUSE]; entry;
                     entry = entry->next) {
                        if (!strcmp(fstype, entry->u.name))
                                break;
                }

                if (entry) {
                        sbsec->behavior = entry->v.behavior;
                        rc = ocontext_to_sid(sidtab, entry, 0, &sbsec->sid);
                } else {
                        rc = __security_genfs_sid(policy, fstype, "/",
                                                  SECCLASS_DIR, &sbsec->sid);
                        if (rc == 0) {
                                sbsec->behavior = SECURITY_FS_USE_GENFS;
                        } else if (rc != -ESTALE) {
                                /* Lookup failure is not an error here. */
                                sbsec->behavior = SECURITY_FS_USE_NONE;
                                rc = 0;
                        }
                }

                rcu_read_unlock();
                /* -ESTALE: redo the lookup with a fresh policy pointer. */
        } while (rc == -ESTALE);

        return rc;
}

/*
 * security_get_bools - Snapshot the booleans of @policy.
 * @policy: policy to read the booleans from
 * @len: set to the number of booleans (p_bools.nprim)
 * @names: set to a newly allocated array of newly allocated name strings
 * @values: set to a newly allocated array holding each boolean's state
 *
 * When the policy has no booleans, 0 is returned with *@len == 0 and both
 * arrays NULL.  On allocation failure everything allocated so far is freed,
 * *@len is reset to 0, both arrays are reset to NULL and -ENOMEM is
 * returned.
 * NOTE(review): on success the caller presumably owns and frees the arrays
 * and the name strings - confirm against the callers.
 */
int security_get_bools(struct selinux_policy *policy,
                       u32 *len, char ***names, int **values)
{
        struct policydb *policydb = &policy->policydb;
        u32 idx;

        *names = NULL;
        *values = NULL;

        *len = policydb->p_bools.nprim;
        if (*len == 0)
                return 0;

        *names = kcalloc(*len, sizeof(char *), GFP_ATOMIC);
        if (*names == NULL)
                goto err_nomem;

        *values = kcalloc(*len, sizeof(int), GFP_ATOMIC);
        if (*values == NULL)
                goto err_nomem;

        for (idx = 0; idx < *len; idx++) {
                (*values)[idx] = policydb->bool_val_to_struct[idx]->state;

                (*names)[idx] = kstrdup(sym_name(policydb, SYM_BOOLS, idx),
                                        GFP_ATOMIC);
                if ((*names)[idx] == NULL)
                        goto err_nomem;
        }

        return 0;

err_nomem:
        /* Slots not yet filled are NULL (kcalloc), so kfree() is safe. */
        if (*names != NULL) {
                for (idx = 0; idx < *len; idx++)
                        kfree((*names)[idx]);
                kfree(*names);
        }
        kfree(*values);

        *len = 0;
        *names = NULL;
        *values = NULL;

        return -ENOMEM;
}


/*
 * security_set_bools - install a copy of the active policy with new boolean
 * states.
 * @len: number of entries in @values; must equal the policy's boolean count
 * @values: requested states, one per boolean (any non-zero value means 1)
 *
 * The active policy is read with rcu_dereference_protected() against
 * state->policy_mutex, so the caller is expected to hold that mutex.
 *
 * Returns 0 on success, -EINVAL if SELinux is not initialized or @len does
 * not match the policy, and -ENOMEM if the policy copy cannot be allocated.
 */
int security_set_bools(u32 len, const int *values)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *newpolicy, *oldpolicy;
        int rc;
        u32 i, seqno = 0;

        if (!selinux_initialized())
                return -EINVAL;

        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* Consistency check on number of booleans, should never fail */
        if (WARN_ON(len != oldpolicy->policydb.p_bools.nprim))
                return -EINVAL;

        /* Shallow copy first; the conditional parts are deep-copied below. */
        newpolicy = kmemdup(oldpolicy, sizeof(*newpolicy), GFP_KERNEL);
        if (!newpolicy)
                return -ENOMEM;

        /*
         * Deep copy only the parts of the policydb that might be
         * modified as a result of changing booleans.
         */
        rc = cond_policydb_dup(&newpolicy->policydb, &oldpolicy->policydb);
        if (rc) {
                kfree(newpolicy);
                /* Any failure of the duplication is reported as -ENOMEM. */
                return -ENOMEM;
        }

        /* Update the boolean states in the copy */
        for (i = 0; i < len; i++) {
                /* Normalize the requested value to 0 or 1. */
                int new_state = !!values[i];
                int old_state = newpolicy->policydb.bool_val_to_struct[i]->state;

                /* Only booleans that actually change are audited and written. */
                if (new_state != old_state) {
                        audit_log(audit_context(), GFP_ATOMIC,
                                AUDIT_MAC_CONFIG_CHANGE,
                                "bool=%s val=%d old_val=%d auid=%u ses=%u",
                                sym_name(&newpolicy->policydb, SYM_BOOLS, i),
                                new_state,
                                old_state,
                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
                                audit_get_sessionid(current));
                        newpolicy->policydb.bool_val_to_struct[i]->state = new_state;
                }
        }

        /* Re-evaluate the conditional rules in the copy */
        evaluate_cond_nodes(&newpolicy->policydb);

        /* Set latest granting seqno for new policy */
        newpolicy->latest_granting = oldpolicy->latest_granting + 1;
        seqno = newpolicy->latest_granting;

        /* Install the new policy */
        rcu_assign_pointer(state->policy, newpolicy);

        /*
         * Free the conditional portions of the old policydb
         * that were copied for the new policy, and the oldpolicy
         * structure itself but not what it references.
         */
        /* Wait for RCU readers that may still hold oldpolicy before freeing. */
        synchronize_rcu();
        selinux_policy_cond_free(oldpolicy);

        /* Notify others of the policy change */
        selinux_notify_policy_change(seqno);
        return 0;
}

/*
 * security_get_bool_value - read the current state of one policy boolean.
 * @index: zero-based index of the boolean
 *
 * Returns the boolean's state, 0 if SELinux is not initialized, or -EFAULT
 * if @index is not below the number of booleans in the active policy.
 */
int security_get_bool_value(u32 index)
{
        struct selinux_policy *pol;
        struct policydb *db;
        int state;

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();
        pol = rcu_dereference(selinux_state.policy);
        db = &pol->policydb;

        if (index < db->p_bools.nprim)
                state = db->bool_val_to_struct[index]->state;
        else
                state = -EFAULT;
        rcu_read_unlock();

        return state;
}

/*
 * Carry boolean states over from @oldpolicy to @newpolicy.
 *
 * Every boolean of the old policy is looked up by name in the new policy;
 * where a boolean of that name exists its state is overwritten with the old
 * value.  The conditional nodes of the new policy are then re-evaluated.
 * Returns 0 on success or the error from security_get_bools().
 */
static int security_preserve_bools(struct selinux_policy *oldpolicy,
                                struct selinux_policy *newpolicy)
{
        char **old_names = NULL;
        int *old_states = NULL;
        u32 count = 0, n;
        int rc;

        rc = security_get_bools(oldpolicy, &count, &old_names, &old_states);
        if (!rc) {
                for (n = 0; n < count; n++) {
                        struct cond_bool_datum *datum;

                        datum = symtab_search(&newpolicy->policydb.p_bools,
                                              old_names[n]);
                        if (datum)
                                datum->state = old_states[n];
                }
                evaluate_cond_nodes(&newpolicy->policydb);
        }

        /* Release the temporary snapshot taken from the old policy. */
        if (old_names) {
                for (n = 0; n < count; n++)
                        kfree(old_names[n]);
        }
        kfree(old_names);
        kfree(old_states);
        return rc;
}

/*
 * security_sid_mls_copy() - computes a new sid based on the given
 * sid and the mls portion of mls_sid.
 * @sid: SID supplying the user, role and type of the new context
 * @mls_sid: SID supplying the MLS portion of the new context
 * @new_sid: set to the SID of the combined context
 *
 * If SELinux is not initialized, or the active policy has MLS disabled,
 * @new_sid is simply set to @sid and 0 is returned.
 *
 * Returns 0 on success, -EINVAL if either SID is unknown to the sidtab, or
 * the error produced while copying, validating or registering the new
 * context.  The lookup is restarted from scratch when the sidtab reports
 * -ESTALE.
 */
int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *context1;
        struct context *context2;
        struct context newcon;
        char *s;
        u32 len;
        int rc;

        if (!selinux_initialized()) {
                *new_sid = sid;
                return 0;
        }

retry:
        rc = 0;
        context_init(&newcon);

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (!policydb->mls_enabled) {
                *new_sid = sid;
                goto out_unlock;
        }

        rc = -EINVAL;
        context1 = sidtab_search(sidtab, sid);
        if (!context1) {
                /* SIDs are u32, so print them unsigned. */
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, sid);
                goto out_unlock;
        }

        rc = -EINVAL;
        context2 = sidtab_search(sidtab, mls_sid);
        if (!context2) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, mls_sid);
                goto out_unlock;
        }

        /* User, role and type come from @sid; MLS comes from @mls_sid. */
        newcon.user = context1->user;
        newcon.role = context1->role;
        newcon.type = context1->type;
        rc = mls_context_cpy(&newcon, context2);
        if (rc)
                goto out_unlock;

        /* Check the validity of the new context. */
        if (!policydb_context_isvalid(policydb, &newcon)) {
                rc = convert_context_handle_invalid_context(policydb,
                                                        &newcon);
                if (rc) {
                        /* Audit the rejected context if it can be rendered. */
                        if (!context_struct_to_string(policydb, &newcon, &s,
                                                      &len)) {
                                struct audit_buffer *ab;

                                ab = audit_log_start(audit_context(),
                                                     GFP_ATOMIC,
                                                     AUDIT_SELINUX_ERR);
                                audit_log_format(ab,
                                                 "op=security_sid_mls_copy invalid_context=");
                                /* don't record NUL with untrusted strings */
                                audit_log_n_untrustedstring(ab, s, len - 1);
                                audit_log_end(ab);
                                kfree(s);
                        }
                        goto out_unlock;
                }
        }
        rc = sidtab_context_to_sid(sidtab, &newcon, new_sid);
        if (rc == -ESTALE) {
                /* Drop everything and start over. */
                rcu_read_unlock();
                context_destroy(&newcon);
                goto retry;
        }
out_unlock:
        rcu_read_unlock();
        context_destroy(&newcon);
        return rc;
}

/**
 * security_net_peersid_resolve - Compare and resolve two network peer SIDs
 * @nlbl_sid: NetLabel SID
 * @nlbl_type: NetLabel labeling protocol type
 * @xfrm_sid: XFRM SID
 * @peer_sid: network peer sid
 *
 * Description:
 * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be
 * resolved into a single SID it is returned via @peer_sid and the function
 * returns zero.  Otherwise @peer_sid is set to SECSID_NULL and the function
 * returns a negative value.  A table summarizing the behavior is below:
 *
 *                                 | function return |    @peer_sid
 *   ------------------------------+-----------------+-----------------
 *   no peer labels                |        0        |    SECSID_NULL
 *   single peer label             |        0        |    <peer_label>
 *   multiple, consistent labels   |        0        |    <peer_label>
 *   multiple, inconsistent labels |    -<errno>     |    SECSID_NULL
 *
 * When both labels are present but SELinux is not initialized, or the active
 * policy has MLS disabled, the function returns zero and leaves @peer_sid as
 * SECSID_NULL.  A SID that is unknown to the sidtab yields -EINVAL and
 * inconsistent MLS labels yield -EACCES.
 *
 */
int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
                                 u32 xfrm_sid,
                                 u32 *peer_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        int rc;
        struct context *nlbl_ctx;
        struct context *xfrm_ctx;

        *peer_sid = SECSID_NULL;

        /* handle the common (which also happens to be the set of easy) cases
         * right away, these two if statements catch everything involving a
         * single or absent peer SID/label */
        if (xfrm_sid == SECSID_NULL) {
                *peer_sid = nlbl_sid;
                return 0;
        }
        /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label
         * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label
         * is present */
        if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) {
                *peer_sid = xfrm_sid;
                return 0;
        }

        /* Both labels are present; comparing them needs a loaded policy. */
        if (!selinux_initialized())
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /*
         * Only the MLS portions of the two labels are compared below, so
         * without MLS there is nothing to resolve: report success and leave
         * *peer_sid as SECSID_NULL.
         */
        if (!policydb->mls_enabled) {
                rc = 0;
                goto out;
        }

        rc = -EINVAL;
        nlbl_ctx = sidtab_search(sidtab, nlbl_sid);
        if (!nlbl_ctx) {
                /* SIDs are u32, so print them unsigned. */
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, nlbl_sid);
                goto out;
        }
        rc = -EINVAL;
        xfrm_ctx = sidtab_search(sidtab, xfrm_sid);
        if (!xfrm_ctx) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, xfrm_sid);
                goto out;
        }
        rc = (mls_context_equal(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES);
        if (rc)
                goto out;

        /* at present NetLabel SIDs/labels really only carry MLS
         * information so if the MLS portion of the NetLabel SID
         * matches the MLS portion of the labeled XFRM SID/label
         * then pass along the XFRM SID as it is the most
         * expressive */
        *peer_sid = xfrm_sid;
out:
        rcu_read_unlock();
        return rc;
}

/*
 * hashtab_map() callback: duplicate the class name @k into the slot of the
 * string array @args selected by the class datum @d (its value minus one).
 * Returns 0 on success or -ENOMEM if the name cannot be duplicated.
 */
static int get_classes_callback(void *k, void *d, void *args)
{
        struct class_datum *cls = d;
        char **slots = args;
        char *key = k;
        u32 slot = cls->value - 1;

        slots[slot] = kstrdup(key, GFP_ATOMIC);

        return slots[slot] ? 0 : -ENOMEM;
}

/*
 * security_get_classes - return the names of all classes in a policy.
 * @policy: policy to read the classes from
 * @classes: set to a newly allocated array of newly allocated class names
 * @nclasses: set to the number of entries in @classes
 *
 * Returns 0 on success, in which case the caller owns the array and every
 * string in it.  On failure a negative errno is returned, nothing is left
 * allocated, @classes is NULL and @nclasses is 0, so the caller never sees
 * a pointer to freed memory.
 */
int security_get_classes(struct selinux_policy *policy,
                         char ***classes, u32 *nclasses)
{
        struct policydb *policydb = &policy->policydb;
        char **names;
        u32 count, i;
        int rc;

        *classes = NULL;
        *nclasses = 0;

        count = policydb->p_classes.nprim;
        names = kcalloc(count, sizeof(*names), GFP_ATOMIC);
        if (!names)
                return -ENOMEM;

        rc = hashtab_map(&policydb->p_classes.table, get_classes_callback,
                         names);
        if (rc) {
                /* The array is zero-filled, so unfilled slots are NULL. */
                for (i = 0; i < count; i++)
                        kfree(names[i]);
                kfree(names);
                return rc;
        }

        /* Publish the result only once it is complete. */
        *classes = names;
        *nclasses = count;
        return 0;
}

/*
 * hashtab_map() callback: duplicate the permission name @k into the slot of
 * the string array @args selected by the permission datum @d (its value
 * minus one).  Returns 0 on success or -ENOMEM if duplication fails.
 */
static int get_permissions_callback(void *k, void *d, void *args)
{
        struct perm_datum *perm = d;
        char **slots = args;
        char *key = k;
        u32 slot = perm->value - 1;

        slots[slot] = kstrdup(key, GFP_ATOMIC);

        return slots[slot] ? 0 : -ENOMEM;
}

/*
 * security_get_permissions - return the permission names of one class.
 * @policy: policy to read from
 * @class: name of the class to look up
 * @perms: set to a newly allocated array of newly allocated permission names
 * @nperms: set to the number of entries in @perms
 *
 * Permissions inherited from the class's common datum, if it has one, are
 * collected first, followed by the class's own permissions.
 *
 * Returns 0 on success, in which case the caller owns the array and every
 * string in it.  Returns -EINVAL if @class is unknown, -ENOMEM on allocation
 * failure, or the error reported by hashtab_map().  On any failure nothing
 * is left allocated, @perms is NULL and @nperms is 0, so the caller never
 * sees a pointer to freed memory.
 */
int security_get_permissions(struct selinux_policy *policy,
                             const char *class, char ***perms, u32 *nperms)
{
        struct policydb *policydb = &policy->policydb;
        struct class_datum *match;
        char **names;
        u32 count, i;
        int rc;

        *perms = NULL;
        *nperms = 0;

        match = symtab_search(&policydb->p_classes, class);
        if (!match) {
                pr_err("SELinux: %s:  unrecognized class %s\n",
                        __func__, class);
                return -EINVAL;
        }

        count = match->permissions.nprim;
        names = kcalloc(count, sizeof(*names), GFP_ATOMIC);
        if (!names)
                return -ENOMEM;

        if (match->comdatum) {
                rc = hashtab_map(&match->comdatum->permissions.table,
                                 get_permissions_callback, names);
                if (rc)
                        goto err;
        }

        rc = hashtab_map(&match->permissions.table, get_permissions_callback,
                         names);
        if (rc)
                goto err;

        /* Publish the result only once it is complete. */
        *perms = names;
        *nperms = count;
        return 0;

err:
        /* The array is zero-filled, so unfilled slots are NULL. */
        for (i = 0; i < count; i++)
                kfree(names[i]);
        kfree(names);
        return rc;
}

/*
 * Return the reject_unknown setting of the active policy, or 0 if SELinux
 * is not initialized.
 */
int security_get_reject_unknown(void)
{
        struct selinux_policy *pol;
        int reject = 0;

        if (selinux_initialized()) {
                rcu_read_lock();
                pol = rcu_dereference(selinux_state.policy);
                reject = pol->policydb.reject_unknown;
                rcu_read_unlock();
        }

        return reject;
}

/*
 * Return the allow_unknown setting of the active policy, or 0 if SELinux
 * is not initialized.
 */
int security_get_allow_unknown(void)
{
        struct selinux_policy *pol;
        int allow = 0;

        if (selinux_initialized()) {
                rcu_read_lock();
                pol = rcu_dereference(selinux_state.policy);
                allow = pol->policydb.allow_unknown;
                rcu_read_unlock();
        }

        return allow;
}

/**
 * security_policycap_supported - Check for a specific policy capability
 * @req_cap: capability
 *
 * Description:
 * Look up bit @req_cap in the policy capability bitmap of the currently
 * loaded policy and return the result of that lookup.  Returns 0 when
 * SELinux is not initialized.
 *
 */
int security_policycap_supported(unsigned int req_cap)
{
        struct selinux_policy *pol;
        int supported = 0;

        if (selinux_initialized()) {
                rcu_read_lock();
                pol = rcu_dereference(selinux_state.policy);
                supported = ebitmap_get_bit(&pol->policydb.policycaps,
                                            req_cap);
                rcu_read_unlock();
        }

        return supported;
}

/*
 * SELinux-private form of one audit filter rule, created by
 * selinux_audit_rule_init() and released by selinux_audit_rule_free().
 */
struct selinux_audit_rule {
        /* policy->latest_granting at the time the rule was initialized */
        u32 au_seqno;
        /* context whose user/role/type/MLS level the rule compares against */
        struct context au_ctxt;
};

/*
 * Release an audit rule: destroy its embedded context, then free the rule
 * itself.  A NULL @vrule is accepted and ignored.
 */
void selinux_audit_rule_free(void *vrule)
{
        struct selinux_audit_rule *doomed = vrule;

        if (!doomed)
                return;

        context_destroy(&doomed->au_ctxt);
        kfree(doomed);
}

/*
 * selinux_audit_rule_init - build the SELinux form of an audit filter rule.
 * @field: audit field the rule applies to (AUDIT_SUBJ_* / AUDIT_OBJ_*)
 * @op: audit comparison operator
 * @rulestr: user, role, type or MLS level string to resolve
 * @vrule: set to the new rule on success, NULL otherwise
 * @gfp: allocation flags for the rule structure
 *
 * Returns 0 on success, -EOPNOTSUPP if SELinux is not initialized, -EINVAL
 * for an unsupported field/operator combination, an MLS string containing
 * '-', or a name unknown to the active policy, -ENOMEM if the rule cannot be
 * allocated, or the error returned by mls_from_string().
 */
int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule,
                            gfp_t gfp)
{
        struct selinux_audit_rule **rule = (struct selinux_audit_rule **)vrule;
        struct selinux_audit_rule *newrule;
        struct selinux_policy *policy;
        struct policydb *db;
        int rc = 0;

        *rule = NULL;

        if (!selinux_initialized())
                return -EOPNOTSUPP;

        /* Validate the field/operator/string combination up front. */
        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_SUBJ_ROLE:
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_USER:
        case AUDIT_OBJ_ROLE:
        case AUDIT_OBJ_TYPE:
                /* user, role and type support only (in)equality tests */
                if (!(op == Audit_equal || op == Audit_not_equal))
                        return -EINVAL;
                break;
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                /* a '-' would denote a range, which is not allowed */
                if (strchr(rulestr, '-') != NULL)
                        return -EINVAL;
                break;
        default:
                /* no other field is valid */
                return -EINVAL;
        }

        newrule = kzalloc(sizeof(*newrule), gfp);
        if (!newrule)
                return -ENOMEM;
        context_init(&newrule->au_ctxt);

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        db = &policy->policydb;
        /* Remember which policy generation the rule was resolved against. */
        newrule->au_seqno = policy->latest_granting;

        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_OBJ_USER: {
                struct user_datum *usr;

                usr = symtab_search(&db->p_users, rulestr);
                if (usr)
                        newrule->au_ctxt.user = usr->value;
                else
                        rc = -EINVAL;
                break;
        }
        case AUDIT_SUBJ_ROLE:
        case AUDIT_OBJ_ROLE: {
                struct role_datum *role;

                role = symtab_search(&db->p_roles, rulestr);
                if (role)
                        newrule->au_ctxt.role = role->value;
                else
                        rc = -EINVAL;
                break;
        }
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_TYPE: {
                struct type_datum *type;

                type = symtab_search(&db->p_types, rulestr);
                if (type)
                        newrule->au_ctxt.type = type->value;
                else
                        rc = -EINVAL;
                break;
        }
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                rc = mls_from_string(db, rulestr, &newrule->au_ctxt,
                                     GFP_ATOMIC);
                break;
        }
        rcu_read_unlock();

        if (rc) {
                selinux_audit_rule_free(newrule);
                *rule = NULL;
                return rc;
        }

        *rule = newrule;
        return 0;
}

/*
 * Report whether an audit rule uses any SELinux field.
 * Returns 1 as soon as one of the rule's fields is an AUDIT_SUBJ_* or
 * AUDIT_OBJ_* field listed below, 0 if none is.
 */
int selinux_audit_rule_known(struct audit_krule *rule)
{
        u32 idx = 0;

        while (idx < rule->field_count) {
                switch (rule->fields[idx].type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        return 1;
                }
                idx++;
        }

        return 0;
}

/*
 * selinux_audit_rule_match - test a security ID against an audit rule.
 * @prop: LSM properties; only prop->selinux.secid is used
 * @field: audit field being compared (AUDIT_SUBJ_* / AUDIT_OBJ_*)
 * @op: audit comparison operator
 * @vrule: rule created by selinux_audit_rule_init()
 *
 * Returns 1 if the context of the SID matches the rule, 0 if it does not
 * (also when SELinux is not initialized or the field/op pair is not handled),
 * -ENOENT if @vrule is NULL or the SID is unknown, and -ESTALE if the rule
 * was created against an older policy than the active one.
 */
int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *policy;
        struct context *ctxt;
        struct mls_level *level;
        struct selinux_audit_rule *rule = vrule;
        int match = 0;

        if (unlikely(!rule)) {
                WARN_ONCE(1, "selinux_audit_rule_match: missing rule\n");
                return -ENOENT;
        }

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();

        policy = rcu_dereference(state->policy);

        /* The rule was resolved against an older policy generation. */
        if (rule->au_seqno < policy->latest_granting) {
                match = -ESTALE;
                goto out;
        }

        ctxt = sidtab_search(policy->sidtab, prop->selinux.secid);
        if (unlikely(!ctxt)) {
                /* The secid is u32, so print it unsigned. */
                WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %u\n",
                          prop->selinux.secid);
                match = -ENOENT;
                goto out;
        }

        /* a field/op pair that is not caught here will simply fall through
           without a match */
        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_OBJ_USER:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->user == rule->au_ctxt.user);
                        break;
                case Audit_not_equal:
                        match = (ctxt->user != rule->au_ctxt.user);
                        break;
                }
                break;
        case AUDIT_SUBJ_ROLE:
        case AUDIT_OBJ_ROLE:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->role == rule->au_ctxt.role);
                        break;
                case Audit_not_equal:
                        match = (ctxt->role != rule->au_ctxt.role);
                        break;
                }
                break;
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_TYPE:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->type == rule->au_ctxt.type);
                        break;
                case Audit_not_equal:
                        match = (ctxt->type != rule->au_ctxt.type);
                        break;
                }
                break;
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                /*
                 * SUBJ_SEN and OBJ_LEV_LOW compare against the low level of
                 * the context's range, the other two against the high level.
                 * The rule always supplies its level in range.level[0].
                 */
                level = ((field == AUDIT_SUBJ_SEN ||
                          field == AUDIT_OBJ_LEV_LOW) ?
                         &ctxt->range.level[0] : &ctxt->range.level[1]);
                switch (op) {
                case Audit_equal:
                        match = mls_level_eq(&rule->au_ctxt.range.level[0],
                                             level);
                        break;
                case Audit_not_equal:
                        match = !mls_level_eq(&rule->au_ctxt.range.level[0],
                                              level);
                        break;
                case Audit_lt:
                        match = (mls_level_dom(&rule->au_ctxt.range.level[0],
                                               level) &&
                                 !mls_level_eq(&rule->au_ctxt.range.level[0],
                                               level));
                        break;
                case Audit_le:
                        match = mls_level_dom(&rule->au_ctxt.range.level[0],
                                              level);
                        break;
                case Audit_gt:
                        match = (mls_level_dom(level,
                                              &rule->au_ctxt.range.level[0]) &&
                                 !mls_level_eq(level,
                                               &rule->au_ctxt.range.level[0]));
                        break;
                case Audit_ge:
                        match = mls_level_dom(level,
                                              &rule->au_ctxt.range.level[0]);
                        break;
                }
        }

out:
        rcu_read_unlock();
        return match;
}

/*
 * AVC callback: on an AVC_CALLBACK_RESET event refresh the LSM audit rules
 * and return the result; every other event is ignored with 0.
 */
static int aurule_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        return audit_update_lsm_rules();
}

/*
 * Register aurule_avc_callback() for AVC reset events at init time.
 * Failure to register is fatal and panics.
 */
static int __init aurule_init(void)
{
        int rc = avc_add_callback(aurule_avc_callback, AVC_CALLBACK_RESET);

        if (rc != 0)
                panic("avc_add_callback() failed, error %d\n", rc);

        return rc;
}
__initcall(aurule_init);

#ifdef CONFIG_NETLABEL
/**
 * security_netlbl_cache_add - Add an entry to the NetLabel cache
 * @secattr: the NetLabel packet security attributes
 * @sid: the SELinux SID
 *
 * Description:
 * Attempt to attach a cache entry holding @sid to @secattr and set the
 * NETLBL_SECATTR_CACHE flag.  This function assumes @secattr has already
 * been initialized.  Allocation failures are silent: the SID is simply not
 * cached.
 *
 */
static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
                                      u32 sid)
{
        u32 *cached_sid = kmalloc(sizeof(*cached_sid), GFP_ATOMIC);

        if (!cached_sid)
                return;

        secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
        if (!secattr->cache) {
                kfree(cached_sid);
                return;
        }

        /* The cache entry owns the copy and releases it with kfree(). */
        *cached_sid = sid;
        secattr->cache->data = cached_sid;
        secattr->cache->free = kfree;
        secattr->flags |= NETLBL_SECATTR_CACHE;
}

/**
 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
 * @secattr: the NetLabel packet security attributes
 * @sid: the SELinux SID
 *
 * Description:
 * Convert the given NetLabel security attributes in @secattr into a
 * SELinux SID.  If the @secattr field does not contain a full SELinux
 * SID/context then use SECINITSID_NETMSG as the foundation.  If possible the
 * 'cache' field of @secattr is set and the CACHE flag is set; this is to
 * allow the @secattr to be used by NetLabel to cache the secattr to SID
 * conversion for future lookups.  Returns zero on success, negative values on
 * failure.
 *
 * If SELinux is not initialized, or @secattr carries none of the CACHE,
 * SECID or MLS_LVL flags, @sid is set to SECSID_NULL and zero is returned.
 *
 */
int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
                                   u32 *sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        int rc;
        struct context *ctx;
        struct context ctx_new;

        if (!selinux_initialized()) {
                *sid = SECSID_NULL;
                return 0;
        }

retry:
        rc = 0;
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* Prefer a previously cached SID, then an explicit secid. */
        if (secattr->flags & NETLBL_SECATTR_CACHE)
                *sid = *(u32 *)secattr->cache->data;
        else if (secattr->flags & NETLBL_SECATTR_SECID)
                *sid = secattr->attr.secid;
        else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) {
                rc = -EIDRM;
                ctx = sidtab_search(sidtab, SECINITSID_NETMSG);
                if (ctx == NULL)
                        goto out;

                /*
                 * Build a temporary context: user/role/type from the NETMSG
                 * initial SID, MLS level (and categories) from @secattr.
                 */
                context_init(&ctx_new);
                ctx_new.user = ctx->user;
                ctx_new.role = ctx->role;
                ctx_new.type = ctx->type;
                mls_import_netlbl_lvl(policydb, &ctx_new, secattr);
                if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
                        rc = mls_import_netlbl_cat(policydb, &ctx_new, secattr);
                        /*
                         * NOTE(review): this path does not destroy the
                         * category bitmap here; presumably
                         * mls_import_netlbl_cat() cleans up after itself on
                         * failure - confirm.
                         */
                        if (rc)
                                goto out;
                }
                rc = -EIDRM;
                if (!mls_context_isvalid(policydb, &ctx_new)) {
                        ebitmap_destroy(&ctx_new.range.level[0].cat);
                        goto out;
                }

                rc = sidtab_context_to_sid(sidtab, &ctx_new, sid);
                /* The temporary category bitmap is no longer needed. */
                ebitmap_destroy(&ctx_new.range.level[0].cat);
                if (rc == -ESTALE) {
                        /* Drop the RCU read lock and start over. */
                        rcu_read_unlock();
                        goto retry;
                }
                if (rc)
                        goto out;

                /* Best effort: remember the result for future lookups. */
                security_netlbl_cache_add(secattr, *sid);
        } else
                *sid = SECSID_NULL;

out:
        rcu_read_unlock();
        return rc;
}

/**
 * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr
 * @sid: the SELinux SID
 * @secattr: the NetLabel packet security attributes
 *
 * Description:
 * Fill @secattr from the context of @sid: a copy of the type name as the
 * domain, the SID itself as the secid, and the exported MLS level and
 * categories.  Returns zero on success (and when SELinux is not
 * initialized), -ENOENT if @sid is unknown, -ENOMEM if the domain string
 * cannot be duplicated, or the result of exporting the MLS categories.
 *
 */
int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr)
{
        struct selinux_policy *pol;
        struct policydb *db;
        struct context *sid_ctx;
        int rc;

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();
        pol = rcu_dereference(selinux_state.policy);
        db = &pol->policydb;

        sid_ctx = sidtab_search(pol->sidtab, sid);
        if (!sid_ctx) {
                rc = -ENOENT;
                goto unlock;
        }

        secattr->domain = kstrdup(sym_name(db, SYM_TYPES, sid_ctx->type - 1),
                                  GFP_ATOMIC);
        if (!secattr->domain) {
                rc = -ENOMEM;
                goto unlock;
        }

        secattr->attr.secid = sid;
        secattr->flags |= NETLBL_SECATTR_DOMAIN_CPY | NETLBL_SECATTR_SECID;
        mls_export_netlbl_lvl(db, sid_ctx, secattr);
        rc = mls_export_netlbl_cat(db, sid_ctx, secattr);

unlock:
        rcu_read_unlock();
        return rc;
}
#endif /* CONFIG_NETLABEL */

/**
 * __security_read_policy - read the policy.
 * @policy: SELinux policy
 * @data: binary policy data
 * @len: length of data in bytes
 *
 * Writes the policydb of @policy into the buffer @data of size *@len using
 * policydb_write().  On success *@len is updated to the distance the write
 * cursor advanced and 0 is returned; on failure the error from
 * policydb_write() is returned and *@len is left unchanged.
 */
static int __security_read_policy(struct selinux_policy *policy,
                                  void *data, size_t *len)
{
        struct policy_file out;
        int rc;

        out.data = data;
        out.len = *len;

        rc = policydb_write(&policy->policydb, &out);
        if (rc == 0)
                *len = (unsigned long)out.data - (unsigned long)data;

        return rc;
}

/**
 * security_read_policy - read the policy.
 * @data: binary policy data
 * @len: length of data in bytes
 *
 * Allocates a buffer with vmalloc_user() and writes the active policy
 * into it.  The policy pointer is read under state->policy_mutex, so the
 * caller is expected to hold that mutex.
 *
 * Returns 0 on success, in which case the caller owns *@data and must
 * release it with vfree().  Returns -EINVAL if no policy is loaded, -ENOMEM
 * if the buffer cannot be allocated, or the error from writing the policy.
 * If writing fails the buffer is freed here, *@data is set to NULL and *@len
 * to 0, matching security_read_state_kernel(); a caller that still calls
 * vfree(*@data) on error remains correct because vfree(NULL) is a no-op.
 */
int security_read_policy(void **data, size_t *len)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *policy;
        int err;

        policy = rcu_dereference_protected(
                        state->policy, lockdep_is_held(&state->policy_mutex));
        if (!policy)
                return -EINVAL;

        *len = policy->policydb.len;
        *data = vmalloc_user(*len);
        if (!*data)
                return -ENOMEM;

        err = __security_read_policy(policy, *data, len);
        if (err) {
                /* Do not hand a half-written buffer back to the caller. */
                vfree(*data);
                *data = NULL;
                *len = 0;
        }
        return err;
}

/**
 * security_read_state_kernel - read the policy.
 * @data: binary policy data
 * @len: length of data in bytes
 *
 * Allocates kernel memory for reading SELinux policy.
 * This function is for internal use only and should not
 * be used for returning data to user space.
 *
 * This function must be called with policy_mutex held.
 *
 * Returns 0 on success, -EINVAL if no policy is loaded, -ENOMEM if the
 * buffer cannot be allocated, or the error from writing the policy; in the
 * last case the buffer is freed, *@data is NULL and *@len is 0.
 */
int security_read_state_kernel(void **data, size_t *len)
{
        struct selinux_policy *policy;
        int rc;

        policy = rcu_dereference_protected(selinux_state.policy,
                        lockdep_is_held(&selinux_state.policy_mutex));
        if (policy == NULL)
                return -EINVAL;

        *len = policy->policydb.len;
        *data = vmalloc(*len);
        if (*data == NULL)
                return -ENOMEM;

        rc = __security_read_policy(policy, *data, len);
        if (rc == 0)
                return 0;

        /* Writing failed: release the buffer and reset the outputs. */
        vfree(*data);
        *data = NULL;
        *len = 0;
        return rc;
}









































































































































































































































































































    1 





    1 
    1 


    1 













































    1 


    1 
    1 
    1 


    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/idr.h>
#include <linux/rculist.h>
#include <linux/nsproxy.h>
#include <linux/fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
#include <linux/uidgid.h>
#include <linux/proc_fs.h>
#include <linux/nstree.h>

#include <net/aligned_data.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/*
 *        Our network namespace constructor/destructor lists
 */

static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;

LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);

/* Protects net_namespace_list. Nests inside rtnl_lock() */
DECLARE_RWSEM(net_rwsem);
EXPORT_SYMBOL_GPL(net_rwsem);

#ifdef CONFIG_KEYS
static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
#endif

struct net init_net;
EXPORT_SYMBOL(init_net);

static bool init_net_initialized;
/*
 * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
 * init_net_initialized and first_device pointer.
 * This is an internal net namespace object. Please don't use it
 * outside.
 */
DECLARE_RWSEM(pernet_ops_rwsem);

/*
 * sizeof(struct net_generic) rounded up and expressed in pointer-sized
 * slots. net_assign_generic() BUG()s on ids below this value, presumably
 * because those slots overlap the net_generic header.
 */
#define MIN_PERNET_OPS_ID        \
        ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))

#define INITIAL_NET_GEN_PTRS        13 /* +1 for len +2 for rcu_head */

/* Number of pointer slots net_alloc_generic() sizes a new array for. */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;

static struct net_generic *net_alloc_generic(void)
{
        unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
        unsigned int generic_size;
        struct net_generic *ng;

        generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);

        ng = kzalloc(generic_size, GFP_KERNEL);
        if (ng)
                ng->s.len = gen_ptrs;

        return ng;
}

/*
 * Store @data in slot @id of @net's generic pointer array, replacing the
 * array with a freshly allocated one when the current array has no such slot.
 * Returns 0 on success or -ENOMEM if the new array cannot be allocated.
 */
static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
        struct net_generic *cur, *grown;

        BUG_ON(id < MIN_PERNET_OPS_ID);

        cur = rcu_dereference_protected(net->gen,
                                        lockdep_is_held(&pernet_ops_rwsem));
        /* Fast path: the slot already exists, fill it in place. */
        if (id < cur->s.len) {
                cur->ptr[id] = data;
                return 0;
        }

        grown = net_alloc_generic();
        if (!grown)
                return -ENOMEM;

        /*
         * Synchronisation notes:
         *
         * net_generic() walks the net->gen array inside an RCU read
         * section, and a net->gen->ptr[x] pointer never changes once it
         * has been set (see the rules in netns/generic.h).
         *
         * It is therefore enough to copy the existing slots into the new
         * array, publish it, and free the old copy after a grace period.
         */
        memcpy(&grown->ptr[MIN_PERNET_OPS_ID], &cur->ptr[MIN_PERNET_OPS_ID],
               (cur->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
        grown->ptr[id] = data;

        rcu_assign_pointer(net->gen, grown);
        kfree_rcu(cur, s.rcu);
        return 0;
}

/*
 * Initialise one pernet operation for @net.
 *
 * When @ops has an id, a zeroed private area of ops->size bytes is allocated
 * and stored in the namespace's generic array before ops->init() runs.
 * If ops->init() returns non-zero, the slot is cleared and the private area
 * is freed again.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the value returned
 * by net_assign_generic() / ops->init().
 */
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
        void *priv = NULL;
        int ret;

        if (ops->id) {
                priv = kzalloc(ops->size, GFP_KERNEL);
                if (!priv)
                        return -ENOMEM;

                ret = net_assign_generic(net, *ops->id, priv);
                if (ret) {
                        kfree(priv);
                        return ret;
                }
        }

        ret = ops->init ? ops->init(net) : 0;
        if (!ret)
                return 0;

        /* init failed: unhook the private area before releasing it. */
        if (ops->id) {
                struct net_generic *gen;

                gen = rcu_dereference_protected(net->gen,
                                                lockdep_is_held(&pernet_ops_rwsem));
                gen->ptr[*ops->id] = NULL;
        }

        kfree(priv);
        return ret;
}

/* Invoke ops->pre_exit(), if provided, on every namespace in @net_exit_list. */
static void ops_pre_exit_list(const struct pernet_operations *ops,
                              struct list_head *net_exit_list)
{
        struct net *net;

        if (!ops->pre_exit)
                return;

        list_for_each_entry(net, net_exit_list, exit_list)
                ops->pre_exit(net);
}

/*
 * With RTNL held, call ->exit_rtnl() for every namespace in @net_exit_list
 * on each operation that precedes @ops in @ops_list (walking backwards),
 * then unregister all devices the callbacks queued on the kill list.
 */
static void ops_exit_rtnl_list(const struct list_head *ops_list,
                               const struct pernet_operations *ops,
                               struct list_head *net_exit_list)
{
        const struct pernet_operations *cursor;
        LIST_HEAD(dev_kill_list);
        struct net *net;

        rtnl_lock();

        list_for_each_entry(net, net_exit_list, exit_list) {
                __rtnl_net_lock(net);

                /* Restart the reverse walk from @ops for each namespace. */
                cursor = ops;
                list_for_each_entry_continue_reverse(cursor, ops_list, list) {
                        if (cursor->exit_rtnl)
                                cursor->exit_rtnl(net, &dev_kill_list);
                }

                __rtnl_net_unlock(net);
        }

        unregister_netdevice_many(&dev_kill_list);

        rtnl_unlock();
}

/*
 * Run the exit callbacks of @ops: ->exit() once per namespace in
 * @net_exit_list (yielding the CPU between namespaces), followed by a single
 * ->exit_batch() call for the whole list. Either callback may be absent.
 */
static void ops_exit_list(const struct pernet_operations *ops,
                          struct list_head *net_exit_list)
{
        struct net *net;

        if (ops->exit) {
                list_for_each_entry(net, net_exit_list, exit_list) {
                        ops->exit(net);
                        cond_resched();
                }
        }

        if (!ops->exit_batch)
                return;

        ops->exit_batch(net_exit_list);
}

/*
 * Free the per-namespace private area of @ops for every namespace in
 * @net_exit_list. Operations without an id have nothing to free.
 */
static void ops_free_list(const struct pernet_operations *ops,
                          struct list_head *net_exit_list)
{
        struct net *net;

        if (!ops->id)
                return;

        list_for_each_entry(net, net_exit_list, exit_list)
                kfree(net_generic(net, *ops->id));
}

/*
 * Run the teardown callbacks of pernet operations for every namespace on
 * @net_exit_list, walking @ops_list in reverse.
 *
 * @ops_list:      list of pernet_operations, linked through ->list
 * @ops:           only the entries preceding this one are undone;
 *                 NULL means every entry on @ops_list is undone
 * @net_exit_list: namespaces being torn down, linked through ->exit_list
 * @expedite_rcu:  wait with synchronize_rcu_expedited() instead of
 *                 synchronize_rcu()
 *
 * Phases, in order: ->pre_exit() for all operations, one RCU grace period,
 * ->exit_rtnl() (only when at least one operation provides it),
 * ->exit()/->exit_batch(), and finally freeing of the per-namespace data.
 */
static void ops_undo_list(const struct list_head *ops_list,
                          const struct pernet_operations *ops,
                          struct list_head *net_exit_list,
                          bool expedite_rcu)
{
        const struct pernet_operations *saved_ops;
        bool hold_rtnl = false;

        /* Use the list head as cursor so the reverse walks cover the whole list. */
        if (!ops)
                ops = list_entry(ops_list, typeof(*ops), list);

        /* Every phase below restarts its reverse walk from this cursor. */
        saved_ops = ops;

        list_for_each_entry_continue_reverse(ops, ops_list, list) {
                hold_rtnl |= !!ops->exit_rtnl;
                ops_pre_exit_list(ops, net_exit_list);
        }

        /* Another CPU might be rcu-iterating the list, wait for it.
         * This needs to be before calling the exit() notifiers, so the
         * rcu_barrier() after ops_undo_list() isn't sufficient alone.
         * Also the pre_exit() and exit() methods need this barrier.
         */
        if (expedite_rcu)
                synchronize_rcu_expedited();
        else
                synchronize_rcu();

        /* Only take RTNL when some operation actually has an exit_rtnl hook. */
        if (hold_rtnl)
                ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list);

        ops = saved_ops;
        list_for_each_entry_continue_reverse(ops, ops_list, list)
                ops_exit_list(ops, net_exit_list);

        /* Private data is freed only after every exit callback has run. */
        ops = saved_ops;
        list_for_each_entry_continue_reverse(ops, ops_list, list)
                ops_free_list(ops, net_exit_list);
}

static void ops_undo_single(struct pernet_operations *ops,
                            struct list_head *net_exit_list)
{
        LIST_HEAD(ops_list);

        list_add(&ops->list, &ops_list);
        ops_undo_list(&ops_list, NULL, net_exit_list, false);
        list_del(&ops->list);
}

/*
 * Allocate an id for @peer in @net's netns_ids IDR.
 * A non-negative @reqid requests exactly that id (range [reqid, reqid + 1));
 * a negative @reqid passes the range 0..0 to idr_alloc().
 * Must be called with nsid_lock held.
 */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
        int start = 0;
        int end = 0;

        if (reqid >= 0) {
                start = reqid;
                end = reqid + 1;
        }

        return idr_alloc(&net->netns_ids, peer, start, end, GFP_ATOMIC);
}

/* Callback for idr_for_each(). When net equals peer, the id is returned so
 * that idr_for_each() stops. Because id 0 cannot be returned this way
 * (idr_for_each() would not stop), the magic value NET_ID_ZERO (-1) is
 * returned in its place.
 */
#define NET_ID_ZERO -1
static int net_eq_idr(int id, void *net, void *peer)
{
        if (!net_eq(net, peer))
                return 0;

        return id ? id : NET_ID_ZERO;
}

/*
 * Look up the id assigned to @peer in @net.
 * Returns the id (>= 0) or NETNSA_NSID_NOT_ASSIGNED.
 * Must be called from an RCU-critical section or with nsid_lock held.
 */
static int __peernet2id(const struct net *net, struct net *peer)
{
        int found = idr_for_each(&net->netns_ids, net_eq_idr, peer);

        if (found > 0)
                return found;

        /* net_eq_idr() reports a match on id 0 with the magic value. */
        if (found == NET_ID_ZERO)
                return 0;

        return NETNSA_NSID_NOT_ASSIGNED;
}

static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
                              struct nlmsghdr *nlh, gfp_t gfp);
/* Return the id of a peer netns as seen from @net. If no id is assigned yet,
 * one is allocated, announced with RTM_NEWNSID and returned.
 * NETNSA_NSID_NOT_ASSIGNED is returned when @net fails check_net(), when
 * @peer is no longer alive, or when the allocation fails.
 */
int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
        int id;

        if (!check_net(net))
                return NETNSA_NSID_NOT_ASSIGNED;

        spin_lock(&net->nsid_lock);

        id = __peernet2id(net, peer);
        if (id >= 0)
                goto unlock;

        /* When peer is obtained from RCU lists, we may race with
         * its cleanup. Check whether it's alive, and this guarantees
         * we never hash a peer back to net->netns_ids, after it has
         * just been idr_remove()'d from there in cleanup_net().
         */
        if (!maybe_get_net(peer)) {
                id = NETNSA_NSID_NOT_ASSIGNED;
                goto unlock;
        }

        id = alloc_netid(net, peer, -1);
        spin_unlock(&net->nsid_lock);

        /* Drop the temporary reference taken by maybe_get_net() above. */
        put_net(peer);
        if (id < 0)
                return NETNSA_NSID_NOT_ASSIGNED;

        rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp);

        return id;

unlock:
        spin_unlock(&net->nsid_lock);
        return id;
}
EXPORT_SYMBOL_GPL(peernet2id_alloc);

/* Return the id of a peer netns if one is assigned, otherwise
 * NETNSA_NSID_NOT_ASSIGNED. The lookup runs under rcu_read_lock().
 */
int peernet2id(const struct net *net, struct net *peer)
{
        int nsid;

        rcu_read_lock();
        nsid = __peernet2id(net, peer);
        rcu_read_unlock();

        return nsid;
}
EXPORT_SYMBOL(peernet2id);

/* Return true if the peer netns has an id assigned in the given netns,
 * i.e. if peernet2id() yields a non-negative id.
 */
bool peernet_has_id(const struct net *net, struct net *peer)
{
        int nsid = peernet2id(net, peer);

        return nsid >= 0;
}

/*
 * Find the peer namespace registered under @id in @net and take a
 * reference on it. Returns NULL when @id is negative, when nothing is
 * registered under @id, or when maybe_get_net() cannot take a reference.
 */
struct net *get_net_ns_by_id(const struct net *net, int id)
{
        struct net *found;
        struct net *peer = NULL;

        if (id < 0)
                return NULL;

        rcu_read_lock();
        found = idr_find(&net->netns_ids, id);
        if (found)
                peer = maybe_get_net(found);
        rcu_read_unlock();

        return peer;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_id);

/* Give the core sysctls of a new namespace their initial values. */
static __net_init void preinit_net_sysctl(struct net *net)
{
        net->core.sysctl_tstamp_allow_data = 1;
        net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
        net->core.sysctl_somaxconn = SOMAXCONN;

        /* optmem_max limits per socket sk_omem_alloc usage.
         * TCP zerocopy regular usage needs 128 KB.
         */
        net->core.sysctl_optmem_max = 128 * 1024;
}

/*
 * Initialisation that must happen even if setup_net() is never called.
 * Returns 0, or the error from ns_common_init(); in the error case nothing
 * else in @net has been initialised by this function.
 */
static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
{
        int err = ns_common_init(net);

        if (err)
                return err;

        /* Reference counting and its trackers. */
        refcount_set(&net->passive, 1);
        ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
        ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");

        /* Basic identity. */
        get_random_bytes(&net->hash_mix, sizeof(u32));
        net->dev_base_seq = 1;
        net->user_ns = user_ns;

        /* Peer-id table and locks. */
        idr_init(&net->netns_ids);
        spin_lock_init(&net->nsid_lock);
        mutex_init(&net->ipv4.ra_mutex);

#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
        mutex_init(&net->rtnl_mutex);
        lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
#endif

        INIT_LIST_HEAD(&net->ptype_all);
        INIT_LIST_HEAD(&net->ptype_specific);
        preinit_net_sysctl(net);

        return 0;
}

/*
 * setup_net runs the initializers for the network namespace object.
 *
 * Every registered pernet operation is initialised in list order; on
 * success the namespace is added to net_namespace_list and to the ns tree.
 * If an initializer returns a negative error, the operations that were
 * already initialised are undone in reverse and that error is returned.
 *
 * Must be called with pernet_ops_rwsem held.
 */
static __net_init int setup_net(struct net *net)
{
        const struct pernet_operations *ops;
        LIST_HEAD(net_exit_list);
        int error = 0;

        net->net_cookie = ns_tree_gen_id(&net->ns);

        list_for_each_entry(ops, &pernet_list, list) {
                error = ops_init(ops, net);
                if (error < 0)
                        goto undo;
        }

        down_write(&net_rwsem);
        list_add_tail_rcu(&net->list, &net_namespace_list);
        up_write(&net_rwsem);
        ns_tree_add_raw(net);

        return error;

undo:
        /* Walk through the list backwards calling the exit functions
         * for the pernet modules whose init functions did not fail.
         */
        list_add(&net->exit_list, &net_exit_list);
        ops_undo_list(&pernet_list, ops, &net_exit_list, false);
        rcu_barrier();

        return error;
}

#ifdef CONFIG_NET_NS
/*
 * Charge one network namespace to the current effective uid in @ns.
 * Returns whatever inc_ucount() returns; copy_net_ns() treats a NULL result
 * as -ENOSPC.
 */
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
{
        return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
}

/* Undo one inc_net_namespaces() charge on @ucounts. */
static void dec_net_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
}

static struct kmem_cache *net_cachep __ro_after_init;
static struct workqueue_struct *netns_wq;

/*
 * Allocate a zeroed struct net together with its generic pointer array and,
 * with CONFIG_KEYS, its key domain tag (usage count 1).
 * Returns NULL if any allocation fails; everything allocated so far is
 * released in that case.
 */
static struct net *net_alloc(void)
{
        struct net_generic *ng;
        struct net *net;

        ng = net_alloc_generic();
        if (!ng)
                return NULL;

        net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
        if (!net) {
                kfree(ng);
                return NULL;
        }

#ifdef CONFIG_KEYS
        net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL);
        if (!net->key_domain) {
                kmem_cache_free(net_cachep, net);
                kfree(ng);
                return NULL;
        }
        refcount_set(&net->key_domain->usage, 1);
#endif

        rcu_assign_pointer(net->gen, ng);
        return net;
}

static LLIST_HEAD(defer_free_list);

/*
 * Return to net_cachep every struct net that net_passive_dec() queued on
 * defer_free_list since the previous call.
 */
static void net_complete_free(void)
{
        /* Detach the whole batch of namespaces queued in the last round. */
        struct llist_node *batch = llist_del_all(&defer_free_list);
        struct net *net, *next;

        llist_for_each_entry_safe(net, next, batch, defer_free_list)
                kmem_cache_free(net_cachep, net);
}

/*
 * Drop one passive reference on @net. When the last one goes away, the
 * generic pointer array is freed and the struct itself is queued on
 * defer_free_list for net_complete_free().
 */
void net_passive_dec(struct net *net)
{
        if (!refcount_dec_and_test(&net->passive))
                return;

        kfree(rcu_access_pointer(net->gen));

        /* There should not be any trackers left there. */
        ref_tracker_dir_exit(&net->notrefcnt_tracker);

        /* Wait for an extra rcu_barrier() before final free. */
        llist_add(&net->defer_free_list, &defer_free_list);
}

/*
 * void-pointer wrapper around net_passive_dec(): @p is a struct net pointer
 * or NULL, and NULL is ignored.
 */
void net_drop_ns(void *p)
{
        struct net *net = p;

        if (!net)
                return;

        net_passive_dec(net);
}

/**
 * copy_net_ns - share or create a network namespace
 * @flags: clone flags; a new namespace is created only if CLONE_NEWNET is set
 * @user_ns: user namespace that will own the new network namespace
 * @old_net: namespace to share when CLONE_NEWNET is not set
 *
 * Without CLONE_NEWNET this returns @old_net with an extra reference.
 * Otherwise a new namespace is allocated, pre-initialised and set up with
 * pernet_ops_rwsem held for reading.
 *
 * Returns the namespace on success, or an ERR_PTR(): -ENOSPC when the
 * namespace count cannot be charged, -ENOMEM when allocation fails, or the
 * error from preinit_net(), down_read_killable() or setup_net().
 */
struct net *copy_net_ns(u64 flags,
                        struct user_namespace *user_ns, struct net *old_net)
{
        struct ucounts *ucounts;
        struct net *net;
        int rv;

        if (!(flags & CLONE_NEWNET))
                return get_net(old_net);

        ucounts = inc_net_namespaces(user_ns);
        if (!ucounts)
                return ERR_PTR(-ENOSPC);

        net = net_alloc();
        if (!net) {
                rv = -ENOMEM;
                goto dec_ucounts;
        }

        rv = preinit_net(net, user_ns);
        if (rv < 0)
                goto free_net;
        net->ucounts = ucounts;
        get_user_ns(user_ns);

        rv = down_read_killable(&pernet_ops_rwsem);
        if (rv < 0)
                goto put_userns;

        rv = setup_net(net);

        up_read(&pernet_ops_rwsem);

        if (rv < 0) {
put_userns:
                ns_common_free(net);
#ifdef CONFIG_KEYS
                key_remove_domain(net->key_domain);
#endif
                put_user_ns(user_ns);
                net_passive_dec(net);
dec_ucounts:
                dec_net_namespaces(ucounts);
                return ERR_PTR(rv);
        }
        return net;

free_net:
        /* preinit_net() only fails in ns_common_init(), i.e. before
         * net->passive is set, so net_passive_dec() cannot be used here.
         * Nothing else has seen @net yet; release what net_alloc()
         * allocated directly instead of leaking it.
         */
#ifdef CONFIG_KEYS
        kfree(net->key_domain);
#endif
        kfree(rcu_access_pointer(net->gen));
        kmem_cache_free(net_cachep, net);
        dec_net_namespaces(ucounts);
        return ERR_PTR(rv);
}

/**
 * net_ns_get_ownership - get sysfs ownership data for @net
 * @net: network namespace in question (can be NULL)
 * @uid: kernel user ID for sysfs objects
 * @gid: kernel group ID for sysfs objects
 *
 * Stores the uid/gid pair of root in the user namespace associated with the
 * given network namespace. An id that does not map to a valid kernel id
 * leaves the corresponding output untouched. For a NULL @net the global
 * root uid/gid are stored.
 */
void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
{
        kuid_t root_uid;
        kgid_t root_gid;

        if (!net) {
                *uid = GLOBAL_ROOT_UID;
                *gid = GLOBAL_ROOT_GID;
                return;
        }

        root_uid = make_kuid(net->user_ns, 0);
        root_gid = make_kgid(net->user_ns, 0);

        if (uid_valid(root_uid))
                *uid = root_uid;

        if (gid_valid(root_gid))
                *gid = root_gid;
}
EXPORT_SYMBOL_GPL(net_ns_get_ownership);

/*
 * Remove every id that other namespaces hold for @net, sending RTM_DELNSID
 * for each one removed, then destroy @net's own id table.
 *
 * @net:  namespace being cleaned up
 * @last: last namespace to visit on net_namespace_list; the walk stops
 *        after it
 */
static void unhash_nsid(struct net *net, struct net *last)
{
        struct net *tmp;
        /* This function is only called from cleanup_net() work,
         * and this work is the only process, that may delete
         * a net from net_namespace_list. So, when the below
         * is executing, the list may only grow. Thus, we do not
         * use for_each_net_rcu() or net_rwsem.
         */
        for_each_net(tmp) {
                int id;

                spin_lock(&tmp->nsid_lock);
                id = __peernet2id(tmp, net);
                if (id >= 0)
                        idr_remove(&tmp->netns_ids, id);
                spin_unlock(&tmp->nsid_lock);
                /* Notify only after nsid_lock has been dropped. */
                if (id >= 0)
                        rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
                                          GFP_KERNEL);
                if (tmp == last)
                        break;
        }
        spin_lock(&net->nsid_lock);
        idr_destroy(&net->netns_ids);
        spin_unlock(&net->nsid_lock);
}

/* Namespaces queued by __put_net() and waiting for cleanup_net(). */
static LLIST_HEAD(cleanup_list);

/* Task currently running cleanup_net(), or NULL when it is not running. */
struct task_struct *cleanup_net_task;

/*
 * Work function that tears down every namespace queued on cleanup_list:
 * unlink them from the global lists, drop the ids peers hold for them, run
 * the pernet exit callbacks, wait for RCU callbacks, and release the
 * remaining per-namespace resources.
 */
static void cleanup_net(struct work_struct *work)
{
        struct llist_node *net_kill_list;
        struct net *net, *tmp, *last;
        LIST_HEAD(net_exit_list);

        WRITE_ONCE(cleanup_net_task, current);

        /* Atomically snapshot the list of namespaces to cleanup */
        net_kill_list = llist_del_all(&cleanup_list);

        down_read(&pernet_ops_rwsem);

        /* Don't let anyone else find us. */
        down_write(&net_rwsem);
        llist_for_each_entry(net, net_kill_list, cleanup_list) {
                ns_tree_remove(net);
                list_del_rcu(&net->list);
        }
        /* Cache last net. After we unlock rtnl, no one new net
         * added to net_namespace_list can assign nsid pointer
         * to a net from net_kill_list (see peernet2id_alloc()).
         * So, we skip them in unhash_nsid().
         *
         * Note, that unhash_nsid() does not delete nsid links
         * between net_kill_list's nets, as they've already
         * deleted from net_namespace_list. But, this would be
         * useless anyway, as netns_ids are destroyed there.
         *
         * NOTE(review): the text above says "rtnl", but the lock released
         * right below is net_rwsem - confirm and update the wording.
         */
        last = list_last_entry(&net_namespace_list, struct net, list);
        up_write(&net_rwsem);

        llist_for_each_entry(net, net_kill_list, cleanup_list) {
                unhash_nsid(net, last);
                list_add_tail(&net->exit_list, &net_exit_list);
        }

        /* Undo every pernet operation, using the expedited RCU wait. */
        ops_undo_list(&pernet_list, NULL, &net_exit_list, true);

        up_read(&pernet_ops_rwsem);

        /* Ensure there are no outstanding rcu callbacks using this
         * network namespace.
         */
        rcu_barrier();

        /* Free the structs that earlier rounds queued on defer_free_list. */
        net_complete_free();

        /* Finally it is safe to free my network namespace structure */
        list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
                list_del_init(&net->exit_list);
                ns_common_free(net);
                dec_net_namespaces(net->ucounts);
#ifdef CONFIG_KEYS
                key_remove_domain(net->key_domain);
#endif
                put_user_ns(net->user_ns);
                net_passive_dec(net);
        }
        WRITE_ONCE(cleanup_net_task, NULL);
}

/**
 * net_ns_barrier - wait until concurrent net_cleanup_work is done
 *
 * cleanup_net runs from work queue and will first remove namespaces
 * from the global list, then run net exit functions.
 *
 * Call this in module exit path to make sure that all netns
 * ->exit ops have been invoked before the function is removed.
 */
void net_ns_barrier(void)
{
        /* cleanup_net() holds pernet_ops_rwsem for reading while it runs the
         * exit ops, so taking it for writing waits for that to finish.
         */
        down_write(&pernet_ops_rwsem);
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL(net_ns_barrier);

static DECLARE_WORK(net_cleanup_work, cleanup_net);

/*
 * Queue @net for destruction by cleanup_net(). The work item is queued
 * only when llist_add() returns true.
 */
void __put_net(struct net *net)
{
        ref_tracker_dir_exit(&net->refcnt_tracker);

        /* Cleanup the network namespace in process context */
        if (!llist_add(&net->cleanup_list, &cleanup_list))
                return;

        queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);

/**
 * get_net_ns - increment the refcount of the network namespace
 * @ns: common namespace (net)
 *
 * Returns the net's common namespace, or ERR_PTR(-EINVAL) if
 * maybe_get_net() could not take a reference (ref is zero).
 */
struct ns_common *get_net_ns(struct ns_common *ns)
{
        struct net *net = maybe_get_net(container_of(ns, struct net, ns));

        if (!net)
                return ERR_PTR(-EINVAL);

        return &net->ns;
}
EXPORT_SYMBOL_GPL(get_net_ns);

/*
 * Resolve a file descriptor that refers to a network namespace file and
 * return that namespace with a reference taken.
 * Returns ERR_PTR(-EBADF) if @fd is not an open descriptor, and
 * ERR_PTR(-EINVAL) if it is not a proc namespace file or the namespace is
 * not a network namespace.
 */
struct net *get_net_ns_by_fd(int fd)
{
        struct ns_common *ns;
        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return ERR_PTR(-EBADF);

        if (!proc_ns_file(fd_file(f)))
                return ERR_PTR(-EINVAL);

        ns = get_proc_ns(file_inode(fd_file(f)));
        if (ns->ops != &netns_operations)
                return ERR_PTR(-EINVAL);

        return get_net(container_of(ns, struct net, ns));
}
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
#endif

/*
 * Return the network namespace of the task with virtual pid @pid, with a
 * reference taken. Returns ERR_PTR(-ESRCH) if no such task is found or the
 * task has no nsproxy.
 */
struct net *get_net_ns_by_pid(pid_t pid)
{
        struct net *net = ERR_PTR(-ESRCH);
        struct task_struct *tsk;

        rcu_read_lock();

        /* Lookup the network namespace */
        tsk = find_task_by_vpid(pid);
        if (tsk) {
                struct nsproxy *nsp;

                task_lock(tsk);
                nsp = tsk->nsproxy;
                if (nsp)
                        net = get_net(nsp->net_ns);
                task_unlock(tsk);
        }

        rcu_read_unlock();

        return net;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);

#ifdef CONFIG_NET_NS_REFCNT_TRACKER
/*
 * Register symlinks for both reference tracker directories of @net. The
 * names are built from the namespace cookie (hex) and inode number:
 * "netns-<cookie>-<inum>-refcnt" and "netns-<cookie>-<inum>-notrefcnt".
 */
static void net_ns_net_debugfs(struct net *net)
{
        ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt",
                                net->net_cookie, net->ns.inum);
        ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt",
                                net->net_cookie, net->ns.inum);
}

/*
 * Late initcall: set up the tracker debugfs entries and symlinks for
 * init_net. NOTE(review): presumably deferred to late_initcall because
 * debugfs is not available when init_net is set up - confirm.
 */
static int __init init_net_debugfs(void)
{
        ref_tracker_dir_debugfs(&init_net.refcnt_tracker);
        ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker);
        net_ns_net_debugfs(&init_net);
        return 0;
}
late_initcall(init_net_debugfs);
#else
/* No-op stub when CONFIG_NET_NS_REFCNT_TRACKER is disabled. */
static void net_ns_net_debugfs(struct net *net)
{
}
#endif

/*
 * pernet init hook: registers the reference tracker symlinks for @net
 * (a no-op without CONFIG_NET_NS_REFCNT_TRACKER). Always returns 0.
 */
static __net_init int net_ns_net_init(struct net *net)
{
        net_ns_net_debugfs(net);
        return 0;
}

/* pernet operations with only an init hook; no id and no exit callbacks. */
static struct pernet_operations __net_initdata net_ns_ops = {
        .init = net_ns_net_init,
};

/*
 * Netlink attribute policy for the NETNSA_* attributes; rtnl_net_newid()
 * passes it to nlmsg_parse_deprecated(). The nsid attributes are signed
 * 32-bit values, the pid and fd attributes unsigned 32-bit values.
 */
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
        [NETNSA_NONE]                = { .type = NLA_UNSPEC },
        [NETNSA_NSID]                = { .type = NLA_S32 },
        [NETNSA_PID]                = { .type = NLA_U32 },
        [NETNSA_FD]                = { .type = NLA_U32 },
        [NETNSA_TARGET_NSID]        = { .type = NLA_S32 },
};

static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[NETNSA_MAX + 1];
        struct nlattr *nla;
        struct net *peer;
        int nsid, err;

        err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb,
                                     NETNSA_MAX, rtnl_net_policy, extack);
        if (err < 0)
                return err;
        if (!tb[NETNSA_NSID]) {
                NL_SET_ERR_MSG(extack, "nsid is missing");
                return -EINVAL;
        }
        nsid = nla_get_s32(tb[NETNSA_NSID]);

        if (tb[NETNSA_PID]) {
                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
                nla = tb[NETNSA_PID];
        } else if (tb[NETNSA_FD]) {
                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
                nla = tb[NETNSA_FD];
        } else {
                NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
                return -EINVAL;
        }
        if (IS_ERR(peer)) {
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
                return PTR_ERR(peer);
        }

        spin_lock(&net->nsid_lock);
        if (__peernet2id(net, peer) >= 0) {
                spin_unlock(&net->nsid_lock);
                err = -EEXIST;
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack,
                               "Peer netns already has a nsid assigned");
                goto out;
        }

        err = alloc_netid(net, peer, nsid);
        spin_unlock(&net->nsid_lock);
        if (err >= 0) {
                rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
                                  nlh, GFP_KERNEL);
                err = 0;
        } else if (err == -ENOSPC && nsid >= 0) {
                err = -EEXIST;
                NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
                NL_SET_ERR_MSG(extack, "The specified nsid is already used");
        }
out:
        put_net(peer);
        return err;
}

/*
 * Size of the buffer needed for one nsid message: the aligned rtgenmsg
 * header plus room for the two s32 attributes rtnl_net_fill() may emit.
 */
static int rtnl_net_get_size(void)
{
        int size = NLMSG_ALIGN(sizeof(struct rtgenmsg));

        size += nla_total_size(sizeof(s32));        /* NETNSA_NSID */
        size += nla_total_size(sizeof(s32));        /* NETNSA_CURRENT_NSID */

        return size;
}

/* Arguments consumed by rtnl_net_fill() when building one nsid message. */
struct net_fill_args {
        u32 portid;        /* passed to nlmsg_put() */
        u32 seq;        /* passed to nlmsg_put() */
        int flags;        /* netlink header flags passed to nlmsg_put() */
        int cmd;        /* message type passed to nlmsg_put() */
        int nsid;        /* value emitted as NETNSA_NSID */
        bool add_ref;        /* when true, NETNSA_CURRENT_NSID is emitted too */
        int ref_nsid;        /* value emitted as NETNSA_CURRENT_NSID */
};

/*
 * Append one nsid message described by @args to @skb.
 *
 * The message carries an rtgenmsg header with family AF_UNSPEC, the
 * NETNSA_NSID attribute and, when args->add_ref is set, the
 * NETNSA_CURRENT_NSID attribute.
 *
 * Returns 0 on success or -EMSGSIZE when the header or an attribute does
 * not fit; a partially built message is cancelled.
 */
static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
{
        struct rtgenmsg *hdr;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*hdr),
                        args->flags);
        if (!nlh)
                return -EMSGSIZE;

        hdr = nlmsg_data(nlh);
        hdr->rtgen_family = AF_UNSPEC;

        if (nla_put_s32(skb, NETNSA_NSID, args->nsid) ||
            (args->add_ref &&
             nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))) {
                nlmsg_cancel(skb, nlh);
                return -EMSGSIZE;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Parse the attributes of an RTM_GETNSID request into @tb.
 *
 * When netlink_strict_get_check() is false for @skb the result of the
 * non-strict parser is returned directly.  Otherwise the strict parser is
 * used and any attribute other than NETNSA_PID, NETNSA_FD, NETNSA_NSID
 * or NETNSA_TARGET_NSID is rejected with -EINVAL.
 */
static int rtnl_net_valid_getid_req(struct sk_buff *skb,
                                    const struct nlmsghdr *nlh,
                                    struct nlattr **tb,
                                    struct netlink_ext_ack *extack)
{
        int attr, rc;

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg),
                                              tb, NETNSA_MAX, rtnl_net_policy,
                                              extack);

        rc = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
                                           NETNSA_MAX, rtnl_net_policy,
                                           extack);
        if (rc)
                return rc;

        for (attr = 0; attr <= NETNSA_MAX; attr++) {
                if (!tb[attr])
                        continue;

                if (attr == NETNSA_PID || attr == NETNSA_FD ||
                    attr == NETNSA_NSID || attr == NETNSA_TARGET_NSID)
                        continue;

                NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request");
                return -EINVAL;
        }

        return 0;
}

/*
 * RTM_GETNSID handler: report the id of a peer namespace.
 *
 * The peer is selected by NETNSA_PID, NETNSA_FD or NETNSA_NSID, checked
 * in that order.  Without NETNSA_TARGET_NSID the id is looked up in the
 * requesting socket's namespace.  With it, the id is looked up in the
 * target namespace and the id seen from the requesting namespace is
 * additionally reported through the add_ref/ref_nsid fill arguments.
 *
 * The reply is an RTM_NEWNSID message unicast to the requester.
 * Returns the result of rtnl_unicast() or a negative errno.
 */
static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[NETNSA_MAX + 1];
        struct net_fill_args fillargs = {
                .portid = NETLINK_CB(skb).portid,
                .seq = nlh->nlmsg_seq,
                .cmd = RTM_NEWNSID,
        };
        struct net *peer, *target = net;
        struct nlattr *nla;
        struct sk_buff *msg;
        int err;

        err = rtnl_net_valid_getid_req(skb, nlh, tb, extack);
        if (err < 0)
                return err;
        if (tb[NETNSA_PID]) {
                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
                nla = tb[NETNSA_PID];
        } else if (tb[NETNSA_FD]) {
                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
                nla = tb[NETNSA_FD];
        } else if (tb[NETNSA_NSID]) {
                peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID]));
                /* Normalise a NULL lookup result so IS_ERR() below catches it */
                if (!peer)
                        peer = ERR_PTR(-ENOENT);
                nla = tb[NETNSA_NSID];
        } else {
                NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
                return -EINVAL;
        }

        if (IS_ERR(peer)) {
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
                return PTR_ERR(peer);
        }

        if (tb[NETNSA_TARGET_NSID]) {
                int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);

                target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
                if (IS_ERR(target)) {
                        NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
                        NL_SET_ERR_MSG(extack,
                                       "Target netns reference is invalid");
                        err = PTR_ERR(target);
                        /* add_ref is still false, so only peer is released */
                        goto out;
                }
                /* add_ref also records that target must be released at out: */
                fillargs.add_ref = true;
                fillargs.ref_nsid = peernet2id(net, peer);
        }

        msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
        if (!msg) {
                err = -ENOMEM;
                goto out;
        }

        fillargs.nsid = peernet2id(target, peer);
        err = rtnl_net_fill(msg, &fillargs);
        if (err < 0)
                goto err_out;

        /* msg is handed to rtnl_unicast() here and is not freed by this function */
        err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
        goto out;

err_out:
        nlmsg_free(msg);
out:
        if (fillargs.add_ref)
                put_net(target);
        put_net(peer);
        return err;
}

/* State handed to rtnl_net_dumpid_one() for each entry of an nsid dump. */
struct rtnl_net_dump_cb {
        struct net *tgt_net;        /* namespace whose netns_ids idr is walked */
        struct net *ref_net;        /* namespace used to compute ref_nsid when add_ref is set */
        struct sk_buff *skb;        /* dump skb the messages are appended to */
        struct net_fill_args fillargs;        /* template passed to rtnl_net_fill() */
        int idx;        /* number of entries visited so far */
        int s_idx;        /* entries with idx below this are skipped */
};

/* Runs in RCU-critical section. */
static int rtnl_net_dumpid_one(int id, void *peer, void *data)
{
        struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
        int ret;

        if (net_cb->idx < net_cb->s_idx)
                goto cont;

        net_cb->fillargs.nsid = id;
        if (net_cb->fillargs.add_ref)
                net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer);
        ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs);
        if (ret < 0)
                return ret;

cont:
        net_cb->idx++;
        return 0;
}

/*
 * Strictly validate an nsid dump request.
 *
 * NETNSA_TARGET_NSID is the only attribute accepted.  When present, the
 * namespace it names becomes net_cb->tgt_net, the previous tgt_net is kept
 * as net_cb->ref_net and fillargs.add_ref is set; the caller drops the
 * tgt_net reference when add_ref is set.
 *
 * Returns 0 on success or a negative errno.
 */
static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
                                   struct rtnl_net_dump_cb *net_cb,
                                   struct netlink_callback *cb)
{
        struct netlink_ext_ack *extack = cb->extack;
        struct nlattr *tb[NETNSA_MAX + 1];
        int attr, rc;

        rc = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
                                           NETNSA_MAX, rtnl_net_policy,
                                           extack);
        if (rc < 0)
                return rc;

        for (attr = 0; attr <= NETNSA_MAX; attr++) {
                struct net *target;

                if (!tb[attr])
                        continue;

                if (attr != NETNSA_TARGET_NSID) {
                        NL_SET_BAD_ATTR(extack, tb[attr]);
                        NL_SET_ERR_MSG(extack,
                                       "Unsupported attribute in dump request");
                        return -EINVAL;
                }

                target = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[attr]));
                if (IS_ERR(target)) {
                        NL_SET_BAD_ATTR(extack, tb[attr]);
                        NL_SET_ERR_MSG(extack,
                                       "Invalid target network namespace id");
                        return PTR_ERR(target);
                }

                net_cb->fillargs.add_ref = true;
                net_cb->ref_net = net_cb->tgt_net;
                net_cb->tgt_net = target;
        }

        return 0;
}

/*
 * RTM_GETNSID dump handler: emit one RTM_NEWNSID message (NLM_F_MULTI)
 * per entry of the target namespace's netns_ids idr.
 *
 * cb->args[0] holds the resume index between invocations.  Returns 0, or
 * the negative errno from request validation in strict-check mode.
 */
static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtnl_net_dump_cb net_cb = {
                .tgt_net = sock_net(skb->sk),
                .skb = skb,
                .fillargs = {
                        .portid = NETLINK_CB(cb->skb).portid,
                        .seq = cb->nlh->nlmsg_seq,
                        .flags = NLM_F_MULTI,
                        .cmd = RTM_NEWNSID,
                },
                .idx = 0,
                .s_idx = cb->args[0],
        };
        int err = 0;

        if (cb->strict_check) {
                err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);
                if (err < 0)
                        goto end;
        }

        /* The return value of idr_for_each() is not propagated */
        rcu_read_lock();
        idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
        rcu_read_unlock();

        cb->args[0] = net_cb.idx;
end:
        /*
         * add_ref is only set once rtnl_valid_dump_net_req() has replaced
         * tgt_net with a namespace obtained from rtnl_get_net_ns_capable(),
         * so it doubles as the "reference must be dropped" flag.
         */
        if (net_cb.fillargs.add_ref)
                put_net(net_cb.tgt_net);
        return err;
}

/*
 * Send a @cmd notification carrying nsid @id to the RTNLGRP_NSID group
 * of @net.
 *
 * @nlh may be NULL, in which case sequence number 0 is used.  If the
 * message cannot be allocated (-ENOMEM) or filled, the error is recorded
 * on the group's listeners through rtnl_set_sk_err().
 */
static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
                              struct nlmsghdr *nlh, gfp_t gfp)
{
        struct net_fill_args fillargs = {
                .portid = portid,
                .seq = nlh ? nlh->nlmsg_seq : 0,
                .cmd = cmd,
                .nsid = id,
        };
        struct sk_buff *msg;
        int err = -ENOMEM;

        msg = nlmsg_new(rtnl_net_get_size(), gfp);
        if (msg) {
                err = rtnl_net_fill(msg, &fillargs);
                if (err >= 0) {
                        rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp);
                        return;
                }
                nlmsg_free(msg);
        }

        rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}

#ifdef CONFIG_NET_NS
/*
 * Layout checks for struct netns_ipv4, called from net_ns_init().
 *
 * Asserts that each listed sysctl field is a member of the named
 * cache-line group (read_tx, read_txrx, read_rx) and that each group has
 * the expected size: 33, 1 and 22 respectively (units as defined by
 * CACHELINE_ASSERT_GROUP_SIZE, presumably bytes; confirm against the
 * macro definition).
 */
static void __init netns_ipv4_struct_check(void)
{
        /* TX readonly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_early_retrans);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_tso_win_divisor);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_tso_rtt_log);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_autocorking);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_min_snd_mss);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_notsent_lowat);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_limit_output_bytes);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_min_rtt_wlen);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_wmem);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_ip_fwd_use_pmtu);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);

        /* TXRX readonly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
                                      sysctl_tcp_moderate_rcvbuf);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);

        /* RX readonly hotpath cache line */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_ip_early_demux);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_early_demux);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_l3mdev_accept);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_reordering);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_rmem);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
}
#endif

/*
 * rtnetlink handlers registered by net_ns_init(): RTM_NEWNSID has a doit
 * handler only, RTM_GETNSID has both a doit and a dumpit handler.  All of
 * them are flagged as unlocked (RTNL_FLAG_DOIT_UNLOCKED /
 * RTNL_FLAG_DUMP_UNLOCKED).
 */
static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = {
        {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid,
         .flags = RTNL_FLAG_DOIT_UNLOCKED},
        {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid,
         .dumpit = rtnl_net_dumpid,
         .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
};

/*
 * Boot-time initialisation of the network namespace core.
 *
 * With CONFIG_NET_NS this runs the netns_ipv4 layout checks, creates the
 * "net_namespace" slab cache and the "netns" cleanup workqueue.  It then
 * allocates the generic pointer array for init_net, pre-initialises and
 * sets up init_net under pernet_ops_rwsem, registers net_ns_ops and
 * finally the rtnetlink nsid handlers.  Every failure panics.
 */
void __init net_ns_init(void)
{
        struct net_generic *ng;

#ifdef CONFIG_NET_NS
        netns_ipv4_struct_check();
        net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
                                        SMP_CACHE_BYTES,
                                        SLAB_PANIC|SLAB_ACCOUNT, NULL);

        /* Create workqueue for cleanup */
        netns_wq = create_singlethread_workqueue("netns");
        if (!netns_wq)
                panic("Could not create netns workq");
#endif

        ng = net_alloc_generic();
        if (!ng)
                panic("Could not allocate generic netns");

        rcu_assign_pointer(init_net.gen, ng);

#ifdef CONFIG_KEYS
        init_net.key_domain = &init_net_key_domain;
#endif
        /*
         * This currently cannot fail as the initial network namespace
         * has a static inode number.
         */
        if (preinit_net(&init_net, &init_user_ns))
                panic("Could not preinitialize the initial network namespace");

        /* setup_net() and the init_net_initialized flip happen under the write lock */
        down_write(&pernet_ops_rwsem);
        if (setup_net(&init_net))
                panic("Could not setup the initial network namespace");

        init_net_initialized = true;
        up_write(&pernet_ops_rwsem);

        if (register_pernet_subsys(&net_ns_ops))
                panic("Could not register network namespace subsystems");

        rtnl_register_many(net_ns_rtnl_msg_handlers);
}

#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
                                        struct pernet_operations *ops)
{
        LIST_HEAD(net_exit_list);
        struct net *net;
        int error;

        list_add_tail(&ops->list, list);
        if (ops->init || ops->id) {
                /* We held write locked pernet_ops_rwsem, and parallel
                 * setup_net() and cleanup_net() are not possible.
                 */
                for_each_net(net) {
                        error = ops_init(ops, net);
                        if (error)
                                goto out_undo;
                        list_add_tail(&net->exit_list, &net_exit_list);
                }
        }
        return 0;

out_undo:
        /* If I have an error cleanup all namespaces I initialized */
        list_del(&ops->list);
        ops_undo_single(ops, &net_exit_list);
        return error;
}

/*
 * CONFIG_NET_NS variant: collect every existing namespace on a local
 * exit list, unlink @ops and undo it for all collected namespaces via
 * ops_undo_single().
 */
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
        LIST_HEAD(net_exit_list);
        struct net *net;

        /* See comment in __register_pernet_operations() */
        for_each_net(net)
                list_add_tail(&net->exit_list, &net_exit_list);

        list_del(&ops->list);
        ops_undo_single(ops, &net_exit_list);
}

#else

/*
 * !CONFIG_NET_NS variant: once init_net has been set up, run ops_init()
 * for it right away and return the result; before that, only queue @ops
 * on @list and return 0.
 */
static int __register_pernet_operations(struct list_head *list,
                                        struct pernet_operations *ops)
{
        if (init_net_initialized)
                return ops_init(ops, &init_net);

        list_add_tail(&ops->list, list);
        return 0;
}

static void __unregister_pernet_operations(struct pernet_operations *ops)
{
        if (!init_net_initialized) {
                list_del(&ops->list);
        } else {
                LIST_HEAD(net_exit_list);

                list_add(&init_net.exit_list, &net_exit_list);
                ops_undo_single(ops, &net_exit_list);
        }
}

#endif /* CONFIG_NET_NS */

/* Allocator for the ids stored in *ops->id by register_pernet_operations(). */
static DEFINE_IDA(net_generic_ids);

/*
 * Common registration path for pernet subsystems and devices.
 *
 * @ops must either provide both ->id and ->size or neither; anything else
 * triggers a WARN and -EINVAL.  When an id is requested it is allocated
 * from net_generic_ids (starting at MIN_PERNET_OPS_ID) and max_gen_ptrs
 * is raised if needed.  If the actual registration fails, the id is
 * released again after an rcu_barrier().
 *
 * Returns 0 on success or a negative errno.
 */
static int register_pernet_operations(struct list_head *list,
                                      struct pernet_operations *ops)
{
        int rc;

        if (WARN_ON(!!ops->id ^ !!ops->size))
                return -EINVAL;

        if (ops->id) {
                rc = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
                                GFP_KERNEL);
                if (rc < 0)
                        return rc;
                *ops->id = rc;
                /* This does not require READ_ONCE as writers already hold
                 * pernet_ops_rwsem. But WRITE_ONCE is needed to protect
                 * net_alloc_generic.
                 */
                WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
        }

        rc = __register_pernet_operations(list, ops);
        if (!rc)
                return 0;

        /* Registration failed: give the id back once RCU callbacks ran */
        rcu_barrier();
        if (ops->id)
                ida_free(&net_generic_ids, *ops->id);

        return rc;
}

/*
 * Common unregistration path: undo @ops, wait for pending RCU callbacks
 * with rcu_barrier(), then release the id if @ops had one.
 */
static void unregister_pernet_operations(struct pernet_operations *ops)
{
        __unregister_pernet_operations(ops);
        rcu_barrier();
        if (ops->id)
                ida_free(&net_generic_ids, *ops->id);
}

/**
 *      register_pernet_subsys - register a network namespace subsystem
 *        @ops:  pernet operations structure for the subsystem
 *
 *        Registers a subsystem whose init and exit functions are invoked
 *        when network namespaces are created and destroyed.
 *
 *        On registration the init function is called for every network
 *        namespace that already exists, so kernel modules get a race
 *        free view of the set of network namespaces.
 *
 *        For a newly created network namespace the init methods run in
 *        registration order; on destruction the exit methods run in the
 *        reverse of that order.
 *
 *        The operations are inserted in front of the first registered
 *        device (first_device), with pernet_ops_rwsem held for writing.
 *
 *        Returns 0 on success or a negative errno.
 */
int register_pernet_subsys(struct pernet_operations *ops)
{
        int rc;

        down_write(&pernet_ops_rwsem);
        rc = register_pernet_operations(first_device, ops);
        up_write(&pernet_ops_rwsem);

        return rc;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);

/**
 *      unregister_pernet_subsys - unregister a network namespace subsystem
 *        @ops: pernet operations structure to manipulate
 *
 *        Remove the pernet operations structure from the list to be
 *        used when network namespaces are created or destroyed.  In
 *        addition run the exit method for all existing network
 *        namespaces.
 *
 *        The work is done with pernet_ops_rwsem held for writing.
 */
void unregister_pernet_subsys(struct pernet_operations *ops)
{
        down_write(&pernet_ops_rwsem);
        unregister_pernet_operations(ops);
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);

/**
 *      register_pernet_device - register a network namespace device
 *        @ops:  pernet operations structure for the device
 *
 *        Registers a device whose init and exit functions are invoked
 *        when network namespaces are created and destroyed.
 *
 *        On registration the init function is called for every network
 *        namespace that already exists, so kernel modules get a race
 *        free view of the set of network namespaces.
 *
 *        For a newly created network namespace the init methods run in
 *        registration order; on destruction the exit methods run in the
 *        reverse of that order.
 *
 *        Device operations are appended to pernet_list; the first one
 *        registered successfully becomes first_device.
 *
 *        Returns 0 on success or a negative errno.
 */
int register_pernet_device(struct pernet_operations *ops)
{
        int rc;

        down_write(&pernet_ops_rwsem);
        rc = register_pernet_operations(&pernet_list, ops);
        if (rc)
                goto unlock;

        /* first_device still pointing at the list head means no device yet */
        if (first_device == &pernet_list)
                first_device = &ops->list;
unlock:
        up_write(&pernet_ops_rwsem);
        return rc;
}
EXPORT_SYMBOL_GPL(register_pernet_device);

/**
 *      unregister_pernet_device - unregister a network namespace netdevice
 *        @ops: pernet operations structure to manipulate
 *
 *        Remove the pernet operations structure from the list to be
 *        used when network namespaces are created or destroyed.  In
 *        addition run the exit method for all existing network
 *        namespaces.
 *
 *        If @ops is the current first_device, first_device is advanced
 *        to the next list entry before @ops is unlinked.
 */
void unregister_pernet_device(struct pernet_operations *ops)
{
        down_write(&pernet_ops_rwsem);
        if (&ops->list == first_device)
                first_device = first_device->next;
        unregister_pernet_operations(ops);
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);

#ifdef CONFIG_NET_NS
/*
 * proc_ns ->get hook: return the ns_common of @task's network namespace
 * with a reference taken via get_net(), or NULL when the task has no
 * nsproxy.
 */
static struct ns_common *netns_get(struct task_struct *task)
{
        struct nsproxy *nsp;
        struct net *net = NULL;

        task_lock(task);
        nsp = task->nsproxy;
        if (nsp)
                net = get_net(nsp->net_ns);
        task_unlock(task);

        if (!net)
                return NULL;

        return &net->ns;
}

/* proc_ns ->put hook: drop a reference on the net that embeds @ns. */
static void netns_put(struct ns_common *ns)
{
        put_net(to_net_ns(ns));
}

/*
 * proc_ns ->install hook: make the net embedding @ns the network
 * namespace of nsset->nsproxy.
 *
 * The caller needs CAP_SYS_ADMIN both in the user namespace owning the
 * target net and in the user namespace of nsset->cred; otherwise -EPERM
 * is returned.  On success the reference on the old net_ns is dropped, a
 * new reference on the target is stored and 0 is returned.
 */
static int netns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct net *target = to_net_ns(ns);
        struct nsproxy *proxy = nsset->nsproxy;

        if (!ns_capable(target->user_ns, CAP_SYS_ADMIN))
                return -EPERM;
        if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        put_net(proxy->net_ns);
        proxy->net_ns = get_net(target);

        return 0;
}

/* proc_ns ->owner hook: return the user namespace of the net that embeds @ns. */
static struct user_namespace *netns_owner(struct ns_common *ns)
{
        return to_net_ns(ns)->user_ns;
}

/* proc namespace operations for the "net" namespace type. */
const struct proc_ns_operations netns_operations = {
        .name                = "net",
        .get                = netns_get,
        .put                = netns_put,
        .install        = netns_install,
        .owner                = netns_owner,
};
#endif







































    6 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Common values for AES algorithms
 */

#ifndef _CRYPTO_AES_H
#define _CRYPTO_AES_H

#include <linux/types.h>
#include <linux/crypto.h>

#define AES_MIN_KEY_SIZE        16
#define AES_MAX_KEY_SIZE        32
#define AES_KEYSIZE_128                16
#define AES_KEYSIZE_192                24
#define AES_KEYSIZE_256                32
#define AES_BLOCK_SIZE                16
#define AES_MAX_KEYLENGTH        (15 * 16)
#define AES_MAX_KEYLENGTH_U32        (AES_MAX_KEYLENGTH / sizeof(u32))

/*
 * Expanded AES key material.
 *
 * Please ensure that the first two fields are 16-byte aligned
 * relative to the start of the structure, i.e., don't move them!
 *
 * Each schedule holds AES_MAX_KEYLENGTH (15 * 16 = 240) bytes, stored as
 * AES_MAX_KEYLENGTH_U32 32-bit words.
 */
struct crypto_aes_ctx {
        u32 key_enc[AES_MAX_KEYLENGTH_U32];        /* encryption key schedule */
        u32 key_dec[AES_MAX_KEYLENGTH_U32];        /* decryption key schedule */
        u32 key_length;        /* presumably the supplied key length in bytes; confirm against aes_expandkey() */
};

extern const u32 crypto_ft_tab[4][256] ____cacheline_aligned;
extern const u32 crypto_it_tab[4][256] ____cacheline_aligned;

/*
 * Validate a key length for AES algorithms.
 *
 * Returns 0 when @keylen is one of AES_KEYSIZE_128, AES_KEYSIZE_192 or
 * AES_KEYSIZE_256, and -EINVAL for any other value.
 */
static inline int aes_check_keylen(unsigned int keylen)
{
        if (keylen == AES_KEYSIZE_128 ||
            keylen == AES_KEYSIZE_192 ||
            keylen == AES_KEYSIZE_256)
                return 0;

        return -EINVAL;
}

int crypto_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
                unsigned int key_len);

/**
 * aes_expandkey - Expands the AES key as described in FIPS-197
 * @ctx:        The location where the computed key will be stored.
 * @in_key:        The supplied key.
 * @key_len:        The length of the supplied key.
 *
 * Returns 0 on success. The function fails only if an invalid key size (or
 * pointer) is supplied.
 * The expanded key size is 240 bytes (max of 14 rounds with a unique 16 bytes
 * key schedule plus a 16 bytes key which is used before the first round).
 * The decryption key is prepared for the "Equivalent Inverse Cipher" as
 * described in FIPS-197. The first slot (16 bytes) of each key (enc or dec) is
 * for the initial combination, the second slot for the first round and so on.
 */
int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
                  unsigned int key_len);

/**
 * aes_encrypt - Encrypt a single AES block
 * @ctx:        Context struct containing the key schedule
 * @out:        Buffer to store the ciphertext
 * @in:                Buffer containing the plaintext
 */
void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);

/**
 * aes_decrypt - Decrypt a single AES block
 * @ctx:        Context struct containing the key schedule
 * @out:        Buffer to store the plaintext
 * @in:                Buffer containing the ciphertext
 */
void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);

extern const u8 crypto_aes_sbox[];
extern const u8 crypto_aes_inv_sbox[];

void aescfb_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
                    int len, const u8 iv[AES_BLOCK_SIZE]);
void aescfb_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
                    int len, const u8 iv[AES_BLOCK_SIZE]);

#endif


























































   20 























    5 
    5 



    1 



    1 



    1 


    1 




    1 

    4 









    1 
    5 







   15 




   15 






    6 





    6 











   20 



   20 




   20 






















































































































    4 





























   15 







    2 









    1 

























































    4 














































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux XFRM hook function implementations.
 *
 *  Authors:  Serge Hallyn <sergeh@us.ibm.com>
 *              Trent Jaeger <jaegert@us.ibm.com>
 *
 *  Updated: Venkat Yekkirala <vyekkirala@TrustedCS.com>
 *
 *           Granular IPSec Associations for use in MLS environments.
 *
 *  Copyright (C) 2005 International Business Machines Corporation
 *  Copyright (C) 2006 Trusted Computer Solutions, Inc.
 */

/*
 * USAGE:
 * NOTES:
 *   1. Make sure to enable the following options in your kernel config:
 *        CONFIG_SECURITY=y
 *        CONFIG_SECURITY_NETWORK=y
 *        CONFIG_SECURITY_NETWORK_XFRM=y
 *        CONFIG_SECURITY_SELINUX=m/y
 * ISSUES:
 *   1. Caching packets, so they are not dropped during negotiation
 *   2. Emulating a reasonable SO_PEERSEC across machines
 *   3. Testing addition of sk_policy's with security context via setsockopt
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/skbuff.h>
#include <linux/xfrm.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <net/udp.h>
#include <linux/atomic.h>

#include "avc.h"
#include "objsec.h"
#include "xfrm.h"

/*
 * Labeled XFRM instance counter: incremented when
 * selinux_xfrm_alloc_user() hands out a context and decremented in
 * selinux_xfrm_free().
 */
atomic_t selinux_xfrm_refcount __read_mostly = ATOMIC_INIT(0);

/*
 * Returns non-zero if @ctx is non-NULL and is an LSM/SELinux context,
 * i.e. its DOI is XFRM_SC_DOI_LSM and its algorithm is
 * XFRM_SC_ALG_SELINUX; returns 0 otherwise.
 */
static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx)
{
        if (!ctx)
                return 0;
        if (ctx->ctx_doi != XFRM_SC_DOI_LSM)
                return 0;

        return ctx->ctx_alg == XFRM_SC_ALG_SELINUX;
}

/*
 * Returns true if the xfrm contains a security blob for SELinux, i.e. if
 * x->security passes selinux_authorizable_ctx().
 */
static inline int selinux_authorizable_xfrm(struct xfrm_state *x)
{
        return selinux_authorizable_ctx(x->security);
}

/*
 * Allocates a xfrm_sec_ctx and populates it using the supplied security
 * xfrm_user_sec_ctx context.
 *
 * @ctxp: where the new context is stored on success
 * @uctx: user-supplied header; the context string of uctx->ctx_len bytes
 *        is read from the memory immediately following it
 * @gfp:  allocation flags
 *
 * The string is copied and NUL-terminated locally, converted to a SID
 * and the current task is checked for ASSOCIATION__SETCONTEXT on it.
 *
 * Returns 0 on success, -EINVAL for a missing argument or a non-SELinux
 * DOI/algorithm, -ENOMEM if the string is PAGE_SIZE or longer or the
 * allocation fails, or the error from the SID conversion / permission
 * check.
 */
static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp,
                                   struct xfrm_user_sec_ctx *uctx,
                                   gfp_t gfp)
{
        int rc;
        struct xfrm_sec_ctx *ctx = NULL;
        u32 str_len;

        if (ctxp == NULL || uctx == NULL ||
            uctx->ctx_doi != XFRM_SC_DOI_LSM ||
            uctx->ctx_alg != XFRM_SC_ALG_SELINUX)
                return -EINVAL;

        str_len = uctx->ctx_len;
        if (str_len >= PAGE_SIZE)
                return -ENOMEM;

        /* One extra byte for the terminating NUL added below */
        ctx = kmalloc(struct_size(ctx, ctx_str, str_len + 1), gfp);
        if (!ctx)
                return -ENOMEM;

        ctx->ctx_doi = XFRM_SC_DOI_LSM;
        ctx->ctx_alg = XFRM_SC_ALG_SELINUX;
        /*
         * Record the length of the context string as supplied, i.e. the
         * same length that is handed to security_context_to_sid(); the
         * NUL appended below is local padding and is not counted.
         */
        ctx->ctx_len = str_len;
        memcpy(ctx->ctx_str, &uctx[1], str_len);
        ctx->ctx_str[str_len] = '\0';
        rc = security_context_to_sid(ctx->ctx_str, str_len,
                                     &ctx->ctx_sid, gfp);
        if (rc)
                goto err;

        rc = avc_has_perm(current_sid(), ctx->ctx_sid,
                          SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL);
        if (rc)
                goto err;

        *ctxp = ctx;
        atomic_inc(&selinux_xfrm_refcount);
        return 0;

err:
        kfree(ctx);
        return rc;
}

/*
 * Release a xfrm_sec_ctx and drop the labeled-XFRM instance count.
 * A NULL @ctx is ignored and leaves the counter untouched.
 */
static void selinux_xfrm_free(struct xfrm_sec_ctx *ctx)
{
        if (ctx) {
                atomic_dec(&selinux_xfrm_refcount);
                kfree(ctx);
        }
}

/*
 * Authorize the deletion of a labeled SA or policy rule.
 */
static int selinux_xfrm_delete(struct xfrm_sec_ctx *ctx)
{
        if (!ctx)
                return 0;

        return avc_has_perm(current_sid(), ctx->ctx_sid,
                            SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT,
                            NULL);
}

/*
 * LSM hook implementation that authorizes a flow to use a xfrm policy rule.
 *
 * Returns 0 when the policy is unlabeled or the flow SID holds
 * ASSOCIATION__POLMATCH on the policy SID, -EINVAL when the context is not a
 * SELinux one, -ESRCH when access is denied, or any other error reported by
 * the permission check.
 */
int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
{
        int err;

        /*
         * Every flow polmatches an otherwise applicable "non-labeled"
         * policy; this prevents inadvertent "leaks".
         */
        if (ctx == NULL)
                return 0;

        /* Context sid is either set to label or ANY_ASSOC */
        if (!selinux_authorizable_ctx(ctx))
                return -EINVAL;

        err = avc_has_perm(fl_secid, ctx->ctx_sid, SECCLASS_ASSOCIATION,
                           ASSOCIATION__POLMATCH, NULL);

        /* A denial is reported to the caller as "no such policy". */
        if (err == -EACCES)
                return -ESRCH;

        return err;
}

/*
 * LSM hook implementation that decides whether a state matches the given
 * policy/flow combination.
 *
 * Returns 1 on a match and 0 otherwise.
 */
int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x,
                                      struct xfrm_policy *xp,
                                      const struct flowi_common *flic)
{
        u32 state_sid;
        u32 flic_sid;

        if (!xp->security) {
                /*
                 * Unlabeled policy: a labeled SA can't match, while an
                 * unlabeled SA matches all flows.
                 */
                return x->security ? 0 : 1;
        }

        /* unlabeled SA and labeled policy can't match */
        if (!x->security)
                return 0;

        /* Not a SELinux-labeled SA */
        if (!selinux_authorizable_xfrm(x))
                return 0;

        state_sid = x->security->ctx_sid;
        flic_sid = flic->flowic_secid;

        /* The SA must carry exactly the flow's label. */
        if (flic_sid != state_sid)
                return 0;

        /*
         * No separate SA vs. policy polmatch check is needed: the SA has the
         * same label as the flow, and the flow vs. policy polmatch check
         * already happened in selinux_xfrm_policy_lookup().
         */
        if (avc_has_perm(flic_sid, state_sid, SECCLASS_ASSOCIATION,
                         ASSOCIATION__SENDTO, NULL))
                return 0;

        return 1;
}

/*
 * Return the SID of the SELinux-labeled xfrm state attached to the skb's
 * dst entry, or SECSID_NULL when there is no dst, no state, or the state is
 * not SELinux-labeled.
 */
static u32 selinux_xfrm_skb_sid_egress(struct sk_buff *skb)
{
        struct dst_entry *route = skb_dst(skb);
        struct xfrm_state *state;

        if (!route)
                return SECSID_NULL;

        state = route->xfrm;
        if (state && selinux_authorizable_xfrm(state))
                return state->security->ctx_sid;

        return SECSID_NULL;
}

/*
 * Derive the session SID from the skb's sec_path, walking the xfrm states
 * from the last entry to the first.
 *
 * With @ckall == 0 the first SELinux-labeled state found decides the SID.
 * With @ckall != 0 every labeled state is inspected and all of them must
 * agree; a mismatch stores SECSID_NULL in *@sid and returns -EINVAL.
 *
 * Otherwise returns 0 with *@sid set to the label found, or SECSID_NULL if
 * there is no sec_path or no labeled state.
 *
 * NOTE(review): each sp->xvec[] entry is dereferenced without a NULL check,
 * unlike selinux_xfrm_sock_rcv_skb() - presumably entries are never NULL
 * here; confirm against callers.
 */
static int selinux_xfrm_skb_sid_ingress(struct sk_buff *skb,
                                        u32 *sid, int ckall)
{
        struct sec_path *sp = skb_sec_path(skb);
        u32 session = SECSID_NULL;
        int i;

        if (sp == NULL) {
                *sid = session;
                return 0;
        }

        for (i = sp->len - 1; i >= 0; i--) {
                struct xfrm_state *x = sp->xvec[i];
                u32 label;

                if (!selinux_authorizable_xfrm(x))
                        continue;

                label = x->security->ctx_sid;
                if (session == SECSID_NULL) {
                        session = label;
                        /* The caller only wants the first label. */
                        if (!ckall)
                                break;
                } else if (session != label) {
                        *sid = SECSID_NULL;
                        return -EINVAL;
                }
        }

        *sid = session;
        return 0;
}

/*
 * LSM hook implementation that checks and/or returns the xfrm SID for the
 * incoming packet.  A NULL @skb yields SECSID_NULL and success; otherwise
 * the work is delegated to selinux_xfrm_skb_sid_ingress().
 */
int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall)
{
        if (skb != NULL)
                return selinux_xfrm_skb_sid_ingress(skb, sid, ckall);

        *sid = SECSID_NULL;
        return 0;
}

/*
 * Determine the xfrm SID of @skb: try the ingress sec_path first (first
 * labeled state only) and, if that yields no label, fall back to the state
 * attached to the skb's dst entry.
 *
 * Returns the result of the ingress lookup; *@sid may still be SECSID_NULL
 * on success.
 */
int selinux_xfrm_skb_sid(struct sk_buff *skb, u32 *sid)
{
        int rc = selinux_xfrm_skb_sid_ingress(skb, sid, 0);

        if (rc != 0)
                return rc;

        if (*sid == SECSID_NULL)
                *sid = selinux_xfrm_skb_sid_egress(skb);

        return 0;
}

/*
 * LSM hook implementation that allocates a security context for a
 * xfrm_policy from the user-supplied context @uctx and stores it in *@ctxp.
 * Thin wrapper around selinux_xfrm_alloc_user(); returns its result.
 */
int selinux_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                              struct xfrm_user_sec_ctx *uctx,
                              gfp_t gfp)
{
        return selinux_xfrm_alloc_user(ctxp, uctx, gfp);
}

/*
 * LSM hook implementation that copies security data structure from old to new
 * for policy cloning.
 */
int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                              struct xfrm_sec_ctx **new_ctxp)
{
        struct xfrm_sec_ctx *new_ctx;

        if (!old_ctx)
                return 0;

        new_ctx = kmemdup(old_ctx, sizeof(*old_ctx) + old_ctx->ctx_len,
                          GFP_ATOMIC);
        if (!new_ctx)
                return -ENOMEM;
        atomic_inc(&selinux_xfrm_refcount);
        *new_ctxp = new_ctx;

        return 0;
}

/*
 * LSM hook implementation that frees a policy's xfrm_sec_ctx security
 * information.  Thin wrapper around selinux_xfrm_free(), which accepts NULL.
 */
void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
        selinux_xfrm_free(ctx);
}

/*
 * LSM hook implementation that authorizes deletion of labeled policies.
 * Thin wrapper around selinux_xfrm_delete(); returns its result (0 for an
 * unlabeled policy).
 */
int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
        return selinux_xfrm_delete(ctx);
}

/*
 * LSM hook implementation that allocates a xfrm_sec_ctx, populates it using
 * the supplied user security context, and assigns it to x->security.
 * The allocation uses GFP_KERNEL.  Returns the result of
 * selinux_xfrm_alloc_user().
 */
int selinux_xfrm_state_alloc(struct xfrm_state *x,
                             struct xfrm_user_sec_ctx *uctx)
{
        return selinux_xfrm_alloc_user(&x->security, uctx, GFP_KERNEL);
}

/*
 * LSM hook implementation that allocates a xfrm_sec_ctx for @x and populates
 * it from a secid.
 *
 * Nothing is done (and 0 is returned) when the policy is unlabeled, i.e.
 * @polsec is NULL.  Otherwise @secid is converted to its context string,
 * which is copied into a freshly allocated (GFP_ATOMIC) context that is then
 * attached to x->security.
 *
 * Returns 0 on success, -EINVAL for a zero @secid, -ENOMEM on allocation
 * failure, or the error from security_sid_to_context().
 */
int selinux_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                     struct xfrm_sec_ctx *polsec, u32 secid)
{
        struct xfrm_sec_ctx *label;
        char *scontext = NULL;
        u32 scontext_len;
        int rc;

        if (polsec == NULL)
                return 0;

        if (secid == 0)
                return -EINVAL;

        rc = security_sid_to_context(secid, &scontext, &scontext_len);
        if (rc != 0)
                return rc;

        label = kmalloc(struct_size(label, ctx_str, scontext_len), GFP_ATOMIC);
        if (label != NULL) {
                label->ctx_doi = XFRM_SC_DOI_LSM;
                label->ctx_alg = XFRM_SC_ALG_SELINUX;
                label->ctx_sid = secid;
                label->ctx_len = scontext_len;
                memcpy(label->ctx_str, scontext, scontext_len);

                x->security = label;
                atomic_inc(&selinux_xfrm_refcount);
        } else {
                rc = -ENOMEM;
        }

        /* The temporary context string is released on every path. */
        kfree(scontext);
        return rc;
}

/*
 * LSM hook implementation that frees xfrm_state security information.
 * Passes x->security to selinux_xfrm_free(); the pointer in @x is not
 * cleared here.
 */
void selinux_xfrm_state_free(struct xfrm_state *x)
{
        selinux_xfrm_free(x->security);
}

/*
 * LSM hook implementation that authorizes deletion of labeled SAs.
 * Delegates to selinux_xfrm_delete() with x->security and returns its
 * result (0 for an unlabeled SA).
 */
int selinux_xfrm_state_delete(struct xfrm_state *x)
{
        return selinux_xfrm_delete(x->security);
}

/*
 * LSM hook that controls access to unlabelled packets.
 *
 * The peer SID is taken from the first SELinux-labeled xfrm state in the
 * skb's sec_path (scanning from index 0); if there is none, the packet is
 * treated as SECINITSID_UNLABELED.  The socket SID @sk_sid must then hold
 * ASSOCIATION__RECVFROM on that peer SID.
 *
 * Returns the result of the permission check.
 */
int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb,
                              struct common_audit_data *ad)
{
        struct sec_path *sp = skb_sec_path(skb);
        u32 peer_sid = SECINITSID_UNLABELED;
        int idx;

        if (sp != NULL) {
                for (idx = 0; idx < sp->len; idx++) {
                        struct xfrm_state *x = sp->xvec[idx];

                        if (!x || !selinux_authorizable_xfrm(x))
                                continue;

                        peer_sid = x->security->ctx_sid;
                        break;
                }
        }

        /*
         * Running this check even when no association is involved is
         * intended, according to Trent Jaeger, to make sure a process can't
         * engage in non-IPsec communication unless explicitly allowed by
         * policy.
         */
        return avc_has_perm(sk_sid, peer_sid,
                            SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, ad);
}

/*
 * POSTROUTE_LAST hook's XFRM processing.
 *
 * If the packet has no SELinux-labeled security association, the socket SID
 * must be allowed to send to an unlabelled destination.  If it does have an
 * authorizable association, that was already checked in the
 * selinux_xfrm_state_pol_flow_match hook, so 0 is returned.
 *
 * Returns 0 when no check is needed, otherwise the result of the
 * ASSOCIATION__SENDTO check against SECINITSID_UNLABELED.
 */
int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb,
                                struct common_audit_data *ad, u8 proto)
{
        struct dst_entry *iter;

        /*
         * AH/ESP/IPCOMP packets were already seen once before they underwent
         * xfrm(s); no need to subject them to the unlabeled check.
         */
        if (proto == IPPROTO_AH || proto == IPPROTO_ESP ||
            proto == IPPROTO_COMP)
                return 0;

        /* Walk the dst chain looking for any SELinux-labeled state. */
        for (iter = skb_dst(skb); iter != NULL; iter = xfrm_dst_child(iter)) {
                struct xfrm_state *x = iter->xfrm;

                if (x && selinux_authorizable_xfrm(x))
                        return 0;
        }

        /*
         * Running this check even when no association is involved is
         * intended, according to Trent Jaeger, to make sure a process can't
         * engage in non-IPsec communication unless explicitly allowed by
         * policy.
         */
        return avc_has_perm(sk_sid, SECINITSID_UNLABELED,
                            SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, ad);
}









































































































































































































































































































































































































































































































































































































































































































   59 



   30 
   59 


   56 


   10 






    8 
   60 

    2 


































   52 
   52 
   24 


   23 






   10 






   22 














    8 

    8 




    8 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * xfrm algorithm interface
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 */

#include <crypto/acompress.h>
#include <crypto/aead.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
#include <linux/scatterlist.h>
#include <net/xfrm.h>
#if IS_ENABLED(CONFIG_INET_ESP) || IS_ENABLED(CONFIG_INET6_ESP)
#include <net/esp.h>
#endif

/*
 * Algorithms supported by IPsec.  These entries contain properties which
 * are used in key negotiation and xfrm processing, and are used to verify
 * that instantiated crypto transforms have correct parameters for IPsec
 * purposes.
 */
/*
 * AEAD transforms.
 *
 * Each entry names the crypto API algorithm (.name), the IV generator
 * (.uinfo.aead.geniv) and the ICV length in bits (.uinfo.aead.icv_truncbits).
 * Entries with .pfkey_supported = 1 also carry a .desc holding the SADB
 * algorithm id, IV length and accepted key-size range in bits.  The same
 * algorithm name appears once per supported ICV length.
 */
static struct xfrm_algo_desc aead_list[] = {
{
        .name = "rfc4106(gcm(aes))",

        .uinfo = {
                .aead = {
                        .geniv = "seqiv",
                        .icv_truncbits = 64,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV8,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "rfc4106(gcm(aes))",

        .uinfo = {
                .aead = {
                        .geniv = "seqiv",
                        .icv_truncbits = 96,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV12,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "rfc4106(gcm(aes))",

        .uinfo = {
                .aead = {
                        .geniv = "seqiv",
                        .icv_truncbits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV16,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "rfc4309(ccm(aes))",

        .uinfo = {
                .aead = {
                        .geniv = "seqiv",
                        .icv_truncbits = 64,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV8,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "rfc4309(ccm(aes))",

        .uinfo = {
                .aead = {
                        .geniv = "seqiv",
                        .icv_truncbits = 96,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV12,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "rfc4309(ccm(aes))",

        .uinfo = {
                .aead = {
                        .geniv = "seqiv",
                        .icv_truncbits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV16,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "rfc4543(gcm(aes))",

        .uinfo = {
                .aead = {
                        .geniv = "seqiv",
                        .icv_truncbits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_NULL_AES_GMAC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        /* Not PF_KEY-supported: no SADB descriptor is provided. */
        .name = "rfc7539esp(chacha20,poly1305)",

        .uinfo = {
                .aead = {
                        .geniv = "seqiv",
                        .icv_truncbits = 128,
                }
        },

        .pfkey_supported = 0,
},
};

/*
 * Authentication transforms.
 *
 * Each entry names the crypto API algorithm (.name), an optional alternate
 * name accepted on lookup (.compat), and the truncated and full ICV lengths
 * in bits (.uinfo.auth).  Entries with .pfkey_supported = 1 also carry a
 * .desc with the SADB algorithm id and accepted key-size range in bits.
 */
static struct xfrm_algo_desc aalg_list[] = {
{
        .name = "digest_null",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 0,
                        .icv_fullbits = 0,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_AALG_NULL,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 0,
                .sadb_alg_maxbits = 0
        }
},
{
        .name = "hmac(md5)",
        .compat = "md5",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 96,
                        .icv_fullbits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_AALG_MD5HMAC,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 128
        }
},
{
        .name = "hmac(sha1)",
        .compat = "sha1",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 96,
                        .icv_fullbits = 160,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_AALG_SHA1HMAC,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 160,
                .sadb_alg_maxbits = 160
        }
},
{
        .name = "hmac(sha256)",
        .compat = "sha256",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 96,
                        .icv_fullbits = 256,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 256,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "hmac(sha384)",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 192,
                        .icv_fullbits = 384,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_AALG_SHA2_384HMAC,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 384,
                .sadb_alg_maxbits = 384
        }
},
{
        .name = "hmac(sha512)",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 256,
                        .icv_fullbits = 512,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_AALG_SHA2_512HMAC,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 512,
                .sadb_alg_maxbits = 512
        }
},
{
        .name = "hmac(rmd160)",
        .compat = "rmd160",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 96,
                        .icv_fullbits = 160,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 160,
                .sadb_alg_maxbits = 160
        }
},
{
        .name = "xcbc(aes)",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 96,
                        .icv_fullbits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_AALG_AES_XCBC_MAC,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 128
        }
},
{
        /* rfc4494; not PF_KEY-supported, so no SADB descriptor is provided */
        .name = "cmac(aes)",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 96,
                        .icv_fullbits = 128,
                }
        },

        .pfkey_supported = 0,
},
{
        .name = "hmac(sm3)",
        .compat = "sm3",

        .uinfo = {
                .auth = {
                        .icv_truncbits = 256,
                        .icv_fullbits = 256,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_AALG_SM3_256HMAC,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 256,
                .sadb_alg_maxbits = 256
        }
},
};

/*
 * Encryption transforms.
 *
 * Each entry names the crypto API algorithm (.name), an optional alternate
 * name accepted on lookup (.compat), the IV generator (.uinfo.encr.geniv,
 * absent for the null cipher), the block size and the default key size in
 * bits.  The .desc part holds the SADB algorithm id, IV length and accepted
 * key-size range in bits.  In this table the CBC modes use "echainiv" and
 * the CTR mode uses "seqiv".
 */
static struct xfrm_algo_desc ealg_list[] = {
{
        .name = "ecb(cipher_null)",
        .compat = "cipher_null",

        .uinfo = {
                .encr = {
                        .blockbits = 8,
                        .defkeybits = 0,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id =        SADB_EALG_NULL,
                .sadb_alg_ivlen = 0,
                .sadb_alg_minbits = 0,
                .sadb_alg_maxbits = 0
        }
},
{
        .name = "cbc(des)",
        .compat = "des",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 64,
                        .defkeybits = 64,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_EALG_DESCBC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 64,
                .sadb_alg_maxbits = 64
        }
},
{
        .name = "cbc(des3_ede)",
        .compat = "des3_ede",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 64,
                        .defkeybits = 192,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_EALG_3DESCBC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 192,
                .sadb_alg_maxbits = 192
        }
},
{
        .name = "cbc(cast5)",
        .compat = "cast5",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 64,
                        .defkeybits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_CASTCBC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 40,
                .sadb_alg_maxbits = 128
        }
},
{
        .name = "cbc(blowfish)",
        .compat = "blowfish",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 64,
                        .defkeybits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 40,
                .sadb_alg_maxbits = 448
        }
},
{
        .name = "cbc(aes)",
        .compat = "aes",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 128,
                        .defkeybits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_AESCBC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "cbc(serpent)",
        .compat = "serpent",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 128,
                        .defkeybits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_SERPENTCBC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256,
        }
},
{
        .name = "cbc(camellia)",
        .compat = "camellia",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 128,
                        .defkeybits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_CAMELLIACBC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        .name = "cbc(twofish)",
        .compat = "twofish",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 128,
                        .defkeybits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_TWOFISHCBC,
                .sadb_alg_ivlen = 8,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
{
        /* Key sizes here include the 32-bit nonce appended to the key. */
        .name = "rfc3686(ctr(aes))",

        .uinfo = {
                .encr = {
                        .geniv = "seqiv",
                        .blockbits = 128,
                        .defkeybits = 160, /* 128-bit key + 32-bit nonce */
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_AESCTR,
                .sadb_alg_ivlen        = 8,
                .sadb_alg_minbits = 160,
                .sadb_alg_maxbits = 288
        }
},
{
        .name = "cbc(sm4)",
        .compat = "sm4",

        .uinfo = {
                .encr = {
                        .geniv = "echainiv",
                        .blockbits = 128,
                        .defkeybits = 128,
                }
        },

        .pfkey_supported = 1,

        .desc = {
                .sadb_alg_id = SADB_X_EALG_SM4CBC,
                .sadb_alg_ivlen        = 16,
                .sadb_alg_minbits = 128,
                .sadb_alg_maxbits = 256
        }
},
};

/*
 * Compression transforms.
 *
 * Each entry names the crypto API algorithm, a .uinfo.comp.threshold value
 * and the SADB algorithm id.  All entries are PF_KEY-supported.
 * NOTE(review): the unit/meaning of .threshold is not visible in this file -
 * presumably a minimum payload size below which compression is skipped;
 * confirm against the IPComp users.
 */
static struct xfrm_algo_desc calg_list[] = {
{
        .name = "deflate",
        .uinfo = {
                .comp = {
                        .threshold = 90,
                }
        },
        .pfkey_supported = 1,
        .desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE }
},
{
        .name = "lzs",
        .uinfo = {
                .comp = {
                        .threshold = 90,
                }
        },
        .pfkey_supported = 1,
        .desc = { .sadb_alg_id = SADB_X_CALG_LZS }
},
{
        .name = "lzjh",
        .uinfo = {
                .comp = {
                        .threshold = 50,
                }
        },
        .pfkey_supported = 1,
        .desc = { .sadb_alg_id = SADB_X_CALG_LZJH }
},
};

/* Number of entries in aalg_list (the authentication algorithm table). */
static inline int aalg_entries(void)
{
        return ARRAY_SIZE(aalg_list);
}

/* Number of entries in ealg_list (the encryption algorithm table). */
static inline int ealg_entries(void)
{
        return ARRAY_SIZE(ealg_list);
}

/* Number of entries in calg_list (the compression algorithm table). */
static inline int calg_entries(void)
{
        return ARRAY_SIZE(calg_list);
}

/*
 * Describes one algorithm table for xfrm_find_algo(): the table itself, its
 * length, and the function used to probe whether a named algorithm exists.
 */
struct xfrm_algo_list {
        /* Probe callback; xfrm_find_algo() calls it as find(name, 0, 0). */
        int (*find)(const char *name, u32 type, u32 mask);
        /* First element of the descriptor table. */
        struct xfrm_algo_desc *algs;
        /* Number of elements in algs. */
        int entries;
};

/* Lookup descriptor for the AEAD table, probed via crypto_has_aead(). */
static const struct xfrm_algo_list xfrm_aead_list = {
        .algs = aead_list,
        .entries = ARRAY_SIZE(aead_list),
        .find = crypto_has_aead,
};

/* Lookup descriptor for the authentication table, probed via crypto_has_ahash(). */
static const struct xfrm_algo_list xfrm_aalg_list = {
        .algs = aalg_list,
        .entries = ARRAY_SIZE(aalg_list),
        .find = crypto_has_ahash,
};

/* Lookup descriptor for the encryption table, probed via crypto_has_skcipher(). */
static const struct xfrm_algo_list xfrm_ealg_list = {
        .algs = ealg_list,
        .entries = ARRAY_SIZE(ealg_list),
        .find = crypto_has_skcipher,
};

/* Lookup descriptor for the compression table, probed via crypto_has_acomp(). */
static const struct xfrm_algo_list xfrm_calg_list = {
        .algs = calg_list,
        .entries = ARRAY_SIZE(calg_list),
        .find = crypto_has_acomp,
};

/*
 * Look up a descriptor in @algo_list.
 *
 * @algo_list: table to search and the probe routine that goes with it
 * @match:     predicate; the first entry for which it returns non-zero is
 *             the only candidate considered
 * @data:      opaque key handed to @match
 * @probe:     when non-zero, an entry not yet marked available is probed
 *             through @algo_list->find and marked available on success
 *
 * Returns the matching descriptor if it is (or has just become) available,
 * otherwise NULL.
 */
static struct xfrm_algo_desc *xfrm_find_algo(
        const struct xfrm_algo_list *algo_list,
        int match(const struct xfrm_algo_desc *entry, const void *data),
        const void *data, int probe)
{
        struct xfrm_algo_desc *desc;
        int idx, found;

        /* Locate the first entry accepted by @match. */
        for (idx = 0; idx < algo_list->entries; idx++) {
                if (match(&algo_list->algs[idx], data))
                        break;
        }
        if (idx >= algo_list->entries)
                return NULL;

        desc = &algo_list->algs[idx];
        if (desc->available)
                return desc;

        /* Not known to be available: only probe when asked to. */
        if (!probe)
                return NULL;

        found = algo_list->find(desc->name, 0, 0);
        if (!found)
                return NULL;

        desc->available = found;
        return desc;
}

/*
 * Match predicate for xfrm_find_algo(): @data carries the wanted algorithm
 * id encoded as an unsigned long inside the pointer value.
 */
static int xfrm_alg_id_match(const struct xfrm_algo_desc *entry,
                             const void *data)
{
        const unsigned long wanted_id = (unsigned long)data;

        return entry->desc.sadb_alg_id == wanted_id;
}

/*
 * Find the authentication algorithm whose sadb_alg_id equals @alg_id,
 * probing for availability if needed.  Returns NULL when nothing usable
 * is found.
 */
struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id)
{
        const void *key = (void *)(unsigned long)alg_id;

        return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_id_match, key, 1);
}
EXPORT_SYMBOL_GPL(xfrm_aalg_get_byid);

/*
 * Find the encryption algorithm whose sadb_alg_id equals @alg_id,
 * probing for availability if needed.  Returns NULL when nothing usable
 * is found.
 */
struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id)
{
        const void *key = (void *)(unsigned long)alg_id;

        return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_id_match, key, 1);
}
EXPORT_SYMBOL_GPL(xfrm_ealg_get_byid);

/*
 * Find the compression algorithm whose sadb_alg_id equals @alg_id,
 * probing for availability if needed.  Returns NULL when nothing usable
 * is found.
 */
struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id)
{
        const void *key = (void *)(unsigned long)alg_id;

        return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_id_match, key, 1);
}
EXPORT_SYMBOL_GPL(xfrm_calg_get_byid);

/*
 * Match predicate for xfrm_find_algo(): @data is a NUL-terminated name (or
 * NULL).  An entry matches when the name equals its ->name or, if the entry
 * has one, its ->compat alias.  A NULL name never matches.
 */
static int xfrm_alg_name_match(const struct xfrm_algo_desc *entry,
                               const void *data)
{
        const char *wanted = data;

        if (!wanted)
                return 0;
        if (strcmp(wanted, entry->name) == 0)
                return 1;
        if (!entry->compat)
                return 0;
        return strcmp(wanted, entry->compat) == 0;
}

struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe)
{
        return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_name_match, name,
                              probe);
}
EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname);

struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe)
{
        return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_name_match, name,
                              probe);
}
EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname);

struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe)
{
        return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_name_match, name,
                              probe);
}
EXPORT_SYMBOL_GPL(xfrm_calg_get_byname);

/* Lookup key handed to xfrm_aead_name_match() through xfrm_find_algo(). */
struct xfrm_aead_name {
        /* Algorithm name, compared against the descriptor's ->name. */
        const char *name;
        /* Compared against the descriptor's uinfo.aead.icv_truncbits. */
        int icvbits;
};

/*
 * Match predicate for xfrm_find_algo(): @data points to a
 * struct xfrm_aead_name.  An entry matches when its ICV truncation length
 * equals the key's icvbits and the key's (non-NULL) name equals ->name.
 */
static int xfrm_aead_name_match(const struct xfrm_algo_desc *entry,
                                const void *data)
{
        const struct xfrm_aead_name *wanted = data;

        if (wanted->icvbits != entry->uinfo.aead.icv_truncbits)
                return 0;
        if (!wanted->name)
                return 0;
        return !strcmp(wanted->name, entry->name);
}

struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe)
{
        struct xfrm_aead_name data = {
                .name = name,
                .icvbits = icv_len,
        };

        return xfrm_find_algo(&xfrm_aead_list, xfrm_aead_name_match, &data,
                              probe);
}
EXPORT_SYMBOL_GPL(xfrm_aead_get_byname);

/*
 * Return the @idx-th authentication descriptor, or NULL when @idx is past
 * the end of aalg_list.  Availability is not checked.
 */
struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx)
{
        return idx < aalg_entries() ? &aalg_list[idx] : NULL;
}
EXPORT_SYMBOL_GPL(xfrm_aalg_get_byidx);

/*
 * Return the @idx-th encryption descriptor, or NULL when @idx is past the
 * end of ealg_list.  Availability is not checked.
 */
struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx)
{
        return idx < ealg_entries() ? &ealg_list[idx] : NULL;
}
EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx);

/*
 * Probe for the availability of crypto algorithms, and set the available
 * flag for any algorithms found on the system.  This is typically called by
 * pfkey during userspace SA add, update or register.
 */
void xfrm_probe_algs(void)
{
        int i, status;

        BUG_ON(in_softirq());

        for (i = 0; i < aalg_entries(); i++) {
                status = crypto_has_ahash(aalg_list[i].name, 0, 0);
                if (aalg_list[i].available != status)
                        aalg_list[i].available = status;
        }

        for (i = 0; i < ealg_entries(); i++) {
                status = crypto_has_skcipher(ealg_list[i].name, 0, 0);
                if (ealg_list[i].available != status)
                        ealg_list[i].available = status;
        }

        for (i = 0; i < calg_entries(); i++) {
                status = crypto_has_acomp(calg_list[i].name, 0, 0);
                if (calg_list[i].available != status)
                        calg_list[i].available = status;
        }
}
EXPORT_SYMBOL_GPL(xfrm_probe_algs);

int xfrm_count_pfkey_auth_supported(void)
{
        int i, n;

        for (i = 0, n = 0; i < aalg_entries(); i++)
                if (aalg_list[i].available && aalg_list[i].pfkey_supported)
                        n++;
        return n;
}
EXPORT_SYMBOL_GPL(xfrm_count_pfkey_auth_supported);

int xfrm_count_pfkey_enc_supported(void)
{
        int i, n;

        for (i = 0, n = 0; i < ealg_entries(); i++)
                if (ealg_list[i].available && ealg_list[i].pfkey_supported)
                        n++;
        return n;
}
EXPORT_SYMBOL_GPL(xfrm_count_pfkey_enc_supported);

MODULE_DESCRIPTION("XFRM Algorithm interface");
MODULE_LICENSE("GPL");





































































   60 

































































































































































































































































































































    1 



   59 


































































































































































































































































































































































   61 





   60 
   59 
   59 
   60 


   61 


   60 



   60 




   61 

   61 
   61 

   59 






































   60 













































































































































































































































































































    1 




































    1 




    1 










    1 
    1 



    1 


    1 





















































    1 

    1 



















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 * Copyright (C) 2016 Intel, Matthew Wilcox
 * Copyright (C) 2016 Intel, Ross Zwisler
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/percpu.h>
#include <linux/preempt.h>                /* in_interrupt() */
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Radix tree node cache: the slab cache that the radix_tree_node objects
 * in this file are allocated from (kmem_cache_alloc) and freed back to
 * (kmem_cache_free).
 */
struct kmem_cache *radix_tree_node_cachep;

/*
 * The radix tree is variable-height, so an insert operation not only has
 * to build the branch to its corresponding item, it also has to build the
 * branch to existing items if the size has to be increased (by
 * radix_tree_extend).
 *
 * The worst case is a zero height tree with just a single item at index 0,
 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
 * Hence the value below, which radix_tree_preload() and
 * radix_tree_maybe_preload() use as the number of nodes to preload:
 */
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)

/*
 * The IDR does not have to be as high as the radix tree since it uses
 * signed integers, not unsigned longs.
 *
 * IDR_INDEX_BITS   - number of value bits in an int (all bits but the sign)
 * IDR_MAX_PATH     - tree levels needed to cover IDR_INDEX_BITS when each
 *                    level consumes RADIX_TREE_MAP_SHIFT bits
 * IDR_PRELOAD_SIZE - same "two branches sharing the root" formula as
 *                    RADIX_TREE_PRELOAD_SIZE, applied to IDR_MAX_PATH
 */
#define IDR_INDEX_BITS                (8 /* CHAR_BIT */ * sizeof(int) - 1)
#define IDR_MAX_PATH                (DIV_ROUND_UP(IDR_INDEX_BITS, \
                                                RADIX_TREE_MAP_SHIFT))
#define IDR_PRELOAD_SIZE        (IDR_MAX_PATH * 2 - 1)

/*
 * Per-cpu pool of preloaded nodes.
 *
 * Each CPU's instance carries its own local lock (->lock), a count of
 * pooled nodes (->nr) and the head of the pool (->nodes); the pool is
 * filled by __radix_tree_preload() and drained by radix_tree_node_alloc().
 */
DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = {
        .lock = INIT_LOCAL_LOCK(lock),
};
EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads);

/*
 * Convert a slot entry into a node pointer by clearing the
 * RADIX_TREE_INTERNAL_NODE bit(s) from the pointer value.
 */
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
        unsigned long bits = (unsigned long)ptr;

        bits &= ~RADIX_TREE_INTERNAL_NODE;
        return (void *)bits;
}

/*
 * Convert a node pointer into a slot entry by setting the
 * RADIX_TREE_INTERNAL_NODE bit(s) in the pointer value.
 */
static inline void *node_to_entry(void *ptr)
{
        unsigned long bits = (unsigned long)ptr;

        bits |= RADIX_TREE_INTERNAL_NODE;
        return (void *)bits;
}

/*
 * Alias for the XArray retry entry.  radix_tree_shrink() stores it in
 * slot 0 of a node it is about to free so that lookups still holding that
 * slot are forced to retry.
 */
#define RADIX_TREE_RETRY        XA_RETRY_ENTRY

/*
 * Index of @slot within @parent's slot array, or 0 when there is no
 * parent node.
 */
static inline unsigned long
get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
{
        if (!parent)
                return 0;

        return slot - parent->slots;
}

static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
                        struct radix_tree_node **nodep, unsigned long index)
{
        unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
        void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);

        *nodep = (void *)entry;
        return offset;
}

/*
 * Extract the allocation flags stored in the root: the bits of xa_flags
 * inside __GFP_BITS_MASK, with the zone bits (GFP_ZONEMASK) removed.
 */
static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
        const gfp_t allowed = __GFP_BITS_MASK & ~GFP_ZONEMASK;

        return root->xa_flags & allowed;
}

/* Set bit @offset in @node's bitmap for @tag (non-atomic __set_bit). */
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bitmap = node->tags[tag];

        __set_bit(offset, bitmap);
}

/* Clear bit @offset in @node's bitmap for @tag (non-atomic __clear_bit). */
static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bitmap = node->tags[tag];

        __clear_bit(offset, bitmap);
}

/* Return non-zero if bit @offset is set in @node's bitmap for @tag. */
static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        const unsigned long *bitmap = node->tags[tag];

        return test_bit(offset, bitmap);
}

/* Set the root-level bit for @tag; root tags live above ROOT_TAG_SHIFT. */
static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
{
        const unsigned bit = tag + ROOT_TAG_SHIFT;

        root->xa_flags |= (__force gfp_t)(1 << bit);
}

static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
        root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
}

static inline void root_tag_clear_all(struct radix_tree_root *root)
{
        root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
}

static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
{
        return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
}

static inline unsigned root_tags_get(const struct radix_tree_root *root)
{
        return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
}

/* True when the root carries the ROOT_IS_IDR flag. */
static inline bool is_idr(const struct radix_tree_root *root)
{
        return (root->xa_flags & ROOT_IS_IDR) != 0;
}

/*
 * Scan the whole @tag bitmap of @node.
 * Returns 1 as soon as a non-zero word is found (some slot has the tag
 * set), 0 if every word is zero.
 */
static inline int any_tag_set(const struct radix_tree_node *node,
                                                        unsigned int tag)
{
        const unsigned long *word = node->tags[tag];
        const unsigned long *const end = word + RADIX_TREE_TAG_LONGS;

        while (word < end) {
                if (*word++)
                        return 1;
        }
        return 0;
}

/* Set @tag for every one of the RADIX_TREE_MAP_SIZE slots of @node. */
static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
{
        unsigned long *bitmap = node->tags[tag];

        bitmap_fill(bitmap, RADIX_TREE_MAP_SIZE);
}

/**
 * radix_tree_find_next_bit - find the next set tag bit in a node
 *
 * @node: node whose tag bitmap is searched
 * @tag: the tag index
 * @offset: the bit number to start searching at (inclusive)
 *
 * Unrollable variant of find_next_bit() for constant size arrays.
 * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero.
 * Returns the offset of the next set bit, or RADIX_TREE_MAP_SIZE if nothing
 * is found (including when @offset is already out of range).
 */
static __always_inline unsigned long
radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
                         unsigned long offset)
{
        const unsigned long *word;
        unsigned long bits;

        if (offset >= RADIX_TREE_MAP_SIZE)
                return RADIX_TREE_MAP_SIZE;

        /* First (possibly partial) word: discard the bits below @offset. */
        word = &node->tags[tag][offset / BITS_PER_LONG];
        bits = *word >> (offset % BITS_PER_LONG);
        if (bits)
                return __ffs(bits) + offset;

        /* Remaining whole words, starting at the next word boundary. */
        for (offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
             offset < RADIX_TREE_MAP_SIZE;
             offset += BITS_PER_LONG) {
                bits = *++word;
                if (bits)
                        return __ffs(bits) + offset;
        }

        return RADIX_TREE_MAP_SIZE;
}

/* Slot offset encoded in the low RADIX_TREE_MAP_MASK bits of iter->index. */
static unsigned int iter_offset(const struct radix_tree_iter *iter)
{
        const unsigned long index = iter->index;

        return index & RADIX_TREE_MAP_MASK;
}

/*
 * The maximum index which can be stored in a radix tree whose top node
 * has the given @shift.
 */
static inline unsigned long shift_maxindex(unsigned int shift)
{
        const unsigned long span = RADIX_TREE_MAP_SIZE << shift;

        return span - 1;
}

/* Largest index offset covered by @node, derived from its ->shift. */
static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
        const unsigned int shift = node->shift;

        return shift_maxindex(shift);
}

/*
 * Compute the index that corresponds to slot @offset of @node: the bits of
 * @index above the range covered by @node, plus @offset scaled by the
 * node's shift.
 */
static unsigned long next_index(unsigned long index,
                                const struct radix_tree_node *node,
                                unsigned long offset)
{
        const unsigned long base = index & ~node_maxindex(node);
        const unsigned long delta = offset << node->shift;

        return base + delta;
}

/*
 * Allocate one radix tree node and initialise its bookkeeping fields.
 *
 * @gfp_mask:  allocation flags; they also select the allocation strategy
 * @parent:    stored in the new node's ->parent
 * @root:      stored in the new node's ->array
 * @shift:     stored in ->shift
 * @offset:    stored in ->offset
 * @count:     stored in ->count
 * @nr_values: stored in ->nr_values
 *
 * Returns the new node, or NULL if neither the slab cache nor (where it is
 * consulted) the per-cpu preload pool could supply one.
 *
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 */
static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
                        struct radix_tree_root *root,
                        unsigned int shift, unsigned int offset,
                        unsigned int count, unsigned int nr_values)
{
        struct radix_tree_node *ret = NULL;

        /*
         * Preload code isn't irq safe and it doesn't make sense to use
         * preloading during an interrupt anyway as all the allocations have
         * to be atomic. So just do normal allocation when in interrupt.
         *
         * The preload pool is therefore only consulted for non-blocking
         * allocations made outside interrupt context.
         */
        if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
                struct radix_tree_preload *rtp;

                /*
                 * Even if the caller has preloaded, try to allocate from the
                 * cache first for the new node to get accounted to the memory
                 * cgroup.  __GFP_NOWARN is added because a failure here is
                 * not final: the preload pool is tried next.
                 */
                ret = kmem_cache_alloc(radix_tree_node_cachep,
                                       gfp_mask | __GFP_NOWARN);
                if (ret)
                        goto out;

                /*
                 * The slab allocation failed: fall back to this CPU's
                 * preload pool.  Provided the caller has preloaded, the pool
                 * is non-empty and a node is popped here.  The pool is a
                 * singly linked list threaded through ->parent.
                 */
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr) {
                        ret = rtp->nodes;
                        rtp->nodes = ret->parent;
                        rtp->nr--;
                }
                /*
                 * Update the allocation stack trace as this is more useful
                 * for debugging.
                 * NOTE(review): ret is still NULL here when the pool was
                 * empty; presumably kmemleak_update_trace() tolerates
                 * that -- confirm.
                 */
                kmemleak_update_trace(ret);
                goto out;
        }
        /* Blocking allowed, or in interrupt: plain slab allocation. */
        ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
        /* A node address must never look like an internal-node entry. */
        BUG_ON(radix_tree_is_internal_node(ret));
        if (ret) {
                ret->shift = shift;
                ret->offset = offset;
                ret->count = count;
                ret->nr_values = nr_values;
                ret->parent = parent;
                ret->array = root;
        }
        return ret;
}

/*
 * RCU callback that returns a node to the slab cache.
 *
 * @head: the rcu_head embedded in the node being freed
 */
void radix_tree_node_rcu_free(struct rcu_head *head)
{
        struct radix_tree_node *victim;

        victim = container_of(head, struct radix_tree_node, rcu_head);

        /*
         * Must only free zeroed nodes into the slab.  We can be left with
         * non-NULL entries by radix_tree_free_nodes, so wipe the tags and
         * the entries and reset the private list before handing it back.
         */
        memset(victim->tags, 0, sizeof(victim->tags));
        memset(victim->slots, 0, sizeof(victim->slots));
        INIT_LIST_HEAD(&victim->private_list);

        kmem_cache_free(radix_tree_node_cachep, victim);
}

/*
 * Queue @node for freeing through call_rcu(); the actual release is done
 * by radix_tree_node_rcu_free().
 */
static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
        struct rcu_head *head = &node->rcu_head;

        call_rcu(head, radix_tree_node_rcu_free);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * In terms of the code below: on success the function returns with
 * radix_tree_preloads.lock held; on failure the lock has been released.
 *
 * @gfp_mask: allocation flags (__GFP_ACCOUNT is stripped)
 * @nr:       number of nodes the per-cpu pool must hold on return
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
        struct radix_tree_preload *rtp;
        struct radix_tree_node *node;
        int ret = -ENOMEM;

        /*
         * Nodes preloaded by one cgroup can be used by another cgroup, so
         * they should never be accounted to any particular memory cgroup.
         */
        gfp_mask &= ~__GFP_ACCOUNT;

        local_lock(&radix_tree_preloads.lock);
        rtp = this_cpu_ptr(&radix_tree_preloads);
        while (rtp->nr < nr) {
                /* The lock is not held across the allocation. */
                local_unlock(&radix_tree_preloads.lock);
                node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
                if (node == NULL)
                        goto out;        /* lock already dropped */
                local_lock(&radix_tree_preloads.lock);
                /*
                 * Re-read the per-cpu pointer and re-check the fill level:
                 * the pool seen now may differ from the one inspected before
                 * the lock was dropped (presumably the task can have moved
                 * to another CPU, or the pool was refilled meanwhile).
                 */
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr < nr) {
                        /* Push onto the pool, linked through ->parent. */
                        node->parent = rtp->nodes;
                        rtp->nodes = node;
                        rtp->nr++;
                } else {
                        /* Pool is already full enough: node not needed. */
                        kmem_cache_free(radix_tree_node_cachep, node);
                }
        }
        ret = 0;
out:
        return ret;
}

/*
 * Fill this CPU's radix_tree_node pool with RADIX_TREE_PRELOAD_SIZE nodes so
 * that the subsequent addition of a single element to a tree cannot fail for
 * lack of memory.
 *
 * Returns zero on success, with preemption disabled, or -ENOMEM on error,
 * with preemption not disabled.  A @gfp_mask that does not allow blocking
 * is reported once with a warning, since preloading makes little sense
 * then.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
        const bool may_block = gfpflags_allow_blocking(gfp_mask);

        /* Warn on non-sensical use... */
        WARN_ON_ONCE(!may_block);

        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_preload);

/*
 * Like radix_tree_preload(), except that preloading is only performed when
 * it can help, i.e. when @gfp_mask allows blocking.  Otherwise only the
 * per-cpu preload lock is taken, so the caller's unlock path is the same in
 * both cases.
 *
 * On success, return zero with preemption disabled.  On error, return
 * -ENOMEM with preemption not disabled.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
        if (!gfpflags_allow_blocking(gfp_mask)) {
                /* Preloading doesn't help anything with this gfp mask */
                local_lock(&radix_tree_preloads.lock);
                return 0;
        }

        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

/*
 * Read the root entry of @root.
 *
 * @nodep:    receives the raw head entry (still encoded if it is a node)
 * @maxindex: receives the largest index the current tree can hold
 *            (0 when the head is not an internal node)
 *
 * Returns the top node's shift plus RADIX_TREE_MAP_SHIFT, or 0 when the
 * head is not an internal node.
 */
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
                struct radix_tree_node **nodep, unsigned long *maxindex)
{
        struct radix_tree_node *head = rcu_dereference_raw(root->xa_head);

        *nodep = head;

        if (unlikely(!radix_tree_is_internal_node(head))) {
                *maxindex = 0;
                return 0;
        }

        head = entry_to_node(head);
        *maxindex = node_maxindex(head);
        return head->shift + RADIX_TREE_MAP_SHIFT;
}

/*
 *        Extend a radix tree so it can store key @index.
 *
 *        @root:  tree to grow
 *        @gfp:   allocation flags for the new nodes
 *        @index: key that must become representable
 *        @shift: shift given to the first node that is added; presumably
 *                the value radix_tree_load_root() returned for the
 *                current tree -- confirm against the caller
 *
 *        New nodes are stacked on top of the current head, each holding
 *        the previous head in slot 0, until the top node covers @index.
 *
 *        Returns the shift of the new top node plus RADIX_TREE_MAP_SHIFT,
 *        or -ENOMEM if a node allocation fails (nodes that were already
 *        added stay in place).
 */
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
                                unsigned long index, unsigned int shift)
{
        void *entry;
        unsigned int maxshift;
        int tag;

        /* Figure out what the shift should be.  */
        maxshift = shift;
        while (index > shift_maxindex(maxshift))
                maxshift += RADIX_TREE_MAP_SHIFT;

        /*
         * Nothing to push down when the tree has no head entry, unless
         * this is an IDR whose root does not have IDR_FREE set.
         */
        entry = rcu_dereference_raw(root->xa_head);
        if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
                goto out;

        do {
                struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
                                                        root, shift, 0, 1, 0);
                if (!node)
                        return -ENOMEM;

                if (is_idr(root)) {
                        /*
                         * Every slot of the new node starts out marked
                         * free.  If the root was not marked free, slot 0
                         * (which receives the old tree) is marked not free
                         * and the root becomes free.
                         */
                        all_tag_set(node, IDR_FREE);
                        if (!root_tag_get(root, IDR_FREE)) {
                                tag_clear(node, IDR_FREE, 0);
                                root_tag_set(root, IDR_FREE);
                        }
                } else {
                        /* Propagate the aggregated tag info to the new child */
                        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
                                if (root_tag_get(root, tag))
                                        tag_set(node, tag, 0);
                        }
                }

                BUG_ON(shift > BITS_PER_LONG);
                if (radix_tree_is_internal_node(entry)) {
                        entry_to_node(entry)->parent = node;
                } else if (xa_is_value(entry)) {
                        /* Moving a value entry root->xa_head to a node */
                        node->nr_values = 1;
                }
                /*
                 * entry was already in the radix tree, so we do not need
                 * rcu_assign_pointer here
                 */
                node->slots[0] = (void __rcu *)entry;
                entry = node_to_entry(node);
                /* Publish the fully initialised node as the new head. */
                rcu_assign_pointer(root->xa_head, entry);
                shift += RADIX_TREE_MAP_SHIFT;
        } while (shift <= maxshift);
out:
        return maxshift + RADIX_TREE_MAP_SHIFT;
}

/**
 *        radix_tree_shrink    -    shrink radix tree to minimum height
 *        @root:                radix tree root
 *
 *        While the head is an internal node whose only entry sits in
 *        slot 0, replace the head with that entry and free the node.
 *
 *        Returns true if at least one node was removed, false otherwise.
 */
static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
        bool shrunk = false;

        for (;;) {
                struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
                struct radix_tree_node *child;

                /* Head is not a node: nothing (more) to shrink. */
                if (!radix_tree_is_internal_node(node))
                        break;
                node = entry_to_node(node);

                /*
                 * The candidate node has more than one child, or its child
                 * is not at the leftmost slot, we cannot shrink.
                 */
                if (node->count != 1)
                        break;
                child = rcu_dereference_raw(node->slots[0]);
                if (!child)
                        break;

                /*
                 * For an IDR, we must not shrink entry 0 into the root in
                 * case somebody calls idr_replace() with a pointer that
                 * appears to be an internal entry
                 */
                if (!node->shift && is_idr(root))
                        break;

                /* The child becomes the new top of the tree. */
                if (radix_tree_is_internal_node(child))
                        entry_to_node(child)->parent = NULL;

                /*
                 * We don't need rcu_assign_pointer(), since we are simply
                 * moving the node from one part of the tree to another: if it
                 * was safe to dereference the old pointer to it
                 * (node->slots[0]), it will be safe to dereference the new
                 * one (root->xa_head) as far as dependent read barriers go.
                 */
                root->xa_head = (void __rcu *)child;
                /* Carry slot 0's "not free" state up to the root. */
                if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
                        root_tag_clear(root, IDR_FREE);

                /*
                 * We have a dilemma here. The node's slot[0] must not be
                 * NULLed in case there are concurrent lookups expecting to
                 * find the item. However if this was a bottom-level node,
                 * then it may be subject to the slot pointer being visible
                 * to callers dereferencing it. If item corresponding to
                 * slot[0] is subsequently deleted, these callers would expect
                 * their slot to become empty sooner or later.
                 *
                 * For example, lockless pagecache will look up a slot, deref
                 * the page pointer, and if the page has 0 refcount it means it
                 * was concurrently deleted from pagecache so try the deref
                 * again. Fortunately there is already a requirement for logic
                 * to retry the entire slot lookup -- the indirect pointer
                 * problem (replacing direct root node with an indirect pointer
                 * also results in a stale slot). So tag the slot as indirect
                 * to force callers to retry.
                 */
                node->count = 0;
                if (!radix_tree_is_internal_node(child)) {
                        node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                shrunk = true;
        }

        return shrunk;
}

/*
 * Free @node if it no longer holds any entries, then repeat for every
 * ancestor that becomes empty as a result.  The walk stops at the first
 * node that is still populated; if that node is the one root->xa_head
 * refers to, the tree is handed to radix_tree_shrink().
 *
 * Returns true if at least one node was freed or the tree was shrunk.
 */
static bool delete_node(struct radix_tree_root *root,
                        struct radix_tree_node *node)
{
        bool changed = false;

        for (;;) {
                struct radix_tree_node *up;

                if (node->count) {
                        /* Still in use: only the top node may be shrunk. */
                        if (rcu_dereference_raw(root->xa_head) ==
                                        node_to_entry(node)) {
                                if (radix_tree_shrink(root))
                                        changed = true;
                        }
                        break;
                }

                /* Unhook the empty node from whatever points at it. */
                up = node->parent;
                if (!up) {
                        /*
                         * Shouldn't the tags already have all been cleared
                         * by the caller?
                         */
                        if (!is_idr(root))
                                root_tag_clear_all(root);
                        root->xa_head = NULL;
                } else {
                        up->slots[node->offset] = NULL;
                        up->count--;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                changed = true;

                /* Continue with the parent, which may now be empty too. */
                if (!up)
                        break;
                node = up;
        }

        return changed;
}

/**
 * __radix_tree_create - create a slot in a radix tree
 * @root:       radix tree root
 * @index:      index key
 * @nodep:      if non-NULL, receives the node containing the slot
 * @slotp:      if non-NULL, receives the slot
 *
 * Walk towards the slot for @index in @root, growing the tree with
 * radix_tree_extend() when @index lies beyond the current maximum and
 * allocating intermediate nodes wherever a child is missing.
 *
 * While the root holds a direct entry rather than a node,
 * @root->xa_head itself is the slot and *@nodep is set to NULL.
 *
 * Returns 0 on success, -ENOMEM if a node could not be allocated, or
 * the negative value reported by radix_tree_extend().
 */
static int __radix_tree_create(struct radix_tree_root *root,
                unsigned long index, struct radix_tree_node **nodep,
                void __rcu ***slotp)
{
        struct radix_tree_node *parent = NULL;
        struct radix_tree_node *child;
        void __rcu **slot = (void __rcu **)&root->xa_head;
        unsigned long maxindex;
        unsigned int shift;
        unsigned int offset = 0;
        gfp_t gfp = root_gfp_mask(root);

        shift = radix_tree_load_root(root, &child, &maxindex);

        /* Grow the tree first if @index does not fit yet. */
        if (index > maxindex) {
                int ret = radix_tree_extend(root, gfp, index, shift);

                if (ret < 0)
                        return ret;
                /* A non-negative result is the new shift of the top node. */
                shift = ret;
                child = rcu_dereference_raw(root->xa_head);
        }

        while (shift != 0) {
                shift -= RADIX_TREE_MAP_SHIFT;

                if (!child) {
                        /* Missing level: allocate it and link it in. */
                        child = radix_tree_node_alloc(gfp, parent, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return -ENOMEM;
                        rcu_assign_pointer(*slot, node_to_entry(child));
                        if (parent)
                                parent->count++;
                } else if (!radix_tree_is_internal_node(child)) {
                        /* Reached a non-node entry; stop descending. */
                        break;
                }

                /* Step down one level. */
                parent = entry_to_node(child);
                offset = radix_tree_descend(parent, &child, index);
                slot = &parent->slots[offset];
        }

        if (nodep)
                *nodep = parent;
        if (slotp)
                *slotp = slot;
        return 0;
}

/*
 * Free any nodes below this node.  The tree is presumed to not need
 * shrinking, and any user data in the tree is presumed to not need a
 * destructor called on it.  If we need to add a destructor, we can
 * add that functionality later.  Note that we may not clear tags or
 * slots from the tree as an RCU walker may still have a pointer into
 * this subtree.  We could replace the entries with RADIX_TREE_RETRY,
 * but we'll still have to clear those in rcu_free.
 *
 * @node is decoded with entry_to_node(), so it is expected in entry
 * form (presumably as produced by node_to_entry() -- confirm against
 * the callers).  The subtree is walked iteratively, using each node's
 * ->parent and ->offset to climb back up; a node is freed only once
 * all of its slots have been visited.
 */
static void radix_tree_free_nodes(struct radix_tree_node *node)
{
        unsigned offset = 0;
        struct radix_tree_node *child = entry_to_node(node);

        for (;;) {
                void *entry = rcu_dereference_raw(child->slots[offset]);
                /* Only descend when this node is above the bottom level. */
                if (xa_is_node(entry) && child->shift) {
                        child = entry_to_node(entry);
                        offset = 0;
                        continue;
                }
                offset++;
                /* Every slot visited: free this node and climb up. */
                while (offset == RADIX_TREE_MAP_SIZE) {
                        struct radix_tree_node *old = child;
                        /* Read offset and parent before 'old' is freed. */
                        offset = child->offset + 1;
                        child = child->parent;
                        WARN_ON_ONCE(!list_empty(&old->private_list));
                        radix_tree_node_free(old);
                        /* Pointer comparison only; 'old' is not dereferenced. */
                        if (old == entry_to_node(node))
                                return;
                }
        }
}

/*
 * Publish @item in an empty @slot and account for it in @node, when there
 * is one.  Returns -EEXIST if the slot is already occupied, otherwise 1.
 */
static inline int insert_entries(struct radix_tree_node *node,
                void __rcu **slot, void *item)
{
        if (*slot)
                return -EEXIST;

        rcu_assign_pointer(*slot, item);

        /* A slot without a containing node carries no per-node counters. */
        if (!node)
                return 1;

        node->count++;
        if (xa_is_value(item))
                node->nr_values++;

        return 1;
}

/**
 * radix_tree_insert - insert into a radix tree
 * @root:       radix tree root
 * @index:      index key
 * @item:       item to insert
 *
 * Insert @item into the radix tree at position @index.  @item must not
 * look like an internal node entry.
 *
 * Returns 0 on success, or a negative error from __radix_tree_create()
 * or insert_entries() (-EEXIST when the slot is already occupied).
 */
int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
                        void *item)
{
        struct radix_tree_node *node;
        void __rcu **slot;
        unsigned int offset, tag;
        int ret;

        BUG_ON(radix_tree_is_internal_node(item));

        ret = __radix_tree_create(root, index, &node, &slot);
        if (ret)
                return ret;

        ret = insert_entries(node, slot, item);
        if (ret < 0)
                return ret;

        /* A freshly filled slot must not carry any stale tags. */
        if (!node) {
                BUG_ON(root_tags_get(root));
                return 0;
        }

        offset = get_slot_offset(node, slot);
        /* Tags 0, 1 and 2 are checked. */
        for (tag = 0; tag < 3; tag++)
                BUG_ON(tag_get(node, tag, offset));

        return 0;
}
EXPORT_SYMBOL(radix_tree_insert);

/**
 * __radix_tree_lookup - lookup an item in a radix tree
 * @root:       radix tree root
 * @index:      index key
 * @nodep:      if non-NULL, receives the node containing the slot
 * @slotp:      if non-NULL, receives the slot
 *
 * Look up the entry stored at position @index in @root and return it.
 * NULL is returned, without touching *@nodep or *@slotp, when @index is
 * beyond the tree's current maximum index.
 *
 * While the root holds a direct entry rather than a node,
 * @root->xa_head itself is the slot and *@nodep is set to NULL.
 *
 * If the walk meets RADIX_TREE_RETRY it starts over from the root.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
                          unsigned long index, struct radix_tree_node **nodep,
                          void __rcu ***slotp)
{
        struct radix_tree_node *entry, *holder;
        void __rcu **slot;
        unsigned long maxindex;

 restart:
        holder = NULL;
        slot = (void __rcu **)&root->xa_head;
        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return NULL;

        while (radix_tree_is_internal_node(entry)) {
                unsigned int offset;

                holder = entry_to_node(entry);
                offset = radix_tree_descend(holder, &entry, index);
                slot = &holder->slots[offset];

                /* The tree changed under us; walk it again from the top. */
                if (entry == RADIX_TREE_RETRY)
                        goto restart;
                /* Bottom level reached: whatever is here is the result. */
                if (!holder->shift)
                        break;
        }

        if (nodep)
                *nodep = holder;
        if (slotp)
                *slotp = slot;
        return entry;
}

/**
 * radix_tree_lookup_slot - lookup a slot in a radix tree
 * @root:       radix tree root
 * @index:      index key
 *
 * Returns: the slot corresponding to position @index in the radix tree
 * @root, or NULL if no entry is stored there.  This is useful for
 * update-if-exists operations.
 *
 * This function can be called under rcu_read_lock iff the slot is not
 * modified by radix_tree_replace_slot, otherwise it must be called
 * exclusive from other writers.  Any dereference of the slot must be done
 * using radix_tree_deref_slot.
 */
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
                                unsigned long index)
{
        void __rcu **slot;
        void *entry = __radix_tree_lookup(root, index, NULL, &slot);

        /* @slot is only meaningful when the lookup found an entry. */
        return entry ? slot : NULL;
}
EXPORT_SYMBOL(radix_tree_lookup_slot);

/**
 * radix_tree_lookup - perform lookup operation on a radix tree
 * @root:       radix tree root
 * @index:      index key
 *
 * Look up the item at position @index in the radix tree @root and
 * return it; the result is whatever __radix_tree_lookup() returns.
 *
 * This function can be called under rcu_read_lock, however the caller
 * must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 * them safely).  No RCU barriers are required to access or modify the
 * returned item, however.
 */
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
        void *item = __radix_tree_lookup(root, index, NULL, NULL);

        return item;
}
EXPORT_SYMBOL(radix_tree_lookup);

/*
 * Store @item in @slot.  When @node is given and either delta is
 * non-zero, @count and @values are first added to the node's entry and
 * value-entry counters.
 */
static void replace_slot(void __rcu **slot, void *item,
                struct radix_tree_node *node, int count, int values)
{
        if (node) {
                if (count || values) {
                        node->count += count;
                        node->nr_values += values;
                }
        }

        rcu_assign_pointer(*slot, item);
}

/*
 * Test @tag for the slot at @offset in @node, or on the root itself when
 * the slot has no containing node.
 */
static bool node_tag_get(const struct radix_tree_root *root,
                                const struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        if (!node)
                return root_tag_get(root, tag);

        return tag_get(node, tag, offset);
}

/*
 * Work out how much a node's entry count changes when @old in @slot is
 * replaced by @item.
 *
 * IDR users want to be able to store NULL in the tree, so if the slot isn't
 * free, don't adjust the count, even if it's transitioning between NULL and
 * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
 * have empty bits, but it only stores NULL in slots when they're being
 * deleted.
 */
static int calculate_count(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot,
                                void *item, void *old)
{
        int delta = !!item - !!old;

        /* Outside the IDR, presence is simply "pointer is non-NULL". */
        if (!is_idr(root))
                return delta;

        /* A slot not marked IDR_FREE is already accounted for. */
        if (!node_tag_get(root, node, IDR_FREE, get_slot_offset(node, slot)))
                return 0;

        /* A free slot that held NULL gains an entry whatever @item is. */
        return old ? delta : 1;
}

/**
 * __radix_tree_replace - replace item in a slot
 * @root:       radix tree root
 * @node:       pointer to tree node containing @slot, or NULL
 * @slot:       pointer to slot in @node
 * @item:       new item to store in the slot
 *
 * For use with __radix_tree_lookup().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * When @node is given, its counters are updated and the node is passed
 * to delete_node() afterwards so that it can be freed if it became empty.
 */
void __radix_tree_replace(struct radix_tree_root *root,
                          struct radix_tree_node *node,
                          void __rcu **slot, void *item)
{
        void *prev = rcu_dereference_raw(*slot);
        int value_delta = !!xa_is_value(item) - !!xa_is_value(prev);
        int count_delta = calculate_count(root, node, slot, item, prev);

        /*
         * Replacing value entries and deleting entries is supported, but
         * it has to be accounted against the node unless the slot is
         * root->xa_head.
         */
        WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) &&
                        (count_delta || value_delta));
        replace_slot(slot, item, node, count_delta, value_delta);

        if (node)
                delete_node(root, node);
}

/**
 * radix_tree_replace_slot - replace item in a slot
 * @root:       radix tree root
 * @slot:       pointer to slot
 * @item:       new item to store in the slot
 *
 * For use with radix_tree_lookup_slot() and
 * radix_tree_gang_lookup_tag_slot().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * NOTE: No node is passed down, so no per-node accounting takes place.
 * This therefore cannot be used to switch between non-entries (empty
 * slots), regular entries, and value entries.  When switching from one
 * type of entry to another, or deleting, use __radix_tree_lookup() and
 * __radix_tree_replace() or radix_tree_iter_replace().
 */
void radix_tree_replace_slot(struct radix_tree_root *root,
                             void __rcu **slot, void *item)
{
        struct radix_tree_node *no_node = NULL;

        __radix_tree_replace(root, no_node, slot, item);
}
EXPORT_SYMBOL(radix_tree_replace_slot);

/**
 * radix_tree_iter_replace - replace item in a slot
 * @root:       radix tree root
 * @iter:       iterator state
 * @slot:       pointer to slot
 * @item:       new item to store in the slot
 *
 * For use with radix_tree_for_each_slot().  The node recorded in @iter is
 * passed to __radix_tree_replace() as the node containing @slot.
 * Caller must hold tree write locked.
 */
void radix_tree_iter_replace(struct radix_tree_root *root,
                                const struct radix_tree_iter *iter,
                                void __rcu **slot, void *item)
{
        struct radix_tree_node *node = iter->node;

        __radix_tree_replace(root, node, slot, item);
}

/*
 * Set @tag on the slot at @offset in @node and on the corresponding slot
 * of each ancestor, stopping early at the first level where the tag is
 * already set.  If the walk runs past the top node, the tag is set on
 * the root as well.
 */
static void node_tag_set(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                if (tag_get(node, tag, offset))
                        return;
                tag_set(node, tag, offset);
        }

        if (root_tag_get(root, tag))
                return;
        root_tag_set(root, tag);
}

/**
 * radix_tree_tag_set - set a tag on a radix tree node
 * @root:       radix tree root
 * @index:      index key
 * @tag:        tag index
 *
 * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) corresponding
 * to @index in the radix tree, on every node from the root down to the
 * leaf node, and on the root itself.
 *
 * Returns the address of the tagged item.  Setting a tag on a not-present
 * item is a bug.
 */
void *radix_tree_tag_set(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        unsigned long maxindex;

        radix_tree_load_root(root, &entry, &maxindex);
        BUG_ON(index > maxindex);

        while (radix_tree_is_internal_node(entry)) {
                struct radix_tree_node *parent = entry_to_node(entry);
                unsigned int offset;

                offset = radix_tree_descend(parent, &entry, index);
                BUG_ON(!entry);

                /* Leave bits that are already set alone. */
                if (tag_get(parent, tag, offset))
                        continue;
                tag_set(parent, tag, offset);
        }

        /* Finally make sure the root carries the tag as well. */
        if (!root_tag_get(root, tag))
                root_tag_set(root, tag);

        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_set);

/*
 * Clear @tag on the slot at @offset in @node.  While that leaves a node
 * with no slot carrying @tag, continue with the node's own slot in its
 * parent.  The walk stops as soon as a bit is found already clear or a
 * node still has the tag set elsewhere; if it runs past the top node,
 * the root's tag bit is cleared too.
 */
static void node_tag_clear(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                if (!tag_get(node, tag, offset))
                        return;

                tag_clear(node, tag, offset);

                /* Other slots still tagged: ancestors must keep the tag. */
                if (any_tag_set(node, tag))
                        return;
        }

        /* Nothing below the root is tagged any more. */
        if (!root_tag_get(root, tag))
                return;
        root_tag_clear(root, tag);
}

/**
 * radix_tree_tag_clear - clear a tag on a radix tree node
 * @root:       radix tree root
 * @index:      index key
 * @tag:        tag index
 *
 * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 * corresponding to @index in the radix tree.  If this causes the leaf
 * node to have no tags set then clear the tag in the next-to-leaf node,
 * etc.
 *
 * Returns the address of the tagged item on success, else NULL.  ie:
 * has the same return value and semantics as radix_tree_lookup().
 */
void *radix_tree_tag_clear(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        struct radix_tree_node *parent = NULL;
        unsigned long maxindex;
        int offset = 0;

        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return NULL;

        /* Walk down to the slot for @index, remembering its node. */
        while (radix_tree_is_internal_node(entry)) {
                parent = entry_to_node(entry);
                offset = radix_tree_descend(parent, &entry, index);
        }

        /* Nothing stored at @index: there is no tag to clear. */
        if (!entry)
                return NULL;

        node_tag_clear(root, parent, tag, offset);
        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_clear);

/**
 * radix_tree_iter_tag_clear - clear a tag on the current iterator entry
 * @root:       radix tree root
 * @iter:       iterator state
 * @tag:        tag to clear
 *
 * Clears @tag for the slot the iterator currently refers to, using the
 * node recorded in @iter and the offset derived from it.
 */
void radix_tree_iter_tag_clear(struct radix_tree_root *root,
                        const struct radix_tree_iter *iter, unsigned int tag)
{
        struct radix_tree_node *node = iter->node;
        unsigned int offset = iter_offset(iter);

        node_tag_clear(root, node, tag, offset);
}

/**
 * radix_tree_tag_get - get a tag on a radix tree node
 * @root:       radix tree root
 * @index:      index key
 * @tag:        tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Return values:
 *
 *  0: tag not present or not set
 *  1: tag set
 *
 * Note that the return value of this function may not be relied on, even if
 * the RCU lock is held, unless tag modification and node deletion are excluded
 * from concurrency.
 */
int radix_tree_tag_get(const struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        unsigned long maxindex;

        /* If the root is not tagged, nothing is tagged. */
        if (!root_tag_get(root, tag))
                return 0;

        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return 0;

        while (radix_tree_is_internal_node(entry)) {
                struct radix_tree_node *parent = entry_to_node(entry);
                unsigned int offset;

                offset = radix_tree_descend(parent, &entry, index);

                /* The tag must be set at every level on the way down. */
                if (!tag_get(parent, tag, offset))
                        return 0;
                /* A retry entry ends the walk; the tag counts as set. */
                if (entry == RADIX_TREE_RETRY)
                        return 1;
        }

        return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);

/*
 * Construct the iter->tags bit-mask from the node->tags[tag] array,
 * starting at the bit for @offset.  Without a node, a single set bit is
 * reported.
 */
static void set_iter_tags(struct radix_tree_iter *iter,
                                struct radix_tree_node *node, unsigned offset,
                                unsigned tag)
{
        unsigned word, bit;

        if (!node) {
                iter->tags = 1;
                return;
        }

        word = offset / BITS_PER_LONG;
        bit = offset % BITS_PER_LONG;

        iter->tags = node->tags[tag][word] >> bit;

        /* Last word of the array (always so if RADIX_TREE_TAG_LONGS == 1). */
        if (word >= RADIX_TREE_TAG_LONGS - 1)
                return;

        /* Fill the vacated high bits from the next word. */
        if (bit)
                iter->tags |= node->tags[tag][word + 1] <<
                                        (BITS_PER_LONG - bit);

        /* Clip chunk size, here only BITS_PER_LONG tags */
        iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
}

/*
 * Advance @iter by one index and reset its chunk state: next_index is
 * made equal to the new index and the cached tags are cleared.  @slot is
 * not used.  Always returns NULL.
 */
void __rcu **radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter)
{
        unsigned long resume_at = __radix_tree_iter_add(iter, 1);

        iter->index = resume_at;
        iter->next_index = resume_at;
        iter->tags = 0;

        return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to chunk first slot, or NULL if iteration is over
 *
 * The search starts at @iter->next_index.  On success @iter->index,
 * @iter->next_index and @iter->node are updated to describe the chunk
 * found; for tagged iteration @iter->tags is filled in as well.
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
                             struct radix_tree_iter *iter, unsigned flags)
{
        /* The tag index is carried in the low bits of @flags. */
        unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
        struct radix_tree_node *node, *child;
        unsigned long index, offset, maxindex;

        /* Tagged iteration: if the root lacks the tag there is nothing to find. */
        if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
                return NULL;

        /*
         * Catch next_index overflow after ~0UL. iter->index never overflows
         * during iterating; it can be zero only at the beginning.
         * And we cannot overflow iter->next_index in a single step,
         * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
         *
         * This condition also used by radix_tree_next_slot() to stop
         * contiguous iterating, and forbid switching to the next chunk.
         */
        index = iter->next_index;
        if (!index && iter->index)
                return NULL;

        /* (Re)start the walk from the root with the current @index. */
 restart:
        radix_tree_load_root(root, &child, &maxindex);
        if (index > maxindex)
                return NULL;
        if (!child)
                return NULL;

        if (!radix_tree_is_internal_node(child)) {
                /* Single-slot tree */
                iter->index = index;
                iter->next_index = maxindex + 1;
                iter->tags = 1;
                iter->node = NULL;
                return (void __rcu **)&root->xa_head;
        }

        do {
                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, index);

                /*
                 * A "hole" is an untagged slot for tagged iteration, or an
                 * empty slot otherwise.
                 */
                if ((flags & RADIX_TREE_ITER_TAGGED) ?
                                !tag_get(node, tag, offset) : !child) {
                        /* Hole detected */
                        if (flags & RADIX_TREE_ITER_CONTIG)
                                return NULL;

                        /* Scan forward in this node for the next candidate. */
                        if (flags & RADIX_TREE_ITER_TAGGED)
                                offset = radix_tree_find_next_bit(node, tag,
                                                offset + 1);
                        else
                                while (++offset        < RADIX_TREE_MAP_SIZE) {
                                        void *slot = rcu_dereference_raw(
                                                        node->slots[offset]);
                                        if (slot)
                                                break;
                                }
                        /* Rebase @index onto the slot the scan stopped at. */
                        index &= ~node_maxindex(node);
                        index += offset << node->shift;
                        /* Overflow after ~0UL */
                        if (!index)
                                return NULL;
                        /* Ran off the end of this node: retry from the root. */
                        if (offset == RADIX_TREE_MAP_SIZE)
                                goto restart;
                        child = rcu_dereference_raw(node->slots[offset]);
                }

                /* The slot emptied since it was examined; start over. */
                if (!child)
                        goto restart;
                /* Stop descending on a retry entry and report this slot. */
                if (child == RADIX_TREE_RETRY)
                        break;
        } while (node->shift && radix_tree_is_internal_node(child));

        /* Update the iterator state */
        iter->index = (index &~ node_maxindex(node)) | offset;
        iter->next_index = (index | node_maxindex(node)) + 1;
        iter->node = node;

        if (flags & RADIX_TREE_ITER_TAGGED)
                set_iter_tags(iter, node, offset, tag);

        return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);

/**
 * radix_tree_gang_lookup - perform multiple lookup on a radix tree
 * @root:        radix tree root
 * @results:     where the results of the lookup are placed
 * @first_index: start the lookup from this key
 * @max_items:   place up to this many items at *results
 *
 * Performs an index-ascending scan of the tree for present items.  Places
 * them at *@results and returns the number of items which were placed at
 * *@results.
 *
 * The implementation is naive.
 *
 * Like radix_tree_lookup, radix_tree_gang_lookup may be called under
 * rcu_read_lock.  In this case, rather than the returned results being
 * an atomic snapshot of the tree at a single point in time, the
 * semantics of an RCU protected gang lookup are as though multiple
 * radix_tree_lookups have been issued in individual locks, and results
 * stored in 'results'.
 */
unsigned int
radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
                        unsigned long first_index, unsigned int max_items)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_slot(slot, root, &iter, first_index) {
                void *entry = rcu_dereference_raw(*slot);

                /* Stored first; overwritten if the entry is not kept. */
                results[found] = entry;
                if (!entry)
                        continue;
                if (radix_tree_is_internal_node(entry)) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
                if (++found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);

/**
 * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
 *                              based on a tag
 * @root:        radix tree root
 * @results:     where the results of the lookup are placed
 * @first_index: start the lookup from this key
 * @max_items:   place up to this many items at *results
 * @tag:         the tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Performs an index-ascending scan of the tree for present items which
 * have the tag indexed by @tag set.  Places the items at *@results and
 * returns the number of items which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
                unsigned long first_index, unsigned int max_items,
                unsigned int tag)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                void *entry = rcu_dereference_raw(*slot);

                /* Stored first; overwritten if the entry is not kept. */
                results[found] = entry;
                if (!entry)
                        continue;
                if (radix_tree_is_internal_node(entry)) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
                if (++found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);

/**
 * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
 *                                   radix tree based on a tag
 * @root:        radix tree root
 * @results:     where the results of the lookup are placed
 * @first_index: start the lookup from this key
 * @max_items:   place up to this many items at *results
 * @tag:         the tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Performs an index-ascending scan of the tree for present items which
 * have the tag indexed by @tag set.  Places the slots at *@results and
 * returns the number of slots which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                results[found++] = slot;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);

/*
 * Remove the entry in @slot.  For an IDR the slot is marked IDR_FREE;
 * otherwise every tag on the slot is cleared.  The slot is then set to
 * NULL with the node's counters adjusted, and the node is handed to
 * delete_node() in case it became empty.
 *
 * Returns the result of delete_node(), or false when there is no node.
 */
static bool __radix_tree_delete(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot)
{
        unsigned int offset = get_slot_offset(node, slot);
        void *victim = rcu_dereference_raw(*slot);
        int value_delta = 0;
        int tag;

        /* A value entry also leaves the node's nr_values count. */
        if (xa_is_value(victim))
                value_delta = -1;

        if (!is_idr(root)) {
                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
                        node_tag_clear(root, node, tag, offset);
        } else {
                node_tag_set(root, node, IDR_FREE, offset);
        }

        replace_slot(slot, NULL, node, -1, value_delta);

        if (!node)
                return false;
        return delete_node(root, node);
}

/**
 * radix_tree_iter_delete - delete the entry at this iterator position
 * @root: radix tree root
 * @iter: iterator state
 * @slot: pointer to slot
 *
 * Delete the entry at the position currently pointed to by the iterator.
 * This may result in the current node being freed; if it is, the iterator
 * is advanced so that it will not reference the freed memory.  This
 * function may be called without any locking if there are no other threads
 * which can access this tree.
 */
void radix_tree_iter_delete(struct radix_tree_root *root,
                                struct radix_tree_iter *iter, void __rcu **slot)
{
        bool node_gone = __radix_tree_delete(root, iter->node, slot);

        /* Skip past the current chunk when its node has been freed. */
        if (node_gone)
                iter->index = iter->next_index;
}
EXPORT_SYMBOL(radix_tree_iter_delete);

/**
 * radix_tree_delete_item - delete an item from a radix tree
 * @root: radix tree root
 * @index: index key
 * @item: expected item, or %NULL to delete whatever is stored
 *
 * Remove @item at @index from the radix tree rooted at @root.
 *
 * Return: the deleted entry, or %NULL if it was not present
 * or the entry at the given @index was not @item.
 */
void *radix_tree_delete_item(struct radix_tree_root *root,
                             unsigned long index, void *item)
{
        struct radix_tree_node *node = NULL;
        void __rcu **slot = NULL;
        void *found;

        found = __radix_tree_lookup(root, index, &node, &slot);
        if (!slot)
                return NULL;

        if (!found) {
                /* Outside the IDR a NULL entry means nothing is stored. */
                if (!is_idr(root))
                        return NULL;
                /* In the IDR, a NULL entry in a free slot is unallocated. */
                if (node_tag_get(root, node, IDR_FREE,
                                 get_slot_offset(node, slot)))
                        return NULL;
        }

        /* Honour the caller's expectation when one was supplied. */
        if (item && found != item)
                return NULL;

        __radix_tree_delete(root, node, slot);

        return found;
}
EXPORT_SYMBOL(radix_tree_delete_item);

/**
 * radix_tree_delete - delete an entry from a radix tree
 * @root: radix tree root
 * @index: index key
 *
 * Remove the entry at @index from the radix tree rooted at @root,
 * whatever it is: this is radix_tree_delete_item() without an expected
 * item.
 *
 * Return: The deleted entry, or %NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
        void *removed = radix_tree_delete_item(root, index, NULL);

        return removed;
}
EXPORT_SYMBOL(radix_tree_delete);

/**
 * radix_tree_tagged - test whether any items in the tree are tagged
 * @root:       radix tree root
 * @tag:        tag to test
 *
 * Only the root's tag bit is consulted.
 *
 * Return: the value of root_tag_get() for @tag; non-zero means tagged.
 */
int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
{
        int tagged = root_tag_get(root, tag);

        return tagged;
}
EXPORT_SYMBOL(radix_tree_tagged);

/**
 * idr_preload - preload for idr_alloc()
 * @gfp_mask: allocation mask to use for preloading
 *
 * Preallocate memory to use for the next call to idr_alloc().  This function
 * returns with preemption disabled.  It will be enabled by idr_preload_end().
 */
void idr_preload(gfp_t gfp_mask)
{
        /* A zero result from the preload helper needs no further action. */
        if (!__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
                return;

        /*
         * The preload helper reported failure; take the preload lock here.
         * NOTE(review): presumably the helper already holds this lock when
         * it succeeds, so both paths return locked - confirm against
         * __radix_tree_preload().
         */
        local_lock(&radix_tree_preloads.lock);
}
EXPORT_SYMBOL(idr_preload);

/*
 * idr_get_free - find (creating nodes as needed) a slot tagged IDR_FREE
 * @root: tree to search
 * @iter: iterator; the search starts at @iter->next_index and, on success,
 *        @iter->index, @iter->next_index, @iter->node and the iterator tags
 *        are updated to describe the slot found
 * @gfp:  allocation flags passed to radix_tree_extend() and
 *        radix_tree_node_alloc()
 * @max:  largest index the caller will accept
 *
 * Note that the parameter @max coexists with the function-like max() macro
 * used below; "max(" is the macro, a bare "max" is the parameter.
 *
 * Return: pointer to the slot on success; ERR_PTR(-ENOSPC) when the search
 * index exceeds @max or wraps to 0; ERR_PTR(-ENOMEM) when a node cannot be
 * allocated; or the ERR_PTR-wrapped negative value from radix_tree_extend().
 */
void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max)
{
        struct radix_tree_node *node = NULL, *child;
        void __rcu **slot = (void __rcu **)&root->xa_head;
        unsigned long maxindex, start = iter->next_index;
        unsigned int shift, offset = 0;

 grow:
        /* (Re)load the root: current shift, top-level child and max index. */
        shift = radix_tree_load_root(root, &child, &maxindex);
        /* Root not tagged IDR_FREE: begin beyond the current maxindex. */
        if (!radix_tree_tagged(root, IDR_FREE))
                start = max(start, maxindex + 1);
        if (start > max)
                return ERR_PTR(-ENOSPC);

        if (start > maxindex) {
                /* Extend the tree to cover @start; the result is the new shift. */
                int error = radix_tree_extend(root, gfp, start, shift);
                if (error < 0)
                        return ERR_PTR(error);
                shift = error;
                child = rcu_dereference_raw(root->xa_head);
        }
        /* Force one pass through the loop below for index 0 at shift 0. */
        if (start == 0 && shift == 0)
                shift = RADIX_TREE_MAP_SHIFT;

        while (shift) {
                shift -= RADIX_TREE_MAP_SHIFT;
                if (child == NULL) {
                        /* Have to add a child node.  */
                        child = radix_tree_node_alloc(gfp, node, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return ERR_PTR(-ENOMEM);
                        /* Mark every slot of the new node IDR_FREE and link it in. */
                        all_tag_set(child, IDR_FREE);
                        rcu_assign_pointer(*slot, node_to_entry(child));
                        if (node)
                                node->count++;
                } else if (!radix_tree_is_internal_node(child))
                        break;

                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, start);
                if (!tag_get(node, IDR_FREE, offset)) {
                        /* Slot not free: move to the next IDR_FREE bit in this node. */
                        offset = radix_tree_find_next_bit(node, IDR_FREE,
                                                        offset + 1);
                        start = next_index(start, node, offset);
                        /* Past @max, or the index wrapped around to 0. */
                        if (start > max || start == 0)
                                return ERR_PTR(-ENOSPC);
                        /* No free bit in this node: climb until one may exist. */
                        while (offset == RADIX_TREE_MAP_SIZE) {
                                offset = node->offset + 1;
                                node = node->parent;
                                /* Climbed above the top node: restart from the root. */
                                if (!node)
                                        goto grow;
                                shift = node->shift;
                        }
                        child = rcu_dereference_raw(node->slots[offset]);
                }
                slot = &node->slots[offset];
        }

        /* Publish the result through the iterator. */
        iter->index = start;
        if (node)
                iter->next_index = 1 + min(max, (start | node_maxindex(node)));
        else
                iter->next_index = 1;
        iter->node = node;
        set_iter_tags(iter, node, offset, IDR_FREE);

        return slot;
}

/**
 * idr_destroy - release all internal memory from an IDR
 * @idr: idr handle
 *
 * After this function is called, the IDR is empty, and may be reused or
 * the data structure containing it may be freed.
 *
 * A typical clean-up sequence for objects stored in an idr tree will use
 * idr_for_each() to free all objects, if necessary, then idr_destroy() to
 * free the memory used to keep track of those objects.
 */
void idr_destroy(struct idr *idr)
{
        struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head);
        if (radix_tree_is_internal_node(node))
                radix_tree_free_nodes(node);
        idr->idr_rt.xa_head = NULL;
        root_tag_set(&idr->idr_rt, IDR_FREE);
}
EXPORT_SYMBOL(idr_destroy);

/*
 * Slab constructor for radix tree nodes: zero the whole node, then
 * initialise its private_list head.
 */
static void radix_tree_node_ctor(void *arg)
{
        struct radix_tree_node *node;

        /* memset() returns its first argument, i.e. the node itself. */
        node = memset(arg, 0, sizeof(struct radix_tree_node));
        INIT_LIST_HEAD(&node->private_list);
}

/*
 * CPU hotplug callback (registered in radix_tree_init()): release every
 * node still sitting in the per-cpu preload pool of @cpu.  Always returns 0.
 */
static int radix_tree_cpu_dead(unsigned int cpu)
{
        struct radix_tree_preload *pool = &per_cpu(radix_tree_preloads, cpu);
        struct radix_tree_node *victim;

        /* The pool is a singly linked list chained through ->parent. */
        for (; pool->nr; pool->nr--) {
                victim = pool->nodes;
                pool->nodes = victim->parent;
                kmem_cache_free(radix_tree_node_cachep, victim);
        }

        return 0;
}

/*
 * Boot-time setup: verify compile-time layout assumptions, create the slab
 * cache used for tree nodes and register the CPU-dead hotplug callback.
 */
void __init radix_tree_init(void)
{
        int err;

        /* Compile-time sanity checks on tag/flag packing and chunk size. */
        BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
        BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
        BUILD_BUG_ON(XA_CHUNK_SIZE > 255);

        /* SLAB_PANIC: a failure to create the cache panics, so no check here. */
        radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
                        sizeof(struct radix_tree_node), 0,
                        SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
                        radix_tree_node_ctor);

        /* No startup callback; only the teardown (CPU dead) one is needed. */
        err = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
                                        NULL, radix_tree_cpu_dead);
        WARN_ON(err < 0);
}















































   15 


   15 




   15 




   15 
   15 







    8 

    3 

    8 





    6 
    8 







































   17 





   17 


   17 
   17 


    6 




































































   17 





    4 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * AEAD: Authenticated Encryption with Associated Data
 *
 * This file provides API support for AEAD algorithms.
 *
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/aead.h>
#include <linux/cryptouser.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/string_choices.h>
#include <net/netlink.h>

#include "internal.h"

/*
 * Set a key whose address does not satisfy the transform's alignment mask:
 * copy it into a temporary, suitably aligned buffer, call the algorithm's
 * ->setkey() on the copy, then free the buffer with kfree_sensitive().
 *
 * Returns -ENOMEM if the buffer cannot be allocated, otherwise the result
 * of ->setkey().
 */
static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
                            unsigned int keylen)
{
        unsigned long mask = crypto_aead_alignmask(tfm);
        /* Over-allocate by the mask so an aligned start always fits. */
        unsigned long bufsize = keylen + mask;
        u8 *raw, *aligned;
        int err;

        raw = kmalloc(bufsize, GFP_ATOMIC);
        if (!raw)
                return -ENOMEM;

        aligned = (u8 *)ALIGN((unsigned long)raw, mask + 1);
        memcpy(aligned, key, keylen);
        err = crypto_aead_alg(tfm)->setkey(tfm, aligned, keylen);

        kfree_sensitive(raw);
        return err;
}

/*
 * Set the key of an AEAD transform.  A misaligned @key is routed through
 * setkey_unaligned(); otherwise the algorithm's ->setkey() is called
 * directly.  CRYPTO_TFM_NEED_KEY is set on failure and cleared on success.
 *
 * Returns 0 on success or the error from the setkey path.
 */
int crypto_aead_setkey(struct crypto_aead *tfm,
                       const u8 *key, unsigned int keylen)
{
        unsigned long mask = crypto_aead_alignmask(tfm);
        int rc;

        rc = ((unsigned long)key & mask) ?
                setkey_unaligned(tfm, key, keylen) :
                crypto_aead_alg(tfm)->setkey(tfm, key, keylen);

        if (likely(!rc)) {
                crypto_aead_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);
                return 0;
        }

        /* Failed: flag the transform as still needing a key. */
        crypto_aead_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
        return rc;
}
EXPORT_SYMBOL_GPL(crypto_aead_setkey);

int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
{
        int err;

        if ((!authsize && crypto_aead_maxauthsize(tfm)) ||
            authsize > crypto_aead_maxauthsize(tfm))
                return -EINVAL;

        if (crypto_aead_alg(tfm)->setauthsize) {
                err = crypto_aead_alg(tfm)->setauthsize(tfm, authsize);
                if (err)
                        return err;
        }

        tfm->authsize = authsize;
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_aead_setauthsize);

int crypto_aead_encrypt(struct aead_request *req)
{
        struct crypto_aead *aead = crypto_aead_reqtfm(req);

        if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return crypto_aead_alg(aead)->encrypt(req);
}
EXPORT_SYMBOL_GPL(crypto_aead_encrypt);

/*
 * Decrypt @req via the algorithm's ->decrypt().
 *
 * Returns -ENOKEY if the transform is still flagged CRYPTO_TFM_NEED_KEY and
 * -EINVAL if the request's cryptlen is smaller than the transform's authsize.
 */
int crypto_aead_decrypt(struct aead_request *req)
{
        struct crypto_aead *tfm = crypto_aead_reqtfm(req);

        if (crypto_aead_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        /* The input must be at least as long as the authentication tag. */
        if (crypto_aead_authsize(tfm) > req->cryptlen)
                return -EINVAL;

        return crypto_aead_alg(tfm)->decrypt(req);
}
EXPORT_SYMBOL_GPL(crypto_aead_decrypt);

static void crypto_aead_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_aead *aead = __crypto_aead_cast(tfm);
        struct aead_alg *alg = crypto_aead_alg(aead);

        alg->exit(aead);
}

/*
 * Initialise a freshly allocated AEAD transform: mark it as needing a key,
 * default the authsize to the algorithm's maximum, hook up the exit
 * callback when the algorithm has one, and finally run the algorithm's
 * ->init() if present.
 *
 * Returns the result of ->init(), or 0 when there is none.
 */
static int crypto_aead_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_aead *aead = __crypto_aead_cast(tfm);
        struct aead_alg *alg = crypto_aead_alg(aead);

        aead->authsize = alg->maxauthsize;
        crypto_aead_set_flags(aead, CRYPTO_TFM_NEED_KEY);

        if (alg->exit)
                aead->base.exit = crypto_aead_exit_tfm;

        return alg->init ? alg->init(aead) : 0;
}

/*
 * Fill a crypto_report_aead for @alg and append it to @skb as a
 * CRYPTOCFGA_REPORT_AEAD attribute.  Returns the result of nla_put().
 */
static int __maybe_unused crypto_aead_report(struct sk_buff *skb,
                                             struct crypto_alg *alg)
{
        struct aead_alg *aead = container_of(alg, struct aead_alg, base);
        struct crypto_report_aead raead;

        /* Zero the whole struct first so no stale stack bytes are copied out. */
        memset(&raead, 0, sizeof(raead));

        raead.ivsize = aead->ivsize;
        raead.maxauthsize = aead->maxauthsize;
        raead.blocksize = alg->cra_blocksize;

        strscpy(raead.type, "aead", sizeof(raead.type));
        strscpy(raead.geniv, "<none>", sizeof(raead.geniv));

        return nla_put(skb, CRYPTOCFGA_REPORT_AEAD, sizeof(raead), &raead);
}

static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
{
        struct aead_alg *aead = container_of(alg, struct aead_alg, base);

        seq_printf(m, "type         : aead\n");
        seq_printf(m, "async        : %s\n",
                   str_yes_no(alg->cra_flags & CRYPTO_ALG_ASYNC));
        seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
        seq_printf(m, "ivsize       : %u\n", aead->ivsize);
        seq_printf(m, "maxauthsize  : %u\n", aead->maxauthsize);
        seq_printf(m, "geniv        : <none>\n");
}

/*
 * ->free hook of crypto_aead_type: convert the generic instance to its
 * aead_instance and invoke that instance's own ->free() callback.
 * aead_register_instance() refuses instances without a ->free callback.
 */
static void crypto_aead_free_instance(struct crypto_instance *inst)
{
        struct aead_instance *aead = aead_instance(inst);

        aead->free(aead);
}

/*
 * Frontend type descriptor for AEAD algorithms.  It is installed as
 * cra_type by aead_prepare_alg() and passed to the generic allocation,
 * lookup and spawn helpers by the crypto_*_aead() wrappers in this file.
 */
static const struct crypto_type crypto_aead_type = {
        .extsize = crypto_alg_extsize,
        .init_tfm = crypto_aead_init_tfm,
        .free = crypto_aead_free_instance,
        /* /proc output is only wired up when procfs is built in. */
#ifdef CONFIG_PROC_FS
        .show = crypto_aead_show,
#endif
        /* Netlink reporting is only wired up with the crypto user API. */
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_aead_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_MASK,
        .type = CRYPTO_ALG_TYPE_AEAD,
        /* Offsets of the generic base object inside the AEAD wrappers. */
        .tfmsize = offsetof(struct crypto_aead, base),
        .algsize = offsetof(struct aead_alg, base),
};

/*
 * Grab an AEAD algorithm by @name for use by @inst through @spawn.
 * The spawn's frontend is set to the AEAD type before the generic
 * crypto_grab_spawn() is called; its return value is passed through.
 */
int crypto_grab_aead(struct crypto_aead_spawn *spawn,
                     struct crypto_instance *inst,
                     const char *name, u32 type, u32 mask)
{
        spawn->base.frontend = &crypto_aead_type;
        return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_aead);

/*
 * Allocate an AEAD transform for @alg_name: a thin wrapper around
 * crypto_alloc_tfm() with the AEAD frontend type.  The result of
 * crypto_alloc_tfm() is returned unchanged.
 */
struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask)
{
        return crypto_alloc_tfm(alg_name, &crypto_aead_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_alloc_aead);

/*
 * Query whether an AEAD algorithm matching @alg_name, @type and @mask is
 * available: a thin wrapper around crypto_type_has_alg() with the AEAD
 * frontend type, whose result is returned unchanged.
 */
int crypto_has_aead(const char *alg_name, u32 type, u32 mask)
{
        return crypto_type_has_alg(alg_name, &crypto_aead_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_aead);

/*
 * Validate and normalise an AEAD algorithm before registration.
 *
 * Rejects algorithms whose maxauthsize, ivsize or chunksize exceeds
 * PAGE_SIZE / 8, defaults a zero chunksize to the block size, and stamps
 * the base algorithm with the AEAD type and type flag.
 *
 * Returns 0 on success or -EINVAL.
 */
static int aead_prepare_alg(struct aead_alg *alg)
{
        struct crypto_alg *base = &alg->base;
        const unsigned int limit = PAGE_SIZE / 8;

        /* Checked before the chunksize default below is applied. */
        if (max3(alg->maxauthsize, alg->ivsize, alg->chunksize) > limit)
                return -EINVAL;

        if (!alg->chunksize)
                alg->chunksize = base->cra_blocksize;

        base->cra_type = &crypto_aead_type;
        /* Replace whatever type bits were set with the AEAD type. */
        base->cra_flags = (base->cra_flags & ~CRYPTO_ALG_TYPE_MASK) |
                          CRYPTO_ALG_TYPE_AEAD;

        return 0;
}

/*
 * Register a single AEAD algorithm: run aead_prepare_alg() and, if that
 * succeeds, hand the base algorithm to crypto_register_alg().
 *
 * Returns 0 on success or the first error encountered.
 */
int crypto_register_aead(struct aead_alg *alg)
{
        int rc = aead_prepare_alg(alg);

        return rc ? rc : crypto_register_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_register_aead);

/* Unregister a single AEAD algorithm via its embedded base algorithm. */
void crypto_unregister_aead(struct aead_alg *alg)
{
        crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_aead);

/*
 * Register @count AEAD algorithms from the array @algs, in order.
 *
 * If any registration fails, the algorithms registered so far are
 * unregistered again in reverse order and the error is returned.
 *
 * Returns 0 when every algorithm was registered.
 */
int crypto_register_aeads(struct aead_alg *algs, int count)
{
        int idx, rc;

        for (idx = 0; idx < count; idx++) {
                rc = crypto_register_aead(&algs[idx]);
                if (!rc)
                        continue;

                /* Roll back entries idx-1 .. 0; entry idx never registered. */
                while (--idx >= 0)
                        crypto_unregister_aead(&algs[idx]);

                return rc;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_aeads);

/*
 * Unregister @count AEAD algorithms from the array @algs, walking the
 * array from the last element down to the first.
 */
void crypto_unregister_aeads(struct aead_alg *algs, int count)
{
        int idx = count - 1;

        while (idx >= 0) {
                crypto_unregister_aead(&algs[idx]);
                --idx;
        }
}
EXPORT_SYMBOL_GPL(crypto_unregister_aeads);

/*
 * Register an AEAD template instance.
 *
 * The instance must supply a ->free callback (a missing one triggers a
 * warning and -EINVAL).  The embedded algorithm goes through
 * aead_prepare_alg() before the instance is handed to
 * crypto_register_instance().
 *
 * Returns 0 on success or a negative error.
 */
int aead_register_instance(struct crypto_template *tmpl,
                           struct aead_instance *inst)
{
        int rc;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        rc = aead_prepare_alg(&inst->alg);

        return rc ? rc :
                crypto_register_instance(tmpl, aead_crypto_instance(inst));
}
EXPORT_SYMBOL_GPL(aead_register_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Authenticated Encryption with Associated Data (AEAD)");































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
// SPDX-License-Identifier: GPL-2.0
/*
 * kobject.h - generic kernel object infrastructure.
 *
 * Copyright (c) 2002-2003 Patrick Mochel
 * Copyright (c) 2002-2003 Open Source Development Labs
 * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (c) 2006-2008 Novell Inc.
 *
 * Please read Documentation/core-api/kobject.rst before using the kobject
 * interface, ESPECIALLY the parts about reference counts and object
 * destructors.
 */

#ifndef _KOBJECT_H_
#define _KOBJECT_H_

#include <linux/types.h>
#include <linux/list.h>
#include <linux/sysfs.h>
#include <linux/compiler.h>
#include <linux/container_of.h>
#include <linux/spinlock.h>
#include <linux/kref.h>
#include <linux/kobject_ns.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/workqueue.h>
#include <linux/uidgid.h>

#define UEVENT_HELPER_PATH_LEN                256
#define UEVENT_NUM_ENVP                        64        /* number of env pointers */
#define UEVENT_BUFFER_SIZE                2048        /* buffer for the variables */

#ifdef CONFIG_UEVENT_HELPER
/* path to the userspace helper executed on an event */
extern char uevent_helper[];
#endif

/* counter to tag the uevent, read only except for the kobject core */
extern atomic64_t uevent_seqnum;

/*
 * The actions here must match the index to the string array
 * in lib/kobject_uevent.c
 *
 * Because the values are used as array indices, the enumerators carry no
 * explicit values: they number consecutively from 0 in declaration order,
 * so inserting or reordering entries changes that mapping.
 *
 * Do not add new actions here without checking with the driver-core
 * maintainers. Action strings are not meant to express subsystem
 * or device specific properties. In most cases you want to send a
 * kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event
 * specific variables added to the event environment.
 */
enum kobject_action {
        KOBJ_ADD,
        KOBJ_REMOVE,
        KOBJ_CHANGE,
        KOBJ_MOVE,
        KOBJ_ONLINE,
        KOBJ_OFFLINE,
        KOBJ_BIND,
        KOBJ_UNBIND,
};

/*
 * struct kobject - the generic kernel object.
 *
 * Holds the object's name, its links to a parent kobject and a kset, its
 * kobj_type, its sysfs directory entry and its reference count (kref).
 * See Documentation/core-api/kobject.rst for the usage rules.
 */
struct kobject {
        const char                *name;
        struct list_head        entry;
        struct kobject                *parent;
        struct kset                *kset;
        const struct kobj_type        *ktype;
        struct kernfs_node        *sd; /* sysfs directory entry */
        struct kref                kref;

        /* One-bit state flags. */
        unsigned int state_initialized:1;
        unsigned int state_in_sysfs:1;
        unsigned int state_add_uevent_sent:1;
        unsigned int state_remove_uevent_sent:1;
        unsigned int uevent_suppress:1;

        /* Only present when CONFIG_DEBUG_KOBJECT_RELEASE is enabled. */
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
        struct delayed_work        release;
#endif
};

__printf(2, 3) int kobject_set_name(struct kobject *kobj, const char *name, ...);
__printf(2, 0) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs);

/* Return the name stored in @kobj; @kobj itself must not be NULL. */
static inline const char *kobject_name(const struct kobject *kobj)
{
        return kobj->name;
}

void kobject_init(struct kobject *kobj, const struct kobj_type *ktype);
__printf(3, 4) __must_check int kobject_add(struct kobject *kobj,
                                            struct kobject *parent,
                                            const char *fmt, ...);
__printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj,
                                                     const struct kobj_type *ktype,
                                                     struct kobject *parent,
                                                     const char *fmt, ...);

void kobject_del(struct kobject *kobj);

struct kobject * __must_check kobject_create_and_add(const char *name, struct kobject *parent);

int __must_check kobject_rename(struct kobject *, const char *new_name);
int __must_check kobject_move(struct kobject *, struct kobject *);

struct kobject *kobject_get(struct kobject *kobj);
struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj);
void kobject_put(struct kobject *kobj);

const void *kobject_namespace(const struct kobject *kobj);
void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid);
char *kobject_get_path(const struct kobject *kobj, gfp_t flag);

/*
 * struct kobj_type - per-type operations and defaults for kobjects.
 *
 * A kobject points at one of these through its ->ktype member.  It bundles
 * the release callback, the sysfs operations, the default attribute groups
 * and the optional namespace / ownership callbacks.
 */
struct kobj_type {
        void (*release)(struct kobject *kobj);
        const struct sysfs_ops *sysfs_ops;
        const struct attribute_group **default_groups;
        const struct kobj_ns_type_operations *(*child_ns_type)(const struct kobject *kobj);
        const void *(*namespace)(const struct kobject *kobj);
        void (*get_ownership)(const struct kobject *kobj, kuid_t *uid, kgid_t *gid);
};

/*
 * struct kobj_uevent_env - environment assembled for one uevent.
 *
 * @envp holds up to UEVENT_NUM_ENVP pointers and @buf provides
 * UEVENT_BUFFER_SIZE bytes of storage.
 * NOTE(review): presumably @envp_idx and @buflen track how much of @envp
 * and @buf is in use, as maintained by add_uevent_var() - confirm.
 */
struct kobj_uevent_env {
        char *argv[3];
        char *envp[UEVENT_NUM_ENVP];
        int envp_idx;
        char buf[UEVENT_BUFFER_SIZE];
        int buflen;
};

/*
 * struct kset_uevent_ops - uevent callbacks a kset may provide.
 *
 * All three members are const function pointers, so they can only be set
 * when the structure is initialised.  Per the struct kset documentation
 * they let a kset filter uevents or add environment variables.
 */
struct kset_uevent_ops {
        int (* const filter)(const struct kobject *kobj);
        const char *(* const name)(const struct kobject *kobj);
        int (* const uevent)(const struct kobject *kobj, struct kobj_uevent_env *env);
};

/*
 * struct kobj_attribute - a sysfs attribute with kobject-level callbacks.
 *
 * Embeds a struct attribute and adds @show / @store callbacks that receive
 * the kobject and this attribute.  @show is given an output buffer; @store
 * is given the input buffer and its length.
 */
struct kobj_attribute {
        struct attribute attr;
        ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
                        char *buf);
        ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
                         const char *buf, size_t count);
};

extern const struct sysfs_ops kobj_sysfs_ops;

struct sock;

/**
 * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
 *
 * @list: the list of all kobjects for this kset
 * @list_lock: a lock for iterating over the kobjects
 * @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
 * @uevent_ops: the set of uevent operations for this kset.  These are
 * called whenever a kobject has something happen to it so that the kset
 * can add new environment variables, or filter out the uevents if so
 * desired.
 *
 * A kset defines a group of kobjects.  They can be individually
 * different "types" but overall these kobjects all want to be grouped
 * together and operated on in the same manner.  ksets are used to
 * define the attribute callbacks and other common events that happen to
 * a kobject.
 *
 * The structure is marked __randomize_layout, so code must not rely on
 * the declared order of its members.
 */
struct kset {
        struct list_head list;
        spinlock_t list_lock;
        struct kobject kobj;
        const struct kset_uevent_ops *uevent_ops;
} __randomize_layout;

void kset_init(struct kset *kset);
int __must_check kset_register(struct kset *kset);
void kset_unregister(struct kset *kset);
struct kset * __must_check kset_create_and_add(const char *name, const struct kset_uevent_ops *u,
                                               struct kobject *parent_kobj);

/*
 * Convert a pointer to a kset's embedded kobject back to the kset.
 * A NULL @kobj yields NULL.
 */
static inline struct kset *to_kset(struct kobject *kobj)
{
        if (!kobj)
                return NULL;

        return container_of(kobj, struct kset, kobj);
}

/*
 * Take a reference on @k through its embedded kobject and return the kset
 * that kobject_get() hands back.  A NULL @k yields NULL.
 */
static inline struct kset *kset_get(struct kset *k)
{
        if (!k)
                return NULL;

        return to_kset(kobject_get(&k->kobj));
}

/*
 * Drop a reference on @k by calling kobject_put() on its embedded kobject.
 *
 * A NULL @k is ignored, matching kset_get().  Without the check,
 * &k->kobj on a NULL pointer would form a bogus non-NULL address, because
 * the embedded kobject is not the first member of struct kset (and the
 * struct layout may be randomized).
 */
static inline void kset_put(struct kset *k)
{
        if (k)
                kobject_put(&k->kobj);
}

/* Return the kobj_type recorded in @kobj; @kobj itself must not be NULL. */
static inline const struct kobj_type *get_ktype(const struct kobject *kobj)
{
        return kobj->ktype;
}

struct kobject *kset_find_obj(struct kset *, const char *);

/* The global /sys/kernel/ kobject for people to chain off of */
extern struct kobject *kernel_kobj;
/* The global /sys/kernel/mm/ kobject for people to chain off of */
extern struct kobject *mm_kobj;
/* The global /sys/hypervisor/ kobject for people to chain off of */
extern struct kobject *hypervisor_kobj;
/* The global /sys/power/ kobject for people to chain off of */
extern struct kobject *power_kobj;
/* The global /sys/firmware/ kobject for people to chain off of */
extern struct kobject *firmware_kobj;

int kobject_uevent(struct kobject *kobj, enum kobject_action action);
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
                        char *envp[]);
int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count);

__printf(2, 3)
int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...);

#endif /* _KOBJECT_H_ */






















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  pm_wakeup.h - Power management wakeup interface
 *
 *  Copyright (C) 2008 Alan Stern
 *  Copyright (C) 2010 Rafael J. Wysocki, Novell Inc.
 */

#ifndef _LINUX_PM_WAKEUP_H
#define _LINUX_PM_WAKEUP_H

#ifndef _DEVICE_H_
# error "Please do not include this file directly."
#endif

#include <linux/types.h>

struct wake_irq;

/**
 * struct wakeup_source - Representation of wakeup sources
 *
 * @name: Name of the wakeup source
 * @id: Wakeup source id
 * @entry: Wakeup source list entry
 * @lock: Wakeup source lock
 * @wakeirq: Optional device specific wakeirq
 * @timer: Wakeup timer list
 * @timer_expires: Wakeup timer expiration
 * @total_time: Total time this wakeup source has been active.
 * @max_time: Maximum time this wakeup source has been continuously active.
 * @last_time: Monotonic clock when the wakeup source was touched last time.
 * @start_prevent_time: Presumably the time at which this source started
 *	preventing autosleep (companion of @prevent_sleep_time) - confirm
 *	against drivers/base/power/wakeup.c.
 * @prevent_sleep_time: Total time this source has been preventing autosleep.
 * @event_count: Number of signaled wakeup events.
 * @active_count: Number of times the wakeup source was activated.
 * @relax_count: Number of times the wakeup source was deactivated.
 * @expire_count: Number of times the wakeup source's timeout has expired.
 * @wakeup_count: Number of times the wakeup source might abort suspend.
 * @dev: Struct device for sysfs statistics about the wakeup source.
 * @active: Status of the wakeup source.
 * @autosleep_enabled: Autosleep is active, so update @prevent_sleep_time.
 */
struct wakeup_source {
        const char                 *name;
        int                        id;
        struct list_head        entry;
        spinlock_t                lock;
        struct wake_irq                *wakeirq;
        struct timer_list        timer;
        unsigned long                timer_expires;
        ktime_t total_time;
        ktime_t max_time;
        ktime_t last_time;
        ktime_t start_prevent_time;
        ktime_t prevent_sleep_time;
        unsigned long                event_count;
        unsigned long                active_count;
        unsigned long                relax_count;
        unsigned long                expire_count;
        unsigned long                wakeup_count;
        struct device                *dev;
        bool                        active:1;
        bool                        autosleep_enabled:1;
};

/*
 * Iterate over wakeup sources: @ws starts at wakeup_sources_walk_start()
 * and advances with wakeup_sources_walk_next() until it becomes NULL.
 * @ws is evaluated several times, so pass a plain lvalue.
 * NOTE(review): presumably the walk must run between
 * wakeup_sources_read_lock() and wakeup_sources_read_unlock() - confirm.
 */
#define for_each_wakeup_source(ws) \
        for ((ws) = wakeup_sources_walk_start();        \
             (ws);                                        \
             (ws) = wakeup_sources_walk_next((ws)))

#ifdef CONFIG_PM_SLEEP

/*
 * Changes to device_may_wakeup take effect on the next pm state change.
 */

/* Report the device's power.can_wakeup flag (wakeup capability). */
static inline bool device_can_wakeup(struct device *dev)
{
        return dev->power.can_wakeup;
}

static inline bool device_may_wakeup(struct device *dev)
{
        return dev->power.can_wakeup && !!dev->power.wakeup;
}

/* Report the device's power.wakeup_path flag. */
static inline bool device_wakeup_path(struct device *dev)
{
        return dev->power.wakeup_path;
}

/* Set the device's power.wakeup_path flag; nothing here clears it again. */
static inline void device_set_wakeup_path(struct device *dev)
{
        dev->power.wakeup_path = true;
}

/* drivers/base/power/wakeup.c */
extern struct wakeup_source *wakeup_source_register(struct device *dev,
                                                    const char *name);
extern void wakeup_source_unregister(struct wakeup_source *ws);
extern int wakeup_sources_read_lock(void);
extern void wakeup_sources_read_unlock(int idx);
extern struct wakeup_source *wakeup_sources_walk_start(void);
extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws);
extern int device_wakeup_enable(struct device *dev);
extern void device_wakeup_disable(struct device *dev);
extern void device_set_wakeup_capable(struct device *dev, bool capable);
extern int device_set_wakeup_enable(struct device *dev, bool enable);
extern void __pm_stay_awake(struct wakeup_source *ws);
extern void pm_stay_awake(struct device *dev);
extern void __pm_relax(struct wakeup_source *ws);
extern void pm_relax(struct device *dev);
extern void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard);
extern void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard);

#else /* !CONFIG_PM_SLEEP */

/* !CONFIG_PM_SLEEP variant: just record the capability flag. */
static inline void device_set_wakeup_capable(struct device *dev, bool capable)
{
        dev->power.can_wakeup = capable;
}

/* !CONFIG_PM_SLEEP variant: report the device's power.can_wakeup flag. */
static inline bool device_can_wakeup(struct device *dev)
{
        return dev->power.can_wakeup;
}

/*
 * !CONFIG_PM_SLEEP variant: no wakeup source is created; always returns
 * NULL, so callers must be prepared for a NULL wakeup source.
 */
static inline struct wakeup_source *wakeup_source_register(struct device *dev,
                                                           const char *name)
{
        return NULL;
}

/* !CONFIG_PM_SLEEP variant: nothing was registered, so nothing to undo. */
static inline void wakeup_source_unregister(struct wakeup_source *ws) {}

/*
 * !CONFIG_PM_SLEEP variant: record the request in power.should_wakeup.
 * Always returns 0.
 */
static inline int device_wakeup_enable(struct device *dev)
{
        dev->power.should_wakeup = true;
        return 0;
}

/* !CONFIG_PM_SLEEP variant: clear power.should_wakeup. */
static inline void device_wakeup_disable(struct device *dev)
{
        dev->power.should_wakeup = false;
}

/*
 * !CONFIG_PM_SLEEP variant: store @enable in power.should_wakeup.
 * Always returns 0.
 */
static inline int device_set_wakeup_enable(struct device *dev, bool enable)
{
        dev->power.should_wakeup = enable;
        return 0;
}

static inline bool device_may_wakeup(struct device *dev)
{
        return dev->power.can_wakeup && dev->power.should_wakeup;
}

/* !CONFIG_PM_SLEEP variant: no device is ever on a wakeup path. */
static inline bool device_wakeup_path(struct device *dev)
{
        return false;
}

/*
 * !CONFIG_PM_SLEEP no-op stubs.  Each has the same signature as its
 * CONFIG_PM_SLEEP counterpart declared earlier in this header and an empty
 * body, so callers compile unchanged and the calls have no effect.
 */
static inline void device_set_wakeup_path(struct device *dev) {}

static inline void __pm_stay_awake(struct wakeup_source *ws) {}

static inline void pm_stay_awake(struct device *dev) {}

static inline void __pm_relax(struct wakeup_source *ws) {}

static inline void pm_relax(struct device *dev) {}

static inline void pm_wakeup_ws_event(struct wakeup_source *ws,
                                      unsigned int msec, bool hard) {}

static inline void pm_wakeup_dev_event(struct device *dev, unsigned int msec,
                                       bool hard) {}

#endif /* !CONFIG_PM_SLEEP */

/* Alias of device_wakeup_path(): returns that function's result. */
static inline bool device_awake_path(struct device *dev)
{
        return device_wakeup_path(dev);
}

/* Alias of device_set_wakeup_path(). */
static inline void device_set_awake_path(struct device *dev)
{
        device_set_wakeup_path(dev);
}

/* pm_wakeup_ws_event() on @ws with @msec and the "hard" argument false. */
static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec)
{
        pm_wakeup_ws_event(ws, msec, false);
}

/* pm_wakeup_dev_event() on @dev with @msec and the "hard" argument false. */
static inline void pm_wakeup_event(struct device *dev, unsigned int msec)
{
        pm_wakeup_dev_event(dev, msec, false);
}

/* Report a "hard" wakeup event on @dev: pm_wakeup_dev_event() with msec 0, hard == true. */
static inline void pm_wakeup_hard_event(struct device *dev)
{
        const unsigned int msec = 0;
        const bool hard = true;

        pm_wakeup_dev_event(dev, msec, hard);
}

/**
 * device_init_wakeup - Device wakeup initialization.
 * @dev: Device to handle.
 * @enable: Whether or not to enable @dev as a wakeup device.
 *
 * By default, most devices should leave wakeup disabled.  The exceptions are
 * devices that everyone expects to be wakeup sources: keyboards, power buttons,
 * possibly network interfaces, etc.  Also, devices that don't generate their
 * own wakeup requests but merely forward requests from one bus to another
 * (like PCI bridges) should have wakeup enabled by default.
 *
 * Return: 0 when @enable is false, otherwise the result of
 * device_wakeup_enable().
 */
static inline int device_init_wakeup(struct device *dev, bool enable)
{
        if (!enable) {
                /* Tear down in the reverse order of the enable path below. */
                device_wakeup_disable(dev);
                device_set_wakeup_capable(dev, false);
                return 0;
        }

        device_set_wakeup_capable(dev, true);
        return device_wakeup_enable(dev);
}

/*
 * devm action callback used by devm_device_init_wakeup(): @dev is the
 * struct device pointer handed over as action data, and wakeup is switched
 * back off through device_init_wakeup(dev, false).
 *
 * Declared static inline, like every other function defined in this header,
 * so that merely including the header cannot produce unused-function
 * diagnostics in translation units that never reference it.
 */
static inline void device_disable_wakeup(void *dev)
{
        device_init_wakeup(dev, false);
}

/**
 * devm_device_init_wakeup - Resource managed device wakeup initialization.
 * @dev: Device to handle.
 *
 * This function is the devm managed version of device_init_wakeup(dev, true).
 *
 * Return: the result of devm_add_action_or_reset().
 */
static inline int devm_device_init_wakeup(struct device *dev)
{
        /* NOTE(review): the int result of device_init_wakeup() is discarded here. */
        device_init_wakeup(dev, true);
        /* Queue device_disable_wakeup(dev) as the devm action for @dev. */
        return devm_add_action_or_reset(dev, device_disable_wakeup, dev);
}

#endif /* _LINUX_PM_WAKEUP_H */



















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NUMA memory policies for Linux.
 * Copyright 2003,2004 Andi Kleen SuSE Labs
 */
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1

#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/node.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>

struct mm_struct;

#define NO_INTERLEAVE_INDEX (-1UL)        /* use task il_prev for interleaving */

#ifdef CONFIG_NUMA

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
        atomic_t refcnt;        /* reference count; incremented by mpol_get() */
        unsigned short mode;         /* See MPOL_* above */
        unsigned short flags;        /* See set_mempolicy() MPOL_F_* above */
        nodemask_t nodes;        /* interleave/bind/preferred/etc */
        int home_node;                /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */

        /* Only one of the two masks is meaningful at a time (they share storage). */
        union {
                nodemask_t cpuset_mems_allowed;        /* relative to these nodes */
                nodemask_t user_nodemask;        /* nodemask passed by user */
        } w;
};

/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */

extern void __mpol_put(struct mempolicy *pol);
/* Drop a reference on @pol through __mpol_put(); a NULL @pol is a no-op. */
static inline void mpol_put(struct mempolicy *pol)
{
        if (!pol)
                return;

        __mpol_put(pol);
}

/*
 * Does mempolicy pol need explicit unref after use?
 * Currently only needed for shared policies, i.e. a non-NULL @pol whose
 * flags contain MPOL_F_SHARED.  Returns 1 in that case and 0 otherwise.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
        if (!pol)
                return 0;

        return (pol->flags & MPOL_F_SHARED) != 0;
}

/* Call __mpol_put() on @pol only when mpol_needs_cond_ref() says it is required. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
        if (!mpol_needs_cond_ref(pol))
                return;

        __mpol_put(pol);
}

extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
/* Duplicate @pol via __mpol_dup(); a NULL @pol is passed straight back. */
static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
{
        if (!pol)
                return pol;

        return __mpol_dup(pol);
}

/* Take a reference on @pol by incrementing pol->refcnt; NULL is ignored. */
static inline void mpol_get(struct mempolicy *pol)
{
        if (!pol)
                return;

        atomic_inc(&pol->refcnt);
}

extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
/*
 * Compare two policies.  Identical pointers (including two NULLs) are equal
 * without further work; anything else is decided by __mpol_equal().
 */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return a == b || __mpol_equal(a, b);
}

/*
 * Tree of shared policies for a shared memory region.
 */
struct shared_policy {
        struct rb_root root;        /* rbtree root; presumably holds sp_node entries - confirm */
        rwlock_t lock;
};
/* One tree entry: an rbtree link, a pgoff_t range and the policy attached to it. */
struct sp_node {
        struct rb_node nd;
        pgoff_t start, end;        /* NOTE(review): inclusive/exclusive bounds not visible here */
        struct mempolicy *policy;
};

int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
int mpol_set_shared_policy(struct shared_policy *sp,
                           struct vm_area_struct *vma, struct mempolicy *mpol);
void mpol_free_shared_policy(struct shared_policy *sp);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                            pgoff_t idx);

struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, pgoff_t *ilx);
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, int order, pgoff_t *ilx);
bool vma_policy_mof(struct vm_area_struct *vma);

extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);

extern int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
                                const nodemask_t *mask);
extern unsigned int mempolicy_slab_node(void);

extern enum zone_type policy_zone;

/*
 * Raise policy_zone to @k when @k is above the current value and is not
 * ZONE_MOVABLE; otherwise leave policy_zone alone.
 */
static inline void check_highest_zone(enum zone_type k)
{
        if (k == ZONE_MOVABLE || k <= policy_zone)
                return;

        policy_zone = k;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags);


#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol);
#endif

extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);

/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);

int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                                        unsigned long addr);
extern void mpol_put_task_policy(struct task_struct *);

/* True when @pol's mode is MPOL_PREFERRED_MANY.  @pol is dereferenced unconditionally. */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
        return pol->mode == MPOL_PREFERRED_MANY;
}

extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);

extern int mempolicy_set_node_perf(unsigned int node,
                                   struct access_coordinate *coords);

#else

/* !CONFIG_NUMA: empty placeholder so struct mempolicy is still a complete type. */
struct mempolicy {};

/* !CONFIG_NUMA stub: no task has a policy, so NULL is returned for any @p. */
static inline struct mempolicy *
get_task_policy(struct task_struct *p)
{
        return NULL;
}

/* !CONFIG_NUMA stub: any two policies compare equal. */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return true;
}

/* !CONFIG_NUMA stub: empty body, nothing to release. */
static inline void mpol_put(struct mempolicy *pol)
{
}

/* !CONFIG_NUMA stub: empty body. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
}

/* !CONFIG_NUMA stub: empty body, no reference is taken. */
static inline void mpol_get(struct mempolicy *pol)
{
}

/* !CONFIG_NUMA: empty placeholder type. */
struct shared_policy {};

/* !CONFIG_NUMA stub: empty body. */
static inline void mpol_shared_policy_init(struct shared_policy *sp,
                                                struct mempolicy *mpol)
{
}

/* !CONFIG_NUMA stub: empty body. */
static inline void mpol_free_shared_policy(struct shared_policy *sp)
{
}

/* !CONFIG_NUMA stub: every lookup yields NULL. */
static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
{
        return NULL;
}

/*
 * !CONFIG_NUMA stub: writes 0 to *@ilx (which therefore must be a valid
 * pointer) and returns NULL.
 */
static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                unsigned long addr, int order, pgoff_t *ilx)
{
        *ilx = 0;
        return NULL;
}

/* !CONFIG_NUMA stub: nothing to copy from @src to @dst; always returns 0. */
static inline int vma_dup_policy(struct vm_area_struct *src,
                                 struct vm_area_struct *dst)
{
        return 0;
}

/* !CONFIG_NUMA stub: empty body. */
static inline void numa_policy_init(void)
{
}

/* !CONFIG_NUMA stub: empty body. */
static inline void numa_default_policy(void)
{
}

/* !CONFIG_NUMA stub: @tsk and @new are ignored. */
static inline void mpol_rebind_task(struct task_struct *tsk,
                                const nodemask_t *new)
{
}

/* !CONFIG_NUMA stub: @mm and @new are ignored. */
static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}

/*
 * !CONFIG_NUMA stub: stores NULL in both *@mpol and *@nodemask (so both
 * must be valid pointers) and returns 0.
 */
static inline int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
{
        *mpol = NULL;
        *nodemask = NULL;
        return 0;
}

/* !CONFIG_NUMA stub: @m is not written; always returns false. */
static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
        return false;
}

/* !CONFIG_NUMA stub: no migration is attempted; always returns 0. */
static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                                   const nodemask_t *to, int flags)
{
        return 0;
}

/*
 * !CONFIG_NUMA stub: empty body.
 * NOTE(review): takes int here, while the CONFIG_NUMA variant takes
 * enum zone_type.
 */
static inline void check_highest_zone(int k)
{
}

#ifdef CONFIG_TMPFS
/*
 * !CONFIG_NUMA stub: nothing is parsed and *@mpol is not written.
 * Always returns 1, which signals an error.
 */
static inline int
mpol_parse_str(char *str, struct mempolicy **mpol)
{
        return 1;
}
#endif

/* !CONFIG_NUMA stub: always returns -1, meaning "no node preference". */
static inline int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                                 unsigned long address)
{
        return -1;
}

/* !CONFIG_NUMA stub: empty body, @task is ignored. */
static inline void mpol_put_task_policy(struct task_struct *task)
{
}

/* !CONFIG_NUMA stub: @pol is never inspected; always returns false. */
static inline bool
mpol_is_preferred_many(struct mempolicy *pol)
{
        return false;
}

#endif /* CONFIG_NUMA */
#endif













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_COREDUMP_H
#define _LINUX_SCHED_COREDUMP_H

#include <linux/mm_types.h>

#define SUID_DUMP_DISABLE        0        /* No setuid dumping */
#define SUID_DUMP_USER                1        /* Dump as user of process */
#define SUID_DUMP_ROOT                2        /* Dump as root */

/* Fetch the flags word of @mm that carries the dumpable bits. */
static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm)
{
        /*
         * By convention, dumpable bits are contained in first 32 bits of the
         * bitmap, so we can simply access this first unsigned long directly.
         */
        unsigned long word = __mm_flags_get_word(mm);

        return word;
}

/*
 * Update the dumpable bits of @mm: hands MMF_DUMPABLE_MASK and @value to
 * __mm_flags_set_mask_bits_word().
 */
static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value)
{
        __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value);
}

extern void set_dumpable(struct mm_struct *mm, int value);
/*
 * Extract the suid_dumpable value from a flags word by masking it with
 * MMF_DUMPABLE_MASK.  The result is the actual value, not a boolean: code
 * that checks for privilege transitions must compare it against
 * SUID_DUMP_USER instead of testing it for truth.
 */
static inline int __get_dumpable(unsigned long mm_flags)
{
        const int dumpable = mm_flags & MMF_DUMPABLE_MASK;

        return dumpable;
}

/* Dumpable value of @mm: __get_dumpable() applied to __mm_flags_get_dumpable(). */
static inline int get_dumpable(struct mm_struct *mm)
{
        return __get_dumpable(__mm_flags_get_dumpable(mm));
}

#endif /* _LINUX_SCHED_COREDUMP_H */













































































































































































































    1 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cryptographic API for algorithms (i.e., low-level API).
 *
 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
 */
#ifndef _CRYPTO_ALGAPI_H
#define _CRYPTO_ALGAPI_H

#include <crypto/utils.h>
#include <linux/align.h>
#include <linux/cache.h>
#include <linux/crypto.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/workqueue.h>

/*
 * Maximum values for blocksize and alignmask, used to allocate
 * static buffers that are big enough for any combination of
 * algs and architectures. Ciphers have a lower maximum size.
 */
#define MAX_ALGAPI_BLOCKSIZE                160
#define MAX_ALGAPI_ALIGNMASK                127
#define MAX_CIPHER_BLOCKSIZE                16
#define MAX_CIPHER_ALIGNMASK                15

/*
 * CRYPTO_DMA_ALIGN is ARCH_DMA_MINALIGN when the architecture defines it,
 * otherwise it falls back to CRYPTO_MINALIGN.
 */
#ifdef ARCH_DMA_MINALIGN
#define CRYPTO_DMA_ALIGN ARCH_DMA_MINALIGN
#else
#define CRYPTO_DMA_ALIGN CRYPTO_MINALIGN
#endif

/* Bits of (CRYPTO_DMA_ALIGN - 1) that are not already covered by (CRYPTO_MINALIGN - 1). */
#define CRYPTO_DMA_PADDING ((CRYPTO_DMA_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))

/*
 * Autoloaded crypto modules should only use a prefixed name to avoid allowing
 * arbitrary modules to be loaded. Loading from userspace may still need the
 * unprefixed names, so this retains those aliases as well: one alias is
 * emitted for @name and one for "crypto-" @name.
 * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3
 * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro
 * expands twice on the same line. Instead, use a separate base name for the
 * alias.
 * NOTE(review): the expansion below calls MODULE_INFO(), not
 * __MODULE_INFO(), so the paragraph above may be stale - confirm.
 */
#define MODULE_ALIAS_CRYPTO(name)        \
                MODULE_INFO(alias, name);        \
                MODULE_INFO(alias, "crypto-" name)

struct crypto_aead;
struct crypto_instance;
struct module;
struct notifier_block;
struct rtattr;
struct scatterlist;
struct seq_file;
struct sk_buff;
union crypto_no_such_thing;

/*
 * An algorithm instance: embeds its struct crypto_alg (the member that
 * crypto_tfm_alg_instance() uses for container_of) and points at the
 * template it belongs to.
 */
struct crypto_instance {
        struct crypto_alg alg;

        struct crypto_template *tmpl;

        union {
                /* Node in list of instances after registration. */
                struct hlist_node list;
                /* List of attached spawns before registration. */
                struct crypto_spawn *spawns;
        };

        /* Trailing context area; returned by crypto_instance_ctx(). */
        void *__ctx[] CRYPTO_MINALIGN_ATTR;
};

/*
 * A template, as handed to crypto_register_template() and
 * crypto_register_instance().
 */
struct crypto_template {
        struct list_head list;
        struct hlist_head instances;        /* presumably instances created from this template - confirm */
        struct hlist_head dead;
        struct module *module;

        struct work_struct free_work;

        /* Callback taking the template itself and an rtattr table; returns int. */
        int (*create)(struct crypto_template *tmpl, struct rtattr **tb);

        char name[CRYPTO_MAX_ALG_NAME];
};

/*
 * Links an instance to an algorithm it uses.  The union member in use
 * depends on whether the owning instance has been registered yet, as noted
 * on each member.
 */
struct crypto_spawn {
        struct list_head list;
        struct crypto_alg *alg;
        union {
                /* Back pointer to instance after registration.*/
                struct crypto_instance *inst;
                /* Spawn list pointer prior to registration. */
                struct crypto_spawn *next;
        };
        const struct crypto_type *frontend;
        u32 mask;
        bool dead;
        bool registered;
};

/* Request queue state used by crypto_enqueue_request() and friends. */
struct crypto_queue {
        struct list_head list;
        /* Equals &list when there is no backlog; see crypto_get_backlog(). */
        struct list_head *backlog;

        unsigned int qlen;        /* reported by crypto_queue_len() */
        unsigned int max_qlen;        /* set through crypto_init_queue() - TODO confirm */
};

/* Attribute payload carrying an algorithm name of up to CRYPTO_MAX_ALG_NAME bytes. */
struct crypto_attr_alg {
        char name[CRYPTO_MAX_ALG_NAME];
};

/* Attribute payload carrying a type/mask pair; consumed by crypto_requires_off(). */
struct crypto_attr_type {
        u32 type;
        u32 mask;
};

/*
 * Algorithm registration interface.
 */
int crypto_register_alg(struct crypto_alg *alg);
void crypto_unregister_alg(struct crypto_alg *alg);
int crypto_register_algs(struct crypto_alg *algs, int count);
void crypto_unregister_algs(struct crypto_alg *algs, int count);

void crypto_mod_put(struct crypto_alg *alg);

int crypto_register_template(struct crypto_template *tmpl);
int crypto_register_templates(struct crypto_template *tmpls, int count);
void crypto_unregister_template(struct crypto_template *tmpl);
void crypto_unregister_templates(struct crypto_template *tmpls, int count);
struct crypto_template *crypto_lookup_template(const char *name);

int crypto_register_instance(struct crypto_template *tmpl,
                             struct crypto_instance *inst);
void crypto_unregister_instance(struct crypto_instance *inst);

int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask);
void crypto_drop_spawn(struct crypto_spawn *spawn);
struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type,
                                    u32 mask);
void *crypto_spawn_tfm2(struct crypto_spawn *spawn);

struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb);
int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret);
const char *crypto_attr_alg_name(struct rtattr *rta);
int __crypto_inst_setname(struct crypto_instance *inst, const char *name,
                          const char *driver, struct crypto_alg *alg);

/*
 * crypto_inst_setname(inst, name, [driver,] alg)
 *
 * Dispatches on the number of trailing arguments (via COUNT_ARGS) to
 * crypto_inst_setname_1 or crypto_inst_setname_2.  With only @alg given,
 * @name is also passed as the driver argument of __crypto_inst_setname().
 */
#define crypto_inst_setname(inst, name, ...) \
        CONCATENATE(crypto_inst_setname_, COUNT_ARGS(__VA_ARGS__))( \
                inst, name, ##__VA_ARGS__)
#define crypto_inst_setname_1(inst, name, alg) \
        __crypto_inst_setname(inst, name, name, alg)
#define crypto_inst_setname_2(inst, name, driver, alg) \
        __crypto_inst_setname(inst, name, driver, alg)

void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen);
int crypto_enqueue_request(struct crypto_queue *queue,
                           struct crypto_async_request *request);
void crypto_enqueue_request_head(struct crypto_queue *queue,
                                 struct crypto_async_request *request);
struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue);
/* Return the qlen field of @queue. */
static inline unsigned int crypto_queue_len(struct crypto_queue *queue)
{
        return queue->qlen;
}

void crypto_inc(u8 *a, unsigned int size);

/* Return the start of @tfm's context area (tfm->__crt_ctx), without any alignment adjustment. */
static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
{
        return tfm->__crt_ctx;
}

/*
 * Return @tfm's context pointer aligned with PTR_ALIGN() to @align.
 * Requests that do not exceed crypto_tfm_ctx_alignment() are treated as
 * alignment 1.
 */
static inline void *crypto_tfm_ctx_align(struct crypto_tfm *tfm,
                                         unsigned int align)
{
        unsigned int effective = align;

        if (effective <= crypto_tfm_ctx_alignment())
                effective = 1;

        return PTR_ALIGN(crypto_tfm_ctx(tfm), effective);
}

/* Return CRYPTO_DMA_ALIGN as an unsigned int. */
static inline unsigned int crypto_dma_align(void)
{
        return CRYPTO_DMA_ALIGN;
}

/*
 * Bits of (crypto_dma_align() - 1) that are not already covered by
 * (crypto_tfm_ctx_alignment() - 1).
 */
static inline unsigned int crypto_dma_padding(void)
{
        unsigned int dma_mask = crypto_dma_align() - 1;
        unsigned int ctx_mask = crypto_tfm_ctx_alignment() - 1;

        return dma_mask & ~ctx_mask;
}

/* @tfm's context pointer aligned to crypto_dma_align() through crypto_tfm_ctx_align(). */
static inline void *crypto_tfm_ctx_dma(struct crypto_tfm *tfm)
{
        unsigned int dma_align = crypto_dma_align();

        return crypto_tfm_ctx_align(tfm, dma_align);
}

/*
 * Map @tfm to the crypto_instance that embeds its algorithm
 * (container_of on tfm->__crt_alg through the instance's alg member).
 */
static inline struct crypto_instance *
crypto_tfm_alg_instance(struct crypto_tfm *tfm)
{
        struct crypto_instance *inst;

        inst = container_of(tfm->__crt_alg, struct crypto_instance, alg);
        return inst;
}

/* Return the trailing __ctx area of @inst. */
static inline void *crypto_instance_ctx(struct crypto_instance *inst)
{
        return inst->__ctx;
}

/*
 * Return the request that queue->backlog points at, or NULL when
 * queue->backlog equals &queue->list (no backlog).
 */
static inline struct crypto_async_request *
crypto_get_backlog(struct crypto_queue *queue)
{
        if (queue->backlog == &queue->list)
                return NULL;

        return container_of(queue->backlog, struct crypto_async_request, list);
}

/*
 * Return the bits of @off that are set in algt->mask but clear in
 * algt->type.
 */
static inline u32 crypto_requires_off(struct crypto_attr_type *algt, u32 off)
{
        u32 differing = algt->type ^ off;

        return differing & algt->mask & off;
}

/*
 * When an algorithm uses another algorithm (e.g., if it's an instance of a
 * template), these are the flags that should always be set on the "outer"
 * algorithm if any "inner" algorithm has them set.
 */
#define CRYPTO_ALG_INHERITED_FLAGS        \
        (CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK |        \
         CRYPTO_ALG_ALLOCATES_MEMORY)

/*
 * Given the type and mask that specify the flags restrictions on a template
 * instance being created, return the mask that should be passed to
 * crypto_grab_*() (along with type=0) to honor any request the user made to
 * have any of the CRYPTO_ALG_INHERITED_FLAGS clear.
 *
 * Implemented as crypto_requires_off() restricted to
 * CRYPTO_ALG_INHERITED_FLAGS.
 */
static inline u32 crypto_algt_inherited_mask(struct crypto_attr_type *algt)
{
        const u32 inherited = CRYPTO_ALG_INHERITED_FLAGS;

        return crypto_requires_off(algt, inherited);
}

int crypto_register_notifier(struct notifier_block *nb);
int crypto_unregister_notifier(struct notifier_block *nb);

/*
 * Crypto notification events (implicitly numbered from 0).
 * NOTE(review): presumably delivered to notifier blocks registered with
 * crypto_register_notifier() - confirm.
 */
enum {
        CRYPTO_MSG_ALG_REQUEST,
        CRYPTO_MSG_ALG_REGISTER,
        CRYPTO_MSG_ALG_LOADED,
};

/* Invoke @req's completion callback with req->data and @err. */
static inline void crypto_request_complete(struct crypto_async_request *req,
                                           int err)
{
        req->complete(req->data, err);
}

/* Algorithm type of @tfm: cra_flags of its algorithm masked with CRYPTO_ALG_TYPE_MASK. */
static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm)
{
        u32 flags = tfm->__crt_alg->cra_flags;

        return flags & CRYPTO_ALG_TYPE_MASK;
}

/* True when CRYPTO_ALG_REQ_VIRT is set in the cra_flags of @tfm's algorithm. */
static inline bool crypto_tfm_req_virt(struct crypto_tfm *tfm)
{
        return tfm->__crt_alg->cra_flags & CRYPTO_ALG_REQ_VIRT;
}

/* Flags of @req with the CRYPTO_TFM_REQ_ON_STACK bit masked out. */
static inline u32 crypto_request_flags(struct crypto_async_request *req)
{
        u32 flags = req->flags;

        return flags & ~CRYPTO_TFM_REQ_ON_STACK;
}

#endif        /* _CRYPTO_ALGAPI_H */

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * cls_cgroup.h                        Control Group Classifier
 *
 * Authors:        Thomas Graf <tgraf@suug.ch>
 */

#ifndef _NET_CLS_CGROUP_H
#define _NET_CLS_CGROUP_H

#include <linux/cgroup.h>
#include <linux/hardirq.h>
#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/inet_sock.h>

#ifdef CONFIG_CGROUP_NET_CLASSID
/* Per-cgroup classifier state: the embedded css plus the 32-bit classid. */
struct cgroup_cls_state {
        struct cgroup_subsys_state css;        /* used for container_of in task_cls_classid() */
        u32 classid;
};

struct cgroup_cls_state *task_cls_state(struct task_struct *p);

static inline u32 task_cls_classid(struct task_struct *p)
{
        u32 classid;

        if (in_interrupt())
                return 0;

        rcu_read_lock();
        classid = container_of(task_css(p, net_cls_cgrp_id),
                               struct cgroup_cls_state, css)->classid;
        rcu_read_unlock();

        return classid;
}

/* Store the classid of the current task in @skcd. */
static inline void sock_update_classid(struct sock_cgroup_data *skcd)
{
        sock_cgroup_set_classid(skcd, task_cls_classid(current));
}

static inline u32 __task_get_classid(struct task_struct *task)
{
        return task_cls_state(task)->classid;
}

/*
 * Pick the classid to use for @skb.
 *
 * The classid of the current task is read first.  When softirq_count() is
 * non-zero that value is replaced by the classid stored in the skb's full
 * socket, or 0 is returned when there is no full socket.
 */
static inline u32 task_get_classid(const struct sk_buff *skb)
{
        u32 classid = __task_get_classid(current);

        /* Due to the nature of the classifier it is required to ignore all
         * packets originating from softirq context as accessing `current'
         * would lead to false results.
         *
         * This test assumes that all callers of dev_queue_xmit() explicitly
         * disable bh. Knowing this, it is possible to detect softirq based
         * calls by looking at the number of nested bh disable calls because
         * softirqs always disable bh.
         */
        if (softirq_count()) {
                struct sock *sk = skb_to_full_sk(skb);

                /* If there is a sock_cgroup_classid we'll use that. */
                if (!sk || !sk_fullsock(sk))
                        return 0;

                classid = sock_cgroup_classid(&sk->sk_cgrp_data);
        }

        return classid;
}
#else /* !CONFIG_CGROUP_NET_CLASSID */
/* !CONFIG_CGROUP_NET_CLASSID stub: empty body, @skcd is left untouched. */
static inline void sock_update_classid(struct sock_cgroup_data *skcd)
{
}

/* !CONFIG_CGROUP_NET_CLASSID stub: every @skb reports classid 0. */
static inline u32 task_get_classid(const struct sk_buff *skb)
{
        return 0;
}
#endif /* CONFIG_CGROUP_NET_CLASSID */
#endif  /* _NET_CLS_CGROUP_H */










































































































   41 






























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMEKEEPING_H
#define _LINUX_TIMEKEEPING_H

#include <linux/errno.h>
#include <linux/clocksource_ids.h>
#include <linux/ktime.h>

/* Included from linux/ktime.h */

void timekeeping_init(void);
extern int timekeeping_suspended;

/* Architecture timer tick functions: */
extern void legacy_timer_tick(unsigned long ticks);

/*
 * Get and set timeofday
 */
extern int do_settimeofday64(const struct timespec64 *ts);
extern int do_sys_settimeofday64(const struct timespec64 *tv,
                                 const struct timezone *tz);

/*
 * ktime_get() family - read the current time in a multitude of ways.
 *
 * The default time reference is CLOCK_MONOTONIC, starting at
 * boot time but not counting the time spent in suspend.
 * For other references, use the functions with "real", "clocktai",
 * "boottime" and "raw" suffixes.
 *
 * To get the time in a different format, use the ones with
 * "ns", "ts64" and "seconds" suffix.
 *
 * See Documentation/core-api/timekeeping.rst for more details.
 */


/*
 * timespec64 based interfaces
 */
extern void ktime_get_raw_ts64(struct timespec64 *ts);
extern void ktime_get_ts64(struct timespec64 *ts);
extern void ktime_get_real_ts64(struct timespec64 *tv);
extern void ktime_get_coarse_ts64(struct timespec64 *ts);
extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
extern void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts);

/* Multigrain timestamp interfaces */
extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
extern unsigned long timekeeping_get_mg_floor_swaps(void);

void getboottime64(struct timespec64 *ts);

/*
 * time64_t based interfaces
 */
extern time64_t ktime_get_seconds(void);
extern time64_t __ktime_get_real_seconds(void);
extern time64_t ktime_get_real_seconds(void);

/*
 * ktime_t based interfaces
 */

/* Offset selectors passed to ktime_get_with_offset() and the related helpers. */
enum tk_offsets {
        TK_OFFS_REAL,
        TK_OFFS_BOOT,
        TK_OFFS_TAI,
        TK_OFFS_MAX,        /* last enumerator; presumably the selector count - confirm */
};

extern ktime_t ktime_get(void);
extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs);
extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
extern ktime_t ktime_get_raw(void);
extern u32 ktime_get_resolution_ns(void);

/**
 * ktime_get_real - get the real (wall-) time in ktime_t format
 *
 * Returns: real (wall) time in ktime_t format
 */
static inline ktime_t ktime_get_real(void)
{
        return ktime_get_with_offset(TK_OFFS_REAL);
}

static inline ktime_t ktime_get_coarse_real(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_REAL);
}

/**
 * ktime_get_boottime - Get monotonic time since boot in ktime_t format
 *
 * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the
 * time spent in suspend.
 *
 * Returns: monotonic time since boot in ktime_t format
 */
static inline ktime_t ktime_get_boottime(void)
{
        return ktime_get_with_offset(TK_OFFS_BOOT);
}

static inline ktime_t ktime_get_coarse_boottime(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_BOOT);
}

/**
 * ktime_get_clocktai - Get the TAI time of day in ktime_t format
 *
 * Returns: the TAI time of day in ktime_t format
 */
static inline ktime_t ktime_get_clocktai(void)
{
        return ktime_get_with_offset(TK_OFFS_TAI);
}

static inline ktime_t ktime_get_coarse_clocktai(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_TAI);
}

/*
 * ktime_t flavour of ktime_get_coarse_ts64(): reads the timespec64 value
 * and converts it with timespec64_to_ktime().
 */
static inline ktime_t ktime_get_coarse(void)
{
        struct timespec64 ts;

        ktime_get_coarse_ts64(&ts);
        return timespec64_to_ktime(ts);
}

static inline u64 ktime_get_coarse_ns(void)
{
        return ktime_to_ns(ktime_get_coarse());
}

static inline u64 ktime_get_coarse_real_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_real());
}

static inline u64 ktime_get_coarse_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_boottime());
}

static inline u64 ktime_get_coarse_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_clocktai());
}

/**
 * ktime_mono_to_real - Convert monotonic time to clock realtime
 * @mono: monotonic time to convert
 *
 * Thin wrapper around ktime_mono_to_any() with TK_OFFS_REAL.
 *
 * Returns: time converted to realtime clock
 */
static inline ktime_t ktime_mono_to_real(ktime_t mono)
{
        const enum tk_offsets offs = TK_OFFS_REAL;

        return ktime_mono_to_any(mono, offs);
}

/**
 * ktime_get_ns - Get the current time in nanoseconds
 *
 * Returns: current time converted to nanoseconds
 */
static inline u64 ktime_get_ns(void)
{
        return ktime_to_ns(ktime_get());
}

/**
 * ktime_get_real_ns - Get the current real/wall time in nanoseconds
 *
 * Returns: current real time converted to nanoseconds
 */
static inline u64 ktime_get_real_ns(void)
{
        return ktime_to_ns(ktime_get_real());
}

/**
 * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds
 *
 * Returns: current boottime converted to nanoseconds
 */
static inline u64 ktime_get_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_boottime());
}

/**
 * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds
 *
 * Returns: current TAI time converted to nanoseconds
 */
static inline u64 ktime_get_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_clocktai());
}

/**
 * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds
 *
 * Returns: current raw monotonic time converted to nanoseconds
 */
static inline u64 ktime_get_raw_ns(void)
{
        return ktime_to_ns(ktime_get_raw());
}

/*
 * "fast" nanosecond accessors, one per clock (monotonic, raw monotonic,
 * boot, TAI, realtime). Only declared here; the definitions live outside
 * this header.
 * NOTE(review): presumably the variants intended for contexts where the
 * regular accessors cannot be used (e.g. NMI/tracing) - confirm against the
 * implementation.
 */
extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
extern u64 ktime_get_boot_fast_ns(void);
extern u64 ktime_get_tai_fast_ns(void);
extern u64 ktime_get_real_fast_ns(void);

/*
 * timespec64/time64_t interfaces utilizing the ktime based ones
 * for API completeness, these could be implemented more efficiently
 * if needed.
 */
static inline void ktime_get_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_boottime());
}

static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_boottime());
}

static inline time64_t ktime_get_boottime_seconds(void)
{
        return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC);
}

static inline void ktime_get_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_clocktai());
}

static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_clocktai());
}

static inline time64_t ktime_get_clocktai_seconds(void)
{
        return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC);
}

/*
 * RTC specific
 *
 * Declarations only; the implementations are outside this header.
 * NOTE(review): the skipsuspend/skipresume names suggest they tell RTC code
 * whether its suspend/resume time handling can be skipped, and that
 * timekeeping_inject_sleeptime64() accounts @delta as time slept - confirm
 * against the definitions before relying on this.
 */
extern bool timekeeping_rtc_skipsuspend(void);
extern bool timekeeping_rtc_skipresume(void);

extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);

/*
 * Auxiliary clock interfaces
 */
#ifdef CONFIG_POSIX_AUX_CLOCKS
extern bool ktime_get_aux(clockid_t id, ktime_t *kt);
extern bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt);
#else
/*
 * Stubs used when CONFIG_POSIX_AUX_CLOCKS is not set: both always return
 * false and never write through @kt.
 */
static inline bool ktime_get_aux(clockid_t id, ktime_t *kt)
{
        return false;
}

static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt)
{
        return false;
}
#endif

/**
 * struct system_time_snapshot - simultaneous raw/real/boot time capture with
 *                                 counter value
 * @cycles:        Clocksource counter value to produce the system times
 * @real:        Realtime system time
 * @boot:        Boot time
 * @raw:        Monotonic raw system time
 * @cs_id:        Clocksource ID
 * @clock_was_set_seq:        The sequence number of clock-was-set events
 * @cs_was_changed_seq:        The sequence number of clocksource change events
 *
 * Filled in by ktime_get_snapshot() and accepted as the @history argument of
 * get_device_system_crosststamp().
 */
struct system_time_snapshot {
        u64                        cycles;
        ktime_t                        real;
        ktime_t                        boot;
        ktime_t                        raw;
        enum clocksource_ids        cs_id;
        unsigned int                clock_was_set_seq;
        u8                        cs_was_changed_seq;
};

/**
 * struct system_device_crosststamp - system/device cross-timestamp
 *                                      (synchronized capture)
 * @device:                Device time
 * @sys_realtime:        Realtime simultaneous with device time
 * @sys_monoraw:        Monotonic raw simultaneous with device time
 *
 * Output structure of get_device_system_crosststamp() (its @xtstamp
 * argument).
 */
struct system_device_crosststamp {
        ktime_t device;
        ktime_t sys_realtime;
        ktime_t sys_monoraw;
};

/**
 * struct system_counterval_t - system counter value with the ID of the
 *                                corresponding clocksource
 * @cycles:        System counter value
 * @cs_id:        Clocksource ID corresponding to system counter value. Used by
 *                timekeeping code to verify comparability of two cycle values.
 *                The default ID, CSID_GENERIC, does not identify a specific
 *                clocksource.
 * @use_nsecs:        @cycles is in nanoseconds.
 *
 * Passed to the get_time_fn callback of get_device_system_crosststamp(),
 * which receives a pointer to one of these as its system_counterval
 * argument.
 */
struct system_counterval_t {
        u64                        cycles;
        enum clocksource_ids        cs_id;
        bool                        use_nsecs;
};

/*
 * Base-clock helpers, declared only; defined outside this header.
 * NOTE(review): judging by the signatures, ktime_real_to_base_clock()
 * converts realtime @treal into a counter value of base clock @base_id
 * (result through @cycles, success as the bool return), and
 * timekeeping_clocksource_has_base() reports whether the current
 * clocksource is derived from base clock @id - confirm against the
 * definitions.
 */
extern bool ktime_real_to_base_clock(ktime_t treal,
                                     enum clocksource_ids base_id, u64 *cycles);
extern bool timekeeping_clocksource_has_base(enum clocksource_ids id);

/*
 * Get cross timestamp between system clock and device clock
 *
 * @get_time_fn: caller-supplied callback; it is handed pointers for the
 *               device time and the matching system counter value, plus the
 *               opaque @ctx, and returns an int status.
 * @ctx:         opaque pointer forwarded to @get_time_fn.
 * @history:     a struct system_time_snapshot.
 *               NOTE(review): presumably an earlier snapshot used when the
 *               captured counter value is older than the current timekeeping
 *               interval, and possibly optional - confirm.
 * @xtstamp:     receives the resulting cross-timestamp.
 *
 * Returns an int status; the exact error codes are defined by the
 * implementation, which is not part of this header.
 */
extern int get_device_system_crosststamp(
                        int (*get_time_fn)(ktime_t *device_time,
                                struct system_counterval_t *system_counterval,
                                void *ctx),
                        void *ctx,
                        struct system_time_snapshot *history,
                        struct system_device_crosststamp *xtstamp);

/*
 * Simultaneously snapshot realtime and monotonic raw clocks
 *
 * The result is written to @systime_snapshot, a struct system_time_snapshot
 * (which also carries boot time, the counter value and sequence numbers).
 */
extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);

/*
 * Persistent clock related interfaces
 *
 * Declarations only; definitions are outside this header.
 * NOTE(review): persistent_clock_is_local presumably flags that the
 * persistent clock keeps local time rather than UTC - confirm.
 * update_persistent_clock64() is only declared when
 * CONFIG_GENERIC_CMOS_UPDATE is set.
 */
extern int persistent_clock_is_local;

extern void read_persistent_clock64(struct timespec64 *ts);
void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
                                          struct timespec64 *boot_offset);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
extern int update_persistent_clock64(struct timespec64 now);
#endif

#endif































































    7 



































    7 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Common values and helper functions for the ChaCha and XChaCha stream ciphers.
 *
 * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's
 * security.  Here they share the same key size, tfm context, and setkey
 * function; only their IV size and encrypt/decrypt function differ.
 *
 * The ChaCha paper specifies 20, 12, and 8-round variants.  In general, it is
 * recommended to use the 20-round variant ChaCha20.  However, the other
 * variants can be needed in some performance-sensitive scenarios.  The generic
 * ChaCha code currently allows only the 20 and 12-round variants.
 */

#ifndef _CRYPTO_CHACHA_H
#define _CRYPTO_CHACHA_H

#include <linux/unaligned.h>
#include <linux/string.h>
#include <linux/types.h>

/*
 * Byte sizes.
 *
 * CHACHA_IV_SIZE: 32-bit stream position, then 96-bit nonce (RFC7539
 * convention)
 */
#define CHACHA_IV_SIZE                16

#define CHACHA_KEY_SIZE                32
#define CHACHA_BLOCK_SIZE        64
/* NOTE(review): presumably the 96-bit nonce of the ChaCha20-Poly1305 AEAD - confirm */
#define CHACHAPOLY_IV_SIZE        12

/*
 * Sizes counted in 32-bit words: 8 words match CHACHA_KEY_SIZE bytes and
 * 16 words match CHACHA_BLOCK_SIZE bytes.
 */
#define CHACHA_KEY_WORDS        8
#define CHACHA_STATE_WORDS        16
#define HCHACHA_OUT_WORDS        8

/* 192-bit nonce, then 64-bit stream position */
#define XCHACHA_IV_SIZE                32

/*
 * ChaCha working state: CHACHA_STATE_WORDS 32-bit words. As filled in by
 * chacha_init(), words 0-3 hold the constants, words 4-11 the key and
 * words 12-15 the little-endian words of the 16-byte IV.
 */
struct chacha_state {
        u32 x[CHACHA_STATE_WORDS];
};

void chacha_block_generic(struct chacha_state *state,
                          u8 out[CHACHA_BLOCK_SIZE], int nrounds);

/*
 * 20-round convenience wrapper: forwards @state and @out to
 * chacha_block_generic() with nrounds fixed at 20.
 */
static inline void chacha20_block(struct chacha_state *state,
                                  u8 out[CHACHA_BLOCK_SIZE])
{
        const int nrounds = 20;

        chacha_block_generic(state, out, nrounds);
}

/*
 * HChaCha block functions: take a read-only ChaCha state and a round count
 * and write HCHACHA_OUT_WORDS 32-bit words to @out. Declarations only.
 * NOTE(review): hchacha_block() is presumably the entry point that may
 * dispatch to an arch-optimized implementation, with
 * hchacha_block_generic() as the portable one - confirm.
 */
void hchacha_block_generic(const struct chacha_state *state,
                           u32 out[HCHACHA_OUT_WORDS], int nrounds);

void hchacha_block(const struct chacha_state *state,
                   u32 out[HCHACHA_OUT_WORDS], int nrounds);

/*
 * The four constant state words. Read as little-endian ASCII they spell
 * "expa", "nd 3", "2-by", "te k", i.e. the string "expand 32-byte k".
 */
enum chacha_constants { /* expand 32-byte k */
        CHACHA_CONSTANT_EXPA = 0x61707865U,
        CHACHA_CONSTANT_ND_3 = 0x3320646eU,
        CHACHA_CONSTANT_2_BY = 0x79622d32U,
        CHACHA_CONSTANT_TE_K = 0x6b206574U
};

static inline void chacha_init_consts(struct chacha_state *state)
{
        state->x[0]  = CHACHA_CONSTANT_EXPA;
        state->x[1]  = CHACHA_CONSTANT_ND_3;
        state->x[2]  = CHACHA_CONSTANT_2_BY;
        state->x[3]  = CHACHA_CONSTANT_TE_K;
}

static inline void chacha_init(struct chacha_state *state,
                               const u32 key[CHACHA_KEY_WORDS],
                               const u8 iv[CHACHA_IV_SIZE])
{
        chacha_init_consts(state);
        state->x[4]  = key[0];
        state->x[5]  = key[1];
        state->x[6]  = key[2];
        state->x[7]  = key[3];
        state->x[8]  = key[4];
        state->x[9]  = key[5];
        state->x[10] = key[6];
        state->x[11] = key[7];
        state->x[12] = get_unaligned_le32(iv +  0);
        state->x[13] = get_unaligned_le32(iv +  4);
        state->x[14] = get_unaligned_le32(iv +  8);
        state->x[15] = get_unaligned_le32(iv + 12);
}

void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src,
                  unsigned int bytes, int nrounds);

/*
 * 20-round convenience wrapper: forwards all arguments to chacha_crypt()
 * with nrounds fixed at 20.
 */
static inline void chacha20_crypt(struct chacha_state *state,
                                  u8 *dst, const u8 *src, unsigned int bytes)
{
        const int nrounds = 20;

        chacha_crypt(state, dst, src, bytes, nrounds);
}

/*
 * Wipe the whole ChaCha state with memzero_explicit(), so that key material
 * held in @state does not linger.
 */
static inline void chacha_zeroize_state(struct chacha_state *state)
{
        memzero_explicit(state, sizeof(struct chacha_state));
}

#endif /* _CRYPTO_CHACHA_H */











































































    2 





    2 








    2 







    2 























    2 


    1 
    2 





    2 

    3 









    2 

    7 
    4 






    1 



    6 

    4 

    5 
    3 








    7 
    1 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173
 *
 * Copyright (C)2003 USAGI/WIDE Project
 *
 * Author        Mitsuru KANDA  <mk@linux-ipv6.org>
 */
/*
 * [Memo]
 *
 * Outbound:
 *  The compression of IP datagram MUST be done before AH/ESP processing,
 *  fragmentation, and the addition of Hop-by-Hop/Routing header.
 *
 * Inbound:
 *  The decompression of IP datagram MUST be done after the reassembly,
 *  AH/ESP processing.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/module.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ipcomp.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/pfkeyv2.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <net/ip6_route.h>
#include <net/icmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/mutex.h>

/*
 * ICMPv6 error handler for IPComp.
 *
 * Only ICMPV6_PKT_TOOBIG and NDISC_REDIRECT are acted upon, and only when an
 * IPComp xfrm state matching the offending packet (destination address plus
 * SPI derived from the 16-bit CPI found at @offset) exists. A redirect is
 * handed to ip6_redirect(), a packet-too-big to ip6_update_pmtu().
 *
 * Always returns 0.
 */
static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                                u8 type, u8 code, int offset, __be32 info)
{
        const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
        struct ip_comp_hdr *ipcomph =
                (struct ip_comp_hdr *)(skb->data + offset);
        struct net *net = dev_net(skb->dev);
        struct xfrm_state *x;
        __be32 spi;

        switch (type) {
        case ICMPV6_PKT_TOOBIG:
        case NDISC_REDIRECT:
                break;
        default:
                /* Any other ICMPv6 type is not our business. */
                return 0;
        }

        /* The CPI is 16 bits wide; widen it to a 32-bit network-order SPI. */
        spi = htonl(ntohs(ipcomph->cpi));
        x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
                              spi, IPPROTO_COMP, AF_INET6);
        if (!x)
                return 0;

        if (type == ICMPV6_PKT_TOOBIG)
                ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
        else
                ip6_redirect(skb, net, skb->dev->ifindex, 0,
                             sock_net_uid(net, NULL));

        /* Drop the reference taken by the lookup. */
        xfrm_state_put(x);

        return 0;
}

static struct lock_class_key xfrm_state_lock_key;

/*
 * Allocate and initialize the IPPROTO_IPV6 tunnel state that accompanies
 * IPComp state @x.
 *
 * The new state copies destination/source address, selector, mode, mark and
 * if_id from @x, gets an SPI from xfrm6_tunnel_alloc_spi() and starts with
 * tunnel_users set to 1.
 *
 * Returns the new state, or NULL if allocation, SPI allocation or
 * xfrm_init_state() fails. On the two late failures the state is marked
 * XFRM_STATE_DEAD and released before returning.
 */
static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
{
        struct net *net = xs_net(x);
        struct xfrm_state *t = xfrm_state_alloc(net);

        if (!t)
                return NULL;

        /* Give the tunnel state's lock its own lockdep class. */
        lockdep_set_class(&t->lock, &xfrm_state_lock_key);

        t->id.proto = IPPROTO_IPV6;
        t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr);
        if (!t->id.spi)
                goto err_put;

        memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr));
        memcpy(&t->sel, &x->sel, sizeof(t->sel));
        t->props.family = AF_INET6;
        t->props.mode = x->props.mode;
        memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
        memcpy(&t->mark, &x->mark, sizeof(t->mark));
        t->if_id = x->if_id;

        if (xfrm_init_state(t))
                goto err_put;

        atomic_set(&t->tunnel_users, 1);

        return t;

err_put:
        t->km.state = XFRM_STATE_DEAD;
        xfrm_state_put(t);

        return NULL;
}

/*
 * Attach a tunnel state to IPComp state @x.
 *
 * An existing tunnel state is reused when xfrm6_tunnel_spi_lookup() yields
 * an SPI for @x's source address and a matching IPPROTO_IPV6 state is
 * found; otherwise a new one is created, inserted and given an extra hold.
 * In both cases the state is stored in x->tunnel and its tunnel_users
 * counter is incremented.
 *
 * Returns 0 on success, -EINVAL if a new tunnel state could not be created.
 */
static int ipcomp6_tunnel_attach(struct xfrm_state *x)
{
        u32 mark = x->mark.m & x->mark.v;
        struct net *net = xs_net(x);
        struct xfrm_state *t = NULL;
        __be32 spi;

        spi = xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr);
        if (spi)
                t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr,
                                              spi, IPPROTO_IPV6, AF_INET6);

        if (!t) {
                t = ipcomp6_tunnel_create(x);
                if (!t)
                        return -EINVAL;

                xfrm_state_insert(t);
                xfrm_state_hold(t);
        }

        x->tunnel = t;
        atomic_inc(&t->tunnel_users);

        return 0;
}

/*
 * xfrm_type init_state hook for IPv6 IPComp.
 *
 * Sets props.header_len (0 in transport mode, sizeof(struct ipv6hdr) in
 * tunnel mode), runs the common ipcomp_init_state() and, in tunnel mode,
 * attaches the companion tunnel state.
 *
 * Returns 0 on success, -EINVAL for an unsupported mode, or the error
 * returned by ipcomp_init_state() / ipcomp6_tunnel_attach(). Failures
 * detected here are also reported through @extack.
 */
static int ipcomp6_init_state(struct xfrm_state *x,
                              struct netlink_ext_ack *extack)
{
        int err;

        x->props.header_len = 0;
        switch (x->props.mode) {
        case XFRM_MODE_TRANSPORT:
                break;
        case XFRM_MODE_TUNNEL:
                x->props.header_len += sizeof(struct ipv6hdr);
                break;
        default:
                NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
                return -EINVAL;
        }

        err = ipcomp_init_state(x, extack);
        if (err)
                return err;

        /* Only tunnel mode needs the companion tunnel state. */
        if (x->props.mode != XFRM_MODE_TUNNEL)
                return 0;

        err = ipcomp6_tunnel_attach(x);
        if (err)
                NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");

        return err;
}

/*
 * Receive callback installed as ipcomp6_protocol.cb_handler.
 * Nothing to do here: both arguments are ignored and 0 is returned.
 */
static int ipcomp6_rcv_cb(struct sk_buff *skb, int err)
{
        const int handled = 0;

        return handled;
}

/*
 * xfrm type for IPPROTO_COMP, registered for AF_INET6 by ipcomp6_init().
 * Only state initialization is IPv6 specific (ipcomp6_init_state); the
 * destructor and the input/output paths are the shared ipcomp_* handlers.
 */
static const struct xfrm_type ipcomp6_type = {
        .owner                = THIS_MODULE,
        .proto                = IPPROTO_COMP,
        .init_state        = ipcomp6_init_state,
        .destructor        = ipcomp_destroy,
        .input                = ipcomp_input,
        .output                = ipcomp_output,
};

/*
 * xfrm6 protocol hooks for IPPROTO_COMP, registered by ipcomp6_init().
 * Reception goes through the generic xfrm6_rcv/xfrm_input handlers; the
 * receive callback is a no-op and ICMPv6 errors are handled by
 * ipcomp6_err(). Priority is 0.
 */
static struct xfrm6_protocol ipcomp6_protocol = {
        .handler        = xfrm6_rcv,
        .input_handler        = xfrm_input,
        .cb_handler        = ipcomp6_rcv_cb,
        .err_handler        = ipcomp6_err,
        .priority        = 0,
};

/*
 * Module init: register the IPComp xfrm type for AF_INET6, then the xfrm6
 * protocol handler for IPPROTO_COMP. If the second step fails, the first is
 * rolled back.
 *
 * Returns 0 on success, -EAGAIN if either registration fails.
 */
static int __init ipcomp6_init(void)
{
        int rc;

        rc = xfrm_register_type(&ipcomp6_type, AF_INET6);
        if (rc < 0) {
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }

        rc = xfrm6_protocol_register(&ipcomp6_protocol, IPPROTO_COMP);
        if (rc >= 0)
                return 0;

        pr_info("%s: can't add protocol\n", __func__);
        xfrm_unregister_type(&ipcomp6_type, AF_INET6);

        return -EAGAIN;
}

/*
 * Module exit: undo ipcomp6_init() in reverse order. A failure to
 * deregister the protocol handler is only logged; the xfrm type is
 * unregistered regardless.
 */
static void __exit ipcomp6_fini(void)
{
        int rc = xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP);

        if (rc < 0)
                pr_info("%s: can't remove protocol\n", __func__);

        xfrm_unregister_type(&ipcomp6_type, AF_INET6);
}

/* Module entry/exit points and metadata. */
module_init(ipcomp6_init);
module_exit(ipcomp6_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173");
MODULE_AUTHOR("Mitsuru KANDA <mk@linux-ipv6.org>");

/* Module alias for the AF_INET6 / XFRM_PROTO_COMP xfrm type. */
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_COMP);
































































































































































































































































































































































































































































































































































































































































   39 

   39 










   39 
   39 

   39 
















   39 
   39 




   39 



   39 
   39 



   39 





















































































































   39 




   39 


















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  linux/drivers/char/serial_core.h
 *
 *  Copyright (C) 2000 Deep Blue Solutions Ltd.
 */
#ifndef LINUX_SERIAL_CORE_H
#define LINUX_SERIAL_CORE_H

#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/console.h>
#include <linux/interrupt.h>
#include <linux/lockdep.h>
#include <linux/printk.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/tty.h>
#include <linux/mutex.h>
#include <linux/sysrq.h>
#include <uapi/linux/serial_core.h>

/*
 * uart_console(port) - is @port the port its console is bound to?
 *
 * With CONFIG_SERIAL_CORE_CONSOLE: true when @port has a console attached
 * (port->cons != NULL) and that console's index equals port->line. Note that
 * @port is evaluated more than once, so it must not have side effects.
 *
 * Without CONFIG_SERIAL_CORE_CONSOLE: always 0. @port is still evaluated
 * once, as a void expression, so the argument counts as used. It is
 * parenthesized so that any expression, not just a plain identifier, can be
 * passed as the argument.
 */
#ifdef CONFIG_SERIAL_CORE_CONSOLE
#define uart_console(port) \
        ((port)->cons && (port)->cons->index == (port)->line)
#else
#define uart_console(port)      ({ (void)(port); 0; })
#endif

/*
 * Forward declarations: these types are only named here, so an incomplete
 * declaration is sufficient at this point.
 */
struct uart_port;
struct serial_struct;
struct serial_port_device;
struct device;
struct gpio_desc;

/**
 * struct uart_ops -- interface between serial_core and the driver
 *
 * This structure describes all the operations that can be done on the
 * physical hardware.
 *
 * @tx_empty: ``unsigned int ()(struct uart_port *port)``
 *
 *        This function tests whether the transmitter fifo and shifter for the
 *        @port is empty. If it is empty, this function should return
 *        %TIOCSER_TEMT, otherwise return 0. If the port does not support this
 *        operation, then it should return %TIOCSER_TEMT.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *        This call must not sleep
 *
 * @set_mctrl: ``void ()(struct uart_port *port, unsigned int mctrl)``
 *
 *        This function sets the modem control lines for @port to the state
 *        described by @mctrl. The relevant bits of @mctrl are:
 *
 *                - %TIOCM_RTS        RTS signal.
 *                - %TIOCM_DTR        DTR signal.
 *                - %TIOCM_OUT1        OUT1 signal.
 *                - %TIOCM_OUT2        OUT2 signal.
 *                - %TIOCM_LOOP        Set the port into loopback mode.
 *
 *        If the appropriate bit is set, the signal should be driven
 *        active.  If the bit is clear, the signal should be driven
 *        inactive.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @get_mctrl: ``unsigned int ()(struct uart_port *port)``
 *
 *        Returns the current state of modem control inputs of @port. The state
 *        of the outputs should not be returned, since the core keeps track of
 *        their state. The state information should include:
 *
 *                - %TIOCM_CAR        state of DCD signal
 *                - %TIOCM_CTS        state of CTS signal
 *                - %TIOCM_DSR        state of DSR signal
 *                - %TIOCM_RI        state of RI signal
 *
 *        The bit is set if the signal is currently driven active.  If
 *        the port does not support CTS, DCD or DSR, the driver should
 *        indicate that the signal is permanently active. If RI is
 *        not available, the signal should not be indicated as active.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @stop_tx: ``void ()(struct uart_port *port)``
 *
 *        Stop transmitting characters. This might be due to the CTS line
 *        becoming inactive or the tty layer indicating we want to stop
 *        transmission due to an %XOFF character.
 *
 *        The driver should stop transmitting characters as soon as possible.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @start_tx: ``void ()(struct uart_port *port)``
 *
 *        Start transmitting characters.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @throttle: ``void ()(struct uart_port *port)``
 *
 *        Notify the serial driver that input buffers for the line discipline are
 *        close to full, and it should somehow signal that no more characters
 *        should be sent to the serial port.
 *        This will be called only if hardware assisted flow control is enabled.
 *
 *        Locking: serialized with @unthrottle() and termios modification by the
 *        tty layer.
 *
 * @unthrottle: ``void ()(struct uart_port *port)``
 *
 *        Notify the serial driver that characters can now be sent to the serial
 *        port without fear of overrunning the input buffers of the line
 *        disciplines.
 *
 *        This will be called only if hardware assisted flow control is enabled.
 *
 *        Locking: serialized with @throttle() and termios modification by the
 *        tty layer.
 *
 * @send_xchar: ``void ()(struct uart_port *port, char ch)``
 *
 *        Transmit a high priority character, even if the port is stopped. This
 *        is used to implement XON/XOFF flow control and tcflow(). If the serial
 *        driver does not implement this function, the tty core will append the
 *        character to the circular buffer and then call start_tx() / stop_tx()
 *        to flush the data out.
 *
 *        Do not transmit if @ch == '\0' (%__DISABLED_CHAR).
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @start_rx: ``void ()(struct uart_port *port)``
 *
 *        Start receiving characters.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @stop_rx: ``void ()(struct uart_port *port)``
 *
 *        Stop receiving characters; the @port is in the process of being closed.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @enable_ms: ``void ()(struct uart_port *port)``
 *
 *        Enable the modem status interrupts.
 *
 *        This method may be called multiple times. Modem status interrupts
 *        should be disabled when the @shutdown() method is called.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @break_ctl: ``void ()(struct uart_port *port, int ctl)``
 *
 *        Control the transmission of a break signal. If @ctl is nonzero, the
 *        break signal should be transmitted. The signal should be terminated
 *        when another call is made with a zero @ctl.
 *
 *        Locking: caller holds tty_port->mutex
 *
 * @startup: ``int ()(struct uart_port *port)``
 *
 *        Grab any interrupt resources and initialise any low level driver state.
 *        Enable the port for reception. It should not activate RTS nor DTR;
 *        this will be done via a separate call to @set_mctrl().
 *
 *        This method will only be called when the port is initially opened.
 *
 *        Locking: port_sem taken.
 *        Interrupts: globally disabled.
 *
 * @shutdown: ``void ()(struct uart_port *port)``
 *
 *        Disable the @port, disable any break condition that may be in effect,
 *        and free any interrupt resources. It should not disable RTS nor DTR;
 *        this will have already been done via a separate call to @set_mctrl().
 *
 *        Drivers must not access @port->state once this call has completed.
 *
 *        This method will only be called when there are no more users of this
 *        @port.
 *
 *        Locking: port_sem taken.
 *        Interrupts: caller dependent.
 *
 * @flush_buffer: ``void ()(struct uart_port *port)``
 *
 *        Flush any write buffers, reset any DMA state and stop any ongoing DMA
 *        transfers.
 *
 *        This will be called whenever the @port->state->xmit circular buffer is
 *        cleared.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @set_termios: ``void ()(struct uart_port *port, struct ktermios *new,
 *                        struct ktermios *old)``
 *
 *        Change the @port parameters, including word length, parity, stop bits.
 *        Update @port->read_status_mask and @port->ignore_status_mask to
 *        indicate the types of events we are interested in receiving. Relevant
 *        ktermios::c_cflag bits are:
 *
 *        - %CSIZE - word size
 *        - %CSTOPB - 2 stop bits
 *        - %PARENB - parity enable
 *        - %PARODD - odd parity (when %PARENB is in force)
 *        - %ADDRB - address bit (changed through uart_port::rs485_config()).
 *        - %CREAD - enable reception of characters (if not set, still receive
 *          characters from the port, but throw them away).
 *        - %CRTSCTS - if set, enable CTS status change reporting.
 *        - %CLOCAL - if not set, enable modem status change reporting.
 *
 *        Relevant ktermios::c_iflag bits are:
 *
 *        - %INPCK - enable frame and parity error events to be passed to the TTY
 *          layer.
 *        - %BRKINT / %PARMRK - both of these enable break events to be passed to
 *          the TTY layer.
 *        - %IGNPAR - ignore parity and framing errors.
 *        - %IGNBRK - ignore break errors. If %IGNPAR is also set, ignore overrun
 *          errors as well.
 *
 *        The interaction of the ktermios::c_iflag bits is as follows (parity
 *        error given as an example):
 *
 *        ============ ======= ======= =========================================
 *        Parity error INPCK   IGNPAR
 *        ============ ======= ======= =========================================
 *        n/a          0       n/a     character received, marked as %TTY_NORMAL
 *        None         1       n/a     character received, marked as %TTY_NORMAL
 *        Yes          1       0       character received, marked as %TTY_PARITY
 *        Yes          1       1       character discarded
 *        ============ ======= ======= =========================================
 *
 *        Other flags may be used (eg, xon/xoff characters) if your hardware
 *        supports hardware "soft" flow control.
 *
 *        Locking: caller holds tty_port->mutex
 *        Interrupts: caller dependent.
 *        This call must not sleep
 *
 * @set_ldisc: ``void ()(struct uart_port *port, struct ktermios *termios)``
 *
 *        Notifier for discipline change. See
 *        Documentation/driver-api/tty/tty_ldisc.rst.
 *
 *        Locking: caller holds tty_port->mutex
 *
 * @pm: ``void ()(struct uart_port *port, unsigned int state,
 *                 unsigned int oldstate)``
 *
 *        Perform any power management related activities on the specified @port.
 *        @state indicates the new state (defined by enum uart_pm_state),
 *        @oldstate indicates the previous state.
 *
 *        This function should not be used to grab any resources.
 *
 *        This will be called when the @port is initially opened and finally
 *        closed, except when the @port is also the system console. This will
 *        occur even if %CONFIG_PM is not set.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @type: ``const char *()(struct uart_port *port)``
 *
 *        Return a pointer to a string constant describing the specified @port,
 *        or return %NULL, in which case the string 'unknown' is substituted.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @release_port: ``void ()(struct uart_port *port)``
 *
 *        Release any memory and IO region resources currently in use by the
 *        @port.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @request_port: ``int ()(struct uart_port *port)``
 *
 *        Request any memory and IO region resources required by the port. If any
 *        fail, no resources should be registered when this function returns, and
 *        it should return -%EBUSY on failure.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @config_port: ``void ()(struct uart_port *port, int type)``
 *
 *        Perform any autoconfiguration steps required for the @port. @type
 *        contains a bit mask of the required configuration. %UART_CONFIG_TYPE
 *        indicates that the port requires detection and identification.
 *        @port->type should be set to the type found, or %PORT_UNKNOWN if no
 *        port was detected.
 *
 *        %UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
 *        which should be probed using standard kernel autoprobing techniques.
 *        This is not necessary on platforms where ports have interrupts
 *        internally hard wired (eg, system on a chip implementations).
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @verify_port: ``int ()(struct uart_port *port,
 *                        struct serial_struct *serinfo)``
 *
 *        Verify the new serial port information contained within @serinfo is
 *        suitable for this port type.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @ioctl: ``int ()(struct uart_port *port, unsigned int cmd,
 *                unsigned long arg)``
 *
 *        Perform any port specific IOCTLs. IOCTL commands must be defined using
 *        the standard numbering system found in <asm/ioctl.h>.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @poll_init: ``int ()(struct uart_port *port)``
 *
 *        Called by kgdb to perform the minimal hardware initialization needed to
 *        support @poll_put_char() and @poll_get_char(). Unlike @startup(), this
 *        should not request interrupts.
 *
 *        Locking: %tty_mutex and tty_port->mutex taken.
 *        Interrupts: n/a.
 *
 * @poll_put_char: ``void ()(struct uart_port *port, unsigned char ch)``
 *
 *        Called by kgdb to write a single character @ch directly to the serial
 *        @port. It can and should block until there is space in the TX FIFO.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *        This call must not sleep
 *
 * @poll_get_char: ``int ()(struct uart_port *port)``
 *
 *        Called by kgdb to read a single character directly from the serial
 *        port. If data is available, it should be returned; otherwise the
 *        function should return %NO_POLL_CHAR immediately.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *        This call must not sleep
 */
struct uart_ops {
        /* Transmit path and modem control. */
        unsigned int        (*tx_empty)(struct uart_port *port);
        void                (*set_mctrl)(struct uart_port *port, unsigned int mctrl);
        unsigned int        (*get_mctrl)(struct uart_port *port);
        void                (*stop_tx)(struct uart_port *port);
        void                (*start_tx)(struct uart_port *port);
        void                (*throttle)(struct uart_port *port);
        void                (*unthrottle)(struct uart_port *port);
        void                (*send_xchar)(struct uart_port *port, char ch);
        void                (*stop_rx)(struct uart_port *port);
        void                (*start_rx)(struct uart_port *port);
        void                (*enable_ms)(struct uart_port *port);
        void                (*break_ctl)(struct uart_port *port, int ctl);

        /* Port life cycle and line settings. */
        int                (*startup)(struct uart_port *port);
        void                (*shutdown)(struct uart_port *port);
        void                (*flush_buffer)(struct uart_port *port);
        void                (*set_termios)(struct uart_port *port,
                                       struct ktermios *new,
                                       const struct ktermios *old);
        void                (*set_ldisc)(struct uart_port *port,
                                     struct ktermios *termios);
        void                (*pm)(struct uart_port *port, unsigned int state,
                              unsigned int oldstate);

        /* Identification, resources and configuration. */
        const char        *(*type)(struct uart_port *port);
        void                (*release_port)(struct uart_port *port);
        int                (*request_port)(struct uart_port *port);
        void                (*config_port)(struct uart_port *port, int type);
        int                (*verify_port)(struct uart_port *port,
                                       struct serial_struct *serinfo);
        int                (*ioctl)(struct uart_port *port, unsigned int cmd,
                                 unsigned long arg);
#ifdef CONFIG_CONSOLE_POLL
        /* Polled I/O hooks; only present with CONFIG_CONSOLE_POLL. */
        int                (*poll_init)(struct uart_port *port);
        void                (*poll_put_char)(struct uart_port *port, unsigned char ch);
        int                (*poll_get_char)(struct uart_port *port);
#endif
};

/* Value uart_ops::poll_get_char() returns when no character is available. */
#define NO_POLL_CHAR                0x00ff0000
/*
 * Bits of the @type mask passed to uart_ops::config_port():
 * UART_CONFIG_TYPE requests port detection/identification,
 * UART_CONFIG_IRQ requests autoconfiguration of the interrupt line.
 */
#define UART_CONFIG_TYPE        (1 << 0)
#define UART_CONFIG_IRQ                (1 << 1)

/*
 * Per-port event counters, embedded in struct uart_port as @icount.
 *
 * Only @tx is updated by code visible in this header (the xmit helpers and
 * __uart_port_tx() add the number of transmitted characters to it).
 * NOTE(review): the remaining fields are presumably counters for modem line
 * changes (cts/dsr/rng/dcd), received characters (rx) and receive error
 * conditions (frame/overrun/parity/brk/buf_overrun) -- confirm against the
 * drivers that update them.
 */
struct uart_icount {
        __u32        cts;
        __u32        dsr;
        __u32        rng;
        __u32        dcd;
        __u32        rx;
        __u32        tx;
        __u32        frame;
        __u32        overrun;
        __u32        parity;
        __u32        brk;
        __u32        buf_overrun;
};

/*
 * upf_t is the type of uart_port::flags (UPF_* bits); it is 64 bits wide and
 * UPF_FULL_PROBE already uses bit 32.
 * upstat_t is the type of uart_port::status (UPSTAT_* bits).
 * Both are __bitwise, hence the __force casts in the flag definitions.
 */
typedef u64 __bitwise upf_t;
typedef unsigned int __bitwise upstat_t;

/*
 * I/O access style of a port, stored in uart_port::iotype.
 *
 * Apart from UPIO_UNKNOWN, every value is defined as the corresponding
 * SERIAL_IO_* constant.
 */
enum uart_iotype {
        UPIO_UNKNOWN        = -1,
        UPIO_PORT        = SERIAL_IO_PORT,        /* 8b I/O port access */
        UPIO_HUB6        = SERIAL_IO_HUB6,        /* Hub6 ISA card */
        UPIO_MEM        = SERIAL_IO_MEM,        /* driver-specific */
        UPIO_MEM32        = SERIAL_IO_MEM32,        /* 32b little endian */
        UPIO_AU                = SERIAL_IO_AU,                /* Au1x00 and RT288x type IO */
        UPIO_TSI        = SERIAL_IO_TSI,        /* Tsi108/109 type IO */
        UPIO_MEM32BE        = SERIAL_IO_MEM32BE,        /* 32b big endian */
        UPIO_MEM16        = SERIAL_IO_MEM16,        /* 16b little endian */
};

/*
 * State of a single UART port.
 *
 * Contains the port spinlock, the I/O addressing information, a set of
 * per-port function pointers, the flag words (@quirks, @flags, @status) and
 * the pointer to the driver's struct uart_ops. The UPQ_*, UPF_* and UPSTAT_*
 * macros are defined inline, right after the field they apply to.
 *
 * Take @lock through the uart_port_lock*() wrappers further down in this
 * file rather than directly; they additionally handle nbcon console
 * ownership.
 */
struct uart_port {
        spinlock_t                lock;                        /* port lock */
        unsigned long                iobase;                        /* in/out[bwl] */
        unsigned char __iomem        *membase;                /* read/write[bwl] */
        /* Register accessors; invoked via serial_port_in()/serial_port_out(). */
        u32                        (*serial_in)(struct uart_port *, unsigned int offset);
        void                        (*serial_out)(struct uart_port *, unsigned int offset, u32 val);
        void                        (*set_termios)(struct uart_port *,
                                               struct ktermios *new,
                                               const struct ktermios *old);
        void                        (*set_ldisc)(struct uart_port *,
                                             struct ktermios *);
        unsigned int                (*get_mctrl)(struct uart_port *);
        void                        (*set_mctrl)(struct uart_port *, unsigned int);
        unsigned int                (*get_divisor)(struct uart_port *,
                                               unsigned int baud,
                                               unsigned int *frac);
        void                        (*set_divisor)(struct uart_port *,
                                               unsigned int baud,
                                               unsigned int quot,
                                               unsigned int quot_frac);
        int                        (*startup)(struct uart_port *port);
        void                        (*shutdown)(struct uart_port *port);
        void                        (*throttle)(struct uart_port *port);
        void                        (*unthrottle)(struct uart_port *port);
        int                        (*handle_irq)(struct uart_port *);
        void                        (*pm)(struct uart_port *, unsigned int state,
                                      unsigned int old);
        void                        (*handle_break)(struct uart_port *);
        int                        (*rs485_config)(struct uart_port *,
                                                struct ktermios *termios,
                                                struct serial_rs485 *rs485);
        int                        (*iso7816_config)(struct uart_port *,
                                                  struct serial_iso7816 *iso7816);
        unsigned int                ctrl_id;                /* optional serial core controller id */
        unsigned int                port_id;                /* optional serial core port id */
        unsigned int                irq;                        /* irq number */
        unsigned long                irqflags;                /* irq flags  */
        unsigned int                uartclk;                /* base uart clock */
        unsigned int                fifosize;                /* tx fifo size */
        unsigned char                x_char;                        /* xon/xoff char */
        unsigned char                regshift;                /* reg offset shift */

        unsigned char                quirks;                        /* internal quirks */

        /* internal quirks must be updated while holding port mutex */
#define UPQ_NO_TXEN_TEST        BIT(0)

        enum uart_iotype        iotype;                        /* io access style */

        unsigned int                read_status_mask;        /* driver specific */
        unsigned int                ignore_status_mask;        /* driver specific */
        struct uart_state        *state;                        /* pointer to parent state */
        struct uart_icount        icount;                        /* statistics */

        /* Only modified under @lock; use uart_port_set_cons() to change it. */
        struct console                *cons;                        /* struct console, if any */
        /* flags must be updated while holding port mutex */
        upf_t                        flags;

        /*
         * These flags must be equivalent to the flags defined in
         * include/uapi/linux/tty_flags.h which are the userspace definitions
         * assigned from the serial_struct flags in uart_set_info()
         * [for bit definitions in the UPF_CHANGE_MASK]
         *
         * Bits [0..ASYNCB_LAST_USER] are userspace defined/visible/changeable
         * The remaining bits are serial-core specific and not modifiable by
         * userspace.
         */
#ifdef CONFIG_HAS_IOPORT
#define UPF_FOURPORT                ((__force upf_t) ASYNC_FOURPORT       /* 1  */ )
#else
#define UPF_FOURPORT                0
#endif
#define UPF_SAK                        ((__force upf_t) ASYNC_SAK            /* 2  */ )
#define UPF_SPD_HI                ((__force upf_t) ASYNC_SPD_HI         /* 4  */ )
#define UPF_SPD_VHI                ((__force upf_t) ASYNC_SPD_VHI        /* 5  */ )
#define UPF_SPD_CUST                ((__force upf_t) ASYNC_SPD_CUST   /* 0x0030 */ )
#define UPF_SPD_WARP                ((__force upf_t) ASYNC_SPD_WARP   /* 0x1010 */ )
#define UPF_SPD_MASK                ((__force upf_t) ASYNC_SPD_MASK   /* 0x1030 */ )
#define UPF_SKIP_TEST                ((__force upf_t) ASYNC_SKIP_TEST      /* 6  */ )
#define UPF_AUTO_IRQ                ((__force upf_t) ASYNC_AUTO_IRQ       /* 7  */ )
#define UPF_HARDPPS_CD                ((__force upf_t) ASYNC_HARDPPS_CD     /* 11 */ )
#define UPF_SPD_SHI                ((__force upf_t) ASYNC_SPD_SHI        /* 12 */ )
#define UPF_LOW_LATENCY                ((__force upf_t) ASYNC_LOW_LATENCY    /* 13 */ )
#define UPF_BUGGY_UART                ((__force upf_t) ASYNC_BUGGY_UART     /* 14 */ )
#define UPF_MAGIC_MULTIPLIER        ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ )

        /* Serial-core-only flag bits; bit 32 and up need the 64-bit upf_t. */
#define UPF_NO_THRE_TEST        ((__force upf_t) BIT_ULL(19))
/* Port has hardware-assisted h/w flow control */
#define UPF_AUTO_CTS                ((__force upf_t) BIT_ULL(20))
#define UPF_AUTO_RTS                ((__force upf_t) BIT_ULL(21))
#define UPF_HARD_FLOW                ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS))
/* Port has hardware-assisted s/w flow control */
#define UPF_SOFT_FLOW                ((__force upf_t) BIT_ULL(22))
#define UPF_CONS_FLOW                ((__force upf_t) BIT_ULL(23))
#define UPF_SHARE_IRQ                ((__force upf_t) BIT_ULL(24))
#define UPF_EXAR_EFR                ((__force upf_t) BIT_ULL(25))
#define UPF_BUG_THRE                ((__force upf_t) BIT_ULL(26))
/* The exact UART type is known and should not be probed.  */
#define UPF_FIXED_TYPE                ((__force upf_t) BIT_ULL(27))
#define UPF_BOOT_AUTOCONF        ((__force upf_t) BIT_ULL(28))
#define UPF_FIXED_PORT                ((__force upf_t) BIT_ULL(29))
#define UPF_DEAD                ((__force upf_t) BIT_ULL(30))
#define UPF_IOREMAP                ((__force upf_t) BIT_ULL(31))
#define UPF_FULL_PROBE                ((__force upf_t) BIT_ULL(32))

#define __UPF_CHANGE_MASK        0x17fff
#define UPF_CHANGE_MASK                ((__force upf_t) __UPF_CHANGE_MASK)
#define UPF_USR_MASK                ((__force upf_t) (UPF_SPD_MASK|UPF_LOW_LATENCY))

/* Build-time check: the change mask must not exceed ASYNC_FLAGS. */
#if __UPF_CHANGE_MASK > ASYNC_FLAGS
#error Change mask not equivalent to userspace-visible bit defines
#endif

        /*
         * Must hold termios_rwsem, port mutex and port lock to change;
         * can hold any one lock to read.
         */
        upstat_t                status;

#define UPSTAT_CTS_ENABLE        ((__force upstat_t) (1 << 0))
#define UPSTAT_DCD_ENABLE        ((__force upstat_t) (1 << 1))
#define UPSTAT_AUTORTS                ((__force upstat_t) (1 << 2))
#define UPSTAT_AUTOCTS                ((__force upstat_t) (1 << 3))
#define UPSTAT_AUTOXOFF                ((__force upstat_t) (1 << 4))
#define UPSTAT_SYNC_FIFO        ((__force upstat_t) (1 << 5))

        bool                        hw_stopped;                /* sw-assisted CTS flow state */
        unsigned int                mctrl;                        /* current modem ctrl settings */
        unsigned int                frame_time;                /* frame timing in ns */
        unsigned int                type;                        /* port type */
        const struct uart_ops        *ops;
        unsigned int                custom_divisor;
        unsigned int                line;                        /* port index */
        unsigned int                minor;
        resource_size_t                mapbase;                /* for ioremap */
        resource_size_t                mapsize;
        struct device                *dev;                        /* serial port physical parent device */
        struct serial_port_device *port_dev;                /* serial core port device */

        unsigned long                sysrq;                        /* sysrq timeout */
        u8                        sysrq_ch;                /* char for sysrq */
        unsigned char                has_sysrq;
        unsigned char                sysrq_seq;                /* index in sysrq_toggle_seq */

        unsigned char                hub6;                        /* this should be in the 8250 driver */
        unsigned char                suspended;
        unsigned char                console_reinit;
        const char                *name;                        /* port name */
        struct attribute_group        *attr_group;                /* port specific attributes */
        const struct attribute_group **tty_groups;        /* all attributes (serial core use only) */
        struct serial_rs485     rs485;
        struct serial_rs485        rs485_supported;        /* Supported mask for serial_rs485 */
        struct gpio_desc        *rs485_term_gpio;        /* enable RS485 bus termination */
        struct gpio_desc        *rs485_rx_during_tx_gpio; /* Output GPIO that sets the state of RS485 RX during TX */
        struct serial_iso7816   iso7816;
        void                        *private_data;                /* generic platform data pointer */
};

/*
 * Only for console->device_lock()/_unlock() callbacks and internal
 * port lock wrapper synchronization.
 *
 * Takes @up->lock with interrupts saved into *@flags and nothing else: unlike
 * uart_port_lock_irqsave() it performs no nbcon acquire.
 */
static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags)
{
        spin_lock_irqsave(&up->lock, *flags);
}

/*
 * Only for console->device_lock()/_unlock() callbacks and internal
 * port lock wrapper synchronization.
 *
 * Counterpart of __uart_port_lock_irqsave(): drops @up->lock and restores the
 * interrupt state from @flags, without any nbcon release.
 */
static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags)
{
        spin_unlock_irqrestore(&up->lock, flags);
}

/**
 * uart_port_set_cons - Safely set the @cons field for a uart
 * @up:                The uart port to set
 * @con:        The new console to set to
 *
 * This function must be used to set @up->cons. It uses the port lock to
 * synchronize with the port lock wrappers in order to ensure that the console
 * cannot change or disappear while another context is holding the port lock.
 */
static inline void uart_port_set_cons(struct uart_port *up, struct console *con)
{
        unsigned long flags;

        /*
         * Use the bare spinlock helpers: the nbcon-aware uart_port_lock*()
         * wrappers inspect @up->cons, which is the very field changed here.
         */
        __uart_port_lock_irqsave(up, &flags);
        up->cons = con;
        __uart_port_unlock_irqrestore(up, flags);
}

/*
 * Only for internal port lock wrapper usage.
 *
 * Tell whether @up is currently in use as an nbcon console: it must be a
 * console port whose console is on the console list, has CON_NBCON set and
 * provides a write_atomic() callback. The caller must hold @up->lock.
 */
static inline bool __uart_port_using_nbcon(struct uart_port *up)
{
        lockdep_assert_held_once(&up->lock);

        /* The common case: this port is not a console at all. */
        if (likely(!uart_console(up)))
                return false;

        /*
         * Both @up->cons and the membership of @up->cons->node in the console
         * list only change under the port lock, which is held here. The
         * console therefore cannot go away and its registration status cannot
         * change, so @up->cons->flags may be read directly.
         */
        return !hlist_unhashed_lockless(&up->cons->node) &&
               (up->cons->flags & CON_NBCON) &&
               up->cons->write_atomic;
}

/*
 * Only for internal port lock wrapper usage.
 *
 * One-shot attempt to acquire the nbcon console of @up. Ports that are not
 * used as an nbcon console need no acquire and count as success.
 */
static inline bool __uart_port_nbcon_try_acquire(struct uart_port *up)
{
        return !__uart_port_using_nbcon(up) ||
               nbcon_device_try_acquire(up->cons);
}

/*
 * Only for internal port lock wrapper usage.
 *
 * Acquire the nbcon console of @up, busy-waiting with cpu_relax() until
 * nbcon_device_try_acquire() succeeds. Does nothing when the port is not
 * used as an nbcon console.
 */
static inline void __uart_port_nbcon_acquire(struct uart_port *up)
{
        if (__uart_port_using_nbcon(up)) {
                while (!nbcon_device_try_acquire(up->cons))
                        cpu_relax();
        }
}

/*
 * Only for internal port lock wrapper usage.
 *
 * Release the nbcon console of @up; a no-op when the port is not used as an
 * nbcon console.
 */
static inline void __uart_port_nbcon_release(struct uart_port *up)
{
        if (__uart_port_using_nbcon(up))
                nbcon_device_release(up->cons);
}

/**
 * uart_port_lock - Lock the UART port
 * @up:                Pointer to UART port structure
 *
 * Takes @up->lock and then acquires the nbcon console if the port is used as
 * one. Pair with uart_port_unlock().
 */
static inline void uart_port_lock(struct uart_port *up)
{
        spin_lock(&up->lock);
        /* Must come second: the nbcon helper asserts that @up->lock is held. */
        __uart_port_nbcon_acquire(up);
}

/**
 * uart_port_lock_irq - Lock the UART port and disable interrupts
 * @up:                Pointer to UART port structure
 *
 * Takes @up->lock with spin_lock_irq() and then acquires the nbcon console if
 * the port is used as one. Pair with uart_port_unlock_irq().
 */
static inline void uart_port_lock_irq(struct uart_port *up)
{
        spin_lock_irq(&up->lock);
        /* Must come second: the nbcon helper asserts that @up->lock is held. */
        __uart_port_nbcon_acquire(up);
}

/**
 * uart_port_lock_irqsave - Lock the UART port, save and disable interrupts
 * @up:                Pointer to UART port structure
 * @flags:        Pointer to interrupt flags storage
 *
 * Takes @up->lock with spin_lock_irqsave(), storing the interrupt state in
 * *@flags, and then acquires the nbcon console if the port is used as one.
 * Pair with uart_port_unlock_irqrestore().
 */
static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags)
{
        spin_lock_irqsave(&up->lock, *flags);
        /* Must come second: the nbcon helper asserts that @up->lock is held. */
        __uart_port_nbcon_acquire(up);
}

/**
 * uart_port_trylock - Try to lock the UART port
 * @up:                Pointer to UART port structure
 *
 * Succeeds only if both @up->lock and, where applicable, the nbcon console
 * could be obtained without waiting. If the nbcon acquire fails, the
 * spinlock is dropped again before returning.
 *
 * Returns: True if lock was acquired, false otherwise
 */
static inline bool uart_port_trylock(struct uart_port *up)
{
        bool locked = false;

        if (spin_trylock(&up->lock)) {
                locked = __uart_port_nbcon_try_acquire(up);
                /* Do not keep the spinlock without nbcon ownership. */
                if (!locked)
                        spin_unlock(&up->lock);
        }

        return locked;
}

/**
 * uart_port_trylock_irqsave - Try to lock the UART port, save and disable interrupts
 * @up:                Pointer to UART port structure
 * @flags:        Pointer to interrupt flags storage
 *
 * Succeeds only if both @up->lock and, where applicable, the nbcon console
 * could be obtained without waiting. If the nbcon acquire fails, the
 * spinlock is dropped and the interrupt state restored before returning.
 *
 * Returns: True if lock was acquired, false otherwise
 */
static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags)
{
        bool locked = false;

        if (spin_trylock_irqsave(&up->lock, *flags)) {
                locked = __uart_port_nbcon_try_acquire(up);
                /* Do not keep the spinlock without nbcon ownership. */
                if (!locked)
                        spin_unlock_irqrestore(&up->lock, *flags);
        }

        return locked;
}

/**
 * uart_port_unlock - Unlock the UART port
 * @up:                Pointer to UART port structure
 *
 * Counterpart of uart_port_lock(): releases the nbcon console if the port is
 * used as one, then drops @up->lock.
 */
static inline void uart_port_unlock(struct uart_port *up)
{
        /* Release first: the nbcon helper asserts that @up->lock is held. */
        __uart_port_nbcon_release(up);
        spin_unlock(&up->lock);
}

/**
 * uart_port_unlock_irq - Unlock the UART port and re-enable interrupts
 * @up:                Pointer to UART port structure
 *
 * Counterpart of uart_port_lock_irq(): releases the nbcon console if the port
 * is used as one, then drops @up->lock with spin_unlock_irq().
 */
static inline void uart_port_unlock_irq(struct uart_port *up)
{
        /* Release first: the nbcon helper asserts that @up->lock is held. */
        __uart_port_nbcon_release(up);
        spin_unlock_irq(&up->lock);
}

/**
 * uart_port_unlock_irqrestore - Unlock the UART port, restore interrupts
 * @up:                Pointer to UART port structure
 * @flags:        The saved interrupt flags for restore
 *
 * Counterpart of uart_port_lock_irqsave(): releases the nbcon console if the
 * port is used as one, then drops @up->lock and restores @flags.
 */
static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags)
{
        /* Release first: the nbcon helper asserts that @up->lock is held. */
        __uart_port_nbcon_release(up);
        spin_unlock_irqrestore(&up->lock, flags);
}

/*
 * Guard definitions for the port lock wrappers. Each one pairs a lock helper
 * with its matching unlock helper; the _try variants use the corresponding
 * trylock helper instead. The irqsave guard carries an extra
 * "unsigned long flags" member for the saved interrupt state.
 * See <linux/cleanup.h> for the semantics of the DEFINE_*GUARD*() macros.
 */
DEFINE_GUARD(uart_port_lock, struct uart_port *, uart_port_lock(_T), uart_port_unlock(_T));
DEFINE_GUARD_COND(uart_port_lock, _try, uart_port_trylock(_T));

DEFINE_GUARD(uart_port_lock_irq, struct uart_port *, uart_port_lock_irq(_T),
             uart_port_unlock_irq(_T));

DEFINE_LOCK_GUARD_1(uart_port_lock_irqsave, struct uart_port,
                    uart_port_lock_irqsave(_T->lock, &_T->flags),
                    uart_port_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags);
DEFINE_LOCK_GUARD_1_COND(uart_port_lock_irqsave, _try,
                         uart_port_trylock_irqsave(_T->lock, &_T->flags));

/*
 * Read the port register at @offset through the port's ->serial_in()
 * callback and return its value.
 *
 * NOTE(review): ->serial_in() takes an unsigned int offset and returns u32,
 * while this wrapper uses int for both -- confirm that the implicit
 * conversions are intended.
 */
static inline int serial_port_in(struct uart_port *up, int offset)
{
        return up->serial_in(up, offset);
}

/*
 * Write @value to the port register at @offset through the port's
 * ->serial_out() callback.
 *
 * NOTE(review): ->serial_out() takes an unsigned int offset and a u32 value,
 * while this wrapper uses int for both -- confirm that the implicit
 * conversions are intended.
 */
static inline void serial_port_out(struct uart_port *up, int offset, int value)
{
        up->serial_out(up, offset, value);
}

/**
 * enum uart_pm_state - power states for UARTs
 * @UART_PM_STATE_ON: UART is powered, up and operational
 * @UART_PM_STATE_OFF: UART is powered off
 * @UART_PM_STATE_UNDEFINED: sentinel
 *
 * These are the values passed as @state and @oldstate to the uart_ops pm()
 * callback and stored in uart_state::pm_state.
 */
enum uart_pm_state {
        UART_PM_STATE_ON = 0,
        UART_PM_STATE_OFF = 3, /* number taken from ACPI */
        UART_PM_STATE_UNDEFINED,
};

/*
 * This is the state information which is persistent across opens.
 *
 * uart_port::state points to this structure; the transmit helpers in this
 * header reach the transmit kfifo through @port (port.xmit_fifo).
 */
struct uart_state {
        struct tty_port                port;

        /* Current power state, see enum uart_pm_state. */
        enum uart_pm_state        pm_state;

        /*
         * NOTE(review): @refcount and @remove_wait presumably let port removal
         * wait for outstanding users of @uart_port -- confirm in serial core.
         */
        atomic_t                refcount;
        wait_queue_head_t        remove_wait;
        struct uart_port        *uart_port;
};

/* NOTE(review): presumably the size of the transmit buffer -- confirm. */
#define UART_XMIT_SIZE        PAGE_SIZE


/*
 * number of characters left in xmit buffer before we ask for more
 * (__uart_port_tx() calls uart_write_wakeup() once fewer than this many
 * characters remain in the xmit fifo)
 */
#define WAKEUP_CHARS                256

/**
 * uart_xmit_advance - Advance xmit buffer and account Tx'ed chars
 * @up: uart_port structure describing the port
 * @chars: number of characters sent
 *
 * Accounts @chars transmitted characters in @up's icount.tx and skips the
 * same number of characters in the port's xmit fifo.
 */
static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars)
{
        up->icount.tx += chars;
        kfifo_skip_count(&up->state->port.xmit_fifo, chars);
}

/**
 * uart_fifo_out - Take characters out of the xmit fifo and account them
 * @up: uart_port structure describing the port
 * @buf: destination buffer
 * @chars: maximum number of characters to take
 *
 * Calls kfifo_out() on the port's xmit fifo and adds the number of
 * characters it reports to @up's icount.tx.
 *
 * Returns: the number of characters reported by kfifo_out().
 */
static inline unsigned int uart_fifo_out(struct uart_port *up,
                unsigned char *buf, unsigned int chars)
{
        unsigned int taken;

        taken = kfifo_out(&up->state->port.xmit_fifo, buf, chars);
        up->icount.tx += taken;

        return taken;
}

/**
 * uart_fifo_get - Take one character out of the xmit fifo and account it
 * @up: uart_port structure describing the port
 * @ch: where to store the character
 *
 * Calls kfifo_get() on the port's xmit fifo and adds its result to @up's
 * icount.tx.
 *
 * Returns: the result of kfifo_get(), i.e. the number of characters taken.
 */
static inline unsigned int uart_fifo_get(struct uart_port *up,
                unsigned char *ch)
{
        unsigned int taken = kfifo_get(&up->state->port.xmit_fifo, ch);

        up->icount.tx += taken;

        return taken;
}

/* Only pointers to these are used below, so forward declarations suffice. */
struct module;
struct tty_driver;

/*
 * Driver-wide (as opposed to per-port) description.
 *
 * NOTE(review): judging by the field names, @major/@minor are the device
 * numbers, @nr the number of ports and @cons the driver's console, if any --
 * confirm against the serial core registration code.
 */
struct uart_driver {
        struct module                *owner;
        const char                *driver_name;
        const char                *dev_name;
        int                         major;
        int                         minor;
        int                         nr;
        struct console                *cons;

        /*
         * these are private; the low level driver should not
         * touch these; they should be initialised to NULL
         */
        struct uart_state        *state;
        struct tty_driver        *tty_driver;
};

/*
 * Defined outside this header. __uart_port_tx() calls it once fewer than
 * WAKEUP_CHARS characters are left in the xmit fifo.
 */
void uart_write_wakeup(struct uart_port *port);

/**
 * enum UART_TX_FLAGS -- flags for uart_port_tx_flags()
 *
 * @UART_TX_NOSTOP: don't call port->ops->stop_tx() on empty buffer
 *
 * The flags are also accepted by uart_port_tx_limited_flags(); the helpers
 * without a @flags parameter pass 0.
 */
enum UART_TX_FLAGS {
        UART_TX_NOSTOP = BIT(0),
};

/*
 * __uart_port_tx - common transmit loop behind the uart_port_tx*() helpers
 * @uport: uart port (evaluated once)
 * @ch: lvalue receiving each character before @put_char is evaluated
 * @flags: UART_TX_* flags
 * @tx_ready: expression, true while the HW can accept more data
 * @put_char: expression writing @ch to the HW
 * @tx_done: expression evaluated once after the loop
 * @for_test: additional loop condition, evaluated before @tx_ready
 * @for_post: expression evaluated after every transmitted character
 *
 * While @for_test and @tx_ready hold, one character per iteration is sent:
 * a pending x_char first (and then cleared), otherwise the next character
 * from the xmit fifo. The loop stops early when uart_tx_stopped() is true or
 * the fifo is empty. Each sent character, x_char included, runs @for_post
 * and increments icount.tx; leaving the loop early does neither.
 *
 * After @tx_done, uart_write_wakeup() is called if fewer than WAKEUP_CHARS
 * characters remain, and ops->stop_tx() if none remain and UART_TX_NOSTOP is
 * not set in @flags.
 *
 * All macro-local variables carry a "__" prefix so that the caller-supplied
 * expressions cannot accidentally bind to them.
 *
 * Evaluates to the number of characters left in the xmit fifo.
 */
#define __uart_port_tx(uport, ch, flags, tx_ready, put_char, tx_done,         \
                       for_test, for_post)                                    \
({                                                                            \
        struct uart_port *__port = (uport);                                   \
        struct tty_port *__tport = &__port->state->port;                      \
        unsigned int __pending;                                               \
                                                                              \
        for (; (for_test) && (tx_ready); (for_post), __port->icount.tx++) {   \
                if (__port->x_char) {                                         \
                        (ch) = __port->x_char;                                \
                        (put_char);                                           \
                        __port->x_char = 0;                                   \
                        continue;                                             \
                }                                                             \
                                                                              \
                if (uart_tx_stopped(__port))                                  \
                        break;                                                \
                                                                              \
                if (!kfifo_get(&__tport->xmit_fifo, &(ch)))                   \
                        break;                                                \
                                                                              \
                (put_char);                                                   \
        }                                                                     \
                                                                              \
        (tx_done);                                                            \
                                                                              \
        __pending = kfifo_len(&__tport->xmit_fifo);                           \
        if (__pending < WAKEUP_CHARS) {                                       \
                uart_write_wakeup(__port);                                    \
                                                                              \
                if (!((flags) & UART_TX_NOSTOP) && __pending == 0)            \
                        __port->ops->stop_tx(__port);                         \
        }                                                                     \
                                                                              \
        __pending;                                                            \
})

/**
 * uart_port_tx_limited -- transmit helper for uart_port with count limiting
 * @port: uart port
 * @ch: variable to store a character to be written to the HW
 * @count: a limit of characters to send
 * @tx_ready: expression telling whether the HW can accept more data
 * @put_char: expression writing a character to the HW
 * @tx_done: expression evaluated after the loop is done
 *
 * This helper transmits characters from the xmit buffer to the hardware using
 * @put_char(). It does so until @count characters are sent and while @tx_ready
 * evaluates to true. A pending x_char is sent first and counts towards
 * @count like any other character.
 *
 * Returns: the number of characters in the xmit buffer when done.
 *
 * The expression in macro parameters shall be designed as follows:
 *  * **tx_ready:** should evaluate to true if the HW can accept more data to
 *    be sent. This parameter can be %true, which means the HW is always ready.
 *  * **put_char:** shall write @ch to the device of @port.
 *  * **tx_done:** when the write loop is done, this can perform arbitrary
 *    action before potential invocation of ops->stop_tx() happens. If the
 *    driver does not need to do anything, use e.g. ({}).
 *
 * For all of them, @port->lock is held, interrupts are locally disabled and
 * the expressions must not sleep.
 */
#define uart_port_tx_limited(port, ch, count, tx_ready, put_char, tx_done) ({ \
        unsigned int __count = (count);                                              \
        __uart_port_tx(port, ch, 0, tx_ready, put_char, tx_done, __count,     \
                        __count--);                                              \
})

/**
 * uart_port_tx_limited_flags -- transmit helper for uart_port with count limiting with flags
 * @port: uart port
 * @ch: variable to store a character to be written to the HW
 * @flags: %UART_TX_NOSTOP or similar; with %UART_TX_NOSTOP set,
 *	   __uart_port_tx() does not call ops->stop_tx() when the xmit buffer
 *	   has been drained
 * @count: a limit of characters to send; evaluated exactly once
 * @tx_ready: expression that evaluates to true while the HW can accept more
 *	      data
 * @put_char: expression that writes @ch to the device
 * @tx_done: expression evaluated after the loop is done
 *
 * Identical to uart_port_tx_limited() except that @flags is forwarded to
 * __uart_port_tx() instead of 0.
 *
 * See uart_port_tx_limited() for more details.
 */
#define uart_port_tx_limited_flags(port, ch, flags, count, tx_ready, put_char, tx_done) ({ \
        unsigned int __count = (count);                                                           \
        __uart_port_tx(port, ch, flags, tx_ready, put_char, tx_done, __count,                   \
                        __count--);                                                           \
})

/**
 * uart_port_tx -- transmit helper for uart_port
 * @port: uart port
 * @ch: variable to store a character to be written to the HW
 * @tx_ready: expression that evaluates to true while the HW can accept more
 *	      data
 * @put_char: expression that writes @ch to the device
 *
 * Unlimited variant of uart_port_tx_limited(): it forwards to
 * __uart_port_tx() with flags 0, an empty tx_done action (``({})``), the
 * constant %true where the limited variant passes its remaining-count test,
 * and an empty expression where the limited variant decrements its counter.
 *
 * See uart_port_tx_limited() for more details.
 */
#define uart_port_tx(port, ch, tx_ready, put_char)                        \
        __uart_port_tx(port, ch, 0, tx_ready, put_char, ({}), true, ({}))


/**
 * uart_port_tx_flags -- transmit helper for uart_port with flags
 * @port: uart port
 * @ch: variable to store a character to be written to the HW
 * @flags: %UART_TX_NOSTOP or similar; forwarded unchanged to
 *	   __uart_port_tx()
 * @tx_ready: expression that evaluates to true while the HW can accept more
 *	      data
 * @put_char: expression that writes @ch to the device
 *
 * Same as uart_port_tx() (no count limit, no tx_done action) except that
 * @flags is passed on instead of 0.
 *
 * See uart_port_tx_limited() for more details.
 */
#define uart_port_tx_flags(port, ch, flags, tx_ready, put_char)                \
        __uart_port_tx(port, ch, flags, tx_ready, put_char, ({}), true, ({}))
/*
 * Baud rate helpers.
 */
void uart_update_timeout(struct uart_port *port, unsigned int cflag,
                         unsigned int baud);
unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios,
                                const struct ktermios *old, unsigned int min,
                                unsigned int max);
unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud);

/*
 * Calculates FIFO drain time.
 */
static inline unsigned long uart_fifo_timeout(struct uart_port *port)
{
        u64 fifo_timeout = (u64)READ_ONCE(port->frame_time) * port->fifosize;

        /* Add .02 seconds of slop */
        fifo_timeout += 20 * NSEC_PER_MSEC;

        return max(nsecs_to_jiffies(fifo_timeout), 1UL);
}

/*
 * Base timer interval for polling, in jiffies: roughly half of the FIFO drain
 * time minus two jiffies, with a floor of one jiffy for short drain times.
 */
static inline unsigned long uart_poll_timeout(struct uart_port *port)
{
        unsigned long drain = uart_fifo_timeout(port);

        if (drain <= 6)
                return 1;

        return drain / 2 - 2;
}

/*
 * Console helpers.
 */
/*
 * Per-instance state handed to an earlycon setup callback
 * (see struct earlycon_id::setup).
 */
struct earlycon_device {
        struct console *con;            /* associated console - presumably the one being registered; confirm */
        struct uart_port port;          /* embedded port description (by value, not a pointer) */
        char options[32];                /* e.g., 115200n8 */
        unsigned int baud;              /* NOTE(review): presumably the baud rate parsed from options - confirm */
};

/*
 * One entry of the earlycon table (__earlycon_table[]), as emitted by
 * OF_EARLYCON_DECLARE(): a stringified name, a compatible string (empty for
 * EARLYCON_DECLARE()) and the setup callback.
 */
struct earlycon_id {
        char        name[15];
        char        name_term;        /* In case compiler didn't '\0' term name */
        char        compatible[128];
        /* Setup callback; receives the device and the option string. */
        int        (*setup)(struct earlycon_device *, const char *options);
};

extern const struct earlycon_id __earlycon_table[];
extern const struct earlycon_id __earlycon_table_end[];

/*
 * Attribute attached to earlycon declarations: __used when earlycon support
 * is enabled and the code is not being built as a module, __maybe_unused in
 * every other configuration.
 */
#if defined(CONFIG_SERIAL_EARLYCON) && !defined(MODULE)
#define EARLYCON_USED_OR_UNUSED        __used
#else
#define EARLYCON_USED_OR_UNUSED        __maybe_unused
#endif

/*
 * OF_EARLYCON_DECLARE - define one struct earlycon_id table entry
 * @_name:  bare identifier; stringified into .name and pasted into the
 *          entry's unique symbol name
 * @compat: string stored in .compatible
 * @fn:     setup callback stored in .setup
 *
 * The entry is a static const object placed in the "__earlycon_table"
 * section, aligned like struct earlycon_id and tagged with
 * EARLYCON_USED_OR_UNUSED.
 */
#define OF_EARLYCON_DECLARE(_name, compat, fn)                                \
        static const struct earlycon_id __UNIQUE_ID(__earlycon_##_name) \
                EARLYCON_USED_OR_UNUSED  __section("__earlycon_table")  \
                __aligned(__alignof__(struct earlycon_id))                \
                = { .name = __stringify(_name),                                \
                    .compatible = compat,                                \
                    .setup = fn }

/* Same as OF_EARLYCON_DECLARE() with an empty compatible string. */
#define EARLYCON_DECLARE(_name, fn)        OF_EARLYCON_DECLARE(_name, "", fn)

int of_setup_earlycon(const struct earlycon_id *match, unsigned long node,
                      const char *options);

/*
 * With CONFIG_SERIAL_EARLYCON the flag and setup_earlycon() are provided
 * elsewhere.  Without it, the flag is a zero-initialised (false) constant and
 * setup_earlycon() is a stub that ignores its argument and returns 0.
 */
#ifdef CONFIG_SERIAL_EARLYCON
extern bool earlycon_acpi_spcr_enable __initdata;
int setup_earlycon(char *buf);
#else
static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED;
static inline int setup_earlycon(char *buf) { return 0; }
#endif

/* Variant of uart_console_registered() when the console_list_lock is held. */
static inline bool uart_console_registered_locked(struct uart_port *port)
{
        return uart_console(port) && console_is_registered_locked(port->cons);
}

static inline bool uart_console_registered(struct uart_port *port)
{
        return uart_console(port) && console_is_registered(port->cons);
}

int uart_parse_earlycon(char *p, enum uart_iotype *iotype,
                        resource_size_t *addr, char **options);
void uart_parse_options(const char *options, int *baud, int *parity, int *bits,
                        int *flow);
int uart_set_options(struct uart_port *port, struct console *co, int baud,
                     int parity, int bits, int flow);
struct tty_driver *uart_console_device(struct console *co, int *index);
void uart_console_write(struct uart_port *port, const char *s,
                        unsigned int count,
                        void (*putchar)(struct uart_port *, unsigned char));

/*
 * Port/driver registration/removal
 */
int uart_register_driver(struct uart_driver *uart);
void uart_unregister_driver(struct uart_driver *uart);
int uart_add_one_port(struct uart_driver *reg, struct uart_port *port);
void uart_remove_one_port(struct uart_driver *reg, struct uart_port *port);
int uart_read_port_properties(struct uart_port *port);
int uart_read_and_validate_port_properties(struct uart_port *port);
bool uart_match_port(const struct uart_port *port1,
                const struct uart_port *port2);

/*
 * Power Management
 */
int uart_suspend_port(struct uart_driver *reg, struct uart_port *port);
int uart_resume_port(struct uart_driver *reg, struct uart_port *port);

static inline int uart_tx_stopped(struct uart_port *port)
{
        struct tty_struct *tty = port->state->port.tty;
        if ((tty && tty->flow.stopped) || port->hw_stopped)
                return 1;
        return 0;
}

static inline bool uart_cts_enabled(struct uart_port *uport)
{
        return !!(uport->status & UPSTAT_CTS_ENABLE);
}

/*
 * True when CTS flow control is enabled (UPSTAT_CTS_ENABLE) while
 * UPSTAT_AUTOCTS is clear in the port status.
 */
static inline bool uart_softcts_mode(struct uart_port *uport)
{
        upstat_t cts_bits;

        cts_bits = uport->status & (UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS);

        return cts_bits == UPSTAT_CTS_ENABLE;
}

/*
 * The following are helper functions for the low level drivers.
 */

void uart_handle_dcd_change(struct uart_port *uport, bool active);
void uart_handle_cts_change(struct uart_port *uport, bool active);

void uart_insert_char(struct uart_port *port, unsigned int status,
                      unsigned int overrun, u8 ch, u8 flag);

void uart_xchar_out(struct uart_port *uport, int offset);

#ifdef CONFIG_MAGIC_SYSRQ_SERIAL
/*
 * Length, in jiffies, of the window opened by uart_handle_break()
 * (port->sysrq = jiffies + SYSRQ_TIMEOUT) during which a received character
 * is treated as a SysRq command by the *_sysrq_char() helpers.
 */
#define SYSRQ_TIMEOUT        (HZ * 5)

bool uart_try_toggle_sysrq(struct uart_port *port, u8 ch);

/*
 * Process a received character while a SysRq window may be open.
 *
 * Returns 0 immediately when no window is armed (port->sysrq == 0).  If @ch
 * is non-zero and the window has not expired, the character is either
 * dispatched via handle_sysrq() (when sysrq_mask() is non-zero) or offered to
 * uart_try_toggle_sysrq(); both cases return 1.  Otherwise the window is
 * closed and 0 is returned.
 */
static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch)
{
        bool window_open;

        if (!port->sysrq)
                return 0;

        window_open = ch && time_before(jiffies, port->sysrq);

        if (window_open && sysrq_mask()) {
                handle_sysrq(ch);
                port->sysrq = 0;
                return 1;
        }

        /* The toggle path leaves port->sysrq untouched on success. */
        if (window_open && uart_try_toggle_sysrq(port, ch))
                return 1;

        port->sysrq = 0;

        return 0;
}

/*
 * Deferred variant of uart_handle_sysrq_char(): instead of calling
 * handle_sysrq() directly, the character is stored in port->sysrq_ch so that
 * uart_unlock_and_check_sysrq*() can dispatch it after the port lock has been
 * dropped.  Return values match uart_handle_sysrq_char().
 */
static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch)
{
        bool window_open;

        if (!port->sysrq)
                return 0;

        window_open = ch && time_before(jiffies, port->sysrq);

        if (window_open && sysrq_mask()) {
                port->sysrq_ch = ch;
                port->sysrq = 0;
                return 1;
        }

        /* The toggle path leaves port->sysrq untouched on success. */
        if (window_open && uart_try_toggle_sysrq(port, ch))
                return 1;

        port->sysrq = 0;

        return 0;
}

/*
 * Release the port lock and then dispatch a SysRq character that
 * uart_prepare_sysrq_char() may have stashed in port->sysrq_ch.  The stashed
 * character is fetched and cleared before the unlock; handle_sysrq() runs
 * after it.  Ports without has_sysrq are simply unlocked.
 */
static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
{
        u8 stashed = 0;

        if (port->has_sysrq) {
                stashed = port->sysrq_ch;
                port->sysrq_ch = 0;
        }

        uart_port_unlock(port);

        if (stashed)
                handle_sysrq(stashed);
}

/*
 * Same as uart_unlock_and_check_sysrq(), but the lock is released with
 * uart_port_unlock_irqrestore() using the caller-supplied @flags.
 */
static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port,
                unsigned long flags)
{
        u8 stashed = 0;

        if (port->has_sysrq) {
                stashed = port->sysrq_ch;
                port->sysrq_ch = 0;
        }

        uart_port_unlock_irqrestore(port, flags);

        if (stashed)
                handle_sysrq(stashed);
}
#else        /* CONFIG_MAGIC_SYSRQ_SERIAL */
/*
 * Stubs used when CONFIG_MAGIC_SYSRQ_SERIAL is not set: the character helpers
 * never consume a character (always return 0) and the unlock helpers reduce
 * to a plain port unlock.
 */
static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch)
{
        return 0;
}
static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch)
{
        return 0;
}
static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
{
        uart_port_unlock(port);
}
static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port,
                unsigned long flags)
{
        uart_port_unlock_irqrestore(port, flags);
}
#endif        /* CONFIG_MAGIC_SYSRQ_SERIAL */

/*
 * Break handling: SysRq arming and SAK.
 *
 * The port's optional handle_break() callback runs first.  With
 * CONFIG_MAGIC_SYSRQ_SERIAL, on a SysRq-capable console port a break either
 * arms the SysRq window (returns 1) or, if a window was already armed,
 * cancels it.  In every non-arming case do_SAK() is invoked when UPF_SAK is
 * set, and 0 is returned.
 */
static inline int uart_handle_break(struct uart_port *port)
{
        struct uart_state *state = port->state;

        if (port->handle_break)
                port->handle_break(port);

#ifdef CONFIG_MAGIC_SYSRQ_SERIAL
        if (port->has_sysrq && uart_console(port)) {
                if (port->sysrq) {
                        /* Second break inside the window: disarm it. */
                        port->sysrq = 0;
                } else {
                        port->sysrq = jiffies + SYSRQ_TIMEOUT;
                        return 1;
                }
        }
#endif
        if (!(port->flags & UPF_SAK))
                return 0;

        do_SAK(state->port.tty);
        return 0;
}

/*
 *        UART_ENABLE_MS - determine if port should enable modem status irqs
 *
 *        Evaluates to 1 when the port has UPF_HARDPPS_CD set, or @cflag has
 *        CRTSCTS set, or @cflag has CLOCAL clear; evaluates to 0 otherwise.
 */
#define UART_ENABLE_MS(port,cflag)                                \
        ((((port)->flags & UPF_HARDPPS_CD) != 0) ||                \
         (((cflag) & CRTSCTS) != 0) ||                                \
         (((cflag) & CLOCAL) == 0))

int uart_get_rs485_mode(struct uart_port *port);
#endif /* LINUX_SERIAL_CORE_H */









































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Authentication token and access key management
 *
 * Copyright (C) 2004, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * See Documentation/security/keys/core.rst for information on keys/keyrings.
 */

#ifndef _LINUX_KEY_H
#define _LINUX_KEY_H

#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>
#include <linux/sysctl.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/assoc_array.h>
#include <linux/refcount.h>
#include <linux/time64.h>

#ifdef __KERNEL__
#include <linux/uidgid.h>

/* key handle serial number */
typedef int32_t key_serial_t;

/* key handle permissions mask */
typedef uint32_t key_perm_t;

struct key;
struct net;

#ifdef CONFIG_KEYS

#undef KEY_DEBUGGING

#define KEY_POS_VIEW        0x01000000        /* possessor can view a key's attributes */
#define KEY_POS_READ        0x02000000        /* possessor can read key payload / view keyring */
#define KEY_POS_WRITE        0x04000000        /* possessor can update key payload / add link to keyring */
#define KEY_POS_SEARCH        0x08000000        /* possessor can find a key in search / search a keyring */
#define KEY_POS_LINK        0x10000000        /* possessor can create a link to a key/keyring */
#define KEY_POS_SETATTR        0x20000000        /* possessor can set key attributes */
#define KEY_POS_ALL        0x3f000000

#define KEY_USR_VIEW        0x00010000        /* user permissions... */
#define KEY_USR_READ        0x00020000
#define KEY_USR_WRITE        0x00040000
#define KEY_USR_SEARCH        0x00080000
#define KEY_USR_LINK        0x00100000
#define KEY_USR_SETATTR        0x00200000
#define KEY_USR_ALL        0x003f0000

#define KEY_GRP_VIEW        0x00000100        /* group permissions... */
#define KEY_GRP_READ        0x00000200
#define KEY_GRP_WRITE        0x00000400
#define KEY_GRP_SEARCH        0x00000800
#define KEY_GRP_LINK        0x00001000
#define KEY_GRP_SETATTR        0x00002000
#define KEY_GRP_ALL        0x00003f00

#define KEY_OTH_VIEW        0x00000001        /* third party permissions... */
#define KEY_OTH_READ        0x00000002
#define KEY_OTH_WRITE        0x00000004
#define KEY_OTH_SEARCH        0x00000008
#define KEY_OTH_LINK        0x00000010
#define KEY_OTH_SETATTR        0x00000020
#define KEY_OTH_ALL        0x0000003f

#define KEY_PERM_UNDEF        0xffffffff

/*
 * The permissions required on a key that we're looking up.
 *
 * This is the type of the need_perm argument of lookup_user_key().  The
 * first group of values names an ordinary permission; the last three
 * ("Special") describe overrides or a deferred check rather than a
 * permission bit.
 */
enum key_need_perm {
        KEY_NEED_UNSPECIFIED,        /* Needed permission unspecified */
        KEY_NEED_VIEW,                /* Require permission to view attributes */
        KEY_NEED_READ,                /* Require permission to read content */
        KEY_NEED_WRITE,                /* Require permission to update / modify */
        KEY_NEED_SEARCH,        /* Require permission to search (keyring) or find (key) */
        KEY_NEED_LINK,                /* Require permission to link */
        KEY_NEED_SETATTR,        /* Require permission to change attributes */
        KEY_NEED_UNLINK,        /* Require permission to unlink key */
        KEY_SYSADMIN_OVERRIDE,        /* Special: override by CAP_SYS_ADMIN */
        KEY_AUTHTOKEN_OVERRIDE,        /* Special: override by possession of auth token */
        KEY_DEFER_PERM_CHECK,        /* Special: permission check is deferred */
};

/*
 * Bit flags for key lookups; KEY_LOOKUP_ALL is the union of the two
 * individual bits.
 * NOTE(review): presumably consumed via the flags argument of
 * lookup_user_key() - confirm against the implementation.
 */
enum key_lookup_flag {
        KEY_LOOKUP_CREATE  = 1 << 0,
        KEY_LOOKUP_PARTIAL = 1 << 1,
        KEY_LOOKUP_ALL     = KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL,
};

struct seq_file;
struct user_struct;
struct signal_struct;
struct cred;

struct key_type;
struct key_owner;
struct key_tag;
struct keyring_list;
struct keyring_name;

/*
 * Tag object referenced by keys and index keys as their "domain of
 * operation" (see the domain_tag members of struct keyring_index_key and
 * struct key).
 */
struct key_tag {
        struct rcu_head                rcu;                /* NOTE(review): presumably for RCU-deferred freeing - confirm */
        refcount_t                usage;                /* reference count on the tag */
        bool                        removed;        /* T when subject removed */
};

/*
 * Index key describing a key: hash, description length plus the first few
 * description characters, key type, domain tag and the full description.
 *
 * desc_len and desc[] share storage with the single word x, so both can be
 * handled as one unsigned long; the member order is chosen per endianness so
 * that desc_len always occupies the least significant bits of x.
 */
struct keyring_index_key {
        /* [!] If this structure is altered, the union in struct key must change too! */
        unsigned long                hash;                        /* Hash value */
        union {
                struct {
#ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */
                        u16        desc_len;
                        char        desc[sizeof(long) - 2];        /* First few chars of description */
#else
                        char        desc[sizeof(long) - 2];        /* First few chars of description */
                        u16        desc_len;
#endif
                };
                unsigned long x;
        };
        struct key_type                *type;
        struct key_tag                *domain_tag;        /* Domain of operation */
        const char                *description;
};

/*
 * Key payload storage: either one RCU-managed pointer or four plain pointer
 * slots sharing the same memory.
 */
union key_payload {
        /* Accessed through dereference_key_rcu(), dereference_key_locked() and rcu_assign_keypointer(). */
        void __rcu                *rcu_data0;
        void                        *data[4];
};

/*****************************************************************************/
/*
 * Key references carrying a "possessed" flag
 *
 * A key_ref_t is a struct key pointer whose least significant bit is borrowed
 * to record whether the calling process possesses that key in one of its
 * keyrings.  The pointed-to type is deliberately never defined, so that the
 * compiler rejects any attempt to dereference a key_ref_t without converting
 * it first.
 *
 * Three helpers assemble and take apart such references:
 *  - make_key_ref() combines a key pointer with the possession flag,
 *  - key_ref_to_ptr() masks the flag off and yields the key pointer,
 *  - is_key_possessed() reports the flag.
 */
typedef struct __key_reference_with_attributes *key_ref_t;

static inline key_ref_t make_key_ref(const struct key *key,
                                     bool possession)
{
        unsigned long tagged = (unsigned long) key;

        /* bool promotes to 0 or 1, i.e. exactly the bottom bit */
        tagged |= possession;

        return (key_ref_t) tagged;
}

static inline struct key *key_ref_to_ptr(const key_ref_t key_ref)
{
        unsigned long bits = (unsigned long) key_ref;

        bits &= ~1UL;

        return (struct key *) bits;
}

static inline bool is_key_possessed(const key_ref_t key_ref)
{
        return ((unsigned long) key_ref & 1UL) != 0;
}

/*
 * Callback type used to vet the addition of a link to a keyring.  It is
 * given the destination keyring, the type and payload of the candidate key
 * and a restriction key.
 * NOTE(review): presumably returns 0 to allow and a negative errno to refuse
 * the link - confirm against the callers.
 */
typedef int (*key_restrict_link_func_t)(struct key *dest_keyring,
                                        const struct key_type *type,
                                        const union key_payload *payload,
                                        struct key *restriction_key);

/*
 * Link restriction attached to a keyring through struct key::restrict_link:
 * the check callback plus a key and key type that go with it.
 */
struct key_restriction {
        key_restrict_link_func_t check;
        struct key *key;
        struct key_type *keytype;
};

/*
 * Non-negative values of struct key::state.  Negative state values are not
 * listed here: they carry a rejection error instead (see key_is_negative()).
 */
enum key_state {
        KEY_IS_UNINSTANTIATED        = 0,
        KEY_IS_POSITIVE                = 1,        /* Positively instantiated */
};

/*****************************************************************************/
/*
 * authentication token / access credential / keyring
 * - types of key include:
 *   - keyrings
 *   - disk encryption IDs
 *   - Kerberos TGTs and tickets
 *
 * Several members are unions: the same storage is interpreted differently
 * depending on what the key currently is (e.g. ordinary payload vs. keyring
 * bookkeeping).
 */
struct key {
        refcount_t                usage;                /* number of references */
        key_serial_t                serial;                /* key serial number */
        /* NOTE(review): presumably serial_node while the key is live and graveyard_link once queued for destruction - confirm */
        union {
                struct list_head graveyard_link;
                struct rb_node        serial_node;
        };
#ifdef CONFIG_KEY_NOTIFICATIONS
        struct watch_list        *watchers;        /* Entities watching this key for changes */
#endif
        struct rw_semaphore        sem;                /* change vs change sem */
        struct key_user                *user;                /* owner of this key */
        void                        *security;        /* security data for this key */
        union {
                time64_t        expiry;                /* time at which key expires (or 0) */
                time64_t        revoked_at;        /* time at which key was revoked */
        };
        time64_t                last_used_at;        /* last time used for LRU keyring discard */
        kuid_t                        uid;
        kgid_t                        gid;
        key_perm_t                perm;                /* access permissions */
        unsigned short                quotalen;        /* length added to quota */
        unsigned short                datalen;        /* payload data length
                                                 * - may not match RCU dereferenced payload
                                                 * - payload should contain own length
                                                 */
        short                        state;                /* Key state (+) or rejection error (-) */

#ifdef KEY_DEBUGGING
        unsigned                magic;
#define KEY_DEBUG_MAGIC                0x18273645u
#endif

        /* The KEY_FLAG_* values below are bit numbers within flags, not masks. */
        unsigned long                flags;                /* status flags (change with bitops) */
#define KEY_FLAG_DEAD                0        /* set if key type has been deleted */
#define KEY_FLAG_REVOKED        1        /* set if key had been revoked */
#define KEY_FLAG_IN_QUOTA        2        /* set if key consumes quota */
#define KEY_FLAG_USER_CONSTRUCT        3        /* set if key is being constructed in userspace */
#define KEY_FLAG_ROOT_CAN_CLEAR        4        /* set if key can be cleared by root without permission */
#define KEY_FLAG_INVALIDATED        5        /* set if key has been invalidated */
#define KEY_FLAG_BUILTIN        6        /* set if key is built in to the kernel */
#define KEY_FLAG_ROOT_CAN_INVAL        7        /* set if key can be invalidated by root without permission */
#define KEY_FLAG_KEEP                8        /* set if key should not be removed */
#define KEY_FLAG_UID_KEYRING        9        /* set if key is a user or user session keyring */
#define KEY_FLAG_USER_ALIVE        10        /* set if final put has not happened on key yet */

        /* the key type and key description string
         * - the desc is used to match a key against search criteria
         * - it should be a printable string
         * - eg: for krb5 AFS, this might be "afs@REDHAT.COM"
         *
         * The anonymous struct mirrors struct keyring_index_key member by
         * member (len_desc overlays its desc_len/desc word); the two must be
         * kept in step.
         */
        union {
                struct keyring_index_key index_key;
                struct {
                        unsigned long        hash;
                        unsigned long        len_desc;
                        struct key_type        *type;                /* type of key */
                        struct key_tag        *domain_tag;        /* Domain of operation */
                        char                *description;
                };
        };

        /* key data
         * - this is used to hold the data actually used in cryptography or
         *   whatever
         */
        union {
                union key_payload payload;
                struct {
                        /* Keyring bits */
                        struct list_head name_link;
                        struct assoc_array keys;
                };
        };

        /* This is set on a keyring to restrict the addition of a link to a key
         * to it.  If this structure isn't provided then it is assumed that the
         * keyring is open to any addition.  It is ignored for non-keyring
         * keys. Only set this value using keyring_restrict(), keyring_alloc(),
         * or key_alloc().
         *
         * This is intended for use with rings of trusted keys whereby addition
         * to the keyring needs to be controlled.  KEY_ALLOC_BYPASS_RESTRICTION
         * overrides this, allowing the kernel to add extra keys without
         * restriction.
         */
        struct key_restriction *restrict_link;
};

extern struct key *key_alloc(struct key_type *type,
                             const char *desc,
                             kuid_t uid, kgid_t gid,
                             const struct cred *cred,
                             key_perm_t perm,
                             unsigned long flags,
                             struct key_restriction *restrict_link);


/*
 * Key allocation flags.
 * NOTE(review): presumably passed in the flags argument of key_alloc() and
 * keyring_alloc() - confirm.  KEY_ALLOC_IN_QUOTA is the all-zero default;
 * the remaining values are distinct bits.
 */
#define KEY_ALLOC_IN_QUOTA                0x0000        /* add to quota, reject if would overrun */
#define KEY_ALLOC_QUOTA_OVERRUN                0x0001        /* add to quota, permit even if overrun */
#define KEY_ALLOC_NOT_IN_QUOTA                0x0002        /* not in quota */
#define KEY_ALLOC_BUILT_IN                0x0004        /* Key is built into kernel */
#define KEY_ALLOC_BYPASS_RESTRICTION        0x0008        /* Override the check on restricted keyrings */
#define KEY_ALLOC_UID_KEYRING                0x0010        /* allocating a user or user session keyring */
#define KEY_ALLOC_SET_KEEP                0x0020        /* Set the KEEP flag on the key/keyring */

extern void key_revoke(struct key *key);
extern void key_invalidate(struct key *key);
extern void key_put(struct key *key);
extern bool key_put_tag(struct key_tag *tag);
extern void key_remove_domain(struct key_tag *domain_tag);

static inline struct key *__key_get(struct key *key)
{
        refcount_inc(&key->usage);
        return key;
}

/*
 * NULL-tolerant variant of __key_get(): a NULL @key is returned unchanged,
 * anything else gains a reference.
 */
static inline struct key *key_get(struct key *key)
{
        if (!key)
                return key;

        return __key_get(key);
}

/* Drop the reference held through @key_ref (the possession bit is ignored). */
static inline void key_ref_put(key_ref_t key_ref)
{
        struct key *key = key_ref_to_ptr(key_ref);

        key_put(key);
}

extern struct key *request_key_tag(struct key_type *type,
                                   const char *description,
                                   struct key_tag *domain_tag,
                                   const char *callout_info);

extern struct key *request_key_rcu(struct key_type *type,
                                   const char *description,
                                   struct key_tag *domain_tag);

extern struct key *request_key_with_auxdata(struct key_type *type,
                                            const char *description,
                                            struct key_tag *domain_tag,
                                            const void *callout_info,
                                            size_t callout_len,
                                            void *aux);

/**
 * request_key - Request a key and wait for construction
 * @type: Type of key.
 * @description: The searchable description of the key.
 * @callout_info: The data to pass to the instantiation upcall (or NULL).
 *
 * Convenience wrapper around request_key_tag() that passes a NULL domain
 * tag, i.e. the default global domain.
 */
static inline struct key *request_key(struct key_type *type,
                                      const char *description,
                                      const char *callout_info)
{
        struct key_tag *global_domain = NULL;

        return request_key_tag(type, description, global_domain,
                               callout_info);
}

#ifdef CONFIG_NET
/**
 * request_key_net - Request a key for a net namespace and wait for construction
 * @type: Type of key.
 * @description: The searchable description of the key.
 * @net: The network namespace that is the key's domain of operation.
 * @callout_info: The data to pass to the instantiation upcall (or NULL).
 *
 * As for request_key() except that it does not add the returned key to a
 * keyring if found, new keys are always allocated in the user's quota, the
 * callout_info must be a NUL-terminated string and no auxiliary data can be
 * passed.  Only keys that operate the specified network namespace are used.
 *
 * Furthermore, it then works as wait_for_key_construction() to wait for the
 * completion of keys undergoing construction with a non-interruptible wait.
 *
 * This is a macro: @net is parenthesised before ->key_domain is applied, so
 * any pointer-valued expression (e.g. &ns or base + i) may be passed.
 */
#define request_key_net(type, description, net, callout_info) \
        request_key_tag(type, description, (net)->key_domain, callout_info)

/**
 * request_key_net_rcu - Request a key for a net namespace under RCU conditions
 * @type: Type of key.
 * @description: The searchable description of the key.
 * @net: The network namespace that is the key's domain of operation.
 *
 * As for request_key_rcu() except that only keys that operate the specified
 * network namespace are used.
 *
 * This is a macro: @net is parenthesised before ->key_domain is applied, so
 * any pointer-valued expression (e.g. &ns or base + i) may be passed.
 */
#define request_key_net_rcu(type, description, net) \
        request_key_rcu(type, description, (net)->key_domain)
#endif /* CONFIG_NET */

extern int wait_for_key_construction(struct key *key, bool intr);

extern int key_validate(const struct key *key);

extern key_ref_t key_create(key_ref_t keyring,
                            const char *type,
                            const char *description,
                            const void *payload,
                            size_t plen,
                            key_perm_t perm,
                            unsigned long flags);

extern key_ref_t key_create_or_update(key_ref_t keyring,
                                      const char *type,
                                      const char *description,
                                      const void *payload,
                                      size_t plen,
                                      key_perm_t perm,
                                      unsigned long flags);

extern int key_update(key_ref_t key,
                      const void *payload,
                      size_t plen);

extern int key_link(struct key *keyring,
                    struct key *key);

extern int key_move(struct key *key,
                    struct key *from_keyring,
                    struct key *to_keyring,
                    unsigned int flags);

extern int key_unlink(struct key *keyring,
                      struct key *key);

extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
                                 const struct cred *cred,
                                 key_perm_t perm,
                                 unsigned long flags,
                                 struct key_restriction *restrict_link,
                                 struct key *dest);

extern int restrict_link_reject(struct key *keyring,
                                const struct key_type *type,
                                const union key_payload *payload,
                                struct key *restriction_key);

extern int keyring_clear(struct key *keyring);

extern key_ref_t keyring_search(key_ref_t keyring,
                                struct key_type *type,
                                const char *description,
                                bool recurse);

extern int keyring_restrict(key_ref_t keyring, const char *type,
                            const char *restriction);

extern struct key *key_lookup(key_serial_t id);

/* Serial number of @key, or 0 when @key is NULL. */
static inline key_serial_t key_serial(const struct key *key)
{
        if (!key)
                return 0;

        return key->serial;
}

extern void key_set_timeout(struct key *, unsigned);

extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags,
                                 enum key_need_perm need_perm);
extern void key_free_user_ns(struct user_namespace *);

/*
 * Read @key->state with acquire ordering and hand the value back.
 */
static inline short key_read_state(const struct key *key)
{
        short state;

        /* Barrier versus mark_key_instantiated(). */
        state = smp_load_acquire(&key->state);

        return state;
}

/**
 * key_is_positive - Determine if a key has been positively instantiated
 * @key: The key to check.
 *
 * The state is sampled once through key_read_state().
 *
 * Return: true if the sampled state equals KEY_IS_POSITIVE, false otherwise.
 */
static inline bool key_is_positive(const struct key *key)
{
        short state = key_read_state(key);

        return state == KEY_IS_POSITIVE;
}

/**
 * key_is_negative - Determine if a key's state is negative
 * @key: The key to check.
 *
 * The state is sampled once through key_read_state().
 *
 * Return: true if the sampled state is below zero, false otherwise.
 */
static inline bool key_is_negative(const struct key *key)
{
        short state = key_read_state(key);

        return state < 0;
}

/*
 * Fetch (KEY)->payload.rcu_data0 through rcu_dereference().
 */
#define dereference_key_rcu(KEY)                                        \
        (rcu_dereference((KEY)->payload.rcu_data0))

/*
 * Fetch (KEY)->payload.rcu_data0 through rcu_dereference_protected(), using
 * "the key's ->sem is locked" as the protection condition.  KEY is cast to
 * struct key * only for the rwsem_is_locked() check.
 */
#define dereference_key_locked(KEY)                                        \
        (rcu_dereference_protected((KEY)->payload.rcu_data0,                \
                                   rwsem_is_locked(&((struct key *)(KEY))->sem)))

/*
 * Publish PAYLOAD into (KEY)->payload.rcu_data0 via rcu_assign_pointer().
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define rcu_assign_keypointer(KEY, PAYLOAD)                                \
do {                                                                        \
        rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD));        \
} while (0)

/*
 * the userspace interface
 */
extern int install_thread_keyring_to_cred(struct cred *cred);
extern void key_fsuid_changed(struct cred *new_cred);
extern void key_fsgid_changed(struct cred *new_cred);
extern void key_init(void);

#else /* CONFIG_KEYS */

/*
 * Stubs for the !CONFIG_KEYS branch: each key operation becomes a macro
 * that either does nothing or evaluates to a constant (0 or NULL).
 * None of these macros evaluates its arguments.
 */
#define key_validate(k)                        0
#define key_serial(k)                        0
#define key_get(k)                         ({ NULL; })
#define key_revoke(k)                        do { } while(0)
#define key_invalidate(k)                do { } while(0)
#define key_put(k)                        do { } while(0)
#define key_ref_put(k)                        do { } while(0)
#define make_key_ref(k, p)                NULL
#define key_ref_to_ptr(k)                NULL
#define is_key_possessed(k)                0
#define key_fsuid_changed(c)                do { } while(0)
#define key_fsgid_changed(c)                do { } while(0)
#define key_init()                        do { } while(0)
#define key_free_user_ns(ns)                do { } while(0)
#define key_remove_domain(d)                do { } while(0)
#define key_lookup(k)                        NULL

#endif /* CONFIG_KEYS */
#endif /* __KERNEL__ */
#endif /* _LINUX_KEY_H */

























  300 

  305 





































  304 


























  304 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 *  bit-based spin_lock()
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 *
 * @bitnum: index of the bit within *@addr that acts as the lock.
 * @addr:   word containing the lock bit.
 *
 * Returns with preemption disabled.  When neither CONFIG_SMP nor
 * CONFIG_DEBUG_SPINLOCK is defined the bit is never touched and disabling
 * preemption is all this function does.
 */
static __always_inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
        /*
         * Assuming the lock is uncontended, this never enters
         * the body of the outer loop. If it is contended, then
         * within the inner loop a non-atomic test is used to
         * busywait with less bus contention for a good time to
         * attempt to acquire the lock bit.
         */
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Allow preemption while waiting for the bit to clear. */
                preempt_enable();
                do {
                        cpu_relax();
                } while (test_bit(bitnum, addr));
                /* Disable preemption again before retrying the atomic set. */
                preempt_disable();
        }
#endif
        __acquire(bitlock);
}

/*
 * Single attempt to take the bit lock; never spins.
 *
 * @bitnum: index of the bit within *@addr that acts as the lock.
 * @addr:   word containing the lock bit.
 *
 * Return true if it was acquired: 1 with preemption left disabled, or 0
 * with preemption re-enabled when the bit was already set.  When neither
 * CONFIG_SMP nor CONFIG_DEBUG_SPINLOCK is defined the bit is not examined
 * and the result is always 1.
 */
static __always_inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Already held: undo the preempt_disable() and report failure. */
                preempt_enable();
                return 0;
        }
#endif
        __acquire(bitlock);
        return 1;
}

/*
 *  bit-based spin_unlock()
 *
 * @bitnum: index of the bit within *@addr that acts as the lock.
 * @addr:   word containing the lock bit.
 *
 * Clears the lock bit with clear_bit_unlock() (only when CONFIG_SMP or
 * CONFIG_DEBUG_SPINLOCK is defined) and then re-enables preemption.
 */
static __always_inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        /* Unlocking a bit that is not set is treated as a bug. */
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 *  bit-based spin_unlock()
 *  non-atomic version, which can be used eg. if the bit lock itself is
 *  protecting the rest of the flags in the word.
 *
 * @bitnum: index of the bit within *@addr that acts as the lock.
 * @addr:   word containing the lock bit.
 *
 * Same sequence as bit_spin_unlock(), except that the bit is cleared with
 * __clear_bit_unlock() instead of clear_bit_unlock().
 */
static __always_inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        /* Unlocking a bit that is not set is treated as a bug. */
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        __clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 * Return true if the lock is held.
 *
 * @bitnum: index of the bit within *@addr that acts as the lock.
 * @addr:   word containing the lock bit.
 *
 * The answer comes from a different source depending on configuration:
 *  - CONFIG_SMP or CONFIG_DEBUG_SPINLOCK: the lock bit itself is tested;
 *  - otherwise, with CONFIG_PREEMPT_COUNT: preempt_count() is returned,
 *    because in that configuration the lock functions only disable
 *    preemption and never set the bit;
 *  - otherwise: always 1.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
        return preempt_count();
#else
        return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */

























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Queued spinlock
 *
 * A 'generic' spinlock implementation that is based on MCS locks. For an
 * architecture that's looking for a 'generic' spinlock, please first consider
 * ticket-lock.h and only come looking here when you've considered all the
 * constraints below and can show your hardware does actually perform better
 * with qspinlock.
 *
 * qspinlock relies on atomic_*_release()/atomic_*_acquire() to be RCsc (or no
 * weaker than RCtso if you're power), where regular code only expects atomic_t
 * to be RCpc.
 *
 * qspinlock relies on a far greater (compared to asm-generic/spinlock.h) set
 * of atomic operations to behave well together, please audit them carefully to
 * ensure they all have forward progress. Many atomic operations may default to
 * cmpxchg() loops which will not have good forward progress properties on
 * LL/SC architectures.
 *
 * One notable example is atomic_fetch_or_acquire(), which x86 cannot (cheaply)
 * do. Carefully read the patches that introduced
 * queued_fetch_set_pending_acquire().
 *
 * qspinlock also heavily relies on mixed size atomic operations, in specific
 * it requires architectures to have xchg16; something which many LL/SC
 * architectures need to implement as a 32bit and+or in order to satisfy the
 * forward progress guarantees mentioned above.
 *
 * Further reading on mixed size atomics that might be relevant:
 *
 *   http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 *
 * Authors: Waiman Long <waiman.long@hpe.com>
 */
#ifndef __ASM_GENERIC_QSPINLOCK_H
#define __ASM_GENERIC_QSPINLOCK_H

#include <asm-generic/qspinlock_types.h>
#include <linux/atomic.h>

#ifndef queued_spin_is_locked
/**
 * queued_spin_is_locked - report whether the queued spinlock is in use
 * @lock: Pointer to queued spinlock structure
 * Return: the current lock word; nonzero means locked, 0 means unlocked
 */
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
        int word;

        /*
         * Every nonzero value of the lock word counts as "locked", even
         * when _Q_LOCKED_VAL is not immediately observable in it.
         */
        word = atomic_read(&lock->val);

        return word;
}
#endif

/**
 * queued_spin_value_unlocked - is the spinlock structure unlocked?
 * @lock: queued spinlock structure (passed by value)
 * Return: 1 if it is unlocked, 0 otherwise
 *
 * N.B. Whenever there are tasks waiting for the lock, it is considered
 *      locked wrt the lockref code to avoid lock stealing by the lockref
 *      code and change things underneath the lock. This also allows some
 *      optimizations to be applied without conflict with lockref.
 */
static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
{
        /* Unlocked only when the whole lock word is zero. */
        return lock.val.counter == 0;
}

/**
 * queued_spin_is_contended - check if the lock is contended
 * @lock : Pointer to queued spinlock structure
 * Return: nonzero if any bit outside _Q_LOCKED_MASK is set in the lock
 *         word, 0 otherwise
 */
static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
{
        int word = atomic_read(&lock->val);

        return word & ~_Q_LOCKED_MASK;
}
/**
 * queued_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 *
 * The lock word is read first; the acquire cmpxchg to _Q_LOCKED_VAL is
 * only attempted when that read observed zero.
 */
static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
        int old = atomic_read(&lock->val);

        if (likely(!old))
                return likely(atomic_try_cmpxchg_acquire(&lock->val, &old, _Q_LOCKED_VAL));

        return 0;
}

extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);

#ifndef queued_spin_lock
/**
 * queued_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Fast path: one acquire cmpxchg from 0 to _Q_LOCKED_VAL.  If that fails,
 * the lock word value it observed is passed to queued_spin_lock_slowpath().
 */
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
        int seen = 0;

        if (unlikely(!atomic_try_cmpxchg_acquire(&lock->val, &seen, _Q_LOCKED_VAL)))
                queued_spin_lock_slowpath(lock, seen);
}
#endif

#ifndef queued_spin_unlock
/**
 * queued_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 *
 * Only the ->locked field is written; the rest of the lock word is left
 * as it is.
 */
static __always_inline void queued_spin_unlock(struct qspinlock *lock)
{
        /*
         * unlock() needs release semantics:
         */
        smp_store_release(&lock->locked, 0);
}
#endif

#ifndef virt_spin_lock
/*
 * Default used when nothing has defined virt_spin_lock beforehand:
 * ignores @lock and always returns false.
 */
static __always_inline bool virt_spin_lock(struct qspinlock *lock)
{
        return false;
}
#endif

#ifndef __no_arch_spinlock_redefine
/*
 * Remapping spinlock architecture specific functions to the corresponding
 * queued spinlock functions.
 *
 * Defining __no_arch_spinlock_redefine before this point suppresses the
 * whole set of arch_spin_*() aliases below.
 */
#define arch_spin_is_locked(l)                queued_spin_is_locked(l)
#define arch_spin_is_contended(l)        queued_spin_is_contended(l)
#define arch_spin_value_unlocked(l)        queued_spin_value_unlocked(l)
#define arch_spin_lock(l)                queued_spin_lock(l)
#define arch_spin_trylock(l)                queued_spin_trylock(l)
#define arch_spin_unlock(l)                queued_spin_unlock(l)
#endif

#endif /* __ASM_GENERIC_QSPINLOCK_H */


































































































































































































































































































































































































































































   39 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright (C) IBM Corporation, 2006
 * Copyright (C) Fujitsu, 2012
 *
 * Author: Paul McKenney <paulmck@linux.ibm.com>
 *           Lai Jiangshan <laijs@cn.fujitsu.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                Documentation/RCU/ *.txt
 *
 */

#ifndef _LINUX_SRCU_H
#define _LINUX_SRCU_H

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/rcu_segcblist.h>

struct srcu_struct;

#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
                       struct lock_class_key *key);

/*
 * With CONFIG_DEBUG_LOCK_ALLOC, init_srcu_struct() is a macro: every
 * expansion gets its own static lock_class_key, and the stringified
 * argument expression is passed as the name to __init_srcu_struct().
 * The macro evaluates to the value returned by __init_srcu_struct().
 */
#define init_srcu_struct(ssp) \
({ \
        static struct lock_class_key __srcu_key; \
        \
        __init_srcu_struct((ssp), #ssp, &__srcu_key); \
})

/* Initializer fragment that sets .dep_map.name to the stringified srcu_name. */
#define __SRCU_DEP_MAP_INIT(srcu_name)        .dep_map = { .name = #srcu_name },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

int init_srcu_struct(struct srcu_struct *ssp);

/* Expands to nothing when CONFIG_DEBUG_LOCK_ALLOC is not set. */
#define __SRCU_DEP_MAP_INIT(srcu_name)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture.
 * Each flavor occupies its own bit, so flavors can be combined with bitwise
 * OR, as SRCU_READ_FLAVOR_ALL does.
 */
#define SRCU_READ_FLAVOR_NORMAL        0x1                // srcu_read_lock().
#define SRCU_READ_FLAVOR_NMI        0x2                // srcu_read_lock_nmisafe().
//                                0x4                // SRCU-lite is no longer with us.
#define SRCU_READ_FLAVOR_FAST        0x8                // srcu_read_lock_fast().
#define SRCU_READ_FLAVOR_ALL   (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \
                                SRCU_READ_FLAVOR_FAST) // All of the above.
#define SRCU_READ_FLAVOR_SLOWGP        SRCU_READ_FLAVOR_FAST
                                                // Flavors requiring synchronize_rcu()
                                                // instead of smp_mb().
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);

#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
#else
#error "Unknown SRCU implementation specified to kernel configuration"
#endif

void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
                void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
void synchronize_srcu(struct srcu_struct *ssp);

/* Cookie value handed out by get_completed_synchronize_srcu(). */
#define SRCU_GET_STATE_COMPLETED 0x1

/**
 * get_completed_synchronize_srcu - Return a pre-completed polled state cookie
 *
 * Returns SRCU_GET_STATE_COMPLETED, a value that
 * poll_state_synchronize_srcu() will always treat as a cookie whose grace
 * period has already completed.
 */
static inline unsigned long get_completed_synchronize_srcu(void)
{
        const unsigned long completed_cookie = SRCU_GET_STATE_COMPLETED;

        return completed_cookie;
}

unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);

// Maximum number of unsigned long values corresponding to
// not-yet-completed SRCU grace periods.
#define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2

/**
 * same_state_synchronize_srcu - Are two old-state values identical?
 * @oldstate1: First old-state value.
 * @oldstate2: Second old-state value.
 *
 * Both old-state values must come from get_state_synchronize_srcu(),
 * start_poll_synchronize_srcu(), or get_completed_synchronize_srcu().
 * Returns @true if the two values are identical and @false otherwise.
 * This allows structures whose lifetimes are tracked by old-state values
 * to push these values to a list header, allowing those structures to be
 * slightly smaller.
 */
static inline bool same_state_synchronize_srcu(unsigned long oldstate1, unsigned long oldstate2)
{
        const bool differ = oldstate1 != oldstate2;

        return !differ;
}

/*
 * With CONFIG_NEED_SRCU_NMI_SAFE the NMI-safe primitives are only declared
 * here.  Without it they are thin inline wrappers that forward to
 * __srcu_read_lock() and __srcu_read_unlock() respectively.
 */
#ifdef CONFIG_NEED_SRCU_NMI_SAFE
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp);
#else
static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
        return __srcu_read_lock(ssp);
}
static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
        __srcu_read_unlock(ssp, idx);
}
#endif /* CONFIG_NEED_SRCU_NMI_SAFE */

void srcu_init(void);

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/**
 * srcu_read_lock_held - might we be in SRCU read-side critical section?
 * @ssp: The srcu_struct structure to check
 *
 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU
 * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
 * this assumes we are in an SRCU read-side critical section unless it can
 * prove otherwise.
 *
 * Returns 1 unconditionally while debug_lockdep_rcu_enabled() is false,
 * which prevents false positives during boot and while lockdep is
 * disabled.  Otherwise the answer is lock_is_held() on @ssp->dep_map.
 *
 * Note that SRCU is based on its own state machine and does not rely on
 * normal RCU, so this can be called from a CPU that is in the idle loop
 * from an RCU point of view, or offline.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        return debug_lockdep_rcu_enabled() ? lock_is_held(&ssp->dep_map) : 1;
}

/*
 * Annotations provide deadlock detection for SRCU.
 *
 * Similar to other lockdep annotations, except there is an additional
 * srcu_lock_sync(), which is basically an empty *write*-side critical section,
 * see lock_sync() for more information.
 */

/* Annotates a srcu_read_lock(): records a read acquisition of @map. */
static inline void srcu_lock_acquire(struct lockdep_map *map)
{
        lock_map_acquire_read(map);
}

/* Annotates a srcu_read_unlock(): records the release of @map. */
static inline void srcu_lock_release(struct lockdep_map *map)
{
        lock_map_release(map);
}

/* Annotates a synchronize_srcu() by forwarding @map to lock_map_sync(). */
static inline void srcu_lock_sync(struct lockdep_map *map)
{
        lock_map_sync(map);
}

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC nothing tracks readers, so always
 * report "held".
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        return 1;
}

/* The lockdep annotations are no-ops that do not evaluate their argument. */
#define srcu_lock_acquire(m) do { } while (0)
#define srcu_lock_release(m) do { } while (0)
#define srcu_lock_sync(m) do { } while (0)

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */


/**
 * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 * @c: condition to check for update-side use
 *
 * If PROVE_RCU is enabled, invoking this outside of an SRCU read-side
 * critical section will result in an RCU-lockdep splat, unless @c evaluates
 * to 1.  The @c argument will normally be a logical expression containing
 * lockdep_is_held() calls.
 *
 * The condition handed to __rcu_dereference_check() is
 * "(c) || srcu_read_lock_held(ssp)", so srcu_read_lock_held() is only
 * evaluated when @c is false.
 */
#define srcu_dereference_check(p, ssp, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || srcu_read_lock_held(ssp), __rcu)

/**
 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Makes srcu_dereference_check() do the dirty work, passing 0 as the
 * update-side condition.  If PROVE_RCU is enabled, invoking this outside
 * of an SRCU read-side critical section will result in an RCU-lockdep
 * splat.
 */
#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0)

/**
 * srcu_dereference_notrace - no tracing and no lockdep calls from here
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Passes 1 as the condition to srcu_dereference_check(), so the
 * "(c) || srcu_read_lock_held(ssp)" test there is satisfied without
 * calling srcu_read_lock_held().
 */
#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1)

/**
 * srcu_read_lock - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section.  Note that SRCU read-side
 * critical sections may be nested.  However, it is illegal to
 * call anything that waits on an SRCU grace period for the same
 * srcu_struct, whether directly or indirectly.  Please note that
 * one way to indirectly wait on an SRCU grace period is to acquire
 * a mutex that is held elsewhere while calling synchronize_srcu() or
 * synchronize_srcu_expedited().
 *
 * The return value from srcu_read_lock() is guaranteed to be
 * non-negative.  This value must be passed unaltered to the matching
 * srcu_read_unlock().  Note that srcu_read_lock() and the matching
 * srcu_read_unlock() must occur in the same context, for example, it is
 * illegal to invoke srcu_read_unlock() in an irq handler if the matching
 * srcu_read_lock() was invoked in process context.  Or, for that matter to
 * invoke srcu_read_unlock() from one task and the matching srcu_read_lock()
 * from another.
 */
static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        /* Keep the reader index so the caller can hand it to the unlock. */
        idx = __srcu_read_lock(ssp);
        /* The lockdep annotation comes after the reader is registered. */
        srcu_lock_acquire(&ssp->dep_map);

        return idx;
}

/**
 * srcu_read_lock_fast - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section, but for a light-weight
 * smp_mb()-free reader.  See srcu_read_lock() for more information.
 *
 * If srcu_read_lock_fast() is ever used on an srcu_struct structure,
 * then none of the other flavors may be used, whether before, during,
 * or after.  Note that grace-period auto-expediting is disabled for _fast
 * srcu_struct structures because auto-expedited grace periods invoke
 * synchronize_rcu_expedited(), IPIs and all.
 *
 * Note that srcu_read_lock_fast() can be invoked only from those contexts
 * where RCU is watching, that is, from contexts where it would be legal
 * to invoke rcu_read_lock().  Otherwise, lockdep will complain.
 */
static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp)
{
        struct srcu_ctr __percpu *scp;

        RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast().");
        srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
        /* Keep the counter pointer so the caller can hand it to the unlock. */
        scp = __srcu_read_lock_fast(ssp);
        rcu_try_lock_acquire(&ssp->dep_map);

        return scp;
}

/*
 * Variant of srcu_read_lock_fast() for use by tracing: it cannot be traced
 * and cannot call lockdep, so it performs neither the rcu_is_watching()
 * check nor the lockdep acquire annotation.
 * See srcu_read_lock_fast() for more information.
 */
static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_struct *ssp)
        __acquires(ssp)
{
        srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);

        return __srcu_read_lock_fast(ssp);
}

/**
 * srcu_down_read_fast - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter a semaphore-like SRCU read-side critical section, but for
 * a light-weight smp_mb()-free reader.  See srcu_read_lock_fast() and
 * srcu_down_read() for more information.
 *
 * The same srcu_struct may be used concurrently by srcu_down_read_fast()
 * and srcu_read_lock_fast().
 */
static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp)
{
        struct srcu_ctr __percpu *scp;

        /* The NMI check is only active when CONFIG_PROVE_RCU is enabled. */
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast().");
        srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
        scp = __srcu_read_lock_fast(ssp);

        return scp;
}

/**
 * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section, but in an NMI-safe manner.
 * See srcu_read_lock() for more information.
 *
 * If srcu_read_lock_nmisafe() is ever used on an srcu_struct structure,
 * then none of the other flavors may be used, whether before, during,
 * or after.
 */
static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI);
        /* Keep the reader index so the caller can hand it to the unlock. */
        idx = __srcu_read_lock_nmisafe(ssp);
        rcu_try_lock_acquire(&ssp->dep_map);

        return idx;
}

/*
 * Used by tracing, cannot be traced and cannot invoke lockdep: performs
 * the same flavor check and __srcu_read_lock() as srcu_read_lock(), minus
 * the lockdep acquire annotation.
 */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);

        return __srcu_read_lock(ssp);
}

/**
 * srcu_down_read - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter a semaphore-like SRCU read-side critical section.  Note that
 * SRCU read-side critical sections may be nested.  However, it is
 * illegal to call anything that waits on an SRCU grace period for the
 * same srcu_struct, whether directly or indirectly.  Please note that
 * one way to indirectly wait on an SRCU grace period is to acquire
 * a mutex that is held elsewhere while calling synchronize_srcu() or
 * synchronize_srcu_expedited().  But if you want lockdep to help you
 * keep this stuff straight, you should instead use srcu_read_lock().
 *
 * The semaphore-like nature of srcu_down_read() means that the matching
 * srcu_up_read() can be invoked from some other context, for example,
 * from some other task or from an irq handler.  However, neither
 * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler.
 *
 * Calls to srcu_down_read() may be nested, similar to the manner in
 * which calls to down_read() may be nested.  The same srcu_struct may be
 * used concurrently by srcu_down_read() and srcu_read_lock().
 */
static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        /* Not usable from NMI handlers. */
        WARN_ON_ONCE(in_nmi());
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        idx = __srcu_read_lock(ssp);

        return idx;
}

/**
 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock().
 *
 * Exit an SRCU read-side critical section.
 */
static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn (once) if any bit other than bit 0 is set in @idx. */
        WARN_ON_ONCE(idx & ~0x1);
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        /* Drop the lockdep annotation before the reader is unregistered. */
        srcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock(ssp, idx);
}

/**
 * srcu_read_unlock_fast - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @scp: return value from corresponding srcu_read_lock_fast().
 *
 * Exit a light-weight SRCU read-side critical section.
 */
static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
        __releases(ssp)
{
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
        /* Drop the lockdep annotation before the reader is unregistered. */
        srcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock_fast(ssp, scp);
        /* The "RCU is watching" check runs after the unlock itself. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast().");
}

/*
 * Used by tracing, cannot be traced and cannot call lockdep.
 * See srcu_read_unlock_fast() for more information.
 *
 * Compared with srcu_read_unlock_fast(), this omits the lockdep release
 * annotation and the rcu_is_watching() check.
 */
static inline void srcu_read_unlock_fast_notrace(struct srcu_struct *ssp,
                                                 struct srcu_ctr __percpu *scp) __releases(ssp)
{
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
        __srcu_read_unlock_fast(ssp, scp);
}

/**
 * srcu_up_read_fast - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @scp: return value from corresponding srcu_down_read_fast().
 *
 * Exit an SRCU read-side critical section, but not necessarily from
 * the same context as the matching srcu_down_read_fast().
 */
static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
        __releases(ssp)
{
        /* The NMI check is only active when CONFIG_PROVE_RCU is enabled. */
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
        __srcu_read_unlock_fast(ssp, scp);
        /* The "RCU is watching" check runs after the unlock itself. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast().");
}

/**
 * srcu_read_unlock_nmisafe - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock_nmisafe().
 *
 * Exit an SRCU read-side critical section, but in an NMI-safe manner.
 */
static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn (once) if any bit other than bit 0 is set in @idx. */
        WARN_ON_ONCE(idx & ~0x1);
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI);
        /* Drop the lockdep annotation before the reader is unregistered. */
        rcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock_nmisafe(ssp, idx);
}

/*
 * Used by tracing, cannot be traced and cannot call lockdep.
 *
 * Compared with srcu_read_unlock(), this omits the @idx sanity check and
 * the lockdep release annotation.
 */
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        __srcu_read_unlock(ssp, idx);
}

/**
 * srcu_up_read - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_down_read().
 *
 * Exit an SRCU read-side critical section, but not necessarily from
 * the same context as the matching srcu_down_read().
 */
static inline void srcu_up_read(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn (once) if any bit other than bit 0 is set in @idx. */
        WARN_ON_ONCE(idx & ~0x1);
        /* Not usable from NMI handlers. */
        WARN_ON_ONCE(in_nmi());
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        __srcu_read_unlock(ssp, idx);
}

/**
 * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
 *
 * Converts the preceding srcu_read_unlock into a two-way memory barrier.
 *
 * Call this after srcu_read_unlock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
 * the preceding srcu_read_unlock.
 *
 * The body is intentionally empty; see the comment inside.
 */
static inline void smp_mb__after_srcu_read_unlock(void)
{
        /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}

/**
 * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock()
 *
 * Converts the preceding srcu_read_lock() into a two-way memory barrier.
 *
 * Call this after srcu_read_lock() to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_lock() will appear to happen
 * after the preceding srcu_read_lock().
 *
 * The body is intentionally empty, for the reason given in the comment
 * inside the function.
 */
static inline void smp_mb__after_srcu_read_lock(void)
{
        /* __srcu_read_lock() has smp_mb() internally so nothing to do here. */
}

/*
 * Lock-guard definitions for SRCU read-side critical sections, one per
 * lock/unlock pair named below.  In each definition the acquire expression
 * stores the value returned by the lock primitive in the guard's extra
 * member (idx or scp), and the release expression hands that saved value
 * back to the matching unlock primitive together with the srcu_struct.
 *
 * NOTE(review): DEFINE_LOCK_GUARD_1() is defined outside this file
 * (presumably <linux/cleanup.h>); confirm the generated guard types and
 * their usage there.
 */

/* Guard pairing srcu_read_lock() with srcu_read_unlock(). */
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
                    _T->idx = srcu_read_lock(_T->lock),
                    srcu_read_unlock(_T->lock, _T->idx),
                    int idx)

/* Guard pairing srcu_read_lock_fast() with srcu_read_unlock_fast(). */
DEFINE_LOCK_GUARD_1(srcu_fast, struct srcu_struct,
                    _T->scp = srcu_read_lock_fast(_T->lock),
                    srcu_read_unlock_fast(_T->lock, _T->scp),
                    struct srcu_ctr __percpu *scp)

/* Same as srcu_fast, but using the _notrace lock/unlock variants. */
DEFINE_LOCK_GUARD_1(srcu_fast_notrace, struct srcu_struct,
                    _T->scp = srcu_read_lock_fast_notrace(_T->lock),
                    srcu_read_unlock_fast_notrace(_T->lock, _T->scp),
                    struct srcu_ctr __percpu *scp)

#endif















































































































































































































































































































   39 













   39 












   39 


   39 
   39 



   39 







   39 






















































































































































































   39 
   39 




























































   39 



















































































































































































































































































































































































































































































































































































































































































































































































   39 

























   39 



   39 

   39 









   39 





















   39 


























   39 













   39 








   39 
   39 













   39 



























   39 


   39 
   39 

















































































































































































































































































































































































   39 


















   39 
   39 


















   39 












   39 












   39 



















   38 


















   39 

























































































   39 

   39 



   39 






































   39 



   39 











   39 






















   39 



   39 
   39 





   39 







   39 



   39 
   39 





   39 





   39 





   39 




   39 
   39 



   39 





   39 

















   39 
   39 









   39 







   39 







   39 
   39 

   39 



   38 

   39 


   38 
   39 

























   38 


   39 







   39 


   39 









   38 




   39 

   39 

   39 











































   39 
   38 











   39 




    2 



   39 




   39 

   39 


   39 














   39 

   39 


   39 





    2 



   39 





   39 









































































































































































































































































































































































































   39 





   39 

   39 
   39 











   38 

































































































   39 

















   39 






   39 







   39 


   39 







   39 
   39 
   39 





















































   39 











   37 


   39 

   39 






























   39 
























































   39 












   39 


   39 
   39 









   39 

   39 
   39 


   39 




   39 
   39 






   39 
   39 


   39 


   39 

   37 





   39 


   39 
   39 





























   39 


   39 








   39 
























   39 



   39 
   39 


   39 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   39 
















   39 


   39 














   39 


































    2 






























































































































































































































































































































































   38 



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/printk.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 * Modified to make sys_syslog() more flexible: added commands to
 * return the last 4k of kernel messages, regardless of whether
 * they've been read or not.  Added option to suppress kernel printk's
 * to the console.  Added hook for sending the console messages
 * elsewhere, in preparation for a serial line console (someday).
 * Ted Ts'o, 2/11/93.
 * Modified for sysctl support, 1/8/97, Chris Horn.
 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
 *     manfred@colorfullife.com
 * Rewrote bits to get rid of console_lock
 *        01Mar01 Andrew Morton
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/syscore_ops.h>
#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
#include <linux/cpu.h>
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/panic.h>

#include <linux/uaccess.h>
#include <asm/sections.h>

#include <trace/events/initcall.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>

#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"

/*
 * Console log level settings. Each slot is addressed through the name given
 * in the comment next to it.
 */
int console_printk[4] = {
        [0] = CONSOLE_LOGLEVEL_DEFAULT,        /* console_loglevel */
        [1] = MESSAGE_LOGLEVEL_DEFAULT,        /* default_message_loglevel */
        [2] = CONSOLE_LOGLEVEL_MIN,        /* minimum_console_loglevel */
        [3] = CONSOLE_LOGLEVEL_DEFAULT,        /* default_console_loglevel */
};
EXPORT_SYMBOL_GPL(console_printk);

atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);

EXPORT_TRACEPOINT_SYMBOL_GPL(console);

/*
 * Low level drivers may need that to know if they can schedule in
 * their unblank() callback or not. So let's export it.
 */
int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);

/*
 * console_mutex protects console_list updates and console->flags updates.
 * The flags are synchronized only for consoles that are registered, i.e.
 * accessible via the console list.
 */
static DEFINE_MUTEX(console_mutex);

/*
 * console_sem protects updates to console->seq
 * and also provides serialization for console printing.
 */
static DEFINE_SEMAPHORE(console_sem, 1);
HLIST_HEAD(console_list);
EXPORT_SYMBOL_GPL(console_list);
DEFINE_STATIC_SRCU(console_srcu);

/*
 * System may need to suppress printk message under certain
 * circumstances, like after kernel panic happens.
 */
int __read_mostly suppress_printk;

#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
        .name = "console_lock"
};

/*
 * Lockdep assertion that the caller holds console_mutex, i.e. the lock taken
 * by console_list_lock(). Only built when CONFIG_LOCKDEP is enabled.
 */
void lockdep_assert_console_list_lock_held(void)
{
        lockdep_assert_held(&console_mutex);
}
EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Report whether the console_srcu read lock is held, as far as
 * srcu_read_lock_held() can tell. Only built when CONFIG_DEBUG_LOCK_ALLOC
 * is enabled.
 */
bool console_srcu_read_lock_is_held(void)
{
        return srcu_read_lock_held(&console_srcu);
}
EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif

/* Bit positions of the /dev/kmsg logging control flags. */
enum devkmsg_log_bits {
        __DEVKMSG_LOG_BIT_ON        = 0,
        __DEVKMSG_LOG_BIT_OFF        = 1,
        __DEVKMSG_LOG_BIT_LOCK        = 2,
};

/* Mask values stored in devkmsg_log, one per bit of enum devkmsg_log_bits. */
enum devkmsg_log_masks {
        /* Chosen by "on": devkmsg_write() skips the ratelimit check. */
        DEVKMSG_LOG_MASK_ON             = BIT(__DEVKMSG_LOG_BIT_ON),
        /* Chosen by "off": devkmsg_open() fails and devkmsg_write() drops data. */
        DEVKMSG_LOG_MASK_OFF            = BIT(__DEVKMSG_LOG_BIT_OFF),
        /* Set by the command line handler: later sysctl writes are refused. */
        DEVKMSG_LOG_MASK_LOCK           = BIT(__DEVKMSG_LOG_BIT_LOCK),
};

/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
#define DEVKMSG_LOG_MASK_DEFAULT        0

static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;

/*
 * Parse a devkmsg mode keyword at the start of @str and store the matching
 * mask in devkmsg_log.
 *
 * Only the prefix of @str is examined; callers that care about trailing
 * characters must compare the returned length themselves.
 *
 * Return: length of the matched keyword, or -EINVAL if @str is NULL or does
 * not start with a known keyword.
 */
static int __control_devkmsg(char *str)
{
        static const struct {
                const char        *keyword;
                unsigned int        mask;
        } modes[] = {
                { "on",                DEVKMSG_LOG_MASK_ON },
                { "off",        DEVKMSG_LOG_MASK_OFF },
                { "ratelimit",        DEVKMSG_LOG_MASK_DEFAULT },
        };
        size_t i;

        if (!str)
                return -EINVAL;

        for (i = 0; i < ARRAY_SIZE(modes); i++) {
                size_t matched = str_has_prefix(str, modes[i].keyword);

                if (matched) {
                        devkmsg_log = modes[i].mask;
                        return matched;
                }
        }

        return -EINVAL;
}

/*
 * Handler for the "printk.devkmsg=" kernel command line parameter.
 *
 * A valid value selects the /dev/kmsg logging mode and locks it, so that it
 * can no longer be changed through sysctl. An invalid value only produces a
 * warning. Always returns 1.
 */
static int __init control_devkmsg(char *str)
{
        if (__control_devkmsg(str) < 0) {
                pr_warn("printk.devkmsg: bad option string '%s'\n", str);
                return 1;
        }

        /* Keep the string reported through sysctl in sync with the mode. */
        switch (devkmsg_log) {
        case DEVKMSG_LOG_MASK_ON:
                strscpy(devkmsg_log_str, "on");
                break;
        case DEVKMSG_LOG_MASK_OFF:
                strscpy(devkmsg_log_str, "off");
                break;
        default:
                /* "ratelimit" is already the default string. */
                break;
        }

        /*
         * A setting given on the kernel command line is meant to be
         * permanent for the whole runtime of the system. Lock it as a
         * precaution, so that userspace cannot change it via sysctl later.
         */
        devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;

        return 1;
}
__setup("printk.devkmsg=", control_devkmsg);

char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
/*
 * sysctl handler for the /dev/kmsg logging mode string.
 *
 * Reads are passed straight to proc_dostring(). Writes are refused with
 * -EINVAL once the mode has been locked. Otherwise the new string is
 * accepted only if it is exactly one known keyword followed by a single
 * extra character; on any other input the previous mode and string are
 * restored and -EINVAL is returned.
 */
int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
                              void *buffer, size_t *lenp, loff_t *ppos)
{
        char saved_str[DEVKMSG_STR_MAX_SIZE];
        unsigned int saved_mask;
        int rc;

        if (!write)
                return proc_dostring(table, write, buffer, lenp, ppos);

        if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
                return -EINVAL;

        /* Remember the current state so a rejected write can be undone. */
        saved_mask = devkmsg_log;
        strscpy(saved_str, devkmsg_log_str);

        rc = proc_dostring(table, write, buffer, lenp, ppos);
        if (rc)
                return rc;

        rc = __control_devkmsg(devkmsg_log_str);

        /* Accept only a known keyword with nothing unexpected after it. */
        if (rc >= 0 && rc + 1 == *lenp)
                return 0;

        devkmsg_log = saved_mask;
        strscpy(devkmsg_log_str, saved_str);

        return -EINVAL;
}
#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */

/**
 * console_list_lock - Lock the console list
 *
 * For console list or console->flags updates
 *
 * Takes console_mutex. Must not be called from within a console_srcu
 * read-side critical section; a one-time warning is raised when that can
 * be detected.
 */
void console_list_lock(void)
{
        /*
         * In unregister_console() and console_force_preferred_locked(),
         * synchronize_srcu() is called with the console_list_lock held.
         * Therefore it is not allowed that the console_list_lock is taken
         * with the srcu_lock held.
         *
         * Detecting if this context is really in the read-side critical
         * section is only possible if the appropriate debug options are
         * enabled.
         */
        WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
                     srcu_read_lock_held(&console_srcu));

        mutex_lock(&console_mutex);
}
EXPORT_SYMBOL(console_list_lock);

/**
 * console_list_unlock - Unlock the console list
 *
 * Counterpart to console_list_lock(). Releases console_mutex.
 */
void console_list_unlock(void)
{
        mutex_unlock(&console_mutex);
}
EXPORT_SYMBOL(console_list_unlock);

/**
 * console_srcu_read_lock - Register a new reader for the
 *        SRCU-protected console list
 *
 * Use for_each_console_srcu() to iterate the console list
 *
 * Context: Any context.
 * Return: A cookie to pass to console_srcu_read_unlock().
 */
int console_srcu_read_lock(void)
        __acquires(&console_srcu)
{
        /* The _nmisafe variant matches the one in console_srcu_read_unlock(). */
        return srcu_read_lock_nmisafe(&console_srcu);
}
EXPORT_SYMBOL(console_srcu_read_lock);

/**
 * console_srcu_read_unlock - Unregister an old reader from
 *        the SRCU-protected console list
 * @cookie: cookie returned from console_srcu_read_lock()
 *
 * Counterpart to console_srcu_read_lock()
 */
void console_srcu_read_unlock(int cookie)
        __releases(&console_srcu)
{
        /* The _nmisafe variant matches the one in console_srcu_read_lock(). */
        srcu_read_unlock_nmisafe(&console_srcu, cookie);
}
EXPORT_SYMBOL(console_srcu_read_unlock);

/*
 * Helper macros to handle lockdep when locking/unlocking console_sem. We use
 * macros instead of functions so that _RET_IP_ contains useful information.
 *
 * down_console_sem() acquires console_sem with down() and then records the
 * acquisition in console_lock_dep_map.
 */
#define down_console_sem() do { \
        down(&console_sem);\
        mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
} while (0)

/*
 * Try to take console_sem without blocking.
 *
 * @ip: caller address recorded in the lockdep annotation.
 *
 * Return: 0 if the semaphore was acquired, 1 if it was not.
 */
static int __down_trylock_console_sem(unsigned long ip)
{
        int lock_failed;
        unsigned long flags;

        /*
         * Here and in __up_console_sem() we need to be in safe mode,
         * because spindump/WARN/etc from under console ->lock will
         * deadlock in printk()->down_trylock_console_sem() otherwise.
         */
        printk_safe_enter_irqsave(flags);
        lock_failed = down_trylock(&console_sem);
        printk_safe_exit_irqrestore(flags);

        if (lock_failed)
                return 1;
        /* Only a successful trylock is reported to lockdep. */
        mutex_acquire(&console_lock_dep_map, 0, 1, ip);
        return 0;
}
/* Wrapper that passes the caller's return address as @ip. */
#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)

/*
 * Release console_sem.
 *
 * @ip: caller address recorded in the lockdep annotation.
 *
 * The lockdep release is recorded first; up() itself then runs inside a
 * printk-safe section.
 */
static void __up_console_sem(unsigned long ip)
{
        unsigned long flags;

        mutex_release(&console_lock_dep_map, ip);

        printk_safe_enter_irqsave(flags);
        up(&console_sem);
        printk_safe_exit_irqrestore(flags);
}
/* Wrapper that passes the caller's return address as @ip. */
#define up_console_sem() __up_console_sem(_RET_IP_)

/*
 * This is used for debugging the mess that is the VT code by
 * keeping track if we have the console semaphore held. It's
 * definitely not the perfect debug tool (we don't know if _WE_
 * hold it and are racing, but it helps tracking those weird code
 * paths in the console code where we end up in places I want
 * locked without the console semaphore held).
 */
static int console_locked;

/*
 *        Array of consoles built from command line options (console=)
 */

#define MAX_CMDLINECONSOLES 8

static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];

static int preferred_console = -1;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);

/* Flag: console code may call schedule() */
static int console_may_schedule;

/* Flag values for console_msg_format. */
enum con_msg_format_flags {
        MSG_FORMAT_DEFAULT        = 0x0,
        MSG_FORMAT_SYSLOG        = 0x1,
};

static int console_msg_format = MSG_FORMAT_DEFAULT;

/*
 * The printk log buffer consists of a sequenced collection of records, each
 * containing variable length message text. Every record also contains its
 * own meta-data (@info).
 *
 * Every record meta-data carries the timestamp in microseconds, as well as
 * the standard userspace syslog level and syslog facility. The usual kernel
 * messages use LOG_KERN; userspace-injected messages always carry a matching
 * syslog facility, by default LOG_USER. The origin of every message can be
 * reliably determined that way.
 *
 * The human readable log message of a record is available in @text, the
 * length of the message text in @text_len. The stored message is not
 * terminated.
 *
 * Optionally, a record can carry a dictionary of properties (key/value
 * pairs), to provide userspace with a machine-readable message context.
 *
 * Examples for well-defined, commonly used property names are:
 *   DEVICE=b12:8               device identifier
 *                                b12:8         block dev_t
 *                                c127:3        char dev_t
 *                                n8            netdev ifindex
 *                                +sound:card0  subsystem:devname
 *   SUBSYSTEM=pci              driver-core subsystem name
 *
 * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
 * and values are terminated by a '\0' character.
 *
 * Example of record values:
 *   record.text_buf                = "it's a line" (unterminated)
 *   record.info.seq                = 56
 *   record.info.ts_nsec            = 36863
 *   record.info.text_len           = 11
 *   record.info.facility           = 0 (LOG_KERN)
 *   record.info.flags              = 0
 *   record.info.level              = 3 (LOG_ERR)
 *   record.info.caller_id          = 299 (task 299)
 *   record.info.dev_info.subsystem = "pci" (terminated)
 *   record.info.dev_info.device    = "+pci:0000:00:01.0" (terminated)
 *
 * The 'struct printk_info' buffer must never be directly exported to
 * userspace, it is a kernel-private implementation detail that might
 * need to be changed in the future, when the requirements change.
 *
 * /dev/kmsg exports the structured data in the following line format:
 *   "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n"
 *
 * Users of the export format should ignore possible additional values
 * separated by ',', and find the message after the ';' character.
 *
 * The optional key/value pairs are attached as continuation lines starting
 * with a space character and terminated by a newline. All possible
 * non-printable characters are escaped in the "\xff" notation.
 */

/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);

/*
 * Specifies if a legacy console is registered. If legacy consoles are
 * present, it is necessary to perform the console lock/unlock dance
 * whenever console flushing should occur.
 */
bool have_legacy_console;

/*
 * Specifies if an nbcon console is registered. If nbcon consoles are present,
 * synchronous printing of legacy consoles will not occur during panic until
 * the backtrace has been stored to the ringbuffer.
 */
bool have_nbcon_console;

/*
 * Specifies if a boot console is registered. If boot consoles are present,
 * nbcon consoles cannot print simultaneously and must be synchronized by
 * the console lock. This is because boot consoles and nbcon consoles may
 * have mapped the same hardware.
 */
bool have_boot_console;

/* See printk_legacy_allow_panic_sync() for details. */
bool legacy_allow_panic_sync;

#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
static DECLARE_WAIT_QUEUE_HEAD(legacy_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;

/* True when _all_ printer threads are available for printing. */
bool printk_kthreads_running;

/*
 * A u64 value kept in two copies behind a seqcount latch, written by
 * latched_seq_write() and read by latched_seq_read_nolock().
 */
struct latched_seq {
        /* Latch sequence counter; its low bit selects the copy readers use. */
        seqcount_latch_t        latch;
        /* The two copies of the value. */
        u64                        val[2];
};

/*
 * The next printk record to read after the last 'clear' command. There are
 * two copies (updated with seqcount_latch) so that reads can locklessly
 * access a valid value. Writers are synchronized by @syslog_lock.
 */
static struct latched_seq clear_seq = {
        .latch                = SEQCNT_LATCH_ZERO(clear_seq.latch),
        .val[0]                = 0,
        .val[1]                = 0,
};

/*
 * Split a combined syslog prefix value: the low 3 bits are the log level,
 * the 8 bits above them are the facility.
 */
#define LOG_LEVEL(v)                ((v) & 0x07)
#define LOG_FACILITY(v)                ((v) >> 3 & 0xff)

/* record buffer */
/* Alignment of the static log buffer. */
#define LOG_ALIGN __alignof__(unsigned long)
/* Size in bytes of the static log buffer. */
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
/* Upper limit (2G) for a requested log buffer size. */
#define LOG_BUF_LEN_MAX ((u32)1 << 31)
/* Statically allocated log buffer. */
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
/* Current log buffer and its length; both start out as the static buffer. */
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;

/*
 * Define the average message size. This only affects the number of
 * descriptors that will be available. Underestimating is better than
 * overestimating (too many available descriptors is better than not enough).
 */
#define PRB_AVGBITS 5        /* 32 character average length */

#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
#error CONFIG_LOG_BUF_SHIFT value too small.
#endif
_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
                 PRB_AVGBITS, &__log_buf[0]);

static struct printk_ringbuffer printk_rb_dynamic;

struct printk_ringbuffer *prb = &printk_rb_static;

/*
 * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
 * per_cpu_areas are initialised. This variable is set to true when
 * it's safe to access per-CPU data.
 */
static bool __printk_percpu_data_ready __ro_after_init;

/* Return true once printk has been told that per-CPU data may be accessed. */
bool printk_percpu_data_ready(void)
{
        return __printk_percpu_data_ready;
}

/*
 * Store @val into both copies of @ls.
 *
 * The copies are updated one after the other, with the latch calls
 * bracketing each store, so do not reorder these statements.
 *
 * Must be called under syslog_lock.
 */
static void latched_seq_write(struct latched_seq *ls, u64 val)
{
        write_seqcount_latch_begin(&ls->latch);
        ls->val[0] = val;
        write_seqcount_latch(&ls->latch);
        ls->val[1] = val;
        write_seqcount_latch_end(&ls->latch);
}

/*
 * Read the value stored in @ls without taking any lock.
 *
 * The low bit of the latch sequence picks which of the two copies to read;
 * the read is repeated until the latch reports that no retry is needed.
 *
 * Can be called from any context.
 */
static u64 latched_seq_read_nolock(struct latched_seq *ls)
{
        unsigned int seq;
        unsigned int idx;
        u64 val;

        do {
                seq = read_seqcount_latch(&ls->latch);
                idx = seq & 0x1;
                val = ls->val[idx];
        } while (read_seqcount_latch_retry(&ls->latch, seq));

        return val;
}

/* Return the address of the log buffer currently referenced by log_buf. */
char *log_buf_addr_get(void)
{
        return log_buf;
}

/* Return the size in bytes of the log buffer, as recorded in log_buf_len. */
u32 log_buf_len_get(void)
{
        return log_buf_len;
}

/*
 * Define how much of the log buffer we could take at maximum. The value
 * must be greater than two. Note that only half of the buffer is available
 * when the index points to the middle.
 */
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";

/*
 * Shrink a message that is too large for the log buffer.
 *
 * @text_len:      in/out, length of the message text
 * @trunc_msg_len: out, length of the truncation marker to append, or 0
 *
 * The text is limited to 1/MAX_LOG_TAKE_PART of the log buffer so that a
 * single message cannot push out the rest of the log too soon. If the
 * limited text is at least as long as the truncation marker, room for the
 * marker is carved out of it; otherwise no marker is used.
 */
static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
        const u32 limit = log_buf_len / MAX_LOG_TAKE_PART;
        const u16 marker_len = strlen(trunc_msg);

        if (*text_len > limit)
                *text_len = limit;

        /* Not even room for the marker: leave the text alone. */
        if (*text_len < marker_len) {
                *trunc_msg_len = 0;
                return;
        }

        *trunc_msg_len = marker_len;
        *text_len -= marker_len;
}

int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);

static int syslog_action_restricted(int type)
{
        if (dmesg_restrict)
                return 1;
        /*
         * Unless restricted, we allow "read all" and "get buffer size"
         * for everybody.
         */
        return type != SYSLOG_ACTION_READ_ALL &&
               type != SYSLOG_ACTION_SIZE_BUFFER;
}

static int check_syslog_permissions(int type, int source)
{
        /*
         * If this is from /proc/kmsg and we've already opened it, then we've
         * already done the capabilities checks at open time.
         */
        if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN)
                goto ok;

        if (syslog_action_restricted(type)) {
                if (capable(CAP_SYSLOG))
                        goto ok;
                return -EPERM;
        }
ok:
        return security_syslog(type);
}

/*
 * Append @c at the write position *@pp and advance it, unless the position
 * has already reached the end of the buffer @e, in which case nothing
 * happens.
 */
static void append_char(char **pp, char *e, char c)
{
        char *cursor = *pp;

        if (cursor >= e)
                return;

        *cursor = c;
        *pp = cursor + 1;
}

/*
 * Format the header of an extended-format record into @buf:
 *
 *   "<prefix>,<seq>,<timestamp in usec>,<cont flag>[,caller=...];"
 *
 * where <prefix> is the facility shifted left by 3 and ORed with the level.
 * The caller field is only present with CONFIG_PRINTK_CALLER.
 *
 * Return: the value returned by scnprintf().
 */
static ssize_t info_print_ext_header(char *buf, size_t size,
                                     struct printk_info *info)
{
        char caller[20];
        u64 usecs = info->ts_nsec;
#ifdef CONFIG_PRINTK_CALLER
        const u32 id = info->caller_id;
        const bool high_bit_set = id & 0x80000000;

        /* The top bit of the id picks the tag, the rest is the number. */
        snprintf(caller, sizeof(caller), ",caller=%c%u",
                 high_bit_set ? 'C' : 'T', id & ~0x80000000);
#else
        caller[0] = '\0';
#endif

        /* The record holds nanoseconds; the header reports microseconds. */
        do_div(usecs, 1000);

        return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
                         (info->facility << 3) | info->level, info->seq,
                         usecs, info->flags & LOG_CONT ? 'c' : '-', caller);
}

/*
 * Copy @text_len bytes of @text into @buf (at most @size bytes), escaping
 * every byte that is below ' ', is 127 or above, or is a backslash as
 * "\xNN", and finish with the character @endc.
 *
 * Output that does not fit is dropped.
 *
 * Return: number of bytes placed in @buf.
 */
static ssize_t msg_add_ext_text(char *buf, size_t size,
                                const char *text, size_t text_len,
                                unsigned char endc)
{
        char *out = buf;
        char *const end = buf + size;
        const char *const stop = text + text_len;
        const char *in;

        for (in = text; in < stop; in++) {
                const unsigned char c = *in;
                const bool plain = c >= ' ' && c < 127 && c != '\\';

                if (plain)
                        append_char(&out, end, c);
                else
                        out += scnprintf(out, end - out, "\\x%02x", c);
        }
        append_char(&out, end, endc);

        return out - buf;
}

/*
 * Append one " KEY=VALUE\n" property line to @buf, with key and value
 * escaped by msg_add_ext_text().
 *
 * Return: number of bytes added, or 0 if @val is an empty string (in which
 * case nothing is written).
 */
static ssize_t msg_add_dict_text(char *buf, size_t size,
                                 const char *key, const char *val)
{
        const size_t val_len = strlen(val);
        ssize_t used;

        if (val_len == 0)
                return 0;

        /* A property line starts with a single space. */
        used = msg_add_ext_text(buf, size, "", 0, ' ');
        used += msg_add_ext_text(buf + used, size - used,
                                 key, strlen(key), '=');
        used += msg_add_ext_text(buf + used, size - used,
                                 val, val_len, '\n');

        return used;
}

/*
 * Format the body of an extended-format record into @buf: the escaped
 * message text terminated by a newline, followed by the SUBSYSTEM and
 * DEVICE property lines when @dev_info is given.
 *
 * Return: number of bytes placed in @buf.
 */
static ssize_t msg_print_ext_body(char *buf, size_t size,
                                  char *text, size_t text_len,
                                  struct dev_printk_info *dev_info)
{
        ssize_t used = msg_add_ext_text(buf, size, text, text_len, '\n');

        if (dev_info) {
                used += msg_add_dict_text(buf + used, size - used,
                                          "SUBSYSTEM", dev_info->subsystem);
                used += msg_add_dict_text(buf + used, size - used,
                                          "DEVICE", dev_info->device);
        }

        return used;
}

/*
 * /dev/kmsg - userspace message inject/listen interface
 *
 * Per-open state, allocated in devkmsg_open() and kept in
 * file->private_data.
 */
struct devkmsg_user {
        /* Sequence number of the next record this reader will look at. */
        atomic64_t seq;
        /* Ratelimit state applied to writes unless logging is set to "on". */
        struct ratelimit_state rs;
        /* Held for the duration of devkmsg_read(). */
        struct mutex lock;
        /* Buffers used to format a record before copying it to userspace. */
        struct printk_buffers pbufs;
};

/*
 * printf-style front end to vprintk_emit() for messages written to
 * /dev/kmsg. Passes @facility and @level through, with no device
 * information (NULL).
 *
 * Return: the value returned by vprintk_emit().
 */
static __printf(3, 4) __cold
int devkmsg_emit(int facility, int level, const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = vprintk_emit(facility, level, NULL, fmt, args);
        va_end(args);

        return r;
}

/*
 * Write handler of /dev/kmsg: inject one message from userspace.
 *
 * Return: the number of bytes written on success - also when the message
 * was dropped because user logging is off or ratelimited - or -EINVAL if
 * the message is longer than PRINTKRB_RECORD_MAX, -ENOMEM, or -EFAULT.
 */
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct devkmsg_user *user = iocb->ki_filp->private_data;
        const size_t len = iov_iter_count(from);
        int level = default_message_loglevel;
        int facility = 1;        /* LOG_USER */
        char *buf, *line;

        if (len > PRINTKRB_RECORD_MAX)
                return -EINVAL;

        /* With user logging disabled the data is silently dropped. */
        if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
                return len;

        /* Unless explicitly enabled, writes are ratelimited. */
        if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON) &&
            !___ratelimit(&user->rs, current->comm))
                return len;

        buf = kmalloc(len + 1, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        buf[len] = '\0';
        if (!copy_from_iter_full(buf, len, from)) {
                kfree(buf);
                return -EFAULT;
        }

        /*
         * Extract and skip an optional syslog prefix <[0-9]*>. The decimal
         * value carries the log level in its lower 3 bits and the facility
         * in the bits above.
         *
         * Without a prefix, or with facility 0, LOG_USER is enforced so
         * that userspace-injected messages can always be told apart from
         * kernel-generated ones.
         */
        line = buf;
        if (buf[0] == '<') {
                char *endp = NULL;
                unsigned int prefix = simple_strtoul(buf + 1, &endp, 10);

                if (endp && *endp == '>') {
                        level = LOG_LEVEL(prefix);
                        if (LOG_FACILITY(prefix) != 0)
                                facility = LOG_FACILITY(prefix);
                        line = endp + 1;
                }
        }

        devkmsg_emit(facility, level, "%s", line);
        kfree(buf);

        return len;
}

/*
 * Read handler of /dev/kmsg: return one formatted record per call.
 *
 * If no record is available yet, the call fails with -EAGAIN for
 * O_NONBLOCK files and otherwise sleeps interruptibly on log_wait.
 *
 * Return: the length of the record copied to @buf, or
 *   -EPIPE  if records were dropped since the last read; the reader's
 *           position is reset so the next read can continue,
 *   -EINVAL if @count is too small for the record; note that the reader's
 *           position has already moved past that record at this point,
 *   -EFAULT if copying to userspace failed,
 *   or the error from an interrupted lock or wait.
 */
static ssize_t devkmsg_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
{
        struct devkmsg_user *user = file->private_data;
        char *outbuf = &user->pbufs.outbuf[0];
        struct printk_message pmsg = {
                .pbufs = &user->pbufs,
        };
        ssize_t ret;

        ret = mutex_lock_interruptible(&user->lock);
        if (ret)
                return ret;

        if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
                if (file->f_flags & O_NONBLOCK) {
                        ret = -EAGAIN;
                        goto out;
                }

                /*
                 * Guarantee this task is visible on the waitqueue before
                 * checking the wake condition.
                 *
                 * The full memory barrier within set_current_state() of
                 * prepare_to_wait_event() pairs with the full memory barrier
                 * within wq_has_sleeper().
                 *
                 * This pairs with __wake_up_klogd:A.
                 */
                ret = wait_event_interruptible(log_wait,
                                printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
                                                        false)); /* LMM(devkmsg_read:A) */
                if (ret)
                        goto out;
        }

        if (pmsg.dropped) {
                /* our last seen message is gone, return error and reset */
                atomic64_set(&user->seq, pmsg.seq);
                ret = -EPIPE;
                goto out;
        }

        /* Advance past this record before checking that it fits in @buf. */
        atomic64_set(&user->seq, pmsg.seq + 1);

        if (pmsg.outbuf_len > count) {
                ret = -EINVAL;
                goto out;
        }

        if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
                ret = -EFAULT;
                goto out;
        }
        ret = pmsg.outbuf_len;
out:
        mutex_unlock(&user->lock);
        return ret;
}

/*
 * Be careful when modifying this function!!!
 *
 * Only few operations are supported because the device works only with the
 * entire variable length messages (records). Non-standard values are
 * returned in the other cases and has been this way for quite some time.
 * User space applications might depend on this behavior.
 *
 * Supported requests, all with @offset 0:
 *   SEEK_SET  - position at the first valid record
 *   SEEK_DATA - position at the first record after the last clear
 *   SEEK_END  - position after the last record
 *
 * Return: 0 on success, -ESPIPE for a non-zero @offset, -EINVAL for any
 * other @whence.
 */
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
        struct devkmsg_user *user = file->private_data;
        loff_t ret = 0;

        if (offset)
                return -ESPIPE;

        switch (whence) {
        case SEEK_SET:
                /* the first record */
                atomic64_set(&user->seq, prb_first_valid_seq(prb));
                break;
        case SEEK_DATA:
                /*
                 * The first record after the last SYSLOG_ACTION_CLEAR,
                 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
                 * changes no global state, and does not clear anything.
                 */
                atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
                break;
        case SEEK_END:
                /* after the last record */
                atomic64_set(&user->seq, prb_next_seq(prb));
                break;
        default:
                ret = -EINVAL;
        }
        return ret;
}

/*
 * Poll handler of /dev/kmsg.
 *
 * Return: 0 if there is no record to read at the reader's position,
 * EPOLLIN|EPOLLRDNORM if there is one, and additionally
 * EPOLLERR|EPOLLPRI if the record found is not the one the reader expected.
 */
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
        struct devkmsg_user *user = file->private_data;
        struct printk_info info;

        poll_wait(file, &log_wait, wait);

        if (!prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL))
                return 0;

        /* Report an error when data has vanished underneath us. */
        if (info.seq != atomic64_read(&user->seq))
                return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;

        return EPOLLIN|EPOLLRDNORM;
}

static int devkmsg_open(struct inode *inode, struct file *file)
{
        struct devkmsg_user *user;
        int err;

        if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
                return -EPERM;

        /* write-only does not need any file context */
        if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
                err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
                                               SYSLOG_FROM_READER);
                if (err)
                        return err;
        }

        user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
        if (!user)
                return -ENOMEM;

        ratelimit_default_init(&user->rs);
        ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);

        mutex_init(&user->lock);

        atomic64_set(&user->seq, prb_first_valid_seq(prb));

        file->private_data = user;
        return 0;
}

/*
 * Release handler of /dev/kmsg: tear down and free the per-open context
 * that devkmsg_open() stored in file->private_data. Always returns 0.
 */
static int devkmsg_release(struct inode *inode, struct file *file)
{
        struct devkmsg_user *user = file->private_data;

        ratelimit_state_exit(&user->rs);

        mutex_destroy(&user->lock);
        kvfree(user);
        return 0;
}

/* File operations of the /dev/kmsg device. */
const struct file_operations kmsg_fops = {
        .open                = devkmsg_open,
        .release        = devkmsg_release,
        .read                = devkmsg_read,
        .write_iter        = devkmsg_write,
        .llseek                = devkmsg_llseek,
        .poll                = devkmsg_poll,
};

#ifdef CONFIG_VMCORE_INFO
/*
 * This appends the listed symbols to /proc/vmcore
 *
 * /proc/vmcore is used by various utilities, like crash and makedumpfile to
 * obtain access to symbols that are otherwise very difficult to locate.  These
 * symbols are specifically used so that utilities can access and extract the
 * dmesg log from a vmcore file after a crash.
 */
void log_buf_vmcoreinfo_setup(void)
{
        /* Never dereferenced: only used as a sizeof() operand below. */
        struct dev_printk_info *dev_info = NULL;

        /* Symbols needed to find the ringbuffer and the clear position. */
        VMCOREINFO_SYMBOL(prb);
        VMCOREINFO_SYMBOL(printk_rb_static);
        VMCOREINFO_SYMBOL(clear_seq);

        /*
         * Export struct size and field offsets. User space tools can
         * parse it and detect any changes to structure down the line.
         */

        /* The ringbuffer itself. */
        VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
        VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
        VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
        VMCOREINFO_OFFSET(printk_ringbuffer, fail);

        /* Descriptor ring. */
        VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
        VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
        VMCOREINFO_OFFSET(prb_desc_ring, descs);
        VMCOREINFO_OFFSET(prb_desc_ring, infos);
        VMCOREINFO_OFFSET(prb_desc_ring, head_id);
        VMCOREINFO_OFFSET(prb_desc_ring, tail_id);

        /* A single descriptor. */
        VMCOREINFO_STRUCT_SIZE(prb_desc);
        VMCOREINFO_OFFSET(prb_desc, state_var);
        VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);

        /* Data block position pair. */
        VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
        VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
        VMCOREINFO_OFFSET(prb_data_blk_lpos, next);

        /* Per-record meta-data. */
        VMCOREINFO_STRUCT_SIZE(printk_info);
        VMCOREINFO_OFFSET(printk_info, seq);
        VMCOREINFO_OFFSET(printk_info, ts_nsec);
        VMCOREINFO_OFFSET(printk_info, text_len);
        VMCOREINFO_OFFSET(printk_info, caller_id);
        VMCOREINFO_OFFSET(printk_info, dev_info);

        /* Device information, including the sizes of its string fields. */
        VMCOREINFO_STRUCT_SIZE(dev_printk_info);
        VMCOREINFO_OFFSET(dev_printk_info, subsystem);
        VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
        VMCOREINFO_OFFSET(dev_printk_info, device);
        VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));

        /* Text data ring. */
        VMCOREINFO_STRUCT_SIZE(prb_data_ring);
        VMCOREINFO_OFFSET(prb_data_ring, size_bits);
        VMCOREINFO_OFFSET(prb_data_ring, data);
        VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
        VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);

        /* Layout of atomic_long_t. */
        VMCOREINFO_SIZE(atomic_long_t);
        VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);

        /* Layout of the latched value used for clear_seq. */
        VMCOREINFO_STRUCT_SIZE(latched_seq);
        VMCOREINFO_OFFSET(latched_seq, val);
}
#endif

/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;

/*
 * Record a requested log buffer size in new_log_buf_len.
 *
 * The request is capped at LOG_BUF_LEN_MAX (with an error message), rounded
 * up to a power of two, and remembered only if it is larger than the
 * current log buffer. A request of 0 is ignored.
 */
static void __init log_buf_len_update(u64 size)
{
        u64 wanted = size;

        if (wanted > (u64)LOG_BUF_LEN_MAX) {
                wanted = (u64)LOG_BUF_LEN_MAX;
                pr_err("log_buf over 2G is not supported.\n");
        }

        if (!wanted)
                return;

        /* The ring buffer is only ever sized in powers of 2. */
        wanted = roundup_pow_of_two(wanted);
        if (wanted > log_buf_len)
                new_log_buf_len = (unsigned long)wanted;
}

/*
 * Early "log_buf_len" parameter handler. It is too early to act on the
 * request here, so the parsed size is only handed to log_buf_len_update().
 *
 * Return: -EINVAL if no value was supplied, 0 otherwise.
 */
static int __init log_buf_len_setup(char *str)
{
        char *rest = str;

        if (rest == NULL)
                return -EINVAL;

        log_buf_len_update(memparse(rest, &rest));

        return 0;
}
early_param("log_buf_len", log_buf_len_setup);

#ifdef CONFIG_SMP
#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)

/*
 * Request a larger log buffer when the per-CPU contributions are
 * significant compared to the default buffer size.
 *
 * Every possible CPU except one contributes __LOG_CPU_MAX_BUF_LEN bytes.
 * Nothing is requested when that sum does not exceed half of
 * __LOG_BUF_LEN.
 */
static void __init log_buf_add_cpu(void)
{
        u64 cpu_extra;

        /*
         * archs should set up cpu_possible_bits properly with
         * set_cpu_possible() after setup_arch() but just in
         * case lets ensure this is valid.
         */
        if (num_possible_cpus() == 1)
                return;

        /*
         * Multiply in 64 bits. A 32-bit product can wrap for large CPU
         * counts combined with a large per-CPU shift, which would defeat
         * the clamping done by log_buf_len_update().
         */
        cpu_extra = (u64)(num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;

        /* by default this will only continue through for large > 64 CPUs */
        if (cpu_extra <= __LOG_BUF_LEN / 2)
                return;

        pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
                __LOG_CPU_MAX_BUF_LEN);
        pr_info("log_buf_len total cpu_extra contributions: %llu bytes\n",
                cpu_extra);
        pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);

        log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
}
#else /* !CONFIG_SMP */
/* Without CONFIG_SMP there is no per-CPU contribution to account for. */
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */

/*
 * Set the __printk_percpu_data_ready flag. Called by setup_log_buf() when
 * it is invoked with @early == 0.
 */
static void __init set_percpu_data_ready(void)
{
        __printk_percpu_data_ready = true;
}

/*
 * Copy one record into another ring buffer.
 *
 * @rb: destination ring buffer
 * @r:  source record (text and meta data)
 *
 * Reserves an entry sized for the source text, duplicates the text and the
 * relevant printk_info fields, then finalizes the entry.
 *
 * Return: 0 if no entry could be reserved, otherwise the value reported by
 * prb_record_text_space() for the committed entry.
 */
static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
                                     struct printk_record *r)
{
        const struct printk_info *src = r->info;
        struct prb_reserved_entry entry;
        struct printk_record copy;
        struct printk_info *dst;

        prb_rec_init_wr(&copy, src->text_len);
        if (!prb_reserve(&entry, rb, &copy))
                return 0;

        dst = copy.info;

        /* Text first, then the meta data describing it. */
        memcpy(copy.text_buf, r->text_buf, src->text_len);
        dst->text_len = src->text_len;
        dst->facility = src->facility;
        dst->level = src->level;
        dst->flags = src->flags;
        dst->ts_nsec = src->ts_nsec;
        dst->caller_id = src->caller_id;
        memcpy(&dst->dev_info, &src->dev_info, sizeof(dst->dev_info));

        prb_final_commit(&entry);

        return prb_record_text_space(&entry);
}

/* Text buffer for the record reader used by setup_log_buf() while copying. */
static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;

/*
 * Log the size of the text buffer, the size of the meta data arrays
 * (one prb_desc plus one printk_info per descriptor) and their sum.
 */
static void print_log_buf_usage_stats(void)
{
        const size_t per_desc = sizeof(struct prb_desc) + sizeof(struct printk_info);
        unsigned int nr_descs = log_buf_len >> PRB_AVGBITS;
        size_t meta_bytes = nr_descs * per_desc;

        pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n",
                log_buf_len, meta_bytes, log_buf_len + meta_bytes);
}

/**
 * setup_log_buf - replace the static log buffer with a dynamically sized one
 * @early: zero when called after percpu areas are initialised, non-zero for
 *         the earlier call (see the comment in the body)
 *
 * When a new size has been requested (@new_log_buf_len), allocate the text
 * buffer plus the descriptor and info arrays from memblock, initialise
 * @printk_rb_dynamic with them, copy all records from @printk_rb_static and
 * switch @prb to the dynamic ring buffer. Returns early if @log_buf no
 * longer points at @__log_buf or if no new size was requested.
 */
void __init setup_log_buf(int early)
{
        struct printk_info *new_infos;
        unsigned int new_descs_count;
        struct prb_desc *new_descs;
        struct printk_info info;
        struct printk_record r;
        unsigned int text_size;
        size_t new_descs_size;
        size_t new_infos_size;
        unsigned long flags;
        char *new_log_buf;
        unsigned int free;
        u64 seq;

        /*
         * Some archs call setup_log_buf() multiple times - first is very
         * early, e.g. from setup_arch(), and second - when percpu_areas
         * are initialised.
         */
        if (!early)
                set_percpu_data_ready();

        /* The static buffer has already been replaced; nothing left to do. */
        if (log_buf != __log_buf)
                return;

        /* No size requested so far: give log_buf_add_cpu() a chance to ask. */
        if (!early && !new_log_buf_len)
                log_buf_add_cpu();

        if (!new_log_buf_len) {
                /* Show the memory stats only once. */
                if (!early)
                        goto out;

                return;
        }

        /* One descriptor per 2^PRB_AVGBITS bytes of requested text space. */
        new_descs_count = new_log_buf_len >> PRB_AVGBITS;
        if (new_descs_count == 0) {
                pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
                goto out;
        }

        new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
        if (unlikely(!new_log_buf)) {
                pr_err("log_buf_len: %lu text bytes not available\n",
                       new_log_buf_len);
                goto out;
        }

        new_descs_size = new_descs_count * sizeof(struct prb_desc);
        new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
        if (unlikely(!new_descs)) {
                pr_err("log_buf_len: %zu desc bytes not available\n",
                       new_descs_size);
                goto err_free_log_buf;
        }

        new_infos_size = new_descs_count * sizeof(struct printk_info);
        new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
        if (unlikely(!new_infos)) {
                pr_err("log_buf_len: %zu info bytes not available\n",
                       new_infos_size);
                goto err_free_descs;
        }

        /* Reader record used to pull entries out of the static buffer. */
        prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));

        prb_init(&printk_rb_dynamic,
                 new_log_buf, ilog2(new_log_buf_len),
                 new_descs, ilog2(new_descs_count),
                 new_infos);

        /* Copy the records and switch @prb with local interrupts disabled. */
        local_irq_save(flags);

        log_buf_len = new_log_buf_len;
        log_buf = new_log_buf;
        new_log_buf_len = 0;

        /*
         * Start from the static buffer size and subtract the text space of
         * every copied record; the remainder is reported below as
         * "early log buf free".
         */
        free = __LOG_BUF_LEN;
        prb_for_each_record(0, &printk_rb_static, seq, &r) {
                text_size = add_to_rb(&printk_rb_dynamic, &r);
                if (text_size > free)
                        free = 0;
                else
                        free -= text_size;
        }

        prb = &printk_rb_dynamic;

        local_irq_restore(flags);

        /*
         * Copy any remaining messages that might have appeared from
         * NMI context after copying but before switching to the
         * dynamic buffer.
         */
        prb_for_each_record(seq, &printk_rb_static, seq, &r) {
                text_size = add_to_rb(&printk_rb_dynamic, &r);
                if (text_size > free)
                        free = 0;
                else
                        free -= text_size;
        }

        /* Records left in the static buffer beyond @seq were not copied. */
        if (seq != prb_next_seq(&printk_rb_static)) {
                pr_err("dropped %llu messages\n",
                       prb_next_seq(&printk_rb_static) - seq);
        }

        print_log_buf_usage_stats();
        pr_info("early log buf free: %u(%u%%)\n",
                free, (free * 100) / __LOG_BUF_LEN);
        return;

        /* Release the allocations in reverse order, then print the stats. */
err_free_descs:
        memblock_free(new_descs, new_descs_size);
err_free_log_buf:
        memblock_free(new_log_buf, new_log_buf_len);
out:
        print_log_buf_usage_stats();
}

/* When true, suppress_message_printing() never reports a message as suppressed. */
static bool __read_mostly ignore_loglevel;

/*
 * Early "ignore_loglevel" parameter handler: sets @ignore_loglevel and logs
 * the fact. @str is not used. Always returns 0.
 */
static int __init ignore_loglevel_setup(char *str)
{
        ignore_loglevel = true;
        pr_info("debug: ignoring loglevel setting.\n");

        return 0;
}

early_param("ignore_loglevel", ignore_loglevel_setup);
/* The same flag is also registered as a module parameter. */
module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel,
                 "ignore loglevel setting (prints all kernel messages to the console)");

/*
 * Decide whether a message of the given @level is filtered out.
 *
 * Return: true when @level is at or above console_loglevel and
 * @ignore_loglevel is not set, false otherwise.
 */
static bool suppress_message_printing(int level)
{
        if (ignore_loglevel)
                return false;

        return level >= console_loglevel;
}

#ifdef CONFIG_BOOT_PRINTK_DELAY

static int boot_delay; /* msecs delay after each printk during bootup */
static unsigned long long loops_per_msec;        /* based on boot_delay */

/*
 * Early "boot_delay" parameter handler.
 *
 * Derives @loops_per_msec from preset_lpj (or a guess when it is not set)
 * and parses the delay in milliseconds from @str. Values outside the range
 * 0..10000 are treated as erroneous and disable the delay.
 *
 * Always returns 0.
 */
static int __init boot_delay_setup(char *str)
{
        unsigned long lpj;

        lpj = preset_lpj ? preset_lpj : 1000000;        /* some guess */
        loops_per_msec = (unsigned long long)lpj / 1000 * HZ;

        get_option(&str, &boot_delay);
        /*
         * A negative delay must be rejected as well: boot_delay_msec()
         * multiplies it into an unsigned loop counter, which would turn it
         * into an enormous busy-wait after every message.
         */
        if (boot_delay < 0 || boot_delay > 10 * 1000)
                boot_delay = 0;

        pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
                "HZ: %d, loops_per_msec: %llu\n",
                boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
        return 0;
}
early_param("boot_delay", boot_delay_setup);

/*
 * Busy-wait after a printk during boot.
 *
 * Nothing is done when no delay is configured, when system_state has
 * reached SYSTEM_RUNNING, or when the message of @level is suppressed and
 * console output is not forced. Otherwise spin for
 * loops_per_msec * boot_delay iterations, leaving early once jiffies has
 * passed the computed timeout.
 */
static void boot_delay_msec(int level)
{
        bool suppress = !is_printk_force_console() &&
                        suppress_message_printing(level);
        unsigned long long spins;
        unsigned long deadline;

        if (boot_delay == 0)
                return;
        if (system_state >= SYSTEM_RUNNING)
                return;
        if (suppress)
                return;

        spins = (unsigned long long)loops_per_msec * boot_delay;
        deadline = jiffies + msecs_to_jiffies(boot_delay);

        for (; spins; spins--) {
                cpu_relax();
                /*
                 * Reading (volatile) jiffies keeps the compiler from
                 * reducing the loop; terminating via jiffies is secondary
                 * and may or may not happen.
                 */
                if (time_after(jiffies, deadline))
                        break;
                touch_nmi_watchdog();
        }
}
#else
/* No-op variant used when CONFIG_BOOT_PRINTK_DELAY is not enabled. */
static inline void boot_delay_msec(int level)
{
}
#endif

/*
 * Whether a timestamp prefix is requested when records are formatted.
 * Defaults to CONFIG_PRINTK_TIME and is registered as the "time" module
 * parameter.
 */
static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);

/*
 * Write the syslog prefix "<level>" into @buf.
 *
 * Return: number of characters written, not counting the terminator.
 */
static size_t print_syslog(unsigned int level, char *buf)
{
        int written = sprintf(buf, "<%u>", level);

        return written;
}

/*
 * Write the timestamp prefix "[seconds.microseconds]" for @ts (in
 * nanoseconds) into @buf.
 *
 * Return: number of characters written, not counting the terminator.
 */
static size_t print_time(u64 ts, char *buf)
{
        u64 secs = ts;
        unsigned long nsec_rem;

        /* do_div() leaves the quotient in @secs and returns the remainder. */
        nsec_rem = do_div(secs, 1000000000);

        return sprintf(buf, "[%5lu.%06lu]",
                       (unsigned long)secs, nsec_rem / 1000);
}

#ifdef CONFIG_PRINTK_CALLER
/*
 * Write the caller prefix "[<tag><number>]" into @buf. The tag is 'C' when
 * the top bit of @id is set and 'T' otherwise; the number is @id with that
 * bit cleared. The tag+number string is right-aligned to at least six
 * characters.
 *
 * Return: number of characters written, not counting the terminator.
 */
static size_t print_caller(u32 id, char *buf)
{
        char caller[12];
        char tag = (id & 0x80000000) ? 'C' : 'T';

        snprintf(caller, sizeof(caller), "%c%u", tag, id & ~0x80000000);

        return sprintf(buf, "[%6s]", caller);
}
#else
#define print_caller(id, buf) 0
#endif

/*
 * Build the line prefix for a record into @buf.
 *
 * @info:   meta data supplying facility, level, timestamp and caller id
 * @syslog: emit the "<N>" syslog prefix
 * @time:   emit the timestamp prefix
 * @buf:    destination buffer
 *
 * The caller prefix is always requested from print_caller(). A separating
 * space and a terminator are appended when a timestamp was emitted or
 * CONFIG_PRINTK_CALLER is enabled.
 *
 * Return: length of the prefix, not counting the terminator.
 */
static size_t info_print_prefix(const struct printk_info  *info, bool syslog,
                                bool time, char *buf)
{
        char *pos = buf;

        if (syslog)
                pos += print_syslog((info->facility << 3) | info->level, pos);

        if (time)
                pos += print_time(info->ts_nsec, pos);

        pos += print_caller(info->caller_id, pos);

        if (time || IS_ENABLED(CONFIG_PRINTK_CALLER)) {
                *pos++ = ' ';
                *pos = '\0';
        }

        return pos - buf;
}

/*
 * Prepare the record for printing. The text is shifted within the given
 * buffer to avoid a need for another one. The following operations are
 * done:
 *
 *   - Add prefix for each line.
 *   - Drop truncated lines that no longer fit into the buffer.
 *   - Add the trailing newline that has been removed in vprintk_store().
 *   - Add a string terminator.
 *
 * @r:      record whose text buffer (@r->text_buf, @r->text_buf_size bytes)
 *          is rewritten in place; @r->info supplies the text length and the
 *          fields used for the prefix
 * @syslog: passed to info_print_prefix() to request the syslog prefix
 * @time:   passed to info_print_prefix() to request the timestamp prefix
 *
 * Since the produced string is always terminated, the maximum possible
 * return value is @r->text_buf_size - 1;
 *
 * Return: The length of the updated/prepared text, including the added
 * prefixes and the newline. The terminator is not counted. The dropped
 * line(s) are not counted.
 */
static size_t record_print_text(struct printk_record *r, bool syslog,
                                bool time)
{
        size_t text_len = r->info->text_len;
        size_t buf_size = r->text_buf_size;
        char *text = r->text_buf;
        char prefix[PRINTK_PREFIX_MAX];
        bool truncated = false;
        size_t prefix_len;
        size_t line_len;
        size_t len = 0;
        char *next;

        /*
         * If the message was truncated because the buffer was not large
         * enough, treat the available text as if it were the full text.
         */
        if (text_len > buf_size)
                text_len = buf_size;

        /* The same prefix is reused for every line of the record. */
        prefix_len = info_print_prefix(r->info, syslog, time, prefix);

        /*
         * @text_len: bytes of unprocessed text
         * @line_len: bytes of current line _without_ newline
         * @text:     pointer to beginning of current line
         * @len:      number of bytes prepared in r->text_buf
         */
        for (;;) {
                next = memchr(text, '\n', text_len);
                if (next) {
                        line_len = next - text;
                } else {
                        /* Drop truncated line(s). */
                        if (truncated)
                                break;
                        line_len = text_len;
                }

                /*
                 * Truncate the text if there is not enough space to add the
                 * prefix and a trailing newline and a terminator.
                 */
                if (len + prefix_len + text_len + 1 + 1 > buf_size) {
                        /* Drop even the current line if no space. */
                        if (len + prefix_len + line_len + 1 + 1 > buf_size)
                                break;

                        text_len = buf_size - len - prefix_len - 1 - 1;
                        truncated = true;
                }

                /*
                 * Shift all remaining unprocessed text to make room, then
                 * place the prefix in front of the current line.
                 */
                memmove(text + prefix_len, text, text_len);
                memcpy(text, prefix, prefix_len);

                /*
                 * Increment the prepared length to include the text and
                 * prefix that were just moved+copied. Also increment for the
                 * newline at the end of this line. If this is the last line,
                 * there is no newline, but it will be added immediately below.
                 */
                len += prefix_len + line_len + 1;
                if (text_len == line_len) {
                        /*
                         * This is the last line. Add the trailing newline
                         * removed in vprintk_store().
                         */
                        text[prefix_len + line_len] = '\n';
                        break;
                }

                /*
                 * Advance beyond the added prefix and the related line with
                 * its newline.
                 */
                text += prefix_len + line_len + 1;

                /*
                 * The remaining text has only decreased by the line with its
                 * newline.
                 *
                 * Note that @text_len can become zero. It happens when @text
                 * ended with a newline (either due to truncation or the
                 * original string ending with "\n\n"). The loop is correctly
                 * repeated and (if not truncated) an empty line with a prefix
                 * will be prepared.
                 */
                text_len -= line_len + 1;
        }

        /*
         * If a buffer was provided, it will be terminated. Space for the
         * string terminator is guaranteed to be available. The terminator is
         * not counted in the return value.
         */
        if (buf_size > 0)
                r->text_buf[len] = 0;

        return len;
}

/*
 * Compute the size a record takes once formatted for printing.
 *
 * @info:       meta data of the record
 * @line_count: number of lines in the record text
 * @syslog:     whether the syslog prefix is included
 * @time:       whether the timestamp prefix is included
 *
 * Return: one prefix per line, plus the stored text length, plus one byte
 * for the final trailing newline. Intermediate newlines are already part
 * of the stored text.
 */
static size_t get_record_print_text_size(struct printk_info *info,
                                         unsigned int line_count,
                                         bool syslog, bool time)
{
        char scratch[PRINTK_PREFIX_MAX];
        size_t per_line = info_print_prefix(info, syslog, time, scratch);
        size_t total = per_line * line_count;

        total += info->text_len;

        return total + 1;
}

/*
 * Beginning with @start_seq, find the first record where it and all following
 * records up to (but not including) @max_seq fit into @size.
 *
 * @start_seq: sequence number to start scanning from
 * @max_seq:   exclusive upper bound for the scan
 * @size:      available space in bytes
 * @syslog:    size the records with the syslog prefix
 * @time:      size the records with the timestamp prefix
 *
 * @max_seq is simply an upper bound and does not need to exist. If the caller
 * does not require an upper bound, -1 can be used for @max_seq.
 *
 * Return: the sequence number at which the second pass stopped, i.e. the
 * first record from which the counted records fit into @size.
 */
static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
                                  bool syslog, bool time)
{
        struct printk_info info;
        unsigned int line_count;
        size_t len = 0;
        u64 seq;

        /* Determine the size of the records up to @max_seq. */
        prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
                if (info.seq >= max_seq)
                        break;
                len += get_record_print_text_size(&info, line_count, syslog, time);
        }

        /*
         * Adjust the upper bound for the next loop to avoid subtracting
         * lengths that were never added.
         */
        if (seq < max_seq)
                max_seq = seq;

        /*
         * Move first record forward until length fits into the buffer. Ignore
         * newest messages that were not counted in the above cycle. Messages
         * might appear and get lost in the meantime. This is a best effort
         * that prevents an infinite loop that could occur with a retry.
         */
        prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
                if (len <= size || info.seq >= max_seq)
                        break;
                len -= get_record_print_text_size(&info, line_count, syslog, time);
        }

        return seq;
}

/*
 * Copy unread records, formatted with the syslog prefix, to user space.
 *
 * @buf:  user-space destination buffer
 * @size: number of bytes available in @buf
 *
 * Waits (interruptibly) until the record at @syslog_seq can be read, then
 * copies as many formatted records as fit into @buf. When the first record
 * does not fit, only @size bytes of it are copied and the position is
 * remembered in @syslog_partial for the next call.
 *
 * The caller is responsible for making sure @size is greater than 0.
 *
 * Return: number of bytes copied; -ENOMEM if the text buffer cannot be
 * allocated; the non-zero result of wait_event_interruptible(); or -EFAULT
 * if copy_to_user() fails before anything was copied.
 */
static int syslog_print(char __user *buf, int size)
{
        struct printk_info info;
        struct printk_record r;
        char *text;
        int len = 0;
        u64 seq;

        text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
        if (!text)
                return -ENOMEM;

        prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);

        mutex_lock(&syslog_lock);

        /*
         * Wait for the @syslog_seq record to be available. @syslog_seq may
         * change while waiting.
         */
        do {
                seq = syslog_seq;

                /* The lock is not held while waiting. */
                mutex_unlock(&syslog_lock);
                /*
                 * Guarantee this task is visible on the waitqueue before
                 * checking the wake condition.
                 *
                 * The full memory barrier within set_current_state() of
                 * prepare_to_wait_event() pairs with the full memory barrier
                 * within wq_has_sleeper().
                 *
                 * This pairs with __wake_up_klogd:A.
                 */
                len = wait_event_interruptible(log_wait,
                                prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
                mutex_lock(&syslog_lock);

                /* A non-zero result from the wait is returned to the caller. */
                if (len)
                        goto out;
        } while (syslog_seq != seq);

        /*
         * Copy records that fit into the buffer. The above cycle makes sure
         * that the first record is always available.
         */
        do {
                size_t n;
                size_t skip;
                int err;

                if (!prb_read_valid(prb, syslog_seq, &r))
                        break;

                if (r.info->seq != syslog_seq) {
                        /* message is gone, move to next valid one */
                        syslog_seq = r.info->seq;
                        syslog_partial = 0;
                }

                /*
                 * To keep reading/counting partial line consistent,
                 * use printk_time value as of the beginning of a line.
                 */
                if (!syslog_partial)
                        syslog_time = printk_time;

                /* @skip: bytes of this record handed out by an earlier call. */
                skip = syslog_partial;
                n = record_print_text(&r, true, syslog_time);
                if (n - syslog_partial <= size) {
                        /* message fits into buffer, move forward */
                        syslog_seq = r.info->seq + 1;
                        n -= syslog_partial;
                        syslog_partial = 0;
                } else if (!len){
                        /* partial read(), remember position */
                        n = size;
                        syslog_partial += n;
                } else
                        n = 0;

                if (!n)
                        break;

                /* syslog_lock is dropped around the copy to user space. */
                mutex_unlock(&syslog_lock);
                err = copy_to_user(buf, text + skip, n);
                mutex_lock(&syslog_lock);

                if (err) {
                        /* Report the fault only if nothing was copied yet. */
                        if (!len)
                                len = -EFAULT;
                        break;
                }

                len += n;
                size -= n;
                buf += n;
        } while (size);
out:
        mutex_unlock(&syslog_lock);
        kfree(text);
        return len;
}

/*
 * Copy the most recent records that fit into the user buffer.
 *
 * @buf:   user-space destination buffer
 * @size:  number of bytes available in @buf
 * @clear: when true, store the final sequence number in @clear_seq
 *
 * Starting from the sequence number read from @clear_seq,
 * find_first_fitting_seq() selects the first record from which the following
 * records fit into @size. Those records are formatted with the syslog prefix
 * and copied to @buf.
 *
 * Return: number of bytes copied, -ENOMEM if the text buffer cannot be
 * allocated, or -EFAULT if copy_to_user() fails.
 */
static int syslog_print_all(char __user *buf, int size, bool clear)
{
        struct printk_info info;
        struct printk_record r;
        char *text;
        int len = 0;
        u64 seq;
        bool time;

        text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
        if (!text)
                return -ENOMEM;

        /* Sample printk_time once so sizing and formatting agree. */
        time = printk_time;
        /*
         * Find first record that fits, including all following records,
         * into the user-provided buffer for this dump.
         */
        seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
                                     size, true, time);

        prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);

        prb_for_each_record(seq, prb, seq, &r) {
                int textlen;

                textlen = record_print_text(&r, true, time);

                /* Stop before a record that would overflow the user buffer. */
                if (len + textlen > size) {
                        seq--;
                        break;
                }

                if (copy_to_user(buf + len, text, textlen))
                        len = -EFAULT;
                else
                        len += textlen;

                if (len < 0)
                        break;
        }

        if (clear) {
                mutex_lock(&syslog_lock);
                latched_seq_write(&clear_seq, seq);
                mutex_unlock(&syslog_lock);
        }

        kfree(text);
        return len;
}

/*
 * Set @clear_seq to the next sequence number of the ring buffer, under
 * syslog_lock. syslog_print_all() starts its dump from @clear_seq.
 */
static void syslog_clear(void)
{
        mutex_lock(&syslog_lock);
        latched_seq_write(&clear_seq, prb_next_seq(prb));
        mutex_unlock(&syslog_lock);
}

/**
 * do_syslog - perform a syslog action
 * @type:   SYSLOG_ACTION_* command
 * @buf:    user buffer for the read actions
 * @len:    buffer length for the read actions, or the new console loglevel
 *          for SYSLOG_ACTION_CONSOLE_LEVEL
 * @source: origin of the request (SYSLOG_FROM_PROC or SYSLOG_FROM_READER are
 *          the values referenced here); passed to check_syslog_permissions()
 *
 * Return: a negative error code on failure. On success the value depends on
 * @type: bytes copied for the read actions, a size or record count for
 * SYSLOG_ACTION_SIZE_UNREAD, @log_buf_len for SYSLOG_ACTION_SIZE_BUFFER and
 * 0 for the remaining actions.
 */
int do_syslog(int type, char __user *buf, int len, int source)
{
        struct printk_info info;
        bool clear = false;
        /* Loglevel saved by CONSOLE_OFF; LOGLEVEL_DEFAULT means nothing saved. */
        static int saved_console_loglevel = LOGLEVEL_DEFAULT;
        int error;

        error = check_syslog_permissions(type, source);
        if (error)
                return error;

        switch (type) {
        case SYSLOG_ACTION_CLOSE:        /* Close log */
                break;
        case SYSLOG_ACTION_OPEN:        /* Open log */
                break;
        case SYSLOG_ACTION_READ:        /* Read from log */
                if (!buf || len < 0)
                        return -EINVAL;
                if (!len)
                        return 0;
                if (!access_ok(buf, len))
                        return -EFAULT;
                error = syslog_print(buf, len);
                break;
        /* Read/clear last kernel messages */
        case SYSLOG_ACTION_READ_CLEAR:
                clear = true;
                fallthrough;
        /* Read last kernel messages */
        case SYSLOG_ACTION_READ_ALL:
                if (!buf || len < 0)
                        return -EINVAL;
                if (!len)
                        return 0;
                if (!access_ok(buf, len))
                        return -EFAULT;
                error = syslog_print_all(buf, len, clear);
                break;
        /* Clear ring buffer */
        case SYSLOG_ACTION_CLEAR:
                syslog_clear();
                break;
        /* Disable logging to console */
        case SYSLOG_ACTION_CONSOLE_OFF:
                if (saved_console_loglevel == LOGLEVEL_DEFAULT)
                        saved_console_loglevel = console_loglevel;
                console_loglevel = minimum_console_loglevel;
                break;
        /* Enable logging to console */
        case SYSLOG_ACTION_CONSOLE_ON:
                if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
                        console_loglevel = saved_console_loglevel;
                        saved_console_loglevel = LOGLEVEL_DEFAULT;
                }
                break;
        /* Set level of messages printed to console */
        case SYSLOG_ACTION_CONSOLE_LEVEL:
                if (len < 1 || len > 8)
                        return -EINVAL;
                /* Never go below the configured minimum. */
                if (len < minimum_console_loglevel)
                        len = minimum_console_loglevel;
                console_loglevel = len;
                /* Implicitly re-enable logging to console */
                saved_console_loglevel = LOGLEVEL_DEFAULT;
                break;
        /* Number of chars in the log buffer */
        case SYSLOG_ACTION_SIZE_UNREAD:
                mutex_lock(&syslog_lock);
                if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
                        /* No unread messages. */
                        mutex_unlock(&syslog_lock);
                        return 0;
                }
                if (info.seq != syslog_seq) {
                        /* messages are gone, move to first one */
                        syslog_seq = info.seq;
                        syslog_partial = 0;
                }
                if (source == SYSLOG_FROM_PROC) {
                        /*
                         * Short-cut for poll("/proc/kmsg") which simply checks
                         * for pending data, not the size; return the count of
                         * records, not the length.
                         */
                        error = prb_next_seq(prb) - syslog_seq;
                } else {
                        /*
                         * A partially read first record keeps the timestamp
                         * setting it was started with; later records use the
                         * current printk_time.
                         */
                        bool time = syslog_partial ? syslog_time : printk_time;
                        unsigned int line_count;
                        u64 seq;

                        /* @error is 0 here, so it can be used as accumulator. */
                        prb_for_each_info(syslog_seq, prb, seq, &info,
                                          &line_count) {
                                error += get_record_print_text_size(&info, line_count,
                                                                    true, time);
                                time = printk_time;
                        }
                        error -= syslog_partial;
                }
                mutex_unlock(&syslog_lock);
                break;
        /* Size of the log buffer */
        case SYSLOG_ACTION_SIZE_BUFFER:
                error = log_buf_len;
                break;
        default:
                error = -EINVAL;
                break;
        }

        return error;
}

/*
 * syslog system call entry point: forwards its arguments to do_syslog()
 * with SYSLOG_FROM_READER as the source.
 */
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
{
        return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}

/*
 * Special console_lock variants that help to reduce the risk of soft-lockups.
 * They allow passing console_lock to another printk() call using a busy wait.
 */

#ifdef CONFIG_LOCKDEP
/* Lockdep map used to annotate the busy-wait sections below. */
static struct lockdep_map console_owner_dep_map = {
        .name = "console_owner"
};
#endif

/* Serialises updates of @console_owner and the sampling/setting of @console_waiter. */
static DEFINE_RAW_SPINLOCK(console_owner_lock);
/* Task inside a section where a waiter may spin; NULL outside of it. */
static struct task_struct *console_owner;
/* True while a waiter is busy-waiting for the hand-over; cleared by the owner. */
static bool console_waiter;

/**
 * console_lock_spinning_enable - mark beginning of code where another
 *        thread might safely busy wait
 *
 * From here on console_lock behaves like a spinlock: the owner must not
 * sleep, because a waiter may be spinning on it, and it must be prepared
 * to hand the lock over when the section ends.
 */
void console_lock_spinning_enable(void)
{
        /*
         * No spinning in panic(): the panic CPU wants to keep the lock and
         * non-panic CPUs abandon the flush anyway. The panic CPU should also
         * avoid console_owner_lock since taking it might deadlock, so the
         * owner is only published outside of panic. The lockdep annotation
         * below is kept in both cases, which looks like the easiest way to
         * prevent false lockdep reports without handling races a lockless
         * way.
         */
        if (!panic_in_progress()) {
                raw_spin_lock(&console_owner_lock);
                console_owner = current;
                raw_spin_unlock(&console_owner_lock);
        }

        /* The waiter may spin on us after setting console_owner */
        spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}

/**
 * console_lock_spinning_disable_and_check - mark end of code where another
 *        thread was able to busy wait and check if there is a waiter
 * @cookie: cookie returned from console_srcu_read_lock()
 *
 * This is called at the end of the section where spinning is allowed.
 * It has two functions. First, it is a signal that it is no longer
 * safe to start busy waiting for the lock. Second, it checks if
 * there is a busy waiter and passes the lock rights to her.
 *
 * Important: Callers lose both the console_lock and the SRCU read lock if
 *        there was a busy waiter. They must not touch items synchronized by
 *        console_lock or SRCU read lock in this case.
 *
 * Return: 1 if the lock rights were passed, 0 otherwise.
 */
int console_lock_spinning_disable_and_check(int cookie)
{
        int waiter;

        /*
         * Ignore spinning waiters during panic() because they might get stopped
         * or blocked at any time,
         *
         * It is safe because nobody is allowed to start spinning during panic
         * in the first place. If there has been a waiter then non panic CPUs
         * might stay spinning. They would get stopped anyway. The panic context
         * will never start spinning and an interrupted spin on panic CPU will
         * never continue.
         */
        if (panic_in_progress()) {
                /* Keep lockdep happy. */
                spin_release(&console_owner_dep_map, _THIS_IP_);
                return 0;
        }

        /* Sample the waiter flag and clear the owner under the same lock. */
        raw_spin_lock(&console_owner_lock);
        waiter = READ_ONCE(console_waiter);
        console_owner = NULL;
        raw_spin_unlock(&console_owner_lock);

        /* Nobody is spinning: the caller keeps console_lock. */
        if (!waiter) {
                spin_release(&console_owner_dep_map, _THIS_IP_);
                return 0;
        }

        /* The waiter is now free to continue */
        WRITE_ONCE(console_waiter, false);

        spin_release(&console_owner_dep_map, _THIS_IP_);

        /*
         * Preserve lockdep lock ordering. Release the SRCU read lock before
         * releasing the console_lock.
         */
        console_srcu_read_unlock(cookie);

        /*
         * Hand off console_lock to waiter. The waiter will perform
         * the up(). After this, the waiter is the console_lock owner.
         */
        mutex_release(&console_lock_dep_map, _THIS_IP_);
        return 1;
}

/**
 * console_trylock_spinning - try to get console_lock by busy waiting
 *
 * This allows to busy wait for the console_lock when the current
 * owner is running in specially marked sections. It means that
 * the current owner is running and cannot reschedule until it
 * is ready to lose the lock.
 *
 * Return: 1 if we got the lock, 0 otherwise
 */
static int console_trylock_spinning(void)
{
        struct task_struct *owner = NULL;
        bool waiter;
        bool spin = false;
        unsigned long flags;

        if (console_trylock())
                return 1;

        /*
         * It's unsafe to spin once a panic has begun. If we are the
         * panic CPU, we may have already halted the owner of the
         * console_sem. If we are not the panic CPU, then we should
         * avoid taking console_sem, so the panic CPU has a better
         * chance of cleanly acquiring it later.
         */
        if (panic_in_progress())
                return 0;

        printk_safe_enter_irqsave(flags);

        raw_spin_lock(&console_owner_lock);
        owner = READ_ONCE(console_owner);
        waiter = READ_ONCE(console_waiter);
        if (!waiter && owner && owner != current) {
                WRITE_ONCE(console_waiter, true);
                spin = true;
        }
        raw_spin_unlock(&console_owner_lock);

        /*
         * If there is an active printk() writing to the
         * consoles, instead of having it write our data too,
         * see if we can offload that load from the active
         * printer, and do some printing ourselves.
         * Go into a spin only if there isn't already a waiter
         * spinning, and there is an active printer, and
         * that active printer isn't us (recursive printk?).
         */
        if (!spin) {
                printk_safe_exit_irqrestore(flags);
                return 0;
        }

        /* We spin waiting for the owner to release us */
        spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
        /* Owner will clear console_waiter on hand off */
        while (READ_ONCE(console_waiter))
                cpu_relax();
        spin_release(&console_owner_dep_map, _THIS_IP_);

        printk_safe_exit_irqrestore(flags);
        /*
         * The owner passed the console lock to us.
         * Since we did not spin on console lock, annotate
         * this as a trylock. Otherwise lockdep will
         * complain.
         */
        mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);

        /*
         * Update @console_may_schedule for trylock because the previous
         * owner may have been schedulable.
         */
        console_may_schedule = 0;

        return 1;
}

/*
 * Recursion is tracked separately on each CPU. If NMIs are supported, an
 * additional NMI context per CPU is also separately tracked. Until per-CPU
 * is available, a separate "early tracking" is performed.
 *
 * The matching counter is selected by __printk_recursion_counter().
 */
static DEFINE_PER_CPU(u8, printk_count);
/* Used while printk_percpu_data_ready() is still false. */
static u8 printk_count_early;
#ifdef CONFIG_HAVE_NMI
static DEFINE_PER_CPU(u8, printk_count_nmi);
/* NMI counterpart of printk_count_early. */
static u8 printk_count_nmi_early;
#endif

/*
 * Recursion is limited to keep the output sane. printk() should not require
 * more than 1 level of recursion (allowing, for example, printk() to trigger
 * a WARN), but a higher value is used in case some printk-internal errors
 * exist, such as the ringbuffer validation checks failing.
 *
 * printk_enter_irqsave() refuses entry only once the counter is greater
 * than this value.
 */
#define PRINTK_MAX_RECURSION 3

/*
 * Pick the recursion counter that belongs to the caller: the NMI counter
 * when running in NMI context (if NMIs are supported), the regular counter
 * otherwise. In both cases the per-CPU instance is used once per-CPU data
 * is ready, and the global "early" instance before that.
 */
static u8 *__printk_recursion_counter(void)
{
#ifdef CONFIG_HAVE_NMI
        if (in_nmi())
                return printk_percpu_data_ready() ?
                        this_cpu_ptr(&printk_count_nmi) :
                        &printk_count_nmi_early;
#endif
        return printk_percpu_data_ready() ?
                this_cpu_ptr(&printk_count) :
                &printk_count_early;
}

/*
 * Enter recursion tracking. Interrupts are disabled to simplify tracking.
 * The caller must check the boolean return value to see if the recursion is
 * allowed. On failure, the saved interrupt state is restored before the
 * macro evaluates to false (interrupts are not left disabled) and the
 * counter is not incremented, so printk_exit_irqrestore() must not be
 * called.
 *
 * Entry is refused only when the counter is already greater than
 * PRINTK_MAX_RECURSION.
 *
 * @recursion_ptr must be a variable of type (u8 *) and is the same variable
 * that is passed to printk_exit_irqrestore().
 */
#define printk_enter_irqsave(recursion_ptr, flags)        \
({                                                        \
        bool success = true;                                \
                                                        \
        typecheck(u8 *, recursion_ptr);                        \
        local_irq_save(flags);                                \
        (recursion_ptr) = __printk_recursion_counter();        \
        if (*(recursion_ptr) > PRINTK_MAX_RECURSION) {        \
                local_irq_restore(flags);                \
                success = false;                        \
        } else {                                        \
                (*(recursion_ptr))++;                        \
        }                                                \
        success;                                        \
})

/*
 * Exit recursion tracking, restoring interrupts.
 *
 * Decrements the counter that @recursion_ptr points to and then restores
 * the interrupt state saved in @flags. @recursion_ptr must be of type
 * (u8 *), as set up by a successful printk_enter_irqsave().
 */
#define printk_exit_irqrestore(recursion_ptr, flags)        \
        do {                                                \
                typecheck(u8 *, recursion_ptr);                \
                (*(recursion_ptr))--;                        \
                local_irq_restore(flags);                \
        } while (0)

int printk_delay_msec __read_mostly;

/*
 * Slow down message output: apply the boot delay for @level and then, when
 * printk_delay_msec is non-zero, busy-wait that many milliseconds in 1 ms
 * steps, touching the NMI watchdog after every step.
 */
static inline void printk_delay(int level)
{
        int remaining;

        boot_delay_msec(level);

        if (likely(!printk_delay_msec))
                return;

        for (remaining = printk_delay_msec; remaining; remaining--) {
                mdelay(1);
                touch_nmi_watchdog();
        }
}

/*
 * Identify the caller of printk(): the PID of the current task when called
 * from task context, otherwise the CPU number with bit 31 set.
 */
static inline u32 printk_caller_id(void)
{
        if (in_task())
                return task_pid_nr(current);

        return 0x80000000 + smp_processor_id();
}

/**
 * printk_parse_prefix - Parse level and control flags.
 *
 * @text:     The terminated text message.
 * @level:    A pointer to the current level value, will be updated.
 * @flags:    A pointer to the current printk_info flags, will be updated.
 *
 * Consumes consecutive two-byte prefixes from the start of @text for as
 * long as printk_get_level() recognizes one.
 *
 * @level may be NULL if the caller is not interested in the parsed value.
 * Otherwise the variable pointed to by @level must be set to
 * LOGLEVEL_DEFAULT in order to be updated with the parsed value.
 *
 * @flags may be NULL if the caller is not interested in the parsed value.
 * Otherwise the variable pointed to by @flags will be OR'd with the parsed
 * value.
 *
 * Return: The length of the parsed level and control flags.
 */
u16 printk_parse_prefix(const char *text, int *level,
                        enum printk_info_flags *flags)
{
        u16 consumed = 0;
        int code;

        for (; *text; text += 2, consumed += 2) {
                code = printk_get_level(text);
                if (!code)
                        break;

                if (code >= '0' && code <= '7') {
                        /* Only the first level found overrides the default. */
                        if (level && *level == LOGLEVEL_DEFAULT)
                                *level = code - '0';
                } else if (code == 'c') {
                        /* KERN_CONT */
                        if (flags)
                                *flags |= LOG_CONT;
                }
        }

        return consumed;
}

/*
 * Format the message into @text (at most @size bytes including the
 * terminator) and post-process it in place:
 *
 * - a trailing newline is dropped from the length and recorded in @flags
 *   as LOG_NEWLINE;
 * - for facility 0 the leading level/control prefix is removed.
 *
 * The resulting text is passed to trace_console().
 *
 * Return: the length of the final text.
 */
__printf(5, 0)
static u16 printk_sprint(char *text, u16 size, int facility,
                         enum printk_info_flags *flags, const char *fmt,
                         va_list args)
{
        u16 len = vscnprintf(text, size, fmt, args);
        u16 skip;

        /* Mark and strip a trailing newline. */
        if (len > 0 && text[len - 1] == '\n') {
                *flags |= LOG_NEWLINE;
                len--;
        }

        /* Strip log level and control flags. */
        if (facility == 0) {
                skip = printk_parse_prefix(text, NULL, NULL);
                if (skip != 0) {
                        len -= skip;
                        memmove(text, text + skip, len);
                }
        }

        trace_console(text, len);

        return len;
}

/*
 * Format a message and store it as a record in the printk ringbuffer @prb.
 *
 * @facility: stored in the record; for facility 0 the level/control prefix
 *            embedded in the message is parsed and stripped.
 * @level:    log level; LOGLEVEL_DEFAULT is replaced by the parsed level or
 *            default_message_loglevel.
 * @dev_info: optional device information copied into the record; when set,
 *            the record is always treated as a complete line.
 * @fmt/@args: printf-style message.
 *
 * A message carrying LOG_CONT is first tried to be appended to the last
 * record of the same caller; otherwise (or if that fails) a new record is
 * reserved, truncating the message if the first reservation fails.
 *
 * Return: the number of text bytes stored (including the truncation marker,
 * if any), or 0 if the recursion limit was hit or no space could be
 * reserved.
 */
__printf(4, 0)
int vprintk_store(int facility, int level,
                  const struct dev_printk_info *dev_info,
                  const char *fmt, va_list args)
{
        struct prb_reserved_entry e;
        enum printk_info_flags flags = 0;
        struct printk_record r;
        unsigned long irqflags;
        u16 trunc_msg_len = 0;
        char prefix_buf[8];
        u8 *recursion_ptr;
        u16 reserve_size;
        va_list args2;
        u32 caller_id;
        u16 text_len;
        int ret = 0;
        u64 ts_nsec;

        /* Interrupts stay disabled until printk_exit_irqrestore() at "out". */
        if (!printk_enter_irqsave(recursion_ptr, irqflags))
                return 0;

        /*
         * Since the duration of printk() can vary depending on the message
         * and state of the ringbuffer, grab the timestamp now so that it is
         * close to the call of printk(). This provides a more deterministic
         * timestamp with respect to the caller.
         */
        ts_nsec = local_clock();

        caller_id = printk_caller_id();

        /*
         * The sprintf needs to come first since the syslog prefix might be
         * passed in as a parameter. An extra byte must be reserved so that
         * later the vscnprintf() into the reserved buffer has room for the
         * terminating '\0', which is not counted by vsnprintf().
         *
         * Only the first bytes land in prefix_buf; the return value of
         * vsnprintf() is what sizes the reservation. A copy of @args is
         * used so that @args is still intact for the real formatting.
         */
        va_copy(args2, args);
        reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
        va_end(args2);

        if (reserve_size > PRINTKRB_RECORD_MAX)
                reserve_size = PRINTKRB_RECORD_MAX;

        /* Extract log level or control flags. */
        if (facility == 0)
                printk_parse_prefix(&prefix_buf[0], &level, &flags);

        if (level == LOGLEVEL_DEFAULT)
                level = default_message_loglevel;

        if (dev_info)
                flags |= LOG_NEWLINE;

        if (is_printk_force_console())
                flags |= LOG_FORCE_CON;

        /* Continuation: try to extend the last record of this caller. */
        if (flags & LOG_CONT) {
                prb_rec_init_wr(&r, reserve_size);
                if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
                        text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
                                                 facility, &flags, fmt, args);
                        r.info->text_len += text_len;

                        if (flags & LOG_FORCE_CON)
                                r.info->flags |= LOG_FORCE_CON;

                        /* A newline completes the record for good. */
                        if (flags & LOG_NEWLINE) {
                                r.info->flags |= LOG_NEWLINE;
                                prb_final_commit(&e);
                        } else {
                                prb_commit(&e);
                        }

                        ret = text_len;
                        goto out;
                }
        }

        /*
         * Explicitly initialize the record before every prb_reserve() call.
         * prb_reserve_in_last() and prb_reserve() purposely invalidate the
         * structure when they fail.
         */
        prb_rec_init_wr(&r, reserve_size);
        if (!prb_reserve(&e, prb, &r)) {
                /* truncate the message if it is too long for empty buffer */
                truncate_msg(&reserve_size, &trunc_msg_len);

                prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
                if (!prb_reserve(&e, prb, &r))
                        goto out;
        }

        /* fill message */
        text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
        /* Append the truncation marker right after the formatted text. */
        if (trunc_msg_len)
                memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
        r.info->text_len = text_len + trunc_msg_len;
        r.info->facility = facility;
        r.info->level = level & 7;
        r.info->flags = flags & 0x1f;
        r.info->ts_nsec = ts_nsec;
        r.info->caller_id = caller_id;
        if (dev_info)
                memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));

        /* A message without a trailing newline can be continued. */
        if (!(flags & LOG_NEWLINE))
                prb_commit(&e);
        else
                prb_final_commit(&e);

        ret = text_len + trunc_msg_len;
out:
        printk_exit_irqrestore(recursion_ptr, irqflags);
        return ret;
}

/*
 * One-way switch: from now on legacy consoles may print from the printk()
 * caller context on a panic CPU. If the current flush type asks for direct
 * legacy printing, an immediate flush is attempted by taking and releasing
 * the console lock.
 */
void printk_legacy_allow_panic_sync(void)
{
        struct console_flush_type ft;

        legacy_allow_panic_sync = true;

        printk_get_console_flush_type(&ft);
        if (!ft.legacy_direct)
                return;

        /* console_unlock() does the flushing; skip if the lock is busy. */
        if (console_trylock())
                console_unlock();
}

/*
 * When set, vprintk_emit() does not drop messages from non-panic CPUs
 * while another CPU is in panic().
 */
bool __read_mostly debug_non_panic_cpus;

#ifdef CONFIG_PRINTK_CALLER
/*
 * Handler for the "debug_non_panic_cpus" early parameter. The argument
 * string is ignored; the mere presence of the parameter enables the flag.
 */
static int __init debug_non_panic_cpus_setup(char *str)
{
        debug_non_panic_cpus = true;
        pr_info("allow messages from non-panic CPUs in panic()\n");

        return 0;
}
early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup);
/* The flag is also exposed as a module parameter with mode 0644. */
module_param(debug_non_panic_cpus, bool, 0644);
MODULE_PARM_DESC(debug_non_panic_cpus,
                 "allow messages from non-panic CPUs in panic()");
#endif

/*
 * Store a message in the ringbuffer and trigger console output.
 *
 * The message is stored via vprintk_store(). How the consoles get flushed
 * afterwards is decided by printk_get_console_flush_type(): atomic nbcon
 * flush, waking the nbcon kthreads, direct legacy printing from this
 * context, and/or deferred legacy printing.
 *
 * @level may be LOGLEVEL_SCHED, in which case the message is stored with
 * LOGLEVEL_DEFAULT and direct legacy printing is turned into deferred
 * printing.
 *
 * Return: the value returned by vprintk_store(), or 0 if the message was
 * suppressed before being stored.
 */
asmlinkage int vprintk_emit(int facility, int level,
                            const struct dev_printk_info *dev_info,
                            const char *fmt, va_list args)
{
        struct console_flush_type ft;
        int printed_len;

        /* Suppress unimportant messages after panic happens */
        if (unlikely(suppress_printk))
                return 0;

        /*
         * The messages on the panic CPU are the most important. If
         * non-panic CPUs are generating any messages, they will be
         * silently dropped.
         */
        if (panic_on_other_cpu() &&
            !debug_non_panic_cpus &&
            !panic_triggering_all_cpu_backtrace)
                return 0;

        printk_get_console_flush_type(&ft);

        /* If called from the scheduler, we can not call up(). */
        if (level == LOGLEVEL_SCHED) {
                level = LOGLEVEL_DEFAULT;
                ft.legacy_offload |= ft.legacy_direct;
                ft.legacy_direct = false;
        }

        printk_delay(level);

        printed_len = vprintk_store(facility, level, dev_info, fmt, args);

        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();

        if (ft.nbcon_offload)
                nbcon_kthreads_wake();

        if (ft.legacy_direct) {
                /*
                 * The caller may be holding system-critical or
                 * timing-sensitive locks. Disable preemption during
                 * printing of all remaining records to all consoles so that
                 * this context can return as soon as possible. Hopefully
                 * another printk() caller will take over the printing.
                 */
                preempt_disable();
                /*
                 * Try to acquire and then immediately release the console
                 * semaphore. The release will print out buffers. With the
                 * spinning variant, this context tries to take over the
                 * printing from another printing context.
                 */
                if (console_trylock_spinning())
                        console_unlock();
                preempt_enable();
        }

        /* Without deferred legacy output, only the log readers are woken. */
        if (ft.legacy_offload)
                defer_console_output();
        else
                wake_up_klogd();

        return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);

/*
 * Emit a message with facility 0, LOGLEVEL_DEFAULT and no device
 * information. Returns whatever vprintk_emit() returns.
 */
int vprintk_default(const char *fmt, va_list args)
{
        return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);

/*
 * Variadic front end: collects the arguments into a va_list and forwards
 * them to vprintk(), returning its result.
 */
asmlinkage __visible int _printk(const char *fmt, ...)
{
        va_list ap;
        int printed;

        va_start(ap, fmt);
        printed = vprintk(fmt, ap);
        va_end(ap);

        return printed;
}
EXPORT_SYMBOL(_printk);

static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);

#else /* CONFIG_PRINTK */

/*
 * Replacement definitions for the !CONFIG_PRINTK case: timestamps are off,
 * the ringbuffer accessors report that nothing is available, and
 * __pr_flush() reports success without doing anything.
 */
#define printk_time                false

#define prb_read_valid(rb, seq, r)        false
#define prb_first_valid_seq(rb)                0
#define prb_next_seq(rb)                0

static u64 syslog_seq;

static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }

#endif /* CONFIG_PRINTK */

#ifdef CONFIG_EARLY_PRINTK
struct console *early_console;

/*
 * Format a message into an on-stack buffer and pass it straight to the
 * write() callback of @early_console. Nothing happens while no early
 * console is set. Output that does not fit into the buffer is truncated.
 */
asmlinkage __visible void early_printk(const char *fmt, ...)
{
        char msg[512];
        va_list args;
        int len;

        if (early_console == NULL)
                return;

        va_start(args, fmt);
        len = vscnprintf(msg, sizeof(msg), fmt, args);
        va_end(args);

        early_console->write(early_console, msg, len);
}
#endif

/*
 * Record that console entry @c came from the command line. This only ever
 * sets the markers; a false @user_specified leaves existing state alone.
 */
static void set_user_specified(struct console_cmdline *c, bool user_specified)
{
        if (user_specified) {
                /*
                 * @c console was defined by the user on the command line.
                 * Do not clear when added twice also by SPCR or the device
                 * tree.
                 */
                c->user_specified = true;

                /* At least one console defined by the user on the command line. */
                console_set_on_cmdline = 1;
        }
}

/*
 * Add an entry to, or refresh an entry in, the console_cmdline[] table.
 *
 * @name:           console driver name, may be NULL when @devname is given
 * @idx:            console index; must not be negative when @name is given
 * @devname:        DEVNAME:0.0 style device name, may be NULL when @name
 *                  is given
 * @options:        option string stored in a newly created entry
 * @brl_options:    braille options; when set, the entry does not become
 *                  the preferred console
 * @user_specified: true when the console was defined on the command line
 *
 * An existing entry matches either by (@name, @idx) or by @devname. For a
 * match only preferred_console and the user-specified markers are updated.
 *
 * Return: 0 on success, -EINVAL for invalid arguments, -E2BIG when the
 * table is full.
 */
static int __add_preferred_console(const char *name, const short idx,
                                   const char *devname, char *options,
                                   char *brl_options, bool user_specified)
{
        struct console_cmdline *c;
        int i;

        if (!name && !devname)
                return -EINVAL;

        /*
         * We use a signed short index for struct console for device drivers to
         * indicate a not yet assigned index or port. However, a negative index
         * value is not valid when the console name and index are defined on
         * the command line.
         */
        if (name && idx < 0)
                return -EINVAL;

        /*
         *        See if this tty is not yet registered, and
         *        if we have a slot free.
         *
         *        The walk stops at the first unused slot, i.e. one with
         *        neither a name nor a devname.
         */
        for (i = 0, c = console_cmdline;
             i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
             i++, c++) {
                if ((name && strcmp(c->name, name) == 0 && c->index == idx) ||
                    (devname && strcmp(c->devname, devname) == 0)) {
                        if (!brl_options)
                                preferred_console = i;
                        set_user_specified(c, user_specified);
                        return 0;
                }
        }
        if (i == MAX_CMDLINECONSOLES)
                return -E2BIG;
        /* @c now points to the first unused slot; fill it in. */
        if (!brl_options)
                preferred_console = i;
        if (name)
                strscpy(c->name, name);
        if (devname)
                strscpy(c->devname, devname);
        c->options = options;
        set_user_specified(c, user_specified);
        braille_set_options(c, brl_options);

        c->index = idx;
        return 0;
}

/*
 * Handler for the "console_msg_format=" boot parameter. Accepts "syslog"
 * and "default"; any other value leaves console_msg_format untouched.
 * Always returns 1.
 */
static int __init console_msg_format_setup(char *str)
{
        if (strcmp(str, "syslog") == 0)
                console_msg_format = MSG_FORMAT_SYSLOG;
        else if (strcmp(str, "default") == 0)
                console_msg_format = MSG_FORMAT_DEFAULT;

        return 1;
}
__setup("console_msg_format=", console_msg_format_setup);

/*
 * Set up a console.  Called via do_early_param() in init/main.c
 * for each "console=" parameter in the boot command line.
 *
 * @str is split in place into the console name (or DEVNAME:0.0 style
 * device name), the index and the options, and the result is passed to
 * __add_preferred_console() as a user-specified console. Always returns 1;
 * the result of __add_preferred_console() is not checked.
 */
static int __init console_setup(char *str)
{
        /* buf must be able to hold a name plus the 4-byte "ttyS" prefix. */
        static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4);
        char buf[sizeof(console_cmdline[0].devname)];
        char *brl_options = NULL;
        char *ttyname = NULL;
        char *devname = NULL;
        char *options;
        char *s;
        int idx;

        /*
         * console="" or console=null have been suggested as a way to
         * disable console output. Use ttynull that has been created
         * for exactly this purpose.
         */
        if (str[0] == 0 || strcmp(str, "null") == 0) {
                __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true);
                return 1;
        }

        if (_braille_console_setup(&str, &brl_options))
                return 1;

        /* For a DEVNAME:0.0 style console the character device is unknown early */
        if (strchr(str, ':'))
                devname = buf;
        else
                ttyname = buf;

        /*
         * Decode str into name, index, options.
         *
         * A tty name starting with a digit is shorthand for "ttyS<digits>".
         */
        if (ttyname && isdigit(str[0]))
                scnprintf(buf, sizeof(buf), "ttyS%s", str);
        else
                strscpy(buf, str);

        /*
         * Terminate @str at the first comma; @options points behind it.
         * buf was filled before the split, so it may still contain the
         * comma and the options. The scan below stops at the comma.
         */
        options = strchr(str, ',');
        if (options)
                *(options++) = 0;

#ifdef __sparc__
        if (!strcmp(str, "ttya"))
                strscpy(buf, "ttyS0");
        if (!strcmp(str, "ttyb"))
                strscpy(buf, "ttyS1");
#endif

        /*
         * Find the end of the name: the first digit for a tty name, or the
         * comma (or end of string) in either case.
         */
        for (s = buf; *s; s++)
                if ((ttyname && isdigit(*s)) || *s == ',')
                        break;

        /* @idx will get defined when devname matches. */
        if (devname)
                idx = -1;
        else
                idx = simple_strtoul(s, NULL, 10);

        /* Cut the index/options off so that buf holds just the name. */
        *s = 0;

        __add_preferred_console(ttyname, idx, devname, options, brl_options, true);
        return 1;
}
__setup("console=", console_setup);

/**
 * add_preferred_console - add a device to the list of preferred consoles.
 * @name: device name
 * @idx: device index
 * @options: options for this console
 *
 * The last preferred console added will be used for kernel messages
 * and stdin/out/err for init.  Normally this is used by console_setup
 * above to handle user-supplied console arguments; however it can also
 * be used by arch-specific code either to override the user or more
 * commonly to provide a default console (ie from PROM variables) when
 * the user has not supplied one.
 *
 * The console is added without a device name, without braille options
 * and not marked as user specified.
 *
 * Return: 0 on success, -EINVAL if @name is NULL or @idx is negative,
 * -E2BIG if there is no free slot left.
 */
int add_preferred_console(const char *name, const short idx, char *options)
{
        return __add_preferred_console(name, idx, NULL, options, NULL, false);
}

/**
 * match_devname_and_update_preferred_console - Update a preferred console
 *        when matching devname is found.
 * @devname: DEVNAME:0.0 style device name
 * @name: Name of the corresponding console driver, e.g. "ttyS"
 * @idx: Console index, e.g. port number.
 *
 * The function checks whether a device with the given @devname is
 * preferred via the console=DEVNAME:0.0 command line option.
 * It fills the missing console driver name and console index
 * so that a later register_console() call could find (match)
 * and enable this device.
 *
 * It might be used when a driver subsystem initializes particular
 * devices with already known DEVNAME:0.0 style names. And it
 * could predict which console driver name and index this device
 * would later get associated with.
 *
 * Return: 0 on success, -EINVAL when an argument is missing, empty or
 * negative, -ENOENT when no preferred console uses @devname.
 */
int match_devname_and_update_preferred_console(const char *devname,
                                               const char *name,
                                               const short idx)
{
        struct console_cmdline *entry;
        int slot;

        if (!devname || !devname[0] || !name || !name[0] || idx < 0)
                return -EINVAL;

        for (slot = 0; slot < MAX_CMDLINECONSOLES; slot++) {
                entry = &console_cmdline[slot];

                /* The first unused slot ends the list. */
                if (!entry->name[0] && !entry->devname[0])
                        break;

                if (strcmp(devname, entry->devname) != 0)
                        continue;

                pr_info("associate the preferred console \"%s\" with \"%s%d\"\n",
                        devname, name, idx);
                strscpy(entry->name, name);
                entry->index = idx;
                return 0;
        }

        return -ENOENT;
}
EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console);

/*
 * Whether console_suspend_all()/console_resume_all() actually suspend and
 * resume the consoles. Enabled by default.
 */
bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);

/*
 * Handler for the "no_console_suspend" boot parameter: turns console
 * suspending off. The argument string is ignored.
 */
static int __init console_suspend_disable(char *str)
{
        console_suspend_enabled = false;
        return 1;
}
__setup("no_console_suspend", console_suspend_disable);
/* The same flag, exposed as the "console_suspend" module parameter. */
module_param_named(console_suspend, console_suspend_enabled,
                bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
        " and hibernate operations");

/* When set, console_verbose() leaves console_loglevel alone. */
static bool printk_console_no_auto_verbose;

/*
 * Raise console_loglevel to CONSOLE_LOGLEVEL_MOTORMOUTH, unless the
 * loglevel is currently 0 or the automatic raise has been disabled via
 * the console_no_auto_verbose parameter.
 */
void console_verbose(void)
{
        if (!console_loglevel || printk_console_no_auto_verbose)
                return;

        console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
}
EXPORT_SYMBOL_GPL(console_verbose);

module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");

/**
 * console_suspend_all - suspend the console subsystem
 *
 * This disables printk() while we go into suspend states
 *
 * Does nothing when console suspending is disabled
 * (see @console_suspend_enabled). Otherwise pending messages are flushed
 * and every registered console gets the CON_SUSPENDED flag.
 */
void console_suspend_all(void)
{
        struct console *con;

        if (!console_suspend_enabled)
                return;
        pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
        /* Give the message above, and anything before it, a chance to get out. */
        pr_flush(1000, true);

        console_list_lock();
        for_each_console(con)
                console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. All printing
         * contexts must be able to see that they are suspended so that it
         * is guaranteed that all printing has stopped when this function
         * completes.
         */
        synchronize_srcu(&console_srcu);
}

/*
 * Counterpart of console_suspend_all(): clear CON_SUSPENDED on every
 * registered console, kick the offloaded printing contexts and flush the
 * pending messages. Does nothing when console suspending is disabled.
 */
void console_resume_all(void)
{
        struct console_flush_type ft;
        struct console *con;

        if (!console_suspend_enabled)
                return;

        console_list_lock();
        for_each_console(con)
                console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. All printing
         * contexts must be able to see they are no longer suspended so
         * that they are guaranteed to wake up and resume printing.
         */
        synchronize_srcu(&console_srcu);

        printk_get_console_flush_type(&ft);
        if (ft.nbcon_offload)
                nbcon_kthreads_wake();
        if (ft.legacy_offload)
                defer_console_output();

        pr_flush(1000, true);
}

/**
 * console_cpu_notify - print deferred console messages after CPU hotplug
 * @cpu: unused
 *
 * If printk() is called from a CPU that is not online yet, the messages
 * will be printed on the console only if there are CON_ANYTIME consoles.
 * This function is called when a new CPU comes online (or fails to come
 * up) or goes offline.
 *
 * Nothing is flushed while cpuhp_tasks_frozen is set.
 *
 * Return: always 0.
 */
static int console_cpu_notify(unsigned int cpu)
{
        struct console_flush_type ft;

        if (cpuhp_tasks_frozen)
                return 0;

        printk_get_console_flush_type(&ft);

        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();

        /* Taking and releasing the console lock flushes legacy consoles. */
        if (ft.legacy_direct && console_trylock())
                console_unlock();

        return 0;
}

/**
 * console_lock - block the console subsystem from printing
 *
 * Acquires a lock which guarantees that no consoles will
 * be in or enter their write() callback.
 *
 * Can sleep, returns nothing.
 */
void console_lock(void)
{
        might_sleep();

        /*
         * On panic, the console_lock must be left to the panic cpu.
         * Re-check in 1000 ms steps instead of competing for the lock.
         */
        while (panic_on_other_cpu())
                msleep(1000);

        down_console_sem();
        console_locked = 1;
        /* Unlike console_trylock(), this path sets console_may_schedule. */
        console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);

/**
 * console_trylock - try to block the console subsystem from printing
 *
 * Try to acquire a lock which guarantees that no consoles will
 * be in or enter their write() callback.
 *
 * The attempt fails right away while another CPU is in panic, because
 * the console_lock must be left to the panic CPU.
 *
 * returns 1 on success, and 0 on failure to acquire the lock.
 */
int console_trylock(void)
{
        if (panic_on_other_cpu() || down_trylock_console_sem())
                return 0;

        console_locked = 1;
        /* A trylock owner is never marked as schedulable. */
        console_may_schedule = 0;

        return 1;
}
EXPORT_SYMBOL(console_trylock);

/*
 * Report the console_locked flag, which console_lock() and
 * console_trylock() set to 1 and __console_unlock() resets to 0.
 */
int is_console_locked(void)
{
        return console_locked;
}
EXPORT_SYMBOL(is_console_locked);

/*
 * Release the console lock: clear the console_locked flag first, then
 * release the console semaphore.
 */
static void __console_unlock(void)
{
        console_locked = 0;
        up_console_sem();
}

#ifdef CONFIG_PRINTK

/*
 * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting
 * the existing message over and inserting the scratchbuf message.
 *
 * @pmsg is the original printk message.
 * @fmt is the printf format of the message which will prepend the existing one.
 *
 * The text to prepend is formatted into @pmsg->pbufs->scratchbuf, so any
 * previous content of the scratch buffer is overwritten.
 *
 * If there is not enough space in @pmsg->pbufs->outbuf, the existing
 * message text will be sufficiently truncated.
 *
 * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
 */
__printf(2, 3)
static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...)
{
        struct printk_buffers *pbufs = pmsg->pbufs;
        const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
        const size_t outbuf_sz = sizeof(pbufs->outbuf);
        char *scratchbuf = &pbufs->scratchbuf[0];
        char *outbuf = &pbufs->outbuf[0];
        va_list args;
        size_t len;

        va_start(args, fmt);
        len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args);
        va_end(args);

        /*
         * Make sure outbuf is sufficiently large before prepending.
         * Keep at least the prefix when the message must be truncated.
         * It is a rather theoretical problem when someone tries to
         * use a minimalist buffer.
         */
        if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz))
                return;

        if (pmsg->outbuf_len + len >= outbuf_sz) {
                /* Truncate the message, but keep it terminated. */
                pmsg->outbuf_len = outbuf_sz - (len + 1);
                outbuf[pmsg->outbuf_len] = 0;
        }

        /* Shift the existing text including its terminator (hence the +1). */
        memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1);
        /* Copy the prepended text without its terminator. */
        memcpy(outbuf, scratchbuf, len);
        pmsg->outbuf_len += len;
}

/*
 * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message".
 * @pmsg->outbuf_len is updated appropriately.
 *
 * @pmsg is the printk message to prepend.
 *
 * @dropped is the dropped count to report in the dropped message.
 *
 * The inserted text is a line of its own, terminated by a newline.
 */
void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
        console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped);
}

/*
 * Prepend the message in @pmsg->pbufs->outbuf with a "replay message".
 * @pmsg->outbuf_len is updated appropriately.
 *
 * @pmsg is the printk message to prepend.
 *
 * The inserted text is a line of its own, terminated by a newline.
 */
void console_prepend_replay(struct printk_message *pmsg)
{
        console_prepend_message(pmsg, "** replaying previous printk message **\n");
}

/*
 * Read and format the specified record (or a later record if the specified
 * record is not available).
 *
 * @pmsg will contain the formatted result. @pmsg->pbufs must point to a
 * struct printk_buffers.
 *
 * @seq is the record to read and format. If it is not available, the next
 * valid record is read.
 *
 * @is_extended specifies if the message should be formatted for extended
 * console output.
 *
 * @may_supress specifies if records may be skipped based on loglevel.
 *
 * Returns false if no record is available. Otherwise true and all fields
 * of @pmsg are valid. (See the documentation of struct printk_message
 * for information about the @pmsg fields.)
 */
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
                             bool is_extended, bool may_suppress)
{
        struct printk_buffers *bufs = pmsg->pbufs;
        char *out = &bufs->outbuf[0];
        const size_t out_sz = sizeof(bufs->outbuf);
        struct printk_info info;
        struct printk_record rec;
        size_t out_len = 0;

        /*
         * Extended output is produced from a separate source buffer, so the
         * ringbuffer text is staged in the scratch buffer. Normal output is
         * formatted in place, so the ringbuffer text goes straight into the
         * output buffer.
         */
        if (is_extended)
                prb_rec_init_rd(&rec, &info, &bufs->scratchbuf[0], sizeof(bufs->scratchbuf));
        else
                prb_rec_init_rd(&rec, &info, out, out_sz);

        if (!prb_read_valid(prb, seq, &rec))
                return false;

        /* The record actually read may be later than the one requested. */
        pmsg->seq = rec.info->seq;
        pmsg->dropped = rec.info->seq - seq;

        /*
         * Format the record unless it may be suppressed: that is, unless it
         * is not forced onto consoles, suppression is permitted and its
         * level is above the console loglevel. A suppressed record is
         * reported with an output length of 0.
         */
        if ((rec.info->flags & LOG_FORCE_CON) || !may_suppress ||
            !suppress_message_printing(rec.info->level)) {
                if (is_extended) {
                        out_len = info_print_ext_header(out, out_sz, rec.info);
                        out_len += msg_print_ext_body(out + out_len, out_sz - out_len,
                                                      &rec.text_buf[0], rec.info->text_len,
                                                      &rec.info->dev_info);
                } else {
                        out_len = record_print_text(&rec,
                                                    console_msg_format & MSG_FORMAT_SYSLOG,
                                                    printk_time);
                }
        }

        pmsg->outbuf_len = out_len;
        return true;
}

/*
 * Legacy console printing from printk() caller context does not respect
 * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a
 * false positive. For PREEMPT_RT the false positive condition does not
 * occur.
 *
 * This map is used to temporarily establish LD_WAIT_SLEEP context for the
 * console write() callback when legacy printing to avoid false positive
 * lockdep complaints, thus allowing lockdep to continue to function for
 * real issues.
 */
#ifdef CONFIG_PREEMPT_RT
/* With CONFIG_PREEMPT_RT no override map is defined and both helpers are empty. */
static inline void printk_legacy_allow_spinlock_enter(void) { }
static inline void printk_legacy_allow_spinlock_exit(void) { }
#else
/* Wait-type override map, declared with LD_WAIT_SLEEP. */
static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP);

/* Acquire the override map; paired with printk_legacy_allow_spinlock_exit(). */
static inline void printk_legacy_allow_spinlock_enter(void)
{
        lock_map_acquire_try(&printk_legacy_map);
}

/* Release the override map taken by printk_legacy_allow_spinlock_enter(). */
static inline void printk_legacy_allow_spinlock_exit(void)
{
        lock_map_release(&printk_legacy_map);
}
#endif /* CONFIG_PREEMPT_RT */

/*
 * Used as the printk buffers for non-panic, serialized console printing.
 * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
 * Its usage requires the console_lock held.
 *
 * console_emit_next_record() formats each record into these buffers and
 * passes the output buffer to the console write() callback.
 */
struct printk_buffers printk_shared_pbufs;

/*
 * Print one record for the given console. The record printed is whatever
 * record is the next available record for the given console.
 *
 * @handover will be set to true if a printk waiter has taken over the
 * console_lock, in which case the caller is no longer holding both the
 * console_lock and the SRCU read lock. Otherwise it is set to false.
 *
 * @cookie is the cookie from the SRCU read lock.
 *
 * Returns false if the given console has no next record to print, otherwise
 * true.
 *
 * Requires the console_lock and the SRCU read lock.
 */
static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
        struct printk_message pmsg = {
                .pbufs = &printk_shared_pbufs,
        };
        bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
        char *text = &printk_shared_pbufs.outbuf[0];
        unsigned long irq_flags;

        *handover = false;

        if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
                return false;

        con->dropped += pmsg.dropped;

        /* Nothing was formatted for this record: only advance the console. */
        if (!pmsg.outbuf_len) {
                con->seq = pmsg.seq + 1;
                return true;
        }

        /* Non-extended consoles are told how many records were lost. */
        if (!is_extended && con->dropped) {
                console_prepend_dropped(&pmsg, con->dropped);
                con->dropped = 0;
        }

        /* From here on the text is written out to the hardware. */

        if (force_legacy_kthread() && !panic_in_progress()) {
                /*
                 * Forced threading means this runs in task context (the
                 * legacy kthread or get_init_console_seq()), so printk
                 * reentrance, handovers and lockdep complaints are of no
                 * concern here.
                 */
                con->write(con, text, pmsg.outbuf_len);
                con->seq = pmsg.seq + 1;
                return true;
        }

        /*
         * A printk() on another CPU may be waiting for this printer to
         * finish so that it can take over. This context must not be
         * preempted while such a waiter exists.
         *
         * Interrupts stay disabled because the hand over to a waiter must
         * not be interrupted before it is complete (@console_waiter is
         * cleared).
         */
        printk_safe_enter_irqsave(irq_flags);
        console_lock_spinning_enable();

        /* Print latency is not to be traced. */
        stop_critical_timings();

        printk_legacy_allow_spinlock_enter();
        con->write(con, text, pmsg.outbuf_len);
        printk_legacy_allow_spinlock_exit();

        start_critical_timings();

        con->seq = pmsg.seq + 1;

        *handover = console_lock_spinning_disable_and_check(cookie);
        printk_safe_exit_irqrestore(irq_flags);

        return true;
}

#else

/*
 * Stub for the #else branch of the conditional closed by
 * "#endif CONFIG_PRINTK" below: reports no handover and no record.
 */
static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
        *handover = false;
        return false;
}

/* Stub for the same #else branch: intentionally empty. */
static inline void printk_kthreads_check_locked(void) { }

#endif /* CONFIG_PRINTK */

/*
 * Print out all remaining records to all consoles.
 *
 * @do_cond_resched is set by the caller. It can be true only in schedulable
 * context.
 *
 * @next_seq is set to the sequence number after the last available record.
 * The value is valid only when this function returns true. It means that all
 * usable consoles are completely flushed.
 *
 * @handover will be set to true if a printk waiter has taken over the
 * console_lock, in which case the caller is no longer holding the
 * console_lock. Otherwise it is set to false.
 *
 * Returns true when there was at least one usable console and all messages
 * were flushed to all usable consoles. A returned false informs the caller
 * that everything was not flushed (either there were no usable consoles or
 * another context has taken over printing or it is a panic situation and this
 * is not the panic CPU). Regardless the reason, the caller should assume it
 * is not useful to immediately try again.
 *
 * Requires the console_lock.
 */
static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
{
        struct console_flush_type ft;
        bool any_usable = false;
        struct console *con;
        bool any_progress;
        int cookie;

        *next_seq = 0;
        *handover = false;

        /*
         * Each pass makes one emit call per usable console. Passes repeat
         * until a complete pass makes no progress on any console.
         */
        do {
                any_progress = false;

                /* The flush type is re-read at the start of every pass. */
                printk_get_console_flush_type(&ft);

                cookie = console_srcu_read_lock();
                for_each_console_srcu(con) {
                        short flags = console_srcu_read_flags(con);
                        u64 printk_seq;
                        bool progress;

                        /*
                         * console_flush_all() is only responsible for nbcon
                         * consoles when the nbcon consoles cannot print via
                         * their atomic or threaded flushing.
                         */
                        if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
                                continue;

                        if (!console_is_usable(con, flags, !do_cond_resched))
                                continue;
                        /* Sticky across passes: it becomes the return value. */
                        any_usable = true;

                        if (flags & CON_NBCON) {
                                progress = nbcon_legacy_emit_next_record(con, handover, cookie,
                                                                         !do_cond_resched);
                                printk_seq = nbcon_seq_read(con);
                        } else {
                                progress = console_emit_next_record(con, handover, cookie);
                                printk_seq = con->seq;
                        }

                        /*
                         * If a handover has occurred, the SRCU read lock
                         * is already released.
                         */
                        if (*handover)
                                return false;

                        /* Track the next of the highest seq flushed. */
                        if (printk_seq > *next_seq)
                                *next_seq = printk_seq;

                        if (!progress)
                                continue;
                        any_progress = true;

                        /* Allow panic_cpu to take over the consoles safely. */
                        if (panic_on_other_cpu())
                                goto abandon;

                        if (do_cond_resched)
                                cond_resched();
                }
                console_srcu_read_unlock(cookie);
        } while (any_progress);

        return any_usable;

abandon:
        /* Unlike the handover exit, this path still holds the SRCU read lock. */
        console_srcu_read_unlock(cookie);
        return false;
}

/*
 * Flush the consoles via console_flush_all() and drop the console_lock.
 *
 * The lock is released with __console_unlock() after each flush attempt,
 * except when a handover was reported. If records are still readable
 * afterwards, the lock is re-taken with console_trylock() and the flush is
 * repeated.
 */
static void __console_flush_and_unlock(void)
{
        bool do_cond_resched;
        bool handover;
        bool flushed;
        u64 next_seq;

        /*
         * Console drivers are called with interrupts disabled, so
         * @console_may_schedule should be cleared before; however, we may
         * end up dumping a lot of lines, for example, if called from
         * console registration path, and should invoke cond_resched()
         * between lines if allowable.  Not doing so can cause a very long
         * scheduling stall on a slow console leading to RCU stall and
         * softlockup warnings which exacerbate the issue with more
         * messages practically incapacitating the system. Therefore, create
         * a local to use for the printing loop.
         */
        do_cond_resched = console_may_schedule;

        do {
                /* Cleared on every iteration, including after a successful trylock. */
                console_may_schedule = 0;

                flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
                if (!handover)
                        __console_unlock();

                /*
                 * Abort if there was a failure to flush all messages to all
                 * usable consoles. Either it is not possible to flush (in
                 * which case it would be an infinite loop of retrying) or
                 * another context has taken over printing.
                 */
                if (!flushed)
                        break;

                /*
                 * Some context may have added new records after
                 * console_flush_all() but before unlocking the console.
                 * Re-check if there is a new record to flush. If the trylock
                 * fails, another context is already handling the printing.
                 */
        } while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
}

/**
 * console_unlock - unblock the legacy console subsystem from printing
 *
 * Releases the console_lock which the caller holds to block printing of
 * the legacy console subsystem.
 *
 * While the console_lock was held, console output may have been buffered
 * by printk(). If this is the case, console_unlock() emits the output on
 * legacy consoles prior to releasing the lock.
 *
 * console_unlock(); may be called from any context.
 */
void console_unlock(void)
{
        struct console_flush_type ft;

        printk_get_console_flush_type(&ft);

        /* Without direct legacy flushing, only the lock is released. */
        if (!ft.legacy_direct) {
                __console_unlock();
                return;
        }

        /* Emit what is pending on the way out of the lock. */
        __console_flush_and_unlock();
}
EXPORT_SYMBOL(console_unlock);

/**
 * console_conditional_schedule - yield the CPU if required
 *
 * If the console code is currently allowed to sleep, and
 * if this CPU should yield the CPU to another task, do
 * so here.
 *
 * Must be called within console_lock();.
 */
void __sched console_conditional_schedule(void)
{
        /* Rescheduling is only attempted while @console_may_schedule is set. */
        if (!console_may_schedule)
                return;

        cond_resched();
}
EXPORT_SYMBOL(console_conditional_schedule);

/*
 * Invoke the unblank() callback of every enabled, non-suspended console
 * that provides one, with console printing stopped for the duration.
 */
void console_unblank(void)
{
        bool have_unblank = false;
        struct console *con;
        int cookie;

        /*
         * Look for at least one console with an unblank() callback before
         * touching the console lock. Taking that lock needlessly can be
         * dangerous, in particular if @oops_in_progress is set.
         */
        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                short flags = console_srcu_read_flags(con);

                if (!(flags & CON_SUSPENDED) && (flags & CON_ENABLED) && con->unblank) {
                        have_unblank = true;
                        break;
                }
        }
        console_srcu_read_unlock(cookie);

        if (!have_unblank)
                return;

        /*
         * Console printing is stopped because an unblank() callback may
         * assume that the console is not inside its write() callback.
         *
         * With @oops_in_progress set this may be atomic context, so only a
         * best-effort trylock is attempted.
         */
        if (!oops_in_progress) {
                console_lock();
        } else {
                /* Semaphores are not NMI-safe. */
                if (in_nmi())
                        return;

                /*
                 * The trylock can deadlock if another CPU was stopped while
                 * it was modifying the semaphore. "Hope and pray" that this
                 * is not the current situation.
                 */
                if (down_trylock_console_sem() != 0)
                        return;
        }

        console_locked = 1;
        console_may_schedule = 0;

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                short flags = console_srcu_read_flags(con);

                if (!(flags & CON_SUSPENDED) && (flags & CON_ENABLED) && con->unblank)
                        con->unblank();
        }
        console_srcu_read_unlock(cookie);

        console_unlock();

        if (!oops_in_progress)
                pr_flush(1000, true);
}

/*
 * Rewind all consoles to the oldest available record.
 *
 * IMPORTANT: The function is safe only when called under
 *            console_lock(). It is not enforced because
 *            it is used as a best effort in panic().
 */
static void __console_rewind_all(void)
{
        u64 oldest_seq = prb_first_valid_seq(prb);
        struct console *con;
        int cookie;

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                if (console_srcu_read_flags(con) & CON_NBCON) {
                        nbcon_seq_force(con, oldest_seq);
                        continue;
                }

                /*
                 * A plain store to @seq is only safe under console_lock().
                 * In panic, legacy consoles are handled as best effort.
                 */
                con->seq = oldest_seq;
        }
        console_srcu_read_unlock(cookie);
}

/**
 * console_flush_on_panic - flush console content on panic
 * @mode: flush all messages in buffer or just the pending ones
 *
 * Immediately output all pending messages no matter what.
 */
void console_flush_on_panic(enum con_flush_mode mode)
{
        struct console_flush_type ft;
        /* Required outputs of console_flush_all(); their values are not used here. */
        bool handover;
        u64 next_seq;

        /*
         * Ignore the console lock and flush out the messages. Attempting a
         * trylock would not be useful because:
         *
         *   - if it is contended, it must be ignored anyway
         *   - console_lock() and console_trylock() block and fail
         *     respectively in panic for non-panic CPUs
         *   - semaphores are not NMI-safe
         */

        /*
         * If another context is holding the console lock,
         * @console_may_schedule might be set. Clear it so that
         * this context does not call cond_resched() while flushing.
         */
        console_may_schedule = 0;

        /* Replay mode: every console restarts from the oldest available record. */
        if (mode == CONSOLE_REPLAY_ALL)
                __console_rewind_all();

        printk_get_console_flush_type(&ft);
        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();

        /* Flush legacy consoles once allowed, even when dangerous. */
        if (legacy_allow_panic_sync)
                console_flush_all(false, &next_seq, &handover);
}

/*
 * Return the console tty driver structure and its associated index
 */
struct tty_driver *console_device(int *index)
{
        struct tty_driver *found = NULL;
        struct console *con;
        int cookie;

        /*
         * The console_lock serializes the device() callback against other
         * console operations. For example, fg_console is modified under
         * console_lock when switching vt.
         */
        console_lock();

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                if (con->device) {
                        /* The first console that yields a driver wins. */
                        found = con->device(con, index);
                        if (found)
                                break;
                }
        }
        console_srcu_read_unlock(cookie);

        console_unlock();
        return found;
}

/*
 * Prevent further output on the passed console device so that (for example)
 * serial drivers can suspend console output before suspending a port, and can
 * re-enable output afterwards.
 */
void console_suspend(struct console *console)
{
        /*
         * Flush @console before it is disabled. NOTE(review): the meaning of
         * the (1000, true) arguments is defined by __pr_flush(), which is
         * not shown here - presumably a bounded wait; confirm.
         */
        __pr_flush(console, 1000, true);
        /* The flag update is done under the console list lock. */
        console_list_lock();
        console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. All contexts must
         * be able to see that this console is disabled so that (for example)
         * the caller can suspend the port without risk of another context
         * using the port.
         */
        synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_suspend);

/*
 * Counterpart of console_suspend(): set CON_ENABLED on @console again and
 * kick the printing context that is responsible for it.
 */
void console_resume(struct console *console)
{
        struct console_flush_type ft;
        bool is_nbcon;

        console_list_lock();
        console_srcu_write_flags(console, console->flags | CON_ENABLED);
        /* Sampled while the list lock is still held. */
        is_nbcon = console->flags & CON_NBCON;
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. The related
         * printing context must be able to see it is enabled so that
         * it is guaranteed to wake up and resume printing.
         */
        synchronize_srcu(&console_srcu);

        /*
         * Wake the nbcon kthread of this console when nbcon printing is
         * offloaded, otherwise defer the output when legacy printing is
         * offloaded. With neither, nothing is woken here.
         */
        printk_get_console_flush_type(&ft);
        if (is_nbcon && ft.nbcon_offload)
                nbcon_kthread_wake(console);
        else if (ft.legacy_offload)
                defer_console_output();

        __pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_resume);

#ifdef CONFIG_PRINTK
/* Forward declaration: used by printk_kthreads_check_locked() below. */
static int unregister_console_locked(struct console *console);

/*
 * True when system boot is far enough to create printer threads.
 * Set once by printk_set_kthreads_ready().
 */
bool printk_kthreads_ready __ro_after_init;

/*
 * The legacy printing thread, or NULL when there is none. It is assigned by
 * legacy_kthread_create() and reset to NULL by
 * printk_kthreads_check_locked() after the thread was stopped.
 */
static struct task_struct *printk_legacy_kthread;

/*
 * Wake-up condition of the legacy printing thread: true when the thread is
 * asked to stop or when a console it is responsible for has a record to
 * read.
 */
static bool legacy_kthread_should_wakeup(void)
{
        struct console_flush_type ft;
        bool have_work = false;
        struct console *con;
        int cookie;

        if (kthread_should_stop())
                return true;

        printk_get_console_flush_type(&ft);

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                short flags = console_srcu_read_flags(con);
                bool is_nbcon = flags & CON_NBCON;
                u64 next;

                /*
                 * nbcon consoles are only handled by the legacy printer
                 * thread when they cannot print via their own atomic or
                 * threaded flushing.
                 */
                if (is_nbcon && (ft.nbcon_atomic || ft.nbcon_offload))
                        continue;

                if (!console_is_usable(con, flags, false))
                        continue;

                /*
                 * Reading @seq of a legacy console is safe because only
                 * this thread context updates @seq.
                 */
                next = is_nbcon ? nbcon_seq_read(con) : con->seq;

                if (prb_read_valid(prb, next, NULL)) {
                        have_work = true;
                        break;
                }
        }
        console_srcu_read_unlock(cookie);

        return have_work;
}

static int legacy_kthread_func(void *unused)
{
        for (;;) {
                wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup());

                if (kthread_should_stop())
                        break;

                console_lock();
                __console_flush_and_unlock();
        }

        return 0;
}

/*
 * Start the legacy printing thread and record it in @printk_legacy_kthread.
 *
 * Returns true on success, false if the thread could not be started.
 *
 * Requires the console_list_lock.
 */
static bool legacy_kthread_create(void)
{
        struct task_struct *thread;

        lockdep_assert_console_list_lock_held();

        thread = kthread_run(legacy_kthread_func, NULL, "pr/legacy");
        if (WARN_ON(IS_ERR(thread))) {
                pr_err("failed to start legacy printing thread\n");
                return false;
        }

        printk_legacy_kthread = thread;

        /*
         * Console printing threads need to be scheduled shortly after a
         * printk call and with generous runtime budgets.
         */
        sched_set_normal(thread, -20);

        return true;
}

/**
 * printk_kthreads_shutdown - shutdown all threaded printers
 *
 * On system shutdown all threaded printers are stopped. This allows printk
 * to transition back to atomic printing, thus providing a robust mechanism
 * for the final shutdown/reboot messages to be output.
 */
static void printk_kthreads_shutdown(void)
{
        struct console *con;

        console_list_lock();

        /* Nothing to stop if the threaded printers are not running. */
        if (!printk_kthreads_running)
                goto unlock;

        printk_kthreads_running = false;

        for_each_console(con) {
                if (con->flags & CON_NBCON)
                        nbcon_kthread_stop(con);
        }

        /*
         * A thread may have been stopped in the middle of printing a
         * backlog, so flush whatever records are left over.
         */
        nbcon_atomic_flush_pending();

unlock:
        console_list_unlock();
}

/* Syscore hooks: only .shutdown is set; registered by printk_set_kthreads_ready(). */
static struct syscore_ops printk_syscore_ops = {
        .shutdown = printk_kthreads_shutdown,
};

/*
 * If appropriate, start nbcon kthreads and set @printk_kthreads_running.
 * If any kthreads fail to start, those consoles are unregistered.
 *
 * Must be called under console_list_lock().
 */
static void printk_kthreads_check_locked(void)
{
        struct hlist_node *tmp;
        struct console *con;

        lockdep_assert_console_list_lock_held();

        /* Nothing is started or stopped before printk_set_kthreads_ready() ran. */
        if (!printk_kthreads_ready)
                return;

        /* Start or stop the legacy kthread when needed. */
        if (have_legacy_console || have_boot_console) {
                /*
                 * The body below only runs when no legacy kthread exists,
                 * threading is forced, and the creation attempt failed.
                 */
                if (!printk_legacy_kthread &&
                    force_legacy_kthread() &&
                    !legacy_kthread_create()) {
                        /*
                         * All legacy consoles must be unregistered. If there
                         * are any nbcon consoles, they will set up their own
                         * kthread.
                         */
                        /* The _safe iterator is used because entries are unregistered inside the loop. */
                        hlist_for_each_entry_safe(con, tmp, &console_list, node) {
                                if (con->flags & CON_NBCON)
                                        continue;

                                unregister_console_locked(con);
                        }
                }
        } else if (printk_legacy_kthread) {
                /* No legacy or boot console is left: the legacy kthread is stopped. */
                kthread_stop(printk_legacy_kthread);
                printk_legacy_kthread = NULL;
        }

        /*
         * Printer threads cannot be started as long as any boot console is
         * registered because there is no way to synchronize the hardware
         * registers between boot console code and regular console code.
         * It can only be known that there will be no new boot consoles when
         * an nbcon console is registered.
         */
        if (have_boot_console || !have_nbcon_console) {
                /* Clear flag in case all nbcon consoles unregistered. */
                printk_kthreads_running = false;
                return;
        }

        /* The nbcon kthreads were already started by an earlier call. */
        if (printk_kthreads_running)
                return;

        /* Create a kthread per nbcon console; a console whose kthread fails is unregistered. */
        hlist_for_each_entry_safe(con, tmp, &console_list, node) {
                if (!(con->flags & CON_NBCON))
                        continue;

                if (!nbcon_kthread_create(con))
                        unregister_console_locked(con);
        }

        printk_kthreads_running = true;
}

/*
 * Early initcall: register the syscore shutdown hook, mark printer threads
 * as creatable and run the kthread check once under the console list lock.
 *
 * Always returns 0.
 */
static int __init printk_set_kthreads_ready(void)
{
        register_syscore_ops(&printk_syscore_ops);

        console_list_lock();
        printk_kthreads_ready = true;
        printk_kthreads_check_locked();
        console_list_unlock();

        return 0;
}
early_initcall(printk_set_kthreads_ready);
#endif /* CONFIG_PRINTK */

/* Non-zero once the "keep_bootcon" early parameter was seen. */
static int __read_mostly keep_bootcon;

/*
 * Handler for the "keep_bootcon" early parameter: sets @keep_bootcon and
 * logs a notice. @str is not used. Always returns 0.
 */
static int __init keep_bootcon_setup(char *str)
{
        keep_bootcon = 1;
        pr_info("debug: skip boot console de-registration.\n");

        return 0;
}

early_param("keep_bootcon", keep_bootcon_setup);

/*
 * Run the optional setup() callback of @newcon with @options while holding
 * the console_lock.
 *
 * Returns 0 when there is no setup() callback, otherwise the value returned
 * by the callback.
 */
static int console_call_setup(struct console *newcon, char *options)
{
        int ret = 0;

        if (newcon->setup) {
                /* The console_lock synchronizes with a possible boot console. */
                console_lock();
                ret = newcon->setup(newcon, options);
                console_unlock();
        }

        return ret;
}

/*
 * This is called by register_console() to try to match
 * the newly registered console with any of the ones selected
 * by either the command line or add_preferred_console() and
 * setup/enable it.
 *
 * Care needs to be taken with consoles that are statically
 * enabled, such as netconsole.
 */
static int try_enable_preferred_console(struct console *newcon,
                                        bool user_specified)
{
        struct console_cmdline *c;
        bool slot_user_specified;
        int i, err;

        /*
         * Walk the used entries of console_cmdline[]. Only entries whose
         * @user_specified matches the requested pass are considered.
         *
         * Returns 0 when @newcon was enabled (or claimed as a braille
         * console), the setup() error when setup failed, and -ENOENT when
         * nothing matched and @newcon cannot be accepted as pre-enabled.
         */
        for (i = 0, c = console_cmdline;
             i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
             i++, c++) {
                /* Console not yet initialized? */
                if (!c->name[0])
                        continue;
                if (c->user_specified != user_specified)
                        continue;
                if (!newcon->match ||
                    newcon->match(newcon, c->name, c->index, c->options) != 0) {
                        /* default matching */
                        BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
                        if (strcmp(c->name, newcon->name) != 0)
                                continue;
                        if (newcon->index >= 0 &&
                            newcon->index != c->index)
                                continue;
                        if (newcon->index < 0)
                                newcon->index = c->index;

                        if (_braille_register_console(newcon, c))
                                return 0;

                        err = console_call_setup(newcon, c->options);
                        if (err)
                                return err;
                }
                newcon->flags |= CON_ENABLED;
                if (i == preferred_console)
                        newcon->flags |= CON_CONSDEV;
                return 0;
        }

        /*
         * The loop stops either at the first unused entry or, when every
         * entry is in use, with @c pointing one element past the end of
         * console_cmdline[]. The latter must not be dereferenced, so it is
         * treated like an unused entry (not user specified).
         */
        slot_user_specified = (i < MAX_CMDLINECONSOLES) ? c->user_specified : false;

        /*
         * Some consoles, such as pstore and netconsole, can be enabled even
         * without matching. Accept the pre-enabled consoles only when match()
         * and setup() had a chance to be called.
         */
        if ((newcon->flags & CON_ENABLED) && slot_user_specified == user_specified)
                return 0;

        return -ENOENT;
}

/* Try to enable the console unconditionally */
static void try_enable_default_console(struct console *newcon)
{
        /* An unset (negative) index defaults to 0. */
        if (newcon->index < 0)
                newcon->index = 0;

        /* The console is only enabled when its setup succeeded. */
        if (console_call_setup(newcon, NULL) == 0) {
                newcon->flags |= CON_ENABLED;

                /* A console with a device() callback is also marked CON_CONSDEV. */
                if (newcon->device)
                        newcon->flags |= CON_CONSDEV;
        }
}

/* Return the starting sequence number for a newly registered console. */
static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered)
{
        const short boot_enabled = CON_BOOT | CON_ENABLED;
        struct console *con;
        bool handover;
        u64 init_seq;

        if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
                /* Take a consistent copy of @syslog_seq. */
                mutex_lock(&syslog_lock);
                init_seq = syslog_seq;
                mutex_unlock(&syslog_lock);

                return init_seq;
        }

        /* Start with the next message added to the ringbuffer. */
        init_seq = prb_next_seq(prb);

        /*
         * Enabled boot consoles that are about to be unregistered may not
         * be caught up and may be the same device as @newcon. It is not
         * known which boot console is the same device, so all consoles are
         * flushed and, if necessary, @newcon starts with the message of the
         * enabled boot console that is the furthest behind. Without boot
         * consoles, or when they are kept, none of this is needed.
         */
        if (!bootcon_registered || keep_bootcon)
                return init_seq;

        /*
         * The console_lock stops console printing and guarantees safe
         * access to console->seq.
         */
        console_lock();

        /*
         * Flush all consoles; on success @init_seq is the next unprinted
         * sequence number.
         */
        if (!console_flush_all(true, &init_seq, &handover)) {
                /*
                 * Flushing failed: fall back to the lowest sequence of the
                 * enabled boot consoles.
                 *
                 * After a handover this context no longer holds the
                 * console_lock, so it has to be taken again first.
                 */
                if (handover)
                        console_lock();

                init_seq = prb_next_seq(prb);
                for_each_console(con) {
                        u64 seq;

                        /* Only consoles that are both boot and enabled count. */
                        if ((con->flags & boot_enabled) != boot_enabled)
                                continue;

                        seq = (con->flags & CON_NBCON) ? nbcon_seq_read(con) : con->seq;

                        if (seq < init_seq)
                                init_seq = seq;
                }
        }

        console_unlock();

        return init_seq;
}

/*
 * Head entry of console_list, converted from its list node to the
 * enclosing struct console.
 *
 * No emptiness check is performed here: callers are expected to test
 * hlist_empty(&console_list) (or otherwise know the list is populated)
 * before using the result.
 */
#define console_first()                                \
        hlist_entry(console_list.first, struct console, node)

static int unregister_console_locked(struct console *console);

/*
 * The console driver calls this routine during kernel initialization
 * to register the console printing procedure with printk() and to
 * print any messages that were printed by the kernel before the
 * console driver was initialized.
 *
 * This can happen pretty early during the boot process (because of
 * early_printk) - sometimes before setup_arch() completes - be careful
 * of what kernel features are used - they may not be initialised yet.
 *
 * There are two types of consoles - bootconsoles (early_printk) and
 * "real" consoles (everything which is not a bootconsole) which are
 * handled differently.
 *  - Any number of bootconsoles can be registered at any time.
 *  - As soon as a "real" console is registered, all bootconsoles
 *    will be unregistered automatically (in this function that happens
 *    when the new console carries CON_CONSDEV and keep_bootcon is not
 *    set).
 *  - Once a "real" console is registered, any attempt to register a
 *    bootconsole will be rejected.
 *
 * Nothing is returned. @newcon is silently left unregistered when it is
 * already on the list, when it is a bootconsole arriving after a real
 * console, when nbcon_alloc() fails, when none of the preference checks
 * enables it, or when it is a Braille console (CON_BRL).
 *
 * The whole operation runs under console_list_lock().
 */
void register_console(struct console *newcon)
{
        /*
         * Only nbcon consoles that provide write_atomic() take the driver
         * lock around the list insertion below.
         */
        bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic;
        bool bootcon_registered = false;
        bool realcon_registered = false;
        struct console *con;
        unsigned long flags;
        u64 init_seq;
        int err;

        console_list_lock();

        /*
         * Walk the current list: refuse a double registration and record
         * which kinds of consoles (boot / real) are already present.
         */
        for_each_console(con) {
                if (WARN(con == newcon, "console '%s%d' already registered\n",
                                         con->name, con->index)) {
                        goto unlock;
                }

                if (con->flags & CON_BOOT)
                        bootcon_registered = true;
                else
                        realcon_registered = true;
        }

        /* Do not register boot consoles when there already is a real one. */
        if ((newcon->flags & CON_BOOT) && realcon_registered) {
                pr_info("Too late to register bootconsole %s%d\n",
                        newcon->name, newcon->index);
                goto unlock;
        }

        if (newcon->flags & CON_NBCON) {
                /*
                 * Ensure the nbcon console buffers can be allocated
                 * before modifying any global data.
                 */
                if (!nbcon_alloc(newcon))
                        goto unlock;
        }

        /*
         * See if we want to enable this console driver by default.
         *
         * Nope when a console is preferred by the command line, device
         * tree, or SPCR.
         *
         * The first real console with tty binding (driver) wins. More
         * consoles might get enabled before the right one is found.
         *
         * Note that a console with tty binding will have CON_CONSDEV
         * flag set and will be first in the list.
         */
        if (preferred_console < 0) {
                if (hlist_empty(&console_list) || !console_first()->device ||
                    console_first()->flags & CON_BOOT) {
                        try_enable_default_console(newcon);
                }
        }

        /* See if this console matches one we selected on the command line */
        err = try_enable_preferred_console(newcon, true);

        /* If not, try to match against the platform default(s) */
        if (err == -ENOENT)
                err = try_enable_preferred_console(newcon, false);

        /*
         * printk() messages are not printed to the Braille console.
         * On this bail-out path the nbcon buffers allocated above are
         * released again.
         */
        if (err || newcon->flags & CON_BRL) {
                if (newcon->flags & CON_NBCON)
                        nbcon_free(newcon);
                goto unlock;
        }

        /*
         * If we have a bootconsole, and are switching to a real console,
         * don't print everything out again, since when the boot console, and
         * the real console are the same physical device, it's annoying to
         * see the beginning boot messages twice
         */
        if (bootcon_registered &&
            ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
                newcon->flags &= ~CON_PRINTBUFFER;
        }

        newcon->dropped = 0;
        init_seq = get_init_console_seq(newcon, bootcon_registered);

        /*
         * Store the starting sequence number where this console type keeps
         * it and record that a console of this type now exists.
         */
        if (newcon->flags & CON_NBCON) {
                have_nbcon_console = true;
                nbcon_seq_force(newcon, init_seq);
        } else {
                have_legacy_console = true;
                newcon->seq = init_seq;
        }

        if (newcon->flags & CON_BOOT)
                have_boot_console = true;

        /*
         * If another context is actively using the hardware of this new
         * console, it will not be aware of the nbcon synchronization. This
         * is a risk that two contexts could access the hardware
         * simultaneously if this new console is used for atomic printing
         * and the other context is still using the hardware.
         *
         * Use the driver synchronization to ensure that the hardware is not
         * in use while this new console transitions to being registered.
         */
        if (use_device_lock)
                newcon->device_lock(newcon, &flags);

        /*
         * Put this console in the list - keep the
         * preferred driver at the head of the list.
         */
        if (hlist_empty(&console_list)) {
                /* Ensure CON_CONSDEV is always set for the head. */
                newcon->flags |= CON_CONSDEV;
                hlist_add_head_rcu(&newcon->node, &console_list);

        } else if (newcon->flags & CON_CONSDEV) {
                /* Only the new head can have CON_CONSDEV set. */
                console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
                hlist_add_head_rcu(&newcon->node, &console_list);

        } else {
                /* Not preferred: insert directly behind the current head. */
                hlist_add_behind_rcu(&newcon->node, console_list.first);
        }

        /*
         * No need to synchronize SRCU here! The caller does not rely
         * on all contexts being able to see the new console before
         * register_console() completes.
         */

        /* This new console is now registered. */
        if (use_device_lock)
                newcon->device_unlock(newcon, flags);

        console_sysfs_notify();

        /*
         * By unregistering the bootconsoles after we enable the real console
         * we get the "console xxx enabled" message on all the consoles -
         * boot consoles, real consoles, etc - this is to ensure that end
         * users know there might be something in the kernel's log buffer that
         * went to the bootconsole (that they do not see on the real console)
         */
        con_printk(KERN_INFO, newcon, "enabled\n");
        if (bootcon_registered &&
            ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
            !keep_bootcon) {
                struct hlist_node *tmp;

                /*
                 * The _safe iterator is needed because
                 * unregister_console_locked() removes entries from the
                 * list while it is being walked.
                 */
                hlist_for_each_entry_safe(con, tmp, &console_list, node) {
                        if (con->flags & CON_BOOT)
                                unregister_console_locked(con);
                }
        }

        /* Changed console list, may require printer threads to start/stop. */
        printk_kthreads_check_locked();
unlock:
        console_list_unlock();
}
EXPORT_SYMBOL(register_console);

/*
 * Remove @console from the console list and release its resources.
 *
 * Must be called under console_list_lock().
 *
 * Return:
 *  - a negative value from _braille_unregister_console(), passed through;
 *  - 0 when _braille_unregister_console() returned a positive value (the
 *    console list is not touched in that case);
 *  - -ENODEV when @console is not registered (CON_ENABLED is still cleared);
 *  - otherwise the result of @console->exit() if that callback is set,
 *    else 0.
 */
static int unregister_console_locked(struct console *console)
{
        /*
         * Only nbcon consoles that provide write_atomic() take the driver
         * lock around the list removal below.
         */
        bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic;
        bool found_legacy_con = false;
        bool found_nbcon_con = false;
        bool found_boot_con = false;
        unsigned long flags;
        struct console *c;
        int res;

        lockdep_assert_console_list_lock_held();

        con_printk(KERN_INFO, console, "disabled\n");

        res = _braille_unregister_console(console);
        if (res < 0)
                return res;
        /* NOTE(review): presumably >0 means the braille code handled it - confirm. */
        if (res > 0)
                return 0;

        /*
         * A registered and usable console gets up to 1000 ms (restarted
         * whenever progress is seen) to print pending records before it
         * is disabled.
         */
        if (!console_is_registered_locked(console))
                res = -ENODEV;
        else if (console_is_usable(console, console->flags, true))
                __pr_flush(console, 1000, true);

        /* Disable it unconditionally */
        console_srcu_write_flags(console, console->flags & ~CON_ENABLED);

        if (res < 0)
                return res;

        /*
         * Use the driver synchronization to ensure that the hardware is not
         * in use while this console transitions to being unregistered.
         */
        if (use_device_lock)
                console->device_lock(console, &flags);

        hlist_del_init_rcu(&console->node);

        if (use_device_lock)
                console->device_unlock(console, flags);

        /*
         * <HISTORICAL>
         * If this isn't the last console and it has CON_CONSDEV set, we
         * need to set it on the next preferred console.
         * </HISTORICAL>
         *
         * The above makes no sense as there is no guarantee that the next
         * console has any device attached. Oh well....
         */
        if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
                console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);

        /*
         * Ensure that all SRCU list walks have completed. All contexts
         * must not be able to see this console in the list so that any
         * exit/cleanup routines can be performed safely.
         */
        synchronize_srcu(&console_srcu);

        /*
         * With this console gone, the global flags tracking registered
         * console types may have changed. Update them.
         *
         * The globals are only ever cleared here, never set: a flag is
         * written only when no remaining console of that type was found.
         */
        for_each_console(c) {
                if (c->flags & CON_BOOT)
                        found_boot_con = true;

                if (c->flags & CON_NBCON)
                        found_nbcon_con = true;
                else
                        found_legacy_con = true;
        }
        if (!found_boot_con)
                have_boot_console = found_boot_con;
        if (!found_legacy_con)
                have_legacy_console = found_legacy_con;
        if (!found_nbcon_con)
                have_nbcon_console = found_nbcon_con;

        /* @have_nbcon_console must be updated before calling nbcon_free(). */
        if (console->flags & CON_NBCON)
                nbcon_free(console);

        console_sysfs_notify();

        /* Optional driver cleanup hook; its result becomes the return value. */
        if (console->exit)
                res = console->exit(console);

        /* Changed console list, may require printer threads to start/stop. */
        printk_kthreads_check_locked();

        return res;
}

/**
 * unregister_console - unregister a console
 * @console: the console to remove
 *
 * Takes console_list_lock() around unregister_console_locked() and
 * hands its result back unchanged.
 *
 * Return: the value returned by unregister_console_locked().
 */
int unregister_console(struct console *console)
{
        int ret;

        console_list_lock();
        ret = unregister_console_locked(console);
        console_list_unlock();

        return ret;
}
EXPORT_SYMBOL(unregister_console);

/**
 * console_force_preferred_locked - force a registered console preferred
 * @con: The registered console to force preferred.
 *
 * Moves @con to the head of the console list and transfers CON_CONSDEV
 * from the previous head to it. Nothing happens if @con is not
 * registered or already is the head.
 *
 * Must be called under console_list_lock().
 */
void console_force_preferred_locked(struct console *con)
{
        struct console *old_head;

        if (!console_is_registered_locked(con))
                return;

        old_head = console_first();

        /* Nothing to do when @con already leads the list. */
        if (old_head == con)
                return;

        /*
         * Unlink without re-initializing the node, so that any
         * hlist_unhashed_lockless() check keeps reporting the console as
         * registered during the short time it is off the list.
         */
        hlist_del_rcu(&con->node);

        /*
         * Wait for all SRCU list walkers to finish. Only then is it safe
         * to rewrite the node's forward pointer by inserting it at the
         * front of the console list.
         */
        synchronize_srcu(&console_srcu);

        con->flags |= CON_CONSDEV;
        WARN_ON(!con->device);

        /* CON_CONSDEV belongs to the new head only; drop it from the old one. */
        console_srcu_write_flags(old_head, old_head->flags & ~CON_CONSDEV);
        hlist_add_head_rcu(&con->node, &console_list);
}
EXPORT_SYMBOL(console_force_preferred_locked);

/*
 * Early console setup. This runs *early*, so little kernel
 * infrastructure can be relied upon: only the basic initialization is
 * done here and the complex setup is left for later.
 */
void __init console_init(void)
{
        initcall_entry_t *entry;

#ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE
        if (!console_set_on_cmdline)
                add_preferred_console("ttynull", 0, NULL);
#endif

        /* Setup the default TTY line discipline. */
        n_tty_init();

        /*
         * Run every console initcall so that later boot stages have a
         * console device to report problems on.
         */
        trace_initcall_level("console");
        for (entry = __con_initcall_start; entry < __con_initcall_end; entry++) {
                initcall_t fn = initcall_from_entry(entry);
                int rc;

                trace_initcall_start(fn);
                rc = fn();
                trace_initcall_finish(fn, rc);
        }
}

/*
 * Late initcall that drops boot consoles depending on init memory.
 *
 * Some boot consoles access data that lives in the init section, which is
 * discarded once the initcalls have run. Unregistering those consoles here
 * makes sure no code touches that data afterwards.
 *
 * If the real console has not registered yet at this point (deferred
 * probe, loadable module, ...), there is an interval in which nothing is
 * logged to the console, which makes problems in that window hard to
 * diagnose. To limit that, only boot consoles whose memory intersects
 * with the init section are unregistered here. All other boot consoles
 * get unregistered when the real preferred console is registered.
 *
 * The CPU hotplug callbacks and the printk sysctls are set up here too.
 */
static int __init printk_late_init(void)
{
        struct hlist_node *next;
        struct console *bcon;
        int err;

        console_list_lock();
        hlist_for_each_entry_safe(bcon, next, &console_list, node) {
                bool uses_init_mem;

                if (!(bcon->flags & CON_BOOT))
                        continue;

                /* Check addresses that might be used for enabled consoles. */
                uses_init_mem = init_section_intersects(bcon, sizeof(*bcon)) ||
                                init_section_contains(bcon->write, 0) ||
                                init_section_contains(bcon->read, 0) ||
                                init_section_contains(bcon->device, 0) ||
                                init_section_contains(bcon->unblank, 0) ||
                                init_section_contains(bcon->data, 0);
                if (!uses_init_mem)
                        continue;

                /*
                 * Please, consider moving the reported consoles out
                 * of the init section.
                 */
                pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
                        bcon->name, bcon->index);
                unregister_console_locked(bcon);
        }
        console_list_unlock();

        err = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
                                        console_cpu_notify);
        WARN_ON(err < 0);

        err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
                                        console_cpu_notify, NULL);
        WARN_ON(err < 0);

        printk_sysctl_init();

        return 0;
}
late_initcall(printk_late_init);

#if defined CONFIG_PRINTK
/*
 * Wait until the consoles have printed all records up to the sequence
 * number that was next to be reserved when this function was called.
 *
 * @con:               If specified, only wait for that console. Otherwise
 *                     wait for all.
 * @timeout_ms:        Time budget, converted with msecs_to_jiffies().
 * @reset_on_progress: Restart the budget whenever the accumulated backlog
 *                     differs from the one seen on the previous pass.
 *
 * Returns true when the last pass found no usable console behind the
 * target sequence (this includes the case of no usable consoles at all).
 * Returns false when the budget ran out first, or immediately when
 * system_state is still below SYSTEM_SCHEDULING.
 *
 * May sleep.
 */
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
{
        unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
        unsigned long remaining_jiffies = timeout_jiffies;
        struct console_flush_type ft;
        struct console *c;
        u64 last_diff = 0;
        u64 printk_seq;
        short flags;
        int cookie;
        u64 diff;
        u64 seq;

        /* Sorry, pr_flush() will not work this early. */
        if (system_state < SYSTEM_SCHEDULING)
                return false;

        might_sleep();

        /* Target: everything below this sequence number must get printed. */
        seq = prb_next_reserve_seq(prb);

        /* Flush the consoles so that records up to @seq are printed. */
        printk_get_console_flush_type(&ft);
        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();
        if (ft.legacy_direct) {
                console_lock();
                console_unlock();
        }

        for (;;) {
                unsigned long begin_jiffies;
                unsigned long slept_jiffies;

                /* Sum of the backlogs of all considered consoles in this pass. */
                diff = 0;

                /*
                 * Hold the console_lock to guarantee safe access to
                 * console->seq. Releasing console_lock flushes more
                 * records in case @seq is still not printed on all
                 * usable consoles.
                 *
                 * Holding the console_lock is not necessary if there
                 * are no legacy or boot consoles. However, such a
                 * console could register at any time. Always hold the
                 * console_lock as a precaution rather than
                 * synchronizing against register_console().
                 */
                console_lock();

                cookie = console_srcu_read_lock();
                for_each_console_srcu(c) {
                        if (con && con != c)
                                continue;

                        flags = console_srcu_read_flags(c);

                        /*
                         * If consoles are not usable, it cannot be expected
                         * that they make forward progress, so only increment
                         * @diff for usable consoles.
                         */
                        if (!console_is_usable(c, flags, true) &&
                            !console_is_usable(c, flags, false)) {
                                continue;
                        }

                        /* The position is stored differently for nbcon consoles. */
                        if (flags & CON_NBCON) {
                                printk_seq = nbcon_seq_read(c);
                        } else {
                                printk_seq = c->seq;
                        }

                        if (printk_seq < seq)
                                diff += seq - printk_seq;
                }
                console_srcu_read_unlock(cookie);

                /* A changed backlog counts as progress: restart the budget. */
                if (diff != last_diff && reset_on_progress)
                        remaining_jiffies = timeout_jiffies;

                console_unlock();

                /* Note: @diff is 0 if there are no usable consoles. */
                if (diff == 0 || remaining_jiffies == 0)
                        break;

                /* msleep(1) might sleep much longer. Check time by jiffies. */
                begin_jiffies = jiffies;
                msleep(1);
                slept_jiffies = jiffies - begin_jiffies;

                /* Clamp so that the unsigned budget cannot wrap below zero. */
                remaining_jiffies -= min(slept_jiffies, remaining_jiffies);

                last_diff = diff;
        }

        return (diff == 0);
}

/**
 * pr_flush() - Wait for printing threads to catch up.
 *
 * @timeout_ms:        The maximum time (in ms) to wait.
 * @reset_on_progress: Reset the timeout if forward progress is seen.
 *
 * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
 * represents infinite waiting.
 *
 * If @reset_on_progress is true, the timeout will be reset whenever any
 * printer has been seen to make some forward progress.
 *
 * This is __pr_flush() without a specific console, i.e. it waits for all
 * consoles rather than a single one.
 *
 * Context: Process context. May sleep while acquiring console lock.
 * Return: true if all usable printers are caught up.
 */
bool pr_flush(int timeout_ms, bool reset_on_progress)
{
        /* NULL console: consider every registered console. */
        return __pr_flush(NULL, timeout_ms, reset_on_progress);
}

/*
 * Delayed printk version, for scheduler-internal messages:
 *
 * Work bits accumulated in the per-CPU @printk_pending word and consumed
 * by wake_up_klogd_work_func().
 */
/* Wake up the waiters sleeping on log_wait. */
#define PRINTK_PENDING_WAKEUP        0x01
/* Perform (or hand off) console printing. */
#define PRINTK_PENDING_OUTPUT        0x02

/* Per-CPU mask of PRINTK_PENDING_* bits not yet handled on that CPU. */
static DEFINE_PER_CPU(int, printk_pending);

/*
 * irq_work callback: atomically fetch-and-clear this CPU's printk_pending
 * bits and carry out the requested work.
 */
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
        const int todo = this_cpu_xchg(printk_pending, 0);

        if (todo & PRINTK_PENDING_OUTPUT) {
                if (!force_legacy_kthread()) {
                        /* Print from here if the console lock is free. */
                        if (console_trylock())
                                console_unlock();
                } else if (printk_legacy_kthread) {
                        /* Printing is delegated to the legacy kthread. */
                        wake_up_interruptible(&legacy_wait);
                }
        }

        if (todo & PRINTK_PENDING_WAKEUP)
                wake_up_interruptible(&log_wait);
}

/*
 * Per-CPU irq_work (initialized with IRQ_WORK_INIT_LAZY) that runs
 * wake_up_klogd_work_func(); queued by __wake_up_klogd().
 */
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
        IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);

/*
 * Record the PRINTK_PENDING_* bits in @val for this CPU and queue the
 * per-CPU irq_work that handles them.
 *
 * The work is queued only if log_wait has a sleeper or @val contains
 * PRINTK_PENDING_OUTPUT. Nothing is done until
 * printk_percpu_data_ready() reports true.
 */
static void __wake_up_klogd(int val)
{
        if (!printk_percpu_data_ready())
                return;

        /* Keep the per-CPU accesses below on a single CPU. */
        preempt_disable();
        /*
         * Guarantee any new records can be seen by tasks preparing to wait
         * before this context checks if the wait queue is empty.
         *
         * The full memory barrier within wq_has_sleeper() pairs with the full
         * memory barrier within set_current_state() of
         * prepare_to_wait_event(), which is called after ___wait_event() adds
         * the waiter but before it has checked the wait condition.
         *
         * This pairs with devkmsg_read:A and syslog_print:A.
         */
        if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
            (val & PRINTK_PENDING_OUTPUT)) {
                this_cpu_or(printk_pending, val);
                irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
        }
        preempt_enable();
}

/**
 * wake_up_klogd - Wake kernel logging daemon
 *
 * Use this function when new records have been added to the ringbuffer
 * and the console printing of those records has already occurred or is
 * known to be handled by some other context. This function will only
 * wake the logging daemon.
 *
 * Context: Any context.
 */
void wake_up_klogd(void)
{
        /* Wakeup only: PRINTK_PENDING_OUTPUT is deliberately not requested. */
        __wake_up_klogd(PRINTK_PENDING_WAKEUP);
}

/**
 * defer_console_output - Wake kernel logging daemon and trigger
 *        console printing in a deferred context
 *
 * Use this function when new records have been added to the ringbuffer,
 * this context is responsible for console printing those records, but
 * the current context is not allowed to perform the console printing.
 * Trigger an irq_work context to perform the console printing. This
 * function also wakes the logging daemon.
 *
 * Both PRINTK_PENDING_WAKEUP and PRINTK_PENDING_OUTPUT are requested.
 *
 * Context: Any context.
 */
void defer_console_output(void)
{
        /*
         * New messages may have been added directly to the ringbuffer
         * using vprintk_store(), so wake any waiters as well.
         */
        __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
}

/*
 * Request console printing from a deferred context. This simply forwards
 * to defer_console_output().
 */
void printk_trigger_flush(void)
{
        defer_console_output();
}

/*
 * va_list variant of _printk_deferred(): hands @fmt and @args to
 * vprintk_emit() with facility 0, level LOGLEVEL_SCHED and no dev_info.
 *
 * NOTE(review): LOGLEVEL_SCHED presumably makes vprintk_emit() defer the
 * console output - confirm against vprintk_emit().
 *
 * Returns the value returned by vprintk_emit().
 */
int vprintk_deferred(const char *fmt, va_list args)
{
        return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}

/*
 * Variadic front end of vprintk_deferred(): collects the arguments into
 * a va_list and returns whatever vprintk_deferred() returned.
 */
int _printk_deferred(const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = vprintk_deferred(fmt, ap);
        va_end(ap);

        return ret;
}

/*
 * printk rate limiting, lifted from the networking subsystem.
 *
 * This enforces a rate limit: not more than 10 kernel messages
 * every 5s to make a denial-of-service attack impossible.
 */
DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);

/*
 * Check the global printk_ratelimit_state.
 *
 * @func: passed through to ___ratelimit().
 *
 * Returns the result of ___ratelimit().
 * NOTE(review): presumably non-zero means the caller may print - confirm
 * against ___ratelimit().
 */
int __printk_ratelimit(const char *func)
{
        return ___ratelimit(&printk_ratelimit_state, func);
}
EXPORT_SYMBOL(__printk_ratelimit);

/**
 * printk_timed_ratelimit - caller-controlled printk ratelimiting
 * @caller_jiffies: pointer to caller's state
 * @interval_msecs: minimum interval between prints
 *
 * The caller owns the timestamp in @caller_jiffies. A value of zero
 * means "never printed", so the call always succeeds in that case.
 * On success the timestamp is updated to the current jiffies.
 *
 * Return: true if more than @interval_msecs milliseconds have elapsed
 * since the last time printk_timed_ratelimit() returned true.
 */
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                        unsigned int interval_msecs)
{
        unsigned long last = *caller_jiffies;
        unsigned long elapsed = jiffies - last;

        if (!last || elapsed > msecs_to_jiffies(interval_msecs)) {
                *caller_jiffies = jiffies;
                return true;
        }

        return false;
}
EXPORT_SYMBOL(printk_timed_ratelimit);

/*
 * Registered kmsg dumpers. Additions and removals are done under
 * dump_list_lock with the RCU list primitives; kmsg_dump_desc() walks
 * the list under rcu_read_lock().
 */
static DEFINE_SPINLOCK(dump_list_lock);
static LIST_HEAD(dump_list);

/**
 * kmsg_dump_register - register a kernel log dumper.
 * @dumper: pointer to the kmsg_dumper structure
 *
 * Appends @dumper to the list of kernel log dumpers. Its dump callback
 * is called when the kernel oopses or panics, so it must be set.
 *
 * Return: zero on success, %-EINVAL if no dump callback is set, or
 * %-EBUSY if @dumper is already registered.
 */
int kmsg_dump_register(struct kmsg_dumper *dumper)
{
        unsigned long irqflags;
        int ret = 0;

        /* The dump callback needs to be set */
        if (!dumper->dump)
                return -EINVAL;

        spin_lock_irqsave(&dump_list_lock, irqflags);
        if (dumper->registered) {
                /* Don't allow registering multiple times */
                ret = -EBUSY;
        } else {
                dumper->registered = 1;
                list_add_tail_rcu(&dumper->list, &dump_list);
        }
        spin_unlock_irqrestore(&dump_list_lock, irqflags);

        return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_register);

/**
 * kmsg_dump_unregister - unregister a kmsg dumper.
 * @dumper: pointer to the kmsg_dumper structure
 *
 * Takes @dumper off the dumper list and then waits for an RCU grace
 * period. The wait happens whether or not @dumper was registered.
 *
 * Return: zero on success, %-EINVAL if @dumper was not registered.
 */
int kmsg_dump_unregister(struct kmsg_dumper *dumper)
{
        unsigned long irqflags;
        int ret = 0;

        spin_lock_irqsave(&dump_list_lock, irqflags);
        if (!dumper->registered) {
                ret = -EINVAL;
        } else {
                dumper->registered = 0;
                list_del_rcu(&dumper->list);
        }
        spin_unlock_irqrestore(&dump_list_lock, irqflags);

        synchronize_rcu();

        return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);

/*
 * When set, kmsg_dump_desc() treats dumpers whose max_reason is
 * KMSG_DUMP_UNDEF as accepting every reason up to KMSG_DUMP_MAX instead
 * of only up to KMSG_DUMP_OOPS. Exposed as a module parameter with
 * permissions S_IRUGO | S_IWUSR.
 */
static bool always_kmsg_dump;
module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);

/*
 * Map a kmsg dump reason to a short human-readable name. Any reason
 * without a dedicated name is reported as "Unknown".
 */
const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
{
        const char *name = "Unknown";

        switch (reason) {
        case KMSG_DUMP_PANIC:
                name = "Panic";
                break;
        case KMSG_DUMP_OOPS:
                name = "Oops";
                break;
        case KMSG_DUMP_EMERG:
                name = "Emergency";
                break;
        case KMSG_DUMP_SHUTDOWN:
                name = "Shutdown";
                break;
        default:
                break;
        }

        return name;
}
EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);

/**
 * kmsg_dump_desc - dump kernel log to kernel message dumpers.
 * @reason: the reason (oops, panic etc) for dumping
 * @desc: a short string to describe what caused the panic or oops. Can be NULL
 * if no additional description is available.
 *
 * Walks the registered dumpers under rcu_read_lock() and calls the dump()
 * callback of each dumper that accepts @reason. The callback can
 * retrieve the kmsg records with kmsg_dump_get_line() or
 * kmsg_dump_get_buffer().
 */
void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc)
{
        struct kmsg_dump_detail detail = {
                .reason = reason,
                .description = desc,
        };
        struct kmsg_dumper *d;

        rcu_read_lock();
        list_for_each_entry_rcu(d, &dump_list, list) {
                enum kmsg_dump_reason limit = d->max_reason;

                /*
                 * A dumper without an explicit max_reason is limited to
                 * KMSG_DUMP_OOPS, unless always_kmsg_dump lifts that limit.
                 */
                if (limit == KMSG_DUMP_UNDEF)
                        limit = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS;

                /* The dumper iterates over the records on its own. */
                if (reason <= limit)
                        d->dump(d, &detail);
        }
        rcu_read_unlock();
}

/**
 * kmsg_dump_get_line - retrieve one kmsg log line
 * @iter: kmsg dump iterator
 * @syslog: include the "<4>" prefixes
 * @line: buffer to copy the line to
 * @size: maximum size of the buffer
 * @len: length of line placed into buffer
 *
 * Start at the beginning of the kmsg buffer, with the oldest kmsg
 * record, and copy one record into the provided buffer.
 *
 * Consecutive calls will return the next available record moving
 * towards the end of the buffer with the youngest messages.
 *
 * A return value of FALSE indicates that there are no more records to
 * read.
 */
bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
                        char *line, size_t size, size_t *len)
{
        u64 min_seq = latched_seq_read_nolock(&clear_seq);
        struct printk_info info;
        unsigned int line_count;
        struct printk_record r;
        size_t l = 0;
        bool ret = false;

        if (iter->cur_seq < min_seq)
                iter->cur_seq = min_seq;

        prb_rec_init_rd(&r, &info, line, size);

        /* Read text or count text lines? */
        if (line) {
                if (!prb_read_valid(prb, iter->cur_seq, &r))
                        goto out;
                l = record_print_text(&r, syslog, printk_time);
        } else {
                if (!prb_read_valid_info(prb, iter->cur_seq,
                                         &info, &line_count)) {
                        goto out;
                }
                l = get_record_print_text_size(&info, line_count, syslog,
                                               printk_time);

        }

        iter->cur_seq = r.info->seq + 1;
        ret = true;
out:
        if (len)
                *len = l;
        return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);

/**
 * kmsg_dump_get_buffer - copy kmsg log lines
 * @iter: kmsg dump iterator
 * @syslog: include the "<4>" prefixes
 * @buf: buffer to copy the line to
 * @size: maximum size of the buffer
 * @len_out: length of line placed into buffer
 *
 * Start at the end of the kmsg buffer and fill the provided buffer
 * with as many of the *youngest* kmsg records that fit into it.
 * If the buffer is large enough, all available kmsg records will be
 * copied with a single call.
 *
 * Consecutive calls will fill the buffer with the next block of
 * available older records, not including the earlier retrieved ones.
 *
 * @len_out may be NULL. When it is given, it is written on every path,
 * with 0 if nothing was copied.
 *
 * A return value of FALSE indicates that there are no more records to
 * read. FALSE is also returned when @buf is NULL or @size is 0.
 */
bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
                          char *buf, size_t size, size_t *len_out)
{
        u64 min_seq = latched_seq_read_nolock(&clear_seq);
        struct printk_info info;
        struct printk_record r;
        u64 seq;
        u64 next_seq;
        size_t len = 0;
        bool ret = false;
        /* Read printk_time once; the same value is used for sizing and formatting. */
        bool time = printk_time;

        /* Without a usable buffer there is nothing to fill. */
        if (!buf || !size)
                goto out;

        /* Never hand out records older than the clear mark. */
        if (iter->cur_seq < min_seq)
                iter->cur_seq = min_seq;

        if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
                if (info.seq != iter->cur_seq) {
                        /* messages are gone, move to first available one */
                        iter->cur_seq = info.seq;
                }
        }

        /* last entry */
        if (iter->cur_seq >= iter->next_seq)
                goto out;

        /*
         * Find first record that fits, including all following records,
         * into the user-provided buffer for this dump. Pass in size-1
         * because this function (by way of record_print_text()) will
         * not write more than size-1 bytes of text into @buf.
         */
        seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
                                     size - 1, syslog, time);

        /*
         * Next kmsg_dump_get_buffer() invocation will dump block of
         * older records stored right before this one.
         */
        next_seq = seq;

        prb_rec_init_rd(&r, &info, buf, size);

        /* Format the records from @seq up to, but excluding, iter->next_seq. */
        prb_for_each_record(seq, prb, seq, &r) {
                if (r.info->seq >= iter->next_seq)
                        break;

                len += record_print_text(&r, syslog, time);

                /* Adjust record to store to remaining buffer space. */
                prb_rec_init_rd(&r, &info, buf + len, size - len);
        }

        iter->next_seq = next_seq;
        ret = true;
out:
        if (len_out)
                *len_out = len;
        return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);

/**
 * kmsg_dump_rewind - reset the iterator
 * @iter: kmsg dump iterator
 *
 * Reset the dumper's iterator so that kmsg_dump_get_line() and
 * kmsg_dump_get_buffer() can be called again and used multiple
 * times within the same dumper.dump() callback.
 */
void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
        /* Restart at the sequence number currently held in clear_seq. */
        iter->cur_seq = latched_seq_read_nolock(&clear_seq);
        /* Iteration ends at the ringbuffer's next sequence number as of now. */
        iter->next_seq = prb_next_seq(prb);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);

/**
 * console_try_replay_all - try to replay kernel log on consoles
 *
 * Try to obtain lock on console subsystem and replay all
 * available records in printk buffer on the consoles.
 * Does nothing if lock is not obtained.
 *
 * Context: Any, except for NMI.
 */
void console_try_replay_all(void)
{
        struct console_flush_type ft;

        /* The flush strategy is sampled before the lock attempt. */
        printk_get_console_flush_type(&ft);

        /* Best effort only: do nothing when the console lock is unavailable. */
        if (!console_trylock())
                return;

        __console_rewind_all();

        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();

        if (ft.nbcon_offload)
                nbcon_kthreads_wake();

        if (ft.legacy_offload)
                defer_console_output();

        /* Consoles are flushed as part of console_unlock(). */
        console_unlock();
}
#endif

#ifdef CONFIG_SMP
/* CPU id of the current printk cpu-sync lock owner; -1 means unowned. */
static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
/* Number of nested acquisitions by the owning CPU beyond the first one. */
static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);

/* Report whether the executing CPU is the recorded printk cpu-sync owner. */
bool is_printk_cpu_sync_owner(void)
{
        const int owner = atomic_read(&printk_cpu_sync_owner);

        return owner == raw_smp_processor_id();
}

/**
 * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
 *                            spinning lock is not owned by any CPU.
 *
 * Context: Any context.
 */
void __printk_cpu_sync_wait(void)
{
        /* Always relax at least once, then poll until the owner is -1. */
        for (;;) {
                cpu_relax();

                if (atomic_read(&printk_cpu_sync_owner) == -1)
                        break;
        }
}
EXPORT_SYMBOL(__printk_cpu_sync_wait);

/**
 * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
 *                               spinning lock.
 *
 * If no processor has the lock, the calling processor takes the lock and
 * becomes the owner. If the calling processor is already the owner of the
 * lock, this function succeeds immediately.
 *
 * Context: Any context. Expects interrupts to be disabled.
 * Return: 1 on success, otherwise 0.
 */
int __printk_cpu_sync_try_get(void)
{
        int cpu;
        int old;

        cpu = smp_processor_id();

        /*
         * Guarantee loads and stores from this CPU when it is the lock owner
         * are _not_ visible to the previous lock owner. This pairs with
         * __printk_cpu_sync_put:B.
         *
         * Memory barrier involvement:
         *
         * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
         * then __printk_cpu_sync_put:A can never read from
         * __printk_cpu_sync_try_get:B.
         *
         * Relies on:
         *
         * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
         * of the previous CPU
         *    matching
         * ACQUIRE from __printk_cpu_sync_try_get:A to
         * __printk_cpu_sync_try_get:B of this CPU
         */
        old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
                                     cpu); /* LMM(__printk_cpu_sync_try_get:A) */
        if (old == -1) {
                /*
                 * This CPU is now the owner and begins loading/storing
                 * data: LMM(__printk_cpu_sync_try_get:B)
                 */
                return 1;

        } else if (old == cpu) {
                /*
                 * This CPU is already the owner. Count the nested
                 * acquisition so that the matching put only decrements
                 * the counter instead of releasing the lock.
                 */
                atomic_inc(&printk_cpu_sync_nested);
                return 1;
        }

        /* A different CPU owns the lock: report failure to the caller. */
        return 0;
}
EXPORT_SYMBOL(__printk_cpu_sync_try_get);

/**
 * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
 *
 * The calling processor must be the owner of the lock.
 *
 * Context: Any context. Expects interrupts to be disabled.
 */
void __printk_cpu_sync_put(void)
{
        /*
         * A non-zero nesting count means this put matches a nested
         * acquisition: drop one level and keep the lock owned.
         */
        if (atomic_read(&printk_cpu_sync_nested)) {
                atomic_dec(&printk_cpu_sync_nested);
                return;
        }

        /*
         * This CPU is finished loading/storing data:
         * LMM(__printk_cpu_sync_put:A)
         */

        /*
         * Guarantee loads and stores from this CPU when it was the
         * lock owner are visible to the next lock owner. This pairs
         * with __printk_cpu_sync_try_get:A.
         *
         * Memory barrier involvement:
         *
         * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
         * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
         *
         * Relies on:
         *
         * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
         * of this CPU
         *    matching
         * ACQUIRE from __printk_cpu_sync_try_get:A to
         * __printk_cpu_sync_try_get:B of the next CPU
         */
        /* Outermost put: mark the lock as unowned (-1). */
        atomic_set_release(&printk_cpu_sync_owner,
                           -1); /* LMM(__printk_cpu_sync_put:B) */
}
EXPORT_SYMBOL(__printk_cpu_sync_put);
#endif /* CONFIG_SMP */















































    1 

































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Integer base 2 logarithm calculation
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_LOG2_H
#define _LINUX_LOG2_H

#include <linux/types.h>
#include <linux/bitops.h>

/*
 * non-constant log of base 2 calculators
 * - the arch may override these in asm/bitops.h if they can be implemented
 *   more efficiently than using fls() and fls64()
 * - the arch is not required to handle n==0 if implementing the fallback
 */
#ifndef CONFIG_ARCH_HAS_ILOG2_U32
static __always_inline __attribute__((const))
int __ilog2_u32(u32 n)
{
        /* fls() result minus one is the log2 returned by this fallback. */
        const int top_bit = fls(n);

        return top_bit - 1;
}
#endif

#ifndef CONFIG_ARCH_HAS_ILOG2_U64
static __always_inline __attribute__((const))
int __ilog2_u64(u64 n)
{
        /* fls64() result minus one is the log2 returned by this fallback. */
        const int top_bit = fls64(n);

        return top_bit - 1;
}
#endif

/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Determine whether some value is a power of two, where zero is
 * *not* considered a power of two.
 * Return: true if @n is a power of 2, otherwise false.
 */
static __always_inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
        /* Zero is explicitly not a power of two. */
        if (n == 0)
                return false;

        /* Clearing the lowest set bit leaves zero iff only one bit was set. */
        return (n & (n - 1)) == 0;
}

/**
 * __roundup_pow_of_two() - round up to nearest power of two
 * @n: value to round up
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
        /* Work on n - 1 so that an exact power of two maps to itself. */
        const unsigned long below = n - 1;

        return 1UL << fls_long(below);
}

/**
 * __rounddown_pow_of_two() - round down to nearest power of two
 * @n: value to round down
 */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
        /*
         * Shift 1 to the position fls_long() reports for @n, minus one.
         * NOTE(review): for n == 0 the shift count depends on fls_long(0);
         * callers presumably must not pass 0 - confirm.
         */
        return 1UL << (fls_long(n) - 1);
}

/**
 * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value
 * @n: parameter
 *
 * Use this where sparse expects a true constant expression, e.g. for array
 * indices.
 *
 * The expansion tests bits 63 down to 2 of @n in turn and yields the index
 * of the highest set bit. It evaluates to 0 when @n < 2 and to -1 when @n is
 * not a compile-time constant (as reported by __builtin_constant_p()).
 */
#define const_ilog2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                (n) < 2 ? 0 :                        \
                (n) & (1ULL << 63) ? 63 :        \
                (n) & (1ULL << 62) ? 62 :        \
                (n) & (1ULL << 61) ? 61 :        \
                (n) & (1ULL << 60) ? 60 :        \
                (n) & (1ULL << 59) ? 59 :        \
                (n) & (1ULL << 58) ? 58 :        \
                (n) & (1ULL << 57) ? 57 :        \
                (n) & (1ULL << 56) ? 56 :        \
                (n) & (1ULL << 55) ? 55 :        \
                (n) & (1ULL << 54) ? 54 :        \
                (n) & (1ULL << 53) ? 53 :        \
                (n) & (1ULL << 52) ? 52 :        \
                (n) & (1ULL << 51) ? 51 :        \
                (n) & (1ULL << 50) ? 50 :        \
                (n) & (1ULL << 49) ? 49 :        \
                (n) & (1ULL << 48) ? 48 :        \
                (n) & (1ULL << 47) ? 47 :        \
                (n) & (1ULL << 46) ? 46 :        \
                (n) & (1ULL << 45) ? 45 :        \
                (n) & (1ULL << 44) ? 44 :        \
                (n) & (1ULL << 43) ? 43 :        \
                (n) & (1ULL << 42) ? 42 :        \
                (n) & (1ULL << 41) ? 41 :        \
                (n) & (1ULL << 40) ? 40 :        \
                (n) & (1ULL << 39) ? 39 :        \
                (n) & (1ULL << 38) ? 38 :        \
                (n) & (1ULL << 37) ? 37 :        \
                (n) & (1ULL << 36) ? 36 :        \
                (n) & (1ULL << 35) ? 35 :        \
                (n) & (1ULL << 34) ? 34 :        \
                (n) & (1ULL << 33) ? 33 :        \
                (n) & (1ULL << 32) ? 32 :        \
                (n) & (1ULL << 31) ? 31 :        \
                (n) & (1ULL << 30) ? 30 :        \
                (n) & (1ULL << 29) ? 29 :        \
                (n) & (1ULL << 28) ? 28 :        \
                (n) & (1ULL << 27) ? 27 :        \
                (n) & (1ULL << 26) ? 26 :        \
                (n) & (1ULL << 25) ? 25 :        \
                (n) & (1ULL << 24) ? 24 :        \
                (n) & (1ULL << 23) ? 23 :        \
                (n) & (1ULL << 22) ? 22 :        \
                (n) & (1ULL << 21) ? 21 :        \
                (n) & (1ULL << 20) ? 20 :        \
                (n) & (1ULL << 19) ? 19 :        \
                (n) & (1ULL << 18) ? 18 :        \
                (n) & (1ULL << 17) ? 17 :        \
                (n) & (1ULL << 16) ? 16 :        \
                (n) & (1ULL << 15) ? 15 :        \
                (n) & (1ULL << 14) ? 14 :        \
                (n) & (1ULL << 13) ? 13 :        \
                (n) & (1ULL << 12) ? 12 :        \
                (n) & (1ULL << 11) ? 11 :        \
                (n) & (1ULL << 10) ? 10 :        \
                (n) & (1ULL <<  9) ?  9 :        \
                (n) & (1ULL <<  8) ?  8 :        \
                (n) & (1ULL <<  7) ?  7 :        \
                (n) & (1ULL <<  6) ?  6 :        \
                (n) & (1ULL <<  5) ?  5 :        \
                (n) & (1ULL <<  4) ?  4 :        \
                (n) & (1ULL <<  3) ?  3 :        \
                (n) & (1ULL <<  2) ?  2 :        \
                1) :                                \
        -1)

/**
 * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
 * @n: parameter
 *
 * constant-capable log of base 2 calculation
 * - this can be used to initialise global variables from constant data: for
 *   a compile-time constant @n the result is 0 when @n < 2 and
 *   63 - __builtin_clzll(n) otherwise
 * - for a non-constant @n it selects the appropriately-sized optimised
 *   version (__ilog2_u32() or __ilog2_u64()) depending on sizeof(n)
 */
#define ilog2(n) \
( \
        __builtin_constant_p(n) ?        \
        ((n) < 2 ? 0 :                        \
         63 - __builtin_clzll(n)) :        \
        (sizeof(n) <= 4) ?                \
        __ilog2_u32(n) :                \
        __ilog2_u64(n)                        \
 )

/**
 * roundup_pow_of_two - round the given value up to nearest power of two
 * @n: parameter
 *
 * round the given value up to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 * - a compile-time constant @n is handled inline (1 for n == 1, otherwise
 *   1UL << (ilog2(n - 1) + 1)); any other @n is passed to
 *   __roundup_pow_of_two()
 */
#define roundup_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 1) ? 1 :                \
                (1UL << (ilog2((n) - 1) + 1))        \
                                   ) :                \
        __roundup_pow_of_two(n)                        \
 )

/**
 * rounddown_pow_of_two - round the given value down to nearest power of two
 * @n: parameter
 *
 * round the given value down to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 * - a compile-time constant @n is handled inline as 1UL << ilog2(n); any
 *   other @n is passed to __rounddown_pow_of_two()
 */
#define rounddown_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                (1UL << ilog2(n))) :                \
        __rounddown_pow_of_two(n)                \
 )

static inline __attribute_const__
int __order_base_2(unsigned long n)
{
        /* Both 0 and 1 have order 0. */
        if (n <= 1)
                return 0;

        /* Rounded-up log2: log2 of (n - 1), plus one. */
        return ilog2(n - 1) + 1;
}

/**
 * order_base_2 - calculate the (rounded up) base 2 order of the argument
 * @n: parameter
 *
 * A compile-time constant @n is evaluated inline via ilog2(); any other @n
 * is passed to __order_base_2().
 *
 * The first few values calculated by this routine:
 *  ob2(0) = 0
 *  ob2(1) = 0
 *  ob2(2) = 1
 *  ob2(3) = 2
 *  ob2(4) = 2
 *  ob2(5) = 3
 *  ... and so on.
 */
#define order_base_2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 0 || (n) == 1) ? 0 :        \
                ilog2((n) - 1) + 1) :                \
        __order_base_2(n)                        \
)

static inline __attribute__((const))
int __bits_per(unsigned long n)
{
        int order;

        /* 0 and 1 both need a single bit. */
        if (n < 2)
                return 1;

        /*
         * order_base_2() rounds up, so an exact power of two needs one
         * more bit than its order.
         */
        order = order_base_2(n);

        return is_power_of_2(n) ? order + 1 : order;
}

/**
 * bits_per - calculate the number of bits required for the argument
 * @n: parameter
 *
 * This is constant-capable and can be used for compile time
 * initializations, e.g. bitfields. A compile-time constant @n is evaluated
 * inline as ilog2(n) + 1 (or 1 for n < 2); any other @n is passed to
 * __bits_per().
 *
 * The first few values calculated by this routine:
 * bits_per(0) = 1
 * bits_per(1) = 1
 * bits_per(2) = 2
 * bits_per(3) = 2
 * bits_per(4) = 3
 * ... and so on.
 */
#define bits_per(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 0 || (n) == 1)                \
                        ? 1 : ilog2(n) + 1        \
        ) :                                        \
        __bits_per(n)                                \
)

/**
 * max_pow_of_two_factor - return highest power-of-2 factor
 * @n: parameter
 *
 * Find the highest power of 2 that evenly divides @n, i.e. isolate the
 * lowest set bit of @n.
 *
 * Return: 0 for n == 0; otherwise the largest power of 2 dividing @n, which
 * is 1 for every odd @n (including n == 1).
 */
static inline __attribute__((const))
unsigned int max_pow_of_two_factor(unsigned int n)
{
        /*
         * In unsigned arithmetic -n equals ~n + 1, so n & -n keeps only
         * the lowest set bit of n.
         */
        return n & -n;
}

#endif /* _LINUX_LOG2_H */

















  317 
  311 
  317 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/kernel.h>
#include <linux/nospec.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/wordpart.h>

/* out-of-line parts */

#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST)
/*
 * Out-of-line entry point for copying from user space: forwards all
 * arguments to _inline_copy_from_user() and returns its result unchanged.
 */
unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
{
        return _inline_copy_from_user(to, from, n);
}
EXPORT_SYMBOL(_copy_from_user);
#endif

#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST)
/*
 * Out-of-line entry point for copying to user space: forwards all
 * arguments to _inline_copy_to_user() and returns its result unchanged.
 */
unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
{
        return _inline_copy_to_user(to, from, n);
}
EXPORT_SYMBOL(_copy_to_user);
#endif

/**
 * check_zeroed_user: check if a userspace buffer only contains zero bytes
 * @from: Source address, in userspace.
 * @size: Size of buffer.
 *
 * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
 * userspace addresses (and is more efficient because we don't care where the
 * first non-zero byte is).
 *
 * Returns:
 *  * 0: There were non-zero bytes present in the buffer.
 *  * 1: The buffer was full of zero bytes.
 *  * -EFAULT: access to userspace failed.
 */
int check_zeroed_user(const void __user *from, size_t size)
{
        unsigned long val;
        /* Byte offset of @from within an unsigned-long-sized word. */
        uintptr_t align = (uintptr_t) from % sizeof(unsigned long);

        /* An empty buffer trivially contains only zero bytes. */
        if (unlikely(size == 0))
                return 1;

        /*
         * Round the start down to a word boundary and grow the size by the
         * same amount, so the buffer can be read one whole word at a time.
         */
        from -= align;
        size += align;

        if (!user_read_access_begin(from, size))
                return -EFAULT;

        unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        /*
         * Ignore the bytes of the first word that precede the caller's
         * buffer (presumably what aligned_byte_mask(align) selects -
         * confirm against linux/wordpart.h).
         */
        if (align)
                val &= ~aligned_byte_mask(align);

        /* Walk whole words; stop at the first one with a non-zero byte. */
        while (size > sizeof(unsigned long)) {
                if (unlikely(val))
                        goto done;

                from += sizeof(unsigned long);
                size -= sizeof(unsigned long);

                unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        }

        /* Partial last word: keep only the bytes inside the buffer. */
        if (size < sizeof(unsigned long))
                val &= aligned_byte_mask(size);

done:
        user_read_access_end();
        /* 1 when every examined byte was zero, 0 otherwise. */
        return (val == 0);
err_fault:
        /* Reached through unsafe_get_user() when a user access faults. */
        user_read_access_end();
        return -EFAULT;
}
EXPORT_SYMBOL(check_zeroed_user);
































































   10 















   10 



























   10 





   10 

   11 








    2 

   14 
   11 






    1 



   13 

   12 

   12 
   11 








   14 
    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IP Payload Compression Protocol (IPComp) - RFC3173.
 *
 * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
 *
 * Todo:
 *   - Tunable compression parameters.
 *   - Compression stats.
 *   - Adaptive compression.
 */
#include <linux/module.h>
#include <linux/err.h>
#include <linux/rtnetlink.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/icmp.h>
#include <net/ipcomp.h>
#include <net/protocol.h>
#include <net/sock.h>

/*
 * ICMP error handler for IPComp over IPv4.
 *
 * Only "fragmentation needed" destination-unreachable messages and redirects
 * are acted upon, and only when a matching IPComp state exists. Always
 * returns 0.
 */
static int ipcomp4_err(struct sk_buff *skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data + (iph->ihl << 2));
        struct net *net = dev_net(skb->dev);
        const bool pmtu_event = icmp_hdr(skb)->type == ICMP_DEST_UNREACH;
        struct xfrm_state *x;
        __be32 spi;

        if (pmtu_event) {
                if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
                        return 0;
        } else if (icmp_hdr(skb)->type != ICMP_REDIRECT) {
                return 0;
        }

        /* The 16-bit CPI is widened into the 32-bit SPI used for the lookup. */
        spi = htonl(ntohs(ipch->cpi));
        x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
                              spi, IPPROTO_COMP, AF_INET);
        if (!x)
                return 0;

        if (pmtu_event)
                ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
        else
                ipv4_redirect(skb, net, 0, IPPROTO_COMP);

        /* Drop the reference obtained by the lookup. */
        xfrm_state_put(x);

        return 0;
}

/* We always hold one tunnel user reference to indicate a tunnel */
static struct lock_class_key xfrm_state_lock_key;

/*
 * Allocate and initialise the IPIP state that accompanies tunnel-mode IPComp
 * state @x. Returns the new state with tunnel_users set to 1, or NULL when
 * allocation or xfrm_init_state() fails.
 */
static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
{
        struct net *net = xs_net(x);
        struct xfrm_state *t = xfrm_state_alloc(net);

        if (!t)
                return NULL;

        lockdep_set_class(&t->lock, &xfrm_state_lock_key);

        /* Identity of the tunnel state, derived from @x. */
        t->id.proto = IPPROTO_IPIP;
        t->id.spi = x->props.saddr.a4;
        t->id.daddr.a4 = x->id.daddr.a4;
        memcpy(&t->sel, &x->sel, sizeof(t->sel));

        /* Properties copied from @x. */
        t->props.family = AF_INET;
        t->props.mode = x->props.mode;
        t->props.saddr.a4 = x->props.saddr.a4;
        t->props.flags = x->props.flags;
        t->props.extra_flags = x->props.extra_flags;
        memcpy(&t->mark, &x->mark, sizeof(t->mark));
        t->if_id = x->if_id;

        if (xfrm_init_state(t)) {
                /* Mark the half-built state dead before dropping it. */
                t->km.state = XFRM_STATE_DEAD;
                xfrm_state_put(t);
                return NULL;
        }

        atomic_set(&t->tunnel_users, 1);

        return t;
}

/*
 * Must be protected by xfrm_cfg_mutex.  State and tunnel user references are
 * always incremented on success.
 */
static int ipcomp_tunnel_attach(struct xfrm_state *x)
{
        struct net *net = xs_net(x);
        u32 mark = x->mark.v & x->mark.m;
        struct xfrm_state *t;

        /* Reuse an existing IPIP state for this address pair when there is one. */
        t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
                              x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
        if (!t) {
                t = ipcomp_tunnel_create(x);
                if (!t)
                        return -EINVAL;

                xfrm_state_insert(t);
                xfrm_state_hold(t);
        }

        /* Link the tunnel state to @x and count @x as one more user. */
        x->tunnel = t;
        atomic_inc(&t->tunnel_users);

        return 0;
}

/*
 * Initialise IPv4 IPComp state @x.
 *
 * Accepts transport and tunnel mode only; tunnel mode additionally accounts
 * for an outer IPv4 header and attaches the companion tunnel state.
 * Returns 0 on success or a negative errno, with @extack describing the
 * failures reported here.
 */
static int ipcomp4_init_state(struct xfrm_state *x,
                              struct netlink_ext_ack *extack)
{
        int err;

        x->props.header_len = 0;

        if (x->props.mode == XFRM_MODE_TUNNEL) {
                x->props.header_len += sizeof(struct iphdr);
        } else if (x->props.mode != XFRM_MODE_TRANSPORT) {
                NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
                return -EINVAL;
        }

        err = ipcomp_init_state(x, extack);
        if (err)
                return err;

        /* Transport mode needs nothing further. */
        if (x->props.mode != XFRM_MODE_TUNNEL)
                return 0;

        err = ipcomp_tunnel_attach(x);
        if (err) {
                NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");
        }

        return err;
}

/*
 * Receive callback installed as ipcomp4_protocol.cb_handler. It ignores
 * both arguments and always returns 0.
 */
static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
{
        return 0;
}

/*
 * xfrm type descriptor for IPComp, registered for AF_INET in ipcomp4_init().
 * Only init_state is implemented in this file; destructor, input and output
 * are defined elsewhere (presumably the shared IPComp code behind
 * <net/ipcomp.h> - confirm).
 */
static const struct xfrm_type ipcomp_type = {
        .owner                = THIS_MODULE,
        .proto                     = IPPROTO_COMP,
        .init_state        = ipcomp4_init_state,
        .destructor        = ipcomp_destroy,
        .input                = ipcomp_input,
        .output                = ipcomp_output
};

/*
 * IPv4 xfrm protocol hooks for IPPROTO_COMP, registered in ipcomp4_init().
 * The callback and ICMP error handlers are the local ipcomp4_rcv_cb() and
 * ipcomp4_err(); the receive and input handlers are the generic xfrm ones.
 */
static struct xfrm4_protocol ipcomp4_protocol = {
        .handler        =        xfrm4_rcv,
        .input_handler        =        xfrm_input,
        .cb_handler        =        ipcomp4_rcv_cb,
        .err_handler        =        ipcomp4_err,
        .priority        =        0,
};

/*
 * Module init: register the xfrm type, then the IPv4 protocol hooks.
 * If the second step fails the first is rolled back. Any failure is
 * reported as -EAGAIN.
 */
static int __init ipcomp4_init(void)
{
        if (xfrm_register_type(&ipcomp_type, AF_INET) >= 0) {
                if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) >= 0)
                        return 0;

                pr_info("%s: can't add protocol\n", __func__);
                xfrm_unregister_type(&ipcomp_type, AF_INET);
        } else {
                pr_info("%s: can't add xfrm type\n", __func__);
        }

        return -EAGAIN;
}

/*
 * Module exit: undo ipcomp4_init() in reverse order. A failure to
 * deregister the protocol is only logged; the xfrm type is unregistered
 * regardless.
 */
static void __exit ipcomp4_fini(void)
{
        if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
                pr_info("%s: can't remove protocol\n", __func__);
        xfrm_unregister_type(&ipcomp_type, AF_INET);
}

module_init(ipcomp4_init);
module_exit(ipcomp4_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");

MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);










   39 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM printk

#if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PRINTK_H

#include <linux/tracepoint.h>

/*
 * console tracepoint: records @len bytes of @text as a NUL-terminated
 * string, with one trailing newline (if present) removed.
 */
TRACE_EVENT(console,
        TP_PROTO(const char *text, size_t len),

        TP_ARGS(text, len),

        TP_STRUCT__entry(
                /* One byte more than @len, for the terminating NUL. */
                __dynamic_array(char, msg, len + 1)
        ),

        TP_fast_assign(
                /*
                 * Each trace entry is printed in a new line.
                 * If the msg finishes with '\n', cut it off
                 * to avoid blank lines in the trace.
                 */
                if ((len > 0) && (text[len-1] == '\n'))
                        len -= 1;

                /* Copy the (possibly shortened) text and terminate it. */
                memcpy(__get_str(msg), text, len);
                __get_str(msg)[len] = 0;
        ),

        TP_printk("%s", __get_str(msg))
);
#endif /* _TRACE_PRINTK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





































































































   39 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC64_64_H
#define _ASM_X86_ATOMIC64_64_H

#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>

/* The 64-bit atomic type */

/* Brace initializer for an atomic64_t: sets its first member to @i. */
#define ATOMIC64_INIT(i)        { (i) }

/* Return the current value of @v->counter, read with __READ_ONCE(). */
static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
        return __READ_ONCE((v)->counter);
}

/* Store @i into @v->counter with __WRITE_ONCE(). */
static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
        __WRITE_ONCE(v->counter, i);
}

/*
 * Add @i to @v->counter with a single LOCK_PREFIX'ed addq; nothing is
 * returned. The "memory" clobber also makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
        asm_inline volatile(LOCK_PREFIX "addq %1, %0"
                     : "=m" (v->counter)
                     : "er" (i), "m" (v->counter) : "memory");
}

/*
 * Subtract @i from @v->counter with a single LOCK_PREFIX'ed subq; nothing
 * is returned. The "memory" clobber also makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
        asm_inline volatile(LOCK_PREFIX "subq %1, %0"
                     : "=m" (v->counter)
                     : "er" (i), "m" (v->counter) : "memory");
}

/*
 * Subtract @i from @v->counter with a LOCK_PREFIX'ed subq and return the
 * "e" condition produced by it (presumably: true when the result is zero -
 * confirm against the GEN_BINARY_RMWcc() definition).
 */
static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test

/*
 * Increment @v->counter with a single LOCK_PREFIX'ed incq; nothing is
 * returned. The "memory" clobber also makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
        asm_inline volatile(LOCK_PREFIX "incq %0"
                     : "=m" (v->counter)
                     : "m" (v->counter) : "memory");
}
#define arch_atomic64_inc arch_atomic64_inc

/*
 * Decrement @v->counter with a single LOCK_PREFIX'ed decq; nothing is
 * returned. The "memory" clobber also makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
        asm_inline volatile(LOCK_PREFIX "decq %0"
                     : "=m" (v->counter)
                     : "m" (v->counter) : "memory");
}
#define arch_atomic64_dec arch_atomic64_dec

/*
 * Decrement @v->counter with a LOCK_PREFIX'ed decq and return the "e"
 * condition produced by it (presumably: true when the result is zero -
 * confirm against the GEN_UNARY_RMWcc() definition).
 */
static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test

/*
 * Increment @v->counter with a LOCK_PREFIX'ed incq and return the "e"
 * condition produced by it (presumably: true when the result is zero -
 * confirm against the GEN_UNARY_RMWcc() definition).
 */
static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test

/*
 * Add @i to @v->counter with a LOCK_PREFIX'ed addq and return the "s"
 * condition produced by it (presumably: true when the result is negative -
 * confirm against the GEN_BINARY_RMWcc() definition).
 */
static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
#define arch_atomic64_add_negative arch_atomic64_add_negative

/*
 * Add @i to @v->counter via xadd() and return @i plus xadd()'s result
 * (presumably the value before the addition, giving the new value -
 * confirm against the xadd() definition).
 */
static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
        return i + xadd(&v->counter, i);
}
#define arch_atomic64_add_return arch_atomic64_add_return

/* Subtraction is expressed as addition of the negated operand. */
#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)

/*
 * Add @i to @v->counter via xadd() and return xadd()'s result unchanged
 * (presumably the value before the addition - confirm against the xadd()
 * definition).
 */
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
        return xadd(&v->counter, i);
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add

/* Subtraction is expressed as addition of the negated operand. */
#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)

/*
 * Compare-and-exchange on @v->counter: forwards @old and @new to
 * arch_cmpxchg() and returns its result.
 */
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg

/*
 * Try-compare-and-exchange on @v->counter: forwards the @old pointer and
 * @new to arch_try_cmpxchg() and returns its boolean result.
 */
static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        return arch_try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg

/*
 * Exchange @v->counter with @new: forwards to arch_xchg() and returns its
 * result.
 */
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
{
        return arch_xchg(&v->counter, new);
}
#define arch_atomic64_xchg arch_atomic64_xchg

/*
 * AND @i into @v->counter with a single LOCK_PREFIX'ed andq; nothing is
 * returned. The "memory" clobber also makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
        asm_inline volatile(LOCK_PREFIX "andq %1, %0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * AND @i into @v->counter and return the value the counter held when the
 * exchange succeeded.
 */
static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        /*
         * Retry until the exchange succeeds; each attempt recomputes the
         * new value from @seen.
         */
        while (!arch_atomic64_try_cmpxchg(v, &seen, seen & i))
                ;

        return seen;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and

/*
 * OR @i into @v->counter with a single LOCK_PREFIX'ed orq; nothing is
 * returned. The "memory" clobber also makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
        asm_inline volatile(LOCK_PREFIX "orq %1, %0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * OR @i into @v->counter and return the value the counter held when the
 * exchange succeeded.
 */
static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        /*
         * Retry until the exchange succeeds; each attempt recomputes the
         * new value from @seen.
         */
        while (!arch_atomic64_try_cmpxchg(v, &seen, seen | i))
                ;

        return seen;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or

/*
 * XOR @i into @v->counter with a single LOCK_PREFIX'ed xorq; nothing is
 * returned. The "memory" clobber also makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
        asm_inline volatile(LOCK_PREFIX "xorq %1, %0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * XOR @i into @v->counter and return the value the counter held when the
 * exchange succeeded.
 */
static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        /*
         * Retry until the exchange succeeds; each attempt recomputes the
         * new value from @seen.
         */
        while (!arch_atomic64_try_cmpxchg(v, &seen, seen ^ i))
                ;

        return seen;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor

#endif /* _ASM_X86_ATOMIC64_64_H */





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Type definitions for the multi-level security (MLS) policy.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *          Support for enhanced MLS infrastructure.
 *          Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 */

#ifndef _SS_MLS_TYPES_H_
#define _SS_MLS_TYPES_H_

#include "security.h"
#include "ebitmap.h"

/* One MLS level: a sensitivity value plus a category bitmap. */
struct mls_level {
        u32 sens; /* sensitivity */
        struct ebitmap cat; /* category set */
};

/* An MLS range expressed as a pair of levels: level[0] is low, level[1] is high. */
struct mls_range {
        struct mls_level level[2]; /* low == level[0], high == level[1] */
};

/*
 * Return 1 when both levels carry the same sensitivity value and
 * ebitmap_equal() reports their category bitmaps as equal; 0 otherwise.
 */
static inline int mls_level_eq(const struct mls_level *l1,
                               const struct mls_level *l2)
{
        if (l1->sens != l2->sens)
                return 0;

        return ebitmap_equal(&l1->cat, &l2->cat) ? 1 : 0;
}

/*
 * Return 1 when @l1's sensitivity is at least @l2's and
 * ebitmap_contains(&l1->cat, &l2->cat, 0) holds; 0 otherwise.
 */
static inline int mls_level_dom(const struct mls_level *l1,
                                const struct mls_level *l2)
{
        if (l1->sens < l2->sens)
                return 0;

        return ebitmap_contains(&l1->cat, &l2->cat, 0) ? 1 : 0;
}

/* True when neither level passes mls_level_dom() against the other. */
#define mls_level_incomp(l1, l2) \
        (!mls_level_dom((l1), (l2)) && !mls_level_dom((l2), (l1)))

/*
 * True when mls_level_dom(l1, l2) and mls_level_dom(l3, l1) both hold.
 * Note that l1 is expanded twice.
 */
#define mls_level_between(l1, l2, l3) \
        (mls_level_dom((l1), (l2)) && mls_level_dom((l3), (l1)))

/*
 * True when r2's low level passes mls_level_dom() against r1's low level
 * and r1's high level passes it against r2's high level.  Takes the ranges
 * themselves (not pointers); each argument is expanded twice.
 */
#define mls_range_contains(r1, r2)                        \
        (mls_level_dom(&(r2).level[0], &(r1).level[0]) && \
         mls_level_dom(&(r1).level[1], &(r2).level[1]))

#endif /* _SS_MLS_TYPES_H_ */

































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* delayacct.h - per-task delay accounting
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 */

#ifndef _LINUX_DELAYACCT_H
#define _LINUX_DELAYACCT_H

#include <uapi/linux/taskstats.h>

#ifdef CONFIG_TASK_DELAY_ACCT
/*
 * Per-task delay accounting state.  The inline wrappers in this header
 * reach it through the task's ->delays pointer.  For each tracked kind of
 * delay the structure carries a start stamp (where applicable), an
 * accumulated delay, max/min values and an event count.
 */
struct task_delay_info {
        raw_spinlock_t        lock;

        /* For each stat XXX, add following, aligned appropriately
         *
         * struct timespec XXX_start, XXX_end;
         * u64 XXX_delay;
         * u32 XXX_count;
         *
         * Atomicity of updates to XXX_delay, XXX_count protected by
         * single lock above (split into XXX_lock if contention is an issue).
         */

        /*
         * XXX_count is incremented on every XXX operation, the delay
         * associated with the operation is added to XXX_delay.
         * XXX_delay contains the accumulated delay time in nanoseconds.
         */
        u64 blkio_start;
        u64 blkio_delay_max;
        u64 blkio_delay_min;
        u64 blkio_delay;        /* wait for sync block io completion */
        u64 swapin_start;
        u64 swapin_delay_max;
        u64 swapin_delay_min;
        u64 swapin_delay;        /* wait for swapin */
        u32 blkio_count;        /* total count of the number of sync block */
                                /* io operations performed */
        u32 swapin_count;        /* total count of swapin */

        u64 freepages_start;
        u64 freepages_delay_max;
        u64 freepages_delay_min;
        u64 freepages_delay;        /* wait for memory reclaim */

        u64 thrashing_start;
        u64 thrashing_delay_max;
        u64 thrashing_delay_min;
        u64 thrashing_delay;        /* wait for thrashing page */

        u64 compact_start;
        u64 compact_delay_max;
        u64 compact_delay_min;
        u64 compact_delay;        /* wait for memory compact */

        u64 wpcopy_start;
        u64 wpcopy_delay_max;
        u64 wpcopy_delay_min;
        u64 wpcopy_delay;        /* wait for write-protect copy */

        /* IRQ accounting has no start stamp; callers pass a delta instead. */
        u64 irq_delay_max;
        u64 irq_delay_min;
        u64 irq_delay;        /* wait for IRQ/SOFTIRQ */

        u32 freepages_count;        /* total count of memory reclaim */
        u32 thrashing_count;        /* total count of thrash waits */
        u32 compact_count;        /* total count of memory compact */
        u32 wpcopy_count;        /* total count of write-protect copy */
        u32 irq_count;        /* total count of IRQ/SOFTIRQ */
};
#endif

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/jump_label.h>

#ifdef CONFIG_TASK_DELAY_ACCT
/*
 * Out-of-line delay accounting entry points and state.  The static inline
 * wrappers below test delayacct_key / delayacct_on and the task's ->delays
 * pointer before calling into the __delayacct_*() functions.
 */
DECLARE_STATIC_KEY_FALSE(delayacct_key);
extern int delayacct_on;        /* Delay accounting turned on/off */
extern struct kmem_cache *delayacct_cache;
extern void delayacct_init(void);

extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio_start(void);
extern void __delayacct_blkio_end(struct task_struct *);
extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
extern void __delayacct_thrashing_start(bool *in_thrashing);
extern void __delayacct_thrashing_end(bool *in_thrashing);
extern void __delayacct_swapin_start(void);
extern void __delayacct_swapin_end(void);
extern void __delayacct_compact_start(void);
extern void __delayacct_compact_end(void);
extern void __delayacct_wpcopy_start(void);
extern void __delayacct_wpcopy_end(void);
extern void __delayacct_irq(struct task_struct *task, u32 delta);

/*
 * Reset @tsk->delays and, when delay accounting is switched on, let
 * __delayacct_tsk_init() set the task up.
 */
static inline void delayacct_tsk_init(struct task_struct *tsk)
{
        /* The pointer may be a copy of the parent's; never keep it. */
        tsk->delays = NULL;

        if (!delayacct_on)
                return;

        __delayacct_tsk_init(tsk);
}

/* Free tsk->delays. Called from bad fork and __put_task_struct
 * where there's no risk of tsk->delays being accessed elsewhere
 */
static inline void delayacct_tsk_free(struct task_struct *tsk)
{
        struct task_delay_info *info = tsk->delays;

        /* Hand the accounting object, if any, back to delayacct_cache. */
        if (info)
                kmem_cache_free(delayacct_cache, info);

        tsk->delays = NULL;
}

/*
 * Call __delayacct_blkio_start() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_blkio_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_blkio_start();
}

/*
 * Call __delayacct_blkio_end(@p) when delayacct_key is enabled and @p has
 * a ->delays object.  Unlike the _start side this acts on @p, not current.
 */
static inline void delayacct_blkio_end(struct task_struct *p)
{
        if (static_branch_unlikely(&delayacct_key) && p->delays)
                __delayacct_blkio_end(p);
}

/*
 * Return __delayacct_blkio_ticks(@tsk), or 0 for a task without a ->delays
 * object.  The static key is not consulted here.
 */
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
        if (!tsk->delays)
                return 0;

        return __delayacct_blkio_ticks(tsk);
}

/*
 * Call __delayacct_freepages_start() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_freepages_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_freepages_start();
}

/*
 * Call __delayacct_freepages_end() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_freepages_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_freepages_end();
}

/*
 * Forward @in_thrashing to __delayacct_thrashing_start() when delayacct_key
 * is enabled and the current task has a ->delays object.
 */
static inline void delayacct_thrashing_start(bool *in_thrashing)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_thrashing_start(in_thrashing);
}

/*
 * Forward @in_thrashing to __delayacct_thrashing_end() when delayacct_key
 * is enabled and the current task has a ->delays object.
 */
static inline void delayacct_thrashing_end(bool *in_thrashing)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_thrashing_end(in_thrashing);
}

/*
 * Call __delayacct_swapin_start() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_swapin_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_swapin_start();
}

/*
 * Call __delayacct_swapin_end() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_swapin_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_swapin_end();
}

/*
 * Call __delayacct_compact_start() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_compact_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_compact_start();
}

/*
 * Call __delayacct_compact_end() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_compact_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_compact_end();
}

/*
 * Call __delayacct_wpcopy_start() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_wpcopy_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_wpcopy_start();
}

/*
 * Call __delayacct_wpcopy_end() when delayacct_key is enabled and the
 * current task has a ->delays object.
 */
static inline void delayacct_wpcopy_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_wpcopy_end();
}

/*
 * Forward @delta to __delayacct_irq() for @task when delayacct_key is
 * enabled and @task has a ->delays object.
 */
static inline void delayacct_irq(struct task_struct *task, u32 delta)
{
        if (static_branch_unlikely(&delayacct_key) && task->delays)
                __delayacct_irq(task, delta);
}

#else
/*
 * CONFIG_TASK_DELAY_ACCT is off: every hook is an empty inline, and the
 * value-returning ones report 0.
 */
static inline void delayacct_init(void) { }
static inline void delayacct_tsk_init(struct task_struct *tsk) { }
static inline void delayacct_tsk_free(struct task_struct *tsk) { }
static inline void delayacct_blkio_start(void) { }
static inline void delayacct_blkio_end(struct task_struct *p) { }
static inline int delayacct_add_tsk(struct taskstats *d,
                                    struct task_struct *tsk)
{
        return 0;
}
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
        return 0;
}
static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{
        return 0;
}
static inline void delayacct_freepages_start(void) { }
static inline void delayacct_freepages_end(void) { }
static inline void delayacct_thrashing_start(bool *in_thrashing) { }
static inline void delayacct_thrashing_end(bool *in_thrashing) { }
static inline void delayacct_swapin_start(void) { }
static inline void delayacct_swapin_end(void) { }
static inline void delayacct_compact_start(void) { }
static inline void delayacct_compact_end(void) { }
static inline void delayacct_wpcopy_start(void) { }
static inline void delayacct_wpcopy_end(void) { }
static inline void delayacct_irq(struct task_struct *task, u32 delta) { }

#endif /* CONFIG_TASK_DELAY_ACCT */

#endif









































   13 

   13 


























































































































































































































































































   13 

































   13 

   13 
   13 
















   12 





















   13 




   13 






   11 







   13 







   13 


   13 



   13 


   13 


   13 
















   13 

   12 

   13 
   13 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
// SPDX-License-Identifier: GPL-2.0-only
/*
 * umh - the kernel usermode helper
 */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/binfmts.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fs_struct.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/resource.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
#include <linux/initrd.h>
#include <linux/freezer.h>

#include <trace/events/module.h>

/*
 * Capability masks intersected into every helper's credentials in
 * call_usermodehelper_exec_async().  Both masks are read and narrowed
 * under umh_sysctl_lock.  umhelper_sem guards usermodehelper_disabled.
 */
static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);

/*
 * Run the optional ->cleanup() callback on @info, then kfree() it.
 */
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
        void (*cleanup)(struct subprocess_info *) = info->cleanup;

        if (cleanup)
                cleanup(info);

        kfree(info);
}

/*
 * Finish a helper request: either wake the waiter recorded in
 * @sub_info->complete, or, when there is none, free @sub_info here.
 */
static void umh_complete(struct subprocess_info *sub_info)
{
        struct completion *waiter = xchg(&sub_info->complete, NULL);

        /*
         * See call_usermodehelper_exec(). A NULL result from xchg() means
         * sub_info is ours: the UMH_KILLABLE caller has gone away or the
         * caller used UMH_NO_WAIT.
         */
        if (!waiter) {
                call_usermodehelper_freeinfo(sub_info);
                return;
        }

        complete(waiter);
}

/*
 * This is the task which runs the usermode application
 */
static int call_usermodehelper_exec_async(void *data)
{
        struct subprocess_info *sub_info = data;
        struct cred *new;
        int retval;

        /* Reset signal handlers under siglock before becoming a user task. */
        spin_lock_irq(&current->sighand->siglock);
        flush_signal_handlers(current, 1);
        spin_unlock_irq(&current->sighand->siglock);

        /*
         * Initial kernel threads share their FS with init, in order to
         * get the init root directory. But we've now created a new
         * thread that is going to execve a user process and has its own
         * 'struct fs_struct'. Reset umask to the default.
         */
        current->fs->umask = 0022;

        /*
         * Our parent (unbound workqueue) runs with elevated scheduling
         * priority. Avoid propagating that into the userspace child.
         */
        set_user_nice(current, 0);

        /* Preset the error reported if credential allocation fails. */
        retval = -ENOMEM;
        new = prepare_kernel_cred(current);
        if (!new)
                goto out;

        /* Clamp the new creds to the sysctl-controlled capability masks. */
        spin_lock(&umh_sysctl_lock);
        new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
        new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
                                             new->cap_inheritable);
        spin_unlock(&umh_sysctl_lock);

        /* Optional caller hook; a non-zero return aborts before exec. */
        if (sub_info->init) {
                retval = sub_info->init(sub_info, new);
                if (retval) {
                        abort_creds(new);
                        goto out;
                }
        }

        commit_creds(new);

        wait_for_initramfs();
        retval = kernel_execve(sub_info->path,
                               (const char *const *)sub_info->argv,
                               (const char *const *)sub_info->envp);
out:
        sub_info->retval = retval;
        /*
         * call_usermodehelper_exec_sync() will call umh_complete
         * if UMH_WAIT_PROC.
         */
        if (!(sub_info->wait & UMH_WAIT_PROC))
                umh_complete(sub_info);
        /* retval == 0: return 0 to our caller; any failure ends the task. */
        if (!retval)
                return 0;
        do_exit(0);
}

/* Handles UMH_WAIT_PROC.  */
static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
        pid_t child;

        /* With SIGCHLD ignored, do_wait would not populate the status. */
        kernel_sigaction(SIGCHLD, SIG_DFL);

        child = user_mode_thread(call_usermodehelper_exec_async, sub_info,
                                 SIGCHLD);
        if (child >= 0)
                kernel_wait(child, &sub_info->retval);
        else
                sub_info->retval = child;

        /* Put the default kernel SIGCHLD disposition back. */
        kernel_sigaction(SIGCHLD, SIG_IGN);

        umh_complete(sub_info);
}

/*
 * We need to create the usermodehelper kernel thread from a task that is affine
 * to an optimized set of CPUs (or nohz housekeeping ones) such that they
 * inherit a widest affinity irrespective of call_usermodehelper() callers with
 * possibly reduced affinity (eg: per-cpu workqueues). We don't want
 * usermodehelper targets to contend a busy CPU.
 *
 * Unbound workqueues provide such wide affinity and allow to block on
 * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
 *
 * Besides, workqueues provide the privilege level that caller might not have
 * to perform the usermodehelper request.
 *
 */
static void call_usermodehelper_exec_work(struct work_struct *work)
{
        struct subprocess_info *info =
                container_of(work, struct subprocess_info, work);
        pid_t pid;

        /* UMH_WAIT_PROC requests are spawned and waited for synchronously. */
        if (info->wait & UMH_WAIT_PROC) {
                call_usermodehelper_exec_sync(info);
                return;
        }

        /*
         * CLONE_PARENT reparents the helper to kthreadd: we do not want
         * to pollute current->children, and auto-reaping needs a parent
         * that always ignores SIGCHLD.
         */
        pid = user_mode_thread(call_usermodehelper_exec_async, info,
                               CLONE_PARENT | SIGCHLD);
        if (pid >= 0)
                return;

        /* Spawning failed: record the error and complete the request. */
        info->retval = pid;
        umh_complete(info);
}

/*
 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
 * (used for preventing user land processes from being created after the user
 * land has been frozen during a system-wide hibernation or suspend operation).
 * Should always be manipulated under umhelper_sem acquired for write.
 */
static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;

/* Number of helpers running */
static atomic_t running_helpers = ATOMIC_INIT(0);

/*
 * Wait queue head used by usermodehelper_disable() to wait for all running
 * helpers to finish.
 */
static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);

/*
 * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
 * to become 'false'.
 */
static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);

/*
 * Time to wait for running_helpers to become zero before the setting of
 * usermodehelper_disabled in usermodehelper_disable() fails
 */
#define RUNNING_HELPERS_TIMEOUT        (5 * HZ)

/*
 * Take umhelper_sem for reading once usermodehelper_disabled is clear.
 *
 * Returns 0 with umhelper_sem held for read, or -EAGAIN with the
 * semaphore released when usermodehelper_disabled == UMH_DISABLED.  For
 * any other non-zero state the task sleeps in TASK_INTERRUPTIBLE on
 * usermodehelper_disabled_waitq, calls try_to_freeze() and rechecks.
 */
int usermodehelper_read_trylock(void)
{
        DEFINE_WAIT(wait);
        int ret = 0;

        down_read(&umhelper_sem);
        for (;;) {
                /* Queue ourselves before testing the state we wait on. */
                prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
                                TASK_INTERRUPTIBLE);
                if (!usermodehelper_disabled)
                        break;

                if (usermodehelper_disabled == UMH_DISABLED)
                        ret = -EAGAIN;

                /* Drop the lock on both the failure and the sleep path. */
                up_read(&umhelper_sem);

                if (ret)
                        break;

                schedule();
                try_to_freeze();

                down_read(&umhelper_sem);
        }
        finish_wait(&usermodehelper_disabled_waitq, &wait);
        return ret;
}
EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);

/*
 * Wait up to @timeout for usermodehelper_disabled to clear, then return
 * with umhelper_sem held for read.
 *
 * Returns -EINVAL for a negative @timeout.  If schedule_timeout() runs
 * the budget down to 0 the function returns 0 with the semaphore
 * released.  Otherwise it returns the remaining timeout with
 * umhelper_sem held for read.
 *
 * NOTE(review): with @timeout == 0 and helpers already enabled this
 * returns 0 while holding the semaphore, which looks the same as the
 * timed-out result -- confirm callers never pass 0.
 */
long usermodehelper_read_lock_wait(long timeout)
{
        DEFINE_WAIT(wait);

        if (timeout < 0)
                return -EINVAL;

        down_read(&umhelper_sem);
        for (;;) {
                /* Queue ourselves before testing the state we wait on. */
                prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (!usermodehelper_disabled)
                        break;

                up_read(&umhelper_sem);

                timeout = schedule_timeout(timeout);
                if (!timeout)
                        break;

                down_read(&umhelper_sem);
        }
        finish_wait(&usermodehelper_disabled_waitq, &wait);
        return timeout;
}
EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);

/*
 * Release the read side of umhelper_sem taken by a successful
 * usermodehelper_read_trylock() or usermodehelper_read_lock_wait().
 */
void usermodehelper_read_unlock(void)
{
        up_read(&umhelper_sem);
}
EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);

/**
 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
 * @depth: New value to assign to usermodehelper_disabled.
 *
 * Change the value of usermodehelper_disabled (under umhelper_sem locked for
 * writing) and wakeup tasks waiting for it to change.
 */
void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
{
        down_write(&umhelper_sem);
        usermodehelper_disabled = depth;
        /* Let sleepers in the read_trylock/read_lock_wait loops recheck. */
        wake_up(&usermodehelper_disabled_waitq);
        up_write(&umhelper_sem);
}

/**
 * __usermodehelper_disable - Prevent new helpers from being started.
 * @depth: New value to assign to usermodehelper_disabled.
 *
 * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
 */
int __usermodehelper_disable(enum umh_disable_depth depth)
{
        long remaining;

        /* A zero depth would mean "enabled"; that is not a disable request. */
        if (!depth)
                return -EINVAL;

        down_write(&umhelper_sem);
        usermodehelper_disabled = depth;
        up_write(&umhelper_sem);

        /*
         * call_usermodehelper_exec() refuses to start new helpers from
         * here on, so seeing running_helpers hit zero once is enough
         * (a later increase doesn't matter).
         */
        remaining = wait_event_timeout(running_helpers_waitq,
                                       atomic_read(&running_helpers) == 0,
                                       RUNNING_HELPERS_TIMEOUT);
        if (!remaining) {
                /* Timed out with helpers still running: re-enable and fail. */
                __usermodehelper_set_disable_depth(UMH_ENABLED);
                return -EAGAIN;
        }

        return 0;
}

/*
 * Count one more in-flight helper request.  The increment is followed by
 * smp_mb__after_atomic(); call_usermodehelper_exec() reads
 * usermodehelper_disabled right after calling this.
 */
static void helper_lock(void)
{
        atomic_inc(&running_helpers);
        smp_mb__after_atomic();
}

/*
 * Drop one in-flight helper request; the drop that reaches zero wakes
 * running_helpers_waitq, which __usermodehelper_disable() waits on.
 */
static void helper_unlock(void)
{
        if (!atomic_dec_and_test(&running_helpers))
                return;

        wake_up(&running_helpers_waitq);
}

/**
 * call_usermodehelper_setup - prepare to call a usermode helper
 * @path: path to usermode executable
 * @argv: arg vector for process
 * @envp: environment for process
 * @gfp_mask: gfp mask for memory allocation
 * @init: an init function
 * @cleanup: a cleanup function
 * @data: arbitrary context sensitive data
 *
 * Returns either %NULL on allocation failure, or a subprocess_info
 * structure.  This should be passed to call_usermodehelper_exec to
 * exec the process and free the structure.
 *
 * The init function is used to customize the helper process prior to
 * exec.  A non-zero return code causes the process to error out, exit,
 * and return the failure to the calling process
 *
 * The cleanup function is called just before the subprocess_info is about to
 * be freed.  This can be used for freeing the argv and envp.  The
 * Function must be runnable in either a process context or the
 * context in which call_usermodehelper_exec is called.
 */
struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
                char **envp, gfp_t gfp_mask,
                int (*init)(struct subprocess_info *info, struct cred *new),
                void (*cleanup)(struct subprocess_info *info),
                void *data)
{
        struct subprocess_info *info = kzalloc(sizeof(*info), gfp_mask);

        if (!info)
                return NULL;

        INIT_WORK(&info->work, call_usermodehelper_exec_work);

        /* A build-time static helper path, when configured, replaces @path. */
#ifdef CONFIG_STATIC_USERMODEHELPER
        info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
#else
        info->path = path;
#endif
        info->argv = argv;
        info->envp = envp;

        info->init = init;
        info->cleanup = cleanup;
        info->data = data;

        return info;
}
EXPORT_SYMBOL(call_usermodehelper_setup);

/**
 * call_usermodehelper_exec - start a usermode application
 * @sub_info: information about the subprocess
 * @wait: wait for the application to finish and return status.
 *        when UMH_NO_WAIT don't wait at all, but you get no useful error back
 *        when the program couldn't be exec'ed. This makes it safe to call
 *        from interrupt context.
 *
 * Runs a user-space application.  The application is started
 * asynchronously if wait is not set, and runs as a child of system workqueues.
 * (ie. it runs with full root capabilities and optimized affinity).
 *
 * Note: successful return value does not guarantee the helper was called at
 * all. You can't rely on sub_info->{init,cleanup} being called even for
 * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers
 * into a successful no-op.
 */
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
        unsigned int state = TASK_UNINTERRUPTIBLE;
        DECLARE_COMPLETION_ONSTACK(done);
        int retval = 0;

        /* No path at all: nothing was queued, so free sub_info here. */
        if (!sub_info->path) {
                call_usermodehelper_freeinfo(sub_info);
                return -EINVAL;
        }
        /* Count ourselves so __usermodehelper_disable() can wait for us. */
        helper_lock();
        if (usermodehelper_disabled) {
                retval = -EBUSY;
                goto out;
        }

        /*
         * If there is no binary for us to call, then just return and get out of
         * here.  This allows us to set STATIC_USERMODEHELPER_PATH to "" and
         * disable all call_usermodehelper() calls.
         */
        if (strlen(sub_info->path) == 0)
                goto out;

        /*
         * Set the completion pointer only if there is a waiter.
         * This makes it possible to use umh_complete to free
         * the data structure in case of UMH_NO_WAIT.
         */
        sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
        sub_info->wait = wait;

        queue_work(system_unbound_wq, &sub_info->work);
        if (wait == UMH_NO_WAIT)        /* task has freed sub_info */
                goto unlock;

        if (wait & UMH_FREEZABLE)
                state |= TASK_FREEZABLE;

        if (wait & UMH_KILLABLE) {
                retval = wait_for_completion_state(&done, state | TASK_KILLABLE);
                if (!retval)
                        goto wait_done;

                /* umh_complete() will see NULL and free sub_info */
                if (xchg(&sub_info->complete, NULL))
                        goto unlock;

                /*
                 * fallthrough; in case of -ERESTARTSYS now do uninterruptible
                 * wait_for_completion_state(). Since umh_complete() shall call
                 * complete() in a moment if xchg() above returned NULL, this
                 * uninterruptible wait_for_completion_state() will not block
                 * SIGKILL'ed processes for long.
                 */
        }
        wait_for_completion_state(&done, state);

wait_done:
        retval = sub_info->retval;
out:
        /* Reached only on paths where this function still owns sub_info. */
        call_usermodehelper_freeinfo(sub_info);
unlock:
        helper_unlock();
        return retval;
}
EXPORT_SYMBOL(call_usermodehelper_exec);

/**
 * call_usermodehelper() - prepare and start a usermode application
 * @path: path to usermode executable
 * @argv: arg vector for process
 * @envp: environment for process
 * @wait: wait for the application to finish and return status.
 *        when UMH_NO_WAIT don't wait at all, but you get no useful error back
 *        when the program couldn't be exec'ed. This makes it safe to call
 *        from interrupt context.
 *
 * This function is equivalent to using call_usermodehelper_setup() and
 * call_usermodehelper_exec().
 */
int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
{
        struct subprocess_info *info;
        gfp_t gfp_mask = GFP_KERNEL;

        /* UMH_NO_WAIT is usable from interrupt context: allocate atomically. */
        if (wait == UMH_NO_WAIT)
                gfp_mask = GFP_ATOMIC;

        info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
                                         NULL, NULL, NULL);
        if (!info)
                return -ENOMEM;

        return call_usermodehelper_exec(info, wait);
}
EXPORT_SYMBOL(call_usermodehelper);

#if defined(CONFIG_SYSCTL)
/*
 * sysctl handler for the usermodehelper capability masks.
 *
 * The mask that table->data points at is exposed as two unsigned longs,
 * low 32 bits first.  A write requires both CAP_SETPCAP and
 * CAP_SYS_MODULE and can only clear bits: the stored mask is intersected
 * with the value written.  Returns 0 on success, -EPERM for an
 * unprivileged write, or the negative error from proc_doulongvec_minmax().
 */
static int proc_cap_handler(const struct ctl_table *table, int write,
                         void *buffer, size_t *lenp, loff_t *ppos)
{
        struct ctl_table t;
        unsigned long cap_array[2];
        kernel_cap_t new_cap, *cap;
        int err;

        if (write && (!capable(CAP_SETPCAP) ||
                      !capable(CAP_SYS_MODULE)))
                return -EPERM;

        /*
         * convert from the global kernel_cap_t to the ulong array to print to
         * userspace if this is a read.
         *
         * Legacy format: capabilities are exposed as two 32-bit values
         */
        cap = table->data;
        spin_lock(&umh_sysctl_lock);
        cap_array[0] = (u32) cap->val;
        cap_array[1] = cap->val >> 32;
        spin_unlock(&umh_sysctl_lock);

        /* Work on a copy of the table entry that points at the local array. */
        t = *table;
        t.data = &cap_array;

        /*
         * actually read or write an array of ulongs from userspace.  Remember
         * these are least significant 32 bits first
         */
        err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
        if (err < 0)
                return err;

        new_cap.val = (u32)cap_array[0];
        new_cap.val += (u64)cap_array[1] << 32;

        /*
         * Drop everything not in the new_cap (but don't add things)
         */
        if (write) {
                spin_lock(&umh_sysctl_lock);
                *cap = cap_intersect(*cap, new_cap);
                spin_unlock(&umh_sysctl_lock);
        }

        return 0;
}

/*
 * sysctl entries registered under "kernel/usermodehelper" by
 * init_umh_sysctls(): "bset" and "inheritable", both mode 0600, sized for
 * two unsigned longs and handled by proc_cap_handler().
 */
static const struct ctl_table usermodehelper_table[] = {
        {
                .procname        = "bset",
                .data                = &usermodehelper_bset,
                .maxlen                = 2 * sizeof(unsigned long),
                .mode                = 0600,
                .proc_handler        = proc_cap_handler,
        },
        {
                .procname        = "inheritable",
                .data                = &usermodehelper_inheritable,
                .maxlen                = 2 * sizeof(unsigned long),
                .mode                = 0600,
                .proc_handler        = proc_cap_handler,
        },
};

/*
 * Register usermodehelper_table under "kernel/usermodehelper".
 * Always returns 0; run as an early initcall.
 */
static int __init init_umh_sysctls(void)
{
        register_sysctl_init("kernel/usermodehelper", usermodehelper_table);
        return 0;
}
early_initcall(init_umh_sysctls);
#endif /* CONFIG_SYSCTL */




















    3 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BH_H
#define _LINUX_BH_H

#include <linux/instruction_pointer.h>
#include <linux/preempt.h>

/*
 * __local_bh_disable_ip - add @cnt to the preempt count.
 *
 * With CONFIG_PREEMPT_RT or CONFIG_TRACE_IRQFLAGS the implementation is
 * out of line.  Otherwise it is inlined here as preempt_count_add(@cnt)
 * followed by a compiler barrier; @ip is not used by the inline variant.
 */
#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS)
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
        preempt_count_add(cnt);
        barrier();
}
#endif

/*
 * Calls __local_bh_disable_ip() with the caller's instruction pointer
 * (_THIS_IP_) and a count of SOFTIRQ_DISABLE_OFFSET.
 */
static inline void local_bh_disable(void)
{
        __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

extern void _local_bh_enable(void);
extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);

/*
 * Calls __local_bh_enable_ip() with the caller-supplied @ip and a count of
 * SOFTIRQ_DISABLE_OFFSET.
 */
static inline void local_bh_enable_ip(unsigned long ip)
{
        __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * Counterpart of local_bh_disable(): calls __local_bh_enable_ip() with
 * _THIS_IP_ and the same SOFTIRQ_DISABLE_OFFSET count.
 */
static inline void local_bh_enable(void)
{
        __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * local_bh_blocked() is implemented out of line on CONFIG_PREEMPT_RT;
 * on every other configuration it is a constant false.
 */
#ifdef CONFIG_PREEMPT_RT
extern bool local_bh_blocked(void);
#else
static inline bool local_bh_blocked(void) { return false; }
#endif

#endif /* _LINUX_BH_H */




































  320 









  317 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H

#define HAVE_JUMP_LABEL_BATCH

#include <asm/asm.h>
#include <asm/nops.h>

#ifndef __ASSEMBLER__

#include <linux/stringify.h>
#include <linux/types.h>

/*
 * Assembler text that appends one record to the "__jump_table" section:
 * after _ASM_ALIGN, two .long values holding location-relative offsets of
 * the nearest preceding local label "1" ("1b") and of @label, followed by
 * an _ASM_PTR entry holding the location-relative offset of @key.
 * The expansion site must therefore define label "1:" beforehand.
 */
#define JUMP_TABLE_ENTRY(key, label)                        \
        ".pushsection __jump_table,  \"aw\" \n\t"        \
        _ASM_ALIGN "\n\t"                                \
        ".long 1b - . \n\t"                                \
        ".long " label " - . \n\t"                        \
        _ASM_PTR " " key " - . \n\t"                        \
        ".popsection \n\t"

/*
 * This macro is also expanded on the Rust side.
 *
 * Emits the instruction at label "1" plus its jump table record.
 * With CONFIG_HAVE_JUMP_LABEL_HACK the instruction is a "jmp" to @label
 * (annotated as being NOPed by objtool) and the recorded key has 2 added;
 * otherwise the instruction is the BYTES_NOP5 byte sequence and the key is
 * recorded unchanged.
 */
#ifdef CONFIG_HAVE_JUMP_LABEL_HACK
#define ARCH_STATIC_BRANCH_ASM(key, label)                \
        "1: jmp " label " # objtool NOPs this \n\t"        \
        JUMP_TABLE_ENTRY(key " + 2", label)
#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */
#define ARCH_STATIC_BRANCH_ASM(key, label)                \
        "1: .byte " __stringify(BYTES_NOP5) "\n\t"        \
        JUMP_TABLE_ENTRY(key, label)
#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */

/*
 * arch_static_branch - emit the ARCH_STATIC_BRANCH_ASM site for @key.
 *
 * @key and @branch are passed as immediates and combined in the jump table
 * record as "key + branch".  Returns false when execution falls through the
 * emitted instruction and true when control reaches l_yes.
 */
static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
        asm goto(ARCH_STATIC_BRANCH_ASM("%c0 + %c1", "%l[l_yes]")
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

/*
 * arch_static_branch_jump - like arch_static_branch(), but the emitted
 * instruction at label "1" is always a "jmp" to l_yes.
 *
 * The jump table record again stores "key + branch".  Returns true when
 * control reaches l_yes and false if execution falls through instead.
 */
static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
        asm goto("1:"
                "jmp %l[l_yes]\n\t"
                JUMP_TABLE_ENTRY("%c0 + %c1", "%l[l_yes]")
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

extern int arch_jump_entry_size(struct jump_entry *entry);

#endif        /* __ASSEMBLER__ */

#endif









































  312 



  316 
  317 
  316 
  319 


  315 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * generic net pointers
 */

#ifndef __NET_GENERIC_H__
#define __NET_GENERIC_H__

#include <linux/bug.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>

/*
 * Generic net pointers are to be used by modules to put some private
 * stuff on the struct net without explicit struct net modification
 *
 * The rules are simple:
 * 1. set pernet_operations->id.  After register_pernet_device you
 *    will have the id of your private pointer.
 * 2. set pernet_operations->size to have the code allocate and free
 *    a private structure pointed to from struct net.
 * 3. do not change this pointer while the net is alive;
 * 4. do not try to have any private reference on the net_generic object.
 *
 * After accomplishing all of the above, the private pointer can be
 * accessed with the net_generic() call.
 */

/*
 * Per-net array of private pointers, indexed by pernet id.
 *
 * The header fields in @s share storage with the start of the flexible
 * @ptr array (anonymous union).
 * NOTE(review): presumably the slots overlapping @s are never handed out
 * as ids - confirm against the allocator of this structure.
 */
struct net_generic {
        union {
                struct {
                        unsigned int len;        /* presumably number of ptr[] slots - TODO confirm */
                        struct rcu_head rcu;
                } s;

                DECLARE_FLEX_ARRAY(void *, ptr);
        };
};

/*
 * net_generic - fetch the private pointer registered under @id for @net.
 *
 * net->gen is dereferenced inside an RCU read-side critical section and
 * slot @id of its ptr[] array is returned.  @id is not range-checked here.
 */
static inline void *net_generic(const struct net *net, unsigned int id)
{
        struct net_generic *table;
        void *priv;

        rcu_read_lock();
        table = rcu_dereference(net->gen);
        priv = table->ptr[id];
        rcu_read_unlock();

        return priv;
}
#endif











































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_INTERNAL_H
#define _LINUX_HIGHMEM_INTERNAL_H

/*
 * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft.
 */
#ifdef CONFIG_KMAP_LOCAL
/* Out-of-line kmap_local primitives used by the inline wrappers below. */
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
void *__kmap_local_page_prot(const struct page *page, pgprot_t prot);
void kunmap_local_indexed(const void *vaddr);
void kmap_local_fork(struct task_struct *tsk);
void __kmap_local_sched_out(void);
void __kmap_local_sched_in(void);
/* Warn (via DEBUG_LOCKS_WARN_ON) if current's kmap_ctrl.idx is non-zero. */
static inline void kmap_assert_nomap(void)
{
        DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
}
#else
/* Without CONFIG_KMAP_LOCAL both hooks are empty stubs. */
static inline void kmap_local_fork(struct task_struct *tsk) { }
static inline void kmap_assert_nomap(void) { }
#endif

#ifdef CONFIG_HIGHMEM
#include <asm/highmem.h>

/* Empty default used unless the architecture defines ARCH_HAS_KMAP_FLUSH_TLB. */
#ifndef ARCH_HAS_KMAP_FLUSH_TLB
static inline void kmap_flush_tlb(unsigned long addr) { }
#endif

/* Default protection for kmap mappings unless kmap_prot is already defined. */
#ifndef kmap_prot
#define kmap_prot PAGE_KERNEL
#endif

void *kmap_high(struct page *page);
void kunmap_high(const struct page *page);
void __kmap_flush_unused(void);
struct page *__kmap_to_page(void *addr);

/*
 * kmap - return a kernel virtual address for @page; may sleep.
 *
 * Highmem pages are mapped through kmap_high(); all other pages use their
 * existing page_address().  kmap_flush_tlb() is called on the result in
 * both cases.
 */
static inline void *kmap(struct page *page)
{
        void *vaddr;

        might_sleep();
        vaddr = PageHighMem(page) ? kmap_high(page) : page_address(page);
        kmap_flush_tlb((unsigned long)vaddr);

        return vaddr;
}

/*
 * kunmap - undo kmap() for @page; may sleep.
 *
 * Only highmem pages need work (kunmap_high()); others are left alone.
 */
static inline void kunmap(const struct page *page)
{
        might_sleep();
        if (PageHighMem(page))
                kunmap_high(page);
}

/* Highmem variant: forwards @addr to __kmap_to_page(). */
static inline struct page *kmap_to_page(void *addr)
{
        return __kmap_to_page(addr);
}

/* Highmem variant: forwards to __kmap_flush_unused(). */
static inline void kmap_flush_unused(void)
{
        __kmap_flush_unused();
}

/* Map @page via __kmap_local_page_prot() using the default kmap_prot. */
static inline void *kmap_local_page(const struct page *page)
{
        return __kmap_local_page_prot(page, kmap_prot);
}

/*
 * Return the address of @page if it can be used without creating a
 * mapping, or NULL when @page is a highmem page.
 */
static inline void *kmap_local_page_try_from_panic(const struct page *page)
{
        /* If the page is in HighMem, it's not safe to kmap it.*/
        if (PageHighMem(page))
                return NULL;

        return page_address(page);
}

/*
 * Map the page of @folio that contains byte @offset and return the address
 * of that byte: the page is selected with offset / PAGE_SIZE and the
 * remainder is added to the mapping's base address.
 */
static inline void *kmap_local_folio(const struct folio *folio, size_t offset)
{
        const struct page *pg = folio_page(folio, offset / PAGE_SIZE);
        void *base = __kmap_local_page_prot(pg, kmap_prot);

        return base + offset % PAGE_SIZE;
}

/* Map @page with caller-chosen protection @prot via __kmap_local_page_prot(). */
static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot)
{
        return __kmap_local_page_prot(page, prot);
}

/* Map page frame @pfn via __kmap_local_pfn_prot() using the default kmap_prot. */
static inline void *kmap_local_pfn(unsigned long pfn)
{
        return __kmap_local_pfn_prot(pfn, kmap_prot);
}

/* Highmem variant: hands @vaddr to kunmap_local_indexed(). */
static inline void __kunmap_local(const void *vaddr)
{
        kunmap_local_indexed(vaddr);
}

/*
 * Map @page with protection @prot after disabling preemption (or, on
 * CONFIG_PREEMPT_RT, migration) and page faults, in that order.
 */
static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();
        pagefault_disable();

        return __kmap_local_page_prot(page, prot);
}

/* kmap_atomic_prot() with the default kmap_prot. */
static inline void *kmap_atomic(const struct page *page)
{
        return kmap_atomic_prot(page, kmap_prot);
}

/*
 * Map page frame @pfn with kmap_prot after disabling preemption (or, on
 * CONFIG_PREEMPT_RT, migration) and page faults, in that order.
 */
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();
        pagefault_disable();

        return __kmap_local_pfn_prot(pfn, kmap_prot);
}

/*
 * Undo a kmap_atomic*() mapping: unmap @addr, then re-enable page faults
 * and finally preemption (or migration on CONFIG_PREEMPT_RT).
 */
static inline void __kunmap_atomic(const void *addr)
{
        kunmap_local_indexed(addr);
        pagefault_enable();

        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                migrate_enable();
}

unsigned long __nr_free_highpages(void);
unsigned long __totalhigh_pages(void);

/* Highmem variant: returns the value computed by __nr_free_highpages(). */
static inline unsigned long nr_free_highpages(void)
{
        return __nr_free_highpages();
}

/* Highmem variant: returns the value computed by __totalhigh_pages(). */
static inline unsigned long totalhigh_pages(void)
{
        return __totalhigh_pages();
}

/*
 * Return true if @x lies in [PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)) or in
 * [__fix_to_virt(FIX_KMAP_END), __fix_to_virt(FIX_KMAP_BEGIN)).
 */
static inline bool is_kmap_addr(const void *x)
{
        unsigned long va = (unsigned long)x;

        if (va >= PKMAP_ADDR(0) && va < PKMAP_ADDR(LAST_PKMAP))
                return true;

        return va >= __fix_to_virt(FIX_KMAP_END) &&
               va < __fix_to_virt(FIX_KMAP_BEGIN);
}
#else /* CONFIG_HIGHMEM */

/* !CONFIG_HIGHMEM: translate @addr with virt_to_page(). */
static inline struct page *kmap_to_page(void *addr)
{
        return virt_to_page(addr);
}

/*
 * !CONFIG_HIGHMEM: no mapping is created; returns page_address(@page).
 * might_sleep() is still called.
 */
static inline void *kmap(struct page *page)
{
        might_sleep();
        return page_address(page);
}

/* !CONFIG_HIGHMEM: empty stubs. */
static inline void kunmap_high(const struct page *page) { }
static inline void kmap_flush_unused(void) { }

/*
 * !CONFIG_HIGHMEM: does nothing unless ARCH_HAS_FLUSH_ON_KUNMAP is defined,
 * in which case kunmap_flush_on_unmap() is called on the page's address.
 */
static inline void kunmap(const struct page *page)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(page_address(page));
#endif
}

/* !CONFIG_HIGHMEM: simply returns page_address(@page). */
static inline void *kmap_local_page(const struct page *page)
{
        return page_address(page);
}

/* !CONFIG_HIGHMEM: always returns page_address(@page), never NULL-by-policy. */
static inline void *kmap_local_page_try_from_panic(const struct page *page)
{
        return page_address(page);
}

/* !CONFIG_HIGHMEM: returns folio_address(@folio) advanced by @offset bytes. */
static inline void *kmap_local_folio(const struct folio *folio, size_t offset)
{
        return folio_address(folio) + offset;
}

/* !CONFIG_HIGHMEM: @prot is ignored; same result as kmap_local_page(@page). */
static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot)
{
        return kmap_local_page(page);
}

/* !CONFIG_HIGHMEM: converts @pfn with pfn_to_page() and uses kmap_local_page(). */
static inline void *kmap_local_pfn(unsigned long pfn)
{
        return kmap_local_page(pfn_to_page(pfn));
}

/*
 * !CONFIG_HIGHMEM: does nothing unless ARCH_HAS_FLUSH_ON_KUNMAP is defined,
 * in which case kunmap_flush_on_unmap() is called on @addr rounded down to
 * a PAGE_SIZE boundary.
 */
static inline void __kunmap_local(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif
}

/*
 * !CONFIG_HIGHMEM: disable preemption (or, on CONFIG_PREEMPT_RT, migration)
 * and page faults, then return page_address(@page).
 */
static inline void *kmap_atomic(const struct page *page)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();

        pagefault_disable();

        return page_address(page);
}

/* !CONFIG_HIGHMEM: @prot is ignored; same as kmap_atomic(@page). */
static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot)
{
        return kmap_atomic(page);
}

/* !CONFIG_HIGHMEM: converts @pfn with pfn_to_page() and uses kmap_atomic(). */
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
        return kmap_atomic(pfn_to_page(pfn));
}

/*
 * !CONFIG_HIGHMEM: optionally flush the page containing @addr (only with
 * ARCH_HAS_FLUSH_ON_KUNMAP), then re-enable page faults and finally
 * preemption (or migration on CONFIG_PREEMPT_RT).
 */
static inline void __kunmap_atomic(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif
        pagefault_enable();

        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                migrate_enable();
}

/* !CONFIG_HIGHMEM: both counters are constant 0. */
static inline unsigned long nr_free_highpages(void) { return 0; }
static inline unsigned long totalhigh_pages(void) { return 0; }

/* !CONFIG_HIGHMEM: no address is ever reported as a kmap address. */
static inline bool is_kmap_addr(const void *x)
{
        return false;
}

#endif /* CONFIG_HIGHMEM */

/**
 * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated!
 * @__addr:       Virtual address to be unmapped
 *
 * Unmaps an address previously mapped by kmap_atomic() and re-enables
 * pagefaults. Depending on the PREEMPT_RT configuration, also re-enables
 * migration or preemption. Users should not count on these side effects.
 *
 * Mappings should be unmapped in the reverse order that they were mapped.
 * See kmap_local_page() for details on nesting.
 *
 * @__addr can be any address within the mapped page, so there is no need
 * to subtract any offset that has been added. In contrast to kunmap(),
 * this function takes the address returned from kmap_atomic(), not the
 * page passed to it. The compiler will warn you if you pass the page.
 */
#define kunmap_atomic(__addr)                                        \
do {                                                                \
        BUILD_BUG_ON(__same_type((__addr), struct page *));        \
        __kunmap_atomic(__addr);                                \
} while (0)

/**
 * kunmap_local - Unmap a page mapped via kmap_local_page().
 * @__addr: An address within the page mapped
 *
 * @__addr can be any address within the mapped page.  Commonly it is the
 * address returned from kmap_local_page(), but it can also include offsets.
 * Passing a struct page pointer instead of an address fails the build.
 *
 * Unmapping should be done in the reverse order of the mapping.  See
 * kmap_local_page() for details.
 */
#define kunmap_local(__addr)                                        \
do {                                                                \
        BUILD_BUG_ON(__same_type((__addr), struct page *));        \
        __kunmap_local(__addr);                                        \
} while (0)

#endif







































































































































































































































































































































































    1 





    1 
    1 





    1 



    1 



    1 





    1 
    1 





    1 



    1 





































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
// SPDX-License-Identifier: GPL-2.0-only
/*
 * lib/bitmap.c
 * Helper functions for bitmap.h.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/slab.h>

/**
 * DOC: bitmap introduction
 *
 * bitmaps provide an array of bits, implemented using an
 * array of unsigned longs.  The number of valid bits in a
 * given bitmap does _not_ need to be an exact multiple of
 * BITS_PER_LONG.
 *
 * The possible unused bits in the last, partially used word
 * of a bitmap are 'don't care'.  The implementation makes
 * no particular effort to keep them zero.  It ensures that
 * their value will not affect the results of any operation.
 * The bitmap operations that return Boolean (bitmap_empty,
 * for example) or scalar (bitmap_weight, for example) results
 * carefully filter out these unused bits from impacting their
 * results.
 *
 * The byte ordering of bitmaps is more natural on little
 * endian architectures.  See the big-endian headers
 * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h
 * for the best explanations of this ordering.
 */

/*
 * Return true if the first @bits bits of @bitmap1 and @bitmap2 are equal.
 * Bits past @bits in the last, partially used word are ignored.
 */
bool __bitmap_equal(const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                if (bitmap1[i] != bitmap2[i])
                        return false;
        }

        /* No partial word left to compare. */
        if (!(bits % BITS_PER_LONG))
                return true;

        return !((bitmap1[i] ^ bitmap2[i]) & BITMAP_LAST_WORD_MASK(bits));
}
EXPORT_SYMBOL(__bitmap_equal);

/*
 * Return true if (@bitmap1 | @bitmap2) equals @bitmap3 over the first
 * @bits bits.  Bits past @bits in the last word are ignored.
 */
bool __bitmap_or_equal(const unsigned long *bitmap1,
                       const unsigned long *bitmap2,
                       const unsigned long *bitmap3,
                       unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i;

        for (i = 0; i < full_words; i++)
                if ((bitmap1[i] | bitmap2[i]) != bitmap3[i])
                        return false;

        if (bits % BITS_PER_LONG) {
                unsigned long diff = (bitmap1[i] | bitmap2[i]) ^ bitmap3[i];

                if (diff & BITMAP_LAST_WORD_MASK(bits))
                        return false;
        }

        return true;
}

/*
 * Store the bitwise complement of @src in @dst, one whole word at a time
 * for all BITS_TO_LONGS(@bits) words (unused tail bits are inverted too).
 */
void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits)
{
        const unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int i;

        for (i = 0; i < nwords; i++) {
                dst[i] = ~src[i];
        }
}
EXPORT_SYMBOL(__bitmap_complement);

/**
 * __bitmap_shift_right - logical right shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting right (dividing) means moving bits in the MS -> LS bit
 * direction.  Zeros are fed into the vacated MS positions and the
 * LS bits shifted off the bottom are lost.
 *
 * A @shift that spans at least as many whole words as the bitmap holds
 * yields an all-zero @dst.
 */
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                        unsigned shift, unsigned nbits)
{
        unsigned k, lim = BITS_TO_LONGS(nbits);
        unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
        unsigned long mask = BITMAP_LAST_WORD_MASK(nbits);

        /*
         * Nothing survives such a shift.  Handle it up front: with
         * off > lim the "lim - off" index used for the final memset()
         * would wrap around and write far outside @dst.
         */
        if (off >= lim) {
                memset(dst, 0, lim * sizeof(unsigned long));
                return;
        }

        for (k = 0; off + k < lim; ++k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take lower rem bits of
                 * word above and make them the top rem bits of result.
                 */
                if (!rem || off + k + 1 >= lim)
                        upper = 0;
                else {
                        upper = src[off + k + 1];
                        /* Only the valid bits of the last word may move down. */
                        if (off + k + 1 == lim - 1)
                                upper &= mask;
                        upper <<= (BITS_PER_LONG - rem);
                }
                lower = src[off + k];
                if (off + k == lim - 1)
                        lower &= mask;
                lower >>= rem;
                dst[k] = lower | upper;
        }
        /* Zero the top @off words vacated by the shift. */
        if (off)
                memset(&dst[lim - off], 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_right);


/**
 * __bitmap_shift_left - logical left shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting left (multiplying) means moving bits in the LS -> MS
 * direction.  Zeros are fed into the vacated LS bit positions
 * and those MS bits shifted off the top are lost.
 *
 * A @shift that spans at least as many whole words as the bitmap holds
 * yields an all-zero @dst.
 */

void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                        unsigned int shift, unsigned int nbits)
{
        int k;
        unsigned int lim = BITS_TO_LONGS(nbits);
        unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;

        /*
         * Nothing survives such a shift.  Handle it up front: with
         * off > lim the final memset() of @off words would run past the
         * end of @dst, and "lim - off - 1" would wrap before being
         * converted to int.
         */
        if (off >= lim) {
                memset(dst, 0, lim * sizeof(unsigned long));
                return;
        }

        /* Walk from the top word down so overlapping @dst/@src stay correct. */
        for (k = lim - off - 1; k >= 0; --k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take upper rem bits of
                 * word below and make them the bottom rem bits of result.
                 */
                if (rem && k > 0)
                        lower = src[k - 1] >> (BITS_PER_LONG - rem);
                else
                        lower = 0;
                upper = src[k] << rem;
                dst[k + off] = lower | upper;
        }
        /* Zero the bottom @off words vacated by the shift. */
        if (off)
                memset(dst, 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_left);

/**
 * bitmap_cut() - remove bit region from bitmap and right shift remaining bits
 * @dst: destination bitmap, might overlap with src
 * @src: source bitmap
 * @first: start bit of region to be removed
 * @cut: number of bits to remove
 * @nbits: bitmap size, in bits
 *
 * Set the n-th bit of @dst iff the n-th bit of @src is set and
 * n is less than @first, or the m-th bit of @src is set for any
 * m such that @first <= n < nbits, and m = n + @cut.
 *
 * In pictures, example for a big-endian 32-bit architecture:
 *
 * The @src bitmap is::
 *
 *   31                                   63
 *   |                                    |
 *   10000000 11000001 11110010 00010101  10000000 11000001 01110010 00010101
 *                   |  |              |                                    |
 *                  16  14             0                                   32
 *
 * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is::
 *
 *   31                                   63
 *   |                                    |
 *   10110000 00011000 00110010 00010101  00010000 00011000 00101110 01000010
 *                      |              |                                    |
 *                      14 (bit 17     0                                   32
 *                          from @src)
 *
 * Note that @dst and @src might overlap partially or entirely.
 *
 * This is implemented in the obvious way, with a shift and carry
 * step for each moved bit. Optimisation is left as an exercise
 * for the compiler.
 */
void bitmap_cut(unsigned long *dst, const unsigned long *src,
                unsigned int first, unsigned int cut, unsigned int nbits)
{
        const unsigned int nwords = BITS_TO_LONGS(nbits);
        const unsigned int first_word = first / BITS_PER_LONG;
        const unsigned int first_bit = first % BITS_PER_LONG;
        unsigned long saved = 0;
        unsigned int w;

        /* Remember the bits below @first in its word; they must not move. */
        if (first_bit)
                saved = src[first_word] &
                        (~0UL >> (BITS_PER_LONG - first_bit));

        memmove(dst, src, nwords * sizeof(*dst));

        /* Shift everything from @first's word upwards right, one bit per pass. */
        for (; cut; cut--) {
                for (w = first_word; w < nwords; w++) {
                        unsigned long carry = 0;

                        /* Bit 0 of the next word becomes our top bit. */
                        if (w < nwords - 1)
                                carry = dst[w + 1] & 1UL;

                        dst[w] = (dst[w] >> 1) | (carry << (BITS_PER_LONG - 1));
                }
        }

        /* Put the untouched low bits back into @first's word. */
        dst[first_word] &= ~0UL << first_bit;
        dst[first_word] |= saved;
}
EXPORT_SYMBOL(bitmap_cut);

/*
 * @dst = @bitmap1 & @bitmap2 over @bits bits; the partial last word, if
 * any, is stored masked.  Returns true if any bit of the result is set.
 */
bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long any = 0;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                unsigned long word = bitmap1[i] & bitmap2[i];

                dst[i] = word;
                any |= word;
        }

        if (bits % BITS_PER_LONG) {
                unsigned long word = bitmap1[i] & bitmap2[i] &
                                     BITMAP_LAST_WORD_MASK(bits);

                dst[i] = word;
                any |= word;
        }

        return any != 0;
}
EXPORT_SYMBOL(__bitmap_and);

/* @dst = @bitmap1 | @bitmap2, word by word over BITS_TO_LONGS(@bits) words. */
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int i;

        for (i = 0; i < nwords; i++) {
                dst[i] = bitmap1[i] | bitmap2[i];
        }
}
EXPORT_SYMBOL(__bitmap_or);

/* @dst = @bitmap1 ^ @bitmap2, word by word over BITS_TO_LONGS(@bits) words. */
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int i;

        for (i = 0; i < nwords; i++) {
                dst[i] = bitmap1[i] ^ bitmap2[i];
        }
}
EXPORT_SYMBOL(__bitmap_xor);

/*
 * @dst = @bitmap1 & ~@bitmap2 over @bits bits; the partial last word, if
 * any, is stored masked.  Returns true if any bit of the result is set.
 */
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long any = 0;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                unsigned long word = bitmap1[i] & ~bitmap2[i];

                dst[i] = word;
                any |= word;
        }

        if (bits % BITS_PER_LONG) {
                unsigned long word = bitmap1[i] & ~bitmap2[i] &
                                     BITMAP_LAST_WORD_MASK(bits);

                dst[i] = word;
                any |= word;
        }

        return any != 0;
}
EXPORT_SYMBOL(__bitmap_andnot);

/*
 * For every bit position, @dst takes the bit from @new where @mask is set
 * and from @old where it is clear.  Works on BITS_TO_LONGS(@nbits) words.
 */
void __bitmap_replace(unsigned long *dst,
                      const unsigned long *old, const unsigned long *new,
                      const unsigned long *mask, unsigned int nbits)
{
        const unsigned int nwords = BITS_TO_LONGS(nbits);
        unsigned int i;

        for (i = 0; i < nwords; i++) {
                const unsigned long sel = mask[i];
                const unsigned long kept = old[i] & ~sel;
                const unsigned long taken = new[i] & sel;

                dst[i] = kept | taken;
        }
}
EXPORT_SYMBOL(__bitmap_replace);

/*
 * Return true if @bitmap1 and @bitmap2 share at least one set bit within
 * the first @bits bits.
 */
bool __bitmap_intersects(const unsigned long *bitmap1,
                         const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                if (bitmap1[i] & bitmap2[i])
                        return true;
        }

        if (!(bits % BITS_PER_LONG))
                return false;

        return ((bitmap1[i] & bitmap2[i]) & BITMAP_LAST_WORD_MASK(bits)) != 0;
}
EXPORT_SYMBOL(__bitmap_intersects);

/*
 * Return true if every bit set in @bitmap1 is also set in @bitmap2,
 * considering only the first @bits bits.
 */
bool __bitmap_subset(const unsigned long *bitmap1,
                     const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                if (bitmap1[i] & ~bitmap2[i])
                        return false;
        }

        if (!(bits % BITS_PER_LONG))
                return true;

        return ((bitmap1[i] & ~bitmap2[i]) & BITMAP_LAST_WORD_MASK(bits)) == 0;
}
EXPORT_SYMBOL(__bitmap_subset);

/*
 * BITMAP_WEIGHT - sum the population count of @bits bits.
 *
 * FETCH is an expression that yields one word and may refer to the local
 * loop index "idx".  It is evaluated once per full word and, if @bits is
 * not a multiple of BITS_PER_LONG, once more for the partial last word
 * (idx then equals the number of full words), masked to the valid bits.
 * Evaluates to the total as an unsigned int.
 */
#define BITMAP_WEIGHT(FETCH, bits)        \
({                                                                                \
        unsigned int __bits = (bits), idx, w = 0;                                \
                                                                                \
        for (idx = 0; idx < __bits / BITS_PER_LONG; idx++)                        \
                w += hweight_long(FETCH);                                        \
                                                                                \
        if (__bits % BITS_PER_LONG)                                                \
                w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits));        \
                                                                                \
        w;                                                                        \
})

/* Number of set bits among the first @bits bits of @bitmap. */
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
{
        /* "idx" is the loop index declared inside BITMAP_WEIGHT(). */
        return BITMAP_WEIGHT(bitmap[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight);

/* Number of bits set in both @bitmap1 and @bitmap2 within the first @bits bits. */
unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        /* "idx" is the loop index declared inside BITMAP_WEIGHT(). */
        return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight_and);

/* Number of bits set in @bitmap1 but clear in @bitmap2 within the first @bits bits. */
unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        /* "idx" is the loop index declared inside BITMAP_WEIGHT(). */
        return BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight_andnot);

/*
 * Set @len consecutive bits of @map, beginning at bit @start.
 *
 * The range is processed one word at a time: the first word uses
 * BITMAP_FIRST_WORD_MASK(start), words covered in full use an all-ones
 * mask, and a trailing partial word is additionally limited by
 * BITMAP_LAST_WORD_MASK() of the end position.
 */
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
        const unsigned int end = start + len;
        unsigned long *word = map + BIT_WORD(start);
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        /* Bits from @start up to the end of its word. */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        while (len >= chunk) {
                *word++ |= mask;
                len -= chunk;
                /* Every word after the first is covered from its bit 0. */
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Leftover bits that do not reach the end of the current word. */
        if (len)
                *word |= mask & BITMAP_LAST_WORD_MASK(end);
}
EXPORT_SYMBOL(__bitmap_set);

/*
 * Clear @len consecutive bits of @map, beginning at bit @start.
 *
 * Mirrors __bitmap_set(): the first word uses BITMAP_FIRST_WORD_MASK(start),
 * fully covered words use an all-ones mask, and a trailing partial word is
 * additionally limited by BITMAP_LAST_WORD_MASK() of the end position.
 */
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
        const unsigned int end = start + len;
        unsigned long *word = map + BIT_WORD(start);
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        /* Bits from @start up to the end of its word. */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        while (len >= chunk) {
                *word++ &= ~mask;
                len -= chunk;
                /* Every word after the first is covered from its bit 0. */
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Leftover bits that do not reach the end of the current word. */
        if (len)
                *word &= ~(mask & BITMAP_LAST_WORD_MASK(end));
}
EXPORT_SYMBOL(__bitmap_clear);

/**
 * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 * @align_offset: Alignment offset for zero area.
 *
 * @align_mask should be one less than a power of 2, so that for every area
 * this function finds, its bit offset plus @align_offset is a multiple of
 * that power of 2.
 *
 * Return: the bit offset of the first suitable area, or a value greater
 * than @size if no candidate of @nr bits fits inside the bitmap.
 */
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                             unsigned long size,
                                             unsigned long start,
                                             unsigned int nr,
                                             unsigned long align_mask,
                                             unsigned long align_offset)
{
        unsigned long index, end, busy;

        for (;;) {
                /* Next clear bit, then round up to the requested alignment. */
                index = find_next_zero_bit(map, size, start);
                index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;

                end = index + nr;
                if (end > size)
                        return end;

                /* Is any bit inside the candidate window already set? */
                busy = find_next_bit(map, end, index);
                if (busy >= end)
                        return index;

                /* Resume the search just past the set bit that blocked us. */
                start = busy + 1;
        }
}
EXPORT_SYMBOL(bitmap_find_next_zero_area_off);

/**
 * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
 *        @buf: pointer to a bitmap
 *        @pos: a bit position in @buf (0 <= @pos < @nbits)
 *        @nbits: number of valid bit positions in @buf
 *
 * Map the bit at position @pos in @buf (of length @nbits) to the
 * ordinal of which set bit it is.  If it is not set or if @pos
 * is not a valid bit position, map to -1.
 *
 * If for example, just bits 4 through 7 are set in @buf, then @pos
 * values 4 through 7 will get mapped to 0 through 3, respectively,
 * and other @pos values will get mapped to -1.  When @pos value 7
 * gets mapped to (returns) @ord value 3 in this example, that means
 * that bit 7 is the 3rd (starting with 0th) set bit in @buf.
 *
 * The bit positions 0 through @nbits - 1 are valid positions in @buf.
 */
static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
        /*
         * For a valid, set position the ordinal is the number of set bits
         * strictly below @pos, i.e. the weight of the first @pos bits.
         */
        if (pos < nbits && test_bit(pos, buf))
                return bitmap_weight(buf, pos);

        /* Out of range, or the bit at @pos is clear. */
        return -1;
}

/**
 * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
 *        @dst: remapped result
 *        @src: subset to be remapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @nbits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * If either of the @old and @new bitmaps are empty, or if @src and
 * @dst point to the same location, then this routine copies @src
 * to @dst.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to @src, placing the result in
 * @dst, clearing any bits previously set in @dst.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @src comes into this routine
 * with bits 1, 5 and 7 set, then @dst should leave with bits 1,
 * 13 and 15 set.
 */
void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new,
                unsigned int nbits)
{
        unsigned int bit, new_weight;

        /* In-place remapping is not supported; leave the bitmap as it is. */
        if (dst == src)
                return;

        bitmap_zero(dst, nbits);
        new_weight = bitmap_weight(new, nbits);

        for_each_set_bit(bit, src, nbits) {
                /* Ordinal of @bit among the set bits of @old, or -1. */
                int ord = bitmap_pos_to_ord(old, bit, nbits);

                if (ord >= 0 && new_weight != 0)
                        /* Wrap around when @new has fewer set bits than @old. */
                        set_bit(find_nth_bit(new, nbits, ord % new_weight), dst);
                else
                        set_bit(bit, dst);        /* identity map */
        }
}
EXPORT_SYMBOL(bitmap_remap);

/**
 * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
 *        @oldbit: bit position to be mapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @bits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to bit position @oldbit, returning
 * the new bit position.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @oldbit is 5, then this routine
 * returns 13.
 */
int bitmap_bitremap(int oldbit, const unsigned long *old,
                                const unsigned long *new, int bits)
{
        int new_weight = bitmap_weight(new, bits);
        /* Ordinal of @oldbit among the set bits of @old, or -1. */
        int ord = bitmap_pos_to_ord(old, oldbit, bits);

        /* Mapped position: the (ord mod weight)-th set bit of @new. */
        if (ord >= 0 && new_weight != 0)
                return find_nth_bit(new, bits, ord % new_weight);

        /* Not in the domain of the map, or empty range: identity. */
        return oldbit;
}
EXPORT_SYMBOL(bitmap_bitremap);

#ifdef CONFIG_NUMA
/**
 * bitmap_onto - translate one bitmap relative to another
 *        @dst: resulting translated bitmap
 *         @orig: original untranslated bitmap
 *         @relmap: bitmap relative to which translated
 *        @bits: number of bits in each of these bitmaps
 *
 * Set the n-th bit of @dst iff there exists some m such that the
 * n-th bit of @relmap is set, the m-th bit of @orig is set, and
 * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
 * (If you understood the previous sentence the first time you
 * read it, you're overqualified for your current job.)
 *
 * In other words, @orig is mapped onto (surjectively) @dst,
 * using the map { <n, m> | the n-th bit of @relmap is the
 * m-th set bit of @relmap }.
 *
 * Any set bits in @orig at or above bit number W, where W is the
 * weight of (number of set bits in) @relmap are mapped nowhere.
 * In particular, if for all bits m set in @orig, m >= W, then
 * @dst will end up empty.  In situations where the possibility
 * of such an empty result is not desired, one way to avoid it is
 * to use the bitmap_fold() operator, below, to first fold the
 * @orig bitmap over itself so that all its set bits x are in the
 * range 0 <= x < W.  The bitmap_fold() operator does this by
 * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
 *
 * Example [1] for bitmap_onto():
 *  Let's say @relmap has bits 30-39 set, and @orig has bits
 *  1, 3, 5, 7, 9 and 11 set.  Then on return from this routine,
 *  @dst will have bits 31, 33, 35, 37 and 39 set.
 *
 *  When bit 0 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the first bit (if any)
 *  that is turned on in @relmap.  Since bit 0 was off in the
 *  above example, we leave off that bit (bit 30) in @dst.
 *
 *  When bit 1 is set in @orig (as in the above example), it
 *  means turn on the bit in @dst corresponding to whatever
 *  is the second bit that is turned on in @relmap.  The second
 *  bit in @relmap that was turned on in the above example was
 *  bit 31, so we turned on bit 31 in @dst.
 *
 *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
 *  because they were the 4th, 6th, 8th and 10th set bits
 *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
 *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
 *
 *  When bit 11 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the twelfth bit that is
 *  turned on in @relmap.  In the above example, there were
 *  only ten bits turned on in @relmap (30..39), so that bit
 *  11 was set in @orig had no effect on @dst.
 *
 * Example [2] for bitmap_fold() + bitmap_onto():
 *  Let's say @relmap has these ten bits set::
 *
 *                40 41 42 43 45 48 53 61 74 95
 *
 *  (for the curious, that's 40 plus the first ten terms of the
 *  Fibonacci sequence.)
 *
 *  Further lets say we use the following code, invoking
 *  bitmap_fold() then bitmap_onto, as suggested above to
 *  avoid the possibility of an empty @dst result::
 *
 *        unsigned long *tmp;        // a temporary bitmap's bits
 *
 *        bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
 *        bitmap_onto(dst, tmp, relmap, bits);
 *
 *  Then this table shows what various values of @dst would be, for
 *  various @orig's.  I list the zero-based positions of each set bit.
 *  The tmp column shows the intermediate result, as computed by
 *  using bitmap_fold() to fold the @orig bitmap modulo ten
 *  (the weight of @relmap):
 *
 *      =============== ============== =================
 *      @orig           tmp            @dst
 *      0                0             40
 *      1                1             41
 *      9                9             95
 *      10               0             40 [#f1]_
 *      1 3 5 7          1 3 5 7       41 43 48 61
 *      0 1 2 3 4        0 1 2 3 4     40 41 42 43 45
 *      0 9 18 27        0 9 8 7       40 61 74 95
 *      0 10 20 30       0             40
 *      0 11 22 33       0 1 2 3       40 41 42 43
 *      0 12 24 36       0 2 4 6       40 42 45 53
 *      78 102 211       1 2 8         41 42 74 [#f1]_
 *      =============== ============== =================
 *
 * .. [#f1]
 *
 *     For these marked lines, if we hadn't first done bitmap_fold()
 *     into tmp, then the @dst result would have been empty.
 *
 * If either of @orig or @relmap is empty (no set bits), then @dst
 * will be returned empty.
 *
 * If (as explained above) the only set bits in @orig are in positions
 * m where m >= W, (where W is the weight of @relmap) then @dst will
 * once again be returned empty.
 *
 * All bits in @dst not set by the above rule are cleared.
 */
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                        const unsigned long *relmap, unsigned int bits)
{
        unsigned int pos;        /* bit position in @relmap and @dst */
        unsigned int ord = 0;        /* how many set bits of @relmap came before @pos */

        /* In-place mapping is not supported; leave the bitmap as it is. */
        if (dst == orig)
                return;
        bitmap_zero(dst, bits);

        /*
         * Walking the set bits of @relmap in order while counting them is
         * a cheaper equivalent of looking each ordinal up separately:
         *        for (m = 0; m < bitmap_weight(relmap, bits); m++) {
         *                n = find_nth_bit(relmap, bits, m);
         *                if (test_bit(m, orig))
         *                        set_bit(n, dst);
         *        }
         */
        for_each_set_bit(pos, relmap, bits) {
                /* Here ord == bitmap_pos_to_ord(relmap, pos, bits). */
                if (test_bit(ord, orig))
                        set_bit(pos, dst);
                ord++;
        }
}

/**
 * bitmap_fold - fold larger bitmap into smaller, modulo specified size
 *        @dst: resulting smaller bitmap
 *        @orig: original larger bitmap
 *        @sz: specified size
 *        @nbits: number of bits in each of these bitmaps
 *
 * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
 * Clear all other bits in @dst.  See further the comment and
 * Example [2] for bitmap_onto() for why and how to use this.
 *
 * If @sz is zero there is no position to fold onto, so @dst is returned
 * empty rather than dividing by zero.  If @dst and @orig are the same
 * bitmap, nothing is done.
 */
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                        unsigned int sz, unsigned int nbits)
{
        unsigned int oldbit;

        if (dst == orig)        /* following doesn't handle inplace mappings */
                return;
        bitmap_zero(dst, nbits);

        /* "oldbit % 0" is undefined; an empty target range folds to nothing. */
        if (!sz)
                return;

        for_each_set_bit(oldbit, orig, nbits)
                set_bit(oldbit % sz, dst);
}
#endif /* CONFIG_NUMA */

/*
 * Allocate storage for an @nbits-bit bitmap: BITS_TO_LONGS(@nbits) unsigned
 * longs obtained from kmalloc_array() with @flags.  The result of
 * kmalloc_array() is returned unchanged; this function does not itself
 * clear the memory.
 */
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
{
        return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long),
                             flags);
}
EXPORT_SYMBOL(bitmap_alloc);

/*
 * Same as bitmap_alloc(), with __GFP_ZERO added to @flags so that the
 * allocator is asked for zeroed memory.
 */
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
{
        return bitmap_alloc(nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(bitmap_zalloc);

/*
 * Node-aware variant of bitmap_alloc(): BITS_TO_LONGS(@nbits) unsigned longs
 * are requested from kmalloc_array_node() with @flags and @node passed
 * through.  The allocator's result is returned unchanged.
 */
unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node)
{
        return kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(unsigned long),
                                  flags, node);
}
EXPORT_SYMBOL(bitmap_alloc_node);

/*
 * Same as bitmap_alloc_node(), with __GFP_ZERO added to @flags so that the
 * allocator is asked for zeroed memory.
 */
unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node)
{
        return bitmap_alloc_node(nbits, flags | __GFP_ZERO, node);
}
EXPORT_SYMBOL(bitmap_zalloc_node);

/*
 * Release a bitmap by handing @bitmap to kfree().  This is the counterpart
 * of the bitmap_alloc() family above, which allocates via kmalloc_array*().
 */
void bitmap_free(const unsigned long *bitmap)
{
        kfree(bitmap);
}
EXPORT_SYMBOL(bitmap_free);

/*
 * Release action registered by devm_bitmap_alloc(): @data is the bitmap
 * pointer that was passed to devm_add_action_or_reset(), and it is freed
 * with bitmap_free().
 */
static void devm_bitmap_free(void *data)
{
        unsigned long *bitmap = data;

        bitmap_free(bitmap);
}

/*
 * Allocate an @nbits-bit bitmap with bitmap_alloc() and register
 * devm_bitmap_free() for it on @dev, so the bitmap is released through the
 * device's managed actions.
 *
 * Returns the bitmap, or NULL if either the allocation or the registration
 * fails.
 */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags)
{
        unsigned long *bitmap = bitmap_alloc(nbits, flags);

        if (!bitmap)
                return NULL;

        /*
         * NOTE(review): no explicit free on this failure path; the
         * "_or_reset" helper is expected to have already run
         * devm_bitmap_free() on the bitmap - confirm against the devres
         * documentation.
         */
        if (devm_add_action_or_reset(dev, devm_bitmap_free, bitmap))
                return NULL;

        return bitmap;
}
EXPORT_SYMBOL_GPL(devm_bitmap_alloc);

/*
 * Same as devm_bitmap_alloc(), with __GFP_ZERO added to @flags so that the
 * allocator is asked for zeroed memory.
 */
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags)
{
        return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);

#if BITS_PER_LONG == 64
/**
 * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u32 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Built only when BITS_PER_LONG == 64: each destination word is assembled
 * from up to two consecutive u32 elements, the first one supplying the low
 * 32 bits.  Bits of the last word beyond @nbits are cleared.
 */
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
        const unsigned int halfwords = DIV_ROUND_UP(nbits, 32);
        unsigned int i;

        /* Consume the source two u32 elements (one long) at a time. */
        for (i = 0; i < halfwords; i += 2) {
                bitmap[i / 2] = (unsigned long) buf[i];
                /* The high half exists only if the source has another element. */
                if (i + 1 < halfwords)
                        bitmap[i / 2] |= ((unsigned long) buf[i + 1]) << 32;
        }

        /* Clear tail bits in last word beyond nbits. */
        if (nbits % BITS_PER_LONG)
                bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr32);

/**
 * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits
 *        @buf: array of u32 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Built only when BITS_PER_LONG == 64: each source word is split into its
 * low and high 32 bits, written to consecutive u32 elements.  Bits of the
 * last written element beyond @nbits are cleared.
 */
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned int halfwords = DIV_ROUND_UP(nbits, 32);
        unsigned int i;

        /* Produce the destination two u32 elements (one long) at a time. */
        for (i = 0; i < halfwords; i += 2) {
                buf[i] = (u32) (bitmap[i / 2] & UINT_MAX);
                /* The high half is emitted only if the destination needs it. */
                if (i + 1 < halfwords)
                        buf[i + 1] = (u32) (bitmap[i / 2] >> 32);
        }

        /* Clear tail bits in last element of array beyond nbits. */
        if (nbits % BITS_PER_LONG)
                buf[halfwords - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31));
}
EXPORT_SYMBOL(bitmap_to_arr32);
#endif

#if BITS_PER_LONG == 32
/**
 * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u64 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Built only when BITS_PER_LONG == 32: each u64 element is stored as up to
 * two consecutive destination words, low half first.  Bits of the last
 * word beyond @nbits are cleared.
 */
void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits)
{
        int left = nbits;        /* bits still to be produced */

        while (left > 0) {
                const u64 chunk = *buf++;

                *bitmap++ = chunk;
                /* The high half is stored only if more than 32 bits remain. */
                if (left > 32)
                        *bitmap++ = chunk >> 32;
                left -= 64;
        }

        /*
         * Clear tail bits in the last word beyond nbits.
         *
         * @bitmap now points one past the last word written, so index -1
         * is that last word.  For nbits == 0 nothing was written, and the
         * condition below is false as well.
         */
        if (nbits % BITS_PER_LONG)
                bitmap[-1] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr64);

/**
 * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits
 *        @buf: array of u64 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Built only when BITS_PER_LONG == 32: each u64 element is assembled from
 * up to two consecutive source words, the first one supplying the low 32
 * bits.  Bits of the last element beyond @nbits are cleared.
 */
void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned long *end = bitmap + BITS_TO_LONGS(nbits);

        for (; bitmap < end; buf++) {
                *buf = *bitmap++;
                /* The high half exists only if another source word is left. */
                if (bitmap < end)
                        *buf |= (u64)(*bitmap++) << 32;
        }

        /* Clear tail bits in the last element of array beyond nbits. */
        if (nbits % 64)
                buf[-1] &= GENMASK_ULL((nbits - 1) % 64, 0);
}
EXPORT_SYMBOL(bitmap_to_arr64);
#endif














































































































































   70 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This file provides wrappers with sanitizer instrumentation for non-atomic
 * bit operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H

#include <linux/instrumented.h>

/**
 * ___set_bit - Set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * Non-atomic counterpart of set_bit(): when called concurrently on the same
 * region of memory, the effect may be that only one operation succeeds.
 * The word holding the bit is passed to instrument_write() before the
 * arch_ implementation runs.
 */
static __always_inline void
___set_bit(unsigned long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_write(word, sizeof(long));
        arch___set_bit(nr, addr);
}

/**
 * ___clear_bit - Clears a bit in memory
 * @nr: the bit to clear
 * @addr: the address to start counting from
 *
 * Non-atomic counterpart of clear_bit(): when called concurrently on the
 * same region of memory, the effect may be that only one operation succeeds.
 * The word holding the bit is passed to instrument_write() before the
 * arch_ implementation runs.
 */
static __always_inline void
___clear_bit(unsigned long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_write(word, sizeof(long));
        arch___clear_bit(nr, addr);
}

/**
 * ___change_bit - Toggle a bit in memory
 * @nr: the bit to change
 * @addr: the address to start counting from
 *
 * Non-atomic counterpart of change_bit(): when called concurrently on the
 * same region of memory, the effect may be that only one operation succeeds.
 * The word holding the bit is passed to instrument_write() before the
 * arch_ implementation runs.
 */
static __always_inline void
___change_bit(unsigned long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_write(word, sizeof(long));
        arch___change_bit(nr, addr);
}

/*
 * Instrument a non-atomic read-modify-write of the word that holds bit @nr,
 * counting from @addr.  Shared by the ___test_and_*_bit() wrappers below.
 *
 * @nr is unsigned long, matching those wrappers and the arch_ operations
 * they call, so that BIT_WORD(@nr) here selects the same word that the
 * operation itself goes on to access.
 */
static __always_inline void __instrument_read_write_bitop(unsigned long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) {
                /*
                 * We treat non-atomic read-write bitops a little more special.
                 * Given the operations here only modify a single bit, assuming
                 * non-atomicity of the writer is sufficient may be reasonable
                 * for certain usage (and follows the permissible nature of the
                 * assume-plain-writes-atomic rule):
                 * 1. report read-modify-write races -> check read;
                 * 2. do not report races with marked readers, but do report
                 *    races with unmarked readers -> check "atomic" write.
                 */
                kcsan_check_read(word, sizeof(long));
                /*
                 * Use generic write instrumentation, in case other sanitizers
                 * or tools are enabled alongside KCSAN.
                 */
                instrument_write(word, sizeof(long));
        } else {
                instrument_read_write(word, sizeof(long));
        }
}

/**
 * ___test_and_set_bit - Set a bit and return its old value
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * This operation is non-atomic. If two instances of this operation race, one
 * can appear to succeed but actually fail.
 *
 * Return: whatever arch___test_and_set_bit() returns for @nr and @addr.
 */
static __always_inline bool
___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
{
        /* Instrument the affected word as a read-modify-write first. */
        __instrument_read_write_bitop(nr, addr);
        return arch___test_and_set_bit(nr, addr);
}

/**
 * ___test_and_clear_bit - Clear a bit and return its old value
 * @nr: Bit to clear
 * @addr: Address to count from
 *
 * This operation is non-atomic. If two instances of this operation race, one
 * can appear to succeed but actually fail.
 *
 * Return: whatever arch___test_and_clear_bit() returns for @nr and @addr.
 */
static __always_inline bool
___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
{
        /* Instrument the affected word as a read-modify-write first. */
        __instrument_read_write_bitop(nr, addr);
        return arch___test_and_clear_bit(nr, addr);
}

/**
 * ___test_and_change_bit - Change a bit and return its old value
 * @nr: Bit to change
 * @addr: Address to count from
 *
 * This operation is non-atomic. If two instances of this operation race, one
 * can appear to succeed but actually fail.
 *
 * Return: whatever arch___test_and_change_bit() returns for @nr and @addr.
 */
static __always_inline bool
___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
{
        /* Instrument the affected word as a read-modify-write first. */
        __instrument_read_write_bitop(nr, addr);
        return arch___test_and_change_bit(nr, addr);
}

/**
 * _test_bit - Determine whether a bit is set
 * @nr: bit number to test
 * @addr: Address to start counting from
 *
 * The word holding the bit is passed to instrument_atomic_read() before
 * the arch_ implementation is consulted.
 *
 * Return: whatever arch_test_bit() returns for @nr and @addr.
 */
static __always_inline bool
_test_bit(unsigned long nr, const volatile unsigned long *addr)
{
        const volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_read(word, sizeof(long));
        return arch_test_bit(nr, addr);
}

/**
 * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
 * @nr: bit number to test
 * @addr: Address to start counting from
 *
 * The word holding the bit is passed to instrument_atomic_read() before
 * the arch_ implementation is consulted.
 *
 * Return: whatever arch_test_bit_acquire() returns for @nr and @addr.
 */
static __always_inline bool
_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
{
        const volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_read(word, sizeof(long));
        return arch_test_bit_acquire(nr, addr);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */














































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SIGNAL_H
#define _LINUX_SIGNAL_H

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/signal_types.h>
#include <linux/string.h>

struct task_struct;

/* for sysctl */
extern int print_fatal_signals;

/* Copy one complete kernel_siginfo_t (sizeof(*to) bytes) from @from to @to. */
static inline void copy_siginfo(kernel_siginfo_t *to,
                                const kernel_siginfo_t *from)
{
        memcpy(to, from, sizeof(*to));
}

/* Zero every byte of the kernel_siginfo_t that @info points to. */
static inline void clear_siginfo(kernel_siginfo_t *info)
{
        memset(info, 0, sizeof(*info));
}

#define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo))

/*
 * Fill the siginfo_t at @to from the kernel-internal @from: the first
 * sizeof(*from) bytes are copied verbatim, and the SI_EXPANSION_SIZE bytes
 * that start at offset sizeof(struct kernel_siginfo) are zeroed.
 */
static inline void copy_siginfo_to_external(siginfo_t *to,
                                            const kernel_siginfo_t *from)
{
        /* Start of the part of *to that the kernel layout does not cover. */
        char *expansion = ((char *)to) + sizeof(struct kernel_siginfo);

        memcpy(to, from, sizeof(*from));
        memset(expansion, 0, SI_EXPANSION_SIZE);
}

int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);

/*
 * Layout identifiers for siginfo contents.  siginfo_layout(), declared
 * below, returns one of these for a given signal number and si_code.
 * NOTE(review): which siginfo fields each value stands for is not visible
 * in this header - see the users of siginfo_layout().
 */
enum siginfo_layout {
        SIL_KILL,
        SIL_TIMER,
        SIL_POLL,
        SIL_FAULT,
        SIL_FAULT_TRAPNO,
        SIL_FAULT_MCEERR,
        SIL_FAULT_BNDERR,
        SIL_FAULT_PKUERR,
        SIL_FAULT_PERF_EVENT,
        SIL_CHLD,
        SIL_RT,
        SIL_SYS,
};

enum siginfo_layout siginfo_layout(unsigned sig, int si_code);

/*
 * Define some primitives to manipulate sigset_t.
 */

#ifndef __HAVE_ARCH_SIG_BITOPS
#include <linux/bitops.h>

/* We don't use <linux/bitops.h> for these because there is no need to
   be atomic.  */
/* Add signal @_sig (numbered from 1) to @set.  Not atomic. */
static inline void sigaddset(sigset_t *set, int _sig)
{
        /* Signal numbers start at 1, bit indices at 0. */
        const unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] |= 1UL << (bit % _NSIG_BPW);
                return;
        }

        /* Single-word set: no word index to compute. */
        set->sig[0] |= 1UL << bit;
}

/* Remove signal @_sig (numbered from 1) from @set.  Not atomic. */
static inline void sigdelset(sigset_t *set, int _sig)
{
        /* Signal numbers start at 1, bit indices at 0. */
        const unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] &= ~(1UL << (bit % _NSIG_BPW));
                return;
        }

        /* Single-word set: no word index to compute. */
        set->sig[0] &= ~(1UL << bit);
}

/* Return 1 if signal @_sig (numbered from 1) is in @set, 0 otherwise. */
static inline int sigismember(sigset_t *set, int _sig)
{
        /* Signal numbers start at 1, bit indices at 0. */
        const unsigned long bit = _sig - 1;
        unsigned long shifted;

        if (_NSIG_WORDS == 1)
                shifted = set->sig[0] >> bit;
        else
                shifted = set->sig[bit / _NSIG_BPW] >> (bit % _NSIG_BPW);

        /* The bit of interest is now the lowest one. */
        return 1 & shifted;
}

#endif /* __HAVE_ARCH_SIG_BITOPS */

/*
 * Return non-zero if no signal is present in @set, 0 otherwise.
 * Only 1, 2 or 4 words per set are supported; any other _NSIG_WORDS
 * reaches BUILD_BUG().
 */
static inline int sigisemptyset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 1:
                return !set->sig[0];
        case 2:
                return !(set->sig[1] | set->sig[0]);
        case 4:
                return !(set->sig[3] | set->sig[2] |
                         set->sig[1] | set->sig[0]);
        default:
                BUILD_BUG();
                return 0;
        }
}

/*
 * Return non-zero if @set1 and @set2 contain exactly the same signals,
 * 0 otherwise.
 *
 * Only 1, 2 or 4 words per set are supported.  Any other _NSIG_WORDS
 * reaches BUILD_BUG(), as in sigisemptyset(), instead of silently
 * reporting every pair of sets as different.
 */
static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
{
        switch (_NSIG_WORDS) {
        case 4:
                return        (set1->sig[3] == set2->sig[3]) &&
                        (set1->sig[2] == set2->sig[2]) &&
                        (set1->sig[1] == set2->sig[1]) &&
                        (set1->sig[0] == set2->sig[0]);
        case 2:
                return        (set1->sig[1] == set2->sig[1]) &&
                        (set1->sig[0] == set2->sig[0]);
        case 1:
                return        set1->sig[0] == set2->sig[0];
        default:
                BUILD_BUG();
                return 0;
        }
}

#define sigmask(sig)        (1UL << ((sig) - 1))

#ifndef __HAVE_ARCH_SIG_SETOPS

/*
 * _SIG_SET_BINOP - define a static inline function @name(r, a, b) that
 * stores op(a, b) into @r word by word.
 *
 * @op is applied to matching words of @a and @b.  _NSIG_WORDS of 4, 2 and 1
 * are handled, the larger cases falling through to the smaller ones so that
 * every word is processed; any other value reaches BUILD_BUG().  For each
 * word the operands are loaded into locals before the result is stored.
 * NOTE(review): presumably this is what lets @r be the same set as @a or
 * @b - confirm with callers.
 */
#define _SIG_SET_BINOP(name, op)                                        \
static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \
{                                                                        \
        unsigned long a0, a1, a2, a3, b0, b1, b2, b3;                        \
                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:                                                                \
                a3 = a->sig[3]; a2 = a->sig[2];                                \
                b3 = b->sig[3]; b2 = b->sig[2];                                \
                r->sig[3] = op(a3, b3);                                        \
                r->sig[2] = op(a2, b2);                                        \
                fallthrough;                                                \
        case 2:                                                                \
                a1 = a->sig[1]; b1 = b->sig[1];                                \
                r->sig[1] = op(a1, b1);                                        \
                fallthrough;                                                \
        case 1:                                                                \
                a0 = a->sig[0]; b0 = b->sig[0];                                \
                r->sig[0] = op(a0, b0);                                        \
                break;                                                        \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

#define _sig_or(x,y)        ((x) | (y))
_SIG_SET_BINOP(sigorsets, _sig_or)

#define _sig_and(x,y)        ((x) & (y))
_SIG_SET_BINOP(sigandsets, _sig_and)

#define _sig_andn(x,y)        ((x) & ~(y))
_SIG_SET_BINOP(sigandnsets, _sig_andn)

#undef _SIG_SET_BINOP
#undef _sig_or
#undef _sig_and
#undef _sig_andn

/*
 * Generate "void name(set)", which replaces every word of *set with op(word).
 * Supports 1, 2 or 4 words; any other _NSIG_WORDS value hits BUILD_BUG().
 */
#define _SIG_SET_OP(name, op)                                                \
static inline void name(sigset_t *set)                                        \
{                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:                                                                \
                set->sig[3] = op(set->sig[3]);                                \
                set->sig[2] = op(set->sig[2]);                                \
                fallthrough;                                                \
        case 2:                                                                \
                set->sig[1] = op(set->sig[1]);                                \
                fallthrough;                                                \
        case 1:                                                                \
                set->sig[0] = op(set->sig[0]);                                \
                break;                                                        \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

/* signotset(set): complement every bit of the set in place. */
#define _sig_not(x)        (~(x))
_SIG_SET_OP(signotset, _sig_not)

/* The generator and its operator are private to this header. */
#undef _SIG_SET_OP
#undef _sig_not

/*
 * Clear every bit of *set. One- and two-word sets are written with plain
 * stores; any other size is cleared with memset().
 */
static inline void sigemptyset(sigset_t *set)
{
        if (_NSIG_WORDS == 1) {
                set->sig[0] = 0;
        } else if (_NSIG_WORDS == 2) {
                set->sig[1] = 0;
                set->sig[0] = 0;
        } else {
                memset(set, 0, sizeof(sigset_t));
        }
}

/*
 * Set every bit of *set. One- and two-word sets are written with plain
 * stores; any other size is filled with memset().
 */
static inline void sigfillset(sigset_t *set)
{
        if (_NSIG_WORDS == 1) {
                set->sig[0] = -1;
        } else if (_NSIG_WORDS == 2) {
                set->sig[1] = -1;
                set->sig[0] = -1;
        } else {
                memset(set, -1, sizeof(sigset_t));
        }
}

/* Some extensions for manipulating the low 32 signals in particular.  */

/* OR the bits of mask into the first word of *set; other words are untouched. */
static inline void sigaddsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] = set->sig[0] | mask;
}

/* Clear the bits of mask in the first word of *set; other words are untouched. */
static inline void sigdelsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] = set->sig[0] & ~mask;
}

/* Return 1 if any bit of mask is set in the first word of *set, else 0. */
static inline int sigtestsetmask(sigset_t *set, unsigned long mask)
{
        if (set->sig[0] & mask)
                return 1;
        return 0;
}

/*
 * Initialise *set so that its first word equals mask and every remaining
 * word (if any) is zero.
 */
static inline void siginitset(sigset_t *set, unsigned long mask)
{
        set->sig[0] = mask;

        if (_NSIG_WORDS == 1)
                return;

        if (_NSIG_WORDS == 2) {
                set->sig[1] = 0;
                return;
        }

        /* More than two words: zero everything after word 0 in one go. */
        memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1));
}

/*
 * Initialise *set so that its first word equals ~mask and every remaining
 * word (if any) has all bits set.
 */
static inline void siginitsetinv(sigset_t *set, unsigned long mask)
{
        set->sig[0] = ~mask;

        if (_NSIG_WORDS == 1)
                return;

        if (_NSIG_WORDS == 2) {
                set->sig[1] = -1;
                return;
        }

        /* More than two words: fill everything after word 0 in one go. */
        memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1));
}

#endif /* __HAVE_ARCH_SIG_SETOPS */

/*
 * Put a struct sigpending into its initial state: the list head is
 * initialised and the signal set is cleared.
 */
static inline void init_sigpending(struct sigpending *sig)
{
        INIT_LIST_HEAD(&sig->list);
        sigemptyset(&sig->signal);
}

extern void flush_sigqueue(struct sigpending *queue);

/*
 * Return 1 if 'sig' is a valid signal number, 0 otherwise. Use this instead
 * of testing _NSIG directly. Every value from 0 up to and including _NSIG
 * is accepted.
 */
static inline int valid_signal(unsigned long sig)
{
        if (sig > _NSIG)
                return 0;
        return 1;
}

struct timespec;
struct pt_regs;
enum pid_type;

extern int next_signal(struct sigpending *pending, sigset_t *mask);
extern int do_send_sig_info(int sig, struct kernel_siginfo *info,
                                struct task_struct *p, enum pid_type type);
extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
                               struct task_struct *p, enum pid_type type);
extern int send_signal_locked(int sig, struct kernel_siginfo *info,
                              struct task_struct *p, enum pid_type type);
extern int sigprocmask(int, sigset_t *, sigset_t *);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;

extern bool get_signal(struct ksignal *ksig);
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void exit_signals(struct task_struct *tsk);
extern void kernel_sigaction(int, __sighandler_t);

/*
 * Special handler values passed to kernel_sigaction() by allow_signal()
 * (SIG_KTHREAD) and allow_kernel_signal() (SIG_KTHREAD_KERNEL) below.
 * They are small integer constants cast to __sighandler_t, not real
 * function addresses.
 */
#define SIG_KTHREAD ((__force __sighandler_t)2)
#define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3)

static inline void allow_signal(int sig)
{
        /*
         * Kernel threads handle their own signals. Let the signal code
         * know it'll be handled, so that they don't get converted to
         * SIGKILL or just silently dropped.
         */
        kernel_sigaction(sig, SIG_KTHREAD);
}

static inline void allow_kernel_signal(int sig)
{
        /*
         * Kernel threads handle their own signals. Let the signal code
         * know signals sent by the kernel will be handled, so that they
         * don't get silently dropped.
         */
        kernel_sigaction(sig, SIG_KTHREAD_KERNEL);
}

static inline void disallow_signal(int sig)
{
        kernel_sigaction(sig, SIG_IGN);
}

extern struct kmem_cache *sighand_cachep;

extern bool unhandled_signal(struct task_struct *tsk, int sig);

/*
 * In POSIX a signal is sent either to a specific thread (Linux task)
 * or to the process as a whole (Linux thread group).  How the signal
 * is sent determines whether it's to one thread or the whole group,
 * which determines which signal mask(s) are involved in blocking it
 * from being delivered until later.  When the signal is delivered,
 * either it's caught or ignored by a user handler or it has a default
 * effect that applies to the whole thread group (POSIX process).
 *
 * The possible effects an unblocked signal set to SIG_DFL can have are:
 *   ignore        - Nothing Happens
 *   terminate        - kill the process, i.e. all threads in the group,
 *                   similar to exit_group.  The group leader (only) reports
 *                  WIFSIGNALED status to its parent.
 *   coredump        - write a core dump file describing all threads using
 *                  the same mm and then kill all those threads
 *   stop         - stop all the threads in the group, i.e. TASK_STOPPED state
 *
 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
 * Other signals, when not blocked and set to SIG_DFL, behave as follows.
 * The job control signals also have other special effects.
 *
 *        +--------------------+------------------+
 *        |  POSIX signal      |  default action  |
 *        +--------------------+------------------+
 *        |  SIGHUP            |  terminate        |
 *        |  SIGINT            |        terminate        |
 *        |  SIGQUIT           |        coredump         |
 *        |  SIGILL            |        coredump         |
 *        |  SIGTRAP           |        coredump         |
 *        |  SIGABRT/SIGIOT    |        coredump         |
 *        |  SIGBUS            |        coredump         |
 *        |  SIGFPE            |        coredump         |
 *        |  SIGKILL           |        terminate(+)        |
 *        |  SIGUSR1           |        terminate        |
 *        |  SIGSEGV           |        coredump         |
 *        |  SIGUSR2           |        terminate        |
 *        |  SIGPIPE           |        terminate        |
 *        |  SIGALRM           |        terminate        |
 *        |  SIGTERM           |        terminate        |
 *        |  SIGCHLD           |        ignore           |
 *        |  SIGCONT           |        ignore(*)        |
 *        |  SIGSTOP           |        stop(*)(+)          |
 *        |  SIGTSTP           |        stop(*)          |
 *        |  SIGTTIN           |        stop(*)          |
 *        |  SIGTTOU           |        stop(*)          |
 *        |  SIGURG            |        ignore           |
 *        |  SIGXCPU           |        coredump         |
 *        |  SIGXFSZ           |        coredump         |
 *        |  SIGVTALRM         |        terminate        |
 *        |  SIGPROF           |        terminate        |
 *        |  SIGPOLL/SIGIO     |        terminate        |
 *        |  SIGSYS/SIGUNUSED  |        coredump         |
 *        |  SIGSTKFLT         |        terminate        |
 *        |  SIGWINCH          |        ignore           |
 *        |  SIGPWR            |        terminate        |
 *        |  SIGRTMIN-SIGRTMAX |        terminate       |
 *        +--------------------+------------------+
 *        |  non-POSIX signal  |  default action  |
 *        +--------------------+------------------+
 *        |  SIGEMT            |  coredump        |
 *        +--------------------+------------------+
 *
 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
 * (*) Special job control effects:
 * When SIGCONT is sent, it resumes the process (all threads in the group)
 * from TASK_STOPPED state and also clears any pending/queued stop signals
 * (any of those marked with "stop(*)").  This happens regardless of blocking,
 * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
 * any pending/queued SIGCONT signals; this happens regardless of blocking,
 * catching, or ignoring the stop signal, though (except for SIGSTOP) the
 * default action of stopping the process may happen later or never.
 */

/*
 * SIGEMT_MASK: mask bit for SIGEMT where that signal is defined, 0 where it
 * is not. It expands rt_sigmask(), which is defined just below; this works
 * because macros are expanded at the point of use.
 */
#ifdef SIGEMT
#define SIGEMT_MASK        rt_sigmask(SIGEMT)
#else
#define SIGEMT_MASK        0
#endif

/*
 * rt_sigmask(sig): single-bit mask for the 1-based signal number sig.
 * When SIGRTMIN exceeds BITS_PER_LONG the bit is built from 1ULL instead of
 * going through sigmask(), which shifts an unsigned long.
 */
#if SIGRTMIN > BITS_PER_LONG
#define rt_sigmask(sig)        (1ULL << ((sig)-1))
#else
#define rt_sigmask(sig)        sigmask(sig)
#endif

/*
 * siginmask(sig, mask): non-zero when 0 < sig < SIGRTMIN and the bit for sig
 * is set in mask. Note that sig is evaluated more than once, so it must not
 * have side effects.
 */
#define siginmask(sig, mask) \
        ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask)))

#define SIG_KERNEL_ONLY_MASK (\
        rt_sigmask(SIGKILL)   |  rt_sigmask(SIGSTOP))

#define SIG_KERNEL_STOP_MASK (\
        rt_sigmask(SIGSTOP)   |  rt_sigmask(SIGTSTP)   | \
        rt_sigmask(SIGTTIN)   |  rt_sigmask(SIGTTOU)   )

#define SIG_KERNEL_COREDUMP_MASK (\
        rt_sigmask(SIGQUIT)   |  rt_sigmask(SIGILL)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGABRT)   | \
        rt_sigmask(SIGFPE)    |  rt_sigmask(SIGSEGV)   | \
        rt_sigmask(SIGBUS)    |  rt_sigmask(SIGSYS)    | \
        rt_sigmask(SIGXCPU)   |  rt_sigmask(SIGXFSZ)   | \
        SIGEMT_MASK                                       )

#define SIG_KERNEL_IGNORE_MASK (\
        rt_sigmask(SIGCONT)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGWINCH)  |  rt_sigmask(SIGURG)    )

#define SIG_SPECIFIC_SICODES_MASK (\
        rt_sigmask(SIGILL)    |  rt_sigmask(SIGFPE)    | \
        rt_sigmask(SIGSEGV)   |  rt_sigmask(SIGBUS)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGPOLL)   |  rt_sigmask(SIGSYS)    | \
        SIGEMT_MASK                                    )

#define sig_kernel_only(sig)                siginmask(sig, SIG_KERNEL_ONLY_MASK)
#define sig_kernel_coredump(sig)        siginmask(sig, SIG_KERNEL_COREDUMP_MASK)
#define sig_kernel_ignore(sig)                siginmask(sig, SIG_KERNEL_IGNORE_MASK)
#define sig_kernel_stop(sig)                siginmask(sig, SIG_KERNEL_STOP_MASK)
#define sig_specific_sicodes(sig)        siginmask(sig, SIG_SPECIFIC_SICODES_MASK)

/*
 * sig_fatal(t, signr): true when signr is in neither the ignore mask nor the
 * stop mask and task t's handler for signr is SIG_DFL.
 * signr indexes t->sighand->action[] as (signr)-1 and is evaluated more than
 * once, so it must be a side-effect-free signal number of at least 1.
 */
#define sig_fatal(t, signr) \
        (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
         (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

void signals_init(void);

int restore_altstack(const stack_t __user *);
int __save_altstack(stack_t __user *, unsigned long);

/*
 * unsafe_save_altstack(uss, sp, label): copy the current task's alternate
 * signal stack settings (sas_ss_sp, sas_ss_flags, sas_ss_size) into the user
 * stack_t at uss using unsafe_put_user(), which is given 'label' as its
 * error target.
 *
 * The 'sp' argument is accepted but not used by this macro.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, so the macro behaves as a single statement and can be used in an
 * unbraced if/else.
 */
#define unsafe_save_altstack(uss, sp, label) do { \
        stack_t __user *__uss = (uss); \
        struct task_struct *t = current; \
        unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \
        unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
        unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
} while (0)

#ifdef CONFIG_DYNAMIC_SIGFRAME
bool sigaltstack_size_valid(size_t ss_size);
#else
/* Without CONFIG_DYNAMIC_SIGFRAME no check is made: every size is accepted. */
static inline bool sigaltstack_size_valid(size_t size)
{
        return true;
}
#endif /* !CONFIG_DYNAMIC_SIGFRAME */

#ifdef CONFIG_PROC_FS
struct seq_file;
extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
#endif

#ifndef arch_untagged_si_addr
/*
 * Given a fault address and a signal and si_code which correspond to the
 * _sigfault union member, returns the address that must appear in si_addr if
 * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags.
 *
 * This fallback is compiled only when arch_untagged_si_addr is not already
 * defined as a macro. It returns addr unchanged and ignores sig and si_code.
 */
static inline void __user *arch_untagged_si_addr(void __user *addr,
                                                 unsigned long sig,
                                                 unsigned long si_code)
{
        return addr;
}
#endif

#endif /* _LINUX_SIGNAL_H */














































































































































































































































































































































































































   39 





   39 


















































































































































































































































































































   39 



   39 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   39 






   39 





   39 

   39 







   39 




   39 















   39 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   39 











   39 



   39 




















   39 

   39 

   39 
   39 

   37 

   39 










   39 













   38 
















   39 




   39 



   39 





   39 




   39 



   39 

   39 
   38 





   39 

   39 
   39 







   39 







   39 









   39 

   39 

   39 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
// SPDX-License-Identifier: GPL-2.0+
/*
 *  Base port operations for 8250/16550-type serial ports
 *
 *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
 *  Split from 8250_core.c, Copyright (C) 2001 Russell King.
 *
 * A note about mapbase / membase
 *
 *  mapbase is the physical address of the IO port.
 *  membase is an 'ioremapped' cookie.
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/console.h>
#include <linux/gpio/consumer.h>
#include <linux/sysrq.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/tty.h>
#include <linux/ratelimit.h>
#include <linux/tty_flip.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
#include <linux/nmi.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/ktime.h>

#include <asm/io.h>
#include <asm/irq.h>

#include "8250.h"

/*
 * Here we define the default xmit fifo size used for each type of UART.
 *
 * The table is indexed by the PORT_* type constant.  Each entry may set:
 *   .name          printable name of the UART type
 *   .fifo_size     FIFO size
 *   .tx_loadsz     transmit load size
 *   .fcr           FCR value (UART_FCR_* bits)
 *   .rxtrig_bytes  four receive trigger levels, in bytes
 *   .flags         capability bits (UART_CAP_*, UART_NATSEMI)
 * Fields an entry does not name are left zero-initialized.
 */
static const struct serial8250_config uart_config[] = {
        [PORT_UNKNOWN] = {
                .name                = "unknown",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_8250] = {
                .name                = "8250",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16450] = {
                .name                = "16450",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16550] = {
                .name                = "16550",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16550A] = {
                .name                = "16550A",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO,
        },
        [PORT_CIRRUS] = {
                .name                = "Cirrus",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16650] = {
                .name                = "ST16650",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
                .flags                = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
        },
        [PORT_16650V2] = {
                .name                = "ST16650V2",
                .fifo_size        = 32,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
                                  UART_FCR_T_TRIG_00,
                .rxtrig_bytes        = {8, 16, 24, 28},
                .flags                = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
        },
        [PORT_16750] = {
                .name                = "TI16750",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
                                  UART_FCR7_64BYTE,
                .rxtrig_bytes        = {1, 16, 32, 56},
                .flags                = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE,
        },
        [PORT_STARTECH] = {
                .name                = "Startech",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16C950] = {
                .name                = "16C950/954",
                .fifo_size        = 128,
                .tx_loadsz        = 128,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01,
                .rxtrig_bytes        = {16, 32, 112, 120},
                /* UART_CAP_EFR breaks the Billionton CF bluetooth card. */
                .flags                = UART_CAP_FIFO | UART_CAP_SLEEP,
        },
        [PORT_16654] = {
                .name                = "ST16654",
                .fifo_size        = 64,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
                                  UART_FCR_T_TRIG_10,
                .rxtrig_bytes        = {8, 16, 56, 60},
                .flags                = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
        },
        [PORT_16850] = {
                .name                = "XR16850",
                .fifo_size        = 128,
                .tx_loadsz        = 128,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
        },
        [PORT_RSA] = {
                .name                = "RSA",
                .fifo_size        = 2048,
                .tx_loadsz        = 2048,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11,
                .flags                = UART_CAP_FIFO,
        },
        [PORT_NS16550A] = {
                .name                = "NS16550A",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_NATSEMI,
        },
        [PORT_XSCALE] = {
                .name                = "XScale",
                .fifo_size        = 32,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE,
        },
        [PORT_OCTEON] = {
                .name                = "OCTEON",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO,
        },
        [PORT_U6_16550A] = {
                .name                = "U6_16550A",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        [PORT_TEGRA] = {
                .name                = "Tegra",
                .fifo_size        = 32,
                .tx_loadsz        = 8,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
                                  UART_FCR_T_TRIG_01,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO | UART_CAP_RTOIE,
        },
        [PORT_XR17D15X] = {
                .name                = "XR17D15X",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR |
                                  UART_CAP_SLEEP,
        },
        [PORT_XR17V35X] = {
                .name                = "XR17V35X",
                .fifo_size        = 256,
                .tx_loadsz        = 256,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 |
                                  UART_FCR_T_TRIG_11,
                .flags                = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR |
                                  UART_CAP_SLEEP,
        },
        [PORT_LPC3220] = {
                .name                = "LPC3220",
                .fifo_size        = 64,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO |
                                  UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00,
                .flags                = UART_CAP_FIFO,
        },
        [PORT_BRCM_TRUMANAGE] = {
                .name                = "TruManage",
                .fifo_size        = 1,
                .tx_loadsz        = 1024,
                .flags                = UART_CAP_HFIFO,
        },
        /* Only the name is set here; every other field stays zero. */
        [PORT_8250_CIR] = {
                .name                = "CIR port"
        },
        [PORT_ALTR_16550_F32] = {
                .name                = "Altera 16550 FIFO32",
                .fifo_size        = 32,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 8, 16, 30},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        [PORT_ALTR_16550_F64] = {
                .name                = "Altera 16550 FIFO64",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 16, 32, 62},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        [PORT_ALTR_16550_F128] = {
                .name                = "Altera 16550 FIFO128",
                .fifo_size        = 128,
                .tx_loadsz        = 128,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 32, 64, 126},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        /*
         * tx_loadsz is set to 63 bytes instead of 64 bytes as a workaround
         * for erratum A-008006, which states that tx_loadsz should be
         * configured to less than the maximum supported FIFO size.
         */
        [PORT_16550A_FSL64] = {
                .name                = "16550A_FSL64",
                .fifo_size        = 64,
                .tx_loadsz        = 63,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
                                  UART_FCR7_64BYTE,
                .flags                = UART_CAP_FIFO | UART_CAP_NOTEMT,
        },
        [PORT_RT2880] = {
                .name                = "Palmchip BK-3103",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO,
        },
        [PORT_DA830] = {
                .name                = "TI DA8xx/66AK2x",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO |
                                  UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        [PORT_MTK_BTIF] = {
                .name                = "MediaTek BTIF",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO |
                                  UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT,
                .flags                = UART_CAP_FIFO,
        },
        [PORT_NPCM] = {
                .name                = "Nuvoton 16550",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
                                  UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO,
        },
        [PORT_SUNIX] = {
                .name                = "Sunix",
                .fifo_size        = 128,
                .tx_loadsz        = 128,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 32, 64, 112},
                .flags                = UART_CAP_FIFO | UART_CAP_SLEEP,
        },
        [PORT_ASPEED_VUART] = {
                .name                = "ASPEED VUART",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO,
        },
        [PORT_MCHP16550A] = {
                .name           = "MCHP16550A",
                .fifo_size      = 256,
                .tx_loadsz      = 256,
                .fcr            = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01,
                .rxtrig_bytes   = {2, 66, 130, 194},
                .flags          = UART_CAP_FIFO,
        },
        [PORT_BCM7271] = {
                .name                = "Broadcom BCM7271 UART",
                .fifo_size        = 32,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01,
                .rxtrig_bytes        = {1, 8, 16, 30},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
};

/*
 * Default divisor latch read: fetch UART_DLL, then UART_DLM, and return
 * them combined with DLM in bits 15:8 and DLL in bits 7:0.
 */
static u32 default_serial_dl_read(struct uart_8250_port *up)
{
        /* Each register value is narrowed to 8 bits before combining. */
        unsigned char lo = serial_in(up, UART_DLL);
        unsigned char hi = serial_in(up, UART_DLM);

        return (hi << 8) | lo;
}

/*
 * Default divisor latch write: bits 7:0 of @value go to UART_DLL first,
 * then bits 15:8 go to UART_DLM.
 */
static void default_serial_dl_write(struct uart_8250_port *up, u32 value)
{
        u32 lo = value & 0xff;
        u32 hi = (value >> 8) & 0xff;

        serial_out(up, UART_DLL, lo);
        serial_out(up, UART_DLM, hi);
}

#ifdef CONFIG_HAS_IOPORT
/*
 * UPIO_HUB6 register read: write the selector (hub6 - 1 + shifted offset)
 * to iobase, then read the value from iobase + 1.
 */
static u32 hub6_serial_in(struct uart_port *p, unsigned int offset)
{
        unsigned int reg = offset << p->regshift;

        outb(p->hub6 - 1 + reg, p->iobase);

        return inb(p->iobase + 1);
}

/*
 * UPIO_HUB6 register write: write the selector (hub6 - 1 + shifted offset)
 * to iobase, then write @value to iobase + 1.
 */
static void hub6_serial_out(struct uart_port *p, unsigned int offset, u32 value)
{
        unsigned int reg = offset << p->regshift;

        outb(p->hub6 - 1 + reg, p->iobase);
        outb(value, p->iobase + 1);
}
#endif /* CONFIG_HAS_IOPORT */

/* UPIO_MEM read: readb() at membase plus @offset shifted by regshift. */
static u32 mem_serial_in(struct uart_port *p, unsigned int offset)
{
        return readb(p->membase + (offset << p->regshift));
}

/* UPIO_MEM write: writeb() at membase plus @offset shifted by regshift. */
static void mem_serial_out(struct uart_port *p, unsigned int offset, u32 value)
{
        writeb(value, p->membase + (offset << p->regshift));
}

/* UPIO_MEM16 write: writew() at membase plus @offset shifted by regshift. */
static void mem16_serial_out(struct uart_port *p, unsigned int offset, u32 value)
{
        writew(value, p->membase + (offset << p->regshift));
}

/* UPIO_MEM16 read: readw() at membase plus @offset shifted by regshift. */
static u32 mem16_serial_in(struct uart_port *p, unsigned int offset)
{
        return readw(p->membase + (offset << p->regshift));
}

/* UPIO_MEM32 write: writel() at membase plus @offset shifted by regshift. */
static void mem32_serial_out(struct uart_port *p, unsigned int offset, u32 value)
{
        writel(value, p->membase + (offset << p->regshift));
}

/* UPIO_MEM32 read: readl() at membase plus @offset shifted by regshift. */
static u32 mem32_serial_in(struct uart_port *p, unsigned int offset)
{
        return readl(p->membase + (offset << p->regshift));
}

/* UPIO_MEM32BE write: iowrite32be() at membase plus the shifted @offset. */
static void mem32be_serial_out(struct uart_port *p, unsigned int offset, u32 value)
{
        iowrite32be(value, p->membase + (offset << p->regshift));
}

/* UPIO_MEM32BE read: ioread32be() at membase plus the shifted @offset. */
static u32 mem32be_serial_in(struct uart_port *p, unsigned int offset)
{
        return ioread32be(p->membase + (offset << p->regshift));
}

#ifdef CONFIG_HAS_IOPORT
/* UPIO_PORT read: inb() from iobase plus @offset shifted by regshift. */
static u32 io_serial_in(struct uart_port *p, unsigned int offset)
{
        unsigned int reg = offset << p->regshift;

        return inb(p->iobase + reg);
}

/* UPIO_PORT write: outb() of @value to iobase plus the shifted @offset. */
static void io_serial_out(struct uart_port *p, unsigned int offset, u32 value)
{
        unsigned int reg = offset << p->regshift;

        outb(value, p->iobase + reg);
}
#endif
/*
 * Fallback register read used when no real accessor is installed:
 * touches no hardware and always reports all bits set.
 */
static u32 no_serial_in(struct uart_port *p, unsigned int offset)
{
        const u32 all_ones = ~0U;

        return all_ones;
}

/*
 * Fallback register write used when no real accessor is installed:
 * the value is discarded and nothing is written.
 */
static void no_serial_out(struct uart_port *p, unsigned int offset, u32 value)
{
}

static int serial8250_default_handle_irq(struct uart_port *port);

static void set_io_from_upio(struct uart_port *p)
{
        struct uart_8250_port *up = up_to_u8250p(p);

        up->dl_read = default_serial_dl_read;
        up->dl_write = default_serial_dl_write;

        switch (p->iotype) {
#ifdef CONFIG_HAS_IOPORT
        case UPIO_HUB6:
                p->serial_in = hub6_serial_in;
                p->serial_out = hub6_serial_out;
                break;
#endif

        case UPIO_MEM:
                p->serial_in = mem_serial_in;
                p->serial_out = mem_serial_out;
                break;

        case UPIO_MEM16:
                p->serial_in = mem16_serial_in;
                p->serial_out = mem16_serial_out;
                break;

        case UPIO_MEM32:
                p->serial_in = mem32_serial_in;
                p->serial_out = mem32_serial_out;
                break;

        case UPIO_MEM32BE:
                p->serial_in = mem32be_serial_in;
                p->serial_out = mem32be_serial_out;
                break;
#ifdef CONFIG_HAS_IOPORT
        case UPIO_PORT:
                p->serial_in = io_serial_in;
                p->serial_out = io_serial_out;
                break;
#endif
        default:
                WARN(p->iotype != UPIO_PORT || p->iobase,
                     "Unsupported UART type %x\n", p->iotype);
                p->serial_in = no_serial_in;
                p->serial_out = no_serial_out;
        }
        /* Remember loaded iotype */
        up->cur_iotype = p->iotype;
        p->handle_irq = serial8250_default_handle_irq;
}

/*
 * Write @value to register @offset.  For the memory-mapped iotypes and
 * UPIO_AU the write is followed by a read of UART_LCR (presumably so the
 * write has reached the device before returning - confirm).
 */
static void
serial_port_out_sync(struct uart_port *p, int offset, int value)
{
        p->serial_out(p, offset, value);

        switch (p->iotype) {
        case UPIO_MEM:
        case UPIO_MEM16:
        case UPIO_MEM32:
        case UPIO_MEM32BE:
        case UPIO_AU:
                /* Reading LCR is safe: it has no side-effects. */
                p->serial_in(p, UART_LCR);
                break;
        default:
                break;
        }
}

/*
 * FIFO support.
 *
 * On ports with UART_CAP_FIFO: enable the FIFOs, then write the enable bit
 * together with both clear bits, then write 0 to FCR.  Ports without the
 * capability are left untouched.
 */
static void serial8250_clear_fifos(struct uart_8250_port *p)
{
        if (!(p->capabilities & UART_CAP_FIFO))
                return;

        serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO);
        serial_out(p, UART_FCR,
                   UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
        serial_out(p, UART_FCR, 0);
}

static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t);
static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t);

/*
 * Clear the FIFOs via serial8250_clear_fifos(), then write the value held
 * in p->fcr back to UART_FCR.
 */
void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p)
{
        serial8250_clear_fifos(p);
        /* Must come after the clear, which leaves FCR written as 0. */
        serial_out(p, UART_FCR, p->fcr);
}
EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos);

/*
 * Take a runtime PM reference on the port's device via
 * pm_runtime_get_sync().  Does nothing for ports without UART_CAP_RPM.
 */
void serial8250_rpm_get(struct uart_8250_port *p)
{
        if (p->capabilities & UART_CAP_RPM)
                pm_runtime_get_sync(p->port.dev);
}
EXPORT_SYMBOL_GPL(serial8250_rpm_get);

/*
 * Counterpart of serial8250_rpm_get(): mark the device as recently busy
 * and drop the reference with pm_runtime_put_autosuspend().  Does nothing
 * for ports without UART_CAP_RPM.
 */
void serial8250_rpm_put(struct uart_8250_port *p)
{
        if (p->capabilities & UART_CAP_RPM) {
                pm_runtime_mark_last_busy(p->port.dev);
                pm_runtime_put_autosuspend(p->port.dev);
        }
}
EXPORT_SYMBOL_GPL(serial8250_rpm_put);

/**
 *        serial8250_em485_init() - put uart_8250_port into rs485 emulating
 *        @p:        uart_8250_port port instance
 *
 *        Starts software rs485 emulation on the &struct uart_8250_port* @p,
 *        i.e. RTS is switched before/after transmission. On the first call
 *        the emulation state is allocated and its two hrtimers are set up;
 *        later calls reuse the existing state, so the function is idempotent
 *        and safe to call multiple times.
 *
 *        The caller MUST enable interrupt on empty shift register before
 *        calling serial8250_em485_init(). This interrupt is not a part of
 *        8250 standard, but implementation defined.
 *
 *        The function is supposed to be called from .rs485_config callback
 *        or from any other callback protected with p->port.lock spinlock.
 *
 *        See also serial8250_em485_destroy()
 *
 *        Return 0 - success, -errno - otherwise
 */
static int serial8250_em485_init(struct uart_8250_port *p)
{
        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&p->port.lock);

        if (!p->em485) {
                /* GFP_ATOMIC: the port spinlock is held here. */
                p->em485 = kmalloc(sizeof(*p->em485), GFP_ATOMIC);
                if (!p->em485)
                        return -ENOMEM;

                hrtimer_setup(&p->em485->stop_tx_timer,
                              &serial8250_em485_handle_stop_tx,
                              CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                hrtimer_setup(&p->em485->start_tx_timer,
                              &serial8250_em485_handle_start_tx,
                              CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                p->em485->port = p;
                p->em485->active_timer = NULL;
                p->em485->tx_stopped = true;
        }

        /*
         * While no transmission is in progress, run the stop-tx hook
         * (presumably this deasserts RTS - see the port's rs485_stop_tx).
         */
        if (p->em485->tx_stopped)
                p->rs485_stop_tx(p, true);

        return 0;
}

/**
 *        serial8250_em485_destroy() - put uart_8250_port into normal state
 *        @p:        uart_8250_port port instance
 *
 *        Stops software rs485 emulation on the &struct uart_8250_port* @p:
 *        both emulation timers are cancelled and the emulation state is
 *        freed. Nothing happens when emulation is not active, so the function
 *        is idempotent and safe to call multiple times.
 *
 *        The function is supposed to be called from .rs485_config callback
 *        or from any other callback protected with p->port.lock spinlock.
 *
 *        See also serial8250_em485_init()
 */
void serial8250_em485_destroy(struct uart_8250_port *p)
{
        if (p->em485) {
                hrtimer_cancel(&p->em485->start_tx_timer);
                hrtimer_cancel(&p->em485->stop_tx_timer);

                kfree(p->em485);
                /* Cleared so that a repeated call is a no-op. */
                p->em485 = NULL;
        }
}
EXPORT_SYMBOL_GPL(serial8250_em485_destroy);

/*
 * rs485 settings advertised for ports using the software emulation above.
 * NOTE(review): the two delay fields are set to 1, presumably as
 * "supported" markers rather than actual delay values - confirm against
 * the serial core's handling of rs485_supported.
 */
struct serial_rs485 serial8250_em485_supported = {
        .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND |
                 SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX,
        .delay_rts_before_send = 1,
        .delay_rts_after_send = 1,
};
EXPORT_SYMBOL_GPL(serial8250_em485_supported);

/**
 * serial8250_em485_config() - generic ->rs485_config() callback
 * @port: uart port
 * @termios: termios structure
 * @rs485: rs485 settings
 *
 * Generic callback usable by 8250 uart drivers to activate rs485 settings
 * if the uart is incapable of driving RTS as a Transmit Enable signal in
 * hardware, relying on software emulation instead.
 *
 * Return: the result of serial8250_em485_init() when SER_RS485_ENABLED is
 * set in @rs485->flags; otherwise the emulation is torn down and 0 is
 * returned.
 */
int serial8250_em485_config(struct uart_port *port, struct ktermios *termios,
                            struct serial_rs485 *rs485)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        /*
         * Neither branch needs to check the current state first: both
         * serial8250_em485_init() and serial8250_em485_destroy() are
         * idempotent.
         */
        if (!(rs485->flags & SER_RS485_ENABLED)) {
                serial8250_em485_destroy(up);
                return 0;
        }

        return serial8250_em485_init(up);
}
EXPORT_SYMBOL_GPL(serial8250_em485_config);

/*
 * TX runtime PM wrappers.  serial8250_rpm_get_tx() may be called more than
 * once; only the call that flips rpm_tx_active from 0 to 1 takes a runtime
 * PM reference, so a single serial8250_rpm_put_tx() is enough to let the
 * HW idle again once the fifo is empty.
 */
static void serial8250_rpm_get_tx(struct uart_8250_port *p)
{
        if (!(p->capabilities & UART_CAP_RPM))
                return;

        /* Already marked active: the reference is held by an earlier call. */
        if (xchg(&p->rpm_tx_active, 1))
                return;

        pm_runtime_get_sync(p->port.dev);
}

/*
 * Drop the TX runtime PM reference.  Only the call that flips
 * rpm_tx_active from 1 to 0 releases the reference; calls made while it is
 * already 0, or on ports without UART_CAP_RPM, do nothing.
 */
static void serial8250_rpm_put_tx(struct uart_8250_port *p)
{
        if (!(p->capabilities & UART_CAP_RPM))
                return;

        /* Not marked active: there is no reference to drop. */
        if (!xchg(&p->rpm_tx_active, 0))
                return;

        pm_runtime_mark_last_busy(p->port.dev);
        pm_runtime_put_autosuspend(p->port.dev);
}

/*
 * IER sleep support.
 *
 * Writes UART_IERX_SLEEP (when @sleep is non-zero) or 0 to IER on ports
 * with UART_CAP_SLEEP; other ports are left alone.  UARTs which have EFRs
 * need the "extended capability" bit enabled first, so on UART_CAP_EFR
 * ports UART_EFR_ECB is set before the IER write and the previous EFR and
 * LCR values are restored afterwards.  Note that on XR16C850s, we need to
 * reset LCR to write to IER.
 */
static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
{
        unsigned char saved_lcr = 0, saved_efr = 0;
        int ier;

        guard(serial8250_rpm)(p);

        if (!(p->capabilities & UART_CAP_SLEEP))
                return;

        /* Synchronize UART_IER access against the console. */
        guard(uart_port_lock_irq)(&p->port);

        if (p->capabilities & UART_CAP_EFR) {
                saved_lcr = serial_in(p, UART_LCR);
                saved_efr = serial_in(p, UART_EFR);
                serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
                serial_out(p, UART_EFR, UART_EFR_ECB);
                serial_out(p, UART_LCR, 0);
        }

        if (sleep)
                ier = UART_IERX_SLEEP;
        else
                ier = 0;
        serial_out(p, UART_IER, ier);

        if (p->capabilities & UART_CAP_EFR) {
                serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
                serial_out(p, UART_EFR, saved_efr);
                serial_out(p, UART_LCR, saved_lcr);
        }
}

/*
 * Clear the interrupt registers by reading LSR, RX, IIR and MSR, in that
 * order; the values read are discarded.
 */
static void serial8250_clear_interrupts(struct uart_port *port)
{
        static const int regs[] = { UART_LSR, UART_RX, UART_IIR, UART_MSR };
        unsigned int i;

        for (i = 0; i < sizeof(regs) / sizeof(regs[0]); i++)
                serial_port_in(port, regs[i]);
}

/*
 * Clear IER.  On ports with UART_CAP_UUE the UART_IER_UUE bit is kept set
 * and only the remaining bits are cleared.
 */
static void serial8250_clear_IER(struct uart_8250_port *up)
{
        int ier = 0;

        if (up->capabilities & UART_CAP_UUE)
                ier = UART_IER_UUE;

        serial_out(up, UART_IER, ier);
}

/*
 * This is a quickie test to see how big the FIFO is.
 * It doesn't work all the time, more's the pity.
 *
 * With the UART in loopback mode, 256 bytes are written to TX and the
 * number of bytes that can then be read back while LSR reports data ready
 * (at most 256) is returned.  LCR, FCR, MCR and the divisor are saved
 * beforehand and restored afterwards.
 */
static int size_fifo(struct uart_8250_port *up)
{
        unsigned char old_fcr, old_mcr, old_lcr;
        u32 old_dl;
        int count;

        /* Save the current state and set up FIFOs + loopback. */
        old_lcr = serial_in(up, UART_LCR);
        serial_out(up, UART_LCR, 0);
        old_fcr = serial_in(up, UART_FCR);
        old_mcr = serial8250_in_MCR(up);
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
                    UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
        serial8250_out_MCR(up, UART_MCR_LOOP);
        /* Save the divisor and program a divisor of 1 for the test. */
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
        old_dl = serial_dl_read(up);
        serial_dl_write(up, 0x0001);
        serial_out(up, UART_LCR, UART_LCR_WLEN8);
        /* Push 256 bytes at the transmitter, then give them time to loop back. */
        for (count = 0; count < 256; count++)
                serial_out(up, UART_TX, count);
        mdelay(20);/* FIXME - schedule_timeout */
        /* Count how many bytes can be drained from the receiver. */
        for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) &&
             (count < 256); count++)
                serial_in(up, UART_RX);
        /* Restore the saved state. */
        serial_out(up, UART_FCR, old_fcr);
        serial8250_out_MCR(up, old_mcr);
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
        serial_dl_write(up, old_dl);
        serial_out(up, UART_LCR, old_lcr);

        return count;
}

/*
 * Read UART ID using the divisor method - set DLL and DLM to zero
 * and the revision will be in DLL and device type in DLM.  The
 * previous LCR and divisor values are saved first and written back
 * before returning, so the device state is preserved across this.
 *
 * Returns the divisor latch value read back after writing zero.
 */
static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p)
{
        unsigned int id, saved_dl;
        unsigned char saved_lcr;

        saved_lcr = serial_in(p, UART_LCR);
        serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A);

        saved_dl = serial_dl_read(p);
        serial_dl_write(p, 0);
        id = serial_dl_read(p);
        serial_dl_write(p, saved_dl);

        serial_out(p, UART_LCR, saved_lcr);

        return id;
}

/*
 * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's.
 * When this function is called we know it is at least a StarTech
 * 16650 V2, but it might be one of several StarTech UARTs, or one of
 * its clones.  (We treat the broken original StarTech 16650 V1 as a
 * 16550, and why not?  Startech doesn't seem to even acknowledge its
 * existence.)
 *
 * On return up->port.type is one of PORT_16C950, PORT_16850, PORT_16654
 * or PORT_16650V2, and UART_CAP_EFR | UART_CAP_SLEEP have been added to
 * up->capabilities.
 *
 * What evil have men's minds wrought...
 */
static void autoconfig_has_efr(struct uart_8250_port *up)
{
        unsigned int id1, id2, id3, rev;

        /*
         * Everything with an EFR has SLEEP
         */
        up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP;

        /*
         * First we check to see if it's an Oxford Semiconductor UART.
         *
         * We have to do this here because some non-National
         * Semiconductor clone chips lock up if you try writing to the
         * LSR register (which serial_icr_read does)
         */

        /*
         * Check for Oxford Semiconductor 16C950.
         *
         * EFR [4] must be set else this test fails.
         *
         * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca)
         * claims that it's needed for 952 dual UART's (which are not
         * recommended for new designs).
         */
        up->acr = 0;
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
        serial_out(up, UART_EFR, UART_EFR_ECB);
        serial_out(up, UART_LCR, 0x00);
        id1 = serial_icr_read(up, UART_ID1);
        id2 = serial_icr_read(up, UART_ID2);
        id3 = serial_icr_read(up, UART_ID3);
        rev = serial_icr_read(up, UART_REV);

        /* ID bytes 0x16, 0xC9, then 0x50/0x52/0x54 identify the 16C950 family. */
        if (id1 == 0x16 && id2 == 0xC9 &&
            (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) {
                up->port.type = PORT_16C950;

                /*
                 * Enable work around for the Oxford Semiconductor 952 rev B
                 * chip which causes it to seriously miscalculate baud rates
                 * when DLL is 0.
                 */
                if (id3 == 0x52 && rev == 0x01)
                        up->bugs |= UART_BUG_QUOT;
                return;
        }

        /*
         * We check for a XR16C850 by setting DLL and DLM to 0, and then
         * reading back DLL and DLM.  The chip type depends on the DLM
         * value read back:
         *  0x10 - XR16C850 and the DLL contains the chip revision.
         *  0x12 - XR16C2850.
         *  0x14 - XR16C854.
         */
        id1 = autoconfig_read_divisor_id(up);

        /* The DLM value is the upper byte of the ID read back. */
        id2 = id1 >> 8;
        if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) {
                up->port.type = PORT_16850;
                return;
        }

        /*
         * It wasn't an XR16C850.
         *
         * We distinguish between the '654 and the '650 by counting
         * how many bytes are in the FIFO.  I'm using this for now,
         * since that's the technique that was sent to me in the
         * serial driver update, but I'm not convinced this works.
         * I've had problems doing this in the past.  -TYT
         */
        if (size_fifo(up) == 64)
                up->port.type = PORT_16654;
        else
                up->port.type = PORT_16650V2;
}

/*
 * We detected a chip without a FIFO.  Only two fall into
 * this category - the original 8250 and the 16450.  The
 * 16450 has a scratch register (accessible with LCR=0).
 *
 * The port type is set to PORT_8250 and upgraded to PORT_16450 only if
 * two test patterns written to the scratch register both read back
 * intact.  The original scratch value is restored either way.
 */
static void autoconfig_8250(struct uart_8250_port *up)
{
        unsigned char saved, first, second;

        up->port.type = PORT_8250;

        saved = serial_in(up, UART_SCR);

        serial_out(up, UART_SCR, 0xa5);
        first = serial_in(up, UART_SCR);

        serial_out(up, UART_SCR, 0x5a);
        second = serial_in(up, UART_SCR);

        serial_out(up, UART_SCR, saved);

        /* No working scratch register: leave the type as PORT_8250. */
        if (first != 0xa5 || second != 0x5a)
                return;

        up->port.type = PORT_16450;
}

/*
 * Returns 1 when the UART reports a divisor ID of 0x0201 and a measured
 * FIFO size of 16, otherwise 0.  The FIFO is only sized when the ID matches.
 */
static int broken_efr(struct uart_8250_port *up)
{
        /*
         * Exar ST16C2550 "A2" devices incorrectly detect as
         * having an EFR, and report an ID of 0x0201.  See
         * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html
         */
        return autoconfig_read_divisor_id(up) == 0x0201 &&
               size_fifo(up) == 16;
}

/*
 * We know that the chip has FIFOs.  Does it have an EFR?  The
 * EFR is located in the same register position as the IIR and
 * we know the top two bits of the IIR are currently set.  The
 * EFR should contain zero.  Try to read the EFR.
 *
 * The port starts out as PORT_16550A with UART_CAP_FIFO.  Unless variant
 * probing is disabled, the following checks run in order and the function
 * returns from the first one that applies:
 *   - EFR readable as zero with LCR = UART_LCR_CONF_MODE_A
 *     (PORT_16650, PORT_16550A_FSL64, or the type is left at PORT_16550A);
 *   - EFR readable as zero with LCR = UART_LCR_CONF_MODE_B and not a
 *     broken_efr() part (handed over to autoconfig_has_efr());
 *   - National Semiconductor SuperIO (PORT_NS16550A);
 *   - TI16750 (PORT_16750);
 *   - Xscale internal UART (PORT_XSCALE);
 *   - 16550A whose size_fifo() is 64 (PORT_U6_16550A).
 *
 * The caller must hold the port lock.
 */
static void autoconfig_16550a(struct uart_8250_port *up)
{
        unsigned char status1, status2;
        unsigned int iersave;

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&up->port.lock);

        up->port.type = PORT_16550A;
        up->capabilities |= UART_CAP_FIFO;

        /*
         * Stop at the generic 16550A unless variant probing is built in
         * (CONFIG_SERIAL_8250_16550A_VARIANTS) or the port requested a
         * full probe (UPF_FULL_PROBE).
         */
        if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) &&
            !(up->port.flags & UPF_FULL_PROBE))
                return;

        /*
         * Check for presence of the EFR when DLAB is set.
         * Only ST16C650V1 UARTs pass this test.
         */
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
        if (serial_in(up, UART_EFR) == 0) {
                /* See whether a value written to the EFR can be read back. */
                serial_out(up, UART_EFR, 0xA8);
                if (serial_in(up, UART_EFR) != 0) {
                        up->port.type = PORT_16650;
                        up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP;
                } else {
                        /*
                         * The write did not stick.  Request 64 byte FIFO
                         * mode with LCR = 0 and check whether the IIR
                         * reports it.
                         */
                        serial_out(up, UART_LCR, 0);
                        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
                                   UART_FCR7_64BYTE);
                        status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750;
                        serial_out(up, UART_FCR, 0);
                        serial_out(up, UART_LCR, 0);

                        if (status1 == UART_IIR_FIFO_ENABLED_16750)
                                up->port.type = PORT_16550A_FSL64;
                }
                serial_out(up, UART_EFR, 0);
                return;
        }

        /*
         * Maybe it requires 0xbf to be written to the LCR.
         * (other ST16C650V2 UARTs, TI16C752A, etc)
         */
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
        if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) {
                autoconfig_has_efr(up);
                return;
        }

        /*
         * Check for a National Semiconductor SuperIO chip.
         * Attempt to switch to bank 2, read the value of the LOOP bit
         * from EXCR1. Switch back to bank 0, change it in MCR. Then
         * switch back to bank 2, read it from EXCR1 again and check
         * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2
         */
        serial_out(up, UART_LCR, 0);
        status1 = serial8250_in_MCR(up);
        serial_out(up, UART_LCR, 0xE0);
        status2 = serial_in(up, 0x02); /* EXCR1 */

        /* Only continue if the LOOP bit reads the same in MCR and EXCR1. */
        if (!((status2 ^ status1) & UART_MCR_LOOP)) {
                serial_out(up, UART_LCR, 0);
                serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP);
                serial_out(up, UART_LCR, 0xE0);
                status2 = serial_in(up, 0x02); /* EXCR1 */
                serial_out(up, UART_LCR, 0);
                /* Restore the original MCR value. */
                serial8250_out_MCR(up, status1);

                /*
                 * status1 still holds the original MCR value, so a
                 * difference here means EXCR1 followed the toggled bit.
                 */
                if ((status2 ^ status1) & UART_MCR_LOOP) {
                        unsigned short quot;

                        serial_out(up, UART_LCR, 0xE0);

                        /*
                         * NOTE(review): the divisor is multiplied by 8,
                         * presumably to keep the current baud rate once
                         * the faster base clock is selected - confirm.
                         */
                        quot = serial_dl_read(up);
                        quot <<= 3;

                        if (ns16550a_goto_highspeed(up))
                                serial_dl_write(up, quot);

                        serial_out(up, UART_LCR, 0);

                        up->port.uartclk = 921600*16;
                        up->port.type = PORT_NS16550A;
                        up->capabilities |= UART_NATSEMI;
                        return;
                }
        }

        /*
         * No EFR.  Try to detect a TI16750, which only sets bit 5 of
         * the IIR when 64 byte FIFO mode is enabled when DLAB is set.
         * Try setting it with and without DLAB set.  Cheap clones
         * set bit 5 without DLAB set.
         */
        serial_out(up, UART_LCR, 0);
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE);
        status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750;
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);

        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE);
        status2 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750;
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);

        serial_out(up, UART_LCR, 0);

        /* status1 was sampled with LCR = 0, status2 with CONF_MODE_A. */
        if (status1 == UART_IIR_FIFO_ENABLED_16550A &&
            status2 == UART_IIR_FIFO_ENABLED_16750) {
                up->port.type = PORT_16750;
                up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP;
                return;
        }

        /*
         * Try writing and reading the UART_IER_UUE bit (b6).
         * If it works, this is probably one of the Xscale platform's
         * internal UARTs.
         * We're going to explicitly set the UUE bit to 0 before
         * trying to write and read a 1 just to make sure it's not
         * already a 1 and maybe locked there before we even start.
         */
        iersave = serial_in(up, UART_IER);
        serial_out(up, UART_IER, iersave & ~UART_IER_UUE);
        if (!(serial_in(up, UART_IER) & UART_IER_UUE)) {
                /*
                 * OK it's in a known zero state, try writing and reading
                 * without disturbing the current state of the other bits.
                 */
                serial_out(up, UART_IER, iersave | UART_IER_UUE);
                if (serial_in(up, UART_IER) & UART_IER_UUE) {
                        /*
                         * It's an Xscale.
                         * We'll leave the UART_IER_UUE bit set to 1 (enabled).
                         */
                        up->port.type = PORT_XSCALE;
                        up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE;
                        return;
                }
        }
        /* Not an Xscale: restore the IER value read before the test. */
        serial_out(up, UART_IER, iersave);

        /*
         * We distinguish between 16550A and U6 16550A by counting
         * how many bytes are in the FIFO.
         */
        if (up->port.type == PORT_16550A && size_fifo(up) == 64) {
                up->port.type = PORT_U6_16550A;
                up->capabilities |= UART_CAP_AFE;
        }
}

/*
 * This routine is called by rs_init() to initialize a specific serial
 * port.  It determines what type of UART chip this serial port is
 * using: 8250, 16450, 16550, 16550A.  The important question is
 * whether or not this UART is a 16550A or not, since this will
 * determine whether or not we can use its FIFO features or not.
 *
 * Nothing is done when none of iobase, mapbase and membase is set.  The
 * probing itself runs with the port lock held and interrupts disabled.
 * On return port->type holds the detected type, and port->fifosize,
 * up->capabilities and up->tx_loadsz are taken from the uart_config[]
 * entry for that type.  If the existence or loopback test fails, the
 * function returns early without assigning port->type.
 */
static void autoconfig(struct uart_8250_port *up)
{
        unsigned char status1, scratch, scratch2, scratch3;
        unsigned char save_lcr, save_mcr;
        struct uart_port *port = &up->port;
        unsigned long flags;
        unsigned int old_capabilities;

        /* No address of any kind configured: nothing to probe. */
        if (!port->iobase && !port->mapbase && !port->membase)
                return;

        /*
         * We really do need global IRQs disabled here - we're going to
         * be frobbing the chip's IRQ enable register to see if it exists.
         *
         * Synchronize UART_IER access against the console.
         */
        uart_port_lock_irqsave(port, &flags);

        /* Detection starts from a clean slate. */
        up->capabilities = 0;
        up->bugs = 0;

        if (!(port->flags & UPF_BUGGY_UART)) {
                /*
                 * Do a simple existence test first; if we fail this,
                 * there's no point trying anything else.
                 *
                 * 0x80 is used as a nonsense port to prevent against
                 * false positives due to ISA bus float.  The
                 * assumption is that 0x80 is a non-existent port;
                 * which should be safe since include/asm/io.h also
                 * makes this assumption.
                 *
                 * Note: this is safe as long as MCR bit 4 is clear
                 * and the device is in "PC" mode.
                 */
                scratch = serial_in(up, UART_IER);
                serial_out(up, UART_IER, 0);
#if defined(__i386__) && defined(CONFIG_HAS_IOPORT)
                outb(0xff, 0x080);
#endif
                /*
                 * Mask out IER[7:4] bits for test as some UARTs (e.g. TL
                 * 16C754B) allow only to modify them if an EFR bit is set.
                 */
                scratch2 = serial_in(up, UART_IER) & UART_IER_ALL_INTR;
                serial_out(up, UART_IER, UART_IER_ALL_INTR);
#if defined(__i386__) && defined(CONFIG_HAS_IOPORT)
                outb(0, 0x080);
#endif
                scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR;
                /* Restore the IER value found on entry. */
                serial_out(up, UART_IER, scratch);
                /*
                 * A real IER reads back 0 after writing 0 and
                 * UART_IER_ALL_INTR after writing UART_IER_ALL_INTR.
                 */
                if (scratch2 != 0 || scratch3 != UART_IER_ALL_INTR) {
                        /*
                         * We failed; there's nothing here
                         */
                        uart_port_unlock_irqrestore(port, flags);
                        return;
                }
        }

        save_mcr = serial8250_in_MCR(up);
        save_lcr = serial_in(up, UART_LCR);

        /*
         * Check to see if a UART is really there.  Certain broken
         * internal modems based on the Rockwell chipset fail this
         * test, because they apparently don't implement the loopback
         * test mode.  So this test is skipped on the COM 1 through
         * COM 4 ports.  This *should* be safe, since no board
         * manufacturer would be stupid enough to design a board
         * that conflicts with COM 1-4 --- we hope!
         */
        if (!(port->flags & UPF_SKIP_TEST)) {
                /*
                 * With LOOP, OUT2 and RTS set in the MCR, the MSR status
                 * bits are expected to read back as exactly DCD and CTS.
                 */
                serial8250_out_MCR(up, UART_MCR_LOOP | UART_MCR_OUT2 | UART_MCR_RTS);
                status1 = serial_in(up, UART_MSR) & UART_MSR_STATUS_BITS;
                serial8250_out_MCR(up, save_mcr);
                if (status1 != (UART_MSR_DCD | UART_MSR_CTS)) {
                        uart_port_unlock_irqrestore(port, flags);
                        return;
                }
        }

        /*
         * We're pretty sure there's a port here.  Let's find out what
         * type of port it is.  The IIR top two bits allows us to find
         * out if it's 8250 or 16450, 16550, 16550A or later.  This
         * determines what we test for next.
         *
         * We also initialise the EFR (if any) to zero for later.  The
         * EFR occupies the same register location as the FCR and IIR.
         */
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
        serial_out(up, UART_EFR, 0);
        serial_out(up, UART_LCR, 0);

        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);

        /* Dispatch on what the IIR reports after enabling the FIFO. */
        switch (serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED) {
        case UART_IIR_FIFO_ENABLED_8250:
                autoconfig_8250(up);
                break;
        case UART_IIR_FIFO_ENABLED_16550:
                port->type = PORT_16550;
                break;
        case UART_IIR_FIFO_ENABLED_16550A:
                autoconfig_16550a(up);
                break;
        default:
                port->type = PORT_UNKNOWN;
                break;
        }

        rsa_autoconfig(up);

        serial_out(up, UART_LCR, save_lcr);

        /*
         * Take the final settings from the uart_config[] entry of the
         * detected type.  The capabilities collected while probing are
         * kept in old_capabilities so the two can be compared below.
         */
        port->fifosize = uart_config[up->port.type].fifo_size;
        old_capabilities = up->capabilities;
        up->capabilities = uart_config[port->type].flags;
        up->tx_loadsz = uart_config[port->type].tx_loadsz;

        if (port->type != PORT_UNKNOWN) {
                /*
                 * Reset the UART.
                 */
                rsa_reset(up);
                serial8250_out_MCR(up, save_mcr);
                serial8250_clear_fifos(up);
                serial_in(up, UART_RX);
                serial8250_clear_IER(up);
        }

        uart_port_unlock_irqrestore(port, flags);

        /*
         * Check if the device is a Fintek F81216A
         */
        if (port->type == PORT_16550A && port->iotype == UPIO_PORT)
                fintek_8250_probe(up);

        /*
         * Warn when the capabilities found by probing differ from the
         * ones listed in uart_config[] for the detected type.
         */
        if (up->capabilities != old_capabilities) {
                dev_warn(port->dev, "detected caps %08x should be %08x\n",
                         old_capabilities, up->capabilities);
        }
}

/*
 * Probe for the interrupt line used by the port and record it in
 * port->irq.
 *
 * All UART interrupt sources are enabled and a byte is written to the
 * transmitter inside a probe_irq_on()/probe_irq_off() window.  The MCR
 * and IER values found on entry are restored afterwards.  port->irq is
 * set to the value returned by probe_irq_off() when that is positive and
 * to 0 otherwise.
 */
static void autoconfig_irq(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        unsigned char save_mcr, save_ier;
        unsigned char save_ICP = 0;
        unsigned int ICP = 0;
        unsigned long irqs;
        int irq;

        /*
         * UPF_FOURPORT boards: ICP is an I/O address derived from the
         * port's iobase.  Its current value is saved here and written
         * back at the end.
         * NOTE(review): presumably the board's interrupt control port -
         * confirm.
         */
        if (port->flags & UPF_FOURPORT) {
                ICP = (port->iobase & 0xfe0) | 0x1f;
                save_ICP = inb_p(ICP);
                outb_p(0x80, ICP);
                inb_p(ICP);
        }

        /* forget possible initially masked and pending IRQ */
        probe_irq_off(probe_irq_on());
        save_mcr = serial8250_in_MCR(up);
        /* Synchronize UART_IER access against the console. */
        scoped_guard(uart_port_lock_irq, port)
                save_ier = serial_in(up, UART_IER);
        serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2);

        /* Start of the probe window. */
        irqs = probe_irq_on();
        serial8250_out_MCR(up, 0);
        udelay(10);
        /* UPF_FOURPORT ports are probed without UART_MCR_OUT2. */
        if (port->flags & UPF_FOURPORT) {
                serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS);
        } else {
                serial8250_out_MCR(up,
                        UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2);
        }
        /* Synchronize UART_IER access against the console. */
        scoped_guard(uart_port_lock_irq, port)
                serial_out(up, UART_IER, UART_IER_ALL_INTR);
        serial8250_clear_interrupts(port);
        /* Write a byte to the transmitter so the UART raises its interrupt. */
        serial_out(up, UART_TX, 0xFF);
        udelay(20);
        irq = probe_irq_off(irqs);

        /* Undo the register changes made for the probe. */
        serial8250_out_MCR(up, save_mcr);
        /* Synchronize UART_IER access against the console. */
        scoped_guard(uart_port_lock_irq, port)
                serial_out(up, UART_IER, save_ier);

        if (port->flags & UPF_FOURPORT)
                outb_p(save_ICP, ICP);

        /* Anything that is not a positive IRQ number is recorded as 0. */
        port->irq = (irq > 0) ? irq : 0;
}

/* Mask the receive-data and receiver-line-status interrupts of @port. */
static void serial8250_stop_rx(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        /* UART_IER is shared with the console, so the port lock is required. */
        lockdep_assert_held_once(&port->lock);

        guard(serial8250_rpm)(u8250);

        u8250->ier &= ~(UART_IER_RDI | UART_IER_RLSI);
        serial_port_out(port, UART_IER, u8250->ier);
}

/**
 * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback
 * @p: uart 8250 port
 * @toggle_ier: true to allow enabling receive interrupts
 *
 * Generic callback usable by 8250 uart drivers to stop rs485 transmission.
 * RTS is set or cleared according to SER_RS485_RTS_AFTER_SEND.  Unless
 * SER_RS485_RX_DURING_TX is set, the FIFOs are cleared and reinitialised
 * and, if @toggle_ier is true, the receive interrupts are enabled again.
 */
void serial8250_em485_stop_tx(struct uart_8250_port *p, bool toggle_ier)
{
        unsigned char mcr = serial8250_in_MCR(p);

        /* UART_IER is shared with the console, so the port lock is required. */
        lockdep_assert_held_once(&p->port.lock);

        mcr = (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND) ?
                (mcr | UART_MCR_RTS) : (mcr & ~UART_MCR_RTS);
        serial8250_out_MCR(p, mcr);

        /* RX stayed active during TX: nothing to flush or re-enable. */
        if (p->port.rs485.flags & SER_RS485_RX_DURING_TX)
                return;

        /*
         * Empty the RX FIFO; nothing received during the half-duplex
         * transmission is of interest.
         */
        serial8250_clear_and_reinit_fifos(p);

        if (!toggle_ier)
                return;

        /* Enable the previously disabled RX interrupts. */
        p->ier |= UART_IER_RDI | UART_IER_RLSI;
        serial_port_out(&p->port, UART_IER, p->ier);
}
EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx);

/*
 * hrtimer callback for em485->stop_tx_timer: performs the deferred
 * ->rs485_stop_tx() call if this timer is still the active one.
 */
static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t)
{
        struct uart_8250_em485 *em485 =
                container_of(t, struct uart_8250_em485, stop_tx_timer);
        struct uart_8250_port *p = em485->port;

        guard(serial8250_rpm)(p);
        guard(uart_port_lock_irqsave)(&p->port);

        /* The stop timer is no longer the active one: nothing to do. */
        if (em485->active_timer != &em485->stop_tx_timer)
                return HRTIMER_NORESTART;

        p->rs485_stop_tx(p, true);
        em485->active_timer = NULL;
        em485->tx_stopped = true;

        return HRTIMER_NORESTART;
}

static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec)
{
        hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL);
}

/*
 * Stop an rs485 transmission, either right away or after a delay.
 * @stop_delay is in nanoseconds; the configured delay_rts_after_send
 * (milliseconds) is added on top of it.
 */
static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay)
{
        struct uart_8250_em485 *em485 = p->em485;

        /* UART_IER is shared with the console, so the port lock is required. */
        lockdep_assert_held_once(&p->port.lock);

        stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC;

        /*
         * rs485_stop_tx() sets RTS according to the configuration and
         * flushes the RX FIFO if required.  With no delay it is called
         * directly; otherwise stop_tx_timer is armed to defer the stop.
         */
        if (!stop_delay) {
                p->rs485_stop_tx(p, true);
                em485->active_timer = NULL;
                em485->tx_stopped = true;
                return;
        }

        em485->active_timer = &em485->stop_tx_timer;
        hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL);
}

/*
 * Stop transmitting: clear THRI and, if it was set, drop the TX runtime
 * PM reference.  With rs485 emulation the stop is only carried out once
 * the transmitter has drained (see below).
 */
static inline void __stop_tx(struct uart_8250_port *p)
{
        if (p->em485) {
                u64 stop_delay = 0;
                u16 lsr = serial_lsr_in(p);

                if (!(lsr & UART_LSR_THRE))
                        return;

                /*
                 * For correct timing and to let the FIFO drain,
                 * __stop_tx_rs485() may only run once both the FIFO and
                 * the shift register are empty.  A driver either enables
                 * an interrupt on TEMT or sets UART_CAP_NOTEMT, in which
                 * case stop_tx_timer is extended by the tx time of one
                 * frame to cover the emptying of the shift register.
                 */
                if (!(lsr & UART_LSR_TEMT)) {
                        if (!(p->capabilities & UART_CAP_NOTEMT))
                                return;
                        /*
                         * With the plain frame timing formula RTS might
                         * be deasserted too early: THRE seems to get
                         * asserted already while the stop bit is being
                         * sent rather than after it.  The "/ 7" is a
                         * rough estimate of one extra bit.
                         */
                        stop_delay = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7);
                }

                __stop_tx_rs485(p, stop_delay);
        }

        if (!serial8250_clear_THRI(p))
                return;

        serial8250_rpm_put_tx(p);
}

/* Stop transmission on @port; PORT_16C950 also gets its transmitter disabled. */
static void serial8250_stop_tx(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        guard(serial8250_rpm)(u8250);
        __stop_tx(u8250);

        if (port->type != PORT_16C950)
                return;

        /* We really want to stop the transmitter from sending. */
        u8250->acr |= UART_ACR_TXDIS;
        serial_icr_write(u8250, UART_ACR, u8250->acr);
}

/*
 * Start transmitting on @port: try DMA first, otherwise enable THRI, and
 * re-enable a PORT_16C950 transmitter that was disabled earlier.
 */
static inline void __start_tx(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        /* With DMA present, a zero return from tx_dma() ends the job here. */
        if (u8250->dma && !u8250->dma->tx_dma(u8250))
                return;

        /*
         * If THRI was newly set on a UART_BUG_TXEN part, push characters
         * out by hand when the LSR already reports THRE.
         */
        if (serial8250_set_THRI(u8250) &&
            (u8250->bugs & UART_BUG_TXEN) &&
            (serial_lsr_in(u8250) & UART_LSR_THRE))
                serial8250_tx_chars(u8250);

        if (port->type != PORT_16C950 || !(u8250->acr & UART_ACR_TXDIS))
                return;

        /* Re-enable the transmitter if we disabled it. */
        u8250->acr &= ~UART_ACR_TXDIS;
        serial_icr_write(u8250, UART_ACR, u8250->acr);
}

/**
 * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback
 * @up: uart 8250 port
 * @toggle_ier: true to allow disabling receive interrupts
 *
 * Generic callback usable by 8250 uart drivers to start rs485 transmission.
 * It assumes that setting the RTS bit in the MCR register drives RTS high
 * (some chips use inverse semantics).  It further assumes that reception
 * can be stopped by disabling the UART_IER_RDI interrupt (some chips set
 * the UART_LSR_DR bit even with UART_IER_RDI disabled, which defeats this
 * approach).
 */
void serial8250_em485_start_tx(struct uart_8250_port *up, bool toggle_ier)
{
        unsigned char mcr = serial8250_in_MCR(up);

        /* Stop receiving unless RX during TX is wanted. */
        if (toggle_ier && !(up->port.rs485.flags & SER_RS485_RX_DURING_TX))
                serial8250_stop_rx(&up->port);

        mcr = (up->port.rs485.flags & SER_RS485_RTS_ON_SEND) ?
                (mcr | UART_MCR_RTS) : (mcr & ~UART_MCR_RTS);
        serial8250_out_MCR(up, mcr);
}
EXPORT_SYMBOL_GPL(serial8250_em485_start_tx);

/*
 * Prepare the rs485 emulation for transmitting.
 *
 * Returns false if start_tx_timer was set up to defer the TX start, true
 * if the caller may start transmitting right away.
 */
static bool start_tx_rs485(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        struct uart_8250_em485 *em485 = up->em485;

        /*
         * serial8250_em485_handle_stop_tx() is a noop when
         * em485->active_timer != &em485->stop_tx_timer.  Still, the timer
         * could remain armed and fire only after the current bunch of
         * chars has been sent, when em485->active_timer points at
         * &em485->stop_tx_timer again.  So cancel the timer.  A
         * theoretical race remains if the timer is already running and
         * only gets to check em485->active_timer after
         * &em485->stop_tx_timer has been armed again.
         */
        if (em485->active_timer == &em485->stop_tx_timer)
                hrtimer_try_to_cancel(&em485->stop_tx_timer);

        em485->active_timer = NULL;

        /* Transmitter was not stopped: no start sequence needed. */
        if (!em485->tx_stopped)
                return true;

        em485->tx_stopped = false;
        up->rs485_start_tx(up, true);

        if (!up->port.rs485.delay_rts_before_send)
                return true;

        /* Defer the actual TX start by delay_rts_before_send milliseconds. */
        em485->active_timer = &em485->start_tx_timer;
        start_hrtimer_ms(&em485->start_tx_timer,
                         up->port.rs485.delay_rts_before_send);

        return false;
}

/*
 * hrtimer callback for em485->start_tx_timer: starts the deferred
 * transmission if this timer is still the active one.
 */
static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t)
{
        struct uart_8250_em485 *em485 =
                container_of(t, struct uart_8250_em485, start_tx_timer);
        struct uart_8250_port *p = em485->port;

        guard(uart_port_lock_irqsave)(&p->port);

        /* The start timer is no longer the active one: nothing to do. */
        if (em485->active_timer != &em485->start_tx_timer)
                return HRTIMER_NORESTART;

        __start_tx(&p->port);
        em485->active_timer = NULL;

        return HRTIMER_NORESTART;
}

/*
 * Start transmitting if there is an x_char or queued data to send.  With
 * rs485 emulation the start may be deferred to start_tx_timer.
 */
static void serial8250_start_tx(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);
        struct uart_8250_em485 *em485 = u8250->em485;

        /* UART_IER is shared with the console, so the port lock is required. */
        lockdep_assert_held_once(&port->lock);

        /* Nothing to send. */
        if (!port->x_char && kfifo_is_empty(&port->state->port.xmit_fifo))
                return;

        serial8250_rpm_get_tx(u8250);

        /*
         * rs485: a pending start timer, or one that start_tx_rs485() has
         * just armed, will call __start_tx() later.
         */
        if (em485 &&
            (em485->active_timer == &em485->start_tx_timer ||
             !start_tx_rs485(port)))
                return;

        __start_tx(port);
}

/* Forward the throttle request to the port's own ->throttle() hook. */
static void serial8250_throttle(struct uart_port *port)
{
        (*port->throttle)(port);
}

/* Forward the unthrottle request to the port's own ->unthrottle() hook. */
static void serial8250_unthrottle(struct uart_port *port)
{
        (*port->unthrottle)(port);
}

/* Disable modem status interrupts (UART_IER_MSI) on @port. */
static void serial8250_disable_ms(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        /* UART_IER is shared with the console, so the port lock is required. */
        lockdep_assert_held_once(&port->lock);

        /* Ports flagged UART_BUG_NOMSR have no MSR capabilities. */
        if (u8250->bugs & UART_BUG_NOMSR)
                return;

        mctrl_gpio_disable_ms_no_sync(u8250->gpios);

        u8250->ier &= ~UART_IER_MSI;
        serial_port_out(port, UART_IER, u8250->ier);
}

/* Enable modem status interrupts (UART_IER_MSI) on @port. */
static void serial8250_enable_ms(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        /* UART_IER is shared with the console, so the port lock is required. */
        lockdep_assert_held_once(&port->lock);

        /* Ports flagged UART_BUG_NOMSR have no MSR capabilities. */
        if (u8250->bugs & UART_BUG_NOMSR)
                return;

        mctrl_gpio_enable_ms(u8250->gpios);

        u8250->ier |= UART_IER_MSI;

        /* The runtime PM reference covers only the register write. */
        guard(serial8250_rpm)(u8250);
        serial_port_out(port, UART_IER, u8250->ier);
}

/*
 * Read one character from the UART, update the port statistics for the
 * given @lsr value and pass the character on via uart_insert_char().
 */
void serial8250_read_char(struct uart_8250_port *up, u16 lsr)
{
        struct uart_port *port = &up->port;
        u8 flag = TTY_NORMAL;
        u8 ch;

        /*
         * Intel 82571 has a Serial Over Lan device that sets UART_LSR_BI
         * without UART_LSR_DR when it receives a break.  To avoid reading
         * the receive buffer without UART_LSR_DR set, the character is
         * simply forced to 0 in that case.
         */
        ch = likely(lsr & UART_LSR_DR) ? serial_in(up, UART_RX) : 0;

        port->icount.rx++;

        /* Fold in the LSR flags saved earlier and consume them. */
        lsr |= up->lsr_saved_flags;
        up->lsr_saved_flags = 0;

        if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) {
                if (lsr & UART_LSR_BI) {
                        lsr &= ~(UART_LSR_FE | UART_LSR_PE);
                        port->icount.brk++;
                        /*
                         * SysRQ and SAK checking is done here because
                         * otherwise the break may get masked by
                         * ignore_status_mask or read_status_mask.
                         */
                        if (uart_handle_break(port))
                                return;
                } else if (lsr & UART_LSR_PE) {
                        port->icount.parity++;
                } else if (lsr & UART_LSR_FE) {
                        port->icount.frame++;
                }

                if (lsr & UART_LSR_OE)
                        port->icount.overrun++;

                /* Mask off conditions which should be ignored. */
                lsr &= port->read_status_mask;

                if (lsr & UART_LSR_BI) {
                        dev_dbg(port->dev, "handling break\n");
                        flag = TTY_BREAK;
                } else if (lsr & UART_LSR_PE) {
                        flag = TTY_PARITY;
                } else if (lsr & UART_LSR_FE) {
                        flag = TTY_FRAME;
                }
        }

        if (uart_prepare_sysrq_char(port, ch))
                return;

        uart_insert_char(port, lsr, UART_LSR_OE, ch, flag);
}
EXPORT_SYMBOL_GPL(serial8250_read_char);

/*
 * serial8250_rx_chars - Read characters. The first LSR value must be passed in.
 *
 * At most 256 characters are read per call.
 *
 * Returns LSR bits. The caller should rely only on non-Rx related LSR bits
 * (such as THRE) because the LSR value might come from an already consumed
 * character.
 */
u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr)
{
        struct uart_port *port = &up->port;
        int budget = 256;

        for (;;) {
                serial8250_read_char(up, lsr);

                if (--budget == 0)
                        break;

                lsr = serial_in(up, UART_LSR);
                if (!(lsr & (UART_LSR_DR | UART_LSR_BI)))
                        break;
        }

        tty_flip_buffer_push(&port->state->port);

        return lsr;
}
EXPORT_SYMBOL_GPL(serial8250_rx_chars);

/*
 * Move up to up->tx_loadsz characters from the transmit kfifo to the
 * UART.  A pending x_char is sent first and on its own.
 */
void serial8250_tx_chars(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        struct tty_port *tport = &port->state->port;
        int budget;

        if (port->x_char) {
                uart_xchar_out(port, UART_TX);
                return;
        }

        if (uart_tx_stopped(port)) {
                serial8250_stop_tx(port);
                return;
        }

        if (kfifo_is_empty(&tport->xmit_fifo)) {
                __stop_tx(up);
                return;
        }

        budget = up->tx_loadsz;
        for (;;) {
                unsigned char c;

                if (!uart_fifo_get(port, &c))
                        break;

                serial_out(up, UART_TX, c);

                /*
                 * The Aspeed BMC virtual UARTs have a bug where data may
                 * get stuck in the BMC's Tx FIFO from bursts of writes on
                 * the APB interface.
                 *
                 * Separate back-to-back writes by a read cycle so the
                 * VUART does not stall.  The register read has no
                 * side-effects and its result is discarded.
                 */
                if (up->bugs & UART_BUG_TXRACE)
                        serial_in(up, UART_SCR);

                if ((up->capabilities & UART_CAP_HFIFO) &&
                    !uart_lsr_tx_empty(serial_in(up, UART_LSR)))
                        break;

                /* The BCM2835 MINI UART THRE bit is really a not-full bit. */
                if ((up->capabilities & UART_CAP_MINI) &&
                    !(serial_in(up, UART_LSR) & UART_LSR_THRE))
                        break;

                if (--budget <= 0)
                        break;
        }

        if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
                uart_write_wakeup(port);

        /*
         * With RPM enabled, the HW can only go idle once the FIFO is
         * empty.  In that case we come here once more with an empty FIFO
         * and the interrupt and RPM are disabled in __stop_tx() then.
         */
        if (!(up->capabilities & UART_CAP_RPM) &&
            kfifo_is_empty(&tport->xmit_fifo))
                __stop_tx(up);
}
EXPORT_SYMBOL_GPL(serial8250_tx_chars);

/*
 * Read the MSR, merge in the saved MSR flags and, when modem status
 * interrupts are enabled, account for the reported line changes.
 * Returns the merged MSR value.
 *
 * Caller holds uart port lock.
 */
unsigned int serial8250_modem_status(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        unsigned int msr = serial_in(up, UART_MSR);

        msr |= up->msr_saved_flags;
        up->msr_saved_flags = 0;

        /* No delta bits, MSI disabled or no port state: just report. */
        if (!(msr & UART_MSR_ANY_DELTA) || !(up->ier & UART_IER_MSI) ||
            port->state == NULL)
                return msr;

        if (msr & UART_MSR_TERI)
                port->icount.rng++;
        if (msr & UART_MSR_DDSR)
                port->icount.dsr++;
        if (msr & UART_MSR_DDCD)
                uart_handle_dcd_change(port, msr & UART_MSR_DCD);
        if (msr & UART_MSR_DCTS)
                uart_handle_cts_change(port, msr & UART_MSR_CTS);

        wake_up_interruptible(&port->state->port.delta_msr_wait);

        return msr;
}
EXPORT_SYMBOL_GPL(serial8250_modem_status);

/*
 * Decide from the interrupt cause in @iir how a DMA-capable port handles
 * pending RX data.  The return value is what serial8250_handle_irq()
 * uses to decide whether to read the characters itself.
 */
static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
{
        unsigned int cause = iir & 0x3f;

        /*
         * Postpone the DMA-or-not decision to IIR_RDI or IIR_RX_TIMEOUT,
         * because an informed decision is impossible with IIR_THRI.
         *
         * This also fixes one known DMA Rx corruption issue where DR is
         * asserted but DMA Rx only gets a corrupted zero byte (too early
         * DR?).
         */
        if (cause == UART_IIR_THRI)
                return false;

        /* RDI only leads to a flush while an RX DMA is already running. */
        if ((cause == UART_IIR_RDI && up->dma->rx_running) ||
            cause == UART_IIR_RLSI ||
            cause == UART_IIR_RX_TIMEOUT) {
                serial8250_rx_dma_flush(up);
                return true;
        }

        return up->dma->rx_dma(up);
}

/*
 * This handles the interrupt from one port.
 *
 * Returns 0 if @iir reports that no interrupt is pending, 1 otherwise.
 */
int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        struct tty_port *tport = &port->state->port;
        unsigned long flags;
        bool skip_rx;
        u16 status;

        if (iir & UART_IIR_NO_INT)
                return 0;

        uart_port_lock_irqsave(port, &flags);

        status = serial_lsr_in(up);

        /*
         * If the port is stopped and the FIFO holds no error conditions,
         * do not drain the FIFO, as that may overflow the TTY buffer.
         * Leaving the RX FIFO unserviced lets auto HW flow control kick in
         * once the FIFO occupancy reaches the preset threshold, which
         * halts RX.  This only works when auto HW flow control is
         * available.
         */
        skip_rx = !(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) &&
                  (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) &&
                  !(up->ier & (UART_IER_RLSI | UART_IER_RDI));

        if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) {
                struct irq_data *d = irq_get_irq_data(port->irq);

                if (d && irqd_is_wakeup_set(d))
                        pm_wakeup_event(tport->tty->dev, 0);

                /* Read by PIO when there is no DMA or DMA asks for it. */
                if (!up->dma || handle_rx_dma(up, iir))
                        status = serial8250_rx_chars(up, status);
        }

        serial8250_modem_status(up);

        if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) {
                if (!up->dma || up->dma->tx_err)
                        serial8250_tx_chars(up);
                else if (!up->dma->tx_running)
                        __stop_tx(up);
        }

        uart_unlock_and_check_sysrq_irqrestore(port, flags);

        return 1;
}
EXPORT_SYMBOL_GPL(serial8250_handle_irq);

/*
 * Default interrupt entry point: read the IIR under a runtime-PM guard and
 * pass it to serial8250_handle_irq(), whose result is returned.
 */
static int serial8250_default_handle_irq(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        guard(serial8250_rpm)(up);

        return serial8250_handle_irq(port, serial_port_in(port, UART_IIR));
}

/*
 * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP
 * have a programmable TX threshold that triggers the THRE interrupt in
 * the IIR register. In this case, the THRE interrupt indicates the FIFO
 * has space available. Load it up with tx_loadsz bytes.
 *
 * The IIR is then read again and handed to the regular handler, whose
 * result is returned.
 */
static int serial8250_tx_threshold_handle_irq(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        /* TX Threshold IRQ triggered so load up FIFO */
        if ((serial_port_in(port, UART_IIR) & UART_IIR_ID) == UART_IIR_THRI) {
                /* The port lock is held only for the duration of this block. */
                guard(uart_port_lock_irqsave)(port);
                serial8250_tx_chars(up);
        }

        return serial8250_handle_irq(port, serial_port_in(port, UART_IIR));
}

/*
 * Report whether the transmitter is idle.
 *
 * Returns TIOCSER_TEMT when no TX DMA is running and the LSR says the
 * transmitter is empty, 0 otherwise.
 */
static unsigned int serial8250_tx_empty(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        guard(serial8250_rpm)(up);
        guard(uart_port_lock_irqsave)(port);

        /* The LSR is only read when DMA is not busy. */
        if (serial8250_tx_dma_running(up))
                return 0;
        if (!uart_lsr_tx_empty(serial_lsr_in(up)))
                return 0;

        return TIOCSER_TEMT;
}

unsigned int serial8250_do_get_mctrl(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned int status;
        unsigned int val;

        scoped_guard(serial8250_rpm, up)
                status = serial8250_modem_status(up);

        val = serial8250_MSR_to_TIOCM(status);
        if (up->gpios)
                return mctrl_gpio_get(up->gpios, &val);

        return val;
}
EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl);

/* Use the port's own get_mctrl() hook if it has one, the generic 8250 code otherwise. */
static unsigned int serial8250_get_mctrl(struct uart_port *port)
{
        return port->get_mctrl ? port->get_mctrl(port) :
                                 serial8250_do_get_mctrl(port);
}

/*
 * Program the modem control register: the TIOCM_* bits in @mctrl are
 * translated to MCR bits and combined with the bits kept in @up->mcr.
 */
void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned char mcr = serial8250_TIOCM_to_MCR(mctrl);

        serial8250_out_MCR(up, mcr | up->mcr);
}
EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl);

/*
 * Set the modem control lines via the port's set_mctrl() hook or the generic
 * 8250 code. Nothing is done while RS485 mode is enabled.
 */
static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
{
        if (port->rs485.flags & SER_RS485_ENABLED)
                return;

        if (!port->set_mctrl) {
                serial8250_do_set_mctrl(port, mctrl);
                return;
        }

        port->set_mctrl(port, mctrl);
}

/*
 * Start (@break_state == -1) or stop (any other value) sending a break by
 * updating the SBC bit in the LCR shadow and writing it to the hardware.
 */
static void serial8250_break_ctl(struct uart_port *port, int break_state)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        guard(serial8250_rpm)(up);
        guard(uart_port_lock_irqsave)(port);

        if (break_state != -1)
                up->lcr &= ~UART_LCR_SBC;
        else
                up->lcr |= UART_LCR_SBC;

        serial_port_out(port, UART_LCR, up->lcr);
}

/*
 * Poll the LSR, in 1us steps, until all of @bits are set or the timeout
 * expires.
 *
 * The timeout is two frame times (converted from ns to us) or, if
 * @frame_time is not available, a safe default of 10000 polls.
 *
 * Returns true if @bits were set, false on timeout.
 */
static bool wait_for_lsr(struct uart_8250_port *up, int bits)
{
        unsigned int status, tmout;

        /*
         * Wait for a character to be sent. Fallback to a safe default
         * timeout value if @frame_time is not available.
         */
        if (up->port.frame_time)
                tmout = up->port.frame_time * 2 / NSEC_PER_USEC;
        else
                tmout = 10000;

        /*
         * A non-zero frame time below 500ns rounds down to a zero timeout.
         * The pre-decrement below would then wrap around and the loop would
         * spin for roughly UINT_MAX microseconds. Poll at least once instead.
         */
        if (!tmout)
                tmout = 1;

        for (;;) {
                status = serial_lsr_in(up);

                if ((status & bits) == bits)
                        break;
                if (--tmout == 0)
                        break;
                udelay(1);
                touch_nmi_watchdog();
        }

        /* tmout only reaches zero on the timeout path. */
        return (tmout != 0);
}

/*
 * Wait, with timeout, for @bits to be set in the LSR. For ports with
 * UPF_CONS_FLOW additionally wait up to 1s for CTS, remembering the MSR
 * flags seen while polling in @up->msr_saved_flags.
 */
static void wait_for_xmitr(struct uart_8250_port *up, int bits)
{
        unsigned int budget = 1000000;

        wait_for_lsr(up, bits);

        if (!(up->port.flags & UPF_CONS_FLOW))
                return;

        /* Wait up to 1s for flow control if necessary */
        while (budget) {
                unsigned int msr = serial_in(up, UART_MSR);

                up->msr_saved_flags |= msr & MSR_SAVE_FLAGS;
                if (msr & UART_MSR_CTS)
                        break;
                udelay(1);
                touch_nmi_watchdog();
                budget--;
        }
}

#ifdef CONFIG_CONSOLE_POLL
/*
 * Console polling routines for writing and reading from the uart while
 * in an interrupt or debug context.
 */

/*
 * Polled receive: returns the character from the RX register if the LSR
 * reports data ready, NO_POLL_CHAR otherwise.
 */
static int serial8250_get_poll_char(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        guard(serial8250_rpm)(up);

        if (serial_port_in(port, UART_LSR) & UART_LSR_DR)
                return serial_port_in(port, UART_RX);

        return NO_POLL_CHAR;
}


/*
 * Polled transmit of a single character @c with the UART interrupts masked
 * for the duration of the transfer.
 */
static void serial8250_put_poll_char(struct uart_port *port,
                         unsigned char c)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned int saved_ier;

        /*
         * Normally the port is locked to synchronize UART_IER access
         * against the console. However, this function is only used by
         * KDB/KGDB, where it may not be possible to acquire the port
         * lock because all other CPUs are quiesced. The quiescence
         * should allow safe lockless usage here.
         */

        guard(serial8250_rpm)(up);

        /* Remember the current IER, then mask the interrupts. */
        saved_ier = serial_port_in(port, UART_IER);
        serial8250_clear_IER(up);

        /* Wait for the transmitter to drain, then send the character. */
        wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
        serial_port_out(port, UART_TX, c);

        /* Wait for it to go out before restoring the IER. */
        wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
        serial_port_out(port, UART_IER, saved_ier);
}

#endif /* CONFIG_CONSOLE_POLL */

/*
 * Perform the type-specific wake-up/reset steps needed by some UARTs at
 * startup. Port types not listed in the switch need nothing here.
 */
static void serial8250_startup_special(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        switch (port->type) {
        case PORT_16C950: {
                /*
                 * Wake up and initialize UART
                 *
                 * Synchronize UART_IER access against the console.
                 */
                guard(uart_port_lock_irqsave)(port);
                up->acr = 0;
                /* EFR is written with the LCR in configuration mode B. */
                serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
                serial_port_out(port, UART_EFR, UART_EFR_ECB);
                serial_port_out(port, UART_IER, 0);
                serial_port_out(port, UART_LCR, 0);
                serial_icr_write(up, UART_CSR, 0); /* Reset the UART */
                /* Set the ECB bit again after the reset. */
                serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
                serial_port_out(port, UART_EFR, UART_EFR_ECB);
                serial_port_out(port, UART_LCR, 0);
                break;
        }
        case PORT_DA830:
                /*
                 * Reset the port
                 *
                 * Synchronize UART_IER access against the console.
                 */
                scoped_guard(uart_port_lock_irqsave, port) {
                        serial_port_out(port, UART_IER, 0);
                        serial_port_out(port, UART_DA830_PWREMU_MGMT, 0);
                }
                /* The delay runs with the port lock already released. */
                mdelay(10);

                /* Enable Tx, Rx and free run mode */
                serial_port_out(port, UART_DA830_PWREMU_MGMT,
                                UART_DA830_PWREMU_MGMT_UTRST |
                                UART_DA830_PWREMU_MGMT_URRST |
                                UART_DA830_PWREMU_MGMT_FREE);
                break;
        case PORT_RSA:
                rsa_enable(up);
                break;
        }
}

/*
 * Program the FIFO trigger levels for the UART types that need it at
 * startup. For the Altera 16550 variants this also switches the port's
 * handle_irq hook to the TX-threshold aware handler.
 */
static void serial8250_set_TRG_levels(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        switch (port->type) {
        /* For a XR16C850, we need to set the trigger levels */
        case PORT_16850: {
                u8 fctr;

                serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);

                /* Select the RX and then the TX trigger table, setting each level to 96. */
                fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX);
                fctr |= UART_FCTR_TRGD;
                serial_port_out(port, UART_FCTR, fctr | UART_FCTR_RX);
                serial_port_out(port, UART_TRG, UART_TRG_96);
                serial_port_out(port, UART_FCTR, fctr | UART_FCTR_TX);
                serial_port_out(port, UART_TRG, UART_TRG_96);

                serial_port_out(port, UART_LCR, 0);
                break;
        }
        /* For the Altera 16550 variants, set TX threshold trigger level. */
        case PORT_ALTR_16550_F32:
        case PORT_ALTR_16550_F64:
        case PORT_ALTR_16550_F128:
                if (port->fifosize <= 1)
                        return;

                /*
                 * Bounds checking of TX threshold (valid 0 to fifosize-2): the threshold
                 * written below is fifosize - tx_loadsz, so tx_loadsz must be within
                 * 2..fifosize.
                 */
                if (up->tx_loadsz < 2 || up->tx_loadsz > port->fifosize) {
                        dev_err(port->dev, "TX FIFO Threshold errors, skipping\n");
                        return;
                }
                serial_port_out(port, UART_ALTR_AFR, UART_ALTR_EN_TXFIFO_LW);
                serial_port_out(port, UART_ALTR_TX_LOW, port->fifosize - up->tx_loadsz);
                port->handle_irq = serial8250_tx_threshold_handle_irq;
                break;
        }
}

/*
 * Detect UARTs that fail to re-assert the THRE interrupt and flag them with
 * UART_BUG_THRE. The test is skipped for ports without an IRQ or with
 * UPF_NO_THRE_TEST set.
 */
static void serial8250_THRE_test(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        bool iir_noint1, iir_noint2;

        if (!port->irq)
                return;

        if (up->port.flags & UPF_NO_THRE_TEST)
                return;

        /* Keep a shared IRQ line disabled while the test toggles the IER. */
        if (port->irqflags & IRQF_SHARED)
                disable_irq_nosync(port->irq);

        /*
         * Test for UARTs that do not reassert THRE when the transmitter is idle and the interrupt
         * has already been cleared.  Real 16550s should always reassert this interrupt whenever the
         * transmitter is idle and the interrupt is enabled.  Delays are necessary to allow register
         * changes to become visible.
         *
         * Synchronize UART_IER access against the console.
         */
        scoped_guard(uart_port_lock_irqsave, port) {
                wait_for_xmitr(up, UART_LSR_THRE);
                serial_port_out_sync(port, UART_IER, UART_IER_THRI);
                udelay(1); /* allow THRE to set */
                iir_noint1 = serial_port_in(port, UART_IIR) & UART_IIR_NO_INT;
                serial_port_out(port, UART_IER, 0);
                serial_port_out_sync(port, UART_IER, UART_IER_THRI);
                udelay(1); /* allow a working UART time to re-assert THRE */
                iir_noint2 = serial_port_in(port, UART_IIR) & UART_IIR_NO_INT;
                serial_port_out(port, UART_IER, 0);
        }

        if (port->irqflags & IRQF_SHARED)
                enable_irq(port->irq);

        /*
         * If the interrupt is not reasserted (pending on the first read, not pending on the
         * second), or we otherwise don't trust the iir, setup a timer to kick the UART on a
         * regular basis.
         */
        if ((!iir_noint1 && iir_noint2) || up->port.flags & UPF_BUG_THRE)
                up->bugs |= UART_BUG_THRE;
}

static void serial8250_init_mctrl(struct uart_port *port)
{
        if (port->flags & UPF_FOURPORT) {
                if (!port->irq)
                        port->mctrl |= TIOCM_OUT1;
        } else {
                /* Most PC uarts need OUT2 raised to enable interrupts. */
                if (port->irq)
                        port->mctrl |= TIOCM_OUT2;
        }

        serial8250_set_mctrl(port, port->mctrl);
}

/*
 * Check whether enabling the TX interrupt on an idle transmitter is
 * reflected in the IIR and set or clear UART_BUG_TXEN accordingly. Ports
 * with UPQ_NO_TXEN_TEST are left untouched.
 */
static void serial8250_iir_txen_test(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        bool tx_idle, no_irq_pending;

        /*
         * Serial over Lan (SoL) hack:
         * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be used for Serial Over
         * Lan.  Those chips take a longer time than a normal serial device to signal that
         * transmission data was queued. Due to that, the test below generally fails. One solution
         * would be to delay the reading of iir. However, this is not reliable, since the timeout is
         * variable. So, in case of UPQ_NO_TXEN_TEST, simply don't test whether we receive a TX irq.
         * This way, we'll never enable UART_BUG_TXEN.
         */
        if (port->quirks & UPQ_NO_TXEN_TEST)
                return;

        /* Do a quick test to see if we receive an interrupt when we enable the TX irq. */
        serial_port_out(port, UART_IER, UART_IER_THRI);
        tx_idle = serial_port_in(port, UART_LSR) & UART_LSR_TEMT;
        no_irq_pending = serial_port_in(port, UART_IIR) & UART_IIR_NO_INT;
        serial_port_out(port, UART_IER, 0);

        if (!(tx_idle && no_irq_pending)) {
                /* FIXME: why is this needed? */
                up->bugs &= ~UART_BUG_TXEN;
                return;
        }

        /* Only announce the workaround when it is newly enabled. */
        if (up->bugs & UART_BUG_TXEN)
                return;

        up->bugs |= UART_BUG_TXEN;
        dev_dbg(port->dev, "enabling bad tx status workarounds\n");
}

/*
 * With the port lock held: set the LCR to 8-bit words, set up the modem
 * control lines and run the TX-enable interrupt test.
 */
static void serial8250_initialize(struct uart_port *port)
{
        /* The lock is held until the function returns. */
        guard(uart_port_lock_irqsave)(port);
        serial_port_out(port, UART_LCR, UART_LCR_WLEN8);

        serial8250_init_mctrl(port);
        serial8250_iir_txen_test(port);
}

/*
 * Generic 8250 port startup: fill in defaults from uart_config[], reset the
 * UART, set up the IRQ and timer, run the THRE/TXEN quirk tests and request
 * DMA if configured.
 *
 * Returns 0 on success, -ENODEV if the LSR reads back 0xff on a port not
 * marked UPF_BUGGY_UART, or the error returned by the setup_irq() op.
 */
int serial8250_do_startup(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        int retval;

        /* Values left at zero are taken from the per-type configuration. */
        if (!port->fifosize)
                port->fifosize = uart_config[port->type].fifo_size;
        if (!up->tx_loadsz)
                up->tx_loadsz = uart_config[port->type].tx_loadsz;
        if (!up->capabilities)
                up->capabilities = uart_config[port->type].flags;
        up->mcr = 0;

        if (port->iotype != up->cur_iotype)
                set_io_from_upio(port);

        /* Held until the function returns, on every return path. */
        guard(serial8250_rpm)(up);

        serial8250_startup_special(port);

        /*
         * Clear the FIFO buffers and disable them.
         * (they will be reenabled in set_termios())
         */
        serial8250_clear_fifos(up);

        serial8250_clear_interrupts(port);

        /*
         * At this point, there's no way the LSR could still be 0xff;
         * if it is, then bail out, because there's likely no UART
         * here.
         */
        if (!(port->flags & UPF_BUGGY_UART) &&
            (serial_port_in(port, UART_LSR) == 0xff)) {
                dev_info_ratelimited(port->dev, "LSR safety check engaged!\n");
                return -ENODEV;
        }

        serial8250_set_TRG_levels(port);

        /* Check if we need to have shared IRQs */
        if (port->irq && (up->port.flags & UPF_SHARE_IRQ))
                up->port.irqflags |= IRQF_SHARED;

        retval = up->ops->setup_irq(up);
        if (retval)
                return retval;

        serial8250_THRE_test(port);

        up->ops->setup_timer(up);

        serial8250_initialize(port);

        /*
         * Clear the interrupt registers again for luck, and clear the
         * saved flags to avoid getting false values from polling
         * routines or the previous session.
         */
        serial8250_clear_interrupts(port);
        up->lsr_saved_flags = 0;
        up->msr_saved_flags = 0;

        /*
         * Request DMA channels for both RX and TX. DMA is dropped (with a
         * warning) for a console port or when the request fails.
         */
        if (up->dma) {
                const char *msg = NULL;

                if (uart_console(port))
                        msg = "forbid DMA for kernel console";
                else if (serial8250_request_dma(up))
                        msg = "failed to request DMA";
                if (msg) {
                        dev_warn_ratelimited(port->dev, "%s\n", msg);
                        up->dma = NULL;
                }
        }

        /*
         * Set the IER shadow for rx interrupts but defer actual interrupt
         * enable until after the FIFOs are enabled; otherwise, an already-
         * active sender can swamp the interrupt handler with "too much work".
         */
        up->ier = UART_IER_RLSI | UART_IER_RDI;

        if (port->flags & UPF_FOURPORT) {
                unsigned int icp;
                /*
                 * Enable interrupts on the AST Fourport board
                 */
                icp = (port->iobase & 0xfe0) | 0x01f;
                outb_p(0x80, icp);
                inb_p(icp);
        }

        return 0;
}
EXPORT_SYMBOL_GPL(serial8250_do_startup);

/* Use the port's own startup() hook if it has one, the generic 8250 code otherwise. */
static int serial8250_startup(struct uart_port *port)
{
        return port->startup ? port->startup(port) :
                               serial8250_do_startup(port);
}

/*
 * Generic 8250 port shutdown: mask the UART interrupts, wait for a running
 * handler to finish, release DMA, drop OUT2 (or raise OUT1 on Fourport
 * boards), clear break and FIFOs and finally release the IRQ.
 */
void serial8250_do_shutdown(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        serial8250_rpm_get(up);
        /*
         * Disable interrupts from this port
         *
         * Synchronize UART_IER access against the console.
         */
        scoped_guard(uart_port_lock_irqsave, port) {
                up->ier = 0;
                serial_port_out(port, UART_IER, 0);
        }

        /* Called with the port lock released. */
        synchronize_irq(port->irq);

        if (up->dma)
                serial8250_release_dma(up);

        scoped_guard(uart_port_lock_irqsave, port) {
                if (port->flags & UPF_FOURPORT) {
                        /* reset interrupts on the AST Fourport board */
                        inb((port->iobase & 0xfe0) | 0x1f);
                        port->mctrl |= TIOCM_OUT1;
                } else
                        port->mctrl &= ~TIOCM_OUT2;

                serial8250_set_mctrl(port, port->mctrl);
        }

        /*
         * Disable break condition and FIFOs
         */
        serial_port_out(port, UART_LCR,
                        serial_port_in(port, UART_LCR) & ~UART_LCR_SBC);
        serial8250_clear_fifos(up);

        rsa_disable(up);

        /*
         * Read data port to reset things, and then unlink from
         * the IRQ chain.
         */
        serial_port_in(port, UART_RX);
        /* The runtime-PM reference is dropped before the IRQ is released. */
        serial8250_rpm_put(up);

        up->ops->release_irq(up);
}
EXPORT_SYMBOL_GPL(serial8250_do_shutdown);

/* Use the port's own shutdown() hook if it has one, the generic 8250 code otherwise. */
static void serial8250_shutdown(struct uart_port *port)
{
        if (!port->shutdown) {
                serial8250_do_shutdown(port);
                return;
        }

        port->shutdown(port);
}

/* Flush the TX DMA state; ports without DMA have nothing to flush here. */
static void serial8250_flush_buffer(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        if (!up->dma)
                return;

        serial8250_tx_dma_flush(up);
}

static unsigned int serial8250_do_get_divisor(struct uart_port *port, unsigned int baud)
{
        upf_t magic_multiplier = port->flags & UPF_MAGIC_MULTIPLIER;
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned int quot;

        /*
         * Handle magic divisors for baud rates above baud_base on SMSC
         * Super I/O chips.  We clamp custom rates from clk/6 and clk/12
         * up to clk/4 (0x8001) and clk/8 (0x8002) respectively.  These
         * magic divisors actually reprogram the baud rate generator's
         * reference clock derived from chips's 14.318MHz clock input.
         *
         * Documentation claims that with these magic divisors the base
         * frequencies of 7.3728MHz and 3.6864MHz are used respectively
         * for the extra baud rates of 460800bps and 230400bps rather
         * than the usual base frequency of 1.8462MHz.  However empirical
         * evidence contradicts that.
         *
         * Instead bit 7 of the DLM register (bit 15 of the divisor) is
         * effectively used as a clock prescaler selection bit for the
         * base frequency of 7.3728MHz, always used.  If set to 0, then
         * the base frequency is divided by 4 for use by the Baud Rate
         * Generator, for the usual arrangement where the value of 1 of
         * the divisor produces the baud rate of 115200bps.  Conversely,
         * if set to 1 and high-speed operation has been enabled with the
         * Serial Port Mode Register in the Device Configuration Space,
         * then the base frequency is supplied directly to the Baud Rate
         * Generator, so for the divisor values of 0x8001, 0x8002, 0x8003,
         * 0x8004, etc. the respective baud rates produced are 460800bps,
         * 230400bps, 153600bps, 115200bps, etc.
         *
         * In all cases only low 15 bits of the divisor are used to divide
         * the baud base and therefore 32767 is the maximum divisor value
         * possible, even though documentation says that the programmable
         * Baud Rate Generator is capable of dividing the internal PLL
         * clock by any divisor from 1 to 65535.
         */
        if (magic_multiplier && baud >= port->uartclk / 6)
                quot = 0x8001;
        else if (magic_multiplier && baud >= port->uartclk / 12)
                quot = 0x8002;
        else
                quot = uart_get_divisor(port, baud);

        /*
         * Oxford Semi 952 rev B workaround
         */
        if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0)
                quot++;

        return quot;
}

/*
 * Use the port's own get_divisor() hook if it has one; the generic 8250
 * code, which does not touch @frac, otherwise.
 */
static unsigned int serial8250_get_divisor(struct uart_port *port,
                                           unsigned int baud,
                                           unsigned int *frac)
{
        return port->get_divisor ? port->get_divisor(port, baud, frac) :
                                   serial8250_do_get_divisor(port, baud);
}

/*
 * Translate the character size, stop bit and parity settings in @c_cflag
 * into a line control register value.
 */
static unsigned char serial8250_compute_lcr(struct uart_8250_port *up, tcflag_t c_cflag)
{
        u8 lcr = UART_LCR_WLEN(tty_get_char_size(c_cflag));

        lcr |= (c_cflag & CSTOPB) ? UART_LCR_STOP : 0;
        lcr |= (c_cflag & PARENB) ? UART_LCR_PARITY : 0;
        /* The even-parity bit is set whenever PARODD is clear. */
        lcr |= (c_cflag & PARODD) ? 0 : UART_LCR_EPAR;
        lcr |= (c_cflag & CMSPAR) ? UART_LCR_SPAR : 0;

        return lcr;
}

void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
                               unsigned int quot)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        /* Workaround to enable 115200 baud on OMAP1510 internal ports */
        if (is_omap1510_8250(up)) {
                if (baud == 115200) {
                        quot = 1;
                        serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1);
                } else
                        serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0);
        }

        /*
         * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2,
         * otherwise just set DLAB
         */
        if (up->capabilities & UART_NATSEMI)
                serial_port_out(port, UART_LCR, 0xe0);
        else
                serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB);

        serial_dl_write(up, quot);
}
EXPORT_SYMBOL_GPL(serial8250_do_set_divisor);

/*
 * Use the port's own set_divisor() hook if it has one; the generic 8250
 * code, which ignores @quot_frac, otherwise.
 */
static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
                                   unsigned int quot, unsigned int quot_frac)
{
        if (!port->set_divisor) {
                serial8250_do_set_divisor(port, baud, quot);
                return;
        }

        port->set_divisor(port, baud, quot, quot_frac);
}

/*
 * Work out the range of baud rates this port can produce and let the serial
 * core pick the rate requested in @termios within that range.
 */
static unsigned int serial8250_get_baud_rate(struct uart_port *port,
                                             struct ktermios *termios,
                                             const struct ktermios *old)
{
        unsigned int tolerance = port->uartclk / 100;
        unsigned int lowest = port->uartclk / 16 / UART_DIV_MAX;
        unsigned int clk_div = 16;

        /*
         * Handle magic divisors for baud rates above baud_base on SMSC
         * Super I/O chips.  Enable custom rates of clk/4 and clk/8, but
         * disable divisor values beyond 32767, which are unavailable.
         */
        if (port->flags & UPF_MAGIC_MULTIPLIER) {
                lowest >>= 1;
                clk_div = 4;
        }

        /*
         * Ask the core to calculate the divisor for us.
         * Allow 1% tolerance at the upper limit so uart clks marginally
         * slower than nominal still match standard baud rates without
         * causing transmission errors.
         */
        return uart_get_baud_rate(port, termios, old, lowest,
                                  (port->uartclk + tolerance) / clk_div);
}

/*
 * Update the port's reference clock rate to @uartclk and, if the tty port is
 * already initialized, reprogram the UART with the current termios.
 *
 * Note: to avoid a deadlock on the tty port mutex, do not call this function
 * from within the uart port callbacks. It is primarily meant to handle a
 * sudden reference clock rate change.
 */
void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk)
{
        struct tty_port *tport = &port->state->port;

        /* NOTE(review): presumably this body only runs when a tty is attached - confirm. */
        scoped_guard(tty_port_tty, tport) {
                struct tty_struct *tty = scoped_tty();

                /* termios_rwsem is taken before the tty port mutex. */
                guard(rwsem_write)(&tty->termios_rwsem);
                guard(mutex)(&tport->mutex);

                if (port->uartclk == uartclk)
                        return;

                port->uartclk = uartclk;

                if (!tty_port_initialized(tport))
                        return;

                serial8250_do_set_termios(port, &tty->termios, NULL);

                return;
        }
        /* Not reached from the block above, which always returns: just record the new rate. */
        guard(mutex)(&tport->mutex);
        port->uartclk = uartclk;
}
EXPORT_SYMBOL_GPL(serial8250_update_uartclk);

/*
 * For UARTs with UART_CAP_MINI, strip the stop bit and parity flags from
 * @termios and raise 5- and 6-bit character sizes to 7 bits.
 */
static void serial8250_set_mini(struct uart_port *port, struct ktermios *termios)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        if (!(up->capabilities & UART_CAP_MINI))
                return;

        termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR);

        switch (termios->c_cflag & CSIZE) {
        case CS5:
        case CS6:
                termios->c_cflag &= ~CSIZE;
                termios->c_cflag |= CS7;
                break;
        }
}

/*
 * Select RX trigger level 1 in the FCR shadow for baud rates below 2400 on
 * FIFO-capable ports that have a FIFO larger than one byte and do not use
 * DMA.
 */
static void serial8250_set_trigger_for_slow_speed(struct uart_port *port, struct ktermios *termios,
                                                  unsigned int baud)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        if (!(up->capabilities & UART_CAP_FIFO) || port->fifosize <= 1 ||
            baud >= 2400 || up->dma)
                return;

        up->fcr = (up->fcr & ~UART_FCR_TRIGGER_MASK) | UART_FCR_TRIGGER_1;
}

/*
 * MCR-based auto flow control. When AFE is enabled, RTS will be deasserted when the receive FIFO
 * contains more characters than the trigger, or the MCR RTS bit is cleared.
 *
 * On UART_CAP_AFE ports the AFE bit in the MCR shadow follows CRTSCTS in @termios.
 */
static void serial8250_set_afe(struct uart_port *port, struct ktermios *termios)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        if (!(up->capabilities & UART_CAP_AFE))
                return;

        if (termios->c_cflag & CRTSCTS)
                up->mcr |= UART_MCR_AFE;
        else
                up->mcr &= ~UART_MCR_AFE;
}

static void serial8250_set_errors_and_ignores(struct uart_port *port, struct ktermios *termios)
{
        /*
         * Specify which conditions may be considered for error handling and the ignoring of
         * characters. The actual ignoring of characters only occurs if the bit is set in
         * @ignore_status_mask as well.
         */
        port->read_status_mask = UART_LSR_OE | UART_LSR_DR;
        if (termios->c_iflag & INPCK)
                port->read_status_mask |= UART_LSR_FE | UART_LSR_PE;
        if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK))
                port->read_status_mask |= UART_LSR_BI;

        /* Characters to ignore */
        port->ignore_status_mask = 0;
        if (termios->c_iflag & IGNPAR)
                port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE;
        if (termios->c_iflag & IGNBRK) {
                port->ignore_status_mask |= UART_LSR_BI;
                /*
                 * If we're ignoring parity and break indicators, ignore overruns too (for real raw
                 * support).
                 */
                if (termios->c_iflag & IGNPAR)
                        port->ignore_status_mask |= UART_LSR_OE;
        }

        /* ignore all characters if CREAD is not set */
        if ((termios->c_cflag & CREAD) == 0)
                port->ignore_status_mask |= UART_LSR_DR;
}

static void serial8250_set_ier(struct uart_port *port, struct ktermios *termios)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        /* CTS flow control flag and modem status interrupts */
        up->ier &= ~UART_IER_MSI;
        if (!(up->bugs & UART_BUG_NOMSR) && UART_ENABLE_MS(&up->port, termios->c_cflag))
                up->ier |= UART_IER_MSI;
        if (up->capabilities & UART_CAP_UUE)
                up->ier |= UART_IER_UUE;
        if (up->capabilities & UART_CAP_RTOIE)
                up->ier |= UART_IER_RTOIE;

        serial_port_out(port, UART_IER, up->ier);
}

/*
 * On UART_CAP_EFR ports, program the enhanced feature register so that auto
 * CTS follows CRTSCTS in @termios. The LCR is left in configuration mode B
 * on return.
 */
static void serial8250_set_efr(struct uart_port *port, struct ktermios *termios)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        u8 efr_reg;
        u8 efr;

        if (!(up->capabilities & UART_CAP_EFR))
                return;

        /*
         * TI16C752/Startech hardware flow control.  FIXME:
         * - TI16C752 requires control thresholds to be set.
         * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled.
         */
        efr = (termios->c_cflag & CRTSCTS) ? UART_EFR_CTS : 0;

        /* Ports flagged UPF_EXAR_EFR use a different EFR register offset. */
        efr_reg = (port->flags & UPF_EXAR_EFR) ? UART_XR_EFR : UART_EFR;

        serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
        serial_port_out(port, efr_reg, efr);
}

/*
 * Write the cached LCR (up->lcr) and FCR (up->fcr) values to the hardware.
 *
 * On PORT_16750 the FCR is written before LCR is restored; on every other
 * port type it is written afterwards.
 */
static void serial8250_set_fcr(struct uart_port *port, struct ktermios *termios)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        bool is_16750 = port->type == PORT_16750;

        /*
         * 16750: LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR is written without
         * DLAB set, this mode will be disabled. Hence the FCR write comes before the LCR write
         * below (NOTE(review): this relies on the caller leaving DLAB set at this point).
         */
        if (is_16750)
                serial_port_out(port, UART_FCR, up->fcr);

        /* Restore the cached LCR value (presumably without DLAB, i.e. this resets DLAB) */
        serial_port_out(port, UART_LCR, up->lcr);

        if (is_16750)
                return;

        /* emulated UARTs (Lucent Venus 167x) need two steps */
        if (up->fcr & UART_FCR_ENABLE_FIFO)
                serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO);

        serial_port_out(port, UART_FCR, up->fcr);
}

/*
 * Generic 8250 implementation of the set_termios operation.
 *
 * @port:    port to reconfigure
 * @termios: requested settings; the baud rate actually used is encoded back
 *           into it unless the requested rate is B0
 * @old:     previous settings, passed on to serial8250_get_baud_rate()
 *
 * The LCR value, baud rate and divisor are computed first, without the port
 * lock. All hardware programming then happens inside the serial8250_rpm
 * scope with the port lock held and interrupts disabled.
 */
void
serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
                          const struct ktermios *old)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned int baud, quot, frac = 0;
        u8 lcr;

        serial8250_set_mini(port, termios);
        lcr = serial8250_compute_lcr(up, termios->c_cflag);
        baud = serial8250_get_baud_rate(port, termios, old);
        quot = serial8250_get_divisor(port, baud, &frac);

        /*
         * Ok, we're now changing the port state. Do it with interrupts disabled.
         *
         * Synchronize UART_IER access against the console.
         */
        scoped_guard(serial8250_rpm, up) {
                guard(uart_port_lock_irqsave)(port);

                /*
                 * The order of the calls below matters: e.g. serial8250_set_efr()
                 * leaves LCR in configuration mode B and serial8250_set_fcr()
                 * later writes up->lcr back to the LCR register.
                 */
                up->lcr = lcr;
                serial8250_set_trigger_for_slow_speed(port, termios, baud);
                serial8250_set_afe(port, termios);
                uart_update_timeout(port, termios->c_cflag, baud);
                serial8250_set_errors_and_ignores(port, termios);
                serial8250_set_ier(port, termios);
                serial8250_set_efr(port, termios);
                serial8250_set_divisor(port, baud, quot, frac);
                serial8250_set_fcr(port, termios);
                serial8250_set_mctrl(port, port->mctrl);
        }

        /* Don't rewrite B0 */
        if (tty_termios_baud_rate(termios))
                tty_termios_encode_baud_rate(termios, baud, baud);
}
EXPORT_SYMBOL(serial8250_do_set_termios);

/*
 * uart_ops set_termios entry point: use the port's own set_termios hook when
 * one is installed, otherwise fall back to the generic 8250 implementation.
 */
static void
serial8250_set_termios(struct uart_port *port, struct ktermios *termios,
                       const struct ktermios *old)
{
        if (!port->set_termios) {
                serial8250_do_set_termios(port, termios, old);
                return;
        }

        port->set_termios(port, termios, old);
}

/*
 * Generic 8250 implementation of the set_ldisc operation.
 *
 * For the N_PPS line discipline, UPF_HARDPPS_CD is set and modem status
 * interrupts are enabled. For any other line discipline the flag is cleared
 * and modem status interrupts are disabled unless UART_ENABLE_MS() still
 * wants them for the current c_cflag.
 */
void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios)
{
        if (termios->c_line == N_PPS) {
                port->flags |= UPF_HARDPPS_CD;
                /* the guard holds the port lock until the end of this branch */
                guard(uart_port_lock_irq)(port);
                serial8250_enable_ms(port);
        } else {
                port->flags &= ~UPF_HARDPPS_CD;
                if (!UART_ENABLE_MS(port, termios->c_cflag)) {
                        /* the guard holds the port lock until the end of this block */
                        guard(uart_port_lock_irq)(port);
                        serial8250_disable_ms(port);
                }
        }
}
EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc);

/*
 * uart_ops set_ldisc entry point: use the port's own set_ldisc hook when one
 * is installed, otherwise fall back to the generic 8250 implementation.
 */
static void
serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios)
{
        if (!port->set_ldisc) {
                serial8250_do_set_ldisc(port, termios);
                return;
        }

        port->set_ldisc(port, termios);
}

/*
 * Generic 8250 implementation of the pm operation: any non-zero @state puts
 * the port to sleep, state 0 wakes it up. @oldstate is not used.
 */
void serial8250_do_pm(struct uart_port *port, unsigned int state,
                      unsigned int oldstate)
{
        int sleep = (state != 0);

        serial8250_set_sleep(up_to_u8250p(port), sleep);
}
EXPORT_SYMBOL(serial8250_do_pm);

/*
 * uart_ops pm entry point: use the port's own pm hook when one is installed,
 * otherwise fall back to the generic 8250 implementation.
 */
static void
serial8250_pm(struct uart_port *port, unsigned int state,
              unsigned int oldstate)
{
        if (!port->pm) {
                serial8250_do_pm(port, state, oldstate);
                return;
        }

        port->pm(port, state, oldstate);
}

static unsigned int serial8250_port_size(struct uart_8250_port *pt)
{
        if (pt->port.mapsize)
                return pt->port.mapsize;
        if (is_omap1_8250(pt))
                return 0x16 << pt->port.regshift;

        return 8 << pt->port.regshift;
}

/*
 * Resource handling.
 */
/*
 * Claim the I/O port or memory region backing the port and, for
 * UPF_IOREMAP ports, map it.
 *
 * Returns 0 on success (also for I/O types that need no resource),
 * -EINVAL if a memory-mapped port has no mapbase, -EBUSY if the region is
 * already taken and -ENOMEM if ioremap() fails.
 */
static int serial8250_request_std_resource(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        unsigned int size = serial8250_port_size(up);

        switch (port->iotype) {
        case UPIO_HUB6:
        case UPIO_PORT:
                if (!request_region(port->iobase, size, "serial"))
                        return -EBUSY;
                return 0;
        case UPIO_AU:
        case UPIO_TSI:
        case UPIO_MEM32:
        case UPIO_MEM32BE:
        case UPIO_MEM16:
        case UPIO_MEM:
                /* memory-mapped: handled below */
                break;
        case UPIO_UNKNOWN:
        default:
                return 0;
        }

        if (!port->mapbase)
                return -EINVAL;

        if (!request_mem_region(port->mapbase, size, "serial"))
                return -EBUSY;

        if (!(port->flags & UPF_IOREMAP))
                return 0;

        port->membase = ioremap(port->mapbase, size);
        if (port->membase)
                return 0;

        /* mapping failed: give the region back */
        release_mem_region(port->mapbase, size);
        return -ENOMEM;
}

/*
 * Undo serial8250_request_std_resource(): unmap (UPF_IOREMAP ports only) and
 * release the memory region, or release the I/O port region.
 */
static void serial8250_release_std_resource(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        unsigned int size = serial8250_port_size(up);

        switch (port->iotype) {
        case UPIO_HUB6:
        case UPIO_PORT:
                release_region(port->iobase, size);
                return;
        case UPIO_AU:
        case UPIO_TSI:
        case UPIO_MEM32:
        case UPIO_MEM32BE:
        case UPIO_MEM16:
        case UPIO_MEM:
                /* memory-mapped: handled below */
                break;
        case UPIO_UNKNOWN:
        default:
                return;
        }

        /* nothing was requested for a memory-mapped port without a mapbase */
        if (!port->mapbase)
                return;

        if (port->flags & UPF_IOREMAP) {
                iounmap(port->membase);
                port->membase = NULL;
        }

        release_mem_region(port->mapbase, size);
}

/* uart_ops release_port: release the port's standard I/O or memory resource. */
static void serial8250_release_port(struct uart_port *port)
{
        serial8250_release_std_resource(up_to_u8250p(port));
}

/*
 * uart_ops request_port: claim the port's standard I/O or memory resource.
 * Returns the result of serial8250_request_std_resource().
 */
static int serial8250_request_port(struct uart_port *port)
{
        return serial8250_request_std_resource(up_to_u8250p(port));
}

/*
 * Translate the RX trigger bits of the cached FCR into a byte count using the
 * per-type table in uart_config[]. Returns -EOPNOTSUPP when the table entry
 * is zero.
 */
static int fcr_get_rxtrig_bytes(struct uart_8250_port *up)
{
        const struct serial8250_config *cfg = &uart_config[up->port.type];
        unsigned char trig_bytes = cfg->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)];

        if (!trig_bytes)
                return -EOPNOTSUPP;

        return trig_bytes;
}

/*
 * Convert a requested RX trigger level in bytes into FCR trigger bits,
 * rounding down to the nearest level the port type supports (the lowest level
 * is used when @bytes is below all of them).
 *
 * Returns the FCR trigger bits, or -EOPNOTSUPP when the port type has no
 * trigger table.
 */
static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes)
{
        const struct serial8250_config *cfg = &uart_config[up->port.type];
        int level = 1;

        if (!cfg->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)])
                return -EOPNOTSUPP;

        while (level < UART_FCR_R_TRIG_MAX_STATE) {
                /* first level above the request: use the one just below it */
                if (bytes < cfg->rxtrig_bytes[level])
                        return (level - 1) << UART_FCR_R_TRIG_SHIFT;
                level++;
        }

        /* at or above the highest level */
        return UART_FCR_R_TRIG_11;
}

static int do_get_rxtrig(struct tty_port *port)
{
        struct uart_state *state = container_of(port, struct uart_state, port);
        struct uart_port *uport = state->uart_port;
        struct uart_8250_port *up = up_to_u8250p(uport);

        if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1)
                return -EINVAL;

        return fcr_get_rxtrig_bytes(up);
}

/* Read the RX trigger level with the tty port mutex held. */
static int do_serial8250_get_rxtrig(struct tty_port *port)
{
        int ret;

        mutex_lock(&port->mutex);
        ret = do_get_rxtrig(port);
        mutex_unlock(&port->mutex);

        return ret;
}

/*
 * sysfs show handler for rx_trig_bytes: prints the current RX trigger level
 * in bytes as a decimal number, or returns the negative error code from
 * do_serial8250_get_rxtrig().
 */
static ssize_t rx_trig_bytes_show(struct device *dev,
        struct device_attribute *attr, char *buf)
{
        int trig = do_serial8250_get_rxtrig(dev_get_drvdata(dev));

        if (trig >= 0)
                return sysfs_emit(buf, "%d\n", trig);

        return trig;
}

static int do_set_rxtrig(struct tty_port *port, unsigned char bytes)
{
        struct uart_state *state = container_of(port, struct uart_state, port);
        struct uart_port *uport = state->uart_port;
        struct uart_8250_port *up = up_to_u8250p(uport);
        int rxtrig;

        if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1)
                return -EINVAL;

        rxtrig = bytes_to_fcr_rxtrig(up, bytes);
        if (rxtrig < 0)
                return rxtrig;

        serial8250_clear_fifos(up);
        up->fcr &= ~UART_FCR_TRIGGER_MASK;
        up->fcr |= (unsigned char)rxtrig;
        serial_out(up, UART_FCR, up->fcr);
        return 0;
}

/* Set the RX trigger level with the tty port mutex held. */
static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes)
{
        int err;

        mutex_lock(&port->mutex);
        err = do_set_rxtrig(port, bytes);
        mutex_unlock(&port->mutex);

        return err;
}

/*
 * sysfs store handler for rx_trig_bytes: parses a decimal u8 from @buf and
 * applies it as the new RX trigger level.
 *
 * Returns @count on success, -EINVAL for an empty write, or the error from
 * kstrtou8() / do_serial8250_set_rxtrig().
 */
static ssize_t rx_trig_bytes_store(struct device *dev,
        struct device_attribute *attr, const char *buf, size_t count)
{
        struct tty_port *port = dev_get_drvdata(dev);
        unsigned char requested;
        int err;

        if (count == 0)
                return -EINVAL;

        err = kstrtou8(buf, 10, &requested);
        if (err < 0)
                return err;

        err = do_serial8250_set_rxtrig(port, requested);
        if (err < 0)
                return err;

        return count;
}

/* rx_trig_bytes: read/write attribute backed by rx_trig_bytes_show()/_store() */
static DEVICE_ATTR_RW(rx_trig_bytes);

/* NULL-terminated list of the device-specific attributes */
static struct attribute *serial8250_dev_attrs[] = {
        &dev_attr_rx_trig_bytes.attr,
        NULL
};

/* Attribute group installed by register_dev_spec_attr_grp() */
static struct attribute_group serial8250_dev_attr_group = {
        .attrs = serial8250_dev_attrs,
};

/*
 * Attach the rx_trig_bytes attribute group to the port, but only for port
 * types whose uart_config[] entry has a non-zero first RX trigger level.
 */
static void register_dev_spec_attr_grp(struct uart_8250_port *up)
{
        if (!uart_config[up->port.type].rxtrig_bytes[0])
                return;

        up->port.attr_group = &serial8250_dev_attr_group;
}

/*
 * uart_ops config_port: claim the port's resources and, depending on @flags,
 * autodetect the port type (UART_CONFIG_TYPE) and its IRQ (UART_CONFIG_IRQ).
 *
 * Returns silently if the resources cannot be claimed. The resources are
 * released again when the port type is still PORT_UNKNOWN afterwards.
 */
static void serial8250_config_port(struct uart_port *port, int flags)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        int ret;

        /*
         * Find the region that we can probe for.  This in turn
         * tells us whether we can probe for the type of port.
         */
        ret = serial8250_request_std_resource(up);
        if (ret < 0)
                return;

        /* (re)install the register accessors if the I/O type changed */
        if (port->iotype != up->cur_iotype)
                set_io_from_upio(port);

        if (flags & UART_CONFIG_TYPE)
                autoconfig(up);

        /* HW bugs may trigger IRQ while IIR == NO_INT */
        if (port->type == PORT_TEGRA)
                up->bugs |= UART_BUG_NOMSR;

        if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ)
                autoconfig_irq(up);

        if (port->type == PORT_UNKNOWN)
                serial8250_release_std_resource(up);

        /* pick up the per-type attribute group and default FCR value */
        register_dev_spec_attr_grp(up);
        up->fcr = uart_config[up->port.type].fcr;
}

/*
 * uart_ops verify_port: sanity-check user supplied serial settings.
 *
 * Returns -EINVAL when the IRQ is out of range, the baud base is below 9600,
 * or the type is outside uart_config[] or is PORT_CIRRUS / PORT_STARTECH;
 * 0 otherwise.
 */
static int
serial8250_verify_port(struct uart_port *port, struct serial_struct *ser)
{
        if (ser->irq >= irq_get_nr_irqs() || ser->irq < 0)
                return -EINVAL;

        if (ser->baud_base < 9600)
                return -EINVAL;

        if (ser->type < PORT_UNKNOWN || ser->type >= ARRAY_SIZE(uart_config))
                return -EINVAL;

        if (ser->type == PORT_CIRRUS || ser->type == PORT_STARTECH)
                return -EINVAL;

        return 0;
}

/*
 * uart_ops type: name of the port type from uart_config[]. Types beyond the
 * end of the table are reported with the name of entry 0.
 */
static const char *serial8250_type(struct uart_port *port)
{
        int idx = port->type;

        if (idx >= ARRAY_SIZE(uart_config))
                return uart_config[0].name;

        return uart_config[idx].name;
}

/*
 * Port operations installed on every 8250 port by serial8250_init_port().
 * The poll_* callbacks are only present with CONFIG_CONSOLE_POLL.
 */
static const struct uart_ops serial8250_pops = {
        .tx_empty        = serial8250_tx_empty,
        .set_mctrl        = serial8250_set_mctrl,
        .get_mctrl        = serial8250_get_mctrl,
        .stop_tx        = serial8250_stop_tx,
        .start_tx        = serial8250_start_tx,
        .throttle        = serial8250_throttle,
        .unthrottle        = serial8250_unthrottle,
        .stop_rx        = serial8250_stop_rx,
        .enable_ms        = serial8250_enable_ms,
        .break_ctl        = serial8250_break_ctl,
        .startup        = serial8250_startup,
        .shutdown        = serial8250_shutdown,
        .flush_buffer        = serial8250_flush_buffer,
        .set_termios        = serial8250_set_termios,
        .set_ldisc        = serial8250_set_ldisc,
        .pm                = serial8250_pm,
        .type                = serial8250_type,
        .release_port        = serial8250_release_port,
        .request_port        = serial8250_request_port,
        .config_port        = serial8250_config_port,
        .verify_port        = serial8250_verify_port,
#ifdef CONFIG_CONSOLE_POLL
        .poll_get_char = serial8250_get_poll_char,
        .poll_put_char = serial8250_put_poll_char,
#endif
};

/*
 * Basic initialisation of an 8250 port: initialise the port lock, install
 * serial8250_pops, clear ctrl_id and the pm hook, and mark the current I/O
 * type as unknown.
 */
void serial8250_init_port(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;

        spin_lock_init(&port->lock);
        port->ctrl_id = 0;
        port->pm = NULL;
        port->ops = &serial8250_pops;
        /* sysrq handling is available only when the 8250 console is built in */
        port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE);

        up->cur_iotype = UPIO_UNKNOWN;
}
EXPORT_SYMBOL_GPL(serial8250_init_port);

void serial8250_set_defaults(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;

        if (up->port.flags & UPF_FIXED_TYPE) {
                unsigned int type = up->port.type;

                if (!up->port.fifosize)
                        up->port.fifosize = uart_config[type].fifo_size;
                if (!up->tx_loadsz)
                        up->tx_loadsz = uart_config[type].tx_loadsz;
                if (!up->capabilities)
                        up->capabilities = uart_config[type].flags;
        }

        set_io_from_upio(port);

        /* default dma handlers */
        if (up->dma) {
                if (!up->dma->tx_dma)
                        up->dma->tx_dma = serial8250_tx_dma;
                if (!up->dma->rx_dma)
                        up->dma->rx_dma = serial8250_rx_dma;
        }
}
EXPORT_SYMBOL_GPL(serial8250_set_defaults);

#ifdef CONFIG_SERIAL_8250_CONSOLE

/* Write one character to the transmit register without waiting for room. */
static void serial8250_console_putchar(struct uart_port *port, unsigned char ch)
{
        serial_port_out(port, UART_TX, ch);
}

/* Wait for UART_LSR_THRE via wait_for_xmitr(), then write one character. */
static void serial8250_console_wait_putchar(struct uart_port *port, unsigned char ch)
{
        wait_for_xmitr(up_to_u8250p(port), UART_LSR_THRE);
        serial8250_console_putchar(port, ch);
}

/*
 *        Restore serial console when h/w power-off detected
 *
 *        Reprograms the divisor, LCR and MCR from the console's saved
 *        cflag/speeds. When the saved cflag is zero and a tty is attached,
 *        the tty's termios values are used instead.
 */
static void serial8250_console_restore(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        struct ktermios termios;
        unsigned int baud, quot, frac = 0;

        /* only the fields set here are initialised in the on-stack termios */
        termios.c_cflag = port->cons->cflag;
        termios.c_ispeed = port->cons->ispeed;
        termios.c_ospeed = port->cons->ospeed;
        if (port->state->port.tty && termios.c_cflag == 0) {
                termios.c_cflag = port->state->port.tty->termios.c_cflag;
                termios.c_ispeed = port->state->port.tty->termios.c_ispeed;
                termios.c_ospeed = port->state->port.tty->termios.c_ospeed;
        }

        baud = serial8250_get_baud_rate(port, &termios, NULL);
        quot = serial8250_get_divisor(port, baud, &frac);

        serial8250_set_divisor(port, baud, quot, frac);
        serial_port_out(port, UART_LCR, up->lcr);
        /* re-assert DTR and RTS on top of the cached MCR value */
        serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS);
}

/*
 * Call wait_for_lsr() for UART_LSR_THRE up to @count times, stopping as soon
 * as one call returns non-zero.
 */
static void fifo_wait_for_lsr(struct uart_8250_port *up, unsigned int count)
{
        while (count--) {
                if (wait_for_lsr(up, UART_LSR_THRE))
                        return;
        }
}

/*
 * Print a string to the serial port using the device FIFO
 *
 * It sends fifosize bytes and then waits for the fifo
 * to get empty.
 *
 * A '\r' is inserted before every '\n'; the inserted character counts
 * against the current FIFO batch like any other byte.
 */
static void serial8250_console_fifo_write(struct uart_8250_port *up,
                                          const char *s, unsigned int count)
{
        const char *end = s + count;
        unsigned int fifosize = up->tx_loadsz;
        struct uart_port *port = &up->port;
        unsigned int tx_count = 0;
        bool cr_sent = false;
        unsigned int i;

        while (s != end) {
                /* Allow timeout for each byte of a possibly full FIFO */
                fifo_wait_for_lsr(up, fifosize);

                for (i = 0; i < fifosize && s != end; ++i) {
                        if (*s == '\n' && !cr_sent) {
                                /* emit the CR first; s is not advanced, so the LF follows next */
                                serial8250_console_putchar(port, '\r');
                                cr_sent = true;
                        } else {
                                serial8250_console_putchar(port, *s++);
                                cr_sent = false;
                        }
                }
                /* remember how many bytes went into the last batch */
                tx_count = i;
        }

        /*
         * Allow timeout for each byte written since the caller will only wait
         * for UART_LSR_BOTH_EMPTY using the timeout of a single character
         */
        fifo_wait_for_lsr(up, tx_count);
}

/*
 *        Print a string to the serial port trying not to disturb
 *        any possible real use of the port...
 *
 *        The console_lock must be held when we get here.
 *
 *        Doing runtime PM is really a bad idea for the kernel console.
 *        Thus, we assume the function is called when device is powered up.
 *
 *        The port lock is taken for the duration of the write; while an oops
 *        is in progress it is only tried, and the write goes ahead even when
 *        the lock could not be taken.
 */
void serial8250_console_write(struct uart_8250_port *up, const char *s,
                              unsigned int count)
{
        struct uart_8250_em485 *em485 = up->em485;
        struct uart_port *port = &up->port;
        unsigned long flags;
        unsigned int ier, use_fifo;
        int locked = 1;

        touch_nmi_watchdog();

        /* flags is only valid (and only used below) when the lock was taken */
        if (oops_in_progress)
                locked = uart_port_trylock_irqsave(port, &flags);
        else
                uart_port_lock_irqsave(port, &flags);

        /*
         *        First save the IER then disable the interrupts
         */
        ier = serial_port_in(port, UART_IER);
        serial8250_clear_IER(up);

        /* check scratch reg to see if port powered off during system sleep */
        if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
                serial8250_console_restore(up);
                up->canary = 0;
        }

        /* RS485 emulation: start transmitting if stopped, then honour the pre-send delay */
        if (em485) {
                if (em485->tx_stopped)
                        up->rs485_start_tx(up, false);
                mdelay(port->rs485.delay_rts_before_send);
        }

        use_fifo = (up->capabilities & UART_CAP_FIFO) &&
                /*
                 * BCM283x requires to check the fifo
                 * after each byte.
                 */
                !(up->capabilities & UART_CAP_MINI) &&
                /*
                 * tx_loadsz contains the transmit fifo size
                 */
                up->tx_loadsz > 1 &&
                (up->fcr & UART_FCR_ENABLE_FIFO) &&
                port->state &&
                test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) &&
                /*
                 * After we put a data in the fifo, the controller will send
                 * it regardless of the CTS state. Therefore, only use fifo
                 * if we don't use control flow.
                 */
                !(up->port.flags & UPF_CONS_FLOW);

        if (likely(use_fifo))
                serial8250_console_fifo_write(up, s, count);
        else
                uart_console_write(port, s, count, serial8250_console_wait_putchar);

        /*
         *        Finally, wait for transmitter to become empty
         *        and restore the IER
         */
        wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);

        /* RS485 emulation: honour the post-send delay, then stop transmitting again */
        if (em485) {
                mdelay(port->rs485.delay_rts_after_send);
                if (em485->tx_stopped)
                        up->rs485_stop_tx(up, false);
        }

        serial_port_out(port, UART_IER, ier);

        /*
         *        The receive handling will happen properly because the
         *        receive ready bit will still be set; it is not cleared
         *        on read.  However, modem control will not, we must
         *        call it if we have saved something in the saved flags
         *        while processing with interrupts off.
         */
        if (up->msr_saved_flags)
                serial8250_modem_status(up);

        if (locked)
                uart_port_unlock_irqrestore(port, flags);
}

/*
 * Derive the currently programmed baud rate from the divisor latch.
 *
 * DLAB is set temporarily to read UART_DLL/UART_DLM and the original LCR
 * value is restored afterwards.
 *
 * Returns uartclk / 16 / divisor. A divisor of zero (e.g. a UART nobody has
 * programmed yet) cannot be converted; 9600 is returned in that case, which
 * is the same default serial8250_console_setup() starts out with.
 */
static unsigned int probe_baud(struct uart_port *port)
{
        unsigned char lcr, dll, dlm;
        unsigned int quot;

        lcr = serial_port_in(port, UART_LCR);
        serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB);
        dll = serial_port_in(port, UART_DLL);
        dlm = serial_port_in(port, UART_DLM);
        serial_port_out(port, UART_LCR, lcr);

        quot = (dlm << 8) | dll;
        /* never divide by an unprogrammed (zero) divisor */
        if (!quot)
                return 9600;

        return (port->uartclk / 16) / quot;
}

/*
 * Set up @port as a console.
 *
 * @options: console option string, parsed with uart_parse_options(); may be
 *           NULL
 * @probe:   when no options are given, read the baud rate back from the
 *           hardware instead of using the 9600n8 default
 *
 * Returns -ENODEV when the port has neither iobase nor membase, the error
 * from uart_set_options(), or 0 on success. On success a runtime PM
 * reference is taken on port->dev (if set).
 */
int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
{
        int baud = 9600, bits = 8, parity = 'n', flow = 'n';
        int err;

        if (!(port->iobase || port->membase))
                return -ENODEV;

        if (options)
                uart_parse_options(options, &baud, &parity, &bits, &flow);
        else if (probe)
                baud = probe_baud(port);

        err = uart_set_options(port, port->cons, baud, parity, bits, flow);
        if (err)
                return err;

        if (port->dev)
                pm_runtime_get_sync(port->dev);

        return 0;
}

/*
 * Console teardown: drop the runtime PM reference on port->dev (if set).
 * Always returns 0.
 */
int serial8250_console_exit(struct uart_port *port)
{
        struct device *dev = port->dev;

        if (dev)
                pm_runtime_put_sync(dev);

        return 0;
}

#endif /* CONFIG_SERIAL_8250_CONSOLE */

MODULE_DESCRIPTION("Base port operations for 8250/16550-type serial ports");
MODULE_LICENSE("GPL");










































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Network device features.
 */
#ifndef _LINUX_NETDEV_FEATURES_H
#define _LINUX_NETDEV_FEATURES_H

#include <linux/types.h>
#include <linux/bitops.h>
#include <asm/byteorder.h>

typedef u64 netdev_features_t;

/*
 * Bit numbers of the individual device features within a netdev_features_t.
 * The values are positions, not masks; the NETIF_F_* masks are built from
 * them with __NETIF_F_BIT()/__NETIF_F(). NETDEV_FEATURE_COUNT is the number
 * of bit positions in use.
 */
enum {
        NETIF_F_SG_BIT,                        /* Scatter/gather IO. */
        NETIF_F_IP_CSUM_BIT,                /* Can checksum TCP/UDP over IPv4. */
        __UNUSED_NETIF_F_1,
        NETIF_F_HW_CSUM_BIT,                /* Can checksum all the packets. */
        NETIF_F_IPV6_CSUM_BIT,                /* Can checksum TCP/UDP over IPV6 */
        NETIF_F_HIGHDMA_BIT,                /* Can DMA to high memory. */
        NETIF_F_FRAGLIST_BIT,                /* Scatter/gather IO. */
        NETIF_F_HW_VLAN_CTAG_TX_BIT,        /* Transmit VLAN CTAG HW acceleration */
        NETIF_F_HW_VLAN_CTAG_RX_BIT,        /* Receive VLAN CTAG HW acceleration */
        NETIF_F_HW_VLAN_CTAG_FILTER_BIT,/* Receive filtering on VLAN CTAGs */
        NETIF_F_VLAN_CHALLENGED_BIT,        /* Device cannot handle VLAN packets */
        NETIF_F_GSO_BIT,                /* Enable software GSO. */
        __UNUSED_NETIF_F_12,
        __UNUSED_NETIF_F_13,
        NETIF_F_GRO_BIT,                /* Generic receive offload */
        NETIF_F_LRO_BIT,                /* large receive offload */

        /**/NETIF_F_GSO_SHIFT,                /* keep the order of SKB_GSO_* bits */
        NETIF_F_TSO_BIT                        /* ... TCPv4 segmentation */
                = NETIF_F_GSO_SHIFT,
        NETIF_F_GSO_ROBUST_BIT,                /* ... ->SKB_GSO_DODGY */
        NETIF_F_TSO_ECN_BIT,                /* ... TCP ECN support */
        NETIF_F_TSO_MANGLEID_BIT,        /* ... IPV4 ID mangling allowed */
        NETIF_F_TSO6_BIT,                /* ... TCPv6 segmentation */
        NETIF_F_FSO_BIT,                /* ... FCoE segmentation */
        NETIF_F_GSO_GRE_BIT,                /* ... GRE with TSO */
        NETIF_F_GSO_GRE_CSUM_BIT,        /* ... GRE with csum with TSO */
        NETIF_F_GSO_IPXIP4_BIT,                /* ... IP4 or IP6 over IP4 with TSO */
        NETIF_F_GSO_IPXIP6_BIT,                /* ... IP4 or IP6 over IP6 with TSO */
        NETIF_F_GSO_UDP_TUNNEL_BIT,        /* ... UDP TUNNEL with TSO */
        NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
        NETIF_F_GSO_PARTIAL_BIT,        /* ... Only segment inner-most L4
                                         *     in hardware and all other
                                         *     headers in software.
                                         */
        NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
        NETIF_F_GSO_SCTP_BIT,                /* ... SCTP fragmentation */
        NETIF_F_GSO_ESP_BIT,                /* ... ESP with TSO */
        NETIF_F_GSO_UDP_BIT,                /* ... UFO, deprecated except tuntap */
        NETIF_F_GSO_UDP_L4_BIT,                /* ... UDP payload GSO (not UFO) */
        NETIF_F_GSO_FRAGLIST_BIT,                /* ... Fraglist GSO */
        NETIF_F_GSO_ACCECN_BIT,                /* TCP AccECN w/ TSO (no clear CWR) */
        /**/NETIF_F_GSO_LAST =                /* last bit, see GSO_MASK */
                NETIF_F_GSO_ACCECN_BIT,

        NETIF_F_FCOE_CRC_BIT,                /* FCoE CRC32 */
        NETIF_F_SCTP_CRC_BIT,                /* SCTP checksum offload */
        NETIF_F_NTUPLE_BIT,                /* N-tuple filters supported */
        NETIF_F_RXHASH_BIT,                /* Receive hashing offload */
        NETIF_F_RXCSUM_BIT,                /* Receive checksumming offload */
        NETIF_F_NOCACHE_COPY_BIT,        /* Use no-cache copyfromuser */
        NETIF_F_LOOPBACK_BIT,                /* Enable loopback */
        NETIF_F_RXFCS_BIT,                /* Append FCS to skb pkt data */
        NETIF_F_RXALL_BIT,                /* Receive errored frames too */
        NETIF_F_HW_VLAN_STAG_TX_BIT,        /* Transmit VLAN STAG HW acceleration */
        NETIF_F_HW_VLAN_STAG_RX_BIT,        /* Receive VLAN STAG HW acceleration */
        NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
        NETIF_F_HW_L2FW_DOFFLOAD_BIT,        /* Allow L2 Forwarding in Hardware */

        NETIF_F_HW_TC_BIT,                /* Offload TC infrastructure */
        NETIF_F_HW_ESP_BIT,                /* Hardware ESP transformation offload */
        NETIF_F_HW_ESP_TX_CSUM_BIT,        /* ESP with TX checksum offload */
        NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */
        NETIF_F_HW_TLS_TX_BIT,                /* Hardware TLS TX offload */
        NETIF_F_HW_TLS_RX_BIT,                /* Hardware TLS RX offload */

        NETIF_F_GRO_HW_BIT,                /* Hardware Generic receive offload */
        NETIF_F_HW_TLS_RECORD_BIT,        /* Offload TLS record */
        NETIF_F_GRO_FRAGLIST_BIT,        /* Fraglist GRO */

        NETIF_F_HW_MACSEC_BIT,                /* Offload MACsec operations */
        NETIF_F_GRO_UDP_FWD_BIT,        /* Allow UDP GRO for forwarding */

        NETIF_F_HW_HSR_TAG_INS_BIT,        /* Offload HSR tag insertion */
        NETIF_F_HW_HSR_TAG_RM_BIT,        /* Offload HSR tag removal */
        NETIF_F_HW_HSR_FWD_BIT,                /* Offload HSR forwarding */
        NETIF_F_HW_HSR_DUP_BIT,                /* Offload HSR duplication */

        /*
         * Add your fresh new feature above and remember to update
         * netdev_features_strings[] in net/ethtool/common.c and maybe
         * some feature mask #defines below. Please also describe it
         * in Documentation/networking/netdev-features.rst.
         */

        /**/NETDEV_FEATURE_COUNT
};

/*
 * copy'n'paste compression ;)
 *
 * __NETIF_F_BIT(bit) turns a bit number into a netdev_features_t mask;
 * __NETIF_F(name) does the same for the enum constant NETIF_F_<name>_BIT.
 */
#define __NETIF_F_BIT(bit)        ((netdev_features_t)1 << (bit))
#define __NETIF_F(name)                __NETIF_F_BIT(NETIF_F_##name##_BIT)

/* Single-bit feature masks, one per NETIF_F_*_BIT position used below */
#define NETIF_F_FCOE_CRC        __NETIF_F(FCOE_CRC)
#define NETIF_F_FRAGLIST        __NETIF_F(FRAGLIST)
#define NETIF_F_FSO                __NETIF_F(FSO)
#define NETIF_F_GRO                __NETIF_F(GRO)
#define NETIF_F_GRO_HW                __NETIF_F(GRO_HW)
#define NETIF_F_GSO                __NETIF_F(GSO)
#define NETIF_F_GSO_ROBUST        __NETIF_F(GSO_ROBUST)
#define NETIF_F_HIGHDMA                __NETIF_F(HIGHDMA)
#define NETIF_F_HW_CSUM                __NETIF_F(HW_CSUM)
#define NETIF_F_HW_VLAN_CTAG_FILTER __NETIF_F(HW_VLAN_CTAG_FILTER)
#define NETIF_F_HW_VLAN_CTAG_RX        __NETIF_F(HW_VLAN_CTAG_RX)
#define NETIF_F_HW_VLAN_CTAG_TX        __NETIF_F(HW_VLAN_CTAG_TX)
#define NETIF_F_IP_CSUM                __NETIF_F(IP_CSUM)
#define NETIF_F_IPV6_CSUM        __NETIF_F(IPV6_CSUM)
#define NETIF_F_LOOPBACK        __NETIF_F(LOOPBACK)
#define NETIF_F_LRO                __NETIF_F(LRO)
#define NETIF_F_NOCACHE_COPY        __NETIF_F(NOCACHE_COPY)
#define NETIF_F_NTUPLE                __NETIF_F(NTUPLE)
#define NETIF_F_RXCSUM                __NETIF_F(RXCSUM)
#define NETIF_F_RXHASH                __NETIF_F(RXHASH)
#define NETIF_F_SCTP_CRC        __NETIF_F(SCTP_CRC)
#define NETIF_F_SG                __NETIF_F(SG)
#define NETIF_F_TSO6                __NETIF_F(TSO6)
#define NETIF_F_TSO_ECN                __NETIF_F(TSO_ECN)
#define NETIF_F_GSO_ACCECN        __NETIF_F(GSO_ACCECN)
#define NETIF_F_TSO                __NETIF_F(TSO)
#define NETIF_F_VLAN_CHALLENGED        __NETIF_F(VLAN_CHALLENGED)
#define NETIF_F_RXFCS                __NETIF_F(RXFCS)
#define NETIF_F_RXALL                __NETIF_F(RXALL)
#define NETIF_F_GSO_GRE                __NETIF_F(GSO_GRE)
#define NETIF_F_GSO_GRE_CSUM        __NETIF_F(GSO_GRE_CSUM)
#define NETIF_F_GSO_IPXIP4        __NETIF_F(GSO_IPXIP4)
#define NETIF_F_GSO_IPXIP6        __NETIF_F(GSO_IPXIP6)
#define NETIF_F_GSO_UDP_TUNNEL        __NETIF_F(GSO_UDP_TUNNEL)
#define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
#define NETIF_F_TSO_MANGLEID        __NETIF_F(TSO_MANGLEID)
#define NETIF_F_GSO_PARTIAL         __NETIF_F(GSO_PARTIAL)
#define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
#define NETIF_F_GSO_SCTP        __NETIF_F(GSO_SCTP)
#define NETIF_F_GSO_ESP                __NETIF_F(GSO_ESP)
#define NETIF_F_GSO_UDP                __NETIF_F(GSO_UDP)
#define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
#define NETIF_F_HW_VLAN_STAG_RX        __NETIF_F(HW_VLAN_STAG_RX)
#define NETIF_F_HW_VLAN_STAG_TX        __NETIF_F(HW_VLAN_STAG_TX)
#define NETIF_F_HW_L2FW_DOFFLOAD        __NETIF_F(HW_L2FW_DOFFLOAD)
#define NETIF_F_HW_TC                __NETIF_F(HW_TC)
#define NETIF_F_HW_ESP                __NETIF_F(HW_ESP)
#define NETIF_F_HW_ESP_TX_CSUM        __NETIF_F(HW_ESP_TX_CSUM)
#define        NETIF_F_RX_UDP_TUNNEL_PORT  __NETIF_F(RX_UDP_TUNNEL_PORT)
#define NETIF_F_HW_TLS_RECORD        __NETIF_F(HW_TLS_RECORD)
#define NETIF_F_GSO_UDP_L4        __NETIF_F(GSO_UDP_L4)
#define NETIF_F_HW_TLS_TX        __NETIF_F(HW_TLS_TX)
#define NETIF_F_HW_TLS_RX        __NETIF_F(HW_TLS_RX)
#define NETIF_F_GRO_FRAGLIST        __NETIF_F(GRO_FRAGLIST)
#define NETIF_F_GSO_FRAGLIST        __NETIF_F(GSO_FRAGLIST)
#define NETIF_F_HW_MACSEC        __NETIF_F(HW_MACSEC)
#define NETIF_F_GRO_UDP_FWD        __NETIF_F(GRO_UDP_FWD)
#define NETIF_F_HW_HSR_TAG_INS        __NETIF_F(HW_HSR_TAG_INS)
#define NETIF_F_HW_HSR_TAG_RM        __NETIF_F(HW_HSR_TAG_RM)
#define NETIF_F_HW_HSR_FWD        __NETIF_F(HW_HSR_FWD)
#define NETIF_F_HW_HSR_DUP        __NETIF_F(HW_HSR_DUP)

/* Finds the highest-numbered feature bit set in @feature within the range
 * start-1 down to 0.
 *
 * @feature: feature bitmask to search
 * @start:   exclusive upper bound of the search; only bits numbered below
 *           @start are considered
 *
 * Returns the bit number that was found, or -1 if no bit below @start is set.
 */
static inline int find_next_netdev_feature(u64 feature, unsigned long start)
{
        /* The range below bit 0 is empty. Without this check the shift count
         * computed below is 0 for start == 0, which leaves @feature unmasked
         * and reports its top bit again instead of -1.
         */
        if (!start)
                return -1;

        /* like BITMAP_LAST_WORD_MASK() for u64
         * this sets the most significant 64 - start to 0.
         */
        feature &= ~0ULL >> (-start & ((sizeof(feature) * 8) - 1));

        return fls64(feature) - 1;
}

/* Iterate over the set feature bits from the MSB to the LSB.
 *
 * @mask_addr: the feature mask itself (a u64 value, despite the name); it is
 *             re-evaluated on every iteration
 * @bit:       an int lvalue that receives the current bit number
 *
 * The walk starts below NETDEV_FEATURE_COUNT and ends once
 * find_next_netdev_feature() returns a negative value.
 */
#define for_each_netdev_feature(mask_addr, bit)                                \
        for ((bit) = find_next_netdev_feature((mask_addr),                \
                                              NETDEV_FEATURE_COUNT);        \
             (bit) >= 0;                                                \
             (bit) = find_next_netdev_feature((mask_addr), (bit)))

/* Features that are masked out of NETIF_F_ETHTOOL_BITS below */
#define NETIF_F_NEVER_CHANGE        NETIF_F_VLAN_CHALLENGED

/* Features valid for ethtool to change
 * = all defined minus driver/device-class-related (NETIF_F_NEVER_CHANGE).
 *
 * The "all defined" mask is built as top bit | (top bit - 1) rather than
 * (1 << COUNT) - 1:
 * remember that ((t)1 << t_BITS) is undefined in C99
 */
#define NETIF_F_ETHTOOL_BITS        ((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \
                (__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) - 1)) & \
                ~NETIF_F_NEVER_CHANGE)

/* Segmentation offload feature mask */
#define NETIF_F_GSO_MASK        (__NETIF_F_BIT(NETIF_F_GSO_LAST + 1) - \
                __NETIF_F_BIT(NETIF_F_GSO_SHIFT))

/* List of IP checksum features. Note that NETIF_F_HW_CSUM should not be
 * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set--
 * this would be contradictory
 */
#define NETIF_F_CSUM_MASK        (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
                                 NETIF_F_HW_CSUM)

/* Union of the TSO feature bits: TSO, TSO6, TSO_ECN and TSO_MANGLEID */
#define NETIF_F_ALL_TSO         (NETIF_F_TSO | NETIF_F_TSO6 | \
                                 NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID)

/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE        (NETIF_F_ALL_TSO | \
                                 NETIF_F_GSO_ACCECN | NETIF_F_GSO_SCTP | \
                                 NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST)

/*
 * If one device supports one of these features, then enable them
 * for all in netdev_increment_features.
 */
#define NETIF_F_ONE_FOR_ALL        (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
                                 NETIF_F_SG | NETIF_F_HIGHDMA |                \
                                 NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED)

/*
 * If one device doesn't support one of these features, then disable it
 * for all in netdev_increment_features.
 */
#define NETIF_F_ALL_FOR_ALL        (NETIF_F_NOCACHE_COPY | NETIF_F_FSO)

/*
 * If upper/master device has these features disabled, they must be disabled
 * on all lower/slave devices as well.
 */
#define NETIF_F_UPPER_DISABLES        NETIF_F_LRO

/* changeable features with no special hardware requirements */
#define NETIF_F_SOFT_FEATURES        (NETIF_F_GSO | NETIF_F_GRO)

/* Changeable features with no special hardware requirements that defaults to off. */
#define NETIF_F_SOFT_FEATURES_OFF        (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)

/* Union of the hardware VLAN feature bits: filter, RX and TX for both the
 * CTAG and STAG variants.
 */
#define NETIF_F_VLAN_FEATURES        (NETIF_F_HW_VLAN_CTAG_FILTER | \
                                 NETIF_F_HW_VLAN_CTAG_RX | \
                                 NETIF_F_HW_VLAN_CTAG_TX | \
                                 NETIF_F_HW_VLAN_STAG_FILTER | \
                                 NETIF_F_HW_VLAN_STAG_RX | \
                                 NETIF_F_HW_VLAN_STAG_TX)

/* Union of the GSO feature bits for GRE, IPXIP4/IPXIP6 and UDP tunnel
 * (each of GRE and UDP tunnel with and without the _CSUM variant).
 */
#define NETIF_F_GSO_ENCAP_ALL        (NETIF_F_GSO_GRE |                        \
                                 NETIF_F_GSO_GRE_CSUM |                        \
                                 NETIF_F_GSO_IPXIP4 |                        \
                                 NETIF_F_GSO_IPXIP6 |                        \
                                 NETIF_F_GSO_UDP_TUNNEL |                \
                                 NETIF_F_GSO_UDP_TUNNEL_CSUM)

/* Returns @features with every NETIF_F_ONE_FOR_ALL bit cleared and every
 * NETIF_F_ALL_FOR_ALL bit set.
 */
static inline netdev_features_t netdev_base_features(netdev_features_t features)
{
        const netdev_features_t cleared = features & ~NETIF_F_ONE_FOR_ALL;

        return cleared | NETIF_F_ALL_FOR_ALL;
}

#endif        /* _LINUX_NETDEV_FEATURES_H */





































    2 
    2 


    2 



    2 



    2 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SHA-1 optimized for x86_64
 *
 * Copyright 2025 Google LLC
 */
#include <asm/fpu/api.h>
#include <linux/static_call.h>

DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic);

/*
 * Declare the assembly routine @asm_fn and define a static C wrapper @c_fn
 * with the same (state, data, nblocks) signature.
 *
 * The wrapper calls @asm_fn between kernel_fpu_begin() and kernel_fpu_end()
 * when irq_fpu_usable() reports true; otherwise it falls back to
 * sha1_blocks_generic() with the same arguments.
 */
#define DEFINE_X86_SHA1_FN(c_fn, asm_fn)                           \
        asmlinkage void asm_fn(struct sha1_block_state *state,     \
                               const u8 *data, size_t nblocks);    \
        static void c_fn(struct sha1_block_state *state,           \
                         const u8 *data, size_t nblocks)           \
        {                                                          \
                if (likely(irq_fpu_usable())) {                    \
                        kernel_fpu_begin();                        \
                        asm_fn(state, data, nblocks);              \
                        kernel_fpu_end();                          \
                } else {                                           \
                        sha1_blocks_generic(state, data, nblocks); \
                }                                                  \
        }

DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3);
DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx);
DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform);

#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */

asmlinkage void sha1_transform_avx2(struct sha1_block_state *state,
                                    const u8 *data, size_t nblocks);

/*
 * AVX2 flavour of the block function.
 *
 * Falls back to sha1_blocks_generic() when irq_fpu_usable() is false.
 * Otherwise, inside a kernel_fpu_begin()/kernel_fpu_end() section, inputs of
 * fewer than SHA1_AVX2_BLOCK_OPTSIZE blocks go to sha1_transform_avx() and
 * everything else goes to sha1_transform_avx2().
 */
static void sha1_blocks_avx2(struct sha1_block_state *state,
                             const u8 *data, size_t nblocks)
{
        if (!likely(irq_fpu_usable())) {
                sha1_blocks_generic(state, data, nblocks);
                return;
        }

        kernel_fpu_begin();
        if (nblocks < SHA1_AVX2_BLOCK_OPTSIZE)
                sha1_transform_avx(state, data, nblocks);
        else
                sha1_transform_avx2(state, data, nblocks);
        kernel_fpu_end();
}

/*
 * Process @nblocks blocks from @data into @state by dispatching through the
 * sha1_blocks_x86 static call. That call is defined with
 * sha1_blocks_generic as its initial target and may be retargeted by
 * sha1_mod_init_arch().
 */
static void sha1_blocks(struct sha1_block_state *state,
                        const u8 *data, size_t nblocks)
{
        static_call(sha1_blocks_x86)(state, data, nblocks);
}

#define sha1_mod_init_arch sha1_mod_init_arch
/*
 * Point the sha1_blocks_x86 static call at an implementation chosen from the
 * CPU features, in this order of preference:
 *   1. SHA-NI
 *   2. AVX2 (needs SSE+YMM xfeatures, AVX, AVX2, BMI1 and BMI2)
 *   3. AVX  (needs SSE+YMM xfeatures and AVX)
 *   4. SSSE3
 * If none of these apply the static call is left untouched.
 */
static void sha1_mod_init_arch(void)
{
        if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
                static_call_update(sha1_blocks_x86, sha1_blocks_ni);
                return;
        }

        if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
            boot_cpu_has(X86_FEATURE_AVX)) {
                if (boot_cpu_has(X86_FEATURE_AVX2) &&
                    boot_cpu_has(X86_FEATURE_BMI1) &&
                    boot_cpu_has(X86_FEATURE_BMI2)) {
                        static_call_update(sha1_blocks_x86, sha1_blocks_avx2);
                } else {
                        static_call_update(sha1_blocks_x86, sha1_blocks_avx);
                }
                return;
        }

        if (boot_cpu_has(X86_FEATURE_SSSE3))
                static_call_update(sha1_blocks_x86, sha1_blocks_ssse3);
}










   39 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ipi

#if !defined(_TRACE_IPI_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IPI_H

#include <linux/tracepoint.h>

/**
 * ipi_send_cpu - trace event recording a CPU number, a call site and a callback
 *
 * @cpu: CPU number stored in the event (presumably the IPI destination -
 *       confirm against the callers)
 * @callsite: address stored as the call site; printed symbolically (%pS)
 * @callback: pointer stored as the callback; printed symbolically (%pS)
 */
TRACE_EVENT(ipi_send_cpu,

        TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback),

        TP_ARGS(cpu, callsite, callback),

        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, callback)
        ),

        TP_fast_assign(
                __entry->cpu = cpu;
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),

        TP_printk("cpu=%u callsite=%pS callback=%pS",
                  __entry->cpu, __entry->callsite, __entry->callback)
);

/**
 * ipi_send_cpumask - trace event recording a cpumask, a call site and a callback
 *
 * @cpumask: CPU mask copied into the event (presumably the IPI recipients -
 *           confirm against the callers)
 * @callsite: address stored as the call site; printed symbolically (%pS)
 * @callback: pointer stored as the callback; printed symbolically (%pS)
 */
TRACE_EVENT(ipi_send_cpumask,

        TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback),

        TP_ARGS(cpumask, callsite, callback),

        TP_STRUCT__entry(
                __cpumask(cpumask)
                __field(void *, callsite)
                __field(void *, callback)
        ),

        TP_fast_assign(
                __assign_cpumask(cpumask, cpumask_bits(cpumask));
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),

        TP_printk("cpumask=%s callsite=%pS callback=%pS",
                  __get_cpumask(cpumask), __entry->callsite, __entry->callback)
);

#ifdef CONFIG_HAVE_EXTRA_IPI_TRACEPOINTS
/**
 * ipi_raise - called when a smp cross call is made
 *
 * @mask: mask of recipient CPUs for the IPI
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string.
 */
TRACE_EVENT(ipi_raise,

        TP_PROTO(const struct cpumask *mask, const char *reason),

        TP_ARGS(mask, reason),

        TP_STRUCT__entry(
                __bitmask(target_cpus, nr_cpumask_bits)
                __field(const char *, reason)
        ),

        TP_fast_assign(
                __assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits);
                __entry->reason = reason;
        ),

        TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
);

/*
 * ipi_handler - event class shared by the ipi_entry and ipi_exit events
 *
 * Only the @reason pointer itself is stored in the event (not a copy of the
 * string); it is dereferenced when the event is printed.
 */
DECLARE_EVENT_CLASS(ipi_handler,

        TP_PROTO(const char *reason),

        TP_ARGS(reason),

        TP_STRUCT__entry(
                __field(const char *, reason)
        ),

        TP_fast_assign(
                __entry->reason = reason;
        ),

        TP_printk("(%s)", __entry->reason)
);

/**
 * ipi_entry - called immediately before the IPI handler
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise
 * for that IPI.
 */
DEFINE_EVENT(ipi_handler, ipi_entry,

        TP_PROTO(const char *reason),

        TP_ARGS(reason)
);

/**
 * ipi_exit - called immediately after the IPI handler returns
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise for
 * that IPI.
 */
DEFINE_EVENT(ipi_handler, ipi_exit,

        TP_PROTO(const char *reason),

        TP_ARGS(reason)
);
#endif /* CONFIG_HAVE_EXTRA_IPI_TRACEPOINTS */

#endif /* _TRACE_IPI_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






































































































































































































































































































































   39 






   39 

















   39 
   39 


































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/panic.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * This function is used through-out the kernel (including mm and fs)
 * to indicate a major problem.
 */
#include <linux/debug_locks.h>
#include <linux/sched/debug.h>
#include <linux/interrupt.h>
#include <linux/kgdb.h>
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
#include <linux/vt_kern.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/ftrace.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
#include <linux/panic_notifier.h>
#include <linux/sched.h>
#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/console.h>
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
#include <linux/sysfs.h>
#include <linux/context_tracking.h>
#include <linux/seq_buf.h>
#include <linux/sys_info.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>

#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18

#ifdef CONFIG_SMP
/*
 * Should we dump all CPUs backtraces in an oops event?
 * Defaults to 0, can be changed via sysctl.
 */
static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
#else
#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */

/* Initial value comes from CONFIG_PANIC_ON_OOPS; exposed as kernel.panic_on_oops */
int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS);
/* Starts with the TAINT_RANDSTRUCT bit set when CONFIG_RANDSTRUCT is enabled */
static unsigned long tainted_mask =
        IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
bool crash_kexec_post_notifiers;
/* When non-zero, check_panic_on_warn() panics; exposed as kernel.panic_on_warn */
int panic_on_warn __read_mostly;
/* Mask checked by proc_taint() against taint bits written from userspace */
unsigned long panic_on_taint;
/* When true, proc_taint() rejects writes that intersect panic_on_taint */
bool panic_on_taint_nousertaint = false;
/* When non-zero, check_panic_on_warn() panics once warn_count reaches it */
static unsigned int warn_limit __read_mostly;
static bool panic_console_replay;

/* Set to true only while panic_trigger_all_cpu_backtrace() is running */
bool panic_triggering_all_cpu_backtrace;
/* When true, panic_trigger_all_cpu_backtrace() skips the current CPU */
static bool panic_this_cpu_backtrace_printed;

int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);

unsigned long panic_print;

ATOMIC_NOTIFIER_HEAD(panic_notifier_list);

EXPORT_SYMBOL(panic_notifier_list);

/*
 * Log, at most once (pr_info_once), that the 'panic_print' parameter is
 * deprecated in favour of 'panic_sys_info' and 'panic_console_replay'.
 */
static void panic_print_deprecated(void)
{
        pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n");
}

#ifdef CONFIG_SYSCTL

/*
 * sysctl handler for the taint mask.
 *
 * Taint values can only be increased, so it is safe to let the generic
 * handler operate on a local copy of the mask and then apply the requested
 * bits one at a time.
 *
 * Returns -EPERM for a write without CAP_SYS_ADMIN, -EINVAL when
 * panic_on_taint_nousertaint is set and the written value intersects
 * panic_on_taint, and otherwise the result of proc_doulongvec_minmax().
 */
static int proc_taint(const struct ctl_table *table, int write,
                               void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned long requested = get_taint();
        struct ctl_table shadow;
        int ret;
        int bit;

        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* Redirect the generic handler to the local copy of the mask. */
        shadow = *table;
        shadow.data = &requested;
        ret = proc_doulongvec_minmax(&shadow, write, buffer, lenp, ppos);
        if (ret < 0 || !write)
                return ret;

        /*
         * If we are relying on panic_on_taint not producing false positives
         * due to userspace input, bail out before setting the requested
         * taint flags.
         */
        if (panic_on_taint_nousertaint && (requested & panic_on_taint))
                return -EINVAL;

        /*
         * Poor man's atomic or. Not worth adding a primitive to everyone's
         * atomic.h for this.
         */
        for (bit = 0; bit < TAINT_FLAGS_COUNT; bit++) {
                if (requested & (1UL << bit))
                        add_taint(bit, LOCKDEP_STILL_OK);
        }

        return ret;
}

/*
 * sysctl handler for "panic_print": emits the deprecation notice on every
 * access (read or write) and then defers to proc_doulongvec_minmax().
 */
static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
                           void *buffer, size_t *lenp, loff_t *ppos)
{
        panic_print_deprecated();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * Panic-related sysctl entries; registered under "kernel" by
 * kernel_panic_sysctls_init().
 */
static const struct ctl_table kern_panic_table[] = {
#ifdef CONFIG_SMP
        {
                .procname       = "oops_all_cpu_backtrace",
                .data           = &sysctl_oops_all_cpu_backtrace,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        },
#endif
        {
                /* No .data here: proc_taint() works on its own local copy */
                .procname        = "tainted",
                .maxlen                = sizeof(long),
                .mode                = 0644,
                .proc_handler        = proc_taint,
        },
        {
                .procname        = "panic",
                .data                = &panic_timeout,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "panic_on_oops",
                .data                = &panic_on_oops,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                /* Deprecated interface; shares panic_print with "panic_sys_info" */
                .procname        = "panic_print",
                .data                = &panic_print,
                .maxlen                = sizeof(unsigned long),
                .mode                = 0644,
                .proc_handler        = sysctl_panic_print_handler,
        },
        {
                .procname        = "panic_on_warn",
                .data                = &panic_on_warn,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        {
                .procname       = "warn_limit",
                .data           = &warn_limit,
                .maxlen         = sizeof(warn_limit),
                .mode           = 0644,
                .proc_handler   = proc_douintvec,
        },
#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
        defined(CONFIG_DEBUG_STACKOVERFLOW)
        {
                .procname        = "panic_on_stackoverflow",
                .data                = &sysctl_panic_on_stackoverflow,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
        {
                /* Backed by the same panic_print variable as "panic_print" */
                .procname        = "panic_sys_info",
                .data                = &panic_print,
                .maxlen         = sizeof(panic_print),
                .mode                = 0644,
                .proc_handler        = sysctl_sys_info_handler,
        },
};

/*
 * Register kern_panic_table under the "kernel" sysctl directory.
 * Runs as a late initcall and always returns 0.
 */
static __init int kernel_panic_sysctls_init(void)
{
        register_sysctl_init("kernel", kern_panic_table);
        return 0;
}
late_initcall(kernel_panic_sysctls_init);
#endif

/*
 * Handler for the "panic_sys_info=" boot parameter.
 * The format is "panic_sys_info=tasks,mem,locks,ftrace,..."
 *
 * Stores the result of sys_info_parse_param() in panic_print and always
 * returns 1.
 */
static int __init setup_panic_sys_info(char *buf)
{
        /* There is no risk of race in kernel boot phase */
        panic_print = sys_info_parse_param(buf);
        return 1;
}
__setup("panic_sys_info=", setup_panic_sys_info);

static atomic_t warn_count = ATOMIC_INIT(0);

#ifdef CONFIG_SYSFS
/*
 * sysfs show callback: writes the current value of warn_count, followed by
 * a newline, into @page and returns what sysfs_emit() returns.
 */
static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
                               char *page)
{
        return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
}

static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);

/*
 * Add the read-only warn_count attribute to kernel_kobj.
 * Runs as a late initcall; the result of sysfs_add_file_to_group() is not
 * checked and 0 is always returned.
 */
static __init int kernel_panic_sysfs_init(void)
{
        sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
        return 0;
}
late_initcall(kernel_panic_sysfs_init);
#endif

/* Blink callback that does nothing: ignores @state and returns 0. */
static long no_blink(int state)
{
        return 0;
}

/* Returns how long it waited in ms */
long (*panic_blink)(int state);
EXPORT_SYMBOL(panic_blink);

/*
 * Park the calling CPU during a panic by spinning on cpu_relax() forever.
 * This is a weak default -- architecture code may override it.
 */
void __weak __noreturn panic_smp_self_stop(void)
{
        for (;;)
                cpu_relax();
}

/*
 * Stop ourselves in NMI context if another CPU has already panicked. Arch code
 * may override this to prepare for crash dumping, e.g. save regs info.
 *
 * This weak default does not use @regs; it only calls panic_smp_self_stop().
 */
void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs)
{
        panic_smp_self_stop();
}

/*
 * Stop the other CPUs in panic.  This is a weak default; architecture
 * dependent code may override it with a more suitable version.  For example,
 * an architecture that supports crash dump should save the registers of each
 * stopped CPU and disable per-CPU features such as virtualization extensions.
 */
void __weak crash_smp_send_stop(void)
{
        static int cpus_stopped;

        /*
         * The panic path can call this function twice; the stop request is
         * only issued the first time.
         *
         * Note smp_send_stop is the usual smp shutdown function, which
         * unfortunately means it may not be hardened to work in a panic
         * situation.
         */
        if (!cpus_stopped) {
                smp_send_stop();
                cpus_stopped = 1;
        }
}

atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);

/*
 * Try to claim panic_cpu for the current CPU.
 *
 * Only one CPU is allowed to execute the crash_kexec() code as with
 * panic().  Otherwise parallel calls of panic() and crash_kexec()
 * may stop each other.  To exclude them, we use panic_cpu here too.
 *
 * Returns true if panic_cpu was PANIC_CPU_INVALID and has now been set to
 * this CPU, false otherwise.
 */
bool panic_try_start(void)
{
        int expected = PANIC_CPU_INVALID;
        const int claimant = raw_smp_processor_id();

        return atomic_try_cmpxchg(&panic_cpu, &expected, claimant);
}
EXPORT_SYMBOL(panic_try_start);

/* Reset panic_cpu to PANIC_CPU_INVALID, i.e. mark that no CPU owns the panic. */
void panic_reset(void)
{
        atomic_set(&panic_cpu, PANIC_CPU_INVALID);
}
EXPORT_SYMBOL(panic_reset);

/* Return true if panic_cpu has been claimed by any CPU. */
bool panic_in_progress(void)
{
        return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
}
EXPORT_SYMBOL(panic_in_progress);

/* Return true if the current CPU is the one recorded in panic_cpu. */
bool panic_on_this_cpu(void)
{
        const int owner = atomic_read(&panic_cpu);

        /*
         * raw_smp_processor_id() is fine here: the task cannot be migrated
         * to the panic_cpu, or away from it. Once panic_cpu has been set and
         * we are not executing on that CPU, we never will be.
         */
        return unlikely(owner == raw_smp_processor_id());
}
EXPORT_SYMBOL(panic_on_this_cpu);

/*
 * Return true if a panic is in progress on a remote CPU.
 *
 * On true, the local CPU should immediately release any printing resources
 * that may be needed by the panic CPU.
 */
bool panic_on_other_cpu(void)
{
        return (panic_in_progress() && !panic_on_this_cpu());
}
EXPORT_SYMBOL(panic_on_other_cpu);

/*
 * A variant of panic() called from NMI context.
 *
 * If this CPU manages to claim the panic it calls panic() with @msg.
 * Otherwise, if another CPU already panicked, it loops in
 * nmi_panic_self_stop(), which can provide architecture dependent code such
 * as saving register state for crash dump. If the panic is already owned by
 * this CPU the function simply returns.
 */
void nmi_panic(struct pt_regs *regs, const char *msg)
{
        if (!panic_try_start()) {
                if (panic_on_other_cpu())
                        nmi_panic_self_stop(regs);
                return;
        }

        panic("%s", msg);
}
EXPORT_SYMBOL(nmi_panic);

/*
 * Apply the panic-on-warning policy.
 *
 * @origin: string used as the prefix of the panic message
 *
 * Panics immediately when panic_on_warn is set. Otherwise increments
 * warn_count and panics once it reaches a non-zero warn_limit.
 */
void check_panic_on_warn(const char *origin)
{
        unsigned int limit;

        if (panic_on_warn)
                panic("%s: panic_on_warn set ...\n", origin);

        /* Read once so the comparison and the message use the same value. */
        limit = READ_ONCE(warn_limit);
        if (atomic_inc_return(&warn_count) >= limit && limit)
                /* limit is unsigned int, so it must be printed with %u */
                panic("%s: system warned too often (kernel.warn_limit is %u)",
                      origin, limit);
}

/*
 * Trigger a backtrace on all CPUs, or on all CPUs except the current one if
 * panic_this_cpu_backtrace_printed is set.
 */
static void panic_trigger_all_cpu_backtrace(void)
{
        /* Temporarily allow non-panic CPUs to write their backtraces. */
        panic_triggering_all_cpu_backtrace = true;

        if (panic_this_cpu_backtrace_printed)
                trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id());
        else
                trigger_all_cpu_backtrace();

        panic_triggering_all_cpu_backtrace = false;
}

/*
 * Trigger the NMI backtrace (if SYS_INFO_ALL_CPU_BT is set in panic_print)
 * and then shut down the secondary CPUs. The order matters: the NMI
 * backtrace cannot happen after the CPUs are off!
 *
 * @crash_kexec: selects crash_smp_send_stop() instead of smp_send_stop()
 */
static void panic_other_cpus_shutdown(bool crash_kexec)
{
        const bool want_backtrace = panic_print & SYS_INFO_ALL_CPU_BT;

        if (want_backtrace)
                panic_trigger_all_cpu_backtrace();

        /*
         * smp_send_stop() is the usual SMP shutdown function, which
         * unfortunately may not be hardened to work in a panic situation.
         * A crash dump taken after the notifier calls and kmsg_dump needs
         * architecture dependent extra bits in addition to stopping the
         * other CPUs, so that case goes through crash_smp_send_stop().
         */
        if (crash_kexec)
                crash_smp_send_stop();
        else
                smp_send_stop();
}

/**
 * vpanic - halt the system
 * @fmt: The text string to print
 * @args: Arguments for the format string
 *
 * Display a message, then perform cleanups. This function never returns.
 *
 * The formatted message is truncated to fit a 1024-byte static buffer and
 * has one trailing newline removed. When panic_timeout is non-zero the
 * function attempts emergency_restart(); otherwise (or if that returns) it
 * ends in an endless loop that keeps calling panic_blink().
 */
void vpanic(const char *fmt, va_list args)
{
        /* Static, so the message does not live on the panicking stack. */
        static char buf[1024];
        long i, i_next = 0, len;
        int state = 0;
        /* Snapshot taken once so both kexec decisions below agree. */
        bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;

        if (panic_on_warn) {
                /*
                 * This thread may hit another WARN() in the panic path.
                 * Resetting this prevents additional WARN() from panicking the
                 * system on this thread.  Other threads are blocked by the
                 * panic_mutex in panic().
                 */
                panic_on_warn = 0;
        }

        /*
         * Disable local interrupts. This will prevent panic_smp_self_stop
         * from deadlocking the first cpu that invokes the panic, since
         * there is nothing to prevent an interrupt handler (that runs
         * after setting panic_cpu) from invoking panic() again.
         */
        local_irq_disable();
        preempt_disable_notrace();

        /*
         * It's possible to come here directly from a panic-assertion and
         * not have preempt disabled. Some functions called from here want
         * preempt to be disabled. No point enabling it later though...
         *
         * Only one CPU is allowed to execute the panic code from here. For
         * multiple parallel invocations of panic, all other CPUs either
         * stop themselves or will wait until they are stopped by the 1st CPU
         * with smp_send_stop().
         *
         * cmpxchg success means this is the 1st CPU which comes here,
         * so go ahead.
         * `old_cpu == this_cpu' means we came from nmi_panic() which sets
         * panic_cpu to this CPU.  In this case, this is also the 1st CPU.
         */
        /* atomic_try_cmpxchg updates old_cpu on failure */
        /*
         * NOTE(review): there is no old_cpu variable in this function; the
         * comment above appears to predate the panic_try_start() helper -
         * confirm and drop it.
         */
        if (panic_try_start()) {
                /* go ahead */
        } else if (panic_on_other_cpu())
                panic_smp_self_stop();

        console_verbose();
        bust_spinlocks(1);
        len = vscnprintf(buf, sizeof(buf), fmt, args);

        /* Drop one trailing newline; the formats below supply their own. */
        if (len && buf[len - 1] == '\n')
                buf[len - 1] = '\0';

        pr_emerg("Kernel panic - not syncing: %s\n", buf);
        /*
         * Avoid nested stack-dumping if a panic occurs during oops processing
         */
        if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
                panic_this_cpu_backtrace_printed = true;
        } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
                dump_stack();
                panic_this_cpu_backtrace_printed = true;
        }

        /*
         * If kgdb is enabled, give it a chance to run before we stop all
         * the other CPUs or else we won't be able to debug processes left
         * running on them.
         */
        kgdb_panic(buf);

        /*
         * If we have crashed and we have a crash kernel loaded let it handle
         * everything else.
         * If we want to run this after calling panic_notifiers, pass
         * the "crash_kexec_post_notifiers" option to the kernel.
         *
         * Bypass the panic_cpu check and call __crash_kexec directly.
         */
        if (!_crash_kexec_post_notifiers)
                __crash_kexec(NULL);

        panic_other_cpus_shutdown(_crash_kexec_post_notifiers);

        printk_legacy_allow_panic_sync();

        /*
         * Run any panic handlers, including those that might need to
         * add information to the kmsg dump output.
         */
        atomic_notifier_call_chain(&panic_notifier_list, 0, buf);

        sys_info(panic_print);

        kmsg_dump_desc(KMSG_DUMP_PANIC, buf);

        /*
         * If you doubt kdump always works fine in any situation,
         * "crash_kexec_post_notifiers" offers you a chance to run
         * panic_notifiers and dumping kmsg before kdump.
         * Note: since some panic_notifiers can make crashed kernel
         * more unstable, it can increase risks of the kdump failure too.
         *
         * Bypass the panic_cpu check and call __crash_kexec directly.
         */
        if (_crash_kexec_post_notifiers)
                __crash_kexec(NULL);

        console_unblank();

        /*
         * We may have ended up stopping the CPU holding the lock (in
         * smp_send_stop()) while still having some valuable data in the console
         * buffer.  Try to acquire the lock then release it regardless of the
         * result.  The release will also print the buffers out.  Locks debug
         * should be disabled to avoid reporting bad unlock balance when
         * panic() is not being called from OOPS.
         */
        debug_locks_off();
        console_flush_on_panic(CONSOLE_FLUSH_PENDING);

        if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) ||
                panic_console_replay)
                console_flush_on_panic(CONSOLE_REPLAY_ALL);

        /* Fall back to no_blink so panic_blink() can be called blindly. */
        if (!panic_blink)
                panic_blink = no_blink;

        if (panic_timeout > 0) {
                /*
                 * Delay timeout seconds before rebooting the machine.
                 * We can't use the "normal" timers since we just panicked.
                 */
                pr_emerg("Rebooting in %d seconds..\n", panic_timeout);

                /* i counts milliseconds; panic_blink()'s return is added to it. */
                for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
                        touch_nmi_watchdog();
                        if (i >= i_next) {
                                i += panic_blink(state ^= 1);
                                i_next = i + 3600 / PANIC_BLINK_SPD;
                        }
                        mdelay(PANIC_TIMER_STEP);
                }
        }
        /* Any non-zero timeout, including a negative one, reaches here. */
        if (panic_timeout != 0) {
                /*
                 * This will not be a clean reboot, with everything
                 * shutting down.  But if there is a chance of
                 * rebooting the system it will be rebooted.
                 */
                if (panic_reboot_mode != REBOOT_UNDEFINED)
                        reboot_mode = panic_reboot_mode;
                emergency_restart();
        }
#ifdef __sparc__
        {
                extern int stop_a_enabled;
                /* Make sure the user can actually press Stop-A (L1-A) */
                stop_a_enabled = 1;
                pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n"
                         "twice on console to return to the boot prom\n");
        }
#endif
#if defined(CONFIG_S390)
        disabled_wait();
#endif
        pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);

        /* Do not scroll important messages printed above */
        suppress_printk = 1;

        /*
         * The final messages may not have been printed if in a context that
         * defers printing (such as NMI) and irq_work is not available.
         * Explicitly flush the kernel log buffer one last time.
         */
        console_flush_on_panic(CONSOLE_FLUSH_PENDING);
        nbcon_atomic_flush_unsafe();

        local_irq_enable();
        /* Never returns: keep toggling the blink state forever. */
        for (i = 0; ; i += PANIC_TIMER_STEP) {
                touch_softlockup_watchdog();
                if (i >= i_next) {
                        i += panic_blink(state ^= 1);
                        i_next = i + 3600 / PANIC_BLINK_SPD;
                }
                mdelay(PANIC_TIMER_STEP);
        }
}
EXPORT_SYMBOL(vpanic);

/*
 * panic - variadic front end for vpanic()
 * @fmt: printf-style format string
 *
 * Packs the variadic arguments into a va_list and hands them to vpanic(),
 * which is documented as never returning.
 */
void panic(const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        vpanic(fmt, ap);
        va_end(ap);
}
EXPORT_SYMBOL(panic);

/*
 * Build one taint_flags[] entry, placed at index TAINT_<taint>:
 *   _c_true  - character reported when the flag is set
 *   _c_false - character reported when the flag is clear
 *   _module  - stored in .module (presumably marks flags that are also
 *              tracked per module - confirm against struct taint_flag)
 * The flag name itself is stringified into .desc.
 */
#define TAINT_FLAG(taint, _c_true, _c_false, _module)                        \
        [ TAINT_##taint ] = {                                                \
                .c_true = _c_true, .c_false = _c_false,                        \
                .module = _module,                                        \
                .desc = #taint,                                                \
        }

/*
 * TAINT_FORCED_RMMOD could be a per-module flag but the module
 * is being removed anyway.
 */
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
        TAINT_FLAG(PROPRIETARY_MODULE,                'P', 'G', true),
        TAINT_FLAG(FORCED_MODULE,                'F', ' ', true),
        TAINT_FLAG(CPU_OUT_OF_SPEC,                'S', ' ', false),
        TAINT_FLAG(FORCED_RMMOD,                'R', ' ', false),
        TAINT_FLAG(MACHINE_CHECK,                'M', ' ', false),
        TAINT_FLAG(BAD_PAGE,                        'B', ' ', false),
        TAINT_FLAG(USER,                        'U', ' ', false),
        TAINT_FLAG(DIE,                                'D', ' ', false),
        TAINT_FLAG(OVERRIDDEN_ACPI_TABLE,        'A', ' ', false),
        TAINT_FLAG(WARN,                        'W', ' ', false),
        TAINT_FLAG(CRAP,                        'C', ' ', true),
        TAINT_FLAG(FIRMWARE_WORKAROUND,                'I', ' ', false),
        TAINT_FLAG(OOT_MODULE,                        'O', ' ', true),
        TAINT_FLAG(UNSIGNED_MODULE,                'E', ' ', true),
        TAINT_FLAG(SOFTLOCKUP,                        'L', ' ', false),
        TAINT_FLAG(LIVEPATCH,                        'K', ' ', true),
        TAINT_FLAG(AUX,                                'X', ' ', true),
        TAINT_FLAG(RANDSTRUCT,                        'T', ' ', true),
        TAINT_FLAG(TEST,                        'N', ' ', true),
        TAINT_FLAG(FWCTL,                        'J', ' ', true),
};

/* The helper is only meant for the table above. */
#undef TAINT_FLAG

/*
 * Render the current taint state into @s.
 *
 * With no taint bits set the text is "Not tainted". Otherwise it starts
 * with "Tainted: " followed by either one character per flag (compact
 * form) or, when @verbose is true, a comma separated list of "[c]=NAME"
 * items for the flags that are set.
 */
static void print_tainted_seq(struct seq_buf *s, bool verbose)
{
        const char *delim = "";
        int bit;

        if (!tainted_mask) {
                seq_buf_puts(s, "Not tainted");
                return;
        }

        seq_buf_printf(s, "Tainted: ");
        for (bit = 0; bit < TAINT_FLAGS_COUNT; bit++) {
                const struct taint_flag *flag = &taint_flags[bit];
                bool set = test_bit(bit, &tainted_mask);
                char c;

                if (set)
                        c = flag->c_true;
                else
                        c = flag->c_false;

                /* Compact form: every flag contributes one character. */
                if (!verbose) {
                        seq_buf_putc(s, c);
                        continue;
                }

                /* Verbose form: clear flags are not listed at all. */
                if (!set)
                        continue;

                seq_buf_printf(s, "%s[%c]=%s", delim, c, flag->desc);
                delim = ", ";
        }
}

/*
 * Format the taint state into a function-local static buffer and return
 * a pointer to it. The buffer is reused, so each call overwrites the
 * string produced by the previous one.
 */
static const char *_print_tainted(bool verbose)
{
        /* FIXME: what should the size be? */
        static char taint_str[sizeof(taint_flags)];
        struct seq_buf sb;

        BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);

        seq_buf_init(&sb, taint_str, sizeof(taint_str));
        print_tainted_seq(&sb, verbose);

        return seq_buf_str(&sb);
}

/**
 * print_tainted - return a string to represent the kernel taint state.
 *
 * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst
 *
 * The string is overwritten by the next call to print_tainted(),
 * but is always NUL-terminated.
 *
 * Return: the compact (non-verbose) taint string from _print_tainted().
 */
const char *print_tainted(void)
{
        return _print_tainted(false);
}

/**
 * print_tainted_verbose - A more verbose version of print_tainted()
 *
 * Return: the verbose taint string from _print_tainted(). It lives in the
 * same static buffer as the print_tainted() result, so either call
 * overwrites the other's string.
 */
const char *print_tainted_verbose(void)
{
        return _print_tainted(true);
}

/*
 * test_taint - query a single taint bit
 * @flag: bit number to look up in tainted_mask
 *
 * Returns the result of test_bit() for @flag on tainted_mask.
 */
int test_taint(unsigned flag)
{
        return test_bit(flag, &tainted_mask);
}
EXPORT_SYMBOL(test_taint);

/* Return the current value of tainted_mask, one bit per taint flag. */
unsigned long get_taint(void)
{
        return tainted_mask;
}

/**
 * add_taint: add a taint flag if not already set.
 * @flag: one of the TAINT_* constants.
 * @lockdep_ok: whether lock debugging is still OK.
 *
 * If something bad has gone wrong, you'll want @lockdep_ok = false, but for
 * some noteworthy-but-not-corrupting cases, it can be set to true.
 *
 * When the resulting taint mask intersects panic_on_taint, panic_on_taint
 * is cleared and the kernel panics.
 */
void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
        if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE) {
                /* Only report when lock debugging was actually on. */
                if (__debug_locks_off())
                        pr_warn("Disabling lock debugging due to kernel taint\n");
        }

        set_bit(flag, &tainted_mask);

        if (!(tainted_mask & panic_on_taint))
                return;

        /* Clear the trigger first so it cannot fire again. */
        panic_on_taint = 0;
        panic("panic_on_taint set ...");
}
EXPORT_SYMBOL(add_taint);

/*
 * Busy-wait for @msecs milliseconds in one-millisecond steps, touching
 * the NMI watchdog before every step. Does nothing for @msecs <= 0.
 */
static void spin_msec(int msecs)
{
        int remaining = msecs;

        while (remaining > 0) {
                touch_nmi_watchdog();
                mdelay(1);
                remaining--;
        }
}

/*
 * It just happens that oops_enter() and oops_exit() are identically
 * implemented...
 *
 * Does nothing unless pause_on_oops is non-zero. The first caller to find
 * pause_on_oops_flag clear sets it and proceeds. Later callers stall: one
 * of them counts down pause_on_oops seconds and then clears the flag,
 * while the others poll until that countdown has finished.
 */
static void do_oops_enter_exit(void)
{
        unsigned long flags;
        /* Static: the countdown is shared by every caller of this function. */
        static int spin_counter;

        if (!pause_on_oops)
                return;

        spin_lock_irqsave(&pause_on_oops_lock, flags);
        if (pause_on_oops_flag == 0) {
                /* This CPU may now print the oops message */
                pause_on_oops_flag = 1;
        } else {
                /* We need to stall this CPU */
                if (!spin_counter) {
                        /* This CPU gets to do the counting */
                        spin_counter = pause_on_oops;
                        /* The lock is released around each one-second delay. */
                        do {
                                spin_unlock(&pause_on_oops_lock);
                                spin_msec(MSEC_PER_SEC);
                                spin_lock(&pause_on_oops_lock);
                        } while (--spin_counter);
                        /* Countdown finished: let the next oops print again. */
                        pause_on_oops_flag = 0;
                } else {
                        /* This CPU waits for a different one */
                        while (spin_counter) {
                                spin_unlock(&pause_on_oops_lock);
                                spin_msec(1);
                                spin_lock(&pause_on_oops_lock);
                        }
                }
        }
        spin_unlock_irqrestore(&pause_on_oops_lock, flags);
}

/*
 * Return true if the calling CPU is allowed to print oops-related info.
 * This is a bit racy: pause_on_oops_flag is read here without taking
 * pause_on_oops_lock.
 */
bool oops_may_print(void)
{
        return pause_on_oops_flag == 0;
}

/*
 * Called when the architecture enters its oops handler, before it prints
 * anything.  If this is the first CPU to oops, and it's oopsing the first
 * time then let it proceed.
 *
 * This is all enabled by the pause_on_oops kernel boot option.  We do all
 * this to ensure that oopses don't scroll off the screen.  It has the
 * side-effect of preventing later-oopsing CPUs from mucking up the display,
 * too.
 *
 * It turns out that the CPU which is allowed to print ends up pausing for
 * the right duration, whereas all the other CPUs pause for twice as long:
 * once in oops_enter(), once in oops_exit().
 */
void oops_enter(void)
{
        /* The matching nbcon_cpu_emergency_exit() call is in oops_exit(). */
        nbcon_cpu_emergency_enter();
        tracing_off();
        /* can't trust the integrity of the kernel anymore: */
        debug_locks_off();
        do_oops_enter_exit();

        /* Optionally request a backtrace from every CPU. */
        if (sysctl_oops_all_cpu_backtrace)
                trigger_all_cpu_backtrace();
}

/*
 * Print the "---[ end trace ... ]---" marker line. The id field is always
 * sixteen zeros because the argument is the constant 0ULL.
 */
static void print_oops_end_marker(void)
{
        pr_warn("---[ end trace %016llx ]---\n", 0ULL);
}

/*
 * Called when the architecture exits its oops handler, after printing
 * everything.
 */
void oops_exit(void)
{
        do_oops_enter_exit();
        print_oops_end_marker();
        /* Pairs with nbcon_cpu_emergency_enter() in oops_enter(). */
        nbcon_cpu_emergency_exit();
        kmsg_dump(KMSG_DUMP_OOPS);
}

/* Format string plus its arguments, bundled so __warn() can take one pointer. */
struct warn_args {
        const char *fmt;        /* printf-style format, passed to vprintk() */
        va_list args;                /* arguments consumed by @fmt */
};

/*
 * __warn - print the body of a kernel warning report
 * @file:   source file name for the header line, or NULL to omit file:line
 * @line:   source line number, printed only when @file is non-NULL
 * @caller: address printed with %pS in the header line
 * @taint:  taint flag recorded via add_taint() at the end
 * @regs:   register state to show, or NULL to dump the current stack instead
 * @args:   optional format and arguments for an extra message, or NULL
 *
 * The whole report is emitted between nbcon_cpu_emergency_enter() and
 * nbcon_cpu_emergency_exit(). check_panic_on_warn() runs after the
 * registers are shown but before the stack dump and end marker.
 */
void __warn(const char *file, int line, void *caller, unsigned taint,
            struct pt_regs *regs, struct warn_args *args)
{
        nbcon_cpu_emergency_enter();

        disable_trace_on_warning();

        if (file)
                pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
                        raw_smp_processor_id(), current->pid, file, line,
                        caller);
        else
                pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
                        raw_smp_processor_id(), current->pid, caller);

/* Silence GCC's format-attribute suggestion for the forwarded va_list. */
#pragma GCC diagnostic push
#ifndef __clang__
#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
#endif
        if (args)
                vprintk(args->fmt, args->args);
#pragma GCC diagnostic pop

        print_modules();

        if (regs)
                show_regs(regs);

        /* May panic and never return; see check_panic_on_warn(). */
        check_panic_on_warn("kernel");

        /* Without register state, fall back to the current stack. */
        if (!regs)
                dump_stack();

        print_irqtrace_events(current);

        print_oops_end_marker();
        trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller);

        /* Just a warning, don't kill lockdep. */
        add_taint(taint, LOCKDEP_STILL_OK);

        nbcon_cpu_emergency_exit();
}

#ifdef CONFIG_BUG
#ifndef __WARN_FLAGS
void warn_slowpath_fmt(const char *file, int line, unsigned taint,
                       const char *fmt, ...)
{
        bool rcu = warn_rcu_enter();
        struct warn_args args;

        pr_warn(CUT_HERE);

        if (!fmt) {
                __warn(file, line, __builtin_return_address(0), taint,
                       NULL, NULL);
                warn_rcu_exit(rcu);
                return;
        }

        args.fmt = fmt;
        va_start(args.args, fmt);
        __warn(file, line, __builtin_return_address(0), taint, NULL, &args);
        va_end(args.args);
        warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
#else
/*
 * __warn_printk - print the CUT_HERE marker followed by a formatted message
 * @fmt: printf-style format string
 *
 * The output is bracketed by warn_rcu_enter()/warn_rcu_exit().
 */
void __warn_printk(const char *fmt, ...)
{
        bool in_rcu = warn_rcu_enter();
        va_list ap;

        pr_warn(CUT_HERE);

        va_start(ap, fmt);
        vprintk(fmt, ap);
        va_end(ap);

        warn_rcu_exit(in_rcu);
}
EXPORT_SYMBOL(__warn_printk);
#endif

/* Support resetting WARN*_ONCE state */

/*
 * debugfs write handler: calls generic_bug_clear_once() and zeroes the
 * region between __start_once and __end_once. Both arguments are ignored;
 * always returns 0.
 */
static int clear_warn_once_set(void *data, u64 val)
{
        size_t once_len = __end_once - __start_once;

        generic_bug_clear_once();
        memset(__start_once, 0, once_len);

        return 0;
}

DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set,
                         "%lld\n");

/*
 * Create the write-only (mode 0200) "clear_warn_once" debugfs file backed
 * by clear_warn_once_fops. Always returns 0.
 */
static __init int register_warn_debugfs(void)
{
        /* Don't care about failure */
        debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL,
                                   &clear_warn_once_fops);
        return 0;
}

device_initcall(register_warn_debugfs);
#endif

#ifdef CONFIG_STACKPROTECTOR

/*
 * Called when gcc's -fstack-protector feature is used, and
 * gcc detects corruption of the on-stack canary value
 *
 * Panics with the return address of the function whose canary check
 * failed, formatted with %pB.
 */
__visible noinstr void __stack_chk_fail(void)
{
        unsigned long flags;

        instrumentation_begin();
        flags = user_access_save();

        panic("stack-protector: Kernel stack is corrupted in: %pB",
                __builtin_return_address(0));

        /*
         * panic() is documented as never returning, so the statements
         * below are not expected to run; they keep the save/restore and
         * begin/end calls paired.
         */
        user_access_restore(flags);
        instrumentation_end();
}
EXPORT_SYMBOL(__stack_chk_fail);

#endif

/*
 * Parameters bound to the panic/oops policy variables used in this file.
 * Each entry names the parameter, the backing variable, its type and the
 * permission bits (0644).
 * NOTE(review): the exact exposure (command line and/or sysfs) is defined
 * by core_param() itself - confirm there.
 */
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
core_param(panic_console_replay, panic_console_replay, bool, 0644);

/*
 * Setter for the panic_print parameter: calls panic_print_deprecated()
 * and then stores the value through param_set_ulong(), whose result is
 * returned.
 */
static int panic_print_set(const char *val, const struct kernel_param *kp)
{
        int ret;

        panic_print_deprecated();
        ret = param_set_ulong(val, kp);

        return ret;
}

/*
 * Getter for the panic_print parameter: calls panic_print_deprecated()
 * and then formats the value through param_get_ulong(), whose result is
 * returned.
 */
static int panic_print_get(char *val, const struct kernel_param *kp)
{
        int ret;

        panic_print_deprecated();
        ret = param_get_ulong(val, kp);

        return ret;
}

/*
 * Custom accessors for panic_print, so that both reads and writes go
 * through panic_print_deprecated() before touching the value.
 */
static const struct kernel_param_ops panic_print_ops = {
        .set        = panic_print_set,
        .get        = panic_print_get,
};
__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644);

/*
 * Handler for the "oops" early parameter. The value "panic" sets
 * panic_on_oops; any other value is accepted and ignored. A missing
 * value yields -EINVAL.
 */
static int __init oops_setup(char *s)
{
        if (s == NULL)
                return -EINVAL;

        if (strcmp(s, "panic") == 0)
                panic_on_oops = 1;

        return 0;
}
early_param("oops", oops_setup);

/*
 * Handler for the "panic_on_taint" early parameter.
 *
 * The value is a hexadecimal bitmask, optionally followed by
 * ",nousertaint". The mask is clipped to TAINT_FLAGS_MAX; a missing,
 * unparsable or (after clipping) zero mask yields -EINVAL.
 */
static int __init panic_on_taint_setup(char *s)
{
        char *mask_str;

        if (s == NULL)
                return -EINVAL;

        /* Split at the first comma; s then points past it, or is NULL. */
        mask_str = strsep(&s, ",");
        if (kstrtoul(mask_str, 16, &panic_on_taint) != 0)
                return -EINVAL;

        /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */
        panic_on_taint &= TAINT_FLAGS_MAX;
        if (panic_on_taint == 0)
                return -EINVAL;

        if (s != NULL && strcmp(s, "nousertaint") == 0)
                panic_on_taint_nousertaint = true;

        pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
                panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));

        return 0;
}
early_param("panic_on_taint", panic_on_taint_setup);
























































































































































































    1 










    1 



    1 




















    1 

    1 





    1 









    1 


    1 

























    1 




























    1 

    1 


    1 

    1 



















    1 












    1 


    1 





    1 















    1 


    1 




















    1 











    1 
    1 







    1 

















    1 













    1 




























    1 







    1 
    1 









    1 
    1 

































    1 























    1 


    1 









    1 






    1 

    1 








    1 


    1 


























    1 



    1 

    1 















































    1 






    1 








    1 







    1 

    1 










    1 

































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
/*
 * Non-physical true random number generator based on timing jitter --
 * Jitter RNG standalone code.
 *
 * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023
 *
 * Design
 * ======
 *
 * See https://www.chronox.de/jent.html
 *
 * License
 * =======
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, and the entire permission notice in its entirety,
 *    including the disclaimer of warranties.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * ALTERNATIVELY, this product may be distributed under the terms of
 * the GNU General Public License, in which case the provisions of the GPL2 are
 * required INSTEAD OF the above restrictions.  (This clause is
 * necessary due to a potential bad interaction between the GPL and
 * the restrictions contained in a BSD-style copyright.)
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
 * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

/*
 * This Jitterentropy RNG is based on the jitterentropy library
 * version 3.4.0 provided at https://www.chronox.de/jent.html
 */

#ifdef __OPTIMIZE__
 #error "The CPU Jitter random number generator must not be compiled with optimizations. See documentation. Use the compiler switch -O0 for compiling jitterentropy.c."
#endif

typedef        unsigned long long        __u64;
typedef        long long                __s64;
typedef        unsigned int                __u32;
typedef unsigned char                u8;
#define NULL    ((void *) 0)

/*
 * The entropy pool: all per-instance state of the Jitter RNG, i.e. the
 * conditioner state, time stamp history, memory-access buffer description
 * and health test bookkeeping.
 */
struct rand_data {
        /* SHA3-256 is used as conditioner */
#define DATA_SIZE_BITS 256
        /* all data values that are vital to maintain the security
         * of the RNG are marked as SENSITIVE. A user must not
         * access that information while the RNG executes its loops to
         * calculate the next random value. */
        void *hash_state;                /* SENSITIVE hash state entropy pool */
        __u64 prev_time;                /* SENSITIVE Previous time stamp */
        __u64 last_delta;                /* SENSITIVE stuck test */
        __s64 last_delta2;                /* SENSITIVE stuck test */

        unsigned int flags;                /* Flags used to initialize */
        unsigned int osr;                /* Oversample rate */
#define JENT_MEMORY_ACCESSLOOPS 128
#define JENT_MEMORY_SIZE                                                \
        (CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS *                        \
         CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE)
        unsigned char *mem;        /* Memory access location with size of
                                 * memblocks * memblocksize */
        /* NOTE(review): an unsigned offset into *mem, not a C pointer. */
        unsigned int memlocation; /* Pointer to byte in *mem */
        unsigned int memblocks;        /* Number of memory blocks in *mem */
        unsigned int memblocksize; /* Size of one memory block in bytes */
        unsigned int memaccessloops; /* Number of memory accesses per random
                                      * bit generation */

        /* Repetition Count Test */
        unsigned int rct_count;                        /* Number of stuck values */

        /* Adaptive Proportion Test cutoff values */
        unsigned int apt_cutoff; /* Intermittent health test failure */
        unsigned int apt_cutoff_permanent; /* Permanent health test failure */
#define JENT_APT_WINDOW_SIZE        512        /* Data window size */
        /* LSB of time stamp to process */
#define JENT_APT_LSB                16
        /* Evaluates to 15 (0xf), i.e. a mask for the low four bits. */
#define JENT_APT_WORD_MASK        (JENT_APT_LSB - 1)
        unsigned int apt_observations;        /* Number of collected observations */
        unsigned int apt_count;                /* APT counter */
        unsigned int apt_base;                /* APT base reference */
        unsigned int health_failure;        /* Record health failure */

        unsigned int apt_base_set:1;        /* APT base reference set? */
};

/* Flags that can be used to initialize the RNG */
#define JENT_DISABLE_MEMORY_ACCESS (1<<2) /* Disable memory access for more
                                           * entropy, saves MEMORY_SIZE RAM for
                                           * entropy collector */

/* -- error codes for init function -- */
#define JENT_ENOTIME                1 /* Timer service not available */
#define JENT_ECOARSETIME        2 /* Timer too coarse for RNG */
#define JENT_ENOMONOTONIC        3 /* Timer is not monotonic increasing */
#define JENT_EVARVAR                5 /* Timer does not produce variations of
                                   * variations (2nd derivation of time is
                                   * zero). */
#define JENT_ESTUCK                8 /* Too many stuck results during init. */
#define JENT_EHEALTH                9 /* Health test failed during initialization */
#define JENT_ERCT               10 /* RCT failed during initialization */
#define JENT_EHASH               11 /* Hash self test failed */
#define JENT_EMEM               12 /* Can't allocate memory for initialization */

#define JENT_RCT_FAILURE        1 /* Failure in RCT health test. */
#define JENT_APT_FAILURE        2 /* Failure in APT health test. */
/*
 * A permanent failure code is the matching intermittent code shifted left
 * by JENT_PERMANENT_FAILURE_SHIFT bits. The macro argument is parenthesized
 * so that compound expressions (e.g. "a | b") are shifted as a whole.
 */
#define JENT_PERMANENT_FAILURE_SHIFT        16
#define JENT_PERMANENT_FAILURE(x)        ((x) << JENT_PERMANENT_FAILURE_SHIFT)
#define JENT_RCT_FAILURE_PERMANENT        JENT_PERMANENT_FAILURE(JENT_RCT_FAILURE)
#define JENT_APT_FAILURE_PERMANENT        JENT_PERMANENT_FAILURE(JENT_APT_FAILURE)

/*
 * The output n bits can receive more than n bits of min entropy, of course,
 * but the fixed output of the conditioning function can only asymptotically
 * approach the output size bits of min entropy, not attain that bound. Random
 * maps will tend to have output collisions, which reduces the creditable
 * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound).
 *
 * The value "64" is justified in Appendix A.4 of the current 90C draft,
 * and aligns with NIST's "epsilon" definition in this document, which is
 * that a string can be considered "full entropy" if you can bound the min
 * entropy in each bit of output to at least 1-epsilon, where epsilon is
 * required to be <= 2^(-32).
 */
#define JENT_ENTROPY_SAFETY_FACTOR        64

#include <linux/array_size.h>
#include <linux/fips.h>
#include <linux/minmax.h>
#include "jitterentropy.h"

/***************************************************************************
 * Adaptive Proportion Test
 *
 * This test complies with SP800-90B section 4.4.2.
 ***************************************************************************/

/*
 * See the SP 800-90B comment #10b for the corrected cutoff for the SP 800-90B
 * APT.
 * https://www.untruth.org/~josh/sp80090b/UL%20SP800-90B-final%20comments%20v1.9%2020191212.pdf
 * In the syntax of R, this is C = 2 + qbinom(1 − 2^(−30), 511, 2^(-1/osr)).
 * (The original formula wasn't correct because the first symbol must
 * necessarily have been observed, so there is no chance of observing 0 of these
 * symbols.)
 *
 * For the alpha < 2^-53, R cannot be used as it uses a float data type without
 * arbitrary precision. A SageMath script is used to calculate those cutoff
 * values.
 *
 * For any value above 14, this yields the maximal allowable value of 512
 * (by FIPS 140-2 IG 7.19 Resolution # 16, we cannot choose a cutoff value that
 * renders the test unable to fail).
 */
/*
 * APT cutoffs that flag an intermittent failure. jent_apt_init() reads entry
 * [osr - 1] for an oversampling rate of osr and the last entry for any
 * larger rate.
 */
static const unsigned int jent_apt_cutoff_lookup[15] = {
        325, 422, 459, 477, 488,        /* osr  1 ..  5 */
        494, 499, 502, 505, 507,        /* osr  6 .. 10 */
        508, 509, 510, 511, 512,        /* osr 11 .. 15 */
};

/* APT cutoffs that flag a permanent failure, indexed the same way. */
static const unsigned int jent_apt_cutoff_permanent_lookup[15] = {
        355, 447, 479, 494, 502,        /* osr  1 ..  5 */
        507, 510, 512, 512, 512,        /* osr  6 .. 10 */
        512, 512, 512, 512, 512,        /* osr 11 .. 15 */
};

/*
 * Select the APT cutoff values for an entropy collector
 *
 * The cutoffs are taken from the lookup tables based on the presumed entropy
 * rate of 1/osr. Oversampling rates at or beyond the table size use the last
 * table entry.
 *
 * An oversampling rate of 0 is treated as 1, the same normalisation
 * jent_entropy_collector_alloc() applies. Without it, "osr - 1" would wrap
 * around and index far outside the tables.
 *
 * @ec [in] Reference to entropy collector
 * @osr [in] Oversampling rate
 */
static void jent_apt_init(struct rand_data *ec, unsigned int osr)
{
        if (osr == 0)
                osr = 1;

        if (osr >= ARRAY_SIZE(jent_apt_cutoff_lookup)) {
                /* Rate beyond the table: clamp to the final entries. */
                ec->apt_cutoff = jent_apt_cutoff_lookup[
                        ARRAY_SIZE(jent_apt_cutoff_lookup) - 1];
                ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[
                        ARRAY_SIZE(jent_apt_cutoff_permanent_lookup) - 1];
        } else {
                /* Tables are indexed from zero, rates start at one. */
                ec->apt_cutoff = jent_apt_cutoff_lookup[osr - 1];
                ec->apt_cutoff_permanent =
                                jent_apt_cutoff_permanent_lookup[osr - 1];
        }
}
/*
 * Restart the APT window
 *
 * Installs @delta_masked as the reference value for the next window and
 * zeroes both the match counter and the observation counter.
 *
 * @ec [in] Reference to entropy collector
 * @delta_masked [in] Value that becomes the new APT base
 */
static void jent_apt_reset(struct rand_data *ec, unsigned int delta_masked)
{
        ec->apt_base = delta_masked;
        ec->apt_observations = 0;
        ec->apt_count = 0;
}

/*
 * Feed one measurement into the Adaptive Proportion Test
 *
 * The very first measurement only becomes the base value. Every later
 * measurement equal to the base bumps the match counter, which is compared
 * against the permanent cutoff first and the intermittent cutoff second.
 * Once JENT_APT_WINDOW_SIZE observations have been counted, a new window is
 * started with the current measurement as its base.
 *
 * @ec [in] Reference to entropy collector
 * @delta_masked [in] Masked time delta to process
 */
static void jent_apt_insert(struct rand_data *ec, unsigned int delta_masked)
{
        if (!ec->apt_base_set) {
                /* No base yet: record it and wait for the next sample. */
                ec->apt_base_set = 1;
                ec->apt_base = delta_masked;
                return;
        }

        if (ec->apt_base == delta_masked) {
                ec->apt_count++;

                /*
                 * apt_count holds the matches seen after the base was
                 * recorded; the base itself is not counted here.
                 */
                if (ec->apt_count >= ec->apt_cutoff_permanent) {
                        ec->health_failure |= JENT_APT_FAILURE_PERMANENT;
                } else if (ec->apt_count >= ec->apt_cutoff) {
                        ec->health_failure |= JENT_APT_FAILURE;
                }
        }

        ec->apt_observations++;
        if (ec->apt_observations < JENT_APT_WINDOW_SIZE)
                return;

        /* Window complete: start over with this sample as the base. */
        jent_apt_reset(ec, delta_masked);
}

/***************************************************************************
 * Stuck Test and its use as Repetition Count Test
 *
 * The Jitter RNG uses an enhanced version of the Repetition Count Test
 * (RCT) specified in SP800-90B section 4.4.1. Instead of counting identical
 * back-to-back values, the input to the RCT is the counting of the stuck
 * values during the generation of one Jitter RNG output block.
 *
 * The RCT is applied with an alpha of 2^{-30} compliant to FIPS 140-2 IG 9.8.
 *
 * During the counting operation, the Jitter RNG always calculates the RCT
 * cut-off value of C. If that value exceeds the allowed cut-off value,
 * the Jitter RNG output block will be calculated completely but discarded at
 * the end. The caller of the Jitter RNG is informed with an error code.
 ***************************************************************************/

/*
 * Repetition Count Test as defined in SP800-90B section 4.4.1
 *
 * A non-stuck measurement clears the run counter. A stuck measurement
 * extends the run and compares it against two cutoffs.
 *
 * The cutoffs follow from alpha = 2^-30 (intermittent) or 2^-60 (permanent)
 * as recommended in SP800-90B, together with a required entropy H of 1/osr,
 * the minimum needed to provide full entropy. (DATA_SIZE_BITS +
 * ENTROPY_SAFETY_FACTOR) * osr deltas are collected per block so that the
 * conditioned output has (close to) DATA_SIZE_BITS bits of entropy.
 *
 * ec->rct_count corresponds to value B in the SP800-90B pseudo code but
 * starts at zero, so one is subtracted from the cutoff computed per the
 * standard: C = ceil(-log_2(alpha)/H) = 30*osr or 60*osr.
 *
 * NOTE(review): the intermittent branch rewinds rct_count to -1 as soon as
 * 30 * osr is reached, so with a constant osr the counter looks unable to
 * climb to 60 * osr. Confirm whether the permanent branch is reachable as
 * intended.
 *
 * @ec [in] Reference to entropy collector
 * @stuck [in] Indicator whether the value is stuck
 */
static void jent_rct_insert(struct rand_data *ec, int stuck)
{
        unsigned int run;

        if (!stuck) {
                /* Run of stuck values is broken. */
                ec->rct_count = 0;
                return;
        }

        ec->rct_count++;
        run = (unsigned int)ec->rct_count;

        if (run >= (60 * ec->osr)) {
                ec->rct_count = -1;
                ec->health_failure |= JENT_RCT_FAILURE_PERMANENT;
        } else if (run >= (30 * ec->osr)) {
                ec->rct_count = -1;
                ec->health_failure |= JENT_RCT_FAILURE;
        }
}

/*
 * Distance from @prev to @next on a 64 bit counter
 *
 * When @next is not larger than @prev the counter is assumed to have wrapped
 * around once, and the distance is measured across the wrap. Equal inputs
 * yield 0.
 */
static inline __u64 jent_delta(__u64 prev, __u64 next)
{
#define JENT_UINT64_MAX                (__u64)(~((__u64) 0))
        if (prev < next)
                return next - prev;

        /* Remaining steps up to the wrap, the wrap itself, then @next. */
        return JENT_UINT64_MAX - prev + 1 + next;
}

/*
 * Stuck test on the:
 *         1st derivative of the jitter measurement (time delta)
 *         2nd derivative of the jitter measurement (delta of time deltas)
 *         3rd derivative of the jitter measurement (delta of delta of time deltas)
 *
 * A measurement counts as stuck as soon as any of the three is zero.
 *
 * Besides computing the verdict, the function stores the new first and
 * second derivative in the collector and feeds both health tests: the APT
 * receives the time delta and the RCT receives the stuck verdict.
 *
 * @ec [in] Reference to entropy collector
 * @current_delta [in] Jitter time delta
 *
 * @return
 *         0 jitter measurement not stuck (good bit)
 *         1 jitter measurement stuck (reject bit)
 */
static int jent_stuck(struct rand_data *ec, __u64 current_delta)
{
        const __u64 second = jent_delta(ec->last_delta, current_delta);
        const __u64 third = jent_delta(ec->last_delta2, second);
        int stuck;

        /* Remember the derivatives for the next invocation. */
        ec->last_delta = current_delta;
        ec->last_delta2 = second;

        /*
         * The APT takes an unsigned int, so the 64 bit delta is implicitly
         * converted on this call.
         */
        jent_apt_insert(ec, current_delta);

        stuck = (current_delta == 0) || (second == 0) || (third == 0);

        /* The RCT counts runs of stuck verdicts. */
        jent_rct_insert(ec, stuck);

        return stuck;
}

/*
 * Query the accumulated health test state
 *
 * Outside of FIPS mode the health tests are not enforced and this function
 * always reports success.
 *
 * @ec [in] Reference to entropy collector
 *
 * @return a bitmask indicating which tests failed
 *        0 No health test failure
 *        1 RCT failure
 *        2 APT failure
 *        1<<JENT_PERMANENT_FAILURE_SHIFT RCT permanent failure
 *        2<<JENT_PERMANENT_FAILURE_SHIFT APT permanent failure
 */
static unsigned int jent_health_failure(struct rand_data *ec)
{
        return fips_enabled ? ec->health_failure : 0;
}

/***************************************************************************
 * Noise sources
 ***************************************************************************/

/*
 * Derive the loop count for the next round of an entropy collection from
 * the current time stamp.
 *
 * The time stamp is cut into chunks of @bits bits which are XORed together;
 * 2^@min is then added as a guaranteed lower bound.
 *
 * Input:
 * @bits is the number of low bits of the timer to consider per chunk
 * @min is the exponent of the power of two added to the folded value to
 *        make sure we have a guaranteed minimum value
 *
 * @return Newly calculated loop counter
 */
static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min)
{
        unsigned int mask = (1<<bits) - 1;
        unsigned int round = 0;
        __u64 folded = 0;
        __u64 now = 0;

        jent_get_nstime(&now);

        /*
         * Fold enough chunks to cover all DATA_SIZE_BITS bits so that as
         * many bits of the time stamp as possible end up in the result.
         */
        while (round < ((DATA_SIZE_BITS + bits - 1) / bits)) {
                folded ^= now & mask;
                now >>= bits;
                round++;
        }

        /* Lower boundary for the RNG loop count. */
        return folded + (1<<min);
}

/*
 * CPU Jitter noise source -- this is the noise source based on the CPU
 *                              execution time jitter
 *
 * Hands the time value to jent_hash_time() for insertion into the hash
 * state. A snapshot of the current RCT and APT counters is passed along as
 * additional data.
 *
 * ec [in] entropy collector
 * time [in] time stamp to be injected
 * stuck [in] Is the time stamp identified as stuck?
 *
 * Output:
 * return value of jent_hash_time(), which operates on the hash context of
 * the entropy collector
 */
static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck)
{
#define SHA3_HASH_LOOP (1<<3)
        struct {
                int rct_count;
                unsigned int apt_observations;
                unsigned int apt_count;
                unsigned int apt_base;
        } health_state = {
                .rct_count = ec->rct_count,
                .apt_observations = ec->apt_observations,
                .apt_count = ec->apt_count,
                .apt_base = ec->apt_base,
        };

        return jent_hash_time(ec->hash_state, time, (u8 *)&health_state,
                              sizeof(health_state), SHA3_HASH_LOOP, stuck);
}

/*
 * Memory Access noise source -- this is a noise source based on variations in
 *                                 memory access times
 *
 * This function performs memory accesses which will add to the timing
 * variations due to an unknown amount of CPU wait states that need to be
 * added when accessing memory. The memory size should be larger than the L1
 * caches as outlined in the documentation and the associated testing.
 *
 * The L1 cache has a very high bandwidth, albeit its access rate is usually
 * slower than accessing CPU registers. Therefore, L1 accesses only add minimal
 * variations as the CPU has hardly to wait. Starting with L2, significant
 * variations are added because L2 typically does not belong to the CPU any more
 * and therefore a wider range of CPU wait states is necessary for accesses.
 * L3 and real memory accesses have even a wider range of wait states. However,
 * to reliably access either L3 or memory, the ec->mem memory must be quite
 * large which is usually not desirable.
 *
 * The number of accesses is ec->memaccessloops plus a variable part. The
 * variable part comes from jent_loop_shuffle() unless @loop_cnt overrides it.
 *
 * @ec [in] Reference to the entropy collector with the memory access data -- if
 *            the reference to the memory block to be accessed is NULL, this noise
 *            source is disabled
 * @loop_cnt [in] if a value not equal to 0 is set, it replaces the
 *                  timer-derived variable part of the loop count
 */
static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt)
{
        unsigned int wrap = 0;
        __u64 i = 0;
#define MAX_ACC_LOOP_BIT 7
#define MIN_ACC_LOOP_BIT 0
        /*
         * With 7 folded bits and a minimum of 2^0 the shuffle yields a value
         * between 1 and 128. It runs, including its timer read, before the
         * NULL checks below and therefore also when the memory noise source
         * is disabled.
         */
        __u64 acc_loop_cnt =
                jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT);

        if (NULL == ec || NULL == ec->mem)
                return;
        /* Total size of the access area; memlocation is kept below it. */
        wrap = ec->memblocksize * ec->memblocks;

        /*
         * testing purposes -- allow test app to set the counter, not
         * needed during runtime
         */
        if (loop_cnt)
                acc_loop_cnt = loop_cnt;

        for (i = 0; i < (ec->memaccessloops + acc_loop_cnt); i++) {
                unsigned char *tmpval = ec->mem + ec->memlocation;
                /*
                 * memory access: just add 1 to one byte,
                 * wrap at 255 -- memory access implies read
                 * from and write to memory location
                 */
                *tmpval = (*tmpval + 1) & 0xff;
                /*
                 * Addition of memblocksize - 1 to pointer
                 * with wrap around logic to ensure that every
                 * memory location is hit evenly
                 */
                ec->memlocation = ec->memlocation + ec->memblocksize - 1;
                ec->memlocation = ec->memlocation % wrap;
        }
}

/***************************************************************************
 * Start of entropy processing logic
 ***************************************************************************/
/*
 * This is the heart of the entropy generation: take a time stamp, derive the
 * delta to the previous time stamp, run the stuck test on it and inject the
 * delta into the hash state.
 *
 * WARNING: ensure that ->prev_time is primed before using the output
 *            of this function! This can be done by calling this function
 *            and not using its result.
 *
 * @ec [in] Reference to entropy collector
 * @ret_current_delta [out] If not NULL, receives the raw time delta of this
 *                            measurement
 *
 * @return result of stuck test; also 1 when injecting the data failed
 */
static int jent_measure_jitter(struct rand_data *ec, __u64 *ret_current_delta)
{
        __u64 now = 0;
        __u64 delta = 0;
        int stuck;

        /* Run the memory noise source first so it adds to the variations. */
        jent_memaccess(ec, 0);

        /* Time delta relative to the previous invocation. */
        jent_get_nstime(&now);
        delta = jent_delta(ec->prev_time, now);
        ec->prev_time = now;

        stuck = jent_stuck(ec, delta);

        /* Inject the delta; a failure there invalidates the measurement. */
        if (jent_condition_data(ec, delta, stuck) != 0)
                stuck = 1;

        if (ret_current_delta != NULL)
                *ret_current_delta = delta;

        return stuck;
}

/*
 * Gather the measurements for one output block
 *
 * Measurements are injected into rand_data->hash_state until
 * (DATA_SIZE_BITS + safety factor) * osr non-stuck measurements have been
 * collected, or until a health test failure is reported. The safety factor
 * is JENT_ENTROPY_SAFETY_FACTOR in FIPS mode and 0 otherwise.
 *
 * @ec [in] Reference to entropy collector
 */
static void jent_gen_entropy(struct rand_data *ec)
{
        unsigned int safety_factor = fips_enabled ?
                                     JENT_ENTROPY_SAFETY_FACTOR : 0;
        unsigned int collected = 0;

        /* priming of the ->prev_time value; the result is discarded */
        jent_measure_jitter(ec, NULL);

        while (!jent_health_failure(ec)) {
                /* Stuck measurements do not count and are simply repeated. */
                if (!jent_measure_jitter(ec, NULL)) {
                        collected++;

                        /* ->osr scales the amount to the requested rate. */
                        if (collected >=
                            ((DATA_SIZE_BITS + safety_factor) * ec->osr))
                                return;
                }
        }
}

/*
 * Entry function: Obtain entropy for the caller.
 *
 * This function invokes the entropy gathering logic as often as needed to
 * generate the number of bytes requested by the caller. Each gathering round
 * delivers at most DATA_SIZE_BITS / 8 bytes; the last round is truncated to
 * the exact size specified by the caller.
 *
 * Health test handling:
 *  - A permanent failure flag makes the function return -3 right away.
 *  - An intermittent failure triggers a rerun of the startup tests. If they
 *    pass, -2 is returned and the caller may retry. If they fail, the
 *    intermittent failure is escalated to the matching permanent flag so
 *    that every later call on this instance also returns -3.
 *
 * @ec [in] Reference to entropy collector
 * @data [in] pointer to buffer for storing random data -- buffer must already
 *              exist
 * @len [in] size of the buffer, specifying also the requested number of random
 *             in bytes
 *
 * @return 0 when request is fulfilled or an error
 *
 * The following error codes can occur:
 *        -1        entropy_collector is NULL or the generation failed
 *        -2        Intermittent health failure
 *        -3        Permanent health failure
 */
int jent_read_entropy(struct rand_data *ec, unsigned char *data,
                      unsigned int len)
{
        const unsigned int permanent_mask = JENT_RCT_FAILURE_PERMANENT |
                                            JENT_APT_FAILURE_PERMANENT;
        unsigned char *p = data;

        if (!ec)
                return -1;

        while (len > 0) {
                unsigned int tocopy, health_test_result;

                jent_gen_entropy(ec);

                health_test_result = jent_health_failure(ec);
                if (health_test_result & permanent_mask) {
                        /*
                         * At this point, the Jitter RNG instance is considered
                         * as a failed instance. There is no rerun of the
                         * startup test any more, because the caller
                         * is assumed to not further use this instance.
                         */
                        return -3;
                } else if (health_test_result) {
                        /*
                         * Perform startup health tests and return permanent
                         * error if it fails.
                         */
                        if (jent_entropy_init(0, 0, NULL, ec)) {
                                /*
                                 * Mark the permanent error: drop whatever
                                 * intermittent flags are set and raise the
                                 * permanent counterpart of the failure that
                                 * brought us here. Only intermittent bits are
                                 * set in health_test_result on this path.
                                 */
                                ec->health_failure =
                                        (ec->health_failure & permanent_mask) |
                                        JENT_PERMANENT_FAILURE(health_test_result);
                                return -3;
                        }

                        return -2;
                }

                tocopy = min(DATA_SIZE_BITS / 8, len);
                if (jent_read_random_block(ec->hash_state, p, tocopy))
                        return -1;

                len -= tocopy;
                p += tocopy;
        }

        return 0;
}

/***************************************************************************
 * Initialization logic
 ***************************************************************************/

/*
 * Allocate and set up an entropy collector
 *
 * Unless JENT_DISABLE_MEMORY_ACCESS is set in @flags, a memory area for the
 * memory access noise source is allocated as well. An @osr of 0 is replaced
 * by 1. Before returning, one gathering pass is run on the new collector.
 *
 * @osr [in] Oversampling rate (H_submitter = 1 / osr)
 * @flags [in] JENT_* flags, stored in the collector
 * @hash_state [in] Hash state handle, stored in the collector
 *
 * @return the new collector, or NULL if an allocation failed; release it with
 *         jent_entropy_collector_free()
 */
struct rand_data *jent_entropy_collector_alloc(unsigned int osr,
                                               unsigned int flags,
                                               void *hash_state)
{
        struct rand_data *ec = jent_zalloc(sizeof(struct rand_data));

        if (!ec)
                return NULL;

        if ((flags & JENT_DISABLE_MEMORY_ACCESS) == 0) {
                /* Memory for adding variations based on memory access */
                ec->mem = jent_kvzalloc(JENT_MEMORY_SIZE);
                if (!ec->mem) {
                        jent_zfree(ec);
                        return NULL;
                }
                ec->memaccessloops = JENT_MEMORY_ACCESSLOOPS;
                ec->memblocks = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS;
                ec->memblocksize =
                        CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE;
        }

        /* verify the oversampling rate: H_submitter = 1 / osr */
        if (osr == 0)
                osr = 1;

        ec->hash_state = hash_state;
        ec->flags = flags;
        ec->osr = osr;

        jent_apt_init(ec, osr);

        /* fill the data pad with non-zero values */
        jent_gen_entropy(ec);

        return ec;
}

/*
 * Release an entropy collector
 *
 * Frees the memory access area (of size JENT_MEMORY_SIZE) and then the
 * collector itself. A NULL collector is accepted and ignored, so error paths
 * can call this unconditionally.
 *
 * @entropy_collector [in] Collector obtained from
 *                           jent_entropy_collector_alloc(), or NULL
 */
void jent_entropy_collector_free(struct rand_data *entropy_collector)
{
        if (!entropy_collector)
                return;

        jent_kvzfree(entropy_collector->mem, JENT_MEMORY_SIZE);
        entropy_collector->mem = NULL;
        jent_zfree(entropy_collector);
}

/*
 * Startup test of the timer and the noise source
 *
 * Runs TESTLOOPCOUNT + CLEARCACHE measurements and checks that the timer
 * delivers non-zero, fine grained and (mostly) increasing values, then
 * checks the health test state.
 *
 * If the caller provides an allocated collector in @p_ec it is reused, which
 * implies that the health test entropy data is used to further stir the
 * available entropy pool; @osr, @flags and @hash_state are not used in that
 * case. Otherwise a temporary collector is allocated from these three
 * arguments and released again before returning.
 *
 * @return 0 on success, otherwise one of the JENT_E* error codes
 */
int jent_entropy_init(unsigned int osr, unsigned int flags, void *hash_state,
                      struct rand_data *p_ec)
{
        struct rand_data *ec = p_ec;
        int round, backwards = 0, ret = 0, owns_ec = 0;
        unsigned int health_test_result;

        if (ec) {
                /* Reset the APT and make sure a new APT base is obtained */
                jent_apt_reset(ec, 0);
                ec->apt_base_set = 0;
                /* Reset the RCT */
                ec->rct_count = 0;
                /* Reset intermittent, leave permanent health test result */
                ec->health_failure &= ~(JENT_RCT_FAILURE | JENT_APT_FAILURE);
        } else {
                ec = jent_entropy_collector_alloc(osr, flags, hash_state);
                if (!ec)
                        return JENT_EMEM;
                owns_ec = 1;
        }

        /*
         * No statistical tests are performed here: only a few loop counts
         * are available, they may show some slight skew and would produce
         * false positives. Moreover, only old systems show potentially
         * problematic jitter entropy that could be caught here, and the RNG
         * is intended for hardware that is available or widely used, not
         * for systems long out of favor.
         *
         * Likewise there is no check for system capabilities such as
         * clock_getres or CONFIG_X86_TSC, because the sanity checks below
         * already verify that we have a high-resolution timer.
         */

        /*
         * TESTLOOPCOUNT needs some loops to identify edge systems. 100 is
         * definitely too little.
         *
         * SP800-90B requires at least 1024 initial test cycles.
         */
#define TESTLOOPCOUNT 1024
#define CLEARCACHE 100
        for (round = 0; round < (TESTLOOPCOUNT + CLEARCACHE); round++) {
                __u64 delta = 0, end_time, start_time;

                /* Invoke core entropy collection logic */
                jent_measure_jitter(ec, &delta);
                end_time = ec->prev_time;
                start_time = end_time - delta;

                /* test whether timer works */
                if (!start_time || !end_time) {
                        ret = JENT_ENOTIME;
                        goto out;
                }

                /*
                 * test whether timer is fine grained enough to provide
                 * delta even when called shortly after each other -- this
                 * implies that we also have a high resolution timer
                 */
                if (!delta || (end_time == start_time)) {
                        ret = JENT_ECOARSETIME;
                        goto out;
                }

                /*
                 * The first CLEARCACHE rounds only serve to have an impact
                 * on the caches, branch prediction, etc. with the goal to
                 * clear them and get worst case measurements; nothing
                 * evaluated later is modified during these rounds.
                 *
                 * Afterwards, count how often the timer failed to increase.
                 */
                if (round >= CLEARCACHE && end_time <= start_time)
                        backwards++;
        }

        /*
         * we allow up to three times the time running backwards.
         * CLOCK_REALTIME is affected by adjtime and NTP operations. Thus,
         * if such an operation just happens to interfere with our test, it
         * should not fail. The value of 3 should cover the NTP case being
         * performed during our test run.
         */
        if (backwards > 3) {
                ret = JENT_ENOMONOTONIC;
                goto out;
        }

        /* Did we encounter a health test failure? */
        health_test_result = jent_health_failure(ec);
        if (health_test_result) {
                if (health_test_result & JENT_RCT_FAILURE)
                        ret = JENT_ERCT;
                else
                        ret = JENT_EHEALTH;
        }

out:
        /* Only a collector allocated by this function is released here. */
        if (owns_ec)
                jent_entropy_collector_free(ec);

        return ret;
}


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NS_COMMON_H
#define _LINUX_NS_COMMON_H

#include <linux/refcount.h>
#include <linux/rbtree.h>
#include <uapi/linux/sched.h>

struct proc_ns_operations;

struct cgroup_namespace;
struct ipc_namespace;
struct mnt_namespace;
struct net;
struct pid_namespace;
struct time_namespace;
struct user_namespace;
struct uts_namespace;

extern struct cgroup_namespace init_cgroup_ns;
extern struct ipc_namespace init_ipc_ns;
extern struct mnt_namespace init_mnt_ns;
extern struct net init_net;
extern struct pid_namespace init_pid_ns;
extern struct time_namespace init_time_ns;
extern struct user_namespace init_user_ns;
extern struct uts_namespace init_uts_ns;

extern const struct proc_ns_operations netns_operations;
extern const struct proc_ns_operations utsns_operations;
extern const struct proc_ns_operations ipcns_operations;
extern const struct proc_ns_operations pidns_operations;
extern const struct proc_ns_operations pidns_for_children_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
extern const struct proc_ns_operations cgroupns_operations;
extern const struct proc_ns_operations timens_operations;
extern const struct proc_ns_operations timens_for_children_operations;

/*
 * State shared by all namespace types. The to_ns_common() macro in this
 * header reaches it through a member named "ns" of the specific namespace
 * structure.
 */
struct ns_common {
        /* NOTE(review): presumably one of the CLONE_NEW* values, see ns_common_type() -- confirm */
        u32 ns_type;
        struct dentry *stashed;
        /* Operations table; to_ns_operations() can yield NULL for a disabled namespace type. */
        const struct proc_ns_operations *ops;
        /* NOTE(review): presumably the namespace inode number, cf. ns_init_inum() -- confirm */
        unsigned int inum;
        /* Reference count; access it through the ns_ref_*() helpers below. */
        refcount_t __ns_ref; /* do not use directly */
        /* The id/tree/list members share their storage with the RCU head. */
        union {
                struct {
                        u64 ns_id;
                        struct rb_node ns_tree_node;
                        struct list_head ns_list_node;
                };
                struct rcu_head ns_rcu;
        };
};

int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
void __ns_common_free(struct ns_common *ns);

/*
 * Map a pointer to any of the listed namespace types (const or non-const) to
 * the address of its embedded "ns" member. Any other pointer type has no
 * matching association and fails to compile.
 */
#define to_ns_common(__ns)                                    \
        _Generic((__ns),                                      \
                struct cgroup_namespace *:       &(__ns)->ns, \
                const struct cgroup_namespace *: &(__ns)->ns, \
                struct ipc_namespace *:          &(__ns)->ns, \
                const struct ipc_namespace *:    &(__ns)->ns, \
                struct mnt_namespace *:          &(__ns)->ns, \
                const struct mnt_namespace *:    &(__ns)->ns, \
                struct net *:                    &(__ns)->ns, \
                const struct net *:              &(__ns)->ns, \
                struct pid_namespace *:          &(__ns)->ns, \
                const struct pid_namespace *:    &(__ns)->ns, \
                struct time_namespace *:         &(__ns)->ns, \
                const struct time_namespace *:   &(__ns)->ns, \
                struct user_namespace *:         &(__ns)->ns, \
                const struct user_namespace *:   &(__ns)->ns, \
                struct uts_namespace *:          &(__ns)->ns, \
                const struct uts_namespace *:    &(__ns)->ns)

/*
 * Select the *_NS_INIT_INO constant that belongs to the namespace type of
 * __ns. Only the type of __ns matters; only non-const pointers are accepted.
 */
#define ns_init_inum(__ns)                                     \
        _Generic((__ns),                                       \
                struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
                struct ipc_namespace *:    IPC_NS_INIT_INO,    \
                struct mnt_namespace *:    MNT_NS_INIT_INO,    \
                struct net *:              NET_NS_INIT_INO,    \
                struct pid_namespace *:    PID_NS_INIT_INO,    \
                struct time_namespace *:   TIME_NS_INIT_INO,   \
                struct user_namespace *:   USER_NS_INIT_INO,   \
                struct uts_namespace *:    UTS_NS_INIT_INO)

/*
 * Yield the address of the init_* object declared above whose type matches
 * the namespace type of __ns. Only the type of __ns matters.
 */
#define ns_init_ns(__ns)                                    \
        _Generic((__ns),                                    \
                struct cgroup_namespace *: &init_cgroup_ns, \
                struct ipc_namespace *:    &init_ipc_ns,    \
                struct mnt_namespace *:    &init_mnt_ns,     \
                struct net *:              &init_net,       \
                struct pid_namespace *:    &init_pid_ns,    \
                struct time_namespace *:   &init_time_ns,   \
                struct user_namespace *:   &init_user_ns,   \
                struct uts_namespace *:    &init_uts_ns)

/*
 * Select the proc_ns_operations table for the namespace type of __ns.
 * Every type except the mount namespace yields NULL when its CONFIG_*
 * option is not enabled; the mount namespace table is selected
 * unconditionally.
 */
#define to_ns_operations(__ns)                                                                         \
        _Generic((__ns),                                                                               \
                struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
                struct ipc_namespace *:    (IS_ENABLED(CONFIG_IPC_NS)  ? &ipcns_operations    : NULL), \
                struct mnt_namespace *:    &mntns_operations,                                          \
                struct net *:              (IS_ENABLED(CONFIG_NET_NS)  ? &netns_operations    : NULL), \
                struct pid_namespace *:    (IS_ENABLED(CONFIG_PID_NS)  ? &pidns_operations    : NULL), \
                struct time_namespace *:   (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations   : NULL), \
                struct user_namespace *:   (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations   : NULL), \
                struct uts_namespace *:    (IS_ENABLED(CONFIG_UTS_NS)  ? &utsns_operations    : NULL))

/*
 * Select the CLONE_NEW* flag that corresponds to the namespace type of __ns.
 * Only the type of __ns matters.
 */
#define ns_common_type(__ns)                                \
        _Generic((__ns),                                    \
                struct cgroup_namespace *: CLONE_NEWCGROUP, \
                struct ipc_namespace *:    CLONE_NEWIPC,    \
                struct mnt_namespace *:    CLONE_NEWNS,     \
                struct net *:              CLONE_NEWNET,    \
                struct pid_namespace *:    CLONE_NEWPID,    \
                struct time_namespace *:   CLONE_NEWTIME,   \
                struct user_namespace *:   CLONE_NEWUSER,   \
                struct uts_namespace *:    CLONE_NEWUTS)

/*
 * Type-generic front end for __ns_common_init(): the embedded ns_common,
 * the CLONE_NEW* type and the operations table are all derived from the
 * pointer type of __ns. The inum argument is the matching *_NS_INIT_INO
 * when __ns is the corresponding init_* object, and 0 otherwise.
 * NOTE(review): 0 presumably asks __ns_common_init() to pick an inode
 * number itself -- confirm against its definition.
 *
 * __ns is expanded more than once; pass an expression without side effects.
 */
#define ns_common_init(__ns)                     \
        __ns_common_init(to_ns_common(__ns),     \
                         ns_common_type(__ns),   \
                         to_ns_operations(__ns), \
                         (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0))

/*
 * Same as ns_common_init(), except that the caller supplies the inum
 * argument passed on to __ns_common_init().
 */
#define ns_common_init_inum(__ns, __inum)        \
        __ns_common_init(to_ns_common(__ns),     \
                         ns_common_type(__ns),   \
                         to_ns_operations(__ns), \
                         __inum)

/* Type-generic front end for __ns_common_free() on the embedded ns_common. */
#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))

/*
 * Drop one reference on @ns and hand back the result of
 * refcount_dec_and_test() on its reference counter. The result must not be
 * ignored (__must_check).
 */
static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
{
        refcount_t *ref = &ns->__ns_ref;

        return refcount_dec_and_test(ref);
}

/*
 * Try to take a reference on @ns->__ns_ref.
 *
 * Returns the result of refcount_inc_not_zero(): false means the count was
 * already zero and no reference was taken. The result must not be ignored.
 */
static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
{
        bool acquired = refcount_inc_not_zero(&ns->__ns_ref);

        return acquired;
}

/*
 * Typed wrappers around the __ns_ref reference count of struct ns_common.
 * Each one converts @__ns with to_ns_common() first.
 *
 * ns_ref_read()         - refcount_read() of the count
 * ns_ref_inc()          - refcount_inc()
 * ns_ref_get()          - __ns_ref_get(), i.e. refcount_inc_not_zero()
 * ns_ref_put()          - __ns_ref_put(), i.e. refcount_dec_and_test()
 * ns_ref_put_and_lock() - refcount_dec_and_lock() with @__lock
 */
#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref)
#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
#define ns_ref_put_and_lock(__ns, __lock) \
        refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))

#endif






























































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAP_H
#define _LINUX_SWAP_H

#include <linux/spinlock.h>
#include <linux/linkage.h>
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/node.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
#include <uapi/linux/mempolicy.h>
#include <asm/page.h>

struct notifier_block;

struct bio;

struct pagevec;

#define SWAP_FLAG_PREFER        0x8000        /* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK        0x7fff
#define SWAP_FLAG_DISCARD        0x10000 /* enable discard for swap */
#define SWAP_FLAG_DISCARD_ONCE        0x20000 /* discard swap area at swapon-time */
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */

#define SWAP_FLAGS_VALID        (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
                                 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
                                 SWAP_FLAG_DISCARD_PAGES)
#define SWAP_BATCH 64

/* Nonzero when the calling task has PF_KSWAPD set in its flags, 0 otherwise. */
static inline int current_is_kswapd(void)
{
        int kswapd_bit = current->flags & PF_KSWAPD;

        return kswapd_bit;
}

/*
 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
 * be swapped to.  The swap type and the offset into that swap type are
 * encoded into pte's and into pgoff_t's in the swapcache.  Using five bits
 * for the type means that the maximum number of swapcache pages is 27 bits
 * on 32-bit-pgoff_t architectures.  And that assumes that the architecture packs
 * the type/offset into the pte as 5/27 as well.
 */
#define MAX_SWAPFILES_SHIFT        5

/*
 * Use some of the swap files numbers for other purposes. This
 * is a convenient way to hook into the VM to trigger special
 * actions on faults.
 */

/*
 * PTE markers are used to persist information onto PTEs that otherwise
 * should be a none pte.  As its name "PTE" hints, it should only be
 * applied to the leaves of pgtables.
 */
#define SWP_PTE_MARKER_NUM 1
#define SWP_PTE_MARKER     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
                            SWP_MIGRATION_NUM + SWP_DEVICE_NUM)

/*
 * Unaddressable device memory support. See include/linux/hmm.h and
 * Documentation/mm/hmm.rst. Short description is we need struct pages for
 * device memory that is unaddressable (inaccessible) by CPU, so that we can
 * migrate part of a process memory to device memory.
 *
 * When a page is migrated from CPU to device, we set the CPU page table entry
 * to a special SWP_DEVICE_{READ|WRITE} entry.
 *
 * When a page is mapped by the device for exclusive access we set the CPU page
 * table entries to a special SWP_DEVICE_EXCLUSIVE entry.
 */
#ifdef CONFIG_DEVICE_PRIVATE
#define SWP_DEVICE_NUM 3
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
#else
#define SWP_DEVICE_NUM 0
#endif

/*
 * Page migration support.
 *
 * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and
 * indicates that the referenced (part of an) anonymous page is exclusive to
 * a single process. For SWP_MIGRATION_WRITE, that information is implicit:
 * (parts of) anonymous pages that are mapped writable are exclusive to a
 * single process.
 */
#ifdef CONFIG_MIGRATION
#define SWP_MIGRATION_NUM 3
#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
#define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2)
#else
#define SWP_MIGRATION_NUM 0
#endif

/*
 * Handling of hardware poisoned pages with memory corruption.
 */
#ifdef CONFIG_MEMORY_FAILURE
#define SWP_HWPOISON_NUM 1
#define SWP_HWPOISON                MAX_SWAPFILES
#else
#define SWP_HWPOISON_NUM 0
#endif

/*
 * Number of swap types left over for real swap areas: the
 * (1 << MAX_SWAPFILES_SHIFT) encodable types minus the ones reserved above
 * for device-private, migration, hwpoison and PTE-marker entries.
 */
#define MAX_SWAPFILES \
        ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
        SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
        SWP_PTE_MARKER_NUM)

/*
 * Magic header for a swap area. The first part of the union is
 * what the swap magic looks like for the old (limited to 128MB)
 * swap area format, the second part of the union adds - in the
 * old reserved area - some extra information. Note that the first
 * kilobyte is reserved for boot loader or disk label stuff...
 *
 * Having the magic at the end of the PAGE_SIZE makes detecting swap
 * areas somewhat tricky on machines that support multiple page sizes.
 * For 2.5 we'll probably want to move the magic to just beyond the
 * bootbits...
 */
union swap_header {
        struct {
                char reserved[PAGE_SIZE - 10];
                char magic[10];                        /* SWAP-SPACE or SWAPSPACE2 */
        } magic;
        struct {
                char                bootbits[1024];        /* Space for disklabel etc. */
                __u32                version;
                __u32                last_page;
                __u32                nr_badpages;
                unsigned char        sws_uuid[16];
                unsigned char        sws_volume[16];
                __u32                padding[117];
                __u32                badpages[1];
        } info;
};

/*
 * current->reclaim_state points to one of these when a task is running
 * memory reclaim
 */
struct reclaim_state {
        /* pages reclaimed outside of LRU-based reclaim */
        unsigned long reclaimed;
#ifdef CONFIG_LRU_GEN
        /* per-thread mm walk data */
        struct lru_gen_mm_walk *mm_walk;
#endif
};

/*
 * mm_account_reclaimed_pages(): credit pages reclaimed outside of LRU-based
 * reclaim to the current task
 * @pages: number of pages reclaimed
 *
 * Does nothing unless current->reclaim_state is set; otherwise @pages is
 * added to its ->reclaimed counter.
 */
static inline void mm_account_reclaimed_pages(unsigned long pages)
{
        /* The task is not running reclaim: nothing to account. */
        if (!current->reclaim_state)
                return;

        current->reclaim_state->reclaimed += pages;
}

#ifdef __KERNEL__

struct address_space;
struct sysinfo;
struct writeback_control;
struct zone;

/*
 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
 * disk blocks.  A rbtree of swap extents maps the entire swapfile (Where the
 * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart
 * from setup, they're handled identically.
 *
 * We always assume that blocks are of size PAGE_SIZE.
 */
struct swap_extent {
        struct rb_node rb_node;
        pgoff_t start_page;
        pgoff_t nr_pages;
        sector_t start_block;
};

/*
 * Max bad pages in the new format: the number of int-sized entries that fit
 * between info.badpages and magic.magic in union swap_header.
 */
#define MAX_SWAP_BADPAGES \
        ((offsetof(union swap_header, magic.magic) - \
          offsetof(union swap_header, info.badpages)) / sizeof(int))

/* Bit flags stored in swap_info_struct.flags. */
enum {
        SWP_USED        = (1 << 0),        /* is slot in swap_info[] used? */
        SWP_WRITEOK        = (1 << 1),        /* ok to write to this swap? */
        SWP_DISCARDABLE = (1 << 2),        /* blkdev supports discard */
        SWP_DISCARDING        = (1 << 3),        /* now discarding a free cluster */
        SWP_SOLIDSTATE        = (1 << 4),        /* blkdev seeks are cheap */
        SWP_CONTINUED        = (1 << 5),        /* swap_map has count continuation */
        SWP_BLKDEV        = (1 << 6),        /* it's a block device */
        SWP_ACTIVATED        = (1 << 7),        /* set after swap_activate success */
        SWP_FS_OPS        = (1 << 8),        /* swapfile operations go through fs */
        SWP_AREA_DISCARD = (1 << 9),        /* single-time swap area discards */
        SWP_PAGE_DISCARD = (1 << 10),        /* freed swap page-cluster discards */
        SWP_STABLE_WRITES = (1 << 11),        /* no overwrite PG_writeback pages */
        SWP_SYNCHRONOUS_IO = (1 << 12),        /* synchronous IO is efficient */
                                        /* add others here before... */
};

#define SWAP_CLUSTER_MAX 32UL
#define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10)
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX

/* Bit flag in swap_map */
#define SWAP_HAS_CACHE        0x40        /* Flag page is cached, in first swap_map */
#define COUNT_CONTINUED        0x80        /* Flag swap_map continuation for full count */

/* Special value in first swap_map */
#define SWAP_MAP_MAX        0x3e        /* Max count */
#define SWAP_MAP_BAD        0x3f        /* Note page is bad */
#define SWAP_MAP_SHMEM        0xbf        /* Owned by shmem/tmpfs */

/* Special value in each swap_map continuation */
#define SWAP_CONT_MAX        0x7f        /* Max count */

/*
 * The first page in the swap file is the swap header, which is always marked
 * bad to prevent it from being allocated as an entry. This also prevents the
 * cluster to which it belongs being marked free. Therefore 0 is safe to use as
 * a sentinel to indicate an entry is not valid.
 */
#define SWAP_ENTRY_INVALID        0

#ifdef CONFIG_THP_SWAP
#define SWAP_NR_ORDERS                (PMD_ORDER + 1)
#else
#define SWAP_NR_ORDERS                1
#endif

/*
 * We keep using the same cluster for rotational devices so IO will be
 * sequential. The purpose is to optimize SWAP throughput on these devices.
 */
struct swap_sequential_cluster {
        unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};

/*
 * The in-memory structure used to track swap areas.
 */
struct swap_info_struct {
        struct percpu_ref users;        /* indicate and keep swap device valid. */
        unsigned long        flags;                /* SWP_USED etc: see above */
        signed short        prio;                /* swap priority of this type */
        struct plist_node list;                /* entry in swap_active_head */
        signed char        type;                /* strange name for an index */
        unsigned int        max;                /* extent of the swap_map */
        unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
        unsigned long *zeromap;                /* kvmalloc'ed bitmap to track zero pages */
        struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
        struct list_head free_clusters; /* free clusters list */
        struct list_head full_clusters; /* full clusters list */
        struct list_head nonfull_clusters[SWAP_NR_ORDERS];
                                        /* lists of clusters that contain at least one free slot */
        struct list_head frag_clusters[SWAP_NR_ORDERS];
                                        /* lists of clusters that are fragmented or contended */
        unsigned int pages;                /* total of usable pages of swap */
        atomic_long_t inuse_pages;        /* number of those currently in use */
        struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
        spinlock_t global_cluster_lock;        /* Serialize usage of global cluster */
        struct rb_root swap_extent_root;/* root of the swap extent rbtree */
        struct block_device *bdev;        /* swap device or bdev of swap file */
        struct file *swap_file;                /* seldom referenced */
        struct completion comp;                /* seldom referenced */
        spinlock_t lock;                /*
                                         * protects map-scan related fields like
                                         * swap_map, inuse_pages and all cluster
                                         * lists. Other fields are only changed
                                         * at swapon/swapoff, so are protected
                                         * by swap_lock. Changing flags requires
                                         * holding both this lock and swap_lock.
                                         * If both locks are needed, take
                                         * swap_lock first.
                                         */
        spinlock_t cont_lock;                /*
                                         * protects the swap count continuation
                                         * page list.
                                         */
        struct work_struct discard_work; /* discard worker */
        struct work_struct reclaim_work; /* reclaim worker */
        struct list_head discard_clusters; /* discard clusters list */
        struct plist_node avail_lists[]; /*
                                           * entries in swap_avail_heads, one
                                           * entry per node.
                                           * Must be last as the number of
                                           * elements is nr_node_ids, which is
                                           * not a fixed value, so it has to be
                                           * allocated dynamically.
                                           * And it has to be an array so that
                                           * plist_for_each_* can work.
                                           */
};

/*
 * Swap entry for @page: the swap entry of the folio containing it, with its
 * value advanced by the page's index within that folio.
 */
static inline swp_entry_t page_swap_entry(struct page *page)
{
        struct folio *owner = page_folio(page);
        swp_entry_t ent;

        ent = owner->swap;
        ent.val = ent.val + folio_page_idx(owner, page);

        return ent;
}

/* linux/mm/workingset.c */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
                                bool flush);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
void workingset_activation(struct folio *folio);

/* linux/mm/page_alloc.c */
extern unsigned long totalreserve_pages;

/* Definition of global_zone_page_state not available yet */
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)


/* linux/mm/swap.c */
void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
                unsigned int nr_io, unsigned int nr_rotated)
                __releases(lruvec->lru_lock);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);

static inline bool folio_may_be_lru_cached(struct folio *folio)
{
        /*
         * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting.
         * Holding small numbers of low-order mTHP folios in per-CPU LRU cache
         * will be sensible, but nobody has implemented and tested that yet.
         */
        return !folio_test_large(folio);
}

extern atomic_t lru_disable_count;

/* True while lru_disable_count is nonzero. */
static inline bool lru_cache_disabled(void)
{
        return atomic_read(&lru_disable_count) != 0;
}

/*
 * Atomically decrement lru_disable_count.
 * NOTE(review): presumably undoes one earlier lru_cache_disable() call;
 * confirm against its definition.
 */
static inline void lru_cache_enable(void)
{
        atomic_dec(&lru_disable_count);
}

extern void lru_cache_disable(void);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
void folio_deactivate(struct folio *folio);
void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);

/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask, nodemask_t *mask);

#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
#define MIN_SWAPPINESS 0
#define MAX_SWAPPINESS 200

/* Just reclaim from anon folios in proactive memory reclaim */
#define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1)

extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                                  unsigned long nr_pages,
                                                  gfp_t gfp_mask,
                                                  unsigned int reclaim_options,
                                                  int *swappiness);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
                                                gfp_t gfp_mask, bool noswap,
                                                pg_data_t *pgdat,
                                                unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
long remove_mapping(struct address_space *mapping, struct folio *folio);

/*
 * Per-node reclaim registration hooks. The real implementations are only
 * declared when both CONFIG_SYSFS and CONFIG_NUMA are set; otherwise the
 * stubs below are used.
 */
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
extern int reclaim_register_node(struct node *node);
extern void reclaim_unregister_node(struct node *node);

#else

/* Stub: nothing to register, always returns 0. */
static inline int reclaim_register_node(struct node *node)
{
        return 0;
}

/* Stub: nothing to unregister. */
static inline void reclaim_unregister_node(struct node *node)
{
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

#ifdef CONFIG_NUMA
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
#endif

void check_move_unevictable_folios(struct folio_batch *fbatch);

extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);

#ifdef CONFIG_SWAP

int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
                sector_t *);

/* Current value of the global NR_SWAPCACHE node page-state counter. */
static inline unsigned long total_swapcache_pages(void)
{
        unsigned long nr_cached = global_node_page_state(NR_SWAPCACHE);

        return nr_cached;
}

void free_swap_cache(struct folio *folio);
void free_folio_and_swap_cache(struct folio *folio);
void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;

/*
 * Swap 50% full? Release swapcache more aggressively..
 *
 * True when twice nr_swap_pages is below total_swap_pages.
 */
static inline bool vm_swap_full(void)
{
        long nr_avail = atomic_long_read(&nr_swap_pages);

        return nr_avail * 2 < total_swap_pages;
}

/* Snapshot of the nr_swap_pages counter. */
static inline long get_nr_swap_pages(void)
{
        long nr = atomic_long_read(&nr_swap_pages);

        return nr;
}

extern void si_swapinfo(struct sysinfo *);
int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t, int);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t entry, int nr);
extern void swap_free_nr(swp_entry_t entry, int nr_pages);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
struct backing_dev_info;
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);

/*
 * Drop one reference on @si->users.
 * NOTE(review): presumably pairs with a successful get_swap_device();
 * confirm against its definition.
 */
static inline void put_swap_device(struct swap_info_struct *si)
{
        percpu_ref_put(&si->users);
}

#else /* CONFIG_SWAP */
/* !CONFIG_SWAP: there is never a swap device to look up. */
static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
        return NULL;
}

/* !CONFIG_SWAP: no reference to drop. */
static inline void put_swap_device(struct swap_info_struct *si)
{
}

/* !CONFIG_SWAP: all swap counters are constant zero and swap is never "full". */
#define get_nr_swap_pages()                        0L
#define total_swap_pages                        0L
#define total_swapcache_pages()                        0UL
#define vm_swap_full()                                0

/* !CONFIG_SWAP: report zero free and total swap in @val. */
#define si_swapinfo(val) \
        do { (val)->freeswap = (val)->totalswap = 0; } while (0)
/* !CONFIG_SWAP: no swap cache to release, so only folio_put() is called. */
#define free_folio_and_swap_cache(folio) \
        folio_put(folio)
/*
 * !CONFIG_SWAP: there is no swap cache to release, so only release the
 * pages themselves.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, which keeps the macro usable as the sole statement of an if/else arm.
 */
#define free_pages_and_swap_cache(pages, nr) \
        release_pages((pages), (nr))

/*
 * !CONFIG_SWAP stubs. With swap compiled out these do nothing; the ones
 * that return a value return 0 or false, except where noted below.
 */
static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
}

static inline void free_swap_cache(struct folio *folio)
{
}

static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
        return 0;
}

static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
{
}

static inline int swap_duplicate(swp_entry_t swp)
{
        return 0;
}

static inline int swapcache_prepare(swp_entry_t swp, int nr)
{
        return 0;
}

static inline void swap_free_nr(swp_entry_t entry, int nr_pages)
{
}

static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
{
}

static inline int __swap_count(swp_entry_t entry)
{
        return 0;
}

static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
        return false;
}

static inline int swp_swapcount(swp_entry_t entry)
{
        return 0;
}

/* Swap allocation always fails with -EINVAL when swap is compiled out. */
static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask)
{
        return -EINVAL;
}

static inline bool folio_free_swap(struct folio *folio)
{
        return false;
}

/* Adding a swap extent always fails with -EINVAL when swap is compiled out. */
static inline int add_swap_extent(struct swap_info_struct *sis,
                                  unsigned long start_page,
                                  unsigned long nr_pages, sector_t start_block)
{
        return -EINVAL;
}
#endif /* CONFIG_SWAP */

/* Single-entry convenience wrapper: free_swap_and_cache_nr(@entry, 1). */
static inline void free_swap_and_cache(swp_entry_t entry)
{
        free_swap_and_cache_nr(entry, 1);
}

/* Single-entry convenience wrapper: swap_free_nr(@entry, 1). */
static inline void swap_free(swp_entry_t entry)
{
        swap_free_nr(entry, 1);
}

#ifdef CONFIG_MEMCG
/*
 * Swappiness to use for @memcg.
 *
 * The global vm_swappiness is returned when the memory controller is on the
 * default hierarchy (cgroup2 doesn't have per-cgroup swappiness), when memcg
 * is disabled, or when @memcg is the root group. Otherwise the group's own
 * ->swappiness is returned.
 */
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
            mem_cgroup_disabled() ||
            mem_cgroup_is_root(memcg))
                return READ_ONCE(vm_swappiness);

        return READ_ONCE(memcg->swappiness);
}
#else
/* !CONFIG_MEMCG: only the global vm_swappiness exists; @mem is unused. */
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
        return READ_ONCE(vm_swappiness);
}
#endif

#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
/*
 * Forward to __folio_throttle_swaprate() unless the memory controller is
 * disabled, in which case this is a no-op.
 */
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
        if (!mem_cgroup_disabled())
                __folio_throttle_swaprate(folio, gfp);
}
#else
/* Stub used unless CONFIG_SWAP, CONFIG_MEMCG and CONFIG_BLK_CGROUP are all set. */
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
}
#endif

#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
/*
 * Returns 0 without charging when the memory controller is disabled;
 * otherwise returns the result of __mem_cgroup_try_charge_swap().
 */
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
                                             swp_entry_t entry)
{
        return mem_cgroup_disabled() ?
                0 : __mem_cgroup_try_charge_swap(folio, entry);
}

extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
/*
 * Forward to __mem_cgroup_uncharge_swap() for @nr_pages entries starting at
 * @entry, unless the memory controller is disabled.
 */
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
                                            unsigned int nr_pages)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge_swap(entry, nr_pages);
}

extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct folio *folio);
#else
/*
 * Stubs used unless both CONFIG_MEMCG and CONFIG_SWAP are set: charging
 * always succeeds, uncharging does nothing, and the per-cgroup queries fall
 * back to the global swap state.
 */
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
                                             swp_entry_t entry)
{
        return 0;
}

static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
                                            unsigned int nr_pages)
{
}

/* No per-cgroup accounting: report the global get_nr_swap_pages(). */
static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
        return get_nr_swap_pages();
}

/* No per-cgroup accounting: report the global vm_swap_full(). */
static inline bool mem_cgroup_swap_full(struct folio *folio)
{
        return vm_swap_full();
}
#endif

#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */




















































































































































































































































































    4 

























    4 












































































































































































































































































    4 






























    4 









































































































    4 




















    4 






    4 











































    4 




























































































    4 






























































































































































































































































































































    4 






    4 
    4 

    4 







    4 















    4 






    4 
























    4 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 















    4 

    4 
    4 


    4 






    4 
    4 






    4 

    4 
    4 
    4 

    4 












    4 



















































































































































































































































































































































































































































































































































































































































































































    4 







    4 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 








    4 
    4 

    4 

    4 
    4 


    4 


    4 










    4 
    4 

    4 














































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
// SPDX-License-Identifier: GPL-2.0+
/*
 * Maple Tree implementation
 * Copyright (c) 2018-2022 Oracle Corporation
 * Authors: Liam R. Howlett <Liam.Howlett@oracle.com>
 *            Matthew Wilcox <willy@infradead.org>
 * Copyright (c) 2023 ByteDance
 * Author: Peng Zhang <zhangpeng.00@bytedance.com>
 */

/*
 * DOC: Interesting implementation details of the Maple Tree
 *
 * Each node type has a number of slots for entries and a number of slots for
 * pivots.  In the case of dense nodes, the pivots are implied by the position
 * and are simply the slot index + the minimum of the node.
 *
 * In regular B-Tree terms, pivots are called keys.  The term pivot is used to
 * indicate that the tree is specifying ranges.  Pivots may appear in the
 * subtree with an entry attached to the value whereas keys are unique to a
 * specific position of a B-tree.  Pivot values are inclusive of the slot with
 * the same index.
 *
 *
 * The following illustrates the layout of a range64 node's slots and pivots.
 *
 *
 *  Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 |
 *           ┬   ┬   ┬   ┬     ┬    ┬    ┬    ┬    ┬
 *           │   │   │   │     │    │    │    │    └─ Implied maximum
 *           │   │   │   │     │    │    │    └─ Pivot 14
 *           │   │   │   │     │    │    └─ Pivot 13
 *           │   │   │   │     │    └─ Pivot 12
 *           │   │   │   │     └─ Pivot 11
 *           │   │   │   └─ Pivot 2
 *           │   │   └─ Pivot 1
 *           │   └─ Pivot 0
 *           └─  Implied minimum
 *
 * Slot contents:
 *  Internal (non-leaf) nodes contain pointers to other nodes.
 *  Leaf nodes contain entries.
 *
 * The location of interest is often referred to as an offset.  All offsets have
 * a slot, but the last offset has an implied pivot from the node above (or
 * ULONG_MAX for the root node).
 *
 * Ranges complicate certain write activities.  When modifying any of
 * the B-tree variants, it is known that one entry will either be added or
 * deleted.  When modifying the Maple Tree, one store operation may overwrite
 * the entire data set, or one half of the tree, or the middle half of the tree.
 *
 */


#include <linux/maple_tree.h>
#include <linux/xarray.h>
#include <linux/types.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/limits.h>
#include <asm/barrier.h>

#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>

/*
 * Name of the calling function, passed through tracepoint_string().
 * NOTE(review): presumably used as the function argument of the maple tree
 * trace events - confirm against the callers.
 */
#define TP_FCT tracepoint_string(__func__)

/*
 * Kernel pointer hashing renders much of the maple tree dump useless as tagged
 * pointers get hashed to arbitrary values.
 *
 * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is
 * permissible to bypass this. Otherwise remain cautious and retain the hashing.
 *
 * Userland doesn't know about %px so also use %p there.
 */
#if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Kernel debug build: bypass the pointer hashing */
#define PTR_FMT "%px"
#else
/* Userland, or a non-debug kernel: plain %p */
#define PTR_FMT "%p"
#endif /* __KERNEL__ && CONFIG_DEBUG_VM_MAPLE_TREE */

/* Low bit of a node's parent pointer; when set the node is the tree root */
#define MA_ROOT_PARENT 1

/*
 * Maple state flags
 * * MA_STATE_PREALLOC                - Preallocated nodes, WARN_ON allocation
 */
#define MA_STATE_PREALLOC        1

/* Cast @x to a parent-node pointer */
#define ma_parent_ptr(x) ((struct maple_pnode *)(x))
/*
 * Parent value for a root node: the address of the tree of maple state @x with
 * MA_ROOT_PARENT set.  @x is parenthesised so that any pointer expression
 * (e.g. "&mas" or "p + 1") expands correctly.
 */
#define mas_tree_parent(x) ((unsigned long)((x)->tree) | MA_ROOT_PARENT)
/* Cast @x to a bare maple node pointer */
#define ma_mnode_ptr(x) ((struct maple_node *)(x))
/* Cast @x to an encoded maple node pointer */
#define ma_enode_ptr(x) ((struct maple_enode *)(x))
/* Slab cache that all maple nodes in this file are allocated from */
static struct kmem_cache *maple_node_cache;

#ifdef CONFIG_DEBUG_MAPLE_TREE
/* Per node type maximum, indexed by enum maple_type (debug builds only) */
static const unsigned long mt_max[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS,
        [maple_leaf_64]                = ULONG_MAX,
        [maple_range_64]        = ULONG_MAX,
        [maple_arange_64]        = ULONG_MAX,
};
/* Look up mt_max[] by the node type encoded in the maple_enode @x */
#define mt_node_max(x) mt_max[mte_node_type(x)]
#endif

/* Number of slots for each node type, indexed by enum maple_type */
static const unsigned char mt_slots[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS,
        [maple_leaf_64]                = MAPLE_RANGE64_SLOTS,
        [maple_range_64]        = MAPLE_RANGE64_SLOTS,
        [maple_arange_64]        = MAPLE_ARANGE64_SLOTS,
};
/* Look up mt_slots[] by the node type encoded in the maple_enode @x */
#define mt_slot_count(x) mt_slots[mte_node_type(x)]

/*
 * Number of pivots for each node type, indexed by enum maple_type.  Range
 * nodes have one pivot fewer than slots; dense nodes have none.
 */
static const unsigned char mt_pivots[] = {
        [maple_dense]                = 0,
        [maple_leaf_64]                = MAPLE_RANGE64_SLOTS - 1,
        [maple_range_64]        = MAPLE_RANGE64_SLOTS - 1,
        [maple_arange_64]        = MAPLE_ARANGE64_SLOTS - 1,
};
/* Look up mt_pivots[] by the node type encoded in the maple_enode @x */
#define mt_pivot_count(x) mt_pivots[mte_node_type(x)]

/*
 * Minimum slot count for each node type, indexed by enum maple_type.
 * NOTE(review): presumably the occupancy below which a node is treated as
 * deficient - confirm against the users of mt_min_slot_count().
 */
static const unsigned char mt_min_slots[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS / 2,
        [maple_leaf_64]                = (MAPLE_RANGE64_SLOTS / 2) - 2,
        [maple_range_64]        = (MAPLE_RANGE64_SLOTS / 2) - 2,
        [maple_arange_64]        = (MAPLE_ARANGE64_SLOTS / 2) - 1,
};
/* Look up mt_min_slots[] by the node type encoded in the maple_enode @x */
#define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)]

/* A big node holds twice the slots of a range64 node, plus two */
#define MAPLE_BIG_NODE_SLOTS        (MAPLE_RANGE64_SLOTS * 2 + 2)
/* ... and twice the slots of an arange64 node, plus one, worth of gaps */
#define MAPLE_BIG_NODE_GAPS        (MAPLE_ARANGE64_SLOTS * 2 + 1)

/*
 * struct maple_big_node - oversized node image.
 * NOTE(review): presumably a scratch buffer used while splitting/rebalancing
 * nodes - confirm against its users later in the file.
 */
struct maple_big_node {
        /* One pivot fewer than slots, as in the regular range nodes */
        unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1];
        union {
                struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS];
                struct {
                        /* Shares storage with the start of slot[] */
                        unsigned long padding[MAPLE_BIG_NODE_GAPS];
                        /* Gap values, stored after the padding */
                        unsigned long gap[MAPLE_BIG_NODE_GAPS];
                };
        };
        unsigned char b_end;        /* presumably the last used offset - confirm */
        enum maple_type type;        /* Node type this image represents */
};

/*
 * The maple_subtree_state is used to build a tree to replace a segment of an
 * existing tree in a more atomic way.  Any walkers of the older tree will hit a
 * dead node and restart on updates.
 */
struct maple_subtree_state {
        struct ma_state *orig_l;        /* Original left side of subtree */
        struct ma_state *orig_r;        /* Original right side of subtree */
        struct ma_state *l;                /* New left side of subtree */
        struct ma_state *m;                /* New middle of subtree (rare) */
        struct ma_state *r;                /* New right side of subtree */
        struct ma_topiary *free;        /* nodes to be freed */
        struct ma_topiary *destroy;        /* Nodes to be destroyed (walked and freed) */
        struct maple_big_node *bn;        /* Big node; presumably the staging buffer - confirm */
};

/*
 * noinline_for_kasan: expands to noinline_for_stack when CONFIG_KASAN_STACK is
 * enabled and to plain "inline" otherwise.
 */
#ifdef CONFIG_KASAN_STACK
/* Prevent mas_wr_bnode() from exceeding the stack frame limit */
#define noinline_for_kasan noinline_for_stack
#else
#define noinline_for_kasan inline
#endif

/* Functions */
/*
 * mt_alloc_one() - Allocate a single maple node.
 * @gfp: The allocation flags handed to the slab allocator
 *
 * Return: The object returned by kmem_cache_alloc() for maple_node_cache.
 */
static inline struct maple_node *mt_alloc_one(gfp_t gfp)
{
        struct maple_node *node = kmem_cache_alloc(maple_node_cache, gfp);

        return node;
}

/*
 * mt_free_bulk() - Return an array of nodes to maple_node_cache.
 * @size: The number of pointers in @nodes
 * @nodes: The array of node pointers to free
 */
static inline void mt_free_bulk(size_t size, void __rcu **nodes)
{
        void **objs = (void **)nodes;

        kmem_cache_free_bulk(maple_node_cache, size, objs);
}

/*
 * mt_return_sheaf() - Hand a sheaf back to maple_node_cache.
 * @sheaf: The sheaf to return
 *
 * Always uses GFP_NOWAIT for the return.
 */
static void mt_return_sheaf(struct slab_sheaf *sheaf)
{
        struct kmem_cache *cache = maple_node_cache;

        kmem_cache_return_sheaf(cache, GFP_NOWAIT, sheaf);
}

/*
 * mt_get_sheaf() - Obtain a prefilled sheaf from maple_node_cache.
 * @gfp: The allocation flags
 * @count: The count passed on to kmem_cache_prefill_sheaf()
 *
 * Return: Whatever kmem_cache_prefill_sheaf() returns.
 */
static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count)
{
        struct slab_sheaf *sheaf;

        sheaf = kmem_cache_prefill_sheaf(maple_node_cache, gfp, count);
        return sheaf;
}

/*
 * mt_refill_sheaf() - Refill a sheaf from maple_node_cache.
 * @gfp: The allocation flags
 * @sheaf: Pointer to the sheaf pointer to refill
 * @size: The size passed on to kmem_cache_refill_sheaf()
 *
 * Return: The result of kmem_cache_refill_sheaf().
 */
static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf,
                unsigned int size)
{
        int ret;

        ret = kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size);
        return ret;
}

/*
 * ma_free_rcu() - Free a maple node after an RCU grace period.
 * @node: The node to free
 *
 * A node whose parent pointer points back at the node itself is marked as no
 * longer in use; warn if @node has not been marked that way before it is
 * queued with kfree_rcu().
 */
static void ma_free_rcu(struct maple_node *node)
{
        struct maple_pnode *self = ma_parent_ptr(node);

        WARN_ON(node->parent != self);
        kfree_rcu(node, rcu);
}

/*
 * mt_set_height() - Store the tree height in the tree flags.
 * @mt: The maple tree
 * @height: The new height, expected to be at most MAPLE_HEIGHT_MAX
 *
 * Replaces only the height bits of mt->ma_flags; all other flags are kept.
 */
static void mt_set_height(struct maple_tree *mt, unsigned char height)
{
        unsigned int flags = mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK;

        MT_BUG_ON(mt, height > MAPLE_HEIGHT_MAX);
        flags |= height << MT_FLAGS_HEIGHT_OFFSET;
        mt->ma_flags = flags;
}

/*
 * mas_mt_height() - Get the height of the tree a maple state operates on.
 * @mas: The maple state
 *
 * Return: mt_height() of mas->tree.
 */
static unsigned int mas_mt_height(struct ma_state *mas)
{
        struct maple_tree *mt = mas->tree;

        return mt_height(mt);
}

/*
 * mt_attr() - Get the tree flags without the height bits.
 * @mt: The maple tree
 *
 * Return: mt->ma_flags with the MT_FLAGS_HEIGHT_MASK bits cleared.
 */
static inline unsigned int mt_attr(struct maple_tree *mt)
{
        unsigned int attrs = mt->ma_flags;

        attrs &= ~MT_FLAGS_HEIGHT_MASK;
        return attrs;
}

/*
 * mte_node_type() - Extract the node type from an encoded maple node.
 * @entry: The encoded maple node
 *
 * Return: The type bits of @entry (shifted by MAPLE_NODE_TYPE_SHIFT and
 * masked with MAPLE_NODE_TYPE_MASK).
 */
static __always_inline enum maple_type mte_node_type(
                const struct maple_enode *entry)
{
        unsigned long bits = (unsigned long)entry;

        return (bits >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK;
}

/*
 * ma_is_dense() - Check if a node type is dense.
 * @type: The maple node type
 *
 * Return: true if @type orders before maple_leaf_64, false otherwise.
 */
static __always_inline bool ma_is_dense(const enum maple_type type)
{
        if (type < maple_leaf_64)
                return true;

        return false;
}

/*
 * ma_is_leaf() - Check if a node type is a leaf type.
 * @type: The maple node type
 *
 * Return: true if @type orders before maple_range_64, false otherwise.
 */
static __always_inline bool ma_is_leaf(const enum maple_type type)
{
        if (type < maple_range_64)
                return true;

        return false;
}

/*
 * mte_is_leaf() - Check if an encoded maple node is a leaf.
 * @entry: The encoded maple node
 *
 * Return: true if the type encoded in @entry is a leaf type.
 */
static __always_inline bool mte_is_leaf(const struct maple_enode *entry)
{
        enum maple_type type = mte_node_type(entry);

        return ma_is_leaf(type);
}

/*
 * mt_is_reserved() - Check if an entry is a reserved value.
 * @entry: The entry to check
 *
 * We also reserve values with the bottom two bits set to '10' which are
 * below 4096
 *
 * Return: true if @entry is below MAPLE_RESERVED_RANGE and is an xarray
 * internal entry, false otherwise.
 */
static __always_inline bool mt_is_reserved(const void *entry)
{
        if ((unsigned long)entry >= MAPLE_RESERVED_RANGE)
                return false;

        return xa_is_internal(entry);
}

/*
 * mas_set_err() - Put a maple state into the error state.
 * @mas: The maple state
 * @err: The error code to record
 *
 * Sets the status to ma_error and stores MA_ERROR(@err) in mas->node.
 */
static __always_inline void mas_set_err(struct ma_state *mas, long err)
{
        mas->status = ma_error;
        mas->node = MA_ERROR(err);
}

/*
 * mas_is_ptr() - Check if the maple state status is ma_root.
 * @mas: The maple state
 *
 * Return: true if mas->status is ma_root, false otherwise.
 */
static __always_inline bool mas_is_ptr(const struct ma_state *mas)
{
        if (mas->status != ma_root)
                return false;

        return true;
}

/*
 * mas_is_start() - Check if the maple state status is ma_start.
 * @mas: The maple state
 *
 * Return: true if mas->status is ma_start, false otherwise.
 */
static __always_inline bool mas_is_start(const struct ma_state *mas)
{
        if (mas->status != ma_start)
                return false;

        return true;
}

/*
 * mas_is_none() - Check if the maple state status is ma_none.
 * @mas: The maple state
 *
 * Return: true if mas->status is ma_none, false otherwise.
 */
static __always_inline bool mas_is_none(const struct ma_state *mas)
{
        if (mas->status != ma_none)
                return false;

        return true;
}

/*
 * mas_is_paused() - Check if the maple state status is ma_pause.
 * @mas: The maple state
 *
 * Return: true if mas->status is ma_pause, false otherwise.
 */
static __always_inline bool mas_is_paused(const struct ma_state *mas)
{
        if (mas->status != ma_pause)
                return false;

        return true;
}

/*
 * mas_is_overflow() - Check if the maple state status is ma_overflow.
 * @mas: The maple state; only read, so it is const-qualified like the other
 *       mas_is_*() status predicates
 *
 * Return: true if mas->status is ma_overflow, false otherwise.
 */
static __always_inline bool mas_is_overflow(const struct ma_state *mas)
{
        return mas->status == ma_overflow;
}

static inline bool mas_is_underflow(struct ma_state *mas)
{
        return mas->status == ma_underflow;
}

/*
 * mte_to_node() - Convert an encoded maple node to a bare node pointer.
 * @entry: The encoded maple node
 *
 * Return: @entry with the MAPLE_NODE_MASK bits cleared.
 */
static __always_inline struct maple_node *mte_to_node(
                const struct maple_enode *entry)
{
        unsigned long addr = (unsigned long)entry & ~MAPLE_NODE_MASK;

        return (struct maple_node *)addr;
}

/*
 * mte_to_mat() - Convert an encoded maple node to a maple topiary node.
 * @entry: The encoded maple node
 *
 * Return: @entry with the MAPLE_NODE_MASK bits cleared, as a maple topiary
 * pointer.
 */
static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry)
{
        unsigned long addr = (unsigned long)entry & ~MAPLE_NODE_MASK;

        return (struct maple_topiary *)addr;
}

/*
 * mas_mn() - Get the bare node pointer of the maple state.
 * @mas: The maple state
 *
 * Return: mas->node decoded with mte_to_node() (not encoded - bare pointer).
 */
static inline struct maple_node *mas_mn(const struct ma_state *mas)
{
        const struct maple_enode *enode = mas->node;

        return mte_to_node(enode);
}

/*
 * mte_set_node_dead() - Set a maple encoded node as dead.
 * @mn: The maple encoded node.
 *
 * A node is marked dead by making its parent pointer point at the node itself;
 * ma_dead_node() tests for exactly that.  The store is followed by a write
 * barrier.
 */
static inline void mte_set_node_dead(struct maple_enode *mn)
{
        /* Parent == self is the "dead" marker */
        mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn));
        smp_wmb(); /* Needed for RCU */
}

/* Low bits of an encoded node pointer (see mt_mk_node() and mte_mk_root()) */
/* Bit 1 indicates the root is a node */
#define MAPLE_ROOT_NODE                        0x02
/* maple_type stored bit 3-6 */
#define MAPLE_ENODE_TYPE_SHIFT                0x03
/* Bit 2 means a NULL somewhere below */
#define MAPLE_ENODE_NULL                0x04

/*
 * mt_mk_node() - Encode a node pointer together with its type.
 * @node: The bare maple node
 * @type: The maple node type
 *
 * Return: @node with @type stored from bit MAPLE_ENODE_TYPE_SHIFT upwards and
 * MAPLE_ENODE_NULL set.
 */
static inline struct maple_enode *mt_mk_node(const struct maple_node *node,
                                             enum maple_type type)
{
        unsigned long encoded = (unsigned long)node;

        encoded |= type << MAPLE_ENODE_TYPE_SHIFT;
        encoded |= MAPLE_ENODE_NULL;

        return (void *)encoded;
}

/*
 * mte_mk_root() - Mark an encoded node as the root node.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ROOT_NODE bit set.
 */
static inline void *mte_mk_root(const struct maple_enode *node)
{
        unsigned long root = (unsigned long)node;

        root |= MAPLE_ROOT_NODE;
        return (void *)root;
}

/*
 * mte_safe_root() - Strip the root marker from an encoded node.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ROOT_NODE bit cleared.
 */
static inline void *mte_safe_root(const struct maple_enode *node)
{
        unsigned long enode = (unsigned long)node;

        enode &= ~MAPLE_ROOT_NODE;
        return (void *)enode;
}

/*
 * mte_set_full() - Mark an encoded node as full.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ENODE_NULL bit cleared.
 */
static inline void __maybe_unused *mte_set_full(const struct maple_enode *node)
{
        unsigned long enode = (unsigned long)node;

        enode &= ~MAPLE_ENODE_NULL;
        return (void *)enode;
}

/*
 * mte_clear_full() - Mark an encoded node as not full.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ENODE_NULL bit set.
 */
static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node)
{
        unsigned long enode = (unsigned long)node;

        enode |= MAPLE_ENODE_NULL;
        return (void *)enode;
}

/*
 * mte_has_null() - Check the NULL marker of an encoded node.
 * @node: The encoded maple node
 *
 * Return: true if the MAPLE_ENODE_NULL bit is set in @node.
 */
static inline bool __maybe_unused mte_has_null(const struct maple_enode *node)
{
        unsigned long enode = (unsigned long)node;

        return (enode & MAPLE_ENODE_NULL) != 0;
}

/*
 * ma_is_root() - Check if a node is the root node.
 * @node: The maple node
 *
 * Return: true if the MA_ROOT_PARENT bit is set in the node's parent pointer.
 */
static __always_inline bool ma_is_root(struct maple_node *node)
{
        unsigned long parent = (unsigned long)node->parent;

        return (parent & MA_ROOT_PARENT) != 0;
}

/*
 * mte_is_root() - Check if an encoded node is the root node.
 * @node: The encoded maple node
 *
 * Return: ma_is_root() of the decoded node.
 */
static __always_inline bool mte_is_root(const struct maple_enode *node)
{
        struct maple_node *mn = mte_to_node(node);

        return ma_is_root(mn);
}

static inline bool mas_is_root_limits(const struct ma_state *mas)
{
        return !mas->min && mas->max == ULONG_MAX;
}

/*
 * mt_is_alloc() - Check if the tree has MT_FLAGS_ALLOC_RANGE set.
 * @mt: The maple tree
 *
 * Return: true if MT_FLAGS_ALLOC_RANGE is set in mt->ma_flags.
 */
static __always_inline bool mt_is_alloc(struct maple_tree *mt)
{
        unsigned int flags = mt->ma_flags;

        return (flags & MT_FLAGS_ALLOC_RANGE) != 0;
}

/*
 * The Parent Pointer
 * Excluding root, the parent pointer is 256B aligned like all other tree nodes.
 * When storing 32 or 64 bit values, the offset can fit into 5 bits.  The 16
 * bit values need an extra bit to store the offset.  This extra bit comes from
 * a reuse of the last bit in the node type.  This is possible by using bit 1 to
 * indicate if bit 2 is part of the type or the slot.
 *
 * Node types:
 *  0b??1 = Root
 *  0b?00 = 16 bit nodes
 *  0b010 = 32 bit nodes
 *  0b110 = 64 bit nodes
 *
 * Slot size and alignment
 *  0b??1 : Root
 *  0b?00 : 16 bit values, type in 0-1, slot in 2-7
 *  0b010 : 32 bit values, type in 0-2, slot in 3-7
 *  0b110 : 64 bit values, type in 0-2, slot in 3-7
 */

/* Bit 0 of the parent pointer: the node is the root */
#define MAPLE_PARENT_ROOT                0x01

/* 32 and 64 bit parents: slot stored in bits 3-7 */
#define MAPLE_PARENT_SLOT_SHIFT                0x03
#define MAPLE_PARENT_SLOT_MASK                0xF8

/* 16 bit parents: slot stored in bits 2-7 */
#define MAPLE_PARENT_16B_SLOT_SHIFT        0x02
#define MAPLE_PARENT_16B_SLOT_MASK        0xFC

/* Parent type encodings: 0b110 is a 64 bit parent, 0b010 a 32 bit parent */
#define MAPLE_PARENT_RANGE64                0x06
#define MAPLE_PARENT_RANGE32                0x02
/* Bit 1 is clear only for 16 bit parents */
#define MAPLE_PARENT_NOT_RANGE16        0x02

/*
 * mte_parent_shift() - Get the parent shift for the slot storage.
 * @parent: The parent pointer cast as an unsigned long
 *
 * Return: The shift into that pointer to the start of the slot
 */
static inline unsigned long mte_parent_shift(unsigned long parent)
{
        /* Bit 1 clear means a 16 bit parent, which is the uncommon case */
        if (unlikely(!(parent & MAPLE_PARENT_NOT_RANGE16)))
                return MAPLE_PARENT_16B_SLOT_SHIFT;

        return MAPLE_PARENT_SLOT_SHIFT;
}

/*
 * mte_parent_slot_mask() - Get the slot mask for the parent.
 * @parent: The parent pointer cast as an unsigned long.
 *
 * Return: The mask covering the slot bits of that parent.
 */
static inline unsigned long mte_parent_slot_mask(unsigned long parent)
{
        /* Bit 1 clear means a 16 bit parent, which is the uncommon case */
        if (unlikely(!(parent & MAPLE_PARENT_NOT_RANGE16)))
                return MAPLE_PARENT_16B_SLOT_MASK;

        return MAPLE_PARENT_SLOT_MASK;
}

/*
 * mas_parent_type() - Decode the maple_type of a node's parent.
 * @mas: The maple state
 * @enode: The maple_enode whose parent pointer is decoded
 *
 * Warns and returns 0 if @enode is the root.  A parent encoded as
 * MAPLE_PARENT_RANGE64 is reported as maple_arange_64 for allocation trees
 * and maple_range_64 otherwise; any other encoding yields 0.
 *
 * Return: The node->parent maple_type
 */
static inline
enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode)
{
        unsigned long parent = (unsigned long)mte_to_node(enode)->parent;
        unsigned long type_bits;

        if (WARN_ON(parent & MAPLE_PARENT_ROOT))
                return 0;

        /* Keep the metadata bits, then drop the slot to leave only the type */
        type_bits = parent & MAPLE_NODE_MASK;
        type_bits &= ~mte_parent_slot_mask(type_bits);
        if (type_bits != MAPLE_PARENT_RANGE64)
                return 0;

        /* MAPLE_PARENT_RANGE64 doubles as the arange64 encoding */
        return mt_is_alloc(mas->tree) ? maple_arange_64 : maple_range_64;
}

/*
 * mas_set_parent() - Set the parent node and encode the slot
 * @mas: The maple state
 * @enode: The encoded maple node.
 * @parent: The encoded maple node that is the parent of @enode.
 * @slot: The slot that @enode resides in @parent.
 *
 * The slot number is encoded in bits 3-7 of enode->parent for range64 and
 * arange64 parents.  Dense and leaf parents are flagged with MAS_BUG_ON() and
 * end up with a shift and type of 0.
 */
static inline
void mas_set_parent(struct ma_state *mas, struct maple_enode *enode,
                    const struct maple_enode *parent, unsigned char slot)
{
        enum maple_type p_type = mte_node_type(parent);
        unsigned long encoded = (unsigned long)parent;
        unsigned long shift = 0;
        unsigned long type = 0;

        MAS_BUG_ON(mas, p_type == maple_dense);
        MAS_BUG_ON(mas, p_type == maple_leaf_64);

        if (p_type == maple_range_64 || p_type == maple_arange_64) {
                shift = MAPLE_PARENT_SLOT_SHIFT;
                type = MAPLE_PARENT_RANGE64;
        }

        encoded &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */
        encoded |= (slot << shift) | type;
        mte_to_node(enode)->parent = ma_parent_ptr(encoded);
}

/*
 * mte_parent_slot() - get the parent slot of @enode.
 * @enode: The encoded maple node.
 *
 * Return: The slot in the parent node where @enode resides, or 0 if @enode is
 * the root.
 */
static __always_inline
unsigned int mte_parent_slot(const struct maple_enode *enode)
{
        unsigned long parent = (unsigned long)mte_to_node(enode)->parent;
        unsigned long slot_bits;

        if (unlikely(parent & MA_ROOT_PARENT))
                return 0;

        /*
         * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost
         * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT
         */
        slot_bits = parent & MAPLE_PARENT_16B_SLOT_MASK;

        return slot_bits >> mte_parent_shift(parent);
}

/*
 * mte_parent() - Get the parent of @enode.
 * @enode: The encoded maple node.
 *
 * Return: The parent pointer of the decoded node with the MAPLE_NODE_MASK bits
 * cleared.
 */
static __always_inline
struct maple_node *mte_parent(const struct maple_enode *enode)
{
        unsigned long parent = (unsigned long)mte_to_node(enode)->parent;

        return (void *)(parent & ~MAPLE_NODE_MASK);
}

/*
 * ma_dead_node() - check if the @node is dead.
 * @node: The maple node (bare pointer, not encoded)
 *
 * A node is dead when its parent pointer, with the metadata bits masked off,
 * points back at the node itself.
 *
 * Return: true if dead, false otherwise.
 */
static __always_inline bool ma_dead_node(const struct maple_node *node)
{
        struct maple_node *parent;

        /* Do not reorder reads from the node prior to the parent check */
        smp_rmb();
        /* Strip the slot/type metadata before comparing against @node */
        parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK);
        return (parent == node);
}

/*
 * mte_dead_node() - check if the @enode is dead.
 * @enode: The encoded maple node
 *
 * Decodes @enode and defers to ma_dead_node().
 *
 * Return: true if dead, false otherwise.
 */
static __always_inline bool mte_dead_node(const struct maple_enode *enode)
{
        return ma_dead_node(mte_to_node(enode));
}

/*
 * ma_pivots() - Get a pointer to the maple node pivots.
 * @node: the maple node
 * @type: the node type
 *
 * In the event of a dead node, this array may be %NULL
 *
 * Return: A pointer to the maple node pivots; %NULL for maple_dense or an
 * unrecognised @type.
 */
static inline unsigned long *ma_pivots(struct maple_node *node,
                                           enum maple_type type)
{
        if (type == maple_arange_64)
                return node->ma64.pivot;

        if (type == maple_range_64 || type == maple_leaf_64)
                return node->mr64.pivot;

        /* maple_dense (pivots are implied) or an unknown type */
        return NULL;
}

/*
 * ma_gaps() - Get a pointer to the maple node gaps.
 * @node: the maple node
 * @type: the node type
 *
 * Return: A pointer to the maple node gaps for maple_arange_64 nodes, %NULL
 * for every other type.
 */
static inline unsigned long *ma_gaps(struct maple_node *node,
                                     enum maple_type type)
{
        if (type == maple_arange_64)
                return node->ma64.gap;

        return NULL;
}

/*
 * mas_safe_pivot() - get the pivot at @piv or mas->max.
 * @mas: The maple state
 * @pivots: The pointer to the maple node pivots
 * @piv: The pivot to fetch
 * @type: The maple node type
 *
 * Return: The pivot at @piv if @piv is below the pivot count of @type,
 * @mas->max otherwise.
 */
static __always_inline unsigned long
mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots,
               unsigned char piv, enum maple_type type)
{
        return (piv < mt_pivots[type]) ? pivots[piv] : mas->max;
}

/*
 * mas_safe_min() - Return the minimum for a given offset.
 * @mas: The maple state
 * @pivots: The pointer to the maple node pivots
 * @offset: The offset into the pivot array
 *
 * Return: The minimum range value that is contained in @offset: mas->min for
 * offset 0, otherwise one more than the previous pivot.
 */
static inline unsigned long
mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset)
{
        if (unlikely(!offset))
                return mas->min;

        return pivots[offset - 1] + 1;
}

/*
 * mte_set_pivot() - Set a pivot to a value in an encoded maple node.
 * @mn: The encoded maple node
 * @piv: The pivot offset, which must be below the pivot count of the node type
 * @val: The value of the pivot
 */
static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv,
                                unsigned long val)
{
        enum maple_type type = mte_node_type(mn);
        struct maple_node *node = mte_to_node(mn);

        BUG_ON(piv >= mt_pivots[type]);

        if (type == maple_arange_64)
                node->ma64.pivot[piv] = val;
        else if (type == maple_range_64 || type == maple_leaf_64)
                node->mr64.pivot[piv] = val;
        /* maple_dense stores no pivots, nothing to write */
}

/*
 * ma_slots() - Get a pointer to the maple node slots.
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: A pointer to the maple node slots, or %NULL for an unrecognised
 * type.
 */
static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt)
{
        if (mt == maple_arange_64)
                return mn->ma64.slot;

        if (mt == maple_range_64 || mt == maple_leaf_64)
                return mn->mr64.slot;

        if (mt == maple_dense)
                return mn->slot;

        return NULL;
}

/*
 * mt_write_locked() - Lockdep check that the tree is locked for writing.
 * @mt: The maple tree
 *
 * Return: mt_write_lock_is_held() for trees using an external lock,
 * lockdep_is_held() on the internal ma_lock otherwise.
 */
static inline bool mt_write_locked(const struct maple_tree *mt)
{
        if (mt_external_lock(mt))
                return mt_write_lock_is_held(mt);

        return lockdep_is_held(&mt->ma_lock);
}

/*
 * mt_locked() - Lockdep check that the tree lock is held.
 * @mt: The maple tree
 *
 * Return: mt_lock_is_held() for trees using an external lock,
 * lockdep_is_held() on the internal ma_lock otherwise.
 */
static __always_inline bool mt_locked(const struct maple_tree *mt)
{
        if (mt_external_lock(mt))
                return mt_lock_is_held(mt);

        return lockdep_is_held(&mt->ma_lock);
}

/*
 * mt_slot() - Get the slot value when not holding the maple tree write lock.
 * @mt: The maple tree
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Uses rcu_dereference_check() with mt_locked() as the lockdep condition.
 *
 * Return: The entry stored in @slots at the @offset
 */
static __always_inline void *mt_slot(const struct maple_tree *mt,
                void __rcu **slots, unsigned char offset)
{
        return rcu_dereference_check(slots[offset], mt_locked(mt));
}

/*
 * mt_slot_locked() - Get the slot value when holding the maple tree lock.
 * @mt: The maple tree
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Uses rcu_dereference_protected() with mt_write_locked() as the lockdep
 * condition.
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mt_slot_locked(struct maple_tree *mt,
                void __rcu **slots, unsigned char offset)
{
        return rcu_dereference_protected(slots[offset], mt_write_locked(mt));
}
/*
 * mas_slot_locked() - Get the slot value when holding the maple tree lock.
 * @mas: The maple state
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Wrapper around mt_slot_locked() for the tree of @mas.
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mas_slot_locked(struct ma_state *mas,
                void __rcu **slots, unsigned char offset)
{
        struct maple_tree *mt = mas->tree;

        return mt_slot_locked(mt, slots, offset);
}

/*
 * mas_slot() - Get the slot value when not holding the maple tree lock.
 * @mas: The maple state
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Wrapper around mt_slot() for the tree of @mas.
 *
 * Return: The entry stored in @slots at the @offset
 */
static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots,
                unsigned char offset)
{
        struct maple_tree *mt = mas->tree;

        return mt_slot(mt, slots, offset);
}

/*
 * mas_root() - Get the maple tree root.
 * @mas: The maple state.
 *
 * Reads mas->tree->ma_root with rcu_dereference_check(), using mt_locked() as
 * the lockdep condition.
 *
 * Return: The pointer to the root of the tree
 */
static __always_inline void *mas_root(struct ma_state *mas)
{
        return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree));
}

/*
 * mt_root_locked() - Get the maple tree root when holding the maple tree lock.
 * @mt: The maple tree
 *
 * Reads mt->ma_root with rcu_dereference_protected(), using mt_write_locked()
 * as the lockdep condition.
 *
 * Return: The pointer to the root of the tree
 */
static inline void *mt_root_locked(struct maple_tree *mt)
{
        return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt));
}

/*
 * mas_root_locked() - Get the maple tree root when holding the maple tree lock.
 * @mas: The maple state.
 *
 * Wrapper around mt_root_locked() for the tree of @mas.
 *
 * Return: The pointer to the root of the tree
 */
static inline void *mas_root_locked(struct ma_state *mas)
{
        struct maple_tree *mt = mas->tree;

        return mt_root_locked(mt);
}

/*
 * ma_meta() - Get a pointer to the metadata of a node.
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The ma64 metadata for maple_arange_64 nodes, the mr64 metadata for
 * every other type.
 */
static inline struct maple_metadata *ma_meta(struct maple_node *mn,
                                             enum maple_type mt)
{
        if (mt == maple_arange_64)
                return &mn->ma64.meta;

        return &mn->mr64.meta;
}

/*
 * ma_set_meta() - Set the metadata information of a node.
 * @mn: The maple node
 * @mt: The maple node type
 * @offset: The offset of the highest sub-gap in this node, stored as the gap.
 * @end: The end of the data in this node.
 */
static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
                               unsigned char offset, unsigned char end)
{
        struct maple_metadata *md;

        md = ma_meta(mn, mt);
        md->gap = offset;
        md->end = end;
}

/*
 * mt_clear_meta() - clear the metadata information of a node, if it exists
 * @mt: The maple tree
 * @mn: The maple node
 * @type: The maple node type
 *
 * Only maple_range_64 and maple_arange_64 nodes are touched.  For a
 * maple_range_64 node whose last pivot is set, the metadata is left alone when
 * the last slot looks like an encoded node rather than metadata.
 */
static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn,
                                  enum maple_type type)
{
        struct maple_metadata *meta;

        if (type != maple_range_64 && type != maple_arange_64)
                return;

        if (type == maple_range_64 &&
            unlikely(mn->mr64.pivot[MAPLE_RANGE64_SLOTS - 2])) {
                void *last = mt_slot_locked(mt, mn->mr64.slot,
                                            MAPLE_RANGE64_SLOTS - 1);

                if (unlikely(mte_to_node(last) && mte_node_type(last)))
                        return; /* no metadata, could be node */
        }

        meta = ma_meta(mn, type);
        meta->gap = 0;
        meta->end = 0;
}

/*
 * ma_meta_end() - Get the data end of a node from the metadata
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The end value stored in the node metadata.
 */
static inline unsigned char ma_meta_end(struct maple_node *mn,
                                        enum maple_type mt)
{
        return ma_meta(mn, mt)->end;
}

/*
 * ma_meta_gap() - Read the location of the largest gap from a node's metadata
 * @mn: The maple node
 *
 * Always reads the ma64 layout of the node; no node type is taken.
 *
 * Return: The gap offset stored in the ma64 metadata.
 */
static inline unsigned char ma_meta_gap(struct maple_node *mn)
{
        unsigned char gap_offset = mn->ma64.meta.gap;

        return gap_offset;
}

/*
 * ma_set_meta_gap() - Store the largest gap location in a node's metadata
 * @mn: The maple node
 * @mt: The maple node type
 * @offset: The location of the largest gap.
 *
 * Only the gap field is written; the data end is left as is.
 */
static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt,
                                   unsigned char offset)
{
        ma_meta(mn, mt)->gap = offset;
}

/*
 * mat_add() - Mark @dead_enode dead and append it to a ma_topiary list.
 * @mat: the ma_topiary, a linked list of dead nodes.
 * @dead_enode: the node to be marked as dead and added to the tail of the list
 *
 * The node becomes both head and tail when the list has no tail yet;
 * otherwise it is linked after the current tail.
 */
static inline void mat_add(struct ma_topiary *mat,
                           struct maple_enode *dead_enode)
{
        mte_set_node_dead(dead_enode);
        mte_to_mat(dead_enode)->next = NULL;

        if (mat->tail)
                mte_to_mat(mat->tail)->next = dead_enode;
        else
                mat->head = dead_enode;

        mat->tail = dead_enode;
}

/* Forward declarations: both are used by mas_mat_destroy() below. */
static void mt_free_walk(struct rcu_head *head);
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
                            bool free);
/*
 * mas_mat_destroy() - Free all nodes and subtrees in a dead list.
 * @mas: the maple state
 * @mat: the ma_topiary linked list of dead nodes to free.
 *
 * Destroy walk a dead list.  Each entry is passed to mt_destroy_walk(); when
 * the tree is in RCU mode the walk is told not to free and the node is handed
 * to call_rcu() with mt_free_walk() instead.
 *
 * On return @mat->head is NULL.  @mat->tail is not modified.
 */
static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat)
{
        struct maple_enode *next;
        struct maple_node *node;
        bool in_rcu = mt_in_rcu(mas->tree);

        while (mat->head) {
                /* Fetch the link and the node before the entry is destroyed. */
                next = mte_to_mat(mat->head)->next;
                node = mte_to_node(mat->head);
                /* Free immediately only when the tree is not in RCU mode. */
                mt_destroy_walk(mat->head, mas->tree, !in_rcu);
                if (in_rcu)
                        call_rcu(&node->rcu, mt_free_walk);
                mat->head = next;
        }
}
/*
 * mas_descend() - Descend into the slot stored in the ma_state.
 * @mas: the maple state.
 *
 * Note: Not RCU safe, only use in write side or debug code.
 */
static inline void mas_descend(struct ma_state *mas)
{
        enum maple_type type;
        unsigned long *pivots;
        struct maple_node *node;
        void __rcu **slots;

        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);

        if (mas->offset)
                mas->min = pivots[mas->offset - 1] + 1;
        mas->max = mas_safe_pivot(mas, pivots, mas->offset, type);
        mas->node = mas_slot(mas, slots, mas->offset);
}

/*
 * mas_ascend() - Walk up a level of the tree.
 * @mas: The maple state
 *
 * Sets the @mas->max and @mas->min for the parent node of mas->node.  This
 * may cause several levels of walking up to find the correct min and max.
 * May find a dead node which will cause a premature return.
 *
 * On success @mas->node is the parent and @mas->offset is the slot the old
 * node occupied in it.  If @mas->node is already the root, only @mas->offset
 * is reset to 0.  On a dead-node return, @mas may have been partially
 * updated (offset and/or node) while min and max are left unchanged.
 *
 * Return: 1 on dead node, 0 otherwise
 */
static int mas_ascend(struct ma_state *mas)
{
        struct maple_enode *p_enode; /* parent enode. */
        struct maple_enode *a_enode; /* ancestor enode. */
        struct maple_node *a_node; /* ancestor node. */
        struct maple_node *p_node; /* parent node. */
        unsigned char a_slot;
        enum maple_type a_type;
        unsigned long min, max;
        unsigned long *pivots;
        bool set_max = false, set_min = false;

        a_node = mas_mn(mas);
        if (ma_is_root(a_node)) {
                mas->offset = 0;
                return 0;
        }

        p_node = mte_parent(mas->node);
        /* A node that is its own parent is reported as dead. */
        if (unlikely(a_node == p_node))
                return 1;

        a_type = mas_parent_type(mas, mas->node);
        mas->offset = mte_parent_slot(mas->node);
        a_enode = mt_mk_node(p_node, a_type);

        /* Check to make sure all parent information is still accurate */
        if (p_node != mte_parent(mas->node))
                return 1;

        mas->node = a_enode;

        /* The root always spans the whole index range. */
        if (mte_is_root(a_enode)) {
                mas->max = ULONG_MAX;
                mas->min = 0;
                return 0;
        }

        /* Defaults used for any limit that no ancestor pivot provides. */
        min = 0;
        max = ULONG_MAX;

        /*
         * !mas->offset implies that parent node min == mas->min.
         * mas->offset > 0 implies that we need to walk up to find the
         * implied pivot min.
         */
        if (!mas->offset) {
                min = mas->min;
                set_min = true;
        }

        /* A max of ULONG_MAX is kept as is; no pivot needs to be found. */
        if (mas->max == ULONG_MAX)
                set_max = true;

        /*
         * Walk up from the new mas->node, looking in each ancestor for the
         * pivots that bound it, until both limits are known or the root has
         * been examined.
         */
        do {
                p_enode = a_enode;
                a_type = mas_parent_type(mas, p_enode);
                a_node = mte_parent(p_enode);
                a_slot = mte_parent_slot(p_enode);
                a_enode = mt_mk_node(a_node, a_type);
                pivots = ma_pivots(a_node, a_type);

                if (unlikely(ma_dead_node(a_node)))
                        return 1;

                /* The pivot left of the slot gives the inclusive minimum. */
                if (!set_min && a_slot) {
                        set_min = true;
                        min = pivots[a_slot - 1] + 1;
                }

                /* The slot's own pivot, when it has one, gives the maximum. */
                if (!set_max && a_slot < mt_pivots[a_type]) {
                        set_max = true;
                        max = pivots[a_slot];
                }

                /*
                 * Checked again after the pivots were read - presumably so
                 * values read from a node that died meanwhile are not used.
                 */
                if (unlikely(ma_dead_node(a_node)))
                        return 1;

                if (unlikely(ma_is_root(a_node)))
                        break;

        } while (!set_min || !set_max);

        mas->max = max;
        mas->min = min;
        return 0;
}

/*
 * mas_pop_node() - Take a previously allocated maple node from the maple state.
 * @mas: The maple state
 *
 * The single cached node in @mas->alloc is used first; otherwise a node is
 * taken from @mas->sheaf.  The node is zeroed before it is returned.
 *
 * Return: A pointer to a maple node, or NULL (with a one-time warning) when
 * there is neither a cached node nor a sheaf.
 */
static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas)
{
        struct maple_node *node = mas->alloc;

        if (node) {
                mas->alloc = NULL;
        } else {
                if (WARN_ON_ONCE(!mas->sheaf))
                        return NULL;

                node = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf);
        }

        memset(node, 0, sizeof(*node));
        return node;
}

/*
 * mas_alloc_nodes() - Allocate nodes into a maple state
 * @mas: The maple state
 * @gfp: The GFP Flags
 *
 * Satisfies @mas->node_request.  A request for a single node is served by one
 * node cached in @mas->alloc unless a sheaf already exists; every other
 * request is served by getting or refilling @mas->sheaf.  On success
 * @mas->node_request is cleared.  On allocation failure the maple state error
 * is set to -ENOMEM and @mas->node_request is left as is.
 */
static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
{
        /* Nothing was requested. */
        if (!mas->node_request)
                return;

        if (mas->node_request == 1) {
                /* An existing sheaf takes precedence over a single node. */
                if (mas->sheaf)
                        goto use_sheaf;

                /*
                 * A node is already cached.  Note that node_request is not
                 * cleared on this path.
                 */
                if (mas->alloc)
                        return;

                mas->alloc = mt_alloc_one(gfp);
                if (!mas->alloc)
                        goto error;

                mas->node_request = 0;
                return;
        }

use_sheaf:
        /* A sheaf is going to be used; drop any cached single node. */
        if (unlikely(mas->alloc)) {
                kfree(mas->alloc);
                mas->alloc = NULL;
        }

        if (mas->sheaf) {
                unsigned long refill;

                refill = mas->node_request;
                /* The sheaf already covers the request. */
                if (kmem_cache_sheaf_size(mas->sheaf) >= refill) {
                        mas->node_request = 0;
                        return;
                }

                /* A non-zero return is treated as an allocation failure. */
                if (mt_refill_sheaf(gfp, &mas->sheaf, refill))
                        goto error;

                mas->node_request = 0;
                return;
        }

        /* No sheaf yet: get one sized for the request. */
        mas->sheaf = mt_get_sheaf(gfp, mas->node_request);
        if (likely(mas->sheaf)) {
                mas->node_request = 0;
                return;
        }

error:
        mas_set_err(mas, -ENOMEM);
}

/*
 * mas_empty_nodes() - Release every allocation held by the maple state
 * @mas: The maple state
 *
 * Clears the outstanding node request, hands the sheaf (if any) back with
 * mt_return_sheaf() and frees the cached single node (if any).
 */
static inline void mas_empty_nodes(struct ma_state *mas)
{
        mas->node_request = 0;

        if (mas->sheaf)
                mt_return_sheaf(mas->sheaf);
        mas->sheaf = NULL;

        /* kfree() of a NULL pointer is a no-op. */
        kfree(mas->alloc);
        mas->alloc = NULL;
}

/*
 * mas_free() - Free an encoded maple node
 * @mas: The maple state (not used by this function)
 * @used: The encoded maple node to free.
 *
 * Decodes @used and passes the node to ma_free_rcu().
 */
static inline void mas_free(struct ma_state *mas, struct maple_enode *used)
{
        struct maple_node *node = mte_to_node(used);

        ma_free_rcu(node);
}

/*
 * mas_start() - Sets up maple state for operations.
 * @mas: The maple state.
 *
 * When mas_is_start() is true, reset min and max to the full range, reset the
 * depth and position the state according to the tree root.  Otherwise the
 * state is left untouched.
 *
 * Return:
 * - If the state is not at the start:  NULL
 * - If it's an empty tree:     NULL & mas->status == ma_none
 * - If it's a single entry:    The entry (only when mas->index is 0, NULL
 *                              otherwise) & mas->status == ma_root
 * - If it's a tree:            NULL & mas->status == ma_active
 */
static inline struct maple_enode *mas_start(struct ma_state *mas)
{
        struct maple_enode *root;

        if (unlikely(!mas_is_start(mas)))
                return NULL;

        mas->min = 0;
        mas->max = ULONG_MAX;

retry:
        mas->depth = 0;
        root = mas_root(mas);
        if (likely(xa_is_node(root))) {
                /* Tree with nodes */
                mas->depth = 0;
                mas->status = ma_active;
                mas->node = mte_safe_root(root);
                mas->offset = 0;
                /* The root died under us; read it again. */
                if (mte_dead_node(mas->node))
                        goto retry;

                return NULL;
        }

        mas->node = NULL;
        if (unlikely(!root)) {
                /* empty tree */
                mas->status = ma_none;
                mas->offset = MAPLE_NODE_SLOTS;
                return NULL;
        }

        /* Single entry tree: the entry is only returned for index 0. */
        mas->status = ma_root;
        mas->offset = MAPLE_NODE_SLOTS;

        return mas->index > 0 ? NULL : root;
}

/*
 * ma_data_end() - Find the end of the data in a node.
 * @node: The maple node
 * @type: The maple node type
 * @pivots: The array of pivots in the node
 * @max: The maximum value in the node
 *
 * The metadata end is used for maple_arange_64 nodes and for any node whose
 * last pivot is zero.  Otherwise the end is derived from the last pivot: the
 * last pivot index when it equals @max, the slot after it when it does not.
 *
 * Return: The zero indexed last slot with data (may be null), 0 when @pivots
 * is NULL.
 */
static __always_inline unsigned char ma_data_end(struct maple_node *node,
                enum maple_type type, unsigned long *pivots, unsigned long max)
{
        unsigned char last;

        if (!pivots)
                return 0;

        last = mt_pivots[type] - 1;
        if (type == maple_arange_64 || likely(!pivots[last]))
                return ma_meta_end(node, type);

        return likely(pivots[last] == max) ? last : mt_pivots[type];
}

/*
 * mas_data_end() - Find the end of the data (slot).
 * @mas: the maple state
 *
 * This method is optimized to check the metadata of a node if the node type
 * supports data end metadata.
 *
 * Return: The zero indexed last slot with data (may be null).
 */
static inline unsigned char mas_data_end(struct ma_state *mas)
{
        enum maple_type type;
        struct maple_node *node;
        unsigned char offset;
        unsigned long *pivots;

        type = mte_node_type(mas->node);
        node = mas_mn(mas);
        if (type == maple_arange_64)
                return ma_meta_end(node, type);

        pivots = ma_pivots(node, type);
        if (unlikely(ma_dead_node(node)))
                return 0;

        offset = mt_pivots[type] - 1;
        if (likely(!pivots[offset]))
                return ma_meta_end(node, type);

        if (likely(pivots[offset] == mas->max))
                return offset;

        return mt_pivots[type];
}

/*
 * mas_leaf_max_gap() - Returns the largest gap in a leaf node
 * @mas: the maple state
 *
 * A gap is a range whose slot is NULL.  For dense nodes the result is the
 * longest run of NULL slots.  For other leaves each NULL slot contributes the
 * size of the range bounded by its neighbouring pivots (or @mas->min for
 * slot 0).
 *
 * Return: The maximum gap in the leaf.
 */
static unsigned long mas_leaf_max_gap(struct ma_state *mas)
{
        enum maple_type mt;
        unsigned long pstart, gap, max_gap;
        struct maple_node *mn;
        unsigned long *pivots;
        void __rcu **slots;
        unsigned char i;
        unsigned char max_piv;

        mt = mte_node_type(mas->node);
        mn = mas_mn(mas);
        slots = ma_slots(mn, mt);
        max_gap = 0;
        if (unlikely(ma_is_dense(mt))) {
                /* Dense node: count the longest run of consecutive NULL slots. */
                gap = 0;
                for (i = 0; i < mt_slots[mt]; i++) {
                        if (slots[i]) {
                                if (gap > max_gap)
                                        max_gap = gap;
                                gap = 0;
                        } else {
                                gap++;
                        }
                }
                /* Account for a run that reaches the last slot. */
                if (gap > max_gap)
                        max_gap = gap;
                return max_gap;
        }

        /*
         * Check the first implied pivot optimizes the loop below and slot 1 may
         * be skipped if there is a gap in slot 0.
         */
        pivots = ma_pivots(mn, mt);
        if (likely(!slots[0])) {
                /* Slot 0 covers mas->min through pivots[0] inclusive. */
                max_gap = pivots[0] - mas->min + 1;
                i = 2;
        } else {
                i = 1;
        }

        /* reduce max_piv as the special case is checked before the loop */
        max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1;
        /*
         * Check end implied pivot which can only be a gap on the right most
         * node.
         */
        if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) {
                gap = ULONG_MAX - pivots[max_piv];
                if (gap > max_gap)
                        max_gap = gap;

                /*
                 * Nothing in the remaining slots can be larger than the
                 * whole span from mas->min to pivots[max_piv].
                 */
                if (max_gap > pivots[max_piv] - mas->min)
                        return max_gap;
        }

        for (; i <= max_piv; i++) {
                /* data == no gap. */
                if (likely(slots[i]))
                        continue;

                /* Slot i covers pivots[i - 1] + 1 through pivots[i]. */
                pstart = pivots[i - 1];
                gap = pivots[i] - pstart;
                if (gap > max_gap)
                        max_gap = gap;

                /* There cannot be two gaps in a row. */
                i++;
        }
        return max_gap;
}

/*
 * ma_max_gap() - Get the maximum gap in a maple node (non-leaf)
 * @node: The maple node
 * @gaps: The pointer to the gaps
 * @mt: The maple node type
 * @off: Pointer to store the offset location of the gap.
 *
 * Scans @gaps backwards from the metadata data end down to index 0.  On equal
 * gaps the highest index wins; when every gap is zero @off is set to the data
 * end.
 *
 * Return: The maximum gap value
 */
static inline unsigned long
ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt,
            unsigned char *off)
{
        unsigned char end = ma_meta_end(node, mt);
        unsigned char best = end;
        unsigned long largest = 0;
        int idx;

        for (idx = end; idx >= 0; idx--) {
                if (gaps[idx] > largest) {
                        largest = gaps[idx];
                        best = idx;
                }
        }

        *off = best;
        return largest;
}

/*
 * mas_max_gap() - find the largest gap in a non-leaf node and set the slot.
 * @mas: The maple state.
 *
 * Return: The gap value.
 */
static inline unsigned long mas_max_gap(struct ma_state *mas)
{
        unsigned long *gaps;
        unsigned char offset;
        enum maple_type mt;
        struct maple_node *node;

        mt = mte_node_type(mas->node);
        if (ma_is_leaf(mt))
                return mas_leaf_max_gap(mas);

        node = mas_mn(mas);
        MAS_BUG_ON(mas, mt != maple_arange_64);
        offset = ma_meta_gap(node);
        gaps = ma_gaps(node, mt);
        return gaps[offset];
}

/*
 * mas_parent_gap() - Set the parent gap and any gaps above, as needed
 * @mas: The maple state
 * @offset: The gap offset in the parent to set
 * @new: The new gap value.
 *
 * Set the parent gap then continue to set the gap upwards, using the metadata
 * of the parent to see if it is necessary to check the node above.  The walk
 * stops as soon as a node's largest gap is unaffected, or at the root.
 */
static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset,
                unsigned long new)
{
        unsigned long meta_gap = 0;
        struct maple_node *pnode;
        struct maple_enode *penode;
        unsigned long *pgaps;
        unsigned char meta_offset;
        enum maple_type pmt;

        pnode = mte_parent(mas->node);
        pmt = mas_parent_type(mas, mas->node);
        penode = mt_mk_node(pnode, pmt);
        pgaps = ma_gaps(pnode, pmt);

ascend:
        MAS_BUG_ON(mas, pmt != maple_arange_64);
        /* Remember the largest gap of this node before it is changed. */
        meta_offset = ma_meta_gap(pnode);
        meta_gap = pgaps[meta_offset];

        pgaps[offset] = new;

        /* The largest gap value is unchanged. */
        if (meta_gap == new)
                return;

        if (offset != meta_offset) {
                /* Another slot still holds a larger gap. */
                if (meta_gap > new)
                        return;

                /* The updated slot is now the largest gap. */
                ma_set_meta_gap(pnode, pmt, offset);
        } else if (new < meta_gap) {
                /* The largest gap shrank: rescan for the new largest gap. */
                new = ma_max_gap(pnode, pgaps, pmt, &meta_offset);
                ma_set_meta_gap(pnode, pmt, meta_offset);
        }

        if (ma_is_root(pnode))
                return;

        /* Go to the parent node. */
        pnode = mte_parent(penode);
        pmt = mas_parent_type(mas, penode);
        pgaps = ma_gaps(pnode, pmt);
        offset = mte_parent_slot(penode);
        penode = mt_mk_node(pnode, pmt);
        goto ascend;
}

/*
 * mas_update_gap() - Update a node's gap in its parent and propagate upwards
 * if necessary.
 * @mas: the maple state.
 *
 * Does nothing for trees that are not allocation trees and for the root node.
 * The parent is only updated when its recorded gap differs from the node's
 * current largest gap.
 */
static inline void mas_update_gap(struct ma_state *mas)
{
        unsigned long *parent_gaps;
        unsigned long largest;
        unsigned char pslot;

        if (!mt_is_alloc(mas->tree) || mte_is_root(mas->node))
                return;

        largest = mas_max_gap(mas);

        pslot = mte_parent_slot(mas->node);
        parent_gaps = ma_gaps(mte_parent(mas->node),
                              mas_parent_type(mas, mas->node));

        if (parent_gaps[pslot] != largest)
                mas_parent_gap(mas, pslot, largest);
}

/*
 * mas_adopt_children() - Point every child of @parent back at @parent, with
 * the child's slot encoded.
 * @mas: the maple state (for the tree)
 * @parent: the maple encoded node containing the children.
 *
 * Children are visited from the data end of @parent down to slot 0.
 */
static inline void mas_adopt_children(struct ma_state *mas,
                struct maple_enode *parent)
{
        enum maple_type type = mte_node_type(parent);
        struct maple_node *node = mte_to_node(parent);
        void __rcu **slots = ma_slots(node, type);
        unsigned long *pivots = ma_pivots(node, type);
        struct maple_enode *child;
        int slot;

        for (slot = ma_data_end(node, type, pivots, mas->max); slot >= 0; slot--) {
                child = mas_slot_locked(mas, slots, slot);
                mas_set_parent(mas, child, parent, slot);
        }
}

/*
 * mas_put_in_tree() - Put a new node in the tree and mark the old node as
 * dead.
 * @mas: the maple state with the new node
 * @old_enode: The old maple encoded node to replace.
 * @new_height: if we are inserting a root node, update the height of the tree
 *
 * The new node is published with rcu_assign_pointer(), either as the tree
 * root or in the parent slot encoded in the new node.  @old_enode is marked
 * dead only after the new node has been published.
 */
static inline void mas_put_in_tree(struct ma_state *mas,
                struct maple_enode *old_enode, char new_height)
        __must_hold(mas->tree->ma_lock)
{
        unsigned char offset;
        void __rcu **slots;

        if (mte_is_root(mas->node)) {
                /* New root: set its parent pointer before publishing it. */
                mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas));
                rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
                mt_set_height(mas->tree, new_height);
        } else {

                /* Replace the slot in the parent that the new node encodes. */
                offset = mte_parent_slot(mas->node);
                slots = ma_slots(mte_parent(mas->node),
                                 mas_parent_type(mas, mas->node));
                rcu_assign_pointer(slots[offset], mas->node);
        }

        mte_set_node_dead(old_enode);
}

/*
 * mas_replace_node() - Replace a node in the tree and free the old one.
 * @mas: the ma_state with @mas->node pointing to the new node.
 * @old_enode: The old maple encoded node.
 * @new_height: The new height of the tree as a result of the operation
 *
 * The new node is placed with mas_put_in_tree(), which also marks @old_enode
 * dead; @old_enode is then released with mas_free().
 */
static inline void mas_replace_node(struct ma_state *mas,
                struct maple_enode *old_enode, unsigned char new_height)
        __must_hold(mas->tree->ma_lock)
{
        /* Publish the replacement first, then release the old node. */
        mas_put_in_tree(mas, old_enode, new_height);
        mas_free(mas, old_enode);
}

/*
 * mas_find_child() - Find a child who has the parent @mas->node.
 * @mas: the maple state with the parent.
 * @child: the maple state to store the child.
 *
 * Searches the slots of @mas->node from @mas->offset up to the data end for
 * an entry whose parent pointer is this node.  On a match @child is set up
 * as a copy of @mas descended into that slot with its offset reset to 0, and
 * @mas->offset is advanced past the match so the next call resumes after it.
 *
 * Return: true if a child was found, false otherwise.
 */
static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child)
        __must_hold(mas->tree->ma_lock)
{
        enum maple_type mt = mte_node_type(mas->node);
        struct maple_node *node = mas_mn(mas);
        void __rcu **slots = ma_slots(node, mt);
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned char end = ma_data_end(node, mt, pivots, mas->max);
        struct maple_enode *entry;
        unsigned char offset;

        for (offset = mas->offset; offset <= end; offset++) {
                entry = mas_slot_locked(mas, slots, offset);
                if (mte_parent(entry) != node)
                        continue;

                *child = *mas;
                mas->offset = offset + 1;
                child->offset = offset;
                mas_descend(child);
                child->offset = 0;
                return true;
        }

        return false;
}

/*
 * mab_shift_right() - Shift the data in mab right. Note, does not clean out the
 * old data or set b_node->b_end.
 * @b_node: the maple_big_node
 * @shift: the shift count
 *
 * Moves b_end entries of the pivot and slot arrays up by @shift positions.
 * The gap array is moved as well for maple_arange_64 nodes.
 */
static inline void mab_shift_right(struct maple_big_node *b_node,
                                 unsigned char shift)
{
        unsigned long bytes = b_node->b_end * sizeof(unsigned long);

        memmove(b_node->pivot + shift, b_node->pivot, bytes);
        memmove(b_node->slot + shift, b_node->slot, bytes);

        if (b_node->type != maple_arange_64)
                return;

        memmove(b_node->gap + shift, b_node->gap, bytes);
}

/*
 * mab_middle_node() - Check if a middle node is needed (unlikely)
 * @b_node: the maple_big_node that contains the data.
 * @split: the potential split location
 * @slot_count: the size that can be stored in a single node being considered.
 *
 * A middle node is needed when the data is at least twice @slot_count, or
 * when it is one short of that and the slot at @split is NULL.
 *
 * Return: true if a middle node is required.
 */
static inline bool mab_middle_node(struct maple_big_node *b_node, int split,
                                   unsigned char slot_count)
{
        unsigned char size = b_node->b_end;

        if (size >= 2 * slot_count)
                return true;

        return !b_node->slot[split] && size >= 2 * slot_count - 1;
}

/*
 * mab_no_null_split() - ensure the split doesn't fall on a NULL
 * @b_node: the maple_big_node with the data
 * @split: the suggested split location
 * @slot_count: the number of slots in the node being considered.
 *
 * A split that lands on a non-NULL slot is returned unchanged.  Otherwise the
 * split moves one slot to the right when it is below the last slot and the
 * right side stays above the minimum for the node type, and one slot to the
 * left in every other case.
 *
 * Return: the split location.
 */
static inline int mab_no_null_split(struct maple_big_node *b_node,
                                    unsigned char split, unsigned char slot_count)
{
        if (b_node->slot[split])
                return split;

        if ((split < slot_count - 1) &&
            (b_node->b_end - split) > (mt_min_slots[b_node->type]))
                split++;
        else
                split--;

        return split;
}

/*
 * mab_calc_split() - Calculate the split location and if there needs to be two
 * splits.
 * @mas: The maple state (not used by this function)
 * @bn: The maple_big_node with the data
 * @mid_split: The second split, if required.  0 otherwise.
 *
 * Return: The first split location.  The middle split is set in @mid_split.
 */
static inline int mab_calc_split(struct ma_state *mas,
         struct maple_big_node *bn, unsigned char *mid_split)
{
        unsigned char data_end = bn->b_end;
        unsigned char slot_count = mt_slots[bn->type];
        int split = data_end / 2; /* Start from an even two-way split. */

        /*
         * Gap tracking requires NULL entries to be kept together, and a node
         * may not end on a NULL entry (the left-most leaf excepted).  A split
         * point therefore has to be checked and, if needed, nudged so that
         * more data lands on one side or the other.
         *
         * In the extremely rare 3-way split scenario a range store overwrites
         * the end and the beginning of two full nodes, leaving more entries
         * than two nodes can hold.  The two nodes may live in different, also
         * full, parents, so in the worst case this carries up to the root.
         */
        *mid_split = 0;
        if (unlikely(mab_middle_node(bn, split, slot_count))) {
                split = data_end / 3;
                *mid_split = split * 2;
        }

        /* Avoid ending a node on a NULL entry */
        split = mab_no_null_split(bn, split, slot_count);
        if (unlikely(*mid_split))
                *mid_split = mab_no_null_split(bn, *mid_split, slot_count);

        return split;
}

/*
 * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node
 * and set @b_node->b_end to the next free slot.
 * @mas: The maple state
 * @mas_start: The starting slot to copy
 * @mas_end: The end slot to copy (inclusively)
 * @b_node: The maple_big_node to place the data
 * @mab_start: The starting location in maple_big_node to store the data.
 *
 * Pivots are copied one at a time so the copy can stop early at the end of
 * the node's data; slots (and gaps, for non-leaf nodes of allocation trees)
 * are then copied in bulk for the same number of entries.
 */
static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start,
                        unsigned char mas_end, struct maple_big_node *b_node,
                        unsigned char mab_start)
{
        enum maple_type mt;
        struct maple_node *node;
        void __rcu **slots;
        unsigned long *pivots, *gaps;
        int i = mas_start, j = mab_start;
        unsigned char piv_end;

        node = mas_mn(mas);
        mt = mte_node_type(mas->node);
        pivots = ma_pivots(node, mt);
        /*
         * The first pivot is copied outside the loop, bypassing the zero
         * check below - presumably because 0 is a valid pivot in slot 0.
         */
        if (!i) {
                b_node->pivot[j] = pivots[i++];
                if (unlikely(i > mas_end))
                        goto complete;
                j++;
        }

        /* Never index past the pivot array of this node type. */
        piv_end = min(mas_end, mt_pivots[mt]);
        for (; i < piv_end; i++, j++) {
                b_node->pivot[j] = pivots[i];
                /* A zero pivot ends the copy. */
                if (unlikely(!b_node->pivot[j]))
                        goto complete;

                /* The node maximum was reached. */
                if (unlikely(mas->max == b_node->pivot[j]))
                        goto complete;
        }

        /* The final pivot may be implied, so fetch it with mas_safe_pivot(). */
        b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt);

complete:
        b_node->b_end = ++j;
        /* j becomes the number of entries copied. */
        j -= mab_start;
        slots = ma_slots(node, mt);
        memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j);
        if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) {
                gaps = ma_gaps(node, mt);
                memcpy(b_node->gap + mab_start, gaps + mas_start,
                       sizeof(unsigned long) * j);
        }
}

/*
 * mas_leaf_set_meta() - Set the metadata of a leaf if possible.
 * @node: The maple node
 * @mt: The maple type
 * @end: The node end
 *
 * The metadata is written (with a gap offset of 0) only when @end is below
 * the last slot of the node type; otherwise nothing is written.
 */
static inline void mas_leaf_set_meta(struct maple_node *node,
                enum maple_type mt, unsigned char end)
{
        if (end >= mt_slots[mt] - 1)
                return;

        ma_set_meta(node, mt, 0, end);
}

/*
 * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node.
 * @b_node: the maple_big_node that has the data
 * @mab_start: the start location in @b_node.
 * @mab_end: The end location in @b_node (inclusively)
 * @mas: The maple state with the maple encoded node.
 * @new_max: When true, set @mas->max to the last pivot copied.
 *
 * Copies pivots and slots into the node in @mas->node.  For non-leaf nodes of
 * allocation trees the gaps are copied too and the metadata records the
 * offset of the largest gap; otherwise the metadata is set through
 * mas_leaf_set_meta().
 */
static inline void mab_mas_cp(struct maple_big_node *b_node,
                              unsigned char mab_start, unsigned char mab_end,
                              struct ma_state *mas, bool new_max)
{
        int i, j = 0;
        enum maple_type mt = mte_node_type(mas->node);
        struct maple_node *node = mte_to_node(mas->node);
        void __rcu **slots = ma_slots(node, mt);
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned long *gaps = NULL;
        unsigned char end;

        /* Trim the range by one when it spans more than the pivot count. */
        if (mab_end - mab_start > mt_pivots[mt])
                mab_end--;

        /*
         * With the last pivot unset, clear the last slot.
         * NOTE(review): presumably this slot overlays the metadata - confirm.
         */
        if (!pivots[mt_pivots[mt] - 1])
                slots[mt_pivots[mt]] = NULL;

        /* Copy pivots until the range ends or a zero pivot is next. */
        i = mab_start;
        do {
                pivots[j++] = b_node->pivot[i++];
        } while (i <= mab_end && likely(b_node->pivot[i]));

        /* i - mab_start is the number of entries copied. */
        memcpy(slots, b_node->slot + mab_start,
               sizeof(void *) * (i - mab_start));

        if (new_max)
                mas->max = b_node->pivot[i - 1];

        end = j - 1;
        if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) {
                unsigned long max_gap = 0;
                unsigned char offset = 0;

                /* Copy gaps backwards while tracking the largest one. */
                gaps = ma_gaps(node, mt);
                do {
                        gaps[--j] = b_node->gap[--i];
                        if (gaps[j] > max_gap) {
                                offset = j;
                                max_gap = gaps[j];
                        }
                } while (j);

                ma_set_meta(node, mt, offset, end);
        } else {
                mas_leaf_set_meta(node, mt, end);
        }
}

/*
 * mas_store_b_node() - Store an @entry into the b_node while also copying the
 * data from a maple encoded node.
 * @wr_mas: the maple write state
 * @b_node: the maple_big_node to fill with data
 * @offset_end: the offset to end copying
 *
 * Builds the new contents of the node in @b_node: the data before the write,
 * the remainder of a partially overwritten range in front of it (if any), the
 * new entry, the remainder of a partially overwritten range behind it (if
 * any), and the data after the write.
 *
 * This function returns nothing.  On return @b_node->b_end is the index of
 * the last entry stored in @b_node and @mas->offset is the index of the new
 * entry within @b_node.
 */
static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas,
                struct maple_big_node *b_node, unsigned char offset_end)
{
        unsigned char slot;
        unsigned char b_end;
        /* Possible underflow of piv will wrap back to 0 before use. */
        unsigned long piv;
        struct ma_state *mas = wr_mas->mas;

        b_node->type = wr_mas->type;
        b_end = 0;
        slot = mas->offset;
        if (slot) {
                /* Copy start data up to insert. */
                mas_mab_cp(mas, 0, slot - 1, b_node, 0);
                b_end = b_node->b_end;
                piv = b_node->pivot[b_end - 1];
        } else
                piv = mas->min - 1;

        /* piv is now the last index before the slot being written. */
        if (piv + 1 < mas->index) {
                /* Handle range starting after old range */
                b_node->slot[b_end] = wr_mas->content;
                if (!wr_mas->content)
                        b_node->gap[b_end] = mas->index - 1 - piv;
                b_node->pivot[b_end++] = mas->index - 1;
        }

        /* Store the new entry. */
        mas->offset = b_end;
        b_node->slot[b_end] = wr_mas->entry;
        b_node->pivot[b_end] = mas->last;

        /* Appended. */
        if (mas->last >= mas->max)
                goto b_end;

        /* Handle new range ending before old range ends */
        piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type);
        if (piv > mas->last) {
                /* The tail belongs to a different slot than the head did. */
                if (offset_end != slot)
                        wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                          offset_end);

                b_node->slot[++b_end] = wr_mas->content;
                if (!wr_mas->content)
                        b_node->gap[b_end] = piv - mas->last + 1;
                b_node->pivot[b_end] = piv;
        }

        slot = offset_end + 1;
        /* Nothing remains after the overwritten range. */
        if (slot > mas->end)
                goto b_end;

        /* Copy end data to the end of the node. */
        mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end);
        /* mas_mab_cp() leaves b_end one past the data; step back onto it. */
        b_node->b_end--;
        return;

b_end:
        b_node->b_end = b_end;
}

/*
 * mas_prev_sibling() - Find the previous node with the same parent.
 * @mas: the maple state
 *
 * On success @mas is moved to the sibling in the slot to the left of the
 * current node.  The state is left untouched when the node sits in slot 0.
 *
 * Return: True if there is a previous sibling, false otherwise.
 */
static inline bool mas_prev_sibling(struct ma_state *mas)
{
        unsigned int p_slot = mte_parent_slot(mas->node);

        /* For root node, p_slot is set to 0 by mte_parent_slot(). */
        if (p_slot == 0)
                return false;

        /* Go up, step one slot to the left, and come back down. */
        mas_ascend(mas);
        mas->offset = p_slot - 1;
        mas_descend(mas);

        return true;
}

/*
 * mas_next_sibling() - Find the next node with the same parent.
 * @mas: the maple state
 *
 * Return: true if there is a next sibling, false otherwise.
 */
static inline bool mas_next_sibling(struct ma_state *mas)
{
        MA_STATE(parent, mas->tree, mas->index, mas->last);

        if (mte_is_root(mas->node))
                return false;

        parent = *mas;
        mas_ascend(&parent);
        parent.offset = mte_parent_slot(mas->node) + 1;
        if (parent.offset > mas_data_end(&parent))
                return false;

        *mas = parent;
        mas_descend(mas);
        return true;
}

/*
 * mas_node_or_none() - Point the state at @enode, or mark it as empty.
 * @mas: the maple state
 * @enode: The encoded maple node, may be NULL.
 *
 * A non-NULL @enode makes the state active on that node; a NULL @enode clears
 * the node pointer and sets the status to ma_none.
 */
static inline void mas_node_or_none(struct ma_state *mas,
                struct maple_enode *enode)
{
        if (!enode) {
                mas->node = NULL;
                mas->status = ma_none;
                return;
        }

        mas->node = enode;
        mas->status = ma_active;
}

/*
 * mas_wr_node_walk() - Find the offset within the current node that covers
 *                      @wr_mas->mas->index.
 * @wr_mas: The maple write state
 *
 * Starting from the current mas->offset, advance while the index is above the
 * pivot.  If no pivot covers the index the walk stops at the node's data end.
 * On return @wr_mas->r_min/r_max hold the limits of the slot found and both
 * mas->offset and @wr_mas->offset_end are set to that slot; mas->end is
 * refreshed as well.  Dense nodes are handled separately and return early.
 */
static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char end, pos;

        if (unlikely(ma_is_dense(wr_mas->type))) {
                wr_mas->r_min = mas->index;
                wr_mas->r_max = wr_mas->r_min;
                mas->index = mas->min;
                mas->offset = mas->index;
                return;
        }

        wr_mas->node = mas_mn(wr_mas->mas);
        wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type);
        end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots,
                          mas->max);
        mas->end = end;

        /* Stop at the first pivot that is not below the index. */
        for (pos = mas->offset; pos < end; pos++) {
                if (mas->index <= wr_mas->pivots[pos])
                        break;
        }

        /* The slot at the data end has no pivot of its own; use mas->max. */
        if (pos < end)
                wr_mas->r_max = wr_mas->pivots[pos];
        else
                wr_mas->r_max = mas->max;

        wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, pos);
        mas->offset = pos;
        wr_mas->offset_end = pos;
}

/*
 * mast_rebalance_next() - Rebalance against the next node
 * @mast: The maple subtree state
 */
static inline void mast_rebalance_next(struct maple_subtree_state *mast)
{
        unsigned char b_end = mast->bn->b_end;

        mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node),
                   mast->bn, b_end);
        mast->orig_r->last = mast->orig_r->max;
}

/*
 * mast_rebalance_prev() - Rebalance against the previous node
 * @mast: The maple subtree state
 */
static inline void mast_rebalance_prev(struct maple_subtree_state *mast)
{
        unsigned char end = mas_data_end(mast->orig_l) + 1;
        unsigned char b_end = mast->bn->b_end;

        mab_shift_right(mast->bn, end);
        mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0);
        mast->l->min = mast->orig_l->min;
        mast->orig_l->index = mast->orig_l->min;
        mast->bn->b_end = end + b_end;
        mast->l->offset += end;
}

/*
 * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring
 * the node to the right.  Checking the nodes to the right then the left at each
 * level upwards until root is reached.
 * Data is copied into the @mast->bn.
 * @mast: The maple_subtree_state.
 *
 * Return: true if a neighbouring node was found and rebalanced against, false
 * if the root was reached without finding one.  In the false case both
 * @mast->orig_l and @mast->orig_r are restored to their values on entry.
 */
static inline
bool mast_spanning_rebalance(struct maple_subtree_state *mast)
{
        /* Entry snapshots, used to undo the walk on the side not consumed. */
        struct ma_state r_tmp = *mast->orig_r;
        struct ma_state l_tmp = *mast->orig_l;
        /* Levels ascended so far; the descent loops step down this many. */
        unsigned char depth = 0;

        do {
                mas_ascend(mast->orig_r);
                mas_ascend(mast->orig_l);
                depth++;
                if (mast->orig_r->offset < mas_data_end(mast->orig_r)) {
                        /*
                         * The right state is not in the last slot of this
                         * node: move one slot right, then descend again,
                         * always taking offset 0.
                         */
                        mast->orig_r->offset++;
                        do {
                                mas_descend(mast->orig_r);
                                mast->orig_r->offset = 0;
                        } while (--depth);

                        mast_rebalance_next(mast);
                        /* Only the right side is kept moved; restore left. */
                        *mast->orig_l = l_tmp;
                        return true;
                } else if (mast->orig_l->offset != 0) {
                        /*
                         * The left state is not in slot 0 of this node: move
                         * one slot left, then descend again, always taking
                         * the offset reported by mas_data_end().
                         */
                        mast->orig_l->offset--;
                        do {
                                mas_descend(mast->orig_l);
                                mast->orig_l->offset =
                                        mas_data_end(mast->orig_l);
                        } while (--depth);

                        mast_rebalance_prev(mast);
                        /* Only the left side is kept moved; restore right. */
                        *mast->orig_r = r_tmp;
                        return true;
                }
        } while (!mte_is_root(mast->orig_r->node));

        /* No neighbour on either side at any level: undo both walks. */
        *mast->orig_r = r_tmp;
        *mast->orig_l = l_tmp;
        return false;
}

/*
 * mast_ascend() - Ascend the original left and right maple states.
 * @mast: the maple subtree state.
 *
 * Ascend the original left and right sides.  Set the offsets to point to the
 * data already in the new tree (@mast->l and @mast->r).
 */
static inline void mast_ascend(struct maple_subtree_state *mast)
{
        MA_WR_STATE(wr_mas, mast->orig_r,  NULL);
        mas_ascend(mast->orig_l);
        mas_ascend(mast->orig_r);

        mast->orig_r->offset = 0;
        mast->orig_r->index = mast->r->max;
        /* last should be larger than or equal to index */
        if (mast->orig_r->last < mast->orig_r->index)
                mast->orig_r->last = mast->orig_r->index;

        wr_mas.type = mte_node_type(mast->orig_r->node);
        mas_wr_node_walk(&wr_mas);
        /* Set up the left side of things */
        mast->orig_l->offset = 0;
        mast->orig_l->index = mast->l->min;
        wr_mas.mas = mast->orig_l;
        wr_mas.type = mte_node_type(mast->orig_l->node);
        mas_wr_node_walk(&wr_mas);

        mast->bn->type = wr_mas.type;
}

/*
 * mas_new_ma_node() - Create and return a new maple node.  Helper function.
 * @mas: the maple state with the allocations.
 * @b_node: the maple_big_node with the type encoding.
 *
 * Use the node type from the maple_big_node to allocate a new node from the
 * ma_state.  This function exists mainly for code readability.
 *
 * Return: A new maple encoded node
 */
static inline struct maple_enode
*mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node)
{
        return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type);
}

/*
 * mas_mab_to_node() - Allocate the nodes needed to hold the big node data.
 *
 * @mas: the maple state that contains the allocations.
 * @b_node: the node which contains the data.
 * @left: set to the left node (always allocated)
 * @right: set to the right node, or NULL if the data fits in @left
 * @middle: set to the middle node, or NULL if no middle split is needed (rare)
 * @mid_split: set to the split location for the middle node, or 0
 *
 * Nodes are taken from @mas in the order left, right, middle.
 *
 * Return: the split of left.
 */
static inline unsigned char mas_mab_to_node(struct ma_state *mas,
        struct maple_big_node *b_node, struct maple_enode **left,
        struct maple_enode **right, struct maple_enode **middle,
        unsigned char *mid_split)
{
        unsigned char capacity = mt_slots[b_node->type];
        unsigned char split;

        *left = mas_new_ma_node(mas, b_node);
        *right = NULL;
        *middle = NULL;
        *mid_split = 0;

        /* Everything fits in the left node: no split, no extra nodes. */
        if (b_node->b_end < capacity)
                return b_node->b_end;

        split = mab_calc_split(mas, b_node, mid_split);
        *right = mas_new_ma_node(mas, b_node);

        /* mab_calc_split() reports a three-way split through *mid_split. */
        if (*mid_split)
                *middle = mas_new_ma_node(mas, b_node);

        return split;
}

/*
 * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end
 * pointer.
 * @b_node: the big node to add the entry
 * @mas: the maple state to get the pivot (mas->max)
 * @entry: the entry to add, if NULL nothing happens.
 */
static inline void mab_set_b_end(struct maple_big_node *b_node,
                                 struct ma_state *mas,
                                 void *entry)
{
        if (!entry)
                return;

        b_node->slot[b_node->b_end] = entry;
        if (mt_is_alloc(mas->tree))
                b_node->gap[b_node->b_end] = mas_max_gap(mas);
        b_node->pivot[b_node->b_end++] = mas->max;
}

/*
 * mas_set_split_parent() - combine_then_separate helper function.  Sets the
 * parent of @mas->node to either @left or @right, depending on @slot and
 * @split, then advances @slot.
 *
 * @mas: the maple state with the node that needs a parent
 * @left: possible parent 1, used when *@slot <= @split
 * @right: possible parent 2, used otherwise (skipped if NULL)
 * @slot: in/out, the slot the mas->node was placed; incremented on return
 * @split: the split location between @left and @right
 *
 * Nothing happens, including the increment, if @mas is in the none state.
 */
static inline void mas_set_split_parent(struct ma_state *mas,
                                        struct maple_enode *left,
                                        struct maple_enode *right,
                                        unsigned char *slot, unsigned char split)
{
        unsigned char pos;

        if (mas_is_none(mas))
                return;

        pos = *slot;
        if (pos <= split)
                mas_set_parent(mas, mas->node, left, pos);
        else if (right)
                /* Slots past the split restart from 0 in the right node. */
                mas_set_parent(mas, mas->node, right, pos - split - 1);

        *slot = pos + 1;
}

/*
 * mte_mid_split_check() - Check if the next node passes the mid-split
 * @l: Pointer to the current left encoded maple node.
 * @r: Pointer to the current right encoded maple node.
 * @right: The final right encoded maple node.
 * @slot: The offset
 * @split: Pointer to the split location in use.
 * @mid_split: The middle split.
 *
 * Once @slot has reached @mid_split, the pair (*@l, *@r) is advanced to
 * (*@r, @right) and *@split becomes @mid_split.  Nothing changes if *@r is
 * already @right.
 */
static inline void mte_mid_split_check(struct maple_enode **l,
                                       struct maple_enode **r,
                                       struct maple_enode *right,
                                       unsigned char slot,
                                       unsigned char *split,
                                       unsigned char mid_split)
{
        if (*r == right || slot < mid_split)
                return;

        *l = *r;
        *r = right;
        *split = mid_split;
}

/*
 * mast_set_split_parents() - Helper function to set three nodes parents.  Slot
 * is taken from @mast->l.
 * @mast: the maple subtree state
 * @left: the left node
 * @middle: the middle node, may be NULL
 * @right: the right node
 * @split: the split location.
 * @mid_split: the split location between @middle and @right
 *
 * The nodes in @mast->l, @mast->m and @mast->r are handled in that order, in
 * consecutive slots.  Nothing is done if @mast->l is in the none state.
 */
static inline void mast_set_split_parents(struct maple_subtree_state *mast,
                                          struct maple_enode *left,
                                          struct maple_enode *middle,
                                          struct maple_enode *right,
                                          unsigned char split,
                                          unsigned char mid_split)
{
        struct ma_state *children[3] = { mast->l, mast->m, mast->r };
        struct maple_enode *lo = left;
        struct maple_enode *hi = middle ? middle : right;
        unsigned char slot;
        int i;

        if (mas_is_none(mast->l))
                return;

        slot = mast->l->offset;

        for (i = 0; i < 3; i++) {
                /* Switch to the (middle, right) pair once past mid_split. */
                mte_mid_split_check(&lo, &hi, right, slot, &split, mid_split);
                mas_set_split_parent(children[i], lo, hi, &slot, split);
        }
}

/*
 * mas_topiary_node() - Dispose of a single node
 * @mas: The maple state (not used by this function)
 * @tmp_mas: The maple state whose node is to be disposed of
 * @in_rcu: If the tree is in rcu mode (not used by this function)
 *
 * The node in @tmp_mas is marked dead and handed to ma_free_rcu().  Nothing
 * is done if @tmp_mas is in the none state.
 */
static inline void mas_topiary_node(struct ma_state *mas,
                struct ma_state *tmp_mas, bool in_rcu)
{
        if (!mas_is_none(tmp_mas)) {
                struct maple_enode *dead = tmp_mas->node;
                struct maple_node *node = mte_to_node(dead);

                mte_set_node_dead(dead);
                ma_free_rcu(node);
        }
}

/*
 * mas_topiary_replace() - Replace the data with new data, then repair the
 * parent links within the new tree.  Iterate over the dead sub-tree and collect
 * the dead subtrees and topiary the nodes that are no longer of use.
 *
 * The new tree will have up to three children with the correct parent.  Keep
 * track of the new entries as they need to be followed to find the next level
 * of new entries.
 *
 * The old tree will have up to three children with the old parent.  Keep track
 * of the old entries as they may have more nodes below replaced.  Nodes within
 * [index, last] are dead subtrees, others need to be freed and followed.
 *
 * @mas: The maple state pointing at the new data
 * @old_enode: The maple encoded node being replaced
 * @new_height: The new height of the tree as a result of the operation
 *
 * If @old_enode is a leaf it is simply passed to mas_free() after the parent
 * pointers of the new tree have been updated.
 */
static inline void mas_topiary_replace(struct ma_state *mas,
                struct maple_enode *old_enode, unsigned char new_height)
{
        /* Current level (tmp) and next level (tmp_next), up to 3 nodes each. */
        struct ma_state tmp[3], tmp_next[3];
        MA_TOPIARY(subtrees, mas->tree);
        bool in_rcu;
        int i, n;

        /* Place data in tree & then mark node as old */
        mas_put_in_tree(mas, old_enode, new_height);

        /* Update the parent pointers in the tree */
        tmp[0] = *mas;
        tmp[0].offset = 0;
        tmp[1].status = ma_none;
        tmp[2].status = ma_none;
        while (!mte_is_leaf(tmp[0].node)) {
                /* n counts the children gathered for the next level. */
                n = 0;
                for (i = 0; i < 3; i++) {
                        if (mas_is_none(&tmp[i]))
                                continue;

                        while (n < 3) {
                                if (!mas_find_child(&tmp[i], &tmp_next[n]))
                                        break;
                                n++;
                        }

                        mas_adopt_children(&tmp[i], tmp[i].node);
                }

                if (MAS_WARN_ON(mas, n == 0))
                        break;

                /* Mark the unused next-level entries as empty. */
                while (n < 3)
                        tmp_next[n++].status = ma_none;

                for (i = 0; i < 3; i++)
                        tmp[i] = tmp_next[i];
        }

        /* Collect the old nodes that need to be discarded */
        if (mte_is_leaf(old_enode))
                return mas_free(mas, old_enode);

        /* Second walk: same shape as above, but over the old sub-tree. */
        tmp[0] = *mas;
        tmp[0].offset = 0;
        tmp[0].node = old_enode;
        tmp[1].status = ma_none;
        tmp[2].status = ma_none;
        in_rcu = mt_in_rcu(mas->tree);
        do {
                n = 0;
                for (i = 0; i < 3; i++) {
                        if (mas_is_none(&tmp[i]))
                                continue;

                        while (n < 3) {
                                if (!mas_find_child(&tmp[i], &tmp_next[n]))
                                        break;

                                /*
                                 * A child whose limits lie entirely inside
                                 * [index, last] is added to the subtrees list
                                 * and its slot in tmp_next is reused; any
                                 * other child is kept for the next level.
                                 * Note tmp_next->index/last read element 0.
                                 */
                                if ((tmp_next[n].min >= tmp_next->index) &&
                                    (tmp_next[n].max <= tmp_next->last)) {
                                        mat_add(&subtrees, tmp_next[n].node);
                                        tmp_next[n].status = ma_none;
                                } else {
                                        n++;
                                }
                        }
                }

                if (MAS_WARN_ON(mas, n == 0))
                        break;

                while (n < 3)
                        tmp_next[n++].status = ma_none;

                /* Dispose of this level's nodes, then step down a level. */
                for (i = 0; i < 3; i++) {
                        mas_topiary_node(mas, &tmp[i], in_rcu);
                        tmp[i] = tmp_next[i];
                }
        } while (!mte_is_leaf(tmp[0].node));

        /* Dispose of the nodes left in tmp when the loop ended. */
        for (i = 0; i < 3; i++)
                mas_topiary_node(mas, &tmp[i], in_rcu);

        mas_mat_destroy(mas, &subtrees);
}

/*
 * mas_wmb_replace() - Write memory barrier and replace
 * @mas: The maple state
 * @old_enode: The old maple encoded node that is being replaced.
 * @new_height: The new height of the tree as a result of the operation
 *
 * Puts the new data in the tree via mas_topiary_replace() and then updates
 * the gap, unless @mas->node is a leaf.
 */
static inline void mas_wmb_replace(struct ma_state *mas,
                struct maple_enode *old_enode, unsigned char new_height)
{
        /* Insert the new data in the tree */
        mas_topiary_replace(mas, old_enode, new_height);

        if (!mte_is_leaf(mas->node))
                mas_update_gap(mas);
}

/*
 * mast_cp_to_nodes() - Copy data out to nodes.
 * @mast: The maple subtree state
 * @left: The left encoded maple node
 * @middle: The middle encoded maple node
 * @right: The right encoded maple node
 * @split: The location to split between left and (middle ? middle : right)
 * @mid_split: The location to split between middle and right.
 */
static inline void mast_cp_to_nodes(struct maple_subtree_state *mast,
        struct maple_enode *left, struct maple_enode *middle,
        struct maple_enode *right, unsigned char split, unsigned char mid_split)
{
        bool new_lmax = true;

        mas_node_or_none(mast->l, left);
        mas_node_or_none(mast->m, middle);
        mas_node_or_none(mast->r, right);

        mast->l->min = mast->orig_l->min;
        if (split == mast->bn->b_end) {
                mast->l->max = mast->orig_r->max;
                new_lmax = false;
        }

        mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax);

        if (middle) {
                mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true);
                mast->m->min = mast->bn->pivot[split] + 1;
                split = mid_split;
        }

        mast->r->max = mast->orig_r->max;
        if (right) {
                mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false);
                mast->r->min = mast->bn->pivot[split] + 1;
        }
}

/*
 * mast_combine_cp_left - Copy the slots of the original left node that sit
 * before its offset into the start of the maple subtree state big node.
 * @mast: The maple subtree state
 *
 * Nothing is copied when the original left offset is 0.
 */
static inline void mast_combine_cp_left(struct maple_subtree_state *mast)
{
        unsigned char stop = mast->orig_l->offset;

        if (stop != 0)
                mas_mab_cp(mast->orig_l, 0, stop - 1, mast->bn, 0);
}

/*
 * mast_combine_cp_right: Copy in the original right side of the tree into the
 * combined data set in the maple subtree state big node.
 * @mast: The maple subtree state
 */
static inline void mast_combine_cp_right(struct maple_subtree_state *mast)
{
        if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max)
                return;

        mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1,
                   mt_slot_count(mast->orig_r->node), mast->bn,
                   mast->bn->b_end);
        mast->orig_r->last = mast->orig_r->max;
}

/*
 * mast_sufficient: Check if the maple subtree state has enough data in the big
 * node to create at least one sufficient node
 * @mast: the maple subtree state
 *
 * Return: true if the big node end is above the minimum slot count of the
 * original left node, false otherwise.
 */
static inline bool mast_sufficient(struct maple_subtree_state *mast)
{
        return mast->bn->b_end > mt_min_slot_count(mast->orig_l->node);
}

/*
 * mast_overflow: Check if there is too much data in the subtree state for a
 * single node.
 * @mast: The maple subtree state
 *
 * Return: true if the big node end is above the slot count of the original
 * left node, false otherwise.
 */
static inline bool mast_overflow(struct maple_subtree_state *mast)
{
        return mast->bn->b_end > mt_slot_count(mast->orig_l->node);
}

/*
 * mtree_range_walk() - Walk down from @mas->node to the leaf slot that covers
 * @mas->index.
 * @mas: The maple state
 *
 * At each level the first pivot that is >= @mas->index selects the slot to
 * follow.  On success the state describes the leaf reached: node, offset and
 * end are set, min/max hold the limits of that leaf node, and index/last are
 * overwritten with the limits of the slot found.
 *
 * Return: the content of the slot found, or NULL after calling mas_reset() if
 * a dead node was encountered on the way down.
 */
static inline void *mtree_range_walk(struct ma_state *mas)
{
        unsigned long *pivots;
        unsigned char offset;
        struct maple_node *node;
        struct maple_enode *next, *last;
        enum maple_type type;
        void __rcu **slots;
        unsigned char end;
        unsigned long max, min;
        unsigned long prev_max, prev_min;

        next = mas->node;
        min = mas->min;
        max = mas->max;
        do {
                last = next;
                node = mte_to_node(next);
                type = mte_node_type(next);
                pivots = ma_pivots(node, type);
                end = ma_data_end(node, type, pivots, max);
                /* Remember this node's limits before narrowing to a slot. */
                prev_min = min;
                prev_max = max;
                if (pivots[0] >= mas->index) {
                        /* Slot 0: min is unchanged, only max is narrowed. */
                        offset = 0;
                        max = pivots[0];
                        goto next;
                }

                offset = 1;
                while (offset < end) {
                        if (pivots[offset] >= mas->index) {
                                max = pivots[offset];
                                break;
                        }
                        offset++;
                }

                /* If no pivot matched, offset == end and max is kept. */
                min = pivots[offset - 1] + 1;
next:
                slots = ma_slots(node, type);
                next = mt_slot(mas->tree, slots, offset);
                /* The dead check comes after the slot has been read. */
                if (unlikely(ma_dead_node(node)))
                        goto dead_node;
        } while (!ma_is_leaf(type));

        mas->end = end;
        mas->offset = offset;
        mas->index = min;
        mas->last = max;
        mas->min = prev_min;
        mas->max = prev_max;
        mas->node = last;
        return (void *)next;

dead_node:
        mas_reset(mas);
        return NULL;
}

/*
 * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers.
 * @mas: The starting maple state
 * @mast: The maple_subtree_state, keeps track of 4 maple states.
 * @count: The estimated count of iterations needed.
 *
 * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root
 * is hit.  First @b_node is split into two entries which are inserted into the
 * next iteration of the loop.  @b_node is returned populated with the final
 * iteration. @mas is used to obtain allocations.  orig_l_mas keeps track of the
 * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last
 * to account of what has been copied into the new sub-tree.  The update of
 * orig_l_mas->last is used in mas_consume to find the slots that will need to
 * be either freed or destroyed.  orig_l_mas->depth keeps track of the height of
 * the new sub-tree in case the sub-tree becomes the full tree.
 *
 * NOTE(review): the paragraph above names @l_mas, @r_mas, @b_node, orig_l_mas
 * and mas_consume, none of which are parameters or callees of this function as
 * written; they presumably correspond to @mast->orig_l, @mast->orig_r and
 * @mast->bn - confirm before relying on it.
 *
 * On return the new sub-tree has been placed in the tree with
 * mas_wmb_replace() and @mas has been re-walked with mtree_range_walk().
 */
static void mas_spanning_rebalance(struct ma_state *mas,
                struct maple_subtree_state *mast, unsigned char count)
{
        unsigned char split, mid_split;
        /* Parent slot for @left in the final node; set when the loop breaks. */
        unsigned char slot = 0;
        unsigned char new_height = 0; /* used if node is a new root */
        struct maple_enode *left = NULL, *middle = NULL, *right = NULL;
        struct maple_enode *old_enode;

        MA_STATE(l_mas, mas->tree, mas->index, mas->index);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);
        MA_STATE(m_mas, mas->tree, mas->index, mas->index);

        /*
         * The tree needs to be rebalanced and leaves need to be kept at the same level.
         * Rebalancing is done by use of the ``struct maple_topiary``.
         */
        mast->l = &l_mas;
        mast->m = &m_mas;
        mast->r = &r_mas;
        l_mas.status = r_mas.status = m_mas.status = ma_none;

        /* Check if this is not root and has sufficient data.  */
        if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) &&
            unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type]))
                mast_spanning_rebalance(mast);

        /*
         * Each level of the tree is examined and balanced, pushing data to the left or
         * right, or rebalancing against left or right nodes is employed to avoid
         * rippling up the tree to limit the amount of churn.  Once a new sub-section of
         * the tree is created, there may be a mix of new and old nodes.  The old nodes
         * will have the incorrect parent pointers and currently be in two trees: the
         * original tree and the partially new tree.  To remedy the parent pointers in
         * the old tree, the new data is swapped into the active tree and a walk down
         * the tree is performed and the parent pointers are updated.
         * See mas_topiary_replace() for more information.
         */
        while (count--) {
                /* b_end is decremented before the data is split into nodes. */
                mast->bn->b_end--;
                mast->bn->type = mte_node_type(mast->orig_l->node);
                split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle,
                                        &mid_split);
                mast_set_split_parents(mast, left, middle, right, split,
                                       mid_split);
                mast_cp_to_nodes(mast, left, middle, right, split, mid_split);
                new_height++;

                /*
                 * Copy data from next level in the tree to mast->bn from next
                 * iteration
                 */
                memset(mast->bn, 0, sizeof(struct maple_big_node));
                mast->bn->type = mte_node_type(left);

                /* Root already stored in l->node. */
                if (mas_is_root_limits(mast->l))
                        goto new_root;

                mast_ascend(mast);
                mast_combine_cp_left(mast);
                /* The new nodes are appended after the copied left data. */
                l_mas.offset = mast->bn->b_end;
                mab_set_b_end(mast->bn, &l_mas, left);
                mab_set_b_end(mast->bn, &m_mas, middle);
                mab_set_b_end(mast->bn, &r_mas, right);

                /* Copy anything necessary out of the right node. */
                mast_combine_cp_right(mast);
                mast->orig_l->last = mast->orig_l->max;

                if (mast_sufficient(mast)) {
                        if (mast_overflow(mast))
                                continue;

                        if (mast->orig_l->node == mast->orig_r->node) {
                               /*
                                * The data in b_node should be stored in one
                                * node and in the tree
                                */
                                slot = mast->l->offset;
                                break;
                        }

                        continue;
                }

                /* May be a new root stored in mast->bn */
                if (mas_is_root_limits(mast->orig_l))
                        break;

                mast_spanning_rebalance(mast);

                /* rebalancing from other nodes may require another loop. */
                if (!count)
                        count++;
        }

        /* Allocate the final node and fill it from what is left in bn. */
        l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)),
                                mte_node_type(mast->orig_l->node));

        mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true);
        new_height++;
        /* left, middle and right take consecutive slots in the final node. */
        mas_set_parent(mas, left, l_mas.node, slot);
        if (middle)
                mas_set_parent(mas, middle, l_mas.node, ++slot);

        if (right)
                mas_set_parent(mas, right, l_mas.node, ++slot);

        if (mas_is_root_limits(mast->l)) {
                /* The loop jumps here when mast->l already spans root limits. */
new_root:
                mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas));
                /* Climb so orig_l->node is the old root, replaced below. */
                while (!mte_is_root(mast->orig_l->node))
                        mast_ascend(mast);
        } else {
                mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent;
        }

        /* Point @mas at the new sub-tree and swap it in for the old node. */
        old_enode = mast->orig_l->node;
        mas->depth = l_mas.depth;
        mas->node = l_mas.node;
        mas->min = l_mas.min;
        mas->max = l_mas.max;
        mas->offset = l_mas.offset;
        mas_wmb_replace(mas, old_enode, new_height);
        mtree_range_walk(mas);
        return;
}

/*
 * mas_rebalance() - Rebalance a given node.
 * @mas: The maple state
 * @b_node: The big maple node.
 *
 * Rebalance two nodes into a single node or two new nodes that are sufficient.
 * Continue upwards until tree is sufficient.
 *
 * The data of the next sibling is appended to @b_node if one exists;
 * otherwise the data of the previous sibling is placed in front of it.  The
 * combined data is then handed to mas_spanning_rebalance().
 */
static inline void mas_rebalance(struct ma_state *mas,
                                struct maple_big_node *b_node)
{
        char levels = mas_mt_height(mas);
        struct maple_subtree_state mast;
        unsigned char moved, start = ++b_node->b_end;

        MA_STATE(l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);

        trace_ma_op(TP_FCT, mas);

        /*
         * Rebalancing occurs if a node is insufficient.  Data is rebalanced
         * against the node to the right if it exists, otherwise the node to the
         * left of this node is rebalanced against this node.  If rebalancing
         * causes just one node to be produced instead of two, then the parent
         * is also examined and rebalanced if it is insufficient.  Every level
         * tries to combine the data in the same way.  If one node contains the
         * entire range of the tree, then that node is used as a new root node.
         */

        mast.orig_l = &l_mas;
        mast.orig_r = &r_mas;
        mast.bn = b_node;
        mast.bn->type = mte_node_type(mas->node);

        r_mas = *mas;
        l_mas = r_mas;

        if (!mas_next_sibling(&r_mas)) {
                /* No node to the right: prepend the previous sibling. */
                mas_prev_sibling(&l_mas);
                moved = mas_data_end(&l_mas) + 1;
                mab_shift_right(b_node, moved);
                mas->offset += moved;
                mas_mab_cp(&l_mas, 0, moved - 1, b_node, 0);
                b_node->b_end = moved + start;
                l_mas.last = l_mas.min;
                l_mas.index = l_mas.last;
        } else {
                /* Append the next sibling after the existing data. */
                mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node,
                           start);
                r_mas.index = r_mas.max;
                r_mas.last = r_mas.index;
        }

        mas_spanning_rebalance(mas, &mast, levels);
}

/*
 * mas_split_final_node() - Split the final node in a subtree operation.
 * @mast: the maple subtree state
 * @mas: The maple state
 */
static inline void mas_split_final_node(struct maple_subtree_state *mast,
                                        struct ma_state *mas)
{
        struct maple_enode *ancestor;

        if (mte_is_root(mas->node)) {
                if (mt_is_alloc(mas->tree))
                        mast->bn->type = maple_arange_64;
                else
                        mast->bn->type = maple_range_64;
        }
        /*
         * Only a single node is used here, could be root.
         * The Big_node data should just fit in a single node.
         */
        ancestor = mas_new_ma_node(mas, mast->bn);
        mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset);
        mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset);
        mte_to_node(ancestor)->parent = mas_mn(mas)->parent;

        mast->l->node = ancestor;
        mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true);
        mas->offset = mast->bn->b_end - 1;
}

/*
 * mast_fill_bnode() - Copy data into the big node in the subtree state
 * @mast: The maple subtree state
 * @mas: the maple state
 * @skip: The number of entries to skip for new nodes insertion.
 */
static inline void mast_fill_bnode(struct maple_subtree_state *mast,
                                         struct ma_state *mas,
                                         unsigned char skip)
{
        bool cp = true;
        unsigned char split;

        memset(mast->bn, 0, sizeof(struct maple_big_node));

        if (mte_is_root(mas->node)) {
                cp = false;
        } else {
                mas_ascend(mas);
                mas->offset = mte_parent_slot(mas->node);
        }

        if (cp && mast->l->offset)
                mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0);

        split = mast->bn->b_end;
        mab_set_b_end(mast->bn, mast->l, mast->l->node);
        mast->r->offset = mast->bn->b_end;
        mab_set_b_end(mast->bn, mast->r, mast->r->node);
        if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max)
                cp = false;

        if (cp)
                mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1,
                           mast->bn, mast->bn->b_end);

        mast->bn->b_end--;
        mast->bn->type = mte_node_type(mas->node);
}

/*
 * mast_split_data() - Split the data in the subtree state big node into regular
 * nodes.
 * @mast: The maple subtree state
 * @mas: The maple state
 * @split: The location to split the big node
 */
static inline void mast_split_data(struct maple_subtree_state *mast,
           struct ma_state *mas, unsigned char split)
{
        unsigned char p_slot;

        mab_mas_cp(mast->bn, 0, split, mast->l, true);
        mte_set_pivot(mast->r->node, 0, mast->r->max);
        mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false);
        mast->l->offset = mte_parent_slot(mas->node);
        mast->l->max = mast->bn->pivot[split];
        mast->r->min = mast->l->max + 1;
        if (mte_is_leaf(mas->node))
                return;

        p_slot = mast->orig_l->offset;
        mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node,
                             &p_slot, split);
        mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node,
                             &p_slot, split);
}

/*
 * mas_push_data() - Instead of splitting a node, it is beneficial to push the
 * data to the right or left node if there is room.
 * @mas: The maple state
 * @mast: The maple subtree state
 * @left: Push left or not.
 *
 * Keeping the height of the tree low means faster lookups.
 *
 * On success the sibling's data has been merged into mast->bn, redistributed
 * over mast->l and mast->r by mast_split_data(), and the level above has been
 * handled by mast_fill_bnode() and mas_split_final_node().  When pushing left,
 * @mas is switched to the sibling.  On failure mast->bn is not modified.
 *
 * Return: True if pushed, false otherwise.
 */
static inline bool mas_push_data(struct ma_state *mas,
                                struct maple_subtree_state *mast, bool left)
{
        unsigned char slot_total = mast->bn->b_end;
        unsigned char end, space, split;

        /* Work on a copy of @mas so it is untouched if the push is refused. */
        MA_STATE(tmp_mas, mas->tree, mas->index, mas->last);
        tmp_mas = *mas;
        tmp_mas.depth = mast->l->depth;

        /* Move the copy to the requested sibling; give up if there is none. */
        if (left && !mas_prev_sibling(&tmp_mas))
                return false;
        else if (!left && !mas_next_sibling(&tmp_mas))
                return false;

        /* Total data of the big node plus the sibling vs. available space. */
        end = mas_data_end(&tmp_mas);
        slot_total += end;
        space = 2 * mt_slot_count(mas->node) - 2;
        /* -2 instead of -1 to ensure there isn't a triple split */
        if (ma_is_leaf(mast->bn->type))
                space--;

        if (mas->max == ULONG_MAX)
                space--;

        if (slot_total >= space)
                return false;

        /* Get the data; Fill mast->bn */
        mast->bn->b_end++;
        if (left) {
                /* Make room at the front, then copy the sibling in first. */
                mab_shift_right(mast->bn, end + 1);
                mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0);
                mast->bn->b_end = slot_total + 1;
        } else {
                /* The sibling's data follows the existing big node data. */
                mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end);
        }

        /* Configure mast for splitting of mast->bn */
        split = mt_slots[mast->bn->type] - 2;
        if (left) {
                /*  Switch mas to prev node  */
                *mas = tmp_mas;
                /* Start using mast->l for the left side. */
                tmp_mas.node = mast->l->node;
                *mast->l = tmp_mas;
        } else {
                tmp_mas.node = mast->r->node;
                *mast->r = tmp_mas;
                split = slot_total - split;
        }
        split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]);
        /* Update parent slot for split calculation. */
        if (left)
                mast->orig_l->offset += end + 1;

        mast_split_data(mast, mas, split);
        mast_fill_bnode(mast, mas, 2);
        mas_split_final_node(mast, mas);
        return true;
}

/*
 * mas_split() - Split data that is too big for one node into two.
 * @mas: The maple state
 * @b_node: The maple big node
 *
 * Works upwards from @mas' node one level per loop iteration.  Each iteration
 * either finishes (the big node fits in one node, or the data could be pushed
 * into a sibling) or splits @b_node into two new nodes and refills @b_node for
 * the level above.  Finally the old node is replaced through mas_wmb_replace()
 * and @mas is re-walked with mtree_range_walk().
 */
static void mas_split(struct ma_state *mas, struct maple_big_node *b_node)
{
        struct maple_subtree_state mast;
        int height = 0;
        unsigned int orig_height = mas_mt_height(mas);
        unsigned char mid_split, split = 0;
        struct maple_enode *old;

        /*
         * Splitting is handled differently from any other B-tree; the Maple
         * Tree splits upwards.  Splitting up means that the split operation
         * occurs when the walk of the tree hits the leaves and not on the way
         * down.  The reason for splitting up is that it is impossible to know
         * how much space will be needed until the leaf is (or leaves are)
         * reached.  Since overwriting data is allowed and a range could
         * overwrite more than one range or result in changing one entry into 3
         * entries, it is impossible to know if a split is required until the
         * data is examined.
         *
         * Splitting is a balancing act between keeping allocations to a minimum
         * and avoiding a 'jitter' event where a tree is expanded to make room
         * for an entry followed by a contraction when the entry is removed.  To
         * accomplish the balance, there are empty slots remaining in both left
         * and right nodes after a split.
         */
        MA_STATE(l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);
        MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last);

        trace_ma_op(TP_FCT, mas);

        mast.l = &l_mas;
        mast.r = &r_mas;
        mast.orig_l = &prev_l_mas;
        mast.orig_r = &prev_r_mas;
        mast.bn = b_node;

        /* At most orig_height + 1 iterations; height counts levels handled. */
        while (height++ <= orig_height) {
                /* The remaining data fits in a single node: finish here. */
                if (mt_slots[b_node->type] > b_node->b_end) {
                        mas_split_final_node(&mast, mas);
                        break;
                }

                /* New left and right nodes for this level. */
                l_mas = r_mas = *mas;
                l_mas.node = mas_new_ma_node(mas, b_node);
                r_mas.node = mas_new_ma_node(mas, b_node);
                /*
                 * Another way that 'jitter' is avoided is to terminate a split up early if the
                 * left or right node has space to spare.  This is referred to as "pushing left"
                 * or "pushing right" and is similar to the B* tree, except the nodes left or
                 * right can rarely be reused due to RCU, but the ripple upwards is halted which
                 * is a significant savings.
                 */
                /*
                 * NOTE(review): the extra height++ below presumably accounts
                 * for the level written by mas_push_data() through
                 * mas_split_final_node() - confirm.
                 */
                /* Try to push left. */
                if (mas_push_data(mas, &mast, true)) {
                        height++;
                        break;
                }
                /* Try to push right. */
                if (mas_push_data(mas, &mast, false)) {
                        height++;
                        break;
                }

                split = mab_calc_split(mas, b_node, &mid_split);
                mast_split_data(&mast, mas, split);
                /*
                 * Usually correct, mab_mas_cp in the above call overwrites
                 * r->max.
                 */
                mast.r->max = mas->max;
                mast_fill_bnode(&mast, mas, 1);
                /* Keep this level's nodes so the next level can reparent them. */
                prev_l_mas = *mast.l;
                prev_r_mas = *mast.r;
        }

        /* Set the original node as dead */
        old = mas->node;
        mas->node = l_mas.node;
        mas_wmb_replace(mas, old, height);
        mtree_range_walk(mas);
        return;
}

/*
 * mas_commit_b_node() - Commit the big node into the tree.
 * @wr_mas: The maple write state
 * @b_node: The maple big node
 */
static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas,
                            struct maple_big_node *b_node)
{
        enum store_type type = wr_mas->mas->store_type;

        WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store);

        if (type == wr_rebalance)
                return mas_rebalance(wr_mas->mas, b_node);

        return mas_split(wr_mas->mas, b_node);
}

/*
 * mas_root_expand() - Expand a root to a node
 * @mas: The maple state
 * @entry: The entry to store into the tree
 *
 * Pops a node from @mas and fills it as a maple_leaf_64: when mas->index is
 * non-zero the previous root contents (if any) are kept in slot 0, @entry is
 * stored for the range ending at mas->last, and a trailing pivot of ULONG_MAX
 * is added when mas->last is not ULONG_MAX.  The node then becomes the tree
 * root with a tree height of 1.  @mas is left active on the new node with
 * mas->offset set to the slot holding @entry.
 */
static inline void mas_root_expand(struct ma_state *mas, void *entry)
{
        void *contents = mas_root_locked(mas);
        enum maple_type type = maple_leaf_64;
        struct maple_node *node;
        void __rcu **slots;
        unsigned long *pivots;
        int slot = 0;

        node = mas_pop_node(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        node->parent = ma_parent_ptr(mas_tree_parent(mas));
        mas->node = mt_mk_node(node, type);
        mas->status = ma_active;

        /* Something has to occupy the range below mas->index. */
        if (mas->index) {
                if (contents) {
                        rcu_assign_pointer(slots[slot], contents);
                        /*
                         * NOTE(review): for index > 1 no pivot is written for
                         * slot 0 here; presumably the popped node is zeroed so
                         * slot 0 ends at 0 and the next slot is the empty
                         * range up to index - 1 - confirm.
                         */
                        if (likely(mas->index > 1))
                                slot++;
                }
                pivots[slot++] = mas->index - 1;
        }

        rcu_assign_pointer(slots[slot], entry);
        mas->offset = slot;
        pivots[slot] = mas->last;
        /* Terminate the node with a range that ends at ULONG_MAX. */
        if (mas->last != ULONG_MAX)
                pivots[++slot] = ULONG_MAX;

        mt_set_height(mas->tree, 1);
        /* slot is now the index of the last used slot in the node. */
        ma_set_meta(node, maple_leaf_64, 0, slot);
        /* swap the new root into the tree */
        rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
        return;
}

/*
 * mas_store_root() - Storing value into root.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * There is no root node now and we are storing a value into the root - this
 * function either assigns the pointer or expands into a node.
 *
 * A %NULL @entry only clears the root pointer, and only when mas->index is 0.
 * A non-%NULL @entry is placed directly in the root pointer only when the
 * range is exactly [0, 0] and the low two bits of the pointer are not 0b10;
 * every other case goes through mas_root_expand().
 */
static inline void mas_store_root(struct ma_state *mas, void *entry)
{
        if (!entry) {
                if (!mas->index)
                        rcu_assign_pointer(mas->tree->ma_root, NULL);
                return;
        }

        /* Any range other than [0, 0] cannot live in the root pointer. */
        if (likely((mas->last != 0) || (mas->index != 0))) {
                mas_root_expand(mas, entry);
                return;
        }

        /*
         * NOTE(review): low bits 0b10 presumably collide with the encoding of
         * a node pointer in ma_root - confirm against xa_is_node().
         */
        if (((unsigned long) (entry) & 3) == 2) {
                mas_root_expand(mas, entry);
                return;
        }

        rcu_assign_pointer(mas->tree->ma_root, entry);
        mas->status = ma_start;
}

/*
 * mas_is_span_wr() - Check if the write needs to be treated as a write that
 * spans the node.
 * @wr_mas: The maple write state
 *
 * Spanning writes are writes that start in one node and end in another OR if
 * the write of a %NULL will cause the node to end with a %NULL.
 *
 * Return: True if this is a spanning write, false otherwise.
 */
static bool mas_is_span_wr(struct ma_wr_state *wr_mas)
{
        unsigned long max = wr_mas->r_max;
        unsigned long last = wr_mas->mas->last;
        enum maple_type type = wr_mas->type;
        void *entry = wr_mas->entry;

        /* Contained in this pivot, fast path */
        if (last < max)
                return false;

        if (ma_is_leaf(type)) {
                max = wr_mas->mas->max;
                if (last < max)
                        return false;
        }

        if (last == max) {
                /*
                 * The last entry of leaf node cannot be NULL unless it is the
                 * rightmost node (writing ULONG_MAX), otherwise it spans slots.
                 */
                if (entry || last == ULONG_MAX)
                        return false;
        }

        trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry);
        return true;
}

static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas)
{
        wr_mas->type = mte_node_type(wr_mas->mas->node);
        mas_wr_node_walk(wr_mas);
        wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type);
}

static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas)
{
        wr_mas->mas->max = wr_mas->r_max;
        wr_mas->mas->min = wr_mas->r_min;
        wr_mas->mas->node = wr_mas->content;
        wr_mas->mas->offset = 0;
        wr_mas->mas->depth++;
}
/*
 * mas_wr_walk() - Walk the tree for a write.
 * @wr_mas: The maple write state
 *
 * Uses mas_slot_locked() and does not need to worry about dead nodes.
 *
 * While descending through internal nodes, wr_mas->vacant_height and
 * wr_mas->sufficient_height are updated from each node's mas->end.
 *
 * Return: True if it's contained in a node, false on spanning write.
 */
static bool mas_wr_walk(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        for (;;) {
                mas_wr_walk_descend(wr_mas);
                if (unlikely(mas_is_span_wr(wr_mas)))
                        return false;

                wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                  mas->offset);
                if (ma_is_leaf(wr_mas->type))
                        break;

                /* This internal node still has a free slot. */
                if (mas->end < mt_slots[wr_mas->type] - 1)
                        wr_mas->vacant_height = mas->depth + 1;

                if (!ma_is_root(mas_mn(mas))) {
                        if (mas->end > mt_min_slots[wr_mas->type] + 1)
                                wr_mas->sufficient_height = mas->depth + 1;
                } else if (mas->end > 2) {
                        /* root needs more than 2 entries to be sufficient + 1 */
                        wr_mas->sufficient_height = 1;
                }

                mas_wr_walk_traverse(wr_mas);
        }

        return true;
}

/*
 * mas_wr_walk_index() - Walk the write state down to a leaf.
 * @wr_mas: The maple write state
 *
 * Same descent as mas_wr_walk() but without the spanning-write check and
 * without height bookkeeping.  On return wr_mas->content holds the slot
 * contents at mas->offset in the leaf.
 */
static void mas_wr_walk_index(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        for (;;) {
                mas_wr_walk_descend(wr_mas);
                wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                  mas->offset);
                if (ma_is_leaf(wr_mas->type))
                        break;

                mas_wr_walk_traverse(wr_mas);
        }
}
/*
 * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs.
 * @l_wr_mas: The left maple write state
 * @r_wr_mas: The right maple write state
 *
 * Moves the left state's index (and offset) down and the right state's last
 * (and offset) up so that the stored range absorbs %NULL slots touching it.
 */
static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas,
                                            struct ma_wr_state *r_wr_mas)
{
        struct ma_state *r_mas = r_wr_mas->mas;
        struct ma_state *l_mas = l_wr_mas->mas;
        unsigned char l_slot;

        l_slot = l_mas->offset;
        /* The slot being written is already NULL: start at its beginning. */
        if (!l_wr_mas->content)
                l_mas->index = l_wr_mas->r_min;

        /* Starting on a slot boundary with a NULL slot before it: absorb it. */
        if ((l_mas->index == l_wr_mas->r_min) &&
                 (l_slot &&
                  !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) {
                if (l_slot > 1)
                        l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1;
                else
                        l_mas->index = l_mas->min;

                l_mas->offset = l_slot - 1;
        }

        if (!r_wr_mas->content) {
                /* The right slot is NULL: cover all of it. */
                if (r_mas->last < r_wr_mas->r_max)
                        r_mas->last = r_wr_mas->r_max;
                r_mas->offset++;
        } else if ((r_mas->last == r_wr_mas->r_max) &&
            (r_mas->last < r_mas->max) &&
            !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) {
                /* Ending on a slot boundary with a NULL slot after it. */
                r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots,
                                             r_wr_mas->type, r_mas->offset + 1);
                r_mas->offset++;
        }
}

/*
 * mas_state_walk() - Start the maple state and walk the tree if needed.
 * @mas: The maple state
 *
 * Return: %NULL when the state is none after mas_start(), the value returned
 * by mas_start() when the state is a pointer state, otherwise the result of
 * mtree_range_walk().
 */
static inline void *mas_state_walk(struct ma_state *mas)
{
        void *entry = mas_start(mas);

        if (mas_is_none(mas))
                return NULL;

        return mas_is_ptr(mas) ? entry : mtree_range_walk(mas);
}

/*
 * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up
 * to date.
 *
 * @mas: The maple state.
 *
 * Starting at mas->node, each level is searched for the first pivot that is
 * greater than or equal to mas->index and the matching slot is followed until
 * a leaf has been processed.  Only mas->index and mas->tree are read; @mas is
 * only written (mas_reset()) when a dead node is seen.
 *
 * Note: Leaves mas in undesirable state.
 * Return: The entry for @mas->index or %NULL on dead node.
 */
static inline void *mtree_lookup_walk(struct ma_state *mas)
{
        unsigned long *pivots;
        unsigned char offset;
        struct maple_node *node;
        struct maple_enode *next;
        enum maple_type type;
        void __rcu **slots;
        unsigned char end;

        next = mas->node;
        do {
                node = mte_to_node(next);
                type = mte_node_type(next);
                pivots = ma_pivots(node, type);
                end = mt_pivots[type];
                offset = 0;
                /*
                 * Stop at the first pivot >= index.  If none matches, offset
                 * ends up past the last pivot checked.
                 */
                do {
                        if (pivots[offset] >= mas->index)
                                break;
                } while (++offset < end);

                slots = ma_slots(node, type);
                next = mt_slot(mas->tree, slots, offset);
                /*
                 * NOTE(review): the dead-node check comes after the slot read,
                 * presumably so a value read from a node that died meanwhile
                 * is never used - confirm.
                 */
                if (unlikely(ma_dead_node(node)))
                        goto dead_node;
        } while (!ma_is_leaf(type));

        /* The slot read from the leaf is the entry itself. */
        return (void *)next;

dead_node:
        mas_reset(mas);
        return NULL;
}

static void mte_destroy_walk(struct maple_enode *, struct maple_tree *);
/*
 * mas_new_root() - Create a new root node that only contains the entry passed
 * in.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * Only valid when the index == 0 and the last == ULONG_MAX
 *
 * A %NULL @entry empties the tree (height 0, %NULL root).  Otherwise a node is
 * popped from @mas, set up as a maple_leaf_64 with @entry in slot 0 and
 * installed as the root with height 1.  If the previous root was a node, it is
 * handed to mte_destroy_walk() afterwards.
 */
static inline void mas_new_root(struct ma_state *mas, void *entry)
{
        struct maple_enode *old_root = mas_root_locked(mas);

        WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX);

        if (entry) {
                enum maple_type type = maple_leaf_64;
                struct maple_node *node = mas_pop_node(mas);
                unsigned long *pivots = ma_pivots(node, type);
                void __rcu **slots = ma_slots(node, type);

                node->parent = ma_parent_ptr(mas_tree_parent(mas));
                mas->node = mt_mk_node(node, type);
                mas->status = ma_active;
                rcu_assign_pointer(slots[0], entry);
                pivots[0] = mas->last;
                mt_set_height(mas->tree, 1);
                rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
        } else {
                mt_set_height(mas->tree, 0);
                rcu_assign_pointer(mas->tree->ma_root, entry);
                mas->status = ma_start;
        }

        /* The old tree is torn down only after the new root is in place. */
        if (xa_is_node(old_root))
                mte_destroy_walk(old_root, mas->tree);
}
/*
 * mas_wr_spanning_store() - Create a subtree with the store operation completed
 * and new nodes where necessary, then place the sub-tree in the actual tree.
 * Note that mas is expected to point to the node which caused the store to
 * span.
 * @wr_mas: The maple write state
 *
 * A store covering 0 to ULONG_MAX, either as requested or after extending a
 * %NULL store over neighbouring %NULLs, is turned into mas_new_root().
 */
static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas)
{
        struct maple_subtree_state mast;
        struct maple_big_node b_node;
        struct ma_state *mas;
        unsigned char height;

        /* Left and Right side of spanning store */
        MA_STATE(l_mas, NULL, 0, 0);
        MA_STATE(r_mas, NULL, 0, 0);
        MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry);
        MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry);

        /*
         * A store operation that spans multiple nodes is called a spanning
         * store and is handled early in the store call stack by the function
         * mas_is_span_wr().  When a spanning store is identified, the maple
         * state is duplicated.  The first maple state walks the left tree path
         * to ``index``, the duplicate walks the right tree path to ``last``.
         * The data in the two nodes are combined into a single node, two nodes,
         * or possibly three nodes (see the 3-way split above).  A ``NULL``
         * written to the last entry of a node is considered a spanning store as
         * a rebalance is required for the operation to complete and an overflow
         * of data may happen.
         */
        mas = wr_mas->mas;
        trace_ma_op(TP_FCT, mas);

        /* Overwriting the whole tree is just a new root. */
        if (unlikely(!mas->index && mas->last == ULONG_MAX))
                return mas_new_root(mas, wr_mas->entry);
        /*
         * Node rebalancing may occur due to this store, so there may be three new
         * entries per level plus a new root.
         */
        height = mas_mt_height(mas);

        /*
         * Set up right side.  Need to get to the next offset after the spanning
         * store to ensure it's not NULL and to combine both the next node and
         * the node with the start together.
         */
        r_mas = *mas;
        /* Avoid overflow, walk to next slot in the tree. */
        if (r_mas.last + 1)
                r_mas.last++;

        r_mas.index = r_mas.last;
        mas_wr_walk_index(&r_wr_mas);
        /* Walk done; restore the real end of the store. */
        r_mas.last = r_mas.index = mas->last;

        /* Set up left side. */
        l_mas = *mas;
        mas_wr_walk_index(&l_wr_mas);

        if (!wr_mas->entry) {
                mas_extend_spanning_null(&l_wr_mas, &r_wr_mas);
                /* Propagate the possibly widened range back to @mas. */
                mas->offset = l_mas.offset;
                mas->index = l_mas.index;
                mas->last = l_mas.last = r_mas.last;
        }

        /* expanding NULLs may make this cover the entire range */
        if (!l_mas.index && r_mas.last == ULONG_MAX) {
                mas_set_range(mas, 0, ULONG_MAX);
                return mas_new_root(mas, wr_mas->entry);
        }

        memset(&b_node, 0, sizeof(struct maple_big_node));
        /* Copy l_mas and store the value in b_node. */
        mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
        /* Copy r_mas into b_node if there is anything to copy. */
        if (r_mas.max > r_mas.last)
                mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
                           &b_node, b_node.b_end + 1);
        else
                b_node.b_end++;

        /* Stop spanning searches by searching for just index. */
        l_mas.index = l_mas.last = mas->index;

        mast.bn = &b_node;
        mast.orig_l = &l_mas;
        mast.orig_r = &r_mas;
        /* Combine l_mas and r_mas and split them up evenly again. */
        return mas_spanning_rebalance(mas, &mast, height + 1);
}

/*
 * mas_wr_node_store() - Attempt to store the value in a node
 * @wr_mas: The maple write state
 * @new_end: The end of the node after the modification
 *
 * Attempts to reuse the node, but may allocate.
 *
 * The new contents are always built in a separate node.  In RCU mode that node
 * is popped from @wr_mas->mas and replaces the old one; otherwise it is an
 * on-stack scratch node that is copied back over the existing node.
 */
static inline void mas_wr_node_store(struct ma_wr_state *wr_mas,
                                     unsigned char new_end)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **dst_slots;
        unsigned long *dst_pivots;
        unsigned char dst_offset, offset_end = wr_mas->offset_end;
        struct maple_node reuse, *newnode;
        unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type];
        bool in_rcu = mt_in_rcu(mas->tree);
        unsigned char height = mas_mt_height(mas);

        if (mas->last == wr_mas->end_piv)
                offset_end++; /* don't copy this offset */

        /* set up node. */
        if (in_rcu) {
                newnode = mas_pop_node(mas);
        } else {
                memset(&reuse, 0, sizeof(struct maple_node));
                newnode = &reuse;
        }

        newnode->parent = mas_mn(mas)->parent;
        dst_pivots = ma_pivots(newnode, wr_mas->type);
        dst_slots = ma_slots(newnode, wr_mas->type);
        /* Copy from start to insert point */
        memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset);
        memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset);

        /* Handle insert of new range starting after old range */
        if (wr_mas->r_min < mas->index) {
                /* Keep the head of the old range, now ending at index - 1. */
                rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content);
                dst_pivots[mas->offset++] = mas->index - 1;
        }

        /* Store the new entry and range end. */
        if (mas->offset < node_pivots)
                dst_pivots[mas->offset] = mas->last;
        rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry);

        /*
         * this range wrote to the end of the node or it overwrote the rest of
         * the data
         */
        if (offset_end > mas->end)
                goto done;

        dst_offset = mas->offset + 1;
        /* Copy to the end of node if necessary. */
        copy_size = mas->end - offset_end + 1;
        memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end,
               sizeof(void *) * copy_size);
        /* One pivot fewer than slots is copied; the last one is set below. */
        memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end,
               sizeof(unsigned long) * (copy_size - 1));

        if (new_end < node_pivots)
                dst_pivots[new_end] = mas->max;

done:
        mas_leaf_set_meta(newnode, maple_leaf_64, new_end);
        if (in_rcu) {
                struct maple_enode *old_enode = mas->node;

                mas->node = mt_mk_node(newnode, wr_mas->type);
                mas_replace_node(mas, old_enode, height);
        } else {
                /* Overwrite the existing node with the scratch copy. */
                memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
        }
        trace_ma_write(TP_FCT, mas, 0, wr_mas->entry);
        mas_update_gap(mas);
        mas->end = new_end;
        return;
}

/*
 * mas_wr_slot_store: Attempt to store a value in a slot.
 * @wr_mas: the maple write state
 *
 * Updates the node in place: the slot count does not change, only one slot
 * pointer and one or two pivots are rewritten.  The variant that rewrites two
 * pivots warns once if the tree is in RCU mode.
 */
static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char offset = mas->offset;
        void __rcu **slots = wr_mas->slots;
        bool gap = false;

        /* Note whether either of the two affected slots is currently empty. */
        gap |= !mt_slot_locked(mas->tree, slots, offset);
        gap |= !mt_slot_locked(mas->tree, slots, offset + 1);

        if (wr_mas->offset_end - offset == 1) {
                if (mas->index == wr_mas->r_min) {
                        /* Overwriting the range and a part of the next one */
                        rcu_assign_pointer(slots[offset], wr_mas->entry);
                        wr_mas->pivots[offset] = mas->last;
                } else {
                        /* Overwriting a part of the range and the next one */
                        rcu_assign_pointer(slots[offset + 1], wr_mas->entry);
                        wr_mas->pivots[offset] = mas->index - 1;
                        mas->offset++; /* Keep mas accurate. */
                }
        } else {
                WARN_ON_ONCE(mt_in_rcu(mas->tree));
                /*
                 * Expand the range, only partially overwriting the previous and
                 * next ranges
                 */
                gap |= !mt_slot_locked(mas->tree, slots, offset + 2);
                rcu_assign_pointer(slots[offset + 1], wr_mas->entry);
                wr_mas->pivots[offset] = mas->index - 1;
                wr_mas->pivots[offset + 1] = mas->last;
                mas->offset++; /* Keep mas accurate. */
        }

        trace_ma_write(TP_FCT, mas, 0, wr_mas->entry);
        /*
         * Only update gap when the new entry is empty or there is an empty
         * entry in the original two ranges.
         */
        if (!wr_mas->entry || gap)
                mas_update_gap(mas);

        return;
}

/*
 * mas_wr_extend_null() - Widen the write range over %NULL slots touching it
 * within the node.
 * @wr_mas: The maple write state
 *
 * May raise mas->last (updating offset_end and end_piv) and lower mas->index
 * (updating mas->offset, r_min and r_max).
 * NOTE(review): the visible callers of similar helpers only do this for %NULL
 * stores; the caller of this function is outside this view - confirm.
 */
static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        if (!wr_mas->slots[wr_mas->offset_end]) {
                /* The end slot is NULL: cover it up to its pivot. */
                mas->last = wr_mas->end_piv;
        } else {
                /* Check next slot(s) if we are overwriting the end */
                if ((mas->last == wr_mas->end_piv) &&
                    (mas->end != wr_mas->offset_end) &&
                    !wr_mas->slots[wr_mas->offset_end + 1]) {
                        wr_mas->offset_end++;
                        /* The node's last slot ends at mas->max. */
                        if (wr_mas->offset_end == mas->end)
                                mas->last = mas->max;
                        else
                                mas->last = wr_mas->pivots[wr_mas->offset_end];
                        wr_mas->end_piv = mas->last;
                }
        }

        if (!wr_mas->content) {
                /* The start slot is NULL: begin at the start of its range. */
                mas->index = wr_mas->r_min;
        } else {
                /* Check prev slot if we are overwriting the start */
                if (mas->index == wr_mas->r_min && mas->offset &&
                    !wr_mas->slots[mas->offset - 1]) {
                        mas->offset--;
                        wr_mas->r_min = mas->index =
                                mas_safe_min(mas, wr_mas->pivots, mas->offset);
                        wr_mas->r_max = wr_mas->pivots[mas->offset];
                }
        }
}

static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
{
        while ((wr_mas->offset_end < wr_mas->mas->end) &&
               (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end]))
                wr_mas->offset_end++;

        if (wr_mas->offset_end < wr_mas->mas->end)
                wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end];
        else
                wr_mas->end_piv = wr_mas->mas->max;
}

/*
 * mas_wr_new_end() - Calculate the end of the node after the write.
 * @wr_mas: The maple write state
 *
 * Starts from the current end plus two (the write may add a slot on each
 * side), removes the slots the write covers (offset_end - offset), and removes
 * one more for each side on which the write lines up with an existing range
 * boundary.
 *
 * Return: The new end offset of the node.
 */
static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        int new_end = mas->end + 2;

        new_end -= wr_mas->offset_end - mas->offset;
        /* No leftover piece before the write. */
        new_end -= (wr_mas->r_min == mas->index);
        /* No leftover piece after the write. */
        new_end -= (wr_mas->end_piv == mas->last);

        return (unsigned char)new_end;
}

/*
 * mas_wr_append: Attempt to append
 * @wr_mas: the maple write state
 * @new_end: The end of the node after the modification
 *
 * This is currently unsafe in rcu mode since the end of the node may be cached
 * by readers while the node contents may be updated which could result in
 * inaccurate information.
 *
 * The node is modified in place; nothing is allocated.  The gap is only
 * recalculated when the old contents or the new entry is %NULL.
 */
static inline void mas_wr_append(struct ma_wr_state *wr_mas,
                unsigned char new_end)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **slots;
        unsigned char end = mas->end;

        /* Carry the old last pivot to the new end and record the new end. */
        if (new_end < mt_pivots[wr_mas->type]) {
                wr_mas->pivots[new_end] = wr_mas->pivots[end];
                ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end);
        }

        slots = wr_mas->slots;
        if (new_end == end + 1) {
                /* The node grows by one slot. */
                if (mas->last == wr_mas->r_max) {
                        /* Append to end of range */
                        rcu_assign_pointer(slots[new_end], wr_mas->entry);
                        wr_mas->pivots[end] = mas->index - 1;
                        mas->offset = new_end;
                } else {
                        /* Append to start of range */
                        rcu_assign_pointer(slots[new_end], wr_mas->content);
                        wr_mas->pivots[end] = mas->last;
                        rcu_assign_pointer(slots[end], wr_mas->entry);
                }
        } else {
                /* Append to the range without touching any boundaries. */
                rcu_assign_pointer(slots[new_end], wr_mas->content);
                wr_mas->pivots[end + 1] = mas->last;
                rcu_assign_pointer(slots[end + 1], wr_mas->entry);
                wr_mas->pivots[end] = mas->index - 1;
                mas->offset = end + 1;
        }

        if (!wr_mas->content || !wr_mas->entry)
                mas_update_gap(mas);

        mas->end = new_end;
        trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry);
        return;
}

/*
 * mas_wr_bnode() - Slow path for a modification.
 * @wr_mas: The write maple state
 *
 * This is where split, rebalance end up.
 */
static void mas_wr_bnode(struct ma_wr_state *wr_mas)
{
        struct maple_big_node b_node;

        trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry);
        memset(&b_node, 0, sizeof(struct maple_big_node));
        mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
        mas_commit_b_node(wr_mas, &b_node);
}

/*
 * mas_wr_store_entry() - Internal call to store a value
 * @wr_mas: The maple write state
 *
 * Dispatches to the store routine selected by mas->store_type.  An exact fit
 * is handled inline by replacing the slot pointer.  wr_invalid is a bug.
 */
static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char new_end = mas_wr_new_end(wr_mas);

        switch (mas->store_type) {
        case wr_exact_fit:
                rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry);
                /* The gap only changes when exactly one of old/new is NULL. */
                if (!wr_mas->entry != !wr_mas->content)
                        mas_update_gap(mas);
                break;
        case wr_append:
                mas_wr_append(wr_mas, new_end);
                break;
        case wr_slot_store:
                mas_wr_slot_store(wr_mas);
                break;
        case wr_node_store:
                mas_wr_node_store(wr_mas, new_end);
                break;
        case wr_spanning_store:
                mas_wr_spanning_store(wr_mas);
                break;
        case wr_split_store:
        case wr_rebalance:
                /* Both go through the big node slow path. */
                mas_wr_bnode(wr_mas);
                break;
        case wr_new_root:
                mas_new_root(mas, wr_mas->entry);
                break;
        case wr_store_root:
                mas_store_root(mas, wr_mas->entry);
                break;
        case wr_invalid:
                MT_BUG_ON(mas->tree, 1);
                break;
        }
}

/*
 * mas_wr_prealloc_setup() - Position the maple state for a write.
 * @wr_mas: The maple write state
 *
 * Decides whether the current state of the walk can be kept or has to be
 * reset, then loads @wr_mas->content through mas_start().
 */
static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        if (!mas_is_active(mas)) {
                if (mas_is_start(mas))
                        goto set_content;

                /* These states cannot be continued from. */
                if (unlikely(mas_is_paused(mas) || mas_is_none(mas) ||
                             mas_is_overflow(mas) || mas_is_underflow(mas)))
                        goto reset;
        }

        /*
         * A less strict version of mas_is_span_wr() where we allow spanning
         * writes within this node.  This is to stop partial walks in
         * mas_prealloc() from being reset.
         */
        if (mas->last > mas->max)
                goto reset;

        /* A NULL store that ends exactly at a leaf's limit is reset too. */
        if (!wr_mas->entry && mte_is_leaf(mas->node) &&
            mas->last == mas->max)
                goto reset;

        goto set_content;

reset:
        mas_reset(mas);
set_content:
        wr_mas->content = mas_start(mas);
}

/**
 * mas_prealloc_calc() - Calculate number of nodes needed for a
 * given store operation
 * @wr_mas: The maple write state
 * @entry: The entry to store into the tree
 *
 * Nothing is returned; the number of nodes required for preallocation is
 * written to the node_request field of the maple state.
 */
static inline void mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char height = mas_mt_height(mas);
        /* Levels between the vacant height and the tree height. */
        unsigned char delta = height - wr_mas->vacant_height;
        /* Default request; kept as-is for wr_invalid. */
        int request = height * 3 + 1;

        switch (mas->store_type) {
        case wr_exact_fit:
        case wr_append:
        case wr_slot_store:
                request = 0;
                break;
        case wr_spanning_store:
                request = (wr_mas->sufficient_height < wr_mas->vacant_height) ?
                          (height - wr_mas->sufficient_height) * 3 + 1 :
                          delta * 3 + 1;
                break;
        case wr_split_store:
                request = delta * 2 + 1;
                break;
        case wr_rebalance:
                request = (wr_mas->sufficient_height < wr_mas->vacant_height) ?
                          (height - wr_mas->sufficient_height) * 2 + 1 :
                          delta * 2 + 1;
                break;
        case wr_node_store:
                /* In-place unless the tree is in RCU mode. */
                if (mt_in_rcu(mas->tree))
                        request = 1;
                else
                        request = 0;
                break;
        case wr_new_root:
                request = 1;
                break;
        case wr_store_root:
                /*
                 * A node is requested unless the store is exactly [0, 0] and
                 * the low two bits of @entry are not 2.
                 * NOTE(review): the bit test presumably detects entries that
                 * cannot be kept directly as the root pointer - confirm.
                 */
                if (likely(mas->last != 0 || mas->index != 0) ||
                    ((unsigned long)entry & 3) == 2)
                        request = 1;
                else
                        request = 0;
                break;
        case wr_invalid:
                WARN_ON_ONCE(1);
        }

        mas->node_request = request;
}

/*
 * mas_wr_store_type() - Determine the store type for a given
 * store operation.
 * @wr_mas: The maple write state
 *
 * Walks to the location of the write and classifies the store by how the new
 * range fits into what was found there.
 *
 * Return: the type of store needed for the operation
 */
static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char end;

        if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
                return wr_store_root;

        /* A failed write walk is treated as a spanning store. */
        if (unlikely(!mas_wr_walk(wr_mas)))
                return wr_spanning_store;

        /* At this point, we are at the leaf node that needs to be altered. */
        mas_wr_end_piv(wr_mas);
        if (!wr_mas->entry)
                mas_wr_extend_null(wr_mas);

        if (wr_mas->r_min == mas->index && wr_mas->r_max == mas->last)
                return wr_exact_fit;

        /* The write covers the whole index space. */
        if (unlikely(mas->index == 0 && mas->last == ULONG_MAX))
                return wr_new_root;

        end = mas_wr_new_end(wr_mas);

        /* Potential spanning rebalance collapsing a node */
        if (end < mt_min_slots[wr_mas->type])
                return mte_is_root(mas->node) ? wr_node_store : wr_rebalance;

        if (end >= mt_slots[wr_mas->type])
                return wr_split_store;

        if (!mt_in_rcu(mas->tree)) {
                /* Outside of RCU mode the node may be modified in place. */
                if (mas->offset == mas->end)
                        return wr_append;

                if (end == mas->end)
                        return wr_slot_store;

                return wr_node_store;
        }

        /* RCU mode: a slot store is only used for a single-slot overwrite. */
        if (end == mas->end && wr_mas->offset_end - mas->offset == 1)
                return wr_slot_store;

        return wr_node_store;
}

/**
 * mas_wr_preallocate() - Preallocate enough nodes for a store operation
 * @wr_mas: The maple write state
 * @entry: The entry that will be stored
 *
 * Sets up the write state, records the store type in the maple state and
 * requests the calculated number of nodes with GFP_NOWAIT.
 */
static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry)
{
        struct ma_state *mas = wr_mas->mas;

        mas_wr_prealloc_setup(wr_mas);
        mas->store_type = mas_wr_store_type(wr_mas);
        mas_prealloc_calc(wr_mas, entry);

        /* Skip the allocator entirely when no nodes were requested. */
        if (mas->node_request)
                mas_alloc_nodes(mas, GFP_NOWAIT);
}

/**
 * mas_insert() - Internal call to insert a value
 * @mas: The maple state
 * @entry: The entry to store
 *
 * Unlike a plain store, an insert refuses to overwrite: if anything is
 * already present in the requested range the maple state is set to -EEXIST.
 *
 * Return: %NULL or the contents that already exists at the requested index
 * otherwise.  The maple state needs to be checked for error conditions.
 */
static inline void *mas_insert(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        /*
         * Inserting a new range adds 0, 1 or 2 pivots to the tree:
         *  - 0 when the range fits exactly into an existing NULL gap; only
         *    the slot needs to be written.
         *  - 1 when the range is adjacent to another range.
         *  - 2 when the range sits inside a gap without touching another
         *    range: the start - 1, and the end.
         * The entry is written in every case.  Most operations require a new
         * node to be allocated and replace an existing node to ensure RCU
         * safety, when in RCU mode.  The exception is inserting at the end of
         * a node (appending), which can reuse the node in place when done
         * carefully.
         */
        wr_mas.content = mas_start(mas);
        if (wr_mas.content)
                goto exists;

        mas_wr_preallocate(&wr_mas, entry);
        if (mas_is_err(mas))
                return NULL;

        switch (mas->store_type) {
        case wr_spanning_store:
                /* spanning writes always overwrite something */
                goto exists;
        case wr_new_root:
        case wr_store_root:
                break;
        default:
                /* At this point, we are at the leaf node that needs to be altered. */
                wr_mas.offset_end = mas->offset;
                wr_mas.end_piv = wr_mas.r_max;

                if (wr_mas.content || (mas->last > wr_mas.r_max))
                        goto exists;
                break;
        }

        mas_wr_store_entry(&wr_mas);
        return wr_mas.content;

exists:
        mas_set_err(mas, -EEXIST);
        return wr_mas.content;
}

/**
 * mas_alloc_cyclic() - Internal call to find somewhere to store an entry
 * @mas: The maple state.
 * @startp: Pointer to ID.
 * @entry: The entry to store.
 * @range_lo: Lower bound of range to search.
 * @range_hi: Upper bound of range to search.
 * @next: Pointer to next ID to allocate.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * The search starts at the larger of @range_lo and *@next.  If that fails and
 * the start was above @range_lo, the search is repeated from @range_lo.  On
 * success *@startp receives the allocated index and *@next is set to the index
 * following it.
 *
 * Return: 0 if the allocation succeeded without wrapping, 1 if the
 * allocation succeeded after wrapping, or -EBUSY if there are no
 * free entries.  A failure of the search or of the insert itself is
 * returned as a negative error code.
 */
int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp)
{
        unsigned long lowest = range_lo;
        int ret;

        /* First attempt: begin at the cyclic cursor if it is in range. */
        range_lo = max(lowest, *next);
        ret = mas_empty_area(mas, range_lo, range_hi, 1);
        if (ret == 0 && (mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED)) {
                /* The cursor wrapped on an earlier call; report it once. */
                mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED;
                ret = 1;
        }

        /* Second attempt: wrap around to the bottom of the range. */
        if (ret < 0 && range_lo > lowest) {
                mas_reset(mas);
                ret = mas_empty_area(mas, lowest, range_hi, 1);
                if (!ret)
                        ret = 1;
        }

        if (ret < 0)
                return ret;

        /* Retry the insert for as long as mas_nomem() asks for it. */
        do {
                mas_insert(mas, entry);
        } while (mas_nomem(mas, gfp));

        if (mas_is_err(mas))
                return xa_err(mas->node);

        *startp = mas->index;
        *next = *startp + 1;
        /* The cursor overflowed to 0: remember that for the next call. */
        if (*next == 0)
                mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED;

        mas_destroy(mas);
        return ret;
}
EXPORT_SYMBOL(mas_alloc_cyclic);

/*
 * mas_rewalk() - Walk the tree again to @index.
 * @mas: The maple state
 * @index: The index to walk to
 *
 * The walk is repeated for as long as it leaves the state at ma_start.
 */
static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index)
{
        do {
                mas_set(mas, index);
                mas_state_walk(mas);
        } while (mas_is_start(mas));
}

/*
 * mas_rewalk_if_dead() - Re-walk to @index when @node is dead.
 * @mas: The maple state
 * @node: The node to check
 * @index: The index to walk to if @node is dead
 *
 * Return: true if @node was dead and the state was re-walked, false otherwise.
 */
static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas,
                struct maple_node *node, const unsigned long index)
{
        if (likely(!ma_dead_node(node)))
                return false;

        mas_rewalk(mas, index);
        return true;
}

/*
 * mas_prev_node() - Move the maple state to the previous node at the same
 * level in the tree.
 * @mas: The maple state
 * @min: The lower limit to search
 *
 * Ascends until the state is no longer in slot 0 of a node, steps one slot to
 * the left and then descends again, taking the data end offset of every node
 * on the way down.  On success mas->node, mas->min, mas->max, mas->offset and
 * mas->end describe the node that was reached, with mas->offset and mas->end
 * both set to its data end.  If nothing precedes the current node within
 * @min, or the root is reached while ascending, the status is set to
 * ma_underflow instead.
 *
 * Return: 1 if the node is dead, 0 otherwise.
 */
static int mas_prev_node(struct ma_state *mas, unsigned long min)
{
        enum maple_type mt;
        int offset, level;
        void __rcu **slots;
        struct maple_node *node;
        unsigned long *pivots;
        unsigned long max;

        node = mas_mn(mas);
        /* A node starting at 0 has no predecessor. */
        if (!mas->min)
                goto no_entry;

        /* The previous node ends right below the current one. */
        max = mas->min - 1;
        if (max < min)
                goto no_entry;

        level = 0;
        do {
                if (ma_is_root(node))
                        goto no_entry;

                /* Walk up. */
                if (unlikely(mas_ascend(mas)))
                        return 1;
                offset = mas->offset;
                level++;
                node = mas_mn(mas);
        } while (!offset);

        /* Step to the sibling on the left. */
        offset--;
        mt = mte_node_type(mas->node);
        /* Walk down, always following the data end of the node entered. */
        while (level > 1) {
                level--;
                slots = ma_slots(node, mt);
                mas->node = mas_slot(mas, slots, offset);
                /* The slot is only trusted if its node is still alive. */
                if (unlikely(ma_dead_node(node)))
                        return 1;

                mt = mte_node_type(mas->node);
                node = mas_mn(mas);
                pivots = ma_pivots(node, mt);
                offset = ma_data_end(node, mt, pivots, max);
                if (unlikely(ma_dead_node(node)))
                        return 1;
        }

        /* Final step down into the node at the starting level. */
        slots = ma_slots(node, mt);
        mas->node = mas_slot(mas, slots, offset);
        pivots = ma_pivots(node, mt);
        if (unlikely(ma_dead_node(node)))
                return 1;

        /* The new minimum follows the pivot to the left in the parent. */
        if (likely(offset))
                mas->min = pivots[offset - 1] + 1;
        mas->max = max;
        mas->offset = mas_data_end(mas);
        if (unlikely(mte_dead_node(mas->node)))
                return 1;

        mas->end = mas->offset;
        return 0;

no_entry:
        /* Only report underflow if what was read came from a live node. */
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->status = ma_underflow;
        return 0;
}

/*
 * mas_prev_slot() - Get the entry in the previous slot
 *
 * @mas: The maple state
 * @min: The minimum starting range
 * @empty: Can be empty
 *
 * Steps the maple state back by one slot, moving to the previous node when
 * the state is in slot 0.  mas->index and mas->last are updated to the range
 * of the slot that was reached.  When @empty is false, NULL slots are skipped
 * until an entry is found or @min is reached.  Whenever a dead node is
 * detected the state is re-walked to the index it had on entry and the step
 * is retried.  Hitting the lower limit sets the status to ma_underflow.
 *
 * Return: The entry in the previous slot which is possibly NULL
 */
static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty)
{
        void *entry;
        void __rcu **slots;
        unsigned long pivot;
        enum maple_type type;
        unsigned long *pivots;
        struct maple_node *node;
        /* Index to re-walk to if a dead node is encountered. */
        unsigned long save_point = mas->index;

retry:
        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        /* This node reaches down to the limit: is the slot already there? */
        if (mas->min <= min) {
                pivot = mas_safe_min(mas, pivots, mas->offset);

                if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                        goto retry;

                if (pivot <= min)
                        goto underflow;
        }

again:
        if (likely(mas->offset)) {
                /* Previous slot within the same node. */
                mas->offset--;
                mas->last = mas->index - 1;
                mas->index = mas_safe_min(mas, pivots, mas->offset);
        } else  {
                if (mas->index <= min)
                        goto underflow;

                /* Slot 0: continue in the previous node. */
                if (mas_prev_node(mas, min)) {
                        mas_rewalk(mas, save_point);
                        goto retry;
                }

                if (WARN_ON_ONCE(mas_is_underflow(mas)))
                        return NULL;

                /* mas_prev_node() left mas->offset at the node's data end. */
                mas->last = mas->max;
                node = mas_mn(mas);
                type = mte_node_type(mas->node);
                pivots = ma_pivots(node, type);
                mas->index = pivots[mas->offset - 1] + 1;
        }

        slots = ma_slots(node, type);
        entry = mas_slot(mas, slots, mas->offset);
        /* The entry is only valid if the node did not die while reading. */
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        if (likely(entry))
                return entry;

        /* NULL slot: keep going back unless the caller accepts empty. */
        if (!empty) {
                if (mas->index <= min)
                        goto underflow;

                goto again;
        }

        return entry;

underflow:
        mas->status = ma_underflow;
        return NULL;
}

/*
 * mas_next_node() - Get the next node at the same level in the tree.
 * @mas: The maple state
 * @node: The maple node
 * @max: The maximum pivot value to check.
 *
 * Ascends until the state is not at the data end of a node, steps one slot to
 * the right and then descends again through slot 0 of every node on the way
 * down.  On success mas->node, mas->min, mas->max and mas->end describe the
 * node that was reached.  If the current node already extends to @max, or the
 * root is reached while ascending, the status is set to ma_overflow instead.
 *
 * The next value will be mas->node[mas->offset] or the status will have
 * overflowed.
 * Return: 1 on dead node, 0 otherwise.
 */
static int mas_next_node(struct ma_state *mas, struct maple_node *node,
                unsigned long max)
{
        unsigned long min;
        unsigned long *pivots;
        struct maple_enode *enode;
        struct maple_node *tmp;
        int level = 0;
        unsigned char node_end;
        enum maple_type mt;
        void __rcu **slots;

        /* Nothing to find beyond a node that already covers @max. */
        if (mas->max >= max)
                goto overflow;

        /* The next node starts right above the current one. */
        min = mas->max + 1;
        level = 0;
        do {
                if (ma_is_root(node))
                        goto overflow;

                /* Walk up. */
                if (unlikely(mas_ascend(mas)))
                        return 1;

                level++;
                node = mas_mn(mas);
                mt = mte_node_type(mas->node);
                pivots = ma_pivots(node, mt);
                node_end = ma_data_end(node, mt, pivots, mas->max);
                if (unlikely(ma_dead_node(node)))
                        return 1;

        } while (unlikely(mas->offset == node_end));

        /* Step to the sibling on the right. */
        slots = ma_slots(node, mt);
        mas->offset++;
        enode = mas_slot(mas, slots, mas->offset);
        if (unlikely(ma_dead_node(node)))
                return 1;

        /* When descending further, the final position is slot 0. */
        if (level > 1)
                mas->offset = 0;

        /* Walk down through the first slot of each node. */
        while (unlikely(level > 1)) {
                level--;
                mas->node = enode;
                node = mas_mn(mas);
                mt = mte_node_type(mas->node);
                slots = ma_slots(node, mt);
                enode = mas_slot(mas, slots, 0);
                if (unlikely(ma_dead_node(node)))
                        return 1;
        }

        /* Offset 0 means we descended; use the pivots of the node reached. */
        if (!mas->offset)
                pivots = ma_pivots(node, mt);

        mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt);
        /* Read the data end of the destination node before committing. */
        tmp = mte_to_node(enode);
        mt = mte_node_type(enode);
        pivots = ma_pivots(tmp, mt);
        mas->end = ma_data_end(tmp, mt, pivots, mas->max);
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->node = enode;
        mas->min = min;
        return 0;

overflow:
        /* Only report overflow if what was read came from a live node. */
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->status = ma_overflow;
        return 0;
}

/*
 * mas_next_slot() - Get the entry in the next slot
 *
 * @mas: The maple state
 * @max: The maximum starting range
 * @empty: Can be empty
 *
 * Advances the maple state by one slot, moving to the next node when the
 * state is at the end of the current one.  mas->index and mas->last are
 * updated to the range of the slot that was reached.  When @empty is false,
 * NULL slots are skipped until an entry is found or @max is reached.
 * Whenever a dead node is detected the state is re-walked to the value
 * mas->last had on entry and the step is retried.  Hitting the upper limit
 * sets the status to ma_overflow.
 *
 * Return: The entry in the next slot which is possibly NULL
 */
static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty)
{
        void __rcu **slots;
        unsigned long *pivots;
        unsigned long pivot;
        enum maple_type type;
        struct maple_node *node;
        /* Index to re-walk to if a dead node is encountered. */
        unsigned long save_point = mas->last;
        void *entry;

retry:
        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        /* This node reaches up to the limit: is the slot already there? */
        if (mas->max >= max) {
                if (likely(mas->offset < mas->end))
                        pivot = pivots[mas->offset];
                else
                        pivot = mas->max;

                if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                        goto retry;

                if (pivot >= max) { /* Was at the limit, next will extend beyond */
                        mas->status = ma_overflow;
                        return NULL;
                }
        }

        if (likely(mas->offset < mas->end)) {
                /* Next slot within the same node. */
                mas->index = pivots[mas->offset] + 1;
again:
                mas->offset++;
                /* The last slot ends at the node's maximum, not at a pivot. */
                if (likely(mas->offset < mas->end))
                        mas->last = pivots[mas->offset];
                else
                        mas->last = mas->max;
        } else  {
                if (mas->last >= max) {
                        mas->status = ma_overflow;
                        return NULL;
                }

                /* End of the node: continue in the next node. */
                if (mas_next_node(mas, node, max)) {
                        mas_rewalk(mas, save_point);
                        goto retry;
                }

                if (WARN_ON_ONCE(mas_is_overflow(mas)))
                        return NULL;

                /* Start at the first slot of the new node. */
                mas->offset = 0;
                mas->index = mas->min;
                node = mas_mn(mas);
                type = mte_node_type(mas->node);
                pivots = ma_pivots(node, type);
                mas->last = pivots[0];
        }

        slots = ma_slots(node, type);
        entry = mt_slot(mas->tree, slots, mas->offset);
        /* The entry is only valid if the node did not die while reading. */
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        if (entry)
                return entry;


        /* NULL slot: keep advancing unless the caller accepts empty. */
        if (!empty) {
                if (mas->last >= max) {
                        mas->status = ma_overflow;
                        return NULL;
                }

                mas->index = mas->last + 1;
                goto again;
        }

        return entry;
}

/*
 * mas_rev_awalk() - Internal function.  Reverse allocation walk.  Find the
 * highest gap address of a given size in a given node and descend.
 * @mas: The maple state
 * @size: The needed size.
 * @gap_min: Set to the start of the gap when one is found in a leaf.
 * @gap_max: Set to the end of the gap when one is found in a leaf.
 *
 * The search window is mas->index to mas->last.  Slots are examined from
 * mas->offset downwards.
 *
 * Return: True if found in a leaf, if the node is dense, or if the maple
 * state already holds an error.  False if the walk descended into a child,
 * if the caller has to ascend, or if no space was found (in which case the
 * maple state is set to -EBUSY).
 *
 */
static bool mas_rev_awalk(struct ma_state *mas, unsigned long size,
                unsigned long *gap_min, unsigned long *gap_max)
{
        enum maple_type type = mte_node_type(mas->node);
        struct maple_node *node = mas_mn(mas);
        unsigned long *pivots, *gaps;
        void __rcu **slots;
        unsigned long gap = 0;
        unsigned long max, min;
        unsigned char offset;

        if (unlikely(mas_is_err(mas)))
                return true;

        if (ma_is_dense(type)) {
                /* dense nodes. */
                mas->offset = (unsigned char)(mas->index - mas->min);
                return true;
        }

        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        gaps = ma_gaps(node, type);
        offset = mas->offset;
        min = mas_safe_min(mas, pivots, offset);
        /* Skip out of bounds. */
        while (mas->last < min)
                min = mas_safe_min(mas, pivots, --offset);

        max = mas_safe_pivot(mas, pivots, offset, type);
        /* Scan towards lower offsets while the slot overlaps the window. */
        while (mas->index <= max) {
                gap = 0;
                /* Use the recorded gap if there is one, else the empty slot. */
                if (gaps)
                        gap = gaps[offset];
                else if (!mas_slot(mas, slots, offset))
                        gap = max - min + 1;

                if (gap) {
                        /* Large enough, and it fits below the upper limit. */
                        if ((size <= gap) && (size <= mas->last - min + 1))
                                break;

                        if (!gaps) {
                                /* Skip the next slot, it cannot be a gap. */
                                if (offset < 2)
                                        goto ascend;

                                offset -= 2;
                                max = pivots[offset];
                                min = mas_safe_min(mas, pivots, offset);
                                continue;
                        }
                }

                if (!offset)
                        goto ascend;

                offset--;
                max = min - 1;
                min = mas_safe_min(mas, pivots, offset);
        }

        /* The slot must still leave room for @size above mas->index. */
        if (unlikely((mas->index > max) || (size - 1 > max - mas->index)))
                goto no_space;

        if (unlikely(ma_is_leaf(type))) {
                mas->offset = offset;
                *gap_min = min;
                *gap_max = min + gap - 1;
                return true;
        }

        /* descend, only happens under lock. */
        mas->node = mas_slot(mas, slots, offset);
        mas->min = min;
        mas->max = max;
        mas->offset = mas_data_end(mas);
        return false;

ascend:
        /* Running out of slots is only final at the root. */
        if (!mte_is_root(mas->node))
                return false;

no_space:
        mas_set_err(mas, -EBUSY);
        return false;
}

/*
 * mas_anode_descend() - Internal function.  One step of the forward
 * allocation walk.
 * @mas: The maple state
 * @size: The size of the gap required
 *
 * Scans the slots of the current node from mas->offset upwards for a gap of
 * at least @size within mas->index to mas->last.  In a non-leaf node the walk
 * descends into the first child with a large enough gap and mas->offset is
 * set to 0.  If no slot qualifies, mas->offset is left past the data end.
 *
 * Return: true if the gap was found in a leaf, if the node is dense, or if
 * the window was exhausted (the maple state is then set to -EBUSY).  false
 * if the walk descended or ran out of slots in this node.
 */
static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
{
        enum maple_type type = mte_node_type(mas->node);
        unsigned long pivot, min, gap = 0;
        unsigned char offset, data_end;
        unsigned long *gaps, *pivots;
        void __rcu **slots;
        struct maple_node *node;
        bool found = false;

        if (ma_is_dense(type)) {
                mas->offset = (unsigned char)(mas->index - mas->min);
                return true;
        }

        node = mas_mn(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        gaps = ma_gaps(node, type);
        offset = mas->offset;
        min = mas_safe_min(mas, pivots, offset);
        data_end = ma_data_end(node, type, pivots, mas->max);
        for (; offset <= data_end; offset++) {
                pivot = mas_safe_pivot(mas, pivots, offset, type);

                /* Not within lower bounds */
                if (mas->index > pivot)
                        goto next_slot;

                /*
                 * Use the recorded gap if there is one; otherwise an empty
                 * slot counts for the part of it inside the search window.
                 */
                if (gaps)
                        gap = gaps[offset];
                else if (!mas_slot(mas, slots, offset))
                        gap = min(pivot, mas->last) - max(mas->index, min) + 1;
                else
                        goto next_slot;

                if (gap >= size) {
                        if (ma_is_leaf(type)) {
                                found = true;
                                break;
                        }

                        /* Descend and restart the scan at the first slot. */
                        mas->node = mas_slot(mas, slots, offset);
                        mas->min = min;
                        mas->max = pivot;
                        offset = 0;
                        break;
                }
next_slot:
                min = pivot + 1;
                /* This slot already reached the end of the window. */
                if (mas->last <= pivot) {
                        mas_set_err(mas, -EBUSY);
                        return true;
                }
        }

        mas->offset = offset;
        return found;
}

/**
 * mas_walk() - Search for @mas->index in the tree.
 * @mas: The maple state.
 *
 * mas->index and mas->last will be set to the range if there is a value.  A
 * state that is neither active nor ma_start is reset to ma_start before the
 * walk.  If the walk ends in ma_none the range is set to 0 - ULONG_MAX.  If
 * the tree holds a single root pointer, index 0 yields that entry with the
 * range 0 - 0; any other index yields %NULL, the range 1 - ULONG_MAX and the
 * status ma_none.
 *
 * Return: the entry at the location or %NULL.
 */
void *mas_walk(struct ma_state *mas)
{
        void *entry;

        if (!mas_is_active(mas) && !mas_is_start(mas))
                mas->status = ma_start;

        /* Repeat the walk for as long as it leaves the state at ma_start. */
        do {
                entry = mas_state_walk(mas);
        } while (mas_is_start(mas));

        if (mas_is_none(mas)) {
                mas->index = 0;
                mas->last = ULONG_MAX;
                return entry;
        }

        if (!mas_is_ptr(mas))
                return entry;

        /* Root pointer: only index 0 holds the entry. */
        if (!mas->index) {
                mas->last = 0;
                return entry;
        }

        mas->index = 1;
        mas->last = ULONG_MAX;
        mas->status = ma_none;
        return NULL;
}
EXPORT_SYMBOL_GPL(mas_walk);

/*
 * mas_rewind_node() - Internal function.  Step back to the previous slot,
 * ascending as needed.
 * @mas: The maple state.
 *
 * Ascends while the state is in slot 0 of a non-root node, then moves one
 * slot to the left.
 *
 * Return: true if the state was moved, false if it is in slot 0 of the root.
 */
static inline bool mas_rewind_node(struct ma_state *mas)
{
        do {
                if (!mte_is_root(mas->node))
                        mas_ascend(mas);
                else if (!mas->offset)
                        return false;
        } while (!mas->offset);

        mas->offset--;
        return true;
}

/*
 * mas_skip_node() - Internal function.  Skip over a node.
 * @mas: The maple state.
 *
 * Ascends while the state is at or past the data end of the node it is in,
 * then moves one slot to the right.  Running out of slots in the root sets
 * the maple state to -EBUSY.
 *
 * Return: true if there is another node, false otherwise.
 */
static inline bool mas_skip_node(struct ma_state *mas)
{
        if (mas_is_err(mas))
                return false;

        do {
                if (!mte_is_root(mas->node)) {
                        mas_ascend(mas);
                } else if (mas->offset >= mas_data_end(mas)) {
                        /* Nowhere left to go from the end of the root. */
                        mas_set_err(mas, -EBUSY);
                        return false;
                }
        } while (mas->offset >= mas_data_end(mas));

        mas->offset++;
        return true;
}

/*
 * mas_awalk() - Allocation walk.  Search from low address to high, for a gap of
 * @size
 * @mas: The maple state
 * @size: The size of the gap required
 *
 * Search between @mas->index and @mas->last for a gap of @size.
 */
static inline void mas_awalk(struct ma_state *mas, unsigned long size)
{
        struct maple_enode *prev = NULL;

        /*
         * Each step does one of four things:
         *  - descends into a child,
         *  - goes back to the parent (ascend),
         *  - finds no gap (error == -EBUSY, walk ends),
         *  - finds the gap (walk ends).
         */
        for (;;) {
                if (mas_is_err(mas))
                        return;

                if (mas_anode_descend(mas, size))
                        return;

                if (prev != mas->node) {
                        prev = mas->node;
                        continue;
                }

                /* Still on the same node as last time: skip over it. */
                mas_skip_node(mas);
        }
}

/*
 * mas_sparse_area() - Internal function.  Return upper or lower limit when
 * searching for a gap in an empty tree.
 * @mas: The maple state
 * @min: the minimum range
 * @max: The maximum range
 * @size: The size of the gap
 * @fwd: Searching forward or back
 *
 * Return: 0 with mas->index and mas->last set to the gap, or -EBUSY when the
 * range no longer fits @size after index 0 has been excluded.
 */
static inline int mas_sparse_area(struct ma_state *mas, unsigned long min,
                                unsigned long max, unsigned long size, bool fwd)
{
        /*
         * mas_is_ptr: the callers in this file only get here for ma_none or
         * a root pointer, so "not none" is the pointer case and index 0 is
         * excluded from the search.
         */
        if (min == 0 && !unlikely(mas_is_none(mas))) {
                min = 1;
                /*
                 * At this time, min is increased, we need to recheck whether
                 * the size is satisfied.
                 */
                if (min > max || max - min + 1 < size)
                        return -EBUSY;
        }

        if (!fwd) {
                /* Reverse search: take the top of the range. */
                mas->last = max;
                mas->index = max - size + 1;
                return 0;
        }

        /* Forward search: take the bottom of the range. */
        mas->index = min;
        mas->last = min + size - 1;
        return 0;
}

/*
 * mas_empty_area() - Get the lowest address within the range that is
 * sufficient for the size requested.
 * @mas: The maple state
 * @min: The lowest value of the range
 * @max: The highest value of the range
 * @size: The size needed
 */
int mas_empty_area(struct ma_state *mas, unsigned long min,
                unsigned long max, unsigned long size)
{
        unsigned char offset;
        unsigned long *pivots;
        enum maple_type mt;
        struct maple_node *node;

        if (min > max)
                return -EINVAL;

        if (size == 0 || max - min < size - 1)
                return -EINVAL;

        if (mas_is_start(mas))
                mas_start(mas);
        else if (mas->offset >= 2)
                mas->offset -= 2;
        else if (!mas_skip_node(mas))
                return -EBUSY;

        /* Empty set */
        if (mas_is_none(mas) || mas_is_ptr(mas))
                return mas_sparse_area(mas, min, max, size, true);

        /* The start of the window can only be within these values */
        mas->index = min;
        mas->last = max;
        mas_awalk(mas, size);

        if (unlikely(mas_is_err(mas)))
                return xa_err(mas->node);

        offset = mas->offset;
        node = mas_mn(mas);
        mt = mte_node_type(mas->node);
        pivots = ma_pivots(node, mt);
        min = mas_safe_min(mas, pivots, offset);
        if (mas->index < min)
                mas->index = min;
        mas->last = mas->index + size - 1;
        mas->end = ma_data_end(node, mt, pivots, mas->max);
        return 0;
}
EXPORT_SYMBOL_GPL(mas_empty_area);

/*
 * mas_empty_area_rev() - Get the highest address within the range that is
 * sufficient for the size requested.
 * @mas: The maple state
 * @min: The lowest value of the range
 * @max: The highest value of the range
 * @size: The size needed
 *
 * Return: 0 with mas->index and mas->last set to the gap on success, -EINVAL
 * if the range is invalid or cannot hold @size, -EBUSY or the error stored
 * in the maple state if no gap was found.
 */
int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
                unsigned long max, unsigned long size)
{
        struct maple_enode *last = mas->node;

        if (min > max)
                return -EINVAL;

        if (size == 0 || max - min < size - 1)
                return -EINVAL;

        /* A continued search with fewer than two slots left must rewind. */
        if (mas_is_start(mas))
                mas_start(mas);
        else if ((mas->offset < 2) && (!mas_rewind_node(mas)))
                return -EBUSY;

        if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
                return mas_sparse_area(mas, min, max, size, false);
        else if (mas->offset >= 2)
                mas->offset -= 2;
        else
                mas->offset = mas_data_end(mas);


        /* The start of the window can only be within these values. */
        mas->index = min;
        mas->last = max;

        /*
         * From here on min and max receive the bounds of the gap found by
         * mas_rev_awalk().  Rewind whenever a step did not change the node.
         */
        while (!mas_rev_awalk(mas, size, &min, &max)) {
                if (last == mas->node) {
                        if (!mas_rewind_node(mas))
                                return -EBUSY;
                } else {
                        last = mas->node;
                }
        }

        /* mas_rev_awalk() also returns true when an error is already set. */
        if (mas_is_err(mas))
                return xa_err(mas->node);

        if (unlikely(mas->offset == MAPLE_NODE_SLOTS))
                return -EBUSY;

        /* Trim the upper limit to the max. */
        if (max < mas->last)
                mas->last = max;

        /* The allocation is placed at the top of the usable gap. */
        mas->index = mas->last - size + 1;
        mas->end = mas_data_end(mas);
        return 0;
}
EXPORT_SYMBOL_GPL(mas_empty_area_rev);

/*
 * mte_dead_leaves() - Mark all leaves of a node as dead.
 * @enode: the encoded node
 * @mt: the maple tree
 * @slots: Pointer to the slot array
 *
 * Every child visited is marked dead, has its type stored in the node itself
 * and has its slot replaced by the plain node pointer.  The scan stops at the
 * first slot that does not decode to both a node and a type.
 *
 * Must hold the write lock.
 *
 * Return: The number of leaves marked as dead.
 */
static inline
unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt,
                              void __rcu **slots)
{
        int i = 0;

        while (i < mt_slot_count(enode)) {
                void *child = mt_slot(mt, slots, i);
                enum maple_type child_type = mte_node_type(child);
                struct maple_node *child_node = mte_to_node(child);

                /* Use both node and type to catch LE & BE metadata */
                if (!child_node || !child_type)
                        break;

                mte_set_node_dead(child);
                child_node->type = child_type;
                rcu_assign_pointer(slots[i], child_node);
                i++;
        }

        return i;
}

/**
 * mte_dead_walk() - Walk down a dead tree to just before the leaves
 * @enode: In/out: the maple encoded node to start from.  On return it refers
 *         to the node whose slot array is returned.
 * @offset: The starting offset.  It is only used on the first node; slot 0 is
 *          followed at every level below it.
 *
 * Note: This can only be used from the RCU callback context.
 *
 * Return: The slot array of the last node visited, which is the first node on
 * the path whose followed child is a leaf.
 */
static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset)
{
        struct maple_node *node, *next;
        void __rcu **slots = NULL;

        next = mte_to_node(*enode);
        do {
                /* Step down: the child picked on the previous pass becomes current. */
                *enode = ma_enode_ptr(next);
                node = mte_to_node(*enode);
                /* The type is read from the node itself, not from the encoded pointer. */
                slots = ma_slots(node, node->type);
                next = rcu_dereference_protected(slots[offset],
                                        lock_is_held(&rcu_callback_map));
                offset = 0;
        } while (!ma_is_leaf(next->type));

        return slots;
}

/**
 * mt_free_walk() - Walk & free a tree in the RCU callback context
 * @head: The RCU head that's within the node.
 *
 * Frees the node that embeds @head.  When that node is not a leaf, the nodes
 * below it are freed first, using the ->type, ->piv_parent, ->parent_slot and
 * ->slot_len values stored in each node to move around the tree.
 *
 * Note: This can only be used from the RCU callback context.
 */
static void mt_free_walk(struct rcu_head *head)
{
        void __rcu **slots;
        struct maple_node *node, *start;
        struct maple_enode *enode;
        unsigned char offset;
        enum maple_type type;

        /* Recover the node that embeds @head. */
        node = container_of(head, struct maple_node, rcu);

        /* A leaf has nothing below it: just free the node. */
        if (ma_is_leaf(node->type))
                goto free_leaf;

        start = node;
        enode = mt_mk_node(node, node->type);
        /* Follow slot 0 down to the node directly above the leaves. */
        slots = mte_dead_walk(&enode, 0);
        node = mte_to_node(enode);
        do {
                /* Free the children recorded for this node in one call. */
                mt_free_bulk(node->slot_len, slots);
                /* Move to the recorded parent and look at the next sibling slot. */
                offset = node->parent_slot + 1;
                enode = node->piv_parent;
                /* A node recorded as its own parent ends the walk. */
                if (mte_to_node(enode) == node)
                        goto free_leaf;

                type = mte_node_type(enode);
                slots = ma_slots(mte_to_node(enode), type);
                /* Descend into the next sibling if that slot exists and is populated. */
                if ((offset < mt_slots[type]) &&
                    rcu_dereference_protected(slots[offset],
                                              lock_is_held(&rcu_callback_map)))
                        slots = mte_dead_walk(&enode, offset);
                node = mte_to_node(enode);
        } while ((node != start) || (node->slot_len < offset));

        /* Back at the start node: free its children, then the node itself. */
        slots = ma_slots(node, node->type);
        mt_free_bulk(node->slot_len, slots);

free_leaf:
        kfree(node);
}

/*
 * mte_destroy_descend() - Walk down from a node, marking each node dead.
 * @enode: In/out: the encoded node to start from.  On return it refers to the
 *         last node visited.
 * @mt: The maple tree
 * @prev: The encoded node recorded as the parent of the first node visited
 * @offset: The slot recorded as the first node's position in @prev
 *
 * At every level the node is marked dead and its type, parent and parent slot
 * are stored in the node.  The descent follows slot 0, or slot 1 when the
 * child in slot 0 is already dead, and stops once the followed child is a
 * leaf.
 *
 * Return: The slot array of the last node visited.
 */
static inline void __rcu **mte_destroy_descend(struct maple_enode **enode,
        struct maple_tree *mt, struct maple_enode *prev, unsigned char offset)
{
        struct maple_node *node;
        struct maple_enode *next = *enode;
        void __rcu **slots = NULL;
        enum maple_type type;
        unsigned char next_offset = 0;

        do {
                *enode = next;
                node = mte_to_node(*enode);
                /* Read the type before the node is marked dead below. */
                type = mte_node_type(*enode);
                slots = ma_slots(node, type);
                next = mt_slot_locked(mt, slots, next_offset);
                /* Skip over a first child that is already dead. */
                if ((mte_dead_node(next)))
                        next = mt_slot_locked(mt, slots, ++next_offset);

                mte_set_node_dead(*enode);
                /* Record what is needed to walk back up through this node later. */
                node->type = type;
                node->piv_parent = prev;
                node->parent_slot = offset;
                /* The child just chosen sits at next_offset within this node. */
                offset = next_offset;
                next_offset = 0;
                prev = *enode;
        } while (!mte_is_leaf(next));

        return slots;
}

/*
 * mt_destroy_walk() - Mark a tree or sub-tree dead and optionally free it.
 * @enode: The encoded node to start from
 * @mt: The maple tree
 * @free: When true, nodes are freed as the walk proceeds.  When false nothing
 *        is freed here and the start node is passed to mt_clear_meta().
 *
 * Each internal node visited has ->slot_len set to the number of children
 * that mte_dead_leaves() marked dead.
 */
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
                            bool free)
{
        void __rcu **slots;
        struct maple_node *node = mte_to_node(enode);
        struct maple_enode *start;

        /* A lone leaf only needs to be marked dead and have its type stored. */
        if (mte_is_leaf(enode)) {
                mte_set_node_dead(enode);
                node->type = mte_node_type(enode);
                goto free_leaf;
        }

        start = enode;
        /* The start node is recorded as its own parent, at slot 0. */
        slots = mte_destroy_descend(&enode, mt, start, 0);
        node = mte_to_node(enode); // Updated in the above call.
        do {
                enum maple_type type;
                unsigned char offset;
                struct maple_enode *parent, *tmp;

                /* Mark this node's children dead and remember how many there were. */
                node->slot_len = mte_dead_leaves(enode, mt, slots);
                if (free)
                        mt_free_bulk(node->slot_len, slots);
                /* Go up to the recorded parent and look at the next sibling slot. */
                offset = node->parent_slot + 1;
                enode = node->piv_parent;
                /* A node recorded as its own parent ends the walk. */
                if (mte_to_node(enode) == node)
                        goto free_leaf;

                type = mte_node_type(enode);
                slots = ma_slots(mte_to_node(enode), type);
                /* No more slots in the parent: continue from the parent itself. */
                if (offset >= mt_slots[type])
                        goto next;

                tmp = mt_slot_locked(mt, slots, offset);
                /* Descend into the sibling only if the slot decodes to a node and a type. */
                if (mte_node_type(tmp) && mte_to_node(tmp)) {
                        parent = enode;
                        enode = tmp;
                        slots = mte_destroy_descend(&enode, mt, parent, offset);
                }
next:
                node = mte_to_node(enode);
        } while (start != enode);

        /* Back at the start node: handle its own children. */
        node = mte_to_node(enode);
        node->slot_len = mte_dead_leaves(enode, mt, slots);
        if (free)
                mt_free_bulk(node->slot_len, slots);

free_leaf:
        if (free)
                kfree(node);
        else
                mt_clear_meta(mt, node, node->type);
}

/*
 * mte_destroy_walk() - Free a tree or sub-tree.
 * @enode: the encoded maple node (maple_enode) to start
 * @mt: the tree to free - needed for node types.
 *
 * Outside of RCU mode the nodes are freed during the walk.  In RCU mode the
 * walk only marks the nodes and the freeing is handed to mt_free_walk() through
 * call_rcu().
 *
 * Must hold the write lock.
 */
static inline void mte_destroy_walk(struct maple_enode *enode,
                                    struct maple_tree *mt)
{
        struct maple_node *node = mte_to_node(enode);

        if (!mt_in_rcu(mt)) {
                mt_destroy_walk(enode, mt, true);
                return;
        }

        mt_destroy_walk(enode, mt, false);
        call_rcu(&node->rcu, mt_free_walk);
}
/* Interface */

/**
 * mas_store() - Store an @entry.
 * @mas: The maple state.
 * @entry: The entry to store.
 *
 * The @mas->index and @mas->last are used to set the range for the @entry.
 *
 * When @mas has MA_STATE_PREALLOC set, the store is performed without
 * allocating and the maple state is not destroyed here.  Otherwise any nodes
 * requested are allocated with GFP_NOWAIT; if that fails nothing is stored,
 * the error is left in @mas and %NULL is returned.
 *
 * Return: the first entry between mas->index and mas->last or %NULL.
 */
void *mas_store(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        trace_ma_write(TP_FCT, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
        /* The inverted-range check below only exists in debug builds. */
        if (MAS_WARN_ON(mas, mas->index > mas->last))
                pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last,
                       entry);

        if (mas->index > mas->last) {
                mas_set_err(mas, -EINVAL);
                return NULL;
        }

#endif

        /*
         * Storing is the same operation as insert with the added caveat that it
         * can overwrite entries.  Although this seems simple enough, one may
         * want to examine what happens if a single store operation was to
         * overwrite multiple entries within a self-balancing B-Tree.
         */
        mas_wr_prealloc_setup(&wr_mas);
        mas->store_type = mas_wr_store_type(&wr_mas);
        /* Preallocated state: store now, and leave the maple state intact. */
        if (mas->mas_flags & MA_STATE_PREALLOC) {
                mas_wr_store_entry(&wr_mas);
                MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
                return wr_mas.content;
        }

        /* Work out the node request; skip allocation when none are requested. */
        mas_prealloc_calc(&wr_mas, entry);
        if (!mas->node_request)
                goto store;

        mas_alloc_nodes(mas, GFP_NOWAIT);
        if (mas_is_err(mas))
                return NULL;

store:
        mas_wr_store_entry(&wr_mas);
        mas_destroy(mas);
        return wr_mas.content;
}
EXPORT_SYMBOL_GPL(mas_store);

/**
 * mas_store_gfp() - Store a value into the tree.
 * @mas: The maple state
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations if necessary.
 *
 * Preallocation is repeated for as long as mas_nomem() reports that it
 * allocated after an out-of-memory condition.  The maple state is destroyed
 * before returning, whether or not the store happened.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp)
{
        unsigned long start_index = mas->index;
        unsigned long end_index = mas->last;
        MA_WR_STATE(wr_mas, mas, entry);
        int ret = 0;

        for (;;) {
                mas_wr_preallocate(&wr_mas, entry);
                if (likely(!mas_nomem(mas, gfp)))
                        break;

                /*
                 * Put the caller's range back before retrying a NULL store;
                 * presumably preallocation may have changed it - confirm.
                 */
                if (!entry)
                        __mas_set_range(mas, start_index, end_index);
        }

        if (mas_is_err(mas))
                ret = xa_err(mas->node);
        else
                mas_wr_store_entry(&wr_mas);

        mas_destroy(mas);
        return ret;
}
EXPORT_SYMBOL_GPL(mas_store_gfp);

/**
 * mas_store_prealloc() - Store a value into the tree using memory
 * preallocated in the maple state.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * @mas->store_type is read here without being computed, so it must already
 * have been set (mas_preallocate() sets it).  The maple state is destroyed
 * once the store has been done.
 */
void mas_store_prealloc(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        /* A root store needs no walk: set up the write state and store. */
        if (mas->store_type == wr_store_root) {
                mas_wr_prealloc_setup(&wr_mas);
                goto store;
        }

        mas_wr_walk_descend(&wr_mas);
        if (mas->store_type != wr_spanning_store) {
                /* set wr_mas->content to current slot */
                wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset);
                mas_wr_end_piv(&wr_mas);
        }

store:
        trace_ma_write(TP_FCT, mas, 0, entry);
        mas_wr_store_entry(&wr_mas);
        /* An error at this point is treated as a bug. */
        MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
        mas_destroy(mas);
}
EXPORT_SYMBOL_GPL(mas_store_prealloc);

/**
 * mas_preallocate() - Preallocate enough nodes for a store operation
 * @mas: The maple state
 * @entry: The entry that will be stored
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Records the store type in @mas and, when nodes are requested, allocates
 * them.  On success MA_STATE_PREALLOC is set in @mas->mas_flags.  On
 * allocation failure the maple state is destroyed and reset.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated.
 */
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
{
        MA_WR_STATE(wr_mas, mas, entry);

        mas_wr_prealloc_setup(&wr_mas);
        mas->store_type = mas_wr_store_type(&wr_mas);
        mas_prealloc_calc(&wr_mas, entry);

        if (mas->node_request) {
                mas->mas_flags &= ~MA_STATE_PREALLOC;
                mas_alloc_nodes(mas, gfp);
                if (mas_is_err(mas)) {
                        int err = xa_err(mas->node);

                        mas->node_request = 0;
                        mas_destroy(mas);
                        mas_reset(mas);
                        return err;
                }
        }

        mas->mas_flags |= MA_STATE_PREALLOC;
        return 0;
}
EXPORT_SYMBOL_GPL(mas_preallocate);

/*
 * mas_destroy() - destroy a maple state.
 * @mas: The maple state
 *
 * Clears MA_STATE_PREALLOC from @mas->mas_flags, then calls mas_empty_nodes()
 * to free any allocated nodes associated with this maple state.
 *
 * NOTE(review): earlier wording described rebalancing the left-most node on
 * completion; this function itself does no rebalancing.
 */
void mas_destroy(struct ma_state *mas)
{
        mas->mas_flags &= ~MA_STATE_PREALLOC;
        mas_empty_nodes(mas);
}
EXPORT_SYMBOL_GPL(mas_destroy);

static void mas_may_activate(struct ma_state *mas)
{
        if (!mas->node) {
                mas->status = ma_start;
        } else if (mas->index > mas->max || mas->index < mas->min) {
                mas->status = ma_start;
        } else {
                mas->status = ma_active;
        }
}

/*
 * mas_next_setup() - Prepare the maple state for mas_next()/mas_next_range().
 * @mas: The maple state
 * @max: The maximum index to check
 * @entry: Written with an entry on some of the paths that return true
 *
 * Return: true if the caller should return *@entry as it stands, false if the
 * caller should go on to the next slot.
 */
static bool mas_next_setup(struct ma_state *mas, unsigned long max,
                void **entry)
{
        /* Remember the status at entry; it is consulted in the pointer case below. */
        bool was_none = mas_is_none(mas);

        /* Already at or beyond the limit: flag overflow and stop. */
        if (unlikely(mas->last >= max)) {
                mas->status = ma_overflow;
                return true;
        }

        switch (mas->status) {
        case ma_active:
                return false;
        case ma_none:
                fallthrough;
        case ma_pause:
                mas->status = ma_start;
                fallthrough;
        case ma_start:
                mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
                break;
        case ma_overflow:
                /* Overflowed before, but the max changed */
                mas_may_activate(mas);
                break;
        case ma_underflow:
                /* The user expects the mas to be one before where it is */
                mas_may_activate(mas);
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (likely(mas_is_active(mas))) /* Fast path */
                return false;

        /*
         * Root-pointer status: no entry is returned.  If the state was none
         * at entry and the index is 0, the range is set to 0-0; otherwise the
         * range becomes 1-ULONG_MAX and the status becomes none.
         */
        if (mas_is_ptr(mas)) {
                *entry = NULL;
                if (was_none && mas->index == 0) {
                        mas->index = mas->last = 0;
                        return true;
                }
                mas->index = 1;
                mas->last = ULONG_MAX;
                mas->status = ma_none;
                return true;
        }

        if (mas_is_none(mas))
                return true;

        return false;
}

/**
 * mas_next() - Get the next entry.
 * @mas: The maple state
 * @max: The maximum index to check.
 *
 * Returns the next entry after @mas->index.
 * Must hold rcu_read_lock or the write lock.
 * Can return the zero entry.
 *
 * Return: The next entry or %NULL
 */
void *mas_next(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_next_setup(mas, max, &found))
                found = mas_next_slot(mas, max, false);

        return found;
}
EXPORT_SYMBOL_GPL(mas_next);

/**
 * mas_next_range() - Advance the maple state to the next range
 * @mas: The maple state
 * @max: The maximum index to check.
 *
 * Sets @mas->index and @mas->last to the range.
 * Must hold rcu_read_lock or the write lock.
 * Can return the zero entry.
 *
 * Return: The next entry or %NULL
 */
void *mas_next_range(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_next_setup(mas, max, &found))
                found = mas_next_slot(mas, max, true);

        return found;
}
EXPORT_SYMBOL_GPL(mas_next_range);

/**
 * mt_next() - get the next value in the maple tree
 * @mt: The maple tree
 * @index: The start index
 * @max: The maximum index to check
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * Return: The entry higher than @index or %NULL if nothing is found.
 */
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max)
{
        MA_STATE(mas, mt, index, index);
        void *next;

        rcu_read_lock();
        next = mas_next(&mas, max);
        rcu_read_unlock();

        return next;
}
EXPORT_SYMBOL_GPL(mt_next);

/*
 * mas_prev_setup() - Prepare the maple state for mas_prev()/mas_prev_range().
 * @mas: The maple state
 * @min: The minimum index to check
 * @entry: Written with an entry on some of the paths that return true
 *
 * Return: true if the caller should return *@entry as it stands, false if the
 * caller should go on to the previous slot.
 */
static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry)
{
        /* Already at or below the limit: flag underflow and stop. */
        if (unlikely(mas->index <= min)) {
                mas->status = ma_underflow;
                return true;
        }

        switch (mas->status) {
        case ma_active:
                return false;
        case ma_start:
                break;
        case ma_none:
                fallthrough;
        case ma_pause:
                mas->status = ma_start;
                break;
        case ma_underflow:
                /* underflowed before but the min changed */
                mas_may_activate(mas);
                break;
        case ma_overflow:
                /* User expects mas to be one after where it is */
                mas_may_activate(mas);
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (mas_is_start(mas))
                mas_walk(mas);

        /*
         * Root-pointer status: at index 0 there is nothing earlier, so the
         * status becomes none.  Otherwise the range is set to 0-0 and the
         * root entry is returned.
         */
        if (unlikely(mas_is_ptr(mas))) {
                if (!mas->index) {
                        mas->status = ma_none;
                        return true;
                }
                mas->index = mas->last = 0;
                *entry = mas_root(mas);
                return true;
        }

        if (mas_is_none(mas)) {
                if (mas->index) {
                        /* Walked to out-of-range pointer? */
                        mas->index = mas->last = 0;
                        mas->status = ma_root;
                        *entry = mas_root(mas);
                        return true;
                }
                return true;
        }

        return false;
}

/**
 * mas_prev() - Get the previous entry
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * Will reset mas to ma_start if the status is ma_none.  Will stop on not
 * searchable nodes.
 *
 * Return: the previous value or %NULL.
 */
void *mas_prev(struct ma_state *mas, unsigned long min)
{
        void *found = NULL;

        if (!mas_prev_setup(mas, min, &found))
                found = mas_prev_slot(mas, min, false);

        return found;
}
EXPORT_SYMBOL_GPL(mas_prev);

/**
 * mas_prev_range() - Advance to the previous range
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Sets @mas->index and @mas->last to the range.
 * Must hold rcu_read_lock or the write lock.
 * Will reset mas to ma_start if the status is ma_none.  Will stop on not
 * searchable nodes.
 *
 * Return: the previous value or %NULL.
 */
void *mas_prev_range(struct ma_state *mas, unsigned long min)
{
        void *found = NULL;

        if (!mas_prev_setup(mas, min, &found))
                found = mas_prev_slot(mas, min, true);

        return found;
}
EXPORT_SYMBOL_GPL(mas_prev_range);

/**
 * mt_prev() - get the previous value in the maple tree
 * @mt: The maple tree
 * @index: The start index
 * @min: The minimum index to check
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * Return: The entry before @index or %NULL if nothing is found.
 */
void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min)
{
        MA_STATE(mas, mt, index, index);
        void *prev;

        rcu_read_lock();
        prev = mas_prev(&mas, min);
        rcu_read_unlock();

        return prev;
}
EXPORT_SYMBOL_GPL(mt_prev);

/**
 * mas_pause() - Pause a mas_find/mas_for_each to drop the lock.
 * @mas: The maple state to pause
 *
 * Some users need to pause a walk and drop the lock they're holding in
 * order to yield to a higher priority thread or carry out an operation
 * on an entry.  Those users should call this function before they drop
 * the lock.  It resets the @mas to be suitable for the next iteration
 * of the loop after the user has reacquired the lock.  If most entries
 * found during a walk require you to call mas_pause(), the mt_for_each()
 * iterator may be more appropriate.
 *
 * The node pointer is cleared and the status is set to ma_pause.
 */
void mas_pause(struct ma_state *mas)
{
        mas->node = NULL;
        mas->status = ma_pause;
}
EXPORT_SYMBOL_GPL(mas_pause);

/**
 * mas_find_setup() - Internal function to set up mas_find*().
 * @mas: The maple state
 * @max: The maximum index
 * @entry: Pointer to the entry
 *
 * Dispatches on @mas->status to decide whether the search can continue from
 * the current position, has to walk to @mas->index first, or is already
 * finished.
 *
 * Returns: True if entry is the answer, false otherwise.
 */
static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry)
{
        switch (mas->status) {
        case ma_active:
                /* Continue only while the current range ends below the limit. */
                if (mas->last < max)
                        return false;
                return true;
        case ma_start:
                break;
        case ma_pause:
                if (unlikely(mas->last >= max))
                        return true;

                /* Resume just past the range that was last returned. */
                mas->index = ++mas->last;
                mas->status = ma_start;
                break;
        case ma_none:
                if (unlikely(mas->last >= max))
                        return true;

                mas->index = mas->last;
                mas->status = ma_start;
                break;
        case ma_underflow:
                /* mas is pointing at entry before unable to go lower */
                if (unlikely(mas->index >= max)) {
                        mas->status = ma_overflow;
                        return true;
                }

                mas_may_activate(mas);
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_overflow:
                if (unlikely(mas->last >= max))
                        return true;

                mas_may_activate(mas);
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (mas_is_start(mas)) {
                /* First run or continue */
                if (mas->index > max)
                        return true;

                *entry = mas_walk(mas);
                if (*entry)
                        return true;

        }

        if (unlikely(mas_is_ptr(mas)))
                goto ptr_out_of_range;

        if (unlikely(mas_is_none(mas)))
                return true;

        /* Nothing was found at the index and it is already the limit. */
        if (mas->index == max)
                return true;

        return false;

ptr_out_of_range:
        /* Root-pointer status: report the range 1-ULONG_MAX with status none. */
        mas->status = ma_none;
        mas->index = 1;
        mas->last = ULONG_MAX;
        return true;
}

/**
 * mas_find() - On the first call, find the entry at or after mas->index up to
 * %max.  Otherwise, find the entry after mas->index.
 * @mas: The maple state
 * @max: The maximum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_overflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        if (!mas_find_setup(mas, max, &found)) {
                /* Retries on dead nodes handled by mas_next_slot */
                found = mas_next_slot(mas, max, false);
                /* Ignore overflow */
                mas->status = ma_active;
        }

        return found;
}
EXPORT_SYMBOL_GPL(mas_find);

/**
 * mas_find_range() - On the first call, find the entry at or after
 * mas->index up to %max.  Otherwise, advance to the next slot mas->index.
 * @mas: The maple state
 * @max: The maximum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_overflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_range(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_find_setup(mas, max, &found))
                found = mas_next_slot(mas, max, true);

        return found;
}
EXPORT_SYMBOL_GPL(mas_find_range);

/**
 * mas_find_rev_setup() - Internal function to set up mas_find_*_rev()
 * @mas: The maple state
 * @min: The minimum index
 * @entry: Pointer to the entry
 *
 * Dispatches on @mas->status to decide whether the reverse search can continue
 * from the current position, has to walk to @mas->index first, or is already
 * finished.
 *
 * Returns: True if entry is the answer, false otherwise.
 */
static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min,
                void **entry)
{

        switch (mas->status) {
        case ma_active:
                goto active;
        case ma_start:
                break;
        case ma_pause:
                if (unlikely(mas->index <= min)) {
                        mas->status = ma_underflow;
                        return true;
                }
                /* Resume just below the range that was last returned. */
                mas->last = --mas->index;
                mas->status = ma_start;
                break;
        case ma_none:
                if (mas->index <= min)
                        goto none;

                mas->last = mas->index;
                mas->status = ma_start;
                break;
        case ma_overflow: /* user expects the mas to be one after where it is */
                if (unlikely(mas->index <= min)) {
                        mas->status = ma_underflow;
                        return true;
                }

                mas->status = ma_active;
                break;
        case ma_underflow: /* user expects the mas to be one before where it is */
                if (unlikely(mas->index <= min))
                        return true;

                mas->status = ma_active;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (mas_is_start(mas)) {
                /* First run or continue */
                if (mas->index < min)
                        return true;

                *entry = mas_walk(mas);
                if (*entry)
                        return true;
        }

        /* Root-pointer status: end with status none and no entry. */
        if (unlikely(mas_is_ptr(mas)))
                goto none;

        if (unlikely(mas_is_none(mas))) {
                /*
                 * Walked to the location, and there was nothing so the previous
                 * location is 0.
                 */
                mas->last = mas->index = 0;
                mas->status = ma_root;
                *entry = mas_root(mas);
                return true;
        }

active:
        /* The current position is already below the limit: nothing more to find. */
        if (mas->index < min)
                return true;

        return false;

none:
        mas->status = ma_none;
        return true;
}

/**
 * mas_find_rev() - On the first call, find the first non-null entry at or
 * below mas->index down to %min.  Otherwise find the first non-null entry
 * below mas->index down to %min.
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_underflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_rev(struct ma_state *mas, unsigned long min)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_prev_slot */
        if (!mas_find_rev_setup(mas, min, &found))
                found = mas_prev_slot(mas, min, false);

        return found;
}
EXPORT_SYMBOL_GPL(mas_find_rev);

/**
 * mas_find_range_rev() - On the first call, find the first non-null entry at
 * or below mas->index down to %min.  Otherwise advance to the previous slot
 * after mas->index down to %min.
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_underflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_range_rev(struct ma_state *mas, unsigned long min)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_prev_slot */
        if (!mas_find_rev_setup(mas, min, &found))
                found = mas_prev_slot(mas, min, true);

        return found;
}
EXPORT_SYMBOL_GPL(mas_find_range_rev);

/**
 * mas_erase() - Find the range in which index resides and erase the entire
 * range.
 * @mas: The maple state
 *
 * Must hold the write lock.
 * Searches for @mas->index, sets @mas->index and @mas->last to the range and
 * erases that range.
 *
 * The search always restarts from the top of the tree, whatever status @mas
 * had on entry.  If preparing the store leaves an error in @mas, nothing is
 * erased but the entry that was found is still returned; callers that need to
 * know must check @mas for an error.
 *
 * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated.
 */
void *mas_erase(struct ma_state *mas)
{
        void *entry;
        unsigned long index = mas->index;
        MA_WR_STATE(wr_mas, mas, NULL);

        /*
         * Always look the range up afresh.  A status cannot be both active
         * and start, so a test of "not active or not start" holds for every
         * status; the reset is therefore done unconditionally.
         */
        mas->status = ma_start;

write_retry:
        entry = mas_state_walk(mas);
        if (!entry)
                return NULL;

        /* Must reset to ensure spanning writes of last slot are detected */
        mas_reset(mas);
        mas_wr_preallocate(&wr_mas, NULL);
        if (mas_nomem(mas, GFP_KERNEL)) {
                /* in case the range of entry changed when unlocked */
                mas->index = mas->last = index;
                goto write_retry;
        }

        if (mas_is_err(mas))
                goto out;

        mas_wr_store_entry(&wr_mas);
out:
        mas_destroy(mas);
        return entry;
}
EXPORT_SYMBOL_GPL(mas_erase);

/**
 * mas_nomem() - Check if there was an error allocating and do the allocation
 * if necessary.
 * @mas: The maple state
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Does nothing unless @mas->node holds the -ENOMEM error.  When it does, the
 * allocation is retried with @gfp.  If @gfp allows blocking and the tree does
 * not use an external lock, the tree lock is dropped around the allocation and
 * retaken afterwards.
 *
 * Return: true on allocation, false otherwise.
 */
bool mas_nomem(struct ma_state *mas, gfp_t gfp)
        __must_hold(mas->tree->ma_lock)
{
        /* Only an out-of-memory error is handled here. */
        if (likely(mas->node != MA_ERROR(-ENOMEM)))
                return false;

        if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) {
                /* Allocate with the tree lock released, then retake it. */
                mtree_unlock(mas->tree);
                mas_alloc_nodes(mas, gfp);
                mtree_lock(mas->tree);
        } else {
                mas_alloc_nodes(mas, gfp);
        }

        /* Nothing was obtained: report that no allocation took place. */
        if (!mas->sheaf && !mas->alloc)
                return false;

        /* The caller is expected to retry, so start again from the top. */
        mas->status = ma_start;
        return true;
}

/*
 * maple_tree_init() - Create the slab cache used for maple nodes.
 *
 * The cache is named "maple_node", its objects are aligned to the size of a
 * struct maple_node, and it is created with SLAB_PANIC.
 */
void __init maple_tree_init(void)
{
        struct kmem_cache_args cache_args = {
                .sheaf_capacity = 32,
                .align  = sizeof(struct maple_node),
        };

        maple_node_cache = kmem_cache_create("maple_node",
                                             sizeof(struct maple_node),
                                             &cache_args, SLAB_PANIC);
}

/**
 * mtree_load() - Load a value stored in a maple tree
 * @mt: The maple tree
 * @index: The index to load
 *
 * Takes the RCU read lock internally for the duration of the lookup.  A zero
 * entry is reported as %NULL.
 *
 * Return: the entry or %NULL
 */
void *mtree_load(struct maple_tree *mt, unsigned long index)
{
        MA_STATE(mas, mt, index, index);
        void *entry;

        trace_ma_read(TP_FCT, &mas);
        rcu_read_lock();
retry:
        entry = mas_start(&mas);
        if (unlikely(mas_is_none(&mas)))
                goto unlock;

        /* Root-pointer status: the entry is only returned for index 0. */
        if (unlikely(mas_is_ptr(&mas))) {
                if (index)
                        entry = NULL;

                goto unlock;
        }

        entry = mtree_lookup_walk(&mas);
        /*
         * No entry and the state is back at start: look the index up again.
         * NOTE(review): presumably the walk hit a dead node - confirm.
         */
        if (!entry && unlikely(mas_is_start(&mas)))
                goto retry;
unlock:
        rcu_read_unlock();
        /* Zero entries are not exposed to the caller. */
        if (xa_is_zero(entry))
                return NULL;

        return entry;
}
EXPORT_SYMBOL(mtree_load);

/**
 * mtree_store_range() - Store an entry at a given range.
 * @mt: The maple tree
 * @index: The start of the range
 * @last: The end of the range
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Takes and releases the tree lock around the store.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mtree_store_range(struct maple_tree *mt, unsigned long index,
                unsigned long last, void *entry, gfp_t gfp)
{
        MA_STATE(mas, mt, index, last);
        int err;

        trace_ma_write(TP_FCT, &mas, 0, entry);

        /* Advanced entries cannot be stored. */
        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;

        /* The range must not be inverted. */
        if (index > last)
                return -EINVAL;

        mtree_lock(mt);
        err = mas_store_gfp(&mas, entry, gfp);
        mtree_unlock(mt);

        return err;
}
EXPORT_SYMBOL(mtree_store_range);

/**
 * mtree_store() - Store an entry at a given index.
 * @mt: The maple tree
 * @index: The index to store the value
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Equivalent to mtree_store_range() with a range of the single index @index.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mtree_store(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp)
{
        return mtree_store_range(mt, index, index, entry, gfp);
}
EXPORT_SYMBOL(mtree_store);

/**
 * mtree_insert_range() - Insert an entry at a given range if there is no value.
 * @mt: The maple tree
 * @first: The start of the range
 * @last: The end of the range
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Takes and releases the tree lock around the insert.
 *
 * Return: 0 on success, -EEXIST if the range is occupied, -EINVAL on invalid
 * request, -ENOMEM if memory could not be allocated.
 */
int mtree_insert_range(struct maple_tree *mt, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        MA_STATE(ms, mt, first, last);
        int err = 0;

        /* Refuse advanced-API entries (with a warning) and inverted ranges. */
        if (WARN_ON_ONCE(xa_is_advanced(entry)) || first > last)
                return -EINVAL;

        mtree_lock(mt);
        /* Repeat the insert for as long as mas_nomem() asks for a retry. */
        do {
                mas_insert(&ms, entry);
        } while (mas_nomem(&ms, gfp));
        mtree_unlock(mt);

        if (mas_is_err(&ms))
                err = xa_err(ms.node);

        mas_destroy(&ms);
        return err;
}
EXPORT_SYMBOL(mtree_insert_range);

/**
 * mtree_insert() - Insert an entry at a given index if there is no value.
 * @mt: The maple tree
 * @index: The index to store the value
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Thin wrapper around mtree_insert_range() with @index as both ends of the
 * range.
 *
 * Return: 0 on success, -EEXIST if the range is occupied, -EINVAL on invalid
 * request, -ENOMEM if memory could not be allocated.
 */
int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry,
                 gfp_t gfp)
{
        /* A single index is the one-element range [index, index]. */
        const unsigned long last = index;

        return mtree_insert_range(mt, index, last, entry, gfp);
}
EXPORT_SYMBOL(mtree_insert);

/**
 * mtree_alloc_range() - Find an empty area and insert an entry there.
 * @mt: The maple tree; must satisfy mt_is_alloc()
 * @startp: Set to the index chosen for @entry on success
 * @entry: The entry to store
 * @size: Handed to mas_empty_area() as the size to look for
 * @min: Handed to mas_empty_area() as the lower bound of the search
 * @max: Handed to mas_empty_area() as the upper bound of the search
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Takes and releases the tree lock.
 *
 * Return: 0 on success, -EINVAL if @mt is not an allocation tree or @entry is
 * reserved, otherwise the error reported by mas_empty_area() or by the insert.
 */
int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        int err = 0;

        if (!mt_is_alloc(mt))
                return -EINVAL;

        if (WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        do {
                err = mas_empty_area(&mas, min, max, size);
                if (err)
                        goto unlock;

                mas_insert(&mas, entry);
                /*
                 * mas_nomem() may release the lock, causing the allocated area
                 * to be unavailable, so search for a free area again whenever
                 * it asks for a retry.
                 */
        } while (mas_nomem(&mas, gfp));

        if (mas_is_err(&mas))
                err = xa_err(mas.node);
        else
                *startp = mas.index;

unlock:
        mtree_unlock(mt);
        mas_destroy(&mas);
        return err;
}
EXPORT_SYMBOL(mtree_alloc_range);

/**
 * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree.
 * @mt: The maple tree.
 * @startp: Pointer to ID.
 * @range_lo: Lower bound of range to search.
 * @range_hi: Upper bound of range to search.
 * @entry: The entry to store.
 * @next: Pointer to next ID to allocate.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Finds an empty entry in @mt after @next, stores the new index into
 * the @startp pointer, stores the entry at that index, then updates @next.
 *
 * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag.
 *
 * Context: Any context.  Takes and releases the mt.lock.  May sleep if
 * the @gfp flags permit.
 *
 * Return: 0 if the allocation succeeded without wrapping, 1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no
 * free entries.
 */
int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        int rc;

        /* Needs an allocation tree and a non-reserved entry. */
        if (!mt_is_alloc(mt) || WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        rc = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi,
                              next, gfp);
        mtree_unlock(mt);

        return rc;
}
EXPORT_SYMBOL(mtree_alloc_cyclic);

/**
 * mtree_alloc_rrange() - Find an empty area with a reverse search and insert
 * an entry there.
 * @mt: The maple tree; must satisfy mt_is_alloc()
 * @startp: Set to the index chosen for @entry on success
 * @entry: The entry to store
 * @size: Handed to mas_empty_area_rev() as the size to look for
 * @min: Handed to mas_empty_area_rev() as the lower bound of the search
 * @max: Handed to mas_empty_area_rev() as the upper bound of the search
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Takes and releases the tree lock.
 *
 * Return: 0 on success, -EINVAL if @mt is not an allocation tree or @entry is
 * reserved, otherwise the error reported by mas_empty_area_rev() or by the
 * insert.
 */
int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        int err = 0;

        if (!mt_is_alloc(mt))
                return -EINVAL;

        if (WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        do {
                err = mas_empty_area_rev(&mas, min, max, size);
                if (err)
                        goto unlock;

                mas_insert(&mas, entry);
                /*
                 * mas_nomem() may release the lock, causing the allocated area
                 * to be unavailable, so search for a free area again whenever
                 * it asks for a retry.
                 */
        } while (mas_nomem(&mas, gfp));

        if (mas_is_err(&mas))
                err = xa_err(mas.node);
        else
                *startp = mas.index;

unlock:
        mtree_unlock(mt);
        mas_destroy(&mas);
        return err;
}
EXPORT_SYMBOL(mtree_alloc_rrange);

/**
 * mtree_erase() - Find an index and erase the entire range.
 * @mt: The maple tree
 * @index: The index to erase
 *
 * Erasing is the same as a walk to an entry then a store of a NULL to that
 * ENTIRE range.  In fact, it is implemented as such using the advanced API.
 * The tree lock is taken and released around the operation.
 *
 * Return: The entry stored at the @index or %NULL
 */
void *mtree_erase(struct maple_tree *mt, unsigned long index)
{
        MA_STATE(mas, mt, index, index);
        void *old;

        trace_ma_op(TP_FCT, &mas);

        mtree_lock(mt);
        old = mas_erase(&mas);
        mtree_unlock(mt);

        return old;
}
EXPORT_SYMBOL(mtree_erase);

/*
 * mas_dup_free() - Free an incomplete duplication of a tree.
 * @mas: The maple state of an incomplete tree.
 *
 * The parameter @mas->node passed in indicates that the allocation failed on
 * this node. This function frees all nodes starting from @mas->node in the
 * reverse order of mas_dup_build(). There is no need to hold the source tree
 * lock at this time.
 *
 * Each pass of the loop frees the children of one node in bulk; the root
 * itself is freed last, after the loop.
 */
static void mas_dup_free(struct ma_state *mas)
{
        struct maple_node *node;
        enum maple_type type;
        void __rcu **slots;
        unsigned char count, i;

        /* Maybe the first node allocation failed. */
        if (mas_is_none(mas))
                return;

        while (!mte_is_root(mas->node)) {
                /* Step up to the parent of the current node. */
                mas_ascend(mas);
                if (mas->offset) {
                        /*
                         * A non-zero offset means there is an earlier sibling:
                         * step to it and follow the last slot of every level
                         * down to a leaf, then go back up to that leaf's
                         * parent so its children are freed next.
                         */
                        mas->offset--;
                        do {
                                mas_descend(mas);
                                mas->offset = mas_data_end(mas);
                        } while (!mte_is_leaf(mas->node));

                        mas_ascend(mas);
                }

                node = mte_to_node(mas->node);
                type = mte_node_type(mas->node);
                slots = ma_slots(node, type);
                count = mas_data_end(mas) + 1;
                /*
                 * Clear the MAPLE_NODE_MASK bits in place so the slot array
                 * holds plain node addresses before it is passed to the bulk
                 * free.
                 */
                for (i = 0; i < count; i++)
                        ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK;
                mt_free_bulk(count, slots);
        }

        /* Only the root is left. */
        node = mte_to_node(mas->node);
        kfree(node);
}

/*
 * mas_copy_node() - Copy a maple node and replace the parent.
 * @mas: The maple state of source tree.
 * @new_mas: The maple state of new tree.
 * @parent: The parent of the new node.
 *
 * Copy @mas->node to @new_mas->node, then point the copy's parent field at
 * @parent while keeping the MAPLE_NODE_MASK bits of the source's parent field.
 * Nothing is allocated here; @new_mas->node must already refer to a node.
 */
static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas,
                struct maple_pnode *parent)
{
        struct maple_node *src = mte_to_node(mas->node);
        struct maple_node *dst = mte_to_node(new_mas->node);
        unsigned long tag_bits;

        /* Start from a verbatim copy of the whole source node. */
        memcpy(dst, src, sizeof(*dst));

        /* Keep the encoded low bits, swap in the new parent's address. */
        tag_bits = (unsigned long)src->parent & MAPLE_NODE_MASK;
        dst->parent = ma_parent_ptr(tag_bits | (unsigned long)parent);
}

/*
 * mas_dup_alloc() - Allocate child nodes for a maple node.
 * @mas: The maple state of source tree.
 * @new_mas: The maple state of new tree.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Allocates one node for every slot of @mas->node up to its data end and
 * installs them in the matching slots of @new_mas->node, carrying over the
 * MAPLE_NODE_MASK bits of each source slot. If memory allocation fails, @mas
 * is left in an error state and @new_mas->node is not modified.
 */
static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas,
                gfp_t gfp)
{
        struct maple_node *src = mte_to_node(mas->node);
        struct maple_node *dst = mte_to_node(new_mas->node);
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **src_slots;
        void __rcu **dst_slots;
        unsigned char nr, slot;
        unsigned long low_bits;

        /* Ask for as many nodes as the source node has slots in use. */
        mas->node_request = mas_data_end(mas) + 1;
        nr = mas->node_request;
        mas_alloc_nodes(mas, gfp);
        if (unlikely(mas_is_err(mas)))
                return;

        src_slots = ma_slots(src, type);
        dst_slots = ma_slots(dst, type);
        for (slot = 0; slot < nr; slot++) {
                /* Reuse the source slot's encoded bits on a fresh node. */
                low_bits = (unsigned long)mt_slot_locked(mas->tree, src_slots,
                                                         slot);
                low_bits &= MAPLE_NODE_MASK;
                dst_slots[slot] = ma_mnode_ptr((unsigned long)mas_pop_node(mas) |
                                               low_bits);
        }
}

/*
 * mas_dup_build() - Build a new maple tree from a source tree
 * @mas: The maple state of source tree, need to be in MAS_START state.
 * @new_mas: The maple state of new tree, need to be in MAS_START state.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * This function builds a new tree in DFS preorder. If the memory allocation
 * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the
 * last node. mas_dup_free() will free the incomplete duplication of a tree.
 *
 * Note that the attributes of the two trees need to be exactly the same, and the
 * new tree needs to be empty, otherwise -EINVAL will be set in @mas.
 *
 * The new root is only published in @new_mas->tree once the whole copy has
 * succeeded; on the error paths the new tree's root pointer is not written.
 */
static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas,
                gfp_t gfp)
{
        struct maple_node *node;
        struct maple_pnode *parent = NULL;
        struct maple_enode *root;
        enum maple_type type;

        /* Both preconditions are reported as -EINVAL in the source state. */
        if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) ||
            unlikely(!mtree_empty(new_mas->tree))) {
                mas_set_err(mas, -EINVAL);
                return;
        }

        /*
         * With no node to copy (pointer root or "none" state) the value
         * returned by mas_start() is installed as the new root as is.
         */
        root = mas_start(mas);
        if (mas_is_ptr(mas) || mas_is_none(mas))
                goto set_new_tree;

        node = mt_alloc_one(gfp);
        if (!node) {
                /* mas_dup_free() returns early when it sees ma_none. */
                new_mas->status = ma_none;
                mas_set_err(mas, -ENOMEM);
                return;
        }

        type = mte_node_type(mas->node);
        root = mt_mk_node(node, type);
        new_mas->node = root;
        new_mas->min = 0;
        new_mas->max = ULONG_MAX;
        /* From here on 'root' is the root-marked encoding of the new root. */
        root = mte_mk_root(root);
        while (1) {
                mas_copy_node(mas, new_mas, parent);
                if (!mte_is_leaf(mas->node)) {
                        /* Only allocate child nodes for non-leaf nodes. */
                        mas_dup_alloc(mas, new_mas, gfp);
                        if (unlikely(mas_is_err(mas)))
                                goto empty_mas;
                } else {
                        /*
                         * This is the last leaf node and duplication is
                         * completed.
                         */
                        if (mas->max == ULONG_MAX)
                                goto done;

                        /* This is not the last leaf node and needs to go up. */
                        do {
                                mas_ascend(mas);
                                mas_ascend(new_mas);
                        } while (mas->offset == mas_data_end(mas));

                        /* Move to the next subtree. */
                        mas->offset++;
                        new_mas->offset++;
                }

                /*
                 * Both states descend in lock step; the node the new state is
                 * leaving becomes the parent of the node copied next.
                 */
                mas_descend(mas);
                parent = ma_parent_ptr(mte_to_node(new_mas->node));
                mas_descend(new_mas);
                mas->offset = 0;
                new_mas->offset = 0;
        }
done:
        /* Specially handle the parent of the root node. */
        mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas));
set_new_tree:
        /* Make them the same height */
        new_mas->tree->ma_flags = mas->tree->ma_flags;
        rcu_assign_pointer(new_mas->tree->ma_root, root);
empty_mas:
        /* Reached on success and on allocation failure alike. */
        mas_empty_nodes(mas);
}

/**
 * __mt_dup(): Duplicate an entire maple tree
 * @mt: The source maple tree
 * @new: The new maple tree
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
 * traversal. It uses memcpy() to copy nodes in the source tree and allocate
 * new child nodes in non-leaf nodes. The new node is exactly the same as the
 * source node except for all the addresses stored in it. It will be faster than
 * traversing all elements in the source tree and inserting them one by one into
 * the new tree.
 * The user needs to ensure that the attributes of the source tree and the new
 * tree are the same, and the new tree needs to be an empty tree, otherwise
 * -EINVAL will be returned.
 * Note that the user needs to manually lock the source tree and the new tree.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If
 * the attributes of the two trees are different or the new tree is not an empty
 * tree.
 */
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        MA_STATE(new_mas, new, 0, 0);
        int err;

        mas_dup_build(&mas, &new_mas, gfp);
        if (likely(!mas_is_err(&mas)))
                return 0;

        err = xa_err(mas.node);
        /* Only an allocation failure can leave a partial copy to free. */
        if (err == -ENOMEM)
                mas_dup_free(&new_mas);

        return err;
}
EXPORT_SYMBOL(__mt_dup);

/**
 * mtree_dup(): Duplicate an entire maple tree
 * @mt: The source maple tree
 * @new: The new maple tree
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
 * traversal. It uses memcpy() to copy nodes in the source tree and allocate
 * new child nodes in non-leaf nodes. The new node is exactly the same as the
 * source node except for all the addresses stored in it. It will be faster than
 * traversing all elements in the source tree and inserting them one by one into
 * the new tree.
 * The user needs to ensure that the attributes of the source tree and the new
 * tree are the same, and the new tree needs to be an empty tree, otherwise
 * -EINVAL will be returned.
 * Unlike __mt_dup(), both trees are locked here: the new tree first, then the
 * source tree as a nested lock.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If
 * the attributes of the two trees are different or the new tree is not an empty
 * tree.
 */
int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        MA_STATE(new_mas, new, 0, 0);
        int err = 0;

        mas_lock(&new_mas);
        mas_lock_nested(&mas, SINGLE_DEPTH_NESTING);
        mas_dup_build(&mas, &new_mas, gfp);
        /* The source lock is dropped before any cleanup of the copy. */
        mas_unlock(&mas);

        if (likely(!mas_is_err(&mas)))
                goto out_unlock;

        err = xa_err(mas.node);
        /* Only an allocation failure can leave a partial copy to free. */
        if (err == -ENOMEM)
                mas_dup_free(&new_mas);

out_unlock:
        mas_unlock(&new_mas);
        return err;
}
EXPORT_SYMBOL(mtree_dup);

/**
 * __mt_destroy() - Walk and free all nodes of a locked maple tree.
 * @mt: The maple tree
 *
 * The root pointer is cleared before the nodes are walked and freed.
 *
 * Note: Does not handle locking.
 */
void __mt_destroy(struct maple_tree *mt)
{
        void *old_root = mt_root_locked(mt);

        /* Unpublish the root first, then tear down what it pointed at. */
        rcu_assign_pointer(mt->ma_root, NULL);

        /* A non-node root owns no nodes, so there is nothing to walk. */
        if (xa_is_node(old_root))
                mte_destroy_walk(old_root, mt);

        mt->ma_flags = mt_attr(mt);
}
EXPORT_SYMBOL_GPL(__mt_destroy);

/**
 * mtree_destroy() - Destroy a maple tree
 * @mt: The maple tree
 *
 * Frees all resources used by the tree.  Handles locking: the tree lock is
 * taken around a call to __mt_destroy().
 */
void mtree_destroy(struct maple_tree *mt)
{
        mtree_lock(mt);
        __mt_destroy(mt);
        mtree_unlock(mt);
}
EXPORT_SYMBOL(mtree_destroy);

/**
 * mt_find() - Search from the start up until an entry is found.
 * @mt: The maple tree
 * @index: Pointer which contains the start location of the search
 * @max: The maximum value of the search range
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * In case that an entry is found @index is updated to point to the next
 * possible entry independent whether the found entry is occupying a
 * single index or a range of indices.
 *
 * Return: The entry at or after the @index or %NULL
 */
void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max)
{
        MA_STATE(mas, mt, *index, *index);
        void *entry;
#ifdef CONFIG_DEBUG_MAPLE_TREE
        /* Starting index, kept only for the sanity check further down. */
        unsigned long copy = *index;
#endif

        trace_ma_read(TP_FCT, &mas);

        /* Nothing to search when the start is already past the limit. */
        if ((*index) > max)
                return NULL;

        rcu_read_lock();
retry:
        /* Look at *index itself; redo the walk if the state is back at start. */
        entry = mas_state_walk(&mas);
        if (mas_is_start(&mas))
                goto retry;

        /* Zero entries count as empty. */
        if (unlikely(xa_is_zero(entry)))
                entry = NULL;

        if (entry)
                goto unlock;

        /* Nothing at *index: advance slot by slot while still below @max. */
        while (mas_is_active(&mas) && (mas.last < max)) {
                entry = mas_next_slot(&mas, max, false);
                if (likely(entry && !xa_is_zero(entry)))
                        break;
        }

        /* The loop can end on a zero entry; hide it from the caller. */
        if (unlikely(xa_is_zero(entry)))
                entry = NULL;
unlock:
        rcu_read_unlock();
        if (likely(entry)) {
                /* Continue after the end of the range that was found. */
                *index = mas.last + 1;
#ifdef CONFIG_DEBUG_MAPLE_TREE
                /* A zero *index is a wrap past ULONG_MAX and is not flagged. */
                if (MT_WARN_ON(mt, (*index) && ((*index) <= copy)))
                        pr_err("index not increased! %lx <= %lx\n",
                               *index, copy);
#endif
        }

        return entry;
}
EXPORT_SYMBOL(mt_find);

/**
 * mt_find_after() - Search from the start up until an entry is found.
 * @mt: The maple tree
 * @index: Pointer which contains the start location of the search
 * @max: The maximum value to check
 *
 * Same as mt_find() except that it checks @index for 0 before
 * searching. If @index == 0, the search is aborted. This covers a wrap
 * around of @index to 0 in an iterator loop.
 *
 * Return: The entry at or after the @index or %NULL
 */
void *mt_find_after(struct maple_tree *mt, unsigned long *index,
                    unsigned long max)
{
        /* A zero start is treated as a wrapped iterator: nothing is left. */
        return *index ? mt_find(mt, index, max) : NULL;
}
EXPORT_SYMBOL(mt_find_after);

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Test bookkeeping counters, exported (GPL-only) for use outside this file.
 * Only built with CONFIG_DEBUG_MAPLE_TREE.
 * NOTE(review): presumably updated by the MT_BUG_ON()-style check macros;
 * nothing in this part of the file writes them - confirm against the header.
 */
atomic_t maple_tree_tests_run;
EXPORT_SYMBOL_GPL(maple_tree_tests_run);
atomic_t maple_tree_tests_passed;
EXPORT_SYMBOL_GPL(maple_tree_tests_passed);

#ifndef __KERNEL__
/*
 * mt_set_non_kernel() - Forward @val to kmem_cache_set_non_kernel() for the
 * maple node cache.  Only built when __KERNEL__ is not defined.
 */
extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int);
void mt_set_non_kernel(unsigned int val)
{
        kmem_cache_set_non_kernel(maple_node_cache, val);
}

/*
 * mt_set_callback() - Forward @callback to kmem_cache_set_callback() for the
 * maple node cache.  Only built when __KERNEL__ is not defined.
 */
extern void kmem_cache_set_callback(struct kmem_cache *cachep,
                void (*callback)(void *));
void mt_set_callback(void (*callback)(void *))
{
        kmem_cache_set_callback(maple_node_cache, callback);
}

/*
 * mt_set_private() - Forward @private to kmem_cache_set_private() for the
 * maple node cache.  Only built when __KERNEL__ is not defined.
 */
extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private);
void mt_set_private(void *private)
{
        kmem_cache_set_private(maple_node_cache, private);
}

/*
 * mt_get_alloc_size() - Return what kmem_cache_get_alloc() reports for the
 * maple node cache.  Only built when __KERNEL__ is not defined.
 */
extern unsigned long kmem_cache_get_alloc(struct kmem_cache *);
unsigned long mt_get_alloc_size(void)
{
        return kmem_cache_get_alloc(maple_node_cache);
}

/*
 * mt_zero_nr_tallocated() - Call kmem_cache_zero_nr_tallocated() on the maple
 * node cache.  Only built when __KERNEL__ is not defined.
 */
extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *);
void mt_zero_nr_tallocated(void)
{
        kmem_cache_zero_nr_tallocated(maple_node_cache);
}

/*
 * mt_nr_tallocated() - Return what kmem_cache_nr_tallocated() reports for the
 * maple node cache.  Only built when __KERNEL__ is not defined.
 */
extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *);
unsigned int mt_nr_tallocated(void)
{
        return kmem_cache_nr_tallocated(maple_node_cache);
}

/*
 * mt_nr_allocated() - Return what kmem_cache_nr_allocated() reports for the
 * maple node cache.  Only built when __KERNEL__ is not defined.
 */
extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *);
unsigned int mt_nr_allocated(void)
{
        return kmem_cache_nr_allocated(maple_node_cache);
}

/*
 * mt_cache_shrink() - Deliberately empty variant used when __KERNEL__ is not
 * defined; the in-kernel variant lives in the #else branch.
 */
void mt_cache_shrink(void)
{
}
#else
/*
 * mt_cache_shrink() - For testing, don't use this.
 *
 * Certain testcases can trigger an OOM when combined with other memory
 * debugging configuration options.  This function is used to reduce the
 * possibility of an out of memory event due to kmem_cache objects remaining
 * around for longer than usual.
 */
void mt_cache_shrink(void)
{
        kmem_cache_shrink(maple_node_cache);

}
EXPORT_SYMBOL_GPL(mt_cache_shrink);

#endif /* not defined __KERNEL__ */
/*
 * mas_get_slot() - Get the entry in the maple state node stored at @offset.
 * @mas: The maple state
 * @offset: The offset into the slot array to fetch.
 *
 * Return: The entry stored at @offset.
 */
static inline struct maple_enode *mas_get_slot(struct ma_state *mas,
                unsigned char offset)
{
        return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)),
                        offset);
}

/*
 * Depth first search, post-order
 *
 * Advance @mas to the next node of a post-order walk bounded by @max.  When
 * called on the root with no next node, @mas is left in the overflow state
 * produced by mas_next_node().
 */
static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
{

        struct maple_enode *p, *mn = mas->node;
        unsigned long p_min, p_max;

        /* If there is a next node on this level, that is the next stop. */
        mas_next_node(mas, mas_mn(mas), max);
        if (!mas_is_overflow(mas))
                return;

        /* No next node and nothing above the root: the walk is over. */
        if (mte_is_root(mn))
                return;

        /* Go back to where we started and move one level up. */
        mas->node = mn;
        mas_ascend(mas);
        /*
         * Step backwards along the parent's level until mas_prev_node()
         * underflows, remembering the position held before each step.
         * NOTE(review): presumably this lands on the first node of that
         * level - confirm against mas_prev_node().
         */
        do {
                p = mas->node;
                p_min = mas->min;
                p_max = mas->max;
                mas_prev_node(mas, 0);
        } while (!mas_is_underflow(mas));

        /* Restore the last position recorded before the underflow. */
        mas->node = p;
        mas->max = p_max;
        mas->min = p_min;
}

/* Tree validations */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format);
static void mt_dump_range(unsigned long min, unsigned long max,
                          unsigned int depth, enum mt_dump_format format)
{
        static const char spaces[] = "                                ";

        switch(format) {
        case mt_dump_hex:
                if (min == max)
                        pr_info("%.*s%lx: ", depth * 2, spaces, min);
                else
                        pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max);
                break;
        case mt_dump_dec:
                if (min == max)
                        pr_info("%.*s%lu: ", depth * 2, spaces, min);
                else
                        pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max);
        }
}

/*
 * mt_dump_entry() - Print one stored entry together with its range.
 * @entry: The entry to describe
 * @min: First index covered by the entry
 * @max: Last index covered by the entry
 * @depth: Nesting depth used for indentation
 * @format: Hexadecimal or decimal output for the range prefix
 *
 * The entry is classified in this order: value entry, zero entry, reserved
 * entry, and finally a plain pointer.
 */
static void mt_dump_entry(void *entry, unsigned long min, unsigned long max,
                          unsigned int depth, enum mt_dump_format format)
{
        mt_dump_range(min, max, depth, format);

        if (xa_is_value(entry)) {
                pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry),
                        xa_to_value(entry), entry);
                return;
        }

        if (xa_is_zero(entry)) {
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
                return;
        }

        if (mt_is_reserved(entry)) {
                pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry);
                return;
        }

        pr_cont(PTR_FMT "\n", entry);
}

/*
 * mt_dump_range64() - Dump a range64 node (leaf or internal) and what it holds.
 * @mt: The maple tree
 * @entry: The encoded node
 * @min: Lowest index covered by the node
 * @max: Highest index covered by the node
 * @depth: Nesting depth of the node; children are printed one level deeper
 * @format: Hexadecimal or decimal output
 *
 * First prints the raw slot/pivot pairs on one line, then walks the slots:
 * leaf slots are printed as entries, non-leaf slots are dumped recursively.
 */
static void mt_dump_range64(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format)
{
        struct maple_range_64 *node = &mte_to_node(entry)->mr64;
        bool leaf = mte_is_leaf(entry);
        unsigned long first = min;
        int i;

        pr_cont(" contents: ");
        /* Every slot but the last is printed together with its pivot. */
        for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) {
                switch(format) {
                case mt_dump_hex:
                        pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]);
                        break;
                case mt_dump_dec:
                        pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]);
                }
        }
        /* The last slot is printed on its own, without a pivot. */
        pr_cont(PTR_FMT "\n", node->slot[i]);
        for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) {
                unsigned long last = max;

                /* The final slot is bounded by @max instead of a pivot. */
                if (i < (MAPLE_RANGE64_SLOTS - 1))
                        last = node->pivot[i];
                else if (!node->slot[i] && max != mt_node_max(entry))
                        break;
                /* A zero pivot after the first slot ends the walk. */
                if (last == 0 && i > 0)
                        break;
                if (leaf)
                        mt_dump_entry(mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);
                else if (node->slot[i])
                        mt_dump_node(mt, mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);

                /* Reached the end of the range this node covers. */
                if (last == max)
                        break;
                /* A pivot beyond @max is reported but the walk continues. */
                if (last > max) {
                        switch(format) {
                        case mt_dump_hex:
                                pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        case mt_dump_dec:
                                pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n",
                                        node, last, max, i);
                        }
                }
                first = last + 1;
        }
}

/*
 * mt_dump_arange64() - Dump an allocation-range node and its children.
 * @mt: The maple tree
 * @entry: The encoded node
 * @min: Lowest index covered by the node
 * @max: Highest index covered by the node
 * @depth: Nesting depth of the node; children are printed one level deeper
 * @format: Hexadecimal or decimal output
 *
 * Prints the gap array, the metadata bytes and the raw slot/pivot pairs on one
 * line, then dumps every populated child slot recursively.
 */
static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
        unsigned long min, unsigned long max, unsigned int depth,
        enum mt_dump_format format)
{
        struct maple_arange_64 *node = &mte_to_node(entry)->ma64;
        unsigned long first = min;
        int i;

        pr_cont(" contents: ");
        /* Gap value recorded for each slot. */
        for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
                switch (format) {
                case mt_dump_hex:
                        pr_cont("%lx ", node->gap[i]);
                        break;
                case mt_dump_dec:
                        pr_cont("%lu ", node->gap[i]);
                }
        }
        /* Node metadata: the end and gap fields. */
        pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap);
        /* Every slot but the last is printed together with its pivot. */
        for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) {
                switch (format) {
                case mt_dump_hex:
                        pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]);
                        break;
                case mt_dump_dec:
                        pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]);
                }
        }
        /* The last slot is printed on its own, without a pivot. */
        pr_cont(PTR_FMT "\n", node->slot[i]);
        for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
                unsigned long last = max;

                /* The final slot is bounded by @max instead of a pivot. */
                if (i < (MAPLE_ARANGE64_SLOTS - 1))
                        last = node->pivot[i];
                else if (!node->slot[i])
                        break;
                /* A zero pivot after the first slot ends the walk. */
                if (last == 0 && i > 0)
                        break;
                if (node->slot[i])
                        mt_dump_node(mt, mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);

                /* Reached the end of the range this node covers. */
                if (last == max)
                        break;
                /* A pivot beyond @max is reported but the walk continues. */
                if (last > max) {
                        switch(format) {
                        case mt_dump_hex:
                                pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        case mt_dump_dec:
                                pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n",
                                        node, last, max, i);
                        }
                }
                first = last + 1;
        }
}

/*
 * mt_dump_node() - Print a node header and dispatch on the node type.
 * @mt: The maple tree
 * @entry: The encoded node
 * @min: Lowest index covered by the node
 * @max: Highest index covered by the node
 * @depth: Nesting depth of the node
 * @format: Hexadecimal or decimal output
 *
 * Dense nodes are printed slot by slot here; range64/leaf64 and arange64
 * nodes are handed to their dedicated dump helpers.  Any other type is
 * reported as unknown.
 */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format)
{
        struct maple_node *mn = mte_to_node(entry);
        unsigned int ntype = mte_node_type(entry);
        unsigned int slot;

        mt_dump_range(min, max, depth, format);

        pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, mn,
                depth, ntype, mn ? mn->parent : NULL);

        if (ntype == maple_dense) {
                pr_cont("\n");
                /* One index per slot; flag slots that fall outside @max. */
                for (slot = 0; slot < MAPLE_NODE_SLOTS; slot++) {
                        if (min + slot > max)
                                pr_cont("OUT OF RANGE: ");
                        mt_dump_entry(mt_slot(mt, mn->slot, slot),
                                        min + slot, min + slot, depth, format);
                }
                return;
        }

        if (ntype == maple_leaf_64 || ntype == maple_range_64) {
                mt_dump_range64(mt, entry, min, max, depth, format);
                return;
        }

        if (ntype == maple_arange_64) {
                mt_dump_arange64(mt, entry, min, max, depth, format);
                return;
        }

        pr_cont(" UNKNOWN TYPE\n");
}

/*
 * mt_dump() - Print a maple tree header followed by its contents.
 * @mt: The maple tree
 * @format: Hexadecimal or decimal output
 *
 * A node root is dumped recursively, a non-node root is printed as the single
 * entry at index 0, and a NULL root is reported as "(empty)".
 */
void mt_dump(const struct maple_tree *mt, enum mt_dump_format format)
{
        void *root = rcu_dereference_check(mt->ma_root, mt_locked(mt));

        pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n",
                 mt, mt->ma_flags, mt_height(mt), root);

        if (!root) {
                pr_info("(empty)\n");
                return;
        }

        if (xa_is_node(root))
                mt_dump_node(mt, root, 0, mt_node_max(root), 0, format);
        else
                mt_dump_entry(root, 0, 0, 0, format);
}
EXPORT_SYMBOL_GPL(mt_dump);

/*
 * Calculate the maximum gap in a node and check if that's what is reported in
 * the parent (unless root).
 *
 * For arange64 nodes the node's own metadata is checked as well: the gap
 * offset must be in range, must name the largest gap, and the gap values past
 * the last used slot must be zero.
 */
static void mas_validate_gaps(struct ma_state *mas)
{
        struct maple_enode *mte = mas->node;
        struct maple_node *p_mn, *node = mte_to_node(mte);
        enum maple_type mt = mte_node_type(mas->node);
        unsigned long gap = 0, max_gap = 0;
        unsigned long p_end, p_start = mas->min;
        unsigned char p_slot, offset;
        unsigned long *gaps = NULL;
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned int i;

        if (ma_is_dense(mt)) {
                /* Dense node: count runs of empty slots between entries. */
                for (i = 0; i < mt_slot_count(mte); i++) {
                        if (mas_get_slot(mas, i)) {
                                if (gap > max_gap)
                                        max_gap = gap;
                                gap = 0;
                                continue;
                        }
                        gap++;
                }
                goto counted;
        }

        gaps = ma_gaps(node, mt);
        for (i = 0; i < mt_slot_count(mte); i++) {
                p_end = mas_safe_pivot(mas, pivots, i, mt);

                if (!gaps) {
                        /* No gap array: an empty slot is a gap of its range. */
                        if (!mas_get_slot(mas, i))
                                gap = p_end - p_start + 1;
                } else {
                        void *entry = mas_get_slot(mas, i);

                        /* Use the recorded gap; the slot must be populated. */
                        gap = gaps[i];
                        MT_BUG_ON(mas->tree, !entry);

                        /* A gap cannot be larger than the slot's range. */
                        if (gap > p_end - p_start + 1) {
                                pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n",
                                       mas_mn(mas), i, gap, p_end, p_start,
                                       p_end - p_start + 1);
                                MT_BUG_ON(mas->tree, gap > p_end - p_start + 1);
                        }
                }

                if (gap > max_gap)
                        max_gap = gap;

                p_start = p_end + 1;
                /* Stop at the slot that reaches the node's maximum. */
                if (p_end >= mas->max)
                        break;
        }

counted:
        /* 'i' still holds the index at which the loop above stopped. */
        if (mt == maple_arange_64) {
                MT_BUG_ON(mas->tree, !gaps);
                offset = ma_meta_gap(node);
                if (offset > i) {
                        pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset);
                        MT_BUG_ON(mas->tree, 1);
                }

                if (gaps[offset] != max_gap) {
                        pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n",
                               node, offset, max_gap);
                        MT_BUG_ON(mas->tree, 1);
                }

                /* Everything after the last used slot must be zero. */
                for (i++ ; i < mt_slot_count(mte); i++) {
                        if (gaps[i] != 0) {
                                pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n",
                                       node, i);
                                MT_BUG_ON(mas->tree, 1);
                        }
                }
        }

        /* The root has no parent to compare against. */
        if (mte_is_root(mte))
                return;

        /* The parent's gap for this child must equal the computed maximum. */
        p_slot = mte_parent_slot(mas->node);
        p_mn = mte_parent(mte);
        MT_BUG_ON(mas->tree, max_gap > mas->max);
        if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) {
                pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap);
                mt_dump(mas->tree, mt_dump_hex);
                MT_BUG_ON(mas->tree, 1);
        }
}

/*
 * Check that the parent of mas->node refers to it exactly once: the slot
 * reported by mte_parent_slot() must hold mas->node and no other slot of the
 * parent may hold it.  Root nodes are skipped.
 */
static void mas_validate_parent_slot(struct ma_state *mas)
{
        struct maple_enode *self = mas->node;
        struct maple_node *parent;
        enum maple_type p_type;
        unsigned char p_slot;
        void __rcu **slots;
        int slot;

        if (mte_is_root(self))
                return;

        p_slot = mte_parent_slot(self);
        p_type = mas_parent_type(mas, self);
        parent = mte_parent(self);
        slots = ma_slots(parent, p_type);

        /* A node must never be its own parent. */
        MT_BUG_ON(mas->tree, mas_mn(mas) == parent);

        /* Walk every slot of the parent looking for missing/duplicate links. */
        for (slot = 0; slot < mt_slots[p_type]; slot++) {
                struct maple_enode *found = mas_slot(mas, slots, slot);

                if (slot != p_slot) {
                        /* Any other slot must not point back at this node. */
                        if (found != self)
                                continue;

                        pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n",
                               mas_mn(mas), parent, slot, p_slot);
                        MT_BUG_ON(mas->tree, found == self);
                        continue;
                }

                /* The recorded parent slot must hold this node. */
                if (found != self)
                        pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n",
                                parent, slot, mas_mn(mas));
                MT_BUG_ON(mas->tree, found != self);
        }
}

/*
 * For a non-leaf node, check every populated slot: the child must exist, its
 * recorded parent slot must equal the slot it was found in, and its parent
 * pointer must lead back to this node.  The walk stops at the slot whose
 * pivot equals mas->max.  Leaf nodes are skipped.
 */
static void mas_validate_child_slot(struct ma_state *mas)
{
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **slots = ma_slots(mte_to_node(mas->node), type);
        unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type);
        struct maple_enode *child;
        unsigned char i;

        if (mte_is_leaf(mas->node))
                return;

        for (i = 0; i < mt_slots[type]; i++) {
                child = mas_slot(mas, slots, i);

                if (!child) {
                        pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n",
                               mas_mn(mas), i);
                        MT_BUG_ON(mas->tree, 1);
                } else {
                        /*
                         * Only inspect the child's parent linkage when there
                         * is a child: the helpers below are handed the child
                         * pointer, so a missing child must not reach them.
                         */
                        if (mte_parent_slot(child) != i) {
                                pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n",
                                       mas_mn(mas), i, mte_to_node(child),
                                       mte_parent_slot(child));
                                MT_BUG_ON(mas->tree, 1);
                        }

                        if (mte_parent(child) != mte_to_node(mas->node)) {
                                pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n",
                                       mte_to_node(child), mte_parent(child),
                                       mte_to_node(mas->node));
                                MT_BUG_ON(mas->tree, 1);
                        }
                }

                /* The slot whose pivot is mas->max is the last one in use. */
                if (i < mt_pivots[type] && pivots[i] == mas->max)
                        break;
        }
}

/*
 * Validate the pivots and slots of the node at mas->node:
 *  - every pivot lies within mas->min and mas->max and pivots never decrease,
 *  - mas_data_end() matches the offset at which the pivot walk stopped,
 *  - no slots or pivots are set past the end of the data.
 */
static void mas_validate_limits(struct ma_state *mas)
{
        int i;
        unsigned long prev_piv = 0;
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **slots = ma_slots(mte_to_node(mas->node), type);
        unsigned long *pivots = ma_pivots(mas_mn(mas), type);

        /* Walk the pivots until the one equal to mas->max is reached. */
        for (i = 0; i < mt_slots[type]; i++) {
                unsigned long piv;

                piv = mas_safe_pivot(mas, pivots, i, type);

                /* Only the first pivot may legitimately be zero. */
                if (!piv && (i != 0)) {
                        pr_err("Missing node limit pivot at " PTR_FMT "[%u]\n",
                               mas_mn(mas), i);
                        MAS_WARN_ON(mas, 1);
                }

                if (prev_piv > piv) {
                        pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n",
                                mas_mn(mas), i, piv, prev_piv);
                        MAS_WARN_ON(mas, piv < prev_piv);
                }

                if (piv < mas->min) {
                        pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i,
                                piv, mas->min);
                        MAS_WARN_ON(mas, piv < mas->min);
                }
                if (piv > mas->max) {
                        pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i,
                                piv, mas->max);
                        MAS_WARN_ON(mas, piv > mas->max);
                }
                prev_piv = piv;
                if (piv == mas->max)
                        break;
        }

        /* The recorded data end must be where the walk above stopped. */
        if (mas_data_end(mas) != i) {
                pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n",
                       mas_mn(mas), mas_data_end(mas), i);
                MT_BUG_ON(mas->tree, 1);
        }

        /* Everything after the data end is expected to be empty. */
        for (i += 1; i < mt_slots[type]; i++) {
                void *entry = mas_slot(mas, slots, i);

                /*
                 * The very last slot is exempt from the "no entry" check.
                 * NOTE(review): presumably because it may carry node
                 * metadata rather than an entry - confirm.
                 */
                if (entry && (i != mt_slots[type] - 1)) {
                        pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n",
                               mas_mn(mas), i, entry);
                        MT_BUG_ON(mas->tree, entry != NULL);
                }

                if (i < mt_pivots[type]) {
                        unsigned long piv = pivots[i];

                        if (!piv)
                                continue;

                        /*
                         * A stale pivot is always logged, but it only counts
                         * as a warning when it is not the final pivot.
                         */
                        pr_err(PTR_FMT "[%u] should not have piv %lu\n",
                               mas_mn(mas), i, piv);
                        MAS_WARN_ON(mas, i < mt_pivots[type] - 1);
                }
        }
}

/*
 * Walk the leaf entries starting at the first leaf and report any place
 * where two consecutive entries are both NULL.  Trees for which mas_start()
 * yields a none or pointer state are not checked.
 */
static void mt_validate_nulls(struct maple_tree *mt)
{
        /* Start non-NULL so the first entry can never form a NULL pair. */
        void *prev = (void *)1;
        unsigned char idx = 0;
        void __rcu **slots;
        MA_STATE(mas, mt, 0, 0);

        mas_start(&mas);
        if (mas_is_none(&mas) || (mas_is_ptr(&mas)))
                return;

        while (!mte_is_leaf(mas.node))
                mas_descend(&mas);

        slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node));
        for (;;) {
                void *cur = mas_slot(&mas, slots, idx);

                if (!prev && !cur) {
                        pr_err("Sequential nulls end at " PTR_FMT "[%u]\n",
                                mas_mn(&mas), idx);
                }
                MT_BUG_ON(mt, !prev && !cur);
                prev = cur;

                if (idx != mas_data_end(&mas)) {
                        idx++;
                } else {
                        /* End of this node: move on to the next one. */
                        mas_next_node(&mas, mas_mn(&mas), ULONG_MAX);
                        if (mas_is_overflow(&mas))
                                return;
                        idx = 0;
                        slots = ma_slots(mte_to_node(mas.node),
                                         mte_node_type(mas.node));
                }

                if (mas_is_overflow(&mas))
                        break;
        }
}

/*
 * Validate a maple tree.  Starting at the first leaf and visiting nodes via
 * mas_dfs_postorder(), each node is checked for:
 * 1. Not being a dead node
 * 2. Holding at least mt_min_slot_count() entries, unless it is the root
 * 3. Being correctly linked from its parent and to its children
 * 4. The limits (pivots are within mas->min to mas->max)
 * 5. The gap being correctly set in the parents (allocation trees only)
 * Finally the tree is checked for sequential NULL entries.
 *
 * NOTE(review): the __must_hold() annotation names 'mas', which is not a
 * parameter of this function - confirm the intended lock expression.
 */
void mt_validate(struct maple_tree *mt)
        __must_hold(mas->tree->ma_lock)
{
        MA_STATE(mas, mt, 0, 0);

        mas_start(&mas);
        if (!mas_is_active(&mas))
                return;

        while (!mte_is_leaf(mas.node))
                mas_descend(&mas);

        for (; !mas_is_overflow(&mas); mas_dfs_postorder(&mas, ULONG_MAX)) {
                unsigned char end;
                bool undersized;

                MAS_WARN_ON(&mas, mte_dead_node(mas.node));

                end = mas_data_end(&mas);
                undersized = (end < mt_min_slot_count(mas.node)) &&
                             (!mte_is_root(mas.node));
                if (MAS_WARN_ON(&mas, undersized)) {
                        pr_err("Invalid size %u of " PTR_FMT "\n",
                               end, mas_mn(&mas));
                }

                mas_validate_parent_slot(&mas);
                mas_validate_limits(&mas);
                mas_validate_child_slot(&mas);
                if (mt_is_alloc(mt))
                        mas_validate_gaps(&mas);
        }

        mt_validate_nulls(mt);
}
EXPORT_SYMBOL_GPL(mt_validate);

/*
 * mas_dump() - Print the contents of a maple state for debugging.
 * @mas: The maple state to print
 *
 * The tree/node pointers, status and store type are emitted as one complete,
 * newline-terminated message so they cannot be split across log records and
 * are always separated by a space.  An unrecognised status or store type is
 * printed as an empty string.
 */
void mas_dump(const struct ma_state *mas)
{
        const char *status = "";
        const char *store = "";

        switch (mas->status) {
        case ma_active:
                status = "(ma_active)";
                break;
        case ma_none:
                status = "(ma_none)";
                break;
        case ma_root:
                status = "(ma_root)";
                break;
        case ma_start:
                status = "(ma_start)";
                break;
        case ma_pause:
                status = "(ma_pause)";
                break;
        case ma_overflow:
                status = "(ma_overflow)";
                break;
        case ma_underflow:
                status = "(ma_underflow)";
                break;
        case ma_error:
                status = "(ma_error)";
                break;
        }

        switch (mas->store_type) {
        case wr_invalid:
                store = "invalid store type";
                break;
        case wr_new_root:
                store = "new_root";
                break;
        case wr_store_root:
                store = "store_root";
                break;
        case wr_exact_fit:
                store = "exact_fit";
                break;
        case wr_split_store:
                store = "split_store";
                break;
        case wr_slot_store:
                store = "slot_store";
                break;
        case wr_append:
                store = "append";
                break;
        case wr_node_store:
                store = "node_store";
                break;
        case wr_spanning_store:
                store = "spanning_store";
                break;
        case wr_rebalance:
                store = "rebalance";
                break;
        }

        pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " %s Store Type: %s\n",
               mas->tree, mas->node, status, store);

        pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end,
               mas->index, mas->last);
        pr_err("     min=%lx max=%lx sheaf=" PTR_FMT ", request %lu depth=%u, flags=%x\n",
               mas->min, mas->max, mas->sheaf, mas->node_request, mas->depth,
               mas->mas_flags);

        /* index beyond last means the range itself is inconsistent. */
        if (mas->index > mas->last)
                pr_err("Check index & last\n");
}
EXPORT_SYMBOL_GPL(mas_dump);

/*
 * mas_wr_dump() - Print the contents of a maple write state for debugging.
 * @wr_mas: The maple write state to print
 *
 * Prints the node, r_min/r_max, type, offset_end and end_piv fields of
 * @wr_mas plus the 'end' field of the maple state it points to.
 */
void mas_wr_dump(const struct ma_wr_state *wr_mas)
{
        pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n",
               wr_mas->node, wr_mas->r_min, wr_mas->r_max);
        pr_err("        type=%u off_end=%u, node_end=%u, end_piv=%lx\n",
               wr_mas->type, wr_mas->offset_end, wr_mas->mas->end,
               wr_mas->end_piv);
}
EXPORT_SYMBOL_GPL(mas_wr_dump);

#endif /* CONFIG_DEBUG_MAPLE_TREE */





































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_H
#define _LINUX_SCHED_TASK_H

/*
 * Interface between the scheduler and various task lifetime (fork()/exit())
 * functionality:
 */

#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

struct task_struct;
struct rusage;
union thread_union;
struct css_set;

/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL

/*
 * Argument bundle taken by kernel_clone() and copy_process(), both declared
 * later in this header.
 * NOTE(review): field meanings are not visible from this header beyond
 * their names and types - confirm against the fork implementation.
 */
struct kernel_clone_args {
        u64 flags;
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
        const char *name;
        int exit_signal;
        /* Four one-bit flags. */
        u32 kthread:1;
        u32 io_thread:1;
        u32 user_worker:1;
        u32 no_files:1;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
        pid_t *set_tid;
        /* Number of elements in *set_tid */
        size_t set_tid_size;
        int cgroup;
        int idle;
        /* NOTE(review): presumably an entry point and its argument - confirm. */
        int (*fn)(void *);
        void *fn_arg;
        struct cgroup *cgrp;
        struct css_set *cset;
        unsigned int kill_seq;
};

/*
 * This serializes "schedule()" and also protects
 * the run-queue from deletions/modifications (but
 * _adding_ to the beginning of the run-queue has
 * a separate lock).
 */
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;

extern union thread_union init_thread_union;
extern struct task_struct init_task;

extern int lockdep_tasklist_lock_is_held(void);

extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(u64 clone_flags, struct task_struct *p);
extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
extern void sched_cancel_fork(struct task_struct *p);
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
void __noreturn make_task_dead(int signr);

extern void mm_cache_init(void);
extern void proc_caches_init(void);

extern void fork_init(void);

extern void release_task(struct task_struct * p);

extern int copy_thread(struct task_struct *, const struct kernel_clone_args *);

extern void flush_thread(void);

#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
/* No-op fallback used when CONFIG_HAVE_EXIT_THREAD is not set. */
static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
extern __noreturn void do_group_exit(int);

extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);

extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *copy_process(struct pid *pid, int trace, int node,
                                 struct kernel_clone_args *args);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
                            unsigned long flags);
extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);

extern void free_task(struct task_struct *tsk);

/* sched_exec is called by processes performing an exec */
extern void sched_exec(void);

/*
 * Take a reference on @t by incrementing t->usage.
 * Returns @t so the call can be used inline in an expression.
 */
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
        refcount_inc(&t->usage);
        return t;
}

/*
 * Try to take a reference on @t.  Returns @t if t->usage was non-zero and
 * has been incremented, NULL otherwise.
 */
static inline struct task_struct *tryget_task_struct(struct task_struct *t)
{
        if (refcount_inc_not_zero(&t->usage))
                return t;

        return NULL;
}

extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);

/*
 * Drop one reference on @t.  When this was the last reference, the final
 * release is deferred through call_rcu() to __put_task_struct_rcu_cb().
 */
static inline void put_task_struct(struct task_struct *t)
{
        /*
         * Why the release is always deferred:
         *
         * Under PREEMPT_RT, __put_task_struct() cannot be called from
         * atomic context because it indirectly acquires sleeping locks.
         * The same holds if the current process has a mutex enqueued
         * (blocked on a PI chain).
         *
         * On !RT a direct call to __put_task_struct() would always be
         * safe, but the deferred call is used there as well to keep the
         * code simple.
         *
         * call_rcu() schedules __put_task_struct_rcu_cb() to be called in
         * process context.
         *
         * Why reusing ->rcu is safe:
         *
         * This path is only taken once refcount_dec_and_test(&t->usage)
         * succeeds, so it cannot "conflict" with
         * put_task_struct_rcu_user(), which abuses ->rcu the same way:
         * rcu_users holds a reference, so task->usage cannot be zero
         * after the rcu_users 1 -> 0 transition.
         *
         * delayed_free_task() also uses ->rcu, but it is only called when
         * forking a process fails, so it cannot conflict with
         * __put_task_struct() either.
         */
        if (refcount_dec_and_test(&t->usage))
                call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}

DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))

/*
 * Drop @nr references on @t in one step.  __put_task_struct() is called
 * directly when refcount_sub_and_test() reports success.
 */
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
        if (!refcount_sub_and_test(nr, &t->usage))
                return;

        __put_task_struct(t);
}

void put_task_struct_rcu_user(struct task_struct *task);

/* Free all architecture-specific resources held by a thread. */
void release_thread(struct task_struct *dead_task);

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif

#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
/*
 * If an architecture has not declared a thread_struct whitelist we
 * must assume something there may need to be copied to userspace.
 */
/*
 * Default whitelist: report the region starting at offset 0 and spanning
 * everything from the 'thread' member to the end of the task_struct.
 * @offset: set to 0
 * @size:   set to arch_task_struct_size minus the offset of 'thread'
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        *offset = 0;
        /* Handle dynamically sized thread_struct. */
        *size = arch_task_struct_size - offsetof(struct task_struct, thread);
}
#endif

#ifdef CONFIG_VMAP_STACK
/* Return the stack_vm_area pointer stored in @t. */
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
        return t->stack_vm_area;
}
#else
/* Without CONFIG_VMAP_STACK there is no such area: always NULL. */
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
        return NULL;
}
#endif

/*
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4().  Also used in procfs.  Also
 * pins the final release of task.io_context.  Also protects ->cpuset and
 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
 *
 * Nests inside of read_lock(&tasklist_lock). It must not be nested with
 * write_lock_irq(&tasklist_lock), neither inside nor outside.
 */
/* Acquire @p's alloc_lock spinlock. */
static inline void task_lock(struct task_struct *p)
{
        spin_lock(&p->alloc_lock);
}

/* Release @p's alloc_lock spinlock taken by task_lock(). */
static inline void task_unlock(struct task_struct *p)
{
        spin_unlock(&p->alloc_lock);
}

DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))

#endif /* _LINUX_SCHED_TASK_H */




















  315 




































  316 

  315 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/* SPDX-License-Identifier: GPL-2.0 */
/* Perform sanity checking for object sizes for uaccess.h and uio.h. */
#ifndef __LINUX_UCOPYSIZE_H__
#define __LINUX_UCOPYSIZE_H__

#include <linux/bug.h>

#ifdef CONFIG_HARDENED_USERCOPY
#include <linux/jump_label.h>
extern void __check_object_size(const void *ptr, unsigned long n,
                                        bool to_user);

DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
                           validate_usercopy_range);

/*
 * Forward to __check_object_size() when @n is not a compile-time constant
 * and the validate_usercopy_range static key is enabled; otherwise do
 * nothing.
 */
static __always_inline void check_object_size(const void *ptr, unsigned long n,
                                              bool to_user)
{
        /* Constant sizes are never handed to the runtime check. */
        if (__builtin_constant_p(n))
                return;

        if (static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
                                &validate_usercopy_range))
                __check_object_size(ptr, n, to_user);
}
#else
/* No-op fallback used when CONFIG_HARDENED_USERCOPY is not set. */
static inline void check_object_size(const void *ptr, unsigned long n,
                                     bool to_user)
{ }
#endif /* CONFIG_HARDENED_USERCOPY */

extern void __compiletime_error("copy source size is too small")
__bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);

void __copy_overflow(int size, unsigned long count);

/* Call __copy_overflow(), but only when CONFIG_BUG is enabled. */
static inline void copy_overflow(int size, unsigned long count)
{
        if (!IS_ENABLED(CONFIG_BUG))
                return;

        __copy_overflow(size, count);
}

/*
 * Sanity-check a copy of @bytes bytes to or from the object at @addr.
 * @addr:      the kernel-side object
 * @bytes:     number of bytes about to be copied
 * @is_source: selects which compile-time error is used on overflow and is
 *             passed on to check_object_size()
 *
 * Returns true when the copy may proceed, false when it must be refused.
 */
static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
        /*
         * NOTE(review): sz is presumably negative when the compiler cannot
         * determine the object size, which the sz >= 0 test filters out -
         * confirm against the compiler documentation.
         */
        int sz = __builtin_object_size(addr, 0);
        if (unlikely(sz >= 0 && sz < bytes)) {
                /*
                 * Non-constant length: report at run time.  Constant length:
                 * reference a function declared with __compiletime_error so
                 * the build fails instead.
                 */
                if (!__builtin_constant_p(bytes))
                        copy_overflow(sz, bytes);
                else if (is_source)
                        __bad_copy_from();
                else
                        __bad_copy_to();
                return false;
        }
        /* Refuse (and warn once about) lengths above INT_MAX. */
        if (WARN_ON_ONCE(bytes > INT_MAX))
                return false;
        check_object_size(addr, bytes, is_source);
        return true;
}

#endif /* __LINUX_UCOPYSIZE_H__ */






















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_RTNETLINK_H
#define __LINUX_RTNETLINK_H


#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/wait.h>
#include <linux/refcount.h>
#include <uapi/linux/rtnetlink.h>

extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);

/*
 * Like rtnetlink_send(), but a NULL @skb is accepted and treated as
 * success (returns 0 without sending anything).
 */
static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net,
                                       u32 pid, u32 group, int echo)
{
        if (!skb)
                return 0;

        return rtnetlink_send(skb, net, pid, group, echo);
}

extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
                        u32 group, const struct nlmsghdr *nlh, gfp_t flags);
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
                              u32 id, long expires, u32 error);

void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags,
                  u32 portid, const struct nlmsghdr *nlh);
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
                         gfp_t flags, int *new_nsid, int new_ifindex);
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
                                       unsigned change, u32 event,
                                       gfp_t flags, int *new_nsid,
                                       int new_ifindex, u32 portid,
                                       const struct nlmsghdr *nlh);
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
                       gfp_t flags, u32 portid, const struct nlmsghdr *nlh);


/* RTNL is used as a global lock for all changes to network configuration  */
extern void rtnl_lock(void);
extern void rtnl_unlock(void);
extern int rtnl_trylock(void);
extern int rtnl_is_locked(void);
extern int rtnl_lock_interruptible(void);
extern int rtnl_lock_killable(void);
extern bool refcount_dec_and_rtnl_lock(refcount_t *r);

extern wait_queue_head_t netdev_unregistering_wq;
extern atomic_t dev_unreg_count;
extern struct rw_semaphore pernet_ops_rwsem;
extern struct rw_semaphore net_rwsem;

#define ASSERT_RTNL() \
        WARN_ONCE(!rtnl_is_locked(), \
                  "RTNL: assertion failed at %s (%d)\n", __FILE__,  __LINE__)

#ifdef CONFIG_PROVE_LOCKING
extern bool lockdep_rtnl_is_held(void);
#else
/* Without CONFIG_PROVE_LOCKING the lock is always reported as held. */
static inline bool lockdep_rtnl_is_held(void)
{
        return true;
}
#endif /* #ifdef CONFIG_PROVE_LOCKING */

/**
 * rcu_dereference_rtnl - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 *
 * Do an rcu_dereference(p), but check that the caller holds either
 * rcu_read_lock() or RTNL. Note: please prefer rtnl_dereference() or
 * rcu_dereference().
 */
#define rcu_dereference_rtnl(p)                                        \
        rcu_dereference_check(p, lockdep_rtnl_is_held())

/**
 * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
 * @p: The pointer to read, prior to dereferencing
 *
 * Return: the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE(), because caller holds RTNL.
 */
#define rtnl_dereference(p)                                        \
        rcu_dereference_protected(p, lockdep_rtnl_is_held())

/**
 * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning
 * its old value
 * @rp: RCU pointer, whose value is returned
 * @p: regular pointer
 *
 * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated
 * pointer. The old value of @rp is returned, and @rp is set to @p
 */
#define rcu_replace_pointer_rtnl(rp, p)                        \
        rcu_replace_pointer(rp, p, lockdep_rtnl_is_held())

#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
void __rtnl_net_lock(struct net *net);
void __rtnl_net_unlock(struct net *net);
void rtnl_net_lock(struct net *net);
void rtnl_net_unlock(struct net *net);
int rtnl_net_trylock(struct net *net);
int rtnl_net_lock_killable(struct net *net);
int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b);

bool rtnl_net_is_locked(struct net *net);

#define ASSERT_RTNL_NET(net)                                                \
        WARN_ONCE(!rtnl_net_is_locked(net),                                \
                  "RTNL_NET: assertion failed at %s (%d)\n",                \
                  __FILE__,  __LINE__)

bool lockdep_rtnl_net_is_held(struct net *net);

#define rcu_dereference_rtnl_net(net, p)                                \
        rcu_dereference_check(p, lockdep_rtnl_net_is_held(net))
#define rtnl_net_dereference(net, p)                                        \
        rcu_dereference_protected(p, lockdep_rtnl_net_is_held(net))
#define rcu_replace_pointer_rtnl_net(net, rp, p)                        \
        rcu_replace_pointer(rp, p, lockdep_rtnl_net_is_held(net))
#else
/*
 * Fallbacks used when CONFIG_DEBUG_NET_SMALL_RTNL is not set: the @net
 * argument is ignored and every helper maps onto the global RTNL.
 */

/* The double-underscore variants do nothing in this configuration. */
static inline void __rtnl_net_lock(struct net *net) {}
static inline void __rtnl_net_unlock(struct net *net) {}

/* Same as rtnl_lock(). */
static inline void rtnl_net_lock(struct net *net)
{
        rtnl_lock();
}

/* Same as rtnl_unlock(). */
static inline void rtnl_net_unlock(struct net *net)
{
        rtnl_unlock();
}

/* Same as rtnl_trylock(); returns its result. */
static inline int rtnl_net_trylock(struct net *net)
{
        return rtnl_trylock();
}

/* Same as rtnl_lock_killable(); returns its result. */
static inline int rtnl_net_lock_killable(struct net *net)
{
        return rtnl_lock_killable();
}

/* Same as ASSERT_RTNL(). */
static inline void ASSERT_RTNL_NET(struct net *net)
{
        ASSERT_RTNL();
}

#define rcu_dereference_rtnl_net(net, p)                \
        rcu_dereference_rtnl(p)
#define rtnl_net_dereference(net, p)                        \
        rtnl_dereference(p)
#define rcu_replace_pointer_rtnl_net(net, rp, p)        \
        rcu_replace_pointer_rtnl(rp, p)
#endif

/*
 * Fetch dev->ingress_queue with rtnl_dereference(), i.e. for callers that
 * hold RTNL.
 */
static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
{
        return rtnl_dereference(dev->ingress_queue);
}

/* Fetch dev->ingress_queue with rcu_dereference(). */
static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev)
{
        return rcu_dereference(dev->ingress_queue);
}

struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);

#ifdef CONFIG_NET_INGRESS
void net_inc_ingress_queue(void);
void net_dec_ingress_queue(void);
#endif

#ifdef CONFIG_NET_EGRESS
void net_inc_egress_queue(void);
void net_dec_egress_queue(void);
void netdev_xmit_skip_txqueue(bool skip);
#endif

void rtnetlink_init(void);
void __rtnl_unlock(void);
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);

/*
 * Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers.
 * NOTE(review): the two fields presumably record how far a dump has
 * progressed (interface index and FDB entry index) - confirm against the
 * users of this structure.
 */
struct ndo_fdb_dump_context {
        unsigned long ifindex;
        unsigned long fdb_idx;
};

extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
                             struct netlink_callback *cb,
                             struct net_device *dev,
                             struct net_device *filter_dev,
                             int *idx);
extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
                            struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr,
                            u16 vid,
                            u16 flags);
extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
                            struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr,
                            u16 vid);

extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
                                   struct net_device *dev, u16 mode,
                                   u32 flags, u32 mask, int nlflags,
                                   u32 filter_mask,
                                   int (*vlan_fill)(struct sk_buff *skb,
                                                    struct net_device *dev,
                                                    u32 filter_mask));

extern void rtnl_offload_xstats_notify(struct net_device *dev);

/*
 * Ask netlink_has_listeners() whether @group has listeners on the rtnl
 * socket of @net, and return its result.
 */
static inline int rtnl_has_listeners(const struct net *net, u32 group)
{
        return netlink_has_listeners(net->rtnl, group);
}

/**
 * rtnl_notify_needed - check if notification is needed
 * @net: Pointer to the net namespace
 * @nlflags: netlink ingress message flags
 * @group: rtnl group
 *
 * A notification is needed when the ingress message carries NLM_F_ECHO or
 * when @group has listeners in @net.
 *
 * Return: true if a notification is needed, false otherwise.
 */
static inline bool
rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group)
{
        /* An echo request always needs a notification. */
        if (nlflags & NLM_F_ECHO)
                return true;

        return rtnl_has_listeners(net, group);
}

void netif_set_operstate(struct net_device *dev, int newstate);

#endif        /* __LINUX_RTNETLINK_H */






















































































































































































































   54 






















































































































































































































































































    6 


























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit.h -- Auditing support
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 */
#ifndef _LINUX_AUDIT_H_
#define _LINUX_AUDIT_H_

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/audit_arch.h>
#include <uapi/linux/audit.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/fanotify.h>

#define AUDIT_INO_UNSET ((unsigned long)-1)
#define AUDIT_DEV_UNSET ((dev_t)-1)

/*
 * Fixed uid/pid header followed by a variable-length char array.
 * ctx[] is a flexible array member, so sizeof(struct audit_sig_info)
 * does not include it; the allocation must add room for it.
 */
struct audit_sig_info {
        uid_t                uid;
        pid_t                pid;
        char                ctx[];
};

struct audit_buffer;
struct audit_context;
struct inode;
struct netlink_skb_parms;
struct path;
struct linux_binprm;
struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
struct sk_buff;
struct kern_ipc_perm;
struct lsm_id;
struct lsm_prop;

/*
 * Kernel-side description of one audit rule.
 *
 * mask[] holds AUDIT_BITMASK_SIZE 32-bit words.
 * NOTE(review): field_count presumably gives the number of entries
 * that fields points to, and arch_f/inode_f presumably point into that
 * same array — confirm against the code that builds rules.
 */
struct audit_krule {
        u32                        pflags;
        u32                        flags;
        u32                        listnr;
        u32                        action;
        u32                        mask[AUDIT_BITMASK_SIZE];
        u32                        buflen; /* for data alloc on list rules */
        u32                        field_count;
        char                        *filterkey; /* ties events to rules */
        struct audit_field        *fields;
        struct audit_field        *arch_f; /* quick access to arch field */
        struct audit_field        *inode_f; /* quick access to an inode field */
        struct audit_watch        *watch;        /* associated watch */
        struct audit_tree        *tree;        /* associated watched tree */
        struct audit_fsnotify_mark        *exe;
        struct list_head        rlist;        /* entry in audit_{watch,tree}.rules list */
        struct list_head        list;        /* for AUDIT_LIST* purposes only */
        u64                        prio;
};

/* Flag to indicate legacy AUDIT_LOGINUID unset usage */
#define AUDIT_LOGINUID_LEGACY                0x1

/*
 * One field of an audit rule: a type, an operator and a value.
 *
 * The value lives in an anonymous union, so val, uid, gid and the
 * lsm_str/lsm_rule pair share storage and only one of them is
 * meaningful at a time.
 * NOTE(review): which member is valid is presumably selected by type —
 * confirm against the users of this struct.
 */
struct audit_field {
        u32                                type;
        union {
                u32                        val;
                kuid_t                        uid;
                kgid_t                        gid;
                struct {
                        char                *lsm_str;
                        void                *lsm_rule;
                };
        };
        u32                                op;
};

/*
 * Index of one value slot in struct audit_ntp_data.
 *
 * The enumerators are used as array indices by audit_ntp_set_old() and
 * audit_ntp_set_new(); AUDIT_NTP_NVALS is not a slot but the number of
 * slots, and sizes the vals[] array.
 */
enum audit_ntp_type {
        AUDIT_NTP_OFFSET,
        AUDIT_NTP_FREQ,
        AUDIT_NTP_STATUS,
        AUDIT_NTP_TAI,
        AUDIT_NTP_TICK,
        AUDIT_NTP_ADJUST,

        AUDIT_NTP_NVALS /* count */
};

#ifdef CONFIG_AUDITSYSCALL
/* Old/new value pair for a single enum audit_ntp_type slot. */
struct audit_ntp_val {
        long long oldval, newval;
};

/* One audit_ntp_val per enum audit_ntp_type value, indexed by that enum. */
struct audit_ntp_data {
        struct audit_ntp_val vals[AUDIT_NTP_NVALS];
};
#else
/* Without CONFIG_AUDITSYSCALL the type still exists but carries no members. */
struct audit_ntp_data {};
#endif

/*
 * Operation codes taken as the op argument of audit_log_nfcfg() and
 * __audit_log_nfcfg().
 *
 * NOTE(review): the XT_ and NFT_ prefixes presumably distinguish
 * x_tables from nf_tables operations, and AUDIT_NFT_OP_INVALID looks
 * like a terminating "no such operation" value — confirm with the callers.
 */
enum audit_nfcfgop {
        AUDIT_XT_OP_REGISTER,
        AUDIT_XT_OP_REPLACE,
        AUDIT_XT_OP_UNREGISTER,
        AUDIT_NFT_OP_TABLE_REGISTER,
        AUDIT_NFT_OP_TABLE_UNREGISTER,
        AUDIT_NFT_OP_CHAIN_REGISTER,
        AUDIT_NFT_OP_CHAIN_UNREGISTER,
        AUDIT_NFT_OP_RULE_REGISTER,
        AUDIT_NFT_OP_RULE_UNREGISTER,
        AUDIT_NFT_OP_SET_REGISTER,
        AUDIT_NFT_OP_SET_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_REGISTER,
        AUDIT_NFT_OP_SETELEM_UNREGISTER,
        AUDIT_NFT_OP_GEN_REGISTER,
        AUDIT_NFT_OP_OBJ_REGISTER,
        AUDIT_NFT_OP_OBJ_UNREGISTER,
        AUDIT_NFT_OP_OBJ_RESET,
        AUDIT_NFT_OP_FLOWTABLE_REGISTER,
        AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_RESET,
        AUDIT_NFT_OP_RULE_RESET,
        AUDIT_NFT_OP_INVALID,
};

extern int __init audit_register_class(int class, unsigned *list);
extern int audit_classify_syscall(int abi, unsigned syscall);
extern int audit_classify_arch(int arch);
/* only for compat system calls */
extern unsigned compat_write_class[];
extern unsigned compat_read_class[];
extern unsigned compat_dir_class[];
extern unsigned compat_chattr_class[];
extern unsigned compat_signal_class[];

/* audit_names->type values */
#define        AUDIT_TYPE_UNKNOWN        0        /* we don't know yet */
#define        AUDIT_TYPE_NORMAL        1        /* a "normal" audit record */
#define        AUDIT_TYPE_PARENT        2        /* a parent audit record */
#define        AUDIT_TYPE_CHILD_DELETE 3        /* a child being deleted */
#define        AUDIT_TYPE_CHILD_CREATE 4        /* a child being created */

/* maximum number of args that audit_socketcall can process */
#define AUDITSC_ARGS                6

/* bit values for ->signal->audit_tty */
#define AUDIT_TTY_ENABLE        BIT(0)
#define AUDIT_TTY_LOG_PASSWD        BIT(1)

/* bit values for audit_cfg_lsm */
#define AUDIT_CFG_LSM_SECCTX_SUBJECT        BIT(0)
#define AUDIT_CFG_LSM_SECCTX_OBJECT        BIT(1)

struct filename;

/*
 * Values 0..2; audit_enabled expands to AUDIT_OFF when CONFIG_AUDIT is
 * not set.
 * NOTE(review): presumably these are the states audit_enabled can take
 * when CONFIG_AUDIT is set — confirm in audit.c.
 */
#define AUDIT_OFF        0
#define AUDIT_ON        1
#define AUDIT_LOCKED        2
#ifdef CONFIG_AUDIT
/* These are defined in audit.c */
                                /* Public API */
extern __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...);

extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type);
extern __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...);
extern void                    audit_log_end(struct audit_buffer *ab);
extern bool                    audit_string_contains_control(const char *string,
                                                          size_t len);
extern void                    audit_log_n_hex(struct audit_buffer *ab,
                                          const unsigned char *buf,
                                          size_t len);
extern void                    audit_log_n_string(struct audit_buffer *ab,
                                               const char *buf,
                                               size_t n);
extern void                    audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                        const char *string,
                                                        size_t n);
extern void                    audit_log_untrustedstring(struct audit_buffer *ab,
                                                      const char *string);
extern void                    audit_log_d_path(struct audit_buffer *ab,
                                             const char *prefix,
                                             const struct path *path);
extern void                    audit_log_key(struct audit_buffer *ab,
                                          char *key);
extern void                    audit_log_path_denied(int type,
                                                  const char *operation);
extern void                    audit_log_lost(const char *message);

extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop);
extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop);
extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab);

extern int                    audit_update_lsm_rules(void);

                                /* Private API (for audit.c only) */
extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);

extern int audit_set_loginuid(kuid_t loginuid);

/* Return the loginuid recorded in @tsk. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        const kuid_t luid = tsk->loginuid;

        return luid;
}

/* Return the sessionid recorded in @tsk. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        const unsigned int sid = tsk->sessionid;

        return sid;
}

extern u32 audit_enabled;

extern int audit_signal_info(int sig, struct task_struct *t);

extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags);

#else /* CONFIG_AUDIT */
/*
 * CONFIG_AUDIT is not set: the logging API collapses to inline stubs.
 * The void helpers do nothing, audit_log_start() returns NULL (no
 * buffer is ever produced) and the int-returning helpers return 0.
 */
static inline __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...)
{ }
static inline struct audit_buffer *audit_log_start(struct audit_context *ctx,
                                                   gfp_t gfp_mask, int type)
{
        /* No audit buffer exists in this configuration. */
        return NULL;
}
static inline __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{ }
static inline void audit_log_end(struct audit_buffer *ab)
{ }
static inline void audit_log_n_hex(struct audit_buffer *ab,
                                   const unsigned char *buf, size_t len)
{ }
static inline void audit_log_n_string(struct audit_buffer *ab,
                                      const char *buf, size_t n)
{ }
static inline void  audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                const char *string, size_t n)
{ }
static inline void audit_log_untrustedstring(struct audit_buffer *ab,
                                             const char *string)
{ }
static inline void audit_log_d_path(struct audit_buffer *ab,
                                    const char *prefix,
                                    const struct path *path)
{ }
static inline void audit_log_key(struct audit_buffer *ab, char *key)
{ }
static inline void audit_log_path_denied(int type, const char *operation)
{ }
static inline int audit_log_subj_ctx(struct audit_buffer *ab,
                                     struct lsm_prop *prop)
{
        return 0;
}
static inline int audit_log_obj_ctx(struct audit_buffer *ab,
                                    struct lsm_prop *prop)
{
        return 0;
}
static inline int audit_log_task_context(struct audit_buffer *ab)
{
        return 0;
}
static inline void audit_log_task_info(struct audit_buffer *ab)
{ }

/* CONFIG_AUDIT is not set: every task reports INVALID_UID as loginuid. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        return INVALID_UID;
}

/* CONFIG_AUDIT is not set: every task reports AUDIT_SID_UNSET as sessionid. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        return AUDIT_SID_UNSET;
}

/* Compile-time constant here, so "if (audit_enabled)" tests fold to false. */
#define audit_enabled AUDIT_OFF

/* Stub: always returns 0. */
static inline int audit_signal_info(int sig, struct task_struct *t)
{
        return 0;
}

/* Stub: does nothing. */
static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags)
{ }

#endif /* CONFIG_AUDIT */

/*
 * audit_is_compat(arch): with CONFIG_AUDIT_COMPAT_GENERIC, true when
 * @arch does not have the __AUDIT_ARCH_64BIT bit set; otherwise the
 * macro is the constant false and @arch is not evaluated.
 */
#ifdef CONFIG_AUDIT_COMPAT_GENERIC
#define audit_is_compat(arch)  (!((arch) & __AUDIT_ARCH_64BIT))
#else
#define audit_is_compat(arch)  false
#endif

#define AUDIT_INODE_PARENT        1        /* dentry represents the parent */
#define AUDIT_INODE_HIDDEN        2        /* audit record should be hidden */
#define AUDIT_INODE_NOEVAL        4        /* audit record incomplete */

#ifdef CONFIG_AUDITSYSCALL
#include <asm/syscall.h> /* for syscall_get_arch() */

/* These are defined in auditsc.c */
                                /* Public API */
extern int  audit_alloc(struct task_struct *task);
extern void __audit_free(struct task_struct *task);
extern void __audit_uring_entry(u8 op);
extern void __audit_uring_exit(int success, long code);
extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
                                  unsigned long a2, unsigned long a3);
extern void __audit_syscall_exit(int ret_success, long ret_value);
extern struct filename *__audit_reusename(const __user char *uptr);
extern void __audit_getname(struct filename *name);
extern void __audit_inode(struct filename *name, const struct dentry *dentry,
                                unsigned int flags);
extern void __audit_file(const struct file *);
extern void __audit_inode_child(struct inode *parent,
                                const struct dentry *dentry,
                                const unsigned char type);
extern void audit_seccomp(unsigned long syscall, long signr, int code);
extern void audit_seccomp_actions_logged(const char *names,
                                         const char *old_names, int res);
extern void __audit_ptrace(struct task_struct *t);

/* Store @ctx as the audit context of @task. */
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{
        struct audit_context **slot = &task->audit_context;

        *slot = ctx;
}

static inline struct audit_context *audit_context(void)
{
        return current->audit_context;
}

static inline bool audit_dummy_context(void)
{
        void *p = audit_context();
        return !p || *(int *)p;
}
static inline void audit_free(struct task_struct *task)
{
        if (unlikely(task->audit_context))
                __audit_free(task);
}
/*
 * Pass @op to __audit_uring_entry() when the current task has an audit
 * context and audit_enabled is non-zero.
 */
static inline void audit_uring_entry(u8 op)
{
        /*
         * audit_context() is deliberately tested before audit_enabled:
         * most Linux systems (as of ~2021) rely on systemd, which forces
         * audit to be enabled regardless of the user's audit configuration.
         */
        const bool wanted = audit_context() && audit_enabled;

        if (unlikely(wanted))
                __audit_uring_entry(op);
}
/* Forward to __audit_uring_exit() when the current task has an audit context. */
static inline void audit_uring_exit(int success, long code)
{
        struct audit_context *ctx = audit_context();

        if (unlikely(ctx))
                __audit_uring_exit(success, code);
}
/* Forward to __audit_syscall_entry() when the current task has an audit context. */
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{
        struct audit_context *ctx = audit_context();

        if (unlikely(ctx))
                __audit_syscall_entry(major, a0, a1, a2, a3);
}
/*
 * When the current task has an audit context, derive the success flag
 * and return value from @pt_regs and pass both to __audit_syscall_exit().
 */
static inline void audit_syscall_exit(void *pt_regs)
{
        if (unlikely(audit_context())) {
                int ok;
                long rv;

                /* Same order as before: success flag first, then the value. */
                ok = is_syscall_success(pt_regs);
                rv = regs_return_value(pt_regs);

                __audit_syscall_exit(ok, rv);
        }
}
/*
 * Return the result of __audit_reusename() for @name, or NULL when
 * audit_dummy_context() reports a dummy context.
 */
static inline struct filename *audit_reusename(const __user char *name)
{
        struct filename *reused = NULL;

        if (unlikely(!audit_dummy_context()))
                reused = __audit_reusename(name);

        return reused;
}
static inline void audit_getname(struct filename *name)
{
        if (unlikely(!audit_dummy_context()))
                __audit_getname(name);
}
static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
                                unsigned int aflags) {
        if (unlikely(!audit_dummy_context()))
                __audit_inode(name, dentry, aflags);
}
static inline void audit_file(struct file *file)
{
        if (unlikely(!audit_dummy_context()))
                __audit_file(file);
}
/*
 * Like audit_inode(), but always passes the flag combination
 * AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN to __audit_inode().
 */
static inline void audit_inode_parent_hidden(struct filename *name,
                                             const struct dentry *dentry)
{
        const unsigned int aflags = AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN;

        if (unlikely(!audit_dummy_context()))
                __audit_inode(name, dentry, aflags);
}
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type) {
        if (unlikely(!audit_dummy_context()))
                __audit_inode_child(parent, dentry, type);
}
void audit_core_dumps(long signr);

static inline void audit_ptrace(struct task_struct *t)
{
        if (unlikely(!audit_dummy_context()))
                __audit_ptrace(t);
}

                                /* Private API (for audit.c only) */
extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
extern void __audit_bprm(struct linux_binprm *bprm);
extern int __audit_socketcall(int nargs, unsigned long *args);
extern int __audit_sockaddr(int len, void *addr);
extern void __audit_fd_pair(int fd1, int fd2);
extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout);
extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                  const struct cred *new,
                                  const struct cred *old);
extern void __audit_log_capset(const struct cred *new, const struct cred *old);
extern void __audit_mmap_fd(int fd, int flags);
extern void __audit_openat2_how(struct open_how *how);
extern void __audit_log_kern_module(const char *name);
extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar);
extern void __audit_tk_injoffset(struct timespec64 offset);
extern void __audit_ntp_log(const struct audit_ntp_data *ad);
extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
                              enum audit_nfcfgop op, gfp_t gfp);

static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
        if (unlikely(!audit_dummy_context()))
                __audit_ipc_obj(ipcp);
}
static inline void audit_fd_pair(int fd1, int fd2)
{
        if (unlikely(!audit_dummy_context()))
                __audit_fd_pair(fd1, fd2);
}
/* Forward to __audit_ipc_set_perm() unless the audit context is a dummy. */
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_ipc_set_perm(qbytes, uid, gid, mode);
}
static inline void audit_bprm(struct linux_binprm *bprm)
{
        if (unlikely(!audit_dummy_context()))
                __audit_bprm(bprm);
}
/*
 * Return the result of __audit_socketcall(), or 0 when
 * audit_dummy_context() reports a dummy context.
 */
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        int rc = 0;

        if (unlikely(!audit_dummy_context()))
                rc = __audit_socketcall(nargs, args);

        return rc;
}

/*
 * Compat variant of audit_socketcall(): widen @nargs 32-bit arguments
 * to unsigned long and pass them to __audit_socketcall().
 *
 * Returns 0 when audit_dummy_context() reports a dummy context,
 * -EINVAL when @nargs exceeds AUDITSC_ARGS, otherwise whatever
 * __audit_socketcall() returns.
 */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        unsigned long a[AUDITSC_ARGS];
        int i;

        if (audit_dummy_context())
                return 0;

        /*
         * a[] has exactly AUDITSC_ARGS slots; a larger count would make
         * the copy loop below write past the end of the stack array.
         */
        if (nargs > AUDITSC_ARGS)
                return -EINVAL;

        for (i = 0; i < nargs; i++)
                a[i] = (unsigned long)args[i];
        return __audit_socketcall(nargs, a);
}

/*
 * Return the result of __audit_sockaddr(), or 0 when
 * audit_dummy_context() reports a dummy context.
 */
static inline int audit_sockaddr(int len, void *addr)
{
        int rc = 0;

        if (unlikely(!audit_dummy_context()))
                rc = __audit_sockaddr(len, addr);

        return rc;
}
/* Forward to __audit_mq_open() unless the audit context is a dummy. */
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mq_open(oflag, mode, attr);
}
/* Forward to __audit_mq_sendrecv() unless the audit context is a dummy. */
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout);
}
/* Forward to __audit_mq_notify() unless the audit context is a dummy. */
static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mq_notify(mqdes, notification);
}
/* Forward to __audit_mq_getsetattr() unless the audit context is a dummy. */
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mq_getsetattr(mqdes, mqstat);
}

/*
 * Return the result of __audit_log_bprm_fcaps(), or 0 when
 * audit_dummy_context() reports a dummy context.
 */
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        int rc = 0;

        if (unlikely(!audit_dummy_context()))
                rc = __audit_log_bprm_fcaps(bprm, new, old);

        return rc;
}

static inline void audit_log_capset(const struct cred *new,
                                   const struct cred *old)
{
        if (unlikely(!audit_dummy_context()))
                __audit_log_capset(new, old);
}

static inline void audit_mmap_fd(int fd, int flags)
{
        if (unlikely(!audit_dummy_context()))
                __audit_mmap_fd(fd, flags);
}

static inline void audit_openat2_how(struct open_how *how)
{
        if (unlikely(!audit_dummy_context()))
                __audit_openat2_how(how);
}

/* Forward @name to __audit_log_kern_module() unless the audit context is a dummy. */
static inline void audit_log_kern_module(const char *name)
{
        if (audit_dummy_context())
                return;

        __audit_log_kern_module(name);
}

/*
 * Forward to __audit_fanotify() whenever audit_enabled is non-zero;
 * unlike most wrappers here, the audit context is not consulted.
 */
static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
        if (!audit_enabled)
                return;

        __audit_fanotify(response, friar);
}

/*
 * Forward @offset to __audit_tk_injoffset() unless it is all-zero
 * (a no-op event) or the audit context is a dummy.
 */
static inline void audit_tk_injoffset(struct timespec64 offset)
{
        /* The zero test comes first so no-op events never touch the context. */
        if ((offset.tv_sec != 0 || offset.tv_nsec != 0) &&
            !audit_dummy_context())
                __audit_tk_injoffset(offset);
}

/* Zero every byte of the audit_ntp_data object that @ad points to. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{
        memset(ad, 0, sizeof(struct audit_ntp_data));
}

static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].oldval = val;
}

static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].newval = val;
}

/* Forward @ad to __audit_ntp_log() unless the audit context is a dummy. */
static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{
        if (audit_dummy_context())
                return;

        __audit_ntp_log(ad);
}

/*
 * Forward to __audit_log_nfcfg() whenever audit_enabled is non-zero;
 * the audit context is not consulted.
 */
static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{
        if (!audit_enabled)
                return;

        __audit_log_nfcfg(name, af, nentries, op, gfp);
}

extern int audit_n_rules;
extern int audit_signals;
#else /* CONFIG_AUDITSYSCALL */
/*
 * CONFIG_AUDITSYSCALL is not set: the syscall-auditing hooks become
 * inline stubs. Void hooks do nothing, int hooks return 0,
 * audit_context() and audit_reusename() return NULL, and
 * audit_dummy_context() always reports true.
 */
static inline int audit_alloc(struct task_struct *task)
{
        return 0;
}
static inline void audit_free(struct task_struct *task)
{ }
static inline void audit_uring_entry(u8 op)
{ }
static inline void audit_uring_exit(int success, long code)
{ }
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{ }
static inline void audit_syscall_exit(void *pt_regs)
{ }
static inline bool audit_dummy_context(void)
{
        /* Every context counts as a dummy in this configuration. */
        return true;
}
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{ }
static inline struct audit_context *audit_context(void)
{
        return NULL;
}
static inline struct filename *audit_reusename(const __user char *name)
{
        return NULL;
}
static inline void audit_getname(struct filename *name)
{ }
static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
                                unsigned int aflags)
{ }
static inline void audit_file(struct file *file)
{
}
static inline void audit_inode_parent_hidden(struct filename *name,
                                const struct dentry *dentry)
{ }
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{ }
static inline void audit_core_dumps(long signr)
{ }
static inline void audit_seccomp(unsigned long syscall, long signr, int code)
{ }
static inline void audit_seccomp_actions_logged(const char *names,
                                                const char *old_names, int res)
{ }
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{ }
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
                                        gid_t gid, umode_t mode)
{ }
static inline void audit_bprm(struct linux_binprm *bprm)
{ }
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        return 0;
}

/*
 * Remaining CONFIG_AUDITSYSCALL=n stubs: void hooks do nothing and int
 * hooks return 0 without looking at their arguments.
 */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        return 0;
}

static inline void audit_fd_pair(int fd1, int fd2)
{ }
static inline int audit_sockaddr(int len, void *addr)
{
        return 0;
}
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{ }
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len,
                                     unsigned int msg_prio,
                                     const struct timespec64 *abs_timeout)
{ }
static inline void audit_mq_notify(mqd_t mqdes,
                                   const struct sigevent *notification)
{ }
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{ }
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        return 0;
}
static inline void audit_log_capset(const struct cred *new,
                                    const struct cred *old)
{ }
static inline void audit_mmap_fd(int fd, int flags)
{ }

static inline void audit_openat2_how(struct open_how *how)
{ }

static inline void audit_log_kern_module(const char *name)
{ }

static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{ }

static inline void audit_tk_injoffset(struct timespec64 offset)
{ }

/* struct audit_ntp_data has no members here, so the NTP helpers are empty. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{ }

static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{ }

static inline void audit_ptrace(struct task_struct *t)
{ }

static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{ }

/* Constant replacements for the variables declared when CONFIG_AUDITSYSCALL is set. */
#define audit_n_rules 0
#define audit_signals 0
#endif /* CONFIG_AUDITSYSCALL */

static inline bool audit_loginuid_set(struct task_struct *tsk)
{
        return uid_valid(audit_get_loginuid(tsk));
}

#endif


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  linux/fs/pnode.h
 *
 * (C) Copyright IBM Corporation 2005.
 */
#ifndef _LINUX_PNODE_H
#define _LINUX_PNODE_H

#include <linux/list.h>
#include "mount.h"

/*
 * State tests and updates on a mount pointer @m.
 *
 * The T_* bits are kept in m->mnt_t_flags; MNT_LOCKED is kept in
 * m->mnt.mnt_flags. The IS_* forms yield the raw masked value or
 * pointer (non-zero/non-NULL meaning "yes"), not a normalized 0/1.
 */
#define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED)
/* Non-NULL m->mnt_master. */
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
/* True when m->mnt_ns is NULL. */
#define IS_MNT_NEW(m) (!(m)->mnt_ns)
#define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE)
#define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED)
#define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED)
#define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED)
#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)

/*
 * CL_* bit flags. Each value is a distinct single bit; 0x20 is not
 * assigned here.
 * NOTE(review): presumably these are the flags taken by the int
 * argument of copy_tree() — confirm against its definition.
 */
#define CL_EXPIRE                    0x01
#define CL_SLAVE                     0x02
#define CL_COPY_UNBINDABLE        0x04
#define CL_MAKE_SHARED                 0x08
#define CL_PRIVATE                 0x10
#define CL_COPY_MNT_NS_FILE        0x40

/*
 * EXCL[namespace_sem]
 *
 * Clear every bit covered by T_SHARED_MASK in mnt->mnt_t_flags, then
 * set T_SHARED.
 */
static inline void set_mnt_shared(struct mount *mnt)
{
        mnt->mnt_t_flags = (mnt->mnt_t_flags & ~T_SHARED_MASK) | T_SHARED;
}

static inline bool peers(const struct mount *m1, const struct mount *m2)
{
        return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
}

void change_mnt_propagation(struct mount *, int);
void bulk_make_private(struct list_head *);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
                struct hlist_head *);
void propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
void propagate_mount_unlock(struct mount *);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
int mnt_get_count(struct mount *mnt);
void mnt_set_mountpoint(struct mount *, struct mountpoint *,
                        struct mount *);
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
                           struct mount *mnt);
struct mount *copy_tree(struct mount *, struct dentry *, int);
bool is_path_reachable(struct mount *, struct dentry *,
                         const struct path *root);
int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
bool propagation_would_overmount(const struct mount *from,
                                 const struct mount *to,
                                 const struct mountpoint *mp);
#endif /* _LINUX_PNODE_H */
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_STRINGHASH_H
#define __LINUX_STRINGHASH_H

#include <linux/compiler.h>        /* For __pure */
#include <linux/types.h>        /* For u32, u64 */
#include <linux/hash.h>

/*
 * Routines for hashing strings of bytes to a 32-bit hash value.
 *
 * These hash functions are NOT GUARANTEED STABLE between kernel
 * versions, architectures, or even repeated boots of the same kernel.
 * (E.g. they may depend on boot-time hardware detection or be
 * deliberately randomized.)
 *
 * They are also not intended to be secure against collisions caused by
 * malicious inputs; much slower hash functions are required for that.
 *
 * They are optimized for pathname components, meaning short strings.
 * Even if a majority of files have longer names, the dynamic profile of
 * pathname components skews short due to short directory names.
 * (E.g. /usr/lib/libsesquipedalianism.so.3.141.)
 */

/*
 * Version 1: one byte at a time.  Example of use:
 *
 * unsigned long hash = init_name_hash(salt);
 * while (*p)
 *        hash = partial_name_hash(tolower(*p++), hash);
 * hash = end_name_hash(hash);
 *
 * Although this is designed for bytes, fs/hfsplus/unicode.c
 * abuses it to hash 16-bit values.
 */

/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
/*
 * Starting value for a running name hash: the caller's salt converted to
 * unsigned long.  The expansion is fully parenthesized so it always acts
 * as a single operand, whatever expression it is used in.
 */
#define init_name_hash(salt)                ((unsigned long)(salt))

/*
 * Fold one more character into a running hash.  Roughly 4 bits of
 * entropy per character are assumed, hence the two nibble-sized shifts.
 * All arithmetic is unsigned and wraps modulo the width of unsigned long.
 */
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
        unsigned long acc = prevhash;

        acc += c << 4;
        acc += c >> 4;

        return acc * 11;
}

/*
 * Final step: reduce the running hash to an int-sized value while trying
 * not to lose bits.  The result also has the property (wanted by the
 * dcache) that its most significant bits make a good hash table index.
 */
static inline unsigned int end_name_hash(unsigned long hash)
{
        unsigned int folded = hash_long(hash, 32);

        return folded;
}

/*
 * Version 2: One word (32 or 64 bits) at a time.
 * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h>
 * exists, which describes major Linux platforms like x86 and ARM), then
 * this computes a different hash function much faster.
 *
 * If not set, this falls back to a wrapper around the preceding.
 */
extern unsigned int __pure full_name_hash(const void *salt, const char *, unsigned int);

/*
 * A hash_len is a u64 with the hash of a string in the low
 * half and the length in the high half.
 *
 * hashlen_hash()   - extract the 32-bit hash (low half).
 * hashlen_len()    - extract the 32-bit length (high half).
 * hashlen_create() - pack a hash and a length into one u64; the hash is
 *                    truncated to 32 bits and the length is shifted into
 *                    the upper 32 bits.
 */
#define hashlen_hash(hashlen) ((u32)(hashlen))
#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash))

/* Return the "hash_len" (hash and length) of a null-terminated string */
extern u64 __pure hashlen_string(const void *salt, const char *name);

#endif        /* __LINUX_STRINGHASH_H */






































































































































































































































































    9 





    9 






    9 





    9 
















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Crypto API support for SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512
 *
 * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
 * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
 * Copyright 2025 Google LLC
 */
#include <crypto/internal/hash.h>
#include <crypto/sha2.h>
#include <linux/kernel.h>
#include <linux/module.h>

/*
 * Export and import functions.  crypto_shash wants a particular format that
 * matches that used by some legacy drivers.  It currently is the same as the
 * library SHA context, except the value in bytecount_lo must be block-aligned
 * and the remainder must be stored in an extra u8 appended to the struct.
 */

/*
 * Size in bytes of the exported state: the whole struct __sha512_ctx plus
 * the one trailing byte that carries the partial-block length.  The
 * assertions below pin the context layout that this format relies on.
 */
#define SHA512_SHASH_STATE_SIZE 209
static_assert(offsetof(struct __sha512_ctx, state) == 0);
static_assert(offsetof(struct __sha512_ctx, bytecount_lo) == 64);
static_assert(offsetof(struct __sha512_ctx, bytecount_hi) == 72);
static_assert(offsetof(struct __sha512_ctx, buf) == 80);
static_assert(sizeof(struct __sha512_ctx) + 1 == SHA512_SHASH_STATE_SIZE);

static int __crypto_sha512_export(const struct __sha512_ctx *ctx0, void *out)
{
        struct __sha512_ctx ctx = *ctx0;
        unsigned int partial;
        u8 *p = out;

        partial = ctx.bytecount_lo % SHA512_BLOCK_SIZE;
        ctx.bytecount_lo -= partial;
        memcpy(p, &ctx, sizeof(ctx));
        p += sizeof(ctx);
        *p = partial;
        return 0;
}

/*
 * Inverse of __crypto_sha512_export(): copy the struct image from @in into
 * @ctx, then add the trailing partial-length byte back onto bytecount_lo.
 *
 * Always returns 0.
 */
static int __crypto_sha512_import(struct __sha512_ctx *ctx, const void *in)
{
        const u8 *src = in;

        memcpy(ctx, src, sizeof(*ctx));
        ctx->bytecount_lo += src[sizeof(*ctx)];

        return 0;
}

/*
 * Export only the part of the context that precedes the data buffer,
 * i.e. everything up to (but not including) the 'buf' member.
 *
 * Always returns 0.
 */
static int __crypto_sha512_export_core(const struct __sha512_ctx *ctx,
                                       void *out)
{
        const size_t core_len = offsetof(struct __sha512_ctx, buf);

        memcpy(out, ctx, core_len);
        return 0;
}

/*
 * Import the part of the context that precedes the data buffer; the 'buf'
 * member and anything after it are left untouched.
 *
 * Always returns 0.
 */
static int __crypto_sha512_import_core(struct __sha512_ctx *ctx, const void *in)
{
        const size_t core_len = offsetof(struct __sha512_ctx, buf);

        memcpy(ctx, in, core_len);
        return 0;
}

/* SHA-384 */

/* Precomputed SHA-384 digest of the zero-length message; exported (GPL). */
const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = {
        0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38,
        0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a,
        0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43,
        0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda,
        0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb,
        0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b
};
EXPORT_SYMBOL_GPL(sha384_zero_message_hash);

#define SHA384_CTX(desc) ((struct sha384_ctx *)shash_desc_ctx(desc))

/* shash ->init: set up the SHA-384 context held in @desc.  Returns 0. */
static int crypto_sha384_init(struct shash_desc *desc)
{
        struct sha384_ctx *sctx = SHA384_CTX(desc);

        sha384_init(sctx);
        return 0;
}

/* shash ->update: feed @len bytes at @data to the SHA-384 context.  Returns 0. */
static int crypto_sha384_update(struct shash_desc *desc,
                                const u8 *data, unsigned int len)
{
        struct sha384_ctx *sctx = SHA384_CTX(desc);

        sha384_update(sctx, data, len);
        return 0;
}

/* shash ->final: finish the SHA-384 computation and write it to @out.  Returns 0. */
static int crypto_sha384_final(struct shash_desc *desc, u8 *out)
{
        struct sha384_ctx *sctx = SHA384_CTX(desc);

        sha384_final(sctx, out);
        return 0;
}

/*
 * shash ->digest: one-shot SHA-384 of @len bytes at @data into @out.
 * The descriptor context is not used here.  Returns 0.
 */
static int crypto_sha384_digest(struct shash_desc *desc,
                                const u8 *data, unsigned int len, u8 *out)
{
        sha384(data, len, out);
        return 0;
}

static int crypto_sha384_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha512_export(&SHA384_CTX(desc)->ctx, out);
}

static int crypto_sha384_import(struct shash_desc *desc, const void *in)
{
        return __crypto_sha512_import(&SHA384_CTX(desc)->ctx, in);
}

static int crypto_sha384_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha512_export_core(&SHA384_CTX(desc)->ctx, out);
}

static int crypto_sha384_import_core(struct shash_desc *desc, const void *in)
{
        return __crypto_sha512_import_core(&SHA384_CTX(desc)->ctx, in);
}

/* SHA-512 */

/* Precomputed SHA-512 digest of the zero-length message; exported (GPL). */
const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = {
        0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd,
        0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07,
        0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc,
        0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce,
        0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0,
        0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f,
        0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81,
        0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e
};
EXPORT_SYMBOL_GPL(sha512_zero_message_hash);

#define SHA512_CTX(desc) ((struct sha512_ctx *)shash_desc_ctx(desc))

/* shash ->init: set up the SHA-512 context held in @desc.  Returns 0. */
static int crypto_sha512_init(struct shash_desc *desc)
{
        struct sha512_ctx *sctx = SHA512_CTX(desc);

        sha512_init(sctx);
        return 0;
}

/* shash ->update: feed @len bytes at @data to the SHA-512 context.  Returns 0. */
static int crypto_sha512_update(struct shash_desc *desc,
                                const u8 *data, unsigned int len)
{
        struct sha512_ctx *sctx = SHA512_CTX(desc);

        sha512_update(sctx, data, len);
        return 0;
}

/* shash ->final: finish the SHA-512 computation and write it to @out.  Returns 0. */
static int crypto_sha512_final(struct shash_desc *desc, u8 *out)
{
        struct sha512_ctx *sctx = SHA512_CTX(desc);

        sha512_final(sctx, out);
        return 0;
}

/*
 * shash ->digest: one-shot SHA-512 of @len bytes at @data into @out.
 * The descriptor context is not used here.  Returns 0.
 */
static int crypto_sha512_digest(struct shash_desc *desc,
                                const u8 *data, unsigned int len, u8 *out)
{
        sha512(data, len, out);
        return 0;
}

static int crypto_sha512_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha512_export(&SHA512_CTX(desc)->ctx, out);
}

static int crypto_sha512_import(struct shash_desc *desc, const void *in)
{
        return __crypto_sha512_import(&SHA512_CTX(desc)->ctx, in);
}

static int crypto_sha512_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha512_export_core(&SHA512_CTX(desc)->ctx, out);
}

static int crypto_sha512_import_core(struct shash_desc *desc, const void *in)
{
        return __crypto_sha512_import_core(&SHA512_CTX(desc)->ctx, in);
}

/* HMAC-SHA384 */

#define HMAC_SHA384_KEY(tfm) ((struct hmac_sha384_key *)crypto_shash_ctx(tfm))
#define HMAC_SHA384_CTX(desc) ((struct hmac_sha384_ctx *)shash_desc_ctx(desc))

/*
 * shash ->setkey: prepare the transform's HMAC-SHA384 key from the
 * @keylen raw bytes at @raw_key.  Returns 0.
 */
static int crypto_hmac_sha384_setkey(struct crypto_shash *tfm,
                                     const u8 *raw_key, unsigned int keylen)
{
        struct hmac_sha384_key *key = HMAC_SHA384_KEY(tfm);

        hmac_sha384_preparekey(key, raw_key, keylen);
        return 0;
}

/* shash ->init: start an HMAC-SHA384 computation using the transform's key. */
static int crypto_hmac_sha384_init(struct shash_desc *desc)
{
        struct hmac_sha384_ctx *hctx = HMAC_SHA384_CTX(desc);
        struct hmac_sha384_key *key = HMAC_SHA384_KEY(desc->tfm);

        hmac_sha384_init(hctx, key);
        return 0;
}

/* shash ->update: feed @len bytes at @data to the HMAC-SHA384 context. */
static int crypto_hmac_sha384_update(struct shash_desc *desc,
                                     const u8 *data, unsigned int len)
{
        struct hmac_sha384_ctx *hctx = HMAC_SHA384_CTX(desc);

        hmac_sha384_update(hctx, data, len);
        return 0;
}

/* shash ->final: finish the HMAC-SHA384 computation and write it to @out. */
static int crypto_hmac_sha384_final(struct shash_desc *desc, u8 *out)
{
        struct hmac_sha384_ctx *hctx = HMAC_SHA384_CTX(desc);

        hmac_sha384_final(hctx, out);
        return 0;
}

/*
 * shash ->digest: one-shot HMAC-SHA384 of @len bytes at @data into @out,
 * keyed with the transform's prepared key.  Returns 0.
 */
static int crypto_hmac_sha384_digest(struct shash_desc *desc,
                                     const u8 *data, unsigned int len,
                                     u8 *out)
{
        struct hmac_sha384_key *key = HMAC_SHA384_KEY(desc->tfm);

        hmac_sha384(key, data, len, out);
        return 0;
}

static int crypto_hmac_sha384_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha512_export(&HMAC_SHA384_CTX(desc)->ctx.sha_ctx, out);
}

static int crypto_hmac_sha384_import(struct shash_desc *desc, const void *in)
{
        struct hmac_sha384_ctx *ctx = HMAC_SHA384_CTX(desc);

        ctx->ctx.ostate = HMAC_SHA384_KEY(desc->tfm)->key.ostate;
        return __crypto_sha512_import(&ctx->ctx.sha_ctx, in);
}

static int crypto_hmac_sha384_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha512_export_core(&HMAC_SHA384_CTX(desc)->ctx.sha_ctx,
                                           out);
}

static int crypto_hmac_sha384_import_core(struct shash_desc *desc,
                                          const void *in)
{
        struct hmac_sha384_ctx *ctx = HMAC_SHA384_CTX(desc);

        ctx->ctx.ostate = HMAC_SHA384_KEY(desc->tfm)->key.ostate;
        return __crypto_sha512_import_core(&ctx->ctx.sha_ctx, in);
}

/* HMAC-SHA512 */

#define HMAC_SHA512_KEY(tfm) ((struct hmac_sha512_key *)crypto_shash_ctx(tfm))
#define HMAC_SHA512_CTX(desc) ((struct hmac_sha512_ctx *)shash_desc_ctx(desc))

/*
 * shash ->setkey: prepare the transform's HMAC-SHA512 key from the
 * @keylen raw bytes at @raw_key.  Returns 0.
 */
static int crypto_hmac_sha512_setkey(struct crypto_shash *tfm,
                                     const u8 *raw_key, unsigned int keylen)
{
        struct hmac_sha512_key *key = HMAC_SHA512_KEY(tfm);

        hmac_sha512_preparekey(key, raw_key, keylen);
        return 0;
}

/* shash ->init: start an HMAC-SHA512 computation using the transform's key. */
static int crypto_hmac_sha512_init(struct shash_desc *desc)
{
        struct hmac_sha512_ctx *hctx = HMAC_SHA512_CTX(desc);
        struct hmac_sha512_key *key = HMAC_SHA512_KEY(desc->tfm);

        hmac_sha512_init(hctx, key);
        return 0;
}

/* shash ->update: feed @len bytes at @data to the HMAC-SHA512 context. */
static int crypto_hmac_sha512_update(struct shash_desc *desc,
                                     const u8 *data, unsigned int len)
{
        struct hmac_sha512_ctx *hctx = HMAC_SHA512_CTX(desc);

        hmac_sha512_update(hctx, data, len);
        return 0;
}

/* shash ->final: finish the HMAC-SHA512 computation and write it to @out. */
static int crypto_hmac_sha512_final(struct shash_desc *desc, u8 *out)
{
        struct hmac_sha512_ctx *hctx = HMAC_SHA512_CTX(desc);

        hmac_sha512_final(hctx, out);
        return 0;
}

/*
 * shash ->digest: one-shot HMAC-SHA512 of @len bytes at @data into @out,
 * keyed with the transform's prepared key.  Returns 0.
 */
static int crypto_hmac_sha512_digest(struct shash_desc *desc,
                                     const u8 *data, unsigned int len,
                                     u8 *out)
{
        struct hmac_sha512_key *key = HMAC_SHA512_KEY(desc->tfm);

        hmac_sha512(key, data, len, out);
        return 0;
}

static int crypto_hmac_sha512_export(struct shash_desc *desc, void *out)
{
        return __crypto_sha512_export(&HMAC_SHA512_CTX(desc)->ctx.sha_ctx, out);
}

static int crypto_hmac_sha512_import(struct shash_desc *desc, const void *in)
{
        struct hmac_sha512_ctx *ctx = HMAC_SHA512_CTX(desc);

        ctx->ctx.ostate = HMAC_SHA512_KEY(desc->tfm)->key.ostate;
        return __crypto_sha512_import(&ctx->ctx.sha_ctx, in);
}

static int crypto_hmac_sha512_export_core(struct shash_desc *desc, void *out)
{
        return __crypto_sha512_export_core(&HMAC_SHA512_CTX(desc)->ctx.sha_ctx,
                                           out);
}

static int crypto_hmac_sha512_import_core(struct shash_desc *desc,
                                          const void *in)
{
        struct hmac_sha512_ctx *ctx = HMAC_SHA512_CTX(desc);

        ctx->ctx.ostate = HMAC_SHA512_KEY(desc->tfm)->key.ostate;
        return __crypto_sha512_import_core(&ctx->ctx.sha_ctx, in);
}

/* Algorithm definitions */

/*
 * The four shash algorithms registered by this module.  Every entry uses
 * priority 300 and the common export size SHA512_SHASH_STATE_SIZE; the two
 * HMAC entries additionally reserve per-transform space for the prepared
 * key (cra_ctxsize) and provide a ->setkey handler.
 */
static struct shash_alg algs[] = {
        {
                /* Unkeyed SHA-384 */
                .base.cra_name                = "sha384",
                .base.cra_driver_name        = "sha384-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA384_BLOCK_SIZE,
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA384_DIGEST_SIZE,
                .init                        = crypto_sha384_init,
                .update                        = crypto_sha384_update,
                .final                        = crypto_sha384_final,
                .digest                        = crypto_sha384_digest,
                .export                        = crypto_sha384_export,
                .import                        = crypto_sha384_import,
                .export_core                = crypto_sha384_export_core,
                .import_core                = crypto_sha384_import_core,
                .descsize                = sizeof(struct sha384_ctx),
                .statesize                = SHA512_SHASH_STATE_SIZE,
        },
        {
                /* Unkeyed SHA-512 */
                .base.cra_name                = "sha512",
                .base.cra_driver_name        = "sha512-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA512_BLOCK_SIZE,
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA512_DIGEST_SIZE,
                .init                        = crypto_sha512_init,
                .update                        = crypto_sha512_update,
                .final                        = crypto_sha512_final,
                .digest                        = crypto_sha512_digest,
                .export                        = crypto_sha512_export,
                .import                        = crypto_sha512_import,
                .export_core                = crypto_sha512_export_core,
                .import_core                = crypto_sha512_import_core,
                .descsize                = sizeof(struct sha512_ctx),
                .statesize                = SHA512_SHASH_STATE_SIZE,
        },
        {
                /* HMAC-SHA384 */
                .base.cra_name                = "hmac(sha384)",
                .base.cra_driver_name        = "hmac-sha384-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA384_BLOCK_SIZE,
                .base.cra_ctxsize        = sizeof(struct hmac_sha384_key),
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA384_DIGEST_SIZE,
                .setkey                        = crypto_hmac_sha384_setkey,
                .init                        = crypto_hmac_sha384_init,
                .update                        = crypto_hmac_sha384_update,
                .final                        = crypto_hmac_sha384_final,
                .digest                        = crypto_hmac_sha384_digest,
                .export                        = crypto_hmac_sha384_export,
                .import                        = crypto_hmac_sha384_import,
                .export_core                = crypto_hmac_sha384_export_core,
                .import_core                = crypto_hmac_sha384_import_core,
                .descsize                = sizeof(struct hmac_sha384_ctx),
                .statesize                = SHA512_SHASH_STATE_SIZE,
        },
        {
                /* HMAC-SHA512 */
                .base.cra_name                = "hmac(sha512)",
                .base.cra_driver_name        = "hmac-sha512-lib",
                .base.cra_priority        = 300,
                .base.cra_blocksize        = SHA512_BLOCK_SIZE,
                .base.cra_ctxsize        = sizeof(struct hmac_sha512_key),
                .base.cra_module        = THIS_MODULE,
                .digestsize                = SHA512_DIGEST_SIZE,
                .setkey                        = crypto_hmac_sha512_setkey,
                .init                        = crypto_hmac_sha512_init,
                .update                        = crypto_hmac_sha512_update,
                .final                        = crypto_hmac_sha512_final,
                .digest                        = crypto_hmac_sha512_digest,
                .export                        = crypto_hmac_sha512_export,
                .import                        = crypto_hmac_sha512_import,
                .export_core                = crypto_hmac_sha512_export_core,
                .import_core                = crypto_hmac_sha512_import_core,
                .descsize                = sizeof(struct hmac_sha512_ctx),
                .statesize                = SHA512_SHASH_STATE_SIZE,
        },
};

/* Module entry point: register every algorithm in algs[]. */
static int __init crypto_sha512_mod_init(void)
{
        int err = crypto_register_shashes(algs, ARRAY_SIZE(algs));

        return err;
}
module_init(crypto_sha512_mod_init);

/* Module exit point: unregister every algorithm in algs[]. */
static void __exit crypto_sha512_mod_exit(void)
{
        const int count = ARRAY_SIZE(algs);

        crypto_unregister_shashes(algs, count);
}
module_exit(crypto_sha512_mod_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Crypto API support for SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512");

MODULE_ALIAS_CRYPTO("sha384");
MODULE_ALIAS_CRYPTO("sha384-lib");
MODULE_ALIAS_CRYPTO("sha512");
MODULE_ALIAS_CRYPTO("sha512-lib");
MODULE_ALIAS_CRYPTO("hmac(sha384)");
MODULE_ALIAS_CRYPTO("hmac-sha384-lib");
MODULE_ALIAS_CRYPTO("hmac(sha512)");
MODULE_ALIAS_CRYPTO("hmac-sha512-lib");







































































































































































































































































































































































































































































































































































































































































































































































































   19 













   19 






   19 
   19 



    6 








    6 



    6 











   17 




   17 



   17 







































































































































































































   40 







    8 



    7 




    7 
    1 




































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API for algorithms (i.e., low-level API).
 *
 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/algapi.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/fips.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/workqueue.h>

#include "internal.h"

static LIST_HEAD(crypto_template_list);

/*
 * In FIPS mode, panic if @mod is a module whose signature check did not
 * succeed.  Does nothing when FIPS mode is off or @mod is NULL.
 */
static inline void crypto_check_module_sig(struct module *mod)
{
        if (!fips_enabled || !mod)
                return;

        if (module_sig_ok(mod))
                return;

        panic("Module %s signature verification failed in FIPS mode\n",
              module_name(mod));
}

static int crypto_check_alg(struct crypto_alg *alg)
{
        crypto_check_module_sig(alg->cra_module);

        if (!alg->cra_name[0] || !alg->cra_driver_name[0])
                return -EINVAL;

        if (alg->cra_alignmask & (alg->cra_alignmask + 1))
                return -EINVAL;

        /* General maximums for all algs. */
        if (alg->cra_alignmask > MAX_ALGAPI_ALIGNMASK)
                return -EINVAL;

        if (alg->cra_blocksize > MAX_ALGAPI_BLOCKSIZE)
                return -EINVAL;

        /* Lower maximums for specific alg types. */
        if (!alg->cra_type && (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) ==
                               CRYPTO_ALG_TYPE_CIPHER) {
                if (alg->cra_alignmask > MAX_CIPHER_ALIGNMASK)
                        return -EINVAL;

                if (alg->cra_blocksize > MAX_CIPHER_BLOCKSIZE)
                        return -EINVAL;
        }

        if (alg->cra_priority < 0)
                return -EINVAL;

        refcount_set(&alg->cra_refcnt, 1);

        return 0;
}

static void crypto_free_instance(struct crypto_instance *inst)
{
        inst->alg.cra_type->free(inst);
}

static void crypto_destroy_instance_workfn(struct work_struct *w)
{
        struct crypto_template *tmpl = container_of(w, struct crypto_template,
                                                    free_work);
        struct crypto_instance *inst;
        struct hlist_node *n;
        HLIST_HEAD(list);

        down_write(&crypto_alg_sem);
        hlist_for_each_entry_safe(inst, n, &tmpl->dead, list) {
                if (refcount_read(&inst->alg.cra_refcnt) != -1)
                        continue;
                hlist_del(&inst->list);
                hlist_add_head(&inst->list, &list);
        }
        up_write(&crypto_alg_sem);

        hlist_for_each_entry_safe(inst, n, &list, list)
                crypto_free_instance(inst);
}

/*
 * Mark the instance that embeds @alg as ready for destruction and kick the
 * owning template's free_work, which does the actual freeing.
 */
static void crypto_destroy_instance(struct crypto_alg *alg)
{
        struct crypto_instance *inst = container_of(alg,
                                                    struct crypto_instance,
                                                    alg);
        /*
         * Fetch the template before publishing the -1 marker: the work
         * function frees instances whose refcount reads as -1.
         * NOTE(review): presumably inst must not be touched after the
         * refcount_set() below for that reason - confirm.
         */
        struct crypto_template *tmpl = inst->tmpl;

        /* -1 is the value crypto_destroy_instance_workfn() looks for. */
        refcount_set(&alg->cra_refcnt, -1);
        schedule_work(&tmpl->free_work);
}

/*
 * This function adds a spawn to the list secondary_spawns which
 * will be used at the end of crypto_remove_spawns to unregister
 * instances, unless the spawn happens to be one that is depended
 * on by the new algorithm (nalg in crypto_remove_spawns).
 *
 * This function is also responsible for resurrecting any algorithms
 * in the dependency chain of nalg by unsetting n->dead.
 */
/*
 * Pop the first spawn off @stack onto @secondary_spawns and return the
 * list to continue walking: NULL if @stack was already empty, @top if the
 * pop emptied it, otherwise the cra_users list of the instance owning the
 * new first entry.  @alg is not referenced in this function body.
 */
static struct list_head *crypto_more_spawns(struct crypto_alg *alg,
                                            struct list_head *stack,
                                            struct list_head *top,
                                            struct list_head *secondary_spawns)
{
        struct crypto_spawn *spawn, *n;

        /* Nothing on the stack: the walk is finished. */
        spawn = list_first_entry_or_null(stack, struct crypto_spawn, list);
        if (!spawn)
                return NULL;

        /*
         * spawn is the first entry, so its predecessor is the list head
         * itself; n is only used as a list position here, not as a spawn.
         */
        n = list_prev_entry(spawn, list);
        list_move(&spawn->list, secondary_spawns);

        /* Removing spawn emptied the stack: go back to the top-level list. */
        if (list_is_last(&n->list, stack))
                return top;

        /* n now refers to the new first entry of the stack. */
        n = list_next_entry(n, list);
        /* A live spawn resurrects the entry below it on the stack. */
        if (!spawn->dead)
                n->dead = false;

        return &n->inst->alg.cra_users;
}

/*
 * Mark @inst dead and, if it belongs to a template, unlink it, park it on
 * the template's dead list and drop a reference to its algorithm.
 * The @list parameter is not referenced in this function body.
 */
static void crypto_remove_instance(struct crypto_instance *inst,
                                   struct list_head *list)
{
        struct crypto_template *tmpl = inst->tmpl;

        /* Already marked dead: nothing more to do. */
        if (crypto_is_dead(&inst->alg))
                return;

        inst->alg.cra_flags |= CRYPTO_ALG_DEAD;

        /* Without a template there are no lists to update. */
        if (!tmpl)
                return;

        /* Unlink the algorithm and move the instance to tmpl->dead. */
        list_del_init(&inst->alg.cra_list);
        hlist_del(&inst->list);
        hlist_add_head(&inst->list, &tmpl->dead);

        /* No spawn may still be registered against this instance. */
        BUG_ON(!list_empty(&inst->alg.cra_users));

        crypto_alg_put(&inst->alg);
}

/*
 * Given an algorithm alg, remove all algorithms that depend on it
 * through spawns.  If nalg is not null, then exempt any algorithm
 * that is depended on by nalg.  This is useful when nalg itself
 * depends on alg.
 */
void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
                          struct crypto_alg *nalg)
{
        u32 new_type = (nalg ?: alg)->cra_flags;
        struct crypto_spawn *spawn, *n;
        LIST_HEAD(secondary_spawns);
        struct list_head *spawns;
        LIST_HEAD(stack);
        LIST_HEAD(top);

        spawns = &alg->cra_users;
        list_for_each_entry_safe(spawn, n, spawns, list) {
                if ((spawn->alg->cra_flags ^ new_type) & spawn->mask)
                        continue;

                list_move(&spawn->list, &top);
        }

        /*
         * Perform a depth-first walk starting from alg through
         * the cra_users tree.  The list stack records the path
         * from alg to the current spawn.
         */
        spawns = &top;
        do {
                while (!list_empty(spawns)) {
                        struct crypto_instance *inst;

                        spawn = list_first_entry(spawns, struct crypto_spawn,
                                                 list);
                        inst = spawn->inst;

                        list_move(&spawn->list, &stack);
                        spawn->dead = !spawn->registered || &inst->alg != nalg;

                        if (!spawn->registered)
                                break;

                        BUG_ON(&inst->alg == alg);

                        if (&inst->alg == nalg)
                                break;

                        spawns = &inst->alg.cra_users;

                        /*
                         * Even if spawn->registered is true, the
                         * instance itself may still be unregistered.
                         * This is because it may have failed during
                         * registration.  Therefore we still need to
                         * make the following test.
                         *
                         * We may encounter an unregistered instance here, since
                         * an instance's spawns are set up prior to the instance
                         * being registered.  An unregistered instance will have
                         * NULL ->cra_users.next, since ->cra_users isn't
                         * properly initialized until registration.  But an
                         * unregistered instance cannot have any users, so treat
                         * it the same as ->cra_users being empty.
                         */
                        if (spawns->next == NULL)
                                break;
                }
        } while ((spawns = crypto_more_spawns(alg, &stack, &top,
                                              &secondary_spawns)));

        /*
         * Remove all instances that are marked as dead.  Also
         * complete the resurrection of the others by moving them
         * back to the cra_users list.
         */
        list_for_each_entry_safe(spawn, n, &secondary_spawns, list) {
                if (!spawn->dead)
                        list_move(&spawn->list, &spawn->alg->cra_users);
                else if (spawn->registered)
                        crypto_remove_instance(spawn->inst, list);
        }
}
EXPORT_SYMBOL_GPL(crypto_remove_spawns);

/*
 * Complete the registration of alg once it counts as tested.
 *
 * Every other live, non-larval algorithm on crypto_alg_list that shares
 * alg's cra_name has its spawns removed via crypto_remove_spawns(), with
 * alg passed as the new algorithm.  An algorithm with a different driver
 * name and a strictly higher priority than alg is left alone.  Finally a
 * CRYPTO_MSG_ALG_LOADED notification is sent for alg.
 *
 * @alg:         algorithm that has just been registered
 * @algs_to_put: forwarded to crypto_remove_spawns()
 */
static void crypto_alg_finish_registration(struct crypto_alg *alg,
                                           struct list_head *algs_to_put)
{
        struct crypto_alg *other;

        list_for_each_entry(other, &crypto_alg_list, cra_list) {
                /* Skip ourselves and anything dying or still being tested. */
                if (other == alg || crypto_is_moribund(other) ||
                    crypto_is_larval(other))
                        continue;

                /* Only implementations of the same algorithm name matter. */
                if (strcmp(alg->cra_name, other->cra_name) != 0)
                        continue;

                /* A different driver with higher priority keeps its users. */
                if (strcmp(alg->cra_driver_name, other->cra_driver_name) != 0 &&
                    other->cra_priority > alg->cra_priority)
                        continue;

                crypto_remove_spawns(other, algs_to_put, alg);
        }

        crypto_notify(CRYPTO_MSG_ALG_LOADED, alg);
}

/*
 * Allocate the test larval that stands in for alg while it is being
 * self-tested.
 *
 * Returns NULL when no self-test is needed (self-tests are disabled or
 * alg is CRYPTO_ALG_INTERNAL), an ERR_PTR() on failure, or the new
 * larval.  The larval holds a module reference on alg through ->adult
 * and carries alg's driver name and priority.
 */
static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg)
{
        struct crypto_larval *test;

        if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS))
                return NULL; /* No self-test needed */
        if (alg->cra_flags & CRYPTO_ALG_INTERNAL)
                return NULL; /* No self-test needed */

        test = crypto_larval_alloc(alg->cra_name,
                                   alg->cra_flags | CRYPTO_ALG_TESTED, 0);
        if (IS_ERR(test))
                return test;

        /* Pin the algorithm under test for the lifetime of the larval. */
        test->adult = crypto_mod_get(alg);
        if (!test->adult) {
                kfree(test);
                return ERR_PTR(-ENOENT);
        }

        test->alg.cra_priority = alg->cra_priority;
        memcpy(test->alg.cra_driver_name, alg->cra_driver_name,
               CRYPTO_MAX_ALG_NAME);
        refcount_set(&test->alg.cra_refcnt, 1);

        return test;
}

static struct crypto_larval *
__crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put)
{
        struct crypto_alg *q;
        struct crypto_larval *larval;
        int ret = -EAGAIN;

        if (crypto_is_dead(alg))
                goto err;

        INIT_LIST_HEAD(&alg->cra_users);

        ret = -EEXIST;

        list_for_each_entry(q, &crypto_alg_list, cra_list) {
                if (q == alg)
                        goto err;

                if (crypto_is_moribund(q))
                        continue;

                if (crypto_is_larval(q)) {
                        if (!strcmp(alg->cra_driver_name, q->cra_driver_name))
                                goto err;
                        continue;
                }

                if (!strcmp(q->cra_driver_name, alg->cra_name) ||
                    !strcmp(q->cra_driver_name, alg->cra_driver_name) ||
                    !strcmp(q->cra_name, alg->cra_driver_name))
                        goto err;
        }

        larval = crypto_alloc_test_larval(alg);
        if (IS_ERR(larval))
                goto out;

        list_add(&alg->cra_list, &crypto_alg_list);

        if (larval) {
                /* No cheating! */
                alg->cra_flags &= ~CRYPTO_ALG_TESTED;

                list_add(&larval->alg.cra_list, &crypto_alg_list);
        } else {
                alg->cra_flags |= CRYPTO_ALG_TESTED;
                crypto_alg_finish_registration(alg, algs_to_put);
        }

out:
        return larval;

err:
        larval = ERR_PTR(ret);
        goto out;
}

/*
 * Report the outcome of a self-test.
 *
 * @name: driver name of the algorithm that was tested
 * @err:  test result; 0 for success, -ECANCELED is handled specially
 *        (see below), any other non-zero value is a failure
 *
 * Looks up the live larval whose driver name equals name.  If none is
 * found an error is logged and nothing else happens.  Otherwise the
 * larval is marked dead, the tested algorithm (larval->adult) is updated
 * according to err, the larval is unlinked and everyone waiting on its
 * completion is woken up.
 */
void crypto_alg_tested(const char *name, int err)
{
        struct crypto_larval *test;
        struct crypto_alg *alg;
        struct crypto_alg *q;
        LIST_HEAD(list);

        down_write(&crypto_alg_sem);
        list_for_each_entry(q, &crypto_alg_list, cra_list) {
                if (crypto_is_moribund(q) || !crypto_is_larval(q))
                        continue;

                test = (struct crypto_larval *)q;

                if (!strcmp(q->cra_driver_name, name))
                        goto found;
        }

        pr_err("alg: Unexpected test result for %s: %d\n", name, err);
        up_write(&crypto_alg_sem);
        return;

found:
        /* The larval has served its purpose, whatever the result is. */
        q->cra_flags |= CRYPTO_ALG_DEAD;
        alg = test->adult;

        /* The algorithm was removed while its test was pending. */
        if (crypto_is_dead(alg))
                goto complete;

        /*
         * -ECANCELED still lets the algorithm become tested, but flagged
         * CRYPTO_ALG_FIPS_INTERNAL; any other error leaves it untested.
         */
        if (err == -ECANCELED)
                alg->cra_flags |= CRYPTO_ALG_FIPS_INTERNAL;
        else if (err)
                goto complete;
        else
                alg->cra_flags &= ~CRYPTO_ALG_FIPS_INTERNAL;

        alg->cra_flags |= CRYPTO_ALG_TESTED;

        crypto_alg_finish_registration(alg, &list);

complete:
        list_del_init(&test->alg.cra_list);
        complete_all(&test->completion);

        up_write(&crypto_alg_sem);

        /* Reference drops are done after crypto_alg_sem has been released. */
        crypto_alg_put(&test->alg);
        crypto_remove_final(&list);
}
EXPORT_SYMBOL_GPL(crypto_alg_tested);

/*
 * Drain a list of algorithms linked through cra_list: each entry is
 * unlinked (leaving its cra_list re-initialised) and then has one
 * reference dropped with crypto_alg_put().
 *
 * @list: head of the list to drain; empty on return
 */
void crypto_remove_final(struct list_head *list)
{
        struct crypto_alg *cur, *next;

        /* The _safe iterator is required because cur is unlinked here. */
        list_for_each_entry_safe(cur, next, list, cra_list) {
                list_del_init(&cur->cra_list);
                crypto_alg_put(cur);
        }
}
EXPORT_SYMBOL_GPL(crypto_remove_final);

/*
 * Destroy alg and free the allocation that contains it.
 *
 * The memory block handed to kfree() starts cra_type->algsize bytes
 * before alg.  The start address is computed before the algorithm is
 * destroyed so that alg is not dereferenced afterwards.
 */
static void crypto_free_alg(struct crypto_alg *alg)
{
        u8 *base = (u8 *)alg - alg->cra_type->algsize;

        crypto_destroy_alg(alg);
        kfree(base);
}

/*
 * Register an algorithm.
 *
 * @alg: algorithm to register; its CRYPTO_ALG_DEAD flag is cleared first
 *
 * Returns 0 on success, the error from crypto_check_alg(), -ENOMEM when
 * the CRYPTO_ALG_DUP_FIRST copy cannot be allocated, or the error carried
 * by the pointer that __crypto_register_alg() returns.
 */
int crypto_register_alg(struct crypto_alg *alg)
{
        struct crypto_larval *larval;
        bool test_started = false;
        LIST_HEAD(algs_to_put);
        int err;

        alg->cra_flags &= ~CRYPTO_ALG_DEAD;
        err = crypto_check_alg(alg);
        if (err)
                return err;

        /*
         * With CRYPTO_ALG_DUP_FIRST, register a heap copy instead of the
         * caller's object.  The copy covers the algsize bytes in front of
         * the crypto_alg as well, and is released by crypto_free_alg().
         * A caller-supplied cra_destroy triggers a one-time warning and
         * disables the duplication.
         */
        if (alg->cra_flags & CRYPTO_ALG_DUP_FIRST &&
            !WARN_ON_ONCE(alg->cra_destroy)) {
                unsigned int algsize = alg->cra_type->algsize;
                u8 *p = (u8 *)alg - algsize;

                p = kmemdup(p, algsize + sizeof(*alg), GFP_KERNEL);
                if (!p)
                        return -ENOMEM;

                alg = (void *)(p + algsize);
                alg->cra_destroy = crypto_free_alg;
        }

        down_write(&crypto_alg_sem);
        larval = __crypto_register_alg(alg, &algs_to_put);
        if (!IS_ERR_OR_NULL(larval)) {
                /*
                 * Record, while still holding the lock, whether this
                 * function will schedule the test itself.  When it does
                 * not, crypto_start_tests() picks up larvals that have
                 * test_started == false.
                 */
                test_started = crypto_boot_test_finished();
                larval->test_started = test_started;
        }
        up_write(&crypto_alg_sem);

        if (IS_ERR(larval)) {
                /* NOTE(review): presumably this also frees a DUP_FIRST copy via cra_destroy — confirm in crypto_alg_put(). */
                crypto_alg_put(alg);
                return PTR_ERR(larval);
        }

        if (test_started)
                crypto_schedule_test(larval);
        else
                crypto_remove_final(&algs_to_put);

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_alg);

/*
 * Take alg off the algorithm list and remove everything that depends on
 * it through spawns.
 *
 * @alg:  algorithm to remove
 * @list: forwarded to crypto_remove_spawns()
 *
 * Returns 0 on success or -ENOENT if alg is not linked on any list
 * (its cra_list is empty).
 */
static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list)
{
        struct list_head *link = &alg->cra_list;

        if (unlikely(list_empty(link)))
                return -ENOENT;

        alg->cra_flags |= CRYPTO_ALG_DEAD;
        list_del_init(link);

        crypto_remove_spawns(alg, list, NULL);

        return 0;
}

void crypto_unregister_alg(struct crypto_alg *alg)
{
        int ret;
        LIST_HEAD(list);

        down_write(&crypto_alg_sem);
        ret = crypto_remove_alg(alg, &list);
        up_write(&crypto_alg_sem);

        if (WARN(ret, "Algorithm %s is not registered", alg->cra_driver_name))
                return;

        WARN_ON(!alg->cra_destroy && refcount_read(&alg->cra_refcnt) != 1);

        list_add(&alg->cra_list, &list);
        crypto_remove_final(&list);
}
EXPORT_SYMBOL_GPL(crypto_unregister_alg);

/*
 * Register an array of algorithms, all or nothing.
 *
 * @algs:  array of algorithms
 * @count: number of entries in @algs
 *
 * Returns 0 if every entry was registered.  On the first failure the
 * entries registered so far are unregistered in reverse order and the
 * error of the failing registration is returned.
 */
int crypto_register_algs(struct crypto_alg *algs, int count)
{
        int idx;

        for (idx = 0; idx < count; idx++) {
                int err = crypto_register_alg(&algs[idx]);

                if (!err)
                        continue;

                /* Roll back entries idx - 1 .. 0. */
                while (--idx >= 0)
                        crypto_unregister_alg(&algs[idx]);

                return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_algs);

void crypto_unregister_algs(struct crypto_alg *algs, int count)
{
        int i;

        for (i = 0; i < count; i++)
                crypto_unregister_alg(&algs[i]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_algs);

/*
 * Add a template to crypto_template_list.
 *
 * @tmpl: template to register; its free_work item is initialised here
 *
 * Returns 0 on success or -EEXIST if this very template object is
 * already on the list.
 */
int crypto_register_template(struct crypto_template *tmpl)
{
        struct crypto_template *cur;
        int err = 0;

        INIT_WORK(&tmpl->free_work, crypto_destroy_instance_workfn);

        down_write(&crypto_alg_sem);

        crypto_check_module_sig(tmpl->module);

        list_for_each_entry(cur, &crypto_template_list, list) {
                if (cur == tmpl) {
                        err = -EEXIST;
                        break;
                }
        }

        if (!err)
                list_add(&tmpl->list, &crypto_template_list);

        up_write(&crypto_alg_sem);
        return err;
}
EXPORT_SYMBOL_GPL(crypto_register_template);

/*
 * Register an array of templates, all or nothing.
 *
 * @tmpls: array of templates
 * @count: number of entries in @tmpls
 *
 * Returns 0 if every template was registered.  On the first failure the
 * templates registered so far are unregistered in reverse order and the
 * error of the failing registration is returned.
 */
int crypto_register_templates(struct crypto_template *tmpls, int count)
{
        int idx;

        for (idx = 0; idx < count; idx++) {
                int err = crypto_register_template(&tmpls[idx]);

                if (!err)
                        continue;

                /* Roll back entries idx - 1 .. 0. */
                while (--idx >= 0)
                        crypto_unregister_template(&tmpls[idx]);

                return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_templates);

/*
 * Unregister a template together with all of its instances.
 *
 * @tmpl: template to unregister; it is a BUG if it is not registered
 *
 * Under crypto_alg_sem the template is unlinked and every instance on
 * tmpl->instances is removed with crypto_remove_alg().  After the lock
 * has been dropped the instances are freed, the algorithms collected
 * during removal are released, and the template's free_work is flushed.
 */
void crypto_unregister_template(struct crypto_template *tmpl)
{
        struct crypto_instance *inst;
        struct hlist_node *n;
        struct hlist_head *list;
        LIST_HEAD(users);

        down_write(&crypto_alg_sem);

        BUG_ON(list_empty(&tmpl->list));
        list_del_init(&tmpl->list);

        list = &tmpl->instances;
        hlist_for_each_entry(inst, list, list) {
                int err = crypto_remove_alg(&inst->alg, &users);

                /* Every instance on the list must still be registered. */
                BUG_ON(err);
        }

        up_write(&crypto_alg_sem);

        /* _safe iteration: each instance is freed while walking the list. */
        hlist_for_each_entry_safe(inst, n, list, list) {
                /* Only the registration reference may be left at this point. */
                BUG_ON(refcount_read(&inst->alg.cra_refcnt) != 1);
                crypto_free_instance(inst);
        }
        crypto_remove_final(&users);

        /* Wait for any queued free_work of this template to finish. */
        flush_work(&tmpl->free_work);
}
EXPORT_SYMBOL_GPL(crypto_unregister_template);

/*
 * Unregister an array of templates, last entry first.
 *
 * @tmpls: array of templates
 * @count: number of entries in @tmpls
 */
void crypto_unregister_templates(struct crypto_template *tmpls, int count)
{
        int idx = count;

        while (--idx >= 0)
                crypto_unregister_template(&tmpls[idx]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_templates);

/*
 * Find a registered template by name.
 *
 * Walks crypto_template_list under the read lock and returns the first
 * template named @name on which crypto_tmpl_get() succeeds, or NULL if
 * there is none.  Templates whose crypto_tmpl_get() fails are skipped.
 */
static struct crypto_template *__crypto_lookup_template(const char *name)
{
        struct crypto_template *found = NULL;
        struct crypto_template *cur;

        down_read(&crypto_alg_sem);
        list_for_each_entry(cur, &crypto_template_list, list) {
                if (strcmp(cur->name, name) == 0) {
                        /* Name matches but the template cannot be pinned. */
                        if (unlikely(!crypto_tmpl_get(cur)))
                                continue;

                        found = cur;
                        break;
                }
        }
        up_read(&crypto_alg_sem);

        return found;
}

/*
 * Look up a template by name.
 *
 * Returns the template found by __crypto_lookup_template(), or NULL.
 */
struct crypto_template *crypto_lookup_template(const char *name)
{
        /*
         * NOTE(review): try_then_request_module() presumably requests the
         * module "crypto-<name>" and repeats the lookup when the first
         * attempt returns NULL — confirm against the macro's definition.
         */
        return try_then_request_module(__crypto_lookup_template(name),
                                       "crypto-%s", name);
}
EXPORT_SYMBOL_GPL(crypto_lookup_template);

/*
 * Register an instance created from a template.
 *
 * @tmpl: template the instance was created from
 * @inst: instance to register; its spawns must already have been grabbed
 *
 * Returns 0 on success, the error from crypto_check_alg(), -EAGAIN if
 * one of the instance's spawns is dead, or the error carried by the
 * pointer that __crypto_register_alg() returns.
 */
int crypto_register_instance(struct crypto_template *tmpl,
                             struct crypto_instance *inst)
{
        struct crypto_larval *larval;
        struct crypto_spawn *spawn;
        u32 fips_internal = 0;
        LIST_HEAD(algs_to_put);
        int err;

        err = crypto_check_alg(&inst->alg);
        if (err)
                return err;

        inst->alg.cra_module = tmpl->module;
        inst->alg.cra_flags |= CRYPTO_ALG_INSTANCE;
        inst->alg.cra_destroy = crypto_destroy_instance;

        down_write(&crypto_alg_sem);

        /* Result reported when a dead spawn aborts the loop below. */
        larval = ERR_PTR(-EAGAIN);
        for (spawn = inst->spawns; spawn;) {
                struct crypto_spawn *next;

                if (spawn->dead)
                        goto unlock;

                /*
                 * ->next is read before ->inst is written.
                 * NOTE(review): presumably these two fields share storage
                 * in struct crypto_spawn — confirm against its definition.
                 */
                next = spawn->next;
                spawn->inst = inst;
                spawn->registered = true;

                /* Accumulate the flags of every underlying algorithm. */
                fips_internal |= spawn->alg->cra_flags;

                /*
                 * Drop the module reference on the underlying algorithm;
                 * crypto_drop_spawn() will not drop it again once
                 * ->registered is set.
                 */
                crypto_mod_put(spawn->alg);

                spawn = next;
        }

        /* The instance is FIPS-internal if any underlying algorithm is. */
        inst->alg.cra_flags |= (fips_internal & CRYPTO_ALG_FIPS_INTERNAL);

        larval = __crypto_register_alg(&inst->alg, &algs_to_put);
        if (IS_ERR(larval))
                goto unlock;
        else if (larval)
                /* The test is scheduled by this function, see below. */
                larval->test_started = true;

        hlist_add_head(&inst->list, &tmpl->instances);
        inst->tmpl = tmpl;

unlock:
        up_write(&crypto_alg_sem);

        if (IS_ERR(larval))
                return PTR_ERR(larval);

        if (larval)
                crypto_schedule_test(larval);
        else
                crypto_remove_final(&algs_to_put);

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_instance);

void crypto_unregister_instance(struct crypto_instance *inst)
{
        LIST_HEAD(list);

        down_write(&crypto_alg_sem);

        crypto_remove_spawns(&inst->alg, &list, NULL);
        crypto_remove_instance(inst, &list);

        up_write(&crypto_alg_sem);

        crypto_remove_final(&list);
}
EXPORT_SYMBOL_GPL(crypto_unregister_instance);

/*
 * Look up an algorithm by name and attach @spawn to it on behalf of
 * @inst.
 *
 * @spawn: spawn to initialise; spawn->frontend selects the lookup type
 * @inst:  instance that will own the spawn (must not be NULL)
 * @name:  algorithm name, or an ERR_PTR() which is returned as an error
 * @type:  type flags for the lookup (CRYPTO_ALG_FIPS_INTERNAL is added)
 * @mask:  mask for the lookup; also stored in the spawn
 *
 * Returns 0 on success, -EINVAL if @inst is NULL, the error encoded in
 * @name, the lookup error, or -EAGAIN if the algorithm found is
 * moribund.  On success the spawn keeps the algorithm reference that
 * the lookup returned; on -EAGAIN that reference is dropped again.
 */
int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask)
{
        struct crypto_alg *target;
        int err;

        if (WARN_ON_ONCE(inst == NULL))
                return -EINVAL;

        /* Allow the result of crypto_attr_alg_name() to be passed directly */
        if (IS_ERR(name))
                return PTR_ERR(name);

        target = crypto_find_alg(name, spawn->frontend,
                                 type | CRYPTO_ALG_FIPS_INTERNAL, mask);
        if (IS_ERR(target))
                return PTR_ERR(target);

        down_write(&crypto_alg_sem);
        if (crypto_is_moribund(target)) {
                err = -EAGAIN;
        } else {
                /* Become a user of the algorithm ... */
                list_add(&spawn->list, &target->cra_users);
                spawn->alg = target;
                spawn->mask = mask;

                /* ... and chain the spawn onto the owning instance. */
                spawn->next = inst->spawns;
                inst->spawns = spawn;

                inst->alg.cra_flags |=
                        (target->cra_flags & CRYPTO_ALG_INHERITED_FLAGS);
                err = 0;
        }
        up_write(&crypto_alg_sem);

        if (err)
                crypto_mod_put(target);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_grab_spawn);

/*
 * Release a spawn.
 *
 * A spawn whose ->alg was never set is left untouched.  Otherwise the
 * spawn is unlinked from its list unless it is marked dead, and the
 * module reference on the underlying algorithm is dropped unless the
 * spawn has been marked registered.
 */
void crypto_drop_spawn(struct crypto_spawn *spawn)
{
        if (!spawn->alg) /* not yet initialized? */
                return;

        down_write(&crypto_alg_sem);
        /* A dead spawn is not unlinked here. */
        if (!spawn->dead)
                list_del(&spawn->list);
        up_write(&crypto_alg_sem);

        /*
         * For a registered spawn, crypto_register_instance() has already
         * dropped this reference.
         */
        if (!spawn->registered)
                crypto_mod_put(spawn->alg);
}
EXPORT_SYMBOL_GPL(crypto_drop_spawn);

/*
 * Get a module reference on the algorithm behind a spawn.
 *
 * Returns the algorithm on success, or ERR_PTR(-EAGAIN) if the spawn is
 * dead or crypto_mod_get() fails.  In the latter case the algorithm is
 * additionally passed to crypto_shoot_alg() once the lock has been
 * released.
 */
static struct crypto_alg *crypto_spawn_alg(struct crypto_spawn *spawn)
{
        struct crypto_alg *alg = ERR_PTR(-EAGAIN);
        struct crypto_alg *target;
        bool shoot = false;

        down_read(&crypto_alg_sem);
        if (!spawn->dead) {
                alg = spawn->alg;
                if (!crypto_mod_get(alg)) {
                        /*
                         * Keep a plain reference so that target stays
                         * valid after the lock is dropped.
                         */
                        target = crypto_alg_get(alg);
                        shoot = true;
                        alg = ERR_PTR(-EAGAIN);
                }
        }
        up_read(&crypto_alg_sem);

        /* target is only assigned, and only used, when shoot is true. */
        if (shoot) {
                /* NOTE(review): presumably marks the algorithm as dying — confirm in crypto_shoot_alg(). */
                crypto_shoot_alg(target);
                crypto_alg_put(target);
        }

        return alg;
}

/*
 * Allocate a transform for the algorithm behind a spawn.
 *
 * @spawn: spawn referring to the algorithm
 * @type:  required type flags
 * @mask:  mask selecting which bits of @type are compared
 *
 * Returns the new transform, the error from crypto_spawn_alg(),
 * ERR_PTR(-EINVAL) if the algorithm's flags do not match @type under
 * @mask, or the error from __crypto_alloc_tfm().  The algorithm
 * reference obtained here is dropped again on every failure.
 */
struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type,
                                    u32 mask)
{
        struct crypto_tfm *result;
        struct crypto_alg *alg;

        alg = crypto_spawn_alg(spawn);
        if (IS_ERR(alg))
                return ERR_CAST(alg);

        if (unlikely((alg->cra_flags ^ type) & mask))
                result = ERR_PTR(-EINVAL);
        else
                result = __crypto_alloc_tfm(alg, type, mask);

        /* On success the transform keeps the algorithm reference. */
        if (IS_ERR(result))
                crypto_mod_put(alg);

        return result;
}
EXPORT_SYMBOL_GPL(crypto_spawn_tfm);

/*
 * Create a transform for the algorithm behind a spawn using the spawn's
 * own frontend.
 *
 * Returns the object created by crypto_create_tfm(), or an ERR_PTR()
 * from crypto_spawn_alg() or crypto_create_tfm().  The algorithm
 * reference obtained here is dropped again if creation fails.
 */
void *crypto_spawn_tfm2(struct crypto_spawn *spawn)
{
        struct crypto_tfm *result;
        struct crypto_alg *alg;

        alg = crypto_spawn_alg(spawn);
        if (IS_ERR(alg))
                return ERR_CAST(alg);

        result = crypto_create_tfm(alg, spawn->frontend);

        /* On success the transform keeps the algorithm reference. */
        if (IS_ERR(result))
                crypto_mod_put(alg);

        return result;
}
EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);

/*
 * Add @nb to the crypto notifier chain.
 *
 * Returns whatever blocking_notifier_chain_register() returns.
 */
int crypto_register_notifier(struct notifier_block *nb)
{
        int ret;

        ret = blocking_notifier_chain_register(&crypto_chain, nb);

        return ret;
}
EXPORT_SYMBOL_GPL(crypto_register_notifier);

/*
 * Remove @nb from the crypto notifier chain.
 *
 * Returns whatever blocking_notifier_chain_unregister() returns.
 */
int crypto_unregister_notifier(struct notifier_block *nb)
{
        int ret;

        ret = blocking_notifier_chain_unregister(&crypto_chain, nb);

        return ret;
}
EXPORT_SYMBOL_GPL(crypto_unregister_notifier);

/*
 * Extract the type attribute from a template parameter table.
 *
 * @tb: template parameters; the type attribute is expected in tb[0]
 *
 * Returns the attribute payload, ERR_PTR(-ENOENT) if tb[0] is NULL, or
 * ERR_PTR(-EINVAL) if the payload is too small or the attribute is not
 * of type CRYPTOA_TYPE.
 */
struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb)
{
        struct rtattr *attr = tb[0];

        if (!attr)
                return ERR_PTR(-ENOENT);

        if (RTA_PAYLOAD(attr) < sizeof(struct crypto_attr_type) ||
            attr->rta_type != CRYPTOA_TYPE)
                return ERR_PTR(-EINVAL);

        return RTA_DATA(attr);
}
EXPORT_SYMBOL_GPL(crypto_get_attr_type);

/**
 * crypto_check_attr_type() - check algorithm type and compute inherited mask
 * @tb: the template parameters
 * @type: the algorithm type the template would be instantiated as
 * @mask_ret: (output) the mask that should be passed to crypto_grab_*()
 *              to restrict the flags of any inner algorithms
 *
 * Check that the algorithm type requested by the user is compatible with
 * the type the template would actually produce.  For example,
 * crypto_alloc_shash("cbc(aes)", ...) has to fail here, because the "cbc"
 * template creates an "skcipher" algorithm rather than an "shash" one.
 *
 * On success, also store in @mask_ret the mask to use for restricting the
 * flags of any inner algorithms.  @mask_ret is left untouched on failure.
 *
 * Return: 0 on success; -errno on failure
 */
int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret)
{
        struct crypto_attr_type *algt = crypto_get_attr_type(tb);
        u32 mismatch;

        if (IS_ERR(algt))
                return PTR_ERR(algt);

        /* Bits selected by the requested mask must agree with @type. */
        mismatch = (algt->type ^ type) & algt->mask;
        if (mismatch)
                return -EINVAL;

        *mask_ret = crypto_algt_inherited_mask(algt);

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_check_attr_type);

/*
 * Extract the algorithm name from a CRYPTOA_ALG attribute.
 *
 * Returns the name, ERR_PTR(-ENOENT) if @rta is NULL, or
 * ERR_PTR(-EINVAL) if the payload is too small or the attribute is not
 * of type CRYPTOA_ALG.  The name buffer inside the attribute is
 * NUL-terminated in place before it is returned.
 */
const char *crypto_attr_alg_name(struct rtattr *rta)
{
        struct crypto_attr_alg *alga;

        if (!rta)
                return ERR_PTR(-ENOENT);

        if (RTA_PAYLOAD(rta) < sizeof(*alga) ||
            rta->rta_type != CRYPTOA_ALG)
                return ERR_PTR(-EINVAL);

        alga = RTA_DATA(rta);

        /* Force termination of the name in its last byte. */
        alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0;

        return alga->name;
}
EXPORT_SYMBOL_GPL(crypto_attr_alg_name);

/*
 * Compose the names of an instance from a template name and the names
 * of the underlying algorithm.
 *
 * cra_name becomes "<name>(<alg->cra_name>)" and cra_driver_name becomes
 * "<driver>(<alg->cra_driver_name>)".
 *
 * Returns 0 on success or -ENAMETOOLONG if either string does not fit
 * into CRYPTO_MAX_ALG_NAME bytes.  If the first name is too long, the
 * driver name is not written.
 */
int __crypto_inst_setname(struct crypto_instance *inst, const char *name,
                          const char *driver, struct crypto_alg *alg)
{
        int len;

        len = snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
                       name, alg->cra_name);
        if (len >= CRYPTO_MAX_ALG_NAME)
                return -ENAMETOOLONG;

        len = snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                       "%s(%s)", driver, alg->cra_driver_name);
        if (len >= CRYPTO_MAX_ALG_NAME)
                return -ENAMETOOLONG;

        return 0;
}
EXPORT_SYMBOL_GPL(__crypto_inst_setname);

/*
 * Initialise an empty request queue.
 *
 * @queue:    queue to initialise
 * @max_qlen: queue length from which on new requests are refused or
 *            backlogged by crypto_enqueue_request()
 *
 * The backlog pointer refers to the list head itself while there is no
 * backlogged request.
 */
void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen)
{
        struct list_head *head = &queue->list;

        queue->max_qlen = max_qlen;
        queue->qlen = 0;

        INIT_LIST_HEAD(head);
        queue->backlog = head;
}
EXPORT_SYMBOL_GPL(crypto_init_queue);

/*
 * Append a request to the tail of a queue.
 *
 * Returns -EINPROGRESS if the request was queued while the queue was
 * below max_qlen, -EBUSY if the queue was full and the request was
 * queued as backlog (CRYPTO_TFM_REQ_MAY_BACKLOG set), or -ENOSPC if the
 * queue was full and the request was not queued.
 */
int crypto_enqueue_request(struct crypto_queue *queue,
                           struct crypto_async_request *request)
{
        int status = -EINPROGRESS;

        if (unlikely(queue->qlen >= queue->max_qlen)) {
                if (!(request->flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
                        return -ENOSPC;

                status = -EBUSY;

                /* First backlogged request: the backlog starts here. */
                if (queue->backlog == &queue->list)
                        queue->backlog = &request->list;
        }

        list_add_tail(&request->list, &queue->list);
        queue->qlen++;

        return status;
}
EXPORT_SYMBOL_GPL(crypto_enqueue_request);

/*
 * Insert a request at the head of a queue, regardless of max_qlen.
 *
 * If the queue is already at or above max_qlen, the backlog pointer is
 * moved one entry towards the head before the request is inserted.
 */
void crypto_enqueue_request_head(struct crypto_queue *queue,
                                 struct crypto_async_request *request)
{
        struct list_head *head = &queue->list;

        if (unlikely(queue->qlen >= queue->max_qlen))
                queue->backlog = queue->backlog->prev;

        list_add(&request->list, head);
        queue->qlen++;
}
EXPORT_SYMBOL_GPL(crypto_enqueue_request_head);

struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue)
{
        struct list_head *request;

        if (unlikely(!queue->qlen))
                return NULL;

        queue->qlen--;

        if (queue->backlog != &queue->list)
                queue->backlog = queue->backlog->next;

        request = queue->list.next;
        list_del_init(request);

        return list_entry(request, struct crypto_async_request, list);
}
EXPORT_SYMBOL_GPL(crypto_dequeue_request);

/*
 * Increment a big-endian counter of @size bytes by one, byte by byte.
 *
 * The last byte is incremented first; the carry moves on to the
 * preceding byte only while the byte just incremented wrapped to zero.
 * A counter consisting of all 0xff bytes wraps around to all zeroes.
 */
static inline void crypto_inc_byte(u8 *a, unsigned int size)
{
        unsigned int pos = size;

        while (pos > 0) {
                pos--;

                /* Done as soon as a byte did not wrap around. */
                if (++a[pos] != 0)
                        return;
        }
}

/*
 * Increment a big-endian counter of @size bytes by one.
 *
 * Where 32-bit accesses at the end of the buffer are permissible (the
 * architecture has efficient unaligned access, or the end of the buffer
 * is suitably aligned), the counter is processed in 32-bit words from
 * the end.  Whatever is left, including a carry out of the words
 * processed, is handled byte-wise by crypto_inc_byte().
 */
void crypto_inc(u8 *a, unsigned int size)
{
        __be32 *word = (__be32 *)(a + size);

        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
            IS_ALIGNED((unsigned long)word, __alignof__(*word))) {
                while (size >= 4) {
                        u32 val;

                        word--;
                        val = be32_to_cpu(*word) + 1;
                        *word = cpu_to_be32(val);

                        /* No wrap-around means no carry: finished. */
                        if (likely(val))
                                return;

                        size -= 4;
                }
        }

        crypto_inc_byte(a, size);
}
EXPORT_SYMBOL_GPL(crypto_inc);

/*
 * Compute the extra size needed for a transform context of @alg.
 *
 * This is cra_ctxsize plus the part of cra_alignmask that lies above
 * the alignment reported by crypto_tfm_ctx_alignment().
 */
unsigned int crypto_alg_extsize(struct crypto_alg *alg)
{
        unsigned int base_mask = crypto_tfm_ctx_alignment() - 1;
        unsigned int padding = alg->cra_alignmask & ~base_mask;

        return alg->cra_ctxsize + padding;
}
EXPORT_SYMBOL_GPL(crypto_alg_extsize);

/*
 * Test whether an algorithm matching the given name, frontend, type and
 * mask can be found.
 *
 * Returns 1 if crypto_find_alg() succeeds (the reference it returns is
 * dropped again right away), 0 otherwise.
 */
int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
                        u32 type, u32 mask)
{
        struct crypto_alg *alg;

        alg = crypto_find_alg(name, frontend, type, mask);
        if (IS_ERR(alg))
                return 0;

        crypto_mod_put(alg);

        return 1;
}
EXPORT_SYMBOL_GPL(crypto_type_has_alg);

/*
 * Schedule the self-tests of all test larvals whose test has not been
 * started yet.
 *
 * Does nothing unless the algorithm API is built in and self-tests are
 * enabled.
 */
static void __init crypto_start_tests(void)
{
        if (!IS_BUILTIN(CONFIG_CRYPTO_ALGAPI))
                return;

        if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS))
                return;

        /*
         * NOTE(review): presumably makes crypto_boot_test_finished()
         * return true from here on, so that later registrations schedule
         * their own tests — confirm against its definition.
         */
        set_crypto_boot_test_finished();

        /*
         * Pick one pending test larval per iteration.  The test is
         * scheduled only after crypto_alg_sem has been released, so the
         * list has to be rescanned from the start each time.
         */
        for (;;) {
                struct crypto_larval *larval = NULL;
                struct crypto_alg *q;

                down_write(&crypto_alg_sem);

                list_for_each_entry(q, &crypto_alg_list, cra_list) {
                        struct crypto_larval *l;

                        if (!crypto_is_larval(q))
                                continue;

                        l = (void *)q;

                        if (!crypto_is_test_larval(l))
                                continue;

                        if (l->test_started)
                                continue;

                        /* Claim the larval so that it is not picked again. */
                        l->test_started = true;
                        larval = l;
                        break;
                }

                up_write(&crypto_alg_sem);

                /* No pending test larval left. */
                if (!larval)
                        break;

                crypto_schedule_test(larval);
        }
}

/*
 * Init hook of the algorithm API: runs crypto_init_proc() and then
 * starts the self-tests that are still pending.  Always returns 0.
 */
static int __init crypto_algapi_init(void)
{
        crypto_init_proc();
        crypto_start_tests();
        return 0;
}

/* Exit hook of the algorithm API: calls crypto_exit_proc(). */
static void __exit crypto_algapi_exit(void)
{
        crypto_exit_proc();
}

/*
 * We run this at late_initcall so that all the built-in algorithms
 * have had a chance to register themselves first.
 */
late_initcall(crypto_algapi_init);
module_exit(crypto_algapi_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Cryptographic algorithms API");
/* Soft dependency: cryptomgr is requested to be loaded before this module. */
MODULE_SOFTDEP("pre: cryptomgr");



















































































































    4 

    4 














    4 



    4 







    4 

    4 







    4 

























































































































































































    4 






    4 
























































































































































































































































































    4 






    4 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H

#include <linux/pfn.h>
#include <asm/pgtable.h>

/*
 * Difference between the PMD/PUD shift and PAGE_SHIFT, i.e. the log2 of the
 * number of base pages spanned by one PMD- or PUD-level entry.
 */
#define PMD_ORDER        (PMD_SHIFT - PAGE_SHIFT)
#define PUD_ORDER        (PUD_SHIFT - PAGE_SHIFT)

#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU

#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
        defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif

/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE.  However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING        0UL
#endif

/*
 * This defines the first usable user address. Platforms
 * can override its value with custom FIRST_USER_ADDRESS
 * defined in their respective <asm/pgtable.h>.
 */
#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS        0UL
#endif

/*
 * This defines the generic helper for accessing the PMD page
 * table page. Platforms can still override it via their
 * respective <asm/pgtable.h>.
 */
#ifndef pmd_pgtable
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif

/* Folio containing the page that pmd_page() returns for @pmd. */
#define pmd_folio(pmd) page_folio(pmd_page(pmd))

/*
 * A page table page can be thought of as an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */

/* Index, within a PTE table, of the entry that controls @address. */
static inline unsigned long pte_index(unsigned long address)
{
        unsigned long page_nr = address >> PAGE_SHIFT;

        /* Keep only the bits that select a slot inside one PTE table. */
        return page_nr & (PTRS_PER_PTE - 1);
}

#ifndef pmd_index
/* Index, within a PMD table, of the entry that controls @address. */
static inline unsigned long pmd_index(unsigned long address)
{
        unsigned long pmd_nr = address >> PMD_SHIFT;

        /* Keep only the bits that select a slot inside one PMD table. */
        return pmd_nr & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif

#ifndef pud_index
/* Index, within a PUD table, of the entry that controls @address. */
static inline unsigned long pud_index(unsigned long address)
{
        unsigned long pud_nr = address >> PUD_SHIFT;

        /* Keep only the bits that select a slot inside one PUD table. */
        return pud_nr & (PTRS_PER_PUD - 1);
}
#define pud_index pud_index
#endif

#ifndef pgd_index
/*
 * Index, within the PGD, of the entry that controls address @a.
 * Must be a compile-time constant, so implement it as a macro.
 */
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

#ifndef kernel_pte_init
/*
 * Generic no-op, compiled only when the architecture does not provide its
 * own kernel_pte_init. @addr is ignored.
 */
static inline void kernel_pte_init(void *addr)
{
}
#define kernel_pte_init kernel_pte_init
#endif

#ifndef pmd_init
/*
 * Generic no-op, compiled only when the architecture does not provide its
 * own pmd_init. @addr is ignored.
 */
static inline void pmd_init(void *addr)
{
}
#define pmd_init pmd_init
#endif

#ifndef pud_init
/*
 * Generic no-op, compiled only when the architecture does not provide its
 * own pud_init. @addr is ignored.
 */
static inline void pud_init(void *addr)
{
}
#define pud_init pud_init
#endif

#ifndef pte_offset_kernel
/*
 * Return a pointer to the PTE controlling @address inside the PTE table
 * whose virtual address pmd_page_vaddr() reports for *@pmd.
 */
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
        pte_t *table = (pte_t *)pmd_page_vaddr(*pmd);

        return &table[pte_index(address)];
}
#define pte_offset_kernel pte_offset_kernel
#endif

#ifdef CONFIG_HIGHPTE
/*
 * CONFIG_HIGHPTE variant: map the page pmd_page() reports for *@pmd with
 * kmap_local_page() and return a pointer to the slot for @address.
 */
#define __pte_map(pmd, address) \
        ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
/*
 * Undo the local mapping of @pte, then call rcu_read_unlock().
 * NOTE(review): the matching rcu_read_lock() is not taken in this part of
 * the file; presumably the mapping helper that produced @pte took it.
 */
#define pte_unmap(pte)        do {        \
        kunmap_local((pte));        \
        rcu_read_unlock();        \
} while (0)
#else
/*
 * !CONFIG_HIGHPTE variant: no temporary mapping is set up here, the PTE
 * pointer comes straight from pte_offset_kernel().
 */
static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
{
        pte_t *ptep = pte_offset_kernel(pmd, address);

        return ptep;
}
/*
 * !CONFIG_HIGHPTE variant: there is no mapping to undo, so @pte is unused
 * and only rcu_read_unlock() is called.
 * NOTE(review): the matching rcu_read_lock() is not taken in this part of
 * the file; presumably the mapping helper that produced @pte took it.
 */
static inline void pte_unmap(pte_t *pte)
{
        rcu_read_unlock();
}
#endif

/*
 * Defined outside this header.
 * NOTE(review): presumably frees @pgtable belonging to @mm in a deferred
 * fashion -- confirm against the definition.
 */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);

/* Find an entry in the second-level page table. */
#ifndef pmd_offset
/*
 * Return a pointer to the PMD entry controlling @address inside the PMD
 * table that pud_pgtable() reports for *@pud.
 */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
        unsigned long idx = pmd_index(address);

        return pud_pgtable(*pud) + idx;
}
#define pmd_offset pmd_offset
#endif

#ifndef pud_offset
/*
 * Return a pointer to the PUD entry controlling @address inside the PUD
 * table that p4d_pgtable() reports for *@p4d.
 */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
        unsigned long idx = pud_index(address);

        return p4d_pgtable(*p4d) + idx;
}
#define pud_offset pud_offset
#endif

/*
 * pgd_offset_pgd - pointer to the top-level entry controlling @address.
 * @pgd: base of a top-level (PGD) table.
 * @address: virtual address to look up.
 *
 * Returns @pgd advanced by pgd_index(@address) entries.
 */
static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
        return pgd + pgd_index(address);
}

/*
 * a shortcut to get a pgd_t in a given mm
 */
#ifndef pgd_offset
/* Expands to pgd_offset_pgd() applied to @mm's top-level table, (mm)->pgd. */
#define pgd_offset(mm, address)                pgd_offset_pgd((mm)->pgd, (address))
#endif

/*
 * A shortcut which uses the kernel's page tables (those of init_mm)
 * instead of a process's: expands to pgd_offset(&init_mm, address).
 */
#define pgd_offset_k(address)                pgd_offset(&init_mm, (address))

/*
 * In many cases it is known that a virtual address is mapped at PMD or PTE
 * level, so instead of traversing all the page table levels, we can get a
 * pointer to the PMD entry in user or kernel page table or translate a virtual
 * address to the pointer in the PTE in the kernel page tables with simple
 * helpers.
 */
static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
}

static inline pmd_t *pmd_off_k(unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
}

/*
 * Translate kernel virtual address @vaddr to a pointer to its PTE in the
 * kernel page tables. Returns NULL when the PMD entry is pmd_none().
 */
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
        pmd_t *pmdp = pmd_off_k(vaddr);

        if (pmd_none(*pmdp))
                return NULL;

        return pte_offset_kernel(pmdp, vaddr);
}

#ifndef pmd_young
/*
 * Fallback used when the architecture does not define pmd_young:
 * always reports 0, whatever @pmd contains.
 */
static inline int pmd_young(pmd_t pmd)
{
        return 0;
}
#endif

#ifndef pmd_dirty
/*
 * Fallback used when the architecture does not define pmd_dirty:
 * always reports 0, whatever @pmd contains.
 */
static inline int pmd_dirty(pmd_t pmd)
{
        return 0;
}
#endif

/*
 * A facility to provide lazy MMU batching.  This allows PTE updates and
 * page invalidations to be delayed until a call to leave lazy MMU mode
 * is issued.  Some architectures may benefit from doing this, and it is
 * beneficial for both shadow and direct mode hypervisors, which may batch
 * the PTE updates which happen during this window.  Note that using this
 * interface requires that read hazards be removed from the code.  A read
 * hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
 * a raw PTE pointer after it has been modified are not guaranteed to be
 * up to date.
 *
 * In the general case, no lock is guaranteed to be held between entry and exit
 * of the lazy mode. So the implementation must assume preemption may be enabled
 * and cpu migration is possible; it must take steps to be robust against this.
 * (In practice, for user PTE updates, the appropriate page table lock(s) are
 * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
 * and the mode cannot be used in interrupt context.
 */
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
/*
 * Default no-op enter/leave/flush hooks, used unless the architecture
 * defines __HAVE_ARCH_ENTER_LAZY_MMU_MODE.
 */
static inline void arch_enter_lazy_mmu_mode(void) {}
static inline void arch_leave_lazy_mmu_mode(void) {}
static inline void arch_flush_lazy_mmu_mode(void) {}
#endif

#ifndef pte_batch_hint
/**
 * pte_batch_hint - Number of pages that can be added to batch without scanning.
 * @ptep: Page table pointer for the entry.
 * @pte: Page table entry.
 *
 * Some architectures know that a set of contiguous ptes all map the same
 * contiguous memory with the same permissions. In this case, it can provide a
 * hint to aid pte batching without the core code needing to scan every pte.
 *
 * An architecture implementation may ignore the PTE accessed state. Further,
 * the dirty state must apply atomically to all the PTEs described by the hint.
 *
 * May be overridden by the architecture, else pte_batch_hint is always 1.
 */
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
        /* Generic fallback: neither argument is inspected, the hint is 1. */
        return 1;
}
#endif

#ifndef pte_advance_pfn
/*
 * Return a copy of @pte whose raw value has been increased by @nr shifted
 * left by PFN_PTE_SHIFT, i.e. with the PFN field advanced by @nr.
 */
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
        unsigned long pfn_delta = nr << PFN_PTE_SHIFT;

        return __pte(pte_val(pte) + pfn_delta);
}
#endif

/* @pte advanced by exactly one PFN: pte_advance_pfn(pte, 1). */
#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)

#ifndef set_ptes
/**
 * set_ptes - Map consecutive pages to a contiguous range of addresses.
 * @mm: Address space to map the pages into.
 * @addr: Address to map the first page at.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @nr: Number of pages to map.
 *
 * When nr==1, initial state of pte may be present or not present, and new state
 * may be present or not present. When nr>1, initial state of all ptes must be
 * not present, and new state must be present.
 *
 * May be overridden by the architecture, or the architecture can define
 * set_pte() and PFN_PTE_SHIFT.
 *
 * Context: The caller holds the page table lock.  The pages all belong
 * to the same folio.  The PTEs are all in the same PMD.
 */
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
        /* Run the page table check once for the whole batch. */
        page_table_check_ptes_set(mm, ptep, pte, nr);

        /*
         * Write the entries one by one. The step to the next slot and the
         * next PFN only happens while entries remain; @addr is not used.
         */
        for (;; ptep++, pte = pte_next_pfn(pte)) {
                set_pte(ptep, pte);
                if (!--nr)
                        return;
        }
}
#endif
/* Single-entry form of set_ptes(): sets exactly one PTE (nr == 1). */
#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Out-of-line generic version, declared here unless the architecture
 * defines __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS. The definition lives outside
 * this header.
 */
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);
#endif

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PMD- and PUD-level counterparts for CONFIG_TRANSPARENT_HUGEPAGE builds.
 * Both are defined outside this header.
 */
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub. It must never be reached: BUILD_BUG()
 * is meant to break the build if a call to it is not optimised away.
 */
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmdp,
                                        pmd_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub. It must never be reached: BUILD_BUG()
 * is meant to break the build if a call to it is not optimised away.
 */
static inline int pudp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pud_t *pudp,
                                        pud_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef ptep_get
/*
 * Generic PTE read: one READ_ONCE() load of *@ptep. Used unless the
 * architecture defines ptep_get.
 */
static inline pte_t ptep_get(pte_t *ptep)
{
        pte_t entry = READ_ONCE(*ptep);

        return entry;
}
#endif

#ifndef pmdp_get
/*
 * Generic PMD read: one READ_ONCE() load of *@pmdp. Used unless the
 * architecture defines pmdp_get.
 */
static inline pmd_t pmdp_get(pmd_t *pmdp)
{
        pmd_t entry = READ_ONCE(*pmdp);

        return entry;
}
#endif

#ifndef pudp_get
/*
 * Generic PUD read: one READ_ONCE() load of *@pudp. Used unless the
 * architecture defines pudp_get.
 */
static inline pud_t pudp_get(pud_t *pudp)
{
        pud_t entry = READ_ONCE(*pudp);

        return entry;
}
#endif

#ifndef p4dp_get
/*
 * Generic P4D read: one READ_ONCE() load of *@p4dp. Used unless the
 * architecture defines p4dp_get.
 */
static inline p4d_t p4dp_get(p4d_t *p4dp)
{
        p4d_t entry = READ_ONCE(*p4dp);

        return entry;
}
#endif

#ifndef pgdp_get
/*
 * Generic PGD read: one READ_ONCE() load of *@pgdp. Used unless the
 * architecture defines pgdp_get.
 */
static inline pgd_t pgdp_get(pgd_t *pgdp)
{
        pgd_t entry = READ_ONCE(*pgdp);

        return entry;
}
#endif

#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
/*
 * Generic test-and-clear of the young state of *@ptep.
 *
 * Returns 0 and leaves the entry alone when it is not young; otherwise
 * rewrites it with pte_mkold() through set_pte_at() and returns 1.
 * The read and the write-back are two separate steps.
 */
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pte_t *ptep)
{
        pte_t entry = ptep_get(ptep);

        if (!pte_young(entry))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, pte_mkold(entry));
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
/*
 * Generic test-and-clear of the young state of *@pmdp.
 *
 * Returns 0 and leaves the entry alone when it is not young; otherwise
 * rewrites it with pmd_mkold() through set_pmd_at() and returns 1.
 * The read and the write-back are two separate steps.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t entry = *pmdp;

        if (!pmd_young(entry))
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(entry));
        return 1;
}
#else
/*
 * Stub for builds with neither CONFIG_TRANSPARENT_HUGEPAGE nor
 * CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG. It must never be reached: BUILD_BUG()
 * is meant to break the build if a call to it is not optimised away.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Out-of-line generic version, declared here unless the architecture
 * defines __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH. The definition lives outside
 * this header.
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PMD-level counterpart for CONFIG_TRANSPARENT_HUGEPAGE builds; defined
 * outside this header.
 */
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
#else
/*
 * Despite being relevant to THP only, this API is called from generic rmap
 * code under PageTransHuge(), hence it needs a dummy implementation for !THP.
 */
static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmdp)
{
        /* Must never be reached: meant to break the build if a call survives. */
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef arch_has_hw_nonleaf_pmd_young
/*
 * Return whether the accessed bit in non-leaf PMD entries is supported on the
 * local CPU.
 */
static inline bool arch_has_hw_nonleaf_pmd_young(void)
{
        /* Generic answer: taken directly from the Kconfig option. */
        return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
}
#endif

#ifndef arch_has_hw_pte_young
/*
 * Return whether the accessed bit is supported on the local CPU.
 *
 * This stub assumes accessing through an old PTE triggers a page fault.
 * Architectures that automatically set the access bit should overwrite it.
 */
static inline bool arch_has_hw_pte_young(void)
{
        /* Generic answer: taken directly from the Kconfig option. */
        return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
}
#endif

#ifndef exec_folio_order
/*
 * Returns preferred minimum folio order for executable file-backed memory. Must
 * be in range [0, PMD_ORDER). Default to order-0.
 */
static inline unsigned int exec_folio_order(void)
{
        /* Generic default: order-0, unless the architecture overrides this. */
        return 0;
}
#endif

#ifndef arch_check_zapped_pte
/*
 * Generic no-op, compiled only when the architecture does not define
 * arch_check_zapped_pte. Both arguments are ignored.
 */
static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
                                         pte_t pte)
{
}
#endif

#ifndef arch_check_zapped_pmd
/*
 * Generic no-op, compiled only when the architecture does not define
 * arch_check_zapped_pmd. Both arguments are ignored.
 */
static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
                                         pmd_t pmd)
{
}
#endif

#ifndef arch_check_zapped_pud
/*
 * Generic no-op, compiled only when the architecture does not define
 * arch_check_zapped_pud. Both arguments are ignored.
 */
static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
}
#endif

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Generic get-and-clear: read *@ptep, clear it with pte_clear(), report the
 * old value to the page table check, and return that old value.
 * The read and the clear are two separate steps.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address,
                                       pte_t *ptep)
{
        pte_t old_pte = ptep_get(ptep);

        pte_clear(mm, address, ptep);
        page_table_check_pte_clear(mm, old_pte);

        return old_pte;
}
#endif

#ifndef clear_young_dirty_ptes
/**
 * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the
 *                same folio as old/clean.
 * @vma: VMA the pages are mapped into; its vm_mm is the address space used.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to mark old/clean.
 * @flags: Flags to modify the PTE batch semantics.
 *
 * May be overridden by the architecture; otherwise, implemented by
 * get_and_clear/modify/set for each pte in the range.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t *ptep,
                                          unsigned int nr, cydp_t flags)
{
        pte_t pte;

        for (;;) {
                /* Only the young bit is requested: use the dedicated helper. */
                if (flags == CYDP_CLEAR_YOUNG)
                        ptep_test_and_clear_young(vma, addr, ptep);
                else {
                        /* Otherwise clear the entry, adjust the copy, write it back. */
                        pte = ptep_get_and_clear(vma->vm_mm, addr, ptep);
                        if (flags & CYDP_CLEAR_YOUNG)
                                pte = pte_mkold(pte);
                        if (flags & CYDP_CLEAR_DIRTY)
                                pte = pte_mkclean(pte);
                        set_pte_at(vma->vm_mm, addr, ptep, pte);
                }
                /* The first entry is always processed, so @nr must be at least 1. */
                if (--nr == 0)
                        break;
                ptep++;
                addr += PAGE_SIZE;
        }
}
#endif

/*
 * Clear *@ptep without handing the old value back to the caller. The old
 * value is still sampled first so it can be given to the page table check.
 */
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        pte_t old_pte = ptep_get(ptep);

        pte_clear(mm, addr, ptep);

        /*
         * ptep_get_and_clear() is not needed here: the page table check does
         * not care about any bits HW might have set concurrently.
         */
        page_table_check_pte_clear(mm, old_pte);
}

#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
/*
 * For walking the pagetables without holding any locks.  Some architectures
 * (eg x86-32 PAE) cannot load the entries atomically without using expensive
 * instructions.  We are guaranteed that a PTE will only either go from not
 * present to present, or present to not present -- it will not switch to a
 * completely different present page without a TLB flush in between; which we
 * are blocking by holding interrupts off.
 *
 * Setting ptes from not present to present goes:
 *
 *   ptep->pte_high = h;
 *   smp_wmb();
 *   ptep->pte_low = l;
 *
 * And present to not present goes:
 *
 *   ptep->pte_low = 0;
 *   smp_wmb();
 *   ptep->pte_high = 0;
 *
 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
 * We load pte_high *after* loading pte_low, which ensures we don't see an older
 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
 * picked up a changed pte high. We might have gotten rubbish values from
 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
 * operates on present ptes we're safe.
 */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        pte_t pte;

        /*
         * Load the low half, then the high half, then re-read the low half.
         * Retry until the low half is unchanged across the high-half load.
         * The statement order and both barriers are essential.
         */
        do {
                pte.pte_low = ptep->pte_low;
                /* Order the low load before the high load. */
                smp_rmb();
                pte.pte_high = ptep->pte_high;
                /* Order the high load before the re-check of the low half. */
                smp_rmb();
        } while (unlikely(pte.pte_low != ptep->pte_low));

        return pte;
}
#define ptep_get_lockless ptep_get_lockless

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * PMD-level version of the low/high split read, compiled only when
 * CONFIG_PGTABLE_LEVELS > 2.
 */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
        pmd_t pmd;

        /*
         * Load the low half, then the high half, then re-read the low half.
         * Retry until the low half is unchanged across the high-half load.
         * The statement order and both barriers are essential.
         */
        do {
                pmd.pmd_low = pmdp->pmd_low;
                /* Order the low load before the high load. */
                smp_rmb();
                pmd.pmd_high = pmdp->pmd_high;
                /* Order the high load before the re-check of the low half. */
                smp_rmb();
        } while (unlikely(pmd.pmd_low != pmdp->pmd_low));

        return pmd;
}
#define pmdp_get_lockless pmdp_get_lockless
/*
 * Expands to tlb_remove_table_sync_one().
 * NOTE(review): presumably lets a writer wait for concurrent
 * pmdp_get_lockless() readers -- confirm against that helper.
 */
#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */

/*
 * We require that the PTE can be read atomically.
 */
#ifndef ptep_get_lockless
/*
 * Default lockless PTE read: simply ptep_get(). Used when no other
 * ptep_get_lockless has been defined above.
 */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        pte_t entry = ptep_get(ptep);

        return entry;
}
#endif

#ifndef pmdp_get_lockless
/*
 * Default lockless PMD read: simply pmdp_get(). Used when no other
 * pmdp_get_lockless has been defined above.
 */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
        pmd_t entry = pmdp_get(pmdp);

        return entry;
}
/* Companion of the default pmdp_get_lockless(): does nothing. */
static inline void pmdp_get_lockless_sync(void)
{
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Generic get-and-clear for a PMD entry: read *@pmdp, clear it with
 * pmd_clear(), report the old value to the page table check and return it.
 * @address is not used by this generic version.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t old_pmd = *pmdp;

        pmd_clear(pmdp);
        page_table_check_pmd_clear(mm, old_pmd);

        return old_pmd;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * Generic get-and-clear for a PUD entry: read *@pudp, clear it with
 * pud_clear(), report the old value to the page table check and return it.
 * @address is not used by this generic version.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pud_t *pudp)
{
        pud_t old_pud = *pudp;

        pud_clear(pudp);
        page_table_check_pud_clear(mm, old_pud);

        return old_pud;
}
#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic "full" variant: @full is ignored and the work is delegated to
 * pmdp_huge_get_and_clear() on @vma's mm.
 */
static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pmd_t *pmdp,
                                            int full)
{
        pmd_t old_pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

        return old_pmd;
}
#endif

#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic "full" variant: @full is ignored and the work is delegated to
 * pudp_huge_get_and_clear() on @vma's mm.
 */
static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pud_t *pudp,
                                            int full)
{
        pud_t old_pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);

        return old_pud;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * Generic "full" variant: @full is ignored and the work is delegated to
 * ptep_get_and_clear().
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pte_t *ptep,
                                            int full)
{
        pte_t old_pte = ptep_get_and_clear(mm, address, ptep);

        return old_pte;
}
#endif

#ifndef get_and_clear_full_ptes
/**
 * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
 *                             the same folio, collecting dirty/accessed bits.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the
 * returned PTE.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
        /* The first entry seeds the result; later entries only add bits. */
        pte_t merged = ptep_get_and_clear_full(mm, addr, ptep, full);

        for (--nr; nr != 0; --nr) {
                pte_t cur;

                ptep++;
                addr += PAGE_SIZE;
                cur = ptep_get_and_clear_full(mm, addr, ptep, full);

                /* Fold this entry's dirty/young state into the result. */
                if (pte_dirty(cur))
                        merged = pte_mkdirty(merged);
                if (pte_young(cur))
                        merged = pte_mkyoung(merged);
        }

        return merged;
}
#endif

/**
 * get_and_clear_ptes - Clear present PTEs that map consecutive pages of
 *                        the same folio, collecting dirty/accessed bits.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 *
 * Use this instead of get_and_clear_full_ptes() if it is known that we don't
 * need to clear the full mm, which is mostly the case.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
        /* Same as get_and_clear_full_ptes() with @full fixed to 0. */
        const int full = 0;

        return get_and_clear_full_ptes(mm, addr, ptep, nr, full);
}

#ifndef clear_full_ptes
/**
 * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
 *                     folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_get_and_clear_full().
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr, int full)
{
        /*
         * Clear entry by entry and discard the old values. The step to the
         * next slot/address only happens while entries remain.
         */
        for (;; ptep++, addr += PAGE_SIZE) {
                ptep_get_and_clear_full(mm, addr, ptep, full);
                if (!--nr)
                        return;
        }
}
#endif

/**
 * clear_ptes - Clear a batch of present PTEs that map consecutive pages of one
 *              folio, outside of a full-mm teardown.
 * @mm: Address space the pages are mapped into.
 * @addr: Virtual address at which the first page is mapped.
 * @ptep: Pointer to the first page table entry of the batch.
 * @nr: Number of entries to clear.
 *
 * Convenience wrapper around clear_full_ptes() with @full fixed to 0. Prefer
 * it whenever it is known that the full mm is not being cleared, which is the
 * common case.
 *
 * Apart from the PFN, the PTE bits within the batch may differ; some entries
 * might be write-protected, for instance.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
        const int full = 0;

        clear_full_ptes(mm, addr, ptep, nr, full);
}

/*
 * If two threads concurrently fault at the same page, the thread that
 * won the race updates the PTE and its local TLB/cache. The other thread
 * gives up, simply does nothing, and continues; on architectures where
 * software can update the TLB, the local TLB can be updated here to avoid
 * the next page fault. This function updates the TLB only and does nothing
 * with the cache or anything else; that is how it differs from
 * update_mmu_cache().
 */
#ifndef update_mmu_tlb_range
/* Generic fallback used when the architecture does not define its own: no-op. */
static inline void update_mmu_tlb_range(struct vm_area_struct *vma,
                unsigned long address, pte_t *ptep, unsigned int nr)
{
        /* Nothing to do. */
}
#endif

/* Single-entry convenience wrapper around update_mmu_tlb_range(). */
static inline void update_mmu_tlb(struct vm_area_struct *vma,
                unsigned long address, pte_t *ptep)
{
        const unsigned int nr_entries = 1;

        update_mmu_tlb_range(vma, address, ptep, nr_entries);
}

/*
 * Some architectures may be able to avoid expensive synchronization
 * primitives when modifications are made to PTE's which are already
 * not present, or in the process of an address space destruction.
 */
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
/*
 * Generic version: @full is ignored and the entry is simply cleared with
 * pte_clear().
 */
static inline void pte_clear_not_present_full(struct mm_struct *mm,
                unsigned long address, pte_t *ptep, int full)
{
        pte_clear(mm, address, ptep);
}
#endif

#ifndef clear_not_present_full_ptes
/**
 * clear_not_present_full_ptes - Clear a batch of not-present PTEs that are
 *                                 consecutive in the page table.
 * @mm: Address space the ptes represent.
 * @addr: Address of the first pte.
 * @ptep: Pointer to the first page table entry of the batch.
 * @nr: Number of entries to clear. The first entry is cleared
 *      unconditionally, so this must be at least 1.
 * @full: Whether the whole mm is being cleared.
 *
 * Architectures may override this; the generic version simply calls
 * pte_clear_not_present_full() on each entry in turn.
 *
 * Context: The caller holds the page table lock.  The PTEs are all not present.
 * The PTEs are all in the same PMD.
 */
static inline void clear_not_present_full_ptes(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
        do {
                pte_clear_not_present_full(mm, addr, ptep, full);
                ptep++;
                addr += PAGE_SIZE;
        } while (--nr);
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pmd_t *pmdp);
extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pud_t *pudp);
#endif

#ifndef pte_mkwrite
/* Default: @vma is not consulted; defer to the VMA-less helper. */
static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        pte_t writable = pte_mkwrite_novma(pte);

        return writable;
}
#endif

#if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite)
/* Default: @vma is not consulted; defer to the VMA-less helper. */
static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        pmd_t writable = pmd_mkwrite_novma(pmd);

        return writable;
}
#endif

#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
struct mm_struct;
/*
 * Generic write-protect: read the current entry, strip write permission with
 * pte_wrprotect() and store the result back with set_pte_at().
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
        pte_t wrprotected = pte_wrprotect(ptep_get(ptep));

        set_pte_at(mm, address, ptep, wrprotected);
}
#endif

#ifndef wrprotect_ptes
/**
 * wrprotect_ptes - Write-protect a batch of PTEs that map consecutive pages of
 *                    one folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Virtual address at which the first page is mapped.
 * @ptep: Pointer to the first page table entry of the batch.
 * @nr: Number of entries to write-protect. The first entry is processed
 *      unconditionally, so this must be at least 1.
 *
 * Architectures may override this; the generic version simply calls
 * ptep_set_wrprotect() on each entry in turn.
 *
 * Apart from the PFN, the PTE bits within the batch may differ; some entries
 * might already be write-protected, for instance.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
        do {
                ptep_set_wrprotect(mm, addr, ptep);
                ptep++;
                addr += PAGE_SIZE;
        } while (--nr);
}
#endif

/*
 * On some architectures the hardware does not set the page access bit when a
 * memory page is accessed; setting this bit is the responsibility of software.
 * Tracking the page access bit this way incurs an extra page fault penalty. As
 * an optimization, the page access bit can be set during the whole page fault
 * flow on these architectures.
 * To differentiate it from the pte_mkyoung macro, this macro is used on
 * platforms where software maintains the page access bit.
 */
#ifndef pte_sw_mkyoung
/* Default when the architecture provides no override: @pte is returned as-is. */
static inline pte_t pte_sw_mkyoung(pte_t pte)
{
        pte_t unchanged = pte;

        return unchanged;
}
#define pte_sw_mkyoung        pte_sw_mkyoung
#endif

#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Generic THP version: read the current PMD, strip write permission with
 * pmd_wrprotect() and store the result back with set_pmd_at().
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        pmd_t wrprotected = pmd_wrprotect(*pmdp);

        set_pmd_at(mm, address, pmdp, wrprotected);
}
#else
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE this must never be reached; BUILD_BUG()
 * turns any call that survives to code generation into a build failure.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Generic THP version: read the current PUD, strip write permission with
 * pud_wrprotect() and store the result back with set_pud_at().
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        pud_t wrprotected = pud_wrprotect(*pudp);

        set_pud_at(mm, address, pudp, wrprotected);
}
#else
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE this must never be reached; BUILD_BUG()
 * turns any call that survives to code generation into a build failure.
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                unsigned long address, pud_t *pudp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif

#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
#else
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE this must never be reached; BUILD_BUG()
 * turns any surviving call into a build failure. The return statement only
 * exists to satisfy the function's return type.
 */
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
        return *pmdp;
}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                       pgtable_t pgtable);
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif

#ifndef arch_needs_pgtable_deposit
/* Default when the architecture does not define it: always false. */
#define arch_needs_pgtable_deposit() (false)
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * A pmdp_establish() implementation that is only suitable for architectures
 * without hardware dirty/accessed bits. No CPU can be setting those bits
 * behind our back there, so a plain, non-atomic read followed by a store is
 * sufficient.
 *
 * Returns the PMD value that was in place before @pmd was installed.
 */
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t previous = *pmdp;

        set_pmd_at(mm, address, pmdp, pmd);

        return previous;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD

/*
 * pmdp_invalidate_ad() invalidates the PMD while changing a transparent
 * hugepage mapping in the page tables. This function is similar to
 * pmdp_invalidate(), but should only be used if the access and dirty bits would
 * not be cleared by the software in the new PMD value. The function ensures
 * that hardware updates of the access and dirty bits are not lost.
 *
 * Doing so allows certain architectures to avoid a TLB flush in most
 * cases. However, another TLB flush might be necessary later if the PMD update
 * itself requires such a flush (e.g., if protection was set to be stricter).
 * Even when a TLB flush is needed because of the update, the caller may be able
 * to batch these TLB flushing operations, so fewer TLB flush operations are
 * needed.
 */
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PTE_SAME
/* Returns 1 when both PTEs have the same raw pte_val(), 0 otherwise. */
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
        if (pte_val(pte_a) != pte_val(pte_b))
                return 0;
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PTE_UNUSED
/*
 * Some architectures let virtualization guests flag allocated pages as
 * unused, so that the host can transparently reclaim them. This helper
 * reports whether the page behind @pte is unused; the generic version
 * always answers "no" (0).
 */
static inline int pte_unused(pte_t pte)
{
        const int unused = 0;

        return unused;
}
#endif

#ifndef pte_access_permitted
/*
 * Default check: access is permitted when the PTE is present and, for a
 * write access, the PTE is also writable.
 */
#define pte_access_permitted(pte, write) \
        (pte_present(pte) && (!(write) || pte_write(pte)))
#endif

#ifndef pmd_access_permitted
/*
 * Default check: access is permitted when the PMD is present and, for a
 * write access, the PMD is also writable.
 */
#define pmd_access_permitted(pmd, write) \
        (pmd_present(pmd) && (!(write) || pmd_write(pmd)))
#endif

#ifndef pud_access_permitted
/*
 * Default check: access is permitted when the PUD is present and, for a
 * write access, the PUD is also writable.
 */
#define pud_access_permitted(pud, write) \
        (pud_present(pud) && (!(write) || pud_write(pud)))
#endif

#ifndef p4d_access_permitted
/*
 * Default check: access is permitted when the P4D is present and, for a
 * write access, the P4D is also writable.
 */
#define p4d_access_permitted(p4d, write) \
        (p4d_present(p4d) && (!(write) || p4d_write(p4d)))
#endif

#ifndef pgd_access_permitted
/*
 * Default check: access is permitted when the PGD is present and, for a
 * write access, the PGD is also writable.
 */
#define pgd_access_permitted(pgd, write) \
        (pgd_present(pgd) && (!(write) || pgd_write(pgd)))
#endif

#ifndef __HAVE_ARCH_PMD_SAME
/* Returns 1 when both PMDs have the same raw pmd_val(), 0 otherwise. */
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
        if (pmd_val(pmd_a) != pmd_val(pmd_b))
                return 0;
        return 1;
}
#endif

#ifndef pud_same
/* Returns 1 when both PUDs have the same raw pud_val(), 0 otherwise. */
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
        if (pud_val(pud_a) != pud_val(pud_b))
                return 0;
        return 1;
}
#define pud_same pud_same
#endif

#ifndef __HAVE_ARCH_P4D_SAME
/* Returns 1 when both P4Ds have the same raw p4d_val(), 0 otherwise. */
static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
{
        if (p4d_val(p4d_a) != p4d_val(p4d_b))
                return 0;
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PGD_SAME
/* Returns 1 when both PGDs have the same raw pgd_val(), 0 otherwise. */
static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
{
        if (pgd_val(pgd_a) != pgd_val(pgd_b))
                return 0;
        return 1;
}
#endif

#ifndef __HAVE_ARCH_DO_SWAP_PAGE
/* No-op used when the architecture does not define __HAVE_ARCH_DO_SWAP_PAGE. */
static inline void arch_do_swap_page_nr(struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long addr,
                pte_t pte, pte_t oldpte, int nr)
{
        /* Nothing to do. */
}
#else
/*
 * Some architectures keep metadata alongside a page. That metadata has to be
 * saved when the page is swapped out so that it can be restored on swap-in.
 * SPARC M7 and newer processors, for example, attach an ADI (Application Data
 * Integrity) tag to the page. arch_do_swap_page() can restore this metadata
 * when a page is swapped back in.
 *
 * This batch variant calls arch_do_swap_page() once for each of the @nr pages,
 * advancing the address by PAGE_SIZE and both PTE values by one PFN per step.
 * Note that the mm handed to arch_do_swap_page() is vma->vm_mm, not @mm.
 */
static inline void arch_do_swap_page_nr(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long addr,
                                        pte_t pte, pte_t oldpte,
                                        int nr)
{
        int idx = 0;

        while (idx < nr) {
                arch_do_swap_page(vma->vm_mm, vma, addr + idx * PAGE_SIZE,
                                pte_advance_pfn(pte, idx),
                                pte_advance_pfn(oldpte, idx));
                idx++;
        }
}
#endif

#ifndef __HAVE_ARCH_UNMAP_ONE
/*
 * Some architectures keep metadata alongside a page. That metadata has to be
 * saved when the page is swapped out so that it can be restored on swap-in.
 * SPARC M7 and newer processors, for example, attach an ADI (Application Data
 * Integrity) tag to the page. arch_unmap_one() can save this metadata when a
 * page is swapped out.
 *
 * The generic version does nothing and returns 0.
 */
static inline int arch_unmap_one(struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long addr,
                pte_t orig_pte)
{
        return 0;
}
#endif

/*
 * Allow architectures to preserve additional metadata associated with
 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
 * prototypes must be defined in the arch-specific asm/pgtable.h file.
 */
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
/* Generic version: nothing to prepare for @folio; returns 0. */
static inline int arch_prepare_to_swap(struct folio *folio)
{
        const int ret = 0;

        return ret;
}
#endif

#ifndef __HAVE_ARCH_SWAP_INVALIDATE
/* Generic version: no arch metadata is kept per swap slot, so this is a no-op. */
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
        /* Nothing to do. */
}

/* Generic version: no arch metadata is kept per swap area, so this is a no-op. */
static inline void arch_swap_invalidate_area(int type)
{
        /* Nothing to do. */
}
#endif

#ifndef __HAVE_ARCH_SWAP_RESTORE
/* Generic version: there is no arch metadata to restore, so this is a no-op. */
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
        /* Nothing to do. */
}
#endif

#ifndef __HAVE_ARCH_MOVE_PTE
/* Default: the PTE value is used unchanged; both addresses are ignored. */
#define move_pte(pte, old_addr, new_addr)        (pte)
#endif

#ifndef pte_accessible
/*
 * Default: every PTE counts as accessible. @pte is still evaluated (and cast
 * to void) so the argument is not flagged as unused; @mm is not evaluated.
 */
# define pte_accessible(mm, pte)        ((void)(pte), 1)
#endif

#ifndef flush_tlb_fix_spurious_fault
/* Default: flush the single page with flush_tlb_page(); @ptep is unused. */
#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
#endif

/*
 * When walking page tables, get the address of the next boundary,
 * or the end address of the range if that comes earlier.  Although no
 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
 */

/*
 * Next PGDIR_SIZE-aligned boundary after @addr, or @end if that comes first.
 * Both sides of the comparison are decremented so that a value that wrapped
 * to 0 compares as the largest address instead of the smallest.
 */
#define pgd_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;        \
        (__boundary - 1 < (end) - 1)? __boundary: (end);                \
})

#ifndef p4d_addr_end
/*
 * Next P4D_SIZE-aligned boundary after @addr, or @end if that comes first.
 * Both sides of the comparison are decremented so that a value that wrapped
 * to 0 compares as the largest address instead of the smallest.
 */
#define p4d_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;        \
        (__boundary - 1 < (end) - 1)? __boundary: (end);                \
})
#endif

#ifndef pud_addr_end
/*
 * Next PUD_SIZE-aligned boundary after @addr, or @end if that comes first.
 * Both sides of the comparison are decremented so that a value that wrapped
 * to 0 compares as the largest address instead of the smallest.
 */
#define pud_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;        \
        (__boundary - 1 < (end) - 1)? __boundary: (end);                \
})
#endif

#ifndef pmd_addr_end
/*
 * Next PMD_SIZE-aligned boundary after @addr, or @end if that comes first.
 * Both sides of the comparison are decremented so that a value that wrapped
 * to 0 compares as the largest address instead of the smallest.
 */
#define pmd_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;        \
        (__boundary - 1 < (end) - 1)? __boundary: (end);                \
})
#endif

/*
 * When walking page tables, we usually want to skip any p?d_none entries;
 * and any p?d_bad entries - reporting the error before resetting to none.
 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
 */
void pgd_clear_bad(pgd_t *);

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
#else
/* With the P4D level folded there is nothing to clear: expands to a no-op. */
#define p4d_clear_bad(p4d)        do { } while (0)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
#else
/* With the PUD level folded there is nothing to clear: expands to a no-op. */
#define pud_clear_bad(pud)        do { } while (0)
#endif

void pmd_clear_bad(pmd_t *);

/*
 * Returns 1 if the entry should be skipped: either it is none, or it is bad,
 * in which case pgd_clear_bad() is called on it first. Returns 0 otherwise.
 */
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
        if (!pgd_none(*pgd)) {
                if (!unlikely(pgd_bad(*pgd)))
                        return 0;
                pgd_clear_bad(pgd);
        }
        return 1;
}

/*
 * Returns 1 if the entry should be skipped: either it is none, or it is bad,
 * in which case p4d_clear_bad() is called on it first. Returns 0 otherwise.
 */
static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
        if (!p4d_none(*p4d)) {
                if (!unlikely(p4d_bad(*p4d)))
                        return 0;
                p4d_clear_bad(p4d);
        }
        return 1;
}

/*
 * Returns 1 if the entry should be skipped: either it is none, or it is bad,
 * in which case pud_clear_bad() is called on it first. Returns 0 otherwise.
 */
static inline int pud_none_or_clear_bad(pud_t *pud)
{
        if (!pud_none(*pud)) {
                if (!unlikely(pud_bad(*pud)))
                        return 0;
                pud_clear_bad(pud);
        }
        return 1;
}

/*
 * Returns 1 if the entry should be skipped: either it is none, or it is bad,
 * in which case pmd_clear_bad() is called on it first. Returns 0 otherwise.
 */
static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
        if (!pmd_none(*pmd)) {
                if (!unlikely(pmd_bad(*pmd)))
                        return 0;
                pmd_clear_bad(pmd);
        }
        return 1;
}

/*
 * Fetch the current pte and clear it in a single ptep_get_and_clear() step.
 * Leaving the entry non-present keeps the hardware from updating it
 * asynchronously while the caller works on the returned value.
 */
static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
        struct mm_struct *mm = vma->vm_mm;

        return ptep_get_and_clear(mm, addr, ptep);
}

/*
 * Install @pte with set_pte_at(). The entry is non-present at this point, so
 * there is no hardware-maintained state in it that would need preserving.
 */
static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep, pte_t pte)
{
        struct mm_struct *mm = vma->vm_mm;

        set_pte_at(mm, addr, ptep, pte);
}

#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Begin a read-modify-write transaction on a pte's protection bits that is
 * safe against asynchronous hardware modifications of the pte. The goal is
 * not to stop the hardware from updating the pte, but to make sure that no
 * update it makes gets lost.
 *
 * Other software modifications of the pte are not guarded against; the
 * appropriate pte lock must be held for the whole transaction.
 *
 * The interface is meant to be batchable: ptep_modify_prot_commit() may only
 * queue the update instead of applying it immediately. The update must
 * nevertheless be really committed before the pte lock is released.
 *
 * The generic version simply defers to __ptep_modify_prot_start().
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
        pte_t old_pte = __ptep_modify_prot_start(vma, addr, ptep);

        return old_pte;
}

/*
 * Commit an updated pte while leaving any hardware-controlled bits in the PTE
 * untouched. The pte handed out by ptep_modify_prot_start() may have gained
 * young and/or dirty bits that were not set before, so the updated pte may
 * carry those additional changes.
 *
 * The generic version ignores @old_pte and defers to
 * __ptep_modify_prot_commit().
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte)
{
        __ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */

/**
 * modify_prot_start_ptes - Start a pte protection read-modify-write transaction
 * over a batch of ptes, which protects against asynchronous hardware
 * modifications to the ptes. The intention is not to prevent the hardware from
 * making pte updates, but to prevent any updates it may make from being lost.
 * Please see the comment above ptep_modify_prot_start() for full description.
 *
 * @vma: The virtual memory area the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte
 * in the batch.
 *
 * Note that PTE bits in the PTE batch besides the PFN can differ.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio. All other PTE bits must be identical for
 * all PTEs in the batch except for young and dirty bits.  The PTEs are all in
 * the same PMD.
 */
#ifndef modify_prot_start_ptes
/*
 * Generic fallback: start the transaction on every entry of the batch with
 * ptep_modify_prot_start(). The value returned is that of the first entry,
 * with the dirty/young bits of all later entries folded in.
 */
static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep, unsigned int nr)
{
        pte_t batch_pte = ptep_modify_prot_start(vma, addr, ptep);

        /* The first entry is already done; walk the remaining nr - 1. */
        for (--nr; nr; --nr) {
                pte_t cur_pte;

                ++ptep;
                addr += PAGE_SIZE;
                cur_pte = ptep_modify_prot_start(vma, addr, ptep);

                if (pte_dirty(cur_pte))
                        batch_pte = pte_mkdirty(batch_pte);
                if (pte_young(cur_pte))
                        batch_pte = pte_mkyoung(batch_pte);
        }

        return batch_pte;
}
#endif

/**
 * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any
 * hardware-controlled bits in the PTE unmodified.
 *
 * @vma: The virtual memory area the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @old_pte: Old page table entry (for the first entry) which is now cleared.
 * @pte: New page table entry to be set.
 * @nr: Number of entries.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_modify_prot_commit().
 *
 * Context: The caller holds the page table lock. The PTEs are all in the same
 * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by
 * ptep_modify_prot_start() may additionally have young and/or dirty bits set
 * where previously they were not, so the updated ptes may have these
 * additional changes.
 */
#ifndef modify_prot_commit_ptes
/*
 * Generic fallback: commit the batch one entry at a time with
 * ptep_modify_prot_commit(). After each entry, @old_pte and @pte are advanced
 * to the next PFN so that consecutive entries map consecutive pages with the
 * same protection bits.
 *
 * The loop counter is unsigned to match the type of @nr, so the bound is
 * compared without any signed/unsigned conversion.
 */
static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
                pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr)
{
        unsigned int i;

        for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) {
                ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);

                /* Advance PFN only, set same prot */
                old_pte = pte_next_pfn(old_pte);
                pte = pte_next_pfn(pte);
        }
}
#endif

/*
 * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
 * and let generic vmalloc, ioremap and page table update code know when
 * arch_sync_kernel_mappings() needs to be called.
 */
#ifndef ARCH_PAGE_TABLE_SYNC_MASK
/* Default: empty mask, i.e. no page table level is flagged for syncing. */
#define ARCH_PAGE_TABLE_SYNC_MASK 0
#endif

/*
 * There is no default implementation for arch_sync_kernel_mappings(). It is
 * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK
 * is 0.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end);

#endif /* CONFIG_MMU */

/*
 * No-op macros that just return the current protection value. Defined here
 * because these macros can be used even if CONFIG_MMU is not defined.
 */

#ifndef pgprot_nx
/* Default: @prot is returned unchanged. */
#define pgprot_nx(prot)        (prot)
#endif

#ifndef pgprot_noncached
/* Default: @prot is returned unchanged. */
#define pgprot_noncached(prot)        (prot)
#endif

#ifndef pgprot_writecombine
/* Default: alias for pgprot_noncached (which may be the arch's own version). */
#define pgprot_writecombine pgprot_noncached
#endif

#ifndef pgprot_writethrough
/* Default: alias for pgprot_noncached (which may be the arch's own version). */
#define pgprot_writethrough pgprot_noncached
#endif

#ifndef pgprot_device
/* Default: alias for pgprot_noncached (which may be the arch's own version). */
#define pgprot_device pgprot_noncached
#endif

#ifndef pgprot_mhp
/* Default: @prot is returned unchanged. */
#define pgprot_mhp(prot)        (prot)
#endif

#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
/*
 * Carry the caching attributes of @oldprot over to @newprot.
 *
 * For each of noncached, writecombine and device: if applying the attribute
 * to @oldprot leaves it unchanged, @oldprot already has that attribute, so it
 * is applied to @newprot as well. The three checks are independent and are
 * applied in sequence, each on top of the previous result.
 */
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        if (pgprot_val(pgprot_noncached(oldprot)) == pgprot_val(oldprot))
                newprot = pgprot_noncached(newprot);

        if (pgprot_val(pgprot_writecombine(oldprot)) == pgprot_val(oldprot))
                newprot = pgprot_writecombine(newprot);

        if (pgprot_val(pgprot_device(oldprot)) == pgprot_val(oldprot))
                newprot = pgprot_device(newprot);

        return newprot;
}
#endif
#endif /* CONFIG_MMU */

#ifndef pgprot_encrypted
/* Default: @prot is returned unchanged. */
#define pgprot_encrypted(prot)        (prot)
#endif

#ifndef pgprot_decrypted
/* Default: @prot is returned unchanged. */
#define pgprot_decrypted(prot)        (prot)
#endif

/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
/* Default: no-op; @prev is not evaluated. */
#define arch_start_context_switch(prev)        do {} while (0)
#endif

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* Stub for !CONFIG_ARCH_ENABLE_THP_MIGRATION: @pmd is returned unchanged. */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        pmd_t unchanged = pmd;

        return unchanged;
}

/* Stub for !CONFIG_ARCH_ENABLE_THP_MIGRATION: never reports soft-dirty. */
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        const int soft_dirty = 0;

        return soft_dirty;
}

/* Stub for !CONFIG_ARCH_ENABLE_THP_MIGRATION: @pmd is returned unchanged. */
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        pmd_t unchanged = pmd;

        return unchanged;
}
#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: never reports soft-dirty. */
static inline int pte_soft_dirty(pte_t pte)
{
        const int soft_dirty = 0;

        return soft_dirty;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: never reports soft-dirty. */
static inline int pmd_soft_dirty(pmd_t pmd)
{
        const int soft_dirty = 0;

        return soft_dirty;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: @pte is returned unchanged. */
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        pte_t unchanged = pte;

        return unchanged;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: @pmd is returned unchanged. */
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        pmd_t unchanged = pmd;

        return unchanged;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: @pte is returned unchanged. */
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        pte_t unchanged = pte;

        return unchanged;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: @pmd is returned unchanged. */
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        pmd_t unchanged = pmd;

        return unchanged;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: @pte is returned unchanged. */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        pte_t unchanged = pte;

        return unchanged;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: never reports soft-dirty. */
static inline int pte_swp_soft_dirty(pte_t pte)
{
        const int soft_dirty = 0;

        return soft_dirty;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: @pte is returned unchanged. */
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        pte_t unchanged = pte;

        return unchanged;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: @pmd is returned unchanged. */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        pmd_t unchanged = pmd;

        return unchanged;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: never reports soft-dirty. */
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        const int soft_dirty = 0;

        return soft_dirty;
}

/* Stub for !CONFIG_HAVE_ARCH_SOFT_DIRTY: @pmd is returned unchanged. */
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        pmd_t unchanged = pmd;

        return unchanged;
}
#endif

#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
 */

/* Stub without __HAVE_PFNMAP_TRACKING: leaves @prot untouched and returns 0. */
static inline int pfnmap_setup_cachemode(unsigned long pfn,
                unsigned long size, pgprot_t *prot)
{
        return 0;
}

/* Stub without __HAVE_PFNMAP_TRACKING: leaves @prot untouched and returns 0. */
static inline int pfnmap_track(unsigned long pfn,
                unsigned long size, pgprot_t *prot)
{
        return 0;
}

/* Stub without __HAVE_PFNMAP_TRACKING: nothing was tracked, so this is a no-op. */
static inline void pfnmap_untrack(unsigned long pfn, unsigned long size)
{
        /* Nothing to do. */
}
#else
/**
 * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 * @prot: the pgprot to modify
 *
 * Lookup the cachemode for the pfn range starting at @pfn with the size
 * @size and store it in @prot, leaving other data in @prot unchanged.
 *
 * This allows for a hardware implementation to have fine-grained control of
 * memory cache behavior at page level granularity. Without a hardware
 * implementation, this function does nothing.
 *
 * Currently there is only one implementation for this - x86 Page Attribute
 * Table (PAT). See Documentation/arch/x86/pat.rst for more details.
 *
 * This function can fail if the pfn range spans pfns that require differing
 * cachemodes. If the pfn range was previously verified to have a single
 * cachemode, it is sufficient to query only a single pfn. The assumption is
 * that this is the case for drivers using the vmf_insert_pfn*() interface.
 *
 * Returns 0 on success and -EINVAL on error.
 */
int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size,
                pgprot_t *prot);

/**
 * pfnmap_track - track a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 * @prot: the pgprot to track
 *
 * Request the pfn range to be 'tracked' by a hardware implementation and
 * set up the cachemode in @prot similar to pfnmap_setup_cachemode().
 *
 * This allows for fine-grained control of memory cache behaviour at page
 * level granularity. Tracking memory this way is persisted across VMA splits
 * (VMA merging does not apply for VM_PFNMAP).
 *
 * Currently, there is only one implementation for this - x86 Page Attribute
 * Table (PAT). See Documentation/arch/x86/pat.rst for more details.
 *
 * Returns 0 on success and -EINVAL on error.
 */
int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot);

/**
 * pfnmap_untrack - untrack a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 *
 * Untrack a pfn range previously tracked through pfnmap_track().
 */
void pfnmap_untrack(unsigned long pfn, unsigned long size);
#endif

/**
 * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn
 * @pfn: the pfn
 * @prot: the pgprot to modify
 *
 * Single-page wrapper around pfnmap_setup_cachemode(): looks up the cachemode
 * for the PAGE_SIZE range starting at @pfn and stores it in @prot, leaving
 * other data in @prot unchanged. The return value of
 * pfnmap_setup_cachemode() is discarded.
 *
 * See pfnmap_setup_cachemode() for details.
 */
static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
{
        (void)pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot);
}

#ifdef CONFIG_MMU
#ifdef __HAVE_COLOR_ZERO_PAGE
/*
 * Colored zero pages: @pfn is a zero pfn when it lies in the range that starts
 * at zero_pfn and spans (zero_page_mask >> PAGE_SHIFT) further pfns. The
 * subtraction is unsigned, so a @pfn below zero_pfn wraps to a huge offset
 * and fails the single comparison.
 */
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;
        unsigned long offset = pfn - zero_pfn;

        if (offset <= (zero_page_mask >> PAGE_SHIFT))
                return 1;
        return 0;
}

/* Colored zero pages: the zero pfn is that of ZERO_PAGE() for @addr. */
#define my_zero_pfn(addr)        page_to_pfn(ZERO_PAGE(addr))

#else
/* Returns 1 when @pfn equals zero_pfn, 0 otherwise. */
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;

        if (pfn != zero_pfn)
                return 0;
        return 1;
}

/* Returns zero_pfn regardless of @addr. */
static inline unsigned long my_zero_pfn(unsigned long addr)
{
        extern unsigned long zero_pfn;
        unsigned long pfn = zero_pfn;

        return pfn;
}
#endif
#else
/* !CONFIG_MMU stub: no pfn is ever reported as the zero pfn. */
static inline int is_zero_pfn(unsigned long pfn)
{
        const int is_zero = 0;

        return is_zero;
}

/* !CONFIG_MMU stub: always returns 0, whatever @addr is. */
static inline unsigned long my_zero_pfn(unsigned long addr)
{
        const unsigned long pfn = 0;

        return pfn;
}
#endif /* CONFIG_MMU */

#ifdef CONFIG_MMU

#ifndef CONFIG_TRANSPARENT_HUGEPAGE
/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: always returns 0. */
static inline int pmd_trans_huge(pmd_t pmd)
{
        const int huge = 0;

        return huge;
}
#ifndef pmd_write
/*
 * Fallback for !CONFIG_TRANSPARENT_HUGEPAGE when no pmd_write() is defined:
 * calling it triggers BUG(). The return statement only exists to satisfy the
 * function's return type.
 */
static inline int pmd_write(pmd_t pmd)
{
        BUG();

        return 0;
}
#endif /* pmd_write */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef pud_write
/*
 * Fallback used when no pud_write() is defined: calling it triggers BUG().
 * The return statement only exists to satisfy the function's return type.
 */
static inline int pud_write(pud_t pud)
{
        BUG();

        return 0;
}
#endif /* pud_write */

#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
        !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/*
 * Stub used unless both CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are set: always returns 0.
 */
static inline int pud_trans_huge(pud_t pud)
{
        const int huge = 0;

        return huge;
}
#endif

/*
 * With PUD-level transparent hugepages enabled, take a single READ_ONCE()
 * snapshot of *@pud and return 1 if it is none, a transparent huge PUD, or
 * bad (in which case pud_clear_bad() is called first). Returns 0 otherwise,
 * and always returns 0 when PUD-level THP support is not configured.
 */
static inline int pud_trans_unstable(pud_t *pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        pud_t snapshot = READ_ONCE(*pud);

        if (pud_none(snapshot))
                return 1;
        if (pud_trans_huge(snapshot))
                return 1;
        if (unlikely(pud_bad(snapshot))) {
                pud_clear_bad(pud);
                return 1;
        }
#endif
        return 0;
}

#ifndef CONFIG_NUMA_BALANCING
/*
 * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is
 * perfectly valid to indicate "no" in that case, which is why our default
 * implementation defaults to "always no".
 *
 * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE
 * page protection due to NUMA hinting. NUMA hinting faults only apply in
 * accessible VMAs.
 *
 * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault,
 * looking at the VMA accessibility is sufficient.
 */
static inline int pte_protnone(pte_t pte)
{
        /* !CONFIG_NUMA_BALANCING default: always answer "no". */
        return 0;
}

/*
 * PMD-level default used when CONFIG_NUMA_BALANCING is not set: always
 * returns 0.
 */
static inline int pmd_protnone(pmd_t pmd)
{
        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

#endif /* CONFIG_MMU */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

#ifndef __PAGETABLE_P4D_FOLDED
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
void p4d_clear_huge(p4d_t *p4d);
#else
/*
 * __PAGETABLE_P4D_FOLDED variants: p4d_set_huge() does nothing and returns 0,
 * and p4d_clear_huge() is an empty function.
 */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
#endif /* !__PAGETABLE_P4D_FOLDED */

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else        /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
/*
 * Stubs used when CONFIG_HAVE_ARCH_HUGE_VMAP is not set. None of them touches
 * its arguments: the p?d_set_huge(), p?d_clear_huge() and p?d_free_*_page()
 * helpers that return int all return 0, and p4d_clear_huge() is empty.
 */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
static inline int pud_clear_huge(pud_t *pud)
{
        return 0;
}
static inline int pmd_clear_huge(pmd_t *pmd)
{
        return 0;
}
static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return 0;
}
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. Even without such requirements, an arch-specific version can
 * help optimize the normal TLB flush in the THP regime. The stock
 * flush_tlb_range() typically has an optimization to nuke the entire TLB if
 * the flush span is greater than a threshold, which will likely be true for a
 * single huge page. Thus a single THP flush would invalidate the entire TLB,
 * which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 */
#define flush_pmd_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#else
#define flush_pmd_tlb_range(vma, addr, end)        BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end)        BUILD_BUG()
#endif
#endif

struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                        unsigned long size, pgprot_t *vma_prot);

#ifndef CONFIG_X86_ESPFIX64
/* Empty stub used when CONFIG_X86_ESPFIX64 is not set. */
static inline void init_espfix_bsp(void) { }
#endif

extern void __init pgtable_cache_init(void);

#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
/*
 * Default used when __HAVE_ARCH_PFN_MODIFY_ALLOWED is not defined: every
 * @pfn / @prot combination is accepted, so this always returns true.
 */
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
        return true;
}

/*
 * Default used when __HAVE_ARCH_PFN_MODIFY_ALLOWED is not defined: always
 * returns false.
 */
static inline bool arch_has_pfn_modify_check(void)
{
        return false;
}
#endif /* !__HAVE_ARCH_PFN_MODIFY_ALLOWED */

/*
 * Architecture PAGE_KERNEL_* fallbacks
 *
 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
 * because they really don't support them, or the port needs to be updated to
 * reflect the required functionality. Below are a set of relatively safe
 * fallbacks, as best effort, which we can count on in lieu of the architectures
 * not defining them on their own yet.
 */

#ifndef PAGE_KERNEL_RO
# define PAGE_KERNEL_RO PAGE_KERNEL
#endif

#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/*
 * Page Table Modification bits for pgtbl_mod_mask.
 *
 * These are used by the p?d_alloc_track*() and p*d_populate_kernel()
 * functions in the generic vmalloc, ioremap and page table update code
 * to track at which page-table levels entries have been modified.
 * Based on that the code can better decide when page table changes need
 * to be synchronized to other page-tables in the system.
 */
#define                __PGTBL_PGD_MODIFIED        0
#define                __PGTBL_P4D_MODIFIED        1
#define                __PGTBL_PUD_MODIFIED        2
#define                __PGTBL_PMD_MODIFIED        3
#define                __PGTBL_PTE_MODIFIED        4

#define                PGTBL_PGD_MODIFIED        BIT(__PGTBL_PGD_MODIFIED)
#define                PGTBL_P4D_MODIFIED        BIT(__PGTBL_P4D_MODIFIED)
#define                PGTBL_PUD_MODIFIED        BIT(__PGTBL_PUD_MODIFIED)
#define                PGTBL_PMD_MODIFIED        BIT(__PGTBL_PMD_MODIFIED)
#define                PGTBL_PTE_MODIFIED        BIT(__PGTBL_PTE_MODIFIED)

/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;

/* Page-table levels, numbered consecutively from PTE (0) up to PGD (4). */
enum pgtable_level {
        PGTABLE_LEVEL_PTE = 0,
        PGTABLE_LEVEL_PMD = 1,
        PGTABLE_LEVEL_PUD = 2,
        PGTABLE_LEVEL_P4D = 3,
        PGTABLE_LEVEL_PGD = 4,
};

/*
 * pgtable_level_to_str - map a page-table level to its short name.
 * @level: level to describe
 *
 * Returns "pte", "pmd", "pud", "p4d" or "pgd" for the matching enumerator,
 * and "unknown" for any other value. The returned string is a literal and
 * must not be modified or freed.
 */
static inline const char *pgtable_level_to_str(enum pgtable_level level)
{
        const char *name;

        switch (level) {
        case PGTABLE_LEVEL_PTE:
                name = "pte";
                break;
        case PGTABLE_LEVEL_PMD:
                name = "pmd";
                break;
        case PGTABLE_LEVEL_PUD:
                name = "pud";
                break;
        case PGTABLE_LEVEL_P4D:
                name = "p4d";
                break;
        case PGTABLE_LEVEL_PGD:
                name = "pgd";
                break;
        default:
                name = "unknown";
                break;
        }

        return name;
}

#endif /* !__ASSEMBLY__ */

#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
#ifdef CONFIG_PHYS_ADDR_T_64BIT
/*
 * ZSMALLOC needs to know the highest PFN on 32-bit architectures
 * with physical address space extension, but falls back to
 * BITS_PER_LONG otherwise.
 */
#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
#else
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
#endif

#ifndef has_transparent_hugepage
#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif

#ifndef has_transparent_pud_hugepage
#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
#endif
/*
 * On some architectures it depends on the mm if the p4d/pud or pmd
 * layer of the page table hierarchy is folded or not.
 */
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm)        __is_defined(__PAGETABLE_P4D_FOLDED)
#endif

#ifndef mm_pud_folded
#define mm_pud_folded(mm)        __is_defined(__PAGETABLE_PUD_FOLDED)
#endif

#ifndef mm_pmd_folded
#define mm_pmd_folded(mm)        __is_defined(__PAGETABLE_PMD_FOLDED)
#endif

#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

/*
 * pXd_leaf() is the API to check whether a pgtable entry is a huge page
 * mapping.  It should work globally across all archs, without any
 * dependency on CONFIG_* options.  For architectures that do not support
 * huge mappings on specific levels, below fallbacks will be used.
 *
 * A leaf pgtable entry should always imply the following:
 *
 * - It is a "present" entry.  IOW, before using this API, please check it
 *   with pXd_present() first. NOTE: it may not always mean the "present
 *   bit" is set.  For example, PROT_NONE entries are always "present".
 *
 * - It should _never_ be a swap entry of any type.  Above "present" check
 *   should have guarded this, but let's be crystal clear on this.
 *
 * - It should contain a huge PFN, which points to a huge page larger than
 *   PAGE_SIZE of the platform.  The PFN format isn't important here.
 *
 * - It should cover all kinds of huge mappings (i.e. pXd_trans_huge()
 *   or hugetlb mappings).
 */
#ifndef pgd_leaf
#define pgd_leaf(x)        false
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)        false
#endif
#ifndef pud_leaf
#define pud_leaf(x)        false
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)        false
#endif

#ifndef pgd_leaf_size
#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
#endif
#ifndef p4d_leaf_size
#define p4d_leaf_size(x) P4D_SIZE
#endif
#ifndef pud_leaf_size
#define pud_leaf_size(x) PUD_SIZE
#endif
#ifndef pmd_leaf_size
#define pmd_leaf_size(x) PMD_SIZE
#endif
#ifndef __pte_leaf_size
#ifndef pte_leaf_size
#define pte_leaf_size(x) PAGE_SIZE
#endif
#define __pte_leaf_size(x,y) pte_leaf_size(y)
#endif

/*
 * We always define pmd_pfn for all archs as it's used in lots of generic
 * code.  Now it happens too for pud_pfn (and can happen for larger
 * mappings too in the future; we're not there yet).  Instead of defining
 * it for all archs (like pmd_pfn), provide a fallback.
 *
 * Note that returning 0 here means any arch that didn't define this can
 * get severely wrong when it hits a real pud leaf.  It's arch's
 * responsibility to properly define it when a huge pud is possible.
 */
#ifndef pud_pfn
#define pud_pfn(x) 0
#endif

/*
 * Some architectures have MMUs that are configurable or selectable at boot
 * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
 * helps to have a static maximum value.
 */

#ifndef MAX_PTRS_PER_PTE
#define MAX_PTRS_PER_PTE PTRS_PER_PTE
#endif

#ifndef MAX_PTRS_PER_PMD
#define MAX_PTRS_PER_PMD PTRS_PER_PMD
#endif

#ifndef MAX_PTRS_PER_PUD
#define MAX_PTRS_PER_PUD PTRS_PER_PUD
#endif

#ifndef MAX_PTRS_PER_P4D
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
#endif

#ifndef pte_pgprot
#define pte_pgprot(x) ((pgprot_t) {0})
#endif

#ifndef pmd_pgprot
#define pmd_pgprot(x) ((pgprot_t) {0})
#endif

#ifndef pud_pgprot
#define pud_pgprot(x) ((pgprot_t) {0})
#endif

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type        prot
 *                PROT_NONE        PROT_READ        PROT_WRITE        PROT_EXEC
 * MAP_SHARED        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (yes) yes        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * MAP_PRIVATE        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (copy) copy        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
 * MAP_PRIVATE (with Enhanced PAN supported):
 *                                                                r: (no) no
 *                                                                w: (no) no
 *                                                                x: (yes) yes
 */
/*
 * DECLARE_VM_GET_PAGE_PROT expands to the definition and export of
 * vm_get_page_prot(), which indexes protection_map[] with the
 * VM_READ | VM_WRITE | VM_EXEC | VM_SHARED bits of @vm_flags.
 */
#define DECLARE_VM_GET_PAGE_PROT                                        \
pgprot_t vm_get_page_prot(vm_flags_t vm_flags)                                \
{                                                                        \
                return protection_map[vm_flags &                        \
                        (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];        \
}                                                                        \
EXPORT_SYMBOL(vm_get_page_prot);

#endif /* _LINUX_PGTABLE_H */





































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_KSM_H
#define __LINUX_KSM_H
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 */

#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/sched.h>

#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, vm_flags_t *vm_flags);
vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
                         vm_flags_t vm_flags);
int ksm_enable_merge_any(struct mm_struct *mm);
int ksm_disable_merge_any(struct mm_struct *mm);
int ksm_disable(struct mm_struct *mm);

int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
/*
 * To identify zeropages that were mapped by KSM, we reuse the dirty bit
 * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when
 * deduplicating memory.
 */
#define is_ksm_zero_pte(pte)        (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte))

extern atomic_long_t ksm_zero_pages;

/*
 * ksm_map_zero_page - account for one more KSM zero page.
 * @mm: mm whose per-mm counter is bumped
 *
 * Increments both the global ksm_zero_pages counter and @mm's own
 * ksm_zero_pages counter.
 */
static inline void ksm_map_zero_page(struct mm_struct *mm)
{
        atomic_long_inc(&ksm_zero_pages);
        atomic_long_inc(&mm->ksm_zero_pages);
}

/*
 * ksm_might_unmap_zero_page - drop the zero-page accounting for a PTE.
 * @mm: mm whose per-mm counter may be decremented
 * @pte: page table entry being examined
 *
 * If @pte satisfies is_ksm_zero_pte(), decrements both the global
 * ksm_zero_pages counter and @mm's ksm_zero_pages counter; otherwise does
 * nothing.
 */
static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
        /* Nothing to account for unless the PTE is a KSM zeropage mapping. */
        if (!is_ksm_zero_pte(pte))
                return;

        atomic_long_dec(&ksm_zero_pages);
        atomic_long_dec(&mm->ksm_zero_pages);
}

/*
 * mm_ksm_zero_pages - read @mm's ksm_zero_pages counter.
 * @mm: mm to query
 *
 * Returns the current value of the per-mm atomic counter.
 */
static inline long mm_ksm_zero_pages(struct mm_struct *mm)
{
        return atomic_long_read(&mm->ksm_zero_pages);
}

static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
        /* Adding mm to ksm is best effort on fork. */
        if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) {
                long nr_ksm_zero_pages = atomic_long_read(&mm->ksm_zero_pages);

                mm->ksm_merging_pages = 0;
                mm->ksm_rmap_items = 0;
                atomic_long_add(nr_ksm_zero_pages, &ksm_zero_pages);
                __ksm_enter(mm);
        }
}

/*
 * ksm_execve - enter KSM for @mm at execve time if merge-any is set.
 * @mm: mm to examine
 *
 * Returns 0 when MMF_VM_MERGE_ANY is not set on @mm; otherwise returns
 * whatever __ksm_enter() returns for @mm.
 */
static inline int ksm_execve(struct mm_struct *mm)
{
        if (!mm_flags_test(MMF_VM_MERGE_ANY, mm))
                return 0;

        return __ksm_enter(mm);
}

/*
 * ksm_exit - leave KSM for @mm if it had been marked mergeable.
 * @mm: mm to examine
 *
 * Calls __ksm_exit() only when MMF_VM_MERGEABLE is set on @mm.
 */
static inline void ksm_exit(struct mm_struct *mm)
{
        if (!mm_flags_test(MMF_VM_MERGEABLE, mm))
                return;

        __ksm_exit(mm);
}

/*
 * When do_swap_page() first faults in from swap what used to be a KSM page,
 * no problem, it will be assigned to this vma's anon_vma; but thereafter,
 * it might be faulted into a different anon_vma (or perhaps to a different
 * offset in the same anon_vma).  do_swap_page() cannot do all the locking
 * needed to reconstitute a cross-anon_vma KSM page: for now it has to make
 * a copy, and leave remerging the pages to a later pass of ksmd.
 *
 * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
 * but what if the vma was unmerged while the page was swapped out?
 */
struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr);

void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
void collect_procs_ksm(const struct folio *folio, const struct page *page,
                struct list_head *to_kill, int force_early);
long ksm_process_profit(struct mm_struct *);
bool ksm_process_mergeable(struct mm_struct *mm);

#else  /* !CONFIG_KSM */

/* !CONFIG_KSM stub: returns @vm_flags unchanged; @mm and @file are ignored. */
static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
                const struct file *file, vm_flags_t vm_flags)
{
        return vm_flags;
}

/* !CONFIG_KSM stub: does nothing and returns 0. */
static inline int ksm_disable(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_KSM stub: nothing to do at fork time. */
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
}

/* !CONFIG_KSM stub: does nothing and returns 0. */
static inline int ksm_execve(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_KSM stub: nothing to do. */
static inline void ksm_exit(struct mm_struct *mm)
{
}

/* !CONFIG_KSM stub: no accounting is performed; @mm and @pte are ignored. */
static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
}

/* !CONFIG_KSM stub: does nothing; in particular @to_kill is left untouched. */
static inline void collect_procs_ksm(const struct folio *folio,
                const struct page *page, struct list_head *to_kill,
                int force_early)
{
}

#ifdef CONFIG_MMU
/*
 * !CONFIG_KSM (with CONFIG_MMU) stub: returns 0 without touching @vma or
 * *@vm_flags.
 */
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, vm_flags_t *vm_flags)
{
        return 0;
}

/*
 * !CONFIG_KSM (with CONFIG_MMU) stub: no copy is ever made, the passed-in
 * @folio is returned as is.
 */
static inline struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return folio;
}

/* !CONFIG_KSM (with CONFIG_MMU) stub: does nothing. */
static inline void rmap_walk_ksm(struct folio *folio,
                        struct rmap_walk_control *rwc)
{
}

/* !CONFIG_KSM (with CONFIG_MMU) stub: does nothing. */
static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old)
{
}
#endif /* CONFIG_MMU */
#endif /* !CONFIG_KSM */

#endif /* __LINUX_KSM_H */




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  314 
  317 
  311 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   13 
   13 
   13 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  315 
  319 
  316 




























































































































































































  319 
  320 
  318 








































































































  266 
  268 
  265 








































  316 
  319 
  314 











































































































































































































































































































































































































































































    4 
    4 
    3 

























   15 
   15 
   15 











    2 
    2 
    2 














    1 
    1 
    1 





























    4 
    4 
    4 


























   19 
   20 
   20 











































































































































































































































































































































































































































   14 
   14 































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com>
 */

/*
 * Prefix every pr_*() message emitted from this file (including the
 * pr_info() behind init_debug()) with "LSM: ". Defined ahead of the
 * #include lines below.
 */
#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/xattr.h>
#include <linux/msg.h>
#include <linux/overflow.h>
#include <linux/perf_event.h>
#include <linux/fs.h>
#include <net/flow.h>
#include <net/sock.h>

/*
 * Identifier for the static key associated with an LSM static call slot.
 * HOOK is the LSM hook name and IDX is the slot index. The key itself is
 * defined by DEFINE_LSM_STATIC_CALL() and referenced through the .active
 * member of the static calls table.
 */
#define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX

/*
 * Identifier for the LSM static calls.
 * HOOK is an LSM hook as defined in linux/lsm_hook_defs.h
 * IDX is the index of the static call. 0 <= IDX < MAX_LSM_COUNT
 */
#define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX

/*
 * Expand the macro M MAX_LSM_COUNT times through UNROLL(), forwarding the
 * extra arguments to each expansion. The expansions are wrapped in
 * do { } while (0) so that the whole thing can be used as a single
 * statement inside a function body.
 */
#define LSM_LOOP_UNROLL(M, ...)                 \
do {                                                \
        UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__)        \
} while (0)

/*
 * Same expansion as LSM_LOOP_UNROLL() but without the do { } while (0)
 * wrapper, so that it can be used at file scope and inside initializers.
 * The macros passed as M in this file take the slot number as their first
 * parameter, followed by the forwarded arguments.
 */
#define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__)

/*
 * These are descriptions of the reasons that can be passed to the
 * security_locked_down() LSM hook. Placing this array here allows
 * all security modules to use the same descriptions for auditing
 * purposes.
 *
 * The array is indexed by the LOCKDOWN_* reason value and is sized
 * LOCKDOWN_CONFIDENTIALITY_MAX + 1 so that the final designated entry
 * fits.
 *
 * NOTE(review): LOCKDOWN_INTEGRITY_MAX and LOCKDOWN_CONFIDENTIALITY_MAX
 * appear to name lockdown levels rather than individual reasons - confirm
 * against the enum definition.
 */
const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = {
        [LOCKDOWN_NONE] = "none",
        [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading",
        [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port",
        [LOCKDOWN_EFI_TEST] = "/dev/efi_test access",
        [LOCKDOWN_KEXEC] = "kexec of unsigned images",
        [LOCKDOWN_HIBERNATION] = "hibernation",
        [LOCKDOWN_PCI_ACCESS] = "direct PCI access",
        [LOCKDOWN_IOPORT] = "raw io port access",
        [LOCKDOWN_MSR] = "raw MSR access",
        [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables",
        [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents",
        [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
        [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
        [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
        [LOCKDOWN_MMIOTRACE] = "unsafe mmio",
        [LOCKDOWN_DEBUGFS] = "debugfs access",
        [LOCKDOWN_XMON_WR] = "xmon write access",
        [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
        [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
        [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection",
        [LOCKDOWN_INTEGRITY_MAX] = "integrity",
        [LOCKDOWN_KCORE] = "/proc/kcore access",
        [LOCKDOWN_KPROBES] = "use of kprobes",
        [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM",
        [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM",
        [LOCKDOWN_PERF] = "unsafe use of perf",
        [LOCKDOWN_TRACEFS] = "use of tracefs",
        [LOCKDOWN_XMON_RW] = "xmon read and write access",
        [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret",
        [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
};

/*
 * Head of a blocking notifier chain for LSM notifications.
 * NOTE(review): the register/call helpers are not visible in this part of
 * the file - confirm the users before relying on this description.
 */
static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);

/*
 * Slab caches; presumably they back the per-file and per-inode security
 * blobs - TODO confirm against the allocation code.
 */
static struct kmem_cache *lsm_file_cache;
static struct kmem_cache *lsm_inode_cache;

/*
 * NOTE(review): looks like the list of active LSM names - confirm where it
 * is built.
 */
char *lsm_names;
/* Running totals of blob space, accumulated by lsm_set_blob_sizes(). */
static struct lsm_blob_sizes blob_sizes __ro_after_init;

/*
 * Boot-time LSM user choice.
 * chosen_major_lsm is the "security=" selection handled by
 * ordered_lsm_parse(); chosen_lsm_order is presumably the "lsm=" ordering
 * string - TODO confirm against the __setup() handlers.
 */
static __initdata const char *chosen_lsm_order;
static __initdata const char *chosen_major_lsm;

/* Build-time default LSM order, taken from CONFIG_LSM. */
static __initconst const char *const builtin_lsm_order = CONFIG_LSM;

/*
 * Ordered list of LSMs to initialize.
 * The list is NULL-terminated: append_ordered_lsm() fills at most
 * MAX_LSM_COUNT slots, and exists_ordered_lsm() walks it until the first
 * NULL entry, hence the extra element.
 */
static __initdata struct lsm_info *ordered_lsms[MAX_LSM_COUNT + 1];
/*
 * The first enabled LSM_FLAG_EXCLUSIVE LSM, recorded by prepare_lsm().
 * Once set, lsm_allowed() rejects any other exclusive LSM.
 */
static __initdata struct lsm_info *exclusive;

/*
 * Address of the static call trampoline for slot NUM of hook NAME, or NULL
 * when CONFIG_HAVE_STATIC_CALL is not set.
 */
#ifdef CONFIG_HAVE_STATIC_CALL
#define LSM_HOOK_TRAMP(NAME, NUM) \
        &STATIC_CALL_TRAMP(LSM_STATIC_CALL(NAME, NUM))
#else
#define LSM_HOOK_TRAMP(NAME, NUM) NULL
#endif

/*
 * Define static calls and static keys for each LSM hook.
 *
 * LSM_HOOK() is temporarily defined so that including
 * linux/lsm_hook_defs.h expands, for every hook, to MAX_LSM_COUNT pairs of:
 *  - a static call named by LSM_STATIC_CALL(), defined without a target;
 *    the dereferenced NULL function pointer is there only to supply the
 *    hook's function type, and
 *  - a static key named by SECURITY_HOOK_ACTIVE_KEY() that starts out false.
 * Both helper macros are undefined again right after the include.
 */
#define DEFINE_LSM_STATIC_CALL(NUM, NAME, RET, ...)                        \
        DEFINE_STATIC_CALL_NULL(LSM_STATIC_CALL(NAME, NUM),                \
                                *((RET(*)(__VA_ARGS__))NULL));                \
        DEFINE_STATIC_KEY_FALSE(SECURITY_HOOK_ACTIVE_KEY(NAME, NUM));

#define LSM_HOOK(RET, DEFAULT, NAME, ...)                                \
        LSM_DEFINE_UNROLL(DEFINE_LSM_STATIC_CALL, NAME, RET, __VA_ARGS__)
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
#undef DEFINE_LSM_STATIC_CALL

/*
 * Initialise a table of static calls for each LSM hook.
 * The DEFINE_STATIC_CALL_NULL invocation above generates a key
 * (STATIC_CALL_KEY) and a trampoline (STATIC_CALL_TRAMP) which are used to
 * call __static_call_update when updating the static call.
 *
 * Every hook gets one member named after the hook. It holds MAX_LSM_COUNT
 * entries, each recording the static call key, its trampoline (NULL without
 * CONFIG_HAVE_STATIC_CALL) and the matching "active" static key.
 *
 * The static calls table is used by early LSMs. Some architectures can
 * fault on unaligned accesses, and the fault handling code may not be ready
 * by then. Thus, the static calls table should be aligned to avoid any
 * unhandled faults in early init.
 */
struct lsm_static_calls_table
        static_calls_table __ro_after_init __aligned(sizeof(u64)) = {
#define INIT_LSM_STATIC_CALL(NUM, NAME)                                        \
        (struct lsm_static_call) {                                        \
                .key = &STATIC_CALL_KEY(LSM_STATIC_CALL(NAME, NUM)),        \
                .trampoline = LSM_HOOK_TRAMP(NAME, NUM),                \
                .active = &SECURITY_HOOK_ACTIVE_KEY(NAME, NUM),                \
        },
#define LSM_HOOK(RET, DEFAULT, NAME, ...)                                \
        .NAME = {                                                        \
                LSM_DEFINE_UNROLL(INIT_LSM_STATIC_CALL, NAME)                \
        },
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
#undef INIT_LSM_STATIC_CALL
        };

/*
 * When set, init_debug() messages are printed.
 * NOTE(review): the code that sets this flag is not visible in this part
 * of the file.
 */
static __initdata bool debug;
/* Forward the arguments to pr_info(), but only when the debug flag is set. */
#define init_debug(...)                                                \
        do {                                                        \
                if (debug)                                        \
                        pr_info(__VA_ARGS__);                        \
        } while (0)

/*
 * Report whether @lsm is currently enabled: false when it has no enable
 * variable at all, otherwise the truth value that variable holds.
 */
static bool __init is_enabled(struct lsm_info *lsm)
{
        return lsm->enabled && *lsm->enabled;
}

/*
 * Fallback storage for LSMs that did not provide their own enable
 * variable: lsm->enabled is pointed at one of these two constants.
 */
static int lsm_enabled_true __initdata = 1;
static int lsm_enabled_false __initdata = 0;

/*
 * Record the enabled state of @lsm.
 *
 * If the LSM has its own enable variable, the new state is written through
 * lsm->enabled. Otherwise (pointer unset, or already pointing at one of the
 * shared constants above) the pointer itself is switched to the constant
 * matching @enabled, so the shared constants are never modified.
 */
static void __init set_enabled(struct lsm_info *lsm, bool enabled)
{
        bool uses_shared_flag = !lsm->enabled ||
                                lsm->enabled == &lsm_enabled_true ||
                                lsm->enabled == &lsm_enabled_false;

        if (uses_shared_flag)
                lsm->enabled = enabled ? &lsm_enabled_true
                                       : &lsm_enabled_false;
        else
                *lsm->enabled = enabled;
}

/*
 * Return true when @lsm is already present in the NULL-terminated
 * ordered_lsms[] list, false otherwise.
 */
static bool __init exists_ordered_lsm(struct lsm_info *lsm)
{
        int i;

        for (i = 0; ordered_lsms[i]; i++) {
                if (ordered_lsms[i] == lsm)
                        return true;
        }

        return false;
}

/* Number of entries already stored in ordered_lsms[]. */
static int last_lsm __initdata;

/*
 * Add @lsm to the end of ordered_lsms[] unless it is already listed.
 * @from labels the selection source and is used only in messages.
 *
 * An LSM without an enable variable is pointed at lsm_enabled_true.
 * Warns and leaves the list unchanged once MAX_LSM_COUNT entries are used.
 */
static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
{
        /* Duplicates are silently ignored; a full list triggers a warning. */
        if (exists_ordered_lsm(lsm) ||
            WARN(last_lsm == MAX_LSM_COUNT, "%s: out of LSM static calls!?\n", from))
                return;

        /* No enable variable supplied: default to enabled. */
        if (!lsm->enabled)
                lsm->enabled = &lsm_enabled_true;

        ordered_lsms[last_lsm] = lsm;
        last_lsm++;

        init_debug("%s ordered: %s (%s)\n", from, lsm->name,
                   is_enabled(lsm) ? "enabled" : "disabled");
}

/*
 * Decide whether @lsm may be initialized.
 *
 * A disabled LSM is never allowed. An enabled LSM is allowed unless it is
 * flagged LSM_FLAG_EXCLUSIVE and another exclusive LSM was already chosen.
 */
static bool __init lsm_allowed(struct lsm_info *lsm)
{
        if (!is_enabled(lsm))
                return false;

        /* Non-exclusive LSMs, and the first exclusive one, may proceed. */
        if (!(lsm->flags & LSM_FLAG_EXCLUSIVE) || !exclusive)
                return true;

        init_debug("exclusive disabled: %s\n", lsm->name);
        return false;
}

/*
 * Reserve room for one LSM's blob inside a shared blob.
 *
 * @need: in: number of bytes the LSM wants; out: offset assigned to it.
 * @lbs:  running total size of the shared blob, grown by the request.
 *
 * The assigned offset is the current total rounded up to pointer size.
 * Requests of zero or fewer bytes leave both values untouched.
 */
static void __init lsm_set_blob_size(int *need, int *lbs)
{
        int wanted = *need;
        int start;

        if (wanted <= 0)
                return;

        start = ALIGN(*lbs, sizeof(void *));
        *lbs = start + wanted;
        *need = start;
}

/*
 * Fold one LSM's blob size requests into the global blob_sizes totals.
 *
 * Each requested size in @needed is replaced by the offset assigned to it
 * (see lsm_set_blob_size()). A NULL @needed is a no-op.
 */
static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
{
/* Reserve space for one blob kind in the matching blob_sizes total. */
#define LSM_RESERVE_BLOB(FIELD) \
        lsm_set_blob_size(&needed->FIELD, &blob_sizes.FIELD)

        if (!needed)
                return;

        LSM_RESERVE_BLOB(lbs_cred);
        LSM_RESERVE_BLOB(lbs_file);
        LSM_RESERVE_BLOB(lbs_ib);

        /*
         * The inode blob starts with an rcu_head ahead of whatever the
         * modules ask for; account for it on the first inode request.
         */
        if (needed->lbs_inode != 0 && !blob_sizes.lbs_inode)
                blob_sizes.lbs_inode = sizeof(struct rcu_head);
        LSM_RESERVE_BLOB(lbs_inode);

        LSM_RESERVE_BLOB(lbs_ipc);
        LSM_RESERVE_BLOB(lbs_key);
        LSM_RESERVE_BLOB(lbs_msg_msg);
        LSM_RESERVE_BLOB(lbs_perf_event);
        LSM_RESERVE_BLOB(lbs_sock);
        LSM_RESERVE_BLOB(lbs_superblock);
        LSM_RESERVE_BLOB(lbs_task);
        LSM_RESERVE_BLOB(lbs_tun_dev);
        LSM_RESERVE_BLOB(lbs_xattr_count);
        LSM_RESERVE_BLOB(lbs_bdev);
        LSM_RESERVE_BLOB(lbs_bpf_map);
        LSM_RESERVE_BLOB(lbs_bpf_prog);
        LSM_RESERVE_BLOB(lbs_bpf_token);

#undef LSM_RESERVE_BLOB
}

/*
 * Prepare @lsm for initialization.
 *
 * The outcome of lsm_allowed() is stored back as the LSM's enabled state so
 * that later exclusive LSMs see it. For an allowed LSM, the first exclusive
 * one is remembered in 'exclusive' and its blob sizes are accounted.
 */
static void __init prepare_lsm(struct lsm_info *lsm)
{
        int allowed = lsm_allowed(lsm);

        set_enabled(lsm, allowed);
        if (!allowed)
                return;

        if (!exclusive && (lsm->flags & LSM_FLAG_EXCLUSIVE)) {
                exclusive = lsm;
                init_debug("exclusive chosen:   %s\n", lsm->name);
        }

        lsm_set_blob_sizes(lsm->blobs);
}

/* Run the init callback of @lsm, provided the LSM is enabled. */
static void __init initialize_lsm(struct lsm_info *lsm)
{
        int err;

        if (!is_enabled(lsm))
                return;

        init_debug("initializing %s\n", lsm->name);
        err = lsm->init();
        /* A non-zero result is only warned about here. */
        WARN(err, "%s failed to initialize: %d\n", lsm->name, err);
}

/*
 * Current index to use while initializing the lsm id list; since entries
 * are only ever appended, it is also the number of ids stored so far.
 */
u32 lsm_active_cnt __ro_after_init;
/* Ids of the registered LSMs, appended by security_add_hooks(). */
const struct lsm_id *lsm_idlist[MAX_LSM_COUNT];

/*
 * Populate ordered LSMs list from comma-separated LSM name list.
 *
 * @order:  comma-separated LSM names in the requested order
 * @origin: label describing where @order came from; used in debug output
 *          and passed on to append_ordered_lsm()
 *
 * The resulting order is: all LSM_ORDER_FIRST LSMs, the LSM_ORDER_MUTABLE
 * LSMs named in @order, the LSM selected with "security=" (if it is not
 * listed already), and finally all LSM_ORDER_LAST LSMs.  Every LSM that did
 * not make it into the ordered list is disabled.
 *
 * Panics if the private copy of @order cannot be allocated.
 */
static void __init ordered_lsm_parse(const char *order, const char *origin)
{
        struct lsm_info *lsm;
        char *sep, *name, *next;

        /* LSM_ORDER_FIRST is always first. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_FIRST)
                        append_ordered_lsm(lsm, "  first");
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                struct lsm_info *major;

                /*
                 * To match the original "security=" behavior, this
                 * explicitly does NOT fallback to another Legacy Major
                 * if the selected one was separately disabled: disable
                 * all non-matching Legacy Major LSMs.
                 */
                for (major = __start_lsm_info; major < __end_lsm_info;
                     major++) {
                        if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
                            strcmp(major->name, chosen_major_lsm) != 0) {
                                set_enabled(major, false);
                                init_debug("security=%s disabled: %s (only one legacy major LSM)\n",
                                           chosen_major_lsm, major->name);
                        }
                }
        }

        /* strsep() modifies its input, so work on a private copy. */
        sep = kstrdup(order, GFP_KERNEL);
        /*
         * Without the copy the walk below would find no names at all and
         * every mutable LSM would be silently disabled further down.
         * Fail loudly instead, as security_add_hooks() does.
         */
        if (!sep)
                panic("%s - Cannot get early memory.\n", __func__);
        next = sep;
        /* Walk the list, looking for matching LSMs. */
        while ((name = strsep(&next, ",")) != NULL) {
                bool found = false;

                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (strcmp(lsm->name, name) == 0) {
                                /* Only mutable LSMs can be placed by name. */
                                if (lsm->order == LSM_ORDER_MUTABLE)
                                        append_ordered_lsm(lsm, origin);
                                found = true;
                        }
                }

                if (!found)
                        init_debug("%s ignored: %s (not built into kernel)\n",
                                   origin, name);
        }

        /* Add the "security=" selection if the list did not name it. */
        if (chosen_major_lsm) {
                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (exists_ordered_lsm(lsm))
                                continue;
                        if (strcmp(lsm->name, chosen_major_lsm) == 0)
                                append_ordered_lsm(lsm, "security=");
                }
        }

        /* LSM_ORDER_LAST is always last. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_LAST)
                        append_ordered_lsm(lsm, "   last");
        }

        /* Disable all LSMs not in the ordered list. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (exists_ordered_lsm(lsm))
                        continue;
                set_enabled(lsm, false);
                init_debug("%s skipped: %s (not in requested order)\n",
                           origin, lsm->name);
        }

        kfree(sep);
}

/*
 * Bind @hl to the first unused static call slot of its hook and switch
 * that slot on.  Panics when all MAX_LSM_COUNT slots are already in use.
 */
static void __init lsm_static_call_init(struct security_hook_list *hl)
{
        struct lsm_static_call *slot = hl->scalls;
        struct lsm_static_call *end = slot + MAX_LSM_COUNT;

        for (; slot < end; slot++) {
                /* Slots that already carry a hook are skipped. */
                if (slot->hl)
                        continue;

                __static_call_update(slot->key, slot->trampoline,
                                     hl->hook.lsm_func_addr);
                slot->hl = hl;
                static_branch_enable(slot->active);
                return;
        }
        panic("%s - Ran out of static slots.\n", __func__);
}

/* Forward declarations: defined further down, used by ordered_lsm_init(). */
static void __init lsm_early_cred(struct cred *cred);
static void __init lsm_early_task(struct task_struct *task);

/* Forward declaration: defined further down, used by security_init(). */
static int lsm_append(const char *new, char **result);

/* Log the names of all enabled LSMs as one comma-separated "lsm=" line. */
static void __init report_lsm_order(void)
{
        struct lsm_info **ordered, *early;
        const char *sep = "";

        pr_info("initializing lsm=");

        /* Early LSMs are listed first, followed by the ordered ones. */
        for (early = __start_early_lsm_info;
             early < __end_early_lsm_info; early++) {
                if (!is_enabled(early))
                        continue;
                pr_cont("%s%s", sep, early->name);
                sep = ",";
        }
        for (ordered = ordered_lsms; *ordered; ordered++) {
                if (!is_enabled(*ordered))
                        continue;
                pr_cont("%s%s", sep, (*ordered)->name);
                sep = ",";
        }

        pr_cont("\n");
}

/*
 * Build the ordered LSM list, prepare every LSM on it, create the blob
 * caches, give the current task its cred and task blobs, and finally run
 * the init callback of each enabled LSM in list order.
 */
static void __init ordered_lsm_init(void)
{
        struct lsm_info **lsm;

        /* An explicit "lsm=" order overrides both "security=" and the builtin order. */
        if (chosen_lsm_order) {
                if (chosen_major_lsm) {
                        pr_warn("security=%s is ignored because it is superseded by lsm=%s\n",
                                chosen_major_lsm, chosen_lsm_order);
                        chosen_major_lsm = NULL;
                }
                ordered_lsm_parse(chosen_lsm_order, "cmdline");
        } else
                ordered_lsm_parse(builtin_lsm_order, "builtin");

        /* Decide enablement and accumulate blob sizes before any init runs. */
        for (lsm = ordered_lsms; *lsm; lsm++)
                prepare_lsm(*lsm);

        report_lsm_order();

        init_debug("cred blob size       = %d\n", blob_sizes.lbs_cred);
        init_debug("file blob size       = %d\n", blob_sizes.lbs_file);
        init_debug("ib blob size         = %d\n", blob_sizes.lbs_ib);
        init_debug("inode blob size      = %d\n", blob_sizes.lbs_inode);
        init_debug("ipc blob size        = %d\n", blob_sizes.lbs_ipc);
#ifdef CONFIG_KEYS
        init_debug("key blob size        = %d\n", blob_sizes.lbs_key);
#endif /* CONFIG_KEYS */
        init_debug("msg_msg blob size    = %d\n", blob_sizes.lbs_msg_msg);
        init_debug("sock blob size       = %d\n", blob_sizes.lbs_sock);
        init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
        init_debug("perf event blob size = %d\n", blob_sizes.lbs_perf_event);
        init_debug("task blob size       = %d\n", blob_sizes.lbs_task);
        init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev);
        init_debug("xattr slots          = %d\n", blob_sizes.lbs_xattr_count);
        init_debug("bdev blob size       = %d\n", blob_sizes.lbs_bdev);
        init_debug("bpf map blob size    = %d\n", blob_sizes.lbs_bpf_map);
        init_debug("bpf prog blob size   = %d\n", blob_sizes.lbs_bpf_prog);
        init_debug("bpf token blob size  = %d\n", blob_sizes.lbs_bpf_token);

        /*
         * Create any kmem_caches needed for blobs.  A cache is only created
         * for a non-zero blob size; lsm_file_alloc() and lsm_inode_alloc()
         * treat a missing cache as "no blob needed".
         */
        if (blob_sizes.lbs_file)
                lsm_file_cache = kmem_cache_create("lsm_file_cache",
                                                   blob_sizes.lbs_file, 0,
                                                   SLAB_PANIC, NULL);
        if (blob_sizes.lbs_inode)
                lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
                                                    blob_sizes.lbs_inode, 0,
                                                    SLAB_PANIC, NULL);

        /* Allocate the current task's cred and task blobs before any LSM init runs. */
        lsm_early_cred((struct cred *) current->cred);
        lsm_early_task(current);
        for (lsm = ordered_lsms; *lsm; lsm++)
                initialize_lsm(*lsm);
}

/*
 * Prepare and initialize every early LSM.  An early LSM without an enable
 * variable of its own is pointed at lsm_enabled_true beforehand.
 *
 * Always returns 0.
 */
int __init early_security_init(void)
{
        struct lsm_info *early = __start_early_lsm_info;

        while (early < __end_early_lsm_info) {
                if (early->enabled == NULL)
                        early->enabled = &lsm_enabled_true;
                prepare_lsm(early);
                initialize_lsm(early);
                early++;
        }

        return 0;
}

/**
 * security_init - initializes the security framework
 *
 * This should be called early in the kernel initialization sequence.
 *
 * Adds the names of the enabled early LSMs to the LSM name list and then
 * prepares and initializes the remaining LSMs in their configured order.
 * Panics if a name cannot be added to the list.
 *
 * Return: Always returns 0.
 */
int __init security_init(void)
{
        struct lsm_info *lsm;

        init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*");
        init_debug("  CONFIG_LSM=%s\n", builtin_lsm_order);
        init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*");

        /*
         * Append the names of the early LSM modules now that kmalloc() is
         * available
         */
        for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
                init_debug("  early started: %s (%s)\n", lsm->name,
                           is_enabled(lsm) ? "enabled" : "disabled");
                /*
                 * early_security_init() leaves ->enabled non-NULL for every
                 * early LSM, so testing the pointer would also list disabled
                 * ones; test the enabled state itself.
                 */
                if (is_enabled(lsm) &&
                    lsm_append(lsm->name, &lsm_names) < 0)
                        panic("%s - Cannot get early memory.\n", __func__);
        }

        /* Load LSMs in specified order. */
        ordered_lsm_init();

        return 0;
}

/*
 * Save user chosen LSM: handler for the "security=" boot parameter.
 * Only the pointer @str is kept, the string itself is not copied.
 * Always returns 1.
 */
static int __init choose_major_lsm(char *str)
{
        chosen_major_lsm = str;
        return 1;
}
__setup("security=", choose_major_lsm);

/*
 * Explicitly choose LSM initialization order: handler for the "lsm=" boot
 * parameter.  Only the pointer @str is kept, the string itself is not
 * copied.  Always returns 1.
 */
static int __init choose_lsm_order(char *str)
{
        chosen_lsm_order = str;
        return 1;
}
__setup("lsm=", choose_lsm_order);

/*
 * Enable LSM order debugging: handler for the "lsm.debug" boot parameter.
 * @str is not looked at.  Always returns 1.
 */
static int __init enable_debug(char *str)
{
        debug = true;
        return 1;
}
__setup("lsm.debug", enable_debug);

/*
 * Tell whether @lsm is the final entry of the comma-separated @list.
 * Warns and returns false if either argument is NULL.
 */
static bool match_last_lsm(const char *list, const char *lsm)
{
        const char *comma, *tail;

        if (WARN_ON(!list || !lsm))
                return false;

        /* The last entry starts after the final comma, or is the whole list. */
        comma = strrchr(list, ',');
        tail = comma ? comma + 1 : list;

        return strcmp(tail, lsm) == 0;
}

/*
 * Append @new to the comma-separated, heap-allocated list in *@result.
 *
 * An empty (NULL) list is replaced by a copy of @new.  Nothing is added
 * when @new already is the last entry.  Otherwise a new "<list>,<new>"
 * string is allocated and the old list is freed.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails, in which case
 * an existing list is left untouched.
 */
static int lsm_append(const char *new, char **result)
{
        char *joined;

        /* Empty list: the new name becomes the whole list. */
        if (!*result) {
                *result = kstrdup(new, GFP_KERNEL);
                return *result ? 0 : -ENOMEM;
        }

        /* Already the last registered name: nothing to add. */
        if (match_last_lsm(*result, new))
                return 0;

        joined = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
        if (!joined)
                return -ENOMEM;

        kfree(*result);
        *result = joined;
        return 0;
}

/**
 * security_add_hooks - Add a module's hooks to the hook lists.
 * @hooks: the hooks to add
 * @count: the number of hooks to add
 * @lsmid: the identification information for the security module
 *
 * Each LSM has to register its hooks with the infrastructure.
 */
void __init security_add_hooks(struct security_hook_list *hooks, int count,
                               const struct lsm_id *lsmid)
{
        bool repeated;
        int i;

        /*
         * LSM initialization is serialized and a module may register
         * hooks in several calls (Landlock does), so a repeated call can
         * be detected by comparing against the most recent id only.
         */
        repeated = lsm_active_cnt != 0 &&
                   lsm_idlist[lsm_active_cnt - 1] == lsmid;
        if (!repeated) {
                if (lsm_active_cnt >= MAX_LSM_COUNT)
                        panic("%s Too many LSMs registered.\n", __func__);
                lsm_idlist[lsm_active_cnt] = lsmid;
                lsm_active_cnt++;
        }

        /* Tag each hook with its owner and wire it to a static call slot. */
        for (i = 0; i < count; i++) {
                struct security_hook_list *hook = &hooks[i];

                hook->lsmid = lsmid;
                lsm_static_call_init(hook);
        }

        /*
         * The name list needs the slab allocator.  During
         * early_security_init() it is not available yet; the names of
         * early LSMs are added later on instead.
         */
        if (slab_is_available() && lsm_append(lsmid->name, &lsm_names) < 0)
                panic("%s - Cannot get early memory.\n", __func__);
}

/* Run the blocking LSM notifier chain for @event, passing @data along. */
int call_blocking_lsm_notifier(enum lsm_event event, void *data)
{
        int rc;

        rc = blocking_notifier_call_chain(&blocking_lsm_notifier_chain,
                                          event, data);
        return rc;
}
EXPORT_SYMBOL(call_blocking_lsm_notifier);

/* Add @nb to the blocking LSM notifier chain. */
int register_blocking_lsm_notifier(struct notifier_block *nb)
{
        int rc;

        rc = blocking_notifier_chain_register(&blocking_lsm_notifier_chain,
                                              nb);
        return rc;
}
EXPORT_SYMBOL(register_blocking_lsm_notifier);

/* Remove @nb from the blocking LSM notifier chain. */
int unregister_blocking_lsm_notifier(struct notifier_block *nb)
{
        int rc;

        rc = blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain,
                                                nb);
        return rc;
}
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);

/**
 * lsm_blob_alloc - allocate a composite blob
 * @dest: the destination for the blob
 * @size: the size of the blob
 * @gfp: allocation type
 *
 * Allocate a blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp)
{
        void *blob = NULL;

        /* A zero-sized request is satisfied with a NULL blob. */
        if (size != 0)
                blob = kzalloc(size, gfp);

        *dest = blob;
        if (size != 0 && !blob)
                return -ENOMEM;
        return 0;
}

/**
 * lsm_cred_alloc - allocate a composite cred blob
 * @cred: the cred that needs a blob
 * @gfp: allocation type
 *
 * Allocate the cred blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
{
        int rc = lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp);

        return rc;
}

/**
 * lsm_early_cred - during initialization allocate a composite cred blob
 * @cred: the cred that needs a blob
 *
 * Allocate the cred blob for all the modules
 */
static void __init lsm_early_cred(struct cred *cred)
{
        /* Any allocation failure here is fatal. */
        if (lsm_cred_alloc(cred, GFP_KERNEL))
                panic("%s: Early cred alloc failed.\n", __func__);
}

/**
 * lsm_file_alloc - allocate a composite file blob
 * @file: the file that needs a blob
 *
 * Allocate the file blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_file_alloc(struct file *file)
{
        /* Without a file blob cache the file simply gets no blob. */
        file->f_security = lsm_file_cache ?
                kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL) : NULL;

        if (lsm_file_cache && !file->f_security)
                return -ENOMEM;
        return 0;
}

/**
 * lsm_inode_alloc - allocate a composite inode blob
 * @inode: the inode that needs a blob
 * @gfp: allocation flags
 *
 * Allocate the inode blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_inode_alloc(struct inode *inode, gfp_t gfp)
{
        /* Without an inode blob cache the inode simply gets no blob. */
        inode->i_security = lsm_inode_cache ?
                kmem_cache_zalloc(lsm_inode_cache, gfp) : NULL;

        if (lsm_inode_cache && !inode->i_security)
                return -ENOMEM;
        return 0;
}

/**
 * lsm_task_alloc - allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_task_alloc(struct task_struct *task)
{
        int rc = lsm_blob_alloc(&task->security, blob_sizes.lbs_task,
                                GFP_KERNEL);

        return rc;
}

/**
 * lsm_ipc_alloc - allocate a composite ipc blob
 * @kip: the ipc that needs a blob
 *
 * Allocate the ipc blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
{
        int rc = lsm_blob_alloc(&kip->security, blob_sizes.lbs_ipc,
                                GFP_KERNEL);

        return rc;
}

#ifdef CONFIG_KEYS
/**
 * lsm_key_alloc - allocate a composite key blob
 * @key: the key that needs a blob
 *
 * Allocate the key blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_key_alloc(struct key *key)
{
        int rc = lsm_blob_alloc(&key->security, blob_sizes.lbs_key,
                                GFP_KERNEL);

        return rc;
}
#endif /* CONFIG_KEYS */

/**
 * lsm_msg_msg_alloc - allocate a composite msg_msg blob
 * @mp: the msg_msg that needs a blob
 *
 * Allocate the msg_msg blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_msg_msg_alloc(struct msg_msg *mp)
{
        int rc = lsm_blob_alloc(&mp->security, blob_sizes.lbs_msg_msg,
                                GFP_KERNEL);

        return rc;
}

/**
 * lsm_bdev_alloc - allocate a composite block_device blob
 * @bdev: the block_device that needs a blob
 *
 * Allocate the block_device blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_bdev_alloc(struct block_device *bdev)
{
        int rc = lsm_blob_alloc(&bdev->bd_security, blob_sizes.lbs_bdev,
                                GFP_KERNEL);

        return rc;
}

#ifdef CONFIG_BPF_SYSCALL
/**
 * lsm_bpf_map_alloc - allocate a composite bpf_map blob
 * @map: the bpf_map that needs a blob
 *
 * Allocate the bpf_map blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_bpf_map_alloc(struct bpf_map *map)
{
        int rc = lsm_blob_alloc(&map->security, blob_sizes.lbs_bpf_map,
                                GFP_KERNEL);

        return rc;
}

/**
 * lsm_bpf_prog_alloc - allocate a composite bpf_prog blob
 * @prog: the bpf_prog that needs a blob
 *
 * Allocate the bpf_prog blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_bpf_prog_alloc(struct bpf_prog *prog)
{
        /* The blob pointer lives in the program's aux data. */
        int rc = lsm_blob_alloc(&prog->aux->security,
                                blob_sizes.lbs_bpf_prog, GFP_KERNEL);

        return rc;
}

/**
 * lsm_bpf_token_alloc - allocate a composite bpf_token blob
 * @token: the bpf_token that needs a blob
 *
 * Allocate the bpf_token blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_bpf_token_alloc(struct bpf_token *token)
{
        int rc = lsm_blob_alloc(&token->security, blob_sizes.lbs_bpf_token,
                                GFP_KERNEL);

        return rc;
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * lsm_early_task - during initialization allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules
 */
static void __init lsm_early_task(struct task_struct *task)
{
        /* Any allocation failure here is fatal. */
        if (lsm_task_alloc(task))
                panic("%s: Early task alloc failed.\n", __func__);
}

/**
 * lsm_superblock_alloc - allocate a composite superblock blob
 * @sb: the superblock that needs a blob
 *
 * Allocate the superblock blob for all the modules
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_superblock_alloc(struct super_block *sb)
{
        int rc = lsm_blob_alloc(&sb->s_security, blob_sizes.lbs_superblock,
                                GFP_KERNEL);

        return rc;
}

/**
 * lsm_fill_user_ctx - Fill a user space lsm_ctx structure
 * @uctx: a userspace LSM context to be filled
 * @uctx_len: available uctx size (input), used uctx size (output)
 * @val: the new LSM context value
 * @val_len: the size of the new LSM context value
 * @id: LSM id
 * @flags: LSM defined flags
 *
 * Fill all of the fields in a userspace lsm_ctx structure.  If @uctx is NULL
 * simply calculate the required size to output via @uctx_len and return
 * success.
 *
 * Returns 0 on success, -E2BIG if userspace buffer is not large enough,
 * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated.
 */
int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len,
                      void *val, size_t val_len,
                      u64 id, u64 flags)
{
        struct lsm_ctx *nctx = NULL;
        /* Header plus value, rounded up to a multiple of the pointer size. */
        size_t nctx_len = ALIGN(struct_size(nctx, ctx, val_len),
                                sizeof(void *));
        int rc;

        if (nctx_len > *uctx_len) {
                rc = -E2BIG;
        } else if (!uctx) {
                /* no buffer - only the required size is reported */
                rc = 0;
        } else {
                nctx = kzalloc(nctx_len, GFP_KERNEL);
                if (!nctx) {
                        rc = -ENOMEM;
                } else {
                        /* Build the record in kernel memory, then copy it out. */
                        nctx->id = id;
                        nctx->flags = flags;
                        nctx->len = nctx_len;
                        nctx->ctx_len = val_len;
                        memcpy(nctx->ctx, val, val_len);

                        rc = copy_to_user(uctx, nctx, nctx_len) ? -EFAULT : 0;
                }
        }

        /* The required size is reported on every path, errors included. */
        kfree(nctx);
        *uctx_len = nctx_len;
        return rc;
}

/*
 * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
 * can be accessed with:
 *
 *        LSM_RET_DEFAULT(<hook_name>)
 *
 * The macros below define static constants for the default value of each
 * LSM hook.
 */
/* Name of the constant holding the default return value of hook NAME. */
#define LSM_RET_DEFAULT(NAME) (NAME##_default)
/* Hooks returning void have no default value: expands to nothing. */
#define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME)
/* Hooks returning int get a static constant initialized to DEFAULT. */
#define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \
        static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT);
/* Pick the void or int variant above according to the hook's RET type. */
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME)

/* Expands LSM_HOOK() once for every hook listed in the header. */
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Hook list operation macros.
 *
 * call_void_hook:
 *        This is a hook that does not return a value.
 *
 * call_int_hook:
 *        This is a hook that returns a value.
 */
/*
 * Invoke static call slot NUM of HOOK if that slot is active.
 * NOTE(review): the trailing ';' after "while (0)" looks intentional,
 * presumably because LSM_LOOP_UNROLL pastes the expansions back to back
 * without separators - confirm against its definition.
 */
#define __CALL_STATIC_VOID(NUM, HOOK, ...)                                     \
do {                                                                             \
        if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) {    \
                static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__);             \
        }                                                                     \
} while (0);

/* Apply __CALL_STATIC_VOID for HOOK via LSM_LOOP_UNROLL; yields no value. */
#define call_void_hook(HOOK, ...)                                 \
        do {                                                      \
                LSM_LOOP_UNROLL(__CALL_STATIC_VOID, HOOK, __VA_ARGS__); \
        } while (0)


/*
 * Invoke static call slot NUM of HOOK if that slot is active, storing the
 * result in R.  A result that differs from the hook's default value jumps
 * to LABEL, which skips whatever follows this expansion.
 */
#define __CALL_STATIC_INT(NUM, R, HOOK, LABEL, ...)                             \
do {                                                                             \
        if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) {  \
                R = static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__);    \
                if (R != LSM_RET_DEFAULT(HOOK))                                     \
                        goto LABEL;                                             \
        }                                                                     \
} while (0);

/*
 * Statement expression that evaluates to the result of HOOK: RC starts as
 * the hook's default value and is replaced by the first result that
 * differs from that default (see __CALL_STATIC_INT).  The local label
 * keeps OUT private to each expansion.
 */
#define call_int_hook(HOOK, ...)                                        \
({                                                                        \
        __label__ OUT;                                                        \
        int RC = LSM_RET_DEFAULT(HOOK);                                        \
                                                                        \
        LSM_LOOP_UNROLL(__CALL_STATIC_INT, RC, HOOK, OUT, __VA_ARGS__);        \
OUT:                                                                        \
        RC;                                                                \
})

/*
 * Iterate @scall over the MAX_LSM_COUNT static call slots of hook NAME,
 * running the loop body only for slots whose static key is enabled.
 * The expansion ends in an "if", so the body attaches to that "if";
 * a "break" in the body still leaves the enclosing "for".
 */
#define lsm_for_each_hook(scall, NAME)                                        \
        for (scall = static_calls_table.NAME;                                \
             scall - static_calls_table.NAME < MAX_LSM_COUNT; scall++)  \
                if (static_key_enabled(&scall->active->key))

/* Security operations */

/**
 * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok
 * @mgr: task credentials of current binder process
 *
 * Check whether @mgr is allowed to be the binder context manager.
 *
 * Return: Return 0 if permission is granted.
 */
int security_binder_set_context_mgr(const struct cred *mgr)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(binder_set_context_mgr, mgr);
        return rc;
}

/**
 * security_binder_transaction() - Check if a binder transaction is allowed
 * @from: sending process
 * @to: receiving process
 *
 * Check whether @from is allowed to invoke a binder transaction call to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transaction(const struct cred *from,
                                const struct cred *to)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(binder_transaction, from, to);
        return rc;
}

/**
 * security_binder_transfer_binder() - Check if a binder transfer is allowed
 * @from: sending process
 * @to: receiving process
 *
 * Check whether @from is allowed to transfer a binder reference to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_binder(const struct cred *from,
                                    const struct cred *to)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(binder_transfer_binder, from, to);
        return rc;
}

/**
 * security_binder_transfer_file() - Check if a binder file xfer is allowed
 * @from: sending process
 * @to: receiving process
 * @file: file being transferred
 *
 * Check whether @from is allowed to transfer @file to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_file(const struct cred *from,
                                  const struct cred *to, const struct file *file)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(binder_transfer_file, from, to, file);
        return rc;
}

/**
 * security_ptrace_access_check() - Check if tracing is allowed
 * @child: target process
 * @mode: PTRACE_MODE flags
 *
 * Check permission before allowing the current process to trace the @child
 * process.  Security modules may also want to perform a process tracing check
 * during an execve in the bprm_set_creds hook of binprm_security_ops if the
 * process is being traced and its security attributes would be changed by the
 * execve.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(ptrace_access_check, child, mode);
        return rc;
}

/**
 * security_ptrace_traceme() - Check if tracing is allowed
 * @parent: tracing process
 *
 * Check that the @parent process has sufficient permission to trace the
 * current process before allowing the current process to present itself to the
 * @parent process for tracing.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_traceme(struct task_struct *parent)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(ptrace_traceme, parent);
        return rc;
}

/**
 * security_capget() - Get the capability sets for a process
 * @target: target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Get the @effective, @inheritable, and @permitted capability sets for the
 * @target process.  The hook may also perform permission checking to determine
 * if the current process is allowed to see the capability sets of the @target
 * process.
 *
 * Return: Returns 0 if the capability sets were successfully obtained.
 */
int security_capget(const struct task_struct *target,
                    kernel_cap_t *effective,
                    kernel_cap_t *inheritable,
                    kernel_cap_t *permitted)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(capget, target, effective, inheritable, permitted);
        return rc;
}

/**
 * security_capset() - Set the capability sets for a process
 * @new: new credentials for the target process
 * @old: current credentials of the target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Set the @effective, @inheritable, and @permitted capability sets for the
 * current process.
 *
 * Return: Returns 0 and update @new if permission is granted.
 */
int security_capset(struct cred *new, const struct cred *old,
                    const kernel_cap_t *effective,
                    const kernel_cap_t *inheritable,
                    const kernel_cap_t *permitted)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(capset, new, old, effective, inheritable,
                           permitted);
        return rc;
}

/**
 * security_capable() - Check if a process has the necessary capability
 * @cred: credentials to examine
 * @ns: user namespace
 * @cap: capability requested
 * @opts: capability check options
 *
 * Check whether the process has the @cap capability in the indicated
 * credentials @cred.  @cap contains the capability <include/linux/capability.h>.
 * @opts contains options for the capable check <include/linux/security.h>.
 *
 * Return: Returns 0 if the capability is granted.
 */
int security_capable(const struct cred *cred,
                     struct user_namespace *ns,
                     int cap,
                     unsigned int opts)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(capable, cred, ns, cap, opts);
        return rc;
}

/**
 * security_quotactl() - Check if a quotactl() syscall is allowed for this fs
 * @cmds: commands
 * @type: type
 * @id: id
 * @sb: filesystem
 *
 * Check whether the quotactl syscall is allowed for this @sb.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_quotactl(int cmds, int type, int id, const struct super_block *sb)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(quotactl, cmds, type, id, sb);
        return rc;
}

/**
 * security_quota_on() - Check if QUOTAON is allowed for a dentry
 * @dentry: dentry
 *
 * Check whether QUOTAON is allowed for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_quota_on(struct dentry *dentry)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(quota_on, dentry);
        return rc;
}

/**
 * security_syslog() - Check if accessing the kernel message ring is allowed
 * @type: SYSLOG_ACTION_* type
 *
 * Check permission before accessing the kernel message ring or changing
 * logging to the console.  See the syslog(2) manual page for an explanation of
 * the @type values.
 *
 * Return: Return 0 if permission is granted.
 */
int security_syslog(int type)
{
        int rc;

        /* The first non-default answer from an active LSM is returned. */
        rc = call_int_hook(syslog, type);
        return rc;
}

/**
 * security_settime64() - Check if changing the system time is allowed
 * @ts: new time
 * @tz: timezone
 *
 * Check permission to change the system time, struct timespec64 is defined in
 * <include/linux/time64.h> and timezone is defined in <include/linux/time.h>.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_settime64(const struct timespec64 *ts, const struct timezone *tz)
{
        int rc;

        /* Note the hook itself is named "settime", without the 64 suffix. */
        rc = call_int_hook(settime, ts, tz);
        return rc;
}

/**
 * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed
 * @mm: mm struct
 * @pages: number of pages
 *
 * Check permissions for allocating a new virtual mapping.  If no LSM returns
 * a negative value, __vm_enough_memory() will be called with cap_sys_admin
 * set. If at least one LSM returns a negative value, __vm_enough_memory() will
 * be called with cap_sys_admin cleared.
 *
 * Return: Returns 0 if permission is granted by the LSM infrastructure to the
 *         caller.
 */
int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
{
        struct lsm_static_call *scall;
        int cap_sys_admin = 1;

        /*
         * cap_sys_admin stays set only while every hook visited returns a
         * non-negative value.  The first negative return clears it and the
         * remaining hooks are not consulted.
         */
        lsm_for_each_hook(scall, vm_enough_memory) {
                if (scall->hl->hook.vm_enough_memory(mm, pages) < 0) {
                        cap_sys_admin = 0;
                        break;
                }
        }

        /* The final verdict comes from __vm_enough_memory(), not the hooks. */
        return __vm_enough_memory(mm, pages, cap_sys_admin);
}

/**
 * security_bprm_creds_for_exec() - Prepare the credentials for exec()
 * @bprm: binary program information
 *
 * If the setup in prepare_exec_creds did not setup @bprm->cred->security
 * properly for executing @bprm->file, update the LSM's portion of
 * @bprm->cred->security to be what commit_creds needs to install for the new
 * program.  This hook may also optionally check permissions (e.g. for
 * transitions between security domains).  The hook must set @bprm->secureexec
 * to 1 if AT_SECURE should be set to request libc enable secure mode.  @bprm
 * contains the linux_binprm structure.
 *
 * If execveat(2) is called with the AT_EXECVE_CHECK flag, bprm->is_check is
 * set.  The result must be the same as without this flag even if the execution
 * will never really happen and @bprm will always be dropped.
 *
 * This hook must not change current->cred, only @bprm->cred.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        /* @bprm is passed through as-is; the hook result is the return value. */
        return call_int_hook(bprm_creds_for_exec, bprm);
}

/**
 * security_bprm_creds_from_file() - Update linux_binprm creds based on file
 * @bprm: binary program information
 * @file: associated file
 *
 * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon
 * exec, update @bprm->cred to reflect that change. This is called after
 * finding the binary that will be executed without an interpreter.  This
 * ensures that the credentials will not be derived from a script that the
 * binary will need to reopen, which when reopened may end up being a completely
 * different file.  This hook may also optionally check permissions (e.g. for
 * transitions between security domains).  The hook must set @bprm->secureexec
 * to 1 if AT_SECURE should be set to request libc enable secure mode.  The
 * hook must add to @bprm->per_clear any personality flags that should be
 * cleared from current->personality.  @bprm contains the linux_binprm
 * structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
        /* Both @bprm and @file are forwarded to the bprm_creds_from_file hook. */
        return call_int_hook(bprm_creds_from_file, bprm, file);
}

/**
 * security_bprm_check() - Mediate binary handler search
 * @bprm: binary program information
 *
 * This hook mediates the point when a search for a binary handler will begin.
 * It allows a check against the @bprm->cred->security value which was set in
 * the preceding creds_for_exec call.  The argv list and envp list are reliably
 * available in @bprm.  This hook may be called multiple times during a single
 * execve.  @bprm contains the linux_binprm structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_check(struct linux_binprm *bprm)
{
        /* Note: the hook invoked is named "bprm_check_security". */
        return call_int_hook(bprm_check_security, bprm);
}

/**
 * security_bprm_committing_creds() - Install creds for a process during exec()
 * @bprm: binary program information
 *
 * Prepare to install the new security attributes of a process being
 * transformed by an execve operation, based on the old credentials pointed to
 * by @current->cred and the information set in @bprm->cred by the
 * bprm_creds_for_exec hook.  @bprm points to the linux_binprm structure.  This
 * hook is a good place to perform state changes on the process such as closing
 * open file descriptors to which access will no longer be granted when the
 * attributes are changed.  This is called immediately before commit_creds().
 */
void security_bprm_committing_creds(const struct linux_binprm *bprm)
{
        /* Void hook: nothing is reported back to the caller. */
        call_void_hook(bprm_committing_creds, bprm);
}

/**
 * security_bprm_committed_creds() - Tidy up after cred install during exec()
 * @bprm: binary program information
 *
 * Tidy up after the installation of the new security attributes of a process
 * being transformed by an execve operation.  The new credentials have, by this
 * point, been set to @current->cred.  @bprm points to the linux_binprm
 * structure.  This hook is a good place to perform state changes on the
 * process such as clearing out non-inheritable signal state.  This is called
 * immediately after commit_creds().
 */
void security_bprm_committed_creds(const struct linux_binprm *bprm)
{
        /* Void hook: nothing is reported back to the caller. */
        call_void_hook(bprm_committed_creds, bprm);
}

/**
 * security_fs_context_submount() - Initialise fc->security
 * @fc: new filesystem context
 * @reference: reference superblock for submount/remount
 *
 * Fill out the ->security field for a new fs_context.
 *
 * Return: Returns 0 on success or negative error code on failure.
 */
int security_fs_context_submount(struct fs_context *fc, struct super_block *reference)
{
        /* @reference is a super_block here and is forwarded unchanged. */
        return call_int_hook(fs_context_submount, fc, reference);
}

/**
 * security_fs_context_dup() - Duplicate a fs_context LSM blob
 * @fc: destination filesystem context
 * @src_fc: source filesystem context
 *
 * Allocate and attach a security structure to @fc->security.  This pointer is
 * initialised to NULL by the caller.  @fc indicates the new filesystem context.
 * @src_fc indicates the original filesystem context.
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        /* Argument order is destination (@fc) first, then source (@src_fc). */
        return call_int_hook(fs_context_dup, fc, src_fc);
}

/**
 * security_fs_context_parse_param() - Configure a filesystem context
 * @fc: filesystem context
 * @param: filesystem parameter
 *
 * Userspace provided a parameter to configure a superblock.  The LSM can
 * consume the parameter or return it to the caller for use elsewhere.
 *
 * Return: If the parameter is used by the LSM it should return 0, if it is
 *         returned to the caller -ENOPARAM is returned, otherwise a negative
 *         error code is returned.
 */
int security_fs_context_parse_param(struct fs_context *fc,
                                    struct fs_parameter *param)
{
        struct lsm_static_call *scall;
        int rc = -ENOPARAM;
        int trc;

        lsm_for_each_hook(scall, fs_context_parse_param) {
                trc = scall->hl->hook.fs_context_parse_param(fc, param);

                /* This hook did not claim the parameter; try the next one. */
                if (trc == -ENOPARAM)
                        continue;

                /* Any other non-zero value is a hard error: stop right away. */
                if (trc)
                        return trc;

                /* Claimed by this hook; later hooks still get to see it. */
                rc = 0;
        }

        /* 0 if at least one hook claimed it, otherwise still -ENOPARAM. */
        return rc;
}

/**
 * security_sb_alloc() - Allocate a super_block LSM blob
 * @sb: filesystem superblock
 *
 * Allocate and attach a security structure to the sb->s_security field.  The
 * s_security field is initialized to NULL when the structure is allocated.
 * @sb contains the super_block structure to be modified.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_sb_alloc(struct super_block *sb)
{
        int rc;

        rc = lsm_superblock_alloc(sb);
        if (unlikely(rc))
                goto out;

        /* On hook failure, hand @sb to security_sb_free() for cleanup. */
        rc = call_int_hook(sb_alloc_security, sb);
        if (unlikely(rc))
                security_sb_free(sb);
out:
        return rc;
}

/**
 * security_sb_delete() - Release super_block LSM associated objects
 * @sb: filesystem superblock
 *
 * Release objects tied to a superblock (e.g. inodes).  @sb contains the
 * super_block structure being released.
 */
void security_sb_delete(struct super_block *sb)
{
        /* Void hook: nothing is reported back to the caller. */
        call_void_hook(sb_delete, sb);
}

/**
 * security_sb_free() - Free a super_block LSM blob
 * @sb: filesystem superblock
 *
 * Deallocate and clear the sb->s_security field.  @sb contains the super_block
 * structure to be modified.
 */
void security_sb_free(struct super_block *sb)
{
        /* Run the sb_free_security hook before the blob itself is released. */
        call_void_hook(sb_free_security, sb);
        kfree(sb->s_security);
        /* Clear the pointer so it does not dangle after the kfree() above. */
        sb->s_security = NULL;
}

/**
 * security_free_mnt_opts() - Free memory associated with mount options
 * @mnt_opts: LSM processed mount options
 *
 * Free memory associated with @mnt_opts.
 */
void security_free_mnt_opts(void **mnt_opts)
{
        /* Only act when the caller's slot actually holds something. */
        if (*mnt_opts) {
                call_void_hook(sb_free_mnt_opts, *mnt_opts);
                /* Reset the caller's pointer once the hook has run. */
                *mnt_opts = NULL;
        }
}
EXPORT_SYMBOL(security_free_mnt_opts);

/**
 * security_sb_eat_lsm_opts() - Consume LSM mount options
 * @options: mount options
 * @mnt_opts: LSM processed mount options
 *
 * Eat (scan @options) and save them in @mnt_opts.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        /* @options and @mnt_opts are forwarded to the sb_eat_lsm_opts hook. */
        return call_int_hook(sb_eat_lsm_opts, options, mnt_opts);
}
EXPORT_SYMBOL(security_sb_eat_lsm_opts);

/**
 * security_sb_mnt_opts_compat() - Check if new mount options are allowed
 * @sb: filesystem superblock
 * @mnt_opts: new mount options
 *
 * Determine if the new mount options in @mnt_opts are allowed given the
 * existing mounted filesystem at @sb.  @sb superblock being compared.
 *
 * Return: Returns 0 if options are compatible.
 */
int security_sb_mnt_opts_compat(struct super_block *sb,
                                void *mnt_opts)
{
        /* The comparison itself is done by the sb_mnt_opts_compat hook. */
        return call_int_hook(sb_mnt_opts_compat, sb, mnt_opts);
}
EXPORT_SYMBOL(security_sb_mnt_opts_compat);

/**
 * security_sb_remount() - Verify no incompatible mount changes during remount
 * @sb: filesystem superblock
 * @mnt_opts: (re)mount options
 *
 * Extracts security system specific mount options and verifies no changes are
 * being made to those options.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_remount(struct super_block *sb,
                        void *mnt_opts)
{
        /* @mnt_opts is opaque here and is handed to the sb_remount hook. */
        return call_int_hook(sb_remount, sb, mnt_opts);
}
EXPORT_SYMBOL(security_sb_remount);

/**
 * security_sb_kern_mount() - Check if a kernel mount is allowed
 * @sb: filesystem superblock
 *
 * Mount this @sb if allowed by permissions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_kern_mount(const struct super_block *sb)
{
        /* The value produced by call_int_hook() is returned unmodified. */
        return call_int_hook(sb_kern_mount, sb);
}

/**
 * security_sb_show_options() - Output the mount options for a superblock
 * @m: output file
 * @sb: filesystem superblock
 *
 * Show (print on @m) mount options for this @sb.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        /* Nothing is written to @m here; that is left to the hook. */
        return call_int_hook(sb_show_options, m, sb);
}

/**
 * security_sb_statfs() - Check if accessing fs stats is allowed
 * @dentry: superblock handle
 *
 * Check permission before obtaining filesystem statistics for the filesystem
 * identified by @dentry, which is a handle on its superblock.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_statfs(struct dentry *dentry)
{
        /* @dentry is the only input; it is forwarded to the sb_statfs hook. */
        return call_int_hook(sb_statfs, dentry);
}

/**
 * security_sb_mount() - Check permission for mounting a filesystem
 * @dev_name: filesystem backing device
 * @path: mount point
 * @type: filesystem type
 * @flags: mount flags
 * @data: filesystem specific data
 *
 * Check permission before an object specified by @dev_name is mounted on the
 * mount point named by @path.  For an ordinary mount, @dev_name identifies a
 * device if the file system type requires a device.  For a remount
 * (@flags & MS_REMOUNT), @dev_name is irrelevant.  For a loopback/bind mount
 * (@flags & MS_BIND), @dev_name identifies the pathname of the object being
 * mounted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_mount(const char *dev_name, const struct path *path,
                      const char *type, unsigned long flags, void *data)
{
        /* @flags is not decoded here; all five arguments go to the hook. */
        return call_int_hook(sb_mount, dev_name, path, type, flags, data);
}

/**
 * security_sb_umount() - Check permission for unmounting a filesystem
 * @mnt: mounted filesystem
 * @flags: unmount flags
 *
 * Check permission before the @mnt file system is unmounted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_umount(struct vfsmount *mnt, int flags)
{
        /* @mnt and @flags are forwarded unchanged to the sb_umount hook. */
        return call_int_hook(sb_umount, mnt, flags);
}

/**
 * security_sb_pivotroot() - Check permissions for pivoting the rootfs
 * @old_path: new location for current rootfs
 * @new_path: location of the new rootfs
 *
 * Check permission before pivoting the root filesystem.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_pivotroot(const struct path *old_path,
                          const struct path *new_path)
{
        /* Argument order is @old_path first, then @new_path. */
        return call_int_hook(sb_pivotroot, old_path, new_path);
}

/**
 * security_sb_set_mnt_opts() - Set the mount options for a filesystem
 * @sb: filesystem superblock
 * @mnt_opts: binary mount options
 * @kern_flags: kernel flags (in)
 * @set_kern_flags: kernel flags (out)
 *
 * Set the security relevant mount options used for a superblock.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sb_set_mnt_opts(struct super_block *sb,
                             void *mnt_opts,
                             unsigned long kern_flags,
                             unsigned long *set_kern_flags)
{
        struct lsm_static_call *scall;
        int rc;

        /*
         * Value returned if the loop body below never runs: supplied options
         * yield -EOPNOTSUPP, no options yield the hook's default.
         */
        if (mnt_opts)
                rc = -EOPNOTSUPP;
        else
                rc = LSM_RET_DEFAULT(sb_set_mnt_opts);

        lsm_for_each_hook(scall, sb_set_mnt_opts) {
                rc = scall->hl->hook.sb_set_mnt_opts(sb, mnt_opts, kern_flags,
                                                     set_kern_flags);
                /* The first non-default result ends the walk. */
                if (rc != LSM_RET_DEFAULT(sb_set_mnt_opts))
                        return rc;
        }

        return rc;
}
EXPORT_SYMBOL(security_sb_set_mnt_opts);

/**
 * security_sb_clone_mnt_opts() - Duplicate superblock mount options
 * @oldsb: source superblock
 * @newsb: destination superblock
 * @kern_flags: kernel flags (in)
 * @set_kern_flags: kernel flags (out)
 *
 * Copy all security options from a given superblock to another.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sb_clone_mnt_opts(const struct super_block *oldsb,
                               struct super_block *newsb,
                               unsigned long kern_flags,
                               unsigned long *set_kern_flags)
{
        /* Source (@oldsb) comes first, destination (@newsb) second. */
        return call_int_hook(sb_clone_mnt_opts, oldsb, newsb,
                             kern_flags, set_kern_flags);
}
EXPORT_SYMBOL(security_sb_clone_mnt_opts);

/**
 * security_move_mount() - Check permissions for moving a mount
 * @from_path: source mount point
 * @to_path: destination mount point
 *
 * Check permission before a mount is moved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_move_mount(const struct path *from_path,
                        const struct path *to_path)
{
        /* Argument order is @from_path first, then @to_path. */
        return call_int_hook(move_mount, from_path, to_path);
}

/**
 * security_path_notify() - Check if setting a watch is allowed
 * @path: file path
 * @mask: event mask
 * @obj_type: file path type
 *
 * Check permissions before setting a watch on events as defined by @mask, on
 * an object at @path, whose type is defined by @obj_type.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_notify(const struct path *path, u64 mask,
                         unsigned int obj_type)
{
        /* @mask and @obj_type are not validated here, only forwarded. */
        return call_int_hook(path_notify, path, mask, obj_type);
}

/**
 * security_inode_alloc() - Allocate an inode LSM blob
 * @inode: the inode
 * @gfp: allocation flags
 *
 * Allocate and attach a security structure to @inode->i_security.  The
 * i_security field is initialized to NULL when the inode structure is
 * allocated.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_inode_alloc(struct inode *inode, gfp_t gfp)
{
        int rc;

        rc = lsm_inode_alloc(inode, gfp);
        if (unlikely(rc))
                goto out;

        /* On hook failure, hand @inode to security_inode_free() for cleanup. */
        rc = call_int_hook(inode_alloc_security, inode);
        if (unlikely(rc))
                security_inode_free(inode);
out:
        return rc;
}

/*
 * RCU callback that security_inode_free() passes to call_rcu(): runs the
 * inode_free_security_rcu hook on @head, then returns the memory to
 * lsm_inode_cache.
 */
static void inode_free_by_rcu(struct rcu_head *head)
{
        /* The rcu head is at the start of the inode blob */
        call_void_hook(inode_free_security_rcu, head);
        kmem_cache_free(lsm_inode_cache, head);
}

/**
 * security_inode_free() - Free an inode's LSM blob
 * @inode: the inode
 *
 * Release any LSM resources associated with @inode, although due to the
 * inode's RCU protections it is possible that the resources will not be
 * fully released until after the current RCU grace period has elapsed.
 *
 * It is important for LSMs to note that despite being present in a call to
 * security_inode_free(), @inode may still be referenced in a VFS path walk
 * and calls to security_inode_permission() may be made during, or after,
 * a call to security_inode_free().  For this reason the inode->i_security
 * field is released via a call_rcu() callback and any LSMs which need to
 * retain inode state for use in security_inode_permission() should only
 * release that state in the inode_free_security_rcu() LSM hook callback.
 */
void security_inode_free(struct inode *inode)
{
        /* The immediate hook always runs, even when no blob is attached. */
        call_void_hook(inode_free_security, inode);

        /* The blob is only queued for release here; see inode_free_by_rcu(). */
        if (inode->i_security)
                call_rcu((struct rcu_head *)inode->i_security,
                         inode_free_by_rcu);
}

/**
 * security_dentry_init_security() - Perform dentry initialization
 * @dentry: the dentry to initialize
 * @mode: mode used to determine resource type
 * @name: name of the last path component
 * @xattr_name: name of the security/LSM xattr
 * @lsmctx: pointer to the resulting LSM context
 *
 * Compute a context for a dentry as the inode is not yet available since NFSv4
 * has no label backed by an EA anyway.  It is important to note that
 * @xattr_name does not need to be free'd by the caller, it is a static string.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_dentry_init_security(struct dentry *dentry, int mode,
                                  const struct qstr *name,
                                  const char **xattr_name,
                                  struct lsm_context *lsmctx)
{
        /* @xattr_name and @lsmctx are pointers handed on to the hook. */
        return call_int_hook(dentry_init_security, dentry, mode, name,
                             xattr_name, lsmctx);
}
EXPORT_SYMBOL(security_dentry_init_security);

/**
 * security_dentry_create_files_as() - Perform dentry initialization
 * @dentry: the dentry to initialize
 * @mode: mode used to determine resource type
 * @name: name of the last path component
 * @old: creds to use for LSM context calculations
 * @new: creds to modify
 *
 * Compute a context for a dentry as the inode is not yet available and set
 * that context in passed in creds so that new files are created using that
 * context. Context is calculated using the passed in creds and not the creds
 * of the caller.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_dentry_create_files_as(struct dentry *dentry, int mode,
                                    const struct qstr *name,
                                    const struct cred *old, struct cred *new)
{
        /* @old is const, @new is not; both are forwarded to the hook. */
        return call_int_hook(dentry_create_files_as, dentry, mode,
                             name, old, new);
}
EXPORT_SYMBOL(security_dentry_create_files_as);

/**
 * security_inode_init_security() - Initialize an inode's LSM context
 * @inode: the inode
 * @dir: parent directory
 * @qstr: last component of the pathname
 * @initxattrs: callback function to write xattrs
 * @fs_data: filesystem specific data
 *
 * Obtain the security attribute name suffix and value to set on a newly
 * created inode and set up the incore security field for the new inode.  This
 * hook is called by the fs code as part of the inode creation transaction and
 * provides for atomic labeling of the inode, unlike the post_create/mkdir/...
 * hooks called by the VFS.
 *
 * The hook function is expected to populate the xattrs array, by calling
 * lsm_get_xattr_slot() to retrieve the slots reserved by the security module
 * with the lbs_xattr_count field of the lsm_blob_sizes structure.  For each
 * slot, the hook function should set ->name to the attribute name suffix
 * (e.g. selinux), to allocate ->value (will be freed by the caller) and set it
 * to the attribute value, to set ->value_len to the length of the value.  If
 * the security module does not use security attributes or does not wish to put
 * a security attribute on this particular inode, then it should return
 * -EOPNOTSUPP to skip this processing.
 *
 * Return: Returns 0 if the LSM successfully initialized all of the inode
 *         security attributes that are required, negative values otherwise.
 */
int security_inode_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr,
                                 const initxattrs initxattrs, void *fs_data)
{
        struct lsm_static_call *scall;
        struct xattr *new_xattrs = NULL;
        int xattr_count = 0;
        int ret = -EOPNOTSUPP;

        /* Nothing to do for private inodes or when no xattr slot is reserved. */
        if (unlikely(IS_PRIVATE(inode)) || !blob_sizes.lbs_xattr_count)
                return 0;

        if (initxattrs) {
                /* One entry beyond the reserved slots serves as terminator. */
                new_xattrs = kcalloc(blob_sizes.lbs_xattr_count + 1,
                                     sizeof(*new_xattrs), GFP_NOFS);
                if (!new_xattrs)
                        return -ENOMEM;
        }

        lsm_for_each_hook(scall, inode_init_security) {
                ret = scall->hl->hook.inode_init_security(inode, dir, qstr,
                                                          new_xattrs,
                                                          &xattr_count);
                /*
                 * -EOPNOTSUPP only means this LSM provides no xattr, so the
                 * remaining LSMs are still invoked.  Any other non-zero value
                 * is a real error and ends the walk.
                 */
                if (ret && ret != -EOPNOTSUPP)
                        goto out;
        }

        /* If initxattrs() is NULL, xattr_count is zero, so it is not called. */
        if (xattr_count)
                ret = initxattrs(inode, new_xattrs, fs_data);
out:
        /* Release the values filled in so far, last slot first. */
        while (xattr_count > 0) {
                xattr_count--;
                kfree(new_xattrs[xattr_count].value);
        }
        kfree(new_xattrs);

        /* -EOPNOTSUPP is not reported to the caller as an error. */
        if (ret == -EOPNOTSUPP)
                return 0;
        return ret;
}
EXPORT_SYMBOL(security_inode_init_security);

/**
 * security_inode_init_security_anon() - Initialize an anonymous inode
 * @inode: the inode
 * @name: the anonymous inode class
 * @context_inode: an optional related inode
 *
 * Set up the incore security field for the new anonymous inode and return
 * whether the inode creation is permitted by the security module or not.
 *
 * Return: Returns 0 on success, -EACCES if the security module denies the
 * creation of this inode, or another -errno upon other errors.
 */
int security_inode_init_security_anon(struct inode *inode,
                                      const struct qstr *name,
                                      const struct inode *context_inode)
{
        /* @context_inode is forwarded as given; it is not checked here. */
        return call_int_hook(inode_init_security_anon, inode, name,
                             context_inode);
}

#ifdef CONFIG_SECURITY_PATH
/**
 * security_path_mknod() - Check if creating a special file is allowed
 * @dir: parent directory
 * @dentry: new file
 * @mode: new file mode
 * @dev: device number
 *
 * Check permissions when creating a file. Note that this hook is called even
 * if mknod operation is being done for a regular file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_mknod(const struct path *dir, struct dentry *dentry,
                        umode_t mode, unsigned int dev)
{
        int rc;

        /* No hook call when the parent's backing inode is IS_PRIVATE(). */
        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_mknod, dir, dentry, mode, dev);

        return rc;
}
EXPORT_SYMBOL(security_path_mknod);

/**
 * security_path_post_mknod() - Update inode security after reg file creation
 * @idmap: idmap of the mount
 * @dentry: new file
 *
 * Update inode security field after a regular file has been created.
 */
void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        /* The hook only runs when @dentry's backing inode is not private. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry)))) {
                call_void_hook(path_post_mknod, idmap, dentry);
        }
}

/**
 * security_path_mkdir() - Check if creating a new directory is allowed
 * @dir: parent directory
 * @dentry: new directory
 * @mode: new directory mode
 *
 * Check permissions to create a new directory in the existing directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_mkdir(const struct path *dir, struct dentry *dentry,
                        umode_t mode)
{
        int rc;

        /* No hook call when the parent's backing inode is IS_PRIVATE(). */
        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_mkdir, dir, dentry, mode);

        return rc;
}
EXPORT_SYMBOL(security_path_mkdir);

/**
 * security_path_rmdir() - Check if removing a directory is allowed
 * @dir: parent directory
 * @dentry: directory to remove
 *
 * Check the permission to remove a directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_rmdir(const struct path *dir, struct dentry *dentry)
{
        int rc;

        /* The private check looks at the parent @dir, not at @dentry. */
        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_rmdir, dir, dentry);

        return rc;
}

/**
 * security_path_unlink() - Check if removing a hard link is allowed
 * @dir: parent directory
 * @dentry: file
 *
 * Check the permission to remove a hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_unlink(const struct path *dir, struct dentry *dentry)
{
        int rc;

        /* The private check looks at the parent @dir, not at @dentry. */
        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_unlink, dir, dentry);

        return rc;
}
EXPORT_SYMBOL(security_path_unlink);

/**
 * security_path_symlink() - Check if creating a symbolic link is allowed
 * @dir: parent directory
 * @dentry: symbolic link
 * @old_name: file pathname
 *
 * Check the permission to create a symbolic link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_symlink(const struct path *dir, struct dentry *dentry,
                          const char *old_name)
{
        int rc;

        /* No hook call when the parent's backing inode is IS_PRIVATE(). */
        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_symlink, dir, dentry, old_name);

        return rc;
}

/**
 * security_path_link() - Check if creating a hard link is allowed
 * @old_dentry: existing file
 * @new_dir: new parent directory
 * @new_dentry: new link
 *
 * Check permission before creating a new hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
                       struct dentry *new_dentry)
{
        int rc;

        /* The private check looks at the existing file, not at @new_dir. */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_link, old_dentry, new_dir, new_dentry);

        return rc;
}

/**
 * security_path_rename() - Check if renaming a file is allowed
 * @old_dir: parent directory of the old file
 * @old_dentry: the old file
 * @new_dir: parent directory of the new file
 * @new_dentry: the new file
 * @flags: flags
 *
 * Check for permission to rename a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
                         const struct path *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        /* A private source inode bypasses the hook. */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;

        /* So does a private target, which is only inspected when positive. */
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        return call_int_hook(path_rename, old_dir, old_dentry, new_dir,
                             new_dentry, flags);
}
EXPORT_SYMBOL(security_path_rename);

/**
 * security_path_truncate() - Check if truncating a file is allowed
 * @path: file
 *
 * Check permission before truncating the file indicated by path.  Note that
 * truncation permissions may also be checked based on already opened files,
 * using the security_file_truncate() hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_truncate(const struct path *path)
{
        int rc;

        /* No hook call when @path's backing inode is IS_PRIVATE(). */
        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_truncate, path);

        return rc;
}

/**
 * security_path_chmod() - Check if changing the file's mode is allowed
 * @path: file
 * @mode: new mode
 *
 * Check for permission to change a mode of the file @path. The new mode is
 * specified in @mode which is a bitmask of constants from
 * <include/uapi/linux/stat.h>.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chmod(const struct path *path, umode_t mode)
{
        int rc;

        /* No hook call when @path's backing inode is IS_PRIVATE(). */
        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_chmod, path, mode);

        return rc;
}

/**
 * security_path_chown() - Check if changing the file's owner/group is allowed
 * @path: file
 * @uid: file owner
 * @gid: file group
 *
 * Check for permission to change owner/group of a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        int rc;

        /* No hook call when @path's backing inode is IS_PRIVATE(). */
        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_chown, path, uid, gid);

        return rc;
}

/**
 * security_path_chroot() - Check if changing the root directory is allowed
 * @path: directory
 *
 * Check for permission to change root directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chroot(const struct path *path)
{
        /*
         * Unlike the other security_path_*() wrappers in this
         * CONFIG_SECURITY_PATH section, no IS_PRIVATE() short-circuit is
         * applied before calling the hook.
         */
        return call_int_hook(path_chroot, path);
}
#endif /* CONFIG_SECURITY_PATH */

/**
 * security_inode_create() - Check if creating a file is allowed
 * @dir: the parent directory
 * @dentry: the file being created
 * @mode: requested file mode
 *
 * Check permission to create a regular file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_create(struct inode *dir, struct dentry *dentry,
                          umode_t mode)
{
        int rc;

        /* No hook call when the parent directory inode is IS_PRIVATE(). */
        if (unlikely(IS_PRIVATE(dir)))
                rc = 0;
        else
                rc = call_int_hook(inode_create, dir, dentry, mode);

        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_create);

/**
 * security_inode_post_create_tmpfile() - Update inode security of new tmpfile
 * @idmap: idmap of the mount
 * @inode: inode of the new tmpfile
 *
 * Update inode security data after a tmpfile has been created.
 */
void security_inode_post_create_tmpfile(struct mnt_idmap *idmap,
                                        struct inode *inode)
{
        /* The hook only runs when @inode is not private. */
        if (!unlikely(IS_PRIVATE(inode))) {
                call_void_hook(inode_post_create_tmpfile, idmap, inode);
        }
}

/**
 * security_inode_link() - Check if creating a hard link is allowed
 * @old_dentry: existing file
 * @dir: new parent directory
 * @new_dentry: new link
 *
 * Check permission before creating a new hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                        struct dentry *new_dentry)
{
        int rc;

        /* The private check looks at the existing file, not at @dir. */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_link, old_dentry, dir, new_dentry);

        return rc;
}

/**
 * security_inode_unlink() - Check if removing a hard link is allowed
 * @dir: parent directory
 * @dentry: file
 *
 * Called to check whether the hard link @dentry may be removed from @dir.
 * Private backing inodes are allowed without calling any hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_unlink, dir, dentry);
        return rc;
}

/**
 * security_inode_symlink() - Check if creating a symbolic link is allowed
 * @dir: parent directory
 * @dentry: symbolic link
 * @old_name: existing filename
 *
 * Called to check whether the symbolic link @dentry, pointing at @old_name,
 * may be created in @dir.  A private @dir is allowed without calling any hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_symlink(struct inode *dir, struct dentry *dentry,
                           const char *old_name)
{
        int rc;

        if (unlikely(IS_PRIVATE(dir)))
                return 0;

        rc = call_int_hook(inode_symlink, dir, dentry, old_name);
        return rc;
}

/**
 * security_inode_mkdir() - Check if creating a new directory is allowed
 * @dir: parent directory
 * @dentry: new directory
 * @mode: new directory mode
 *
 * Called to check whether a new directory may be created inside the existing
 * directory whose inode is @dir.  A private @dir is allowed without calling
 * any hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_mkdir, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_mkdir);

/**
 * security_inode_rmdir() - Check if removing a directory is allowed
 * @dir: parent directory
 * @dentry: directory to be removed
 *
 * Called to check whether the directory @dentry may be removed from @dir.
 * The hooks are skipped when the backing inode of @dentry is private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_rmdir, dir, dentry);
        return rc;
}

/**
 * security_inode_mknod() - Check if creating a special file is allowed
 * @dir: parent directory
 * @dentry: new file
 * @mode: new file mode
 * @dev: device number
 *
 * Called when a special file (or a socket or a fifo file created via the
 * mknod system call) is about to be created.  When mknod is used to create a
 * regular file, the create hook is called instead of this one.  A private
 * @dir is allowed without calling any hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_mknod(struct inode *dir, struct dentry *dentry,
                         umode_t mode, dev_t dev)
{
        int rc;

        if (unlikely(IS_PRIVATE(dir)))
                return 0;

        rc = call_int_hook(inode_mknod, dir, dentry, mode, dev);
        return rc;
}

/**
 * security_inode_rename() - Check if renaming a file is allowed
 * @old_dir: parent directory of the old file
 * @old_dentry: the old file
 * @new_dir: parent directory of the new file
 * @new_dentry: the new file
 * @flags: flags
 *
 * Called to check whether a file or directory may be renamed.  The hooks are
 * skipped when the backing inode of @old_dentry is private, or when
 * @new_dentry is positive and its backing inode is private.
 *
 * If @flags contains RENAME_EXCHANGE, the hooks are first called with the old
 * and new arguments swapped; a non-zero result of that call is returned
 * without running the hooks in the forward direction.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        /* An exchange also moves the target, so check the reverse rename. */
        if (flags & RENAME_EXCHANGE) {
                rc = call_int_hook(inode_rename, new_dir, new_dentry,
                                   old_dir, old_dentry);
                if (rc)
                        return rc;
        }

        rc = call_int_hook(inode_rename, old_dir, old_dentry,
                           new_dir, new_dentry);
        return rc;
}

/**
 * security_inode_readlink() - Check if reading a symbolic link is allowed
 * @dentry: link
 *
 * Called to check whether the symbolic link @dentry may be read.  The hooks
 * are skipped when the backing inode of @dentry is private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_readlink(struct dentry *dentry)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_readlink, dentry);
        return rc;
}

/**
 * security_inode_follow_link() - Check if following a symbolic link is allowed
 * @dentry: link dentry
 * @inode: link inode
 * @rcu: true if in RCU-walk mode
 *
 * Called during pathname lookup to check whether a symbolic link may be
 * followed.  If @rcu is true, @inode is not stable.  A private @inode is
 * allowed without calling any hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
                               bool rcu)
{
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                rc = 0;
        else
                rc = call_int_hook(inode_follow_link, dentry, inode, rcu);
        return rc;
}

/**
 * security_inode_permission() - Check if accessing an inode is allowed
 * @inode: inode
 * @mask: access mask
 *
 * Called before an inode is accessed.  The existing Linux permission function
 * calls this hook, so a security module can use it to add checks on top of
 * the existing Linux permission checks.  Note that this hook runs when a file
 * is opened (as well as for many other operations), whereas the
 * file_security_ops permission hook runs when the actual read/write
 * operations are performed.  A private @inode is allowed without calling any
 * hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_permission(struct inode *inode, int mask)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_permission, inode, mask);
        return rc;
}

/**
 * security_inode_setattr() - Check if setting file attributes is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @attr: new attributes
 *
 * Called before file attributes are set.  The kernel calls notify_change from
 * several locations, whenever file attributes change (for example when a file
 * is truncated, on chown/chmod operations, when transferring disk quotas,
 * etc).  The hooks are skipped when the backing inode of @dentry is private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setattr(struct mnt_idmap *idmap,
                           struct dentry *dentry, struct iattr *attr)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        rc = call_int_hook(inode_setattr, idmap, dentry, attr);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_setattr);

/**
 * security_inode_post_setattr() - Update the inode after a setattr operation
 * @idmap: idmap of the mount
 * @dentry: file
 * @ia_valid: file attributes set
 *
 * Let the LSMs update the inode security field after file attributes were set
 * successfully.  Nothing is done when the backing inode of @dentry is private.
 */
void security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 int ia_valid)
{
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_setattr, idmap, dentry, ia_valid);
}

/**
 * security_inode_getattr() - Check if getting file attributes is allowed
 * @path: file
 *
 * Called before file attributes are obtained.  The hooks are skipped when the
 * backing inode of @path->dentry is private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getattr(const struct path *path)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_getattr, path);
        return rc;
}

/**
 * security_inode_setxattr() - Check if setting file xattrs is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: size of xattr value
 * @flags: flags
 *
 * Perform the permission checks required before the extended attributes
 * (xattrs) on @dentry are set.  Before the main LSM hooks run, this function
 * decides whether an additional capability check has to be enforced at the
 * LSM layer.
 *
 * The capability check, cap_inode_setxattr(), normally runs ahead of the LSM
 * hook implementations.  A LSM that wants to avoid it for a given xattr can
 * register a 'inode_xattr_skipcap' hook and return 1 for that xattr, which
 * leaves the LSM fully responsible for enforcing the access control for it.
 * The capability check is still performed when no 'inode_xattr_skipcap' hook
 * is registered, or when the hooks return 0 (the default return value).
 *
 * The hooks are skipped when the backing inode of @dentry is private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setxattr(struct mnt_idmap *idmap,
                            struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
{
        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /* enforce the capability checks at the lsm layer, if needed */
        if (call_int_hook(inode_xattr_skipcap, name) == 0) {
                int rc = cap_inode_setxattr(dentry, name, value, size, flags);

                if (rc)
                        return rc;
        }

        return call_int_hook(inode_setxattr, idmap, dentry, name, value, size,
                             flags);
}

/**
 * security_inode_set_acl() - Check if setting posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Called before the posix acls in @kacl, identified by @acl_name, are set on
 * @dentry.  The hooks are skipped when the backing inode of @dentry is
 * private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_set_acl(struct mnt_idmap *idmap,
                           struct dentry *dentry, const char *acl_name,
                           struct posix_acl *kacl)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_set_acl, idmap, dentry, acl_name,
                                   kacl);
        return rc;
}

/**
 * security_inode_post_set_acl() - Update inode security from posix acls set
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Let the LSMs update inode security data after the posix acls in @kacl,
 * identified by @acl_name, were set successfully on @dentry.  Nothing is done
 * when the backing inode of @dentry is private.
 */
void security_inode_post_set_acl(struct dentry *dentry, const char *acl_name,
                                 struct posix_acl *kacl)
{
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_set_acl, dentry, acl_name, kacl);
}

/**
 * security_inode_get_acl() - Check if reading posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Called before the posix acls identified by @acl_name are read from @dentry.
 * The hooks are skipped when the backing inode of @dentry is private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_get_acl(struct mnt_idmap *idmap,
                           struct dentry *dentry, const char *acl_name)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_get_acl, idmap, dentry, acl_name);
        return rc;
}

/**
 * security_inode_remove_acl() - Check if removing a posix acl is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Called before the posix acls identified by @acl_name are removed from
 * @dentry.  The hooks are skipped when the backing inode of @dentry is
 * private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_remove_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        rc = call_int_hook(inode_remove_acl, idmap, dentry, acl_name);
        return rc;
}

/**
 * security_inode_post_remove_acl() - Update inode security after rm posix acls
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Let the LSMs update inode security data after the posix acls identified by
 * @acl_name were removed successfully from @dentry in @idmap.  Nothing is
 * done when the backing inode of @dentry is private.
 */
void security_inode_post_remove_acl(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *acl_name)
{
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_remove_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_post_setxattr() - Update the inode after a setxattr operation
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: xattr value size
 * @flags: flags
 *
 * Let the LSMs update the inode security field after a successful setxattr
 * operation.  Nothing is done when the backing inode of @dentry is private.
 */
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_setxattr, dentry, name, value, size,
                               flags);
}

/**
 * security_inode_getxattr() - Check if xattr access is allowed
 * @dentry: file
 * @name: xattr name
 *
 * Called before the extended attributes identified by @name are obtained for
 * @dentry.  The hooks are skipped when the backing inode of @dentry is
 * private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getxattr(struct dentry *dentry, const char *name)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_getxattr, dentry, name);
        return rc;
}

/**
 * security_inode_listxattr() - Check if listing xattrs is allowed
 * @dentry: file
 *
 * Called before the list of extended attribute names is obtained for @dentry.
 * The hooks are skipped when the backing inode of @dentry is private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_listxattr(struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_listxattr, dentry);
        return rc;
}

/**
 * security_inode_removexattr() - Check if removing an xattr is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 *
 * Perform the permission checks required before the extended attribute
 * (xattr) @name is removed from @dentry.  Before the main LSM hooks run, this
 * function decides whether an additional capability check has to be enforced
 * at the LSM layer.
 *
 * The capability check, cap_inode_removexattr(), normally runs ahead of the
 * LSM hook implementations.  A LSM that wants to avoid it for a given xattr
 * can register a 'inode_xattr_skipcap' hook and return 1 for that xattr,
 * which leaves the LSM fully responsible for enforcing the access control for
 * it.  The capability check is still performed when no 'inode_xattr_skipcap'
 * hook is registered, or when the hooks return 0 (the default return value).
 *
 * The hooks are skipped when the backing inode of @dentry is private.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_removexattr(struct mnt_idmap *idmap,
                               struct dentry *dentry, const char *name)
{
        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /* enforce the capability checks at the lsm layer, if needed */
        if (call_int_hook(inode_xattr_skipcap, name) == 0) {
                int rc = cap_inode_removexattr(idmap, dentry, name);

                if (rc)
                        return rc;
        }

        return call_int_hook(inode_removexattr, idmap, dentry, name);
}

/**
 * security_inode_post_removexattr() - Update the inode after a removexattr op
 * @dentry: file
 * @name: xattr name
 *
 * Let the LSMs update the inode after a successful removexattr operation.
 * Nothing is done when the backing inode of @dentry is private.
 */
void security_inode_post_removexattr(struct dentry *dentry, const char *name)
{
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_removexattr, dentry, name);
}

/**
 * security_inode_file_setattr() - check if setting fsxattr is allowed
 * @dentry: file to set filesystem extended attributes on
 * @fa: extended attributes to set on the inode
 *
 * Called when the file_setattr() syscall or the FS_IOC_FSSETXATTR ioctl() is
 * invoked on an inode.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_file_setattr(struct dentry *dentry, struct file_kattr *fa)
{
        int rc = call_int_hook(inode_file_setattr, dentry, fa);

        return rc;
}

/**
 * security_inode_file_getattr() - check if retrieving fsxattr is allowed
 * @dentry: file to retrieve filesystem extended attributes from
 * @fa: extended attributes to get
 *
 * Called when the file_getattr() syscall or the FS_IOC_FSGETXATTR ioctl() is
 * invoked on an inode.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_file_getattr(struct dentry *dentry, struct file_kattr *fa)
{
        int rc = call_int_hook(inode_file_getattr, dentry, fa);

        return rc;
}

/**
 * security_inode_need_killpriv() - Check if security_inode_killpriv() required
 * @dentry: associated dentry
 *
 * Called after an inode has been changed, to find out whether
 * security_inode_killpriv() has to be called.
 *
 * Return: Return <0 on error to abort the inode change operation, return 0 if
 *         security_inode_killpriv() does not need to be called, return >0 if
 *         security_inode_killpriv() does need to be called.
 */
int security_inode_need_killpriv(struct dentry *dentry)
{
        int rc = call_int_hook(inode_need_killpriv, dentry);

        return rc;
}

/**
 * security_inode_killpriv() - The setuid bit is removed, update LSM state
 * @idmap: idmap of the mount
 * @dentry: associated dentry
 *
 * The setuid bit of @dentry is being removed; similar security labels should
 * be removed as well.  Called with the dentry->d_inode->i_mutex held.
 *
 * Return: Return 0 on success.  If error is returned, then the operation
 *         causing setuid bit removal is failed.
 */
int security_inode_killpriv(struct mnt_idmap *idmap,
                            struct dentry *dentry)
{
        int rc = call_int_hook(inode_killpriv, idmap, dentry);

        return rc;
}

/**
 * security_inode_getsecurity() - Get the xattr security label of an inode
 * @idmap: idmap of the mount
 * @inode: inode
 * @name: xattr name
 * @buffer: security label buffer
 * @alloc: allocation flag
 *
 * Retrieve, via @buffer, a copy of the extended attribute representation of
 * the security label associated with @name for @inode.  @name is the remainder
 * of the attribute name after the security prefix has been removed.  @alloc
 * selects whether the call returns a value through the buffer or only the
 * value length.
 *
 * For a private @inode no hook is called and the default return value of the
 * inode_getsecurity hook is returned.
 *
 * Return: Returns size of buffer on success.
 */
int security_inode_getsecurity(struct mnt_idmap *idmap,
                               struct inode *inode, const char *name,
                               void **buffer, bool alloc)
{
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                rc = LSM_RET_DEFAULT(inode_getsecurity);
        else
                rc = call_int_hook(inode_getsecurity, idmap, inode, name,
                                   buffer, alloc);
        return rc;
}

/**
 * security_inode_setsecurity() - Set the xattr security label of an inode
 * @inode: inode
 * @name: xattr name
 * @value: security label
 * @size: length of security label
 * @flags: flags
 *
 * Set the security label associated with @name for @inode from the extended
 * attribute value @value, which is @size bytes long.  @flags may be
 * XATTR_CREATE, XATTR_REPLACE, or 0.  @name is the remainder of the attribute
 * name after the "security." prefix has been removed.
 *
 * For a private @inode no hook is called and the default return value of the
 * inode_setsecurity hook is returned.
 *
 * Return: Returns 0 on success.
 */
int security_inode_setsecurity(struct inode *inode, const char *name,
                               const void *value, size_t size, int flags)
{
        int rc = LSM_RET_DEFAULT(inode_setsecurity);

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_setsecurity, inode, name, value, size,
                                   flags);
        return rc;
}

/**
 * security_inode_listsecurity() - List the xattr security label names
 * @inode: inode
 * @buffer: buffer
 * @buffer_size: size of buffer
 *
 * Copy the extended attribute names for the security labels associated with
 * @inode into @buffer, whose maximum size is @buffer_size.  @buffer may be
 * NULL to request the size of the buffer required.  For a private @inode no
 * hook is called and 0 is returned.
 *
 * Return: Returns number of bytes used/required on success.
 */
int security_inode_listsecurity(struct inode *inode,
                                char *buffer, size_t buffer_size)
{
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                rc = 0;
        else
                rc = call_int_hook(inode_listsecurity, inode, buffer,
                                   buffer_size);
        return rc;
}
EXPORT_SYMBOL(security_inode_listsecurity);

/**
 * security_inode_getlsmprop() - Get an inode's LSM data
 * @inode: inode
 * @prop: lsm specific information to return
 *
 * Get the lsm specific information associated with the inode.  The
 * inode_getlsmprop hooks are called with @inode and @prop; this function
 * itself returns nothing.
 */
void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
{
        call_void_hook(inode_getlsmprop, inode, prop);
}

/**
 * security_inode_copy_up() - Create new creds for an overlayfs copy-up op
 * @src: union dentry of copy-up file
 * @new: newly created creds
 *
 * Called when a file is about to be copied up from the lower layer to the
 * upper layer of an overlay filesystem.  A security module can prepare a set
 * of new creds, modify them as needed and return them.  The caller switches
 * to the new creds temporarily to create the new file and then releases the
 * newly allocated creds.
 *
 * Return: Returns 0 on success or a negative error code on error.
 */
int security_inode_copy_up(struct dentry *src, struct cred **new)
{
        int rc = call_int_hook(inode_copy_up, src, new);

        return rc;
}
EXPORT_SYMBOL(security_inode_copy_up);

/**
 * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op
 * @src: union dentry of copy-up file
 * @name: xattr name
 *
 * Filter the xattrs that are copied when a unioned file is copied up from a
 * lower layer to the union/overlay layer.  The caller is responsible for
 * reading and writing the xattrs; this hook is merely a filter.
 *
 * Return: Returns 0 to accept the xattr, -ECANCELED to discard the xattr,
 *         -EOPNOTSUPP if the security module does not know about attribute,
 *         or a negative error code to abort the copy up.
 */
int security_inode_copy_up_xattr(struct dentry *src, const char *name)
{
        /*
         * The hook result is returned as is: it is either the hook default
         * value or whatever a security module answered.
         */
        return call_int_hook(inode_copy_up_xattr, src, name);
}
EXPORT_SYMBOL(security_inode_copy_up_xattr);

/**
 * security_inode_setintegrity() - Set the inode's integrity data
 * @inode: inode
 * @type: type of integrity, e.g. hash digest, signature, etc
 * @value: the integrity value
 * @size: size of the integrity value
 *
 * Register a verified integrity measurement of an inode with the LSMs.  LSMs
 * should free the previously saved data if @value is NULL.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_inode_setintegrity(const struct inode *inode,
                                enum lsm_integrity_type type, const void *value,
                                size_t size)
{
        int rc = call_int_hook(inode_setintegrity, inode, type, value, size);

        return rc;
}
EXPORT_SYMBOL(security_inode_setintegrity);

/**
 * security_kernfs_init_security() - Init LSM context for a kernfs node
 * @kn_dir: parent kernfs node
 * @kn: the kernfs node to initialize
 *
 * Initialize the security context of the newly created kernfs node @kn, based
 * on its own attributes and those of its parent @kn_dir.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernfs_init_security(struct kernfs_node *kn_dir,
                                  struct kernfs_node *kn)
{
        int rc = call_int_hook(kernfs_init_security, kn_dir, kn);

        return rc;
}

/**
 * security_file_permission() - Check file permissions
 * @file: file
 * @mask: requested permissions
 *
 * Check file permissions before an open file is accessed.  Various operations
 * that read or write files call this hook.  A security module can use it to
 * perform additional checking on these operations, e.g. to revalidate
 * permissions on use to support privilege bracketing or policy changes.
 *
 * This hook is used when the actual read/write operations are performed,
 * whereas the inode_security_ops hook is called when a file is opened (as
 * well as for many other operations).  Although this hook can be used to
 * revalidate permissions for various system call operations that read or
 * write files, it does not address the revalidation of permissions for
 * memory-mapped files.  Security modules must handle that separately if they
 * need such revalidation.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_permission(struct file *file, int mask)
{
        int rc = call_int_hook(file_permission, file, mask);

        return rc;
}

/**
 * security_file_alloc() - Allocate and init a file's LSM blob
 * @file: the file
 *
 * Allocate and attach a security structure to the file->f_security field.  The
 * security field is initialized to NULL when the structure is first created.
 *
 * The blob is set up by lsm_file_alloc(); afterwards the file_alloc_security
 * hooks are run.  If a hook fails, security_file_free() is called for @file
 * before the error is returned.
 *
 * Return: Return 0 if the hook is successful and permission is granted.
 */
int security_file_alloc(struct file *file)
{
        int rc;

        rc = lsm_file_alloc(file);
        if (!rc) {
                rc = call_int_hook(file_alloc_security, file);
                /* Undo the blob allocation when a hook rejects the file. */
                if (unlikely(rc))
                        security_file_free(file);
        }
        return rc;
}

/**
 * security_file_release() - Perform actions before releasing the file ref
 * @file: the file
 *
 * Perform actions before releasing the last reference to a file.  The
 * file_release hooks are called with @file; this function itself returns
 * nothing.
 */
void security_file_release(struct file *file)
{
        call_void_hook(file_release, file);
}

/**
 * security_file_free() - Free a file's LSM blob
 * @file: the file
 *
 * Deallocate and free any security structures stored in file->f_security.
 * The file_free_security hooks run first; afterwards file->f_security is
 * reset to NULL and the blob, if there is one, is returned to lsm_file_cache.
 */
void security_file_free(struct file *file)
{
        void *blob;

        call_void_hook(file_free_security, file);

        blob = file->f_security;
        if (!blob)
                return;

        /* Clear the pointer before the blob goes back to the cache. */
        file->f_security = NULL;
        kmem_cache_free(lsm_file_cache, blob);
}

/**
 * security_file_ioctl() - Check if an ioctl is allowed
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Check permission for an ioctl operation on @file.  @arg sometimes
 * represents a user space pointer; in other cases it may be a simple integer
 * value.  When @arg represents a user space pointer, the security module
 * should never use it.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc = call_int_hook(file_ioctl, file, cmd, arg);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_ioctl);

/**
 * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Compat counterpart of security_file_ioctl(), which correctly handles 32-bit
 * processes running on 64-bit kernels.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        int rc = call_int_hook(file_ioctl_compat, file, cmd, arg);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_ioctl_compat);

/*
 * Work out the protection that is handed to the mmap_file hooks for a request
 * of @prot on @file (NULL for an anonymous mapping): PROT_EXEC is added when
 * the task has READ_IMPLIES_EXEC set and the mapping qualifies, otherwise
 * @prot is returned unchanged.
 */
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
        const unsigned long rx_bits = prot & (PROT_READ | PROT_EXEC);

        /*
         * Do we have PROT_READ without PROT_EXEC, and does the application
         * expect PROT_READ to imply PROT_EXEC?  If not, nothing to talk
         * about...
         */
        if (rx_bits != PROT_READ)
                return prot;
        if (!(current->personality & READ_IMPLIES_EXEC))
                return prot;

        /* An anonymous mapping always gets the implied PROT_EXEC. */
        if (!file)
                return prot | PROT_EXEC;

        /* anything on noexec mount won't get PROT_EXEC */
        if (path_noexec(&file->f_path))
                return prot;

#ifndef CONFIG_MMU
        /*
         * On !MMU the file additionally needs NOMMU_MAP_EXEC (== VM_MAYEXEC)
         * when it reports its mmap capabilities.
         */
        if (file->f_op->mmap_capabilities) {
                unsigned int caps = file->f_op->mmap_capabilities(file);

                if (!(caps & NOMMU_MAP_EXEC))
                        return prot;
        }
#endif
        return prot | PROT_EXEC;
}

/**
 * security_mmap_file() - Check if mmap'ing a file is allowed
 * @file: file
 * @prot: requested protection
 * @flags: flags
 *
 * Check permissions for a mmap operation.  The @file may be NULL, e.g. if
 * mapping anonymous memory.  The hooks receive @prot as given, followed by
 * the protection computed from it by mmap_prot().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_mmap_file(struct file *file, unsigned long prot,
                       unsigned long flags)
{
        int rc = call_int_hook(mmap_file, file, prot, mmap_prot(file, prot),
                               flags);

        return rc;
}

/**
 * security_mmap_addr() - Check if mmap'ing an address is allowed
 * @addr: address
 *
 * Check permissions for a mmap operation that targets @addr.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_mmap_addr(unsigned long addr)
{
        int rc = call_int_hook(mmap_addr, addr);

        return rc;
}

/**
 * security_file_mprotect() - Check if changing memory protections is allowed
 * @vma: memory region
 * @reqprot: application requested protection
 * @prot: protection applied by the kernel
 *
 * Called before the access permissions of the memory region @vma are changed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                           unsigned long prot)
{
        int rc = call_int_hook(file_mprotect, vma, reqprot, prot);

        return rc;
}

/**
 * security_file_lock() - Check if a file lock is allowed
 * @file: file
 * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK)
 *
 * Called before a file locking operation is performed.  The hook mediates
 * both flock and fcntl style locks.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_lock(struct file *file, unsigned int cmd)
{
        int rc = call_int_hook(file_lock, file, cmd);

        return rc;
}

/**
 * security_file_fcntl() - Check if fcntl() op is allowed
 * @file: file
 * @cmd: fcntl command
 * @arg: command argument
 *
 * Called before the file operation specified by @cmd is allowed to be
 * performed on @file.  @arg sometimes represents a user space pointer; in
 * other cases it may be a simple integer value.  When @arg represents a user
 * space pointer, the security module should never use it.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc = call_int_hook(file_fcntl, file, cmd, arg);

        return rc;
}

/**
 * security_file_set_fowner() - Set the file owner info in the LSM blob
 * @file: the file
 *
 * Save owner security information (typically from current->security) in
 * file->f_security for later use by the send_sigiotask hook.
 *
 * This hook is called with file->f_owner.lock held.
 *
 * This function returns nothing; the file_set_fowner hooks cannot report a
 * failure to the caller.
 */
void security_file_set_fowner(struct file *file)
{
        call_void_hook(file_set_fowner, file);
}

/**
 * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed
 * @tsk: target task
 * @fown: signal sender
 * @sig: signal to be sent, SIGIO is sent if 0
 *
 * Check permission for the file owner @fown to send SIGIO or SIGURG to the
 * process @tsk.  This hook is sometimes called from interrupt.  The
 * fown_struct, @fown, is never outside the context of a struct file, so the
 * file structure (and associated security information) can always be
 * obtained: container_of(fown, struct file, f_owner).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_send_sigiotask(struct task_struct *tsk,
                                 struct fown_struct *fown, int sig)
{
        int rc = call_int_hook(file_send_sigiotask, tsk, fown, sig);

        return rc;
}

/**
 * security_file_receive() - Check if receiving a file via IPC is allowed
 * @file: file being received
 *
 * Lets security modules control whether a process may receive an open file
 * descriptor via socket IPC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_receive(struct file *file)
{
        int rc = call_int_hook(file_receive, file);

        return rc;
}

/**
 * security_file_open() - Save open() time state for later use by the LSM
 * @file: the file
 *
 * Record the open-time permission checking state so that file_permission can
 * use it later, and recheck access if anything has changed since
 * inode_permission.
 *
 * Whether the file is being opened for execution (e.g. an execve(2) call),
 * directly or indirectly (e.g. ELF's ld.so), can be determined by testing
 * file->f_flags & __FMODE_EXEC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_open(struct file *file)
{
        int rc;

        rc = call_int_hook(file_open, file);
        return rc;
}

/**
 * security_file_post_open() - Evaluate a file after it has been opened
 * @file: the file
 * @mask: access mask
 *
 * Examine an already opened file together with the access mask that was
 * requested with open().  This is meant for LSMs that need the file content
 * to be available before they can make a decision.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_post_open(struct file *file, int mask)
{
        int rc;

        rc = call_int_hook(file_post_open, file, mask);
        return rc;
}
EXPORT_SYMBOL_GPL(security_file_post_open);

/**
 * security_file_truncate() - Check if truncating a file is allowed
 * @file: file
 *
 * Called before a file is truncated, i.e. via ftruncate.  Truncation
 * permission may additionally be checked on a path basis through the
 * @path_truncate hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_truncate(struct file *file)
{
        int rc;

        rc = call_int_hook(file_truncate, file);
        return rc;
}

/**
 * security_task_alloc() - Allocate a task's LSM blob
 * @task: the task
 * @clone_flags: flags indicating what is being shared
 *
 * Set up the task-related LSM resources: first the blob, then the
 * task_alloc hooks.
 *
 * Return: Returns a zero on success, negative values on failure.
 */
int security_task_alloc(struct task_struct *task, u64 clone_flags)
{
        int ret;

        ret = lsm_task_alloc(task);
        if (ret)
                return ret;

        ret = call_int_hook(task_alloc, task, clone_flags);
        if (unlikely(ret)) {
                /* A hook failed: undo the setup done for @task so far. */
                security_task_free(task);
                return ret;
        }

        return 0;
}

/**
 * security_task_free() - Free a task's LSM blob and related resources
 * @task: task
 *
 * Handle release of task-related resources: run the task_free hooks, then
 * free task->security.  Note that this can be called from interrupt context.
 */
void security_task_free(struct task_struct *task)
{
        /* The hooks run first, while task->security is still allocated. */
        call_void_hook(task_free, task);

        /* Drop the blob and clear the pointer so it cannot be reused. */
        kfree(task->security);
        task->security = NULL;
}

/**
 * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer
 * @cred: credentials
 * @gfp: gfp flags
 *
 * Allocate just enough memory, attached to @cred, that a later
 * cred_transfer() will not get ENOMEM.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        int ret;

        ret = lsm_cred_alloc(cred, gfp);
        if (ret)
                return ret;

        ret = call_int_hook(cred_alloc_blank, cred, gfp);
        if (unlikely(ret)) {
                /* A hook failed: release what was attached to @cred. */
                security_cred_free(cred);
                return ret;
        }

        return 0;
}

/**
 * security_cred_free() - Free the cred's LSM blob and associated resources
 * @cred: credentials
 *
 * Run the cred_free hooks, then deallocate and clear the cred->security
 * field of a set of credentials.  Does nothing when cred->security is NULL.
 */
void security_cred_free(struct cred *cred)
{
        /*
         * prepare_creds() has a failure path that can end up here
         * before ->security was ever set; there is nothing to undo then.
         */
        if (unlikely(!cred->security))
                return;

        call_void_hook(cred_free, cred);

        kfree(cred->security);
        cred->security = NULL;
}

/**
 * security_prepare_creds() - Prepare a new set of credentials
 * @new: new credentials
 * @old: original credentials
 * @gfp: gfp flags
 *
 * Prepare the new set of credentials @new by copying the data from the old
 * set @old.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
{
        int ret;

        ret = lsm_cred_alloc(new, gfp);
        if (ret)
                return ret;

        ret = call_int_hook(cred_prepare, new, old, gfp);
        if (unlikely(ret)) {
                /* A hook failed: release what was attached to @new. */
                security_cred_free(new);
                return ret;
        }

        return 0;
}

/**
 * security_transfer_creds() - Transfer creds
 * @new: target credentials
 * @old: original credentials
 *
 * Transfer data from the original creds @old to the new creds @new by way of
 * the cred_transfer hook.
 */
void security_transfer_creds(struct cred *new, const struct cred *old)
{
        /* Void hook: nothing is reported back to the caller. */
        call_void_hook(cred_transfer, new, old);
}

/**
 * security_cred_getsecid() - Get the secid from a set of credentials
 * @c: credentials
 * @secid: secid value
 *
 * Retrieve the security identifier of the cred structure @c.  In case of
 * failure, @secid will be set to zero.
 */
void security_cred_getsecid(const struct cred *c, u32 *secid)
{
        /* Start from 0 so @secid holds a defined value whatever the hook does. */
        *secid = 0;
        call_void_hook(cred_getsecid, c, secid);
}
EXPORT_SYMBOL(security_cred_getsecid);

/**
 * security_cred_getlsmprop() - Get the LSM data from a set of credentials
 * @c: credentials
 * @prop: destination for the LSM data
 *
 * Retrieve the security data of the cred structure @c.  In case of
 * failure, @prop will be cleared.
 */
void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
{
        /* Initialize @prop before the hook gets a chance to fill it in. */
        lsmprop_init(prop);
        call_void_hook(cred_getlsmprop, c, prop);
}
EXPORT_SYMBOL(security_cred_getlsmprop);

/**
 * security_kernel_act_as() - Set the kernel credentials to act as secid
 * @new: credentials
 * @secid: secid
 *
 * Make the credentials @new of a kernel service act as @secid (subjective
 * context).  The current task must be the one that nominated @secid.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_act_as(struct cred *new, u32 secid)
{
        int rc;

        rc = call_int_hook(kernel_act_as, new, secid);
        return rc;
}

/**
 * security_kernel_create_files_as() - Set file creation context using an inode
 * @new: target credentials
 * @inode: reference inode
 *
 * Make the file creation context of the credentials @new match the objective
 * context of @inode.  The current task must be the one that nominated
 * @inode.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        int rc;

        rc = call_int_hook(kernel_create_files_as, new, inode);
        return rc;
}

/**
 * security_kernel_module_request() - Check if loading a module is allowed
 * @kmod_name: module name
 *
 * Controls the ability to make the kernel automatically upcall to userspace
 * so that userspace loads the kernel module named @kmod_name.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_module_request(char *kmod_name)
{
        int rc;

        rc = call_int_hook(kernel_module_request, kmod_name);
        return rc;
}

/**
 * security_kernel_read_file() - Read a file specified by userspace
 * @file: file
 * @id: file identifier
 * @contents: true if security_kernel_post_read_file() will be called
 *
 * Read a file specified by userspace.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
                              bool contents)
{
        int rc;

        rc = call_int_hook(kernel_read_file, file, id, contents);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_read_file);

/**
 * security_kernel_post_read_file() - Read a file specified by userspace
 * @file: file
 * @buf: file contents
 * @size: size of file contents
 * @id: file identifier
 *
 * Read a file specified by userspace.  This must be paired with a prior
 * security_kernel_read_file() call that indicated this hook would also be
 * called, see security_kernel_read_file() for more information.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
                                   enum kernel_read_file_id id)
{
        int rc;

        rc = call_int_hook(kernel_post_read_file, file, buf, size, id);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_read_file);

/**
 * security_kernel_load_data() - Load data provided by userspace
 * @id: data identifier
 * @contents: true if security_kernel_post_load_data() will be called
 *
 * Load data that has been provided by userspace.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        int rc;

        rc = call_int_hook(kernel_load_data, id, contents);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_load_data);

/**
 * security_kernel_post_load_data() - Load userspace data from a non-file source
 * @buf: data
 * @size: size of data
 * @id: data identifier
 * @description: text description of data, specific to the id value
 *
 * Load data that comes from a non-file source (usually a userspace buffer).
 * A prior security_kernel_load_data() call must have indicated that this hook
 * would also be called, see security_kernel_load_data() for more information.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_post_load_data(char *buf, loff_t size,
                                   enum kernel_load_data_id id,
                                   char *description)
{
        int rc;

        rc = call_int_hook(kernel_post_load_data, buf, size, id, description);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_load_data);

/**
 * security_task_fix_setuid() - Update LSM with new user id attributes
 * @new: updated credentials
 * @old: credentials being replaced
 * @flags: LSM_SETID_* flag values
 *
 * Update the module's state after one or more of the user identity
 * attributes of the current process have been set.  @flags tells which of the
 * set*uid system calls invoked this hook.  @new is the set of credentials
 * that will be installed.  Modifications should be made to this rather than
 * to @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setuid(struct cred *new, const struct cred *old,
                             int flags)
{
        int rc;

        rc = call_int_hook(task_fix_setuid, new, old, flags);
        return rc;
}

/**
 * security_task_fix_setgid() - Update LSM with new group id attributes
 * @new: updated credentials
 * @old: credentials being replaced
 * @flags: LSM_SETID_* flag value
 *
 * Update the module's state after one or more of the group identity
 * attributes of the current process have been set.  @flags tells which of the
 * set*gid system calls invoked this hook.  @new is the set of credentials
 * that will be installed.  Modifications should be made to this rather than
 * to @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setgid(struct cred *new, const struct cred *old,
                             int flags)
{
        int rc;

        rc = call_int_hook(task_fix_setgid, new, old, flags);
        return rc;
}

/**
 * security_task_fix_setgroups() - Update LSM with new supplementary groups
 * @new: updated credentials
 * @old: credentials being replaced
 *
 * Update the module's state after the supplementary group identity
 * attributes of the current process have been set.  @new is the set of
 * credentials that will be installed.  Modifications should be made to this
 * rather than to @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setgroups(struct cred *new, const struct cred *old)
{
        int rc;

        rc = call_int_hook(task_fix_setgroups, new, old);
        return rc;
}

/**
 * security_task_setpgid() - Check if setting the pgid is allowed
 * @p: task being modified
 * @pgid: new pgid
 *
 * Called before the process group identifier of the process @p is set to
 * @pgid.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setpgid(struct task_struct *p, pid_t pgid)
{
        int rc;

        rc = call_int_hook(task_setpgid, p, pgid);
        return rc;
}

/**
 * security_task_getpgid() - Check if getting the pgid is allowed
 * @p: task
 *
 * Called before the process group identifier of the process @p is read.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getpgid(struct task_struct *p)
{
        int rc;

        rc = call_int_hook(task_getpgid, p);
        return rc;
}

/**
 * security_task_getsid() - Check if getting the session id is allowed
 * @p: task
 *
 * Called before the session identifier of the process @p is read.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getsid(struct task_struct *p)
{
        int rc;

        rc = call_int_hook(task_getsid, p);
        return rc;
}

/**
 * security_current_getlsmprop_subj() - Current task's subjective LSM data
 * @prop: lsm specific information
 *
 * Retrieve the subjective security identifier of the current task and return
 * it in @prop.
 */
void security_current_getlsmprop_subj(struct lsm_prop *prop)
{
        /* Initialize @prop before the hook gets a chance to fill it in. */
        lsmprop_init(prop);
        call_void_hook(current_getlsmprop_subj, prop);
}
EXPORT_SYMBOL(security_current_getlsmprop_subj);

/**
 * security_task_getlsmprop_obj() - Get a task's objective LSM data
 * @p: target task
 * @prop: lsm specific information
 *
 * Retrieve the objective security identifier of the task_struct in @p and
 * return it in @prop.
 */
void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop)
{
        /* Initialize @prop before the hook gets a chance to fill it in. */
        lsmprop_init(prop);
        call_void_hook(task_getlsmprop_obj, p, prop);
}
EXPORT_SYMBOL(security_task_getlsmprop_obj);

/**
 * security_task_setnice() - Check if setting a task's nice value is allowed
 * @p: target task
 * @nice: nice value
 *
 * Called before the nice value of @p is set to @nice.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setnice(struct task_struct *p, int nice)
{
        int rc;

        rc = call_int_hook(task_setnice, p, nice);
        return rc;
}

/**
 * security_task_setioprio() - Check if setting a task's ioprio is allowed
 * @p: target task
 * @ioprio: ioprio value
 *
 * Called before the ioprio value of @p is set to @ioprio.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setioprio(struct task_struct *p, int ioprio)
{
        int rc;

        rc = call_int_hook(task_setioprio, p, ioprio);
        return rc;
}

/**
 * security_task_getioprio() - Check if getting a task's ioprio is allowed
 * @p: task
 *
 * Called before the ioprio value of @p is read.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getioprio(struct task_struct *p)
{
        int rc;

        rc = call_int_hook(task_getioprio, p);
        return rc;
}

/**
 * security_task_prlimit() - Check if get/setting resources limits is allowed
 * @cred: current task credentials
 * @tcred: target task credentials
 * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both
 *
 * Called before the resource limits of another task are read and/or
 * written.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_prlimit(const struct cred *cred, const struct cred *tcred,
                          unsigned int flags)
{
        int rc;

        rc = call_int_hook(task_prlimit, cred, tcred, flags);
        return rc;
}

/**
 * security_task_setrlimit() - Check if setting a new rlimit value is allowed
 * @p: target task's group leader
 * @resource: resource whose limit is being set
 * @new_rlim: new resource limit
 *
 * Called before the limit of @resource for process @p is set to @new_rlim.
 * The old resource limit values can be examined by dereferencing
 * (p->signal->rlim + resource).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                            struct rlimit *new_rlim)
{
        int rc;

        rc = call_int_hook(task_setrlimit, p, resource, new_rlim);
        return rc;
}

/**
 * security_task_setscheduler() - Check if setting sched policy/param is allowed
 * @p: target task
 *
 * Called before the scheduling policy and/or parameters of process @p are
 * set.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setscheduler(struct task_struct *p)
{
        int rc;

        rc = call_int_hook(task_setscheduler, p);
        return rc;
}

/**
 * security_task_getscheduler() - Check if getting scheduling info is allowed
 * @p: target task
 *
 * Called before scheduling information for process @p is obtained.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getscheduler(struct task_struct *p)
{
        int rc;

        rc = call_int_hook(task_getscheduler, p);
        return rc;
}

/**
 * security_task_movememory() - Check if moving memory is allowed
 * @p: task
 *
 * Called before memory owned by process @p is moved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_movememory(struct task_struct *p)
{
        int rc;

        rc = call_int_hook(task_movememory, p);
        return rc;
}

/**
 * security_task_kill() - Check if sending a signal is allowed
 * @p: target process
 * @info: signal information
 * @sig: signal value
 * @cred: credentials of the signal sender, NULL if @current
 *
 * Called before signal @sig is sent to @p.  @info can be NULL, the constant
 * 1, or a pointer to a kernel_siginfo structure.  When @info is 1 or
 * SI_FROMKERNEL(info) is true, the signal should be viewed as coming from the
 * kernel and should typically be permitted.  SIGIO signals are handled
 * separately by the send_sigiotask hook in file_security_ops.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                       int sig, const struct cred *cred)
{
        int rc;

        rc = call_int_hook(task_kill, p, info, sig, cred);
        return rc;
}

/**
 * security_task_prctl() - Check if a prctl op is allowed
 * @option: operation
 * @arg2: argument
 * @arg3: argument
 * @arg4: argument
 * @arg5: argument
 *
 * Called before a process control operation is performed on the current
 * process.
 *
 * Return: Return -ENOSYS if no-one wanted to handle this op, any other value
 *         to cause prctl() to return immediately with that value.
 */
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                        unsigned long arg4, unsigned long arg5)
{
        struct lsm_static_call *scall;
        int result = LSM_RET_DEFAULT(task_prctl);
        int hook_rc;

        lsm_for_each_hook(scall, task_prctl) {
                hook_rc = scall->hl->hook.task_prctl(option, arg2, arg3,
                                                     arg4, arg5);

                /* A hook answering with the default changes nothing. */
                if (hook_rc == LSM_RET_DEFAULT(task_prctl))
                        continue;

                /*
                 * Remember the answer.  A zero answer lets the remaining
                 * hooks run; any other answer ends the walk.
                 */
                result = hook_rc;
                if (hook_rc)
                        break;
        }

        return result;
}

/**
 * security_task_to_inode() - Set the security attributes of a task's inode
 * @p: task
 * @inode: inode
 *
 * Set the security attributes for an inode based on an associated task's
 * security attributes, e.g. for /proc/pid inodes.
 */
void security_task_to_inode(struct task_struct *p, struct inode *inode)
{
        /* Void hook: nothing is reported back to the caller. */
        call_void_hook(task_to_inode, p, inode);
}

/**
 * security_create_user_ns() - Check if creating a new userns is allowed
 * @cred: prepared creds
 *
 * Called prior to the creation of a new user namespace.
 *
 * Return: Returns 0 if successful, otherwise < 0 error code.
 */
int security_create_user_ns(const struct cred *cred)
{
        int rc;

        rc = call_int_hook(userns_create, cred);
        return rc;
}

/**
 * security_ipc_permission() - Check if sysv ipc access is allowed
 * @ipcp: ipc permission structure
 * @flag: requested permissions
 *
 * Check the permissions @flag requested for access to the IPC object
 * described by @ipcp.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        int rc;

        rc = call_int_hook(ipc_permission, ipcp, flag);
        return rc;
}

/**
 * security_ipc_getlsmprop() - Get the sysv ipc object LSM data
 * @ipcp: ipc permission structure
 * @prop: pointer to lsm information
 *
 * Get the lsm information associated with the ipc object @ipcp and return it
 * in @prop.
 */

void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop)
{
        /* Initialize @prop before the hook gets a chance to fill it in. */
        lsmprop_init(prop);
        call_void_hook(ipc_getlsmprop, ipcp, prop);
}

/**
 * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob
 * @msg: message structure
 *
 * Allocate a security structure and attach it to the msg->security field.
 * The security field is initialized to NULL when the structure is first
 * created.
 *
 * Return: Return 0 if operation was successful and permission is granted.
 */
int security_msg_msg_alloc(struct msg_msg *msg)
{
        int ret;

        ret = lsm_msg_msg_alloc(msg);
        if (unlikely(ret))
                return ret;

        ret = call_int_hook(msg_msg_alloc_security, msg);
        if (unlikely(ret)) {
                /* A hook failed: release what was attached to @msg. */
                security_msg_msg_free(msg);
                return ret;
        }

        return 0;
}

/**
 * security_msg_msg_free() - Free a sysv ipc message LSM blob
 * @msg: message structure
 *
 * Deallocate the security structure for this message: run the
 * msg_msg_free_security hooks, then free and clear msg->security.
 */
void security_msg_msg_free(struct msg_msg *msg)
{
        /* The hooks run first, while msg->security is still allocated. */
        call_void_hook(msg_msg_free_security, msg);
        kfree(msg->security);
        msg->security = NULL;
}

/**
 * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob
 * @msq: sysv ipc permission structure
 *
 * Allocate a security structure and attach it to @msq.  The security field
 * is initialized to NULL when the structure is first created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_msg_queue_alloc(struct kern_ipc_perm *msq)
{
        int ret;

        ret = lsm_ipc_alloc(msq);
        if (unlikely(ret))
                return ret;

        ret = call_int_hook(msg_queue_alloc_security, msq);
        if (unlikely(ret)) {
                /* A hook failed: release what was attached to @msq. */
                security_msg_queue_free(msq);
                return ret;
        }

        return 0;
}

/**
 * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob
 * @msq: sysv ipc permission structure
 *
 * Deallocate security field @msq->security for the message queue: run the
 * msg_queue_free_security hooks, then free and clear the field.
 */
void security_msg_queue_free(struct kern_ipc_perm *msq)
{
        /* The hooks run first, while msq->security is still allocated. */
        call_void_hook(msg_queue_free_security, msq);
        kfree(msq->security);
        msq->security = NULL;
}

/**
 * security_msg_queue_associate() - Check if a msg queue operation is allowed
 * @msq: sysv ipc permission structure
 * @msqflg: operation flags
 *
 * Called when a message queue is requested through the msgget system call.
 * The hook only runs when the identifier of an existing message queue is
 * returned, not when a new message queue is created.
 *
 * Return: Return 0 if permission is granted.
 */
int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        int rc;

        rc = call_int_hook(msg_queue_associate, msq, msqflg);
        return rc;
}

/**
 * security_msg_queue_msgctl() - Check if a msg queue operation is allowed
 * @msq: sysv ipc permission structure
 * @cmd: operation
 *
 * Called when the message control operation @cmd is about to be performed on
 * the message queue with permissions @msq.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        int rc;

        rc = call_int_hook(msg_queue_msgctl, msq, cmd);
        return rc;
}

/**
 * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed
 * @msq: sysv ipc permission structure
 * @msg: message
 * @msqflg: operation flags
 *
 * Called before the message @msg is enqueued on the message queue with
 * permissions specified in @msq.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgsnd(struct kern_ipc_perm *msq,
                              struct msg_msg *msg, int msqflg)
{
        int rc;

        rc = call_int_hook(msg_queue_msgsnd, msq, msg, msqflg);
        return rc;
}

/**
 * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed
 * @msq: sysv ipc permission structure
 * @msg: message
 * @target: target task
 * @type: type of message requested
 * @mode: operation flags
 *
 * Called before the message @msg is removed from the message queue.  The
 * @target task structure contains a pointer to the process that will be
 * receiving the message (not equal to the current process when inline
 * receives are being performed).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                              struct task_struct *target, long type, int mode)
{
        int rc;

        rc = call_int_hook(msg_queue_msgrcv, msq, msg, target, type, mode);
        return rc;
}

/**
 * security_shm_alloc() - Allocate a sysv shm LSM blob
 * @shp: sysv ipc permission structure
 *
 * Allocate a security structure and attach it to the @shp security field.
 * The security field is initialized to NULL when the structure is first
 * created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_shm_alloc(struct kern_ipc_perm *shp)
{
        int ret;

        ret = lsm_ipc_alloc(shp);
        if (unlikely(ret))
                return ret;

        ret = call_int_hook(shm_alloc_security, shp);
        if (unlikely(ret)) {
                /* A hook failed: release what was attached to @shp. */
                security_shm_free(shp);
                return ret;
        }

        return 0;
}

/**
 * security_shm_free() - Free a sysv shm LSM blob
 * @shp: sysv ipc permission structure
 *
 * Deallocate the security structure @shp->security for the memory segment:
 * run the shm_free_security hooks, then free and clear the field.
 */
void security_shm_free(struct kern_ipc_perm *shp)
{
        /* The hooks run first, while shp->security is still allocated. */
        call_void_hook(shm_free_security, shp);
        kfree(shp->security);
        shp->security = NULL;
}

/**
 * security_shm_associate() - Check if a sysv shm operation is allowed
 * @shp: sysv ipc permission structure
 * @shmflg: operation flags
 *
 * Called when a shared memory region is requested through the shmget system
 * call.  The hook only runs when the identifier of an existing region is
 * returned, not when a new shared memory region is created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        int rc;

        rc = call_int_hook(shm_associate, shp, shmflg);
        return rc;
}

/**
 * security_shm_shmctl() - Check if a sysv shm operation is allowed
 * @shp: sysv ipc permission structure
 * @cmd: operation
 *
 * Called when the shared memory control operation @cmd is about to be
 * performed on the shared memory region with permissions in @shp.
 *
 * Return: Return 0 if permission is granted.
 */
int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        int rc;

        rc = call_int_hook(shm_shmctl, shp, cmd);
        return rc;
}

/**
 * security_shm_shmat() - Check if a sysv shm attach operation is allowed
 * @shp: sysv ipc permission structure
 * @shmaddr: address of memory region to attach
 * @shmflg: operation flags
 *
 * Called before the shmat system call is allowed to attach the shared memory
 * segment with permissions @shp to the data segment of the calling process.
 * The attaching address is specified by @shmaddr.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_shm_shmat(struct kern_ipc_perm *shp,
                       char __user *shmaddr, int shmflg)
{
        int rc;

        rc = call_int_hook(shm_shmat, shp, shmaddr, shmflg);
        return rc;
}

/**
 * security_sem_alloc() - Allocate a sysv semaphore LSM blob
 * @sma: sysv ipc permission structure
 *
 * Allocate a security structure and attach it to the @sma security field.
 * The security field is initialized to NULL when the structure is first
 * created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_sem_alloc(struct kern_ipc_perm *sma)
{
        int ret;

        ret = lsm_ipc_alloc(sma);
        if (unlikely(ret))
                return ret;

        ret = call_int_hook(sem_alloc_security, sma);
        if (unlikely(ret)) {
                /* A hook failed: release what was attached to @sma. */
                security_sem_free(sma);
                return ret;
        }

        return 0;
}

/**
 * security_sem_free() - Free a sysv semaphore LSM blob
 * @sma: sysv ipc permission structure
 *
 * Deallocate security structure @sma->security for the semaphore: run the
 * sem_free_security hooks, then free and clear the field.
 */
void security_sem_free(struct kern_ipc_perm *sma)
{
        /* The hooks run first, while sma->security is still allocated. */
        call_void_hook(sem_free_security, sma);
        kfree(sma->security);
        sma->security = NULL;
}

/**
 * security_sem_associate() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @semflg: operation flags
 *
 * Called when a semaphore is requested through the semget system call.  The
 * hook only runs when the identifier of an existing semaphore is returned,
 * not when a new one must be created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        int rc;

        rc = call_int_hook(sem_associate, sma, semflg);
        return rc;
}

/**
 * security_sem_semctl() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @cmd: operation
 *
 * Called when the semaphore operation @cmd is about to be performed on the
 * semaphore.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        int rc;

        rc = call_int_hook(sem_semctl, sma, cmd);
        return rc;
}

/**
 * security_sem_semop() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @sops: operations to perform
 * @nsops: number of operations
 * @alter: flag indicating changes will be made
 *
 * Called before operations are performed on members of the semaphore set.
 * A nonzero @alter flag means the semaphore set may be modified.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
                       unsigned nsops, int alter)
{
        int rc;

        rc = call_int_hook(sem_semop, sma, sops, nsops, alter);
        return rc;
}

/**
 * security_d_instantiate() - Populate an inode's LSM state based on a dentry
 * @dentry: dentry
 * @inode: inode
 *
 * Fill in @inode security information for a @dentry if allowed.
 */
void security_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (unlikely(inode && IS_PRIVATE(inode)))
                return;
        call_void_hook(d_instantiate, dentry, inode);
}
EXPORT_SYMBOL(security_d_instantiate);

/*
 * Please keep this in sync with its counterpart in security/lsm_syscalls.c
 */

/**
 * security_getselfattr - Read an LSM attribute of the current process.
 * @attr: which attribute to return
 * @uctx: the user-space destination for the information, or NULL
 * @size: pointer to the size of space available to receive the data
 * @flags: special handling options. LSM_FLAG_SINGLE indicates that only
 * attributes associated with the LSM identified in the passed @uctx be
 * reported.
 *
 * A NULL value for @uctx can be used to get both the number of attributes
 * and the size of the data.
 *
 * Returns the number of attributes found on success, negative value
 * on error. @size is reset to the total size of the data.
 * If @size is insufficient to contain the data -E2BIG is returned.
 */
int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
                         u32 __user *size, u32 flags)
{
        struct lsm_static_call *scall;
        struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, };
        u8 __user *base = (u8 __user *)uctx;
        u32 entrysize;
        u32 total = 0;
        u32 left;
        bool toobig = false;
        bool single = false;
        int count = 0;
        int rc;

        if (attr == LSM_ATTR_UNDEF)
                return -EINVAL;
        if (size == NULL)
                return -EINVAL;
        /* left starts out as the caller-supplied buffer size. */
        if (get_user(left, size))
                return -EFAULT;

        if (flags) {
                /*
                 * Only flag supported is LSM_FLAG_SINGLE
                 */
                if (flags != LSM_FLAG_SINGLE || !uctx)
                        return -EINVAL;
                /* In single mode the LSM to query is named by the id in @uctx. */
                if (copy_from_user(&lctx, uctx, sizeof(lctx)))
                        return -EFAULT;
                /*
                 * If the LSM ID isn't specified it is an error.
                 */
                if (lctx.id == LSM_ID_UNDEF)
                        return -EINVAL;
                single = true;
        }

        /*
         * In the usual case gather all the data from the LSMs.
         * In the single case only get the data from the LSM specified.
         */
        lsm_for_each_hook(scall, getselfattr) {
                if (single && lctx.id != scall->hl->lsmid->id)
                        continue;
                /* Offer the hook whatever space is still left. */
                entrysize = left;
                /* Each entry lands right after the ones accumulated so far. */
                if (base)
                        uctx = (struct lsm_ctx __user *)(base + total);
                rc = scall->hl->hook.getselfattr(attr, uctx, &entrysize, flags);
                /* -EOPNOTSUPP from a hook is not an error: skip that hook. */
                if (rc == -EOPNOTSUPP)
                        continue;
                if (rc == -E2BIG) {
                        /*
                         * Out of space: note it and leave no room for later
                         * hooks, but keep iterating so total keeps growing.
                         * NOTE(review): presumably the hook stores the size
                         * it needs in entrysize -- confirm against the hooks.
                         */
                        rc = 0;
                        left = 0;
                        toobig = true;
                } else if (rc < 0)
                        return rc;
                else
                        left -= entrysize;

                total += entrysize;
                count += rc;
                if (single)
                        break;
        }
        /* Report the accumulated size back through @size. */
        if (put_user(total, size))
                return -EFAULT;
        if (toobig)
                return -E2BIG;
        if (count == 0)
                return LSM_RET_DEFAULT(getselfattr);
        return count;
}

/*
 * Please keep this in sync with its counterpart in security/lsm_syscalls.c
 */

/**
 * security_setselfattr - Set an LSM attribute on the current process.
 * @attr: which attribute to set
 * @uctx: the user-space source for the information
 * @size: the size of the data
 * @flags: reserved for future use, must be 0
 *
 * Set an LSM attribute for the current process. The LSM, attribute
 * and new value are included in @uctx.
 *
 * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT
 * if the user buffer is inaccessible, -E2BIG if size is too big, or an
 * LSM specific failure.
 */
int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
                         u32 size, u32 flags)
{
        struct lsm_static_call *scall;
        struct lsm_ctx *lctx;
        u64 required_len;
        int rc;

        if (flags || size < sizeof(*lctx))
                return -EINVAL;
        if (size > PAGE_SIZE)
                return -E2BIG;

        lctx = memdup_user(uctx, size);
        if (IS_ERR(lctx))
                return PTR_ERR(lctx);

        /* Every consistency check below fails with -EINVAL. */
        rc = -EINVAL;
        if (size < lctx->len)
                goto free_out;
        if (check_add_overflow(sizeof(*lctx), lctx->ctx_len, &required_len))
                goto free_out;
        if (lctx->len < required_len)
                goto free_out;

        /* Only the LSM whose id matches the one in the context is called. */
        rc = LSM_RET_DEFAULT(setselfattr);
        lsm_for_each_hook(scall, setselfattr) {
                if (scall->hl->lsmid->id != lctx->id)
                        continue;
                rc = scall->hl->hook.setselfattr(attr, lctx, size, flags);
                break;
        }

free_out:
        kfree(lctx);
        return rc;
}

/**
 * security_getprocattr() - Read an attribute for a task
 * @p: the task
 * @lsmid: LSM identification
 * @name: attribute name
 * @value: attribute value
 *
 * Read attribute @name for task @p and store it into @value if allowed.
 * When @lsmid is 0 the first active getprocattr hook is used; otherwise
 * only the hook belonging to the LSM with a matching ID is called.
 *
 * Return: Returns the length of @value on success, a negative value otherwise.
 */
int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
                         char **value)
{
        struct lsm_static_call *scall;

        lsm_for_each_hook(scall, getprocattr) {
                /* Take the first hook that satisfies the LSM ID filter. */
                if (lsmid == 0 || lsmid == scall->hl->lsmid->id)
                        return scall->hl->hook.getprocattr(p, name, value);
        }

        /* No matching hook was found. */
        return LSM_RET_DEFAULT(getprocattr);
}

/**
 * security_setprocattr() - Set an attribute for a task
 * @lsmid: LSM identification
 * @name: attribute name
 * @value: attribute value
 * @size: attribute value size
 *
 * Write (set) the current task's attribute @name to @value, size @size if
 * allowed.  An @lsmid of 0 selects the first active setprocattr hook; any
 * other value restricts the call to the LSM with that ID.
 *
 * Return: Returns bytes written on success, a negative value otherwise.
 */
int security_setprocattr(int lsmid, const char *name, void *value, size_t size)
{
        struct lsm_static_call *scall;

        lsm_for_each_hook(scall, setprocattr) {
                if (!lsmid || lsmid == scall->hl->lsmid->id)
                        return scall->hl->hook.setprocattr(name, value, size);
        }

        /* Nothing matched the requested LSM ID. */
        return LSM_RET_DEFAULT(setprocattr);
}

/**
 * security_ismaclabel() - Check if the named attribute is a MAC label
 * @name: full extended attribute name
 *
 * Check if the extended attribute specified by @name represents a MAC label.
 * The decision is delegated to the LSM ismaclabel hook via call_int_hook().
 *
 * Return: Returns 1 if name is a MAC attribute otherwise returns 0.
 */
int security_ismaclabel(const char *name)
{
        return call_int_hook(ismaclabel, name);
}
EXPORT_SYMBOL(security_ismaclabel);

/**
 * security_secid_to_secctx() - Convert a secid to a secctx
 * @secid: secid
 * @cp: the LSM context
 *
 * Convert secid to security context.  If @cp is NULL the length of the
 * result will be returned, but no data will be returned.  This
 * does mean that the length could change between calls to check the length and
 * the next call which actually allocates and returns the data.
 *
 * The conversion is carried out by the LSM secid_to_secctx hook, reached
 * through call_int_hook().
 *
 * Return: Return length of data on success, error on failure.
 */
int security_secid_to_secctx(u32 secid, struct lsm_context *cp)
{
        return call_int_hook(secid_to_secctx, secid, cp);
}
EXPORT_SYMBOL(security_secid_to_secctx);

/**
 * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx
 * @prop: lsm specific information
 * @cp: the LSM context
 * @lsmid: which security module to report
 *
 * Convert a @prop entry to security context.  If @cp is NULL the
 * length of the result will be returned. This does mean that the
 * length could change between calls to check the length and the
 * next call which actually allocates and returns the @cp.
 *
 * @lsmid identifies which LSM should supply the context.
 * A value of LSM_ID_UNDEF indicates that the first LSM supplying
 * the hook should be used. This is used in cases where the
 * ID of the supplying LSM is unambiguous.
 *
 * Return: Return length of data on success, error on failure.
 */
int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp,
                               int lsmid)
{
        struct lsm_static_call *scall;

        lsm_for_each_hook(scall, lsmprop_to_secctx) {
                /* Accept any LSM for LSM_ID_UNDEF, else require an ID match. */
                if (lsmid == LSM_ID_UNDEF || lsmid == scall->hl->lsmid->id)
                        return scall->hl->hook.lsmprop_to_secctx(prop, cp);
        }

        /* No hook satisfied the request. */
        return LSM_RET_DEFAULT(lsmprop_to_secctx);
}
EXPORT_SYMBOL(security_lsmprop_to_secctx);

/**
 * security_secctx_to_secid() - Convert a secctx to a secid
 * @secdata: secctx
 * @seclen: length of secctx
 * @secid: secid
 *
 * Convert security context to secid.  @secid is set to 0 before the
 * secctx_to_secid LSM hook is called.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        /* Clear the result before any hook gets to run. */
        *secid = 0;
        rc = call_int_hook(secctx_to_secid, secdata, seclen, secid);

        return rc;
}
EXPORT_SYMBOL(security_secctx_to_secid);

/**
 * security_release_secctx() - Free a secctx buffer
 * @cp: the security context
 *
 * Release the security context.  The release_secctx LSM hook is called
 * first, after which the whole of @cp is zeroed.
 */
void security_release_secctx(struct lsm_context *cp)
{
        call_void_hook(release_secctx, cp);
        /* Wipe the structure once the hook has run. */
        memset(cp, 0, sizeof(*cp));
}
EXPORT_SYMBOL(security_release_secctx);

/**
 * security_inode_invalidate_secctx() - Invalidate an inode's security label
 * @inode: inode
 *
 * Notify the security module that it must revalidate the security context of
 * an inode.  The notification is delivered through the
 * inode_invalidate_secctx LSM hook.
 */
void security_inode_invalidate_secctx(struct inode *inode)
{
        call_void_hook(inode_invalidate_secctx, inode);
}
EXPORT_SYMBOL(security_inode_invalidate_secctx);

/**
 * security_inode_notifysecctx() - Notify the LSM of an inode's security label
 * @inode: inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * Notify the security module of what the security context of an inode should
 * be.  Initializes the incore security context managed by the security module
 * for this inode.  Example usage: NFS client invokes this hook to initialize
 * the security context in its incore inode to the value provided by the server
 * for the file when the server returned the file's attributes to the client.
 * Must be called with inode->i_mutex locked.
 *
 * Dispatches to the LSM inode_notifysecctx hook via call_int_hook().
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        return call_int_hook(inode_notifysecctx, inode, ctx, ctxlen);
}
EXPORT_SYMBOL(security_inode_notifysecctx);

/**
 * security_inode_setsecctx() - Change the security label of an inode
 * @dentry: dentry of the inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * Change the security context of an inode.  Updates the incore security
 * context managed by the security module and invokes the fs code as needed
 * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the
 * context.  Example usage: NFS server invokes this hook to change the security
 * context in its incore inode and on the backing filesystem to a value
 * provided by the client on a SETATTR operation.  Must be called with
 * inode->i_mutex locked.
 *
 * Dispatches to the LSM inode_setsecctx hook via call_int_hook().
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        return call_int_hook(inode_setsecctx, dentry, ctx, ctxlen);
}
EXPORT_SYMBOL(security_inode_setsecctx);

/**
 * security_inode_getsecctx() - Get the security label of an inode
 * @inode: inode
 * @cp: security context
 *
 * @cp is zeroed and then passed, together with @inode, to the
 * inode_getsecctx LSM hook.  On success, returns 0 and @cp holds the
 * security context for the given @inode.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_getsecctx(struct inode *inode, struct lsm_context *cp)
{
        int rc;

        /* Give the hook a cleared context to fill in. */
        memset(cp, 0, sizeof(*cp));
        rc = call_int_hook(inode_getsecctx, inode, cp);

        return rc;
}
EXPORT_SYMBOL(security_inode_getsecctx);

#ifdef CONFIG_WATCH_QUEUE
/**
 * security_post_notification() - Check if a watch notification can be posted
 * @w_cred: credentials of the task that set the watch
 * @cred: credentials of the task which triggered the watch
 * @n: the notification
 *
 * Check to see if a watch notification can be posted to a particular queue.
 * The check is performed by the LSM post_notification hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_post_notification(const struct cred *w_cred,
                               const struct cred *cred,
                               struct watch_notification *n)
{
        return call_int_hook(post_notification, w_cred, cred, n);
}
#endif /* CONFIG_WATCH_QUEUE */

#ifdef CONFIG_KEY_NOTIFICATIONS
/**
 * security_watch_key() - Check if a task is allowed to watch for key events
 * @key: the key to watch
 *
 * Check to see if a process is allowed to watch for event notifications from
 * a key or keyring.  The check is performed by the LSM watch_key hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_watch_key(struct key *key)
{
        return call_int_hook(watch_key, key);
}
#endif /* CONFIG_KEY_NOTIFICATIONS */

#ifdef CONFIG_SECURITY_NETWORK
/**
 * security_netlink_send() - Save info and check if netlink sending is allowed
 * @sk: sending socket
 * @skb: netlink message
 *
 * Save security information for a netlink message so that permission checking
 * can be performed when the message is processed.  The security information
 * can be saved using the eff_cap field of the netlink_skb_parms structure.
 * Also may be used to provide fine grained control over message transmission.
 *
 * Dispatches to the LSM netlink_send hook via call_int_hook().
 *
 * Return: Returns 0 if the information was successfully saved and message is
 *         allowed to be transmitted.
 */
int security_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        return call_int_hook(netlink_send, sk, skb);
}

/**
 * security_unix_stream_connect() - Check if a AF_UNIX stream is allowed
 * @sock: originating sock
 * @other: peer sock
 * @newsk: new sock
 *
 * Check permissions before establishing a Unix domain stream connection
 * between @sock and @other.
 *
 * The @unix_stream_connect and @unix_may_send hooks were necessary because
 * Linux provides an alternative to the conventional file name space for Unix
 * domain sockets.  Whereas binding and connecting to sockets in the file name
 * space is mediated by the typical file permissions (and caught by the mknod
 * and permission hooks in inode_security_ops), binding and connecting to
 * sockets in the abstract name space is completely unmediated.  Sufficient
 * control of Unix domain sockets in the abstract name space isn't possible
 * using only the socket layer hooks, since we need to know the actual target
 * socket, which is not looked up until we are inside the af_unix code.
 *
 * Dispatches to the LSM unix_stream_connect hook via call_int_hook().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_unix_stream_connect(struct sock *sock, struct sock *other,
                                 struct sock *newsk)
{
        return call_int_hook(unix_stream_connect, sock, other, newsk);
}
EXPORT_SYMBOL(security_unix_stream_connect);

/**
 * security_unix_may_send() - Check if AF_UNIX socket can send datagrams
 * @sock: originating socket
 * @other: peer socket
 *
 * Check permissions before connecting or sending datagrams from @sock to
 * @other.
 *
 * The @unix_stream_connect and @unix_may_send hooks were necessary because
 * Linux provides an alternative to the conventional file name space for Unix
 * domain sockets.  Whereas binding and connecting to sockets in the file name
 * space is mediated by the typical file permissions (and caught by the mknod
 * and permission hooks in inode_security_ops), binding and connecting to
 * sockets in the abstract name space is completely unmediated.  Sufficient
 * control of Unix domain sockets in the abstract name space isn't possible
 * using only the socket layer hooks, since we need to know the actual target
 * socket, which is not looked up until we are inside the af_unix code.
 *
 * Dispatches to the LSM unix_may_send hook via call_int_hook().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_unix_may_send(struct socket *sock,  struct socket *other)
{
        return call_int_hook(unix_may_send, sock, other);
}
EXPORT_SYMBOL(security_unix_may_send);

/**
 * security_socket_create() - Check if creating a new socket is allowed
 * @family: protocol family
 * @type: communications type
 * @protocol: requested protocol
 * @kern: set to 1 if a kernel socket is requested
 *
 * Check permissions prior to creating a new socket.  The check is
 * performed by the LSM socket_create hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_create(int family, int type, int protocol, int kern)
{
        return call_int_hook(socket_create, family, type, protocol, kern);
}

/**
 * security_socket_post_create() - Initialize a newly created socket
 * @sock: socket
 * @family: protocol family
 * @type: communications type
 * @protocol: requested protocol
 * @kern: set to 1 if a kernel socket is requested
 *
 * This hook allows a module to update or allocate a per-socket security
 * structure. Note that the security field was not added directly to the socket
 * structure, but rather, the socket security information is stored in the
 * associated inode.  Typically, the inode alloc_security hook will allocate
 * and attach security information to SOCK_INODE(sock)->i_security.  This hook
 * may be used to update the SOCK_INODE(sock)->i_security field with additional
 * information that wasn't available when the inode was allocated.
 *
 * Dispatches to the LSM socket_post_create hook via call_int_hook().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_post_create(struct socket *sock, int family,
                                int type, int protocol, int kern)
{
        return call_int_hook(socket_post_create, sock, family, type,
                             protocol, kern);
}

/**
 * security_socket_socketpair() - Check if creating a socketpair is allowed
 * @socka: first socket
 * @sockb: second socket
 *
 * Check permissions before creating a fresh pair of sockets.  The check is
 * performed by the LSM socket_socketpair hook.
 *
 * Return: Returns 0 if permission is granted and the connection was
 *         established.
 */
int security_socket_socketpair(struct socket *socka, struct socket *sockb)
{
        return call_int_hook(socket_socketpair, socka, sockb);
}
EXPORT_SYMBOL(security_socket_socketpair);

/**
 * security_socket_bind() - Check if a socket bind operation is allowed
 * @sock: socket
 * @address: requested bind address
 * @addrlen: length of address
 *
 * Check permission before socket protocol layer bind operation is performed
 * and the socket @sock is bound to the address specified in the @address
 * parameter.  The check is performed by the LSM socket_bind hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_bind(struct socket *sock,
                         struct sockaddr *address, int addrlen)
{
        return call_int_hook(socket_bind, sock, address, addrlen);
}

/**
 * security_socket_connect() - Check if a socket connect operation is allowed
 * @sock: socket
 * @address: address of remote connection point
 * @addrlen: length of address
 *
 * Check permission before socket protocol layer connect operation attempts to
 * connect socket @sock to a remote address, @address.  The check is performed
 * by the LSM socket_connect hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_connect(struct socket *sock,
                            struct sockaddr *address, int addrlen)
{
        return call_int_hook(socket_connect, sock, address, addrlen);
}

/**
 * security_socket_listen() - Check if a socket is allowed to listen
 * @sock: socket
 * @backlog: connection queue size
 *
 * Check permission before socket protocol layer listen operation.  The
 * check is performed by the LSM socket_listen hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_listen(struct socket *sock, int backlog)
{
        return call_int_hook(socket_listen, sock, backlog);
}

/**
 * security_socket_accept() - Check if a socket is allowed to accept connections
 * @sock: listening socket
 * @newsock: newly created connection socket
 *
 * Check permission before accepting a new connection.  Note that the new
 * socket, @newsock, has been created and some information copied to it, but
 * the accept operation has not actually been performed.
 *
 * Dispatches to the LSM socket_accept hook via call_int_hook().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_accept(struct socket *sock, struct socket *newsock)
{
        return call_int_hook(socket_accept, sock, newsock);
}

/**
 * security_socket_sendmsg() - Check if sending a message is allowed
 * @sock: sending socket
 * @msg: message to send
 * @size: size of message
 *
 * Check permission before transmitting a message to another socket.  The
 * check is performed by the LSM socket_sendmsg hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
        return call_int_hook(socket_sendmsg, sock, msg, size);
}

/**
 * security_socket_recvmsg() - Check if receiving a message is allowed
 * @sock: receiving socket
 * @msg: message to receive
 * @size: size of message
 * @flags: operational flags
 *
 * Check permission before receiving a message from a socket.  The check is
 * performed by the LSM socket_recvmsg hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                            int size, int flags)
{
        return call_int_hook(socket_recvmsg, sock, msg, size, flags);
}

/**
 * security_socket_getsockname() - Check if reading the socket addr is allowed
 * @sock: socket
 *
 * Check permission before reading the local address (name) of the socket
 * object.  The check is performed by the LSM socket_getsockname hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockname(struct socket *sock)
{
        return call_int_hook(socket_getsockname, sock);
}

/**
 * security_socket_getpeername() - Check if reading the peer's addr is allowed
 * @sock: socket
 *
 * Check permission before reading the remote address (name) of a socket
 * object.  The check is performed by the LSM socket_getpeername hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getpeername(struct socket *sock)
{
        return call_int_hook(socket_getpeername, sock);
}

/**
 * security_socket_getsockopt() - Check if reading a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Check permissions before retrieving the options associated with socket
 * @sock.  The check is performed by the LSM socket_getsockopt hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockopt(struct socket *sock, int level, int optname)
{
        return call_int_hook(socket_getsockopt, sock, level, optname);
}

/**
 * security_socket_setsockopt() - Check if setting a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Check permissions before setting the options associated with socket @sock.
 * The check is performed by the LSM socket_setsockopt hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_setsockopt(struct socket *sock, int level, int optname)
{
        return call_int_hook(socket_setsockopt, sock, level, optname);
}

/**
 * security_socket_shutdown() - Checks if shutting down the socket is allowed
 * @sock: socket
 * @how: flag indicating how sends and receives are handled
 *
 * Checks permission before all or part of a connection on the socket @sock is
 * shut down.  The check is performed by the LSM socket_shutdown hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_shutdown(struct socket *sock, int how)
{
        return call_int_hook(socket_shutdown, sock, how);
}

/**
 * security_sock_rcv_skb() - Check if an incoming network packet is allowed
 * @sk: destination sock
 * @skb: incoming packet
 *
 * Check permissions on incoming network packets.  This hook is distinct from
 * Netfilter's IP input hooks since it is the first time that the incoming
 * sk_buff @skb has been associated with a particular socket, @sk.  Must not
 * sleep inside this hook because some callers hold spinlocks.
 *
 * Note that the LSM hook invoked here is named socket_sock_rcv_skb, which
 * differs from the name of this wrapper.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        return call_int_hook(socket_sock_rcv_skb, sk, skb);
}
EXPORT_SYMBOL(security_sock_rcv_skb);

/**
 * security_socket_getpeersec_stream() - Get the remote peer label
 * @sock: socket
 * @optval: destination buffer
 * @optlen: size of peer label copied into the buffer
 * @len: maximum size of the destination buffer
 *
 * This hook allows the security module to provide peer socket security state
 * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC.
 * For tcp sockets this can be meaningful if the socket is associated with an
 * ipsec SA.
 *
 * Dispatches to the LSM socket_getpeersec_stream hook via call_int_hook().
 *
 * Return: Returns 0 if all is well, otherwise, typical getsockopt return
 *         values.
 */
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
                                      sockptr_t optlen, unsigned int len)
{
        return call_int_hook(socket_getpeersec_stream, sock, optval, optlen,
                             len);
}

/**
 * security_socket_getpeersec_dgram() - Get the remote peer label
 * @sock: socket
 * @skb: datagram packet
 * @secid: remote peer label secid
 *
 * This hook allows the security module to provide peer socket security state
 * for udp sockets on a per-packet basis to userspace via getsockopt
 * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC
 * option via getsockopt. It can then retrieve the security state returned by
 * this hook for a packet via the SCM_SECURITY ancillary message type.
 *
 * Dispatches to the LSM socket_getpeersec_dgram hook via call_int_hook().
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_socket_getpeersec_dgram(struct socket *sock,
                                     struct sk_buff *skb, u32 *secid)
{
        return call_int_hook(socket_getpeersec_dgram, sock, skb, secid);
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);

/**
 * lsm_sock_alloc - allocate a composite sock blob
 * @sock: the sock that needs a blob
 * @gfp: allocation mode
 *
 * Allocate the sock blob for all the modules.  The blob is sized by
 * blob_sizes.lbs_sock and its address is stored in @sock->sk_security by
 * lsm_blob_alloc().
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_sock_alloc(struct sock *sock, gfp_t gfp)
{
        return lsm_blob_alloc(&sock->sk_security, blob_sizes.lbs_sock, gfp);
}

/**
 * security_sk_alloc() - Allocate and initialize a sock's LSM blob
 * @sk: sock
 * @family: protocol family
 * @priority: gfp flags
 *
 * Allocate and attach a security structure to the sk->sk_security field, which
 * is used to copy security attributes between local stream sockets.  The blob
 * is allocated first and the sk_alloc_security LSM hook is called afterwards;
 * if the hook fails, security_sk_free() is used to undo the allocation.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
{
        int err;

        /* Allocate the composite blob; nothing to undo if this fails. */
        err = lsm_sock_alloc(sk, priority);
        if (unlikely(err))
                return err;

        err = call_int_hook(sk_alloc_security, sk, family, priority);
        if (unlikely(err)) {
                /* Release everything again when the hook reports an error. */
                security_sk_free(sk);
        }

        return err;
}

/**
 * security_sk_free() - Free the sock's LSM blob
 * @sk: sock
 *
 * Deallocate security structure.  The sk_free_security LSM hook is called
 * before the blob is freed, and sk->sk_security is reset to NULL afterwards.
 */
void security_sk_free(struct sock *sk)
{
        /* Run the hook while the blob is still allocated. */
        call_void_hook(sk_free_security, sk);
        kfree(sk->sk_security);
        /* Do not leave a pointer to freed memory behind. */
        sk->sk_security = NULL;
}

/**
 * security_sk_clone() - Clone a sock's LSM state
 * @sk: original sock
 * @newsk: target sock
 *
 * Clone/copy security structure.  The work is done by the LSM
 * sk_clone_security hook.
 */
void security_sk_clone(const struct sock *sk, struct sock *newsk)
{
        call_void_hook(sk_clone_security, sk, newsk);
}
EXPORT_SYMBOL(security_sk_clone);

/**
 * security_sk_classify_flow() - Set a flow's secid based on socket
 * @sk: original socket
 * @flic: target flow
 *
 * Set the target flow's secid to socket's secid.  The LSM sk_getsecid hook
 * is handed @sk together with a pointer to @flic->flowic_secid.
 */
void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic)
{
        call_void_hook(sk_getsecid, sk, &flic->flowic_secid);
}
EXPORT_SYMBOL(security_sk_classify_flow);

/**
 * security_req_classify_flow() - Set a flow's secid based on request_sock
 * @req: request_sock
 * @flic: target flow
 *
 * Sets @flic's secid to @req's secid.  The work is done by the LSM
 * req_classify_flow hook.
 */
void security_req_classify_flow(const struct request_sock *req,
                                struct flowi_common *flic)
{
        call_void_hook(req_classify_flow, req, flic);
}
EXPORT_SYMBOL(security_req_classify_flow);

/**
 * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket
 * @sk: sock being grafted
 * @parent: target parent socket
 *
 * Sets @parent's inode secid to @sk's secid and update @sk with any necessary
 * LSM state from @parent.  The work is done by the LSM sock_graft hook.
 */
void security_sock_graft(struct sock *sk, struct socket *parent)
{
        call_void_hook(sock_graft, sk, parent);
}
EXPORT_SYMBOL(security_sock_graft);

/**
 * security_inet_conn_request() - Set request_sock state using incoming connect
 * @sk: parent listening sock
 * @skb: incoming connection
 * @req: new request_sock
 *
 * Initialize the @req LSM state based on @sk and the incoming connect in @skb.
 * Dispatches to the LSM inet_conn_request hook via call_int_hook().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inet_conn_request(const struct sock *sk,
                               struct sk_buff *skb, struct request_sock *req)
{
        return call_int_hook(inet_conn_request, sk, skb, req);
}
EXPORT_SYMBOL(security_inet_conn_request);

/**
 * security_inet_csk_clone() - Set new sock LSM state based on request_sock
 * @newsk: new sock
 * @req: connection request_sock
 *
 * Set the LSM state of @newsk using the LSM state from @req.  The work is
 * done by the LSM inet_csk_clone hook.
 */
void security_inet_csk_clone(struct sock *newsk,
                             const struct request_sock *req)
{
        call_void_hook(inet_csk_clone, newsk, req);
}

/**
 * security_inet_conn_established() - Update sock's LSM state with connection
 * @sk: sock
 * @skb: connection packet
 *
 * Update @sk's LSM state to represent a new connection from @skb.  The work
 * is done by the LSM inet_conn_established hook.
 */
void security_inet_conn_established(struct sock *sk,
                                    struct sk_buff *skb)
{
        call_void_hook(inet_conn_established, sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_established);

/**
 * security_secmark_relabel_packet() - Check if setting a secmark is allowed
 * @secid: new secmark value
 *
 * Check if the process should be allowed to relabel packets to @secid.  The
 * check is performed by the LSM secmark_relabel_packet hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_secmark_relabel_packet(u32 secid)
{
        return call_int_hook(secmark_relabel_packet, secid);
}
EXPORT_SYMBOL(security_secmark_relabel_packet);

/**
 * security_secmark_refcount_inc() - Increment the secmark labeling rule count
 *
 * Tells the LSM to increment the number of secmark labeling rules loaded.
 * The request is passed on through the secmark_refcount_inc LSM hook.
 */
void security_secmark_refcount_inc(void)
{
        call_void_hook(secmark_refcount_inc);
}
EXPORT_SYMBOL(security_secmark_refcount_inc);

/**
 * security_secmark_refcount_dec() - Decrement the secmark labeling rule count
 *
 * Tells the LSM to decrement the number of secmark labeling rules loaded.
 * The request is passed on through the secmark_refcount_dec LSM hook.
 */
void security_secmark_refcount_dec(void)
{
        call_void_hook(secmark_refcount_dec);
}
EXPORT_SYMBOL(security_secmark_refcount_dec);

/**
 * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device
 * @security: pointer to the LSM blob
 *
 * This hook allows a module to allocate a security structure for a TUN device,
 * returning the pointer in @security.  The blob is allocated with
 * lsm_blob_alloc() and then passed to the tun_dev_alloc_security LSM hook;
 * if the hook fails the blob is freed and @security is reset to NULL.
 *
 * Return: Returns a zero on success, negative values on failure.
 */
int security_tun_dev_alloc_security(void **security)
{
        int err = lsm_blob_alloc(security, blob_sizes.lbs_tun_dev, GFP_KERNEL);

        if (err)
                return err;

        err = call_int_hook(tun_dev_alloc_security, *security);
        if (!err)
                return 0;

        /* The hook failed: hand back no blob at all. */
        kfree(*security);
        *security = NULL;
        return err;
}
EXPORT_SYMBOL(security_tun_dev_alloc_security);

/**
 * security_tun_dev_free_security() - Free a TUN device LSM blob
 * @security: LSM blob
 *
 * This hook allows a module to free the security structure for a TUN device.
 * The blob is released with kfree(); no LSM hook is called from here.
 */
void security_tun_dev_free_security(void *security)
{
        kfree(security);
}
EXPORT_SYMBOL(security_tun_dev_free_security);

/**
 * security_tun_dev_create() - Check if creating a TUN device is allowed
 *
 * Check permissions prior to creating a new TUN device.  The check is
 * performed by the LSM tun_dev_create hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_create(void)
{
        return call_int_hook(tun_dev_create);
}
EXPORT_SYMBOL(security_tun_dev_create);

/**
 * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed
 * @security: TUN device LSM blob
 *
 * Check permissions prior to attaching to a TUN device queue.  The check is
 * performed by the LSM tun_dev_attach_queue hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach_queue(void *security)
{
        return call_int_hook(tun_dev_attach_queue, security);
}
EXPORT_SYMBOL(security_tun_dev_attach_queue);

/**
 * security_tun_dev_attach() - Update TUN device LSM state on attach
 * @sk: associated sock
 * @security: TUN device LSM blob
 *
 * This hook can be used by the module to update any security state associated
 * with the TUN device's sock structure.
 *
 * Dispatches to the LSM tun_dev_attach hook via call_int_hook().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach(struct sock *sk, void *security)
{
        return call_int_hook(tun_dev_attach, sk, security);
}
EXPORT_SYMBOL(security_tun_dev_attach);

/**
 * security_tun_dev_open() - Update TUN device LSM state on open
 * @security: TUN device LSM blob
 *
 * This hook can be used by the module to update any security state associated
 * with the TUN device's security structure.
 *
 * Dispatches to the LSM tun_dev_open hook via call_int_hook().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_open(void *security)
{
        return call_int_hook(tun_dev_open, security);
}
EXPORT_SYMBOL(security_tun_dev_open);

/**
 * security_sctp_assoc_request() - Update the LSM on a SCTP association req
 * @asoc: SCTP association
 * @skb: packet requesting the association
 *
 * Passes the @asoc and @skb of the association INIT packet to the LSM,
 * using the sctp_assoc_request hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_assoc_request(struct sctp_association *asoc,
                                struct sk_buff *skb)
{
        return call_int_hook(sctp_assoc_request, asoc, skb);
}
EXPORT_SYMBOL(security_sctp_assoc_request);

/**
 * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option
 * @sk: socket
 * @optname: SCTP option to validate
 * @address: list of IP addresses to validate
 * @addrlen: length of the address list
 *
 * Validate permissions required for each address associated with sock @sk.
 * Depending on @optname, the addresses will be treated as either a connect or
 * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using
 * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6).
 *
 * Dispatches to the LSM sctp_bind_connect hook via call_int_hook().
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_bind_connect(struct sock *sk, int optname,
                               struct sockaddr *address, int addrlen)
{
        return call_int_hook(sctp_bind_connect, sk, optname, address, addrlen);
}
EXPORT_SYMBOL(security_sctp_bind_connect);

/**
 * security_sctp_sk_clone() - Clone a SCTP sock's LSM state
 * @asoc: SCTP association
 * @sk: original sock
 * @newsk: target sock
 *
 * Called whenever a new socket is created by accept(2) (i.e. a TCP style
 * socket) or when a socket is 'peeled off' e.g userspace calls
 * sctp_peeloff(3).  The work is done by the LSM sctp_sk_clone hook.
 */
void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
                            struct sock *newsk)
{
        call_void_hook(sctp_sk_clone, asoc, sk, newsk);
}
EXPORT_SYMBOL(security_sctp_sk_clone);

/**
 * security_sctp_assoc_established() - Update LSM state when assoc established
 * @asoc: SCTP association
 * @skb: packet establishing the association
 *
 * Passes the @asoc and @skb of the association COOKIE_ACK packet to the
 * security module, using the sctp_assoc_established hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sctp_assoc_established(struct sctp_association *asoc,
                                    struct sk_buff *skb)
{
        return call_int_hook(sctp_assoc_established, asoc, skb);
}
EXPORT_SYMBOL(security_sctp_assoc_established);

/**
 * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket
 * @sk: the owning MPTCP socket
 * @ssk: the new subflow
 *
 * Update the labeling for the given MPTCP subflow, to match the one of the
 * owning MPTCP socket. This hook has to be called after the socket creation and
 * initialization via the security_socket_create() and
 * security_socket_post_create() LSM hooks.
 *
 * Dispatches to the LSM mptcp_add_subflow hook via call_int_hook().
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk)
{
        return call_int_hook(mptcp_add_subflow, sk, ssk);
}

#endif        /* CONFIG_SECURITY_NETWORK */

#ifdef CONFIG_SECURITY_INFINIBAND
/**
 * security_ib_pkey_access() - Check if access to an IB pkey is allowed
 * @sec: LSM blob
 * @subnet_prefix: subnet prefix of the port
 * @pkey: IB pkey
 *
 * Permission check for accessing a pkey while a QP is being modified.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
{
        int rc;

        rc = call_int_hook(ib_pkey_access, sec, subnet_prefix, pkey);
        return rc;
}
EXPORT_SYMBOL(security_ib_pkey_access);

/**
 * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed
 * @sec: LSM blob
 * @dev_name: IB device name
 * @port_num: port number
 *
 * Permission check for sending and receiving SMPs on an end port.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_endport_manage_subnet(void *sec,
                                      const char *dev_name, u8 port_num)
{
        int rc;

        rc = call_int_hook(ib_endport_manage_subnet, sec, dev_name, port_num);
        return rc;
}
EXPORT_SYMBOL(security_ib_endport_manage_subnet);

/**
 * security_ib_alloc_security() - Allocate an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Allocate a security structure for Infiniband objects.  The blob is first
 * obtained with lsm_blob_alloc() and then offered to the ib_alloc_security
 * hook; if the hook fails, the blob is freed again and *@sec is set to NULL.
 *
 * Return: Returns 0 on success, non-zero on failure.
 */
int security_ib_alloc_security(void **sec)
{
        int err = lsm_blob_alloc(sec, blob_sizes.lbs_ib, GFP_KERNEL);

        if (err)
                return err;

        err = call_int_hook(ib_alloc_security, *sec);
        if (!err)
                return 0;

        /* The hook reported an error: give the fresh blob back. */
        kfree(*sec);
        *sec = NULL;
        return err;
}
EXPORT_SYMBOL(security_ib_alloc_security);

/**
 * security_ib_free_security() - Free an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Deallocate an Infiniband security structure by handing @sec to kfree().
 * No LSM hook is invoked from this function.
 */
void security_ib_free_security(void *sec)
{
        kfree(sec);
}
EXPORT_SYMBOL(security_ib_free_security);
#endif        /* CONFIG_SECURITY_INFINIBAND */

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/**
 * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob
 * @ctxp: xfrm security context being added to the SPD
 * @sec_ctx: security label provided by userspace
 * @gfp: gfp flags
 *
 * Allocate a security structure for the xp->security field; that field is
 * initialized to NULL at the time the xfrm_policy is allocated.
 *
 * Return:  Return 0 if operation was successful.
 */
int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                               struct xfrm_user_sec_ctx *sec_ctx,
                               gfp_t gfp)
{
        int rc;

        rc = call_int_hook(xfrm_policy_alloc_security, ctxp, sec_ctx, gfp);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_policy_alloc);

/**
 * security_xfrm_policy_clone() - Clone xfrm policy LSM state
 * @old_ctx: xfrm security context
 * @new_ctxp: target xfrm security context
 *
 * Allocate a security structure in @new_ctxp holding the information found
 * in the @old_ctx structure.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                               struct xfrm_sec_ctx **new_ctxp)
{
        int rc;

        rc = call_int_hook(xfrm_policy_clone_security, old_ctx, new_ctxp);
        return rc;
}

/**
 * security_xfrm_policy_free() - Free a xfrm security context
 * @ctx: xfrm security context
 *
 * Free LSM resources associated with @ctx by invoking the
 * xfrm_policy_free_security hook.
 */
void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
        call_void_hook(xfrm_policy_free_security, ctx);
}
EXPORT_SYMBOL(security_xfrm_policy_free);

/**
 * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed
 * @ctx: xfrm security context
 *
 * Authorization check for the deletion of a SPD entry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
        int rc;

        rc = call_int_hook(xfrm_policy_delete_security, ctx);
        return rc;
}

/**
 * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @sec_ctx: security label provided by userspace
 *
 * Allocate a security structure for the @x->security field; that field is
 * initialized to NULL at the time the xfrm_state is allocated.  The context
 * is set so that it corresponds to @sec_ctx.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_state_alloc(struct xfrm_state *x,
                              struct xfrm_user_sec_ctx *sec_ctx)
{
        int rc;

        rc = call_int_hook(xfrm_state_alloc, x, sec_ctx);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_alloc);

/**
 * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @polsec: associated policy's security context
 * @secid: secid from the flow
 *
 * Allocate a security structure for the @x->security field; that field is
 * initialized to NULL at the time the xfrm_state is allocated.  The context
 * is set so that it corresponds to @secid.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                      struct xfrm_sec_ctx *polsec, u32 secid)
{
        int rc;

        rc = call_int_hook(xfrm_state_alloc_acquire, x, polsec, secid);
        return rc;
}

/**
 * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed
 * @x: xfrm state
 *
 * Authorization check for the deletion of @x->security.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_state_delete(struct xfrm_state *x)
{
        int rc;

        rc = call_int_hook(xfrm_state_delete_security, x);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_delete);

/**
 * security_xfrm_state_free() - Free a xfrm state
 * @x: xfrm state
 *
 * Deallocate @x->security by invoking the xfrm_state_free_security hook.
 */
void security_xfrm_state_free(struct xfrm_state *x)
{
        call_void_hook(xfrm_state_free_security, x);
}

/**
 * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed
 * @ctx: target xfrm security context
 * @fl_secid: flow secid used to authorize access
 *
 * Permission check performed when a flow selects a xfrm_policy for
 * processing XFRMs on a packet.  The hook runs both for per-socket policies
 * and for generic xfrm policies.
 *
 * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on
 *         other errors.
 */
int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
{
        int rc;

        rc = call_int_hook(xfrm_policy_lookup, ctx, fl_secid);
        return rc;
}

/**
 * security_xfrm_state_pol_flow_match() - Check for a xfrm match
 * @x: xfrm state to match
 * @xp: xfrm policy to check for a match
 * @flic: flow to check for a match.
 *
 * Check @xp and @flic for a match with @x.  Only the first registered
 * xfrm_state_pol_flow_match hook is consulted; if none is registered the
 * hook's default return value (LSM_RET_DEFAULT) is returned.
 *
 * Return: Returns 1 if there is a match.
 */
int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
                                       struct xfrm_policy *xp,
                                       const struct flowi_common *flic)
{
        struct lsm_static_call *scall;
        /* Result used when the loop body below never runs. */
        int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match);

        /*
         * Since this function is expected to return 0 or 1, the judgment
         * becomes difficult if multiple LSMs supply this call. Fortunately,
         * we can use the first LSM's judgment because currently only SELinux
         * supplies this call.
         *
         * For speed optimization, we explicitly break the loop rather than
         * using the macro.
         */
        lsm_for_each_hook(scall, xfrm_state_pol_flow_match) {
                rc = scall->hl->hook.xfrm_state_pol_flow_match(x, xp, flic);
                /* First hook decides; never ask a second one. */
                break;
        }
        return rc;
}

/**
 * security_xfrm_decode_session() - Determine the xfrm secid for a packet
 * @skb: xfrm packet
 * @secid: secid
 *
 * Decode the packet held in @skb and hand back its security label through
 * @secid.
 *
 * Return: Return 0 if all xfrms used have the same secid.
 */
int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
{
        int rc;

        rc = call_int_hook(xfrm_decode_session, skb, secid, 1);
        return rc;
}

/**
 * security_skb_classify_flow() - Fill in a flow's secid from a packet
 * @skb: packet
 * @flic: flow whose flowic_secid field receives the result
 *
 * Invokes the xfrm_decode_session hook on @skb, storing the secid in
 * @flic->flowic_secid.  Unlike security_xfrm_decode_session(), the last hook
 * argument is 0 here and a non-zero hook result is treated as a kernel bug
 * (BUG_ON).
 * NOTE(review): the exact meaning of the 0/1 argument is defined by the hook
 * implementation and is not visible from this function - confirm there.
 */
void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic)
{
        int rc = call_int_hook(xfrm_decode_session, skb, &flic->flowic_secid,
                               0);

        /* The hook is not expected to fail in this mode. */
        BUG_ON(rc);
}
EXPORT_SYMBOL(security_skb_classify_flow);
#endif        /* CONFIG_SECURITY_NETWORK_XFRM */

#ifdef CONFIG_KEYS
/**
 * security_key_alloc() - Allocate and initialize a kernel key LSM blob
 * @key: key
 * @cred: credentials
 * @flags: allocation flags
 *
 * Permit allocation of a key and assign security data.  At this point @key
 * has not yet been given a serial number.  If the key_alloc hook fails, the
 * security data is released again through security_key_free().
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_alloc(struct key *key, const struct cred *cred,
                       unsigned long flags)
{
        int rc;

        rc = lsm_key_alloc(key);
        if (unlikely(rc))
                return rc;

        rc = call_int_hook(key_alloc, key, cred, flags);
        if (unlikely(rc))
                goto out_free;

        return 0;

out_free:
        security_key_free(key);
        return rc;
}

/**
 * security_key_free() - Free a kernel key LSM blob
 * @key: key
 *
 * Notification of destruction; free security data.  The blob at
 * @key->security is released with kfree() and the pointer is reset to NULL.
 * No LSM hook is invoked from this function.
 */
void security_key_free(struct key *key)
{
        kfree(key->security);
        key->security = NULL;
}

/**
 * security_key_permission() - Check if a kernel key operation is allowed
 * @key_ref: key reference
 * @cred: credentials of actor requesting access
 * @need_perm: requested permissions
 *
 * Decide whether a process is granted a specific operational right on a key.
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_permission(key_ref_t key_ref, const struct cred *cred,
                            enum key_need_perm need_perm)
{
        int rc;

        rc = call_int_hook(key_permission, key_ref, cred, need_perm);
        return rc;
}

/**
 * security_key_getsecurity() - Get the key's security label
 * @key: key
 * @buffer: security label buffer
 *
 * Produce a textual representation of the security context attached to
 * @key, for the purposes of honouring KEYCTL_GETSECURITY.  Storage for the
 * NUL-terminated string is allocated by this function and must be freed by
 * the caller.  *@buffer is reset to NULL before the hook is invoked.
 *
 * Return: Returns the length of @buffer (including terminating NUL) or -ve if
 *         an error occurs.  May also return 0 (and a NULL buffer pointer) if
 *         there is no security label assigned to the key.
 */
int security_key_getsecurity(struct key *key, char **buffer)
{
        int len;

        /* Start from "no label" so the caller never sees a stale pointer. */
        *buffer = NULL;

        len = call_int_hook(key_getsecurity, key, buffer);
        return len;
}

/**
 * security_key_post_create_or_update() - Notification of key create or update
 * @keyring: keyring to which the key is linked to
 * @key: created or updated key
 * @payload: data used to instantiate or update the key
 * @payload_len: length of payload
 * @flags: key flags
 * @create: flag indicating whether the key was created or updated
 *
 * Signal a key creation or update by invoking the key_post_create_or_update
 * hook with all of the arguments above.  Nothing is returned to the caller.
 */
void security_key_post_create_or_update(struct key *keyring, struct key *key,
                                        const void *payload, size_t payload_len,
                                        unsigned long flags, bool create)
{
        call_void_hook(key_post_create_or_update, keyring, key, payload,
                       payload_len, flags, create);
}
#endif        /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT
/**
 * security_audit_rule_init() - Allocate and init an LSM audit rule struct
 * @field: audit action
 * @op: rule operator
 * @rulestr: rule context
 * @lsmrule: receive buffer for audit rule struct
 * @gfp: GFP flag used for kmalloc
 *
 * Allocate an LSM audit rule structure and initialize it.
 *
 * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of
 *         an invalid rule.
 */
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule,
                             gfp_t gfp)
{
        int rc;

        rc = call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp);
        return rc;
}

/**
 * security_audit_rule_known() - Check if an audit rule contains LSM fields
 * @krule: audit rule
 *
 * Tell whether @krule contains any fields related to the current LSM.
 *
 * Return: Returns 1 in case of relation found, 0 otherwise.
 */
int security_audit_rule_known(struct audit_krule *krule)
{
        int known;

        known = call_int_hook(audit_rule_known, krule);
        return known;
}

/**
 * security_audit_rule_free() - Free an LSM audit rule struct
 * @lsmrule: audit rule struct
 *
 * Deallocate the LSM audit rule structure previously allocated by
 * audit_rule_init().  The work is delegated to the audit_rule_free hook.
 */
void security_audit_rule_free(void *lsmrule)
{
        call_void_hook(audit_rule_free, lsmrule);
}

/**
 * security_audit_rule_match() - Check if a label matches an audit rule
 * @prop: security label
 * @field: LSM audit field
 * @op: matching operator
 * @lsmrule: audit rule
 *
 * Decide whether @prop matches a rule that was previously approved by
 * security_audit_rule_known().
 *
 * Return: Returns 1 if @prop matches the rule, 0 if it does not, -ERRNO on
 *         failure.
 */
int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
                              void *lsmrule)
{
        int rc;

        rc = call_int_hook(audit_rule_match, prop, field, op, lsmrule);
        return rc;
}
#endif /* CONFIG_AUDIT */

#ifdef CONFIG_BPF_SYSCALL
/**
 * security_bpf() - Check if the bpf syscall operation is allowed
 * @cmd: command
 * @attr: bpf attribute
 * @size: size
 * @kernel: whether or not call originated from kernel
 *
 * Initial check applied to every bpf syscall once the attribute has been
 * copied into the kernel.  Individual security modules may implement their
 * own rules for the specific commands they care about.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel)
{
        int rc;

        rc = call_int_hook(bpf, cmd, attr, size, kernel);
        return rc;
}

/**
 * security_bpf_map() - Check if access to a bpf map is allowed
 * @map: bpf map
 * @fmode: mode
 *
 * Check performed when the kernel generates and returns a file descriptor
 * for an eBPF map.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        int rc;

        rc = call_int_hook(bpf_map, map, fmode);
        return rc;
}

/**
 * security_bpf_prog() - Check if access to a bpf program is allowed
 * @prog: bpf program
 *
 * Check performed when the kernel generates and returns a file descriptor
 * for an eBPF program.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_prog(struct bpf_prog *prog)
{
        int rc;

        rc = call_int_hook(bpf_prog, prog);
        return rc;
}

/**
 * security_bpf_map_create() - Check if BPF map creation is allowed
 * @map: BPF map object
 * @attr: BPF syscall attributes used to create BPF map
 * @token: BPF token used to grant user access
 * @kernel: whether or not call originated from kernel
 *
 * Check performed when the kernel creates a new BPF map.  The LSM blob is
 * allocated here as well for the LSMs that need one; it is released again
 * through security_bpf_map_free() if the bpf_map_create hook fails.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
                            struct bpf_token *token, bool kernel)
{
        int rc = lsm_bpf_map_alloc(map);

        if (unlikely(rc))
                return rc;

        rc = call_int_hook(bpf_map_create, map, attr, token, kernel);
        if (unlikely(rc))
                goto out_free;

        return 0;

out_free:
        security_bpf_map_free(map);
        return rc;
}

/**
 * security_bpf_prog_load() - Check if loading of BPF program is allowed
 * @prog: BPF program object
 * @attr: BPF syscall attributes used to create BPF program
 * @token: BPF token used to grant user access to BPF subsystem
 * @kernel: whether or not call originated from kernel
 *
 * Access control check performed when the kernel loads a BPF program and
 * allocates the associated BPF program object.  Any LSM state required for
 * the program is allocated here too, and is released again through
 * security_bpf_prog_free() if the bpf_prog_load hook fails.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
                           struct bpf_token *token, bool kernel)
{
        int rc = lsm_bpf_prog_alloc(prog);

        if (unlikely(rc))
                return rc;

        rc = call_int_hook(bpf_prog_load, prog, attr, token, kernel);
        if (unlikely(rc))
                goto out_free;

        return 0;

out_free:
        security_bpf_prog_free(prog);
        return rc;
}

/**
 * security_bpf_token_create() - Check if creating of BPF token is allowed
 * @token: BPF token object
 * @attr: BPF syscall attributes used to create BPF token
 * @path: path pointing to BPF FS mount point from which BPF token is created
 *
 * Check performed when the kernel instantiates a new BPF token object from a
 * BPF FS instance.  The LSM blob can be allocated for LSMs at this point; it
 * is released again through security_bpf_token_free() if the
 * bpf_token_create hook fails.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
                              const struct path *path)
{
        int rc = lsm_bpf_token_alloc(token);

        if (unlikely(rc))
                return rc;

        rc = call_int_hook(bpf_token_create, token, attr, path);
        if (unlikely(rc))
                goto out_free;

        return 0;

out_free:
        security_bpf_token_free(token);
        return rc;
}

/**
 * security_bpf_token_cmd() - Check if BPF token is allowed to delegate
 * requested BPF syscall command
 * @token: BPF token object
 * @cmd: BPF syscall command requested to be delegated by BPF token
 *
 * Check performed when the kernel decides whether the provided BPF token
 * should allow delegation of the requested BPF syscall command.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
{
        int rc;

        rc = call_int_hook(bpf_token_cmd, token, cmd);
        return rc;
}

/**
 * security_bpf_token_capable() - Check if BPF token is allowed to delegate
 * requested BPF-related capability
 * @token: BPF token object
 * @cap: capabilities requested to be delegated by BPF token
 *
 * Check performed when the kernel decides whether the provided BPF token
 * should allow delegation of the requested BPF-related capabilities.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_capable(const struct bpf_token *token, int cap)
{
        int rc;

        rc = call_int_hook(bpf_token_capable, token, cap);
        return rc;
}

/**
 * security_bpf_map_free() - Free a bpf map's LSM blob
 * @map: bpf map
 *
 * Clean up the security information stored inside bpf map.  The bpf_map_free
 * hook runs first, then @map->security is released with kfree() and reset
 * to NULL.
 */
void security_bpf_map_free(struct bpf_map *map)
{
        call_void_hook(bpf_map_free, map);
        kfree(map->security);
        map->security = NULL;
}

/**
 * security_bpf_prog_free() - Free a BPF program's LSM blob
 * @prog: BPF program struct
 *
 * Clean up the security information stored inside BPF program.  The
 * bpf_prog_free hook runs first, then @prog->aux->security is released with
 * kfree() and reset to NULL.
 */
void security_bpf_prog_free(struct bpf_prog *prog)
{
        call_void_hook(bpf_prog_free, prog);
        kfree(prog->aux->security);
        prog->aux->security = NULL;
}

/**
 * security_bpf_token_free() - Free a BPF token's LSM blob
 * @token: BPF token struct
 *
 * Clean up the security information stored inside BPF token.  The
 * bpf_token_free hook runs first, then @token->security is released with
 * kfree() and reset to NULL.
 */
void security_bpf_token_free(struct bpf_token *token)
{
        call_void_hook(bpf_token_free, token);
        kfree(token->security);
        token->security = NULL;
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * security_locked_down() - Check if a kernel feature is allowed
 * @what: requested kernel feature
 *
 * Decide whether a kernel feature that potentially enables arbitrary code
 * execution in kernel space should be permitted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_locked_down(enum lockdown_reason what)
{
        int rc;

        rc = call_int_hook(locked_down, what);
        return rc;
}
EXPORT_SYMBOL(security_locked_down);

/**
 * security_bdev_alloc() - Allocate a block device LSM blob
 * @bdev: block device
 *
 * Allocate a security structure and attach it to @bdev->bd_security.  That
 * field is initialized to NULL at the time the bdev structure is allocated.
 * If the bdev_alloc_security hook fails, the blob is released again through
 * security_bdev_free().
 *
 * Return: Return 0 if operation was successful.
 */
int security_bdev_alloc(struct block_device *bdev)
{
        int rc = lsm_bdev_alloc(bdev);

        if (unlikely(rc))
                return rc;

        rc = call_int_hook(bdev_alloc_security, bdev);
        if (unlikely(rc))
                goto out_free;

        return 0;

out_free:
        security_bdev_free(bdev);
        return rc;
}
EXPORT_SYMBOL(security_bdev_alloc);

/**
 * security_bdev_free() - Free a block device's LSM blob
 * @bdev: block device
 *
 * Deallocate the bdev security structure and set @bdev->bd_security to NULL.
 * Nothing happens, and no hook is called, when @bdev->bd_security is already
 * NULL.
 */
void security_bdev_free(struct block_device *bdev)
{
        if (bdev->bd_security) {
                /* Let the LSMs clean up before the blob goes away. */
                call_void_hook(bdev_free_security, bdev);

                kfree(bdev->bd_security);
                bdev->bd_security = NULL;
        }
}
EXPORT_SYMBOL(security_bdev_free);

/**
 * security_bdev_setintegrity() - Set the device's integrity data
 * @bdev: block device
 * @type: type of integrity, e.g. hash digest, signature, etc
 * @value: the integrity value
 * @size: size of the integrity value
 *
 * Register a verified integrity measurement of a bdev with LSMs.
 * LSMs should free the previously saved data if @value is NULL.
 *
 * This hook must be invoked every time the security information is updated
 * so that the stored data stays current.  Take dm-verity as an example: when
 * the mapping table is reloaded and configured to use a different dm-verity
 * target with a new roothash and signing information, the data previously
 * stored in the LSM blob becomes obsolete, and the hook has to be invoked
 * again to refresh it.
 *
 * The need for this comes from the design of device-mapper: a device-mapper
 * device is created first and targets are loaded into it afterwards, and
 * those targets can be modified multiple times during the device's lifetime.
 * So although the LSM blob is allocated when the block device is created,
 * its contents are not initialized at that stage and can change
 * substantially over time, including a change from data that the LSMs
 * 'trust' to data they do not.  Handling these changes correctly is
 * essential; failing to do so could potentially allow LSM checks to be
 * bypassed.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_bdev_setintegrity(struct block_device *bdev,
                               enum lsm_integrity_type type, const void *value,
                               size_t size)
{
        int rc;

        rc = call_int_hook(bdev_setintegrity, bdev, type, value, size);
        return rc;
}
EXPORT_SYMBOL(security_bdev_setintegrity);

#ifdef CONFIG_PERF_EVENTS
/**
 * security_perf_event_open() - Check if a perf event open is allowed
 * @type: type of event
 *
 * Decide whether the @type of perf_event_open syscall is allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_open(int type)
{
        int rc;

        rc = call_int_hook(perf_event_open, type);
        return rc;
}

/**
 * security_perf_event_alloc() - Allocate a perf event LSM blob
 * @event: perf event
 *
 * Allocate and save perf_event security info.  The blob is obtained with
 * lsm_blob_alloc() and then offered to the perf_event_alloc hook; if the
 * hook fails, the blob is freed again and @event->security is set to NULL.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_perf_event_alloc(struct perf_event *event)
{
        int err = lsm_blob_alloc(&event->security, blob_sizes.lbs_perf_event,
                                 GFP_KERNEL);

        if (err)
                return err;

        err = call_int_hook(perf_event_alloc, event);
        if (!err)
                return 0;

        /* The hook reported an error: give the fresh blob back. */
        kfree(event->security);
        event->security = NULL;
        return err;
}

/**
 * security_perf_event_free() - Free a perf event LSM blob
 * @event: perf event
 *
 * Release (free) perf_event security info.  @event->security is handed to
 * kfree() and then reset to NULL.  No LSM hook is invoked from this function.
 */
void security_perf_event_free(struct perf_event *event)
{
        kfree(event->security);
        event->security = NULL;
}

/**
 * security_perf_event_read() - Check if reading a perf event label is allowed
 * @event: perf event
 *
 * Read perf_event security info, provided that is allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_read(struct perf_event *event)
{
        int rc;

        rc = call_int_hook(perf_event_read, event);
        return rc;
}

/**
 * security_perf_event_write() - Check if writing a perf event label is allowed
 * @event: perf event
 *
 * Write perf_event security info, provided that is allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_write(struct perf_event *event)
{
        int rc;

        rc = call_int_hook(perf_event_write, event);
        return rc;
}
#endif /* CONFIG_PERF_EVENTS */

#ifdef CONFIG_IO_URING
/**
 * security_uring_override_creds() - Check if overriding creds is allowed
 * @new: new credentials
 *
 * Decide whether the current task, which is executing an io_uring operation,
 * may override its credentials with @new.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_override_creds(const struct cred *new)
{
        int rc;

        rc = call_int_hook(uring_override_creds, new);
        return rc;
}

/**
 * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed
 *
 * Decide whether the current task may spawn an io_uring polling thread
 * (IORING_SETUP_SQPOLL).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_sqpoll(void)
{
        int rc;

        rc = call_int_hook(uring_sqpoll);
        return rc;
}

/**
 * security_uring_cmd() - Check if a io_uring passthrough command is allowed
 * @ioucmd: command
 *
 * Decide whether the file_operations uring_cmd is allowed to run.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_cmd(struct io_uring_cmd *ioucmd)
{
        int rc;

        rc = call_int_hook(uring_cmd, ioucmd);
        return rc;
}

/**
 * security_uring_allowed() - Check if io_uring_setup() is allowed
 *
 * Decide whether the current task may call io_uring_setup().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_allowed(void)
{
        int rc;

        rc = call_int_hook(uring_allowed);
        return rc;
}
#endif /* CONFIG_IO_URING */

/**
 * security_initramfs_populated() - Notify LSMs that initramfs has been loaded
 *
 * Tells the LSMs the initramfs has been unpacked into the rootfs, by invoking
 * the initramfs_populated hook.  Takes no arguments and returns nothing.
 */
void security_initramfs_populated(void)
{
        call_void_hook(initramfs_populated);
}



































































































































































































    8 








    8 


    8 

    8 



    7 

    8 










    8 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
// SPDX-License-Identifier: GPL-2.0
#include <linux/irq_work.h>
#include <linux/spinlock.h>
#include <linux/task_work.h>
#include <linux/resume_user_mode.h>

/*
 * Sentinel list head: once task->task_works points at this entry,
 * task_work_add() refuses to queue further work and returns -ESRCH.
 */
static struct callback_head work_exited; /* all we need is ->next == NULL */

#ifdef CONFIG_IRQ_WORK
/*
 * irq_work callback: flag the current task with TIF_NOTIFY_RESUME.  Queued
 * by task_work_add() for the TWA_NMI_CURRENT notify mode; @entry is unused.
 */
static void task_work_set_notify_irq(struct irq_work *entry)
{
        test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
}
/* Per-CPU irq_work whose handler is task_work_set_notify_irq(). */
static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
        IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
#endif

/**
 * task_work_add - ask the @task to execute @work->func()
 * @task: the task which should run the callback
 * @work: the callback to run
 * @notify: how to notify the targeted task
 *
 * Queue @work for task_work_run() below and notify the @task if @notify
 * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT.
 *
 * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
 * task and run the task_work, regardless of whether the task is currently
 * running in the kernel or userspace.
 * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
 * reschedule IPI to force the targeted task to reschedule and run task_work.
 * This can be advantageous if there's no strict requirement that the
 * task_work be run as soon as possible, just whenever the task enters the
 * kernel anyway.
 * @TWA_RESUME work is run only when the task exits the kernel and returns to
 * user mode, or before entering guest mode.
 * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the
 * current @task and if the current context is NMI.
 *
 * Fails if the @task is exiting/exited and thus it can't process this @work.
 * Otherwise @work->func() will be called when the @task goes through one of
 * the aforementioned transitions, or exits.
 *
 * If the targeted task is exiting, then an error is returned and the work item
 * is not queued. It's up to the caller to arrange for an alternative mechanism
 * in that case.
 *
 * Note: there is no ordering guarantee on works queued here. The task_work
 * list is LIFO.
 *
 * RETURNS:
 * 0 on success, -ESRCH if the @task is exiting/exited, or -EINVAL if
 * @TWA_NMI_CURRENT is requested for a @task other than current or on a
 * kernel built without CONFIG_IRQ_WORK.
 */
int task_work_add(struct task_struct *task, struct callback_head *work,
                  enum task_work_notify_mode notify)
{
        struct callback_head *head;

        if (notify == TWA_NMI_CURRENT) {
                /* NMI mode is only valid for current and needs irq_work. */
                if (WARN_ON_ONCE(task != current))
                        return -EINVAL;
                if (!IS_ENABLED(CONFIG_IRQ_WORK))
                        return -EINVAL;
        } else {
                kasan_record_aux_stack(work);
        }

        /*
         * Link @work in front of the current list head.  Give up as soon as
         * the head is seen to be the work_exited sentinel.
         */
        head = READ_ONCE(task->task_works);
        do {
                if (unlikely(head == &work_exited))
                        return -ESRCH;
                work->next = head;
        } while (!try_cmpxchg(&task->task_works, &head, work));

        /* The work is queued; now notify the task as requested. */
        switch (notify) {
        case TWA_NONE:
                break;
        case TWA_RESUME:
                set_notify_resume(task);
                break;
        case TWA_SIGNAL:
                set_notify_signal(task);
                break;
        case TWA_SIGNAL_NO_IPI:
                __set_notify_signal(task);
                break;
#ifdef CONFIG_IRQ_WORK
        case TWA_NMI_CURRENT:
                irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
                break;
#endif
        default:
                /* Unknown mode: warn once; the work stays queued. */
                WARN_ON_ONCE(1);
                break;
        }

        return 0;
}

/**
 * task_work_cancel_match - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @match: match function to call
 * @data: data to be passed in to match function
 *
 * Walks @task's work list from the head and unlinks the first entry for
 * which @match(entry, @data) returns true.  The walk is done with
 * @task->pi_lock held and interrupts disabled.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel_match(struct task_struct *task,
                       bool (*match)(struct callback_head *, void *data),
                       void *data)
{
        struct callback_head **pprev = &task->task_works;
        struct callback_head *work;
        unsigned long flags;

        /* Fast path: nothing queued, so nothing to cancel. */
        if (likely(!task_work_pending(task)))
                return NULL;
        /*
         * If cmpxchg() fails we continue without updating pprev.
         * Either we raced with task_work_add() which added the
         * new entry before this work, we will find it again. Or
         * we raced with task_work_run(), *pprev == NULL/exited.
         */
        raw_spin_lock_irqsave(&task->pi_lock, flags);
        work = READ_ONCE(*pprev);
        while (work) {
                if (!match(work, data)) {
                        /* Not ours: step to the next entry. */
                        pprev = &work->next;
                        work = READ_ONCE(*pprev);
                } else if (try_cmpxchg(pprev, &work, work->next))
                        break;
        }
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);

        return work;
}

/* Match callback: true when @cb's handler function is the one passed in @data. */
static bool task_work_func_match(struct callback_head *cb, void *data)
{
        const bool same_func = (cb->func == data);

        return same_func;
}

/**
 * task_work_cancel_func - cancel a pending work matching a function added by task_work_add()
 * @task: the task which should execute the func's work
 * @func: identifies the func to match with a work to remove
 *
 * Thin wrapper around task_work_cancel_match() that uses
 * task_work_func_match() to find the last queued pending work whose
 * ->func equals @func and removes it from the queue.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel_func(struct task_struct *task, task_work_func_t func)
{
        struct callback_head *removed;

        removed = task_work_cancel_match(task, task_work_func_match, func);
        return removed;
}

/* Match callback: true when @cb is exactly the callback_head passed in @data. */
static bool task_work_match(struct callback_head *cb, void *data)
{
        const bool same_work = (cb == data);

        return same_work;
}

/**
 * task_work_cancel - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @cb: the callback to remove if queued
 *
 * Asks task_work_cancel_match() to unlink @cb itself (pointer identity,
 * via task_work_match()) from @task's queue.
 *
 * RETURNS:
 * True if the callback was queued and got cancelled, false otherwise.
 */
bool task_work_cancel(struct task_struct *task, struct callback_head *cb)
{
        return task_work_cancel_match(task, task_work_match, cb) == cb;
}

/**
 * task_work_run - execute the works added by task_work_add()
 *
 * Flush the pending works. Should be used by the core kernel code.
 * Called before the task returns to the user-mode or stops, or when
 * it exits. In the latter case task_work_add() can no longer add the
 * new work after task_work_run() returns.
 *
 * Operates on current: repeatedly detaches the whole ->task_works list
 * and invokes each entry's ->func until the list is found empty.
 */
void task_work_run(void)
{
        struct task_struct *task = current;
        struct callback_head *work, *head, *next;

        for (;;) {
                /*
                 * work->func() can do task_work_add(), do not set
                 * work_exited unless the list is empty.
                 */
                work = READ_ONCE(task->task_works);
                do {
                        /* Default: replace the detached list with an empty one. */
                        head = NULL;
                        if (!work) {
                                /*
                                 * Empty list: an exiting task installs the
                                 * work_exited marker, anyone else is done.
                                 */
                                if (task->flags & PF_EXITING)
                                        head = &work_exited;
                                else
                                        break;
                        }
                } while (!try_cmpxchg(&task->task_works, &work, head));

                /* Nothing was detached, so there is nothing left to run. */
                if (!work)
                        break;
                /*
                 * Synchronize with task_work_cancel_match(). It can not remove
                 * the first entry == work, cmpxchg(task_works) must fail.
                 * But it can remove another entry from the ->next list.
                 */
                raw_spin_lock_irq(&task->pi_lock);
                raw_spin_unlock_irq(&task->pi_lock);

                do {
                        /*
                         * Fetch ->next before calling ->func(); presumably the
                         * callback may free or reuse 'work' - confirm.
                         */
                        next = work->next;
                        work->func(work);
                        work = next;
                        cond_resched();
                } while (work);
        }
}




























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAPOPS_H
#define _LINUX_SWAPOPS_H

#include <linux/radix-tree.h>
#include <linux/bug.h>
#include <linux/mm_types.h>

#ifdef CONFIG_MMU

#ifdef CONFIG_SWAP
#include <linux/swapfile.h>
#endif        /* CONFIG_SWAP */

/*
 * swapcache pages are stored in the swapper_space radix tree.  We want to
 * get good packing density in that tree, so the index should be dense in
 * the low-order bits.
 *
 * We arrange the `type' and `offset' fields so that `type' is at the six
 * high-order bits of the swp_entry_t and `offset' is right-aligned in the
 * remaining bits.  Although `type' itself needs only five bits, we allow for
 * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry().
 *
 * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
 */
#define SWP_TYPE_SHIFT        (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
#define SWP_OFFSET_MASK        ((1UL << SWP_TYPE_SHIFT) - 1)

/*
 * Definitions only for PFN swap entries (see is_pfn_swap_entry()).  To
 * store PFN, we only need SWP_PFN_BITS bits.  Each of the pfn swap entries
 * can use the extra bits to store other information besides PFN.
 */
#ifdef MAX_PHYSMEM_BITS
#define SWP_PFN_BITS                (MAX_PHYSMEM_BITS - PAGE_SHIFT)
#else  /* MAX_PHYSMEM_BITS */
#define SWP_PFN_BITS                min_t(int, \
                                      sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \
                                      SWP_TYPE_SHIFT)
#endif        /* MAX_PHYSMEM_BITS */
#define SWP_PFN_MASK                (BIT(SWP_PFN_BITS) - 1)

/**
 * Migration swap entry specific bitfield definitions.  Layout:
 *
 *   |----------+--------------------|
 *   | swp_type | swp_offset         |
 *   |----------+--------+-+-+-------|
 *   |          | resv   |D|A|  PFN  |
 *   |----------+--------+-+-+-------|
 *
 * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A)
 * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D)
 *
 * Note: A/D bits will be stored in migration entries iff there're enough
 * free bits in arch specific swp offset.  By default we'll ignore A/D bits
 * when migrating a page.  Please refer to migration_entry_supports_ad()
 * for more information.  If there're more bits besides PFN and A/D bits,
 * they should be reserved and always be zeros.
 */
#define SWP_MIG_YOUNG_BIT                (SWP_PFN_BITS)
#define SWP_MIG_DIRTY_BIT                (SWP_PFN_BITS + 1)
#define SWP_MIG_TOTAL_BITS                (SWP_PFN_BITS + 2)

#define SWP_MIG_YOUNG                        BIT(SWP_MIG_YOUNG_BIT)
#define SWP_MIG_DIRTY                        BIT(SWP_MIG_DIRTY_BIT)

static inline bool is_pfn_swap_entry(swp_entry_t entry);

/*
 * Strip the exclusive, soft-dirty and uffd-wp software bits from a swap pte
 * so that only the swp_entry_t related information remains.
 */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
        pte = pte_swp_exclusive(pte) ? pte_swp_clear_exclusive(pte) : pte;
        pte = pte_swp_soft_dirty(pte) ? pte_swp_clear_soft_dirty(pte) : pte;
        pte = pte_swp_uffd_wp(pte) ? pte_swp_clear_uffd_wp(pte) : pte;

        return pte;
}

/*
 * Pack a type+offset pair into a swp_entry_t in the arch-independent format:
 * @type is shifted up by SWP_TYPE_SHIFT and @offset is truncated to
 * SWP_OFFSET_MASK.
 */
static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
{
        const unsigned long type_bits = type << SWP_TYPE_SHIFT;
        const unsigned long offset_bits = offset & SWP_OFFSET_MASK;
        swp_entry_t ret;

        ret.val = type_bits | offset_bits;
        return ret;
}

/*
 * Return the `type' field of an arch-independent swp_entry_t, i.e. the bits
 * above SWP_TYPE_SHIFT.
 */
static inline unsigned swp_type(swp_entry_t entry)
{
        const unsigned type = entry.val >> SWP_TYPE_SHIFT;

        return type;
}

/*
 * Return the `offset' field of an arch-independent swp_entry_t, i.e. the bits
 * selected by SWP_OFFSET_MASK.
 */
static inline pgoff_t swp_offset(swp_entry_t entry)
{
        const pgoff_t offset = entry.val & SWP_OFFSET_MASK;

        return offset;
}

/*
 * Return the PFN stored in a pfn swap entry.  This should only be called on
 * a pfn swap entry; please refer to is_pfn_swap_entry() for the definition
 * of a pfn swap entry.  Only the low SWP_PFN_MASK bits of the offset are
 * returned.
 */
static inline unsigned long swp_offset_pfn(swp_entry_t entry)
{
        pgoff_t offset;

        VM_BUG_ON(!is_pfn_swap_entry(entry));
        offset = swp_offset(entry);

        return offset & SWP_PFN_MASK;
}

/* Check whether a pte holds a swap entry: it is neither none nor present. */
static inline int is_swap_pte(pte_t pte)
{
        if (pte_none(pte))
                return 0;

        return !pte_present(pte);
}

/*
 * Translate the arch-dependent pte encoding of a swap entry into the
 * arch-independent swp_entry_t.  Software flag bits are cleared from the
 * pte first.
 */
static inline swp_entry_t pte_to_swp_entry(pte_t pte)
{
        pte_t bare = pte_swp_clear_flags(pte);
        swp_entry_t arch_entry = __pte_to_swp_entry(bare);

        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}

/*
 * Translate an arch-independent swp_entry_t into the arch-dependent pte
 * representation.
 */
static inline pte_t swp_entry_to_pte(swp_entry_t entry)
{
        swp_entry_t arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));

        return __swp_entry_to_pte(arch_entry);
}

static inline swp_entry_t radix_to_swp_entry(void *arg)
{
        swp_entry_t entry;

        entry.val = xa_to_value(arg);
        return entry;
}

/* Encode a swp_entry_t as an XArray value entry via xa_mk_value(). */
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
        void *xa_entry = xa_mk_value(entry.val);

        return xa_entry;
}

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
/* Encode @offset as a swap entry of type SWP_DEVICE_READ. */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
        swp_entry_t dev_entry = swp_entry(SWP_DEVICE_READ, offset);

        return dev_entry;
}

/* Encode @offset as a swap entry of type SWP_DEVICE_WRITE. */
static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
{
        swp_entry_t dev_entry = swp_entry(SWP_DEVICE_WRITE, offset);

        return dev_entry;
}

/* True when the entry's type is SWP_DEVICE_READ or SWP_DEVICE_WRITE. */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        switch (swp_type(entry)) {
        case SWP_DEVICE_READ:
        case SWP_DEVICE_WRITE:
                return true;
        default:
                return false;
        }
}

/* True when the entry's type is SWP_DEVICE_WRITE (hinted as the unlikely case). */
static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
        const bool writable = (swp_type(entry) == SWP_DEVICE_WRITE);

        return unlikely(writable);
}

/* Encode @offset as a swap entry of type SWP_DEVICE_EXCLUSIVE. */
static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
{
        swp_entry_t dev_entry = swp_entry(SWP_DEVICE_EXCLUSIVE, offset);

        return dev_entry;
}

/* True when the entry's type is SWP_DEVICE_EXCLUSIVE. */
static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return type == SWP_DEVICE_EXCLUSIVE;
}

#else /* CONFIG_DEVICE_PRIVATE */
/* !CONFIG_DEVICE_PRIVATE stub: ignores @offset and returns swp_entry(0, 0). */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* !CONFIG_DEVICE_PRIVATE stub: ignores @offset and returns swp_entry(0, 0). */
static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* !CONFIG_DEVICE_PRIVATE stub: no entry is ever a device-private entry. */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        return false;
}

/* !CONFIG_DEVICE_PRIVATE stub: always reports false. */
static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
        return false;
}

/* !CONFIG_DEVICE_PRIVATE stub: ignores @offset and returns swp_entry(0, 0). */
static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* !CONFIG_DEVICE_PRIVATE stub: always reports false. */
static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
        return false;
}

#endif /* CONFIG_DEVICE_PRIVATE */

#ifdef CONFIG_MIGRATION
/*
 * Non-zero when the entry's type is one of the three migration types
 * (read, read-exclusive or write).  Hinted as the unlikely case.
 */
static inline int is_migration_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return unlikely(type == SWP_MIGRATION_READ ||
                        type == SWP_MIGRATION_READ_EXCLUSIVE ||
                        type == SWP_MIGRATION_WRITE);
}

/* Non-zero when the entry's type is SWP_MIGRATION_WRITE. */
static inline int is_writable_migration_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return unlikely(type == SWP_MIGRATION_WRITE);
}

/* Non-zero when the entry's type is SWP_MIGRATION_READ. */
static inline int is_readable_migration_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return unlikely(type == SWP_MIGRATION_READ);
}

/* Non-zero when the entry's type is SWP_MIGRATION_READ_EXCLUSIVE. */
static inline int is_readable_exclusive_migration_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return unlikely(type == SWP_MIGRATION_READ_EXCLUSIVE);
}

/* Encode @offset as a swap entry of type SWP_MIGRATION_READ. */
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
        swp_entry_t mig_entry = swp_entry(SWP_MIGRATION_READ, offset);

        return mig_entry;
}

/* Encode @offset as a swap entry of type SWP_MIGRATION_READ_EXCLUSIVE. */
static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset)
{
        swp_entry_t mig_entry = swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset);

        return mig_entry;
}

/* Encode @offset as a swap entry of type SWP_MIGRATION_WRITE. */
static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
        swp_entry_t mig_entry = swp_entry(SWP_MIGRATION_WRITE, offset);

        return mig_entry;
}

/*
 * Returns whether the host has large enough swap offset field to support
 * carrying over pgtable A/D bits for page migrations.  The result is
 * pretty much arch specific.
 *
 * With CONFIG_SWAP this simply reports swap_migration_ad_supported; without
 * it the answer is always false.
 */
static inline bool migration_entry_supports_ad(void)
{
#ifdef CONFIG_SWAP
        return swap_migration_ad_supported;
#else  /* CONFIG_SWAP */
        return false;
#endif        /* CONFIG_SWAP */
}

/*
 * Return @entry with SWP_MIG_YOUNG set in its offset when A/D bits are
 * supported (see migration_entry_supports_ad()); otherwise return @entry
 * unchanged.
 */
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
        if (!migration_entry_supports_ad())
                return entry;

        return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG);
}

/*
 * Report whether SWP_MIG_YOUNG is set in the entry's offset.  Without A/D
 * support the answer is always false, which keeps the old behavior of aging
 * the page after migration.
 */
static inline bool is_migration_entry_young(swp_entry_t entry)
{
        if (!migration_entry_supports_ad())
                return false;

        return swp_offset(entry) & SWP_MIG_YOUNG;
}

/*
 * Return @entry with SWP_MIG_DIRTY set in its offset when A/D bits are
 * supported (see migration_entry_supports_ad()); otherwise return @entry
 * unchanged.
 */
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
        if (!migration_entry_supports_ad())
                return entry;

        return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY);
}

/*
 * Report whether SWP_MIG_DIRTY is set in the entry's offset.  Without A/D
 * support the answer is always false, which keeps the old behavior of a
 * clean page after migration.
 */
static inline bool is_migration_entry_dirty(swp_entry_t entry)
{
        if (!migration_entry_supports_ad())
                return false;

        return swp_offset(entry) & SWP_MIG_DIRTY;
}

extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte);
#else  /* CONFIG_MIGRATION */
/* !CONFIG_MIGRATION stub: ignores @offset and returns swp_entry(0, 0). */
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* !CONFIG_MIGRATION stub: ignores @offset and returns swp_entry(0, 0). */
static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* !CONFIG_MIGRATION stub: ignores @offset and returns swp_entry(0, 0). */
static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* !CONFIG_MIGRATION stub: no entry is ever a migration entry. */
static inline int is_migration_entry(swp_entry_t swp)
{
        return 0;
}

/* !CONFIG_MIGRATION stub: does nothing. */
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address) { }
/* !CONFIG_MIGRATION stub: does nothing. */
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
                                             unsigned long addr, pte_t *pte) { }
/* !CONFIG_MIGRATION stub: always returns 0. */
static inline int is_writable_migration_entry(swp_entry_t entry)
{
        return 0;
}
/* !CONFIG_MIGRATION stub: always returns 0. */
static inline int is_readable_migration_entry(swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_MIGRATION stub: returns @entry unchanged. */
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
        return entry;
}

/* !CONFIG_MIGRATION stub: always reports false. */
static inline bool is_migration_entry_young(swp_entry_t entry)
{
        return false;
}

/* !CONFIG_MIGRATION stub: returns @entry unchanged. */
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
        return entry;
}

/* !CONFIG_MIGRATION stub: always reports false. */
static inline bool is_migration_entry_dirty(swp_entry_t entry)
{
        return false;
}
#endif        /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_FAILURE

/*
 * Support for hardware poisoned pages
 */
/*
 * Build a SWP_HWPOISON swap entry whose offset is @page's pfn.  BUGs if
 * @page is not locked.
 */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        unsigned long pfn;

        BUG_ON(!PageLocked(page));
        pfn = page_to_pfn(page);

        return swp_entry(SWP_HWPOISON, pfn);
}

/* Non-zero when the entry's type is SWP_HWPOISON. */
static inline int is_hwpoison_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return type == SWP_HWPOISON;
}

#else

/* !CONFIG_MEMORY_FAILURE stub: ignores @page and returns swp_entry(0, 0). */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        return swp_entry(0, 0);
}

/* !CONFIG_MEMORY_FAILURE stub: no entry is ever a hwpoison entry. */
static inline int is_hwpoison_entry(swp_entry_t swp)
{
        return 0;
}
#endif

typedef unsigned long pte_marker;

#define  PTE_MARKER_UFFD_WP                        BIT(0)
/*
 * "Poisoned" here is meant in the very general sense of "future accesses are
 * invalid", instead of referring very specifically to hardware memory errors.
 * This marker is meant to represent any of various different causes of this.
 *
 * Note that, when encountered by the faulting logic, PTEs with this marker will
 * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error
 * logic.
 */
#define  PTE_MARKER_POISONED                        BIT(1)
/*
 * Indicates that, on fault, this PTE will cause a SIGSEGV signal to be
 * sent. This means guard markers behave in effect as if the region were mapped
 * PROT_NONE, rather than if they were a memory hole or equivalent.
 */
#define  PTE_MARKER_GUARD                        BIT(2)
#define  PTE_MARKER_MASK                        (BIT(3) - 1)

/* Build a SWP_PTE_MARKER swap entry carrying @marker in its offset field. */
static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
{
        swp_entry_t marker_entry = swp_entry(SWP_PTE_MARKER, marker);

        return marker_entry;
}

/* True when the entry's type is SWP_PTE_MARKER. */
static inline bool is_pte_marker_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return type == SWP_PTE_MARKER;
}

/* Extract the marker bits (PTE_MARKER_MASK) from the entry's offset field. */
static inline pte_marker pte_marker_get(swp_entry_t entry)
{
        const pgoff_t offset = swp_offset(entry);

        return offset & PTE_MARKER_MASK;
}

/* True when @pte is a swap pte whose decoded entry is a pte marker entry. */
static inline bool is_pte_marker(pte_t pte)
{
        if (!is_swap_pte(pte))
                return false;

        return is_pte_marker_entry(pte_to_swp_entry(pte));
}

/* Build the pte representation of a pte marker entry carrying @marker. */
static inline pte_t make_pte_marker(pte_marker marker)
{
        swp_entry_t marker_entry = make_pte_marker_entry(marker);

        return swp_entry_to_pte(marker_entry);
}

/* Build a pte marker entry with only PTE_MARKER_POISONED set. */
static inline swp_entry_t make_poisoned_swp_entry(void)
{
        swp_entry_t poisoned = make_pte_marker_entry(PTE_MARKER_POISONED);

        return poisoned;
}

/* Non-zero when @entry is a pte marker entry with PTE_MARKER_POISONED set. */
static inline int is_poisoned_swp_entry(swp_entry_t entry)
{
        if (!is_pte_marker_entry(entry))
                return 0;

        return (pte_marker_get(entry) & PTE_MARKER_POISONED) != 0;
}

/* Build a pte marker entry with only PTE_MARKER_GUARD set. */
static inline swp_entry_t make_guard_swp_entry(void)
{
        swp_entry_t guard = make_pte_marker_entry(PTE_MARKER_GUARD);

        return guard;
}

/* Non-zero when @entry is a pte marker entry with PTE_MARKER_GUARD set. */
static inline int is_guard_swp_entry(swp_entry_t entry)
{
        if (!is_pte_marker_entry(entry))
                return 0;

        return (pte_marker_get(entry) & PTE_MARKER_GUARD) != 0;
}

/*
 * A variant of pte_none() that also treats a pte marker as "none".  It exists
 * because in many cases a pte marker should be seen as a none pte: we merely
 * stored some information onto the none pte, so it is not-none any more.
 *
 * Use it when the pte is file-backed, ram-based and backing userspace pages,
 * like shmem.  It is not needed on page tables that do not support pte
 * markers at all - for example anonymous memory, kernel-only memory
 * (including during boot) or non-ram based generic file systems.  Using it
 * there is harmless, but the extra pte marker check is pure overhead.
 */
static inline int pte_none_mostly(pte_t pte)
{
        if (pte_none(pte))
                return 1;

        return is_pte_marker(pte);
}

/*
 * Return the struct page for the pfn stored in a pfn swap entry.  BUGs if
 * @entry is a migration entry and that page is not locked.
 */
static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
{
        unsigned long pfn = swp_offset_pfn(entry);
        struct page *page = pfn_to_page(pfn);

        /*
         * Migration entries may only be used while the corresponding
         * page is locked.
         */
        BUG_ON(is_migration_entry(entry) && !PageLocked(page));

        return page;
}

/*
 * Return the folio for the pfn stored in a pfn swap entry.  BUGs if @entry
 * is a migration entry and that folio is not locked.
 */
static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
{
        unsigned long pfn = swp_offset_pfn(entry);
        struct folio *target = pfn_folio(pfn);

        /*
         * Migration entries may only be used while the corresponding
         * folio is locked.
         */
        BUG_ON(is_migration_entry(entry) && !folio_test_locked(target));

        return target;
}

/*
 * A pfn swap entry is a special type of swap entry that always has a pfn
 * stored in the swap offset.  Such entries are used to represent
 * unaddressable device memory, to restrict access to a page undergoing
 * migration, or to represent a pfn which has been hwpoisoned and unmapped.
 */
static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
        /* The swp offset must always be able to hold the needed fields. */
        BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);

        if (is_migration_entry(entry) || is_device_private_entry(entry))
                return true;

        return is_device_exclusive_entry(entry) || is_hwpoison_entry(entry);
}

struct page_vma_mapped_walk;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page);

extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new);

extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);

/*
 * Translate the arch-dependent pmd encoding of a swap entry into the
 * arch-independent swp_entry_t, after clearing the soft-dirty and uffd-wp
 * software bits.
 */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        swp_entry_t arch_entry;

        pmd = pmd_swp_soft_dirty(pmd) ? pmd_swp_clear_soft_dirty(pmd) : pmd;
        pmd = pmd_swp_uffd_wp(pmd) ? pmd_swp_clear_uffd_wp(pmd) : pmd;

        arch_entry = __pmd_to_swp_entry(pmd);
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}

/*
 * Translate an arch-independent swp_entry_t into the arch-dependent pmd
 * representation.
 */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        swp_entry_t arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));

        return __swp_entry_to_pmd(arch_entry);
}

/* Non-zero when @pmd is a swap pmd whose decoded entry is a migration entry. */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        if (!is_swap_pmd(pmd))
                return 0;

        return is_migration_entry(pmd_to_swp_entry(pmd)) != 0;
}
#else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
/*
 * !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: the body is only BUILD_BUG(), so
 * any call that survives to code generation breaks the build.
 */
static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
{
        BUILD_BUG();
}

/*
 * !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: the body is only BUILD_BUG(), so
 * any call that survives to code generation breaks the build.
 */
static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new)
{
        BUILD_BUG();
}

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: does nothing. */
static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: ignores @pmd, returns swp_entry(0, 0). */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        return swp_entry(0, 0);
}

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: ignores @entry, returns __pmd(0). */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        return __pmd(0);
}

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: always returns 0. */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        return 0;
}
#endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */

/* Non-zero when the entry's type is at or beyond MAX_SWAPFILES. */
static inline int non_swap_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return type >= MAX_SWAPFILES;
}

#endif /* CONFIG_MMU */
#endif /* _LINUX_SWAPOPS_H */
















































































































  316 












  316 







































    7 





























































  320 
  317 
















  320 
  316 

































































































































  312 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Berkeley style UIO structures        -        Alan Cox 1994.
 */
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <linux/ucopysize.h>
#include <uapi/linux/uio.h>

struct page;
struct folio_queue;

typedef unsigned int __bitwise iov_iter_extraction_t;

/* A (base pointer, length) pair; the base must not be a userland pointer. */
struct kvec {
        void *iov_base; /* and that should *never* hold a userland pointer */
        size_t iov_len;
};

/*
 * Kind of backing store an iov_iter walks.  The value is kept in
 * iov_iter::iter_type and tested by the iter_is_*() / iov_iter_is_*()
 * helpers below.
 */
enum iter_type {
        /* iter types */
        ITER_UBUF,
        ITER_IOVEC,
        ITER_BVEC,
        ITER_KVEC,
        ITER_FOLIOQ,
        ITER_XARRAY,
        ITER_DISCARD,
};

#define ITER_SOURCE        1        // == WRITE
#define ITER_DEST        0        // == READ

/*
 * Snapshot of the iov_iter fields copied by iov_iter_save_state():
 * iov_offset, count and nr_segs.
 */
struct iov_iter_state {
        size_t iov_offset;
        size_t count;
        unsigned long nr_segs;
};

/*
 * Iterator over one of several kinds of backing store (see enum iter_type).
 * The anonymous unions let each iterator kind use its own pointer and
 * position fields on top of the shared iov_offset/count.
 */
struct iov_iter {
        /* one of enum iter_type; read through iov_iter_type() */
        u8 iter_type;
        /* NOTE(review): semantics not visible in this header - confirm with users */
        bool nofault;
        /* true means WRITE, false means READ (see iov_iter_rw()) */
        bool data_source;
        size_t iov_offset;
        /*
         * Hack alert: overlay ubuf_iovec with iovec + count, so
         * that the members resolve correctly regardless of the type
         * of iterator used. This means that you can use:
         *
         * &iter->__ubuf_iovec or iter->__iov
         *
         * interchangeably for the user_backed cases, hence simplifying
         * some of the cases that need to deal with both.
         */
        union {
                /*
                 * This really should be a const, but we cannot do that without
                 * also modifying any of the zero-filling iter init functions.
                 * Leave it non-const for now, but it should be treated as such.
                 */
                struct iovec __ubuf_iovec;
                struct {
                        union {
                                /* use iter_iov() to get the current vec */
                                const struct iovec *__iov;
                                const struct kvec *kvec;
                                const struct bio_vec *bvec;
                                const struct folio_queue *folioq;
                                struct xarray *xarray;
                                void __user *ubuf;
                        };
                        /* returned by iter_iov_len() as the length for ITER_UBUF */
                        size_t count;
                };
        };
        /* per-type position/extent; which member is valid depends on iter_type */
        union {
                unsigned long nr_segs;
                u8 folioq_slot;
                loff_t xarray_start;
        };
};

typedef __u16 uio_meta_flags_t;

/*
 * Bundles a flags word, an application tag, a seed and an iov_iter.
 * NOTE(review): the meaning of each field is defined by the users of this
 * struct and is not visible in this header - confirm before relying on it.
 */
struct uio_meta {
        uio_meta_flags_t        flags;
        u16                        app_tag;
        u64                        seed;
        struct iov_iter                iter;
};

/*
 * Return the iovec the iterator currently refers to: the embedded
 * __ubuf_iovec for ITER_UBUF, otherwise the __iov pointer.
 */
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
{
        return iter->iter_type == ITER_UBUF ?
                (const struct iovec *) &iter->__ubuf_iovec : iter->__iov;
}

#define iter_iov_addr(iter)        (iter_iov(iter)->iov_base + (iter)->iov_offset)

static inline size_t iter_iov_len(const struct iov_iter *i)
{
        if (i->iter_type == ITER_UBUF)
                return i->count;
        return iter_iov(i)->iov_len - i->iov_offset;
}

static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
        return i->iter_type;
}

static inline void iov_iter_save_state(struct iov_iter *iter,
                                       struct iov_iter_state *state)
{
        state->iov_offset = iter->iov_offset;
        state->count = iter->count;
        state->nr_segs = iter->nr_segs;
}

static inline bool iter_is_ubuf(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_UBUF;
}

static inline bool iter_is_iovec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_IOVEC;
}

static inline bool iov_iter_is_kvec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_KVEC;
}

static inline bool iov_iter_is_bvec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_BVEC;
}

static inline bool iov_iter_is_discard(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_DISCARD;
}

static inline bool iov_iter_is_folioq(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_FOLIOQ;
}

static inline bool iov_iter_is_xarray(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_XARRAY;
}

/* Return WRITE when the iterator is a data source, READ otherwise. */
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
        if (i->data_source)
                return WRITE;

        return READ;
}

/* True when the iterator is of type ITER_UBUF or ITER_IOVEC. */
static inline bool user_backed_iter(const struct iov_iter *i)
{
        if (iter_is_ubuf(i))
                return true;

        return iter_is_iovec(i);
}

/*
 * Total number of bytes covered by an iovec array of @nr_segs segments.
 *
 * NOTE that it is not safe to use this function until all the iovec's
 * segment lengths have been validated, because the individual lengths can
 * overflow a size_t when added together.
 */
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
        const struct iovec *seg = iov;
        unsigned long left;
        size_t total = 0;

        for (left = nr_segs; left != 0; left--, seg++)
                total += seg->iov_len;

        return total;
}

void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);
size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset,
                size_t bytes, struct iov_iter *i);

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);

/* Folio wrapper: forwards to copy_page_to_iter() with the folio's ->page. */
static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
                size_t bytes, struct iov_iter *i)
{
        struct page *first = &folio->page;

        return copy_page_to_iter(first, offset, bytes, i);
}

/* Folio wrapper: forwards to copy_page_from_iter() with the folio's ->page. */
static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset,
                                          size_t bytes, struct iov_iter *i)
{
        struct page *first = &folio->page;

        return copy_page_from_iter(first, offset, bytes, i);
}

size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
                                 size_t bytes, struct iov_iter *i);

/*
 * Checked front end for _copy_to_iter().  The copy is attempted only when
 * check_copy_size(addr, bytes, true) accepts the @addr buffer; otherwise
 * nothing is copied and 0 is returned.
 *
 * Returns whatever _copy_to_iter() returns.
 */
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, true))
                return 0;

        return _copy_to_iter(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter().  The copy is attempted only when
 * check_copy_size(addr, bytes, false) accepts the @addr buffer; otherwise
 * nothing is copied and 0 is returned.
 *
 * Returns whatever _copy_from_iter() returns.
 */
static __always_inline __must_check
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, false))
                return 0;

        return _copy_from_iter(addr, bytes, i);
}

/*
 * All-or-nothing variant of copy_to_iter().
 *
 * Returns true when exactly @bytes were copied.  On a short copy the
 * iterator is reverted by the amount that was copied and false is returned.
 */
static __always_inline __must_check
bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_to_iter(addr, bytes, i);

        if (unlikely(done != bytes)) {
                /* Short copy: hand the consumed part back to the iterator. */
                iov_iter_revert(i, done);
                return false;
        }
        return true;
}

/*
 * All-or-nothing variant of copy_from_iter().
 *
 * Returns true when exactly @bytes were copied.  On a short copy the
 * iterator is reverted by the amount that was copied and false is returned.
 */
static __always_inline __must_check
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_from_iter(addr, bytes, i);

        if (unlikely(done != bytes)) {
                /* Short copy: hand the consumed part back to the iterator. */
                iov_iter_revert(i, done);
                return false;
        }
        return true;
}

/*
 * Checked front end for _copy_from_iter_nocache().  The copy is attempted
 * only when check_copy_size(addr, bytes, false) accepts the @addr buffer;
 * otherwise nothing is copied and 0 is returned.
 */
static __always_inline __must_check
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, false))
                return 0;

        return _copy_from_iter_nocache(addr, bytes, i);
}

/*
 * All-or-nothing variant of copy_from_iter_nocache().
 *
 * Returns true when exactly @bytes were copied.  On a short copy the
 * iterator is reverted by the amount that was copied and false is returned.
 */
static __always_inline __must_check
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_from_iter_nocache(addr, bytes, i);

        if (unlikely(done != bytes)) {
                /* Short copy: hand the consumed part back to the iterator. */
                iov_iter_revert(i, done);
                return false;
        }
        return true;
}

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * Note, users like pmem that depend on the stricter semantics of
 * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for
 * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
 * destination is flushed from the cache on return.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#else
/*
 * Without CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE the name is just an alias for
 * _copy_from_iter_nocache().
 */
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif

/*
 * _copy_mc_to_iter() has its own out-of-line definition only when
 * CONFIG_ARCH_HAS_COPY_MC is set; otherwise the name is an alias for
 * _copy_to_iter().
 */
#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_mc_to_iter _copy_to_iter
#endif

/*
 * Out-of-line iterator API; only the prototypes are visible here.
 * NOTE(review): the grouping below is inferred from names and parameter
 * lists, not from the definitions -- confirm before relying on it.
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
/*
 * Initialisers, one per backing store (iovec, kvec, bio_vec, discard,
 * folio_queue, xarray).  Each takes the iterator, a direction and a byte
 * count.
 */
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
                        unsigned long nr_segs, size_t count);
void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
                          const struct folio_queue *folioq,
                          unsigned int first_slot, unsigned int offset, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
                     loff_t start, size_t count);
/* Page-array extraction; the return type is ssize_t. */
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
                        size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
                        size_t maxsize, size_t *start);
/* Takes a const iterator; used by iov_iter_npages_cap() in this header. */
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);

/* Accessor: returns the iterator's current ->count field. */
static inline size_t iov_iter_count(const struct iov_iter *i)
{
        const size_t remaining = i->count;

        return remaining;
}

/*
 * Clamp the iterator to at most @count bytes.  @count is an upper limit,
 * *not* the new size: passing a value greater than or equal to the amount
 * of data in the iterator is fine and leaves it untouched.
 */
static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
{
        /*
         * @count does not have to fit in size_t: the comparison extends
         * both operands to u64, and any value that would be truncated by
         * the conversion in the assignment below is by definition greater
         * than every size_t value, including the old i->count, so it never
         * reaches the assignment.
         */
        if (count >= i->count)
                return;

        i->count = count;
}

/*
 * Re-expand a previously truncated iterator by storing @count into
 * i->count.  No validation is done here: the caller must make sure @count
 * is no more than what the iterator held before it was shrunk.
 */
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
        i->count = count;
}

/*
 * iov_iter_npages() computed as if the iterator held at most @max_bytes.
 *
 * When the iterator holds more than @max_bytes it is truncated for the
 * duration of the iov_iter_npages() call and re-expanded afterwards by the
 * amount that was cut off.
 *
 * Returns the value of iov_iter_npages(i, maxpages) on the capped iterator.
 */
static inline int
iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
{
        const size_t avail = iov_iter_count(i);
        size_t excess = 0;
        int pages;

        if (avail > max_bytes) {
                excess = avail - max_bytes;
                iov_iter_truncate(i, max_bytes);
        }

        pages = iov_iter_npages(i, maxpages);

        /* Give back exactly what was trimmed above. */
        if (excess != 0)
                iov_iter_reexpand(i, iov_iter_count(i) + excess);

        return pages;
}

/*
 * Helpers taking a __user iovec array or a single __user buffer; defined
 * out of line.  __import_iovec() has the import_iovec() parameter list plus
 * a trailing @compat flag.
 * NOTE(review): ownership of *@iovp and the error conventions are not
 * visible from these prototypes -- check the definitions.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvector,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat);
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);

/*
 * Initialise @i as an ITER_UBUF iterator over the single user buffer @buf
 * of @count bytes.  Every field not listed in the initialiser is zeroed.
 *
 * Warns if @direction has bits set other than READ and WRITE; the iterator
 * is initialised regardless.
 */
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
                        void __user *buf, size_t count)
{
        struct iov_iter fresh = {
                .iter_type = ITER_UBUF,
                .data_source = direction,
                .ubuf = buf,
                .count = count,
                .nr_segs = 1,
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
/* Flags for iov_iter_get/extract_pages*() */
/* Allow P2PDMA on the extracted pages */
#define ITER_ALLOW_P2PDMA        ((__force iov_iter_extraction_t)0x01)

/*
 * Defined out of line; takes the iov_iter_extraction_t flags declared
 * above.  See iov_iter_extract_will_pin() for whether the extracted pages
 * end up pinned.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
                               size_t maxsize, unsigned int maxpages,
                               iov_iter_extraction_t extraction_flags,
                               size_t *offset0);

/**
 * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
 * @iter: The iterator
 *
 * Examine the iterator and indicate, by returning true or false, how (if at
 * all) pages extracted from the iterator will be retained by the extraction
 * function.
 *
 * %true indicates that the pages will have a pin placed in them that the
 * caller must unpin.  This must be done for DMA/async DIO to force fork()
 * to forcibly copy a page for the child (the parent must retain the original
 * page).
 *
 * %false indicates that no measures are taken and that it's up to the caller
 * to retain the pages.
 *
 * The answer is simply user_backed_iter(@iter).
 */
static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter)
{
        const bool user_backed = user_backed_iter(iter);

        return user_backed;
}

/* Forward declaration: only a pointer to struct sg_table is needed below. */
struct sg_table;
/*
 * Defined out of line.
 * NOTE(review): presumably extracts up to @len bytes of @iter into at most
 * @sg_max entries of @sgtable -- confirm against the definition.
 */
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len,
                           struct sg_table *sgtable, unsigned int sg_max,
                           iov_iter_extraction_t extraction_flags);

#endif









































































































































































































































































































































































































    1 

































    4 








































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

/*
 * Signal-handler state reached through task->sighand.
 *
 * @siglock:      spinlock the inline helpers in this header take (with
 *                spin_lock_irq()) around signal state changes.
 * @count:        reference count.
 * @signalfd_wqh: wait queue head (presumably for signalfd waiters, going by
 *                the name -- confirm).
 * @action:       one k_sigaction slot per signal, _NSIG entries in total.
 */
struct sighand_struct {
        spinlock_t                siglock;
        refcount_t                count;
        wait_queue_head_t        signalfd_wqh;
        struct k_sigaction        action[_NSIG];
};

/*
 * Per-process accounting stats.  Embedded in struct signal_struct as
 * ->pacct when CONFIG_BSD_PROCESS_ACCT is set.
 * NOTE(review): field meanings below are read off the names -- confirm.
 */
struct pacct_struct {
        int                        ac_flag;        /* accounting flags */
        long                        ac_exitcode;        /* exit code */
        unsigned long                ac_mem;                /* memory usage figure */
        u64                        ac_utime, ac_stime;        /* user / system time */
        unsigned long                ac_minflt, ac_majflt;        /* minor / major faults */
};

/*
 * One CPU-time interval timer.  struct signal_struct holds two of these in
 * ->it[], used for ITIMER_PROF and ITIMER_VIRTUAL.
 * NOTE(review): units of the two fields are not visible here.
 */
struct cpu_itimer {
        u64 expires;        /* expiry value */
        u64 incr;        /* reload increment */
};

/*
 * This is the atomic variant of task_cputime, which can be used for
 * storing and updating task_cputime statistics without locking.
 * All three counters are atomic64_t; INIT_CPUTIME_ATOMIC zeroes them.
 */
struct task_cputime_atomic {
        atomic64_t utime;
        atomic64_t stime;
        atomic64_t sum_exec_runtime;
};

/*
 * Compound literal of type struct task_cputime_atomic with all three
 * counters initialised to 0.
 */
#define INIT_CPUTIME_ATOMIC \
        (struct task_cputime_atomic) {                                \
                .utime = ATOMIC64_INIT(0),                        \
                .stime = ATOMIC64_INIT(0),                        \
                .sum_exec_runtime = ATOMIC64_INIT(0),                \
        }
/**
 * struct thread_group_cputimer - thread group interval timer counts
 * @cputime_atomic:        atomic thread group interval timers.
 *
 * This structure contains the version of task_cputime, above, that is
 * used for thread group CPU timer calculations.  It is a thin wrapper
 * around a single struct task_cputime_atomic and is embedded in
 * struct signal_struct as ->cputimer when CONFIG_POSIX_TIMERS is set.
 */
struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
};

/*
 * A signal set plus an hlist linkage.  struct signal_struct keeps an
 * hlist_head named ->multiprocess "for collecting multiprocess signals
 * during fork"; presumably these nodes hang off that list (confirm).
 */
struct multiprocess_signals {
        sigset_t signal;
        struct hlist_node node;
};

/*
 * Singly linked list node naming one task; struct core_state embeds one as
 * ->dumper.
 */
struct core_thread {
        struct task_struct *task;
        struct core_thread *next;
};

/*
 * Coredump bookkeeping, pointed to by signal_struct->core_state.
 * NOTE(review): field roles below are read off the names -- confirm.
 */
struct core_state {
        atomic_t nr_threads;                /* thread counter */
        struct core_thread dumper;        /* head entry of the core_thread list */
        struct completion startup;        /* completion for the startup phase */
};

/*
 * State reached through task->signal; the for_each_thread() helpers in this
 * header walk the threads linked on ->thread_head.
 *
 * NOTE! "signal_struct" does not have its own
 * locking, because a shared signal_struct always
 * implies a shared sighand_struct, so locking
 * sighand_struct is always a proper superset of
 * the locking of signal_struct.
 *
 * The layout is randomized (__randomize_layout), so nothing may depend on
 * the field order below.
 */
struct signal_struct {
        refcount_t                sigcnt;
        atomic_t                live;
        int                        nr_threads;
        int                        quick_threads;
        /* list head iterated by __for_each_thread() via thread_node */
        struct list_head        thread_head;

        wait_queue_head_t        wait_chldexit;        /* for wait4() */

        /* current thread group signal load-balancing target: */
        struct task_struct        *curr_target;

        /* shared signal handling: */
        struct sigpending        shared_pending;

        /* For collecting multiprocess signals during fork */
        struct hlist_head        multiprocess;

        /* thread group exit support */
        int                        group_exit_code;
        /* notify group_exec_task when notify_count is less or equal to 0 */
        int                        notify_count;
        struct task_struct        *group_exec_task;

        /* thread group stop support, overloads group_exit_code too */
        int                        group_stop_count;
        unsigned int                flags; /* see SIGNAL_* flags below */

        struct core_state *core_state; /* coredumping support */

        /*
         * PR_SET_CHILD_SUBREAPER marks a process, like a service
         * manager, to re-parent orphan (double-forking) child processes
         * to this process instead of 'init'. The service manager is
         * able to receive SIGCHLD signals and is able to investigate
         * the process until it calls wait(). All children of this
         * process will inherit a flag if they should look for a
         * child_subreaper process at exit.
         */
        unsigned int                is_child_subreaper:1;
        unsigned int                has_child_subreaper:1;

#ifdef CONFIG_POSIX_TIMERS

        /* POSIX.1b Interval Timers */
        unsigned int                timer_create_restore_ids:1;
        atomic_t                next_posix_timer_id;
        struct hlist_head        posix_timers;
        struct hlist_head        ignored_posix_timers;

        /* ITIMER_REAL timer for the process */
        struct hrtimer real_timer;
        ktime_t it_real_incr;

        /*
         * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
         * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
         * values are defined to 0 and 1 respectively
         */
        struct cpu_itimer it[2];

        /*
         * Thread group totals for process CPU timers.
         * See thread_group_cputimer(), et al, for details.
         */
        struct thread_group_cputimer cputimer;

#endif
        /* Empty if CONFIG_POSIX_TIMERS=n */
        struct posix_cputimers posix_cputimers;

        /* PID/PID hash table linkage. */
        struct pid *pids[PIDTYPE_MAX];

#ifdef CONFIG_NO_HZ_FULL
        atomic_t tick_dep_mask;
#endif

        struct pid *tty_old_pgrp;

        /* boolean value for session group leader */
        int leader;

        struct tty_struct *tty; /* NULL if no tty */

#ifdef CONFIG_SCHED_AUTOGROUP
        struct autogroup *autogroup;
#endif
        /*
         * Cumulative resource counters for dead threads in the group,
         * and for reaped dead child processes forked by this group.
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
        seqlock_t stats_lock;
        u64 utime, stime, cutime, cstime;
        u64 gtime;
        u64 cgtime;
        struct prev_cputime prev_cputime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
        unsigned long maxrss, cmaxrss;
        struct task_io_accounting ioac;

        /*
         * Cumulative ns of scheduled CPU time for dead threads in the
         * group, not including a zombie group leader.  (This only differs
         * from jiffies_to_ns(utime + stime) if sched_clock uses something
         * other than jiffies.)
         */
        unsigned long long sum_sched_runtime;

        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
         * to get both rlim_cur and rlim_max atomically, and either one
         * alone is a single word that can safely be read normally.
         * getrlimit/setrlimit use task_lock(current->group_leader) to
         * protect this instead of the siglock, because they really
         * have no need to disable irqs.
         */
        struct rlimit rlim[RLIM_NLIMITS];

#ifdef CONFIG_BSD_PROCESS_ACCT
        struct pacct_struct pacct;        /* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
        struct taskstats *stats;
#endif
#ifdef CONFIG_AUDIT
        unsigned audit_tty;
        struct tty_audit_buf *tty_audit_buf;
#endif

#ifdef CONFIG_CGROUPS
        struct rw_semaphore cgroup_threadgroup_rwsem;
#endif

        /*
         * Thread is the potential origin of an oom condition; kill first on
         * oom
         */
        bool oom_flag_origin;
        short oom_score_adj;                /* OOM kill score adjustment */
        short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                         * Only settable by CAP_SYS_RESOURCE. */
        struct mm_struct *oom_mm;        /* recorded mm when the thread group got
                                         * killed by the oom killer */

        struct mutex cred_guard_mutex;        /* guard against foreign influences on
                                         * credential calculations
                                         * (notably. ptrace)
                                         * Deprecated do not use in new code.
                                         * Use exec_update_lock instead.
                                         */
        struct rw_semaphore exec_update_lock;        /* Held while task_struct is
                                                 * being updated during exec,
                                                 * and may have inconsistent
                                                 * permissions.
                                                 */
} __randomize_layout;

/*
 * Bits in flags field of signal_struct.
 */
#define SIGNAL_STOP_STOPPED        0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED        0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT        0x00000004 /* group exit in progress */
/*
 * Pending notifications to parent.
 */
#define SIGNAL_CLD_STOPPED        0x00000010
#define SIGNAL_CLD_CONTINUED        0x00000020
#define SIGNAL_CLD_MASK                (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)

#define SIGNAL_UNKILLABLE        0x00000040 /* for init: ignore fatal signals */

/*
 * Every stop-related bit: both SIGNAL_CLD_* bits plus the two SIGNAL_STOP_*
 * bits.  signal_set_stop_flags() clears this whole mask before OR-ing in
 * the new flags.
 */
#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
                          SIGNAL_STOP_CONTINUED)

/*
 * Replace the stop-related bits of sig->flags: every bit in
 * SIGNAL_STOP_MASK is cleared and @flags is OR-ed in; all other bits are
 * kept.  Warns (but still proceeds) if SIGNAL_GROUP_EXIT is already set.
 */
static inline void signal_set_stop_flags(struct signal_struct *sig,
                                         unsigned int flags)
{
        unsigned int kept;

        WARN_ON(sig->flags & SIGNAL_GROUP_EXIT);

        kept = sig->flags & ~SIGNAL_STOP_MASK;
        sig->flags = kept | flags;
}

/*
 * Defined out of line.  dequeue_signal() is called by
 * kernel_dequeue_signal() in this header with sighand->siglock held.
 * NOTE(review): the behaviour of the flush/ignore helpers is not visible
 * here; check the definitions.
 */
extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type);

static inline int kernel_dequeue_signal(void)
{
        struct task_struct *task = current;
        kernel_siginfo_t __info;
        enum pid_type __type;
        int ret;

        spin_lock_irq(&task->sighand->siglock);
        ret = dequeue_signal(&task->blocked, &__info, &__type);
        spin_unlock_irq(&task->sighand->siglock);

        return ret;
}

static inline void kernel_signal_stop(void)
{
        spin_lock_irq(&current->sighand->siglock);
        if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
                current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
        }
        spin_unlock_irq(&current->sighand->siglock);

        schedule();
}

/*
 * Signal generation API; everything here is defined out of line.
 * NOTE(review): the exact delivery semantics of the force_*(), send_*()
 * and kill_*() families are not visible from these prototypes -- consult
 * the definitions before relying on the difference between them.
 */

/* Fault-style signals carrying a __user address. */
int force_sig_fault_to_task(int sig, int code, void __user *addr,
                            struct task_struct *t);
int force_sig_fault(int sig, int code, void __user *addr);
int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);

int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);

int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);
int send_sig_perf(void __user *addr, u32 type, u64 sig_data);

int force_sig_ptrace_errno_trap(int errno, void __user *addr);
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno);
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
                        struct task_struct *t);
int force_sig_seccomp(int syscall, int reason, bool force_coredump);

/* Generic senders taking a kernel_siginfo, a pid or a task. */
extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
extern int force_sig_info(struct kernel_siginfo *);
extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
                                const struct cred *);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
/* The bool result is marked __must_check. */
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
extern void force_fatal_sig(int);
extern void force_exit_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

/*
 * Clear TIF_NOTIFY_SIGNAL with clear_thread_flag() and then issue
 * smp_mb__after_atomic().  The barrier has to stay after the flag clear.
 * NOTE(review): presumably it orders the clear against the caller's
 * subsequent checks for pending work -- confirm with the callers.
 */
static inline void clear_notify_signal(void)
{
        clear_thread_flag(TIF_NOTIFY_SIGNAL);
        smp_mb__after_atomic();
}

/*
 * Set TIF_NOTIFY_SIGNAL on @task and, only if the flag was not already
 * set, try to wake the task from TASK_INTERRUPTIBLE sleep.
 *
 * Returns 'true' if kick_process() is needed to force a transition from
 * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work,
 * i.e. when the flag was newly set and wake_up_state() returned 0.
 */
static inline bool __set_notify_signal(struct task_struct *task)
{
        /* Flag already set: no wakeup is attempted and no kick is needed. */
        if (test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL))
                return false;

        return !wake_up_state(task, TASK_INTERRUPTIBLE);
}

/*
 * Called to break out of interruptible wait loops, and enter the
 * exit_to_user_mode_loop().  Calls kick_process() on @task only when
 * __set_notify_signal() reports that it is needed.
 */
static inline void set_notify_signal(struct task_struct *task)
{
        if (!__set_notify_signal(task))
                return;

        kick_process(task);
}

/*
 * Set TIF_SIGPENDING on the current task and return -ERESTARTNOINTR for
 * the caller to propagate.
 */
static inline int restart_syscall(void)
{
        struct task_struct *self = current;

        set_tsk_thread_flag(self, TIF_SIGPENDING);
        return -ERESTARTNOINTR;
}

/*
 * Nonzero when TIF_SIGPENDING is set on @p.  Unlike signal_pending(), this
 * does not look at TIF_NOTIFY_SIGNAL.
 */
static inline int task_sigpending(struct task_struct *p)
{
        const int pending = test_tsk_thread_flag(p, TIF_SIGPENDING);

        return unlikely(pending);
}

/*
 * Returns 1 when TIF_NOTIFY_SIGNAL is set on @p, otherwise the result of
 * task_sigpending(@p).
 */
static inline int signal_pending(struct task_struct *p)
{
        /*
         * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
         * behavior in terms of ensuring that we break out of wait loops
         * so that notify signal callbacks can be processed.
         */
        const int notify = test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL);

        return unlikely(notify) ? 1 : task_sigpending(p);
}

static inline int __fatal_signal_pending(struct task_struct *p)
{
        return unlikely(sigismember(&p->pending.signal, SIGKILL));
}

/*
 * 1 when @p has TIF_SIGPENDING set and SIGKILL is in its private pending
 * set, 0 otherwise.  The pending set is only examined when the flag is set.
 */
static inline int fatal_signal_pending(struct task_struct *p)
{
        if (!task_sigpending(p))
                return 0;

        return __fatal_signal_pending(p) ? 1 : 0;
}

/*
 * Decide whether a sleep in task state @state should be broken for @p.
 *
 * Returns 0 when @state has neither TASK_INTERRUPTIBLE nor TASK_WAKEKILL,
 * or when signal_pending(@p) is false.  Otherwise returns 1 if @state is
 * interruptible or SIGKILL is pending for @p, and 0 if neither holds.
 */
static inline int signal_pending_state(unsigned int state, struct task_struct *p)
{
        const bool interruptible = state & TASK_INTERRUPTIBLE;

        if (!interruptible && !(state & TASK_WAKEKILL))
                return 0;
        if (!signal_pending(p))
                return 0;
        if (interruptible)
                return 1;

        /* TASK_WAKEKILL only: just a fatal signal counts. */
        return __fatal_signal_pending(p) ? 1 : 0;
}

/*
 * This should only be used in fault handlers to decide whether we
 * should stop the current fault routine to handle the signals
 * instead, especially with the case where we've got interrupted with
 * a VM_FAULT_RETRY.
 *
 * True only when @fault_flags has VM_FAULT_RETRY set and either a fatal
 * signal is pending for current, or @regs is a user-mode context and any
 * signal is pending for current.
 */
static inline bool fault_signal_pending(vm_fault_t fault_flags,
                                        struct pt_regs *regs)
{
        bool interrupted;

        if (!(fault_flags & VM_FAULT_RETRY))
                return false;

        interrupted = fatal_signal_pending(current) ||
                      (user_mode(regs) && signal_pending(current));

        return unlikely(interrupted);
}

/*
 * Reevaluate whether the task has signals pending delivery.
 * Wake the task if so.
 * This is required every time the blocked sigset_t changes.
 * callers must hold sighand->siglock.
 */
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);

/*
 * Out-of-line worker behind signal_wake_up() and ptrace_signal_wake_up()
 * in this header, which pass it a mask of task-state bits (or 0).
 */
extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

/*
 * Wake @t through signal_wake_up_state().
 *
 * When @fatal is set and @t does not have JOBCTL_PTRACE_FROZEN, the
 * JOBCTL_STOPPED and JOBCTL_TRACED bits are cleared from t->jobctl and the
 * wakeup state is TASK_WAKEKILL | __TASK_TRACED.  In every other case the
 * state passed is 0 and jobctl is left alone.
 */
static inline void signal_wake_up(struct task_struct *t, bool fatal)
{
        if (!fatal || (t->jobctl & JOBCTL_PTRACE_FROZEN)) {
                signal_wake_up_state(t, 0);
                return;
        }

        t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
        signal_wake_up_state(t, TASK_WAKEKILL | __TASK_TRACED);
}
/*
 * Wake @t through signal_wake_up_state().
 *
 * With @resume set, JOBCTL_TRACED is cleared from t->jobctl and the wakeup
 * state is __TASK_TRACED; otherwise the state passed is 0 and jobctl is
 * left alone.
 */
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
        if (!resume) {
                signal_wake_up_state(t, 0);
                return;
        }

        t->jobctl &= ~JOBCTL_TRACED;
        signal_wake_up_state(t, __TASK_TRACED);
}

/*
 * Defined out of line.
 * NOTE(review): presumably makes @task take part in a group stop that is
 * already in progress -- confirm against the definition.
 */
void task_join_group_stop(struct task_struct *task);

#ifdef TIF_RESTORE_SIGMASK
/*
 * Legacy restore_sigmask accessors.  These are inefficient on
 * SMP architectures because they require atomic operations.
 */

/**
 * set_restore_sigmask() - make sure saved_sigmask processing gets done
 *
 * This sets TIF_RESTORE_SIGMASK with set_thread_flag().  That is the only
 * thing this function does: TIF_SIGPENDING is not touched here.
 *
 * TIF_RESTORE_SIGMASK need not be in the set of bits that the arch code
 * will notice on return to user mode, in case those bits are scarce.
 * NOTE(review): callers are therefore presumably expected to have
 * TIF_SIGPENDING set already, so that the arch signal code runs and
 * processes the flag -- confirm with the callers.
 */
static inline void set_restore_sigmask(void)
{
        set_thread_flag(TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK on the given @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK via clear_thread_flag() (no task argument). */
static inline void clear_restore_sigmask(void)
{
        clear_thread_flag(TIF_RESTORE_SIGMASK);
}
/* True when TIF_RESTORE_SIGMASK is set on the given @task. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        const bool armed = test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);

        return armed;
}
/* True when test_thread_flag() reports TIF_RESTORE_SIGMASK as set. */
static inline bool test_restore_sigmask(void)
{
        const bool armed = test_thread_flag(TIF_RESTORE_SIGMASK);

        return armed;
}
/*
 * Clear TIF_RESTORE_SIGMASK with test_and_clear_thread_flag() and report
 * whether it was set beforehand.
 */
static inline bool test_and_clear_restore_sigmask(void)
{
        const bool was_armed = test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);

        return was_armed;
}

#else        /* TIF_RESTORE_SIGMASK */

/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
/* Set current->restore_sigmask; plain store, no thread-flag operation. */
static inline void set_restore_sigmask(void)
{
        current->restore_sigmask = true;
}
/* Clear the restore_sigmask field of the given @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        task->restore_sigmask = false;
}
/* Clear current->restore_sigmask. */
static inline void clear_restore_sigmask(void)
{
        current->restore_sigmask = false;
}
/* True when current->restore_sigmask is set. */
static inline bool test_restore_sigmask(void)
{
        const bool armed = current->restore_sigmask;

        return armed;
}
/* True when the restore_sigmask field of the given @task is set. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        const bool armed = task->restore_sigmask;

        return armed;
}
/*
 * Clear current->restore_sigmask and report whether it was set beforehand.
 * The field is only written when it was set.
 */
static inline bool test_and_clear_restore_sigmask(void)
{
        const bool was_armed = current->restore_sigmask;

        if (was_armed)
                current->restore_sigmask = false;
        return was_armed;
}
#endif

/*
 * If the restore-sigmask indication was set, clear it and make
 * current->saved_sigmask the blocked set through __set_current_blocked().
 * Does nothing when the indication was not set.
 */
static inline void restore_saved_sigmask(void)
{
        if (!test_and_clear_restore_sigmask())
                return;

        __set_current_blocked(&current->saved_sigmask);
}

/*
 * Defined out of line; takes a __user sigset pointer and its size.
 * NOTE(review): presumably the setter paired with restore_saved_sigmask()
 * -- confirm against the definition.
 */
extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);

/*
 * Restore the saved sigmask unless the caller was @interrupted.
 *
 * When @interrupted is true the saved mask is left in place and a warning
 * is raised if no signal is pending for current.
 */
static inline void restore_saved_sigmask_unless(bool interrupted)
{
        if (!interrupted) {
                restore_saved_sigmask();
                return;
        }

        WARN_ON(!signal_pending(current));
}

/*
 * Pick the sigset to save: &current->saved_sigmask while the
 * restore-sigmask indication is set, &current->blocked otherwise.
 */
static inline sigset_t *sigmask_to_save(void)
{
        if (unlikely(test_restore_sigmask()))
                return &current->saved_sigmask;

        return &current->blocked;
}

/* kill_pid() aimed at cad_pid; returns kill_pid()'s result. */
static inline int kill_cad_pid(int sig, int priv)
{
        const int rc = kill_pid(cad_pid, sig, priv);

        return rc;
}

/*
 * These can be the second arg to send_sig_info/send_group_sig_info.
 * They are the integers 0 and 1 cast to a kernel_siginfo pointer, i.e.
 * marker values rather than pointers to real siginfo objects.
 */
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV        ((struct kernel_siginfo *) 1)

/*
 * Range check of @sp against current's alternate signal stack
 * [sas_ss_sp, sas_ss_sp + sas_ss_size].
 *
 * With CONFIG_STACK_GROWSUP the base address is included and the end is
 * excluded; otherwise the base is excluded and the end is included.
 * Returns 1 when @sp is inside the range, 0 otherwise.
 */
static inline int __on_sig_stack(unsigned long sp)
{
#ifdef CONFIG_STACK_GROWSUP
        if (sp < current->sas_ss_sp)
                return 0;
        return sp - current->sas_ss_sp < current->sas_ss_size;
#else
        if (sp <= current->sas_ss_sp)
                return 0;
        return sp - current->sas_ss_sp <= current->sas_ss_size;
#endif
}

/*
 * True if we are on the alternate signal stack.  Always 0 when
 * SS_AUTODISARM is set in current->sas_ss_flags; otherwise the result of
 * __on_sig_stack(@sp).
 */
static inline int on_sig_stack(unsigned long sp)
{
        /*
         * If the signal stack is SS_AUTODISARM then, by construction, we
         * can't be on the signal stack unless user code deliberately set
         * SS_AUTODISARM when we were already on it.
         *
         * This improves reliability: if user state gets corrupted such that
         * the stack pointer points very close to the end of the signal stack,
         * then this check will enable the signal to be handled anyway.
         */
        return (current->sas_ss_flags & SS_AUTODISARM) ? 0 : __on_sig_stack(sp);
}

/*
 * Report the alternate-stack state for stack pointer @sp:
 * SS_DISABLE when current->sas_ss_size is zero, SS_ONSTACK when
 * on_sig_stack(@sp) is true, and 0 otherwise.
 */
static inline int sas_ss_flags(unsigned long sp)
{
        if (current->sas_ss_size == 0)
                return SS_DISABLE;

        if (on_sig_stack(sp))
                return SS_ONSTACK;

        return 0;
}

/*
 * Put @p's alternate signal stack settings back to the disabled state:
 * flags become SS_DISABLE, base and size become zero.
 */
static inline void sas_ss_reset(struct task_struct *p)
{
        p->sas_ss_flags = SS_DISABLE;
        p->sas_ss_size = 0;
        p->sas_ss_sp = 0;
}

/*
 * Choose the stack pointer to use for @ksig.
 *
 * @sp is returned unchanged unless the handler has SA_ONSTACK set and
 * sas_ss_flags(@sp) is zero; in that case the starting address on the
 * alternate signal stack is returned instead (its base with
 * CONFIG_STACK_GROWSUP, base + size otherwise).
 */
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
        if (!unlikely(ksig->ka.sa.sa_flags & SA_ONSTACK) || sas_ss_flags(sp))
                return sp;

#ifdef CONFIG_STACK_GROWSUP
        return current->sas_ss_sp;
#else
        return current->sas_ss_sp + current->sas_ss_size;
#endif
}

extern void __cleanup_sighand(struct sighand_struct *);
extern void flush_itimer_signals(void);

/* True when list_empty() says init_task.tasks has no other entries. */
#define tasklist_empty() \
        list_empty(&init_task.tasks)

/* The task linked after @p on the tasks list, fetched via list_entry_rcu(). */
#define next_task(p) \
        list_entry_rcu((p)->tasks.next, struct task_struct, tasks)

/*
 * Iterate @p over the tasks list, starting with the entry after init_task
 * and stopping once the walk wraps back to init_task.  The loop body never
 * runs with p == &init_task because @p is advanced before every test.
 */
#define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )

extern bool current_is_single_threaded(void);

/*
 * Without tasklist/siglock it is only rcu-safe if g can't exit/exec,
 * otherwise next_thread(t) will never reach g after list_del_rcu(g).
 *
 * Loop tail that keeps advancing @t with next_thread() until it is @g again.
 */
#define while_each_thread(g, t) \
        while ((t = next_thread(t)) != g)

/*
 * Iterate @t over the threads reached by next_thread() starting from @p,
 * stopping when the walk returns to @p; @p itself is not visited.
 */
#define for_other_threads(p, t)        \
        for (t = p; (t = next_thread(t)) != p; )

/*
 * Iterate @t over @signal's thread_head list (linked through thread_node)
 * using list_for_each_entry_rcu(), passing "tasklist_lock is held" as the
 * lockdep condition.
 */
#define __for_each_thread(signal, t)        \
        list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
                lockdep_is_held(&tasklist_lock))

/* Same as __for_each_thread(), taking the signal struct from task @p. */
#define for_each_thread(p, t)                \
        __for_each_thread((p)->signal, t)

/* Careful: this is a double loop, 'break' won't work as expected. */
#define for_each_process_thread(p, t)        \
        for_each_process(p) for_each_thread(p, t)

typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);

/*
 * Look up the struct pid of kind @type for @task.
 *
 * PIDTYPE_PID is answered by task_pid(); every other type is read from the
 * pids[] array in @task's signal struct.
 */
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
        return (type == PIDTYPE_PID) ? task_pid(task)
                                     : task->signal->pids[type];
}

/* Return the PIDTYPE_TGID entry of @task's signal->pids[] array. */
static inline struct pid *task_tgid(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_TGID];
}

/*
 * Without tasklist or RCU lock it is not safe to dereference
 * the result of task_pgrp/task_session even if task == current,
 * we can race with another thread doing sys_setsid/sys_setpgid.
 *
 * task_pgrp() returns the PIDTYPE_PGID entry of @task's signal->pids[].
 */
static inline struct pid *task_pgrp(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_PGID];
}

/*
 * Return the PIDTYPE_SID entry of @task's signal->pids[] array.  No locking
 * is done here; the caller is responsible for making the result safe to use.
 */
static inline struct pid *task_session(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_SID];
}

/*
 * Return @task's signal->nr_threads.  This is a plain read; no lock is
 * taken in this helper.
 */
static inline int get_nr_threads(struct task_struct *task)
{
        return task->signal->nr_threads;
}

/* @p is treated as a thread-group leader when its exit_signal is >= 0. */
static inline bool thread_group_leader(struct task_struct *p)
{
        return p->exit_signal >= 0;
}

/* Two tasks are in the same thread group when they share one signal struct. */
static inline
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
        return p1->signal == p2->signal;
}

/*
 * Return the task that follows @p on its signal struct's thread_head list
 * (linked through thread_node), looked up with list_next_or_null_rcu().
 *
 * returns NULL if p is the last thread in the thread group
 */
static inline struct task_struct *__next_thread(struct task_struct *p)
{
        return list_next_or_null_rcu(&p->signal->thread_head,
                                        &p->thread_node,
                                        struct task_struct,
                                        thread_node);
}

/*
 * Circular variant of __next_thread(): when @p has no successor on the
 * thread list, continue with p->group_leader instead of returning NULL.
 */
static inline struct task_struct *next_thread(struct task_struct *p)
{
        struct task_struct *next = __next_thread(p);

        if (next)
                return next;

        return p->group_leader;
}

/*
 * Returns 1 when @p is a thread-group leader and its thread_node is the last
 * entry on signal->thread_head, 0 otherwise.
 */
static inline int thread_group_empty(struct task_struct *p)
{
        if (!thread_group_leader(p))
                return 0;

        return list_is_last(&p->thread_node, &p->signal->thread_head) ? 1 : 0;
}

/*
 * True when @p is a thread-group leader and thread_group_empty(p) is false.
 * Note that @p is evaluated twice.
 */
#define delay_group_leader(p) \
                (thread_group_leader(p) && !thread_group_empty(p))

extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
                                                        unsigned long *flags);

/*
 * Thin wrapper around __lock_task_sighand().
 *
 * @task:  task whose sighand lock is wanted
 * @flags: storage passed through to __lock_task_sighand(); the same pointer
 *         is later handed to unlock_task_sighand()
 *
 * Returns whatever __lock_task_sighand() returned.
 */
static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
                                                       unsigned long *flags)
{
        struct sighand_struct *ret;

        ret = __lock_task_sighand(task, flags);
        /*
         * The value of __cond_lock() is discarded; presumably this line only
         * tells static checkers that siglock is held when ret is non-NULL
         * (TODO confirm against the __cond_lock() definition).
         */
        (void)__cond_lock(&task->sighand->siglock, ret);
        return ret;
}

/*
 * Release @task's sighand->siglock with spin_unlock_irqrestore(), using the
 * interrupt state stored in *@flags.
 */
static inline void unlock_task_sighand(struct task_struct *task,
                                                unsigned long *flags)
{
        spin_unlock_irqrestore(&task->sighand->siglock, *flags);
}

/*
 * With CONFIG_LOCKDEP the assertion is implemented out of line; without it
 * the call is an empty inline function.
 */
#ifdef CONFIG_LOCKDEP
extern void lockdep_assert_task_sighand_held(struct task_struct *task);
#else
static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { }
#endif

/*
 * Return rlim_cur of resource @limit for @task, read once via READ_ONCE()
 * from task->signal->rlim[].  @limit is used as an array index without any
 * range check here.
 */
static inline unsigned long task_rlimit(const struct task_struct *task,
                unsigned int limit)
{
        return READ_ONCE(task->signal->rlim[limit].rlim_cur);
}

/*
 * Return rlim_max of resource @limit for @task, read once via READ_ONCE()
 * from task->signal->rlim[].  @limit is used as an array index without any
 * range check here.
 */
static inline unsigned long task_rlimit_max(const struct task_struct *task,
                unsigned int limit)
{
        return READ_ONCE(task->signal->rlim[limit].rlim_max);
}

/* task_rlimit() applied to current. */
static inline unsigned long rlimit(unsigned int limit)
{
        return task_rlimit(current, limit);
}

/* task_rlimit_max() applied to current. */
static inline unsigned long rlimit_max(unsigned int limit)
{
        return task_rlimit_max(current, limit);
}

#endif /* _LINUX_SCHED_SIGNAL_H */





















   39 

   39 
   39 




























































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KERNEL_PRINTK__
#define __KERNEL_PRINTK__

#include <linux/stdarg.h>
#include <linux/init.h>
#include <linux/kern_levels.h>
#include <linux/linkage.h>
#include <linux/ratelimit_types.h>
#include <linux/once_lite.h>

struct console;

extern const char linux_banner[];
extern const char linux_proc_banner[];

extern int oops_in_progress;        /* If set, an oops, panic(), BUG() or die() is in progress */

#define PRINTK_MAX_SINGLE_HEADER_LEN 2

/*
 * Decode the log-level header at the start of @buffer.
 *
 * A header is KERN_SOH_ASCII followed by one of '0'..'7' or 'c' (KERN_CONT).
 * Returns that second character when a header is present, 0 otherwise.
 * buffer[1] is only read when buffer[0] is KERN_SOH_ASCII.
 */
static inline int printk_get_level(const char *buffer)
{
        char level;

        if (buffer[0] != KERN_SOH_ASCII)
                return 0;

        level = buffer[1];
        if ((level >= '0' && level <= '7') || level == 'c')
                return level;

        return 0;
}

/*
 * Step over one log-level header: returns @buffer + 2 when
 * printk_get_level() finds a header at @buffer, @buffer itself otherwise.
 */
static inline const char *printk_skip_level(const char *buffer)
{
        return printk_get_level(buffer) ? buffer + 2 : buffer;
}

/*
 * Step over every consecutive log-level header at the start of @buffer and
 * return a pointer to the first character after them.
 */
static inline const char *printk_skip_headers(const char *buffer)
{
        for (; printk_get_level(buffer); buffer = printk_skip_level(buffer))
                ;

        return buffer;
}

/* printk's without a loglevel use this.. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT

/* We show everything that is MORE important than this.. */
#define CONSOLE_LOGLEVEL_SILENT  0 /* Mum's the word */
#define CONSOLE_LOGLEVEL_MIN         1 /* Minimum loglevel we let people use */
#define CONSOLE_LOGLEVEL_DEBUG        10 /* issue debug messages */
#define CONSOLE_LOGLEVEL_MOTORMOUTH 15        /* You can't shut this one up */

/*
 * Default used to be hard-coded at 7, quiet used to be hardcoded at 4,
 * we're now allowing both to be set from kernel config.
 */
#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
#define CONSOLE_LOGLEVEL_QUIET         CONFIG_CONSOLE_LOGLEVEL_QUIET

int match_devname_and_update_preferred_console(const char *match,
                                               const char *name,
                                               const short idx);

extern int console_printk[];

#define console_loglevel (console_printk[0])
#define default_message_loglevel (console_printk[1])
#define minimum_console_loglevel (console_printk[2])
#define default_console_loglevel (console_printk[3])

extern void console_verbose(void);

/* strlen("ratelimit") + 1 */
#define DEVKMSG_STR_MAX_SIZE 10
extern char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE];
struct ctl_table;

extern int suppress_printk;

/*
 * Bundles a format string with a pointer to the va_list holding its
 * arguments so the pair can be passed around as a single object.
 * NOTE(review): presumably consumed by the %pV vsnprintf() extension; confirm.
 */
struct va_format {
        const char *fmt;        /* format string */
        va_list *va;                /* arguments for @fmt */
};

/*
 * FW_BUG
 * Add this to a message where you are sure the firmware is buggy or behaves
 * really stupidly or out of spec. Be aware that the responsible BIOS developer
 * should be able to fix this issue or at least get a concrete idea of the
 * problem by reading your message without the need of looking at the kernel
 * code.
 *
 * Use it for definite and high priority BIOS bugs.
 *
 * FW_WARN
 * Use it for less clear-cut cases (e.g. could the kernel have messed things
 * up already?) and medium priority BIOS bugs.
 *
 * FW_INFO
 * Use this one if you want to tell the user or vendor about something
 * suspicious, but generally harmless related to the firmware.
 *
 * Use it for information or very low priority BIOS bugs.
 */
#define FW_BUG                "[Firmware Bug]: "
#define FW_WARN                "[Firmware Warn]: "
#define FW_INFO                "[Firmware Info]: "

/*
 * HW_ERR
 * Add this to a message for hardware errors, so that user can report
 * it to hardware vendor instead of LKML or software vendor.
 */
#define HW_ERR                "[Hardware Error]: "

/*
 * DEPRECATED
 * Add this to a message whenever you want to warn user space about the use
 * of a deprecated aspect of an API so they can stop using it
 */
#define DEPRECATED        "[Deprecated]: "

/*
 * Dummy printk for disabled debugging statements to use whilst maintaining
 * gcc's format checking.
 *
 * The _printk() call sits under "if (0)", so it is never executed, yet the
 * compiler still sees the format string and its arguments.  The statement
 * expression as a whole evaluates to 0.
 */
#define no_printk(fmt, ...)                                \
({                                                        \
        if (0)                                                \
                _printk(fmt, ##__VA_ARGS__);                \
        0;                                                \
})

/*
 * early_printk() is a real printf-style function only with
 * CONFIG_EARLY_PRINTK; otherwise it is an empty inline that keeps the
 * printf format attribute so call sites are still checked.
 */
#ifdef CONFIG_EARLY_PRINTK
extern asmlinkage __printf(1, 2)
void early_printk(const char *fmt, ...);
#else
static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif

struct dev_printk_info;

#ifdef CONFIG_PRINTK
asmlinkage __printf(4, 0)
int vprintk_emit(int facility, int level,
                 const struct dev_printk_info *dev_info,
                 const char *fmt, va_list args);

asmlinkage __printf(1, 0)
int vprintk(const char *fmt, va_list args);
__printf(1, 0)
int vprintk_deferred(const char *fmt, va_list args);

asmlinkage __printf(1, 2) __cold
int _printk(const char *fmt, ...);

/*
 * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
 */
__printf(1, 2) __cold int _printk_deferred(const char *fmt, ...);

extern void __printk_deferred_enter(void);
extern void __printk_deferred_exit(void);

extern void printk_force_console_enter(void);
extern void printk_force_console_exit(void);

/*
 * The printk_deferred_enter/exit macros are available only as a hack for
 * some code paths that need to defer all printk console printing. Interrupts
 * must be disabled for the deferred duration.
 */
#define printk_deferred_enter() __printk_deferred_enter()
#define printk_deferred_exit() __printk_deferred_exit()

/*
 * Please don't use printk_ratelimit(), because it shares ratelimiting state
 * with all other unrelated printk_ratelimit() callsites.  Instead use
 * printk_ratelimited() or plain old __ratelimit().
 */
extern int __printk_ratelimit(const char *func);
#define printk_ratelimit() __printk_ratelimit(__func__)
extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                   unsigned int interval_msec);

extern int printk_delay_msec;
extern int dmesg_restrict;

extern void wake_up_klogd(void);

char *log_buf_addr_get(void);
u32 log_buf_len_get(void);
void log_buf_vmcoreinfo_setup(void);
void __init setup_log_buf(int early);
__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
extern asmlinkage void dump_stack(void) __cold;
void printk_trigger_flush(void);
void console_try_replay_all(void);
void printk_legacy_allow_panic_sync(void);
extern bool nbcon_device_try_acquire(struct console *con);
extern void nbcon_device_release(struct console *con);
void nbcon_atomic_flush_unsafe(void);
bool pr_flush(int timeout_ms, bool reset_on_progress);
#else
/*
 * !CONFIG_PRINTK: the print entry points below ignore their arguments and
 * return 0.  They keep their printf format attributes, so format strings
 * at call sites are still checked by the compiler.
 */
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
{
        return 0;
}
static inline __printf(1, 0)
int vprintk_deferred(const char *fmt, va_list args)
{
        return 0;
}
static inline __printf(1, 2) __cold
int _printk(const char *s, ...)
{
        return 0;
}
static inline __printf(1, 2) __cold
int _printk_deferred(const char *s, ...)
{
        return 0;
}

/*
 * !CONFIG_PRINTK: entering/leaving deferred or forced-console mode does
 * nothing.
 */
static inline void printk_deferred_enter(void)
{
}

static inline void printk_deferred_exit(void)
{
}

static inline void printk_force_console_enter(void)
{
}

static inline void printk_force_console_exit(void)
{
}

/* !CONFIG_PRINTK: the rate-limit queries always answer 0 / false. */
static inline int printk_ratelimit(void)
{
        return 0;
}
static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                          unsigned int interval_msec)
{
        return false;
}

/* !CONFIG_PRINTK: empty stub. */
static inline void wake_up_klogd(void)
{
}

/*
 * !CONFIG_PRINTK: the log buffer accessors report a NULL address and a
 * length of 0; the setup hooks do nothing.
 */
static inline char *log_buf_addr_get(void)
{
        return NULL;
}

static inline u32 log_buf_len_get(void)
{
        return 0;
}

static inline void log_buf_vmcoreinfo_setup(void)
{
}

static inline void setup_log_buf(int early)
{
}

/* !CONFIG_PRINTK: the stack/register dump helpers are empty stubs. */
static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...)
{
}

static inline void dump_stack_print_info(const char *log_lvl)
{
}

static inline void show_regs_print_info(const char *log_lvl)
{
}

static inline void dump_stack_lvl(const char *log_lvl)
{
}

static inline void dump_stack(void)
{
}
/* !CONFIG_PRINTK: flush/replay/panic-sync requests are empty stubs. */
static inline void printk_trigger_flush(void)
{
}
static inline void console_try_replay_all(void)
{
}

static inline void printk_legacy_allow_panic_sync(void)
{
}

/* !CONFIG_PRINTK: acquiring an nbcon device always reports failure (false). */
static inline bool nbcon_device_try_acquire(struct console *con)
{
        return false;
}

static inline void nbcon_device_release(struct console *con)
{
}

static inline void nbcon_atomic_flush_unsafe(void)
{
}

/* !CONFIG_PRINTK: returns true immediately without waiting. */
static inline bool pr_flush(int timeout_ms, bool reset_on_progress)
{
        return true;
}

#endif

/*
 * Low-level pieces used by printk_cpu_sync_get_irqsave() and
 * printk_cpu_sync_put_irqrestore().  With CONFIG_SMP they are real
 * functions; otherwise try_get is the constant true and wait/put expand
 * to nothing.
 */
#ifdef CONFIG_SMP
extern int __printk_cpu_sync_try_get(void);
extern void __printk_cpu_sync_wait(void);
extern void __printk_cpu_sync_put(void);

#else

#define __printk_cpu_sync_try_get() true
#define __printk_cpu_sync_wait()
#define __printk_cpu_sync_put()
#endif /* CONFIG_SMP */

/**
 * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk
 *                                 cpu-reentrant spinning lock.
 * @flags: Stack-allocated storage for saving local interrupt state,
 *         to be passed to printk_cpu_sync_put_irqrestore().
 *
 * If the lock is owned by another CPU, spin until it becomes available.
 * Interrupts are restored while spinning.
 *
 * CAUTION: This function must be used carefully. It does not behave like a
 * typical lock. Here are important things to watch out for...
 *
 *     * This function is reentrant on the same CPU. Therefore the calling
 *       code must not assume exclusive access to data if code accessing the
 *       data can run reentrant or within NMI context on the same CPU.
 *
 *     * If there exists usage of this function from NMI context, it becomes
 *       unsafe to perform any type of locking or spinning to wait for other
 *       CPUs after calling this function from any context. This includes
 *       using spinlocks or any other busy-waiting synchronization methods.
 *
 * The retry loop is wrapped in do { } while (0) so that the expansion plus
 * the caller's trailing semicolon forms exactly one statement. That keeps
 * the macro usable as the unbraced body of an if/else, like
 * printk_cpu_sync_put_irqrestore().
 */
#define printk_cpu_sync_get_irqsave(flags)                \
        do {                                                \
                for (;;) {                                \
                        local_irq_save(flags);                \
                        if (__printk_cpu_sync_try_get())        \
                                break;                        \
                        local_irq_restore(flags);        \
                        __printk_cpu_sync_wait();        \
                }                                        \
        } while (0)

/**
 * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning
 *                                    lock and restore interrupts.
 * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave().
 *
 * The lock is released first and the interrupt state is restored afterwards,
 * i.e. the reverse of the order used when acquiring.
 */
#define printk_cpu_sync_put_irqrestore(flags)        \
        do {                                        \
                __printk_cpu_sync_put();        \
                local_irq_restore(flags);        \
        } while (0)

extern int kptr_restrict;

/**
 * pr_fmt - used by the pr_*() macros to generate the printk format string
 * @fmt: format string passed from a pr_*() macro
 *
 * This macro can be used to generate a unified format string for pr_*()
 * macros. A common use is to prefix all pr_*() messages in a file with a common
 * string. For example, defining this at the top of a source file:
 *
 *        #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 *
 * would prefix all pr_info, pr_emerg... messages in the file with the module
 * name.
 */
#ifndef pr_fmt
#define pr_fmt(fmt) fmt
#endif

struct module;

#ifdef CONFIG_PRINTK_INDEX
/*
 * One printk index record, describing a single print call site.
 * __printk_index_emit() creates a static instance of this struct per call
 * site and stores a pointer to it in the ".printk_index" section.
 */
struct pi_entry {
        /* Format string of the call site (NULL if not a compile-time constant). */
        const char *fmt;
        /* Enclosing function name, taken from __func__. */
        const char *func;
        /* Source file name, taken from __FILE__. */
        const char *file;
        /* Source line number, taken from __LINE__. */
        unsigned int line;

        /*
         * While printk and pr_* have the level stored in the string at compile
         * time, some subsystems dynamically add it at runtime through the
         * format string. For these dynamic cases, we allow the subsystem to
         * tell us the level at compile time.
         *
         * NULL indicates that the level, if any, is stored in fmt.
         */
        const char *level;

        /*
         * The format string used by various subsystem specific printk()
         * wrappers to prefix the message.
         *
         * Note that the static prefix defined by the pr_fmt() macro is stored
         * directly in the message format (@fmt), not here.
         */
        const char *subsys_fmt_prefix;
} __packed;

/*
 * Emit a printk index record for the current call site.
 *
 * When both @_fmt and @_level are compile-time constants, a static
 * struct pi_entry is defined (recording format, __func__, __FILE__, __LINE__,
 * level and subsystem prefix) together with a pointer to it placed in the
 * ".printk_index" section.  When either is not constant, nothing is emitted.
 * No code that runs at the call site is generated either way; only static
 * data is defined.
 */
#define __printk_index_emit(_fmt, _level, _subsys_fmt_prefix)                \
        do {                                                                \
                if (__builtin_constant_p(_fmt) && __builtin_constant_p(_level)) { \
                        /*
                         * We check __builtin_constant_p multiple times here
                         * for the same input because GCC will produce an error
                         * if we try to assign a static variable to fmt if it
                         * is not a constant, even with the outer if statement.
                         */                                                \
                        static const struct pi_entry _entry                \
                        __used = {                                        \
                                .fmt = __builtin_constant_p(_fmt) ? (_fmt) : NULL, \
                                .func = __func__,                        \
                                .file = __FILE__,                        \
                                .line = __LINE__,                        \
                                .level = __builtin_constant_p(_level) ? (_level) : NULL, \
                                .subsys_fmt_prefix = _subsys_fmt_prefix,\
                        };                                                \
                        static const struct pi_entry *_entry_ptr        \
                        __used __section(".printk_index") = &_entry;        \
                }                                                        \
        } while (0)

#else /* !CONFIG_PRINTK_INDEX */
#define __printk_index_emit(...) do {} while (0)
#endif /* CONFIG_PRINTK_INDEX */

/*
 * Some subsystems have their own custom printk that applies a va_format to a
 * generic format, for example, to include a device number or other metadata
 * alongside the format supplied by the caller.
 *
 * In order to store these in the way they would be emitted by the printk
 * infrastructure, the subsystem provides us with the start, fixed string, and
 * any subsequent text in the format string.
 *
 * We take a variable argument list as pr_fmt/dev_fmt/etc are sometimes passed
 * as multiple arguments (eg: `"%s: ", "blah"`), and we must only take the
 * first one.
 *
 * subsys_fmt_prefix must be known at compile time, or compilation will fail
 * (since this is a mistake). If fmt or level is not known at compile time, no
 * index entry will be made (since this can legitimately happen).
 */
#define printk_index_subsys_emit(subsys_fmt_prefix, level, fmt, ...) \
        __printk_index_emit(fmt, level, subsys_fmt_prefix)

/*
 * Record the call site in the printk index (with no explicit level and no
 * subsystem prefix), then call @_p_func with the format and arguments.
 * The statement expression evaluates to the value returned by @_p_func.
 */
#define printk_index_wrap(_p_func, _fmt, ...)                                \
        ({                                                                \
                __printk_index_emit(_fmt, NULL, NULL);                        \
                _p_func(_fmt, ##__VA_ARGS__);                                \
        })


/**
 * printk - print a kernel message
 * @fmt: format string
 *
 * This is printk(). It can be called from any context. We want it to work.
 *
 * If printk indexing is enabled, _printk() is called from printk_index_wrap.
 * Otherwise, printk is simply #defined to _printk.
 *
 * We try to grab the console_lock. If we succeed, it's easy - we log the
 * output and call the console drivers.  If we fail to get the semaphore, we
 * place the output into the log buffer and return. The current holder of
 * the console_sem will notice the new output in console_unlock(); and will
 * send it to the consoles before releasing the lock.
 *
 * One effect of this deferred printing is that code which calls printk() and
 * then changes console_loglevel may break. This is because console_loglevel
 * is inspected when the actual printing occurs.
 *
 * See also:
 * printf(3)
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
#define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
#define printk_deferred(fmt, ...)                                        \
        printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__)

/**
 * pr_emerg - Print an emergency-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_emerg(fmt, ...) \
        printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_alert - Print an alert-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_alert(fmt, ...) \
        printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_crit - Print a critical-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_crit(fmt, ...) \
        printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_err - Print an error-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_err(fmt, ...) \
        printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_warn - Print a warning-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt()
 * to generate the format string.
 */
#define pr_warn(fmt, ...) \
        printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_notice - Print a notice-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_notice(fmt, ...) \
        printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_info - Print an info-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_info(fmt, ...) \
        printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)

/**
 * pr_cont - Continues a previous log message in the same line.
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_CONT loglevel. It should only be
 * used when continuing a log message with no newline ('\n') enclosed. Otherwise
 * it defaults back to KERN_DEFAULT loglevel.
 */
#define pr_cont(fmt, ...) \
        printk(KERN_CONT fmt, ##__VA_ARGS__)

/**
 * pr_devel - Print a debug-level message conditionally
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is
 * defined. Otherwise it does nothing.
 *
 * It uses pr_fmt() to generate the format string.
 */
#ifdef DEBUG
#define pr_devel(fmt, ...) \
        printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif


/* If you are writing a driver, please use dev_dbg instead */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#include <linux/dynamic_debug.h>

/**
 * pr_debug - Print a debug-level message conditionally
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is set,
 * or if CONFIG_DYNAMIC_DEBUG_CORE is set and DYNAMIC_DEBUG_MODULE is defined.
 * Otherwise, if DEBUG is defined, it's equivalent to a printk with
 * KERN_DEBUG loglevel. If DEBUG is not defined it does nothing.
 *
 * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses
 * pr_fmt() internally).
 */
#define pr_debug(fmt, ...)                        \
        dynamic_pr_debug(fmt, ##__VA_ARGS__)
#elif defined(DEBUG)
#define pr_debug(fmt, ...) \
        printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * Print a one-time message (analogous to WARN_ONCE() et al):
 */

#ifdef CONFIG_PRINTK
#define printk_once(fmt, ...)                                        \
        DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...)                                \
        DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__)
#else
#define printk_once(fmt, ...)                                        \
        no_printk(fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...)                                \
        no_printk(fmt, ##__VA_ARGS__)
#endif

#define pr_emerg_once(fmt, ...)                                        \
        printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_once(fmt, ...)                                        \
        printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_once(fmt, ...)                                        \
        printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_once(fmt, ...)                                        \
        printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_once(fmt, ...)                                        \
        printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_once(fmt, ...)                                \
        printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_once(fmt, ...)                                        \
        printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_once, don't do that... */

#if defined(DEBUG)
#define pr_devel_once(fmt, ...)                                        \
        printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_once(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * pr_debug_once(): a one-time KERN_DEBUG message when DEBUG is defined;
 * otherwise it expands to no_printk().  As written here it has the same two
 * expansions as pr_devel_once().
 *
 * If you are writing a driver, please use dev_dbg instead.
 */
#if defined(DEBUG)
#define pr_debug_once(fmt, ...)                                        \
        printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_once(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * ratelimited messages with local ratelimit_state,
 * no local ratelimit_state used in the !PRINTK case
 *
 * With CONFIG_PRINTK every expansion of printk_ratelimited() declares its own
 * static ratelimit state (_rs), set up with DEFAULT_RATELIMIT_INTERVAL and
 * DEFAULT_RATELIMIT_BURST, and only calls printk() when __ratelimit(&_rs)
 * returns non-zero.  Without CONFIG_PRINTK it expands to no_printk().
 */
#ifdef CONFIG_PRINTK
#define printk_ratelimited(fmt, ...)                                        \
({                                                                        \
        static DEFINE_RATELIMIT_STATE(_rs,                                \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);                \
                                                                        \
        if (__ratelimit(&_rs))                                                \
                printk(fmt, ##__VA_ARGS__);                                \
})
#else
#define printk_ratelimited(fmt, ...)                                        \
        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * Per-level rate-limited helpers.  Each one prefixes the format with the
 * matching KERN_<LEVEL> string, runs it through pr_fmt(), and forwards the
 * result and the variadic arguments to printk_ratelimited().
 */
#define pr_emerg_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_ratelimited, don't do that... */

/*
 * pr_devel_ratelimited(): a rate-limited KERN_DEBUG message when DEBUG is
 * defined for the including file; otherwise it expands to no_printk().
 */
#if defined(DEBUG)
#define pr_devel_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_ratelimited(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * pr_debug_ratelimited() has three possible expansions:
 *  - dynamic debug available (CONFIG_DYNAMIC_DEBUG, or CONFIG_DYNAMIC_DEBUG_CORE
 *    together with DYNAMIC_DEBUG_MODULE): a per-site descriptor and a per-site
 *    static ratelimit state are declared, and __dynamic_pr_debug() is called
 *    only if DYNAMIC_DEBUG_BRANCH(descriptor) is true and then
 *    __ratelimit(&_rs) allows it;
 *  - otherwise, with DEBUG defined: printk_ratelimited() at KERN_DEBUG;
 *  - otherwise: no_printk().
 *
 * If you are writing a driver, please use dev_dbg instead.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define pr_debug_ratelimited(fmt, ...)                                        \
do {                                                                        \
        static DEFINE_RATELIMIT_STATE(_rs,                                \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);                \
        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));                \
        if (DYNAMIC_DEBUG_BRANCH(descriptor) &&                                \
            __ratelimit(&_rs))                                                \
                __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);        \
} while (0)
#elif defined(DEBUG)
#define pr_debug_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_ratelimited(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/* File operations table defined elsewhere; only declared here. */
extern const struct file_operations kmsg_fops;

/*
 * Values for the prefix_type argument of the hex dump helpers below: they
 * select whether no prefix, an address, or an offset is printed on each line.
 */
enum {
        DUMP_PREFIX_NONE,
        DUMP_PREFIX_ADDRESS,
        DUMP_PREFIX_OFFSET
};
/* Implemented elsewhere; declared unconditionally. */
extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
                              int groupsize, char *linebuf, size_t linebuflen,
                              bool ascii);
/*
 * print_hex_dump() is a real external function with CONFIG_PRINTK.  Without
 * CONFIG_PRINTK it is an empty inline stub, and an empty inline
 * print_hex_dump_bytes() is provided alongside it.
 */
#ifdef CONFIG_PRINTK
extern void print_hex_dump(const char *level, const char *prefix_str,
                           int prefix_type, int rowsize, int groupsize,
                           const void *buf, size_t len, bool ascii);
#else
static inline void print_hex_dump(const char *level, const char *prefix_str,
                                  int prefix_type, int rowsize, int groupsize,
                                  const void *buf, size_t len, bool ascii)
{
}
static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                                        const void *buf, size_t len)
{
}

#endif

/*
 * print_hex_dump_debug() has three possible forms:
 *  - dynamic debug available: forwards to dynamic_hex_dump();
 *  - otherwise, with DEBUG defined: print_hex_dump() at KERN_DEBUG;
 *  - otherwise: an empty inline function.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize,        \
                             groupsize, buf, len, ascii)        \
        dynamic_hex_dump(prefix_str, prefix_type, rowsize,        \
                         groupsize, buf, len, ascii)
#elif defined(DEBUG)
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize,                \
                             groupsize, buf, len, ascii)                \
        print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,        \
                       groupsize, buf, len, ascii)
#else
static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
                                        int rowsize, int groupsize,
                                        const void *buf, size_t len, bool ascii)
{
}
#endif

/**
 * print_hex_dump_bytes - shorthand form of print_hex_dump_debug() with default params
 * @prefix_str: string to prefix each line with;
 *  caller supplies trailing spaces for alignment if desired
 * @prefix_type: controls whether prefix of an offset, address, or none
 *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
 * @buf: data blob to dump
 * @len: number of bytes in the @buf
 *
 * Expands to print_hex_dump_debug() with rowsize of 16, groupsize of 1, and
 * ASCII output included.  Depending on the configuration,
 * print_hex_dump_debug() is in turn dynamic_hex_dump(), print_hex_dump() with
 * log level KERN_DEBUG, or an empty stub.
 */
#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len)        \
        print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true)

#endif







  317 




1
2
3
4
5
6
7
8
9
10
11
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>

/*
 * Report whether @addr fits in the boot CPU's physical address width.
 *
 * With CONFIG_PHYS_ADDR_T_64BIT the address is valid (1) when no bit at or
 * above boot_cpu_data.x86_phys_bits is set, and invalid (0) otherwise.
 * Without it every address is reported as valid.
 */
static inline int phys_addr_valid(resource_size_t addr)
{
#ifdef CONFIG_PHYS_ADDR_T_64BIT
        return (addr >> boot_cpu_data.x86_phys_bits) == 0;
#else
        return 1;
#endif
}


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGALLOC_H
#define _ASM_X86_PGALLOC_H

#include <linux/threads.h>
#include <linux/mm.h>                /* for struct page */
#include <linux/pagemap.h>

#include <asm/cpufeature.h>

#define __HAVE_ARCH_PTE_ALLOC_ONE
#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>

/* Default PGD allocation hook: does nothing with @mm and always returns 0. */
static inline int __paravirt_pgd_alloc(struct mm_struct *mm)
{
        return 0;
}

/*
 * Paravirt page-table hooks.  With CONFIG_PARAVIRT_XXL they come from
 * <asm/paravirt.h>.  Otherwise paravirt_pgd_alloc() maps to
 * __paravirt_pgd_alloc() and every other hook below is an empty inline
 * function, so calls to them compile to nothing.
 */
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define paravirt_pgd_alloc(mm)        __paravirt_pgd_alloc(mm)
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)        {}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)        {}
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
                                            unsigned long start, unsigned long count) {}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)        {}
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)        {}
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif

/*
 * Page allocation order used for a PGD.
 *
 * When Page Table Isolation is active two PGDs are acquired instead of one,
 * so the order is 1: the allocation is 8k in size and 8k-aligned, which lets
 * callers flip bit 12 in a pointer to swap between the two 4k halves.
 * Without PTI a single page (order 0) is used.
 */
static inline unsigned int pgd_allocation_order(void)
{
        return cpu_feature_enabled(X86_FEATURE_PTI) ? 1 : 0;
}

/*
 * Allocate and free page tables.
 *
 * These are implemented out of line; only their prototypes live here.
 */
extern pgd_t *pgd_alloc(struct mm_struct *);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);

extern pgtable_t pte_alloc_one(struct mm_struct *);

/* Backend for __pte_free_tlb() below. */
extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);

/*
 * Forward a PTE page to ___pte_free_tlb().  The @address argument is accepted
 * but not used by this implementation.
 */
static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
                                  unsigned long address)
{
        ___pte_free_tlb(tlb, pte);
}

/*
 * Point @pmd at the PTE page @pte (given by its kernel virtual address).
 * The paravirt hook is told about the PTE page's pfn first, then the entry is
 * written with set_pmd() using the page's physical address and _PAGE_TABLE.
 */
static inline void pmd_populate_kernel(struct mm_struct *mm,
                                       pmd_t *pmd, pte_t *pte)
{
        unsigned long pte_pfn = __pa(pte) >> PAGE_SHIFT;

        paravirt_alloc_pte(mm, pte_pfn);
        set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
}

/*
 * Same as pmd_populate_kernel(), except that the entry is written through
 * set_pmd_safe() instead of set_pmd().
 */
static inline void pmd_populate_kernel_safe(struct mm_struct *mm,
                                            pmd_t *pmd, pte_t *pte)
{
        unsigned long pte_pfn = __pa(pte) >> PAGE_SHIFT;

        paravirt_alloc_pte(mm, pte_pfn);
        set_pmd_safe(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
}

/*
 * Point @pmd at the PTE page described by the struct page @pte.  The pfn is
 * taken from page_to_pfn(), reported to the paravirt hook, and then shifted
 * back into an address and combined with _PAGE_TABLE for the new entry.
 */
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
                                struct page *pte)
{
        const unsigned long pte_pfn = page_to_pfn(pte);

        paravirt_alloc_pte(mm, pte_pfn);
        set_pmd(pmd, __pmd(_PAGE_TABLE | ((pteval_t)pte_pfn << PAGE_SHIFT)));
}

#if CONFIG_PGTABLE_LEVELS > 2
/* Out-of-line backend for __pmd_free_tlb(). */
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);

/*
 * Forward a PMD table to ___pmd_free_tlb().  The @address argument is accepted
 * but not used by this implementation.
 */
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
                                  unsigned long address)
{
        ___pmd_free_tlb(tlb, pmd);
}

#ifdef CONFIG_X86_PAE
/* With PAE, pud_populate() is implemented out of line. */
extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
#else        /* !CONFIG_X86_PAE */
/*
 * Point @pud at the PMD table @pmd: notify the paravirt hook of the table's
 * pfn, then write the entry with set_pud().
 */
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
        unsigned long pmd_pfn = __pa(pmd) >> PAGE_SHIFT;

        paravirt_alloc_pmd(mm, pmd_pfn);
        set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}

/* As pud_populate(), but the entry is written through set_pud_safe(). */
static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
        unsigned long pmd_pfn = __pa(pmd) >> PAGE_SHIFT;

        paravirt_alloc_pmd(mm, pmd_pfn);
        set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}
#endif        /* CONFIG_X86_PAE */

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Point @p4d at the PUD table @pud: notify the paravirt hook of the table's
 * pfn, then write the entry with set_p4d().
 */
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
        unsigned long pud_pfn = __pa(pud) >> PAGE_SHIFT;

        paravirt_alloc_pud(mm, pud_pfn);
        set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}

/* As p4d_populate(), but the entry is written through set_p4d_safe(). */
static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
        unsigned long pud_pfn = __pa(pud) >> PAGE_SHIFT;

        paravirt_alloc_pud(mm, pud_pfn);
        set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}

/* Out-of-line backend for __pud_free_tlb(). */
extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);

/*
 * Forward a PUD table to ___pud_free_tlb().  The @address argument is accepted
 * but not used by this implementation.
 */
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
                                  unsigned long address)
{
        ___pud_free_tlb(tlb, pud);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Point @pgd at the P4D table @p4d.  This is a no-op unless
 * pgtable_l5_enabled() is true; in that case the paravirt hook is told about
 * the table's pfn and the entry is written with set_pgd().
 */
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
        if (pgtable_l5_enabled()) {
                unsigned long p4d_pfn = __pa(p4d) >> PAGE_SHIFT;

                paravirt_alloc_p4d(mm, p4d_pfn);
                set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
        }
}

/*
 * As pgd_populate(), but the entry is written through set_pgd_safe().  Does
 * nothing unless pgtable_l5_enabled() is true.
 */
static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
        if (pgtable_l5_enabled()) {
                unsigned long p4d_pfn = __pa(p4d) >> PAGE_SHIFT;

                paravirt_alloc_p4d(mm, p4d_pfn);
                set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
        }
}

/* Out-of-line backend for __p4d_free_tlb(). */
extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);

/*
 * Forward a P4D table to ___p4d_free_tlb(), but only when
 * pgtable_l5_enabled() is true; otherwise nothing is done.  @address is
 * accepted but not used.
 */
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
                                  unsigned long address)
{
        if (!pgtable_l5_enabled())
                return;

        ___p4d_free_tlb(tlb, p4d);
}

#endif        /* CONFIG_PGTABLE_LEVELS > 4 */
#endif        /* CONFIG_PGTABLE_LEVELS > 3 */
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#endif /* _ASM_X86_PGALLOC_H */












































   45 
   45 




   15 


   15 






   44 
   46 


   47 


   46 
   46 

   47 
    2 

   45 

   47 
   47 

   44 
    1 

   45 




   43 



   47 






    7 
    7 

    7 
    7 


    9 






    9 

    9 




    9 


    9 
    9 









    9 







    9 






    9 







    9 





    9 






    9 
    7 
































    8 









    8 





    8 


    1 
    8 
    7 









    6 

    1 








    7 

    8 
    7 

    8 









   47 



    8 

   46 
   45 



   47 













   47 









   47 






   46 






    9 


   46 



    9 
    9 







    9 








    9 




   47 











   47 


   47 
   47 




    1 





    9 
    9 
    9 




    3 

    3 

    3 








    6 

    6 





















    6 




    6 

    6 





    6 


    6 







    6 

    6 





    6 




























































   42 












   41 







   41 





   41 





   42 


   23 






   42 


   42 









   41 

































   45 






   45 





























   42 






   42 
   41 






   42 





























    3 


    3 



    3 

    3 

    3 
    3 



    2 



    1 




    2 
    1 
















    7 

    7 

    7 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Scatterlist Cryptographic API.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2002 David S. Miller (davem@redhat.com)
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 *
 * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
 * and Nettle, by Niels Möller.
 */

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/param.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/completion.h>
#include "internal.h"

/* Global list of algorithm entries (including larvals) walked by the lookups below. */
LIST_HEAD(crypto_alg_list);
EXPORT_SYMBOL_GPL(crypto_alg_list);
/* Read/write semaphore taken in this file around accesses to crypto_alg_list. */
DECLARE_RWSEM(crypto_alg_sem);
EXPORT_SYMBOL_GPL(crypto_alg_sem);

/* Notifier chain invoked by crypto_probing_notify(). */
BLOCKING_NOTIFIER_HEAD(crypto_chain);
EXPORT_SYMBOL_GPL(crypto_chain);

/*
 * Static key, initially false.  It is only defined here when the algapi is
 * built in and the crypto self-tests are enabled.
 */
#if IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) && IS_ENABLED(CONFIG_CRYPTO_SELFTESTS)
DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished);
#endif

/*
 * Forward declarations: both functions are defined further down in this file
 * but are called before their definitions (crypto_larval_wait() from
 * crypto_larval_add(), crypto_alg_lookup() from crypto_larval_wait()).
 */
static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg,
                                             u32 type, u32 mask);
static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type,
                                            u32 mask);

/*
 * Take a reference on both the module that owns @alg and on @alg itself.
 *
 * Returns NULL if the module reference could not be taken; otherwise returns
 * whatever crypto_alg_get() returns for @alg.
 */
struct crypto_alg *crypto_mod_get(struct crypto_alg *alg)
{
        if (!try_module_get(alg->cra_module))
                return NULL;

        return crypto_alg_get(alg);
}
EXPORT_SYMBOL_GPL(crypto_mod_get);

/*
 * Drop the references taken by crypto_mod_get(): first the one on @alg, then
 * the one on its owning module.
 */
void crypto_mod_put(struct crypto_alg *alg)
{
        /*
         * Read the module pointer before putting @alg; @alg is not touched
         * again afterwards (presumably because crypto_alg_put() may release
         * it - confirm against crypto_alg_put()).
         */
        struct module *module = alg->cra_module;

        crypto_alg_put(alg);
        module_put(module);
}
EXPORT_SYMBOL_GPL(crypto_mod_put);

/*
 * Walk crypto_alg_list and pick the best entry for @name.
 *
 * Moribund entries and entries whose flags differ from @type in any bit of
 * @mask are skipped.  An entry whose cra_driver_name equals @name wins
 * immediately.  An entry whose cra_name equals @name is accepted only if its
 * priority is higher than that of the best candidate seen so far.
 *
 * The returned algorithm has been obtained through crypto_mod_get(); any
 * previously held candidate is released with crypto_mod_put().  Returns NULL
 * when nothing matched.  This function takes no lock itself; the callers in
 * this file hold crypto_alg_sem around it.
 */
static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type,
                                              u32 mask)
{
        struct crypto_alg *cand;
        struct crypto_alg *chosen = NULL;
        int best_prio = -2;

        list_for_each_entry(cand, &crypto_alg_list, cra_list) {
                bool driver_match, name_match;

                if (crypto_is_moribund(cand))
                        continue;
                if ((cand->cra_flags ^ type) & mask)
                        continue;

                driver_match = strcmp(cand->cra_driver_name, name) == 0;
                name_match = strcmp(cand->cra_name, name) == 0;

                /* A generic-name hit must beat the current best priority. */
                if (!driver_match &&
                    (!name_match || cand->cra_priority <= best_prio))
                        continue;

                if (unlikely(!crypto_mod_get(cand)))
                        continue;

                /* Swap the held reference over to the new candidate. */
                best_prio = cand->cra_priority;
                if (chosen)
                        crypto_mod_put(chosen);
                chosen = cand;

                if (driver_match)
                        break;
        }

        return chosen;
}

/*
 * cra_destroy callback installed on larvals by crypto_larval_alloc().
 * Releases the adult algorithm, if one is attached and it is neither NULL nor
 * an error pointer, and then frees the larval.  BUGs if @alg is not a larval.
 */
static void crypto_larval_destroy(struct crypto_alg *alg)
{
        struct crypto_larval *larval;
        struct crypto_alg *adult;

        BUG_ON(!crypto_is_larval(alg));

        larval = (void *)alg;
        adult = larval->adult;
        if (!IS_ERR_OR_NULL(adult))
                crypto_mod_put(adult);

        kfree(larval);
}

/*
 * Allocate and initialise a larval (placeholder algorithm) for @name.
 *
 * The type bits covered by CRYPTO_ALG_TYPE_MASK are filtered through @mask
 * (or kept as they are when @mask is zero) before being stored together with
 * CRYPTO_ALG_LARVAL in cra_flags.  The larval gets priority -1,
 * crypto_larval_destroy() as its destructor and an initialised completion.
 * The reference count is left for the caller to set.
 *
 * Returns the new larval, or ERR_PTR(-ENOMEM) if the allocation failed.
 */
struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask)
{
        struct crypto_larval *larval = kzalloc(sizeof(*larval), GFP_KERNEL);
        struct crypto_alg *alg;
        u32 keep;

        if (!larval)
                return ERR_PTR(-ENOMEM);

        keep = ~CRYPTO_ALG_TYPE_MASK | (mask ? mask : CRYPTO_ALG_TYPE_MASK);
        type &= keep;

        larval->mask = mask;

        alg = &larval->alg;
        alg->cra_flags = CRYPTO_ALG_LARVAL | type;
        alg->cra_priority = -1;
        alg->cra_destroy = crypto_larval_destroy;
        strscpy(alg->cra_name, name, CRYPTO_MAX_ALG_NAME);

        init_completion(&larval->completion);

        return larval;
}
EXPORT_SYMBOL_GPL(crypto_larval_alloc);

/*
 * Create a larval for @name and add it to crypto_alg_list, unless a matching
 * entry turns up in the list first.
 *
 * The lookup and the insertion happen under the write side of crypto_alg_sem.
 * If the new larval was inserted it is returned with its refcount set to 2.
 * If an existing entry was found instead, the new larval is freed and the
 * existing entry is returned - after waiting on it when it is itself a
 * larval.  May return an error pointer.
 */
static struct crypto_alg *crypto_larval_add(const char *name, u32 type,
                                            u32 mask)
{
        struct crypto_larval *larval = crypto_larval_alloc(name, type, mask);
        struct crypto_alg *alg;

        if (IS_ERR(larval))
                return ERR_CAST(larval);

        refcount_set(&larval->alg.cra_refcnt, 2);

        down_write(&crypto_alg_sem);
        alg = __crypto_alg_lookup(name, type, mask);
        if (!alg) {
                alg = &larval->alg;
                list_add(&alg->cra_list, &crypto_alg_list);
        }
        up_write(&crypto_alg_sem);

        /* Our own larval went onto the list: hand it back as is. */
        if (alg == &larval->alg)
                return alg;

        /* Somebody else was already registered; ours is not needed. */
        kfree(larval);
        if (crypto_is_larval(alg))
                alg = crypto_larval_wait(alg, type, mask);

        return alg;
}

/*
 * Take @larval off crypto_alg_list, wake everybody waiting on its completion
 * and drop one reference on it.  If the larval was already unlinked (its list
 * node is empty) nothing beyond the list check is done.
 */
static void crypto_larval_kill(struct crypto_larval *larval)
{
        bool was_linked;

        down_write(&crypto_alg_sem);
        was_linked = !list_empty(&larval->alg.cra_list);
        if (was_linked)
                list_del_init(&larval->alg.cra_list);
        up_write(&crypto_alg_sem);

        if (was_linked) {
                complete_all(&larval->completion);
                crypto_alg_put(&larval->alg);
        }
}

/*
 * Send a CRYPTO_MSG_ALG_REGISTER notification for the larval's adult
 * algorithm.  Any result other than NOTIFY_STOP triggers a one-time warning.
 */
void crypto_schedule_test(struct crypto_larval *larval)
{
        int err;

        err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult);
        WARN_ON_ONCE(err != NOTIFY_STOP);
}
EXPORT_SYMBOL_GPL(crypto_schedule_test);

/*
 * Schedule the test for a test larval, at most once.
 *
 * test_started is first checked without the lock as a fast path, then checked
 * again and set under the write side of crypto_alg_sem so that only one
 * caller goes on to call crypto_schedule_test().  Larvals that are not test
 * larvals are ignored.
 */
static void crypto_start_test(struct crypto_larval *larval)
{
        bool already_started;

        if (!crypto_is_test_larval(larval) || larval->test_started)
                return;

        down_write(&crypto_alg_sem);
        already_started = larval->test_started;
        if (!already_started)
                larval->test_started = true;
        up_write(&crypto_alg_sem);

        if (!already_started)
                crypto_schedule_test(larval);
}

/*
 * Wait for the larval @alg to complete and resolve it to a usable algorithm.
 *
 * The wait is killable and limited to 60 seconds.  On return one reference on
 * the larval has been dropped via crypto_mod_put().  The result is either an
 * algorithm obtained through crypto_mod_get() or crypto_alg_lookup(), NULL is
 * never produced here, or one of these error pointers:
 *   -EINTR      the wait was interrupted;
 *   -ETIMEDOUT  the completion did not fire in time;
 *   -EEXIST / -EAGAIN (or whatever crypto_alg_lookup() reports) when the
 *               larval carried no adult or an -EEXIST adult and the re-lookup
 *               by name found nothing;
 *   the error stored in larval->adult, passed through unchanged;
 *   -EAGAIN     the adult is untested (for a test larval), is
 *               FIPS-internal, or its module reference could not be taken.
 * If the resolved algorithm is itself a larval the whole procedure repeats.
 */
static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg,
                                             u32 type, u32 mask)
{
        struct crypto_larval *larval;
        long time_left;

again:
        larval = container_of(alg, struct crypto_larval, alg);

        /* Until the boot-time tests have finished, kick off the test ourselves. */
        if (!crypto_boot_test_finished())
                crypto_start_test(larval);

        time_left = wait_for_completion_killable_timeout(
                &larval->completion, 60 * HZ);

        alg = larval->adult;
        if (time_left < 0)
                alg = ERR_PTR(-EINTR);
        else if (!time_left) {
                /* Timed out: test larvals are removed from the list here. */
                if (crypto_is_test_larval(larval))
                        crypto_larval_kill(larval);
                alg = ERR_PTR(-ETIMEDOUT);
        } else if (!alg || PTR_ERR(alg) == -EEXIST) {
                int err = alg ? -EEXIST : -EAGAIN;

                /*
                 * EEXIST is expected because two probes can be scheduled
                 * at the same time with one using alg_name and the other
                 * using driver_name.  Do a re-lookup but do not retry in
                 * case we hit a quirk like gcm_base(ctr(aes),...) which
                 * will never match.
                 */
                alg = &larval->alg;
                alg = crypto_alg_lookup(alg->cra_name, type, mask) ?:
                      ERR_PTR(err);
        } else if (IS_ERR(alg))
                ;        /* pass the adult's error pointer through unchanged */
        else if (crypto_is_test_larval(larval) &&
                 !(alg->cra_flags & CRYPTO_ALG_TESTED))
                alg = ERR_PTR(-EAGAIN);
        else if (alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL)
                alg = ERR_PTR(-EAGAIN);
        else if (!crypto_mod_get(alg))
                alg = ERR_PTR(-EAGAIN);
        /*
         * Drop a reference on the larval on every path (presumably the one
         * the caller handed in - confirm against the callers).
         */
        crypto_mod_put(&larval->alg);

        if (!IS_ERR(alg) && crypto_is_larval(alg))
                goto again;

        return alg;
}

/*
 * Look up @name in the algorithm list under the read side of crypto_alg_sem.
 *
 * Unless the caller mentions CRYPTO_ALG_TESTED in @type or @mask, the first
 * lookup requires the TESTED flag.  The FIPS-internal bit is stripped from
 * both @type and @mask for that lookup and checked separately afterwards.
 *
 * Returns:
 *   a referenced algorithm or larval on success;
 *   ERR_PTR(-ENOENT)  if a non-larval match differs from @type in the
 *                     FIPS-internal bit under the effective mask;
 *   ERR_PTR(-ELIBBAD) if only an untested, non-larval match exists;
 *   NULL              if nothing matched at all.
 */
static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type,
                                            u32 mask)
{
        const u32 fips = CRYPTO_ALG_FIPS_INTERNAL;
        struct crypto_alg *alg;
        u32 test = 0;

        /* Default to tested algorithms only. */
        if (!((type | mask) & CRYPTO_ALG_TESTED))
                test |= CRYPTO_ALG_TESTED;

        down_read(&crypto_alg_sem);
        alg = __crypto_alg_lookup(name, (type | test) & ~fips,
                                  (mask | test) & ~fips);
        if (alg) {
                /*
                 * If the caller set the FIPS-internal bit in neither type nor
                 * mask, add it to the mask; then reduce the mask to that one
                 * bit so the comparison below looks at it alone.
                 */
                if (((type | mask) ^ fips) & fips)
                        mask |= fips;
                mask &= fips;

                if (!crypto_is_larval(alg) &&
                    ((type ^ alg->cra_flags) & mask)) {
                        /* Algorithm is disallowed in FIPS mode. */
                        crypto_mod_put(alg);
                        alg = ERR_PTR(-ENOENT);
                }
        } else if (test) {
                /* Retry without the implicit TESTED requirement. */
                alg = __crypto_alg_lookup(name, type, mask);
                if (alg && !crypto_is_larval(alg)) {
                        /* Test failed */
                        crypto_mod_put(alg);
                        alg = ERR_PTR(-ELIBBAD);
                }
        }
        up_read(&crypto_alg_sem);

        return alg;
}

/*
 * Find @name, loading modules and creating a larval if necessary.
 *
 * The LARVAL and DEAD bits are cleared from @type and @mask first.  If the
 * initial lookup finds nothing and CRYPTO_NOLOAD is not set in @mask, the
 * "crypto-<name>" module is requested (plus "crypto-<name>-all" unless the
 * NEED_FALLBACK bit is both masked and set differently from @type) and the
 * lookup is repeated.
 *
 * Outcome:
 *   - a larval hit is waited on via crypto_larval_wait();
 *   - any other non-NULL result (algorithm or error pointer) is returned;
 *   - on a miss, a new larval is added unless @mask has CRYPTO_ALG_TESTED
 *     set, in which case ERR_PTR(-ENOENT) is returned.
 * A NULL @name yields ERR_PTR(-ENOENT).
 */
static struct crypto_alg *crypto_larval_lookup(const char *name, u32 type,
                                               u32 mask)
{
        const u32 internal_bits = CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD;
        struct crypto_alg *alg;

        if (!name)
                return ERR_PTR(-ENOENT);

        type &= ~internal_bits;
        mask &= ~internal_bits;

        alg = crypto_alg_lookup(name, type, mask);
        if (!alg && !(mask & CRYPTO_NOLOAD)) {
                bool want_all;

                request_module("crypto-%s", name);

                want_all = !((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask &
                             CRYPTO_ALG_NEED_FALLBACK);
                if (want_all)
                        request_module("crypto-%s-all", name);

                alg = crypto_alg_lookup(name, type, mask);
        }

        if (!alg) {
                if (mask & CRYPTO_ALG_TESTED)
                        return ERR_PTR(-ENOENT);
                return crypto_larval_add(name, type, mask);
        }

        if (!IS_ERR(alg) && crypto_is_larval(alg))
                alg = crypto_larval_wait(alg, type, mask);

        return alg;
}

/*
 * Run the crypto notifier chain with (@val, @v).  If the chain reports
 * NOTIFY_DONE, request the "cryptomgr" module and run the chain a second
 * time.  Returns the result of the last chain invocation.
 */
int crypto_probing_notify(unsigned long val, void *v)
{
        int ret = blocking_notifier_call_chain(&crypto_chain, val, v);

        if (ret != NOTIFY_DONE)
                return ret;

        request_module("cryptomgr");
        return blocking_notifier_call_chain(&crypto_chain, val, v);
}
EXPORT_SYMBOL_GPL(crypto_probing_notify);

/*
 * Look up an algorithm by name, asking the notifier chain to construct it
 * when only a larval exists.
 *
 * If the lookup yields an error pointer or a real (non-larval) algorithm it
 * is returned directly.  Otherwise a CRYPTO_MSG_ALG_REQUEST notification is
 * sent for the larval: on NOTIFY_STOP the larval is waited on, on any other
 * result the larval reference is dropped and ERR_PTR(-ENOENT) is returned.
 * In both cases the larval is killed before returning.
 */
struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask)
{
        struct crypto_alg *larval;
        struct crypto_alg *result;

        /*
         * If the internal flag is set for a cipher, require a caller to
         * invoke the cipher with the internal flag to use that cipher.
         * Also, if a caller wants to allocate a cipher that may or may
         * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and
         * !(mask & CRYPTO_ALG_INTERNAL).
         */
        if (!((type | mask) & CRYPTO_ALG_INTERNAL))
                mask |= CRYPTO_ALG_INTERNAL;

        larval = crypto_larval_lookup(name, type, mask);
        if (IS_ERR(larval) || !crypto_is_larval(larval))
                return larval;

        if (crypto_probing_notify(CRYPTO_MSG_ALG_REQUEST, larval) ==
            NOTIFY_STOP) {
                result = crypto_larval_wait(larval, type, mask);
        } else {
                crypto_mod_put(larval);
                result = ERR_PTR(-ENOENT);
        }

        crypto_larval_kill(container_of(larval, struct crypto_larval, alg));
        return result;
}
EXPORT_SYMBOL_GPL(crypto_alg_mod_lookup);

/*
 * Call the transform's exit hook, but only if the algorithm has a cra_type
 * and the transform actually has an exit callback set.
 */
static void crypto_exit_ops(struct crypto_tfm *tfm)
{
        if (!tfm->__crt_alg->cra_type || !tfm->exit)
                return;

        tfm->exit(tfm);
}

/*
 * Compute the context size needed for a transform of @alg.
 *
 * The base is the part of cra_alignmask that exceeds the default tfm context
 * alignment.  Algorithms with a cra_type add whatever that type's ctxsize()
 * reports.  Type-less algorithms must be CRYPTO_ALG_TYPE_CIPHER and add
 * crypto_cipher_ctxsize(); any other type is a BUG().
 */
static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
{
        const struct crypto_type *frontend = alg->cra_type;
        unsigned int size;

        size = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1);
        if (frontend)
                return size + frontend->ctxsize(alg, type, mask);

        if ((alg->cra_flags & CRYPTO_ALG_TYPE_MASK) != CRYPTO_ALG_TYPE_CIPHER)
                BUG();

        size += crypto_cipher_ctxsize(alg);
        return size;
}

/*
 * Mark @alg as dying by setting CRYPTO_ALG_DYING in its flags.  The flag is
 * updated under the write side of crypto_alg_sem.
 */
void crypto_shoot_alg(struct crypto_alg *alg)
{
        down_write(&crypto_alg_sem);
        alg->cra_flags |= CRYPTO_ALG_DYING;
        up_write(&crypto_alg_sem);
}
EXPORT_SYMBOL_GPL(crypto_shoot_alg);

/*
 * Allocate and initialise a transform for @alg using the allocation flags
 * @gfp.
 *
 * The transform is zero-allocated with room for the context computed by
 * crypto_ctxsize(), bound to @alg and given a refcount of 1.  If no exit hook
 * is set and the algorithm has a cra_init callback, that callback is run.
 *
 * Returns the transform, ERR_PTR(-ENOMEM) if the allocation failed, or the
 * error reported by cra_init().  When cra_init() fails with -EAGAIN the
 * algorithm is additionally marked dying via crypto_shoot_alg().
 */
struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type,
                                         u32 mask, gfp_t gfp)
{
        unsigned int size = sizeof(struct crypto_tfm) +
                            crypto_ctxsize(alg, type, mask);
        struct crypto_tfm *tfm = kzalloc(size, gfp);
        int err;

        if (!tfm)
                return ERR_PTR(-ENOMEM);

        tfm->__crt_alg = alg;
        refcount_set(&tfm->refcnt, 1);

        /* Nothing to initialise: hand the transform out as is. */
        if (tfm->exit || !alg->cra_init)
                return tfm;

        err = alg->cra_init(tfm);
        if (!err)
                return tfm;

        /* cra_init() failed: undo and report. */
        crypto_exit_ops(tfm);
        if (err == -EAGAIN)
                crypto_shoot_alg(alg);
        kfree(tfm);

        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(__crypto_alloc_tfmgfp);

/*
 * Convenience wrapper: __crypto_alloc_tfmgfp() with GFP_KERNEL as the
 * allocation flags.  Return value is that of __crypto_alloc_tfmgfp().
 */
struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
                                      u32 mask)
{
        return __crypto_alloc_tfmgfp(alg, type, mask, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__crypto_alloc_tfm);

/*
 *        crypto_alloc_base - Locate algorithm and allocate transform
 *        @alg_name: Name of algorithm
 *        @type: Type of algorithm
 *        @mask: Mask for type comparison
 *
 *        This function should not be used by new algorithm types.
 *        Please use crypto_alloc_tfm instead.
 *
 *        The algorithm is located in this order: among the algorithms that
 *        are already loaded; then, if the kernel supports loadable modules,
 *        by loading a module of the same name or alias; and finally by
 *        asking any loaded crypto manager to construct it on the fly.  A
 *        refcount is grabbed on the algorithm, which is then associated with
 *        the new transform.
 *
 *        Lookup and allocation are retried for as long as they fail with
 *        -EAGAIN and no fatal signal is pending for the current task; a
 *        pending fatal signal turns the result into -EINTR.
 *
 *        The returned transform is of a non-determinate type.  Most people
 *        should use one of the more specific allocation functions such as
 *        crypto_alloc_skcipher().
 *
 *        In case of error the return value is an error pointer.
 */
struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask)
{
        int err;

        for (;;) {
                struct crypto_alg *alg;
                struct crypto_tfm *tfm;

                alg = crypto_alg_mod_lookup(alg_name, type, mask);
                if (IS_ERR(alg)) {
                        err = PTR_ERR(alg);
                } else {
                        tfm = __crypto_alloc_tfm(alg, type, mask);
                        if (!IS_ERR(tfm))
                                return tfm;

                        /* Allocation failed: give the algorithm back. */
                        crypto_mod_put(alg);
                        err = PTR_ERR(tfm);
                }

                if (err != -EAGAIN)
                        break;

                if (fatal_signal_pending(current)) {
                        err = -EINTR;
                        break;
                }
        }

        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(crypto_alloc_base);

/*
 * Allocate zeroed memory for a frontend context followed by a crypto_tfm.
 *
 * The buffer is laid out as: frontend->tfmsize bytes, then the crypto_tfm,
 * then frontend->extsize(alg) bytes.  The embedded tfm is bound to @alg and
 * @node and starts with a reference count of one.
 *
 * Returns the start of the buffer, or ERR_PTR(-ENOMEM).
 */
static void *crypto_alloc_tfmmem(struct crypto_alg *alg,
                                 const struct crypto_type *frontend, int node,
                                 gfp_t gfp)
{
        unsigned int tfmsize = frontend->tfmsize;
        unsigned int total;
        struct crypto_tfm *tfm;
        char *mem;

        total = tfmsize + sizeof(*tfm) + frontend->extsize(alg);

        mem = kzalloc_node(total, gfp, node);
        if (!mem)
                return ERR_PTR(-ENOMEM);

        /* The tfm lives right after the frontend-specific area. */
        tfm = (struct crypto_tfm *)(mem + tfmsize);
        tfm->__crt_alg = alg;
        tfm->node = node;
        refcount_set(&tfm->refcnt, 1);

        return mem;
}

/*
 * Allocate and initialise a transform for @alg using @frontend, placing the
 * memory on NUMA node @node.
 *
 * The frontend's init_tfm() runs first; alg->cra_init() is then called only
 * if the frontend did not install a tfm->exit handler.  When initialisation
 * fails with -EAGAIN the algorithm is passed to crypto_shoot_alg() before
 * the memory is released.
 *
 * Returns the start of the transform memory, or an error pointer.
 */
void *crypto_create_tfm_node(struct crypto_alg *alg,
                             const struct crypto_type *frontend,
                             int node)
{
        struct crypto_tfm *tfm;
        char *mem;
        int err;

        mem = crypto_alloc_tfmmem(alg, frontend, node, GFP_KERNEL);
        if (IS_ERR(mem))
                return mem;

        tfm = (struct crypto_tfm *)(mem + frontend->tfmsize);
        tfm->fb = tfm;

        err = frontend->init_tfm(tfm);
        if (err)
                goto out_free_tfm;

        if (!tfm->exit && alg->cra_init) {
                err = alg->cra_init(tfm);
                if (err)
                        goto cra_init_failed;
        }

        return mem;

cra_init_failed:
        crypto_exit_ops(tfm);
out_free_tfm:
        if (err == -EAGAIN)
                crypto_shoot_alg(alg);
        kfree(mem);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(crypto_create_tfm_node);

/*
 * Allocate a new transform that shares @otfm's algorithm, NUMA node and
 * crt_flags.  The memory is obtained with GFP_ATOMIC.
 *
 * Takes an extra reference on the algorithm, which is dropped again if the
 * allocation fails.  Returns the start of the new transform memory,
 * ERR_PTR(-ESTALE) if the algorithm reference could not be taken, or the
 * allocation error pointer.
 */
void *crypto_clone_tfm(const struct crypto_type *frontend,
                       struct crypto_tfm *otfm)
{
        struct crypto_alg *alg = otfm->__crt_alg;
        struct crypto_tfm *ntfm;
        char *mem;

        if (unlikely(!crypto_mod_get(alg)))
                return ERR_PTR(-ESTALE);

        mem = crypto_alloc_tfmmem(alg, frontend, otfm->node, GFP_ATOMIC);
        if (IS_ERR(mem)) {
                crypto_mod_put(alg);
                return mem;
        }

        ntfm = (struct crypto_tfm *)(mem + frontend->tfmsize);
        ntfm->crt_flags = otfm->crt_flags;
        ntfm->fb = ntfm;

        return mem;
}
EXPORT_SYMBOL_GPL(crypto_clone_tfm);

/*
 * Look up an algorithm by name, optionally constrained by a frontend.
 *
 * When @frontend is given, @type and @mask are first filtered through
 * frontend->maskclear and then have frontend->type / frontend->maskset
 * OR-ed in before the lookup is performed.
 */
struct crypto_alg *crypto_find_alg(const char *alg_name,
                                   const struct crypto_type *frontend,
                                   u32 type, u32 mask)
{
        if (frontend) {
                type = (type & frontend->maskclear) | frontend->type;
                mask = (mask & frontend->maskclear) | frontend->maskset;
        }

        return crypto_alg_mod_lookup(alg_name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_find_alg);

/*
 *        crypto_alloc_tfm_node - Locate algorithm and allocate transform
 *        @alg_name: Name of algorithm
 *        @frontend: Frontend algorithm type
 *        @type: Type of algorithm
 *        @mask: Mask for type comparison
 *        @node: NUMA node in which users desire to put requests, if node is
 *                NUMA_NO_NODE, it means users have no special requirement.
 *
 *        crypto_alloc_tfm_node() first tries algorithms that are already
 *        loaded, then (when the kernel supports loadable modules) a module
 *        of the same name or alias, and finally asks any loaded crypto
 *        manager to construct the algorithm on the fly.  A reference on the
 *        algorithm is held on behalf of the returned transform.
 *
 *        The whole lookup-and-create sequence is repeated for as long as it
 *        fails with -EAGAIN; a pending fatal signal ends the retrying with
 *        -EINTR.
 *
 *        The returned transform is of a non-determinate type.  Most callers
 *        should prefer a more specific allocator such as
 *        crypto_alloc_skcipher().
 *
 *        Returns the transform, or an error pointer on failure.
 */

void *crypto_alloc_tfm_node(const char *alg_name,
                       const struct crypto_type *frontend, u32 type, u32 mask,
                       int node)
{
        int err;

        do {
                struct crypto_alg *alg;
                void *tfm;

                alg = crypto_find_alg(alg_name, frontend, type, mask);
                if (IS_ERR(alg)) {
                        err = PTR_ERR(alg);
                } else {
                        tfm = crypto_create_tfm_node(alg, frontend, node);
                        if (!IS_ERR(tfm))
                                return tfm;

                        /* Creation failed: give back the lookup reference. */
                        crypto_mod_put(alg);
                        err = PTR_ERR(tfm);
                }

                /* Only -EAGAIN is retried, and only while not being killed. */
                if (err == -EAGAIN && fatal_signal_pending(current))
                        err = -EINTR;
        } while (err == -EAGAIN);

        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(crypto_alloc_tfm_node);

/*
 *        crypto_destroy_tfm - Free crypto transform
 *        @mem: Start of tfm slab
 *        @tfm: Transform to free
 *
 *        Does nothing when @mem is NULL or an error pointer.  Otherwise one
 *        reference on @tfm is dropped; when that was the last one, the exit
 *        hooks run, the reference on the associated algorithm is released and
 *        @mem is freed with kfree_sensitive().
 */
void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
{
        struct crypto_alg *alg;

        /* @tfm is only touched once @mem is known to be a real allocation. */
        if (IS_ERR_OR_NULL(mem) || !refcount_dec_and_test(&tfm->refcnt))
                return;

        alg = tfm->__crt_alg;

        /* cra_exit is used only when no tfm->exit handler is installed. */
        if (!tfm->exit && alg->cra_exit)
                alg->cra_exit(tfm);

        crypto_exit_ops(tfm);
        crypto_mod_put(alg);
        kfree_sensitive(mem);
}
EXPORT_SYMBOL_GPL(crypto_destroy_tfm);

/*
 * Report whether an algorithm matching @name/@type/@mask can be looked up.
 *
 * Returns 1 if the lookup succeeds (the reference it took is dropped again
 * immediately), 0 otherwise.
 */
int crypto_has_alg(const char *name, u32 type, u32 mask)
{
        struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask);

        if (IS_ERR(alg))
                return 0;

        crypto_mod_put(alg);
        return 1;
}
EXPORT_SYMBOL_GPL(crypto_has_alg);

/*
 * Completion callback that records the result in a struct crypto_wait.
 *
 * @data points at the crypto_wait.  -EINPROGRESS notifications are ignored;
 * any other @err is stored and the waiter is completed.
 */
void crypto_req_done(void *data, int err)
{
        struct crypto_wait *wait = data;

        if (err != -EINPROGRESS) {
                wait->err = err;
                complete(&wait->completion);
        }
}
EXPORT_SYMBOL_GPL(crypto_req_done);

void crypto_destroy_alg(struct crypto_alg *alg)
{
        if (alg->cra_type && alg->cra_type->destroy)
                alg->cra_type->destroy(alg);
        if (alg->cra_destroy)
                alg->cra_destroy(alg);
}
EXPORT_SYMBOL_GPL(crypto_destroy_alg);

/*
 * Duplicate the first @total bytes of @req into freshly allocated memory.
 *
 * On success the copy is returned with CRYPTO_TFM_REQ_ON_STACK cleared.
 * If the duplication fails, @req itself is returned after its tfm has been
 * switched to that tfm's ->fb transform.
 */
struct crypto_async_request *crypto_request_clone(
        struct crypto_async_request *req, size_t total, gfp_t gfp)
{
        struct crypto_async_request *nreq = kmemdup(req, total, gfp);

        if (nreq) {
                nreq->flags &= ~CRYPTO_TFM_REQ_ON_STACK;
                return nreq;
        }

        /* No memory for a copy: keep using the original request. */
        req->tfm = req->tfm->fb;
        return req;
}
EXPORT_SYMBOL_GPL(crypto_request_clone);

/* Module metadata for the core crypto API. */
MODULE_DESCRIPTION("Cryptographic core API");
MODULE_LICENSE("GPL");





























































    2 













    3 






    2 



    3 







    3 

    2 
    3 

    3 



    2 




    2 



    2 













    1 

    2 


    2 

    1 












    2 




    2 





    2 


    2 



    2 






    1 
    2 

    2 
    2 


    2 
































































































    2 




    2 






    2 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C)2003,2004 USAGI/WIDE Project
 *
 * Authors        Mitsuru KANDA  <mk@linux-ipv6.org>
 *                YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 *
 * Based on net/ipv4/xfrm4_tunnel.c
 */
#include <linux/module.h>
#include <linux/xfrm.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ipv6.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/mutex.h>
#include <net/netns/generic.h>

/* Number of buckets in the per-netns by-address and by-SPI hash tables. */
#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256

/* Inclusive range of SPI values the allocator hands out. */
#define XFRM6_TUNNEL_SPI_MIN        1
#define XFRM6_TUNNEL_SPI_MAX        0xffffffff

/*
 * Per network namespace state: the same set of xfrm6_tunnel_spi entries is
 * indexed twice, once by source address and once by SPI value.
 */
struct xfrm6_tunnel_net {
        /* Buckets keyed by xfrm6_tunnel_spi_hash_byaddr(). */
        struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE];
        /* Buckets keyed by xfrm6_tunnel_spi_hash_byspi(). */
        struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE];
        /* Most recently chosen SPI; the next search starts after it. */
        u32 spi;
};

/* Id under which this module's per-netns data is registered. */
static unsigned int xfrm6_tunnel_net_id __read_mostly;

/* Fetch this module's private state for the given network namespace. */
static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net)
{
        struct xfrm6_tunnel_net *xfrm6_tn;

        xfrm6_tn = net_generic(net, xfrm6_tunnel_net_id);

        return xfrm6_tn;
}

/*
 * xfrm_tunnel_spi things are for allocating unique id ("spi")
 * per xfrm_address_t.
 *
 * Each entry is linked into both per-netns hash tables and is reference
 * counted; it is freed through RCU once the last reference is dropped.
 */
struct xfrm6_tunnel_spi {
        struct hlist_node        list_byaddr;        /* link in spi_byaddr[] */
        struct hlist_node        list_byspi;        /* link in spi_byspi[] */
        xfrm_address_t                addr;                /* address the SPI belongs to */
        u32                        spi;                /* SPI, host byte order */
        refcount_t                refcnt;                /* users of this mapping */
        struct rcu_head                rcu_head;        /* for deferred free via call_rcu() */
};

/* Taken with BHs disabled around SPI allocation and release. */
static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock);

/* Slab cache that struct xfrm6_tunnel_spi entries are allocated from. */
static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly;

/*
 * Map an address to a bucket index of the by-address table: the IPv6 address
 * hash is folded down (>>16, then >>8) and masked to the table size.
 */
static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr)
{
        unsigned int hash = ipv6_addr_hash((const struct in6_addr *)addr);

        hash ^= hash >> 16;
        hash ^= hash >> 8;

        return hash & (XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1);
}

/* Map an SPI to a bucket index of the by-SPI table (SPI modulo table size). */
static inline unsigned int xfrm6_tunnel_spi_hash_byspi(u32 spi)
{
        unsigned int bucket = spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE;

        return bucket;
}

/*
 * Find the entry registered for @saddr in @net's by-address table.
 *
 * The list is walked with the RCU iterator; holding xfrm6_tunnel_spi_lock is
 * declared to lockdep as an acceptable alternative to an RCU read section.
 *
 * Returns the entry, or NULL if the address has none.
 */
static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr)
{
        struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
        struct hlist_head *bucket;
        struct xfrm6_tunnel_spi *entry;

        bucket = &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)];

        hlist_for_each_entry_rcu(entry, bucket, list_byaddr,
                                 lockdep_is_held(&xfrm6_tunnel_spi_lock)) {
                if (xfrm6_addr_equal(&entry->addr, saddr))
                        return entry;
        }

        return NULL;
}

__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr)
{
        struct xfrm6_tunnel_spi *x6spi;
        u32 spi;

        rcu_read_lock_bh();
        x6spi = __xfrm6_tunnel_spi_lookup(net, saddr);
        spi = x6spi ? x6spi->spi : 0;
        rcu_read_unlock_bh();
        return htonl(spi);
}
EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup);

/*
 * Check whether @spi is already in use in @net.
 *
 * Returns -1 if an entry with this SPI exists, otherwise the index of the
 * by-SPI bucket a new entry for @spi belongs in.
 */
static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi)
{
        struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
        int bucket = xfrm6_tunnel_spi_hash_byspi(spi);
        struct hlist_head *head = &xfrm6_tn->spi_byspi[bucket];
        struct xfrm6_tunnel_spi *entry;

        hlist_for_each_entry(entry, head, list_byspi) {
                if (entry->spi == spi)
                        return -1;
        }

        return bucket;
}

/*
 * Pick an unused SPI for @saddr and record the mapping in both hash tables.
 *
 * The search starts just after the most recently chosen SPI, runs up to
 * XFRM6_TUNNEL_SPI_MAX and then wraps around to XFRM6_TUNNEL_SPI_MIN.
 *
 * The only caller in this file, xfrm6_tunnel_alloc_spi(), holds
 * xfrm6_tunnel_spi_lock around this call.
 *
 * Returns the new SPI in host byte order, or 0 if every SPI is in use or the
 * entry could not be allocated.
 */
static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr)
{
        struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
        u32 spi;
        struct xfrm6_tunnel_spi *x6spi;
        int index;

        /* Advance the cursor, restarting at MIN when it leaves the range. */
        if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN ||
            xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX)
                xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN;
        else
                xfrm6_tn->spi++;

        /* First pass: from the cursor up to and including MAX. */
        for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) {
                index = __xfrm6_tunnel_spi_check(net, spi);
                if (index >= 0)
                        goto alloc_spi;

                /* Explicit exit: spi++ would wrap a u32 past MAX. */
                if (spi == XFRM6_TUNNEL_SPI_MAX)
                        break;
        }
        /* Second pass: wrap around and scan from MIN up to the cursor. */
        for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) {
                index = __xfrm6_tunnel_spi_check(net, spi);
                if (index >= 0)
                        goto alloc_spi;
        }
        spi = 0;
        goto out;
alloc_spi:
        xfrm6_tn->spi = spi;
        x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC);
        if (!x6spi) {
                /*
                 * Nothing was inserted, so report failure rather than an
                 * SPI that no later lookup would be able to find.
                 */
                spi = 0;
                goto out;
        }

        memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr));
        x6spi->spi = spi;
        refcount_set(&x6spi->refcnt, 1);

        hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]);

        index = xfrm6_tunnel_spi_hash_byaddr(saddr);
        hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]);
out:
        return spi;
}

/*
 * Get an SPI for @saddr in @net.
 *
 * If the address already has an entry, its reference count is raised and its
 * SPI reused; otherwise a new SPI is allocated.  Everything happens under
 * xfrm6_tunnel_spi_lock with BHs disabled.
 *
 * Returns the SPI in network byte order.
 */
__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr)
{
        struct xfrm6_tunnel_spi *entry;
        u32 spi;

        spin_lock_bh(&xfrm6_tunnel_spi_lock);

        entry = __xfrm6_tunnel_spi_lookup(net, saddr);
        if (!entry) {
                spi = __xfrm6_tunnel_alloc_spi(net, saddr);
        } else {
                refcount_inc(&entry->refcnt);
                spi = entry->spi;
        }

        spin_unlock_bh(&xfrm6_tunnel_spi_lock);

        return htonl(spi);
}
EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi);

static void x6spi_destroy_rcu(struct rcu_head *head)
{
        kmem_cache_free(xfrm6_tunnel_spi_kmem,
                        container_of(head, struct xfrm6_tunnel_spi, rcu_head));
}

/*
 * Drop one reference on the entry registered for @saddr in @net.
 *
 * When the last reference goes away the entry is unlinked from both hash
 * tables and handed to call_rcu() for freeing.  Runs under
 * xfrm6_tunnel_spi_lock with BHs disabled.
 */
static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
{
        struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
        struct xfrm6_tunnel_spi *entry;
        struct hlist_head *bucket;
        struct hlist_node *next;

        spin_lock_bh(&xfrm6_tunnel_spi_lock);

        bucket = &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)];
        hlist_for_each_entry_safe(entry, next, bucket, list_byaddr) {
                if (!xfrm6_addr_equal(&entry->addr, saddr))
                        continue;

                if (!refcount_dec_and_test(&entry->refcnt))
                        continue;

                hlist_del_rcu(&entry->list_byaddr);
                hlist_del_rcu(&entry->list_byspi);
                call_rcu(&entry->rcu_head, x6spi_destroy_rcu);
                break;
        }

        spin_unlock_bh(&xfrm6_tunnel_spi_lock);
}

/*
 * xfrm_type output hook: push the skb data pointer by the negated network
 * header offset.  Always returns 0.
 */
static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
{
        int nh_offset = skb_network_offset(skb);

        skb_push(skb, -nh_offset);

        return 0;
}

/*
 * xfrm_type input hook: return the byte of the network header found at
 * offset IP6CB(skb)->nhoff.
 */
static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
{
        const unsigned char *nh = skb_network_header(skb);

        return nh[IP6CB(skb)->nhoff];
}

/*
 * Tunnel receive handler: look up the SPI registered for the packet's IPv6
 * source address in the receiving device's namespace and pass the packet to
 * xfrm6_rcv_spi() as IPPROTO_IPV6.
 */
static int xfrm6_tunnel_rcv(struct sk_buff *skb)
{
        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        __be32 spi = xfrm6_tunnel_spi_lookup(net,
                                             (const xfrm_address_t *)&ip6h->saddr);

        return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL);
}

/*
 * ICMPv6 error handler for the tunnel.
 *
 * The switch enumerates the error types and codes the handler recognises,
 * but none of them triggers any action: the function always returns 0.
 */
static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                            u8 type, u8 code, int offset, __be32 info)
{
        /* xfrm6_tunnel native err handling */
        switch (type) {
        case ICMPV6_DEST_UNREACH:
                switch (code) {
                case ICMPV6_NOROUTE:
                case ICMPV6_ADM_PROHIBITED:
                case ICMPV6_NOT_NEIGHBOUR:
                case ICMPV6_ADDR_UNREACH:
                case ICMPV6_PORT_UNREACH:
                default:
                        break;
                }
                break;

        case ICMPV6_TIME_EXCEED:
                switch (code) {
                case ICMPV6_EXC_HOPLIMIT:
                case ICMPV6_EXC_FRAGTIME:
                default:
                        break;
                }
                break;

        case ICMPV6_PARAMPROB:
                switch (code) {
                case ICMPV6_HDR_FIELD:
                case ICMPV6_UNK_NEXTHDR:
                case ICMPV6_UNK_OPTION:
                default:
                        break;
                }
                break;

        case ICMPV6_PKT_TOOBIG:
        default:
                break;
        }

        return 0;
}

static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
        if (x->props.mode != XFRM_MODE_TUNNEL) {
                NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode");
                return -EINVAL;
        }

        if (x->encap) {
                NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation");
                return -EINVAL;
        }

        x->props.header_len = sizeof(struct ipv6hdr);

        return 0;
}

static void xfrm6_tunnel_destroy(struct xfrm_state *x)
{
        struct net *net = xs_net(x);

        xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr);
}

/* xfrm type descriptor for IPPROTO_IPV6; registered for AF_INET6 at init. */
static const struct xfrm_type xfrm6_tunnel_type = {
        .owner          = THIS_MODULE,
        .proto                = IPPROTO_IPV6,
        .init_state        = xfrm6_tunnel_init_state,
        .destructor        = xfrm6_tunnel_destroy,
        .input                = xfrm6_tunnel_input,
        .output                = xfrm6_tunnel_output,
};

/* Tunnel handler registered for AF_INET6 in xfrm6_tunnel_init(). */
static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
        .handler        = xfrm6_tunnel_rcv,
        .err_handler        = xfrm6_tunnel_err,
        .priority        = 3,
};

/*
 * Tunnel handler registered for AF_INET in xfrm6_tunnel_init(); it uses the
 * same callbacks and priority as xfrm6_tunnel_handler.
 */
static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
        .handler        = xfrm6_tunnel_rcv,
        .err_handler        = xfrm6_tunnel_err,
        .priority        = 3,
};

/*
 * Per-netns constructor: empty both hash tables and reset the SPI cursor.
 * Always returns 0.
 */
static int __net_init xfrm6_tunnel_net_init(struct net *net)
{
        struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
        struct hlist_head *head;

        for (head = xfrm6_tn->spi_byaddr;
             head < xfrm6_tn->spi_byaddr + XFRM6_TUNNEL_SPI_BYADDR_HSIZE;
             head++)
                INIT_HLIST_HEAD(head);

        for (head = xfrm6_tn->spi_byspi;
             head < xfrm6_tn->spi_byspi + XFRM6_TUNNEL_SPI_BYSPI_HSIZE;
             head++)
                INIT_HLIST_HEAD(head);

        xfrm6_tn->spi = 0;

        return 0;
}

/*
 * Per-netns destructor: flush the namespace's xfrm states and the xfrm GC,
 * then warn (once per table) if any SPI entry is still linked.
 */
static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
{
        struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
        struct hlist_head *head;

        xfrm_state_flush(net, 0, false);
        xfrm_flush_gc();

        for (head = xfrm6_tn->spi_byaddr;
             head < xfrm6_tn->spi_byaddr + XFRM6_TUNNEL_SPI_BYADDR_HSIZE;
             head++)
                WARN_ON_ONCE(!hlist_empty(head));

        for (head = xfrm6_tn->spi_byspi;
             head < xfrm6_tn->spi_byspi + XFRM6_TUNNEL_SPI_BYSPI_HSIZE;
             head++)
                WARN_ON_ONCE(!hlist_empty(head));
}

/*
 * Per-netns registration: .id/.size describe the struct xfrm6_tunnel_net
 * area that xfrm6_tunnel_pernet() looks up via xfrm6_tunnel_net_id.
 */
static struct pernet_operations xfrm6_tunnel_net_ops = {
        .init        = xfrm6_tunnel_net_init,
        .exit        = xfrm6_tunnel_net_exit,
        .id        = &xfrm6_tunnel_net_id,
        .size        = sizeof(struct xfrm6_tunnel_net),
};

/*
 * Module init: create the SPI slab cache, then register the per-netns ops,
 * the xfrm type and the AF_INET6 / AF_INET tunnel handlers, in that order.
 *
 * A failing step undoes every earlier one in reverse order.  Returns 0 on
 * success, -ENOMEM if the cache cannot be created, or the negative value of
 * the failing registration.
 */
static int __init xfrm6_tunnel_init(void)
{
        int err;

        xfrm6_tunnel_spi_kmem = KMEM_CACHE(xfrm6_tunnel_spi, SLAB_HWCACHE_ALIGN);
        if (!xfrm6_tunnel_spi_kmem)
                return -ENOMEM;

        err = register_pernet_subsys(&xfrm6_tunnel_net_ops);
        if (err < 0)
                goto destroy_cache;

        err = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6);
        if (err < 0)
                goto unregister_pernet;

        err = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6);
        if (err < 0)
                goto unregister_type;

        err = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET);
        if (err < 0)
                goto deregister_v6_handler;

        return 0;

deregister_v6_handler:
        xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6);
unregister_type:
        xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
unregister_pernet:
        unregister_pernet_subsys(&xfrm6_tunnel_net_ops);
destroy_cache:
        kmem_cache_destroy(xfrm6_tunnel_spi_kmem);
        return err;
}

/*
 * Module exit: undo xfrm6_tunnel_init() in reverse order of registration,
 * then destroy the SPI slab cache.
 */
static void __exit xfrm6_tunnel_fini(void)
{
        xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET);
        xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6);
        xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
        unregister_pernet_subsys(&xfrm6_tunnel_net_ops);
        /* Entries are freed through call_rcu(); wait for every pending
         * callback to finish before destroying the cache they free into.
         */
        rcu_barrier();
        kmem_cache_destroy(xfrm6_tunnel_spi_kmem);
}

/* Module entry/exit points and metadata. */
module_init(xfrm6_tunnel_init);
module_exit(xfrm6_tunnel_fini);
MODULE_DESCRIPTION("IPv6 XFRM tunnel driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6);































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NETFILTER_H
#define __LINUX_NETFILTER_H

#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/if.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/static_key.h>
#include <linux/module.h>
#include <linux/netfilter_defs.h>
#include <linux/netdevice.h>
#include <linux/sockptr.h>
#include <net/net_namespace.h>

/*
 * Extract the value stored above the low NF_VERDICT_QBITS bits of @verdict
 * and return it negated.
 */
static inline int NF_DROP_GETERR(int verdict)
{
        int upper = verdict >> NF_VERDICT_QBITS;

        return -upper;
}

/*
 * Free @skb with drop reason @reason and build the verdict to return from
 * the hook: @err in the bits above bit 15, NF_STOLEN in the low bits.
 *
 * @err is checked at build time to fit in 16 bits, which is why the function
 * is forced inline.
 */
static __always_inline int
NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err)
{
        BUILD_BUG_ON(err > 0xffff);

        kfree_skb_reason(skb, reason);

        return ((err << 16) | NF_STOLEN);
}

/*
 * Compare two addresses word by word.
 *
 * Returns 1 when all four 32-bit words of @a1 and @a2 are equal, 0
 * otherwise.  On 64-bit machines with efficient unaligned access the
 * comparison is done as two unsigned long words instead.
 */
static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
                                   const union nf_inet_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *lhs = (const unsigned long *)a1;
        const unsigned long *rhs = (const unsigned long *)a2;
        unsigned long diff;

        diff = lhs[0] ^ rhs[0];
        diff |= lhs[1] ^ rhs[1];

        return diff == 0UL;
#else
        unsigned int i;

        for (i = 0; i < 4; i++) {
                if (a1->all[i] != a2->all[i])
                        return 0;
        }

        return 1;
#endif
}

/*
 * Store the bitwise AND of @a1 and @mask in @result.
 *
 * All four 32-bit words are processed; on 64-bit machines with efficient
 * unaligned access this is done as two unsigned long words instead.
 */
static inline void nf_inet_addr_mask(const union nf_inet_addr *a1,
                                     union nf_inet_addr *result,
                                     const union nf_inet_addr *mask)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *src = (const unsigned long *)a1;
        const unsigned long *msk = (const unsigned long *)mask;
        unsigned long *dst = (unsigned long *)result;

        dst[0] = src[0] & msk[0];
        dst[1] = src[1] & msk[1];
#else
        unsigned int i;

        for (i = 0; i < 4; i++)
                result->all[i] = a1->all[i] & mask->all[i];
#endif
}

int netfilter_init(void);

/* Forward declarations for types used only through pointers below. */
struct sk_buff;

struct nf_hook_ops;

struct sock;

/*
 * Per-invocation context passed to every hook function; filled in by
 * nf_hook_state_init().
 */
struct nf_hook_state {
        u8 hook;                        /* hook number being run */
        u8 pf;                                /* protocol family */
        struct net_device *in;                /* input device */
        struct net_device *out;                /* output device */
        struct sock *sk;
        struct net *net;
        /* continuation the caller runs once the hooks let the packet pass */
        int (*okfn)(struct net *, struct sock *, struct sk_buff *);
};

/*
 * Signature of a hook function: receives the private pointer it was
 * registered with, the packet and the invocation state.
 */
typedef unsigned int nf_hookfn(void *priv,
                               struct sk_buff *skb,
                               const struct nf_hook_state *state);
/* Tag stored in nf_hook_ops::hook_ops_type. */
enum nf_hook_ops_type {
        NF_HOOK_OP_UNDEFINED,
        NF_HOOK_OP_NF_TABLES,
        NF_HOOK_OP_BPF,
        NF_HOOK_OP_NFT_FT,
};

/*
 * Description of one hook registration.  Passed to
 * nf_register_net_hook() / nf_unregister_net_hook().
 */
struct nf_hook_ops {
        struct list_head        list;
        struct rcu_head                rcu;

        /* User fills in from here down. */
        nf_hookfn                *hook;                /* function to call */
        struct net_device        *dev;
        void                        *priv;                /* passed back to ->hook */
        u8                        pf;                /* protocol family */
        enum nf_hook_ops_type        hook_ops_type:8;
        unsigned int                hooknum;
        /* Hooks are ordered in ascending priority. */
        int                        priority;
};

/* One callable hook: the function and the private pointer it is given. */
struct nf_hook_entry {
        nf_hookfn                        *hook;
        void                                *priv;
};

/*
 * Trailer placed at the end of an nf_hook_entries allocation and used when
 * freeing it via call_rcu (see the comment in struct nf_hook_entries).
 */
struct nf_hook_entries_rcu_head {
        struct rcu_head head;
        void        *allocation;
};

/*
 * Array of hook entries for one hook point, followed in the same allocation
 * by slow-path data described below.
 */
struct nf_hook_entries {
        u16                                num_hook_entries;
        /* padding */
        struct nf_hook_entry                hooks[];

        /* trailer: pointers to original orig_ops of each hook,
         * followed by rcu_head and scratch space used for freeing
         * the structure via call_rcu.
         *
         *   This is not part of struct nf_hook_entry since it is only
         *   needed in slow path (hook register/unregister):
         * const struct nf_hook_ops     *orig_ops[]
         *
         *   For the same reason, we store this at end -- it is
         *   only needed when a hook is deleted, not during
         *   packet path processing:
         * struct nf_hook_entries_rcu_head     head
         */
};

#ifdef CONFIG_NETFILTER
/*
 * Return the array of nf_hook_ops pointers stored immediately after the
 * last element of e->hooks[].
 */
static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e)
{
        /* One past the final hook entry is where the ops pointers begin. */
        const struct nf_hook_entry *past_hooks = &e->hooks[e->num_hook_entries];

        return (struct nf_hook_ops **)past_hooks;
}

static inline int
nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
                     struct nf_hook_state *state)
{
        return entry->hook(entry->priv, skb, state);
}

/* Fill every field of @p from the given arguments. */
static inline void nf_hook_state_init(struct nf_hook_state *p,
                                      unsigned int hook,
                                      u_int8_t pf,
                                      struct net_device *indev,
                                      struct net_device *outdev,
                                      struct sock *sk,
                                      struct net *net,
                                      int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        *p = (struct nf_hook_state) {
                .hook        = hook,
                .pf        = pf,
                .in        = indev,
                .out        = outdev,
                .sk        = sk,
                .net        = net,
                .okfn        = okfn,
        };
}



/*
 * Handlers for a range of get/setsockopt option numbers; registered with
 * nf_register_sockopt().
 */
struct nf_sockopt_ops {
        struct list_head list;

        u_int8_t pf;

        /* Non-inclusive ranges: use 0/0/NULL to never get called. */
        int set_optmin;
        int set_optmax;
        int (*set)(struct sock *sk, int optval, sockptr_t arg,
                   unsigned int len);
        int get_optmin;
        int get_optmax;
        int (*get)(struct sock *sk, int optval, void __user *user, int *len);
        /* Use the module struct to lock set/get code in place */
        struct module *owner;
};

/* Functions to register/unregister hook points, singly or as an array of n. */
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops);
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops);
int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
                          unsigned int n);
void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
                             unsigned int n);

/* Functions to register get/setsockopt ranges (non-inclusive).  You
   need to check permissions yourself! */
int nf_register_sockopt(struct nf_sockopt_ops *reg);
void nf_unregister_sockopt(struct nf_sockopt_ops *reg);

#ifdef CONFIG_JUMP_LABEL
/* Static keys that let nf_hook() return early for a given family/hook. */
extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
#endif

/* Runs the entries in @e on @skb, starting at index @i; called by nf_hook(). */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
                 const struct nf_hook_entries *e, unsigned int i);

/* List variant of nf_hook_slow(): takes a list head instead of a single skb. */
void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
                       const struct nf_hook_entries *e);
/**
 *        nf_hook - call a netfilter hook
 *        @pf: protocol family (NFPROTO_*); selects the per-netns hook table
 *        @hook: hook number within that family
 *        @net: network namespace whose hooks are run
 *        @sk: socket stored in the hook state
 *        @skb: packet handed to the hooks
 *        @indev: input device stored in the hook state
 *        @outdev: output device stored in the hook state
 *        @okfn: continuation stored in the hook state
 *
 *        Returns 1 if the hook has allowed the packet to pass.  The function
 *        okfn must be invoked by the caller in this case.  Any other return
 *        value indicates the packet has been consumed by the hook.
 *
 *        1 is also returned when no entries are installed for the hook, when
 *        the family's table is compiled out, or for an unknown @pf (which
 *        additionally triggers a one-time warning).
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
                          struct sock *sk, struct sk_buff *skb,
                          struct net_device *indev, struct net_device *outdev,
                          int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        struct nf_hook_entries *hook_head = NULL;
        int ret = 1;

#ifdef CONFIG_JUMP_LABEL
        /*
         * Early exit through the static key; only usable when both the
         * family and the hook number are compile-time constants.
         */
        if (__builtin_constant_p(pf) &&
            __builtin_constant_p(hook) &&
            !static_key_false(&nf_hooks_needed[pf][hook]))
                return 1;
#endif

        /* The hook arrays are RCU-protected; hold the read lock while used. */
        rcu_read_lock();
        switch (pf) {
        case NFPROTO_IPV4:
                hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
                break;
        case NFPROTO_IPV6:
                hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
                break;
        case NFPROTO_ARP:
#ifdef CONFIG_NETFILTER_FAMILY_ARP
                /* Out-of-range hook numbers are rejected with a warning. */
                if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp)))
                        break;
                hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
#endif
                break;
        case NFPROTO_BRIDGE:
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
                hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
#endif
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        if (hook_head) {
                struct nf_hook_state state;

                nf_hook_state_init(&state, hook, pf, indev, outdev,
                                   sk, net, okfn);

                /* Run the entries starting from the first one. */
                ret = nf_hook_slow(skb, &state, hook_head, 0);
        }
        rcu_read_unlock();

        return ret;
}

/* Activate hook; either okfn or kfree_skb called, unless a hook
   returns NF_STOLEN (in which case, it's up to the hook to deal with
   the consequences).

   Returns -ERRNO if packet dropped.  Zero means queued, stolen or
   accepted.
*/

/* RR:
   > I don't want nf_hook to return anything because people might forget
   > about async and trust the return value to mean "packet was ok".

   AK:
   Just document it clearly, then you can expect some sense from kernel
   coders :)
*/

/*
 * Conditionally run the hooks, then okfn.  When @cond is false the hooks are
 * skipped and okfn is called directly; otherwise okfn is called only if
 * nf_hook() returns 1, and any other nf_hook() result is returned as is.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct sk_buff *skb, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *),
             bool cond)
{
        if (cond) {
                int verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);

                if (verdict != 1)
                        return verdict;
        }

        return okfn(net, sk, skb);
}

/*
 * Run the hooks for (@pf, @hook) and, if nf_hook() returns 1, continue with
 * okfn and return its result.  Any other nf_hook() result is returned
 * unchanged and okfn is not called here.
 */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb,
        struct net_device *in, struct net_device *out,
        int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        int verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);

        if (verdict != 1)
                return verdict;

        return okfn(net, sk, skb);
}

/*
 * List variant of the hook entry point: hands @head to nf_hook_slow_list()
 * when hook entries exist for (@pf, @hook).  Only NFPROTO_IPV4 and
 * NFPROTO_IPV6 are handled; any other family triggers a one-time warning.
 * okfn is only recorded in the hook state, it is not called directly here.
 */
static inline void
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct list_head *head, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        struct nf_hook_entries *entries = NULL;
        struct nf_hook_state state;

#ifdef CONFIG_JUMP_LABEL
        /* Constant pf/hook: the static key lets us skip the lookup entirely. */
        if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) {
                if (!static_key_false(&nf_hooks_needed[pf][hook]))
                        return;
        }
#endif

        rcu_read_lock();

        switch (pf) {
        case NFPROTO_IPV4:
                entries = rcu_dereference(net->nf.hooks_ipv4[hook]);
                break;
        case NFPROTO_IPV6:
                entries = rcu_dereference(net->nf.hooks_ipv6[hook]);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        if (entries) {
                nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn);
                nf_hook_slow_list(head, &state, entries);
        }

        rcu_read_unlock();
}

/* Call setsockopt() */
int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt,
                  unsigned int len);
int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
                  int *len);

struct flowi;
struct nf_queue_entry;

__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
                    unsigned int dataoff, u_int8_t protocol,
                    unsigned short family);

__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
                            unsigned int dataoff, unsigned int len,
                            u_int8_t protocol, unsigned short family);
int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
             bool strict, unsigned short family);

#include <net/flow.h>

struct nf_conn;
enum nf_nat_manip_type;
struct nlattr;

/*
 * NAT callbacks reached through the RCU-protected nf_nat_hook pointer.
 * Within this header only decode_session is used, by
 * nf_nat_decode_session(), which checks it for NULL before calling it.
 */
struct nf_nat_hook {
        int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
                               const struct nlattr *attr);
        void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
        void (*remove_nat_bysrc)(struct nf_conn *ct);
};

/* Current NAT callback table; readers treat it as possibly NULL. */
extern const struct nf_nat_hook __rcu *nf_nat_hook;

/*
 * Invoke the NAT decode_session callback for @skb/@fl, if NAT support is
 * enabled and a callback is currently installed.  @family is not used.
 */
static inline void
nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
{
#if IS_ENABLED(CONFIG_NF_NAT)
        const struct nf_nat_hook *ops;

        rcu_read_lock();
        ops = rcu_dereference(nf_nat_hook);
        if (!ops || !ops->decode_session)
                goto out_unlock;

        ops->decode_session(skb, fl);
out_unlock:
        rcu_read_unlock();
#endif
}

#else /* !CONFIG_NETFILTER */
/*
 * Netfilter compiled out: no hooks exist, so okfn is always called and its
 * result returned.  @pf, @hook, @in, @out and @cond are ignored.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct sk_buff *skb, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *),
             bool cond)
{
        return okfn(net, sk, skb);
}

/*
 * Netfilter compiled out: call okfn directly and return its result.
 * @pf, @hook, @in and @out are ignored.
 */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
        struct sk_buff *skb, struct net_device *in, struct net_device *out,
        int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        return okfn(net, sk, skb);
}

/*
 * Netfilter compiled out: @head is left untouched and okfn is not called.
 */
static inline void
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct list_head *head, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        /* nothing to do */
}

/*
 * Netfilter compiled out: always returns 1, the "packet may pass, caller
 * invokes okfn" value of the real nf_hook().  All arguments are ignored.
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
                          struct sock *sk, struct sk_buff *skb,
                          struct net_device *indev, struct net_device *outdev,
                          int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        return 1;
}
struct flowi;
/* Netfilter compiled out: nothing to decode, all arguments are ignored. */
static inline void
nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
{
}
#endif /*CONFIG_NETFILTER*/

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_zones_common.h>

/* Conntrack helpers; only declared here, implemented outside this header. */
void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
void nf_ct_set_closing(struct nf_conntrack *nfct);
struct nf_conntrack_tuple;
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
                         const struct sk_buff *skb);
#else
/*
 * Conntrack disabled: no-op fallbacks.  The parameter types mirror the real
 * prototypes above (note the const source skb of nf_ct_attach()), so that
 * callers build without qualifier warnings in either configuration.
 */
static inline void nf_ct_attach(struct sk_buff *new,
                                const struct sk_buff *skb) {}
static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {}
struct nf_conntrack_tuple;
/* Always reports failure (false); @dst_tuple is not written. */
static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
                                       const struct sk_buff *skb)
{
        return false;
}
#endif

struct nf_conn;
enum ip_conntrack_info;

/*
 * Table of conntrack callbacks published through the __rcu pointer
 * nf_ct_hook declared below.
 * NOTE(review): presumably installed by the conntrack core when it is
 * available and NULL otherwise; per-callback semantics are not visible in
 * this header -- confirm against the implementation.
 */
struct nf_ct_hook {
        int (*update)(struct net *net, struct sk_buff *skb);
        void (*destroy)(struct nf_conntrack *);
        bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
                              const struct sk_buff *);
        void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
        void (*set_closing)(struct nf_conntrack *nfct);
        int (*confirm)(struct sk_buff *skb);
        u32 (*get_id)(const struct nf_conntrack *nfct);
};
extern const struct nf_ct_hook __rcu *nf_ct_hook;

struct nlattr;

/*
 * Table of callbacks operating on a struct nf_conn together with netlink
 * attributes or skbs, published through the __rcu pointer nfnl_ct_hook
 * declared below.
 * NOTE(review): presumably the conntrack glue used by nfnetlink
 * subsystems; per-callback semantics are not visible here -- confirm.
 */
struct nfnl_ct_hook {
        size_t (*build_size)(const struct nf_conn *ct);
        int (*build)(struct sk_buff *skb, struct nf_conn *ct,
                     enum ip_conntrack_info ctinfo,
                     u_int16_t ct_attr, u_int16_t ct_info_attr);
        int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
        int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct,
                             u32 portid, u32 report);
        void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
                           enum ip_conntrack_info ctinfo, s32 off);
};
extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook;

/*
 * Per-family defragmentation hook: the owning module plus enable/disable
 * callbacks that take a network namespace.
 * NOTE(review): presumably @owner is used to pin the module while the
 * callbacks are in use -- confirm against the users.
 */
struct nf_defrag_hook {
        struct module *owner;
        int (*enable)(struct net *net);
        void (*disable)(struct net *net);
};

/* One __rcu-published hook pointer each for the v4 and v6 variants. */
extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook;
extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook;

/*
 * Contains bitmask of ctnetlink event subscribers, if any.
 * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag.
 */
extern u8 nf_ctnetlink_has_listener;
#endif /*__LINUX_NETFILTER_H*/




























































    1 












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AEAD: Authenticated Encryption with Associated Data
 * 
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_INTERNAL_AEAD_H
#define _CRYPTO_INTERNAL_AEAD_H

#include <crypto/aead.h>
#include <crypto/algapi.h>
#include <linux/stddef.h>
#include <linux/types.h>

struct rtattr;

/*
 * An AEAD instance.  The anonymous union overlays two views of the same
 * storage: @alg, the AEAD algorithm, and @s, in which @s.head pads @s.base
 * out to offsetof(struct aead_alg, base) so that the generic
 * crypto_instance starts exactly where alg.base does.
 * aead_crypto_instance() and aead_instance() convert between the views.
 *
 * @free: callback taking the instance.
 *        NOTE(review): presumably releases it -- confirm with the users.
 */
struct aead_instance {
        void (*free)(struct aead_instance *inst);
        union {
                struct {
                        char head[offsetof(struct aead_alg, base)];
                        struct crypto_instance base;
                } s;
                struct aead_alg alg;
        };
};

/* AEAD-typed wrapper around a generic struct crypto_spawn. */
struct crypto_aead_spawn {
        struct crypto_spawn base;
};

/* AEAD-typed wrapper around a generic struct crypto_queue. */
struct aead_queue {
        struct crypto_queue base;
};

/* Return crypto_tfm_ctx() of the base tfm embedded in @tfm. */
static inline void *crypto_aead_ctx(struct crypto_aead *tfm)
{
        return crypto_tfm_ctx(&tfm->base);
}

/* Return crypto_tfm_ctx_dma() of the base tfm embedded in @tfm. */
static inline void *crypto_aead_ctx_dma(struct crypto_aead *tfm)
{
        return crypto_tfm_ctx_dma(&tfm->base);
}

/*
 * Return the struct crypto_instance whose 'alg' member is @inst->alg.base,
 * i.e. the generic-instance view of @inst (see the union in
 * struct aead_instance).
 */
static inline struct crypto_instance *aead_crypto_instance(
        struct aead_instance *inst)
{
        return container_of(&inst->alg.base, struct crypto_instance, alg);
}

/*
 * Inverse of aead_crypto_instance(): return the struct aead_instance whose
 * alg.base is @inst->alg.
 */
static inline struct aead_instance *aead_instance(struct crypto_instance *inst)
{
        return container_of(&inst->alg, struct aead_instance, alg.base);
}

/*
 * Return the AEAD instance behind @aead: look up the generic instance of
 * its base tfm, then convert that to the AEAD view.
 */
static inline struct aead_instance *aead_alg_instance(struct crypto_aead *aead)
{
        struct crypto_instance *generic = crypto_tfm_alg_instance(&aead->base);

        return aead_instance(generic);
}

/* Return crypto_instance_ctx() of the generic instance backing @inst. */
static inline void *aead_instance_ctx(struct aead_instance *inst)
{
        struct crypto_instance *generic = aead_crypto_instance(inst);

        return crypto_instance_ctx(generic);
}

/* Return the request's context area, req->__ctx. */
static inline void *aead_request_ctx(struct aead_request *req)
{
        return req->__ctx;
}

/*
 * Return the request context aligned to crypto_dma_align().  When that
 * alignment does not exceed crypto_tfm_ctx_alignment(), an alignment of 1
 * is used, which leaves the pointer unchanged.
 */
static inline void *aead_request_ctx_dma(struct aead_request *req)
{
        unsigned int dma_align = crypto_dma_align();
        unsigned int align;

        align = dma_align <= crypto_tfm_ctx_alignment() ? 1 : dma_align;

        return PTR_ALIGN(aead_request_ctx(req), align);
}

/* Forward @err for @req's base request to crypto_request_complete(). */
static inline void aead_request_complete(struct aead_request *req, int err)
{
        crypto_request_complete(&req->base, err);
}

/* Return the flags word of @req's base request. */
static inline u32 aead_request_flags(struct aead_request *req)
{
        return req->base.flags;
}

/*
 * Return the struct aead_request that embeds @req as its 'base' member.
 * @req must really be embedded in an aead_request.
 */
static inline struct aead_request *aead_request_cast(
        struct crypto_async_request *req)
{
        return container_of(req, struct aead_request, base);
}

int crypto_grab_aead(struct crypto_aead_spawn *spawn,
                     struct crypto_instance *inst,
                     const char *name, u32 type, u32 mask);

/* Pass the generic spawn embedded in @spawn to crypto_drop_spawn(). */
static inline void crypto_drop_aead(struct crypto_aead_spawn *spawn)
{
        crypto_drop_spawn(&spawn->base);
}

/*
 * Return the struct aead_alg that embeds the spawn's algorithm
 * (spawn->base.alg) as its 'base' member.
 */
static inline struct aead_alg *crypto_spawn_aead_alg(
        struct crypto_aead_spawn *spawn)
{
        return container_of(spawn->base.alg, struct aead_alg, base);
}

/*
 * Return the result of crypto_spawn_tfm2() on the generic spawn embedded
 * in @spawn.
 * NOTE(review): presumably this may be an ERR_PTR() on failure -- confirm
 * against crypto_spawn_tfm2().
 */
static inline struct crypto_aead *crypto_spawn_aead(
        struct crypto_aead_spawn *spawn)
{
        return crypto_spawn_tfm2(&spawn->base);
}

/* Store @reqsize in @aead->reqsize. */
static inline void crypto_aead_set_reqsize(struct crypto_aead *aead,
                                           unsigned int reqsize)
{
        aead->reqsize = reqsize;
}

/*
 * Store @reqsize in @aead->reqsize after adding crypto_dma_align() with the
 * bits below crypto_tfm_ctx_alignment() masked off.
 * NOTE(review): presumably this reserves the slack that
 * aead_request_ctx_dma() may consume when aligning the context -- confirm.
 */
static inline void crypto_aead_set_reqsize_dma(struct crypto_aead *aead,
                                               unsigned int reqsize)
{
        unsigned int dma_pad;

        dma_pad = crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1);
        aead->reqsize = reqsize + dma_pad;
}

/* Initialise the embedded generic queue with a maximum length of @max_qlen. */
static inline void aead_init_queue(struct aead_queue *queue,
                                   unsigned int max_qlen)
{
        crypto_init_queue(&queue->base, max_qlen);
}

/* Return the chunksize field of @alg; see crypto_aead_chunksize(). */
static inline unsigned int crypto_aead_alg_chunksize(struct aead_alg *alg)
{
        return alg->chunksize;
}

/**
 * crypto_aead_chunksize() - obtain chunk size
 * @tfm: cipher handle
 *
 * For ciphers such as CCM the block size is set to one.  Incremental
 * updates must nevertheless be supplied in multiples of the underlying
 * block size, because the IV does not have sub-block granularity.  This
 * API calls that granularity the chunk size.
 *
 * Return: chunk size in bytes
 */
static inline unsigned int crypto_aead_chunksize(struct crypto_aead *tfm)
{
        struct aead_alg *alg = crypto_aead_alg(tfm);

        return crypto_aead_alg_chunksize(alg);
}

int crypto_register_aead(struct aead_alg *alg);
void crypto_unregister_aead(struct aead_alg *alg);
int crypto_register_aeads(struct aead_alg *algs, int count);
void crypto_unregister_aeads(struct aead_alg *algs, int count);
int aead_register_instance(struct crypto_template *tmpl,
                           struct aead_instance *inst);

#endif        /* _CRYPTO_INTERNAL_AEAD_H */















































































































































































































































































































































































  317 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _NET_CORE_DEV_H
#define _NET_CORE_DEV_H

#include <linux/cleanup.h>
#include <linux/types.h>
#include <linux/rwsem.h>
#include <linux/netdevice.h>
#include <net/netdev_lock.h>

struct net;
struct netlink_ext_ack;
struct cpumask;

/* Random bits of netdevice that don't need to be exposed */

/*
 * Number of entries in sd_flow_limit.history (1 << 7 = 128); must be a
 * power of 2 and must not overflow the buckets.
 */
#define FLOW_LIMIT_HISTORY        (1 << 7)  /* must be ^2 and !overflow buckets */

/*
 * Flow-limit state, freed via the embedded rcu_head.  @history holds
 * FLOW_LIMIT_HISTORY u16 entries and @buckets is a flexible array of u8
 * sized by the allocator.
 * NOTE(review): presumably @history is a ring indexed by @history_head and
 * @buckets has (1 << @log_buckets) counters -- confirm with the users.
 */
struct sd_flow_limit {
        struct rcu_head                rcu;
        unsigned int                count;
        u8                        log_buckets;
        unsigned int                history_head;
        u16                        history[FLOW_LIMIT_HISTORY];
        u8                        buckets[];
};

extern int netdev_flow_limit_table_len;

struct napi_struct *
netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);

struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev,
                    unsigned long *index);

/* Cleanup action for __free(netdev_unlock): unlock a non-NULL device. */
DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T));

/*
 * Iterate with @var_name bound to each device returned by
 * netdev_xa_find_lock(@net, <previous device>, &@ifindex), starting from
 * NULL and stopping at the first NULL result.  @ifindex is incremented
 * after every iteration and its address is taken, so it must be a
 * modifiable unsigned long lvalue.  @var_name is declared with
 * __free(netdev_unlock), so a device still held when the loop variable
 * goes out of scope (e.g. on break) is unlocked.
 * NOTE(review): presumably netdev_xa_find_lock() releases the device it is
 * handed before returning the next one -- confirm.
 */
#define for_each_netdev_lock_scoped(net, var_name, ifindex)                \
        for (struct net_device *var_name __free(netdev_unlock) = NULL;        \
             (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \
             ifindex++)

struct net_device *
netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex);
struct net_device *
netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
                               unsigned long *index);

/*
 * Cleanup action for __free(netdev_unlock_ops_compat): release a non-NULL
 * device with netdev_unlock_ops_compat().
 */
DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *,
            if (_T) netdev_unlock_ops_compat(_T));

/*
 * Same shape as for_each_netdev_lock_scoped(), but devices are obtained
 * with netdev_xa_find_lock_ops_compat() and released with
 * netdev_unlock_ops_compat().  @ifindex must be a modifiable unsigned long
 * lvalue; it is incremented after every iteration.
 */
#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex)        \
        for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \
             (var_name = netdev_xa_find_lock_ops_compat(net, var_name,        \
                                                        &ifindex));        \
             ifindex++)

/*
 * Without CONFIG_PROC_FS, dev_proc_init() collapses to the constant 0 so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
#else
#define dev_proc_init() 0
#endif

void linkwatch_init_dev(struct net_device *dev);
void linkwatch_run_queue(void);

void dev_addr_flush(struct net_device *dev);
int dev_addr_init(struct net_device *dev);
void dev_addr_check(struct net_device *dev);

/*
 * Net shaper entry points; with CONFIG_NET_SHAPER disabled they become
 * empty inline stubs.
 */
#if IS_ENABLED(CONFIG_NET_SHAPER)
void net_shaper_flush_netdev(struct net_device *dev);
void net_shaper_set_real_num_tx_queues(struct net_device *dev,
                                       unsigned int txq);
#else
static inline void net_shaper_flush_netdev(struct net_device *dev) {}
static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev,
                                                     unsigned int txq) {}
#endif

/* sysctls not referred to from outside net/core/ */
extern int                netdev_unregister_timeout_secs;
extern int                weight_p;
extern int                dev_weight_rx_bias;
extern int                dev_weight_tx_bias;

extern struct rw_semaphore dev_addr_sem;

/* rtnl helpers */
extern struct list_head net_todo_list;
void netdev_run_todo(void);

/* netdev management, shared between various uAPI entry points */

/*
 * One name for a net_device.  @list links the nodes walked by
 * netdev_for_each_altname() starting from dev->name_node->list.
 * NOTE(review): presumably @hlist is the name-hash linkage and @rcu defers
 * the free -- confirm with the users.
 */
struct netdev_name_node {
        struct hlist_node hlist;
        struct list_head list;
        struct net_device *dev;
        const char *name;
        struct rcu_head rcu;
};

int netdev_get_name(struct net *net, char *name, int ifindex);
int netif_change_name(struct net_device *dev, const char *newname);
int dev_change_name(struct net_device *dev, const char *newname);

/*
 * Walk the nodes linked on @dev->name_node->list, with @namenode as the
 * cursor.  The _safe variant additionally takes @next as scratch storage
 * and uses list_for_each_entry_safe().
 */
#define netdev_for_each_altname(dev, namenode)                                \
        list_for_each_entry((namenode), &(dev)->name_node->list, list)
#define netdev_for_each_altname_safe(dev, namenode, next)                \
        list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \
                                 list)

int netdev_name_node_alt_create(struct net_device *dev, const char *name);
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);

int dev_validate_mtu(struct net_device *dev, int mtu,
                     struct netlink_ext_ack *extack);
int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
                      struct netlink_ext_ack *extack);

int dev_get_phys_port_id(struct net_device *dev,
                         struct netdev_phys_item_id *ppid);
int dev_get_phys_port_name(struct net_device *dev,
                           char *name, size_t len);

int netif_change_proto_down(struct net_device *dev, bool proto_down);
int dev_change_proto_down(struct net_device *dev, bool proto_down);
void netdev_change_proto_down_reason_locked(struct net_device *dev,
                                            unsigned long mask, u32 value);

typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                      int fd, int expected_fd, u32 flags);

int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
void netif_set_group(struct net_device *dev, int new_group);
void dev_set_group(struct net_device *dev, int new_group);
int netif_change_carrier(struct net_device *dev, bool new_carrier);
int dev_change_carrier(struct net_device *dev, bool new_carrier);

void __dev_set_rx_mode(struct net_device *dev);

void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
                        unsigned int gchanges, u32 portid,
                        const struct nlmsghdr *nlh);

void unregister_netdevice_many_notify(struct list_head *head,
                                      u32 portid, const struct nlmsghdr *nlh);

/*
 * Set or clear IFF_UP in dev->flags and mirror @value into dev->up.
 *
 * The dev->up store is wrapped in netdev_lock()/netdev_unlock() only when
 * netdev_need_ops_lock(dev) is false; otherwise no lock is taken here.
 * NOTE(review): presumably callers of ops-locked devices already hold the
 * instance lock -- confirm.
 */
static inline void netif_set_up(struct net_device *dev, bool value)
{
        if (value)
                dev->flags |= IFF_UP;
        else
                dev->flags &= ~IFF_UP;

        if (!netdev_need_ops_lock(dev))
                netdev_lock(dev);
        dev->up = value;
        if (!netdev_need_ops_lock(dev))
                netdev_unlock(dev);
}

/*
 * Store @size as dev->gso_max_size.  When @size does not exceed
 * GSO_LEGACY_MAX_SIZE it is also stored as dev->gso_ipv4_max_size; larger
 * values leave the IPv4 limit untouched.
 */
static inline void netif_set_gso_max_size(struct net_device *dev,
                                          unsigned int size)
{
        /* dev->gso_max_size is read locklessly from sk_setup_caps() */
        WRITE_ONCE(dev->gso_max_size, size);
        if (size <= GSO_LEGACY_MAX_SIZE)
                WRITE_ONCE(dev->gso_ipv4_max_size, size);
}

/* Store @segs as dev->gso_max_segs using WRITE_ONCE(). */
static inline void netif_set_gso_max_segs(struct net_device *dev,
                                          unsigned int segs)
{
        /* dev->gso_max_segs is read locklessly from sk_setup_caps() */
        WRITE_ONCE(dev->gso_max_segs, segs);
}

/*
 * Store @size as dev->gro_max_size.  When @size does not exceed
 * GRO_LEGACY_MAX_SIZE it is also stored as dev->gro_ipv4_max_size; larger
 * values leave the IPv4 limit untouched.
 */
static inline void netif_set_gro_max_size(struct net_device *dev,
                                          unsigned int size)
{
        /* This pairs with the READ_ONCE() in skb_gro_receive() */
        WRITE_ONCE(dev->gro_max_size, size);
        if (size <= GRO_LEGACY_MAX_SIZE)
                WRITE_ONCE(dev->gro_ipv4_max_size, size);
}

/* Store @size as dev->gso_ipv4_max_size using WRITE_ONCE(). */
static inline void netif_set_gso_ipv4_max_size(struct net_device *dev,
                                               unsigned int size)
{
        /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */
        WRITE_ONCE(dev->gso_ipv4_max_size, size);
}

/* Store @size as dev->gro_ipv4_max_size using WRITE_ONCE(). */
static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
                                               unsigned int size)
{
        /* This pairs with the READ_ONCE() in skb_gro_receive() */
        WRITE_ONCE(dev->gro_ipv4_max_size, size);
}

/**
 * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs
 * @n: napi struct to get the defer_hard_irqs field from
 *
 * The field is read with READ_ONCE().
 *
 * Return: the per-NAPI value of the defer_hard_irqs field.
 */
static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n)
{
        return READ_ONCE(n->defer_hard_irqs);
}

/**
 * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi
 * @n: napi_struct to set the defer_hard_irqs field
 * @defer: the value the field should be set to
 *
 * The field is written with WRITE_ONCE(), matching the READ_ONCE() in
 * napi_get_defer_hard_irqs().
 */
static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
{
        WRITE_ONCE(n->defer_hard_irqs, defer);
}

/**
 * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev
 * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set
 * @defer: the defer_hard_irqs value to set
 *
 * Updates the device-wide napi_defer_hard_irqs value, every NAPI currently
 * on the device's napi_list, and the first
 * max(num_rx_queues, num_tx_queues) entries of netdev->napi_config.
 */
static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
                                              u32 defer)
{
        unsigned int count = max(netdev->num_rx_queues,
                                 netdev->num_tx_queues);
        struct napi_struct *napi;
        unsigned int i;        /* same type as count: no signed/unsigned compare */

        WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
        list_for_each_entry(napi, &netdev->napi_list, dev_list)
                napi_set_defer_hard_irqs(napi, defer);

        for (i = 0; i < count; i++)
                netdev->napi_config[i].defer_hard_irqs = defer;
}

/**
 * napi_get_gro_flush_timeout - get the gro_flush_timeout
 * @n: napi struct to get the gro_flush_timeout from
 *
 * The field is read with READ_ONCE().
 *
 * Return: the per-NAPI value of the gro_flush_timeout field.
 */
static inline unsigned long
napi_get_gro_flush_timeout(const struct napi_struct *n)
{
        return READ_ONCE(n->gro_flush_timeout);
}

/**
 * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi
 * @n: napi struct to set the gro_flush_timeout
 * @timeout: timeout value to set
 *
 * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout.  The
 * field is written with WRITE_ONCE(), matching the READ_ONCE() in
 * napi_get_gro_flush_timeout().
 */
static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
                                              unsigned long timeout)
{
        WRITE_ONCE(n->gro_flush_timeout, timeout);
}

/**
 * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs
 * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set
 * @timeout: the timeout value to set
 *
 * Updates the device-wide gro_flush_timeout value, every NAPI currently on
 * the device's napi_list, and the first
 * max(num_rx_queues, num_tx_queues) entries of netdev->napi_config.
 */
static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
                                                unsigned long timeout)
{
        unsigned int count = max(netdev->num_rx_queues,
                                 netdev->num_tx_queues);
        struct napi_struct *napi;
        unsigned int i;        /* same type as count: no signed/unsigned compare */

        WRITE_ONCE(netdev->gro_flush_timeout, timeout);
        list_for_each_entry(napi, &netdev->napi_list, dev_list)
                napi_set_gro_flush_timeout(napi, timeout);

        for (i = 0; i < count; i++)
                netdev->napi_config[i].gro_flush_timeout = timeout;
}

/**
 * napi_get_irq_suspend_timeout - get the irq_suspend_timeout
 * @n: napi struct to get the irq_suspend_timeout from
 *
 * The field is read with READ_ONCE().
 *
 * Return: the per-NAPI value of the irq_suspend_timeout field.
 */
static inline unsigned long
napi_get_irq_suspend_timeout(const struct napi_struct *n)
{
        return READ_ONCE(n->irq_suspend_timeout);
}

/**
 * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi
 * @n: napi struct to set the irq_suspend_timeout
 * @timeout: timeout value to set
 *
 * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout.  The
 * field is written with WRITE_ONCE(), matching the READ_ONCE() in
 * napi_get_irq_suspend_timeout().
 */
static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
                                                unsigned long timeout)
{
        WRITE_ONCE(n->irq_suspend_timeout, timeout);
}

/*
 * Report the current threaded state of @n, derived from the
 * NAPI_STATE_THREADED bit in n->state.
 */
static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
{
        return test_bit(NAPI_STATE_THREADED, &n->state) ?
                NETDEV_NAPI_THREADED_ENABLED :
                NETDEV_NAPI_THREADED_DISABLED;
}

/*
 * Return the configured threaded setting for @n: the value stored in its
 * per-NAPI config when one is attached, otherwise the device-wide
 * dev->threaded value.
 */
static inline enum netdev_napi_threaded
napi_get_threaded_config(struct net_device *dev, struct napi_struct *n)
{
        return n->config ? n->config->threaded : dev->threaded;
}

int napi_set_threaded(struct napi_struct *n,
                      enum netdev_napi_threaded threaded);

int netif_set_threaded(struct net_device *dev,
                       enum netdev_napi_threaded threaded);

int rps_cpumask_housekeeping(struct cpumask *mask);

/*
 * Debug check, only built when both CONFIG_DEBUG_NET and CONFIG_BPF_SYSCALL
 * are set; otherwise an empty inline stub.
 */
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
void xdp_do_check_flushed(struct napi_struct *napi);
#else
static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
#endif

/*
 * Best effort check that NAPI is not idle (can't be scheduled to run).
 *
 * Does nothing for an instance whose poll_list.next is NULL.  Otherwise
 * warns if NAPI_STATE_SCHED is clear or if list_owner is not -1.
 */
static inline void napi_assert_will_not_race(const struct napi_struct *napi)
{
        /* uninitialized instance, can't race */
        if (!napi->poll_list.next)
                return;

        /* SCHED bit is set on disabled instances */
        WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
        WARN_ON(READ_ONCE(napi->list_owner) != -1);
}

void kick_defer_list_purge(unsigned int cpu);

/* dev_xmit_recursion() reports true once the counter exceeds this value. */
#define XMIT_RECURSION_LIMIT        8

/*
 * Transmit recursion accounting.  Without CONFIG_PREEMPT_RT the counter is
 * the per-CPU softnet_data.xmit.recursion; with CONFIG_PREEMPT_RT it is
 * kept in the current task (current->net_xmit.recursion).
 *
 * dev_xmit_recursion()     - true when the counter is above the limit
 * dev_xmit_recursion_inc() - increment the counter
 * dev_xmit_recursion_dec() - decrement the counter
 */
#ifndef CONFIG_PREEMPT_RT
static inline bool dev_xmit_recursion(void)
{
        return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
                        XMIT_RECURSION_LIMIT);
}

static inline void dev_xmit_recursion_inc(void)
{
        __this_cpu_inc(softnet_data.xmit.recursion);
}

static inline void dev_xmit_recursion_dec(void)
{
        __this_cpu_dec(softnet_data.xmit.recursion);
}
#else
static inline bool dev_xmit_recursion(void)
{
        return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
}

static inline void dev_xmit_recursion_inc(void)
{
        current->net_xmit.recursion++;
}

static inline void dev_xmit_recursion_dec(void)
{
        current->net_xmit.recursion--;
}
#endif

int dev_set_hwtstamp_phylib(struct net_device *dev,
                            struct kernel_hwtstamp_config *cfg,
                            struct netlink_ext_ack *extack);
int dev_get_hwtstamp_phylib(struct net_device *dev,
                            struct kernel_hwtstamp_config *cfg);
int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg);

#endif









  202 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#undef TRACE_SYSTEM
#define TRACE_SYSTEM netlink

#if !defined(_TRACE_NETLINK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NETLINK_H

#include <linux/tracepoint.h>

/*
 * netlink_extack tracepoint: takes a single string @msg, stores a copy of
 * it in the trace entry and prints it as "msg=<string>".
 */
TRACE_EVENT(netlink_extack,

        TP_PROTO(const char *msg),

        TP_ARGS(msg),

        TP_STRUCT__entry(
                __string(        msg,        msg        )
        ),

        TP_fast_assign(
                __assign_str(msg);
        ),

        TP_printk("msg=%s", __get_str(msg))
);

#endif /* _TRACE_NETLINK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_DELAY_H
#define __ASM_GENERIC_DELAY_H

#include <linux/math.h>
#include <vdso/time64.h>

/*
 * Undefined functions to get compile-time errors: udelay() and ndelay()
 * reference them only when called with a constant argument of
 * DELAY_CONST_MAX or more.
 */
extern void __bad_udelay(void);
extern void __bad_ndelay(void);

/*
 * Delay primitives used by udelay()/ndelay() below: __udelay() receives a
 * non-constant microsecond count, __const_udelay() a constant count already
 * multiplied by UDELAY_CONST_MULT or NDELAY_CONST_MULT.
 */
extern void __udelay(unsigned long usecs);
extern void __ndelay(unsigned long nsecs);
extern void __const_udelay(unsigned long xloops);
extern void __delay(unsigned long loops);

/*
 * The microsecond/nanosecond delay multipliers are used to convert a
 * constant microsecond/nanosecond value to a value which can be used by the
 * architecture-specific implementation to transform it into loops.
 * Each multiplier is 2^32 divided by the number of units per second,
 * rounded up.
 */
#define UDELAY_CONST_MULT        ((unsigned long)DIV_ROUND_UP(1ULL << 32, USEC_PER_SEC))
#define NDELAY_CONST_MULT        ((unsigned long)DIV_ROUND_UP(1ULL << 32, NSEC_PER_SEC))

/*
 * The maximum constant udelay/ndelay value, picked out of thin air to
 * prevent overly long constant udelays/ndelays.
 */
#define DELAY_CONST_MAX   20000

/**
 * udelay - Inserting a delay based on microseconds with busy waiting
 * @usec:        requested delay in microseconds
 *
 * When delaying in an atomic context ndelay(), udelay() and mdelay() are the
 * only valid variants of delaying/sleeping to go with.
 *
 * When inserting delays in non-atomic context that are shorter than the time
 * required to queue e.g. an hrtimer and then enter the scheduler, it is also
 * worthwhile to use udelay(). It is not simple to specify a generic threshold
 * for this that fits all systems; as an approximation, use udelay() for all
 * delays up to 10 microseconds.
 *
 * For a delay larger than the architecture-specific %MAX_UDELAY_MS value,
 * please make sure mdelay() is used. Otherwise there is a risk of overflow.
 *
 * Please note that ndelay(), udelay() and mdelay() may return early for several
 * reasons (https://lists.openwall.net/linux-kernel/2011/01/09/56):
 *
 * #. computed loops_per_jiffy too low (due to the time taken to execute the
 *    timer interrupt.)
 * #. cache behaviour affecting the time it takes to execute the loop function.
 * #. CPU clock rate changes.
 */
static __always_inline void udelay(unsigned long usec)
{
        /* Values not known at compile time go to the out-of-line helper. */
        if (!__builtin_constant_p(usec)) {
                __udelay(usec);
                return;
        }

        /*
         * Constant delays are pre-scaled here; a constant at or above
         * DELAY_CONST_MAX references the undefined __bad_udelay() instead.
         */
        if (usec < DELAY_CONST_MAX)
                __const_udelay(usec * UDELAY_CONST_MULT);
        else
                __bad_udelay();
}

/**
 * ndelay - Inserting a delay based on nanoseconds with busy waiting
 * @nsec:        requested delay in nanoseconds
 *
 * See udelay() for basic information about ndelay() and its variants.
 */
static __always_inline void ndelay(unsigned long nsec)
{
        /* Values not known at compile time go to the out-of-line helper. */
        if (!__builtin_constant_p(nsec)) {
                __ndelay(nsec);
                return;
        }

        /*
         * Constant delays are pre-scaled here; a constant at or above
         * DELAY_CONST_MAX references the undefined __bad_ndelay() instead.
         */
        if (nsec < DELAY_CONST_MAX)
                __const_udelay(nsec * NDELAY_CONST_MULT);
        else
                __bad_ndelay();
}
#define ndelay(x) ndelay(x)

#endif /* __ASM_GENERIC_DELAY_H */





























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * net busy poll support
 * Copyright(c) 2013 Intel Corporation.
 *
 * Author: Eliezer Tamir
 *
 * Contact Information:
 * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
 */

#ifndef _LINUX_NET_BUSY_POLL_H
#define _LINUX_NET_BUSY_POLL_H

#include <linux/netdevice.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <net/ip.h>
#include <net/xdp.h>

/*                0 - Reserved to indicate value not set
 *     1..NR_CPUS - Reserved for sender_cpu
 *  NR_CPUS+1..~0 - Region available for NAPI IDs
 */
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))

static inline bool napi_id_valid(unsigned int napi_id)
{
        /* Anything below MIN_NAPI_ID is outside the NAPI ID region. */
        if (napi_id < MIN_NAPI_ID)
                return false;

        return true;
}

#define BUSY_POLL_BUDGET 8

#ifdef CONFIG_NET_RX_BUSY_POLL

struct napi_struct;
extern unsigned int sysctl_net_busy_read __read_mostly;
extern unsigned int sysctl_net_busy_poll __read_mostly;

static inline bool net_busy_loop_on(void)
{
        /* Busy looping is on whenever the global sysctl value is non-zero. */
        unsigned int busy_poll_usec = READ_ONCE(sysctl_net_busy_poll);

        return busy_poll_usec != 0;
}

static inline bool sk_can_busy_loop(const struct sock *sk)
{
        /* A zero per-socket sk_ll_usec means the socket may not busy loop. */
        if (!READ_ONCE(sk->sk_ll_usec))
                return false;

        /* Do not busy loop while the current task has a signal pending. */
        return !signal_pending(current);
}

bool sk_busy_loop_end(void *p, unsigned long start_time);

void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
                    void *loop_end_arg, bool prefer_busy_poll, u16 budget);

void napi_busy_loop_rcu(unsigned int napi_id,
                        bool (*loop_end)(void *, unsigned long),
                        void *loop_end_arg, bool prefer_busy_poll, u16 budget);

void napi_suspend_irqs(unsigned int napi_id);
void napi_resume_irqs(unsigned int napi_id);

#else /* CONFIG_NET_RX_BUSY_POLL */
/*
 * Stub used when CONFIG_NET_RX_BUSY_POLL is disabled: busy looping is never
 * on.  Returns bool so the signature matches the real implementation.
 */
static inline bool net_busy_loop_on(void)
{
        return false;
}

/*
 * Stub used when CONFIG_NET_RX_BUSY_POLL is disabled: no socket can busy
 * loop.  @sk is const-qualified so the signature matches the real
 * implementation and callers holding a const pointer build either way.
 */
static inline bool sk_can_busy_loop(const struct sock *sk)
{
        return false;
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

static inline unsigned long busy_loop_current_time(void)
{
        /* Without busy-poll support the timestamp is always zero. */
        unsigned long now = 0;

#ifdef CONFIG_NET_RX_BUSY_POLL
        /* ktime_get_ns() shifted right by 10 bits (ns / 1024). */
        now = (unsigned long)(ktime_get_ns() >> 10);
#endif
        return now;
}

/* in poll/select we use the global sysctl_net_busy_poll value */
static inline bool busy_loop_timeout(unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned long usec_limit = READ_ONCE(sysctl_net_busy_poll);
        unsigned long deadline = start_time + usec_limit;

        /* A zero limit falls through and reports an immediate timeout. */
        if (usec_limit)
                return time_after(busy_loop_current_time(), deadline);
#endif
        return true;
}

static inline bool sk_busy_loop_timeout(struct sock *sk,
                                        unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned long usec_limit = READ_ONCE(sk->sk_ll_usec);
        unsigned long deadline = start_time + usec_limit;

        /* A zero per-socket limit falls through to an immediate timeout. */
        if (usec_limit)
                return time_after(busy_loop_current_time(), deadline);
#endif
        return true;
}

static inline void sk_busy_loop(struct sock *sk, int nonblock)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int id = READ_ONCE(sk->sk_napi_id);
        bool (*end_fn)(void *, unsigned long) = sk_busy_loop_end;

        /* Nothing to poll unless the socket carries a valid NAPI ID. */
        if (!napi_id_valid(id))
                return;

        /* Non-blocking callers pass no loop-end callback. */
        if (nonblock)
                end_fn = NULL;

        /* A zero per-socket budget falls back to BUSY_POLL_BUDGET. */
        napi_busy_loop(id, end_fn, sk,
                       READ_ONCE(sk->sk_prefer_busy_poll),
                       READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET);
#endif
}

/* used in the NIC receive handler to mark the skb */
static inline void __skb_mark_napi_id(struct sk_buff *skb,
                                      const struct gro_node *gro)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Keep an already valid NAPI ID on the skb; never overwrite it. */
        if (napi_id_valid(skb->napi_id))
                return;

        skb->napi_id = gro->cached_napi_id;
#endif
}

/* Mark @skb using the GRO node embedded in @napi (see __skb_mark_napi_id()). */
static inline void skb_mark_napi_id(struct sk_buff *skb,
                                    const struct napi_struct *napi)
{
        __skb_mark_napi_id(skb, &napi->gro);
}

/* used in the protocol handler to propagate the napi_id to the socket */
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Only write when the socket's recorded ID differs from the skb's. */
        bool id_changed = READ_ONCE(sk->sk_napi_id) != skb->napi_id;

        if (unlikely(id_changed))
                WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
        sk_rx_queue_update(sk, skb);
}

/* Variant of sk_mark_napi_id() for passive flow setup,
 * as sk->sk_napi_id and sk->sk_rx_queue_mapping content
 * needs to be set.
 */
static inline void sk_mark_napi_id_set(struct sock *sk,
                                       const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Unconditional store: no comparison against the current value. */
        WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
        /* Uses sk_rx_queue_set() rather than sk_rx_queue_update(). */
        sk_rx_queue_set(sk, skb);
}

static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* A non-zero ID is already recorded on the socket: leave it alone. */
        if (READ_ONCE(sk->sk_napi_id))
                return;

        WRITE_ONCE(sk->sk_napi_id, napi_id);
#endif
}

/* variant used for unconnected sockets */
static inline void sk_mark_napi_id_once(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Record the skb's NAPI ID only if the socket has none yet. */
        __sk_mark_napi_id_once(sk, skb->napi_id);
#endif
}

#endif /* _LINUX_NET_BUSY_POLL_H */





























    9 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86-optimized SHA-512 block function
 *
 * Copyright 2025 Google LLC
 */
#include <asm/fpu/api.h>
#include <linux/static_call.h>

DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);

/*
 * Declare the assembly routine @asm_fn and define a C wrapper @c_fn around
 * it.  The wrapper brackets the assembly call with kernel_fpu_begin() and
 * kernel_fpu_end() when irq_fpu_usable() says the FPU may be used, and calls
 * sha512_blocks_generic() otherwise.
 */
#define DEFINE_X86_SHA512_FN(c_fn, asm_fn)                                 \
        asmlinkage void asm_fn(struct sha512_block_state *state,           \
                               const u8 *data, size_t nblocks);            \
        static void c_fn(struct sha512_block_state *state, const u8 *data, \
                         size_t nblocks)                                   \
        {                                                                  \
                if (unlikely(!irq_fpu_usable())) {                         \
                        sha512_blocks_generic(state, data, nblocks);       \
                        return;                                            \
                }                                                          \
                kernel_fpu_begin();                                        \
                asm_fn(state, data, nblocks);                              \
                kernel_fpu_end();                                          \
        }

DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3);
DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx);
DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx);

/*
 * Process @nblocks blocks from @data into @state through the
 * sha512_blocks_x86 static call, which defaults to sha512_blocks_generic and
 * may be retargeted by sha512_mod_init_arch().
 */
static void sha512_blocks(struct sha512_block_state *state,
                          const u8 *data, size_t nblocks)
{
        static_call(sha512_blocks_x86)(state, data, nblocks);
}

#define sha512_mod_init_arch sha512_mod_init_arch
/*
 * Pick the SHA-512 block function for this CPU: the AVX2 variant when AVX,
 * AVX2 and BMI2 are all available, else the AVX variant, else the SSSE3
 * variant.  If none applies the static call keeps its default target.
 */
static void sha512_mod_init_arch(void)
{
        bool avx_usable =
                cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
                boot_cpu_has(X86_FEATURE_AVX);

        if (!avx_usable) {
                if (boot_cpu_has(X86_FEATURE_SSSE3))
                        static_call_update(sha512_blocks_x86,
                                           sha512_blocks_ssse3);
                return;
        }

        if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) {
                static_call_update(sha512_blocks_x86, sha512_blocks_avx2);
                return;
        }

        static_call_update(sha512_blocks_x86, sha512_blocks_avx);
}












































































































































































































































































































































  319 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
 *
 * Based on the original implementation which is:
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Parts of the original code have been moved to arch/x86/vdso/vma.c
 *
 * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
 * Userspace can request certain kernel services by calling fixed
 * addresses.  This concept is problematic:
 *
 * - It interferes with ASLR.
 * - It's awkward to write code that lives in kernel addresses but is
 *   callable by userspace at fixed addresses.
 * - The whole concept is impossible for 32-bit compat userspace.
 * - UML cannot easily virtualize a vsyscall.
 *
 * As of mid-2014, I believe that there is no new userspace code that
 * will use a vsyscall if the vDSO is present.  I hope that there will
 * soon be no new userspace code that will ever use a vsyscall.
 *
 * The code in this file emulates vsyscalls when notified of a page
 * fault to a vsyscall address.
 */

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>
#include <asm/paravirt.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

/*
 * Current vsyscall handling mode.  The build-time default is NONE or XONLY,
 * selected by CONFIG_LEGACY_VSYSCALL_NONE / CONFIG_LEGACY_VSYSCALL_XONLY; any
 * other configuration is a build error.  EMULATE has no build-time default
 * here and is only reachable through the "vsyscall=" boot parameter.
 */
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
        NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
        XONLY;
#else
        #error VSYSCALL config is broken
#endif

/*
 * Parse the "vsyscall=" boot parameter.  Accepts "emulate", "xonly" and
 * "none"; returns 0 on success and -EINVAL for a missing or unknown value.
 */
static int __init vsyscall_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (strcmp(str, "emulate") == 0)
                vsyscall_mode = EMULATE;
        else if (strcmp(str, "xonly") == 0)
                vsyscall_mode = XONLY;
        else if (strcmp(str, "none") == 0)
                vsyscall_mode = NONE;
        else
                return -EINVAL;

        return 0;
}
early_param("vsyscall", vsyscall_setup);

/*
 * Emit a rate-limited log line at @level describing the current task and the
 * ip/cs/sp/ax/si/di registers from @regs, prefixed by @message.  Prints
 * nothing when show_unhandled_signals is zero.
 */
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
                              const char *message)
{
        if (show_unhandled_signals) {
                printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
                                   level, current->comm, task_pid_nr(current),
                                   message, regs->ip, regs->cs,
                                   regs->sp, regs->ax, regs->si, regs->di);
        }
}

/*
 * Map a faulting address to a vsyscall slot number.  Bits 10-11 of @addr
 * select the slot and every other bit must match VSYSCALL_ADDR.  Returns
 * 0..2 for a valid slot and -EINVAL otherwise.
 */
static int addr_to_vsyscall_nr(unsigned long addr)
{
        const unsigned long slot_mask = 0xC00UL;
        int slot = (addr & slot_mask) >> 10;

        if ((addr & ~slot_mask) != VSYSCALL_ADDR || slot > 2)
                return -EINVAL;

        return slot;
}

static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
        if (!access_ok((void __user *)ptr, size)) {
                struct thread_struct *thread = &current->thread;

                thread->error_code        = X86_PF_USER | X86_PF_WRITE;
                thread->cr2                = ptr;
                thread->trap_nr                = X86_TRAP_PF;

                force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
                return false;
        } else {
                return true;
        }
}

/*
 * Try to handle a page fault as a vsyscall invocation.
 *
 * @error_code: page fault error code
 * @regs:       register state at the time of the fault
 * @address:    faulting address
 *
 * Returns false when the fault is not handled here: a write or
 * kernel-privilege fault, a fault whose address is not regs->ip, or any
 * attempt while vsyscall_mode is NONE.  Returns true when the fault has been
 * consumed: either the call was emulated and a "ret" simulated, or a signal
 * (SIGSEGV or SIGSYS) was queued for the task.
 */
bool emulate_vsyscall(unsigned long error_code,
                      struct pt_regs *regs, unsigned long address)
{
        unsigned long caller;
        int vsyscall_nr, syscall_nr, tmp;
        long ret;
        unsigned long orig_dx;

        /* Write faults or kernel-privilege faults never get fixed up. */
        if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
                return false;

        /*
         * Assume that faults at regs->ip are because of an
         * instruction fetch. Return early and avoid
         * emulation for faults during data accesses:
         */
        if (address != regs->ip) {
                /* Failed vsyscall read */
                if (vsyscall_mode == EMULATE)
                        return false;

                /*
                 * User code tried and failed to read the vsyscall page.
                 */
                warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
                return false;
        }

        /*
         * X86_PF_INSTR is only set when NX is supported.  When
         * available, use it to double-check that the emulation code
         * is only being used for instruction fetches:
         */
        if (cpu_feature_enabled(X86_FEATURE_NX))
                WARN_ON_ONCE(!(error_code & X86_PF_INSTR));

        /*
         * No point in checking CS -- the only way to get here is a user mode
         * trap to a high address, which means that we're in 64-bit user code.
         */

        if (vsyscall_mode == NONE) {
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall attempted with vsyscall=none");
                return false;
        }

        vsyscall_nr = addr_to_vsyscall_nr(address);

        trace_emulate_vsyscall(vsyscall_nr);

        if (vsyscall_nr < 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
                goto sigsegv;
        }

        /*
         * Fetch the return address from the top of the user stack; it is
         * installed as the new ip when the "ret" is emulated at do_ret.
         */
        if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "vsyscall with bad stack (exploit attempt?)");
                goto sigsegv;
        }

        /*
         * Check for access_ok violations and find the syscall nr.
         *
         * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
         * 64-bit, so we don't need to special-case it here.  For all the
         * vsyscalls, NULL means "don't write anything" not "write it at
         * address 0".
         */
        switch (vsyscall_nr) {
        case 0:
                if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
                    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_gettimeofday;
                break;

        case 1:
                if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_time;
                break;

        case 2:
                if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
                    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_getcpu;
                break;
        }

        /*
         * Handle seccomp.  regs->ip must be the original value.
         * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
         *
         * We could optimize the seccomp disabled case, but performance
         * here doesn't matter.
         */
        regs->orig_ax = syscall_nr;
        regs->ax = -ENOSYS;
        tmp = secure_computing();
        if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
                warn_bad_vsyscall(KERN_DEBUG, regs,
                                  "seccomp tried to change syscall nr or ip");
                force_exit_sig(SIGSYS);
                return true;
        }
        regs->orig_ax = -1;
        if (tmp)
                goto do_ret;  /* skip requested */

        /*
         * With a real vsyscall, page faults cause SIGSEGV.
         */
        ret = -EFAULT;
        switch (vsyscall_nr) {
        case 0:
                /* this decodes regs->di and regs->si on its own */
                ret = __x64_sys_gettimeofday(regs);
                break;

        case 1:
                /* this decodes regs->di on its own */
                ret = __x64_sys_time(regs);
                break;

        case 2:
                /* while we could clobber regs->dx, we didn't in the past... */
                orig_dx = regs->dx;
                regs->dx = 0;
                /* this decodes regs->di, regs->si and regs->dx on its own */
                ret = __x64_sys_getcpu(regs);
                regs->dx = orig_dx;
                break;
        }

check_fault:
        if (ret == -EFAULT) {
                /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall fault (exploit attempt?)");
                goto sigsegv;
        }

        regs->ax = ret;

do_ret:
        /* Emulate a ret instruction. */
        regs->ip = caller;
        regs->sp += 8;
        return true;

sigsegv:
        force_sig(SIGSEGV);
        return true;
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
        return "[vsyscall]";
}
/* VM operations for the gate VMA; only the name callback is provided. */
static const struct vm_operations_struct gate_vma_ops = {
        .name = gate_vma_name,
};
/*
 * The gate VMA itself: one page starting at VSYSCALL_ADDR, initially
 * readable and executable.  map_vsyscall() reinitializes the flags to
 * VM_EXEC only when the mode is XONLY.
 */
static struct vm_area_struct gate_vma __ro_after_init = {
        .vm_start        = VSYSCALL_ADDR,
        .vm_end                = VSYSCALL_ADDR + PAGE_SIZE,
        .vm_page_prot        = PAGE_READONLY_EXEC,
        .vm_flags        = VM_READ | VM_EXEC,
        .vm_ops                = &gate_vma_ops,
};

/*
 * Return the gate VMA for @mm, or NULL when there is none: vsyscall_mode is
 * NONE, or (with CONFIG_COMPAT) @mm is NULL or lacks the
 * MM_CONTEXT_HAS_VSYSCALL flag.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
        if (!mm)
                return NULL;
        if (!test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
                return NULL;
#endif
        return vsyscall_mode == NONE ? NULL : &gate_vma;
}

/* Return 1 if @addr lies inside the gate VMA of @mm, 0 otherwise. */
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *gate = get_gate_vma(mm);

        /* No gate VMA for this mm means no address can be inside it. */
        return gate && addr >= gate->vm_start && addr < gate->vm_end;
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
        /* With vsyscalls disabled there is no gate area at all. */
        if (vsyscall_mode == NONE)
                return 0;

        /* Otherwise match any address on the page at VSYSCALL_ADDR. */
        return (addr & PAGE_MASK) == VSYSCALL_ADDR;
}

/*
 * The VSYSCALL page is the only user-accessible page in the kernel address
 * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
 * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
 * are enabled.
 *
 * Some day we may create a "minimal" vsyscall mode in which we emulate
 * vsyscalls but leave the page not present.  If so, we skip calling
 * this.
 */
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        /*
         * Walk from @root down to the PMD entry covering VSYSCALL_ADDR and
         * OR _PAGE_USER into the entry at each level (PGD, P4D, PUD, PMD).
         */
        pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
        set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
        p4d = p4d_offset(pgd, VSYSCALL_ADDR);
        set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
        pud = pud_offset(p4d, VSYSCALL_ADDR);
        set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
        pmd = pmd_offset(pud, VSYSCALL_ADDR);
        set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}

/*
 * Boot-time setup of the vsyscall page according to vsyscall_mode, plus a
 * build-time check that the VSYSCALL_PAGE fixmap slot is at VSYSCALL_ADDR.
 */
void __init map_vsyscall(void)
{
        extern char __vsyscall_page;
        unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

        switch (vsyscall_mode) {
        case EMULATE:
                /*
                 * Full emulation needs a real page: install it in the fixmap
                 * and make the covering page-table entries user-accessible.
                 */
                __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
                             PAGE_KERNEL_VVAR);
                set_vsyscall_pgtable_user_bits(swapper_pg_dir);
                break;
        case XONLY:
                /* Execute-only: no PTE backs the page; drop VM_READ. */
                vm_flags_init(&gate_vma, VM_EXEC);
                break;
        default:
                break;
        }

        BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
                     (unsigned long)VSYSCALL_ADDR);
}























































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NetLabel System
 *
 * The NetLabel system manages static and dynamic label mappings for network
 * protocols such as CIPSO and RIPSO.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
 */

#ifndef _NETLABEL_H
#define _NETLABEL_H

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <net/netlink.h>
#include <net/request_sock.h>
#include <linux/refcount.h>

struct cipso_v4_doi;
struct calipso_doi;

/*
 * NetLabel - A management interface for maintaining network packet label
 *            mapping tables for explicit packet labeling protocols.
 *
 * Network protocols such as CIPSO and RIPSO require a label translation layer
 * to convert the label on the packet into something meaningful on the host
 * machine.  In the current Linux implementation these mapping tables live
 * inside the kernel; NetLabel provides a mechanism for user space applications
 * to manage these mapping tables.
 *
 * NetLabel makes use of the Generic NETLINK mechanism as a transport layer to
 * send messages between kernel and user space.  The general format of a
 * NetLabel message is shown below:
 *
 *  +-----------------+-------------------+--------- --- -- -
 *  | struct nlmsghdr | struct genlmsghdr | payload
 *  +-----------------+-------------------+--------- --- -- -
 *
 * The 'nlmsghdr' and 'genlmsghdr' structs should be dealt with like normal.
 * The payload is dependent on the subsystem specified in the
 * 'nlmsghdr->nlmsg_type' and should be defined below, supporting functions
 * should be defined in the corresponding net/netlabel/netlabel_<subsys>.h|c
 * file.  All of the fields in the NetLabel payload are NETLINK attributes, see
 * the include/net/netlink.h file for more information on NETLINK attributes.
 *
 */

/*
 * NetLabel NETLINK protocol
 */

/* NetLabel NETLINK protocol version
 *  1: initial version
 *  2: added static labels for unlabeled connections
 *  3: network selectors added to the NetLabel/LSM domain mapping and the
 *     CIPSO_V4_MAP_LOCAL CIPSO mapping was added
 */
#define NETLBL_PROTO_VERSION            3

/* NetLabel NETLINK types/families */
#define NETLBL_NLTYPE_NONE              0
#define NETLBL_NLTYPE_MGMT              1
#define NETLBL_NLTYPE_MGMT_NAME         "NLBL_MGMT"
#define NETLBL_NLTYPE_RIPSO             2
#define NETLBL_NLTYPE_RIPSO_NAME        "NLBL_RIPSO"
#define NETLBL_NLTYPE_CIPSOV4           3
#define NETLBL_NLTYPE_CIPSOV4_NAME      "NLBL_CIPSOv4"
#define NETLBL_NLTYPE_CIPSOV6           4
#define NETLBL_NLTYPE_CIPSOV6_NAME      "NLBL_CIPSOv6"
#define NETLBL_NLTYPE_UNLABELED         5
#define NETLBL_NLTYPE_UNLABELED_NAME    "NLBL_UNLBL"
#define NETLBL_NLTYPE_ADDRSELECT        6
#define NETLBL_NLTYPE_ADDRSELECT_NAME   "NLBL_ADRSEL"
#define NETLBL_NLTYPE_CALIPSO           7
#define NETLBL_NLTYPE_CALIPSO_NAME      "NLBL_CALIPSO"

/*
 * NetLabel - Kernel API for accessing the network packet label mappings.
 *
 * The following functions are provided for use by other kernel modules,
 * specifically kernel LSM modules, to provide a consistent, transparent API
 * for dealing with explicit packet labeling protocols such as CIPSO and
 * RIPSO.  The functions defined here are implemented in the
 * net/netlabel/netlabel_kapi.c file.
 *
 */

/*
 * NetLabel audit information
 *
 * Passed as the audit_info argument of the configuration calls, the CALIPSO
 * DOI add/remove operations and netlbl_audit_start() declared in this header.
 */
struct netlbl_audit {
        /* LSM property data -- NOTE(review): presumably identifies the subject; confirm */
        struct lsm_prop prop;
        /* NOTE(review): presumably the audit login UID; confirm */
        kuid_t loginuid;
        /* NOTE(review): presumably the audit session ID; confirm */
        unsigned int sessionid;
};

/*
 * LSM security attributes
 */

/**
 * struct netlbl_lsm_cache - NetLabel LSM security attribute cache
 * @refcount: atomic reference counter
 * @free: LSM supplied function to free the cache data
 * @data: LSM supplied cache data
 *
 * Description:
 * This structure is provided for LSMs which wish to make use of the NetLabel
 * caching mechanism to store LSM specific data/attributes in the NetLabel
 * cache.  If the LSM has to perform a lot of translation from the NetLabel
 * security attributes into its own internal representation then the cache
 * mechanism can provide a way to eliminate some or all of that translation
 * overhead on a cache hit.
 *
 * netlbl_secattr_cache_alloc() hands the structure out with @refcount set to
 * one; netlbl_secattr_cache_free() drops one reference and, once the last
 * reference is gone, calls @free on @data (when @free is set) before freeing
 * the structure itself.
 *
 */
struct netlbl_lsm_cache {
        refcount_t refcount;
        void (*free) (const void *data);
        void *data;
};

/**
 * struct netlbl_lsm_catmap - NetLabel LSM secattr category bitmap
 * @startbit: the value of the lowest order bit in the bitmap
 * @bitmap: the category bitmap
 * @next: pointer to the next bitmap "node" or NULL
 *
 * Description:
 * This structure is used to represent category bitmaps.  Due to the large
 * number of categories supported by most labeling protocols it is not
 * practical to transfer a full bitmap internally so NetLabel adopts a sparse
 * bitmap structure modeled after SELinux's ebitmap structure.
 * The catmap bitmap field MUST be a power of two in length and large
 * enough to hold at least 240 bits (it currently holds 4 * 64 = 256 bits).
 * Special care (i.e. check the code!) should be used when changing these
 * values as the LSM implementation probably has functions which rely on the
 * sizes of these types to speed processing.
 *
 */
/* number of u64 words in the bitmap of one catmap node */
#define NETLBL_CATMAP_MAPCNT            4
/* number of bits in one of those u64 words */
#define NETLBL_CATMAP_MAPSIZE           (sizeof(u64) * 8)
/* total number of bits held by one catmap node */
#define NETLBL_CATMAP_SIZE              (NETLBL_CATMAP_MAPSIZE * \
                                         NETLBL_CATMAP_MAPCNT)
/* the value 1 typed as u64 */
#define NETLBL_CATMAP_BIT               ((u64)0x01)
struct netlbl_lsm_catmap {
        u32 startbit;
        u64 bitmap[NETLBL_CATMAP_MAPCNT];
        struct netlbl_lsm_catmap *next;
};

/**
 * struct netlbl_lsm_secattr - NetLabel LSM security attributes
 * @flags: indicate structure attributes, see NETLBL_SECATTR_*
 * @type: indicate the NLTYPE of the attributes
 * @domain: the NetLabel LSM domain
 * @cache: NetLabel LSM specific cache
 * @attr.mls: MLS sensitivity label
 * @attr.mls.cat: MLS category bitmap
 * @attr.mls.lvl: MLS sensitivity level
 * @attr.secid: LSM specific secid token
 *
 * Description:
 * This structure is used to pass security attributes between NetLabel and the
 * LSM modules.  The flags field is used to specify which fields within the
 * struct are valid and valid values can be created by bitwise OR'ing the
 * NETLBL_SECATTR_* defines.  The domain field is typically set by the LSM to
 * specify domain specific configuration settings and is not usually used by
 * NetLabel itself when returning security attributes to the LSM.
 *
 * The flags also drive netlbl_secattr_destroy(): @domain is freed when
 * NETLBL_SECATTR_FREE_DOMAIN is set, @cache is released when
 * NETLBL_SECATTR_CACHE is set and @attr.mls.cat is freed when
 * NETLBL_SECATTR_MLS_CAT is set.
 *
 */
struct netlbl_lsm_secattr {
        u32 flags;
        /* bitmap values for 'flags' */
#define NETLBL_SECATTR_NONE             0x00000000
#define NETLBL_SECATTR_DOMAIN           0x00000001
        /* DOMAIN plus FREE_DOMAIN (defined with the meta-values below) */
#define NETLBL_SECATTR_DOMAIN_CPY       (NETLBL_SECATTR_DOMAIN | \
                                         NETLBL_SECATTR_FREE_DOMAIN)
#define NETLBL_SECATTR_CACHE            0x00000002
#define NETLBL_SECATTR_MLS_LVL          0x00000004
#define NETLBL_SECATTR_MLS_CAT          0x00000008
#define NETLBL_SECATTR_SECID            0x00000010
        /* bitmap meta-values for 'flags' */
#define NETLBL_SECATTR_FREE_DOMAIN      0x01000000
        /* union of the MLS level, MLS category and secid bits */
#define NETLBL_SECATTR_CACHEABLE        (NETLBL_SECATTR_MLS_LVL | \
                                         NETLBL_SECATTR_MLS_CAT | \
                                         NETLBL_SECATTR_SECID)
        u32 type;
        char *domain;
        struct netlbl_lsm_cache *cache;
        struct {
                struct {
                        struct netlbl_lsm_catmap *cat;
                        u32 lvl;
                } mls;
                u32 secid;
        } attr;
};

/**
 * struct netlbl_calipso_ops - NetLabel CALIPSO operations
 * @doi_add: add a CALIPSO DOI
 * @doi_free: free a CALIPSO DOI
 * @doi_remove: remove a CALIPSO DOI
 * @doi_getdef: returns a reference to a DOI
 * @doi_putdef: releases a reference of a DOI
 * @doi_walk: enumerate the DOI list
 * @sock_getattr: retrieve the socket's attr
 * @sock_setattr: set the socket's attr
 * @sock_delattr: remove the socket's attr
 * @req_setattr: set the req socket's attr
 * @req_delattr: remove the req socket's attr
 * @opt_getattr: retrieve attr from memory block
 * @skbuff_optptr: find option in packet
 * @skbuff_setattr: set the skbuff's attr
 * @skbuff_delattr: remove the skbuff's attr
 * @cache_invalidate: invalidate cache
 * @cache_add: add cache entry
 *
 * Description:
 * This structure is filled out by the CALIPSO engine and passed
 * to the NetLabel core via a call to netlbl_calipso_ops_register().
 * It enables the CALIPSO engine (and hence IPv6) to be compiled
 * as a module.
 */
struct netlbl_calipso_ops {
        /* DOI definition management */
        int (*doi_add)(struct calipso_doi *doi_def,
                       struct netlbl_audit *audit_info);
        void (*doi_free)(struct calipso_doi *doi_def);
        int (*doi_remove)(u32 doi, struct netlbl_audit *audit_info);
        struct calipso_doi *(*doi_getdef)(u32 doi);
        void (*doi_putdef)(struct calipso_doi *doi_def);
        int (*doi_walk)(u32 *skip_cnt,
                        int (*callback)(struct calipso_doi *doi_def, void *arg),
                        void *cb_arg);
        /* attributes on sockets */
        int (*sock_getattr)(struct sock *sk,
                            struct netlbl_lsm_secattr *secattr);
        int (*sock_setattr)(struct sock *sk,
                            const struct calipso_doi *doi_def,
                            const struct netlbl_lsm_secattr *secattr);
        void (*sock_delattr)(struct sock *sk);
        /* attributes on request sockets */
        int (*req_setattr)(struct request_sock *req,
                           const struct calipso_doi *doi_def,
                           const struct netlbl_lsm_secattr *secattr);
        void (*req_delattr)(struct request_sock *req);
        /* attributes in raw option data and in packets */
        int (*opt_getattr)(const unsigned char *calipso,
                           struct netlbl_lsm_secattr *secattr);
        unsigned char *(*skbuff_optptr)(const struct sk_buff *skb);
        int (*skbuff_setattr)(struct sk_buff *skb,
                              const struct calipso_doi *doi_def,
                              const struct netlbl_lsm_secattr *secattr);
        int (*skbuff_delattr)(struct sk_buff *skb);
        /* label mapping cache */
        void (*cache_invalidate)(void);
        int (*cache_add)(const unsigned char *calipso_ptr,
                         const struct netlbl_lsm_secattr *secattr);
};

/*
 * LSM security attribute operations (inline)
 */

/**
 * netlbl_secattr_cache_alloc - Allocate and initialize a secattr cache
 * @flags: the memory allocation flags
 *
 * Description:
 * Allocates a netlbl_lsm_cache structure with kzalloc and sets its reference
 * count to one.  Returns a pointer to the new structure on success, NULL if
 * the allocation fails.
 *
 */
static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc_noprof(gfp_t flags)
{
        struct netlbl_lsm_cache *entry = kzalloc_noprof(sizeof(*entry), flags);

        if (!entry)
                return NULL;

        /* the caller receives the first reference */
        refcount_set(&entry->refcount, 1);
        return entry;
}
#define netlbl_secattr_cache_alloc(...)        \
                alloc_hooks(netlbl_secattr_cache_alloc_noprof(__VA_ARGS__))

/**
 * netlbl_secattr_cache_free - Drop a reference to a netlbl_lsm_cache struct
 * @cache: the struct to release
 *
 * Description:
 * Drops one reference to @cache.  When that was the last reference the LSM
 * supplied free function, if any, is called on the cache data and @cache
 * itself is freed; otherwise nothing else happens.
 *
 */
static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache)
{
        if (refcount_dec_and_test(&cache->refcount)) {
                /* last user gone: let the LSM release its data first */
                if (cache->free)
                        cache->free(cache->data);
                kfree(cache);
        }
}

/**
 * netlbl_catmap_alloc - Allocate a LSM secattr catmap
 * @flags: memory allocation flags
 *
 * Description:
 * Allocates a single catmap node with kzalloc.  Returns a pointer to the
 * node on success, NULL on failure.
 *
 */
static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc_noprof(gfp_t flags)
{
        struct netlbl_lsm_catmap *node;

        node = kzalloc_noprof(sizeof(*node), flags);
        return node;
}
#define netlbl_catmap_alloc(...)        alloc_hooks(netlbl_catmap_alloc_noprof(__VA_ARGS__))

/**
 * netlbl_catmap_free - Free a LSM secattr catmap
 * @catmap: the category bitmap
 *
 * Description:
 * Frees @catmap and every node reachable from it through the next pointers.
 * A NULL @catmap is accepted and nothing is done.
 *
 */
static inline void netlbl_catmap_free(struct netlbl_lsm_catmap *catmap)
{
        struct netlbl_lsm_catmap *following;

        for (; catmap; catmap = following) {
                /* read the link before the node is freed */
                following = catmap->next;
                kfree(catmap);
        }
}

/**
 * netlbl_secattr_init - Initialize a netlbl_lsm_secattr struct
 * @secattr: the struct to initialize
 *
 * Description:
 * Zero-fills an already allocated netlbl_lsm_secattr struct, which leaves
 * the flags at NETLBL_SECATTR_NONE and every pointer NULL.
 *
 */
static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
{
        memset(secattr, 0, sizeof(struct netlbl_lsm_secattr));
}

/**
 * netlbl_secattr_destroy - Clears a netlbl_lsm_secattr struct
 * @secattr: the struct to clear
 *
 * Description:
 * Releases the internal buffers of @secattr that its flags mark as present;
 * the struct itself is not freed and its fields are not reset.  The struct
 * must be reset with a call to netlbl_secattr_init() before reuse.
 *
 */
static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr)
{
        /* the domain string is only freed when it is flagged for freeing */
        if ((secattr->flags & NETLBL_SECATTR_FREE_DOMAIN) != 0)
                kfree(secattr->domain);

        /* drop the reference held on the LSM cache */
        if ((secattr->flags & NETLBL_SECATTR_CACHE) != 0)
                netlbl_secattr_cache_free(secattr->cache);

        /* release the MLS category bitmap */
        if ((secattr->flags & NETLBL_SECATTR_MLS_CAT) != 0)
                netlbl_catmap_free(secattr->attr.mls.cat);
}

/**
 * netlbl_secattr_alloc - Allocate and initialize a netlbl_lsm_secattr struct
 * @flags: the memory allocation flags
 *
 * Description:
 * Allocates a netlbl_lsm_secattr struct with kzalloc.  Returns a valid
 * pointer on success, or NULL on failure.
 *
 */
static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc_noprof(gfp_t flags)
{
        struct netlbl_lsm_secattr *attrs;

        attrs = kzalloc_noprof(sizeof(*attrs), flags);
        return attrs;
}
#define netlbl_secattr_alloc(...)        alloc_hooks(netlbl_secattr_alloc_noprof(__VA_ARGS__))

/**
 * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct
 * @secattr: the struct to free
 *
 * Description:
 * Frees @secattr including all of the internal buffers.  @secattr is
 * dereferenced unconditionally by netlbl_secattr_destroy(), so it must not
 * be NULL.
 *
 */
static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr)
{
        /* release the internal buffers first, then the struct itself */
        netlbl_secattr_destroy(secattr);
        kfree(secattr);
}

#ifdef CONFIG_NETLABEL
/*
 * LSM configuration operations
 *
 * These prototypes are only visible when CONFIG_NETLABEL is set; the #else
 * branch of this conditional supplies inline stubs under the same names.
 */
int netlbl_cfg_map_del(const char *domain,
                       u16 family,
                       const void *addr,
                       const void *mask,
                       struct netlbl_audit *audit_info);
int netlbl_cfg_unlbl_map_add(const char *domain,
                             u16 family,
                             const void *addr,
                             const void *mask,
                             struct netlbl_audit *audit_info);
int netlbl_cfg_unlbl_static_add(struct net *net,
                                const char *dev_name,
                                const void *addr,
                                const void *mask,
                                u16 family,
                                u32 secid,
                                struct netlbl_audit *audit_info);
int netlbl_cfg_unlbl_static_del(struct net *net,
                                const char *dev_name,
                                const void *addr,
                                const void *mask,
                                u16 family,
                                struct netlbl_audit *audit_info);
/* CIPSOv4 DOI definitions and domain mappings */
int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
                           struct netlbl_audit *audit_info);
void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info);
int netlbl_cfg_cipsov4_map_add(u32 doi,
                               const char *domain,
                               const struct in_addr *addr,
                               const struct in_addr *mask,
                               struct netlbl_audit *audit_info);
/* CALIPSO DOI definitions and domain mappings */
int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
                           struct netlbl_audit *audit_info);
void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info);
int netlbl_cfg_calipso_map_add(u32 doi,
                               const char *domain,
                               const struct in6_addr *addr,
                               const struct in6_addr *mask,
                               struct netlbl_audit *audit_info);
/*
 * LSM security attribute operations
 *
 * Operations on the sparse category bitmap (struct netlbl_lsm_catmap).  The
 * setters take a pointer to the list head and allocation flags.
 * NOTE(review): that suggests they may allocate new nodes -- confirm against
 * the implementation.
 */
int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset);
int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset);
int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
                          u32 *offset,
                          unsigned long *bitmap);
int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
                         u32 bit,
                         gfp_t flags);
int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
                         u32 start,
                         u32 end,
                         gfp_t flags);
int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
                          u32 offset,
                          unsigned long bitmap,
                          gfp_t flags);

/* Bitmap functions
 * These work on a plain byte array rather than on a catmap list; no inline
 * stubs are provided for them when CONFIG_NETLABEL is not set.
 */
int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
                       u32 offset, u8 state);
void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state);

/*
 * LSM protocol operations (NetLabel LSM/kernel API)
 */
int netlbl_enabled(void);
int netlbl_sock_setattr(struct sock *sk,
                        u16 family,
                        const struct netlbl_lsm_secattr *secattr,
                        bool sk_locked);
void netlbl_sock_delattr(struct sock *sk);
int netlbl_sock_getattr(struct sock *sk,
                        struct netlbl_lsm_secattr *secattr);
int netlbl_conn_setattr(struct sock *sk,
                        struct sockaddr *addr,
                        const struct netlbl_lsm_secattr *secattr);
int netlbl_req_setattr(struct request_sock *req,
                       const struct netlbl_lsm_secattr *secattr);
void netlbl_req_delattr(struct request_sock *req);
int netlbl_skbuff_setattr(struct sk_buff *skb,
                          u16 family,
                          const struct netlbl_lsm_secattr *secattr);
int netlbl_skbuff_getattr(const struct sk_buff *skb,
                          u16 family,
                          struct netlbl_lsm_secattr *secattr);
void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway);
bool netlbl_sk_lock_check(struct sock *sk);

/*
 * LSM label mapping cache operations
 */
void netlbl_cache_invalidate(void);
int netlbl_cache_add(const struct sk_buff *skb, u16 family,
                     const struct netlbl_lsm_secattr *secattr);

/*
 * Protocol engine operations
 */
struct audit_buffer *netlbl_audit_start(int type,
                                        struct netlbl_audit *audit_info);
#else
/* Stub used when CONFIG_NETLABEL is not set: always fails with -ENOSYS. */
static inline int netlbl_cfg_map_del(const char *domain,
                                     u16 family,
                                     const void *addr,
                                     const void *mask,
                                     struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Stub used when CONFIG_NETLABEL is not set: always fails with -ENOSYS.
 *
 * @addr and @mask are const-qualified exactly as in the CONFIG_NETLABEL
 * prototype, so a caller passing pointers to const data builds without
 * qualifier warnings whichever way the option is set.
 */
static inline int netlbl_cfg_unlbl_map_add(const char *domain,
                                           u16 family,
                                           const void *addr,
                                           const void *mask,
                                           struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Configuration stubs used when CONFIG_NETLABEL is not set.  The calls that
 * return a value fail with -ENOSYS; the void "del" calls do nothing.
 */
static inline int netlbl_cfg_unlbl_static_add(struct net *net,
                                              const char *dev_name,
                                              const void *addr,
                                              const void *mask,
                                              u16 family,
                                              u32 secid,
                                              struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}

static inline int netlbl_cfg_unlbl_static_del(struct net *net,
                                              const char *dev_name,
                                              const void *addr,
                                              const void *mask,
                                              u16 family,
                                              struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}

/* CIPSOv4 */
static inline int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
                                         struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}

static inline void netlbl_cfg_cipsov4_del(u32 doi,
                                          struct netlbl_audit *audit_info)
{
}

static inline int netlbl_cfg_cipsov4_map_add(u32 doi,
                                             const char *domain,
                                             const struct in_addr *addr,
                                             const struct in_addr *mask,
                                             struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}

/* CALIPSO */
static inline int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
                                         struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}

static inline void netlbl_cfg_calipso_del(u32 doi,
                                          struct netlbl_audit *audit_info)
{
}

static inline int netlbl_cfg_calipso_map_add(u32 doi,
                                             const char *domain,
                                             const struct in6_addr *addr,
                                             const struct in6_addr *mask,
                                             struct netlbl_audit *audit_info)
{
        return -ENOSYS;
}
/*
 * Category bitmap stubs used when CONFIG_NETLABEL is not set.  The two walk
 * helpers report -ENOENT; the get/set helpers return 0 and leave all of
 * their arguments, including the output pointers, untouched.
 */
static inline int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap,
                                     u32 offset)
{
        return -ENOENT;
}
static inline int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap,
                                        u32 offset)
{
        return -ENOENT;
}
/* note: neither *offset nor *bitmap is written by this stub */
static inline int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
                                        u32 *offset,
                                        unsigned long *bitmap)
{
        return 0;
}
static inline int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
                                       u32 bit,
                                       gfp_t flags)
{
        return 0;
}
static inline int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
                                       u32 start,
                                       u32 end,
                                       gfp_t flags)
{
        return 0;
}
static inline int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
                                        u32 offset,
                                        unsigned long bitmap,
                                        gfp_t flags)
{
        return 0;
}
/*
 * Protocol operation stubs used when CONFIG_NETLABEL is not set:
 * netlbl_enabled() reports 0, the attribute get/set calls fail with -ENOSYS
 * and the void "del" calls do nothing.
 */
static inline int netlbl_enabled(void)
{
        return 0;
}

static inline int netlbl_sock_setattr(struct sock *sk,
                                      u16 family,
                                      const struct netlbl_lsm_secattr *secattr,
                                      bool sk_locked)
{
        return -ENOSYS;
}

static inline void netlbl_sock_delattr(struct sock *sk)
{
}

static inline int netlbl_sock_getattr(struct sock *sk,
                                      struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}

static inline int netlbl_conn_setattr(struct sock *sk,
                                      struct sockaddr *addr,
                                      const struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}

static inline int netlbl_req_setattr(struct request_sock *req,
                                     const struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}

static inline void netlbl_req_delattr(struct request_sock *req)
{
}

static inline int netlbl_skbuff_setattr(struct sk_buff *skb,
                                        u16 family,
                                        const struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}

static inline int netlbl_skbuff_getattr(const struct sk_buff *skb,
                                        u16 family,
                                        struct netlbl_lsm_secattr *secattr)
{
        return -ENOSYS;
}
/*
 * Stub used when CONFIG_NETLABEL is not set: does nothing.
 *
 * The parameter list matches the CONFIG_NETLABEL prototype, including
 * @family, so a caller written against the real function also compiles
 * when the option is off.
 */
static inline void netlbl_skbuff_err(struct sk_buff *skb,
                                     u16 family,
                                     int error,
                                     int gateway)
{
}
/*
 * Remaining stubs used when CONFIG_NETLABEL is not set: cache invalidation
 * does nothing, cache_add reports success (0), no audit buffer is ever
 * started (NULL) and the socket lock check always passes.
 */
static inline void netlbl_cache_invalidate(void)
{
}

static inline int netlbl_cache_add(const struct sk_buff *skb, u16 family,
                                   const struct netlbl_lsm_secattr *secattr)
{
        return 0;
}

static inline struct audit_buffer *netlbl_audit_start(int type,
                                        struct netlbl_audit *audit_info)
{
        return NULL;
}

static inline bool netlbl_sk_lock_check(struct sock *sk)
{
        return true;
}
#endif /* CONFIG_NETLABEL */

/*
 * Hands the CALIPSO engine's operations table (struct netlbl_calipso_ops) to
 * the NetLabel core.  Declared whether or not CONFIG_NETLABEL is set.
 * NOTE(review): the meaning of the returned pointer is not visible in this
 * header -- confirm against the implementation.
 */
const struct netlbl_calipso_ops *
netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops);

#endif /* _NETLABEL_H */



































































   13 



   13 












   13 



   13 










   13 




   13 






































   13 

   13 


   13 





   13 
   12 



   13 





   13 






   13 




   12 
   13 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/*
 * kmod - the kernel module loader
 *
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/binfmts.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/resource.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>

#include <trace/events/module.h>
#include "internal.h"

/*
 * Assuming:
 *
 * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
 *                       (u64) THREAD_SIZE * 8UL);
 *
 * Needing fewer than 50 threads would mean we're dealing with systems
 * smaller than 3200 pages. This assumes you have ~13M of memory, and this
 * would only be an upper limit, after which the OOM killer would take
 * effect. Systems like these are very unlikely if modules are enabled.
 *
 * The semaphore below is taken with down_timeout() and released with up()
 * in __request_module(), which is how the limit is enforced.
 */
#define MAX_KMOD_CONCURRENT 50
static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT);

/*
 * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
 * running at the same time without returning. When this happens we
 * believe you've somehow ended up with a recursive module dependency
 * creating a loop.
 *
 * We have no option but to fail.
 *
 * Userspace should proactively try to detect and prevent these.
 *
 * The value is in seconds; __request_module() multiplies it by HZ before
 * passing it to down_timeout().
 */
#define MAX_KMOD_ALL_BUSY_TIMEOUT 5

/*
 * modprobe_path is set via /proc/sys.
 *
 * It is used both as the program to execute and as argv[0] in
 * call_modprobe().  An empty string makes __request_module() return -ENOENT
 * without running anything.
 */
char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;

/*
 * Release what call_modprobe() allocated for one helper invocation: the
 * duplicated module name stored in argv[3], then the argv array itself.
 */
static void free_modprobe_argv(struct subprocess_info *info)
{
        char **argv = info->argv;

        kfree(argv[3]); /* the kstrdup()'d name, see call_modprobe() */
        kfree(argv);
}

/*
 * Run the modprobe helper for @orig_module_name.
 *
 * Builds the argument vector { modprobe_path, "-q", "--", <name>, NULL }
 * around a private copy of the name and executes it through the usermode
 * helper with @wait | UMH_KILLABLE.  Once the helper has been set up, the
 * argv array and the name copy are released by free_modprobe_argv(), which
 * is handed to call_usermodehelper_setup() for that purpose.
 *
 * Returns the result of call_usermodehelper_exec(), or -ENOMEM when an
 * allocation or the helper setup fails.  Either way the result is also
 * reported through kmod_dup_request_announce().
 */
static int call_modprobe(char *orig_module_name, int wait)
{
        static char *envp[] = {
                "HOME=/",
                "TERM=linux",
                "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
                NULL
        };
        struct subprocess_info *helper;
        char *name_copy;
        char **args;
        int rc;

        args = kmalloc(sizeof(char *[5]), GFP_KERNEL);
        if (!args)
                goto fail;

        name_copy = kstrdup(orig_module_name, GFP_KERNEL);
        if (!name_copy)
                goto fail_free_args;

        args[0] = modprobe_path;
        args[1] = "-q";
        args[2] = "--";
        args[3] = name_copy;        /* freed by free_modprobe_argv() */
        args[4] = NULL;

        helper = call_usermodehelper_setup(modprobe_path, args, envp,
                                           GFP_KERNEL, NULL,
                                           free_modprobe_argv, NULL);
        if (!helper)
                goto fail_free_name;

        rc = call_usermodehelper_exec(helper, wait | UMH_KILLABLE);
        kmod_dup_request_announce(orig_module_name, rc);
        return rc;

fail_free_name:
        kfree(name_copy);
fail_free_args:
        kfree(args);
fail:
        kmod_dup_request_announce(orig_module_name, -ENOMEM);
        return -ENOMEM;
}

/**
 * __request_module - try to load a kernel module
 * @wait: wait (or not) for the operation to complete
 * @fmt: printf style format string for the name of the module
 * @...: arguments as specified in the format string
 *
 * Load a module using the user mode module loader. The function returns
 * zero on success or a negative errno code or positive exit code from
 * "modprobe" on failure. Note that a successful module load does not mean
 * the module did not then unload and exit on an error of its own. Callers
 * must check that the service they requested is now available not blindly
 * invoke it.
 *
 * If module auto-loading support is disabled then this function
 * simply returns -ENOENT. The same value is returned when modprobe_path
 * is empty.
 *
 * Other errors produced here before modprobe is run: -ENAMETOOLONG when the
 * formatted name does not fit in MODULE_NAME_LEN, the error returned by
 * security_kernel_module_request() when the request is refused, and the
 * error returned by down_timeout() when all MAX_KMOD_CONCURRENT slots stay
 * busy for MAX_KMOD_ALL_BUSY_TIMEOUT seconds.
 */
int __request_module(bool wait, const char *fmt, ...)
{
        va_list args;
        char module_name[MODULE_NAME_LEN];
        int ret, dup_ret;

        /*
         * We don't allow synchronous module loading from async.  Module
         * init may invoke async_synchronize_full() which will end up
         * waiting for this task which already is waiting for the module
         * loading to complete, leading to a deadlock.
         */
        WARN_ON_ONCE(wait && current_is_async());

        if (!modprobe_path[0])
                return -ENOENT;

        va_start(args, fmt);
        ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
        va_end(args);
        if (ret >= MODULE_NAME_LEN)
                return -ENAMETOOLONG;

        ret = security_kernel_module_request(module_name);
        if (ret)
                return ret;

        /* take one of the MAX_KMOD_CONCURRENT slots, released at "out" */
        ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
        if (ret) {
                pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now\n",
                                    module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
                return ret;
        }

        trace_module_request(module_name, wait, _RET_IP_);

        /* a duplicate request reuses the result stored in dup_ret */
        if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) {
                ret = dup_ret;
                goto out;
        }

        ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);

out:
        up(&kmod_concurrent_max);

        return ret;
}
EXPORT_SYMBOL(__request_module);




































































   13 















   13 


















































   61 




   64 
   62 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/fsnotify.h>
#include <linux/page_idle.h>

#include <linux/uaccess.h>

#include <kunit/visibility.h>

#include "internal.h"
#include "swap.h"

/**
 * kfree_const - free memory unless it lives in .rodata
 * @x: pointer to the memory
 *
 * Pairs with kstrdup_const(): @x is passed to kfree() only when
 * is_kernel_rodata() says it is not a .rodata address.
 */
void kfree_const(const void *x)
{
        /* .rodata addresses are never handed to kfree(). */
        if (is_kernel_rodata((unsigned long)x))
                return;

        kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * __kmemdup_nul - Copy @len bytes of @s into a new NUL-terminated buffer.
 * @s: The data to copy; it does not have to be NUL-terminated
 * @len: Number of bytes to copy, excluding the NUL terminator
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
        /* One extra byte is reserved for the NUL terminator. */
        char *copy = kmalloc_track_caller(len + 1, gfp);

        if (copy) {
                memcpy(copy, s, len);
                /* Terminate unconditionally; @s itself may lack a NUL. */
                copy[len] = '\0';
        }

        return copy;
}

/**
 * kstrdup - duplicate a NUL-terminated string into freshly allocated memory
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s, or %NULL if @s is %NULL or the
 * allocation fails
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
        if (!s)
                return NULL;

        return __kmemdup_nul(s, strlen(s), gfp);
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - duplicate a const string only when that is necessary
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: The result should be released with kfree_const() and must not be
 * passed to krealloc().
 *
 * Return: @s itself when it is located in the .rodata section, otherwise
 * whatever kstrdup() returns for it.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
        return is_kernel_rodata((unsigned long)s) ? s : kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - duplicate at most @max characters of a string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * The copied length is strnlen(@s, @max); the result is always
 * NUL-terminated.
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s, or %NULL if @s is %NULL or the
 * allocation fails
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
        if (!s)
                return NULL;

        return __kmemdup_nul(s, strnlen(s, max), gfp);
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * The allocation is attributed to the caller of this function (_RET_IP_).
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
        void *dup = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE,
                                                     _RET_IP_);

        if (!dup)
                return NULL;

        memcpy(dup, src, len);
        return dup;
}
EXPORT_SYMBOL(kmemdup_noprof);

/**
 * kmemdup_array - duplicate a given array.
 *
 * @src: array to duplicate.
 * @count: number of elements to duplicate from array.
 * @element_size: size of each element of array.
 * @gfp: GFP mask to use.
 *
 * The total byte count is computed with size_mul() and handed to kmemdup().
 *
 * Return: duplicated array of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
{
        size_t bytes = size_mul(element_size, count);

        return kmemdup(src, bytes, gfp);
}
EXPORT_SYMBOL(kmemdup_array);

/**
 * kvmemdup - duplicate region of memory using kvmalloc()
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may be not physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
        void *dup = kvmalloc(len, gfp);

        if (!dup)
                return NULL;

        memcpy(dup, src, len);
        return dup;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination, or %NULL if
 * @s is %NULL or the allocation fails
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
        if (!s)
                return NULL;

        return __kmemdup_nul(s, len, gfp);
}
EXPORT_SYMBOL(kmemdup_nul);

/*
 * Bucket set used by memdup_user(), vmemdup_user() and memdup_user_nul()
 * for their allocations.  Written once by init_user_buckets().
 */
static kmem_buckets *user_buckets __ro_after_init;

/*
 * Create the "memdup_user" bucket set at subsys_initcall time.  The result
 * of kmem_buckets_create() is stored without being checked and the initcall
 * always reports success.
 */
static int __init init_user_buckets(void)
{
        user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);

        return 0;
}
subsys_initcall(init_user_buckets);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the kernel copy on success; ERR_PTR(-ENOMEM) if the allocation
 * fails or ERR_PTR(-EFAULT) if the user copy fails.  Result is physically
 * contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
        void *kbuf;

        kbuf = kmem_buckets_alloc_track_caller(user_buckets, len,
                                               GFP_USER | __GFP_NOWARN);
        if (!kbuf)
                return ERR_PTR(-ENOMEM);

        /* copy_from_user() returns zero when everything was copied. */
        if (!copy_from_user(kbuf, src, len))
                return kbuf;

        kfree(kbuf);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the kernel copy on success; ERR_PTR(-ENOMEM) if the allocation
 * fails or ERR_PTR(-EFAULT) if the user copy fails.  Result may be not
 * physically contiguous.  Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
        void *kbuf = kmem_buckets_valloc(user_buckets, len, GFP_USER);

        if (!kbuf)
                return ERR_PTR(-ENOMEM);

        /* copy_from_user() returns zero when everything was copied. */
        if (!copy_from_user(kbuf, src, len))
                return kbuf;

        kvfree(kbuf);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s, or an ERR_PTR(): -EFAULT when
 * strnlen_user() reports 0, -EINVAL when the reported length exceeds @n,
 * or the error from memdup_user().
 */
char *strndup_user(const char __user *s, long n)
{
        long nbytes = strnlen_user(s, n);
        char *copy;

        if (!nbytes)
                return ERR_PTR(-EFAULT);
        if (nbytes > n)
                return ERR_PTR(-EINVAL);

        copy = memdup_user(s, nbytes);
        if (!IS_ERR(copy))
                /* Force termination of the last copied byte. */
                copy[nbytes - 1] = '\0';

        return copy;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * The buffer is allocated with room for @len + 1 bytes; the extra byte
 * holds the terminator.
 *
 * Return: the kernel copy on success; ERR_PTR(-ENOMEM) if the allocation
 * fails or ERR_PTR(-EFAULT) if the user copy fails.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
        char *kbuf;

        kbuf = kmem_buckets_alloc_track_caller(user_buckets, len + 1,
                                               GFP_USER | __GFP_NOWARN);
        if (!kbuf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_user(kbuf, src, len)) {
                kbuf[len] = '\0';
                return kbuf;
        }

        kfree(kbuf);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(memdup_user_nul);

/*
 * Report whether current's stack pointer, as given by KSTK_ESP(), lies
 * within [vma->vm_start, vma->vm_end] (both ends inclusive).
 */
int vma_is_stack_for_current(const struct vm_area_struct *vma)
{
        struct task_struct * __maybe_unused t = current;

        if (!(vma->vm_start <= KSTK_ESP(t)))
                return 0;

        return vma->vm_end >= KSTK_ESP(t);
}

/*
 * Replace the file backing @vma with @file.  Only valid during initial VMA
 * setup, and must not be used on an anonymous vma.
 *
 * A reference is taken on the new file and the reference on the file that
 * was previously installed is dropped.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
        struct file *old;

        get_file(file);

        old = vma->vm_file;
        vma->vm_file = file;

        fput(old);
}
EXPORT_SYMBOL(vma_set_file);

/*
 * Default stack randomization mask, used only when no definition of
 * STACK_RND_MASK is already visible at this point.  randomize_stack_top()
 * ANDs a random long with it and shifts the result left by PAGE_SHIFT.
 */
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

/*
 * Page-align @stack_top and, when current has PF_RANDOMIZE set, displace it
 * by a random number of pages bounded by STACK_RND_MASK.  The displacement
 * is added with CONFIG_STACK_GROWSUP and subtracted otherwise.
 */
unsigned long randomize_stack_top(unsigned long stack_top)
{
        unsigned long offset = 0;

        if (current->flags & PF_RANDOMIZE)
                offset = (get_random_long() & (STACK_RND_MASK)) << PAGE_SHIFT;

#ifdef CONFIG_STACK_GROWSUP
        return PAGE_ALIGN(stack_top) + offset;
#else
        return PAGE_ALIGN(stack_top) - offset;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start:        The smallest acceptable address the caller will take.
 * @range:        The size of the area, starting at @start, within which the
 *                random address must fall.
 *
 * An unaligned @start is rounded up to the next page boundary and @range
 * is shrunk by the same amount.  If @start + @range would overflow,
 * @range is capped.
 *
 * Return: A page aligned address within [start, start + range).  When the
 * remaining range is smaller than one page, the (aligned) @start is
 * returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
        const unsigned long aligned = PAGE_ALIGN(start);
        unsigned long nr_pages;

        if (aligned != start) {
                range -= aligned - start;
                start = aligned;
        }

        if (start > ULONG_MAX - range)
                range = ULONG_MAX - start;

        nr_pages = range >> PAGE_SHIFT;
        if (!nr_pages)
                return start;

        /* Pick a page index in [0, nr_pages) and turn it into an offset. */
        return start + ((get_random_long() % nr_pages) << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/*
 * Weak default: pick a random page-aligned address starting at mm->brk.
 * The window is SZ_32M when the kernel is not 64-bit or the current task is
 * a compat task, and SZ_1G otherwise.
 */
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long window = SZ_1G;

        /* Is the current task 32bit ? */
        if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
                window = SZ_32M;

        return randomize_page(mm->brk, window);
}

/*
 * Return a random mmap offset: a random value limited to mmap_rnd_bits
 * bits (mmap_rnd_compat_bits for compat tasks, when that option is built
 * in), shifted left by PAGE_SHIFT.
 */
unsigned long arch_mmap_rnd(void)
{
        int bits = mmap_rnd_bits;
        unsigned long mask;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
        if (is_compat_task())
                bits = mmap_rnd_compat_bits;
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */

        mask = (1UL << bits) - 1;

        return (get_random_long() & mask) << PAGE_SHIFT;
}

/*
 * Decide whether the legacy mmap layout should be used.  Returns 1 when
 * the personality or the stack rlimit demands it, otherwise the value of
 * sysctl_legacy_va_layout.
 */
static int mmap_is_legacy(const struct rlimit *rlim_stack)
{
        if (current->personality & ADDR_COMPAT_LAYOUT)
                return 1;

        /*
         * On parisc the stack always grows up - so an unlimited stack
         * should not be an indicator to use the legacy memory layout.
         */
        if (!IS_ENABLED(CONFIG_STACK_GROWSUP) &&
            rlim_stack->rlim_cur == RLIM_INFINITY)
                return 1;

        return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 *
 * mmap_base() clamps the stack gap to these bounds: MIN_GAP is 128MB and
 * MAX_GAP is five sixths of STACK_TOP.
 */
#define MIN_GAP                (SZ_128M)
#define MAX_GAP                (STACK_TOP / 6 * 5)

/*
 * Compute the base address of the top-down mmap area.
 *
 * @rnd:        random offset to subtract from the result
 * @rlim_stack: stack rlimit; rlim_cur seeds the gap left for the stack
 *
 * With CONFIG_STACK_GROWSUP the base is mmap_upper_limit(rlim_stack) - @rnd,
 * page-aligned downwards.  Otherwise it is STACK_TOP - gap - @rnd passed
 * through PAGE_ALIGN(), where gap is the stack rlimit plus padding, clamped
 * by MIN_GAP and MAX_GAP.
 */
static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
        /*
         * For an upwards growing stack the calculation is much simpler.
         * Memory for the maximum stack size is reserved at the top of the
         * task. mmap_base starts directly below the stack and grows
         * downwards.
         */
        return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
        unsigned long gap = rlim_stack->rlim_cur;
        /* Padding starts out as the stack guard gap. */
        unsigned long pad = stack_guard_gap;

        /* Account for stack randomization if necessary */
        if (current->flags & PF_RANDOMIZE)
                pad += (STACK_RND_MASK << PAGE_SHIFT);

        /* Values close to RLIM_INFINITY can overflow. */
        if (gap + pad > gap)
                gap += pad;

        /* The lower clamp is applied only when MIN_GAP is below MAX_GAP. */
        if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
                gap = MIN_GAP;
        else if (gap > MAX_GAP)
                gap = MAX_GAP;

        return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

/*
 * Choose the mmap layout for @mm and record it in mm->mmap_base and the
 * MMF_TOPDOWN flag.  A random offset from arch_mmap_rnd() is applied when
 * current has PF_RANDOMIZE set.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
        const unsigned long rnd =
                (current->flags & PF_RANDOMIZE) ? arch_mmap_rnd() : 0UL;

        if (!mmap_is_legacy(rlim_stack)) {
                /* Top-down: base derived from the stack gap. */
                mm->mmap_base = mmap_base(rnd, rlim_stack);
                mm_flags_set(MMF_TOPDOWN, mm);
                return;
        }

        /* Legacy: base is TASK_UNMAPPED_BASE plus the random offset. */
        mm->mmap_base = TASK_UNMAPPED_BASE + rnd;
        mm_flags_clear(MMF_TOPDOWN, mm);
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Fallback for CONFIG_MMU builds that neither select the default top-down
 * layout nor define HAVE_ARCH_PICK_MMAP_LAYOUT: mmap_base is set to
 * TASK_UNMAPPED_BASE with no random offset and MMF_TOPDOWN is cleared.
 * @rlim_stack is not used.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
        mm->mmap_base = TASK_UNMAPPED_BASE;
        mm_flags_clear(MMF_TOPDOWN, mm);
}
#endif
#ifdef CONFIG_MMU
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * When decrementing, a warning is emitted once if @pages exceeds the
 * current locked_vm; the subtraction is performed regardless.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        const struct task_struct *task, bool bypass_rlim)
{
        unsigned long locked_vm;
        int ret = 0;

        mmap_assert_write_locked(mm);

        /* Snapshot taken before any update; also used in the debug line. */
        locked_vm = mm->locked_vm;

        if (!inc) {
                WARN_ON_ONCE(pages > locked_vm);
                mm->locked_vm = locked_vm - pages;
        } else if (!bypass_rlim &&
                   locked_vm + pages >
                   (task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT)) {
                /* The rlimit is in bytes; it is compared in pages. */
                ret = -ENOMEM;
        } else {
                mm->locked_vm = locked_vm + pages;
        }

        pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
                 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
                 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
                 ret ? " - exceeded" : "");

        return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Takes mmap_lock for writing around __account_locked_vm(), accounting
 * against current.  The RLIMIT_MEMLOCK check is bypassed when current is
 * capable(CAP_IPC_LOCK).
 *
 * Return:
 * * 0       on success, or if mm is NULL or @pages is 0
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
        bool bypass_rlim;
        int ret;

        if (!mm || !pages)
                return 0;

        mmap_write_lock(mm);
        bypass_rlim = capable(CAP_IPC_LOCK);
        ret = __account_locked_vm(mm, pages, inc, current, bypass_rlim);
        mmap_write_unlock(mm);

        return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

/*
 * Map @file into current's address space at page offset @pgoff.
 *
 * The security and fsnotify permission hooks run first; a non-zero result
 * from either is returned as-is.  do_mmap() is then called with mmap_lock
 * held for writing, after which pending userfaultfd unmap events are
 * completed and, if do_mmap() requested it, the range is populated.
 *
 * Returns the value of do_mmap(), a hook error, or -EINTR if taking
 * mmap_lock was interrupted by a fatal signal.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long pgoff)
{
        struct mm_struct *mm = current->mm;
        loff_t off = (loff_t)pgoff << PAGE_SHIFT;
        unsigned long populate;
        unsigned long ret;
        LIST_HEAD(uf);

        ret = security_mmap_file(file, prot, flag);
        if (ret)
                return ret;

        ret = fsnotify_mmap_perm(file, prot, off, len);
        if (ret)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf);
        mmap_write_unlock(mm);

        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(ret, populate);

        return ret;
}

/*
 * Perform a userland memory mapping into the current process address space.
 * See the comment for do_mmap() for more details on the operation itself.
 *
 * Compared with do_mmap():
 *
 * a. The caller passes a byte @offset instead of a pgoff; it is rejected
 *    with -EINVAL if @offset + PAGE_ALIGN(@len) overflows or if @offset is
 *    not page aligned.
 * b. mmap locking is performed on the caller's behalf.
 * c. Userfaultfd unmap events and memory population are handled.
 *
 * The net effect is essentially the same work as userland invoking
 * mmap (2).
 *
 * Returns either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long offset)
{
        if (unlikely(offset + PAGE_ALIGN(len) < offset) ||
            unlikely(offset_in_page(offset)))
                return -EINVAL;

        return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: the allocation, or %NULL if @n * @size overflows or the
 * allocation fails.
 */
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
        size_t total;
        bool overflow = check_mul_overflow(n, size, &total);

        if (unlikely(overflow))
                return NULL;

        return __vmalloc_noprof(total, flags);
}
EXPORT_SYMBOL(__vmalloc_array_noprof);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Same as __vmalloc_array() with %GFP_KERNEL.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
        const gfp_t flags = GFP_KERNEL;

        return __vmalloc_array_noprof(n, size, flags);
}
EXPORT_SYMBOL(vmalloc_array_noprof);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Same as __vmalloc_array() with %__GFP_ZERO added to @flags.
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
        const gfp_t zeroing = flags | __GFP_ZERO;

        return __vmalloc_array_noprof(n, size, zeroing);
}
EXPORT_SYMBOL(__vcalloc_noprof);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Same as __vmalloc_array() with %GFP_KERNEL | %__GFP_ZERO.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
        const gfp_t flags = GFP_KERNEL | __GFP_ZERO;

        return __vmalloc_array_noprof(n, size, flags);
}
EXPORT_SYMBOL(vcalloc_noprof);

/*
 * Return the anon_vma encoded in folio->mapping, or NULL when the mapping
 * flag bits are not exactly FOLIO_MAPPING_ANON.
 */
struct anon_vma *folio_anon_vma(const struct folio *folio)
{
        const unsigned long mapping = (unsigned long)folio->mapping;

        if ((mapping & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_ANON)
                /* Strip the tag to recover the pointer. */
                return (void *)(mapping - FOLIO_MAPPING_ANON);

        return NULL;
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * Page cache folios yield the mapping they belong to.  Swap cache folios
 * yield the swap mapping the folio is stored in, which is not the mapping
 * of the swap file or swap device holding the data.
 *
 * It is fine to call this for folios that are in neither cache; %NULL is
 * returned for them.
 */
struct address_space *folio_mapping(const struct folio *folio)
{
        struct address_space *mapping;

        /* This happens if someone calls flush_dcache_page on slab page */
        if (unlikely(folio_test_slab(folio)))
                return NULL;

        if (unlikely(folio_test_swapcache(folio)))
                return swap_address_space(folio->swap);

        mapping = folio->mapping;

        /* Any flag bit set means this is not a plain address_space. */
        return ((unsigned long)mapping & FOLIO_MAPPING_FLAGS) ? NULL : mapping;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * Every page of @src is copied to the page of @dst at the same index.
 * The caller must have validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
        const long nr_pages = folio_nr_pages(src);
        long idx = 0;

        do {
                /* Offer to reschedule between pages, never before the first. */
                if (idx)
                        cond_resched();
                copy_highpage(folio_page(dst, idx), folio_page(src, idx));
        } while (++idx != nr_pages);
}
EXPORT_SYMBOL(folio_copy);

/*
 * Copy @src to @dst page by page with copy_mc_highpage().
 *
 * Returns 0 when every page was copied, or -EHWPOISON as soon as
 * copy_mc_highpage() reports a failure; later pages are then left
 * untouched.
 */
int folio_mc_copy(struct folio *dst, struct folio *src)
{
        const long nr_pages = folio_nr_pages(src);
        long idx = 0;

        do {
                /* Offer to reschedule between pages, never before the first. */
                if (idx)
                        cond_resched();
                if (copy_mc_highpage(folio_page(dst, idx), folio_page(src, idx)))
                        return -EHWPOISON;
        } while (++idx != nr_pages);

        return 0;
}
EXPORT_SYMBOL(folio_mc_copy);

/* Overcommit policy; starts as OVERCOMMIT_GUESS. */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
/* Percentage of RAM used by vm_commit_limit() when overcommit_kbytes is 0. */
static int sysctl_overcommit_ratio __read_mostly = 50;
/* When non-zero, vm_commit_limit() uses this instead of the ratio. */
static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
/* Reserves, in kbytes, subtracted from the limit in __vm_enough_memory(). */
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

#ifdef CONFIG_SYSCTL

/*
 * sysctl handler for overcommit_ratio.  A successful write also resets
 * sysctl_overcommit_kbytes to 0, so that the ratio is what
 * vm_commit_limit() uses.
 */
static int overcommit_ratio_handler(const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec(table, write, buffer, lenp, ppos);

        if (write && !err)
                sysctl_overcommit_kbytes = 0;

        return err;
}

/*
 * Work callback run on each CPU by overcommit_policy_handler(): calls
 * percpu_counter_sync() on vm_committed_as.  @dummy is unused.
 */
static void sync_overcommit_as(struct work_struct *dummy)
{
        percpu_counter_sync(&vm_committed_as);
}

/*
 * sysctl handler for overcommit_memory.  Reads go straight to
 * proc_dointvec_minmax(); writes are parsed into a local first so the
 * policy can be switched in a controlled order.
 */
static int overcommit_policy_handler(const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct ctl_table t;
        int new_policy = -1;
        int ret;

        if (!write)
                return proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /* Parse into new_policy through a copy of the table entry. */
        t = *table;
        t.data = &new_policy;
        ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
        if (ret || new_policy == -1)
                return ret;

        /*
         * Under the loose policies (OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS) the
         * per-CPU deviation of the committed counter can be large.  Before
         * moving to the strict OVERCOMMIT_NEVER that deviation has to be
         * reduced, and to avoid a possible race (even though switching to
         * OVERCOMMIT_NEVER is not expected to happen often) the steps are
         * performed in this order:
         *        1. change the batch
         *        2. sync the percpu count on each CPU
         *        3. switch the policy
         */
        mm_compute_batch(new_policy);
        if (new_policy == OVERCOMMIT_NEVER)
                schedule_on_each_cpu(sync_overcommit_as);
        sysctl_overcommit_memory = new_policy;

        return ret;
}

/*
 * sysctl handler for overcommit_kbytes.  A successful write also resets
 * sysctl_overcommit_ratio to 0.
 */
static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (write && !err)
                sysctl_overcommit_ratio = 0;

        return err;
}

/*
 * sysctl entries registered under "vm" by init_vm_util_sysctls().  All are
 * mode 0644.
 */
static const struct ctl_table util_sysctl_table[] = {
        {
                /* Policy selector; extra1/extra2 bound it to 0..2. */
                .procname        = "overcommit_memory",
                .data                = &sysctl_overcommit_memory,
                .maxlen                = sizeof(sysctl_overcommit_memory),
                .mode                = 0644,
                .proc_handler        = overcommit_policy_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
        {
                /* Writing this clears overcommit_kbytes (see the handler). */
                .procname        = "overcommit_ratio",
                .data                = &sysctl_overcommit_ratio,
                .maxlen                = sizeof(sysctl_overcommit_ratio),
                .mode                = 0644,
                .proc_handler        = overcommit_ratio_handler,
        },
        {
                /* Writing this clears overcommit_ratio (see the handler). */
                .procname        = "overcommit_kbytes",
                .data                = &sysctl_overcommit_kbytes,
                .maxlen                = sizeof(sysctl_overcommit_kbytes),
                .mode                = 0644,
                .proc_handler        = overcommit_kbytes_handler,
        },
        {
                .procname        = "user_reserve_kbytes",
                .data                = &sysctl_user_reserve_kbytes,
                .maxlen                = sizeof(sysctl_user_reserve_kbytes),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
        {
                .procname        = "admin_reserve_kbytes",
                .data                = &sysctl_admin_reserve_kbytes,
                .maxlen                = sizeof(sysctl_admin_reserve_kbytes),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
};

/*
 * Register util_sysctl_table under the "vm" directory at subsys_initcall
 * time.  Always returns 0.
 */
static int __init init_vm_util_sysctls(void)
{
        register_sysctl_init("vm", util_sysctl_table);
        return 0;
}
subsys_initcall(init_vm_util_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used.
 *
 * The limit, in pages, is total_swap_pages plus either
 * sysctl_overcommit_kbytes converted to pages (when it is non-zero) or
 * sysctl_overcommit_ratio percent of the RAM that is not taken by hugetlb
 * pages.
 */
unsigned long vm_commit_limit(void)
{
        unsigned long base;

        if (!sysctl_overcommit_kbytes)
                base = ((totalram_pages() - hugetlb_total_pages())
                        * sysctl_overcommit_ratio / 100);
        else
                base = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);

        return base + total_swap_pages;
}

/*
 * Keep vm_committed_as in its own cacheline, not shared with other
 * variables: it can be updated frequently by several CPUs.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for big
 * platforms like a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 *
 * Returns the result of percpu_counter_sum_positive() on vm_committed_as.
 */
unsigned long vm_memory_committed(void)
{
        return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin)
{
        long allowed;
        unsigned long bytes_failed;

        /*
         * Charge the request up front. The failure path at "error" undoes
         * this with vm_unacct_memory() before returning -ENOMEM; the success
         * paths leave the charge in place.
         */
        vm_acct_memory(pages);

        /*
         * Sometimes we want to use more memory than we have
         */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        /*
         * Heuristic policy: only refuse a single request that is larger
         * than all of RAM plus all of swap.
         */
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                if (pages > totalram_pages() + total_swap_pages)
                        goto error;
                return 0;
        }

        /* Any other policy value: enforce the computed commit limit. */
        allowed = vm_commit_limit();
        /*
         * Reserve some for root
         */
        if (!cap_sys_admin)
                allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

        /*
         * Don't let a single process grow so big a user can't recover
         */
        if (mm) {
                /* sysctl value is in kilobytes; convert it to pages. */
                long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

                /* Hold back the smaller of 1/32 of total_vm and the reserve. */
                allowed -= min_t(long, mm->total_vm / 32, reserve);
        }

        /*
         * The counter already includes the pages charged above, so the
         * request succeeds only while the total stays strictly below
         * the limit.
         */
        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
error:
        bytes_failed = pages << PAGE_SHIFT;
        pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
                            __func__, current->pid, current->comm, bytes_failed);
        /* Roll back the charge taken at function entry. */
        vm_unacct_memory(pages);

        return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task:     the task whose cmdline value to copy.
 * @buffer:   the buffer to copy to.
 * @buflen:   the length of the buffer. Larger cmdline values are truncated
 *            to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee a terminating NUL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
        int res = 0;
        unsigned int len;
        /* Released with mmput() at out_mm on every path where it is non-NULL. */
        struct mm_struct *mm = get_task_mm(task);
        unsigned long arg_start, arg_end, env_start, env_end;
        if (!mm)
                goto out;
        if (!mm->arg_end)
                goto out_mm;        /* Shh! No looking before we're done */

        /* Take a consistent copy of the arg/env boundaries under arg_lock. */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        len = arg_end - arg_start;

        /*
         * Truncate to the caller's buffer. len is unsigned, so buflen is
         * converted to unsigned for this comparison.
         * NOTE(review): callers are presumably expected to pass buflen >= 0.
         */
        if (len > buflen)
                len = buflen;

        /* Copy the argument area out of the target task's address space. */
        res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

        /*
         * If the nul at the end of args has been overwritten, then
         * assume application is using setproctitle(3).
         */
        if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
                len = strnlen(buffer, res);
                if (len < res) {
                        /* A NUL was found inside the copy: stop there. */
                        res = len;
                } else {
                        /*
                         * No NUL anywhere in the args area: continue into
                         * the environment area, limited to the space left
                         * in the buffer, then cut at the first NUL found.
                         */
                        len = env_end - env_start;
                        if (len > buflen - res)
                                len = buflen - res;
                        res += access_process_vm(task, env_start,
                                                 buffer+res, len,
                                                 FOLL_FORCE);
                        res = strnlen(buffer, res);
                }
        }
out_mm:
        mmput(mm);
out:
        return res;
}

/*
 * Compare the contents of two pages with memcmp() over PAGE_SIZE bytes.
 * Both pages are mapped with kmap_local_page() for the duration of the
 * comparison. Returns the memcmp() result.
 */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
        char *first = kmap_local_page(page1);
        char *second = kmap_local_page(page2);
        int diff = memcmp(first, second, PAGE_SIZE);

        /* Unmap in the reverse order of mapping. */
        kunmap_local(second);
        kunmap_local(first);

        return diff;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
        const char *type = "non-paged memory";

        /* Nothing more to print once either dumper reports it handled it. */
        if (kmem_dump_obj(object) || vmalloc_dump_obj(object))
                return;

        /* Otherwise fall back to a coarse classification of the address. */
        if (is_vmalloc_addr(object))
                type = "vmalloc memory";
        else if (virt_addr_valid(object))
                type = "non-slab/vmalloc memory";
        else if (!object)
                type = "NULL pointer";
        else if (object == ZERO_SIZE_PTR)
                type = "zero-size pointer";

        pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
/* Read/write semaphore shared by the four page_offline_*() helpers. */
static DECLARE_RWSEM(page_offline_rwsem);

/* Take page_offline_rwsem for reading; paired with page_offline_thaw(). */
void page_offline_freeze(void)
{
        down_read(&page_offline_rwsem);
}

/* Drop the read side of page_offline_rwsem taken by page_offline_freeze(). */
void page_offline_thaw(void)
{
        up_read(&page_offline_rwsem);
}

/* Take page_offline_rwsem for writing; paired with page_offline_end(). */
void page_offline_begin(void)
{
        down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

/* Drop the write side of page_offline_rwsem taken by page_offline_begin(). */
void page_offline_end(void)
{
        up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);

#ifndef flush_dcache_folio
/*
 * Generic fallback: call flush_dcache_page() on each page of @folio in
 * index order.
 */
void flush_dcache_folio(struct folio *folio)
{
        const long nr = folio_nr_pages(folio);
        long idx = 0;

        while (idx < nr) {
                flush_dcache_page(folio_page(folio, idx));
                idx++;
        }
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif

/**
 * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare()
 * for details. This is the same operation, only with a specific file operations
 * struct which may or may not be the same as vma->vm_file->f_op.
 * @f_op: The file operations whose .mmap_prepare() hook is specified.
 * @file: The file which backs or will back the mapping.
 * @vma: The VMA to apply the .mmap_prepare() hook to.
 * Returns: 0 on success or error.
 */
int __compat_vma_mmap_prepare(const struct file_operations *f_op,
                struct file *file, struct vm_area_struct *vma)
{
        /* Describe the existing VMA in the form the hook expects. */
        struct vm_area_desc desc = {
                .mm = vma->vm_mm,
                .file = file,
                .start = vma->vm_start,
                .end = vma->vm_end,

                .pgoff = vma->vm_pgoff,
                .vm_file = vma->vm_file,
                .vm_flags = vma->vm_flags,
                .page_prot = vma->vm_page_prot,
        };
        int err = f_op->mmap_prepare(&desc);

        /* Only apply the descriptor back to the VMA when the hook succeeded. */
        if (!err)
                set_vma_from_desc(vma, &desc);

        return err;
}
EXPORT_SYMBOL(__compat_vma_mmap_prepare);

/**
 * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
 * existing VMA.
 * @file: The file which possesses an f_op->mmap_prepare() hook.
 * @vma: The VMA to apply the .mmap_prepare() hook to.
 *
 * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
 * stacked filesystems invoke a nested mmap hook of an underlying file.
 *
 * Until all filesystems are converted to use .mmap_prepare(), we must be
 * conservative and continue to invoke these stacked filesystems using the
 * deprecated .mmap() hook.
 *
 * However we have a problem if the underlying file system possesses an
 * .mmap_prepare() hook, as we are in a different context when we invoke the
 * .mmap() hook, already having a VMA to deal with.
 *
 * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
 * establishes a struct vm_area_desc descriptor, passes to the underlying
 * .mmap_prepare() hook and applies any changes performed by it.
 *
 * Once the conversion of filesystems is complete this function will no longer
 * be required and will be removed.
 *
 * Returns: 0 on success or error.
 */
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
{
        /* Use the file's own f_op as the source of the .mmap_prepare() hook. */
        return __compat_vma_mmap_prepare(file->f_op, file, vma);
}
EXPORT_SYMBOL(compat_vma_mmap_prepare);

static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
                         const struct page *page)
{
        /*
         * Only the first page of a high-order buddy page has PageBuddy() set.
         * So we have to check manually whether this page is part of a high-
         * order buddy page.
         */
        if (PageBuddy(page))
                ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
        else if (page_count(page) == 0 && is_free_buddy_page(page))
                ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;

        if (folio_test_idle(folio))
                ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
}

/**
 * snapshot_page() - Create a snapshot of a struct page
 * @ps: Pointer to a struct page_snapshot to store the page snapshot
 * @page: The page to snapshot
 *
 * Create a snapshot of the page and store both its struct page and struct
 * folio representations in @ps.
 *
 * A snapshot is marked as "faithful" if the compound state of @page was
 * stable and allowed safe reconstruction of the folio representation. In
 * rare cases where this is not possible (e.g. due to folio splitting),
 * snapshot_page() falls back to treating @page as a single page and the
 * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
 * helper can be used to check for this condition.
 */
void snapshot_page(struct page_snapshot *ps, const struct page *page)
{
        unsigned long head, nr_pages = 1;
        struct folio *foliop;
        /* Retry budget for the "goto again" path below. */
        int loops = 5;

        ps->pfn = page_to_pfn(page);
        ps->flags = PAGE_SNAPSHOT_FAITHFUL;

again:
        /* Start each attempt from a zeroed folio copy and a fresh page copy. */
        memset(&ps->folio_snapshot, 0, sizeof(struct folio));
        memcpy(&ps->page_snapshot, page, sizeof(*page));
        head = ps->page_snapshot.compound_head;
        if ((head & 1) == 0) {
                /* Bit 0 clear: the copied page is not marked as a tail page. */
                ps->idx = 0;
                foliop = (struct folio *)&ps->page_snapshot;
                if (!folio_test_large(foliop)) {
                        /* Not a large folio: one struct page is the whole snapshot. */
                        set_ps_flags(ps, page_folio(page), page);
                        memcpy(&ps->folio_snapshot, foliop,
                               sizeof(struct page));
                        return;
                }
                /* Large folio with @page as its first page. */
                foliop = (struct folio *)page;
        } else {
                /* Bit 0 set: the remaining bits are the address of the folio. */
                foliop = (struct folio *)(head - 1);
                ps->idx = folio_page_idx(foliop, page);
        }

        if (ps->idx < MAX_FOLIO_NR_PAGES) {
                /* Copy the first two struct pages, then read the size from the copy. */
                memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
                nr_pages = folio_nr_pages(&ps->folio_snapshot);
                if (nr_pages > 1)
                        memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
                               sizeof(struct page));
                set_ps_flags(ps, foliop, page);
        }

        /*
         * The index does not fit the size just read, so the copies are
         * inconsistent: retry while the budget lasts, then fall back to an
         * unfaithful single-page snapshot (flags and idx both reset to 0).
         */
        if (ps->idx > nr_pages) {
                if (loops-- > 0)
                        goto again;
                clear_compound_head(&ps->page_snapshot);
                foliop = (struct folio *)&ps->page_snapshot;
                memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
                ps->flags = 0;
                ps->idx = 0;
        }
}

#ifdef CONFIG_MMU
/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 *
 * This is a simplified variant of folio_pte_batch_flags().
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio in a single VMA and a single page table.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit and soft-dirty bit.
 *
 * ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single VMA and
 * a single page table.
 *
 * Return: the number of table entries in the batch.
 */
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
                unsigned int max_nr)
{
        /* Delegate with a NULL second argument and a flags value of 0. */
        return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
}
#endif /* CONFIG_MMU */

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
/**
 * page_range_contiguous - test whether the page range is contiguous
 * @page: the start of the page range.
 * @nr_pages: the number of pages in the range.
 *
 * Test whether the page range is contiguous, such that they can be iterated
 * naively, corresponding to iterating a contiguous PFN range.
 *
 * This function should primarily only be used for debug checks, or when
 * working with page ranges that are not naturally contiguous (e.g., pages
 * within a folio are).
 *
 * Returns true if contiguous, otherwise false.
 */
bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
{
        const unsigned long first_pfn = page_to_pfn(page);
        const unsigned long limit_pfn = first_pfn + nr_pages;
        unsigned long cur;

        /*
         * The memmap is allocated per memory section, so pages inside the
         * first section need no check. For every further section the range
         * spans, verify once that its first page is where plain pointer
         * arithmetic from @page would land.
         */
        cur = ALIGN(first_pfn, PAGES_PER_SECTION);
        while (cur < limit_pfn) {
                if (unlikely(page + (cur - first_pfn) != pfn_to_page(cur)))
                        return false;
                cur += PAGES_PER_SECTION;
        }

        return true;
}
EXPORT_SYMBOL(page_range_contiguous);
#endif







































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for inet_sock
 *
 * Authors:        Many, reorganised here by
 *                 Arnaldo Carvalho de Melo <acme@mandriva.com>
 */
#ifndef _INET_SOCK_H
#define _INET_SOCK_H

#include <linux/bitops.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/jhash.h>
#include <linux/netdevice.h>

#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/sock.h>
#include <net/request_sock.h>
#include <net/netns/hash.h>
#include <net/tcp_states.h>
#include <net/l3mdev.h>

/** struct ip_options - IP Options
 *
 * @faddr - Saved first hop address
 * @nexthop - Saved nexthop address in LSRR and SSRR
 * @is_strictroute - Strict source route
 * @srr_is_hit - Packet destination addr was our one
 * @is_changed - IP checksum is no longer valid
 * @rr_needaddr - Need to record addr of outgoing dev
 * @ts_needtime - Need to record timestamp
 * @ts_needaddr - Need to record addr of outgoing dev
 */
struct ip_options {
        __be32                faddr;
        __be32                nexthop;
        /*
         * NOTE(review): optlen/srr/rr/ts are not described in the kernel-doc
         * above; presumably the total option length and the positions of the
         * source-route, record-route and timestamp options -- confirm.
         */
        unsigned char        optlen;
        unsigned char        srr;
        unsigned char        rr;
        unsigned char        ts;
        /* Six one-bit flags declared on a single unsigned char. */
        unsigned char        is_strictroute:1,
                        srr_is_hit:1,
                        is_changed:1,
                        rr_needaddr:1,
                        ts_needtime:1,
                        ts_needaddr:1;
        unsigned char        router_alert;
        unsigned char        cipso;
        unsigned char        __pad2;
        /* Flexible array member: variable-length data follows the header. */
        unsigned char        __data[];
};

/*
 * struct ip_options preceded by an rcu_head.
 * NOTE(review): presumably so the options can be freed via RCU -- confirm.
 */
struct ip_options_rcu {
        struct rcu_head rcu;
        struct ip_options opt;
};

/*
 * struct ip_options_rcu immediately followed by a fixed 40-byte buffer.
 * NOTE(review): the buffer presumably provides storage for the flexible
 * __data[] member at the end of opt.opt -- confirm.
 */
struct ip_options_data {
        struct ip_options_rcu        opt;
        char                        data[40];
};

struct inet_request_sock {
        /* Must stay first: inet_rsk() casts a request_sock pointer to this type. */
        struct request_sock        req;
/* Shorthand accessors for fields stored in req.__req_common. */
#define ir_loc_addr                req.__req_common.skc_rcv_saddr
#define ir_rmt_addr                req.__req_common.skc_daddr
#define ir_num                        req.__req_common.skc_num
#define ir_rmt_port                req.__req_common.skc_dport
#define ir_v6_rmt_addr                req.__req_common.skc_v6_daddr
#define ir_v6_loc_addr                req.__req_common.skc_v6_rcv_saddr
#define ir_iif                        req.__req_common.skc_bound_dev_if
#define ir_cookie                req.__req_common.skc_cookie
#define ireq_net                req.__req_common.skc_net
#define ireq_state                req.__req_common.skc_state
#define ireq_family                req.__req_common.skc_family

        /* Two 4-bit window-scale values plus seven one-bit flags (15 bits). */
        u16                        snd_wscale : 4,
                                rcv_wscale : 4,
                                tstamp_ok  : 1,
                                sack_ok           : 1,
                                wscale_ok  : 1,
                                ecn_ok           : 1,
                                acked           : 1,
                                no_srccheck: 1,
                                smc_ok           : 1;
        u32                     ir_mark;
        /* IPv4 options pointer overlaid with the IPv6 option/pktopts pair. */
        union {
                struct ip_options_rcu __rcu        *ireq_opt;
#if IS_ENABLED(CONFIG_IPV6)
                struct {
                        struct ipv6_txoptions        *ipv6_opt;
                        struct sk_buff                *pktopts;
                };
#endif
        };
};

/*
 * Convert a request_sock pointer to its inet_request_sock. This is a plain
 * cast (which also drops const); it relies on req being the first member of
 * struct inet_request_sock.
 */
static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
{
        return (struct inet_request_sock *)sk;
}

/*
 * Pick the mark for a request: the socket's own sk_mark when it is non-zero,
 * otherwise the skb's mark if sysctl_tcp_fwmark_accept is set, otherwise 0.
 */
static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
{
        const u32 sk_mark = READ_ONCE(sk->sk_mark);

        if (sk_mark)
                return sk_mark;

        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept))
                return skb->mark;

        return sk_mark;
}

/*
 * Pick the bound device index for a request: the socket's sk_bound_dev_if
 * when set; otherwise, with CONFIG_NET_L3_MASTER_DEV and
 * sysctl_tcp_l3mdev_accept enabled, the L3 master index looked up from the
 * skb's incoming interface.
 */
static inline int inet_request_bound_dev_if(const struct sock *sk,
                                            struct sk_buff *skb)
{
        int ifindex = READ_ONCE(sk->sk_bound_dev_if);
#ifdef CONFIG_NET_L3_MASTER_DEV
        struct net *net = sock_net(sk);

        if (ifindex)
                return ifindex;

        if (READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
                return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
#endif

        return ifindex;
}

/*
 * Return the L3 master index looked up from the socket's bound device when
 * CONFIG_NET_L3_MASTER_DEV is enabled and sysctl_tcp_l3mdev_accept is off;
 * return 0 in every other case.
 */
static inline int inet_sk_bound_l3mdev(const struct sock *sk)
{
#ifdef CONFIG_NET_L3_MASTER_DEV
        struct net *net = sock_net(sk);

        if (READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
                return 0;

        return l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if);
#else
        return 0;
#endif
}

/*
 * Decide whether a socket's device binding matches a packet.
 *
 * A bound socket matches when its device index equals @dif or @sdif.
 * An unbound socket matches when @sdif is 0 or @l3mdev_accept is true.
 */
static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
                                     int dif, int sdif)
{
        if (bound_dev_if)
                return bound_dev_if == dif || bound_dev_if == sdif;

        return l3mdev_accept || !sdif;
}

/*
 * inet_bound_dev_eq() with l3mdev_accept taken from the namespace's
 * sysctl_tcp_l3mdev_accept, or fixed to true when CONFIG_NET_L3_MASTER_DEV
 * is not enabled.
 */
static inline bool inet_sk_bound_dev_eq(const struct net *net,
                                        int bound_dev_if,
                                        int dif, int sdif)
{
        bool l3mdev_accept = true;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        l3mdev_accept = !!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept);
#endif
        return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif);
}

/*
 * Cork state embedded as the "base" member of struct inet_cork_full.
 * NOTE(review): flags presumably holds the IPCORK_* bits defined later in
 * this header -- confirm.
 */
struct inet_cork {
        unsigned int                flags;
        __be32                        addr;
        struct ip_options        *opt;
        unsigned int                fragsize;
        int                        length; /* Total length of all frames */
        struct dst_entry        *dst;
        u8                        tx_flags;
        __u8                        ttl;
        __s16                        tos;
        u32                        priority;
        __u16                        gso_size;
        u32                        ts_opt_id;
        u64                        transmit_time;
        u32                        mark;
};

/* struct inet_cork together with a struct flowi. */
struct inet_cork_full {
        struct inet_cork        base;
        struct flowi                fl;
};

struct ip_mc_socklist;
struct ipv6_pinfo;
struct rtable;

/** struct inet_sock - representation of INET sockets
 *
 * @sk - ancestor class
 * @pinet6 - pointer to IPv6 control block
 * @inet_daddr - Foreign IPv4 addr
 * @inet_rcv_saddr - Bound local IPv4 addr
 * @inet_dport - Destination port
 * @inet_num - Local port
 * @inet_flags - various atomic flags
 * @inet_saddr - Sending source
 * @uc_ttl - Unicast TTL
 * @inet_sport - Source port
 * @inet_id - ID counter for DF pkts
 * @tos - TOS
 * @mc_ttl - Multicasting TTL
 * @uc_index - Unicast outgoing device index
 * @mc_index - Multicast device index
 * @mc_list - Group array
 * @cork - info to build ip hdr on each ip frag while socket is corked
 */
struct inet_sock {
        /* sk and pinet6 has to be the first two members of inet_sock */
        struct sock                sk;
#if IS_ENABLED(CONFIG_IPV6)
        struct ipv6_pinfo        *pinet6;
#endif
        /* Socket demultiplex comparisons on incoming packets. */
        /* These four names alias fields stored inside sk.__sk_common. */
#define inet_daddr                sk.__sk_common.skc_daddr
#define inet_rcv_saddr                sk.__sk_common.skc_rcv_saddr
#define inet_dport                sk.__sk_common.skc_dport
#define inet_num                sk.__sk_common.skc_num

        /* Bit set indexed by INET_FLAGS_*; see the inet_*_bit() macros. */
        unsigned long                inet_flags;
        __be32                        inet_saddr;
        __s16                        uc_ttl;
        __be16                        inet_sport;
        struct ip_options_rcu __rcu        *inet_opt;
        atomic_t                inet_id;

        __u8                        tos;
        __u8                        min_ttl;
        __u8                        mc_ttl;
        __u8                        pmtudisc;
        __u8                        rcv_tos;
        /* Counter maintained by inet_{inc,dec,get}_convert_csum(). */
        __u8                        convert_csum;
        int                        uc_index;
        int                        mc_index;
        __be32                        mc_addr;
        u32                        local_port_range;        /* high << 16 | low */

        struct ip_mc_socklist __rcu        *mc_list;
        struct inet_cork_full        cork;
};

/* NOTE(review): presumably bit values for inet_cork.flags -- confirm. */
#define IPCORK_OPT                1        /* ip-options has been held in ipcork.opt */
#define IPCORK_TS_OPT_ID        2        /* ts_opt_id field is valid, overriding sk_tskey */

/*
 * Bit numbers within inet_sock::inet_flags, used through the inet_*_bit()
 * macros as INET_FLAGS_<name>. Bits 0-8 (PKTINFO .. RECVFRAGSIZE) are also
 * turned into the IP_CMSG_* masks with BIT().
 */
enum {
        INET_FLAGS_PKTINFO        = 0,
        INET_FLAGS_TTL                = 1,
        INET_FLAGS_TOS                = 2,
        INET_FLAGS_RECVOPTS        = 3,
        INET_FLAGS_RETOPTS        = 4,
        INET_FLAGS_PASSSEC        = 5,
        INET_FLAGS_ORIGDSTADDR        = 6,
        INET_FLAGS_CHECKSUM        = 7,
        INET_FLAGS_RECVFRAGSIZE        = 8,

        INET_FLAGS_RECVERR        = 9,
        INET_FLAGS_RECVERR_RFC4884 = 10,
        INET_FLAGS_FREEBIND        = 11,
        INET_FLAGS_HDRINCL        = 12,
        INET_FLAGS_MC_LOOP        = 13,
        INET_FLAGS_MC_ALL        = 14,
        INET_FLAGS_TRANSPARENT        = 15,
        INET_FLAGS_IS_ICSK        = 16,
        INET_FLAGS_NODEFRAG        = 17,
        INET_FLAGS_BIND_ADDRESS_NO_PORT = 18,
        INET_FLAGS_DEFER_CONNECT = 19,
        INET_FLAGS_MC6_LOOP        = 20,
        INET_FLAGS_RECVERR6_RFC4884 = 21,
        INET_FLAGS_MC6_ALL        = 22,
        INET_FLAGS_AUTOFLOWLABEL_SET = 23,
        INET_FLAGS_AUTOFLOWLABEL = 24,
        INET_FLAGS_DONTFRAG        = 25,
        INET_FLAGS_RECVERR6        = 26,
        INET_FLAGS_REPFLOW        = 27,
        INET_FLAGS_RTALERT_ISOLATE = 28,
        INET_FLAGS_SNDFLOW        = 29,
        INET_FLAGS_RTALERT        = 30,
};

/* cmsg flags for inet: one mask bit per INET_FLAGS_* number 0-8 */
#define IP_CMSG_PKTINFO                BIT(INET_FLAGS_PKTINFO)
#define IP_CMSG_TTL                BIT(INET_FLAGS_TTL)
#define IP_CMSG_TOS                BIT(INET_FLAGS_TOS)
#define IP_CMSG_RECVOPTS        BIT(INET_FLAGS_RECVOPTS)
#define IP_CMSG_RETOPTS                BIT(INET_FLAGS_RETOPTS)
#define IP_CMSG_PASSSEC                BIT(INET_FLAGS_PASSSEC)
#define IP_CMSG_ORIGDSTADDR        BIT(INET_FLAGS_ORIGDSTADDR)
#define IP_CMSG_CHECKSUM        BIT(INET_FLAGS_CHECKSUM)
#define IP_CMSG_RECVFRAGSIZE        BIT(INET_FLAGS_RECVFRAGSIZE)

/* Union of all nine cmsg masks above; used by inet_cmsg_flags(). */
#define IP_CMSG_ALL        (IP_CMSG_PKTINFO | IP_CMSG_TTL |                \
                         IP_CMSG_TOS | IP_CMSG_RECVOPTS |                \
                         IP_CMSG_RETOPTS | IP_CMSG_PASSSEC |                \
                         IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM |        \
                         IP_CMSG_RECVFRAGSIZE)

/* Return the IP_CMSG_* subset of @inet's flag word, read once. */
static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
{
        unsigned long flags = READ_ONCE(inet->inet_flags);

        return flags & IP_CMSG_ALL;
}

static inline dscp_t inet_sk_dscp(const struct inet_sock *inet)
{
        return inet_dsfield_to_dscp(READ_ONCE(inet->tos));
}

/*
 * Bit helpers for inet_sock::inet_flags. @nr is the flag name without the
 * INET_FLAGS_ prefix (e.g. inet_test_bit(HDRINCL, sk)); @sk is converted
 * with inet_sk().
 */
#define inet_test_bit(nr, sk)                        \
        test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_set_bit(nr, sk)                        \
        set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_clear_bit(nr, sk)                        \
        clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_assign_bit(nr, sk, val)                \
        assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)

/**
 * sk_to_full_sk - Access to a full socket
 * @sk: pointer to a socket
 *
 * SYNACK messages might be attached to request sockets.
 * Some places want to reach the listener in this case.
 */
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
        /* A socket in TCP_NEW_SYN_RECV state is replaced by its listener. */
        if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV)
                sk = inet_reqsk(sk)->rsk_listener;

        /* No socket, or one in TCP_TIME_WAIT state, yields NULL. */
        if (!sk || READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
                return NULL;
#endif
        return sk;
}

/* sk_to_full_sk() variant with a const argument */
static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
{
#ifdef CONFIG_INET
        if (!sk)
                return NULL;

        /* A socket in TCP_NEW_SYN_RECV state is replaced by its listener. */
        if (READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) {
                sk = ((const struct request_sock *)sk)->rsk_listener;
                if (!sk)
                        return NULL;
        }

        /* A socket in TCP_TIME_WAIT state yields NULL. */
        if (READ_ONCE(sk->sk_state) == TCP_TIME_WAIT)
                return NULL;
#endif
        return sk;
}

/* sk_to_full_sk() applied to the socket attached to @skb. */
static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        return sk_to_full_sk(sk);
}

/* Get the struct inet_sock that embeds the struct sock @ptr as member sk. */
#define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk)

static inline void __inet_sk_copy_descendant(struct sock *sk_to,
                                             const struct sock *sk_from,
                                             const int ancestor_size)
{
        memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
               sk_from->sk_prot->obj_size - ancestor_size);
}

int inet_sk_rebuild_header(struct sock *sk);

/**
 * inet_sk_state_load - read sk->sk_state for lockless contexts
 * @sk: socket pointer
 *
 * Paired with inet_sk_state_store(). Used in places we don't hold socket lock:
 * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ...
 */
static inline int inet_sk_state_load(const struct sock *sk)
{
        int state;

        /* Acquire load: state changes may be observed by lockless readers. */
        state = smp_load_acquire(&sk->sk_state);

        return state;
}

/**
 * inet_sk_state_store - update sk->sk_state
 * @sk: socket pointer
 * @newstate: new state
 *
 * Paired with inet_sk_state_load(). Should be used in contexts where
 * state change might impact lockless readers.
 */
void inet_sk_state_store(struct sock *sk, int newstate);

void inet_sk_set_state(struct sock *sk, int state);

/*
 * Hash a connection 4-tuple with jhash_3words(): local address, foreign
 * address, and both ports packed into one word (local port in the upper
 * 16 bits), seeded with @initval.
 */
static inline unsigned int __inet_ehashfn(const __be32 laddr,
                                          const __u16 lport,
                                          const __be32 faddr,
                                          const __be16 fport,
                                          u32 initval)
{
        const __u32 ports = ((__u32) lport) << 16 | (__force __u32)fport;

        return jhash_3words((__force __u32) laddr,
                            (__force __u32) faddr,
                            ports,
                            initval);
}

struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
                                      struct sock *sk_listener,
                                      bool attach_listener);

/*
 * Return FLOWI_FLAG_ANYSRC when the socket has either the TRANSPARENT or
 * the HDRINCL flag set, otherwise 0.
 */
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
{
        if (inet_test_bit(TRANSPARENT, sk) || inet_test_bit(HDRINCL, sk))
                return FLOWI_FLAG_ANYSRC;

        return 0;
}

static inline void inet_inc_convert_csum(struct sock *sk)
{
        inet_sk(sk)->convert_csum++;
}

static inline void inet_dec_convert_csum(struct sock *sk)
{
        if (inet_sk(sk)->convert_csum > 0)
                inet_sk(sk)->convert_csum--;
}

static inline bool inet_get_convert_csum(struct sock *sk)
{
        return !!inet_sk(sk)->convert_csum;
}


static inline bool inet_can_nonlocal_bind(struct net *net,
                                          struct inet_sock *inet)
{
        return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) ||
                test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) ||
                test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags);
}

/*
 * Check whether @addr is acceptable for this socket.
 *
 * The address is accepted when non-local binding is permitted (see
 * inet_can_nonlocal_bind()), when it is INADDR_ANY, or when @addr_type
 * is one of RTN_LOCAL, RTN_MULTICAST or RTN_BROADCAST.
 */
static inline bool inet_addr_valid_or_nonlocal(struct net *net,
                                               struct inet_sock *inet,
                                               __be32 addr,
                                               int addr_type)
{
        if (inet_can_nonlocal_bind(net, inet))
                return true;

        if (addr == htonl(INADDR_ANY))
                return true;

        switch (addr_type) {
        case RTN_LOCAL:
        case RTN_MULTICAST:
        case RTN_BROADCAST:
                return true;
        default:
                return false;
        }
}

#endif        /* _INET_SOCK_H */













































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A security context is a set of security attributes
 * associated with each subject and object controlled
 * by the security policy.  Security contexts are
 * externally represented as variable-length strings
 * that can be interpreted by a user or application
 * with an understanding of the security policy.
 * Internally, the security server uses a simple
 * structure.  This structure is private to the
 * security server and can be changed without affecting
 * clients of the security server.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

#ifndef _SS_CONTEXT_H_
#define _SS_CONTEXT_H_

#include "ebitmap.h"
#include "mls_types.h"
#include "security.h"

/*
 * A security context consists of an authenticated user
 * identity, a role, a type and an MLS range.
 *
 * When @len is non-zero the context is compared by its string form
 * (@str) instead of by its user/role/type/range fields; see
 * context_equal().
 */
struct context {
        u32 user; /* user identity value */
        u32 role; /* role value */
        u32 type; /* type value */
        u32 len; /* length of string in bytes */
        struct mls_range range; /* MLS range: level[0] is low, level[1] is high */
        char *str; /* string representation if context cannot be mapped. */
};

/* Zero the MLS range of @c; the other fields of @c are left alone. */
static inline void mls_context_init(struct context *c)
{
        const size_t range_bytes = sizeof(c->range);

        memset(&c->range, 0, range_bytes);
}

/*
 * Copies the MLS range of 'src' into 'dst', level by level.
 *
 * Returns 0 on success or the error reported by ebitmap_cpy().  If the
 * high-level category copy fails, the low-level category bitmap that
 * was already copied into 'dst' is destroyed before returning.
 */
static inline int mls_context_cpy(struct context *dst,
                                  const struct context *src)
{
        int err;

        /* Low level first. */
        dst->range.level[0].sens = src->range.level[0].sens;
        err = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat);
        if (err)
                return err;

        /* Then the high level; undo the low-level copy on failure. */
        dst->range.level[1].sens = src->range.level[1].sens;
        err = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat);
        if (err)
                ebitmap_destroy(&dst->range.level[0].cat);

        return err;
}

/*
 * Sets both levels in the MLS range of 'dst' to the low level of 'src'.
 *
 * Returns 0 on success or the error reported by ebitmap_cpy().  If the
 * second category copy fails, the first copy made into 'dst' is
 * destroyed before returning.
 */
static inline int mls_context_cpy_low(struct context *dst,
                                      const struct context *src)
{
        int err;

        dst->range.level[0].sens = src->range.level[0].sens;
        err = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat);
        if (err)
                return err;

        /* The high level of 'dst' also takes the low level of 'src'. */
        dst->range.level[1].sens = src->range.level[0].sens;
        err = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat);
        if (err)
                ebitmap_destroy(&dst->range.level[0].cat);

        return err;
}

/*
 * Sets both levels in the MLS range of 'dst' to the high level of 'src'.
 *
 * Returns 0 on success or the error reported by ebitmap_cpy().  If the
 * second category copy fails, the first copy made into 'dst' is
 * destroyed before returning.
 */
static inline int mls_context_cpy_high(struct context *dst,
                                       const struct context *src)
{
        int err;

        /* The low level of 'dst' takes the high level of 'src'. */
        dst->range.level[0].sens = src->range.level[1].sens;
        err = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[1].cat);
        if (err)
                return err;

        dst->range.level[1].sens = src->range.level[1].sens;
        err = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat);
        if (err)
                ebitmap_destroy(&dst->range.level[0].cat);

        return err;
}

/*
 * Sets the MLS range of 'dst' to the overlap of the ranges of 'c1' and
 * 'c2': the low sensitivity is the greater of the two lows, the high
 * sensitivity is the lesser of the two highs, and each level's category
 * set is the ebitmap_and() of the corresponding category sets.
 *
 * Returns -EINVAL when the two ranges share no sensitivity, the error
 * from ebitmap_and() when building a category set fails, and 0 on
 * success.
 *
 * If building the high-level category set fails, the low-level category
 * bitmap already built in 'dst' is destroyed, matching what
 * mls_context_cpy() and its siblings do on a partial failure.
 * NOTE(review): whether ebitmap_and() itself leaves a partially built
 * destination behind on failure is not visible here - confirm against
 * ebitmap.c; callers should still destroy 'dst' on error.
 */
static inline int mls_context_glblub(struct context *dst,
                                     const struct context *c1,
                                     const struct context *c2)
{
        struct mls_range *dr = &dst->range;
        const struct mls_range *r1 = &c1->range, *r2 = &c2->range;
        int rc;

        if (r1->level[1].sens < r2->level[0].sens ||
            r2->level[1].sens < r1->level[0].sens)
                /* These ranges have no common sensitivities */
                return -EINVAL;

        /* Take the greatest of the low */
        dr->level[0].sens = max(r1->level[0].sens, r2->level[0].sens);

        /* Take the least of the high */
        dr->level[1].sens = min(r1->level[1].sens, r2->level[1].sens);

        rc = ebitmap_and(&dr->level[0].cat, &r1->level[0].cat,
                         &r2->level[0].cat);
        if (rc)
                return rc;

        rc = ebitmap_and(&dr->level[1].cat, &r1->level[1].cat,
                         &r2->level[1].cat);
        if (rc)
                /* Do not leave a half-built range behind. */
                ebitmap_destroy(&dr->level[0].cat);

        return rc;
}

/*
 * Compares the MLS ranges of two contexts.
 *
 * Returns true only when both levels have the same sensitivity and equal
 * category bitmaps.  The checks run low level first and stop at the
 * first mismatch.
 */
static inline bool mls_context_equal(const struct context *c1,
                                     const struct context *c2)
{
        if (c1->range.level[0].sens != c2->range.level[0].sens)
                return false;

        if (!ebitmap_equal(&c1->range.level[0].cat, &c2->range.level[0].cat))
                return false;

        if (c1->range.level[1].sens != c2->range.level[1].sens)
                return false;

        return ebitmap_equal(&c1->range.level[1].cat, &c2->range.level[1].cat);
}

/*
 * Releases the category bitmaps of both levels of the MLS range of 'c'
 * and then zeroes the range via mls_context_init().
 */
static inline void mls_context_destroy(struct context *c)
{
        unsigned int lvl;

        /* level[0] is the low level, level[1] the high level. */
        for (lvl = 0; lvl < 2; lvl++)
                ebitmap_destroy(&c->range.level[lvl].cat);

        mls_context_init(c);
}

/* Zero every byte of the context structure pointed to by 'c'. */
static inline void context_init(struct context *c)
{
        memset(c, 0, sizeof(struct context));
}

/*
 * Copies 'src' into 'dst'.
 *
 * The user, role and type values are copied directly.  If 'src' carries
 * a string form it is duplicated with kstrdup(GFP_ATOMIC) and the length
 * copied; otherwise 'dst' gets a NULL string and zero length.  The MLS
 * range is copied last with mls_context_cpy().
 *
 * Returns 0 on success, -ENOMEM if the string cannot be duplicated, or
 * the error from mls_context_cpy().  When the range copy fails, the
 * duplicated string is freed and the string and length of 'dst' are
 * reset.
 */
static inline int context_cpy(struct context *dst, const struct context *src)
{
        int err;

        dst->user = src->user;
        dst->role = src->role;
        dst->type = src->type;

        if (!src->str) {
                dst->str = NULL;
                dst->len = 0;
        } else {
                dst->str = kstrdup(src->str, GFP_ATOMIC);
                if (!dst->str)
                        return -ENOMEM;
                dst->len = src->len;
        }

        err = mls_context_cpy(dst, src);
        if (!err)
                return 0;

        /* Range copy failed: give back the duplicated string. */
        kfree(dst->str);
        dst->str = NULL;
        dst->len = 0;
        return err;
}

/*
 * Resets 'c' to an empty context: clears user, role and type, frees the
 * string form and zeroes its length, then tears down the MLS range with
 * mls_context_destroy().
 */
static inline void context_destroy(struct context *c)
{
        c->type = 0;
        c->role = 0;
        c->user = 0;

        kfree(c->str);
        c->str = NULL;
        c->len = 0;

        mls_context_destroy(c);
}

/*
 * Compares two contexts.
 *
 * A non-zero 'len' marks a context that carries a string form.  Two such
 * contexts are equal when their lengths and strings match.  A context
 * with a string form never equals one without.  When neither has one,
 * the user, role, type and MLS range must all match.
 */
static inline bool context_equal(const struct context *c1,
                                 const struct context *c2)
{
        const bool c1_has_str = c1->len != 0;
        const bool c2_has_str = c2->len != 0;

        /* Exactly one side has a string form. */
        if (c1_has_str != c2_has_str)
                return false;

        if (c1_has_str)
                return c1->len == c2->len && strcmp(c1->str, c2->str) == 0;

        return c1->user == c2->user && c1->role == c2->role &&
               c1->type == c2->type && mls_context_equal(c1, c2);
}

u32 context_compute_hash(const struct context *c);

#endif /* _SS_CONTEXT_H_ */

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000003240)=@updsa={0x180, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in6=@loopback, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x20, 0x80}, {@in=@empty, 0x4d2, 0x33}, @in=@initdev={0xac, 0x1e, 0xff, 0x0}, {0x8000, 0x6, 0x9, 0x200, 0x5, 0xa3a7, 0x43, 0x3}, {0x5, 0x342, 0x2, 0x7}, {0x1, 0xb3e1, 0x1}, 0x70bd2a, 0x0, 0xa, 0x0, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-mb\x00'}}}, @algo_comp={0x48, 0x3, {{'lzjh\x00'}}}]}, 0x180}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001280)={0x0, 0x0, &(0x7f0000001240)={&(0x7f00000000c0)=@updsa={0xf0, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@rand_addr=0x64010100, 0x4e22, 0x0, 0x4e20, 0xfffa, 0xa, 0x0, 0xe0, 0x33}, {@in=@multicast1, 0x4d6, 0x3c}, @in=@empty, {0xfff, 0x2, 0x8000000000000000, 0x200000000009, 0x8, 0x2, 0x5, 0xfffffffffffffffd}, {0x7, 0x7, 0x800, 0x8}, {0x0, 0x2}, 0x70bd2a, 0x3503, 0x2, 0x2, 0x5, 0x21}}, 0xf0}, 0x1, 0x0, 0x0, 0x4001}, 0x8044)

      
      geteuid()
sendmsg$nl_xfrm(0xffffffffffffffff, 0x0, 0x0)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(0xffffffffffffffff, 0x0, 0x4000004)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000001f40)={&(0x7f00000005c0)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@mcast2, 0x0, 0x8, 0x0, 0x0, 0xa, 0x0, 0x10, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xa9, 0x0, 0x1, 0x0, 0xffffffffffffffff}, {0x0, 0xa00, 0x40800000000000, 0x800000000000002}}}, 0xb8}}, 0x10)
sendmsg$nl_xfrm(0xffffffffffffffff, 0x0, 0x800)
socket$nl_xfrm(0x10, 0x3, 0x6)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f0000001100)=@migrate={0xa0, 0x21, 0x1, 0x4, 0x0, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in6=@dev={0xfe, 0x80, '\x00', 0x3a}, 0x0, 0x0, 0x0, 0x2, 0xa, 0x0, 0xa0, 0x2e}}, [@migrate={0x50, 0x11, [{@in=@private=0xa010101, @in6=@empty, @in=@broadcast, @in=@dev={0xac, 0x14, 0x14, 0x34}, 0x3c, 0x4, 0x0, 0x2e00, 0x8, 0x2}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm_esn_out(r0, &(0x7f0000000d80)={0x0, 0x0, &(0x7f0000000d40)={&(0x7f0000000ac0)=@newsa_esn_out={0xf0, 0x10, 0x2, 0x70bd2d, 0x25dfdbfc, {{@in6=@private1, @in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, 0x4e23, 0x48, 0x4e21, 0x0, 0x2, 0x80, 0x0, 0x6}, {@in=@multicast2, 0x4d6}, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, {0x0, 0x9, 0x7fffffffffffffff, 0x9, 0x2, 0xfcb1, 0xbf}, {0xfff, 0xffffffffffffffff, 0x2, 0x9}, {0xfffffff8, 0x3a}, 0x70bd2c, 0x34ff, 0x2}}, 0xf0}}, 0x44000814)
setsockopt$netlink_NETLINK_ADD_MEMBERSHIP(r0, 0x10e, 0x1, &(0x7f0000000a80)=0x1, 0x4)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000ac0)=ANY=[@ANYBLOB="8c0300001700010029bd7000fcdbdf2500000000000000000000000000000000000004d33c000000fe88000000000000000000000000000100000000000000000000000000000000ff0100000000000000000000000000014e2100034e2000060200200087000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="e0000001000000000000000000000000ac1414aa0000000000000000000000004e2280004e23fff9020020203b000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="0400860000000000ff0300000000000006000000000000000104000000000000fdffffffffffffff0800000000000000040000000000000002000000000000000800000000000000ffffffffffe4ffff0a000000000000000f000000000000000f000000000000000100020100000000010001001c000000ff03000025bd700084000500fe8000000000000000000000000000bb000004d33200000002000000ff010000000000000000000000000001023540000001070007000000010000000400000000000000000000000000ffff7f000001000004d4330000000a000000fe8000000000000000000000000000310535000004"], 0x38c}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000540)=ANY=[@ANYBLOB="540400001600010225bd7000fbdbdf25fe8000000000000000000000000000aaac1414bb0000000000000000000000004e2301004e210006020080a067000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff010000000000000000000000000001000004d23200000000000000000000000000000000000001000000000000000001000000010000001000000000000000950700000000000003000000000000000600000000000000ff7f0000000000005e000000000000000700000000000000070000000000000000ac23140000000003000000000000000100000005000000fbffffff27bd7000000000000a0001010200000000000000060000000104000034001700060000002dbd7000020000002abd700029bd70000100000001000000080000000900000006000000f43e0000380000000a00100001000000000000008e001400736861332d3338342d67656e657269630000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001002000080000000aa2d25bccfc477bf8ee4e3b23b0a13c703eb67e2f5adba4b32d7ec107b3f26d4dea3683566eface16e04a62b1c7eb937472a56434de600602b15ee961e2d4a241b390000ba0002006c72772d63617374362d61767800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000090030000a322cb3fbd8d768eb01fdf7c04341414c173a50bd12ee88ae13c73140e6dc5485edba0800c0266fe5974f4401e086b03778d05b87d7cd78a2539a83df568a1cdc5adafef7514a26ac114330393a33a95bc7005ebcdd9e8b5ceb2cc626e45a538b09c443665373e203d4089dcd54e533f7e0d000008000b000b000000ac0007"], 0x454}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000300)=@migrate={0xa0, 0x21, 0x1, 0x70bd2b, 0x25dfdbfc, {{@in=@multicast2, @in6=@private1, 0x4e24, 0x4, 0x4e23, 0xd, 0x2, 0x0, 0x0, 0x3a}, 0x6e6bb2}, [@migrate={0x50, 0x11, [{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@rand_addr=0x64010104, @in6=@rand_addr=' \x01\x00', @in=@rand_addr=0x64010100, 0x3c, 0x0, 0x0, 0x0, 0x2, 0x2}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000003c0)=ANY=[@ANYBLOB="4000000012000501"], 0x40}, 0x13, 0x0, 0x0, 0x880}, 0x20040840)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000009c0)={0x0, 0x0, &(0x7f0000000980)={&(0x7f0000000840)=@acquire={0x128, 0x17, 0x301, 0x70bd29, 0x25dfdbff, {{@in=@remote, 0x4d4, 0x2b}, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, {@in6=@private1, @in6=@private1, 0x4e24, 0x100, 0x4e21, 0x54d, 0x2, 0x20, 0x10, 0x2c}, {{@in6=@ipv4={'\x00', '\xff\xff', @private=0xa010101}, @in6=@mcast2, 0x4e20, 0x0, 0x4e22, 0xe, 0x2, 0x80, 0x80, 0x16}, {0xaa, 0x4, 0x5, 0x5, 0x0, 0x83, 0x0, 0x4}, {0x1000, 0x9000000000000, 0x0, 0x10001}, 0x80000001, 0x6e6bbe, 0x0, 0x0, 0x1}, 0x10000, 0x40, 0x6, 0x70bd2d}}, 0x128}, 0x1, 0x0, 0x0, 0x44000}, 0x8000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000400)=@updsa={0x138, 0x1a, 0x1, 0x70bd27, 0x25dfdbfc, {{@in=@loopback, @in6=@local, 0x4e22, 0x0, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in6=@rand_addr=' \x01\x00', 0x4d6, 0x33}, @in6=@private0, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x1, 0xb3e1, 0x10000}, 0x70bd2a, 0x3503, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x4040000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)={0x0, 0x1}}, 0x0)
sendmsg$nl_xfrm(r0, &(0x7f0000000500)={0x0, 0x0, &(0x7f0000000280)={&(0x7f0000000340)=@updpolicy={0xb8, 0x19, 0x309, 0x70bd26, 0x25dfdbfe, {{@in6=@ipv4={'\x00', '\xff\xff', @multicast1}, @in6=@loopback, 0x4e23, 0xf11, 0x4e20, 0x1ff, 0xa, 0x20, 0x80, 0x2b}, {0x6a, 0xfffffffffffffff8, 0x4, 0x8, 0x200000, 0x1, 0x17c, 0xffffffffffffffff}, {0x9, 0xc3ef, 0x4, 0x400}, 0x0, 0x6e6bb6, 0x2, 0x1}}, 0xb8}, 0x1, 0x0, 0x0, 0x880}, 0xc800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000400)={0x0, 0x0, &(0x7f00000003c0)={&(0x7f0000000180)=ANY=[@ANYBLOB="6800000015000100000000000000ff00fe8800000000000000000000800000010000000000000000000000000000000000040000000000000000b10000000000", @ANYRES32, @ANYBLOB='\x00\x00\x00\x00\x00\x00\x00\x00\f'], 0x68}}, 0x0)

      
      sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000001c0)=@updpolicy={0xfc, 0x19, 0x1, 0x70b52a, 0x25dfdbfb, {{@in6=@rand_addr=' \x01\x00', @in6=@private1, 0x4e1e, 0x3, 0x4e21, 0x3ff, 0x2, 0x0, 0x20, 0x1}, {0x5c0, 0x1, 0x7ffe, 0x3, 0xa, 0x6, 0x400, 0x1}, {0x7fffffff, 0xc, 0x7, 0xe}, 0x3, 0x0, 0x2, 0x1, 0x1}, [@tmpl={0x44, 0x5, [{{@in=@rand_addr=0x64010102, 0x4d2, 0x3c}, 0x2, @in=@loopback, 0x0, 0x5, 0x3, 0xbb, 0x5ca, 0x84, 0x10001}]}]}, 0xfc}, 0x1, 0x0, 0x0, 0x4008000}, 0x40040)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000140)=ANY=[@ANYBLOB="500100001a00010027bd7000fedbdf257f000001000000000000000000000000ac14143f000000000000000000000000ce1effff4e2000020a00608021000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff010000000000000000000000000001000004d6320000000000000000000000000000000000000000800000000000000600000000000000090000000000000002000000000000000400000000000000a7a30000000000003f000000000000000000000000000000050000000000000003000000000000000200000000000000070000000000000000000000fdffffff000001002abd7000ff3400000a000100210000000000000048000100636d616300000000000000000000000000000000000000000000000000000000000000005cca88e628104527cd723538fc00000000000000000000000000000000000000050021000200000010000a"], 0x150}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000000c0)=ANY=[@ANYBLOB="3800000012000501000000000000000028001a000a0101020000000020000000000000000a010109302189db45d67a9375ed5109e6b2b5c1a9"], 0x38}}, 0x20040810)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f00000007c0)=@updsa={0x140, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @tfcpad={0x8, 0x16, 0x1}]}, 0x140}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000300)={&(0x7f0000000140)=@allocspi={0xf8, 0x16, 0x1, 0x0, 0x0, {{{@in6=@dev={0xfe, 0x80, '\x00', 0x1e}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0xa0}, {@in6=@dev={0xfe, 0x80, '\x00', 0x36}, 0x0, 0x33}, @in6=@dev={0xfe, 0x80, '\x00', 0xe}, {0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff, 0xffffffffffffffed, 0x3}, {0x0, 0x0, 0x2, 0xfffbfffffffffffe}, {0x0, 0x0, 0x796}, 0x0, 0x0, 0x0, 0x0, 0x2, 0xe55286f1921f74be}, 0x0, 0x1a0b1}}, 0xf8}, 0x1, 0x0, 0x0, 0x40040}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000140)=@updsa={0x150, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0xce1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x80, 0x21}, {@in6=@mcast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x6, 0x9, 0x2, 0x4, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x0, 0xfffffffd, 0x10000}, 0x70bd2a, 0x34ff, 0xa, 0x1, 0x0, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}, @replay_val={0x10, 0xa, {0x70bd2b, 0x70bd25, 0x9}}]}, 0x150}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=ANY=[@ANYBLOB="f80000001600010225bd7000fbdbdf25fe8000000000000000000000000000aaac1414bb0000000000000000000000004e2301004e210006020080a06c000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff010000000000000000000000000001000004d232000000000000000000000000000000000000010000000000000000010000000100000010000000000000009507000000000000030000000000002541af502b2415000600000000000000ff7f0000000000005e000000000000000700000000000000070000000000000000ac23140000000003000000000000000100000005000000fbffffff27bd70000000000002000101020000000000000006"], 0xf8}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=@updpolicy={0xb8, 0x13, 0xcb23c9c9931e99e9, 0x70bd25, 0x0, {{@in6=@empty, @in=@local, 0x0, 0x0, 0x0, 0x0, 0xa, 0x30, 0x80, 0x3c, 0x0, 0xee01}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x802000000000000}, {0xfffffffffffffffe}}}, 0xb8}}, 0x40004)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000000)=ANY=[@ANYBLOB="6501"], 0x188}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
fstat(r0, &(0x7f0000000040))
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f00000004c0)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000040)=ANY=[@ANYBLOB="50000000270001"], 0x50}}, 0x1000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000005c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000000300)=@polexpire={0xc8, 0x1b, 0x1, 0x70bd2c, 0x25dfdbfb, {{{@in6=@ipv4={'\x00', '\xff\xff', @local}, @in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4e22, 0x0, 0x4e23, 0x7346, 0xa, 0x20, 0x20, 0x6c}, {0x26e9, 0x8, 0x3, 0x40, 0xdcd, 0xfffffffffffffeff, 0xffff, 0x8}, {0x6, 0x7, 0x5, 0x9}, 0x8, 0x0, 0x0, 0x1}, 0x1}, [@XFRMA_SA_DIR={0x5, 0x21, 0x1}]}, 0xc8}, 0x1, 0x0, 0x0, 0x4004}, 0x4)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000780)=@newsa={0x144, 0x10, 0x1, 0x9ffffffe, 0x100, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in=@local, 0x1, 0x794, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@private1, 0x4d4, 0x6c}, @in=@loopback, {0xfffffffffffffffc, 0x9, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8, 0x3}, {0xffffffffffffffff, 0x0, 0x1f, 0x1ff}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0x2, 0x1, 0xfd, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @mark={0xc, 0x15, {0x35075a, 0x3}}]}, 0x144}, 0x1, 0x0, 0x0, 0x8801}, 0x10)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000280)={&(0x7f0000000600)=@allocspi={0x100, 0x16, 0x1, 0x70bd25, 0x25dfdbfe, {{{@in6=@private1, @in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4e22, 0x0, 0x4e23, 0x8, 0xa, 0x20, 0x80, 0x0, 0x0, 0xee00}, {@in=@broadcast, 0x4d6, 0x33}, @in=@rand_addr=0x64010100, {0x9, 0x9, 0x8, 0x9, 0xfffffffffffffffb, 0x1741, 0x9, 0x4}, {0x5, 0x2, 0x6bd6f733}, {0x7, 0x7fe4, 0x1}, 0x70bd28, 0x3502, 0xa, 0x3, 0x31, 0x10}, 0x1, 0x9}, [@XFRMA_SA_PCPU={0x8}]}, 0x100}, 0x1, 0x0, 0x0, 0x20048884}, 0x20044890)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000001f40)={&(0x7f00000005c0)=@updpolicy={0xfc, 0x19, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@mcast2, 0x0, 0x8, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xa9, 0x0, 0x1, 0x0, 0xffffffffffffffff}, {0x0, 0xa00, 0x40800000000000, 0x800000000000002}}, [@tmpl={0x44, 0x5, [{{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x0, 0x3c}, 0x0, @in=@loopback, 0x2, 0x4}]}]}, 0xfc}}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f00000004c0)=ANY=[@ANYBLOB="bc000000210001000400000000000000fc020000000000000000000000000001ff02000000000000000000000000000100000000000000020a0000a02e000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="000000000000000050001100fc0000000000000000000000000009017f000001000000000000000000000000fc020000000000000000000000000001e00000010000000000000000000000003c040000020000"], 0xbc}, 0x1, 0x0, 0x0, 0x40000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f00000001c0)=@updsa={0x138, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x2, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x40895}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000680)=@updsa={0x138, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000001c00)=@updpolicy={0xb8, 0x13, 0xcb23c9c9931e99e9, 0x0, 0x0, {{@in=@dev={0xac, 0x14, 0x14, 0x33}, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x10, 0x0, 0x0, 0xee01}, {0x0, 0x0, 0x0, 0x9, 0x0, 0xfffffffffffffffd}, {}, 0x0, 0x0, 0x0, 0x0, 0x0, 0xb9daee93b50574d9}}, 0xb8}}, 0x4000080)

      
      socket$nl_xfrm(0x10, 0x3, 0x6)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000680)=@updsa={0x138, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x895}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000a80)={0x0, 0x0, &(0x7f0000000a40)={&(0x7f0000000000)=@flushsa={0x14, 0x1c, 0x1, 0x70bd2c, 0x25dfdbfb, {0xff}}, 0x14}, 0x1, 0x0, 0x0, 0x81}, 0x8800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000200)=@updsa={0x154, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0xff}, {0x9, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0xa, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @encap={0x1c, 0x4, {0x0, 0x4e22, 0x4e20, @in=@initdev={0xac, 0x1e, 0x0, 0x0}}}]}, 0x154}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000480)=@migrate={0xa0, 0x21, 0x1, 0x70bd2d, 0x25dfdc00, {{@in6=@local, @in=@multicast2, 0x4e22, 0x1, 0x4e20, 0x7, 0x0, 0x1e0, 0xa0, 0x2f}, 0x6e6bc0}, [@migrate={0x50, 0x11, [{@in6=@local, @in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in=@dev={0xac, 0x14, 0x14, 0x20}, @in=@rand_addr=0x64010101, 0x2b, 0x3, 0x0, 0x350b, 0xa, 0xa}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000ac0)=ANY=[@ANYBLOB="8c0300001700010029bd7000fcdbdf2500000000000000000000000000000000000004d33c000000fe88000000000000000000000000000100000000000000000000000000000000ff0100000000000000000000000000014e2100034e2000060200200087000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="e0000001000000000000000000000000ac1414aa0000000000000000000000004e2280004e23fff9020020203b0000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="0400860000000000ff0300000000000006000000000000000104000000000000fdffffffffffffff0800000000000000040000000000000002000000000000000800000000000000ffffffffffe4ffff0a000000000000000f000000000000000f00000000000000010002"], 0x38c}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000600)={0x0, 0x0, &(0x7f00000005c0)={&(0x7f0000000240)=@newae={0x50, 0x1e, 0x519, 0x70bd29, 0x25dfdbfb, {{@in6=@mcast1, 0x4d3, 0xa, 0x6c}, @in=@loopback, 0x4, 0x3500}, [@replay_val={0x10, 0xa, {0x70bd29, 0x70bd29, 0x5}}]}, 0x50}, 0x1, 0x0, 0x0, 0x2000}, 0x80)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000500)={&(0x7f00000001c0)=ANY=[@ANYBLOB="3c010000100033060000000000000000ffffffff000000000000000000000000fe8000000000000000000000000000aa00000000000000000200000062000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="000000000000000000000000000000010000000232000000fe880000000000000000000000000001000000000000009aaf0a3f012d94f600000000000000000000000000000000000000000000000000fdffffffdfffff070000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000020000000a000000000000004c001400726d64313630"], 0x13c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
fstat(r0, &(0x7f00000088c0)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setuid(r1)
r2 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r2, &(0x7f0000001400)={0x0, 0x0, &(0x7f00000013c0)={&(0x7f0000000f40)=@allocspi={0xf8, 0x16, 0x17, 0x70bd25, 0x25dfdbfe, {{{@in=@loopback, @in=@remote, 0x4e21, 0xd21, 0x4e24, 0x3, 0x2, 0x0, 0x20, 0x2b}, {@in=@multicast2, 0x4d3, 0x33}, @in6=@dev={0xfe, 0x80, '\x00', 0x14}, {0x800, 0xe34, 0x3, 0x1, 0x800000000000000, 0xf4, 0xa, 0xffffffff}, {0x3, 0x1ff, 0xfffffffffffffff8, 0x4}, {0x9, 0x1, 0x7}, 0x70bd25, 0x3505, 0x2, 0x2, 0x8, 0xa0}, 0x411, 0x58a}}, 0xf8}, 0x1, 0x0, 0x0, 0x20014}, 0x20008000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000003c0)={0x0, 0x0, &(0x7f0000000380)={&(0x7f0000000600)=@delpolicy={0x50, 0x14, 0x1, 0x70bd26, 0x25dfdbfd, {{@in6=@loopback, @in6=@private1, 0x4e20, 0x1f5, 0x4e24, 0x0, 0xa, 0xc0, 0x20, 0x2c}, 0x6e6bb5, 0x2}}, 0x50}, 0x1, 0x0, 0x0, 0x820}, 0x4cc00)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000500)={&(0x7f0000000680)=@newsa={0x170, 0x10, 0x633, 0x0, 0x0, {{@in6=@private0, @in=@broadcast, 0x0, 0x4000, 0x0, 0x8001, 0x0, 0x20}, {@in=@dev, 0x0, 0x32}, @in6=@dev={0xfe, 0x80, '\x00', 0xb}, {0x323}, {0x0, 0x6, 0x1}, {0x0, 0x8}, 0x70bd29, 0x0, 0xa, 0x1}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}, @encap={0x1c, 0x20, {0x2, 0x4e22, 0x1, @in6=@empty}}, @encap={0x1c, 0x4, {0x2, 0x4e21, 0x0, @in6=@private2={0xfc, 0x2, '\x00', 0x1}}}]}, 0x170}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000001f40)={&(0x7f00000004c0)=@updpolicy={0xfc, 0x19, 0x1, 0x0, 0x0, {{@in=@multicast2, @in=@local, 0x0, 0x5, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xa9, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xfffffffffffffbff, 0x40800000000000, 0x800000000000000}}, [@tmpl={0x44, 0x5, [{{@in6=@dev={0xfe, 0x80, '\x00', 0x2e}, 0x0, 0x3c}, 0x0, @in=@broadcast, 0x0, 0x0, 0x3}]}]}, 0xfc}}, 0x20000000)
sendmsg$nl_xfrm(r0, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f00000005c0)=@migrate={0xcc, 0x21, 0x1, 0x70bd27, 0xfffffffe, {{@in6=@private2, @in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0xdffc, 0x0, 0x0, 0x2, 0xa, 0xe0}, 0x2}, [@migrate={0x50, 0x11, [{@in6=@dev={0xfe, 0x80, '\x00', 0x11}, @in=@local, @in=@local, @in6=@remote, 0x3c, 0x0, 0x0, 0x0, 0xa, 0xa}]}, @user_kmaddress={0x2c, 0x13, {@in6=@dev={0xfe, 0x80, '\x00', 0x2b}, @in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, 0x0, 0xa}}]}, 0xcc}, 0x1, 0x0, 0x0, 0x800}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000880)=@allocspi={0x104, 0x16, 0x401, 0x0, 0x0, {{{@in=@local, @in6=@mcast1, 0x0, 0xfd1, 0x4, 0x2, 0x2, 0x0, 0x0, 0xc}, {@in6=@private2, 0x0, 0x33}, @in6=@loopback, {0x0, 0x0, 0x0, 0x0, 0x8000000000000000, 0x80000001}, {0x101, 0x0, 0x0, 0x800}, {0x4, 0x6, 0x400}, 0x0, 0x0, 0xa, 0x0, 0x0, 0x94}, 0x0, 0x4ad}, [@sec_ctx={0xc, 0x8, {0x8, 0x8, 0x1, 0xb2}}]}, 0x104}, 0x1, 0x0, 0x0, 0x20040080}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000500)=ANY=[@ANYBLOB="3801000018000100feffffff0001000000000000000000000000ffffe0000002fc0100000000000000000000000000010001071c4e230005000000003a000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff020000000000000000000000000001000004d46c000000ac14142500000000000000000000000000000000000000009201000000000000a39b000000000000ffff0000000000001c250800000000000500000000000000fcffffffffffffff0000000000000000ffffffffffffffff00000000000000001f00000000000000fefffffffffffffffafffffffcffffff00000000800000000035000002"], 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000640)=ANY=[@ANYBLOB="440000002400150327bd7000fbdbdf259a0d00000600040000000000060003"], 0x44}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000500)={&(0x7f0000000000)=@newsa={0x154, 0x10, 0x633, 0x0, 0x0, {{@in=@initdev={0xac, 0x1e, 0x0, 0x0}, @in=@broadcast}, {@in=@private, 0x0, 0x32}, @in6=@rand_addr=' \x01\x00', {}, {}, {}, 0x0, 0x0, 0x2}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}, @encap={0x1c, 0x8, {0x0, 0x0, 0x0, @in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02'}}]}, 0x154}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, 0x0, 0x4004)
sendmsg$nl_xfrm(r0, &(0x7f00000002c0)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000000)=@updpolicy={0xb8, 0x14, 0x1, 0x0, 0x800, {{@in=@multicast1=0xe0000002, @in, 0x0, 0x800, 0x1000, 0x0, 0xa, 0x20}, {}, {0x1, 0x6, 0x0, 0xfffffffffffffffe}, 0x0, 0x6e6bbc}}, 0xb8}, 0x1, 0x0, 0x0, 0x404c830}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000680)=@newsa={0x150, 0x10, 0x713, 0x0, 0x25dfdbfc, {{@in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', @in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', 0x0, 0x0, 0x4e22, 0x2, 0xa, 0x0, 0x0, 0x6c, 0x0, 0xee00}, {@in6=@private1, 0xfe, 0x32}, @in6=@ipv4={'\x00', '\xff\xff', @remote}, {0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x5, 0x546}, {0x7, 0x7fffffffffffffff}, {}, 0x70bd2d, 0x3500, 0xa, 0x1, 0x0, 0x50}, [@algo_aead={0x60, 0x12, {{'rfc4106(gcm(aes))\x00'}, 0xa0, 0x60, "210466d38547aa140db90100000000c54222cb7a"}}]}, 0x150}}, 0x0)

      
      sendmsg$nl_xfrm_esn_in(0xffffffffffffffff, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000280)={&(0x7f0000000180)=@newsa_esn_in={0xf0, 0x10, 0x0, 0x70bd26, 0x25dfdbfb, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in6=@private2, 0x4e21, 0x5e9, 0x4e21, 0x5, 0x2, 0x20, 0xc0, 0x73}, {@in=@remote, 0x4d4}, @in=@broadcast, {0x8, 0x7, 0x7, 0x4445, 0x8, 0x2, 0xffffffffffffffb6, 0x9}, {0x779, 0x9d09, 0x5, 0xe000000000000000}, {0x9, 0x6, 0x8}, 0x70bd27, 0x3506, 0xa, 0x5}}, 0xf0}, 0x1, 0x0, 0x0, 0x7a3368f0fdc56bd}, 0x41)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000100)=ANY=[@ANYBLOB="6c0100001000130700000000fcdbdf25e0000001000000000000000000000000ff02000000000000000000000000000100040b6e4e2100020000008021000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="7f000001000000000000000000000000000004d432000000000000000000000000000000000000000000000000000000040000000000000001000000000000000100000000000080fffffffffeffffff1000000000000000018000000000000043050000000000000400000000000000ffffffffffffff7f0b00000000000000fdffffffffffffff090000000e000000000000002cbd7000003500000a000000980000000000000060001200726663343130362867636d2861657329290000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000a00000004000000066d38547aa140d000000000000000086cae9fb4f1c0017"], 0x16c}, 0x1, 0x0, 0x0, 0x880}, 0x2094)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000000040)=ANY=[@ANYBLOB="1c0000001d00010000000000000000000a00100009"], 0x1c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f000014f000)={0x0, 0x0, &(0x7f00000bfff0)={&(0x7f0000006440)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x0, {{@in6=@mcast1, @in=@multicast1, 0x0, 0x0, 0x0, 0x0, 0xa, 0x60}}}, 0xb8}}, 0x0)
sendmsg$nl_xfrm(r0, 0x0, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000380)={&(0x7f0000000500)=@newsa={0x13c, 0x10, 0x713, 0x0, 0x0, {{@in6=@local, @in6=@local}, {@in6=@private1, 0x0, 0x33}, @in6=@local, {}, {0x0, 0x8}, {}, 0x0, 0x0, 0x2}, [@algo_auth_trunc={0x4c, 0x14, {{'cmac(aes)\x00'}}}]}, 0x13c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000003c0)={0x0, 0x0, &(0x7f0000000380)={&(0x7f0000000600)=@delpolicy={0x5c, 0x14, 0x1, 0x70bd26, 0x25dfdbfd, {{@in6=@loopback, @in6=@private1, 0x4e20, 0x1f5, 0x4e24, 0x0, 0xa, 0xc0, 0x20, 0x2c}, 0x6e6bb5, 0x2}, [@mark={0xc, 0x15, {0x35075c, 0x800}}]}, 0x5c}, 0x1, 0x0, 0x0, 0x820}, 0x4cc00)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000000)=@updsa={0x140, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in=@broadcast, 0x4e22, 0xffef, 0x4e30, 0x2, 0xa, 0x60, 0x80}, {@in=@multicast1, 0x4d6, 0x32}, @in6=@private0, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x9da}, {0x5, 0x0, 0x2, 0x7}, {0x1, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-arm64-neon\x00'}}}, @XFRMA_IPTFS_REORDER_WINDOW={0x6, 0x25, 0x4}]}, 0x140}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000840)=@updsa={0x140, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0x4e1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in=@multicast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x6, 0x9, 0x2, 0x80005, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x1, 0x0, 0x10000}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}]}, 0x140}, 0x1, 0x0, 0x0, 0x4040041}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000000600)=@updsa={0xf0, 0x1a, 0x1, 0x70bd26, 0x0, {{@in=@private=0xa010101, @in=@empty}, {@in=@multicast1, 0x0, 0x33}, @in6=@rand_addr=' \x01\x00', {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4}, {0x0, 0x4000000000}, {}, 0x0, 0x20000, 0x2}}, 0xf0}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f00000002c0)=ANY=[@ANYBLOB="5c0100001a00010027bd7000fedbdf257f000001000000000000000000000000ac14143f000000000000000000000000ce1effff4e2000020a00608021000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff010000000000000000000000000001000004d6320000000000000000000000000000000000000000800000000000000600000000000000090000000000000002000000000000000400000000000000a7a30000000000003f000000000000000000000000000000050000000000000003000000000000000200000000000000070000000000000000000000fdffffff000001002abd7000ff3400000a000100210000000000000048000100636d61632861657329000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000005002100020000001c001700000000002bbd70bd70"], 0x15c}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f000014f000)={0x0, 0x0, &(0x7f0000bd7000)={&(0x7f0000c07e98)=ANY=[], 0x4}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000e40)=ANY=[@ANYBLOB="0c0300001a00010027bd7000fddbdf257f000001000000000000000000000000000000000000000000000000000000014e22ffef4e2000020a00208000000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="e0000001000000000000000000000000000004d232000000ac1e000100000000000000000000000000800000000000000600000000000000090000000000000002000000000000000500000000000000a7a30000000000003f000000000000000300000000000000050000000000000000000000000000000200000000000000070000000000000004000000e1b30000060000002abd7000033500000a0000012100000000000000480001007368613531322d7373736533000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800180005"], 0x30c}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000580)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000000)=@updpolicy={0xb8, 0x13, 0xcb23c9c9931e99e9, 0x0, 0x0, {{@in6=@private0, @in6=@ipv4={'\x00', '\xff\xff', @loopback}, 0x0, 0x0, 0x0, 0x0, 0xa, 0xa0, 0x0, 0x0, 0x0, 0xee01}, {0x0, 0x0, 0xaa3, 0xfffffffffffffff8}, {0x0, 0x8}}}, 0xb8}, 0x1, 0x0, 0x0, 0x800}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000001c0)=@newae={0x48, 0x1e, 0x1, 0x70bd2c, 0x25dfdbfd, {{@in=@private=0xa010100, 0x4d4, 0xa, 0x32}, @in=@multicast2, 0x3, 0x3503}, [@replay_thresh={0x8, 0xb, 0x10}]}, 0x48}}, 0x20000800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000300)=@updsa={0x148, 0x1a, 0x1, 0x70bd2b, 0x25dfdbfd, {{@in=@loopback, @in=@private=0xa010102, 0x4e22, 0xffef, 0x4e30, 0x2, 0xa, 0x60, 0x80}, {@in6=@local, 0x4d6, 0x32}, @in6=@private0, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x9da}, {0x5, 0x0, 0x2, 0x7}, {0x1, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-arm64-neon\x00'}}}, @tfcpad={0x8, 0x16, 0x7}, @XFRMA_IPTFS_DROP_TIME={0x8, 0x24, 0x10}]}, 0x148}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000000)=@updpolicy={0xc0, 0x19, 0x1, 0x70bd2c, 0x25dfdbfb, {{@in=@remote, @in=@remote, 0x4e22, 0x3, 0x4e22, 0x3ff, 0x2, 0x0, 0x20, 0x11}, {0x7fffffff, 0x7f, 0x8002, 0x3, 0x7fff, 0x100000004, 0x2, 0x8}, {0x56d, 0x40c, 0x7, 0x1000}, 0x401, 0x0, 0x0, 0x0, 0x0, 0x3}, [@XFRMA_IF_ID={0x8, 0x1f, 0x4}]}, 0xc0}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001740)={0x0, 0x0, &(0x7f0000001700)={&(0x7f00000015c0)=@newspdinfo={0x1c, 0x24, 0x125, 0x0, 0x0, 0x0, [@XFRMA_SPD_IPV4_HTHRESH]}, 0x1c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000001f40)={&(0x7f00000005c0)=@updpolicy={0xfc, 0x19, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@mcast2, 0x0, 0x8, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xa9, 0x0, 0x1, 0x0, 0xffffffffffffffff}, {0x0, 0xa00, 0x40800000000000, 0x800000000000002}}, [@tmpl={0x44, 0x5, [{{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x0, 0x3c}, 0x0, @in=@loopback, 0x2, 0x4}]}]}, 0xfc}}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f00000004c0)=@migrate={0xa0, 0x21, 0x1, 0x4, 0x0, {{@in=@remote, @in6=@mcast2, 0x0, 0x0, 0x0, 0x2, 0xa, 0x0, 0xa0, 0x2e}}, [@migrate={0x50, 0x11, [{@in=@rand_addr=0x64010101, @in6=@dev={0xfe, 0x80, '\x00', 0x10}, @in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in=@multicast1, 0x3c, 0x4, 0x0, 0x0, 0x2, 0xa}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@newsa={0x154, 0x10, 0x1, 0x0, 0x25dfdbfc, {{@in6=@remote, @in6=@loopback}, {@in=@empty, 0x0, 0x32}, @in6=@loopback, {0x0, 0x0, 0x0, 0x0, 0x6, 0x9}, {0x0, 0x6}, {0x0, 0x0, 0x4}, 0x0, 0x0, 0xa, 0x0, 0x0, 0xad}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}, @replay_esn_val={0x1c, 0x17, {0x0, 0x0, 0x0, 0x0, 0x0, 0x2}}]}, 0x154}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000300)=@updsa={0x140, 0x1a, 0x1, 0x70bd2b, 0x25dfdbfd, {{@in=@loopback, @in=@private=0xa010102, 0x4e22, 0xffef, 0x4e30, 0x2, 0xa, 0x60, 0x80}, {@in6=@local, 0x4d6, 0x32}, @in6=@private0, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x9da}, {0x5, 0x0, 0x2, 0x7}, {0x1, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-arm64-neon\x00'}}}, @tfcpad={0x8, 0x16, 0x7}]}, 0x140}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000005c0)={0x0, 0x0, &(0x7f0000000580)={&(0x7f0000000480)=@polexpire={0xcc, 0x1b, 0x1, 0x70bd2c, 0x25dfdbfb, {{{@in6=@ipv4={'\x00', '\xff\xff', @local}, @in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4e22, 0x0, 0x4e23, 0x7346, 0xa, 0x20, 0x20, 0x6c}, {0x26e9, 0x8, 0x3, 0x40, 0xdcd, 0x7, 0xffff, 0x8}, {0x2, 0x7, 0x5, 0x9}, 0x8, 0x1, 0x1, 0x1}, 0x1}, [@mark={0xc, 0x15, {0x35075b, 0xfffff887}}]}, 0xcc}, 0x1, 0x0, 0x0, 0x4004}, 0x4)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000400)={0x0, 0x0, &(0x7f00000003c0)={&(0x7f0000000180)=ANY=[@ANYBLOB="6800000015000100000000000000ff00fe8800000000000000000000800000010000000000000000000000000000000000040000000000000000b10000000000", @ANYRES32, @ANYRES32, @ANYBLOB='\x00\x00\x00\x00\x00\x00\x00\x00\f\x00\b'], 0x68}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000040)=@migrate={0xa0, 0x21, 0x1, 0x70bd2c, 0x25dfdbfd, {{@in=@multicast2, @in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, 0x4e24, 0x4, 0x4e23, 0xd, 0xa, 0x0, 0x0, 0xff}, 0x6e6bb2, 0x1}, [@migrate={0x50, 0x11, [{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in=@rand_addr=0x64010100, @in=@private=0xa010100, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x32, 0x2, 0x0, 0x3506, 0x4d21fe76179a5365, 0x2}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x85}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f00000002c0)=ANY=[@ANYBLOB="5c0100001a00010027bd7000fedbdf257f000001000000000000000000000000ac14143f000000000000000000000000ce1effff4e2000020a00608021000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff010000000000000000000000000001000004d6320000000000000000000000000000000000000000800000000000000600000000000000090000000000000002000000000000000400000000000000a7a30000000000003f000000000000000000000000000000050000000000000003000000000000000200000000000000070000000000000000000000fdffffff000001002abd7000ff3400000a000100210000000000000048000100636d61632861657329000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000005002100020000001c0017"], 0x15c}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000005c0)={0x0, 0x0, &(0x7f0000000580)={&(0x7f0000000480)=@polexpire={0xc0, 0x1b, 0x1, 0x70bd2c, 0x25dfdbfb, {{{@in=@local, @in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4e22, 0x0, 0x4e23, 0x7346, 0x2, 0x20, 0x0, 0x6c}, {0x26e9, 0x8, 0x3, 0x3d, 0xdcd, 0x7, 0xffff, 0x8}, {0x2, 0x7, 0x5, 0x9}, 0x8, 0x0, 0x1, 0x1}, 0x1}}, 0xc0}, 0x1, 0x0, 0x0, 0x40001}, 0x4)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000001c0)=@migrate={0xac, 0x21, 0x1, 0x70bd2c, 0x25dfdbfd, {{@in=@multicast2, @in6=@remote, 0x4e24, 0x4, 0x4e23, 0xd, 0x2}, 0x6e6bb2}, [@migrate={0x50, 0x11, [{@in6=@dev={0xfe, 0x80, '\x00', 0x21}, @in=@rand_addr=0x64010100, @in=@loopback, @in6=@ipv4={'\x00', '\xff\xff', @multicast2}, 0x32, 0x1, 0x0, 0x3501, 0x4d21fe76179a5365, 0x2}]}, @policy_type={0xa, 0x10, {0xe52f06e6cbbf34bb}}]}, 0xac}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000440)=@allocspi={0x104, 0x16, 0x201, 0x70bd25, 0x25dfdbfb, {{{@in6=@local, @in=@remote, 0x4e23, 0x100, 0x4e21, 0x6, 0x2, 0x80, 0xa0, 0x6c}, {@in6=@mcast1, 0x4d2, 0x32}, @in6=@loopback, {0x0, 0x100000001, 0x10, 0x795, 0x3, 0x6, 0x7fff, 0x5e}, {0x7, 0x7, 0x1423ac00, 0x3}, {0x1, 0x5, 0xfffffffb}, 0x70bd27, 0x0, 0x2, 0x1, 0x1, 0x2}, 0x6, 0x401}, [@mark={0xc, 0x15, {0x350759, 0x80}}]}, 0x104}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f00000002c0)=@flushpolicy={0x10, 0x1d, 0x1, 0x70bd2a, 0x25dfdbfd}, 0x10}, 0x1, 0x0, 0x0, 0x4000020}, 0x4000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000005c0)={0x0, 0x0, &(0x7f0000000580)={&(0x7f0000000000)=@polexpire={0xcc, 0x1b, 0x1, 0x70bd2c, 0x25dfdbfb, {{{@in6=@ipv4={'\x00', '\xff\xff', @local}, @in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4e22, 0x0, 0x4e23, 0x7346, 0xa, 0x20, 0x20, 0x84}, {0x26e9, 0x8, 0x3, 0x40, 0xdcd, 0x7, 0xffff, 0x8}, {0x2, 0x7, 0x5, 0x9}, 0x8, 0x0, 0x1, 0x1}, 0x1}, [@sec_ctx={0xc, 0x8, {0x8, 0x8, 0x0, 0x30}}]}, 0xcc}, 0x1, 0x0, 0x0, 0x4004}, 0x4)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000480)=@updsa={0x164, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in6=@loopback, 0x4e22, 0xffff, 0x4e22, 0x2, 0xa, 0x20, 0x80}, {@in=@multicast1, 0x4d2, 0x32}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, {0x8000, 0x6, 0x9, 0x6, 0x5, 0xa3a7, 0x3f, 0x3}, {0x5, 0x0, 0x2, 0x7}, {0x4, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x0, 0x1, 0x21}, [@encap={0x1c, 0x4, {0x2, 0x4e24, 0x4e21, @in=@rand_addr=0x64010102}}, @XFRMA_MTIMER_THRESH={0x8, 0x20, 0x1}, @algo_auth={0x48, 0x1, {{'ghash-clmulni\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}]}, 0x164}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000480)=@migrate={0xa0, 0x21, 0x1, 0x70bd2d, 0x25dfdbfe, {{@in=@multicast1, @in=@multicast2, 0x4e22, 0x1, 0x4e20, 0x7, 0xa, 0x1e0, 0xa0, 0x3a}, 0x6e6bc0}, [@migrate={0x50, 0x11, [{@in=@private=0xa010101, @in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in=@dev={0xac, 0x14, 0x14, 0x20}, @in=@rand_addr=0x64010101, 0x2b, 0x3, 0x0, 0x350b, 0xa, 0xa}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000880)=@updsa={0x180, 0x1a, 0x1, 0x70bd23, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e1f, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in=@multicast1, 0x4d2, 0x6c}, @in=@multicast1, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0xff}, {0x9, 0xffffffffffffffff, 0xc, 0x4}, {0x1, 0x7, 0x1533c}, 0x70bd27, 0x0, 0xa, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @algo_auth={0x48, 0x1, {{'rmd128-generic\x00'}}}]}, 0x180}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000280)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000100)=@getpolicy={0x5c, 0x15, 0x801, 0x70bd2a, 0x25dfdbfd, {{@in6=@dev={0xfe, 0x80, '\x00', 0x37}, @in6=@private2, 0x4e22, 0x0, 0x4e20, 0x2, 0x2, 0x20, 0x60, 0xff}, 0x0, 0x1}, [@sec_ctx={0xc, 0x8, {0x8, 0x8, 0x0, 0x9}}]}, 0x5c}, 0x1, 0x0, 0x0, 0x80}, 0x14044000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={0x0}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      socket$nl_xfrm(0x10, 0x3, 0x6)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000480)=ANY=[@ANYBLOB="a0000000210001002dbd7000fedbdf2500000000000000000000000000000000e00000020000000000000000000000004e2200014e2000070a00e0a03b000000", @ANYRES32=r0, @ANYRES32=0x0, @ANYBLOB="c36b6e0000000000500011"], 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f000014f000)={0x0, 0x0, &(0x7f00000bfff0)={&(0x7f0000006440)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x0, {{@in6=@mcast1, @in=@multicast1, 0x0, 0x0, 0x0, 0x0, 0xa, 0x760}}}, 0xb8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f00000bfff0)={&(0x7f0000000240)=@acquire={0x134, 0x17, 0x1, 0x0, 0x0, {{@in6=@ipv4}, @in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, {@in, @in6=@local}, {{@in=@multicast1, @in6=@ipv4={'\x00', '\xff\xff', @remote}, 0x0, 0x0, 0x0, 0x0, 0xa}}}, [@sec_ctx={0xc, 0x8, {0x8, 0x8, 0x1, 0x1, 0x300}}]}, 0x134}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000000)=@updsa={0x140, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in6=@loopback, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x20, 0x80}, {@in=@multicast1, 0x4d2, 0x33}, @in=@initdev={0xac, 0x1e, 0xff, 0x0}, {0x8000, 0x6, 0x9, 0x200, 0x5, 0xa3a7, 0x43, 0x3}, {0x5, 0x342, 0x2, 0x7}, {0x1, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x0, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-mb\x00'}}}, @tfcpad={0x8, 0x16, 0x3}]}, 0x140}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000140)=@getsadinfo={0x14, 0x23, 0x1, 0x70bd28, 0x25dfdbfe}, 0x14}, 0x1, 0x0, 0x0, 0x40}, 0x26008000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000003c0)=ANY=[@ANYBLOB="400000001200050104000000ffdbdf2528001a00ac1414aa0000000000000000000000000a0101000008000000000000000000000200061005001900"], 0x40}, 0x13, 0x0, 0x0, 0x880}, 0x20040840)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000880)=@updsa={0x148, 0x1a, 0x1, 0x70bd2d, 0x25dfdbfd, {{@in=@remote, @in6=@empty, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x20, 0x80}, {@in=@initdev={0xac, 0x1e, 0x0, 0x0}, 0x4d2, 0x32}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, {0x8003, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x800000000000003f, 0x3}, {0x4, 0x0, 0x2, 0x7}, {0x4, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x4, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-mb\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x1}, @extra_flags={0x8, 0x18, 0x1}]}, 0x148}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000000c0)=@migrate={0xa0, 0x21, 0x1, 0x70bd2b, 0x25dfdbfd, {{@in=@empty, @in=@dev={0xac, 0x14, 0x14, 0xb}, 0x4e20, 0x0, 0x4e22, 0x7ff, 0x2, 0x80, 0x80, 0x11}, 0x0, 0x1}, [@migrate={0x50, 0x11, [{@in6=@private1={0xfc, 0x1, '\x00', 0x1}, @in6=@mcast2, @in=@empty, @in=@rand_addr=0x64010101, 0x6c, 0x0, 0x0, 0x3502, 0x2, 0xa}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x20000016}, 0x4000)

      
      socket$nl_xfrm(0x10, 0x3, 0x6)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000800)=@updsa={0x140, 0x1a, 0x1, 0x70bd65, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x0, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d5, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0xc}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x1}]}, 0x140}, 0x1, 0x0, 0x0, 0x895}, 0x0)
sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000001f40)={&(0x7f00000005c0)=@updpolicy={0xfc, 0x19, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@mcast2, 0x0, 0x8, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xa9, 0x0, 0x1, 0x0, 0xffffffffffffffff}, {0x0, 0xa00, 0x40800000000000, 0x800000000000002}}, [@tmpl={0x44, 0x5, [{{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x0, 0x3c}, 0x0, @in=@loopback, 0x2, 0x4}]}]}, 0xfc}}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f00000004c0)=ANY=[@ANYBLOB="bc000000210001000400000000000000fc020000000000000000000000000001ff02000000000000000000000000000100000000000000020a0000a02e000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="000000000000000050001100fc0000000040000000000000000000017f000001000000000000000000000000fc020000000000000000000000000001e00000010000000000000000000000003c040000000000000a000a001c000400ffff4e244e2400000a01010200"/116], 0xbc}, 0x1, 0x0, 0x0, 0x40000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000002c0)={0x0, 0x0, &(0x7f0000001fc0)={&(0x7f0000003280)=@migrate={0x138, 0x21, 0x217, 0x70bd26, 0x25dfdbfb, {{@in=@multicast1, @in=@broadcast, 0x4e24, 0x1, 0x4e24, 0x3, 0x2, 0x20, 0x80}, 0x6e6bbd}, [@migrate={0xe8, 0x11, [{@in=@multicast2, @in6=@mcast1, @in=@multicast2, @in=@multicast2, 0x2b, 0x0, 0x0, 0x0, 0x2, 0xa}, {@in=@initdev={0xac, 0x1e, 0x0, 0x0}, @in6=@private1={0xfc, 0x1, '\x00', 0x1}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, @in6=@local, 0x2b, 0x4, 0x0, 0x3503, 0x2, 0xa}, {@in6=@private1={0xfc, 0x1, '\x00', 0x1}, @in6=@mcast1, @in6=@private0, @in=@multicast1, 0x33, 0x4, 0x0, 0x3500, 0x2, 0x2}]}]}, 0x138}, 0x1, 0x0, 0x0, 0x40820}, 0x632fe732f9b010e)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000000)=@updpolicy={0xcc, 0x19, 0x1, 0x70bd2c, 0x25dfdbfb, {{@in=@remote, @in=@remote, 0x4e22, 0x3, 0x4e22, 0x3ff, 0x2, 0x0, 0x20, 0x11}, {0x7fffffff, 0x7f, 0x8002, 0x3, 0x7fff, 0x100000004, 0x2, 0x8}, {0x56d, 0x40c, 0x7, 0x1000}, 0x401, 0x0, 0x0, 0x0, 0x0, 0x3}, [@XFRMA_IF_ID={0x8, 0x1f, 0x4}, @mark={0xc, 0x15, {0x35075c, 0x9}}]}, 0xcc}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000200)={&(0x7f00000004c0)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x1, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, @in=@remote, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x3}, {0x0, 0x0, 0x800}, 0x200, 0x0, 0x0, 0x0, 0x1}}, 0xb8}, 0x1, 0x0, 0x0, 0x1}, 0x4000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000240)=@updsa={0x140, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x0, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_NAT_KEEPALIVE_INTERVAL={0x8, 0x22, 0xfffffffd}]}, 0x140}, 0x1, 0x0, 0x0, 0x895}, 0x80)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@newsa={0x158, 0x10, 0x1, 0x0, 0x25dfdbfc, {{@in6=@remote, @in6=@loopback}, {@in=@empty, 0x0, 0x32}, @in6=@loopback, {0x0, 0x0, 0x0, 0x0, 0x6, 0x9}, {0x0, 0x6}, {0x0, 0x0, 0x4}, 0x0, 0x0, 0xa, 0x0, 0x0, 0xad}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}, @replay_esn_val={0x20, 0x17, {0x60, 0x0, 0x0, 0x0, 0x0, 0x2, [0x0]}}]}, 0x158}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000004c0)=ANY=[@ANYBLOB="fc0000001900010026bd7000fbdbdf2520010000000000000000000000000002fc0100000000000000000000000000004e1e00034e2103ff0200002001000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ffffff7f00000000010000000000000001800000000000000300000000000000f700000000000000060000000000000002000000000000000100000000000000ffffff7f000000000c0000000000000007000000000000000e00000000000000030000000000010000010000000000004400050000000000000100000000ffff64010101000004d5000000000036"], 0xfc}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000240)=@getae={0x4c, 0x1f, 0x1, 0x70bd27, 0x25dfdbff, {{@in6=@remote, 0x4da, 0x2, 0x6c}, @in=@loopback, 0x2, 0x34fd}, [@mark={0xc, 0x15, {0x35075c, 0x81}}]}, 0x4c}, 0x1, 0x0, 0x0, 0x20004045}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000340)={&(0x7f0000000540)=@newsa={0x13c, 0x10, 0x713, 0x0, 0x0, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in6=@mcast1, 0x0, 0x0, 0x0, 0x0, 0x2}, {@in=@multicast2, 0x0, 0x33}, @in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', {}, {0x0, 0x0, 0x0, 0x4}, {}, 0x0, 0x3502, 0x2}, [@algo_auth_trunc={0x4c, 0x14, {{'sha1\x00'}}}]}, 0x13c}}, 0x40)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000002c0)={0x0, 0x0, &(0x7f0000001fc0)={&(0x7f0000003280)=@migrate={0xa0, 0x21, 0x217, 0x70bd26, 0x25dfdbfb, {{@in=@multicast1, @in=@broadcast, 0x4e24, 0x1, 0x4e24, 0x3, 0x2, 0x20, 0x80}, 0x6e6bbd}, [@migrate={0x50, 0x11, [{@in=@multicast2, @in6=@mcast1, @in=@multicast2, @in=@multicast2, 0x2b, 0x0, 0x0, 0x0, 0x2, 0xa}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x40820}, 0x632fe732f9b010e)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000600)=@updsa={0xf0, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in6=@loopback, 0x4e22, 0xffff, 0x4e22, 0x2, 0xa, 0x20, 0x80}, {@in=@multicast1, 0x4d2, 0x13a707ea02a4b743}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, {0x8000, 0x6, 0x6, 0x6, 0x5, 0xa3a7, 0x3f, 0x3}, {0x5, 0x0, 0x2, 0x7}, {0x4, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x0, 0x1, 0x21}}, 0xf0}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000240)={&(0x7f00000003c0)=ANY=[@ANYBLOB="5c000000150001082abd7000fddbdf2500000000000000000000000000000000fc0200000000000000000000000000004e2200004e20000302002060ff000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="00000000010000000c000800ea"], 0x5c}, 0x1, 0x0, 0x0, 0x80}, 0x14044000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000002c0)={0x0, 0x0, &(0x7f0000000280)={&(0x7f0000000100)=@delsa={0x34, 0x11, 0x1, 0x70bd25, 0x25dfdbfc, {@in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0xfffffff9, 0x2, 0xff}, [@mark={0xc, 0x15, {0x35075a, 0xd}}]}, 0x34}, 0x1, 0x0, 0x0, 0x4000}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f00000001c0)=ANY=[@ANYBLOB="ec000000210001000000000000000000fc010000000000000000000000000000ac1414aa00000000000000000000000000000000000002090200000000000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="00000000000000009c00110000000000000000000000000000000000fe"], 0xec}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000040)=@acquire={0x13c, 0x17, 0x1, 0x70bd29, 0x25dfdbfc, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0x4d3, 0x3c}, @in=@private=0xa010101, {@in=@empty, @in6=@empty, 0x4e21, 0x3, 0x4e20, 0x6, 0x2, 0x20, 0x0, 0x87, 0x0, 0xffffffffffffffff}, {{@in=@multicast1, @in=@local, 0x4e22, 0x8000, 0x4e23, 0xfff9, 0x2, 0x20, 0x20, 0x3b}, {0x4, 0x3ff, 0x6, 0x401, 0xfffffffffffffffd, 0x8, 0x4, 0x2}, {0x8, 0x5, 0xa, 0xf}, 0xf, 0x0, 0x1, 0x0, 0x2, 0x1}, 0x10001, 0x1f, 0x3ff, 0x70bd2c}, [@XFRMA_SA_PCPU={0x8, 0x23, 0x7}, @mark={0xc, 0x15, {0x35175c, 0x7}}]}, 0x13c}, 0x1, 0x0, 0x0, 0x40110}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000840)=@updsa={0x184, 0x1a, 0x1, 0x70bd23, 0x25dfdbfe, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in6=@remote, 0x4e23, 0x7ff, 0x4e1f, 0x0, 0xa, 0x80, 0x80, 0x6c}, {@in=@multicast1, 0x4d2, 0x6c}, @in=@multicast1, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0xff}, {0x9, 0xffffffffffffffff, 0xc, 0x4}, {0x1, 0x7, 0x1533c}, 0x70bd27, 0x0, 0xa, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @algo_auth_trunc={0x4c, 0x14, {{'cmac-aes-neon\x00'}, 0x0, 0x100}}]}, 0x184}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000440)=@polexpire={0xcc, 0x1b, 0x1, 0x70bd2d, 0x25dfdbfe, {{{@in6=@empty, @in=@private=0xa010101, 0x4e23, 0xfe, 0x4e24, 0x5, 0xa, 0x80, 0x80, 0x62}, {0x4, 0x5, 0x1, 0x8, 0x6, 0x2ff, 0x400, 0x2e6}, {0x7, 0x0, 0xff, 0x316}, 0xe, 0x6e6bbe, 0x2}, 0x4}, [@policy_type={0xa, 0x10, {0x3}}]}, 0xcc}, 0x1, 0x0, 0x0, 0x4010}, 0x4000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000880)=@updsa={0x138, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in6=@loopback, 0x4e21, 0xffef, 0x4e20, 0x2, 0xa, 0x20, 0x80}, {@in=@multicast1, 0x4d2, 0x2b}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x3f, 0x2}, {0x5, 0x0, 0x2, 0x7}, {0x1000, 0xb3e1, 0x6}, 0x270bd2a, 0x0, 0xa, 0x0, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'blake2b-256-generic\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x40040}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000002c0)=@updpolicy={0xb8, 0x19, 0x1, 0x70b92b, 0x25dfdbfb, {{@in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', @in=@broadcast, 0x4e1e, 0x3, 0x4e21, 0x3ff, 0x2, 0x10, 0x20, 0x11}, {0x7fffffff, 0x7f, 0x8001, 0x5, 0xa, 0x100000000, 0x2, 0x1}, {0x56d, 0xc, 0x7, 0x1000}, 0x3, 0x0, 0x0, 0x1}}, 0xb8}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
setsockopt$netlink_NETLINK_ADD_MEMBERSHIP(r0, 0x10e, 0x1, &(0x7f0000000e00)=0x14, 0x4)
sendmsg$nl_xfrm(r0, &(0x7f0000001180)={&(0x7f0000000ec0)={0x10, 0x0, 0x0, 0x10420800}, 0xc, &(0x7f0000001140)={&(0x7f0000000f00)=@expire={0xf8, 0x18, 0x8, 0x70bd2c, 0x25dfdbfd, {{{@in6=@remote, @in=@multicast1, 0x4e22, 0x286, 0x4e20, 0xd7, 0xa, 0x80, 0x0, 0x67}, {@in=@remote, 0x4d5, 0x3c}, @in=@rand_addr=0x64010100, {0x7, 0xc, 0x7, 0x1, 0x3ff, 0x100, 0x80000001, 0x4}, {0x9, 0x8, 0x100000000, 0x8000000000000000}, {0xf, 0x7, 0x8}, 0x70bd2d, 0x3503, 0x0, 0x2, 0x7, 0x98}, 0x4c}}, 0xf8}, 0x1, 0x0, 0x0, 0x51}, 0x4000800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000840)=@updsa={0x140, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0x4e1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x20, 0x33}, {@in=@multicast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x5, 0x9, 0x2, 0x80009, 0xa3a7, 0x1}, {0x5, 0x3, 0x2, 0xa}, {0x1, 0x0, 0x10000}, 0x70bd3a, 0x0, 0xa, 0x1, 0x1, 0x2}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}]}, 0x140}, 0x1, 0x0, 0x0, 0x4040041}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000004c0)=@updpolicy={0xb8, 0x19, 0x1, 0x70bd26, 0x25dfdbfb, {{@in=@rand_addr=0x64010101, @in6=@private1={0xfc, 0x1, '\x00', 0x80}, 0x4e24, 0x3, 0x4e22, 0x3ff, 0x2, 0x0, 0x20, 0x1}, {0x7fffffff, 0x1, 0x8001, 0x3, 0x2, 0x6, 0x2, 0x1}, {0x7fffffff, 0xc, 0x7, 0x200}, 0x3, 0x0, 0x0, 0x1}}, 0xb8}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000300)=@migrate={0xa0, 0x21, 0x1, 0x70bd2b, 0x25dfdbfc, {{@in=@multicast2, @in6=@private1, 0x4e24, 0x4, 0x4e23, 0xd, 0x2, 0x0, 0x0, 0x87}, 0x6e6bb2}, [@migrate={0x50, 0x11, [{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@rand_addr=0x64010104, @in6=@rand_addr=' \x01\x00', @in=@rand_addr=0x64010100, 0x3c, 0x0, 0x0, 0x0, 0x2, 0x2}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f00000029c0)=@acquire={0x1ac, 0x17, 0x1, 0x70bd29, 0x25dfdbfc, {{@in6=@empty, 0x4d3, 0x3c}, @in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, {@in=@empty, @in6=@mcast1, 0x4e21, 0x3, 0x4e20, 0x6, 0x2, 0x20, 0x0, 0x87, 0x0, 0xffffffffffffffff}, {{@in=@multicast1, @in=@local, 0x4e22, 0x8000, 0x4e23, 0xfff9, 0x2, 0x20, 0x20, 0x3b}, {0x4, 0x3ff, 0x6, 0x401, 0xfffffffffffffffd, 0x8, 0x4, 0x2}, {0x8, 0xffffffffffffffff, 0xa, 0xf}, 0xf, 0x0, 0x1, 0x0, 0x2, 0x1}, 0x10001, 0x1c, 0x3ff, 0x70bd25}, [@tmpl={0x84, 0x5, [{{@in6=@remote, 0x4d3, 0x32}, 0x2, @in6=@mcast1, 0x3502, 0x0, 0x1, 0x7, 0x7, 0x1, 0x4}, {{@in6=@ipv4={'\x00', '\xff\xff', @loopback}, 0x4d4, 0x33}, 0xa, @in6=@dev={0xfe, 0x80, '\x00', 0x31}, 0x3505, 0x4, 0x2, 0x0, 0x80000001, 0x8736, 0x8}]}]}, 0x1ac}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000840)=@updsa={0x140, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0x4e1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x80, 0x33}, {@in=@multicast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x8, 0x9, 0x2, 0x80005, 0xa3a7, 0x1}, {0x5, 0x3, 0x2, 0x7}, {0x1, 0x0, 0x10000}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x8}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}]}, 0x140}, 0x1, 0x0, 0x0, 0x4040041}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f00000003c0)=@newsa={0x140, 0x10, 0x113, 0x0, 0x0, {{@in=@loopback, @in=@empty, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xee00}, {@in=@multicast1, 0x0, 0x32}, @in=@dev, {}, {}, {}, 0x0, 0x0, 0xa, 0x3}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}, @tfcpad={0x8, 0x16, 0x1000}]}, 0x140}, 0x1, 0x0, 0x0, 0x40080}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f00000001c0)=@updsa={0x144, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x2, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @mark={0xc, 0x15, {0x35075a, 0xe90b}}]}, 0x144}, 0x1, 0x0, 0x0, 0x40895}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000680)=@updsa={0x138, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000700)=@delsa={0x28, 0x11, 0x1, 0x0, 0x0, {@in6=@loopback, 0x0, 0x2}}, 0x28}}, 0x0)

      
      sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@migrate={0x50, 0x21, 0x4, 0x70bd2d, 0x25dfdbff, {{@in=@empty, @in6=@loopback, 0x4e22, 0x3ff, 0x4e21, 0x3a6, 0xa, 0x20, 0xc0, 0x32}, 0x6e6bbe, 0x1}}, 0x50}, 0x1, 0x0, 0x0, 0xc080}, 0x4000031)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000003c0)=ANY=[@ANYBLOB="400000001200050104000000ffdbdf2528001a00ac1414aa0000000000000000000000000a01010000080000000000000000000002000610050019"], 0x40}, 0x13, 0x0, 0x0, 0x880}, 0x20040840)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000005c0)={0x0, 0x0, &(0x7f0000000580)={&(0x7f0000000000)=ANY=[@ANYBLOB="cc0000001b0001002cbd7000fbdbdf2500000000000000000000ffffac1414aaac1e01010000000000000000000000004e2200004e2373460a00202084000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="e926000000000000080000000000000003000000000000004000000000000000cd0d0000000000000700000000000000ffff000000000000080000000000000002000000000000000700000000000000050000000000000009000000000000000800000000000000010100000000000001000000000000000c00080018"], 0xcc}, 0x1, 0x0, 0x0, 0x4004}, 0x4)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000200)=@newpolicy={0xc4, 0x13, 0x1, 0x70bd28, 0x25dfdbfd, {{@in6=@ipv4={'\x00', '\xff\xff', @local}, @in6=@dev={0xfe, 0x80, '\x00', 0x29}, 0x4e24, 0xa3, 0x4e21, 0x7ff, 0xa, 0x80, 0x0, 0x2c}, {0x2, 0x0, 0x1, 0x2, 0xea1, 0x7, 0x9, 0x1}, {0x272dcfdb, 0x7, 0x5, 0xffff}, 0x1, 0x0, 0x0, 0x0, 0x1}, [@mark={0xc, 0x15, {0x35075b, 0x5}}]}, 0xc4}, 0x1, 0x0, 0x0, 0x8000}, 0x4)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000540)=@getpolicy={0x50, 0x15, 0x103, 0x70bd2d, 0x25dfdbfe, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@remote, 0x4e24, 0x3, 0x4e20, 0xfffb, 0xa, 0x0, 0x0, 0x2b}, 0x6e6bba, 0x2}}, 0x50}, 0x1, 0x0, 0x0, 0x44}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000980)=@updsa={0x138, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in=@local, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0xff}, {0x9, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0xa, 0x4, 0x0, 0x1}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001400)={0x0, 0x0, &(0x7f00000013c0)={&(0x7f0000000100)=@allocspi={0x104, 0x16, 0x17, 0x70bd25, 0x25dfdbfe, {{{@in=@loopback, @in=@remote, 0x4e21, 0xd21, 0x4e24, 0x3, 0x2, 0x0, 0x20, 0x2b}, {@in=@multicast2, 0x4d3, 0x33}, @in6=@dev={0xfe, 0x80, '\x00', 0x14}, {0x800, 0xe34, 0x3, 0x1, 0x800000000000000, 0xf4, 0xa, 0xffffffff}, {0x3, 0x1ff, 0xfffffffffffffff8, 0x4}, {0x9, 0x1, 0x7}, 0x70bd25, 0x3505, 0x2, 0x2, 0x8, 0xa0}, 0x411, 0x58a}, [@lastused={0xc, 0xf, 0x9}]}, 0x104}, 0x1, 0x0, 0x0, 0x20014}, 0x20008000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=@allocspi={0x11c, 0x16, 0x201, 0x70bd25, 0x25dfdbfd, {{{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0x4e23, 0x100, 0x4e25, 0x6, 0x2, 0x80, 0xa0, 0x6c}, {@in6=@mcast1, 0x4d2, 0x6c}, @in=@private=0xa010101, {0xfffffffffffffffe, 0x100000001, 0x10, 0x795, 0x2, 0x6, 0x7ffd, 0x62}, {0x7, 0x7, 0x1423ac00, 0x3}, {0x1ff, 0x5, 0xfffffffb}, 0x70bd27, 0x0, 0x2, 0x1, 0x1, 0x6f}, 0x6, 0x401}, [@lifetime_val={0x24, 0x9, {0x2, 0x80000000, 0x4, 0x1}}]}, 0x11c}, 0x1, 0x0, 0x0, 0x20008804}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f00000005c0)=@migrate={0xa0, 0x21, 0x1, 0x0, 0x4, {{@in6=@mcast1, @in6=@private2, 0x0, 0x0, 0x0, 0x0, 0xa}}, [@migrate={0x50, 0x11, [{@in6=@remote, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, @in6=@rand_addr=' \x01\x00', @in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, 0x2b, 0x2, 0x0, 0x3501, 0x2, 0x2}]}]}, 0xa0}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001280)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000300)=@updsa={0x10c, 0x1a, 0x1, 0x70bd2d, 0x25dfdbfe, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@remote, 0x4e22, 0x100, 0x4e20, 0xfffc, 0xa, 0x0, 0x80, 0x33}, {@in=@multicast1, 0x4d9, 0x3c}, @in=@empty, {0xfff, 0x2, 0x8000000000000000, 0x9, 0x8, 0x2, 0x5, 0x400}, {0x7, 0x1, 0x801, 0x8}, {0x0, 0x6}, 0x70bd2a, 0x3503, 0x2, 0x2, 0xe1, 0x21}, [@encap={0x1c, 0x4, {0x1, 0x4e21, 0x4e23, @in=@loopback}}]}, 0x10c}, 0x1, 0x0, 0x0, 0x4001}, 0x40004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000340)=ANY=[@ANYBLOB="3800000012000501000000000000000028001a000a0101020000000020000000000000000a010100000000000a000609302189db45d67a93"], 0x38}}, 0x20040810)

      
      sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000440)=@newae={0x40, 0x1e, 0x1, 0x70bd2d, 0x25dfdbfe, {{@in6=@private1, 0x4d3, 0x8, 0x32}, @in=@remote, 0xea7, 0x3502}}, 0x40}}, 0x24000014)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x5, &(0x7f0000000140)={&(0x7f0000000380)=@migrate={0xec, 0x21, 0x1, 0x70bd25, 0x25dfdbfd, {{@in=@multicast2, @in=@dev={0xac, 0x14, 0x14, 0x1e}, 0x4e24, 0x4, 0x4e23, 0xd, 0x2, 0x0, 0xc0, 0xc}, 0x6e6bb2, 0x1}, [@migrate={0x9c}]}, 0xec}, 0x1, 0x0, 0x0, 0x81}, 0x20004000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000440)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000c00)=@newsa={0x148, 0x10, 0x1, 0xfffffffe, 0x80000100, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@rand_addr=0x64010101, 0x1, 0x714, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@dev={0xac, 0x14, 0x14, 0x3f}, {0x0, 0x192, 0x6, 0x10003, 0x8251c, 0x2, 0x1000000000000ba0}, {0xffffffffffffffff, 0x0, 0x1f, 0xfffffffffffffffe}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0x2, 0x1, 0x0, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}, @XFRMA_NAT_KEEPALIVE_INTERVAL={0x8, 0x22, 0x78}]}, 0x148}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000035c0)={0x0, 0x0, &(0x7f0000003580)={&(0x7f0000000a40)=@newsa={0x14c, 0x10, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@remote, 0x0, 0xfff7, 0x2000, 0x1, 0x0, 0x0, 0x0, 0x3b, 0x0, 0xffffffffffffffff}, {@in=@rand_addr=0x64010102, 0x0, 0x6c}, @in=@remote, {0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x4}, {0x800000000000, 0x4, 0x40000000}, {}, 0x70bd25, 0x0, 0x2, 0x0, 0x1}, [@XFRMA_IF_ID={0x8, 0x1f, 0x4}, @algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @offload={0xc, 0x1c, {0x0, 0x2}}]}, 0x14c}}, 0x4810)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000002a80)={0x0, 0x0, &(0x7f0000002a40)={&(0x7f0000000080)=ANY=[@ANYBLOB="28000000120001", @ANYBLOB="1a10c1"], 0x28}, 0x1, 0x0, 0x0, 0x4000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000002c0)={0x0, 0x0, &(0x7f0000000280)={&(0x7f0000000100)=@delsa={0x28, 0x11, 0x1, 0x70bd25, 0x25dfdbfc, {@in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0xfffffff9, 0x2, 0xff}}, 0x28}, 0x1, 0x0, 0x0, 0x4000}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000006c0)=ANY=[@ANYBLOB="3c000000120001002bbd7000fedbdf2500000000000000000000000000000000000000000000bf0014000d"], 0x3c}, 0x1, 0x0, 0x0, 0x40804}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000040)=@migrate={0x50, 0x21, 0x1, 0x70bd2c, 0x25dfdbfd, {{@in=@multicast2, @in=@dev={0xac, 0x14, 0x14, 0x1f}, 0x4e24, 0x4, 0x4e23, 0xd, 0x2}, 0x6e6bb2, 0x1}}, 0x50}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000700)=@newsa={0x184, 0x10, 0x1, 0x70bd25, 0x25dfdbfb, {{@in=@loopback, @in=@local, 0x4e24, 0x1ff, 0x4e24, 0x0, 0xa, 0x80, 0x0, 0xe5}, {@in=@dev={0xac, 0x14, 0x14, 0xc}, 0x4d6, 0x33}, @in=@empty, {0x76b0, 0xacf, 0x8, 0x2, 0x8, 0x3, 0x80, 0x9}, {0xc, 0x7f, 0x1, 0x8000000000000000}, {0x32, 0x200, 0x8}, 0x70bd28, 0x3501, 0x2, 0x0, 0x6, 0xc2}, [@algo_auth_trunc={0x4c, 0x14, {{'rmd128\x00'}, 0x0, 0xa0}}, @algo_crypt={0x48, 0x2, {{'xts-serpent-sse2\x00'}}}]}, 0x184}, 0x1, 0x0, 0x0, 0x880}, 0x4980)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000000)=@updsa={0x144, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in=@broadcast, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in=@multicast1, 0x4d6, 0x32}, @in6=@private0, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x3f}, {0x5, 0x0, 0x2, 0x7}, {0x1, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-mb\x00'}}}, @sec_ctx={0xc, 0x8, {0xb, 0x8, 0x1, 0x1}}]}, 0x144}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000004c0)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000280)=ANY=[@ANYBLOB="1c00000028000100800000000000000000000000050021"], 0x1c}, 0x1, 0x0, 0x0, 0x8c4}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000640)=ANY=[@ANYBLOB="1c000000240015032700000000000000000d1a0006000300fa"], 0x1c}}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=@updpolicy={0xb8, 0x13, 0xcb23c9c9931e99e9, 0x70bd25, 0x0, {{@in6=@empty, @in=@local, 0x0, 0x0, 0x0, 0x0, 0xa, 0x30, 0x80, 0x3c, 0x0, 0xee01}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x802000000000000}, {0xfffffffffffffffe}}}, 0xb8}}, 0x40004)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x2500, &(0x7f0000000440)={&(0x7f0000000000)=ANY=[@ANYBLOB="6501000014"], 0x188}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000900)=@newsa={0x13c, 0x10, 0x713, 0x0, 0x25dfdbfd, {{@in=@rand_addr=0x64010101, @in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', 0x0, 0x0, 0x4e21, 0x2, 0xa, 0x0, 0x0, 0x1d}, {@in6=@private1={0xfc, 0x1, '\x00', 0xff}, 0xfe, 0x32}, @in6=@empty, {0x0, 0x0, 0xfffffffffffffffd, 0x8, 0x1, 0x9, 0x7fffffff, 0x543}, {0x4, 0x7fffffffffffffff, 0xfffffffffffffffc, 0xebe2}, {0x2}, 0x70bd2c, 0x3500, 0xa, 0x4, 0x80, 0x50}, [@algo_aead={0x4c, 0x12, {{'rfc4106(gcm(aes))\x00'}, 0x0, 0x60}}]}, 0x13c}, 0x1, 0x0, 0x0, 0x880}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000340)={&(0x7f0000000800)=@newsa={0x13c, 0x10, 0x713, 0x0, 0x0, {{@in=@private=0xa010100, @in=@initdev={0xac, 0x1e, 0x0, 0x0}}, {@in6=@local, 0x4d3, 0x33}, @in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x8000000000000001, 0x0, 0xfffffffffffffffe}, {}, 0x0, 0x3502, 0x2, 0x2}, [@algo_auth_trunc={0x4c, 0x14, {{'sha1\x00'}}}]}, 0x13c}}, 0x4000050)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000040)=@newae={0x5c, 0x1e, 0x1, 0x70bd2d, 0x25dfdbfe, {{@in6=@private1, 0x4d3, 0x8, 0x32}, @in=@remote, 0xea9, 0x3502}, [@replay_esn_val={0x1c, 0x17, {0x0, 0x70bd2a, 0x70bd2c, 0x70bd29, 0x70bd25, 0xffffffff}}]}, 0x5c}}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000000)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x0, {{@in=@multicast1=0xe0000002, @in6=@loopback, 0x0, 0x0, 0x0, 0x0, 0xa, 0x0, 0x0, 0x87}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfffffffffffffffb}, {0xfffffffffffffffd}}}, 0xb8}}, 0x0)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000000)=ANY=[@ANYBLOB="b800000015"], 0xb8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, 0x0, 0x0)
sendmsg$nl_xfrm(r0, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f00000001c0)=ANY=[@ANYBLOB="b80000001300e9990000000000000000fc000000000000000000000000000000ac1e000100000000000000000000000000000000000000000a0040"], 0xb8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000001c0)=@newsa={0x158, 0x10, 0x1, 0x0, 0x0, {{@in6=@private1, @in=@private, 0x800, 0x0, 0x0, 0x0, 0x2}, {@in, 0x0, 0x32}, @in=@local, {0x3, 0x6}, {}, {}, 0x0, 0x0, 0xa, 0x0, 0x0, 0xcd}, [@replay_esn_val={0x1c}, @algo_auth_trunc={0x4c, 0x14, {{'cmac(aes)\x00'}}}]}, 0x158}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001280)={0x0, 0x0, &(0x7f0000001240)={&(0x7f00000001c0)=ANY=[@ANYBLOB="040100001a0001002dbd7000fedbdf25fc00000041d0c68f77c69b9d2af40ce73d15c232000000000000000000000001ac1414bb00"/64, @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="e0000001000000000000000000000000000004d63c00000000000000000000000000000000000000ff0f000000000000000000000000000000000000000000800900000000000000080000000000000002000000000000000500000000000000fdffffffffffffff07000000000000000100000000000000000800000000000008000000000000000000000002000000000000002abd70000335000002000205210000000000000014000e"], 0x104}, 0x1, 0x0, 0x0, 0x4001}, 0x8044)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000780)=@newsa={0x144, 0x10, 0x1, 0xbffffffe, 0x100, {{@in=@empty, @in6=@ipv4={'\x00', '\xff\xff', @remote}, 0x1, 0x394, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@remote, {0x0, 0x9, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8}, {0x6, 0x0, 0x1f, 0x1ff}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0xa, 0x1, 0xfd, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @mark={0xc, 0x15, {0x35075a, 0x4}}]}, 0x144}, 0x1, 0x0, 0x0, 0x8801}, 0x10)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000500)=@newsa={0x138, 0x18, 0x1, 0x70bd27, 0x100, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in6=@private1={0xfc, 0x1, '\x00', 0x1}, 0x1, 0x714, 0x4e23, 0x5, 0x0, 0x0, 0x60, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in6=@ipv4={'\x00', '\xff\xff', @initdev={0xac, 0x1e, 0x0, 0x0}}, {0x0, 0x4, 0x6, 0xffff, 0x8251c, 0x2, 0x9}, {0xffffffffffffffff, 0x0, 0x1f, 0xfffffffffffffffe}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0xa, 0x1, 0x4, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={&(0x7f0000000000), 0xc, &(0x7f0000000180)={&(0x7f0000000380)=@polexpire={0xc0, 0x1b, 0x1, 0x0, 0x0, {{{@in=@multicast2, @in6=@local}, {}, {}, 0x0, 0x8000000}}}, 0xc0}, 0x8}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000035c0)={0x0, 0x0, &(0x7f0000003580)={&(0x7f0000000bc0)=@newsa={0x138, 0x10, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@remote, 0x0, 0xfff7, 0x2000, 0x1, 0xa, 0x0, 0x0, 0x3b, 0x0, 0xffffffffffffffff}, {@in=@rand_addr=0x64010102, 0xfffffffd, 0x6c}, @in=@broadcast, {0x0, 0x0, 0x0, 0x0, 0x3, 0x2000000}, {0x0, 0x4, 0x40000000, 0xfffffffffffffffc}, {0x40}, 0x0, 0x0, 0xa, 0x2, 0x1, 0xe0}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x801}, 0x4810)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000003c0)={0x0, 0x0, &(0x7f0000000380)={&(0x7f00000000c0)=@newsa={0x104, 0x1a, 0x327, 0x70bd2a, 0x0, {{@in6=@dev={0xfe, 0x80, '\x00', 0x1b}, @in=@multicast1, 0xffff, 0x0, 0x4e22, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0xee00}, {@in6=@mcast1, 0x0, 0x2b}, @in6=@private0, {0x5a, 0xb400, 0x2, 0xfeffff7f00000001, 0x0, 0x60000}, {0x0, 0x200000, 0x6, 0xfffffffffffffffd}, {0x40000, 0x0, 0xae8}, 0x0, 0x0, 0xa, 0x0, 0x0, 0x70}, [@coaddr={0x14, 0xe, @in6=@remote}]}, 0x104}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000380)=ANY=[@ANYBLOB="84010000100013070000000000000000fe88000000000000000000000000000120010000000000000000000000000000000046a200"/64, @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="fe8000000000000000000000000000bb0000000032000000fe880000000000000000000000000001000000000000000000000000000000100000000000000000200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000fcffffffffffffff00000000000000000000000000000000070000000000000000000000000000000000000002000000000000000000000048000200656362286369706865725f6e756c6c29000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004c00140073686131000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004201"], 0x184}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000140)=@updsa={0x154, 0x1a, 0x1, 0x70bd27, 0x25dfdbfc, {{@in=@loopback, @in6=@mcast2, 0x4e1e, 0x4, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in=@multicast1, 0x4d4, 0x32}, @in=@local, {0x8000, 0x6, 0x9, 0x10000002, 0x5, 0xa3a7, 0x3f, 0x10}, {0x5, 0x3, 0x2, 0x7}, {0x1, 0xb3e1, 0x10000}, 0x70bd2a, 0x0, 0x2, 0x1, 0x1, 0x21}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}, @encap={0x1c, 0x4, {0x0, 0x4e21, 0x4e24, @in=@multicast2}}]}, 0x154}, 0x1, 0x0, 0x0, 0x4040000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000e40)=@updsa={0x184, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in6=@loopback, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x20, 0x80}, {@in=@multicast1, 0x4d2, 0x32}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x3f, 0x3}, {0x5, 0x0, 0x2, 0x7}, {0x4, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x0, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-mb\x00'}}}, @algo_aead={0x4c, 0x12, {{'morus1280-avx2\x00'}, 0x0, 0xa0}}]}, 0x184}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000780)=@newsa={0x138, 0x10, 0x1, 0xbffffffe, 0x100, {{@in=@empty, @in6=@ipv4={'\x00', '\xff\xff', @remote}, 0x1, 0x394, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in=@rand_addr=0x64010102, 0x4d4, 0x6c}, @in=@remote, {0x0, 0x9, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8}, {0x6, 0x0, 0x1f, 0x1ff}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0xa, 0x1, 0xfd, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x10)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000500)=@newsa={0x138, 0x18, 0x1, 0x70bd27, 0x100, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in6=@private1={0xfc, 0x1, '\x00', 0x1}, 0x1, 0x714, 0x4e23, 0x5, 0x0, 0x0, 0x60, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in6=@ipv4={'\x00', '\xff\xff', @initdev={0xac, 0x1e, 0x0, 0x0}}, {0x0, 0x4, 0x6, 0xffff, 0x8251c, 0x2, 0x9}, {0xffffffffffffffff, 0x0, 0x1f, 0xfffffffffffffffe}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0xa, 0x1, 0x4, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000004c0)=ANY=[@ANYBLOB="fc0000001900010026bd7000fbdbdf2520010000000000000000000000000002fc0100000000000000000000000000004e1e00034e2103ff0200002001000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ffffff7f00000000010000000000000001800000000000000300000000000000f700000000000000060000000000000002000000000000000100000000000000ffffff7f000000000c00000000000000070000"], 0xfc}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000300)=@updsa={0x140, 0x1a, 0x1, 0x70bd2b, 0x25dfdbfd, {{@in=@loopback, @in=@private=0xa010102, 0x4e22, 0xffef, 0x4e30, 0x2, 0xa, 0x60, 0x80}, {@in6=@local, 0x4d6, 0x32}, @in6=@private0, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x9da}, {0x5, 0x0, 0x2, 0x7}, {0x1, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-arm64-neon\x00'}}}, @XFRMA_IPTFS_DROP_TIME={0x8, 0x24, 0x10}]}, 0x140}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000040)=@acquire={0x130, 0x17, 0x1, 0x70bd29, 0x25dfdbfc, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0x4d3, 0x3c}, @in=@private=0xa010101, {@in=@empty, @in6=@empty, 0x4e21, 0x3, 0x4e20, 0x6, 0x2, 0x20, 0x0, 0x87, 0x0, 0xffffffffffffffff}, {{@in=@multicast1, @in=@local, 0x4e22, 0x8000, 0x4e23, 0xfff9, 0x2, 0x20, 0x20, 0x3b}, {0x4, 0x3ff, 0x6, 0x401, 0xfffffffffffffffd, 0x8, 0x4, 0x2}, {0x8, 0x5, 0xa, 0xf}, 0xf, 0x0, 0x1, 0x0, 0x2, 0x1}, 0x10001, 0x1f, 0x3ff, 0x70bd2c}, [@XFRMA_SA_PCPU={0x8, 0x23, 0x7}]}, 0x130}, 0x1, 0x0, 0x0, 0x40110}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000080)={&(0x7f00000014c0)=@newsa={0x13c, 0x10, 0x413, 0x0, 0x0, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in6=@rand_addr=' \x01\x00', 0xfffd, 0x0, 0x4e24, 0x0, 0x2, 0x0, 0x20, 0x0, 0x0, 0xee00}, {@in=@rand_addr=0x64010100, 0x20, 0x32}, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, {0x0, 0x7, 0x0, 0x4, 0x2000000000000000, 0x4, 0x20000000008}, {0x100000001, 0x8, 0xcc, 0x8}, {0xf8}, 0x0, 0x4, 0x2, 0x0, 0x1}, [@algo_aead={0x4c, 0x12, {{'rfc4309(ccm(aes))\x00'}, 0x0, 0x40}}]}, 0x13c}}, 0x844)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000000c0)=@getsa={0x3c, 0x12, 0x1, 0x70bd6b, 0x25dfdbfe, {@in6=@local, 0x4d4, 0xa, 0x2b}, [@srcaddr={0x14, 0xd, @in6=@ipv4={'\x00', '\xff\xff', @multicast1}}]}, 0x3c}, 0x1, 0x0, 0x0, 0x40804}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000001100)=@newsa={0x158, 0x10, 0x1, 0x0, 0x0, {{@in6=@private1={0xfc, 0x1, '\x00', 0x1}, @in=@multicast1, 0x0, 0xecdf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x87}, {@in=@broadcast, 0x0, 0x32}, @in6=@private1, {0xfffffffffffffffe, 0x0, 0x0, 0x0, 0x0, 0xfffffffffffffffc}, {0x0, 0x800}, {}, 0x0, 0x0, 0x2, 0x0, 0x0, 0x81}, [@replay_esn_val={0x1c, 0x17, {0x0, 0x70bd2a, 0x70bd27}}, @algo_auth_trunc={0x4c, 0x14, {{'hmac(sha256)\x00'}, 0x0, 0x80}}]}, 0x158}}, 0x4050)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000000040)=@getspdinfo={0x14, 0x25, 0x1, 0x70bd2c, 0x25dfdbfc, 0x9}, 0x14}, 0x1, 0x0, 0x0, 0x40c0}, 0x94)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000200)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@updpolicy={0xb8, 0x19, 0xfd3649826d894c67, 0x0, 0x0, {{@in6=@mcast1, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0xc0}, {0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x1}, {}, 0x0, 0x0, 0x1, 0x0, 0x2}}, 0xb8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000240)=@migrate={0xac, 0x21, 0x1, 0x70bd27, 0x25dfdbfb, {{@in6=@mcast1, @in6=@remote, 0x4e21, 0x0, 0x4e21, 0x9ffd, 0x7, 0x180, 0x20, 0x1d}, 0x6e6ba8, 0x4b75cd6ef3e93cb3}, [@migrate={0x50, 0x11, [{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in6=@loopback, @in=@broadcast, @in6=@empty, 0x6c, 0x1, 0x0, 0x0, 0xa, 0x2}]}, @offload={0xc, 0x1c, {0x0, 0x2}}]}, 0xac}, 0x1, 0x0, 0x0, 0x4000840}, 0x20004040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000780)=@newsa={0x138, 0x10, 0x1, 0xbffffffe, 0x100, {{@in=@empty, @in6=@ipv4={'\x00', '\xff\xff', @remote}, 0x1, 0x394, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@remote, {0x0, 0x9, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8}, {0x6, 0x0, 0x1f, 0x1ff}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0xa, 0x1, 0xfd, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x10)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000040)=@migrate={0xa0, 0x21, 0x1, 0x70bd2c, 0x25dfdbfd, {{@in=@multicast2, @in=@dev={0xac, 0x14, 0x14, 0x1f}, 0x4e24, 0x4, 0x4e23, 0xd, 0x2}, 0x6e6bb2, 0x1}, [@migrate={0x50, 0x11, [{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in=@rand_addr=0x64010100, @in6=@empty, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x32, 0x0, 0x0, 0x3501, 0x4d21fe76179a5365, 0x2}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000500)={&(0x7f00000001c0)=ANY=[@ANYBLOB="3c010000100033060000000000000000ffffffff000000000000000000000000fe8000000000000000000000000000aa00000000000000000200000062000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="000000000000000000000000000000010000000232000000fe880000000000000000000000000001000000000000009aaf0a3f012d94f600000000000000000000000000000000000000000000000000fdffffffdfffff070000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000020000000a000000000000004c001400726d643136300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000044"], 0x13c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f00000004c0)=@updsa={0x180, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x4, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @algo_crypt={0x48, 0x2, {{'ctr-blowfish-asm\x00'}}}]}, 0x180}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000540)=@getpolicy={0x50, 0x15, 0x103, 0x70bd2d, 0x25dfdbfe, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@remote, 0x4e24, 0x3, 0x4e20, 0xfffb, 0xa, 0x0, 0x0, 0x2b}, 0x6e6bba, 0x2}}, 0x50}, 0x1, 0x0, 0x0, 0x44}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000480)=ANY=[@ANYBLOB="a0000000210001002dbd7000fedbdf2500000000000000000000000000000000e00000020000000000000000000000004e2200014e2000070a00e0a03b000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="c36b6e0000000000500011"], 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000a80)={0x0, 0x0, &(0x7f0000000a40)={&(0x7f0000000740)=@newsa={0x13c, 0x10, 0x1, 0x70bd26, 0x25dfdbfb, {{@in=@dev={0xac, 0x14, 0x14, 0x31}, @in=@remote, 0x4e24, 0x8, 0x4e21, 0xd, 0xa, 0x20, 0x20, 0x87}, {@in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4d5, 0x2b}, @in6=@mcast2, {0xffffffff, 0x0, 0x9, 0x2, 0x1, 0x7ff, 0x6, 0x7}, {0x8, 0x8dc000000000000, 0xffffffffffffff45, 0x7}, {0x112, 0x97d, 0x200}, 0x70bd29, 0x3503, 0x2, 0x2, 0x2, 0x15}, [@algo_auth_trunc={0x4c, 0x14, {{'sha512-arm64\x00'}, 0x0, 0xc0}}]}, 0x13c}, 0x1, 0x0, 0x0, 0x81}, 0x8800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000240)=@updsa={0x15c, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x0, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @lifetime_val={0x24, 0x9, {0x2000000000000000, 0x1000, 0x0, 0x5}}]}, 0x15c}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000003c0)={0x0, 0x0, &(0x7f0000000380)={&(0x7f0000000000)=@newsa={0x120, 0x1a, 0x7, 0x0, 0x0, {{@in6=@mcast2, @in=@multicast2, 0xffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0xee00}, {@in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x0, 0x2b}, @in6=@private0, {0x4a9a, 0x0, 0x2, 0x9e6}, {0x0, 0x200000, 0x7}, {0x40002, 0x0, 0xae8}, 0x0, 0x0, 0xa, 0x2, 0x0, 0x70}, [@replay_esn_val={0x1c, 0x17, {0x0, 0x70bd26, 0x70bd2b, 0x70bd29, 0x70bd2c, 0x1}}, @coaddr={0x14, 0xe, @in=@empty}]}, 0x120}, 0x1, 0x0, 0x0, 0x48000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000640)=ANY=[@ANYBLOB="440000002400150327bd7000fbdbdf259a0d00000600040000"], 0x44}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000280)=@migrate={0xf8, 0x21, 0x1, 0x0, 0x0, {{@in=@remote, @in=@remote, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xee01}}, [@policy={0xa8, 0x7, {{@in=@empty, @in6=@private0}}}]}, 0xf8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000800)=@updsa={0x140, 0x1a, 0x1, 0x70bd65, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x0, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d5, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0xc}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x1}]}, 0x140}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000002a80)={0x0, 0x0, &(0x7f0000002a40)={&(0x7f0000000040)=ANY=[@ANYBLOB="28000000120001", @ANYBLOB="1a10"], 0x28}, 0x1, 0x0, 0x0, 0x4000}, 0x4410)
sendmsg$nl_xfrm(r0, &(0x7f0000002a80)={0x0, 0x0, &(0x7f0000002a40)={&(0x7f0000000080)=ANY=[@ANYBLOB="28000000120001", @ANYBLOB="1a10"], 0x28}, 0x1, 0x0, 0x0, 0x4000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000003c0)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000280)=@updsa={0x104, 0x1a, 0x1, 0x0, 0x0, {{@in6=@private1, @in=@initdev={0xac, 0x1e, 0x0, 0x0}}, {@in=@empty=0xe00, 0x2, 0x3c}, @in=@multicast1, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4}, {}, {}, 0x0, 0x0, 0xa, 0x4, 0x9, 0x60}, [@coaddr={0x14, 0xe, @in6=@private2}]}, 0x104}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000100)=ANY=[@ANYBLOB="a0000000210001002bbd7000fddbdf2500000000000000000000000000000000ac14140b0000000000000000000000004e2000004e2207ff0200808011000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="000002101200af00500011"], 0xa0}, 0x1, 0x0, 0x0, 0x20000016}, 0x4000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000280)={&(0x7f0000000600)=@allocspi={0x100, 0x16, 0x1, 0x70bd25, 0x25dfdbfe, {{{@in6=@private1, @in=@empty, 0x4e22, 0x0, 0x4e23, 0x8, 0xa, 0x20, 0x80, 0x0, 0x0, 0xee00}, {@in=@multicast2, 0x4d4, 0x33}, @in=@rand_addr=0x64010100, {0x9, 0x9, 0x8, 0x9, 0xfffffffffffffffb, 0x1741, 0x9, 0x4}, {0x5, 0x400, 0x6bd6f733}, {0x101, 0x7fe4, 0x1}, 0x70bd28, 0x3502, 0xa, 0x3, 0x31, 0x10}, 0x4, 0x9}, [@XFRMA_IF_ID={0x8, 0x1f, 0x3}]}, 0x100}, 0x1, 0x0, 0x0, 0x20048884}, 0x880)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f00000008c0)=ANY=[@ANYBLOB="680100001a000100feffffff0001000000000000000000000000ffffe0000002fe8800000000000000000000000001010001071c4e230005000000003a000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff020000000000000000000000000001000004d46c000000ac14142500000000000000000000000000000000000000009201000000000000a39b000000000000ffff0000000000001c250800000000000500000000000000fcffffffffffffff0000000000000000ffffffffffffffff00000000000000001f00000000000000fefffffffffffffffafffffffcffffff000000008000000000350000020001002000000000000000480003006465666c6174650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000090000000008001f0003"], 0x168}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000040)=ANY=[@ANYBLOB="1c0000002400150327bd7000fbdbdf259a0d0000060004006ef5"], 0x1c}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000005c0)={0x0, 0x0, &(0x7f0000000580)={&(0x7f0000000240)=@migrate={0x27c, 0x21, 0x1, 0x0, 0x1, {{@in, @in6=@remote}}, [@migrate={0x219, 0x11, [{@in=@multicast1, @in=@local, @in=@local, @in=@broadcast, 0x3c, 0x4, 0x0, 0x3505, 0xa, 0x2}, {@in=@empty, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, @in=@multicast2, @in6=@empty, 0x6c, 0x1, 0x0, 0x3506, 0x2, 0x2}, {@in=@empty, @in=@private=0xa010100, @in6=@remote, @in6=@mcast2, 0x33, 0x2, 0x0, 0x0, 0xa, 0xa}, {@in=@local, @in6=@local, @in6=@empty, @in6=@private2, 0xff, 0x3, 0x0, 0x3503, 0x8, 0x2}, {@in=@broadcast, @in6=@ipv4={'\x00', '\xff\xff', @multicast2}, @in=@private, @in=@dev, 0x0, 0x1, 0x0, 0x3506, 0x2, 0xf}, {@in6=@mcast2, @in=@broadcast, @in6=@private2, @in6=@remote}]}, @policy_type={0xa}, @replay_esn_val={0x40, 0x17, {0x9, 0x70bd29, 0x70bd2d, 0x70bd27, 0x70bd28, 0x7f, [0x0, 0x7, 0x7, 0x5, 0x80, 0x0, 0x5, 0xfffffff7, 0x6]}}, @proto={0x5, 0x19, 0x6c}, @mark={0xc, 0x15, {0x35075d, 0x3}}]}, 0x27c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000280)=ANY=[@ANYBLOB="c0000000190001000000000010000000e0000002000000000000000000000000ac1414bb00000000000000000000000000000000000000000a00008006000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="000000000000000002ee0000000000001910be1a000000000000000000000000000000000000000008000000000000000000000000000000fdfffffffffeffff000000000000000000000000000000000000000000000000000000000000000000000000b86b6e00000100010000000008001f000100"], 0xc0}, 0x1, 0x0, 0x0, 0x4008011}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000200)={&(0x7f0000000c40)=ANY=[@ANYBLOB="d40000001b001d0328bd7000fcdbdf25ffffffff000000000000000000000000fe8000000000000000000000000000264e2000014e2404000200000087000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="08000000000000000100010000000000f4ffffffffffffff04000000000000000200000000000000ba410000000091ad07000000000000000000008000000000ffffff7f0000000003000000000000000400000000000000080000000000000003000000b86b6e000000030100000000040000000000000008001f0001"], 0xd4}, 0x1, 0x0, 0x0, 0x20044001}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000200)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@updpolicy={0xb8, 0x19, 0xfd3649826d894c67, 0x0, 0x0, {{@in6=@mcast1, @in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, 0x0, 0x0, 0x0, 0x0, 0xa, 0x0, 0x80}}}, 0xb8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000280)={0x0, 0x0, &(0x7f0000000000)={&(0x7f00000007c0)=@newsa={0x1a0, 0x10, 0x1, 0x70bd2d, 0x0, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, @in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x64}, {@in, 0x0, 0x32}, @in6=@dev={0xfe, 0x80, '\x00', 0x2c}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x4}, {0x0, 0x0, 0x8000000, 0x200}, {0x0, 0x0, 0x2}, 0x0, 0x0, 0x2, 0x1, 0x0, 0xcd}, [@algo_crypt={0x48, 0x2, {{'cbc(aes)\x00'}}}, @replay_esn_val={0x1c, 0x17, {0x0, 0x70bd25, 0x0, 0x0, 0x70bd28, 0xeaa}}, @algo_auth_trunc={0x4c, 0x14, {{'cmac(aes)\x00'}, 0x0, 0x18}}]}, 0x1a0}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000003c0)=ANY=[@ANYBLOB="400000001200050104000000ffdbdf2528001a"], 0x40}, 0x13, 0x0, 0x0, 0x880}, 0x20040840)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000040)=@newsa={0x138, 0x1a, 0x1, 0xfffffffd, 0x0, {{@in=@dev={0xac, 0x14, 0x14, 0x1a}, @in=@local, 0xfffc}, {@in=@rand_addr=0x64010100, 0x0, 0x33}, @in=@dev={0xac, 0x14, 0x14, 0x1}, {0x3}, {0x0, 0x7}, {0x0, 0x6, 0x2}, 0x0, 0x0, 0xa, 0x2, 0x0, 0x60}, [@algo_auth={0x48, 0x1, {{'sha1\x00'}}}]}, 0x138}}, 0x8000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000001f40)={&(0x7f00000005c0)=@updpolicy={0xfc, 0x19, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@mcast2, 0x0, 0x8, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xa9, 0x0, 0x1, 0x0, 0xffffffffffffffff}, {0x0, 0xa00, 0x40800000000000, 0x800000000000002}}, [@tmpl={0x44, 0x5, [{{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x0, 0x3c}, 0x0, @in=@loopback, 0x2, 0x4}]}]}, 0xfc}}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f00000004c0)=@migrate={0xbc, 0x21, 0x1, 0x4, 0x0, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in6=@mcast2, 0x0, 0x0, 0x0, 0x2, 0xa, 0x0, 0xa0, 0x2e}}, [@migrate={0x50, 0x11, [{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@loopback, @in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in=@multicast1, 0x3c, 0x4, 0x0, 0x0, 0xa, 0xa}]}, @encap={0x1c, 0x4, {0xffffffffffffffff, 0x4e24, 0x4e24, @in=@private=0xa010102}}]}, 0xbc}, 0x1, 0x0, 0x0, 0x40000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000280)={&(0x7f00000002c0)=ANY=[@ANYBLOB="000100001600010025bd7000fedbdf25fc0100000000000000000000000000000a0101010000000000000000000000004e2100004e230008020000802e000000", @ANYRES32=0x0, @ANYRES32=0xee00, @ANYBLOB="00000000000000000000000000000001000004d3330000000000000000000000000002000000000109000000000000000900000000530000080000000063f8000900000000000000fbfffffffffffffffeffffffffffffff090000000000000003000000000000000800000000000000000400000000000033f7d66b00000000000000000000000001010000e47f00000100000028bd70000235000002000331b400000000000000040000000900000005001b"], 0x100}, 0x1, 0x0, 0x0, 0x24048885}, 0x880)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000280)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000f00)=ANY=[@ANYBLOB="040100001600010026bd7000ffdbdf25ac1e0101000000000000000000000000ac1414440000000000000000000000004e2000004e2300080a0000a02f000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="ac1414bb000000000000000000000000000004d233000000ac1414bb000000000000000000000000f8ffffffffffffff0300000000000000ec030000000000000200000000000000070000000000000006000000000000000e00000000000000080000000000000074e30000000000000000000000000000060000000000000077f30000000000007f00000000000000020000002cbd7000043500000200f1020000000000000000000000001d1b00000c000f10"], 0x104}, 0x1, 0x0, 0x0, 0x4004000}, 0x40040c0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000001a00)=ANY=[@ANYBLOB="241200001600010225bd7000fbdbdf25fc020000000000000000000000000001ff0200000000000000000000000000014e2301004e210006020080a02b000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="ff010000000000000000000000000001000004d232000000000000000000000000000000000000010000000000000000010000000100000000000000000000009307000000000000030000000000000002000000000000000b000000000000005e0000000000000001000000000000000200000000000000090000000000000004000000000000000100000005000000fbffffff27bd7000000008000a000101020000000000000006000000010400000c0015005907350003000000050019003300000008000b00ca0300000c000f000400000000000000b4001400686d6163287368613235362900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400300008001000003188435ce2e6800567394af4546caa65c79349d62ca77f5841db5a435d52067d0fbd7697cfd73d4e5e5194ec211ac42d9b4b7619739b7a54dcec2ed420b12cf9bf13ec75df8e231050d58e2e4af70318fe85186e85a810ddf0fb5a59a1f77f06671cd24367a163908001f"], 0x1224}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000340)=@acquire={0x134, 0x17, 0x1, 0x70bd29, 0x25dfdbfc, {{@in6=@empty, 0x4d3, 0x3c}, @in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, {@in=@empty, @in=@multicast1, 0x4e21, 0x3, 0x4e20, 0x6, 0x2, 0x20, 0x0, 0x87, 0x0, 0xffffffffffffffff}, {{@in=@multicast1, @in6=@private2, 0x4e22, 0x8000, 0x4e23, 0xfff9, 0x2, 0x20, 0x20, 0x3b}, {0x4, 0x3ff, 0x6, 0x401, 0xfffffffffffffffd, 0x8, 0x4, 0x2}, {0x8, 0x5, 0xa, 0xf}, 0xf, 0x0, 0x1, 0x0, 0x2, 0x1}, 0x10001, 0x1c, 0x3ff, 0x70bd2c}, [@sec_ctx={0xc, 0x8, {0x8, 0x8, 0x1, 0x3}}]}, 0x134}, 0x1, 0x0, 0x0, 0x14}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000000)={&(0x7f00000004c0)=ANY=[@ANYBLOB="6c0100001000130726bd70000000000000000000000000000000ffffe0000002ac1414130000000000000000000000004e22000100000003020000003a000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="fe8000000000000000000000000000bb0000000032000000fe8000000000000000000000000000aa0000000000000000000000000000000008000000000000000a000000000000000600000000000000000000000000000000000000000000000300000000000000000000000000000000000000000000000200000000000000f8ffffffffffffff0c000000000000000200000029bd70000000000002000100280000000000000068001200726663343534332867636d2861657329290000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e000000080000000316f74eeac053deb73fc018493cc121927a9bca207141b9a451c00aa0800160004"], 0x16c}, 0x1, 0x0, 0x0, 0xc0}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000e40)=@updsa={0x13c, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in6=@loopback, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x20, 0x80}, {@in=@multicast1, 0x4d2, 0x32}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x3f, 0x3}, {0x5, 0x0, 0x2, 0x7}, {0x4, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x0, 0x1, 0x21}, [@algo_aead={0x4c, 0x12, {{'morus1280-avx2\x00'}, 0x0, 0xa0}}]}, 0x13c}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000500)={&(0x7f0000000380)=ANY=[@ANYBLOB="54010000100033060000000000000000ffffffff000000000000000000000000e000000200"/64, @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="040000000000000000000000000000000000000032000000ac1e000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000a002300000000000000000048000200656362286369706865725f6e756c6c2900000000000000000000000000000000000000000000000000000000000000000000000000000000000000001c00040007"], 0x154}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000ac0)=ANY=[@ANYBLOB="8c030000170001"], 0x38c}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000080)=@newpolicy={0xb8, 0x13, 0x601, 0x70bd27, 0x25dfdbfd, {{@in=@multicast1, @in6=@dev={0xfe, 0x80, '\x00', 0x17}, 0x4e23, 0x4, 0x4e22, 0x7, 0xa, 0x0, 0xa0, 0xff}, {0x1, 0x2, 0x0, 0x6, 0xc4, 0x3, 0x4, 0x708}, {0xa54, 0x80000001, 0x6, 0x10001}, 0x4, 0x6e6bb6, 0x2, 0x0, 0x2, 0x3}}, 0xb8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000880)=@updsa={0x22c, 0x1a, 0x1, 0x70bd2d, 0x25dfdbfd, {{@in=@loopback, @in6=@loopback, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x20, 0x80}, {@in=@initdev={0xac, 0x1e, 0x0, 0x0}, 0x4d2, 0x32}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x800000000000003f, 0x3}, {0x4, 0x0, 0x2, 0x7}, {0x4, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x4, 0x1, 0x21}, [@algo_auth={0x10e, 0x1, {{'sha256-mb\x00'}, 0x630, "f40849aba435a8fe6e27cfd6f7fe195095ab989142e97531597c50304e581cb8cef131a173ad036d8a4582459b551f6c6feb3c45f55b82258128a51d68130f1a924303d0e8da533f8f542f3c5f4eba6d2561fdd5d0effe589e8813b80b0a6b94481a59a3eb1892696652c02145efa77ffccc1babc3fec114e0b9c912a816ba90b15199b1d85424a91ad1e5902deca69094280cd1398f61c13c27f13050193b176be94afd70bc44c8dc36dd23601a9de5f6b8321c829d80a4588e158d36ba971aabac9a609890"}}, @replay_esn_val={0x1c, 0x17, {0x12, 0x70bd26, 0x70bd2d, 0x70bd27, 0x70bd29, 0x42}}, @extra_flags={0x8, 0x18, 0x100}, @extra_flags={0x8, 0x18, 0x2}]}, 0x22c}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000500)={0x0, 0x0, &(0x7f00000004c0)={&(0x7f0000000580)=@newsa={0xf0, 0x10, 0x1, 0x70bd27, 0x25dfdbfc, {{@in6=@local, @in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4e21, 0x81, 0x4e23, 0x0, 0x2, 0x0, 0x20, 0x1}, {@in=@loopback, 0x4d4, 0x2b}, @in=@rand_addr=0x64010101, {0x4, 0x4, 0x30, 0x84, 0x7, 0x2, 0xd4, 0xfffffffffffffffd}, {0x0, 0xfffffffffffffffb, 0x2, 0x13c}, {0x7, 0x100, 0x3}, 0x70bd2c, 0x3506, 0xa, 0x3, 0x81, 0x2}}, 0xf0}, 0x1, 0x0, 0x0, 0x4010080}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000640)=@newspdinfo={0x1c, 0x24, 0x315, 0x70bd27, 0x25dfdbfb, 0xd9a, [@XFRMA_SPD_IPV4_HTHRESH={0x6, 0x3, {0x1}}]}, 0x1c}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000001c0)=ANY=[@ANYBLOB="fc0000001900010026bd7000fbdbdf2520010000000000000000000000000002fc0100000000000000000000000000004e1e00034e2103ff0200002001000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ffffff7f000000000100000000000000018000000000000003000000000000000a00000000000000060000000000000002000000000000000100000000000000ffffff7f000000000c0000000000000007000000000000000e00000000000000030000000000000000010000000000004400050000000000000000000000ffff64010101000004d50000000000000000ffffffff000000000000000000000000053500a761"], 0xfc}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000000480)=@delsa={0x34, 0x12, 0x1, 0x0, 0x0, {@in=@multicast2, 0x4d5, 0x0, 0x6c}, [@mark={0xc}]}, 0x47}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x8000}, 0x4)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000200)=@newpolicy={0xc4, 0x13, 0x1, 0x70bd28, 0x25dfdbfd, {{@in6=@ipv4={'\x00', '\xff\xff', @local}, @in6=@dev={0xfe, 0x80, '\x00', 0x29}, 0x4e24, 0xa3, 0x4e21, 0x7ff, 0xa, 0x80, 0x0, 0x2c}, {0x2, 0x0, 0x1, 0x2, 0xea1, 0x7, 0x9, 0x1}, {0x272dcfdb, 0x7, 0x5, 0xffff}, 0x1, 0x0, 0x0, 0x0, 0x1}, [@mark={0xc, 0x15, {0x35075b, 0x5}}]}, 0xc4}, 0x1, 0x0, 0x0, 0x8000}, 0x4)

      
      sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@migrate={0x50, 0x21, 0x4, 0x70bd2d, 0x25dfdbff, {{@in=@empty, @in6=@loopback, 0x4e22, 0x3ff, 0x4e21, 0x3a6, 0xa, 0x20, 0xc0, 0x32}, 0x6e6bbe, 0x1}}, 0x50}, 0x1, 0x0, 0x0, 0xc080}, 0x4000031)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000003c0)=ANY=[@ANYBLOB="400000001200050104000000ffdbdf2528001a00ac1414aa0000000000000000000000000a0101000008000000000000000000000200061005"], 0x40}, 0x13, 0x0, 0x0, 0x880}, 0x20040840)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000280)={&(0x7f00000002c0)=@allocspi={0x100, 0x16, 0x1, 0x70bd25, 0x25dfdbfe, {{{@in6=@private1, @in=@private=0xa010101, 0x4e21, 0x0, 0x4e23, 0x8, 0x2, 0x0, 0x80, 0x2e, 0x0, 0xee00}, {@in6=@loopback, 0x4d3, 0x33}, @in6=@loopback, {0x9, 0x9, 0x8, 0x9, 0xfffffffffffffffb, 0xfffffffffffffffe, 0x9, 0x3}, {0x8, 0x400, 0x6bd6f733}, {0x101, 0x7fe4, 0x1}, 0x70bd28, 0x3502, 0x2, 0x3, 0x31, 0xb4}, 0x4, 0x9}, [@XFRMA_SA_DIR={0x5, 0x21, 0x1}]}, 0x100}, 0x1, 0x0, 0x0, 0x24048885}, 0x880)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f000014f000)={0x0, 0x0, &(0x7f00000bfff0)={&(0x7f0000006440)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x0, {{@in=@multicast1, @in=@multicast1, 0x0, 0x0, 0x0, 0x0, 0xa, 0x760}, {0x0, 0x0, 0x100, 0x0, 0x0, 0x0, 0x7, 0x4}, {0x0, 0x0, 0x0, 0x8}, 0x0, 0x6e6bb0, 0x0, 0x1}}, 0xb8}, 0x1, 0x0, 0x0, 0x40050}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=ANY=[@ANYBLOB="1c0100001600010225bd7000fddbdf257f000001000000000000000000000000fc0200000000000000000000000000004e2301024e230006020000a06c000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="0a010101000000000000000000000000000004d26c0000000a010101000000000000000000000000fefffffffffffffffdffffff00000000100000000000000095070000000000000200000000c000000000000000000000fd7f00000000000062000000000000000700000066129d4bb564b2050000000000ac231400000000ffffffffffffffffff01000005000000fbffffff27bd700000000000020001016f000000000000000600000001040024"], 0x11c}, 0x1, 0x0, 0x0, 0x20008804}, 0x24040014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000100)=ANY=[@ANYBLOB="6c0100001000130700000000fcdbdf25e0000001000000000000000000000000ff02000000000000000000000000000100040b6e4e2100020000008021000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="7f000001000000000000000000000000000004d432000000000000000000000000000000000000000000000000000000040000000000000001000000000000000100000000000080fffffffffeffffff1000000000000000018000000000000043050000000000000400000000000000ffffffffffffff7f0b00000000000000fdffffffffffffff090000000e000000000000002cbd7000003500000a000000980000000000000060001200726663343130362867636d2861657329290000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000a00000004000000066d38547aa140d000000000000000086cae9fb4f1c0017"], 0x16c}, 0x1, 0x0, 0x0, 0x880}, 0x2094)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000040)=@migrate={0xbc, 0x21, 0x1, 0x70bd2c, 0x25dfdbfd, {{@in=@multicast2, @in=@dev={0xac, 0x14, 0x14, 0x1f}, 0x4e24, 0x4, 0x4e23, 0xd, 0x2}, 0x6e6bb2, 0x1}, [@migrate={0x50, 0x11, [{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in=@rand_addr=0x64010100, @in=@private=0xa010100, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x32, 0x0, 0x0, 0x3501, 0x4d21fe76179a5365, 0x2}]}, @encap={0x1c, 0x4, {0xffffffffffffffff, 0x4e22, 0x4e23, @in=@multicast1}}]}, 0xbc}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000680)=@updsa={0x148, 0x1a, 0x1, 0x70bd26, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x2, 0x5246, 0x8, 0xb, 0x80, 0x3, 0x6, 0xff}, {0x9, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0xa, 0x0, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @replay_val={0x10, 0xa, {0x70bd2b, 0x70bd2a, 0xfffffffe}}]}, 0x148}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000680)=@updsa={0x138, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x3, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x4, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x0, {{@in=@multicast1=0xe0000002, @in, 0x0, 0x0, 0x0, 0x0, 0xa, 0x0, 0x0, 0x87}, {0x0, 0x0, 0x6, 0x5, 0x0, 0x0, 0x0, 0xfffffffffffffffb}}}, 0xb8}}, 0x0)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000003c0)=ANY=[@ANYBLOB="400000001200050104000000ffdbdf2528001a00ac1414aa0000000000000000000000000a01010000080000000000000000000002000610050019"], 0x40}, 0x13, 0x0, 0x0, 0x880}, 0x20040840)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000880)=@updsa={0x148, 0x1a, 0x1, 0x70bd2d, 0x25dfdbfd, {{@in=@remote, @in6=@empty, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x20, 0x80}, {@in=@initdev={0xac, 0x1e, 0x0, 0x0}, 0x4d2, 0x32}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, {0x8003, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x800000000000003f, 0x3}, {0x4, 0x0, 0x2, 0x7}, {0x4, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x4, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-mb\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x1}, @extra_flags={0x8, 0x18, 0x2}]}, 0x148}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000580)=@updsa={0x180, 0x1a, 0x1, 0x70bd27, 0x25dfdbfc, {{@in=@loopback, @in6=@mcast2, 0x4e1e, 0x0, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in=@multicast1, 0x4d4, 0x32}, @in=@remote, {0x8000, 0x6, 0x9, 0x10000002, 0x5, 0xa3a7, 0x3f, 0x10}, {0x5, 0x3, 0x2, 0x7}, {0x1, 0xb3e1, 0x10000}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}]}, 0x180}, 0x1, 0x0, 0x0, 0x4040000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001240)={0x0, 0x0, &(0x7f0000001200)={&(0x7f0000000300)=ANY=[@ANYBLOB="7c02000013002904000000000000220000000000000000000000000000000000ac1414aa00000000000000000000000000000000000000000a00000000000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000c40105"], 0x27c}, 0x1, 0x0, 0x0, 0x804}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000280)={&(0x7f0000000600)=@allocspi={0x100, 0x16, 0x1, 0x70bd25, 0x25dfdbfe, {{{@in6=@private1, @in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4e22, 0x0, 0x4e23, 0x8, 0xa, 0x20, 0x80, 0x0, 0x0, 0xee00}, {@in6=@private0, 0x4d6, 0x33}, @in=@rand_addr=0x64010100, {0x9, 0x9, 0x8, 0x8, 0xfffffffffffffffb, 0x1741, 0x8, 0x4}, {0x5, 0x2, 0x6bd6f733}, {0x7, 0x7fe4, 0x1}, 0x70bd28, 0x3502, 0xa, 0x3, 0x31, 0x10}, 0x1, 0x9}, [@XFRMA_SA_PCPU={0x8, 0x23, 0x40000000}]}, 0x100}, 0x1, 0x0, 0x0, 0x20048884}, 0x20044890)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@updpolicy={0xc4, 0x19, 0x1, 0x0, 0x0, {{@in=@multicast1=0xe0000002, @in, 0x0, 0x0, 0x0, 0x0, 0xa, 0x0, 0x0, 0x87}, {0x0, 0x0, 0x6, 0x5, 0x0, 0x0, 0x0, 0xfffffffffffffffb}}, [@sec_ctx={0xc, 0x8, {0x8}}]}, 0xc4}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000840)=@updsa={0x140, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0x4e1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x20, 0x33}, {@in=@multicast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x5, 0x6, 0x2, 0x80009, 0xa3a7, 0x1}, {0x5, 0x3, 0x2, 0xa}, {0x1, 0x0, 0x10000}, 0x70bd3a, 0x0, 0xa, 0x1, 0x1, 0x2}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_IPTFS_INIT_DELAY={0x8, 0x27, 0x7f}]}, 0x140}, 0x1, 0x0, 0x0, 0x4040041}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000001f40)={&(0x7f00000004c0)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x0, {{@in=@multicast2, @in=@local, 0x0, 0x5, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xa9, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xfffffffffffffbff, 0x40800000000000, 0x800000000000000}}}, 0xb8}}, 0x20000000)
sendmsg$nl_xfrm(r0, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f00000005c0)=@migrate={0xa0, 0x21, 0x1, 0x70bd27, 0xfffffffe, {{@in6=@private2, @in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0xdffc, 0x0, 0x0, 0x2, 0xa, 0xe0, 0x0, 0x1}, 0x2}, [@migrate={0x50, 0x11, [{@in6=@dev={0xfe, 0x80, '\x00', 0x11}, @in=@local, @in=@local, @in6=@remote, 0x3c, 0x0, 0x0, 0x0, 0xa, 0xa}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x800}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f00000005c0)=@migrate={0xa0, 0x21, 0x1, 0x0, 0xfffffffe, {{@in6=@private2, @in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0xfffc, 0x0, 0x2, 0x0, 0xa, 0xe0, 0x80}, 0x2}, [@migrate={0x50, 0x11, [{@in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, @in6=@private0, @in6=@remote, @in=@broadcast, 0x6c, 0x1, 0x0, 0x0, 0x2, 0xa}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x800}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000001600)=ANY=[@ANYBLOB="081100001900010029bd7000fcdbc6b4c353e0a77103000000000000000000007f0000010000000000000000000000004e2400004e210000020000002c0000001f18c1ee8e38c039decd62c3b109341d3704b3c57d00b6eb55b874b56453d6", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="0400000000000000ffffff7f000000000400000000000000a907000000000000750000000000000035ffffffffffffff0009000000000000000000000000000008000000000000001753"], 0x1108}, 0x1, 0x0, 0x0, 0x926d6fb68c90438b}, 0x10008000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000680)=@updsa={0x138, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x5, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0xff}, {0x9, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x9, 0x7fff}, 0x70bd27, 0x0, 0xa, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'lzjh\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001280)={0x0, 0x0, &(0x7f0000001240)={&(0x7f00000004c0)=@updsa={0x13c, 0x1a, 0x1, 0x70bd2d, 0x25dfdbfe, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@remote, 0x4e22, 0x0, 0x4e22, 0xfffa, 0xa, 0x0, 0x80, 0x33}, {@in=@multicast1, 0x4d6, 0x3c}, @in=@empty, {0xfff, 0x2, 0x8000000000000000, 0x9, 0x8, 0x2, 0x5, 0xfffffffffffffffd}, {0x7, 0x1, 0x800, 0x8}, {0x0, 0xd1}, 0x70bd2a, 0x3503, 0x2, 0x2, 0x5, 0x21}, [@algo_aead={0x4c, 0x12, {{'morus640-generic\x00'}, 0x0, 0xe0}}]}, 0x13c}, 0x1, 0x0, 0x0, 0x4001}, 0x8044)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001280)={0x0, 0x0, &(0x7f0000001240)={&(0x7f00000004c0)=@updsa={0xfc, 0x1a, 0x1, 0x70bd2d, 0x25dfdbfe, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@remote, 0x4e22, 0x0, 0x4e22, 0xfffa, 0xa, 0x0, 0x80, 0x33}, {@in=@multicast1, 0x4d6, 0x3c}, @in=@empty, {0xfff, 0x2, 0x8000000000000000, 0x9, 0x8, 0x2, 0x5, 0xfffffffffffffffd}, {0x7, 0x1, 0x800, 0x8}, {0x0, 0xd1}, 0x70bd2a, 0x3503, 0x2, 0x2, 0x5, 0x21}, [@sec_ctx={0xc, 0x8, {0x8, 0x8, 0x1, 0x1}}]}, 0xfc}, 0x1, 0x0, 0x0, 0x4001}, 0x8044)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000280)=ANY=[@ANYBLOB="c0000000190001000000000010000000e0000002000000000000000000000000ac1414bb00000000000000000000000000000000000000000a00008006000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="000000000000000002ee0000000000001910be1a000000000000000000000000000000000000000008000000000000000000000000000000fdfffffffffeffff000000000000000000000000000000000000000000000000000000000000000000000000b86b6e"], 0xc0}, 0x1, 0x0, 0x0, 0x4008011}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000200)={&(0x7f0000000c40)=ANY=[@ANYBLOB="d40000001b001d0328bd7000fcdbdf25ffffffff000000000000000000000000fe8000000000000000000000000000264e2000014e2404000200000087000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="08000000000000000100010000000000f4ffffffffffffff04000000000000000200000000000000ba410000000091ad07000000000000000000008000000000ffffff7f0000000003000000000000000400000000000000080000000000000003000000b86b6e000000030100000000040000000000000008001f0001"], 0xd4}, 0x1, 0x0, 0x0, 0x20044001}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000003c0)={0x0, 0x0, &(0x7f0000000380)={&(0x7f00000004c0)=ANY=[@ANYBLOB="f00000001c0007000000000000000000ff020000000000000000000000000001e0000002000000", @ANYRES32=0xee00], 0xf0}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000040)=@acquire={0x134, 0x17, 0x1, 0x70bd29, 0x25dfdbfc, {{@in6=@empty, 0x4d3, 0x3c}, @in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, {@in=@empty, @in=@multicast1, 0x4e21, 0x3, 0x4e20, 0x6, 0x2, 0x20, 0x0, 0x87, 0x0, 0xffffffffffffffff}, {{@in=@multicast1, @in=@local, 0x4e22, 0x8000, 0x4e23, 0xfff9, 0x2, 0x20, 0x20, 0x3b}, {0x4, 0x3ff, 0x6, 0x401, 0xfffffffffffffffd, 0x8, 0x4, 0x2}, {0x8, 0x5, 0xa, 0xf}, 0xf, 0x0, 0x1, 0x0, 0x2, 0x1}, 0x10001, 0x1c, 0x3ff, 0x70bd2c}, [@policy_type={0xa, 0x10, {0x3d409ee62271c5dd}}]}, 0x134}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000640)={0x0, 0x0, &(0x7f0000000500)={&(0x7f0000000680)=@allocspi={0xf8, 0x16, 0x411, 0x0, 0x0, {{{@in6=@private2, @in=@private, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff}, {@in6=@private1={0xfc, 0x1, '\x00', 0xff}, 0x4d6, 0xff}, @in=@empty, {0x7}, {}, {}, 0x8000000, 0x0, 0x2}, 0x0, 0xfdffff00}}, 0xf8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000500)={&(0x7f0000000000)=@newsa={0x144, 0x10, 0x633, 0x0, 0x0, {{@in=@initdev={0xac, 0x1e, 0x0, 0x0}, @in=@broadcast, 0x4e20}, {@in=@private, 0x100, 0x32}, @in6=@rand_addr=' \x01\x00', {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, {}, {}, 0x0, 0x0, 0x2}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}, @encap={0x1c, 0x8, {0x0, 0x0, 0x0, @in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02'}}]}, 0x154}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000680)=@newsa={0x16c, 0x10, 0x713, 0x2, 0x25dfdbfc, {{@in=@multicast1, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, 0x4e20, 0xef65, 0x4e1d, 0x602, 0xa, 0x20, 0x0, 0x6c, 0x0, 0xee00}, {@in=@rand_addr=0x64010101, 0xfc, 0x32}, @in6=@loopback, {0x0, 0x0, 0xc, 0x8, 0x0, 0x0, 0x5, 0x743}, {0x4, 0x7fffffffffffffff, 0x0, 0x4000000000000000}, {0x2}, 0x70bd29, 0x3500, 0xa, 0x4}, [@algo_aead={0x60, 0x12, {{'rfc4106(gcm(aes))\x00'}, 0xa0, 0x60, "217d66d38547aa140db8a200000000c538c7cb7a"}}, @encap={0x1c, 0x4, {0xfffffffffffffffc, 0x4e20, 0x4e24, @in=@dev={0xac, 0x14, 0x14, 0x30}}}]}, 0x16c}, 0x1, 0x0, 0x0, 0x20000890}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f00000002c0)=@flushpolicy={0x1c, 0x1d, 0x1, 0x70bd2a, 0x25dfdbfd, "", [@policy_type={0xa, 0x10, {0x1}}]}, 0x1c}, 0x1, 0x0, 0x0, 0x4000020}, 0x4000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000a80)={0x0, 0x0, &(0x7f0000000a40)={&(0x7f0000000000)=@flushsa={0x14, 0x1c, 0x1, 0x70bd2c, 0x25dfdbfb, {0xff}}, 0x14}, 0x1, 0x0, 0x0, 0x81}, 0x8800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=ANY=[@ANYBLOB="1c0100001600010225bd7000fddbdf257f000001000000000000000000000000fc0200000000000000000000000000004e2301024e230006020000a06c000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="0a010101000000000000000000000000000004d26c"], 0x11c}, 0x1, 0x0, 0x0, 0x20008804}, 0x24040014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000440)={0x0, 0x0, &(0x7f0000000400)={&(0x7f0000000100)=@updsa={0xf8, 0x1a, 0x1, 0x70bd2d, 0x25dfdbff, {{@in=@private=0xa010101, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x4e24, 0x5, 0x4e21, 0x0, 0xa, 0x80, 0x20, 0x87}, {@in6=@remote, 0x4d6, 0x3c}, @in6=@private1, {0x80000000, 0x3, 0x8, 0x3, 0x6, 0xfffffffffffffffd, 0x8, 0x870b}, {0x100, 0x8, 0x3, 0xac5}, {0x9, 0x2, 0xc}, 0x70bd25, 0x3503, 0x2, 0x2, 0x9, 0x24}, [@tfcpad={0x8}]}, 0xf8}}, 0x800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x5, &(0x7f0000000140)={&(0x7f0000000380)=@migrate={0xec, 0x21, 0x1, 0x70bd25, 0x25dfdbfd, {{@in=@multicast2, @in=@dev={0xac, 0x14, 0x14, 0x1e}, 0x4e24, 0x4, 0x4e23, 0xd, 0x2, 0x0, 0xc0, 0xc}, 0x6e6bb2, 0x1}, [@migrate={0x9c}]}, 0xec}, 0x1, 0x0, 0x0, 0x81}, 0x20004000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f00000002c0)=@updsa={0x15c, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0xce1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x80, 0x21}, {@in6=@mcast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x6, 0xac, 0x2, 0x4, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x0, 0xfffffffd, 0x10000}, 0x70b92a, 0x34ff, 0xa, 0x1, 0x0, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x1}, @replay_esn_val={0x1c, 0x17, {0x0, 0x70bd2b, 0x70bd2a, 0x70bd2b, 0x70bd2a, 0xfffffff9}}]}, 0x15c}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000540)={0x0, 0x0, &(0x7f0000000500)={&(0x7f0000000d80)=@newsa={0x140, 0x10, 0x1, 0x0, 0x25dfdbfd, {{@in6=@private2, @in6=@empty, 0x4000, 0x0, 0x3, 0x3}, {@in=@broadcast, 0x0, 0x33}, @in6=@mcast2, {0x0, 0x0, 0x0, 0x401, 0x0, 0x0, 0x4}, {}, {}, 0x0, 0x0, 0xa, 0x1}, [@extra_flags={0x8, 0x18, 0xf63}, @algo_auth={0x48, 0x1, {{'sha256\x00'}}}]}, 0x140}}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000004c0)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000040)=ANY=[@ANYBLOB="50000000270001"], 0x50}}, 0x1000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=ANY=[@ANYBLOB="1c0100001600010225bd7000fddbdf257f000001000000000000000000000000fc0200000000000000000000000000004e2301024e230006020000a06c000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="0a010101000000000000000000000000000004d26c0000000a010101000000000000000000000000fefffffffffffffffdffffff00000000100000000000000095070000000000000200000000c000000000000000000000fd7f00000000000062000000000000000700000066129d4bb564b2050000000000ac231400000000ffffffffffffffffff01000005000000fbffffff27bd700000000000020001016f0000000000000006"], 0x11c}, 0x1, 0x0, 0x0, 0x20008804}, 0x24040014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000500)={&(0x7f0000000000)=@newsa={0x138, 0x10, 0x633, 0x0, 0x0, {{@in=@initdev={0xac, 0x1e, 0x0, 0x0}, @in=@broadcast}, {@in=@private, 0x0, 0x32}, @in6=@rand_addr=' \x01\x00', {}, {}, {}, 0x0, 0x0, 0x2}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}]}, 0x138}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000800)=@updsa={0x148, 0x1a, 0x1, 0x70bd65, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x0, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d5, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_IPTFS_PKT_SIZE={0x8}, @XFRMA_SA_DIR={0x5, 0x21, 0x1}]}, 0x148}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000800)=@updsa={0x148, 0x1a, 0x1, 0x70bd65, 0x25dfdbfe, {{@in6=@loopback, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x0, 0x80, 0x6c}, {@in=@initdev={0xac, 0x1e, 0x0, 0x0}, 0x4d5, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_IPTFS_INIT_DELAY={0x8, 0x27, 0x2}, @XFRMA_SA_DIR={0x5, 0x21, 0x1}]}, 0x148}, 0x1, 0x0, 0x0, 0x44805}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000080)=ANY=[@ANYBLOB="f80000001600010225bd7000fbdbdf25fe8000000000000000000000000000aaac1414bb0000000000000000000000004e2301004e210006020080a06c000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff010000000000000000000000000001000004d232000000000000000000000000000000000000010000000000009e577355000001000000010000001000000000000000950700000000000003000000000000000600000000000000ff7f0000000000005e000000000000000700000000000000070000000000000000ac23140000000003000000000000000100000005000000fbffffff27bd700000000000020001010200000000000000ffffffff"], 0xf8}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000640)=ANY=[@ANYBLOB="440000002400150327bd7000fbdbdf259a0d00000600040000000000060003000100000006000300000000000600030000000000060003000041"], 0x44}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000980)=@updsa={0x144, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in6=@mcast1, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x0, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d5, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x7, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0xa, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x1}, @XFRMA_IPTFS_DONT_FRAG={0x4}]}, 0x144}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000004c0)=ANY=[@ANYBLOB="c40000001900010026bd7000fbdbdf2520010000000000000000000000000002fc0100000000000000000000000000004e1e00034e2103ff0200002001000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ffffff7f000000000100010000000000018000000000000003000000000000000a0000000000000006000000000000000200000000000000d50000000000000080ffff7f000000000c0000000000000007000000000000000e00000000000000030000000000000000010000000000000c000800f5"], 0xc4}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000340)={&(0x7f0000000540)=@newsa={0x158, 0x10, 0x713, 0x0, 0x0, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in6=@mcast1, 0x0, 0x0, 0x0, 0x0, 0x2}, {@in=@multicast2, 0x0, 0x33}, @in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', {}, {0x0, 0x0, 0x0, 0x4}, {}, 0x0, 0x3502, 0x2}, [@encap={0x1c, 0x4, {0x3, 0x4e24, 0x4e22, @in6=@private0={0xfc, 0x0, '\x00', 0x1}}}, @algo_auth_trunc={0x4c, 0x14, {{'sha1\x00'}}}]}, 0x158}}, 0x40)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000000040)=ANY=[@ANYBLOB="1c0000001d00010003000000000000000a00100001"], 0x1c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000700)=@delsa={0x3c, 0x11, 0x1, 0x0, 0x0, {@in6=@loopback, 0x0, 0x2}, [@srcaddr={0x14, 0xd, @in=@local}]}, 0x3c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000200)=@delsa={0xec4, 0x11, 0x400, 0x70bd2b, 0x25dfdbfc, {@in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0x4d6, 0xa, 0xff}, [@srcaddr={0x14, 0xd, @in6=@local}, @replay_thresh={0x8, 0xb, 0xc}, @replay_val={0x10, 0xa, {0x70bd27, 0x70bd2d}}, @algo_aead={0xe6d, 0x12, {{'gcm_base(ctr(aes-aesni),ghash-generic)\x00'}, 0x7108, 0x0, "02f805c5cf1636e294dee08234d4f160f95f8a3b37f95f20e3eec2ae76411fc7d44a829914fe42afde82e8716247fe16b3d4a50b6fa2c9b82d63eabdf6a996152695f12b8a1efc28fad9166a8726b527bedf967a2bfc1fbe14dae5b89224514322de7fac9fb6a7371f31343ba05aa32c1c52c3e99696c4ac160643758f3bc2dcdcfaf318f1bc3c9c861ef0f91f9f3057f110c2396cff52adf039bc8be818f57a04880d9b69450eacc7f30a9f480759525548087810dd7a146b0c88ea58456ccab04107c516d7bb42f9d84baea240834740157cd57b1056bd3e90fcf93eb099a01eb127f9517513a13d8a93587eaec05702f6f305307c2150c5c37fefc43a176c1acdeac5bd861055c8fa2fa4f1b9504d2715b202c92ad8543c8cddeaad4b0ec64c70b43064be3a0fe9c91ffcaac31ad983d9c839d08515abc949c11dd661f6b76d630598d8ab87815f155079ff95751c9603d2b1b7208bab65a731681b59668060d0964e9a28e4156957e1384cd64b9f06a60a0b9d6b5cbdca8beee3b4741c1d05ecbf707e35d7777830eb24b168cf62e4ca35d6f636760c02e8decba4b8c610271c877a2457c2e3700a7c4982b2ae18f745b2312f2bf6004d9294bcd2082b538a820f0b628b756f054109046ead6ccebbe75301ee48b60d5ea048191992b900587719d410be1ef8eadae1052e24a7f7c0f72a038337ab6622e8c39837ca012d4775c81da11bb90f397c72d627e028b238757a38e576af5a5d90a73678ec512bdf3bb0fb041fe3ab4251a6ef3a73dfa6fd1e574784d29cb3e9d6d32961bf3326743cc54f54ea404cdc59e5a76cc2df631710c988d4da2ffd4274f22aa0fca577159bc8a1c7ed031f894629663c0cba60dfeb0bce47fa49c7032c0557796b9d20ea88401f0ae01b3a40b3fc512ab38bcff1abf72e1348ee05db62ede1cd477df5f7d5f6c0f4f25f71e5195366f25adf85f0bd97b3774416d6be55c20f6ed4f5d0f677f151e8d763153182cd3416753447a54e6ca5a227388b7649b7983f308a49c8aeff44be2ae43c2c4f9a68b849eaaa101bb086d4636c0f32a666c91d239fac80cd93934892e4735a2c880b9f911d56764feed8fa1c30251bc267ca1b94aad9
f43f3b910fa1b8a933a83e4977fc3391405c102b0c5bdc213c3f4ae4a0d413fd572418a8bd9874da3e7d2f51d72216351c6742f5cc763a06105c48a9e1842ef93001d5870ee52ff7f77219e4e3f14acd8cc89d92cca0413fac6731d6b14bd307d0931c708c820a32912328b01b944520fab2c111a41f8be48302d0d3b2216bf932f743d0d99ac94d1b2bdbed77426413f9c987aee07f9dc51817826ec059da3f55860c31db5c5a7f2d9f8e0ee5ba74fb7701828ebc44952da11f6582a932933a3464bfeada799eb0bc626d68000f936e13fe84f67b3436e7250b1b3e207f5a11c79895d4de9647dc19aeefe97bcb3c96f132fec2bf3f923105eee85b14cbe4a8d8855b5064708c356724c6d578886265c0474129d9c6ae164a7bc158dc30fc8b085a3021ea2a53c7fc1d44790948cbd4f482279be069fe96c30215671cd84f5eabe484e4118585748282326f8a0a37666fab3abbfdbe408d879fa992d0351c2a083dc3a2b1af4b849052576b4a91f2d02527dba1432936f309654c8a6f4ebbba5dbf0216fce56c683d68a93d9592848209fc7f18aa08980cc20e567ce91cba79afc16b4f9ce49f270b427dbe7b7ab9b92b6155532c24235eac33f6087a82db5e84b7208c6ed87ed6e19d718f35340284e52c93d232c87f7a13aa443ab07d058a440abe79e8daa51c0178710d98ff044acf5e4b0d19f46c0532dbbe5dbd709c320de283e7992255c3b3d323fed149138af6452afd779cfb027e1ebd050602d12b14c08cff53bd03db65e4e47aa2c7db0da7b9e00485563a856d62e6619ca8f44ba18716702ec4dd59c34c0db5f8392ec5ad99bd7f0042aba036e94d4e0334e32ba6aa3c495db6677335c6e8208e04c65c861968cffd8ed187c3db37f73f5d8e4af53dd09939b704757138b724ac8683d7219fb5ce8c37eb7cc485128bc2c38bb484df838efdda00907edcf23a6d34e946424d5337efde234b8f3c192f0a4e81894d47d2e1d63656b581d4e3119dbfa2d51146c9c5acfabc02d8b17124b0a12d347c29fc6d6dfc169aa6ee97bb7d1491fe54ccb99ff986a0be3383e73167723f59d1dc902769737499a4c1eaa317326c14349070402b2f50f6e31855cc31648347efb5300577164db765ec87bd159970e52999b708348aa428e6802204f9fcd9a2a53c8dbf9acc3353345d4c622dd83ee57ea04ad6afc0f6e348aba72280dce2074b09ea21f08d7712c5fcbfc5d6669aa473e7d3160f0cccb1eff71ec5508a90cfb7b63b53fa5055cda4075440400b3183cc1dd9b102384ec777869fd3fcfce42fbca684c75412ab6dfff6f59c95f92afb60be9b814c3d67872f276ec36f895ef28406ef05e63b62b49ee6a863c0fd1fa7b128e8150b0d2a601ace5e536058790094135dc5aa1862fa
6b81ef06b634243731fb41a7760508d9a8fc6b81d7378e40c0bf1b558399adab01b04162ac202b6abb2630bc4b8dd120cc9bc96526f290f1cc551cde18a905319454a88b0133e96974828863cef6906061a926373b1a24b5f0e8139fe2d9a6a5a1578ebb03fd47a800f14aa7b407d1662ff2ba09d3ca0b4cc70287e4f90584aab66baa786b9367d84d750661beeb8ff6318202c6c7a891af6500e727d312e73295d372c08c2b98378998bbc0a935d6c3308b69f1e83456d30be38d0776e0911113853c970d2bc292c879268409ffb2e2574c45f8d17925d0a59d07f837b3b09d4e1848fcc736e6e40becd5ed94b8c22690d6dee7c14f4715e481c222cd0f829c98d2f20bdae42f9ba04ae1e9fc969526807d0cb1ca42e9a817b48a3f81769687b69b36d9b5c07ed08e14f60b34a7e43de81adf4cb12daa79b28d0806f3a24253f5b0ad6625c9c8a238951ac8d9d5acdde30578592c2f68659370eff55fe490374ff964a44ccd187881a9870c258398d9288889f52d435719859118b31a28a076271a96d23b40de40dc3842b21349fb67c30b38dcddefdb568e7d7a7e780ca3f4e2b8401760388eba481c081bb2783bcdb3b8d76bbd65ce3e741657568cdd880a207dee97ca271ad47355ba3dc86cc3d9bdbba7bc74ef330d9bb29b1612801277df17111b5e6c1e98b879b317040364a0956cde998758059e9b4640d03ee986672de5161ff2f2d21432be33c45a1bd7fa2338a6ef07bf13eb42241135fba804941d751aa2aaf43d4966b235d67ce9fec5af245ea56214b202d67d19dd46965c8324907da04ad1d55ca19aa213131e025d2154040dbe9fc95cb5955bc5e13bd0175ed20c2e8d51669961d41766fb68be3933ba7a3cdbdb0ed8379dff0ba5d32e998a6aeb351659ad123fb54d6441eb496d12f17fbdc5cc31aa95f9175b5c187b8fbc6b97390fc6584fd817e04874096c059e09f07c3184ed7ea25bbcebf5581edc254273db69050ecf2fbebe0b5e5e43b8fdda86c162d674e2d6fb3ab3434c16794567331350dcc88082a2145902a27743f9e1ddd9c4f01b83989fa1872aec36fa73072d6ac51f8f8f365707e90514acb7f397d7ff5349c6def8c0c6a9a5eb2dea7f5fd0b31ac951db6569a91b35ae8a5be1626eec1bd0c5e3e66eab6c8a2d874cdef72126db49f8beefac6aad9509bcec5902783fef91e6badd64bd702396ba9b30145fdef1ee3b69a7977f575c766251956b84d67177f97e4eeb1399dc91f074d46a9330fd5aab9cec2ea933b38a1116b46ab79d63c1d25f9f48bc4107843e3f1dbf58a4868d6e6940b0c2f03109f65e9534c4c468782cecc0c80d402d0540bbfe02ad3c494afd6f282a68e16e8d324665af38a11ec01e93604872d834fed56bd294adc201401651
1a19ad4efd3eb4da857d4411022c3257fdfd208d65a59659fc3af93e1152aa649fa1ba54ff66baeab24b73ebd0935d04a58eae93c78e1fcc34e12fa1266e1e3934b2fe8e1568c5eaa5271c44a34dd7ed8e007fbb9c47266972f27ecd5839687d5d9a86b070eece1cb9bc90a87fe1e794052f7da8971b71ead16f216b32c2a619781129178bfa90ddd3d603669d04da7d8eb8cd7b00600b80c6af0e2ecbaec0603083c19082da57a099330ff644b05c8219060bf0ca0f0e35a7310a016e381d74fa425ccf600ae2609321cc174450ea24fe9b76a32c01205531a1635215cf118dc0d82f08a9ec3e7376783f89175722350d27350eb59c27bcb151e44ffbccd25e48aeadb3c47e6ce75d5251738f33ebac4a32750a6678ce70461cb654d75b7c0b6ede469596d2aa3562ecb62bc2ffd04cc05b8cafac1d43df2c2627dcf0de628dc363990cf20a471da7baa291779b50f5346e6541da2ce20a52d93a6c216cce5a8fa35a825d138b1f4b50cc43bd74bd011cff21b19ca7e94124e62756c80e701eec0c1983bd44d36482a86f75b4f8a09ed93ebace03b9e50f3da3891d30efe03fe637d302298abbece3359b2aff9092a7cfdcf04bbe09273697ba5aeb403d6e4b2f84b4a1fa87fb67381f71f99355f2fbd8b7b27f651aa61274f200be057fba91bb7df3d21a8e52b5c0b578d4c90f4d6078d4710b9a996113dd0aa83ce3c3f40f63b65ae999c6b3db2668ee4710cd2c72a604efb53703bd89a6b5f7d701fd9c2b5c0c22a6135dd663a40ae01fcc03473dd47b1f739ad68da83710280075c06ef877283dfa3ddf64a88f65c8459d0f72c2d6353cce0bc77619b1557da4d28eec6b0fe2bc62808e06678796eb3d9953c3cc57b030bf27f46c875a448edd9568d95da2d753211756929c8d97d2f7e9261316ac9aa0e012dc100745b14a95e4d332d450d7cccf36c8a38a3759e82d6b07dc6f3f66f3704a4ff32edf6103f464670eef42558c613a8d26e0f22863d9cd28d1747aad2d4bbd6b72cdd7598aa3bc396e6b0c01cf3e79f6f879e877599ec2e0fc484a88f8f332260e1965cfe443838a6f7ee43a65f3e6f239fa47ed00ecec3f460e156fa725f8a7fc5b0d316a5c802e4227b402d44fbf196351ab73cd3b28eb229bc6"}}]}, 0xec4}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000a80)={0x0, 0x0, &(0x7f0000000a40)={&(0x7f00000006c0)=@newsa={0x138, 0x10, 0x1, 0x70bd26, 0x25dfdbfb, {{@in=@dev={0xac, 0x14, 0x14, 0x31}, @in6=@local, 0x4e24, 0x8, 0x4e21, 0xd, 0xa, 0x20, 0x20, 0x87}, {@in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4d5, 0x2b}, @in6=@loopback, {0xffffffff, 0x0, 0xc, 0x2, 0x1, 0x7ff, 0x6, 0x7}, {0x8, 0x8dc000000000000, 0xffffffffffffff45, 0x5}, {0x112, 0x97d, 0x200}, 0x70bd29, 0x3503, 0x2, 0x2, 0x2, 0x45}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x81}, 0x8800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000200)={&(0x7f00000004c0)=@updpolicy={0x23c, 0x19, 0x1, 0x0, 0x1, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, @in=@remote, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x3}, {0x0, 0x0, 0x800}, 0x200, 0x0, 0x0, 0x0, 0x1}, [@tmpl={0x184, 0x5, [{{@in=@local, 0x0, 0x33}, 0x0, @in=@private=0xa010100, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x40}, {{@in6=@remote, 0x0, 0x3c}, 0x0, @in6=@dev}, {{@in=@loopback, 0x0, 0x6c}, 0x0, @in=@broadcast}, {{@in=@dev={0xac, 0x14, 0x14, 0x43}, 0x0, 0x33}, 0x0, @in=@empty}, {{@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, 0x0, @in=@remote, 0x0, 0x0, 0x0, 0x0, 0xc}, {{@in=@remote, 0x0, 0x3c}, 0xa, @in6=@private0, 0x0, 0x5}]}]}, 0x23c}, 0x1, 0x0, 0x0, 0x1}, 0x4000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000a80)={0x0, 0x0, &(0x7f0000000a40)={&(0x7f0000000000)=@flushsa={0x14, 0x1c, 0x1, 0x70bd2c, 0x25dfdbfb, {0x32}}, 0x14}, 0x1, 0x0, 0x0, 0x81}, 0x8800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000540)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000880)=@newsa={0x14c, 0x10, 0x1, 0x8000000, 0x0, {{@in=@private=0xa010101, @in6=@empty, 0x0, 0x0, 0x4e24, 0x3, 0x0, 0x20}, {@in6=@remote, 0x0, 0x33}, @in=@broadcast, {0x4000000000, 0x0, 0x0, 0x0, 0x8000000000000}, {0x0, 0x6}, {0x10, 0xd29}, 0x0, 0x10000, 0xa, 0x1}, [@coaddr={0x14, 0xe, @in=@multicast1}, @algo_auth={0x48, 0x1, {{'sha256\x00'}}}]}, 0x14c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=@newae={0x40, 0x1e, 0x1, 0x70bd2d, 0x25dfdbfc, {{@in=@rand_addr=0x64010100, 0x4d3, 0x8, 0x32}, @in=@broadcast, 0xea7, 0x3502}}, 0x40}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000000040)=@getsadinfo={0x1c, 0x23, 0x1, 0x70bd2a, 0x25dfdbff, 0x0, [@XFRMA_SA_PCPU={0x8, 0x23, 0x3}]}, 0x1c}, 0x1, 0x0, 0x0, 0x20000000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000340)=@migrate={0xa0, 0x21, 0x1, 0x70bd2b, 0x25dfdbfd, {{@in=@empty, @in=@dev={0xac, 0x14, 0x14, 0xb}, 0x4e20, 0x0, 0x4e22, 0x7ff, 0x2, 0x80, 0x80, 0xff}, 0x0, 0x1}, [@migrate={0x50, 0x11, [{@in6=@empty, @in6=@ipv4={'\x00', '\xff\xff', @initdev={0xac, 0x1e, 0x1, 0x0}}, @in6=@dev={0xfe, 0x80, '\x00', 0xd}, @in=@remote, 0x6c, 0x0, 0x0, 0x0, 0xc79031d2a9592e9}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x20000016}, 0x4000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000300)={&(0x7f0000000140)=@allocspi={0xf8, 0x16, 0x1, 0x0, 0x0, {{{@in6=@dev={0xfe, 0x80, '\x00', 0x1e}, @in=@initdev={0xac, 0x1e, 0x0, 0x0}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0xa0}, {@in6=@dev={0xfe, 0x80, '\x00', 0x36}, 0x0, 0x33}, @in6=@dev={0xfe, 0x80, '\x00', 0xe}, {0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff, 0xffffffffffffffed, 0x3}, {0x0, 0x0, 0x2, 0xfffbfffffffffffe}, {0x0, 0x0, 0x796}, 0x0, 0x0, 0x0, 0x0, 0x2, 0xe55286f1921f74be}, 0x0, 0x1a0b1}}, 0xf8}, 0x1, 0x0, 0x0, 0x40040}, 0x0)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000080)=ANY=[@ANYBLOB="20010000120013070000000000000000e0000001000000000000000000000000fc00"/64, @ANYRES32=0x0, @ANYRES32, @ANYBLOB="fc020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000072c42572f64a264410b000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000fbc18c8582fc7800000000000000000000000050019000000000028001a"], 0x120}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000100)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000280)=ANY=[@ANYBLOB="b80000001300e9990000000000000000fc000000000000000100000000000000ac1e000100000000000000000000000000000000000000000a0060", @ANYRESOCT], 0xb8}}, 0x20040014)
sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=@updpolicy={0xb8, 0x13, 0x200, 0x0, 0x25dfdbfb, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in6=@local, 0x0, 0x4, 0x0, 0x0, 0xa, 0x60, 0x80, 0x3b, 0x0, 0xee01}, {0x0, 0x4, 0x0, 0x0, 0x40, 0xfffffffffffffffd, 0x2}, {0xfffffffffffffffe}, 0x417, 0x40000000, 0x0, 0x1, 0x2}}, 0xfffffd74}, 0x1, 0x0, 0x0, 0x80}, 0x50)
r2 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r2, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=ANY=[@ANYBLOB="b80000001300e9"], 0xb8}}, 0x0)
sendmsg$nl_xfrm(r1, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=ANY=[@ANYBLOB="b80000001300e9990000000000000000fc000000000000000100000000000000ac1e0001"], 0xb8}}, 0x20004000)
sendmsg$nl_xfrm(r0, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=ANY=[@ANYBLOB="b80000001300e9990000020000000000fc0000000000000000"], 0xb8}, 0x1, 0x0, 0x0, 0x80c0}, 0x0)
sendmsg$nl_xfrm(r0, &(0x7f0000000440)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000800)=ANY=[@ANYBLOB="b80000001300e9990500000000000000fc000000000000000000000000000000fc00000000000000000000000000000000000000000000000a0030"], 0xb8}}, 0x4000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000d80)=ANY=[@ANYBLOB="a0010000100001002dbd700000000000fe880400000000000000000000000101ac1414bb00000000000000000000000000000000000000000000000064000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="0000000000000000000000000000000000000000320000000000000000000000000000000000000000000000000000008000000000000000000000000000000000000000000000000000000000000000040000000000000000000000000000000000000000000000000000000000000000000000000000000000000800000000000200000000000000000000000000000200000000000000000000000a000100cd000000000000004800020063626328616573290000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001c0017000000000025bd7000000000000000000028bd7000aa0e00004c001400636d61632861657329000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000018"], 0x1a0}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f00000001c0)=@newae={0x48, 0x1e, 0x301, 0x70bd2a, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, 0x4d4, 0x2, 0x2b}, @in=@private=0xa010101, 0x3, 0x3500}, [@etimer_thresh={0x8, 0xc, 0x10001}]}, 0x48}, 0x1, 0x0, 0x0, 0x40004}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000340)=@acquire={0x174, 0x17, 0x1, 0x70bd29, 0x25dfdbfc, {{@in6=@empty, 0x4d3, 0x3c}, @in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, {@in=@empty, @in6=@mcast1, 0x4e21, 0x3, 0x4e20, 0x6, 0x2, 0x20, 0x0, 0x87, 0x0, 0xffffffffffffffff}, {{@in=@multicast1, @in=@local, 0x4e22, 0x8000, 0x4e23, 0xfff9, 0x2, 0x20, 0x20, 0x3b}, {0x8, 0x3ff, 0x6, 0x401, 0xfffffffffffffffd, 0x8, 0x4, 0x2}, {0x8, 0xffffffffffffffff, 0xa, 0xf}, 0xf, 0x0, 0x1, 0x0, 0x2, 0x1}, 0x10001, 0x1c, 0x3ff, 0x70bd2a}, [@tmpl={0x44, 0x5, [{{@in6=@private2, 0x4d3, 0x32}, 0x2, @in6=@mcast1, 0x3502, 0x0, 0x1, 0x7, 0x7, 0x1, 0x4}]}, @XFRMA_IF_ID={0x8, 0x1f, 0x2}]}, 0x174}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000140)=@updsa={0x148, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0xce1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x80, 0x21}, {@in6=@mcast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x6, 0x9, 0x2, 0x4, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x0, 0xfffffffd, 0x10000}, 0x70bd2a, 0x34ff, 0xa, 0x1, 0x0, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}, @XFRMA_SA_PCPU={0x8, 0x23, 0x1ef}]}, 0x148}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000004c0)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000240)=@delpolicy={0x50, 0x27, 0x1, 0x0, 0x0, {{@in=@private=0xa010100, @in6=@remote}, 0x0, 0x2}}, 0x50}}, 0x1000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f00000004c0)=@updsa={0x140, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0x2, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @replay_thresh={0x8, 0xb, 0xcab1}]}, 0x140}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000040)=@acquire={0x134, 0x17, 0x1, 0x70bd29, 0x25dfdbfc, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0x4d3, 0x3c}, @in=@private=0xa010101, {@in=@empty, @in6=@empty, 0x4e21, 0x3, 0x4e20, 0x6, 0x2, 0x20, 0x0, 0x87, 0x0, 0xffffffffffffffff}, {{@in=@multicast1, @in=@local, 0x4e22, 0x8000, 0x4e23, 0xfff9, 0x2, 0x20, 0x20, 0x3b}, {0x4, 0x3ff, 0x6, 0x401, 0xfffffffffffffffd, 0x8, 0x4, 0x2}, {0x8, 0x5, 0xa, 0xf}, 0xf, 0x0, 0x1, 0x0, 0x2, 0x1}, 0x10001, 0x1f, 0x3ff, 0x70bd2c}, [@mark={0xc, 0x15, {0x35175c, 0x7}}]}, 0x134}, 0x1, 0x0, 0x0, 0x40110}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000540)=ANY=[@ANYBLOB="540400001600010225bd7000fbdbdf25fe8000000000000000000000000000aaac1414bb0000000000000000000000004e2301004e210006020080a067000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff010000000000000000000000000001000004d23200000000000000000000000000000000000001000000000000000001000000010000001000000000000000950700000000000003000000000000000600000000000000ff7f0000000000005e000000000000000700000000000000070000000000000000ac23140000000003000000000000000100000005000000fbffffff27bd7000000000000a0001010200000000"], 0x454}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000580)=@updsa={0x138, 0x1a, 0x1, 0x70bd27, 0x25dfdbfc, {{@in=@loopback, @in6=@mcast2, 0x4e1e, 0x0, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in=@multicast1, 0x4d4, 0x32}, @in=@remote, {0x8000, 0x6, 0x9, 0x10000002, 0x5, 0xa3a7, 0x3f, 0x10}, {0x5, 0x3, 0x2, 0x7}, {0x1, 0xb3e1, 0x10000}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x21}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x4040000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000035c0)={0x0, 0x0, &(0x7f0000003580)={&(0x7f0000002380)=@newsa={0x154, 0x10, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@remote, 0x0, 0xfff7, 0x2000, 0x1, 0x0, 0x0, 0x0, 0x3b, 0x0, 0xffffffffffffffff}, {@in=@rand_addr=0x64010102, 0x0, 0x6c}, @in=@remote, {0x0, 0x0, 0x0, 0x0, 0x3, 0x2, 0x4}, {0x7fffffffffff, 0x5, 0x40000000}, {}, 0x70bd25, 0x0, 0x2, 0x0, 0x1}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @encap={0x1c, 0x4, {0x0, 0x4e23, 0x4e23, @in6=@private0}}]}, 0x154}}, 0x4810)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000640)={0x0, 0x0, &(0x7f0000000500)={&(0x7f00000000c0)=ANY=[@ANYBLOB="14000000250001"], 0x14}, 0x1, 0x0, 0x0, 0x10}, 0xd0)

      
      sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000440)=ANY=[@ANYBLOB="400000001e0001002dbd7000fedbdf25fc01000000000000ac1414bb000000000000000000000000a7"], 0x40}}, 0x24000014)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x5, &(0x7f0000000140)={&(0x7f0000000380)=@migrate={0xec, 0x21, 0x1, 0x70bd25, 0x25dfdbfd, {{@in=@multicast2, @in=@dev={0xac, 0x14, 0x14, 0x1e}, 0x4e24, 0x4, 0x4e23, 0xd, 0x2, 0x0, 0xc0, 0xc}, 0x6e6bb2, 0x1}, [@migrate={0x9c}]}, 0xec}, 0x1, 0x0, 0x0, 0x81}, 0x20004000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000140)=@updsa={0x140, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0xce1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x80, 0x21}, {@in6=@mcast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x6, 0x9, 0x2, 0x4, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x0, 0xfffffffd, 0x10000}, 0x70bd2a, 0x34ff, 0xa, 0x1, 0x0, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_PCPU={0x8, 0x23, 0x1ef}]}, 0x140}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000140)=@updsa={0x148, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0xce1e, 0xffff, 0x4e20, 0x2, 0xa, 0x60, 0x80, 0x21}, {@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x6, 0x9, 0x2, 0x4, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x0, 0xfffffffd, 0x10000}, 0x70bd2a, 0x34ff, 0xa, 0x1, 0x0, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}, @XFRMA_SA_PCPU={0x8}]}, 0x148}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f00000007c0)=ANY=[@ANYBLOB="380100001a00010025bd7000fedbdf2564010100000000000000000000000000fe8000000000000000000000000000bb4e2307ff4e2200070a0080806c000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="20010000000000000000000000000000000004d26c000000e00000020000000000000000000000000000008004000000465200000000000008000000000000006b000000000000008000000000000000030000000000000030d4000000468f00ff000000000000000900000000000000fcffffffffffffff0c0000000000000004000000000000000100000007000000ff7f000027bd7000000000000a0001000800000000000000480003006465666c6174650000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000d9"], 0x138}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000540)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000700)=@newsa={0x140, 0x10, 0x1, 0x8000000, 0x0, {{@in=@loopback, @in6=@empty}, {@in6=@private1={0xfc, 0x1, '\x00', 0x1}, 0x0, 0x33}, @in=@local, {0x0, 0x0, 0x0, 0x0, 0x8000000000000}, {}, {0x10, 0xd29}, 0x0, 0x0, 0xa, 0x1}, [@XFRMA_SA_DIR={0x5, 0x21, 0x1}, @algo_auth={0x48, 0x1, {{'sha256\x00'}}}]}, 0x140}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000c80)=@updsa={0x144, 0x1a, 0x1, 0x70bd23, 0x25dfdbfe, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in6=@remote, 0x4e23, 0x7ff, 0x4e1f, 0x0, 0xa, 0x80, 0x80, 0x6c}, {@in=@multicast1, 0x4d2, 0x6c}, @in=@multicast1, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x3, 0x6, 0xff}, {0x9, 0x1, 0xc, 0x4}, {0x1, 0x7, 0x1533c}, 0x70bd27, 0x0, 0xa, 0x1, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @sec_ctx={0xc, 0x8, {0x8, 0x8, 0x1, 0x1}}]}, 0x144}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      sendmsg$nl_xfrm(0xffffffffffffffff, 0x0, 0x88d0)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000480)=ANY=[@ANYBLOB="a0000000210001002dbd7000fedbdf25e0000001000000000000000000000000e00000020000000000000000000000004e2200014e2000070a00e0a006ad6682", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="c06b6e0000000000500011000a01"], 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=ANY=[@ANYBLOB="b80000001300e9990000020000000000fc00000000000000"], 0xb8}, 0x1, 0x0, 0x0, 0x80c0}, 0x0)
sendmsg$nl_xfrm(r0, &(0x7f0000000440)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000800)=ANY=[@ANYBLOB="b80000001300e9990500000000000000fc000000000000000000000000000000fc00000000000000000000000000000000000000000000000a0030"], 0xb8}}, 0x4000)

      
      sendmsg$nl_xfrm(0xffffffffffffffff, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000340)=@acquire={0x134, 0x17, 0x1, 0x70bd2b, 0x25dfdbfc, {{@in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', 0x4d5, 0x3c}, @in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, {@in=@private=0xa010102, @in6=@private1, 0x4e23, 0x1, 0x4e22, 0x401, 0x2, 0x20, 0x80, 0x32}, {{@in=@initdev={0xac, 0x1e, 0x0, 0x0}, @in6=@mcast2, 0x4e22, 0x1f, 0x4e23, 0x9, 0xa, 0x0, 0x20, 0x4}, {0x2, 0x6, 0xfffffffffffffffa, 0x3, 0x4, 0x0, 0xb, 0x3}, {0x7fffffffffffffff, 0xb5d, 0xb29, 0x7}, 0x80000000, 0x6e6bb1, 0x0, 0x1, 0x2, 0x2}, 0xfffffff8, 0x8, 0x6, 0x70bd2b}, [@mark={0xc, 0x15, {0x35075c, 0xe609}}]}, 0x134}, 0x1, 0x0, 0x0, 0x10}, 0x4)
r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000340)=ANY=[@ANYBLOB="340100001700010029bd7000fcdbdf2500000000000000000000000000000000000004d33c000000fe88000000000000000000000000000100000000000000000000000000000000e00000010000000000000000000000004e2100034e2000060200200087000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="e0000001000000000000000000000000fc0200000000000000000000000000004e2280004e23fff9020020203b000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="0400000000000000ff0300000000000006000000000000000104000000000000fdffffffffffffff080000000000000004000000000000000200000000000000080000000000000005000000000000000a000000000000000f000000000000000f000000000000000100020100000000010001001c000000ff0300002cbd70000c0008"], 0x134}, 0x1, 0x0, 0x0, 0x14}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000740)={0x0, 0x0, &(0x7f0000000700)={&(0x7f00000004c0)=@polexpire={0xcc, 0x1b, 0x111, 0x70bd2d, 0x25dfdbfc, {{{@in6=@private2, @in6=@private2, 0x4e21, 0x0, 0x4e23, 0x3ff, 0x2, 0x60, 0x120, 0x73, 0x0, 0xffffffffffffffff}, {0x80000001, 0xfff, 0x6, 0x4, 0x10000000000, 0x7fffffff, 0x8, 0x7}, {0x1, 0x4c67, 0x9, 0x449f}, 0x9, 0x0, 0x2, 0x0, 0x1, 0x1}, 0x10}, [@sec_ctx={0xc, 0x8, {0x8, 0x8, 0x0, 0xc, 0x5d}}]}, 0xcc}, 0x1, 0x0, 0x0, 0x4090}, 0x4004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000640)=@newspdinfo={0x1c, 0x24, 0x315, 0x70bd27, 0x25dfdbfb, 0xd9a, [@XFRMA_SPD_IPV6_HTHRESH={0x6, 0x4, {0xfd}}]}, 0x1c}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f0000000980)=@updsa={0xf0, 0x1a, 0x1, 0x70bd25, 0x25dfdbfe, {{@in6=@mcast1, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x0, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d5, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0x6b, 0x80, 0x7, 0x6, 0x1}, {0xa, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0xa, 0x1, 0x0, 0x8}}, 0xf0}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000480)=@migrate={0xcc, 0x21, 0x1, 0x70bd2c, 0x25dfdbfd, {{@in=@multicast2, @in=@dev={0xac, 0x14, 0x14, 0x1f}, 0x4e24, 0x4, 0x4e23, 0xd, 0x2}, 0x6e6bb2, 0x1}, [@migrate={0x50, 0x11, [{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in=@rand_addr=0x64010100, @in=@private=0xa010100, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x32, 0x0, 0x0, 0x3501, 0x4d21fe76179a5365, 0x2}]}, @user_kmaddress={0x2c, 0x13, {@in=@rand_addr=0x64010102, @in6=@mcast2, 0x0, 0x2}}]}, 0xcc}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000003c0)={0x0, 0x0, &(0x7f0000000380)={&(0x7f00000004c0)=@newsa={0x128, 0x10, 0x7, 0x0, 0x0, {{@in6=@mcast2, @in=@multicast2, 0x4e20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0xee00}, {@in6=@mcast1, 0x0, 0x2b}, @in6=@private0, {0x0, 0x0, 0x2, 0x0, 0x10000000, 0xffffffffffffffff}, {0x0, 0x200000, 0x7}, {0x40000, 0xfffffffd, 0xae8}, 0x0, 0x0, 0xa, 0x2, 0x0, 0x34}, [@coaddr={0x14, 0xe, @in6=@remote}, @lifetime_val={0x24, 0x9, {0x3, 0x8000000000000001, 0xb4, 0x5}}]}, 0x128}}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f00000003c0)={0x0, 0x0, &(0x7f0000000380)={&(0x7f00000000c0)=@newsa={0x104, 0x1a, 0x7, 0x70bd2a, 0x0, {{@in6=@dev={0xfe, 0x80, '\x00', 0x1b}, @in=@multicast1, 0xffff, 0x0, 0x4e22, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0xee00}, {@in6=@mcast1, 0x0, 0x2b}, @in6=@private0, {0x5a, 0xb400, 0x2, 0xfeffff7f00000001, 0x0, 0x60000}, {0x0, 0x200000, 0x7, 0xfffffffffffffffd}, {0x40000, 0x0, 0xae8}, 0x0, 0x0, 0xa, 0x2, 0x0, 0x70}, [@coaddr={0x14, 0xe, @in6=@remote}]}, 0x104}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000900)={0x0, 0x0, &(0x7f00000008c0)={&(0x7f0000000500)=@polexpire={0xc8, 0x1b, 0x1, 0x0, 0x0, {{{@in6=@private0, @in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20}, {}, {}, 0x40}}, [@XFRMA_IF_ID={0x8}]}, 0xc8}, 0x1, 0x0, 0x0, 0x4044001}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000280)={0x0, 0x0, &(0x7f0000000000)={&(0x7f00000007c0)=@newsa={0x1a0, 0x10, 0x1, 0x70bd2d, 0x0, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x1, 0x0}, @in=@remote, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x64}, {@in, 0x0, 0x32}, @in6=@dev={0xfe, 0x80, '\x00', 0x2c}, {0x0, 0x0, 0x0, 0x0, 0x0, 0x4}, {0x0, 0x0, 0x8000000, 0x200}, {0x0, 0x0, 0x2}, 0x0, 0x0, 0xa, 0x1, 0x0, 0xcd}, [@algo_crypt={0x48, 0x2, {{'cbc(aes)\x00'}}}, @replay_esn_val={0x1c, 0x17, {0x80ff, 0x70bd25, 0x0, 0x0, 0x70bd28, 0xeaa}}, @algo_auth_trunc={0x4c, 0x14, {{'cmac(aes)\x00'}, 0x0, 0x18}}]}, 0x1a0}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000100)=@updsa={0x154, 0x1a, 0x1, 0x70bd27, 0x25dfdbfc, {{@in=@loopback, @in6=@local, 0x4e22, 0x0, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in6=@rand_addr=' \x01\x00', 0x4d6, 0x33}, @in6=@private0, {0x8000, 0x6, 0xe, 0x2, 0x5, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x1, 0xb3e1, 0x10000}, 0x70bd2a, 0x80003503, 0xa, 0x1, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @encap={0x1c, 0x4, {0x0, 0x4e23, 0x4e21, @in6=@mcast2}}]}, 0x154}, 0x1, 0x0, 0x0, 0x4040000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000640)={0x0, 0x0, &(0x7f0000000500)={&(0x7f0000000680)=@allocspi={0x1dc, 0x16, 0x411, 0x0, 0x0, {{{@in6=@private2, @in=@private}, {@in6=@private1, 0x0, 0x33}, @in=@empty, {}, {}, {}, 0x8000000, 0x0, 0x2}, 0x0, 0xfdffff00}, [@sa={0xe4, 0x6, {{@in=@local, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x4e24, 0x1, 0x4e20, 0x8, 0x2, 0x20, 0xa0}, {@in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0x4d3, 0xcd}, @in6=@mcast2, {0x4, 0x2, 0x6, 0x2, 0x7, 0x4, 0xa, 0xeb}, {0x8001, 0xc, 0x4, 0x9}, {0x5, 0x3, 0xd15}, 0x70bd2c, 0x3505, 0xa, 0x0, 0x1, 0x2}}]}, 0x1dc}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000780)=@newsa={0x144, 0x10, 0x1, 0x9ffffffe, 0x100, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in=@local, 0x1, 0x794, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@private1, 0x4d4, 0x6c}, @in=@loopback, {0xfffffffffffffffc, 0x9, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8, 0x3}, {0xffffffffffffffff, 0x0, 0x1f, 0x1ff}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0x2, 0x1, 0xfd, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @mark={0xc, 0x15, {0x35075a, 0x3}}]}, 0x144}, 0x1, 0x0, 0x0, 0x8801}, 0x10)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000500)=@newsa={0x138, 0x10, 0x1, 0xfffffffe, 0x100, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@rand_addr=0x64010101, 0x1, 0x714, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@dev={0xac, 0x14, 0x14, 0x3f}, {0x0, 0x192, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8}, {0xffffffffffffffff, 0x0, 0x1f, 0xfffffffffffffffe}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0x2, 0x1, 0x0, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000440)={&(0x7f00000031c0)=@updsa={0x14c, 0x1a, 0x1, 0x70bd26, 0x25dfdbfe, {{@in=@rand_addr=0x64010100, @in6=@remote, 0x4e23, 0x7ff, 0x4e22, 0x7, 0xa, 0x80, 0x80, 0x6c}, {@in6=@rand_addr=' \x01\x00', 0x4d2, 0x6c}, @in=@multicast2, {0x480000000, 0x5246, 0x8, 0xb, 0x80, 0x3, 0x6, 0xff}, {0x9, 0xfffffffffffffffc, 0xc, 0x4}, {0x1, 0x7, 0x7fff}, 0x70bd27, 0x0, 0xa, 0x0, 0x0, 0x8}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_SET_MARK={0x8, 0x1d, 0x9}, @mark={0xc, 0x15, {0x350759, 0x8000}}]}, 0x14c}, 0x1, 0x0, 0x0, 0x895}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000980)=@newsa={0x140, 0x1a, 0x1, 0xfffffffe, 0x100, {{@in=@multicast1, @in6=@private1={0xfc, 0x1, '\x00', 0x1}, 0x1, 0x71c, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@broadcast, {0x0, 0x192, 0x9ba3, 0xffff, 0x8251c, 0x5, 0xfffffffffffffffc}, {0xffffffffffffffff, 0x0, 0x1f, 0xfffffffffffffffe}, {0xfffffffa, 0x3fc}, 0x80, 0x3500, 0x2, 0x1, 0x0, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_SET_MARK={0x8, 0x1d, 0xfffffeff}]}, 0x140}}, 0x844)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000080)={0x0, 0x0, &(0x7f0000000040)={&(0x7f0000000340)=ANY=[@ANYBLOB="3800000012000501000000000000000028001a000a0101020000000020000000000000000a010100000000000a000609302189db45d67a"], 0x38}}, 0x20040810)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000440)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000c00)=@newsa={0x148, 0x10, 0x1, 0xfffffffe, 0x80000100, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@rand_addr=0x64010101, 0x1, 0x714, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@dev={0xac, 0x14, 0x14, 0x3f}, {0x0, 0x192, 0x6, 0xffff, 0x8251c, 0x2, 0xba0}, {0xffffffffffffffff, 0x0, 0x1f, 0xfffffffffffffffe}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0x2, 0x1, 0x0, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}, @XFRMA_IPTFS_REORDER_WINDOW={0x6, 0x25, 0x9b60}]}, 0x148}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000040)={&(0x7f00000001c0)=@newsa={0x13c, 0x10, 0x1, 0x0, 0x0, {{@in6=@private1, @in=@private, 0x800, 0x0, 0x0, 0x0, 0x2}, {@in, 0x0, 0x32}, @in=@local, {0x3, 0x6}, {}, {}, 0x0, 0x0, 0xa, 0x0, 0x0, 0xcd}, [@algo_auth_trunc={0x4c, 0x14, {{'cmac(aes)\x00'}}}]}, 0x13c}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000ac0)=ANY=[@ANYBLOB="8c0300001700010029bd7000fcdbdf2500000000000000000000000000000000000004d33c000000fe88000000000000000000000000000100000000000000000000000000000000ff0100000000000000000000000000014e2100034e2000060200200087000000", @ANYRES32=0x0, @ANYRES32, @ANYBLOB="e0000001000000000000000000000000ac1414aa0000000000000000000000004e2280004e23fff9020020203b000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="0400860000000000ff0300000000000006000000000000000104000000000000fdffffffffffffff0800000000000000040000000000000002000000000000000800000000000000ffffffffffe4ffff0a000000000000000f000000000000000f000000000000000100020100000000010001001c000000ff03000025bd700084000500fe8000000000000000000000000000bb000004d33200000002000000ff010000000000000000000000000001023540000001070007000000010000000400000000000000000000000000ffff7f000001000004d4330000000a000000fe800000000000000000000000000031053500000402000001000080368700000800000008002300000000000c0015005a"], 0x38c}, 0x1, 0x0, 0x0, 0x10}, 0x4000004)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000840)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000000)=@updsa={0x138, 0x1a, 0x1, 0x70bd27, 0x25dfdbfd, {{@in=@loopback, @in=@broadcast, 0x4e22, 0xffef, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in6=@rand_addr=' \x01\x00', 0x4d6, 0x33}, @in6=@private0, {0x8000, 0x6, 0x9, 0x2, 0x5, 0xa3a7, 0x3f}, {0x5, 0x0, 0x2, 0x7}, {0x1, 0xb3e1, 0x6}, 0x70bd2a, 0x0, 0xa, 0x5, 0x1, 0x21}, [@algo_auth={0x48, 0x1, {{'sha256-mb\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x40000}, 0x40000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000480)=@migrate={0xa0, 0x21, 0x1, 0x70bd2d, 0x25dfdc00, {{@in6=@local, @in=@multicast2, 0x4e22, 0x1, 0x4e20, 0x7, 0xa, 0x1e0, 0xa0, 0x2f}, 0x6e6bc0}, [@migrate={0x50, 0x11, [{@in6=@local, @in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in=@dev={0xac, 0x14, 0x14, 0x20}, @in=@rand_addr=0x64010101, 0x2b, 0x3, 0x0, 0x350b, 0xa, 0xa}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x800)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000080)=@newsa={0x140, 0x10, 0x1, 0xfffffffe, 0x100, {{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in6=@remote, 0x1, 0x1, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in6=@mcast2, {0x0, 0x192, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8, 0x3}, {0x0, 0xffffffff80000000, 0x1f, 0xfffffffffffffffe}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3500, 0x2, 0x1, 0x0, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}, @etimer_thresh={0x8, 0xc, 0x33}]}, 0x140}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000780)=@newsa={0x138, 0x10, 0x1, 0xbffffffe, 0x100, {{@in=@empty, @in6=@ipv4={'\x00', '\xff\xff', @remote}, 0x1, 0x394, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@remote, {0x0, 0x9, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8}, {0x6, 0x0, 0x1f, 0x1ff}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0xa, 0x1, 0xfd, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x10)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000640)=ANY=[@ANYBLOB="4001000010000100feffffff00010000fe8000000000000000000000000000aafc010000000000000000000000000001000107144e230005000000003a000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff020000000000000000000000000001000004d46c000000ac1414bb000000000000000000000000000000000000000092010000000000000600000000000000ffff0000000000001c03080000000000020000000000000007000000000000000000000000000000ffffffffffffffff00000000000000000900000000000000feffffffffffffff02000000fcffffff070000002abd7000043500000a0001802000000000000000480003006465666c61746500"], 0x140}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000004c0)=ANY=[@ANYBLOB="fc0000001900010026bd7000fbdbdf2520010000000000000000000000000002fc0100000000000000000000000000004e1e00034e2103ff0200002001000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ffffff7f00000000010000000000000001800000000000000300000000000000f700000000000000060000000000000002000000000000000100000000000000ffffff7f000000000c0000000000000007000000000000000e00000000000000030000000000010000010000000000004400050000000000000100000000ffff64010101000004d5000000000036c2e943ffffff0000000000000000000000000535000001"], 0xfc}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f00000003c0)=@newsa={0x138, 0x10, 0x113, 0x0, 0x0, {{@in=@loopback, @in=@empty, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xee00}, {@in=@multicast1, 0x0, 0x32}, @in=@dev, {}, {}, {}, 0x0, 0x0, 0xa, 0x3}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x40080}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001400)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000040)=@expire={0xf8, 0x18, 0x401, 0x70bd2b, 0x25dfdbff, {{{@in6=@dev={0xfe, 0x80, '\x00', 0x2a}, @in=@private=0xa010102, 0x4e21, 0x0, 0x4e20, 0x7, 0x0, 0x20, 0x1a0, 0x11}, {@in=@dev={0xac, 0x14, 0x14, 0x15}, 0x4d3, 0x2b}, @in=@multicast2, {0x7, 0x7c9e, 0x3, 0x0, 0x9, 0x2, 0xfa3, 0x842f}, {0x5, 0x5, 0x7f, 0x8}, {0xd2bc, 0x0, 0x1}, 0x70bd25, 0x3501, 0xa, 0x1, 0x1}, 0x4}}, 0xf8}, 0x1, 0x0, 0x0, 0x20014}, 0x20008000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000540)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000700)=@newsa={0x154, 0x10, 0x1, 0x8000000, 0x0, {{@in=@private=0xa010101, @in6=@empty}, {@in=@broadcast, 0x0, 0x33}, @in=@local, {0x0, 0x0, 0x0, 0x0, 0x8000000000000}, {}, {0x10, 0xd29}, 0x0, 0x0, 0xa, 0x1}, [@replay_esn_val={0x1c, 0x17, {0x0, 0x0, 0x0, 0x10000000}}, @algo_auth={0x48, 0x1, {{'sha256\x00'}}}]}, 0x154}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000200)=@newpolicy={0xc4, 0x13, 0x1, 0x70bd28, 0x25dfdbfd, {{@in6=@ipv4={'\x00', '\xff\xff', @local}, @in6=@dev={0xfe, 0x80, '\x00', 0x29}, 0x4e24, 0xa3, 0x4e21, 0x7ff, 0xa, 0x80, 0x0, 0x2c}, {0x2, 0x0, 0x1, 0x2, 0xea1, 0x7, 0x9, 0x1}, {0x272dcfdb, 0x7, 0x5, 0xffff}, 0x1, 0x0, 0x0, 0x0, 0x1}, [@mark={0xc, 0x15, {0x35075b, 0x5}}]}, 0xc4}, 0x1, 0x0, 0x0, 0x8000}, 0x4)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001280)={0x0, 0x0, &(0x7f0000000080)={&(0x7f00000012c0)=ANY=[@ANYBLOB="cc0000001b00d115000000000000000000000000000000000000000000000000fe88000000000000000000000000000100"/64, @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004000000000000000000000000000000000000000000000000000000000000000000000000000000000000f53b"], 0xcc}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000240)={0x0, 0x0, &(0x7f0000000200)={&(0x7f0000000180)=@report={0x4c, 0x20, 0x1, 0x70bd28, 0x25dfdbfb, {0x33, {@in6=@local, @in6=@ipv4={'\x00', '\xff\xff', @rand_addr=0x64010102}, 0x4e20, 0x9, 0x4e21, 0x8, 0x2, 0xa0, 0xa0, 0x21}}}, 0x4c}, 0x1, 0x0, 0x0, 0x200000c0}, 0xc0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000140)=ANY=[@ANYBLOB="500100001a00010027bd7000fedbdf257f000001000000000000000000000000ac14143f000000000000000000000000ce1effff4e2000020a00608021000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="ff010000000000000000000000000001000004d6320000000000000000000000000000000000000000800000000000000600000000000000090000000000000002000000000000000400000000000000a7a30000000000003f000000000000000000000000000000050000000000000003000000000000000200000000000000070000000000000000000000fdffffff000001002abd7000ff3400000a000100210000000000000048000100636d616300000000000000000000000000000000000000000000000000000000000000005cca88e628104527cd723538fc00000000000000000000000000000000000000050021000200000010000a"], 0x150}, 0x1, 0x0, 0x0, 0x40400c5}, 0x88d0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000580)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000001000)=ANY=[@ANYBLOB="28040000130001000000000000000000000000000000000000000000000000017f0000010000000000000000000000000000000000000000020020202b000000", @ANYRES32=0x0, @ANYRES32=0xee01, @ANYBLOB="00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000030000000000f60003006c7a7300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000700500007442cbb642a39a6a8560bdc3f85c5a6073026d03e7544b998fbaa78ad39feb6576a02176e09f0bb6196d4a3573df76df94771d06619be28305346a0b8f8f202c3afa28d73a5adbc882338ec7ab88ba9c5c5b055749515bf3b3ef66485400c7705eac2b62f164e9056a5514083841b94a8785c0c7e4201f3ec117cac4dae80d29a7a0053344b8ba0d2969bc02230cfd122fbfb7532fc45df768aef314ad779c48f56d2e79c4ca6ceb899daf62167b000004010500ac141420000000000000000000000000000004d43c0000000b000000ff0100000000000000000000000000010735000000040200ff0000000900000006000000ac1e0101000000000000000000000000000004d4330000000a000000fc0200000000000000000000000000010635000003010000030000000100000040000000e0000001000000000000000000000000000004d32b000000020000000a010100000000000000000000000000000000000200a400050000001100000053a4000064010101000000000000000000000000000004d66c0000000a000000640101010000000000000000deffffff043500000402a4009f0a0000f8ffffffc5710000080016000700000008001e00f6ffffff0c0008"], 0x428}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000004c0)=@updpolicy={0xfc, 0x19, 0x1, 0x70bd26, 0x25dfdbfb, {{@in6=@rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', @in6=@private1, 0x4e1e, 0x3, 0x4e21, 0x3ff, 0x2, 0x0, 0x20, 0x1}, {0x7fffffff, 0x1, 0x8001, 0x3, 0xa, 0x6, 0x2, 0x1}, {0x7fffffff, 0xc, 0x7, 0xe}, 0x3, 0x0, 0x0, 0x1}, [@tmpl={0x44, 0x5, [{{@in6=@ipv4={'\x00', '\xff\xff', @rand_addr=0x64010101}, 0x4d5}, 0x0, @in=@broadcast, 0x3505, 0x1, 0x1, 0x8, 0x6, 0x0, 0x8000}]}]}, 0xfc}, 0x1, 0x0, 0x0, 0x8000}, 0x40040)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000100)=@getpolicy={0x5c, 0x15, 0x801, 0x70bd2a, 0x25dfdbfd, {{@in6=@empty, @in6=@private2, 0x4e22, 0x0, 0x4e20, 0x3, 0x2, 0x20, 0x60, 0xff}, 0x0, 0x1}, [@mark={0xc, 0x15, {0x35075a, 0x2}}]}, 0x5c}, 0x1, 0x0, 0x0, 0x80}, 0x14044000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000004c0)={0x0, 0x0, &(0x7f00000000c0)={&(0x7f0000000280)=ANY=[@ANYBLOB="1c00000028000100"], 0x1c}, 0x1, 0x0, 0x0, 0x8c4}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000002c0)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000100)=@getpolicy={0x58, 0x15, 0x801, 0x70bd2a, 0x25dfdbfd, {{@in6=@dev={0xfe, 0x80, '\x00', 0x35}, @in6=@private2, 0x4e22, 0x0, 0x4e20, 0x2, 0x2, 0x20, 0x60, 0xff}, 0x0, 0x1}, [@XFRMA_IF_ID={0x8, 0x1f, 0x3}]}, 0x58}, 0x1, 0x0, 0x0, 0x80}, 0x14044000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000200)={&(0x7f0000000800)=@newsa={0x138, 0x10, 0x713, 0x70bd2d, 0x25dfdbfc, {{@in=@private=0xa010100, @in6=@mcast1, 0x4, 0x0, 0x4e21, 0x1, 0x0, 0x0, 0x0, 0x2e, 0x0, 0xee00}, {@in=@remote, 0x4d6, 0x32}, @in=@initdev={0xac, 0x1e, 0x8, 0x0}, {0x6, 0x0, 0x0, 0x100, 0xffffffff00000001, 0x0, 0x3f, 0x543}, {0x4, 0x7fffffbfffffffff, 0xffffffffffffffff, 0x1000}, {}, 0x70bd2c, 0x3500, 0x2, 0x0, 0x0, 0x50}, [@algo_comp={0x48, 0x3, {{'lzjh\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x890}, 0x2014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000540)={0x0, 0x0, &(0x7f0000000240)={&(0x7f0000000880)=@newsa={0x184, 0x10, 0x1, 0x8000000, 0x0, {{@in=@private=0xa010101, @in6=@empty, 0x0, 0x0, 0x4e24, 0x3}, {@in=@broadcast, 0x0, 0x33}, @in=@local, {0x4000000000, 0x0, 0x0, 0x0, 0x8000000000000}, {0x0, 0x6}, {0x10, 0xd29}, 0x0, 0x0, 0xa, 0x1}, [@algo_aead={0x4c, 0x12, {{'morus1280-avx2\x00'}, 0x0, 0x180}}, @algo_auth={0x48, 0x1, {{'sha256\x00'}}}]}, 0x184}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000080)={&(0x7f0000000200)=@newsa={0x154, 0x10, 0x633, 0x0, 0x0, {{@in6=@private0, @in=@broadcast, 0x0, 0x4000, 0x0, 0x8004}, {@in=@dev, 0x0, 0x32}, @in6=@dev={0xfe, 0x80, '\x00', 0x1a}, {0x327, 0x0, 0x0, 0x0, 0xfff}, {}, {0x8f}, 0x70bd2d, 0x0, 0xa, 0x2}, [@encap={0x1c, 0x20, {0x0, 0x4e22, 0x0, @in=@remote}}, @algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}]}, 0x154}, 0x1, 0x0, 0x0, 0x24004010}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000001280)={0x0, 0x0, &(0x7f0000000080)={&(0x7f00000012c0)=ANY=[@ANYBLOB="cc0000001b00d115000000000000000000000000000000000000000000000000fe88000000000000000000000000000100"/64, @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004000000000000000000000000000000000000000000000000000000000000000000000000000000000000f5"], 0xcc}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000500)=@newsa={0x138, 0x10, 0x1, 0xfffffffe, 0x100, {{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@rand_addr=0x64010101, 0x1, 0x714, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@dev={0xac, 0x14, 0x14, 0x3f}, {0x0, 0x192, 0x6, 0xffff, 0x8251c, 0x2, 0xfffffffffffffff8}, {0xffffffffffffffff, 0x0, 0x1f, 0xfffffffffffffffe}, {0x2, 0xfffffffc}, 0x70bd2a, 0x3504, 0x2, 0x1, 0x0, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000000)={&(0x7f0000000500)=@newsa={0x138, 0x18, 0x1, 0xfffffffe, 0x100, {{@in6=@ipv4={'\x00', '\xff\xff', @multicast2}, @in6=@private1={0xfc, 0x1, '\x00', 0x1}, 0x1, 0x71c, 0x4e23, 0x5, 0x0, 0x0, 0x0, 0x3a}, {@in6=@mcast2, 0x4d4, 0x6c}, @in=@dev={0xac, 0x14, 0x14, 0x25}, {0x0, 0x192, 0x9ba3, 0xffff, 0x8251c, 0x5, 0xfffffffffffffffc}, {0xffffffffffffffff, 0x0, 0x1f, 0xfffffffffffffffe}, {0xfffffffa, 0xfffffffc}, 0x80, 0x3500, 0x2, 0x1, 0x0, 0x20}, [@algo_comp={0x48, 0x3, {{'deflate\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x8801}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000002c0)={0x0, 0x0, &(0x7f0000000280)={&(0x7f0000000100)=@delsa={0x28, 0x11, 0x1, 0x70bd25, 0x25dfdbfc, {@in6=@private2={0xfc, 0x2, '\x00', 0x1}, 0xfffffff9, 0x2, 0x33}}, 0x28}, 0x1, 0x0, 0x0, 0x4000}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000001f40)={&(0x7f00000005c0)=@updpolicy={0xfc, 0x19, 0x1, 0x0, 0x0, {{@in=@empty, @in6=@mcast2, 0x0, 0x8, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {0x0, 0xa9, 0x0, 0x1, 0x0, 0xffffffffffffffff}, {0x0, 0xa00, 0x40800000000000, 0x800000000000002}}, [@tmpl={0x44, 0x5, [{{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x0, 0x3c}, 0x0, @in=@loopback, 0x2, 0x4}]}]}, 0xfc}}, 0x0)
r1 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r1, &(0x7f0000000380)={0x0, 0x0, &(0x7f0000000340)={&(0x7f0000001100)=@migrate={0xa0, 0x21, 0x1, 0x4, 0x0, {{@in6=@private2={0xfc, 0x2, '\x00', 0x1}, @in6=@dev={0xfe, 0x80, '\x00', 0x3a}, 0x0, 0x0, 0x0, 0x2, 0xa, 0x0, 0xa0, 0x2e}}, [@migrate={0x50, 0x11, [{@in=@private=0xa010101, @in6=@empty, @in=@broadcast, @in=@dev={0xac, 0x14, 0x14, 0x34}, 0x3c, 0x4, 0x0, 0x2e00, 0x8, 0x2}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x40000}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000480)={0x0, 0x0, &(0x7f0000000200)={&(0x7f00000004c0)=@updpolicy={0xfc, 0x19, 0x1, 0x0, 0x0, {{@in6=@empty, @in6=@mcast1, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, {}, {}, 0x0, 0x0, 0x1}, [@tmpl={0x44, 0x5, [{{@in6=@empty, 0x0, 0x6c}, 0x0, @in=@empty, 0x0, 0x4, 0x0, 0x10}]}]}, 0xfc}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000800)={&(0x7f0000000840)=@updsa={0x140, 0x1a, 0x1, 0x70bd27, 0x25dfdbfe, {{@in=@loopback, @in=@dev={0xac, 0x14, 0x14, 0x3f}, 0x4e1e, 0x0, 0x4e20, 0x2, 0xa, 0x60, 0x80}, {@in=@multicast1, 0x4d6, 0x32}, @in6=@empty, {0x8000, 0x6, 0x9, 0x2, 0x80005, 0xa3a7, 0x3f}, {0x5, 0x3, 0x2, 0x7}, {0x1, 0x0, 0x10000}, 0x70bd2a, 0x0, 0xa, 0x1, 0x1, 0x11}, [@algo_auth={0x48, 0x1, {{'cmac(aes)\x00'}}}, @XFRMA_SA_DIR={0x5, 0x21, 0x2}]}, 0x140}, 0x1, 0x0, 0x0, 0x4040041}, 0x8854)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000300)=@migrate={0xa0, 0x21, 0x1, 0x70bd2b, 0x25dfdbfc, {{@in=@multicast2, @in6=@private1, 0x4e24, 0x4, 0x4e23, 0xd, 0x2, 0x0, 0x0, 0x84}, 0x6e6bb2}, [@migrate={0x50, 0x11, [{@in6=@private0={0xfc, 0x0, '\x00', 0x1}, @in=@rand_addr=0x64010104, @in6=@rand_addr=' \x01\x00', @in=@rand_addr=0x64010100, 0x3c, 0x0, 0x0, 0x0, 0x2, 0x2}]}]}, 0xa0}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000040)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000480)=ANY=[@ANYBLOB="fc0000001900674c0000000000000000e0000001000000000000000000000000e000000200000000000000000000000000000000000000000a00000000000000", @ANYRES32=0x0, @ANYRES32=0x0], 0xfc}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000035c0)={0x0, 0x0, &(0x7f0000003580)={&(0x7f00000005c0)=ANY=[@ANYBLOB="5c0000001500010028bd7000ffdbdf2500000000000000000000000000000000ac1414aa0000000000000000000000004e2100094e2300090a00a00032000000", @ANYRES32=0x0, @ANYRES32=0x0, @ANYBLOB="b36b6e00020000230a0010006a"], 0x5c}, 0x1, 0x0, 0x0, 0x4004050}, 0x20000000)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)=@newae={0x64, 0x1e, 0x1, 0x70bd2d, 0x25dfdbfc, {{@in=@rand_addr=0x64010100, 0x4d3, 0x8, 0x32}, @in=@broadcast, 0xea7, 0x3502}, [@lifetime_val={0x24, 0x9, {0x8, 0xffffffffffffff80, 0xfffffffffffffffe, 0x5}}]}, 0x64}, 0x1, 0x0, 0x0, 0x4000000}, 0x24000014)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f00000003c0)=@newsa={0x158, 0x10, 0x1, 0x0, 0x25dfdbfc, {{@in6=@remote, @in6=@loopback}, {@in=@empty, 0x0, 0x32}, @in6=@loopback, {0x0, 0x0, 0x0, 0x0, 0x6, 0x9}, {0x0, 0x6}, {0x0, 0x0, 0x4}, 0x0, 0x0, 0xa, 0x4, 0x0, 0xad}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}, @replay_esn_val={0x20, 0x17, {0x1, 0x0, 0x0, 0x0, 0x0, 0x2, [0x0]}}]}, 0x158}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000500)={0x0, 0x0, &(0x7f00000004c0)={&(0x7f0000000580)=@newsa={0x138, 0x10, 0x1, 0x70bd27, 0x25dfdbfc, {{@in6=@local, @in=@initdev={0xac, 0x1e, 0x1, 0x0}, 0x4e21, 0x81, 0x4e23, 0x0, 0x2, 0x0, 0x20, 0x1}, {@in=@loopback, 0x4d4, 0x2b}, @in=@rand_addr=0x64010101, {0x4, 0x4, 0x30, 0x84, 0x7, 0x2, 0xd4, 0xfffffffffffffffd}, {0x0, 0xfffffffffffffffb, 0x2, 0x13c}, {0x7, 0x100, 0x3}, 0x70bd2c, 0x3506, 0xa, 0x3, 0x81, 0x2}, [@algo_crypt={0x48, 0x2, {{'ecb(cipher_null)\x00'}}}]}, 0x138}, 0x1, 0x0, 0x0, 0x4010080}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f000014f000)={0x0, 0x0, &(0x7f00000bfff0)={&(0x7f0000006440)=@updpolicy={0xb8, 0x19, 0x1, 0x0, 0x0, {{@in6=@mcast1={0xff, 0x2}, @in=@multicast1, 0x0, 0x0, 0x0, 0x0, 0xa, 0x60}}}, 0xb8}}, 0x0)

      
      r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f0000000140)={&(0x7f0000000040)=@migrate={0xa8, 0x21, 0x1, 0x70bd2c, 0x25dfdbfd, {{@in=@multicast2, @in=@dev={0xac, 0x14, 0x14, 0x1f}, 0x4e24, 0x4, 0x4e23, 0xd, 0x2}, 0x6e6bb2, 0x1}, [@migrate={0x50, 0x11, [{@in6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @in=@rand_addr=0x64010100, @in=@private=0xa010100, @in6=@private0={0xfc, 0x0, '\x00', 0x1}, 0x32, 0x0, 0x0, 0x3501, 0x4d21fe76179a5365, 0x2}]}, @XFRMA_IF_ID={0x8, 0x1f, 0x2}]}, 0xa8}, 0x1, 0x0, 0x0, 0x81}, 0x20000000)

-----------
SUMMARY---of 0

ap_init_aperfmperf---of 5
arch_enable_hybrid_capacity_scale---of 15
arch_freq_get_on_cpu---of 16
arch_scale_cpu_capacity60%of 5
arch_scale_freq_tick---of 25
arch_set_cpu_capacity---of 5
arch_set_max_freq_ratio---of 3
disable_freq_invariance_workfn---of 9
freq_invariance_enable---of 5
freq_invariance_set_perf_ratio---of 3
init_counter_refs---of 7
-----------
SUMMARY60%of 5

-----------
SUMMARY---of 0

attach_auth67%of 18
attach_auth_trunc73%of 22
attach_crypt57%of 16
build_aevent---of 37
copy_from_user_policy100%of 1
copy_from_user_policy_type88%of 8
copy_sec_ctx---of 4
copy_templates100%of 3
copy_to_user_policy100%of 1
copy_to_user_policy_type100%of 1
copy_to_user_state100%of 1
copy_to_user_state_extra66%of 123
copy_to_user_tmpl74%of 15
copy_user_offload---of 7
dump_one_policy49%of 31
dump_one_state69%of 16
validate_tmpl83%of 34
verify_newpolicy_info.isra.081%of 36
verify_newsa_info75%of 304
verify_one_alg88%of 8
xfrm_add_acquire93%of 26
xfrm_add_pol_expire85%of 40
xfrm_add_policy87%of 22
xfrm_add_sa78%of 132
xfrm_add_sa_expire74%of 15
xfrm_alloc_userspi64%of 44
xfrm_compile_policy---of 18
xfrm_del_sa16%of 19
xfrm_do_migrate65%of 56
xfrm_dump_policy100%of 1
xfrm_dump_policy_done100%of 1
xfrm_dump_policy_start100%of 1
xfrm_dump_sa81%of 21
xfrm_dump_sa_done100%of 4
xfrm_flush_policy100%of 6
xfrm_flush_sa100%of 5
xfrm_get_ae15%of 21
xfrm_get_default67%of 9
xfrm_get_policy81%of 47
xfrm_get_sa25%of 12
xfrm_get_sadinfo50%of 18
xfrm_get_spdinfo63%of 29
xfrm_is_alive---of 19
xfrm_mark_get100%of 5
xfrm_netlink_rcv100%of 1
xfrm_new_ae29%of 46
xfrm_nlmsg_multicast.constprop.062%of 13
xfrm_policy_construct95%of 18
xfrm_send_acquire59%of 36
xfrm_send_mapping---of 13
xfrm_send_migrate65%of 31
xfrm_send_policy_notify62%of 81
xfrm_send_report---of 16
xfrm_send_state_notify66%of 114
xfrm_set_default65%of 28
xfrm_set_spdinfo79%of 41
xfrm_state_netlink60%of 5
xfrm_update_ae_params73%of 22
xfrm_user_net_exit---of 3
xfrm_user_net_init---of 4
xfrm_user_net_pre_exit---of 1
xfrm_user_rcv_msg76%of 58
xfrm_user_state_lookup.constprop.094%of 16
-----------
SUMMARY70%of 1682

-----------
SUMMARY---of 0

__rhashtable_walk_find_next---of 49
__rht_bucket_nested---of 18
bucket_table_alloc.isra.0---of 17
bucket_table_free---of 5
bucket_table_free_rcu---of 1
jhash---of 28
jhash2---of 10
lockdep_rht_bucket_is_held67%of 6
lockdep_rht_mutex_is_held75%of 4
nested_table_alloc.isra.0---of 20
nested_table_free---of 9
rhashtable_destroy---of 1
rhashtable_free_and_destroy---of 58
rhashtable_init_noprof---of 23
rhashtable_insert_slow---of 171
rhashtable_jhash2---of 1
rhashtable_last_table---of 13
rhashtable_rehash_alloc---of 8
rhashtable_walk_enter---of 6
rhashtable_walk_exit---of 4
rhashtable_walk_next---of 24
rhashtable_walk_peek---of 8
rhashtable_walk_start_check---of 69
rhashtable_walk_stop---of 14
rhltable_init_noprof---of 1
rht_bucket_nested---of 6
rht_bucket_nested_insert---of 6
rht_deferred_worker---of 173
-----------
SUMMARY70%of 10

-----------
SUMMARY---of 0

authenc_geniv_ahash_done---of 4
authenc_verify_ahash_done---of 8
crypto_authenc_create---of 15
crypto_authenc_decrypt---of 3
crypto_authenc_decrypt_tail---of 7
crypto_authenc_encrypt---of 6
crypto_authenc_encrypt_done---of 8
crypto_authenc_exit_tfm---of 1
crypto_authenc_extractkeys58%of 14
crypto_authenc_free---of 1
crypto_authenc_genicv---of 3
crypto_authenc_init_tfm67%of 6
crypto_authenc_setkey84%of 6
-----------
SUMMARY66%of 26

-----------
SUMMARY---of 0

msleep---of 3
msleep_interruptible---of 9
process_timeout---of 1
schedule_hrtimeout---of 1
schedule_hrtimeout_range---of 1
schedule_hrtimeout_range_clock---of 16
schedule_timeout63%of 8
schedule_timeout_idle---of 4
schedule_timeout_interruptible---of 4
schedule_timeout_killable---of 4
schedule_timeout_uninterruptible---of 4
usleep_range_state---of 10
-----------
SUMMARY63%of 8

-----------
SUMMARY---of 0

___pte_offset_map50%of 24
__pte_offset_map_lock53%of 21
p4d_clear_bad---of 7
pgd_clear_bad---of 8
pmd_clear_bad---of 1
pte_offset_map_ro_nolock---of 13
pte_offset_map_rw_nolock67%of 15
ptep_clear_flush---of 7
pud_clear_bad---of 1
-----------
SUMMARY56%of 60

-----------
SUMMARY---of 0

ida_alloc_range---of 52
ida_destroy---of 28
ida_find_first_range---of 19
ida_free---of 20
idr_alloc---of 9
idr_alloc_cyclic---of 12
idr_alloc_u32---of 8
idr_find100%of 1
idr_for_each38%of 16
idr_get_next---of 4
idr_get_next_ul---of 20
idr_remove---of 1
idr_replace---of 6
kzalloc_noprof.constprop.0---of 6
-----------
SUMMARY42%of 17

__bpf_trace_hrtimer_class---of 1
__bpf_trace_hrtimer_expire_entry---of 1
__bpf_trace_hrtimer_setup---of 1
__bpf_trace_hrtimer_start---of 1
__bpf_trace_itimer_expire---of 1
__bpf_trace_itimer_state---of 1
__bpf_trace_tick_stop---of 1
__bpf_trace_timer_base_idle---of 1
__bpf_trace_timer_class---of 1
__bpf_trace_timer_expire_entry---of 1
__bpf_trace_timer_start---of 1
__get_next_timer_interrupt---of 45
__mod_timer34%of 66
__probestub_hrtimer_cancel---of 1
__probestub_hrtimer_expire_entry---of 1
__probestub_hrtimer_expire_exit---of 1
__probestub_hrtimer_setup---of 1
__probestub_hrtimer_start---of 1
__probestub_itimer_expire---of 1
__probestub_itimer_state---of 1
__probestub_tick_stop---of 1
__probestub_timer_base_idle---of 1
__probestub_timer_cancel---of 1
__probestub_timer_expire_entry---of 1
__probestub_timer_expire_exit---of 1
__probestub_timer_init---of 1
__probestub_timer_start---of 1
__round_jiffies_relative---of 7
__round_jiffies_up_relative---of 6
__run_timer_base---of 48
__timer_delete72%of 7
__timer_delete_sync59%of 12
__traceiter_hrtimer_cancel---of 8
__traceiter_hrtimer_expire_entry---of 7
__traceiter_hrtimer_expire_exit---of 8
__traceiter_hrtimer_setup---of 7
__traceiter_hrtimer_start---of 7
__traceiter_itimer_expire---of 7
__traceiter_itimer_state---of 7
__traceiter_tick_stop---of 7
__traceiter_timer_base_idle---of 7
__traceiter_timer_cancel---of 8
__traceiter_timer_expire_entry---of 7
__traceiter_timer_expire_exit---of 8
__traceiter_timer_init---of 8
__traceiter_timer_start---of 7
__try_to_del_timer_sync60%of 5
add_timer60%of 5
add_timer_global---of 5
add_timer_local---of 5
add_timer_on---of 19
calc_wheel_index41%of 22
call_timer_fn---of 36
detach_if_pending100%of 12
enqueue_timer33%of 28
fetch_next_timer_interrupt---of 14
fetch_next_timer_interrupt_remote---of 10
get_next_timer_interrupt---of 1
internal_add_timer100%of 1
lock_timer_base67%of 9
mod_timer100%of 1
mod_timer_pending---of 1
next_timer_interrupt---of 7
perf_trace_hrtimer_class---of 7
perf_trace_hrtimer_expire_entry---of 7
perf_trace_hrtimer_setup---of 7
perf_trace_hrtimer_start---of 7
perf_trace_itimer_expire---of 10
perf_trace_itimer_state---of 7
perf_trace_tick_stop---of 7
perf_trace_timer_base_idle---of 7
perf_trace_timer_class---of 7
perf_trace_timer_expire_entry---of 7
perf_trace_timer_start---of 7
round_jiffies---of 7
round_jiffies_relative---of 7
round_jiffies_up---of 6
round_jiffies_up_relative---of 6
run_timer_softirq---of 5
timer_base_is_idle---of 1
timer_base_try_to_set_idle---of 4
timer_clear_idle---of 1
timer_delete100%of 1
timer_delete_sync100%of 1
timer_delete_sync_try---of 1
timer_expire_remote---of 1
timer_init_key36%of 17
timer_lock_remote_bases---of 7
timer_migration_handler---of 4
timer_recalc_next_expiry---of 15
timer_reduce---of 1
timer_shutdown---of 1
timer_shutdown_sync---of 1
timer_unlock_remote_bases---of 1
timer_update_keys---of 1
timers_dead_cpu---of 21
timers_prepare_cpu---of 3
timers_update_migration---of 7
timers_update_nohz---of 1
trace_event_raw_event_hrtimer_class---of 8
trace_event_raw_event_hrtimer_expire_entry---of 8
trace_event_raw_event_hrtimer_setup---of 8
trace_event_raw_event_hrtimer_start---of 8
trace_event_raw_event_itimer_expire---of 11
trace_event_raw_event_itimer_state---of 8
trace_event_raw_event_tick_stop---of 8
trace_event_raw_event_timer_base_idle---of 8
trace_event_raw_event_timer_class---of 8
trace_event_raw_event_timer_expire_entry---of 8
trace_event_raw_event_timer_start---of 8
trace_raw_output_hrtimer_class---of 5
trace_raw_output_hrtimer_expire_entry---of 4
trace_raw_output_hrtimer_setup---of 4
trace_raw_output_hrtimer_start---of 4
trace_raw_output_itimer_expire---of 4
trace_raw_output_itimer_state---of 4
trace_raw_output_tick_stop---of 4
trace_raw_output_timer_base_idle---of 4
trace_raw_output_timer_class---of 5
trace_raw_output_timer_expire_entry---of 4
trace_raw_output_timer_start---of 4
trace_timer_base_idle---of 15
trace_timer_cancel34%of 15
update_process_times---of 21
-----------
SUMMARY46%of 202

serial8250_backup_timeout---of 49
serial8250_get_port---of 1
serial8250_interrupt---of 14
serial8250_register_8250_port---of 112
serial8250_resume_port---of 6
serial8250_setup_port---of 6
serial8250_suspend_port---of 9
serial8250_timeout---of 3
serial8250_unregister_port---of 49
serial_8250_overrun_backoff_work---of 37
serial_do_unlink---of 13
univ8250_console_exit---of 1
univ8250_console_match---of 25
univ8250_console_setup---of 11
univ8250_console_write100%of 1
univ8250_release_irq---of 20
univ8250_setup_irq---of 30
univ8250_setup_timer---of 10
-----------
SUMMARY100%of 1

__blocking_notifier_chain_register---of 4
__bpf_trace_notifier_info---of 1
__probestub_notifier_register---of 1
__probestub_notifier_run---of 1
__probestub_notifier_unregister---of 1
__traceiter_notifier_register---of 8
__traceiter_notifier_run---of 8
__traceiter_notifier_unregister---of 8
atomic_notifier_call_chain---of 11
atomic_notifier_call_chain_is_empty---of 1
atomic_notifier_chain_register---of 1
atomic_notifier_chain_register_unique_prio---of 1
atomic_notifier_chain_unregister---of 1
blocking_notifier_call_chain75%of 4
blocking_notifier_call_chain_robust---of 4
blocking_notifier_chain_register---of 1
blocking_notifier_chain_register_unique_prio---of 1
blocking_notifier_chain_unregister---of 5
notifier_call_chain52%of 27
notifier_call_chain_robust---of 3
notifier_chain_register---of 26
notifier_chain_unregister---of 20
notify_die---of 6
perf_trace_notifier_info---of 7
raw_notifier_call_chain---of 1
raw_notifier_call_chain_robust---of 1
raw_notifier_chain_register---of 1
raw_notifier_chain_unregister---of 1
register_die_notifier---of 1
srcu_init_notifier_head---of 3
srcu_notifier_call_chain---of 3
srcu_notifier_chain_register---of 5
srcu_notifier_chain_unregister---of 5
trace_event_raw_event_notifier_info---of 8
trace_raw_output_notifier_info---of 5
unregister_die_notifier---of 1
-----------
SUMMARY55%of 31

-----------
SUMMARY---of 0

__clear_open_fd---of 3
__do_sys_close_range---of 43
__do_sys_dup2---of 28
__f_unlock_pos---of 1
__fget_files44%of 23
__file_ref_put---of 6
__file_ref_put_badval---of 9
__free_fdtable---of 1
__get_file_rcu---of 10
__get_unused_fd_flags---of 1
__ia32_sys_close_range---of 1
__ia32_sys_dup---of 7
__ia32_sys_dup2---of 1
__ia32_sys_dup3---of 1
__put_unused_fd---of 9
__set_close_on_exec---of 5
__x64_sys_close_range---of 1
__x64_sys_dup---of 7
__x64_sys_dup2---of 1
__x64_sys_dup3---of 1
alloc_fd---of 27
alloc_fdtable---of 15
bitmap_copy_and_extend---of 3
close_fd---of 4
do_close_on_exec---of 20
do_dup2---of 20
dup_fd---of 31
exit_files---of 5
expand_files---of 40
f_dupfd---of 8
fd_install---of 22
fdget40%of 10
fdget_pos---of 21
fdget_raw---of 8
fget---of 1
fget_raw---of 1
fget_task---of 4
fget_task_next---of 34
file_close_fd---of 1
file_close_fd_locked---of 17
file_seek_cur_needs_f_lock---of 4
free_fdtable_rcu---of 1
get_close_on_exec---of 17
get_file_active---of 13
get_file_rcu---of 4
get_unused_fd_flags---of 1
iterate_fd---of 23
ksys_dup3---of 20
put_files_struct---of 14
put_unused_fd---of 1
rcu_read_unlock_sched---of 9
receive_fd---of 12
receive_fd_replace---of 5
replace_fd---of 8
sane_fdtable_size---of 11
set_close_on_exec---of 7
-----------
SUMMARY43%of 33

-----------
SUMMARY---of 0

drbg_fini_hash_kernel---of 4
drbg_hmac_generate57%of 32
drbg_hmac_update52%of 33
drbg_init_hash_kernel62%of 13
drbg_kcapi_cleanup---of 1
drbg_kcapi_hash.isra.0100%of 3
drbg_kcapi_hmacsetkey67%of 6
drbg_kcapi_init100%of 1
drbg_kcapi_random41%of 54
drbg_kcapi_seed35%of 67
drbg_kcapi_set_entropy---of 1
drbg_seed26%of 39
drbg_uninstantiate.isra.0---of 12
-----------
SUMMARY43%of 248

crypto_alloc_rng100%of 1
crypto_del_default_rng---of 4
crypto_get_default_rng63%of 8
crypto_put_default_rng100%of 1
crypto_register_rng---of 6
crypto_register_rngs---of 8
crypto_rng_init_tfm100%of 1
crypto_rng_report---of 1
crypto_rng_reset34%of 9
crypto_rng_show---of 1
crypto_unregister_rng---of 1
crypto_unregister_rngs---of 3
rng_default_set_ent---of 1
-----------
SUMMARY56%of 20

cipher_crypt_one---of 7
crypto_cipher_decrypt_one---of 1
crypto_cipher_encrypt_one---of 1
crypto_cipher_setkey28%of 11
crypto_clone_cipher---of 8
-----------
SUMMARY28%of 11

-----------
SUMMARY---of 0

chacha_block_generic100%of 3
chacha_permute63%of 8
hchacha_block_generic---of 1
-----------
SUMMARY73%of 11

-----------
SUMMARY---of 0

____fput---of 1
__fput---of 47
__fput_deferred---of 14
__fput_sync---of 8
alloc_empty_backing_file---of 6
alloc_empty_file---of 12
alloc_empty_file_noaccount---of 7
alloc_file---of 4
alloc_file_clone---of 5
alloc_file_pseudo---of 6
alloc_file_pseudo_noaccount---of 6
alloc_path_pseudo---of 4
backing_file_set_user_path---of 1
backing_file_user_path---of 1
delayed_fput---of 3
file_free---of 12
file_init_path---of 24
flush_delayed_fput---of 1
fput50%of 8
fput_close---of 13
fput_close_sync---of 13
get_max_files---of 1
init_file---of 11
proc_nr_files---of 1
-----------
SUMMARY50%of 8

-----------
SUMMARY---of 0

__copy_xstate_to_uabi_buf---of 32
__raw_xsave_addr---of 8
__xfd_enable_feature---of 62
arch_set_user_pkey_access---of 22
compare_xstate_offsets---of 1
copy_from_buffer---of 12
copy_sigframe_from_user_to_xstate---of 1
copy_uabi_from_kernel_to_xstate---of 1
copy_uabi_to_xstate---of 30
copy_xstate_to_uabi_buf---of 1
cpu_has_xfeatures---of 7
elf_coredump_extra_notes_size---of 3
elf_coredump_extra_notes_write---of 22
fpstate_free---of 8
fpu__init_cpu_xstate---of 23
fpu__resume_cpu---of 20
fpu_xstate_prctl---of 45
get_xsave_addr---of 8
get_xsave_addr_user---of 5
get_xsave_desc_size---of 10
membuf_write.isra.0---of 3
membuf_zero.isra.0---of 3
proc_pid_arch_status---of 10
validate_independent_components---of 10
xfd_enable_feature---of 1
xfd_validate_state30%of 10
xfeature_get_offset---of 15
xfeature_size---of 5
xrstors---of 6
xsaves---of 6
xstate_calculate_size---of 11
xstate_get_guest_group_perm---of 1
-----------
SUMMARY30%of 10

-----------
SUMMARY---of 0

crypto_sha3_finup82%of 11
crypto_sha3_init100%of 1
crypto_sha3_update86%of 7
keccakf100%of 3
-----------
SUMMARY87%of 22

-----------
SUMMARY---of 0

__lockup_detector_reconfigure---of 24
arch_touch_nmi_watchdog100%of 1
hardlockup_count_show---of 1
lockup_detector_offline_cpu---of 4
lockup_detector_online_cpu---of 4
lockup_detector_reconfigure---of 1
lockup_detector_soft_poweroff---of 1
lockup_detector_update_enable---of 9
proc_nmi_watchdog---of 6
proc_soft_watchdog---of 1
proc_watchdog---of 1
proc_watchdog_common---of 8
proc_watchdog_cpumask---of 4
proc_watchdog_thresh---of 5
softlockup_count_show---of 1
softlockup_fn---of 1
softlockup_start_fn---of 1
softlockup_stop_fn---of 1
touch_all_softlockup_watchdogs---of 9
touch_softlockup_watchdog100%of 1
touch_softlockup_watchdog_sched---of 1
touch_softlockup_watchdog_sync---of 1
update_touch_ts---of 1
watchdog_disable---of 3
watchdog_enable---of 6
watchdog_hardlockup_check---of 32
watchdog_hardlockup_start---of 1
watchdog_hardlockup_stop---of 1
watchdog_hardlockup_touch_cpu---of 1
watchdog_timer_fn---of 34
-----------
SUMMARY100%of 2

can_stop_idle_tick---of 15
get_cpu_idle_time_us---of 1
get_cpu_iowait_time_us---of 1
get_cpu_sleep_time_us---of 19
get_jiffies_update---of 10
tick_check_oneshot_change---of 13
tick_clock_notify---of 9
tick_do_update_jiffies64---of 12
tick_get_tick_sched---of 1
tick_irq_enter---of 9
tick_nohz_get_idle_calls_cpu---of 1
tick_nohz_get_next_hrtimer---of 1
tick_nohz_get_sleep_length---of 8
tick_nohz_handler---of 21
tick_nohz_idle_enter---of 17
tick_nohz_idle_exit---of 20
tick_nohz_idle_got_tick---of 3
tick_nohz_idle_restart_tick---of 6
tick_nohz_idle_retain_tick---of 1
tick_nohz_idle_stop_tick---of 67
tick_nohz_irq_exit---of 4
tick_nohz_lowres_handler---of 5
tick_nohz_next_event---of 23
tick_nohz_restart_sched_tick---of 10
tick_nohz_start_idle---of 14
tick_nohz_stop_idle---of 20
tick_nohz_tick_stopped100%of 1
tick_nohz_tick_stopped_cpu---of 1
tick_oneshot_notify---of 1
tick_sched_timer_dying---of 3
tick_setup_sched_timer---of 30
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

aead_exit_geniv---of 1
aead_geniv_alloc---of 15
aead_geniv_free---of 1
aead_geniv_setauthsize100%of 1
aead_geniv_setkey100%of 1
aead_init_geniv63%of 8
-----------
SUMMARY70%of 10

___ratelimit33%of 52
-----------
SUMMARY33%of 52

-----------
SUMMARY---of 0

__bpf_getsockopt---of 17
__bpf_prog_release---of 8
__bpf_redirect---of 61
__bpf_setsockopt---of 12
__bpf_sk_lookup.isra.0---of 16
__bpf_skb_change_head---of 23
__bpf_skb_change_tail---of 41
__bpf_skb_load_bytes---of 16
__bpf_skb_store_bytes---of 22
__bpf_skc_lookup.isra.0---of 18
__bpf_xdp_load_bytes---of 7
__bpf_xdp_store_bytes---of 7
__dev_via_ifindex---of 4
__get_filter---of 17
__ipv6_neigh_lookup_noref_stub---of 12
__sk_attach_prog---of 17
__sk_filter_charge---of 5
_bpf_getsockopt---of 11
_bpf_setsockopt---of 11
bpf_bind---of 15
bpf_clone_redirect---of 10
bpf_convert_ctx_access---of 60
bpf_convert_filter---of 160
bpf_csum_diff---of 7
bpf_csum_level---of 20
bpf_csum_update---of 3
bpf_dispatcher_nop_func---of 1
bpf_dispatcher_xdp_func---of 1
bpf_dynptr_from_skb---of 4
bpf_dynptr_from_skb_meta---of 9
bpf_dynptr_from_skb_rdonly---of 3
bpf_dynptr_from_xdp---of 7
bpf_flow_dissector_load_bytes---of 16
bpf_gen_ld_abs---of 10
bpf_get_cgroup_classid---of 14
bpf_get_cgroup_classid_curr---of 1
bpf_get_hash_recalc---of 3
bpf_get_listener_sock---of 12
bpf_get_netns_cookie---of 6
bpf_get_netns_cookie_sk_msg---of 6
bpf_get_netns_cookie_sock---of 4
bpf_get_netns_cookie_sock_addr---of 6
bpf_get_netns_cookie_sock_ops---of 6
bpf_get_route_realm---of 1
bpf_get_skb_set_tunnel_proto---of 13
bpf_get_socket_cookie---of 5
bpf_get_socket_cookie_sock---of 1
bpf_get_socket_cookie_sock_addr---of 1
bpf_get_socket_cookie_sock_ops---of 1
bpf_get_socket_ptr_cookie---of 7
bpf_get_socket_uid---of 13
bpf_helper_changes_pkt_data---of 4
bpf_ipv4_fib_lookup---of 116
bpf_ipv6_fib_lookup---of 64
bpf_l3_csum_replace---of 16
bpf_l4_csum_replace---of 23
bpf_lwt_in_push_encap---of 1
bpf_lwt_xmit_push_encap---of 1
bpf_msg_apply_bytes---of 1
bpf_msg_cork_bytes---of 1
bpf_msg_pop_data---of 66
bpf_msg_pull_data---of 49
bpf_msg_push_data---of 81
bpf_net_ctx_get_ri---of 3
bpf_noop_prologue---of 1
bpf_prepare_filter---of 78
bpf_prog_change_xdp---of 1
bpf_prog_create---of 10
bpf_prog_create_from_user---of 18
bpf_prog_destroy---of 1
bpf_prog_store_orig_filter---of 5
bpf_redirect---of 5
bpf_redirect_neigh---of 9
bpf_redirect_peer---of 5
bpf_run_sk_reuseport---of 6
bpf_search_tcp_opt---of 23
bpf_set_hash---of 1
bpf_set_hash_invalid---of 1
bpf_sk_ancestor_cgroup_id---of 18
bpf_sk_assign---of 34
bpf_sk_assign_tcp_reqsk---of 45
bpf_sk_base_func_proto---of 15
bpf_sk_cgroup_id---of 12
bpf_sk_fullsock---of 3
bpf_sk_getsockopt---of 1
bpf_sk_lookup---of 16
bpf_sk_lookup_assign---of 33
bpf_sk_lookup_tcp---of 1
bpf_sk_lookup_udp---of 1
bpf_sk_release---of 9
bpf_sk_setsockopt---of 1
bpf_skb_adjust_room---of 136
bpf_skb_ancestor_cgroup_id---of 18
bpf_skb_cgroup_classid---of 12
bpf_skb_cgroup_id---of 12
bpf_skb_change_head---of 1
bpf_skb_change_proto---of 39
bpf_skb_change_protocol---of 15
bpf_skb_change_tail---of 1
bpf_skb_change_type---of 6
bpf_skb_check_mtu---of 22
bpf_skb_copy---of 13
bpf_skb_ecn_set_ce---of 66
bpf_skb_event_output---of 8
bpf_skb_fib_lookup---of 17
bpf_skb_get_nlattr---of 9
bpf_skb_get_nlattr_nest---of 13
bpf_skb_get_pay_offset---of 1
bpf_skb_get_tunnel_key---of 29
bpf_skb_get_tunnel_opt---of 10
bpf_skb_get_xfrm_state---of 15
bpf_skb_is_valid_access.isra.0---of 32
bpf_skb_load_bytes---of 16
bpf_skb_load_bytes_relative---of 13
bpf_skb_load_helper_16---of 8
bpf_skb_load_helper_16_no_cache---of 8
bpf_skb_load_helper_32---of 8
bpf_skb_load_helper_32_no_cache---of 8
bpf_skb_load_helper_8---of 8
bpf_skb_load_helper_8_no_cache---of 8
bpf_skb_load_helper_convert_offset---of 9
bpf_skb_meta_pointer---of 1
bpf_skb_net_hdr_pop---of 17
bpf_skb_net_hdr_push---of 3
bpf_skb_pull_data---of 3
bpf_skb_set_tstamp---of 13
bpf_skb_set_tunnel_key---of 39
bpf_skb_set_tunnel_opt---of 40
bpf_skb_store_bytes---of 22
bpf_skb_under_cgroup---of 19
bpf_skb_vlan_pop---of 13
bpf_skb_vlan_push---of 20
bpf_skc_lookup---of 4
bpf_skc_lookup_tcp---of 1
bpf_skc_to_mptcp_sock---of 1
bpf_skc_to_tcp6_sock---of 7
bpf_skc_to_tcp_request_sock---of 9
bpf_skc_to_tcp_sock---of 6
bpf_skc_to_tcp_timewait_sock---of 9
bpf_skc_to_udp6_sock---of 7
bpf_skc_to_unix_sock---of 6
bpf_sock_addr_getsockopt---of 1
bpf_sock_addr_set_sun_path---of 6
bpf_sock_addr_setsockopt---of 1
bpf_sock_addr_sk_lookup_tcp---of 1
bpf_sock_addr_sk_lookup_udp---of 1
bpf_sock_addr_skc_lookup_tcp---of 1
bpf_sock_common_is_valid_access---of 3
bpf_sock_convert_ctx_access---of 23
bpf_sock_destroy---of 8
bpf_sock_from_file---of 1
bpf_sock_is_valid_access---of 18
bpf_sock_ops_cb_flags_set---of 8
bpf_sock_ops_enable_tx_tstamp---of 6
bpf_sock_ops_get_syn.isra.0---of 20
bpf_sock_ops_getsockopt---of 15
bpf_sock_ops_load_hdr_opt---of 36
bpf_sock_ops_reserve_hdr_opt---of 10
bpf_sock_ops_setsockopt---of 4
bpf_sock_ops_store_hdr_opt---of 27
bpf_tc_sk_lookup_tcp---of 1
bpf_tc_sk_lookup_udp---of 1
bpf_tc_skc_lookup_tcp---of 1
bpf_tcp_check_syncookie---of 36
bpf_tcp_gen_syncookie---of 28
bpf_tcp_raw_check_syncookie_ipv4---of 5
bpf_tcp_raw_check_syncookie_ipv6---of 5
bpf_tcp_raw_gen_syncookie_ipv4---of 8
bpf_tcp_raw_gen_syncookie_ipv6---of 8
bpf_tcp_sock---of 6
bpf_tcp_sock_convert_ctx_access---of 29
bpf_tcp_sock_is_valid_access---of 10
bpf_unlocked_sk_getsockopt---of 1
bpf_unlocked_sk_setsockopt---of 1
bpf_warn_invalid_xdp_action---of 9
bpf_xdp_adjust_head---of 11
bpf_xdp_adjust_meta---of 12
bpf_xdp_adjust_tail---of 44
bpf_xdp_check_mtu---of 11
bpf_xdp_copy---of 1
bpf_xdp_copy_buf---of 22
bpf_xdp_event_output---of 11
bpf_xdp_fib_lookup---of 9
bpf_xdp_get_buff_len---of 4
bpf_xdp_load_bytes---of 7
bpf_xdp_pointer---of 25
bpf_xdp_pull_data---of 33
bpf_xdp_redirect---of 5
bpf_xdp_redirect_map---of 1
bpf_xdp_shrink_data---of 14
bpf_xdp_sk_lookup_tcp---of 1
bpf_xdp_sk_lookup_udp---of 1
bpf_xdp_skc_lookup_tcp---of 1
bpf_xdp_sock_convert_ctx_access---of 3
bpf_xdp_sock_is_valid_access---of 7
bpf_xdp_store_bytes---of 7
btf_id_cmp_func---of 1
cg_skb_func_proto---of 18
cg_skb_is_valid_access---of 19
convert_bpf_ld_abs---of 32
copy_bpf_fprog_from_user---of 11
copy_from_sockptr_offset.constprop.0---of 7
flow_dissector_convert_ctx_access---of 7
flow_dissector_func_proto---of 3
flow_dissector_is_valid_access---of 19
get_page---of 19
init_subsystem---of 1
ip_neigh_gw4---of 14
ip_neigh_gw6---of 12
kmalloc_array_noprof---of 5
lwt_in_func_proto---of 3
lwt_is_valid_access---of 13
lwt_out_func_proto---of 21
lwt_seg6local_func_proto---of 1
lwt_xmit_func_proto---of 18
neigh_hh_output---of 22
netkit_peer_dev---of 1
put_page---of 21
sk_attach_bpf---of 9
sk_attach_filter---of 7
sk_detach_filter---of 14
sk_filter_charge---of 17
sk_filter_func_proto---of 15
sk_filter_is_valid_access---of 9
sk_filter_release---of 8
sk_filter_release_rcu---of 1
sk_filter_trim_cap24%of 50
sk_filter_uncharge---of 1
sk_get_filter---of 25
sk_lookup.constprop.0---of 20
sk_lookup_convert_ctx_access---of 15
sk_lookup_func_proto---of 9
sk_lookup_is_valid_access---of 23
sk_msg_compute_data_pointers---of 4
sk_msg_convert_ctx_access---of 15
sk_msg_func_proto---of 14
sk_msg_is_valid_access---of 16
sk_msg_reset_curr---of 6
sk_reuseport_attach_bpf---of 22
sk_reuseport_attach_filter---of 7
sk_reuseport_convert_ctx_access---of 14
sk_reuseport_func_proto---of 13
sk_reuseport_is_valid_access---of 20
sk_reuseport_load_bytes---of 16
sk_reuseport_load_bytes_relative---of 13
sk_reuseport_prog_free---of 8
sk_select_reuseport---of 27
sk_skb_adjust_room---of 39
sk_skb_change_head---of 1
sk_skb_change_tail---of 1
sk_skb_convert_ctx_access---of 16
sk_skb_func_proto---of 18
sk_skb_is_valid_access---of 14
sk_skb_prologue---of 4
sk_skb_pull_data---of 3
skb_do_redirect---of 195
skb_tunnel_info---of 30
sock_addr_convert_ctx_access---of 82
sock_addr_func_proto---of 18
sock_addr_is_valid_access---of 52
sock_filter_func_proto---of 13
sock_filter_is_valid_access---of 14
sock_ops_convert_ctx_access---of 443
sock_ops_func_proto---of 17
sock_ops_is_valid_access---of 27
sol_ip_sockopt---of 10
sol_ipv6_sockopt---of 10
sol_socket_sockopt---of 17
sol_tcp_sockopt---of 62
tc_cls_act_btf_struct_access---of 4
tc_cls_act_convert_ctx_access---of 4
tc_cls_act_func_proto---of 62
tc_cls_act_is_valid_access---of 14
tc_cls_act_prologue---of 4
trace_xdp_redirect.constprop.0---of 15
trace_xdp_redirect_err---of 15
tracing_iter_filter---of 5
xdp_btf_struct_access---of 4
xdp_convert_ctx_access---of 12
xdp_do_flush---of 26
xdp_do_generic_redirect---of 31
xdp_do_redirect---of 31
xdp_do_redirect_frame---of 24
xdp_func_proto---of 26
xdp_is_valid_access---of 26
xdp_master_redirect---of 6
-----------
SUMMARY24%of 50

acomp_do_nondma---of 4
acomp_do_req_chain---of 15
acomp_reqchain_done---of 4
acomp_reqchain_virt---of 5
acomp_request_clone---of 9
acomp_stream_workfn---of 12
acomp_walk_done_dst---of 7
acomp_walk_done_src---of 7
acomp_walk_next_dst---of 17
acomp_walk_next_src---of 17
acomp_walk_virt---of 19
comp_prepare_alg---of 1
crypto_acomp_alloc_streams63%of 16
crypto_acomp_compress---of 9
crypto_acomp_decompress---of 9
crypto_acomp_exit_tfm---of 8
crypto_acomp_extsize60%of 5
crypto_acomp_free_streams---of 14
crypto_acomp_init_tfm40%of 15
crypto_acomp_lock_stream_bh---of 5
crypto_acomp_report---of 1
crypto_acomp_show---of 1
crypto_alloc_acomp100%of 1
crypto_alloc_acomp_node---of 1
crypto_register_acomp---of 1
crypto_register_acomps---of 8
crypto_unregister_acomp---of 1
crypto_unregister_acomps---of 3
-----------
SUMMARY55%of 37

__skb_fill_netmem_desc---of 18
esp4_err---of 17
esp4_rcv_cb---of 1
esp_alloc_tmp---of 3
esp_destroy---of 5
esp_init_aead.constprop.058%of 14
esp_init_authenc.constprop.058%of 54
esp_init_state75%of 24
esp_input---of 41
esp_input_done---of 1
esp_input_done2---of 56
esp_input_done_esn---of 1
esp_input_restore_header---of 3
esp_output---of 16
esp_output_done---of 42
esp_output_done_esn---of 1
esp_output_head---of 65
esp_output_restore_header---of 1
esp_output_tail---of 50
esp_ssg_unref.isra.0---of 23
get_page---of 19
-----------
SUMMARY62%of 92

__do_sys_set_mempolicy_home_node---of 44
__get_vma_policy---of 6
__ia32_sys_get_mempolicy---of 1
__ia32_sys_mbind---of 1
__ia32_sys_migrate_pages---of 1
__ia32_sys_set_mempolicy---of 1
__ia32_sys_set_mempolicy_home_node---of 1
__mpol_dup---of 9
__mpol_equal---of 15
__mpol_put---of 3
__x64_sys_get_mempolicy---of 1
__x64_sys_mbind---of 1
__x64_sys_migrate_pages---of 1
__x64_sys_set_mempolicy---of 1
__x64_sys_set_mempolicy_home_node---of 1
alloc_frozen_pages_noprof100%of 4
alloc_migration_target_by_mpol---of 20
alloc_pages_bulk_mempolicy_noprof---of 95
alloc_pages_mpol27%of 15
alloc_pages_noprof60%of 5
apply_policy_zone---of 5
do_mbind---of 74
do_migrate_pages---of 50
do_set_mempolicy---of 18
folio_alloc_mpol_noprof---of 4
folio_alloc_noprof---of 5
folio_maybe_mapped_shared---of 13
get_bitmap---of 14
get_il_weight---of 19
get_nodes---of 14
get_task_policy63%of 8
get_vma_policy---of 7
huge_node---of 1
init_nodemask_of_mempolicy---of 10
interleave_nid---of 11
interleave_nodes---of 15
kernel_get_mempolicy---of 133
kernel_mbind---of 16
kernel_migrate_pages---of 62
kernel_set_mempolicy---of 16
mbind_range---of 33
mempolicy_in_oom_domain---of 9
mempolicy_set_node_perf---of 26
mempolicy_slab_node17%of 24
mmap_write_unlock---of 9
mpol_free_shared_policy---of 7
mpol_misplaced---of 41
mpol_new---of 26
mpol_new_nodemask---of 4
mpol_new_preferred---of 6
mpol_parse_str---of 65
mpol_put_task_policy---of 5
mpol_rebind_default---of 1
mpol_rebind_mm---of 18
mpol_rebind_nodemask---of 8
mpol_rebind_policy---of 11
mpol_rebind_preferred---of 1
mpol_rebind_task---of 1
mpol_relative_nodemask---of 1
mpol_set_nodemask---of 14
mpol_set_shared_policy---of 44
mpol_shared_policy_init---of 13
mpol_shared_policy_lookup---of 10
mpol_to_str---of 25
nearest_node_nodemask---of 9
node_show---of 1
node_store---of 27
numa_default_policy---of 1
numa_nearest_node---of 16
policy_nodemask20%of 31
queue_folios_hugetlb---of 58
queue_folios_pte_range---of 54
queue_pages_range---of 9
queue_pages_test_walk---of 25
read_mems_allowed_begin---of 9
read_once_policy_nodemask---of 1
reduce_interleave_weights---of 26
set_page_refcounted42%of 12
sp_alloc---of 8
sp_free---of 4
sp_insert---of 10
sp_lookup.isra.0---of 14
vma_alloc_folio_noprof---of 7
vma_dup_policy---of 7
vma_migratable---of 17
vma_policy_mof---of 15
weighted_interleave_auto_show---of 25
weighted_interleave_auto_store---of 26
weighted_interleave_nid---of 48
weighted_interleave_nodes---of 20
wi_kobj_release---of 1
-----------
SUMMARY32%of 99

-----------
SUMMARY---of 0

ah4_err---of 17
ah4_rcv_cb---of 1
ah_alloc_tmp---of 1
ah_destroy---of 5
ah_init_state55%of 33
ah_input---of 60
ah_input_done---of 13
ah_output---of 31
ah_output_done---of 11
ip_clear_mutable_options---of 18
-----------
SUMMARY55%of 33

__bpf_trace_alloc_vmap_area---of 1
__bpf_trace_free_vmap_area_noflush---of 1
__bpf_trace_purge_vmap_area_lazy---of 1
__find_vmap_area.isra.0---of 9
__get_vm_area_caller---of 1
__get_vm_area_node---of 13
__probestub_alloc_vmap_area---of 1
__probestub_free_vmap_area_noflush---of 1
__probestub_purge_vmap_area_lazy---of 1
__purge_vmap_area_lazy---of 61
__traceiter_alloc_vmap_area---of 7
__traceiter_free_vmap_area_noflush---of 7
__traceiter_purge_vmap_area_lazy---of 7
__vmalloc_node_noprof---of 4
__vmalloc_node_range_noprof---of 100
__vmalloc_noprof---of 1
__vmap_pages_range_noflush---of 94
__vunmap_range_noflush---of 89
_vm_unmap_aliases---of 31
addr_to_vb_xa---of 6
aligned_vread_iter---of 9
alloc_vmap_area---of 234
check_sparse_vm_area---of 16
decay_va_pool_node---of 58
decode_vn_id---of 7
delayed_vfree_work---of 3
drain_vmap_area_work---of 1
find_unlink_vmap_area---of 9
find_vm_area---of 4
find_vmap_area---of 8
find_vmap_area_exceed_addr_lock---of 24
free_purged_blocks---of 3
free_vm_area---of 3
free_vmap_area_noflush---of 22
free_vmap_area_rb_augment_cb_copy---of 1
free_vmap_area_rb_augment_cb_propagate---of 13
free_vmap_area_rb_augment_cb_rotate---of 7
free_vmap_block---of 9
get_vm_area---of 4
get_vm_area_caller---of 4
get_vm_area_page_order---of 1
insert_vmap_area---of 20
insert_vmap_area_augment.constprop.0---of 40
ioremap_page_range---of 22
is_vmalloc_addr38%of 8
is_vmalloc_or_module_addr---of 3
kmalloc_array_noprof---of 5
mod_memcg_page_state.constprop.0---of 42
pcpu_free_vm_areas---of 3
pcpu_get_vm_areas---of 286
perf_trace_alloc_vmap_area---of 7
perf_trace_free_vmap_area_noflush---of 7
perf_trace_purge_vmap_area_lazy---of 7
pfn_valid---of 31
purge_fragmented_block---of 9
purge_vmap_node---of 12
pvm_determine_end_from_reverse---of 10
pvm_find_va_enclose_addr---of 9
rcu_read_unlock_sched---of 9
reclaim_and_purge_vmap_areas---of 26
reclaim_list_global---of 125
register_vmap_purge_notifier---of 1
remap_vmalloc_range---of 1
remap_vmalloc_range_partial---of 33
remove_vm_area---of 11
trace_event_raw_event_alloc_vmap_area---of 8
trace_event_raw_event_free_vmap_area_noflush---of 8
trace_event_raw_event_purge_vmap_area_lazy---of 8
trace_raw_output_alloc_vmap_area---of 4
trace_raw_output_free_vmap_area_noflush---of 4
trace_raw_output_purge_vmap_area_lazy---of 4
unregister_vmap_purge_notifier---of 1
vfree---of 38
vfree_atomic---of 10
vm_area_map_pages---of 4
vm_area_unmap_pages---of 3
vm_map_ram---of 170
vm_unmap_aliases---of 1
vm_unmap_ram---of 21
vmalloc_32_noprof---of 1
vmalloc_32_user_noprof---of 4
vmalloc_dump_obj---of 10
vmalloc_huge_node_noprof---of 4
vmalloc_info_show---of 53
vmalloc_node_noprof---of 1
vmalloc_noprof---of 1
vmalloc_nr_pages---of 1
vmalloc_to_page---of 40
vmalloc_to_pfn---of 1
vmalloc_user_noprof---of 4
vmap---of 14
vmap_block_vaddr---of 3
vmap_node_shrink_count---of 7
vmap_node_shrink_scan---of 3
vmap_page_range---of 3
vmap_pages_range---of 1
vmap_pages_range_noflush---of 1
vmap_pfn---of 6
vmap_pfn_apply---of 15
vmap_range_noflush---of 112
vread_iter---of 63
vrealloc_node_align_noprof---of 40
vunmap---of 9
vunmap_range---of 1
vunmap_range_noflush---of 1
vzalloc_node_noprof---of 1
vzalloc_noprof---of 1
zero_iter---of 9
-----------
SUMMARY38%of 8

-----------
SUMMARY---of 0

__nbcon_atomic_flush_pending---of 35
__nbcon_atomic_flush_pending_con---of 11
__nbcon_context_update_unsafe---of 12
nbcon_alloc---of 17
nbcon_atomic_flush_pending---of 1
nbcon_atomic_flush_unsafe---of 1
nbcon_can_proceed---of 1
nbcon_context_can_proceed---of 11
nbcon_context_release---of 6
nbcon_context_try_acquire---of 62
nbcon_context_try_acquire_requested---of 13
nbcon_cpu_emergency_enter---of 1
nbcon_cpu_emergency_exit---of 7
nbcon_device_release---of 29
nbcon_device_try_acquire---of 5
nbcon_emit_next_record---of 43
nbcon_emit_one---of 10
nbcon_enter_unsafe---of 3
nbcon_exit_unsafe---of 3
nbcon_free---of 6
nbcon_get_cpu_emergency_nesting60%of 5
nbcon_get_default_prio43%of 7
nbcon_irq_work---of 1
nbcon_kthread_create---of 10
nbcon_kthread_func---of 46
nbcon_kthread_stop---of 5
nbcon_kthreads_wake---of 17
nbcon_legacy_emit_next_record---of 7
nbcon_reacquire_nobuf---of 4
nbcon_seq_force---of 1
nbcon_seq_read---of 1
printk_get_console_flush_type---of 37
-----------
SUMMARY50%of 12

-----------
SUMMARY---of 0

__bpf_trace_task_newtask---of 1
__bpf_trace_task_prctl_unknown---of 1
__bpf_trace_task_rename---of 1
__cleanup_sighand---of 8
__delayed_free_task---of 1
__do_sys_clone---of 1
__do_sys_clone3---of 26
__do_sys_fork---of 1
__do_sys_vfork---of 1
__ia32_sys_clone---of 1
__ia32_sys_clone3---of 1
__ia32_sys_set_tid_address---of 1
__ia32_sys_unshare---of 1
__mmdrop---of 22
__mmput---of 10
__probestub_task_newtask---of 1
__probestub_task_prctl_unknown---of 1
__probestub_task_rename---of 1
__put_task_struct---of 19
__put_task_struct_rcu_cb---of 1
__refcount_add.constprop.0---of 10
__traceiter_task_newtask---of 7
__traceiter_task_prctl_unknown---of 7
__traceiter_task_rename---of 7
__x64_sys_clone---of 1
__x64_sys_clone3---of 1
__x64_sys_set_tid_address---of 1
__x64_sys_unshare---of 1
copy_clone_args_from_user---of 42
copy_process---of 327
create_io_thread---of 1
dup_mm_exe_file---of 10
exec_mm_release---of 1
exit_mm_release---of 1
exit_task_stack_account---of 4
free_signal_struct---of 9
free_task---of 7
get_mm_exe_file---of 11
get_task_exe_file---of 8
get_task_mm---of 8
idle_dummy---of 1
kernel_clone---of 51
kernel_thread---of 1
ksys_unshare---of 84
lockdep_tasklist_lock_is_held---of 1
mm_access---of 17
mm_alloc---of 4
mm_init---of 42
mm_release---of 16
mmdrop_async_fn---of 1
mmput---of 4
mmput_async---of 4
mmput_async_fn---of 1
nr_processes---of 8
perf_trace_task_newtask---of 7
perf_trace_task_prctl_unknown---of 7
perf_trace_task_rename---of 13
pidfd_prepare---of 21
ptrace_event_pid---of 24
put_task_stack55%of 11
replace_mm_exe_file---of 48
set_mm_exe_file---of 15
set_task_stack_end_magic---of 1
sighand_ctor---of 1
sysctl_max_threads---of 6
thread_stack_free_rcu---of 4
trace_event_raw_event_task_newtask---of 8
trace_event_raw_event_task_prctl_unknown---of 8
trace_event_raw_event_task_rename---of 14
trace_raw_output_task_newtask---of 4
trace_raw_output_task_prctl_unknown---of 4
trace_raw_output_task_rename---of 5
trace_task_newtask---of 15
unshare_fd.constprop.0---of 9
unshare_files---of 6
user_mode_thread---of 1
walk_process_tree---of 16
-----------
SUMMARY55%of 11

-----------
SUMMARY---of 0

__virt_addr_valid61%of 38
rcu_read_unlock_sched45%of 9
-----------
SUMMARY58%of 47

-----------
SUMMARY---of 0

__flow_hash_consistentify24%of 13
__get_hash_from_flowi6---of 1
__skb_flow_dissect7%of 458
__skb_get_hash_net50%of 12
__skb_get_hash_symmetric_net---of 8
__skb_get_poff---of 19
bpf_flow_dissect---of 22
flow_dissector_bpf_prog_attach_check---of 11
flow_get_u32_dst---of 7
flow_get_u32_src---of 8
flow_hash_from_keys---of 8
flow_hash_from_keys_seed---of 3
make_flow_keys_digest---of 1
skb_flow_dissect_ct---of 9
skb_flow_dissect_hash---of 5
skb_flow_dissect_meta---of 5
skb_flow_dissect_tunnel_info---of 73
skb_flow_dissector_init---of 12
skb_flow_get_icmp_tci---of 13
skb_flow_get_ports---of 17
skb_get_hash_perturb---of 3
skb_get_poff---of 3
-----------
SUMMARY9%of 483

-----------
SUMMARY---of 0

__hmac_sha512_final100%of 3
__hmac_sha512_init100%of 1
__hmac_sha512_preparekey70%of 10
__sha512_final84%of 6
__sha512_update45%of 18
hmac_sha384---of 1
hmac_sha384_final---of 1
hmac_sha384_init_usingrawkey---of 1
hmac_sha384_preparekey---of 1
hmac_sha384_usingrawkey---of 1
hmac_sha512---of 1
hmac_sha512_final100%of 1
hmac_sha512_init_usingrawkey---of 1
hmac_sha512_preparekey100%of 1
hmac_sha512_usingrawkey---of 1
sha384---of 1
sha384_final---of 1
sha384_init---of 1
sha512---of 1
sha512_block_generic---of 9
sha512_blocks_avx---of 7
sha512_blocks_avx243%of 7
sha512_blocks_generic---of 4
sha512_blocks_ssse3---of 7
sha512_final---of 1
sha512_init---of 1
-----------
SUMMARY62%of 47

-----------
SUMMARY---of 0

__timekeeping_advance.constprop.0---of 57
__timekeeping_inject_offset.constprop.0---of 17
change_clocksource---of 19
delta_to_ns_safe.isra.0---of 1
do_adjtimex---of 74
do_settimeofday64---of 29
do_timer---of 1
dummy_clock_read---of 5
get_device_system_crosststamp---of 80
getboottime64---of 1
ktime_get---of 16
ktime_get_boot_fast_ns---of 1
ktime_get_clock_ts64---of 11
ktime_get_coarse_real_ts64---of 10
ktime_get_coarse_real_ts64_mg---of 17
ktime_get_coarse_ts64---of 10
ktime_get_coarse_with_offset---of 11
ktime_get_mono_fast_ns---of 9
ktime_get_ntp_seconds---of 1
ktime_get_raw---of 15
ktime_get_raw_fast_ns---of 9
ktime_get_raw_ts64---of 19
ktime_get_real_fast_ns---of 9
ktime_get_real_seconds100%of 1
ktime_get_real_ts64---of 20
ktime_get_real_ts64_mg---of 22
ktime_get_resolution_ns---of 11
ktime_get_seconds---of 3
ktime_get_snapshot---of 21
ktime_get_tai_fast_ns---of 1
ktime_get_ts64---of 20
ktime_get_update_offsets_now---of 20
ktime_get_with_offset50%of 16
ktime_mono_to_any---of 1
ktime_real_to_base_clock---of 22
pvclock_gtod_register_notifier---of 4
pvclock_gtod_unregister_notifier---of 4
random_get_entropy_fallback---of 6
scale64_check_overflow---of 6
timekeeper_lock_irqsave---of 1
timekeeper_unlock_irqrestore---of 1
timekeeping_clocksource_has_base---of 6
timekeeping_forward_now---of 12
timekeeping_max_deferment---of 10
timekeeping_notify---of 5
timekeeping_restore_shadow.constprop.0---of 5
timekeeping_resume---of 22
timekeeping_suspend---of 9
timekeeping_update_from_shadow.constprop.0---of 20
timekeeping_valid_for_hres---of 10
timekeeping_warp_clock---of 8
tk_set_wall_to_mono---of 9
tk_setup_internals.constprop.0---of 9
tk_xtime_add---of 7
update_fast_timekeeper---of 1
update_wall_time---of 7
-----------
SUMMARY53%of 17

-----------
SUMMARY---of 0

__bpf_trace_exit_mmap---of 1
__bpf_trace_vm_unmapped_area---of 1
__do_sys_brk---of 34
__do_sys_remap_file_pages---of 65
__get_unmapped_area---of 28
__ia32_sys_brk---of 1
__ia32_sys_mmap_pgoff---of 1
__ia32_sys_munmap---of 1
__ia32_sys_remap_file_pages---of 1
__probestub_exit_mmap---of 1
__probestub_vm_unmapped_area---of 1
__traceiter_exit_mmap---of 8
__traceiter_vm_unmapped_area---of 7
__x64_sys_brk---of 1
__x64_sys_mmap_pgoff---of 1
__x64_sys_munmap---of 1
__x64_sys_remap_file_pages---of 1
_install_special_mapping---of 6
check_brk_limits---of 6
do_mmap---of 129
do_munmap---of 1
dup_mmap---of 117
exit_mmap---of 65
expand_stack---of 23
expand_stack_locked---of 1
find_extend_vma_locked---of 9
find_vma84%of 6
find_vma_intersection---of 6
find_vma_prev---of 3
generic_get_unmapped_area---of 27
generic_get_unmapped_area_topdown---of 33
init_admin_reserve---of 1
init_user_reserve---of 1
ksys_mmap_pgoff---of 27
may_expand_vm---of 15
mlock_future_ok---of 7
mm_get_unmapped_area---of 1
mm_get_unmapped_area_vmflags---of 4
mmap_read_lock_maybe_expand---of 31
mmap_write_lock_killable---of 9
mmap_write_unlock---of 9
perf_trace_exit_mmap---of 7
perf_trace_vm_unmapped_area---of 7
special_mapping_close---of 5
special_mapping_fault---of 27
special_mapping_mremap---of 8
special_mapping_name---of 1
special_mapping_split---of 1
trace_event_raw_event_exit_mmap---of 8
trace_event_raw_event_vm_unmapped_area---of 8
trace_raw_output_exit_mmap---of 5
trace_raw_output_vm_unmapped_area---of 7
vm_brk_flags---of 18
vm_munmap---of 1
vm_stat_account---of 10
vm_unmapped_area---of 18
vma_is_special_mapping---of 5
vma_set_page_prot---of 3
-----------
SUMMARY84%of 6

__bad_area---of 7
__bad_area_nosemaphore---of 29
__bpf_trace_exceptions---of 1
__pkru_allows_pkey50%of 8
__probestub_page_fault_kernel---of 1
__probestub_page_fault_user---of 1
__traceiter_page_fault_kernel---of 7
__traceiter_page_fault_user---of 7
access_error53%of 21
bad_area_access_error---of 13
bad_area_nosemaphore---of 1
do_kern_addr_fault---of 16
do_user_addr_fault29%of 110
dump_pagetable---of 33
fault_in_kernel_space60%of 5
fault_signal_pending25%of 12
is_prefetch.isra.0---of 37
kernelmode_fixup_or_oops.isra.0---of 7
page_fault_oops---of 51
perf_trace_exceptions---of 7
pgtable_bad---of 3
show_ldttss---of 8
spurious_kernel_fault---of 28
spurious_kernel_fault_check---of 14
trace_event_raw_event_exceptions---of 8
trace_page_fault_kernel34%of 15
trace_page_fault_user---of 15
trace_raw_output_exceptions---of 4
vma_refcount_put---of 10
-----------
SUMMARY34%of 171

-----------
SUMMARY---of 0

zlib_adler32---of 14
zlib_inflate---of 280
zlib_inflateEnd---of 5
zlib_inflateIncomp---of 6
zlib_inflateInit2---of 9
zlib_inflateReset---of 6
zlib_inflate_workspacesize100%of 1
zlib_updatewindow---of 9
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

ipcomp_destroy---of 5
ipcomp_init_state57%of 16
ipcomp_input---of 17
ipcomp_input_done---of 1
ipcomp_input_done2---of 5
ipcomp_output---of 10
ipcomp_output_done---of 8
ipcomp_output_done2---of 4
ipcomp_post_acomp---of 32
ipcomp_setup_req---of 46
-----------
SUMMARY57%of 16

-----------
SUMMARY---of 0

__bpf_trace_netlink_extack---of 1
__netlink_change_ngroups---of 15
__netlink_clear_multicast_users---of 7
__netlink_create---of 4
__netlink_dump_start64%of 33
__netlink_kernel_create---of 29
__netlink_lookup60%of 52
__netlink_ns_capable100%of 7
__netlink_sendskb100%of 1
__netlink_seq_next---of 12
__nlmsg_put100%of 1
__probestub_netlink_extack---of 1
__traceiter_netlink_extack---of 8
deferred_put_nlk_sk---of 8
do_trace_netlink_extack34%of 15
jhash2.constprop.0100%of 1
net_generic44%of 16
netlink_ack68%of 31
netlink_ack_tlv_fill---of 24
netlink_ack_tlv_len20%of 21
netlink_add_tap---of 4
netlink_alloc_large_skb73%of 11
netlink_attachskb14%of 43
netlink_autobind.isra.059%of 24
netlink_bind---of 46
netlink_broadcast---of 1
netlink_broadcast_filtered61%of 81
netlink_capable---of 1
netlink_change_ngroups---of 1
netlink_compare---of 5
netlink_compare_arg_init100%of 1
netlink_connect---of 21
netlink_create---of 20
netlink_data_ready---of 1
netlink_deliver_tap53%of 42
netlink_detachskb---of 6
netlink_dump60%of 42
netlink_dump_done47%of 15
netlink_getname---of 13
netlink_getsockbyfd---of 15
netlink_getsockopt---of 35
netlink_has_listeners---of 22
netlink_hash---of 1
netlink_insert41%of 125
netlink_ioctl---of 1
netlink_kernel_release---of 8
netlink_lookup48%of 21
netlink_net_capable100%of 1
netlink_net_exit---of 1
netlink_net_init---of 3
netlink_ns_capable---of 1
netlink_overrun---of 10
netlink_rcv_skb95%of 17
netlink_realloc_groups---of 10
netlink_recvmsg---of 38
netlink_register_notifier---of 1
netlink_release---of 144
netlink_remove_tap---of 8
netlink_sendmsg62%of 62
netlink_sendskb---of 6
netlink_seq_next---of 1
netlink_seq_show---of 14
netlink_seq_start---of 10
netlink_seq_stop---of 7
netlink_set_err---of 23
netlink_setsockopt---of 44
netlink_skb_destructor37%of 11
netlink_skb_set_owner_r58%of 7
netlink_sock_destruct---of 10
netlink_strict_get_check---of 1
netlink_table_grab---of 12
netlink_table_ungrab---of 1
netlink_tap_init_net---of 1
netlink_trim59%of 12
netlink_undo_bind---of 8
netlink_unicast56%of 49
netlink_unregister_notifier---of 1
netlink_update_listeners---of 18
netlink_update_socket_mc---of 4
netlink_update_subscriptions---of 12
nlmsg_check_in_payload---of 5
nlmsg_notify---of 21
perf_trace_netlink_extack---of 9
trace_event_get_offsets_netlink_extack---of 3
trace_event_raw_event_netlink_extack---of 10
trace_raw_output_netlink_extack---of 5
-----------
SUMMARY53%of 742

-----------
SUMMARY---of 0

always_on---of 1
nlmon_close---of 1
nlmon_get_stats64---of 1
nlmon_open---of 1
nlmon_setup---of 1
nlmon_validate---of 3
nlmon_xmit100%of 1
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

deflate_fast---of 42
deflate_slow---of 57
deflate_stored---of 32
fill_window---of 44
flush_pending---of 14
longest_match---of 42
zlib_deflate---of 70
zlib_deflateEnd---of 12
zlib_deflateInit2---of 24
zlib_deflateReset---of 10
zlib_deflate_dfltcc_enabled---of 1
zlib_deflate_workspacesize50%of 8
-----------
SUMMARY50%of 8

-----------
SUMMARY---of 0

__do_compat_sys_wait4---of 8
__do_compat_sys_waitid---of 26
__do_sys_exit.isra.0---of 1
__do_sys_exit_group.isra.0---of 1
__do_sys_wait4---of 8
__do_sys_waitid---of 26
__do_wait---of 42
__ia32_compat_sys_wait4---of 1
__ia32_compat_sys_waitid---of 1
__ia32_sys_exit---of 1
__ia32_sys_exit_group---of 1
__ia32_sys_wait4---of 1
__ia32_sys_waitid---of 1
__ia32_sys_waitpid---of 1
__wake_up_parent---of 1
__x64_sys_exit---of 1
__x64_sys_exit_group---of 1
__x64_sys_wait4---of 1
__x64_sys_waitid---of 1
__x64_sys_waitpid---of 1
abort---of 1
child_wait_callback---of 3
delayed_put_task_struct---of 20
do_exit---of 197
do_group_exit---of 9
do_wait---of 29
find_alive_thread.isra.0---of 5
is_current_pgrp_orphaned---of 1
is_effectively_child---of 10
kernel_wait---of 5
kernel_wait4---of 18
kernel_waitid---of 8
kernel_waitid_prepare---of 23
kill_orphaned_pgrp---of 20
make_task_dead---of 23
mm_update_next_owner---of 22
oops_count_show---of 1
pid_child_should_wake---of 13
put_task_struct_rcu_user75%of 8
rcuwait_wake_up---of 19
release_task---of 69
stack_not_used---of 4
task_stopped_code---of 12
try_to_set_owner---of 10
wait_consider_task---of 212
will_become_orphaned_pgrp---of 23
-----------
SUMMARY75%of 8

__hrtimer_cb_get_time28%of 11
__hrtimer_get_next_event---of 6
__hrtimer_get_remaining---of 1
__hrtimer_next_event_base---of 16
__hrtimer_reprogram.isra.0---of 7
__hrtimer_run_queues---of 53
__hrtimer_setup35%of 20
__ia32_sys_nanosleep---of 11
__ia32_sys_nanosleep_time32---of 13
__remove_hrtimer72%of 7
__x64_sys_nanosleep---of 11
__x64_sys_nanosleep_time32---of 13
clock_was_set---of 35
clock_was_set_delayed---of 1
clock_was_set_work---of 1
do_nanosleep---of 24
enqueue_hrtimer36%of 17
hrtimer_active---of 12
hrtimer_cancel---of 4
hrtimer_cb_get_time---of 1
hrtimer_dummy_timeout---of 1
hrtimer_force_reprogram---of 7
hrtimer_forward---of 21
hrtimer_get_next_event---of 3
hrtimer_interrupt---of 29
hrtimer_nanosleep---of 12
hrtimer_nanosleep_restart---of 1
hrtimer_next_event_without---of 6
hrtimer_reprogram35%of 20
hrtimer_run_queues---of 22
hrtimer_run_softirq---of 4
hrtimer_setup100%of 1
hrtimer_setup_on_stack---of 1
hrtimer_setup_sleeper_on_stack---of 1
hrtimer_sleeper_start_expires---of 1
hrtimer_start_range_ns45%of 52
hrtimer_try_to_cancel---of 8
hrtimer_update_next_event---of 8
hrtimer_wakeup---of 5
hrtimers_cpu_dying---of 9
hrtimers_cpu_starting---of 1
hrtimers_prepare_cpu---of 3
hrtimers_resume_local---of 7
ktime_add_safe---of 7
lock_hrtimer_base58%of 7
nanosleep_copyout---of 8
retrigger_next_event---of 4
trace_hrtimer_cancel34%of 15
trace_hrtimer_setup34%of 15
-----------
SUMMARY40%of 165

-----------
SUMMARY---of 0

ahash_def_finup_done1---of 9
ahash_def_finup_done2---of 5
ahash_def_finup_finish1---of 9
ahash_default_export_core---of 1
ahash_default_import_core---of 1
ahash_do_req_chain---of 31
ahash_finup_done---of 9
ahash_finup_finish---of 7
ahash_free_singlespawn_instance---of 1
ahash_nosetkey---of 1
ahash_prepare_alg---of 28
ahash_register_instance---of 7
ahash_request_free---of 8
ahash_update_done---of 9
ahash_update_finish---of 10
crypto_ahash_digest---of 9
crypto_ahash_exit_tfm---of 10
crypto_ahash_export---of 6
crypto_ahash_export_core---of 5
crypto_ahash_extsize60%of 5
crypto_ahash_finup---of 33
crypto_ahash_free_instance---of 1
crypto_ahash_import---of 8
crypto_ahash_import_core---of 6
crypto_ahash_init---of 12
crypto_ahash_init_tfm18%of 35
crypto_ahash_report---of 1
crypto_ahash_setkey42%of 12
crypto_ahash_show---of 3
crypto_ahash_update---of 29
crypto_alloc_ahash100%of 1
crypto_clone_ahash---of 37
crypto_exit_ahash_using_shash100%of 1
crypto_grab_ahash---of 1
crypto_has_ahash100%of 1
crypto_hash_alg_has_setkey---of 5
crypto_hash_digest---of 1
crypto_hash_walk_done---of 16
crypto_hash_walk_first---of 8
crypto_register_ahash---of 3
crypto_register_ahashes---of 8
crypto_unregister_ahash---of 1
crypto_unregister_ahashes---of 3
hash_walk_new_entry---of 4
shash_ahash_digest---of 10
shash_ahash_finup---of 8
shash_ahash_update---of 3
-----------
SUMMARY31%of 55

-----------
SUMMARY---of 0

__check_object_size49%of 49
check_stack_object67%of 27
usercopy_abort---of 7
-----------
SUMMARY56%of 76

authenc_esn_geniv_ahash_done---of 3
authenc_esn_verify_ahash_done---of 7
crypto_authenc_esn_create---of 15
crypto_authenc_esn_decrypt---of 10
crypto_authenc_esn_decrypt_tail---of 7
crypto_authenc_esn_encrypt---of 6
crypto_authenc_esn_encrypt_done---of 7
crypto_authenc_esn_exit_tfm---of 1
crypto_authenc_esn_free---of 1
crypto_authenc_esn_genicv---of 4
crypto_authenc_esn_genicv_tail.isra.0---of 1
crypto_authenc_esn_init_tfm67%of 6
crypto_authenc_esn_setauthsize100%of 3
crypto_authenc_esn_setkey67%of 6
-----------
SUMMARY74%of 15

-----------
SUMMARY---of 0

___pskb_trim---of 57
__alloc_skb50%of 22
__build_skb60%of 5
__build_skb_around29%of 7
__consume_stateless_skb---of 1
__copy_skb_header25%of 28
__finalize_skb_around100%of 1
__kfree_skb---of 1
__napi_alloc_frag_align---of 18
__napi_build_skb---of 5
__napi_kfree_skb---of 1
__netdev_alloc_frag_align---of 7
__netdev_alloc_skb---of 41
__pskb_copy_fclone---of 25
__pskb_pull_tail---of 81
__skb_checksum_complete---of 8
__skb_checksum_complete_head---of 8
__skb_clone75%of 4
__skb_complete_tx_timestamp---of 14
__skb_ext_alloc---of 4
__skb_ext_del---of 7
__skb_ext_put---of 13
__skb_ext_set---of 3
__skb_fill_netmem_desc---of 18
__skb_pad---of 24
__skb_send_sock---of 62
__skb_splice_bits.isra.0---of 32
__skb_to_sgvec---of 47
__skb_tstamp_tx---of 55
__skb_unclone_keeptruesize---of 12
__skb_vlan_pop---of 24
__skb_warn_lro_forwarding---of 4
__skb_zcopy_downgrade_managed---of 3
__splice_segment---of 27
alloc_skb_for_msg---of 5
alloc_skb_with_frags---of 24
build_skb---of 7
build_skb_around---of 6
consume_skb84%of 12
csum_and_copy_from_iter_full---of 134
drop_reasons_register_subsys---of 5
drop_reasons_unregister_subsys---of 5
folio_order---of 5
get_netmem---of 8
get_page---of 19
kfree_skb_list_reason55%of 22
kfree_skb_partial---of 5
kfree_skbmem20%of 15
kmalloc_reserve70%of 13
mm_account_pinned_pages---of 23
mm_unaccount_pinned_pages---of 5
msg_zerocopy_complete---of 28
msg_zerocopy_put_abort---of 4
msg_zerocopy_realloc---of 53
napi_alloc_skb---of 35
napi_build_skb---of 6
napi_consume_skb---of 18
napi_pp_put_page---of 5
napi_skb_cache_get---of 27
napi_skb_cache_get_bulk---of 29
napi_skb_cache_put---of 25
napi_skb_free_stolen_head---of 20
netmem_compound_head---of 17
page_pool_put_netmem.constprop.0---of 9
pskb_carve---of 76
pskb_expand_head21%of 49
pskb_extract---of 12
pskb_put---of 4
pskb_trim_rcsum_slow---of 15
put_netmem---of 8
put_page---of 21
sendmsg_locked---of 6
sendmsg_unlocked---of 4
sk_skb_reason_drop80%of 15
skb_abort_seq_read---of 7
skb_add_rx_frag_netmem---of 1
skb_append---of 1
skb_append_pagefrags---of 17
skb_attempt_defer_free---of 18
skb_checksum---of 38
skb_checksum_setup---of 38
skb_checksum_setup_ip---of 15
skb_checksum_trimmed---of 25
skb_clone48%of 19
skb_clone_fraglist.isra.0---of 10
skb_clone_sk---of 25
skb_coalesce_rx_frag---of 1
skb_complete_tx_timestamp---of 24
skb_complete_wifi_ack---of 23
skb_condense---of 12
skb_copy---of 12
skb_copy_and_csum_bits---of 38
skb_copy_and_csum_dev---of 9
skb_copy_bits---of 38
skb_copy_expand---of 14
skb_copy_from_linear_data---of 1
skb_copy_from_linear_data_offset---of 1
skb_copy_header---of 1
skb_copy_seq_read---of 7
skb_copy_ubufs---of 70
skb_cow_data---of 61
skb_cow_data_for_xdp---of 4
skb_crc32c---of 34
skb_dequeue---of 6
skb_dequeue_tail---of 6
skb_dump---of 53
skb_ensure_writable---of 18
skb_ensure_writable_head_tail---of 14
skb_errqueue_purge---of 11
skb_eth_pop---of 16
skb_eth_push---of 19
skb_expand_head---of 39
skb_ext_add---of 26
skb_ext_put_sp---of 8
skb_find_text---of 6
skb_free_head25%of 12
skb_headers_offset_update67%of 6
skb_kfree_head100%of 5
skb_may_tx_timestamp---of 11
skb_maybe_pull_tail---of 7
skb_mod_eth_type---of 3
skb_morph---of 1
skb_mpls_dec_ttl---of 15
skb_mpls_pop---of 18
skb_mpls_push---of 27
skb_mpls_update_lse---of 10
skb_panic---of 4
skb_partial_csum_set---of 8
skb_pp_cow_data---of 43
skb_prepare_seq_read---of 1
skb_pull67%of 6
skb_pull_data---of 9
skb_pull_rcsum---of 10
skb_push---of 3
skb_put60%of 5
skb_queue_head---of 1
skb_queue_purge_reason25%of 12
skb_queue_tail100%of 1
skb_rbtree_purge---of 3
skb_realloc_headroom---of 9
skb_release_all60%of 5
skb_release_data57%of 30
skb_release_head_state45%of 18
skb_scrub_packet---of 19
skb_segment---of 194
skb_segment_list---of 59
skb_send_sock---of 1
skb_send_sock_locked---of 1
skb_send_sock_locked_with_flags---of 1
skb_seq_read---of 48
skb_shift---of 86
skb_splice_bits---of 3
skb_splice_from_iter---of 47
skb_split---of 18
skb_store_bits---of 38
skb_to_sgvec---of 4
skb_to_sgvec_nomark---of 1
skb_trim80%of 5
skb_try_coalesce---of 65
skb_ts_finish---of 7
skb_ts_get_next_block---of 1
skb_tstamp_tx---of 1
skb_tx_error---of 13
skb_unlink---of 1
skb_vlan_pop---of 12
skb_vlan_push---of 21
skb_vlan_untag---of 42
skb_zerocopy---of 31
skb_zerocopy_clone---of 37
skb_zerocopy_headlen---of 10
skb_zerocopy_iter_stream---of 36
slab_build_skb---of 6
sock_dequeue_err_skb---of 29
sock_queue_err_skb---of 34
sock_rmem_free---of 1
sock_spd_release---of 1
trace_consume_skb34%of 15
trace_kfree_skb34%of 15
virt_to_head_page---of 13
-----------
SUMMARY46%of 347

-----------
SUMMARY---of 0

kasprintf---of 1
kvasprintf60%of 5
kvasprintf_const---of 9
-----------
SUMMARY60%of 5

__crypto_sha256_export---of 1
__crypto_sha256_export_core---of 1
__crypto_sha256_import---of 1
__crypto_sha256_import_core---of 1
crypto_hmac_sha224_digest---of 1
crypto_hmac_sha224_export---of 1
crypto_hmac_sha224_export_core---of 1
crypto_hmac_sha224_final---of 1
crypto_hmac_sha224_import---of 1
crypto_hmac_sha224_import_core---of 1
crypto_hmac_sha224_init---of 1
crypto_hmac_sha224_setkey---of 1
crypto_hmac_sha224_update---of 1
crypto_hmac_sha256_digest---of 1
crypto_hmac_sha256_export---of 1
crypto_hmac_sha256_export_core---of 1
crypto_hmac_sha256_final---of 1
crypto_hmac_sha256_import---of 1
crypto_hmac_sha256_import_core---of 1
crypto_hmac_sha256_init---of 1
crypto_hmac_sha256_setkey100%of 1
crypto_hmac_sha256_update---of 1
crypto_sha224_digest---of 1
crypto_sha224_export---of 1
crypto_sha224_export_core---of 1
crypto_sha224_final---of 1
crypto_sha224_import---of 1
crypto_sha224_import_core---of 1
crypto_sha224_init---of 1
crypto_sha224_update---of 1
crypto_sha256_digest---of 1
crypto_sha256_export---of 1
crypto_sha256_export_core---of 1
crypto_sha256_final---of 1
crypto_sha256_import---of 1
crypto_sha256_import_core---of 1
crypto_sha256_init---of 1
crypto_sha256_update---of 1
-----------
SUMMARY100%of 1

__irq_work_queue_local26%of 31
irq_work_claim80%of 5
irq_work_needs_cpu---of 10
irq_work_queue84%of 6
irq_work_queue_on---of 13
irq_work_run---of 1
irq_work_run_list---of 9
irq_work_single---of 7
irq_work_sync---of 18
irq_work_tick---of 5
-----------
SUMMARY41%of 42

-----------
SUMMARY---of 0

crypto_alloc_skcipher---of 1
crypto_alloc_sync_skcipher---of 6
crypto_grab_skcipher---of 1
crypto_has_skcipher100%of 1
crypto_register_skcipher---of 3
crypto_register_skciphers---of 8
crypto_skcipher_decrypt---of 7
crypto_skcipher_encrypt---of 7
crypto_skcipher_exit_tfm---of 1
crypto_skcipher_export---of 4
crypto_skcipher_extsize100%of 3
crypto_skcipher_free_instance---of 1
crypto_skcipher_import---of 4
crypto_skcipher_init_tfm64%of 11
crypto_skcipher_report---of 1
crypto_skcipher_setkey63%of 16
crypto_skcipher_show---of 3
crypto_unregister_skcipher---of 1
crypto_unregister_skciphers---of 3
skcipher_alloc_instance_simple---of 11
skcipher_exit_tfm_simple---of 1
skcipher_free_instance_simple---of 1
skcipher_init_tfm_simple---of 5
skcipher_noexport---of 1
skcipher_noimport---of 1
skcipher_prepare_alg---of 15
skcipher_prepare_alg_common---of 12
skcipher_register_instance---of 7
skcipher_setkey_simple---of 1
skcipher_walk_aead_common---of 18
skcipher_walk_aead_decrypt---of 1
skcipher_walk_aead_encrypt---of 1
skcipher_walk_virt---of 11
-----------
SUMMARY68%of 31

__cyc2ns_read---of 5
__set_cyc2ns_scale---of 9
calibrate_delay_is_known---of 20
check_tsc_unstable---of 1
cyc2ns_read_begin---of 4
cyc2ns_read_end---of 4
mark_tsc_unstable---of 7
native_calibrate_cpu---of 4
native_calibrate_cpu_early---of 42
native_calibrate_tsc---of 28
native_sched_clock_from_tsc---of 6
pit_hpet_ptimer_calibrate_cpu---of 69
read_tsc100%of 1
recalibrate_cpu_khz---of 1
sched_clock50%of 4
set_cyc2ns_scale---of 8
time_cpufreq_notifier---of 15
tsc_clocksource_watchdog_disabled---of 6
tsc_cs_enable---of 1
tsc_cs_mark_unstable---of 7
tsc_cs_tick_stable---of 7
tsc_read_refs---of 12
tsc_refine_calibration_work---of 40
tsc_restore_sched_clock_state---of 17
tsc_resume---of 1
tsc_save_sched_clock_state---of 6
unsynchronized_tsc---of 13
using_native_sched_clock---of 1
-----------
SUMMARY60%of 5

null_crypt---of 1
null_digest---of 1
null_final---of 1
null_hash_setkey100%of 1
null_init---of 1
null_setkey---of 1
null_skcipher_crypt---of 5
null_skcipher_setkey100%of 1
null_update---of 1
-----------
SUMMARY100%of 2

-----------
SUMMARY---of 0

__nla_parse100%of 1
__nla_put100%of 1
__nla_put_64bit100%of 1
__nla_put_nohdr---of 1
__nla_reserve100%of 1
__nla_reserve_64bit---of 1
__nla_reserve_nohdr---of 1
__nla_validate---of 1
__nla_validate_parse29%of 219
nla_append---of 6
nla_find---of 8
nla_get_range_signed---of 15
nla_get_range_unsigned40%of 20
nla_memcmp---of 3
nla_memcpy---of 3
nla_policy_len---of 9
nla_put84%of 6
nla_put_64bit84%of 6
nla_put_nohdr---of 6
nla_reserve100%of 6
nla_reserve_64bit---of 6
nla_reserve_nohdr---of 6
nla_strcmp---of 7
nla_strdup---of 8
nla_strscpy---of 12
-----------
SUMMARY35%of 261

-----------
SUMMARY---of 0

ipip_destroy---of 1
ipip_init_state40%of 10
ipip_output---of 1
ipip_xfrm_rcv---of 1
xfrm_tunnel_err---of 1
xfrm_tunnel_rcv---of 1
-----------
SUMMARY40%of 10

-----------
SUMMARY---of 0

__hsiphash_unaligned---of 6
__siphash_unaligned84%of 6
hsiphash_1u32---of 1
hsiphash_2u32---of 1
hsiphash_3u32---of 1
hsiphash_4u32---of 1
siphash_1u32---of 1
siphash_1u64---of 1
siphash_2u64---of 1
siphash_3u32---of 1
siphash_3u64---of 1
siphash_4u64---of 1
-----------
SUMMARY84%of 6

selinux_nlmsg_lookup20%of 31
-----------
SUMMARY20%of 31

-----------
SUMMARY---of 0

__inode_security_revalidate---of 6
audit_inode_permission---of 4
bad_option.isra.0---of 8
bpf_fd_pass---of 11
check_nnp_nosuid.isra.0---of 18
copy_to_sockptr_offset.constprop.0---of 10
cred_has_capability.isra.056%of 20
delayed_superblock_init---of 1
file_has_perm---of 11
file_map_prot_check---of 23
has_cap_mac_admin---of 7
inode_doinit_use_xattr---of 19
inode_doinit_with_dentry---of 75
inode_has_perm---of 7
ioctl_has_perm.constprop.0.isra.0---of 12
ipc_has_perm---of 1
match_file---of 11
may_context_mount_inode_relabel.isra.0---of 3
may_context_mount_sb_relabel.isra.0---of 3
may_create---of 10
may_link---of 16
ptrace_parent_sid---of 20
sb_finish_set_opts---of 43
selinux_add_opt---of 32
selinux_binder_set_context_mgr---of 1
selinux_binder_transaction---of 7
selinux_binder_transfer_binder---of 1
selinux_binder_transfer_file---of 23
selinux_bpf---of 7
selinux_bpf_map---of 5
selinux_bpf_map_create---of 1
selinux_bpf_prog---of 1
selinux_bpf_prog_load---of 1
selinux_bpf_token_create---of 1
selinux_bprm_committed_creds---of 11
selinux_bprm_committing_creds---of 21
selinux_bprm_creds_for_exec---of 30
selinux_capable100%of 1
selinux_capget---of 1
selinux_capset---of 1
selinux_complete_init---of 1
selinux_cred_getlsmprop---of 1
selinux_cred_getsecid---of 1
selinux_cred_prepare---of 1
selinux_cred_transfer---of 1
selinux_current_getlsmprop_subj---of 1
selinux_d_instantiate---of 5
selinux_dentry_create_files_as---of 12
selinux_dentry_init_security---of 15
selinux_determine_inode_label---of 15
selinux_file_alloc_security---of 1
selinux_file_fcntl---of 11
selinux_file_ioctl---of 9
selinux_file_ioctl_compat---of 7
selinux_file_lock---of 1
selinux_file_mprotect---of 26
selinux_file_open---of 18
selinux_file_permission---of 32
selinux_file_receive---of 9
selinux_file_send_sigiotask---of 4
selinux_file_set_fowner---of 1
selinux_free_mnt_opts---of 1
selinux_fs_context_dup---of 5
selinux_fs_context_parse_param---of 4
selinux_fs_context_submount---of 12
selinux_getprocattr---of 5
selinux_getselfattr---of 6
selinux_inet_conn_established---of 5
selinux_inet_conn_request---of 9
selinux_inet_csk_clone---of 1
selinux_inet_sys_rcv_skb---of 5
selinux_inode_alloc_security---of 4
selinux_inode_copy_up---of 8
selinux_inode_copy_up_xattr---of 5
selinux_inode_create---of 1
selinux_inode_file_getattr---of 6
selinux_inode_file_setattr---of 6
selinux_inode_follow_link---of 9
selinux_inode_free_security---of 9
selinux_inode_get_acl---of 6
selinux_inode_getattr---of 11
selinux_inode_getlsmprop---of 4
selinux_inode_getsecctx---of 5
selinux_inode_getsecurity---of 18
selinux_inode_getxattr---of 6
selinux_inode_init_security---of 28
selinux_inode_init_security_anon---of 20
selinux_inode_invalidate_secctx---of 4
selinux_inode_link---of 1
selinux_inode_listsecurity---of 9
selinux_inode_listxattr---of 6
selinux_inode_mkdir---of 1
selinux_inode_mknod---of 9
selinux_inode_notifysecctx---of 3
selinux_inode_permission---of 60
selinux_inode_post_setxattr---of 21
selinux_inode_readlink---of 6
selinux_inode_remove_acl---of 6
selinux_inode_removexattr---of 11
selinux_inode_rename---of 46
selinux_inode_rmdir---of 1
selinux_inode_set_acl---of 6
selinux_inode_setattr---of 23
selinux_inode_setsecctx---of 1
selinux_inode_setsecurity---of 22
selinux_inode_setxattr---of 35
selinux_inode_symlink---of 1
selinux_inode_unlink---of 1
selinux_inode_xattr_skipcap---of 1
selinux_ip_forward---of 18
selinux_ip_output---of 16
selinux_ip_postroute---of 71
selinux_ip_postroute_compat---of 15
selinux_ipc_getlsmprop---of 1
selinux_ipc_permission---of 9
selinux_ismaclabel---of 1
selinux_kernel_act_as---of 3
selinux_kernel_create_files_as---of 8
selinux_kernel_load_data---of 15
selinux_kernel_load_from_file---of 13
selinux_kernel_module_request100%of 1
selinux_kernel_read_file---of 15
selinux_kernfs_init_security---of 26
selinux_key_alloc---of 4
selinux_key_getsecurity---of 3
selinux_key_permission---of 13
selinux_lsm_getattr---of 38
selinux_lsm_notifier_avc_callback---of 4
selinux_lsm_setattr---of 53
selinux_lsmprop_to_secctx---of 1
selinux_mmap_addr---of 4
selinux_mmap_file---of 6
selinux_mount---of 9
selinux_move_mount---of 6
selinux_mptcp_add_subflow---of 1
selinux_msg_msg_alloc_security---of 1
selinux_msg_queue_alloc_security---of 1
selinux_msg_queue_associate---of 1
selinux_msg_queue_msgctl---of 10
selinux_msg_queue_msgrcv---of 3
selinux_msg_queue_msgsnd---of 7
selinux_netcache_avc_callback---of 4
selinux_netlink_send49%of 29
selinux_nf_register---of 1
selinux_nf_unregister---of 1
selinux_parse_skb.constprop.0---of 84
selinux_path_notify---of 18
selinux_peerlbl_enabled---of 6
selinux_perf_event_alloc---of 1
selinux_perf_event_open---of 4
selinux_perf_event_read---of 1
selinux_perf_event_write---of 1
selinux_ptrace_access_check---of 5
selinux_ptrace_traceme---of 1
selinux_quota_on---of 6
selinux_quotactl---of 9
selinux_release_secctx---of 4
selinux_req_classify_flow---of 1
selinux_sb_alloc_security---of 1
selinux_sb_clone_mnt_opts---of 78
selinux_sb_eat_lsm_opts---of 52
selinux_sb_kern_mount---of 1
selinux_sb_mnt_opts_compat---of 23
selinux_sb_remount---of 24
selinux_sb_show_options---of 29
selinux_sb_statfs---of 1
selinux_sctp_assoc_established---of 3
selinux_sctp_assoc_request---of 10
selinux_sctp_bind_connect---of 21
selinux_sctp_process_new_assoc---of 15
selinux_sctp_sk_clone---of 4
selinux_secctx_to_secid---of 1
selinux_secid_to_secctx---of 8
selinux_secmark_enabled---of 5
selinux_secmark_refcount_dec---of 1
selinux_secmark_refcount_inc---of 1
selinux_secmark_relabel_packet---of 1
selinux_sem_alloc_security---of 1
selinux_sem_associate---of 1
selinux_sem_semctl---of 13
selinux_sem_semop---of 3
selinux_set_mnt_opts---of 111
selinux_setprocattr---of 4
selinux_setselfattr---of 1
selinux_shm_alloc_security---of 1
selinux_shm_associate---of 1
selinux_shm_shmat---of 3
selinux_shm_shmctl---of 11
selinux_sk_alloc_security---of 1
selinux_sk_clone_security---of 1
selinux_sk_free_security---of 1
selinux_sk_getsecid---of 4
selinux_skb_peerlbl_sid---of 7
selinux_sock_graft---of 11
selinux_sock_rcv_skb_compat---of 7
selinux_socket_accept---of 10
selinux_socket_bind---of 48
selinux_socket_connect---of 3
selinux_socket_connect_helper.isra.0---of 25
selinux_socket_create---of 4
selinux_socket_getpeername---of 1
selinux_socket_getpeersec_dgram37%of 22
selinux_socket_getpeersec_stream---of 15
selinux_socket_getsockname---of 1
selinux_socket_getsockopt---of 1
selinux_socket_listen---of 1
selinux_socket_post_create---of 13
selinux_socket_recvmsg---of 1
selinux_socket_sendmsg100%of 1
selinux_socket_setsockopt---of 3
selinux_socket_shutdown---of 1
selinux_socket_sock_rcv_skb19%of 22
selinux_socket_socketpair---of 1
selinux_socket_unix_may_send---of 1
selinux_socket_unix_stream_connect---of 5
selinux_syslog---of 7
selinux_task_alloc---of 1
selinux_task_getioprio---of 1
selinux_task_getlsmprop_obj---of 1
selinux_task_getpgid---of 1
selinux_task_getscheduler---of 1
selinux_task_getsid---of 1
selinux_task_kill---of 8
selinux_task_movememory---of 1
selinux_task_prlimit---of 8
selinux_task_setioprio---of 1
selinux_task_setnice---of 1
selinux_task_setpgid---of 1
selinux_task_setrlimit---of 3
selinux_task_setscheduler---of 1
selinux_task_to_inode---of 12
selinux_tun_dev_alloc_security---of 1
selinux_tun_dev_attach---of 1
selinux_tun_dev_attach_queue---of 1
selinux_tun_dev_create---of 1
selinux_tun_dev_open---of 7
selinux_umount---of 1
selinux_uring_allowed---of 1
selinux_uring_cmd---of 4
selinux_uring_override_creds---of 1
selinux_uring_sqpoll---of 1
selinux_userns_create---of 1
selinux_vm_enough_memory---of 1
show_sid---of 5
sock_has_perm43%of 7
socket_sockcreate_sid.isra.0---of 5
socket_type_to_security_class---of 42
task_sid_obj---of 16
-----------
SUMMARY42%of 103

-----------
SUMMARY---of 0

aes_gcm_aad_update---of 6
aes_set_key---of 1
aes_set_key_common58%of 7
aesni_decrypt---of 5
aesni_encrypt---of 5
aesni_skcipher_setkey100%of 1
aesni_xts_decrypt---of 1
aesni_xts_encrypt---of 1
cbc_decrypt---of 3
cbc_encrypt---of 3
common_rfc4106_set_authsize75%of 4
ctr_crypt_aesni---of 10
ctr_crypt_aesni_avx---of 9
ctr_crypt_vaes_avx2---of 9
ctr_crypt_vaes_avx512---of 9
cts_cbc_decrypt---of 15
cts_cbc_encrypt---of 15
ecb_decrypt---of 3
ecb_encrypt---of 3
gcm_decrypt_aesni---of 10
gcm_decrypt_aesni_avx---of 10
gcm_decrypt_vaes_avx10_256---of 10
gcm_decrypt_vaes_avx10_512---of 10
gcm_encrypt_aesni---of 9
gcm_encrypt_aesni_avx---of 9
gcm_encrypt_vaes_avx10_256---of 9
gcm_encrypt_vaes_avx10_512---of 9
gcm_process_assoc---of 26
gcm_setkey57%of 30
gcm_setkey_aesni---of 1
gcm_setkey_aesni_avx100%of 1
gcm_setkey_vaes_avx10_256---of 1
gcm_setkey_vaes_avx10_512---of 1
generic_gcmaes_set_authsize75%of 4
rfc4106_decrypt_aesni---of 13
rfc4106_decrypt_aesni_avx---of 13
rfc4106_decrypt_vaes_avx10_256---of 13
rfc4106_decrypt_vaes_avx10_512---of 13
rfc4106_encrypt_aesni---of 12
rfc4106_encrypt_aesni_avx---of 12
rfc4106_encrypt_vaes_avx10_256---of 12
rfc4106_encrypt_vaes_avx10_512---of 12
rfc4106_setkey_aesni---of 1
rfc4106_setkey_aesni_avx100%of 1
rfc4106_setkey_vaes_avx10_256---of 1
rfc4106_setkey_vaes_avx10_512---of 1
unregister_avx_algs---of 14
xctr_crypt_aesni_avx---of 5
xctr_crypt_vaes_avx2---of 5
xctr_crypt_vaes_avx512---of 5
xts_crypt_slowpath---of 13
xts_decrypt_aesni---of 9
xts_decrypt_aesni_avx---of 9
xts_decrypt_vaes_avx2---of 9
xts_decrypt_vaes_avx512---of 9
xts_encrypt_aesni---of 9
xts_encrypt_aesni_avx---of 9
xts_encrypt_vaes_avx2---of 9
xts_encrypt_vaes_avx512---of 9
xts_setkey_aesni---of 8
-----------
SUMMARY63%of 48

-----------
SUMMARY---of 0

____sys_recvmsg---of 31
____sys_sendmsg25%of 40
___sys_recvmsg---of 7
___sys_sendmsg72%of 7
__copy_msghdr75%of 24
__do_sys_socketcall---of 38
__ia32_sys_accept---of 1
__ia32_sys_accept4---of 1
__ia32_sys_bind---of 1
__ia32_sys_connect---of 1
__ia32_sys_getpeername---of 1
__ia32_sys_getsockname---of 1
__ia32_sys_getsockopt---of 1
__ia32_sys_listen---of 1
__ia32_sys_recv---of 1
__ia32_sys_recvfrom---of 1
__ia32_sys_recvmmsg---of 4
__ia32_sys_recvmmsg_time32---of 4
__ia32_sys_recvmsg---of 1
__ia32_sys_send---of 1
__ia32_sys_sendmmsg---of 1
__ia32_sys_sendmsg---of 1
__ia32_sys_sendto---of 1
__ia32_sys_setsockopt---of 1
__ia32_sys_shutdown---of 1
__ia32_sys_socket---of 1
__ia32_sys_socketcall---of 1
__ia32_sys_socketpair---of 1
__sock_create---of 49
__sock_recv_cmsgs---of 28
__sock_recv_timestamp---of 82
__sock_recv_wifi_status---of 5
__sock_release---of 13
__sock_sendmsg75%of 4
__sock_tx_timestamp---of 9
__sys_accept4---of 12
__sys_bind---of 12
__sys_bind_socket---of 3
__sys_connect---of 7
__sys_connect_file---of 8
__sys_getpeername---of 14
__sys_getsockname---of 14
__sys_getsockopt---of 12
__sys_listen---of 10
__sys_listen_socket---of 5
__sys_recvfrom---of 22
__sys_recvmmsg---of 22
__sys_recvmsg---of 15
__sys_recvmsg_sock---of 1
__sys_sendmmsg---of 34
__sys_sendmsg67%of 15
__sys_sendmsg_sock---of 1
__sys_sendto---of 20
__sys_setsockopt---of 12
__sys_shutdown---of 10
__sys_shutdown_sock---of 3
__sys_socket---of 10
__sys_socket_create---of 5
__sys_socket_file---of 5
__sys_socketpair---of 30
__x64_sys_accept---of 1
__x64_sys_accept4---of 1
__x64_sys_bind---of 1
__x64_sys_connect---of 1
__x64_sys_getpeername---of 1
__x64_sys_getsockname---of 1
__x64_sys_getsockopt---of 1
__x64_sys_listen---of 1
__x64_sys_recv---of 1
__x64_sys_recvfrom---of 1
__x64_sys_recvmmsg---of 4
__x64_sys_recvmmsg_time32---of 4
__x64_sys_recvmsg---of 1
__x64_sys_send---of 1
__x64_sys_sendmmsg---of 1
__x64_sys_sendmsg100%of 1
__x64_sys_sendto---of 1
__x64_sys_setsockopt---of 1
__x64_sys_shutdown---of 1
__x64_sys_socket---of 1
__x64_sys_socketcall---of 1
__x64_sys_socketpair---of 1
br_ioctl_call---of 6
brioctl_set---of 1
call_trace_sock_recv_length---of 15
call_trace_sock_send_length.constprop.0---of 15
compat_sock_ioctl---of 30
copy_msghdr_from_user80%of 5
do_accept---of 20
do_recvmmsg---of 46
do_sock_getsockopt---of 20
do_sock_setsockopt---of 12
get_timestamp.constprop.0---of 16
get_user_ifreq---of 12
init_once---of 1
kernel_accept---of 7
kernel_bind---of 3
kernel_connect---of 3
kernel_getpeername---of 1
kernel_getsockname---of 1
kernel_listen---of 1
kernel_recvmsg---of 1
kernel_sendmsg---of 1
kernel_sock_ip_overhead---of 44
kernel_sock_shutdown---of 1
move_addr_to_kernel59%of 17
move_addr_to_user---of 23
put_user_ifreq---of 9
skb_get_tx_timestamp---of 18
skb_has_tx_timestamp---of 10
sock_alloc---of 4
sock_alloc_file---of 8
sock_alloc_inode---of 4
sock_close---of 1
sock_create---of 1
sock_create_kern---of 1
sock_create_lite---of 8
sock_do_ioctl---of 12
sock_fasync---of 7
sock_free_inode---of 1
sock_from_file---of 5
sock_ioctl---of 31
sock_is_registered---of 6
sock_mmap---of 1
sock_poll---of 20
sock_read_iter---of 9
sock_recvmsg---of 11
sock_register---of 11
sock_release---of 1
sock_sendmsg---of 6
sock_sendmsg_nosec64%of 11
sock_show_fdinfo---of 5
sock_splice_eof---of 5
sock_splice_read---of 4
sock_unregister---of 3
sock_wake_async---of 14
sock_write_iter---of 12
socket_seq_show---of 1
sockfd_lookup---of 10
sockfs_dname---of 1
sockfs_init_fs_context---of 4
sockfs_listxattr---of 15
sockfs_security_xattr_set---of 1
sockfs_setattr---of 7
sockfs_xattr_get---of 7
update_socket_protocol---of 1
vlan_ioctl_set---of 1
-----------
SUMMARY55%of 124

should_fail_usercopy100%of 1
-----------
SUMMARY100%of 1

__get_hash_thresh100%of 5
__xfrm_decode_session---of 46
__xfrm_dst_lookup---of 10
__xfrm_policy_bysel_ctx.isra.080%of 25
__xfrm_policy_check---of 191
__xfrm_policy_inexact_flush75%of 8
__xfrm_policy_inexact_prune_bin6%of 105
__xfrm_policy_link50%of 28
__xfrm_policy_unlink85%of 13
__xfrm_route_forward---of 20
__xfrm_sk_clone_policy---of 34
dst_copy_metrics---of 11
dst_discard---of 1
icmp_err_packet---of 11
jhash2100%of 10
nf_nat_decode_session.isra.0---of 21
policy_hash_bysel82%of 32
rt6_get_cookie---of 30
skb_dst_force.isra.0---of 26
xdst_queue_output---of 39
xfrm_alloc_dst---of 17
xfrm_audit_common_policyinfo---of 16
xfrm_audit_policy_add34%of 9
xfrm_audit_policy_delete34%of 9
xfrm_confirm_neigh---of 5
xfrm_default_advmss---of 7
xfrm_dev_policy_flush---of 23
xfrm_dst_check---of 77
xfrm_dst_ifdown---of 17
xfrm_expand_policies.constprop.0---of 16
xfrm_gen_index100%of 15
xfrm_get_dst_nexthop---of 9
xfrm_hash_rebuild---of 74
xfrm_hash_resize---of 79
xfrm_icmp_flow_decode---of 20
xfrm_if_register_cb---of 1
xfrm_if_unregister_cb---of 1
xfrm_in_fwd_icmp---of 8
xfrm_link_failure---of 1
xfrm_lookup---of 1
xfrm_lookup_route---of 14
xfrm_lookup_with_ifid---of 123
xfrm_migrate73%of 100
xfrm_migrate_policy_find64%of 38
xfrm_mtu---of 12
xfrm_negative_advice---of 4
xfrm_neigh_lookup---of 7
xfrm_net_exit---of 1
xfrm_net_init---of 28
xfrm_out_fwd_icmp---of 15
xfrm_pol_bin_cmp93%of 13
xfrm_pol_bin_key100%of 1
xfrm_pol_bin_obj---of 1
xfrm_pol_inexact_addr_use_any_list80%of 10
xfrm_policy_addr_delta84%of 18
xfrm_policy_alloc45%of 9
xfrm_policy_byid75%of 27
xfrm_policy_bysel_ctx71%of 51
xfrm_policy_delete75%of 4
xfrm_policy_destroy50%of 8
xfrm_policy_destroy_rcu---of 1
xfrm_policy_find_inexact_candidates64%of 11
xfrm_policy_fini---of 11
xfrm_policy_flush79%of 23
xfrm_policy_get_afinfo---of 20
xfrm_policy_hash_rebuild100%of 1
xfrm_policy_inexact_alloc_bin43%of 172
xfrm_policy_inexact_alloc_chain.isra.076%of 29
xfrm_policy_inexact_gc_tree89%of 9
xfrm_policy_inexact_insert82%of 16
xfrm_policy_inexact_insert_node.isra.074%of 57
xfrm_policy_inexact_list_reinsert62%of 44
xfrm_policy_inexact_lookup_rcu63%of 40
xfrm_policy_insert83%of 35
xfrm_policy_insert_list80%of 44
xfrm_policy_kill41%of 32
xfrm_policy_lookup---of 3
xfrm_policy_lookup_bytype72%of 116
xfrm_policy_lookup_inexact_addr89%of 17
xfrm_policy_match84%of 12
xfrm_policy_queue_process---of 142
xfrm_policy_register_afinfo---of 32
xfrm_policy_requeue11%of 29
xfrm_policy_timer---of 56
xfrm_policy_unregister_afinfo---of 6
xfrm_policy_walk70%of 23
xfrm_policy_walk_done60%of 5
xfrm_policy_walk_init100%of 1
xfrm_pols_put---of 8
xfrm_resolve_and_create_bundle---of 232
xfrm_selector_inner_icmp_match---of 6
xfrm_selector_match86%of 67
xfrm_sk_policy_insert---of 18
xfrm_sk_policy_lookup---of 47
xfrm_spd_getinfo100%of 1
xfrm_states_put---of 8
-----------
SUMMARY63%of 1322

__bpf_trace_cap_capable---of 1
__probestub_cap_capable---of 1
__traceiter_cap_capable---of 7
cap_bprm_creds_from_file---of 102
cap_capable31%of 26
cap_capget---of 16
cap_capset---of 13
cap_convert_nscap---of 31
cap_inode_getsecurity---of 41
cap_inode_killpriv---of 3
cap_inode_need_killpriv---of 1
cap_inode_removexattr---of 10
cap_inode_setxattr---of 6
cap_mmap_addr---of 5
cap_ptrace_access_check---of 25
cap_ptrace_traceme---of 22
cap_safe_nice---of 25
cap_settime---of 3
cap_task_fix_setuid---of 38
cap_task_prctl---of 66
cap_task_setioprio---of 1
cap_task_setnice---of 1
cap_task_setscheduler---of 1
cap_vm_enough_memory---of 1
get_vfs_caps_from_disk---of 28
perf_trace_cap_capable---of 9
rootid_owns_currentns---of 9
trace_event_raw_event_cap_capable---of 10
trace_raw_output_cap_capable---of 4
-----------
SUMMARY31%of 26

-----------
SUMMARY---of 0

__change_pid---of 18
__ia32_sys_pidfd_getfd---of 11
__ia32_sys_pidfd_open---of 8
__task_pid_nr_ns52%of 25
__x64_sys_pidfd_getfd---of 11
__x64_sys_pidfd_open---of 8
alloc_pid---of 52
attach_pid---of 12
change_pid---of 1
delayed_put_pid---of 1
detach_pid---of 1
disable_pid_allocation---of 1
exchange_tids---of 12
find_ge_pid---of 1
find_get_pid---of 21
find_get_task_by_vpid---of 21
find_pid_ns---of 1
find_task_by_pid_ns---of 6
find_task_by_vpid---of 4
find_vpid---of 4
free_pid---of 13
free_pids---of 6
get_pid_task---of 21
get_task_pid---of 29
idr_preload_end---of 9
pid_nr_ns75%of 8
pid_table_root_lookup---of 4
pid_table_root_permissions---of 7
pid_table_root_set_ownership---of 8
pid_task---of 13
pid_vnr70%of 10
pidfd_create---of 4
pidfd_get_pid---of 18
pidfd_get_task---of 9
pidfd_getfd---of 19
proc_do_cad_pid---of 10
put_pid46%of 11
register_pidns_sysctls---of 5
set_is_seen---of 4
task_active_pid_ns---of 5
transfer_pid---of 11
unregister_pidns_sysctls---of 1
-----------
SUMMARY58%of 54

-----------
SUMMARY---of 0

crypto_alg_put---of 8
cryptomgr_notify69%of 61
cryptomgr_probe---of 12
-----------
SUMMARY69%of 61

-----------
SUMMARY---of 0

__blkcg_rstat_flush.isra.0---of 31
__blkg_clear_stat---of 5
__blkg_prfill_u64---of 7
__blkg_release---of 12
bio_associate_blkg---of 20
bio_associate_blkg_from_css---of 50
bio_blkcg_css---of 6
bio_clone_blkg_association---of 8
blk_cgroup_bio_start---of 18
blk_cgroup_congested---of 16
blkcg_activate_policy---of 59
blkcg_add_delay---of 5
blkcg_css---of 13
blkcg_css_alloc---of 35
blkcg_css_free---of 6
blkcg_css_offline---of 1
blkcg_css_online---of 12
blkcg_deactivate_policy---of 20
blkcg_exit---of 4
blkcg_exit_disk---of 13
blkcg_free_all_cpd---of 6
blkcg_get_cgwb_list---of 1
blkcg_init_disk---of 9
blkcg_iostat_update---of 5
blkcg_maybe_throttle_current6%of 55
blkcg_pin_online---of 8
blkcg_policy_register---of 29
blkcg_policy_unregister---of 13
blkcg_print_blkgs---of 24
blkcg_print_stat---of 53
blkcg_reset_stats---of 23
blkcg_rstat_flush---of 5
blkcg_scale_delay---of 13
blkcg_schedule_throttle---of 15
blkcg_unpin_online---of 18
blkg_alloc---of 34
blkg_conf_exit---of 6
blkg_conf_exit_frozen---of 5
blkg_conf_init---of 1
blkg_conf_open_bdev---of 14
blkg_conf_open_bdev_frozen---of 5
blkg_conf_prep---of 30
blkg_create---of 64
blkg_destroy---of 28
blkg_dev_name---of 5
blkg_free---of 5
blkg_free_workfn---of 9
blkg_init_queue---of 1
blkg_iostat_add---of 3
blkg_lookup---of 17
blkg_release---of 1
percpu_ref_get_many.constprop.0---of 14
percpu_ref_put_many.constprop.0---of 15
percpu_ref_tryget_many.constprop.0---of 18
radix_tree_preload_end---of 9
-----------
SUMMARY6%of 55

-----------
SUMMARY---of 0

__bpf_trace_mmap_lock---of 1
__bpf_trace_mmap_lock_acquire_returned---of 1
__mmap_lock_do_trace_acquire_returned---of 1
__mmap_lock_do_trace_released---of 1
__mmap_lock_do_trace_start_locking---of 1
__probestub_mmap_lock_acquire_returned---of 1
__probestub_mmap_lock_released---of 1
__probestub_mmap_lock_start_locking---of 1
__traceiter_mmap_lock_acquire_returned---of 7
__traceiter_mmap_lock_released---of 7
__traceiter_mmap_lock_start_locking---of 7
__vma_enter_locked---of 23
__vma_start_write---of 11
cgroup_id_from_mm---of 21
lock_mm_and_find_vma11%of 75
lock_next_vma---of 86
lock_vma_under_rcu---of 29
perf_trace_mmap_lock---of 7
perf_trace_mmap_lock_acquire_returned---of 7
trace_event_raw_event_mmap_lock---of 8
trace_event_raw_event_mmap_lock_acquire_returned---of 8
trace_mmap_lock_acquire_returned---of 15
trace_mmap_lock_released---of 15
trace_mmap_lock_start_locking---of 15
trace_raw_output_mmap_lock---of 6
trace_raw_output_mmap_lock_acquire_returned---of 8
vma_mark_detached---of 26
vma_refcount_put---of 10
vma_start_read---of 42
-----------
SUMMARY11%of 75

-----------
SUMMARY---of 0

__do_sys_capget---of 36
__do_sys_capset---of 25
__ia32_sys_capget---of 1
__ia32_sys_capset---of 1
__x64_sys_capget---of 1
__x64_sys_capset---of 1
cap_validate_magic---of 16
capable---of 1
capable_wrt_inode_uidgid---of 5
file_ns_capable86%of 7
has_capability_noaudit---of 1
has_ns_capability---of 16
has_ns_capability_noaudit---of 16
ns_capable100%of 1
ns_capable_common80%of 5
ns_capable_noaudit---of 1
ns_capable_setid---of 1
privileged_wrt_inode_uidgid---of 5
ptracer_capable---of 19
-----------
SUMMARY85%of 13

-----------
SUMMARY---of 0

__printk_deferred_enter100%of 1
__printk_deferred_exit100%of 1
__printk_safe_enter100%of 1
__printk_safe_exit100%of 1
is_printk_force_console100%of 1
is_printk_legacy_deferred67%of 6
printk_force_console_enter---of 1
printk_force_console_exit---of 1
vprintk100%of 1
-----------
SUMMARY84%of 12

-----------
SUMMARY---of 0

__skb_fill_netmem_desc---of 18
esp6_destroy---of 5
esp6_err---of 14
esp6_init_state92%of 24
esp6_input---of 42
esp6_input_done2---of 65
esp6_output---of 16
esp6_output_head---of 51
esp6_output_tail---of 50
esp6_rcv_cb---of 1
esp_alloc_tmp---of 3
esp_init_aead.constprop.058%of 14
esp_init_authenc.constprop.067%of 54
esp_input_done---of 1
esp_input_done_esn---of 1
esp_input_restore_header---of 3
esp_output_done---of 42
esp_output_done_esn---of 1
esp_output_encap_csum---of 5
esp_output_restore_header---of 1
esp_ssg_unref.isra.0---of 23
get_page---of 19
-----------
SUMMARY72%of 92

-----------
SUMMARY---of 0

__disable_kprobe---of 28
__free_insn_slot---of 30
__get_insn_slot---of 39
__get_valid_kprobe---of 18
__is_insn_slot_addr38%of 16
__unregister_kprobe_bottom---of 10
__unregister_kprobe_top---of 33
__within_kprobe_blacklist---of 8
_kprobe_addr---of 12
aggr_post_handler---of 8
aggr_pre_handler---of 11
alloc_aggr_kprobe---of 6
alloc_insn_page---of 1
alloc_optinsn_page---of 1
arch_kprobe_get_kallsym---of 1
arch_within_kprobe_blacklist---of 6
arm_kprobe---of 15
collect_garbage_slots---of 10
collect_one_slot---of 9
copy_kprobe---of 1
disable_kprobe---of 6
disarm_kprobe---of 18
dump_kprobe---of 1
enable_kprobe---of 17
free_aggr_kprobe---of 1
free_insn_page---of 1
free_optinsn_page---of 1
get_kprobe---of 6
get_optimized_kprobe---of 10
init_aggr_kprobe---of 8
is_cfi_preamble_symbol---of 7
kill_kprobe---of 23
kprobe_add_area_blacklist---of 10
kprobe_add_ksym_blacklist---of 8
kprobe_blacklist_open---of 6
kprobe_blacklist_seq_next---of 1
kprobe_blacklist_seq_show---of 4
kprobe_blacklist_seq_start---of 1
kprobe_blacklist_seq_stop---of 1
kprobe_busy_begin---of 1
kprobe_busy_end---of 4
kprobe_cache_get_kallsym---of 17
kprobe_disarmed---of 8
kprobe_exceptions_notify---of 1
kprobe_free_init_mem---of 11
kprobe_get_kallsym---of 6
kprobe_lookup_name---of 1
kprobe_on_func_entry---of 5
kprobe_optimizer---of 51
kprobe_remove_area_blacklist---of 9
kprobe_seq_next---of 4
kprobe_seq_start---of 4
kprobe_seq_stop---of 1
kprobes_inc_nmissed_count---of 6
kprobes_module_callback---of 58
kprobes_open---of 6
kretprobe_rethook_handler---of 8
opt_pre_handler---of 9
optimize_all_kprobes---of 17
optimize_kprobe---of 23
optprobe_queued_unopt---of 8
pre_handler_kretprobe---of 10
proc_kprobes_optimization_handler---of 22
read_enabled_file_bool---of 4
register_kprobe---of 141
register_kprobes---of 9
register_kretprobe---of 27
register_kretprobes---of 9
report_probe---of 23
show_kprobe_addr---of 10
unoptimize_kprobe---of 19
unregister_kprobe---of 1
unregister_kprobes---of 15
unregister_kretprobe---of 1
unregister_kretprobes---of 16
wait_for_kprobe_optimizer---of 4
wait_for_kprobe_optimizer_locked---of 10
within_kprobe_blacklist---of 8
write_enabled_file_bool---of 44
-----------
SUMMARY38%of 16

____napi_schedule---of 18
____netdev_has_upper_dev---of 1
__dev_change_flags---of 25
__dev_change_net_namespace---of 105
__dev_close_many---of 29
__dev_direct_xmit---of 20
__dev_forward_skb---of 1
__dev_forward_skb2---of 24
__dev_get_by_index---of 9
__dev_get_by_name---of 5
__dev_notify_flags---of 10
__dev_open---of 41
__dev_queue_xmit13%of 277
__dev_remove_pack---of 25
__dev_set_promiscuity---of 28
__dev_set_rx_mode---of 16
__get_xps_queue_idx---of 19
__napi_busy_loop---of 72
__napi_hash_add_with_id---of 5
__napi_poll.constprop.0---of 27
__napi_schedule---of 5
__napi_schedule_irqoff---of 1
__netdev_adjacent_dev_insert---of 38
__netdev_adjacent_dev_remove---of 25
__netdev_adjacent_dev_set---of 17
__netdev_has_upper_dev---of 4
__netdev_name_node_alt_destroy---of 4
__netdev_notify_peers---of 4
__netdev_printk---of 24
__netdev_put_lock---of 13
__netdev_put_lock_ops_compat---of 21
__netdev_update_features---of 158
__netdev_update_lower_level---of 18
__netdev_update_upper_level---of 1
__netdev_upper_depth---of 10
__netdev_upper_dev_link---of 36
__netdev_upper_dev_unlink---of 4
__netdev_walk_all_lower_dev.constprop.0.isra.0---of 14
__netdev_walk_all_upper_dev---of 15
__netif_napi_del_locked---of 20
__netif_receive_skb---of 6
__netif_receive_skb_core.constprop.0---of 252
__netif_receive_skb_list_core---of 11
__netif_receive_skb_list_ptype---of 15
__netif_receive_skb_one_core---of 8
__netif_rx---of 6
__netif_schedule---of 7
__netif_set_mtu---of 5
__netif_set_xps_queue---of 180
__register_netdevice_notifier_net---of 8
__unregister_netdevice_notifier_net---of 3
__vlan_get_protocol_offset.constprop.018%of 23
alloc_netdev_dummy---of 1
alloc_netdev_mqs---of 54
backlog_napi_setup---of 1
backlog_napi_should_run---of 1
bpf_prog_run_generic_xdp---of 41
bpf_xdp_link_attach---of 34
bpf_xdp_link_dealloc---of 1
bpf_xdp_link_detach---of 1
bpf_xdp_link_fill_link_info---of 4
bpf_xdp_link_release---of 24
bpf_xdp_link_show_fdinfo---of 4
bpf_xdp_link_update---of 29
busy_poll_stop---of 25
call_netdevice_notifiers---of 1
call_netdevice_notifiers_info---of 6
call_netdevice_register_net_notifiers---of 21
call_netdevice_unregister_net_notifiers---of 3
call_netdevice_unregister_notifiers---of 3
clean_xps_maps---of 25
default_device_exit_batch---of 36
deliver_skb---of 16
dev_add_pack---of 17
dev_addr_cmp---of 5
dev_alloc_name---of 1
dev_change_xdp_fd---of 21
dev_cpu_dead---of 27
dev_fetch_sw_netstats---of 8
dev_fill_forward_path---of 19
dev_fill_metadata_dst---of 38
dev_forward_skb---of 4
dev_forward_skb_nomtu---of 4
dev_get_alias---of 19
dev_get_by_index---of 14
dev_get_by_index_rcu---of 9
dev_get_by_name---of 16
dev_get_by_name_rcu---of 5
dev_get_by_napi_id---of 9
dev_get_iflink---of 7
dev_get_min_mp_channel_count---of 17
dev_get_phys_port_id---of 4
dev_get_phys_port_name---of 5
dev_get_stats---of 27
dev_get_tstats64---of 1
dev_getbyhwaddr---of 10
dev_getbyhwaddr_rcu---of 7
dev_getfirstbyhwtype---of 17
dev_hard_start_xmit41%of 42
dev_index_reserve---of 9
dev_ingress_queue_create---of 11
dev_kfree_skb_any_reason---of 7
dev_kfree_skb_irq_reason---of 15
dev_loopback_xmit---of 5
dev_nit_active_rcu60%of 10
dev_pick_tx_zero---of 1
dev_prep_valid_name.isra.0---of 44
dev_qdisc_enqueue---of 17
dev_queue_xmit_nit---of 57
dev_remove_pack---of 1
dev_set_rx_mode---of 1
dev_valid_name---of 20
dev_validate_mtu---of 13
dev_xdp_attach---of 108
dev_xdp_install---of 36
dev_xdp_mode---of 11
dev_xdp_prog_count---of 7
dev_xdp_prog_id---of 9
dev_xdp_sb_prog_count---of 7
do_xdp_generic---of 71
enqueue_to_backlog---of 54
flush_backlog---of 35
flush_backlogs_alloc---of 1
free_netdev---of 33
generic_xdp_install---of 24
generic_xdp_tx---of 15
get_rps_cpu---of 83
init_dummy_netdev---of 1
is_skb_forwardable---of 8
kick_defer_list_purge---of 9
list_netdevice---of 11
napi_busy_loop---of 11
napi_busy_loop_rcu---of 3
napi_by_id---of 9
napi_complete_done---of 28
napi_disable---of 1
napi_disable_locked---of 14
napi_enable---of 1
napi_enable_locked---of 22
napi_hash_add---of 8
napi_hash_del---of 7
napi_kthread_create---of 4
napi_resume_irqs---of 17
napi_schedule_prep---of 6
napi_set_threaded---of 25
napi_suspend_irqs---of 15
napi_threaded_poll---of 21
napi_threaded_poll_loop---of 29
napi_watchdog---of 5
net_dec_egress_queue---of 1
net_dec_ingress_queue---of 1
net_disable_timestamp---of 6
net_enable_timestamp---of 6
net_inc_egress_queue---of 1
net_inc_ingress_queue---of 1
net_rps_action_and_irq_enable---of 8
net_rps_send_ipi---of 5
net_rx_action---of 53
net_tx_action---of 74
netdev_adjacent_change_abort---of 9
netdev_adjacent_change_commit---of 9
netdev_adjacent_change_prepare---of 12
netdev_adjacent_get_private---of 1
netdev_adjacent_rename_links---of 11
netdev_adjacent_sysfs_add---of 4
netdev_alert---of 1
netdev_bind_sb_channel_queue---of 10
netdev_bonding_info_change---of 1
netdev_change_features---of 1
netdev_change_proto_down_reason_locked---of 11
netdev_cmd_to_name---of 5
netdev_copy_name---of 14
netdev_core_pick_tx70%of 13
netdev_core_stats_inc---of 10
netdev_create_hash---of 5
netdev_crit---of 1
netdev_do_free_pcpu_stats---of 9
netdev_drivername---of 9
netdev_emerg---of 1
netdev_err---of 1
netdev_exit---of 6
netdev_features_change---of 1
netdev_get_by_flags_rcu---of 8
netdev_get_by_index---of 1
netdev_get_by_index_lock---of 5
netdev_get_by_index_lock_ops_compat---of 5
netdev_get_by_name---of 1
netdev_get_name---of 14
netdev_get_xmit_slave---of 4
netdev_has_any_upper_dev---of 4
netdev_has_upper_dev---of 4
netdev_has_upper_dev_all_rcu---of 1
netdev_increment_features---of 5
netdev_info---of 1
netdev_init---of 6
netdev_init_one_queue---of 6
netdev_is_rx_handler_busy---of 14
netdev_lower_dev_get_private---of 11
netdev_lower_get_first_private_rcu---of 6
netdev_lower_get_next---of 4
netdev_lower_get_next_private---of 4
netdev_lower_get_next_private_rcu---of 9
netdev_lower_state_changed---of 4
netdev_master_upper_dev_get---of 9
netdev_master_upper_dev_get_rcu---of 8
netdev_master_upper_dev_link---of 1
netdev_name_in_use---of 1
netdev_name_node_add---of 6
netdev_name_node_alloc---of 4
netdev_name_node_alt_create---of 6
netdev_name_node_alt_destroy---of 8
netdev_name_node_alt_free---of 1
netdev_name_node_lookup---of 7
netdev_name_node_lookup_rcu---of 7
netdev_napi_by_id---of 8
netdev_napi_by_id_lock---of 52
netdev_next_lower_dev_rcu---of 4
netdev_notice---of 1
netdev_notify_peers---of 1
netdev_offload_xstats_disable---of 9
netdev_offload_xstats_enable---of 17
netdev_offload_xstats_enabled---of 8
netdev_offload_xstats_get---of 12
netdev_offload_xstats_get_stats---of 10
netdev_offload_xstats_push_delta---of 10
netdev_offload_xstats_report_delta---of 1
netdev_offload_xstats_report_used---of 1
netdev_pick_tx23%of 80
netdev_port_same_parent_id---of 11
netdev_printk---of 1
netdev_refcnt_read---of 9
netdev_reset_tc---of 1
netdev_run_todo---of 69
netdev_rx_csum_fault---of 4
netdev_rx_handler_register---of 6
netdev_rx_handler_unregister---of 4
netdev_set_default_ethtool_ops---of 4
netdev_set_num_tc---of 4
netdev_set_sb_channel---of 6
netdev_set_tc_queue---of 4
netdev_sk_get_lowest_dev---of 7
netdev_stats_to_stats64---of 3
netdev_sw_irq_coalesce_default_on---of 15
netdev_txq_to_tc---of 6
netdev_unbind_all_sb_channels---of 6
netdev_unbind_sb_channel---of 6
netdev_update_features---of 3
netdev_upper_dev_link---of 1
netdev_upper_dev_unlink---of 1
netdev_upper_get_next_dev_rcu---of 9
netdev_walk_all_lower_dev---of 14
netdev_walk_all_lower_dev_rcu---of 14
netdev_walk_all_upper_dev_rcu---of 18
netdev_warn---of 1
netdev_xa_find_lock---of 24
netdev_xa_find_lock_ops_compat---of 28
netdev_xmit_skip_txqueue---of 1
netif_change_carrier---of 6
netif_change_flags---of 4
netif_change_name---of 24
netif_change_proto_down---of 9
netif_change_tx_queue_len---of 10
netif_close---of 3
netif_close_many---of 10
netif_close_many_and_unlock---of 3
netif_device_attach---of 6
netif_device_detach---of 5
netif_disable_lro---of 26
netif_enable_cpu_rmap---of 6
netif_get_flags---of 10
netif_get_mac_address---of 19
netif_get_num_default_rss_queues---of 13
netif_get_port_parent_id---of 19
netif_inherit_tso_max---of 8
netif_napi_add_weight_locked---of 35
netif_napi_affinity_release---of 11
netif_napi_irq_notify---of 8
netif_napi_set_irq_locked---of 30
netif_open---of 5
netif_pre_changeaddr_notify---of 4
netif_queue_set_napi---of 24
netif_receive_skb---of 55
netif_receive_skb_core---of 11
netif_receive_skb_list---of 38
netif_receive_skb_list_internal---of 54
netif_reset_xps_queues---of 8
netif_rx---of 5
netif_rx_internal---of 35
netif_schedule_queue---of 18
netif_set_affinity_auto---of 3
netif_set_alias---of 18
netif_set_allmulti---of 16
netif_set_group---of 1
netif_set_mac_address---of 14
netif_set_mtu---of 6
netif_set_mtu_ext---of 25
netif_set_promiscuity---of 5
netif_set_real_num_queues---of 23
netif_set_real_num_rx_queues---of 19
netif_set_real_num_tx_queues---of 43
netif_set_threaded---of 23
netif_set_tso_max_segs---of 3
netif_set_tso_max_size---of 6
netif_set_xps_queue---of 1
netif_skb_features15%of 78
netif_stacked_transfer_operstate---of 18
netif_state_change---of 15
netif_threaded_enable---of 4
netif_tx_stop_all_queues---of 3
netif_tx_wake_queue---of 19
netif_xdp_propagate---of 17
netstamp_clear---of 5
passthru_features_check---of 1
process_backlog---of 83
qdisc_run_end---of 5
register_netdev---of 4
register_netdevice---of 144
register_netdevice_notifier---of 12
register_netdevice_notifier_dev_net---of 3
register_netdevice_notifier_net---of 1
remove_xps_queue---of 18
reset_xps_maps---of 7
rps_may_expire_flow---of 24
rps_trigger_softirq---of 1
rtnl_net_dev_lock---of 26
rtnl_net_dev_unlock---of 1
run_backlog_napi---of 1
sd_has_rps_ipi_waiting---of 6
skb_checksum_help---of 30
skb_crc32c_csum_help---of 15
skb_csum_hwoffload_help---of 23
skb_defer_free_flush---of 12
skb_dst_force.isra.0---of 26
skb_network_protocol35%of 20
skb_warn_bad_offload---of 19
synchronize_net---of 7
tc_run---of 21
tcx_dec---of 1
tcx_inc---of 1
trace_napi_poll---of 15
trace_netif_rx_entry---of 15
trace_netif_rx_exit---of 15
trace_xdp_exception---of 15
trigger_rx_softirq---of 1
unlist_netdevice---of 15
unregister_netdev---of 1
unregister_netdevice_many---of 1
unregister_netdevice_many_notify---of 122
unregister_netdevice_notifier---of 6
unregister_netdevice_notifier_dev_net---of 1
unregister_netdevice_notifier_net---of 1
unregister_netdevice_queue---of 7
validate_xmit_skb.isra.012%of 71
validate_xmit_skb_list---of 10
-----------
SUMMARY19%of 614

ah6_destroy---of 5
ah6_err---of 14
ah6_init_state56%of 34
ah6_input---of 52
ah6_input_done---of 13
ah6_output---of 29
ah6_output_done---of 10
ah6_rcv_cb---of 1
ah6_restore_hdrs---of 5
ah_alloc_tmp---of 1
ipv6_clear_mutable_options.isra.0---of 23
-----------
SUMMARY56%of 34

-----------
SUMMARY---of 0

__kthread_bind_mask---of 5
__kthread_cancel_work_sync---of 13
__kthread_create_on_node60%of 10
__kthread_init_worker---of 1
__kthread_parkme---of 12
__kthread_queue_delayed_work---of 7
free_kthread_struct---of 9
get_kthread_comm---of 19
kthread---of 24
kthread_affine_preferred---of 15
kthread_associate_blkcg---of 41
kthread_bind---of 6
kthread_bind_mask---of 6
kthread_blkcg---of 7
kthread_cancel_delayed_work_sync---of 1
kthread_cancel_delayed_work_timer---of 1
kthread_cancel_work_sync---of 1
kthread_complete_and_exit---of 4
kthread_create_on_cpu---of 6
kthread_create_on_node100%of 1
kthread_create_worker_on_cpu---of 4
kthread_create_worker_on_node---of 6
kthread_data---of 3
kthread_delayed_work_timer_fn---of 10
kthread_destroy_worker---of 9
kthread_exit---of 8
kthread_fetch_affinity.isra.0---of 8
kthread_flush_work---of 12
kthread_flush_work_fn---of 1
kthread_flush_worker---of 1
kthread_freezable_should_stop---of 9
kthread_func---of 7
kthread_insert_work---of 20
kthread_insert_work_sanity_check---of 13
kthread_is_per_cpu58%of 7
kthread_mod_delayed_work---of 12
kthread_park---of 12
kthread_parkme---of 3
kthread_probe_data---of 6
kthread_queue_delayed_work---of 11
kthread_queue_work---of 11
kthread_set_per_cpu---of 12
kthread_should_park---of 3
kthread_should_stop---of 3
kthread_should_stop_or_park---of 7
kthread_stop---of 43
kthread_stop_put---of 6
kthread_unpark---of 9
kthread_unuse_mm---of 9
kthread_use_mm---of 9
kthread_worker_fn---of 66
kthreadd---of 25
kthreads_init---of 1
kthreads_online_cpu---of 14
set_kthread_struct---of 11
tsk_fork_get_node---of 4
-----------
SUMMARY62%of 18

__fib6_clean_all58%of 14
__fib6_drop_pcpu_from---of 32
__fib6_update_sernum_upto_root---of 13
call_fib6_entry_notifiers---of 1
call_fib6_entry_notifiers_replace---of 1
call_fib6_multipath_entry_notifiers---of 1
fib6_add---of 167
fib6_add_rt2node---of 212
fib6_clean_all---of 1
fib6_clean_all_skip_notify---of 1
fib6_clean_node32%of 22
fib6_clean_tree100%of 1
fib6_del---of 88
fib6_dump_done---of 5
fib6_dump_end---of 5
fib6_dump_node---of 8
fib6_dump_table.isra.0---of 11
fib6_find_prefix---of 30
fib6_flush_trees100%of 1
fib6_force_start_gc---of 4
fib6_gc_cleanup---of 1
fib6_gc_timer_cb---of 1
fib6_get_table---of 1
fib6_info_alloc---of 6
fib6_info_destroy_rcu---of 18
fib6_locate---of 50
fib6_lookup---of 1
fib6_metric_set---of 8
fib6_net_exit---of 5
fib6_net_init---of 14
fib6_new_sernum72%of 7
fib6_new_table---of 1
fib6_nh_drop_pcpu_from---of 1
fib6_node_dump---of 8
fib6_node_lookup---of 55
fib6_purge_rt---of 57
fib6_repair_tree---of 102
fib6_rule_lookup---of 20
fib6_run_gc---of 38
fib6_tables_dump---of 9
fib6_tables_seq_read---of 14
fib6_update_sernum---of 10
fib6_update_sernum_stub---of 1
fib6_update_sernum_upto_root---of 1
fib6_walk100%of 3
fib6_walk_continue83%of 41
inet6_dump_fib---of 66
ipv6_route_seq_next---of 22
ipv6_route_seq_next_table---of 17
ipv6_route_seq_setup_walk---of 1
ipv6_route_seq_show---of 33
ipv6_route_seq_start---of 9
ipv6_route_seq_stop---of 16
ipv6_route_yield---of 13
node_alloc---of 4
-----------
SUMMARY67%of 89

-----------
SUMMARY---of 0

__lock_sock---of 12
__lock_sock_fast---of 3
__receive_sock---of 5
__release_sock---of 17
__sk_backlog_rcv---of 8
__sk_destruct---of 31
__sk_dst_check---of 11
__sk_dst_get---of 11
__sk_flush_backlog---of 6
__sk_free---of 25
__sk_mem_raise_allocated---of 82
__sk_mem_reclaim---of 1
__sk_mem_reduce_allocated---of 18
__sk_mem_schedule---of 3
__sk_receive_skb---of 57
__sock_cmsg_send---of 39
__sock_queue_rcv_skb---of 52
__sock_set_timestamps---of 11
__sock_wfree---of 11
copy_from_sockptr_offset.constprop.0---of 7
copy_to_sockptr_offset---of 10
lock_sock_nested67%of 3
proto_exit_net---of 1
proto_init_net---of 3
proto_register---of 33
proto_seq_next---of 1
proto_seq_show---of 53
proto_seq_start---of 1
proto_seq_stop---of 1
proto_unregister---of 3
put_page---of 21
release_sock59%of 12
req_prot_cleanup---of 5
signal_pending---of 5
sk_alloc---of 21
sk_busy_loop_end---of 15
sk_capable---of 5
sk_clear_memalloc---of 7
sk_clone_lock---of 65
sk_common_release---of 13
sk_destruct---of 7
sk_dst_check---of 32
sk_error_report---of 18
sk_free---of 8
sk_get_meminfo---of 4
sk_getsockopt---of 196
sk_init_common---of 4
sk_ioctl---of 17
sk_leave_memory_pressure---of 10
sk_mc_loop---of 12
sk_net_capable---of 1
sk_net_refcnt_upgrade---of 10
sk_ns_capable---of 5
sk_page_frag_refill---of 10
sk_prot_alloc---of 18
sk_reset_timer---of 11
sk_send_sigurg---of 27
sk_set_memalloc---of 1
sk_set_peek_off---of 1
sk_set_prio_allowed---of 7
sk_setsockopt---of 281
sk_setup_caps---of 44
sk_stop_timer---of 6
sk_stop_timer_sync---of 6
sk_wait_data---of 16
sk_wake_async_rcu23%of 9
skb_dst_force.isra.0---of 26
skb_orphan_partial---of 33
skb_page_frag_refill---of 15
skb_set_owner_w---of 35
sock_alloc_send_pskb---of 39
sock_bind_add---of 4
sock_bindtoindex---of 5
sock_bindtoindex_locked---of 10
sock_cmsg_send---of 17
sock_common_getsockopt---of 1
sock_common_recvmsg---of 4
sock_common_setsockopt---of 1
sock_copy_user_timeval---of 17
sock_def_destruct---of 1
sock_def_error_report---of 21
sock_def_readable40%of 35
sock_def_wakeup---of 21
sock_def_write_space---of 23
sock_devmem_dontneed---of 40
sock_disable_timestamp---of 7
sock_efree---of 6
sock_enable_timestamp---of 7
sock_get_timeout---of 8
sock_gettstamp---of 10
sock_init_data---of 4
sock_init_data_uid---of 10
sock_inuse_exit_net---of 1
sock_inuse_get---of 9
sock_inuse_init_net---of 3
sock_ioctl_inout---of 16
sock_kfree_s---of 4
sock_kmalloc---of 7
sock_kmemdup---of 4
sock_kzfree_s---of 4
sock_load_diag_module---of 12
sock_lock_init---of 4
sock_no_accept---of 1
sock_no_bind---of 1
sock_no_connect---of 1
sock_no_getname---of 1
sock_no_ioctl---of 1
sock_no_linger---of 1
sock_no_listen---of 1
sock_no_mmap---of 1
sock_no_recvmsg---of 1
sock_no_sendmsg---of 1
sock_no_sendmsg_locked---of 1
sock_no_shutdown---of 1
sock_no_socketpair---of 1
sock_ofree---of 1
sock_omalloc---of 6
sock_pfree---of 19
sock_prot_inuse_get---of 11
sock_queue_rcv_skb_reason---of 11
sock_recv_errqueue---of 24
sock_reserve_memory---of 28
sock_rfree30%of 10
sock_set_flag---of 1
sock_set_keepalive---of 4
sock_set_mark---of 3
sock_set_priority---of 1
sock_set_rcvbuf---of 1
sock_set_reuseaddr---of 1
sock_set_reuseport---of 1
sock_set_sndtimeo---of 6
sock_set_timeout---of 16
sock_set_timestamp---of 11
sock_set_timestamping---of 46
sock_setsockopt---of 1
sock_update_netprioidx---of 14
sock_valbool_flag---of 5
sock_wfree---of 55
sock_wmalloc---of 7
sockopt_capable---of 5
sockopt_lock_sock---of 4
sockopt_ns_capable---of 5
sockopt_release_sock---of 4
task_cls_classid---of 13
task_css---of 10
tw_prot_cleanup---of 5
-----------
SUMMARY41%of 69

__async_schedule_node_domain---of 3
async_run_entry_fn---of 1
async_schedule_dev_nocall---of 7
async_schedule_node---of 1
async_schedule_node_domain---of 6
async_synchronize_cookie---of 1
async_synchronize_cookie_domain---of 9
async_synchronize_full---of 1
async_synchronize_full_domain---of 1
current_is_async40%of 10
lowest_in_progress---of 12
-----------
SUMMARY40%of 10

should_fail_alloc_page50%of 16
-----------
SUMMARY50%of 16

__blake2s_init.constprop.0---of 1
__do_sys_getrandom---of 16
__get_random_u32_below86%of 7
__ia32_sys_getrandom---of 1
__x64_sys_getrandom---of 1
_credit_init_bits---of 20
_get_random_bytes75%of 12
add_device_randomness---of 1
add_disk_randomness---of 8
add_hwgenerator_randomness---of 14
add_input_randomness---of 5
add_interrupt_randomness---of 14
add_timer_randomness---of 16
arch_get_random_longs---of 10
blake2s.constprop.0---of 1
crng_fast_key_erasure67%of 3
crng_make_state40%of 28
crng_reseed---of 10
crng_reseed_interval---of 6
crng_set_ready---of 1
entropy_timer---of 6
execute_with_initialized_rng---of 6
extract_entropy.constprop.0---of 11
fast_mix---of 1
get_random_bytes---of 1
get_random_bytes_user---of 20
get_random_u16---of 25
get_random_u3268%of 25
get_random_u64---of 25
get_random_u8---of 25
mix_interrupt_randomness---of 12
mix_pool_bytes---of 1
proc_do_rointvec---of 3
proc_do_uuid---of 9
rand_initialize_disk---of 4
random_fasync---of 1
random_ioctl---of 42
random_online_cpu---of 1
random_pm_notification---of 7
random_poll---of 10
random_prepare_cpu---of 1
random_read_iter---of 12
random_write_iter---of 1
rng_is_initialized60%of 5
signal_pending---of 5
try_to_generate_entropy---of 38
urandom_read_iter---of 15
wait_for_random_bytes---of 28
write_pool_user---of 14
-----------
SUMMARY60%of 80

___bpf_prog_run---of 384
__bpf_address_lookup---of 23
__bpf_call_base---of 1
__bpf_free_used_btfs---of 6
__bpf_free_used_maps---of 9
__bpf_prog_array_free_sleepable_cb---of 1
__bpf_prog_free---of 4
__bpf_prog_map_compatible---of 22
__bpf_prog_ret0_warn---of 1
__bpf_prog_ret1---of 1
__bpf_prog_run128---of 1
__bpf_prog_run160---of 1
__bpf_prog_run192---of 1
__bpf_prog_run224---of 1
__bpf_prog_run256---of 1
__bpf_prog_run288---of 1
__bpf_prog_run32---of 1
__bpf_prog_run320---of 1
__bpf_prog_run352---of 1
__bpf_prog_run384---of 1
__bpf_prog_run416---of 1
__bpf_prog_run448---of 1
__bpf_prog_run480---of 1
__bpf_prog_run512---of 1
__bpf_prog_run64---of 1
__bpf_prog_run96---of 1
__bpf_prog_run_args128---of 1
__bpf_prog_run_args160---of 1
__bpf_prog_run_args192---of 1
__bpf_prog_run_args224---of 1
__bpf_prog_run_args256---of 1
__bpf_prog_run_args288---of 1
__bpf_prog_run_args32---of 1
__bpf_prog_run_args320---of 1
__bpf_prog_run_args352---of 1
__bpf_prog_run_args384---of 1
__bpf_prog_run_args416---of 1
__bpf_prog_run_args448---of 1
__bpf_prog_run_args480---of 1
__bpf_prog_run_args512---of 1
__bpf_prog_run_args64---of 1
__bpf_prog_run_args96---of 1
__bpf_trace_bpf_xdp_link_attach_failed---of 1
__bpf_trace_mem_connect---of 1
__bpf_trace_mem_disconnect---of 1
__bpf_trace_xdp_bulk_tx---of 1
__bpf_trace_xdp_cpumap_enqueue---of 1
__bpf_trace_xdp_cpumap_kthread---of 1
__bpf_trace_xdp_devmap_xmit---of 1
__bpf_trace_xdp_exception---of 1
__bpf_trace_xdp_redirect_template---of 1
__probestub_bpf_xdp_link_attach_failed---of 1
__probestub_mem_connect---of 1
__probestub_mem_disconnect---of 1
__probestub_xdp_bulk_tx---of 1
__probestub_xdp_cpumap_enqueue---of 1
__probestub_xdp_cpumap_kthread---of 1
__probestub_xdp_devmap_xmit---of 1
__probestub_xdp_exception---of 1
__probestub_xdp_redirect---of 1
__probestub_xdp_redirect_err---of 1
__traceiter_bpf_xdp_link_attach_failed---of 7
__traceiter_mem_connect---of 7
__traceiter_mem_disconnect---of 7
__traceiter_xdp_bulk_tx---of 7
__traceiter_xdp_cpumap_enqueue---of 7
__traceiter_xdp_cpumap_kthread---of 7
__traceiter_xdp_devmap_xmit---of 7
__traceiter_xdp_exception---of 7
__traceiter_xdp_redirect---of 7
__traceiter_xdp_redirect_err---of 7
bpf_adj_branches---of 47
bpf_adj_delta_to_imm---of 12
bpf_check_timed_may_goto---of 5
bpf_get_kallsym---of 28
bpf_get_raw_cpu_id---of 1
bpf_internal_load_pointer_neg_helper---of 13
bpf_jit_add_poke_descriptor---of 22
bpf_jit_alloc_exec---of 1
bpf_jit_alloc_exec_limit---of 1
bpf_jit_binary_alloc---of 13
bpf_jit_binary_free---of 1
bpf_jit_binary_pack_alloc---of 15
bpf_jit_binary_pack_finalize---of 4
bpf_jit_binary_pack_free---of 1
bpf_jit_binary_pack_hdr---of 1
bpf_jit_blind_constants---of 43
bpf_jit_bypass_spec_v1---of 1
bpf_jit_bypass_spec_v4---of 1
bpf_jit_charge_modmem---of 8
bpf_jit_compile---of 1
bpf_jit_fill_hole_with_zero---of 1
bpf_jit_free_exec---of 1
bpf_jit_get_func_addr---of 18
bpf_jit_get_prog_name---of 5
bpf_jit_inlines_helper_call---of 1
bpf_jit_needs_zext---of 1
bpf_jit_prog_release_other---of 1
bpf_jit_supports_far_kfunc_call---of 1
bpf_jit_uncharge_modmem---of 1
bpf_ksym_add---of 17
bpf_ksym_del---of 4
bpf_ksym_find40%of 15
bpf_opcode_in_insntable---of 1
bpf_patch_call_args---of 1
bpf_patch_insn_single---of 21
bpf_prog_alloc---of 15
bpf_prog_alloc_jited_linfo---of 9
bpf_prog_alloc_no_stats---of 33
bpf_prog_array_alloc---of 3
bpf_prog_array_copy---of 42
bpf_prog_array_copy_core---of 7
bpf_prog_array_copy_info---of 15
bpf_prog_array_copy_to_user---of 12
bpf_prog_array_delete_safe---of 5
bpf_prog_array_delete_safe_at---of 1
bpf_prog_array_free---of 8
bpf_prog_array_free_sleepable---of 8
bpf_prog_array_is_empty---of 6
bpf_prog_array_length---of 7
bpf_prog_array_update_at---of 10
bpf_prog_calc_tag---of 15
bpf_prog_fill_jited_linfo---of 9
bpf_prog_find_from_stack---of 1
bpf_prog_free---of 4
bpf_prog_free_deferred---of 13
bpf_prog_get_file_line---of 30
bpf_prog_jit_attempt_done---of 8
bpf_prog_kallsyms_add---of 15
bpf_prog_kallsyms_del---of 7
bpf_prog_kallsyms_del_all---of 3
bpf_prog_ksym_find---of 9
bpf_prog_map_compatible---of 3
bpf_prog_pack_alloc---of 28
bpf_prog_pack_free---of 19
bpf_prog_realloc---of 11
bpf_prog_report_may_goto_violation---of 4
bpf_prog_select_runtime---of 31
bpf_remove_insns---of 3
bpf_user_rnd_init_once---of 6
bpf_user_rnd_u32---of 4
find_from_stack_cb---of 14
is_bpf_text_address46%of 11
perf_trace_bpf_xdp_link_attach_failed---of 9
perf_trace_mem_connect---of 7
perf_trace_mem_disconnect---of 7
perf_trace_xdp_bulk_tx---of 7
perf_trace_xdp_cpumap_enqueue---of 7
perf_trace_xdp_cpumap_kthread---of 7
perf_trace_xdp_devmap_xmit---of 7
perf_trace_xdp_exception---of 7
perf_trace_xdp_redirect_template---of 16
search_bpf_extables---of 16
trace_event_get_offsets_bpf_xdp_link_attach_failed---of 3
trace_event_raw_event_bpf_xdp_link_attach_failed---of 10
trace_event_raw_event_mem_connect---of 8
trace_event_raw_event_mem_disconnect---of 8
trace_event_raw_event_xdp_bulk_tx---of 8
trace_event_raw_event_xdp_cpumap_enqueue---of 8
trace_event_raw_event_xdp_cpumap_kthread---of 8
trace_event_raw_event_xdp_devmap_xmit---of 8
trace_event_raw_event_xdp_exception---of 8
trace_event_raw_event_xdp_redirect_template---of 17
trace_raw_output_bpf_xdp_link_attach_failed---of 5
trace_raw_output_mem_connect---of 4
trace_raw_output_mem_disconnect---of 4
trace_raw_output_xdp_bulk_tx---of 4
trace_raw_output_xdp_cpumap_enqueue---of 4
trace_raw_output_xdp_cpumap_kthread---of 4
trace_raw_output_xdp_devmap_xmit---of 4
trace_raw_output_xdp_exception---of 4
trace_raw_output_xdp_redirect_template---of 4
-----------
SUMMARY43%of 26

-----------
SUMMARY---of 0

__bpf_trace_selinux_audited---of 1
__probestub_selinux_audited---of 1
__traceiter_selinux_audited---of 7
avc_alloc_node---of 39
avc_audit_post_callback---of 37
avc_audit_pre_callback---of 14
avc_compute_av---of 23
avc_copy_xperms_decision---of 9
avc_denied---of 8
avc_get_cache_threshold---of 1
avc_get_hash_stats---of 22
avc_has_extended_perms---of 66
avc_has_perm64%of 11
avc_has_perm_noaudit37%of 22
avc_lookup75%of 12
avc_node_delete---of 4
avc_node_free---of 1
avc_node_kill---of 1
avc_node_populate---of 1
avc_node_replace---of 4
avc_perm_nonode---of 3
avc_policy_seqno---of 1
avc_set_cache_threshold---of 1
avc_ss_reset---of 27
avc_update_node.isra.0---of 33
avc_xperms_decision_alloc---of 14
avc_xperms_decision_free---of 10
avc_xperms_decision_lookup---of 6
avc_xperms_free---of 6
avc_xperms_populate---of 10
perf_trace_selinux_audited---of 13
selinux_avc_init---of 3
slow_avc_audit---of 6
trace_event_get_offsets_selinux_audited.isra.0---of 7
trace_event_raw_event_selinux_audited---of 14
trace_raw_output_selinux_audited---of 4
-----------
SUMMARY54%of 45

__bpf_trace_percpu_alloc_percpu---of 1
__bpf_trace_percpu_alloc_percpu_fail---of 1
__bpf_trace_percpu_create_chunk---of 1
__bpf_trace_percpu_destroy_chunk---of 1
__bpf_trace_percpu_free_percpu---of 1
__is_kernel_percpu_address58%of 14
__pcpu_chunk_move---of 6
__probestub_percpu_alloc_percpu---of 1
__probestub_percpu_alloc_percpu_fail---of 1
__probestub_percpu_create_chunk---of 1
__probestub_percpu_destroy_chunk---of 1
__probestub_percpu_free_percpu---of 1
__traceiter_percpu_alloc_percpu---of 7
__traceiter_percpu_alloc_percpu_fail---of 7
__traceiter_percpu_create_chunk---of 8
__traceiter_percpu_destroy_chunk---of 8
__traceiter_percpu_free_percpu---of 7
free_percpu---of 88
is_kernel_percpu_address---of 1
pcpu_addr_in_chunk---of 7
pcpu_alloc_area60%of 32
pcpu_alloc_noprof25%of 123
pcpu_balance_free---of 44
pcpu_balance_workfn---of 68
pcpu_block_refresh_hint72%of 7
pcpu_block_update38%of 32
pcpu_block_update_hint_alloc25%of 40
pcpu_chunk_depopulated---of 9
pcpu_chunk_populated---of 9
pcpu_chunk_refresh_hint---of 7
pcpu_chunk_relocate46%of 11
pcpu_chunk_slot58%of 7
pcpu_create_chunk---of 19
pcpu_depopulate_chunk---of 18
pcpu_dump_alloc_info---of 22
pcpu_find_block_fit42%of 17
pcpu_free_area---of 41
pcpu_free_chunk---of 5
pcpu_free_pages.isra.0---of 14
pcpu_get_pages---of 9
pcpu_init_md_blocks---of 4
pcpu_mem_zalloc---of 8
pcpu_memcg_post_alloc_hook8%of 38
pcpu_next_fit_region.constprop.055%of 22
pcpu_next_md_free_region---of 13
pcpu_nr_pages---of 1
pcpu_obj_full_size100%of 3
pcpu_populate_chunk---of 62
pcpu_post_unmap_tlb_flush---of 1
pcpu_reintegrate_chunk67%of 9
pcpu_schedule_balance_work---of 4
pcpu_size_to_slot60%of 5
per_cpu_ptr_to_phys---of 22
perf_trace_percpu_alloc_percpu---of 7
perf_trace_percpu_alloc_percpu_fail---of 7
perf_trace_percpu_create_chunk---of 7
perf_trace_percpu_destroy_chunk---of 7
perf_trace_percpu_free_percpu---of 7
trace_event_raw_event_percpu_alloc_percpu---of 8
trace_event_raw_event_percpu_alloc_percpu_fail---of 8
trace_event_raw_event_percpu_create_chunk---of 8
trace_event_raw_event_percpu_destroy_chunk---of 8
trace_event_raw_event_percpu_free_percpu---of 8
trace_percpu_create_chunk---of 15
trace_raw_output_percpu_alloc_percpu---of 6
trace_raw_output_percpu_alloc_percpu_fail---of 4
trace_raw_output_percpu_create_chunk---of 5
trace_raw_output_percpu_destroy_chunk---of 5
trace_raw_output_percpu_free_percpu---of 4
-----------
SUMMARY36%of 360

__crypto_shash_export---of 6
__crypto_shash_import---of 8
__crypto_shash_init100%of 3
crypto_alloc_shash100%of 1
crypto_clone_shash---of 31
crypto_grab_shash---of 1
crypto_has_shash---of 1
crypto_register_shash---of 3
crypto_register_shashes---of 8
crypto_shash_digest---of 4
crypto_shash_exit_tfm100%of 1
crypto_shash_export---of 4
crypto_shash_export_core---of 1
crypto_shash_finup83%of 28
crypto_shash_free_instance---of 1
crypto_shash_import---of 5
crypto_shash_import_core---of 1
crypto_shash_init75%of 4
crypto_shash_init_tfm100%of 12
crypto_shash_op_and_zero100%of 1
crypto_shash_report---of 1
crypto_shash_setkey86%of 7
crypto_shash_show---of 1
crypto_shash_tfm_digest---of 4
crypto_unregister_shash---of 1
crypto_unregister_shashes---of 3
hash_prepare_alg---of 6
shash_default_digest---of 3
shash_default_export_core---of 1
shash_default_finup100%of 3
shash_default_import_core---of 1
shash_free_singlespawn_instance---of 1
shash_no_setkey---of 1
shash_prepare_alg---of 31
shash_register_instance---of 7
-----------
SUMMARY89%of 60

-----------
SUMMARY---of 0

jent_get_nstime67%of 3
jent_hash_time62%of 18
jent_kcapi_cleanup---of 9
jent_kcapi_init56%of 9
jent_kcapi_random13%of 8
jent_kcapi_reset---of 1
jent_kvzalloc100%of 1
jent_kvzfree---of 1
jent_read_random_block---of 9
jent_zalloc100%of 1
jent_zfree---of 1
-----------
SUMMARY53%of 40

-----------
SUMMARY---of 0

_prb_commit50%of 6
_prb_read_valid46%of 48
data_alloc62%of 18
data_push_tail53%of 21
desc_make_final75%of 4
desc_make_reusable.isra.0100%of 1
desc_read82%of 16
desc_read_finalized_seq67%of 12
desc_update_last_finalized80%of 10
get_data43%of 21
prb_commit---of 4
prb_final_commit100%of 1
prb_first_seq75%of 4
prb_first_valid_seq---of 4
prb_init---of 1
prb_next_reserve_seq---of 9
prb_next_seq---of 7
prb_read_valid100%of 1
prb_read_valid_info---of 1
prb_record_text_space---of 1
prb_reserve55%of 51
prb_reserve_in_last---of 69
space_used.isra.058%of 7
-----------
SUMMARY58%of 221

-----------
SUMMARY---of 0

address_val---of 7
bdev_name.isra.0---of 14
bitmap_list_string.isra.0---of 17
bitmap_string.isra.0---of 12
bstr_printf---of 62
check_pointer40%of 10
clock.isra.0---of 4
date_str---of 7
default_pointer---of 16
dentry_name---of 36
device_node_string.isra.0---of 8
escaped_string---of 21
file_dentry_name---of 4
fill_ptr_key---of 1
flags_string---of 27
format_decode54%of 50
format_flags---of 12
fourcc_string---of 37
fwnode_full_name_string---of 14
fwnode_string---of 11
hex_range---of 7
hex_string---of 20
ip4_addr_string---of 1
ip4_addr_string_sa---of 10
ip4_string---of 14
ip6_addr_string---of 6
ip6_addr_string_sa---of 27
ip6_compressed_string---of 40
ip6_string---of 8
ip_addr_string---of 18
mac_address_string---of 18
netdev_bits---of 8
num_to_str---of 14
number48%of 71
pointer---of 43
pointer_string---of 4
ptr_to_hashval---of 4
put_dec50%of 4
put_dec_full8---of 1
put_dec_trunc890%of 10
range_string.isra.0---of 10
resource_or_range---of 6
resource_string.isra.0---of 100
restricted_pointer---of 19
rtc_str---of 20
scnprintf---of 5
set_precision---of 6
simple_strntoll---of 4
simple_strntoul---of 1
simple_strntoull---of 8
simple_strtol---of 4
simple_strtoll---of 1
simple_strtoul---of 1
simple_strtoull---of 1
snprintf100%of 1
special_hex_number---of 1
sprintf100%of 1
sscanf---of 1
string75%of 4
string_nocheck100%of 7
symbol_string---of 17
time64_str---of 1
time_and_date---of 7
time_str.isra.0---of 5
uuid_string---of 15
va_format---of 6
vbin_printf---of 67
vscnprintf60%of 5
vsnprintf42%of 73
vsprintf---of 1
vsscanf---of 192
widen_string12%of 17
-----------
SUMMARY49%of 253

-----------
SUMMARY---of 0

cmac_clone_tfm---of 5
cmac_create---of 13
cmac_exit_tfm100%of 1
cmac_init_tfm60%of 5
crypto_cmac_digest_finup---of 4
crypto_cmac_digest_init---of 1
crypto_cmac_digest_setkey43%of 7
crypto_cmac_digest_update---of 4
-----------
SUMMARY54%of 13

__cgroup_account_cputime100%of 1
__cgroup_account_cputime_field---of 6
bpf_rstat_flush---of 1
cgroup_base_stat_cputime_account_end.constprop.050%of 4
cgroup_base_stat_cputime_show---of 14
css_rstat_exit---of 25
css_rstat_flush---of 78
css_rstat_init---of 21
css_rstat_updated41%of 22
root_cgroup_cputime---of 9
trace_cgroup_rstat_lock_contended.constprop.0---of 15
trace_cgroup_rstat_locked---of 15
trace_cgroup_rstat_unlock.constprop.0---of 15
-----------
SUMMARY45%of 27

-----------
SUMMARY---of 0

__bpf_trace_x86_fpu---of 1
__probestub_x86_fpu_after_save---of 1
__probestub_x86_fpu_before_save---of 1
__probestub_x86_fpu_copy_dst---of 1
__probestub_x86_fpu_dropped---of 1
__probestub_x86_fpu_regs_activated---of 1
__probestub_x86_fpu_regs_deactivated---of 1
__probestub_x86_fpu_xstate_check_failed---of 1
__traceiter_x86_fpu_after_save---of 8
__traceiter_x86_fpu_before_save---of 8
__traceiter_x86_fpu_copy_dst---of 8
__traceiter_x86_fpu_dropped---of 8
__traceiter_x86_fpu_regs_activated---of 8
__traceiter_x86_fpu_regs_deactivated---of 8
__traceiter_x86_fpu_xstate_check_failed---of 8
fpregs_assert_state_consistent82%of 11
fpregs_lock_and_load---of 18
fpregs_mark_activate---of 4
fpstate_init_user---of 4
fpstate_reset---of 4
fpu__clear_user_states---of 25
fpu__drop---of 42
fpu__exception_code---of 18
fpu_clone---of 45
fpu_flush_thread---of 11
fpu_reset_from_exception_fixup---of 1
fpu_sync_fpstate---of 37
fpu_thread_struct_whitelist---of 1
irq_fpu_usable50%of 10
kernel_fpu_begin_mask73%of 18
kernel_fpu_end67%of 6
perf_trace_x86_fpu---of 9
restore_fpregs_from_fpstate47%of 15
save_fpregs_to_fpstate56%of 9
switch_fpu_return82%of 11
trace_event_raw_event_x86_fpu---of 10
trace_raw_output_x86_fpu---of 4
trace_x86_fpu_regs_activated34%of 15
x86_task_fpu---of 5
-----------
SUMMARY60%of 95

-----------
SUMMARY---of 0

xfrm_aevent_is_on---of 19
xfrm_init_replay93%of 14
xfrm_replay_advance---of 49
xfrm_replay_check---of 7
xfrm_replay_check_bmp---of 14
xfrm_replay_check_esn---of 28
xfrm_replay_check_legacy---of 11
xfrm_replay_notify---of 59
xfrm_replay_overflow---of 26
xfrm_replay_recheck---of 8
xfrm_replay_seqhi---of 8
-----------
SUMMARY93%of 14

-----------
SUMMARY---of 0

xfrm_alloc_compat83%of 63
xfrm_attr_cpy32---of 6
xfrm_user_policy_compat---of 13
xfrm_user_rcv_msg_compat---of 99
-----------
SUMMARY83%of 63

-----------
SUMMARY---of 0

crypto_gcm_base_create---of 7
crypto_gcm_create---of 6
crypto_gcm_create_common---of 22
crypto_gcm_decrypt---of 5
crypto_gcm_encrypt---of 3
crypto_gcm_exit_tfm---of 1
crypto_gcm_free---of 1
crypto_gcm_init_common---of 18
crypto_gcm_init_crypt---of 4
crypto_gcm_init_tfm---of 6
crypto_gcm_setauthsize---of 4
crypto_gcm_setkey---of 10
crypto_gcm_verify---of 4
crypto_rfc4106_create---of 16
crypto_rfc4106_crypt---of 21
crypto_rfc4106_decrypt---of 5
crypto_rfc4106_encrypt---of 5
crypto_rfc4106_exit_tfm---of 1
crypto_rfc4106_free---of 1
crypto_rfc4106_init_tfm---of 5
crypto_rfc4106_setauthsize---of 4
crypto_rfc4106_setkey---of 4
crypto_rfc4543_create---of 16
crypto_rfc4543_crypt---of 9
crypto_rfc4543_decrypt---of 5
crypto_rfc4543_encrypt---of 5
crypto_rfc4543_exit_tfm---of 1
crypto_rfc4543_free---of 1
crypto_rfc4543_init_tfm60%of 5
crypto_rfc4543_setauthsize75%of 4
crypto_rfc4543_setkey75%of 4
gcm_dec_hash_continue---of 4
gcm_decrypt_done---of 3
gcm_enc_copy_hash---of 1
gcm_encrypt_continue---of 8
gcm_encrypt_done---of 8
gcm_hash---of 3
gcm_hash_assoc_continue---of 6
gcm_hash_assoc_done---of 8
gcm_hash_assoc_remain_continue---of 5
gcm_hash_assoc_remain_done---of 8
gcm_hash_crypt_continue---of 6
gcm_hash_crypt_done---of 8
gcm_hash_crypt_remain_continue---of 3
gcm_hash_crypt_remain_done---of 8
gcm_hash_init_continue---of 6
gcm_hash_init_done---of 8
gcm_hash_len_continue---of 1
gcm_hash_len_done---of 8
gcm_hash_update---of 1
-----------
SUMMARY70%of 13

nat_keepalive_sk_fini---of 11
nat_keepalive_sk_init---of 19
nat_keepalive_work---of 3
nat_keepalive_work_single---of 66
xfrm_nat_keepalive_fini---of 7
xfrm_nat_keepalive_init---of 8
xfrm_nat_keepalive_net_fini---of 1
xfrm_nat_keepalive_net_init---of 1
xfrm_nat_keepalive_state_updated50%of 4
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

__hmac_sha256_final---of 3
__hmac_sha256_init---of 1
__hmac_sha256_preparekey70%of 10
__sha256_final---of 6
__sha256_update---of 16
hmac_sha224---of 1
hmac_sha224_final---of 1
hmac_sha224_init_usingrawkey---of 1
hmac_sha224_preparekey---of 1
hmac_sha224_usingrawkey---of 1
hmac_sha256---of 1
hmac_sha256_final---of 1
hmac_sha256_init_usingrawkey---of 1
hmac_sha256_preparekey100%of 1
hmac_sha256_usingrawkey---of 1
sha224---of 1
sha224_final---of 1
sha224_init---of 1
sha256---of 1
sha256_blocks_avx---of 5
sha256_blocks_avx260%of 5
sha256_blocks_generic---of 8
sha256_blocks_ni---of 5
sha256_blocks_ssse3---of 5
sha256_final---of 1
sha256_finup_2x---of 11
sha256_finup_2x_is_optimized---of 1
sha256_finup_2x_sequential---of 1
sha256_init---of 1
-----------
SUMMARY69%of 16

PageHuge---of 11
__access_remote_vm---of 66
__apply_to_page_range---of 161
__do_fault---of 23
__get_locked_pte---of 7
__handle_mm_fault17%of 271
__might_fault80%of 5
__p4d_alloc---of 33
__pagetable_ctor---of 9
__pmd_alloc---of 33
__pte_alloc---of 9
__pte_alloc_kernel---of 18
__pud_alloc---of 28
__vm_insert_mixed---of 17
__vm_map_pages---of 10
__vmf_anon_prepare---of 18
access_process_vm---of 4
access_remote_vm---of 1
apply_to_existing_page_range---of 1
apply_to_page_range---of 1
clear_subpage---of 1
copy_folio_from_user---of 11
copy_page_range---of 215
copy_present_ptes---of 324
copy_remote_vm_str---of 57
copy_subpage---of 3
copy_user_large_folio---of 20
count_memcg_events_mm.constprop.048%of 23
do_page_mkwrite---of 15
do_set_pmd---of 1
do_swap_page---of 284
do_wp_page---of 299
fault_around_bytes_fops_open---of 1
fault_around_bytes_get---of 1
fault_around_bytes_set---of 4
fault_dirty_shared_page---of 37
finish_fault---of 82
finish_mkwrite_fault---of 13
folio_needs_cow_for_dma---of 9
folio_prealloc---of 19
folio_put---of 6
folio_try_dup_anon_rmap_ptes.isra.0---of 115
folio_zero_user---of 17
follow_pfnmap_end---of 11
follow_pfnmap_start---of 67
free_pgd_range---of 113
free_pgtables---of 35
generic_access_phys---of 22
handle_mm_fault42%of 58
insert_page---of 11
insert_page_into_pte_locked---of 60
insert_pfn---of 41
mm_trace_rss_stat---of 1
numa_migrate_check---of 20
pagetable_dtor---of 12
percpu_ref_put_many.constprop.0---of 15
pfn_swap_entry_folio---of 29
pfnmap_track_ctx_release---of 1
pmd_install---of 7
print_bad_page_map---of 44
print_vma_addr---of 17
process_huge_page---of 19
pte_write---of 7
ptlock_alloc---of 4
ptlock_free---of 5
put_page---of 21
rcu_read_unlock---of 6
rcu_read_unlock_sched---of 9
remap_pfn_range---of 24
remap_pfn_range_notrack---of 95
set_pte_range---of 31
swap_pte_batch---of 25
tlb_flush_mmu_tlbonly---of 20
trace_rss_stat---of 15
unmap_mapping_folio---of 10
unmap_mapping_pages---of 7
unmap_mapping_range---of 1
unmap_page_range---of 260
unmap_single_vma.isra.0---of 18
unmap_vmas---of 16
validate_page_before_insert---of 20
vm_insert_page---of 26
vm_insert_pages---of 55
vm_iomap_memory---of 8
vm_map_pages---of 1
vm_map_pages_zero---of 1
vm_mixed_zeropage_allowed---of 15
vm_normal_folio---of 14
vm_normal_folio_pmd---of 14
vm_normal_page---of 16
vm_normal_page_pmd---of 15
vm_normal_page_pud---of 15
vma_pgtable_walk_begin---of 4
vma_pgtable_walk_end---of 4
vma_refcount_put---of 10
vmf_insert_mixed---of 1
vmf_insert_mixed_mkwrite---of 1
vmf_insert_page_mkwrite---of 11
vmf_insert_pfn---of 1
vmf_insert_pfn_prot---of 47
walk_to_pmd---of 26
wp_page_reuse---of 45
zap_page_range_single---of 1
zap_page_range_single_batched---of 20
zap_vma_ptes---of 10
-----------
SUMMARY24%of 357

-----------
SUMMARY---of 0

__find_acq_core65%of 81
__km_new_mapping---of 18
__sk_dst_set.constprop.0---of 9
__xfrm6_sort---of 9
__xfrm6_state_sort_cmp---of 7
__xfrm6_tmpl_sort_cmp---of 7
__xfrm_find_acq_byseq58%of 19
__xfrm_init_state60%of 115
__xfrm_state_bump_genids100%of 19
__xfrm_state_delete70%of 36
__xfrm_state_destroy84%of 6
__xfrm_state_insert69%of 70
__xfrm_state_lookup.isra.075%of 35
__xfrm_state_lookup_byaddr.isra.038%of 35
jhash2.constprop.0100%of 1
km_migrate56%of 18
km_new_mapping---of 8
km_policy_expired100%of 1
km_policy_notify57%of 16
km_query60%of 15
km_report---of 18
km_state_expired100%of 1
km_state_notify57%of 16
verify_spi_info80%of 15
xfrm_alloc_spi50%of 92
xfrm_audit_helper_pktinfo---of 7
xfrm_audit_helper_sainfo---of 8
xfrm_audit_state_add34%of 9
xfrm_audit_state_delete34%of 9
xfrm_audit_state_icvfail---of 8
xfrm_audit_state_notfound---of 7
xfrm_audit_state_notfound_simple---of 7
xfrm_audit_state_replay---of 7
xfrm_audit_state_replay_overflow---of 7
xfrm_dev_state_flush---of 41
xfrm_dst_hash90%of 10
xfrm_find_acq100%of 1
xfrm_find_acq_byseq100%of 1
xfrm_flush_gc---of 1
xfrm_get_acqseq75%of 4
xfrm_get_mode82%of 11
xfrm_get_translator48%of 21
xfrm_hash_grow_check100%of 5
xfrm_hash_ptrs_get61%of 28
xfrm_hash_resize---of 107
xfrm_init_state58%of 7
xfrm_init_tempstate---of 36
xfrm_input_state_lookup---of 57
xfrm_migrate_state_find34%of 59
xfrm_put_translator100%of 1
xfrm_register_km---of 1
xfrm_register_mode_cbs---of 4
xfrm_register_translator---of 4
xfrm_register_type---of 32
xfrm_register_type_offload---of 15
xfrm_replay_timer_handler---of 23
xfrm_sad_getinfo100%of 1
xfrm_seq_hash84%of 6
xfrm_set_type_offload40%of 28
xfrm_src_hash90%of 10
xfrm_state_add54%of 43
xfrm_state_afinfo_get_rcu---of 9
xfrm_state_alloc75%of 4
xfrm_state_check_expire---of 14
xfrm_state_delete100%of 1
xfrm_state_delete_tunnel92%of 12
xfrm_state_find---of 332
xfrm_state_fini---of 13
xfrm_state_flush88%of 54
xfrm_state_free100%of 1
xfrm_state_gc_task---of 41
xfrm_state_get_afinfo35%of 20
xfrm_state_init---of 18
xfrm_state_insert100%of 1
xfrm_state_look_at---of 36
xfrm_state_lookup46%of 11
xfrm_state_lookup_byaddr46%of 11
xfrm_state_lookup_byspi---of 14
xfrm_state_migrate---of 65
xfrm_state_mtu---of 19
xfrm_state_register_afinfo---of 8
xfrm_state_sort---of 6
xfrm_state_unregister_afinfo---of 9
xfrm_state_update52%of 74
xfrm_state_update_stats---of 9
xfrm_state_walk82%of 53
xfrm_state_walk_done60%of 5
xfrm_state_walk_init100%of 1
xfrm_stateonly_find---of 39
xfrm_timer_handler---of 50
xfrm_tmpl_sort---of 6
xfrm_unregister_km---of 1
xfrm_unregister_mode_cbs---of 4
xfrm_unregister_translator---of 6
xfrm_unregister_type---of 33
xfrm_unregister_type_offload---of 15
xfrm_user_policy---of 48
-----------
SUMMARY61%of 1093

__hmac_sha1_preparekey88%of 8
__sha1_final---of 6
hmac_sha1---of 1
hmac_sha1_final---of 3
hmac_sha1_init---of 1
hmac_sha1_init_usingrawkey---of 1
hmac_sha1_preparekey100%of 1
hmac_sha1_usingrawkey---of 1
sha1---of 1
sha1_blocks_avx---of 5
sha1_blocks_avx263%of 8
sha1_blocks_generic---of 4
sha1_blocks_ni---of 5
sha1_blocks_ssse3---of 5
sha1_final---of 1
sha1_init---of 1
sha1_init_raw---of 1
sha1_transform---of 11
sha1_update---of 16
-----------
SUMMARY77%of 17

-----------
SUMMARY---of 0

timerqueue_add93%of 13
timerqueue_del60%of 5
timerqueue_iterate_next---of 6
-----------
SUMMARY84%of 18

-----------
SUMMARY---of 0

cbcmac_create---of 11
cbcmac_exit_tfm---of 1
cbcmac_init_tfm60%of 5
crypto_cbcmac_digest_finup---of 4
crypto_cbcmac_digest_init---of 1
crypto_cbcmac_digest_setkey---of 1
crypto_cbcmac_digest_update---of 4
crypto_ccm_auth---of 42
crypto_ccm_base_create---of 7
crypto_ccm_create---of 8
crypto_ccm_create_common---of 24
crypto_ccm_decrypt---of 19
crypto_ccm_decrypt_done---of 12
crypto_ccm_encrypt---of 19
crypto_ccm_encrypt_done---of 3
crypto_ccm_exit_tfm---of 1
crypto_ccm_free---of 1
crypto_ccm_init_crypt---of 20
crypto_ccm_init_tfm67%of 6
crypto_ccm_setauthsize---of 4
crypto_ccm_setkey67%of 3
crypto_rfc4309_create---of 16
crypto_rfc4309_crypt---of 21
crypto_rfc4309_decrypt---of 6
crypto_rfc4309_encrypt---of 6
crypto_rfc4309_exit_tfm---of 1
crypto_rfc4309_free---of 1
crypto_rfc4309_init_tfm60%of 5
crypto_rfc4309_setauthsize---of 4
crypto_rfc4309_setkey75%of 4
-----------
SUMMARY66%of 23

-----------
SUMMARY---of 0

__alloc_workqueue---of 94
__bpf_trace_workqueue_activate_work---of 1
__bpf_trace_workqueue_execute_end---of 1
__bpf_trace_workqueue_execute_start---of 1
__bpf_trace_workqueue_queue_work---of 1
__cancel_work---of 12
__cancel_work_sync---of 9
__flush_work---of 63
__flush_workqueue---of 56
__probestub_workqueue_activate_work---of 1
__probestub_workqueue_execute_end---of 1
__probestub_workqueue_execute_start---of 1
__probestub_workqueue_queue_work---of 1
__pwq_activate_work---of 5
__queue_delayed_work---of 22
__queue_work41%of 97
__traceiter_workqueue_activate_work---of 8
__traceiter_workqueue_execute_end---of 7
__traceiter_workqueue_execute_start---of 8
__traceiter_workqueue_queue_work---of 7
__warn_flushing_systemwide_wq---of 1
__wq_cpumask_show.isra.0---of 1
alloc_unbound_pwq---of 53
alloc_worker---of 4
alloc_workqueue_attrs_noprof---of 4
alloc_workqueue_lockdep_map---of 4
alloc_workqueue_noprof---of 6
apply_workqueue_attrs---of 1
apply_workqueue_attrs_locked---of 7
apply_wqattrs_cleanup---of 14
apply_wqattrs_commit---of 11
apply_wqattrs_prepare---of 34
assign_work---of 9
bh_pool_kick_highpri---of 1
bh_pool_kick_normal---of 1
bh_worker---of 33
cancel_delayed_work---of 1
cancel_delayed_work_sync---of 1
cancel_work---of 1
cancel_work_sync---of 1
check_flush_dependency---of 19
clear_pending_if_disabled50%of 6
cpumask_isolated_show---of 1
cpumask_requested_show---of 1
cpumask_show---of 1
cpumask_store---of 11
create_worker---of 27
current_is_workqueue_rescuer---of 8
current_work---of 8
delayed_work_timer_fn---of 1
destroy_workqueue---of 42
detach_worker---of 6
disable_delayed_work---of 1
disable_delayed_work_sync---of 1
disable_work---of 1
disable_work_sync---of 1
drain_dead_softirq_workfn---of 10
drain_workqueue---of 17
enable_delayed_work---of 1
enable_work---of 9
execute_in_process_context---of 4
find_worker_executing_work---of 6
flush_delayed_work---of 5
flush_rcu_work---of 5
flush_work---of 1
flush_workqueue_prep_pwqs---of 28
format_worker_id.isra.0---of 11
free_node_nr_active---of 7
free_workqueue_attrs---of 5
freeze_workqueues_begin---of 5
freeze_workqueues_busy---of 30
get_pwq75%of 8
get_work_pool58%of 14
idle_cull_fn---of 11
idle_worker_timeout---of 7
init_node_nr_active---of 1
init_pwq---of 3
init_rescuer---of 15
init_worker_pool---of 8
insert_work67%of 3
install_unbound_pwq---of 12
jhash---of 21
kick_pool44%of 30
kmalloc_array_noprof.constprop.0---of 5
link_pwq---of 9
max_active_show---of 1
max_active_store---of 6
mod_delayed_work_on---of 5
move_linked_works---of 8
parse_affn_scope---of 6
per_cpu_show---of 1
perf_trace_workqueue_activate_work---of 7
perf_trace_workqueue_execute_end---of 7
perf_trace_workqueue_execute_start---of 7
perf_trace_workqueue_queue_work---of 9
pool_mayday_timeout---of 19
pr_cont_pool_info---of 10
pr_cont_work---of 8
pr_cont_work_flush---of 16
pr_cont_worker_id---of 10
print_worker_info---of 11
process_scheduled_works---of 100
put_pwq---of 9
put_pwq_unlocked---of 5
put_unbound_pool---of 37
pwq_activate_first_inactive---of 7
pwq_dec_nr_in_flight---of 54
pwq_release_workfn---of 36
pwq_tryinc_nr_active52%of 31
queue_delayed_work_on---of 9
queue_rcu_work---of 7
queue_work_node---of 21
queue_work_on100%of 9
rcu_free_pool---of 4
rcu_free_wq---of 10
rcu_work_rcufn---of 3
reap_dying_workers---of 3
rescuer_thread---of 43
schedule_on_each_cpu---of 19
set_pf_worker---of 4
set_worker_desc---of 7
set_worker_dying---of 24
show_all_workqueues---of 35
show_freezable_workqueues---of 16
show_one_workqueue---of 13
show_pwq---of 48
thaw_workqueues---of 6
too_many_workers.isra.0---of 6
touch_wq_lockdep_map---of 7
trace_event_get_offsets_workqueue_queue_work.isra.0---of 7
trace_event_raw_event_workqueue_activate_work---of 8
trace_event_raw_event_workqueue_execute_end---of 8
trace_event_raw_event_workqueue_execute_start---of 8
trace_event_raw_event_workqueue_queue_work---of 10
trace_raw_output_workqueue_activate_work---of 4
trace_raw_output_workqueue_execute_end---of 4
trace_raw_output_workqueue_execute_start---of 4
trace_raw_output_workqueue_queue_work---of 4
trace_workqueue_activate_work34%of 15
tryinc_node_nr_active60%of 5
unbind_worker---of 11
unbound_pwq---of 11
unbound_wq_update_pwq---of 21
work_busy---of 19
work_for_cpu_fn---of 1
work_grab_pending---of 41
work_offqd_unpack---of 3
work_on_cpu_key---of 1
worker_attach_to_pool---of 13
worker_enter_idle---of 16
worker_leave_idle---of 10
worker_pool_assign_id---of 9
worker_thread---of 63
workqueue_apply_unbound_cpumask---of 19
workqueue_congested---of 6
workqueue_offline_cpu---of 31
workqueue_online_cpu---of 73
workqueue_prepare_cpu---of 8
workqueue_set_max_active---of 9
workqueue_set_min_active---of 8
workqueue_softirq_action---of 6
workqueue_softirq_dead---of 11
workqueue_sysfs_register---of 15
workqueue_unbound_exclude_cpumask---of 8
wq_adjust_max_active---of 22
wq_affinity_strict_show---of 1
wq_affinity_strict_store---of 9
wq_affn_dfl_get---of 1
wq_affn_dfl_set---of 14
wq_affn_scope_show---of 4
wq_affn_scope_store---of 9
wq_barrier_func---of 1
wq_calc_pod_cpumask---of 3
wq_clamp_max_active.isra.0---of 3
wq_cpumask_show---of 1
wq_cpumask_store---of 9
wq_device_release---of 1
wq_nice_show---of 1
wq_nice_store---of 11
wq_sysfs_prep_attrs---of 10
wq_update_node_max_active---of 45
wq_watchdog_param_set_thresh---of 6
wq_watchdog_reset_touched---of 8
wq_watchdog_set_thresh---of 4
wq_watchdog_timer_fn---of 61
wq_watchdog_touch72%of 7
wq_worker_comm---of 16
wq_worker_last_func---of 1
wq_worker_running---of 9
wq_worker_sleeping---of 12
wq_worker_tick---of 20
wqattrs_pod_type---of 15
-----------
SUMMARY49%of 225

-----------
SUMMARY---of 0

__import_iovec65%of 31
__iov_iter_get_pages_alloc---of 134
_copy_from_iter27%of 92
_copy_from_iter_flushcache---of 71
_copy_from_iter_nocache---of 73
_copy_mc_to_iter---of 93
_copy_to_iter---of 92
copy_compat_iovec_from_user---of 15
copy_folio_from_iter_atomic---of 105
copy_iovec_from_user67%of 15
copy_page_from_iter---of 11
copy_page_to_iter---of 13
copy_page_to_iter_nofault---of 105
dup_iter---of 8
fault_in_iov_iter_readable---of 12
fault_in_iov_iter_writeable---of 12
first_iovec_segment---of 12
folio_order---of 5
import_iovec67%of 3
import_ubuf63%of 8
iov_iter_advance---of 46
iov_iter_alignment---of 20
iov_iter_bvec---of 3
iov_iter_discard---of 3
iov_iter_extract_pages---of 129
iov_iter_folio_queue---of 3
iov_iter_gap_alignment---of 14
iov_iter_get_pages2---of 6
iov_iter_get_pages_alloc2---of 3
iov_iter_init---of 3
iov_iter_kvec---of 3
iov_iter_npages---of 26
iov_iter_restore---of 11
iov_iter_revert---of 30
iov_iter_single_seg_count---of 11
iov_iter_xarray---of 3
iov_iter_zero---of 79
iovec_from_user53%of 17
page_copy_sane---of 21
rcu_lock_acquire.constprop.0---of 1
rcu_read_unlock---of 6
want_pages_array---of 7
xas_next_entry.constprop.0---of 23
xas_reload---of 26
-----------
SUMMARY43%of 166

-----------
SUMMARY---of 0

deflate_alloc_stream75%of 4
deflate_compress---of 21
deflate_decompress---of 22
deflate_free_stream---of 1
deflate_init100%of 1
-----------
SUMMARY80%of 5

__crypto_sha1_export---of 1
__crypto_sha1_export_core---of 1
__crypto_sha1_import---of 1
__crypto_sha1_import_core---of 1
crypto_hmac_sha1_digest---of 1
crypto_hmac_sha1_export---of 1
crypto_hmac_sha1_export_core---of 1
crypto_hmac_sha1_final---of 1
crypto_hmac_sha1_import---of 1
crypto_hmac_sha1_import_core---of 1
crypto_hmac_sha1_init---of 1
crypto_hmac_sha1_setkey100%of 1
crypto_hmac_sha1_update---of 1
crypto_sha1_digest---of 1
crypto_sha1_export---of 1
crypto_sha1_export_core---of 1
crypto_sha1_final---of 1
crypto_sha1_import---of 1
crypto_sha1_import_core---of 1
crypto_sha1_init---of 1
crypto_sha1_update---of 1
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__security_genfs_sid---of 24
__security_read_policy---of 4
aurule_avc_callback---of 4
constraint_expr_eval---of 92
context_destroy---of 1
context_struct_compute_av---of 68
context_struct_to_string---of 12
dump_masked_av_helper---of 3
get_classes_callback---of 3
get_permissions_callback---of 3
kmalloc_array_noprof---of 1
ocontext_to_sid.constprop.0---of 6
security_bounded_transition---of 34
security_change_sid---of 1
security_compute_av---of 76
security_compute_av_user---of 46
security_compute_sid---of 160
security_compute_validatetrans---of 48
security_compute_xperms_decision---of 76
security_context_str_to_sid---of 1
security_context_to_sid100%of 1
security_context_to_sid_core7%of 49
security_context_to_sid_default---of 1
security_context_to_sid_force---of 1
security_dump_masked_av.constprop.0---of 27
security_fs_use---of 42
security_genfs_sid---of 21
security_get_allow_unknown---of 19
security_get_bool_value---of 21
security_get_bools---of 20
security_get_classes---of 8
security_get_initial_sid_context---of 5
security_get_permissions---of 16
security_get_reject_unknown---of 19
security_get_user_sids---of 90
security_ib_endport_sid---of 34
security_ib_pkey_sid---of 35
security_load_policy---of 74
security_member_sid---of 1
security_mls_enabled---of 18
security_net_peersid_resolve---of 43
security_netif_sid---of 37
security_netlbl_secattr_to_sid---of 50
security_netlbl_sid_to_secattr---of 25
security_node_sid---of 47
security_policycap_supported---of 18
security_port_sid---of 35
security_read_policy---of 11
security_read_state_kernel---of 12
security_set_bools---of 20
security_sid_mls_copy---of 52
security_sid_to_context---of 1
security_sid_to_context_core---of 42
security_sid_to_context_force---of 1
security_sid_to_context_inval---of 1
security_sidtab_hash_stats---of 19
security_transition_sid---of 4
security_transition_sid_user---of 1
security_validate_transition---of 1
security_validate_transition_user---of 1
selinux_audit_rule_free---of 5
selinux_audit_rule_init---of 51
selinux_audit_rule_known---of 6
selinux_audit_rule_match---of 90
selinux_notify_policy_change---of 6
selinux_policy_cancel---of 6
selinux_policy_commit---of 38
selinux_policy_free---of 5
selinux_policy_genfs_sid---of 1
services_compute_xperms_decision---of 20
services_compute_xperms_drivers---of 7
services_convert_context---of 37
sidtab_entry_to_string---of 6
string_to_context_struct---of 26
type_attribute_bounds_av---of 12
update_xperms_extended_data---of 6
-----------
SUMMARY8%of 50

__peernet2id58%of 7
__put_net---of 6
cleanup_net---of 26
copy_net_ns---of 45
get_net_ns---of 7
get_net_ns_by_fd---of 15
get_net_ns_by_id---of 18
get_net_ns_by_pid---of 24
net_alloc_generic---of 4
net_drop_ns---of 5
net_eq_idr---of 4
net_ns_barrier---of 1
net_ns_get_ownership---of 10
net_ns_net_init---of 1
net_passive_dec---of 9
netns_get---of 13
netns_install---of 18
netns_owner---of 1
netns_put---of 6
ops_init---of 31
ops_undo_list---of 60
ops_undo_single.constprop.0---of 1
peernet2id46%of 11
peernet2id_alloc---of 18
peernet_has_id---of 1
preinit_net---of 7
refcount_inc_not_zero---of 13
register_pernet_device---of 4
register_pernet_operations---of 18
register_pernet_subsys---of 1
rtnl_net_dumpid---of 22
rtnl_net_dumpid_one---of 9
rtnl_net_fill---of 15
rtnl_net_getid---of 68
rtnl_net_newid---of 45
rtnl_net_notifyid---of 10
rtnl_valid_dump_net_req.constprop.0.isra.0---of 22
setup_net---of 6
unregister_pernet_device---of 3
unregister_pernet_operations---of 8
unregister_pernet_subsys---of 1
-----------
SUMMARY50%of 18

-----------
SUMMARY---of 0

selinux_xfrm_alloc_user58%of 19
selinux_xfrm_decode_session---of 5
selinux_xfrm_delete60%of 5
selinux_xfrm_free60%of 5
selinux_xfrm_policy_alloc100%of 1
selinux_xfrm_policy_clone---of 6
selinux_xfrm_policy_delete100%of 1
selinux_xfrm_policy_free100%of 1
selinux_xfrm_policy_lookup50%of 6
selinux_xfrm_postroute_last---of 21
selinux_xfrm_skb_sid---of 19
selinux_xfrm_skb_sid_ingress---of 16
selinux_xfrm_sock_rcv_skb---of 14
selinux_xfrm_state_alloc100%of 1
selinux_xfrm_state_alloc_acquire---of 13
selinux_xfrm_state_delete100%of 1
selinux_xfrm_state_free---of 1
selinux_xfrm_state_pol_flow_match---of 9
-----------
SUMMARY63%of 40

crypto_has_acomp100%of 1
xfrm_aalg_get_byid---of 1
xfrm_aalg_get_byidx---of 5
xfrm_aalg_get_byname100%of 1
xfrm_aead_get_byname100%of 1
xfrm_aead_name_match86%of 7
xfrm_alg_id_match---of 1
xfrm_alg_name_match90%of 10
xfrm_calg_get_byid---of 1
xfrm_calg_get_byname100%of 1
xfrm_count_pfkey_auth_supported---of 6
xfrm_count_pfkey_enc_supported---of 6
xfrm_ealg_get_byid---of 1
xfrm_ealg_get_byidx---of 5
xfrm_ealg_get_byname100%of 1
xfrm_find_algo92%of 12
xfrm_probe_algs---of 17
-----------
SUMMARY92%of 34

__radix_tree_delete---of 24
__radix_tree_lookup62%of 21
__radix_tree_preload---of 27
__radix_tree_replace---of 22
delete_node---of 37
idr_destroy---of 14
idr_get_free---of 45
idr_preload---of 9
node_tag_clear---of 10
radix_tree_cpu_dead---of 3
radix_tree_delete---of 1
radix_tree_delete_item---of 18
radix_tree_extend---of 29
radix_tree_gang_lookup---of 18
radix_tree_gang_lookup_tag---of 22
radix_tree_gang_lookup_tag_slot---of 17
radix_tree_insert---of 34
radix_tree_iter_delete---of 3
radix_tree_iter_replace---of 1
radix_tree_iter_resume---of 1
radix_tree_iter_tag_clear---of 1
radix_tree_lookup100%of 1
radix_tree_lookup_slot---of 4
radix_tree_maybe_preload---of 9
radix_tree_next_chunk18%of 51
radix_tree_node_alloc.constprop.0---of 15
radix_tree_node_ctor---of 1
radix_tree_node_rcu_free---of 1
radix_tree_preload---of 3
radix_tree_replace_slot---of 1
radix_tree_tag_clear---of 13
radix_tree_tag_get---of 16
radix_tree_tag_set---of 13
radix_tree_tagged---of 1
-----------
SUMMARY32%of 73

aead_register_instance---of 8
crypto_aead_decrypt---of 6
crypto_aead_encrypt---of 4
crypto_aead_exit_tfm---of 1
crypto_aead_free_instance---of 1
crypto_aead_init_tfm100%of 6
crypto_aead_report---of 1
crypto_aead_setauthsize73%of 11
crypto_aead_setkey67%of 9
crypto_aead_show---of 3
crypto_alloc_aead100%of 1
crypto_grab_aead---of 1
crypto_has_aead100%of 1
crypto_register_aead---of 6
crypto_register_aeads---of 8
crypto_unregister_aead---of 1
crypto_unregister_aeads---of 3
-----------
SUMMARY79%of 28

-----------
SUMMARY---of 0

ipcomp6_err---of 14
ipcomp6_init_state56%of 38
ipcomp6_rcv_cb---of 1
-----------
SUMMARY56%of 38

-----------
SUMMARY---of 0

__add_preferred_console.constprop.0---of 47
__bpf_trace_console---of 1
__console_rewind_all---of 16
__control_devkmsg---of 13
__down_trylock_console_sem72%of 7
__ia32_sys_syslog---of 1
__pr_flush---of 61
__printk_cpu_sync_put---of 5
__printk_cpu_sync_try_get---of 6
__printk_cpu_sync_wait---of 4
__printk_ratelimit---of 1
__probestub_console---of 1
__traceiter_console---of 7
__up_console_sem60%of 5
__wake_up_klogd46%of 11
__x64_sys_syslog---of 1
_printk100%of 1
_printk_deferred100%of 1
add_preferred_console---of 1
check_syslog_permissions---of 16
console_call_setup---of 5
console_conditional_schedule---of 4
console_cpu_notify---of 7
console_device---of 14
console_flush_all54%of 69
console_flush_on_panic---of 7
console_force_preferred_locked---of 13
console_list_lock---of 8
console_list_unlock---of 1
console_lock---of 4
console_lock_spinning_disable_and_check45%of 9
console_lock_spinning_enable75%of 4
console_prepend_dropped---of 1
console_prepend_message---of 14
console_prepend_replay---of 1
console_resume---of 6
console_resume_all---of 14
console_srcu_read_lock---of 1
console_srcu_read_lock_is_held---of 3
console_srcu_read_unlock---of 3
console_suspend---of 1
console_suspend_all---of 10
console_try_replay_all---of 9
console_trylock58%of 7
console_unblank---of 47
console_unlock65%of 14
console_verbose---of 4
defer_console_output---of 1
devkmsg_emit.constprop.0---of 1
devkmsg_llseek---of 10
devkmsg_open---of 11
devkmsg_poll---of 9
devkmsg_read---of 28
devkmsg_release---of 4
devkmsg_sysctl_set_loglvl---of 20
devkmsg_write---of 24
do_syslog---of 53
early_printk---of 4
find_first_fitting_seq---of 14
info_print_prefix80%of 5
is_console_locked---of 1
is_printk_cpu_sync_owner100%of 1
kmsg_dump_desc---of 19
kmsg_dump_get_buffer---of 25
kmsg_dump_get_line---of 17
kmsg_dump_reason_str---of 5
kmsg_dump_register---of 6
kmsg_dump_rewind---of 3
kmsg_dump_unregister---of 4
lockdep_assert_console_list_lock_held---of 7
log_buf_addr_get---of 1
log_buf_len_get---of 1
log_buf_vmcoreinfo_setup---of 1
match_devname_and_update_preferred_console---of 24
msg_add_dict_text---of 3
msg_add_ext_text---of 11
perf_trace_console---of 10
pr_flush---of 1
printk_get_console_flush_type19%of 37
printk_get_next_message56%of 20
printk_kthreads_check_locked---of 28
printk_kthreads_shutdown---of 11
printk_legacy_allow_panic_sync---of 4
printk_parse_prefix77%of 17
printk_percpu_data_ready100%of 1
printk_sprint50%of 22
printk_timed_ratelimit---of 6
printk_trigger_flush---of 1
record_print_text59%of 17
register_console---of 110
syslog_print---of 44
syslog_print_all---of 23
trace_event_raw_event_console---of 11
trace_raw_output_console---of 5
try_enable_preferred_console---of 28
unregister_console---of 1
unregister_console_locked---of 62
vprintk_default100%of 1
vprintk_deferred---of 1
vprintk_emit39%of 44
vprintk_store50%of 55
wake_up_klogd---of 1
wake_up_klogd_work_func---of 10
-----------
SUMMARY51%of 348

-----------
SUMMARY---of 0

_copy_from_user60%of 5
_copy_to_user---of 8
check_zeroed_user---of 22
-----------
SUMMARY60%of 5

ipcomp4_err---of 17
ipcomp4_init_state53%of 34
ipcomp4_rcv_cb---of 1
-----------
SUMMARY53%of 34

-----------
SUMMARY---of 0

__usermodehelper_disable---of 15
__usermodehelper_set_disable_depth---of 1
call_usermodehelper---of 6
call_usermodehelper_exec63%of 24
call_usermodehelper_exec_async---of 11
call_usermodehelper_exec_work---of 10
call_usermodehelper_freeinfo75%of 4
call_usermodehelper_setup45%of 9
proc_cap_handler---of 11
umh_complete---of 5
usermodehelper_read_lock_wait---of 10
usermodehelper_read_trylock---of 13
usermodehelper_read_unlock---of 1
-----------
SUMMARY60%of 37

-----------
SUMMARY---of 0

__bitmap_and---of 6
__bitmap_andnot---of 6
__bitmap_clear100%of 6
__bitmap_complement---of 4
__bitmap_equal---of 8
__bitmap_intersects---of 8
__bitmap_or---of 4
__bitmap_or_equal---of 8
__bitmap_replace---of 4
__bitmap_set100%of 6
__bitmap_shift_left---of 10
__bitmap_shift_right---of 15
__bitmap_subset---of 8
__bitmap_weight---of 5
__bitmap_weight_and---of 5
__bitmap_weight_andnot---of 5
__bitmap_xor---of 4
bitmap_alloc---of 1
bitmap_alloc_node---of 1
bitmap_bitremap---of 9
bitmap_cut---of 11
bitmap_find_next_zero_area_off---of 7
bitmap_fold---of 7
bitmap_free---of 1
bitmap_from_arr32---of 8
bitmap_onto---of 9
bitmap_pos_to_ord---of 6
bitmap_remap---of 15
bitmap_to_arr32---of 8
bitmap_zalloc---of 1
bitmap_zalloc_node---of 1
devm_bitmap_alloc---of 8
devm_bitmap_free---of 1
devm_bitmap_zalloc---of 1
kmalloc_array_node_noprof.constprop.0---of 1
kmalloc_array_noprof.constprop.0---of 1
-----------
SUMMARY100%of 12

-----------
SUMMARY---of 0

__start_tx---of 18
__stop_tx---of 28
__uart_port_nbcon_acquire58%of 19
__uart_port_nbcon_release59%of 17
autoconfig_read_divisor_id---of 1
class_uart_port_lock_irq_destructor.isra.0---of 20
class_uart_port_lock_irqsave_constructor---of 19
class_uart_port_lock_irqsave_destructor.isra.0---of 19
default_serial_dl_read---of 1
default_serial_dl_write---of 1
fifo_wait_for_lsr58%of 7
hub6_serial_in---of 1
hub6_serial_out---of 1
io_serial_in100%of 1
io_serial_out100%of 1
mem16_serial_in---of 1
mem16_serial_out---of 1
mem32_serial_in---of 1
mem32_serial_out---of 1
mem32be_serial_in---of 1
mem32be_serial_out---of 1
mem_serial_in---of 1
mem_serial_out---of 1
no_serial_in---of 1
no_serial_out---of 1
rx_trig_bytes_show---of 9
rx_trig_bytes_store---of 16
serial8250_break_ctl---of 9
serial8250_clear_IER60%of 5
serial8250_clear_and_reinit_fifos---of 1
serial8250_clear_fifos---of 3
serial8250_clear_interrupts---of 1
serial8250_config_port---of 143
serial8250_console_exit---of 5
serial8250_console_setup---of 14
serial8250_console_wait_putchar---of 1
serial8250_console_write42%of 72
serial8250_default_handle_irq---of 6
serial8250_do_get_mctrl---of 14
serial8250_do_pm---of 1
serial8250_do_set_divisor---of 4
serial8250_do_set_ldisc---of 17
serial8250_do_set_mctrl---of 11
serial8250_do_set_termios---of 70
serial8250_do_shutdown---of 9
serial8250_do_startup---of 80
serial8250_em485_config---of 16
serial8250_em485_destroy---of 5
serial8250_em485_handle_start_tx---of 21
serial8250_em485_handle_stop_tx---of 8
serial8250_em485_start_tx---of 10
serial8250_em485_stop_tx---of 14
serial8250_enable_ms---of 14
serial8250_flush_buffer---of 5
serial8250_get_baud_rate---of 4
serial8250_get_divisor---of 13
serial8250_get_mctrl---of 5
serial8250_handle_irq---of 38
serial8250_init_port---of 1
serial8250_modem_status---of 14
serial8250_pm---of 5
serial8250_read_char---of 40
serial8250_release_port---of 1
serial8250_release_std_resource---of 10
serial8250_request_port---of 1
serial8250_request_std_resource---of 14
serial8250_rpm_get---of 4
serial8250_rpm_put---of 4
serial8250_rx_chars---of 7
serial8250_set_defaults---of 16
serial8250_set_divisor---of 5
serial8250_set_ldisc---of 5
serial8250_set_mctrl---of 8
serial8250_set_sleep---of 36
serial8250_set_termios---of 5
serial8250_shutdown---of 5
serial8250_start_tx---of 26
serial8250_startup---of 5
serial8250_stop_rx---of 11
serial8250_stop_tx---of 8
serial8250_throttle---of 1
serial8250_tx_chars---of 34
serial8250_tx_empty---of 11
serial8250_tx_threshold_handle_irq---of 3
serial8250_type---of 5
serial8250_unthrottle---of 1
serial8250_update_uartclk---of 20
serial8250_verify_port---of 15
serial_icr_read---of 1
serial_icr_write---of 1
serial_port_out_sync.constprop.0---of 5
set_io_from_upio---of 9
size_fifo---of 14
wait_for_lsr67%of 9
wait_for_xmitr29%of 7
-----------
SUMMARY50%of 138

-----------
SUMMARY---of 0

__warn---of 24
__warn_printk---of 6
_print_tainted---of 20
add_taint---of 6
check_panic_on_warn---of 6
clear_warn_once_fops_open---of 1
clear_warn_once_set---of 1
do_oops_enter_exit---of 14
get_taint---of 1
nmi_panic---of 5
no_blink---of 1
oops_enter---of 4
oops_exit---of 1
oops_may_print---of 1
panic---of 1
panic_in_progress100%of 1
panic_on_other_cpu60%of 5
panic_on_this_cpu100%of 1
panic_print_deprecated---of 4
panic_print_get---of 1
panic_print_set---of 1
panic_reset---of 1
panic_smp_self_stop---of 2
panic_try_start---of 1
print_tainted---of 1
print_tainted_verbose---of 1
proc_taint---of 17
sysctl_panic_print_handler---of 1
test_taint---of 1
vpanic---of 52
warn_count_show---of 1
-----------
SUMMARY72%of 7

_sub_D_00100_1---of 1
_sub_I_00100_0---of 1
jent_apt_init75%of 4
jent_apt_insert50%of 10
jent_apt_reset---of 1
jent_condition_data100%of 2
jent_delta100%of 2
jent_entropy_collector_alloc70%of 10
jent_entropy_collector_free---of 1
jent_entropy_init---of 31
jent_gen_entropy89%of 9
jent_health_failure100%of 3
jent_loop_shuffle100%of 5
jent_measure_jitter67%of 6
jent_memaccess78%of 9
jent_rct_insert43%of 7
jent_read_entropy22%of 14
jent_stuck84%of 6
-----------
SUMMARY66%of 87

-----------
SUMMARY---of 0

__bpf_trace_ma_op---of 1
__bpf_trace_ma_read---of 1
__bpf_trace_ma_write---of 1
__mt_destroy---of 22
__mt_dup---of 6
__probestub_ma_op---of 1
__probestub_ma_read---of 1
__probestub_ma_write---of 1
__traceiter_ma_op---of 7
__traceiter_ma_read---of 7
__traceiter_ma_write---of 7
ma_free_rcu---of 7
mab_mas_cp---of 33
mab_shift_right---of 12
mas_alloc_cyclic---of 25
mas_alloc_nodes---of 22
mas_ascend---of 55
mas_commit_b_node.isra.0---of 7
mas_descend---of 27
mas_destroy---of 6
mas_dup_build.constprop.0---of 80
mas_dup_free---of 39
mas_empty_area---of 116
mas_empty_area_rev---of 162
mas_erase---of 43
mas_find---of 33
mas_find_child---of 26
mas_find_range---of 33
mas_find_range_rev---of 4
mas_find_rev---of 4
mas_find_rev_setup.constprop.0---of 48
mas_insert.isra.0---of 37
mas_leaf_max_gap---of 33
mas_mab_cp---of 34
mas_may_activate---of 9
mas_new_ma_node.isra.0---of 7
mas_new_root---of 32
mas_next---of 4
mas_next_range---of 4
mas_next_setup.constprop.0---of 24
mas_next_sibling---of 23
mas_next_slot---of 167
mas_nomem---of 12
mas_pause---of 1
mas_prealloc_calc---of 19
mas_preallocate---of 35
mas_prev---of 4
mas_prev_range---of 4
mas_prev_setup.constprop.0---of 44
mas_prev_slot---of 172
mas_push_data---of 55
mas_put_in_tree---of 16
mas_rebalance---of 42
mas_root_expand---of 28
mas_skip_node---of 34
mas_spanning_rebalance---of 221
mas_split---of 43
mas_start47%of 26
mas_store---of 36
mas_store_b_node---of 32
mas_store_gfp---of 43
mas_store_prealloc---of 58
mas_topiary_replace---of 94
mas_update_gap---of 59
mas_walk---of 20
mas_wr_bnode---of 1
mas_wr_node_store---of 30
mas_wr_node_walk---of 22
mas_wr_spanning_store---of 59
mas_wr_store_entry---of 87
mas_wr_store_type---of 97
mas_wr_walk_index---of 20
mast_fill_bnode---of 35
mast_spanning_rebalance.isra.0---of 48
mast_split_data---of 58
mt_destroy_walk---of 58
mt_find37%of 44
mt_find_after---of 4
mt_free_walk---of 28
mt_next---of 11
mt_prev---of 11
mte_dead_leaves---of 21
mte_dead_walk---of 13
mte_destroy_descend---of 32
mtree_alloc_cyclic---of 8
mtree_alloc_range---of 23
mtree_alloc_rrange---of 23
mtree_destroy---of 1
mtree_dup---of 6
mtree_erase---of 1
mtree_insert---of 1
mtree_insert_range---of 20
mtree_load---of 52
mtree_range_walk54%of 41
mtree_store---of 1
mtree_store_range---of 8
perf_trace_ma_op---of 7
perf_trace_ma_read---of 7
perf_trace_ma_write---of 7
trace_event_raw_event_ma_op---of 8
trace_event_raw_event_ma_read---of 8
trace_event_raw_event_ma_write---of 8
trace_ma_op---of 15
trace_ma_read.constprop.034%of 15
trace_ma_write---of 15
trace_raw_output_ma_op---of 4
trace_raw_output_ma_read---of 4
trace_raw_output_ma_write---of 4
-----------
SUMMARY44%of 126

-----------
SUMMARY---of 0

__crypto_sha512_export---of 1
__crypto_sha512_export_core---of 1
__crypto_sha512_import---of 1
__crypto_sha512_import_core---of 1
crypto_hmac_sha384_digest---of 1
crypto_hmac_sha384_export---of 1
crypto_hmac_sha384_export_core---of 1
crypto_hmac_sha384_final---of 1
crypto_hmac_sha384_import---of 1
crypto_hmac_sha384_import_core---of 1
crypto_hmac_sha384_init---of 1
crypto_hmac_sha384_setkey---of 1
crypto_hmac_sha384_update---of 1
crypto_hmac_sha512_digest---of 1
crypto_hmac_sha512_export---of 1
crypto_hmac_sha512_export_core---of 1
crypto_hmac_sha512_final100%of 1
crypto_hmac_sha512_import---of 1
crypto_hmac_sha512_import_core---of 1
crypto_hmac_sha512_init100%of 1
crypto_hmac_sha512_setkey100%of 1
crypto_hmac_sha512_update100%of 1
crypto_sha384_digest---of 1
crypto_sha384_export---of 1
crypto_sha384_export_core---of 1
crypto_sha384_final---of 1
crypto_sha384_import---of 1
crypto_sha384_import_core---of 1
crypto_sha384_init---of 1
crypto_sha384_update---of 1
crypto_sha512_digest---of 1
crypto_sha512_export---of 1
crypto_sha512_export_core---of 1
crypto_sha512_final---of 1
crypto_sha512_import---of 1
crypto_sha512_import_core---of 1
crypto_sha512_init---of 1
crypto_sha512_update---of 1
-----------
SUMMARY100%of 4

__crypto_inst_setname---of 5
__crypto_lookup_template---of 8
__crypto_register_alg---of 21
crypto_alg_extsize100%of 1
crypto_alg_finish_registration---of 16
crypto_alg_tested---of 24
crypto_attr_alg_name---of 9
crypto_check_alg---of 20
crypto_check_attr_type---of 12
crypto_dequeue_request---of 8
crypto_destroy_instance---of 1
crypto_destroy_instance_workfn---of 23
crypto_drop_spawn---of 9
crypto_enqueue_request---of 7
crypto_enqueue_request_head---of 3
crypto_free_alg---of 1
crypto_get_attr_type---of 9
crypto_grab_spawn---of 10
crypto_inc---of 10
crypto_init_queue---of 1
crypto_lookup_template---of 4
crypto_register_alg---of 21
crypto_register_algs---of 8
crypto_register_instance---of 20
crypto_register_notifier---of 1
crypto_register_template---of 8
crypto_register_templates---of 8
crypto_remove_alg---of 5
crypto_remove_final---of 8
crypto_remove_instance.isra.0---of 21
crypto_remove_spawns---of 35
crypto_spawn_alg22%of 19
crypto_spawn_tfm50%of 8
crypto_spawn_tfm250%of 6
crypto_type_has_alg100%of 5
crypto_unregister_alg---of 9
crypto_unregister_algs---of 3
crypto_unregister_instance---of 1
crypto_unregister_notifier---of 1
crypto_unregister_template---of 19
crypto_unregister_templates---of 3
-----------
SUMMARY44%of 39

-----------
SUMMARY---of 0

call_blocking_lsm_notifier---of 1
inode_free_by_rcu---of 7
lsm_append.constprop.0---of 13
lsm_blob_alloc---of 5
lsm_fill_user_ctx---of 20
register_blocking_lsm_notifier---of 1
security_audit_rule_free---of 8
security_audit_rule_init---of 9
security_audit_rule_known---of 11
security_audit_rule_match---of 9
security_bdev_alloc---of 13
security_bdev_free---of 11
security_bdev_setintegrity---of 9
security_binder_set_context_mgr---of 11
security_binder_transaction---of 11
security_binder_transfer_binder---of 11
security_binder_transfer_file---of 11
security_bpf---of 9
security_bpf_map---of 11
security_bpf_map_create---of 12
security_bpf_map_free---of 7
security_bpf_prog---of 11
security_bpf_prog_free---of 7
security_bpf_prog_load---of 12
security_bpf_token_capable---of 11
security_bpf_token_cmd---of 11
security_bpf_token_create---of 12
security_bpf_token_free---of 7
security_bprm_check---of 11
security_bprm_committed_creds---of 8
security_bprm_committing_creds---of 8
security_bprm_creds_for_exec---of 11
security_bprm_creds_from_file---of 11
security_capable78%of 9
security_capget---of 9
security_capset---of 9
security_create_user_ns---of 11
security_cred_alloc_blank---of 13
security_cred_free---of 11
security_cred_getlsmprop---of 8
security_cred_getsecid---of 8
security_current_getlsmprop_subj---of 8
security_d_instantiate---of 11
security_dentry_create_files_as---of 9
security_dentry_init_security---of 10
security_file_alloc---of 14
security_file_fcntl---of 11
security_file_free---of 11
security_file_ioctl---of 11
security_file_ioctl_compat---of 11
security_file_lock---of 11
security_file_mprotect---of 11
security_file_open---of 11
security_file_permission---of 11
security_file_post_open---of 11
security_file_receive---of 11
security_file_release---of 8
security_file_send_sigiotask---of 11
security_file_set_fowner---of 8
security_file_truncate---of 11
security_free_mnt_opts---of 11
security_fs_context_dup---of 11
security_fs_context_parse_param---of 9
security_fs_context_submount---of 11
security_getprocattr---of 11
security_getselfattr---of 43
security_inet_conn_established---of 8
security_inet_conn_request---of 11
security_inet_csk_clone---of 8
security_initramfs_populated---of 8
security_inode_alloc---of 14
security_inode_copy_up---of 11
security_inode_copy_up_xattr---of 12
security_inode_create---of 13
security_inode_file_getattr---of 11
security_inode_file_setattr---of 11
security_inode_follow_link---of 13
security_inode_free---of 11
security_inode_get_acl---of 13
security_inode_getattr---of 13
security_inode_getlsmprop---of 8
security_inode_getsecctx---of 12
security_inode_getsecurity---of 12
security_inode_getxattr---of 13
security_inode_init_security---of 25
security_inode_init_security_anon---of 11
security_inode_invalidate_secctx---of 8
security_inode_killpriv---of 11
security_inode_link---of 13
security_inode_listsecurity---of 13
security_inode_listxattr---of 13
security_inode_mkdir---of 13
security_inode_mknod---of 11
security_inode_need_killpriv---of 11
security_inode_notifysecctx---of 11
security_inode_permission---of 13
security_inode_post_create_tmpfile---of 11
security_inode_post_remove_acl---of 11
security_inode_post_removexattr---of 11
security_inode_post_set_acl---of 11
security_inode_post_setattr---of 11
security_inode_post_setxattr---of 9
security_inode_readlink---of 13
security_inode_remove_acl---of 13
security_inode_removexattr---of 23
security_inode_rename---of 24
security_inode_rmdir---of 13
security_inode_set_acl---of 11
security_inode_setattr---of 13
security_inode_setintegrity---of 9
security_inode_setsecctx---of 11
security_inode_setsecurity---of 12
security_inode_setxattr---of 21
security_inode_symlink---of 13
security_inode_unlink---of 13
security_ipc_getlsmprop---of 8
security_ipc_permission---of 11
security_ismaclabel---of 11
security_kernel_act_as---of 11
security_kernel_create_files_as---of 11
security_kernel_load_data---of 11
security_kernel_module_request46%of 11
security_kernel_post_load_data---of 9
security_kernel_post_read_file---of 9
security_kernel_read_file---of 11
security_kernfs_init_security---of 11
security_key_alloc---of 12
security_key_free---of 1
security_key_getsecurity---of 11
security_key_permission---of 11
security_key_post_create_or_update---of 7
security_locked_down46%of 11
security_lsmprop_to_secctx---of 11
security_mmap_addr---of 11
security_mmap_file---of 27
security_move_mount---of 11
security_mptcp_add_subflow---of 11
security_msg_msg_alloc---of 13
security_msg_msg_free---of 7
security_msg_queue_alloc---of 13
security_msg_queue_associate---of 11
security_msg_queue_free---of 7
security_msg_queue_msgctl---of 11
security_msg_queue_msgrcv---of 9
security_msg_queue_msgsnd---of 11
security_netlink_send46%of 11
security_path_notify---of 11
security_perf_event_alloc---of 14
security_perf_event_free---of 1
security_perf_event_open---of 11
security_perf_event_read---of 11
security_perf_event_write---of 11
security_post_notification---of 11
security_prepare_creds---of 12
security_ptrace_access_check---of 11
security_ptrace_traceme---of 11
security_quota_on---of 11
security_quotactl---of 9
security_release_secctx---of 7
security_req_classify_flow---of 8
security_sb_alloc---of 13
security_sb_clone_mnt_opts---of 9
security_sb_delete---of 8
security_sb_eat_lsm_opts---of 11
security_sb_free---of 7
security_sb_kern_mount---of 11
security_sb_mnt_opts_compat---of 11
security_sb_mount---of 9
security_sb_pivotroot---of 11
security_sb_remount---of 11
security_sb_set_mnt_opts---of 9
security_sb_show_options---of 11
security_sb_statfs---of 11
security_sb_umount---of 11
security_sctp_assoc_established---of 11
security_sctp_assoc_request---of 11
security_sctp_bind_connect---of 9
security_sctp_sk_clone---of 8
security_secctx_to_secid---of 9
security_secid_to_secctx---of 12
security_secmark_refcount_dec---of 8
security_secmark_refcount_inc---of 8
security_secmark_relabel_packet---of 11
security_sem_alloc---of 13
security_sem_associate---of 11
security_sem_free---of 7
security_sem_semctl---of 11
security_sem_semop---of 9
security_setprocattr---of 11
security_setselfattr---of 22
security_settime64---of 11
security_shm_alloc---of 13
security_shm_associate---of 11
security_shm_free---of 7
security_shm_shmat---of 11
security_shm_shmctl---of 11
security_sk_alloc---of 12
security_sk_classify_flow---of 8
security_sk_clone---of 8
security_sk_free---of 7
security_skb_classify_flow---of 9
security_sock_graft---of 8
security_sock_rcv_skb46%of 11
security_socket_accept---of 11
security_socket_bind---of 11
security_socket_connect---of 11
security_socket_create---of 9
security_socket_getpeername---of 11
security_socket_getpeersec_dgram42%of 12
security_socket_getpeersec_stream---of 10
security_socket_getsockname---of 11
security_socket_getsockopt---of 11
security_socket_listen---of 11
security_socket_post_create---of 9
security_socket_recvmsg---of 9
security_socket_sendmsg46%of 11
security_socket_setsockopt---of 11
security_socket_shutdown---of 11
security_socket_socketpair---of 11
security_syslog---of 11
security_task_alloc---of 13
security_task_fix_setgid---of 11
security_task_fix_setgroups---of 11
security_task_fix_setuid---of 11
security_task_free---of 7
security_task_getioprio---of 11
security_task_getlsmprop_obj---of 8
security_task_getpgid---of 11
security_task_getscheduler---of 11
security_task_getsid---of 11
security_task_kill---of 9
security_task_movememory---of 11
security_task_prctl---of 11
security_task_prlimit---of 11
security_task_setioprio---of 11
security_task_setnice---of 11
security_task_setpgid---of 11
security_task_setrlimit---of 11
security_task_setscheduler---of 11
security_task_to_inode---of 8
security_transfer_creds---of 8
security_tun_dev_alloc_security---of 14
security_tun_dev_attach---of 11
security_tun_dev_attach_queue---of 11
security_tun_dev_create---of 11
security_tun_dev_free_security---of 1
security_tun_dev_open---of 11
security_unix_may_send---of 11
security_unix_stream_connect---of 11
security_uring_allowed---of 11
security_uring_cmd---of 11
security_uring_override_creds---of 11
security_uring_sqpoll---of 11
security_vm_enough_memory_mm---of 8
security_xfrm_decode_session---of 11
security_xfrm_policy_alloc37%of 11
security_xfrm_policy_clone---of 11
security_xfrm_policy_delete46%of 11
security_xfrm_policy_free63%of 8
security_xfrm_policy_lookup46%of 11
security_xfrm_state_alloc37%of 11
security_xfrm_state_alloc_acquire---of 11
security_xfrm_state_delete46%of 11
security_xfrm_state_free---of 8
security_xfrm_state_pol_flow_match---of 6
unregister_blocking_lsm_notifier---of 1
-----------
SUMMARY47%of 139

task_work_add---of 27
task_work_cancel---of 1
task_work_cancel_func---of 1
task_work_cancel_match---of 11
task_work_func_match---of 1
task_work_match---of 1
task_work_run91%of 11
task_work_set_notify_irq---of 1
-----------
SUMMARY91%of 11

-----------
SUMMARY---of 0

__crypto_alg_lookup90%of 20
__crypto_alloc_tfm100%of 1
__crypto_alloc_tfmgfp53%of 17
crypto_alg_lookup72%of 14
crypto_alg_mod_lookup76%of 33
crypto_alloc_base---of 14
crypto_alloc_tfm_node36%of 14
crypto_alloc_tfmmem.isra.075%of 4
crypto_clone_tfm---of 6
crypto_create_tfm_node58%of 14
crypto_destroy_alg50%of 10
crypto_destroy_tfm53%of 17
crypto_exit_ops100%of 8
crypto_find_alg75%of 4
crypto_has_alg100%of 5
crypto_larval_alloc67%of 12
crypto_larval_destroy75%of 8
crypto_larval_kill70%of 10
crypto_larval_wait49%of 29
crypto_mod_get50%of 12
crypto_mod_put67%of 6
crypto_probing_notify50%of 4
crypto_req_done---of 5
crypto_request_clone---of 4
crypto_schedule_test---of 4
crypto_shoot_alg---of 1
-----------
SUMMARY65%of 242

__xfrm6_tunnel_spi_check34%of 9
__xfrm6_tunnel_spi_lookup80%of 5
net_generic44%of 16
x6spi_destroy_rcu---of 1
xfrm6_tunnel_alloc_spi57%of 32
xfrm6_tunnel_destroy---of 17
xfrm6_tunnel_err---of 1
xfrm6_tunnel_init_state40%of 10
xfrm6_tunnel_input---of 1
xfrm6_tunnel_net_exit---of 9
xfrm6_tunnel_net_init---of 5
xfrm6_tunnel_output---of 1
xfrm6_tunnel_rcv---of 1
xfrm6_tunnel_spi_lookup58%of 14
-----------
SUMMARY52%of 86

-----------
SUMMARY---of 0

__bpf_trace_emulate_vsyscall---of 1
__probestub_emulate_vsyscall---of 1
__traceiter_emulate_vsyscall---of 8
emulate_vsyscall---of 63
gate_vma_name---of 1
get_gate_vma---of 7
in_gate_area---of 7
in_gate_area_no_mm50%of 6
perf_trace_emulate_vsyscall---of 7
trace_event_raw_event_emulate_vsyscall---of 8
trace_raw_output_emulate_vsyscall---of 5
warn_bad_vsyscall---of 4
write_ok_or_segv---of 4
-----------
SUMMARY50%of 6

-----------
SUMMARY---of 0

__request_module49%of 41
free_modprobe_argv100%of 1
-----------
SUMMARY50%of 42

__account_locked_vm---of 14
__compat_vma_mmap_prepare---of 19
__vcalloc_noprof---of 5
__vm_enough_memory---of 17
__vmalloc_array_noprof---of 5
account_locked_vm---of 20
compat_vma_mmap_prepare---of 1
folio_anon_vma---of 5
folio_copy---of 7
folio_mapping---of 9
folio_mc_copy---of 10
folio_pte_batch---of 53
get_cmdline---of 20
kfree_const---of 6
kmemdup_array---of 3
kmemdup_noprof72%of 7
kmemdup_nul---of 11
kstrdup64%of 11
kstrdup_const---of 6
kstrndup---of 16
kvmemdup---of 7
mem_dump_obj---of 12
memcmp_pages---of 1
memdup_user---of 13
memdup_user_nul---of 13
overcommit_kbytes_handler---of 4
overcommit_policy_handler---of 10
overcommit_ratio_handler---of 4
page_offline_begin---of 1
page_offline_end---of 1
page_offline_freeze---of 1
page_offline_thaw---of 1
randomize_page---of 7
randomize_stack_top---of 5
set_ps_flags.isra.0---of 16
snapshot_page---of 27
strndup_user---of 7
sync_overcommit_as---of 1
vcalloc_noprof---of 5
vm_commit_limit---of 4
vm_memory_committed---of 1
vm_mmap---of 6
vm_mmap_pgoff---of 23
vma_is_stack_for_current---of 6
vma_set_file---of 4
vmalloc_array_noprof---of 5
vmemdup_user---of 13
-----------
SUMMARY67%of 18

-----------
SUMMARY---of 0